# HG changeset patch
# User Chris Cannam <cannam@all-day-breakfast.com>
# Date 1363793750 0
# Node ID 89f5e221ed7b7bf9960584a737393ab5460e9acd
# Parent  d278df1123f9a9590d0fd1980c9934ae8f74820c
Add FFTW3

diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/AUTHORS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/AUTHORS	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,14 @@
+Authors of FFTW (reachable at fftw@fftw.org):
+
+Matteo Frigo <athena@fftw.org>
+Stevenj G. Johnson <stevenj@alum.mit.edu>
+
+Stefan Kral <skral@fftw.org> wrote genfft-k7/*.ml*, which was
+added in fftw-3.0 and removed in fftw-3.2.
+
+Support for the Cell Broadband Engine was graciously donated by the
+IBM Austin Research Lab, which was added in fftw-3.2 and removed in
+fftw-3.3.
+
+Support for MIPS64 paired-single SIMD instructions was graciously
+donated by CodeSourcery, Inc.
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/CONVENTIONS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/CONVENTIONS	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,65 @@
+Code conventions used internally by fftw3 (not in API):
+
+LEARN FROM THE MASTERS: read Ken Thompson's C compiler in Plan 9.
+   Avoid learning from C++/Java programs.
+
+INDENTATION: K&R, 5 spaces/tab.  In case of doubt, indent -kr -i5.
+
+NAMES: keep them short.  Shorter than you think.  The Bible was written
+   without vowels.  Don't outsmart the Bible.
+
+   Common names:
+
+   R       : real type, aka fftw_real
+   E       : real type for local variables (possibly extra precision)
+   C       : complex type
+   sz      : size
+   vecsz   : vector size
+   is, os  : input/output stride
+   ri, ii  : real/imag input (complex data)
+   ro, io  : real/imag output (complex data)
+   I, O    : real input/output (real data)
+   A       : assert
+   CK      : check
+   S       : solver, defined internally to each solver file
+   P       : plan, defined internally to each solver file
+   k       : codelet
+   X(...)  : used for mangling of external names (see below)
+   K(...)  : floating-point constant, in E precision
+
+   If a name is used often and must have the form fftw_foo to avoid
+   namespace pollution, #define FOO fftw_foo and use the short name.
+
+   Leave that hungarian crap to MS.  foo_t counts as hungarian: use
+   foo instead.  foo is lowercase so that it does not look like a DOS
+   program. Exception: typedef struct foo_s {...} foo;  instead of
+   typedef struct foo {...} foo;  for C++ compatibility.
+
+NAME MANGLING: use X(foo) for external names instead of fftw_foo.
+    X(foo) expands to fftwf_foo or fftw_foo, depending on the
+    precision.  (Unfortunately, this is a ugly form of hungarian
+    notation.  Grrr...)  Names that are not exported do not need to be
+    mangled.
+
+REPEATED CODE: favor a table.  E.g., do not write
+
+    foo("xxx", 1);
+    foo("yyy", 2);
+    foo("zzz", -1);
+
+    Instead write
+
+      struct { const char *nam, int arg } footab[] = {
+	{ "xxx", 1 },
+	{ "yyy", 2 },
+	{ "zzz", -1 }
+      };
+
+    and loop over footab.  Rationale: it saves code space.
+    Similarly, replace a switch statement with a table whenever
+    possible.
+
+C++: The code should compile as a C++ program. Run the code through
+    gcc -xc++ .  The extra C++ restrictions are unnecessary, of
+    course, but this will save us from a flood of complaints when
+    we release the code.
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/COPYING
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/COPYING	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/COPYRIGHT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/COPYRIGHT	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/ChangeLog
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/ChangeLog	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27526 @@
+Sat Nov 24 22:37:54 EST 2012  stevenj@fftw.org
+  * fixed deadlock bug caused by bogosity flag getting out of synch between processes; thanks to Michael Pippig for the bug report
+
+    M ./kernel/ifftw.h +1
+    M ./kernel/planner.c -3 +6
+    M ./mpi/api.c +12
+
+Wed Nov 21 18:34:29 EST 2012  athena@fftw.org
+  * Updated NEWS
+
+    M ./NEWS -2 +7
+
+Wed Nov 21 18:33:15 EST 2012  athena@fftw.org
+  * use 2x2 AVX transposition instead of individual stores.
+  
+  This seems to improve single-precision AVX on Sandy Bridge machines.
+  
+
+    M ./simd-support/simd-avx.h -2 +14
+
+Tue Nov 20 12:18:00 EST 2012  stevenj@fftw.org
+  * revert part of Taylor patch to acx_mpi.m4: do not link -lmpi if mpicc works without libraries, as -lmpi may be some completely different MPI implementation
+
+    M ./m4/acx_mpi.m4 -3 +3
+
+Tue Nov 20 11:44:57 EST 2012  stevenj@fftw.org
+  * fix deadlock bug (thanks to Michael Pippig for the bug report and patch, and to Graham Dennis for the bug report) in which some processes called MPI_Alltoall and some called MPI_Alltoallv
+
+    M ./mpi/transpose-alltoall.c -3 +2
+
+Mon Oct 29 15:20:01 EDT 2012  athena@fftw.org
+  * fix texinfo quirk
+
+    M ./doc/tutorial.texi -2 +2
+
+Mon Oct 29 09:16:43 EDT 2012  athena@fftw.org
+  * clarify that padding only applies to in-place transforms
+
+    M ./doc/tutorial.texi -5 +10
+
+Sun Oct 28 18:42:48 EDT 2012  athena@fftw.org
+  * make the index-computation logic less paranoid
+  
+  The problem is that for each K and for each expression of the form P[I
+  + STRIDE * K] in a loop, most compilers will try to lift an induction
+  variable PK := &P[I + STRIDE * K].  In large codelets we have many
+  such values of K.  For example, a codelet of size 32 with 4 input
+  pointers will generate O(128) induction variables, which will likely
+  overflow the register set, which is likely worse than doing the index
+  computation in the first place.
+  
+  In the past we (wisely and correctly) assumed that compilers will do
+  the wrong thing, and consequently we disabled the induction-variable
+  "optimization" altogether by setting STRIDE ^= ZERO, where ZERO is a
+  value guaranteed to be 0.  Since the compiler does not know that
+  ZERO=0, it cannot perform its "optimization" and it is forced to
+  behave sensibly.
+  
+  With this patch, FFTW is a little bit less paranoid.  FFTW now
+  disables the induction-variable optimization" only when we estimate
+  that the codelet uses more than ESTIMATED_AVAILABLE_INDEX_REGISTERS
+  induction variables.
+  
+  Currently we set ESTIMATED_AVAILABLE_INDEX_REGISTERS=16.  16 registers ought
+  to be enough for anybody (or so the amd64 and ARM ISA's seem to imply).
+  
+
+    M ./genfft/gen_hc2c.ml -1 +1
+    M ./genfft/gen_hc2cdft.ml -1 +1
+    M ./genfft/gen_hc2cdft_c.ml -1 +1
+    M ./genfft/gen_hc2hc.ml -1 +1
+    M ./genfft/gen_notw.ml -2 +2
+    M ./genfft/gen_notw_c.ml -2 +2
+    M ./genfft/gen_r2cb.ml -3 +3
+    M ./genfft/gen_r2cf.ml -3 +3
+    M ./genfft/gen_r2r.ml -2 +2
+    M ./genfft/gen_twiddle.ml -1 +1
+    M ./genfft/gen_twiddle_c.ml -1 +1
+    M ./genfft/gen_twidsq.ml -2 +2
+    M ./genfft/gen_twidsq_c.ml -2 +2
+    M ./genfft/genutil.ml -1 +2
+    M ./kernel/ifftw.h -3 +20
+
+Sun Oct 28 18:33:24 EDT 2012  athena@fftw.org
+  * silence warnings
+
+    M ./kernel/buffered.c +1
+    M ./rdft/rank0.c +1
+
+Sat Oct 27 09:58:49 EDT 2012  athena@fftw.org
+  * bump version to 3.3.3
+
+    M ./NEWS +7
+    M ./configure.ac -1 +1
+
+Sat Oct 27 09:55:15 EDT 2012  athena@fftw.org
+  * evaluate plans for >1ms when using gettimeofday()
+  
+  The previous limit 10ms was too paranoid, and it made life difficult
+  on machines without an "official" cycle counter, such as ARM.
+
+    M ./kernel/timer.c -1 +1
+
+Sat Oct 27 09:46:04 EDT 2012  athena@fftw.org
+  * use 4-way NEON SIMD instead of 2-way
+  
+  Kai-Uwe Bloem tried to warn me a year ago that 128-bit NEON was better
+  than 64-bit NEON even on machines with a 64-bit pipe, but I foolishly
+  did not listen.  Now that 128-bit NEON pipes are starting to appear on
+  the market it is definitely time to switch.
+  
+
+    M ./simd-support/simd-neon.h -55 +100
+
+Wed Sep 26 14:21:12 EDT 2012  athena@fftw.org
+  * Note that fftw-3.3 includes MPI support
+
+    M ./doc/intro.texi -5 +4
+
+Wed Jul 18 11:25:40 EDT 2012  athena@fftw.org
+  * remove obsolete unused function
+
+    M ./dft/bluestein.c -14
+
+Fri Jun 29 15:57:14 EDT 2012  stevenj@fftw.org
+  * whoops, call omp_get_max_threads; thanks to Hanno Rein for the bug report
+
+    M ./doc/threads.texi -1 +1
+
+Sat Apr 28 10:55:09 EDT 2012  athena@fftw.org
+  * Fix libfftw3/libfftw3_threads chicken-egg problem
+  
+  On most systems we want to build libfftw3 first, so that
+  libfftw3_threads can depend upon libfftw3.  When producing a single
+  combined-thread library (e.g. on Windows) we want the opposite,
+  so that libfftw3 can include libfftw3_threads.
+  
+
+    M ./Makefile.am -1 +18
+    M ./threads/Makefile.am +5
+
+Sat Apr 28 10:11:28 EDT 2012  athena@fftw.org
+  * updated NEWS for 3.3.2
+
+    M ./NEWS +15
+
+Thu Apr 26 19:36:11 EDT 2012  athena@fftw.org
+  * change revision to 3.3.2
+
+    M ./configure.ac -2 +2
+
+Thu Apr 26 19:31:02 EDT 2012  athena@fftw.org
+  * Remove old aligned_main() hack.
+  
+  On i386, in our benchmark program we used to manually aligned the
+  stack to 16-byte boundary via asm trickery.  This was a good idea in
+  1999 (and it was actually necessary to make things work) but the hack
+  is now obsolete and it seems to break gcc-4.7.  So the hack is now
+  gone.
+  
+
+    M ./libbench2/Makefile.am -6 +6
+    R ./libbench2/aligned-main.c
+    M ./libbench2/main.c -1 +1
+
+Thu Mar 29 16:26:16 EDT 2012  athena@fftw.org
+  * Bugfix: a couple of uninitialized values in the benchmark program
+
+    M ./libbench2/problem.c -1 +1
+
+Tue Mar 20 19:03:47 EDT 2012  athena@fftw.org
+  * make libfftw{threads,mpi} depend upon libfftw for libtool purposes
+  
+  Thanks Julian Taylor for the patch
+
+    M ./Makefile.am -1 +1
+    M ./m4/acx_mpi.m4 -3 +3
+    M ./mpi/Makefile.am +1
+    M ./threads/Makefile.am +1
+
+Tue Mar  6 04:44:00 EST 2012  stevenj@fftw.org
+  * formatting tweak
+
+    M ./doc/reference.texi -1 +1
+
+Mon Mar  5 21:05:27 EST 2012  athena@fftw.org
+  * destroying => overwriting
+
+    M ./doc/tutorial.texi -1 +1
+
+Fri Mar  2 10:31:20 EST 2012  stevenj@fftw.org
+  * note that WISDOM_ONLY is a documented flag
+
+    M ./api/fftw3.h -1 +1
+
+Fri Mar  2 10:27:08 EST 2012  stevenj@fftw.org
+  * check for icc pretending to be gcc before including quad-precision decls; thanks to Michael Anselmi for the bug report
+
+    M ./api/fftw3.h +1
+
+Fri Mar  2 10:23:19 EST 2012  stevenj@fftw.org
+  * foo_CFLAGS needs to manually include AM_CFLAGS; thanks to Henry Gomersall for the Windows bug report
+
+    M ./threads/Makefile.am -2 +2
+
+Sat Feb 25 15:21:39 EST 2012  athena@fftw.org
+  * update for latest mingw
+
+    M ./support/BUILD-MINGW32.sh -3 +3
+    M ./support/BUILD-MINGW64.sh -2 +7
+
+Sat Feb 25 15:21:17 EST 2012  athena@fftw.org
+  tagged fftw-3.3.1
+
+
+Mon Feb 20 23:06:13 EST 2012  stevenj@fftw.org
+  * added Fortran NEWS
+
+    M ./NEWS +6
+
+Mon Feb 20 23:00:13 EST 2012  stevenj@fftw.org
+  * move non-portable extended/quad precision F03 interfaces into separate .f03 files (while keeping double/single in fftw3.f03 for minimal ABI breakage)
+
+    M ./api/Makefile.am -3 +10
+    M ./api/f03api.sh -2 +4
+    M ./api/genf03.pl -5 +5
+    M ./doc/modern-fortran.texi -2 +39
+    M ./doc/mpi.texi -2 +2
+    M ./mpi/Makefile.am -5 +11
+    M ./mpi/f03api.sh -2 +4
+
+Mon Feb 20 11:21:57 EST 2012  athena@fftw.org
+  * rm mpi/fftw3-mpi.f03 at make clean time; thanks Tyler Luchko for the bug report.
+
+    M ./mpi/Makefile.am +1
+
+Mon Feb 20 11:18:24 EST 2012  athena@fftw.org
+  * Disable a Visual Studion warning that was obnoxious enough for Sebastian Schuberth to send us a patch.
+
+    M ./simd-support/simd-sse2.h +14
+
+Mon Feb 20 11:18:06 EST 2012  athena@fftw.org
+  * Change version to 3.3.1
+
+    M ./configure.ac -1 +1
+
+Mon Feb 20 11:03:15 EST 2012  athena@fftw.org
+  * Integrated Visual Studio AVX patches by Carsten Steger
+
+    M ./NEWS +3
+    M ./simd-support/amd64-cpuid.h -10 +40
+    M ./simd-support/simd-avx.h +6
+    M ./simd-support/simd-sse2.h +8
+    M ./simd-support/x86-cpuid.h +7
+
+Wed Nov  9 10:13:32 EST 2011  stevenj@fftw.org
+  * typo
+
+    M ./doc/modern-fortran.texi -2 +2
+
+Tue Nov  8 22:45:09 EST 2011  stevenj@fftw.org
+  * add missing F77 set_timelimit function; thanks to Martin Diehl for the bug repory
+
+    M ./api/f77funcs.h +5
+
+Sun Sep 25 10:54:56 EDT 2011  athena@fftw.org
+  * note requirement of /machine:x64 in windows x64 README
+
+    M ./support/BUILD-MINGW64.sh +7
+
+Sun Sep 18 09:28:20 EDT 2011  athena@fftw.org
+  * AVX detection for MSVC
+
+    M ./simd-support/amd64-cpuid.h -2 +10
+
+Tue Sep 13 14:58:29 EDT 2011  athena@fftw.org
+  * compile with C89
+
+    M ./dft/bluestein.c -1 +1
+    M ./dft/generic.c -1 +1
+    M ./rdft/dht-rader.c -1 +1
+
+Sat Sep  3 16:25:50 EDT 2011  athena@fftw.org
+  * use the same search pruning heuristics for threaded plans as for nonthreaded plans
+
+    M ./threads/ct.c -1 +3
+    M ./threads/hc2hc.c -1 +3
+
+Sat Sep  3 16:12:11 EDT 2011  athena@fftw.org
+  * shorten ESTIMATE planning time for certain weird sizes
+  
+  FFTW includes a collection of "solvers" that apply to a subset of
+  "problems".  Assume for simplicity that a "problem" is a single 1D
+  complex transform of size N, even though real "problems" are much more
+  general than that.  FFTW includes three "prime" solvers called
+  "generic", "bluestein", and "rader", which implement different
+  algorithms for prime sizes.
+  
+  Now, for a "problem" of size 13 (say) FFTW also includes special code
+  that handles that size at high speed.  It would be a waste of time to
+  measure the execution time of the prime solvers, since we know that
+  the special code is way faster.  However, FFTW is modular and one may
+  or may not include the special code for size 13, in which case we must
+  resort to one of the "prime" solvers.  To address this issue, the
+  "prime" solvers (and others) are proclaimed to be SLOW".  When
+  planning, FFTW first tries to produce a plan ignoring all the SLOW
+  solvers, and if this fails FFTW tries again allowing SLOW solvers.
+  
+  This heuristic works ok unless the sizes are too large.  For example
+  for 1044000=2*2*2*2*2*3*3*5*5*5*29 FFTW explores a huge search tree of
+  all zillion factorizations of 1044000/29, failing every time because
+  29 is SLOW; then it finally allows SLOW solvers and finds a solution
+  immediately.
+  
+  This patch proclaims solvers to be SLOW only for small values of N.
+  For example, the "generic" solver implements an O(n^2) DFT algorithm;
+  we say that it is SLOW only for N<=16.
+  
+  The side effects of this choice are as follows.  If one modifies FFTW to
+  include a fast solver of size 17, then planning for N=17*K will be
+  slower than today, because FFTW till try both the fast solver and the
+  generic solver (which is SLOW today and therefore not tried, but is no
+  longer SLOW after the patch).  If one removes a fast solver, of size say
+  13, then he may still fall into the current exponential-search behavior
+  for "problems" of size 13*HIGHLY_FACTORIZABLE_N.
+  
+  If somebody had compleined about transforms of size 1044000 ten years
+  ago, "don't do that" would have been an acceptable answer.  I guess the
+  bar is higher today, so I am going to include this patch in our 3.3.1
+  release despite their side-effects for people who want to modify FFTW.
+  
+
+    M ./dft/bluestein.c -5 +14
+    M ./dft/generic.c -15 +6
+    M ./dft/rader.c -7 +7
+    M ./kernel/ifftw.h +8
+    M ./kernel/primes.c +6
+    M ./rdft/dht-rader.c -7 +8
+    M ./rdft/generic.c -14 +4
+
+Sat Aug 27 13:55:24 EDT 2011  athena@fftw.org
+  * Fix typo fftw_execute_dft_r2r => fftw_execute_r2r
+  
+  Thanks KIU Shueng Chuan for the bug report.
+
+    M ./doc/reference.texi -1 +1
+
+Fri Aug 26 06:13:55 EDT 2011  athena@fftw.org
+  * In Rader's algorithm, compute the generator lazily.  
+  
+  The planner was spending a lot of time computing generators for
+  plans that were immediately discarded.  Now we compute generators
+  only when absolutely needed.
+
+    M ./NEWS +5
+    M ./dft/rader.c -3 +4
+    M ./rdft/dht-rader.c -3 +4
+
+Sun Aug 21 16:27:31 EDT 2011  athena@fftw.org
+  tagged fftw-3.3.1-beta1
+
+
+Sun Aug 21 16:16:38 EDT 2011  athena@fftw.org
+  * Release notes for 3.3.1-beta1
+
+    M ./NEWS +4
+    M ./configure.ac -1 +1
+    M ./doc/install.texi -8 +18
+    M ./doc/other.texi -2 +3
+
+Fri Aug 19 19:59:17 EDT 2011  stevenj@alum.mit.edu
+  * make fftw_mpi_block routine 10x faster, since it is being called zillions of times (thanks to Tom Vacek for the profiling)
+
+    M ./mpi/block.c -6 +2
+
+Thu Aug 18 14:19:36 EDT 2011  athena@fftw.org
+  * Implement autodetection of NEON extensions
+
+    M ./simd-support/neon.c -4 +51
+
+Sun Aug 14 14:12:29 EDT 2011  athena@fftw.org
+  * Update the FSF address.
+  
+  The FSF moved downtown.
+
+    M ./COPYRIGHT -1 +1
+    M ./api/api.h -1 +1
+    M ./api/apiplan.c -1 +1
+    M ./api/configure.c -1 +1
+    M ./api/execute-dft-c2r.c -1 +1
+    M ./api/execute-dft-r2c.c -1 +1
+    M ./api/execute-dft.c -1 +1
+    M ./api/execute-r2r.c -1 +1
+    M ./api/execute-split-dft-c2r.c -1 +1
+    M ./api/execute-split-dft-r2c.c -1 +1
+    M ./api/execute-split-dft.c -1 +1
+    M ./api/execute.c -1 +1
+    M ./api/export-wisdom-to-file.c -1 +1
+    M ./api/export-wisdom-to-string.c -1 +1
+    M ./api/export-wisdom.c -1 +1
+    M ./api/f77api.c -1 +1
+    M ./api/f77funcs.h -1 +1
+    M ./api/flops.c -1 +1
+    M ./api/forget-wisdom.c -1 +1
+    M ./api/import-system-wisdom.c -1 +1
+    M ./api/import-wisdom-from-file.c -1 +1
+    M ./api/import-wisdom-from-string.c -1 +1
+    M ./api/import-wisdom.c -1 +1
+    M ./api/malloc.c -1 +1
+    M ./api/map-r2r-kind.c -1 +1
+    M ./api/mapflags.c -1 +1
+    M ./api/mkprinter-file.c -1 +1
+    M ./api/mktensor-iodims.h -1 +1
+    M ./api/mktensor-rowmajor.c -1 +1
+    M ./api/plan-dft-1d.c -1 +1
+    M ./api/plan-dft-2d.c -1 +1
+    M ./api/plan-dft-3d.c -1 +1
+    M ./api/plan-dft-c2r-1d.c -1 +1
+    M ./api/plan-dft-c2r-2d.c -1 +1
+    M ./api/plan-dft-c2r-3d.c -1 +1
+    M ./api/plan-dft-c2r.c -1 +1
+    M ./api/plan-dft-r2c-1d.c -1 +1
+    M ./api/plan-dft-r2c-2d.c -1 +1
+    M ./api/plan-dft-r2c-3d.c -1 +1
+    M ./api/plan-dft-r2c.c -1 +1
+    M ./api/plan-dft.c -1 +1
+    M ./api/plan-guru-dft-c2r.h -1 +1
+    M ./api/plan-guru-dft-r2c.h -1 +1
+    M ./api/plan-guru-dft.h -1 +1
+    M ./api/plan-guru-r2r.h -1 +1
+    M ./api/plan-guru-split-dft-c2r.h -1 +1
+    M ./api/plan-guru-split-dft-r2c.h -1 +1
+    M ./api/plan-guru-split-dft.h -1 +1
+    M ./api/plan-many-dft-c2r.c -1 +1
+    M ./api/plan-many-dft-r2c.c -1 +1
+    M ./api/plan-many-dft.c -1 +1
+    M ./api/plan-many-r2r.c -1 +1
+    M ./api/plan-r2r-1d.c -1 +1
+    M ./api/plan-r2r-2d.c -1 +1
+    M ./api/plan-r2r-3d.c -1 +1
+    M ./api/plan-r2r.c -1 +1
+    M ./api/print-plan.c -1 +1
+    M ./api/rdft2-pad.c -1 +1
+    M ./api/the-planner.c -1 +1
+    M ./api/version.c -1 +1
+    M ./api/x77.h -1 +1
+    M ./commercialize.sh -2 +2
+    M ./dft/bluestein.c -1 +1
+    M ./dft/buffered.c -1 +1
+    M ./dft/codelet-dft.h -1 +1
+    M ./dft/conf.c -1 +1
+    M ./dft/ct.c -1 +1
+    M ./dft/ct.h -1 +1
+    M ./dft/dft.h -1 +1
+    M ./dft/dftw-direct.c -1 +1
+    M ./dft/dftw-directsq.c -1 +1
+    M ./dft/dftw-generic.c -1 +1
+    M ./dft/dftw-genericbuf.c -1 +1
+    M ./dft/direct.c -1 +1
+    M ./dft/generic.c -1 +1
+    M ./dft/indirect-transpose.c -1 +1
+    M ./dft/indirect.c -1 +1
+    M ./dft/kdft-dif.c -1 +1
+    M ./dft/kdft-difsq.c -1 +1
+    M ./dft/kdft-dit.c -1 +1
+    M ./dft/kdft.c -1 +1
+    M ./dft/nop.c -1 +1
+    M ./dft/plan.c -1 +1
+    M ./dft/problem.c -1 +1
+    M ./dft/rader.c -1 +1
+    M ./dft/rank-geq2.c -1 +1
+    M ./dft/scalar/n.c -1 +1
+    M ./dft/scalar/n.h -1 +1
+    M ./dft/scalar/t.c -1 +1
+    M ./dft/scalar/t.h -1 +1
+    M ./dft/simd/common/genus.c -1 +1
+    M ./dft/simd/n1b.h -1 +1
+    M ./dft/simd/n1f.h -1 +1
+    M ./dft/simd/n2b.h -1 +1
+    M ./dft/simd/n2f.h -1 +1
+    M ./dft/simd/n2s.h -1 +1
+    M ./dft/simd/q1b.h -1 +1
+    M ./dft/simd/q1f.h -1 +1
+    M ./dft/simd/t1b.h -1 +1
+    M ./dft/simd/t1bu.h -1 +1
+    M ./dft/simd/t1f.h -1 +1
+    M ./dft/simd/t1fu.h -1 +1
+    M ./dft/simd/t2b.h -1 +1
+    M ./dft/simd/t2f.h -1 +1
+    M ./dft/simd/t3b.h -1 +1
+    M ./dft/simd/t3f.h -1 +1
+    M ./dft/simd/ts.h -1 +1
+    M ./dft/solve.c -1 +1
+    M ./dft/vrank-geq1.c -1 +1
+    M ./dft/zero.c -1 +1
+    M ./doc/f77_wisdom.f -1 +1
+    M ./doc/license.texi -3 +3
+    M ./genfft/algsimp.ml -1 +1
+    M ./genfft/algsimp.mli -1 +1
+    M ./genfft/annotate.ml -1 +1
+    M ./genfft/annotate.mli -1 +1
+    M ./genfft/assoctable.ml -1 +1
+    M ./genfft/assoctable.mli -1 +1
+    M ./genfft/c.ml -1 +1
+    M ./genfft/c.mli -1 +1
+    M ./genfft/complex.ml -1 +1
+    M ./genfft/complex.mli -1 +1
+    M ./genfft/conv.ml -1 +1
+    M ./genfft/conv.mli -1 +1
+    M ./genfft/dag.ml -1 +1
+    M ./genfft/dag.mli -1 +1
+    M ./genfft/expr.ml -1 +1
+    M ./genfft/expr.mli -1 +1
+    M ./genfft/fft.ml -1 +1
+    M ./genfft/fft.mli -1 +1
+    M ./genfft/gen_hc2c.ml -1 +1
+    M ./genfft/gen_hc2cdft.ml -1 +1
+    M ./genfft/gen_hc2cdft_c.ml -1 +1
+    M ./genfft/gen_hc2hc.ml -1 +1
+    M ./genfft/gen_mdct.ml -1 +1
+    M ./genfft/gen_notw.ml -1 +1
+    M ./genfft/gen_notw_c.ml -1 +1
+    M ./genfft/gen_r2cb.ml -1 +1
+    M ./genfft/gen_r2cf.ml -1 +1
+    M ./genfft/gen_r2r.ml -1 +1
+    M ./genfft/gen_twiddle.ml -1 +1
+    M ./genfft/gen_twiddle_c.ml -1 +1
+    M ./genfft/gen_twidsq.ml -1 +1
+    M ./genfft/gen_twidsq_c.ml -1 +1
+    M ./genfft/genutil.ml -1 +1
+    M ./genfft/littlesimp.ml -1 +1
+    M ./genfft/littlesimp.mli -1 +1
+    M ./genfft/magic.ml -1 +1
+    M ./genfft/monads.ml -1 +1
+    M ./genfft/number.ml -1 +1
+    M ./genfft/number.mli -1 +1
+    M ./genfft/oracle.ml -1 +1
+    M ./genfft/oracle.mli -1 +1
+    M ./genfft/schedule.ml -1 +1
+    M ./genfft/schedule.mli -1 +1
+    M ./genfft/simd.ml -1 +1
+    M ./genfft/simd.mli -1 +1
+    M ./genfft/simdmagic.ml -1 +1
+    M ./genfft/to_alist.ml -1 +1
+    M ./genfft/to_alist.mli -1 +1
+    M ./genfft/trig.ml -1 +1
+    M ./genfft/trig.mli -1 +1
+    M ./genfft/twiddle.ml -1 +1
+    M ./genfft/twiddle.mli -1 +1
+    M ./genfft/unique.ml -1 +1
+    M ./genfft/unique.mli -1 +1
+    M ./genfft/util.ml -1 +1
+    M ./genfft/util.mli -1 +1
+    M ./genfft/variable.ml -1 +1
+    M ./genfft/variable.mli -1 +1
+    M ./kernel/align.c -1 +1
+    M ./kernel/alloc.c -1 +1
+    M ./kernel/assert.c -1 +1
+    M ./kernel/awake.c -1 +1
+    M ./kernel/buffered.c -1 +1
+    M ./kernel/cpy1d.c -1 +1
+    M ./kernel/cpy2d-pair.c -1 +1
+    M ./kernel/cpy2d.c -1 +1
+    M ./kernel/ct.c -1 +1
+    M ./kernel/debug.c -1 +1
+    M ./kernel/extract-reim.c -1 +1
+    M ./kernel/hash.c -1 +1
+    M ./kernel/iabs.c -1 +1
+    M ./kernel/ifftw.h -1 +1
+    M ./kernel/kalloc.c -1 +1
+    M ./kernel/md5-1.c -1 +1
+    M ./kernel/md5.c -1 +1
+    M ./kernel/minmax.c -1 +1
+    M ./kernel/ops.c -1 +1
+    M ./kernel/pickdim.c -1 +1
+    M ./kernel/plan.c -1 +1
+    M ./kernel/planner.c -1 +1
+    M ./kernel/primes.c -1 +1
+    M ./kernel/print.c -1 +1
+    M ./kernel/problem.c -1 +1
+    M ./kernel/rader.c -1 +1
+    M ./kernel/scan.c -1 +1
+    M ./kernel/solver.c -1 +1
+    M ./kernel/solvtab.c -1 +1
+    M ./kernel/stride.c -1 +1
+    M ./kernel/tensor.c -1 +1
+    M ./kernel/tensor1.c -1 +1
+    M ./kernel/tensor2.c -1 +1
+    M ./kernel/tensor3.c -1 +1
+    M ./kernel/tensor4.c -1 +1
+    M ./kernel/tensor5.c -1 +1
+    M ./kernel/tensor7.c -1 +1
+    M ./kernel/tensor8.c -1 +1
+    M ./kernel/tensor9.c -1 +1
+    M ./kernel/tile2d.c -1 +1
+    M ./kernel/timer.c -1 +1
+    M ./kernel/transpose.c -1 +1
+    M ./kernel/trig.c -1 +1
+    M ./kernel/twiddle.c -1 +1
+    M ./libbench2/aligned-main.c -1 +1
+    M ./libbench2/bench-main.c -1 +1
+    M ./libbench2/bench-user.h -1 +1
+    M ./libbench2/bench.h -1 +1
+    M ./libbench2/can-do.c -1 +1
+    M ./libbench2/dotens2.c -1 +1
+    M ./libbench2/info.c -1 +1
+    M ./libbench2/main.c -1 +1
+    M ./libbench2/my-getopt.c -1 +1
+    M ./libbench2/my-getopt.h -1 +1
+    M ./libbench2/problem.c -1 +1
+    M ./libbench2/report.c -1 +1
+    M ./libbench2/speed.c -1 +1
+    M ./libbench2/tensor.c -1 +1
+    M ./libbench2/timer.c -1 +1
+    M ./libbench2/useropt.c -1 +1
+    M ./libbench2/util.c -1 +1
+    M ./libbench2/verify-dft.c -1 +1
+    M ./libbench2/verify-lib.c -1 +1
+    M ./libbench2/verify-r2r.c -1 +1
+    M ./libbench2/verify-rdft2.c -1 +1
+    M ./libbench2/verify.c -1 +1
+    M ./libbench2/verify.h -1 +1
+    M ./libbench2/zero.c -1 +1
+    M ./mpi/any-true.c -1 +1
+    M ./mpi/api.c -1 +1
+    M ./mpi/block.c -1 +1
+    M ./mpi/choose-radix.c -1 +1
+    M ./mpi/conf.c -1 +1
+    M ./mpi/dft-problem.c -1 +1
+    M ./mpi/dft-rank-geq2-transposed.c -1 +1
+    M ./mpi/dft-rank-geq2.c -1 +1
+    M ./mpi/dft-rank1-bigvec.c -1 +1
+    M ./mpi/dft-rank1.c -1 +1
+    M ./mpi/dft-serial.c -1 +1
+    M ./mpi/dft-solve.c -1 +1
+    M ./mpi/dtensor.c -1 +1
+    M ./mpi/ifftw-mpi.h -1 +1
+    M ./mpi/mpi-dft.h -1 +1
+    M ./mpi/mpi-rdft.h -1 +1
+    M ./mpi/mpi-rdft2.h -1 +1
+    M ./mpi/mpi-transpose.h -1 +1
+    M ./mpi/rdft-problem.c -1 +1
+    M ./mpi/rdft-rank-geq2-transposed.c -1 +1
+    M ./mpi/rdft-rank-geq2.c -1 +1
+    M ./mpi/rdft-rank1-bigvec.c -1 +1
+    M ./mpi/rdft-serial.c -1 +1
+    M ./mpi/rdft-solve.c -1 +1
+    M ./mpi/rdft2-problem.c -1 +1
+    M ./mpi/rdft2-rank-geq2-transposed.c -1 +1
+    M ./mpi/rdft2-rank-geq2.c -1 +1
+    M ./mpi/rdft2-serial.c -1 +1
+    M ./mpi/rdft2-solve.c -1 +1
+    M ./mpi/rearrange.c -1 +1
+    M ./mpi/testsched.c -1 +1
+    M ./mpi/transpose-alltoall.c -1 +1
+    M ./mpi/transpose-pairwise.c -1 +1
+    M ./mpi/transpose-problem.c -1 +1
+    M ./mpi/transpose-recurse.c -1 +1
+    M ./mpi/transpose-solve.c -1 +1
+    M ./mpi/wisdom-api.c -1 +1
+    M ./rdft/buffered.c -1 +1
+    M ./rdft/buffered2.c -1 +1
+    M ./rdft/codelet-rdft.h -1 +1
+    M ./rdft/conf.c -1 +1
+    M ./rdft/ct-hc2c-direct.c -1 +1
+    M ./rdft/ct-hc2c.c -1 +1
+    M ./rdft/ct-hc2c.h -1 +1
+    M ./rdft/dft-r2hc.c -1 +1
+    M ./rdft/dht-r2hc.c -1 +1
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/direct-r2c.c -1 +1
+    M ./rdft/direct-r2r.c -1 +1
+    M ./rdft/direct2.c -1 +1
+    M ./rdft/generic.c -1 +1
+    M ./rdft/hc2hc-direct.c -1 +1
+    M ./rdft/hc2hc-generic.c -1 +1
+    M ./rdft/hc2hc.c -1 +1
+    M ./rdft/hc2hc.h -1 +1
+    M ./rdft/indirect.c -1 +1
+    M ./rdft/khc2c.c -1 +1
+    M ./rdft/khc2hc.c -1 +1
+    M ./rdft/kr2c.c -1 +1
+    M ./rdft/kr2r.c -1 +1
+    M ./rdft/nop.c -1 +1
+    M ./rdft/nop2.c -1 +1
+    M ./rdft/plan.c -1 +1
+    M ./rdft/plan2.c -1 +1
+    M ./rdft/problem.c -1 +1
+    M ./rdft/problem2.c -1 +1
+    M ./rdft/rank-geq2-rdft2.c -1 +1
+    M ./rdft/rank-geq2.c -1 +1
+    M ./rdft/rank0-rdft2.c -1 +1
+    M ./rdft/rank0.c -1 +1
+    M ./rdft/rdft-dht.c -1 +1
+    M ./rdft/rdft.h -1 +1
+    M ./rdft/rdft2-inplace-strides.c -1 +1
+    M ./rdft/rdft2-rdft.c -1 +1
+    M ./rdft/rdft2-strides.c -1 +1
+    M ./rdft/rdft2-tensor-max-index.c -1 +1
+    M ./rdft/scalar/hb.h -1 +1
+    M ./rdft/scalar/hc2c.c -1 +1
+    M ./rdft/scalar/hc2cb.h -1 +1
+    M ./rdft/scalar/hc2cf.h -1 +1
+    M ./rdft/scalar/hf.h -1 +1
+    M ./rdft/scalar/hfb.c -1 +1
+    M ./rdft/scalar/r2c.c -1 +1
+    M ./rdft/scalar/r2cb.h -1 +1
+    M ./rdft/scalar/r2cbIII.h -1 +1
+    M ./rdft/scalar/r2cf.h -1 +1
+    M ./rdft/scalar/r2cfII.h -1 +1
+    M ./rdft/scalar/r2r.c -1 +1
+    M ./rdft/scalar/r2r.h -1 +1
+    M ./rdft/simd/common/genus.c -1 +1
+    M ./rdft/simd/hc2cbv.h -1 +1
+    M ./rdft/simd/hc2cfv.h -1 +1
+    M ./rdft/solve.c -1 +1
+    M ./rdft/solve2.c -1 +1
+    M ./rdft/vrank-geq1-rdft2.c -1 +1
+    M ./rdft/vrank-geq1.c -1 +1
+    M ./rdft/vrank3-transpose.c -1 +1
+    M ./reodft/conf.c -1 +1
+    M ./reodft/redft00e-r2hc-pad.c -1 +1
+    M ./reodft/redft00e-r2hc.c -1 +1
+    M ./reodft/reodft.h -1 +1
+    M ./reodft/reodft00e-splitradix.c -1 +1
+    M ./reodft/reodft010e-r2hc.c -1 +1
+    M ./reodft/reodft11e-r2hc-odd.c -1 +1
+    M ./reodft/reodft11e-r2hc.c -1 +1
+    M ./reodft/reodft11e-radix2.c -1 +1
+    M ./reodft/rodft00e-r2hc-pad.c -1 +1
+    M ./reodft/rodft00e-r2hc.c -1 +1
+    M ./simd-support/altivec.c -1 +1
+    M ./simd-support/amd64-cpuid.h -1 +1
+    M ./simd-support/avx.c -1 +1
+    M ./simd-support/neon.c -1 +1
+    M ./simd-support/simd-altivec.h -1 +1
+    M ./simd-support/simd-avx.h -1 +1
+    M ./simd-support/simd-common.h -1 +1
+    M ./simd-support/simd-neon.h -1 +1
+    M ./simd-support/simd-sse2.h -1 +1
+    M ./simd-support/sse2-nonportable.c -1 +1
+    M ./simd-support/sse2.c -1 +1
+    M ./simd-support/taint.c -1 +1
+    M ./simd-support/x86-cpuid.h -1 +1
+    M ./simd/altivec.c -1 +1
+    M ./simd/mips_ps.c -1 +1
+    M ./simd/mips_ps.h -1 +1
+    M ./simd/nonportable/sse.c -1 +1
+    M ./simd/nonportable/sse2.c -1 +1
+    M ./simd/simd-altivec.h -1 +1
+    M ./simd/simd-mips_ps.h -1 +1
+    M ./simd/simd-sse.h -1 +1
+    M ./simd/simd-sse2.h -1 +1
+    M ./simd/simd.h -1 +1
+    M ./simd/sse.c -1 +1
+    M ./simd/sse2.c -1 +1
+    M ./simd/taint.c -1 +1
+    M ./simd/x86-cpuid.h -1 +1
+    M ./threads/api.c -1 +1
+    M ./threads/conf.c -1 +1
+    M ./threads/ct.c -1 +1
+    M ./threads/dft-vrank-geq1.c -1 +1
+    M ./threads/f77api.c -1 +1
+    M ./threads/f77funcs.h -1 +1
+    M ./threads/hc2hc.c -1 +1
+    M ./threads/openmp.c -1 +1
+    M ./threads/rdft-vrank-geq1.c -1 +1
+    M ./threads/threads.c -1 +1
+    M ./threads/threads.h -1 +1
+    M ./threads/vrank-geq1-rdft2.c -1 +1
+    M ./tools/fftw-wisdom-to-conf.1 -1 +1
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+    M ./tools/fftw-wisdom.c -1 +1
+    M ./tools/fftw_wisdom.1.in -1 +1
+
+Thu Aug 11 14:54:38 EDT 2011  stevenj@fftw.org
+  * allow specifying TRANSPOSE_{IN/OUT} transpose plans, since libbench does not canonicalize rnk=1 n=1 plans as rnk=0
+
+    M ./mpi/mpi-bench.c -2 +2
+
+Thu Aug 11 14:17:24 EDT 2011  stevenj@fftw.org
+  * check.pl should occasionally check DESTROY_INPUT problems too (especially since those enable slightly different algorithms in MPI)
+
+    M ./tests/check.pl +1
+
+Thu Aug 11 12:37:51 EDT 2011  stevenj@fftw.org
+  * unify post-MPI transpose handling in pairwise and alltoall solvers; should make the former faster in the destroy-input out-of-place case, and the latter more widely applicable
+
+    M ./mpi/mpi-transpose.h +5
+    M ./mpi/transpose-alltoall.c -69 +24
+    M ./mpi/transpose-pairwise.c -66 +97
+
+Mon Aug  8 10:06:14 EDT 2011  athena@fftw.org
+  * Add support for ARM NEON
+
+    M ./Makefile.am -1 +6
+    M ./api/version.c +4
+    M ./configure.ac -5 +23
+    M ./dft/codelet-dft.h +1
+    M ./dft/conf.c +4
+    M ./dft/simd/Makefile.am -1 +1
+    A ./dft/simd/neon/
+    A ./dft/simd/neon/Makefile.am
+    M ./kernel/ifftw.h +2
+    M ./m4/ax_cc_maxopt.m4 +10
+    M ./rdft/codelet-rdft.h +1
+    M ./rdft/conf.c +4
+    M ./rdft/simd/Makefile.am -1 +1
+    A ./rdft/simd/neon/
+    A ./rdft/simd/neon/Makefile.am
+    M ./simd-support/Makefile.am -1 +2
+    A ./simd-support/neon.c
+    M ./simd-support/simd-common.h +3
+    A ./simd-support/simd-neon.h
+
+Fri Aug  5 17:25:32 EDT 2011  stevenj@fftw.org
+  * more C++ paranoia
+
+    M ./tests/fftw-bench.h +10
+
+Fri Aug  5 17:02:00 EDT 2011  stevenj@fftw.org
+  * tentative version bump for 3.3.1
+
+    M ./NEWS +5
+    M ./configure.ac -2 +2
+
+Fri Aug  5 16:52:28 EDT 2011  stevenj@fftw.org
+  * fixes so that MPI code compiles when MPICC is a C++ compiler, even if the serial code is compiled with a C compiler; thanks to Kyle Spyksma for the bug report
+
+    M ./api/api.h +12
+    M ./api/plan-guru-r2r.h -2
+    M ./api/plan-many-r2r.c -2
+    M ./dft/dft.h +9
+    M ./kernel/ifftw.h +9
+    M ./mpi/api.c -3 +1
+    M ./mpi/dtensor.c -1 +1
+    M ./mpi/ifftw-mpi.h -1 +4
+    M ./mpi/mpi-bench.c -1 +1
+    M ./rdft/rdft.h +9
+
+Fri Aug  5 16:04:06 EDT 2011  stevenj@fftw.org
+  * use correct precision in f03-wrap.c, avoiding a (harmless) implicit pointer cast that prevented compilation under C++; thanks to Kyle Spyksma for the bug report
+
+    M ./mpi/genf03-wrap.pl +2
+
+Fri Aug  5 14:04:32 EDT 2011  stevenj@fftw.org
+  * manual typo
+
+    M ./doc/modern-fortran.texi -1 +1
+
+Wed Jul 27 12:50:25 EDT 2011  athena@fftw.org
+  tagged fftw-3.3
+
+
+Tue Jul 26 20:55:45 EDT 2011  athena@fftw.org
+  * Honor WITH_OUR_MALLOC in libbench2
+
+    M ./libbench2/util.c -6 +5
+
+Tue Jul 26 20:27:28 EDT 2011  athena@fftw.org
+  * fixed typo: incorrect name of combined threads library on Windows
+
+    M ./Makefile.am -1 +1
+
+Mon Jul 25 14:38:20 EDT 2011  stevenj@fftw.org
+  * 3.3 version bump & NEWS
+
+    M ./NEWS -5 +16
+    M ./configure.ac -1 +1
+
+Mon Jul 25 14:37:48 EDT 2011  stevenj@fftw.org
+  * use int(..., C_SIZE_T) rather than declaring another variable in the Fortran examples
+
+    M ./doc/modern-fortran.texi -12 +9
+
+Wed Jul 13 05:02:32 EDT 2011  stevenj@fftw.org
+  * typo, thanks to Rhys Ulerich for the comment
+
+    M ./doc/mpi.texi -1 +1
+
+Mon Jul 11 14:39:52 EDT 2011  athena@fftw.org
+  * Fix bug in bubblesort
+  
+  Bubblesort was not sorting.  This was a bug in the benchmark library
+  (not in FFTW per se), and it impacted the benchmark program
+  with --report-time and --report-mflops causing it to output
+  an incorrect value for the median.  (The minimum, maximum, and
+  average value were correct.)  Thanks Dima Baksheev of Intel for
+  reporting this bug.
+  
+
+    M ./libbench2/report.c -2 +2
+
+Fri Jul  8 13:35:59 EDT 2011  stevenj@fftw.org
+  * small manual typos
+
+    M ./doc/modern-fortran.texi -2 +2
+
+Wed Jul  6 10:49:40 EDT 2011  athena@fftw.org
+  * Detection of altivec.h requires $ALTIVEC_CFLAGS
+
+    M ./configure.ac -5 +4
+
+Tue Jul  5 19:58:47 EDT 2011  athena@fftw.org
+  * Introduce fake dependency so that my-getopt.c is recompiled
+  
+  my-getopt.c does not depend on anything, and so it is not rebuilt when
+  reconfiguring for a different ISA (e.g., CC="gcc -m32" vs CC="gcc
+  -m64").  Add a fake dependency on <config.h> so that the file is
+  recompiled.
+  
+  
+
+    M ./libbench2/my-getopt.c +1
+
+Tue Jul  5 18:53:36 EDT 2011  stevenj@fftw.org
+  * support compiling/installing --enable-threads --enable-openmp at the same time, although in this case the test program only uses the threads variety.  Update documentation accordingly, and in general expand the documentation of the OpenMP support
+
+    M ./Makefile.am -1 +1
+    M ./configure.ac -21 +13
+    M ./doc/install.texi -14 +13
+    M ./doc/threads.texi -19 +43
+    M ./mpi/Makefile.am -2 +6
+    M ./tests/Makefile.am -2 +6
+    M ./tests/fftw-bench.c -1 +3
+    M ./threads/Makefile.am -6 +20
+    M ./threads/conf.c -4
+    M ./threads/openmp.c -4
+    M ./threads/threads.c -3
+    M ./tools/Makefile.am -4 +8
+
+Tue Jul  5 16:04:03 EDT 2011  stevenj@fftw.org
+  * call omp_set_num_threads in fftw-bench so that the number of OpenMP threads corresponds with the number of FFTW threads
+
+    M ./tests/fftw-bench.c +5
+
+Tue Jul  5 16:03:06 EDT 2011  stevenj@fftw.org
+  * when --enable-openmp, install as fftw3_omp rather than fftw3_threads, so that both the POSIX threads and OpenMP variants of FFTW can be installed at once
+
+    M ./Makefile.am -1 +1
+    M ./configure.ac +4
+    M ./threads/Makefile.am -4 +4
+
+Sat Jul  2 02:21:22 EDT 2011  stevenj@fftw.org
+  * don't even declare an fftw_execute interface in Fortran, since it is unsafe and we recommend against it anyway; thanks to Arjen Markus for the suggestion
+
+    M ./api/f03api.sh -1 +1
+    M ./doc/modern-fortran.texi -2 +4
+
+Fri Jul  1 14:35:44 EDT 2011  athena@fftw.org
+  * consistently use the order single, double, long double
+
+    M ./support/BUILD-MINGW32.sh -1 +1
+    M ./support/BUILD-MINGW64.sh -1 +1
+
+Wed Jun 29 17:27:06 EDT 2011  athena@fftw.org
+  * MSVC AVX 64-bit detection does not work, punt for now.
+
+    M ./simd-support/amd64-cpuid.h -16 +2
+
+Wed Jun 29 15:52:27 EDT 2011  stevenj@fftw.org
+  * fixed typo, added note on transposed flags for r2c/c2r; thanks to Rhys Ulerich for the suggestions
+
+    M ./doc/mpi.texi -1 +14
+
+Wed Jun 29 09:41:39 EDT 2011  athena@fftw.org
+  * fixes for compiling with MSVC (untested)
+
+    M ./simd-support/amd64-cpuid.h -2 +16
+    M ./simd-support/simd-avx.h -11 +16
+    M ./simd-support/x86-cpuid.h -1 +7
+
+Tue Jun 28 16:48:36 EDT 2011  athena@fftw.org
+  * comment
+
+    M ./simd-support/sse2-nonportable.c +3
+
+Mon Jun 27 21:01:56 EDT 2011  stevenj@fftw.org
+  * rm extraneous line break in HTML output ... I hate texinfo
+
+    M ./doc/reference.texi -6 +3
+
+Mon Jun 27 06:43:57 EDT 2011  athena@fftw.org
+  tagged fftw-3.3-beta1
+
+
+Mon Jun 27 00:47:33 EDT 2011  stevenj@fftw.org
+  * maintainer-clean should delete html directory (otherwwise we keep obsolete HTML files in the dist tarball, sigh)
+
+    M ./doc/Makefile.am -2 +2
+
+Mon Jun 27 00:45:28 EDT 2011  stevenj@fftw.org
+  tagged fftw-3.3-beta1
+
+
+Sun Jun 26 23:36:32 EDT 2011  stevenj@fftw.org
+  * update copyright year in manual
+
+    M ./doc/license.texi -2 +2
+
+Sun Jun 26 22:52:54 EDT 2011  stevenj@fftw.org
+  * whoops, don't dist .f03 headers, since those are built by the user's Makefile
+
+    M ./api/Makefile.am -1 +2
+    M ./mpi/Makefile.am -1 +2
+
+Sun Jun 26 22:43:49 EDT 2011  stevenj@fftw.org
+  * fix embarrassing deadlock/crashing bug in my previous nowisdom_hook fix -- I forgot to handle the case where one process has wisdom and another one doesn't, requiring a nowisdom_hook in the latter case; this should only affect MPI transforms since otherwise these hook functions are NULL
+
+    M ./kernel/ifftw.h +1
+    M ./kernel/planner.c -44 +48
+    M ./mpi/api.c +14
+
+Sun Jun 26 21:02:15 EDT 2011  stevenj@fftw.org
+  * subsubheadings, MPI transpose reference
+
+    M ./doc/mpi.texi -7 +53
+
+Sun Jun 26 20:48:53 EDT 2011  stevenj@fftw.org
+  * add MPI plan reference
+
+    M ./doc/mpi.texi -5 +197
+
+Sun Jun 26 17:07:21 EDT 2011  stevenj@fftw.org
+  * portions of MPI reference docs; tweaks to NEWS
+
+    M ./NEWS -2 +3
+    M ./doc/mpi.texi -12 +264
+
+Sun Jun 26 12:40:43 EDT 2011  stevenj@fftw.org
+  * use $(CHECK_PL_OPTS) more consistently
+
+    M ./tests/Makefile.am -1 +1
+
+Sun Jun 26 10:04:54 EDT 2011  athena@fftw.org
+  * accept \r\n as well as \n.  Grrr...
+
+    M ./tests/check.pl -1 +2
+
+Sun Jun 26 09:52:11 EDT 2011  athena@fftw.org
+  * new configure option --with-incoming-stack-boundary=N
+  
+  This option selects CFLAGS to align the stack at all externally-callable
+  functions.  This currently comprises api/* and threads/*
+  
+
+    M ./api/Makefile.am +1
+    M ./configure.ac +14
+    M ./support/BUILD-MINGW32.sh -1 +1
+    M ./threads/Makefile.am +1
+
+Sun Jun 26 09:51:37 EDT 2011  athena@fftw.org
+  * add -fomit-frame-pointer back
+  
+  Somehow -O3 does not imply -fomit-frame-pointer on ia32
+  
+
+    M ./m4/ax_cc_maxopt.m4 -2 +3
+
+Sun Jun 26 07:20:27 EDT 2011  athena@fftw.org
+  * Note that removal of mips-ps is temporary.
+
+    M ./NEWS -3 +5
+
+Sat Jun 25 23:15:03 EDT 2011  stevenj@fftw.org
+  * update copyright year
+
+    M ./COPYRIGHT -2 +2
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -2 +2
+    M ./api/configure.c -2 +2
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute-split-dft-c2r.c -2 +2
+    M ./api/execute-split-dft-r2c.c -2 +2
+    M ./api/execute-split-dft.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -2 +2
+    M ./api/f77api.c -2 +2
+    M ./api/f77funcs.h -2 +2
+    M ./api/fftw3.h -2 +2
+    M ./api/flops.c -2 +2
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./api/import-wisdom.c -2 +2
+    M ./api/malloc.c -2 +2
+    M ./api/map-r2r-kind.c -2 +2
+    M ./api/mapflags.c -2 +2
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.h -2 +2
+    M ./api/mktensor-rowmajor.c -2 +2
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -2 +2
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -2 +2
+    M ./api/plan-dft.c -2 +2
+    M ./api/plan-guru-dft-c2r.h -2 +2
+    M ./api/plan-guru-dft-r2c.h -2 +2
+    M ./api/plan-guru-dft.h -2 +2
+    M ./api/plan-guru-r2r.h -2 +2
+    M ./api/plan-guru-split-dft-c2r.h -2 +2
+    M ./api/plan-guru-split-dft-r2c.h -2 +2
+    M ./api/plan-guru-split-dft.h -2 +2
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-dft.c -2 +2
+    M ./api/plan-many-r2r.c -2 +2
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -2 +2
+    M ./api/print-plan.c -2 +2
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -2 +2
+    M ./api/version.c -2 +2
+    M ./api/x77.h -2 +2
+    M ./dft/bluestein.c -2 +2
+    M ./dft/buffered.c -2 +2
+    M ./dft/codelet-dft.h -2 +2
+    M ./dft/conf.c -2 +2
+    M ./dft/ct.c -2 +2
+    M ./dft/ct.h -2 +2
+    M ./dft/dft.h -2 +2
+    M ./dft/dftw-direct.c -2 +2
+    M ./dft/dftw-directsq.c -2 +2
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect-transpose.c -2 +2
+    M ./dft/indirect.c -2 +2
+    M ./dft/kdft-dif.c -2 +2
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/kdft-dit.c -2 +2
+    M ./dft/kdft.c -2 +2
+    M ./dft/nop.c -2 +2
+    M ./dft/plan.c -2 +2
+    M ./dft/problem.c -2 +2
+    M ./dft/rader.c -2 +2
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/scalar/n.c -2 +2
+    M ./dft/scalar/n.h -2 +2
+    M ./dft/scalar/t.c -2 +2
+    M ./dft/scalar/t.h -2 +2
+    M ./dft/simd/common/genus.c -2 +2
+    M ./dft/simd/n1b.h -2 +2
+    M ./dft/simd/n1f.h -2 +2
+    M ./dft/simd/n2b.h -2 +2
+    M ./dft/simd/n2f.h -2 +2
+    M ./dft/simd/n2s.h -2 +2
+    M ./dft/simd/q1b.h -2 +2
+    M ./dft/simd/q1f.h -2 +2
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1bu.h -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/simd/t1fu.h -2 +2
+    M ./dft/simd/t2b.h -2 +2
+    M ./dft/simd/t2f.h -2 +2
+    M ./dft/simd/t3b.h -2 +2
+    M ./dft/simd/t3f.h -2 +2
+    M ./dft/simd/ts.h -2 +2
+    M ./dft/solve.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./dft/zero.c -2 +2
+    M ./doc/f77_wisdom.f -2 +2
+    M ./genfft/algsimp.ml -2 +2
+    M ./genfft/algsimp.mli -2 +2
+    M ./genfft/annotate.ml -2 +2
+    M ./genfft/annotate.mli -2 +2
+    M ./genfft/assoctable.ml -2 +2
+    M ./genfft/assoctable.mli -2 +2
+    M ./genfft/c.ml -2 +2
+    M ./genfft/c.mli -2 +2
+    M ./genfft/complex.ml -2 +2
+    M ./genfft/complex.mli -2 +2
+    M ./genfft/conv.ml -2 +2
+    M ./genfft/conv.mli -2 +2
+    M ./genfft/dag.ml -2 +2
+    M ./genfft/dag.mli -2 +2
+    M ./genfft/expr.ml -2 +2
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/fft.ml -2 +2
+    M ./genfft/fft.mli -2 +2
+    M ./genfft/gen_hc2c.ml -2 +2
+    M ./genfft/gen_hc2cdft.ml -2 +2
+    M ./genfft/gen_hc2cdft_c.ml -2 +2
+    M ./genfft/gen_hc2hc.ml -2 +2
+    M ./genfft/gen_mdct.ml -2 +2
+    M ./genfft/gen_notw.ml -2 +2
+    M ./genfft/gen_notw_c.ml -2 +2
+    M ./genfft/gen_r2cb.ml -2 +2
+    M ./genfft/gen_r2cf.ml -2 +2
+    M ./genfft/gen_r2r.ml -2 +2
+    M ./genfft/gen_twiddle.ml -2 +2
+    M ./genfft/gen_twiddle_c.ml -2 +2
+    M ./genfft/gen_twidsq.ml -2 +2
+    M ./genfft/gen_twidsq_c.ml -2 +2
+    M ./genfft/genutil.ml -2 +2
+    M ./genfft/littlesimp.ml -2 +2
+    M ./genfft/littlesimp.mli -2 +2
+    M ./genfft/magic.ml -2 +2
+    M ./genfft/monads.ml -2 +2
+    M ./genfft/number.ml -2 +2
+    M ./genfft/number.mli -2 +2
+    M ./genfft/oracle.ml -2 +2
+    M ./genfft/oracle.mli -2 +2
+    M ./genfft/schedule.ml -2 +2
+    M ./genfft/schedule.mli -2 +2
+    M ./genfft/simd.ml -2 +2
+    M ./genfft/simd.mli -2 +2
+    M ./genfft/simdmagic.ml -2 +2
+    M ./genfft/to_alist.ml -2 +2
+    M ./genfft/to_alist.mli -2 +2
+    M ./genfft/trig.ml -2 +2
+    M ./genfft/trig.mli -2 +2
+    M ./genfft/twiddle.ml -2 +2
+    M ./genfft/twiddle.mli -2 +2
+    M ./genfft/unique.ml -2 +2
+    M ./genfft/unique.mli -2 +2
+    M ./genfft/util.ml -2 +2
+    M ./genfft/util.mli -2 +2
+    M ./genfft/variable.ml -2 +2
+    M ./genfft/variable.mli -2 +2
+    M ./kernel/align.c -2 +2
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/assert.c -2 +2
+    M ./kernel/awake.c -2 +2
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/cpy1d.c -2 +2
+    M ./kernel/cpy2d-pair.c -2 +2
+    M ./kernel/cpy2d.c -2 +2
+    M ./kernel/ct.c -2 +2
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/debug.c -2 +2
+    M ./kernel/extract-reim.c -2 +2
+    M ./kernel/hash.c -2 +2
+    M ./kernel/iabs.c -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/kalloc.c -2 +2
+    M ./kernel/md5-1.c -2 +2
+    M ./kernel/md5.c -2 +2
+    M ./kernel/minmax.c -2 +2
+    M ./kernel/ops.c -2 +2
+    M ./kernel/pickdim.c -2 +2
+    M ./kernel/plan.c -2 +2
+    M ./kernel/primes.c -2 +2
+    M ./kernel/print.c -2 +2
+    M ./kernel/problem.c -2 +2
+    M ./kernel/rader.c -2 +2
+    M ./kernel/scan.c -2 +2
+    M ./kernel/solver.c -2 +2
+    M ./kernel/solvtab.c -2 +2
+    M ./kernel/stride.c -2 +2
+    M ./kernel/tensor.c -2 +2
+    M ./kernel/tensor1.c -2 +2
+    M ./kernel/tensor2.c -2 +2
+    M ./kernel/tensor3.c -2 +2
+    M ./kernel/tensor4.c -2 +2
+    M ./kernel/tensor5.c -2 +2
+    M ./kernel/tensor7.c -2 +2
+    M ./kernel/tensor8.c -2 +2
+    M ./kernel/tensor9.c -2 +2
+    M ./kernel/tile2d.c -2 +2
+    M ./kernel/timer.c -2 +2
+    M ./kernel/transpose.c -2 +2
+    M ./kernel/trig.c -2 +2
+    M ./kernel/twiddle.c -2 +2
+    M ./libbench2/dotens2.c -2 +2
+    M ./libbench2/my-getopt.c -2 +2
+    M ./libbench2/my-getopt.h -2 +2
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -2 +2
+    M ./libbench2/verify-r2r.c -2 +2
+    M ./libbench2/verify-rdft2.c -2 +2
+    M ./libbench2/verify.h -2 +2
+    M ./mpi/any-true.c -2 +2
+    M ./mpi/api.c -2 +2
+    M ./mpi/block.c -2 +2
+    M ./mpi/choose-radix.c -2 +2
+    M ./mpi/conf.c -2 +2
+    M ./mpi/dft-problem.c -2 +2
+    M ./mpi/dft-rank-geq2-transposed.c -2 +2
+    M ./mpi/dft-rank-geq2.c -2 +2
+    M ./mpi/dft-rank1-bigvec.c -2 +2
+    M ./mpi/dft-rank1.c -2 +2
+    M ./mpi/dft-serial.c -2 +2
+    M ./mpi/dft-solve.c -2 +2
+    M ./mpi/dtensor.c -2 +2
+    M ./mpi/fftw3-mpi.h -2 +2
+    M ./mpi/ifftw-mpi.h -2 +2
+    M ./mpi/mpi-dft.h -2 +2
+    M ./mpi/mpi-rdft.h -2 +2
+    M ./mpi/mpi-rdft2.h -2 +2
+    M ./mpi/mpi-transpose.h -2 +2
+    M ./mpi/rdft-problem.c -2 +2
+    M ./mpi/rdft-rank-geq2-transposed.c -2 +2
+    M ./mpi/rdft-rank-geq2.c -2 +2
+    M ./mpi/rdft-rank1-bigvec.c -2 +2
+    M ./mpi/rdft-serial.c -2 +2
+    M ./mpi/rdft-solve.c -2 +2
+    M ./mpi/rdft2-problem.c -2 +2
+    M ./mpi/rdft2-rank-geq2-transposed.c -2 +2
+    M ./mpi/rdft2-rank-geq2.c -2 +2
+    M ./mpi/rdft2-serial.c -2 +2
+    M ./mpi/rdft2-solve.c -2 +2
+    M ./mpi/rearrange.c -2 +2
+    M ./mpi/testsched.c -1 +1
+    M ./mpi/transpose-alltoall.c -2 +2
+    M ./mpi/transpose-pairwise.c -2 +2
+    M ./mpi/transpose-problem.c -2 +2
+    M ./mpi/transpose-recurse.c -2 +2
+    M ./mpi/transpose-solve.c -2 +2
+    M ./mpi/wisdom-api.c -2 +2
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/codelet-rdft.h -2 +2
+    M ./rdft/conf.c -2 +2
+    M ./rdft/ct-hc2c-direct.c -2 +2
+    M ./rdft/ct-hc2c.c -2 +2
+    M ./rdft/ct-hc2c.h -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/direct-r2c.c -2 +2
+    M ./rdft/direct-r2r.c -2 +2
+    M ./rdft/direct2.c -2 +2
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-direct.c -2 +2
+    M ./rdft/hc2hc-generic.c -2 +2
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/hc2hc.h -2 +2
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/khc2c.c -2 +2
+    M ./rdft/khc2hc.c -2 +2
+    M ./rdft/kr2c.c -2 +2
+    M ./rdft/kr2r.c -2 +2
+    M ./rdft/nop.c -2 +2
+    M ./rdft/nop2.c -2 +2
+    M ./rdft/plan.c -2 +2
+    M ./rdft/plan2.c -2 +2
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rank0-rdft2.c -2 +2
+    M ./rdft/rank0.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-inplace-strides.c -2 +2
+    M ./rdft/rdft2-rdft.c -2 +2
+    M ./rdft/rdft2-strides.c -2 +2
+    M ./rdft/rdft2-tensor-max-index.c -2 +2
+    M ./rdft/scalar/hb.h -2 +2
+    M ./rdft/scalar/hc2c.c -2 +2
+    M ./rdft/scalar/hc2cb.h -2 +2
+    M ./rdft/scalar/hc2cf.h -2 +2
+    M ./rdft/scalar/hf.h -2 +2
+    M ./rdft/scalar/hfb.c -2 +2
+    M ./rdft/scalar/r2c.c -2 +2
+    M ./rdft/scalar/r2cb.h -2 +2
+    M ./rdft/scalar/r2cbIII.h -2 +2
+    M ./rdft/scalar/r2cf.h -2 +2
+    M ./rdft/scalar/r2cfII.h -2 +2
+    M ./rdft/scalar/r2r.c -2 +2
+    M ./rdft/scalar/r2r.h -2 +2
+    M ./rdft/simd/common/genus.c -2 +2
+    M ./rdft/simd/hc2cbv.h -2 +2
+    M ./rdft/simd/hc2cfv.h -2 +2
+    M ./rdft/solve.c -2 +2
+    M ./rdft/solve2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./reodft/conf.c -2 +2
+    M ./reodft/redft00e-r2hc-pad.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft.h -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./simd/altivec.c -2 +2
+    M ./simd/nonportable/sse.c -2 +2
+    M ./simd/nonportable/sse2.c -2 +2
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse.h -2 +2
+    M ./simd/simd-sse2.h -2 +2
+    M ./simd/simd.h -2 +2
+    M ./simd/sse.c -2 +2
+    M ./simd/sse2.c -2 +2
+    M ./simd/taint.c -2 +2
+    M ./simd/x86-cpuid.h -2 +2
+    M ./simd-support/altivec.c -2 +2
+    M ./simd-support/amd64-cpuid.h -2 +2
+    M ./simd-support/avx.c -2 +2
+    M ./simd-support/simd-altivec.h -2 +2
+    M ./simd-support/simd-avx.h -2 +2
+    M ./simd-support/simd-common.h -2 +2
+    M ./simd-support/simd-sse2.h -2 +2
+    M ./simd-support/sse2-nonportable.c -2 +2
+    M ./simd-support/sse2.c -2 +2
+    M ./simd-support/taint.c -2 +2
+    M ./simd-support/x86-cpuid.h -2 +2
+    M ./threads/api.c -2 +2
+    M ./threads/conf.c -2 +2
+    M ./threads/ct.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/f77api.c -2 +2
+    M ./threads/f77funcs.h -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/openmp.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/threads.c -2 +2
+    M ./threads/threads.h -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+    M ./tools/fftw-wisdom-to-conf.1 -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+    M ./tools/fftw-wisdom.c -2 +2
+    M ./tools/fftw_wisdom.1.in -4 +4
+
+Sat Jun 25 21:33:13 EDT 2011  stevenj@fftw.org
+  * updated NEWS
+
+    M ./NEWS -4 +53
+
+Sat Jun 25 20:29:55 EDT 2011  stevenj@fftw.org
+  * fixes to Fortran interface and docs
+
+    M ./api/genf03.pl -12 +22
+    M ./doc/modern-fortran.texi -5 +14
+    M ./doc/mpi.texi -13 +16
+
+Sat Jun 25 17:43:31 EDT 2011  stevenj@fftw.org
+  * initial stab at MPI Fortran docs
+
+    M ./doc/mpi.texi +138
+
+Sat Jun 25 16:43:31 EDT 2011  stevenj@fftw.org
+  * correct description of what MPI standard says about I/O (I can't believe this crap)
+
+    M ./doc/mpi.texi -11 +21
+
+Sat Jun 25 15:14:07 EDT 2011  stevenj@fftw.org
+  * more MPI documentation; mention `fftw_alloc' functions earlier in the manual
+
+    M ./doc/fftw3.texi +12
+    M ./doc/mpi.texi -30 +158
+    M ./doc/other.texi +9
+    M ./doc/tutorial.texi -11 +14
+
+Sat Jun 25 13:40:19 EDT 2011  stevenj@fftw.org
+  * clarification about --enable-sse2
+
+    M ./doc/install.texi -6 +8
+
+Sat Jun 25 13:31:25 EDT 2011  athena@fftw.org
+  * Update mingw build scripts for fftw-3.3
+
+    M ./kernel/kalloc.c -2 +2
+    M ./support/BUILD-MINGW32.sh -3 +3
+    M ./support/BUILD-MINGW64.sh -4 +4
+
+Sat Jun 25 08:52:13 EDT 2011  athena@fftw.org
+  * Fix typo: EXTRADIST => EXTRA_DIST
+
+    M ./mpi/Makefile.am -1 +1
+
+Fri Jun 24 23:52:19 EDT 2011  stevenj@fftw.org
+  * finished draft "modern fortran" chapter
+
+    M ./doc/modern-fortran.texi -9 +91
+
+Fri Jun 24 20:47:49 EDT 2011  stevenj@fftw.org
+  * include FFTW_EXTERN prototypes for wrappers, so that they are properly exported to DLLs on Windows (sigh)
+
+    M ./mpi/f03-wrap.sh +4
+
+Fri Jun 24 16:52:30 EDT 2011  athena@fftw.org
+  * use malloc() instead of alloca() for large buffers
+  
+  The proximate cause for this patch is that OpenBSD/i386 reserves 256KB
+  stack size per thread.  We were allocating a buffer of size
+  128*130*sizeof(fftw_complex) that exceeds the stack.
+  
+  While 128*130*sizeof(fftw_complex) = 260KiB is the worst case for
+  normal configurations, it is a good idea to limit stack allocation
+  just in case.  Also, the generic solver might in principle generate
+  unbounded buffers, even though it is normally disabled for n > 137.
+  
+  So, as an added precaution, we now never stack-allocate buffers larger
+  than 64KiB, which ought to be enough for anybody.
+  
+
+    M ./dft/dftw-direct.c -2 +3
+    M ./dft/direct.c -2 +3
+    M ./dft/generic.c -2 +3
+    M ./kernel/ifftw.h -7 +31
+    M ./rdft/ct-hc2c-direct.c -2 +3
+    M ./rdft/direct-r2c.c -2 +3
+    M ./rdft/generic.c -4 +6
+    M ./rdft/hc2hc-direct.c -2 +3
+
+Fri Jun 24 16:32:30 EDT 2011  stevenj@fftw.org
+  * don't imply that AVX is available on Pentium III; note that MIPS Paired Single is currently only in FFTW 3.2.x
+
+    M ./doc/install.texi -3 +2
+    M ./doc/other.texi -3 +4
+
+Fri Jun 24 16:05:27 EDT 2011  stevenj@fftw.org
+  * silence annoying gfortran warnings
+
+    M ./api/genf03.pl -1 +6
+    M ./configure.ac +14
+    M ./mpi/Makefile.am -3 +6
+
+Fri Jun 24 14:59:30 EDT 2011  stevenj@fftw.org
+  * a couple MPI Fortran 2003 fixes; changed MPI flags to not use 1<<31 since Fortran (not having unsigned integers) does not allow us to declare that constant in a portable way
+
+    M ./mpi/api.c -1 +1
+    M ./mpi/f03api.sh -2 +2
+    M ./mpi/fftw3-mpi.h -4 +4
+
+Fri Jun 24 15:05:05 EDT 2011  athena@fftw.org
+  * Fix libtool shared version info.
+  
+  FFTW-3.3.x should be a direct drop-in replacement for all FFTW-3.x.y
+  versions.
+  
+
+    M ./configure.ac -1 +4
+
+Fri Jun 24 14:38:47 EDT 2011  stevenj@fftw.org
+  * add MPI Fortran API and wrappers
+
+    M ./api/Makefile.am -2 +3
+    M ./api/f03api.sh -3 +5
+    M ./api/genf03.pl -2 +14
+    M ./mpi/Makefile.am -3 +14
+    R ./mpi/README
+    A ./mpi/f03-wrap.sh
+    A ./mpi/f03api.sh
+    M ./mpi/fftw3-mpi.h -4 +4
+    A ./mpi/genf03-wrap.pl
+
+Fri Jun 24 14:51:12 EDT 2011  athena@fftw.org
+  * Do not require fig2dev on the user's machine
+  
+  Distribute the manual's figures in PDF/PS/PNG form instead.
+
+    M ./doc/Makefile.am -10 +16
+
+Fri Jun 24 11:52:44 EDT 2011  athena@fftw.org
+  * Remove --enable-portable-binary, --with-gcc-arch from documentation.
+
+    M ./NEWS +3
+    M ./doc/install.texi -23 +2
+
+Fri Jun 24 11:48:48 EDT 2011  athena@fftw.org
+  * Forget about specifying nonportable CFLAGS.  Let the user do it if he wants.
+
+    M ./m4/Makefile.am -4 +3
+    M ./m4/ax_cc_maxopt.m4 -24 +10
+    R ./m4/ax_gcc_archflag.m4
+    R ./m4/ax_gcc_x86_cpuid.m4
+
+Fri Jun 24 11:48:25 EDT 2011  athena@fftw.org
+  * Add "-avx" to version string when appropriate.
+
+    M ./api/version.c +4
+
+Fri Jun 24 10:26:38 EDT 2011  athena@fftw.org
+  * change 3.3-alpha => 3.3-beta1
+
+    M ./configure.ac -1 +1
+
+Fri Jun 24 09:25:49 EDT 2011  athena@fftw.org
+  * Extend OUR_MALLOC16 to larger alignments
+  
+  Make it work for 32-byte alignment and beyond, as needed by AVX.
+  Rename --with-our-malloc16 to --with-our-malloc.  Keep old --with-our-malloc16
+  flag for compatibility.
+  
+
+    M ./configure.ac -3 +4
+    M ./kernel/kalloc.c -9 +10
+
+Fri Jun 24 09:19:38 EDT 2011  athena@fftw.org
+  * Fix typo
+
+    M ./doc/install.texi -1 +1
+
+Fri Jun 24 09:10:26 EDT 2011  athena@fftw.org
+  * One pass over the manual.
+
+    M ./NEWS -1 +6
+    M ./doc/Makefile.am -5 +2
+    M ./doc/install.texi -16 +10
+    M ./doc/intro.texi -1 +1
+    M ./doc/other.texi -2 +2
+    M ./doc/reference.texi -5 +5
+    M ./doc/tutorial.texi -32 +32
+
+Fri Jun 24 08:19:03 EDT 2011  athena@fftw.org
+  * eliminate the WITH_ALIGNED_STACK hack
+  
+  This is 2011 and I have no system with incorrect stack alignment.
+  
+
+    M ./TODO -2
+    M ./api/apiplan.c -9 +4
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute-split-dft-c2r.c -2 +2
+    M ./api/execute-split-dft-r2c.c -2 +2
+    M ./api/execute-split-dft.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/f77funcs.h -16 +16
+    M ./doc/Makefile.am -2 +2
+    M ./doc/fftw3.texi -1 +1
+    M ./doc/intro.texi -3 +4
+    M ./doc/legacy-fortran.texi -1 +1
+    M ./doc/other.texi -51 +4
+    M ./doc/reference.texi -1 +1
+    M ./doc/tutorial.texi -9 +5
+    M ./kernel/ifftw.h -51
+    M ./threads/ct.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+
+Fri Jun 24 07:49:47 EDT 2011  athena@fftw.org
+  * enable both threaded and unthreaded wisdom in tools/fftw-wisdom
+
+    M ./TODO -7 +2
+    M ./tools/fftw-wisdom.c -18 +14
+
+Fri Jun 24 02:40:04 EDT 2011  stevenj@fftw.org
+  * clarification
+
+    M ./doc/modern-fortran.texi -2 +2
+
+Fri Jun 24 02:24:01 EDT 2011  stevenj@fftw.org
+  * check for error code in example
+
+    M ./doc/modern-fortran.texi +2
+
+Fri Jun 24 02:22:18 EDT 2011  stevenj@fftw.org
+  * cleanup - since NATIVE_MALLOC is always malloc, delete this #define
+
+    M ./api/export-wisdom-to-string.c -1 +1
+    M ./kernel/ifftw.h -2
+
+Fri Jun 24 02:19:44 EDT 2011  stevenj@fftw.org
+  * document wisdom string import/export in Fortran
+
+    M ./doc/modern-fortran.texi +49
+
+Fri Jun 24 02:11:40 EDT 2011  stevenj@fftw.org
+  * bug fix - NATIVE_MALLOC should always be plain malloc, even in debug_malloc mode, because it is used in the API to return things that should be deallocated with free(); correspondingly, be sure to use free() ansd not X(free) with this
+
+    M ./kernel/ifftw.h -1 +1
+    M ./mpi/wisdom-api.c -1 +1
+
+Fri Jun 24 01:35:27 EDT 2011  stevenj@fftw.org
+  * declaration style
+
+    M ./doc/modern-fortran.texi -2 +4
+
+Fri Jun 24 01:25:36 EDT 2011  stevenj@fftw.org
+  * document wisdom file export/import from Fortran; add export/import_to/from_filename functions for convenience 
+
+    M ./api/export-wisdom-to-file.c +11
+    M ./api/fftw3.h +2
+    M ./api/import-wisdom-from-file.c +10
+    M ./doc/modern-fortran.texi -9 +79
+    M ./doc/other.texi -5 +6
+    M ./doc/reference.texi -11 +22
+
+Thu Jun 23 19:19:43 EDT 2011  stevenj@fftw.org
+  * more fortran docs
+
+    M ./doc/legacy-fortran.texi -5 +6
+    M ./doc/modern-fortran.texi -10 +167
+    M ./doc/reference.texi +2
+
+Thu Jun 23 17:50:30 EDT 2011  stevenj@fftw.org
+  * enforce 132-character line-length limit that is the default in Fortran
+
+    M ./api/genf03.pl -3 +26
+
+Wed Jun 22 23:27:31 EDT 2011  stevenj@fftw.org
+  * the F03 standard is ambiguous about whether types can be assigned to wider types as formal parameters with VALUE attributes, and e.g. gfortran interprets it to disallow this code
+
+    M ./doc/modern-fortran.texi -2 +4
+
+Thu Jun 23 18:12:10 EDT 2011  athena@fftw.org
+  * Add md5 hash of fftw's configuration to wisdom file
+  
+  People were already confused by threaded vs unthreaded wisdom, and now
+  things will be even worse because we enable/disable AVX codelets at
+  runtime.  Accept incoming wisdom only if it was produced by the same
+  configuration (modulo MD5).
+
+    M ./kernel/planner.c -2 +33
+
+Thu Jun 23 09:01:27 EDT 2011  athena@fftw.org
+  * distribute fftw3.f03.in
+
+    M ./api/Makefile.am -1 +1
+
+Wed Jun 22 22:02:18 EDT 2011  stevenj@fftw.org
+  * more Fortran documentation
+
+    M ./doc/modern-fortran.texi -1 +95
+
+Wed Jun 22 20:10:39 EDT 2011  stevenj@fftw.org
+  * correct comment
+
+    M ./api/genf03.pl -1 +2
+
+Wed Jun 22 20:26:18 EDT 2011  athena@fftw.org
+  * Use "sh FOO.sh" instead of "./FOO.sh" to avoid chmod +x.
+
+    M ./api/Makefile.am -1 +1
+
+Wed Jun 22 19:19:05 EDT 2011  stevenj@fftw.org
+  * document fftw_alloc_real/complex ... should we switch to using these in the tutorial examples?
+
+    M ./doc/reference.texi +19
+
+Wed Jun 22 19:07:49 EDT 2011  stevenj@fftw.org
+  * whoops, added missing file
+
+    A ./api/genf03.pl
+
+Wed Jun 22 18:46:01 EDT 2011  athena@fftw.org
+  * Note addition of AVX.
+
+    M ./NEWS +2
+
+Wed Jun 22 18:43:43 EDT 2011  athena@fftw.org
+  * In SSE2, AVX: use FMA macros when applicable.
+  
+  Makes it easier to play with fma4 and fma3 when it comes out.
+
+    M ./simd-support/simd-avx.h -21 +21
+    M ./simd-support/simd-sse2.h -19 +19
+
+Wed Jun 22 18:16:45 EDT 2011  stevenj@fftw.org
+  * all modern Fortran compilers can call FFTW's C interfface directly -- support this, and in particular generate a Fortran 2003 interface file from fftw3.h so that Fortran code calling FFTW can be typechecked ((addressing the source of a lot of Fortran-user problems)
+
+     ./doc/fortran.texi -> ./doc/legacy-fortran.texi
+    M ./api/Makefile.am -3 +12
+    A ./api/f03api.sh
+    M ./api/fftw3.h -3 +11
+    M ./api/malloc.c +18
+    M ./configure.ac +12
+    M ./doc/Makefile.am -1 +1
+    M ./doc/fftw3.texi -2 +4
+    M ./doc/install.texi -2 +2
+    M ./doc/intro.texi -5 +6
+    M ./doc/legacy-fortran.texi -23 +38
+    A ./doc/modern-fortran.texi
+    M ./doc/mpi.texi -1 +1
+    M ./doc/other.texi -2 +4
+    M ./doc/upgrading.texi -1 +1
+
+Wed Jun 22 13:10:02 EDT 2011  athena@fftw.org
+  * some cleanup of SSE2 macros
+
+    M ./simd-support/simd-sse2.h -27 +16
+
+Wed Jun 22 07:38:18 EDT 2011  athena@fftw.org
+  * don't use -xHost on ICC
+  
+  -xHost with ICC is problematic.  On icc-12.0.0, "-mavx -xHost"
+  overrides -mavx with -xHost, generating SSE2 code instead of AVX code.
+  ICC does not seem to support -mtune=host or equivalent non-ABI
+  changing flag.
+  
+
+    M ./m4/ax_cc_maxopt.m4 -31 +5
+
+Tue Jun 21 20:35:36 EDT 2011  athena@fftw.org
+  * Complete AVX implementation for split codelets
+
+    M ./configure.ac -36 +38
+    M ./simd-support/simd-avx.h -41 +26
+
+Tue Jun 21 19:37:14 EDT 2011  stevenj@fftw.org
+  * whoops, missing altivec conf patches
+
+    M ./dft/codelet-dft.h +1
+    M ./dft/conf.c +4
+    M ./rdft/codelet-rdft.h +1
+    M ./rdft/conf.c +4
+
+Tue Jun 21 19:12:45 EDT 2011  stevenj@fftw.org
+  * some BSD ar versions (e.g. on MacOS X) give an error if there are no object files, so we cannot build empty libraries
+
+    M ./dft/simd/altivec/Makefile.am -4 +3
+    M ./dft/simd/avx/Makefile.am -3 +4
+    M ./dft/simd/sse2/Makefile.am -4 +3
+    M ./rdft/simd/altivec/Makefile.am -4 +3
+    M ./rdft/simd/avx/Makefile.am -3 +4
+    M ./rdft/simd/sse2/Makefile.am -3 +4
+
+Tue Jun 21 19:12:12 EDT 2011  stevenj@fftw.org
+  * re-insertion of Altivec code
+
+    M ./Makefile.am -1 +6
+    M ./configure.ac -8 +13
+    M ./dft/simd/Makefile.am -1 +1
+    A ./dft/simd/altivec/
+    A ./dft/simd/altivec/Makefile.am
+    M ./kernel/ifftw.h -2 +9
+    M ./rdft/simd/Makefile.am -1 +1
+    A ./rdft/simd/altivec/
+    A ./rdft/simd/altivec/Makefile.am
+    M ./simd-support/Makefile.am -1 +1
+    A ./simd-support/altivec.c
+    A ./simd-support/simd-altivec.h
+    M ./simd-support/simd-common.h +3
+
+Tue Jun 21 16:26:09 EDT 2011  athena@fftw.org
+  * Implement faster AVX loads/stores.
+
+    M ./simd-support/simd-avx.h -21 +28
+
+Tue Jun 21 16:03:24 EDT 2011  athena@fftw.org
+  * Initial AVX256/single implementation
+  
+  This should be correct but slow.  I need to figure out how to implement
+  noncontiguous loads/stores efficiently.
+
+    M ./simd-support/simd-avx.h -53 +133
+
+Tue Jun 21 14:13:57 EDT 2011  athena@fftw.org
+  * fix AVX alignment
+
+    M ./simd-support/simd-common.h -6 +8
+
+Tue Jun 21 14:07:28 EDT 2011  athena@fftw.org
+  * rename avx256d -> avx
+  
+  AVX will work in both double and single precision, like SSE2.
+
+     ./dft/simd/avx256d -> ./dft/simd/avx
+     ./rdft/simd/avx256d -> ./rdft/simd/avx
+     ./simd-support/avx256d.c -> ./simd-support/avx.c
+     ./simd-support/simd-avx256d.h -> ./simd-support/simd-avx.h
+    M ./Makefile.am -4 +4
+    M ./configure.ac -11 +6
+    M ./dft/codelet-dft.h -1 +1
+    M ./dft/conf.c -3 +3
+    M ./dft/simd/Makefile.am -1 +1
+    M ./dft/simd/avx/Makefile.am -4 +4
+    M ./kernel/ifftw.h -2 +2
+    M ./rdft/codelet-rdft.h -1 +1
+    M ./rdft/conf.c -3 +3
+    M ./rdft/simd/Makefile.am -1 +1
+    M ./rdft/simd/avx/Makefile.am -4 +4
+    M ./simd-support/Makefile.am -1 +1
+    M ./simd-support/avx.c -3 +3
+    M ./simd-support/simd-avx.h -2 +2
+
+Tue Jun 21 13:52:20 EDT 2011  athena@fftw.org
+  * remove CODELET_OPTIM
+  
+  In the old 32-bit gcc-3.x days we used to play games with gcc to force
+  it to produce decent code.  Now gcc has gotten smarter and it produces
+  indecent code no matter what we do, so it is safe to remove these hacks.
+  
+
+    M ./api/version.c -5 +5
+    M ./configure.ac -40
+    M ./support/Makefile.codelets -6
+
+Tue Jun 21 09:57:31 EDT 2011  athena@fftw.org
+  * work around gcc/icc quirks
+
+    M ./simd-support/simd-avx256d.h -14 +45
+
+Tue Jun 21 09:56:07 EDT 2011  athena@fftw.org
+  * Add remarks in places where we work around gcc quirks
+
+    M ./simd-support/simd-sse2.h -7 +16
+
+Mon Jun 20 21:17:59 EDT 2011  stevenj@fftw.org
+  * remove the libbench directory (which we have kept lingering in the repository for years due to CVS's inability to remove directories)
+
+    R ./libbench/Makefile.am
+    R ./libbench/accopy-from.c
+    R ./libbench/accopy-to.c
+    R ./libbench/acopy.c
+    R ./libbench/allocate.c
+    R ./libbench/ascale.c
+    R ./libbench/aset.c
+    R ./libbench/bench-main.c
+    R ./libbench/bench-user.h
+    R ./libbench/bench.h
+    R ./libbench/caadd.c
+    R ./libbench/cacopy.c
+    R ./libbench/can-do.c
+    R ./libbench/cascale.c
+    R ./libbench/caset.c
+    R ./libbench/casub.c
+    R ./libbench/ccopy-from.c
+    R ./libbench/ccopy-to.c
+    R ./libbench/copy-c2c-from.c
+    R ./libbench/copy-c2c-to.c
+    R ./libbench/copy-c2h-1d-fftpack.c
+    R ./libbench/copy-c2h-1d-halfcomplex.c
+    R ./libbench/copy-c2h-1d-packed.c
+    R ./libbench/copy-c2h-1d-unpacked-ri.c
+    R ./libbench/copy-c2h-unpacked.c
+    R ./libbench/copy-c2h.c
+    R ./libbench/copy-c2r-packed.c
+    R ./libbench/copy-c2r-unpacked.c
+    R ./libbench/copy-c2r.c
+    R ./libbench/copy-c2ri.c
+    R ./libbench/copy-h2c-1d-fftpack.c
+    R ./libbench/copy-h2c-1d-halfcomplex.c
+    R ./libbench/copy-h2c-1d-packed.c
+    R ./libbench/copy-h2c-1d-unpacked-ri.c
+    R ./libbench/copy-h2c-unpacked.c
+    R ./libbench/copy-h2c.c
+    R ./libbench/copy-r2c-packed.c
+    R ./libbench/copy-r2c-unpacked.c
+    R ./libbench/copy-r2c.c
+    R ./libbench/copy-ri2c.c
+    R ./libbench/deallocate.c
+    R ./libbench/getopt-utils.c
+    R ./libbench/getopt.c
+    R ./libbench/getopt.h
+    R ./libbench/getopt1.c
+    R ./libbench/info.c
+    R ./libbench/log2.c
+    R ./libbench/main.c
+    R ./libbench/mflops.c
+    R ./libbench/mp.c
+    R ./libbench/ovtpvt.c
+    R ./libbench/pow2.c
+    R ./libbench/prime.c
+    R ./libbench/problem.c
+    R ./libbench/report.c
+    R ./libbench/speed.c
+    R ./libbench/timer.c
+    R ./libbench/unnormalize.c
+    R ./libbench/util.c
+    R ./libbench/verify.c
+    R ./libbench/zero.c
+    R ./libbench/
+
+Mon Jun 20 21:17:14 EDT 2011  stevenj@fftw.org
+  * update URLs
+
+    M ./doc/install.texi -2 +2
+    M ./doc/intro.texi -1 +1
+    M ./doc/license.texi -1 +1
+    M ./doc/mpi.texi -1 +1
+    M ./doc/other.texi -1 +1
+
+Mon Jun 20 20:53:31 EDT 2011  stevenj@fftw.org
+  * whoops, forgot to check in alignment change
+
+    M ./simd-support/simd-common.h -3 +6
+
+Mon Jun 20 20:22:23 EDT 2011  athena@fftw.org
+  * "test X = Y" requires spaces around "="
+
+    M ./configure.ac -1 +1
+
+Mon Jun 20 19:18:52 EDT 2011  stevenj@fftw.org
+  * indenting
+
+    M ./simd-support/simd-sse2.h -3 +3
+
+Mon Jun 20 18:57:10 EDT 2011  stevenj@fftw.org
+  * merge back in SSE support, now combined with SSE2; --enable-sse2 now works in both single and double precision, and simd-sse2.h contains both the double- and single-precision code (which overlap a lot); in single precision it is still compiled for SSE-only (SSE2 is only required for double)
+
+    M ./configure.ac -16 +9
+    M ./simd-support/simd-sse2.h -54 +181
+    M ./simd-support/sse2-nonportable.c -2 +6
+    M ./simd-support/sse2.c -4 +11
+
+Mon Jun 20 16:02:07 EDT 2011  athena@fftw.org
+  * Implement AVX autodetection (gcc-only so far)
+
+    M ./simd-support/Makefile.am -1 +1
+    A ./simd-support/amd64-cpuid.h
+    M ./simd-support/avx256d.c -2 +32
+    M ./simd-support/x86-cpuid.h +33
+
+Mon Jun 20 14:25:54 EDT 2011  athena@fftw.org
+  * Add VZEROUPPER at the end of AVX codelets
+  
+  If the Intel Optimization Manual is to be believed, we need to wave a
+  dead chicken before transitioning from AVX code to SSE code.  I am
+  supposed to believe that there is a transition penalty for doing so,
+  unless one uses a magic VZEROUPPER instruction that apparently has
+  zero cost.  Whatever.
+  
+
+    M ./genfft/c.ml +2
+    M ./genfft/c.mli +1
+    M ./genfft/gen_hc2c.ml -1 +1
+    M ./genfft/gen_hc2cdft.ml -1 +1
+    M ./genfft/gen_hc2cdft_c.ml -1 +1
+    M ./genfft/gen_hc2hc.ml -1 +1
+    M ./genfft/gen_mdct.ml -1 +1
+    M ./genfft/gen_notw.ml -1 +1
+    M ./genfft/gen_notw_c.ml -1 +1
+    M ./genfft/gen_r2cb.ml -1 +1
+    M ./genfft/gen_r2cf.ml -1 +1
+    M ./genfft/gen_r2r.ml -1 +1
+    M ./genfft/gen_twiddle.ml -1 +1
+    M ./genfft/gen_twiddle_c.ml -1 +1
+    M ./genfft/gen_twidsq.ml -1 +1
+    M ./genfft/gen_twidsq_c.ml -1 +1
+    M ./genfft/genutil.ml -2 +2
+    M ./genfft/simd.ml +1
+    M ./simd-support/simd-avx256d.h +5
+    M ./simd-support/simd-sse2.h +2
+
+Mon Jun 20 10:21:25 EDT 2011  athena@fftw.org
+  * Move RDFT to new simd scheme
+
+    A ./rdft/simd/avx256d/
+    R ./rdft/simd/codelets/Makefile.am
+    R ./rdft/simd/codelets/
+    A ./rdft/simd/common/
+    A ./rdft/simd/sse2/
+    M ./Makefile.am -2 +4
+    M ./configure.ac +4
+    M ./dft/simd/Makefile.am -1 +1
+    M ./genfft/gen_hc2cdft_c.ml -2 +2
+    M ./kernel/ifftw.h +1
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/codelet-rdft.h -1 +2
+    M ./rdft/conf.c -2 +7
+    M ./rdft/simd/Makefile.am -6 +2
+    A ./rdft/simd/avx256d/Makefile.am
+    A ./rdft/simd/codlist.mk
+    A ./rdft/simd/common/Makefile.am
+    A ./rdft/simd/common/genus.c
+    R ./rdft/simd/hc2cbv.c
+    M ./rdft/simd/hc2cbv.h -2 +2
+    R ./rdft/simd/hc2cfv.c
+    M ./rdft/simd/hc2cfv.h -2 +2
+    A ./rdft/simd/simd.mk
+    A ./rdft/simd/sse2/Makefile.am
+
+Mon Jun 20 09:23:38 EDT 2011  athena@fftw.org
+  * New SIMD build system
+  
+  We now support multiple SIMD extensions in the same binary, e.g.
+  --enable-sse2 --enable-avx.  This patch adds the necessary
+  infrastructure for SSE2/AVX and complex DFT.  Later patches will add
+  RDFT and SSE/ALTIVEC/etc.
+  
+
+    A ./dft/simd/avx256d/
+    A ./simd-support/
+    R ./dft/simd/codelets/Makefile.am
+    R ./dft/simd/codelets/
+    A ./dft/simd/common/
+    A ./dft/simd/sse2/
+    M ./Makefile.am -11 +13
+    M ./configure.ac -47 +74
+    M ./dft/codelet-dft.h -4 +2
+    M ./dft/conf.c -2 +7
+    M ./dft/scalar/codelets/Makefile.am -1 +1
+    M ./dft/simd/Makefile.am -6 +3
+    A ./dft/simd/avx256d/Makefile.am
+    A ./dft/simd/codlist.mk
+    A ./dft/simd/common/Makefile.am
+    A ./dft/simd/common/genus.c
+    R ./dft/simd/n1b.c
+    M ./dft/simd/n1b.h -2 +2
+    R ./dft/simd/n1f.c
+    M ./dft/simd/n1f.h -2 +2
+    R ./dft/simd/n2b.c
+    M ./dft/simd/n2b.h -2 +2
+    R ./dft/simd/n2f.c
+    M ./dft/simd/n2f.h -2 +2
+    R ./dft/simd/n2s.c
+    M ./dft/simd/n2s.h -2 +2
+    R ./dft/simd/q1b.c
+    M ./dft/simd/q1b.h -2 +2
+    R ./dft/simd/q1f.c
+    M ./dft/simd/q1f.h -2 +2
+    A ./dft/simd/simd.mk
+    A ./dft/simd/sse2/Makefile.am
+    R ./dft/simd/t.c
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1bu.h -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/simd/t1fu.h -2 +2
+    M ./dft/simd/t2b.h -2 +2
+    M ./dft/simd/t2f.h -2 +2
+    M ./dft/simd/t3b.h -2 +2
+    M ./dft/simd/t3f.h -2 +2
+    R ./dft/simd/ts.c
+    M ./dft/simd/ts.h -2 +2
+    M ./genfft/gen_twiddle.ml -2 +2
+    M ./genfft/genutil.ml -2 +5
+    M ./kernel/ifftw.h -2 +6
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c -1 +1
+    M ./rdft/scalar/r2cb/Makefile.am +1
+    M ./rdft/scalar/r2cf/Makefile.am +1
+    M ./rdft/scalar/r2r/Makefile.am +1
+    A ./simd-support/Makefile.am
+    A ./simd-support/avx256d.c
+    A ./simd-support/simd-avx256d.h
+    A ./simd-support/simd-common.h
+    A ./simd-support/simd-sse2.h
+    A ./simd-support/sse2-nonportable.c
+    A ./simd-support/sse2.c
+    A ./simd-support/taint.c
+    A ./simd-support/x86-cpuid.h
+    M ./support/Makefile.codelets -21 +22
+
+Sun Jun 19 12:29:27 EDT 2011  stevenj@fftw.org
+  * italicize Latin quote
+
+    M ./doc/reference.texi -2 +2
+
+Sun Jun 19 12:26:34 EDT 2011  stevenj@fftw.org
+  * work around incredibly annoying makeinfo bug -- for HTML output, in any paragraph ending with an @index command, two blank lines are needed to create a paragraph break  ... our HTML output has apparently been screwed up for years
+
+    M ./doc/acknowledgements.texi +2
+    M ./doc/fortran.texi +1
+    M ./doc/install.texi +2
+    M ./doc/intro.texi +1
+    M ./doc/mpi.texi +20
+    M ./doc/other.texi +2
+    M ./doc/reference.texi +21
+    M ./doc/threads.texi +2
+    M ./doc/tutorial.texi +13
+
+Sun Jun 19 12:01:39 EDT 2011  stevenj@fftw.org
+  * conjugate-pair algorithm turns out not to be due to djb, but it was pointed out to us by djb
+
+    M ./doc/intro.texi -3 +4
+
+Sun Jun 19 11:59:21 EDT 2011  stevenj@fftw.org
+  * new-array execute functions are *not* the same for MPI, since the problems are different
+
+    M ./doc/mpi.texi -2 +3
+
+Sun Jun 19 11:57:52 EDT 2011  stevenj@fftw.org
+  * tweaks to MPI manual
+
+    M ./doc/mpi.texi -19 +44
+    M ./doc/reference.texi -1 +1
+
+Sun Jun 19 00:47:23 EDT 2011  stevenj@fftw.org
+  * only call MPI_Init_thread for MPI version >= 2
+
+    M ./mpi/mpi-bench.c +5
+
+Sun Jun 19 00:35:44 EDT 2011  stevenj@fftw.org
+  * document quad precision in Fortran
+
+    M ./doc/fortran.texi -9 +12
+
+Sun Jun 19 00:16:10 EDT 2011  stevenj@fftw.org
+  * use -lquadmath for quad-precision library in pkgconfig file
+
+    M ./configure.ac +2
+    M ./fftw.pc.in -1 +1
+
+Sun Jun 19 00:10:33 EDT 2011  stevenj@fftw.org
+  * document quad precision
+
+    M ./doc/install.texi -1 +11
+    M ./doc/reference.texi +8
+
+Sat Jun 18 21:26:24 EDT 2011  stevenj@fftw.org
+  * there is currently no standard quad-precision type in MPI, so don't allow MPI support to be compiled with --enable-quad-precision
+
+    M ./configure.ac +3
+    M ./mpi/ifftw-mpi.h +2
+    M ./mpi/mpi-bench.c +2
+
+Sat Jun 18 21:25:51 EDT 2011  stevenj@fftw.org
+  * make quad-precision library libfftwq
+
+    M ./configure.ac +1
+
+Sat Jun 18 21:19:50 EDT 2011  stevenj@fftw.org
+  * use --estimate in check script with --enable-random-estimator
+
+    M ./configure.ac +2
+    M ./mpi/Makefile.am -1 +1
+    M ./tests/Makefile.am -5 +5
+
+Sat Jun 18 20:36:51 EDT 2011  stevenj@fftw.org
+  * bug fix - correct crashing interaction between threads and debug-malloc
+
+    M ./threads/threads.c +4
+
+Sat Jun 18 18:41:48 EDT 2011  stevenj@fftw.org
+  * whoops, typo
+
+    M ./doc/reference.texi -1 +1
+
+Sat Jun 18 18:35:14 EDT 2011  stevenj@fftw.org
+  * clarification of when fftw_cost may return 0
+
+    M ./doc/reference.texi -2 +5
+
+Sat Jun 18 18:31:26 EDT 2011  stevenj@fftw.org
+  * corrected manual and test program for proper interaction of MPI and threads
+
+    M ./doc/mpi.texi -20 +59
+    M ./mpi/mpi-bench.c +13
+    M ./tests/fftw-bench.c -4 +10
+    M ./tests/fftw-bench.h +5
+
+Sat Jun 18 17:13:52 EDT 2011  stevenj@fftw.org
+  * need --mpi restriction in mpi+threads check
+
+    M ./mpi/Makefile.am -1 +1
+
+Sat Jun 18 12:09:04 EDT 2011  stevenj@fftw.org
+  * split fftw3.texi into multiple files for ease of editing
+
+    M ./doc/Makefile.am -2 +1
+    A ./doc/acknowledgements.texi
+    A ./doc/cindex.texi
+    M ./doc/fftw3.texi -6158 +14
+    A ./doc/findex.texi
+    A ./doc/fortran.texi
+    A ./doc/install.texi
+    A ./doc/intro.texi
+    A ./doc/license.texi
+    A ./doc/mpi.texi
+    A ./doc/other.texi
+    A ./doc/reference.texi
+    A ./doc/threads.texi
+    A ./doc/tutorial.texi
+    A ./doc/upgrading.texi
+
+Fri Jun 17 17:26:50 EDT 2011  stevenj@fftw.org
+  * merge recent Cell deletion with MPI branch
+
+    M ./Makefile.am -3 +2
+    M ./NEWS +40
+    M ./configure.ac -1 +1
+    M ./doc/fftw3.texi -95 +909
+
+Fri Jun 17 01:52:59 EDT 2011  stevenj@fftw.org
+  * whoops, incorrect assertion
+
+    M ./mpi/transpose-problem.c -2 +2
+
+Fri Jun 17 01:52:51 EDT 2011  stevenj@fftw.org
+  * comment fix
+
+    M ./mpi/block.c -2 +2
+
+Thu Jun 16 23:30:27 EDT 2011  stevenj@fftw.org
+  * check if pln creation failed (e.g. for split input) bbefore calling setup_gather_scatter, to prevent crashes
+
+    M ./mpi/mpi-bench.c -1 +1
+
+Thu Jun 16 23:26:48 EDT 2011  stevenj@fftw.org
+  * bug fix -- transpose-recurse is only applicable if subtransposes fit in the same space (unless I change the allocation routine, but this would seem to require looking at all possible recursive invocations of transpose-recurse)
+
+    M ./mpi/transpose-recurse.c +28
+
+Mon Apr 11 17:58:03 EDT 2011  stevenj@fftw.org
+  * yikes, any_true check on subplan creation should be in comm, not comm2, so that all processes know if failure occurred
+
+    M ./mpi/transpose-recurse.c -6 +4
+
+Mon Apr 11 17:00:46 EDT 2011  stevenj@fftw.org
+  * add wisdom_ok_hook to enforce wisdom synchronization on MPI problems, apparently fixing a longstanding deadlock/crash bug
+
+    M ./kernel/ifftw.h +1
+    M ./kernel/planner.c +7
+    M ./mpi/api.c -13 +61
+
+Mon Apr 11 15:10:22 EDT 2011  stevenj@fftw.org
+  * add a check (in DEBUG mode only) that all processes produce the same hash of MPI problems; don't include alignment in MPI problem hash because it may differ between processes for unaligned malloc
+
+    M ./mpi/any-true.c +21
+    M ./mpi/dft-problem.c -2 +6
+    M ./mpi/ifftw-mpi.h +1
+    M ./mpi/rdft-problem.c -2 +6
+    M ./mpi/rdft2-problem.c -2 +6
+    M ./mpi/transpose-problem.c -2 +6
+
+Fri Apr  8 18:46:54 EDT 2011  stevenj@fftw.org
+  * use cost_hook in random_estimate
+
+    M ./kernel/planner.c -1 +1
+
+Sun Mar  6 23:33:53 EST 2011  stevenj@fftw.org
+  * added mpi new-array execute functions; thanks to Guo Luo for the bug report
+
+    M ./mpi/api.c +23
+    M ./mpi/fftw3-mpi.h -1 +6
+
+Wed Feb  9 21:29:17 EST 2011  stevenj@fftw.org
+  * MPI may not support tags > 2^15-1 (e.g. Cray MPI requires tags < 2^24); thanks to Jonathan Bentz for the bug report.
+
+    M ./mpi/transpose-pairwise.c -4 +4
+
+Wed Feb  2 12:21:30 EST 2011  stevenj@fftw.org
+  * fix merge conflicts
+
+    M! ./configure.ac -1 +1
+
+Sat Nov 15 22:33:20 EST 2008  stevenj@fftw.org
+  tagged fftw-3.3alpha1
+
+
+Sat Nov 15 20:33:33 EST 2008  stevenj@fftw.org
+  * version bump for 3.3alpha1
+
+    M! ./NEWS -40
+    M! ./configure.ac -2 +2
+
+Sun Oct 26 22:47:07 EDT 2008  stevenj@fftw.org
+  * re-added mpi/Makefile
+
+    M ./configure.ac +2
+
+Sat Oct 25 17:14:42 EDT 2008  stevenj@fftw.org
+  * re-add MPI to dist
+
+    M! ./Makefile.am -2 +3
+    M! ./doc/fftw3.texi -9 +134
+
+Sat Jun 18 08:50:13 EDT 2011  athena@fftw.org
+  * remove obsolete Cell code
+
+    M ./kernel/align.c -1 +1
+    M ./kernel/ifftw.h -9 +2
+    M ./rdft/rank0.c -83
+    M ./simd/simd.h -6
+    M ./simd/taint.c -1 +1
+    M ./tests/fftw-bench.c -7
+
+Fri Jun 17 23:31:33 EDT 2011  stevenj@fftw.org
+  * bug fix in accuracy test, which prevented us from consiistently determining accuracy in > double precision
+
+    M ./libbench2/mp.c -2 +2
+
+Fri Jun 17 20:05:13 EDT 2011  athena@fftw.org
+  * do not check for gcc version before checking for gcc
+
+    M ./configure.ac -1 +1
+
+Fri Jun 17 18:56:37 EDT 2011  stevenj@fftw.org
+  * require gcc 4.6.0 or later for --enable-quad-precision, to match fftw3.h header file; no need to mark this as EXPERIMENTAL (make check passes, and support in gcc 4.6 seems reasonably complete)
+
+    M ./configure.ac -1 +2
+
+Fri Jun 17 18:51:41 EDT 2011  stevenj@fftw.org
+  * need ugly __attribute__ to use __float128 with _Complex, ugh
+
+    M ./api/fftw3.h +9
+
+Fri Jun 17 18:23:05 EDT 2011  stevenj@fftw.org
+  * --verify tolerance in quad precision changed to 1e-29
+
+    M ./libbench2/bench-main.c -1 +1
+
+Fri Jun 17 18:22:38 EDT 2011  stevenj@fftw.org
+  * quad-precision F77 api should use "qfftw" prefix
+
+    M ./api/x77.h +3
+
+Fri Jun 17 18:22:27 EDT 2011  stevenj@fftw.org
+  * rm extraneous space from fftw3.h
+
+    M ./api/fftw3.h -1 +1
+
+Fri Jun 17 18:05:10 EDT 2011  stevenj@fftw.org
+  * use cosq etcetera with libquadmath in libbench2, so that --verify correctly gives ~33 decimal places in shift test
+
+    M ./libbench2/verify.h -1 +4
+
+Fri Jun 17 17:52:51 EDT 2011  stevenj@fftw.org
+  * libquadmath ships with gcc 4.6.0, so we should require this library for sinq/cosq with --enable-quad-precision; also, include the __float128 FFTW functions in the header file for gcc >= 4.6 on i86/x86??_64/ia64
+
+    M ./Makefile.am +5
+    M ./api/fftw3.h -1 +6
+    M ./configure.ac -1 +5
+    M ./kernel/trig.c -13 +4
+
+Fri Jun 17 16:54:01 EDT 2011  stevenj@fftw.org
+  * typo in manual for fftw_cost
+
+    M ./doc/fftw3.texi -1 +1
+
+Fri Jun 17 16:48:24 EDT 2011  stevenj@fftw.org
+  * fix fftw_cost function: pcost needs to be saved in mkapiplan, since the plan is re-created from wisdom
+
+    M ./api/apiplan.c +5
+    M ./tests/fftw-bench.c -2 +3
+
+Fri Jun 17 16:42:25 EDT 2011  athena@fftw.org
+  * removed support for the Cell Broadband Engine
+
+    R ./cell/spu/Makefile.am
+    R ./cell/spu/alloc.spuc
+    R ./cell/spu/copy.spuc
+    R ./cell/spu/dft.spuc
+    R ./cell/spu/dma.spuc
+    R ./cell/spu/execute.spuc
+    R ./cell/spu/fftw-spu.h
+    R ./cell/spu/main.spuc
+    R ./cell/spu/planner.spuc
+    R ./cell/spu/spu-double.h
+    R ./cell/spu/spu-single.h
+    R ./cell/spu/transpose.spuc
+    R ./cell/spu/
+    R ./cell/Makefile.am
+    R ./cell/cell.c
+    R ./cell/conf.c
+    R ./cell/copy.c
+    R ./cell/dft-direct-cell.c
+    R ./cell/fftw-cell.h
+    R ./cell/plans-double.c
+    R ./cell/plans-single.c
+    R ./cell/spufftw-embed.S
+    R ./cell/transpose.c
+    R ./cell/
+    M ./AUTHORS -2 +3
+    M ./Makefile.am -11 +5
+    M ./NEWS +2
+    R ./README.Cell
+    M ./api/configure.c -3
+    M ./configure.ac -18 +1
+    M ./dft/dft.h -4
+    M ./dft/rank-geq2.c -6
+    M ./doc/fftw3.texi -128 +4
+
+Tue May 24 06:51:07 EDT 2011  athena@fftw.org
+  * Undo previous change; the typo was not a typo after all.
+
+    M ./doc/fftw3.texi -1 +1
+
+Mon May 23 05:08:05 EDT 2011  athena@fftw.org
+  * Fix typo in manual
+
+    M ./doc/fftw3.texi -1 +1
+
+Sat May 21 17:37:50 EDT 2011  athena@fftw.org
+  * clarify intent about canonicalization of tensor in tensor_compress_contiguous()
+
+    M ./kernel/tensor7.c -1 +9
+
+Sat May 21 17:30:31 EDT 2011  athena@fftw.org
+  * avoid useless canonicalization in tensor_compress_contiguous()
+
+    M ./kernel/tensor7.c -1 +1
+
+Sat May 21 17:24:57 EDT 2011  athena@fftw.org
+  * Fix tensor_compress_contiguous
+  
+  tensor_compress_contiguous() was supposed to sort dimensions by
+  descending istride, and then compress adjacent dimensions.  This
+  property was lost once we changed the canonical order of strides to be
+  sorted by descending min{istride,ostride}.  
+  
+  Change tensor_compress_contiguous() to sort by descending istride
+  again, which is necessary for its correctness, and then canonicalize
+  at the end.
+  
+
+    M ./kernel/tensor7.c -20 +41
+
+Sun May  8 18:47:26 EDT 2011  athena@fftw.org
+  * Don't distribute obsolete .depend
+
+    M ./genfft/Makefile.am -2 +2
+
+Sun May  8 18:05:36 EDT 2011  athena@fftw.org
+  * Use ocamlbuild for building genfft
+  
+  Remove the old Makefile cruft to support ocaml, and use ocamlbuild
+  instead.
+
+    M ./bootstrap.sh -2
+    M ./configure.ac -10 +1
+    M ./genfft/Makefile.am -151 +8
+    M ./m4/Makefile.am -1 +1
+    R ./m4/ocaml.m4
+    M ./support/Makefile.codelets -13 +13
+
+Sun May  8 18:03:07 EDT 2011  athena@fftw.org
+  * Do not use __float128 unless BENCHFFT_QUAD is defined
+  
+  Otherwise, compilation fails on compilers that do not support
+  __float128.
+  
+
+    M ./libbench2/bench-user.h +5
+
+Fri Apr  8 13:15:54 EDT 2011  stevenj@fftw.org
+  * fix configure --help string for --disable-alloca (since default is enabled)
+
+    M ./configure.ac -1 +1
+
+Fri Apr  8 13:09:56 EDT 2011  stevenj@fftw.org
+  * add "random estimator" for debugging purposes; note  that this is best used with ESTIMATE_PATIENT mode
+
+    M ./api/apiplan.c +4
+    M ./configure.ac +5
+    M ./kernel/ifftw.h +4
+    M ./kernel/planner.c +27
+    M ./tests/fftw-bench.c +7
+
+Tue Apr  5 14:47:56 EDT 2011  stevenj@fftw.org
+  * add AC_CHECK_DECLS for srand48; thanks to Ralf Wildenhues for the bug report
+
+    M ./configure.ac -1 +1
+    M ./libbench2/util.c +3
+
+Sat Feb  5 17:00:40 EST 2011  stevenj@fftw.org
+  * experimental support for gcc's __float128 quad-precision type
+
+    M ./api/fftw3.h +3
+    M ./configure.ac -2 +14
+    M ./kernel/ifftw.h -2 +10
+    M ./kernel/trig.c -1 +16
+    M ./libbench2/bench-main.c +2
+    M ./libbench2/bench-user.h +4
+    M ./libbench2/info.c -1 +2
+    M ./libbench2/verify.h +6
+    M ./simd/simd-sse2.h -1 +1
+    M ./tests/fftw-bench.h +2
+    M ./tools/fftw-wisdom.c +2
+
+Sun Oct 24 14:33:59 EDT 2010  athena@fftw.org
+  * guarantee that "timelimit < 0" means "no timeout"
+  "timelimit < 0" was always meant to be equivalent to
+  "timelimit = HUGENUM", but this was not true in all cases,
+  causing some obscure wisdom behavior.
+  
+  Thanks William Andrew Burnson for the bug report.
+  
+
+    M ./api/mapflags.c -1 +1
+
+Sun Oct 24 14:32:20 EDT 2010  athena@fftw.org
+  * compile with --enable-fma and SSE, SSE2
+  Allow compilation with --enable-fma and --enable-sse, --enable-sse2.
+  This is a bad idea performance-wise, but people will try anyway.
+
+    M ./simd/simd-sse.h +5
+    M ./simd/simd-sse2.h +5
+
+Sun Jul 11 13:34:06 EDT 2010  athena@fftw.org
+  * Make threads.c compiler with c++
+
+    M ./threads/threads.c -1 +1
+
+Sun Jul 11 10:05:05 EDT 2010  athena@fftw.org
+  * Attempt at clarifying the advanced interface doc.
+
+    M ./doc/fftw3.texi -72 +151
+
+Sun Jul 11 07:37:27 EDT 2010  athena@fftw.org
+  * rename rfftwnd html picture
+  
+  It turns out that texinfo with pdf output reads .png
+  files in preference to .pdf files (when did this change?).
+  I renamed the .png figure to avoid producing an ugly pdf file.
+
+    M ./doc/Makefile.am -4 +4
+    M ./doc/fftw3.texi +5
+
+Tue Mar 30 19:43:22 EDT 2010  stevenj@fftw.org
+  * added fftw_cost function; this is the second time people have asked for this, and there is a reasonable use for it in comparing e.g. oout-of-place vs. in-place plans
+
+    M ./NEWS +4
+    M ./api/f77funcs.h +10
+    M ./api/fftw3.h +1
+    M ./api/flops.c +5
+    M ./doc/fftw3.texi +17
+
+Tue Mar  2 18:55:49 EST 2010  stevenj@fftw.org
+  * documented that --enable-debug-malloc causes fftw_execute to be thread-unsafe (thanks to Alexis Rohou for the problem report)
+
+    M ./doc/fftw3.texi +8
+
+Fri Jan 22 19:42:08 EST 2010  athena@fftw.org
+  * Added FAQ about how to transpose matrices using FFTW.
+
+    M ./doc/FAQ/fftw-faq.bfnn +33
+
+Thu Jan  7 20:16:57 EST 2010  stevenj@fftw.org
+  * catch FMS (instead of generating FMA(_,_,NEG(_)) with h -generic-arith option
+
+    M ./genfft/c.ml +2
+
+Fri Dec 11 07:01:26 EST 2009  athena@fftw.org
+  * note future wisdom enhancements.
+
+    M ./TODO +5
+
+Mon Oct 19 20:21:05 EDT 2009  athena@fftw.org
+  * Use SIMD flags when checking for xmmintrin.h
+  
+  This prevents an obnoxious warning from configure.
+
+    M ./configure.ac -3 +3
+
+Sat Aug 29 20:47:56 EDT 2009  athena@fftw.org
+  * new bug
+
+    M ./TODO +4
+
+Sun Jul 26 00:40:11 EDT 2009  stevenj@fftw.org
+  * typo (s/man1/many)
+
+    M ./doc/fftw3.texi -1 +1
+
+Tue Jul 14 14:19:08 EDT 2009  stevenj@fftw.org
+  * BUILD-MINGW32 script, updated Windows README
+
+    A ./support/BUILD-MINGW32.sh
+    M ./support/BUILD-MINGW64.sh -4 +4
+
+Mon Jul 13 09:40:38 EDT 2009  fftw@fftw.org
+  * cleanup BUILD-MINGW64.sh
+
+    M ./support/BUILD-MINGW64.sh -19 +27
+
+Sun Jul 12 06:34:46 EDT 2009  athena@fftw.org
+  * Update NEWS, version number for 3.2.2 release.
+
+    M ./NEWS +3
+    M ./configure.ac -2 +2
+
+Sat Jul 11 22:28:38 EDT 2009  athena@fftw.org
+  * Reintroduce the pruning heuristic in ESTIMATE mode for r2r problems.
+  
+  Somehow, we lost this feature between fftw-3.1.3 and fftw-3.2.
+
+    M ./rdft/hc2hc-direct.c +3
+
+Thu Jun 25 07:39:04 EDT 2009  athena@fftw.org
+  * don't use pshared=1 in sem_init
+  
+  pshared is really not necessary, and it is not supported on
+  GNU/kFreeBSD.  Thanks Petr Salinger for the bug report.
+
+    M ./threads/threads.c -2 +2
+
+Thu Jun 11 19:35:40 EDT 2009  fftw@fftw.org
+  * Add mingw64 build file so that we can track it.
+
+    A ./support/BUILD-MINGW64.sh
+
+Wed Jun 10 12:10:58 EDT 2009  fftw@fftw.org
+  * note 3.2.2 NEWS
+
+    M ./NEWS +7
+
+Wed Jun 10 12:04:54 EDT 2009  fftw@fftw.org
+  * add --disable-alloca to configure
+  
+  It looks like alloca() is broken on mingw64, and thus
+  we need to disable it explicitly.
+
+    M ./configure.ac -1 +6
+    M ./kernel/ifftw.h -1 +1
+
+Sun Apr 26 16:33:10 EDT 2009  athena@fftw.org
+  * Note in FAQ that --enable-k7 has been discontinued.
+
+    M ./doc/FAQ/fftw-faq.bfnn -4 +7
+
+Tue Mar 24 09:16:18 EDT 2009  athena@fftw.org
+  * clarified small confusion in fftw_cleanup documentation
+
+    M ./doc/fftw3.texi -2 +3
+
+Thu Mar 19 13:18:06 EDT 2009  stevenj@fftw.org
+  * fix documentation of dfftw_init_threads to indicate thaat it takes an argument (since the C version returns a value); thanks t Hans Johnnston for the bug report
+
+    M ./doc/fftw3.texi -1 +5
+
+Thu Mar 12 13:12:13 EDT 2009  fftw@fftw.org
+  * if possible, use a 128-bit type for copy
+
+    M ./configure.ac -1 +1
+    M ./kernel/cpy2d.c -9 +38
+
+Tue Mar 10 12:49:51 EDT 2009  fftw@fftw.org
+  * add size-128 simd codelets
+  
+  It's about time
+
+    M ./dft/simd/codelets/Makefile.am -2 +2
+
+Mon Mar  9 20:29:16 EDT 2009  athena@fftw.org
+  * copy two floats as a double when possible
+  
+  Resurrect the old hack of copying two floats as a double,
+  which makes some difference in these days of 64 bit boxes.
+  
+
+    M ./kernel/cpy2d.c -16 +34
+
+Sun Mar  8 18:08:04 EDT 2009  athena@fftw.org
+  * fixed (harmless) confusion of strides
+  
+  RS and VS were swapped in dftw-direct.c.  This is a bug, but
+  it is harmless unless one uses fixed-stride codelets, which we
+  do not.
+
+    M ./dft/dftw-direct.c -7 +7
+
+Sun Mar  8 10:29:49 EDT 2009  athena@fftw.org
+  * oops, I checked in debug code accidentally.
+
+    M ./kernel/ct.c -1
+
+Sat Feb 14 19:01:00 EST 2009  athena@fftw.org
+  * Change TLO email address since Magdalen is no longer there.
+
+    M ./commercialize.sh -1 +1
+
+Sat Feb 14 18:18:45 EST 2009  stevenj@fftw.org
+  * quote arguments to bench in test script on the off-chance that '*' would be expanded by the shell into a valid filename, and also to avoid shell confusion on Cygwin that "//" begins the name of a Windows network mountpoint
+
+    M ./tests/check.pl -1 +1
+
+Sat Feb 14 18:17:23 EST 2009  stevenj@fftw.org
+  tagged fftw-3.2.1
+
+
+Sun Mar  8 10:02:59 EDT 2009  athena@fftw.org
+  * stricter conditions for Cooley-Tukey being ugly
+  
+  It turns out that m=2 in the leaf of Cooley-Tukey may be
+  advantageous in certain cases, eg. i512v512 on AMD Shanghai:
+  
+  (dft-buffered-512-x128/512-6
+    (dft-ct-dit/4
+      (dftw-direct-4/24-x128 "t2fv_4")
+      (dft-vrank>=1-x4/1
+        (dft-ct-dit/64
+          (dftw-direct-64/504-x128 "t2fv_64")
+          (dft-vrank>=1-x64/1
+            (dft-direct-2-x128 "n2fv_2")))))
+    (dft-r2hc-1
+      (rdft-rank0-tiled/2-x128-x512))
+    (dft-nop))
+  
+  Presumably this works around the 2 way associativity of the L1 cache.
+
+    M ./dft/dftw-direct.c -1 +1
+    M ./kernel/ct.c -2 +3
+    M ./kernel/ifftw.h -1 +1
+    M ./rdft/ct-hc2c-direct.c -1 +1
+    M ./rdft/hc2hc-direct.c -3 +3
+
+Mon Feb  9 19:46:00 EST 2009  stevenj@fftw.org
+  * disable Windows QueryPerformanceCounter code, since it requires us to pull in windows.h in ifftw.h and causes namespace conflicts; gettimeofday seems to work well enough and has had few complaints
+
+    M ./kernel/ifftw.h -4 +1
+    M ./kernel/timer.c -1 +8
+
+Wed Feb  4 22:55:54 EST 2009  stevenj@fftw.org
+  * version bump for 3.2.1, updated NEWS
+
+    M ./NEWS +10
+    M ./configure.ac -1 +1
+
+Wed Feb  4 22:27:28 EST 2009  stevenj@fftw.org
+  * recommend that users avoid fftw_execute in Fortran, instead using dfftw_execute_dft and friends so that the compiler knows that the input/output arrays are used
+
+    M ./doc/fftw3.texi -10 +80
+
+Wed Jan 21 16:02:08 EST 2009  stevenj@fftw.org
+  * prefer windows queryperformancecounter to gettimeofday on Windows, thanks to David Price for the suggestion
+
+    M ./kernel/ifftw.h -1 +4
+    M ./kernel/timer.c -1 +20
+
+Sun Feb  1 14:34:49 EST 2009  athena@fftw.org
+  * compilation fixes in case snprintf() is defined as a macro.
+
+    M ./libbench2/report.c -8 +7
+
+Wed Jan 28 20:19:04 EST 2009  athena@fftw.org
+  * Automake does not like continuation lines beginning with a comment.
+
+    M ./rdft/scalar/r2cb/Makefile.am -1 +1
+
+Wed Jan 28 18:24:39 EST 2009  athena@fftw.org
+  * Add r2cb_2.c
+  
+  r2cb_2.c is needed for problem rb2, which is not equivalent to
+  rf2 (unlike kb2, which is equivalent to kf2).
+  
+  This change would not matter much except that rb2 is generated
+  when reducing backward rdft2 to dft, and the absence of the codelet was
+  preventing radix 2 from being employed at all in this case.
+
+    M ./rdft/scalar/r2cb/Makefile.am -6 +4
+
+Sat Jan 10 06:47:22 EST 2009  athena@fftw.org
+  * handle the case vecsz->rnk == 0 correctly.
+
+    M ./dft/buffered.c -1 +3
+    M ./rdft/buffered.c -1 +3
+    M ./rdft/buffered2.c -1 +3
+
+Fri Dec 19 15:20:36 EST 2008  stevenj@fftw.org
+  * Macs are no longer ppc-based; thanks to Charles Collicutt for the FAQ update
+
+    M ./doc/FAQ/fftw-faq.bfnn -3 +3
+
+Mon Dec  8 18:08:33 EST 2008  stevenj@fftw.org
+  * use new multiple-nbuf code in rdft/buffered, like for dft/buffered
+
+    M ./rdft/buffered.c -11 +25
+    M ./rdft/buffered2.c -12 +26
+
+Sat Dec  6 16:34:36 EST 2008  stevenj@fftw.org
+  * make x86_cpuid macro work on x86_64
+
+    M ./m4/ax_gcc_x86_cpuid.m4 -1 +15
+
+Sat Dec  6 09:20:37 EST 2008  athena@fftw.org
+  * Allow automatic choice of buffer size in dft/buffered.c
+  
+  Try a couple of different buffer sizes in buffered transforms,
+  since this seems to make a difference on some Core2 models.
+  
+
+    M ./api/mapflags.c -2
+    M ./dft/buffered.c -10 +25
+    M ./kernel/buffered.c -3 +19
+    M ./kernel/ifftw.h -1 +5
+    M ./rdft/buffered.c -2 +3
+    M ./rdft/buffered2.c -2 +3
+    M ./rdft/rdft2-rdft.c -1 +1
+
+Tue Dec  2 19:18:30 EST 2008  athena@fftw.org
+  * libbench2: do not assume that split-complex arrays are stride-1
+
+    M ./libbench2/verify-dft.c -4 +3
+    M ./libbench2/verify.c -1 +3
+    M ./tests/bench.c -3 +2
+
+Tue Dec  2 18:39:43 EST 2008  stevenj@fftw.org
+  * updated NEWS
+
+    M ./NEWS +14
+
+Tue Dec  2 18:30:00 EST 2008  stevenj@fftw.org
+  * date fix
+
+    M ./m4/ax_gcc_archflag.m4 -1 +1
+
+Tue Dec  2 18:29:06 EST 2008  stevenj@fftw.org
+  * updated icc flags -- now prefer -xHost (-xN etc. seem t be obsolete), check for new spelling -ansi-alias, and use -malign-double like we do for gcc
+
+    M ./m4/ax_cc_maxopt.m4 -2 +14
+
+Tue Dec  2 18:28:03 EST 2008  stevenj@fftw.org
+  * use $ax_cv_c_compiler_vendor rather than $GCC, as the former is more reliable (icc incorrectly self-identifies as gcc on MacOS where we don't use -no-gcc)
+
+    M ./configure.ac -1 +1
+
+Tue Dec  2 17:55:36 EST 2008  stevenj@fftw.org
+  * don't use -no-gcc for icc on MacOS
+
+    M ./configure.ac -2 +7
+
+Tue Dec  2 17:34:04 EST 2008  stevenj@fftw.org
+  * document some more bench options
+
+    M ./tests/README -1 +17
+
+Wed Nov 19 16:55:13 EST 2008  stevenj@fftw.org
+  * make it clearer that --enable-openmp and --enable-threads are mutually exclusive; thanks to Long To for his comments
+
+    M ./doc/fftw3.texi -6 +10
+
+Mon Nov 17 20:16:28 EST 2008  stevenj@fftw.org
+  * version bump to 3.2.1, use explicit Makefile.am for m4 subdirectory so that tarball does not include random files in there when you do 'make dist'
+
+    M ./Makefile.am -3 +2
+    M ./configure.ac -2 +4
+    A ./m4/Makefile.am
+
+Sat Nov 15 21:12:58 EST 2008  stevenj@fftw.org
+  * document behavior of FFTW guru arrays, and in particular the odd behavior of the plan_guru_r2r routine in Fortran (thanks to Alexander Pozdneev for the bug report)
+
+    M ./doc/fftw3.texi +10
+
+Sat Nov 15 21:03:14 EST 2008  stevenj@fftw.org
+  tagged fftw-3.2
+
+
+Mon Nov 10 20:21:32 EST 2008  stevenj@fftw.org
+  * version bump to 3.2, updated copyright year
+
+    M ./COPYRIGHT -2 +2
+    M ./NEWS -1 +6
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -2 +2
+    M ./api/configure.c -2 +2
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute-split-dft-c2r.c -2 +2
+    M ./api/execute-split-dft-r2c.c -2 +2
+    M ./api/execute-split-dft.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -2 +2
+    M ./api/f77api.c -2 +2
+    M ./api/f77funcs.h -2 +2
+    M ./api/fftw3.h -2 +2
+    M ./api/flops.c -2 +2
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./api/import-wisdom.c -2 +2
+    M ./api/malloc.c -2 +2
+    M ./api/map-r2r-kind.c -2 +2
+    M ./api/mapflags.c -2 +2
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.h -2 +2
+    M ./api/mktensor-rowmajor.c -2 +2
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -2 +2
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -2 +2
+    M ./api/plan-dft.c -2 +2
+    M ./api/plan-guru-dft-c2r.h -2 +2
+    M ./api/plan-guru-dft-r2c.h -2 +2
+    M ./api/plan-guru-dft.h -2 +2
+    M ./api/plan-guru-r2r.h -2 +2
+    M ./api/plan-guru-split-dft-c2r.h -2 +2
+    M ./api/plan-guru-split-dft-r2c.h -2 +2
+    M ./api/plan-guru-split-dft.h -2 +2
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-dft.c -2 +2
+    M ./api/plan-many-r2r.c -2 +2
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -2 +2
+    M ./api/print-plan.c -2 +2
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -2 +2
+    M ./api/version.c -2 +2
+    M ./api/x77.h -2 +2
+    M ./configure.ac -2 +2
+    M ./dft/bluestein.c -2 +2
+    M ./dft/buffered.c -2 +2
+    M ./dft/codelet-dft.h -2 +2
+    M ./dft/conf.c -2 +2
+    M ./dft/ct.c -2 +2
+    M ./dft/ct.h -2 +2
+    M ./dft/dft.h -2 +2
+    M ./dft/dftw-direct.c -2 +2
+    M ./dft/dftw-directsq.c -2 +2
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect-transpose.c -2 +2
+    M ./dft/indirect.c -2 +2
+    M ./dft/kdft-dif.c -2 +2
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/kdft-dit.c -2 +2
+    M ./dft/kdft.c -2 +2
+    M ./dft/nop.c -2 +2
+    M ./dft/plan.c -2 +2
+    M ./dft/problem.c -2 +2
+    M ./dft/rader.c -2 +2
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/scalar/n.c -2 +2
+    M ./dft/scalar/n.h -2 +2
+    M ./dft/scalar/t.c -2 +2
+    M ./dft/scalar/t.h -2 +2
+    M ./dft/simd/n1b.c -2 +2
+    M ./dft/simd/n1b.h -2 +2
+    M ./dft/simd/n1f.c -2 +2
+    M ./dft/simd/n1f.h -2 +2
+    M ./dft/simd/n2b.c -2 +2
+    M ./dft/simd/n2b.h -2 +2
+    M ./dft/simd/n2f.c -2 +2
+    M ./dft/simd/n2f.h -2 +2
+    M ./dft/simd/n2s.c -2 +2
+    M ./dft/simd/n2s.h -2 +2
+    M ./dft/simd/q1b.c -2 +2
+    M ./dft/simd/q1b.h -2 +2
+    M ./dft/simd/q1f.c -2 +2
+    M ./dft/simd/q1f.h -2 +2
+    M ./dft/simd/t.c -2 +2
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1bu.h -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/simd/t1fu.h -2 +2
+    M ./dft/simd/t2b.h -2 +2
+    M ./dft/simd/t2f.h -2 +2
+    M ./dft/simd/t3b.h -2 +2
+    M ./dft/simd/t3f.h -2 +2
+    M ./dft/simd/ts.c -2 +2
+    M ./dft/simd/ts.h -2 +2
+    M ./dft/solve.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./dft/zero.c -2 +2
+    M ./doc/f77_wisdom.f -2 +2
+    M ./genfft/algsimp.ml -2 +2
+    M ./genfft/algsimp.mli -2 +2
+    M ./genfft/annotate.ml -2 +2
+    M ./genfft/annotate.mli -2 +2
+    M ./genfft/assoctable.ml -2 +2
+    M ./genfft/assoctable.mli -2 +2
+    M ./genfft/c.ml -2 +2
+    M ./genfft/c.mli -2 +2
+    M ./genfft/complex.ml -2 +2
+    M ./genfft/complex.mli -2 +2
+    M ./genfft/conv.ml -2 +2
+    M ./genfft/conv.mli -2 +2
+    M ./genfft/dag.ml -2 +2
+    M ./genfft/dag.mli -2 +2
+    M ./genfft/expr.ml -2 +2
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/fft.ml -2 +2
+    M ./genfft/fft.mli -2 +2
+    M ./genfft/gen_hc2c.ml -2 +2
+    M ./genfft/gen_hc2cdft.ml -2 +2
+    M ./genfft/gen_hc2cdft_c.ml -2 +2
+    M ./genfft/gen_hc2hc.ml -2 +2
+    M ./genfft/gen_mdct.ml -2 +2
+    M ./genfft/gen_notw.ml -2 +2
+    M ./genfft/gen_notw_c.ml -2 +2
+    M ./genfft/gen_r2cb.ml -2 +2
+    M ./genfft/gen_r2cf.ml -2 +2
+    M ./genfft/gen_r2r.ml -2 +2
+    M ./genfft/gen_twiddle.ml -2 +2
+    M ./genfft/gen_twiddle_c.ml -2 +2
+    M ./genfft/gen_twidsq.ml -2 +2
+    M ./genfft/gen_twidsq_c.ml -2 +2
+    M ./genfft/genutil.ml -2 +2
+    M ./genfft/littlesimp.ml -2 +2
+    M ./genfft/littlesimp.mli -2 +2
+    M ./genfft/magic.ml -2 +2
+    M ./genfft/monads.ml -2 +2
+    M ./genfft/number.ml -2 +2
+    M ./genfft/number.mli -2 +2
+    M ./genfft/oracle.ml -2 +2
+    M ./genfft/oracle.mli -2 +2
+    M ./genfft/schedule.ml -2 +2
+    M ./genfft/schedule.mli -2 +2
+    M ./genfft/simd.ml -2 +2
+    M ./genfft/simd.mli -2 +2
+    M ./genfft/simdmagic.ml -2 +2
+    M ./genfft/to_alist.ml -2 +2
+    M ./genfft/to_alist.mli -2 +2
+    M ./genfft/trig.ml -2 +2
+    M ./genfft/trig.mli -2 +2
+    M ./genfft/twiddle.ml -2 +2
+    M ./genfft/twiddle.mli -2 +2
+    M ./genfft/unique.ml -2 +2
+    M ./genfft/unique.mli -2 +2
+    M ./genfft/util.ml -2 +2
+    M ./genfft/util.mli -2 +2
+    M ./genfft/variable.ml -2 +2
+    M ./genfft/variable.mli -2 +2
+    M ./kernel/align.c -2 +2
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/assert.c -2 +2
+    M ./kernel/awake.c -2 +2
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/cpy1d.c -2 +2
+    M ./kernel/cpy2d-pair.c -2 +2
+    M ./kernel/cpy2d.c -2 +2
+    M ./kernel/ct.c -2 +2
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/debug.c -2 +2
+    M ./kernel/extract-reim.c -2 +2
+    M ./kernel/hash.c -2 +2
+    M ./kernel/iabs.c -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/kalloc.c -2 +2
+    M ./kernel/md5-1.c -2 +2
+    M ./kernel/md5.c -2 +2
+    M ./kernel/minmax.c -2 +2
+    M ./kernel/ops.c -2 +2
+    M ./kernel/pickdim.c -2 +2
+    M ./kernel/plan.c -2 +2
+    M ./kernel/primes.c -2 +2
+    M ./kernel/print.c -2 +2
+    M ./kernel/problem.c -2 +2
+    M ./kernel/rader.c -2 +2
+    M ./kernel/scan.c -2 +2
+    M ./kernel/solver.c -2 +2
+    M ./kernel/solvtab.c -2 +2
+    M ./kernel/stride.c -2 +2
+    M ./kernel/tensor.c -2 +2
+    M ./kernel/tensor1.c -2 +2
+    M ./kernel/tensor2.c -2 +2
+    M ./kernel/tensor3.c -2 +2
+    M ./kernel/tensor4.c -2 +2
+    M ./kernel/tensor5.c -2 +2
+    M ./kernel/tensor7.c -2 +2
+    M ./kernel/tensor8.c -2 +2
+    M ./kernel/tensor9.c -2 +2
+    M ./kernel/tile2d.c -2 +2
+    M ./kernel/timer.c -2 +2
+    M ./kernel/transpose.c -2 +2
+    M ./kernel/trig.c -2 +2
+    M ./kernel/twiddle.c -2 +2
+    M ./libbench2/dotens2.c -2 +2
+    M ./libbench2/my-getopt.c -2 +2
+    M ./libbench2/my-getopt.h -2 +2
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -2 +2
+    M ./libbench2/verify-r2r.c -2 +2
+    M ./libbench2/verify-rdft2.c -2 +2
+    M ./libbench2/verify.h -2 +2
+    M ./mpi/any-true.c -2 +2
+    M ./mpi/api.c -2 +2
+    M ./mpi/block.c -2 +2
+    M ./mpi/choose-radix.c -2 +2
+    M ./mpi/conf.c -2 +2
+    M ./mpi/dft-problem.c -2 +2
+    M ./mpi/dft-rank-geq2-transposed.c -2 +2
+    M ./mpi/dft-rank-geq2.c -2 +2
+    M ./mpi/dft-rank1-bigvec.c -2 +2
+    M ./mpi/dft-rank1.c -2 +2
+    M ./mpi/dft-serial.c -2 +2
+    M ./mpi/dft-solve.c -2 +2
+    M ./mpi/dtensor.c -2 +2
+    M ./mpi/fftw3-mpi.h -2 +2
+    M ./mpi/ifftw-mpi.h -2 +2
+    M ./mpi/mpi-dft.h -2 +2
+    M ./mpi/mpi-rdft.h -2 +2
+    M ./mpi/mpi-rdft2.h -2 +2
+    M ./mpi/mpi-transpose.h -2 +2
+    M ./mpi/rdft-problem.c -2 +2
+    M ./mpi/rdft-rank-geq2-transposed.c -2 +2
+    M ./mpi/rdft-rank-geq2.c -2 +2
+    M ./mpi/rdft-rank1-bigvec.c -2 +2
+    M ./mpi/rdft-serial.c -2 +2
+    M ./mpi/rdft-solve.c -2 +2
+    M ./mpi/rdft2-problem.c -2 +2
+    M ./mpi/rdft2-rank-geq2-transposed.c -2 +2
+    M ./mpi/rdft2-rank-geq2.c -2 +2
+    M ./mpi/rdft2-serial.c -2 +2
+    M ./mpi/rdft2-solve.c -2 +2
+    M ./mpi/rearrange.c -2 +2
+    M ./mpi/testsched.c -2 +2
+    M ./mpi/transpose-alltoall.c -2 +2
+    M ./mpi/transpose-pairwise.c -2 +2
+    M ./mpi/transpose-problem.c -2 +2
+    M ./mpi/transpose-recurse.c -2 +2
+    M ./mpi/transpose-solve.c -2 +2
+    M ./mpi/wisdom-api.c -2 +2
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/codelet-rdft.h -2 +2
+    M ./rdft/conf.c -2 +2
+    M ./rdft/ct-hc2c-direct.c -2 +2
+    M ./rdft/ct-hc2c.c -2 +2
+    M ./rdft/ct-hc2c.h -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/direct-r2c.c -2 +2
+    M ./rdft/direct-r2r.c -2 +2
+    M ./rdft/direct2.c -2 +2
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-direct.c -2 +2
+    M ./rdft/hc2hc-generic.c -2 +2
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/hc2hc.h -2 +2
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/khc2c.c -2 +2
+    M ./rdft/khc2hc.c -2 +2
+    M ./rdft/kr2c.c -2 +2
+    M ./rdft/kr2r.c -2 +2
+    M ./rdft/nop.c -2 +2
+    M ./rdft/nop2.c -2 +2
+    M ./rdft/plan.c -2 +2
+    M ./rdft/plan2.c -2 +2
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rank0-rdft2.c -2 +2
+    M ./rdft/rank0.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-inplace-strides.c -2 +2
+    M ./rdft/rdft2-rdft.c -2 +2
+    M ./rdft/rdft2-strides.c -2 +2
+    M ./rdft/rdft2-tensor-max-index.c -2 +2
+    M ./rdft/scalar/hb.h -2 +2
+    M ./rdft/scalar/hc2c.c -2 +2
+    M ./rdft/scalar/hc2cb.h -2 +2
+    M ./rdft/scalar/hc2cf.h -2 +2
+    M ./rdft/scalar/hf.h -2 +2
+    M ./rdft/scalar/hfb.c -2 +2
+    M ./rdft/scalar/r2c.c -2 +2
+    M ./rdft/scalar/r2cb.h -2 +2
+    M ./rdft/scalar/r2cbIII.h -2 +2
+    M ./rdft/scalar/r2cf.h -2 +2
+    M ./rdft/scalar/r2cfII.h -2 +2
+    M ./rdft/scalar/r2r.c -2 +2
+    M ./rdft/scalar/r2r.h -2 +2
+    M ./rdft/simd/hc2cbv.c -2 +2
+    M ./rdft/simd/hc2cbv.h -2 +2
+    M ./rdft/simd/hc2cfv.c -2 +2
+    M ./rdft/simd/hc2cfv.h -2 +2
+    M ./rdft/solve.c -2 +2
+    M ./rdft/solve2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./reodft/conf.c -2 +2
+    M ./reodft/redft00e-r2hc-pad.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft.h -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./simd/altivec.c -2 +2
+    M ./simd/nonportable/sse.c -2 +2
+    M ./simd/nonportable/sse2.c -2 +2
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse.h -2 +2
+    M ./simd/simd-sse2.h -2 +2
+    M ./simd/simd.h -2 +2
+    M ./simd/sse.c -2 +2
+    M ./simd/sse2.c -2 +2
+    M ./simd/taint.c -2 +2
+    M ./simd/x86-cpuid.h -2 +2
+    M ./threads/api.c -2 +2
+    M ./threads/conf.c -2 +2
+    M ./threads/ct.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/f77api.c -2 +2
+    M ./threads/f77funcs.h -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/openmp.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/threads.c -2 +2
+    M ./threads/threads.h -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+    M ./tools/fftw-wisdom-to-conf.1 -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+    M ./tools/fftw-wisdom.c -2 +2
+    M ./tools/fftw_wisdom.1.in -4 +4
+
+Wed Nov  5 16:40:31 EST 2008  athena@fftw.org
+  * Store GPLv2 in darcs because automake installs GPLv3 these days.
+
+    A ./COPYING
+
+Thu Oct 30 15:03:41 EDT 2008  athena@fftw.org
+  * stylistic changes, comments
+
+    M ./threads/threads.c -13 +17
+
+Thu Oct 30 14:40:14 EDT 2008  athena@fftw.org
+  * oops
+
+    M ./threads/threads.c -27 +28
+
+Thu Oct 30 14:30:08 EDT 2008  athena@fftw.org
+  * simplification of the threading machinery
+
+    M ./threads/threads.c -12 +7
+
+Thu Oct 30 14:22:40 EDT 2008  athena@fftw.org
+  * typo
+
+    M ./threads/threads.c -1 +1
+
+Thu Oct 30 13:42:07 EDT 2008  athena@fftw.org
+  * [SECOND ATTEMPT] do not assume that a semaphore can be freed just because nobody is using it
+  
+    Let S be a semaphore, initially 0.  Let thread A execute UP(S);
+    let thread B execute DOWN(S); free(&S);  It is unclear whether this
+    code is correct with posix semaphores.  The problem is whether UP()
+    uses S after allowing DOWN() to continue; this seems to be the
+    case in the glibc-2.7 implementation, and thus the pattern above
+    seems to be incorrect.  Avoid using such a pattern, and introduce
+    a global semaphore for the unavoidable case when nothing else
+    can be depended upon.
+  
+
+    M ./threads/threads.c -61 +80
+
+Wed Oct 29 20:09:39 EDT 2008  stevenj@fftw.org
+  * updated cpu codes from x86-1.21
+
+    M ./m4/ax_gcc_archflag.m4 -5 +7
+
+Wed Oct 29 16:24:16 EDT 2008  athena@fftw.org
+  * Previous change was bogus, need to find another way.
+
+    M ./threads/threads.c -56 +42
+
+Wed Oct 29 12:22:20 EDT 2008  athena@fftw.org
+  * do not assume that a semaphore can be freed just because nobody is using it
+  
+  Let S be a semaphore, initially 0.  Let thread A execute UP(S);
+  let thread B execute DOWN(S); free(&S);  It is unclear whether this
+  code is correct with posix semaphores.  The problem is whether UP()
+  uses S after allowing DOWN() to continue; this seems to be the
+  case in the glibc-2.7 implementation, and thus the pattern above
+  seems to be incorrect.  Avoid using such a pattern, and introduce
+  a global semaphore for the unavoidable case when nothing else
+  can be depended upon.
+  
+
+    M ./threads/threads.c -42 +56
+
+Mon Oct 27 23:38:02 EDT 2008  stevenj@fftw.org
+  * don't need PROG_AS any more
+
+    M ./configure.ac -1
+    R ./m4/amx_prog_as.m4
+
+Sun Oct 26 23:41:11 EDT 2008  stevenj@fftw.org
+  * use AC_CONFIG_MACRO_DIR macro
+
+    M ./Makefile.am -1 +1
+    M ./configure.ac +1
+
+Sun Oct 26 10:08:44 EDT 2008  athena@fftw.org
+  * Remove mpi/Makefile from configure.ac
+  Otherwise, the tarball breaks because mpi/ is not in
+  the distribution.
+
+    M ./configure.ac -2
+
+Sat Oct 25 17:13:50 EDT 2008  stevenj@fftw.org
+  * remove MPI from dist until FFTW 3.3
+
+    M ./Makefile.am -1 +1
+    M ./NEWS +3
+    M ./doc/fftw3.texi -948 +9
+
+Sat Oct 25 17:12:35 EDT 2008  stevenj@fftw.org
+  * use MPIRUN even for -np 1
+
+    M ./mpi/Makefile.am -1 +1
+
+Fri Jul 18 17:17:08 EDT 2008  stevenj@fftw.org
+  * use new gcc arch=native flag as fallback
+
+    M ./m4/ax_gcc_archflag.m4 -5 +5
+
+Sat Oct 25 13:36:40 EDT 2008  athena@fftw.org
+  * Use sem_t to implement mutexes
+  Use sem_t instead of pthread_mutex_t to implement mutexes.
+  It seems like pthread mutexes hang on linux-2.6.22 after several
+  days of tests; the hang does not occur on linux >= 2.6.24 
+  or when we use sem_t instead of pthread_mutex_t.  The
+  situation is still quite mysterious but this code seems to
+  work.
+
+    M ./threads/threads.c -22 +34
+
+Thu Oct 23 13:32:45 EDT 2008  athena@fftw.org
+  * print informative message when pstring is NULL.
+
+    M ./libbench2/verify.c -1 +2
+
+Sun Oct 19 16:00:07 EDT 2008  athena@fftw.org
+  * Fix incorrect alignment in dftw-generic.
+  
+  Multithreaded dftw-generic is supposed to process only a slice
+  of the array, but we were planning with the alignment of the
+  original array rather than the slice.  This led to unaligned
+  accesses in certain obscure situations.
+  
+
+    M ./NEWS +3
+    M ./configure.ac -1 +1
+    M ./dft/dftw-generic.c -1 +2
+
+Mon Aug 18 17:27:26 EDT 2008  Matteo Frigo <athena@fftw.org>
+  * Paranoia: do not create OS threads while holding locks.
+  
+  Glibc at least plays silly games such as keeping a global variable
+  that records whether there is more than one thread in the process, and
+  it does not perform atomic operations if the variable says that there
+  is only one thread.  Who knows how this interacts with creating
+  threads while holding a lock.  Some day some genius will come up with
+  some ``optimization'' that breaks everything.
+
+    M ./threads/threads.c -7 +13
+
+Wed Aug  6 07:41:46 EDT 2008  athena@fftw.org
+  * Welcome to the quadcore era
+
+    M ./mkdist.sh -1 +1
+
+Mon Jun 16 16:46:39 EDT 2008  stevenj@fftw.org
+  * backslash is technically not allowed in "echo" arguments; thanks to Debian Bug#486046 for pointing out problem and solution (and Raphael Geissert and Vincent Zweije, in particular)
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Sun May  4 12:15:24 EDT 2008  stevenj@fftw.org
+  * note problem with test program in gcc 4.1.2-4.2; thanks to Raymond Rogers for reporting it
+
+    M ./doc/FAQ/fftw-faq.bfnn +6
+
+Fri May  2 19:21:30 EDT 2008  stevenj@fftw.org
+  * output count of constants along with other statistics
+
+    M ./genfft/c.ml -3 +8
+
+Sat Apr 19 14:15:03 EDT 2008  athena@fftw.org
+  * Lower priority of unaligned SIMD codelets.
+  List t1[fb]uv_* codelets before the corresponding
+  aligned codelets, since the estimator picks the
+  latter ones in case of a tie and aligned codelets
+  are preferable.
+  
+  In other words, this is a hack.
+
+    M ./dft/simd/codelets/Makefile.am -3 +2
+
+Sat Apr 19 08:55:46 EDT 2008  Matteo Frigo <athena@fftw.org>
+  * There is no point in using higher radices for unaligned codelets.
+  
+
+    M ./dft/simd/codelets/Makefile.am -4 +2
+
+Fri Apr 18 19:01:27 EDT 2008  stevenj@fftw.org
+  * support generating loopless, strideless r2r codelets
+
+    M ./genfft/gen_r2r.ml -7 +17
+
+Fri Apr 18 19:00:25 EDT 2008  stevenj@fftw.org
+  * added Magic.threemult to use 3+3 complex-multiply variant when possible
+
+    M ./genfft/complex.ml -2 +13
+    M ./genfft/magic.ml +4
+
+Thu Apr 10 19:53:31 EDT 2008  stevenj@fftw.org
+  * fix documentation bug - export_wisdom_to_string returns a string that should be deallocated with free, not fftw_free (thanks to Stein Vidar Hagfors Haugan for the bug report)
+
+    M ./doc/fftw3.texi -1 +1
+
+Mon Jan 21 01:11:44 EST 2008  stevenj@fftw.org
+  * bsd calls x86_64 "amd64"; thanks to Fernando Herrero Carron for the bug report
+
+    M ./m4/ax_gcc_archflag.m4 -3 +3
+
+Tue Jan  1 12:29:56 EST 2008  stevenj@fftw.org
+  * fix typo in manual, thanks to Yinon Ehrlich
+
+    M ./doc/fftw3.texi -1 +1
+
+Mon Dec  3 13:57:13 EST 2007  stevenj@fftw.org
+  * note problem with gcc 3.4.4 on x86_64, thanks to  Uwe Hollerbach for the report
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +4
+
+Tue Nov 13 16:19:22 EST 2007  stevenj@fftw.org
+  * bump shared-lib revision
+
+    M ./configure.ac -1 +1
+
+Tue Nov 13 16:16:49 EST 2007  stevenj@fftw.org
+  * update NEWS for alpha3
+
+    M ./NEWS +23
+
+Tue Oct  2 13:53:04 EDT 2007  stevenj@fftw.org
+  * fixed URL
+
+    M ./m4/ax_openmp.m4 -1 +1
+
+Mon Sep 17 19:38:29 EDT 2007  stevenj@fftw.org
+  * added missing prototype
+
+    M ./tests/fftw-bench.c +4
+
+Tue Aug 14 22:35:06 EDT 2007  stevenj@fftw.org
+  * terminology tweak
+
+    M ./doc/fftw3.texi -1 +1
+
+Wed Aug  1 18:44:21 EDT 2007  stevenj@fftw.org
+  * check for pathscale compilers (thanks to Julian Cummings)
+
+    M ./m4/ax_compiler_vendor.m4 -2 +2
+
+Sat Sep 15 18:02:32 EDT 2007  athena@fftw.org
+  * Avoid possible conflict with Windows include files.
+
+    M ./threads/threads.c +5
+
+Tue Aug  7 21:26:05 EDT 2007  athena@fftw.org
+  * Distribute codlist.c for SIMD codelets in the commercial tarball.
+
+    M ./commercialize.sh -1 +1
+
+Wed Aug  1 10:33:41 EDT 2007  stevenj@fftw.org
+  * some documentation clarifications, and documented FFTW_WISDOM_ONLY, at the suggestion of Mario Emmenlauer and Phil Dumont
+
+    M ./api/apiplan.c -3 +3
+    M ./doc/fftw3.texi -3 +24
+
+Tue Jul 31 16:52:56 EDT 2007  stevenj@fftw.org
+  * bug fix in test program for vrank-3 transpose plans with vl=1
+
+    M ./mpi/mpi-bench.c -1 +1
+
+Sun Jul 29 17:02:46 EDT 2007  stevenj@fftw.org
+  * only run mpi checks for --enable-mpi
+
+    M ./mpi/Makefile.am +4
+
+Sun Jul 29 16:45:30 EDT 2007  stevenj@fftw.org
+  * check for NULL return from spe_context_create in case SPE_MAP_PS not supported
+
+    M ./cell/cell.c -4 +9
+
+Sun Jul 29 15:56:57 EDT 2007  stevenj@fftw.org
+  * use problem-state pointer to write SPE mailbox with lower latency (makes a significant performance difference for N < 32k), thanks to Jan Wagner for suggestion
+
+    M ./cell/cell.c +16
+
+Sun Jul 29 14:22:08 EDT 2007  stevenj@fftw.org
+  * port cell code to SDK2.1 (libspe2), since libspe1 API is deprecated and can't be used in code that also uses libspe2 API
+
+    M ./cell/cell.c -3 +49
+    M ./cell/copy.c -1 +1
+    M ./cell/dft-direct-cell.c -1 +1
+    M ./cell/fftw-cell.h -4 +5
+    M ./cell/spu/main.spuc -4 +4
+    M ./cell/transpose.c -1 +1
+    M ./configure.ac -2 +2
+
+Sun Jul 29 11:46:24 EDT 2007  stevenj@fftw.org
+  * bug fix: ego->W allocated with cell_aligned_malloc, so deallocate with free, not X(ifree0)
+
+    M ./cell/dft-direct-cell.c -1 +1
+
+Mon Jul  2 15:57:12 EDT 2007  stevenj@fftw.org
+  * removed obsolete reference to CVS id
+
+    M ./api/fftw3.h -4
+
+Mon May 21 14:25:39 EDT 2007  athena@fftw.org
+  * cycle counter for sun compiler
+
+    M ./kernel/cycle.h -1 +1
+
+Wed May  9 19:49:11 EDT 2007  stevenj@fftw.org
+  * use __inline instead of inline for AIX routines (__inline is supported by gcc and xlc, whereas apparently "inline" is only supported by xlc if you specify -qlanglvl=stdc99 or similar); thanks to Jeff Haferman for the bug report
+
+    M ./kernel/cycle.h -2 +2
+
+Mon Apr 30 15:37:56 EDT 2007  stevenj@fftw.org
+  * fixed incorrect type prefix (fftw_ vs. X(...)) in mpi/wisdom-api.c; thanks to Eric A. Borisch for the bug report
+
+    M ./mpi/wisdom-api.c -6 +6
+
+Wed Apr 25 21:21:39 EDT 2007  stevenj@fftw.org
+  * some cleanups in MPI make check
+
+    M ./mpi/Makefile.am -7 +11
+
+Wed Apr 25 21:19:27 EDT 2007  stevenj@fftw.org
+  * re-enable heuristic in the common case where we are not compiling for Cell
+
+    M ./dft/rank-geq2.c -1 +3
+
+Tue Apr 24 17:42:43 EDT 2007  athena@fftw.org
+  * Removed duplicate codelet names, was breaking linker.
+
+    M ./dft/simd/codelets/Makefile.am -4 +4
+
+Tue Apr 24 11:38:16 EDT 2007  stevenj@fftw.org
+  * added more codelets of sizes 5/10/20/25 to improve speed for round decimal sizes (speed improvements of 10-20%, at cost of 10-30% in library size)
+
+    M ./dft/scalar/codelets/Makefile.am -3 +6
+    M ./dft/simd/codelets/Makefile.am -14 +26
+    M ./rdft/scalar/r2cb/Makefile.am -8 +16
+    M ./rdft/scalar/r2cf/Makefile.am -8 +16
+    M ./rdft/simd/codelets/Makefile.am -2 +4
+
+Sat Mar 24 18:40:47 EDT 2007  stevenj@fftw.org
+  * for 1d prime sizes, punt and return serial plan
+
+    M ./mpi/api.c +4
+
+Sat Mar 24 18:24:55 EDT 2007  stevenj@fftw.org
+  * output reminders of the problem during bench --verify
+
+    M ./libbench2/verify.c -2 +5
+
+Sat Mar 24 18:10:24 EDT 2007  stevenj@fftw.org
+  * bug fix - missing solver->destroy initializer in rdft2-rdft
+
+    M ./rdft/rdft2-rdft.c -1 +1
+
+Fri Mar 23 11:12:19 EDT 2007  stevenj@fftw.org
+  * -static, in --enable-debug, doesn't work on MacOS X (according to Daniel Oberhoff)
+
+    M ./configure.ac -1
+
+Wed Mar 21 22:23:06 EDT 2007  stevenj@fftw.org
+  * fix MPI r2c/c2r to work with howmany > 1
+
+    M ./mpi/rdft2-rank-geq2-transposed.c -1 +2
+    M ./mpi/rdft2-rank-geq2.c -1 +2
+    M ./mpi/rdft2-serial.c -2 +3
+    M ./tests/check.pl -1
+
+Wed Mar 21 18:44:41 EDT 2007  stevenj@fftw.org
+  * rm MPI version from TODO
+
+    M ./TODO -2
+
+Wed Mar 21 18:34:40 EDT 2007  stevenj@fftw.org
+  * added 'make bigcheck' for MPI (no paranoid-check, unfortunately), and properly get MPIRUN from configure
+
+    M ./configure.ac +2
+    M ./mpi/Makefile.am -7 +10
+
+Wed Mar 21 18:23:18 EDT 2007  stevenj@fftw.org
+  * bug fix - incorrect local_size returned for 1d bigvec case
+
+    M ./mpi/api.c -1 +1
+
+Wed Mar 21 03:13:54 EDT 2007  stevenj@fftw.org
+  * hack to specify MPI_TRANSPOSED_IN/OUT via "[" and "]" in libbench2 problem
+
+    M ./libbench2/bench-user.h +1
+    M ./libbench2/problem.c +6
+    M ./mpi/mpi-bench.c +12
+    M ./tests/check.pl -2 +7
+
+Wed Mar 21 02:58:11 EDT 2007  stevenj@fftw.org
+  * added MPI 'make check', still needs a bit of work
+
+    M ./mpi/Makefile.am +28
+    M ./tests/check.pl +31
+
+Wed Mar 21 02:47:10 EDT 2007  stevenj@fftw.org
+  * bug fix in r2r transposed-input case
+
+    M ./mpi/rdft-problem.c +3
+
+Wed Mar 21 02:46:25 EDT 2007  stevenj@fftw.org
+  * don't output more than 300 erroneous outputs (unless verbose > 2)
+
+    M ./libbench2/verify-lib.c -2 +3
+    M ./libbench2/verify-r2r.c -2 +3
+
+Wed Mar 21 01:48:54 EDT 2007  stevenj@fftw.org
+  * fixed bug in transposed-in c2r MPI transforms ... seems to be working, finally
+
+    M ./mpi/rdft2-problem.c -5 +6
+
+Wed Mar 21 00:41:32 EDT 2007  stevenj@fftw.org
+  * some fixes to MPI r2c/c2r transforms with transposed output/input
+
+    M ./mpi/rdft2-rank-geq2-transposed.c -11 +17
+
+Wed Mar 21 00:40:25 EDT 2007  stevenj@fftw.org
+  * typos
+
+    M ./NEWS -4 +4
+
+Tue Mar 20 19:53:02 EDT 2007  stevenj@fftw.org
+  * bug fix for mpi-bench with r2c/c2r: allocate a little bit extra to make sure that padding is allocated
+
+    M ./libbench2/allocate.c -2 +4
+
+Tue Mar 20 19:19:13 EDT 2007  stevenj@fftw.org
+  * fix typo, thanks to Ernest Turro for the bug report
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Tue Mar 20 01:39:06 EDT 2007  stevenj@fftw.org
+  * spacing tweaks
+
+    M ./doc/fftw3.texi -2 +2
+
+Tue Mar 20 00:53:11 EDT 2007  stevenj@fftw.org
+  * Ralf Wildenhues is the one who pointed out that the self-communication could fill in the stalls in the pairwise schedule
+
+    M ./mpi/testsched.c -3 +2
+
+Tue Mar 20 00:22:25 EDT 2007  stevenj@fftw.org
+  * add TRANSPOSED_OUT/IN support for r2c/c2r, respectively
+
+    M ./doc/fftw3.texi -4 +1
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/README -3
+    M ./mpi/api.c -6 +15
+    M ./mpi/conf.c -1 +1
+    M ./mpi/rdft2-problem.c -18 +9
+    A ./mpi/rdft2-rank-geq2-transposed.c
+
+Mon Mar 19 21:45:34 EDT 2007  stevenj@fftw.org
+  * yikes! fixed likely deadlock bug in MPI
+
+    M ./mpi/transpose-recurse.c -2 +2
+
+Mon Mar 19 21:38:52 EDT 2007  stevenj@fftw.org
+  * comment
+
+    M ./mpi/transpose-pairwise.c -2 +8
+
+Mon Mar 19 21:30:44 EDT 2007  stevenj@fftw.org
+  * s/alpha1/alpha2/
+
+    M ./NEWS -1 +1
+
+Mon Mar 19 00:39:47 EDT 2007  stevenj@fftw.org
+  * include README in dist tarball
+
+    M ./mpi/Makefile.am -1 +1
+
+Mon Mar 19 00:35:43 EDT 2007  stevenj@fftw.org
+  * added MPI r2c/c2r transforms, some more documentation
+
+    M ./NEWS -4 +9
+    M ./doc/fftw3.texi -6 +73
+    M ./libbench2/bench-user.h +2
+    M ./libbench2/problem.c -2 +4
+    M ./mpi/Makefile.am -1 +2
+    A ./mpi/README
+    M ./mpi/api.c +140
+    M ./mpi/conf.c +4
+    M ./mpi/fftw3-mpi.h -88 +116
+    M ./mpi/mpi-bench.c -2 +102
+    A ./mpi/mpi-rdft2.h
+    A ./mpi/rdft2-problem.c
+    A ./mpi/rdft2-rank-geq2.c
+    A ./mpi/rdft2-serial.c
+    A ./mpi/rdft2-solve.c
+
+Sun Mar 18 23:14:29 EDT 2007  stevenj@fftw.org
+  * set version to 3.2alpha2
+
+    M ./configure.ac -1 +1
+
+Sun Mar 18 19:12:18 EDT 2007  stevenj@fftw.org
+  * changed --enable-mips_ps to --enable-mips-ps; added Cell section to manual (from README.Cell); many minor updates to manual
+
+    M ./configure.ac -1 +1
+    M ./doc/fftw3.texi -15 +243
+
+Sun Mar 18 15:27:06 EDT 2007  stevenj@fftw.org
+  * whoops, need to sync costs in problem_mpi_rdft
+
+    M ./mpi/api.c +3
+
+Sun Mar 18 12:44:49 EDT 2007  stevenj@fftw.org
+  * documented guru64 interface
+
+    M ./NEWS -1 +1
+    M ./doc/fftw3.texi -11 +81
+
+Sun Mar 18 02:57:46 EDT 2007  stevenj@fftw.org
+  * typo
+
+    M ./doc/fftw3.texi -1 +1
+
+Sun Mar 18 02:45:09 EDT 2007  stevenj@fftw.org
+  * bumped copyright year to 2007
+
+    M ./COPYRIGHT -2 +2
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -2 +2
+    M ./api/configure.c -2 +2
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute-split-dft-c2r.c -2 +2
+    M ./api/execute-split-dft-r2c.c -2 +2
+    M ./api/execute-split-dft.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -2 +2
+    M ./api/f77api.c -2 +2
+    M ./api/f77funcs.h -2 +2
+    M ./api/fftw3.h -2 +2
+    M ./api/flops.c -2 +2
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./api/import-wisdom.c -2 +2
+    M ./api/malloc.c -2 +2
+    M ./api/map-r2r-kind.c -2 +2
+    M ./api/mapflags.c -2 +2
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.h -2 +2
+    M ./api/mktensor-rowmajor.c -2 +2
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -2 +2
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -2 +2
+    M ./api/plan-dft.c -2 +2
+    M ./api/plan-guru-dft-c2r.h -2 +2
+    M ./api/plan-guru-dft-r2c.h -2 +2
+    M ./api/plan-guru-dft.h -2 +2
+    M ./api/plan-guru-r2r.h -2 +2
+    M ./api/plan-guru-split-dft-c2r.h -2 +2
+    M ./api/plan-guru-split-dft-r2c.h -2 +2
+    M ./api/plan-guru-split-dft.h -2 +2
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-dft.c -2 +2
+    M ./api/plan-many-r2r.c -2 +2
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -2 +2
+    M ./api/print-plan.c -2 +2
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -2 +2
+    M ./api/version.c -2 +2
+    M ./api/x77.h -2 +2
+    M ./dft/bluestein.c -2 +2
+    M ./dft/buffered.c -2 +2
+    M ./dft/codelet-dft.h -2 +2
+    M ./dft/conf.c -2 +2
+    M ./dft/ct.c -2 +2
+    M ./dft/ct.h -2 +2
+    M ./dft/dft.h -2 +2
+    M ./dft/dftw-direct.c -2 +2
+    M ./dft/dftw-directsq.c -2 +2
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect-transpose.c -2 +2
+    M ./dft/indirect.c -2 +2
+    M ./dft/kdft-dif.c -2 +2
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/kdft-dit.c -2 +2
+    M ./dft/kdft.c -2 +2
+    M ./dft/nop.c -2 +2
+    M ./dft/plan.c -2 +2
+    M ./dft/problem.c -2 +2
+    M ./dft/rader.c -2 +2
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/scalar/n.c -2 +2
+    M ./dft/scalar/n.h -2 +2
+    M ./dft/scalar/t.c -2 +2
+    M ./dft/scalar/t.h -2 +2
+    M ./dft/simd/n1b.c -2 +2
+    M ./dft/simd/n1b.h -2 +2
+    M ./dft/simd/n1f.c -2 +2
+    M ./dft/simd/n1f.h -2 +2
+    M ./dft/simd/n2b.c -2 +2
+    M ./dft/simd/n2b.h -2 +2
+    M ./dft/simd/n2f.c -2 +2
+    M ./dft/simd/n2f.h -2 +2
+    M ./dft/simd/n2s.c -2 +2
+    M ./dft/simd/n2s.h -2 +2
+    M ./dft/simd/q1b.c -2 +2
+    M ./dft/simd/q1b.h -2 +2
+    M ./dft/simd/q1f.c -2 +2
+    M ./dft/simd/q1f.h -2 +2
+    M ./dft/simd/t.c -2 +2
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1bu.h -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/simd/t1fu.h -2 +2
+    M ./dft/simd/t2b.h -2 +2
+    M ./dft/simd/t2f.h -2 +2
+    M ./dft/simd/t3b.h -2 +2
+    M ./dft/simd/t3f.h -2 +2
+    M ./dft/simd/ts.c -2 +2
+    M ./dft/simd/ts.h -2 +2
+    M ./dft/solve.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./dft/zero.c -2 +2
+    M ./doc/f77_wisdom.f -2 +2
+    M ./genfft/algsimp.ml -2 +2
+    M ./genfft/algsimp.mli -2 +2
+    M ./genfft/annotate.ml -2 +2
+    M ./genfft/annotate.mli -2 +2
+    M ./genfft/assoctable.ml -2 +2
+    M ./genfft/assoctable.mli -2 +2
+    M ./genfft/c.ml -2 +2
+    M ./genfft/c.mli -2 +2
+    M ./genfft/complex.ml -2 +2
+    M ./genfft/complex.mli -2 +2
+    M ./genfft/conv.ml -2 +2
+    M ./genfft/conv.mli -2 +2
+    M ./genfft/dag.ml -2 +2
+    M ./genfft/dag.mli -2 +2
+    M ./genfft/expr.ml -2 +2
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/fft.ml -2 +2
+    M ./genfft/fft.mli -2 +2
+    M ./genfft/gen_hc2c.ml -2 +2
+    M ./genfft/gen_hc2cdft.ml -2 +2
+    M ./genfft/gen_hc2cdft_c.ml -2 +2
+    M ./genfft/gen_hc2hc.ml -2 +2
+    M ./genfft/gen_mdct.ml -2 +2
+    M ./genfft/gen_notw.ml -2 +2
+    M ./genfft/gen_notw_c.ml -2 +2
+    M ./genfft/gen_r2cb.ml -2 +2
+    M ./genfft/gen_r2cf.ml -2 +2
+    M ./genfft/gen_r2r.ml -2 +2
+    M ./genfft/gen_twiddle.ml -2 +2
+    M ./genfft/gen_twiddle_c.ml -2 +2
+    M ./genfft/gen_twidsq.ml -2 +2
+    M ./genfft/gen_twidsq_c.ml -2 +2
+    M ./genfft/genutil.ml -2 +2
+    M ./genfft/littlesimp.ml -2 +2
+    M ./genfft/littlesimp.mli -2 +2
+    M ./genfft/magic.ml -2 +2
+    M ./genfft/monads.ml -2 +2
+    M ./genfft/number.ml -2 +2
+    M ./genfft/number.mli -2 +2
+    M ./genfft/oracle.ml -2 +2
+    M ./genfft/oracle.mli -2 +2
+    M ./genfft/schedule.ml -2 +2
+    M ./genfft/schedule.mli -2 +2
+    M ./genfft/simd.ml -2 +2
+    M ./genfft/simd.mli -2 +2
+    M ./genfft/simdmagic.ml -2 +2
+    M ./genfft/to_alist.ml -2 +2
+    M ./genfft/to_alist.mli -2 +2
+    M ./genfft/trig.ml -2 +2
+    M ./genfft/trig.mli -2 +2
+    M ./genfft/twiddle.ml -2 +2
+    M ./genfft/twiddle.mli -2 +2
+    M ./genfft/unique.ml -2 +2
+    M ./genfft/unique.mli -2 +2
+    M ./genfft/util.ml -2 +2
+    M ./genfft/util.mli -2 +2
+    M ./genfft/variable.ml -2 +2
+    M ./genfft/variable.mli -2 +2
+    M ./kernel/align.c -2 +2
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/assert.c -2 +2
+    M ./kernel/awake.c -2 +2
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/cpy1d.c -2 +2
+    M ./kernel/cpy2d-pair.c -2 +2
+    M ./kernel/cpy2d.c -2 +2
+    M ./kernel/ct.c -2 +2
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/debug.c -2 +2
+    M ./kernel/extract-reim.c -2 +2
+    M ./kernel/hash.c -2 +2
+    M ./kernel/iabs.c -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/kalloc.c -2 +2
+    M ./kernel/md5-1.c -2 +2
+    M ./kernel/md5.c -2 +2
+    M ./kernel/minmax.c -2 +2
+    M ./kernel/ops.c -2 +2
+    M ./kernel/pickdim.c -2 +2
+    M ./kernel/plan.c -2 +2
+    M ./kernel/primes.c -2 +2
+    M ./kernel/print.c -2 +2
+    M ./kernel/problem.c -2 +2
+    M ./kernel/rader.c -2 +2
+    M ./kernel/scan.c -2 +2
+    M ./kernel/solver.c -2 +2
+    M ./kernel/solvtab.c -2 +2
+    M ./kernel/stride.c -2 +2
+    M ./kernel/tensor.c -2 +2
+    M ./kernel/tensor1.c -2 +2
+    M ./kernel/tensor2.c -2 +2
+    M ./kernel/tensor3.c -2 +2
+    M ./kernel/tensor4.c -2 +2
+    M ./kernel/tensor5.c -2 +2
+    M ./kernel/tensor7.c -2 +2
+    M ./kernel/tensor8.c -2 +2
+    M ./kernel/tensor9.c -2 +2
+    M ./kernel/tile2d.c -2 +2
+    M ./kernel/timer.c -2 +2
+    M ./kernel/transpose.c -2 +2
+    M ./kernel/trig.c -2 +2
+    M ./kernel/twiddle.c -2 +2
+    M ./libbench2/dotens2.c -2 +2
+    M ./libbench2/my-getopt.c -2 +2
+    M ./libbench2/my-getopt.h -2 +2
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -2 +2
+    M ./libbench2/verify-r2r.c -2 +2
+    M ./libbench2/verify-rdft2.c -2 +2
+    M ./libbench2/verify.h -2 +2
+    M ./mpi/any-true.c -2 +2
+    M ./mpi/api.c -2 +2
+    M ./mpi/block.c -2 +2
+    M ./mpi/choose-radix.c -2 +2
+    M ./mpi/conf.c -2 +2
+    M ./mpi/dft-problem.c -2 +2
+    M ./mpi/dft-rank-geq2-transposed.c -2 +2
+    M ./mpi/dft-rank-geq2.c -2 +2
+    M ./mpi/dft-rank1-bigvec.c -2 +2
+    M ./mpi/dft-rank1.c -2 +2
+    M ./mpi/dft-serial.c -2 +2
+    M ./mpi/dft-solve.c -2 +2
+    M ./mpi/dtensor.c -2 +2
+    M ./mpi/fftw3-mpi.h -2 +2
+    M ./mpi/ifftw-mpi.h -2 +2
+    M ./mpi/mpi-dft.h -2 +2
+    M ./mpi/mpi-rdft.h -2 +2
+    M ./mpi/mpi-transpose.h -2 +2
+    M ./mpi/rdft-problem.c -2 +2
+    M ./mpi/rdft-rank-geq2-transposed.c -2 +2
+    M ./mpi/rdft-rank-geq2.c -2 +2
+    M ./mpi/rdft-rank1-bigvec.c -2 +2
+    M ./mpi/rdft-serial.c -2 +2
+    M ./mpi/rdft-solve.c -2 +2
+    M ./mpi/rearrange.c -2 +2
+    M ./mpi/testsched.c -2 +2
+    M ./mpi/transpose-alltoall.c -2 +2
+    M ./mpi/transpose-pairwise.c -2 +2
+    M ./mpi/transpose-problem.c -2 +2
+    M ./mpi/transpose-recurse.c -2 +2
+    M ./mpi/transpose-solve.c -2 +2
+    M ./mpi/wisdom-api.c -2 +2
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/codelet-rdft.h -2 +2
+    M ./rdft/conf.c -2 +2
+    M ./rdft/ct-hc2c-direct.c -2 +2
+    M ./rdft/ct-hc2c.c -2 +2
+    M ./rdft/ct-hc2c.h -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/direct-r2c.c -2 +2
+    M ./rdft/direct-r2r.c -2 +2
+    M ./rdft/direct2.c -2 +2
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-direct.c -2 +2
+    M ./rdft/hc2hc-generic.c -2 +2
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/hc2hc.h -2 +2
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/khc2c.c -2 +2
+    M ./rdft/khc2hc.c -2 +2
+    M ./rdft/kr2c.c -2 +2
+    M ./rdft/kr2r.c -2 +2
+    M ./rdft/nop.c -2 +2
+    M ./rdft/nop2.c -2 +2
+    M ./rdft/plan.c -2 +2
+    M ./rdft/plan2.c -2 +2
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rank0-rdft2.c -2 +2
+    M ./rdft/rank0.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-inplace-strides.c -2 +2
+    M ./rdft/rdft2-rdft.c -2 +2
+    M ./rdft/rdft2-strides.c -2 +2
+    M ./rdft/rdft2-tensor-max-index.c -2 +2
+    M ./rdft/scalar/hb.h -2 +2
+    M ./rdft/scalar/hc2c.c -2 +2
+    M ./rdft/scalar/hc2cb.h -2 +2
+    M ./rdft/scalar/hc2cf.h -2 +2
+    M ./rdft/scalar/hf.h -2 +2
+    M ./rdft/scalar/hfb.c -2 +2
+    M ./rdft/scalar/r2c.c -2 +2
+    M ./rdft/scalar/r2cb.h -2 +2
+    M ./rdft/scalar/r2cbIII.h -2 +2
+    M ./rdft/scalar/r2cf.h -2 +2
+    M ./rdft/scalar/r2cfII.h -2 +2
+    M ./rdft/scalar/r2r.c -2 +2
+    M ./rdft/scalar/r2r.h -2 +2
+    M ./rdft/simd/hc2cbv.c -2 +2
+    M ./rdft/simd/hc2cbv.h -2 +2
+    M ./rdft/simd/hc2cfv.c -2 +2
+    M ./rdft/simd/hc2cfv.h -2 +2
+    M ./rdft/solve.c -2 +2
+    M ./rdft/solve2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./reodft/conf.c -2 +2
+    M ./reodft/redft00e-r2hc-pad.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft.h -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./simd/altivec.c -2 +2
+    M ./simd/nonportable/sse.c -2 +2
+    M ./simd/nonportable/sse2.c -2 +2
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse.h -2 +2
+    M ./simd/simd-sse2.h -2 +2
+    M ./simd/simd.h -2 +2
+    M ./simd/sse.c -2 +2
+    M ./simd/sse2.c -2 +2
+    M ./simd/taint.c -2 +2
+    M ./simd/x86-cpuid.h -2 +2
+    M ./threads/api.c -2 +2
+    M ./threads/conf.c -2 +2
+    M ./threads/ct.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/f77api.c -2 +2
+    M ./threads/f77funcs.h -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/openmp.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/threads.c -2 +2
+    M ./threads/threads.h -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+    M ./tools/fftw-wisdom-to-conf.1 -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+    M ./tools/fftw-wisdom.c -2 +2
+    M ./tools/fftw_wisdom.1.in -4 +4
+
+Sun Mar 18 01:41:40 EDT 2007  stevenj@fftw.org
+  * noted CodeSourcery in AUTHORS
+
+    M ./AUTHORS -1 +2
+
+Sun Mar 18 01:25:00 EDT 2007  stevenj@fftw.org
+  * more MPI documentation
+
+    M ./doc/fftw3.texi -2 +61
+
+Sat Mar 17 23:15:04 EDT 2007  stevenj@fftw.org
+  * added MPI multi-dimensional rdft solvers & tests
+
+    M ./mpi/Makefile.am -2 +4
+    M ./mpi/api.c +104
+    M ./mpi/conf.c +5
+    M ./mpi/fftw3-mpi.h -73 +90
+    M ./mpi/mpi-bench.c -1 +94
+    A ./mpi/mpi-rdft.h
+    A ./mpi/rdft-problem.c
+    A ./mpi/rdft-rank-geq2-transposed.c
+    A ./mpi/rdft-rank-geq2.c
+    A ./mpi/rdft-rank1-bigvec.c
+    A ./mpi/rdft-serial.c
+    A ./mpi/rdft-solve.c
+
+Sat Mar 17 22:52:00 EDT 2007  stevenj@fftw.org
+  * whoops
+
+    M ./mpi/dft-rank1-bigvec.c -1 +1
+
+Sat Mar 17 22:43:54 EDT 2007  stevenj@fftw.org
+  * clarification - fftw_mpi_init should be called before importing wisdom
+
+    M ./doc/fftw3.texi -2 +4
+
+Sat Mar 17 19:49:37 EDT 2007  stevenj@fftw.org
+  * kindx/y/z -> kind0/1/2 for consistency
+
+    M ./api/fftw3.h -3 +3
+    M ./doc/fftw3.texi -9 +9
+
+Sat Mar 17 19:34:02 EDT 2007  stevenj@fftw.org
+  * typo
+
+    M ./mpi/dft-rank-geq2-transposed.c -1 +1
+
+Sat Mar 17 19:14:16 EDT 2007  stevenj@fftw.org
+  * some refactoring in preparation for mpi-rdft
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/api.c -1 +1
+    M ./mpi/dft-problem.c -2 +2
+    M ./mpi/dft-rank1-bigvec.c -53 +5
+    M ./mpi/dtensor.c -3 +3
+    M ./mpi/ifftw-mpi.h -1 +29
+    A ./mpi/rearrange.c
+
+Sat Mar 17 18:12:45 EDT 2007  stevenj@fftw.org
+  * documented more stuff for MPI
+
+    M ./doc/fftw3.texi -2 +112
+
+Sat Mar 17 15:41:23 EDT 2007  stevenj@fftw.org
+  * added NEWS for 3.2alpha
+
+    M ./NEWS -1 +37
+
+Sat Mar 17 14:50:22 EDT 2007  stevenj@fftw.org
+  * documented MPI transpose routines
+
+    M ./doc/fftw3.texi -1 +130
+
+Sat Mar 17 08:57:30 EDT 2007  athena@fftw.org
+  * Removed unused variables
+
+    M ./rdft/direct-r2c.c -3
+
+Fri Mar 16 14:47:10 EDT 2007  athena@fftw.org
+  * Preparing for interim release of Cell code.
+
+    M ./AUTHORS +2
+    M ./configure.ac -1 +1
+    M ./doc/fftw3.texi +3
+
+Thu Feb  8 12:23:43 EST 2007  athena@fftw.org
+  * Added README.Cell
+
+    A ./README.Cell
+    M ./Makefile.am -1 +2
+    M ./README.Cell +103
+
+Sat Mar 10 19:17:40 EST 2007  athena@fftw.org
+  * Synchronized with main branch
+
+    M ./simd/simd.h +10
+
+Mon Jan 22 17:43:56 EST 2007  athena@fftw.org
+  * Adapted vrecur heuristic to Cell.
+
+    M ./cell/dft-direct-cell.c -1 +28
+    M ./dft/dft.h +4
+
+Sun Jan 21 19:09:33 EST 2007  athena@fftw.org
+  * synchronize with main branch.
+
+
+Sun Jan 21 14:42:00 EST 2007  athena@fftw.org
+  * synchronized with main branch
+
+
+Thu Jan 18 20:29:22 EST 2007  athena@fftw.org
+  * Increased MAX_N to 32K/sizeof(R).
+
+    M ./cell/dft-direct-cell.c -23 +35
+    M ./cell/fftw-cell.h -3 +3
+    M ./cell/plans-double.c -9 +1033
+    M ./cell/plans-single.c -13 +525
+    M ./cell/spu/dft.spuc -2 +8
+
+Thu Jan 18 13:43:51 EST 2007  Matteo Frigo <athena@fftw.org>
+  * Added pointer to solver->destroy which is used in the Cell branch.
+
+    M ./mpi/dft-rank-geq2-transposed.c -1 +1
+    M ./mpi/dft-rank1-bigvec.c -1 +1
+    M ./mpi/dft-rank1.c -1 +1
+    M ./mpi/transpose-recurse.c -1 +1
+
+Thu Jan 18 12:09:26 EST 2007  athena@fftw.org
+  * Updated copyright notices
+
+    R ./mpi/transpose-radix2.c
+    M ./cell/cell.c -2 +1
+    M ./cell/conf.c -2 +1
+    M ./cell/copy.c -2 +1
+    M ./cell/dft-direct-cell.c -2 +1
+    M ./cell/fftw-cell.h +19
+    M ./cell/spu/alloc.spuc +19
+    M ./cell/spu/copy.spuc +19
+    M ./cell/spu/dft.spuc +18
+    M ./cell/spu/dma.spuc +19
+    M ./cell/spu/execute.spuc +19
+    M ./cell/spu/fftw-spu.h +19
+    M ./cell/spu/main.spuc +19
+    M ./cell/spu/planner.spuc +19
+    M ./cell/spu/spu-double.h +19
+    M ./cell/spu/spu-single.h +19
+    M ./cell/spu/transpose.spuc +19
+    M ./cell/spufftw-embed.S +19
+    M ./cell/transpose.c -2 +1
+
+Fri Jan 12 12:54:43 EST 2007  athena@fftw.org
+  * Use mfc_read_tag_status_all() instead of spu_mfcstat(2), since the former seems to be standardized.
+
+    M ./cell/spu/dma.spuc -1 +1
+
+Thu Jan 11 14:55:08 EST 2007  athena@fftw.org
+  * Silence some int/INT warnings.
+
+    M ./cell/dft-direct-cell.c -21 +24
+
+Wed Jan 10 18:19:53 EST 2007  athena@fftw.org
+  * Note incompatibility of --enable-cell with --enable-threads
+
+    M ./configure.ac +3
+
+Wed Jan 10 17:57:10 EST 2007  athena@fftw.org
+  * forgot to add file
+
+    A ./cell/spufftw-embed.S
+
+Wed Jan 10 17:45:16 EST 2007  athena@fftw.org
+  * 64-bit cleanup
+
+    M ./cell/Makefile.am -4 +3
+    M ./cell/cell.c -7 +21
+    M ./cell/copy.c -2 +2
+    M ./cell/dft-direct-cell.c -11 +28
+    M ./cell/fftw-cell.h -8 +10
+    M ./cell/spu/dft.spuc -3 +3
+    M ./cell/spu/dma.spuc -7 +9
+    M ./cell/spu/fftw-spu.h -2 +2
+    M ./cell/transpose.c -1 +1
+    M ./configure.ac +3
+    M ./m4/ax_gcc_archflag.m4 -1 +1
+
+Wed Jan 10 13:47:20 EST 2007  athena@fftw.org
+  * Use -mcpu=cell where appropriate.
+
+    M ./m4/ax_gcc_archflag.m4 +1
+
+Tue Dec 26 21:35:59 EST 2006  athena@fftw.org
+  * synchronized with main
+
+    M ./dft/buffered.c -14 +1
+
+Sun Dec 24 20:58:25 EST 2006  athena@fftw.org
+  * synchronized with main branch
+
+    M ./Makefile.am -12 +14
+
+Sun Dec 24 13:47:37 EST 2006  athena@fftw.org
+  * synchronized with main branch, updated to new sdk.
+
+    M! ./cell/spu/Makefile.am -4 +3
+    M! ./cell/spu/dma.spuc -1
+    M! ./cell/spu/execute.spuc -2 +2
+    M! ./cell/spu/fftw-spu.h -29 +29
+
+Thu Dec 21 17:17:41 EST 2006  athena@fftw.org
+  * removed obsolete file
+
+    R ./rdft/rdft2-radix2.c
+
+Tue Dec 19 15:17:20 EST 2006  athena@fftw.org
+  * synchronized with main branch
+
+    R ./rdft/direct.c
+    M ./rdft/direct-r2c.c -1 +1
+    M ./rdft/direct-r2r.c -1 +1
+
+Tue Dec 19 11:27:38 EST 2006  athena@fftw.org
+  * Synchronized with main branch
+
+    M! ./rdft/ct-hc2c.c -1 +1
+    M! ./rdft/direct2.c -13 +1
+
+Fri Dec 15 16:04:31 EST 2006  athena@fftw.org
+  * resolved conflict with main branch
+
+    M ./tests/fftw-bench.c -3 +2
+
+Fri Dec  8 14:43:50 EST 2006  athena@fftw.org
+  * Fixes for compilation in subdirectories
+
+    M ./cell/spu/Makefile.am -5 +3
+
+Fri Dec  8 12:46:00 EST 2006  athena@fftw.org
+  * Silence warning
+
+    M ./rdft/rank0.c -2
+
+Fri Dec  8 12:24:19 EST 2006  athena@fftw.org
+  * silence warning
+
+    M ./rdft/rank0.c -1 +1
+
+Thu Dec  7 15:18:17 EST 2006  athena@fftw.org
+  * Commented a particularly obscure piece of code.
+
+    M ./cell/spu/dma.spuc -1 +14
+
+Thu Dec  7 11:53:29 EST 2006  athena@fftw.org
+  * Reorganized, clarified conditions for applicability of the DFT solver.
+
+    M ./cell/dft-direct-cell.c -193 +178
+
+Mon Dec  4 21:33:49 EST 2006  athena@fftw.org
+  * Minor changes
+
+    M ./cell/dft-direct-cell.c -1 +6
+    M ./cell/spu/dft.spuc -1 +7
+
+Mon Dec  4 17:43:28 EST 2006  athena@fftw.org
+  * Clarified comment
+
+    M ./cell/dft-direct-cell.c -2 +2
+
+Mon Dec  4 16:49:06 EST 2006  athena@fftw.org
+  * Less incorrect conditions for fitting into local store.
+
+    M ./cell/dft-direct-cell.c -5 +12
+
+Mon Dec  4 16:08:24 EST 2006  athena@fftw.org
+  * Implemented DECDIF+TRANSPOSE on Cell
+
+    R ./dft/ctsq.c
+    M ./cell/dft-direct-cell.c -93 +142
+    M ./cell/fftw-cell.h -8 +3
+    M ./cell/spu/dft.spuc -12 +21
+    M ./dft/indirect-transpose.c -1 +2
+
+Fri Dec  1 17:42:55 EST 2006  athena@fftw.org
+  * relaxed conditions of applicability of SPE
+
+    M ./cell/dft-direct-cell.c -31 +42
+
+Fri Dec  1 16:28:10 EST 2006  athena@fftw.org
+  * tweaks
+
+    M ./cell/dft-direct-cell.c -3 +2
+
+Fri Dec  1 14:35:17 EST 2006  athena@fftw.org
+  * Implemented Cell opcounts
+
+    M ./cell/dft-direct-cell.c -4 +55
+    M ./cell/spu/execute.spuc +1
+
+Fri Dec  1 13:38:44 EST 2006  athena@fftw.org
+  * minor cleanup
+
+    M ./cell/dft-direct-cell.c -9 +4
+
+Fri Dec  1 11:16:52 EST 2006  athena@fftw.org
+  * use [c0 s0 c1 s1] format for Cell twiddle factors, rather than [c0 c1 s0 s1].  This makes life easier and there is no speed penalty on Cell (unlike Altivec).
+
+    M ./cell/copy.c -6 +6
+    M ./cell/dft-direct-cell.c -46 +15
+    M ./cell/spu/planner.spuc -2 +3
+    M ./cell/spu/spu-single.h -4 +4
+
+Wed Nov 29 18:02:54 EST 2006  athena@fftw.org
+  * Implemented SPE-accelerated copies
+
+    A ./cell/copy.c
+    M ./cell/Makefile.am -2 +2
+    M ./cell/copy.c +81
+    M ./cell/fftw-cell.h +14
+    M ./cell/spu/copy.spuc -69 +31
+    M ./cell/spu/dma.spuc -2 +78
+    M ./cell/spu/fftw-spu.h -4 +6
+    M ./cell/spu/main.spuc +4
+    M ./cell/spu/transpose.spuc -14 +6
+    M ./cell/transpose.c -4 +4
+    M ./kernel/ifftw.h -1 +3
+    M ./rdft/rank0.c +52
+
+Wed Nov 29 12:11:08 EST 2006  athena@fftw.org
+  * allow SPEs to compute vrank-0 problems.
+
+    M ./cell/dft-direct-cell.c -10 +8
+    M ./cell/spu/dma.spuc -4 +4
+
+Tue Nov 28 18:03:07 EST 2006  athena@fftw.org
+  * eliminated DMA lists
+
+    M ./cell/spu/dft.spuc -4 +4
+    M ./cell/spu/dma.spuc -119 +62
+    M ./cell/spu/fftw-spu.h -3
+    M ./cell/spu/main.spuc -2 +2
+    M ./cell/spu/transpose.spuc -6 +6
+
+Tue Nov 28 14:22:05 EST 2006  athena@fftw.org
+  * Conservatively force all dimensions to be 0 (mod VL) in cell, since otherwise it is too hard to get all cases right.
+
+    M ./cell/dft-direct-cell.c -11 +10
+    M ./cell/spu/dft.spuc -1 +2
+
+Tue Nov 28 12:39:01 EST 2006  athena@fftw.org
+  * Check alignment of strides when transposing on Cell.
+
+    M ./rdft/rank0.c -1 +10
+
+Tue Nov 28 12:19:09 EST 2006  athena@fftw.org
+  * consistent usage of FFT_SIGN
+
+    M ./cell/dft-direct-cell.c -3 +3
+    M ./cell/spu/dft.spuc -2 +2
+
+Tue Nov 28 11:35:38 EST 2006  athena@fftw.org
+  * clever transposition algorithm without buffering
+
+    M ./cell/spu/copy.spuc -178 +59
+    M ./cell/spu/dft.spuc -52 +8
+    M ./cell/spu/dma.spuc -54 +97
+    M ./cell/spu/fftw-spu.h -13 +3
+    M ./cell/spu/transpose.spuc -13 +11
+
+Mon Nov 27 14:08:28 EST 2006  athena@fftw.org
+  * Fixed tracking of dependencies
+
+    M ./cell/spu/Makefile.am -1 +3
+
+Mon Nov 27 14:03:53 EST 2006  athena@fftw.org
+  * implemented 1D transforms, various tweaks
+
+    M ./cell/cell.c -4 +14
+    M ./cell/conf.c +1
+    M ./cell/dft-direct-cell.c -15 +196
+    M ./cell/fftw-cell.h -2 +13
+    M ./cell/spu/Makefile.am +1
+    M ./cell/spu/alloc.spuc +1
+    M ./cell/spu/copy.spuc -34 +151
+    M ./cell/spu/dft.spuc -2 +37
+    M ./cell/spu/dma.spuc -13 +9
+    M ./cell/spu/fftw-spu.h -5 +9
+    M ./cell/spu/main.spuc -6 +3
+
+Wed Nov 22 15:43:36 EST 2006  athena@fftw.org
+  * no need to poll mailbox on spu side
+
+    M ./cell/spu/main.spuc -1
+
+Wed Nov 22 14:08:24 EST 2006  athena@fftw.org
+  * increased maximum size handled by spe
+
+    M ./cell/fftw-cell.h -2 +2
+    M ./cell/plans-double.c -2 +1027
+    M ./cell/plans-single.c -2 +514
+    M ./cell/spu/alloc.spuc -1 +1
+    M ./cell/spu/planner.spuc -1 +1
+
+Tue Nov 21 16:23:17 EST 2006  athena@fftw.org
+  * allow vrank<=2 problems in SPEs to avoid the vecloop overhead (grrr...)
+
+    M ./cell/dft-direct-cell.c -40 +114
+    M ./cell/fftw-cell.h -1 +15
+    M ./cell/spu/dft.spuc -27 +37
+    M ./dft/rank-geq2.c +4
+
+Mon Nov 20 14:41:45 EST 2006  athena@fftw.org
+  * added emacs mode
+
+    M ./cell/spu/copy.spuc +1
+
+Mon Nov 20 09:34:12 EST 2006  athena@fftw.org
+  * revised transpose, cleanup
+
+    A ./cell/spu/transpose.spuc
+    A ./cell/spu/dft.spuc
+    M ./cell/spu/Makefile.am -1 +1
+    M ./cell/spu/dft.spuc +165
+    M ./cell/spu/fftw-spu.h +5
+    M ./cell/spu/main.spuc -224 +2
+    M ./cell/spu/transpose.spuc +70
+
+Sun Nov 19 20:20:23 EST 2006  athena@fftw.org
+  * added file
+
+    A ./cell/spu/Makefile.am
+
+Sun Nov 19 20:18:35 EST 2006  athena@fftw.org
+  * removed file
+
+    R ./cell/spu/generate.sh
+
+Sun Nov 19 20:15:38 EST 2006  athena@fftw.org
+  * better automake integration
+
+    M! ./cell/Makefile.am -9 +4
+    R! ./cell/spu/alloc.c
+    A! ./cell/spu/alloc.spuc
+    R! ./cell/spu/copy.c
+    A! ./cell/spu/copy.spuc
+    R! ./cell/spu/dma.c
+    A! ./cell/spu/dma.spuc
+    R! ./cell/spu/execute.c
+    A! ./cell/spu/execute.spuc
+    R! ./cell/spu/main.c
+    A! ./cell/spu/main.spuc
+    R! ./cell/spu/planner.c
+    A! ./cell/spu/planner.spuc
+    R! ./cell/spu/spu_n2fv_10.c
+    R! ./cell/spu/spu_n2fv_11.c
+    R! ./cell/spu/spu_n2fv_12.c
+    R! ./cell/spu/spu_n2fv_13.c
+    R! ./cell/spu/spu_n2fv_14.c
+    R! ./cell/spu/spu_n2fv_15.c
+    R! ./cell/spu/spu_n2fv_16.c
+    R! ./cell/spu/spu_n2fv_2.c
+    R! ./cell/spu/spu_n2fv_3.c
+    R! ./cell/spu/spu_n2fv_32.c
+    R! ./cell/spu/spu_n2fv_4.c
+    R! ./cell/spu/spu_n2fv_5.c
+    R! ./cell/spu/spu_n2fv_6.c
+    R! ./cell/spu/spu_n2fv_7.c
+    R! ./cell/spu/spu_n2fv_8.c
+    R! ./cell/spu/spu_n2fv_9.c
+    R! ./cell/spu/spu_t1fv_10.c
+    R! ./cell/spu/spu_t1fv_12.c
+    R! ./cell/spu/spu_t1fv_15.c
+    R! ./cell/spu/spu_t1fv_16.c
+    R! ./cell/spu/spu_t1fv_2.c
+    R! ./cell/spu/spu_t1fv_3.c
+    R! ./cell/spu/spu_t1fv_32.c
+    R! ./cell/spu/spu_t1fv_4.c
+    R! ./cell/spu/spu_t1fv_5.c
+    R! ./cell/spu/spu_t1fv_6.c
+    R! ./cell/spu/spu_t1fv_7.c
+    R! ./cell/spu/spu_t1fv_8.c
+    R! ./cell/spu/spu_t1fv_9.c
+    M! ./dft/indirect-transpose.c -1 +1
+    M! ./rdft/direct2.c -1 +13
+
+Sat Nov 18 20:14:29 EST 2006  athena@fftw.org
+  * changed algorithm for computing chunk size
+
+    M ./cell/spu/main.c -32 +36
+
+Sat Nov 18 19:18:11 EST 2006  athena@fftw.org
+  * implemented transpose, various fixes.
+
+    M! ./cell/Makefile.am -2 +2
+    M! ./cell/cell.c -2 +10
+    M! ./cell/conf.c -1 +1
+    M! ./cell/dft-direct-cell.c -13 +29
+    M! ./cell/fftw-cell.h -3 +13
+    A! ./cell/spu/copy.c
+    A! ./cell/spu/dma.c
+    M! ./cell/spu/fftw-spu.h +27
+    M! ./cell/spu/main.c -169 +83
+    A! ./cell/transpose.c
+    M! ./configure.ac -2 +5
+    M! ./kernel/ifftw.h +5
+    M! ./rdft/rank0.c +24
+    M! ./tests/fftw-bench.c +4
+
+Thu Nov 16 16:33:50 EST 2006  athena@fftw.org
+  * Added explicit destructor to all solvers to help with the cell port.
+
+    M! ./Makefile.am -2 +6
+    M! ./cell/cell.c -4 +22
+    M! ./cell/dft-direct-cell.c -1 +8
+    M! ./cell/fftw-cell.h -1 +3
+    M! ./configure.ac +1
+    M! ./dft/bluestein.c -1 +1
+    M! ./dft/buffered.c -1 +1
+    M! ./dft/ct.c -1 +1
+    A! ./dft/ctsq.c
+    M! ./dft/direct.c -1 +1
+    M! ./dft/generic.c -1 +1
+    M! ./dft/indirect-transpose.c -1 +1
+    M! ./dft/indirect.c -1 +1
+    M! ./dft/nop.c -1 +1
+    M! ./dft/rader.c -1 +1
+    M! ./dft/rank-geq2.c -1 +1
+    M! ./dft/vrank-geq1.c -1 +1
+    M! ./kernel/ifftw.h +1
+    M! ./kernel/solver.c -1 +4
+    M! ./mpi/dft-rank-geq2.c -1 +1
+    M! ./mpi/dft-serial.c -1 +1
+    M! ./mpi/transpose-alltoall.c -1 +1
+    M! ./mpi/transpose-pairwise.c -1 +1
+    A! ./mpi/transpose-radix2.c
+    M! ./rdft/buffered.c -1 +1
+    M! ./rdft/buffered2.c -1 +1
+    M! ./rdft/dft-r2hc.c -1 +1
+    M! ./rdft/dht-r2hc.c -1 +1
+    M! ./rdft/dht-rader.c -1 +1
+    A! ./rdft/direct.c
+    M! ./rdft/direct2.c -1 +1
+    M! ./rdft/generic.c -1 +1
+    M! ./rdft/hc2hc.c -1 +1
+    M! ./rdft/indirect.c -1 +1
+    M! ./rdft/nop.c -1 +1
+    M! ./rdft/nop2.c -1 +1
+    M! ./rdft/rank-geq2-rdft2.c -1 +1
+    M! ./rdft/rank-geq2.c -1 +1
+    M! ./rdft/rank0-rdft2.c -1 +1
+    M! ./rdft/rank0.c -1 +1
+    M! ./rdft/rdft-dht.c -1 +1
+    A! ./rdft/rdft2-radix2.c
+    M! ./rdft/vrank-geq1-rdft2.c -1 +1
+    M! ./rdft/vrank-geq1.c -1 +1
+    M! ./rdft/vrank3-transpose.c -1 +1
+    M! ./reodft/redft00e-r2hc-pad.c -1 +1
+    M! ./reodft/redft00e-r2hc.c -1 +1
+    M! ./reodft/reodft00e-splitradix.c -1 +1
+    M! ./reodft/reodft010e-r2hc.c -1 +1
+    M! ./reodft/reodft11e-r2hc-odd.c -1 +1
+    M! ./reodft/reodft11e-r2hc.c -1 +1
+    M! ./reodft/reodft11e-radix2.c -1 +1
+    M! ./reodft/rodft00e-r2hc-pad.c -1 +1
+    M! ./reodft/rodft00e-r2hc.c -1 +1
+    M! ./tests/fftw-bench.c -1
+    M! ./threads/ct.c -1 +1
+    M! ./threads/dft-vrank-geq1.c -1 +1
+    M! ./threads/hc2hc.c -1 +1
+    M! ./threads/rdft-vrank-geq1.c -1 +1
+    M! ./threads/vrank-geq1-rdft2.c -1 +1
+
+Thu Nov 16 15:22:15 EST 2006  athena@fftw.org
+  * consistent use of #if vs. #ifdef
+
+    M ./api/configure.c +2
+    M ./cell/cell.c -1 +1
+    M ./cell/conf.c -1 +1
+    M ./cell/dft-direct-cell.c -1 +1
+    M ./cell/plans-double.c -1 +1
+    M ./cell/plans-single.c -1 +1
+
+Thu Nov 16 15:15:34 EST 2006  athena@fftw.org
+  * Additional Cell double codelets, better automake integration
+
+    A! ./cell/spu/spu_n2fv_11.c
+    A! ./cell/spu/spu_n2fv_13.c
+    A! ./cell/spu/spu_n2fv_15.c
+    A! ./cell/spu/spu_n2fv_3.c
+    A! ./cell/spu/spu_n2fv_5.c
+    A! ./cell/spu/spu_n2fv_7.c
+    A! ./cell/spu/spu_n2fv_9.c
+    M! ./Makefile.am -7 +5
+    M! ./cell/Makefile.am -2 +7
+    M! ./cell/cell.c -11 +7
+    M! ./cell/conf.c +26
+    M! ./cell/dft-direct-cell.c -9 +14
+    M! ./cell/fftw-cell.h -2
+    M! ./cell/plans-double.c -196 +201
+    M! ./cell/plans-single.c -3 +9
+    M! ./cell/spu/execute.c +10
+    M! ./cell/spu/fftw-spu.h -8 +22
+    M! ./cell/spu/generate.sh -1 +6
+    M! ./cell/spu/main.c -2 +1
+    M! ./cell/spu/planner.c +3
+    M! ./cell/spu/spu_n2fv_11.c +118
+    M! ./cell/spu/spu_n2fv_13.c +155
+    M! ./cell/spu/spu_n2fv_15.c +132
+    M! ./cell/spu/spu_n2fv_3.c +37
+    M! ./cell/spu/spu_n2fv_5.c +52
+    M! ./cell/spu/spu_n2fv_7.c +68
+    M! ./cell/spu/spu_n2fv_9.c +103
+    M! ./configure.ac +6
+    M! ./dft/buffered.c -1 +14
+    M! ./kernel/align.c -1 +1
+    M! ./kernel/ifftw.h -2 +2
+    M! ./simd/simd.h -4
+    M! ./simd/taint.c -1 +1
+
+Thu Nov 16 12:43:34 EST 2006  athena@fftw.org
+  * Use dma lists.
+
+    M ./cell/spu/main.c -21 +86
+
+Thu Nov 16 11:03:46 EST 2006  athena@fftw.org
+  * converted to automake
+
+    R ./cell/spu/Makefile.in
+    M ./cell/Makefile.am +1
+
+Wed Nov 15 18:00:12 EST 2006  athena@fftw.org
+  * Initial port to Cell Broadband Engine.
+
+    A! ./cell/
+    A! ./cell/spu/
+    A! ./cell/Makefile.am
+    A! ./cell/fftw-cell.h
+    A! ./cell/plans-single.c
+    A! ./cell/spu/Makefile.in
+    A! ./cell/spu/execute.c
+    A! ./cell/spu/fftw-spu.h
+    A! ./cell/spu/generate.sh
+    A! ./cell/spu/main.c
+    A! ./cell/spu/planner.c
+    A! ./cell/spu/spu-single.h
+    A! ./cell/spu/spu_n2fv_10.c
+    A! ./cell/spu/spu_n2fv_12.c
+    A! ./cell/spu/spu_n2fv_14.c
+    A! ./cell/spu/spu_n2fv_16.c
+    A! ./cell/spu/spu_n2fv_2.c
+    A! ./cell/spu/spu_n2fv_32.c
+    A! ./cell/spu/spu_n2fv_4.c
+    A! ./cell/spu/spu_n2fv_6.c
+    A! ./cell/spu/spu_n2fv_8.c
+    A! ./cell/spu/spu_t1fv_10.c
+    A! ./cell/spu/spu_t1fv_12.c
+    A! ./cell/spu/spu_t1fv_15.c
+    A! ./cell/spu/spu_t1fv_16.c
+    A! ./cell/spu/spu_t1fv_2.c
+    A! ./cell/spu/spu_t1fv_3.c
+    A! ./cell/spu/spu_t1fv_32.c
+    A! ./cell/spu/spu_t1fv_4.c
+    A! ./cell/spu/spu_t1fv_5.c
+    A! ./cell/spu/spu_t1fv_6.c
+    A! ./cell/spu/spu_t1fv_7.c
+    A! ./cell/spu/spu_t1fv_8.c
+    A! ./cell/spu/spu_t1fv_9.c
+    A! ./cell/cell.c
+    A! ./cell/conf.c
+    A! ./cell/dft-direct-cell.c
+    A! ./cell/spu/spu-double.h
+    A! ./cell/plans-double.c
+    A! ./cell/spu/alloc.c
+    M! ./Makefile.am -14 +14
+    M! ./api/configure.c +1
+    M! ./cell/Makefile.am +22
+    M! ./cell/cell.c +90
+    M! ./cell/conf.c +12
+    M! ./cell/dft-direct-cell.c +302
+    M! ./cell/fftw-cell.h +64
+    M! ./cell/plans-double.c +2056
+    M! ./cell/plans-single.c +1032
+    M! ./cell/spu/Makefile.in +35
+    M! ./cell/spu/alloc.c +24
+    M! ./cell/spu/execute.c +68
+    M! ./cell/spu/fftw-spu.h +86
+    M! ./cell/spu/generate.sh +16
+    M! ./cell/spu/main.c +281
+    M! ./cell/spu/planner.c +187
+    M! ./cell/spu/spu-double.h +105
+    M! ./cell/spu/spu-single.h +120
+    M! ./cell/spu/spu_n2fv_10.c +100
+    M! ./cell/spu/spu_n2fv_12.c +109
+    M! ./cell/spu/spu_n2fv_14.c +142
+    M! ./cell/spu/spu_n2fv_16.c +146
+    M! ./cell/spu/spu_n2fv_2.c +33
+    M! ./cell/spu/spu_n2fv_32.c +313
+    M! ./cell/spu/spu_n2fv_4.c +44
+    M! ./cell/spu/spu_n2fv_6.c +62
+    M! ./cell/spu/spu_n2fv_8.c +75
+    M! ./cell/spu/spu_t1fv_10.c +104
+    M! ./cell/spu/spu_t1fv_12.c +114
+    M! ./cell/spu/spu_t1fv_15.c +162
+    M! ./cell/spu/spu_t1fv_16.c +153
+    M! ./cell/spu/spu_t1fv_2.c +32
+    M! ./cell/spu/spu_t1fv_3.c +41
+    M! ./cell/spu/spu_t1fv_32.c +329
+    M! ./cell/spu/spu_t1fv_4.c +44
+    M! ./cell/spu/spu_t1fv_5.c +60
+    M! ./cell/spu/spu_t1fv_6.c +63
+    M! ./cell/spu/spu_t1fv_7.c +81
+    M! ./cell/spu/spu_t1fv_8.c +77
+    M! ./cell/spu/spu_t1fv_9.c +120
+    M! ./configure.ac +2
+    M! ./kernel/ifftw.h -17 +21
+    M! ./libbench2/util.c -1 +1
+    M! ./tests/fftw-bench.c +1
+
+Wed Mar 14 10:19:53 EDT 2007  athena@fftw.org
+  * Remove Codesourcery contributions from commercial tarball.
+
+    M ./commercialize.sh +7
+
+Wed Mar 14 08:59:18 EDT 2007  athena@fftw.org
+  * Added FFTW_WISDOM_ONLY, at the request of Phil Dumont.
+
+    M ./api/apiplan.c -19 +29
+    M ./api/fftw3.h +1
+    M ./tests/fftw-bench.c +1
+
+Tue Mar 13 00:32:05 EDT 2007  stevenj@fftw.org
+  * fixed potential MPI deadlock if timer misbehaves
+
+    M ./kernel/timer.c -3 +3
+
+Mon Mar 12 23:31:52 EDT 2007  stevenj@fftw.org
+  * more work on MPI documentation
+
+    M ./doc/fftw3.texi -31 +309
+    M ./mpi/api.c -3 +4
+    M ./mpi/fftw3-mpi.h -19 +19
+
+Tue Feb 27 13:48:43 EST 2007  stevenj@fftw.org
+  * index
+
+    M ./doc/fftw3.texi +1
+
+Tue Feb 27 13:46:45 EST 2007  stevenj@fftw.org
+  * rename "new-data execute" to "new-array execute", since of course you do not need a new array to have new data
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +2
+    M ./doc/fftw3.texi -12 +12
+
+Tue Feb 27 13:43:55 EST 2007  stevenj@fftw.org
+  * consistency with manual (guru execute -> new-data execute)
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Tue Feb 27 13:42:24 EST 2007  stevenj@fftw.org
+  * texinfo fixes; renamed "guru execute" section to "new-data execute", since previously it seemed to lead to endless confusion with the guru planner API
+
+    M ./doc/fftw3.texi -60 +83
+
+Mon Feb 26 18:57:11 EST 2007  stevenj@fftw.org
+  * consistently use n0/n1/.. everywhere instead of nx/ny/... (for consistency with d-dimensional case n[0], n[1], ...) ... first start at MPI documentation
+
+    M ./api/fftw3.h -10 +10
+    M ./doc/fftw3.texi -138 +374
+    M ./mpi/fftw3-mpi.h -27 +27
+
+Sat Mar 10 18:48:05 EST 2007  athena@fftw.org
+  * Changed C++-style comment into K&R
+
+    M ./kernel/cycle.h -1 +1
+
+Sat Mar 10 18:47:12 EST 2007  athena@fftw.org
+  * Forgot to add file
+
+    A ./simd/simd-mips_ps.h
+
+Sat Mar 10 18:44:39 EST 2007  athena@fftw.org
+  * Note removal of K7 support.
+
+    M ./AUTHORS -1 +4
+
+Sat Mar 10 18:41:52 EST 2007  athena@fftw.org
+  * Updated manual for MIPS PS
+
+    M ./doc/fftw3.texi -9 +12
+
+Sat Mar 10 18:37:07 EST 2007  athena@fftw.org
+  * Adopted MIPS_PS patches from Codesourcery.
+
+    A ./simd/mips_ps.c
+    A ./simd/mips_ps.h
+    M ./configure.ac +23
+    M ./kernel/cycle.h +43
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/timer.c -1 +3
+    M ./simd/Makefile.am -1 +1
+    M ./simd/mips_ps.c +45
+    M ./simd/mips_ps.h +273
+    M ./simd/simd.h +4
+
+Sun Feb 25 11:34:51 EST 2007  athena@fftw.org
+  * Incorrect initialization of win32 semaphores
+
+    M ./threads/threads.c -1 +1
+
+Tue Jan 30 11:43:09 EST 2007  stevenj@fftw.org
+  * win32 fixes (I think, still untested)
+
+    M ./threads/threads.c -9 +10
+
+Fri Jan 19 17:31:47 EST 2007  stevenj@fftw.org
+  * message-size heuristic in tranpose-recurse
+
+    M ./mpi/transpose-recurse.c -3 +15
+
+Tue Jan 30 08:53:55 EST 2007  athena@fftw.org
+  * Threading layer for Win32, completely untested.
+
+    M ./threads/threads.c +69
+
+Mon Jan 29 14:26:30 EST 2007  athena@fftw.org
+  * Check for EINTR after sem_wait(), as suggested by Chip Salzenberg.
+
+    M ./threads/threads.c -1 +11
+
+Mon Jan 22 13:58:23 EST 2007  athena@fftw.org
+  * Force vector recursion by means of a separate function pointer.  I need this for Cell.
+
+    M ./dft/ct.c -3 +7
+    M ./dft/ct.h -2 +8
+    M ./dft/dft.h -15
+    M ./dft/dftw-direct.c -2 +2
+    M ./dft/dftw-directsq.c -2 +3
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/vrank-geq1.c -4
+    M ./threads/ct.c -1 +3
+    M ./threads/threads.h -1 +3
+
+Mon Jan 22 09:28:35 EST 2007  athena@fftw.org
+  * Merge multiplications by twiddle with multiplications by i for faster r2c transforms.
+
+    M ./genfft/algsimp.ml +4
+    M ./genfft/simd.ml +4
+    M ./simd/simd-altivec.h +24
+    M ./simd/simd-sse.h +18
+    M ./simd/simd-sse2.h +18
+
+Sun Jan 21 19:02:44 EST 2007  athena@fftw.org
+  * Disabled vector recursion, too messy.
+
+    M ./dft/dft.h -2 +13
+
+Sun Jan 21 14:23:35 EST 2007  athena@fftw.org
+  * Changed heuristics for vector recursion.
+  
+  As in fftw-3.1, NO_VRECURSE disables vector recursion.  As an
+  exception, however, vector recursion is allowed when the predicate
+  VRECURSE_ANYWAYP is true.  We need some form of vector recursion to
+  obtain decent plans on Cell, and this solution captures the common
+  cases without increasing planning time too much.
+  
+
+    M ./api/fftw3.h -1 +1
+    M ./api/mapflags.c -2 +2
+    M ./dft/ct.c -10 +6
+    M ./dft/dft.h +4
+    M ./dft/vrank-geq1.c -6 +3
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -1 +1
+    M ./rdft/ct-hc2c.c -9 +4
+    M ./rdft/hc2hc.c -9 +4
+    M ./rdft/vrank-geq1.c -7
+
+Sun Dec 17 22:31:17 EST 2006  athena@fftw.org
+  * fixed hc2c for vector-recursion branch
+
+    M ./rdft/ct-hc2c.c -2 +7
+
+Tue Dec  5 12:52:36 EST 2006  athena@fftw.org
+  * switch to default vector recursion
+
+    M ./api/fftw3.h -1 +1
+    M ./api/mapflags.c -2 +2
+    M ./dft/ct.c -4 +6
+    M ./dft/vrank-geq1.c +7
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -1 +1
+    M ./rdft/hc2hc.c -2 +7
+    M ./rdft/vrank-geq1.c -2 +9
+
+Sat Jan 20 23:37:33 EST 2007  athena@fftw.org
+  * Smarter algorithm for selection of nbuf.
+
+    M ./kernel/buffered.c -8 +6
+
+Sat Jan 20 22:15:33 EST 2007  athena@fftw.org
+  * Increased buffer sizes according to Moore's law.
+
+    M ./kernel/buffered.c -3 +5
+
+Fri Jan 19 16:02:00 EST 2007  stevenj@fftw.org
+  * fix another MPI synchronization bug -- several more places where cost_hook must be called to synchronized process timings (sigh)
+
+    M ./api/flops.c -3 +3
+    M ./kernel/ifftw.h -4 +5
+    M ./kernel/planner.c -10 +6
+    M ./kernel/timer.c -4 +15
+    M ./mpi/api.c -3 +1
+
+Fri Jan 19 12:08:07 EST 2007  athena@fftw.org
+  * Set havewisdom=0 when calling forget_wisdom() in the test program.
+
+    M ./tests/fftw-bench.c -1 +3
+
+Fri Jan 19 10:29:56 EST 2007  stevenj@fftw.org
+  * remove redundant check
+
+    M ./kernel/planner.c -4 +1
+
+Thu Jan 18 22:37:59 EST 2007  stevenj@fftw.org
+  * fixed potential (unlikely) bug in wisdom import (triggered when importing impatient wisdom after creating more patient plans, but apparently only for nonstandard configure.c configurations)
+
+    M ./kernel/planner.c -1 +5
+
+Thu Jan 18 21:50:14 EST 2007  stevenj@fftw.org
+  * added functions to gather/broadcast wisdom for MPI
+
+    M ./configure.ac -3 +8
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/api.c -3 +3
+    M ./mpi/fftw3-mpi.h -1 +5
+    M ./mpi/mpi-bench.c -19 +40
+    A ./mpi/wisdom-api.c
+    M ./tests/bench.c +10
+    M ./tests/fftw-bench.c -3 +3
+    M ./tests/fftw-bench.h +2
+
+Thu Jan 11 18:33:17 EST 2007  stevenj@fftw.org
+  * whoops, another int/INT bug
+
+    M ./mpi/transpose-pairwise.c -1 +1
+
+Thu Jan 11 17:42:24 EST 2007  stevenj@fftw.org
+  * whoops, fixed bug in transpose-recurse for r != m
+
+    M ./mpi/transpose-recurse.c -10 +10
+
+Thu Jan 11 17:25:36 EST 2007  stevenj@fftw.org
+  * canonicalize mpi-transposed flags by setting TRANSPOSED_IN/OUT where possible
+
+    M ./mpi/transpose-problem.c -1 +8
+
+Thu Jan 11 17:16:24 EST 2007  stevenj@fftw.org
+  * replace transpose-radix2 with much more general transpose-recurse solver
+
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h +5
+    M ./kernel/tensor2.c -21
+    A ./kernel/tensor3.c
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/conf.c -1 +1
+    M ./mpi/dft-rank-geq2-transposed.c -1
+    M ./mpi/mpi-bench.c +1
+    M ./mpi/mpi-transpose.h -1 +1
+    R ./mpi/transpose-radix2.c
+    A ./mpi/transpose-recurse.c
+
+Wed Jan 10 20:23:48 EST 2007  stevenj@fftw.org
+  * rename transpose-inplace to transpose-pairwise, as the algorithm is not restricted to inplace operation
+
+     ./mpi/transpose-inplace.c -> ./mpi/transpose-pairwise.c
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/conf.c -1 +1
+    M ./mpi/mpi-transpose.h -1 +1
+    M ./mpi/testsched.c -1 +1
+    M ./mpi/transpose-pairwise.c -9 +9
+
+Wed Jan 10 14:39:08 EST 2007  stevenj@fftw.org
+  * whoops, some int/INT bugs
+
+    M ./mpi/api.c -2 +2
+    M ./mpi/block.c -3 +3
+    M ./mpi/ifftw-mpi.h -5 +5
+
+Tue Jan  9 18:50:07 EST 2007  stevenj@fftw.org
+  * fix FAQ Makefile for vpath builds
+
+    M ./doc/FAQ/Makefile.am -4 +7
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Tue Jan  9 20:22:11 EST 2007  athena@fftw.org
+  * Missing ``static'' keyword.
+
+    M ./rdft/direct-r2c.c -2 +2
+
+Tue Jan  9 20:13:18 EST 2007  athena@fftw.org
+  * Minor cleanup.
+
+    M ./dft/direct.c -27 +16
+
+Tue Jan  9 00:04:03 EST 2007  stevenj@fftw.org
+  * interleave twiddle mults with DFTs (should we use dftw?)
+
+    M ./mpi/dft-rank1.c -41 +44
+
+Mon Jan  8 18:35:41 EST 2007  stevenj@fftw.org
+  * simplified (and somewhat sped up) dft-rank1 by exploiting dft-rank1-bigvec
+
+    M ./mpi/dft-rank-geq2-transposed.c -1 +1
+    M ./mpi/dft-rank-geq2.c -1 +1
+    M ./mpi/dft-rank1-bigvec.c -2 +4
+    M ./mpi/dft-rank1.c -175 +181
+    M ./mpi/transpose-alltoall.c -6 +3
+    M ./mpi/transpose-inplace.c -6 +3
+    M ./mpi/transpose-radix2.c -7 +2
+
+Sun Jan  7 00:31:31 EST 2007  stevenj@fftw.org
+  * rearranged TRANSPOSED format, numerous speedups
+  
+  Split the TRANSPOSED and non-TRANSPOSED rank-geq2 solvers, and changed
+  the DFT TRANSPOSED format to be more like fftw2 (both globally and
+  locally transposed).  In general, more emphasis on arranging the data
+  contiguously for the DFTs, and more flexibility in intermediate
+  transposed formats.  Also disable NO_SLOW when planning transposes,
+  since otherwise non-square in-place transposes gratuitously put the
+  planner in SLOW mode.
+  
+  Currently, dft-rank1-bigvec has 5 variants (or 10, if DESTROY_INPUT).
+  It looks like only 2 of these are commonly used, so I should probably
+  add some UGLY tags once I do more benchmarking.
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/api.c -14 +34
+    M ./mpi/conf.c +1
+    M ./mpi/dft-problem.c -2 +12
+    A ./mpi/dft-rank-geq2-transposed.c
+    M ./mpi/dft-rank-geq2.c -131 +60
+    M ./mpi/dft-rank1-bigvec.c -77 +123
+    M ./mpi/dft-rank1.c -2 +2
+    M ./mpi/ifftw-mpi.h -2 +3
+    M ./mpi/mpi-bench.c +13
+    M ./mpi/mpi-dft.h -1 +2
+    M ./mpi/transpose-alltoall.c -32 +39
+    M ./mpi/transpose-inplace.c -53 +69
+    M ./mpi/transpose-radix2.c -37 +43
+
+Thu Jan  4 19:13:17 EST 2007  stevenj@fftw.org
+  * add bench_cost_postprocess to prevent deadlocks in mpi-bench
+
+    M ./libbench2/Makefile.am -5 +6
+    A ./libbench2/bench-cost-postprocess.c
+    M ./libbench2/bench-user.h +1
+    M ./libbench2/speed.c -2 +2
+    M ./mpi/mpi-bench.c +7
+
+Thu Jan  4 16:46:29 EST 2007  stevenj@fftw.org
+  * whoops
+
+    M ./m4/acx_pthread.m4 -1 +1
+
+Wed Jan  3 14:23:42 EST 2007  stevenj@fftw.org
+  * pass proper pointer types as arguments, so that ACX_PTHREAD still works with C++ and -Werror (thanks to Ewald Arnold for the suggestion)
+
+    M ./m4/acx_pthread.m4 -3 +6
+
+Mon Jan  1 19:30:43 EST 2007  athena@fftw.org
+  * Renamed [io]vs => [io]vs_by_nbuf, which is more appropriate and would have saved me 30mins debugging.
+
+    M ./dft/buffered.c -6 +6
+
+Mon Jan  1 18:52:38 EST 2007  stevenj@fftw.org
+  * add --with-g77-wrappers option & always include g77 wrappers on GNU systems and/or with gfortran
+  
+  Upcoming GNU/Linux distros will most likely switch to configuring FFTW 
+  with gfortran by default, since g77 isn't even included with recent gcc 
+  versions.  However, we still want to include g77-compatible wrappers in
+  this case (two underscores) in addition to gfortran wrappers (one
+  underscore) lest we silently break binary compatibility and provoke
+  lots of annoying emails.
+
+    M ./api/x77.h +6
+    M ./configure.ac +11
+
+Mon Jan  1 16:48:36 EST 2007  stevenj@fftw.org
+  * use AC_HELP_STRING for --disable-fortran
+
+    M ./configure.ac -1 +1
+
+Mon Jan  1 15:56:12 EST 2007  stevenj@fftw.org
+  * terminology
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Sat Dec 30 16:18:35 EST 2006  athena@fftw.org
+  * Free buffers before calling cldrest.
+
+    M ./dft/buffered.c -2 +2
+    M ./rdft/buffered.c -4 +4
+
+Fri Dec 29 10:52:15 EST 2006  athena@fftw.org
+  * Removed obsolete code.
+
+    M ./simd/simd-sse.h -22 +9
+
+Thu Dec 28 21:37:48 EST 2006  athena@fftw.org
+  * Attempt to work-around old gcc bugs in a more efficient fashion that does not lose performance on newer gcc's.
+
+    M ./simd/simd-sse.h -13 +24
+
+Thu Dec 28 16:10:33 EST 2006  athena@fftw.org
+  * Make sure that the speed() input is zero even in paranoid mode.
+
+    M ./libbench2/speed.c +6
+
+Thu Dec 28 11:41:46 EST 2006  athena@fftw.org
+  * cld0 and cldm problems must be tainted because they are used in a v-loop.
+
+    M ./rdft/ct-hc2c-direct.c -2 +5
+    M ./rdft/hc2hc-direct.c -2 +3
+
+Wed Dec 27 17:17:45 EST 2006  athena@fftw.org
+  * Run paranoid-check in patient mode.
+
+    M ./tests/Makefile.am -5 +5
+
+Wed Dec 27 10:51:42 EST 2006  athena@fftw.org
+  * Fixed incorrect initialization to zero.
+
+    M ./rdft/problem2.c -1 +1
+
+Wed Dec 27 09:33:02 EST 2006  athena@fftw.org
+  * Fixed wrong TAINT()
+
+    M ./rdft/buffered2.c -1 +1
+
+Tue Dec 26 22:50:28 EST 2006  athena@fftw.org
+  * Grrrrr...
+
+    M ./libbench2/verify-rdft2.c -1 +1
+
+Tue Dec 26 22:48:44 EST 2006  athena@fftw.org
+  * Give up trying to verify rdft2 when vrank=-infinity.
+
+    M ./libbench2/verify-rdft2.c +3
+
+Tue Dec 26 22:31:38 EST 2006  athena@fftw.org
+  * typo
+
+    M ./dft/direct.c -1 +2
+
+Tue Dec 26 21:54:53 EST 2006  athena@fftw.org
+  * Correctly verify rdft2 when vrank = -infinity.
+
+    M ./libbench2/verify-rdft2.c -1 +1
+
+Tue Dec 26 21:25:02 EST 2006  athena@fftw.org
+  * rdft/buffered2.c now generates rdft2 subproblems, not rdft.
+  
+  The old rdft2->rdft reduction is now in rdft/rdft2-rdft.c
+  and still does way too much.
+
+    M ./TODO -3
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/buffered.c -1 +1
+    M ./rdft/buffered2.c -190 +200
+    M ./rdft/conf.c +1
+    M ./rdft/problem2.c -5 +6
+    M ./rdft/rdft.h -1 +2
+    M ./rdft/rdft2-inplace-strides.c -5 +5
+    A ./rdft/rdft2-rdft.c
+    M ./rdft/rdft2-strides.c -5 +5
+    M ./rdft/vrank-geq1-rdft2.c -11 +10
+
+Tue Dec 26 14:03:27 EST 2006  athena@fftw.org
+  * Buffer the input in hc2r problems, as opposed to the output.
+
+    M ./rdft/buffered.c -34 +104
+    M ./rdft/buffered2.c -1
+
+Tue Dec 26 10:02:59 EST 2006  athena@fftw.org
+  * streamlined buffered solvers
+
+    M ./dft/buffered.c -66 +14
+    M ./kernel/buffered.c -4 +25
+    M ./kernel/ifftw.h -1 +4
+    M ./kernel/primes.c +11
+    M ./rdft/buffered.c -63 +13
+    M ./rdft/buffered2.c -54 +7
+
+Mon Dec 25 16:08:22 EST 2006  athena@fftw.org
+  * c++ compatibility
+
+    M ./libbench2/verify-r2r.c -2 +2
+
+Sun Dec 24 20:27:23 EST 2006  athena@fftw.org
+  * Gratuitous renaming of directories and files since the old naming was becoming too inconsistent for my taste.
+
+     ./dft/codelets -> ./dft/scalar
+     ./dft/scalar/standard -> ./dft/scalar/codelets
+     ./rdft/codelets -> ./rdft/scalar
+    M ./Makefile.am -14 +15
+    M ./configure.ac -6 +6
+    M ./dft/Makefile.am -1 +1
+    M ./dft/scalar/Makefile.am -3 +3
+    M ./dft/scalar/codelets/Makefile.am -3 +3
+    M ./dft/simd/codelets/Makefile.am -2 +2
+    M ./doc/fftw3.texi -4 +4
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/scalar/Makefile.am -2 +2
+    M ./rdft/scalar/r2cb/Makefile.am -3 +3
+    M ./rdft/scalar/r2cf/Makefile.am -3 +3
+    M ./rdft/scalar/r2r/Makefile.am -3 +3
+
+Sun Dec 24 20:11:50 EST 2006  athena@fftw.org
+  * Fixed another dftw bug (sigh)
+
+    M ./threads/ct.c -7 +3
+
+Sun Dec 24 11:48:01 EST 2006  athena@fftw.org
+  * Removed debugging leftovers.
+
+    M ./dft/dftw-genericbuf.c -2 +2
+
+Sun Dec 24 11:34:17 EST 2006  athena@fftw.org
+  * Moved dftw-generic* to new dftw protocol.
+
+    M ./dft/dftw-generic.c -30 +24
+    M ./dft/dftw-genericbuf.c -13 +13
+
+Sun Dec 24 09:37:19 EST 2006  athena@fftw.org
+  * Oops
+
+    M ./tests/hook.c -1 +1
+
+Sun Dec 24 09:31:46 EST 2006  athena@fftw.org
+  * Fixed wrong verification of rank-1 rdft2
+
+    M ./tests/hook.c -4 +6
+
+Sat Dec 23 20:11:29 EST 2006  athena@fftw.org
+  * minor tweaks
+
+    M ./dft/dftw-direct.c -2 +4
+
+Sat Dec 23 19:50:03 EST 2006  athena@fftw.org
+  * Removed obsolete items.
+
+    M ./TODO -13
+
+Sat Dec 23 17:56:37 EST 2006  athena@fftw.org
+  * Modified the problem_dftw invocation protocol.
+  
+  apply() now requires pointers to the beginning of the full array.
+  Each thread processes a slice mb <= m < me.  This protocol is
+  consistent with the one used in hc2hc, where there is no other choice.
+  
+  
+
+    M ./dft/dftw-direct.c -27 +31
+    M ./dft/dftw-directsq.c -1 +3
+    M ./threads/ct.c -7 +2
+
+Sat Dec 23 16:18:25 EST 2006  athena@fftw.org
+  * typo
+
+    M ./simd/simd-altivec.h -1 +1
+
+Sat Dec 23 16:06:56 EST 2006  athena@fftw.org
+  * changed hc2hc twiddle storage to be the same as hc2c
+
+    M ./genfft/gen_hc2hc.ml -2 +3
+    M ./rdft/hc2hc-direct.c -14 +11
+
+Sat Dec 23 15:16:36 EST 2006  athena@fftw.org
+  * Allowed extra_iter in dftw-direct.  Rationalized twiddle factors in hc2c.
+
+    M ./dft/dftw-direct.c -10 +38
+    M ./dft/simd/Makefile.am -1 +1
+    M ./dft/simd/codelets/Makefile.am -1 +12
+    M ./dft/simd/t.c -2 +45
+    A ./dft/simd/t1bu.h
+    A ./dft/simd/t1fu.h
+    M ./genfft/gen_hc2c.ml -2 +3
+    M ./genfft/gen_hc2cdft.ml -2 +3
+    M ./genfft/gen_hc2cdft_c.ml -1 +1
+    M ./genfft/gen_hc2hc.ml -1 +1
+    M ./genfft/gen_twiddle.ml -1 +1
+    M ./genfft/gen_twiddle_c.ml -1 +1
+    M ./genfft/gen_twidsq.ml -1 +1
+    M ./genfft/gen_twidsq_c.ml -1 +1
+    M ./genfft/twiddle.ml -12 +10
+    M ./genfft/twiddle.mli -1 +1
+    M ./kernel/twiddle.c -4 +8
+    M ./rdft/ct-hc2c-direct.c -4 +7
+    M ./rdft/simd/hc2cbv.h -2 +2
+    M ./rdft/simd/hc2cfv.h -2 +2
+    M ./simd/simd-altivec.h -12 +9
+    M ./simd/simd-sse.h -12 +9
+    M ./simd/simd-sse2.h -11 +6
+
+Sat Dec 23 10:37:11 EST 2006  athena@fftw.org
+  * Implemented unmentionable hack to use 4-way SIMD with an odd number of
+  iterations.
+
+    M ./dft/direct.c -9 +34
+    M ./rdft/ct-hc2c-direct.c -43 +88
+    M ./simd/simd-altivec.h -1 +3
+    M ./simd/simd-sse.h -1 +3
+
+Fri Dec 22 22:13:30 EST 2006  athena@fftw.org
+  * altivec support for new codelets
+
+    M ./simd/simd-altivec.h -7 +28
+
+Fri Dec 22 19:09:15 EST 2006  athena@fftw.org
+  * fixed incorrect computation of W
+
+    M ./genfft/gen_hc2cdft_c.ml -1 +1
+
+Fri Dec 22 18:51:22 EST 2006  athena@fftw.org
+  * Implemented 4-way simd hc2cdftv
+  
+  Also eliminated the twiddle_shift hack.  A zillion changes dictated 
+  by this choice, which was in turn necessary for the hc2cdftv thing
+  to work.
+  
+
+    M ./Makefile.am -13 +8
+    M ./configure.ac -3
+    M ./dft/codelet-dft.h -9 +9
+    M ./dft/codelets/t.c -4 +4
+    M ./dft/dftw-direct.c -35 +30
+    M ./dft/dftw-directsq.c -7 +4
+    M ./dft/simd/q1b.c -6 +6
+    M ./dft/simd/q1f.c -6 +6
+    M ./dft/simd/t.c -14 +14
+    M ./dft/simd/t3b.h +1
+    M ./dft/simd/t3f.h +1
+    M ./dft/simd/ts.c -6 +6
+    M ./dft/simd/ts.h +1
+    M ./genfft/Makefile.am -17 +10
+    R ./genfft/gen_conv.ml
+    M ./genfft/gen_hc2c.ml -25 +19
+    M ./genfft/gen_hc2cdft.ml -19 +19
+    M ./genfft/gen_hc2cdft_c.ml -20 +24
+    M ./genfft/gen_hc2hc.ml -23 +17
+    M ./genfft/gen_mdct.ml -9 +9
+    M ./genfft/gen_notw.ml -8 +8
+    M ./genfft/gen_notw_c.ml -6 +6
+    M ./genfft/gen_r2cb.ml -9 +9
+    M ./genfft/gen_r2cf.ml -9 +9
+    M ./genfft/gen_r2r.ml -6 +6
+    M ./genfft/gen_twiddle.ml -37 +39
+    M ./genfft/gen_twiddle_c.ml -38 +42
+    M ./genfft/gen_twidsq.ml -42 +44
+    M ./genfft/gen_twidsq_c.ml -44 +47
+    M ./genfft/genutil.ml -6 +6
+    M ./genfft/simd.ml -6 +7
+    M ./genfft/simd.mli -2 +1
+    M ./genfft/variable.ml -7 +12
+    M ./genfft/variable.mli -1 +3
+    M ./kernel/ifftw.h -7
+    M ./kernel/twiddle.c -14
+    M ./rdft/codelet-rdft.h -5 +5
+    M ./rdft/codelets/hc2c.c -2 +2
+    M ./rdft/conf.c -1 +1
+    M ./rdft/ct-hc2c-direct.c -47 +36
+    M ./rdft/ct-hc2c.c -1 +3
+    M ./rdft/hc2hc-direct.c -39 +31
+    M ./rdft/simd/codelets/Makefile.am -2 +2
+    M ./rdft/simd/hc2cbv.c -7 +8
+    M ./rdft/simd/hc2cbv.h -4 +3
+    M ./rdft/simd/hc2cfv.c -7 +8
+    M ./rdft/simd/hc2cfv.h -4 +3
+    M ./simd/simd-sse.h +4
+    M ./simd/simd-sse2.h -3 +5
+    M ./simd/simd.h -2
+    M ./tests/Makefile.am +9
+
+Fri Dec 22 08:45:46 EST 2006  athena@fftw.org
+  * Fixed verification of rdft2 problems with new format.
+
+    M ./tests/hook.c -1 +5
+
+Fri Dec 22 00:05:59 EST 2006  athena@fftw.org
+  * Added file
+
+    A ./rdft/simd/Makefile.am
+
+Fri Dec 22 00:02:50 EST 2006  athena@fftw.org
+  * Hmm, previous commit did not work
+
+    A ./rdft/simd/
+    A ./rdft/simd/codelets/
+    A ./rdft/simd/codelets/Makefile.am
+    A ./rdft/simd/hc2hcv.h
+     ./rdft/simd/hc2hcv.h -> ./rdft/simd/hc2cfv.h
+    A ./rdft/simd/hc2cbv.h
+    A ./rdft/simd/hc2cv.c
+     ./rdft/simd/hc2cv.c -> ./rdft/simd/hc2cfv.c
+    M ./rdft/simd/codelets/Makefile.am +51
+    A ./rdft/simd/hc2cbv.c
+    M ./rdft/simd/hc2cbv.h +29
+    M ./rdft/simd/hc2cfv.c +41
+    M ./rdft/simd/hc2cfv.h +29
+
+Thu Dec 21 23:58:33 EST 2006  athena@fftw.org
+  * Added SIMD r2cdft codelets.
+
+    A ./genfft/gen_hc2cdft_c.ml
+    M ./Makefile.am -14 +22
+    M ./configure.ac +5
+    M ./dft/simd/n1b.h -1
+    M ./dft/simd/n1f.h -1
+    M ./dft/simd/n2b.h -1
+    M ./dft/simd/n2f.h -1
+    M ./dft/simd/n2s.h -1
+    M ./genfft/Makefile.am -8 +14
+    M ./genfft/algsimp.ml -2 +12
+    M ./genfft/c.ml +1
+    M ./genfft/expr.ml -1 +3
+    M ./genfft/expr.mli -1 +1
+    M ./genfft/gen_hc2cdft_c.ml +217
+    M ./genfft/simd.ml -2 +9
+    M ./genfft/to_alist.ml +1
+    M ./kernel/ifftw.h +6
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/codelet-rdft.h +5
+    M ./rdft/codelets/hc2c.c -2 +12
+    M ./rdft/conf.c +4
+    M ./rdft/ct-hc2c-direct.c -5 +60
+    M ./rdft/hc2hc-direct.c -5
+    M ./rdft/hc2hc.h -2
+    M ./rdft/khc2hc.c -1
+    M ./simd/nonportable/sse.c -9 +4
+    M ./simd/nonportable/sse2.c -9 +4
+    M ./simd/simd-sse.h -4 +4
+    M ./simd/simd-sse2.h -4 +7
+    M ./simd/sse.c -2 +2
+    M ./simd/sse2.c -2 +2
+    M ./support/Makefile.codelets +1
+
+Thu Dec 21 21:19:21 EST 2006  athena@fftw.org
+  * Bug in buffering, grrr...
+
+    M ./rdft/ct-hc2c-direct.c -1 +1
+
+Thu Dec 21 20:58:14 EST 2006  athena@fftw.org
+  * Oops, memory leak.
+
+    M ./rdft/hc2hc-direct.c +1
+
+Thu Dec 21 17:12:31 EST 2006  athena@fftw.org
+  * minor changes, cleanup.
+
+    M ./rdft/ct-hc2c.c -2
+    M ./rdft/hc2hc.c -16 +10
+    M ./threads/hc2hc.c -17 +12
+
+Wed Dec 20 22:09:28 EST 2006  athena@fftw.org
+  * Unified hc2hc-direct, hc2hc-directbuf.  Cleanup.
+
+    M ./rdft/Makefile.am -4 +3
+    M ./rdft/ct-hc2c-direct.c -8 +8
+    R ./rdft/hc2hc-common.c
+    M ./rdft/hc2hc-direct.c -44 +153
+    R ./rdft/hc2hc-directbuf.c
+    M ./rdft/hc2hc.h -4
+
+Wed Dec 20 17:55:56 EST 2006  athena@fftw.org
+  * removed obsolete rdft2-radix2
+
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c -1
+    M ./rdft/rdft.h -1
+    R ./rdft/rdft2-radix2.c
+
+Wed Dec 20 17:51:20 EST 2006  athena@fftw.org
+  * implemented reduction rdft2->dft
+
+    A ./genfft/gen_hc2cdft.ml
+    M ./genfft/Makefile.am -9 +16
+    M ./genfft/gen_hc2c.ml -1 +1
+    M ./genfft/gen_hc2cdft.ml +207
+    M ./rdft/codelet-rdft.h -2 +7
+    M ./rdft/codelets/r2cb/Makefile.am -1 +12
+    M ./rdft/codelets/r2cf/Makefile.am -1 +12
+    M ./rdft/ct-hc2c-direct.c -5 +8
+    M ./rdft/ct-hc2c.c -24 +95
+    M ./rdft/ct-hc2c.h -2 +6
+    M ./rdft/khc2c.c -2 +3
+    M ./support/Makefile.codelets +1
+
+Wed Dec 20 09:29:39 EST 2006  athena@fftw.org
+  * Implemented buffered direct-r2c, direct-hc2c.
+  Also, removed some old cruft:
+  
+  * okp() functions were never used and a pain to maintain---now they
+    are gone.
+  
+  * ``m'' in hc2hc and hc2c codelets is now the number of iterations,
+    not the ``logical'' m.
+
+    M ./genfft/gen_hc2c.ml -5 +3
+    M ./genfft/gen_hc2hc.ml -9 +9
+    M ./genfft/gen_r2cb.ml -4 +1
+    M ./genfft/gen_r2cf.ml -4 +1
+    M ./genfft/gen_r2r.ml -5 +1
+    M ./rdft/codelet-rdft.h -30 +4
+    M ./rdft/codelets/hc2c.c -14 +2
+    M ./rdft/codelets/hfb.c -14 +2
+    M ./rdft/codelets/r2c.c -18 +4
+    M ./rdft/codelets/r2r.c -15 +1
+    M ./rdft/ct-hc2c-direct.c -37 +127
+    M ./rdft/direct-r2c.c -41 +189
+    M ./rdft/direct-r2r.c -15 +6
+    M ./rdft/direct2.c -20 +11
+    M ./rdft/hc2hc-direct.c -23 +9
+    M ./rdft/hc2hc-directbuf.c -79 +42
+    M ./rdft/kr2c.c +1
+    M ./rdft/rdft.h +1
+
+Tue Dec 19 17:07:04 EST 2006  stevenj@fftw.org
+  * added memcpy-loop rank0 solver (it makes a 5-20% difference for transposes of large tuples)
+
+    M ./rdft/rank0.c +33
+
+Tue Dec 19 16:15:54 EST 2006  stevenj@fftw.org
+  * new variable to disable libbench2's problem allocation during speed benchmarking (to benchmark MPI transforms where the array does not fit into the memory of a single process)
+
+    M ./libbench2/bench-user.h +2
+    M ./libbench2/speed.c -3 +8
+    M ./mpi/mpi-bench.c -5 +15
+
+Tue Dec 19 14:55:08 EST 2006  stevenj@fftw.org
+  * allow transpose-inplace to use input as scratch for DESTROY_INPUT plans (to avoid non-square in-place transpositions) ... on supersgj, the planner often prefers transpose-inplace to transpose-alltoall in this case (apparently MPI_Alltoall in LAM MPI isn't that great)
+
+    M ./mpi/transpose-inplace.c -52 +99
+
+Tue Dec 19 17:07:14 EST 2006  athena@fftw.org
+  * For some reason HB2 codelets were not generated.
+
+    M ./rdft/codelets/r2cb/Makefile.am -2 +2
+    M ./rdft/codelets/r2cf/Makefile.am -1 +1
+
+Tue Dec 19 15:12:39 EST 2006  athena@fftw.org
+  * split rdft/direct.c into direct-r2r and direct-r2c, since the file was getting out of control.
+
+    A ./rdft/direct-r2c.c
+    A ./rdft/direct-r2r.c
+    R ./rdft/direct.c
+    M ./rdft/Makefile.am -4 +5
+    M ./rdft/direct-r2c.c +195
+    M ./rdft/direct-r2r.c +154
+
+Tue Dec 19 02:59:35 EST 2006  stevenj@fftw.org
+  * added dft-rank1 solver - MPI now supports 1d complex DFTs!
+
+    M ./mpi/Makefile.am -2 +2
+    M ./mpi/api.c -9 +54
+    A ./mpi/choose-radix.c
+    M ./mpi/conf.c +1
+    M ./mpi/dft-problem.c -2 +2
+    A ./mpi/dft-rank1.c
+    M ./mpi/fftw3-mpi.h -2 +8
+    M ./mpi/ifftw-mpi.h +4
+    M ./mpi/mpi-bench.c -25 +24
+    M ./mpi/mpi-dft.h -4 +5
+    M ./mpi/mpi-transpose.h -3 +3
+    M ./mpi/transpose-problem.c -1 +1
+    M ./tests/bench.c -8 +8
+    M ./tests/fftw-bench.h -1 +1
+
+Tue Dec 19 01:27:20 EST 2006  stevenj@fftw.org
+  * fftw_flops must call cost_hook directly; iestimate_cost always uses COST_MAX
+
+    M ./api/flops.c -1 +7
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -4 +3
+
+Tue Dec 19 01:16:54 EST 2006  stevenj@fftw.org
+  * fftw_flops and fftw_estimate_cost must now be called from every process, to prevent deadlocks in the MPI code (since they sum/max the cost over all processes)
+
+    M ./tests/fftw-bench.c -7 +11
+
+Tue Dec 19 00:55:34 EST 2006  stevenj@fftw.org
+  * whoops, typo in assert
+
+    M ./mpi/block.c -1 +1
+
+Tue Dec 19 00:51:07 EST 2006  stevenj@fftw.org
+  * remove multiplication by FFT_SIGN
+
+    M ./kernel/trig.c -2 +7
+
+Mon Dec 18 22:51:45 EST 2006  stevenj@fftw.org
+  * need to synchronize ESTIMATE costs in MPI planner, and sum ESTIMATE costs for flop reporting: generalize measure_hook to cost_hook(..., {COST_SUM, COST_MAX})
+
+    M ./api/flops.c -1 +1
+    M ./kernel/ifftw.h -3 +6
+    M ./kernel/planner.c -8 +12
+    M ./mpi/api.c -6 +7
+
+Mon Dec 18 15:36:15 EST 2006  stevenj@fftw.org
+  * previous patch slowed down transpose-alltoall when TRANSPOSED_IN and DESTROY_INPUT; now allow planner to choose old behavior in this case 
+
+    M ./mpi/transpose-alltoall.c -11 +31
+
+Mon Dec 18 15:15:39 EST 2006  stevenj@fftw.org
+  * transpose-alltoall doesn't require input to be destroyed if TRANSPOSED_IN is set
+
+    M ./mpi/transpose-alltoall.c -26 +41
+
+Mon Dec 18 17:41:25 EST 2006  athena@fftw.org
+  * Added t2-style hc2c codelets, fixed typos.
+
+    M ./api/plan-guru-dft-c2r.h -1 +1
+    M ./api/plan-guru-dft-r2c.h -1 +1
+    M ./api/plan-guru-dft.h -2 +2
+    M ./rdft/codelets/r2cb/Makefile.am -1 +9
+    M ./rdft/codelets/r2cf/Makefile.am -1 +9
+
+Mon Dec 18 16:02:42 EST 2006  athena@fftw.org
+  * Renamed certain variables to avoid calling an output stride `is'.
+
+    M ./rdft/codelet-rdft.h -1 +1
+    M ./rdft/direct.c -17 +17
+    M ./rdft/direct2.c -7 +7
+
+Mon Dec 18 15:54:43 EST 2006  athena@fftw.org
+  * Oops---wrong stride
+
+    M ./rdft/ct-hc2c.c -1 +1
+
+Mon Dec 18 14:59:16 EST 2006  athena@fftw.org
+  * Oops, forgot to add file
+
+    A ./rdft/kr2c.c
+
+Mon Dec 18 14:17:02 EST 2006  athena@fftw.org
+  * Renamed r2hc/hc2r codelets to r2c
+  After the recent changes, r2hc/hc2r codelets became rdft2
+  problems, so I renamed them accordingly to r2cf/r2cb. 
+  Codelet parameters are now a real array and a complex array, instead
+  of an input array and an output array, and forward and backward
+  codelets have the same type, which removes some clutter from the rdft
+  code.
+
+     ./genfft/gen_hc2r.ml -> ./genfft/gen_r2cb.ml
+     ./genfft/gen_r2hc.ml -> ./genfft/gen_r2cf.ml
+     ./rdft/codelets/hc2r -> ./rdft/codelets/r2cb
+     ./rdft/codelets/r2hc -> ./rdft/codelets/r2cf
+    A ./rdft/codelets/r2c.c
+    A ./rdft/codelets/r2cb.h
+    A ./rdft/codelets/r2cbIII.h
+    A ./rdft/codelets/r2cf.h
+    A ./rdft/codelets/r2cfII.h
+    M ./Makefile.am -2 +2
+    M ./configure.ac -2 +2
+    M ./doc/fftw3.texi -1 +1
+    M ./genfft/Makefile.am -9 +9
+    M ./genfft/gen_r2cb.ml -58 +51
+    M ./genfft/gen_r2cf.ml -58 +51
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/codelet-rdft.h -59 +16
+    M ./rdft/codelets/Makefile.am -3 +3
+    R ./rdft/codelets/hc2r.c
+    R ./rdft/codelets/hc2r.h
+    R ./rdft/codelets/hc2rIII.h
+    M ./rdft/codelets/r2c.c +51
+    M ./rdft/codelets/r2cb/Makefile.am -21 +21
+    M ./rdft/codelets/r2cb.h +23
+    M ./rdft/codelets/r2cbIII.h +23
+    M ./rdft/codelets/r2cf/Makefile.am -20 +20
+    M ./rdft/codelets/r2cf.h +23
+    M ./rdft/codelets/r2cfII.h +23
+    R ./rdft/codelets/r2hc.c
+    R ./rdft/codelets/r2hc.h
+    R ./rdft/codelets/r2hcII.h
+    M ./rdft/conf.c -2 +2
+    M ./rdft/ct-hc2c.c -2 +2
+    M ./rdft/direct.c -46 +28
+    M ./rdft/direct2.c -72 +23
+    R ./rdft/khc2r.c
+    R ./rdft/kr2hc.c
+    M ./rdft/rdft.h -4 +2
+    M ./support/Makefile.codelets -2 +2
+
+Mon Dec 18 10:48:07 EST 2006  athena@fftw.org
+  * Implemented backward radix-2k rdft2.
+
+    A ./rdft/codelets/hc2cf.h
+    A ./rdft/codelets/hc2cb.h
+    M ./genfft/gen_hc2c.ml -26 +30
+    M ./rdft/codelets/Makefile.am -1 +1
+    M ./rdft/codelets/hc2c.c -1 +5
+    R ./rdft/codelets/hc2c.h
+    M ./rdft/codelets/hc2cb.h +23
+    M ./rdft/codelets/hc2cf.h +23
+    M ./rdft/codelets/hc2r/Makefile.am -1 +11
+    M ./rdft/codelets/r2hc/Makefile.am -7 +7
+    M ./rdft/ct-hc2c.c -3 +25
+    M ./rdft/rdft2-radix2.c -72 +1
+
+Mon Dec 18 13:56:09 EST 2006  stevenj@fftw.org
+  * move extract_reim into kernel, since it is used by internal MPI stuff and not just in the API code any more
+
+     ./api/extract-reim.c -> ./kernel/extract-reim.c
+    M ./api/Makefile.am -1 +1
+    M ./api/api.h -1 +1
+    M ./api/plan-many-dft-c2r.c -1 +1
+    M ./api/plan-many-dft-r2c.c -1 +1
+    M ./api/plan-many-dft.c -2 +2
+    M ./kernel/Makefile.am -5 +6
+    M ./kernel/extract-reim.c -6 +6
+    M ./kernel/ifftw.h +1
+    M ./mpi/dft-rank-geq2.c -3
+    M ./mpi/dft-rank1-bigvec.c -3
+    M ./mpi/dft-serial.c -3
+
+Mon Dec 18 08:40:14 EST 2006  athena@fftw.org
+  * Do not check r1==cr unless rnk>0
+
+    M ./rdft/problem2.c -1 +5
+
+Sun Dec 17 21:03:50 EST 2006  athena@fftw.org
+  * Implemented radix-2k RDFT2, forward only for now
+
+    M ./api/execute-dft-c2r.c -1 +2
+    M ./api/execute-dft-r2c.c -1 +2
+    M ./api/execute-split-dft-c2r.c -1 +2
+    M ./api/execute-split-dft-r2c.c -1 +2
+    M ./api/f77funcs.h -4 +8
+    M ./api/plan-guru-dft-c2r.h -8 +8
+    M ./api/plan-guru-dft-r2c.h -9 +9
+    M ./api/plan-guru-split-dft-c2r.h -6 +6
+    M ./api/plan-guru-split-dft-r2c.h -9 +9
+    M ./api/plan-many-dft-c2r.c -3 +3
+    M ./api/plan-many-dft-r2c.c -1 +1
+    M ./dft/problem.c -3 +5
+    M ./genfft/Makefile.am -5 +11
+    A ./genfft/gen_hc2c.ml
+    M ./genfft/gen_hc2hc.ml -30 +30
+    M ./genfft/gen_hc2r.ml -7 +16
+    M ./genfft/gen_r2hc.ml -8 +18
+    M ./rdft/Makefile.am -1 +2
+    M ./rdft/buffered2.c -31 +45
+    M ./rdft/codelet-rdft.h -11 +41
+    M ./rdft/codelets/Makefile.am -1 +1
+    A ./rdft/codelets/hc2c.c
+    A ./rdft/codelets/hc2c.h
+    M ./rdft/codelets/hc2r.c -4 +5
+    M ./rdft/codelets/hfb.c -4 +4
+    M ./rdft/codelets/r2hc.c -4 +5
+    M ./rdft/codelets/r2hc/Makefile.am -1 +11
+    A ./rdft/ct-hc2c-direct.c
+    A ./rdft/ct-hc2c.c
+    A ./rdft/ct-hc2c.h
+    M ./rdft/direct.c -16 +21
+    M ./rdft/direct2.c -15 +32
+    M ./rdft/hc2hc-common.c -2 +2
+    M ./rdft/hc2hc-direct.c -4 +4
+    M ./rdft/hc2hc-directbuf.c -8 +8
+    M ./rdft/hc2hc-generic.c -4 +3
+    M ./rdft/hc2hc.c -4 +4
+    A ./rdft/khc2c.c
+    M ./rdft/nop2.c -7 +10
+    M ./rdft/problem.c -2 +1
+    M ./rdft/problem2.c -38 +117
+    M ./rdft/rank-geq2-rdft2.c -13 +17
+    M ./rdft/rank0-rdft2.c -35 +44
+    M ./rdft/rdft.h -8 +27
+    M ./rdft/rdft2-inplace-strides.c -2 +6
+    M ./rdft/rdft2-radix2.c -131 +37
+    M ./rdft/solve2.c -1 +3
+    M ./rdft/vrank-geq1-rdft2.c -7 +7
+    M ./support/Makefile.codelets +1
+    M ./tests/hook.c -8 +23
+    M ./threads/vrank-geq1-rdft2.c -10 +10
+
+Mon Dec 18 01:23:45 EST 2006  stevenj@fftw.org
+  * separate TRANSPOSED/SCRAMBLED flags internally (this is required so that dft-rank1-bigvec and the future dft-rank1 won't have incompatible SCRAMBLED formats)
+
+    M ./mpi/api.c -16 +2
+    M ./mpi/dft-rank-geq2.c -7 +7
+    M ./mpi/dft-rank1-bigvec.c -8 +9
+    M ./mpi/dft-serial.c -1 +1
+    M ./mpi/ifftw-mpi.h -1 +8
+    M ./mpi/mpi-dft.h -1 +2
+    M ./mpi/mpi-transpose.h -2 +2
+    M ./mpi/transpose-alltoall.c -3 +4
+    M ./mpi/transpose-inplace.c -6 +7
+    M ./mpi/transpose-radix2.c -5 +6
+
+Mon Dec 18 01:02:27 EST 2006  stevenj@fftw.org
+  * ops_add -> ops_add2 where possible, to shrink code
+
+    M ./mpi/dft-rank-geq2.c -8 +4
+    M ./mpi/dft-rank1-bigvec.c -4 +2
+    M ./mpi/transpose-alltoall.c -6 +3
+    M ./mpi/transpose-inplace.c -8 +4
+    M ./mpi/transpose-radix2.c -10 +5
+    M ./rdft/vrank3-transpose.c -9 +5
+
+Mon Dec 18 00:43:02 EST 2006  stevenj@fftw.org
+  * added dft-rank1-bigvec solver (easy case for 1d parallel transforms)
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/api.c -2 +14
+    M ./mpi/conf.c +1
+    A ./mpi/dft-rank1-bigvec.c
+    M ./mpi/mpi-dft.h +1
+
+Sun Dec 17 20:42:21 EST 2006  stevenj@fftw.org
+  * rewrote MPI stuff to use dtensor data structure
+  
+  A dtensor is an ordered tuple of triplets (n, ib, ob) giving the size of
+  a dimension (n) and its input and output block sizes of a distributed
+  row-major multi-dimensional array.  An MPI DFT (etc.) is now specified
+  in terms of dtensors, which provide a much more flexible data layout. 
+  
+  For example, we can now describe multidimensional block distributions,
+  which are important if the number of processors is greater than the
+  size of any given dimension.  Currently, we only have solvers for
+  1d slab distributions, and this is all that is supported in the basic
+  and advanced APIs.  The guru API allows one to specify more general
+  distributions, however, which will be useful when/if we have solvers
+  for this case.
+  
+  We now also don't need a TRANSPOSED flag, at least internally, since
+  TRANSPOSED multi-dimensional DFT plans just correspond to dtensors
+  where the input and output block distributions are different.
+  
+  Other changes include the use of the XM(foo) macro for X(mpi_foo).
+  
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/any-true.c -1 +1
+    M ./mpi/api.c -129 +335
+    M ./mpi/block.c -32 +95
+    M ./mpi/conf.c -6 +6
+    M ./mpi/dft-problem.c -46 +33
+    M ./mpi/dft-rank-geq2.c -60 +72
+    M ./mpi/dft-serial.c -26 +20
+    M ./mpi/dft-solve.c -1 +1
+    A ./mpi/dtensor.c
+    M ./mpi/fftw3-mpi.h -24 +38
+    M ./mpi/ifftw-mpi.h -22 +60
+    M ./mpi/mpi-bench.c -115 +289
+    M ./mpi/mpi-dft.h -12 +13
+    M ./mpi/mpi-transpose.h -10 +11
+    M ./mpi/transpose-alltoall.c -17 +16
+    M ./mpi/transpose-inplace.c -24 +21
+    M ./mpi/transpose-problem.c -19 +18
+    M ./mpi/transpose-radix2.c -13 +11
+    M ./mpi/transpose-solve.c -1 +1
+    M ./tests/bench.c +4
+    M ./tests/fftw-bench.c +2
+    M ./tests/fftw-bench.h +1
+
+Fri Dec 15 16:01:23 EST 2006  athena@fftw.org
+  * Distinguished mutexes from semaphores.  
+  The distinction is useful because the linux implementation of
+  sem_post() in unnecessarily slow when semaphores are used for mutual
+  exclusion.  This change made spinlocks messier to implement, so I
+  excised them.
+
+    M ./tests/fftw-bench.c -8
+    M ./threads/threads.c -105 +40
+
+Fri Dec 15 12:46:11 EST 2006  athena@fftw.org
+  * Use posix semaphores where available.  
+  Paranoid declaration of all shared variables as ``volatile''.  Paranoid
+  initialization of all shared variables within locks.
+
+    M ./threads/threads.c -36 +68
+
+Wed Dec 13 20:12:13 EST 2006  Matteo Frigo <athena@fftw.org>
+  * paranoia
+
+    M ./threads/threads.c -2 +2
+
+Tue Dec 12 17:28:13 EST 2006  stevenj@fftw.org
+  * punt on detecting unsolvable rdft2 problems; make r==iio rdft2 problems unsolvable, since it doesn't look like we've consistently checked for this case and it's not clear why we would want to support it (it was also not documented in the manual)
+
+    M ./doc/fftw3.texi -1 +2
+    M ./rdft/problem2.c -30 +2
+
+Sat Dec  9 12:14:13 EST 2006  athena@fftw.org
+  * Obey stupid const rules
+
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -1 +1
+    M ./dft/dft.h -3 +3
+    M ./dft/indirect.c -4 +4
+    M ./dft/problem.c -9 +8
+    M ./kernel/ifftw.h -5 +5
+    M ./kernel/planner.c -2 +2
+    M ./kernel/problem.c -4 +4
+    M ./rdft/buffered2.c -1 +1
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/indirect.c -4 +4
+    M ./rdft/problem.c -14 +13
+    M ./rdft/problem2.c -9 +8
+    M ./rdft/rank-geq2-rdft2.c -1 +1
+    M ./rdft/rdft-dht.c -1 +1
+    M ./rdft/rdft.h -13 +13
+    M ./rdft/rdft2-radix2.c -5 +5
+
+Fri Dec  8 18:21:50 EST 2006  stevenj@fftw.org
+  * added unsolvable check for rdft2 problem
+  
+  An in-place rdft2 problem is ill-formed if the real data, including the 
+  extra "padding" elements, do not coincide with the complex data.
+  
+  CHANGE: the new code considers all in-place split r2c and c2r problems 
+  to be ill-formed.  Previously, these could be done, but only if the
+  entire multi-dimensional array fit into the buffer, which is kind of 
+  stupid.  I'm not sure it's worth it to even try to support the
+  split in-place r2c case.
+
+    M ./rdft/problem2.c -1 +33
+
+Fri Dec  8 13:47:53 EST 2006  stevenj@fftw.org
+  * check in-placeness after joining taints
+
+    M ./dft/problem.c -9 +9
+
+Fri Dec  8 13:43:44 EST 2006  athena@fftw.org
+  * Grrr... paranoid-check was not testing in exhaustive mode
+
+    M ./tests/Makefile.am -5 +5
+
+Fri Dec  8 10:00:30 EST 2006  Matteo Frigo <athena@fftw.org>
+  * Implemented PROBLEM_UNSOLVABLE.  
+  In-place DFT and RDFT problems with inconsistent I/O strides are
+  now unsolvable, and we don't check for them any longer in solvers.
+  
+  While I was at it, declared all problem pointers to be ``const''
+  for extra safety.
+
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -3 +5
+    M ./dft/buffered.c -4 +1
+    M ./dft/dft.h -4 +4
+    M ./dft/direct.c -8 +6
+    M ./dft/indirect.c -4 +4
+    M ./dft/problem.c -11 +15
+    M ./dft/vrank-geq1.c -3
+    M ./kernel/ifftw.h -6 +10
+    M ./kernel/planner.c -6 +6
+    M ./kernel/problem.c -1 +40
+    M ./rdft/buffered.c -4 +1
+    M ./rdft/buffered2.c -1 +1
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/direct.c -4 +3
+    M ./rdft/indirect.c -4 +4
+    M ./rdft/problem.c -14 +17
+    M ./rdft/problem2.c -10 +10
+    M ./rdft/rank-geq2-rdft2.c -1 +1
+    M ./rdft/rdft-dht.c -1 +1
+    M ./rdft/rdft.h -13 +13
+    M ./rdft/rdft2-radix2.c -5 +5
+    M ./rdft/vrank-geq1.c -2
+
+Thu Dec  7 20:13:46 EST 2006  Matteo Frigo <athena@fftw.org>
+  * Avoid qsort'ing one element.
+
+    M ./kernel/tensor7.c -6 +2
+
+Thu Dec  7 18:25:47 EST 2006  Matteo Frigo <athena@fftw.org>
+  * In-place vrank>=1 is now applicable only if the problem is really in-place.
+
+    M ./dft/buffered.c -7 +12
+    M ./dft/direct.c -11 +11
+    M ./dft/vrank-geq1.c +3
+    M ./kernel/ifftw.h +2
+    M ./kernel/tensor7.c +41
+    M ./rdft/buffered.c -7 +12
+    M ./rdft/direct.c -7 +5
+    M ./rdft/vrank-geq1.c +2
+
+Tue Dec  5 12:21:38 EST 2006  athena@fftw.org
+  * unused variable
+
+    M ./kernel/align.c +1
+
+Sun Dec  3 19:16:33 EST 2006  Matteo Frigo <athena@fftw.org>
+  * Removed CVS $Id$ everywhere, since darcs does not update them.
+
+    M ./api/fftw3.h -1
+    M ./api/version.c -1
+    M ./commercialize.sh +1
+    M ./dft/buffered.c -1
+    M ./dft/codelet-dft.h -1
+    M ./dft/codelets/standard/Makefile.am +1
+    M ./dft/conf.c -1
+    M ./dft/ct.c -1
+    M ./dft/dft.h -1
+    M ./dft/dftw-direct.c -1
+    M ./dft/dftw-directsq.c -1
+    M ./dft/direct.c -1
+    M ./dft/indirect.c -1
+    M ./dft/kdft-dif.c -1
+    M ./dft/kdft-difsq.c -1
+    M ./dft/kdft-dit.c -1
+    M ./dft/kdft.c -1
+    M ./dft/nop.c -1
+    M ./dft/plan.c -1
+    M ./dft/problem.c -1
+    M ./dft/rank-geq2.c -1
+    M ./dft/solve.c -1
+    M ./dft/vrank-geq1.c -1
+    M ./dft/zero.c -1
+    M ./doc/fftw3.texi -1
+    M ./genfft/algsimp.ml -2
+    M ./genfft/algsimp.mli -2
+    M ./genfft/annotate.ml -2
+    M ./genfft/annotate.mli -1
+    M ./genfft/assoctable.ml -1
+    M ./genfft/assoctable.mli -1
+    M ./genfft/c.ml -1
+    M ./genfft/c.mli -1
+    M ./genfft/complex.ml -1
+    M ./genfft/complex.mli -1
+    M ./genfft/conv.mli -1
+    M ./genfft/dag.ml -1
+    M ./genfft/dag.mli -1
+    M ./genfft/expr.ml -1
+    M ./genfft/expr.mli -1
+    M ./genfft/fft.ml -2
+    M ./genfft/fft.mli -2
+    M ./genfft/gen_conv.ml -3 +1
+    M ./genfft/gen_hc2hc.ml -3 +1
+    M ./genfft/gen_hc2r.ml -3 +1
+    M ./genfft/gen_mdct.ml -3 +1
+    M ./genfft/gen_notw.ml -3 +1
+    M ./genfft/gen_notw_c.ml -3 +1
+    M ./genfft/gen_r2hc.ml -3 +1
+    M ./genfft/gen_r2r.ml -3 +1
+    M ./genfft/gen_twiddle.ml -3 +1
+    M ./genfft/gen_twiddle_c.ml -3 +1
+    M ./genfft/gen_twidsq.ml -3 +1
+    M ./genfft/gen_twidsq_c.ml -3 +1
+    M ./genfft/genutil.ml -8 +1
+    M ./genfft/littlesimp.ml -1
+    M ./genfft/littlesimp.mli -1
+    M ./genfft/magic.ml -1
+    M ./genfft/monads.ml -1
+    M ./genfft/number.ml -1
+    M ./genfft/number.mli -1
+    M ./genfft/oracle.ml -1
+    M ./genfft/oracle.mli -1
+    M ./genfft/schedule.ml -1
+    M ./genfft/schedule.mli -1
+    M ./genfft/simd.ml -1
+    M ./genfft/simd.mli -1
+    M ./genfft/simdmagic.ml -1
+    M ./genfft/to_alist.ml -1
+    M ./genfft/to_alist.mli -1
+    M ./genfft/trig.ml -1
+    M ./genfft/trig.mli -1
+    M ./genfft/twiddle.ml -1
+    M ./genfft/twiddle.mli -1
+    M ./genfft/unique.ml -1
+    M ./genfft/unique.mli -1
+    M ./genfft/util.ml -1
+    M ./genfft/util.mli -1
+    M ./genfft/variable.ml -1
+    M ./genfft/variable.mli -1
+    M ./kernel/align.c -1
+    M ./kernel/alloc.c -1
+    M ./kernel/assert.c -1
+    M ./kernel/awake.c -1
+    M ./kernel/cycle.h -1
+    M ./kernel/debug.c -1
+    M ./kernel/iabs.c -1
+    M ./kernel/ifftw.h -1
+    M ./kernel/kalloc.c -1
+    M ./kernel/minmax.c -1
+    M ./kernel/ops.c -1
+    M ./kernel/pickdim.c -1
+    M ./kernel/plan.c -1
+    M ./kernel/planner.c -1
+    M ./kernel/primes.c -1
+    M ./kernel/print.c -1
+    M ./kernel/problem.c -1
+    M ./kernel/scan.c -1
+    M ./kernel/solver.c -1
+    M ./kernel/solvtab.c -1
+    M ./kernel/stride.c -1
+    M ./kernel/tensor.c -1
+    M ./kernel/tensor1.c -1
+    M ./kernel/tensor2.c -1
+    M ./kernel/tensor4.c -1
+    M ./kernel/tensor5.c -1
+    M ./kernel/tensor7.c -1
+    M ./kernel/tensor8.c -1
+    M ./kernel/tensor9.c -1
+    M ./kernel/timer.c -1
+    M ./kernel/trig.c -1
+    M ./kernel/twiddle.c -1
+    M ./libbench/accopy-from.c -1
+    M ./libbench/accopy-to.c -1
+    M ./libbench/allocate.c -1
+    M ./libbench/bench-main.c -1
+    M ./libbench/bench-user.h -1
+    M ./libbench/bench.h -1
+    M ./libbench/can-do.c -1
+    M ./libbench/ccopy-from.c -1
+    M ./libbench/ccopy-to.c -1
+    M ./libbench/deallocate.c -1
+    M ./libbench/getopt-utils.c -1
+    M ./libbench/info.c -1
+    M ./libbench/main.c -1
+    M ./libbench/prime.c -1
+    M ./libbench/problem.c -1
+    M ./libbench/report.c -1
+    M ./libbench/speed.c -1
+    M ./libbench/timer.c -1
+    M ./libbench/verify.c -1
+    M ./libbench/zero.c -1
+    M ./libbench2/aligned-main.c -1
+    M ./libbench2/allocate.c -1
+    M ./libbench2/bench-main.c -1
+    M ./libbench2/bench-user.h -1
+    M ./libbench2/bench.h -1
+    M ./libbench2/can-do.c -1
+    M ./libbench2/dotens2.c -1
+    M ./libbench2/info.c -1
+    M ./libbench2/main.c -1
+    M ./libbench2/problem.c -1
+    M ./libbench2/report.c -1
+    M ./libbench2/speed.c -1
+    M ./libbench2/tensor.c -1
+    M ./libbench2/timer.c -1
+    M ./libbench2/useropt.c -1
+    M ./libbench2/verify-dft.c -1
+    M ./libbench2/verify-lib.c -1
+    M ./libbench2/verify-rdft2.c -1
+    M ./libbench2/verify.c -1
+    M ./libbench2/zero.c -1
+    M ./mpi/conf.c -1
+    M ./rdft/buffered.c -1
+    M ./rdft/buffered2.c -1
+    M ./rdft/codelets/hc2r/Makefile.am +1
+    M ./rdft/codelets/r2hc/Makefile.am +1
+    M ./rdft/conf.c -1
+    M ./rdft/dft-r2hc.c -1
+    M ./rdft/dht-r2hc.c -1
+    M ./rdft/direct.c -1
+    M ./rdft/direct2.c -1
+    M ./rdft/hc2hc-direct.c -1
+    M ./rdft/hc2hc-directbuf.c -1
+    M ./rdft/indirect.c -1
+    M ./rdft/khc2hc.c -1
+    M ./rdft/khc2r.c -1
+    M ./rdft/kr2hc.c -1
+    M ./rdft/kr2r.c -1
+    M ./rdft/nop.c -1
+    M ./rdft/nop2.c -1
+    M ./rdft/plan.c -1
+    M ./rdft/plan2.c -1
+    M ./rdft/problem.c -1
+    M ./rdft/problem2.c -1
+    M ./rdft/rank-geq2-rdft2.c -1
+    M ./rdft/rank-geq2.c -1
+    M ./rdft/rank0-rdft2.c -1
+    M ./rdft/rank0.c -1
+    M ./rdft/rdft-dht.c -1
+    M ./rdft/rdft2-inplace-strides.c -1
+    M ./rdft/rdft2-radix2.c -1
+    M ./rdft/rdft2-tensor-max-index.c -1
+    M ./rdft/solve.c -1
+    M ./rdft/solve2.c -1
+    M ./rdft/vrank-geq1-rdft2.c -1
+    M ./rdft/vrank-geq1.c -1
+    M ./rdft/vrank3-transpose.c -1
+    M ./reodft/conf.c -1
+    M ./reodft/redft00e-r2hc-pad.c -1
+    M ./reodft/redft00e-r2hc.c -1
+    M ./reodft/reodft00e-splitradix.c -1
+    M ./reodft/reodft010e-r2hc.c -1
+    M ./reodft/reodft11e-r2hc-odd.c -1
+    M ./reodft/reodft11e-r2hc.c -1
+    M ./reodft/reodft11e-radix2.c -1
+    M ./reodft/rodft00e-r2hc-pad.c -1
+    M ./reodft/rodft00e-r2hc.c -1
+    M ./simd/altivec.c -1
+    M ./simd/nonportable/sse.c -1
+    M ./simd/nonportable/sse2.c -1
+    M ./simd/sse.c -1
+    M ./simd/sse2.c -1
+    M ./simd/taint.c -1
+    M ./simd/x86-cpuid.h -1
+    M ./threads/conf.c -1
+    M ./threads/ct.c -1
+    M ./threads/dft-vrank-geq1.c -1
+    M ./threads/rdft-vrank-geq1.c -1
+    M ./threads/vrank-geq1-rdft2.c -1
+
+Sun Dec  3 16:11:17 EST 2006  Matteo Frigo <athena@fftw.org>
+  * generalized dftw to encompass q codelets.  As a side effect, q codelets are now threaded.
+
+    R ./dft/ctsq.c
+    A ./dft/dftw-directsq.c
+    M ./dft/Makefile.am -2 +2
+    M ./dft/ct.c -25 +55
+    M ./dft/ct.h -6 +9
+    M ./dft/dftw-direct.c -59 +72
+    M ./dft/dftw-directsq.c +163
+    M ./dft/dftw-generic.c -24 +32
+    M ./dft/dftw-genericbuf.c -30 +37
+    M ./dft/indirect-transpose.c -1 +1
+    M ./dft/kdft-difsq.c -1 +1
+    M ./libbench2/my-getopt.c -3 +3
+    M ./threads/ct.c -28 +53
+
+Sat Nov 25 16:34:38 EST 2006  stevenj@fftw.org
+  * add missing __declspec attribute to threads API functions when compiling for Windows (thanks to Robert O. Morris for the bug report)
+
+    M ./api/f77api.c -29 +1
+    M ./api/x77.h +28
+    M ./threads/f77api.c +13
+    M ./threads/f77funcs.h -3 +3
+
+Mon Nov 20 17:39:20 EST 2006  stevenj@fftw.org
+  * add AC_SUBST to AX_OPENMP, thanks to Sebastien Maret for the suggestion
+
+    M ./m4/ax_openmp.m4 -1 +2
+
+Mon Oct 16 23:02:29 EDT 2006  stevenj@fftw.org
+  * not gcc bug for MIPS (thanks to Jonathan Day)
+
+    M ./doc/FAQ/fftw-faq.bfnn +4
+
+Sat Sep 23 17:52:36 EDT 2006  stevenj@fftw.org
+  * in maintainer/debug mode, don't modify CFLAGS if they were explicitly set (-pedantic seems to cause problems with LAM's mpicc, so I need a way to override)
+
+    M ./configure.ac +2
+
+Wed Nov 22 18:14:47 EST 2006  athena@fftw.org
+  * Removed obsolete comment.
+
+    M ./dft/dftw-generic.c -1
+
+Sun Nov 19 11:21:44 EST 2006  athena@fftw.org
+  * Use p->v when comparing TW_FULL fields.
+
+    M ./kernel/twiddle.c -2 +1
+
+Thu Nov 16 14:49:05 EST 2006  athena@fftw.org
+  * removed useless definition
+
+    M ./simd/simd.h -1
+
+Mon Nov 13 09:18:32 EST 2006  athena@fftw.org
+  * paranoid avoidance of integer overflows
+
+    M ./libbench2/timer.c -3 +3
+
+Mon Nov 13 09:00:11 EST 2006  athena@fftw.org
+  * avoid potential overflows in cycle counters
+  At the suggestion of Alex Cichowski, convert all ticks
+  to double before operating on them, to avoid potential
+  signed/unsigned confusion and integer overflow.
+
+    M ./kernel/cycle.h -5 +8
+
+Sun Nov  5 09:00:52 EST 2006  Matteo Frigo <athena@fftw.org>
+  * Removed unused struct field
+
+    M ./threads/threads.c -1
+
+Sat Nov  4 09:43:13 EST 2006  Matteo Frigo <athena@fftw.org>
+  * use pthread condition variables instead of semaphores
+  Condition variables are more likely to be portable everywhere, and
+  somehow they appear to introduce less overhead at least on my
+  linux box.
+
+    M ./threads/threads.c -6 +19
+
+Tue Oct 31 20:45:24 EST 2006  Matteo Frigo <athena@fftw.org>
+  * Imprecise help message.
+
+    M ./libbench2/my-getopt.c -1 +1
+
+Mon Oct 30 20:13:35 EST 2006  athena@fftw.org
+  * Experimental implementation of spinlocks.
+  This patch implements spinlocks via a semi-portable hack, and adds
+  the -ospinlocks option to the bench program so that we can play with them.
+
+    M ./tests/fftw-bench.c +8
+    M ./threads/threads.c -28 +104
+
+Thu Oct 26 22:29:18 EDT 2006  athena@fftw.org
+  * Updated manual for new openmp configure options.
+
+    M ./doc/fftw3.texi -20 +17
+
+Thu Oct 26 21:52:39 EDT 2006  athena@fftw.org
+  * Added back openmp.
+  Rationalized threads naming conventions:
+  
+    * threads explicitly managed by us are enabled by --enable-threads,
+      predicated on HAVE_THREADS, etc.
+    * openmp is enabled by --enable-openmp, predicated on HAVE_OPENMP, etc.
+    * SMP denotes either THREADS or OPENMP.
+
+    M ./Makefile.am -1 +1
+    M ./configure.ac -10 +27
+    M ./kernel/alloc.c -1 +1
+    M ./kernel/ifftw.h -1 +1
+    M ./mpi/Makefile.am -1 +1
+    M ./tests/Makefile.am -5 +5
+    M ./tests/fftw-bench.c -3 +7
+    M ./threads/Makefile.am -2 +2
+    M ./threads/api.c -1 +16
+    M ./threads/conf.c -2 +2
+    A ./threads/openmp.c
+    M ./threads/threads.c -13 +19
+    M ./threads/threads.h -2 +2
+    M ./tools/Makefile.am -1 +1
+    M ./tools/fftw-wisdom.c -5 +9
+
+Mon Oct 23 20:14:31 EDT 2006  athena@fftw.org
+  * different thread protocols
+
+    M ./threads/threads.c -80 +71
+
+Sun Oct 22 14:49:32 EDT 2006  athena@fftw.org
+  * fix memory leak
+  Added pthread_attr_destroy to avoid memory leak.
+
+    M ./threads/threads.c +1
+
+Sun Oct 22 14:23:30 EDT 2006  athena@fftw.org
+  * Experimental new pthread implementation that recycles threads.
+
+    M ./threads/threads.c -528 +155
+
+Tue Oct 24 23:28:10 EDT 2006  athena@fftw.org
+  * switched buddies
+  Switched order of buddies in rdft2 rank-geq2 for consistency
+  with analogous dft and rdft solvers.  Furthermore, this change reduces
+  the MEASURE planning time for rank == 3.
+
+    M ./rdft/rank-geq2-rdft2.c -4 +1
+
+Sat Oct 21 10:10:00 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * Typo.
+
+    M ./tools/fftw_wisdom.1.in -1 +1
+
+Sat Oct 21 09:56:50 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * Out of place is the default.  Thanks to Kirk Kern for pointing this out.
+
+    M ./tests/README -2 +2
+
+Fri Sep 29 01:36:11 EDT 2006  stevenj@fftw.org
+  * rename "test" to "tst", since a user (Igor Levicki) reports that "test" is a reserved words in some x86 assemblers
+
+    M ./simd/x86-cpuid.h -10 +10
+
+Tue Sep 26 09:01:08 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * Stylistic change.
+
+    M ./kernel/timer.c -2 +1
+
+Tue Sep 26 08:45:37 EDT 2006  athena@fftw.org
+  * Do not set tmin=1e10, since a large FFT may take longer than that.
+
+    M ./kernel/timer.c -5 +4
+
+Sat Sep 23 22:07:10 EDT 2006  athena@fftw.org
+  * Disable certain gcc optimizations.
+  When PRECOMPUTE_ARRAY_INDICES is #define'd, array indices have the
+  form array[stride[k]] for compile-time constant k.  Apparently new
+  gcc's copy stride[k] onto the stack before the codelet loop, which is
+  an idiotic optimization if ever there was one.  This patch confuses
+  gcc enough to prevent this optimization.
+
+    M ./kernel/ifftw.h -3 +6
+    M ./kernel/stride.c -1 +1
+
+Sat Sep 23 13:02:58 EDT 2006  stevenj@fftw.org
+  * re-enable TOMS algorithm - it is the best for large vector lengths, since for such sizes the cache line is not an issue and the bookkeeping overhead is negligible
+
+    M ./rdft/vrank3-transpose.c -18 +15
+
+Thu Sep 21 15:40:15 EDT 2006  stevenj@fftw.org
+  * add measure_hook so that MPI can synchronize timing measurements (otherwise different processors might end up with different MPI plans, yikes!)
+
+    M ./kernel/ifftw.h +2
+    M ./kernel/planner.c +5
+    M ./mpi/api.c -2 +26
+
+Tue Sep 19 21:26:19 EDT 2006  stevenj@fftw.org
+  * added O(p log p) transpose algorithm (radix 2)
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/conf.c +1
+    M ./mpi/mpi-transpose.h +1
+    A ./mpi/transpose-radix2.c
+
+Tue Sep 19 21:05:09 EDT 2006  stevenj@fftw.org
+  * comments
+
+    M ./mpi/testsched.c -8 +10
+
+Tue Sep 19 19:54:58 EDT 2006  stevenj@fftw.org
+  * whoops
+
+     ./mpi/any_true.c -> ./mpi/any-true.c
+    M ./mpi/Makefile.am -1 +1
+
+Tue Sep 19 19:49:01 EDT 2006  stevenj@fftw.org
+  * synchronize planning so that if one process fails to create a plan then all of them do.
+
+    M ./mpi/Makefile.am -1 +1
+    A ./mpi/any_true.c
+    M ./mpi/dft-rank-geq2.c -4 +4
+    M ./mpi/dft-serial.c -1 +1
+    M ./mpi/ifftw-mpi.h +3
+    M ./mpi/transpose-alltoall.c -5 +5
+    M ./mpi/transpose-inplace.c -5 +5
+
+Tue Sep 19 18:17:38 EDT 2006  stevenj@fftw.org
+  * call MPI_Alltoall instead of MPI_Alltoallv for equal-blocks case, in case MPI implementation has special optimizations for the common case of equal sizes
+
+    M ./mpi/transpose-alltoall.c -6 +19
+
+Tue Sep 19 12:07:35 EDT 2006  stevenj@fftw.org
+  * whoops
+
+    M ./tools/Makefile.am -1 +1
+
+Tue Sep 19 02:20:06 EDT 2006  stevenj@fftw.org
+  * typo in comment
+
+    M ./mpi/testsched.c -3 +3
+
+Tue Sep 19 01:58:55 EDT 2006  stevenj@fftw.org
+  * more filename simplifications
+
+     ./mpi/mpi-dft-rank-geq2.c -> ./mpi/dft-rank-geq2.c
+     ./mpi/mpi-dft-serial.c -> ./mpi/dft-serial.c
+     ./mpi/mpi-transpose-alltoall.c -> ./mpi/transpose-alltoall.c
+     ./mpi/mpi-transpose-inplace.c -> ./mpi/transpose-inplace.c
+     ./mpi/problem.c -> ./mpi/dft-problem.c
+     ./mpi/solve.c -> ./mpi/dft-solve.c
+     ./mpi/tproblem.c -> ./mpi/transpose-problem.c
+     ./mpi/tsolve.c -> ./mpi/transpose-solve.c
+    M ./mpi/Makefile.am -3 +3
+
+Tue Sep 19 01:49:52 EDT 2006  stevenj@fftw.org
+  * canonicalize file names (hyphens, not underscores)
+
+     ./mpi/fftw3_mpi.h -> ./mpi/fftw3-mpi.h
+     ./mpi/ifftw_mpi.h -> ./mpi/ifftw-mpi.h
+     ./mpi/mpi_bench.c -> ./mpi/mpi-bench.c
+     ./mpi/mpi_dft.h -> ./mpi/mpi-dft.h
+     ./mpi/mpi_transpose.h -> ./mpi/mpi-transpose.h
+     ./mpi/mpi_transpose_alltoall.c -> ./mpi/mpi-transpose-alltoall.c
+     ./mpi/mpi_transpose_inplace.c -> ./mpi/mpi-transpose-inplace.c
+     ./tests/fftw_bench.h -> ./tests/fftw-bench.h
+     ./tests/fftw_bench_common.c -> ./tests/fftw-bench.c
+    M ./mpi/Makefile.am -5 +5
+    M ./mpi/api.c -4 +4
+    M ./mpi/block.c -1 +1
+    M ./mpi/conf.c -2 +2
+    M ./mpi/mpi-bench.c -2 +2
+    M ./mpi/mpi-dft-rank-geq2.c -2 +2
+    M ./mpi/mpi-dft-serial.c -1 +1
+    M ./mpi/mpi-dft.h -1 +1
+    M ./mpi/mpi-transpose-alltoall.c -1 +1
+    M ./mpi/mpi-transpose-inplace.c -1 +1
+    M ./mpi/mpi-transpose.h -1 +1
+    M ./mpi/problem.c -1 +1
+    M ./mpi/solve.c -1 +1
+    M ./mpi/tproblem.c -1 +1
+    M ./mpi/tsolve.c -1 +1
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c -2 +2
+    M ./tests/fftw-bench.c -1 +1
+
+Tue Sep 19 01:34:07 EDT 2006  stevenj@fftw.org
+  * add mpi-dft-serial
+
+    M ./mpi/Makefile.am -1 +1
+    M ./mpi/conf.c +1
+    M ./mpi/mpi-dft-rank-geq2.c -3 +9
+    A ./mpi/mpi-dft-serial.c
+    M ./mpi/mpi_dft.h +1
+    M ./mpi/mpi_transpose_alltoall.c -1 +1
+    M ./mpi/mpi_transpose_inplace.c -1 +1
+
+Tue Sep 19 01:21:47 EDT 2006  stevenj@fftw.org
+  * silence warnings
+
+    M ./dft/indirect-transpose.c -1
+    M ./libbench2/problem.c -2
+
+Tue Sep 19 00:31:59 EDT 2006  stevenj@fftw.org
+  * make "t" problem semantics match FFTW_MPI_TRANSPOSED
+
+    M ./libbench2/problem.c -4 +3
+    M ./mpi/mpi_bench.c -8 +18
+
+Mon Sep 18 23:50:43 EDT 2006  stevenj@fftw.org
+  * whoops, fixed backwards mpi_dft
+
+    M ./mpi/mpi-dft-rank-geq2.c -7 +7
+
+Mon Sep 18 22:26:31 EDT 2006  stevenj@fftw.org
+  * initial stab at rank-geq2 mpi-dft; seems to be mostly working
+
+    M ./libbench2/bench-user.h -1 +1
+    M ./mpi/Makefile.am -1 +4
+    M ./mpi/api.c -26 +180
+    M ./mpi/conf.c +2
+    M ./mpi/fftw3_mpi.h -11 +51
+    M ./mpi/ifftw_mpi.h -4 +1
+    A ./mpi/mpi-dft-rank-geq2.c
+    M ./mpi/mpi_bench.c -55 +166
+    M ./mpi/mpi_dft.h -2 +2
+    M ./mpi/mpi_transpose.h -6 +1
+    M ./mpi/mpi_transpose_alltoall.c -7 +10
+    M ./mpi/mpi_transpose_inplace.c -7 +8
+    M ./mpi/problem.c -2 +7
+    A ./mpi/solve.c
+    M ./mpi/tproblem.c -2 +2
+
+Sun Sep 17 13:41:32 EDT 2006  stevenj@fftw.org
+  * support SCRAMBLED_OUT in alltoall transpose
+
+    M ./kernel/ifftw.h +4
+    M ./kernel/tensor2.c +20
+    M ./mpi/ifftw_mpi.h -1 +16
+    M ./mpi/mpi_dft.h -1 +1
+    M ./mpi/mpi_transpose_alltoall.c -23 +48
+    M ./mpi/mpi_transpose_inplace.c -23 +2
+    M ./mpi/problem.c -2 +3
+
+Sun Sep 17 12:34:30 EDT 2006  stevenj@fftw.org
+  * skeleton of future support for block-cyclic
+
+    M ./mpi/block.c -7 +27
+    M ./mpi/ifftw_mpi.h +1
+    M ./mpi/mpi_transpose_alltoall.c +2
+    M ./mpi/mpi_transpose_inplace.c -2 +5
+
+Sun Sep 17 12:11:19 EDT 2006  stevenj@fftw.org
+  * test program now checks scrambled in/out via -obflag=28/29
+
+    M ./mpi/api.c -1 +1
+    M ./mpi/fftw3_mpi.h -1 +1
+    M ./mpi/mpi_bench.c -13 +59
+    M ./mpi/problem.c -1
+
+Sun Sep 17 11:58:36 EDT 2006  stevenj@fftw.org
+  * added -obflag to make it easier to set high-order bits
+
+    M ./tests/fftw_bench_common.c +1
+
+Sun Sep 17 01:30:51 EDT 2006  stevenj@fftw.org
+  * use proper child plans for 2nd transpose in transpose_alltoall; implement opcount in transpose_inplace
+
+    M ./mpi/mpi_transpose_alltoall.c -47 +47
+    M ./mpi/mpi_transpose_inplace.c -1 +10
+
+Sun Sep 17 01:08:01 EDT 2006  stevenj@fftw.org
+  * fix in test program for transposes of vectors -- transpose routines seem to completely work now (except for scrambled in/out, which is untested)
+
+    M ./mpi/mpi_bench.c -1 +1
+
+Sun Sep 17 01:01:16 EDT 2006  stevenj@fftw.org
+  * fixed bug in transpose_alltoall for unequal blocks
+
+    M ./mpi/mpi_transpose_alltoall.c -2 +2
+    M ./mpi/mpi_transpose_inplace.c -2 +2
+
+Sat Sep 16 15:29:46 EDT 2006  stevenj@fftw.org
+  * correctly handle cld2rest
+
+    M ./mpi/mpi_transpose_inplace.c -24 +28
+
+Sat Sep 16 15:29:31 EDT 2006  stevenj@fftw.org
+  * some debugging code and other fixes
+
+    M ./mpi/mpi_bench.c -13 +34
+
+Sat Sep 16 14:54:30 EDT 2006  stevenj@fftw.org
+  * whoops, forgot to check in mpi_bench.c file
+
+    A ./mpi/mpi_bench.c
+
+Sat Sep 16 14:54:02 EDT 2006  stevenj@fftw.org
+  * added bench_exit routine so that it can be overridden (by MPI_Abort) if needed
+
+    M ./libbench2/Makefile.am -5 +5
+    A ./libbench2/bench-exit.c
+    M ./libbench2/bench-user.h +1
+    M ./libbench2/util.c -1 +1
+    M ./libbench2/verify-lib.c -1 +1
+    M ./libbench2/verify-r2r.c -1 +1
+
+Sat Sep 16 14:52:56 EDT 2006  stevenj@fftw.org
+  * bug fix in mpi_transpose_inplace for case where some processors are idle
+
+    M ./mpi/mpi_transpose_inplace.c -3 +4
+
+Fri Sep 15 18:47:13 EDT 2006  stevenj@fftw.org
+  * allow vecloop for sz->rnk==0 in exceptional (SLOW) cases, e.g. it is necessary for loops of non-square transposes (otherwise e.g. ik1v5:200:200x10:20:1x20:1:10 planning fails)
+
+    M ./rdft/vrank-geq1.c -2 +6
+
+Thu Sep 14 23:36:48 EDT 2006  stevenj@fftw.org
+  * first pass at working mpi_bench test program; transpose seems to work iff dimensions are divisible by #processors
+
+    M ./libbench2/Makefile.am -1 +3
+    A ./libbench2/after-ccopy-from.c
+    A ./libbench2/after-ccopy-to.c
+    A ./libbench2/after-hccopy-from.c
+    A ./libbench2/after-hccopy-to.c
+    A ./libbench2/after-rcopy-from.c
+    A ./libbench2/after-rcopy-to.c
+    M ./libbench2/bench-main.c -4 +6
+    M ./libbench2/bench-user.h +10
+    M ./libbench2/verify-dft.c +2
+    M ./libbench2/verify-r2r.c +4
+    M ./libbench2/verify-rdft2.c +4
+    M ./mpi/api.c -2 +6
+
+Wed Sep 13 17:28:07 EDT 2006  stevenj@fftw.org
+  * whoops
+
+    M ./mpi/mpi_transpose_inplace.c -2 +2
+
+Tue Sep 12 22:27:03 EDT 2006  stevenj@fftw.org
+  * initial stub for mpi_bench
+
+    M ./Makefile.am -2 +2
+    M ./libbench2/bench-main.c +3
+    M ./libbench2/bench-user.h +1
+    M ./libbench2/verify-lib.c -1 +1
+    M ./mpi/Makefile.am -1 +14
+    M ./tests/bench.c +9
+    M ./tests/fftw_bench.h +1
+    M ./tests/fftw_bench_common.c +2
+
+Tue Sep 12 21:54:31 EDT 2006  stevenj@fftw.org
+  * do no output at all if verbose < 0 (for use with MPI, where we only want output from process 0)
+
+    M ./libbench2/bench-main.c -1 +1
+    M ./libbench2/bench.h +1
+    M ./libbench2/ovtpvt.c -1 +15
+    M ./libbench2/useropt.c -1 +1
+    M ./libbench2/util.c -2 +1
+    M ./libbench2/verify-lib.c -5 +4
+    M ./libbench2/verify-r2r.c -4 +4
+
+Tue Sep 12 21:39:15 EDT 2006  stevenj@fftw.org
+  * whoops
+
+    M ./tests/fftw_bench_common.c +3
+
+Tue Sep 12 21:31:40 EDT 2006  stevenj@fftw.org
+  * split bench.c into bench.c and fftw_bench_common.c so that we can re-use some of the code in the MPI test program
+
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c -210 +6
+    A ./tests/fftw_bench.h
+    A ./tests/fftw_bench_common.c
+    M ./tools/Makefile.am -1 +2
+
+Tue Sep 12 21:00:36 EDT 2006  stevenj@fftw.org
+  * MPI stuff at least compiles now
+
+    M ./configure.ac -9 +11
+    M ./kernel/ifftw.h +7
+    M ./mpi/api.c -3 +4
+    M ./mpi/fftw3_mpi.h +1
+    M ./mpi/mpi_transpose_alltoall.c -10 +12
+    M ./mpi/mpi_transpose_inplace.c -11 +14
+    M ./mpi/problem.c -5 +4
+    M ./mpi/tproblem.c -4 +3
+
+Mon Sep 11 22:26:36 EDT 2006  stevenj@fftw.org
+  * initial (nonfunctional) start at MPI support (similar to FFTW 2.x in spirit, but mostly rewritten)
+
+    A ./mpi/
+    M ./Makefile.am -1 +1
+    M ./configure.ac +11
+    A ./m4/acx_mpi.m4
+    A ./mpi/Makefile.am
+    A ./mpi/api.c
+    A ./mpi/block.c
+    A ./mpi/conf.c
+    A ./mpi/fftw3_mpi.h
+    A ./mpi/ifftw_mpi.h
+    A ./mpi/mpi_dft.h
+    A ./mpi/mpi_transpose.h
+    A ./mpi/mpi_transpose_alltoall.c
+    A ./mpi/mpi_transpose_inplace.c
+    A ./mpi/problem.c
+    A ./mpi/testsched.c
+    A ./mpi/tproblem.c
+    A ./mpi/tsolve.c
+
+Mon Sep 11 22:25:38 EDT 2006  stevenj@fftw.org
+  * make X(plan_awake) work for NULL argument to reduce code size
+
+    M ./kernel/plan.c -3 +5
+    M ./rdft/vrank3-transpose.c -3 +3
+
+Fri Sep 15 23:47:08 EDT 2006  stevenj@fftw.org
+  * -mt should go before -mthreads to avoid spurious warnings on HPUX (thanks to Peter O'Gorman for the bug report)
+
+    M ./m4/acx_pthread.m4 -2 +3
+
+Mon Sep 11 13:53:44 EDT 2006  stevenj@fftw.org
+  * Fortran init_threads wrapper didn't return result; thanks to Markus Wetzstein for the bug report
+
+    M ./threads/f77funcs.h -2 +2
+
+Thu Sep  7 18:43:55 EDT 2006  stevenj@fftw.org
+  * make sure wrappers are included even if Fortran compiler was not detected (unless --disable-fortran was specified explicitly) ... this was supposed to be done before, but the definition was in the wrong place, grr
+
+    M ./api/f77api.c -21 +21
+
+Thu Aug 31 19:33:29 EDT 2006  stevenj@fftw.org
+  * I'm sick of answering this question about non-deterministic results
+
+    M ./doc/FAQ/fftw-faq.bfnn +13
+
+Tue Aug 22 21:27:29 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * Add --tag=CC flag to libtool.
+  
+  This change is consistent with the libtool invocation in the latest
+  automake, and is required to compile with (some version of) xlc.
+  
+  
+
+    M ./support/Makefile.codelets -2 +3
+
+Mon Aug 21 21:40:36 EDT 2006  athena@fftw.org
+  * avoid ``fma'' because it is defined in c99.
+
+    M ./tests/bench.c -3 +3
+
+Sun Aug 20 11:40:53 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * Obey -standalone flag.
+
+    M ./genfft/gen_twiddle.ml -2 +4
+
+Sat Aug 19 13:34:27 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * obey -standalone when generating simd codelets
+
+    M ./genfft/gen_notw_c.ml -2 +1
+    M ./genfft/gen_twiddle_c.ml -2 +3
+
+Sat Aug 19 13:33:43 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * removed obsolete athfft
+
+    M ./genfft/Makefile.am -10 +9
+    R ./genfft/gen_athnotw.ml
+    R ./genfft/gen_athtw.ml
+
+Thu Aug 17 21:50:50 EDT 2006  stevenj@fftw.org
+  * updated citation to Proc. IEEE paper
+
+    M ./doc/fftw3.texi -8 +5
+
+Thu Aug 17 21:47:05 EDT 2006  stevenj@fftw.org
+  * use darcs changes --summary to make nice changelog; emacs fill-region hack is obsolete
+
+    M ./mkdist.sh -3 +1
+
+Mon Aug 14 17:53:19 EDT 2006  athena@fftw.org
+  * removed timer calibration
+  Timer calibration seems not to work any longer on recent processors---
+  too much noise.  I have remove it completely.
+
+    M ./libbench2/Makefile.am -1 +1
+    M ./libbench2/bench.h -2
+    M ./libbench2/timer.c -70 +3
+    R ./libbench2/timer2.c
+
+Mon Aug 14 10:47:15 EDT 2006  Matteo Frigo <athena@fftw.org>
+  * removed k7
+  Removed obsolete k7 support.
+
+    M ./Makefile.am -10 +2
+    M ./NEWS +3
+    M ./TODO -5
+    M ./api/version.c -3
+    M ./bootstrap.sh -2
+    M ./commercialize.sh -8 +1
+    M ./configure.ac -13
+    M ./dft/Makefile.am -1 +1
+    M ./dft/codelet-dft.h -4
+    M ./dft/conf.c -3
+    R ./dft/k7/Makefile.am
+    R ./dft/k7/codelets/Makefile.am
+    R ./dft/k7/codelets/
+    R ./dft/k7/k7.c
+    R ./dft/k7/
+    M ./doc/fftw3.texi -31 +22
+    R ./genfft-k7/Makefile.am
+    R ./genfft-k7/algsimp.ml
+    R ./genfft-k7/algsimp.mli
+    R ./genfft-k7/assignmentsToVfpinstrs.ml
+    R ./genfft-k7/assignmentsToVfpinstrs.mli
+    R ./genfft-k7/assoctable.ml
+    R ./genfft-k7/assoctable.mli
+    R ./genfft-k7/balanceVfpinstrs.ml
+    R ./genfft-k7/balanceVfpinstrs.mli
+    R ./genfft-k7/complex.ml
+    R ./genfft-k7/complex.mli
+    R ./genfft-k7/expr.ml
+    R ./genfft-k7/expr.mli
+    R ./genfft-k7/fft.ml
+    R ./genfft-k7/fft.mli
+    R ./genfft-k7/genUtil.ml
+    R ./genfft-k7/gen_notw.ml
+    R ./genfft-k7/gen_twiddle.ml
+    R ./genfft-k7/id.ml
+    R ./genfft-k7/id.mli
+    R ./genfft-k7/k7Basics.ml
+    R ./genfft-k7/k7Basics.mli
+    R ./genfft-k7/k7ExecutionModel.ml
+    R ./genfft-k7/k7ExecutionModel.mli
+    R ./genfft-k7/k7FlatInstructionScheduling.ml
+    R ./genfft-k7/k7FlatInstructionScheduling.mli
+    R ./genfft-k7/k7InstructionSchedulingBasics.ml
+    R ./genfft-k7/k7InstructionSchedulingBasics.mli
+    R ./genfft-k7/k7RegisterAllocationBasics.ml
+    R ./genfft-k7/k7RegisterAllocationBasics.mli
+    R ./genfft-k7/k7RegisterAllocator.ml
+    R ./genfft-k7/k7RegisterAllocator.mli
+    R ./genfft-k7/k7RegisterAllocatorEATranslation.ml
+    R ./genfft-k7/k7RegisterAllocatorEATranslation.mli
+    R ./genfft-k7/k7RegisterAllocatorInit.ml
+    R ./genfft-k7/k7RegisterAllocatorInit.mli
+    R ./genfft-k7/k7RegisterReallocation.ml
+    R ./genfft-k7/k7RegisterReallocation.mli
+    R ./genfft-k7/k7Translate.ml
+    R ./genfft-k7/k7Translate.mli
+    R ./genfft-k7/k7Unparsing.ml
+    R ./genfft-k7/k7Unparsing.mli
+    R ./genfft-k7/k7Vectorization.ml
+    R ./genfft-k7/k7Vectorization.mli
+    R ./genfft-k7/littlesimp.ml
+    R ./genfft-k7/littlesimp.mli
+    R ./genfft-k7/magic.ml
+    R ./genfft-k7/memoMonad.ml
+    R ./genfft-k7/memoMonad.mli
+    R ./genfft-k7/monads.ml
+    R ./genfft-k7/nonDetMonad.ml
+    R ./genfft-k7/nonDetMonad.mli
+    R ./genfft-k7/nullVectorization.ml
+    R ./genfft-k7/nullVectorization.mli
+    R ./genfft-k7/number.ml
+    R ./genfft-k7/number.mli
+    R ./genfft-k7/oracle.ml
+    R ./genfft-k7/oracle.mli
+    R ./genfft-k7/stateMonad.ml
+    R ./genfft-k7/stateMonad.mli
+    R ./genfft-k7/to_alist.ml
+    R ./genfft-k7/to_alist.mli
+    R ./genfft-k7/twiddle.ml
+    R ./genfft-k7/twiddle.mli
+    R ./genfft-k7/util.ml
+    R ./genfft-k7/util.mli
+    R ./genfft-k7/vAnnotatedScheduler.ml
+    R ./genfft-k7/vAnnotatedScheduler.mli
+    R ./genfft-k7/vDag.ml
+    R ./genfft-k7/vDag.mli
+    R ./genfft-k7/vFpBasics.ml
+    R ./genfft-k7/vFpBasics.mli
+    R ./genfft-k7/vFpUnparsing.ml
+    R ./genfft-k7/vFpUnparsing.mli
+    R ./genfft-k7/vImproveSchedule.ml
+    R ./genfft-k7/vImproveSchedule.mli
+    R ./genfft-k7/vK7Optimization.ml
+    R ./genfft-k7/vK7Optimization.mli
+    R ./genfft-k7/vScheduler.ml
+    R ./genfft-k7/vScheduler.mli
+    R ./genfft-k7/vSimdBasics.ml
+    R ./genfft-k7/vSimdBasics.mli
+    R ./genfft-k7/vSimdIndexing.ml
+    R ./genfft-k7/vSimdIndexing.mli
+    R ./genfft-k7/vSimdUnparsing.ml
+    R ./genfft-k7/vSimdUnparsing.mli
+    R ./genfft-k7/variable.ml
+    R ./genfft-k7/variable.mli
+    R ./genfft-k7/
+    M ./kernel/align.c -2
+    M ./kernel/ifftw.h -5 +1
+    M ./mkdist.sh -1 +1
+    M ./support/Makefile.am -1 +1
+    M ./support/Makefile.codelets -6 +1
+    R ./support/codelet_asmprelude
+
+Sun Aug 13 11:02:11 EDT 2006  athena@fftw.org
+  * Use darcs instead of cvs.
+
+    M ./mkdist.sh -5 +2
+
+Wed Jul 19 08:52:15 EDT 2006  athena
+  * [project @ 2006-07-19 12:52:15 by athena]
+  Treat a the string "-" as a nonoption.
+
+    M ./libbench2/my-getopt.c -1 +4
+
+Tue Jul  4 17:10:47 EDT 2006  stevenj
+  * [project @ 2006-07-04 21:10:47 by stevenj]
+  comment out pkginclude dir for now
+
+    M ./Makefile.am -2 +2
+    M ./api/Makefile.am -2 +2
+    M ./dft/Makefile.am -2 +2
+    M ./kernel/Makefile.am -2 +2
+    M ./rdft/Makefile.am -2 +2
+    M ./reodft/Makefile.am -2 +2
+    M ./threads/Makefile.am -2 +2
+
+Mon Jul  3 20:51:08 EDT 2006  stevenj
+  * [project @ 2006-07-04 00:51:08 by stevenj]
+  make sure CCAS = CC to avoid libtool confusion
+
+    M ./configure.ac -1 +1
+
+Fri Jun 23 04:07:31 EDT 2006  stevenj
+  * [project @ 2006-06-23 08:07:31 by stevenj]
+  install x77.h guru.h guru64.h in pkgincludedir
+
+    M ./api/Makefile.am -1 +1
+
+Fri Jun 23 04:03:42 EDT 2006  stevenj
+  * [project @ 2006-06-23 08:03:42 by stevenj]
+  whitespace
+
+    M ./configure.ac -3
+
+Fri Jun 23 02:33:45 EDT 2006  stevenj
+  * [project @ 2006-06-23 06:33:45 by stevenj]
+  support cycle counter with xlc on Linux/ppc
+
+    M ./kernel/cycle.h -2 +2
+
+Tue Jun 20 08:16:08 EDT 2006  athena
+  * [project @ 2006-06-20 12:16:08 by athena]
+  Stylistic change.
+
+    M ./tools/fftw-wisdom.c -1 +1
+
+Tue Jun 20 02:20:34 EDT 2006  stevenj
+  * [project @ 2006-06-20 06:20:34 by stevenj]
+  bump date
+
+    M ./m4/ax_cc_maxopt.m4 -1 +1
+
+Tue Jun 20 02:20:06 EDT 2006  stevenj
+  * [project @ 2006-06-20 06:20:06 by stevenj]
+  correct bug reported by Andrew Salamon ... --enable-portable-binary was
+  ignored (or rather, treated unpredictably) due to typo, grrr
+
+    M ./m4/ax_cc_maxopt.m4 -1 +1
+
+Thu Jun  1 20:30:06 EDT 2006  stevenj
+  * [project @ 2006-06-02 00:30:06 by stevenj]
+  install 'internal' header files into includedir/fftw3/, includedir/fftw3f/, etcetera....this will make it easier to write external libraries that plug into FFTW internals, e.g. to add new solvers
+
+    M ./Makefile.am +3
+    M ./api/Makefile.am +3
+    M ./dft/Makefile.am +3
+    M ./kernel/Makefile.am +3
+    M ./rdft/Makefile.am +3
+    M ./reodft/Makefile.am +3
+    M ./threads/Makefile.am +3
+
+Mon May 29 23:59:19 EDT 2006  stevenj
+  * [project @ 2006-05-30 03:59:19 by stevenj]
+  bug fix, thanks to James Donald for the bug report (only affects experimental semaphore stuff)
+
+    M ./threads/threads.c +1
+
+Mon May 29 23:58:16 EDT 2006  stevenj
+  * [project @ 2006-05-30 03:58:16 by stevenj]
+  comment
+
+    M ./NEWS -3 +4
+
+Mon May 29 21:02:50 EDT 2006  stevenj
+  * [project @ 2006-05-30 01:02:50 by stevenj]
+  whoops
+
+    M ./m4/acx_pthread.m4 -1 +3
+
+Sat May 27 19:36:15 EDT 2006  stevenj
+  * [project @ 2006-05-27 23:36:15 by stevenj]
+  version bump
+
+    M ./m4/acx_pthread.m4 -1 +1
+
+Sat May 27 14:54:47 EDT 2006  stevenj
+  * [project @ 2006-05-27 18:54:47 by stevenj]
+  only check for xlc_r/cc_r if we are not using gcc
+
+    M ./m4/acx_pthread.m4 -2 +3
+
+Fri May 26 15:00:38 EDT 2006  stevenj
+  * [project @ 2006-05-26 19:00:38 by stevenj]
+  use ptrdiff_t (it's C89 and standard C++, hooray)
+
+    M ./api/fftw3.h -10 +5
+
+Fri May 26 12:59:33 EDT 2006  stevenj
+  * [project @ 2006-05-26 16:59:33 by stevenj]
+  version bump
+
+    M ./configure.ac -2 +2
+
+Fri May 26 12:57:32 EDT 2006  stevenj
+  * [project @ 2006-05-26 16:57:32 by stevenj]
+  noted 64-bit guru API
+
+    M ./NEWS +7
+
+Fri May 26 12:53:09 EDT 2006  stevenj
+  * [project @ 2006-05-26 16:53:09 by stevenj]
+  note that newer versions of VC++ support long long
+
+    M ./api/fftw3.h -2 +3
+
+Fri May 26 12:46:09 EDT 2006  stevenj
+  * [project @ 2006-05-26 16:46:09 by stevenj]
+  try harder to get a portable 64-bit type
+
+    M ./api/fftw3.h -2 +6
+
+Thu May 25 22:04:18 EDT 2006  stevenj
+  * [project @ 2006-05-26 02:04:18 by stevenj]
+  added draft guru64 API
+
+    A ./api/guru.h
+    A ./api/guru64.h
+    A ./api/mktensor-iodims.h
+    A ./api/mktensor-iodims64.c
+    A ./api/plan-guru-dft-c2r.h
+    A ./api/plan-guru-dft-r2c.h
+    A ./api/plan-guru-dft.h
+    A ./api/plan-guru-r2r.h
+    A ./api/plan-guru-split-dft-c2r.h
+    A ./api/plan-guru-split-dft-r2c.h
+    A ./api/plan-guru-split-dft.h
+    A ./api/plan-guru64-dft-c2r.c
+    A ./api/plan-guru64-dft-r2c.c
+    A ./api/plan-guru64-dft.c
+    A ./api/plan-guru64-r2r.c
+    A ./api/plan-guru64-split-dft-c2r.c
+    A ./api/plan-guru64-split-dft-r2c.c
+    A ./api/plan-guru64-split-dft.c
+    M ./api/Makefile.am -1 +7
+    M ./api/api.h +3
+    M ./api/fftw3.h -2 +56
+    M ./api/guru.h +4
+    M ./api/guru64.h +4
+    M ./api/mktensor-iodims.c -62 +2
+    M ./api/mktensor-iodims.h +62
+    M ./api/mktensor-iodims64.c +2
+    M ./api/plan-guru-dft-c2r.c -44 +2
+    M ./api/plan-guru-dft-c2r.h +44
+    M ./api/plan-guru-dft-r2c.c -43 +2
+    M ./api/plan-guru-dft-r2c.h +43
+    M ./api/plan-guru-dft.c -44 +2
+    M ./api/plan-guru-dft.h +44
+    M ./api/plan-guru-r2r.c -47 +2
+    M ./api/plan-guru-r2r.h +47
+    M ./api/plan-guru-split-dft-c2r.c -40 +2
+    M ./api/plan-guru-split-dft-c2r.h +40
+    M ./api/plan-guru-split-dft-r2c.c -39 +2
+    M ./api/plan-guru-split-dft-r2c.h +39
+    M ./api/plan-guru-split-dft.c -39 +2
+    M ./api/plan-guru-split-dft.h +39
+    M ./api/plan-guru64-dft-c2r.c +2
+    M ./api/plan-guru64-dft-r2c.c +2
+    M ./api/plan-guru64-dft.c +2
+    M ./api/plan-guru64-r2r.c +2
+    M ./api/plan-guru64-split-dft-c2r.c +2
+    M ./api/plan-guru64-split-dft-r2c.c +2
+    M ./api/plan-guru64-split-dft.c +2
+
+Mon May 22 16:41:44 EDT 2006  stevenj
+  * [project @ 2006-05-22 20:41:44 by stevenj]
+  added FIXME note
+
+    M ./m4/acx_pthread.m4 +1
+
+Mon May 22 16:40:30 EDT 2006  stevenj
+  * [project @ 2006-05-22 20:40:30 by stevenj]
+  check for xlc_r in addition to cc_r; thanks to Guy Moebs for the bug report
+
+    M ./m4/acx_pthread.m4 -3 +3
+
+Fri Apr 21 12:35:25 EDT 2006  stevenj
+  * [project @ 2006-04-21 16:35:25 by stevenj]
+  added note about gcc 4.0.1 on MacOS/Intel
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +4
+
+Thu Apr 20 23:08:42 EDT 2006  stevenj
+  * [project @ 2006-04-21 03:08:42 by stevenj]
+  added code for Core Duo; thanks to Eric Branlund
+
+    M ./m4/ax_gcc_archflag.m4 -2 +3
+
+Thu Apr 20 20:21:03 EDT 2006  stevenj
+  * [project @ 2006-04-21 00:21:03 by stevenj]
+  fixed failure for -fPIC or for gcc-4 on Apple Intel machines; thanks to
+  Eric Branlund for the bug report
+
+    M ./m4/ax_gcc_x86_cpuid.m4 -4 +12
+
+Tue Apr 11 20:00:31 EDT 2006  athena
+  * [project @ 2006-04-12 00:00:31 by athena]
+  Use -maltivec when checking for altivec.h.
+
+    M ./configure.ac -1 +15
+
+Mon Apr  3 15:52:44 EDT 2006  stevenj
+  * [project @ 2006-04-03 19:52:44 by stevenj]
+  note planner overwriting input in planner-flags reference
+
+    M ./doc/fftw3.texi -1 +6
+
+Tue Mar 28 09:05:26 EST 2006  athena
+  * [project @ 2006-03-28 14:05:26 by athena]
+  FAQ entry about --enable-k7 in 64-bit mode.
+
+    M ./doc/FAQ/fftw-faq.bfnn +11
+
+Mon Mar 27 23:41:05 EST 2006  stevenj
+  * [project @ 2006-03-28 04:41:05 by stevenj]
+  sprintf -> snprintf, to avoid (harmless) complaints by users/compilers
+
+    M ./configure.ac -1 +1
+    M ./libbench2/report.c -12 +19
+    M ./tools/fftw-wisdom.c +4
+
+Mon Mar 27 23:30:22 EST 2006  stevenj
+  * [project @ 2006-03-28 04:30:22 by stevenj]
+  silence compiler warning
+
+    M ./kernel/align.c -5 +6
+
+Fri Mar 17 09:20:10 EST 2006  athena
+  * [project @ 2006-03-17 14:20:10 by athena]
+  Remove dft/codelets/inplace, add simd/nonportable to list of
+  directories to be compiled on non-unix systems.
+
+    M ./doc/fftw3.texi -13 +14
+
+Sat Aug 12 23:52:49 EDT 2006  Unknown tagger
+  tagged fftw-3-1-1
+
+
+Sat Mar  4 16:17:56 EST 2006  stevenj
+  * [project @ 2006-03-04 21:17:56 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar  4 16:13:08 EST 2006  stevenj
+  * [project @ 2006-03-04 21:13:08 by stevenj]
+  note that we align the stack ourselves if necessary, with gcc and icc
+
+    M ./doc/fftw3.texi -8 +10
+
+Sat Mar  4 16:08:16 EST 2006  stevenj
+  * [project @ 2006-03-04 21:08:16 by stevenj]
+  clearer distinction between static and automatic storage in C
+
+    M ./doc/fftw3.texi -29 +30
+
+Sat Feb 25 20:27:01 EST 2006  stevenj
+  * [project @ 2006-02-26 01:27:01 by stevenj]
+  rm unused var
+
+    M ./libbench2/verify-lib.c -2 +1
+
+Sat Feb 25 17:30:28 EST 2006  athena
+  * [project @ 2006-02-25 22:30:28 by athena]
+  Improved usage of goto (Dijkstra miserere nostri)
+
+    M ./libbench2/my-getopt.c -18 +17
+
+Sat Feb 25 14:19:15 EST 2006  stevenj
+  * [project @ 2006-02-25 19:19:15 by stevenj]
+  boilerplate
+
+    M ./libbench2/my-getopt.h +13
+
+Sat Feb 25 14:14:40 EST 2006  stevenj
+  * [project @ 2006-02-25 19:14:40 by stevenj]
+  update for upcoming 3.1.1
+
+    M ./NEWS +17
+
+Sat Feb 25 13:57:34 EST 2006  stevenj
+  * [project @ 2006-02-25 18:57:34 by stevenj]
+  replace obsolete IMPATIENT with MEASURE
+
+    M ./tools/fftw-wisdom.c -1 +3
+    M ./tools/fftw_wisdom.1.in -2 +2
+
+Sat Feb 25 13:52:25 EST 2006  stevenj
+  * [project @ 2006-02-25 18:52:25 by stevenj]
+  corrected comment
+
+    M ./tools/fftw-wisdom.c -1 +1
+
+Sat Feb 25 10:19:26 EST 2006  athena
+  * [project @ 2006-02-25 15:19:26 by athena]
+  -v does not take an argument.
+
+    M ./tools/fftw-wisdom.c -6 +3
+
+Sat Feb 25 10:17:18 EST 2006  athena
+  * [project @ 2006-02-25 15:17:18 by athena]
+  Obey the unix convention that -ab = -a -b
+
+    M ./libbench2/my-getopt.c -15 +24
+
+Fri Feb 24 23:13:49 EST 2006  stevenj
+  * [project @ 2006-02-25 04:13:49 by stevenj]
+  minor fixes (return error on unrecognized option)
+
+    M ./libbench2/bench-main.c -2 +3
+    M ./libbench2/my-getopt.c +5
+    M ./tools/fftw-wisdom.c -3 +4
+
+Fri Feb 24 22:46:12 EST 2006  stevenj
+  * [project @ 2006-02-25 03:46:12 by stevenj]
+  ugh
+
+    M ./tools/fftw-wisdom.c -34 +30
+
+Fri Feb 24 21:42:56 EST 2006  athena
+  * [project @ 2006-02-25 02:42:56 by athena]
+  require exact match for long options.
+
+    M ./libbench2/my-getopt.c -1 +2
+
+Fri Feb 24 21:38:02 EST 2006  athena
+  * [project @ 2006-02-25 02:38:02 by athena]
+  better fix
+
+    M ./libbench2/my-getopt.c -1 +2
+
+Fri Feb 24 21:37:06 EST 2006  athena
+  * [project @ 2006-02-25 02:37:06 by athena]
+  Fix
+
+    M ./libbench2/my-getopt.c -1 +1
+
+Fri Feb 24 21:25:48 EST 2006  athena
+  * [project @ 2006-02-25 02:25:48 by athena]
+  nothing
+
+    A ./libbench2/my-getopt.c
+    A ./libbench2/my-getopt.h
+    M ./libbench2/Makefile.am -5 +4
+    M ./libbench2/bench-main.c -53 +48
+    M ./libbench2/bench.h -5 +1
+    R ./libbench2/getopt-utils.c
+    R ./libbench2/getopt.c
+    R ./libbench2/getopt.h
+    R ./libbench2/getopt1.c
+    M ./libbench2/my-getopt.c +153
+    M ./libbench2/my-getopt.h +33
+
+Mon Feb 20 17:37:21 EST 2006  stevenj
+  * [project @ 2006-02-20 22:37:21 by stevenj]
+  rm transpose-indirect-inplace solver, which was buggy
+
+    M ./dft/indirect-transpose.c -54 +11
+
+Wed Feb 15 08:43:05 EST 2006  athena
+  * [project @ 2006-02-15 13:43:05 by athena]
+  Comment fix.
+
+    M ./kernel/cycle.h -2 +2
+
+Wed Feb 15 08:18:41 EST 2006  athena
+  * [project @ 2006-02-15 13:18:41 by athena]
+  Cycle counter for Visual C++ x86-64, courtesy of Dirk Michaelis
+
+    M ./kernel/cycle.h -4 +5
+
+Tue Feb 14 19:17:30 EST 2006  stevenj
+  * [project @ 2006-02-15 00:17:30 by stevenj]
+  rfftwnd.png is in builddir
+
+    M ./doc/Makefile.am -1 +1
+
+Tue Feb 14 19:03:27 EST 2006  stevenj
+  * [project @ 2006-02-15 00:03:27 by stevenj]
+  fixed typo: --enable-portable-binary, not --with
+
+    M ./doc/fftw3.texi -3 +3
+
+Mon Feb 13 07:59:06 EST 2006  athena
+  * [project @ 2006-02-13 12:59:06 by athena]
+  estimator tweaks.
+
+    M ./dft/dftw-direct.c -2 +3
+    M ./rdft/hc2hc-direct.c -2 +2
+
+Sun Feb 12 20:43:39 EST 2006  athena
+  * [project @ 2006-02-13 01:43:39 by athena]
+  sse/sse2 support for t3?v codelets
+
+    M ./simd/simd-sse.h +22
+    M ./simd/simd-sse2.h -8 +20
+
+Sun Feb 12 20:39:22 EST 2006  athena
+  * [project @ 2006-02-13 01:39:22 by athena]
+  Use CEXP instead of SIN/COS.
+
+    M ./simd/simd-altivec.h -1 +1
+
+Sun Feb 12 20:12:10 EST 2006  athena
+  * [project @ 2006-02-13 01:12:10 by athena]
+  bug in randomized cse eliminator.
+
+    M ./genfft/oracle.ml -3 +5
+
+Sun Feb 12 18:34:12 EST 2006  athena
+  * [project @ 2006-02-12 23:34:12 by athena]
+  Added support for t2-style simd codelets.  This is altivec only for
+  now; sse/sse2 don't even compile yet.
+
+    A ./dft/simd/t3b.h
+    A ./dft/simd/t3f.h
+    M ./dft/simd/Makefile.am -2 +2
+    M ./dft/simd/codelets/Makefile.am -2 +11
+    M ./dft/simd/t3b.h +34
+    M ./dft/simd/t3f.h +34
+    M ./genfft/algsimp.ml -4 +26
+    M ./genfft/annotate.ml -5 +3
+    M ./genfft/c.ml -10 +9
+    M ./genfft/c.mli -2 +2
+    M ./genfft/complex.ml -93 +6
+    M ./genfft/complex.mli -5 +3
+    M ./genfft/expr.ml -6 +15
+    M ./genfft/expr.mli -2 +4
+    M ./genfft/gen_athtw.ml -3 +3
+    M ./genfft/gen_conv.ml -5 +3
+    M ./genfft/gen_hc2hc.ml -4 +3
+    M ./genfft/gen_hc2r.ml -5 +3
+    M ./genfft/gen_mdct.ml -7 +3
+    M ./genfft/gen_notw.ml -5 +2
+    M ./genfft/gen_notw_c.ml -5 +2
+    M ./genfft/gen_r2hc.ml -4 +3
+    M ./genfft/gen_r2r.ml -4 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twiddle_c.ml -10 +4
+    M ./genfft/gen_twidsq.ml -7 +3
+    M ./genfft/gen_twidsq_c.ml -15 +5
+    M ./genfft/magic.ml -4 +4
+    M ./genfft/oracle.ml -1 +5
+    M ./genfft/schedule.ml -3 +3
+    M ./genfft/simd.ml -5 +11
+    M ./genfft/to_alist.ml -4 +16
+    M ./genfft/trig.ml -3 +2
+    M ./genfft/twiddle.ml -265 +35
+    M ./genfft/twiddle.mli -4 +4
+    M ./simd/simd-altivec.h +28
+
+Sun Feb 12 15:30:27 EST 2006  athena
+  * [project @ 2006-02-12 20:30:27 by athena]
+  Added support for t2-style simd split-complex codelets.
+
+    A ./dft/simd/ts.c
+    A ./dft/simd/ts.h
+    M ./dft/simd/Makefile.am -2 +2
+    M ./dft/simd/codelets/Makefile.am -2 +7
+    R ./dft/simd/t1s.c
+    R ./dft/simd/t1s.h
+    M ./dft/simd/ts.c +50
+    M ./dft/simd/ts.h +33
+    M ./genfft/twiddle.ml -5 +3
+
+Fri Feb 10 18:21:28 EST 2006  stevenj
+  * [project @ 2006-02-10 23:21:28 by stevenj]
+
+    M ./m4/ax_openmp.m4 -1 +1
+
+Fri Feb 10 18:19:46 EST 2006  stevenj
+  * [project @ 2006-02-10 23:19:46 by stevenj]
+  punctuation
+
+    M ./m4/ax_openmp.m4 -2 +2
+
+Fri Feb 10 18:00:35 EST 2006  stevenj
+  * [project @ 2006-02-10 23:00:35 by stevenj]
+  windows DLL stuff for Fortran interface
+
+    M ./api/f77api.c +7
+    M ./api/f77funcs.h -43 +43
+
+Fri Feb 10 09:48:52 EST 2006  athena
+  * [project @ 2006-02-10 14:48:52 by athena]
+  Bumped version to 3.1.1
+
+    M ./configure.ac -2 +2
+
+Fri Feb 10 09:18:39 EST 2006  athena
+  * [project @ 2006-02-10 14:18:39 by athena]
+  Precompute array indices on x86-64.  Speeds up Pentium IV and makes no
+  appreciable difference on AMD.
+
+    M ./kernel/ifftw.h -2 +2
+
+Tue Feb  7 22:01:36 EST 2006  athena
+  * [project @ 2006-02-08 03:01:36 by athena]
+  Check whether the processor supports CPUID before issuing the
+  instruction. (Grrr...) Code contributed by Eric J. Korpela.
+
+    A ./simd/x86-cpuid.h
+    M ./simd/Makefile.am -1 +1
+    M ./simd/sse.c -24 +6
+    M ./simd/sse2.c -26 +8
+    M ./simd/x86-cpuid.h +127
+
+Tue Feb  7 21:36:47 EST 2006  athena
+  * [project @ 2006-02-08 02:36:47 by athena]
+  icc supports x86_64 these days.
+
+    M ./kernel/cycle.h -2 +2
+
+Sun Feb  5 18:19:55 EST 2006  athena
+  * [project @ 2006-02-05 23:19:55 by athena]
+  Paranoia.
+
+    M ./kernel/primes.c -2 +2
+
+Mon Jan 30 15:27:53 EST 2006  stevenj
+  * [project @ 2006-01-30 20:27:53 by stevenj]
+  whoops, fixed assert (y <= x)
+
+    M ./kernel/primes.c -2 +2
+
+Mon Jan 30 15:26:22 EST 2006  stevenj
+  * [project @ 2006-01-30 20:26:22 by stevenj]
+  note that safe_mulmod requires {x,y} < p (or at least < 2p), and added
+  assert
+
+    M ./kernel/primes.c -2 +4
+
+Mon Jan 30 11:09:32 EST 2006  athena
+  * [project @ 2006-01-30 16:09:32 by athena]
+  fixed aix/xlc lossage
+
+    M ./libbench2/bench-user.h -2 +2
+    M ./libbench2/timer.c -4 +4
+
+Sun Jan 29 20:42:51 EST 2006  athena
+  * [project @ 2006-01-30 01:42:51 by athena]
+  In the impuse test, normalize the impulse so that the impulse and the
+  random vectors have roughly the same L2 norm.  This change reduces the
+  number of bits that we lose because of floating-point cancellation, so
+  that we can focus on the bits that we lose because of bugs.
+
+    M ./libbench2/verify-lib.c -3 +3
+
+Sun Jan 29 20:37:47 EST 2006  athena
+  * [project @ 2006-01-30 01:37:47 by athena]
+  Compute omega in trigreal precision, as opposed to R.
+
+    M ./rdft/dht-rader.c -2 +2
+
+Sat Aug 12 23:51:14 EDT 2006  Unknown tagger
+  tagged fftw-3-1
+
+
+Fri Jan 27 19:16:22 EST 2006  stevenj
+  * [project @ 2006-01-28 00:16:22 by stevenj]
+  add --with-combined-threads option as workaround to Windows inability to build shared libs with dependencies
+
+    M ./Makefile.am -1 +7
+    M ./configure.ac -1 +3
+    M ./tests/Makefile.am +2
+    M ./threads/Makefile.am +4
+    M ./tools/Makefile.am +2
+
+Fri Jan 27 17:20:45 EST 2006  stevenj
+  * [project @ 2006-01-27 22:20:45 by stevenj]
+  libfftw3_threads should *not* used -no-undefined because, in fact, it is not true -- this library depends on -lfftw3, and is not self-contained
+
+    M ./threads/Makefile.am -1 +1
+
+Thu Jan 26 22:04:34 EST 2006  stevenj
+  * [project @ 2006-01-27 03:04:34 by stevenj]
+  updated
+
+    M ./NEWS -3 +10
+
+Thu Jan 26 21:10:50 EST 2006  athena
+  * [project @ 2006-01-27 02:10:50 by athena]
+  Added paranoid stack alignment when awaking plans.  While I was at it,
+  removed obsolete, redundant AWAKE macro.
+
+    M ./api/apiplan.c -3 +8
+    M ./dft/bluestein.c -1 +1
+    M ./dft/buffered.c -4 +4
+    M ./dft/ct.c -3 +3
+    M ./dft/ctsq.c -2 +2
+    M ./dft/dftw-generic.c -1 +1
+    M ./dft/dftw-genericbuf.c -1 +1
+    M ./dft/indirect-transpose.c -3 +3
+    M ./dft/indirect.c -3 +3
+    M ./dft/rader.c -3 +3
+    M ./dft/rank-geq2.c -3 +3
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/ifftw.h -2 +1
+    M ./kernel/timer.c -3 +3
+    M ./rdft/buffered.c -4 +4
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -3 +3
+    M ./rdft/hc2hc-direct.c -3 +3
+    M ./rdft/hc2hc-directbuf.c -3 +3
+    M ./rdft/hc2hc-generic.c -2 +2
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/indirect.c -3 +3
+    M ./rdft/rank-geq2-rdft2.c -3 +3
+    M ./rdft/rank-geq2.c -3 +3
+    M ./rdft/rank0-rdft2.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -4 +4
+    M ./reodft/redft00e-r2hc-pad.c -3 +3
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft00e-splitradix.c -3 +3
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -3 +3
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./tests/hook.c -2 +2
+    M ./threads/ct.c -3 +3
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+
+Thu Jan 26 20:54:39 EST 2006  athena
+  * [project @ 2006-01-27 01:54:39 by athena]
+  Updated for 3.1.
+
+    M ./NEWS -1 +1
+
+Thu Jan 26 19:15:12 EST 2006  athena
+  * [project @ 2006-01-27 00:15:12 by athena]
+  ditched one alignment check and noted that we should eliminate the rest as well
+
+    M ./TODO +3
+    M ./libbench2/bench-main.c -13 +1
+
+Wed Jan 25 23:02:19 EST 2006  athena
+  * [project @ 2006-01-26 04:02:19 by athena]
+  alignment hack
+
+    M ./libbench2/bench-main.c -2 +4
+
+Wed Jan 25 22:05:11 EST 2006  athena
+  * [project @ 2006-01-26 03:05:11 by athena]
+  detect pentium M
+
+    M ./m4/ax_gcc_archflag.m4 -1 +2
+
+Wed Jan 25 18:42:58 EST 2006  stevenj
+  * [project @ 2006-01-25 23:42:58 by stevenj]
+  don't trust host_cpu if it claims we are on i386/i486, and call cpuid anyway (if it fails we use no arch flag).  This is needed on FreeBSD
+
+    M ./m4/ax_gcc_archflag.m4 -3 +1
+
+Wed Jan 25 18:00:04 EST 2006  stevenj
+  * [project @ 2006-01-25 23:00:04 by stevenj]
+  suggest --with-our-malloc16 in error message
+
+    M ./kernel/kalloc.c -2 +2
+
+Tue Jan 24 19:53:34 EST 2006  stevenj
+  * [project @ 2006-01-25 00:53:34 by stevenj]
+  ditto for -no-gcc
+
+    M ./configure.ac -2 +4
+
+Tue Jan 24 19:51:08 EST 2006  stevenj
+  * [project @ 2006-01-25 00:51:08 by stevenj]
+  flags required for successfull compilation should be added even if the
+  user overrides CFLAGS
+
+    M ./configure.ac -9 +12
+
+Tue Jan 24 18:43:59 EST 2006  stevenj
+  * [project @ 2006-01-24 23:43:59 by stevenj]
+  upcoming gcc OpenMP support uses -fopenmp
+
+    M ./m4/ax_openmp.m4 -4 +4
+
+Tue Jan 24 18:26:59 EST 2006  stevenj
+  * [project @ 2006-01-24 23:26:59 by stevenj]
+  note that PGI uses -mp as well
+
+    M ./m4/ax_openmp.m4 -1 +1
+
+Mon Jan 23 15:31:24 EST 2006  athena
+  * [project @ 2006-01-23 20:31:24 by athena]
+  my best guess at how to fix the microsoft crap du jour
+
+    M ./kernel/cycle.h -2 +2
+    M ./simd/sse.c -65 +70
+    M ./simd/sse2.c -62 +67
+
+Mon Jan 23 14:05:14 EST 2006  stevenj
+  * [project @ 2006-01-23 19:05:14 by stevenj]
+  use -Masmkeyword for PGI cycle counter, grr
+
+    M ./configure.ac +5
+    M ./kernel/cycle.h -2 +4
+
+Sun Jan 22 18:09:06 EST 2006  athena
+  * [project @ 2006-01-22 23:09:06 by athena]
+  Bumped version number to 3.1.
+
+    M ./configure.ac -1 +1
+
+Sat Jan 21 10:03:59 EST 2006  athena
+  * [project @ 2006-01-21 15:03:59 by athena]
+  Report that --enable-k7 is incompatible with --enable-shared.
+
+    M ./configure.ac +3
+
+Sat Jan 21 09:17:54 EST 2006  athena
+  * [project @ 2006-01-21 14:17:54 by athena]
+  Do not use empty libraries in LIBADD, since otherwise the linker fails
+  on Solaris.
+
+    M ./Makefile.am -7 +20
+
+Wed Jan 18 10:47:59 EST 2006  stevenj
+  * [project @ 2006-01-18 15:47:59 by stevenj]
+  warn end-users away from this file
+
+    M ./bootstrap.sh -1 +10
+
+Tue Jan 17 16:16:42 EST 2006  athena
+  * [project @ 2006-01-17 21:16:42 by athena]
+  Gcc sucks.
+
+    M ./simd/simd-sse.h -1 +2
+
+Tue Jan 17 11:48:55 EST 2006  athena
+  * [project @ 2006-01-17 16:48:55 by athena]
+  Disabled checks that may turn out to be too paranoid.
+
+    M ./tests/hook.c -1 +2
+
+Tue Jan 17 10:35:03 EST 2006  athena
+  * [project @ 2006-01-17 15:35:03 by athena]
+  Some paranoid checks.
+
+    M ./tests/hook.c +16
+
+Tue Jan 17 09:31:08 EST 2006  athena
+  * [project @ 2006-01-17 14:31:08 by athena]
+  Flush stdout after printing.
+
+    M ./libbench2/ovtpvt.c +1
+
+Tue Jan 17 08:28:18 EST 2006  athena
+  * [project @ 2006-01-17 13:28:18 by athena]
+  Run the leak detector in all cases, not just when verbose > 2.
+
+    M ./kernel/alloc.c -2 +7
+    M ./tests/bench.c -2 +1
+
+Tue Jan 17 08:11:41 EST 2006  athena
+  * [project @ 2006-01-17 13:11:41 by athena]
+  Eliminate calls to pow(), rint().
+
+    M ./api/mapflags.c -8 +3
+
+Tue Jan 17 00:45:06 EST 2006  stevenj
+  * [project @ 2006-01-17 05:45:06 by stevenj]
+  put # in first column, for stylistic consistency
+
+    M ./kernel/ifftw.h -3 +3
+
+Tue Jan 17 00:17:27 EST 2006  athena
+  * [project @ 2006-01-17 05:17:27 by athena]
+  Made timeout part of impatience flags, in order to improve the
+  usability of wisdom.  Also, fixed bogus error recovery logic in
+  planner.c:imprt().
+
+    M ./api/mapflags.c -1 +36
+    M ./kernel/ifftw.h -4 +7
+    M ./kernel/planner.c -23 +70
+
+Mon Jan 16 23:03:34 EST 2006  stevenj
+  * [project @ 2006-01-17 04:03:33 by stevenj]
+  make timelimit < 0 .eq. FFTW_NO_TIMELIMIT
+
+    M ./api/apiplan.c -1 +1
+    M ./api/fftw3.h -1 +3
+    M ./doc/fftw3.texi -5 +9
+    M ./kernel/planner.c -3 +3
+
+Mon Jan 16 21:52:01 EST 2006  athena
+  * [project @ 2006-01-17 02:52:01 by athena]
+  Eliminated the FFTW_TIMELIMIT flag in favor of this simpler logic:
+  fftw_set_timelimit(0) disables time limit.
+  fftw_set_timelimit(X), X>0 sets the time limit to X.
+
+    M ./api/apiplan.c -6 +1
+    M ./api/fftw3.h -2 +1
+    M ./api/the-planner.c +7
+    M ./doc/fftw3.texi -30 +27
+    M ./kernel/planner.c -2 +4
+    M ./tests/bench.c -1
+
+Mon Jan 16 08:38:04 EST 2006  athena
+  * [project @ 2006-01-16 13:38:04 by athena]
+  Force the use of the estimator when wisdom fails because of md5
+  collisions, otherwise the planner takes forever.
+
+    M ./api/apiplan.c -2 +10
+
+Sun Jan 15 21:30:31 EST 2006  athena
+  * [project @ 2006-01-16 02:30:31 by athena]
+  Ranted about how broken gcc-4 is.
+
+    M ./kernel/ifftw.h -5 +40
+
+Sun Jan 15 19:59:38 EST 2006  stevenj
+  * [project @ 2006-01-16 00:59:38 by stevenj]
+  change fftw_timelimit global var to fftw_set_timelimit(double) function, for simpler usage with shared libraries and for consistency with e.g. set_numthreads
+
+    M ./api/apiplan.c -2 +4
+    M ./api/fftw3.h -2 +3
+    M ./doc/fftw3.texi -11 +11
+    M ./tests/bench.c -1 +1
+
+Sun Jan 15 19:32:27 EST 2006  athena
+  * [project @ 2006-01-16 00:32:27 by athena]
+  Minor tweaks.
+
+    M ./doc/fftw3.texi -8 +11
+
+Sun Jan 15 16:32:54 EST 2006  athena
+  * [project @ 2006-01-15 21:32:54 by athena]
+  tweaks to make sure that time_n() is always called from the same stack position.
+
+    M ./libbench2/timer.c -12 +12
+
+Sun Jan 15 16:09:53 EST 2006  athena
+  * [project @ 2006-01-15 21:09:53 by athena]
+  Major simplification of the timer calibration logic.  Also, use an FFT
+  as a unit of work instead of the old pointer chasing, because God
+  knows how pointer chasing interacts with the idiotic cache-hit
+  speculation on the Pentium IV.
+
+    M ./libbench2/bench.h -2 +3
+    M ./libbench2/speed.c -5 +8
+    M ./libbench2/timer.c -96 +35
+    M ./libbench2/timer2.c -7 +99
+
+Sun Jan 15 15:12:08 EST 2006  athena
+  * [project @ 2006-01-15 20:12:08 by athena]
+  Fixed broken aligment checks when sizeof(R)==12.
+
+    M ./kernel/align.c -3 +8
+
+Sun Jan 15 10:36:40 EST 2006  athena
+  * [project @ 2006-01-15 15:36:40 by athena]
+  Manual unrolling of loop.
+
+    M ./libbench2/timer2.c -5 +2
+
+Sun Jan 15 10:12:55 EST 2006  athena
+  * [project @ 2006-01-15 15:12:55 by athena]
+  Various improvements to timer calibration routines.
+
+    A ./libbench2/timer2.c
+    M ./libbench2/Makefile.am -1 +1
+    M ./libbench2/bench.h -1 +2
+    M ./libbench2/timer.c -18 +32
+    M ./libbench2/timer2.c +36
+
+Sat Jan 14 22:16:09 EST 2006  athena
+  * [project @ 2006-01-15 03:16:09 by athena]
+  cygwin defines __CYGWIN__, not __WIN32__ etc.
+
+    M ./libbench2/timer.c -16 +17
+
+Sat Jan 14 20:40:12 EST 2006  athena
+  * [project @ 2006-01-15 01:40:12 by athena]
+  fixed confusion between libbench and user timers
+
+    M ./libbench2/bench-user.h -3 +7
+    M ./libbench2/speed.c -5 +5
+    M ./libbench2/timer.c -6 +8
+    M ./tests/bench.c -8 +8
+
+Sat Jan 14 12:32:44 EST 2006  stevenj
+  * [project @ 2006-01-14 17:32:44 by stevenj]
+  update
+
+    M ./NEWS -2 +4
+
+Sat Jan 14 10:24:11 EST 2006  athena
+  * [project @ 2006-01-14 15:24:11 by athena]
+  Comment.
+
+    M ./simd/simd-sse.h +5
+
+Sat Jan 14 10:19:28 EST 2006  athena
+  * [project @ 2006-01-14 15:19:28 by athena]
+  Workaround gcc bug.
+
+    M ./simd/simd-sse.h -2 +3
+
+Fri Jan 13 19:13:18 EST 2006  athena
+  * [project @ 2006-01-14 00:13:18 by athena]
+  Switched to -beta2.
+
+    M ./configure.ac -1 +1
+
+Thu Jan 12 22:21:57 EST 2006  athena
+  * [project @ 2006-01-13 03:21:57 by athena]
+  Fixed technically correct but highly obfuscated use of the enum tag
+  R2HC as a null pointer.
+
+    M ./rdft/buffered.c -5 +3
+    M ./rdft/indirect.c -4 +3
+    M ./rdft/problem.c -1 +8
+    M ./rdft/rank0-rdft2.c -4 +3
+    M ./rdft/rdft.h +1
+    M ./rdft/vrank3-transpose.c -25 +13
+
+Thu Jan 12 19:25:20 EST 2006  stevenj
+  * [project @ 2006-01-13 00:25:20 by stevenj]
+  --enable-unsafe-mulmod is obsolete
+
+    M ./configure.ac -4
+
+Thu Jan 12 19:23:18 EST 2006  athena
+  * [project @ 2006-01-13 00:23:18 by athena]
+  More thoughts.
+
+    M ./TODO +2
+
+Thu Jan 12 19:17:57 EST 2006  athena
+  * [project @ 2006-01-13 00:17:57 by athena]
+  Removed loop unrolling because it slows things down on at least one
+  powerpc and it generates clumsy x86 code.
+
+    M ./rdft/buffered2.c -50 +12
+
+Thu Jan 12 19:17:35 EST 2006  stevenj
+  * [project @ 2006-01-13 00:17:35 by stevenj]
+  tweaks
+
+    M ./kernel/kalloc.c -6 +6
+
+Thu Jan 12 15:55:52 EST 2006  stevenj
+  * [project @ 2006-01-12 20:55:52 by stevenj]
+  MacOSX x86 ABI specifies that the stack is kept 16-byte aligned
+
+    M ./kernel/ifftw.h -2 +3
+
+Thu Jan 12 12:46:49 EST 2006  athena
+  * [project @ 2006-01-12 17:46:49 by athena]
+  ``ret'' is a reserved word in the evil empire.
+
+    M ./kernel/cycle.h -5 +5
+
+Thu Jan 12 08:31:43 EST 2006  athena
+  * [project @ 2006-01-12 13:31:09 by athena]
+  Changed ret => result because ret ``is a reserved word'' in the evil
+  empire.
+
+    M ./simd/sse.c -4 +4
+    M ./simd/sse2.c -4 +4
+
+Wed Jan 11 19:30:42 EST 2006  athena
+  * [project @ 2006-01-12 00:30:42 by athena]
+  Workaround Visual c++ lossage.
+
+    M ./simd/simd-sse2.h -5 +4
+
+Wed Jan 11 19:26:16 EST 2006  athena
+  * [project @ 2006-01-12 00:26:16 by athena]
+  Workaround visual c++ lossage.
+
+    M ./simd/simd-sse.h +38
+
+Wed Jan 11 19:10:52 EST 2006  athena
+  * [project @ 2006-01-12 00:10:52 by athena]
+  isprint() is guaranteed to work for unsigned char + EOF only.
+
+    M ./libbench2/getopt-utils.c -2 +3
+
+Wed Jan 11 13:47:49 EST 2006  stevenj
+  * [project @ 2006-01-11 18:47:49 by stevenj]
+  rm obsolete fixme
+
+    M ./rdft/vrank3-transpose.c -2 +2
+
+Wed Jan 11 13:38:46 EST 2006  stevenj
+  * [project @ 2006-01-11 18:38:46 by stevenj]
+
+    M ./rdft/vrank3-transpose.c -2 +2
+
+Wed Jan 11 13:32:26 EST 2006  stevenj
+  * [project @ 2006-01-11 18:32:26 by stevenj]
+  fix comment
+
+    M ./rdft/vrank3-transpose.c -2 +3
+
+Wed Jan 11 12:27:05 EST 2006  athena
+  * [project @ 2006-01-11 17:27:05 by athena]
+  Paranoid use of K(x) for all constants x, to avoid runtime double->float conversions on sufficiently stupid compilers.
+
+    M ./dft/bluestein.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/rank0-rdft2.c -11 +11
+    M ./reodft/rodft00e-r2hc-pad.c -3 +3
+
+Tue Jan 10 20:10:38 EST 2006  athena
+  * [project @ 2006-01-11 01:10:38 by athena]
+  Workaround to gcc nonsense.
+
+    M ./simd/simd-sse.h -1 +15
+
+Tue Jan 10 18:44:28 EST 2006  stevenj
+  * [project @ 2006-01-10 23:44:28 by stevenj]
+  bug fix: infinite loop in transpose-cut planning
+
+    M ./rdft/vrank3-transpose.c -6 +14
+
+Tue Jan 10 18:12:14 EST 2006  stevenj
+  * [project @ 2006-01-10 23:12:14 by stevenj]
+  clarified comment
+
+    M ./api/fftw3.h -6 +6
+
+Tue Jan 10 18:10:32 EST 2006  stevenj
+  * [project @ 2006-01-10 23:10:32 by stevenj]
+  more Windows decorations
+
+    M ./tests/bench.c -1 +1
+
+Tue Jan 10 17:57:45 EST 2006  stevenj
+  * [project @ 2006-01-10 22:57:45 by stevenj]
+  added FIXME comment
+
+    M ./support/Makefile.codelets +1
+
+Tue Jan 10 17:52:07 EST 2006  stevenj
+  * [project @ 2006-01-10 22:52:07 by stevenj]
+  'make clean' should not delete codlist.c since it is included in the dist tarball
+
+    M ./support/Makefile.codelets -2 +3
+
+Tue Jan 10 17:50:12 EST 2006  athena
+  * [project @ 2006-01-10 22:50:12 by athena]
+  Change threshold for ``large'' Cooley-Tukey to 256K from 64K, since it
+  seems to benefit the Pentium IV with sse and the planning cost is not
+  too horrible.
+
+    M ./dft/dftw-direct.c -2 +2
+
+Tue Jan 10 17:45:11 EST 2006  stevenj
+  * [project @ 2006-01-10 22:45:11 by stevenj]
+  more missing Windows DLL decorations
+
+    M ./kernel/ifftw.h -3 +3
+
+Tue Jan 10 17:41:28 EST 2006  stevenj
+  * [project @ 2006-01-10 22:41:28 by stevenj]
+  remove unused var
+
+    M ./rdft/dht-rader.c -2
+
+Tue Jan 10 14:00:50 EST 2006  stevenj
+  * [project @ 2006-01-10 19:00:50 by stevenj]
+  allow compiler threads, if enabled, to take precedence over explicit threads
+
+    M ./threads/threads.c -17 +17
+
+Tue Jan 10 12:30:09 EST 2006  stevenj
+  * [project @ 2006-01-10 17:30:09 by stevenj]
+
+    M ./api/api.h -2 +2
+
+Tue Jan 10 12:21:56 EST 2006  stevenj
+  * [project @ 2006-01-10 17:21:56 by stevenj]
+
+    M ./kernel/planner.c -2 +2
+
+Tue Jan 10 09:13:20 EST 2006  athena
+  * [project @ 2006-01-10 14:13:20 by athena]
+  Fixed comment typo.
+
+    M ./kernel/planner.c -2 +2
+
+Tue Jan 10 08:59:22 EST 2006  athena
+  * [project @ 2006-01-10 13:59:22 by athena]
+  Rearranged timeout checks so as to eliminate one of them.
+
+    M ./kernel/planner.c -9 +4
+
+Tue Jan 10 08:56:55 EST 2006  athena
+  * [project @ 2006-01-10 13:56:55 by athena]
+  Converted residual CK() -> A().
+
+    M ./kernel/plan.c -2 +2
+
+Tue Jan 10 08:36:13 EST 2006  athena
+  * [project @ 2006-01-10 13:36:13 by athena]
+  Maintain the invariant TIMED_OUT ==> NEED_TIMEOUT_CHECK.
+
+    M ./kernel/planner.c -5 +10
+
+Tue Jan 10 08:24:41 EST 2006  athena
+  * [project @ 2006-01-10 13:24:41 by athena]
+  silence some 64-bit warnings
+
+    M ./api/mapflags.c -1 +1
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/buffered.c -1 +1
+    M ./kernel/md5.c -1 +1
+    M ./kernel/scan.c -2 +3
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+
+Tue Jan 10 07:58:48 EST 2006  athena
+  * [project @ 2006-01-10 12:58:48 by athena]
+  Assertions.
+
+    M ./tests/hook.c +3
+
+Tue Jan 10 00:14:00 EST 2006  stevenj
+  * [project @ 2006-01-10 05:14:00 by stevenj]
+  some condensing
+
+    M ./kernel/timer.c -10 +5
+
+Tue Jan 10 00:03:32 EST 2006  stevenj
+  * [project @ 2006-01-10 05:03:32 by stevenj]
+  eliminate X(seconds) in favor of X(elapsed_since), in paranoia of clock wrap
+
+    M ./api/apiplan.c -2 +2
+    M ./kernel/ifftw.h -3 +33
+    M ./kernel/planner.c -4 +4
+    M ./kernel/timer.c -58 +25
+
+Mon Jan  9 23:21:21 EST 2006  stevenj
+  * [project @ 2006-01-10 04:21:21 by stevenj]
+
+    M ./kernel/timer.c -2 +2
+
+Mon Jan  9 23:21:06 EST 2006  stevenj
+  * [project @ 2006-01-10 04:21:06 by stevenj]
+  hmm, a bit more pessimistic about clock wrapping
+
+    M ./kernel/timer.c -4 +4
+
+Mon Jan  9 23:20:26 EST 2006  athena
+  * [project @ 2006-01-10 04:20:26 by athena]
+  Revert to md5uint = unsigned int whenever possible, so as to
+  avoid wasting space for unsigned long on 64-bit machines.
+
+    M ./configure.ac +2
+    M ./kernel/ifftw.h -1 +5
+
+Mon Jan  9 23:12:27 EST 2006  stevenj
+  * [project @ 2006-01-10 04:12:27 by stevenj]
+  note why clock() wrap should not be a concern
+
+    M ./kernel/timer.c -1 +6
+
+Mon Jan  9 22:57:16 EST 2006  stevenj
+  * [project @ 2006-01-10 03:57:16 by stevenj]
+  bugfix in recent timeout changes - check for case where last solver times out
+
+    M ./kernel/planner.c -2 +8
+
+Mon Jan  9 22:40:26 EST 2006  stevenj
+  * [project @ 2006-01-10 03:40:26 by stevenj]
+  started changes list from beta
+
+    M ./NEWS -1 +11
+
+Mon Jan  9 22:34:13 EST 2006  athena
+  * [project @ 2006-01-10 03:34:13 by athena]
+  Paranoia.
+
+    M ./api/mapflags.c +4
+
+Mon Jan  9 22:27:37 EST 2006  athena
+  * [project @ 2006-01-10 03:27:37 by athena]
+  Paranoid assertions.
+
+    M ./kernel/planner.c -3 +7
+
+Mon Jan  9 22:13:32 EST 2006  athena
+  * [project @ 2006-01-10 03:13:32 by athena]
+  Added FIXME comment stating the 64-bit uncleaniness of
+  fftw_tensor_to_bench_tensor().
+
+    M ./tests/hook.c +1
+
+Mon Jan  9 22:06:05 EST 2006  athena
+  * [project @ 2006-01-10 03:06:05 by athena]
+  Another 64-bit bug.
+
+    M ./dft/simd/t.c -1 +1
+
+Mon Jan  9 21:54:07 EST 2006  stevenj
+  * [project @ 2006-01-10 02:54:07 by stevenj]
+  more Windows DLL nonsense
+
+    M ./api/api.h -5 +15
+    M ./kernel/ifftw.h -3 +5
+    M ./tests/hook.c +2
+
+Mon Jan  9 21:18:25 EST 2006  stevenj
+  * [project @ 2006-01-10 02:18:25 by stevenj]
+  some additional dllexport tags required to build the test program, due to internal stuff called by hook.c
+
+    M ./api/api.h -2 +3
+    M ./kernel/ifftw.h -6 +17
+
+Mon Jan  9 20:31:15 EST 2006  stevenj
+  * [project @ 2006-01-10 01:31:15 by stevenj]
+
+    M ./api/fftw3.h -2 +2
+
+Mon Jan  9 20:30:19 EST 2006  stevenj
+  * [project @ 2006-01-10 01:30:19 by stevenj]
+  comment
+
+    M ./api/fftw3.h -4 +6
+
+Mon Jan  9 20:20:28 EST 2006  stevenj
+  * [project @ 2006-01-10 01:17:11 by stevenj]
+
+    M ./api/api.h -7 +7
+    M ./api/fftw3.h -4 +3
+
+Mon Jan  9 20:16:50 EST 2006  stevenj
+  * [project @ 2006-01-10 01:16:50 by stevenj]
+  clarification
+
+    M ./api/fftw3.h -3 +3
+
+Mon Jan  9 20:12:23 EST 2006  stevenj
+  * [project @ 2006-01-10 01:12:23 by stevenj]
+  define FFTW_DLL if DLL_EXPORT (defined by libtool) is supplied
+
+    M ./api/api.h +10
+
+Mon Jan  9 20:05:11 EST 2006  stevenj
+  * [project @ 2006-01-10 01:05:11 by stevenj]
+  whoops
+
+    M ./api/fftw3.h -2 +2
+
+Mon Jan  9 20:00:47 EST 2006  stevenj
+  * [project @ 2006-01-10 01:00:47 by stevenj]
+  another stab at Windows DLL mess
+
+    M ./api/fftw3.h -172 +185
+
+Mon Jan  9 19:23:42 EST 2006  athena
+  * [project @ 2006-01-10 00:23:42 by athena]
+  64-bit clean SIMD header file.  I missed those because sparse
+  does not know vector types.  Grrr...
+
+    M ./simd/simd-altivec.h -11 +11
+    M ./simd/simd-sse.h -6 +6
+    M ./simd/simd-sse2.h -4 +4
+
+Mon Jan  9 19:08:36 EST 2006  stevenj
+  * [project @ 2006-01-10 00:08:36 by stevenj]
+  this option is called AC_DISABLE_SHARED in the documentation
+
+    M ./configure.ac -1 +1
+
+Mon Jan  9 17:34:13 EST 2006  stevenj
+  * [project @ 2006-01-09 22:34:13 by stevenj]
+  fixed --with-gcc-arch to work when cross-compiling
+
+    M ./m4/ax_gcc_archflag.m4 -3 +4
+
+Mon Jan  9 12:04:04 EST 2006  athena
+  * [project @ 2006-01-09 17:04:04 by athena]
+  Moved the timeout check back into the search loop, sicut erat in
+  principio.  This gives us a precise control over the timeout.  To
+  avoid the overhead of X(seconds)(), only call X(seconds)() if some
+  time measurement was taken since the last call to X(seconds)().
+
+    M ./api/apiplan.c -11 +1
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -5 +36
+
+Mon Jan  9 00:07:40 EST 2006  stevenj
+  * [project @ 2006-01-09 05:07:40 by stevenj]
+  comments
+
+    M ./rdft/vrank3-transpose.c -5 +5
+
+Sun Jan  8 23:58:23 EST 2006  stevenj
+  * [project @ 2006-01-09 04:58:23 by stevenj]
+  generalized transpose-cut routine to be able to call transpose-gcd recursivly; TOMS follow-the-cycles algorithm now seems to be completely superseded
+
+    M ./rdft/vrank3-transpose.c -111 +142
+
+Sun Jan  8 20:53:18 EST 2006  stevenj
+  * [project @ 2006-01-09 01:53:18 by stevenj]
+
+    M ./threads/threads.c -3 +2
+
+Sun Jan  8 20:52:16 EST 2006  stevenj
+  * [project @ 2006-01-09 01:52:16 by stevenj]
+  ignore errors from setscope -- POSIX standard does not require PTHREAD_SCOPE_SYSTEM to be supported, and PTHREAD_SCOPE_PROCESS is usually okay in that case
+
+    M ./threads/threads.c -7 +9
+
+Sun Jan  8 15:58:40 EST 2006  stevenj
+  * [project @ 2006-01-08 20:58:40 by stevenj]
+  added TODO comment
+
+    M ./rdft/vrank3-transpose.c -1 +7
+
+Sun Jan  8 15:39:28 EST 2006  stevenj
+  * [project @ 2006-01-08 20:39:28 by stevenj]
+  whoops
+
+    M ./rdft/vrank3-transpose.c -8 +18
+
+Sun Jan  8 14:44:23 EST 2006  athena
+  * [project @ 2006-01-08 19:44:23 by athena]
+  Boasted ``much faster altivec performance''.
+
+    M ./NEWS -1 +1
+
+Sun Jan  8 11:44:52 EST 2006  athena
+  * [project @ 2006-01-08 16:44:52 by athena]
+  Added a new pass to the generator to schedule for the pipeline
+  latency.  (This schedule modifies the ``optimal'' cache-oblivious
+  schedule and hence it uses more registers.)
+  
+  This pass is currently:
+  
+  * disabled for non-fma code, under the assumption that this will
+    run on a register-starved fma.
+  
+  * enabled for non-simd fma code, under the assumption that this will
+    run on a processor with 32 or more FP registers.  The latency of 4
+    is conservative and does not introduce too much register pressure.
+  
+  * enabled for simd fma code, under the assumption that this will run
+    on altivec.  The latency of 8 seems to produce the best results.
+
+    M ./configure.ac -1 +1
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./genfft/annotate.ml -3 +58
+    M ./genfft/magic.ml -3 +5
+    M ./genfft/schedule.ml -33 +5
+    M ./support/Makefile.codelets -2 +2
+    M ./support/twovers.sh -1 +1
+
+Sun Jan  8 03:13:53 EST 2006  stevenj
+  * [project @ 2006-01-08 08:13:53 by stevenj]
+  fixed estimator for vrank3-transpose
+
+    M ./rdft/vrank3-transpose.c -6 +22
+
+Sun Jan  8 02:02:11 EST 2006  stevenj
+  * [project @ 2006-01-08 07:02:11 by stevenj]
+  more detail on VC++ workaround
+
+    M ./NEWS -1 +2
+
+Sun Jan  8 00:19:19 EST 2006  stevenj
+  * [project @ 2006-01-08 05:19:19 by stevenj]
+  typo
+
+    M ./rdft/vrank3-transpose.c -2 +2
+
+Sun Jan  8 00:16:20 EST 2006  stevenj
+  * [project @ 2006-01-08 05:16:20 by stevenj]
+  screw it, just use planner for all sub-transposes in vrank3-transpose (still just use memcpy for contiguous copies, though)
+
+    M ./rdft/vrank3-transpose.c -95 +185
+
+Sat Jan  7 23:13:45 EST 2006  stevenj
+  * [project @ 2006-01-08 04:13:45 by stevenj]
+  add an assert
+
+    M ./kernel/tile2d.c +2
+
+Sat Jan  7 21:57:34 EST 2006  stevenj
+  * [project @ 2006-01-08 02:57:34 by stevenj]
+  vrank3-transpose now uses planner to decide whether to use cpy2d, cpy2d_tiled, etc.
+
+    M ./kernel/ifftw.h -1 +7
+    M ./rdft/rank0.c -7 +5
+    M ./rdft/vrank3-transpose.c -217 +231
+
+Sat Jan  7 20:57:16 EST 2006  stevenj
+  * [project @ 2006-01-08 01:57:16 by stevenj]
+  too annoying to have isqrt unexpectedly fail for n==0
+
+    M ./kernel/primes.c -2 +4
+
+Sat Jan  7 17:49:37 EST 2006  stevenj
+  * [project @ 2006-01-07 22:49:37 by stevenj]
+  clarifications
+
+    M ./NEWS -1 +1
+    M ./doc/fftw3.texi -2 +5
+
+Sat Jan  7 16:39:20 EST 2006  stevenj
+  * [project @ 2006-01-07 21:39:20 by stevenj]
+  comment fix
+
+    M ./rdft/vrank3-transpose.c -4 +4
+
+Sat Jan  7 15:16:22 EST 2006  stevenj
+  * [project @ 2006-01-07 20:16:22 by stevenj]
+  more faq updates
+
+    M ./doc/FAQ/fftw-faq.bfnn -8 +9
+
+Sat Jan  7 15:12:16 EST 2006  stevenj
+  * [project @ 2006-01-07 20:12:16 by stevenj]
+  enable fma on hppa, update FAQ entry
+
+    M ./configure.ac +1
+    M ./doc/FAQ/fftw-faq.bfnn -3 +11
+
+Sat Aug 12 23:47:06 EDT 2006  Unknown tagger
+  tagged fftw-3-1-beta1
+
+
+Sat Jan  7 14:06:31 EST 2006  athena
+  * [project @ 2006-01-07 19:06:31 by athena]
+  Accomodate different semantics of 'const' in C and C++
+
+    M ./dft/simd/t.c +4
+
+Fri Jan  6 23:40:53 EST 2006  athena
+  * [project @ 2006-01-07 04:40:53 by athena]
+  Altivec is called VMX in IBM land.
+
+    M ./NEWS -1 +1
+
+Fri Jan  6 23:40:16 EST 2006  athena
+  * [project @ 2006-01-07 04:40:16 by athena]
+  Noted faster altivec support.
+
+    M ./NEWS +2
+
+Fri Jan  6 21:49:10 EST 2006  stevenj
+  * [project @ 2006-01-07 02:49:10 by stevenj]
+  updated icc flag detection
+
+    M ./m4/ax_cc_maxopt.m4 -1 +1
+
+Fri Jan  6 10:01:50 EST 2006  athena
+  * [project @ 2006-01-06 15:01:50 by athena]
+  Note ``memoize triggen''.
+
+    M ./TODO -2 +1
+
+Fri Jan  6 09:36:51 EST 2006  athena
+  * [project @ 2006-01-06 14:36:51 by athena]
+  Use --enable-threads to generate dependencies in the threads/ directory.
+
+    M ./mkdist.sh -1 +1
+
+Fri Jan  6 09:26:29 EST 2006  athena
+  * [project @ 2006-01-06 14:26:29 by athena]
+  Workaround to icc #defining __GNUC__.
+
+    M ./kernel/ifftw.h -14 +14
+
+Fri Jan  6 09:21:19 EST 2006  athena
+  * [project @ 2006-01-06 14:21:19 by athena]
+  Switched name to 3.1-beta1.
+
+    M ./configure.ac -1 +1
+
+Thu Jan  5 23:08:44 EST 2006  athena
+  * [project @ 2006-01-06 04:08:44 by athena]
+  More thoughts.
+
+    M ./TODO +12
+
+Thu Jan  5 22:30:51 EST 2006  athena
+  * [project @ 2006-01-06 03:30:51 by athena]
+  Note wish that (block_size % 4) == 0.
+
+    M ./TODO +3
+
+Thu Jan  5 22:19:09 EST 2006  athena
+  * [project @ 2006-01-06 03:19:09 by athena]
+  Check alignment of mstart, mcount in SIMD codelets.
+
+    M ./dft/codelet-dft.h -2 +3
+    M ./dft/codelets/t.c -2 +3
+    M ./dft/ctsq.c -2 +3
+    M ./dft/dftw-direct.c -11 +17
+    M ./dft/k7/k7.c -3 +3
+    M ./dft/simd/q1b.c -1 +3
+    M ./dft/simd/q1f.c -1 +3
+    M ./dft/simd/t.c -9 +11
+    M ./dft/simd/t1s.c -3 +5
+    M ./threads/ct.c -3 +1
+    M ./threads/hc2hc.c -2
+
+Thu Jan  5 21:56:19 EST 2006  athena
+  * [project @ 2006-01-06 02:56:19 by athena]
+  Enable threads at bootstrap time, so I get the compiler warnings that
+  I would otherwise ignore.
+
+    M ./bootstrap.sh -1 +1
+
+Thu Jan  5 18:23:15 EST 2006  athena
+  * [project @ 2006-01-05 23:23:15 by athena]
+  made compilable by c++
+
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+
+Thu Jan  5 17:39:02 EST 2006  athena
+  * [project @ 2006-01-05 22:39:02 by athena]
+  FIXED: incorrect twiddle_shift()
+
+    M ./kernel/twiddle.c -13 +19
+
+Thu Jan  5 16:01:51 EST 2006  athena
+  * [project @ 2006-01-05 21:01:51 by athena]
+  Replaced remnants of awake flag with the new enum wakefulness type.
+
+    M ./reodft/redft00e-r2hc.c -4 +5
+    M ./reodft/reodft11e-r2hc.c -5 +7
+    M ./reodft/rodft00e-r2hc.c -4 +5
+    M ./threads/ct.c -4 +4
+    M ./threads/dft-vrank-geq1.c -3 +3
+    M ./threads/hc2hc.c -3 +3
+    M ./threads/rdft-vrank-geq1.c -3 +3
+    M ./threads/vrank-geq1-rdft2.c -3 +3
+
+Thu Jan  5 11:20:59 EST 2006  athena
+  * [project @ 2006-01-05 16:20:59 by athena]
+  Oops---there is no need to find a free slot.
+
+    M ./kernel/planner.c -10 +1
+
+Thu Jan  5 09:41:58 EST 2006  athena
+  * [project @ 2006-01-05 14:41:58 by athena]
+  Assertions.
+
+    M ./kernel/planner.c -1 +3
+
+Thu Jan  5 09:29:55 EST 2006  athena
+  * [project @ 2006-01-05 14:29:55 by athena]
+  Commented the hash table lookup algorithm.
+
+    M ./kernel/planner.c -10 +14
+
+Thu Jan  5 09:12:00 EST 2006  athena
+  * [project @ 2006-01-05 14:12:00 by athena]
+  Fixed infinite loop in hashtable lookup/insert.  Grrr...
+
+    M ./kernel/planner.c -12 +31
+
+Wed Jan  4 22:04:28 EST 2006  stevenj
+  * [project @ 2006-01-05 03:04:26 by stevenj]
+  updated copyright years to 2006
+
+    M ./COPYRIGHT -2 +2
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -2 +2
+    M ./api/configure.c -2 +2
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute-split-dft-c2r.c -2 +2
+    M ./api/execute-split-dft-r2c.c -2 +2
+    M ./api/execute-split-dft.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -2 +2
+    M ./api/extract-reim.c -2 +2
+    M ./api/f77api.c -2 +2
+    M ./api/f77funcs.h -2 +2
+    M ./api/fftw3.h -3 +3
+    M ./api/flops.c -2 +2
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./api/import-wisdom.c -2 +2
+    M ./api/malloc.c -2 +2
+    M ./api/map-r2r-kind.c -2 +2
+    M ./api/mapflags.c -2 +2
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.c -2 +2
+    M ./api/mktensor-rowmajor.c -2 +2
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -2 +2
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -2 +2
+    M ./api/plan-dft.c -2 +2
+    M ./api/plan-guru-dft-c2r.c -2 +2
+    M ./api/plan-guru-dft-r2c.c -2 +2
+    M ./api/plan-guru-dft.c -2 +2
+    M ./api/plan-guru-r2r.c -2 +2
+    M ./api/plan-guru-split-dft-c2r.c -2 +2
+    M ./api/plan-guru-split-dft-r2c.c -2 +2
+    M ./api/plan-guru-split-dft.c -2 +2
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-dft.c -2 +2
+    M ./api/plan-many-r2r.c -2 +2
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -2 +2
+    M ./api/print-plan.c -2 +2
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -2 +2
+    M ./api/version.c -3 +3
+    M ./api/x77.h -2 +2
+    M ./dft/bluestein.c -2 +2
+    M ./dft/buffered.c -3 +3
+    M ./dft/codelet-dft.h -3 +3
+    M ./dft/codelets/n.c -2 +2
+    M ./dft/codelets/n.h -2 +2
+    M ./dft/codelets/t.c -2 +2
+    M ./dft/codelets/t.h -2 +2
+    M ./dft/conf.c -3 +3
+    M ./dft/ct.c -3 +3
+    M ./dft/ct.h -2 +2
+    M ./dft/ctsq.c -3 +3
+    M ./dft/dft.h -3 +3
+    M ./dft/dftw-direct.c -3 +3
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/direct.c -3 +3
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect-transpose.c -2 +2
+    M ./dft/indirect.c -3 +3
+    M ./dft/k7/k7.c -3 +3
+    M ./dft/kdft-dif.c -3 +3
+    M ./dft/kdft-difsq.c -3 +3
+    M ./dft/kdft-dit.c -3 +3
+    M ./dft/kdft.c -3 +3
+    M ./dft/nop.c -3 +3
+    M ./dft/plan.c -3 +3
+    M ./dft/problem.c -3 +3
+    M ./dft/rader.c -2 +2
+    M ./dft/rank-geq2.c -3 +3
+    M ./dft/simd/n1b.c -2 +2
+    M ./dft/simd/n1b.h -2 +2
+    M ./dft/simd/n1f.c -2 +2
+    M ./dft/simd/n1f.h -2 +2
+    M ./dft/simd/n2b.c -2 +2
+    M ./dft/simd/n2b.h -2 +2
+    M ./dft/simd/n2f.c -2 +2
+    M ./dft/simd/n2f.h -2 +2
+    M ./dft/simd/n2s.c -2 +2
+    M ./dft/simd/n2s.h -2 +2
+    M ./dft/simd/q1b.c -2 +2
+    M ./dft/simd/q1b.h -2 +2
+    M ./dft/simd/q1f.c -2 +2
+    M ./dft/simd/q1f.h -2 +2
+    M ./dft/simd/t.c -2 +2
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/simd/t1s.c -2 +2
+    M ./dft/simd/t1s.h -2 +2
+    M ./dft/simd/t2b.h -2 +2
+    M ./dft/simd/t2f.h -2 +2
+    M ./dft/solve.c -3 +3
+    M ./dft/vrank-geq1.c -3 +3
+    M ./dft/zero.c -3 +3
+    M ./doc/f77_wisdom.f -2 +2
+    M ./doc/fftw3.texi -1 +1
+    M ./genfft/algsimp.ml -4 +4
+    M ./genfft/algsimp.mli -3 +3
+    M ./genfft/annotate.ml -4 +4
+    M ./genfft/annotate.mli -3 +3
+    M ./genfft/assoctable.ml -3 +3
+    M ./genfft/assoctable.mli -3 +3
+    M ./genfft/c.ml -3 +3
+    M ./genfft/c.mli -3 +3
+    M ./genfft/complex.ml -3 +3
+    M ./genfft/complex.mli -3 +3
+    M ./genfft/conv.ml -2 +2
+    M ./genfft/conv.mli -3 +3
+    M ./genfft/dag.ml -3 +3
+    M ./genfft/dag.mli -3 +3
+    M ./genfft/expr.ml -3 +3
+    M ./genfft/expr.mli -3 +3
+    M ./genfft/fft.ml -4 +4
+    M ./genfft/fft.mli -3 +3
+    M ./genfft/gen_athnotw.ml -4 +4
+    M ./genfft/gen_athtw.ml -4 +4
+    M ./genfft/gen_conv.ml -4 +4
+    M ./genfft/gen_hc2hc.ml -4 +4
+    M ./genfft/gen_hc2r.ml -4 +4
+    M ./genfft/gen_mdct.ml -4 +4
+    M ./genfft/gen_notw.ml -4 +4
+    M ./genfft/gen_notw_c.ml -4 +4
+    M ./genfft/gen_r2hc.ml -4 +4
+    M ./genfft/gen_r2r.ml -4 +4
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twiddle_c.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./genfft/gen_twidsq_c.ml -4 +4
+    M ./genfft/genutil.ml -3 +3
+    M ./genfft/littlesimp.ml -3 +3
+    M ./genfft/littlesimp.mli -3 +3
+    M ./genfft/magic.ml -3 +3
+    M ./genfft/monads.ml -3 +3
+    M ./genfft/number.ml -3 +3
+    M ./genfft/number.mli -3 +3
+    M ./genfft/oracle.ml -3 +3
+    M ./genfft/oracle.mli -3 +3
+    M ./genfft/schedule.ml -3 +3
+    M ./genfft/schedule.mli -3 +3
+    M ./genfft/simd.ml -3 +3
+    M ./genfft/simd.mli -3 +3
+    M ./genfft/simdmagic.ml -3 +3
+    M ./genfft/to_alist.ml -3 +3
+    M ./genfft/to_alist.mli -3 +3
+    M ./genfft/trig.ml -3 +3
+    M ./genfft/trig.mli -3 +3
+    M ./genfft/twiddle.ml -3 +3
+    M ./genfft/twiddle.mli -3 +3
+    M ./genfft/unique.ml -3 +3
+    M ./genfft/unique.mli -3 +3
+    M ./genfft/util.ml -3 +3
+    M ./genfft/util.mli -3 +3
+    M ./genfft/variable.ml -3 +3
+    M ./genfft/variable.mli -3 +3
+    M ./genfft-k7/algsimp.ml -4 +4
+    M ./genfft-k7/algsimp.mli -3 +3
+    M ./genfft-k7/assoctable.ml -3 +3
+    M ./genfft-k7/assoctable.mli -3 +3
+    M ./genfft-k7/complex.ml -1 +1
+    M ./genfft-k7/complex.mli -1 +1
+    M ./genfft-k7/expr.ml -3 +3
+    M ./genfft-k7/expr.mli -3 +3
+    M ./genfft-k7/fft.ml -4 +4
+    M ./genfft-k7/gen_notw.ml -1 +1
+    M ./genfft-k7/littlesimp.ml -3 +3
+    M ./genfft-k7/littlesimp.mli -3 +3
+    M ./genfft-k7/monads.ml -3 +3
+    M ./genfft-k7/number.ml -3 +3
+    M ./genfft-k7/number.mli -3 +3
+    M ./genfft-k7/oracle.ml -3 +3
+    M ./genfft-k7/oracle.mli -3 +3
+    M ./genfft-k7/to_alist.ml -3 +3
+    M ./genfft-k7/to_alist.mli -3 +3
+    M ./genfft-k7/twiddle.ml -3 +3
+    M ./genfft-k7/twiddle.mli -3 +3
+    M ./genfft-k7/vScheduler.mli -1 +1
+    M ./kernel/align.c -3 +3
+    M ./kernel/alloc.c -3 +3
+    M ./kernel/assert.c -3 +3
+    M ./kernel/awake.c -3 +3
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/cpy1d.c -2 +2
+    M ./kernel/cpy2d-pair.c -2 +2
+    M ./kernel/cpy2d.c -2 +2
+    M ./kernel/ct.c -2 +2
+    M ./kernel/cycle.h -3 +3
+    M ./kernel/debug.c -3 +3
+    M ./kernel/hash.c -2 +2
+    M ./kernel/iabs.c -3 +3
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/kalloc.c -3 +3
+    M ./kernel/md5-1.c -2 +2
+    M ./kernel/md5.c -2 +2
+    M ./kernel/minmax.c -3 +3
+    M ./kernel/ops.c -3 +3
+    M ./kernel/pickdim.c -3 +3
+    M ./kernel/plan.c -3 +3
+    M ./kernel/primes.c -3 +3
+    M ./kernel/print.c -3 +3
+    M ./kernel/problem.c -3 +3
+    M ./kernel/rader.c -2 +2
+    M ./kernel/scan.c -3 +3
+    M ./kernel/solver.c -3 +3
+    M ./kernel/solvtab.c -3 +3
+    M ./kernel/stride.c -3 +3
+    M ./kernel/tensor.c -3 +3
+    M ./kernel/tensor1.c -3 +3
+    M ./kernel/tensor2.c -3 +3
+    M ./kernel/tensor4.c -3 +3
+    M ./kernel/tensor5.c -3 +3
+    M ./kernel/tensor7.c -3 +3
+    M ./kernel/tensor8.c -3 +3
+    M ./kernel/tensor9.c -3 +3
+    M ./kernel/tile2d.c -2 +2
+    M ./kernel/timer.c -3 +3
+    M ./kernel/transpose.c -2 +2
+    M ./kernel/trig.c -3 +3
+    M ./kernel/twiddle.c -3 +3
+    M ./libbench/accopy-from.c -1 +1
+    M ./libbench/accopy-to.c -1 +1
+    M ./libbench/allocate.c -1 +1
+    M ./libbench/bench-main.c -1 +1
+    M ./libbench/bench-user.h -1 +1
+    M ./libbench/bench.h -1 +1
+    M ./libbench/can-do.c -1 +1
+    M ./libbench/ccopy-from.c -1 +1
+    M ./libbench/ccopy-to.c -1 +1
+    M ./libbench/deallocate.c -1 +1
+    M ./libbench/getopt-utils.c -1 +1
+    M ./libbench/info.c -1 +1
+    M ./libbench/main.c -1 +1
+    M ./libbench/prime.c -1 +1
+    M ./libbench/problem.c -1 +1
+    M ./libbench/report.c -1 +1
+    M ./libbench/speed.c -1 +1
+    M ./libbench/timer.c -1 +1
+    M ./libbench/verify.c -1 +1
+    M ./libbench/zero.c -1 +1
+    M ./libbench2/aligned-main.c -1 +1
+    M ./libbench2/allocate.c -1 +1
+    M ./libbench2/can-do.c -1 +1
+    M ./libbench2/dotens2.c -3 +3
+    M ./libbench2/getopt-utils.c -1 +1
+    M ./libbench2/info.c -1 +1
+    M ./libbench2/main.c -1 +1
+    M ./libbench2/report.c -1 +1
+    M ./libbench2/tensor.c -1 +1
+    M ./libbench2/useropt.c -1 +1
+    M ./libbench2/verify-dft.c -3 +3
+    M ./libbench2/verify-lib.c -3 +3
+    M ./libbench2/verify-r2r.c -2 +2
+    M ./libbench2/verify-rdft2.c -3 +3
+    M ./libbench2/verify.c -1 +1
+    M ./libbench2/verify.h -2 +2
+    M ./libbench2/zero.c -1 +1
+    M ./m4/ax_gcc_archflag.m4 -1 +1
+    M ./rdft/buffered.c -3 +3
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/codelet-rdft.h -2 +2
+    M ./rdft/codelets/hb.h -2 +2
+    M ./rdft/codelets/hc2r.c -2 +2
+    M ./rdft/codelets/hc2r.h -2 +2
+    M ./rdft/codelets/hc2rIII.h -2 +2
+    M ./rdft/codelets/hf.h -2 +2
+    M ./rdft/codelets/hfb.c -2 +2
+    M ./rdft/codelets/r2hc.c -2 +2
+    M ./rdft/codelets/r2hc.h -2 +2
+    M ./rdft/codelets/r2hcII.h -2 +2
+    M ./rdft/codelets/r2r.c -2 +2
+    M ./rdft/codelets/r2r.h -2 +2
+    M ./rdft/conf.c -3 +3
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/direct.c -3 +3
+    M ./rdft/direct2.c -3 +3
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-common.c -2 +2
+    M ./rdft/hc2hc-direct.c -3 +3
+    M ./rdft/hc2hc-directbuf.c -3 +3
+    M ./rdft/hc2hc-generic.c -2 +2
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/hc2hc.h -2 +2
+    M ./rdft/indirect.c -3 +3
+    M ./rdft/khc2hc.c -3 +3
+    M ./rdft/khc2r.c -3 +3
+    M ./rdft/kr2hc.c -3 +3
+    M ./rdft/kr2r.c -3 +3
+    M ./rdft/nop.c -3 +3
+    M ./rdft/nop2.c -3 +3
+    M ./rdft/plan.c -3 +3
+    M ./rdft/plan2.c -3 +3
+    M ./rdft/problem.c -3 +3
+    M ./rdft/problem2.c -3 +3
+    M ./rdft/rank-geq2-rdft2.c -3 +3
+    M ./rdft/rank-geq2.c -3 +3
+    M ./rdft/rank0-rdft2.c -3 +3
+    M ./rdft/rank0.c -3 +3
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-inplace-strides.c -3 +3
+    M ./rdft/rdft2-radix2.c -3 +3
+    M ./rdft/rdft2-strides.c -2 +2
+    M ./rdft/rdft2-tensor-max-index.c -3 +3
+    M ./rdft/solve.c -3 +3
+    M ./rdft/solve2.c -3 +3
+    M ./rdft/vrank-geq1-rdft2.c -3 +3
+    M ./rdft/vrank-geq1.c -3 +3
+    M ./rdft/vrank3-transpose.c -3 +3
+    M ./reodft/conf.c -3 +3
+    M ./reodft/redft00e-r2hc-pad.c -3 +3
+    M ./reodft/redft00e-r2hc.c -3 +3
+    M ./reodft/reodft.h -2 +2
+    M ./reodft/reodft010e-r2hc.c -3 +3
+    M ./reodft/reodft11e-r2hc-odd.c -3 +3
+    M ./reodft/reodft11e-r2hc.c -3 +3
+    M ./reodft/reodft11e-radix2.c -3 +3
+    M ./reodft/rodft00e-r2hc-pad.c -3 +3
+    M ./reodft/rodft00e-r2hc.c -3 +3
+    M ./simd/altivec.c -3 +3
+    M ./simd/nonportable/sse.c -3 +3
+    M ./simd/nonportable/sse2.c -3 +3
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse.h -2 +2
+    M ./simd/simd-sse2.h -2 +2
+    M ./simd/simd.h -2 +2
+    M ./simd/sse.c -3 +3
+    M ./simd/sse2.c -3 +3
+    M ./simd/taint.c -3 +3
+    M ./threads/api.c -2 +2
+    M ./threads/conf.c -3 +3
+    M ./threads/ct.c -3 +3
+    M ./threads/dft-vrank-geq1.c -3 +3
+    M ./threads/f77api.c -2 +2
+    M ./threads/f77funcs.h -2 +2
+    M ./threads/hc2hc.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -3 +3
+    M ./threads/threads.c -2 +2
+    M ./threads/threads.h -2 +2
+    M ./threads/vrank-geq1-rdft2.c -3 +3
+    M ./tools/fftw-wisdom-to-conf.1 -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+    M ./tools/fftw-wisdom.c -2 +2
+    M ./tools/fftw_wisdom.1.in -4 +4
+
+Wed Jan  4 21:57:23 EST 2006  stevenj
+  * [project @ 2006-01-05 02:57:23 by stevenj]
+  whoops
+
+    M ./m4/ax_gcc_archflag.m4 -7 +4
+
+Wed Jan  4 21:52:18 EST 2006  stevenj
+  * [project @ 2006-01-05 02:52:18 by stevenj]
+  whoops
+
+    M ./m4/ax_gcc_archflag.m4 -1 +1
+
+Wed Jan  4 21:51:40 EST 2006  stevenj
+  * [project @ 2006-01-05 02:51:40 by stevenj]
+  more updates for recent pentia/amd
+
+    M ./m4/ax_gcc_archflag.m4 -5 +12
+
+Wed Jan  4 20:57:47 EST 2006  athena
+  * [project @ 2006-01-05 01:57:47 by athena]
+  Pruned TODO.
+
+    M ./TODO -6 +2
+
+Wed Jan  4 20:43:41 EST 2006  athena
+  * [project @ 2006-01-05 01:43:41 by athena]
+  Prototype of problem_destroy()
+
+    M ./libbench2/bench-user.h -1 +2
+    M ./libbench2/bench.h -2 +1
+
+Wed Jan  4 20:43:13 EST 2006  stevenj
+  * [project @ 2006-01-05 01:43:13 by stevenj]
+  rm obsoleted TODOs
+
+    M ./TODO -5
+
+Wed Jan  4 20:37:24 EST 2006  athena
+  * [project @ 2006-01-05 01:37:24 by athena]
+  Fallback to 970 if neither -mcpu=power5 nor -mcpu=power4 are supported.
+
+    M ./m4/ax_gcc_archflag.m4 -3 +3
+
+Wed Jan  4 20:29:07 EST 2006  stevenj
+  * [project @ 2006-01-05 01:29:07 by stevenj]
+  NEWS updates, clarifications, and reorganization
+
+    M ./NEWS -6 +10
+
+Wed Jan  4 19:54:41 EST 2006  stevenj
+  * [project @ 2006-01-05 00:54:40 by stevenj]
+  remove some compiler warnings, add an assert check, make estimator work properly for nop plans
+
+    M ./dft/dftw-genericbuf.c +3
+    M ./kernel/planner.c -1 +2
+    M ./kernel/trig.c -1 +3
+    M ./m4/ax_gcc_x86_cpuid.m4 +2
+    M ./rdft/dft-r2hc.c -1 +2
+
+Tue Jan  3 19:34:04 EST 2006  athena
+  * [project @ 2006-01-04 00:34:03 by athena]
+  Two big changes:
+  
+  1) revised the twiddle generation machinery, to avoid generating
+     twiddles when measuring, and to use a faster O(sqrt(N)) table
+     when this entails no loss of precision.
+  
+  2) implemented new ALLOW_PRUNING estimator hack.
+
+    M ./api/apiplan.c -2 +10
+    M ./api/fftw3.h -1 +2
+    M ./api/mapflags.c -1 +4
+    M ./configure.ac -8 +9
+    M ./dft/bluestein.c -12 +18
+    M ./dft/buffered.c -5 +5
+    M ./dft/ct.c -4 +7
+    M ./dft/ctsq.c -4 +4
+    M ./dft/dftw-direct.c -3 +4
+    M ./dft/dftw-generic.c -131 +19
+    M ./dft/dftw-genericbuf.c -76 +36
+    M ./dft/direct.c -1 +2
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect-transpose.c -4 +4
+    M ./dft/indirect.c -4 +4
+    M ./dft/rader.c -17 +22
+    M ./dft/rank-geq2.c -4 +4
+    M ./dft/vrank-geq1.c -3 +3
+    M ./genfft/twiddle.ml -22 +12
+    M ./kernel/awake.c -3 +3
+    M ./kernel/ifftw.h -23 +48
+    M ./kernel/plan.c -14 +9
+    M ./kernel/planner.c -1 +8
+    M ./kernel/timer.c -3 +3
+    M ./kernel/trig.c -41 +166
+    M ./kernel/twiddle.c -64 +103
+    M ./libbench2/bench-main.c -3 +8
+    M ./libbench2/bench.h -2 +2
+    M ./libbench2/problem.c -4 +3
+    M ./libbench2/speed.c -2 +8
+    M ./rdft/buffered.c -5 +5
+    M ./rdft/buffered2.c -4 +4
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/dht-rader.c -16 +23
+    M ./rdft/direct.c -1 +3
+    M ./rdft/direct2.c -1 +2
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-direct.c -5 +7
+    M ./rdft/hc2hc-directbuf.c -5 +5
+    M ./rdft/hc2hc-generic.c -6 +7
+    M ./rdft/hc2hc.c -3 +7
+    M ./rdft/indirect.c -4 +4
+    M ./rdft/rank-geq2-rdft2.c -4 +4
+    M ./rdft/rank-geq2.c -4 +4
+    M ./rdft/rank0-rdft2.c -3 +3
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft2-radix2.c -4 +5
+    M ./rdft/vrank-geq1-rdft2.c -3 +3
+    M ./rdft/vrank-geq1.c -3 +3
+    M ./reodft/redft00e-r2hc-pad.c -4 +4
+    M ./reodft/reodft00e-splitradix.c -5 +6
+    M ./reodft/reodft010e-r2hc.c -4 +5
+    M ./reodft/reodft11e-r2hc-odd.c -3 +3
+    M ./reodft/reodft11e-radix2.c -5 +7
+    M ./reodft/rodft00e-r2hc-pad.c -4 +4
+    M ./tests/hook.c -3 +6
+
+Sat Dec 24 22:08:29 EST 2005  athena
+  * [project @ 2005-12-25 03:08:29 by athena]
+  Estimator tweaks, mostly to favor generic over rader for small n.
+
+    M ./dft/generic.c +2
+    M ./rdft/generic.c +2
+
+Sat Dec 24 17:55:47 EST 2005  athena
+  * [project @ 2005-12-24 22:55:47 by athena]
+  Grrr... missing break statement in switch.
+
+    M ./tests/hook.c +1
+
+Sat Dec 24 16:08:50 EST 2005  athena
+  * [project @ 2005-12-24 21:08:49 by athena]
+  Swapped fields TW and OPS in struct ct_desc_s, to make k7 asm
+  code insensitive to -malign-double.  For consistency, changed
+  struct hc2hc_desc_s in the same way.
+
+    M ./dft/codelet-dft.h -2 +2
+    M ./genfft/gen_hc2hc.ml -3 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twiddle_c.ml -3 +3
+    M ./genfft/gen_twidsq.ml -3 +3
+    M ./genfft/gen_twidsq_c.ml -3 +3
+    M ./genfft-k7/gen_twiddle.ml -3 +2
+    M ./rdft/codelet-rdft.h -1 +1
+
+Sat Dec 24 16:00:42 EST 2005  athena
+  * [project @ 2005-12-24 21:00:42 by athena]
+  Wrong check for infeasible slvndx in imprt().
+
+    M ./kernel/planner.c -4 +4
+
+Sat Dec 24 15:56:59 EST 2005  athena
+  * [project @ 2005-12-24 20:56:59 by athena]
+  Removed obsolete function invoke_solver_if_correct_kind().
+
+    M ./kernel/planner.c -10 +1
+
+Sat Dec 24 14:22:12 EST 2005  athena
+  * [project @ 2005-12-24 19:22:12 by athena]
+  Faster implementation of safe_mulmod(), avoiding divisions altogether.
+  Works for 0 <= p <= INT_MAX.
+
+    M ./kernel/primes.c -13 +22
+
+Sat Dec 24 12:05:54 EST 2005  athena
+  * [project @ 2005-12-24 17:05:54 by athena]
+  FFTW_ALLOW_LARGE_GENERIC must belong to flags->l, it cannot be
+  overridden by fftw.
+
+    M ./api/mapflags.c -2 +2
+
+Fri Dec 23 20:46:24 EST 2005  stevenj
+  * [project @ 2005-12-24 01:46:24 by stevenj]
+  no more need for limits.h, add some explanatory comments
+
+    M ./kernel/primes.c -4 +9
+
+Fri Dec 23 17:50:25 EST 2005  athena
+  * [project @ 2005-12-23 22:50:25 by athena]
+  Paranoia.
+
+    M ./dft/k7/k7.c -1 +8
+
+Fri Dec 23 17:40:41 EST 2005  athena
+  * [project @ 2005-12-23 22:40:41 by athena]
+  Fixed subtle bug involving overflow of the slvndx field in flags_t.
+
+    M ./kernel/ifftw.h -5 +6
+    M ./kernel/planner.c -18 +26
+
+Fri Dec 23 16:33:56 EST 2005  athena
+  * [project @ 2005-12-23 21:33:56 by athena]
+  Note 64-bit clean.
+
+    M ./NEWS +2
+
+Fri Dec 23 15:34:32 EST 2005  athena
+  * [project @ 2005-12-23 20:34:32 by athena]
+  Threads are now 64-bit clean
+
+    M ./threads/ct.c -12 +13
+    M ./threads/dft-vrank-geq1.c -20 +17
+    M ./threads/hc2hc.c -7 +8
+    M ./threads/rdft-vrank-geq1.c -18 +15
+    M ./threads/threads.h -2 +2
+    M ./threads/vrank-geq1-rdft2.c -20 +20
+
+Fri Dec 23 13:00:31 EST 2005  athena
+  * [project @ 2005-12-23 18:00:31 by athena]
+  Restored the old numbering TW_NEXT=3 etc, because the k7 code depends
+  on it.
+
+    M ./kernel/ifftw.h -3 +3
+
+Fri Dec 23 11:58:00 EST 2005  athena
+  * [project @ 2005-12-23 16:58:00 by athena]
+  Portable implementation of MULMOD() and safe_mulmod().
+  Removed all unnecessary AC_CHECK_SIZEOF() from configure.ac.
+
+    M ./configure.ac -5
+    M ./kernel/ifftw.h -27 +3
+    M ./kernel/primes.c -8 +3
+
+Thu Dec 22 11:12:29 EST 2005  athena
+  * [project @ 2005-12-22 16:12:29 by athena]
+  Inline the loop body in r2r codelets like we do everywhere else.
+
+    M ./genfft/gen_r2r.ml -44 +34
+
+Thu Dec 22 10:48:53 EST 2005  athena
+  * [project @ 2005-12-22 15:48:53 by athena]
+  Oops.
+
+    M ./dft/conf.c -3 +3
+
+Thu Dec 22 10:25:15 EST 2005  athena
+  * [project @ 2005-12-22 15:25:15 by athena]
+  Renamed X(sin_and_cos)() to X(cexp)().
+
+    M ./dft/bluestein.c -1 +1
+    M ./dft/dftw-generic.c -2 +2
+    M ./dft/dftw-genericbuf.c -2 +2
+    M ./dft/rader.c -1 +1
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/trig.c -2 +2
+    M ./kernel/twiddle.c -4 +3
+    M ./rdft/dht-rader.c -1 +1
+
+Wed Dec 21 22:49:58 EST 2005  athena
+  * [project @ 2005-12-22 03:49:58 by athena]
+  Somewhat faster generation of twiddle factors.
+
+    M ./dft/bluestein.c -2 +1
+    M ./dft/conf.c -3 +3
+    M ./dft/dftw-generic.c -8 +5
+    M ./dft/dftw-genericbuf.c -8 +6
+    M ./dft/rader.c -3 +4
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -4 +3
+    M ./kernel/trig.c -5 +67
+    R ./kernel/trig1.c
+    M ./kernel/twiddle.c -11 +15
+    M ./rdft/dht-rader.c -2 +4
+
+Tue Dec 20 23:50:01 EST 2005  athena
+  * [project @ 2005-12-21 04:50:01 by athena]
+  tweaks
+
+    M ./kernel/md5.c -2 +1
+
+Tue Dec 20 22:29:19 EST 2005  athena
+  * [project @ 2005-12-21 03:29:19 by athena]
+  Sped up planner, esp. in estimate mode.  The planner now classifies
+  all solvers into DFT, RDFT, and RDFT2, and it only invokes solvers
+  appropriate for the problem being planned.  Because we have several
+  hundred solvers, the overhead of calling irrelevant solvers is
+  significant, and this modification mitigates the issue somewhat.
+
+    M ./dft/bluestein.c -14 +10
+    M ./dft/buffered.c -31 +30
+    M ./dft/ct.c -16 +13
+    M ./dft/ctsq.c -23 +20
+    M ./dft/dft.h -4 +1
+    M ./dft/direct.c -69 +62
+    M ./dft/generic.c -12 +8
+    M ./dft/indirect-transpose.c -23 +19
+    M ./dft/indirect.c -37 +33
+    M ./dft/nop.c -16 +15
+    M ./dft/problem.c -6 +2
+    M ./dft/rader.c -11 +7
+    M ./dft/rank-geq2.c -13 +9
+    M ./dft/vrank-geq1.c -16 +12
+    M ./kernel/ifftw.h -1 +25
+    M ./kernel/planner.c -5 +29
+    M ./rdft/buffered.c -31 +30
+    M ./rdft/buffered2.c -8 +5
+    M ./rdft/dft-r2hc.c -10 +6
+    M ./rdft/dht-r2hc.c -12 +9
+    M ./rdft/dht-rader.c -13 +9
+    M ./rdft/direct.c -49 +45
+    M ./rdft/direct2.c -45 +41
+    M ./rdft/generic.c -13 +9
+    M ./rdft/hc2hc.c -19 +15
+    M ./rdft/indirect.c -30 +26
+    M ./rdft/nop.c -16 +13
+    M ./rdft/nop2.c -17 +15
+    M ./rdft/problem.c -6 +2
+    M ./rdft/problem2.c -6 +2
+    M ./rdft/rank-geq2-rdft2.c -20 +16
+    M ./rdft/rank-geq2.c -13 +9
+    M ./rdft/rank0-rdft2.c -13 +10
+    M ./rdft/rank0.c -14 +10
+    M ./rdft/rdft-dht.c -15 +13
+    M ./rdft/rdft.h -6
+    M ./rdft/rdft2-radix2.c -25 +17
+    M ./rdft/vrank-geq1-rdft2.c -13 +12
+    M ./rdft/vrank-geq1.c -15 +11
+    M ./rdft/vrank3-transpose.c -27 +24
+    M ./reodft/redft00e-r2hc-pad.c -12 +9
+    M ./reodft/redft00e-r2hc.c -12 +9
+    M ./reodft/reodft00e-splitradix.c -17 +14
+    M ./reodft/reodft010e-r2hc.c -12 +9
+    M ./reodft/reodft11e-r2hc-odd.c -12 +9
+    M ./reodft/reodft11e-r2hc.c -11 +9
+    M ./reodft/reodft11e-radix2.c -12 +9
+    M ./reodft/rodft00e-r2hc-pad.c -12 +8
+    M ./reodft/rodft00e-r2hc.c -11 +8
+    M ./tests/hook.c -92 +98
+
+Mon Dec 19 22:04:00 EST 2005  athena
+  * [project @ 2005-12-20 03:04:00 by athena]
+  Eliminated all calls to sprintf() in favor of own routines, so as not
+  to force users to link stdio and the associated locale/pthreads crap.
+
+    M ./kernel/print.c -35 +50
+
+Mon Dec 19 21:27:25 EST 2005  athena
+  * [project @ 2005-12-20 02:27:25 by athena]
+  Implemented routine to print INT, removing the need for c99's
+  %td format.
+
+    M ./kernel/ifftw.h -2 +1
+    M ./kernel/print.c -7 +27
+
+Mon Dec 19 12:06:33 EST 2005  athena
+  * [project @ 2005-12-19 17:06:33 by athena]
+  info->n is size_t
+
+    M ./kernel/alloc.c -2 +2
+
+Sun Dec 18 18:15:04 EST 2005  athena
+  * [project @ 2005-12-18 23:15:04 by athena]
+  Explicit casts in front of pointer difference in printf() context,
+  just in case INT != ptrdiff_t.
+
+    M ./configure.ac -1 +1
+    M ./dft/problem.c -3 +3
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -3 +3
+
+Sun Dec 18 16:52:38 EST 2005  athena
+  * [project @ 2005-12-18 21:52:38 by athena]
+  Forgot to add %D to print.c
+
+    M ./kernel/print.c -1 +6
+
+Sun Dec 18 16:43:26 EST 2005  athena
+  * [project @ 2005-12-18 21:43:26 by athena]
+  Use %D as format character for type INT.
+
+    M ./dft/bluestein.c -1 +1
+    M ./dft/buffered.c -2 +2
+    M ./dft/ct.c -2 +2
+    M ./dft/ctsq.c -2 +2
+    M ./dft/dftw-direct.c -3 +3
+    M ./dft/dftw-generic.c -1 +1
+    M ./dft/dftw-genericbuf.c -1 +1
+    M ./dft/direct.c -3 +3
+    M ./dft/generic.c -1 +1
+    M ./dft/problem.c -2 +2
+    M ./dft/rader.c -1 +1
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/print.c -9 +1
+    M ./kernel/tensor.c -2 +2
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/direct.c -2 +2
+    M ./rdft/direct2.c -2 +2
+    M ./rdft/generic.c -1 +1
+    M ./rdft/hc2hc-direct.c -2 +2
+    M ./rdft/hc2hc-directbuf.c -2 +2
+    M ./rdft/hc2hc-generic.c -1 +1
+    M ./rdft/hc2hc.c -1 +1
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rank0.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./reodft/redft00e-r2hc-pad.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft00e-splitradix.c -3 +3
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+
+Sun Dec 18 15:14:03 EST 2005  athena
+  * [project @ 2005-12-18 20:14:03 by athena]
+  Changed type of an_int_guaranteed_to_be_zero.  Changed name as well.
+
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/stride.c -2 +2
+
+Sun Dec 18 14:41:31 EST 2005  athena
+  * [project @ 2005-12-18 19:41:31 by athena]
+  converted %o -> INT
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -2 +2
+    M ./kernel/print.c -8 +6
+
+Sat Dec 17 20:28:50 EST 2005  athena
+  * [project @ 2005-12-18 01:28:50 by athena]
+  Major 64-bit cleanup.
+
+    M ./dft/bluestein.c -14 +15
+    M ./dft/buffered.c -20 +20
+    M ./dft/codelet-dft.h -17 +17
+    M ./dft/codelets/n.c -1 +1
+    M ./dft/codelets/t.c -1 +1
+    M ./dft/ct.c -6 +6
+    M ./dft/ct.h -5 +5
+    M ./dft/ctsq.c -6 +6
+    M ./dft/dftw-direct.c -17 +17
+    M ./dft/dftw-generic.c -28 +28
+    M ./dft/dftw-genericbuf.c -25 +25
+    M ./dft/direct.c -16 +14
+    M ./dft/generic.c -8 +8
+    M ./dft/indirect-transpose.c -5 +5
+    M ./dft/problem.c -3 +3
+    M ./dft/rader.c -10 +10
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/n2b.c -1 +1
+    M ./dft/simd/n2f.c -1 +1
+    M ./dft/simd/n2s.c -1 +1
+    M ./dft/simd/q1b.c -1 +1
+    M ./dft/simd/q1f.c -1 +1
+    M ./dft/simd/t.c -5 +5
+    M ./dft/simd/t1s.c -1 +1
+    M ./dft/vrank-geq1.c -5 +5
+    M ./dft/zero.c -3 +3
+    M ./genfft/gen_hc2hc.ml -5 +5
+    M ./genfft/gen_hc2r.ml -6 +6
+    M ./genfft/gen_notw.ml -6 +6
+    M ./genfft/gen_notw_c.ml -6 +6
+    M ./genfft/gen_r2hc.ml -6 +6
+    M ./genfft/gen_r2r.ml -6 +6
+    M ./genfft/gen_twiddle.ml -5 +5
+    M ./genfft/gen_twiddle_c.ml -5 +5
+    M ./genfft/gen_twidsq.ml -5 +5
+    M ./genfft/gen_twidsq_c.ml -5 +5
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/cpy1d.c -2 +2
+    M ./kernel/cpy2d-pair.c -9 +7
+    M ./kernel/cpy2d.c -20 +19
+    M ./kernel/ct.c -1 +1
+    M ./kernel/iabs.c -3 +3
+    M ./kernel/ifftw.h -81 +99
+    M ./kernel/md5-1.c -5 +5
+    M ./kernel/minmax.c -3 +3
+    M ./kernel/ops.c -4 +4
+    M ./kernel/planner.c -2 +2
+    M ./kernel/primes.c -27 +29
+    M ./kernel/rader.c -3 +3
+    M ./kernel/solvtab.c -1 +2
+    M ./kernel/stride.c -3 +3
+    M ./kernel/tensor.c -7 +8
+    M ./kernel/tensor1.c -2 +2
+    M ./kernel/tensor2.c -3 +21
+    M ./kernel/tensor4.c -8 +8
+    M ./kernel/tensor7.c -10 +16
+    M ./kernel/tile2d.c -31 +8
+    M ./kernel/transpose.c -12 +12
+    M ./kernel/trig.c -13 +3
+    M ./kernel/twiddle.c -23 +23
+    M ./rdft/buffered.c -15 +15
+    M ./rdft/buffered2.c -30 +30
+    M ./rdft/codelet-rdft.h -29 +29
+    M ./rdft/codelets/hc2r.c -1 +1
+    M ./rdft/codelets/hfb.c -1 +1
+    M ./rdft/codelets/r2hc.c -1 +1
+    M ./rdft/codelets/r2r.c -1 +1
+    M ./rdft/dft-r2hc.c -12 +12
+    M ./rdft/dht-r2hc.c -5 +5
+    M ./rdft/dht-rader.c -12 +12
+    M ./rdft/direct.c -11 +11
+    M ./rdft/direct2.c -8 +8
+    M ./rdft/generic.c -14 +14
+    M ./rdft/hc2hc-common.c -3 +3
+    M ./rdft/hc2hc-direct.c -13 +13
+    M ./rdft/hc2hc-directbuf.c -24 +24
+    M ./rdft/hc2hc-generic.c -48 +31
+    M ./rdft/hc2hc.c -5 +5
+    M ./rdft/hc2hc.h -7 +7
+    M ./rdft/problem.c -3 +3
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rank0-rdft2.c -7 +7
+    M ./rdft/rank0.c -11 +11
+    M ./rdft/rdft-dht.c -9 +9
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-inplace-strides.c -3 +3
+    M ./rdft/rdft2-radix2.c -23 +23
+    M ./rdft/rdft2-strides.c -1 +1
+    M ./rdft/rdft2-tensor-max-index.c -4 +4
+    M ./rdft/vrank-geq1-rdft2.c -6 +6
+    M ./rdft/vrank-geq1.c -5 +5
+    M ./rdft/vrank3-transpose.c -53 +54
+    M ./reodft/redft00e-r2hc-pad.c -11 +11
+    M ./reodft/redft00e-r2hc.c -11 +11
+    M ./reodft/reodft00e-splitradix.c -16 +17
+    M ./reodft/reodft010e-r2hc.c -26 +26
+    M ./reodft/reodft11e-r2hc-odd.c -19 +19
+    M ./reodft/reodft11e-r2hc.c -16 +16
+    M ./reodft/reodft11e-radix2.c -25 +25
+    M ./reodft/rodft00e-r2hc-pad.c -11 +11
+    M ./reodft/rodft00e-r2hc.c -11 +11
+    M ./simd/sse2.c -2 +3
+    M ./simd/taint.c -2 +2
+
+Sat Aug 12 23:34:43 EDT 2006  Unknown tagger
+  tagged before-64bit-rewrite
+
+
+Wed Dec  7 22:39:01 EST 2005  stevenj
+  * [project @ 2005-12-08 03:39:01 by stevenj]
+  PGI x86-64 cycle counter, courtesy Cristiano Calonaci
+
+    M ./kernel/cycle.h -1 +12
+
+Mon Dec  5 21:25:57 EST 2005  athena
+  * [project @ 2005-12-06 02:25:57 by athena]
+  Must insert into hash table when wisdom_state == WISDOM_ONLY,
+  otherwise wisdom does not work.
+
+    M ./kernel/planner.c -2 +3
+
+Sat Oct  8 18:08:44 EDT 2005  stevenj
+  * [project @ 2005-10-08 22:07:37 by stevenj]
+  comment
+
+    M ./m4/acx_pthread.m4 -1 +2
+
+Sun Oct  2 11:49:13 EDT 2005  athena
+  * [project @ 2005-10-02 15:49:13 by athena]
+  Paranoia: made planner robust against MD5 collisions.
+
+    M ./api/apiplan.c -6 +37
+    M ./kernel/ifftw.h -1 +19
+    M ./kernel/planner.c -25 +55
+
+Tue Sep 27 22:33:18 EDT 2005  athena
+  * [project @ 2005-09-28 02:33:18 by athena]
+  Note that --enable-3dnow is unsupported.
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Tue Sep 27 22:31:04 EDT 2005  athena
+  * [project @ 2005-09-28 02:31:04 by athena]
+  * Removed --enable-3dnow support.
+  
+  * SIMD support for split complex arrays.
+
+    M ./NEWS +4
+
+Tue Sep 27 22:28:41 EDT 2005  athena
+  * [project @ 2005-09-28 02:28:40 by athena]
+  Removed --enabled-3dnow, since it is becoming useless as the world
+  moves to x86-64, and it is a pain to maintain.  (We should probably
+  remove the k7 stuff as well.)
+
+    M ./api/version.c -5 +1
+    M ./configure.ac -9
+    M ./genfft/gen_notw.ml -6 +4
+    M ./genfft/gen_notw_c.ml -4 +2
+    M ./genfft/gen_twiddle.ml -5 +3
+    M ./genfft/gen_twiddle_c.ml -4 +2
+    M ./genfft/gen_twidsq_c.ml -4 +2
+    M ./kernel/align.c -4 +2
+    M ./kernel/ifftw.h -2 +2
+    R ./simd/3dnow.c
+    M ./simd/Makefile.am -2 +2
+    R ./simd/simd-3dnow.h
+    M ./simd/simd-altivec.h -3
+    M ./simd/simd-sse.h -3
+    M ./simd/simd-sse2.h -3
+    M ./simd/simd.h -4
+
+Tue Sep 27 21:59:16 EDT 2005  athena
+  * [project @ 2005-09-28 01:59:16 by athena]
+  Missing BEGIN_SIMD(), END_SIMD() statements.
+
+    M ./genfft/gen_notw.ml -4 +6
+    M ./genfft/gen_twiddle.ml -3 +5
+
+Tue Sep 27 12:16:08 EDT 2005  athena
+  * [project @ 2005-09-27 16:16:08 by athena]
+  Tweaks
+
+    M ./simd/simd-sse.h -12 +9
+
+Tue Sep 27 10:04:32 EDT 2005  athena
+  * [project @ 2005-09-27 14:04:32 by athena]
+  Fixed wrong opcount for simd codelets.
+
+    M ./genfft/to_alist.ml -9 +9
+
+Tue Sep 27 09:25:50 EDT 2005  athena
+  * [project @ 2005-09-27 13:25:50 by athena]
+  Fixed wrong opcount for simd codelets.
+
+    M ./dft/dftw-direct.c -2 +2
+
+Mon Sep 26 22:58:19 EDT 2005  athena
+  * [project @ 2005-09-27 02:58:19 by athena]
+  fixed flop counts
+
+    M ./genfft/c.ml -23 +17
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse2.h -9 +9
+
+Mon Sep 26 22:34:40 EDT 2005  athena
+  * [project @ 2005-09-27 02:34:40 by athena]
+  Silence warnings
+
+    M ./simd/simd-sse2.h +6
+
+Mon Sep 26 20:52:36 EDT 2005  athena
+  * [project @ 2005-09-27 00:52:36 by athena]
+  Implemented split-complex SIMD codelets
+
+    A ./dft/simd/n2s.c
+    A ./dft/simd/n2s.h
+    A ./dft/simd/t1s.c
+    A ./dft/simd/t1s.h
+    M ./dft/simd/Makefile.am -2 +3
+    M ./dft/simd/codelets/Makefile.am -6 +18
+    M ./dft/simd/n2s.c +50
+    M ./dft/simd/n2s.h +28
+    M ./dft/simd/t1s.c +48
+    M ./dft/simd/t1s.h +33
+    M ./genfft/annotate.ml -3 +3
+    M ./genfft/c.ml -1 +3
+    M ./genfft/gen_hc2hc.ml -3 +3
+    M ./genfft/gen_notw.ml -5 +26
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twiddle_c.ml -3 +3
+    M ./genfft/gen_twidsq_c.ml -3 +3
+    M ./genfft/genutil.ml -3 +3
+    M ./genfft/simd.ml -14 +8
+    M ./genfft/twiddle.ml -3 +3
+    M ./genfft/twiddle.mli -2 +2
+    M ./simd/simd-altivec.h -7 +33
+    M ./simd/simd-sse.h +33
+    M ./simd/simd-sse2.h -2 +38
+    M ./simd/simd.h +2
+
+Sun Sep 25 22:25:35 EDT 2005  athena
+  * [project @ 2005-09-26 02:25:35 by athena]
+  Generalized the ``store pairs'' trick (now called ``store multiple'').
+
+    M ./dft/simd/codelets/Makefile.am -2 +2
+    M ./genfft/annotate.ml -37 +25
+    M ./genfft/annotate.mli -4 +2
+    M ./genfft/expr.ml -6 +6
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/gen_notw_c.ml -11 +13
+    M ./genfft/simd.ml -6 +10
+    M ./genfft/simdmagic.ml -3 +3
+    M ./simd/simd-3dnow.h -2 +2
+    M ./simd/simd-altivec.h -2 +4
+    M ./simd/simd-sse.h -3 +3
+    M ./simd/simd-sse2.h -2 +2
+
+Sun Sep 25 18:58:20 EDT 2005  athena
+  * [project @ 2005-09-25 22:58:20 by athena]
+  Silence some warnings.
+
+    M ./simd/simd-altivec.h -4 +4
+
+Sat Sep 24 12:37:16 EDT 2005  athena
+  * [project @ 2005-09-24 16:37:16 by athena]
+  Removed obsolete cruft
+
+    M ./simd/simd-altivec.h -6 +6
+
+Mon Sep 19 22:55:19 EDT 2005  athena
+  * [project @ 2005-09-20 02:55:19 by athena]
+  Re-enabled check for <altivec.h> because OSX requires it.
+
+    M ./configure.ac -1 +1
+    M ./simd/simd-altivec.h +2
+
+Sun Sep 11 11:03:03 EDT 2005  athena
+  * [project @ 2005-09-11 15:03:03 by athena]
+  Check for sizeof(unsigned int) unconditionally, because the
+  result is used by ifftw.h.
+
+    M ./configure.ac -2 +2
+
+Sun Sep 11 10:59:40 EDT 2005  athena
+  * [project @ 2005-09-11 14:59:40 by athena]
+  Higher size limit for t2 codelets.
+
+    M ./dft/simd/t.c -1 +1
+
+Sun Sep 11 10:50:37 EDT 2005  athena
+  * [project @ 2005-09-11 14:50:37 by athena]
+  Heuristic: do not use t2 simd codelets for N>1024.
+
+    A ./dft/simd/t.c
+    M ./dft/simd/Makefile.am -2 +2
+    M ./dft/simd/t.c +98
+    R ./dft/simd/t1b.c
+    R ./dft/simd/t1f.c
+    M ./dft/simd/t2b.h -2 +1
+    M ./dft/simd/t2f.h -2 +1
+
+Mon Sep  5 22:22:50 EDT 2005  athena
+  * [project @ 2005-09-06 02:22:50 by athena]
+  Larger tolerance in timer calibration routine.
+
+    M ./libbench2/timer.c -2 +2
+
+Mon Sep  5 16:03:33 EDT 2005  athena
+  * [project @ 2005-09-05 20:03:33 by athena]
+  #include <altivec.h> unconditionally.  (There is no point in checking.)
+
+    M ./configure.ac -2 +1
+    M ./simd/simd-altivec.h -2
+
+Mon Sep  5 15:23:27 EDT 2005  athena
+  * [project @ 2005-09-05 19:23:27 by athena]
+  Removed SSE and SSE2 asm because it was bitrotting.  Use the Intel
+  API instead, which seems to be supported by gcc >= 3.3.
+  Moved files that require -msse, -msse2 to new directory.
+
+    A ./simd/nonportable/
+    A ./simd/nonportable/Makefile.am
+    A ./simd/nonportable/sse.c
+    A ./simd/nonportable/sse2.c
+    M ./Makefile.am +1
+    M ./configure.ac +1
+    M ./simd/Makefile.am -4 +3
+    M ./simd/nonportable/Makefile.am +8
+    M ./simd/nonportable/sse.c +43
+    M ./simd/nonportable/sse2.c +43
+    M ./simd/simd-sse.h -101 +19
+    M ./simd/simd-sse2.h -75 +7
+    R ./simd/sse-aux.c
+    M ./simd/sse.c -5 +7
+    R ./simd/sse2-aux.c
+    M ./simd/sse2.c -5 +7
+
+Mon Sep  5 12:56:28 EDT 2005  athena
+  * [project @ 2005-09-05 16:56:28 by athena]
+  Parse cputypes of the form 7447A,altivecsupported
+
+    M ./m4/ax_gcc_archflag.m4 -1 +1
+
+Mon Sep  5 12:52:30 EDT 2005  athena
+  * [project @ 2005-09-05 16:52:30 by athena]
+  Distinguish powerpc 7400 from the 7450, which has a different
+  pipeline.
+
+    M ./m4/ax_gcc_archflag.m4 +2
+
+Mon Sep  5 12:46:00 EDT 2005  athena
+  * [project @ 2005-09-05 16:46:00 by athena]
+  Paranoia: define RIGHT_CPU unconditionally.
+
+    M ./simd/simd-altivec.h -3 +3
+
+Thu Aug 11 20:56:41 EDT 2005  athena
+  * [project @ 2005-08-12 00:56:41 by athena]
+  Removed obsolete name fftw-wisdom2c.
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Thu Aug 11 20:55:59 EDT 2005  athena
+  * [project @ 2005-08-12 00:55:59 by athena]
+  Avoid creation of temporary files---use cpp magic instead.
+  This fix solves a security bug and avoids nonportable tempfile
+  creation hacks.
+
+    M ./tools/fftw-wisdom-to-conf.in -13 +20
+
+Fri Aug  5 10:03:02 EDT 2005  athena
+  * [project @ 2005-08-05 14:03:02 by athena]
+  Workaround for with gcc-3.3 altivec bug.
+
+    M ./configure.ac -1 +3
+    M ./simd/altivec.c -3 +2
+    M ./simd/simd-altivec.h -2 +2
+
+Wed Jun 15 21:36:46 EDT 2005  stevenj
+  * [project @ 2005-06-16 01:36:46 by stevenj]
+  solaris fix: check -pthreads first since gcc does not like -pthread but chokes due to stubbed libc (grr)
+
+    M ./m4/acx_pthread.m4 -2 +2
+
+Fri Jun  3 17:19:56 EDT 2005  stevenj
+  * [project @ 2005-06-03 21:19:56 by stevenj]
+  note that VC++ bug was fixed in 2005
+
+    M ./doc/FAQ/fftw-faq.bfnn -4 +4
+
+Mon May 30 16:30:45 EDT 2005  stevenj
+  * [project @ 2005-05-30 20:30:32 by stevenj]
+  generalized ax_cc_vendor to ax_compiler_vendor
+
+    A ./m4/ax_compiler_vendor.m4
+    M ./configure.ac -2 +2
+    M ./m4/ax_cc_maxopt.m4 -5 +5
+    R ./m4/ax_cc_vendor.m4
+    M ./m4/ax_compiler_vendor.m4 +30
+
+Mon May 30 15:55:07 EDT 2005  stevenj
+  * [project @ 2005-05-30 19:55:07 by stevenj]
+  updated message
+
+    M ./m4/ax_cc_maxopt.m4 -2 +2
+
+Mon May 30 15:45:14 EDT 2005  stevenj
+  * [project @ 2005-05-30 19:45:14 by stevenj]
+  update for new AC archive format
+
+    M ./m4/acx_pthread.m4 -1 +4
+    M ./m4/ax_cc_maxopt.m4 -4 +9
+    M ./m4/ax_cc_vendor.m4 +6
+    M ./m4/ax_check_compiler_flags.m4 -1 +4
+    M ./m4/ax_gcc_aligns_stack.m4 -1 +4
+    M ./m4/ax_gcc_archflag.m4 -2 +5
+    M ./m4/ax_gcc_version.m4 -1 +4
+    M ./m4/ax_gcc_x86_cpuid.m4 -1 +4
+    M ./m4/ax_openmp.m4 -1 +4
+
+Mon May 23 23:12:22 EDT 2005  stevenj
+  * [project @ 2005-05-24 03:12:22 by stevenj]
+
+    M ./api/fftw3.h -2 +2
+
+Mon May 23 18:17:38 EDT 2005  stevenj
+  * [project @ 2005-05-23 22:17:38 by stevenj]
+
+    M ./NEWS -1 +1
+
+Mon May 23 18:13:08 EDT 2005  stevenj
+  * [project @ 2005-05-23 22:13:08 by stevenj]
+  more notes
+
+    M ./NEWS -1 +9
+
+Sun May 22 23:37:08 EDT 2005  stevenj
+  * [project @ 2005-05-23 03:37:08 by stevenj]
+  whoops
+
+    M ./m4/ax_cc_maxopt.m4 -8 +8
+
+Sun May 22 22:37:50 EDT 2005  stevenj
+  * [project @ 2005-05-23 02:37:50 by stevenj]
+  note icc 8.x annoyance
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +3
+
+Sun May 22 22:36:04 EDT 2005  stevenj
+  * [project @ 2005-05-23 02:36:04 by stevenj]
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Sun May 22 22:35:34 EDT 2005  stevenj
+  * [project @ 2005-05-23 02:35:34 by stevenj]
+  note gcc 3.4.[0123] bug, which is fixed in gcc 3.4.4
+
+    M ./doc/FAQ/fftw-faq.bfnn +5
+
+Sun May 22 22:21:26 EDT 2005  stevenj
+  * [project @ 2005-05-23 02:21:26 by stevenj]
+  added automatic detection of icc architecture flag
+
+    M ./m4/ax_cc_maxopt.m4 -1 +34
+
+Sun May 22 21:47:19 EDT 2005  stevenj
+  * [project @ 2005-05-23 01:47:19 by stevenj]
+  add -no-gcc to icc flags...even if it is Intel's fault, I'm sick of dealing with bug reports about this
+
+    M ./configure.ac +2
+
+Sun May 22 21:40:59 EDT 2005  stevenj
+  * [project @ 2005-05-23 01:40:59 by stevenj]
+  added @cindex portability
+
+    M ./doc/fftw3.texi -2 +11
+
+Sun May 22 21:34:10 EDT 2005  stevenj
+  * [project @ 2005-05-23 01:34:10 by stevenj]
+  note --without-gcc-arch
+
+    M ./doc/fftw3.texi -2 +4
+
+Sun May 22 20:54:54 EDT 2005  stevenj
+  * [project @ 2005-05-23 00:54:54 by stevenj]
+  bsd ppc detection; some odd 603 types
+
+    M ./m4/ax_gcc_archflag.m4 -2 +3
+
+Sun May 22 11:53:20 EDT 2005  stevenj
+  * [project @ 2005-05-22 15:53:20 by stevenj]
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Sat May 21 20:34:52 EDT 2005  stevenj
+  * [project @ 2005-05-22 00:34:52 by stevenj]
+  ensure no spaces in cputype
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Sat May 21 20:31:41 EDT 2005  stevenj
+  * [project @ 2005-05-22 00:31:41 by stevenj]
+  nevermind
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Sat May 21 20:30:08 EDT 2005  stevenj
+  * [project @ 2005-05-22 00:30:08 by stevenj]
+  more bsd stuff
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Sat May 21 20:28:40 EDT 2005  stevenj
+  * [project @ 2005-05-22 00:28:40 by stevenj]
+  added BSD cpu detection for SPARC and better super/hypersparc detection
+
+    M ./m4/ax_gcc_archflag.m4 -5 +5
+
+Sat May 21 20:22:11 EDT 2005  stevenj
+  * [project @ 2005-05-22 00:22:11 by stevenj]
+  comment
+
+    M ./m4/ax_gcc_archflag.m4 -1 +4
+
+Fri May 20 19:40:09 EDT 2005  stevenj
+  * [project @ 2005-05-20 23:40:09 by stevenj]
+  "alternate" == "alternative" is US-centric
+
+    M ./doc/fftw3.texi -3 +3
+
+Fri May 20 19:36:26 EDT 2005  stevenj
+  * [project @ 2005-05-20 23:36:26 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri May 20 01:28:34 EDT 2005  stevenj
+  * [project @ 2005-05-20 05:28:34 by stevenj]
+  clarification
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Tue May 17 18:56:46 EDT 2005  stevenj
+  * [project @ 2005-05-17 22:56:46 by stevenj]
+  print out estimate-planner time from can_do in verbose>2 mode
+
+    M ./tests/bench.c -1 +7
+
+Mon May  9 00:47:19 EDT 2005  stevenj
+  * [project @ 2005-05-09 04:47:19 by stevenj]
+  comment
+
+    M ./m4/ax_cc_vendor.m4 +1
+
+Thu May  5 23:47:55 EDT 2005  stevenj
+  * [project @ 2005-05-06 03:47:55 by stevenj]
+  fixes for building Windows DLLs with Cygwin; thanks in part to Stephane Fillod
+
+    M ./Makefile.am -1 +1
+    M ./api/api.h +2
+    M ./api/fftw3.h -2 +12
+    M ./configure.ac -2 +4
+    M ./threads/Makefile.am -1 +1
+
+Fri Apr 22 19:47:43 EDT 2005  stevenj
+  * [project @ 2005-04-22 23:47:43 by stevenj]
+  -ffast-math seems to produce code that is either about the same speed or slightly faster (gcc 3.3 and 4.0, x86)
+
+    M ./m4/ax_cc_maxopt.m4 -1 +4
+
+Fri Apr 22 19:18:23 EDT 2005  stevenj
+  * [project @ 2005-04-22 23:18:23 by stevenj]
+  power5 fallback to power4 sched for older gcc's
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Fri Apr 22 19:14:53 EDT 2005  stevenj
+  * [project @ 2005-04-22 23:14:53 by stevenj]
+  check for power5
+
+    M ./m4/ax_gcc_archflag.m4 -1 +2
+
+Tue Apr 19 21:55:13 EDT 2005  athena
+  * [project @ 2005-04-20 01:55:13 by athena]
+  Removed clause #3
+
+    M ./api/fftw3.h -5 +1
+
+Tue Apr 19 21:44:57 EDT 2005  stevenj
+  * [project @ 2005-04-20 01:44:57 by stevenj]
+  license clarification
+
+    M ./api/fftw3.h -1 +4
+
+Tue Apr 19 21:42:51 EDT 2005  athena
+  * [project @ 2005-04-20 01:42:51 by athena]
+  Changed license of fftw3.h to X11.
+
+    M ./api/fftw3.h -12 +24
+
+Mon Apr 11 13:15:12 EDT 2005  stevenj
+  * [project @ 2005-04-11 17:15:12 by stevenj]
+  delete fixed-input code
+
+    M ./genfft/gen_conv.ml -3 +2
+
+Sun Apr 10 16:33:24 EDT 2005  athena
+  * [project @ 2005-04-10 20:33:24 by athena]
+  joned L-U-planner branch
+
+    M ./api/apiplan.c -2 +2
+    M ./api/fftw3.h -4 +5
+    M ./api/mapflags.c -19 +35
+    M ./dft/bluestein.c -6 +7
+    M ./dft/buffered.c -18 +11
+    M ./dft/ct.c -7 +5
+    M ./dft/dftw-direct.c -1 +4
+    M ./dft/dftw-generic.c -2 +4
+    M ./dft/generic.c -1 +1
+    M ./dft/indirect-transpose.c -1 +1
+    M ./dft/indirect.c -8 +8
+    M ./dft/rader.c -14 +16
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/ifftw.h -49 +71
+    M ./kernel/planner.c -97 +146
+    M ./rdft/buffered.c -15 +9
+    M ./rdft/buffered2.c -9 +6
+    M ./rdft/dft-r2hc.c -7 +4
+    M ./rdft/dht-r2hc.c -7 +7
+    M ./rdft/dht-rader.c -13 +15
+    M ./rdft/generic.c -1 +1
+    M ./rdft/hc2hc-generic.c -1 +1
+    M ./rdft/hc2hc.c -3 +2
+    M ./rdft/indirect.c -6 +4
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rdft-dht.c -5 +5
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank3-transpose.c -6 +6
+    M ./reodft/redft00e-r2hc-pad.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft00e-splitradix.c -34 +17
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/reodft11e-radix2.c -2 +2
+    M ./reodft/rodft00e-r2hc-pad.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./tests/bench.c -1 +1
+    M ./tests/hook.c -1 +1
+
+Thu Apr  7 23:15:02 EDT 2005  stevenj
+  * [project @ 2005-04-08 03:15:02 by stevenj]
+  ref
+
+    M ./reodft/reodft00e-splitradix.c -16 +33
+
+Thu Apr  7 00:11:13 EDT 2005  stevenj
+  * [project @ 2005-04-07 04:10:26 by stevenj]
+  whoops
+
+    M ./genfft/gen_r2r.ml -3 +3
+
+Wed Apr  6 22:06:21 EDT 2005  stevenj
+  * [project @ 2005-04-07 02:06:21 by stevenj]
+  added (optional) new split-radix algorithm, enabled with -newsplit; also new -standalone option to omit desc; also -unitary, -normalization, and -normsqr options to generate r2r codelets with various normalization (to match lit. in DCT-II, use: -unitary -normsqr 2)
+
+    M ./genfft/complex.ml -1 +22
+    M ./genfft/complex.mli -1 +7
+    M ./genfft/fft.ml -4 +86
+    M ./genfft/gen_athtw.ml -3 +3
+    M ./genfft/gen_hc2r.ml -4 +4
+    M ./genfft/gen_notw.ml -5 +4
+    M ./genfft/gen_notw_c.ml -3 +3
+    M ./genfft/gen_r2hc.ml -4 +4
+    M ./genfft/gen_r2r.ml -19 +43
+    M ./genfft/magic.ml -1 +6
+    M ./genfft/number.ml -4 +15
+    M ./genfft/number.mli -1 +2
+
+Fri Mar 25 08:59:43 EST 2005  athena
+  * [project @ 2005-03-25 13:59:43 by athena]
+  Moved timeout check outside the search loop, because X(seconds) is
+  expensive.
+
+    M ./kernel/planner.c -9 +7
+
+Sun Mar 20 18:35:53 EST 2005  athena
+  * [project @ 2005-03-20 23:35:53 by athena]
+  Enable vector recursion for in-place problems, otherwise
+  dftw-genericbuf works only in PATIENT mode.
+
+    M ./dft/ct.c -2 +4
+
+Sun Mar 20 17:53:58 EST 2005  athena
+  * [project @ 2005-03-20 22:53:58 by athena]
+  oops
+
+    M ./dft/dftw-genericbuf.c -1 +1
+
+Sun Mar 20 17:49:13 EST 2005  athena
+  * [project @ 2005-03-20 22:49:13 by athena]
+  make solver UGLY for small N
+
+    M ./dft/dftw-genericbuf.c -3 +13
+
+Sun Mar 20 17:16:37 EST 2005  athena
+  * [project @ 2005-03-20 22:16:37 by athena]
+  new dftw-genericbuf solver
+
+    A ./dft/dftw-genericbuf.c
+
+Sun Mar 20 16:12:44 EST 2005  athena
+  * [project @ 2005-03-20 21:12:44 by athena]
+  new dftw-genericbuf solver
+
+    M ./dft/Makefile.am -4 +4
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+
+Thu Mar 17 21:48:19 EST 2005  athena
+  * [project @ 2005-03-18 02:48:19 by athena]
+  Hmm... what was I thinking?
+
+    M ./simd/sse2-aux.c -2 +2
+
+Thu Mar 17 19:20:54 EST 2005  athena
+  * [project @ 2005-03-18 00:20:54 by athena]
+  Workaround for a MSVC bug.
+
+    M ./simd/simd-sse2.h -1 +6
+    M ./simd/sse2-aux.c -1 +8
+
+Thu Mar 17 08:18:39 EST 2005  athena
+  * [project @ 2005-03-17 13:18:39 by athena]
+  Workaround for a MSVC bug that was reported by Eddie Yee.
+
+    M ./simd/simd-sse.h -1 +6
+    M ./simd/sse-aux.c -1 +8
+
+Tue Mar 15 13:25:53 EST 2005  athena
+  * [project @ 2005-03-15 18:25:53 by athena]
+  try both contiguous input and contiguous output when in doubt
+
+    M ./rdft/rank0.c -3 +26
+
+Tue Mar 15 08:44:41 EST 2005  athena
+  * [project @ 2005-03-15 13:44:41 by athena]
+  Added genfft flag -precompute-twiddles which moves the computation of
+  the twiddle factors before the main schedule.  This flag produces
+  smaller code everywhere, and slightly faster code on powerpc.
+  I observe no speed difference on x86.
+
+    M ./dft/codelets/standard/Makefile.am -3 +3
+    M ./genfft/genutil.ml -3 +5
+    M ./genfft/magic.ml -1 +5
+    M ./genfft/schedule.ml -1 +53
+    M ./genfft/schedule.mli -1 +2
+    M ./rdft/codelets/hc2r/Makefile.am -1 +1
+    M ./rdft/codelets/r2hc/Makefile.am -1 +1
+
+Mon Mar 14 21:43:53 EST 2005  stevenj
+  * [project @ 2005-03-15 02:43:53 by stevenj]
+  sp
+
+    M ./kernel/kalloc.c -2 +2
+
+Mon Mar 14 21:43:05 EST 2005  stevenj
+  * [project @ 2005-03-15 02:43:05 by stevenj]
+  whoops, spelling error (thanks to Steve Eddins for bug report)
+
+    M ./kernel/alloc.c -2 +2
+
+Sat Mar 12 15:03:47 EST 2005  athena
+  * [project @ 2005-03-12 20:03:45 by athena]
+  Do not approximate pcost = vl * child->pcost unless child is guaranteed
+  not to be a simple codelet.
+
+    M ./dft/vrank-geq1.c -2 +4
+    M ./rdft/vrank-geq1-rdft2.c -2 +4
+    M ./rdft/vrank-geq1.c -2 +4
+
+Wed Mar  9 20:00:02 EST 2005  athena
+  * [project @ 2005-03-10 01:00:02 by athena]
+  Relaxed applicability conditions.
+
+    M ./dft/direct.c -6 +3
+
+Wed Mar  9 00:05:47 EST 2005  athena
+  * [project @ 2005-03-09 05:05:47 by athena]
+  Minor optimization
+
+    M ./dft/dftw-generic.c -8 +8
+
+Tue Mar  8 22:14:02 EST 2005  athena
+  * [project @ 2005-03-09 03:14:02 by athena]
+  Interpret <N>K to mean <N>*1024.  Similarly for <N>M.
+
+    M ./libbench2/problem.c -1 +12
+
+Tue Mar  8 20:44:25 EST 2005  athena
+  * [project @ 2005-03-09 01:44:25 by athena]
+  Hmm... somehow some previous commit got lost.
+
+    M ./kernel/primes.c -2 +8
+
+Tue Mar  8 20:30:42 EST 2005  athena
+  * [project @ 2005-03-09 01:30:42 by athena]
+  Paranoia
+
+    M ./dft/ct.c -2 +2
+
+Mon Mar  7 14:30:01 EST 2005  stevenj
+  * [project @ 2005-03-07 19:30:01 by stevenj]
+  whoops
+
+    M ./configure.ac +1
+
+Mon Mar  7 14:29:43 EST 2005  stevenj
+  * [project @ 2005-03-07 19:29:43 by stevenj]
+  move fftw-specific HP/UX tweak into configure.ac
+
+    M ./configure.ac +3
+    M ./m4/ax_cc_maxopt.m4 -2 +2
+
+Mon Mar  7 14:19:24 EST 2005  stevenj
+  * [project @ 2005-03-07 19:19:24 by stevenj]
+  ax_cc_family -> ax_cc_vendor (vendor names are easier to remember), add checks for many new compilers, use in ax_cc_maxopt
+
+    A ./m4/ax_cc_vendor.m4
+    M ./configure.ac -3 +3
+    R ./m4/ax_cc_family.m4
+    M ./m4/ax_cc_maxopt.m4 -65 +42
+    M ./m4/ax_cc_vendor.m4 +23
+
+Sun Mar  6 21:36:05 EST 2005  athena
+  * [project @ 2005-03-07 02:36:05 by athena]
+  Count FMA as one flop in estimator when HAVE_FMA
+
+    M ./kernel/planner.c -1 +7
+
+Sun Mar  6 19:16:06 EST 2005  athena
+  * [project @ 2005-03-07 00:16:06 by athena]
+  Do not try radix-2 generic.
+
+    M ./dft/dftw-generic.c -1 +1
+
+Sun Mar  6 13:04:23 EST 2005  athena
+  * [project @ 2005-03-06 18:04:23 by athena]
+  Use -O3 for xlc now that we use -O for CODELET_OPTIM
+
+    M ./m4/ax_cc_maxopt.m4 -4 +4
+
+Sun Mar  6 13:02:41 EST 2005  athena
+  * [project @ 2005-03-06 18:02:41 by athena]
+  New AX_CC_FAMILY macro, that detects the compiler based on symbols
+  that it defines (as opposed to the name of the compiler).
+  We need to start use this strategy everywhere else.
+
+    A ./m4/ax_cc_family.m4
+    M ./configure.ac -25 +29
+    M ./m4/ax_cc_family.m4 +18
+
+Sun Mar  6 11:33:15 EST 2005  athena
+  * [project @ 2005-03-06 16:33:15 by athena]
+  Runtime checks to guarantee small strides.
+
+    M ./dft/direct.c -2 +18
+
+Sat Mar  5 20:09:25 EST 2005  athena
+  * [project @ 2005-03-06 01:09:25 by athena]
+  Reduced the search space for rank-0 transforms
+
+    M ./dft/vrank-geq1.c -6 +6
+    M ./kernel/tensor7.c -8 +20
+    M ./rdft/rank0.c -50 +48
+    M ./rdft/vrank-geq1.c -6 +5
+
+Fri Mar  4 17:50:29 EST 2005  stevenj
+  * [project @ 2005-03-04 22:50:29 by stevenj]
+  little assert
+
+    M ./kernel/primes.c -8 +3
+
+Tue Mar  1 09:19:16 EST 2005  athena
+  * [project @ 2005-03-01 14:19:16 by athena]
+  Implemented directbuf, enabled for now.
+
+    M ./dft/dft.h -1 +2
+    M ./dft/dftw-direct.c -10 +7
+    M ./dft/direct.c -11 +135
+    M ./dft/kdft.c -3 +3
+
+Mon Feb 28 22:21:14 EST 2005  athena
+  * [project @ 2005-03-01 03:21:14 by athena]
+  Unified dftw-direct, dftw-directbuf in an attempt to tame code
+  growth
+
+    M ./dft/Makefile.am -4 +4
+    M ./dft/dftw-direct.c -15 +130
+    R ./dft/dftw-directbuf.c
+    M ./dft/kdft-dif.c -2 +1
+    M ./dft/kdft-dit.c -2 +1
+
+Sun Feb 27 13:51:24 EST 2005  stevenj
+  * [project @ 2005-02-27 18:51:24 by stevenj]
+  fixed copyright
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Feb 26 22:21:03 EST 2005  athena
+  * [project @ 2005-02-27 03:21:03 by athena]
+  silence warnings
+
+    M ./rdft/rank0.c -4 +5
+
+Sat Feb 26 22:19:16 EST 2005  athena
+  * [project @ 2005-02-27 03:19:16 by athena]
+  oops
+
+    M ./rdft/rank0.c -2 +2
+
+Sat Feb 26 21:28:39 EST 2005  athena
+  * [project @ 2005-02-27 02:28:39 by athena]
+  Tweaking while thinking about a higher-rank transposer (bitreverser)
+
+    M ./rdft/rank0.c -44 +63
+
+Sat Feb 26 20:06:49 EST 2005  athena
+  * [project @ 2005-02-27 01:06:49 by athena]
+  Transposed the buffer, and skewed it.  This allows for contiguous
+  copy operations, and the codelet should not incur associativity
+  conflicts if the buffer is large.
+
+    M ./dft/dftw-directbuf.c -20 +26
+
+Sat Feb 26 18:14:11 EST 2005  stevenj
+  * [project @ 2005-02-26 23:14:11 by stevenj]
+  make tensor_max_index more reasonable (take maximum of input and output
+  max indices, computed separately)
+
+    M ./kernel/tensor4.c -4 +5
+
+Sat Feb 26 10:04:30 EST 2005  athena
+  * [project @ 2005-02-26 15:04:30 by athena]
+  Use cpy2d instead of cpy2d_tiled, because vl may be too large.
+
+    M ./rdft/vrank3-transpose.c -5 +5
+
+Sat Feb 26 00:31:52 EST 2005  athena
+  * [project @ 2005-02-26 05:31:52 by athena]
+  Fixed old bug that was introduced with yesterday's changes.
+
+    M ./genfft/annotate.ml -4 +6
+
+Fri Feb 25 21:54:23 EST 2005  athena
+  * [project @ 2005-02-26 02:54:23 by athena]
+  ``Interesting'' switch statement.
+
+    M ./kernel/cpy1d.c -3 +23
+
+Fri Feb 25 12:29:54 EST 2005  athena
+  * [project @ 2005-02-25 17:29:54 by athena]
+  Disabled -reorder-loads -reorder-stores, since they seem to do
+  nothing.
+
+    M ./support/Makefile.codelets -1 +1
+
+Fri Feb 25 12:19:10 EST 2005  stevenj
+  * [project @ 2005-02-25 17:19:03 by stevenj]
+  Because of the recent changes to kernel/pickdim.c, splitrnk=0 is no
+  longer equivalent to splitrnk=1 for rnk < 4, where the latter is the
+  FFTW2 behavior.  For small rnk, however, I observe the planner to pretty
+  consistently choose the FFTW2 behavior (splitrnk=1), despite its not
+  being asymptotically optimal in the cache oblivious sense.  So, make
+  splitrnk=1 instead of splitrnk=0 the default in FFTW_MEASURE and
+  FFTW_ESTIMATE modes (rnk > 3 is pretty rare in practice anyway).
+
+    M ./dft/rank-geq2.c -5 +2
+    M ./rdft/rank-geq2.c -5 +2
+
+Fri Feb 25 00:33:27 EST 2005  stevenj
+  * [project @ 2005-02-25 05:33:27 by stevenj]
+  tweak
+
+    M ./dft/indirect-transpose.c -3 +4
+
+Fri Feb 25 00:29:09 EST 2005  stevenj
+  * [project @ 2005-02-25 05:29:09 by stevenj]
+  slight relaxation
+
+    M ./dft/indirect-transpose.c -2 +4
+
+Fri Feb 25 00:21:00 EST 2005  stevenj
+  * [project @ 2005-02-25 05:21:00 by stevenj]
+  cruft
+
+    M ./dft/indirect-transpose.c -2
+
+Fri Feb 25 00:03:14 EST 2005  stevenj
+  * [project @ 2005-02-25 05:03:13 by stevenj]
+  added experimental indirect-transpose solver: when transforming the columns of the matrix, allow us to do a transpose to make the DFTs contiguous
+
+    A ./dft/indirect-transpose.c
+    M ./dft/Makefile.am -3 +3
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/indirect-transpose.c +280
+    M ./dft/indirect.c -3 +10
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/tensor4.c -1 +32
+
+Thu Feb 24 23:04:58 EST 2005  stevenj
+  * [project @ 2005-02-25 04:04:58 by stevenj]
+  check for abort()
+
+    M ./configure.ac -1 +1
+
+Thu Feb 24 23:04:43 EST 2005  stevenj
+  * [project @ 2005-02-25 04:04:43 by stevenj]
+  call abort() on failed assertion
+
+    M ./kernel/assert.c -1 +5
+
+Thu Feb 24 21:17:23 EST 2005  athena
+  * [project @ 2005-02-25 02:17:23 by athena]
+  Forgot to change X(isqrt) -> isqrt_maybe
+
+    M ./kernel/primes.c -2 +2
+
+Thu Feb 24 20:18:59 EST 2005  stevenj
+  * [project @ 2005-02-25 01:17:59 by stevenj]
+  require finite_rnk
+
+    M ./dft/rank-geq2.c -1 +2
+    M ./rdft/rank-geq2-rdft2.c -1 +2
+    M ./rdft/rank-geq2.c -1 +2
+
+Thu Feb 24 20:07:38 EST 2005  stevenj
+  * [project @ 2005-02-25 01:07:38 by stevenj]
+  #ifdef HAVE_STRING_H must come after rdft.h so that we get config.h
+
+    M ./rdft/vrank3-transpose.c -3 +3
+
+Thu Feb 24 18:59:40 EST 2005  athena
+  * [project @ 2005-02-24 23:59:38 by athena]
+  Implemented reordering of loads and stores so that the real and
+  imaginary part are loaded/stored together.  This should improve
+  out-of-cache performance in the presence of associativity conflicts,
+  and maybe worsen in-cache performance because of worse scheduling.
+  Enabled for now, for experimental purposes.
+
+    M ./genfft/annotate.ml -30 +76
+    M ./genfft/magic.ml -4 +10
+    M ./support/Makefile.codelets -1 +1
+    M ./support/twovers.sh -1 +1
+
+Thu Feb 24 18:10:49 EST 2005  stevenj
+  * [project @ 2005-02-24 23:10:49 by stevenj]
+  fix comment
+
+    M ./m4/ax_gcc_aligns_stack.m4 -2 +2
+
+Thu Feb 24 18:10:23 EST 2005  stevenj
+  * [project @ 2005-02-24 23:10:23 by stevenj]
+  better message
+
+    M ./m4/ax_gcc_aligns_stack.m4 -2 +2
+
+Thu Feb 24 18:08:36 EST 2005  stevenj
+  * [project @ 2005-02-24 23:08:36 by stevenj]
+  use gcc version > 3.0 as fallback in check for alignment bug
+
+    M ./m4/ax_gcc_aligns_stack.m4 -2 +2
+
+Thu Feb 24 18:02:31 EST 2005  stevenj
+  * [project @ 2005-02-24 23:02:31 by stevenj]
+  don't use -malign-double unconditionally (it is only available on x86)
+
+    M ./m4/ax_gcc_aligns_stack.m4 -2 +3
+
+Thu Feb 24 12:03:30 EST 2005  athena
+  * [project @ 2005-02-24 17:03:30 by athena]
+  Subtler selection of tilesz.
+
+    M ./kernel/transpose.c -3 +8
+
+Thu Feb 24 11:52:25 EST 2005  athena
+  * [project @ 2005-02-24 16:52:25 by athena]
+  Call cpy2d_tiledbuf, not cpy2d_tiled.
+
+    M ./rdft/rank0.c -5 +5
+
+Thu Feb 24 11:29:28 EST 2005  athena
+  * [project @ 2005-02-24 16:29:28 by athena]
+  buffer sizes were wrong :-(
+
+    M ./kernel/cpy2d.c -1 +1
+    M ./kernel/transpose.c -2 +2
+
+Thu Feb 24 11:19:01 EST 2005  athena
+  * [project @ 2005-02-24 16:19:01 by athena]
+  Single function for computing tile size.  Eliminate spurious assertions.
+
+    M ./kernel/cpy2d.c -3 +8
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/tile2d.c +5
+    M ./kernel/transpose.c -4 +6
+    M ./rdft/rank0.c -3 +1
+
+Thu Feb 24 10:00:02 EST 2005  athena
+  * [project @ 2005-02-24 15:00:02 by athena]
+  Do tiling recursively.
+
+    M ./kernel/tile2d.c +28
+
+Thu Feb 24 09:40:30 EST 2005  athena
+  * [project @ 2005-02-24 14:40:30 by athena]
+  Reworked tiled transposes; provide tiling with and without buffering.
+  I can't believe that one has to waste his life with this @#$%.
+
+    A ./kernel/tile2d.c
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/cpy2d.c -36 +60
+    M ./kernel/ifftw.h -2 +10
+    M ./kernel/tile2d.c +41
+    M ./kernel/transpose.c -64 +122
+    M ./rdft/rank0.c -6 +41
+    M ./rdft/vrank3-transpose.c -2 +2
+
+Wed Feb 23 22:21:19 EST 2005  athena
+  * [project @ 2005-02-24 03:21:19 by athena]
+  Clarified logic.  I am not sure why the code was so confusing to begin
+  with.  The computation of *dp in the which_dim == 0 case was also
+  wrong, returning e.g. *dp == -1 if sz->rnk == 1.
+
+    M ./kernel/pickdim.c -6 +6
+
+Wed Feb 23 22:00:15 EST 2005  athena
+  * [project @ 2005-02-24 03:00:15 by athena]
+  Enable aggressive inlining in codelets only, to avoid code bloat.
+
+    M ./configure.ac -9 +10
+
+Wed Feb 23 21:51:50 EST 2005  athena
+  * [project @ 2005-02-24 02:51:50 by athena]
+  Removed cache-oblivious copy/transpose algorithms in favor of
+  explicitly blocked algorithms.  The cache-oblivious algorithms fail if
+  there are associativity conflicts, in which case buffering is
+  necessary, as per Carter and Gatlin.  Once you set the buffer size,
+  there is no point whatsoever to do the algorithm recursively, and you
+  may as well use blocking.
+
+    M ./kernel/Makefile.am -2 +1
+    M ./kernel/cpy2d.c -62 +40
+    M ./kernel/ifftw.h -11 +11
+    M ./kernel/primes.c -4 +10
+    R ./kernel/transpose-rec.c
+    M ./kernel/transpose.c +54
+    M ./rdft/rank0.c -49 +28
+    M ./rdft/vrank3-transpose.c -6 +6
+
+Wed Feb 23 18:46:12 EST 2005  stevenj
+  * [project @ 2005-02-23 23:46:12 by stevenj]
+  --disable-fortran now differs from --enable-fortran that fails
+
+    M ./configure.ac -2 +4
+
+Wed Feb 23 18:42:21 EST 2005  stevenj
+  * [project @ 2005-02-23 23:42:21 by stevenj]
+  comment tweak
+
+    M ./api/f77api.c -1 +1
+
+Wed Feb 23 18:41:14 EST 2005  stevenj
+  * [project @ 2005-02-23 23:41:14 by stevenj]
+  If a Fortran compiler was not detected, just make our best guess at
+  what wrappers to use...I'm sick of dealing with user complaints from
+  cases where wrapper detection fails for whatever reason.
+
+    M ./api/f77api.c +21
+
+Wed Feb 23 18:10:40 EST 2005  stevenj
+  * [project @ 2005-02-23 23:10:40 by stevenj]
+  fflush(stdout) after print_plan, in case F77 doesn't
+
+    M ./api/f77funcs.h +1
+
+Tue Feb 22 22:54:42 EST 2005  athena
+  * [project @ 2005-02-23 03:54:42 by athena]
+  --enable-sse is necessary after all, to generate all dependencies
+  correctly.
+
+    M ./mkdist.sh -1 +1
+
+Tue Feb 22 22:32:06 EST 2005  athena
+  * [project @ 2005-02-23 03:32:06 by athena]
+  Put cpy2d_pair into its own file, so that I can experiment with
+  buffering of nontwiddle codelets.
+
+    A ./kernel/cpy2d-pair.c
+    M ./dft/dftw-directbuf.c -28 +7
+    M ./kernel/Makefile.am -5 +6
+    M ./kernel/cpy2d-pair.c +61
+    M ./kernel/ifftw.h -1 +11
+
+Tue Feb 22 20:07:11 EST 2005  athena
+  * [project @ 2005-02-23 01:07:11 by athena]
+  Copy rfftwnd.png from ${srcdir}, not $PWD
+
+    M ./doc/Makefile.am -1 +1
+
+Tue Feb 22 17:08:48 EST 2005  athena
+  * [project @ 2005-02-22 22:08:48 by athena]
+  Do not bother memcpy-ing complex numbers.
+
+    M ./rdft/rank0.c -2 +6
+
+Tue Feb 22 16:20:46 EST 2005  athena
+  * [project @ 2005-02-22 21:20:46 by athena]
+  Tighther layout of buffers.  I am not sure it matters, but just in case...
+
+    M ./kernel/cpy2d.c -2 +2
+    M ./kernel/transpose-rec.c -4 +4
+
+Tue Feb 22 10:13:02 EST 2005  athena
+  * [project @ 2005-02-22 15:13:02 by athena]
+  Usec cpy1d for rank-0 copies
+
+    M ./rdft/rank0.c -3 +2
+
+Tue Feb 22 10:06:13 EST 2005  athena
+  * [project @ 2005-02-22 15:06:13 by athena]
+  Implemented in-place transposes with buffering.  Moved
+  copy/transposition routines into own files, so that we can reuse them
+  from multiple places.  TODO: merge vrank3-transpose.c with rank0.c, or
+  rename vrank3-transpose.c to rank0-fancy.c or something like that;
+  decide whether square in-place transposes should be in rank0.c or
+  vrank3-transpose.c; apply FIXME's in vrank3-transpose.c.
+
+    A ./kernel/cpy1d.c
+    A ./kernel/cpy2d.c
+    A ./kernel/transpose-rec.c
+    A ./kernel/transpose.c
+    M ./kernel/Makefile.am -5 +5
+    M ./kernel/cpy1d.c +50
+    M ./kernel/cpy2d.c +154
+    M ./kernel/ifftw.h -1 +28
+    M ./kernel/transpose-rec.c +143
+    M ./kernel/transpose.c +72
+    M ./rdft/rank0.c -160 +85
+    M ./rdft/vrank3-transpose.c -231 +24
+
+Mon Feb 21 23:29:52 EST 2005  athena
+  * [project @ 2005-02-22 04:29:52 by athena]
+  Indentation should be printed after newline, not at the beginning
+  of print()
+
+    M ./kernel/print.c -6 +11
+
+Mon Feb 21 10:07:24 EST 2005  athena
+  * [project @ 2005-02-21 15:07:24 by athena]
+  generalized in anticipation of more complicated solvers.
+
+    M ./rdft/rank0.c -24 +45
+
+Sun Feb 20 22:18:59 EST 2005  athena
+  * [project @ 2005-02-21 03:18:59 by athena]
+  Implemented buffered recursive transpose
+
+    M ./rdft/rank0.c -7 +85
+
+Sun Feb 20 18:27:29 EST 2005  athena
+  * [project @ 2005-02-20 23:27:29 by athena]
+  Fixed comment
+
+    M ./rdft/rank0.c -2 +2
+
+Sun Feb 20 18:22:15 EST 2005  athena
+  * [project @ 2005-02-20 23:22:15 by athena]
+  grand unification of rank0 solvers
+
+    M ./rdft/Makefile.am -3 +2
+    M ./rdft/conf.c -2 +1
+    R ./rdft/rank0-vrank2.c
+    M ./rdft/rank0.c -93 +173
+    M ./rdft/rdft.h -1
+
+Sun Feb 20 15:35:24 EST 2005  athena
+  * [project @ 2005-02-20 20:35:24 by athena]
+  manual tail-recursion optimization
+
+    M ./rdft/vrank3-transpose.c -7 +11
+
+Sat Feb 19 17:57:44 EST 2005  athena
+  * [project @ 2005-02-19 22:57:44 by athena]
+  implemented check for transpositions
+
+    M ./libbench2/verify-lib.c -11 +11
+    M ./libbench2/verify-r2r.c -1 +1
+    M ./tests/check.pl -1 +19
+
+Sat Feb 19 17:28:43 EST 2005  athena
+  * [project @ 2005-02-19 22:28:43 by athena]
+  Previous fix was wrong for rdft2 problems.
+
+    M ./libbench2/verify-lib.c -3 +3
+
+Sat Feb 19 17:23:36 EST 2005  athena
+  * [project @ 2005-02-19 22:23:36 by athena]
+  vecsz->rnk must be finite for this solver to apply.
+
+    M ./rdft/dft-r2hc.c -2 +2
+
+Sat Feb 19 17:15:19 EST 2005  athena
+  * [project @ 2005-02-19 22:15:19 by athena]
+  unified the various simple'' transposers
+
+    M ./rdft/vrank3-transpose.c -101 +51
+
+Sat Feb 19 16:55:29 EST 2005  athena
+  * [project @ 2005-02-19 21:55:29 by athena]
+  Fixed stupid bug in rec_transpose_swap.  Fixed stupid verifier that did not catch the bug.
+
+    M ./libbench2/verify-lib.c -17 +26
+    M ./libbench2/verify-r2r.c -2 +3
+    M ./rdft/vrank3-transpose.c -5 +5
+
+Sat Feb 19 15:24:03 EST 2005  athena
+  * [project @ 2005-02-19 20:24:03 by athena]
+  Minor cleanup of transposition routines.
+
+    M ./rdft/vrank3-transpose.c -188 +83
+
+Sat Feb 19 09:31:14 EST 2005  athena
+  * [project @ 2005-02-19 14:31:14 by athena]
+  Make the batch size B=Theta(r) instead of B=Theta(1) in buffered
+  twiddle solvers.  Theory: for cache line size L, we want B = Omega(L)
+  to utilize the cache line fully.  We also want B*r =O(Z), where Z is
+  the size of the cache.  It is safe to assume that Z = Theta(L^2):
+  cache designers will tend to make L as large as they can get away
+  with, because they don't have to program the machines that they build,
+  and Z < Theta(L^2) will screw up the little matrix transposition
+  benchmarks that they use to design the cache.  Hence, B=Theta(r) is
+  the right number.
+
+    M ./dft/dftw-directbuf.c -12 +17
+    M ./rdft/hc2hc-directbuf.c -16 +20
+
+Fri Feb 18 23:47:22 EST 2005  stevenj
+  * [project @ 2005-02-19 04:47:22 by stevenj]
+  for --enable-portable-binary, only try -mcpu=$arch and -m$arch on x86,
+  since these generate non-portable code on every other target (and
+  some other targets, like Alpha, don't support -mtune=$arch).
+
+    M ./m4/ax_gcc_archflag.m4 -2 +5
+
+Thu Feb 17 21:15:42 EST 2005  athena
+  * [project @ 2005-02-18 02:15:42 by athena]
+  gcc/aix defines _POWER, not __powerpc__ like the rest of the world
+  does.
+
+    M ./kernel/ifftw.h -2 +2
+
+Wed Feb 16 22:30:27 EST 2005  athena
+  * [project @ 2005-02-17 03:30:27 by athena]
+  enable fma for ia64, since it seems to help with the hpux compiler.
+
+    M ./configure.ac +1
+
+Wed Feb 16 21:47:48 EST 2005  athena
+  * [project @ 2005-02-17 02:47:48 by athena]
+
+    M ./TODO -1 +2
+
+Wed Feb 16 15:27:18 EST 2005  athena
+  * [project @ 2005-02-16 20:27:18 by athena]
+  Fixes for darwin
+
+    M ./simd/simd-altivec.h -2 +2
+
+Wed Feb 16 14:27:42 EST 2005  athena
+  * [project @ 2005-02-16 19:27:42 by athena]
+  Made the correctness of the code more obvious.
+
+    M ./api/apiplan.c -14 +16
+
+Wed Feb 16 12:30:29 EST 2005  stevenj
+  * [project @ 2005-02-16 17:30:29 by stevenj]
+  s/with-portable-binary/enable-portable-binary/ to be GNUlly correct; I'm sticking with --with-gcc-arch=arch, however, as --enable-gcc-arch=arch has the wrong connotations for me
+
+    M ./NEWS -2 +2
+    M ./m4/ax_cc_maxopt.m4 -3 +3
+
+Wed Feb 16 11:44:48 EST 2005  stevenj
+  * [project @ 2005-02-16 16:44:48 by stevenj]
+  whoops
+
+    M ./api/apiplan.c -2 +2
+
+Wed Feb 16 11:23:38 EST 2005  stevenj
+  * [project @ 2005-02-16 16:23:38 by stevenj]
+  bless wisdom with patience used to create it
+
+    M ./api/apiplan.c -1 +1
+
+Wed Feb 16 11:18:56 EST 2005  stevenj
+  * [project @ 2005-02-16 16:18:56 by stevenj]
+  whoops
+
+    M ./api/apiplan.c -7 +10
+
+Wed Feb 16 10:50:28 EST 2005  stevenj
+  * [project @ 2005-02-16 15:50:28 by stevenj]
+  whoops
+
+    M ./api/apiplan.c -1 +2
+
+Tue Feb 15 23:53:53 EST 2005  stevenj
+  * [project @ 2005-02-16 04:53:53 by stevenj]
+  added 'timed' planner option
+
+    M ./NEWS +3
+    M ./TODO -6
+    M ./api/apiplan.c -16 +56
+    M ./api/fftw3.h -1 +3
+    M ./doc/fftw3.texi -1 +27
+    M ./kernel/ifftw.h -1 +6
+    M ./kernel/planner.c -2 +14
+    M ./kernel/timer.c -1 +13
+    M ./tests/bench.c +5
+
+Tue Feb 15 23:08:29 EST 2005  athena
+  * [project @ 2005-02-16 04:08:27 by athena]
+  Do not use SIMD_CFLAGS.  The theory is that if taint.c is unsafe
+  with SIMD_CFLAGS, then all files in this directory are as well.
+  Conversely, if these files require SIMD_CFLAGS because they include
+  "simd.h", then taint.c requires SIMD_CFLAGS as well, and thus we need
+  some other hack.
+
+    M ./dft/simd/Makefile.am -1
+    M ./simd/Makefile.am -8 +3
+
+Tue Feb 15 22:49:05 EST 2005  athena
+  * [project @ 2005-02-16 03:49:05 by athena]
+  Do not override CFLAGS in Makefile.am.
+
+    M ./dft/codelets/standard/Makefile.am -1
+    M ./dft/simd/Makefile.am -1 +1
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./rdft/codelets/hc2r/Makefile.am -1
+    M ./rdft/codelets/r2hc/Makefile.am -1
+    M ./rdft/codelets/r2r/Makefile.am -1
+    M ./support/Makefile.codelets -1 +6
+
+Tue Feb 15 10:30:12 EST 2005  athena
+  * [project @ 2005-02-15 15:30:12 by athena]
+  Allow users to build long double version even if sizeof(long double)
+  == sizeof(double)
+
+    M ./configure.ac -3
+
+Mon Feb 14 19:55:38 EST 2005  athena
+  * [project @ 2005-02-15 00:55:38 by athena]
+  Updated for 3.1
+
+    M ./commercialize.sh -12 +20
+
+Mon Feb 14 19:07:14 EST 2005  athena
+  * [project @ 2005-02-15 00:07:14 by athena]
+  Oops, version.h is no longer used
+
+    M ./api/version.c -2 +1
+
+Mon Feb 14 18:51:05 EST 2005  athena
+  * [project @ 2005-02-14 23:51:05 by athena]
+  unified fma and non-fma versions
+
+    A ./support/twovers.sh
+    M ./api/Makefile.am -5 +2
+    M ./api/version.c -2 +6
+    M ./configure.ac -5 +10
+    M ./dft/codelets/standard/Makefile.am -7 +7
+    M ./dft/simd/codelets/Makefile.am -10 +10
+    M ./m4/ocaml.m4 -9 +3
+    M ./mkdist.sh -5
+    M ./rdft/codelets/hc2r/Makefile.am -4 +4
+    M ./rdft/codelets/r2hc/Makefile.am -4 +4
+    M ./rdft/codelets/r2r/Makefile.am -9 +9
+    M ./support/Makefile.am -2 +2
+    M ./support/Makefile.codelets -7 +2
+    M ./support/twovers.sh +17
+
+Mon Feb 14 14:12:09 EST 2005  athena
+  * [project @ 2005-02-14 19:12:09 by athena]
+  forgot to remove inplace/Makefile from configure.ac
+
+    M ./configure.ac -1
+
+Mon Feb 14 12:08:52 EST 2005  athena
+  * [project @ 2005-02-14 17:08:49 by athena]
+  Merged dft/codelets/inplace with the main dft/codelets/standard
+  directory.  This step makes dft codelets consistent with the rest
+  of the naming conventions, and will simplify the eventual merge
+  of fma and non-fma codelets.
+
+    M ./Makefile.am -1
+    M ./dft/codelet-dft.h -2 +1
+    M ./dft/codelets/Makefile.am -1 +1
+    R ./dft/codelets/inplace/Makefile.am
+    R ./dft/codelets/inplace/
+    M ./dft/codelets/standard/Makefile.am -9 +31
+    M ./dft/conf.c -2 +1
+
+Mon Feb 14 11:16:15 EST 2005  athena
+  * [project @ 2005-02-14 16:16:15 by athena]
+  inline altivec constants, since gcc seems to generate better code this way.
+
+    M ./simd/altivec.c -21 +1
+    M ./simd/simd-altivec.h -16 +18
+
+Sun Feb 13 18:17:32 EST 2005  athena
+  * [project @ 2005-02-13 23:17:32 by athena]
+  group altivec constants into a single array, for faster access
+
+    M ./simd/altivec.c -11 +17
+    M ./simd/simd-altivec.h -20 +11
+
+Sun Feb 13 18:15:37 EST 2005  athena
+  * [project @ 2005-02-13 23:15:37 by athena]
+  code cleanup
+
+    M ./genfft/c.ml -23 +9
+    M ./genfft/c.mli -2 +1
+    M ./genfft/simd.ml -6 +6
+
+Sun Feb 13 10:29:32 EST 2005  athena
+  * [project @ 2005-02-13 15:29:32 by athena]
+  removed some unused stuff
+
+    M ./genfft/c.ml -3 +1
+    M ./genfft/c.mli -2 +1
+
+Sat Feb 12 22:04:40 EST 2005  athena
+  * [project @ 2005-02-13 03:04:40 by athena]
+  New twiddle scheme for altivec, 3dnow
+
+    M ./simd/simd-3dnow.h -4 +34
+    M ./simd/simd-altivec.h -4 +27
+
+Sat Feb 12 20:17:35 EST 2005  athena
+  * [project @ 2005-02-13 01:17:35 by athena]
+  Implemented new twiddle scheme for sse2
+
+    M ./simd/simd-sse2.h -4 +26
+
+Sat Feb 12 19:57:46 EST 2005  athena
+  * [project @ 2005-02-13 00:57:40 by athena]
+  Implemented experimental t2* codelets, which store twiddle factors
+  in a more convenient format, at the expense of twice the storage.
+  Currently only SSE works; I have to port SSE2, altivec, etc. to the
+  new scheme.  After this, we will decide whether these codelets
+  are worth the price.
+
+    A ./dft/simd/t2b.h
+    A ./dft/simd/t2f.h
+    M ./dft/simd/Makefile.am -1 +1
+    M ./dft/simd/codelets/Makefile.am -2 +14
+    M ./dft/simd/q1b.h +5
+    M ./dft/simd/q1f.h +5
+    M ./dft/simd/t1b.h +5
+    M ./dft/simd/t1f.h +5
+    M ./dft/simd/t2b.h +36
+    M ./dft/simd/t2f.h +36
+    M ./simd/simd-sse.h -36 +26
+
+Fri Feb 11 08:07:12 EST 2005  athena
+  * [project @ 2005-02-11 13:07:12 by athena]
+  Forgot to define SIMD_STRIDE_OKPAIR
+
+    M ./simd/simd-altivec.h +1
+
+Thu Feb 10 22:20:00 EST 2005  athena
+  * [project @ 2005-02-11 03:20:00 by athena]
+  fixed sse2, 3dnow, and altivec, as promised
+
+    M ./simd/simd-3dnow.h -5 +3
+    M ./simd/simd-altivec.h -1 +4
+    M ./simd/simd-sse.h +1
+    M ./simd/simd-sse2.h -5 +3
+
+Thu Feb 10 21:47:40 EST 2005  athena
+  * [project @ 2005-02-11 02:47:33 by athena]
+  Generate n2?v_* codelets in such a way that we may or may not
+  pair stores, depending on which mode happens to work best on
+  a particular SIMD implementation.  sse2, 3dnow, and altivec
+  are currently broken---will fix soon.
+
+    M ./dft/simd/n2b.c -1 +1
+    M ./dft/simd/n2f.c -1 +1
+    M ./genfft/annotate.ml -7 +9
+    M ./genfft/expr.ml -4 +6
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/simd.ml -3 +7
+    M ./simd/simd-sse.h -1 +6
+
+Thu Feb 10 08:53:22 EST 2005  athena
+  * [project @ 2005-02-10 13:53:22 by athena]
+  instantiate altivec constants only once
+
+    M ./simd/altivec.c -1 +6
+    M ./simd/simd-altivec.h -4 +10
+
+Thu Feb 10 06:37:56 EST 2005  athena
+  * [project @ 2005-02-10 11:37:56 by athena]
+  Fixed alignment checks for new SIMD scheme
+
+    M ./dft/simd/n2b.c -3 +3
+    M ./dft/simd/n2f.c -3 +3
+
+Wed Feb  9 21:35:01 EST 2005  athena
+  * [project @ 2005-02-10 02:35:01 by athena]
+  Change n2?v_* codelets to store pairs of vectors, with implicit
+  2x2 transposition.  Works for 2-way SIMD as well.  Tested with sse
+  and sse2.  I haven't tried altivec yet, but I observed a huge
+  speedup when I transformed one codelet by hand.
+
+    M ./dft/simd/codelets/Makefile.am -8 +6
+    M ./genfft/annotate.ml -3 +48
+    M ./genfft/annotate.mli -2 +5
+    M ./genfft/expr.ml -2 +5
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/gen_notw_c.ml -3 +17
+    M ./genfft/genutil.ml -3 +2
+    M ./genfft/simd.ml -5 +9
+    M ./genfft/simdmagic.ml -1 +3
+    M ./simd/simd-3dnow.h +6
+    M ./simd/simd-altivec.h +10
+    M ./simd/simd-sse.h +6
+    M ./simd/simd-sse2.h +6
+
+Tue Feb  8 21:28:38 EST 2005  athena
+  * [project @ 2005-02-09 02:28:38 by athena]
+  Resurrected old DIF codelets for experimental purposes.  They
+  are disabled for now, but I am keeping the setup around for
+  future reference.
+
+    M ./dft/codelets/standard/Makefile.am -1 +14
+
+Tue Feb  8 20:10:19 EST 2005  stevenj
+  * [project @ 2005-02-09 01:09:12 by stevenj]
+
+    M ./doc/fftw3.texi -7 +7
+
+Tue Feb  8 19:37:09 EST 2005  stevenj
+  * [project @ 2005-02-09 00:37:09 by stevenj]
+  clarifications, document --with-portable-binary and --with-gcc-arch
+
+    M ./doc/fftw3.texi -17 +30
+
+Tue Feb  8 19:23:41 EST 2005  stevenj
+  * [project @ 2005-02-09 00:20:56 by stevenj]
+
+    M ./NEWS -7 +7
+
+Tue Feb  8 01:36:22 EST 2005  stevenj
+  * [project @ 2005-02-08 06:36:22 by stevenj]
+  more change comments
+
+    M ./NEWS +61
+
+Tue Feb  8 00:41:38 EST 2005  stevenj
+  * [project @ 2005-02-08 05:41:38 by stevenj]
+  fma is definitely beneficial on Itanium with the HP/UX compiler
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Mon Feb  7 22:58:47 EST 2005  athena
+  * [project @ 2005-02-08 03:58:47 by athena]
+  Silence warnings.
+
+    M ./libbench2/bench-main.c -3 +4
+
+Mon Feb  7 22:55:49 EST 2005  stevenj
+  * [project @ 2005-02-08 03:55:49 by stevenj]
+  when we compile our own getopt, change symbol names to avoid conflicts (e.g. avoid build failure on MacOS X with --enable-shared)
+
+    M ./libbench2/getopt.h +26
+
+Mon Feb  7 22:36:42 EST 2005  stevenj
+  * [project @ 2005-02-08 03:36:42 by stevenj]
+  grr, more bugfixes for in-place case
+
+    M ./reodft/reodft00e-splitradix.c -9 +13
+
+Mon Feb  7 22:29:35 EST 2005  athena
+  * [project @ 2005-02-08 03:29:35 by athena]
+  removed relics of FRANZ mode
+
+    M ./dft/codelets/standard/Makefile.am -19
+
+Mon Feb  7 18:48:36 EST 2005  athena
+  * [project @ 2005-02-07 23:48:36 by athena]
+  Somehow xlc does not like ``vector int dummy;''
+
+    M ./simd/altivec.c -2 +2
+
+Mon Feb  7 13:59:47 EST 2005  athena
+  * [project @ 2005-02-07 18:59:47 by athena]
+  There is no need to enable sse to make the distribution.  This might
+  have been true in the past but not anymore.
+
+    M ./mkdist.sh -6 +8
+
+Mon Feb  7 13:55:17 EST 2005  athena
+  * [project @ 2005-02-07 18:55:17 by athena]
+  Oops---included fortran file in C sources
+
+    M ./api/Makefile.am -1 +1
+
+Mon Feb  7 13:42:45 EST 2005  athena
+  * [project @ 2005-02-07 18:42:45 by athena]
+  Set version string at ``make dist'' time, not at ``configure'' time,
+  so we know whether a user is using the fma version or not.
+
+    M ./api/Makefile.am -2 +9
+    M ./api/version.c -2 +3
+
+Sun Feb  6 17:00:33 EST 2005  athena
+  * [project @ 2005-02-06 22:00:33 by athena]
+  Removed useless files
+
+    R ./genfft/gen_hc2r_noinline.ml
+    R ./genfft/gen_notw_noinline.ml
+    R ./genfft/gen_notw_noinline_c.ml
+    R ./genfft/gen_r2hc_noinline.ml
+
+Sun Feb  6 16:59:39 EST 2005  athena
+  * [project @ 2005-02-06 21:59:39 by athena]
+  Different (simpler?) way to prevent the compiler from optimizing loop
+  inductive variables.  We now explicitly corrupt stride variables by
+  xor-ing them with another variable that happens to be zero (but the
+  compiler does not know it).  In this way, the compiler does not
+  attempt to extract a zillion loop indices from codelets, which would
+  overflow the register set.  Set the -fno-loop-optimize flag to further
+  help the process.
+  
+  Consequences: removed m* codelets.  Smaller library size.  Slightly
+  faster code with gcc/powerpc (including altivec).  Much faster code
+  with xlc/powerpc.  No changes for gcc/pentium.  Maybe slightly faster
+  with icc/pentium.
+
+    M ./configure.ac -11 +9
+    M ./dft/codelets/standard/Makefile.am -52 +3
+    M ./dft/simd/codelets/Makefile.am -30 +6
+    M ./genfft/Makefile.am -36 +10
+    M ./genfft/c.ml -1 +4
+    M ./genfft/c.mli -1 +2
+    M ./genfft/gen_hc2hc.ml -3 +5
+    M ./genfft/gen_hc2r.ml -3 +7
+    M ./genfft/gen_notw.ml -3 +6
+    M ./genfft/gen_notw_c.ml -3 +6
+    M ./genfft/gen_r2hc.ml -3 +7
+    M ./genfft/gen_twiddle.ml -3 +5
+    M ./genfft/gen_twiddle_c.ml -3 +5
+    M ./genfft/gen_twidsq.ml -3 +6
+    M ./genfft/gen_twidsq_c.ml -3 +6
+    M ./genfft/genutil.ml -1 +2
+    M ./genfft/simd.ml -1 +2
+    M ./kernel/ifftw.h -3 +7
+    M ./kernel/stride.c -1 +3
+    M ./rdft/codelets/hc2r/Makefile.am -13 +3
+    M ./rdft/codelets/r2hc/Makefile.am -13 +3
+    M ./support/Makefile.codelets -4
+
+Sat Feb  5 18:51:08 EST 2005  stevenj
+  * [project @ 2005-02-05 23:51:08 by stevenj]
+  paranoia about in-place rodft00 plans
+
+    M ./reodft/reodft00e-splitradix.c -3 +17
+
+Sat Feb  5 18:39:55 EST 2005  stevenj
+  * [project @ 2005-02-05 23:39:55 by stevenj]
+  don't believe pcost when using the estimator...there is no point, and
+  it screws up estimator hacks to prefer in-codelet loops to vecloops
+
+    M ./kernel/planner.c -2 +2
+
+Sat Feb  5 18:34:25 EST 2005  athena
+  * [project @ 2005-02-05 23:34:25 by athena]
+  Reduced optimization level from -O3 to -O for xlc, since -O generates
+  faster code.
+
+    M ./m4/ax_cc_maxopt.m4 -4 +4
+
+Sat Feb  5 16:26:58 EST 2005  stevenj
+  * [project @ 2005-02-05 21:26:58 by stevenj]
+  whoops, only applicable to redft00/rodft00 plans
+
+    M ./reodft/reodft00e-splitradix.c -1 +2
+
+Sat Feb  5 16:22:39 EST 2005  stevenj
+  * [project @ 2005-02-05 21:22:39 by stevenj]
+  fixed in-place operation, and don't create size-0 sub-plans
+
+    M ./reodft/reodft00e-splitradix.c -17 +16
+
+Fri Feb  4 11:30:30 EST 2005  athena
+  * [project @ 2005-02-04 16:30:30 by athena]
+  Autodetect altivec on linux.  This code works with gcc-3.4 and
+  -maltivec, with or without -mabi=altivec.  The code *should* work with
+  gcc-3.3 without -mabi=altivec.  However, disabling -mabi=altivec on
+  gcc-3.4 produces much worse code (I don't know why).
+
+    M ./simd/altivec.c -5 +31
+
+Fri Jan 28 00:04:58 EST 2005  stevenj
+  * [project @ 2005-01-28 05:04:58 by stevenj]
+  update reference
+
+    M ./doc/fftw3.texi -3 +3
+
+Thu Jan 27 15:48:28 EST 2005  stevenj
+  * [project @ 2005-01-27 20:48:28 by stevenj]
+  note that DCT-II/III are often called the'' DCT/DCT
+
+    M ./doc/fftw3.texi -11 +17
+
+Fri Jan 21 14:42:04 EST 2005  stevenj
+  * [project @ 2005-01-21 19:42:04 by stevenj]
+  added MSVC++ for ia64 (based on information at http://www.intel.com/cd/ids/developer/asmo-na/eng/19949.htm?prn=Y)
+
+    M ./kernel/cycle.h -1 +21
+
+Fri Jan 21 14:22:50 EST 2005  stevenj
+  * [project @ 2005-01-21 19:22:50 by stevenj]
+  vc++ defines _M_AMD64 on x86-64, apparently
+
+    M ./kernel/cycle.h -3 +3
+
+Tue Jan 18 22:30:27 EST 2005  stevenj
+  * [project @ 2005-01-19 03:30:27 by stevenj]
+  avoid gratuitous breakage with -Werror, requested by Simon Perreault
+
+    M ./m4/acx_pthread.m4 -2 +2
+
+Mon Jan 17 18:54:55 EST 2005  stevenj
+  * [project @ 2005-01-17 23:54:55 by stevenj]
+  comment typo
+
+    M ./m4/ax_gcc_aligns_stack.m4 -2 +2
+
+Sat Jan 15 16:56:23 EST 2005  stevenj
+  * [project @ 2005-01-15 21:56:23 by stevenj]
+  bumped shared-lib revision#
+
+    M ./configure.ac -1 +1
+
+Sat Jan 15 16:35:42 EST 2005  stevenj
+  * [project @ 2005-01-15 21:35:42 by stevenj]
+  add X(estimate_cost) to get estimator cost, and print from bench, to aid in tweaking estimator
+
+    M ./api/fftw3.h -1 +2
+    M ./api/flops.c +5
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -6 +11
+    M ./tests/bench.c -1 +1
+
+Sat Jan 15 14:57:56 EST 2005  stevenj
+  * [project @ 2005-01-15 19:57:56 by stevenj]
+
+    M ./doc/fftw3.texi -3 +3
+
+Sat Jan 15 12:57:07 EST 2005  stevenj
+  * [project @ 2005-01-15 17:57:07 by stevenj]
+  formatting fix
+
+    M ./doc/fftw3.texi -3 +9
+
+Sat Jan 15 12:31:28 EST 2005  stevenj
+  * [project @ 2005-01-15 17:31:28 by stevenj]
+  tweaks
+
+    M ./doc/fftw3.texi -17 +17
+    M ./reodft/Makefile.am +1
+    M ./reodft/conf.c -4 +7
+
+Sat Jan 15 12:03:24 EST 2005  stevenj
+  * [project @ 2005-01-15 17:03:24 by stevenj]
+  use less buffer space
+
+    M ./reodft/reodft00e-splitradix.c -22 +23
+
+Sat Jan 15 01:41:58 EST 2005  stevenj
+  * [project @ 2005-01-15 06:41:58 by stevenj]
+  added split-radix-based dct/dst I for odd n
+
+    A ./reodft/reodft00e-splitradix.c
+    M ./doc/fftw3.texi -16 +21
+    M ./reodft/Makefile.am -1 +1
+    M ./reodft/conf.c -1 +2
+    M ./reodft/redft00e-r2hc.c -2 +4
+    M ./reodft/reodft.h +1
+    M ./reodft/reodft00e-splitradix.c +337
+    M ./reodft/rodft00e-r2hc.c -2 +4
+
+Fri Jan 14 21:50:08 EST 2005  stevenj
+  * [project @ 2005-01-15 02:50:08 by stevenj]
+
+    M ./api/fftw3.h -3 +3
+
+Fri Jan 14 21:49:55 EST 2005  stevenj
+  * [project @ 2005-01-15 02:49:55 by stevenj]
+  warn silly users who confuse CVS id with FFTW version
+
+    M ./api/fftw3.h -1 +3
+
+Fri Jan 14 16:57:36 EST 2005  stevenj
+  * [project @ 2005-01-14 21:57:36 by stevenj]
+  get sparc cpu type on solaris as well as with linux
+
+    M ./m4/ax_gcc_archflag.m4 -13 +11
+
+Thu Jan 13 19:21:58 EST 2005  stevenj
+  * [project @ 2005-01-14 00:21:58 by stevenj]
+  detect prescott mobile (f37)
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Thu Jan 13 18:09:52 EST 2005  stevenj
+  * [project @ 2005-01-13 23:09:52 by stevenj]
+  use cpuid for x86_64 as well as i[56]86
+
+    M ./bootstrap.sh +1
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Thu Jan 13 17:59:55 EST 2005  stevenj
+  * [project @ 2005-01-13 22:59:55 by stevenj]
+  update with x86info 1.7 and other sources (identify k8, nocona, etc), handle nonzero leading bytes in eax
+
+    M ./m4/ax_gcc_archflag.m4 -23 +27
+
+Thu Jan 13 16:30:33 EST 2005  stevenj
+  * [project @ 2005-01-13 21:30:33 by stevenj]
+  compactified check for JOINABLE; use AC_DEFINE_UNQUOTED instead of AC_DEFINE for PTHREAD_CREATE_JOINABLE (thanks to Oliver Niekrenz for the bug report)
+
+    M ./m4/acx_pthread.m4 -23 +16
+
+Wed Jan 12 12:22:13 EST 2005  athena
+  * [project @ 2005-01-12 17:22:13 by athena]
+  The scheduler hack was incorrect because it swapped instructions
+  of the form  A = *B and  *B = C.  Fixed.
+
+    M ./genfft/annotate.ml -7 +13
+
+Tue Jan 11 22:13:24 EST 2005  athena
+  * [project @ 2005-01-12 03:13:24 by athena]
+  Quote expressions such as ``if test $FOO = yes'' when $FOO may be
+  empty.  Also, $GCC is set to either ``yes'' or empty, never to ``no''.
+
+    M ./m4/ax_cc_maxopt.m4 -2 +2
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Tue Jan 11 19:30:47 EST 2005  athena
+  * [project @ 2005-01-12 00:30:47 by athena]
+  Hmm---somehow the previous commit did not work.
+
+    M ./TODO -6
+    M ./configure.ac +9
+    M ./simd/altivec.c -3 +3
+
+Tue Jan 11 16:54:45 EST 2005  athena
+  * [project @ 2005-01-11 21:54:45 by athena]
+  Fixed various gcc-related problems on powerpc:
+    - gcc-3.4 becomes totally confused by expressions like
+        vec_add(a, vec_add(b, vec_add(c, ...)))
+      The compiler uses gigabytes of memory and then crashes, presumably
+      because of the exponential-time search problem involved in typing the
+      above expression (since vec_add can take either ints or floats).
+      I changed VADD and similar macros to be inline functions, thus
+      constraining the type system.
+  
+    - New flags
+        --param inline-unit-growth=1000 --param large-function-growth=1000
+      to work around limitations of the gcc-3.4 inliner.
+
+    M ./simd/simd-altivec.h -14 +12
+
+Mon Jan 10 21:27:24 EST 2005  athena
+  * [project @ 2005-01-11 02:27:24 by athena]
+  Check for HAVE_ALTIVEC_H
+
+    M ./simd/simd-altivec.h -1 +2
+
+Mon Jan 10 21:09:30 EST 2005  athena
+  * [project @ 2005-01-11 02:09:30 by athena]
+  Remove support for altivec using gcc builtins, since these keep
+  changing across gcc versions.  These changes work on gcc-3.4/linux; I
+  haven't tried MacOS X yet.  (The altivec ``spec'' differs between
+  Motorola/Apple and gcc, grrr...)
+
+    M ./configure.ac -1 +2
+    M ./simd/altivec.c -8 +5
+    M ./simd/simd-altivec.h -69 +5
+
+Mon Jan 10 18:57:30 EST 2005  athena
+  * [project @ 2005-01-10 23:57:30 by athena]
+  Stylistic changes
+
+    M ./rdft/rank0-vrank2.c -21 +21
+
+Mon Jan 10 17:34:41 EST 2005  athena
+  * [project @ 2005-01-10 22:34:41 by athena]
+  Changed incorrect ugliness condition.
+
+    M ./rdft/dft-r2hc.c -4 +2
+
+Mon Jan 10 16:09:43 EST 2005  stevenj
+  * [project @ 2005-01-10 21:09:43 by stevenj]
+  note x86info version number that was used, to make it easier to update
+  the cpuid for changes in later versions
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Mon Jan 10 15:00:51 EST 2005  athena
+  * [project @ 2005-01-10 20:00:51 by athena]
+  Make dft-r2hc non-UGLY for rank-0 problems
+
+    M ./rdft/dft-r2hc.c -1 +6
+
+Mon Jan 10 14:50:23 EST 2005  athena
+  * [project @ 2005-01-10 19:50:23 by athena]
+  Do not use -mcpu=970 on power4 processors, because power4 does
+  not have altivec.
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Mon Jan 10 14:48:47 EST 2005  athena
+  * [project @ 2005-01-10 19:48:47 by athena]
+  Note gcc-3.4 problem with inlining.
+
+    M ./TODO +5
+
+Mon Jan 10 13:51:08 EST 2005  athena
+  * [project @ 2005-01-10 18:51:08 by athena]
+  Oops, forgot to remove ``static'' from the declaration of noninlinable
+  functions.
+
+    M ./genfft/gen_hc2r_noinline.ml -3 +3
+    M ./genfft/gen_notw_noinline_c.ml -3 +3
+
+Mon Jan 10 12:31:26 EST 2005  athena
+  * [project @ 2005-01-10 17:31:26 by athena]
+  Recognize power4.  Use ``head -n COUNT'' instead of obsolete ``head
+  -COUNT'' (which fails on gentoo).
+
+    M ./m4/ax_gcc_archflag.m4 -3 +4
+
+Sun Jan  9 22:12:16 EST 2005  athena
+  * [project @ 2005-01-10 03:12:16 by athena]
+  Remind to add FAQ entry concerning gcc-3.4.[1-3] crashes.
+
+    M ./TODO +4
+
+Sun Jan  9 21:53:08 EST 2005  stevenj
+  * [project @ 2005-01-10 02:53:08 by stevenj]
+  whoops
+
+    M ./m4/ax_gcc_version.m4 -3 +3
+
+Sun Jan  9 21:48:02 EST 2005  stevenj
+  * [project @ 2005-01-10 02:48:02 by stevenj]
+  support checking for major.minor.patchlevel
+
+    M ./m4/ax_gcc_version.m4 -10 +11
+
+Sun Jan  9 21:40:18 EST 2005  athena
+  * [project @ 2005-01-10 02:40:18 by athena]
+  Revert CODELET_OPTIM to -O on IA32, which is faster than -O2.
+
+    M ./configure.ac -3 +4
+
+Sun Jan  9 20:30:12 EST 2005  athena
+  * [project @ 2005-01-10 01:30:12 by athena]
+  /bin/sh allows no spaces in assignments.
+
+    M ./configure.ac -1 +1
+
+Sun Jan  9 20:05:55 EST 2005  athena
+  * [project @ 2005-01-10 01:05:55 by athena]
+  Make non-inlinable functions external, so that gcc becomes confused
+  and does not try to inline them.
+
+    M ./genfft/gen_hc2r_noinline.ml -5 +5
+    M ./genfft/gen_notw_noinline.ml -5 +5
+    M ./genfft/gen_notw_noinline_c.ml -5 +5
+
+Sun Jan  9 13:44:25 EST 2005  athena
+  * [project @ 2005-01-09 18:44:25 by athena]
+  Add -fno-web to CFLAGS, because -fweb destroys FMAs.
+
+    M ./configure.ac -13 +18
+
+Sun Jan  9 10:31:47 EST 2005  athena
+  * [project @ 2005-01-09 15:31:47 by athena]
+  Allow -mcpu=970 besides -mcpu=G5
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Sun Jan  9 10:26:20 EST 2005  athena
+  * [project @ 2005-01-09 15:26:20 by athena]
+  configure was not using -fno-schedule-insns :-(
+
+    M ./configure.ac -2 +1
+
+Sun Jan  9 08:52:40 EST 2005  athena
+  * [project @ 2005-01-09 13:52:40 by athena]
+  In mkplan() and elsewhere, use solver index instead of solver
+  *pointer*, which looks marginally clearer.
+
+    M ./kernel/planner.c -18 +22
+
+Sun Jan  9 08:15:36 EST 2005  athena
+  * [project @ 2005-01-09 13:15:36 by athena]
+  Split planner hash table into two tables, for blessed and unblessed
+  solutions respectively.  Now an unblessed solution never overwrites a
+  blessed solution, thus avoiding wisdom leakage by construction.
+  Further, forget() is now a O(1) operation, which speeds up the
+  estimator when the wisdom table is large.
+
+    M ./TODO -6
+    M ./kernel/ifftw.h -14 +17
+    M ./kernel/planner.c -157 +126
+
+Sat Jan  8 21:19:45 EST 2005  athena
+  * [project @ 2005-01-09 02:19:45 by athena]
+  New TODO idea.
+
+    M ./TODO +7
+
+Thu Jan  6 11:02:29 EST 2005  athena
+  * [project @ 2005-01-06 16:02:29 by athena]
+  Split search() into two routines to make the UGLY/NO_UGLY logic
+  obvious.
+
+    M ./kernel/planner.c -41 +48
+
+Fri Dec 17 16:08:54 EST 2004  stevenj
+  * [project @ 2004-12-17 21:08:54 by stevenj]
+  push/pop 64-bit registers on ia64; thanks to Orion Poplawski for the fix
+
+    M ./simd/3dnow.c -1 +13
+    M ./simd/sse.c -1 +7
+    M ./simd/sse2.c -1 +7
+
+Thu Dec  9 21:41:09 EST 2004  stevenj
+  * [project @ 2004-12-10 02:41:09 by stevenj]
+  patch from FreeBSD ports - FreeBSD does not have memalign, but its
+  malloc is 16-byte aligned
+
+    M ./kernel/kalloc.c -1 +5
+
+Tue Nov 23 17:06:47 EST 2004  stevenj
+  * [project @ 2004-11-23 22:06:47 by stevenj]
+  don't compile taint.c with SIMD_CFLAGS (fixed Debian bug #259612)
+
+    M ./simd/Makefile.am -3 +10
+
+Thu Nov 18 11:37:32 EST 2004  stevenj
+  * [project @ 2004-11-18 16:37:32 by stevenj]
+  revert incorrect change -- codlist.c should be rebuilt, but it is built in the build directory and not in the source directory
+
+    M ./support/Makefile.codelets -1 +1
+
+Wed Nov 17 22:53:53 EST 2004  stevenj
+  * [project @ 2004-11-18 03:53:53 by stevenj]
+  $(CODLIST) should be rebuilt only if Makefile.am changes, or
+  alternatively only in maintainer mode, to prevent stomping in the
+  source directory during user builds.  (Thanks to Grant Cook for the
+  bug report.)
+
+    M ./support/Makefile.codelets -1 +1
+
+Sat Nov 13 13:43:01 EST 2004  stevenj
+  * [project @ 2004-11-13 18:43:01 by stevenj]
+  corrected #ifdef for icc/ia64, thanks to Matt Boman
+
+    M ./kernel/cycle.h -16 +18
+
+Sat Nov 13 13:34:55 EST 2004  stevenj
+  * [project @ 2004-11-13 18:34:55 by stevenj]
+  spelling correction (Larsen, not Larson)
+
+    M ./NEWS -1 +1
+
+Mon Nov  8 22:12:39 EST 2004  stevenj
+  * [project @ 2004-11-09 03:12:39 by stevenj]
+  use standard withval
+
+    M ./m4/ax_gcc_archflag.m4 -3 +3
+
+Mon Nov  8 22:09:16 EST 2004  stevenj
+  * [project @ 2004-11-09 03:09:16 by stevenj]
+  match doc
+
+    M ./m4/ax_gcc_x86_cpuid.m4 -2 +2
+
+Mon Nov  8 22:00:34 EST 2004  stevenj
+  * [project @ 2004-11-09 03:00:34 by stevenj]
+  formatting
+
+    M ./m4/ax_openmp.m4 -3 +3
+
+Mon Nov  8 21:59:33 EST 2004  stevenj
+  * [project @ 2004-11-09 02:59:33 by stevenj]
+  make sure OPENMP_CFLAGS environment variable is used correctly
+
+    M ./m4/ax_openmp.m4 -2 +6
+
+Mon Nov  8 21:46:50 EST 2004  stevenj
+  * [project @ 2004-11-09 02:46:50 by stevenj]
+  replace ax_check_cc_flags with more generic ax_check_compiler_flags
+
+    A ./m4/ax_check_compiler_flags.m4
+    M ./configure.ac -13 +8
+    M ./m4/ax_cc_maxopt.m4 -7 +6
+    R ./m4/ax_check_cc_flags.m4
+    M ./m4/ax_check_compiler_flags.m4 +37
+    M ./m4/ax_gcc_aligns_stack.m4 -5 +4
+    M ./m4/ax_gcc_archflag.m4 -3 +3
+
+Mon Nov  8 17:49:42 EST 2004  stevenj
+  * [project @ 2004-11-08 22:49:42 by stevenj]
+  separate macro for OpenMP test
+
+    A ./m4/ax_openmp.m4
+    M ./configure.ac -40 +6
+    M ./m4/ax_cc_maxopt.m4 -3 +5
+    M ./m4/ax_openmp.m4 +58
+
+Fri Nov  5 16:24:22 EST 2004  stevenj
+  * [project @ 2004-11-05 21:24:22 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Oct 29 00:48:13 EDT 2004  stevenj
+  * [project @ 2004-10-29 04:48:13 by stevenj]
+
+    M ./configure.ac -12 +19
+
+Thu Oct 28 00:09:38 EDT 2004  stevenj
+  * [project @ 2004-10-28 04:09:38 by stevenj]
+  better guessing of sparc type on Linux
+
+    M ./m4/ax_gcc_archflag.m4 -2 +15
+
+Wed Oct 27 13:44:08 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:44:08 by stevenj]
+  note default
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Wed Oct 27 13:41:57 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:41:57 by stevenj]
+  tweak
+
+    M ./m4/ax_gcc_archflag.m4 -2 +2
+
+Wed Oct 27 13:34:25 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:34:25 by stevenj]
+  comment
+
+    M ./m4/ax_gcc_x86_cpuid.m4 -2 +3
+
+Wed Oct 27 13:31:10 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:31:10 by stevenj]
+  whoops, m4 is EXTRA_DIST, not SUBDIR, since it doesn't have a Makefile
+
+    M ./Makefile.am -2 +2
+
+Wed Oct 27 13:16:57 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:16:57 by stevenj]
+  silence warnings
+
+    M ./m4/ocaml.m4 -2 +2
+
+Wed Oct 27 13:14:22 EDT 2004  stevenj
+  * [project @ 2004-10-27 17:14:22 by stevenj]
+  clean up m4 macros; try to detect correct gcc -march flag on x86; new --with-portable-binary, --with-gcc-arch=<arch> flags; use -O2 for codelets with gcc 3.4 to work around bug
+
+    A ./m4/
+    A ./m4/acx_pthread.m4
+    A ./m4/amx_prog_as.m4
+    A ./m4/ax_cc_maxopt.m4
+    A ./m4/ax_check_cc_flags.m4
+    A ./m4/ax_gcc_aligns_stack.m4
+    A ./m4/ax_gcc_archflag.m4
+    A ./m4/ax_gcc_version.m4
+    A ./m4/ax_gcc_x86_cpuid.m4
+    A ./m4/ocaml.m4
+    M ./Makefile.am -2 +4
+    R ./acinclude.m4
+    R ./acx_pthread.m4
+    M ./bootstrap.sh -12 +10
+    M ./configure.ac -11 +26
+    M ./m4/acx_pthread.m4 +240
+    M ./m4/amx_prog_as.m4 +11
+    M ./m4/ax_cc_maxopt.m4 +126
+    M ./m4/ax_check_cc_flags.m4 +37
+    M ./m4/ax_gcc_aligns_stack.m4 +47
+    M ./m4/ax_gcc_archflag.m4 +147
+    M ./m4/ax_gcc_version.m4 +34
+    M ./m4/ax_gcc_x86_cpuid.m4 +37
+    M ./m4/ocaml.m4 +84
+
+Tue Oct 26 16:46:14 EDT 2004  stevenj
+  * [project @ 2004-10-26 20:46:14 by stevenj]
+  rename cexp -> mcexp to avoid conflict with C99 builtin
+
+    M ./libbench2/mp.c -3 +3
+
+Mon Oct 25 16:58:23 EDT 2004  stevenj
+  * [project @ 2004-10-25 20:58:23 by stevenj]
+  use basename , w/o args, for compiler-name comparisons; also detect Compaq ccc on alpha-linus
+
+    M ./acinclude.m4 -4 +10
+
+Sun Oct 24 22:05:10 EDT 2004  stevenj
+  * [project @ 2004-10-25 02:05:10 by stevenj]
+  note recent icc problems
+
+    M ./doc/FAQ/fftw-faq.bfnn -6 +9
+
+Sun Oct 24 02:10:12 EDT 2004  stevenj
+  * [project @ 2004-10-24 06:10:12 by stevenj]
+  whoops, disable semaphores again (for now)
+
+    M ./threads/threads.c -1 +1
+
+Sun Oct 24 02:04:58 EDT 2004  stevenj
+  * [project @ 2004-10-24 06:04:58 by stevenj]
+  POSIX semaphores are *not* the same as SYSV semaphores
+
+    M ./threads/threads.c -2 +2
+
+Sun Oct 24 01:18:14 EDT 2004  stevenj
+  * [project @ 2004-10-24 05:18:14 by stevenj]
+  re-implement threaded stuff; dftw now takes parameters to indicate a portion of m loop
+
+    A ./rdft/hc2hc.c
+    A ./rdft/hc2hc.h
+    A ./threads/ct.c
+    A ./threads/hc2hc.c
+    M ./dft/conf.c -2 +2
+    M ./dft/ct.c -8 +11
+    M ./dft/ct.h -8 +11
+    M ./dft/ctsq.c -2 +2
+    M ./dft/dft.h -5 +2
+    M ./dft/dftw-direct.c -15 +29
+    M ./dft/dftw-directbuf.c -15 +30
+    M ./dft/dftw-generic.c -14 +31
+    M ./dft/kdft-dif.c -7 +3
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/kdft-dit.c -7 +3
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/twiddle.c -1 +10
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c -2 +2
+    R ./rdft/ct.c
+    R ./rdft/ct.h
+    M ./rdft/hc2hc-common.c -4 +33
+    M ./rdft/hc2hc-direct.c -26 +42
+    M ./rdft/hc2hc-directbuf.c -25 +44
+    M ./rdft/hc2hc-generic.c -37 +104
+    M ./rdft/hc2hc.c +221
+    M ./rdft/hc2hc.h +60
+    M ./rdft/khc2hc.c -8 +4
+    M ./rdft/rdft.h -3 +1
+    M ./threads/Makefile.am -2 +2
+    R ./threads/ct-dit.c
+    M ./threads/ct.c +253
+    M ./threads/dft-vrank-geq1.c -2 +2
+    R ./threads/hc2hc-dif.c
+    R ./threads/hc2hc-dit.c
+    M ./threads/hc2hc.c +238
+    M ./threads/threads.c -27 +8
+    M ./threads/threads.h -5 +4
+
+Thu Oct 21 20:44:51 EDT 2004  stevenj
+  * [project @ 2004-10-22 00:44:51 by stevenj]
+  more C++ notes
+
+    M ./doc/fftw3.texi -4 +11
+
+Thu Oct 14 09:50:38 EDT 2004  stevenj
+  * [project @ 2004-10-14 13:50:38 by stevenj]
+  note bug report for VC++ 6.0 from Dale Dickerhoof
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +4
+
+Fri Oct  1 16:06:59 EDT 2004  stevenj
+  * [project @ 2004-10-01 20:06:59 by stevenj]
+  fmt
+
+    M ./api/fftw3.h -3 +3
+
+Fri Oct  1 15:59:17 EDT 2004  stevenj
+  * [project @ 2004-10-01 19:58:50 by stevenj]
+  comment typo
+
+    M ./rdft/vrank3-transpose.c -3 +3
+
+Fri Oct  1 15:48:09 EDT 2004  stevenj
+  * [project @ 2004-10-01 19:48:09 by stevenj]
+  bug fix -- ishift/oshift only apply to execution of child plan
+
+    M ./rdft/dft-r2hc.c -9 +4
+
+Thu Sep 30 21:12:47 EDT 2004  athena
+  * [project @ 2004-10-01 01:12:47 by athena]
+  New planner that tries never to lose wisdom.
+
+    M ./api/fftw3.h -2 +1
+    M ./api/mapflags.c -1
+    M ./kernel/ifftw.h -6 +4
+    M ./kernel/planner.c -137 +183
+
+Thu Sep 30 13:36:43 EDT 2004  athena
+  * [project @ 2004-09-30 17:36:43 by athena]
+  Nested comment was triggering a warning.
+
+    M ./api/fftw3.h -2 +2
+
+Fri Sep 10 15:20:07 EDT 2004  stevenj
+  * [project @ 2004-09-10 19:20:07 by stevenj]
+  system "root" under dgjpp is /dev/env/DJDIR, not /dev/env/DJGPP,
+  according to djgpp's libc.info; patch confirmed with J. M. Guerrero
+
+    M ./api/import-system-wisdom.c -1 +1
+
+Wed Sep  8 18:50:03 EDT 2004  stevenj
+  * [project @ 2004-09-08 22:50:03 by stevenj]
+  some minor portability fixes for djgpp; thanks to Juan Manuel Guerrero for the patch
+
+    M ./api/import-system-wisdom.c -1 +8
+    M ./tests/Makefile.am -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Thu Aug 19 12:41:23 EDT 2004  stevenj
+  * [project @ 2004-08-19 16:41:23 by stevenj]
+  pointer to tutorial for quick start
+
+    M ./README +3
+
+Thu Aug 19 12:39:50 EDT 2004  stevenj
+  * [project @ 2004-08-19 16:39:50 by stevenj]
+  point users to manual
+
+    M ./api/fftw3.h -1 +16
+
+Sat Aug  7 13:42:22 EDT 2004  stevenj
+  * [project @ 2004-08-07 17:42:22 by stevenj]
+  minor typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Sun Jul 18 18:54:18 EDT 2004  stevenj
+  * [project @ 2004-07-18 22:54:18 by stevenj]
+  use __DECCXX for Compaq cxx, not Linux-specific symbol
+
+    M ./kernel/cycle.h -2 +2
+
+Fri Jul 16 13:55:25 EDT 2004  stevenj
+  * [project @ 2004-07-16 17:55:25 by stevenj]
+  patch by John Bowman to make cycle counter work with DEC cxx under Linux
+
+    M ./kernel/cycle.h -2 +2
+
+Wed Jun 30 00:45:10 EDT 2004  stevenj
+  * [project @ 2004-06-30 04:45:10 by stevenj]
+  updated pruned FFT discussion, with link to further details on www.fftw.org/pruned.html
+
+    M ./doc/FAQ/fftw-faq.bfnn -8 +16
+    M ./doc/FAQ/html.refs +1
+
+Mon Jun 14 20:08:27 EDT 2004  stevenj
+  * [project @ 2004-06-15 00:08:27 by stevenj]
+  darwin is based on freebsd
+
+    M ./acx_pthread.m4 -2 +2
+
+Thu Jun  3 14:23:41 EDT 2004  stevenj
+  * [project @ 2004-06-03 18:23:41 by stevenj]
+  in --with-windows-f77-mangling, add lowercase + single underscore for Intel compilers, etc. (thanks to David Gomez for the bug report)
+
+    M ./api/f77api.c +5
+
+Wed Apr  7 00:46:07 EDT 2004  stevenj
+  * [project @ 2004-04-07 04:46:07 by stevenj]
+  whoops, extra alignment check
+
+    M ./rdft/rank0-vrank2.c -1 +5
+
+Wed Apr  7 00:16:49 EDT 2004  stevenj
+  * [project @ 2004-04-07 04:16:49 by stevenj]
+  disable most 2-float-as-double copying, add alignment check in one remaining place
+
+    M ./kernel/ifftw.h -1 +11
+    M ./rdft/rank0-vrank2.c -22 +25
+    M ./rdft/vrank3-transpose.c -6 +6
+
+Tue Apr  6 13:49:13 EDT 2004  stevenj
+  * [project @ 2004-04-06 17:49:13 by stevenj]
+  make sure it is clear that real-even/odd refers to symmetry, not size
+
+    M ./doc/fftw3.texi -7 +7
+
+Mon Apr  5 20:18:29 EDT 2004  stevenj
+  * [project @ 2004-04-06 00:18:29 by stevenj]
+  optimization
+
+    M ./rdft/vrank3-transpose.c -11 +48
+
+Fri Apr  2 21:31:00 EST 2004  stevenj
+  * [project @ 2004-04-03 02:31:00 by stevenj]
+  separate cutoff for ugliness...these cutoffs are still not ideal
+
+    M ./rdft/vrank3-transpose.c -3 +4
+
+Fri Apr  2 21:30:17 EST 2004  stevenj
+  * [project @ 2004-04-03 02:30:17 by stevenj]
+  transpose.c is gone
+
+    M ./kernel/ifftw.h -15 +1
+
+Fri Apr  2 21:18:27 EST 2004  stevenj
+  * [project @ 2004-04-03 02:18:27 by stevenj]
+  move all rank0 transforms to rdft
+
+    A ./rdft/rank0-vrank2.c
+    A ./rdft/vrank3-transpose.c
+    M ./configure.ac +1
+    M ./dft/Makefile.am -2 +2
+    M ./dft/conf.c -4 +1
+    R ./dft/rank0.c
+    R ./dft/vrank2-transpose.c
+    R ./dft/vrank3-transpose.c
+    M ./kernel/Makefile.am -2 +2
+    R ./kernel/transpose.c
+    M ./rdft/Makefile.am -2 +3
+    M ./rdft/conf.c -1 +3
+    M ./rdft/dft-r2hc.c -20 +36
+    M ./rdft/rank0-vrank2.c +289
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/vrank3-transpose.c +943
+
+Fri Apr  2 20:35:35 EST 2004  stevenj
+  * [project @ 2004-04-03 01:35:35 by stevenj]
+  enable fp-moves/us comparison of rank-0 transforms
+
+    M ./libbench2/mflops.c -2 +8
+    M ./libbench2/report.c -3 +7
+
+Thu Apr  1 16:13:22 EST 2004  stevenj
+  * [project @ 2004-04-01 21:13:22 by stevenj]
+  whoops
+
+    M ./kernel/transpose.c -1 +1
+
+Thu Apr  1 15:25:30 EST 2004  stevenj
+  * [project @ 2004-04-01 20:25:30 by stevenj]
+  whoops
+
+    M ./kernel/tensor7.c -3 +3
+
+Wed Mar 31 18:11:02 EST 2004  stevenj
+  * [project @ 2004-03-31 23:11:02 by stevenj]
+  sort tensor dims by stride absolute values, not strides
+
+    M ./kernel/tensor7.c -5 +7
+
+Tue Mar 30 20:22:50 EST 2004  stevenj
+  * [project @ 2004-03-31 01:22:50 by stevenj]
+
+    M ./kernel/transpose.c -2 +1
+
+Tue Mar 30 19:44:54 EST 2004  stevenj
+  * [project @ 2004-03-31 00:44:54 by stevenj]
+  added improved transpose algorithm for N x M where |N-M| is small
+
+    M ./dft/dftw-generic.c +1
+    M ./dft/vrank2-transpose.c -7 +10
+    M ./dft/vrank3-transpose.c -6 +7
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/transpose.c -15 +73
+    M ./libbench2/problem.c -2 +4
+
+Tue Mar 30 19:41:14 EST 2004  stevenj
+  * [project @ 2004-03-31 00:41:14 by stevenj]
+  check to make sure SIMD matches precision, and make sure user doesn't select both SSE and SSE2
+
+    M ./configure.ac -2 +7
+
+Sun Mar 28 09:26:38 EST 2004  athena
+  * [project @ 2004-03-28 14:26:38 by athena]
+  Implemented hc2hc-generic hc2r.
+
+    M ./rdft/hc2hc-generic.c -29 +79
+
+Thu Mar 25 11:19:25 EST 2004  athena
+  * [project @ 2004-03-25 16:19:25 by athena]
+  Inverted loop for stride-1 access.
+
+    M ./rdft/hc2hc-generic.c -12 +18
+
+Thu Mar 25 11:18:49 EST 2004  athena
+  * [project @ 2004-03-25 16:18:49 by athena]
+  Swapped j <-> k for consistency
+
+    M ./dft/dftw-generic.c -3 +3
+
+Tue Mar 23 12:08:07 EST 2004  athena
+  * [project @ 2004-03-23 17:08:07 by athena]
+  Require that R be odd
+
+    M ./rdft/hc2hc-generic.c -1 +1
+
+Tue Mar 23 11:49:01 EST 2004  athena
+  * [project @ 2004-03-23 16:49:01 by athena]
+  Implemented hc2hc-generic (DIT only for now).
+
+    A ./rdft/hc2hc-generic.c
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/conf.c -5 +4
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/hc2hc-generic.c +216
+    M ./rdft/rdft.h +1
+
+Mon Mar 22 14:43:16 EST 2004  athena
+  * [project @ 2004-03-22 19:43:16 by athena]
+  Relax equality of twiddle description, since the `i' field
+  is not used by TW_FULL or TW_HALF.
+
+    M ./kernel/twiddle.c -5 +17
+
+Mon Mar 22 13:22:44 EST 2004  athena
+  * [project @ 2004-03-22 18:22:41 by athena]
+  Do not allocate tw_instr's on the stack.  Thus, the ``consistency check''
+  in twiddle.c becomes wrong.
+
+    M ./dft/dftw-generic.c -8 +1
+    M ./kernel/twiddle.c -2 +1
+
+Mon Mar 22 13:21:28 EST 2004  athena
+  * [project @ 2004-03-22 18:21:28 by athena]
+  Fixed incorrect malloc()/free() logic.
+
+    M ./libbench2/mp.c -3 +6
+
+Mon Mar 22 09:04:37 EST 2004  athena
+  * [project @ 2004-03-22 14:04:37 by athena]
+  Silence warnings
+
+    M ./rdft/hc2hc-directbuf.c -2 +2
+
+Mon Mar 22 09:02:55 EST 2004  athena
+  * [project @ 2004-03-22 14:02:55 by athena]
+  Separate file for hc2hc common routines
+
+    A ./rdft/hc2hc-common.c
+    M ./rdft/Makefile.am -4 +4
+    M ./rdft/ct.c -35
+    M ./rdft/hc2hc-common.c +58
+
+Mon Mar 22 08:23:56 EST 2004  athena
+  * [project @ 2004-03-22 13:23:56 by athena]
+  (re)Implemented buffered hc2hc.  Slight simplification of
+  twiddle-factors management.
+
+    A ./rdft/hc2hc-directbuf.c
+    M ./dft/dftw-directbuf.c -2 +2
+    M ./rdft/Makefile.am -2 +3
+    M ./rdft/ct.h +2
+    M ./rdft/hc2hc-direct.c -16 +8
+    M ./rdft/hc2hc-directbuf.c +238
+    M ./rdft/khc2hc.c -1 +2
+
+Sun Mar 21 19:53:05 EST 2004  athena
+  * [project @ 2004-03-22 00:53:05 by athena]
+  Incremented libtool revision number before we forget.
+
+    M ./configure.ac -1 +1
+
+Sun Mar 21 19:25:56 EST 2004  athena
+  * [project @ 2004-03-22 00:25:56 by athena]
+  Fixed opcnt
+
+    M ./rdft/hc2hc-direct.c -2 +5
+
+Sun Mar 21 17:56:15 EST 2004  athena
+  * [project @ 2004-03-21 22:56:15 by athena]
+  Renamed files.  These solvers are not really cooley-tukey.
+
+    A ./dft/dftw-direct.c
+    A ./dft/dftw-directbuf.c
+    A ./dft/dftw-generic.c
+    M ./dft/Makefile.am -5 +5
+    R ./dft/ct-directw.c
+    R ./dft/ct-directwbuf.c
+    R ./dft/ct-generic.c
+    M ./dft/dftw-direct.c +153
+    M ./dft/dftw-directbuf.c +213
+    M ./dft/dftw-generic.c +304
+
+Sun Mar 21 12:38:45 EST 2004  athena
+  * [project @ 2004-03-21 17:38:45 by athena]
+  Started moving rdft/ to the new cooley-tukey ontology
+
+    A ./rdft/ct.c
+    A ./rdft/ct.h
+    A ./rdft/hc2hc-direct.c
+    A ./rdft/khc2hc.c
+    M ./dft/ct.h -1 +1
+    M ./genfft/gen_hc2hc.ml -6 +3
+    M ./rdft/Makefile.am -5 +4
+    M ./rdft/codelet-rdft.h -3 +1
+    M ./rdft/ct.c +253
+    M ./rdft/ct.h +54
+    R ./rdft/hc2hc-buf.c
+    R ./rdft/hc2hc-dif.c
+    M ./rdft/hc2hc-direct.c +182
+    R ./rdft/hc2hc-dit.c
+    R ./rdft/hc2hc.c
+    R ./rdft/hc2hc.h
+    R ./rdft/khc2hc-dif.c
+    R ./rdft/khc2hc-dit.c
+    M ./rdft/khc2hc.c +32
+    M ./rdft/rdft.h -6 +1
+
+Sun Mar 21 10:59:42 EST 2004  athena
+  * [project @ 2004-03-21 15:59:42 by athena]
+  Plans in ct-*.c are subtypes of plan_dftw, not plan_dft
+
+    M ./dft/ct-directw.c -2 +2
+    M ./dft/ct-directwbuf.c -2 +2
+    M ./dft/ct-generic.c -1 +1
+
+Sun Mar 21 10:38:18 EST 2004  athena
+  * [project @ 2004-03-21 15:38:18 by athena]
+  Slight simplification
+
+    M ./dft/ct-directw.c -2 +2
+
+Sun Mar 21 10:20:06 EST 2004  athena
+  * [project @ 2004-03-21 15:20:06 by athena]
+  Minor simplification
+
+    M ./dft/ct.c -4 +2
+
+Sat Mar 20 08:43:57 EST 2004  athena
+  * [project @ 2004-03-20 13:43:57 by athena]
+  Workarounds for icc-8.0 nonsense.
+
+    M ./simd/simd-sse.h +6
+    M ./simd/simd-sse2.h +6
+
+Sun Mar  7 07:56:08 EST 2004  athena
+  * [project @ 2004-03-07 12:56:08 by athena]
+  FFTW_FORWARD is not technically an ``option''.
+
+    M ./doc/fftw3.texi -2 +2
+
+Tue Feb 24 12:17:06 EST 2004  stevenj
+  * [project @ 2004-02-24 17:17:06 by stevenj]
+  Alejandro requested that his name be removed from @author
+
+    M ./acx_pthread.m4 -4 +5
+
+Mon Feb 23 17:42:56 EST 2004  stevenj
+  * [project @ 2004-02-23 22:42:56 by stevenj]
+  GNU Pth emulation library check
+
+    M ./acx_pthread.m4 -3 +12
+
+Sat Feb 21 17:51:13 EST 2004  stevenj
+  * [project @ 2004-02-21 22:51:13 by stevenj]
+  calling can-do calls the estimating-planner, which creates wisdom that we don't want ...we should be able to do all of the documented problems, anyway
+
+    M ./tools/fftw-wisdom.c -1 +1
+
+Sat Feb 21 17:46:06 EST 2004  stevenj
+  * [project @ 2004-02-21 22:46:06 by stevenj]
+  don't forget_wisdom because of side effects
+
+    M ./tests/bench.c -1
+
+Sat Feb 21 17:42:47 EST 2004  stevenj
+  * [project @ 2004-02-21 22:42:47 by stevenj]
+  forget wisdom from can_do
+
+    M ./tests/bench.c +1
+
+Thu Feb 19 14:11:14 EST 2004  stevenj
+  * [project @ 2004-02-19 19:11:14 by stevenj]
+  parenthesization
+
+    M ./api/malloc.c -1 +1
+
+Fri Feb 13 07:20:31 EST 2004  athena
+  * [project @ 2004-02-13 12:20:31 by athena]
+  Split malloc into kernel_malloc and API malloc
+
+    A ./api/malloc.c
+    A ./kernel/kalloc.c
+    M ./api/Makefile.am -14 +14
+    M ./api/malloc.c +32
+    M ./kernel/Makefile.am -5 +5
+    M ./kernel/alloc.c -127 +5
+    M ./kernel/ifftw.h -1 +6
+    M ./kernel/kalloc.c +140
+    M ./tests/bench.c +4
+
+Thu Feb 12 15:42:20 EST 2004  stevenj
+  * [project @ 2004-02-12 20:42:20 by stevenj]
+  X(malloc) must be extern "C"
+
+    M ./kernel/alloc.c -1 +4
+
+Thu Feb 12 15:41:44 EST 2004  stevenj
+  * [project @ 2004-02-12 20:41:44 by stevenj]
+  satsify C++ compiler
+
+    M ./dft/bluestein.c -2 +2
+
+Thu Feb  5 20:39:14 EST 2004  stevenj
+  * [project @ 2004-02-06 01:39:14 by stevenj]
+  with the new flags, fma is definitely beneficial on PA-RISC with HP/UX cc
+
+    M ./doc/FAQ/fftw-faq.bfnn -6 +7
+
+Thu Feb  5 19:52:17 EST 2004  stevenj
+  * [project @ 2004-02-06 00:52:17 by stevenj]
+  grr, Ofaster etcetera are not supported under older versions of the compiler.  Note that +Ofltacc *disables* fp-reordering optimizations (which are enabled by +Oall).  +Optrs_ansi is the older version of the aliasing stuff
+
+    M ./acinclude.m4 -1 +1
+
+Thu Feb  5 19:26:01 EST 2004  stevenj
+  * [project @ 2004-02-06 00:26:01 by stevenj]
+  +Otype_safety=ansi on hpux
+
+    M ./acinclude.m4 -1 +1
+
+Thu Feb  5 19:22:34 EST 2004  stevenj
+  * [project @ 2004-02-06 00:22:34 by stevenj]
+  just use +Ofaster on hpux (+O3 +Onolimit +Olibcalls +Ofltacc=relaxed -Wl,+mergeseg)
+
+    M ./acinclude.m4 -1 +1
+
+Fri Jan 30 14:17:15 EST 2004  stevenj
+  * [project @ 2004-01-30 19:17:15 by stevenj]
+  check for win32 threads for mingw32; thanks to Alessio Massaro
+
+    M ./configure.ac +8
+
+Thu Jan 29 15:23:33 EST 2004  stevenj
+  * [project @ 2004-01-29 20:23:33 by stevenj]
+  added missing 'static', thanks to Alessio Massaro
+
+    M ./threads/threads.c -3 +3
+
+Fri Jan  9 16:36:48 EST 2004  stevenj
+  * [project @ 2004-01-09 21:36:48 by stevenj]
+  print more like bluestein
+
+    M ./rdft/dht-rader.c -1 +1
+
+Fri Jan  9 15:45:22 EST 2004  stevenj
+  * [project @ 2004-01-09 20:45:22 by stevenj]
+  fixed op count for R2HC_ONLY_CONV
+
+    M ./rdft/dht-rader.c -3 +2
+
+Fri Jan  9 15:41:50 EST 2004  stevenj
+  * [project @ 2004-01-09 20:41:50 by stevenj]
+  include DESTROY_INPUT in buffered flags for in-place...otherwise in-place hc2r uses rdft-dhtcvs diff
+
+    M ./dft/buffered.c -1 +10
+    M ./rdft/buffered.c -1 +10
+    M ./rdft/buffered2.c -1 +9
+
+Fri Jan  9 15:41:09 EST 2004  stevenj
+  * [project @ 2004-01-09 20:41:09 by stevenj]
+  resurrected R2HC_ONLY_CONV option to share plans and save on planning time
+
+    M ./rdft/dht-rader.c -3 +46
+
+Fri Jan  9 14:47:00 EST 2004  stevenj
+  * [project @ 2004-01-09 19:47:00 by stevenj]
+  precompute folding for cyclic convolution
+
+    M ./rdft/dht-rader.c -13 +9
+
+Wed Jan  7 16:48:39 EST 2004  stevenj
+  * [project @ 2004-01-07 21:48:39 by stevenj]
+  minor
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Wed Jan  7 16:48:25 EST 2004  stevenj
+  * [project @ 2004-01-07 21:48:25 by stevenj]
+  note reports of successful compilation on Windows
+
+    M ./doc/FAQ/fftw-faq.bfnn -5 +5
+
+Wed Jan  7 14:16:16 EST 2004  stevenj
+  * [project @ 2004-01-07 19:16:16 by stevenj]
+  citation year
+
+    M ./reodft/reodft010e-r2hc.c -2 +2
+
+Tue Jan  6 01:07:36 EST 2004  stevenj
+  * [project @ 2004-01-06 06:07:36 by stevenj]
+  comment
+
+    M ./rdft/dht-rader.c -1 +1
+
+Tue Jan  6 01:07:08 EST 2004  stevenj
+  * [project @ 2004-01-06 06:07:08 by stevenj]
+  comment fix
+
+    M ./rdft/dht-rader.c -1 +1
+
+Tue Jan  6 01:06:57 EST 2004  stevenj
+  * [project @ 2004-01-06 06:06:57 by stevenj]
+  fixed naming cruft
+
+    M ./rdft/dht-rader.c -18 +18
+
+Tue Jan  6 00:56:16 EST 2004  stevenj
+  * [project @ 2004-01-06 05:56:16 by stevenj]
+  space
+
+    M ./rdft/dht-rader.c -1
+
+Tue Jan  6 00:55:53 EST 2004  stevenj
+  * [project @ 2004-01-06 05:55:45 by stevenj]
+  comment
+
+    M ./rdft/dht-rader.c -1 +1
+
+Tue Jan  6 00:54:07 EST 2004  stevenj
+  * [project @ 2004-01-06 05:54:07 by stevenj]
+  moved assert
+
+    M ./rdft/dht-rader.c -1 +1
+
+Tue Jan  6 00:49:16 EST 2004  stevenj
+  * [project @ 2004-01-06 05:49:16 by stevenj]
+  comment
+
+    M ./rdft/dht-rader.c +5
+
+Tue Jan  6 00:41:06 EST 2004  stevenj
+  * [project @ 2004-01-06 05:41:06 by stevenj]
+  delete old R2HC_ONLY_CONV hack, now defunct
+
+    M ./rdft/dht-rader.c -46
+
+Tue Jan  6 00:32:58 EST 2004  stevenj
+  * [project @ 2004-01-06 05:32:58 by stevenj]
+  added padded real rader
+
+    M ./rdft/dht-rader.c -56 +95
+
+Mon Jan  5 22:56:58 EST 2004  stevenj
+  * [project @ 2004-01-06 03:56:58 by stevenj]
+  removed unused var
+
+    M ./rdft/generic.c -1 +1
+
+Mon Jan  5 21:20:29 EST 2004  stevenj
+  * [project @ 2004-01-06 02:20:29 by stevenj]
+  handle both FFT_SIGN values
+
+    M ./rdft/generic.c -2 +14
+
+Fri Jan  2 06:07:51 EST 2004  athena
+  * [project @ 2004-01-02 11:07:51 by athena]
+  Oops: d->ros  ==>  d->ios
+
+    M ./rdft/codelets/r2hc.c -1 +1
+
+Fri Jan  2 06:05:10 EST 2004  athena
+  * [project @ 2004-01-02 11:05:10 by athena]
+  Oops: d->ris should have been d->iis
+
+    M ./rdft/codelets/hc2r.c -1 +1
+
+Thu Jan  1 16:00:07 EST 2004  athena
+  * [project @ 2004-01-01 21:00:07 by athena]
+  Removed rdft rader cooley-tukey, to be superseded by a generic
+  reduction of rdft twiddle problems to dft + pre/post processing
+
+    M ./dft/Makefile.am -2 +2
+    M ./dft/dft.h -5 +1
+    R ./dft/rader-omega.c
+    M ./dft/rader.c -2 +38
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/conf.c -2 +1
+    R ./rdft/rader-hc2hc.c
+
+Thu Jan  1 15:44:09 EST 2004  athena
+  * [project @ 2004-01-01 20:44:09 by athena]
+  In anticipation of the upcoming revision of rdft, removed rdft generic
+  dit/dif cooley-tukey, in favor of generic rh2c and hc2r solvers.
+  Cleaned up stuff that became unused after this change, such as
+  TW_GENERIC.
+
+    M ./dft/ct.c -33 +3
+    M ./dft/generic.c -1 +2
+    M ./kernel/ifftw.h -2 +3
+    M ./kernel/primes.c -1 +32
+    M ./kernel/twiddle.c -18 +1
+    M ./rdft/generic.c -258 +103
+
+Thu Jan  1 12:59:30 EST 2004  athena
+  * [project @ 2004-01-01 17:59:30 by athena]
+  Removed useless file
+
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -2 +1
+    R ./kernel/square.c
+
+Fri Dec 26 13:54:00 EST 2003  stevenj
+  * [project @ 2003-12-26 18:54:00 by stevenj]
+  whoops, don't call AC_F77_DUMMY_MAIN if no Fortran compiler is found; thanks to Charles Radley for the bug report.
+
+    M ./configure.ac -2 +3
+
+Fri Dec 19 13:58:05 EST 2003  stevenj
+  * [project @ 2003-12-19 18:58:05 by stevenj]
+  guess good flags for Solaris/intel, suggested by J. Gregory Wright
+
+    M ./acinclude.m4 -1 +1
+
+Fri Dec  5 19:55:13 EST 2003  stevenj
+  * [project @ 2003-12-06 00:55:13 by stevenj]
+  blah
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+    M ./doc/FAQ/html.refs -1 +1
+
+Sun Nov 30 06:59:41 EST 2003  athena
+  * [project @ 2003-11-30 11:59:41 by athena]
+  DIF generic solver was destroying the input.
+
+    M ./rdft/generic.c -2 +16
+
+Sat Nov 29 19:28:39 EST 2003  athena
+  * [project @ 2003-11-30 00:28:39 by athena]
+  Fixed bug that caused HC2R transforms to destroy the input in
+  certain cases, even if the user specified FFTW_PRESERVE_INPUT.
+
+    M ./NEWS +5
+    M ./rdft/rader-hc2hc.c -2 +14
+
+Sat Nov 29 16:49:01 EST 2003  athena
+  * [project @ 2003-11-29 21:49:01 by athena]
+  Implemented swap_io hack for r2r verifier.
+
+    M ./libbench2/verify-r2r.c -1 +8
+
+Thu Nov 20 22:00:53 EST 2003  stevenj
+  * [project @ 2003-11-21 03:00:53 by stevenj]
+  citation
+
+    M ./reodft/reodft010e-r2hc.c -2 +6
+
+Fri Nov 14 20:57:55 EST 2003  athena
+  * [project @ 2003-11-15 01:57:55 by athena]
+  Trying to get ``make paranoid-check'' to work.  (Still broken.)
+
+    M ./kernel/ifftw.h -2 +3
+    M ./kernel/planner.c -2 +2
+    M ./tests/hook.c -9 +8
+
+Fri Nov 14 20:05:54 EST 2003  stevenj
+  * [project @ 2003-11-15 01:05:54 by stevenj]
+  fixes for input-preservation tests
+
+    M ./libbench2/bench-user.h -1 +2
+    M ./libbench2/tensor.c -1 +15
+    M ./libbench2/verify-dft.c -3 +12
+    M ./libbench2/verify-lib.c -1 +4
+    M ./libbench2/verify-r2r.c -13 +33
+    M ./libbench2/verify-rdft2.c -6 +22
+    M ./libbench2/verify.h -1 +1
+
+Fri Nov 14 19:19:31 EST 2003  athena
+  * [project @ 2003-11-15 00:19:31 by athena]
+  Assume FFTW_PRESERVE_INPUT unless either the `d' flag is given in the
+  problem, or the problem is multidimensional c2r (which fftw3 cannot
+  without destroying the input).  With this change, we can at least test
+  that FFTW_PRESERVE_INPUT works in the c2r 1d case.
+
+    M ./tests/bench.c -12 +21
+
+Fri Nov 14 19:14:40 EST 2003  stevenj
+  * [project @ 2003-11-15 00:14:40 by stevenj]
+  apply should copy back input for input-preservation check
+
+    M ./libbench2/verify-dft.c -1 +3
+    M ./libbench2/verify-r2r.c +13
+    M ./libbench2/verify-rdft2.c -1 +4
+
+Fri Nov 14 19:01:36 EST 2003  athena
+  * [project @ 2003-11-15 00:01:14 by athena]
+  Undone previous bogus changes
+
+    M ./rdft/rank-geq2-rdft2.c -1 +10
+    M ./tests/bench.c -14 +10
+    M ./tests/check.pl -1
+
+Fri Nov 14 18:27:12 EST 2003  athena
+  * [project @ 2003-11-14 23:27:12 by athena]
+  Check dr[fb] in addition to r[fb]
+
+    M ./tests/check.pl +1
+
+Fri Nov 14 17:33:44 EST 2003  athena
+  * [project @ 2003-11-14 22:33:42 by athena]
+  Fixed conditions under which the rank-geq2-rdft2 solver is applicable.
+  
+  The old solver was not applicable for out-of-place problems
+  unless DESTROY_INPUT.  This is bogus.  As long as the subsolvers
+  honor !DESTROY_INPUT, the solver is always applicable.
+  
+  Changed semantics of test program, so that PRESERVE_INPUT is always
+  true unless the problem specifies destroy_input explicitly.  Without
+  this change, there is no way to test the new solver.
+
+    M ./rdft/rank-geq2-rdft2.c -10 +1
+    M ./tests/bench.c -10 +14
+
+Thu Oct 30 15:10:42 EST 2003  stevenj
+  * [project @ 2003-10-30 20:10:42 by stevenj]
+  added AIX OpenMP (-qsmp=omp) support; thanks to Greg Bauer
+
+    M ./configure.ac +5
+
+Thu Oct 30 10:11:39 EST 2003  athena
+  * [project @ 2003-10-30 15:11:39 by athena]
+  G5 CFLAGS
+
+    M ./acinclude.m4 +6
+
+Fri Oct 24 04:17:39 EDT 2003  stevenj
+  * [project @ 2003-10-24 08:17:39 by stevenj]
+  western FAQ
+
+    M ./doc/FAQ/fftw-faq.bfnn +5
+
+Thu Oct 23 11:34:11 EDT 2003  athena
+  * [project @ 2003-10-23 15:34:11 by athena]
+  Oops.
+
+    M ./simd/altivec.c -1 +2
+
+Thu Oct 23 11:28:28 EDT 2003  athena
+  * [project @ 2003-10-23 15:28:27 by athena]
+  Autodetect altivec
+
+    M ./configure.ac -2 +3
+    M ./simd/altivec.c -1 +28
+    M ./simd/simd-altivec.h -1 +2
+
+Wed Oct 22 01:14:10 EDT 2003  stevenj
+  * [project @ 2003-10-22 05:14:10 by stevenj]
+  MinGW gets confused by a single /
+
+    M ./tests/check.pl -4 +4
+
+Fri Oct 17 10:46:41 EDT 2003  athena
+  * [project @ 2003-10-17 14:46:41 by athena]
+  Paranoid portability fix
+
+    M ./libbench2/mp.c -2 +2
+
+Thu Oct 16 11:07:46 EDT 2003  athena
+  * [project @ 2003-10-16 15:07:46 by athena]
+  size -> length, which should make clear that we are not talking
+  about arbitrary precision.
+
+    M ./doc/fftw3.texi -4 +4
+
+Wed Oct 15 15:01:40 EDT 2003  stevenj
+  * [project @ 2003-10-15 19:01:40 by stevenj]
+  pruned transforms are a FAQ
+
+    M ./doc/FAQ/fftw-faq.bfnn +11
+
+Wed Oct  8 23:54:17 EDT 2003  stevenj
+  * [project @ 2003-10-09 03:54:17 by stevenj]
+  NO_SEARCH has already been mapped to FFTW_WISDOM_ONLY
+
+    M ./TODO -1 +1
+
+Wed Oct  8 23:53:19 EDT 2003  stevenj
+  * [project @ 2003-10-09 03:53:19 by stevenj]
+  newline
+
+    M ./TODO +1
+
+Sat Sep 27 20:27:32 EDT 2003  stevenj
+  * [project @ 2003-09-28 00:27:32 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Sep 27 20:24:39 EDT 2003  stevenj
+  * [project @ 2003-09-28 00:24:39 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -2 +4
+
+Sat Sep 27 17:43:57 EDT 2003  stevenj
+  * [project @ 2003-09-27 21:43:57 by stevenj]
+  minor fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Sep 27 17:42:30 EDT 2003  stevenj
+  * [project @ 2003-09-27 21:42:30 by stevenj]
+  grammar
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Sep 27 17:29:04 EDT 2003  stevenj
+  * [project @ 2003-09-27 21:29:04 by stevenj]
+  html output fix
+
+    M ./doc/fftw3.texi -5 +5
+
+Sat Sep 27 17:22:48 EDT 2003  stevenj
+  * [project @ 2003-09-27 21:22:48 by stevenj]
+  mentioned sqrt(2) factors for DCT/DST
+
+    M ./doc/fftw3.texi -5 +48
+
+Sat Sep 27 17:07:18 EDT 2003  stevenj
+  * [project @ 2003-09-27 21:07:18 by stevenj]
+  FFTW_WISDOM_ONLY flag (undocumented for now), suggested by Phil Dumont
+
+    M ./api/fftw3.h -1 +2
+    M ./api/mapflags.c +1
+
+Tue Sep 23 23:36:19 EDT 2003  stevenj
+  * [project @ 2003-09-24 03:36:19 by stevenj]
+  removed UpTime code
+
+    M ./kernel/cycle.h -20 +3
+
+Tue Sep 23 23:27:29 EDT 2003  stevenj
+  * [project @ 2003-09-24 03:27:29 by stevenj]
+  updated documentation for mach_absolute_time
+
+    M ./kernel/cycle.h -3 +3
+
+Tue Sep 23 23:25:52 EDT 2003  stevenj
+  * [project @ 2003-09-24 03:25:52 by stevenj]
+  use mach_absolute_time on MacOS/Darwin, as a fallback; don't bother checking for UpTime since it requires extra libs
+
+    M ./configure.ac -3 +2
+    M ./kernel/cycle.h -1 +11
+
+Tue Sep 23 22:59:29 EDT 2003  stevenj
+  * [project @ 2003-09-24 02:59:29 by stevenj]
+  support Apple UpTime function for asm-less xlc, grrr...
+
+    M ./configure.ac -2 +3
+    M ./kernel/cycle.h -1 +19
+
+Tue Sep 23 15:42:29 EDT 2003  stevenj
+  * [project @ 2003-09-23 19:42:29 by stevenj]
+  additional paranoia for xlc etc.
+
+    M ./api/api.h -1 +1
+    M ./api/fftw3.h -2 +2
+
+Mon Sep 22 15:28:56 EDT 2003  stevenj
+  * [project @ 2003-09-22 19:28:56 by stevenj]
+  work around _Complex_I weirdness in xlc, reported by Greg Allen
+
+    M ./api/api.h -2 +4
+    M ./api/fftw3.h -2 +2
+
+Fri Sep  5 18:03:11 EDT 2003  stevenj
+  * [project @ 2003-09-05 22:03:11 by stevenj]
+  typo
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Fri Sep  5 13:11:40 EDT 2003  athena
+  * [project @ 2003-09-05 17:11:40 by athena]
+  New script that produces commercial version.
+
+    A ./commercialize.sh
+
+Fri Sep  5 07:27:06 EDT 2003  athena
+  * [project @ 2003-09-05 11:27:06 by athena]
+  Noted that VC++ is buggy.  Noted that we know nothing about Windows.
+  Noted that the sky is blue as well.
+
+    M ./doc/FAQ/fftw-faq.bfnn -10 +22
+
+Tue Sep  2 09:04:19 EDT 2003  athena
+  * [project @ 2003-09-02 13:04:19 by athena]
+  Noted that certain arrays are no longer used after the planner has
+  completed.
+
+    M ./doc/fftw3.texi -2 +12
+
+Tue Aug 26 08:22:38 EDT 2003  athena
+  * [project @ 2003-08-26 12:22:38 by athena]
+  Typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Aug 25 21:27:43 EDT 2003  athena
+  * [project @ 2003-08-26 01:27:43 by athena]
+  New item
+
+    M ./TODO -2 +1
+
+Thu Aug 21 17:36:08 EDT 2003  stevenj
+  * [project @ 2003-08-21 21:36:08 by stevenj]
+  try creating output file before planning (thanks to Phil Dumont for the suggestion)
+
+    M ./tools/fftw-wisdom.c -10 +10
+
+Tue Aug 19 10:08:07 EDT 2003  athena
+  * [project @ 2003-08-19 14:08:07 by athena]
+  Clarified fftw_cleanup()
+
+    M ./doc/fftw3.texi -4 +7
+
+Sat Aug 16 03:13:41 EDT 2003  stevenj
+  * [project @ 2003-08-16 07:13:41 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Jul 28 18:01:13 EDT 2003  stevenj
+  * [project @ 2003-07-28 22:01:13 by stevenj]
+  use time() instead of clock() (FIXME: what to do for non-POSIX systems?) ...thanks to JP Sugarbroad and James A. Treacy for the bug report
+
+    M ./tools/fftw-wisdom.c -4 +4
+
+Thu Jul 24 18:58:10 EDT 2003  athena
+  * [project @ 2003-07-24 22:58:10 by athena]
+  Need __volatile__ in sparc cycle counter.  This is why the debian
+  port hangs.
+
+    M ./kernel/cycle.h -2 +2
+
+Sun Jul 20 16:02:43 EDT 2003  stevenj
+  * [project @ 2003-07-20 20:02:43 by stevenj]
+  merged 3.0.1 notes
+
+    M ./NEWS +30
+
+Sun Jul 13 20:57:34 EDT 2003  stevenj
+  * [project @ 2003-07-14 00:57:34 by stevenj]
+  whoops
+
+    M ./libbench2/bench-main.c -2 +2
+
+Thu Jul 10 11:48:50 EDT 2003  athena
+  * [project @ 2003-07-10 15:48:50 by athena]
+  Dealing with constants in a way that seems to confuse gcc less.
+
+    M ./simd/simd-sse.h -5 +2
+    M ./simd/simd-sse2.h -6 +3
+
+Wed Jul  9 17:39:23 EDT 2003  athena
+  * [project @ 2003-07-09 21:39:16 by athena]
+  Enabled scheduler hack for FMA, where it seems to help.
+
+    M ./genfft/annotate.ml -7 +3
+    M ./genfft/magic.ml -1 +5
+    M ./support/Makefile.codelets -1 +1
+
+Wed Jul  9 12:57:38 EDT 2003  athena
+  * [project @ 2003-07-09 16:57:38 by athena]
+  Hmm---the new scheduler seems make things worse for gcc/x86, better
+  for gcc/ppc, and about the same for icc/x86.  Disabled for now.
+
+    M ./genfft/annotate.ml -3 +6
+
+Wed Jul  9 08:09:53 EDT 2003  athena
+  * [project @ 2003-07-09 12:09:53 by athena]
+  New scheduling pass that keeps ``x = a + b'' and ``y = a - b'' close
+  together.  This property was no longer automatic for the dags
+  generated in SIMD mode.
+  
+  I cannot measure any speed difference due to this change.  However,
+  the change is justified by a minimal-screwup argument.  Moreover, the
+  sse2 fftw library is now 1% smaller than it was before.
+
+    M ./genfft/annotate.ml -32 +70
+
+Tue Jul  8 20:42:22 EDT 2003  athena
+  * [project @ 2003-07-09 00:42:22 by athena]
+  -(FNMS()) => FMS()
+
+    M ./genfft/c.ml -1 +2
+
+Sun Jul  6 13:53:23 EDT 2003  stevenj
+  * [project @ 2003-07-06 17:53:23 by stevenj]
+  added more convenient target name
+
+    M ./doc/FAQ/Makefile.am +2
+
+Sat Jul  5 13:30:10 EDT 2003  stevenj
+  * [project @ 2003-07-05 17:30:10 by stevenj]
+  typo
+
+    M ./kernel/ifftw.h -2 +2
+
+Sat Jul  5 13:19:36 EDT 2003  athena
+  * [project @ 2003-07-05 17:19:36 by athena]
+  Consistent naming
+
+    M ./dft/ct-generic.c -2 +2
+
+Sat Jul  5 13:05:51 EDT 2003  athena
+  * [project @ 2003-07-05 17:05:51 by athena]
+  Got rid of problemw.
+
+    A ./dft/ct-directw.c
+    A ./dft/ct-directwbuf.c
+    A ./dft/ct-generic.c
+    A ./dft/ct.h
+    M ./dft/Makefile.am -5 +4
+    M ./dft/conf.c -3 +2
+    M ./dft/ct-directw.c +153
+    M ./dft/ct-directwbuf.c +213
+    M ./dft/ct-generic.c +304
+    M ./dft/ct.c -62 +24
+    M ./dft/ct.h +56
+    M ./dft/ctsq.c -3 +3
+    M ./dft/dft.h -39 +2
+    R ./dft/dftw-dft.c
+    M ./dft/direct.c -2 +1
+    R ./dft/directw.c
+    R ./dft/directwbuf.c
+    M ./dft/generic.c -2
+    M ./dft/kdft-dif.c -4 +4
+    M ./dft/kdft-difsq.c -3 +3
+    M ./dft/kdft-dit.c -4 +4
+    M ./dft/plan.c -11 +1
+    R ./dft/problemw.c
+    M ./dft/rader.c -2
+    M ./dft/solve.c -8 +1
+
+Fri Jul  4 06:56:26 EDT 2003  athena
+  * [project @ 2003-07-04 10:56:26 by athena]
+  Increase TIME_MIN on intel only
+
+    M ./kernel/cycle.h -1 +3
+    M ./kernel/timer.c -2 +2
+
+Fri Jul  4 06:36:02 EDT 2003  athena
+  * [project @ 2003-07-04 10:36:02 by athena]
+  A little hack to get more consistent scheduling.
+
+    M ./genfft/schedule.ml -7 +8
+
+Thu Jul  3 16:47:42 EDT 2003  athena
+  * [project @ 2003-07-03 20:47:42 by athena]
+  New experimental scheduler (currently disabled).
+  
+  The old scheduler is ``optimal'' in the sense that it minimizes
+  register pressure.  The only way to reduce register pressure is to
+  schedule dependent instructions as closely as possible, so as to
+  minimize the life time of registers.  This strategy maximizes the
+  number of pipeline stalls, however.  With enough registers and short
+  enough pipelines, this tradeoff is fine.  This is no longer the case
+  for the devilish pipeline of the Pentium IV or (probably) the PowerPC
+  970.
+  
+  The new scheduler switches to a ``list scheduler'' for dags smaller
+  than a specified size.  The list scheduler executes a butterfly left
+  to right one column at the time.  This amounts to the best possible
+  pipeline utilization, and the worst possible register pressure.
+  
+  The ``specified size'' defaults to 0, i.e., no change from fftw2 and
+  fftw-3.0.  It seems like a value of 7--10 produces the best results
+  for Pentium IV (probably screwing the G3/G4 powerpcs and sparc, but I
+  haven't tried.)  As time goes by, we may want to increase this number
+  to favor newer processors over older processors.
+
+    M ./genfft/magic.ml -1 +3
+    M ./genfft/schedule.ml -9 +35
+
+Wed Jun 25 17:43:59 EDT 2003  stevenj
+  * [project @ 2003-06-25 21:43:59 by stevenj]
+  remove non-portable use of tempfile; thanks to Nicolas Decoster for the patch
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Wed Jun 25 17:14:03 EDT 2003  stevenj
+  * [project @ 2003-06-25 21:14:03 by stevenj]
+  increase stupid HP preprocessor limits
+
+    M ./acinclude.m4 -1 +1
+
+Thu Jun 19 15:21:52 EDT 2003  athena
+  * [project @ 2003-06-19 19:21:52 by athena]
+  Distribute gen_mdct.ml
+
+    M ./genfft/Makefile.am -8 +8
+
+Wed Jun 11 06:55:21 EDT 2003  athena
+  * [project @ 2003-06-11 10:55:21 by athena]
+  Cleared int/ptrdiff_t confusions
+
+    M ./rdft/buffered2.c -2 +3
+    M ./rdft/rdft2-radix2.c -5 +5
+
+Tue Jun 10 22:15:42 EDT 2003  athena
+  * [project @ 2003-06-11 02:15:41 by athena]
+  Cleared int/ptrdiff_t confusion
+
+    M ./dft/dftw-dft.c -1 +1
+    M ./dft/directwbuf.c -2 +2
+    M ./dft/rank0.c -2 +2
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./kernel/planner.c -2 +2
+
+Sun Jun  8 09:52:57 EDT 2003  athena
+  * [project @ 2003-06-08 13:52:57 by athena]
+  Increased TIME_MIN.  This seems to produce more reliable plans
+  on Pentium IV.
+
+    M ./kernel/timer.c -2 +2
+
+Sat Jun  7 21:43:00 EDT 2003  athena
+  * [project @ 2003-06-08 01:43:00 by athena]
+  Removed relic -trivial-stores, which dates back to Franz's early
+  experiments.  Speed improved on SSE2, both with gcc and icc.
+
+    M ./dft/simd/codelets/Makefile.am -1 +1
+
+Thu Jun  5 22:29:52 EDT 2003  stevenj
+  * [project @ 2003-06-06 02:29:52 by stevenj]
+  fix direntry
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Jun  5 13:41:34 EDT 2003  stevenj
+  * [project @ 2003-06-05 17:41:34 by stevenj]
+  added imdct
+
+    M ./genfft/gen_mdct.ml -13 +66
+
+Wed Jun  4 19:54:38 EDT 2003  athena
+  * [project @ 2003-06-04 23:54:38 by athena]
+  Collect pattern (a * b) +- (c * d) in generic-arith, because this
+  operation can usually be computed with one rounding in fixed-point
+  (and it possibly exposes a FMA instruction)
+
+    M ./genfft/c.ml -5 +15
+
+Wed Jun  4 15:11:29 EDT 2003  athena
+  * [project @ 2003-06-04 19:11:29 by athena]
+  Generic-arithmetic unparser
+
+    M ./genfft/c.ml -8 +31
+    M ./genfft/magic.ml -1 +5
+
+Sun Jun  1 09:05:30 EDT 2003  athena
+  * [project @ 2003-06-01 13:05:30 by athena]
+  Oops---randomized CSE was using the same random numbers
+  over and over
+
+    M ./genfft/oracle.ml -8 +6
+    M ./genfft-k7/oracle.ml -11 +11
+
+Sun Jun  1 07:01:17 EDT 2003  athena
+  * [project @ 2003-06-01 11:01:17 by athena]
+  Paranoia.
+
+    M ./genfft/c.ml -3 +3
+
+Sun Jun  1 07:00:54 EDT 2003  athena
+  * [project @ 2003-06-01 11:00:54 by athena]
+  Use relative error instead of absolute error, to avoid problems
+  when normalization factors are used.
+
+    M ./genfft/oracle.ml -3 +5
+
+Sat May 31 22:11:28 EDT 2003  stevenj
+  * [project @ 2003-06-01 02:11:28 by stevenj]
+  slight opt
+
+    M ./reodft/reodft11e-radix2.c -3 +3
+
+Sat May 31 22:10:45 EDT 2003  stevenj
+  * [project @ 2003-06-01 02:10:45 by stevenj]
+  slight optimization
+
+    M ./reodft/reodft11e-radix2.c -3 +3
+
+Sat May 31 20:43:31 EDT 2003  stevenj
+  * [project @ 2003-06-01 00:43:31 by stevenj]
+  *W is const
+
+    M ./genfft/gen_mdct.ml -3 +3
+
+Sat May 31 20:41:15 EDT 2003  stevenj
+  * [project @ 2003-06-01 00:41:15 by stevenj]
+  comment
+
+    M ./genfft/gen_mdct.ml -2 +4
+
+Thu May 29 21:31:31 EDT 2003  stevenj
+  * [project @ 2003-05-30 01:31:31 by stevenj]
+  added experimental MDCT
+
+    A ./genfft/gen_mdct.ml
+    M ./genfft/Makefile.am -1 +7
+    M ./genfft/gen_mdct.ml +208
+
+Wed May 28 22:01:37 EDT 2003  stevenj
+  * [project @ 2003-05-29 02:01:37 by stevenj]
+  altivec (fma) needs simd codlist.c too
+
+    M ./mkdist.sh -1 +1
+
+Wed May 28 22:00:49 EDT 2003  stevenj
+  * [project @ 2003-05-29 02:00:49 by stevenj]
+  make sure we include SIMD codlist.c for non-Unix folks
+
+    M ./mkdist.sh -1 +1
+
+Tue May 27 20:31:25 EDT 2003  stevenj
+  * [project @ 2003-05-28 00:31:25 by stevenj]
+  noted howmany_rank == 0 is a single transform
+
+    M ./doc/fftw3.texi -1 +3
+
+Tue May 27 20:02:31 EDT 2003  stevenj
+  * [project @ 2003-05-28 00:02:31 by stevenj]
+  further stride clarification
+
+    M ./doc/fftw3.texi -4 +4
+
+Mon May 26 10:21:22 EDT 2003  athena
+  * [project @ 2003-05-26 14:21:22 by athena]
+  Removed transposed dftw problems.
+  
+  I now consider transposed dftw a Bad Idea, since it does not
+  apply to the case that it was originally meant for (speed up four-step)
+  and it complicates the implementation of the other thing I want to try
+  (dftw m-slices).
+
+    M ./dft/Makefile.am -3 +3
+    M ./dft/conf.c -2 +1
+    M ./dft/ct.c -5 +3
+    M ./dft/ctsq.c -74 +57
+    M ./dft/dft.h -9 +5
+    M ./dft/dftw-dft.c -83
+    M ./dft/directw.c -5 +1
+    M ./dft/directwbuf.c -5 +1
+    R ./dft/directwsq.c
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/problemw.c -13 +9
+
+Mon May 26 07:22:59 EDT 2003  athena
+  * [project @ 2003-05-26 11:22:59 by athena]
+  Obsolete comment
+
+    M ./dft/buffered.c -6 +1
+
+Sat May 24 15:00:53 EDT 2003  athena
+  * [project @ 2003-05-24 19:00:53 by athena]
+  comment
+
+    M ./dft/ct.c -2 +2
+
+Sat May 24 07:20:35 EDT 2003  athena
+  * [project @ 2003-05-24 11:20:35 by athena]
+  Oops---wrong test NO_UGLYP instead of !NO_UGLYP
+
+    M ./dft/dftw-dft.c -3 +3
+
+Sat May 24 07:05:34 EDT 2003  athena
+  * [project @ 2003-05-24 11:05:34 by athena]
+  Implemented radix r, where n=r^2 * p
+
+    M ./dft/ct.c -19 +53
+
+Wed May 21 01:54:32 EDT 2003  stevenj
+  * [project @ 2003-05-21 05:54:32 by stevenj]
+  xlc seems to properly use fma as well
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Tue May 20 23:07:43 EDT 2003  stevenj
+  * [project @ 2003-05-21 03:07:43 by stevenj]
+  print warning if there is no cycle counter
+
+    M ./configure.ac +18
+    M ./doc/fftw3.texi -2 +3
+
+Tue May 20 17:32:04 EDT 2003  stevenj
+  * [project @ 2003-05-20 21:32:04 by stevenj]
+  updated Funda reference
+
+    M ./libbench2/verify-lib.c -1 +4
+    M ./libbench2/verify-r2r.c +3
+
+Mon May 19 20:12:36 EDT 2003  athena
+  * [project @ 2003-05-20 00:12:36 by athena]
+  const
+
+    M ./dft/ct.c -2 +2
+
+Mon May 19 15:41:09 EDT 2003  athena
+  * [project @ 2003-05-19 19:41:09 by athena]
+  Implemented generic dif square transposed (q-style) solver.
+
+    M ./dft/dftw-dft.c -8 +88
+    M ./dft/directwsq.c -2 +2
+
+Mon May 19 07:00:36 EDT 2003  athena
+  * [project @ 2003-05-19 11:00:36 by athena]
+  applicable() is now a property of the solver (in anticipation of
+  transposed solvers)
+
+    M ./dft/dftw-dft.c -35 +35
+
+Mon May 19 06:33:40 EDT 2003  athena
+  * [project @ 2003-05-19 10:33:40 by athena]
+  Slight cleanup
+
+    M ./dft/dftw-dft.c -7 +7
+
+Sun May 18 13:05:51 EDT 2003  athena
+  * [project @ 2003-05-18 17:05:51 by athena]
+  Nothing, really
+
+    M ./dft/bluestein.c -9 +3
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/primes.c -1 +9
+
+Sun May 18 09:05:20 EDT 2003  athena
+  * [project @ 2003-05-18 13:05:20 by athena]
+  Moved vector loop inside bytwiddle(), in anticipation of
+  a q-style dftw-dit transposed solver.
+
+    M ./dft/dftw-dft.c -38 +41
+
+Sun May 18 08:52:02 EDT 2003  athena
+  * [project @ 2003-05-18 12:52:02 by athena]
+  Fixed flops count
+
+    M ./dft/dftw-dft.c -1 +1
+
+Sun May 18 08:47:20 EDT 2003  athena
+  * [project @ 2003-05-18 12:47:20 by athena]
+  style
+
+    M ./dft/dftw-dft.c -5 +3
+
+Sun May 18 07:16:34 EDT 2003  athena
+  * [project @ 2003-05-18 11:16:34 by athena]
+  Faster inner loop.
+
+    M ./dft/dftw-dft.c -13 +25
+
+Sat May 17 08:02:38 EDT 2003  athena
+  * [project @ 2003-05-17 12:02:38 by athena]
+  Print vector length
+
+    M ./dft/dftw-dft.c -2 +2
+
+Sat May 17 07:55:33 EDT 2003  athena
+  * [project @ 2003-05-17 11:55:33 by athena]
+  Oops
+
+    M ./dft/dftw-dft.c -7 +7
+
+Sat May 17 07:50:35 EDT 2003  athena
+  * [project @ 2003-05-17 11:50:35 by athena]
+  Allow vl > 1
+
+    M ./dft/dftw-dft.c -7 +16
+
+Sat May 17 07:01:42 EDT 2003  athena
+  * [project @ 2003-05-17 11:01:42 by athena]
+  Radix can be derived from problem---no need to pre-specify it.
+
+    M ./dft/ctsq.c -21 +12
+
+Fri May 16 22:50:50 EDT 2003  stevenj
+  * [project @ 2003-05-17 02:50:50 by stevenj]
+  fixed comment
+
+    M ./kernel/transpose.c -1 +1
+
+Fri May 16 22:48:20 EDT 2003  stevenj
+  * [project @ 2003-05-17 02:48:20 by stevenj]
+  whoops, gcd should be static
+
+    M ./kernel/transpose.c -1 +1
+
+Fri May 16 22:40:32 EDT 2003  stevenj
+  * [project @ 2003-05-17 02:40:32 by stevenj]
+  more unrolling
+
+    M ./kernel/transpose.c -7 +31
+
+Fri May 16 20:03:48 EDT 2003  athena
+  * [project @ 2003-05-17 00:03:48 by athena]
+  Hack to avoid infinite recursion.
+
+    M ./dft/bluestein.c +3
+
+Fri May 16 19:52:43 EDT 2003  stevenj
+  * [project @ 2003-05-16 23:52:43 by stevenj]
+  consistency
+
+    M ./dft/codelet-dft.h -4 +4
+
+Fri May 16 19:45:15 EDT 2003  athena
+  * [project @ 2003-05-16 23:45:15 by athena]
+  Wrong comment.
+
+    M ./dft/bluestein.c -1 +1
+
+Fri May 16 19:45:03 EDT 2003  athena
+  * [project @ 2003-05-16 23:45:03 by athena]
+  Style.
+
+    M ./dft/bluestein.c -3 +3
+
+Fri May 16 18:35:27 EDT 2003  stevenj
+  * [project @ 2003-05-16 22:35:27 by stevenj]
+  punctuation
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Fri May 16 18:33:45 EDT 2003  stevenj
+  * [project @ 2003-05-16 22:33:45 by stevenj]
+  added allzero FAQ
+
+    M ./doc/FAQ/fftw-faq.bfnn +7
+
+Fri May 16 18:22:45 EDT 2003  stevenj
+  * [project @ 2003-05-16 22:22:45 by stevenj]
+  simplification: instead of cldb, just use cldf with inputs/output values swapped
+
+    M ./dft/bluestein.c -22 +11
+
+Fri May 16 15:47:17 EDT 2003  athena
+  * [project @ 2003-05-16 19:47:17 by athena]
+  Allow more general transform sizes.
+
+    M ./dft/bluestein.c -15 +21
+
+Fri May 16 14:22:37 EDT 2003  stevenj
+  * [project @ 2003-05-16 18:22:37 by stevenj]
+  slight change
+
+    M ./kernel/ifftw.h -4 +4
+
+Fri May 16 14:22:05 EDT 2003  stevenj
+  * [project @ 2003-05-16 18:22:05 by stevenj]
+  MS has __int64 type, not long long (grr)
+
+    M ./kernel/ifftw.h -1 +4
+
+Fri May 16 13:34:16 EDT 2003  athena
+  * [project @ 2003-05-16 17:34:16 by athena]
+  Fixed printout
+
+    M ./dft/ct.c -2 +4
+
+Fri May 16 13:23:00 EDT 2003  athena
+  * [project @ 2003-05-16 17:23:00 by athena]
+  Fixed flop count
+
+    M ./dft/bluestein.c +5
+
+Fri May 16 13:02:06 EDT 2003  athena
+  * [project @ 2003-05-16 17:02:06 by athena]
+  New bluestein solver
+
+    A ./dft/bluestein.c
+    M ./dft/Makefile.am -5 +6
+    M ./dft/bluestein.c +255
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+
+Fri May 16 09:51:05 EDT 2003  athena
+  * [project @ 2003-05-16 13:51:05 by athena]
+  Implemented generic radix.
+
+    M ./dft/ct.c -14 +35
+
+Fri May 16 08:19:38 EDT 2003  athena
+  * [project @ 2003-05-16 12:19:38 by athena]
+  Removed conditional branch from inner loop in generic.c
+
+    M ./dft/generic.c -18 +20
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/twiddle.c -1 +18
+
+Fri May 16 07:48:28 EDT 2003  athena
+  * [project @ 2003-05-16 11:48:28 by athena]
+  Simplified indexing
+
+    M ./dft/generic.c -29 +20
+
+Fri May 16 06:53:56 EDT 2003  athena
+  * [project @ 2003-05-16 10:53:56 by athena]
+  Better still.
+
+    M ./dft/generic.c -26 +37
+
+Fri May 16 06:24:31 EDT 2003  athena
+  * [project @ 2003-05-16 10:24:31 by athena]
+  Further improvement of generic solver
+
+    M ./dft/generic.c -8 +8
+
+Fri May 16 05:57:07 EDT 2003  athena
+  * [project @ 2003-05-16 09:57:07 by athena]
+  Cleanup
+
+    M ./dft/rader.c -54 +51
+
+Fri May 16 05:42:57 EDT 2003  athena
+  * [project @ 2003-05-16 09:42:57 by athena]
+  Cleanup
+
+    M ./dft/dftw-dft.c -30 +25
+
+Fri May 16 05:31:40 EDT 2003  athena
+  * [project @ 2003-05-16 09:31:40 by athena]
+  Generic now only works for odd sized.  Added check.
+
+    M ./dft/generic.c -1 +2
+
+Thu May 15 21:53:25 EDT 2003  athena
+  * [project @ 2003-05-16 01:53:25 by athena]
+  Increased GENERIC_MIN_BAD because of new algorithm.
+
+    M ./kernel/ifftw.h -2 +2
+
+Thu May 15 21:40:27 EDT 2003  athena
+  * [project @ 2003-05-16 01:40:27 by athena]
+  Much, much better.
+
+    M ./dft/generic.c -9 +30
+
+Thu May 15 21:25:00 EDT 2003  athena
+  * [project @ 2003-05-16 01:25:00 by athena]
+  Still trying to understand why rdft-generic-dit is faster
+  then dft-generic...
+
+    M ./dft/generic.c -9 +8
+
+Thu May 15 21:04:33 EDT 2003  athena
+  * [project @ 2003-05-16 01:04:33 by athena]
+  Nothing, really
+
+    M ./dft/generic.c -2 +2
+
+Thu May 15 20:59:45 EDT 2003  athena
+  * [project @ 2003-05-16 00:59:45 by athena]
+  Never be clever for the sake of being clever.
+
+    M ./dft/generic.c -1 +1
+
+Thu May 15 20:58:06 EDT 2003  athena
+  * [project @ 2003-05-16 00:58:06 by athena]
+  Simplified.  generic-dit is gone.  The solver is now out-of-place
+  only---buffering is done by the buffered solver.
+
+    M ./dft/generic.c -94 +45
+
+Thu May 15 19:18:18 EDT 2003  athena
+  * [project @ 2003-05-15 23:18:18 by athena]
+  rader-dit is gone.
+
+    M ./dft/rader.c -200
+
+Thu May 15 19:13:03 EDT 2003  athena
+  * [project @ 2003-05-15 23:13:03 by athena]
+  Cast
+
+    M ./dft/plan.c -2 +2
+
+Thu May 15 19:09:07 EDT 2003  athena
+  * [project @ 2003-05-15 23:09:07 by athena]
+  Introduced twiddle problem ``dftw''.  Changed most other things
+  to deal with this change.
+
+    A ./dft/ctsq.c
+    A ./dft/dftw-dft.c
+    A ./dft/directw.c
+    A ./dft/directwbuf.c
+    A ./dft/directwsq.c
+    A ./dft/problemw.c
+    M ./configure.ac -1 +1
+    M ./dft/Makefile.am -5 +5
+    M ./dft/buffered.c -2 +1
+    M ./dft/codelet-dft.h -13 +9
+    M ./dft/conf.c -1 +4
+    R ./dft/ct-dif.c
+    R ./dft/ct-dit.c
+    R ./dft/ct-ditbuf.c
+    R ./dft/ct-ditf.c
+    M ./dft/ct.c -91 +157
+    R ./dft/ct.h
+    M ./dft/ctsq.c +213
+    M ./dft/dft.h -7 +45
+    M ./dft/dftw-dft.c +297
+    M ./dft/directw.c +172
+    M ./dft/directwbuf.c +230
+    M ./dft/directwsq.c +148
+    M ./dft/kdft-dif.c -4 +5
+    M ./dft/kdft-difsq.c -3 +3
+    M ./dft/kdft-dit.c -5 +5
+    M ./dft/plan.c -1 +11
+    M ./dft/problemw.c +103
+    M ./dft/rader.c -1 +1
+    M ./dft/solve.c -1 +8
+
+Sat Aug 12 23:17:14 EDT 2006  Unknown tagger
+  tagged fftw-3-0-1-branch
+
+
+Thu May 15 18:47:18 EDT 2003  stevenj
+  * [project @ 2003-05-15 22:47:18 by stevenj]
+  whoops, X(safe_mulmod) not fftw_safe_mulmod
+
+    M ./kernel/primes.c -3 +3
+
+Thu May 15 16:53:16 EDT 2003  stevenj
+  * [project @ 2003-05-15 20:53:16 by stevenj]
+  add VC++ versions of asm
+
+    M ./simd/sse.c -1 +17
+    M ./simd/sse2.c -1 +17
+
+Thu May 15 15:03:06 EDT 2003  stevenj
+  * [project @ 2003-05-15 19:03:05 by stevenj]
+  VC++ reportedly supports the intel intrinsics, but requires __inline instead of __inline__
+
+    M ./simd/simd-sse.h -1 +4
+    M ./simd/simd-sse2.h -1 +5
+
+Thu May 15 14:32:06 EDT 2003  stevenj
+  * [project @ 2003-05-15 18:32:06 by stevenj]
+  precompute array indices with VC++
+
+    M ./kernel/ifftw.h -2 +2
+
+Wed May 14 21:57:39 EDT 2003  stevenj
+  * [project @ 2003-05-15 01:57:39 by stevenj]
+  added doc note
+
+    M ./acx_pthread.m4 -1 +5
+
+Wed May 14 19:45:54 EDT 2003  stevenj
+  * [project @ 2003-05-14 23:45:54 by stevenj]
+  autodetect windows
+
+    M ./threads/threads.c -3 +4
+
+Wed May 14 15:08:49 EDT 2003  stevenj
+  * [project @ 2003-05-14 19:08:49 by stevenj]
+  don't bother with #ifdef HAVE_CONFIG_H, since non-Unix users always forget to define it
+
+    M ./libbench2/getopt.c -2
+
+Tue May 13 16:58:07 EDT 2003  stevenj
+  * [project @ 2003-05-13 20:58:07 by stevenj]
+  VC++ uses __inline
+
+    M ./kernel/cycle.h -2 +2
+
+Tue May 13 14:51:26 EDT 2003  stevenj
+  * [project @ 2003-05-13 18:51:26 by stevenj]
+  added leak question
+
+    M ./doc/FAQ/fftw-faq.bfnn +8
+
+Mon May 12 18:26:51 EDT 2003  stevenj
+  * [project @ 2003-05-12 22:26:51 by stevenj]
+  LARGE_INTEGER needs windows.h (supposedly, there is some problem converting _itnt64 to double...damn MS and their nonstandard types)
+
+    M ./kernel/cycle.h -1 +2
+
+Mon May 12 18:22:16 EDT 2003  stevenj
+  * [project @ 2003-05-12 22:22:16 by stevenj]
+  whoops
+
+    M ./libbench2/timer.c -2 +2
+
+Mon May 12 17:16:19 EDT 2003  stevenj
+  * [project @ 2003-05-12 21:16:19 by stevenj]
+  added 256x256 to canonical list
+
+    M ./tools/fftw-wisdom.c -1 +1
+
+Mon May 12 07:02:06 EDT 2003  athena
+  * [project @ 2003-05-12 11:02:06 by athena]
+  Oops...
+
+    M ./kernel/transpose.c +1
+
+Sun May 11 11:04:46 EDT 2003  athena
+  * [project @ 2003-05-11 15:04:46 by athena]
+  Unrolled loops, changed cutoff
+
+    M ./kernel/transpose.c -23 +80
+
+Sun May 11 10:20:04 EDT 2003  athena
+  * [project @ 2003-05-11 14:20:04 by athena]
+  Do not multiply strides by 2 twice.
+
+    M ./tests/bench.c -16 +15
+
+Wed May  7 21:09:43 EDT 2003  stevenj
+  * [project @ 2003-05-08 01:09:43 by stevenj]
+  added 'make smallcheck'
+
+    M ./tests/Makefile.am +13
+
+Wed May  7 20:46:10 EDT 2003  stevenj
+  * [project @ 2003-05-08 00:46:10 by stevenj]
+  --without-cycle-counter becomes --with-slow-timer, updated docs
+
+    M ./configure.ac -3 +3
+    M ./doc/fftw3.texi -15 +15
+    M ./kernel/timer.c -3 +3
+
+Wed May  7 18:05:29 EDT 2003  stevenj
+  * [project @ 2003-05-07 22:05:29 by stevenj]
+  remove duplicate -openmp check; Sun requires -xopenmp
+
+    M ./configure.ac -5 +5
+
+Wed May  7 17:59:23 EDT 2003  stevenj
+  * [project @ 2003-05-07 21:59:23 by stevenj]
+  fixed compilation under Sun C++
+
+    M ./dft/ct-ditbuf.c -3 +3
+    M ./rdft/hc2hc-buf.c -3 +3
+
+Wed May  7 14:24:46 EDT 2003  athena
+  * [project @ 2003-05-07 18:24:46 by athena]
+  Use estimator if cycle counter is unavailable, regardless
+  of the FFTW_MEASURE/ESTIMATE setting.
+
+    M ./kernel/planner.c -3 +13
+    M ./kernel/timer.c -57 +67
+
+Tue May  6 23:15:34 EDT 2003  stevenj
+  * [project @ 2003-05-07 03:15:34 by stevenj]
+  _WIN32 (not __WIN32__) is always defined
+
+    M ./kernel/cycle.h -3 +2
+
+Tue May  6 23:11:52 EDT 2003  stevenj
+  * [project @ 2003-05-07 03:11:52 by stevenj]
+  minor cleanup
+
+    M ./kernel/cycle.h -2 +2
+
+Tue May  6 22:50:07 EDT 2003  stevenj
+  * [project @ 2003-05-07 02:50:07 by stevenj]
+  tentative VC++ stuff, some consolidation
+
+    M ./kernel/cycle.h -51 +56
+
+Tue May  6 12:17:56 EDT 2003  stevenj
+  * [project @ 2003-05-06 16:17:56 by stevenj]
+  made cycle.h more self-contained
+
+    M ./kernel/cycle.h -3 +13
+    M ./kernel/timer.c -15 +18
+
+Tue May  6 08:30:39 EDT 2003  athena
+  * [project @ 2003-05-06 12:30:39 by athena]
+  Use ``%'' flag to denote commutative operations.
+
+    M ./simd/simd-3dnow.h -3 +3
+    M ./simd/simd-sse.h -3 +3
+    M ./simd/simd-sse2.h -3 +3
+
+Mon May  5 20:42:30 EDT 2003  stevenj
+  * [project @ 2003-05-06 00:42:30 by stevenj]
+  MIT license, brief documentation
+
+    M ./kernel/cycle.h -12 +41
+
+Mon May  5 20:31:16 EDT 2003  stevenj
+  * [project @ 2003-05-06 00:31:16 by stevenj]
+  whoops, forgot f77_wisdom.f
+
+    M ./doc/Makefile.am -1 +1
+
+Sun May  4 19:37:09 EDT 2003  athena
+  * [project @ 2003-05-04 23:37:09 by athena]
+  Improved speed of accuracy test.
+
+    M ./dft/problem.c -6 +3
+    M ./libbench/mp.c -16 +26
+    M ./libbench2/bench.h -1 +2
+    M ./libbench2/mp.c -46 +92
+    M ./libbench2/verify-lib.c -1 +3
+    M ./rdft/problem2.c -3 +2
+
+Tue Apr 29 11:45:34 EDT 2003  athena
+  * [project @ 2003-04-29 15:45:34 by athena]
+  s390 cycle counter
+
+    M ./kernel/cycle.h -1 +21
+
+Sat Apr 26 12:26:15 EDT 2003  stevenj
+  * [project @ 2003-04-26 16:26:15 by stevenj]
+  forgot r2r directory
+
+    M ./doc/fftw3.texi -10 +10
+
+Fri Apr 25 20:52:23 EDT 2003  stevenj
+  * [project @ 2003-04-26 00:52:23 by stevenj]
+  delete unused files, since they don't compile any more
+
+    M ./rdft/Makefile.am -2
+    R ./rdft/vrank2-transpose.c
+    R ./rdft/vrank3-transpose.c
+
+Thu Apr 24 06:37:41 EDT 2003  athena
+  * [project @ 2003-04-24 10:37:41 by athena]
+  Better gcc code generation
+
+    M ./simd/simd-sse2.h -3 +5
+
+Wed Apr 23 15:30:50 EDT 2003  stevenj
+  * [project @ 2003-04-23 19:30:50 by stevenj]
+  ccc is the Compaq C compiler on Linux/alpha
+
+    M ./acinclude.m4 -1 +1
+
+Wed Apr 23 00:06:03 EDT 2003  stevenj
+  * [project @ 2003-04-23 04:06:03 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -3 +3
+
+Sat Aug 12 23:16:25 EDT 2006  Unknown tagger
+  tagged fftw-3-0
+
+
+Sat Apr 19 09:18:25 EDT 2003  athena
+  * [project @ 2003-04-19 13:18:25 by athena]
+  ia64 cycle counter with intel compiler.
+
+    M ./kernel/cycle.h -1 +19
+
+Fri Apr 18 18:27:30 EDT 2003  athena
+  * [project @ 2003-04-18 22:27:30 by athena]
+  More gcc bugs.  Sigh.
+
+    M ./doc/FAQ/fftw-faq.bfnn -4 +12
+
+Fri Apr 18 18:01:49 EDT 2003  athena
+  * [project @ 2003-04-18 22:01:49 by athena]
+  touch ChangeLog to observe GNU standards
+
+    M ./bootstrap.sh +1
+
+Fri Apr 18 18:01:12 EDT 2003  athena
+  * [project @ 2003-04-18 22:01:12 by athena]
+  We now build ChangeLog automatically at distribution time
+
+    R ./ChangeLog
+
+Fri Apr 18 18:00:17 EDT 2003  athena
+  * [project @ 2003-04-18 22:00:17 by athena]
+  Automatic ChangeLog hackery
+
+    M ./mkdist.sh +7
+
+Fri Apr 18 13:25:26 EDT 2003  stevenj
+  * [project @ 2003-04-18 17:25:26 by stevenj]
+  plural
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Fri Apr 18 13:25:01 EDT 2003  stevenj
+  * [project @ 2003-04-18 17:25:01 by stevenj]
+  updated
+
+    M ./NEWS +12
+
+Fri Apr 18 12:59:41 EDT 2003  athena
+  * [project @ 2003-04-18 16:59:41 by athena]
+  Updated
+
+    M ./ChangeLog -6789 +3415
+
+Fri Apr 18 11:48:39 EDT 2003  stevenj
+  * [project @ 2003-04-18 15:48:39 by stevenj]
+  a -> an
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Fri Apr 18 11:47:56 EDT 2003  stevenj
+  * [project @ 2003-04-18 15:47:56 by stevenj]
+  hyphen
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Fri Apr 18 11:47:28 EDT 2003  stevenj
+  * [project @ 2003-04-18 15:47:28 by stevenj]
+  comma
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Fri Apr 18 11:46:59 EDT 2003  stevenj
+  * [project @ 2003-04-18 15:46:59 by stevenj]
+  minor
+
+    M ./doc/FAQ/fftw-faq.bfnn -4 +4
+
+Fri Apr 18 10:37:31 EDT 2003  athena
+  * [project @ 2003-04-18 14:37:31 by athena]
+  Updated
+
+    M ./doc/FAQ/fftw-faq.bfnn -9 +36
+
+Fri Apr 18 10:14:59 EDT 2003  athena
+  * [project @ 2003-04-18 14:14:59 by athena]
+  New script that builds the distributions
+
+    A ./mkdist.sh
+
+Fri Apr 18 08:51:07 EDT 2003  athena
+  * [project @ 2003-04-18 12:51:07 by athena]
+  Oops again
+
+    M ./dft/simd/codelets/Makefile.am -4 +4
+
+Fri Apr 18 08:39:05 EDT 2003  athena
+  * [project @ 2003-04-18 12:39:05 by athena]
+  Oops, forgot -sign 1
+
+    M ./dft/simd/codelets/Makefile.am -4 +4
+
+Fri Apr 18 08:28:25 EDT 2003  athena
+  * [project @ 2003-04-18 12:28:25 by athena]
+  Reorganization of simd codelets
+
+    M ./configure.ac -4
+    M ./dft/simd/codelets/Makefile.am -136 +22
+    M ./dft/simd/n1b.c -3 +3
+    M ./dft/simd/n1b.h -3
+    M ./dft/simd/n1f.c -3 +3
+    M ./dft/simd/n1f.h -3
+    M ./dft/simd/n2b.c -3 +3
+    M ./dft/simd/n2b.h +3
+    M ./dft/simd/n2f.c -3 +3
+    M ./dft/simd/n2f.h +3
+
+Thu Apr 17 21:21:45 EDT 2003  athena
+  * [project @ 2003-04-18 01:21:45 by athena]
+  k7 assembly was not updated after conversion of opcnt from
+  int to double
+
+    M ./genfft-k7/gen_notw.ml -5 +5
+    M ./genfft-k7/gen_twiddle.ml -5 +6
+
+Thu Apr 17 19:15:53 EDT 2003  athena
+  * [project @ 2003-04-17 23:15:53 by athena]
+  Capital `X' looks bad in all-lowercase plans
+
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+
+Thu Apr 17 18:53:29 EDT 2003  athena
+  * [project @ 2003-04-17 22:53:29 by athena]
+  Removed redundant inline/noinline codelets
+
+    M ./dft/codelets/standard/Makefile.am -3 +2
+    M ./dft/simd/codelets/Makefile.am -27 +12
+    M ./rdft/codelets/hc2r/Makefile.am -8 +4
+    M ./rdft/codelets/r2hc/Makefile.am -8 +4
+
+Thu Apr 17 15:25:50 EDT 2003  athena
+  * [project @ 2003-04-17 19:25:50 by athena]
+  New noinline
+  Noinline real codelets
+
+    A ./genfft/gen_hc2r_noinline.ml
+    A ./genfft/gen_r2hc_noinline.ml
+    M ./genfft/Makefile.am -10 +24
+    M ./genfft/gen_hc2hc.ml -5 +5
+    M ./genfft/gen_hc2r.ml -48 +31
+    M ./genfft/gen_hc2r_noinline.ml +185
+    M ./genfft/gen_r2hc.ml -49 +31
+    M ./genfft/gen_r2hc_noinline.ml +181
+    M ./genfft/gen_r2r.ml -3 +3
+    M ./kernel/ifftw.h -4 +1
+    M ./rdft/codelets/hc2r/Makefile.am -1 +15
+    M ./rdft/codelets/r2hc/Makefile.am -1 +15
+    M ./support/Makefile.codelets -1 +3
+
+Thu Apr 17 15:23:03 EDT 2003  stevenj
+  * [project @ 2003-04-17 19:23:03 by stevenj]
+  more ideas
+
+    M ./TODO +6
+
+Thu Apr 17 13:18:45 EDT 2003  athena
+  * [project @ 2003-04-17 17:18:45 by athena]
+  Removed duplicate rules.
+
+    M ./dft/simd/codelets/Makefile.am -6
+
+Thu Apr 17 10:51:09 EDT 2003  athena
+  * [project @ 2003-04-17 14:51:09 by athena]
+  acx_pthread.m4 was not distributed
+
+    M ./Makefile.am -1 +1
+
+Thu Apr 17 07:21:17 EDT 2003  athena
+  * [project @ 2003-04-17 11:21:17 by athena]
+  Oops
+
+    M ./support/Makefile.codelets -1 +1
+
+Thu Apr 17 07:07:19 EDT 2003  athena
+  * [project @ 2003-04-17 11:07:19 by athena]
+  Both inlined and non-inlined notw codelets.
+
+    A ./genfft/gen_notw_noinline.ml
+    M ./dft/codelets/standard/Makefile.am -1 +10
+    M ./genfft/Makefile.am -11 +17
+    M ./genfft/gen_notw.ml -51 +37
+    M ./genfft/gen_notw_noinline.ml +164
+    M ./support/Makefile.codelets +1
+
+Thu Apr 17 06:44:21 EDT 2003  athena
+  * [project @ 2003-04-17 10:44:21 by athena]
+  Initial experiment with both inlined and non-inlined simd codelets.
+  Both are included for now.
+
+    A ./genfft/gen_notw_noinline_c.ml
+    M ./dft/simd/codelets/Makefile.am -3 +68
+    M ./genfft/Makefile.am -9 +17
+    M ./genfft/gen_notw_noinline_c.ml +160
+    M ./support/Makefile.codelets -1 +2
+
+Thu Apr 17 05:57:36 EDT 2003  athena
+  * [project @ 2003-04-17 09:57:36 by athena]
+  --enable-fma to build FMA distribution
+
+    M ./configure.ac +7
+    M ./support/Makefile.codelets -1 +7
+
+Wed Apr 16 17:21:53 EDT 2003  athena
+  * [project @ 2003-04-16 21:21:53 by athena]
+  Inline SIMD nontwiddle codelets
+
+    M ./genfft/gen_notw_c.ml -47 +41
+
+Wed Apr 16 16:18:29 EDT 2003  athena
+  * [project @ 2003-04-16 20:18:29 by athena]
+  Pathetic attempt at saving a couple of registers...
+
+    M ./simd/simd-sse.h -6 +9
+    M ./simd/simd-sse2.h -6 +9
+
+Wed Apr 16 15:51:27 EDT 2003  athena
+  * [project @ 2003-04-16 19:51:27 by athena]
+  for (i = 0; i < m; ++i)  ==>  for (i = m; i > 0; --i)
+  No proof of evidence that this is any faster, but just in case...
+
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_notw_c.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./genfft/gen_r2r.ml -3 +3
+    M ./genfft/gen_twiddle.ml -5 +5
+    M ./genfft/gen_twiddle_c.ml -5 +5
+
+Tue Apr 15 15:03:20 EDT 2003  stevenj
+  * [project @ 2003-04-15 19:03:20 by stevenj]
+  added hack to make sure that codelet loops are preferred to vecloop solvers in the estimator
+
+    M ./dft/vrank-geq1.c -1 +2
+    M ./rdft/vrank-geq1-rdft2.c -1 +2
+    M ./rdft/vrank-geq1.c -1 +2
+
+Tue Apr 15 14:53:44 EDT 2003  stevenj
+  * [project @ 2003-04-15 18:53:44 by stevenj]
+  use double for flops
+
+    M ./api/f77funcs.h -1 +1
+    M ./api/fftw3.h -2 +2
+    M ./api/flops.c -1 +1
+    M ./doc/fftw3.texi -3 +6
+    M ./kernel/ifftw.h -5 +5
+    M ./tests/bench.c -2 +2
+
+Tue Apr 15 14:51:50 EDT 2003  stevenj
+  * [project @ 2003-04-15 18:51:50 by stevenj]
+  metrowerks reportedly supports gcc assembly extensions on ppc
+
+    M ./kernel/cycle.h -2 +2
+
+Mon Apr 14 15:00:50 EDT 2003  athena
+  * [project @ 2003-04-14 19:00:48 by athena]
+  foo_CFLAGS generates some automake junk that breaks the build
+  on Redhat 7.3.  Screw it.
+
+    M ./dft/simd/Makefile.am -1 +1
+    M ./simd/Makefile.am -1 +1
+
+Mon Apr 14 12:22:59 EDT 2003  athena
+  * [project @ 2003-04-14 16:22:59 by athena]
+  Carefully check return status
+
+    M ./tests/check.pl -1 +21
+
+Sun Apr 13 16:46:12 EDT 2003  athena
+  * [project @ 2003-04-13 20:46:12 by athena]
+  Removed annoying -FMA() expressions.
+
+    M ./genfft/c.ml -2 +6
+    M ./genfft/simd.ml -2 +2
+    M ./kernel/ifftw.h -1 +9
+    M ./support/Makefile.codelets -1 +1
+
+Sat Apr 12 14:32:22 EDT 2003  athena
+  * [project @ 2003-04-12 18:32:22 by athena]
+  Major fma hackery
+
+    M ./kernel/ifftw.h -5 +17
+
+Sat Apr 12 14:25:43 EDT 2003  athena
+  * [project @ 2003-04-12 18:25:43 by athena]
+  Slight cleanup
+
+    M ./api/apiplan.c -8 +5
+
+Sat Apr 12 10:04:51 EDT 2003  athena
+  * [project @ 2003-04-12 14:04:51 by athena]
+  Updated version number
+
+    M ./configure.ac -1 +1
+
+Sat Apr 12 08:03:07 EDT 2003  athena
+  * [project @ 2003-04-12 12:03:07 by athena]
+  Damn autoconf
+
+    M ./acinclude.m4 -2 +2
+
+Sat Apr 12 07:54:20 EDT 2003  athena
+  * [project @ 2003-04-12 11:54:20 by athena]
+  Recognize all 74xx processors
+
+    M ./acinclude.m4 -4 +5
+
+Sat Apr 12 07:35:17 EDT 2003  athena
+  * [project @ 2003-04-12 11:35:17 by athena]
+  Detect 7400 processor.
+
+    M ./acinclude.m4 -3 +7
+
+Fri Apr 11 20:42:11 EDT 2003  athena
+  * [project @ 2003-04-12 00:42:11 by athena]
+  No need to check for gcc-2.95
+
+    M ./acinclude.m4 -4 +2
+
+Fri Apr 11 16:14:39 EDT 2003  stevenj
+  * [project @ 2003-04-11 20:14:39 by stevenj]
+  removed duplicate
+
+    M ./NEWS -3
+
+Fri Apr 11 08:45:37 EDT 2003  athena
+  * [project @ 2003-04-11 12:45:37 by athena]
+  mflops ==> ``mflops''
+
+    M ./libbench2/report.c -2 +2
+
+Fri Apr 11 07:00:53 EDT 2003  athena
+  * [project @ 2003-04-11 11:00:53 by athena]
+  Print setup time as well
+
+    M ./libbench2/report.c -3 +5
+
+Sat Aug 12 23:14:42 EDT 2006  Unknown tagger
+  tagged fftw-3-0-beta3
+
+
+Thu Apr 10 15:36:18 EDT 2003  athena
+  * [project @ 2003-04-10 19:36:15 by athena]
+  Enforce pointer equality for in-place problems.
+
+    M ./dft/problem.c -4 +8
+    M ./kernel/ifftw.h -1 +4
+    M ./rdft/problem.c -4 +3
+    M ./rdft/problem2.c -4 +7
+    M ./simd/taint.c -1 +8
+
+Wed Apr  9 17:47:54 EDT 2003  stevenj
+  * [project @ 2003-04-09 21:47:26 by stevenj]
+  updated
+
+    M ./ChangeLog +406
+    M ./NEWS +28
+
+Wed Apr  9 14:53:38 EDT 2003  stevenj
+  * [project @ 2003-04-09 18:53:38 by stevenj]
+  cross-ref fftw-wisdom man page
+
+    M ./tests/README -1 +1
+
+Wed Apr  9 10:13:00 EDT 2003  athena
+  * [project @ 2003-04-09 14:13:00 by athena]
+  Undone previous change, committed by mistake.
+
+    M ./kernel/planner.c -2 +1
+
+Wed Apr  9 10:12:24 EDT 2003  athena
+  * [project @ 2003-04-09 14:12:18 by athena]
+  Quick and dirty README for bench
+
+    A ./tests/README
+    M ./kernel/planner.c -1 +2
+    M ./tests/Makefile.am -1 +1
+    M ./tests/README +57
+
+Wed Apr  9 08:50:25 EDT 2003  athena
+  * [project @ 2003-04-09 12:50:25 by athena]
+  Consider additional command-line arguments as problems to be
+  benchmarked.
+
+    M ./libbench2/bench-main.c -7 +6
+    M ./libbench2/timer.c -1 +7
+
+Wed Apr  9 08:44:13 EDT 2003  athena
+  * [project @ 2003-04-09 12:44:13 by athena]
+  Default report format is now human-readable.  Removed
+  unnecessary complexity in benchmark reporting.
+
+    M ./libbench2/bench-main.c -18 +7
+    M ./libbench2/bench.h -5 +2
+    M ./libbench2/report.c -71 +39
+
+Wed Apr  9 06:10:40 EDT 2003  athena
+  * [project @ 2003-04-09 10:10:40 by athena]
+  Updated for new interleaved/split api.
+
+    M ./doc/fftw3.texi -100 +180
+
+Wed Apr  9 03:01:03 EDT 2003  stevenj
+  * [project @ 2003-04-09 07:01:03 by stevenj]
+  updated citation
+
+    M ./doc/fftw3.texi -3 +3
+
+Tue Apr  8 19:35:59 EDT 2003  athena
+  * [project @ 2003-04-08 23:35:59 by athena]
+  Time for beta3
+
+    M ./configure.ac -1 +1
+
+Tue Apr  8 17:40:59 EDT 2003  stevenj
+  * [project @ 2003-04-08 21:40:59 by stevenj]
+  whoops, added
+
+    A ./reodft/redft00e-r2hc-pad.c
+
+Tue Apr  8 17:33:47 EDT 2003  stevenj
+  * [project @ 2003-04-08 21:33:47 by stevenj]
+  more comparison of different R*DFT types
+
+    M ./doc/fftw3.texi -1 +32
+
+Tue Apr  8 16:48:08 EDT 2003  stevenj
+  * [project @ 2003-04-08 20:48:08 by stevenj]
+  comments
+
+    M ./reodft/redft00e-r2hc.c -5 +18
+    M ./reodft/rodft00e-r2hc.c -5 +18
+
+Tue Apr  8 16:19:39 EDT 2003  stevenj
+  * [project @ 2003-04-08 20:19:39 by stevenj]
+  more accurate DCT-I and DST-I, at the expense of up to a factor of 2 in speed and memory
+
+    A ./reodft/rodft00e-r2hc-pad.c
+    M ./reodft/Makefile.am -4 +5
+    M ./reodft/conf.c -3 +5
+    M ./reodft/reodft.h +2
+    M ./reodft/rodft00e-r2hc-pad.c +200
+
+Tue Apr  8 05:38:09 EDT 2003  athena
+  * [project @ 2003-04-08 09:38:09 by athena]
+  Workaround gcc/sparc bug
+
+    M ./kernel/planner.c -2 +3
+
+Tue Apr  8 01:34:12 EDT 2003  stevenj
+  * [project @ 2003-04-08 05:34:12 by stevenj]
+  rumors
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Apr  7 18:54:11 EDT 2003  stevenj
+  * [project @ 2003-04-07 22:54:11 by stevenj]
+  added rdft2 paranoid mode
+
+    M ./tests/hook.c +33
+
+Mon Apr  7 18:47:37 EDT 2003  stevenj
+  * [project @ 2003-04-07 22:47:37 by stevenj]
+  added paranoid mode for r2r
+
+    M ./tests/hook.c +51
+
+Mon Apr  7 15:10:08 EDT 2003  stevenj
+  * [project @ 2003-04-07 19:10:08 by stevenj]
+  whoops, sincos is predefined on some systems
+
+    M ./libbench2/verify-r2r.c -3 +3
+
+Sat Apr  5 16:50:57 EST 2003  athena
+  * [project @ 2003-04-05 21:50:57 by athena]
+  bp->destroy_input was not initialized
+
+    M ./tests/hook.c +1
+
+Sat Apr  5 09:29:11 EST 2003  athena
+  * [project @ 2003-04-05 14:29:11 by athena]
+  Asserted correctness conditions for tainted pointers.
+  
+  (For now, use CK() while we test.  They should be changed into
+  A() at some point.)
+
+    M ./dft/problem.c -7 +16
+    M ./kernel/ifftw.h -1 +3
+    M ./rdft/problem.c -3 +7
+    M ./rdft/problem2.c -6 +12
+
+Sat Apr  5 08:18:23 EST 2003  athena
+  * [project @ 2003-04-05 13:18:23 by athena]
+  Untaint pointers before zero'ing arrays and before hashing
+
+    M ./dft/problem.c -8 +10
+    M ./rdft/problem.c -4 +4
+    M ./rdft/problem2.c -9 +9
+
+Sat Apr  5 07:11:56 EST 2003  athena
+  * [project @ 2003-04-05 12:11:56 by athena]
+  Alignment check did not work with icc, which seems to be
+  confused by the fact that the variable is not used.
+
+    M ./libbench2/bench-main.c -5 +4
+
+Sat Apr  5 06:41:20 EST 2003  athena
+  * [project @ 2003-04-05 11:41:20 by athena]
+  More paranoid paranoid-check
+
+    M ./tests/Makefile.am -1 +7
+
+Sat Apr  5 06:19:25 EST 2003  athena
+  * [project @ 2003-04-05 11:19:25 by athena]
+  0 == x & 7 parses as (0 == x) & 7, which is wrong
+
+    M ./kernel/ifftw.h -2 +2
+
+Fri Apr  4 21:35:49 EST 2003  stevenj
+  * [project @ 2003-04-05 02:35:49 by stevenj]
+  alignment checks
+
+    M ./dft/direct.c -1 +2
+    M ./kernel/ifftw.h -1 +12
+    M ./kernel/planner.c -1 +2
+    M ./libbench2/bench-main.c -1 +3
+    M ./rdft/direct.c -1 +4
+    M ./rdft/direct2.c -1 +3
+
+Fri Apr  4 21:04:14 EST 2003  stevenj
+  * [project @ 2003-04-05 02:04:14 by stevenj]
+  prevent infinite loops in exhaustive planning
+
+    M ./rdft/rdft-dht.c -1 +6
+
+Fri Apr  4 20:58:20 EST 2003  stevenj
+  * [project @ 2003-04-05 01:58:20 by stevenj]
+  split/unsplit guru interface
+
+    A ./api/execute-split-dft-c2r.c
+    A ./api/execute-split-dft-r2c.c
+    A ./api/execute-split-dft.c
+    A ./api/plan-guru-split-dft-c2r.c
+    A ./api/plan-guru-split-dft-r2c.c
+    A ./api/plan-guru-split-dft.c
+    M ./api/Makefile.am -1 +3
+    M ./api/api.h -2 +3
+    M ./api/apiplan.c -3 +4
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -3 +6
+    M ./api/execute-split-dft-c2r.c +29
+    M ./api/execute-split-dft-r2c.c +29
+    M ./api/execute-split-dft.c +29
+    M ./api/f77funcs.h -8 +73
+    M ./api/fftw3.h -5 +25
+    M ./api/mktensor-iodims.c -3 +3
+    M ./api/plan-guru-dft-c2r.c -4 +9
+    M ./api/plan-guru-dft-r2c.c -4 +9
+    M ./api/plan-guru-dft.c -4 +10
+    M ./api/plan-guru-r2r.c -3 +4
+    M ./api/plan-guru-split-dft-c2r.c +40
+    M ./api/plan-guru-split-dft-r2c.c +39
+    M ./api/plan-guru-split-dft.c +39
+    M ./api/plan-many-dft-c2r.c -1 +1
+    M ./api/plan-many-dft-r2c.c -1 +1
+    M ./api/plan-many-dft.c -1 +1
+    M ./api/plan-many-r2r.c -1 +1
+    M ./tests/bench.c -29 +15
+
+Fri Apr  4 20:39:55 EST 2003  athena
+  * [project @ 2003-04-05 01:39:55 by athena]
+  Need UNTAINT in verifier too.
+
+    M ./tests/hook.c -4 +4
+
+Fri Apr  4 19:36:46 EST 2003  athena
+  * [project @ 2003-04-05 00:36:46 by athena]
+  Forgot #if HAVE_SIMD
+
+    M ./simd/taint.c -1 +5
+
+Fri Apr  4 19:30:37 EST 2003  athena
+  * [project @ 2003-04-05 00:30:37 by athena]
+  Keep track of two separate taint bits
+
+    A ./simd/taint.c
+    M ./api/fftw3.h -2 +2
+    M ./kernel/align.c -15 +1
+    M ./kernel/ifftw.h -2 +2
+    M ./simd/Makefile.am -1 +1
+    M ./simd/simd.h -2 +16
+    M ./simd/taint.c +33
+
+Fri Apr  4 19:16:32 EST 2003  stevenj
+  * [project @ 2003-04-05 00:16:32 by stevenj]
+  added NO_SIMD problem flag, made UNALIGNED an API issue (taints input pointers)
+
+    M ./api/api.h +2
+    M ./api/fftw3.h -1 +3
+    M ./api/mapflags.c -1 +1
+    M ./api/plan-guru-dft-c2r.c -1 +3
+    M ./api/plan-guru-dft-r2c.c -1 +3
+    M ./api/plan-guru-dft.c -1 +4
+    M ./api/plan-guru-r2r.c -1 +2
+    M ./api/plan-many-dft-c2r.c -1 +3
+    M ./api/plan-many-dft-r2c.c -1 +3
+    M ./api/plan-many-dft.c -1 +4
+    M ./api/plan-many-r2r.c -1 +2
+    M ./dft/k7/k7.c -5 +13
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/n2b.c +1
+    M ./dft/simd/n2f.c +1
+    M ./dft/simd/q1b.c -1 +1
+    M ./dft/simd/q1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+    M ./kernel/ifftw.h -3 +3
+    M ./tests/bench.c +1
+
+Fri Apr  4 18:14:14 EST 2003  stevenj
+  * [project @ 2003-04-04 23:14:14 by stevenj]
+  bugfix in buffered: wrong pointers passed for cldrest; also use TAINT instead of UNALIGNED in buffered2
+
+    M ./dft/buffered.c -6 +10
+    M ./rdft/buffered.c -6 +10
+    M ./rdft/buffered2.c -7 +5
+
+Fri Apr  4 17:19:51 EST 2003  athena
+  * [project @ 2003-04-04 22:19:51 by athena]
+  Reverted previous change, committed accidentally
+
+    M ./dft/vrank-geq1.c -3 +3
+
+Fri Apr  4 17:18:39 EST 2003  athena
+  * [project @ 2003-04-04 22:18:39 by athena]
+  What was I thinking?
+
+    M ./kernel/align.c -1 +2
+
+Fri Apr  4 17:18:21 EST 2003  athena
+  * [project @ 2003-04-04 22:18:21 by athena]
+
+    M ./dft/vrank-geq1.c -3 +3
+
+Fri Apr  4 16:48:32 EST 2003  stevenj
+  * [project @ 2003-04-04 21:48:32 by stevenj]
+  added --enable-debug-alignment
+
+    M ./configure.ac +5
+    M ./libbench2/aligned-main.c -1 +7
+
+Fri Apr  4 16:29:43 EST 2003  stevenj
+  * [project @ 2003-04-04 21:29:43 by stevenj]
+  X(taint) prototype, define corresponding function only if HAVE_SIMD
+
+    M ./kernel/align.c -1 +3
+    M ./kernel/ifftw.h -1 +2
+
+Fri Apr  4 16:15:53 EST 2003  athena
+  * [project @ 2003-04-04 21:15:53 by athena]
+  Initial checkin of tained pointers
+
+    M ./dft/buffered.c -4 +9
+    M ./dft/solve.c -2 +4
+    M ./dft/vrank-geq1.c -3 +3
+    M ./kernel/align.c -6 +4
+    M ./kernel/ifftw.h -2 +9
+    M ./rdft/buffered.c -4 +4
+    M ./rdft/buffered2.c -3 +2
+    M ./rdft/solve.c -2 +2
+    M ./rdft/solve2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -3 +4
+    M ./rdft/vrank-geq1.c -3 +3
+
+Fri Apr  4 13:12:58 EST 2003  athena
+  * [project @ 2003-04-04 18:12:52 by athena]
+  More conservative preservation of alignment
+
+    M ./dft/buffered.c -1 +2
+    M ./dft/rader.c -4 +3
+    M ./dft/simd/n2b.c -1
+    M ./dft/simd/n2f.c -1
+    M ./dft/vrank-geq1.c -6 +4
+    M ./kernel/align.c -35 +8
+    M ./kernel/ifftw.h -3 +2
+    M ./rdft/buffered.c -1 +2
+    M ./rdft/buffered2.c -1 +4
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -5 +3
+    M ./rdft/vrank-geq1.c -5 +5
+
+Thu Apr  3 23:16:27 EST 2003  stevenj
+  * [project @ 2003-04-04 04:16:27 by stevenj]
+  plan/execute with aligned stack
+
+    M ./api/apiplan.c -19 +24
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/f77funcs.h -10 +10
+
+Thu Apr  3 15:40:01 EST 2003  stevenj
+  * [project @ 2003-04-03 20:40:01 by stevenj]
+  whoops, missed FFTW_MEASURE in fftw3.f
+
+    M ./api/Makefile.am -1 +1
+
+Thu Apr  3 13:44:46 EST 2003  stevenj
+  * [project @ 2003-04-03 18:44:46 by stevenj]
+  use WITH_ALIGNED_STACK for experimental semaphore stuff, too
+
+    M ./threads/threads.c -8 +3
+
+Thu Apr  3 09:04:23 EST 2003  athena
+  * [project @ 2003-04-03 14:04:23 by athena]
+  Removed old file
+
+    R ./kernel/stack.c
+
+Thu Apr  3 07:50:43 EST 2003  athena
+  * [project @ 2003-04-03 12:50:43 by athena]
+  Improved stack-alignment hack
+
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -4 +52
+    M ./threads/ct-dit.c -7 +6
+    M ./threads/dft-vrank-geq1.c -7 +6
+    M ./threads/hc2hc-dif.c -7 +6
+    M ./threads/hc2hc-dit.c -7 +6
+    M ./threads/rdft-vrank-geq1.c -6 +4
+    M ./threads/threads.h -5
+    M ./threads/vrank-geq1-rdft2.c -7 +6
+
+Thu Apr  3 02:37:57 EST 2003  stevenj
+  * [project @ 2003-04-03 07:37:57 by stevenj]
+  use aligned stack for experimental semaphores, too
+
+    M ./threads/threads.c -2 +8
+
+Thu Apr  3 02:17:58 EST 2003  stevenj
+  * [project @ 2003-04-03 07:17:58 by stevenj]
+  whoops
+
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/stack.c -2 +2
+    M ./threads/ct-dit.c -2 +4
+    M ./threads/dft-vrank-geq1.c -2 +4
+    M ./threads/hc2hc-dif.c -2 +4
+    M ./threads/hc2hc-dit.c -2 +4
+    M ./threads/rdft-vrank-geq1.c -2 +4
+    M ./threads/threads.c -24 +3
+    M ./threads/threads.h +5
+    M ./threads/vrank-geq1-rdft2.c -2 +4
+
+Thu Apr  3 01:58:32 EST 2003  stevenj
+  * [project @ 2003-04-03 06:58:32 by stevenj]
+  fix(?) for SIMD thread problems
+
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/stack.c -3 +3
+    M ./threads/ct-dit.c -7 +10
+    M ./threads/hc2hc-dif.c -2 +2
+    M ./threads/hc2hc-dit.c -2 +2
+    M ./threads/threads.c -6 +27
+
+Wed Apr  2 20:33:12 EST 2003  stevenj
+  * [project @ 2003-04-03 01:33:12 by stevenj]
+  noted n=1 REDFT01 case
+
+    M ./doc/fftw3.texi -1 +11
+
+Wed Apr  2 20:32:07 EST 2003  stevenj
+  * [project @ 2003-04-03 01:32:07 by stevenj]
+  note about n=2 REDFT00 formula
+
+    M ./doc/fftw3.texi -2 +3
+
+Wed Apr  2 20:30:10 EST 2003  stevenj
+  * [project @ 2003-04-03 01:30:10 by stevenj]
+  note about undefined REDFT00
+
+    M ./doc/fftw3.texi -1 +2
+
+Wed Apr  2 20:18:03 EST 2003  stevenj
+  * [project @ 2003-04-03 01:18:03 by stevenj]
+  noted n=1 RODFT01 case
+
+    M ./doc/fftw3.texi -2 +12
+
+Wed Apr  2 20:14:07 EST 2003  stevenj
+  * [project @ 2003-04-03 01:14:07 by stevenj]
+  corrected definitions
+
+    M ./doc/equation-redft11.png
+    M ./doc/equation-rodft01.png
+    M ./doc/equation-rodft11.png
+    M ./doc/fftw3.texi -6 +6
+
+Wed Apr  2 19:43:59 EST 2003  stevenj
+  * [project @ 2003-04-03 00:43:59 by stevenj]
+  added REODFT_KINDP, fixed nontrivial test for R2HC11 and HC2R11 (not that we support these yet anyway)
+
+    M ./rdft/codelet-rdft.h +1
+    M ./rdft/problem.c -3 +3
+    M ./rdft/vrank-geq1.c -3 +3
+
+Wed Apr  2 19:16:54 EST 2003  stevenj
+  * [project @ 2003-04-03 00:16:53 by stevenj]
+  size 2 hc2r and dht are equivalent to r2hc
+
+    M ./rdft/codelets/hc2r/Makefile.am -3 +5
+    M ./rdft/problem.c -3 +5
+
+Wed Apr  2 15:09:08 EST 2003  stevenj
+  * [project @ 2003-04-02 20:09:08 by stevenj]
+  noted overwriting in upgrading section
+
+    M ./doc/fftw3.texi -1 +4
+
+Wed Apr  2 05:25:56 EST 2003  athena
+  * [project @ 2003-04-02 10:25:56 by athena]
+  Moved with_aligned_stack  to its own file
+
+    A ./kernel/stack.c
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/align.c -46 +1
+    M ./kernel/stack.c +67
+
+Tue Apr  1 21:11:31 EST 2003  athena
+  * [project @ 2003-04-02 02:11:31 by athena]
+  Fixed comments
+
+    M ./kernel/align.c -6 +2
+    M ./libbench2/aligned-main.c -6 +2
+
+Tue Apr  1 20:57:39 EST 2003  athena
+  * [project @ 2003-04-02 01:57:39 by athena]
+  Alignment hacks
+
+    M ./kernel/align.c -1 +50
+    M ./kernel/ifftw.h -2 +3
+    M ./libbench2/aligned-main.c -26 +14
+    M ./libbench2/bench-main.c -1 +9
+
+Tue Apr  1 14:26:48 EST 2003  stevenj
+  * [project @ 2003-04-01 19:26:48 by stevenj]
+  phew, no, previous version was okay
+
+    M ./threads/threads.c -1 +1
+
+Tue Apr  1 14:26:15 EST 2003  stevenj
+  * [project @ 2003-04-01 19:26:15 by stevenj]
+  whoops, crap
+
+    M ./threads/threads.c -2 +2
+
+Tue Apr  1 08:01:06 EST 2003  athena
+  * [project @ 2003-04-01 13:01:06 by athena]
+  support sse2 in forthcoming gcc-3.3
+
+    M ./simd/simd-sse2.h -3 +11
+
+Tue Apr  1 01:17:15 EST 2003  stevenj
+  * [project @ 2003-04-01 06:17:15 by stevenj]
+  comment
+
+    M ./kernel/cycle.h -2 +2
+
+Tue Apr  1 01:16:46 EST 2003  stevenj
+  * [project @ 2003-04-01 06:16:46 by stevenj]
+  noted ac_check_headers
+
+    M ./kernel/cycle.h -1 +4
+
+Tue Apr  1 01:11:31 EST 2003  stevenj
+  * [project @ 2003-04-01 06:11:31 by stevenj]
+  comment
+
+    M ./kernel/cycle.h -1 +2
+
+Tue Apr  1 01:06:53 EST 2003  stevenj
+  * [project @ 2003-04-01 06:06:53 by stevenj]
+  documented autoconf tests, so that cycle.h can be distributed separately
+
+    M ./kernel/cycle.h -1 +27
+
+Sat Aug 12 23:11:17 EDT 2006  Unknown tagger
+  tagged fftw-3-0-beta2
+
+
+Mon Mar 31 22:12:02 EST 2003  stevenj
+  * [project @ 2003-04-01 03:12:02 by stevenj]
+  IRIX is all-caps
+
+    M ./NEWS -1 +1
+
+Mon Mar 31 22:11:42 EST 2003  stevenj
+  * [project @ 2003-04-01 03:11:42 by stevenj]
+  noted Irix fix
+
+    M ./NEWS +3
+
+Mon Mar 31 22:10:33 EST 2003  stevenj
+  * [project @ 2003-04-01 03:10:33 by stevenj]
+  whoops
+
+    M ./threads/api.c -1 +1
+    M ./threads/threads.h -1 +1
+
+Mon Mar 31 22:04:35 EST 2003  stevenj
+  * [project @ 2003-04-01 03:04:35 by stevenj]
+  use ithreads_init so as not to confuse fftw 2 users
+
+    M ./threads/threads.c -6 +8
+
+Mon Mar 31 22:00:42 EST 2003  stevenj
+  * [project @ 2003-04-01 03:00:42 by stevenj]
+  IRIX lossage
+
+    M ./threads/threads.c -1 +5
+
+Mon Mar 31 21:19:20 EST 2003  stevenj
+  * [project @ 2003-04-01 02:19:20 by stevenj]
+  check for -openmp (icc) among the OpenMP flags (TODO: make this a
+  separate macro, with a loop instead of repeated checks)
+
+    M ./configure.ac +5
+
+Mon Mar 31 17:12:19 EST 2003  stevenj
+  * [project @ 2003-03-31 22:12:19 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -1 +4
+
+Mon Mar 31 17:01:16 EST 2003  athena
+  * [project @ 2003-03-31 22:01:16 by athena]
+  More liberal test for solaris CC
+
+    M ./acinclude.m4 -1 +1
+
+Mon Mar 31 15:13:33 EST 2003  athena
+  * [project @ 2003-03-31 20:13:33 by athena]
+  Allow x86-64 simd
+
+    M ./simd/simd-sse.h -1 +1
+    M ./simd/simd-sse2.h -1 +1
+
+Mon Mar 31 15:13:21 EST 2003  athena
+  * [project @ 2003-03-31 20:13:21 by athena]
+  Added x86-64 timer code
+
+    M ./kernel/cycle.h -1 +22
+
+Mon Mar 31 13:10:54 EST 2003  stevenj
+  * [project @ 2003-03-31 18:10:54 by stevenj]
+  updated
+
+    M ./NEWS +4
+
+Mon Mar 31 13:07:19 EST 2003  stevenj
+  * [project @ 2003-03-31 18:07:19 by stevenj]
+  updated
+
+    M ./ChangeLog -1378 +3497
+
+Mon Mar 31 13:05:27 EST 2003  stevenj
+  * [project @ 2003-03-31 18:05:27 by stevenj]
+  colon
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Mon Mar 31 07:20:20 EST 2003  athena
+  * [project @ 2003-03-31 12:20:20 by athena]
+  Reorganized compiler bugs section (which is growing out of control)
+
+    M ./doc/FAQ/fftw-faq.bfnn -11 +18
+
+Mon Mar 31 07:15:20 EST 2003  athena
+  * [project @ 2003-03-31 12:15:20 by athena]
+  solaris gcc bug appears to be also in 2.95.2
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Mon Mar 31 07:13:45 EST 2003  athena
+  * [project @ 2003-03-31 12:13:45 by athena]
+  Workaround works---there is another gcc/sparc bug elsehwere
+
+    M ./kernel/planner.c -3 +3
+
+Mon Mar 31 07:08:56 EST 2003  athena
+  * [project @ 2003-03-31 12:08:56 by athena]
+  Grrr, workaround does not work.
+
+    M ./kernel/planner.c -3 +3
+
+Mon Mar 31 07:02:23 EST 2003  athena
+  * [project @ 2003-03-31 12:02:23 by athena]
+  ADDMOD is now function, which seems to avoid gcc bugs.
+
+    M ./kernel/planner.c -10 +16
+
+Sun Mar 30 16:40:26 EST 2003  athena
+  * [project @ 2003-03-30 21:40:26 by athena]
+  Workaround sparc gcc bug
+
+    M ./kernel/planner.c -3 +5
+
+Sun Mar 30 15:51:59 EST 2003  stevenj
+  * [project @ 2003-03-30 20:50:59 by stevenj]
+  note
+
+    M ./doc/fftw3.texi -4 +5
+
+Sun Mar 30 15:34:57 EST 2003  stevenj
+  * [project @ 2003-03-30 20:34:57 by stevenj]
+  make non-square UGLY, for now
+
+    M ./dft/vrank2-transpose.c -4 +4
+    M ./dft/vrank3-transpose.c -1 +4
+
+Sun Mar 30 15:33:57 EST 2003  stevenj
+  * [project @ 2003-03-30 20:33:57 by stevenj]
+  added -o amnesia to forget_wisdom before each plan
+
+    M ./tests/bench.c +5
+
+Sun Mar 30 09:41:27 EST 2003  athena
+  * [project @ 2003-03-30 14:41:27 by athena]
+  Report setup time in benchmark
+
+    M ./libbench2/bench-user.h -1 +4
+    M ./libbench2/report.c -2 +2
+    M ./libbench2/speed.c -1 +4
+
+Sat Mar 29 20:21:15 EST 2003  stevenj
+  * [project @ 2003-03-30 01:20:52 by stevenj]
+  comment
+
+    M ./kernel/transpose.c -1 +2
+
+Sat Mar 29 19:11:10 EST 2003  stevenj
+  * [project @ 2003-03-30 00:11:10 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -4 +4
+
+Sat Mar 29 18:46:16 EST 2003  athena
+  * [project @ 2003-03-29 23:46:16 by athena]
+  More relaxed definition of UGLYness
+
+    M ./kernel/ct.c -1 +1
+
+Sat Mar 29 15:28:01 EST 2003  stevenj
+  * [project @ 2003-03-29 20:28:01 by stevenj]
+  no more cvs id strings in header files...I'm tired of having to rebuild everything after a commit
+
+    M ./rdft/codelet-rdft.h -2
+    M ./rdft/hc2hc.h -2
+    M ./rdft/rdft.h -2
+    M ./reodft/reodft.h -2
+    M ./threads/threads.h -2
+
+Sat Mar 29 15:22:28 EST 2003  stevenj
+  * [project @ 2003-03-29 20:22:28 by stevenj]
+  rdft2 stride unification
+
+    A ./rdft/rdft2-strides.c
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/buffered2.c -20 +11
+    M ./rdft/direct2.c -5 +5
+    M ./rdft/rdft.h -1 +2
+    M ./rdft/rdft2-inplace-strides.c -9 +2
+    M ./rdft/rdft2-strides.c +38
+    M ./rdft/rdft2-tensor-max-index.c -7 +4
+    M ./rdft/vrank-geq1-rdft2.c -26 +9
+    M ./threads/vrank-geq1-rdft2.c -9 +3
+
+Sat Mar 29 14:38:23 EST 2003  stevenj
+  * [project @ 2003-03-29 19:38:23 by stevenj]
+  preserve in-place-ness
+
+    M ./rdft/vrank-geq1-rdft2.c -3 +3
+
+Sat Mar 29 14:23:31 EST 2003  stevenj
+  * [project @ 2003-03-29 19:23:31 by stevenj]
+  make nowisdom the default
+
+    M ./tests/Makefile.am -7 +7
+    M ./tests/bench.c -2 +2
+    M ./tests/check.pl -3 +3
+
+Sat Mar 29 14:13:18 EST 2003  athena
+  * [project @ 2003-03-29 19:13:18 by athena]
+  --verbose in paranoid-check produces too much output.  Make it quiet.
+
+    M ./tests/Makefile.am -1 +1
+
+Sat Mar 29 13:45:13 EST 2003  stevenj
+  * [project @ 2003-03-29 18:45:13 by stevenj]
+  fixed transpose bugs...need to check ri-ii before deciding whether Ntuple fits
+
+    M ./dft/vrank2-transpose.c -5 +9
+    M ./dft/vrank3-transpose.c -7 +13
+    M ./kernel/ifftw.h -3 +5
+    M ./kernel/transpose.c -11 +21
+
+Sat Mar 29 08:10:40 EST 2003  athena
+  * [project @ 2003-03-29 13:10:40 by athena]
+  try more 2^k
+
+    M ./tests/check.pl +2
+
+Sat Mar 29 08:05:41 EST 2003  athena
+  * [project @ 2003-03-29 13:05:41 by athena]
+  MIN_ALIGNMENT was defined after being used, causing crash in sse2.
+
+    M ./kernel/ifftw.h -4 +4
+
+Sat Mar 29 03:07:34 EST 2003  stevenj
+  * [project @ 2003-03-29 08:07:34 by stevenj]
+  real transposes are currently unused, and are not needed for MPI code either
+
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -2 +1
+    R ./kernel/tensor10.c
+    M ./kernel/transpose.c -180 +18
+    M ./rdft/Makefile.am -2 +3
+    M ./rdft/conf.c -3 +5
+
+Sat Mar 29 02:58:39 EST 2003  stevenj
+  * [project @ 2003-03-29 07:58:39 by stevenj]
+  added general transpose
+
+    A ./kernel/transpose.c
+    M ./dft/vrank2-transpose.c -9 +43
+    M ./dft/vrank3-transpose.c -25 +57
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -1 +13
+    M ./kernel/transpose.c +523
+
+Fri Mar 28 22:49:04 EST 2003  stevenj
+  * [project @ 2003-03-29 03:49:04 by stevenj]
+  added transposition option
+
+    M ./libbench2/problem.c -1 +18
+
+Fri Mar 28 22:09:22 EST 2003  stevenj
+  * [project @ 2003-03-29 03:09:22 by stevenj]
+  yikes, fixed incorrect applicability of transpose plans
+
+    A ./kernel/tensor10.c
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/tensor10.c +31
+    M ./rdft/vrank2-transpose.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+
+Fri Mar 28 22:06:14 EST 2003  stevenj
+  * [project @ 2003-03-29 03:06:14 by stevenj]
+  in the future, we might want to allow sz->rnk == 0, vecsz->rnk arbitrary to be converted to r2hc (the apply function already should work for this case)...disabled for now, though
+
+    M ./rdft/dft-r2hc.c -7 +21
+
+Fri Mar 28 19:12:08 EST 2003  stevenj
+  * [project @ 2003-03-29 00:12:08 by stevenj]
+  use most_unaligned in rdft2
+
+    M ./kernel/align.c -6 +1
+    M ./kernel/ifftw.h -2 +1
+    M ./rdft/vrank-geq1-rdft2.c -6 +11
+
+Fri Mar 28 19:11:47 EST 2003  stevenj
+  * [project @ 2003-03-29 00:11:47 by stevenj]
+  slight change
+
+    M ./tests/Makefile.am -2 +2
+
+Fri Mar 28 19:00:21 EST 2003  stevenj
+  * [project @ 2003-03-29 00:00:21 by stevenj]
+  output message when checks pass
+
+    M ./tests/Makefile.am +12
+
+Fri Mar 28 17:21:47 EST 2003  stevenj
+  * [project @ 2003-03-28 22:21:47 by stevenj]
+  added ifndef alloca around alloca stuff
+
+    M ./kernel/ifftw.h -1 +3
+
+Fri Mar 28 13:45:50 EST 2003  athena
+  * [project @ 2003-03-28 18:45:43 by athena]
+  Proper alignment in rader
+
+    M ./dft/rader.c -2 +4
+    M ./dft/vrank-geq1.c -4 +3
+    M ./kernel/align.c -3 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./rdft/dht-rader.c -2 +5
+
+Fri Mar 28 12:43:23 EST 2003  stevenj
+  * [project @ 2003-03-28 17:43:23 by stevenj]
+  whitespace
+
+    M ./kernel/ifftw.h -1 +2
+
+Fri Mar 28 12:41:39 EST 2003  stevenj
+  * [project @ 2003-03-28 17:41:39 by stevenj]
+  whoops, alloca stuff inside HAVE_ALLOCA
+
+    M ./kernel/ifftw.h -3 +4
+
+Fri Mar 28 12:35:21 EST 2003  stevenj
+  * [project @ 2003-03-28 17:35:21 by stevenj]
+  make check can afford to be a little bigger
+
+    M ./tests/Makefile.am -2 +2
+
+Fri Mar 28 12:31:32 EST 2003  stevenj
+  * [project @ 2003-03-28 17:31:32 by stevenj]
+  use same alloca macrology as configure script
+
+    M ./kernel/ifftw.h -28 +45
+
+Fri Mar 28 03:05:15 EST 2003  stevenj
+  * [project @ 2003-03-28 08:05:15 by stevenj]
+  fallback is no longer needed for mingw
+
+    M ./kernel/ifftw.h -6 +1
+
+Fri Mar 28 02:58:45 EST 2003  stevenj
+  * [project @ 2003-03-28 07:58:45 by stevenj]
+  alloca fallback for gcc
+
+    M ./kernel/ifftw.h -3 +4
+
+Fri Mar 28 02:49:59 EST 2003  stevenj
+  * [project @ 2003-03-28 07:49:59 by stevenj]
+  _alloca was added for MinGW, but it causes problems there
+
+    M ./configure.ac -1 +1
+
+Thu Mar 27 22:06:07 EST 2003  stevenj
+  * [project @ 2003-03-28 03:06:07 by stevenj]
+  fixed most_unaligned for split format
+
+    M ./kernel/align.c -5 +8
+
+Thu Mar 27 19:01:58 EST 2003  stevenj
+  * [project @ 2003-03-28 00:01:58 by stevenj]
+  whoops
+
+    M ./Makefile.am -1 +1
+
+Thu Mar 27 19:00:20 EST 2003  stevenj
+  * [project @ 2003-03-28 00:00:20 by stevenj]
+  added pkg-config
+
+    A ./fftw.pc.in
+    M ./Makefile.am -1 +6
+    M ./configure.ac +1
+    M ./fftw.pc.in +10
+
+Thu Mar 27 15:59:01 EST 2003  stevenj
+  * [project @ 2003-03-27 20:59:01 by stevenj]
+  fixed asserts
+
+    M ./dft/vrank-geq1.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +3
+    M ./rdft/vrank-geq1.c -2 +2
+
+Thu Mar 27 15:49:53 EST 2003  athena
+  * [project @ 2003-03-27 20:49:53 by athena]
+  Do not adjust r/i pointers separately.
+
+    M ./kernel/align.c -4 +5
+
+Thu Mar 27 15:17:40 EST 2003  athena
+  * [project @ 2003-03-27 20:17:40 by athena]
+  iForgot to add files
+
+    A ./dft/simd/n2b.h
+    A ./dft/simd/n2f.h
+    M ./dft/simd/n2b.h +25
+    M ./dft/simd/n2f.h +25
+
+Thu Mar 27 15:10:41 EST 2003  athena
+  * [project @ 2003-03-27 20:10:36 by athena]
+  Specialized n simd codelets for unit vector stride.
+
+    A ./dft/simd/n2b.c
+    A ./dft/simd/n2f.c
+    M ./configure.ac +4
+    M ./dft/simd/Makefile.am -2 +2
+    M ./dft/simd/codelets/Makefile.am -8 +54
+    M ./dft/simd/n1b.c -3 +3
+    M ./dft/simd/n1b.h +4
+    M ./dft/simd/n1f.c -3 +3
+    M ./dft/simd/n1f.h +4
+    M ./dft/simd/n2b.c +49
+    M ./dft/simd/n2f.c +49
+
+Thu Mar 27 08:22:03 EST 2003  athena
+  * [project @ 2003-03-27 13:22:03 by athena]
+  Changed version number to beta2
+
+    M ./configure.ac -1 +1
+
+Thu Mar 27 06:37:07 EST 2003  athena
+  * [project @ 2003-03-27 11:37:07 by athena]
+  Changed alignment requirements for n1 simd codelets.  Changed
+  mechanism for detecting lack of alignment.
+
+    M ./api/mapflags.c -1 +1
+    M ./dft/simd/n1b.c -4 +4
+    M ./dft/simd/n1b.h -3
+    M ./dft/simd/n1f.c -4 +4
+    M ./dft/simd/n1f.h -3
+    M ./dft/simd/q1b.c -1 +1
+    M ./dft/simd/q1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+    M ./dft/vrank-geq1.c -7 +7
+    M ./kernel/align.c -5 +42
+    M ./kernel/ifftw.h -3 +5
+    M ./rdft/vrank-geq1-rdft2.c -4 +7
+    M ./rdft/vrank-geq1.c -5 +5
+
+Thu Mar 27 04:25:06 EST 2003  athena
+  * [project @ 2003-03-27 09:25:06 by athena]
+  Oops, wrong place for hook
+
+    M ./tests/bench.c -2 +1
+
+Thu Mar 27 02:37:52 EST 2003  stevenj
+  * [project @ 2003-03-27 07:37:52 by stevenj]
+  added comments to codelet makefiles, to aid people wanting to generate their own code
+
+    M ./dft/codelets/inplace/Makefile.am -1 +23
+    M ./dft/codelets/standard/Makefile.am +24
+    M ./dft/k7/codelets/Makefile.am +25
+    M ./dft/simd/codelets/Makefile.am -1 +32
+    M ./rdft/codelets/hc2r/Makefile.am +25
+    M ./rdft/codelets/r2hc/Makefile.am +25
+    M ./rdft/codelets/r2r/Makefile.am +18
+
+Thu Mar 27 01:42:27 EST 2003  stevenj
+  * [project @ 2003-03-27 06:42:27 by stevenj]
+  Matteo is also a copyright holder
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Thu Mar 27 01:41:08 EST 2003  stevenj
+  * [project @ 2003-03-27 06:41:08 by stevenj]
+  FORTRAN is officially Fortran, these days
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +2
+
+Thu Mar 27 01:40:32 EST 2003  stevenj
+  * [project @ 2003-03-27 06:40:32 by stevenj]
+  punctuation
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Thu Mar 27 01:40:14 EST 2003  stevenj
+  * [project @ 2003-03-27 06:40:14 by stevenj]
+  don't use "wrapper"
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Thu Mar 27 01:37:53 EST 2003  stevenj
+  * [project @ 2003-03-27 06:37:53 by stevenj]
+  plural
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Thu Mar 27 01:35:32 EST 2003  stevenj
+  * [project @ 2003-03-27 06:35:32 by stevenj]
+  grammar
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Thu Mar 27 01:33:35 EST 2003  stevenj
+  * [project @ 2003-03-27 06:33:35 by stevenj]
+  better phrasing
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Wed Mar 26 22:47:58 EST 2003  stevenj
+  * [project @ 2003-03-27 03:47:58 by stevenj]
+  stddef.h should not be needed anymore for this file
+
+    M ./kernel/align.c -2 +1
+
+Wed Mar 26 22:13:48 EST 2003  stevenj
+  * [project @ 2003-03-27 03:13:48 by stevenj]
+  added comments for Franz mode
+
+    M ./dft/codelets/standard/Makefile.am -5 +5
+
+Wed Mar 26 22:11:58 EST 2003  stevenj
+  * [project @ 2003-03-27 03:10:11 by stevenj]
+  clarification
+
+    M ./dft/simd/codelets/Makefile.am -3 +4
+
+Wed Mar 26 22:08:51 EST 2003  stevenj
+  * [project @ 2003-03-27 03:08:51 by stevenj]
+  commented on FRANZ codelets
+
+    M ./dft/simd/codelets/Makefile.am -5 +5
+
+Wed Mar 26 22:06:45 EST 2003  stevenj
+  * [project @ 2003-03-27 03:06:45 by stevenj]
+  updated
+
+    M ./NEWS +23
+
+Wed Mar 26 21:51:15 EST 2003  stevenj
+  * [project @ 2003-03-27 02:51:15 by stevenj]
+  disable DIF codelets, since they are never used (apparently) except
+  for some non-power-of-two sizes...improve support for the latter by
+  adding size 3, 5, and 6 q^2 codelets.
+
+    M ./dft/codelets/inplace/Makefile.am -4 +8
+
+Wed Mar 26 20:07:11 EST 2003  stevenj
+  * [project @ 2003-03-27 01:07:11 by stevenj]
+  DHT has no forward/backward
+
+    M ./doc/fftw3.texi -3 +3
+
+Wed Mar 26 19:46:12 EST 2003  fftw
+  * [project @ 2003-03-27 00:46:12 by fftw]
+  added hacky way to use an arbitrary flag
+
+    M ./tests/bench.c +1
+
+Wed Mar 26 19:44:31 EST 2003  athena
+  * [project @ 2003-03-27 00:44:31 by athena]
+  Better place to install hook
+
+    M ./tests/bench.c -2 +2
+
+Wed Mar 26 19:40:28 EST 2003  stevenj
+  * [project @ 2003-03-27 00:40:28 by stevenj]
+  noted that the user should run make check if they think FFTW has a bug
+
+    M ./doc/FAQ/fftw-faq.bfnn -6 +8
+
+Wed Mar 26 17:31:16 EST 2003  athena
+  * [project @ 2003-03-26 22:31:16 by athena]
+  Oops, what am I thinking
+
+    M ./kernel/planner.c -5 +8
+
+Wed Mar 26 17:23:56 EST 2003  athena
+  * [project @ 2003-03-26 22:23:56 by athena]
+  Grrr.... fixed bug in estimator
+
+    M ./kernel/planner.c -8 +5
+
+Wed Mar 26 17:16:19 EST 2003  athena
+  * [project @ 2003-03-26 22:16:19 by athena]
+  Oops---the flop count was right.  The estimator is broken elsewhere.
+
+    M ./genfft/c.ml -5 +2
+
+Wed Mar 26 14:28:41 EST 2003  athena
+  * [project @ 2003-03-26 19:28:41 by athena]
+  Fixed SIMD estimator
+
+    M ./genfft/c.ml -2 +5
+
+Wed Mar 26 07:45:03 EST 2003  athena
+  * [project @ 2003-03-26 12:45:03 by athena]
+  Added twidsq simd codelets
+
+    A ./dft/simd/q1b.c
+    A ./dft/simd/q1b.h
+    A ./dft/simd/q1f.c
+    A ./dft/simd/q1f.h
+    A ./genfft/gen_twidsq_c.ml
+    M ./dft/simd/Makefile.am -1 +2
+    M ./dft/simd/codelets/Makefile.am -1 +12
+    M ./dft/simd/q1b.c +44
+    M ./dft/simd/q1b.h +25
+    M ./dft/simd/q1f.c +44
+    M ./dft/simd/q1f.h +25
+    M ./dft/simd/t1b.c +1
+    M ./dft/simd/t1f.c +1
+    M ./genfft/Makefile.am -8 +14
+    M ./genfft/gen_twiddle_c.ml -3 +3
+    M ./genfft/gen_twidsq_c.ml +195
+    M ./support/Makefile.codelets -1 +2
+
+Tue Mar 25 23:33:03 EST 2003  stevenj
+  * [project @ 2003-03-26 04:33:03 by stevenj]
+  gensrc -> genfft
+
+    M ./doc/fftw3.texi -2 +2
+
+Tue Mar 25 23:32:16 EST 2003  stevenj
+  * [project @ 2003-03-26 04:32:16 by stevenj]
+  newline
+
+    M ./TODO +1
+
+Tue Mar 25 19:17:08 EST 2003  athena
+  * [project @ 2003-03-26 00:17:08 by athena]
+  Noted need to add dif simd codelets
+
+    M ./TODO +1
+
+Tue Mar 25 13:03:47 EST 2003  stevenj
+  * [project @ 2003-03-25 18:03:47 by stevenj]
+  noted shift
+
+    M ./doc/fftw3.texi -3 +2
+
+Tue Mar 25 13:02:47 EST 2003  stevenj
+  * [project @ 2003-03-25 18:02:47 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -4 +14
+
+Tue Mar 25 12:46:44 EST 2003  stevenj
+  * [project @ 2003-03-25 17:46:44 by stevenj]
+  need make after bootstrap
+
+    M ./doc/fftw3.texi -2 +3
+
+Tue Mar 25 12:31:49 EST 2003  stevenj
+  * [project @ 2003-03-25 17:31:49 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -3 +3
+
+Tue Mar 25 12:30:56 EST 2003  stevenj
+  * [project @ 2003-03-25 17:30:56 by stevenj]
+  libtool is also needed
+
+    M ./doc/fftw3.texi -4 +4
+
+Tue Mar 25 12:29:52 EST 2003  stevenj
+  * [project @ 2003-03-25 17:29:52 by stevenj]
+  added code generator introduction
+
+    M ./doc/fftw3.texi -13 +63
+
+Tue Mar 25 11:51:49 EST 2003  stevenj
+  * [project @ 2003-03-25 16:51:49 by stevenj]
+  added support for REDFT/RODFT/DHT direct codelets
+
+    A ./genfft/gen_r2r.ml
+    A ./rdft/kr2r.c
+    A ./rdft/codelets/r2r/
+    A ./rdft/codelets/r2r.c
+    A ./rdft/codelets/r2r.h
+    A ./rdft/codelets/r2r/Makefile.am
+    M ./Makefile.am +1
+    M ./configure.ac +1
+    M ./genfft/Makefile.am -6 +6
+    M ./genfft/complex.ml -1 +8
+    M ./genfft/complex.mli -1 +2
+    M ./genfft/gen_r2r.ml +240
+    R ./genfft/gen_trig.ml
+    M ./genfft/trig.ml -60 +47
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/codelet-rdft.h -2 +2
+    M ./rdft/codelets/Makefile.am -2 +2
+    M ./rdft/codelets/r2r/Makefile.am +72
+    M ./rdft/codelets/r2r.c +38
+    M ./rdft/codelets/r2r.h +23
+    M ./rdft/conf.c -1 +2
+    M ./rdft/direct.c -16 +58
+    M ./rdft/kr2r.c +28
+    M ./rdft/rdft.h -1 +2
+    M ./support/Makefile.codelets -1 +2
+
+Tue Mar 25 11:29:29 EST 2003  stevenj
+  * [project @ 2003-03-25 16:29:29 by stevenj]
+  noted ARM bug; thanks to Jay Treacy
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +2
+
+Tue Mar 25 07:55:54 EST 2003  athena
+  * [project @ 2003-03-25 12:55:54 by athena]
+  bugfix from Stefan
+
+    M ./genfft-k7/vK7Optimization.ml -1 +2
+
+Mon Mar 24 15:59:08 EST 2003  stevenj
+  * [project @ 2003-03-24 20:59:08 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Mar 24 15:58:44 EST 2003  stevenj
+  * [project @ 2003-03-24 20:58:44 by stevenj]
+  caveat
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Mar 24 15:58:04 EST 2003  stevenj
+  * [project @ 2003-03-24 20:58:04 by stevenj]
+  warning about DHT
+
+    M ./doc/fftw3.texi -1 +8
+
+Mon Mar 24 08:34:14 EST 2003  athena
+  * [project @ 2003-03-24 13:34:14 by athena]
+  Oops
+
+    M ./dft/k7/codelets/Makefile.am -2 +2
+
+Mon Mar 24 08:13:15 EST 2003  athena
+  * [project @ 2003-03-24 13:13:15 by athena]
+  Regression test for p4fftwgel
+
+    M ./dft/k7/codelets/Makefile.am -2 +2
+    M ./tests/Makefile.am +3
+    M ./tests/check.pl +4
+
+Mon Mar 24 03:09:06 EST 2003  stevenj
+  * [project @ 2003-03-24 08:09:06 by stevenj]
+  make check is faster, old tests are in make bigcheck
+
+    M ./tests/Makefile.am -1 +6
+
+Sat Mar 22 00:41:21 EST 2003  stevenj
+  * [project @ 2003-03-22 05:41:21 by stevenj]
+  note
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Sat Mar 22 00:40:05 EST 2003  stevenj
+  * [project @ 2003-03-22 05:40:05 by stevenj]
+  whoops, line wrapping
+
+    M ./doc/FAQ/fftw-faq.bfnn -2 +1
+
+Fri Mar 21 15:10:00 EST 2003  athena
+  * [project @ 2003-03-21 20:10:00 by athena]
+  Franz-mode codelets even without SIMD.  (disabled)
+
+    M ./dft/codelets/standard/Makefile.am -2 +59
+    M ./genfft/gen_notw.ml -5 +6
+    M ./genfft/gen_twiddle.ml -4 +5
+
+Fri Mar 21 09:09:30 EST 2003  athena
+  * [project @ 2003-03-21 14:09:30 by athena]
+  Bug is in netbsd-1.6, not 1.5
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Fri Mar 21 07:45:48 EST 2003  athena
+  * [project @ 2003-03-21 12:45:48 by athena]
+  const cast, should placate c++ compilers.
+
+    M ./simd/simd-altivec.h -1 +1
+
+Thu Mar 20 18:49:49 EST 2003  stevenj
+  * [project @ 2003-03-20 23:49:49 by stevenj]
+  added FAQ on why plans are array-specific
+
+    M ./doc/FAQ/fftw-faq.bfnn +26
+
+Thu Mar 20 16:12:56 EST 2003  stevenj
+  * [project @ 2003-03-20 21:12:56 by stevenj]
+  comment fix
+
+    M ./reodft/reodft010e-r2hc.c -3 +3
+
+Thu Mar 20 16:12:15 EST 2003  stevenj
+  * [project @ 2003-03-20 21:12:15 by stevenj]
+  noted comparison to NR
+
+    M ./reodft/reodft010e-r2hc.c -1 +5
+
+Wed Mar 19 20:13:16 EST 2003  stevenj
+  * [project @ 2003-03-20 01:13:16 by stevenj]
+  whoops, C99 complex didn't work if complex is a macro (as it is with glibc); thanks to Keh-Cheng Chu for the bug report
+
+    M ./api/fftw3.h -5 +4
+
+Wed Mar 19 16:52:54 EST 2003  stevenj
+  * [project @ 2003-03-19 21:52:54 by stevenj]
+  noted in help that --enable-k7 enables 3dnow, and that --enable-3dnow is only a fallback
+
+    M ./configure.ac -2 +2
+
+Wed Mar 19 15:09:52 EST 2003  athena
+  * [project @ 2003-03-19 20:09:52 by athena]
+  New gcc bug.  html.refs was not in repository/distribution.
+
+    A ./doc/FAQ/html.refs
+    M ./doc/FAQ/Makefile.am -1 +1
+    M ./doc/FAQ/fftw-faq.bfnn -1 +13
+    M ./doc/FAQ/html.refs +6
+
+Wed Mar 19 10:09:16 EST 2003  athena
+  * [project @ 2003-03-19 15:09:16 by athena]
+  Don't write wisdom if you don't have it.
+
+    M ./tests/bench.c -1 +1
+
+Tue Mar 18 15:44:41 EST 2003  athena
+  * [project @ 2003-03-18 20:44:41 by athena]
+  Added index entries for DHT.  Similarly for DCT, DST
+
+    M ./doc/fftw3.texi -1 +6
+
+Tue Mar 18 14:50:04 EST 2003  stevenj
+  * [project @ 2003-03-18 19:50:04 by stevenj]
+  execute should not go through C api, for efficiency
+
+    M ./api/f77api.c +2
+    M ./api/f77funcs.h -5 +10
+
+Tue Mar 18 06:14:51 EST 2003  athena
+  * [project @ 2003-03-18 11:14:51 by athena]
+  Renamed FFTW_IODIM, FFTW_R2R_KIND
+
+    M ./api/fftw3.h -6 +6
+
+Tue Mar 18 00:30:17 EST 2003  stevenj
+  * [project @ 2003-03-18 05:30:17 by stevenj]
+  added rfftwnd.eps to dist, so that transfig is not required for people trying to build other formats (e.g. ps); thanks to Brian Gough for the bug report
+
+    M ./doc/Makefile.am -1 +1
+
+Mon Mar 17 15:17:59 EST 2003  stevenj
+  * [project @ 2003-03-17 20:17:59 by stevenj]
+  pointer to upgrading section from tutorial
+
+    M ./doc/fftw3.texi -1 +4
+
+Mon Mar 17 14:44:40 EST 2003  stevenj
+  * [project @ 2003-03-17 19:44:40 by stevenj]
+  make print_plan and fprint_plan, so that the former can be more easily called from other languages
+
+    M ./api/f77funcs.h -2 +2
+    M ./api/fftw3.h -2 +3
+    M ./api/print-plan.c -1 +6
+    M ./doc/fftw3.texi -3 +5
+    M ./tests/bench.c -1 +1
+
+Mon Mar 17 14:19:10 EST 2003  stevenj
+  * [project @ 2003-03-17 19:19:10 by stevenj]
+  whoops, forgot to change equation image links to .png
+
+    M ./doc/fftw3.texi -14 +14
+
+Mon Mar 17 04:15:50 EST 2003  athena
+  * [project @ 2003-03-17 09:15:50 by athena]
+  fixed c++ linkage problems
+
+    M ./api/fftw3.h -4 +4
+    M ./api/version.c -8 +5
+    M ./support/Makefile.codelets -1 +2
+
+Mon Mar 17 03:25:17 EST 2003  athena
+  * [project @ 2003-03-17 08:25:17 by athena]
+  Removed ``const'', otherwise c++ link fails
+
+    M ./api/fftw3.h -4 +4
+    M ./api/version.c -4 +4
+
+Sun Mar 16 20:24:31 EST 2003  stevenj
+  * [project @ 2003-03-17 01:24:31 by stevenj]
+  fixed C++ annoyances: void* casts, and global variables are static by default(?!?)
+
+    M ./api/f77api.c -4 +5
+    M ./api/f77funcs.h -3 +5
+    M ./api/version.c -4 +4
+    M ./libbench2/allocate.c -9 +9
+    M ./libbench2/getopt-utils.c -2 +2
+    M ./libbench2/problem.c -3 +3
+    M ./libbench2/speed.c -2 +2
+    M ./libbench2/timer.c -2 +2
+    M ./libbench2/verify-r2r.c -3 +3
+    M ./libbench2/zero.c -9 +9
+    M ./support/Makefile.codelets -1 +1
+    M ./tests/bench.c -37 +66
+    M ./tests/hook.c -2 +2
+    M ./tools/fftw-wisdom.c -1 +1
+
+Sat Aug 12 23:06:56 EDT 2006  Unknown tagger
+  tagged fftw-3-0-beta1
+
+
+Sun Mar 16 15:29:11 EST 2003  stevenj
+  * [project @ 2003-03-16 20:29:11 by stevenj]
+  ranlib bug is in binutils
+
+    M ./doc/FAQ/fftw-faq.bfnn -4 +5
+
+Sun Mar 16 15:26:42 EST 2003  stevenj
+  * [project @ 2003-03-16 20:26:42 by stevenj]
+  ranlib Irix bug
+
+    M ./doc/FAQ/fftw-faq.bfnn +5
+
+Sun Mar 16 15:13:35 EST 2003  stevenj
+  * [project @ 2003-03-16 20:13:35 by stevenj]
+  start with random tests
+
+    M ./tests/check.pl -1 +1
+
+Sun Mar 16 15:00:04 EST 2003  stevenj
+  * [project @ 2003-03-16 20:00:03 by stevenj]
+  silenced some compiler warnings, eliminated unused variables, and fixed Makefile.am for f77funcs.h
+
+    M ./api/Makefile.am -5 +1
+    M ./dft/direct.c -3 +2
+    M ./kernel/ifftw.h -4 +4
+    M ./libbench2/verify-r2r.c -3
+    M ./rdft/direct.c -3 +2
+    M ./rdft/direct2.c -3 +2
+    M ./threads/Makefile.am -3 +1
+
+Sun Mar 16 14:55:13 EST 2003  stevenj
+  * [project @ 2003-03-16 19:55:13 by stevenj]
+  whoops
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Sun Mar 16 14:28:22 EST 2003  stevenj
+  * [project @ 2003-03-16 19:28:22 by stevenj]
+  3dnow is float
+
+    M ./doc/fftw3.texi -2 +2
+
+Sun Mar 16 14:27:45 EST 2003  stevenj
+  * [project @ 2003-03-16 19:27:45 by stevenj]
+  fixed k7 docs
+
+    M ./doc/fftw3.texi -17 +10
+
+Sun Mar 16 14:19:10 EST 2003  stevenj
+  * [project @ 2003-03-16 19:19:10 by stevenj]
+  SGI compilers now support inline
+
+    M ./kernel/cycle.h -3 +3
+
+Sun Mar 16 14:18:32 EST 2003  stevenj
+  * [project @ 2003-03-16 19:18:32 by stevenj]
+  cruft
+
+    M ./kernel/cycle.h -2 +1
+
+Sun Mar 16 14:15:47 EST 2003  stevenj
+  * [project @ 2003-03-16 19:15:47 by stevenj]
+  texinfo doesn't like commas in nodes
+
+    M ./doc/fftw3.texi -16 +15
+
+Sun Mar 16 13:52:04 EST 2003  stevenj
+  * [project @ 2003-03-16 18:48:30 by stevenj]
+  updated
+
+    M ./ChangeLog -47 +1389
+    M ./README +28
+
+Sun Mar 16 13:47:44 EST 2003  stevenj
+  * [project @ 2003-03-16 18:47:44 by stevenj]
+  f77funcs.c -> f77funcs.h so that people don't try to compile it
+
+    A ./api/f77funcs.h
+    A ./threads/f77funcs.h
+    M ./api/f77api.c -4 +4
+    R ./api/f77funcs.c
+    M ./api/f77funcs.h +366
+    M ./threads/f77api.c -4 +4
+    R ./threads/f77funcs.c
+    M ./threads/f77funcs.h +39
+
+Sun Mar 16 13:46:11 EST 2003  stevenj
+  * [project @ 2003-03-16 18:46:11 by stevenj]
+  minor changes
+
+    M ./doc/FAQ/fftw-faq.bfnn -7 +4
+
+Sun Mar 16 13:39:24 EST 2003  stevenj
+  * [project @ 2003-03-16 18:39:24 by stevenj]
+  updated compiler bug list
+
+    M ./doc/FAQ/fftw-faq.bfnn -40 +20
+
+Sun Mar 16 13:39:07 EST 2003  stevenj
+  * [project @ 2003-03-16 18:39:07 by stevenj]
+  noted how to set CC
+
+    M ./doc/fftw3.texi -4 +10
+
+Sun Mar 16 13:01:01 EST 2003  stevenj
+  * [project @ 2003-03-16 18:01:01 by stevenj]
+  TODONE
+
+    M ./TODO -10
+
+Sun Mar 16 13:00:42 EST 2003  stevenj
+  * [project @ 2003-03-16 18:00:42 by stevenj]
+  yikes, bugfix
+
+    M ./threads/vrank-geq1-rdft2.c -19 +11
+
+Sun Mar 16 10:26:28 EST 2003  stevenj
+  * [project @ 2003-03-16 15:26:28 by stevenj]
+  whoops
+
+    M ./kernel/ifftw.h -2 +2
+
+Sun Mar 16 09:24:19 EST 2003  athena
+  * [project @ 2003-03-16 14:24:19 by athena]
+  Report SIMD extensions in version string
+
+    M ./api/version.c -2 +26
+
+Sat Mar 15 18:56:11 EST 2003  stevenj
+  * [project @ 2003-03-15 23:56:11 by stevenj]
+  more verbose output
+
+    M ./tests/bench.c +2
+
+Sat Mar 15 17:41:25 EST 2003  stevenj
+  * [project @ 2003-03-15 22:41:25 by stevenj]
+  a couple of additional non-Unix instructions
+
+    M ./doc/fftw3.texi -1 +9
+
+Sat Mar 15 17:15:26 EST 2003  stevenj
+  * [project @ 2003-03-15 22:15:26 by stevenj]
+  hyphen
+
+    M ./doc/FAQ/fftw-faq.bfnn -1 +1
+
+Sat Mar 15 17:12:29 EST 2003  stevenj
+  * [project @ 2003-03-15 22:12:29 by stevenj]
+  softened
+
+    M ./doc/FAQ/fftw-faq.bfnn -3 +3
+
+Sat Mar 15 17:09:44 EST 2003  stevenj
+  * [project @ 2003-03-15 22:09:44 by stevenj]
+  added FAQ, used PNGs
+
+    A ./doc/equation-dft.png
+    A ./doc/equation-dht.png
+    A ./doc/equation-idft.png
+    A ./doc/equation-redft00.png
+    A ./doc/equation-redft01.png
+    A ./doc/equation-redft10.png
+    A ./doc/equation-redft11.png
+    A ./doc/equation-rodft00.png
+    A ./doc/equation-rodft01.png
+    A ./doc/equation-rodft10.png
+    A ./doc/FAQ/
+    A ./doc/FAQ/Makefile.am
+    A ./doc/FAQ/bfnnconv.pl
+    A ./doc/FAQ/fftw-faq.bfnn
+    A ./doc/FAQ/m-ascii.pl
+    A ./doc/FAQ/m-html.pl
+    A ./doc/FAQ/m-info.pl
+    A ./doc/FAQ/m-lout.pl
+    A ./doc/FAQ/m-post.pl
+    A ./doc/equation-rodft11.png
+    M ./configure.ac +1
+    M ./doc/FAQ/Makefile.am +14
+    M ./doc/FAQ/bfnnconv.pl +298
+    M ./doc/FAQ/fftw-faq.bfnn +492
+    M ./doc/FAQ/m-ascii.pl +189
+    M ./doc/FAQ/m-html.pl +337
+    M ./doc/FAQ/m-info.pl +226
+    M ./doc/FAQ/m-lout.pl +242
+    M ./doc/FAQ/m-post.pl +189
+    M ./doc/Makefile.am -4 +6
+    R ./doc/equation-dft.gif
+    M ./doc/equation-dft.png
+    R ./doc/equation-dht.gif
+    M ./doc/equation-dht.png
+    R ./doc/equation-idft.gif
+    M ./doc/equation-idft.png
+    R ./doc/equation-redft00.gif
+    M ./doc/equation-redft00.png
+    R ./doc/equation-redft01.gif
+    M ./doc/equation-redft01.png
+    R ./doc/equation-redft10.gif
+    M ./doc/equation-redft10.png
+    R ./doc/equation-redft11.gif
+    M ./doc/equation-redft11.png
+    R ./doc/equation-rodft00.gif
+    M ./doc/equation-rodft00.png
+    R ./doc/equation-rodft01.gif
+    M ./doc/equation-rodft01.png
+    R ./doc/equation-rodft10.gif
+    M ./doc/equation-rodft10.png
+    R ./doc/equation-rodft11.gif
+    M ./doc/equation-rodft11.png
+
+Sat Mar 15 15:29:43 EST 2003  stevenj
+  * [project @ 2003-03-15 20:29:42 by stevenj]
+  great copyright update
+
+    M ./COPYRIGHT -2 +2
+    M ./TODO -6
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -2 +2
+    M ./api/configure.c -2 +2
+    M ./api/execute-dft-c2r.c -2 +2
+    M ./api/execute-dft-r2c.c -2 +2
+    M ./api/execute-dft.c -2 +2
+    M ./api/execute-r2r.c -2 +2
+    M ./api/execute.c -2 +2
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -2 +2
+    M ./api/extract-reim.c -2 +2
+    M ./api/f77api.c -2 +2
+    M ./api/f77funcs.c -2 +2
+    M ./api/fftw3.h -3 +3
+    M ./api/flops.c -2 +2
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./api/import-wisdom.c -2 +2
+    M ./api/map-r2r-kind.c -2 +2
+    M ./api/mapflags.c -2 +2
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.c -2 +2
+    M ./api/mktensor-rowmajor.c -2 +2
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -2 +2
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -2 +2
+    M ./api/plan-dft.c -2 +2
+    M ./api/plan-guru-dft-c2r.c -2 +2
+    M ./api/plan-guru-dft-r2c.c -2 +2
+    M ./api/plan-guru-dft.c -2 +2
+    M ./api/plan-guru-r2r.c -2 +2
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-dft.c -2 +2
+    M ./api/plan-many-r2r.c -2 +2
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -2 +2
+    M ./api/print-plan.c -2 +2
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -2 +2
+    M ./api/version.c -3 +3
+    M ./api/x77.h -2 +2
+    M ./dft/buffered.c -3 +3
+    M ./dft/codelet-dft.h -3 +3
+    M ./dft/codelets/n.c -2 +2
+    M ./dft/codelets/n.h -2 +2
+    M ./dft/codelets/t.c -2 +2
+    M ./dft/codelets/t.h -2 +2
+    M ./dft/conf.c -3 +3
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -3 +3
+    M ./dft/ct-ditf.c -3 +3
+    M ./dft/ct.c -3 +3
+    M ./dft/ct.h -3 +3
+    M ./dft/dft.h -3 +3
+    M ./dft/direct.c -3 +3
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect.c -3 +3
+    M ./dft/k7/k7.c -3 +3
+    M ./dft/kdft-dif.c -3 +3
+    M ./dft/kdft-difsq.c -3 +3
+    M ./dft/kdft-dit.c -3 +3
+    M ./dft/kdft.c -3 +3
+    M ./dft/nop.c -3 +3
+    M ./dft/plan.c -3 +3
+    M ./dft/problem.c -3 +3
+    M ./dft/rader-omega.c -2 +2
+    M ./dft/rader.c -2 +2
+    M ./dft/rank-geq2.c -3 +3
+    M ./dft/rank0.c -3 +3
+    M ./dft/simd/n1b.c -2 +2
+    M ./dft/simd/n1b.h -2 +2
+    M ./dft/simd/n1f.c -2 +2
+    M ./dft/simd/n1f.h -2 +2
+    M ./dft/simd/t1b.c -2 +2
+    M ./dft/simd/t1b.h -2 +2
+    M ./dft/simd/t1f.c -2 +2
+    M ./dft/simd/t1f.h -2 +2
+    M ./dft/solve.c -3 +3
+    M ./dft/vrank-geq1.c -3 +3
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -3 +3
+    M ./dft/zero.c -3 +3
+    M ./doc/f77_wisdom.f -2 +2
+    M ./doc/fftw3.texi -4 +4
+    M ./genfft/algsimp.ml -4 +4
+    M ./genfft/algsimp.mli -3 +3
+    M ./genfft/annotate.ml -4 +4
+    M ./genfft/annotate.mli -3 +3
+    M ./genfft/assoctable.ml -3 +3
+    M ./genfft/assoctable.mli -3 +3
+    M ./genfft/c.ml -3 +3
+    M ./genfft/c.mli -3 +3
+    M ./genfft/complex.ml -3 +3
+    M ./genfft/complex.mli -3 +3
+    M ./genfft/conv.ml -2 +2
+    M ./genfft/conv.mli -3 +3
+    M ./genfft/dag.ml -3 +3
+    M ./genfft/dag.mli -3 +3
+    M ./genfft/expr.ml -3 +3
+    M ./genfft/expr.mli -3 +3
+    M ./genfft/fft.ml -4 +4
+    M ./genfft/fft.mli -3 +3
+    M ./genfft/gen_athnotw.ml -4 +4
+    M ./genfft/gen_athtw.ml -4 +4
+    M ./genfft/gen_conv.ml -4 +4
+    M ./genfft/gen_hc2hc.ml -4 +4
+    M ./genfft/gen_hc2r.ml -4 +4
+    M ./genfft/gen_notw.ml -4 +4
+    M ./genfft/gen_notw_c.ml -4 +4
+    M ./genfft/gen_r2hc.ml -4 +4
+    M ./genfft/gen_trig.ml -4 +4
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twiddle_c.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./genfft/genutil.ml -3 +3
+    M ./genfft/littlesimp.ml -3 +3
+    M ./genfft/littlesimp.mli -3 +3
+    M ./genfft/magic.ml -3 +3
+    M ./genfft/monads.ml -3 +3
+    M ./genfft/number.ml -3 +3
+    M ./genfft/number.mli -3 +3
+    M ./genfft/oracle.ml -3 +3
+    M ./genfft/oracle.mli -3 +3
+    M ./genfft/schedule.ml -3 +3
+    M ./genfft/schedule.mli -3 +3
+    M ./genfft/simd.ml -3 +3
+    M ./genfft/simd.mli -3 +3
+    M ./genfft/simdmagic.ml -3 +3
+    M ./genfft/to_alist.ml -3 +3
+    M ./genfft/to_alist.mli -3 +3
+    M ./genfft/trig.ml -3 +3
+    M ./genfft/trig.mli -3 +3
+    M ./genfft/twiddle.ml -3 +3
+    M ./genfft/twiddle.mli -3 +3
+    M ./genfft/unique.ml -3 +3
+    M ./genfft/unique.mli -3 +3
+    M ./genfft/util.ml -3 +3
+    M ./genfft/util.mli -3 +3
+    M ./genfft/variable.ml -3 +3
+    M ./genfft/variable.mli -3 +3
+    M ./genfft-k7/algsimp.ml -4 +4
+    M ./genfft-k7/algsimp.mli -3 +3
+    M ./genfft-k7/assoctable.ml -3 +3
+    M ./genfft-k7/assoctable.mli -3 +3
+    M ./genfft-k7/expr.ml -3 +3
+    M ./genfft-k7/expr.mli -3 +3
+    M ./genfft-k7/fft.ml -4 +4
+    M ./genfft-k7/littlesimp.ml -3 +3
+    M ./genfft-k7/littlesimp.mli -3 +3
+    M ./genfft-k7/monads.ml -3 +3
+    M ./genfft-k7/number.ml -3 +3
+    M ./genfft-k7/number.mli -3 +3
+    M ./genfft-k7/oracle.ml -3 +3
+    M ./genfft-k7/oracle.mli -3 +3
+    M ./genfft-k7/to_alist.ml -3 +3
+    M ./genfft-k7/to_alist.mli -3 +3
+    M ./genfft-k7/twiddle.ml -3 +3
+    M ./genfft-k7/twiddle.mli -3 +3
+    M ./kernel/align.c -3 +3
+    M ./kernel/alloc.c -3 +3
+    M ./kernel/assert.c -3 +3
+    M ./kernel/awake.c -3 +3
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/ct.c -2 +2
+    M ./kernel/cycle.h -3 +3
+    M ./kernel/debug.c -3 +3
+    M ./kernel/hash.c -2 +2
+    M ./kernel/iabs.c -3 +3
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/md5-1.c -2 +2
+    M ./kernel/md5.c -2 +2
+    M ./kernel/minmax.c -3 +3
+    M ./kernel/ops.c -3 +3
+    M ./kernel/pickdim.c -3 +3
+    M ./kernel/plan.c -3 +3
+    M ./kernel/planner.c -2 +2
+    M ./kernel/primes.c -3 +3
+    M ./kernel/print.c -3 +3
+    M ./kernel/problem.c -3 +3
+    M ./kernel/rader.c -2 +2
+    M ./kernel/scan.c -3 +3
+    M ./kernel/solver.c -3 +3
+    M ./kernel/solvtab.c -3 +3
+    M ./kernel/square.c -3 +3
+    M ./kernel/stride.c -3 +3
+    M ./kernel/tensor.c -3 +3
+    M ./kernel/tensor1.c -3 +3
+    M ./kernel/tensor2.c -3 +3
+    M ./kernel/tensor4.c -3 +3
+    M ./kernel/tensor5.c -3 +3
+    M ./kernel/tensor7.c -3 +3
+    M ./kernel/tensor8.c -3 +3
+    M ./kernel/tensor9.c -3 +3
+    M ./kernel/timer.c -3 +3
+    M ./kernel/trig.c -3 +3
+    M ./kernel/trig1.c -3 +3
+    M ./kernel/twiddle.c -3 +3
+    M ./libbench/bench-main.c -2 +2
+    M ./libbench/bench-user.h -2 +2
+    M ./libbench/bench.h -2 +2
+    M ./libbench/can-do.c -2 +2
+    M ./libbench/getopt-utils.c -2 +2
+    M ./libbench/info.c -2 +2
+    M ./libbench/main.c -2 +2
+    M ./libbench/prime.c -2 +2
+    M ./libbench/problem.c -2 +2
+    M ./libbench/report.c -2 +2
+    M ./libbench/speed.c -2 +2
+    M ./libbench/timer.c -2 +2
+    M ./libbench/util.c -1 +1
+    M ./libbench/verify.c -2 +2
+    M ./libbench/zero.c -2 +2
+    M ./libbench2/aligned-main.c -2 +2
+    M ./libbench2/bench-main.c -2 +2
+    M ./libbench2/bench-user.h -2 +2
+    M ./libbench2/bench.h -2 +2
+    M ./libbench2/can-do.c -2 +2
+    M ./libbench2/dotens2.c -3 +3
+    M ./libbench2/getopt-utils.c -2 +2
+    M ./libbench2/info.c -2 +2
+    M ./libbench2/main.c -2 +2
+    M ./libbench2/problem.c -2 +2
+    M ./libbench2/report.c -2 +2
+    M ./libbench2/speed.c -2 +2
+    M ./libbench2/tensor.c -2 +2
+    M ./libbench2/timer.c -2 +2
+    M ./libbench2/useropt.c -2 +2
+    M ./libbench2/util.c -1 +1
+    M ./libbench2/verify-dft.c -3 +3
+    M ./libbench2/verify-lib.c -3 +3
+    M ./libbench2/verify-r2r.c -2 +2
+    M ./libbench2/verify-rdft2.c -3 +3
+    M ./libbench2/verify.c -2 +2
+    M ./libbench2/verify.h -2 +2
+    M ./libbench2/zero.c -2 +2
+    M ./rdft/buffered.c -3 +3
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/codelet-rdft.h -3 +3
+    M ./rdft/codelets/hb.h -2 +2
+    M ./rdft/codelets/hc2r.c -2 +2
+    M ./rdft/codelets/hc2r.h -2 +2
+    M ./rdft/codelets/hc2rIII.h -2 +2
+    M ./rdft/codelets/hf.h -2 +2
+    M ./rdft/codelets/hfb.c -2 +2
+    M ./rdft/codelets/r2hc.c -2 +2
+    M ./rdft/codelets/r2hc.h -2 +2
+    M ./rdft/codelets/r2hcII.h -2 +2
+    M ./rdft/conf.c -3 +3
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/dht-rader.c -2 +2
+    M ./rdft/direct.c -3 +3
+    M ./rdft/direct2.c -3 +3
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-buf.c -3 +3
+    M ./rdft/hc2hc-dif.c -3 +3
+    M ./rdft/hc2hc-dit.c -3 +3
+    M ./rdft/hc2hc.c -3 +3
+    M ./rdft/hc2hc.h -3 +3
+    M ./rdft/indirect.c -3 +3
+    M ./rdft/khc2hc-dif.c -3 +3
+    M ./rdft/khc2hc-dit.c -3 +3
+    M ./rdft/khc2r.c -3 +3
+    M ./rdft/kr2hc.c -3 +3
+    M ./rdft/nop.c -3 +3
+    M ./rdft/nop2.c -3 +3
+    M ./rdft/plan.c -3 +3
+    M ./rdft/plan2.c -3 +3
+    M ./rdft/problem.c -3 +3
+    M ./rdft/problem2.c -3 +3
+    M ./rdft/rader-hc2hc.c -2 +2
+    M ./rdft/rank-geq2-rdft2.c -3 +3
+    M ./rdft/rank-geq2.c -3 +3
+    M ./rdft/rank0-rdft2.c -3 +3
+    M ./rdft/rank0.c -3 +3
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft.h -3 +3
+    M ./rdft/rdft2-inplace-strides.c -3 +3
+    M ./rdft/rdft2-radix2.c -3 +3
+    M ./rdft/rdft2-tensor-max-index.c -3 +3
+    M ./rdft/solve.c -3 +3
+    M ./rdft/solve2.c -3 +3
+    M ./rdft/vrank-geq1-rdft2.c -3 +3
+    M ./rdft/vrank-geq1.c -3 +3
+    M ./rdft/vrank2-transpose.c -3 +3
+    M ./rdft/vrank3-transpose.c -3 +3
+    M ./reodft/conf.c -3 +3
+    M ./reodft/redft00e-r2hc.c -3 +3
+    M ./reodft/reodft.h -3 +3
+    M ./reodft/reodft010e-r2hc.c -3 +3
+    M ./reodft/reodft11e-r2hc-odd.c -3 +3
+    M ./reodft/reodft11e-r2hc.c -3 +3
+    M ./reodft/reodft11e-radix2.c -3 +3
+    M ./reodft/rodft00e-r2hc.c -3 +3
+    M ./simd/3dnow.c -3 +3
+    M ./simd/altivec.c -3 +3
+    M ./simd/simd-3dnow.h -2 +2
+    M ./simd/simd-altivec.h -2 +2
+    M ./simd/simd-sse.h -2 +2
+    M ./simd/simd-sse2.h -2 +2
+    M ./simd/simd.h -2 +2
+    M ./simd/sse-aux.c -3 +3
+    M ./simd/sse.c -3 +3
+    M ./simd/sse2-aux.c -3 +3
+    M ./simd/sse2.c -3 +3
+    M ./threads/api.c -2 +2
+    M ./threads/conf.c -3 +3
+    M ./threads/ct-dit.c -3 +3
+    M ./threads/dft-vrank-geq1.c -3 +3
+    M ./threads/f77api.c -2 +2
+    M ./threads/f77funcs.c -2 +2
+    M ./threads/hc2hc-dif.c -3 +3
+    M ./threads/hc2hc-dit.c -3 +3
+    M ./threads/rdft-vrank-geq1.c -3 +3
+    M ./threads/threads.c -2 +2
+    M ./threads/threads.h -3 +3
+    M ./threads/vrank-geq1-rdft2.c -3 +3
+    M ./tools/fftw-wisdom-to-conf.1 -4 +4
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+    M ./tools/fftw-wisdom.c -2 +2
+    M ./tools/fftw_wisdom.1.in -4 +4
+
+Sat Mar 15 15:14:02 EST 2003  stevenj
+  * [project @ 2003-03-15 20:14:02 by stevenj]
+  threads in make check
+
+    M ./TODO -4
+    M ./tests/Makefile.am -1 +5
+    M ./tests/check.pl +3
+
+Sat Mar 15 15:11:24 EST 2003  stevenj
+  * [project @ 2003-03-15 20:11:24 by stevenj]
+  fixed const warnings
+
+    M ./threads/ct-dit.c -2 +2
+    M ./threads/hc2hc-dif.c -2 +2
+    M ./threads/hc2hc-dit.c -2 +2
+
+Sat Mar 15 15:08:25 EST 2003  stevenj
+  * [project @ 2003-03-15 20:08:25 by stevenj]
+  make sure spawn_loop size > 1 (it has to be at least > 0 lest we crash, but > 1 is an optimization)
+
+    M ./threads/ct-dit.c -1 +2
+    M ./threads/hc2hc-dif.c -1 +2
+    M ./threads/hc2hc-dit.c -1 +2
+
+Sat Mar 15 14:00:17 EST 2003  athena
+  * [project @ 2003-03-15 19:00:17 by athena]
+  hpux seems to want machine/sys/inline.h as opposed to
+  machine/inline.h.
+
+    M ./kernel/cycle.h -2 +2
+
+Sat Mar 15 13:36:56 EST 2003  stevenj
+  * [project @ 2003-03-15 18:36:56 by stevenj]
+  Sourceforge is really SourceForge.net, and is run by VA
+
+    M ./doc/fftw3.texi -3 +4
+
+Sat Mar 15 13:34:05 EST 2003  stevenj
+  * [project @ 2003-03-15 18:33:07 by stevenj]
+  comma
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar 15 13:31:42 EST 2003  stevenj
+  * [project @ 2003-03-15 18:31:42 by stevenj]
+  fixed AMD company name
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar 15 13:29:41 EST 2003  stevenj
+  * [project @ 2003-03-15 18:29:41 by stevenj]
+  minor changes
+
+    M ./doc/fftw3.texi -19 +19
+
+Sat Mar 15 13:13:55 EST 2003  stevenj
+  * [project @ 2003-03-15 18:13:55 by stevenj]
+  more emitter->read_char renaming
+
+    M ./api/f77api.c -5 +5
+    M ./api/f77funcs.c -4 +4
+
+Sat Mar 15 13:08:45 EST 2003  stevenj
+  * [project @ 2003-03-15 18:08:45 by stevenj]
+  more wisdom docs, noted wisdom utilities
+
+    M ./doc/fftw3.texi -35 +95
+
+Sat Mar 15 11:41:32 EST 2003  stevenj
+  * [project @ 2003-03-15 16:41:32 by stevenj]
+  compound adjectives are hyphenated
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar 15 11:40:30 EST 2003  stevenj
+  * [project @ 2003-03-15 16:40:30 by stevenj]
+  fftw does support another type of packed array via r2r
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar 15 11:29:12 EST 2003  stevenj
+  * [project @ 2003-03-15 16:29:12 by stevenj]
+  write_char/read_char for export/import functions
+
+    M ./api/export-wisdom.c -4 +4
+    M ./api/f77api.c -5 +5
+    M ./api/f77funcs.c -4 +4
+    M ./api/fftw3.h -3 +3
+    M ./api/import-wisdom.c -4 +4
+    M ./doc/f77_wisdom.f -6 +6
+    M ./doc/fftw3.texi -5 +5
+
+Sat Mar 15 11:19:19 EST 2003  stevenj
+  * [project @ 2003-03-15 16:19:19 by stevenj]
+  comments
+
+    M ./threads/threads.c -4 +8
+
+Sat Mar 15 10:08:26 EST 2003  athena
+  * [project @ 2003-03-15 15:08:26 by athena]
+  Enabled randomized-cse
+
+    M ./support/Makefile.codelets -1 +1
+
+Sat Mar 15 09:47:49 EST 2003  athena
+  * [project @ 2003-03-15 14:47:49 by athena]
+  Changed to 3.0-beta1
+
+    M ./configure.ac -1 +1
+
+Sat Mar 15 09:07:31 EST 2003  athena
+  * [project @ 2003-03-15 14:07:31 by athena]
+  First complete draft
+
+    M ./doc/fftw3.texi -18 +131
+
+Sat Mar 15 08:37:52 EST 2003  athena
+  * [project @ 2003-03-15 13:37:52 by athena]
+  EMITTER is a misnomer
+
+    M ./api/fftw3.h -2 +2
+    M ./api/import-wisdom.c -8 +6
+
+Sat Mar 15 05:50:50 EST 2003  athena
+  * [project @ 2003-03-15 10:50:50 by athena]
+  Revision, wisdom tutorial, acks.
+
+    M ./doc/fftw3.texi -44 +219
+
+Fri Mar 14 22:59:04 EST 2003  stevenj
+  * [project @ 2003-03-15 03:59:04 by stevenj]
+  noted OpenMP
+
+    M ./NEWS +2
+
+Fri Mar 14 22:38:49 EST 2003  stevenj
+  * [project @ 2003-03-15 03:38:49 by stevenj]
+  comment
+
+    M ./threads/threads.c -1 +1
+
+Fri Mar 14 22:38:30 EST 2003  stevenj
+  * [project @ 2003-03-15 03:38:30 by stevenj]
+  comments
+
+    M ./threads/threads.c -2 +2
+
+Fri Mar 14 22:38:05 EST 2003  stevenj
+  * [project @ 2003-03-15 03:38:05 by stevenj]
+  reformatting
+
+    M ./threads/threads.c -3 +1
+
+Fri Mar 14 22:26:28 EST 2003  stevenj
+  * [project @ 2003-03-15 03:26:28 by stevenj]
+  whoops
+
+    M ./threads/threads.c -2 +2
+
+Fri Mar 14 22:11:23 EST 2003  stevenj
+  * [project @ 2003-03-15 03:11:23 by stevenj]
+  some threads fixes, and added experimental semaphore (pre-thread-spawning) and Linux spinlock support
+
+    M ./tests/bench.c -1 +10
+    M ./threads/api.c +1
+    M ./threads/threads.c -8 +159
+    M ./threads/threads.h -1 +2
+
+Fri Mar 14 20:50:46 EST 2003  stevenj
+  * [project @ 2003-03-15 01:50:46 by stevenj]
+  whoops
+
+    M ./threads/f77funcs.c -2 +2
+
+Fri Mar 14 18:23:03 EST 2003  stevenj
+  * [project @ 2003-03-14 23:23:03 by stevenj]
+  added note that FFTW_PATIENT will disable threads if they are not beneficial
+
+    M ./doc/fftw3.texi -2 +5
+
+Fri Mar 14 18:20:44 EST 2003  stevenj
+  * [project @ 2003-03-14 23:20:44 by stevenj]
+  made fftw_cleanup* more restrictive, in that we don't want to
+  guarantee that previously created plans will still work (they won't,
+  in the case of threaded plans and fftw_cleanup_threads), and there is
+  no reason to provide such a guarantee anyway.
+
+    M ./doc/fftw3.texi -6 +14
+
+Fri Mar 14 17:23:13 EST 2003  athena
+  * [project @ 2003-03-14 22:23:13 by athena]
+  Moved version.c from kernel/ into api/
+
+    A ./api/version.c
+    M ./api/Makefile.am -1 +2
+    M ./api/version.c +28
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -4 +1
+    R ./kernel/version.c
+
+Fri Mar 14 17:19:50 EST 2003  athena
+  * [project @ 2003-03-14 22:19:50 by athena]
+  icc-7.0 requires -openmp
+
+    M ./configure.ac +5
+
+Fri Mar 14 14:47:52 EST 2003  athena
+  * [project @ 2003-03-14 19:47:52 by athena]
+  Ensure that one can do make dist given the distribution
+
+    M ./doc/Makefile.am -5 +5
+
+Fri Mar 14 14:38:11 EST 2003  athena
+  * [project @ 2003-03-14 19:38:11 by athena]
+  Dist fftw3.pdf, not fftw.pdf
+
+    M ./doc/Makefile.am -1 +2
+
+Fri Mar 14 14:36:25 EST 2003  athena
+  * [project @ 2003-03-14 19:36:25 by athena]
+  Support -onthreads=%d
+
+    M ./tests/bench.c -4 +8
+
+Fri Mar 14 14:34:21 EST 2003  stevenj
+  * [project @ 2003-03-14 19:34:21 by stevenj]
+  comment
+
+    M ./kernel/alloc.c -2 +2
+
+Fri Mar 14 14:33:27 EST 2003  stevenj
+  * [project @ 2003-03-14 19:33:27 by stevenj]
+  whoops
+
+    M ./threads/Makefile.am -1 +3
+
+Fri Mar 14 12:32:18 EST 2003  stevenj
+  * [project @ 2003-03-14 17:32:18 by stevenj]
+  fftw_real is gone
+
+    M ./doc/rfftwnd.fig -1 +1
+
+Fri Mar 14 12:26:04 EST 2003  stevenj
+  * [project @ 2003-03-14 17:26:04 by stevenj]
+  typos
+
+    M ./doc/fftw3.texi -3 +3
+
+Fri Mar 14 06:21:43 EST 2003  athena
+  * [project @ 2003-03-14 11:21:43 by athena]
+  More BENCH_DOC strings
+
+    M ./api/fftw3.h -3 +8
+    M ./tests/bench.c -4 +11
+
+Fri Mar 14 05:58:53 EST 2003  athena
+  * [project @ 2003-03-14 10:58:53 by athena]
+  Fixed xref's
+
+    M ./doc/fftw3.texi -15 +15
+
+Fri Mar 14 05:38:26 EST 2003  athena
+  * [project @ 2003-03-14 10:38:26 by athena]
+  Revised manual (esp. intro and tutorial), fixed texinfo hackery
+  for figures.
+
+    M ./doc/Makefile.am -8 +13
+    M ./doc/fftw3.texi -165 +280
+    R ./doc/rfftwnd.gif
+
+Wed Mar 12 02:42:33 EST 2003  stevenj
+  * [project @ 2003-03-12 07:42:33 by stevenj]
+  redirect users from guru execute to advanced interface, if possible
+
+    M ./doc/fftw3.texi -1 +5
+
+Wed Mar 12 02:35:22 EST 2003  stevenj
+  * [project @ 2003-03-12 07:35:22 by stevenj]
+  punctuation
+
+    M ./doc/fftw3.texi -2 +2
+
+Wed Mar 12 02:28:51 EST 2003  stevenj
+  * [project @ 2003-03-12 07:28:51 by stevenj]
+  use correct heading level
+
+    M ./doc/fftw3.texi -7 +7
+
+Wed Mar 12 02:24:37 EST 2003  stevenj
+  * [project @ 2003-03-12 07:24:37 by stevenj]
+  html generation
+
+    M ./doc/Makefile.am +15
+    M ./doc/fftw3.texi -91 +90
+
+Wed Mar 12 01:44:00 EST 2003  stevenj
+  * [project @ 2003-03-12 06:44:00 by stevenj]
+  added equation GIFs
+
+    A ./doc/equation-dft.gif
+    A ./doc/equation-dht.gif
+    A ./doc/equation-idft.gif
+    A ./doc/equation-redft00.gif
+    A ./doc/equation-redft01.gif
+    A ./doc/equation-redft10.gif
+    A ./doc/equation-redft11.gif
+    A ./doc/equation-rodft00.gif
+    A ./doc/equation-rodft01.gif
+    A ./doc/equation-rodft10.gif
+    A ./doc/equation-rodft11.gif
+    M ./doc/equation-dft.gif
+    M ./doc/equation-dht.gif
+    M ./doc/equation-idft.gif
+    M ./doc/equation-redft00.gif
+    M ./doc/equation-redft01.gif
+    M ./doc/equation-redft10.gif
+    M ./doc/equation-redft11.gif
+    M ./doc/equation-rodft00.gif
+    M ./doc/equation-rodft01.gif
+    M ./doc/equation-rodft10.gif
+    M ./doc/equation-rodft11.gif
+
+Wed Mar 12 01:43:27 EST 2003  stevenj
+  * [project @ 2003-03-12 06:43:27 by stevenj]
+  punctuation
+
+    M ./doc/fftw3.texi -18 +18
+
+Wed Mar 12 01:26:46 EST 2003  stevenj
+  * [project @ 2003-03-12 06:26:46 by stevenj]
+  punctuation
+
+    M ./doc/fftw3.texi -14 +14
+
+Wed Mar 12 01:25:12 EST 2003  stevenj
+  * [project @ 2003-03-12 06:25:12 by stevenj]
+  added multi-dimensional transform definitions
+
+    M ./doc/fftw3.texi -3 +124
+
+Wed Mar 12 00:14:03 EST 2003  stevenj
+  * [project @ 2003-03-12 05:14:03 by stevenj]
+  slight changes
+
+    M ./doc/fftw3.texi -3 +5
+
+Wed Mar 12 00:06:34 EST 2003  stevenj
+  * [project @ 2003-03-12 05:06:34 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Tue Mar 11 23:50:43 EST 2003  stevenj
+  * [project @ 2003-03-12 04:50:43 by stevenj]
+  added 1d version of What FFTW Really Computes
+
+    M ./doc/fftw3.texi -1 +432
+
+Tue Mar 11 21:17:54 EST 2003  stevenj
+  * [project @ 2003-03-12 02:17:54 by stevenj]
+  note in upgrading section about FFTW_PATIENT
+
+    M ./doc/fftw3.texi -1 +7
+
+Tue Mar 11 15:18:39 EST 2003  stevenj
+  * [project @ 2003-03-11 20:18:39 by stevenj]
+  added cycle-counter section
+
+    M ./doc/fftw3.texi -1 +32
+
+Tue Mar 11 14:53:44 EST 2003  stevenj
+  * [project @ 2003-03-11 19:53:44 by stevenj]
+  more ideas
+
+    M ./TODO +7
+
+Mon Mar 10 17:41:35 EST 2003  stevenj
+  * [project @ 2003-03-10 22:41:35 by stevenj]
+  noted that indirect should probably be merged with rank-geq2, to make a rank-split solver
+
+    M ./dft/indirect.c -1 +4
+    M ./rdft/indirect.c -1 +4
+
+Fri Mar  7 03:01:52 EST 2003  stevenj
+  * [project @ 2003-03-07 08:01:52 by stevenj]
+  added non-Unix installation instructions
+
+    M ./doc/fftw3.texi -1 +38
+
+Fri Mar  7 02:30:59 EST 2003  stevenj
+  * [project @ 2003-03-07 07:30:59 by stevenj]
+  also talk about stack alignment with SSE/SSE2
+
+    M ./doc/fftw3.texi -1 +7
+
+Fri Mar  7 02:24:07 EST 2003  stevenj
+  * [project @ 2003-03-07 07:24:07 by stevenj]
+  made warning more dire
+
+    M ./doc/fftw3.texi -3 +4
+
+Fri Mar  7 02:13:25 EST 2003  stevenj
+  * [project @ 2003-03-07 07:13:25 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Mar  7 02:09:55 EST 2003  stevenj
+  * [project @ 2003-03-07 07:09:55 by stevenj]
+  number
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Mar  7 02:09:08 EST 2003  stevenj
+  * [project @ 2003-03-07 07:09:08 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -3 +3
+
+Fri Mar  7 02:08:01 EST 2003  stevenj
+  * [project @ 2003-03-07 07:08:01 by stevenj]
+  minor
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Mar  7 02:04:45 EST 2003  stevenj
+  * [project @ 2003-03-07 07:04:45 by stevenj]
+  minor fix
+
+    M ./doc/fftw3.texi -3 +3
+
+Fri Mar  7 01:58:15 EST 2003  stevenj
+  * [project @ 2003-03-07 06:58:15 by stevenj]
+  cross-ref
+
+    M ./doc/fftw3.texi -2 +3
+
+Fri Mar  7 01:57:31 EST 2003  stevenj
+  * [project @ 2003-03-07 06:57:31 by stevenj]
+  minor
+
+    M ./doc/fftw3.texi -7 +8
+
+Fri Mar  7 01:53:28 EST 2003  stevenj
+  * [project @ 2003-03-07 06:53:28 by stevenj]
+  more installation manual
+
+    M ./doc/fftw3.texi -7 +151
+
+Fri Mar  7 00:43:40 EST 2003  stevenj
+  * [project @ 2003-03-07 05:43:40 by stevenj]
+  GNU-lly correct
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Mar  7 00:38:48 EST 2003  stevenj
+  * [project @ 2003-03-07 05:38:48 by stevenj]
+  started installation section
+
+    M ./doc/fftw3.texi -3 +33
+
+Fri Mar  7 00:25:02 EST 2003  stevenj
+  * [project @ 2003-03-07 05:25:02 by stevenj]
+  added --without-cycle-counter option as a last resort
+
+    M ./configure.ac +5
+    M ./kernel/timer.c -11 +24
+
+Fri Mar  7 00:07:12 EST 2003  stevenj
+  * [project @ 2003-03-07 05:07:12 by stevenj]
+  macros with () arguments were only standardized in C99, and we don't need them anyway
+
+    M ./kernel/cycle.h -3 +3
+
+Thu Mar  6 23:10:41 EST 2003  stevenj
+  * [project @ 2003-03-07 04:10:41 by stevenj]
+  wording
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Mar  6 23:03:03 EST 2003  stevenj
+  * [project @ 2003-03-07 04:03:03 by stevenj]
+  parallelism
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Mar  6 23:01:47 EST 2003  stevenj
+  * [project @ 2003-03-07 04:01:47 by stevenj]
+  additions to upgrading chapter
+
+    M ./doc/fftw3.texi -1 +31
+
+Thu Mar  6 22:39:36 EST 2003  stevenj
+  * [project @ 2003-03-07 03:39:36 by stevenj]
+  noted additional humility of FFTW 3 wisdom
+
+    M ./doc/fftw3.texi -1 +9
+
+Thu Mar  6 22:32:44 EST 2003  stevenj
+  * [project @ 2003-03-07 03:32:44 by stevenj]
+  renaming
+
+    M ./doc/fftw3.texi -6 +6
+
+Thu Mar  6 22:31:00 EST 2003  stevenj
+  * [project @ 2003-03-07 03:31:00 by stevenj]
+  added placeholder for wisdom reference
+
+    M ./doc/fftw3.texi -3 +10
+
+Thu Mar  6 22:29:38 EST 2003  stevenj
+  * [project @ 2003-03-07 03:29:38 by stevenj]
+  wrote upgrading chapter
+
+    M ./doc/fftw3.texi -1 +139
+
+Thu Mar  6 18:01:10 EST 2003  stevenj
+  * [project @ 2003-03-06 23:01:10 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -6 +6
+
+Thu Mar  6 18:00:43 EST 2003  stevenj
+  * [project @ 2003-03-06 23:00:43 by stevenj]
+  placeholder for upgrade chapter
+
+    M ./doc/fftw3.texi -3 +18
+
+Thu Mar  6 13:47:49 EST 2003  stevenj
+  * [project @ 2003-03-06 18:47:49 by stevenj]
+  whoops
+
+    M ./tools/fftw-wisdom.c -2 +2
+
+Thu Mar  6 13:36:38 EST 2003  stevenj
+  * [project @ 2003-03-06 18:36:38 by stevenj]
+  strengthed warning about time
+
+    M ./tools/fftw_wisdom.1.in -1 +1
+
+Thu Mar  6 13:35:42 EST 2003  stevenj
+  * [project @ 2003-03-06 18:35:42 by stevenj]
+  noted -t in example
+
+    M ./tools/fftw_wisdom.1.in -1 +3
+
+Thu Mar  6 13:21:03 EST 2003  stevenj
+  * [project @ 2003-03-06 18:21:03 by stevenj]
+  pay attention to WINDOWS_F77_MANGLING
+
+    M ./threads/f77api.c -3 +26
+
+Thu Mar  6 02:52:30 EST 2003  stevenj
+  * [project @ 2003-03-06 07:52:30 by stevenj]
+  punctuation
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Mar  6 02:51:02 EST 2003  stevenj
+  * [project @ 2003-03-06 07:51:02 by stevenj]
+  index
+
+    M ./doc/fftw3.texi -1 +3
+
+Thu Mar  6 02:50:38 EST 2003  stevenj
+  * [project @ 2003-03-06 07:50:38 by stevenj]
+  documented C++ <complex> usage
+
+    M ./doc/fftw3.texi -3 +18
+
+Thu Mar  6 02:25:32 EST 2003  stevenj
+  * [project @ 2003-03-06 07:25:32 by stevenj]
+  got rid of overfull hbox TeX warnings
+
+    M ./doc/fftw3.texi -6 +6
+
+Thu Mar  6 02:20:38 EST 2003  stevenj
+  * [project @ 2003-03-06 07:20:38 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Mar  6 02:20:13 EST 2003  stevenj
+  * [project @ 2003-03-06 07:20:13 by stevenj]
+  noted fftw_iodim split for Fortran guru interface
+
+    M ./doc/fftw3.texi -5 +16
+
+Thu Mar  6 02:14:21 EST 2003  stevenj
+  * [project @ 2003-03-06 07:14:21 by stevenj]
+  added guru reference
+
+    M ./doc/fftw3.texi -3 +187
+
+Wed Mar  5 22:56:05 EST 2003  stevenj
+  * [project @ 2003-03-06 03:56:05 by stevenj]
+  minor
+
+    M ./doc/fftw3.texi -3 +3
+
+Wed Mar  5 22:45:31 EST 2003  stevenj
+  * [project @ 2003-03-06 03:45:31 by stevenj]
+  use @r{...} for comment text in code examples
+
+    M ./doc/fftw3.texi -3 +3
+
+Wed Mar  5 13:14:04 EST 2003  stevenj
+  * [project @ 2003-03-05 18:14:04 by stevenj]
+  eliminate warning
+
+    M ./simd/sse.c -1 +2
+
+Wed Mar  5 13:12:56 EST 2003  stevenj
+  * [project @ 2003-03-05 18:12:56 by stevenj]
+  SIMD_CFLAGS only for simd code
+
+    M ./configure.ac -5 +8
+    M ./dft/simd/Makefile.am -1 +1
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./kernel/align.c -12 +9
+    M ./simd/Makefile.am +1
+
+Wed Mar  5 11:06:41 EST 2003  athena
+  * [project @ 2003-03-05 16:06:41 by athena]
+  Minor changes.
+
+    M ./doc/fftw3.texi -89 +96
+
+Wed Mar  5 02:13:34 EST 2003  stevenj
+  * [project @ 2003-03-05 07:13:34 by stevenj]
+  cross-compiling with MinGW can't detect f77 mangling, so add an option to use what seems to be the most common styles
+
+    M ./api/f77api.c -3 +34
+    M ./configure.ac +5
+
+Tue Mar  4 20:00:31 EST 2003  stevenj
+  * [project @ 2003-03-05 01:00:31 by stevenj]
+  comment
+
+    M ./libbench2/util.c -2 +3
+
+Tue Mar  4 20:00:13 EST 2003  stevenj
+  * [project @ 2003-03-05 01:00:13 by stevenj]
+  we only use our-malloc-16 on machines where size_t == uintptr_t, so don't bother doing the right thing with the benchmark
+
+    M ./libbench2/util.c -1 +1
+
+Tue Mar  4 19:46:09 EST 2003  stevenj
+  * [project @ 2003-03-05 00:46:09 by stevenj]
+  support WITH_OUR_MALLOC16
+
+    M ./libbench2/util.c -2 +19
+
+Tue Mar  4 18:50:53 EST 2003  fftw
+  * [project @ 2003-03-04 23:50:53 by fftw]
+  automatically add -msse etcetera for --enable-sse etcetera
+
+    M ./configure.ac +22
+
+Tue Mar  4 18:24:26 EST 2003  fftw
+  * [project @ 2003-03-04 23:24:26 by fftw]
+  got rid of const warning
+
+    M ./tools/fftw-wisdom.c -2 +2
+
+Tue Mar  4 18:22:48 EST 2003  fftw
+  * [project @ 2003-03-04 23:22:48 by fftw]
+  missing header
+
+    M ./libbench2/problem.c -1 +2
+
+Tue Mar  4 15:55:47 EST 2003  stevenj
+  * [project @ 2003-03-04 20:55:47 by stevenj]
+  fixes
+
+    M ./doc/fftw3.texi -15 +49
+
+Tue Mar  4 15:53:26 EST 2003  stevenj
+  * [project @ 2003-03-04 20:53:26 by stevenj]
+  whoops
+
+    M ./api/import-system-wisdom.c -1 +1
+
+Tue Mar  4 02:22:14 EST 2003  stevenj
+  * [project @ 2003-03-04 07:22:14 by stevenj]
+  started guru reference
+
+    M ./doc/fftw3.texi -1 +110
+
+Tue Mar  4 01:44:09 EST 2003  stevenj
+  * [project @ 2003-03-04 06:44:09 by stevenj]
+  use same FFTW_IODIM between precisions
+
+    M ./api/fftw3.h -6 +8
+
+Tue Mar  4 00:25:57 EST 2003  stevenj
+  * [project @ 2003-03-04 05:25:57 by stevenj]
+  renamed section
+
+    M ./doc/fftw3.texi -5 +5
+
+Tue Mar  4 00:21:49 EST 2003  stevenj
+  * [project @ 2003-03-04 05:21:49 by stevenj]
+  no need for "advanced" in subheadings
+
+    M ./doc/fftw3.texi -4 +4
+
+Tue Mar  4 00:20:05 EST 2003  stevenj
+  * [project @ 2003-03-04 05:20:05 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Tue Mar  4 00:17:23 EST 2003  stevenj
+  * [project @ 2003-03-04 05:17:23 by stevenj]
+  finished advanced interface
+
+    M ./doc/fftw3.texi -1 +34
+
+Mon Mar  3 23:26:12 EST 2003  stevenj
+  * [project @ 2003-03-04 04:26:12 by stevenj]
+  more advance interface docs
+
+    M ./doc/fftw3.texi -1 +46
+
+Mon Mar  3 23:12:09 EST 2003  stevenj
+  * [project @ 2003-03-04 04:12:09 by stevenj]
+  fail for win32
+
+    M ./api/import-system-wisdom.c -2 +2
+
+Mon Mar  3 17:18:48 EST 2003  fftw
+  * [project @ 2003-03-03 22:18:48 by fftw]
+  shortened help string
+
+    M ./configure.ac -1 +1
+
+Mon Mar  3 17:16:17 EST 2003  fftw
+  * [project @ 2003-03-03 22:16:17 by fftw]
+  fixed cross-refs
+
+    M ./doc/fftw3.texi -3 +3
+
+Mon Mar  3 17:07:27 EST 2003  fftw
+  * [project @ 2003-03-03 22:07:27 by fftw]
+  FFTW_POSSIBLY_UNALIGNED -> simpler FFTW_UNALIGNED in API, added bench option
+
+    M ./api/fftw3.h -2 +2
+    M ./api/mapflags.c -1 +1
+    M ./doc/fftw3.texi -3 +3
+    M ./tests/bench.c +1
+
+Mon Mar  3 16:58:07 EST 2003  fftw
+  * [project @ 2003-03-03 21:58:07 by fftw]
+  whoops
+
+    M ./kernel/alloc.c -3 +3
+
+Mon Mar  3 16:52:58 EST 2003  fftw
+  * [project @ 2003-03-03 21:52:58 by fftw]
+  noted assumption
+
+    M ./kernel/alloc.c -2 +2
+
+Mon Mar  3 16:50:33 EST 2003  fftw
+  * [project @ 2003-03-03 21:50:33 by fftw]
+  provide our own malloc16 routine because of Windows lossage
+
+    M ./configure.ac +5
+    M ./kernel/alloc.c -3 +35
+
+Mon Mar  3 13:28:12 EST 2003  stevenj
+  * [project @ 2003-03-03 18:28:12 by stevenj]
+  capitalization
+
+    M ./doc/fftw3.texi -6 +6
+
+Mon Mar  3 13:26:32 EST 2003  stevenj
+  * [project @ 2003-03-03 18:26:32 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Mar  3 12:55:57 EST 2003  stevenj
+  * [project @ 2003-03-03 17:55:57 by stevenj]
+  vertical skip looks better than indenting for setting off short paragraphs
+
+    M ./doc/fftw3.texi -1 +6
+
+Mon Mar  3 06:34:09 EST 2003  athena
+  * [project @ 2003-03-03 11:34:09 by athena]
+  Removed franz-mode.  Automake was distributing franz files
+  whether franz mode was enabled or not.
+
+    M ./configure.ac -3
+    M ./dft/simd/codelets/Makefile.am -7 +2
+
+Mon Mar  3 01:44:00 EST 2003  stevenj
+  * [project @ 2003-03-03 06:44:00 by stevenj]
+  made output boundary conditions more prominent; they are important,
+  because they make the different transform types inequivalent in
+  parity
+
+    M ./doc/fftw3.texi -4 +8
+
+Mon Mar  3 01:17:28 EST 2003  stevenj
+  * [project @ 2003-03-03 06:17:28 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Mar  3 01:17:07 EST 2003  stevenj
+  * [project @ 2003-03-03 06:17:07 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Mar  3 01:10:28 EST 2003  stevenj
+  * [project @ 2003-03-03 06:10:28 by stevenj]
+  started advanced reference
+
+    M ./doc/fftw3.texi -6 +51
+
+Mon Mar  3 00:52:02 EST 2003  stevenj
+  * [project @ 2003-03-03 05:52:02 by stevenj]
+  r2r reference
+
+    M ./doc/fftw3.texi -2 +209
+
+Sun Mar  2 23:51:21 EST 2003  stevenj
+  * [project @ 2003-03-03 04:51:21 by stevenj]
+  workaround for info formatting bug
+
+    M ./doc/fftw3.texi -5 +4
+
+Sun Mar  2 23:47:19 EST 2003  stevenj
+  * [project @ 2003-03-03 04:47:19 by stevenj]
+  noted lack of fftw_malloc in Fortran
+
+    M ./doc/fftw3.texi -3 +9
+
+Sun Mar  2 23:42:52 EST 2003  stevenj
+  * [project @ 2003-03-03 04:42:32 by stevenj]
+  parallelism
+
+    M ./doc/fftw3.texi -4 +4
+
+Sun Mar  2 23:39:54 EST 2003  stevenj
+  * [project @ 2003-03-03 04:39:05 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -7 +7
+
+Sun Mar  2 23:33:02 EST 2003  stevenj
+  * [project @ 2003-03-03 04:33:02 by stevenj]
+  r2c/c2r reference
+
+    M ./doc/fftw3.texi -40 +248
+
+Sun Mar  2 22:44:10 EST 2003  stevenj
+  * [project @ 2003-03-03 03:44:10 by stevenj]
+  table of contents was being included twice
+
+    M ./doc/fftw3.texi -2 +1
+
+Sun Mar  2 22:42:29 EST 2003  stevenj
+  * [project @ 2003-03-03 03:42:29 by stevenj]
+  minor changes
+
+    M ./doc/fftw3.texi -15 +16
+
+Sun Mar  2 21:54:13 EST 2003  stevenj
+  * [project @ 2003-03-03 02:54:13 by stevenj]
+  started reference section
+
+    M ./doc/fftw3.texi -2 +238
+
+Sun Mar  2 19:10:02 EST 2003  stevenj
+  * [project @ 2003-03-03 00:10:02 by stevenj]
+  whoops
+
+    M ./doc/Makefile.am -2 +2
+
+Sun Mar  2 19:03:23 EST 2003  stevenj
+  * [project @ 2003-03-03 00:03:23 by stevenj]
+  started ref. section
+
+    M ./doc/fftw3.texi -2 +146
+
+Sun Mar  2 18:50:58 EST 2003  stevenj
+  * [project @ 2003-03-02 23:50:58 by stevenj]
+  fftw_flops takes const plan
+
+    M ./api/fftw3.h -2 +2
+    M ./api/flops.c -1 +1
+
+Sun Mar  2 15:54:14 EST 2003  stevenj
+  * [project @ 2003-03-02 20:54:14 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Sun Mar  2 15:52:41 EST 2003  stevenj
+  * [project @ 2003-03-02 20:52:41 by stevenj]
+  added "Wisdom of Fortran?" section
+
+    M ./doc/fftw3.texi -1 +39
+
+Sun Mar  2 15:50:37 EST 2003  stevenj
+  * [project @ 2003-03-02 20:50:37 by stevenj]
+  typo
+
+    M ./doc/f77_wisdom.f -2 +2
+
+Sun Mar  2 15:49:57 EST 2003  stevenj
+  * [project @ 2003-03-02 20:49:57 by stevenj]
+  wording
+
+    M ./doc/f77_wisdom.f -1 +1
+
+Sun Mar  2 15:46:13 EST 2003  stevenj
+  * [project @ 2003-03-02 20:46:13 by stevenj]
+  added comments
+
+    M ./doc/f77_wisdom.f +25
+
+Sun Mar  2 15:44:01 EST 2003  stevenj
+  * [project @ 2003-03-02 20:44:01 by stevenj]
+  added example file
+
+    A ./doc/f77_wisdom.f
+
+Sun Mar  2 15:37:32 EST 2003  stevenj
+  * [project @ 2003-03-02 20:37:32 by stevenj]
+  don't print out READ WISDOM unless we have
+
+    M ./tests/bench.c -6 +11
+
+Sun Mar  2 15:36:28 EST 2003  stevenj
+  * [project @ 2003-03-02 20:36:28 by stevenj]
+  EOF is not a space
+
+    M ./kernel/scan.c -2 +2
+
+Sun Mar  2 09:14:37 EST 2003  athena
+  * [project @ 2003-03-02 14:14:37 by athena]
+  Turn on inline by default
+
+    M ./kernel/ifftw.h -2 +3
+
+Sun Mar  2 07:11:56 EST 2003  athena
+  * [project @ 2003-03-02 12:11:56 by athena]
+  Optionally inline loop in notw codelets
+
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_notw_c.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./kernel/ifftw.h -1 +3
+
+Sun Mar  2 01:37:41 EST 2003  stevenj
+  * [project @ 2003-03-02 06:37:41 by stevenj]
+  updated nodes
+
+    M ./doc/fftw3.texi -5 +9
+
+Sun Mar  2 01:37:19 EST 2003  stevenj
+  * [project @ 2003-03-02 06:37:19 by stevenj]
+  wrote most of Fortran chapter
+
+    M ./doc/fftw3.texi -1 +208
+
+Sun Mar  2 00:58:37 EST 2003  stevenj
+  * [project @ 2003-03-02 05:58:37 by stevenj]
+  citation
+
+    M ./doc/fftw3.texi -2 +2
+
+Sun Mar  2 00:57:22 EST 2003  stevenj
+  * [project @ 2003-03-02 05:57:22 by stevenj]
+  added parallel FFTW chapter
+
+    M ./doc/fftw3.texi -3 +209
+
+Sat Mar  1 20:42:23 EST 2003  stevenj
+  * [project @ 2003-03-02 01:42:23 by stevenj]
+  typo
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar  1 20:34:38 EST 2003  stevenj
+  * [project @ 2003-03-02 01:34:38 by stevenj]
+  added inlining to TODO
+
+    M ./TODO +2
+
+Sat Mar  1 19:36:26 EST 2003  stevenj
+  * [project @ 2003-03-02 00:36:26 by stevenj]
+  added K
+
+    M ./CONVENTIONS -1 +2
+
+Sat Mar  1 19:15:18 EST 2003  stevenj
+  * [project @ 2003-03-02 00:15:18 by stevenj]
+  use K for constants
+
+    M ./dft/zero.c -3 +3
+    M ./kernel/trig1.c -5 +5
+    M ./rdft/generic.c -8 +8
+    M ./rdft/problem.c -3 +3
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft2-radix2.c -11 +11
+    M ./reodft/redft00e-r2hc.c -3 +3
+    M ./reodft/reodft010e-r2hc.c -11 +11
+    M ./reodft/reodft11e-r2hc.c -7 +7
+    M ./reodft/reodft11e-radix2.c -11 +11
+    M ./reodft/rodft00e-r2hc.c -3 +3
+
+Sat Mar  1 19:14:54 EST 2003  stevenj
+  * [project @ 2003-03-02 00:14:54 by stevenj]
+  fixed cross-ref
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Mar  1 19:14:16 EST 2003  stevenj
+  * [project @ 2003-03-02 00:14:16 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -6 +6
+
+Sat Mar  1 18:50:43 EST 2003  stevenj
+  * [project @ 2003-03-01 23:50:43 by stevenj]
+  cleanup
+
+    M ./doc/fftw3.texi -8 +8
+
+Sat Mar  1 18:46:38 EST 2003  stevenj
+  * [project @ 2003-03-01 23:46:38 by stevenj]
+  "words of wisdom" by itself is a little too obscure
+
+    M ./doc/fftw3.texi -6 +6
+
+Sat Mar  1 18:43:21 EST 2003  stevenj
+  * [project @ 2003-03-01 23:43:21 by stevenj]
+  re-added multi-dimensional array stuff
+
+    M ./doc/fftw3.texi -1 +203
+
+Sat Mar  1 18:15:22 EST 2003  stevenj
+  * [project @ 2003-03-01 23:15:22 by stevenj]
+  added alignment section
+
+    M ./doc/fftw3.texi -4 +93
+
+Sat Mar  1 16:34:21 EST 2003  stevenj
+  * [project @ 2003-03-01 21:34:21 by stevenj]
+  shrunk code
+
+    M ./reodft/reodft11e-r2hc-odd.c -147 +57
+
+Fri Feb 28 20:22:00 EST 2003  stevenj
+  * [project @ 2003-03-01 01:22:00 by stevenj]
+  slight compression
+
+    M ./reodft/reodft11e-r2hc-odd.c -146 +122
+
+Fri Feb 28 19:01:20 EST 2003  stevenj
+  * [project @ 2003-03-01 00:01:20 by stevenj]
+  style
+
+    M ./doc/fftw3.texi -7 +7
+    M ./reodft/reodft11e-radix2.c -2 +2
+
+Fri Feb 28 18:46:53 EST 2003  stevenj
+  * [project @ 2003-02-28 23:46:53 by stevenj]
+  noted not in API
+
+    M ./CONVENTIONS -1 +1
+
+Fri Feb 28 18:43:14 EST 2003  stevenj
+  * [project @ 2003-02-28 23:43:14 by stevenj]
+  more updates
+
+    M ./CONVENTIONS -3 +4
+
+Fri Feb 28 18:38:42 EST 2003  stevenj
+  * [project @ 2003-02-28 23:38:42 by stevenj]
+  slight updates
+
+    M ./CONVENTIONS -3 +4
+
+Fri Feb 28 18:28:58 EST 2003  stevenj
+  * [project @ 2003-02-28 23:28:58 by stevenj]
+  great const-ification of apply/solve and print
+
+    M ./api/f77funcs.c -1 +1
+    M ./api/fftw3.h -2 +2
+    M ./api/print-plan.c -1 +1
+    M ./dft/buffered.c -5 +5
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -3 +3
+    M ./dft/ct-ditf.c -3 +3
+    M ./dft/ct.c -3 +3
+    M ./dft/dft.h -3 +3
+    M ./dft/direct.c -5 +5
+    M ./dft/generic.c -4 +4
+    M ./dft/indirect.c -7 +7
+    M ./dft/nop.c -3 +3
+    M ./dft/rader.c -12 +12
+    M ./dft/rank-geq2.c -5 +5
+    M ./dft/rank0.c -12 +12
+    M ./dft/solve.c -3 +3
+    M ./dft/vrank-geq1.c -5 +5
+    M ./dft/vrank2-transpose.c -5 +5
+    M ./dft/vrank3-transpose.c -5 +5
+    M ./kernel/ifftw.h -3 +3
+    M ./rdft/buffered.c -5 +5
+    M ./rdft/buffered2.c -7 +7
+    M ./rdft/dft-r2hc.c -5 +5
+    M ./rdft/dht-r2hc.c -5 +5
+    M ./rdft/dht-rader.c -4 +4
+    M ./rdft/direct.c -7 +7
+    M ./rdft/direct2.c -7 +7
+    M ./rdft/generic.c -6 +6
+    M ./rdft/hc2hc-buf.c -5 +5
+    M ./rdft/hc2hc-dif.c -3 +3
+    M ./rdft/hc2hc-dit.c -3 +3
+    M ./rdft/hc2hc.c -3 +3
+    M ./rdft/indirect.c -7 +7
+    M ./rdft/nop.c -3 +3
+    M ./rdft/nop2.c -3 +3
+    M ./rdft/rader-hc2hc.c -6 +6
+    M ./rdft/rank-geq2-rdft2.c -7 +7
+    M ./rdft/rank-geq2.c -5 +5
+    M ./rdft/rank0-rdft2.c -9 +9
+    M ./rdft/rank0.c -8 +8
+    M ./rdft/rdft-dht.c -9 +9
+    M ./rdft/rdft.h -5 +5
+    M ./rdft/rdft2-radix2.c -12 +12
+    M ./rdft/solve.c -3 +3
+    M ./rdft/solve2.c -3 +3
+    M ./rdft/vrank-geq1-rdft2.c -7 +7
+    M ./rdft/vrank-geq1.c -5 +5
+    M ./rdft/vrank2-transpose.c -5 +5
+    M ./rdft/vrank3-transpose.c -5 +5
+    M ./reodft/redft00e-r2hc.c -5 +5
+    M ./reodft/reodft010e-r2hc.c -11 +11
+    M ./reodft/reodft11e-r2hc-odd.c -7 +7
+    M ./reodft/reodft11e-r2hc.c -7 +7
+    M ./reodft/reodft11e-radix2.c -9 +9
+    M ./reodft/rodft00e-r2hc.c -5 +5
+    M ./threads/ct-dit.c -3 +3
+    M ./threads/dft-vrank-geq1.c -5 +5
+    M ./threads/hc2hc-dif.c -3 +3
+    M ./threads/hc2hc-dit.c -3 +3
+    M ./threads/rdft-vrank-geq1.c -5 +5
+    M ./threads/vrank-geq1-rdft2.c -7 +7
+
+Fri Feb 28 17:51:15 EST 2003  stevenj
+  * [project @ 2003-02-28 22:51:15 by stevenj]
+  make fftw_execute take a const plan, to remind the user that it is re-entrant (or should be)...
+
+    M ./api/execute-dft-c2r.c -1 +1
+    M ./api/execute-dft-r2c.c -1 +1
+    M ./api/execute-dft.c -1 +1
+    M ./api/execute-r2r.c -1 +1
+    M ./api/execute.c -1 +1
+    M ./api/f77funcs.c -5 +6
+    M ./api/fftw3.h -6 +6
+    M ./doc/fftw3.texi -2 +2
+
+Fri Feb 28 17:29:40 EST 2003  stevenj
+  * [project @ 2003-02-28 22:29:40 by stevenj]
+  weakening
+
+    M ./doc/fftw3.texi -2 +2
+
+Fri Feb 28 17:28:48 EST 2003  stevenj
+  * [project @ 2003-02-28 22:28:48 by stevenj]
+  note
+
+    M ./doc/fftw3.texi -4 +4
+
+Fri Feb 28 17:27:10 EST 2003  stevenj
+  * [project @ 2003-02-28 22:27:10 by stevenj]
+  footnote about why DHT is provided
+
+    M ./doc/fftw3.texi -7 +12
+
+Fri Feb 28 15:07:03 EST 2003  stevenj
+  * [project @ 2003-02-28 20:07:03 by stevenj]
+  index
+
+    M ./doc/fftw3.texi -1 +2
+
+Fri Feb 28 15:05:48 EST 2003  stevenj
+  * [project @ 2003-02-28 20:05:48 by stevenj]
+  added DHT tutorial
+
+    M ./doc/fftw3.texi -2 +32
+
+Fri Feb 28 14:36:45 EST 2003  stevenj
+  * [project @ 2003-02-28 19:36:45 by stevenj]
+  fixed O(n log n)
+
+    M ./doc/fftw3.texi -4 +16
+
+Fri Feb 28 14:12:15 EST 2003  stevenj
+  * [project @ 2003-02-28 19:08:03 by stevenj]
+  whoops
+
+    M ./doc/fftw3.texi -4 +10
+
+Fri Feb 28 14:06:22 EST 2003  stevenj
+  * [project @ 2003-02-28 19:06:22 by stevenj]
+  slight improvements
+
+    M ./doc/fftw3.texi -2 +12
+
+Fri Feb 28 00:55:50 EST 2003  stevenj
+  * [project @ 2003-02-28 05:55:50 by stevenj]
+  addition
+
+    M ./doc/fftw3.texi -3 +5
+
+Fri Feb 28 00:54:09 EST 2003  stevenj
+  * [project @ 2003-02-28 05:54:09 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -7 +8
+
+Thu Feb 27 23:49:37 EST 2003  stevenj
+  * [project @ 2003-02-28 04:49:37 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Thu Feb 27 23:43:56 EST 2003  stevenj
+  * [project @ 2003-02-28 04:43:56 by stevenj]
+  slight changes
+
+    M ./doc/fftw3.texi -15 +21
+
+Thu Feb 27 23:27:48 EST 2003  stevenj
+  * [project @ 2003-02-28 04:27:48 by stevenj]
+  added R{E,O}DFTab tutorial
+
+    M ./doc/fftw3.texi -9 +105
+
+Thu Feb 27 17:24:20 EST 2003  stevenj
+  * [project @ 2003-02-27 22:24:20 by stevenj]
+  fixes
+
+    M ./doc/fftw3.texi -12 +11
+
+Thu Feb 27 17:20:42 EST 2003  stevenj
+  * [project @ 2003-02-27 22:20:42 by stevenj]
+  fixes
+
+    M ./doc/fftw3.texi -24 +26
+
+Thu Feb 27 17:11:54 EST 2003  stevenj
+  * [project @ 2003-02-27 22:11:22 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -4 +4
+
+Thu Feb 27 17:07:45 EST 2003  stevenj
+  * [project @ 2003-02-27 22:07:45 by stevenj]
+  documented r2hc/hc2r
+
+    M ./doc/fftw3.texi -6 +86
+
+Thu Feb 27 16:19:16 EST 2003  stevenj
+  * [project @ 2003-02-27 21:19:16 by stevenj]
+  minor changes
+
+    M ./doc/fftw3.texi -24 +26
+
+Thu Feb 27 13:54:06 EST 2003  stevenj
+  * [project @ 2003-02-27 18:54:06 by stevenj]
+  timed planner and unifying radix-2 butterfly loops are not critical for release
+
+    M ./TODO -8 +8
+
+Thu Feb 27 13:51:20 EST 2003  stevenj
+  * [project @ 2003-02-27 18:51:20 by stevenj]
+  reodft/verify.c no longer exists
+
+    M ./TODO -2
+
+Thu Feb 27 13:44:19 EST 2003  stevenj
+  * [project @ 2003-02-27 18:44:19 by stevenj]
+  optimization: REDFT00 of size 2 is same as R2HC
+
+    M ./rdft/problem.c -1 +5
+
+Thu Feb 27 12:35:33 EST 2003  stevenj
+  * [project @ 2003-02-27 17:35:33 by stevenj]
+  R{E,O}DFT01 of size-1 is identity
+
+    M ./rdft/problem.c -3 +4
+
+Thu Feb 27 12:15:10 EST 2003  stevenj
+  * [project @ 2003-02-27 17:15:10 by stevenj]
+  minor simplification
+
+    M ./reodft/reodft11e-r2hc-odd.c -31 +31
+
+Thu Feb 27 02:46:31 EST 2003  stevenj
+  * [project @ 2003-02-27 07:46:31 by stevenj]
+  fixed add count
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+
+Thu Feb 27 02:25:04 EST 2003  stevenj
+  * [project @ 2003-02-27 07:25:04 by stevenj]
+  whoops
+
+    M ./reodft/reodft11e-r2hc-odd.c -5 +5
+
+Thu Feb 27 02:22:03 EST 2003  stevenj
+  * [project @ 2003-02-27 07:22:03 by stevenj]
+  another optimization
+
+    M ./reodft/reodft11e-r2hc-odd.c -53 +53
+
+Thu Feb 27 01:43:00 EST 2003  stevenj
+  * [project @ 2003-02-27 06:43:00 by stevenj]
+  added op counts
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +4
+    M ./reodft/reodft11e-radix2.c -2 +9
+
+Thu Feb 27 01:29:32 EST 2003  stevenj
+  * [project @ 2003-02-27 06:29:32 by stevenj]
+  cleanup
+
+    M ./reodft/reodft11e-r2hc-odd.c -53 +55
+
+Thu Feb 27 01:17:23 EST 2003  stevenj
+  * [project @ 2003-02-27 06:17:23 by stevenj]
+  typo in comment
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+
+Thu Feb 27 01:13:49 EST 2003  stevenj
+  * [project @ 2003-02-27 06:13:49 by stevenj]
+  fixed comment
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+
+Thu Feb 27 01:12:05 EST 2003  stevenj
+  * [project @ 2003-02-27 06:12:05 by stevenj]
+  use E instead of R
+
+    M ./reodft/reodft11e-r2hc-odd.c -9 +9
+
+Thu Feb 27 01:05:39 EST 2003  stevenj
+  * [project @ 2003-02-27 06:05:39 by stevenj]
+  more unrolling to eliminate if statements in loops, for speedups of 25-40%
+
+    M ./reodft/reodft11e-r2hc-odd.c -25 +28
+
+Thu Feb 27 00:27:00 EST 2003  stevenj
+  * [project @ 2003-02-27 05:27:00 by stevenj]
+  some loop splitting to touch each element of output buf only once and eliminate some conditionals...speeds up by 30-40%
+
+    M ./reodft/reodft11e-r2hc-odd.c -31 +167
+
+Wed Feb 26 17:48:26 EST 2003  stevenj
+  * [project @ 2003-02-26 22:48:26 by stevenj]
+  comma
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+
+Wed Feb 26 17:46:17 EST 2003  stevenj
+  * [project @ 2003-02-26 22:46:17 by stevenj]
+  pointer to odd case
+
+    M ./reodft/reodft11e-radix2.c -1 +3
+
+Wed Feb 26 17:40:54 EST 2003  stevenj
+  * [project @ 2003-02-26 22:40:54 by stevenj]
+  precision -> accuracy (c.f. Kahan)
+
+    M ./reodft/reodft11e-r2hc.c -2 +2
+
+Wed Feb 26 17:36:13 EST 2003  stevenj
+  * [project @ 2003-02-26 22:36:13 by stevenj]
+  added time limit for wisdom generation
+
+    M ./Makefile.am -1 +4
+    M ./libbench2/bench-user.h -1 +2
+    M ./libbench2/problem.c -1 +4
+    M ./tools/fftw-wisdom.c -16 +59
+    M ./tools/fftw_wisdom.1.in +6
+
+Wed Feb 26 13:24:36 EST 2003  stevenj
+  * [project @ 2003-02-26 18:24:36 by stevenj]
+  caps
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +2
+
+Tue Feb 25 20:56:01 EST 2003  stevenj
+  * [project @ 2003-02-26 01:56:01 by stevenj]
+  another note
+
+    M ./reodft/reodft11e-r2hc-odd.c -2 +3
+
+Tue Feb 25 20:54:57 EST 2003  stevenj
+  * [project @ 2003-02-26 01:54:57 by stevenj]
+  note
+
+    M ./reodft/reodft11e-r2hc-odd.c -4 +5
+
+Tue Feb 25 20:42:08 EST 2003  stevenj
+  * [project @ 2003-02-26 01:42:08 by stevenj]
+  added new, more accurate (hopefully) reodft11 algorithms; added --disable-debug-malloc; added --impulse-accuracy-rounds=rounds flags to libbench2 for impulse-response accuracy tests
+
+    A ./reodft/reodft11e-r2hc-odd.c
+    A ./reodft/reodft11e-radix2.c
+    M ./configure.ac +8
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/ifftw.h -3 +3
+    M ./libbench2/bench-main.c -2 +8
+    M ./libbench2/bench.h -2 +2
+    M ./libbench2/verify-dft.c -3 +4
+    M ./libbench2/verify-lib.c -18 +76
+    M ./libbench2/verify-r2r.c -7 +39
+    M ./libbench2/verify-rdft2.c -3 +4
+    M ./libbench2/verify.c -7 +10
+    M ./libbench2/verify.h -4 +7
+    M ./reodft/Makefile.am -1 +4
+    M ./reodft/conf.c -2 +4
+    M ./reodft/reodft.h -1 +3
+    M ./reodft/reodft11e-r2hc-odd.c +273
+    M ./reodft/reodft11e-r2hc.c -2 +8
+    M ./reodft/reodft11e-radix2.c +506
+    M ./tests/bench.c -1 +1
+
+Sun Feb 23 14:07:48 EST 2003  athena
+  * [project @ 2003-02-23 19:07:48 by athena]
+  fftw_wisdom.1 is in $builddir, not $srcdir
+
+    M ./tools/Makefile.am -1 +1
+
+Mon Feb 17 03:42:19 EST 2003  stevenj
+  * [project @ 2003-02-17 08:42:19 by stevenj]
+  pde
+
+    M ./doc/fftw3.texi -2 +2
+
+Mon Feb 17 03:40:19 EST 2003  stevenj
+  * [project @ 2003-02-17 08:40:19 by stevenj]
+  consistent number
+
+    M ./doc/fftw3.texi -4 +4
+
+Mon Feb 17 03:39:02 EST 2003  stevenj
+  * [project @ 2003-02-17 08:39:02 by stevenj]
+  started r2r doc
+
+    M ./doc/fftw3.texi -1 +65
+
+Mon Feb 17 02:31:51 EST 2003  stevenj
+  * [project @ 2003-02-17 07:31:50 by stevenj]
+  rfftwnd
+
+    A ./doc/rfftwnd.fig
+    A ./doc/rfftwnd.gif
+    M ./doc/Makefile.am +11
+    M ./doc/fftw3.texi -26 +147
+    M ./doc/rfftwnd.fig +1148
+    M ./doc/rfftwnd.gif
+
+Sat Feb 15 17:02:07 EST 2003  stevenj
+  * [project @ 2003-02-15 22:02:07 by stevenj]
+  continued
+
+    M ./doc/fftw3.texi -4 +32
+
+Sat Feb 15 15:16:26 EST 2003  stevenj
+  * [project @ 2003-02-15 20:16:26 by stevenj]
+  started r2c/c2r docs
+
+    M ./doc/fftw3.texi -8 +87
+
+Sat Feb 15 01:12:52 EST 2003  stevenj
+  * [project @ 2003-02-15 06:12:52 by stevenj]
+  added r{e,o}dft11 accuracy test
+
+    M ./libbench2/verify-r2r.c -23 +45
+
+Sat Feb 15 00:42:48 EST 2003  stevenj
+  * [project @ 2003-02-15 05:42:48 by stevenj]
+  added more r2r accuracy checks
+
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -10 +4
+    M ./libbench2/verify-r2r.c -1 +127
+    M ./libbench2/verify-rdft2.c -2 +2
+    M ./libbench2/verify.h -2 +3
+
+Fri Feb 14 19:19:54 EST 2003  athena
+  * [project @ 2003-02-15 00:19:54 by athena]
+  $< is a GNUism
+
+    M ./tools/Makefile.am -1 +1
+
+Wed Feb 12 21:02:16 EST 2003  stevenj
+  * [project @ 2003-02-13 02:02:16 by stevenj]
+  r2r test cases are in
+
+    M ./TODO -1 +1
+
+Wed Feb 12 21:01:28 EST 2003  stevenj
+  * [project @ 2003-02-13 02:01:28 by stevenj]
+  added vector radix to TODO
+
+    M ./TODO +2
+
+Wed Feb 12 17:21:33 EST 2003  stevenj
+  * [project @ 2003-02-12 22:21:33 by stevenj]
+  fixed cross-ref
+
+    M ./tools/fftw_wisdom.1.in -1 +1
+
+Wed Feb 12 17:19:56 EST 2003  stevenj
+  * [project @ 2003-02-12 22:19:56 by stevenj]
+  shorter synopsis
+
+    M ./tools/fftw_wisdom.1.in -1 +1
+
+Wed Feb 12 12:53:19 EST 2003  stevenj
+  * [project @ 2003-02-12 17:53:19 by stevenj]
+  obsolete
+
+    R ./tests/debug.h
+
+Wed Feb 12 12:52:53 EST 2003  stevenj
+  * [project @ 2003-02-12 17:52:53 by stevenj]
+  removed old dotens
+
+    R ./tests/dotens.c
+    R ./tests/dotens2.c
+
+Wed Feb 12 12:52:16 EST 2003  stevenj
+  * [project @ 2003-02-12 17:52:16 by stevenj]
+  removed old verify files
+
+    R ./tests/verify-dft.c
+    R ./tests/verify-lib.c
+    R ./tests/verify-rdft.c
+    R ./tests/verify-reodft.c
+    R ./tests/verify.h
+
+Wed Feb 12 12:37:17 EST 2003  stevenj
+  * [project @ 2003-02-12 17:37:17 by stevenj]
+  disable threads support by default
+
+    M ./tools/fftw-wisdom.c -4 +11
+    M ./tools/fftw_wisdom.1.in -7
+
+Wed Feb 12 11:03:28 EST 2003  athena
+  * [project @ 2003-02-12 16:03:28 by athena]
+  Removed old test program
+
+    M ./tests/bench.c -390
+
+Tue Feb 11 22:30:55 EST 2003  stevenj
+  * [project @ 2003-02-12 03:30:55 by stevenj]
+  joke
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Tue Feb 11 22:27:44 EST 2003  stevenj
+  * [project @ 2003-02-12 03:27:44 by stevenj]
+  add --help and --version, to be GNU-lly correct
+
+    M ./tools/fftw-wisdom-to-conf.1 +7
+    M ./tools/fftw-wisdom-to-conf.in +36
+
+Tue Feb 11 22:27:18 EST 2003  stevenj
+  * [project @ 2003-02-12 03:27:18 by stevenj]
+  whoops
+
+    M ./tools/fftw_wisdom.1.in -1
+
+Tue Feb 11 22:17:35 EST 2003  stevenj
+  * [project @ 2003-02-12 03:15:03 by stevenj]
+  better help
+
+    M ./tools/fftw-wisdom.c -10 +43
+
+Tue Feb 11 21:47:35 EST 2003  stevenj
+  * [project @ 2003-02-12 02:47:35 by stevenj]
+  comma
+
+    M ./tools/fftw-wisdom-to-conf.1 -2 +1
+
+Tue Feb 11 21:46:12 EST 2003  stevenj
+  * [project @ 2003-02-12 02:46:12 by stevenj]
+  formatting
+
+    M ./tools/fftw-wisdom-to-conf.1 -1 +1
+
+Tue Feb 11 21:45:23 EST 2003  stevenj
+  * [project @ 2003-02-12 02:45:23 by stevenj]
+  man pages for tools
+
+    A ./tools/fftw-wisdom-to-conf.1
+    A ./tools/fftw_wisdom.1.in
+    M ./configure.ac +1
+    M ./tools/Makefile.am -1 +7
+    M ./tools/fftw-wisdom-to-conf.1 +85
+    M ./tools/fftw_wisdom.1.in +190
+
+Tue Feb 11 19:07:12 EST 2003  stevenj
+  * [project @ 2003-02-12 00:07:12 by stevenj]
+  added -V
+
+    M ./tools/fftw-wisdom.c +25
+
+Tue Feb 11 18:42:17 EST 2003  stevenj
+  * [project @ 2003-02-11 23:42:17 by stevenj]
+  added install-wisdom target
+
+    M ./Makefile.am +10
+
+Tue Feb 11 18:23:02 EST 2003  stevenj
+  * [project @ 2003-02-11 23:23:02 by stevenj]
+  another note
+
+    M ./NEWS -1 +2
+
+Tue Feb 11 17:32:56 EST 2003  stevenj
+  * [project @ 2003-02-11 22:32:56 by stevenj]
+  started r2r accuracy tests (only three kinds covered so far)
+
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -10 +13
+    M ./libbench2/verify-r2r.c +103
+    M ./libbench2/verify-rdft2.c -2 +2
+    M ./libbench2/verify.c -2 +2
+    M ./libbench2/verify.h -1 +4
+
+Mon Feb 10 22:04:18 EST 2003  stevenj
+  * [project @ 2003-02-11 03:04:18 by stevenj]
+  silence warning
+
+    M ./kernel/ifftw.h -3 +3
+
+Mon Feb 10 20:55:20 EST 2003  athena
+  * [project @ 2003-02-11 01:55:20 by athena]
+  gcc bug is now avoided.
+
+    M ./TODO -5
+
+Mon Feb 10 20:37:54 EST 2003  athena
+  * [project @ 2003-02-11 01:37:54 by athena]
+  Accuracy test
+
+    A ./libbench2/mp.c
+    M ./libbench2/Makefile.am -5 +6
+    M ./libbench2/bench-user.h -7 +1
+    M ./libbench2/mp.c +592
+    M ./libbench2/verify-dft.c -1 +22
+    M ./libbench2/verify-lib.c -16 +39
+    M ./libbench2/verify-r2r.c -17 +2
+    M ./libbench2/verify-rdft2.c -1 +23
+    M ./libbench2/verify.c -2 +21
+    M ./libbench2/verify.h +27
+
+Mon Feb 10 07:59:57 EST 2003  athena
+  * [project @ 2003-02-10 12:59:57 by athena]
+  There is no point in precomputing strides for the long-double code, as
+  multiplication by sizeof(long double) cannot be folded into the
+  addressing mode.  This change also fixes the gcc-2.95 bug that causes
+  miscompilation of certain codelets.
+
+    M ./kernel/ifftw.h -2 +2
+
+Mon Feb 10 02:54:35 EST 2003  stevenj
+  * [project @ 2003-02-10 07:54:35 by stevenj]
+  added random r2r tests
+
+    M ./tests/check.pl -3 +19
+
+Mon Feb 10 02:44:58 EST 2003  stevenj
+  * [project @ 2003-02-10 07:44:58 by stevenj]
+  whoops, bugfix: missing stride for ro10
+
+    M ./reodft/reodft010e-r2hc.c -2 +2
+
+Mon Feb 10 02:21:50 EST 2003  stevenj
+  * [project @ 2003-02-10 07:21:50 by stevenj]
+  formatting
+
+    M ./api/mapflags.c -1 +2
+
+Sun Feb  9 23:24:52 EST 2003  stevenj
+  * [project @ 2003-02-10 04:24:52 by stevenj]
+  flop counts for reodft
+
+    M ./reodft/redft00e-r2hc.c -5 +12
+    M ./reodft/reodft010e-r2hc.c -3 +16
+    M ./reodft/reodft11e-r2hc.c -3 +10
+    M ./reodft/rodft00e-r2hc.c -3 +12
+
+Sun Feb  9 23:22:15 EST 2003  stevenj
+  * [project @ 2003-02-10 04:22:15 by stevenj]
+  declare aligned_main
+
+    M ./libbench2/bench.h -1 +2
+
+Sun Feb  9 20:56:06 EST 2003  stevenj
+  * [project @ 2003-02-10 01:56:06 by stevenj]
+  corrected rader op counts
+
+    M ./rdft/dht-rader.c -6 +5
+    M ./rdft/rader-hc2hc.c -2 +2
+
+Sun Feb  9 20:25:32 EST 2003  stevenj
+  * [project @ 2003-02-10 01:25:32 by stevenj]
+  punctuation
+
+    M ./TODO -1 +1
+
+Sun Feb  9 20:25:17 EST 2003  stevenj
+  * [project @ 2003-02-10 01:25:17 by stevenj]
+  noted need for better estimator
+
+    M ./TODO +4
+
+Sun Feb  9 19:58:59 EST 2003  stevenj
+  * [project @ 2003-02-10 00:58:59 by stevenj]
+  noted F77 api fix for g77 mangling incompatibility
+
+    M ./NEWS +3
+
+Sun Feb  9 19:30:55 EST 2003  stevenj
+  * [project @ 2003-02-10 00:30:55 by stevenj]
+  build f77 header file of constants from fftw3.h
+
+    M ./api/Makefile.am -1 +10
+
+Sun Feb  9 19:04:53 EST 2003  stevenj
+  * [project @ 2003-02-10 00:04:53 by stevenj]
+  updates
+
+    M ./TODO -7 +7
+
+Sun Feb  9 19:03:34 EST 2003  stevenj
+  * [project @ 2003-02-10 00:03:34 by stevenj]
+  threads f77 api
+
+    A ./api/x77.h
+    A ./threads/f77api.c
+    A ./threads/f77funcs.c
+    M ./api/Makefile.am -1 +1
+    M ./api/f77api.c -12 +1
+    M ./api/x77.h +32
+    M ./threads/Makefile.am -1 +1
+    M ./threads/f77api.c +39
+    M ./threads/f77funcs.c +39
+
+Sun Feb  9 18:54:00 EST 2003  stevenj
+  * [project @ 2003-02-09 23:54:00 by stevenj]
+  finished f77 serial api
+
+    M ./api/f77api.c +15
+    M ./api/f77funcs.c +220
+
+Sun Feb  9 18:32:26 EST 2003  stevenj
+  * [project @ 2003-02-09 23:32:26 by stevenj]
+  added flops, slight cleanups
+
+    M ./api/f77api.c -8 +5
+    M ./api/f77funcs.c -21 +27
+
+Sun Feb  9 18:11:48 EST 2003  athena
+  * [project @ 2003-02-09 23:11:48 by athena]
+  Oops, forgot #include
+
+    M ./libbench2/aligned-main.c -1 +3
+
+Sun Feb  9 18:08:26 EST 2003  athena
+  * [project @ 2003-02-09 23:08:26 by athena]
+  Removed duplication of stack-alignment code
+
+    A ./libbench2/aligned-main.c
+    M ./libbench2/Makefile.am -5 +5
+    M ./libbench2/aligned-main.c +81
+    M ./libbench2/bench-main.c -62 +2
+    M ./libbench2/main.c -2 +2
+    M ./tools/fftw-wisdom.c -61 +1
+
+Sun Feb  9 15:48:15 EST 2003  stevenj
+  * [project @ 2003-02-09 20:48:15 by stevenj]
+  allow - to read problems from stdin
+
+    M ./tools/fftw-wisdom.c -2 +14
+
+Sun Feb  9 15:22:23 EST 2003  stevenj
+  * [project @ 2003-02-09 20:22:23 by stevenj]
+  added fftw-wisdom tool
+
+    A ./tools/fftw-wisdom.c
+    M ./tools/Makefile.am +15
+    M ./tools/fftw-wisdom.c +298
+
+Sun Feb  9 15:06:38 EST 2003  stevenj
+  * [project @ 2003-02-09 20:06:38 by stevenj]
+  elim. warning
+
+    M ./tests/bench.c -1 +1
+
+Sun Feb  9 14:24:19 EST 2003  stevenj
+  * [project @ 2003-02-09 19:24:19 by stevenj]
+  destroy_input should not contaminate flags of other problems
+
+    M ./tests/bench.c -1 +5
+
+Sun Feb  9 13:06:11 EST 2003  stevenj
+  * [project @ 2003-02-09 18:06:11 by stevenj]
+  updated
+
+    M ./ChangeLog -53 +1740
+
+Sun Feb  9 13:01:45 EST 2003  stevenj
+  * [project @ 2003-02-09 18:01:44 by stevenj]
+  removed overzealous inplace check, which caused problems for rdft2
+
+    M ./dft/rank-geq2.c -12 +1
+    M ./rdft/rank-geq2-rdft2.c -3 +2
+    M ./rdft/rank-geq2.c -12 +1
+
+Sun Feb  9 08:14:03 EST 2003  athena
+  * [project @ 2003-02-09 13:14:03 by athena]
+  Consistent syntax for RNK_MINFTY tensors
+
+    M ./kernel/tensor.c -4 +4
+
+Sun Feb  9 07:31:13 EST 2003  athena
+  * [project @ 2003-02-09 12:31:13 by athena]
+  lisply-correct tensor print.  We no longer need to parse tensors.
+
+    M ./kernel/tensor.c -3 +9
+
+Sun Feb  9 03:35:56 EST 2003  stevenj
+  * [project @ 2003-02-09 08:35:56 by stevenj]
+  removed completed items
+
+    M ./TODO -4
+
+Sun Feb  9 03:27:56 EST 2003  stevenj
+  * [project @ 2003-02-09 08:27:56 by stevenj]
+  slight renaming
+
+    M ./libbench2/verify-r2r.c -4 +4
+
+Sun Feb  9 03:15:28 EST 2003  stevenj
+  * [project @ 2003-02-09 08:15:28 by stevenj]
+  multi-dimensional r2r verifier
+
+    M ./libbench2/problem.c -2 +2
+    M ./libbench2/verify-r2r.c -81 +118
+
+Sun Feb  9 02:40:22 EST 2003  stevenj
+  * [project @ 2003-02-09 07:40:22 by stevenj]
+  comments
+
+    M ./libbench2/verify-r2r.c -1 +3
+
+Sun Feb  9 02:38:26 EST 2003  stevenj
+  * [project @ 2003-02-09 07:38:26 by stevenj]
+  slight simplification
+
+    M ./libbench2/verify-r2r.c -7 +2
+
+Sun Feb  9 02:36:25 EST 2003  stevenj
+  * [project @ 2003-02-09 07:36:25 by stevenj]
+  added 1d r2r verifier (triple ugh)
+
+    A ./libbench2/verify-r2r.c
+    M ./libbench2/Makefile.am -2 +2
+    M ./libbench2/allocate.c -1 +17
+    M ./libbench2/bench-user.h -2 +10
+    M ./libbench2/mflops.c +1
+    M ./libbench2/problem.c -7 +68
+    M ./libbench2/verify-r2r.c +616
+    M ./libbench2/verify.c -2 +2
+    M ./libbench2/zero.c -1 +4
+    M ./tests/bench.c +101
+
+Sat Feb  8 22:23:00 EST 2003  stevenj
+  * [project @ 2003-02-09 03:23:00 by stevenj]
+  added vector transforms to random tests
+
+    M ./tests/check.pl -3 +11
+
+Sat Feb  8 20:59:07 EST 2003  stevenj
+  * [project @ 2003-02-09 01:59:07 by stevenj]
+  whoops
+
+    M ./rdft/direct2.c -2 +1
+
+Sat Feb  8 19:52:58 EST 2003  stevenj
+  * [project @ 2003-02-09 00:52:58 by stevenj]
+  fixed interaction between dwims for sz/vecsz with rdft2 transforms
+
+    M ./libbench2/problem.c -15 +17
+
+Sat Feb  8 19:35:56 EST 2003  stevenj
+  * [project @ 2003-02-09 00:35:56 by stevenj]
+  added destroy_input flag/check
+
+    M ./libbench2/bench-user.h -1 +2
+    M ./libbench2/problem.c -1 +3
+    M ./libbench2/verify-dft.c -1 +4
+    M ./libbench2/verify-lib.c -1 +35
+    M ./libbench2/verify-rdft2.c -1 +5
+    M ./libbench2/verify.h +2
+    M ./tests/bench.c +7
+
+Sat Feb  8 19:11:58 EST 2003  stevenj
+  * [project @ 2003-02-09 00:11:57 by stevenj]
+  added rdft2 verifier
+
+    A ./api/extract-reim.c
+    A ./libbench2/aset.c
+    A ./libbench2/verify-rdft2.c
+    M ./api/Makefile.am -3 +3
+    R ./api/dfthelp.c
+    M ./api/extract-reim.c +36
+    M ./api/plan-guru-dft-c2r.c -1 +2
+    M ./api/plan-many-dft-c2r.c +2
+    M ./libbench2/Makefile.am -5 +5
+    M ./libbench2/allocate.c -7 +44
+    M ./libbench2/aset.c +10
+    M ./libbench2/bench-user.h -1 +4
+    M ./libbench2/bench.h -1 +2
+    M ./libbench2/problem.c -10 +42
+    M ./libbench2/tensor.c -1 +47
+    M ./libbench2/verify-dft.c -1 +3
+    M ./libbench2/verify-lib.c -16 +13
+    M ./libbench2/verify-rdft2.c +255
+    M ./libbench2/verify.c -2 +2
+    M ./libbench2/verify.h -1 +1
+    M ./libbench2/zero.c -2 +8
+    M ./tests/bench.c -8 +189
+    M ./tests/check.pl -1 +7
+
+Sat Feb  8 13:31:14 EST 2003  stevenj
+  * [project @ 2003-02-08 18:31:14 by stevenj]
+  an additional check for in-place case
+
+    M ./rdft/rdft2-radix2.c -3 +7
+
+Fri Feb  7 17:36:56 EST 2003  stevenj
+  * [project @ 2003-02-07 22:36:56 by stevenj]
+  slight fix: hc2r constraints are mostly determined by sub-plan
+
+    M ./rdft/rank0-rdft2.c -6 +7
+
+Fri Feb  7 16:28:55 EST 2003  stevenj
+  * [project @ 2003-02-07 21:28:55 by stevenj]
+  make radix2-dft inapplicable to in-place/split case (r == rio, iio >= rio + n/2+1 != r + 1)
+
+    M ./rdft/rdft2-radix2.c -3 +22
+
+Tue Feb  4 06:36:29 EST 2003  athena
+  * [project @ 2003-02-04 11:36:29 by athena]
+  Allow plnr->hook to be 0
+
+    M ./kernel/planner.c -12 +12
+    M ./tests/hook.c -7 +2
+
+Tue Feb  4 03:25:36 EST 2003  stevenj
+  * [project @ 2003-02-04 08:25:36 by stevenj]
+  moved dft stuff into verify-dft
+
+    M ./libbench2/bench-user.h -4 +2
+    M ./libbench2/verify-dft.c -9 +84
+    M ./libbench2/verify.c -84 +2
+
+Tue Feb  4 03:25:00 EST 2003  stevenj
+  * [project @ 2003-02-04 08:25:00 by stevenj]
+  cruft
+
+    M ./tests/hook.c -1
+
+Tue Feb  4 03:18:28 EST 2003  stevenj
+  * [project @ 2003-02-04 08:18:28 by stevenj]
+  further unify libbench2 and paranoid verifiers
+
+    M ./libbench2/bench-user.h -1 +6
+    M ./libbench2/problem.c -1 +2
+    M ./libbench2/verify.c -16 +21
+    M ./tests/bench.c -3 +5
+    M ./tests/hook.c -125 +58
+
+Sun Feb  2 01:45:37 EST 2003  stevenj
+  * [project @ 2003-02-02 06:45:37 by stevenj]
+  typo in comment
+
+    M ./api/import-wisdom-from-file.c -1 +1
+
+Sat Feb  1 09:30:03 EST 2003  athena
+  * [project @ 2003-02-01 14:30:03 by athena]
+  Fixed p==2 case
+
+    M ./kernel/primes.c -1 +4
+
+Sat Feb  1 09:23:43 EST 2003  athena
+  * [project @ 2003-02-01 14:23:43 by athena]
+  Incorporated new find_generator by Greg Dionne.
+
+    M ./kernel/primes.c -21 +35
+
+Fri Jan 31 20:46:24 EST 2003  athena
+  * [project @ 2003-02-01 01:46:24 by athena]
+  Removed nonportable call to gettext()
+
+    M ./libbench2/getopt.c +7
+
+Wed Jan 29 19:03:43 EST 2003  athena
+  * [project @ 2003-01-30 00:03:43 by athena]
+  uintptr_t is in <inttypes.h> in openbsd
+
+    M ./kernel/ifftw.h -1 +5
+
+Wed Jan 29 15:41:56 EST 2003  athena
+  * [project @ 2003-01-29 20:41:56 by athena]
+  Huge speedups in wisdom I/O.
+
+    M ./api/export-wisdom-to-string.c -2 +2
+    M ./api/export-wisdom.c -1 +1
+    M ./api/import-wisdom-from-file.c -4 +27
+    M ./api/mkprinter-file.c -5 +25
+    M ./kernel/debug.c -2 +2
+    M ./kernel/ifftw.h -2 +5
+    M ./kernel/planner.c -5 +9
+    M ./kernel/print.c -2 +7
+    M ./kernel/scan.c -4 +25
+    M ./tests/bench.c -1 +5
+
+Tue Jan 28 19:36:51 EST 2003  athena
+  * [project @ 2003-01-29 00:36:51 by athena]
+  Added appropriate warning against likely future bug.
+
+    M ./kernel/planner.c -1 +4
+
+Tue Jan 28 19:00:24 EST 2003  athena
+  * [project @ 2003-01-29 00:00:24 by athena]
+  Don't attempt to remove bogus wisdom entries.
+
+    M ./kernel/planner.c -10 +3
+
+Tue Jan 28 18:16:24 EST 2003  athena
+  * [project @ 2003-01-28 23:16:24 by athena]
+  Fixed a couple of very very very nasty bugs---pointers became
+  invalid after the hash table was relocated.
+
+    M ./kernel/planner.c -19 +29
+
+Tue Jan 28 07:34:10 EST 2003  athena
+  * [project @ 2003-01-28 12:34:10 by athena]
+  Read wisdom at can_do() time, otherwise wisdom is destroyed.
+
+    M ./tests/bench.c -2 +5
+
+Tue Jan 28 06:54:38 EST 2003  athena
+  * [project @ 2003-01-28 11:54:38 by athena]
+  More conservative inheritance of blessings
+
+    M ./kernel/planner.c -19 +24
+
+Tue Jan 28 06:50:20 EST 2003  athena
+  * [project @ 2003-01-28 11:50:20 by athena]
+  Print the same info as it is hashed
+
+    M ./dft/problem.c -3 +4
+
+Tue Jan 28 06:49:48 EST 2003  athena
+  * [project @ 2003-01-28 11:49:48 by athena]
+  Print name of executable when FAILURE
+
+    M ./tests/check.pl -2 +2
+
+Mon Jan 27 06:59:40 EST 2003  athena
+  * [project @ 2003-01-27 11:59:40 by athena]
+  New NO_SEARCH planner flag, which avoids searching altogether.
+  A wisdom entry must lead to a NO_SEARCH-grade plan, or else the
+  wisdom entry is bogus.
+
+    M ./kernel/ifftw.h -5 +10
+    M ./kernel/planner.c -13 +30
+
+Sun Jan 26 20:45:21 EST 2003  athena
+  * [project @ 2003-01-27 01:45:21 by athena]
+  Use cosl()/sinl() when appropriate
+
+    M ./libbench2/verify-lib.c -3 +20
+
+Sun Jan 26 16:29:18 EST 2003  athena
+  * [project @ 2003-01-26 21:29:18 by athena]
+  Use null pointers when estimating.  The estimator should never
+  time anything.
+
+    M ./kernel/planner.c -1 +6
+    M ./libbench2/problem.c -2 +1
+    M ./libbench2/speed.c -1 +2
+    M ./libbench2/verify.c -1 +3
+
+Sun Jan 26 15:19:01 EST 2003  stevenj
+  * [project @ 2003-01-26 20:19:01 by stevenj]
+  note
+
+    M ./api/f77api.c -1 +1
+
+Sun Jan 26 15:16:22 EST 2003  stevenj
+  * [project @ 2003-01-26 20:16:22 by stevenj]
+  support multiple mangling schemes with g77
+
+    A ./api/f77funcs.c
+    M ./api/Makefile.am +4
+    M ./api/f77api.c -130 +41
+    M ./api/f77funcs.c +139
+    M ./configure.ac +6
+
+Sun Jan 26 12:58:57 EST 2003  stevenj
+  * [project @ 2003-01-26 17:58:57 by stevenj]
+  fixed verbose, made random tests only use selected rank, use rank <= 4, fixed final flush_problems call
+
+    M ./tests/check.pl -4 +9
+
+Sun Jan 26 12:42:49 EST 2003  stevenj
+  * [project @ 2003-01-26 17:42:49 by stevenj]
+  fixed typo (count instead of maxcount)
+
+    M ./tests/check.pl -2 +2
+
+Sun Jan 26 12:12:07 EST 2003  stevenj
+  * [project @ 2003-01-26 17:12:07 by stevenj]
+  hypot is no longer used
+
+    M ./configure.ac -1 +1
+
+Sun Jan 26 12:07:43 EST 2003  stevenj
+  * [project @ 2003-01-26 17:07:43 by stevenj]
+  check for _alloca (MSVC)
+
+    M ./configure.ac -1 +1
+    M ./kernel/ifftw.h -1 +5
+
+Sun Jan 26 11:56:53 EST 2003  stevenj
+  * [project @ 2003-01-26 16:56:53 by stevenj]
+  slight fix in assert
+
+    M ./kernel/alloc.c -2 +2
+
+Sun Jan 26 11:55:39 EST 2003  athena
+  * [project @ 2003-01-26 16:55:39 by athena]
+  Allocate problem in all cases--- can_do may need correct pointers.
+
+    M ./libbench2/problem.c -1 +2
+    M ./libbench2/speed.c -2 +1
+    M ./libbench2/verify.c -3 +1
+    M ./tests/bench.c -6
+
+Sun Jan 26 11:51:27 EST 2003  athena
+  * [project @ 2003-01-26 16:51:27 by athena]
+  Nastier checks
+
+    M ./tests/bench.c -1 +8
+    M ./tests/check.pl -9 +23
+
+Sun Jan 26 11:51:16 EST 2003  athena
+  * [project @ 2003-01-26 16:51:16 by athena]
+  X(use_plan) is a relic.
+
+    M ./kernel/ifftw.h -3 +1
+    M ./kernel/plan.c -11 +3
+    M ./kernel/planner.c -5 +2
+
+Sun Jan 26 09:23:16 EST 2003  athena
+  * [project @ 2003-01-26 14:23:16 by athena]
+  Print full pathname of the bench executable, so that I don't get
+  confused when running multiple tests for different configurations.
+
+    M ./tests/Makefile.am -1 +1
+
+Sun Jan 26 07:35:46 EST 2003  athena
+  * [project @ 2003-01-26 12:35:46 by athena]
+  Split done() into done() and cleanup(), in order to test
+  multiple problems with the same planner from the command line.
+
+    M ./libbench2/bench-main.c -1 +2
+    M ./libbench2/bench-user.h -1 +2
+    M ./tests/bench.c -1 +8
+
+Sat Jan 25 20:44:49 EST 2003  athena
+  * [project @ 2003-01-26 01:44:49 by athena]
+  Improved readability
+
+    M ./kernel/alloc.c -6 +16
+
+Sat Jan 25 19:17:26 EST 2003  stevenj
+  * [project @ 2003-01-26 00:17:26 by stevenj]
+  comment
+
+    M ./kernel/alloc.c -3 +3
+
+Sat Jan 25 19:16:53 EST 2003  stevenj
+  * [project @ 2003-01-26 00:16:53 by stevenj]
+  added macos9 mpallocatealigned function
+
+    M ./kernel/alloc.c -1 +19
+
+Sat Jan 25 18:59:55 EST 2003  stevenj
+  * [project @ 2003-01-25 23:59:55 by stevenj]
+  sometimes __APPLE__ is defined instead of __MACOSX__
+
+    M ./kernel/alloc.c -2 +3
+
+Sat Jan 25 18:54:39 EST 2003  stevenj
+  * [project @ 2003-01-25 23:54:39 by stevenj]
+  macos x malloc is already 16-byte aligned
+
+    M ./kernel/alloc.c -2 +3
+
+Sat Jan 25 13:38:32 EST 2003  athena
+  * [project @ 2003-01-25 18:38:32 by athena]
+  Include <sys/types.h> because uintptr_t is defined there
+  on solaris.
+
+    M ./kernel/ifftw.h -2 +6
+
+Sat Jan 25 13:22:59 EST 2003  athena
+  * [project @ 2003-01-25 18:22:59 by athena]
+  Oops---forgot getopt_long
+
+    A ./libbench2/getopt1.c
+    M ./libbench2/Makefile.am -2 +2
+    M ./libbench2/getopt1.c +188
+
+Sat Jan 25 13:17:29 EST 2003  athena
+  * [project @ 2003-01-25 18:17:29 by athena]
+  Include default includes when checking for uintptr_t.
+  (Otherwise solaris breaks.)
+
+    M ./configure.ac -1 +1
+
+Sat Jan 25 12:39:52 EST 2003  athena
+  * [project @ 2003-01-25 17:39:52 by athena]
+  distribute check.pl
+
+    M ./tests/Makefile.am +1
+
+Sat Jan 25 12:38:34 EST 2003  athena
+  * [project @ 2003-01-25 17:38:34 by athena]
+  Check split format, too.
+
+    M ./tests/check.pl +4
+
+Sat Jan 25 11:48:19 EST 2003  athena
+  * [project @ 2003-01-25 16:48:19 by athena]
+  New tests, added make check
+
+    M ./tests/Makefile.am -1 +2
+    M ./tests/check.pl -3 +66
+
+Thu Jan 23 08:34:24 EST 2003  athena
+  * [project @ 2003-01-23 13:34:24 by athena]
+  More tests
+
+    M ./tests/check.pl -17 +83
+
+Tue Jan 21 20:32:12 EST 2003  athena
+  * [project @ 2003-01-22 01:32:09 by athena]
+  Deal with rnk(sz)=-infinity
+
+    M ./api/mktensor-iodims.c -4 +9
+    M ./api/mktensor-rowmajor.c -4 +6
+    M ./libbench2/problem.c -5 +2
+
+Tue Jan 21 10:07:16 EST 2003  athena
+  * [project @ 2003-01-21 15:07:16 by athena]
+  Crazy idea
+
+    M ./TODO +3
+
+Tue Jan 21 07:14:22 EST 2003  athena
+  * [project @ 2003-01-21 12:14:22 by athena]
+  Test program, still barely worthy of the name.
+
+    A ./tests/check.pl
+
+Mon Jan 20 08:29:21 EST 2003  athena
+  * [project @ 2003-01-20 13:29:21 by athena]
+  Stylistic changes
+
+    M ./libbench2/problem.c -15 +10
+
+Mon Jan 20 07:03:38 EST 2003  athena
+  * [project @ 2003-01-20 12:03:38 by athena]
+  Implemented flops api
+
+    A ./api/flops.c
+    M ./api/Makefile.am -10 +10
+    M ./api/fftw3.h -2 +4
+    M ./api/flops.c +27
+    M ./tests/bench.c +3
+
+Sun Jan 19 14:27:21 EST 2003  stevenj
+  * [project @ 2003-01-19 19:27:21 by stevenj]
+  cleanup
+
+    M ./libbench2/problem.c -31 +24
+
+Sun Jan 19 14:14:49 EST 2003  stevenj
+  * [project @ 2003-01-19 19:14:49 by stevenj]
+  'v' syntax now defaults to an 'internal' (stride 1) vector, which is a more interesting case and corresponds more closely to the intuitive notion of a 'vector' transform, while '*' does the old 'external' (stride n) vector
+
+    M ./libbench2/problem.c -4 +24
+
+Sun Jan 19 13:55:35 EST 2003  stevenj
+  * [project @ 2003-01-19 18:55:35 by stevenj]
+  removed '/' overloading
+
+    M ./libbench2/problem.c -2 +2
+
+Sun Jan 19 13:52:09 EST 2003  stevenj
+  * [project @ 2003-01-19 18:52:09 by stevenj]
+  get rid of '*' and ',' synonyms for 'x' in problem parser; there's no need to clutter the namespace with syntax we never use
+
+    M ./libbench2/problem.c -2 +2
+
+Sun Jan 19 07:28:27 EST 2003  athena
+  * [project @ 2003-01-19 12:28:27 by athena]
+  Signed/unsigned fixes.
+
+    M ./kernel/planner.c -4 +4
+
+Sun Jan 19 07:09:54 EST 2003  athena
+  * [project @ 2003-01-19 12:09:54 by athena]
+  Test split arrays.
+
+    M ./libbench2/bench-user.h -3 +9
+    M ./libbench2/verify-dft.c -14 +11
+    M ./libbench2/verify.c -13 +43
+    M ./libbench2/verify.h +1
+    M ./tests/bench.c -13 +50
+    M ./tests/hook.c -1 +4
+
+Sat Jan 18 23:46:57 EST 2003  stevenj
+  * [project @ 2003-01-19 04:46:57 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -17 +16
+
+Sat Jan 18 21:53:18 EST 2003  stevenj
+  * [project @ 2003-01-19 02:53:18 by stevenj]
+  caps
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Jan 18 21:52:51 EST 2003  stevenj
+  * [project @ 2003-01-19 02:52:51 by stevenj]
+  brackets
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Jan 18 21:52:32 EST 2003  stevenj
+  * [project @ 2003-01-19 02:52:32 by stevenj]
+  quote
+
+    M ./doc/fftw3.texi -1 +7
+
+Sat Jan 18 20:53:11 EST 2003  stevenj
+  * [project @ 2003-01-19 01:53:11 by stevenj]
+  referencing
+
+    M ./doc/fftw3.texi -4 +7
+
+Sat Jan 18 20:33:28 EST 2003  stevenj
+  * [project @ 2003-01-19 01:33:28 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Jan 18 20:31:41 EST 2003  stevenj
+  * [project @ 2003-01-19 01:31:41 by stevenj]
+  slight change
+
+    M ./doc/fftw3.texi -8 +8
+
+Sat Jan 18 20:31:22 EST 2003  athena
+  * [project @ 2003-01-19 01:31:22 by athena]
+  Print errors when --verify.
+
+    M ./libbench2/verify-dft.c -7 +12
+    M ./libbench2/verify-lib.c -30 +38
+    M ./libbench2/verify.h -12 +14
+    M ./tests/bench.c -5 +5
+
+Sat Jan 18 20:30:27 EST 2003  stevenj
+  * [project @ 2003-01-19 01:30:27 by stevenj]
+  improved description, noted that FFTW_ESTIMATE does not destroy arrays
+
+    M ./doc/fftw3.texi -7 +8
+
+Sat Jan 18 20:23:12 EST 2003  stevenj
+  * [project @ 2003-01-19 01:23:12 by stevenj]
+  FFTW_DEFAULTS isn't really needed
+
+    M ./api/fftw3.h -2 +1
+
+Sat Jan 18 20:21:09 EST 2003  stevenj
+  * [project @ 2003-01-19 01:21:09 by stevenj]
+  added FFTW_MEASURE synonym for FFTW_DEFAULTS
+
+    M ./api/fftw3.h -1 +2
+    M ./doc/fftw3.texi -6 +6
+
+Sat Jan 18 20:18:29 EST 2003  stevenj
+  * [project @ 2003-01-19 01:18:29 by stevenj]
+  slight change
+
+    M ./kernel/alloc.c -2 +2
+
+Sat Jan 18 20:16:08 EST 2003  athena
+  * [project @ 2003-01-19 01:16:08 by athena]
+  Clearer name
+
+    M ./tests/bench.c -2 +2
+
+Sat Jan 18 20:13:14 EST 2003  athena
+  * [project @ 2003-01-19 01:13:14 by athena]
+  Completed dft api test
+
+    M ./api/fftw3.h -12 +12
+    M ./libbench2/tensor.c -3 +3
+    M ./tests/bench.c -8 +65
+
+Sat Jan 18 20:07:33 EST 2003  stevenj
+  * [project @ 2003-01-19 01:07:33 by stevenj]
+  index
+
+    M ./doc/fftw3.texi -1 +2
+
+Sat Jan 18 20:05:50 EST 2003  stevenj
+  * [project @ 2003-01-19 01:05:50 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Jan 18 20:04:11 EST 2003  stevenj
+  * [project @ 2003-01-19 01:04:11 by stevenj]
+  parallel structure
+
+    M ./doc/fftw3.texi -2 +2
+
+Sat Jan 18 20:03:18 EST 2003  stevenj
+  * [project @ 2003-01-19 01:03:18 by stevenj]
+  fix
+
+    M ./doc/fftw3.texi -4 +4
+
+Sat Jan 18 20:00:24 EST 2003  stevenj
+  * [project @ 2003-01-19 01:00:24 by stevenj]
+  joke
+
+    M ./doc/fftw3.texi -5 +6
+
+Sat Jan 18 19:59:28 EST 2003  stevenj
+  * [project @ 2003-01-19 00:59:28 by stevenj]
+  recommendation to read tutorial in-order
+
+    M ./doc/fftw3.texi -1 +6
+
+Sat Jan 18 19:54:55 EST 2003  stevenj
+  * [project @ 2003-01-19 00:54:55 by stevenj]
+  expanded outline
+
+    M ./doc/fftw3.texi -7 +42
+
+Sat Jan 18 19:35:52 EST 2003  stevenj
+  * [project @ 2003-01-19 00:35:52 by stevenj]
+  clarification
+
+    M ./doc/fftw3.texi -3 +5
+
+Sat Jan 18 19:17:27 EST 2003  stevenj
+  * [project @ 2003-01-19 00:17:27 by stevenj]
+  draft complex-dft tutorial
+
+    M ./doc/fftw3.texi -25 +183
+
+Sat Jan 18 17:27:15 EST 2003  athena
+  * [project @ 2003-01-18 22:27:15 by athena]
+  Paranoid mode is back.  Fixed dwim to do what I mean.
+
+    A ./tests/hook.c
+    M ./libbench2/allocate.c -2 +1
+    M ./libbench2/bench-main.c -2 +1
+    M ./libbench2/bench-user.h -8 +1
+    M ./libbench2/bench.h -2 +1
+    M ./libbench2/can-do.c -2 +1
+    M ./libbench2/dotens2.c -2 +2
+    M ./libbench2/info.c -2 +1
+    M ./libbench2/problem.c -18 +21
+    M ./libbench2/report.c -2 +1
+    M ./libbench2/speed.c -2 +1
+    M ./libbench2/timer.c -2 +1
+    M ./libbench2/util.c -1
+    M ./libbench2/verify.c -4 +3
+    M ./libbench2/verify.h +7
+    M ./libbench2/zero.c -2 +1
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c +4
+    M ./tests/hook.c +191
+
+Sat Jan 18 17:13:51 EST 2003  stevenj
+  * [project @ 2003-01-18 22:13:51 by stevenj]
+  started tut.
+
+    M ./doc/fftw3.texi -14 +46
+
+Sat Jan 18 16:13:15 EST 2003  athena
+  * [project @ 2003-01-18 21:13:15 by athena]
+  Great renaming, so that we can include both bench-user.h and
+  ifftw.h to implement the paranoid-mode hook.
+
+    M ./libbench2/allocate.c -5 +5
+    M ./libbench2/bench-user.h -30 +28
+    M ./libbench2/bench.h -12 +12
+    M ./libbench2/can-do.c -2 +2
+    M ./libbench2/dotens2.c -3 +3
+    M ./libbench2/mflops.c -1 +1
+    M ./libbench2/problem.c -12 +12
+    M ./libbench2/report.c -10 +10
+    M ./libbench2/speed.c -2 +2
+    M ./libbench2/tensor.c -22 +22
+    M ./libbench2/verify-dft.c -2 +2
+    M ./libbench2/verify-lib.c -6 +6
+    M ./libbench2/verify.c -9 +9
+    M ./libbench2/verify.h -2 +2
+    M ./libbench2/zero.c -2 +2
+    M ./tests/bench.c -29 +29
+
+Sat Jan 18 15:41:18 EST 2003  athena
+  * [project @ 2003-01-18 20:41:18 by athena]
+  Trying to tweak the verifier so that I can use it in
+  bench.c for paranoid mode
+
+    M ./libbench2/bench-user.h -1 +23
+    M ./libbench2/problem.c -3 +3
+    M ./libbench2/verify-dft.c -66 +9
+    M ./libbench2/verify-lib.c -21 +18
+    M ./libbench2/verify.c -5 +62
+    M ./libbench2/verify.h -18 +3
+
+Sat Jan 18 10:24:05 EST 2003  athena
+  * [project @ 2003-01-18 15:24:05 by athena]
+  Added stride_factor for complex arrays.
+
+    M ./tests/bench.c -5 +5
+
+Sat Jan 18 10:02:11 EST 2003  athena
+  * [project @ 2003-01-18 15:02:11 by athena]
+  can_do now calls the planner.
+
+    M ./tests/bench.c -1 +6
+
+Sat Jan 18 09:59:24 EST 2003  athena
+  * [project @ 2003-01-18 14:59:24 by athena]
+  Call guru api in bench.c
+
+    M ./api/plan-guru-dft.c -2 +1
+    M ./tests/bench.c -3 +47
+
+Sat Jan 18 08:17:23 EST 2003  athena
+  * [project @ 2003-01-18 13:17:23 by athena]
+  Fixed prototype.
+
+    M ./libbench2/bench.h -1 +3
+    M ./libbench2/zero.c -2 +2
+
+Sat Jan 18 08:14:48 EST 2003  athena
+  * [project @ 2003-01-18 13:14:48 by athena]
+  Attempt to make the signed/unsigned use of flags consistent.
+
+    M ./api/api.h -2 +2
+    M ./api/apiplan.c -1 +1
+    M ./api/fftw3.h -25 +25
+    M ./api/mapflags.c -8 +10
+    M ./api/plan-dft-1d.c -1 +1
+    M ./api/plan-dft-2d.c -1 +1
+    M ./api/plan-dft-3d.c -1 +1
+    M ./api/plan-dft-c2r-1d.c -1 +1
+    M ./api/plan-dft-c2r-2d.c -1 +1
+    M ./api/plan-dft-c2r-3d.c -1 +1
+    M ./api/plan-dft-c2r.c -1 +1
+    M ./api/plan-dft-r2c-1d.c -1 +1
+    M ./api/plan-dft-r2c-2d.c -1 +1
+    M ./api/plan-dft-r2c-3d.c -1 +1
+    M ./api/plan-dft-r2c.c -1 +1
+    M ./api/plan-dft.c -1 +1
+    M ./api/plan-guru-dft-c2r.c -1 +1
+    M ./api/plan-guru-dft-r2c.c -1 +1
+    M ./api/plan-guru-dft.c -1 +1
+    M ./api/plan-guru-r2r.c -1 +1
+    M ./api/plan-many-dft-c2r.c -1 +1
+    M ./api/plan-many-dft-r2c.c -1 +1
+    M ./api/plan-many-dft.c -1 +1
+    M ./api/plan-many-r2r.c -1 +1
+    M ./api/plan-r2r-1d.c -1 +1
+    M ./api/plan-r2r-2d.c -1 +1
+    M ./api/plan-r2r-3d.c -1 +1
+    M ./api/plan-r2r.c -1 +1
+    M ./kernel/ifftw.h -2 +2
+
+Sat Jan 18 08:03:07 EST 2003  athena
+  * [project @ 2003-01-18 13:03:07 by athena]
+  Implemented useropt.
+
+    A ./libbench2/useropt.c
+    M ./libbench2/Makefile.am -2 +3
+    M ./libbench2/bench-main.c -1 +5
+    M ./libbench2/bench-user.h -1 +2
+    M ./libbench2/useropt.c +30
+    M ./tests/bench.c -7 +24
+
+Sat Jan 18 08:02:05 EST 2003  athena
+  * [project @ 2003-01-18 13:02:05 by athena]
+  The first map_flags pass must be transitive, i.e., always use the
+  latest flags value as opposed to the original value. (I think.)
+
+    M ./api/mapflags.c -11 +12
+
+Sat Jan 18 07:20:19 EST 2003  athena
+  * [project @ 2003-01-18 12:20:18 by athena]
+  Started working on verifier
+
+    A ./libbench2/dotens2.c
+    A ./libbench2/verify-dft.c
+    A ./libbench2/verify-lib.c
+    A ./libbench2/verify.h
+    M ./libbench2/Makefile.am -3 +3
+    M ./libbench2/bench-user.h -1 +4
+    M ./libbench2/dotens2.c +55
+    M ./libbench2/tensor.c -1 +8
+    M ./libbench2/verify-dft.c +119
+    M ./libbench2/verify-lib.c +399
+    M ./libbench2/verify.c -2 +9
+    M ./libbench2/verify.h +65
+    M ./tests/Makefile.am -4 +1
+
+Fri Jan 17 14:53:28 EST 2003  stevenj
+  * [project @ 2003-01-17 19:53:28 by stevenj]
+  added X(threads_cleanup)
+
+    M ./api/fftw3.h -1 +2
+    M ./threads/api.c +9
+    M ./threads/threads.c -6 +19
+    M ./threads/threads.h -1 +2
+
+Fri Jan 17 10:35:56 EST 2003  athena
+  * [project @ 2003-01-17 15:35:56 by athena]
+  Use C style for upper and lower array bounds.  Free tensors properly.
+
+    M ./libbench2/allocate.c -3 +5
+    M ./libbench2/tensor.c -2 +2
+
+Fri Jan 17 08:50:42 EST 2003  athena
+  * [project @ 2003-01-17 13:50:42 by athena]
+  Fixed ambiguous syntax
+
+    M ./libbench2/problem.c -2 +2
+
+Fri Jan 17 08:20:57 EST 2003  athena
+  * [project @ 2003-01-17 13:20:57 by athena]
+  Parse minus sign, bugfixes
+
+    M ./libbench2/problem.c -4 +17
+
+Fri Jan 17 08:11:56 EST 2003  athena
+  * [project @ 2003-01-17 13:11:56 by athena]
+  Skeleton libbench2 implemented (probably still buggy)
+
+    A ./libbench2/
+    A ./libbench2/Makefile.am
+    A ./libbench2/allocate.c
+    A ./libbench2/bench-main.c
+    A ./libbench2/bench-user.h
+    A ./libbench2/bench.h
+    A ./libbench2/can-do.c
+    A ./libbench2/caset.c
+    A ./libbench2/getopt-utils.c
+    A ./libbench2/getopt.c
+    A ./libbench2/getopt.h
+    A ./libbench2/info.c
+    A ./libbench2/main.c
+    A ./libbench2/mflops.c
+    A ./libbench2/ovtpvt.c
+    A ./libbench2/pow2.c
+    A ./libbench2/problem.c
+    A ./libbench2/report.c
+    A ./libbench2/speed.c
+    A ./libbench2/tensor.c
+    A ./libbench2/timer.c
+    A ./libbench2/util.c
+    A ./libbench2/verify.c
+    A ./libbench2/zero.c
+    M ./Makefile.am -1 +1
+    M ./configure.ac -1 +1
+    M ./libbench2/Makefile.am +13
+    M ./libbench2/allocate.c +55
+    M ./libbench2/bench-main.c +250
+    M ./libbench2/bench-user.h +206
+    M ./libbench2/bench.h +67
+    M ./libbench2/can-do.c +33
+    M ./libbench2/caset.c +12
+    M ./libbench2/getopt-utils.c +104
+    M ./libbench2/getopt.c +1062
+    M ./libbench2/getopt.h +180
+    M ./libbench2/info.c +59
+    M ./libbench2/main.c +40
+    M ./libbench2/mflops.c +25
+    M ./libbench2/ovtpvt.c +13
+    M ./libbench2/pow2.c +6
+    M ./libbench2/problem.c +172
+    M ./libbench2/report.c +159
+    M ./libbench2/speed.c +73
+    M ./libbench2/tensor.c +173
+    M ./libbench2/timer.c +241
+    M ./libbench2/util.c +216
+    M ./libbench2/verify.c +53
+    M ./libbench2/zero.c +36
+    M ./tests/Makefile.am -2 +2
+    M ./tests/bench.c -56 +93
+
+Fri Jan 17 04:23:37 EST 2003  athena
+  * [project @ 2003-01-17 09:23:37 by athena]
+  Formatting
+
+    M ./kernel/tensor4.c -10 +10
+
+Fri Jan 17 03:15:24 EST 2003  fftw
+  * [project @ 2003-01-17 08:15:24 by fftw]
+  slight updates
+
+    M ./doc/fftw3.texi -27 +34
+
+Fri Jan 17 01:44:44 EST 2003  stevenj
+  * [project @ 2003-01-17 06:44:44 by stevenj]
+  eliminated obsolete uimin/uimax
+
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/buffered.c -2 +2
+    M ./kernel/ifftw.h -3 +1
+    M ./kernel/minmax.c -11 +1
+    M ./kernel/tensor4.c -4 +4
+    M ./rdft/buffered2.c -4 +4
+    M ./rdft/rdft2-inplace-strides.c -2 +2
+    M ./rdft/rdft2-tensor-max-index.c -4 +4
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+
+Fri Jan 17 01:40:10 EST 2003  stevenj
+  * [project @ 2003-01-17 06:40:10 by stevenj]
+  threads needs to have its own library, lest all programs linking to libfftw3.so need -lpthread
+
+    A ./threads/api.c
+    M ./Makefile.am -1
+    M ./api/Makefile.am -4 +3
+    M ./api/configure.c -2
+    M ./api/fftw3.h -2 +3
+    R ./api/plan-with-nthreads.c
+    M ./tests/Makefile.am -1 +8
+    M ./tests/bench.c +5
+    M ./threads/Makefile.am -9 +6
+    M ./threads/api.c +56
+    M ./threads/threads.c -1 +1
+
+Thu Jan 16 19:53:46 EST 2003  stevenj
+  * [project @ 2003-01-17 00:53:46 by stevenj]
+  whoops
+
+    M ./api/f77api.c -2 +2
+
+Thu Jan 16 19:53:30 EST 2003  stevenj
+  * [project @ 2003-01-17 00:53:30 by stevenj]
+  better name
+
+    M ./api/f77api.c -2 +2
+
+Thu Jan 16 19:52:36 EST 2003  stevenj
+  * [project @ 2003-01-17 00:52:36 by stevenj]
+  added more functions
+
+    M ./api/f77api.c +68
+
+Thu Jan 16 16:57:06 EST 2003  stevenj
+  * [project @ 2003-01-16 21:57:06 by stevenj]
+  if 'long' is big enough, use it for mulmod in preference to 'long long'
+
+    M ./kernel/ifftw.h -1 +3
+
+Thu Jan 16 14:53:41 EST 2003  stevenj
+  * [project @ 2003-01-16 19:53:41 by stevenj]
+  use uintptr_t for pointer alignment arithmetic
+
+    M ./configure.ac -2 +9
+    M ./kernel/align.c -2 +2
+    M ./kernel/ifftw.h -5 +22
+
+Thu Jan 16 07:58:28 EST 2003  athena
+  * [project @ 2003-01-16 12:58:28 by athena]
+  More signed/unsigned cleanup
+
+    M ./kernel/planner.c -10 +10
+    M ./kernel/print.c -3 +3
+    M ./kernel/tensor.c -2 +2
+    M ./kernel/twiddle.c -2 +2
+    M ./rdft/problem.c -2 +2
+
+Thu Jan 16 07:57:40 EST 2003  athena
+  * [project @ 2003-01-16 12:57:40 by athena]
+  null function pointers are technically nonportable
+
+    M ./kernel/solvtab.c -2 +2
+
+Thu Jan 16 07:17:45 EST 2003  athena
+  * [project @ 2003-01-16 12:17:45 by athena]
+  Free short_options
+
+    M ./libbench/bench-main.c -2 +3
+
+Thu Jan 16 05:48:30 EST 2003  athena
+  * [project @ 2003-01-16 10:48:30 by athena]
+  Oops, forgot STACK_FREE
+
+    M ./kernel/alloc.c -15 +17
+    M ./kernel/ifftw.h -2 +3
+    M ./tests/bench.c -1 +1
+
+Thu Jan 16 05:40:39 EST 2003  athena
+  * [project @ 2003-01-16 10:40:39 by athena]
+  Do not require memalign() unless HAVE_SIMD
+
+    M ./kernel/alloc.c -6 +15
+    M ./kernel/ifftw.h -13 +18
+
+Thu Jan 16 01:03:31 EST 2003  stevenj
+  * [project @ 2003-01-16 06:03:31 by stevenj]
+  MS VC++ _aligned_malloc
+
+    M ./kernel/alloc.c -1 +7
+
+Thu Jan 16 00:44:45 EST 2003  stevenj
+  * [project @ 2003-01-16 05:44:45 by stevenj]
+  added api fftw_malloc/free
+
+    M ./api/fftw3.h -2 +5
+    M ./kernel/alloc.c -2 +10
+
+Thu Jan 16 00:43:48 EST 2003  stevenj
+  * [project @ 2003-01-16 05:43:48 by stevenj]
+  silence warning
+
+    M ./api/map-r2r-kind.c -1 +1
+
+Wed Jan 15 22:39:04 EST 2003  stevenj
+  * [project @ 2003-01-16 03:39:04 by stevenj]
+  send error output to stderr
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +1
+
+Wed Jan 15 13:20:35 EST 2003  athena
+  * [project @ 2003-01-15 18:20:35 by athena]
+  Pure paranoia.
+
+    M ./kernel/tensor7.c -3 +6
+
+Wed Jan 15 06:51:34 EST 2003  athena
+  * [project @ 2003-01-15 11:51:34 by athena]
+  Fixed formatting that was messed up by the conversion uint->int.
+  Ensure that iodims etc are kosher.
+
+    A ./kernel/tensor9.c
+    M ./api/api.h -13 +20
+    M ./api/apiplan.c -9 +9
+    M ./api/configure.c -5 +5
+    M ./api/dfthelp.c -1 +1
+    M ./api/execute-dft-c2r.c -1 +1
+    M ./api/execute-dft-r2c.c -1 +1
+    M ./api/execute-dft.c -1 +1
+    M ./api/execute-r2r.c -1 +1
+    M ./api/execute.c -1 +1
+    M ./api/export-wisdom-to-file.c -4 +4
+    M ./api/export-wisdom-to-string.c -6 +6
+    M ./api/export-wisdom.c -6 +6
+    M ./api/f77api.c -58 +46
+    M ./api/fftw3.h -14 +11
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -2 +2
+    M ./api/import-wisdom-from-file.c -5 +5
+    M ./api/import-wisdom-from-string.c -4 +4
+    M ./api/import-wisdom.c -6 +6
+    M ./api/map-r2r-kind.c -37 +17
+    M ./api/mapflags.c -4 +4
+    M ./api/mkprinter-file.c -2 +2
+    M ./api/mktensor-iodims.c -2 +23
+    M ./api/mktensor-rowmajor.c -4 +20
+    M ./api/plan-dft-1d.c -2 +2
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -3 +3
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -3 +3
+    M ./api/plan-dft-c2r.c -3 +3
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -3 +3
+    M ./api/plan-dft-r2c.c -3 +5
+    M ./api/plan-dft.c -4 +6
+    M ./api/plan-guru-dft-c2r.c -10 +10
+    M ./api/plan-guru-dft-r2c.c -10 +11
+    M ./api/plan-guru-dft.c -10 +11
+    M ./api/plan-guru-r2r.c -14 +16
+    M ./api/plan-many-dft-c2r.c -34 +21
+    M ./api/plan-many-dft-r2c.c -34 +21
+    M ./api/plan-many-dft.c -18 +19
+    M ./api/plan-many-r2r.c -22 +21
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -3 +3
+    M ./api/plan-r2r-3d.c -4 +4
+    M ./api/plan-r2r.c -4 +4
+    M ./api/plan-with-nthreads.c -3 +3
+    M ./api/print-plan.c -3 +3
+    M ./api/rdft2-pad.c -2 +2
+    M ./api/the-planner.c -5 +5
+    M ./dft/buffered.c -2 +2
+    M ./dft/ct.c -2 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/generic.c -1 +1
+    M ./dft/problem.c -4 +5
+    M ./dft/rader.c -1 +1
+    M ./dft/vrank-geq1.c -2 +2
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -10 +10
+    M ./kernel/print.c -3 +3
+    M ./kernel/tensor.c -2 +4
+    M ./kernel/tensor9.c +37
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/direct.c -2 +2
+    M ./rdft/direct2.c -2 +2
+    M ./rdft/generic.c -1 +1
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/problem.c -2 +6
+    M ./rdft/problem2.c -2 +5
+    M ./rdft/rader-hc2hc.c -1 +1
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./rdft/vrank2-transpose.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -2 +2
+    M ./threads/vrank-geq1-rdft2.c -2 +2
+
+Wed Jan 15 01:32:18 EST 2003  stevenj
+  * [project @ 2003-01-15 06:32:18 by stevenj]
+  added version stamp
+
+    M ./tools/fftw-wisdom-to-conf.in -1 +2
+
+Wed Jan 15 01:28:20 EST 2003  stevenj
+  * [project @ 2003-01-15 06:28:20 by stevenj]
+  added warning
+
+    M ./tools/fftw-wisdom-to-conf.in +1
+
+Wed Jan 15 01:23:25 EST 2003  stevenj
+  * [project @ 2003-01-15 06:23:25 by stevenj]
+  add fftw-wisdom-to-conf to BUILT_SOURCES
+
+    M ./tools/Makefile.am -1 +1
+
+Wed Jan 15 01:09:29 EST 2003  stevenj
+  * [project @ 2003-01-15 06:09:29 by stevenj]
+  added const
+
+    M ./tools/fftw-wisdom-to-conf.in -2 +2
+
+Wed Jan 15 01:04:10 EST 2003  stevenj
+  * [project @ 2003-01-15 06:04:10 by stevenj]
+  added wisdom-to-conf
+
+    A ./tools/
+    A ./tools/Makefile.am
+    A ./tools/fftw-wisdom-to-conf.in
+    M ./Makefile.am -1 +1
+    M ./configure.ac +3
+    M ./tools/Makefile.am +3
+    M ./tools/fftw-wisdom-to-conf.in +37
+
+Wed Jan 15 00:23:36 EST 2003  stevenj
+  * [project @ 2003-01-15 05:23:36 by stevenj]
+  include type prefix in wisdom preamble
+
+    M ./kernel/planner.c -2 +2
+
+Wed Jan 15 00:02:31 EST 2003  stevenj
+  * [project @ 2003-01-15 05:02:31 by stevenj]
+  updates
+
+    M ./TODO -4 +2
+
+Tue Jan 14 23:59:26 EST 2003  stevenj
+  * [project @ 2003-01-15 04:59:26 by stevenj]
+  check the_plan before printing
+
+    M ./tests/bench.c -2 +3
+
+Tue Jan 14 21:10:25 EST 2003  athena
+  * [project @ 2003-01-15 02:10:25 by athena]
+  Eliminated those unsigned values that would break LP64 machines.
+
+    M ./ChangeLog -1 +1
+    M ./api/api.h -16 +16
+    M ./api/apiplan.c -19 +19
+    M ./api/configure.c -5 +5
+    M ./api/dfthelp.c -5 +5
+    M ./api/execute-dft-c2r.c -1 +1
+    M ./api/execute-dft-r2c.c -1 +1
+    M ./api/execute-dft.c -1 +1
+    M ./api/execute-r2r.c -1 +1
+    M ./api/execute.c -1 +1
+    M ./api/export-wisdom-to-file.c -4 +4
+    M ./api/export-wisdom-to-string.c -13 +13
+    M ./api/export-wisdom.c -7 +7
+    M ./api/f77api.c -71 +69
+    M ./api/fftw3.h -134 +136
+    M ./api/forget-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c -8 +8
+    M ./api/import-wisdom-from-file.c -6 +6
+    M ./api/import-wisdom-from-string.c -6 +6
+    M ./api/import-wisdom.c -9 +11
+    M ./api/map-r2r-kind.c -17 +40
+    M ./api/mapflags.c -18 +18
+    M ./api/mkprinter-file.c -3 +3
+    M ./api/mktensor-iodims.c -9 +9
+    M ./api/mktensor-rowmajor.c -6 +5
+    M ./api/plan-dft-1d.c -3 +2
+    M ./api/plan-dft-2d.c -5 +5
+    M ./api/plan-dft-3d.c -5 +7
+    M ./api/plan-dft-c2r-1d.c -2 +2
+    M ./api/plan-dft-c2r-2d.c -5 +5
+    M ./api/plan-dft-c2r-3d.c -5 +7
+    M ./api/plan-dft-c2r.c -6 +3
+    M ./api/plan-dft-r2c-1d.c -2 +2
+    M ./api/plan-dft-r2c-2d.c -5 +5
+    M ./api/plan-dft-r2c-3d.c -5 +7
+    M ./api/plan-dft-r2c.c -6 +3
+    M ./api/plan-dft.c -6 +4
+    M ./api/plan-guru-dft-c2r.c -10 +10
+    M ./api/plan-guru-dft-r2c.c -10 +10
+    M ./api/plan-guru-dft.c -10 +10
+    M ./api/plan-guru-r2r.c -13 +14
+    M ./api/plan-many-dft-c2r.c -23 +36
+    M ./api/plan-many-dft-r2c.c -23 +36
+    M ./api/plan-many-dft.c -17 +18
+    M ./api/plan-many-r2r.c -18 +22
+    M ./api/plan-r2r-1d.c -3 +2
+    M ./api/plan-r2r-2d.c -7 +8
+    M ./api/plan-r2r-3d.c -8 +11
+    M ./api/plan-r2r.c -3 +4
+    M ./api/plan-with-nthreads.c -7 +7
+    M ./api/print-plan.c -3 +3
+    M ./api/rdft2-pad.c -10 +9
+    M ./api/the-planner.c -7 +7
+    M ./configure.ac -9 +3
+    M ./dft/buffered.c -11 +11
+    M ./dft/codelet-dft.h -11 +11
+    M ./dft/codelets/n.c -1 +1
+    M ./dft/codelets/t.c -1 +1
+    M ./dft/ct-dif.c -4 +4
+    M ./dft/ct-dit.c -4 +4
+    M ./dft/ct-ditbuf.c -6 +6
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/ct.c -5 +5
+    M ./dft/ct.h -2 +2
+    M ./dft/dft.h -2 +2
+    M ./dft/direct.c -3 +3
+    M ./dft/generic.c -5 +5
+    M ./dft/indirect.c -2 +2
+    M ./dft/k7/k7.c -9 +9
+    M ./dft/problem.c -3 +3
+    M ./dft/rader-omega.c -2 +2
+    M ./dft/rader.c -13 +13
+    M ./dft/rank-geq2.c -9 +9
+    M ./dft/rank0.c -7 +7
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+    M ./dft/vrank-geq1.c -11 +11
+    M ./dft/vrank2-transpose.c -4 +4
+    M ./dft/vrank3-transpose.c -10 +10
+    M ./dft/zero.c -3 +3
+    M ./genfft/gen_hc2hc.ml -4 +4
+    M ./genfft/gen_hc2r.ml -4 +4
+    M ./genfft/gen_notw.ml -4 +4
+    M ./genfft/gen_notw_c.ml -4 +4
+    M ./genfft/gen_r2hc.ml -4 +4
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twiddle_c.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./kernel/align.c -5 +5
+    M ./kernel/buffered.c -3 +3
+    M ./kernel/ct.c -1 +1
+    M ./kernel/hash.c -3 +3
+    M ./kernel/iabs.c -3 +3
+    M ./kernel/ifftw.h -71 +70
+    M ./kernel/md5-1.c -3 +3
+    M ./kernel/md5.c -4 +4
+    M ./kernel/minmax.c -3 +3
+    M ./kernel/ops.c -4 +4
+    M ./kernel/pickdim.c -6 +6
+    M ./kernel/planner.c -32 +32
+    M ./kernel/primes.c -14 +14
+    M ./kernel/print.c -5 +5
+    M ./kernel/rader.c -3 +3
+    M ./kernel/scan.c -3 +3
+    M ./kernel/tensor.c -9 +9
+    M ./kernel/tensor1.c -2 +2
+    M ./kernel/tensor2.c -3 +3
+    M ./kernel/tensor4.c -10 +10
+    M ./kernel/tensor5.c -6 +6
+    M ./kernel/tensor7.c -4 +4
+    M ./kernel/trig.c -4 +4
+    M ./kernel/twiddle.c -14 +14
+    M ./libbench/acopy.c -2 +2
+    M ./libbench/allocate.c -2 +2
+    M ./libbench/ascale.c -2 +2
+    M ./libbench/aset.c -2 +2
+    M ./libbench/bench-user.h -23 +23
+    M ./libbench/bench.h -2 +2
+    M ./libbench/caadd.c -2 +2
+    M ./libbench/cacopy.c -2 +2
+    M ./libbench/cascale.c -2 +2
+    M ./libbench/caset.c -2 +2
+    M ./libbench/casub.c -2 +2
+    M ./libbench/copy-c2h-1d-fftpack.c -1 +1
+    M ./libbench/copy-c2h-1d-halfcomplex.c -1 +1
+    M ./libbench/copy-c2h-1d-packed.c -2 +2
+    M ./libbench/copy-c2h-1d-unpacked-ri.c -1 +1
+    M ./libbench/copy-c2h-unpacked.c -1 +1
+    M ./libbench/copy-c2r-packed.c -2 +2
+    M ./libbench/copy-c2r-unpacked.c -1 +1
+    M ./libbench/copy-c2ri.c -2 +2
+    M ./libbench/copy-h2c-1d-fftpack.c -1 +1
+    M ./libbench/copy-h2c-1d-halfcomplex.c -1 +1
+    M ./libbench/copy-h2c-1d-packed.c -2 +2
+    M ./libbench/copy-h2c-1d-unpacked-ri.c -1 +1
+    M ./libbench/copy-h2c-unpacked.c -9 +9
+    M ./libbench/copy-r2c-packed.c -2 +2
+    M ./libbench/copy-r2c-unpacked.c -1 +1
+    M ./libbench/copy-ri2c.c -2 +2
+    M ./libbench/getopt-utils.c -2 +2
+    M ./libbench/getopt.c -3 +3
+    M ./libbench/log2.c -2 +2
+    M ./libbench/mp.c -20 +20
+    M ./libbench/pow2.c -1 +1
+    M ./libbench/prime.c -4 +4
+    M ./libbench/problem.c -2 +2
+    M ./libbench/timer.c -2 +2
+    M ./libbench/verify.c -36 +36
+    M ./rdft/buffered.c -10 +10
+    M ./rdft/buffered2.c -22 +22
+    M ./rdft/codelet-rdft.h -17 +17
+    M ./rdft/codelets/hc2r.c -1 +1
+    M ./rdft/codelets/hfb.c -1 +1
+    M ./rdft/codelets/r2hc.c -1 +1
+    M ./rdft/dft-r2hc.c -4 +4
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/dht-rader.c -6 +6
+    M ./rdft/direct.c -6 +6
+    M ./rdft/direct2.c -5 +5
+    M ./rdft/generic.c -8 +8
+    M ./rdft/hc2hc-buf.c -7 +7
+    M ./rdft/hc2hc-dif.c -4 +4
+    M ./rdft/hc2hc-dit.c -4 +4
+    M ./rdft/hc2hc.c -5 +5
+    M ./rdft/hc2hc.h -2 +2
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/problem.c -11 +11
+    M ./rdft/problem2.c -3 +3
+    M ./rdft/rader-hc2hc.c -12 +12
+    M ./rdft/rank-geq2-rdft2.c -9 +9
+    M ./rdft/rank-geq2.c -9 +9
+    M ./rdft/rank0-rdft2.c -4 +4
+    M ./rdft/rank0.c -5 +5
+    M ./rdft/rdft-dht.c -5 +5
+    M ./rdft/rdft.h -4 +4
+    M ./rdft/rdft2-inplace-strides.c -4 +4
+    M ./rdft/rdft2-radix2.c -15 +15
+    M ./rdft/rdft2-tensor-max-index.c -4 +4
+    M ./rdft/vrank-geq1-rdft2.c -12 +12
+    M ./rdft/vrank-geq1.c -11 +11
+    M ./rdft/vrank2-transpose.c -4 +4
+    M ./rdft/vrank3-transpose.c -10 +10
+    M ./reodft/redft00e-r2hc.c -7 +7
+    M ./reodft/reodft010e-r2hc.c -16 +16
+    M ./reodft/reodft11e-r2hc.c -10 +10
+    M ./reodft/rodft00e-r2hc.c -7 +7
+    M ./simd/3dnow.c -5 +5
+    M ./simd/sse.c -3 +3
+    M ./simd/sse2.c -3 +3
+    M ./tests/bench.c -13 +7
+    M ./tests/dotens.c -3 +3
+    M ./tests/dotens2.c -3 +3
+    M ./tests/trigtest.c -7 +7
+    M ./tests/verify-dft.c -4 +4
+    M ./tests/verify-lib.c -41 +41
+    M ./tests/verify-rdft.c -13 +13
+    M ./tests/verify-reodft.c -48 +48
+    M ./tests/verify.h -15 +15
+    M ./threads/ct-dit.c -5 +5
+    M ./threads/dft-vrank-geq1.c -15 +15
+    M ./threads/hc2hc-dif.c -6 +6
+    M ./threads/hc2hc-dit.c -6 +6
+    M ./threads/rdft-vrank-geq1.c -15 +15
+    M ./threads/threads.c -4 +4
+    M ./threads/threads.h -3 +3
+    M ./threads/vrank-geq1-rdft2.c -15 +15
+
+Tue Jan 14 15:14:29 EST 2003  stevenj
+  * [project @ 2003-01-14 20:14:29 by stevenj]
+  comments
+
+    M ./kernel/primes.c -4 +4
+
+Tue Jan 14 08:00:08 EST 2003  athena
+  * [project @ 2003-01-14 13:00:08 by athena]
+  Oops
+
+    M ./dft/generic.c -1 +1
+    M ./rdft/generic.c -1 +1
+
+Tue Jan 14 07:59:14 EST 2003  athena
+  * [project @ 2003-01-14 12:59:14 by athena]
+  int/uint confusion
+
+    M ./dft/generic.c -1 +2
+    M ./rdft/generic.c -1 +2
+
+Tue Jan 14 02:25:33 EST 2003  stevenj
+  * [project @ 2003-01-14 07:25:33 by stevenj]
+  updated introduction and some organization
+
+    M ./doc/fftw3.texi -79 +117
+
+Tue Jan 14 01:34:46 EST 2003  stevenj
+  * [project @ 2003-01-14 06:34:46 by stevenj]
+  whoops
+
+    M ./api/f77api.c -18 +18
+
+Tue Jan 14 01:33:04 EST 2003  stevenj
+  * [project @ 2003-01-14 06:33:04 by stevenj]
+  newline
+
+    M ./Makefile.am +1
+
+Tue Jan 14 00:23:04 EST 2003  stevenj
+  * [project @ 2003-01-14 05:23:04 by stevenj]
+  added win32 timer
+
+    M ./libbench/timer.c -1 +23
+
+Tue Jan 14 00:12:21 EST 2003  stevenj
+  * [project @ 2003-01-14 05:12:21 by stevenj]
+  sync with kernel/alloc.c
+
+    M ./libbench/util.c -2 +5
+
+Tue Jan 14 00:03:20 EST 2003  stevenj
+  * [project @ 2003-01-14 05:03:20 by stevenj]
+  handle missing F77_FUNC_
+
+    M ./api/f77api.c +4
+
+Mon Jan 13 17:42:50 EST 2003  stevenj
+  * [project @ 2003-01-13 22:42:50 by stevenj]
+  used fint instead of int to make Fortran integer type easier to change
+
+    M ./api/f77api.c -24 +31
+
+Mon Jan 13 17:38:56 EST 2003  stevenj
+  * [project @ 2003-01-13 22:38:56 by stevenj]
+  slight abbreviation
+
+    M ./api/f77api.c -4 +4
+
+Mon Jan 13 17:35:20 EST 2003  stevenj
+  * [project @ 2003-01-13 22:35:20 by stevenj]
+  the great lengthening, part I: int -> long in api; mv mktensor-rowmajor to api
+
+    A ./api/mktensor-rowmajor.c
+    M ./api/Makefile.am -2 +2
+    M ./api/api.h -2 +5
+    M ./api/f77api.c -7 +7
+    M ./api/fftw3.h -44 +46
+    M ./api/mktensor-rowmajor.c +44
+    M ./api/plan-dft-1d.c -1 +1
+    M ./api/plan-dft-2d.c -2 +2
+    M ./api/plan-dft-3d.c -2 +2
+    M ./api/plan-dft-c2r-1d.c -1 +1
+    M ./api/plan-dft-c2r-2d.c -2 +2
+    M ./api/plan-dft-c2r-3d.c -2 +2
+    M ./api/plan-dft-c2r.c -1 +1
+    M ./api/plan-dft-r2c-1d.c -1 +1
+    M ./api/plan-dft-r2c-2d.c -2 +2
+    M ./api/plan-dft-r2c-3d.c -2 +2
+    M ./api/plan-dft-r2c.c -1 +1
+    M ./api/plan-dft.c -1 +1
+    M ./api/plan-many-dft-c2r.c -7 +7
+    M ./api/plan-many-dft-r2c.c -7 +7
+    M ./api/plan-many-dft.c -6 +6
+    M ./api/plan-many-r2r.c -6 +6
+    M ./api/plan-r2r-1d.c -1 +1
+    M ./api/plan-r2r-2d.c -2 +2
+    M ./api/plan-r2r-3d.c -2 +2
+    M ./api/plan-r2r.c -1 +1
+    M ./api/rdft2-pad.c -4 +4
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -4 +1
+    R ./kernel/tensor3.c
+    M ./tests/bench.c -1 +9
+
+Mon Jan 13 15:23:22 EST 2003  stevenj
+  * [project @ 2003-01-13 20:23:22 by stevenj]
+  long types
+
+    M ./configure.ac +4
+
+Mon Jan 13 04:20:37 EST 2003  athena
+  * [project @ 2003-01-13 09:20:36 by athena]
+  Renamed fftw_malloc -> MALLOC, X(free) -> X(ifree), X(free0) ->
+  X(ifree0), non_fftw_malloc -> NATIVE_MALLOC
+
+    M ./api/apiplan.c -2 +2
+    M ./api/export-wisdom-to-string.c -1 +1
+    M ./api/f77api.c -8 +8
+    M ./api/map-r2r-kind.c -1 +1
+    M ./api/plan-guru-r2r.c -1 +1
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+    M ./api/plan-many-r2r.c -1 +1
+    M ./api/rdft2-pad.c -1 +1
+    M ./dft/buffered.c -6 +6
+    M ./dft/generic.c -1 +1
+    M ./dft/problem.c -2 +2
+    M ./dft/rader-omega.c -1 +1
+    M ./dft/rader.c -10 +10
+    M ./kernel/alloc.c -6 +6
+    M ./kernel/ifftw.h -11 +11
+    M ./kernel/plan.c -3 +3
+    M ./kernel/planner.c -12 +12
+    M ./kernel/print.c -3 +3
+    M ./kernel/problem.c -2 +2
+    M ./kernel/rader.c -3 +3
+    M ./kernel/scan.c -3 +3
+    M ./kernel/solver.c -3 +3
+    M ./kernel/stride.c -3 +3
+    M ./kernel/tensor.c -9 +9
+    M ./kernel/twiddle.c -5 +5
+    M ./rdft/buffered.c -6 +6
+    M ./rdft/buffered2.c -8 +8
+    M ./rdft/dht-rader.c -6 +6
+    M ./rdft/generic.c -1 +1
+    M ./rdft/problem.c -4 +4
+    M ./rdft/problem2.c -2 +2
+    M ./rdft/rader-hc2hc.c -10 +10
+    M ./reodft/redft00e-r2hc.c -5 +5
+    M ./reodft/reodft010e-r2hc.c -11 +11
+    M ./reodft/reodft11e-r2hc.c -7 +7
+    M ./reodft/rodft00e-r2hc.c -5 +5
+    M ./tests/verify-dft.c -15 +15
+    M ./tests/verify-rdft.c -29 +29
+    M ./tests/verify-reodft.c -15 +15
+    M ./threads/dft-vrank-geq1.c -4 +4
+    M ./threads/rdft-vrank-geq1.c -4 +4
+    M ./threads/threads.c -1 +1
+    M ./threads/vrank-geq1-rdft2.c -4 +4
+
+Mon Jan 13 02:37:22 EST 2003  stevenj
+  * [project @ 2003-01-13 07:37:22 by stevenj]
+  added beginning of Fortran interface
+
+    A ./api/f77api.c
+    M ./api/Makefile.am -1 +1
+    M ./api/f77api.c +145
+
+Mon Jan 13 01:05:29 EST 2003  stevenj
+  * [project @ 2003-01-13 06:05:29 by stevenj]
+  add fortran mangling check
+
+    M ./configure.ac +18
+
+Mon Jan 13 00:33:28 EST 2003  stevenj
+  * [project @ 2003-01-13 05:33:28 by stevenj]
+  added guru r2r interface
+
+    A ./api/execute-r2r.c
+    A ./api/plan-guru-r2r.c
+    M ./api/Makefile.am -3 +4
+    M ./api/execute-r2r.c +29
+    M ./api/fftw3.h -1 +8
+    M ./api/plan-guru-r2r.c +42
+
+Mon Jan 13 00:23:26 EST 2003  stevenj
+  * [project @ 2003-01-13 05:23:26 by stevenj]
+  whoops
+
+    M ./api/fftw3.h -5 +7
+    M ./api/plan-r2r-1d.c -2 +2
+    M ./api/plan-r2r-2d.c -1 +4
+    M ./api/plan-r2r-3d.c -2 +5
+
+Mon Jan 13 00:16:20 EST 2003  stevenj
+  * [project @ 2003-01-13 05:16:20 by stevenj]
+  added r2r planner
+
+    A ./api/map-r2r-kind.c
+    A ./api/plan-many-r2r.c
+    A ./api/plan-r2r-1d.c
+    A ./api/plan-r2r-2d.c
+    A ./api/plan-r2r-3d.c
+    A ./api/plan-r2r.c
+    M ./api/Makefile.am -2 +3
+    M ./api/fftw3.h -1 +29
+    M ./api/map-r2r-kind.c +47
+    M ./api/plan-many-r2r.c +48
+    M ./api/plan-r2r-1d.c +27
+    M ./api/plan-r2r-2d.c +29
+    M ./api/plan-r2r-3d.c +30
+    M ./api/plan-r2r.c +27
+
+Sun Jan 12 22:58:18 EST 2003  stevenj
+  * [project @ 2003-01-13 03:58:18 by stevenj]
+  more long-double checks
+
+    M ./configure.ac -2 +13
+
+Sun Jan 12 20:01:51 EST 2003  stevenj
+  * [project @ 2003-01-13 01:01:51 by stevenj]
+  slight regrouping
+
+    M ./kernel/planner.c -5 +7
+
+Sun Jan 12 19:58:46 EST 2003  stevenj
+  * [project @ 2003-01-13 00:58:46 by stevenj]
+  added joke
+
+    M ./kernel/planner.c -2 +2
+
+Sun Jan 12 19:53:58 EST 2003  stevenj
+  * [project @ 2003-01-13 00:53:58 by stevenj]
+  simplified rdft2 padding
+
+    A ./api/rdft2-pad.c
+    M ./api/Makefile.am -7 +7
+    M ./api/api.h -4 +2
+    R ./api/mktensor-rowmajor-pad.c
+    M ./api/plan-many-dft-c2r.c -19 +16
+    M ./api/plan-many-dft-r2c.c -6 +18
+    M ./api/rdft2-pad.c +40
+
+Sun Jan 12 19:02:09 EST 2003  stevenj
+  * [project @ 2003-01-13 00:02:09 by stevenj]
+  added comment
+
+    M ./api/fftw3.h -2 +2
+
+Sun Jan 12 18:54:49 EST 2003  stevenj
+  * [project @ 2003-01-12 23:54:49 by stevenj]
+  use latest api
+
+    M ./tests/bench.c -2 +1
+
+Sun Jan 12 18:49:58 EST 2003  stevenj
+  * [project @ 2003-01-12 23:49:58 by stevenj]
+  nembed should only be in advanced (many) interface, not basic interface...only a handful of people over the years have ever requested that functionality.
+
+    M ./api/fftw3.h -10 +4
+    M ./api/plan-dft-1d.c -1 +1
+    M ./api/plan-dft-2d.c -1 +1
+    M ./api/plan-dft-3d.c -1 +1
+    M ./api/plan-dft-c2r-1d.c -1 +1
+    M ./api/plan-dft-c2r-2d.c -1 +1
+    M ./api/plan-dft-c2r-3d.c -1 +1
+    M ./api/plan-dft-c2r.c -5 +3
+    M ./api/plan-dft-r2c-1d.c -1 +1
+    M ./api/plan-dft-r2c-2d.c -1 +1
+    M ./api/plan-dft-r2c-3d.c -1 +1
+    M ./api/plan-dft-r2c.c -5 +3
+    M ./api/plan-dft.c -5 +3
+
+Sun Jan 12 18:41:57 EST 2003  stevenj
+  * [project @ 2003-01-12 23:41:57 by stevenj]
+  impatient is default; generalize mapping functions using xor trick
+
+    M ./api/fftw3.h -2 +2
+    M ./api/mapflags.c -55 +61
+
+Sun Jan 12 14:39:42 EST 2003  stevenj
+  * [project @ 2003-01-12 19:39:41 by stevenj]
+  use NULL nembed to signal padding
+
+    M ./api/mktensor-rowmajor-pad.c -5 +9
+    M ./api/plan-dft-c2r-1d.c -1 +1
+    M ./api/plan-dft-c2r-2d.c -1 +1
+    M ./api/plan-dft-c2r-3d.c -1 +1
+    M ./api/plan-dft-r2c-1d.c -1 +1
+    M ./api/plan-dft-r2c-2d.c -1 +1
+    M ./api/plan-dft-r2c-3d.c -1 +1
+
+Sun Jan 12 14:23:00 EST 2003  stevenj
+  * [project @ 2003-01-12 19:23:00 by stevenj]
+  accept NULL nembed
+
+    M ./api/plan-many-dft.c -1 +4
+
+Sun Jan 12 13:57:13 EST 2003  stevenj
+  * [project @ 2003-01-12 18:57:13 by stevenj]
+  added execute-dft-r2c/c2r
+
+    A ./api/execute-dft-c2r.c
+    A ./api/execute-dft-r2c.c
+    M ./api/Makefile.am -5 +6
+    M ./api/execute-dft-c2r.c +29
+    M ./api/execute-dft-r2c.c +29
+    M ./api/fftw3.h -1 +4
+
+Sun Jan 12 13:43:20 EST 2003  stevenj
+  * [project @ 2003-01-12 18:43:20 by stevenj]
+  don't need dft.h
+
+    M ./api/plan-dft.c -1
+
+Sun Jan 12 13:22:14 EST 2003  stevenj
+  * [project @ 2003-01-12 18:22:14 by stevenj]
+  tensors are compressed in the problem, duh
+
+    M ./api/plan-many-dft-c2r.c -2 +1
+    M ./api/plan-many-dft-r2c.c -2 +1
+
+Sun Jan 12 12:45:26 EST 2003  stevenj
+  * [project @ 2003-01-12 17:45:26 by stevenj]
+  noted that posix_memalign bug is now fixed, thanks to bug report by yours truly
+
+    M ./kernel/alloc.c -2 +3
+
+Sun Jan 12 12:44:43 EST 2003  athena
+  * [project @ 2003-01-12 17:44:43 by athena]
+  Bug: n[3] instead of n[2].  Bug was propagated by copy-and-paste.
+  Grrr...
+
+    M ./api/plan-dft-3d.c -1 +1
+    M ./api/plan-dft-c2r-3d.c -1 +1
+    M ./api/plan-dft-r2c-3d.c -1 +1
+
+Sun Jan 12 12:41:43 EST 2003  athena
+  * [project @ 2003-01-12 17:41:43 by athena]
+  Express plan_dft() in terms of plan_many_dft()
+
+    M ./api/plan-dft.c -11 +4
+
+Sun Jan 12 12:19:53 EST 2003  stevenj
+  * [project @ 2003-01-12 17:19:38 by stevenj]
+  whoops
+
+    A ./api/plan-guru-dft-c2r.c
+    A ./api/plan-guru-dft-r2c.c
+    A ./api/plan-guru-dft.c
+    M ./api/plan-guru-dft-c2r.c +36
+    M ./api/plan-guru-dft-r2c.c +35
+    M ./api/plan-guru-dft.c +35
+
+Sun Jan 12 06:00:46 EST 2003  athena
+  * [project @ 2003-01-12 11:00:46 by athena]
+  Manual skeleton.
+
+    A ./doc/
+    A ./doc/Makefile.am
+    A ./doc/fftw3.texi
+    M ./Makefile.am -1 +1
+    M ./configure.ac +1
+    M ./doc/Makefile.am -1 +2
+    M ./doc/fftw3.texi +318
+    M ./genfft-k7/vK7Optimization.ml -1 +1
+
+Sat Jan 11 23:46:34 EST 2003  stevenj
+  * [project @ 2003-01-12 04:46:34 by stevenj]
+  added r2c/c2r guru api
+
+    M ./api/Makefile.am -1 +2
+    M ./api/fftw3.h -1 +12
+
+Sat Jan 11 23:42:10 EST 2003  stevenj
+  * [project @ 2003-01-12 04:42:10 by stevenj]
+  FFTW_DESTROY_INPUT is default for c2r transforms
+
+    M ./api/plan-many-dft-c2r.c -1 +1
+
+Sat Jan 11 23:36:26 EST 2003  stevenj
+  * [project @ 2003-01-12 04:36:26 by stevenj]
+  added more of r2c/c2r api
+
+    A ./api/plan-dft-c2r-1d.c
+    A ./api/plan-dft-c2r-2d.c
+    A ./api/plan-dft-c2r-3d.c
+    A ./api/plan-dft-c2r.c
+    A ./api/plan-dft-r2c-1d.c
+    A ./api/plan-dft-r2c-2d.c
+    A ./api/plan-dft-r2c-3d.c
+    A ./api/plan-dft-r2c.c
+    M ./api/Makefile.am -1 +3
+    M ./api/fftw3.h -69 +90
+    M ./api/plan-dft-c2r-1d.c +26
+    M ./api/plan-dft-c2r-2d.c +29
+    M ./api/plan-dft-c2r-3d.c +29
+    M ./api/plan-dft-c2r.c +32
+    M ./api/plan-dft-r2c-1d.c +26
+    M ./api/plan-dft-r2c-2d.c +29
+    M ./api/plan-dft-r2c-3d.c +29
+    M ./api/plan-dft-r2c.c +32
+
+Sat Jan 11 21:09:41 EST 2003  stevenj
+  * [project @ 2003-01-12 02:09:41 by stevenj]
+  r2c doesn't have adjustible sign
+
+    M ./api/fftw3.h -3 +3
+    M ./api/plan-many-dft-c2r.c -2 +2
+    M ./api/plan-many-dft-r2c.c -2 +2
+
+Sat Jan 11 21:07:55 EST 2003  stevenj
+  * [project @ 2003-01-12 02:07:55 by stevenj]
+  note that copyright year is out of date
+
+    M ./TODO -1 +1
+
+Sat Jan 11 21:04:23 EST 2003  stevenj
+  * [project @ 2003-01-12 02:04:23 by stevenj]
+  updated api for r2c
+
+    M ./api/fftw3.h -1 +17
+
+Sat Jan 11 21:00:07 EST 2003  stevenj
+  * [project @ 2003-01-12 02:00:07 by stevenj]
+  removed annoying nophys == niphys case
+
+    M ./api/mktensor-rowmajor-pad.c -11
+
+Sat Jan 11 20:58:13 EST 2003  stevenj
+  * [project @ 2003-01-12 01:58:13 by stevenj]
+  added basic r2c/c2r planner
+
+    A ./api/mktensor-rowmajor-pad.c
+    A ./api/plan-many-dft-c2r.c
+    A ./api/plan-many-dft-r2c.c
+    M ./api/Makefile.am -2 +3
+    M ./api/api.h +5
+    M ./api/mktensor-rowmajor-pad.c +88
+    M ./api/plan-many-dft-c2r.c +59
+    M ./api/plan-many-dft-r2c.c +44
+
+Sat Jan 11 19:34:14 EST 2003  stevenj
+  * [project @ 2003-01-12 00:34:14 by stevenj]
+  dist should be in terms of complex values
+
+    M ./api/plan-many-dft.c -1 +1
+
+Sat Jan 11 19:14:24 EST 2003  stevenj
+  * [project @ 2003-01-12 00:14:24 by stevenj]
+  added plan-with-nthreads
+
+    M ./api/fftw3.h -2 +5
+
+Sat Jan 11 19:12:51 EST 2003  stevenj
+  * [project @ 2003-01-12 00:12:51 by stevenj]
+  added function to set nthr
+
+    A ./api/plan-with-nthreads.c
+    M ./api/Makefile.am -1 +1
+    M ./api/plan-with-nthreads.c +39
+
+Sat Jan 11 18:04:57 EST 2003  stevenj
+  * [project @ 2003-01-11 23:04:57 by stevenj]
+  slight cleanup
+
+    M ./api/fftw3.h -4 +5
+
+Sat Jan 11 17:57:29 EST 2003  stevenj
+  * [project @ 2003-01-11 22:57:29 by stevenj]
+  whoops
+
+    A ./api/mktensor-iodims.c
+
+Sat Jan 11 17:55:39 EST 2003  stevenj
+  * [project @ 2003-01-11 22:55:39 by stevenj]
+  maxlen is maximum string length, not including null termination
+
+    M ./kernel/scan.c -2 +2
+
+Sat Jan 11 17:50:49 EST 2003  stevenj
+  * [project @ 2003-01-11 22:50:49 by stevenj]
+  imprt reverts hashtable on failure
+
+    M ./kernel/planner.c -3 +14
+
+Sat Jan 11 16:43:54 EST 2003  stevenj
+  * [project @ 2003-01-11 21:43:54 by stevenj]
+  slight move
+
+    M ./api/fftw3.h -14 +14
+
+Sat Jan 11 16:34:56 EST 2003  stevenj
+  * [project @ 2003-01-11 21:34:56 by stevenj]
+  stdio.h should be inlcuded outside of extern "C"
+
+    M ./api/fftw3.h -3 +3
+
+Sat Jan 11 16:26:35 EST 2003  stevenj
+  * [project @ 2003-01-11 21:26:35 by stevenj]
+  added guru planner API
+
+    M ./api/Makefile.am -2 +3
+    M ./api/api.h +1
+    M ./api/fftw3.h -1 +15
+
+Sat Jan 11 15:54:57 EST 2003  stevenj
+  * [project @ 2003-01-11 20:54:57 by stevenj]
+  added FFTW_FORWARD/BACKWARD
+
+    M ./api/fftw3.h -1 +4
+
+Sat Jan 11 15:52:17 EST 2003  stevenj
+  * [project @ 2003-01-11 20:52:17 by stevenj]
+  added plan_many_dft
+
+    A ./api/plan-many-dft.c
+    M ./api/Makefile.am -1 +1
+    M ./api/fftw3.h -1 +9
+    M ./api/plan-many-dft.c +43
+
+Sat Jan 11 15:44:37 EST 2003  stevenj
+  * [project @ 2003-01-11 20:44:37 by stevenj]
+  indenting
+
+    M ./kernel/tensor3.c -3 +3
+
+Sat Jan 11 14:49:08 EST 2003  athena
+  * [project @ 2003-01-11 19:49:08 by athena]
+  Final \n
+
+    M ./tests/bench.c -2 +4
+
+Sat Jan 11 14:47:31 EST 2003  athena
+  * [project @ 2003-01-11 19:47:31 by athena]
+  Do not compile if not defined(FFTW_DEBUG), in order to avoid
+  unused code in the shared library.
+
+    M ./kernel/debug.c -1 +3
+
+Sat Jan 11 14:45:56 EST 2003  athena
+  * [project @ 2003-01-11 19:45:56 by athena]
+  Implemented print_plan()
+
+    A ./api/mkprinter-file.c
+    A ./api/print-plan.c
+    M ./api/Makefile.am -4 +4
+    M ./api/api.h +1
+    M ./api/export-wisdom-to-file.c -19 +1
+    M ./api/fftw3.h -2 +3
+    M ./api/mkprinter-file.c +39
+    M ./api/print-plan.c +29
+    M ./tests/bench.c -2 +3
+
+Sat Jan 11 13:12:01 EST 2003  stevenj
+  * [project @ 2003-01-11 18:12:01 by stevenj]
+  changed the OOP-like plan_destroy to the more-grammatical destroy_plan
+
+    M ./api/apiplan.c -1 +1
+    M ./api/fftw3.h -2 +2
+    M ./tests/bench.c -1 +1
+
+Sat Jan 11 12:58:04 EST 2003  stevenj
+  * [project @ 2003-01-11 17:55:49 by stevenj]
+  added guru execute_dft
+
+    A ./api/execute-dft.c
+    M ./api/Makefile.am -4 +5
+    M ./api/execute-dft.c +29
+    M ./api/fftw3.h -2 +3
+
+Sat Jan 11 12:38:40 EST 2003  stevenj
+  * [project @ 2003-01-11 17:38:40 by stevenj]
+  allow for malloc errors in wisdom string, since non-fftw-malloc
+
+    M ./api/export-wisdom-to-string.c -4 +5
+
+Sat Jan 11 12:16:05 EST 2003  stevenj
+  * [project @ 2003-01-11 17:16:05 by stevenj]
+  cleanup should reset plnr to zero so that fftw can be restarted
+
+    M ./api/the-planner.c -1 +3
+
+Sat Jan 11 12:13:18 EST 2003  stevenj
+  * [project @ 2003-01-11 17:13:18 by stevenj]
+  NO_UGLY is an internal planner flag
+
+    M ./api/fftw3.h -4 +3
+    M ./api/mapflags.c -3
+
+Sat Jan 11 11:23:13 EST 2003  athena
+  * [project @ 2003-01-11 16:23:13 by athena]
+  Written 1d api in terms of generic n-d api.  The code is less compact
+  but easier to test
+
+    M ./api/plan-dft-1d.c -9 +1
+    M ./tests/bench.c -27 +36
+
+Sat Jan 11 11:07:25 EST 2003  athena
+  * [project @ 2003-01-11 16:07:24 by athena]
+  Added wisdom to header file, made scanners/printer static.  stdio.h
+  no longer needed in fftw.h, removed.  Probably the printer_file
+  should be reintroduced in a separate file if we ever want to
+  print plans...
+
+    M ./api/export-wisdom-to-file.c -2 +2
+    M ./api/export-wisdom-to-string.c -4 +4
+    M ./api/fftw3.h -2 +13
+    M ./api/import-wisdom-from-file.c -2 +2
+    M ./api/import-wisdom-from-string.c -2 +2
+    M ./kernel/alloc.c -1 +3
+    M ./kernel/assert.c -2 +2
+    M ./kernel/debug.c -2 +22
+    M ./kernel/ifftw.h -15 +1
+    M ./kernel/print.c -1 +2
+    M ./kernel/scan.c -1 +2
+    M ./tests/bench.c -13 +49
+    M ./tests/verify-lib.c -1 +2
+    M ./tests/verify-reodft.c -1 +2
+
+Sat Jan 11 09:49:30 EST 2003  athena
+  * [project @ 2003-01-11 14:49:30 by athena]
+  Implemented more APIs
+
+    A ./api/plan-dft-2d.c
+    A ./api/plan-dft-3d.c
+    A ./api/plan-dft.c
+    M ./api/Makefile.am -6 +5
+    M ./api/apiplan.c -1 +1
+    M ./api/fftw3.h -13 +25
+    M ./api/plan-dft-2d.c +30
+    M ./api/plan-dft-3d.c +30
+    M ./api/plan-dft.c +40
+    M ./tests/bench.c +18
+
+Sat Jan 11 09:21:53 EST 2003  athena
+  * [project @ 2003-01-11 14:21:53 by athena]
+  Added cleanup() to API
+
+    M ./api/fftw3.h -3 +3
+    M ./api/the-planner.c +6
+    M ./tests/bench.c +1
+
+Sat Jan 11 09:17:34 EST 2003  athena
+  * [project @ 2003-01-11 14:17:34 by athena]
+  Started new bench.c.  I had to rename plan_destroy ->
+  plan_destroy_internal to avoid conflicts with API
+
+    M ./api/api.h -1
+    M ./api/apiplan.c -3 +3
+    M ./api/fftw3.h -1 +3
+    M ./dft/buffered.c -7 +7
+    M ./dft/ct.c -2 +2
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect.c -5 +5
+    M ./dft/rader.c -8 +8
+    M ./dft/rank-geq2.c -5 +5
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/plan.c -2 +2
+    M ./kernel/planner.c -3 +3
+    M ./libbench/bench-user.h -1 +4
+    M ./libbench/bench.h -3 +1
+    M ./rdft/buffered.c -7 +7
+    M ./rdft/buffered2.c -5 +5
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/dht-rader.c -6 +6
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc.c -7 +7
+    M ./rdft/indirect.c -5 +5
+    M ./rdft/rader-hc2hc.c -7 +7
+    M ./rdft/rank-geq2-rdft2.c -5 +5
+    M ./rdft/rank-geq2.c -5 +5
+    M ./rdft/rank0-rdft2.c -2 +2
+    M ./rdft/rdft-dht.c -2 +2
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c -2 +103
+    M ./threads/dft-vrank-geq1.c -3 +3
+    M ./threads/rdft-vrank-geq1.c -3 +3
+    M ./threads/vrank-geq1-rdft2.c -3 +3
+
+Sat Jan 11 02:45:39 EST 2003  stevenj
+  * [project @ 2003-01-11 07:45:39 by stevenj]
+  fix types
+
+    A ./api/import-system-wisdom.c
+    M ./api/Makefile.am -3 +3
+    M ./api/export-wisdom.c -2 +2
+    M ./api/import-system-wisdom.c +46
+    M ./api/import-wisdom-from-file.c -2 +3
+    M ./api/import-wisdom-from-string.c -3 +4
+    M ./api/import-wisdom.c -3 +5
+
+Sat Jan 11 02:13:25 EST 2003  stevenj
+  * [project @ 2003-01-11 07:13:25 by stevenj]
+  whoops
+
+    M ./api/export-wisdom-to-string.c -1 +2
+
+Sat Jan 11 02:10:50 EST 2003  stevenj
+  * [project @ 2003-01-11 07:10:50 by stevenj]
+  added wisdom api
+
+    A ./api/export-wisdom-to-file.c
+    A ./api/export-wisdom-to-string.c
+    A ./api/export-wisdom.c
+    A ./api/forget-wisdom.c
+    A ./api/import-wisdom-from-file.c
+    A ./api/import-wisdom-from-string.c
+    A ./api/import-wisdom.c
+    M ./api/Makefile.am -2 +6
+    M ./api/export-wisdom-to-file.c +47
+    M ./api/export-wisdom-to-string.c +80
+    M ./api/export-wisdom.c +44
+    M ./api/forget-wisdom.c +27
+    M ./api/import-wisdom-from-file.c +47
+    M ./api/import-wisdom-from-string.c +49
+    M ./api/import-wisdom.c +44
+    M ./kernel/Makefile.am -4 +4
+    M ./kernel/ifftw.h -1 +3
+    R ./kernel/printers.c
+    R ./kernel/scanners.c
+
+Sat Jan 11 01:01:17 EST 2003  stevenj
+  * [project @ 2003-01-11 06:01:17 by stevenj]
+  grammar
+
+    M ./api/mapflags.c -1 +1
+
+Sat Jan 11 00:54:54 EST 2003  stevenj
+  * [project @ 2003-01-11 05:54:54 by stevenj]
+  slight change
+
+    M ./api/mapflags.c -1 +1
+
+Sat Jan 11 00:52:04 EST 2003  stevenj
+  * [project @ 2003-01-11 05:52:04 by stevenj]
+  implemented api/mapflags
+
+    M ./api/fftw3.h -1 +23
+    M ./api/mapflags.c -1 +83
+
+Sat Jan 11 00:48:27 EST 2003  stevenj
+  * [project @ 2003-01-11 05:48:27 by stevenj]
+  IMPATIENT is an api issue
+
+    M ./kernel/ifftw.h -11 +1
+
+Fri Jan 10 01:57:41 EST 2003  stevenj
+  * [project @ 2003-01-10 06:57:41 by stevenj]
+  removed un-needed headers
+
+    M ./api/the-planner.c -4
+
+Fri Jan 10 01:56:59 EST 2003  stevenj
+  * [project @ 2003-01-10 06:56:59 by stevenj]
+  mkplanner initializes nthr to 1 already
+
+    M ./api/the-planner.c -1
+
+Thu Jan  9 18:53:09 EST 2003  stevenj
+  * [project @ 2003-01-09 23:53:09 by stevenj]
+  boilerplate
+
+    M ./api/fftw3.h -1 +14
+
+Thu Jan  9 18:16:39 EST 2003  stevenj
+  * [project @ 2003-01-09 23:16:39 by stevenj]
+  fold vecloop into r{e,o}dft apply function to share buffer, etcetera
+
+    M ./rdft/vrank-geq1.c -1 +6
+    M ./reodft/redft00e-r2hc.c -42 +45
+    M ./reodft/reodft010e-r2hc.c -138 +153
+    M ./reodft/reodft11e-r2hc.c -98 +107
+    M ./reodft/rodft00e-r2hc.c -39 +42
+
+Thu Jan  9 18:10:19 EST 2003  stevenj
+  * [project @ 2003-01-09 23:10:19 by stevenj]
+  whoops, bugfix in impulse test for vecn > 1
+
+    M ./tests/verify-reodft.c -2 +2
+
+Thu Jan  9 14:23:51 EST 2003  stevenj
+  * [project @ 2003-01-09 19:23:51 by stevenj]
+  bugfix, grr
+
+    M ./rdft/hc2hc-buf.c -2 +2
+
+Thu Jan  9 14:21:16 EST 2003  stevenj
+  * [project @ 2003-01-09 19:21:16 by stevenj]
+  fixed signed-ness enum problem
+
+    M ./rdft/codelet-rdft.h -2 +5
+
+Thu Jan  9 14:12:42 EST 2003  athena
+  * [project @ 2003-01-09 19:12:42 by athena]
+  Explicit cast
+
+    M ./kernel/md5-1.c -1 +1
+
+Thu Jan  9 13:41:51 EST 2003  athena
+  * [project @ 2003-01-09 18:41:51 by athena]
+  Added configure_planner().  mkplan() behaves properly when plan is null.
+
+    A ./api/configure.c
+    M ./api/Makefile.am -2 +2
+    M ./api/api.h +1
+    M ./api/apiplan.c -14 +21
+    M ./api/configure.c +33
+    M ./api/fftw3.h -3 +3
+    M ./api/the-planner.c -4 +1
+
+Thu Jan  9 06:48:53 EST 2003  athena
+  * [project @ 2003-01-09 11:48:53 by athena]
+  More API work
+
+    A ./api/apiplan.c
+    A ./api/execute.c
+    A ./api/mapflags.c
+    M ./api/Makefile.am -4 +2
+    M ./api/api.h -2 +3
+    M ./api/apiplan.c +60
+    M ./api/execute.c +27
+    M ./api/fftw3.h -7 +3
+    M ./api/mapflags.c +26
+    M ./api/plan-dft-1d.c -11 +4
+    M ./tests/bench.c -1 +1
+
+Thu Jan  9 05:40:34 EST 2003  athena
+  * [project @ 2003-01-09 10:40:34 by athena]
+  First skeleton of API infrastructure
+
+    A ./api/
+    A ./api/Makefile.am
+    A ./api/api.h
+    A ./api/dfthelp.c
+    A ./api/fftw3.h
+    A ./api/plan-dft-1d.c
+    A ./api/the-planner.c
+    M ./Makefile.am -5 +7
+    M ./api/Makefile.am +10
+    M ./api/api.h +51
+    M ./api/dfthelp.c +37
+    M ./api/fftw3.h +67
+    M ./api/plan-dft-1d.c +43
+    M ./api/the-planner.c +42
+    M ./configure.ac +2
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/trig.c -3 +1
+    M ./tests/Makefile.am -1 +1
+
+Thu Jan  9 03:19:35 EST 2003  stevenj
+  * [project @ 2003-01-09 08:19:35 by stevenj]
+  unsigned strikes again
+
+    M ./rdft/rdft2-tensor-max-index.c -3 +3
+
+Thu Jan  9 01:51:45 EST 2003  stevenj
+  * [project @ 2003-01-09 06:51:45 by stevenj]
+  put rdft2_inplace_strides and rdft2_tensor_max_index in their own files for tighter linking
+
+    A ./rdft/rdft2-inplace-strides.c
+    A ./rdft/rdft2-tensor-max-index.c
+    M ./rdft/Makefile.am -1 +2
+    M ./rdft/problem2.c -46 +1
+    M ./rdft/rdft2-inplace-strides.c +68
+    M ./rdft/rdft2-tensor-max-index.c +47
+    M ./rdft/vrank-geq1-rdft2.c -25 +1
+
+Thu Jan  9 01:43:13 EST 2003  stevenj
+  * [project @ 2003-01-09 06:43:13 by stevenj]
+  added rdft2_tensor_max_index...incorrect use of tensor_max_index was preventing proper loop ordering for rnk > 2 rdft2
+
+    M ./rdft/rank-geq2-rdft2.c -2 +3
+    M ./rdft/rdft.h -1 +2
+    M ./rdft/vrank-geq1-rdft2.c -4 +27
+
+Thu Jan  9 00:44:45 EST 2003  stevenj
+  * [project @ 2003-01-09 05:44:45 by stevenj]
+  arbitrary spltrnk in rdft2 rank-geq2
+
+    M ./rdft/rank-geq2-rdft2.c -11 +53
+
+Thu Jan  9 00:40:17 EST 2003  stevenj
+  * [project @ 2003-01-09 05:40:17 by stevenj]
+  don't mention wisdom when non-verbose
+
+    M ./tests/bench.c -2 +4
+
+Thu Jan  9 00:02:35 EST 2003  stevenj
+  * [project @ 2003-01-09 05:02:35 by stevenj]
+  bug fix: printing %T should pass tensor *, not tensor **
+
+    M ./dft/problem.c -3 +3
+    M ./rdft/problem.c -3 +3
+    M ./rdft/problem2.c -3 +3
+
+Wed Jan  8 23:40:48 EST 2003  stevenj
+  * [project @ 2003-01-09 04:40:48 by stevenj]
+  correct(?) normalization for rodft00 ... all of the even/odd transforms should be normalized according to the expanded'' DFT of ~twice the length
+
+    M ./reodft/rodft00e-r2hc.c -4 +4
+    M ./tests/verify-reodft.c -2 +2
+
+Wed Jan  8 23:18:23 EST 2003  stevenj
+  * [project @ 2003-01-09 04:18:23 by stevenj]
+  fixed tests for n=1
+
+    M ./tests/verify-reodft.c -3 +6
+
+Wed Jan  8 22:10:08 EST 2003  stevenj
+  * [project @ 2003-01-09 03:10:08 by stevenj]
+  fixed bug in vector tests for rdft(2)
+
+    M ./tests/bench.c -3 +5
+
+Wed Jan  8 20:12:00 EST 2003  stevenj
+  * [project @ 2003-01-09 01:12:00 by stevenj]
+  fixed handling when first rnk-1 dimensions compress to nothing (ugh)
+
+    M ./rdft/problem2.c -3 +6
+
+Wed Jan  8 20:02:35 EST 2003  stevenj
+  * [project @ 2003-01-09 01:02:35 by stevenj]
+  fixed incorrect/missing rdft2 rank-0 handling
+
+    A ./rdft/rank0-rdft2.c
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/conf.c -1 +2
+    M ./rdft/nop2.c -3 +6
+    M ./rdft/rank0-rdft2.c +194
+    M ./rdft/rdft.h -1 +2
+
+Wed Jan  8 19:49:05 EST 2003  stevenj
+  * [project @ 2003-01-09 00:49:05 by stevenj]
+  bug fix: for rnk > 1, must compress rnk-1 dims separately (ugh)
+
+    M ./rdft/problem2.c -2 +11
+
+Wed Jan  8 17:39:14 EST 2003  stevenj
+  * [project @ 2003-01-08 22:39:14 by stevenj]
+  added trailing newline
+
+    M ./configure.ac +1
+
+Wed Jan  8 17:38:02 EST 2003  stevenj
+  * [project @ 2003-01-08 22:38:02 by stevenj]
+  updated
+
+    M ./ChangeLog +66
+
+Wed Jan  8 16:53:16 EST 2003  stevenj
+  * [project @ 2003-01-08 21:53:16 by stevenj]
+  got rid of compiler warning
+
+    M ./rdft/problem.c -2 +2
+
+Wed Jan  8 16:49:48 EST 2003  stevenj
+  * [project @ 2003-01-08 21:49:48 by stevenj]
+  whoops, test r2hc and not rodft00 by default
+
+    M ./tests/bench.c -1 +1
+
+Wed Jan  8 16:46:24 EST 2003  stevenj
+  * [project @ 2003-01-08 21:46:24 by stevenj]
+  got rid of real_n...use physical n everywhere in rdft; fixed rdft sz compression; fixed rodft00 verify bug
+
+    M ./rdft/buffered.c -3 +2
+    M ./rdft/indirect.c -10 +6
+    M ./rdft/problem.c -44 +43
+    M ./rdft/rank-geq2.c -10 +6
+    M ./rdft/rdft.h -3 +1
+    M ./reodft/redft00e-r2hc.c -4 +5
+    M ./reodft/reodft010e-r2hc.c -4 +3
+    M ./reodft/reodft11e-r2hc.c -4 +3
+    M ./reodft/rodft00e-r2hc.c -4 +3
+    M ./tests/bench.c -2 +2
+    M ./tests/verify-reodft.c -33 +35
+
+Wed Jan  8 07:20:47 EST 2003  athena
+  * [project @ 2003-01-08 12:20:47 by athena]
+  icc-6.0 bug workaround
+
+    A ./simd/sse-aux.c
+    A ./simd/sse2-aux.c
+    M ./simd/Makefile.am -2 +2
+    M ./simd/sse-aux.c +33
+    M ./simd/sse.c -3 +1
+    M ./simd/sse2-aux.c +34
+    M ./simd/sse2.c -3 +1
+
+Wed Jan  8 04:21:40 EST 2003  athena
+  * [project @ 2003-01-08 09:21:40 by athena]
+  Reclaimed the fftw_real identifier, because I need it for the API
+
+    M ./kernel/ifftw.h -12 +9
+    M ./rdft/buffered2.c -5 +5
+    M ./rdft/rader-hc2hc.c -1 +1
+    M ./tests/bench.c +1
+
+Wed Jan  8 04:14:55 EST 2003  athena
+  * [project @ 2003-01-08 09:14:55 by athena]
+  Use recommended AC_OUTPUT syntax
+
+    M ./configure.ac -1 +2
+
+Wed Jan  8 04:00:22 EST 2003  athena
+  * [project @ 2003-01-08 09:00:22 by athena]
+  Removed FFTW(foo) as a synonym for X(foo).  This is an API issue.
+
+    M ./kernel/ifftw.h -8 +7
+    M ./tests/bench.c +1
+
+Tue Jan  7 17:45:52 EST 2003  stevenj
+  * [project @ 2003-01-07 22:45:52 by stevenj]
+  get rid of warning
+
+    M ./simd/sse2.c -1 +2
+
+Tue Jan  7 16:22:39 EST 2003  athena
+  * [project @ 2003-01-07 21:22:39 by athena]
+  Renamed conflicting files */codelet.h into dft/codelet-dft.h and
+  rdft/codelet-rdft.h
+
+    A ./dft/codelet-dft.h
+    A ./rdft/codelet-rdft.h
+    A ./support/codelet_prelude.dft
+    A ./support/codelet_prelude.rdft
+    M ./dft/Makefile.am -1 +1
+    M ./dft/codelet-dft.h +113
+    R ./dft/codelet.h
+    M ./dft/codelets/inplace/Makefile.am -4 +4
+    M ./dft/codelets/n.c -1 +1
+    M ./dft/codelets/standard/Makefile.am -3 +3
+    M ./dft/codelets/t.c -1 +1
+    M ./dft/dft.h -2 +2
+    M ./dft/simd/codelets/Makefile.am -6 +6
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+    M ./rdft/Makefile.am -5 +5
+    M ./rdft/codelet-rdft.h +190
+    R ./rdft/codelet.h
+    M ./rdft/codelets/hc2r/Makefile.am -4 +4
+    M ./rdft/codelets/hc2r.c -1 +1
+    M ./rdft/codelets/hfb.c -1 +1
+    M ./rdft/codelets/r2hc/Makefile.am -4 +4
+    M ./rdft/codelets/r2hc.c -1 +1
+    M ./rdft/rdft.h -2 +2
+    M ./support/Makefile.am -1 +2
+    M ./support/Makefile.codelets -2 +4
+    R ./support/codelet_prelude
+    M ./support/codelet_prelude.dft +8
+    M ./support/codelet_prelude.rdft +8
+
+Tue Jan  7 16:21:16 EST 2003  stevenj
+  * [project @ 2003-01-07 21:21:16 by stevenj]
+  updated
+
+    M ./ChangeLog +737
+
+Tue Jan  7 15:47:24 EST 2003  athena
+  * [project @ 2003-01-07 20:47:24 by athena]
+  Silence warnings
+
+    M ./simd/simd-3dnow.h -1 +1
+    M ./simd/simd-sse.h -1
+    M ./simd/simd-sse2.h -1 +1
+    M ./simd/sse2.c -2 +1
+
+Tue Jan  7 15:00:14 EST 2003  stevenj
+  * [project @ 2003-01-07 20:00:14 by stevenj]
+  fftw2 used spltrnk=1
+
+    M ./dft/rank-geq2.c -2 +4
+    M ./rdft/rank-geq2.c -2 +4
+
+Tue Jan  7 14:32:06 EST 2003  athena
+  * [project @ 2003-01-07 19:32:06 by athena]
+  Silence warning
+
+    M ./dft/codelet.h -2 +1
+    M ./rdft/codelet.h -2 +1
+    M ./simd/simd-sse.h -1 +2
+    M ./simd/sse.c -2 +1
+
+Tue Jan  7 12:13:50 EST 2003  stevenj
+  * [project @ 2003-01-07 17:13:50 by stevenj]
+  noted deficiency
+
+    M ./TODO +2
+
+Tue Jan  7 07:18:51 EST 2003  athena
+  * [project @ 2003-01-07 12:18:51 by athena]
+  Strengthened conditions for a problem to be POSSIBLY_UNALIGNED
+
+    M ./rdft/vrank-geq1-rdft2.c -4 +2
+    M ./rdft/vrank-geq1.c -2 +2
+
+Tue Jan  7 05:09:42 EST 2003  athena
+  * [project @ 2003-01-07 10:09:42 by athena]
+  Strengthened conditions for a plan to be POSSIBLY_UNALIGNED
+
+    M ./dft/vrank-geq1.c -5 +2
+    M ./kernel/align.c -1 +6
+    M ./kernel/ifftw.h -1 +2
+
+Sun Jan  5 02:43:45 EST 2003  stevenj
+  * [project @ 2003-01-05 07:43:45 by stevenj]
+  added copyright todo
+
+    M ./TODO +2
+
+Sun Jan  5 02:37:31 EST 2003  stevenj
+  * [project @ 2003-01-05 07:37:31 by stevenj]
+  modified comment
+
+    M ./kernel/planner.c -2 +3
+
+Sun Jan  5 02:34:36 EST 2003  stevenj
+  * [project @ 2003-01-05 07:33:41 by stevenj]
+  fixed comment
+
+    M ./tests/verify-rdft.c -3 +3
+
+Sun Jan  5 02:31:56 EST 2003  stevenj
+  * [project @ 2003-01-05 07:31:56 by stevenj]
+  implemented rdft2 verify
+
+    M ./TODO -4 +2
+    M ./tests/verify-rdft.c -5 +15
+
+Sat Jan  4 16:20:42 EST 2003  stevenj
+  * [project @ 2003-01-04 21:20:42 by stevenj]
+  fix --enable-single
+
+    M ./configure.ac -1 +1
+
+Wed Oct 23 12:59:12 EDT 2002  stevenj
+  * [project @ 2002-10-23 16:59:12 by stevenj]
+  slight fixes
+
+    M ./threads/threads.c -20 +14
+
+Wed Oct 23 12:42:39 EDT 2002  stevenj
+  * [project @ 2002-10-23 16:42:39 by stevenj]
+  typo
+
+    M ./threads/threads.c -1 +1
+
+Tue Oct  1 09:32:56 EDT 2002  athena
+  * [project @ 2002-10-01 13:32:56 by athena]
+  Experimental stuff
+
+    M ./genfft/annotate.ml -2 +9
+    M ./genfft/annotate.mli -3 +2
+    M ./genfft/c.ml -4 +16
+    M ./genfft/genutil.ml -1 +11
+    M ./genfft/magic.ml -1 +5
+
+Sat Sep 28 13:03:53 EDT 2002  athena
+  * [project @ 2002-09-28 17:03:53 by athena]
+  Experimental Franz mode
+
+    M ./configure.ac +3
+    M ./dft/simd/codelets/Makefile.am -3 +68
+    M ./genfft/gen_notw_c.ml -7 +8
+    M ./genfft/gen_twiddle_c.ml -6 +7
+    M ./genfft/genutil.ml -2 +10
+
+Thu Sep 26 15:14:38 EDT 2002  athena
+  * [project @ 2002-09-26 19:14:38 by athena]
+  const-correct
+
+    M ./kernel/tensor.c -2 +2
+
+Thu Sep 26 15:06:38 EDT 2002  athena
+  * [project @ 2002-09-26 19:06:38 by athena]
+  Reuse dimcmp routine for other purposes
+
+    M ./dft/vrank2-transpose.c -4 +3
+    M ./dft/vrank3-transpose.c -4 +2
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/tensor7.c -5 +5
+    M ./rdft/vrank2-transpose.c -4 +2
+    M ./rdft/vrank3-transpose.c -4 +2
+
+Wed Sep 25 07:37:38 EDT 2002  athena
+  * [project @ 2002-09-25 11:37:38 by athena]
+  Use tornk1 correctly.
+
+    M ./dft/direct.c -3 +3
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/tensor.c -2 +3
+    M ./rdft/direct.c -3 +3
+    M ./rdft/direct2.c -3 +3
+
+Wed Sep 25 07:36:38 EDT 2002  athena
+  * [project @ 2002-09-25 11:36:38 by athena]
+  Hmm... I thought I had fixed this before...
+
+    M ./rdft/rdft2-radix2.c -2 +2
+
+Tue Sep 24 21:27:49 EDT 2002  athena
+  * [project @ 2002-09-25 01:27:49 by athena]
+  Collect more common idioms
+
+    M ./dft/buffered.c -5 +3
+    M ./dft/rank0.c -16 +3
+    M ./kernel/tensor.c -2 +2
+    M ./rdft/buffered.c -5 +2
+    M ./rdft/buffered2.c -5 +2
+    M ./rdft/rank0.c -16 +3
+
+Tue Sep 24 21:15:57 EDT 2002  athena
+  * [project @ 2002-09-25 01:15:57 by athena]
+  Still collecting common idioms...
+
+    M ./dft/direct.c -10 +3
+    M ./rdft/direct.c -10 +2
+    M ./rdft/direct2.c -9 +2
+
+Tue Sep 24 21:13:00 EDT 2002  athena
+  * [project @ 2002-09-25 01:13:00 by athena]
+  More garbage collection.
+
+    M ./dft/direct.c -6 +5
+    M ./rdft/direct.c -6 +5
+    M ./rdft/direct2.c -6 +5
+
+Tue Sep 24 21:08:19 EDT 2002  athena
+  * [project @ 2002-09-25 01:08:19 by athena]
+  More compact code
+
+    M ./dft/buffered.c -6 +3
+
+Tue Sep 24 20:54:43 EDT 2002  athena
+  * [project @ 2002-09-25 00:54:43 by athena]
+  Collect common pattern if (foo) free(foo) ==> free0(foo)
+
+    M ./dft/buffered.c -7 +6
+    M ./dft/generic.c -2 +1
+    M ./dft/rader.c -6 +3
+    M ./kernel/alloc.c -1 +7
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -8 +4
+    M ./kernel/stride.c -3 +2
+    M ./kernel/tensor.c -5 +2
+    M ./rdft/buffered.c -3 +2
+    M ./rdft/buffered2.c -3 +2
+    M ./rdft/dht-rader.c -2 +1
+    M ./rdft/generic.c -2 +1
+    M ./rdft/problem.c -3 +2
+    M ./rdft/rader-hc2hc.c -6 +3
+
+Tue Sep 24 20:08:44 EDT 2002  athena
+  * [project @ 2002-09-25 00:08:44 by athena]
+  Collect some common code in */buffered*.c
+
+    A ./kernel/buffered.c
+    M ./dft/buffered.c -18 +2
+    M ./kernel/Makefile.am -7 +6
+    M ./kernel/buffered.c +44
+    M ./kernel/ifftw.h -1 +2
+    M ./rdft/buffered.c -18 +3
+    M ./rdft/buffered2.c -18 +2
+
+Tue Sep 24 19:39:22 EDT 2002  stevenj
+  * [project @ 2002-09-24 23:39:22 by stevenj]
+  use STRUCT_HACK #define to determing rdft kind[] allocation
+
+    M ./rdft/problem.c -8 +23
+    M ./rdft/rdft.h -2 +8
+
+Tue Sep 24 17:21:09 EDT 2002  stevenj
+  * [project @ 2002-09-24 21:21:09 by stevenj]
+  report total pcost of measured/estimated plans...epcost is especially useful to estimate the effects of various impatience flags on planning time for large transforms
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -1 +6
+
+Mon Sep 23 18:49:10 EDT 2002  athena
+  * [project @ 2002-09-23 22:49:10 by athena]
+  Prevent unwanted inlining
+
+    A ./kernel/trig1.c
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/trig.c -54 +7
+    M ./kernel/trig1.c +70
+
+Mon Sep 23 18:37:59 EDT 2002  athena
+  * [project @ 2002-09-23 22:37:59 by athena]
+  Space compaction
+
+    M ./kernel/ifftw.h -2 +1
+    M ./kernel/trig.c -21 +27
+
+Mon Sep 23 11:49:32 EDT 2002  athena
+  * [project @ 2002-09-23 15:49:32 by athena]
+  Still reducing size
+
+    A ./kernel/hash.c
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/hash.c +31
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/md5-1.c +1
+    M ./kernel/planner.c -12 +3
+    M ./kernel/scan.c -22 +1
+
+Sun Sep 22 16:03:30 EDT 2002  athena
+  * [project @ 2002-09-22 20:03:30 by athena]
+  Saved another 5KB by redesigning opcnt protocol. (gasp!)
+
+    M ./dft/buffered.c -5 +6
+    M ./dft/ct-dif.c -4 +3
+    M ./dft/ct-dit.c -4 +3
+    M ./dft/ct-ditbuf.c -6 +6
+    M ./dft/ct-ditf.c -4 +3
+    M ./dft/direct.c -2 +3
+    M ./dft/generic.c -4 +2
+    M ./dft/indirect.c -2 +2
+    M ./dft/nop.c -2 +2
+    M ./dft/rader.c -4 +3
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/rank0.c -2 +2
+    M ./dft/vrank-geq1.c -2 +3
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./kernel/ifftw.h -6 +14
+    M ./kernel/ops.c -23 +28
+    M ./kernel/plan.c -2 +2
+    M ./rdft/buffered.c -5 +6
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/direct.c -3 +8
+    M ./rdft/direct2.c -3 +8
+    M ./rdft/generic.c -4 +3
+    M ./rdft/hc2hc-buf.c -10 +9
+    M ./rdft/hc2hc-dif.c -8 +8
+    M ./rdft/hc2hc-dit.c -8 +8
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/nop.c -2 +2
+    M ./rdft/nop2.c -2 +2
+    M ./rdft/rader-hc2hc.c -8 +5
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rank0.c -2 +2
+    M ./rdft/rdft2-radix2.c -5 +4
+    M ./rdft/vrank-geq1-rdft2.c -2 +3
+    M ./rdft/vrank-geq1.c -2 +3
+    M ./rdft/vrank2-transpose.c -2 +2
+    M ./rdft/vrank3-transpose.c -2 +2
+    M ./threads/ct-dit.c -4 +3
+    M ./threads/dft-vrank-geq1.c -4 +3
+    M ./threads/hc2hc-dif.c -8 +8
+    M ./threads/hc2hc-dit.c -8 +8
+    M ./threads/rdft-vrank-geq1.c -4 +3
+    M ./threads/vrank-geq1-rdft2.c -4 +3
+
+Sun Sep 22 15:00:59 EDT 2002  athena
+  * [project @ 2002-09-22 19:00:59 by athena]
+  More code compression
+
+    A ./kernel/tensor8.c
+    M ./dft/buffered.c -4 +3
+    M ./dft/direct.c -3 +2
+    M ./dft/indirect.c -4 +3
+    M ./dft/problem.c -5 +3
+    M ./dft/rank-geq2.c -9 +3
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -1 +5
+    M ./kernel/tensor1.c -1 +6
+    M ./kernel/tensor4.c -1 +6
+    M ./kernel/tensor8.c +35
+    M ./rdft/buffered.c -4 +3
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/direct.c -3 +2
+    M ./rdft/hc2hc.c -7 +6
+    M ./rdft/indirect.c -4 +3
+    M ./rdft/problem.c -7 +4
+    M ./rdft/problem2.c -5 +3
+    M ./rdft/rank-geq2-rdft2.c -9 +3
+    M ./rdft/rank-geq2.c -13 +5
+
+Sun Sep 22 13:27:46 EDT 2002  athena
+  * [project @ 2002-09-22 17:27:46 by athena]
+  Smaller code size.
+
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/solver.c -1 +6
+
+Sun Sep 22 12:50:36 EDT 2002  athena
+  * [project @ 2002-09-22 16:50:36 by athena]
+  Started unification of rader
+
+    A ./dft/rader-omega.c
+    M ./dft/Makefile.am -3 +3
+    M ./dft/dft.h -1 +5
+    M ./dft/rader-omega.c +57
+    M ./dft/rader.c -38 +3
+    M ./rdft/rader-hc2hc.c -37 +3
+
+Sun Sep 22 12:35:30 EDT 2002  athena
+  * [project @ 2002-09-22 16:35:30 by athena]
+  Typo
+
+    M ./rdft/rdft2-radix2.c -2 +2
+
+Sun Sep 22 12:25:20 EDT 2002  athena
+  * [project @ 2002-09-22 16:25:20 by athena]
+  Changed protocol for destroy_plan so as to save space.
+
+    M ./dft/buffered.c -8 +4
+    M ./dft/ct.c -2 +1
+    M ./dft/direct.c -2 +1
+    M ./dft/generic.c -3 +1
+    M ./dft/indirect.c -6 +3
+    M ./dft/nop.c -7 +2
+    M ./dft/rader.c -7 +3
+    M ./dft/rank-geq2.c -6 +3
+    M ./dft/rank0.c -7 +2
+    M ./dft/vrank-geq1.c -2 +1
+    M ./dft/vrank2-transpose.c -7 +2
+    M ./dft/vrank3-transpose.c -7 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/plan.c -3 +11
+    M ./kernel/problem.c -2 +3
+    M ./rdft/buffered.c -8 +4
+    M ./rdft/buffered2.c -6 +3
+    M ./rdft/dft-r2hc.c -2 +1
+    M ./rdft/dht-r2hc.c -2 +1
+    M ./rdft/dht-rader.c -7 +3
+    M ./rdft/direct.c -2 +1
+    M ./rdft/direct2.c -2 +1
+    M ./rdft/generic.c -3 +1
+    M ./rdft/hc2hc.c -14 +7
+    M ./rdft/indirect.c -6 +3
+    M ./rdft/nop.c -7 +2
+    M ./rdft/nop2.c -7 +2
+    M ./rdft/rader-hc2hc.c -9 +4
+    M ./rdft/rank-geq2-rdft2.c -6 +3
+    M ./rdft/rank-geq2.c -6 +3
+    M ./rdft/rank0.c -7 +2
+    M ./rdft/rdft-dht.c -2 +1
+    M ./rdft/rdft2-radix2.c -2 +1
+    M ./rdft/vrank-geq1-rdft2.c -2 +1
+    M ./rdft/vrank-geq1.c -2 +1
+    M ./rdft/vrank2-transpose.c -7 +2
+    M ./rdft/vrank3-transpose.c -7 +2
+    M ./reodft/redft00e-r2hc.c -2 +1
+    M ./reodft/reodft010e-r2hc.c -2 +1
+    M ./reodft/reodft11e-r2hc.c -2 +1
+    M ./reodft/rodft00e-r2hc.c -2 +1
+    M ./threads/dft-vrank-geq1.c -4 +2
+    M ./threads/rdft-vrank-geq1.c -4 +2
+    M ./threads/vrank-geq1-rdft2.c -4 +2
+
+Sun Sep 22 11:08:57 EDT 2002  athena
+  * [project @ 2002-09-22 15:08:57 by athena]
+  Introduced convenient function X(mkplan_d)
+
+    M ./dft/buffered.c -31 +18
+    M ./dft/ct.c -5 +2
+    M ./dft/generic.c -10 +5
+    M ./dft/indirect.c -14 +8
+    M ./dft/rader.c -38 +20
+    M ./dft/rank-geq2.c -16 +12
+    M ./dft/vrank-geq1.c -9 +7
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/planner.c -1 +8
+    M ./rdft/buffered.c -30 +20
+    M ./rdft/buffered2.c -9 +3
+    M ./rdft/dft-r2hc.c -7 +5
+    M ./rdft/dht-r2hc.c -7 +4
+    M ./rdft/dht-rader.c -23 +12
+    M ./rdft/generic.c -4 +1
+    M ./rdft/hc2hc.c -16 +10
+    M ./rdft/indirect.c -14 +9
+    M ./rdft/rader-hc2hc.c -41 +21
+    M ./rdft/rank-geq2-rdft2.c -12 +8
+    M ./rdft/rank-geq2.c -16 +12
+    M ./rdft/rdft-dht.c -5 +3
+    M ./rdft/rdft2-radix2.c -7 +3
+    M ./rdft/vrank-geq1-rdft2.c -9 +7
+    M ./rdft/vrank-geq1.c -10 +7
+    M ./reodft/redft00e-r2hc.c -3 +2
+    M ./reodft/reodft010e-r2hc.c -3 +2
+    M ./reodft/reodft11e-r2hc.c -3 +2
+    M ./reodft/rodft00e-r2hc.c -3 +2
+    M ./threads/ct-dit.c -4 +4
+    M ./threads/dft-vrank-geq1.c -12 +10
+    M ./threads/hc2hc-dif.c -4 +4
+    M ./threads/hc2hc-dit.c -4 +4
+    M ./threads/rdft-vrank-geq1.c -11 +9
+    M ./threads/vrank-geq1-rdft2.c -11 +9
+
+Sun Sep 22 10:21:36 EDT 2002  athena
+  * [project @ 2002-09-22 14:21:36 by athena]
+  Split tensor/md5 into separate files to allow independent linking
+  and/or prevent undesidred inlining
+
+    A ./kernel/md5-1.c
+    A ./kernel/tensor1.c
+    A ./kernel/tensor2.c
+    A ./kernel/tensor3.c
+    A ./kernel/tensor4.c
+    A ./kernel/tensor5.c
+    A ./kernel/tensor7.c
+    M ./kernel/Makefile.am -3 +4
+    M ./kernel/md5-1.c +53
+    M ./kernel/md5.c -31
+    M ./kernel/tensor.c -270 +2
+    M ./kernel/tensor1.c +32
+    M ./kernel/tensor2.c +37
+    M ./kernel/tensor3.c +46
+    M ./kernel/tensor4.c +68
+    M ./kernel/tensor5.c +93
+    M ./kernel/tensor7.c +127
+
+Sun Sep 22 09:49:09 EDT 2002  athena
+  * [project @ 2002-09-22 13:49:08 by athena]
+  Treat all tensors as dynamically allocated objects.  They were
+  dynamically allocated in part anyway, so there is no point in
+  complicating the object code with the clumsy calling conventions
+  for by-value structs.
+
+    M ./dft/buffered.c -19 +19
+    M ./dft/ct-dif.c -5 +5
+    M ./dft/ct-dit.c -5 +5
+    M ./dft/ct-ditbuf.c -4 +5
+    M ./dft/ct-ditf.c -5 +5
+    M ./dft/ct.c -14 +14
+    M ./dft/dft.h -4 +4
+    M ./dft/direct.c -13 +13
+    M ./dft/generic.c -7 +7
+    M ./dft/indirect.c -18 +16
+    M ./dft/nop.c -5 +5
+    M ./dft/problem.c -12 +12
+    M ./dft/rader.c -12 +12
+    M ./dft/rank-geq2.c -22 +22
+    M ./dft/rank0.c -14 +14
+    M ./dft/vrank-geq1.c -11 +11
+    M ./dft/vrank2-transpose.c -9 +9
+    M ./dft/vrank3-transpose.c -12 +12
+    M ./dft/zero.c -18 +20
+    M ./kernel/ifftw.h -13 +22
+    M ./kernel/tensor.c -92 +102
+    M ./rdft/buffered.c -23 +21
+    M ./rdft/buffered2.c -24 +24
+    M ./rdft/dft-r2hc.c -11 +11
+    M ./rdft/dht-r2hc.c -6 +6
+    M ./rdft/dht-rader.c -7 +7
+    M ./rdft/direct.c -18 +18
+    M ./rdft/direct2.c -16 +16
+    M ./rdft/generic.c -8 +8
+    M ./rdft/hc2hc-buf.c -4 +5
+    M ./rdft/hc2hc-dif.c -5 +5
+    M ./rdft/hc2hc-dit.c -5 +5
+    M ./rdft/hc2hc.c -23 +23
+    M ./rdft/indirect.c -20 +18
+    M ./rdft/nop.c -5 +5
+    M ./rdft/nop2.c -4 +4
+    M ./rdft/problem.c -39 +40
+    M ./rdft/problem2.c -34 +34
+    M ./rdft/rader-hc2hc.c -11 +11
+    M ./rdft/rank-geq2-rdft2.c -23 +23
+    M ./rdft/rank-geq2.c -28 +28
+    M ./rdft/rank0.c -10 +10
+    M ./rdft/rdft-dht.c -10 +10
+    M ./rdft/rdft.h -9 +9
+    M ./rdft/rdft2-radix2.c -21 +21
+    M ./rdft/vrank-geq1-rdft2.c -11 +11
+    M ./rdft/vrank-geq1.c -11 +11
+    M ./rdft/vrank2-transpose.c -9 +9
+    M ./rdft/vrank3-transpose.c -12 +12
+    M ./reodft/redft00e-r2hc.c -10 +10
+    M ./reodft/reodft010e-r2hc.c -10 +10
+    M ./reodft/reodft11e-r2hc.c -10 +10
+    M ./reodft/rodft00e-r2hc.c -10 +10
+    M ./tests/debug.h -2 +2
+    M ./tests/dotens.c -4 +4
+    M ./tests/dotens2.c -5 +5
+    M ./tests/verify-dft.c -10 +11
+    M ./tests/verify-lib.c -15 +16
+    M ./tests/verify-rdft.c -43 +47
+    M ./tests/verify-reodft.c -28 +28
+    M ./tests/verify.h -3 +3
+
+Sat Sep 21 18:24:55 EDT 2002  stevenj
+  * [project @ 2002-09-21 22:24:55 by stevenj]
+  typo
+
+    M ./kernel/ifftw.h -2 +2
+
+Sat Sep 21 18:10:07 EDT 2002  athena
+  * [project @ 2002-09-21 22:10:07 by athena]
+  Avoid generating NaN when n = 0.
+
+    M ./tests/verify-lib.c -13 +16
+
+Sat Sep 21 18:04:05 EDT 2002  athena
+  * [project @ 2002-09-21 22:04:05 by athena]
+  Saved more.
+
+    M ./dft/dft.h -2 +2
+    M ./dft/problem.c -5 +5
+    M ./dft/rank-geq2.c -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -2 +2
+    M ./rdft/hc2hc.c -5 +7
+    M ./rdft/problem.c -11 +11
+    M ./rdft/problem2.c -5 +5
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft.h -4 +4
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./threads/dft-vrank-geq1.c -2 +2
+
+Sat Sep 21 17:47:36 EDT 2002  athena
+  * [project @ 2002-09-21 21:47:35 by athena]
+  Save 1200 bytes of object code.  Do not pass structs by value whenever
+  practical, because the calling protocol generates clumsy code.
+
+    M ./dft/buffered.c -6 +6
+    M ./dft/ct.c -5 +5
+    M ./dft/direct.c -3 +3
+    M ./dft/indirect.c -12 +12
+    M ./dft/nop.c -2 +2
+    M ./dft/problem.c -11 +11
+    M ./dft/rank-geq2.c -21 +21
+    M ./dft/vrank-geq1.c -6 +6
+    M ./dft/vrank3-transpose.c -8 +8
+    M ./kernel/ifftw.h -20 +22
+    M ./kernel/pickdim.c -9 +9
+    M ./kernel/print.c -2 +2
+    M ./kernel/tensor.c -69 +71
+    M ./rdft/buffered.c -6 +6
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/direct.c -3 +3
+    M ./rdft/hc2hc.c -9 +9
+    M ./rdft/indirect.c -14 +14
+    M ./rdft/nop.c -2 +2
+    M ./rdft/problem.c -15 +15
+    M ./rdft/problem2.c -15 +15
+    M ./rdft/rank-geq2-rdft2.c -19 +19
+    M ./rdft/rank-geq2.c -27 +27
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft.h -2 +2
+    M ./rdft/rdft2-radix2.c -7 +7
+    M ./rdft/vrank-geq1-rdft2.c -6 +7
+    M ./rdft/vrank-geq1.c -6 +6
+    M ./rdft/vrank3-transpose.c -8 +8
+    M ./reodft/redft00e-r2hc.c -2 +2
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -2 +2
+    M ./reodft/rodft00e-r2hc.c -2 +2
+    M ./tests/verify-dft.c -6 +6
+    M ./tests/verify-lib.c -2 +2
+    M ./tests/verify-rdft.c -21 +21
+    M ./tests/verify-reodft.c -10 +10
+    M ./threads/dft-vrank-geq1.c -3 +3
+    M ./threads/rdft-vrank-geq1.c -3 +3
+    M ./threads/vrank-geq1-rdft2.c -3 +4
+
+Sat Sep 21 12:10:21 EDT 2002  athena
+  * [project @ 2002-09-21 16:10:21 by athena]
+  Do not allocate buffers for rader omegas.  Let the planner do it
+  if necessary.
+
+    M ./rdft/dht-rader.c -15 +6
+
+Sat Sep 21 12:03:46 EDT 2002  athena
+  * [project @ 2002-09-21 16:03:46 by athena]
+  Check rank *before* reading kind[0], which may be undefined if rnk < 1
+
+    M ./tests/verify-rdft.c -2 +2
+    M ./tests/verify-reodft.c -2 +2
+
+Sat Sep 21 11:48:50 EDT 2002  athena
+  * [project @ 2002-09-21 15:48:50 by athena]
+  Second step towards rader unification.
+
+    M ./dft/rader.c -17 +7
+    M ./rdft/rader-hc2hc.c -1 +1
+
+Sat Sep 21 11:37:06 EDT 2002  athena
+  * [project @ 2002-09-21 15:37:06 by athena]
+  First step towards unification of Rader code
+
+    A ./kernel/rader.c
+    M ./dft/rader.c -57 +12
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -1 +9
+    M ./kernel/rader.c +68
+    M ./rdft/dht-rader.c -50 +6
+    M ./rdft/rader-hc2hc.c -57 +11
+
+Sat Sep 21 07:58:11 EDT 2002  athena
+  * [project @ 2002-09-21 11:58:11 by athena]
+  Fix ugliness condition for cooley-tukey.
+
+    A ./kernel/ct.c
+    M ./dft/ct-dif.c -5 +3
+    M ./dft/ct-dit.c -4 +2
+    M ./dft/ct-ditbuf.c -5 +3
+    M ./kernel/Makefile.am -4 +5
+    M ./kernel/ct.c +31
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -3 +2
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/dht-rader.c -3 +3
+    M ./rdft/hc2hc-buf.c -9 +3
+    M ./rdft/hc2hc-dif.c -4 +2
+    M ./rdft/hc2hc-dit.c -4 +2
+    M ./rdft/rdft-dht.c -4 +2
+    M ./reodft/redft00e-r2hc.c -4 +2
+    M ./reodft/reodft010e-r2hc.c -4 +2
+    M ./reodft/reodft11e-r2hc.c -4 +2
+    M ./reodft/rodft00e-r2hc.c -4 +2
+    M ./threads/ct-dit.c -5 +3
+    M ./threads/hc2hc-dif.c -5 +3
+    M ./threads/hc2hc-dit.c -5 +3
+
+Fri Sep 20 16:53:45 EDT 2002  athena
+  * [project @ 2002-09-20 20:53:45 by athena]
+  Removed RADER_MIN_GOOD and associated machinery
+
+    M ./dft/rader.c -29 +6
+    M ./kernel/ifftw.h -2 +1
+    M ./rdft/dht-rader.c -14 +3
+    M ./rdft/rader-hc2hc.c -17 +5
+
+Fri Sep 20 14:49:12 EDT 2002  athena
+  * [project @ 2002-09-20 18:49:12 by athena]
+  Proper cast
+
+    M ./rdft/dht-r2hc.c -4 +3
+
+Fri Sep 20 14:45:54 EDT 2002  athena
+  * [project @ 2002-09-20 18:45:54 by athena]
+  Typo
+
+    M ./kernel/planner.c -2 +2
+
+Fri Sep 20 14:38:13 EDT 2002  athena
+  * [project @ 2002-09-20 18:38:13 by athena]
+  Implemented NO_LARGE_GENERIC
+
+    M ./dft/generic.c -1 +8
+    M ./kernel/ifftw.h -20 +23
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/generic.c -2 +8
+    M ./rdft/rdft-dht.c -10 +2
+    M ./tests/bench.c +1
+
+Thu Sep 19 07:48:25 EDT 2002  athena
+  * [project @ 2002-09-19 11:48:24 by athena]
+  Consistent macroization of NO_DHT_R2HC
+
+    M ./kernel/ifftw.h -1 +2
+    M ./rdft/dht-r2hc.c -4 +4
+
+Wed Sep 18 21:47:17 EDT 2002  athena
+  * [project @ 2002-09-19 01:47:17 by athena]
+  NO_DHT_R2HC is a planner flag, otherwise the EXHAUSTIVE planner loops.
+
+    M ./kernel/ifftw.h -7 +6
+    M ./kernel/planner.c -10 +6
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./tests/bench.c +1
+
+Wed Sep 18 20:47:31 EDT 2002  athena
+  * [project @ 2002-09-19 00:47:31 by athena]
+  Resurrected NO_EXHAUSTIVE
+
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/planner.c -8 +24
+
+Wed Sep 18 19:31:57 EDT 2002  stevenj
+  * [project @ 2002-09-18 23:31:57 by stevenj]
+  au revoir, score()
+
+    M ./threads/ct-dit.c -25 +22
+    M ./threads/dft-vrank-geq1.c -14 +10
+    M ./threads/hc2hc-dif.c -25 +21
+    M ./threads/hc2hc-dit.c -25 +21
+    M ./threads/rdft-vrank-geq1.c -14 +10
+    M ./threads/vrank-geq1-rdft2.c -14 +10
+
+Wed Sep 18 19:31:05 EDT 2002  stevenj
+  * [project @ 2002-09-18 23:31:05 by stevenj]
+  eliminated unused
+
+    M ./tests/bench.c +2
+    M ./tests/verify-reodft.c -7 +7
+
+Wed Sep 18 18:28:44 EDT 2002  stevenj
+  * [project @ 2002-09-18 22:28:44 by stevenj]
+  capitalize and parenthesize SUBSUMES
+
+    M ./kernel/planner.c -9 +8
+
+Wed Sep 18 18:26:58 EDT 2002  stevenj
+  * [project @ 2002-09-18 22:26:58 by stevenj]
+  comment
+
+    M ./kernel/ifftw.h -2 +2
+
+Wed Sep 18 18:03:18 EDT 2002  athena
+  * [project @ 2002-09-18 22:03:18 by athena]
+  Use flags from wisdom if wisdom is applicable.
+
+    M ./kernel/ifftw.h -2 +3
+    M ./kernel/planner.c -70 +32
+
+Wed Sep 18 17:16:17 EDT 2002  athena
+  * [project @ 2002-09-18 21:16:16 by athena]
+  Removed score() machinery
+
+    M ./dft/buffered.c -16 +11
+    M ./dft/ct-dif.c -23 +21
+    M ./dft/ct-dit.c -29 +25
+    M ./dft/ct-ditbuf.c -30 +24
+    M ./dft/ct-ditf.c -10 +2
+    M ./dft/direct.c -8 +2
+    M ./dft/generic.c -9 +7
+    M ./dft/indirect.c -9 +12
+    M ./dft/nop.c -8 +2
+    M ./dft/rader.c -23 +25
+    M ./dft/rank-geq2.c -14 +13
+    M ./dft/rank0.c -8 +2
+    M ./dft/vrank-geq1.c -30 +29
+    M ./dft/vrank2-transpose.c -8 +2
+    M ./dft/vrank3-transpose.c -13 +15
+    M ./kernel/ifftw.h -12 +2
+    M ./kernel/planner.c -31 +19
+    M ./rdft/buffered.c -18 +11
+    M ./rdft/buffered2.c -18 +11
+    M ./rdft/dft-r2hc.c -12 +14
+    M ./rdft/dht-r2hc.c -8 +8
+    M ./rdft/dht-rader.c -12 +12
+    M ./rdft/direct.c -9 +3
+    M ./rdft/direct2.c -9 +3
+    M ./rdft/generic.c -8 +7
+    M ./rdft/hc2hc-buf.c -30 +29
+    M ./rdft/hc2hc-dif.c -33 +25
+    M ./rdft/hc2hc-dit.c -32 +27
+    M ./rdft/indirect.c -9 +14
+    M ./rdft/nop.c -8 +2
+    M ./rdft/nop2.c -8 +2
+    M ./rdft/rader-hc2hc.c -13 +12
+    M ./rdft/rank-geq2-rdft2.c -15 +17
+    M ./rdft/rank-geq2.c -17 +19
+    M ./rdft/rank0.c -8 +2
+    M ./rdft/rdft-dht.c -11 +13
+    M ./rdft/rdft2-radix2.c -12 +2
+    M ./rdft/vrank-geq1-rdft2.c -32 +29
+    M ./rdft/vrank-geq1.c -31 +31
+    M ./rdft/vrank2-transpose.c -8 +2
+    M ./rdft/vrank3-transpose.c -18 +14
+    M ./reodft/redft00e-r2hc.c -6 +7
+    M ./reodft/reodft010e-r2hc.c -6 +7
+    M ./reodft/reodft11e-r2hc.c -6 +7
+    M ./reodft/rodft00e-r2hc.c -6 +7
+    M ./tests/bench.c -2
+
+Wed Sep 18 14:12:21 EDT 2002  athena
+  * [project @ 2002-09-18 18:12:21 by athena]
+  Revised planner hack
+
+    M ./kernel/planner.c -4 +7
+
+Wed Sep 18 10:14:41 EDT 2002  athena
+  * [project @ 2002-09-18 14:14:41 by athena]
+  Fix warning
+
+    M ./simd/simd-altivec.h +2
+
+Tue Sep 17 17:54:07 EDT 2002  athena
+  * [project @ 2002-09-17 21:54:07 by athena]
+  Type qualifiers.
+
+    M ./dft/indirect.c -2 +2
+    M ./rdft/indirect.c -2 +2
+
+Tue Sep 17 16:17:55 EDT 2002  athena
+  * [project @ 2002-09-17 20:17:55 by athena]
+  ESTIMATE is no longer subsumed by everything else.
+
+    M ./kernel/planner.c -2 +1
+
+Tue Sep 17 10:55:15 EDT 2002  athena
+  * [project @ 2002-09-17 14:55:15 by athena]
+  NO_BUFFERING is a planner flag, not a problem flag
+
+    M ./dft/indirect.c -2 +2
+    M ./rdft/indirect.c -2 +2
+
+Tue Sep 17 09:36:16 EDT 2002  athena
+  * [project @ 2002-09-17 13:36:16 by athena]
+  Maintain flags in canonical form.
+
+    M ./kernel/ifftw.h -4 +6
+    M ./kernel/planner.c -13 +15
+
+Tue Sep 17 09:09:57 EDT 2002  athena
+  * [project @ 2002-09-17 13:09:56 by athena]
+  In dramatic break with tradition, SUBSUME is now a partial order.  I
+  swear.
+
+    M ./kernel/ifftw.h -4 +1
+    M ./kernel/planner.c -15 +54
+
+Tue Sep 17 07:29:00 EDT 2002  athena
+  * [project @ 2002-09-17 11:29:00 by athena]
+  Added comment
+
+    M ./kernel/planner.c -1 +3
+
+Tue Sep 17 07:27:17 EDT 2002  athena
+  * [project @ 2002-09-17 11:27:17 by athena]
+  Inverted ESTIMATE flag, renamed USE_SCORE for consistency with the
+  convention that 0 subsumes 1.
+
+    M ./kernel/ifftw.h -9 +6
+    M ./kernel/planner.c -10 +7
+    M ./tests/bench.c +2
+
+Tue Sep 17 02:50:15 EDT 2002  stevenj
+  * [project @ 2002-09-17 06:50:15 by stevenj]
+  NO_INDIRECT -> NO_INDIRECT_OP (out-of-place only)
+
+    M ./dft/indirect.c -4 +5
+    M ./kernel/ifftw.h -3 +3
+    M ./rdft/indirect.c -4 +5
+    M ./tests/bench.c -1 +1
+
+Tue Sep 17 00:40:04 EDT 2002  stevenj
+  * [project @ 2002-09-17 04:40:04 by stevenj]
+  hpux needs -D_REENTRANT (thanks to Clinton Roy for the bug report)
+
+    M ./acx_pthread.m4 -2 +2
+
+Mon Sep 16 23:54:34 EDT 2002  athena
+  * [project @ 2002-09-17 03:54:34 by athena]
+  Oops.
+
+    M ./kernel/planner.c -2 +2
+
+Mon Sep 16 23:44:47 EDT 2002  athena
+  * [project @ 2002-09-17 03:44:47 by athena]
+  Yet another attempt at getting the planner right.
+
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/planner.c -28 +27
+
+Mon Sep 16 21:56:14 EDT 2002  athena
+  * [project @ 2002-09-17 01:56:14 by athena]
+  Better coding.
+
+    M ./kernel/planner.c -21 +11
+
+Mon Sep 16 21:51:06 EDT 2002  athena
+  * [project @ 2002-09-17 01:51:06 by athena]
+  NO_UGLY is no longer a flag, but a separate planner field that does not
+  interfere with wisdom.
+
+    M ./kernel/ifftw.h -3 +2
+    M ./kernel/planner.c -14 +15
+
+Mon Sep 16 19:04:41 EDT 2002  athena
+  * [project @ 2002-09-16 23:04:41 by athena]
+  Did not compile without FFTW_DEBUG
+
+    M ./tests/verify-reodft.c -3 +1
+
+Mon Sep 16 18:37:06 EDT 2002  athena
+  * [project @ 2002-09-16 22:37:06 by athena]
+  Changed scoring mechanism.
+
+    M ./kernel/ifftw.h -5 +4
+    M ./kernel/plan.c -5 +1
+    M ./kernel/planner.c -51 +28
+    M ./tests/bench.c -5 +2
+
+Mon Sep 16 17:13:45 EDT 2002  athena
+  * [project @ 2002-09-16 21:13:45 by athena]
+  Count infeasible plans
+
+    M ./kernel/planner.c -4 +12
+
+Mon Sep 16 16:36:12 EDT 2002  athena
+  * [project @ 2002-09-16 20:36:12 by athena]
+  curse subsumed plans before export
+
+    M ./kernel/planner.c -27 +35
+
+Mon Sep 16 15:40:46 EDT 2002  stevenj
+  * [project @ 2002-09-16 19:40:46 by stevenj]
+  removed ESTIMATE_BIT vs. ESTIMATE... ESTIMATE | IMPATIENT is a UI issue
+
+    M ./kernel/ifftw.h -6 +4
+    M ./kernel/planner.c -2 +2
+
+Mon Sep 16 15:31:39 EDT 2002  stevenj
+  * [project @ 2002-09-16 19:31:39 by stevenj]
+  cleanup
+
+    M ./rdft/buffered2.c -6 +3
+
+Mon Sep 16 15:28:47 EDT 2002  stevenj
+  * [project @ 2002-09-16 19:28:47 by stevenj]
+  use CONSERVE_MEMORY flag to prevent buffered for large sizes
+
+    M ./dft/buffered.c -4 +7
+    M ./rdft/buffered.c -4 +7
+    M ./rdft/buffered2.c -5 +9
+
+Mon Sep 16 15:16:16 EDT 2002  stevenj
+  * [project @ 2002-09-16 19:16:16 by stevenj]
+  moved NO_DHT_R2HC back into planner flags: there's no reason we would want this flag to block plan reuse
+
+    M ./kernel/ifftw.h -4 +4
+
+Mon Sep 16 14:59:14 EDT 2002  stevenj
+  * [project @ 2002-09-16 18:59:14 by stevenj]
+  whoops, commas
+
+    M ./kernel/ifftw.h -3 +3
+
+Mon Sep 16 14:58:26 EDT 2002  stevenj
+  * [project @ 2002-09-16 18:58:26 by stevenj]
+  problem_flags == checked in applicable, planner_flags == checked in score
+
+    M ./kernel/ifftw.h -5 +7
+
+Mon Sep 16 14:53:16 EDT 2002  stevenj
+  * [project @ 2002-09-16 18:53:16 by stevenj]
+  ESTIMATE should not *include* all impatience flags, even if it subsumes them; some impatience flags, like NO_INDIRECT, might make a problem unsolvable
+
+    M ./kernel/ifftw.h -6 +6
+    M ./kernel/planner.c -3 +4
+
+Mon Sep 16 00:56:29 EDT 2002  stevenj
+  * [project @ 2002-09-16 04:56:29 by stevenj]
+  quotatio marks
+
+    M ./kernel/planner.c -3 +3
+
+Sun Sep 15 23:55:44 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:55:44 by stevenj]
+  delete blank line
+
+    M ./kernel/planner.c -2 +1
+
+Sun Sep 15 23:51:14 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:51:14 by stevenj]
+  substitution
+
+    M ./kernel/planner.c -2 +2
+
+Sun Sep 15 23:49:50 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:49:50 by stevenj]
+  note that we are not GNUlly correct
+
+    M ./kernel/planner.c -1 +10
+
+Sun Sep 15 23:41:01 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:41:01 by stevenj]
+  indenting
+
+    M ./kernel/planner.c -3 +3
+
+Sun Sep 15 23:37:46 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:37:46 by stevenj]
+  more jokes
+
+    M ./kernel/planner.c -2 +5
+
+Sun Sep 15 23:20:14 EDT 2002  stevenj
+  * [project @ 2002-09-16 03:20:14 by stevenj]
+  NONTHREADED_ICKYP includes nthr > 1 check
+
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/ifftw.h -2 +3
+    M ./rdft/hc2hc-dif.c -2 +2
+    M ./rdft/hc2hc-dit.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+
+Sun Sep 15 22:56:44 EDT 2002  stevenj
+  * [project @ 2002-09-16 02:56:44 by stevenj]
+  use md5sig
+
+    M ./kernel/md5.c -1 +1
+
+Sun Sep 15 22:55:41 EDT 2002  stevenj
+  * [project @ 2002-09-16 02:55:41 by stevenj]
+  md5sig typedef
+
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/planner.c -9 +9
+
+Sun Sep 15 22:35:13 EDT 2002  stevenj
+  * [project @ 2002-09-16 02:35:13 by stevenj]
+  updated
+
+    M ./ChangeLog +324
+
+Sun Sep 15 22:30:26 EDT 2002  stevenj
+  * [project @ 2002-09-16 02:30:26 by stevenj]
+  partially-ordered impatience
+
+    M ./dft/buffered.c -2 +2
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct.c -5 +1
+    M ./dft/indirect.c -4 +4
+    M ./dft/rank-geq2.c -3 +2
+    M ./dft/vrank-geq1.c -13 +3
+    M ./kernel/ifftw.h -18 +51
+    M ./kernel/planner.c -11 +49
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -2 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -3 +3
+    M ./rdft/hc2hc-buf.c -4 +3
+    M ./rdft/hc2hc-dif.c -4 +4
+    M ./rdft/hc2hc-dit.c -3 +3
+    M ./rdft/hc2hc.c -5 +1
+    M ./rdft/indirect.c -4 +4
+    M ./rdft/rank-geq2-rdft2.c -3 +2
+    M ./rdft/rank-geq2.c -3 +2
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -13 +3
+    M ./rdft/vrank-geq1.c -13 +3
+    M ./tests/bench.c -3
+    M ./threads/dft-vrank-geq1.c -13 +2
+    M ./threads/hc2hc-dif.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -12 +2
+    M ./threads/vrank-geq1-rdft2.c -12 +2
+
+Sat Sep 14 19:47:56 EDT 2002  athena
+  * [project @ 2002-09-14 23:47:56 by athena]
+  Removed all that planner inheritance crap.
+
+    M ./kernel/Makefile.am -4 +3
+    M ./kernel/ifftw.h -15 +5
+    R ./kernel/planner-naive.c
+    R ./kernel/planner-score.c
+    M ./kernel/planner.c -35 +128
+    M ./tests/bench.c -2 +2
+
+Sat Sep 14 16:35:28 EDT 2002  stevenj
+  * [project @ 2002-09-14 20:35:28 by stevenj]
+  string.h is used for more than strlen
+
+    M ./kernel/planner.c -2 +2
+
+Sat Sep 14 12:19:13 EDT 2002  athena
+  * [project @ 2002-09-14 16:19:13 by athena]
+  Reduced hashtable size by 1/6 (on 32-bit machines) at the expense
+  of messier planner.
+
+    M ./kernel/ifftw.h -13 +17
+    M ./kernel/planner.c -80 +87
+
+Sat Sep 14 08:31:29 EDT 2002  athena
+  * [project @ 2002-09-14 12:31:29 by athena]
+  Only print wisdom if verbose > 3
+
+    M ./tests/bench.c -2 +4
+
+Sat Sep 14 07:56:56 EDT 2002  athena
+  * [project @ 2002-09-14 11:56:56 by athena]
+  Changed syntax of temporaries to avoid shadowing library functions
+  (which is harmless but I hate the warning)
+
+    M ./genfft/variable.ml -2 +2
+    M ./genfft-k7/variable.ml -1 +1
+
+Fri Sep 13 23:07:39 EDT 2002  stevenj
+  * [project @ 2002-09-14 03:07:39 by stevenj]
+  only add warnings in debug/maintainer mode, and add a few more warning flags; eliminate more warnings; add support for posix_memalign (broken in glibc, grrr)
+
+    M ./acinclude.m4 -1 +1
+    M ./configure.ac -1 +9
+    M ./dft/rader.c -1 +1
+    M ./kernel/alloc.c -3 +16
+    M ./kernel/assert.c -2 +2
+    M ./kernel/ifftw.h -5 +5
+    M ./kernel/md5.c -1 +1
+    M ./kernel/planner-score.c -11 +13
+    M ./kernel/primes.c -4 +4
+    M ./kernel/scan.c -2 +2
+    M ./libbench/bench-user.h -2 +2
+    M ./libbench/bench.h -3 +4
+    M ./libbench/report.c -5 +5
+    M ./libbench/timer.c -1 +2
+    M ./libbench/util.c -14 +26
+    M ./libbench/verify.c -4 +1
+    M ./rdft/rader-hc2hc.c -6 +6
+    M ./tests/bench.c -3
+    M ./tests/verify-lib.c -1 +4
+
+Fri Sep 13 21:57:50 EDT 2002  athena
+  * [project @ 2002-09-14 01:57:50 by athena]
+  Explicit cast
+
+    M ./kernel/twiddle.c -2 +2
+
+Fri Sep 13 21:54:50 EDT 2002  athena
+  * [project @ 2002-09-14 01:54:50 by athena]
+  Use double-hashing.  This allows a slightly higher load factor
+  at the expense of a messier computation of the hashtable size.
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -57 +63
+    M ./kernel/primes.c -1 +7
+
+Fri Sep 13 17:53:13 EDT 2002  stevenj
+  * [project @ 2002-09-13 21:53:13 by stevenj]
+  typo
+
+    M ./genfft/magic.ml -2 +2
+
+Fri Sep 13 15:36:07 EDT 2002  athena
+  * [project @ 2002-09-13 19:36:07 by athena]
+  Slight change in hash table growth functions.
+
+    M ./kernel/planner.c -3 +12
+
+Fri Sep 13 14:58:22 EDT 2002  athena
+  * [project @ 2002-09-13 18:58:22 by athena]
+  More statistics.
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -1 +8
+
+Fri Sep 13 10:13:02 EDT 2002  athena
+  * [project @ 2002-09-13 14:13:02 by athena]
+  Clearer logic.
+
+    M ./kernel/planner.c -2 +3
+
+Fri Sep 13 10:11:10 EDT 2002  athena
+  * [project @ 2002-09-13 14:11:10 by athena]
+  Oops.
+
+    M ./kernel/planner.c -2 +1
+
+Fri Sep 13 09:31:46 EDT 2002  athena
+  * [project @ 2002-09-13 13:31:46 by athena]
+  Cleaned up
+
+    M ./kernel/planner.c -5 +6
+
+Fri Sep 13 09:16:07 EDT 2002  athena
+  * [project @ 2002-09-13 13:16:07 by athena]
+  Deal properly with infeasible problems.
+
+    M ./kernel/planner.c -4 +7
+
+Fri Sep 13 07:15:06 EDT 2002  athena
+  * [project @ 2002-09-13 11:15:06 by athena]
+  Redundantly initialize hash table to prevent valgrind warnings.
+
+    M ./kernel/planner.c -3 +7
+
+Thu Sep 12 19:00:22 EDT 2002  athena
+  * [project @ 2002-09-12 23:00:22 by athena]
+  Removed relics from past.
+
+    M ./kernel/md5.c -5 +3
+
+Thu Sep 12 18:53:44 EDT 2002  athena
+  * [project @ 2002-09-12 22:53:44 by athena]
+  md5hash a problem only once.
+
+    M ./kernel/ifftw.h -6 +11
+    M ./kernel/planner.c -39 +32
+
+Thu Sep 12 16:33:49 EDT 2002  athena
+  * [project @ 2002-09-12 20:33:49 by athena]
+  Renamed k7 codelets
+
+    M ./genfft-k7/genUtil.ml -1 +1
+    M ./genfft-k7/gen_notw.ml -3 +3
+    M ./genfft-k7/gen_twiddle.ml -4 +4
+
+Thu Sep 12 16:32:03 EDT 2002  stevenj
+  * [project @ 2002-09-12 20:32:03 by stevenj]
+  FORBID_DHT_R2HC -> DHT_R2HC_VERBOTEN for consistency
+
+    M ./kernel/ifftw.h -2 +2
+    M ./rdft/dht-r2hc.c -3 +3
+
+Thu Sep 12 16:28:43 EDT 2002  stevenj
+  * [project @ 2002-09-12 20:28:43 by stevenj]
+  removed obsolete macro
+
+    M ./kernel/ifftw.h -3 +1
+
+Thu Sep 12 16:20:39 EDT 2002  athena
+  * [project @ 2002-09-12 20:20:39 by athena]
+  Split flags in SIMD code.
+
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+
+Thu Sep 12 16:18:51 EDT 2002  athena
+  * [project @ 2002-09-12 20:18:51 by athena]
+  Forgot to fix threads
+
+    M ./threads/dft-vrank-geq1.c -5 +6
+    M ./threads/hc2hc-dif.c -2 +2
+    M ./threads/rdft-vrank-geq1.c -5 +6
+    M ./threads/vrank-geq1-rdft2.c -5 +6
+
+Thu Sep 12 16:10:05 EDT 2002  athena
+  * [project @ 2002-09-12 20:10:05 by athena]
+  Split flags into planner_flags and problem_flags
+
+    M ./dft/buffered.c -2 +2
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct.c -3 +3
+    M ./dft/indirect.c -4 +4
+    M ./dft/rader.c -1 +1
+    M ./dft/rank-geq2.c -2 +3
+    M ./dft/rank0.c -2 +2
+    M ./dft/vrank-geq1.c -7 +8
+    M ./kernel/ifftw.h -22 +21
+    M ./kernel/planner-naive.c -3 +3
+    M ./kernel/planner-score.c -3 +3
+    M ./kernel/planner.c -33 +28
+    M ./rdft/buffered.c -2 +2
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/dht-r2hc.c -3 +5
+    M ./rdft/dht-rader.c -1 +1
+    M ./rdft/hc2hc-buf.c -3 +3
+    M ./rdft/hc2hc-dif.c -4 +4
+    M ./rdft/hc2hc-dit.c -3 +3
+    M ./rdft/hc2hc.c -3 +3
+    M ./rdft/indirect.c -4 +4
+    M ./rdft/rank-geq2-rdft2.c -2 +3
+    M ./rdft/rank-geq2.c -2 +3
+    M ./rdft/rdft-dht.c -3 +3
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -7 +8
+    M ./rdft/vrank-geq1.c -7 +8
+    M ./tests/bench.c -8 +8
+
+Thu Sep 12 15:46:56 EDT 2002  stevenj
+  * [project @ 2002-09-12 19:46:56 by stevenj]
+  tetrameter
+
+    M ./kernel/planner.c -2 +3
+
+Thu Sep 12 15:11:21 EDT 2002  athena
+  * [project @ 2002-09-12 19:11:21 by athena]
+  Overwrite less impatient solutions properly.
+
+    M ./kernel/planner.c -23 +15
+
+Thu Sep 12 11:29:16 EDT 2002  athena
+  * [project @ 2002-09-12 15:29:16 by athena]
+  Oops.
+
+    M ./kernel/planner.c -3 +3
+
+Thu Sep 12 10:58:56 EDT 2002  athena
+  * [project @ 2002-09-12 14:58:56 by athena]
+  Keep less impatient solution in case of conflict.  Paranoid
+  cast to uint in certain places.
+
+    M ./kernel/planner.c -4 +5
+
+Thu Sep 12 10:02:51 EDT 2002  athena
+  * [project @ 2002-09-12 14:02:51 by athena]
+  Complete reimplementation of planner hash table.
+
+    M ./kernel/ifftw.h -6 +7
+    M ./kernel/planner.c -143 +146
+    M ./tests/bench.c -2 +4
+
+Thu Sep 12 07:58:45 EDT 2002  athena
+  * [project @ 2002-09-12 11:58:45 by athena]
+  planner->cnt was not properly decremented.
+
+    M ./kernel/planner.c -13 +14
+
+Wed Sep 11 17:52:39 EDT 2002  stevenj
+  * [project @ 2002-09-11 21:52:39 by stevenj]
+  typo
+
+    M ./NEWS -1 +1
+
+Mon Sep  9 17:10:45 EDT 2002  athena
+  * [project @ 2002-09-09 21:10:45 by athena]
+  Simplified
+
+    M ./kernel/planner.c -15 +5
+
+Mon Sep  9 17:03:32 EDT 2002  athena
+  * [project @ 2002-09-09 21:03:32 by athena]
+  Always overwrite old wisdom with new, in case the old is
+  corrupt/conclicting.
+
+    M ./kernel/planner.c -16 +8
+
+Mon Sep  9 16:56:03 EDT 2002  stevenj
+  * [project @ 2002-09-09 20:56:03 by stevenj]
+  added quote/joke
+
+    M ./kernel/plan.c -1 +5
+
+Mon Sep  9 15:04:47 EDT 2002  athena
+  * [project @ 2002-09-09 19:04:47 by athena]
+  Completed wisdom import
+
+    M ./kernel/ifftw.h -10 +11
+    M ./kernel/md5.c -1 +1
+    M ./kernel/planner.c -37 +54
+    M ./kernel/print.c -2 +3
+    M ./kernel/scan.c -73 +25
+    M ./tests/bench.c -3 +3
+
+Mon Sep  9 10:14:22 EDT 2002  athena
+  * [project @ 2002-09-09 14:14:22 by athena]
+  Slight cleanup of md5 interface.
+
+    M ./dft/problem.c -2 +2
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/md5.c -12 +24
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+
+Tue Sep  3 22:32:43 EDT 2002  athena
+  * [project @ 2002-09-04 02:32:43 by athena]
+  More consistent protocol between planner and inferior.
+
+    M ./kernel/planner-naive.c -2 +5
+    M ./kernel/planner-score.c -1 +2
+    M ./kernel/planner.c -3 +1
+
+Tue Sep  3 21:08:30 EDT 2002  athena
+  * [project @ 2002-09-04 01:08:30 by athena]
+  I can't think of any situation where saving infeasible problems would
+  be desirable.  Removed relevant code.
+
+    M ./kernel/planner.c -9 +3
+
+Tue Sep  3 20:57:03 EDT 2002  athena
+  * [project @ 2002-09-04 00:57:03 by athena]
+  Encoder registrar's names in wisdom.  Remove export_conf, since
+  a separate program can now generate it.
+
+    M ./kernel/ifftw.h -5 +5
+    M ./kernel/planner.c -108 +32
+    M ./kernel/solvtab.c -1 +2
+    M ./tests/bench.c -3 +1
+
+Tue Sep  3 15:11:06 EDT 2002  athena
+  * [project @ 2002-09-03 19:11:06 by athena]
+  Fixed typo
+
+    M ./kernel/planner.c -2 +2
+
+Tue Sep  3 14:52:45 EDT 2002  athena
+  * [project @ 2002-09-03 18:52:45 by athena]
+  Fixed broken trochaic meter.
+
+    M ./kernel/planner.c -2 +2
+
+Tue Sep  3 09:49:50 EDT 2002  athena
+  * [project @ 2002-09-03 13:49:50 by athena]
+  Initialize planner->score.  It is correct to leave it uninitialized,
+  but I don't want people to send reports about purify complaining.
+
+    M ./kernel/planner.c -1 +2
+
+Tue Sep  3 09:03:46 EDT 2002  athena
+  * [project @ 2002-09-03 13:03:46 by athena]
+  More latin silliness
+
+    M ./kernel/planner.c -3 +7
+
+Mon Sep  2 17:57:32 EDT 2002  stevenj
+  * [project @ 2002-09-02 21:57:32 by stevenj]
+  updated
+
+    M ./ChangeLog +193
+
+Mon Sep  2 17:33:49 EDT 2002  stevenj
+  * [project @ 2002-09-02 21:33:49 by stevenj]
+  added clock() getseconds timer
+
+    M ./kernel/timer.c -1 +16
+
+Mon Sep  2 16:16:58 EDT 2002  athena
+  * [project @ 2002-09-02 20:16:58 by athena]
+  Oops
+
+    M ./rdft/indirect.c -2 +1
+
+Mon Sep  2 15:58:19 EDT 2002  athena
+  * [project @ 2002-09-02 19:58:19 by athena]
+  Experimental INDIRECT_VERBOTEN flag (not used)
+
+    M ./dft/indirect.c -1 +3
+    M ./kernel/ifftw.h -5 +6
+    M ./rdft/indirect.c -3 +4
+
+Mon Sep  2 15:36:21 EDT 2002  athena
+  * [project @ 2002-09-02 19:36:21 by athena]
+  Do not allow buffering in children of indirect solvers.
+
+    M ./dft/buffered.c -1 +4
+    M ./dft/indirect.c -1 +3
+    M ./kernel/ifftw.h -1 +2
+    M ./rdft/buffered.c -1 +4
+    M ./rdft/buffered2.c -1 +4
+    M ./rdft/indirect.c -1 +3
+
+Mon Sep  2 15:02:11 EDT 2002  athena
+  * [project @ 2002-09-02 19:02:11 by athena]
+  Oops
+
+    M ./kernel/planner.c -2 +2
+
+Mon Sep  2 14:32:28 EDT 2002  athena
+  * [project @ 2002-09-02 18:32:28 by athena]
+  Hash sizeof(R) as part of wisdom.
+
+    M ./kernel/planner.c -1 +2
+
+Mon Sep  2 13:47:57 EDT 2002  stevenj
+  * [project @ 2002-09-02 17:47:57 by stevenj]
+  added --enable-float synonym for --enable-single (since with have --enable-long-double)
+
+    M ./configure.ac +1
+
+Mon Sep  2 13:46:08 EDT 2002  athena
+  * [project @ 2002-09-02 17:46:08 by athena]
+  zerotens is now in its own file, so it does not cause dft to be linked
+  in if only rdft is used.
+
+    A ./dft/zero.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/problem.c -25 +1
+    M ./dft/zero.c +49
+
+Mon Sep  2 11:56:37 EDT 2002  athena
+  * [project @ 2002-09-02 15:56:37 by athena]
+  Removed unused var.
+
+    M ./kernel/planner.c -2 +1
+
+Mon Sep  2 11:55:33 EDT 2002  athena
+  * [project @ 2002-09-02 15:55:33 by athena]
+  Split insert() in preparation for wisdom import
+
+    M ./kernel/planner.c -5 +11
+
+Mon Sep  2 11:46:57 EDT 2002  athena
+  * [project @ 2002-09-02 15:46:57 by athena]
+  Moved debugging infrastructure to test directory so that it is not
+  linked into the shared library.
+
+    A ./tests/debug.h
+    A ./tests/dotens.c
+    A ./tests/dotens2.c
+    A ./tests/verify-dft.c
+    A ./tests/verify-lib.c
+    A ./tests/verify-rdft.c
+    A ./tests/verify-reodft.c
+    A ./tests/verify.h
+    M ./dft/Makefile.am -1 +1
+    M ./dft/dft.h -4 +1
+    R ./dft/verify.c
+    M ./kernel/Makefile.am -6 +5
+    R ./kernel/dotens.c
+    R ./kernel/dotens2.c
+    M ./kernel/ifftw.h -17 +1
+    R ./kernel/verify-lib.c
+    R ./kernel/verify.h
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/rdft.h -4 +1
+    R ./rdft/verify.c
+    M ./reodft/Makefile.am -1 +1
+    M ./reodft/reodft.h -4 +1
+    R ./reodft/verify.c
+    M ./tests/Makefile.am -2 +7
+    M ./tests/bench.c +3
+    M ./tests/debug.h +18
+    M ./tests/dotens.c +48
+    M ./tests/dotens2.c +56
+    M ./tests/verify-dft.c +131
+    M ./tests/verify-lib.c +386
+    M ./tests/verify-rdft.c +420
+    M ./tests/verify-reodft.c +536
+    M ./tests/verify.h +83
+
+Mon Sep  2 11:04:54 EDT 2002  athena
+  * [project @ 2002-09-02 15:04:53 by athena]
+  Reactivated wisdom export
+
+    M ./kernel/planner.c -42 +9
+    M ./kernel/print.c -1 +12
+
+Sun Sep  1 21:30:58 EDT 2002  athena
+  * [project @ 2002-09-02 01:30:58 by athena]
+  Dump errors to stderr, not stdout.
+
+    M ./kernel/verify-lib.c -2 +3
+
+Sun Sep  1 21:26:38 EDT 2002  athena
+  * [project @ 2002-09-02 01:26:38 by athena]
+  Removed traverse.c.
+  traverse.c is no longer need for plan blessing.  I figured out
+  a way to avoid using it in planner-score.c, so the file is
+  now redundant.
+
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -9 +2
+    M ./kernel/planner-score.c -30 +37
+    R ./kernel/traverse.c
+    M ./tests/bench.c -20
+
+Sun Sep  1 19:51:50 EDT 2002  athena
+  * [project @ 2002-09-01 23:51:50 by athena]
+  Removed code made obsolete by new MD5 scheme: problem equality
+  tests, scanners, and associated list of problem kinds.
+
+    M ./dft/conf.c -3 +1
+    M ./dft/dft.h -3 +1
+    M ./dft/problem.c -58 +3
+    M ./kernel/align.c -7 +1
+    M ./kernel/ifftw.h -24 +4
+    M ./kernel/planner.c -25 +10
+    M ./kernel/problem.c -11 +2
+    M ./kernel/scan.c -44 +2
+    M ./kernel/scanners.c -5 +5
+    M ./kernel/tensor.c -46 +1
+    M ./rdft/conf.c -4 +1
+    M ./rdft/problem.c -78 +3
+    M ./rdft/problem2.c -58 +3
+    M ./rdft/rdft.h -4 +1
+
+Sun Sep  1 19:22:54 EDT 2002  athena
+  * [project @ 2002-09-01 23:22:53 by athena]
+  Started md5 implementation
+
+    A ./kernel/md5.c
+    M ./dft/problem.c -9 +9
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -3 +29
+    M ./kernel/md5.c +164
+    M ./kernel/planner-naive.c -3 +3
+    M ./kernel/planner-score.c -7 +11
+    M ./kernel/planner.c -27 +47
+    M ./kernel/tensor.c -9 +7
+    M ./rdft/problem.c -12 +11
+    M ./rdft/problem2.c -10 +11
+
+Sat Aug 31 14:00:04 EDT 2002  athena
+  * [project @ 2002-08-31 18:00:04 by athena]
+  Keep track of hit rate
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -3 +9
+
+Sat Aug 31 12:44:04 EDT 2002  athena
+  * [project @ 2002-08-31 16:44:04 by athena]
+  Only dump when verbose > 4
+
+    M ./kernel/planner.c -2 +2
+
+Sat Aug 31 09:55:57 EDT 2002  athena
+  * [project @ 2002-08-31 13:55:57 by athena]
+  Debugging infrastructure
+
+    M ./dft/indirect.c -2 +1
+    M ./kernel/ifftw.h -5 +6
+    M ./kernel/plan.c -1 +6
+    M ./kernel/planner.c -7 +12
+    M ./tests/bench.c -3 +7
+
+Sat Aug 31 09:21:48 EDT 2002  athena
+  * [project @ 2002-08-31 13:21:48 by athena]
+  Use debug infrastructure to dump planner.
+
+    M ./kernel/planner.c -19 +10
+    M ./kernel/print.c -2 +5
+
+Fri Aug 30 21:29:10 EDT 2002  athena
+  * [project @ 2002-08-31 01:29:10 by athena]
+  Do not store plans in planner, plus general planner cleanup.
+
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/ifftw.h -34 +37
+    M ./kernel/plan.c -15 +1
+    M ./kernel/planner-naive.c -7 +7
+    M ./kernel/planner-score.c -8 +7
+    M ./kernel/planner.c -103 +80
+    M ./kernel/scan.c -3 +3
+    M ./kernel/scanners.c -3 +3
+    M ./tests/bench.c -3 +16
+
+Fri Aug 30 18:07:52 EDT 2002  stevenj
+  * [project @ 2002-08-30 22:07:52 by stevenj]
+  renamed IN_DHT_R2HC to the more general FORBID_DHT_R2HC
+
+    M ./kernel/ifftw.h -2 +2
+    M ./rdft/dht-r2hc.c -3 +3
+
+Fri Aug 30 18:07:21 EDT 2002  stevenj
+  * [project @ 2002-08-30 22:07:21 by stevenj]
+  eliminated unused var
+
+    M ./kernel/planner.c -2 +1
+
+Fri Aug 30 12:09:48 EDT 2002  athena
+  * [project @ 2002-08-30 16:09:48 by athena]
+  Score planner was not working correctly when using wisdom.  Fixed.
+
+    M ./kernel/planner-naive.c -2 +8
+    M ./kernel/planner-score.c -2 +13
+    M ./kernel/planner.c -13 +18
+
+Fri Aug 30 08:20:48 EDT 2002  athena
+  * [project @ 2002-08-30 12:20:48 by athena]
+  Use hash table in debug malloc
+
+    M ./kernel/alloc.c -12 +25
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/stride.c -2 +2
+
+Fri Aug 30 02:45:15 EDT 2002  stevenj
+  * [project @ 2002-08-30 06:45:15 by stevenj]
+  listed some good stuff
+
+    M ./NEWS +47
+
+Fri Aug 30 02:17:30 EDT 2002  stevenj
+  * [project @ 2002-08-30 06:17:30 by stevenj]
+  timed planner
+
+    M ./TODO +4
+
+Fri Aug 30 02:07:00 EDT 2002  stevenj
+  * [project @ 2002-08-30 06:07:00 by stevenj]
+  fma?
+
+    M ./TODO +2
+
+Fri Aug 30 02:05:55 EDT 2002  stevenj
+  * [project @ 2002-08-30 06:05:55 by stevenj]
+  update
+
+    M ./TODO -3 +3
+
+Fri Aug 30 01:31:47 EDT 2002  stevenj
+  * [project @ 2002-08-30 05:31:47 by stevenj]
+  rader-dht -> dht-rader
+
+    A ./rdft/dht-rader.c
+    M ./rdft/Makefile.am -4 +4
+    M ./rdft/conf.c -2 +2
+    M ./rdft/dht-rader.c +422
+    R ./rdft/rader-dht.c
+    M ./rdft/rdft.h -2 +2
+
+Fri Aug 30 01:21:37 EDT 2002  stevenj
+  * [project @ 2002-08-30 05:21:37 by stevenj]
+  add DHT solver, and break up rader-dht and r2hc-hc2r
+
+    A ./rdft/dht-r2hc.c
+    A ./rdft/rdft-dht.c
+    M ./kernel/ifftw.h -2 +3
+    M ./rdft/Makefile.am -6 +6
+    M ./rdft/buffered2.c -2 +4
+    M ./rdft/conf.c -4 +5
+    M ./rdft/dht-r2hc.c +151
+    R ./rdft/r2hc-hc2r.c
+    M ./rdft/rader-dht.c -73 +21
+    M ./rdft/rank-geq2.c -1 +12
+    M ./rdft/rdft-dht.c +229
+    M ./rdft/rdft.h -3 +4
+
+Thu Aug 29 23:20:35 EDT 2002  stevenj
+  * [project @ 2002-08-30 03:20:35 by stevenj]
+  another option
+
+    M ./tests/bench.c +1
+
+Thu Aug 29 22:55:29 EDT 2002  stevenj
+  * [project @ 2002-08-30 02:55:29 by stevenj]
+  generalized indirect solvers for fftw2-like buffering and more
+
+    M ./dft/indirect.c -22 +35
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/tensor.c -13 +17
+    M ./rdft/indirect.c -18 +35
+
+Thu Aug 29 18:08:16 EDT 2002  stevenj
+  * [project @ 2002-08-29 22:08:16 by stevenj]
+  tensor_max_index and tensor_min_stride are now both unsigned
+
+    M ./dft/vrank-geq1.c -2 +3
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/tensor.c -11 +7
+    M ./rdft/vrank-geq1-rdft2.c -2 +3
+    M ./rdft/vrank-geq1.c -2 +3
+
+Thu Aug 29 17:58:35 EDT 2002  stevenj
+  * [project @ 2002-08-29 21:58:35 by stevenj]
+  added iabs.c, and tensor_min_stride returns min absolute value
+
+    A ./kernel/iabs.c
+    M ./kernel/Makefile.am -5 +5
+    M ./kernel/iabs.c +28
+    M ./kernel/ifftw.h -1 +5
+    M ./kernel/tensor.c -12 +6
+    M ./rdft/buffered2.c -9 +4
+    M ./rdft/problem2.c -8 +3
+
+Thu Aug 29 17:31:39 EDT 2002  stevenj
+  * [project @ 2002-08-29 21:31:39 by stevenj]
+  bug fix in cldrest hc2c/c2hc copy loops
+
+    M ./rdft/buffered2.c -9 +14
+
+Thu Aug 29 13:45:08 EDT 2002  athena
+  * [project @ 2002-08-29 17:45:08 by athena]
+  Added things to do.
+
+    M ./TODO -1 +1
+
+Thu Aug 29 13:10:04 EDT 2002  stevenj
+  * [project @ 2002-08-29 17:10:04 by stevenj]
+  added automake prereq
+
+    M ./configure.ac -1 +1
+
+Thu Aug 29 08:36:36 EDT 2002  athena
+  * [project @ 2002-08-29 12:36:36 by athena]
+  Use indexed addressing
+
+    M ./rdft/rdft2-radix2.c -61 +61
+
+Thu Aug 29 08:20:55 EDT 2002  athena
+  * [project @ 2002-08-29 12:20:55 by athena]
+  Ooops
+
+    M ./libbench/verify.c -3 +6
+    M ./rdft/rdft2-radix2.c -4 +7
+
+Thu Aug 29 07:45:37 EDT 2002  athena
+  * [project @ 2002-08-29 11:45:37 by athena]
+  Oops
+
+    M ./kernel/ifftw.h -10 +10
+
+Thu Aug 29 02:32:13 EDT 2002  stevenj
+  * [project @ 2002-08-29 06:32:13 by stevenj]
+  updates to win32 threads code (ick)
+
+    M ./threads/threads.c -4 +22
+
+Thu Aug 29 01:44:33 EDT 2002  stevenj
+  * [project @ 2002-08-29 05:44:33 by stevenj]
+  added threaded version
+
+    A ./acx_pthread.m4
+    A ./threads/
+    A ./threads/Makefile.am
+    A ./threads/conf.c
+    A ./threads/ct-dit.c
+    A ./threads/dft-vrank-geq1.c
+    A ./threads/hc2hc-dif.c
+    A ./threads/hc2hc-dit.c
+    A ./threads/rdft-vrank-geq1.c
+    A ./threads/threads.c
+    A ./threads/threads.h
+    A ./threads/vrank-geq1-rdft2.c
+    M ./Makefile.am -2 +3
+    M ./acx_pthread.m4 +226
+    M ./configure.ac +56
+    M ./dft/ct-dif.c -2 +2
+    M ./dft/ct-dit.c -2 +5
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/ct.c -2 +3
+    M ./dft/ct.h -1 +2
+    M ./dft/dft.h -1 +4
+    M ./dft/kdft-dif.c -1 +5
+    M ./dft/kdft-dit.c -1 +5
+    M ./dft/vrank-geq1.c -1 +4
+    M ./kernel/alloc.c -42 +56
+    M ./kernel/ifftw.h -7 +18
+    M ./kernel/planner.c -15 +24
+    M ./rdft/hc2hc-buf.c -1 +3
+    M ./rdft/hc2hc-dif.c -1 +5
+    M ./rdft/hc2hc-dit.c -1 +5
+    M ./rdft/hc2hc.c -2 +3
+    M ./rdft/hc2hc.h -1 +2
+    M ./rdft/khc2hc-dif.c -1 +5
+    M ./rdft/khc2hc-dit.c -1 +5
+    M ./rdft/rdft.h -1 +4
+    M ./rdft/vrank-geq1-rdft2.c -1 +4
+    M ./rdft/vrank-geq1.c -1 +4
+    M ./tests/Makefile.am -2 +4
+    M ./tests/bench.c -1 +5
+    M ./threads/Makefile.am +15
+    M ./threads/conf.c +41
+    M ./threads/ct-dit.c +151
+    M ./threads/dft-vrank-geq1.c +249
+    M ./threads/hc2hc-dif.c +168
+    M ./threads/hc2hc-dit.c +166
+    M ./threads/rdft-vrank-geq1.c +246
+    M ./threads/threads.c +467
+    M ./threads/threads.h +53
+    M ./threads/vrank-geq1-rdft2.c +267
+
+Wed Aug 28 19:47:21 EDT 2002  stevenj
+  * [project @ 2002-08-28 23:47:21 by stevenj]
+  fix make dist
+
+    M ./kernel/Makefile.am -2 +3
+
+Wed Aug 28 15:09:03 EDT 2002  stevenj
+  * [project @ 2002-08-28 19:09:03 by stevenj]
+  whoops, bugfix for inverse
+
+    M ./rdft/rank-geq2-rdft2.c -4 +9
+
+Wed Aug 28 14:50:34 EDT 2002  athena
+  * [project @ 2002-08-28 18:50:34 by athena]
+  Use C9x convention for naming (fftwf etc.).  Removed installable header
+  files since they will be part of the API.
+
+    M ./Makefile.am -5 +5
+    M ./configure.ac -2 +9
+    M ./kernel/Makefile.am -1
+    R ./kernel/dfftw3.h
+    R ./kernel/fftw3.h
+    M ./kernel/ifftw.h -2 +17
+    R ./kernel/lfftw3.h
+    R ./kernel/sfftw3.h
+    M ./tests/Makefile.am -1 +1
+
+Tue Aug 27 23:34:00 EDT 2002  stevenj
+  * [project @ 2002-08-28 03:34:00 by stevenj]
+  allow _1 variants to accept rnk 0 (sz 1) problems
+
+    M ./rdft/problem.c -3 +3
+
+Tue Aug 27 15:56:09 EDT 2002  stevenj
+  * [project @ 2002-08-27 19:56:09 by stevenj]
+  updated
+
+    M ./ChangeLog +619
+
+Mon Aug 26 20:14:56 EDT 2002  athena
+  * [project @ 2002-08-27 00:14:56 by athena]
+  Loop unroll is useless
+
+    M ./dft/rank0.c -28 +6
+
+Mon Aug 26 20:00:41 EDT 2002  athena
+  * [project @ 2002-08-27 00:00:41 by athena]
+  Use indexed addressing
+
+    M ./dft/ct-ditbuf.c -12 +9
+
+Mon Aug 26 19:46:46 EDT 2002  athena
+  * [project @ 2002-08-26 23:46:46 by athena]
+  Use indexed addressing in transpose routines.  (Seems to be
+  slightly better on athlon.)
+
+    M ./dft/vrank2-transpose.c -16 +8
+    M ./dft/vrank3-transpose.c -15 +8
+
+Mon Aug 26 12:59:44 EDT 2002  stevenj
+  * [project @ 2002-08-26 16:59:44 by stevenj]
+  added comment about stability
+
+    M ./reodft/redft00e-r2hc.c -1 +2
+    M ./reodft/reodft11e-r2hc.c -1 +3
+    M ./reodft/rodft00e-r2hc.c -1 +2
+
+Mon Aug 26 07:43:53 EDT 2002  athena
+  * [project @ 2002-08-26 11:43:53 by athena]
+  Approximate opcount
+
+    M ./rdft/rdft2-radix2.c -6 +10
+
+Mon Aug 26 06:38:49 EDT 2002  athena
+  * [project @ 2002-08-26 10:38:49 by athena]
+  Finished rdft2 via dft/rdft
+
+    M ./dft/rank-geq2.c -2 +2
+    M ./rdft/rank-geq2.c -2 +2
+    M ./rdft/rdft2-radix2.c -9 +149
+
+Mon Aug 26 00:15:59 EDT 2002  stevenj
+  * [project @ 2002-08-26 04:15:59 by stevenj]
+  some updates
+
+    M ./TODO -9 +9
+
+Mon Aug 26 00:05:53 EDT 2002  stevenj
+  * [project @ 2002-08-26 04:05:52 by stevenj]
+  rdft kind is now per-dimension, added rdft/rank-geq2
+
+    A ./rdft/rank-geq2.c
+    M ./rdft/Makefile.am -2 +3
+    M ./rdft/buffered.c -3 +4
+    M ./rdft/buffered2.c -5 +5
+    M ./rdft/conf.c -3 +2
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/direct.c -6 +6
+    M ./rdft/generic.c -5 +5
+    M ./rdft/hc2hc.c -10 +10
+    M ./rdft/indirect.c -2 +2
+    M ./rdft/problem.c -21 +75
+    M ./rdft/r2hc-hc2r.c -4 +4
+    M ./rdft/rader-dht.c -4 +4
+    M ./rdft/rader-hc2hc.c -4 +4
+    M ./rdft/rank-geq2.c +231
+    M ./rdft/rdft.h -4 +9
+    M ./rdft/rdft2-radix2.c -2 +2
+    M ./rdft/verify.c -5 +5
+    M ./reodft/redft00e-r2hc.c -3 +3
+    M ./reodft/reodft010e-r2hc.c -6 +6
+    M ./reodft/reodft11e-r2hc.c -5 +5
+    M ./reodft/rodft00e-r2hc.c -3 +3
+    M ./reodft/verify.c -4 +4
+    M ./tests/bench.c -2 +4
+
+Sun Aug 25 22:45:38 EDT 2002  stevenj
+  * [project @ 2002-08-26 02:45:38 by stevenj]
+  added note
+
+    M ./rdft/problem.c -2 +3
+
+Sun Aug 25 22:28:12 EDT 2002  stevenj
+  * [project @ 2002-08-26 02:28:12 by stevenj]
+  must zero real sz
+
+    M ./rdft/problem.c -2 +4
+
+Sun Aug 25 22:06:52 EDT 2002  stevenj
+  * [project @ 2002-08-26 02:06:52 by stevenj]
+  unified pickdim funcs
+
+    A ./kernel/pickdim.c
+    M ./dft/rank-geq2.c -32 +10
+    M ./dft/vrank-geq1.c -45 +3
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -1 +6
+    M ./kernel/pickdim.c +82
+    M ./rdft/vrank-geq1-rdft2.c -45 +3
+    M ./rdft/vrank-geq1.c -45 +3
+    M ./tests/bench.c -1 +1
+
+Sun Aug 25 14:10:55 EDT 2002  fftw
+  * [project @ 2002-08-25 18:10:55 by fftw]
+  silence warnings
+
+    M ./libbench/mp.c -14
+    M ./rdft/codelet.h -2 +2
+    M ./rdft/indirect.c -3 +1
+    M ./rdft/rank-geq2-rdft2.c -5 +2
+    M ./rdft/verify.c -10 +7
+    M ./reodft/reodft010e-r2hc.c -2 +2
+    M ./reodft/reodft11e-r2hc.c -7 +2
+    M ./reodft/verify.c -2 +2
+
+Sun Aug 25 13:16:49 EDT 2002  athena
+  * [project @ 2002-08-25 17:16:49 by athena]
+  I had to add another planner flag to record whether pointers could
+  become unaligned because of vrank-geq1 solvers (these solvers only
+  plan the first element of a vector problem, but the second element
+  may have a different alignment).  This addition is ugly, but I don't
+  see any way around it.
+
+    M ./dft/codelet.h -3 +5
+    M ./dft/codelets/n.c -2 +3
+    M ./dft/codelets/t.c -2 +2
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -3 +3
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/direct.c -5 +6
+    M ./dft/simd/n1b.c -1 +3
+    M ./dft/simd/n1f.c -1 +3
+    M ./dft/simd/t1b.c -1 +3
+    M ./dft/simd/t1f.c -1 +3
+    M ./dft/vrank-geq1.c -2 +11
+    M ./kernel/ifftw.h -4 +8
+    M ./rdft/vrank-geq1-rdft2.c -2 +8
+    M ./rdft/vrank-geq1.c -2 +7
+
+Sun Aug 25 10:18:25 EDT 2002  athena
+  * [project @ 2002-08-25 14:18:25 by athena]
+  Added thoughts
+
+    M ./TODO -1 +2
+
+Sun Aug 25 10:08:59 EDT 2002  athena
+  * [project @ 2002-08-25 14:08:59 by athena]
+  Implemented rdft2 via vector rdft + radix2 step
+
+    A ./rdft/rdft2-radix2.c
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c -2 +2
+    M ./rdft/rdft.h -2 +2
+    R ./rdft/rdft2-dft.c
+    M ./rdft/rdft2-radix2.c +325
+
+Sat Aug 24 17:43:54 EDT 2002  athena
+  * [project @ 2002-08-24 21:43:54 by athena]
+  Stylistic changes
+
+    M ./rdft/rdft2-dft.c -5 +5
+
+Sat Aug 24 11:19:30 EDT 2002  athena
+  * [project @ 2002-08-24 15:19:30 by athena]
+  Simplified mktwiddle interface
+
+    M ./dft/ct.c -7 +3
+    M ./dft/generic.c -5 +2
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/twiddle.c -1 +11
+    M ./rdft/generic.c -6 +3
+    M ./rdft/rdft2-dft.c -11 +5
+    M ./reodft/redft00e-r2hc.c -6 +2
+    M ./reodft/reodft010e-r2hc.c -5 +2
+    M ./reodft/reodft11e-r2hc.c -8 +3
+    M ./reodft/rodft00e-r2hc.c -5 +2
+
+Sat Aug 24 11:05:08 EDT 2002  athena
+  * [project @ 2002-08-24 15:05:08 by athena]
+  Unification of certain vector computations.  rdft2-dft is now a
+  vector transform.
+
+    M ./dft/ct-dif.c -2 +2
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct.c -16 +2
+    M ./dft/ct.h -4 +1
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/tensor.c -1 +17
+    M ./rdft/hc2hc-dif.c -2 +2
+    M ./rdft/hc2hc-dit.c -2 +2
+    M ./rdft/hc2hc.c -18 +2
+    M ./rdft/hc2hc.h -4 +1
+    M ./rdft/rdft2-dft.c -54 +66
+
+Fri Aug 23 20:21:25 EDT 2002  athena
+  * [project @ 2002-08-24 00:21:25 by athena]
+  Intel compiler seems to be still buggy
+
+    M ./configure.ac +1
+    M ./simd/sse.c -1 +3
+    M ./simd/sse2.c -1 +3
+
+Fri Aug 23 16:07:12 EDT 2002  athena
+  * [project @ 2002-08-23 20:07:12 by athena]
+  Streamlined twiddle protocol
+
+    M ./dft/ct-dif.c -2 +3
+    M ./dft/ct-dit.c -2 +3
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/ct.c -13 +6
+    M ./dft/ct.h -3 +2
+    M ./dft/generic.c -16 +6
+    M ./dft/indirect.c -3 +1
+    M ./kernel/ifftw.h -3 +3
+    M ./kernel/twiddle.c -7 +14
+    M ./rdft/generic.c -18 +8
+    M ./rdft/hc2hc.c -10 +6
+    M ./rdft/rdft2-dft.c -19 +10
+    M ./reodft/redft00e-r2hc.c -15 +6
+    M ./reodft/reodft010e-r2hc.c -19 +10
+    M ./reodft/reodft11e-r2hc.c -21 +9
+    M ./reodft/rodft00e-r2hc.c -15 +6
+
+Fri Aug 23 13:22:17 EDT 2002  athena
+  * [project @ 2002-08-23 17:22:17 by athena]
+  Implemented rdft2 via dft (forward only for now)
+
+    A ./rdft/rdft2-dft.c
+    M ./libbench/verify.c -1 +10
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c -1 +2
+    M ./rdft/rdft.h -1 +2
+    M ./rdft/rdft2-dft.c +213
+
+Thu Aug 22 11:29:29 EDT 2002  athena
+  * [project @ 2002-08-22 15:29:29 by athena]
+  More cleanup of verify
+
+    M ./kernel/verify-lib.c -21 +13
+    M ./libbench/verify.c -45 +12
+
+Thu Aug 22 11:16:03 EDT 2002  athena
+  * [project @ 2002-08-22 15:16:03 by athena]
+  Changed error criterion because old one was too strict
+
+    M ./kernel/verify-lib.c -40 +21
+
+Thu Aug 22 11:15:17 EDT 2002  athena
+  * [project @ 2002-08-22 15:15:17 by athena]
+  Disable shared
+
+    M ./bootstrap.sh -1 +1
+
+Thu Aug 22 09:19:12 EDT 2002  athena
+  * [project @ 2002-08-22 13:19:12 by athena]
+  Added thoughts
+
+    M ./TODO +4
+
+Thu Aug 22 09:17:28 EDT 2002  athena
+  * [project @ 2002-08-22 13:17:28 by athena]
+  Oops
+
+    M ./dft/generic.c -1 +2
+
+Thu Aug 22 09:11:34 EDT 2002  athena
+  * [project @ 2002-08-22 13:11:34 by athena]
+  Do not use inline.  Minor changes.
+
+    M ./dft/generic.c -1 +1
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/planner-score.c -2 +2
+    M ./kernel/tensor.c -3 +3
+
+Wed Aug 21 16:23:26 EDT 2002  stevenj
+  * [project @ 2002-08-21 20:23:26 by stevenj]
+  more commented flags
+
+    M ./tests/bench.c +2
+
+Tue Aug 20 19:44:43 EDT 2002  stevenj
+  * [project @ 2002-08-20 23:44:43 by stevenj]
+  added DCT-IV and DST-IV
+
+    A ./reodft/reodft11e-r2hc.c
+    M ./reodft/Makefile.am -1 +2
+    M ./reodft/conf.c -1 +2
+    M ./reodft/reodft11e-r2hc.c +297
+    M ./reodft/verify.c -1 +15
+    M ./tests/bench.c -1 +5
+
+Tue Aug 20 16:01:36 EDT 2002  athena
+  * [project @ 2002-08-20 20:01:36 by athena]
+  Slight improvement in twiddle scheme
+
+    M ./genfft/twiddle.ml -2 +9
+
+Tue Aug 20 15:31:54 EDT 2002  stevenj
+  * [project @ 2002-08-20 19:31:54 by stevenj]
+  name fix
+
+    M ./reodft/conf.c -2 +2
+    M ./reodft/reodft.h -6 +4
+    M ./reodft/reodft010e-r2hc.c -2 +2
+
+Tue Aug 20 15:16:48 EDT 2002  stevenj
+  * [project @ 2002-08-20 19:16:48 by stevenj]
+  removed extraneous variable
+
+    M ./reodft/reodft010e-r2hc.c -5 +1
+
+Tue Aug 20 11:46:29 EDT 2002  athena
+  * [project @ 2002-08-20 15:46:29 by athena]
+  Oops
+
+    M ./libbench/mp.c -33 +79
+    M ./libbench/verify.c -33 +25
+
+Tue Aug 20 08:37:45 EDT 2002  athena
+  * [project @ 2002-08-20 12:37:45 by athena]
+  Still playing around
+
+    M ./genfft/twiddle.ml -35 +62
+    M ./kernel/trig.c -1 +3
+
+Mon Aug 19 19:56:29 EDT 2002  athena
+  * [project @ 2002-08-19 23:56:29 by athena]
+  Playing around with addition chain
+
+    M ./TODO -6 +11
+    M ./genfft/algsimp.ml -3 +4
+    M ./genfft/expr.ml -1 +6
+    M ./genfft/expr.mli -1 +2
+    M ./genfft/twiddle.ml -46 +64
+    M ./support/addchain.c -8 +25
+
+Mon Aug 19 19:48:56 EDT 2002  stevenj
+  * [project @ 2002-08-19 23:48:56 by stevenj]
+  comments
+
+    M ./reodft/redft00e-r2hc.c -1 +4
+    M ./reodft/rodft00e-r2hc.c -1 +4
+
+Mon Aug 19 19:45:35 EDT 2002  stevenj
+  * [project @ 2002-08-19 23:45:35 by stevenj]
+  comment fixes
+
+    M ./reodft/reodft010e-r2hc.c -5 +2
+
+Mon Aug 19 19:40:18 EDT 2002  stevenj
+  * [project @ 2002-08-19 23:40:18 by stevenj]
+  added reodft stuff
+
+    A ./reodft/
+    A ./reodft/Makefile.am
+    A ./reodft/conf.c
+    A ./reodft/redft00e-r2hc.c
+    A ./reodft/reodft.h
+    A ./reodft/reodft010e-r2hc.c
+    A ./reodft/rodft00e-r2hc.c
+    A ./reodft/verify.c
+    M ./Makefile.am -2 +3
+    M ./configure.ac +2
+    M ./dft/dft.h -1 +5
+    M ./rdft/rdft.h -1 +5
+    M ./reodft/Makefile.am +6
+    M ./reodft/conf.c +37
+    M ./reodft/redft00e-r2hc.c +204
+    M ./reodft/reodft.h +44
+    M ./reodft/reodft010e-r2hc.c +400
+    M ./reodft/rodft00e-r2hc.c +199
+    M ./reodft/verify.c +521
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c -2 +11
+
+Sun Aug 18 19:44:14 EDT 2002  athena
+  * [project @ 2002-08-18 23:44:14 by athena]
+  Sync with nbenchfft
+
+    M ./libbench/Makefile.am +4
+    M ./libbench/verify.c -4 +3
+
+Sun Aug 18 16:02:37 EDT 2002  athena
+  * [project @ 2002-08-18 20:02:37 by athena]
+  Economy of thought
+
+    M ./genfft/complex.ml -23 +12
+    M ./genfft/complex.mli -3 +3
+    M ./genfft/twiddle.ml -5 +5
+
+Sat Aug 17 15:52:05 EDT 2002  stevenj
+  * [project @ 2002-08-17 19:52:05 by stevenj]
+  distribute addchain.c
+
+    M ./support/Makefile.am -1 +1
+
+Sat Aug 17 14:09:11 EDT 2002  athena
+  * [project @ 2002-08-17 18:09:11 by athena]
+  Nothing serious
+
+    M ./support/addchain.c -4 +7
+
+Sat Aug 17 10:47:59 EDT 2002  athena
+  * [project @ 2002-08-17 14:47:59 by athena]
+  New twiddle policy (disabled for now)
+
+    A ./support/addchain.c
+    M ./genfft/twiddle.ml -1 +98
+    M ./support/addchain.c +151
+
+Fri Aug 16 23:44:28 EDT 2002  stevenj
+  * [project @ 2002-08-17 03:44:28 by stevenj]
+  bug fix for hc2r (must use inverse dft)
+
+    M ./rdft/rank-geq2-rdft2.c -2 +2
+
+Fri Aug 16 20:27:10 EDT 2002  athena
+  * [project @ 2002-08-17 00:27:10 by athena]
+  New log3 twiddle policy
+
+    M ./dft/codelets/inplace/Makefile.am -2 +2
+    M ./dft/codelets/standard/Makefile.am -1 +1
+    M ./genfft/twiddle.ml -9 +88
+    M ./rdft/codelets/hc2r/Makefile.am +6
+    M ./rdft/codelets/r2hc/Makefile.am -1 +1
+
+Fri Aug 16 18:10:33 EDT 2002  athena
+  * [project @ 2002-08-16 22:10:33 by athena]
+  More verify cleanup
+
+    M ./dft/verify.c -20 +2
+    M ./kernel/verify-lib.c -1 +19
+    M ./kernel/verify.h +3
+    M ./rdft/verify.c -23 +5
+
+Fri Aug 16 16:31:19 EDT 2002  athena
+  * [project @ 2002-08-16 20:31:19 by athena]
+  Oops
+
+    M ./rdft/verify.c -3 +3
+
+Fri Aug 16 15:22:36 EDT 2002  athena
+  * [project @ 2002-08-16 19:22:36 by athena]
+  Economy of thought (and code)
+
+    A ./kernel/verify-lib.c
+    A ./kernel/verify.h
+    M ./dft/verify.c -305 +12
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/verify-lib.c +394
+    M ./kernel/verify.h +80
+    M ./rdft/verify.c -364 +23
+
+Fri Aug 16 14:05:45 EDT 2002  athena
+  * [project @ 2002-08-16 18:05:45 by athena]
+  Added comment
+
+    M ./TODO +2
+
+Fri Aug 16 12:57:43 EDT 2002  athena
+  * [project @ 2002-08-16 16:57:43 by athena]
+  Cleaner rounding algorithm
+
+    M ./libbench/mp.c -16 +27
+
+Fri Aug 16 11:27:43 EDT 2002  athena
+  * [project @ 2002-08-16 15:27:43 by athena]
+  Can get away with shorter length in bluestein (I think).
+
+    M ./libbench/mp.c -1 +1
+
+Fri Aug 16 11:08:09 EDT 2002  athena
+  * [project @ 2002-08-16 15:08:09 by athena]
+  Portability improvements
+
+    M ./libbench/mp.c -11 +4
+
+Fri Aug 16 08:06:31 EDT 2002  athena
+  * [project @ 2002-08-16 12:06:31 by athena]
+  Optionally average accuracy test over many rounds
+
+    M ./libbench/bench-main.c -2 +9
+    M ./libbench/bench.h -2 +2
+    M ./libbench/verify.c -29 +43
+
+Fri Aug 16 07:50:24 EDT 2002  athena
+  * [project @ 2002-08-16 11:50:24 by athena]
+  More accurate formula for trig tables
+
+    M ./dft/rader.c -3 +3
+    M ./rdft/rader-dht.c -3 +3
+    M ./rdft/rader-hc2hc.c -3 +3
+
+Fri Aug 16 06:42:02 EDT 2002  athena
+  * [project @ 2002-08-16 10:42:02 by athena]
+  Implemented accuracy test for all integers
+
+    M ./libbench/mp.c -6 +123
+    M ./libbench/verify.c -2 +1
+
+Thu Aug 15 18:54:44 EDT 2002  athena
+  * [project @ 2002-08-15 22:54:44 by athena]
+  inv, neg: make static
+
+    M ./libbench/mp.c -2 +2
+
+Thu Aug 15 17:25:37 EDT 2002  athena
+  * [project @ 2002-08-15 21:25:37 by athena]
+  Verify was not complete for real transforms
+
+    M ./libbench/verify.c -9 +34
+
+Thu Aug 15 16:30:03 EDT 2002  athena
+  * [project @ 2002-08-15 20:30:03 by athena]
+  Oops
+
+    M ./libbench/verify.c -3 +1
+
+Thu Aug 15 16:29:16 EDT 2002  athena
+  * [project @ 2002-08-15 20:29:16 by athena]
+  Fixed hb codelets
+
+    M ./genfft/gen_hc2hc.ml -3 +5
+    M ./libbench/verify.c -1 +4
+
+Thu Aug 15 14:10:45 EDT 2002  athena
+  * [project @ 2002-08-15 18:10:45 by athena]
+  Changed twiddle policy
+
+    M ./dft/codelets/inplace/Makefile.am -2 +2
+    M ./dft/codelets/standard/Makefile.am -1 +1
+    M ./rdft/codelets/r2hc/Makefile.am -1 +1
+
+Thu Aug 15 13:32:24 EDT 2002  stevenj
+  * [project @ 2002-08-15 17:32:24 by stevenj]
+  whoops
+
+    M ./rdft/direct2.c -3 +3
+
+Thu Aug 15 11:01:04 EDT 2002  athena
+  * [project @ 2002-08-15 15:01:04 by athena]
+  No point in libbench being a shared library
+
+    M ./libbench/Makefile.am -2 +2
+    M ./tests/Makefile.am -1 +1
+
+Thu Aug 15 09:48:37 EDT 2002  athena
+  * [project @ 2002-08-15 13:48:37 by athena]
+  Moved accuracy test to libbench
+
+    A ./libbench/mp.c
+    M ./libbench/Makefile.am -1 +1
+    M ./libbench/bench-main.c -4 +13
+    M ./libbench/bench.h -1 +3
+    M ./libbench/mp.c +439
+    M ./libbench/util.c +8
+    M ./libbench/verify.c -3 +66
+    M ./tests/Makefile.am -5 +1
+    R ./tests/accuracy.c
+    R ./tests/mp.c
+
+Wed Aug 14 19:48:23 EDT 2002  athena
+  * [project @ 2002-08-14 23:48:23 by athena]
+  Modified accuracy test
+
+    M ./tests/accuracy.c -33 +20
+
+Wed Aug 14 08:34:26 EDT 2002  athena
+  * [project @ 2002-08-14 12:34:26 by athena]
+  Fixes for long double
+
+    M ./tests/accuracy.c -2 +3
+    M ./tests/mp.c -1 +1
+
+Wed Aug 14 08:17:57 EDT 2002  athena
+  * [project @ 2002-08-14 12:17:57 by athena]
+  Normalize input
+
+    M ./tests/accuracy.c -3 +17
+
+Wed Aug 14 07:26:41 EDT 2002  athena
+  * [project @ 2002-08-14 11:26:41 by athena]
+  Oops
+
+    M ./tests/accuracy.c +1
+
+Wed Aug 14 07:25:34 EDT 2002  athena
+  * [project @ 2002-08-14 11:25:34 by athena]
+  Also compute relative error
+
+    M ./tests/accuracy.c -2 +13
+
+Wed Aug 14 07:08:20 EDT 2002  athena
+  * [project @ 2002-08-14 11:08:20 by athena]
+  Loop over N
+
+    M ./tests/accuracy.c -24 +32
+
+Wed Aug 14 06:54:50 EDT 2002  athena
+  * [project @ 2002-08-14 10:54:50 by athena]
+  simple-minded accuracy test
+
+    A ./tests/accuracy.c
+    A ./tests/mp.c
+    M ./tests/Makefile.am -1 +6
+    M ./tests/accuracy.c +48
+    M ./tests/mp.c +434
+
+Wed Aug 14 03:26:06 EDT 2002  stevenj
+  * [project @ 2002-08-14 07:26:06 by stevenj]
+  whoops
+
+    A ./rdft/rank-geq2-rdft2.c
+
+Tue Aug 13 11:42:41 EDT 2002  athena
+  * [project @ 2002-08-13 15:42:41 by athena]
+  fma() stuff is too nonportable, removed
+
+    M ./kernel/trig.c -45 +1
+
+Mon Aug 12 14:07:44 EDT 2002  stevenj
+  * [project @ 2002-08-12 18:07:44 by stevenj]
+  slight fix
+
+    M ./rdft/problem.c -2 +2
+
+Mon Aug 12 14:07:18 EDT 2002  stevenj
+  * [project @ 2002-08-12 18:07:18 by stevenj]
+  use table for rdft_kind_str
+
+    M ./rdft/problem.c -26 +12
+
+Mon Aug 12 13:43:08 EDT 2002  stevenj
+  * [project @ 2002-08-12 17:43:08 by stevenj]
+  slight fixes
+
+    M ./rdft/problem2.c -6 +11
+
+Mon Aug 12 13:31:37 EDT 2002  stevenj
+  * [project @ 2002-08-12 17:31:37 by stevenj]
+  multidimensional rdft2
+
+    M ./kernel/ifftw.h -2 +3
+    M ./kernel/planner.c -1 +2
+    M ./kernel/tensor.c -5 +6
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/buffered2.c -13 +18
+    M ./rdft/conf.c -1 +2
+    M ./rdft/direct2.c -9 +14
+    M ./rdft/nop2.c -2 +12
+    M ./rdft/problem2.c -36 +41
+    M ./rdft/rdft.h -7 +9
+    M ./rdft/vrank-geq1-rdft2.c -6 +23
+    M ./tests/bench.c -11 +36
+
+Sat Aug 10 19:33:23 EDT 2002  stevenj
+  * [project @ 2002-08-10 23:33:23 by stevenj]
+  use tensor_copy_inplace
+
+    M ./rdft/indirect.c -13 +5
+
+Sat Aug 10 19:32:03 EDT 2002  stevenj
+  * [project @ 2002-08-10 23:32:03 by stevenj]
+  bugfix, use tensor_copy_inplace
+
+    M ./dft/rank-geq2.c -4 +10
+
+Sat Aug 10 19:30:39 EDT 2002  stevenj
+  * [project @ 2002-08-10 23:30:39 by stevenj]
+  use tensor_copy_inplace
+
+    M ./dft/indirect.c -13 +5
+
+Sat Aug 10 19:28:07 EDT 2002  stevenj
+  * [project @ 2002-08-10 23:28:07 by stevenj]
+  added tensor_copy_inplace
+
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/tensor.c -1 +19
+
+Sat Aug 10 19:25:50 EDT 2002  stevenj
+  * [project @ 2002-08-10 23:25:50 by stevenj]
+  fixed trig-function table type
+
+    M ./kernel/twiddle.c -2 +2
+
+Sat Aug 10 14:41:04 EDT 2002  athena
+  * [project @ 2002-08-10 18:41:04 by athena]
+  Improved trig scheme
+
+    M ./kernel/trig.c -8 +53
+    M ./tests/trigtest.c -8 +74
+
+Fri Aug  9 21:05:01 EDT 2002  athena
+  * [project @ 2002-08-10 01:05:01 by athena]
+  Allow for testing using long double instead of pari
+
+    M ./tests/trigtest.c -15 +27
+
+Fri Aug  9 20:49:32 EDT 2002  athena
+  * [project @ 2002-08-10 00:49:32 by athena]
+  Yet another trig scheme.
+
+    M ./kernel/trig.c -25 +24
+    M ./tests/trigtest.c -25 +22
+
+Fri Aug  9 20:38:07 EDT 2002  athena
+  * [project @ 2002-08-10 00:38:07 by athena]
+  Yet another scheme
+
+    M ./kernel/trig.c -4 +12
+    M ./tests/trigtest.c +7
+
+Fri Aug  9 20:31:16 EDT 2002  athena
+  * [project @ 2002-08-10 00:31:16 by athena]
+  Careful with overflow
+
+    M ./kernel/ifftw.h -4 +4
+    M ./kernel/trig.c -23 +23
+    M ./tests/trigtest.c -24 +17
+
+Fri Aug  9 20:16:23 EDT 2002  athena
+  * [project @ 2002-08-10 00:16:23 by athena]
+  Avoid overflow
+
+    M ./kernel/ifftw.h -4 +4
+    M ./kernel/trig.c -20 +24
+    M ./tests/trigtest.c -21 +43
+
+Fri Aug  9 19:26:57 EDT 2002  athena
+  * [project @ 2002-08-09 23:26:57 by athena]
+  New(er) trig routines
+
+    M ./dft/rader.c -8 +6
+    M ./dft/verify.c -4 +3
+    M ./kernel/ifftw.h -4 +4
+    M ./kernel/trig.c -22 +17
+    M ./kernel/twiddle.c -9 +7
+    M ./rdft/rader-dht.c -4 +3
+    M ./rdft/rader-hc2hc.c -8 +6
+    M ./rdft/verify.c -4 +3
+    M ./tests/trigtest.c -20 +19
+
+Fri Aug  9 19:25:44 EDT 2002  athena
+  * [project @ 2002-08-09 23:25:44 by athena]
+  Oops
+
+    M ./tests/bench.c -1 +1
+
+Fri Aug  9 18:49:04 EDT 2002  athena
+  * [project @ 2002-08-09 22:49:04 by athena]
+  New file
+
+    A ./tests/trigtest.c
+
+Fri Aug  9 13:04:00 EDT 2002  athena
+  * [project @ 2002-08-09 17:04:00 by athena]
+  Commented about likely gcc bug
+
+    M ./TODO +4
+
+Fri Aug  9 13:01:49 EDT 2002  athena
+  * [project @ 2002-08-09 17:01:49 by athena]
+  Improved accuracy of twiddle factors
+
+    A ./kernel/trig.c
+    M ./dft/rader.c -8 +8
+    M ./dft/verify.c -5 +4
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -10 +7
+    M ./kernel/trig.c +79
+    M ./kernel/twiddle.c -8 +10
+    M ./rdft/rader-dht.c -4 +4
+    M ./rdft/rader-hc2hc.c -8 +8
+    M ./rdft/verify.c -5 +4
+    M ./tests/bench.c -1 +1
+
+Thu Aug  8 06:36:23 EDT 2002  athena
+  * [project @ 2002-08-08 10:36:23 by athena]
+  Wrong comment
+
+    M ./simd/simd-3dnow.h -1
+
+Wed Aug  7 17:14:09 EDT 2002  athena
+  * [project @ 2002-08-07 21:14:09 by athena]
+  Experimental 3dnow port using gcc, to compare it with Stefan's stuff.
+
+    A ./simd/3dnow.c
+    A ./simd/simd-3dnow.h
+    M ./configure.ac +6
+    M ./genfft/gen_notw_c.ml -3 +6
+    M ./genfft/gen_twiddle_c.ml -2 +4
+    M ./kernel/ifftw.h -2 +2
+    M ./simd/3dnow.c +66
+    M ./simd/Makefile.am -2 +2
+    M ./simd/simd-3dnow.h +164
+    M ./simd/simd-altivec.h +2
+    M ./simd/simd-sse.h +3
+    M ./simd/simd-sse2.h +3
+    M ./simd/simd.h +4
+
+Wed Aug  7 12:58:10 EDT 2002  athena
+  * [project @ 2002-08-07 16:58:10 by athena]
+  End of AREF experiment
+
+    M ./genfft/c.ml -2 +2
+    M ./kernel/ifftw.h -3 +1
+
+Wed Aug  7 07:47:19 EDT 2002  athena
+  * [project @ 2002-08-07 11:47:19 by athena]
+  Oops
+
+    M ./configure.ac -4 +1
+
+Wed Aug  7 07:46:38 EDT 2002  athena
+  * [project @ 2002-08-07 11:46:38 by athena]
+  Pathetic attempt to reduce size of configure script
+
+    M ./configure.ac -16 +8
+
+Tue Aug  6 20:38:11 EDT 2002  athena
+  * [project @ 2002-08-07 00:38:11 by athena]
+  Changed array syntax for experiments.
+
+    M ./genfft/c.ml -8 +10
+    M ./kernel/ifftw.h -1 +3
+
+Tue Aug  6 19:58:20 EDT 2002  athena
+  * [project @ 2002-08-06 23:58:20 by athena]
+  Fix warning
+
+    M ./simd/simd-sse2.h +2
+
+Tue Aug  6 13:35:28 EDT 2002  athena
+  * [project @ 2002-08-06 17:35:28 by athena]
+  Move nonportable stuff in one place.
+
+    M ./dft/problem.c -2 +2
+    M ./kernel/align.c -1 +8
+    M ./kernel/ifftw.h -1 +2
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+
+Tue Aug  6 10:32:53 EDT 2002  athena
+  * [project @ 2002-08-06 14:32:53 by athena]
+  Economy of thought: I didn't like having two algorithms for removing
+  solutions, both correct.  At least now we have the same algorithm
+  copied twice.
+
+    M ./kernel/planner.c -12 +22
+
+Tue Aug  6 09:12:21 EDT 2002  athena
+  * [project @ 2002-08-06 13:12:21 by athena]
+  Added things to do
+
+    M ./TODO -1 +1
+
+Mon Aug  5 19:54:31 EDT 2002  stevenj
+  * [project @ 2002-08-05 23:54:31 by stevenj]
+  improved interaction of planner with patience flags
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner.c -12 +34
+
+Mon Aug  5 14:17:58 EDT 2002  stevenj
+  * [project @ 2002-08-05 18:17:58 by stevenj]
+  set up for real-even/odd DFTs, where n is not the size of the data
+
+    M ./rdft/buffered.c -3 +3
+    M ./rdft/codelet.h -7 +25
+    M ./rdft/indirect.c -4 +8
+    M ./rdft/problem.c -13 +58
+    M ./rdft/rader-hc2hc.c -1 +1
+    M ./rdft/rdft.h -1 +3
+    M ./tests/bench.c -2 +3
+
+Sun Aug  4 23:57:51 EDT 2002  stevenj
+  * [project @ 2002-08-05 03:57:51 by stevenj]
+  DESTROY_INPUT flag
+
+    M ./dft/ct-dif.c -4 +5
+    M ./dft/ct-dit.c -3 +5
+    M ./dft/ct-ditbuf.c -3 +5
+    M ./dft/ct-ditf.c -3 +5
+    M ./dft/ct.c -2 +2
+    M ./dft/ct.h -2 +3
+    M ./kernel/ifftw.h -2 +3
+    M ./rdft/hc2hc-buf.c -3 +6
+    M ./rdft/hc2hc-dif.c -3 +5
+    M ./rdft/hc2hc-dit.c -3 +5
+    M ./rdft/hc2hc.c -2 +2
+    M ./rdft/hc2hc.h -2 +3
+    M ./rdft/r2hc-hc2r.c -13 +20
+    M ./tests/bench.c +2
+
+Sun Aug  4 22:50:19 EDT 2002  stevenj
+  * [project @ 2002-08-05 02:50:19 by stevenj]
+  CLASSIC -> IMPATIENT
+
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/vrank-geq1.c -2 +2
+    M ./kernel/ifftw.h -4 +4
+    M ./kernel/planner.c -3 +3
+    M ./rdft/dft-r2hc.c -2 +2
+    M ./rdft/vrank-geq1-rdft2.c -2 +2
+    M ./rdft/vrank-geq1.c -2 +2
+    M ./tests/bench.c -1 +1
+
+Sun Aug  4 19:05:43 EDT 2002  athena
+  * [project @ 2002-08-04 23:05:43 by athena]
+  Require make maintainer-clean to remove the generator, as opposed
+  to make clean.  In this way we can type make clean without regenerating
+  all codelets.
+
+    M ./genfft/Makefile.am -11 +11
+    M ./genfft-k7/Makefile.am -3 +3
+
+Sun Aug  4 17:34:04 EDT 2002  stevenj
+  * [project @ 2002-08-04 21:34:04 by stevenj]
+  ESTIMATE plans are not blessed
+
+    M ./kernel/planner.c -10 +13
+
+Sun Aug  4 17:24:37 EDT 2002  stevenj
+  * [project @ 2002-08-04 21:24:37 by stevenj]
+  use flags in wisdom
+
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/planner.c -9 +14
+
+Sun Aug  4 17:03:45 EDT 2002  stevenj
+  * [project @ 2002-08-04 21:03:45 by stevenj]
+  score now takes plnr, not flags, as arg
+
+    M ./dft/buffered.c -3 +3
+    M ./dft/ct-dif.c -4 +3
+    M ./dft/ct-dit.c -3 +3
+    M ./dft/ct-ditbuf.c -3 +3
+    M ./dft/ct-ditf.c -3 +3
+    M ./dft/direct.c -3 +3
+    M ./dft/generic.c -2 +2
+    M ./dft/indirect.c -3 +3
+    M ./dft/nop.c -3 +3
+    M ./dft/rader.c -4 +4
+    M ./dft/rank-geq2.c -3 +3
+    M ./dft/rank0.c -3 +3
+    M ./dft/vrank-geq1.c -4 +4
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -3 +3
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner-score.c -4 +3
+    M ./kernel/tensor.c -3 +3
+    M ./rdft/buffered.c -3 +3
+    M ./rdft/buffered2.c -3 +3
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/direct.c -3 +3
+    M ./rdft/direct2.c -3 +3
+    M ./rdft/generic.c -2 +2
+    M ./rdft/hc2hc-buf.c -3 +3
+    M ./rdft/hc2hc-dif.c -3 +3
+    M ./rdft/hc2hc-dit.c -3 +3
+    M ./rdft/indirect.c -3 +3
+    M ./rdft/nop.c -3 +3
+    M ./rdft/nop2.c -3 +3
+    M ./rdft/r2hc-hc2r.c -3 +3
+    M ./rdft/rader-dht.c -2 +2
+    M ./rdft/rader-hc2hc.c -2 +2
+    M ./rdft/rank0.c -3 +3
+    M ./rdft/vrank-geq1-rdft2.c -4 +4
+    M ./rdft/vrank-geq1.c -4 +4
+    M ./rdft/vrank2-transpose.c -3 +3
+    M ./rdft/vrank3-transpose.c -3 +3
+    M ./tests/bench.c -1 +1
+
+Sun Aug  4 16:37:46 EDT 2002  stevenj
+  * [project @ 2002-08-04 20:37:46 by stevenj]
+  align initial stack in alignment check, which should now pass for gcc 3.1.1
+
+    M ./acinclude.m4 -2 +3
+
+Sat Aug  3 20:04:57 EDT 2002  athena
+  * [project @ 2002-08-04 00:04:57 by athena]
+  Detect ultrasparc (sort of)
+
+    M ./acinclude.m4 +3
+
+Sat Aug  3 19:38:17 EDT 2002  stevenj
+  * [project @ 2002-08-03 23:38:17 by stevenj]
+  added solvtab_rdft_r2r placeholder
+
+    M ./rdft/codelet.h -1 +3
+
+Sat Aug  3 19:34:49 EDT 2002  athena
+  * [project @ 2002-08-03 23:34:49 by athena]
+  Damn solaris
+
+    M ./support/Makefile.codelets -2 +2
+
+Sat Aug  3 17:55:44 EDT 2002  stevenj
+  * [project @ 2002-08-03 21:55:44 by stevenj]
+  use E extended precision in solvers
+
+    M ./rdft/problem.c -7 +9
+
+Sat Aug  3 17:53:29 EDT 2002  stevenj
+  * [project @ 2002-08-03 21:53:29 by stevenj]
+  an alternative notation for D{C,S}T: DXTio, where i/o are {0,1}
+  according to whether the input/output are shifted, respectively.
+  Alternatively, io is the binary representation of the usual
+  DXT-{I,II,III,IV} nomenclature, minus 1.
+
+    M ./rdft/codelet.h -2 +2
+
+Sat Aug  3 17:49:11 EDT 2002  stevenj
+  * [project @ 2002-08-03 21:49:11 by stevenj]
+  use E extended precision in solvers
+
+    M ./dft/generic.c -5 +5
+    M ./dft/rader.c -2 +2
+    M ./rdft/generic.c -21 +21
+    M ./rdft/r2hc-hc2r.c -3 +3
+    M ./rdft/rader-dht.c -4 +4
+
+Sat Aug  3 15:39:49 EDT 2002  athena
+  * [project @ 2002-08-03 19:39:49 by athena]
+  More portability fixes, compiler bugs workarounds, etc.
+
+    M ./configure.ac -2 +4
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/planner.c -2 +2
+    M ./rdft/problem2.c -2 +4
+
+Sat Aug  3 15:09:56 EDT 2002  athena
+  * [project @ 2002-08-03 19:09:56 by athena]
+  More portability work
+
+    M ./configure.ac -2 +7
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/ifftw.h -9 +2
+
+Sat Aug  3 14:33:40 EDT 2002  athena
+  * [project @ 2002-08-03 18:33:40 by athena]
+  Improved portability, removed gnu make dependencies
+
+    M ./acinclude.m4 -1 +1
+    M ./configure.ac -4 +3
+    M ./kernel/cycle.h -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./support/Makefile.codelets -14 +6
+
+Sat Aug  3 13:48:53 EDT 2002  athena
+  * [project @ 2002-08-03 17:48:53 by athena]
+  Remember to thank XXX
+
+    M ./TODO +8
+
+Fri Aug  2 17:38:18 EDT 2002  athena
+  * [project @ 2002-08-02 21:38:18 by athena]
+  Multiplication on altivec requires FMA with -0.0 to be IEEE754 compliant.
+
+    M ./simd/simd-altivec.h -1 +1
+
+Fri Aug  2 15:26:37 EDT 2002  athena
+  * [project @ 2002-08-02 19:26:37 by athena]
+  Allow for extended precision in codelets
+
+    M ./genfft/c.ml -3 +4
+    M ./kernel/ifftw.h -7 +9
+
+Fri Aug  2 08:52:04 EDT 2002  athena
+  * [project @ 2002-08-02 12:52:04 by athena]
+  Shortened names
+
+    M ./dft/codelets/inplace/Makefile.am -5 +5
+
+Fri Aug  2 03:49:09 EDT 2002  stevenj
+  * [project @ 2002-08-02 07:49:09 by stevenj]
+  added infrastructure for future r2r transforms
+
+    M ./TODO -3 +1
+    M ./rdft/codelet.h -3 +33
+    M ./rdft/problem.c -1 +8
+
+Thu Aug  1 21:29:14 EDT 2002  athena
+  * [project @ 2002-08-02 01:29:05 by athena]
+  Version info
+
+    M ./Makefile.am +1
+    M ./configure.ac +2
+
+Thu Aug  1 21:06:22 EDT 2002  athena
+  * [project @ 2002-08-02 01:06:22 by athena]
+  Listened to one customer and added radix-12.  Added radix-15 for
+  consistency (whatever that is)
+
+    M ./dft/codelets/inplace/Makefile.am -1 +1
+    M ./dft/codelets/standard/Makefile.am -1 +1
+    M ./dft/k7/codelets/Makefile.am -3 +4
+    M ./dft/simd/codelets/Makefile.am -2 +2
+    M ./kernel/align.c -2 +8
+    M ./rdft/codelets/hc2r/Makefile.am -3 +3
+    M ./rdft/codelets/r2hc/Makefile.am -3 +3
+
+Thu Aug  1 19:50:53 EDT 2002  stevenj
+  * [project @ 2002-08-01 23:50:53 by stevenj]
+  whoops again, fixed the wrong line
+
+    M ./kernel/cycle.h -3 +3
+
+Thu Aug  1 19:50:16 EDT 2002  stevenj
+  * [project @ 2002-08-01 23:50:16 by stevenj]
+  whoops
+
+    M ./kernel/cycle.h -2 +2
+
+Thu Aug  1 16:01:15 EDT 2002  stevenj
+  * [project @ 2002-08-01 20:01:15 by stevenj]
+  use new AC_INIT and add VERSION to wisdom
+
+    M ./configure.ac -3 +4
+    M ./kernel/planner.c -4 +6
+
+Thu Aug  1 14:56:45 EDT 2002  stevenj
+  * [project @ 2002-08-01 18:56:45 by stevenj]
+  mygetR -> getR
+
+    M ./kernel/scan.c -3 +3
+
+Thu Aug  1 14:56:02 EDT 2002  stevenj
+  * [project @ 2002-08-01 18:56:02 by stevenj]
+  scanner cleanups: just return 0/1, simplify integer reads
+
+    M ./dft/problem.c -6 +4
+    M ./kernel/planner.c -14 +12
+    M ./kernel/scan.c -78 +42
+    M ./kernel/tensor.c -9 +8
+    M ./rdft/problem.c -7 +5
+    M ./rdft/problem2.c -7 +5
+    M ./tests/bench.c +7
+
+Thu Aug  1 08:04:01 EDT 2002  athena
+  * [project @ 2002-08-01 12:04:01 by athena]
+  Reverted back to casting pointer to ulong
+
+    M ./kernel/align.c -2 +2
+
+Thu Aug  1 08:03:46 EDT 2002  athena
+  * [project @ 2002-08-01 12:03:46 by athena]
+  Cast to unsigned long, not long
+
+    M ./kernel/ifftw.h -6 +6
+
+Thu Aug  1 03:14:50 EDT 2002  stevenj
+  * [project @ 2002-08-01 07:14:50 by stevenj]
+  additional comment
+
+    M ./kernel/scan.c -2 +4
+
+Thu Aug  1 03:12:37 EDT 2002  stevenj
+  * [project @ 2002-08-01 07:12:37 by stevenj]
+  added comment
+
+    M ./kernel/scan.c -1 +6
+
+Thu Aug  1 03:03:18 EDT 2002  stevenj
+  * [project @ 2002-08-01 07:03:18 by stevenj]
+  added wisdom import
+
+    A ./kernel/printers.c
+    A ./kernel/scan.c
+    A ./kernel/scanners.c
+    M ./dft/conf.c -1 +3
+    M ./dft/dft.h -1 +3
+    M ./dft/problem.c -2 +31
+    M ./dft/verify.c -2 +1
+    M ./kernel/Makefile.am -3 +4
+    M ./kernel/alloc.c -2 +1
+    M ./kernel/assert.c -2 +1
+    M ./kernel/debug.c -9 +2
+    M ./kernel/ifftw.h -6 +53
+    M ./kernel/planner.c -22 +106
+    M ./kernel/print.c -2 +1
+    M ./kernel/printers.c +80
+    M ./kernel/problem.c -1 +2
+    M ./kernel/scan.c +322
+    M ./kernel/scanners.c +61
+    M ./kernel/tensor.c -4 +30
+    M ./kernel/timer.c -2 +1
+    M ./rdft/conf.c -1 +4
+    M ./rdft/problem.c -2 +32
+    M ./rdft/problem2.c -2 +32
+    M ./rdft/rdft.h -1 +4
+    M ./rdft/verify.c -2 +1
+    M ./tests/bench.c -9 +31
+
+Wed Jul 31 23:12:05 EDT 2002  stevenj
+  * [project @ 2002-08-01 03:12:05 by stevenj]
+  whoops
+
+    M ./kernel/align.c -1 +2
+
+Wed Jul 31 22:06:46 EDT 2002  stevenj
+  * [project @ 2002-08-01 02:06:46 by stevenj]
+  use %u for alignment_of
+
+    M ./dft/problem.c -3 +3
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+
+Wed Jul 31 21:47:15 EDT 2002  stevenj
+  * [project @ 2002-08-01 01:47:15 by stevenj]
+  ptrdiff_t form
+
+    M ./kernel/align.c -2 +2
+
+Wed Jul 31 21:33:35 EDT 2002  athena
+  * [project @ 2002-08-01 01:33:35 by athena]
+  Cast to avoid warning from C++ compiler
+
+    M ./kernel/ifftw.h -2 +2
+
+Wed Jul 31 18:57:04 EDT 2002  athena
+  * [project @ 2002-07-31 22:57:04 by athena]
+  Make problem equality depend on alignments.
+
+    A ./kernel/align.c
+    M ./dft/problem.c -4 +10
+    M ./kernel/Makefile.am -5 +6
+    M ./kernel/align.c +35
+    M ./kernel/ifftw.h -1 +2
+    M ./rdft/problem.c -4 +9
+    M ./rdft/problem2.c -8 +13
+    M ./simd/simd.h +2
+
+Wed Jul 31 15:45:31 EDT 2002  athena
+  * [project @ 2002-07-31 19:45:31 by athena]
+  Shorter names
+
+    M ./dft/simd/codelets/Makefile.am -22 +18
+
+Wed Jul 31 14:38:00 EDT 2002  athena
+  * [project @ 2002-07-31 18:38:00 by athena]
+  Oops
+
+    M ./simd/simd-sse.h -1 +1
+
+Wed Jul 31 14:37:19 EDT 2002  athena
+  * [project @ 2002-07-31 18:37:19 by athena]
+  Fix warning
+
+    M ./simd/simd-sse.h +2
+
+Wed Jul 31 07:52:53 EDT 2002  athena
+  * [project @ 2002-07-31 11:52:53 by athena]
+  Removed silly abstraction barrier.  Also, cons() terminology was
+  no longer appropriate.
+
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/ifftw.h -16 +19
+    M ./kernel/planner-naive.c -2 +2
+    M ./kernel/planner-score.c -2 +2
+    M ./kernel/planner.c -26 +13
+
+Tue Jul 30 22:35:24 EDT 2002  stevenj
+  * [project @ 2002-07-31 02:35:24 by stevenj]
+  removed register_registrar and solvtab_exec_reverse hacks
+
+    M ./kernel/ifftw.h -4 +3
+    M ./kernel/planner.c -24 +15
+    M ./kernel/solvtab.c -19 +6
+
+Tue Jul 30 19:54:41 EDT 2002  stevenj
+  * [project @ 2002-07-30 23:54:41 by stevenj]
+  register_registrar doesn't search whole solver list (maybe we should change register_solver instead)
+
+    M ./kernel/planner.c -1 +3
+
+Tue Jul 30 19:36:37 EDT 2002  stevenj
+  * [project @ 2002-07-30 23:36:37 by stevenj]
+  credit
+
+    M ./kernel/cycle.h -2 +2
+
+Tue Jul 30 19:34:16 EDT 2002  stevenj
+  * [project @ 2002-07-30 23:34:16 by stevenj]
+  added HP/UX ia64 support, courtesy of Teresa L. Johnson
+
+    M ./kernel/cycle.h -1 +23
+
+Tue Jul 30 13:28:33 EDT 2002  athena
+  * [project @ 2002-07-30 17:28:33 by athena]
+  Fixed alignment checks
+
+    M ./dft/simd/n1b.c -1 +1
+    M ./dft/simd/n1f.c -1 +1
+    M ./dft/simd/t1b.c -1 +1
+    M ./dft/simd/t1f.c -1 +1
+    M ./kernel/alloc.c -1 +2
+
+Tue Jul 30 01:20:11 EDT 2002  stevenj
+  * [project @ 2002-07-30 05:20:11 by stevenj]
+  ugh, wisdom id fixes in exprt_conf
+
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/planner.c -2 +4
+    M ./kernel/solvtab.c -5 +18
+
+Tue Jul 30 00:41:15 EDT 2002  stevenj
+  * [project @ 2002-07-30 04:41:15 by stevenj]
+  exprt_registrars -> exprt_conf, added missing SOLVTAB_END
+
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -5 +6
+    M ./tests/bench.c -1 +1
+
+Tue Jul 30 00:36:26 EDT 2002  stevenj
+  * [project @ 2002-07-30 04:36:26 by stevenj]
+  exprt_registrars should output self-contained configuration
+
+    M ./kernel/planner.c -9 +20
+
+Mon Jul 29 23:52:07 EDT 2002  stevenj
+  * [project @ 2002-07-30 03:52:07 by stevenj]
+  added exprt_registrars
+
+    M ./dft/conf.c -12 +12
+    M ./kernel/ifftw.h -2 +7
+    M ./kernel/planner.c -4 +91
+    M ./kernel/solvtab.c -3 +5
+    M ./rdft/conf.c -18 +18
+    M ./support/Makefile.codelets -3 +3
+    M ./tests/bench.c +2
+
+Mon Jul 29 23:42:27 EDT 2002  stevenj
+  * [project @ 2002-07-30 03:42:27 by stevenj]
+  whoops
+
+    M ./kernel/print.c -2 +3
+
+Mon Jul 29 21:24:51 EDT 2002  athena
+  * [project @ 2002-07-30 01:24:51 by athena]
+  More stringent requirements on strides for SIMD codelets
+
+    M ./dft/simd/n1b.c -4 +4
+    M ./dft/simd/n1b.h +3
+    M ./dft/simd/n1f.c -4 +4
+    M ./dft/simd/n1f.h +3
+    M ./dft/simd/t1b.c -2 +2
+    M ./dft/simd/t1b.h -1 +6
+    M ./dft/simd/t1f.c -2 +2
+    M ./dft/simd/t1f.h +5
+    M ./simd/simd-altivec.h +2
+    M ./simd/simd-sse.h +2
+    M ./simd/simd-sse2.h +2
+    M ./simd/simd.h -2 +3
+
+Mon Jul 29 21:05:49 EDT 2002  stevenj
+  * [project @ 2002-07-30 01:05:49 by stevenj]
+  remove warning
+
+    M ./rdft/buffered2.c -2 +3
+
+Mon Jul 29 20:51:19 EDT 2002  stevenj
+  * [project @ 2002-07-30 00:51:01 by stevenj]
+  use %td for ptrdiff_t and %T for tensors
+
+    M ./dft/problem.c -2 +2
+    M ./kernel/print.c -3 +13
+    M ./kernel/traverse.c -2 +7
+    M ./rdft/problem.c -2 +2
+    M ./rdft/problem2.c -2 +2
+
+Mon Jul 29 16:17:11 EDT 2002  athena
+  * [project @ 2002-07-29 20:17:11 by athena]
+  Fix for SIMD
+
+    M ./dft/buffered.c -1 +5
+
+Mon Jul 29 15:40:53 EDT 2002  athena
+  * [project @ 2002-07-29 19:40:53 by athena]
+  Missing lfftw_mkstride and lfftw_stride_destroy
+
+    M ./kernel/ifftw.h -1 +3
+
+Mon Jul 29 14:34:46 EDT 2002  athena
+  * [project @ 2002-07-29 18:34:46 by athena]
+  Implement LDA/STA
+
+    M ./simd/simd-altivec.h +10
+
+Mon Jul 29 14:19:21 EDT 2002  athena
+  * [project @ 2002-07-29 18:19:21 by athena]
+  More SIMD work
+
+    M ./dft/simd/n1b.c -4 +4
+    M ./dft/simd/n1f.c -4 +4
+    M ./dft/simd/t1b.c -2 +2
+    M ./dft/simd/t1f.c -2 +2
+    M ./simd/simd-altivec.h -3 +2
+    M ./simd/simd-sse.h -3 +2
+    M ./simd/simd-sse2.h -2 +1
+    M ./simd/simd.h +3
+
+Mon Jul 29 13:16:12 EDT 2002  athena
+  * [project @ 2002-07-29 17:16:12 by athena]
+  Cleanup
+
+    M ./simd/simd-altivec.h -9 +9
+    M ./simd/simd-sse.h -16 +28
+    M ./simd/simd-sse2.h -13 +25
+
+Mon Jul 29 13:02:38 EDT 2002  stevenj
+  * [project @ 2002-07-29 17:02:38 by stevenj]
+  update
+
+    M ./ChangeLog +461
+
+Mon Jul 29 12:45:33 EDT 2002  athena
+  * [project @ 2002-07-29 16:45:33 by athena]
+  Also check strides in SIMD codelets
+
+    M ./dft/simd/n1b.c +4
+    M ./dft/simd/n1f.c +4
+    M ./dft/simd/t1b.c +2
+    M ./dft/simd/t1f.c +2
+    M ./simd/simd-altivec.h -1 +2
+    M ./simd/simd-sse.h +2
+    M ./simd/simd-sse2.h +2
+
+Mon Jul 29 11:26:08 EDT 2002  athena
+  * [project @ 2002-07-29 15:26:08 by athena]
+  Minor changes, mostly for consistency with the big-endian processor
+
+    M ./simd/simd-altivec.h -18 +18
+
+Mon Jul 29 00:50:06 EDT 2002  stevenj
+  * [project @ 2002-07-29 04:50:06 by stevenj]
+  added comment
+
+    M ./rdft/rader-dht.c +1
+
+Sun Jul 28 21:19:35 EDT 2002  stevenj
+  * [project @ 2002-07-29 01:19:35 by stevenj]
+  added code for icc's _mm_malloc (memalign replacement)
+
+    M ./configure.ac -1 +1
+    M ./kernel/alloc.c -4 +11
+
+Sun Jul 28 17:33:07 EDT 2002  stevenj
+  * [project @ 2002-07-28 21:33:07 by stevenj]
+  slight fixes
+
+    M ./rdft/problem2.c -6 +11
+    M ./rdft/verify.c -4 +7
+    M ./tests/bench.c -1 +2
+
+Sun Jul 28 16:28:43 EDT 2002  stevenj
+  * [project @ 2002-07-28 20:28:43 by stevenj]
+  whoops
+
+    M ./rdft/problem2.c -3 +3
+
+Sun Jul 28 16:13:19 EDT 2002  athena
+  * [project @ 2002-07-28 20:13:19 by athena]
+  Use vec_xor to change sign
+
+    M ./simd/altivec.c -2 +4
+    M ./simd/simd-altivec.h -3 +10
+
+Sun Jul 28 16:10:59 EDT 2002  stevenj
+  * [project @ 2002-07-28 20:10:59 by stevenj]
+  added rdft2
+
+    A ./rdft/buffered2.c
+    A ./rdft/direct2.c
+    A ./rdft/nop2.c
+    A ./rdft/plan2.c
+    A ./rdft/problem2.c
+    A ./rdft/solve2.c
+    A ./rdft/vrank-geq1-rdft2.c
+    M ./rdft/Makefile.am -2 +4
+    M ./rdft/buffered2.c +451
+    M ./rdft/conf.c -1 +7
+    M ./rdft/direct2.c +221
+    M ./rdft/khc2r.c -3 +3
+    M ./rdft/kr2hc.c -3 +3
+    M ./rdft/nop2.c +88
+    M ./rdft/plan2.c +33
+    M ./rdft/problem2.c +175
+    M ./rdft/rdft.h -1 +53
+    M ./rdft/solve2.c +31
+    M ./rdft/verify.c -17 +195
+    M ./rdft/vrank-geq1-rdft2.c +259
+    M ./tests/bench.c -6 +35
+
+Sun Jul 28 15:45:54 EDT 2002  athena
+  * [project @ 2002-07-28 19:45:54 by athena]
+  Optimized
+
+    M ./simd/simd-altivec.h -17 +10
+
+Sun Jul 28 15:11:14 EDT 2002  athena
+  * [project @ 2002-07-28 19:11:14 by athena]
+  Changed ALIGNMENT
+
+    M ./simd/simd-altivec.h -1 +1
+
+Sun Jul 28 15:09:40 EDT 2002  athena
+  * [project @ 2002-07-28 19:09:40 by athena]
+  alignment := 8
+
+    M ./simd/simd-sse.h -1 +1
+
+Sun Jul 28 14:57:22 EDT 2002  athena
+  * [project @ 2002-07-28 18:57:22 by athena]
+  Avoid warning
+
+    M ./simd/simd-altivec.h -1 +1
+
+Sun Jul 28 14:53:03 EDT 2002  athena
+  * [project @ 2002-07-28 18:53:03 by athena]
+  Oops
+
+    M ./simd/simd-sse2.h -2 +2
+
+Sun Jul 28 14:50:09 EDT 2002  athena
+  * [project @ 2002-07-28 18:50:09 by athena]
+  New altivec experiment
+
+    M ./genfft/annotate.ml -5 +5
+    M ./genfft/genutil.ml -7 +7
+    M ./genfft/simd.ml -3 +6
+    M ./genfft/variable.ml -38 +16
+    M ./genfft/variable.mli -8 +4
+    M ./simd/simd-altivec.h -15 +15
+    M ./simd/simd-sse.h -3 +5
+    M ./simd/simd-sse2.h -2 +2
+
+Sun Jul 28 13:48:20 EDT 2002  athena
+  * [project @ 2002-07-28 17:48:20 by athena]
+  Nothing
+
+    M ./simd/simd-altivec.h -6 +6
+
+Sun Jul 28 13:47:50 EDT 2002  athena
+  * [project @ 2002-07-28 17:47:50 by athena]
+  Oops
+
+    M ./simd/simd-altivec.h -26 +19
+
+Sun Jul 28 13:44:28 EDT 2002  athena
+  * [project @ 2002-07-28 17:44:28 by athena]
+  Nothing
+
+    M ./simd/simd-altivec.h -19 +26
+
+Sun Jul 28 10:38:10 EDT 2002  athena
+  * [project @ 2002-07-28 14:38:10 by athena]
+  Constants are now in separate file.
+
+    A ./simd/altivec.c
+    M ./simd/Makefile.am -1 +1
+    M ./simd/altivec.c +38
+    M ./simd/simd-altivec.h -20 +13
+
+Sun Jul 28 07:58:37 EDT 2002  athena
+  * [project @ 2002-07-28 11:58:37 by athena]
+  More precise comment
+
+    M ./simd/simd-altivec.h -1 +1
+
+Sun Jul 28 07:56:40 EDT 2002  athena
+  * [project @ 2002-07-28 11:56:40 by athena]
+  gcc-3.1 bug workaround
+
+    M ./simd/simd-altivec.h -42 +24
+
+Sun Jul 28 01:39:54 EDT 2002  stevenj
+  * [project @ 2002-07-28 05:39:54 by stevenj]
+  slight optimization, and exported zerotens functions
+
+    M ./dft/buffered.c -4 +4
+    M ./dft/dft.h -1 +2
+    M ./dft/problem.c -4 +4
+    M ./rdft/buffered.c -4 +4
+    M ./rdft/problem.c -4 +4
+    M ./rdft/rdft.h -1 +2
+
+Sun Jul 28 00:54:59 EDT 2002  stevenj
+  * [project @ 2002-07-28 04:54:59 by stevenj]
+  should be a plan_dft, not a plan_rdft
+
+    M ./rdft/dft-r2hc.c -2 +2
+
+Sat Jul 27 21:36:46 EDT 2002  athena
+  * [project @ 2002-07-28 01:36:46 by athena]
+  Optimizations.  Make it work with vanilla non-Apple gcc.
+
+    M ./simd/simd-altivec.h -21 +82
+
+Sat Jul 27 19:20:09 EDT 2002  stevenj
+  * [project @ 2002-07-27 23:20:09 by stevenj]
+  whoops
+
+    M ./rdft/generic.c -5
+
+Sat Jul 27 18:54:01 EDT 2002  stevenj
+  * [project @ 2002-07-27 22:54:01 by stevenj]
+  added hc2r (dif)
+
+    M ./rdft/generic.c -13 +122
+
+Sat Jul 27 18:31:43 EDT 2002  stevenj
+  * [project @ 2002-07-27 22:31:43 by stevenj]
+  add hc2r (dif) case
+
+    M ./rdft/rader-hc2hc.c -2 +158
+
+Sat Jul 27 15:09:40 EDT 2002  athena
+  * [project @ 2002-07-27 19:09:40 by athena]
+  Altivec port
+
+    M ./simd/simd-altivec.h -38 +131
+    M ./support/Makefile.codelets -3 +3
+
+Sat Jul 27 15:06:21 EDT 2002  athena
+  * [project @ 2002-07-27 19:06:21 by athena]
+  Fixed signed/unsigned bug.
+
+    M ./kernel/twiddle.c -2 +3
+
+Thu Jul 25 20:11:26 EDT 2002  athena
+  * [project @ 2002-07-26 00:11:26 by athena]
+  Make rank0 unapplicable to in-place problems.
+
+    M ./dft/rank0.c -1 +2
+    M ./rdft/rank0.c -1 +2
+
+Thu Jul 25 17:10:52 EDT 2002  stevenj
+  * [project @ 2002-07-25 21:10:52 by stevenj]
+  only works for r odd
+
+    M ./rdft/generic.c -1 +1
+
+Thu Jul 25 15:30:06 EDT 2002  athena
+  * [project @ 2002-07-25 19:30:06 by athena]
+  Reinserted much better timing-avoidance heuristic
+
+    M ./kernel/planner-score.c -2 +7
+
+Thu Jul 25 15:21:13 EDT 2002  athena
+  * [project @ 2002-07-25 19:21:13 by athena]
+  Score is now a property of the plan, not of the solver.
+  Revised representation of closures.
+
+    M ./dft/buffered.c -5 +1
+    M ./kernel/ifftw.h -2 +7
+    M ./kernel/plan.c -4 +6
+    M ./kernel/planner-score.c -17 +30
+    M ./kernel/traverse.c -17 +16
+    M ./rdft/buffered.c -5 +1
+    M ./tests/bench.c -7 +14
+
+Thu Jul 25 06:36:51 EDT 2002  athena
+  * [project @ 2002-07-25 10:36:51 by athena]
+  Cosmetic changes.  Added hc2r_128.c
+
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./rdft/codelets/hc2r/Makefile.am -1 +1
+
+Thu Jul 25 01:37:53 EDT 2002  stevenj
+  * [project @ 2002-07-25 05:37:53 by stevenj]
+  added hc2r
+
+    M ./rdft/rader-dht.c -16 +55
+
+Thu Jul 25 00:51:45 EDT 2002  stevenj
+  * [project @ 2002-07-25 04:51:45 by stevenj]
+  added hc2hc-difbuf
+
+    A ./rdft/hc2hc-buf.c
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/hc2hc-buf.c +248
+    R ./rdft/hc2hc-ditbuf.c
+    M ./rdft/khc2hc-dif.c -3 +1
+
+Thu Jul 25 00:25:06 EDT 2002  stevenj
+  * [project @ 2002-07-25 04:25:06 by stevenj]
+  added rdft-dif
+
+    A ./rdft/hc2hc-dif.c
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/hc2hc-dif.c +128
+    M ./rdft/hc2hc.c -3 +5
+    M ./rdft/khc2hc-dif.c -4 +3
+    M ./rdft/rdft.h -1 +3
+
+Thu Jul 25 00:22:36 EDT 2002  stevenj
+  * [project @ 2002-07-25 04:22:36 by stevenj]
+  whoops, hc2r must be conjugated to have right sign
+
+    M ./rdft/verify.c -3 +3
+
+Wed Jul 24 23:27:45 EDT 2002  stevenj
+  * [project @ 2002-07-25 03:27:45 by stevenj]
+  slight change
+
+    M ./dft/ct-dif.c -4 +6
+
+Wed Jul 24 23:24:24 EDT 2002  stevenj
+  * [project @ 2002-07-25 03:24:24 by stevenj]
+  whoops
+
+    M ./rdft/verify.c -3 +2
+
+Wed Jul 24 22:46:39 EDT 2002  stevenj
+  * [project @ 2002-07-25 02:46:39 by stevenj]
+  support hc2r codelets
+
+    A ./rdft/direct.c
+    M ./rdft/Makefile.am -5 +5
+    M ./rdft/codelet.h -1 +3
+    R ./rdft/direct-r2hc.c
+    M ./rdft/direct.c +227
+    M ./rdft/khc2r.c -5 +2
+    M ./rdft/rdft.h -2 +2
+
+Wed Jul 24 22:01:53 EDT 2002  stevenj
+  * [project @ 2002-07-25 02:01:53 by stevenj]
+  use vector plan for r/i instead of two separate plans
+
+    M ./rdft/dft-r2hc.c -36 +20
+
+Wed Jul 24 20:36:34 EDT 2002  stevenj
+  * [project @ 2002-07-25 00:36:34 by stevenj]
+  hack to allow rader/generic to work in-place for small prime sizes, instead of always using buffered
+
+    M ./dft/buffered.c -1 +5
+    M ./dft/rader.c -2 +2
+    M ./kernel/ifftw.h -1 +3
+    M ./rdft/buffered.c -1 +5
+    M ./rdft/rader-dht.c -1 +1
+    M ./rdft/rader-hc2hc.c -1 +1
+
+Wed Jul 24 18:04:41 EDT 2002  stevenj
+  * [project @ 2002-07-24 22:04:41 by stevenj]
+  added rdft-generic
+
+    A ./rdft/generic.c
+    M ./rdft/Makefile.am -4 +4
+    M ./rdft/conf.c -2 +2
+    M ./rdft/generic.c +281
+
+Wed Jul 24 17:27:34 EDT 2002  stevenj
+  * [project @ 2002-07-24 21:27:34 by stevenj]
+  fixed add count
+
+    M ./dft/generic.c -1 +1
+
+Wed Jul 24 14:52:26 EDT 2002  stevenj
+  * [project @ 2002-07-24 18:52:26 by stevenj]
+  again
+
+    M ./rdft/rader-hc2hc.c -2 +2
+
+Wed Jul 24 14:51:58 EDT 2002  stevenj
+  * [project @ 2002-07-24 18:51:58 by stevenj]
+  slight fix
+
+    M ./rdft/rader-hc2hc.c -4 +5
+
+Wed Jul 24 14:51:07 EDT 2002  stevenj
+  * [project @ 2002-07-24 18:51:07 by stevenj]
+  fixed comment
+
+    M ./rdft/rader-hc2hc.c -3 +3
+
+Wed Jul 24 14:41:24 EDT 2002  stevenj
+  * [project @ 2002-07-24 18:41:24 by stevenj]
+  whoops
+
+    M ./tests/bench.c -2
+
+Wed Jul 24 14:38:15 EDT 2002  stevenj
+  * [project @ 2002-07-24 18:38:15 by stevenj]
+  added rader-hc2hc
+
+    A ./rdft/rader-hc2hc.c
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/conf.c -2 +2
+    M ./rdft/rader-hc2hc.c +482
+    M ./rdft/rdft.h -2 +2
+    M ./tests/bench.c +2
+
+Wed Jul 24 00:07:59 EDT 2002  stevenj
+  * [project @ 2002-07-24 04:07:59 by stevenj]
+  whoops, initialize W
+
+    M ./dft/rader.c +1
+
+Tue Jul 23 23:03:09 EDT 2002  stevenj
+  * [project @ 2002-07-24 03:03:09 by stevenj]
+  strides should not be unsigned
+
+    M ./rdft/rader-dht.c -2 +4
+
+Tue Jul 23 23:02:08 EDT 2002  stevenj
+  * [project @ 2002-07-24 03:02:08 by stevenj]
+  more stride sign fixes
+
+    M ./dft/rader.c -3 +5
+
+Tue Jul 23 23:01:04 EDT 2002  stevenj
+  * [project @ 2002-07-24 03:01:04 by stevenj]
+  strides should not be unsigned!
+
+    M ./dft/rader.c -1 +2
+
+Tue Jul 23 14:55:25 EDT 2002  stevenj
+  * [project @ 2002-07-23 18:55:25 by stevenj]
+  added comment
+
+    M ./rdft/dft-r2hc.c -1 +4
+
+Tue Jul 23 14:52:04 EDT 2002  stevenj
+  * [project @ 2002-07-23 18:52:04 by stevenj]
+  another fix to op count
+
+    M ./rdft/r2hc-hc2r.c -1 +4
+
+Tue Jul 23 14:51:01 EDT 2002  stevenj
+  * [project @ 2002-07-23 18:51:01 by stevenj]
+  whoops
+
+    M ./rdft/r2hc-hc2r.c -2 +2
+
+Tue Jul 23 14:49:43 EDT 2002  stevenj
+  * [project @ 2002-07-23 18:49:43 by stevenj]
+  slight fix to op counts
+
+    M ./rdft/dft-r2hc.c -3 +3
+    M ./rdft/r2hc-hc2r.c -3 +3
+
+Tue Jul 23 14:09:19 EDT 2002  stevenj
+  * [project @ 2002-07-23 18:09:18 by stevenj]
+  added dft-r2hc
+
+    A ./rdft/dft-r2hc.c
+    M ./rdft/Makefile.am -5 +6
+    M ./rdft/conf.c -2 +3
+    M ./rdft/dft-r2hc.c +187
+    M ./rdft/rdft.h -1 +2
+
+Tue Jul 23 02:50:12 EDT 2002  stevenj
+  * [project @ 2002-07-23 06:50:12 by stevenj]
+  better comment and var. name
+
+    M ./rdft/rader-dht.c -7 +7
+
+Tue Jul 23 02:39:11 EDT 2002  stevenj
+  * [project @ 2002-07-23 06:39:11 by stevenj]
+  fixed tests for hc2r, and added r2hc-hc2r
+
+    A ./rdft/r2hc-hc2r.c
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/conf.c -1 +2
+    M ./rdft/r2hc-hc2r.c +182
+    M ./rdft/rdft.h -1 +2
+    M ./rdft/verify.c -1 +46
+    M ./tests/bench.c -1 +6
+
+Tue Jul 23 00:45:23 EDT 2002  stevenj
+  * [project @ 2002-07-23 04:45:23 by stevenj]
+  added rader-dht
+
+    A ./rdft/rader-dht.c
+    M ./rdft/Makefile.am -2 +3
+    M ./rdft/conf.c -1 +2
+    M ./rdft/rader-dht.c +433
+    M ./rdft/rdft.h -1 +2
+
+Mon Jul 22 21:05:12 EDT 2002  athena
+  * [project @ 2002-07-23 01:05:12 by athena]
+  Added r2hc_128, what the hell.
+
+    M ./rdft/codelets/r2hc/Makefile.am -1 +1
+
+Mon Jul 22 20:48:59 EDT 2002  athena
+  * [project @ 2002-07-23 00:48:59 by athena]
+  Added codelets that compute twiddle factors
+
+    M ./rdft/codelets/r2hc/Makefile.am -1 +7
+
+Mon Jul 22 19:57:16 EDT 2002  stevenj
+  * [project @ 2002-07-22 23:57:16 by stevenj]
+  added rdft-buffered
+
+    A ./rdft/buffered.c
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/buffered.c +330
+    M ./rdft/conf.c -2 +2
+
+Mon Jul 22 19:43:39 EDT 2002  stevenj
+  * [project @ 2002-07-22 23:43:39 by stevenj]
+  added hc2hc-ditbuf
+
+    A ./rdft/hc2hc-ditbuf.c
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/hc2hc-ditbuf.c +189
+    M ./rdft/khc2hc-dit.c -3 +1
+
+Mon Jul 22 14:29:04 EDT 2002  stevenj
+  * [project @ 2002-07-22 18:29:04 by stevenj]
+  use STACK_MALLOC (alloca), since generic radix is always small
+
+    M ./dft/generic.c -3 +3
+
+Mon Jul 22 14:22:43 EDT 2002  stevenj
+  * [project @ 2002-07-22 18:22:43 by stevenj]
+  small cleanup
+
+    M ./rdft/hc2hc-dit.c -2 +2
+
+Mon Jul 22 07:42:13 EDT 2002  athena
+  * [project @ 2002-07-22 11:42:13 by athena]
+  What the hell was I thinking?
+
+    M ./rdft/problem.c -18 +8
+
+Mon Jul 22 07:37:12 EDT 2002  athena
+  * [project @ 2002-07-22 11:37:12 by athena]
+  Reduced code size by using table instead of switch statement.
+
+    M ./rdft/problem.c -12 +16
+
+Mon Jul 22 07:27:06 EDT 2002  athena
+  * [project @ 2002-07-22 11:27:06 by athena]
+  Changed hash function to avoid collisions with DFT.
+
+    M ./rdft/problem.c -4 +4
+
+Mon Jul 22 01:37:06 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:37:06 by stevenj]
+  added missing file, whoops
+
+    A ./rdft/hc2hc-dit.c
+
+Mon Jul 22 01:24:17 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:24:17 by stevenj]
+  whoops, generate enough twiddles for odd m
+
+    M ./rdft/hc2hc.c -2 +2
+
+Mon Jul 22 01:10:21 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:10:21 by stevenj]
+  don't try to verify R2HCII or HC2RIII plans
+
+    M ./rdft/verify.c -5 +7
+
+Mon Jul 22 01:05:00 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:05:00 by stevenj]
+  recursive case now works, I think
+
+    M ./rdft/hc2hc.c -9 +4
+
+Mon Jul 22 01:04:40 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:04:40 by stevenj]
+  add extra impulse test for debugging
+
+    M ./rdft/verify.c -1 +4
+
+Mon Jul 22 01:02:38 EDT 2002  stevenj
+  * [project @ 2002-07-22 05:02:38 by stevenj]
+  whoops, multiply ios offset by stride (and rename to ioffset)
+
+    M ./rdft/direct-r2hc.c -4 +5
+
+Mon Jul 22 00:22:02 EDT 2002  stevenj
+  * [project @ 2002-07-22 04:22:02 by stevenj]
+  whoops
+
+    M ./rdft/verify.c -2 +2
+
+Sun Jul 21 23:58:14 EDT 2002  stevenj
+  * [project @ 2002-07-22 03:58:14 by stevenj]
+  whoops
+
+    M ./rdft/verify.c -2 +2
+
+Sun Jul 21 23:43:03 EDT 2002  stevenj
+  * [project @ 2002-07-22 03:43:03 by stevenj]
+  added hc2hc-dit
+
+    M ./genfft/gen_hc2hc.ml -3 +3
+    M ./rdft/Makefile.am -2 +2
+    M ./rdft/hc2hc.c -10 +21
+    M ./rdft/khc2hc-dit.c -4 +3
+
+Sun Jul 21 23:15:12 EDT 2002  stevenj
+  * [project @ 2002-07-22 03:15:12 by stevenj]
+  twiddles can be shared with smaller m's
+
+    M ./kernel/twiddle.c -4 +4
+
+Sun Jul 21 22:34:28 EDT 2002  stevenj
+  * [project @ 2002-07-22 02:34:28 by stevenj]
+  preparing for recursive rdft...
+
+    A ./rdft/hc2hc.c
+    A ./rdft/hc2hc.h
+    M ./rdft/Makefile.am -3 +3
+    M ./rdft/codelet.h -1 +2
+    M ./rdft/codelets/hfb.c -2 +2
+    M ./rdft/hc2hc.c +249
+    M ./rdft/hc2hc.h +69
+
+Sun Jul 21 19:31:22 EDT 2002  stevenj
+  * [project @ 2002-07-21 23:31:22 by stevenj]
+  slight fix, to match libbench/verify.c
+
+    M ./rdft/verify.c -3 +3
+
+Sun Jul 21 18:43:12 EDT 2002  stevenj
+  * [project @ 2002-07-21 22:43:12 by stevenj]
+  r2hcII has imag parts offset by n-1, not n.  We can also allocate fewer strides.
+
+    M ./rdft/direct-r2hc.c -4 +7
+
+Sun Jul 21 18:27:09 EDT 2002  stevenj
+  * [project @ 2002-07-21 22:27:09 by stevenj]
+  delete unused var
+
+    M ./rdft/rank0.c -2 +2
+
+Sun Jul 21 02:06:53 EDT 2002  stevenj
+  * [project @ 2002-07-21 06:06:53 by stevenj]
+  added some rdft solvers
+
+    A ./rdft/direct-r2hc.c
+    A ./rdft/indirect.c
+    A ./rdft/nop.c
+    A ./rdft/rank0.c
+    A ./rdft/vrank-geq1.c
+    A ./rdft/vrank2-transpose.c
+    A ./rdft/vrank3-transpose.c
+    M ./rdft/Makefile.am -2 +4
+    M ./rdft/codelet.h -3 +8
+    M ./rdft/codelets/hc2r.c -2 +2
+    M ./rdft/codelets/r2hc.c -2 +2
+    M ./rdft/conf.c -4 +4
+    M ./rdft/direct-r2hc.c +174
+    M ./rdft/indirect.c +235
+    R ./rdft/khc2rIII.c
+    M ./rdft/kr2hc.c -5 +2
+    R ./rdft/kr2hcII.c
+    M ./rdft/nop.c +97
+    M ./rdft/problem.c -3 +20
+    M ./rdft/rank0.c +217
+    M ./rdft/rdft.h -5 +4
+    M ./rdft/vrank-geq1.c +260
+    M ./rdft/vrank2-transpose.c +130
+    M ./rdft/vrank3-transpose.c +171
+
+Sun Jul 21 01:52:54 EDT 2002  stevenj
+  * [project @ 2002-07-21 05:52:54 by stevenj]
+  pass identifier in FFTW() through another macro so that the mangled name
+  can itself be a preprocessor symbol
+
+    M ./kernel/fftw3.h -4 +5
+
+Sun Jul 21 01:05:21 EDT 2002  stevenj
+  * [project @ 2002-07-21 05:05:21 by stevenj]
+  fix in comment
+
+    M ./dft/vrank-geq1.c -2 +2
+
+Sun Jul 21 00:47:03 EDT 2002  stevenj
+  * [project @ 2002-07-21 04:47:03 by stevenj]
+  bench tests rdft plans
+
+    M ./Makefile.am -2 +5
+    M ./rdft/rdft.h -2 +3
+    M ./tests/bench.c -16 +36
+
+Sun Jul 21 00:22:14 EDT 2002  stevenj
+  * [project @ 2002-07-21 04:22:14 by stevenj]
+  make rdft.h and dft.h compatible
+
+    M ./rdft/codelet.h -4 +4
+    M ./tests/Makefile.am -1 +1
+    M ./tests/bench.c -1 +1
+
+Sun Jul 21 00:12:19 EDT 2002  stevenj
+  * [project @ 2002-07-21 04:12:19 by stevenj]
+  first-draft rdft verify
+
+    A ./rdft/verify.c
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/problem.c -8 +8
+    M ./rdft/rdft.h -5 +5
+    M ./rdft/verify.c +554
+
+Sat Jul 20 22:09:15 EDT 2002  stevenj
+  * [project @ 2002-07-21 02:09:15 by stevenj]
+  got rid of annoying warnings
+
+    M ./rdft/khc2hc-dif.c -1 +2
+    M ./rdft/khc2hc-dit.c -1 +2
+    M ./rdft/khc2r.c -1 +2
+    M ./rdft/khc2rIII.c -1 +2
+    M ./rdft/kr2hc.c -1 +2
+    M ./rdft/kr2hcII.c -1 +2
+
+Sat Jul 20 22:07:37 EDT 2002  stevenj
+  * [project @ 2002-07-21 02:07:37 by stevenj]
+  added stub codelet registration for linking purposes
+
+    A ./rdft/khc2hc-dif.c
+    A ./rdft/khc2hc-dit.c
+    A ./rdft/khc2r.c
+    A ./rdft/khc2rIII.c
+    A ./rdft/kr2hc.c
+    A ./rdft/kr2hcII.c
+    M ./rdft/Makefile.am -1 +2
+    M ./rdft/khc2hc-dif.c +31
+    M ./rdft/khc2hc-dit.c +31
+    M ./rdft/khc2r.c +31
+    M ./rdft/khc2rIII.c +31
+    M ./rdft/kr2hc.c +31
+    M ./rdft/kr2hcII.c +31
+    M ./rdft/rdft.h -2 +2
+
+Sat Jul 20 21:46:03 EDT 2002  stevenj
+  * [project @ 2002-07-21 01:46:03 by stevenj]
+  basic rdft stuff
+
+    A ./rdft/conf.c
+    A ./rdft/plan.c
+    A ./rdft/problem.c
+    A ./rdft/rdft.h
+    A ./rdft/solve.c
+    M ./rdft/Makefile.am -1 +1
+    M ./rdft/conf.c +47
+    M ./rdft/plan.c +33
+    M ./rdft/problem.c +145
+    M ./rdft/rdft.h +82
+    M ./rdft/solve.c +31
+
+Sat Jul 20 21:06:50 EDT 2002  stevenj
+  * [project @ 2002-07-21 01:06:50 by stevenj]
+  rdft codelets now compile
+
+    A ./rdft/Makefile.am
+    A ./rdft/codelet.h
+    A ./rdft/codelets/Makefile.am
+    A ./rdft/codelets/hb.h
+    A ./rdft/codelets/hc2r.c
+    A ./rdft/codelets/hc2r.h
+    A ./rdft/codelets/hc2rIII.h
+    A ./rdft/codelets/hf.h
+    A ./rdft/codelets/hfb.c
+    A ./rdft/codelets/r2hc.c
+    A ./rdft/codelets/r2hc.h
+    A ./rdft/codelets/r2hcII.h
+    M ./Makefile.am -1 +1
+    M ./configure.ac +2
+    M ./dft/codelet.h -33 +1
+    M ./genfft/gen_hc2hc.ml -3 +3
+    M ./kernel/ifftw.h -1 +35
+    M ./rdft/Makefile.am +6
+    M ./rdft/codelet.h +133
+    M ./rdft/codelets/Makefile.am +7
+    M ./rdft/codelets/hb.h +23
+    M ./rdft/codelets/hc2r/Makefile.am -3 +3
+    M ./rdft/codelets/hc2r.c +44
+    M ./rdft/codelets/hc2r.h +23
+    M ./rdft/codelets/hc2rIII.h +23
+    M ./rdft/codelets/hf.h +23
+    M ./rdft/codelets/hfb.c +41
+    M ./rdft/codelets/r2hc/Makefile.am -2 +2
+    M ./rdft/codelets/r2hc.c +44
+    M ./rdft/codelets/r2hc.h +23
+    M ./rdft/codelets/r2hcII.h +23
+
+Sat Jul 20 18:40:31 EDT 2002  athena
+  * [project @ 2002-07-20 22:40:31 by athena]
+  Oops, was generating rdfts instead of hdfts
+
+    M ./genfft/gen_hc2r.ml -3 +3
+
+Sat Jul 20 18:25:47 EDT 2002  athena
+  * [project @ 2002-07-20 22:25:47 by athena]
+  Added hc2r codelets
+
+    A ./rdft/codelets/hc2r/
+    A ./rdft/codelets/hc2r/Makefile.am
+    M ./TODO +4
+    M ./configure.ac +1
+    M ./genfft/twiddle.ml -7 +14
+    M ./genfft-k7/twiddle.ml -12 +7
+    M ./kernel/twiddle.c -26 +39
+    M ./rdft/codelets/hc2r/Makefile.am -1 +42
+
+Sat Jul 20 17:54:39 EDT 2002  athena
+  * [project @ 2002-07-20 21:54:39 by athena]
+  return W in hc2hc codelets
+
+    M ./genfft/gen_hc2hc.ml -3 +4
+
+Sat Jul 20 17:51:06 EDT 2002  athena
+  * [project @ 2002-07-20 21:51:06 by athena]
+  Some work on rdft codelets
+
+    A ./rdft/
+    A ./rdft/codelets/
+    A ./rdft/codelets/r2hc/
+    A ./rdft/codelets/r2hc/Makefile.am
+    M ./configure.ac +2
+    M ./dft/codelets/inplace/Makefile.am +4
+    M ./dft/codelets/standard/Makefile.am +3
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./genfft/gen_hc2hc.ml -62 +18
+    M ./genfft/gen_hc2r.ml -5 +24
+    M ./genfft/gen_r2hc.ml -5 +21
+    M ./genfft/trig.ml -22 +22
+    M ./rdft/codelets/r2hc/Makefile.am -1 +42
+    M ./support/Makefile.codelets -9 +6
+
+Tue Jul 16 13:55:50 EDT 2002  athena
+  * [project @ 2002-07-16 17:55:50 by athena]
+  fix const
+
+    M ./kernel/fftw3.h -3 +4
+
+Tue Jul 16 07:00:10 EDT 2002  athena
+  * [project @ 2002-07-16 11:00:10 by athena]
+  Separate CFLAGS in codelets.  Fix const in certain places.
+
+    M ./acinclude.m4 -2 +6
+    M ./configure.ac +1
+    M ./dft/codelets/inplace/Makefile.am +1
+    M ./dft/codelets/standard/Makefile.am +1
+    M ./dft/indirect.c -2 +2
+    M ./dft/rank0.c -2 +2
+    M ./dft/simd/codelets/Makefile.am +1
+    M ./kernel/version.c -3 +4
+    M ./tests/bench.c -2 +9
+
+Mon Jul 15 21:10:42 EDT 2002  stevenj
+  * [project @ 2002-07-16 01:10:42 by stevenj]
+  note buffering problem
+
+    M ./TODO +2
+
+Mon Jul 15 20:27:51 EDT 2002  athena
+  * [project @ 2002-07-16 00:27:51 by athena]
+  Removed unpredictable branch from inner loop
+
+    M ./dft/generic.c -2 +2
+
+Mon Jul 15 19:35:04 EDT 2002  stevenj
+  * [project @ 2002-07-15 23:35:04 by stevenj]
+  update
+
+    M ./TODO -2
+
+Mon Jul 15 19:31:39 EDT 2002  stevenj
+  * [project @ 2002-07-15 23:31:39 by stevenj]
+  optimization
+
+    M ./dft/generic.c -1 +2
+
+Mon Jul 15 19:28:30 EDT 2002  stevenj
+  * [project @ 2002-07-15 23:28:30 by stevenj]
+  added generic dit
+
+    A ./dft/generic.c
+    M ./dft/Makefile.am -4 +4
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/generic.c +219
+    M ./kernel/ifftw.h -2 +3
+    M ./kernel/twiddle.c -1 +13
+
+Mon Jul 15 17:03:53 EDT 2002  stevenj
+  * [project @ 2002-07-15 21:03:53 by stevenj]
+  whoops, mksolver should be static
+
+    M ./dft/rader.c -2 +2
+
+Mon Jul 15 16:46:36 EDT 2002  athena
+  * [project @ 2002-07-15 20:46:35 by athena]
+  First implementation of gen_hc2hc, probably still buggy.
+
+    A ./genfft/gen_hc2hc.ml
+    M ./genfft/Makefile.am -9 +15
+    M ./genfft/algsimp.ml -4 +5
+    M ./genfft/c.ml -3 +10
+    M ./genfft/c.mli -1 +2
+    M ./genfft/gen_hc2hc.ml +220
+    M ./genfft/gen_hc2r.ml -24 +69
+    M ./genfft/gen_r2hc.ml -22 +66
+    M ./genfft/genutil.ml -13 +17
+
+Mon Jul 15 16:40:23 EDT 2002  stevenj
+  * [project @ 2002-07-15 20:40:23 by stevenj]
+  don't count loading of twiddle factors in ops.other, since it isn't
+  counted for the codelets
+
+    M ./dft/rader.c -1
+
+Mon Jul 15 15:13:19 EDT 2002  stevenj
+  * [project @ 2002-07-15 19:13:19 by stevenj]
+  plan_destroy puts plan to sleep before deallocating it, to eliminate duplicate free calls in solvers
+
+    M ./dft/ct.c -2 +1
+    M ./dft/rader.c -2
+    M ./kernel/plan.c -2 +5
+
+Mon Jul 15 15:07:41 EDT 2002  stevenj
+  * [project @ 2002-07-15 19:07:41 by stevenj]
+  fftw2-like vector recursion flag
+
+    M ./dft/ct-dif.c -1 +6
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct.c -1 +5
+    M ./dft/vrank-geq1.c -2 +10
+    M ./kernel/ifftw.h -2 +6
+    M ./tests/bench.c -2 +1
+
+Sun Jul 14 21:01:44 EDT 2002  athena
+  * [project @ 2002-07-15 01:01:44 by athena]
+  More jokes
+
+    M ./kernel/planner.c -1 +2
+
+Sun Jul 14 20:36:01 EDT 2002  athena
+  * [project @ 2002-07-15 00:36:01 by athena]
+  Bless plan for testing purposes
+
+    M ./tests/bench.c +1
+
+Sun Jul 14 20:35:49 EDT 2002  athena
+  * [project @ 2002-07-15 00:35:49 by athena]
+  Canonical linked-list deletion (hope it is right)
+
+    M ./kernel/planner.c -12 +13
+
+Sun Jul 14 18:26:19 EDT 2002  stevenj
+  * [project @ 2002-07-14 22:26:19 by stevenj]
+  use estimating planner for cld_omega
+
+    M ./dft/rader.c -1 +1
+
+Sun Jul 14 18:10:56 EDT 2002  stevenj
+  * [project @ 2002-07-14 22:10:56 by stevenj]
+  better internal naming
+
+    M ./dft/rader.c -3 +3
+
+Sun Jul 14 18:10:01 EDT 2002  stevenj
+  * [project @ 2002-07-14 22:10:01 by stevenj]
+  printing should really be fixed now, grrr
+
+    M ./dft/rader.c -10 +12
+
+Sun Jul 14 17:57:12 EDT 2002  stevenj
+  * [project @ 2002-07-14 21:57:12 by stevenj]
+  print all distinct child plans
+
+    M ./dft/rader.c -2 +7
+
+Sun Jul 14 17:49:21 EDT 2002  stevenj
+  * [project @ 2002-07-14 21:49:21 by stevenj]
+  whoops
+
+    M ./tests/bench.c -1 +1
+
+Sun Jul 14 17:45:54 EDT 2002  stevenj
+  * [project @ 2002-07-14 21:45:54 by stevenj]
+  whoops, destroy should delete twiddle/omega from list
+
+    M ./dft/rader.c -4 +2
+
+Sun Jul 14 17:33:02 EDT 2002  stevenj
+  * [project @ 2002-07-14 21:33:02 by stevenj]
+  whoops
+
+    M ./kernel/planner.c -3 +6
+
+Sun Jul 14 17:12:14 EDT 2002  stevenj
+  * [project @ 2002-07-14 21:12:14 by stevenj]
+  added plan_bless and FORGET_ACCURSED
+
+    M ./kernel/ifftw.h -2 +6
+    M ./kernel/plan.c -1 +14
+    M ./kernel/planner.c -13 +24
+
+Sun Jul 14 16:15:43 EDT 2002  stevenj
+  * [project @ 2002-07-14 20:15:43 by stevenj]
+  further cleanup
+
+    M ./kernel/traverse.c -6 +4
+
+Sun Jul 14 16:14:15 EDT 2002  stevenj
+  * [project @ 2002-07-14 20:14:15 by stevenj]
+  slight cleanup
+
+    M ./kernel/traverse.c -8 +7
+
+Sun Jul 14 16:09:17 EDT 2002  stevenj
+  * [project @ 2002-07-14 20:09:17 by stevenj]
+  added traverse_plan via print (ugh)
+
+    A ./kernel/traverse.c
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -1 +5
+    M ./kernel/traverse.c +115
+    M ./tests/bench.c +12
+
+Sun Jul 14 15:08:29 EDT 2002  stevenj
+  * [project @ 2002-07-14 19:08:29 by stevenj]
+  added TW_FULL, and additional n parameter for twiddles
+
+    M ./dft/ct.c -3 +4
+    M ./kernel/ifftw.h -5 +5
+    M ./kernel/twiddle.c -18 +34
+
+Sun Jul 14 15:03:51 EDT 2002  stevenj
+  * [project @ 2002-07-14 19:03:51 by stevenj]
+  whoops
+
+    M ./kernel/planner.c -2 +4
+
+Sun Jul 14 13:49:20 EDT 2002  stevenj
+  * [project @ 2002-07-14 17:49:20 by stevenj]
+  save flags before invoking solver mkplan
+
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/planner-naive.c -2 +2
+    M ./kernel/planner-score.c -2 +2
+    M ./kernel/planner.c -3 +10
+
+Sun Jul 14 09:28:37 EDT 2002  athena
+  * [project @ 2002-07-14 13:28:37 by athena]
+
+    M ./TODO -6 +1
+
+Sat Jul 13 22:17:29 EDT 2002  stevenj
+  * [project @ 2002-07-14 02:17:29 by stevenj]
+  added support for UNICOS _rtc() real-time-clock intrinsic function
+
+    M ./configure.ac -1 +10
+    M ./kernel/cycle.h -2 +20
+
+Sat Jul 13 22:06:35 EDT 2002  stevenj
+  * [project @ 2002-07-14 02:06:35 by stevenj]
+  fixed typo: HAVE_TIME_H should include time.h, not sys/time.h
+
+    M ./kernel/timer.c -2 +2
+
+Sat Jul 13 21:46:02 EDT 2002  stevenj
+  * [project @ 2002-07-14 01:46:02 by stevenj]
+  support AIX read_real_time timer
+
+    M ./configure.ac +1
+    M ./kernel/cycle.h -1 +24
+
+Sat Jul 13 17:02:51 EDT 2002  stevenj
+  * [project @ 2002-07-13 21:02:51 by stevenj]
+  use && instead of the (sigh) unportable -a
+
+    M ./configure.ac -1 +1
+
+Sat Jul 13 16:38:18 EDT 2002  stevenj
+  * [project @ 2002-07-13 20:38:18 by stevenj]
+  use AC_HELP_STRING
+
+    M ./configure.ac -11 +11
+
+Sat Jul 13 16:05:43 EDT 2002  stevenj
+  * [project @ 2002-07-13 20:05:43 by stevenj]
+  support long-double precision
+
+    A ./kernel/lfftw3.h
+    M ./configure.ac -2 +22
+    M ./dft/codelet.h -2 +6
+    M ./dft/verify.c -10 +29
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/dfftw3.h -1 +2
+    M ./kernel/fftw3.h -2 +5
+    M ./kernel/ifftw.h -9 +14
+    M ./kernel/lfftw3.h +25
+    M ./kernel/sfftw3.h -1 +2
+    M ./libbench/bench-main.c -3 +5
+    M ./libbench/bench-user.h -4 +8
+    M ./libbench/info.c -2 +4
+    M ./libbench/verify.c -10 +38
+    M ./simd/simd-sse2.h -1 +1
+
+Sat Jul 13 15:48:10 EDT 2002  stevenj
+  * [project @ 2002-07-13 19:48:10 by stevenj]
+  whoops whoops
+
+    M ./dft/rader.c -2 +2
+
+Sat Jul 13 15:47:39 EDT 2002  stevenj
+  * [project @ 2002-07-13 19:47:39 by stevenj]
+  whoops
+
+    M ./dft/rader.c -1 +1
+
+Sat Jul 13 14:13:42 EDT 2002  stevenj
+  * [project @ 2002-07-13 18:13:42 by stevenj]
+  buffered solver strides have been fixed
+
+    M ./TODO -7
+
+Sat Jul 13 13:48:13 EDT 2002  stevenj
+  * [project @ 2002-07-13 17:48:13 by stevenj]
+  convention
+
+    M ./dft/rader.c -1 +1
+
+Sat Jul 13 12:50:06 EDT 2002  stevenj
+  * [project @ 2002-07-13 16:50:06 by stevenj]
+  share twiddle arrays in Rader
+
+    M ./TODO -2
+    M ./dft/rader.c -5 +68
+
+Sat Jul 13 12:48:10 EDT 2002  stevenj
+  * [project @ 2002-07-13 16:48:10 by stevenj]
+  call done() after verify
+
+    M ./libbench/verify.c -1 +2
+
+Fri Jul 12 15:42:04 EDT 2002  stevenj
+  * [project @ 2002-07-12 19:42:04 by stevenj]
+  output planner time with -v
+
+    M ./tests/bench.c +7
+
+Fri Jul 12 15:40:14 EDT 2002  stevenj
+  * [project @ 2002-07-12 19:40:14 by stevenj]
+  support double outputs
+
+    M ./kernel/print.c -2 +9
+
+Fri Jul 12 15:09:19 EDT 2002  stevenj
+  * [project @ 2002-07-12 19:09:19 by stevenj]
+  removed extraneous parens
+
+    M ./dft/vrank-geq1.c -2 +2
+
+Fri Jul 12 15:08:13 EDT 2002  stevenj
+  * [project @ 2002-07-12 19:08:13 by stevenj]
+  increase maxbufsz to 64k; makes a big difference for large 2d transforms
+
+    M ./dft/buffered.c -2 +2
+
+Fri Jul 12 05:59:26 EDT 2002  athena
+  * [project @ 2002-07-12 09:59:26 by athena]
+  Fix
+
+    M ./dft/vrank-geq1.c -2 +2
+
+Fri Jul 12 01:22:38 EDT 2002  stevenj
+  * [project @ 2002-07-12 05:22:38 by stevenj]
+  fix comment
+
+    M ./dft/rank-geq2.c -2 +2
+
+Fri Jul 12 00:59:29 EDT 2002  stevenj
+  * [project @ 2002-07-12 04:59:29 by stevenj]
+  fix in comment
+
+    M ./kernel/tensor.c -2 +2
+
+Fri Jul 12 00:13:13 EDT 2002  stevenj
+  * [project @ 2002-07-12 04:13:13 by stevenj]
+  updated
+
+    M ./ChangeLog +873
+
+Thu Jul 11 23:39:27 EDT 2002  stevenj
+  * [project @ 2002-07-12 03:39:27 by stevenj]
+  buffered malloc's buffers
+
+    M ./TODO -3
+
+Thu Jul 11 23:30:26 EDT 2002  stevenj
+  * [project @ 2002-07-12 03:30:26 by stevenj]
+  share more code between apply and apply_dit in Rader
+
+    M ./TODO -2
+    M ./dft/rader.c -69 +30
+
+Mon Jul  8 12:30:34 EDT 2002  athena
+  * [project @ 2002-07-08 16:30:34 by athena]
+  Polished
+
+    M ./simd/simd-sse.h -18 +28
+    M ./simd/simd-sse2.h -25 +31
+    M ./simd/sse.c -4 +2
+    M ./simd/sse2.c -3 +2
+
+Mon Jul  8 09:47:11 EDT 2002  athena
+  * [project @ 2002-07-08 13:47:11 by athena]
+
+    M ./support/Makefile.codelets -1 +1
+
+Mon Jul  8 09:42:08 EDT 2002  athena
+  * [project @ 2002-07-08 13:42:08 by athena]
+  SIMD/FMA stuff
+
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./genfft/c.ml -7 +2
+    M ./genfft/gen_notw_c.ml -2 +3
+    M ./genfft/simd.ml -2 +13
+    M ./genfft/to_alist.ml -1 +3
+    M ./genfft/to_alist.mli -1 +2
+    M ./simd/simd-sse.h +3
+    M ./simd/simd-sse2.h +3
+    M ./support/Makefile.codelets -1 +2
+
+Mon Jul  8 07:43:51 EDT 2002  athena
+  * [project @ 2002-07-08 11:43:51 by athena]
+  Avoid code duplication
+
+    M ./simd/simd-sse.h -12 +6
+
+Sun Jul  7 20:56:15 EDT 2002  athena
+  * [project @ 2002-07-08 00:56:15 by athena]
+  Fixes for FMA+SIMD
+
+    M ./genfft/Makefile.am -3 +3
+    M ./genfft/to_alist.ml -7 +14
+
+Sun Jul  7 20:32:01 EDT 2002  athena
+  * [project @ 2002-07-08 00:32:01 by athena]
+  Major changes in SIMD fftw
+
+    A ./dft/simd/n1b.c
+    A ./dft/simd/n1b.h
+    A ./dft/simd/n1f.c
+    A ./dft/simd/n1f.h
+    A ./dft/simd/t1b.c
+    A ./dft/simd/t1b.h
+    A ./dft/simd/t1f.c
+    A ./dft/simd/t1f.h
+    A ./genfft/gen_notw_c.ml
+    A ./genfft/gen_twiddle_c.ml
+    M ./dft/buffered.c -2 +2
+    M ./dft/codelets/standard/Makefile.am -6 +5
+    M ./dft/simd/Makefile.am -3 +1
+    R ./dft/simd/NAMING
+    M ./dft/simd/codelets/Makefile.am -26 +23
+    M ./dft/simd/n1b.c +43
+    M ./dft/simd/n1b.h +24
+    M ./dft/simd/n1f.c +43
+    M ./dft/simd/n1f.h +24
+    R ./dft/simd/n2f.c
+    R ./dft/simd/n2f.h
+    R ./dft/simd/n3f.h
+    R ./dft/simd/n4.c
+    R ./dft/simd/n4.h
+    M ./dft/simd/t1b.c +38
+    M ./dft/simd/t1b.h +25
+    M ./dft/simd/t1f.c +38
+    M ./dft/simd/t1f.h +25
+    R ./dft/simd/t2f.c
+    R ./dft/simd/t2f.h
+    R ./dft/simd/t3f.h
+    R ./dft/simd/t4.c
+    R ./dft/simd/t4.h
+    M ./genfft/Makefile.am -29 +27
+    M ./genfft/algsimp.ml -16 +27
+    M ./genfft/annotate.ml -131 +11
+    M ./genfft/annotate.mli -17 +2
+    M ./genfft/c.ml -7 +23
+    M ./genfft/complex.ml -1 +3
+    M ./genfft/complex.mli -1 +2
+    M ./genfft/expr.ml -3 +21
+    M ./genfft/expr.mli -1 +6
+    M ./genfft/gen_athnotw.ml -3 +3
+    M ./genfft/gen_athtw.ml -3 +3
+    M ./genfft/gen_conv.ml -4 +4
+    M ./genfft/gen_hc2r.ml -4 +4
+    M ./genfft/gen_notw.ml -4 +4
+    M ./genfft/gen_notw_c.ml +155
+    M ./genfft/gen_r2hc.ml -4 +4
+    M ./genfft/gen_trig.ml -4 +4
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twiddle_c.ml +165
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./genfft/genutil.ml -4 +5
+    M ./genfft/oracle.ml -1 +2
+    M ./genfft/simd.ml -177 +95
+    M ./genfft/simd.mli -2 +2
+    M ./genfft/simdmagic.ml -47 +1
+    M ./genfft/to_alist.ml -3 +9
+    M ./genfft/trig.ml -1 +10
+    M ./genfft/trig.mli -1 +2
+    M ./genfft/twiddle.ml -5 +6
+    M ./kernel/ifftw.h -2 +2
+    M ./simd/simd-sse.h -61 +88
+    M ./simd/simd-sse2.h -67 +57
+    M ./simd/sse.c -1 +4
+    M ./simd/sse2.c -1 +4
+    M ./support/Makefile.codelets -67 +2
+
+Fri Jul  5 17:32:09 EDT 2002  athena
+  * [project @ 2002-07-05 21:32:09 by athena]
+  Use unpck instructions instead of shuffles
+
+    M ./dft/buffered.c -2 +2
+    M ./simd/simd-altivec.h -13 +13
+    M ./simd/simd-sse.h -23 +10
+
+Fri Jul  5 15:49:14 EDT 2002  athena
+  * [project @ 2002-07-05 19:49:14 by athena]
+  Minor tweaks
+
+    M ./dft/codelets/n.c +1
+    M ./dft/codelets/t.c +1
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/verify.c -4 +1
+    M ./kernel/ifftw.h -10 +9
+    M ./kernel/planner.c -5 +4
+    M ./tests/bench.c -2 +2
+
+Fri Jul  5 15:02:54 EDT 2002  athena
+  * [project @ 2002-07-05 19:02:54 by athena]
+  Use score planner
+
+    M ./tests/bench.c -1 +1
+
+Fri Jul  5 14:49:59 EDT 2002  athena
+  * [project @ 2002-07-05 18:49:59 by athena]
+  Added verifier
+
+    A ./dft/verify.c
+    A ./kernel/dotens.c
+    A ./kernel/dotens2.c
+    M ./CONVENTIONS +1
+    M ./dft/Makefile.am -1 +1
+    M ./dft/dft.h -1 +4
+    M ./dft/verify.c +428
+    M ./kernel/Makefile.am -1 +2
+    M ./kernel/debug.c -2 +2
+    M ./kernel/dotens.c +47
+    M ./kernel/dotens2.c +55
+    M ./kernel/ifftw.h -8 +24
+    M ./tests/bench.c -8 +14
+
+Wed Jul  3 20:32:28 EDT 2002  athena
+  * [project @ 2002-07-04 00:32:28 by athena]
+  More simd codelets
+
+    A ./dft/simd/n2f.c
+    A ./dft/simd/n2f.h
+    A ./dft/simd/n3f.h
+    A ./dft/simd/t2f.c
+    A ./dft/simd/t2f.h
+    A ./dft/simd/t3f.h
+    M ./dft/buffered.c -3 +3
+    M ./dft/codelet.h -1 +3
+    M ./dft/codelets/n.c +1
+    M ./dft/codelets/t.c +1
+    M ./dft/ct-dif.c -1 +6
+    M ./dft/ct-dit.c -1 +6
+    M ./dft/ct-ditbuf.c -9 +19
+    M ./dft/ct.c -11 +17
+    M ./dft/ct.h -3 +5
+    M ./dft/simd/Makefile.am -1 +2
+    M ./dft/simd/NAMING -7 +20
+    M ./dft/simd/codelets/Makefile.am -4 +27
+    M ./dft/simd/n2f.c +43
+    M ./dft/simd/n2f.h +25
+    M ./dft/simd/n3f.h +1
+    M ./dft/simd/n4.c -2 +1
+    M ./dft/simd/t2f.c +40
+    M ./dft/simd/t2f.h +25
+    M ./dft/simd/t3f.h +1
+    M ./dft/simd/t4.c -1 +1
+    M ./genfft/annotate.ml -3 +3
+    M ./genfft/gen_notw.ml -9 +23
+    M ./genfft/gen_twiddle.ml -6 +14
+    M ./genfft/gen_twidsq.ml -3 +9
+    M ./genfft/simd.ml -43 +69
+    M ./genfft/simdmagic.ml -11 +36
+    M ./genfft-k7/gen_notw.ml -1 +2
+    M ./genfft-k7/gen_twiddle.ml -1 +2
+    M ./kernel/alloc.c -5 +2
+    M ./kernel/ifftw.h -3 +9
+    M ./simd/simd-sse.h -28 +106
+    M ./simd/simd-sse2.h -24 +47
+
+Tue Jul  2 16:18:09 EDT 2002  athena
+  * [project @ 2002-07-02 20:18:09 by athena]
+  Oops
+
+    M ./dft/rank-geq2.c -3 +4
+
+Tue Jul  2 16:13:24 EDT 2002  athena
+  * [project @ 2002-07-02 20:13:24 by athena]
+  Fixed classic mode
+
+    M ./dft/rank-geq2.c -6 +5
+    M ./dft/vrank-geq1.c -6 +6
+
+Tue Jul  2 15:38:36 EDT 2002  athena
+  * [project @ 2002-07-02 19:38:36 by athena]
+  Use LDK for constants so that we can play games.
+
+    M ./genfft/simd.ml -2 +2
+    M ./simd/simd-altivec.h +1
+    M ./simd/simd-sse.h -5 +5
+    M ./simd/simd-sse2.h -2 +5
+
+Tue Jul  2 13:15:58 EDT 2002  athena
+  * [project @ 2002-07-02 17:15:58 by athena]
+  Improved support for fixed strides
+
+    M ./dft/codelet.h -1 +2
+    M ./dft/codelets/n.c -1 +2
+    M ./dft/simd/n4.c +3
+    M ./dft/simd/t4.c +2
+    M ./genfft/gen_notw.ml -8 +21
+    M ./genfft/genutil.ml -1 +6
+    M ./genfft/simd.ml -7 +7
+    M ./genfft/simd.mli -3 +3
+    M ./genfft-k7/gen_notw.ml -1 +2
+    M ./simd/simd-sse.h -17 +54
+
+Tue Jul  2 10:30:58 EDT 2002  athena
+  * [project @ 2002-07-02 14:30:58 by athena]
+  Changed accounting of flops
+
+    M ./dft/codelet.h -12 +25
+    M ./dft/codelets/n.c -4 +5
+    M ./dft/codelets/n.h -6 +2
+    M ./dft/codelets/t.c -3 +5
+    M ./dft/codelets/t.h -3 +2
+    M ./dft/ct-dif.c -3 +5
+    M ./dft/ct-dit.c -3 +5
+    M ./dft/ct-ditbuf.c -3 +4
+    M ./dft/ct-ditf.c -3 +6
+    M ./dft/direct.c -5 +5
+    M ./dft/k7/k7.c -14 +23
+    M ./dft/simd/n4.c -3 +5
+    M ./dft/simd/n4.h -5 +2
+    M ./dft/simd/t4.c -3 +5
+    M ./dft/simd/t4.h -3 +2
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twidsq.ml -3 +3
+    M ./genfft-k7/gen_notw.ml -2 +2
+    M ./genfft-k7/gen_twiddle.ml -2 +2
+
+Tue Jul  2 08:51:38 EDT 2002  athena
+  * [project @ 2002-07-02 12:51:38 by athena]
+  Wrong code in non-fma mode
+
+    M ./genfft/algsimp.ml -11 +15
+    M ./genfft-k7/algsimp.ml -8 +24
+    M ./genfft-k7/to_alist.ml -3 +3
+    M ./simd/simd-sse2.h -4 +9
+
+Mon Jul  1 23:17:06 EDT 2002  athena
+  * [project @ 2002-07-02 03:17:06 by athena]
+  sse2 stuff
+
+    A ./simd/simd-sse2.h
+    A ./simd/sse2.c
+    M ./genfft/simdmagic.ml -2 +2
+    M ./kernel/alloc.c -2 +2
+    M ./simd/Makefile.am -1 +2
+    M ./simd/simd-sse2.h +125
+    M ./simd/sse2.c +73
+
+Mon Jul  1 14:05:56 EDT 2002  athena
+  * [project @ 2002-07-01 18:05:56 by athena]
+  Identify CPUs for special codelets
+
+    A ./simd/sse.c
+    M ./Makefile.am +1
+    M ./dft/ct.c -4 +4
+    M ./dft/direct.c -2 +2
+    M ./dft/k7/k7.c -5 +43
+    M ./dft/simd/n4.c -1 +1
+    M ./dft/simd/t4.c -1 +1
+    M ./kernel/alloc.c -2 +2
+    M ./simd/Makefile.am -1 +3
+    M ./simd/simd-altivec.h +2
+    M ./simd/simd-sse.h +3
+    M ./simd/sse.c +73
+
+Mon Jul  1 09:26:42 EDT 2002  athena
+  * [project @ 2002-07-01 13:26:42 by athena]
+  Change split problem syntax
+
+    M ./libbench/problem.c -2 +2
+
+Mon Jul  1 09:11:39 EDT 2002  athena
+  * [project @ 2002-07-01 13:11:39 by athena]
+  Removed -fma flag
+
+    M ./dft/simd/codelets/Makefile.am -1 +1
+
+Sun Jun 30 20:08:26 EDT 2002  athena
+  * [project @ 2002-07-01 00:08:26 by athena]
+  Work around gcc bug
+
+    M ./simd/simd-altivec.h -3 +17
+
+Sun Jun 30 18:34:06 EDT 2002  athena
+  * [project @ 2002-06-30 22:34:06 by athena]
+  New simd stuff
+
+    M ./genfft/algsimp.ml -8 +20
+    M ./genfft/magic.ml -1 +3
+    M ./genfft/oracle.ml -2 +2
+    M ./genfft/simd.ml -2 +4
+    M ./genfft/to_alist.ml -3 +3
+
+Sun Jun 30 17:00:09 EDT 2002  athena
+  * [project @ 2002-06-30 21:00:09 by athena]
+  Added altivec support
+
+    A ./simd/simd-altivec.h
+    M ./dft/simd/codelets/Makefile.am -1 +1
+    M ./simd/Makefile.am -1 +1
+    M ./simd/simd-altivec.h +66
+    M ./simd/simd-sse.h -1
+
+Sun Jun 30 14:47:47 EDT 2002  athena
+  * [project @ 2002-06-30 18:47:47 by athena]
+  Forgot file
+
+    A ./dft/simd/t4.c
+
+Sun Jun 30 14:37:55 EDT 2002  athena
+  * [project @ 2002-06-30 18:37:55 by athena]
+  Progress towards simd implementation
+
+    A ./dft/codelets/f.h
+    A ./dft/codelets/n.c
+    A ./dft/codelets/n.h
+    A ./dft/codelets/q.h
+    A ./dft/codelets/t.c
+    A ./dft/codelets/t.h
+    A ./dft/k7/k7.c
+    A ./dft/simd/
+    A ./dft/simd/Makefile.am
+    A ./dft/simd/NAMING
+    A ./dft/simd/codelets/
+    A ./dft/simd/codelets/Makefile.am
+    A ./dft/simd/n4.c
+    A ./dft/simd/n4.h
+    A ./dft/simd/t4.h
+    A ./simd/
+    A ./simd/Makefile.am
+    A ./simd/README
+    A ./simd/simd-sse.h
+    A ./simd/simd.h
+    M ./Makefile.am -2 +5
+    M ./configure.ac -6 +32
+    M ./dft/Makefile.am -3 +3
+    R ./dft/codelet-k7.h
+    M ./dft/codelet.h -11 +23
+    M ./dft/codelets/Makefile.am +4
+    M ./dft/codelets/f.h +1
+    M ./dft/codelets/inplace/Makefile.am -13 +14
+    M ./dft/codelets/n.c +34
+    M ./dft/codelets/n.h +27
+    M ./dft/codelets/q.h +1
+    M ./dft/codelets/standard/Makefile.am -13 +14
+    M ./dft/codelets/t.c +32
+    M ./dft/codelets/t.h +24
+    M ./dft/conf.c -2 +5
+    M ./dft/ct-dif.c -3 +3
+    M ./dft/ct-dit.c -5 +3
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/ct-ditf.c -4 +3
+    M ./dft/ct.c -6 +6
+    M ./dft/ct.h -3 +1
+    M ./dft/direct.c -6 +11
+    M ./dft/k7/Makefile.am -2 +1
+    M ./dft/k7/codelets/Makefile.am -33 +29
+    R ./dft/k7/ct-dif.c
+    R ./dft/k7/ct-dit.c
+    R ./dft/k7/ct-ditbuf.c
+    R ./dft/k7/direct.c
+    M ./dft/k7/k7.c +69
+    R ./dft/k7/kdft-dif.c
+    R ./dft/k7/kdft-dit.c
+    R ./dft/k7/kdft.c
+    M ./dft/simd/Makefile.am +6
+    M ./dft/simd/NAMING +17
+    M ./dft/simd/codelets/Makefile.am +36
+    M ./dft/simd/n4.c +39
+    M ./dft/simd/n4.h +28
+    M ./dft/simd/t4.h +26
+    M ./genfft/annotate.ml -7 +7
+    M ./genfft/c.ml -1 +6
+    M ./genfft/c.mli -1 +2
+    M ./genfft/gen_notw.ml -12 +20
+    M ./genfft/gen_twiddle.ml -14 +18
+    M ./genfft/gen_twidsq.ml -7 +7
+    M ./genfft/genutil.ml -4 +17
+    M ./genfft/magic.ml -1 +3
+    M ./genfft/simd.ml -66 +34
+    M ./genfft/simd.mli -1 +7
+    M ./genfft/simdmagic.ml -2 +12
+    M ./genfft/twiddle.ml -1 +12
+    M ./genfft/twiddle.mli -1 +2
+    M ./genfft-k7/genUtil.ml -1 +1
+    M ./genfft-k7/gen_notw.ml -11 +18
+    M ./genfft-k7/gen_twiddle.ml -12 +17
+    M ./kernel/alloc.c -9 +28
+    M ./kernel/ifftw.h -10 +10
+    M ./libbench/bench-user.h -1 +2
+    M ./libbench/problem.c -1 +3
+    M ./libbench/util.c -1
+    M ./simd/Makefile.am +1
+    M ./simd/README +2
+    M ./simd/simd-sse.h +94
+    M ./simd/simd.h +33
+    M ./support/Makefile.codelets -33 +36
+    M ./tests/Makefile.am +1
+    M ./tests/bench.c -14 +63
+
+Tue Jun 25 20:23:29 EDT 2002  athena
+  * [project @ 2002-06-26 00:23:29 by athena]
+  Add 128- codelet
+
+    M ./dft/k7/codelets/Makefile.am -2 +3
+
+Sat Jun 22 20:47:28 EDT 2002  athena
+  * [project @ 2002-06-23 00:47:28 by athena]
+  More simd changes.  Ensure proper stack alignment in k7 codelets.
+
+    M ./configure.ac +1
+    M ./genfft/c.ml -25 +4
+    M ./genfft/c.mli -2 +4
+    M ./genfft/expr.ml -1 +18
+    M ./genfft/expr.mli -1 +4
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./genfft/gen_trig.ml -3 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twidsq.ml -3 +3
+    M ./genfft/genutil.ml -1 +13
+    M ./genfft/simd.ml -1 +9
+    M ./genfft/simd.mli -1 +2
+    M ./genfft-k7/genUtil.ml -34 +6
+    M ./genfft-k7/gen_notw.ml -2 +2
+    M ./genfft-k7/gen_twiddle.ml -2 +2
+    M ./libbench/bench-main.c -1 +11
+
+Sat Jun 22 13:01:33 EDT 2002  athena
+  * [project @ 2002-06-22 17:01:33 by athena]
+  Fixed prototypes
+
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/solvtab.c -2 +2
+
+Sat Jun 22 12:53:26 EDT 2002  athena
+  * [project @ 2002-06-22 16:53:26 by athena]
+  Sparc cycle counter requires v9
+
+    M ./kernel/cycle.h -2 +2
+
+Sat Jun 22 11:45:48 EDT 2002  athena
+  * [project @ 2002-06-22 15:45:48 by athena]
+  Minor fixes
+
+    M ./configure.ac -1 +1
+    M ./kernel/cycle.h -15 +36
+    M ./kernel/ifftw.h -1 +5
+
+Sat Jun 22 11:11:46 EDT 2002  athena
+  * [project @ 2002-06-22 15:11:46 by athena]
+  Fixed ev67 detection
+
+    M ./acinclude.m4 -1 +1
+
+Sat Jun 22 10:19:11 EDT 2002  athena
+  * [project @ 2002-06-22 14:19:11 by athena]
+  Print flops
+
+    M ./tests/bench.c +2
+
+Sat Jun 22 09:36:47 EDT 2002  athena
+  * [project @ 2002-06-22 13:36:47 by athena]
+  Nothing really
+
+    M ./genfft/simd.ml -7 +3
+
+Fri Jun 21 22:19:20 EDT 2002  athena
+  * [project @ 2002-06-22 02:19:20 by athena]
+  More simd work
+
+    M ./dft/codelet-k7.h -2 +2
+    M ./dft/codelet.h -3 +3
+    M ./genfft/Makefile.am -2 +2
+    M ./genfft/c.ml -19 +1
+    M ./genfft/c.mli -2 +2
+    M ./genfft/gen_conv.ml -3 +3
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./genfft/gen_trig.ml -3 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twidsq.ml -3 +3
+    M ./genfft/genutil.ml -2 +20
+    M ./genfft/magic.ml -1 +5
+    M ./genfft/simd.ml -47 +39
+    M ./genfft/simd.mli -1 +3
+    M ./genfft/simdmagic.ml -4 +2
+    M ./genfft/to_alist.ml -2 +5
+    M ./genfft-k7/magic.ml +4
+    M ./genfft-k7/to_alist.ml -2 +5
+    M ./genfft-k7/to_alist.mli -1 +1
+    M ./kernel/ifftw.h -2 +2
+
+Thu Jun 20 21:22:41 EDT 2002  athena
+  * [project @ 2002-06-21 01:22:41 by athena]
+  More simd work
+
+    A ./genfft/simd.mli
+    A ./genfft/simdmagic.ml
+    M ./genfft/Makefile.am -7 +8
+    M ./genfft/annotate.ml -13 +13
+    M ./genfft/annotate.mli -1 +2
+    M ./genfft/magic.ml -21 +1
+    M ./genfft/simd.ml -21 +10
+    M ./genfft/simd.mli +21
+    M ./genfft/simdmagic.ml +43
+
+Thu Jun 20 18:51:33 EDT 2002  athena
+  * [project @ 2002-06-20 22:51:33 by athena]
+  More simd work
+
+    A ./genfft/simd.ml
+    M ./genfft/Makefile.am -23 +7
+    M ./genfft/annotate.ml -18 +2
+    M ./genfft/c.ml -6 +1
+    M ./genfft/c.mli -5 +8
+    M ./genfft/magic.ml -1 +6
+    M ./genfft/simd.ml +288
+    M ./genfft/variable.ml -1 +18
+    M ./genfft/variable.mli -1 +4
+
+Thu Jun 20 15:04:37 EDT 2002  athena
+  * [project @ 2002-06-20 19:04:37 by athena]
+  Moving towards incorporation of simd stuff
+
+    M ./genfft/annotate.ml -11 +144
+    M ./genfft/annotate.mli -2 +16
+    M ./genfft/gen_athnotw.ml -5 +5
+    M ./genfft/gen_athtw.ml -6 +6
+    M ./genfft/gen_conv.ml -3 +3
+    M ./genfft/gen_hc2r.ml -3 +3
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_r2hc.ml -3 +3
+    M ./genfft/gen_trig.ml -3 +3
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twidsq.ml -6 +5
+    M ./genfft/genutil.ml -26 +17
+    M ./genfft/magic.ml -2 +17
+    M ./genfft/twiddle.ml -4 +4
+    M ./genfft/twiddle.mli -2 +2
+    M ./genfft/variable.ml -12 +22
+    M ./genfft/variable.mli -3 +6
+
+Wed Jun 19 18:47:55 EDT 2002  athena
+  * [project @ 2002-06-19 22:47:55 by athena]
+  Reorganized k7 stuff into own directory
+
+    A ./dft/k7/
+    A ./dft/k7/Makefile.am
+    A ./dft/k7/codelets/
+    A ./dft/k7/codelets/Makefile.am
+    A ./dft/k7/ct-dif.c
+    A ./dft/k7/ct-dit.c
+    A ./dft/k7/ct-ditbuf.c
+    A ./dft/k7/direct.c
+    A ./dft/k7/kdft-dif.c
+    A ./dft/k7/kdft-dit.c
+    A ./dft/k7/kdft.c
+    M ./Makefile.am -1 +2
+    M ./configure.ac -2 +3
+    M ./dft/Makefile.am -6 +5
+    M ./dft/codelets/Makefile.am -1 +1
+    R ./dft/ct-dif-k7.c
+    R ./dft/ct-dit-k7.c
+    R ./dft/ct-ditbuf-k7.c
+    R ./dft/direct-k7.c
+    M ./dft/k7/Makefile.am +7
+    M ./dft/k7/codelets/Makefile.am +74
+    M ./dft/k7/ct-dif.c +129
+    M ./dft/k7/ct-dit.c +137
+    M ./dft/k7/ct-ditbuf.c +176
+    M ./dft/k7/direct.c +172
+    M ./dft/k7/kdft-dif.c +30
+    M ./dft/k7/kdft-dit.c +31
+    M ./dft/k7/kdft.c +31
+    R ./dft/kdft-dif-k7.c
+    R ./dft/kdft-dit-k7.c
+    R ./dft/kdft-k7.c
+
+Wed Jun 19 13:21:13 EDT 2002  athena
+  * [project @ 2002-06-19 17:20:37 by athena]
+  Minor experimental stuff
+
+    M ./genfft/expr.ml -3 +2
+    M ./genfft/expr.mli -2 +2
+    M ./genfft/genutil.ml -2 +11
+    M ./genfft/magic.ml -1 +3
+    M ./genfft-k7/expr.ml -1 +3
+    M ./genfft-k7/expr.mli -1 +3
+
+Wed Jun 19 11:20:29 EDT 2002  athena
+  * [project @ 2002-06-19 15:20:29 by athena]
+  Cosmetic changes
+
+    M ./genfft/expr.ml -1 +4
+    M ./genfft/expr.mli -1 +3
+    M ./genfft/genutil.ml -9 +2
+
+Wed Jun 19 01:43:31 EDT 2002  fftw
+  * [project @ 2002-06-19 05:43:31 by fftw]
+  allocate buffers on the fly
+
+    M ./dft/buffered.c -18 +7
+    M ./dft/rader.c -17 +20
+
+Tue Jun 18 17:48:41 EDT 2002  athena
+  * [project @ 2002-06-18 21:48:41 by athena]
+  Added ct-ditbuf-k7.c .  Major changes required in generator.
+
+    A ./dft/ct-ditbuf-k7.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/codelet-k7.h -1 +2
+    M ./dft/ct-dif-k7.c -17 +3
+    M ./dft/ct-dif.c -16 +2
+    M ./dft/ct-dit-k7.c -18 +10
+    M ./dft/ct-dit.c -16 +2
+    M ./dft/ct-ditbuf-k7.c +176
+    M ./dft/ct-ditbuf.c -16 +2
+    M ./dft/ct.c -1 +30
+    M ./dft/ct.h -1 +3
+    M ./dft/kdft-dit-k7.c -1 +2
+    M ./dft/rader.c -1 +1
+    M ./genfft/number.ml -2 +2
+    M ./genfft/to_alist.ml -2 +2
+    M ./genfft-k7/Makefile.am -4 +4
+    M ./genfft-k7/assignmentsToVfpinstrs.ml -1 +4
+    M ./genfft-k7/gen_twiddle.ml -2 +5
+    M ./genfft-k7/k7Basics.ml -1 +1
+    M ./genfft-k7/k7Basics.mli +1
+    M ./genfft-k7/k7RegisterAllocationBasics.ml +3
+    M ./genfft-k7/k7RegisterAllocationBasics.mli -2 +4
+    M ./genfft-k7/k7RegisterAllocator.ml -15 +28
+    M ./genfft-k7/k7RegisterAllocatorInit.ml -6 +12
+    M ./genfft-k7/number.ml -2 +2
+    M ./genfft-k7/to_alist.ml -2 +2
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -3 +5
+    M ./kernel/primes.c -2 +2
+
+Tue Jun 18 11:55:57 EDT 2002  athena
+  * [project @ 2002-06-18 15:55:57 by athena]
+  Nothing, really
+
+    M ./genfft-k7/gen_twiddle.ml -2 +2
+    M ./kernel/ifftw.h -18 +19
+
+Tue Jun 18 11:19:59 EDT 2002  athena
+  * [project @ 2002-06-18 15:19:59 by athena]
+  !SINGLE ==> !K7_MODE  (for some reason the contrapositive sounds wrong)
+
+    M ./configure.ac +1
+
+Tue Jun 18 11:07:13 EDT 2002  athena
+  * [project @ 2002-06-18 15:07:13 by athena]
+  Buffer is now symmetric wrt forward/backward transform
+
+    M ./dft/buffered.c -12 +22
+
+Tue Jun 18 10:33:58 EDT 2002  athena
+  * [project @ 2002-06-18 14:33:58 by athena]
+  Fixed applicable() in indirect.c
+
+    A ./kernel/debug.c
+    M ./dft/ct-dif.c -2 +1
+    M ./dft/indirect.c -3 +3
+    M ./dft/vrank2-transpose.c -10 +14
+    M ./dft/vrank3-transpose.c -10 +13
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/debug.c +39
+    M ./kernel/ifftw.h -1 +5
+    M ./kernel/print.c -4 +10
+
+Tue Jun 18 08:41:18 EDT 2002  athena
+  * [project @ 2002-06-18 12:41:18 by athena]
+  Fixed attempt to free() uninitialized pointer.
+
+    M ./dft/rader.c -2 +3
+
+Tue Jun 18 08:13:55 EDT 2002  athena
+  * [project @ 2002-06-18 12:13:55 by athena]
+  Added reference counts for awake()
+
+    M ./CONVENTIONS -1 +1
+    M ./TODO +5
+    M ./dft/rader.c -11 +3
+    M ./kernel/ifftw.h -4 +7
+    M ./kernel/plan.c -2 +15
+    M ./kernel/planner.c -3 +3
+    M ./tests/bench.c -2 +3
+
+Mon Jun 17 20:49:05 EDT 2002  stevenj
+  * [project @ 2002-06-18 00:49:05 by stevenj]
+  updated comment
+
+    M ./dft/rader.c -2 +2
+
+Mon Jun 17 20:46:45 EDT 2002  stevenj
+  * [project @ 2002-06-18 00:46:45 by stevenj]
+  slight update
+
+    M ./TODO -1 +3
+
+Mon Jun 17 02:30:16 EDT 2002  fftw
+  * [project @ 2002-06-17 06:30:16 by fftw]
+  moved prime-number stuff into primes.c, so it can be shared with generic codelet and with rfftw rader
+
+    A ./kernel/primes.c
+    M ./dft/rader.c -104 +5
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -1 +24
+    M ./kernel/primes.c +112
+
+Mon Jun 17 02:01:58 EDT 2002  fftw
+  * [project @ 2002-06-17 06:01:58 by fftw]
+  added comment
+
+    M ./dft/rader.c -6 +9
+
+Mon Jun 17 01:39:55 EDT 2002  fftw
+  * [project @ 2002-06-17 05:39:55 by fftw]
+  added rader-dit
+
+    M ./dft/rader.c -66 +308
+    M ./kernel/ifftw.h -1 +10
+    M ./kernel/twiddle.c -8 +1
+
+Sun Jun 16 23:50:16 EDT 2002  fftw
+  * [project @ 2002-06-17 03:50:16 by fftw]
+  added initial Rader (no DIT yet)
+
+    A ./dft/rader.c
+    M ./configure.ac +8
+    M ./dft/Makefile.am -1 +1
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/rader.c +457
+
+Sun Jun 16 22:29:51 EDT 2002  fftw
+  * [project @ 2002-06-17 02:29:51 by fftw]
+  don't warn about long long
+
+    M ./acinclude.m4 -1 +1
+
+Sun Jun 16 21:30:42 EDT 2002  athena
+  * [project @ 2002-06-17 01:30:42 by athena]
+  Added k7 DIF codelets
+
+    A ./dft/ct-dif-k7.c
+    A ./dft/kdft-dif-k7.c
+    M ./dft/Makefile.am -5 +6
+    M ./dft/codelet-k7.h -5 +10
+    M ./dft/ct-dif-k7.c +143
+    M ./dft/ct.h -1 +2
+    M ./dft/kdft-dif-k7.c +30
+    M ./genfft-k7/gen_twiddle.ml -4 +25
+    M ./kernel/planner.c -3 +2
+    M ./tests/bench.c +1
+
+Sun Jun 16 19:13:31 EDT 2002  athena
+  * [project @ 2002-06-16 23:13:31 by athena]
+  Added stuff to do
+
+    M ./TODO +6
+
+Sun Jun 16 19:05:58 EDT 2002  athena
+  * [project @ 2002-06-16 23:05:58 by athena]
+  Handle dual case R = I + 1
+
+    M ./dft/rank0.c -6 +30
+
+Sun Jun 16 18:54:31 EDT 2002  athena
+  * [project @ 2002-06-16 22:54:31 by athena]
+  Removed useless flag
+
+    M ./bootstrap.sh -1 +1
+
+Sun Jun 16 18:30:32 EDT 2002  athena
+  * [project @ 2002-06-16 22:30:32 by athena]
+  Removed useless file
+
+    R ./mkdist.sh
+
+Sun Jun 16 18:30:18 EDT 2002  athena
+  * [project @ 2002-06-16 22:30:18 by athena]
+  More k7 work.  Switched to runtime CLASSIC mode.
+
+    A ./dft/ct-dit-k7.c
+    A ./dft/kdft-dit-k7.c
+    R ./CLASSIC-MODE
+    M ./Makefile.am -10 +1
+    M ./configure.ac -21
+    M ./dft/Makefile.am -5 +5
+    M ./dft/buffered.c -2 +4
+    M ./dft/codelet-k7.h -5 +7
+    M ./dft/codelet.h -1 +2
+    M ./dft/codelets/inplace/Makefile.am -2
+    M ./dft/ct-dif.c -3 +4
+    M ./dft/ct-dit-k7.c +145
+    M ./dft/ct-dit.c -6 +9
+    M ./dft/ct-ditbuf.c -9 +11
+    M ./dft/ct-ditf.c -2 +3
+    M ./dft/ct.h -5 +3
+    M ./dft/direct-k7.c -2 +3
+    M ./dft/direct.c -2 +3
+    M ./dft/indirect.c -2 +3
+    M ./dft/kdft-dit-k7.c +30
+    M ./dft/kdft-dit.c -4 +2
+    M ./dft/nop.c -2 +3
+    M ./dft/rank-geq2.c -8 +8
+    M ./dft/rank0.c -2 +3
+    M ./dft/vrank-geq1.c -6 +7
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -3 +3
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./genfft/twiddle.ml -7 +28
+    M ./genfft/twiddle.mli -2 +5
+    M ./genfft-k7/Makefile.am -25 +24
+    M ./genfft-k7/genUtil.ml -2 +3
+    M ./genfft-k7/gen_notw.ml -1 +2
+    M ./genfft-k7/gen_twiddle.ml -23 +29
+    M ./genfft-k7/twiddle.ml -7 +36
+    M ./genfft-k7/twiddle.mli -3 +7
+    M ./kernel/ifftw.h -15 +13
+    M ./kernel/planner-naive.c -3 +3
+    M ./kernel/planner-score.c -5 +6
+    M ./kernel/planner.c -6 +5
+    M ./kernel/version.c -7 +2
+    M ./support/Makefile.codelets +1
+    M ./tests/bench.c -12 +2
+
+Sun Jun 16 17:15:18 EDT 2002  stevenj
+  * [project @ 2002-06-16 21:15:18 by stevenj]
+  spelling
+
+    M ./kernel/tensor.c -2 +2
+
+Sun Jun 16 15:51:44 EDT 2002  athena
+  * [project @ 2002-06-16 19:51:44 by athena]
+  Do not compile if not K7_MODE
+
+    M ./dft/kdft-k7.c -1 +3
+
+Sun Jun 16 15:35:02 EDT 2002  athena
+  * [project @ 2002-06-16 19:35:02 by athena]
+  Do not require K7 definitions to compile
+
+    M ./dft/codelet-k7.h -1 +2
+    M ./dft/dft.h -2 +1
+
+Sun Jun 16 08:05:17 EDT 2002  athena
+  * [project @ 2002-06-16 12:05:17 by athena]
+  More k7 stuff
+
+    A ./dft/codelet-k7.h
+    A ./genfft-k7/gen_twiddle.ml
+    M ./dft/Makefile.am -4 +4
+    M ./dft/codelet-k7.h +40
+    M ./dft/codelet.h -20 +5
+    M ./dft/direct-k7.c -1 +5
+    M ./genfft-k7/Makefile.am -1 +9
+    M ./genfft-k7/complex.ml -1 +2
+    M ./genfft-k7/complex.mli -1 +2
+    M ./genfft-k7/genUtil.ml -1 +12
+    M ./genfft-k7/gen_notw.ml -5 +1
+    M ./genfft-k7/gen_twiddle.ml +149
+
+Sat Jun 15 18:30:43 EDT 2002  athena
+  * [project @ 2002-06-15 22:30:43 by athena]
+  Try to be compatible with automake-1.6
+
+    M ./acinclude.m4 -1 +5
+
+Sat Jun 15 18:23:40 EDT 2002  athena
+  * [project @ 2002-06-15 22:23:40 by athena]
+  More merging of Stefan's generator with main genfft branch
+
+    A ./genfft-k7/algsimp.ml
+    A ./genfft-k7/algsimp.mli
+    A ./genfft-k7/assoctable.ml
+    A ./genfft-k7/assoctable.mli
+    A ./genfft-k7/littlesimp.ml
+    A ./genfft-k7/littlesimp.mli
+    A ./genfft-k7/monads.ml
+    A ./genfft-k7/oracle.ml
+    A ./genfft-k7/oracle.mli
+    A ./genfft-k7/to_alist.ml
+    A ./genfft-k7/to_alist.mli
+    A ./genfft-k7/twiddle.mli
+    A ./support/codelet_asmprelude
+    M ./acinclude.m4 +8
+    M ./configure.ac +1
+    M ./genfft/number.ml -7 +8
+    M ./genfft-k7/Makefile.am -23 +27
+    M ./genfft-k7/algsimp.ml +517
+    M ./genfft-k7/algsimp.mli +24
+    M ./genfft-k7/assignmentsToVfpinstrs.ml -11 +10
+    M ./genfft-k7/assoctable.ml +66
+    M ./genfft-k7/assoctable.mli +30
+    M ./genfft-k7/complex.ml -4 +19
+    M ./genfft-k7/complex.mli -5 +6
+    M ./genfft-k7/expr.ml -27 +58
+    M ./genfft-k7/expr.mli -4 +11
+    R ./genfft-k7/exprdag.ml
+    R ./genfft-k7/exprdag.mli
+    M ./genfft-k7/genUtil.ml -13 +18
+    M ./genfft-k7/gen_notw.ml -16 +31
+    M ./genfft-k7/k7Unparsing.ml -2 +2
+    M ./genfft-k7/littlesimp.ml +72
+    M ./genfft-k7/littlesimp.mli +26
+    M ./genfft-k7/magic.ml -16 +41
+    M ./genfft-k7/monads.ml +76
+    M ./genfft-k7/number.ml -9 +5
+    M ./genfft-k7/number.mli -8 +4
+    M ./genfft-k7/oracle.ml +138
+    M ./genfft-k7/oracle.mli +25
+    M ./genfft-k7/to_alist.ml +258
+    M ./genfft-k7/to_alist.mli +24
+    M ./genfft-k7/twiddle.ml -5 +22
+    M ./genfft-k7/twiddle.mli +29
+    M ./genfft-k7/util.ml -4 +4
+    M ./genfft-k7/util.mli -2 +1
+    M ./genfft-k7/vFpUnparsing.ml -1 +1
+    M ./genfft-k7/vSimdBasics.ml -2 +3
+    M ./genfft-k7/vSimdUnparsing.ml -1 +1
+    M ./genfft-k7/variable.ml +1
+    M ./genfft-k7/variable.mli +1
+    M ./support/Makefile.am -1 +1
+    M ./support/Makefile.codelets -1 +13
+    M ./support/codelet_asmprelude +8
+
+Sat Jun 15 13:51:39 EDT 2002  athena
+  * [project @ 2002-06-15 17:51:39 by athena]
+  Slowly merging genfft-k7 with main genfft branch
+
+    A ./genfft-k7/gen_notw.ml
+    M ./genfft/expr.ml -1 +3
+    M ./genfft/expr.mli -1 +2
+    M ./genfft/genutil.ml -1 +9
+    M ./genfft-k7/Makefile.am -26 +23
+    M ./genfft-k7/complex.ml -34 +92
+    M ./genfft-k7/complex.mli -1 +11
+    M ./genfft-k7/expr.ml -1 +19
+    M ./genfft-k7/expr.mli -4 +3
+    M ./genfft-k7/exprdag.ml -73 +103
+    M ./genfft-k7/exprdag.mli -1
+    M ./genfft-k7/fft.ml -167 +83
+    M ./genfft-k7/fft.mli -29 +1
+    M ./genfft-k7/genUtil.ml -56 +33
+    R ./genfft-k7/gen_hc2hc.ml
+    R ./genfft-k7/gen_hc2real.ml
+    M ./genfft-k7/gen_notw.ml +143
+    R ./genfft-k7/gen_notwiddle.ml
+    R ./genfft-k7/gen_notwiddle_fixedstride.ml
+    R ./genfft-k7/gen_real2hc.ml
+    R ./genfft-k7/gen_realeven.ml
+    R ./genfft-k7/gen_realeven2.ml
+    R ./genfft-k7/gen_realodd.ml
+    R ./genfft-k7/gen_realodd2.ml
+    R ./genfft-k7/gen_twiddle.ml
+    M ./genfft-k7/magic.ml -11 +35
+    R ./genfft-k7/symmetry.ml
+    M ./genfft-k7/twiddle.ml -63 +95
+    M ./genfft-k7/util.ml +15
+    M ./genfft-k7/util.mli +3
+    M ./genfft-k7/variable.ml +12
+    M ./genfft-k7/variable.mli +3
+    M ./support/Makefile.codelets -1 +1
+
+Fri Jun 14 21:33:02 EDT 2002  athena
+  * [project @ 2002-06-15 01:33:02 by athena]
+  Fixed, really
+
+    M ./genfft-k7/Makefile.am -1 +1
+    M ./genfft-k7/genUtil.ml -25
+    M ./genfft-k7/magic.ml -1 +1
+    R ./genfft-k7/magic.mli
+    M ./genfft-k7/twiddle.ml -2 +2
+    M ./support/Makefile.codelets -4 +8
+
+Fri Jun 14 21:27:12 EDT 2002  athena
+  * [project @ 2002-06-15 01:27:12 by athena]
+  Oops...
+
+    M ./support/Makefile.codelets -1 +1
+
+Fri Jun 14 21:25:34 EDT 2002  athena
+  * [project @ 2002-06-15 01:25:34 by athena]
+  Work properly when $(ALL_CODELETS) = ""
+
+    M ./support/Makefile.codelets -4 +4
+
+Fri Jun 14 21:11:16 EDT 2002  athena
+  * [project @ 2002-06-15 01:11:16 by athena]
+  Fixed k7 build machinery
+
+    M ./Makefile.am -1 +2
+    M ./configure.ac +8
+    M ./dft/codelet.h -1 +4
+    M ./dft/codelets/Makefile.am -1 +1
+    M ./dft/conf.c -1 +4
+    M ./genfft-k7/gen_notwiddle.ml -1 +3
+    M ./kernel/ifftw.h -1 +10
+    M ./support/Makefile.codelets +2
+
+Fri Jun 14 17:42:35 EDT 2002  athena
+  * [project @ 2002-06-14 21:42:35 by athena]
+  More work on k7 stuff
+
+    M ./Makefile.am -1 +1
+    M ./configure.ac +1
+    M ./dft/codelet.h -1 +4
+    M ./dft/direct-k7.c -3 +3
+    M ./genfft-k7/Makefile.am -24 +24
+    R ./genfft-k7/codeletMisc.ml
+    R ./genfft-k7/codeletMisc.mli
+    M ./genfft-k7/genUtil.ml -10 +128
+    R ./genfft-k7/genUtil.mli
+    R ./genfft-k7/gen_hc2hc.mli
+    R ./genfft-k7/gen_hc2real.mli
+    M ./genfft-k7/gen_notwiddle.ml -3 +33
+    R ./genfft-k7/gen_notwiddle.mli
+    R ./genfft-k7/gen_real2hc.mli
+    R ./genfft-k7/gen_realeven.mli
+    R ./genfft-k7/gen_realeven2.mli
+    R ./genfft-k7/gen_realodd.mli
+    R ./genfft-k7/gen_realodd2.mli
+    R ./genfft-k7/gen_twiddle.mli
+    R ./genfft-k7/genfft.ml
+    M ./genfft-k7/k7Basics.ml +19
+    M ./genfft-k7/k7Basics.mli +2
+    M ./genfft-k7/k7Unparsing.ml -1 +1
+    M ./genfft-k7/magic.ml -1
+    M ./genfft-k7/magic.mli -1
+    M ./kernel/ifftw.h -1 +4
+    M ./libbench/bench-user.h -1 +3
+    M ./support/Makefile.am -1 +2
+
+Fri Jun 14 15:54:29 EDT 2002  athena
+  * [project @ 2002-06-14 19:54:29 by athena]
+  More work on k7 stuff
+
+    A ./dft/direct-k7.c
+    A ./dft/kdft-k7.c
+    M ./dft/Makefile.am -1 +2
+    M ./dft/codelet.h -1 +13
+    M ./dft/conf.c -9 +15
+    M ./dft/dft.h -1 +2
+    M ./dft/direct-k7.c +167
+    M ./dft/kdft-k7.c +29
+    M ./genfft-k7/codeletMisc.ml -91
+    M ./genfft-k7/codeletMisc.mli -13 +1
+    M ./genfft-k7/genUtil.ml -24 +19
+    M ./genfft-k7/gen_notwiddle.ml -2 +44
+    M ./genfft-k7/genfft.ml +4
+    M ./genfft-k7/magic.ml +1
+    M ./genfft-k7/magic.mli +1
+    M ./support/Makefile.codelets -1 +1
+
+Fri Jun 14 14:18:15 EDT 2002  athena
+  * [project @ 2002-06-14 18:18:15 by athena]
+  Changed my mind again
+
+    M ./dft/codelet.h -4 +1
+    M ./dft/direct.c -7 +4
+    M ./genfft/gen_notw.ml -5 +4
+
+Fri Jun 14 11:53:09 EDT 2002  athena
+  * [project @ 2002-06-14 15:53:09 by athena]
+  Removed some useless stuff.
+
+    M ./genfft-k7/gen_notwiddle.ml -8 +3
+
+Fri Jun 14 11:01:39 EDT 2002  athena
+  * [project @ 2002-06-14 15:01:39 by athena]
+  Hmm...
+
+    M ./genfft-k7/gen_notwiddle.ml -3 +8
+
+Fri Jun 14 10:28:12 EDT 2002  athena
+  * [project @ 2002-06-14 14:28:12 by athena]
+  More work in preparation for k7 stuff
+
+    M ./dft/codelet.h -1 +2
+    M ./dft/direct.c -3 +4
+    M ./genfft/gen_notw.ml -4 +5
+    M ./genfft-k7/gen_notwiddle.ml -18 +13
+
+Fri Jun 14 07:25:28 EDT 2002  athena
+  * [project @ 2002-06-14 11:25:28 by athena]
+  Still preparing to include k7 stuff
+
+    M ./TODO +4
+    M ./dft/codelet.h -4 +6
+    M ./dft/direct.c -1 +5
+    M ./genfft/gen_notw.ml -7 +9
+
+Fri Jun 14 07:06:02 EDT 2002  athena
+  * [project @ 2002-06-14 11:06:02 by athena]
+  Create .depend
+
+    M ./bootstrap.sh -2 +2
+
+Fri Jun 14 06:56:15 EDT 2002  athena
+  * [project @ 2002-06-14 10:56:14 by athena]
+  Imported Stefan's K7 generator
+
+    A ./genfft-k7/
+    A ./genfft-k7/Makefile.am
+    A ./genfft-k7/assignmentsToVfpinstrs.ml
+    A ./genfft-k7/assignmentsToVfpinstrs.mli
+    A ./genfft-k7/balanceVfpinstrs.ml
+    A ./genfft-k7/balanceVfpinstrs.mli
+    A ./genfft-k7/codeletMisc.ml
+    A ./genfft-k7/codeletMisc.mli
+    A ./genfft-k7/complex.ml
+    A ./genfft-k7/complex.mli
+    A ./genfft-k7/expr.ml
+    A ./genfft-k7/expr.mli
+    A ./genfft-k7/exprdag.ml
+    A ./genfft-k7/exprdag.mli
+    A ./genfft-k7/fft.ml
+    A ./genfft-k7/fft.mli
+    A ./genfft-k7/genUtil.ml
+    A ./genfft-k7/genUtil.mli
+    A ./genfft-k7/gen_hc2hc.ml
+    A ./genfft-k7/gen_hc2hc.mli
+    A ./genfft-k7/gen_hc2real.ml
+    A ./genfft-k7/gen_hc2real.mli
+    A ./genfft-k7/gen_notwiddle.ml
+    A ./genfft-k7/gen_notwiddle.mli
+    A ./genfft-k7/gen_notwiddle_fixedstride.ml
+    A ./genfft-k7/gen_real2hc.ml
+    A ./genfft-k7/gen_real2hc.mli
+    A ./genfft-k7/gen_realeven.ml
+    A ./genfft-k7/gen_realeven.mli
+    A ./genfft-k7/gen_realeven2.ml
+    A ./genfft-k7/gen_realeven2.mli
+    A ./genfft-k7/gen_realodd.ml
+    A ./genfft-k7/gen_realodd.mli
+    A ./genfft-k7/gen_realodd2.ml
+    A ./genfft-k7/gen_realodd2.mli
+    A ./genfft-k7/gen_twiddle.ml
+    A ./genfft-k7/gen_twiddle.mli
+    A ./genfft-k7/genfft.ml
+    A ./genfft-k7/id.ml
+    A ./genfft-k7/id.mli
+    A ./genfft-k7/k7Basics.ml
+    A ./genfft-k7/k7Basics.mli
+    A ./genfft-k7/k7ExecutionModel.ml
+    A ./genfft-k7/k7ExecutionModel.mli
+    A ./genfft-k7/k7FlatInstructionScheduling.ml
+    A ./genfft-k7/k7FlatInstructionScheduling.mli
+    A ./genfft-k7/k7InstructionSchedulingBasics.ml
+    A ./genfft-k7/k7InstructionSchedulingBasics.mli
+    A ./genfft-k7/k7RegisterAllocationBasics.ml
+    A ./genfft-k7/k7RegisterAllocationBasics.mli
+    A ./genfft-k7/k7RegisterAllocator.ml
+    A ./genfft-k7/k7RegisterAllocator.mli
+    A ./genfft-k7/k7RegisterAllocatorEATranslation.ml
+    A ./genfft-k7/k7RegisterAllocatorEATranslation.mli
+    A ./genfft-k7/k7RegisterAllocatorInit.ml
+    A ./genfft-k7/k7RegisterAllocatorInit.mli
+    A ./genfft-k7/k7RegisterReallocation.ml
+    A ./genfft-k7/k7RegisterReallocation.mli
+    A ./genfft-k7/k7Translate.ml
+    A ./genfft-k7/k7Translate.mli
+    A ./genfft-k7/k7Unparsing.ml
+    A ./genfft-k7/k7Unparsing.mli
+    A ./genfft-k7/k7Vectorization.ml
+    A ./genfft-k7/k7Vectorization.mli
+    A ./genfft-k7/magic.ml
+    A ./genfft-k7/magic.mli
+    A ./genfft-k7/memoMonad.ml
+    A ./genfft-k7/memoMonad.mli
+    A ./genfft-k7/nonDetMonad.ml
+    A ./genfft-k7/nonDetMonad.mli
+    A ./genfft-k7/nullVectorization.ml
+    A ./genfft-k7/nullVectorization.mli
+    A ./genfft-k7/number.ml
+    A ./genfft-k7/number.mli
+    A ./genfft-k7/stateMonad.ml
+    A ./genfft-k7/stateMonad.mli
+    A ./genfft-k7/symmetry.ml
+    A ./genfft-k7/twiddle.ml
+    A ./genfft-k7/util.ml
+    A ./genfft-k7/util.mli
+    A ./genfft-k7/vAnnotatedScheduler.ml
+    A ./genfft-k7/vAnnotatedScheduler.mli
+    A ./genfft-k7/vDag.ml
+    A ./genfft-k7/vDag.mli
+    A ./genfft-k7/vFpBasics.ml
+    A ./genfft-k7/vFpBasics.mli
+    A ./genfft-k7/vFpUnparsing.ml
+    A ./genfft-k7/vFpUnparsing.mli
+    A ./genfft-k7/vImproveSchedule.ml
+    A ./genfft-k7/vImproveSchedule.mli
+    A ./genfft-k7/vK7Optimization.ml
+    A ./genfft-k7/vK7Optimization.mli
+    A ./genfft-k7/vScheduler.ml
+    A ./genfft-k7/vScheduler.mli
+    A ./genfft-k7/vSimdBasics.ml
+    A ./genfft-k7/vSimdBasics.mli
+    A ./genfft-k7/vSimdIndexing.ml
+    A ./genfft-k7/vSimdIndexing.mli
+    A ./genfft-k7/vSimdUnparsing.ml
+    A ./genfft-k7/vSimdUnparsing.mli
+    A ./genfft-k7/variable.ml
+    A ./genfft-k7/variable.mli
+    M ./AUTHORS -2 +5
+    M ./Makefile.am -1 +1
+    M ./bootstrap.sh +2
+    M ./configure.ac +1
+    M ./genfft-k7/Makefile.am +108
+    M ./genfft-k7/assignmentsToVfpinstrs.ml +225
+    M ./genfft-k7/assignmentsToVfpinstrs.mli +38
+    M ./genfft-k7/balanceVfpinstrs.ml +195
+    M ./genfft-k7/balanceVfpinstrs.mli +29
+    M ./genfft-k7/codeletMisc.ml +127
+    M ./genfft-k7/codeletMisc.mli +45
+    M ./genfft-k7/complex.ml +142
+    M ./genfft-k7/complex.mli +44
+    M ./genfft-k7/expr.ml +46
+    M ./genfft-k7/expr.mli -1 +34
+    M ./genfft-k7/exprdag.ml +879
+    M ./genfft-k7/exprdag.mli +42
+    M ./genfft-k7/fft.ml +311
+    M ./genfft-k7/fft.mli +50
+    M ./genfft-k7/genUtil.ml +377
+    M ./genfft-k7/genUtil.mli +41
+    M ./genfft-k7/gen_hc2hc.ml +133
+    M ./genfft-k7/gen_hc2hc.mli +25
+    M ./genfft-k7/gen_hc2real.ml +77
+    M ./genfft-k7/gen_hc2real.mli +24
+    M ./genfft-k7/gen_notwiddle.ml +69
+    M ./genfft-k7/gen_notwiddle.mli +25
+    M ./genfft-k7/gen_notwiddle_fixedstride.ml +45
+    M ./genfft-k7/gen_real2hc.ml +72
+    M ./genfft-k7/gen_real2hc.mli +24
+    M ./genfft-k7/gen_realeven.ml +55
+    M ./genfft-k7/gen_realeven.mli +24
+    M ./genfft-k7/gen_realeven2.ml +57
+    M ./genfft-k7/gen_realeven2.mli +24
+    M ./genfft-k7/gen_realodd.ml +60
+    M ./genfft-k7/gen_realodd.mli +24
+    M ./genfft-k7/gen_realodd2.ml +60
+    M ./genfft-k7/gen_realodd2.mli +24
+    M ./genfft-k7/gen_twiddle.ml +98
+    M ./genfft-k7/gen_twiddle.mli +25
+    M ./genfft-k7/genfft.ml +282
+    M ./genfft-k7/id.ml +61
+    M ./genfft-k7/id.mli +79
+    M ./genfft-k7/k7Basics.ml +524
+    M ./genfft-k7/k7Basics.mli +219
+    M ./genfft-k7/k7ExecutionModel.ml +111
+    M ./genfft-k7/k7ExecutionModel.mli +24
+    M ./genfft-k7/k7FlatInstructionScheduling.ml +177
+    M ./genfft-k7/k7FlatInstructionScheduling.mli +22
+    M ./genfft-k7/k7InstructionSchedulingBasics.ml +205
+    M ./genfft-k7/k7InstructionSchedulingBasics.mli +74
+    M ./genfft-k7/k7RegisterAllocationBasics.ml +162
+    M ./genfft-k7/k7RegisterAllocationBasics.mli +88
+    M ./genfft-k7/k7RegisterAllocator.ml +516
+    M ./genfft-k7/k7RegisterAllocator.mli +23
+    M ./genfft-k7/k7RegisterAllocatorEATranslation.ml +134
+    M ./genfft-k7/k7RegisterAllocatorEATranslation.mli +27
+    M ./genfft-k7/k7RegisterAllocatorInit.ml +97
+    M ./genfft-k7/k7RegisterAllocatorInit.mli +30
+    M ./genfft-k7/k7RegisterReallocation.ml +259
+    M ./genfft-k7/k7RegisterReallocation.mli +22
+    M ./genfft-k7/k7Translate.ml +142
+    M ./genfft-k7/k7Translate.mli +87
+    M ./genfft-k7/k7Unparsing.ml +326
+    M ./genfft-k7/k7Unparsing.mli +53
+    M ./genfft-k7/k7Vectorization.ml +739
+    M ./genfft-k7/k7Vectorization.mli +25
+    M ./genfft-k7/magic.ml +80
+    M ./genfft-k7/magic.mli +55
+    M ./genfft-k7/memoMonad.ml +32
+    M ./genfft-k7/memoMonad.mli +24
+    M ./genfft-k7/nonDetMonad.ml +136
+    M ./genfft-k7/nonDetMonad.mli +94
+    M ./genfft-k7/nullVectorization.ml +78
+    M ./genfft-k7/nullVectorization.mli +24
+    M ./genfft-k7/number.ml +158
+    M ./genfft-k7/number.mli +53
+    M ./genfft-k7/stateMonad.ml +71
+    M ./genfft-k7/stateMonad.mli +41
+    M ./genfft-k7/symmetry.ml +314
+    M ./genfft-k7/twiddle.ml +116
+    M ./genfft-k7/util.ml -1 +322
+    M ./genfft-k7/util.mli +124
+    M ./genfft-k7/vAnnotatedScheduler.ml +182
+    M ./genfft-k7/vAnnotatedScheduler.mli +39
+    M ./genfft-k7/vDag.ml +123
+    M ./genfft-k7/vDag.mli +52
+    M ./genfft-k7/vFpBasics.ml +203
+    M ./genfft-k7/vFpBasics.mli +106
+    M ./genfft-k7/vFpUnparsing.ml +79
+    M ./genfft-k7/vFpUnparsing.mli +28
+    M ./genfft-k7/vImproveSchedule.ml +155
+    M ./genfft-k7/vImproveSchedule.mli +22
+    M ./genfft-k7/vK7Optimization.ml +626
+    M ./genfft-k7/vK7Optimization.mli +22
+    M ./genfft-k7/vScheduler.ml +166
+    M ./genfft-k7/vScheduler.mli +31
+    M ./genfft-k7/vSimdBasics.ml +296
+    M ./genfft-k7/vSimdBasics.mli +247
+    M ./genfft-k7/vSimdIndexing.ml +119
+    M ./genfft-k7/vSimdIndexing.mli +68
+    M ./genfft-k7/vSimdUnparsing.ml +104
+    M ./genfft-k7/vSimdUnparsing.mli +33
+    M ./genfft-k7/variable.ml -1 +119
+    M ./genfft-k7/variable.mli +53
+
+Thu Jun 13 15:30:41 EDT 2002  athena
+  * [project @ 2002-06-13 19:30:41 by athena]
+  Generator for real->halfcomplex and halfcomplex->real codelets
+
+    A ./genfft/gen_hc2r.ml
+    A ./genfft/gen_r2hc.ml
+    M ./genfft/Makefile.am -9 +27
+    M ./genfft/c.ml -2 +2
+    M ./genfft/complex.ml -1 +8
+    M ./genfft/complex.mli -1 +3
+    M ./genfft/gen_hc2r.ml +121
+    M ./genfft/gen_r2hc.ml +121
+    M ./genfft/genutil.ml -8 +9
+    M ./genfft/trig.ml -2 +2
+
+Thu Jun 13 11:54:02 EDT 2002  athena
+  * [project @ 2002-06-13 15:54:02 by athena]
+  Improved hash functions, printers
+
+    M ./dft/problem.c -5 +10
+    M ./kernel/planner.c -11 +5
+    M ./kernel/tensor.c -7 +6
+    M ./tests/bench.c -1 +1
+
+Thu Jun 13 11:17:31 EDT 2002  athena
+  * [project @ 2002-06-13 15:17:31 by athena]
+  Only regenerate codlist.c in maintainer mode
+
+    M ./support/Makefile.codelets -18 +18
+
+Thu Jun 13 11:04:24 EDT 2002  athena
+  * [project @ 2002-06-13 15:04:24 by athena]
+  Planner can export solution list
+
+    M ./dft/problem.c -10 +21
+    M ./dft/rank-geq2.c -2 +2
+    M ./kernel/ifftw.h -11 +13
+    M ./kernel/planner-naive.c -7 +7
+    M ./kernel/planner-score.c -8 +7
+    M ./kernel/planner.c -42 +47
+    M ./kernel/print.c -9 +27
+    M ./kernel/tensor.c -4 +4
+    M ./tests/bench.c -5 +6
+
+Thu Jun 13 08:59:53 EDT 2002  athena
+  * [project @ 2002-06-13 12:59:53 by athena]
+  Fixed for intel compiler
+
+    M ./dft/ct-ditbuf.c -2 +3
+    M ./dft/dft.h -2 +1
+    M ./dft/direct.c -3 +1
+    M ./kernel/cycle.h -2 +2
+    M ./libbench/bench-user.h -12 +1
+
+Thu Jun 13 08:48:51 EDT 2002  athena
+  * [project @ 2002-06-13 12:48:51 by athena]
+  Revised strategy for constants in codelets
+
+    M ./dft/codelet.h -13 +2
+    M ./genfft/c.ml -50 +43
+    M ./genfft/c.mli -1 +4
+    M ./genfft/gen_notw.ml -5 +5
+    M ./genfft/gen_trig.ml -3 +3
+    M ./genfft/gen_twiddle.ml -3 +3
+    M ./genfft/gen_twidsq.ml -3 +3
+    M ./genfft/magic.ml -5 +1
+
+Thu Jun 13 06:21:31 EDT 2002  athena
+  * [project @ 2002-06-13 10:21:31 by athena]
+  Enable score planner in classic mode, naive planner in pro mode.
+
+    M ./tests/bench.c -3 +3
+
+Wed Jun 12 19:18:18 EDT 2002  athena
+  * [project @ 2002-06-12 23:18:18 by athena]
+  Report classic/pro
+
+    M ./tests/bench.c -2 +7
+
+Wed Jun 12 19:07:48 EDT 2002  athena
+  * [project @ 2002-06-12 23:07:48 by athena]
+  Fixed behavior of buffered solver for large buffers.
+
+    M ./dft/buffered.c -5 +5
+    M ./tests/bench.c -1 +1
+
+Wed Jun 12 18:57:19 EDT 2002  athena
+  * [project @ 2002-06-12 22:57:19 by athena]
+  Make assumption COST(vector) = length * COST(scalar) in classic mode.
+
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/vrank-geq1.c -1 +2
+    M ./kernel/ifftw.h -4 +7
+    M ./kernel/planner-naive.c -4 +2
+    M ./kernel/planner-score.c -5 +3
+    M ./kernel/planner.c -13 +20
+    M ./kernel/timer.c -8 +2
+    M ./libbench/timer.c -2 +2
+    M ./tests/bench.c -1 +1
+
+Wed Jun 12 18:19:48 EDT 2002  athena
+  * [project @ 2002-06-12 22:19:48 by athena]
+  Revised planner implementation in preparation for wisdom.
+
+    M ./kernel/ifftw.h -5 +6
+    M ./kernel/plan.c -1 +2
+    M ./kernel/planner-naive.c -3 +7
+    M ./kernel/planner-score.c -3 +7
+    M ./kernel/planner.c -46 +81
+    M ./support/Makefile.codelets -19 +19
+
+Wed Jun 12 08:27:36 EDT 2002  athena
+  * [project @ 2002-06-12 12:27:36 by athena]
+  Manually hoist loop invariants.
+
+    M ./dft/ct-ditbuf.c -5 +11
+
+Wed Jun 12 07:47:41 EDT 2002  athena
+  * [project @ 2002-06-12 11:47:41 by athena]
+  Revised loop to compile better with gcc -O
+
+    M ./dft/rank-geq2.c -16 +8
+    M ./dft/rank0.c -23 +22
+    M ./dft/vrank-geq1.c -2 +2
+
+Tue Jun 11 17:24:09 EDT 2002  athena
+  * [project @ 2002-06-11 21:24:09 by athena]
+  Changed tensor syntax
+
+    M ./kernel/tensor.c -4 +5
+
+Tue Jun 11 16:39:45 EDT 2002  athena
+  * [project @ 2002-06-11 20:39:45 by athena]
+  Added stuff to do.
+
+    M ./TODO +15
+
+Tue Jun 11 16:28:14 EDT 2002  athena
+  * [project @ 2002-06-11 20:28:14 by athena]
+  Report classic/pro in version number
+
+    M ./kernel/version.c -2 +7
+
+Tue Jun 11 14:22:49 EDT 2002  athena
+  * [project @ 2002-06-11 18:22:49 by athena]
+  Renamed versions into classic/pro
+
+    A ./CLASSIC-MODE
+    M ./Makefile.am -7 +7
+    R ./RESEARCH-MODE
+    M ./bootstrap.sh -1 +1
+    M ./configure.ac -13 +13
+    M ./dft/codelets/inplace/Makefile.am -1 +1
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct-ditbuf.c -2 +2
+    M ./dft/kdft-dit.c -2 +2
+    M ./dft/rank-geq2.c -4 +4
+    M ./dft/vrank-geq1.c -4 +4
+    M ./kernel/ifftw.h -3 +3
+    M ./mkdist.sh -2 +2
+    M ./tests/bench.c -1 +1
+
+Tue Jun 11 14:06:06 EDT 2002  athena
+  * [project @ 2002-06-11 18:06:06 by athena]
+  Revised planners, estimator
+
+    M ./kernel/Makefile.am -4 +4
+    M ./kernel/ifftw.h -6 +7
+    R ./kernel/planner-estimate.c
+    M ./kernel/planner-naive.c -4 +4
+    M ./kernel/planner-score.c -4 +4
+    M ./kernel/planner.c -2 +17
+    M ./tests/bench.c -5 +5
+
+Tue Jun 11 11:45:41 EDT 2002  athena
+  * [project @ 2002-06-11 15:45:41 by athena]
+  I don't know what I am doing.
+
+    M ./Makefile.am -2 +2
+    M ./dft/buffered.c -3 +2
+    M ./dft/ct-dit.c -4 +5
+    M ./dft/ct-ditbuf.c -4 +6
+    M ./dft/kdft-dif.c -3 +2
+    M ./dft/kdft-difsq.c -3 +2
+    M ./dft/kdft-dit.c -7 +4
+    M ./kernel/ifftw.h -3 +2
+
+Tue Jun 11 10:35:52 EDT 2002  athena
+  * [project @ 2002-06-11 14:35:52 by athena]
+  Massive revision of estimator
+
+    A ./kernel/ops.c
+    M ./Makefile.am -1 +1
+    M ./dft/buffered.c -8 +7
+    M ./dft/codelet.h -3 +3
+    M ./dft/ct-dif.c -5 +4
+    M ./dft/ct-dit.c -5 +7
+    M ./dft/ct-ditbuf.c -4 +9
+    M ./dft/ct-ditf.c -4 +3
+    M ./dft/ct.c -6 +1
+    M ./dft/dft.h -1 +3
+    M ./dft/direct.c -3 +2
+    M ./dft/indirect.c -3 +2
+    M ./dft/nop.c -1 +3
+    M ./dft/rank-geq2.c -3 +7
+    M ./dft/rank0.c -3 +4
+    M ./dft/vrank-geq1.c -10 +18
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -3 +4
+    M ./genfft/c.ml -2 +2
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/cycle.h -3 +3
+    R ./kernel/flops.c
+    M ./kernel/ifftw.h -13 +21
+    M ./kernel/ops.c +58
+    M ./kernel/plan.c -2 +2
+    M ./kernel/planner-estimate.c -2 +9
+    M ./kernel/planner-naive.c -3 +3
+    M ./kernel/planner-score.c -4 +4
+    M ./tests/bench.c -1 +3
+
+Tue Jun 11 07:32:20 EDT 2002  athena
+  * [project @ 2002-06-11 11:32:20 by athena]
+  Many changes
+
+    A ./dft/vrank-geq1.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/buffered.c -9 +8
+    M ./dft/codelets/inplace/Makefile.am -2 +3
+    M ./dft/codelets/standard/Makefile.am -2
+    M ./dft/conf.c -2 +2
+    M ./dft/ct-dit.c -2 +5
+    M ./dft/ct.c -2 +2
+    M ./dft/dft.h -6 +4
+    M ./dft/indirect.c -2 +2
+    M ./dft/problem.c -1 +13
+    M ./dft/rank-geq2.c -5 +4
+    R ./dft/vecloop.c
+    M ./dft/vrank-geq1.c +243
+    M ./kernel/ifftw.h -2 +6
+    M ./kernel/planner-estimate.c -2 +2
+    M ./kernel/planner-naive.c -2 +2
+    M ./kernel/planner-score.c -4 +17
+    M ./kernel/planner.c -4 +6
+    M ./kernel/print.c -6 +27
+    M ./kernel/tensor.c -1 +15
+    M ./kernel/timer.c -2 +2
+    M ./libbench/bench-main.c -3 +6
+    M ./tests/bench.c -15 +15
+
+Mon Jun 10 21:35:29 EDT 2002  athena
+  * [project @ 2002-06-11 01:35:29 by athena]
+  Keep it simple, stupid.
+
+    M ./dft/ct-ditbuf.c -21 +4
+
+Mon Jun 10 19:24:28 EDT 2002  athena
+  * [project @ 2002-06-10 23:24:28 by athena]
+  Fixed when #undef PRECOMPUTE_ARRAY_INDICES
+
+    M ./kernel/ifftw.h -3 +5
+
+Mon Jun 10 17:58:13 EDT 2002  athena
+  * [project @ 2002-06-10 21:58:13 by athena]
+  Minor changes
+
+    M ./dft/vrank3-transpose.c -5 +3
+    M ./kernel/print.c -2 +2
+
+Mon Jun 10 16:30:37 EDT 2002  athena
+  * [project @ 2002-06-10 20:30:37 by athena]
+  Added ct-ditbuf.c, many changes everywhere
+
+    A ./dft/ct-ditbuf.c
+    M ./CONVENTIONS -2 +3
+    M ./configure.ac +2
+    M ./dft/Makefile.am -3 +3
+    M ./dft/buffered.c -11 +11
+    M ./dft/ct-dif.c -5 +3
+    M ./dft/ct-dit.c -6 +4
+    M ./dft/ct-ditbuf.c +187
+    M ./dft/ct.c -3 +3
+    M ./dft/ct.h -1 +3
+    M ./dft/dft.h -1 +2
+    M ./dft/direct.c -6 +6
+    M ./dft/indirect.c -6 +6
+    M ./dft/kdft-dif.c -2 +2
+    M ./dft/kdft-difsq.c -2 +2
+    M ./dft/kdft-dit.c -3 +8
+    M ./dft/kdft.c -2 +2
+    M ./dft/nop.c -2 +2
+    M ./dft/rank-geq2.c -6 +6
+    M ./dft/rank0.c -2 +2
+    M ./dft/vecloop.c -10 +8
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./kernel/ifftw.h -3 +25
+    M ./kernel/plan.c -5 +4
+    M ./kernel/problem.c -2 +2
+    M ./kernel/timer.c -3 +3
+    M ./tests/bench.c -7 +8
+
+Mon Jun 10 10:55:40 EDT 2002  athena
+  * [project @ 2002-06-10 14:55:40 by athena]
+  More name mangling
+
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner.c -2 +2
+    M ./tests/bench.c -58 +47
+
+Mon Jun 10 10:08:27 EDT 2002  athena
+  * [project @ 2002-06-10 14:08:27 by athena]
+  Fixed build system for single/double precision
+
+    M ./Makefile.am -4 +3
+    M ./acinclude.m4 -13
+    M ./configure.ac +4
+    M ./tests/Makefile.am -1 +1
+
+Mon Jun 10 09:04:21 EDT 2002  athena
+  * [project @ 2002-06-10 13:04:21 by athena]
+  Massive renaming to support both single and double precision.
+  (Must recompile everything twice).
+
+    A ./kernel/dfftw3.h
+    A ./kernel/fftw3.h
+    A ./kernel/sfftw3.h
+    M ./CONVENTIONS -3 +9
+    M ./configure.ac -1 +1
+    M ./dft/buffered.c -96 +102
+    M ./dft/codelet.h -16 +16
+    M ./dft/codelets/inplace/Makefile.am -1 +1
+    M ./dft/codelets/standard/Makefile.am -1 +1
+    M ./dft/conf.c -12 +12
+    M ./dft/ct-dif.c -31 +33
+    M ./dft/ct-dit.c -29 +31
+    M ./dft/ct-ditf.c -27 +29
+    M ./dft/ct.c -40 +41
+    M ./dft/ct.h -10 +12
+    M ./dft/dft.h -25 +27
+    M ./dft/direct.c -26 +26
+    M ./dft/indirect.c -50 +63
+    M ./dft/kdft-dif.c -3 +3
+    M ./dft/kdft-difsq.c -3 +3
+    M ./dft/kdft-dit.c -3 +3
+    M ./dft/kdft.c -3 +3
+    M ./dft/nop.c -11 +11
+    M ./dft/plan.c -3 +3
+    M ./dft/problem.c -39 +40
+    M ./dft/rank-geq2.c -57 +59
+    M ./dft/rank0.c -54 +62
+    M ./dft/solve.c -2 +2
+    M ./dft/vecloop.c -49 +49
+    M ./dft/vrank2-transpose.c -27 +28
+    M ./dft/vrank3-transpose.c -44 +47
+    M ./genfft/gen_notw.ml -3 +3
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./genfft/genutil.ml -2 +2
+    M ./kernel/Makefile.am -1 +2
+    M ./kernel/alloc.c -76 +76
+    M ./kernel/assert.c -2 +2
+    M ./kernel/awake.c -2 +2
+    M ./kernel/cycle.h -11 +11
+    M ./kernel/dfftw3.h +24
+    R ./kernel/fftw.h
+    M ./kernel/fftw3.h +39
+    M ./kernel/flops.c -4 +6
+    M ./kernel/ifftw.h -90 +89
+    M ./kernel/minmax.c -5 +5
+    M ./kernel/plan.c -6 +6
+    M ./kernel/planner-estimate.c -6 +6
+    M ./kernel/planner-naive.c -7 +7
+    M ./kernel/planner-score.c -11 +11
+    M ./kernel/planner.c -73 +71
+    M ./kernel/print.c -12 +14
+    M ./kernel/problem.c -5 +5
+    M ./kernel/sfftw3.h +24
+    M ./kernel/solver.c -5 +5
+    M ./kernel/solvtab.c -4 +4
+    M ./kernel/square.c -2 +2
+    M ./kernel/stride.c -6 +6
+    M ./kernel/tensor.c -96 +96
+    M ./kernel/timer.c -39 +40
+    M ./kernel/twiddle.c -33 +38
+    M ./kernel/version.c -3 +4
+    M ./support/Makefile.codelets -3 +3
+    M ./tests/bench.c -39 +53
+
+Mon Jun 10 06:49:55 EDT 2002  athena
+  * [project @ 2002-06-10 10:49:55 by athena]
+  Preliminary crude support for vector transforms in benchmark library.
+
+    M ./libbench/allocate.c -7 +8
+    M ./libbench/bench-user.h -2 +5
+    M ./libbench/mflops.c -2 +2
+    M ./libbench/problem.c -2 +31
+    M ./tests/bench.c -1 +2
+
+Sun Jun  9 21:11:51 EDT 2002  athena
+  * [project @ 2002-06-10 01:11:51 by athena]
+  Wrong cast
+
+    M ./kernel/tensor.c -3 +3
+
+Sun Jun  9 16:48:54 EDT 2002  athena
+  * [project @ 2002-06-09 20:48:54 by athena]
+  Added things to do.
+
+    M ./TODO +1
+
+Sun Jun  9 16:07:12 EDT 2002  athena
+  * [project @ 2002-06-09 20:07:12 by athena]
+  twlen0: make static
+
+    M ./kernel/twiddle.c -2 +2
+
+Sun Jun  9 15:34:54 EDT 2002  athena
+  * [project @ 2002-06-09 19:34:54 by athena]
+  Nothing
+
+    M ./dft/buffered.c -3 +3
+
+Sun Jun  9 15:30:13 EDT 2002  athena
+  * [project @ 2002-06-09 19:30:13 by athena]
+  Forgot break in switch statement.
+
+    M ./kernel/print.c -1 +2
+
+Sun Jun  9 15:27:24 EDT 2002  athena
+  * [project @ 2002-06-09 19:27:24 by athena]
+  Fix for c++ compatibility
+
+    M ./kernel/print.c -4 +6
+
+Sun Jun  9 15:16:43 EDT 2002  athena
+  * [project @ 2002-06-09 19:16:43 by athena]
+  Added printer, changed everything
+
+    A ./kernel/print.c
+    M ./TODO -4
+    M ./dft/buffered.c -26 +22
+    M ./dft/ct.c -10 +5
+    M ./dft/direct.c -8 +3
+    M ./dft/indirect.c -20 +16
+    M ./dft/nop.c -4 +6
+    M ./dft/rank-geq2.c -7 +3
+    M ./dft/rank0.c -6 +3
+    M ./dft/vecloop.c -5 +3
+    M ./dft/vrank2-transpose.c -3 +3
+    M ./dft/vrank3-transpose.c -3 +3
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -4 +14
+    M ./kernel/planner.c -3 +10
+    M ./kernel/print.c +122
+    M ./tests/bench.c -2 +22
+
+Sun Jun  9 11:37:07 EDT 2002  athena
+  * [project @ 2002-06-09 15:37:07 by athena]
+  Removed redundant nop solver
+
+    M ./dft/buffered.c -8 +3
+    M ./dft/nop.c -2 +12
+    M ./dft/rank0.c -25 +3
+    M ./tests/bench.c -1 +1
+
+Sun Jun  9 11:06:31 EDT 2002  athena
+  * [project @ 2002-06-09 15:06:31 by athena]
+  More things to do
+
+    M ./TODO +10
+
+Sun Jun  9 11:01:41 EDT 2002  athena
+  * [project @ 2002-06-09 15:01:41 by athena]
+  Introduced idea of rank -infinity and associated NOP plans
+
+    A ./TODO
+    A ./dft/nop.c
+    M ./TODO +4
+    M ./dft/Makefile.am -4 +4
+    M ./dft/buffered.c -88 +66
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/direct.c -2 +3
+    M ./dft/indirect.c -1 +2
+    M ./dft/nop.c +84
+    M ./dft/problem.c -2 +5
+    M ./dft/rank0.c -3 +12
+    M ./dft/vecloop.c -1 +2
+    M ./kernel/ifftw.h -1 +11
+    M ./kernel/tensor.c -11 +39
+
+Sun Jun  9 08:36:27 EDT 2002  athena
+  * [project @ 2002-06-09 12:36:27 by athena]
+  Fixed comment
+
+    M ./dft/buffered.c -2 +2
+
+Sun Jun  9 08:20:13 EDT 2002  athena
+  * [project @ 2002-06-09 12:20:13 by athena]
+  Removed useless assertions.
+
+    M ./kernel/tensor.c -4 +3
+
+Sun Jun  9 08:19:26 EDT 2002  athena
+  * [project @ 2002-06-09 12:19:26 by athena]
+  Don't malloc(0).
+
+    M ./kernel/tensor.c -3 +7
+
+Sun Jun  9 08:08:13 EDT 2002  athena
+  * [project @ 2002-06-09 12:08:13 by athena]
+  Fixed signed/unsigned puns
+
+    M ./dft/buffered.c -11 +11
+
+Sun Jun  9 07:52:22 EDT 2002  athena
+  * [project @ 2002-06-09 11:52:22 by athena]
+  Added buffered.c
+
+    A ./dft/buffered.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/buffered.c +374
+    M ./dft/conf.c -1 +2
+    M ./dft/ct-dif.c -2 +2
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/indirect.c -3 +3
+    M ./dft/rank-geq2.c -2 +2
+    M ./dft/rank0.c -6 +6
+    M ./dft/vecloop.c -2 +2
+    M ./dft/vrank2-transpose.c -2 +2
+    M ./dft/vrank3-transpose.c -2 +2
+    M ./libbench/bench-main.c -2 +7
+    M ./libbench/bench-user.h -1 +3
+    M ./tests/bench.c -1 +1
+
+Sat Jun  8 16:57:54 EDT 2002  athena
+  * [project @ 2002-06-08 20:57:54 by athena]
+  Fixed printout
+
+    M ./dft/ct.c -6 +5
+
+Sat Jun  8 16:42:52 EDT 2002  athena
+  * [project @ 2002-06-08 20:42:52 by athena]
+  Fixed comment
+
+    M ./dft/vrank3-transpose.c -2 +2
+
+Sat Jun  8 16:40:58 EDT 2002  athena
+  * [project @ 2002-06-08 20:40:58 by athena]
+  Added vrank3-transpose, renamed vrank0-transpose -> vrank2-transpose
+
+    A ./dft/vrank2-transpose.c
+    A ./dft/vrank3-transpose.c
+    M ./dft/Makefile.am -2 +2
+    M ./dft/conf.c -2 +3
+    M ./dft/dft.h -1 +3
+    R ./dft/vrank0-transpose.c
+    M ./dft/vrank2-transpose.c +132
+    M ./dft/vrank3-transpose.c +173
+    M ./tests/bench.c -1 +1
+
+Sat Jun  8 15:51:46 EDT 2002  athena
+  * [project @ 2002-06-08 19:51:46 by athena]
+  Added vrank0-transpose
+
+    A ./dft/rank-geq2.c
+    A ./dft/vrank0-transpose.c
+    M ./bootstrap.sh -1 +1
+    M ./dft/Makefile.am -1 +2
+    M ./dft/conf.c -1 +2
+    M ./dft/direct.c -4 +4
+    M ./dft/rank-geq2.c +257
+    M ./dft/rank0.c -2 +2
+    R ./dft/rank_geq2.c
+    M ./dft/vrank0-transpose.c +132
+    M ./tests/bench.c -1 +2
+
+Sat Jun  8 15:11:09 EDT 2002  athena
+  * [project @ 2002-06-08 19:11:09 by athena]
+  Added planner-score.c
+
+    A ./kernel/planner-score.c
+    M ./dft/Makefile.am -2 +2
+    M ./dft/ct-dif.c -2 +2
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct-ditf.c -2 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/indirect.c -2 +2
+    M ./dft/rank0.c -2 +2
+    M ./dft/rank_geq2.c -2 +2
+    M ./dft/vecloop.c -2 +2
+    M ./kernel/Makefile.am -3 +3
+    M ./kernel/ifftw.h -4 +5
+    M ./kernel/planner-score.c +70
+    M ./kernel/planner.c -12 +13
+    M ./tests/bench.c -1 +4
+
+Sat Jun  8 11:10:44 EDT 2002  athena
+  * [project @ 2002-06-08 15:10:44 by athena]
+  Added indirect.c
+
+    A ./dft/indirect.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/conf.c -1 +2
+    M ./dft/ct-dif.c -2 +2
+    M ./dft/ct-dit.c -2 +2
+    M ./dft/ct-ditf.c -4 +6
+    M ./dft/dft.h -1 +3
+    M ./dft/indirect.c +227
+    M ./dft/rank_geq2.c -5 +5
+    M ./dft/vecloop.c -2 +1
+
+Sat Jun  8 09:34:58 EDT 2002  athena
+  * [project @ 2002-06-08 13:34:58 by athena]
+  dif, ditf solvers
+
+    A ./mkdist.sh
+    A ./dft/ct-dif.c
+    A ./dft/ct-ditf.c
+    A ./dft/kdft-dif.c
+    A ./dft/kdft-difsq.c
+    M ./Makefile.am -4 +6
+    M ./dft/Makefile.am -5 +3
+    M ./dft/codelet.h -2 +3
+    M ./dft/codelets/Makefile.am -5
+    M ./dft/codelets/inplace/Makefile.am -2 +4
+    M ./dft/conf.c -2 +3
+    M ./dft/ct-dif.c +120
+    M ./dft/ct-dit.c -2 +1
+    M ./dft/ct-ditf.c +106
+    M ./dft/dft.h -1 +2
+    M ./dft/direct.c -2 +2
+    M ./dft/kdft-dif.c +29
+    M ./dft/kdft-difsq.c +29
+    M ./mkdist.sh +6
+    M ./tests/Makefile.am -2 +2
+
+Fri Jun  7 18:07:53 EDT 2002  athena
+  * [project @ 2002-06-07 22:07:53 by athena]
+  Implemented rank_geq2.  Revised build system
+
+    A ./RESEARCH-MODE
+    A ./dft/rank_geq2.c
+    M ./Makefile.am -3 +12
+    M ./RESEARCH-MODE +1
+    M ./bootstrap.sh -1 +1
+    M ./configure.ac -1 +10
+    M ./dft/Makefile.am -1 +3
+    M ./dft/conf.c -1 +2
+    M ./dft/dft.h -1 +2
+    M ./dft/rank_geq2.c +257
+    M ./dft/vecloop.c -42 +48
+    M ./kernel/ifftw.h -1 +3
+    M ./kernel/minmax.c -1 +11
+    M ./kernel/planner.c -1 +2
+    M ./support/Makefile.codelets -1 +1
+
+Fri Jun  7 07:12:25 EDT 2002  athena
+  * [project @ 2002-06-07 11:12:25 by athena]
+  Fixed printout
+
+    M ./kernel/alloc.c -2 +2
+
+Fri Jun  7 07:07:46 EDT 2002  athena
+  * [project @ 2002-06-07 11:07:46 by athena]
+  Added rank0.  Revised codelet organization.
+
+    A ./dft/codelet.h
+    A ./dft/conf.c
+    A ./dft/codelets/
+    A ./dft/codelets/Makefile.am
+    A ./dft/codelets/inplace/
+    A ./dft/codelets/inplace/Makefile.am
+    A ./dft/codelets/standard/
+    A ./dft/codelets/standard/Makefile.am
+    A ./dft/rank0.c
+    M ./Makefile.am -3 +2
+    M ./bootstrap.sh -5 +1
+    M ./configure.ac -3 +18
+    M ./dft/Makefile.am -3 +4
+    M ./dft/codelet.h +118
+    M ./dft/codelets/Makefile.am +7
+    M ./dft/codelets/inplace/Makefile.am -1 +35
+    M ./dft/codelets/standard/Makefile.am -1 +37
+    M ./dft/conf.c +30
+    M ./dft/dft.h -1 +5
+    M ./dft/rank0.c +263
+    M ./genfft/gen_notw.ml -3 +3
+    M ./tests/Makefile.am -4 +5
+    M ./tests/bench.c -6 +1
+
+Thu Jun  6 18:03:17 EDT 2002  athena
+  * [project @ 2002-06-06 22:03:17 by athena]
+  Added memoization
+
+    M ./dft/ct.c -2 +2
+    M ./dft/vecloop.c -2 +2
+    M ./genfft/trig.ml -5 +5
+    M ./kernel/ifftw.h -2 +13
+    M ./kernel/planner-estimate.c -2 +2
+    M ./kernel/planner-naive.c -2 +2
+    M ./kernel/planner.c -12 +191
+    M ./libbench/bench-user.h -1 +2
+    M ./tests/bench.c -2 +16
+
+Thu Jun  6 08:07:33 EDT 2002  athena
+  * [project @ 2002-06-06 12:07:33 by athena]
+  Added vecloop
+
+    A ./dft/vecloop.c
+    M ./dft/Makefile.am -1 +1
+    M ./dft/dft.h -1 +3
+    M ./dft/direct.c -9 +7
+    M ./dft/vecloop.c +242
+    M ./kernel/alloc.c -11 +38
+    M ./kernel/ifftw.h -2 +4
+    M ./kernel/planner.c -2 +2
+    M ./tests/bench.c +5
+
+Wed Jun  5 19:02:56 EDT 2002  athena
+  * [project @ 2002-06-05 23:02:56 by athena]
+  First DIT solver/plan
+
+    A ./dft/ct-dit.c
+    M ./dft/Makefile.am -2 +2
+    M ./dft/ct-dit.c +118
+    M ./dft/ct.c -9 +16
+    M ./dft/ct.h -3 +14
+    M ./dft/dft.h -1 +2
+    M ./dft/direct.c -10 +6
+    M ./dft/kdft-dit.c -2 +3
+    M ./kernel/alloc.c -1 +2
+    M ./kernel/twiddle.c -3 +11
+
+Wed Jun  5 16:03:44 EDT 2002  athena
+  * [project @ 2002-06-05 20:03:44 by athena]
+  More work on ct
+
+    A ./dft/ct.c
+    A ./dft/ct.h
+    M ./dft/Makefile.am -2 +2
+    M ./dft/ct.c +166
+    M ./dft/ct.h +56
+    M ./kernel/ifftw.h -1 +2
+    M ./kernel/stride.c -2 +3
+    M ./kernel/twiddle.c -16 +26
+
+Wed Jun  5 11:28:09 EDT 2002  athena
+  * [project @ 2002-06-05 15:28:09 by athena]
+  Only use cycle counters
+
+    M ./kernel/ifftw.h -2 +2
+    M ./kernel/planner-naive.c -2 +2
+    M ./kernel/timer.c -166 +18
+
+Tue Jun  4 20:22:23 EDT 2002  athena
+  * [project @ 2002-06-05 00:22:23 by athena]
+  Signed/unsigned fixup
+
+    M ./CONVENTIONS +2
+    M ./bootstrap.sh -1 +1
+    M ./kernel/ifftw.h -7 +6
+    M ./kernel/twiddle.c -28 +16
+
+Tue Jun  4 20:03:56 EDT 2002  athena
+  * [project @ 2002-06-05 00:03:56 by athena]
+  New file twiddle.c
+
+    A ./kernel/twiddle.c
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -1 +24
+    M ./kernel/twiddle.c +137
+
+Tue Jun  4 17:49:39 EDT 2002  athena
+  * [project @ 2002-06-04 21:49:39 by athena]
+  Made tensor ranks and vector lengths unsigned.  Hopefully fixed
+  all places where it matters.
+
+    A ./dft/kdft-dit.c
+    M ./configure.ac +1
+    M ./dft/Makefile.am -1 +2
+    M ./dft/direct.c -3 +4
+    M ./dft/kdft-dit.c +28
+    M ./dft/problem.c -5 +7
+    M ./genfft/gen_notw.ml -4 +4
+    M ./genfft/gen_twiddle.ml -4 +4
+    M ./genfft/gen_twidsq.ml -4 +4
+    M ./kernel/Makefile.am -2 +2
+    R ./kernel/codelet.h
+    M ./kernel/flops.c -2 +2
+    M ./kernel/ifftw.h -17 +20
+    M ./kernel/tensor.c -35 +38
+    M ./kernel/timer.c -3 +2
+    M ./tests/bench.c +1
+
+Tue Jun  4 16:28:58 EDT 2002  athena
+  * [project @ 2002-06-04 20:28:58 by athena]
+  System is in working state now (but very incomplete)
+
+    A ./dft/kdft.c
+    A ./kernel/planner-estimate.c
+    A ./kernel/solvtab.c
+    A ./tests/
+    A ./tests/Makefile.am
+    A ./tests/bench.c
+    M ./Makefile.am -1 +1
+    M ./configure.ac +2
+    M ./dft/Makefile.am -2 +2
+    M ./dft/dft.h -1 +10
+    M ./dft/direct.c -3 +2
+    M ./dft/kdft.c +29
+    M ./kernel/Makefile.am -3 +4
+    M ./kernel/codelet.h -1 +3
+    M ./kernel/fftw.h -1 +5
+    M ./kernel/ifftw.h -2 +21
+    M ./kernel/planner-estimate.c +55
+    M ./kernel/planner-naive.c -9 +9
+    M ./kernel/solvtab.c +30
+    M ./libbench/Makefile.am -2 +2
+    M ./support/Makefile.codelets +23
+    M ./tests/Makefile.am -1 +7
+    M ./tests/bench.c +92
+
+Mon Jun  3 18:10:12 EDT 2002  athena
+  * [project @ 2002-06-03 22:10:12 by athena]
+  Started implementing planners
+
+    A ./kernel/planner-naive.c
+    A ./kernel/planner.c
+    M ./CONVENTIONS +2
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/ifftw.h -6 +55
+    M ./kernel/planner-naive.c +58
+    M ./kernel/planner.c +113
+
+Mon Jun  3 11:44:18 EDT 2002  athena
+  * [project @ 2002-06-03 15:44:18 by athena]
+  Imported libbench from the new benchfft.  We will use libbench
+  for benchmarking and testing.
+
+    A ./libbench/
+    A ./libbench/Makefile.am
+    A ./libbench/accopy-from.c
+    A ./libbench/accopy-to.c
+    A ./libbench/acopy.c
+    A ./libbench/allocate.c
+    A ./libbench/ascale.c
+    A ./libbench/aset.c
+    A ./libbench/bench-main.c
+    A ./libbench/bench-user.h
+    A ./libbench/bench.h
+    A ./libbench/caadd.c
+    A ./libbench/cacopy.c
+    A ./libbench/can-do.c
+    A ./libbench/cascale.c
+    A ./libbench/caset.c
+    A ./libbench/casub.c
+    A ./libbench/ccopy-from.c
+    A ./libbench/ccopy-to.c
+    A ./libbench/copy-c2c-from.c
+    A ./libbench/copy-c2c-to.c
+    A ./libbench/copy-c2h-1d-fftpack.c
+    A ./libbench/copy-c2h-1d-halfcomplex.c
+    A ./libbench/copy-c2h-1d-packed.c
+    A ./libbench/copy-c2h-1d-unpacked-ri.c
+    A ./libbench/copy-c2h-unpacked.c
+    A ./libbench/copy-c2h.c
+    A ./libbench/copy-c2r-packed.c
+    A ./libbench/copy-c2r-unpacked.c
+    A ./libbench/copy-c2r.c
+    A ./libbench/copy-c2ri.c
+    A ./libbench/copy-h2c-1d-fftpack.c
+    A ./libbench/copy-h2c-1d-halfcomplex.c
+    A ./libbench/copy-h2c-1d-packed.c
+    A ./libbench/copy-h2c-1d-unpacked-ri.c
+    A ./libbench/copy-h2c-unpacked.c
+    A ./libbench/copy-h2c.c
+    A ./libbench/copy-r2c-packed.c
+    A ./libbench/copy-r2c-unpacked.c
+    A ./libbench/copy-r2c.c
+    A ./libbench/copy-ri2c.c
+    A ./libbench/deallocate.c
+    A ./libbench/getopt-utils.c
+    A ./libbench/getopt.c
+    A ./libbench/getopt.h
+    A ./libbench/getopt1.c
+    A ./libbench/info.c
+    A ./libbench/log2.c
+    A ./libbench/main.c
+    A ./libbench/mflops.c
+    A ./libbench/ovtpvt.c
+    A ./libbench/pow2.c
+    A ./libbench/prime.c
+    A ./libbench/problem.c
+    A ./libbench/report.c
+    A ./libbench/speed.c
+    A ./libbench/timer.c
+    A ./libbench/unnormalize.c
+    A ./libbench/util.c
+    A ./libbench/verify.c
+    A ./libbench/zero.c
+    M ./Makefile.am -1 +1
+    M ./configure.ac -1 +4
+    M ./libbench/Makefile.am +18
+    M ./libbench/accopy-from.c +10
+    M ./libbench/accopy-to.c +10
+    M ./libbench/acopy.c +11
+    M ./libbench/allocate.c +40
+    M ./libbench/ascale.c +13
+    M ./libbench/aset.c +10
+    M ./libbench/bench-main.c +218
+    M ./libbench/bench-user.h +247
+    M ./libbench/bench.h +66
+    M ./libbench/caadd.c +15
+    M ./libbench/cacopy.c +11
+    M ./libbench/can-do.c +33
+    M ./libbench/cascale.c +15
+    M ./libbench/caset.c +10
+    M ./libbench/casub.c +15
+    M ./libbench/ccopy-from.c +21
+    M ./libbench/ccopy-to.c +20
+    M ./libbench/copy-c2c-from.c +7
+    M ./libbench/copy-c2c-to.c +7
+    M ./libbench/copy-c2h-1d-fftpack.c +29
+    M ./libbench/copy-c2h-1d-halfcomplex.c +29
+    M ./libbench/copy-c2h-1d-packed.c +40
+    M ./libbench/copy-c2h-1d-unpacked-ri.c +25
+    M ./libbench/copy-c2h-unpacked.c +32
+    M ./libbench/copy-c2h.c +10
+    M ./libbench/copy-c2r-packed.c +11
+    M ./libbench/copy-c2r-unpacked.c +24
+    M ./libbench/copy-c2r.c +6
+    M ./libbench/copy-c2ri.c +15
+    M ./libbench/copy-h2c-1d-fftpack.c +31
+    M ./libbench/copy-h2c-1d-halfcomplex.c +34
+    M ./libbench/copy-h2c-1d-packed.c +44
+    M ./libbench/copy-h2c-1d-unpacked-ri.c +35
+    M ./libbench/copy-h2c-unpacked.c +51
+    M ./libbench/copy-h2c.c +10
+    M ./libbench/copy-r2c-packed.c +13
+    M ./libbench/copy-r2c-unpacked.c +26
+    M ./libbench/copy-r2c.c +6
+    M ./libbench/copy-ri2c.c +15
+    M ./libbench/deallocate.c +14
+    M ./libbench/getopt-utils.c +104
+    M ./libbench/getopt.c +1062
+    M ./libbench/getopt.h +180
+    M ./libbench/getopt1.c +188
+    M ./libbench/info.c +57
+    M ./libbench/log2.c +13
+    M ./libbench/main.c +40
+    M ./libbench/mflops.c +22
+    M ./libbench/ovtpvt.c +13
+    M ./libbench/pow2.c +6
+    M ./libbench/prime.c +49
+    M ./libbench/problem.c +126
+    M ./libbench/report.c +159
+    M ./libbench/speed.c +73
+    M ./libbench/timer.c +218
+    M ./libbench/unnormalize.c +13
+    M ./libbench/util.c +189
+    M ./libbench/verify.c +408
+    M ./libbench/zero.c +37
+
+Mon Jun  3 09:18:46 EDT 2002  athena
+  * [project @ 2002-06-03 13:18:46 by athena]
+  Removed useless rand.c
+
+    M ./kernel/Makefile.am -2 +2
+    M ./kernel/ifftw.h -4 +1
+    R ./kernel/rand.c
+    M ./kernel/timer.c -2 +2
+
+Mon Jun  3 08:09:05 EDT 2002  athena
+  * [project @ 2002-06-03 12:09:05 by athena]
+  Added timer
+
+    A ./kernel/cycle.h
+    A ./kernel/timer.c
+    M ./CONVENTIONS +1
+    M ./dft/problem.c -2 +2
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/alloc.c -2 +2
+    M ./kernel/cycle.h +204
+    M ./kernel/ifftw.h -2 +10
+    M ./kernel/plan.c -2 +4
+    M ./kernel/timer.c +293
+
+Sun Jun  2 21:03:51 EDT 2002  athena
+  * [project @ 2002-06-03 01:03:51 by athena]
+  Split codelets into standard and inplace
+
+    M ./configure.ac -2 +4
+
+Sun Jun  2 19:49:03 EDT 2002  athena
+  * [project @ 2002-06-02 23:49:03 by athena]
+  Many many changes
+
+    A ./CONVENTIONS
+    A ./dft/direct.c
+    A ./dft/plan.c
+    A ./dft/solve.c
+    A ./kernel/awake.c
+    A ./kernel/square.c
+    M ./CONVENTIONS +49
+    M ./Makefile.am -4 +4
+    M ./dft/Makefile.am -1 +1
+    M ./dft/dft.h -3 +22
+    M ./dft/direct.c +171
+    M ./dft/plan.c +33
+    M ./dft/problem.c -10 +10
+    M ./dft/solve.c +31
+    M ./kernel/Makefile.am -1 +1
+    M ./kernel/awake.c +30
+    M ./kernel/ifftw.h -13 +33
+    M ./kernel/square.c +28
+
+Sun Jun  2 15:00:11 EDT 2002  athena
+  * [project @ 2002-06-02 19:00:11 by athena]
+  Fixed anachronism
+
+    M ./kernel/codelet.h -4 +4
+
+Sat Aug 12 21:43:16 EDT 2006  Unknown tagger
+  tagged bar
+
+
+Sun Jun  2 14:42:32 EDT 2002  athena
+  * [project @ 2002-06-02 18:42:32 by athena]
+  Initial revision
+
+    A ./AUTHORS
+    A ./COPYRIGHT
+    A ./ChangeLog
+    A ./Makefile.am
+    A ./NEWS
+    A ./README
+    A ./acinclude.m4
+    A ./bootstrap.sh
+    A ./configure.ac
+    A ./dft/
+    A ./dft/Makefile.am
+    A ./dft/dft.h
+    A ./dft/problem.c
+    A ./genfft/
+    A ./genfft/Makefile.am
+    A ./genfft/algsimp.ml
+    A ./genfft/algsimp.mli
+    A ./genfft/annotate.ml
+    A ./genfft/annotate.mli
+    A ./genfft/assoctable.ml
+    A ./genfft/assoctable.mli
+    A ./genfft/c.ml
+    A ./genfft/c.mli
+    A ./genfft/complex.ml
+    A ./genfft/complex.mli
+    A ./genfft/conv.ml
+    A ./genfft/conv.mli
+    A ./genfft/dag.ml
+    A ./genfft/dag.mli
+    A ./genfft/expr.ml
+    A ./genfft/expr.mli
+    A ./genfft/fft.ml
+    A ./genfft/fft.mli
+    A ./genfft/gen_athnotw.ml
+    A ./genfft/gen_athtw.ml
+    A ./genfft/gen_conv.ml
+    A ./genfft/gen_notw.ml
+    A ./genfft/gen_trig.ml
+    A ./genfft/gen_twiddle.ml
+    A ./genfft/gen_twidsq.ml
+    A ./genfft/genutil.ml
+    A ./genfft/littlesimp.ml
+    A ./genfft/littlesimp.mli
+    A ./genfft/magic.ml
+    A ./genfft/monads.ml
+    A ./genfft/number.ml
+    A ./genfft/number.mli
+    A ./genfft/oracle.ml
+    A ./genfft/oracle.mli
+    A ./genfft/schedule.ml
+    A ./genfft/schedule.mli
+    A ./genfft/to_alist.ml
+    A ./genfft/to_alist.mli
+    A ./genfft/trig.ml
+    A ./genfft/trig.mli
+    A ./genfft/twiddle.ml
+    A ./genfft/twiddle.mli
+    A ./genfft/unique.ml
+    A ./genfft/unique.mli
+    A ./genfft/util.ml
+    A ./genfft/util.mli
+    A ./genfft/variable.ml
+    A ./genfft/variable.mli
+    A ./kernel/
+    A ./kernel/Makefile.am
+    A ./kernel/alloc.c
+    A ./kernel/assert.c
+    A ./kernel/codelet.h
+    A ./kernel/fftw.h
+    A ./kernel/flops.c
+    A ./kernel/ifftw.h
+    A ./kernel/minmax.c
+    A ./kernel/plan.c
+    A ./kernel/problem.c
+    A ./kernel/rand.c
+    A ./kernel/solver.c
+    A ./kernel/stride.c
+    A ./kernel/tensor.c
+    A ./kernel/version.c
+    A ./support/
+    A ./support/Makefile.am
+    A ./support/Makefile.codelets
+    A ./support/codelet_prelude
+    M ./AUTHORS +4
+    M ./COPYRIGHT +19
+    M ./Makefile.am +21
+    M ./acinclude.m4 +356
+    M ./bootstrap.sh +26
+    M ./configure.ac +92
+    M ./dft/Makefile.am +6
+    M ./dft/dft.h +30
+    M ./dft/problem.c +121
+    M ./genfft/Makefile.am +136
+    M ./genfft/algsimp.ml +517
+    M ./genfft/algsimp.mli +24
+    M ./genfft/annotate.ml +180
+    M ./genfft/annotate.mli +37
+    M ./genfft/assoctable.ml +66
+    M ./genfft/assoctable.mli +30
+    M ./genfft/c.ml +445
+    M ./genfft/c.mli +65
+    M ./genfft/complex.ml +198
+    M ./genfft/complex.mli +61
+    M ./genfft/conv.ml +130
+    M ./genfft/conv.mli +23
+    M ./genfft/dag.ml +110
+    M ./genfft/dag.mli +44
+    M ./genfft/expr.ml +93
+    M ./genfft/expr.mli +38
+    M ./genfft/fft.ml +227
+    M ./genfft/fft.mli +24
+    M ./genfft/gen_athnotw.ml +80
+    M ./genfft/gen_athtw.ml +108
+    M ./genfft/gen_conv.ml +89
+    M ./genfft/gen_notw.ml +126
+    M ./genfft/gen_trig.ml +159
+    M ./genfft/gen_twiddle.ml +144
+    M ./genfft/gen_twidsq.ml +172
+    M ./genfft/genutil.ml +256
+    M ./genfft/littlesimp.ml +72
+    M ./genfft/littlesimp.mli +26
+    M ./genfft/magic.ml +121
+    M ./genfft/monads.ml +76
+    M ./genfft/number.ml +153
+    M ./genfft/number.mli +49
+    M ./genfft/oracle.ml +138
+    M ./genfft/oracle.mli +25
+    M ./genfft/schedule.ml +186
+    M ./genfft/schedule.mli +30
+    M ./genfft/to_alist.ml +258
+    M ./genfft/to_alist.mli +24
+    M ./genfft/trig.ml +158
+    M ./genfft/trig.mli +35
+    M ./genfft/twiddle.ml +165
+    M ./genfft/twiddle.mli +29
+    M ./genfft/unique.ml +39
+    M ./genfft/unique.mli +25
+    M ./genfft/util.ml +177
+    M ./genfft/util.mli +50
+    M ./genfft/variable.ml +99
+    M ./genfft/variable.mli +35
+    M ./kernel/Makefile.am +5
+    M ./kernel/alloc.c +217
+    M ./kernel/assert.c +32
+    M ./kernel/codelet.h +126
+    M ./kernel/fftw.h +29
+    M ./kernel/flops.c +41
+    M ./kernel/ifftw.h +215
+    M ./kernel/minmax.c +33
+    M ./kernel/plan.c +50
+    M ./kernel/problem.c +47
+    M ./kernel/rand.c +36
+    M ./kernel/solver.c +43
+    M ./kernel/stride.c +41
+    M ./kernel/tensor.c +318
+    M ./kernel/version.c +26
+    M ./support/Makefile.am -1 +1
+    M ./support/Makefile.codelets +99
+    M ./support/codelet_prelude +8
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/INSTALL
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/INSTALL	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,370 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation,
+Inc.
+
+   Copying and distribution of this file, with or without modification,
+are permitted in any medium without royalty provided the copyright
+notice and this notice are preserved.  This file is offered as-is,
+without warranty of any kind.
+
+Basic Installation
+==================
+
+   Briefly, the shell commands `./configure; make; make install' should
+configure, build, and install this package.  The following
+more-detailed instructions are generic; see the `README' file for
+instructions specific to this package.  Some packages provide this
+`INSTALL' file but do not implement all of the features documented
+below.  The lack of an optional feature in a given package is not
+necessarily a bug.  More recommendations for GNU packages can be found
+in *note Makefile Conventions: (standards)Makefile Conventions.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You need `configure.ac' if
+you want to change it or regenerate `configure' using a newer version
+of `autoconf'.
+
+   The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.
+
+     Running `configure' might take a while.  While running, it prints
+     some messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package, generally using the just-built uninstalled binaries.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.  When installing into a prefix owned by root, it is
+     recommended that the package be configured and built as a regular
+     user, and only the `make install' phase executed with root
+     privileges.
+
+  5. Optionally, type `make installcheck' to repeat any self-tests, but
+     this time using the binaries in their final installed location.
+     This target does not install anything.  Running this target as a
+     regular user, particularly if the prior `make install' required
+     root privileges, verifies that the installation completed
+     correctly.
+
+  6. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+  7. Often, you can also type `make uninstall' to remove the installed
+     files again.  In practice, not all packages have tested that
+     uninstallation works correctly, even though it is required by the
+     GNU Coding Standards.
+
+  8. Some packages, particularly those that use Automake, provide `make
+     distcheck', which can by used by developers to test that all other
+     targets like `make install' and `make uninstall' work correctly.
+     This target is generally not run by end users.
+
+Compilers and Options
+=====================
+
+   Some systems require unusual options for compilation or linking that
+the `configure' script does not know about.  Run `./configure --help'
+for details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+   You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you can use GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.  This
+is known as a "VPATH" build.
+
+   With a non-GNU `make', it is safer to compile the package for one
+architecture at a time in the source code directory.  After you have
+installed the package for one architecture, use `make distclean' before
+reconfiguring for another architecture.
+
+   On MacOS X 10.5 and later systems, you can create libraries and
+executables that work on multiple system types--known as "fat" or
+"universal" binaries--by specifying multiple `-arch' options to the
+compiler but only a single `-arch' option to the preprocessor.  Like
+this:
+
+     ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
+                 CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
+                 CPP="gcc -E" CXXCPP="g++ -E"
+
+   This is not guaranteed to produce working output in all cases, you
+may have to build one architecture at a time and combine the results
+using the `lipo' tool if you have problems.
+
+Installation Names
+==================
+
+   By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX', where PREFIX must be an
+absolute file name.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.  In general, the
+default for these options is expressed in terms of `${prefix}', so that
+specifying just `--prefix' will affect all of the other directory
+specifications that were not explicitly provided.
+
+   The most portable way to affect installation locations is to pass the
+correct locations to `configure'; however, many packages provide one or
+both of the following shortcuts of passing variable assignments to the
+`make install' command line to change installation locations without
+having to reconfigure or recompile.
+
+   The first method involves providing an override variable for each
+affected directory.  For example, `make install
+prefix=/alternate/directory' will choose an alternate location for all
+directory configuration variables that were expressed in terms of
+`${prefix}'.  Any directories that were specified during `configure',
+but not in terms of `${prefix}', must each be overridden at install
+time for the entire installation to be relocated.  The approach of
+makefile variable overrides for each directory variable is required by
+the GNU Coding Standards, and ideally causes no recompilation.
+However, some platforms have known limitations with the semantics of
+shared libraries that end up requiring recompilation when using this
+method, particularly noticeable in packages that use GNU Libtool.
+
+   The second method involves providing the `DESTDIR' variable.  For
+example, `make install DESTDIR=/alternate/directory' will prepend
+`/alternate/directory' before all installation names.  The approach of
+`DESTDIR' overrides is not required by the GNU Coding Standards, and
+does not work on platforms that have drive letters.  On the other hand,
+it does better at avoiding recompilation issues, and works well even
+when some directory options were not specified in terms of `${prefix}'
+at `configure' time.
+
+Optional Features
+=================
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+   Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+   Some packages offer the ability to configure how verbose the
+execution of `make' will be.  For these packages, running `./configure
+--enable-silent-rules' sets the default to minimal output, which can be
+overridden with `make V=1'; while running `./configure
+--disable-silent-rules' sets the default to verbose, which can be
+overridden with `make V=0'.
+
+Particular systems
+==================
+
+   On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
+CC is not installed, it is recommended to use the following options in
+order to use an ANSI C compiler:
+
+     ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
+
+and if that doesn't work, install pre-built binaries of GCC for HP-UX.
+
+   HP-UX `make' updates targets which have the same time stamps as
+their prerequisites, which makes it generally unusable when shipped
+generated files such as `configure' are involved.  Use GNU `make'
+instead.
+
+   On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
+parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
+a workaround.  If GNU CC is not installed, it is therefore recommended
+to try
+
+     ./configure CC="cc"
+
+and if that doesn't work, try
+
+     ./configure CC="cc -nodtk"
+
+   On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
+directory contains several dysfunctional programs; working variants of
+these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
+in your `PATH', put it _after_ `/usr/bin'.
+
+   On Haiku, software installed for all users goes in `/boot/common',
+not `/usr/local'.  It is recommended to use the following options:
+
+     ./configure --prefix=/boot/common
+
+Specifying the System Type
+==========================
+
+   There may be some features `configure' cannot figure out
+automatically, but needs to determine by the type of machine the package
+will run on.  Usually, assuming the package is built to be run on the
+_same_ architectures, `configure' can figure that out, but if it prints
+a message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS
+     KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+   If you want to set default values for `configure' scripts to share,
+you can create a site shell script called `config.site' that gives
+default values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+   Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).
+
+Unfortunately, this technique does not work for `CONFIG_SHELL' due to
+an Autoconf bug.  Until the bug is fixed you can use this workaround:
+
+     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+`configure' Invocation
+======================
+
+   `configure' recognizes the following options to control how it
+operates.
+
+`--help'
+`-h'
+     Print a summary of all of the options to `configure', and exit.
+
+`--help=short'
+`--help=recursive'
+     Print a summary of the options unique to this package's
+     `configure', and exit.  The `short' variant lists options used
+     only in the top level, while the `recursive' variant lists options
+     also present in any nested packages.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`--prefix=DIR'
+     Use DIR as the installation prefix.  *note Installation Names::
+     for more details, including other options available for fine-tuning
+     the installation locations.
+
+`--no-create'
+`-n'
+     Run the configure checks, but stop before creating any output
+     files.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,108 @@
+OPTIONS_AUTOMAKE=gnu
+lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# nodist_pkginclude_HEADERS = config.h
+
+# recompile genfft if maintainer mode is true
+if MAINTAINER_MODE
+GENFFT = genfft
+else
+GENFFT =
+endif
+
+ACLOCAL_AMFLAGS=-I m4
+
+# when using combined thread libraries (necessary on Windows), we want
+# to build threads/ first, because libfftw3_threads is added to
+# libfftw3.
+#
+# Otherwise, we want to build libfftw3_threads after libfftw3
+# so that we can track the fact that libfftw3_threads depends upon
+# libfftw3.
+#
+# This is the inescapable result of combining three bad ideas
+# (threads, Windows, and shared libraries).
+#
+if COMBINED_THREADS
+CHICKEN_EGG=threads .
+else
+CHICKEN_EGG=. threads
+endif
+
+SUBDIRS=support $(GENFFT) kernel simd-support dft rdft reodft api	\
+libbench2 $(CHICKEN_EGG) tests mpi doc tools m4
+EXTRA_DIST=COPYRIGHT bootstrap.sh CONVENTIONS fftw.pc.in
+
+SIMD_LIBS =						\
+	simd-support/libsimd_support.la			\
+	simd-support/libsimd_sse2_nonportable.la
+
+if HAVE_SSE2
+SSE2_LIBS = dft/simd/sse2/libdft_sse2_codelets.la	\
+rdft/simd/sse2/librdft_sse2_codelets.la
+endif
+
+if HAVE_AVX
+AVX_LIBS = dft/simd/avx/libdft_avx_codelets.la	\
+rdft/simd/avx/librdft_avx_codelets.la
+endif
+
+if HAVE_ALTIVEC
+ALTIVEC_LIBS = dft/simd/altivec/libdft_altivec_codelets.la	\
+rdft/simd/altivec/librdft_altivec_codelets.la
+endif
+
+if HAVE_NEON
+NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la	\
+rdft/simd/neon/librdft_neon_codelets.la
+endif
+
+if THREADS
+if COMBINED_THREADS
+COMBINED_THREADLIBS=threads/libfftw3@PREC_SUFFIX@_threads.la
+endif
+endif
+
+libfftw3@PREC_SUFFIX@_la_SOURCES = 
+
+libfftw3@PREC_SUFFIX@_la_LIBADD =			\
+	kernel/libkernel.la				\
+	dft/libdft.la					\
+	dft/scalar/libdft_scalar.la			\
+	dft/scalar/codelets/libdft_scalar_codelets.la	\
+	rdft/librdft.la					\
+	rdft/scalar/librdft_scalar.la			\
+	rdft/scalar/r2cf/librdft_scalar_r2cf.la		\
+	rdft/scalar/r2cb/librdft_scalar_r2cb.la		\
+	rdft/scalar/r2r/librdft_scalar_r2r.la		\
+	reodft/libreodft.la				\
+	api/libapi.la					\
+        $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(ALTIVEC_LIBS) $(NEON_LIBS)    	\
+	$(COMBINED_THREADLIBS)
+
+if QUAD
+# cannot use -no-undefined since dependent on libquadmath
+libfftw3@PREC_SUFFIX@_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+else
+libfftw3@PREC_SUFFIX@_la_LDFLAGS = -no-undefined -version-info	\
+@SHARED_VERSION_INFO@
+endif
+
+fftw3@PREC_SUFFIX@.pc: fftw.pc
+	cp -f fftw.pc fftw3@PREC_SUFFIX@.pc
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = fftw3@PREC_SUFFIX@.pc
+
+WISDOM_DIR = /etc/fftw
+WISDOM = wisdom@PREC_SUFFIX@
+
+WISDOM_TIME=12 # default to 12-hour limit, i.e. overnight
+WISDOM_FLAGS=--verbose --canonical --time-limit=$(WISDOM_TIME)
+
+wisdom:
+	tools/fftw@PREC_SUFFIX@-wisdom -o $@ $(WISDOM_FLAGS)
+
+install-wisdom: wisdom
+	$(mkinstalldirs) $(WISDOM_DIR)
+	$(INSTALL_DATA) wisdom $(WISDOM_DIR)/$(WISDOM)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1008 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = .
+DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in $(srcdir)/config.h.in \
+	$(srcdir)/fftw.pc.in $(top_srcdir)/configure AUTHORS COPYING \
+	ChangeLog INSTALL NEWS TODO compile config.guess config.sub \
+	depcomp install-sh ltmain.sh missing
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
+ configure.lineno config.status.lineno
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = config.h
+CONFIG_CLEAN_FILES = fftw.pc
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(pkgconfigdir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libfftw3@PREC_SUFFIX@_la_DEPENDENCIES = kernel/libkernel.la \
+	dft/libdft.la dft/scalar/libdft_scalar.la \
+	dft/scalar/codelets/libdft_scalar_codelets.la rdft/librdft.la \
+	rdft/scalar/librdft_scalar.la \
+	rdft/scalar/r2cf/librdft_scalar_r2cf.la \
+	rdft/scalar/r2cb/librdft_scalar_r2cb.la \
+	rdft/scalar/r2r/librdft_scalar_r2r.la reodft/libreodft.la \
+	api/libapi.la $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) \
+	$(ALTIVEC_LIBS) $(NEON_LIBS) $(COMBINED_THREADLIBS)
+am_libfftw3@PREC_SUFFIX@_la_OBJECTS =
+libfftw3@PREC_SUFFIX@_la_OBJECTS =  \
+	$(am_libfftw3@PREC_SUFFIX@_la_OBJECTS)
+libfftw3@PREC_SUFFIX@_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(libfftw3@PREC_SUFFIX@_la_LDFLAGS) $(LDFLAGS) -o $@
+DEFAULT_INCLUDES = -I.@am__isrc@
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libfftw3@PREC_SUFFIX@_la_SOURCES)
+DIST_SOURCES = $(libfftw3@PREC_SUFFIX@_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DATA = $(pkgconfig_DATA)
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir dist dist-all distcheck
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = support genfft kernel simd-support dft rdft reodft api \
+	libbench2 . threads tests mpi doc tools m4
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+distdir = $(PACKAGE)-$(VERSION)
+top_distdir = $(distdir)
+am__remove_distdir = \
+  if test -d "$(distdir)"; then \
+    find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \
+      && rm -rf "$(distdir)" \
+      || { sleep 5 && rm -rf "$(distdir)"; }; \
+  else :; fi
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+DIST_ARCHIVES = $(distdir).tar.gz
+GZIP_ENV = --best
+distuninstallcheck_listfiles = find . -type f -print
+am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
+  | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$'
+distcleancheck_listfiles = find . -type f -print
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+OPTIONS_AUTOMAKE = gnu
+lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@.la
+@MAINTAINER_MODE_FALSE@GENFFT = 
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# nodist_pkginclude_HEADERS = config.h
+
+# recompile genfft if maintainer mode is true
+@MAINTAINER_MODE_TRUE@GENFFT = genfft
+ACLOCAL_AMFLAGS = -I m4
+@COMBINED_THREADS_FALSE@CHICKEN_EGG = . threads
+
+# when using combined thread libraries (necessary on Windows), we want
+# to build threads/ first, because libfftw3_threads is added to
+# libfftw3.
+#
+# Otherwise, we want to build libfftw3_threads after libfftw3
+# so that we can track the fact that libfftw3_threads depends upon
+# libfftw3.
+#
+# This is the inescapable result of combining three bad ideas
+# (threads, Windows, and shared libraries).
+#
+@COMBINED_THREADS_TRUE@CHICKEN_EGG = threads .
+SUBDIRS = support $(GENFFT) kernel simd-support dft rdft reodft api	\
+libbench2 $(CHICKEN_EGG) tests mpi doc tools m4
+
+EXTRA_DIST = COPYRIGHT bootstrap.sh CONVENTIONS fftw.pc.in
+SIMD_LIBS = \
+	simd-support/libsimd_support.la			\
+	simd-support/libsimd_sse2_nonportable.la
+
+@HAVE_SSE2_TRUE@SSE2_LIBS = dft/simd/sse2/libdft_sse2_codelets.la	\
+@HAVE_SSE2_TRUE@rdft/simd/sse2/librdft_sse2_codelets.la
+
+@HAVE_AVX_TRUE@AVX_LIBS = dft/simd/avx/libdft_avx_codelets.la	\
+@HAVE_AVX_TRUE@rdft/simd/avx/librdft_avx_codelets.la
+
+@HAVE_ALTIVEC_TRUE@ALTIVEC_LIBS = dft/simd/altivec/libdft_altivec_codelets.la	\
+@HAVE_ALTIVEC_TRUE@rdft/simd/altivec/librdft_altivec_codelets.la
+
+@HAVE_NEON_TRUE@NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la	\
+@HAVE_NEON_TRUE@rdft/simd/neon/librdft_neon_codelets.la
+
+@COMBINED_THREADS_TRUE@@THREADS_TRUE@COMBINED_THREADLIBS = threads/libfftw3@PREC_SUFFIX@_threads.la
+libfftw3@PREC_SUFFIX@_la_SOURCES = 
+libfftw3@PREC_SUFFIX@_la_LIBADD = \
+	kernel/libkernel.la				\
+	dft/libdft.la					\
+	dft/scalar/libdft_scalar.la			\
+	dft/scalar/codelets/libdft_scalar_codelets.la	\
+	rdft/librdft.la					\
+	rdft/scalar/librdft_scalar.la			\
+	rdft/scalar/r2cf/librdft_scalar_r2cf.la		\
+	rdft/scalar/r2cb/librdft_scalar_r2cb.la		\
+	rdft/scalar/r2r/librdft_scalar_r2r.la		\
+	reodft/libreodft.la				\
+	api/libapi.la					\
+        $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(ALTIVEC_LIBS) $(NEON_LIBS)    	\
+	$(COMBINED_THREADLIBS)
+
+@QUAD_FALSE@libfftw3@PREC_SUFFIX@_la_LDFLAGS = -no-undefined -version-info	\
+@QUAD_FALSE@@SHARED_VERSION_INFO@
+
+
+# cannot use -no-undefined since dependent on libquadmath
+@QUAD_TRUE@libfftw3@PREC_SUFFIX@_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = fftw3@PREC_SUFFIX@.pc
+WISDOM_DIR = /etc/fftw
+WISDOM = wisdom@PREC_SUFFIX@
+WISDOM_TIME = 12 # default to 12-hour limit, i.e. overnight
+WISDOM_FLAGS = --verbose --canonical --time-limit=$(WISDOM_TIME)
+all: config.h
+	$(MAKE) $(AM_MAKEFLAGS) all-recursive
+
+.SUFFIXES:
+am--refresh: Makefile
+	@:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      echo ' cd $(srcdir) && $(AUTOMAKE) --gnu'; \
+	      $(am__cd) $(srcdir) && $(AUTOMAKE) --gnu \
+		&& exit 0; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    echo ' $(SHELL) ./config.status'; \
+	    $(SHELL) ./config.status;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	$(SHELL) ./config.status --recheck
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	$(am__cd) $(srcdir) && $(AUTOCONF)
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	$(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
+$(am__aclocal_m4_deps):
+
+config.h: stamp-h1
+	@if test ! -f $@; then rm -f stamp-h1; else :; fi
+	@if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi
+
+stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
+	@rm -f stamp-h1
+	cd $(top_builddir) && $(SHELL) ./config.status config.h
+$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) 
+	($(am__cd) $(top_srcdir) && $(AUTOHEADER))
+	rm -f stamp-h1
+	touch $@
+
+distclean-hdr:
+	-rm -f config.h stamp-h1
+fftw.pc: $(top_builddir)/config.status $(srcdir)/fftw.pc.in
+	cd $(top_builddir) && $(SHELL) ./config.status $@
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libfftw3@PREC_SUFFIX@.la: $(libfftw3@PREC_SUFFIX@_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_la_DEPENDENCIES) $(EXTRA_libfftw3@PREC_SUFFIX@_la_DEPENDENCIES) 
+	$(libfftw3@PREC_SUFFIX@_la_LINK) -rpath $(libdir) $(libfftw3@PREC_SUFFIX@_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+distclean-libtool:
+	-rm -f libtool config.lt
+install-pkgconfigDATA: $(pkgconfig_DATA)
+	@$(NORMAL_INSTALL)
+	@list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(pkgconfigdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(pkgconfigdir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgconfigdir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgconfigdir)" || exit $$?; \
+	done
+
+uninstall-pkgconfigDATA:
+	@$(NORMAL_UNINSTALL)
+	@list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir)
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	$(am__remove_distdir)
+	test -d "$(distdir)" || mkdir "$(distdir)"
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+	-test -n "$(am__skip_mode_fix)" \
+	|| find "$(distdir)" -type d ! -perm -755 \
+		-exec chmod u+rwx,go+rx {} \; -o \
+	  ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+	  ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+	  ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \
+	|| chmod -R a+r "$(distdir)"
+dist-gzip: distdir
+	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
+	$(am__remove_distdir)
+
+dist-bzip2: distdir
+	tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2
+	$(am__remove_distdir)
+
+dist-lzip: distdir
+	tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz
+	$(am__remove_distdir)
+
+dist-lzma: distdir
+	tardir=$(distdir) && $(am__tar) | lzma -9 -c >$(distdir).tar.lzma
+	$(am__remove_distdir)
+
+dist-xz: distdir
+	tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz
+	$(am__remove_distdir)
+
+dist-tarZ: distdir
+	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
+	$(am__remove_distdir)
+
+dist-shar: distdir
+	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
+	$(am__remove_distdir)
+
+dist-zip: distdir
+	-rm -f $(distdir).zip
+	zip -rq $(distdir).zip $(distdir)
+	$(am__remove_distdir)
+
+dist dist-all: distdir
+	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
+	$(am__remove_distdir)
+
+# This target untars the dist file and tries a VPATH configuration.  Then
+# it guarantees that the distribution is self-contained by making another
+# tarfile.
+distcheck: dist
+	case '$(DIST_ARCHIVES)' in \
+	*.tar.gz*) \
+	  GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\
+	*.tar.bz2*) \
+	  bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\
+	*.tar.lzma*) \
+	  lzma -dc $(distdir).tar.lzma | $(am__untar) ;;\
+	*.tar.lz*) \
+	  lzip -dc $(distdir).tar.lz | $(am__untar) ;;\
+	*.tar.xz*) \
+	  xz -dc $(distdir).tar.xz | $(am__untar) ;;\
+	*.tar.Z*) \
+	  uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
+	*.shar.gz*) \
+	  GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\
+	*.zip*) \
+	  unzip $(distdir).zip ;;\
+	esac
+	chmod -R a-w $(distdir); chmod u+w $(distdir)
+	mkdir $(distdir)/_build
+	mkdir $(distdir)/_inst
+	chmod a-w $(distdir)
+	test -d $(distdir)/_build || exit 0; \
+	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
+	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
+	  && am__cwd=`pwd` \
+	  && $(am__cd) $(distdir)/_build \
+	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
+	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
+	    $(DISTCHECK_CONFIGURE_FLAGS) \
+	  && $(MAKE) $(AM_MAKEFLAGS) \
+	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
+	  && $(MAKE) $(AM_MAKEFLAGS) check \
+	  && $(MAKE) $(AM_MAKEFLAGS) install \
+	  && $(MAKE) $(AM_MAKEFLAGS) installcheck \
+	  && $(MAKE) $(AM_MAKEFLAGS) uninstall \
+	  && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
+	        distuninstallcheck \
+	  && chmod -R a-w "$$dc_install_base" \
+	  && ({ \
+	       (cd ../.. && umask 077 && mkdir "$$dc_destdir") \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
+	            distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
+	      } || { rm -rf "$$dc_destdir"; exit 1; }) \
+	  && rm -rf "$$dc_destdir" \
+	  && $(MAKE) $(AM_MAKEFLAGS) dist \
+	  && rm -rf $(DIST_ARCHIVES) \
+	  && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \
+	  && cd "$$am__cwd" \
+	  || exit 1
+	$(am__remove_distdir)
+	@(echo "$(distdir) archives ready for distribution: "; \
+	  list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
+	  sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x'
+distuninstallcheck:
+	@test -n '$(distuninstallcheck_dir)' || { \
+	  echo 'ERROR: trying to run $@ with an empty' \
+	       '$$(distuninstallcheck_dir)' >&2; \
+	  exit 1; \
+	}; \
+	$(am__cd) '$(distuninstallcheck_dir)' || { \
+	  echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \
+	  exit 1; \
+	}; \
+	test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \
+	   || { echo "ERROR: files left after uninstall:" ; \
+	        if test -n "$(DESTDIR)"; then \
+	          echo "  (check DESTDIR support)"; \
+	        fi ; \
+	        $(distuninstallcheck_listfiles) ; \
+	        exit 1; } >&2
+distcleancheck: distclean
+	@if test '$(srcdir)' = . ; then \
+	  echo "ERROR: distcleancheck can only run from a VPATH build" ; \
+	  exit 1 ; \
+	fi
+	@test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
+	  || { echo "ERROR: files left in build directory after distclean:" ; \
+	       $(distcleancheck_listfiles) ; \
+	       exit 1; } >&2
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES) $(DATA) config.h
+installdirs: installdirs-recursive
+installdirs-am:
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(pkgconfigdir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-hdr distclean-libtool distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am: install-pkgconfigDATA
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
+	-rm -rf $(top_srcdir)/autom4te.cache
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am: uninstall-libLTLIBRARIES uninstall-pkgconfigDATA
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) all \
+	ctags-recursive install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am am--refresh check check-am clean clean-generic \
+	clean-libLTLIBRARIES clean-libtool ctags ctags-recursive dist \
+	dist-all dist-bzip2 dist-gzip dist-lzip dist-lzma dist-shar \
+	dist-tarZ dist-xz dist-zip distcheck distclean \
+	distclean-compile distclean-generic distclean-hdr \
+	distclean-libtool distclean-tags distcleancheck distdir \
+	distuninstallcheck dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am \
+	install-libLTLIBRARIES install-man install-pdf install-pdf-am \
+	install-pkgconfigDATA install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs installdirs-am \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \
+	uninstall-libLTLIBRARIES uninstall-pkgconfigDATA
+
+
+fftw3@PREC_SUFFIX@.pc: fftw.pc
+	cp -f fftw.pc fftw3@PREC_SUFFIX@.pc
+
+wisdom:
+	tools/fftw@PREC_SUFFIX@-wisdom -o $@ $(WISDOM_FLAGS)
+
+install-wisdom: wisdom
+	$(mkinstalldirs) $(WISDOM_DIR)
+	$(INSTALL_DATA) wisdom $(WISDOM_DIR)/$(WISDOM)
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/NEWS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/NEWS	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,525 @@
+FFTW 3.3.3
+
+* Fix deadlock bug in MPI transforms (thanks to Michael Pippig for the
+  bug report and patch, and to Graham Dennis for the bug report).
+
+* Use 128-bit ARM NEON instructions instead of 64-bits.  This change
+  appears to speed up even ARM processors with a 64-bit NEON pipe.
+
+* Speed improvements for single-precision AVX.
+
+* Speed up planner on machines without "official" cycle counters, such as ARM.
+
+FFTW 3.3.2
+
+* Removed an archaic stack-alignment hack that was failing with
+  gcc-4.7/i386.
+
+* Added stack-alignment hack necessary for gcc on Windows/i386.  We
+  will regret this in ten years (see previous change).
+  
+* Fix incompatibility with Intel icc which pretends to be gcc
+  but does not support quad precision.
+
+* make libfftw{threads,mpi} depend upon libfftw when using libtool;
+  this is consistent with most other libraries and simplifies the life
+  of various distributors of GNU/Linux.
+
+FFTW 3.3.1
+
+* Changes since 3.3.1-beta1:
+  
+  - Reduced planning time in estimate mode for sizes with large
+    prime factors.
+
+  - Added AVX autodetection under Visual Studio.  Thanks Carsten
+    Steger for submitting the necessary code.
+
+  - Modern Fortran interface now uses a separate fftw3l.f03 interface
+    file for the long double interface, which is not supported by
+    some Fortran compilers.  Provided new fftw3q.f03 interface file
+    to access the quadruple-precision FFTW routines with recent
+    versions of gcc/gfortran.
+
+* Added support for the NEON extensions to the ARM ISA.  (Note to beta
+  users: an ARM cycle counter is not yet implemented; please contact
+  fftw@fftw.org if you know how to do it right.)
+
+* MPI code now compiles even if mpicc is a C++ compiler; thanks to
+  Kyle Spyksma for the bug report.
+
+FFTW 3.3
+
+* Changes since 3.3-beta1:
+
+  - Compiling OpenMP support (--enable-openmp) now installs a
+    fftw3_omp library, instead of fftw3_threads, so that OpenMP
+    and POSIX threads (--enable-threads) libraries can be built
+    and installed at the same time.
+
+  - Various minor compilation fixes, corrections of manual typos, and
+    improvements to the benchmark test program.
+
+* Add support for the AVX extensions to x86 and x86-64.  The AVX code
+  works with 16-byte alignment (as opposed to 32-byte alignment),
+  so there is no ABI change compared to FFTW 3.2.2.
+
+* Added Fortran 2003 interface, which should be usable on most modern
+  Fortran compilers (e.g. gfortran) and provides type-checked access
+  to the the C FFTW interface.  (The legacy Fortran-77 interface is
+  still included also.)
+
+* Added MPI distributed-memory transforms.  Compared to 3.3alpha,
+  the major changes in the MPI transforms are:
+    - Fixed some deadlock and crashing bugs.
+    - Added Fortran 2003 interface.
+    - Added new-array execute functions for MPI plans.
+    - Eliminated use of large MPI tags, since Cray MPI requires tags < 2^24;
+      thanks to Jonathan Bentz for the bug report.
+    - Expanded documentation.
+    - 'make check' now runs MPI tests
+    - Some ABI changes - not binary-compatible with 3.3alpha MPI.
+
+* Add support for quad-precision __float128 in gcc 4.6 or later (on x86.
+  x86-64, and Itanium).  The new routines use the fftwq_ prefix.
+
+* Removed support for MIPS paired-single instructions due to lack of
+  available hardware for testing.  Users who want this functionality
+  should continue using FFTW 3.2.x.  (Note that FFTW 3.3 still works
+  on MIPS; this only concerns special instructions available on some
+  MIPS chips.)
+
+* Removed support for the Cell Broadband Engine.  Cell users should
+  use FFTW 3.2.x.
+
+* New convenience functions fftw_alloc_real and fftw_alloc_complex
+  to use fftw_malloc for real and complex arrays without typecasts
+  or sizeof.
+
+* New convenience functions fftw_export_wisdom_to_filename and
+  fftw_import_wisdom_from_filename that export/import wisdom
+  to a file, which don't require you to open/close the file yourself.
+
+* New function fftw_cost to return FFTW's internal cost metric for 
+  a given plan; thanks to Rhys Ulerich and Nathanael Schaeffer for the
+  suggestion.
+
+* The --enable-sse2 configure flag now works in both double and single
+  precision (and is equivalent to --enable-sse in the latter case).
+
+* Remove --enable-portable-binary flag: we new produce portable binaries
+  by default.
+
+* Remove the automatic detection of native architecture flag for gcc
+  which was introduced in fftw-3.1, since new gcc supports -mtune=native.
+  Remove the --with-gcc-arch flag; if you want to specify a particlar
+  arch to configure, use ./configure CC="gcc -mtune=...".
+
+* --with-our-malloc16 configure flag is now renamed --with-our-malloc.
+
+* Fixed build problem failure when srand48 declaration is missing;
+  thanks to Ralf Wildenhues for the bug report.
+
+* Fixed bug in fftw_set_timelimit: ensure that a negative timelimit
+  is equivalent to no timelimit in all cases.  Thanks to William Andrew
+  Burnson for the bug report.
+
+* Fixed stack-overflow problem on OpenBSD caused by using alloca with
+  too large a buffer.
+
+FFTW 3.2.2
+
+* Improve performance of some copy operations of complex arrays on
+  x86 machines.
+
+* Add configure flag to disable alloca(), which is broken in mingw64.
+
+* Planning in FFTW_ESTIMATE mode for r2r transforms became slower
+  between fftw-3.1.3 and 3.2.  This regression has now been fixed.
+
+FFTW 3.2.1
+
+* Performance improvements for some multidimensional r2c/c2r transforms;
+  thanks to Eugene Miloslavsky for his benchmark reports.
+
+* Compile with icc on MacOS X, use better icc compiler flags.
+
+* Compilation fixes for systems where snprintf is defined as a macro;
+  thanks to Marcus Mae for the bug report.
+
+* Fortran documentation now recommends not using dfftw_execute,
+  because of reports of problems with various Fortran compilers;
+  it is better to use dfftw_execute_dft etcetera.
+
+* Some documentation clarifications, e.g. of fact that --enable-openmp
+  and --enable-threads are mutually exclusive (thanks to Long To),
+  and document slightly odd behavior of plan_guru_r2r in Fortran
+  (thanks to Alexander Pozdneev).
+
+* FAQ was accidentally omitted from 3.2 tarball.
+
+* Remove some extraneous (harmless) files accidentally included in 
+  a subdirectory of the 3.2 tarball.
+
+FFTW 3.2
+
+* Worked around apparent glibc bug that leads to rare hangs when freeing
+  semaphores.
+
+* Fixed segfault due to unaligned access in certain obscure problems
+  that use SSE and multiple threads.
+
+* MPI transforms not included, as they are still in alpha; the alpha
+  versions of the MPI transforms have been moved to FFTW 3.3alpha1.
+
+FFTW 3.2alpha3
+
+* Performance improvements for sizes with factors of 5 and 10.
+
+* Documented FFTW_WISDOM_ONLY flag, at the suggestion of Mario
+  Emmenlauer and Phil Dumont.
+
+* Port Cell code to SDK2.1 (libspe2), as opposed to the old libspe1 code.
+
+* Performance improvements in Cell code for N < 32k, thanks to Jan Wagner
+  for the suggestions.
+
+* Cycle counter for Sun x86_64 compiler, and compilation fix in cycle
+  counter for AIX/xlc (thanks to Jeff Haferman for the bug report).
+
+* Fixed incorrect type prefix in MPI code that prevented wisdom routines
+  from working in single precision (thanks to Eric A. Borisch for the report).
+
+* Added 'make check' for MPI code (which still fails in a couple corner
+  cases, but should be much better than in alpha2).
+
+* Many other small fixes.
+
+FFTW 3.2alpha2
+
+* Support for the Cell processor, donated by IBM Research; see README.Cell
+  and the Cell section of the manual.
+
+* New 64-bit API: for every "plan_guru" function there is a new "plan_guru64"
+  function with the same semantics, but which takes fftw_iodim64 instead of
+  fftw_iodim.  fftw_iodim64 is the same as fftw_iodim, except that it takes
+  ptrdiff_t integer types as parameters, which is a 64-bit type on
+  64-bit machines.  This is only useful for specifying very large transforms
+  on 64-bit machines.  (Internally, FFTW uses ptrdiff_t everywhere
+  regardless of what API you choose.)
+
+* Experimental MPI support.  Complex one- and multi-dimensional FFTs,
+  multi-dimensional r2r, multi-dimensional r2c/c2r transforms, and
+  distributed transpose operations, with 1d block distributions.
+  (This is an alpha preview: routines have not been exhaustively
+  tested, documentation is incomplete, and some functionality is
+  missing, e.g. Fortran support.)  See mpi/README and also the MPI
+  section of the manual.
+
+* Significantly faster r2c/c2r transforms, especially on machines with SIMD.
+
+* Rewritten multi-threaded support for better performance by
+  re-using a fixed pool of threads rather than continually
+  respawning and joining (which nowadays is much slower).
+
+* Support for MIPS paired-single SIMD instructions, donated by
+  Codesourcery.
+
+* FFTW_WISDOM_ONLY planner flag, to create plan only if wisdom is
+  available and return NULL otherwise.
+
+* Removed k7 support, which only worked in 32-bit mode and is
+  becoming obsolete.  Use --enable-sse instead.
+
+* Added --with-g77-wrappers configure option to force inclusion
+  of g77 wrappers, in addition to whatever is needed for the
+  detected Fortran compilers.  This is mainly intended for GNU/Linux
+  distros switching to gfortran that wish to include both
+  gfortran and g77 support in FFTW.
+
+* In manual, renamed "guru execute" functions to "new-array execute"
+  functions, to reduce confusion with the guru planner interface.
+  (The programming interface is unchanged.)
+
+* Add missing __declspec attribute to threads API functions when compiling
+  for Windows; thanks to Robert O. Morris for the bug report.
+
+* Fixed missing return value from dfftw_init_threads in Fortran;
+  thanks to Markus Wetzstein for the bug report.
+
+FFTW 3.1.1
+
+* Performance improvements for Intel EMT64.
+
+* Performance improvements for large-size transforms with SIMD.
+
+* Cycle counter support for Intel icc and Visual C++ on x86-64.
+
+* In fftw-wisdom tool, replaced obsolete --impatient with --measure.
+
+* Fixed compilation failure with AIX/xlc; thanks to Joseph Thomas.
+
+* Windows DLL support for Fortran API (added missing __declspec(dllexport)).
+
+* SSE/SSE2 code works properly (i.e. disables itself) on older 386 and 486
+  CPUs lacking a CPUID instruction; thanks to Eric Korpela.
+
+FFTW 3.1
+
+* Faster FFTW_ESTIMATE planner.
+
+* New (faster) algorithm for REDFT00/RODFT00 (type-I DCT/DST) of odd size.
+
+* "4-step" algorithm for faster FFTs of very large sizes (> 2^18).
+
+* Faster in-place real-data DFTs (for R2HC and HC2R r2r formats).
+
+* Faster in-place non-square transpositions (FFTW uses these internally
+  for in-place FFTs, and you can also perform them explicitly using
+  the guru interface).
+
+* Faster prime-size DFTs: implemented Bluestein's algorithm, as well
+  as a zero-padded Rader variant to limit recursive use of Rader's algorithm.
+
+* SIMD support for split complex arrays.
+
+* Much faster Altivec/VMX performance.
+
+* New fftw_set_timelimit function to specify a (rough) upper bound to the
+  planning time (does not affect ESTIMATE mode).
+
+* Removed --enable-3dnow support; use --enable-k7 instead.
+
+* FMA (fused multiply-add) version is now included in "standard" FFTW,
+  and is enabled with --enable-fma (the default on PowerPC and Itanium).
+
+* Automatic detection of native architecture flag for gcc.  New
+  configure options: --enable-portable-binary and --with-gcc-arch=<arch>,
+  for people distributing compiled binaries of FFTW (see manual).
+
+* Automatic detection of Altivec under Linux with gcc 3.4 (so that
+  same binary should work on both Altivec and non-Altivec PowerPCs).
+
+* Compiler-specific tweaks/flags/workarounds for gcc 3.4, xlc, HP/UX,
+  Solaris/Intel.
+
+* Various documentation clarifications.
+
+* 64-bit clean.  (Fixes a bug affecting the split guru planner on 
+  64-bit machines, reported by David Necas.)
+
+* Fixed Debian bug #259612: inadvertent use of SSE instructions on
+  non-SSE machines (causing a crash) for --enable-sse binaries.
+
+* Fixed bug that caused HC2R transforms to destroy the input in
+  certain cases, even if the user specified FFTW_PRESERVE_INPUT.
+
+* Fixed bug where wisdom would be lost under rare circumstances,
+  causing excessive planning time.
+
+* FAQ notes bug in gcc-3.4.[1-3] that causes FFTW to crash with SSE/SSE2.
+
+* Fixed accidentally exported symbol that prohibited simultaneous
+  linking to double/single multithreaded FFTW (thanks to Alessio Massaro).
+
+* Support Win32 threads under MinGW (thanks to Alessio Massaro).
+
+* Fixed problem with building DLL under Cygwin; thanks to Stephane Fillod.
+
+* Fix build failure if no Fortran compiler is found (thanks to Charles
+  Radley for the bug report).
+
+* Fixed compilation failure with icc 8.0 and SSE/SSE2.  Automatic
+  detection of icc architecture flag (e.g. -xW).
+
+* Fixed compilation with OpenMP on AIX (thanks to Greg Bauer).
+
+* Fixed compilation failure on x86-64 with gcc (thanks to Orion Poplawski).
+
+* Incorporated patch from FreeBSD ports (FreeBSD does not have memalign,
+  but its malloc is 16-byte aligned).
+
+* Cycle-counter compilation fixes for Itanium, Alpha, x86-64, Sparc,
+  MacOS (thanks to Matt Boman, John Bowman, and James A. Treacy for
+  reports/fixes).  Added x86-64 cycle counter for PGI compilers,
+  courtesy Cristiano Calonaci.
+
+* Fix compilation problem in test program due to C99 conflict.
+
+* Portability fix for import_system_wisdom with djgpp (thanks to Juan
+  Manuel Guerrero).
+
+* Fixed compilation failure on MacOS 10.3 due to getopt conflict.
+
+* Work around Visual C++ (version 6/7) bug in SSE compilation;
+  thanks to Eddie Yee for his detailed report.
+
+Changes from FFTW 3.1 beta 2:
+
+* Several minor compilation fixes.
+
+* Eliminate FFTW_TIMELIMIT flag and replace fftw_timelimit global with
+  fftw_set_timelimit function.  Make wisdom work with time-limited plans.
+
+Changes from FFTW 3.1 beta 1:
+
+* Fixes for creating DLLs under Windows; thanks to John Pavel for his feedback.
+
+* Fixed more 64-bit problems, thanks to John Pavel for the bug report.
+
+* Further speed improvements for Altivec/VMX.
+
+* Further speed improvements for non-square transpositions.
+
+* Many minor tweaks.
+
+FFTW 3.0.1
+
+* Some speed improvements in SIMD code.
+
+* --without-cycle-counter option is removed.  If no cycle counter is found,
+  then the estimator is always used.  A --with-slow-timer option is provided
+  to force the use of lower-resolution timers.
+
+* Several fixes for compilation under Visual C++, with help from Stefane Ruel.
+
+* Added x86 cycle counter for Visual C++, with help from Morten Nissov.
+
+* Added S390 cycle counter, courtesy of James Treacy.
+
+* Added missing static keyword that prevented simultaneous linkage
+  of different-precision versions; thanks to Rasmus Larsen for the bug report.
+
+* Corrected accidental omission of f77_wisdom.f file; thanks to Alan Watson.
+
+* Support -xopenmp flag for SunOS; thanks to John Lou for the bug report.
+
+* Compilation with HP/UX cc requires -Wp,-H128000 flag to increase
+  preprocessor limits; thanks to Peter Vouras for the bug report.
+
+* Removed non-portable use of 'tempfile' in fftw-wisdom-to-conf script;
+  thanks to Nicolas Decoster for the patch.
+
+* Added 'make smallcheck' target in tests/ directory, at the request of
+  James Treacy.
+
+FFTW 3.0
+
+Major goals of this release:
+
+* Speed: often 20% or more faster than FFTW 2.x, even without SIMD (see below).
+
+* Complete rewrite, to make it easier to add new algorithms and transforms.
+
+* New API, to support more general semantics.
+
+Other enhancements:
+
+* SIMD acceleration on supporting CPUs (SSE, SSE2, 3DNow!, and AltiVec).
+ (With special thanks to Franz Franchetti for many experimental prototypes
+  and to Stefan Kral for the vectorizing generator from fftwgel.)
+
+* True in-place 1d transforms of large sizes (as well as compressed
+  twiddle tables for additional memory/cache savings).
+
+* More arbitrary placement of real & imaginary data, e.g. including
+  interleaved (as in FFTW 2.x) as well as separate real/imag arrays.
+
+* Efficient prime-size transforms of real data.
+
+* Multidimensional transforms can operate on a subset of a larger matrix,
+  and/or transform selected dimensions of a multidimensional array.
+
+* By popular demand, simultaneous linking to double precision (fftw),
+  single precision (fftwf), and long-double precision (fftwl) versions
+  of FFTW is now supported.
+
+* Cycle counters (on all modern CPUs) are exploited to speed planning.
+
+* Efficient transforms of real even/odd arrays, a.k.a. discrete
+  cosine/sine transforms (types I-IV).  (Currently work via pre/post
+  processing of real transforms, ala FFTPACK, so are not optimal.)
+
+* DHTs (Discrete Hartley Transforms), again via post-processing
+  of real transforms (and thus suboptimal, for now).
+
+* Support for linking to just those parts of FFTW that you need,
+  greatly reducing the size of statically linked programs when
+  only a limited set of transform sizes/types are required.
+
+* Canonical global wisdom file (/etc/fftw/wisdom) on Unix, along
+  with a command-line tool (fftw-wisdom) to generate/update it.
+
+* Fortran API can be used with both g77 and non-g77 compilers
+  simultaneously.
+
+* Multi-threaded version has optional OpenMP support.
+
+* Authors' good looks have greatly improved with age.
+
+Changes from 3.0beta3:
+
+* Separate FMA distribution to better exploit fused multiply-add instructions
+  on PowerPC (and possibly other) architectures.
+
+* Performance improvements via some inlining tweaks.
+
+* fftw_flops now returns double arguments, not int, to avoid overflows
+  for large sizes.
+
+* Workarounds for automake bugs.
+
+Changes from 3.0beta2:
+
+* The standard REDFT00/RODFT00 (DCT-I/DST-I) algorithm (used in
+  FFTPACK, NR, etcetera) turns out to have poor numerical accuracy, so
+  we replaced it with a slower routine that is more accurate.
+
+* The guru planner and execute functions now have two variants, one that
+  takes complex arguments and one that takes separate real/imag pointers.
+
+* Execute and planner routines now automatically align the stack on x86,
+  in case the calling program is misaligned.
+
+* README file for test program.
+
+* Fixed bugs in the combination of SIMD with multi-threaded transforms.
+
+* Eliminated internal fftw_threads_init function, which some people were
+  calling accidentally instead of the fftw_init_threads API function.
+
+* Check for -openmp flag (Intel C compiler) when --enable-openmp is used.
+
+* Support AMD x86-64 SIMD and cycle counter.
+
+* Support SSE2 intrinsics in forthcoming gcc 3.3.
+
+Changes from 3.0beta1:
+
+* Faster in-place 1d transforms of non-power-of-two sizes.
+
+* SIMD improvements for in-place, multi-dimensional, and/or non-FFTW_PATIENT
+  transforms.
+
+* Added support for hard-coded DCT/DST/DHT codelets of small sizes; the
+  default distribution only includes hard-coded size-8 DCT-II/III, however.
+
+* Many minor improvements to the manual.  Added section on using the
+  codelet generator to customize and enhance FFTW.
+
+* The default 'make check' should now only take a few minutes; for more
+  strenuous tests (which may take a day or so), do 'cd tests; make bigcheck'.
+
+* fftw_print_plan is split into fftw_fprint_plan and fftw_print_plan, where
+  the latter uses stdout.
+
+* Fixed ability to compile with a C++ compiler.
+
+* Fixed support for C99 complex type under glibc.
+
+* Fixed problems with alloca under MinGW, AIX.
+
+* Workaround for gcc/SPARC bug.
+
+* Fixed multi-threaded initialization failure on IRIX due to lack of
+  user-accessible PTHREAD_SCOPE_SYSTEM there.
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/README
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/README	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+FFTW is a free collection of fast C routines for computing the
+Discrete Fourier Transform in one or more dimensions.  It includes
+complex, real, symmetric, and parallel transforms, and can handle
+arbitrary array sizes efficiently.  FFTW is typically faster than
+other publically-available FFT implementations, and is even
+competitive with vendor-tuned libraries.  (See our web page for
+extensive benchmarks.)  To achieve this performance, FFTW uses novel
+code-generation and runtime self-optimization techniques (along with
+many other tricks).
+
+The doc/ directory contains the manual in texinfo, PDF, info, and HTML
+formats.  Frequently asked questions and answers can be found in the
+doc/FAQ/ directory in ASCII and HTML.
+
+For a quick introduction to calling FFTW, see the "Tutorial" section
+of the manual.
+
+Installation instructions are provided in the manual (don't worry, it
+is straightforward).
+
+CONTACTS
+--------
+
+FFTW was written by Matteo Frigo and Steven G. Johnson.  You can
+contact them at fftw@fftw.org.  The latest version of FFTW,
+benchmarks, links, and other information can be found at the FFTW home
+page (http://www.fftw.org).  You can also sign up to the fftw-announce
+mailing list to receive (infrequent) updates and information about new
+releases; to do so, go to:
+
+	http://www.fftw.org/mailman/listinfo/fftw-announce
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/TODO
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/TODO	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+TODO before FFTW-$2\pi$:
+
+* Wisdom: make it clear that it is specific to the exact fftw version
+  and configuration.  Report error codes when reading wisdom.  Maybe
+  have multiple system wisdom files, one per version?
+
+* DCT/DST codelets?  which kinds?
+
+* investigate the addition-chain trig computation
+
+* I can't believe that there isn't a closed form for the omega
+  array in Rader.
+
+* convolution problem type(s)
+
+* Explore the idea of having n < 0 in tensors, possibly to mean
+  inverse DFT.
+
+* better estimator: possibly, let "other" cost be coef * n, where
+  coef is a per-solver constant determined via some big numerical
+  optimization/fit.
+
+* vector radix, multidimensional codelets
+
+* it may be a good idea to unify all those little loops that do
+  copying, (X[i], X[n-i]) <- (X[i] + X[n-i], X[i] - X[n-i]),
+  and multiplication of vectors by twiddle factors.
+
+* Pruned FFTs (basically, a vecloop that skips zeros).
+
+* Try FFTPACK-style back-and-forth (Stockham) FFT.  (We tried this a
+  few years ago and it was slower, but perhaps matters have changed.)
+
+* Generate assembly directly for more processors, or maybe fork gcc.  =)
+
+* ensure that threaded solvers generate (block_size % 4 == 0)
+  to allow SIMD to be used.
+
+* memoize triggen.
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/aclocal.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/aclocal.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1071 @@
+# generated automatically by aclocal 1.11.6 -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+# 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
+# Inc.
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+m4_ifndef([AC_AUTOCONF_VERSION],
+  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
+m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
+[m4_warning([this file was generated for autoconf 2.69.
+You have another version of autoconf.  It may work, but is not guaranteed to.
+If you have problems, you may need to regenerate the build system entirely.
+To do so, use the procedure documented by the package, typically `autoreconf'.])])
+
+# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008, 2011 Free Software
+# Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 1
+
+# AM_AUTOMAKE_VERSION(VERSION)
+# ----------------------------
+# Automake X.Y traces this macro to ensure aclocal.m4 has been
+# generated from the m4 files accompanying Automake X.Y.
+# (This private macro should not be called outside this file.)
+AC_DEFUN([AM_AUTOMAKE_VERSION],
+[am__api_version='1.11'
+dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
+dnl require some minimum version.  Point them to the right macro.
+m4_if([$1], [1.11.6], [],
+      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
+])
+
+# _AM_AUTOCONF_VERSION(VERSION)
+# -----------------------------
+# aclocal traces this macro to find the Autoconf version.
+# This is a private macro too.  Using m4_define simplifies
+# the logic in aclocal, which can simply ignore this definition.
+m4_define([_AM_AUTOCONF_VERSION], [])
+
+# AM_SET_CURRENT_AUTOMAKE_VERSION
+# -------------------------------
+# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
+# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
+AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
+[AM_AUTOMAKE_VERSION([1.11.6])dnl
+m4_ifndef([AC_AUTOCONF_VERSION],
+  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
+_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
+
+# AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
+
+# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 1
+
+# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
+# $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
+# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
+#
+# Of course, Automake must honor this variable whenever it calls a
+# tool from the auxiliary directory.  The problem is that $srcdir (and
+# therefore $ac_aux_dir as well) can be either absolute or relative,
+# depending on how configure is run.  This is pretty annoying, since
+# it makes $ac_aux_dir quite unusable in subdirectories: in the top
+# source directory, any form will work fine, but in subdirectories a
+# relative path needs to be adjusted first.
+#
+# $ac_aux_dir/missing
+#    fails when called from a subdirectory if $ac_aux_dir is relative
+# $top_srcdir/$ac_aux_dir/missing
+#    fails if $ac_aux_dir is absolute,
+#    fails when called from a subdirectory in a VPATH build with
+#          a relative $ac_aux_dir
+#
+# The reason of the latter failure is that $top_srcdir and $ac_aux_dir
+# are both prefixed by $srcdir.  In an in-source build this is usually
+# harmless because $srcdir is `.', but things will broke when you
+# start a VPATH build or use an absolute $srcdir.
+#
+# So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
+# iff we strip the leading $srcdir from $ac_aux_dir.  That would be:
+#   am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"`
+# and then we would define $MISSING as
+#   MISSING="\${SHELL} $am_aux_dir/missing"
+# This will work as long as MISSING is not called from configure, because
+# unfortunately $(top_srcdir) has no meaning in configure.
+# However there are other variables, like CC, which are often used in
+# configure, and could therefore not use this "fixed" $ac_aux_dir.
+#
+# Another solution, used here, is to always expand $ac_aux_dir to an
+# absolute PATH.  The drawback is that using absolute paths prevent a
+# configured tree to be moved without reconfiguration.
+
+AC_DEFUN([AM_AUX_DIR_EXPAND],
+[dnl Rely on autoconf to set up CDPATH properly.
+AC_PREREQ([2.50])dnl
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
+])
+
+# AM_CONDITIONAL                                            -*- Autoconf -*-
+
+# Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005, 2006, 2008
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 9
+
+# AM_CONDITIONAL(NAME, SHELL-CONDITION)
+# -------------------------------------
+# Define a conditional.
+AC_DEFUN([AM_CONDITIONAL],
+[AC_PREREQ(2.52)dnl
+ ifelse([$1], [TRUE],  [AC_FATAL([$0: invalid condition: $1])],
+	[$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
+AC_SUBST([$1_TRUE])dnl
+AC_SUBST([$1_FALSE])dnl
+_AM_SUBST_NOTMAKE([$1_TRUE])dnl
+_AM_SUBST_NOTMAKE([$1_FALSE])dnl
+m4_define([_AM_COND_VALUE_$1], [$2])dnl
+if $2; then
+  $1_TRUE=
+  $1_FALSE='#'
+else
+  $1_TRUE='#'
+  $1_FALSE=
+fi
+AC_CONFIG_COMMANDS_PRE(
+[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then
+  AC_MSG_ERROR([[conditional "$1" was never defined.
+Usually this means the macro was only invoked conditionally.]])
+fi])])
+
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009,
+# 2010, 2011 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 12
+
+# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
+# written in clear, in which case automake, when reading aclocal.m4,
+# will think it sees a *use*, and therefore will trigger all it's
+# C support machinery.  Also note that it means that autoscan, seeing
+# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
+
+
+# _AM_DEPENDENCIES(NAME)
+# ----------------------
+# See how the compiler implements dependency checking.
+# NAME is "CC", "CXX", "GCJ", or "OBJC".
+# We try a few techniques and use that to set a single cache variable.
+#
+# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was
+# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular
+# dependency, and given that the user is not expected to run this macro,
+# just rely on AC_PROG_CC.
+AC_DEFUN([_AM_DEPENDENCIES],
+[AC_REQUIRE([AM_SET_DEPDIR])dnl
+AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
+AC_REQUIRE([AM_MAKE_INCLUDE])dnl
+AC_REQUIRE([AM_DEP_TRACK])dnl
+
+ifelse([$1], CC,   [depcc="$CC"   am_compiler_list=],
+       [$1], CXX,  [depcc="$CXX"  am_compiler_list=],
+       [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
+       [$1], UPC,  [depcc="$UPC"  am_compiler_list=],
+       [$1], GCJ,  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
+                   [depcc="$$1"   am_compiler_list=])
+
+AC_CACHE_CHECK([dependency style of $depcc],
+               [am_cv_$1_dependencies_compiler_type],
+[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_$1_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp`
+  fi
+  am__universal=false
+  m4_case([$1], [CC],
+    [case " $depcc " in #(
+     *\ -arch\ *\ -arch\ *) am__universal=true ;;
+     esac],
+    [CXX],
+    [case " $depcc " in #(
+     *\ -arch\ *\ -arch\ *) am__universal=true ;;
+     esac])
+
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok `-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_$1_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_$1_dependencies_compiler_type=none
+fi
+])
+AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type])
+AM_CONDITIONAL([am__fastdep$1], [
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_$1_dependencies_compiler_type" = gcc3])
+])
+
+
+# AM_SET_DEPDIR
+# -------------
+# Choose a directory name for dependency files.
+# This macro is AC_REQUIREd in _AM_DEPENDENCIES
+AC_DEFUN([AM_SET_DEPDIR],
+[AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl
+])
+
+
+# AM_DEP_TRACK
+# ------------
+AC_DEFUN([AM_DEP_TRACK],
+[AC_ARG_ENABLE(dependency-tracking,
+[  --disable-dependency-tracking  speeds up one-time build
+  --enable-dependency-tracking   do not reject slow dependency extractors])
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+  am__nodep='_no'
+fi
+AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
+AC_SUBST([AMDEPBACKSLASH])dnl
+_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl
+AC_SUBST([am__nodep])dnl
+_AM_SUBST_NOTMAKE([am__nodep])dnl
+])
+
+# Generate code to set up dependency tracking.              -*- Autoconf -*-
+
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+#serial 5
+
+# _AM_OUTPUT_DEPENDENCY_COMMANDS
+# ------------------------------
+AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
+[{
+  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # are listed without --file.  Let's play safe and only enable the eval
+  # if we detect the quoting.
+  case $CONFIG_FILES in
+  *\'*) eval set x "$CONFIG_FILES" ;;
+  *)   set x $CONFIG_FILES ;;
+  esac
+  shift
+  for mf
+  do
+    # Strip MF so we end up with the name of the file.
+    mf=`echo "$mf" | sed -e 's/:.*$//'`
+    # Check whether this is an Automake generated Makefile or not.
+    # We used to match only the files named `Makefile.in', but
+    # some people rename them; so instead we look at the file content.
+    # Grep'ing the first line is not enough: some people post-process
+    # each Makefile.in and add a new line on top of each file to say so.
+    # Grep'ing the whole file is not good either: AIX grep has a line
+    # limit of 2048, but all sed's we know have understand at least 4000.
+    if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then
+      dirpart=`AS_DIRNAME("$mf")`
+    else
+      continue
+    fi
+    # Extract the definition of DEPDIR, am__include, and am__quote
+    # from the Makefile without running `make'.
+    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+    test -z "$DEPDIR" && continue
+    am__include=`sed -n 's/^am__include = //p' < "$mf"`
+    test -z "am__include" && continue
+    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+    # When using ansi2knr, U may be empty or an underscore; expand it
+    U=`sed -n 's/^U = //p' < "$mf"`
+    # Find all dependency output files, they are included files with
+    # $(DEPDIR) in their names.  We invoke sed twice because it is the
+    # simplest approach to changing $(DEPDIR) to its actual value in the
+    # expansion.
+    for file in `sed -n "
+      s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+      # Make sure the directory exists.
+      test -f "$dirpart/$file" && continue
+      fdir=`AS_DIRNAME(["$file"])`
+      AS_MKDIR_P([$dirpart/$fdir])
+      # echo "creating $dirpart/$file"
+      echo '# dummy' > "$dirpart/$file"
+    done
+  done
+}
+])# _AM_OUTPUT_DEPENDENCY_COMMANDS
+
+
+# AM_OUTPUT_DEPENDENCY_COMMANDS
+# -----------------------------
+# This macro should only be invoked once -- use via AC_REQUIRE.
+#
+# This code is only required when automatic dependency tracking
+# is enabled.  FIXME.  This creates each `.P' file that we will
+# need in order to bootstrap the dependency handling code.
+AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
+[AC_CONFIG_COMMANDS([depfiles],
+     [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS],
+     [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
+])
+
+# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 8
+
+# AM_CONFIG_HEADER is obsolete.  It has been replaced by AC_CONFIG_HEADERS.
+AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)])
+
+# Do all the work for Automake.                             -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+# 2005, 2006, 2008, 2009 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 16
+
+# This macro actually does too much.  Some checks are only needed if
+# your package does certain things.  But this isn't really a big deal.
+
+# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
+# AM_INIT_AUTOMAKE([OPTIONS])
+# -----------------------------------------------
+# The call with PACKAGE and VERSION arguments is the old style
+# call (pre autoconf-2.50), which is being phased out.  PACKAGE
+# and VERSION should now be passed to AC_INIT and removed from
+# the call to AM_INIT_AUTOMAKE.
+# We support both call styles for the transition.  After
+# the next Automake release, Autoconf can make the AC_INIT
+# arguments mandatory, and then we can depend on a new Autoconf
+# release and drop the old call support.
+AC_DEFUN([AM_INIT_AUTOMAKE],
+[AC_PREREQ([2.62])dnl
+dnl Autoconf wants to disallow AM_ names.  We explicitly allow
+dnl the ones we care about.
+m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
+AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl
+AC_REQUIRE([AC_PROG_INSTALL])dnl
+if test "`cd $srcdir && pwd`" != "`pwd`"; then
+  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
+  # is not polluted with repeated "-I."
+  AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl
+  # test to see if srcdir already configured
+  if test -f $srcdir/config.status; then
+    AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+  fi
+fi
+
+# test whether we have cygpath
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
+  else
+    CYGPATH_W=echo
+  fi
+fi
+AC_SUBST([CYGPATH_W])
+
+# Define the identity of the package.
+dnl Distinguish between old-style and new-style calls.
+m4_ifval([$2],
+[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
+ AC_SUBST([PACKAGE], [$1])dnl
+ AC_SUBST([VERSION], [$2])],
+[_AM_SET_OPTIONS([$1])dnl
+dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
+m4_if(m4_ifdef([AC_PACKAGE_NAME], 1)m4_ifdef([AC_PACKAGE_VERSION], 1), 11,,
+  [m4_fatal([AC_INIT should be called with package and version arguments])])dnl
+ AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
+ AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl
+
+_AM_IF_OPTION([no-define],,
+[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+ AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl
+
+# Some tools Automake needs.
+AC_REQUIRE([AM_SANITY_CHECK])dnl
+AC_REQUIRE([AC_ARG_PROGRAM])dnl
+AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
+AM_MISSING_PROG(AUTOCONF, autoconf)
+AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
+AM_MISSING_PROG(AUTOHEADER, autoheader)
+AM_MISSING_PROG(MAKEINFO, makeinfo)
+AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
+AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl
+AC_REQUIRE([AM_PROG_MKDIR_P])dnl
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+AC_REQUIRE([AC_PROG_AWK])dnl
+AC_REQUIRE([AC_PROG_MAKE_SET])dnl
+AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])],
+	      [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])],
+			     [_AM_PROG_TAR([v7])])])
+_AM_IF_OPTION([no-dependencies],,
+[AC_PROVIDE_IFELSE([AC_PROG_CC],
+		  [_AM_DEPENDENCIES(CC)],
+		  [define([AC_PROG_CC],
+			  defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_CXX],
+		  [_AM_DEPENDENCIES(CXX)],
+		  [define([AC_PROG_CXX],
+			  defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_OBJC],
+		  [_AM_DEPENDENCIES(OBJC)],
+		  [define([AC_PROG_OBJC],
+			  defn([AC_PROG_OBJC])[_AM_DEPENDENCIES(OBJC)])])dnl
+])
+_AM_IF_OPTION([silent-rules], [AC_REQUIRE([AM_SILENT_RULES])])dnl
+dnl The `parallel-tests' driver may need to know about EXEEXT, so add the
+dnl `am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This macro
+dnl is hooked onto _AC_COMPILER_EXEEXT early, see below.
+AC_CONFIG_COMMANDS_PRE(dnl
+[m4_provide_if([_AM_COMPILER_EXEEXT],
+  [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl
+])
+
+dnl Hook into `_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
+dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
+dnl mangled by Autoconf and run in a shell conditional statement.
+m4_define([_AC_COMPILER_EXEEXT],
+m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])])
+
+
+# When config.status generates a header, we must update the stamp-h file.
+# This file resides in the same directory as the config header
+# that is generated.  The stamp files are numbered to have different names.
+
+# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the
+# loop where config.status creates the headers, so we can generate
+# our stamp files there.
+AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK],
+[# Compute $1's index in $config_headers.
+_am_arg=$1
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $_am_arg | $_am_arg:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
+
+# Copyright (C) 2001, 2003, 2005, 2008, 2011 Free Software Foundation,
+# Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 1
+
+# AM_PROG_INSTALL_SH
+# ------------------
+# Define $install_sh.
+AC_DEFUN([AM_PROG_INSTALL_SH],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+if test x"${install_sh}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
+  *)
+    install_sh="\${SHELL} $am_aux_dir/install-sh"
+  esac
+fi
+AC_SUBST(install_sh)])
+
+# Copyright (C) 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 2
+
+# Check whether the underlying file-system supports filenames
+# with a leading dot.  For instance MS-DOS doesn't.
+AC_DEFUN([AM_SET_LEADING_DOT],
+[rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+AC_SUBST([am__leading_dot])])
+
+# Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
+# From Jim Meyering
+
+# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008,
+# 2011 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 5
+
+# AM_MAINTAINER_MODE([DEFAULT-MODE])
+# ----------------------------------
+# Control maintainer-specific portions of Makefiles.
+# Default is to disable them, unless `enable' is passed literally.
+# For symmetry, `disable' may be passed as well.  Anyway, the user
+# can override the default with the --enable/--disable switch.
+AC_DEFUN([AM_MAINTAINER_MODE],
+[m4_case(m4_default([$1], [disable]),
+       [enable], [m4_define([am_maintainer_other], [disable])],
+       [disable], [m4_define([am_maintainer_other], [enable])],
+       [m4_define([am_maintainer_other], [enable])
+        m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])])
+AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
+  dnl maintainer-mode's default is 'disable' unless 'enable' is passed
+  AC_ARG_ENABLE([maintainer-mode],
+[  --][am_maintainer_other][-maintainer-mode  am_maintainer_other make rules and dependencies not useful
+			  (and sometimes confusing) to the casual installer],
+      [USE_MAINTAINER_MODE=$enableval],
+      [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes]))
+  AC_MSG_RESULT([$USE_MAINTAINER_MODE])
+  AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes])
+  MAINT=$MAINTAINER_MODE_TRUE
+  AC_SUBST([MAINT])dnl
+]
+)
+
+AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE])
+
+# Check to see how 'make' treats includes.	            -*- Autoconf -*-
+
+# Copyright (C) 2001, 2002, 2003, 2005, 2009  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 4
+
+# AM_MAKE_INCLUDE()
+# -----------------
+# Check to see how make treats includes.
+AC_DEFUN([AM_MAKE_INCLUDE],
+[am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo this is the am__doit target
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+AC_MSG_CHECKING([for style of include used by $am_make])
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# Ignore all kinds of additional output from `make'.
+case `$am_make -s -f confmf 2> /dev/null` in #(
+*the\ am__doit\ target*)
+  am__include=include
+  am__quote=
+  _am_result=GNU
+  ;;
+esac
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   case `$am_make -s -f confmf 2> /dev/null` in #(
+   *the\ am__doit\ target*)
+     am__include=.include
+     am__quote="\""
+     _am_result=BSD
+     ;;
+   esac
+fi
+AC_SUBST([am__include])
+AC_SUBST([am__quote])
+AC_MSG_RESULT([$_am_result])
+rm -f confinc confmf
+])
+
+# Copyright (C) 1999, 2000, 2001, 2003, 2004, 2005, 2008
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 6
+
+# AM_PROG_CC_C_O
+# --------------
+# Like AC_PROG_CC_C_O, but changed for automake.
+AC_DEFUN([AM_PROG_CC_C_O],
+[AC_REQUIRE([AC_PROG_CC_C_O])dnl
+AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+AC_REQUIRE_AUX_FILE([compile])dnl
+# FIXME: we rely on the cache variable name because
+# there is no other way.
+set dummy $CC
+am_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']`
+eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o
+if test "$am_t" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+dnl Make sure AC_PROG_CC is never called again, or it will override our
+dnl setting of CC.
+m4_define([AC_PROG_CC],
+          [m4_fatal([AC_PROG_CC cannot be called after AM_PROG_CC_C_O])])
+])
+
+# Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
+
+# Copyright (C) 1997, 1999, 2000, 2001, 2003, 2004, 2005, 2008
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 6
+
+# AM_MISSING_PROG(NAME, PROGRAM)
+# ------------------------------
+AC_DEFUN([AM_MISSING_PROG],
+[AC_REQUIRE([AM_MISSING_HAS_RUN])
+$1=${$1-"${am_missing_run}$2"}
+AC_SUBST($1)])
+
+
+# AM_MISSING_HAS_RUN
+# ------------------
+# Define MISSING if not defined so far and test if it supports --run.
+# If it does, set am_missing_run to use it, otherwise, to nothing.
+AC_DEFUN([AM_MISSING_HAS_RUN],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+AC_REQUIRE_AUX_FILE([missing])dnl
+if test x"${MISSING+set}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
+  *)
+    MISSING="\${SHELL} $am_aux_dir/missing" ;;
+  esac
+fi
+# Use eval to expand $SHELL
+if eval "$MISSING --run true"; then
+  am_missing_run="$MISSING --run "
+else
+  am_missing_run=
+  AC_MSG_WARN([`missing' script is too old or missing])
+fi
+])
+
+# Copyright (C) 2003, 2004, 2005, 2006, 2011 Free Software Foundation,
+# Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 1
+
+# AM_PROG_MKDIR_P
+# ---------------
+# Check for `mkdir -p'.
+AC_DEFUN([AM_PROG_MKDIR_P],
+[AC_PREREQ([2.60])dnl
+AC_REQUIRE([AC_PROG_MKDIR_P])dnl
+dnl Automake 1.8 to 1.9.6 used to define mkdir_p.  We now use MKDIR_P,
+dnl while keeping a definition of mkdir_p for backward compatibility.
+dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile.
+dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of
+dnl Makefile.ins that do not define MKDIR_P, so we do our own
+dnl adjustment using top_builddir (which is defined more often than
+dnl MKDIR_P).
+AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl
+case $mkdir_p in
+  [[\\/$]]* | ?:[[\\/]]*) ;;
+  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
+esac
+])
+
+# Helper functions for option handling.                     -*- Autoconf -*-
+
+# Copyright (C) 2001, 2002, 2003, 2005, 2008, 2010 Free Software
+# Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 5
+
+# _AM_MANGLE_OPTION(NAME)
+# -----------------------
+AC_DEFUN([_AM_MANGLE_OPTION],
+[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
+
+# _AM_SET_OPTION(NAME)
+# --------------------
+# Set option NAME.  Presently that only means defining a flag for this option.
+AC_DEFUN([_AM_SET_OPTION],
+[m4_define(_AM_MANGLE_OPTION([$1]), 1)])
+
+# _AM_SET_OPTIONS(OPTIONS)
+# ------------------------
+# OPTIONS is a space-separated list of Automake options.
+AC_DEFUN([_AM_SET_OPTIONS],
+[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
+
+# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET])
+# -------------------------------------------
+# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
+AC_DEFUN([_AM_IF_OPTION],
+[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
+
+# Check to make sure that the build environment is sane.    -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005, 2008
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 5
+
+# AM_SANITY_CHECK
+# ---------------
+AC_DEFUN([AM_SANITY_CHECK],
+[AC_MSG_CHECKING([whether build environment is sane])
+# Just in case
+sleep 1
+echo timestamp > conftest.file
+# Reject unsafe characters in $srcdir or the absolute working directory
+# name.  Accept space and tab only in the latter.
+am_lf='
+'
+case `pwd` in
+  *[[\\\"\#\$\&\'\`$am_lf]]*)
+    AC_MSG_ERROR([unsafe absolute working directory name]);;
+esac
+case $srcdir in
+  *[[\\\"\#\$\&\'\`$am_lf\ \	]]*)
+    AC_MSG_ERROR([unsafe srcdir value: `$srcdir']);;
+esac
+
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
+   if test "$[*]" = "X"; then
+      # -L didn't work.
+      set X `ls -t "$srcdir/configure" conftest.file`
+   fi
+   rm -f conftest.file
+   if test "$[*]" != "X $srcdir/configure conftest.file" \
+      && test "$[*]" != "X conftest.file $srcdir/configure"; then
+
+      # If neither matched, then we have a broken ls.  This can happen
+      # if, for instance, CONFIG_SHELL is bash and it inherits a
+      # broken ls alias from the environment.  This has actually
+      # happened.  Such a system could not be considered "sane".
+      AC_MSG_ERROR([ls -t appears to fail.  Make sure there is not a broken
+alias in your environment])
+   fi
+
+   test "$[2]" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   AC_MSG_ERROR([newly created file is older than distributed files!
+Check your system clock])
+fi
+AC_MSG_RESULT(yes)])
+
+# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 1
+
+# AM_PROG_INSTALL_STRIP
+# ---------------------
+# One issue with vendor `install' (even GNU) is that you can't
+# specify the program used to strip binaries.  This is especially
+# annoying in cross-compiling environments, where the build's strip
+# is unlikely to handle the host's binaries.
+# Fortunately install-sh will honor a STRIPPROG variable, so we
+# always use install-sh in `make install-strip', and initialize
+# STRIPPROG with the value of the STRIP variable (set by the user).
+AC_DEFUN([AM_PROG_INSTALL_STRIP],
+[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
+# Installed binaries are usually stripped using `strip' when the user
+# run `make install-strip'.  However `strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the `STRIP' environment variable to overrule this program.
+dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
+if test "$cross_compiling" != no; then
+  AC_CHECK_TOOL([STRIP], [strip], :)
+fi
+INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
+AC_SUBST([INSTALL_STRIP_PROGRAM])])
+
+# Copyright (C) 2006, 2008, 2010 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 3
+
+# _AM_SUBST_NOTMAKE(VARIABLE)
+# ---------------------------
+# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in.
+# This macro is traced by Automake.
+AC_DEFUN([_AM_SUBST_NOTMAKE])
+
+# AM_SUBST_NOTMAKE(VARIABLE)
+# --------------------------
+# Public sister of _AM_SUBST_NOTMAKE.
+AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
+
+# Check how to create a tarball.                            -*- Autoconf -*-
+
+# Copyright (C) 2004, 2005, 2012 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 2
+
+# _AM_PROG_TAR(FORMAT)
+# --------------------
+# Check how to create a tarball in format FORMAT.
+# FORMAT should be one of `v7', `ustar', or `pax'.
+#
+# Substitute a variable $(am__tar) that is a command
+# writing to stdout a FORMAT-tarball containing the directory
+# $tardir.
+#     tardir=directory && $(am__tar) > result.tar
+#
+# Substitute a variable $(am__untar) that extract such
+# a tarball read from stdin.
+#     $(am__untar) < result.tar
+AC_DEFUN([_AM_PROG_TAR],
+[# Always define AMTAR for backward compatibility.  Yes, it's still used
+# in the wild :-(  We should find a proper way to deprecate it ...
+AC_SUBST([AMTAR], ['$${TAR-tar}'])
+m4_if([$1], [v7],
+     [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
+     [m4_case([$1], [ustar],, [pax],,
+              [m4_fatal([Unknown tar format])])
+AC_MSG_CHECKING([how to create a $1 tar archive])
+# Loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
+_am_tools=${am_cv_prog_tar_$1-$_am_tools}
+# Do not fold the above two line into one, because Tru64 sh and
+# Solaris sh will not grok spaces in the rhs of `-'.
+for _am_tool in $_am_tools
+do
+  case $_am_tool in
+  gnutar)
+    for _am_tar in tar gnutar gtar;
+    do
+      AM_RUN_LOG([$_am_tar --version]) && break
+    done
+    am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
+    am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
+    am__untar="$_am_tar -xf -"
+    ;;
+  plaintar)
+    # Must skip GNU tar: if it does not support --format= it doesn't create
+    # ustar tarball either.
+    (tar --version) >/dev/null 2>&1 && continue
+    am__tar='tar chf - "$$tardir"'
+    am__tar_='tar chf - "$tardir"'
+    am__untar='tar xf -'
+    ;;
+  pax)
+    am__tar='pax -L -x $1 -w "$$tardir"'
+    am__tar_='pax -L -x $1 -w "$tardir"'
+    am__untar='pax -r'
+    ;;
+  cpio)
+    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
+    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
+    am__untar='cpio -i -H $1 -d'
+    ;;
+  none)
+    am__tar=false
+    am__tar_=false
+    am__untar=false
+    ;;
+  esac
+
+  # If the value was cached, stop now.  We just wanted to have am__tar
+  # and am__untar set.
+  test -n "${am_cv_prog_tar_$1}" && break
+
+  # tar/untar a dummy directory, and stop if the command works
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  echo GrepMe > conftest.dir/file
+  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+  rm -rf conftest.dir
+  if test -s conftest.tar; then
+    AM_RUN_LOG([$am__untar <conftest.tar])
+    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+  fi
+done
+rm -rf conftest.dir
+
+AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
+AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+AC_SUBST([am__tar])
+AC_SUBST([am__untar])
+]) # _AM_PROG_TAR
+
+m4_include([m4/acx_mpi.m4])
+m4_include([m4/acx_pthread.m4])
+m4_include([m4/ax_cc_maxopt.m4])
+m4_include([m4/ax_check_compiler_flags.m4])
+m4_include([m4/ax_compiler_vendor.m4])
+m4_include([m4/ax_gcc_aligns_stack.m4])
+m4_include([m4/ax_gcc_version.m4])
+m4_include([m4/ax_openmp.m4])
+m4_include([m4/libtool.m4])
+m4_include([m4/ltoptions.m4])
+m4_include([m4/ltsugar.m4])
+m4_include([m4/ltversion.m4])
+m4_include([m4/lt~obsolete.m4])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,63 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft		\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/reodft
+AM_CFLAGS = $(STACK_ALIGN_CFLAGS)
+
+EXTRA_DIST = f03api.sh genf03.pl fftw3.f03.in
+
+include_HEADERS = fftw3.h fftw3.f fftw3l.f03 fftw3q.f03
+nodist_include_HEADERS = fftw3.f03
+noinst_LTLIBRARIES = libapi.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = api.h x77.h guru.h guru64.h
+
+libapi_la_SOURCES = apiplan.c configure.c execute-dft-c2r.c		\
+execute-dft-r2c.c execute-dft.c execute-r2r.c execute-split-dft-c2r.c	\
+execute-split-dft-r2c.c execute-split-dft.c execute.c			\
+export-wisdom-to-file.c export-wisdom-to-string.c export-wisdom.c	\
+f77api.c flops.c forget-wisdom.c import-system-wisdom.c			\
+import-wisdom-from-file.c import-wisdom-from-string.c import-wisdom.c	\
+malloc.c map-r2r-kind.c mapflags.c mkprinter-file.c mktensor-iodims.c	\
+mktensor-rowmajor.c plan-dft-1d.c plan-dft-2d.c plan-dft-3d.c		\
+plan-dft-c2r-1d.c plan-dft-c2r-2d.c plan-dft-c2r-3d.c plan-dft-c2r.c	\
+plan-dft-r2c-1d.c plan-dft-r2c-2d.c plan-dft-r2c-3d.c plan-dft-r2c.c	\
+plan-dft.c plan-guru-dft-c2r.c plan-guru-dft-r2c.c plan-guru-dft.c	\
+plan-guru-r2r.c plan-guru-split-dft-c2r.c plan-guru-split-dft-r2c.c	\
+plan-guru-split-dft.c plan-many-dft-c2r.c plan-many-dft-r2c.c		\
+plan-many-dft.c plan-many-r2r.c plan-r2r-1d.c plan-r2r-2d.c		\
+plan-r2r-3d.c plan-r2r.c print-plan.c rdft2-pad.c the-planner.c		\
+version.c api.h f77funcs.h fftw3.h x77.h guru.h guru64.h		\
+mktensor-iodims.h plan-guru-dft-c2r.h plan-guru-dft-r2c.h		\
+plan-guru-dft.h plan-guru-r2r.h plan-guru-split-dft-c2r.h		\
+plan-guru-split-dft-r2c.h plan-guru-split-dft.h plan-guru64-dft-c2r.c	\
+plan-guru64-dft-r2c.c plan-guru64-dft.c plan-guru64-r2r.c		\
+plan-guru64-split-dft-c2r.c plan-guru64-split-dft-r2c.c			\
+plan-guru64-split-dft.c mktensor-iodims64.c
+
+BUILT_SOURCES = fftw3.f fftw3.f03.in fftw3.f03 fftw3l.f03 fftw3q.f03
+CLEANFILES = fftw3.f03
+
+fftw3.f03: fftw3.f03.in
+	(echo "! Generated automatically.  DO NOT EDIT!"; echo; \
+         echo "  integer, parameter :: C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@"; \
+         grep -v "Generated automatically" $(srcdir)/fftw3.f03.in) > $@
+
+if MAINTAINER_MODE
+
+# convert constants to F77 PARAMETER statements
+fftw3.f: fftw3.h
+	rm -f $@
+	perl -pe 's/([A-Z0-9_]+)=([+-]?[0-9]+)/\n      INTEGER \1\n      PARAMETER (\1=\2)\n/g' $< |egrep 'PARAMETER|INTEGER' > $@
+	perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n      INTEGER \1\n      PARAMETER (\1=\2)\n/g' $< |egrep 'PARAMETER|INTEGER' >> $@
+	perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n      INTEGER $$1\n      PARAMETER ($$1=",$$2 << $$3,")\n"; }' $< |egrep 'PARAMETER|INTEGER' >> $@
+
+fftw3.f03.in: fftw3.h f03api.sh genf03.pl
+	sh $(srcdir)/f03api.sh d f > $@
+
+fftw3l.f03: fftw3.h f03api.sh genf03.pl
+	sh $(srcdir)/f03api.sh l | grep -v parameter > $@
+
+fftw3q.f03: fftw3.h f03api.sh genf03.pl
+	sh $(srcdir)/f03api.sh q | grep -v parameter > $@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,756 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = api
+DIST_COMMON = $(include_HEADERS) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libapi_la_LIBADD =
+am_libapi_la_OBJECTS = apiplan.lo configure.lo execute-dft-c2r.lo \
+	execute-dft-r2c.lo execute-dft.lo execute-r2r.lo \
+	execute-split-dft-c2r.lo execute-split-dft-r2c.lo \
+	execute-split-dft.lo execute.lo export-wisdom-to-file.lo \
+	export-wisdom-to-string.lo export-wisdom.lo f77api.lo flops.lo \
+	forget-wisdom.lo import-system-wisdom.lo \
+	import-wisdom-from-file.lo import-wisdom-from-string.lo \
+	import-wisdom.lo malloc.lo map-r2r-kind.lo mapflags.lo \
+	mkprinter-file.lo mktensor-iodims.lo mktensor-rowmajor.lo \
+	plan-dft-1d.lo plan-dft-2d.lo plan-dft-3d.lo \
+	plan-dft-c2r-1d.lo plan-dft-c2r-2d.lo plan-dft-c2r-3d.lo \
+	plan-dft-c2r.lo plan-dft-r2c-1d.lo plan-dft-r2c-2d.lo \
+	plan-dft-r2c-3d.lo plan-dft-r2c.lo plan-dft.lo \
+	plan-guru-dft-c2r.lo plan-guru-dft-r2c.lo plan-guru-dft.lo \
+	plan-guru-r2r.lo plan-guru-split-dft-c2r.lo \
+	plan-guru-split-dft-r2c.lo plan-guru-split-dft.lo \
+	plan-many-dft-c2r.lo plan-many-dft-r2c.lo plan-many-dft.lo \
+	plan-many-r2r.lo plan-r2r-1d.lo plan-r2r-2d.lo plan-r2r-3d.lo \
+	plan-r2r.lo print-plan.lo rdft2-pad.lo the-planner.lo \
+	version.lo plan-guru64-dft-c2r.lo plan-guru64-dft-r2c.lo \
+	plan-guru64-dft.lo plan-guru64-r2r.lo \
+	plan-guru64-split-dft-c2r.lo plan-guru64-split-dft-r2c.lo \
+	plan-guru64-split-dft.lo mktensor-iodims64.lo
+libapi_la_OBJECTS = $(am_libapi_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libapi_la_SOURCES)
+DIST_SOURCES = $(libapi_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(includedir)" "$(DESTDIR)$(includedir)"
+HEADERS = $(include_HEADERS) $(nodist_include_HEADERS)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft		\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/reodft
+
+AM_CFLAGS = $(STACK_ALIGN_CFLAGS)
+EXTRA_DIST = f03api.sh genf03.pl fftw3.f03.in
+include_HEADERS = fftw3.h fftw3.f fftw3l.f03 fftw3q.f03
+nodist_include_HEADERS = fftw3.f03
+noinst_LTLIBRARIES = libapi.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = api.h x77.h guru.h guru64.h
+libapi_la_SOURCES = apiplan.c configure.c execute-dft-c2r.c		\
+execute-dft-r2c.c execute-dft.c execute-r2r.c execute-split-dft-c2r.c	\
+execute-split-dft-r2c.c execute-split-dft.c execute.c			\
+export-wisdom-to-file.c export-wisdom-to-string.c export-wisdom.c	\
+f77api.c flops.c forget-wisdom.c import-system-wisdom.c			\
+import-wisdom-from-file.c import-wisdom-from-string.c import-wisdom.c	\
+malloc.c map-r2r-kind.c mapflags.c mkprinter-file.c mktensor-iodims.c	\
+mktensor-rowmajor.c plan-dft-1d.c plan-dft-2d.c plan-dft-3d.c		\
+plan-dft-c2r-1d.c plan-dft-c2r-2d.c plan-dft-c2r-3d.c plan-dft-c2r.c	\
+plan-dft-r2c-1d.c plan-dft-r2c-2d.c plan-dft-r2c-3d.c plan-dft-r2c.c	\
+plan-dft.c plan-guru-dft-c2r.c plan-guru-dft-r2c.c plan-guru-dft.c	\
+plan-guru-r2r.c plan-guru-split-dft-c2r.c plan-guru-split-dft-r2c.c	\
+plan-guru-split-dft.c plan-many-dft-c2r.c plan-many-dft-r2c.c		\
+plan-many-dft.c plan-many-r2r.c plan-r2r-1d.c plan-r2r-2d.c		\
+plan-r2r-3d.c plan-r2r.c print-plan.c rdft2-pad.c the-planner.c		\
+version.c api.h f77funcs.h fftw3.h x77.h guru.h guru64.h		\
+mktensor-iodims.h plan-guru-dft-c2r.h plan-guru-dft-r2c.h		\
+plan-guru-dft.h plan-guru-r2r.h plan-guru-split-dft-c2r.h		\
+plan-guru-split-dft-r2c.h plan-guru-split-dft.h plan-guru64-dft-c2r.c	\
+plan-guru64-dft-r2c.c plan-guru64-dft.c plan-guru64-r2r.c		\
+plan-guru64-split-dft-c2r.c plan-guru64-split-dft-r2c.c			\
+plan-guru64-split-dft.c mktensor-iodims64.c
+
+BUILT_SOURCES = fftw3.f fftw3.f03.in fftw3.f03 fftw3l.f03 fftw3q.f03
+CLEANFILES = fftw3.f03
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu api/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu api/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libapi.la: $(libapi_la_OBJECTS) $(libapi_la_DEPENDENCIES) $(EXTRA_libapi_la_DEPENDENCIES) 
+	$(LINK)  $(libapi_la_OBJECTS) $(libapi_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/apiplan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/configure.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-split-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-split-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute-split-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/execute.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/export-wisdom-to-file.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/export-wisdom-to-string.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/export-wisdom.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/f77api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/flops.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/forget-wisdom.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/import-system-wisdom.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/import-wisdom-from-file.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/import-wisdom-from-string.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/import-wisdom.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/malloc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/map-r2r-kind.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mapflags.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mkprinter-file.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mktensor-iodims.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mktensor-iodims64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mktensor-rowmajor.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-1d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-3d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-c2r-1d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-c2r-2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-c2r-3d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-r2c-1d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-r2c-2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-r2c-3d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-split-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-split-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru-split-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-split-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-split-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-guru64-split-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-many-dft-c2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-many-dft-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-many-dft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-many-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-r2r-1d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-r2r-2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-r2r-3d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/print-plan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-pad.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/the-planner.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/version.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+install-includeHEADERS: $(include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+install-nodist_includeHEADERS: $(nodist_include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(nodist_include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-nodist_includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(nodist_include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)
+installdirs:
+	for dir in "$(DESTDIR)$(includedir)" "$(DESTDIR)$(includedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS install-nodist_includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-nodist_includeHEADERS
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-includeHEADERS install-info \
+	install-info-am install-man install-nodist_includeHEADERS \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am \
+	uninstall-includeHEADERS uninstall-nodist_includeHEADERS
+
+
+fftw3.f03: fftw3.f03.in
+	(echo "! Generated automatically.  DO NOT EDIT!"; echo; \
+         echo "  integer, parameter :: C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@"; \
+         grep -v "Generated automatically" $(srcdir)/fftw3.f03.in) > $@
+
+# convert constants to F77 PARAMETER statements
+@MAINTAINER_MODE_TRUE@fftw3.f: fftw3.h
+@MAINTAINER_MODE_TRUE@	rm -f $@
+@MAINTAINER_MODE_TRUE@	perl -pe 's/([A-Z0-9_]+)=([+-]?[0-9]+)/\n      INTEGER \1\n      PARAMETER (\1=\2)\n/g' $< |egrep 'PARAMETER|INTEGER' > $@
+@MAINTAINER_MODE_TRUE@	perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n      INTEGER \1\n      PARAMETER (\1=\2)\n/g' $< |egrep 'PARAMETER|INTEGER' >> $@
+@MAINTAINER_MODE_TRUE@	perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n      INTEGER $$1\n      PARAMETER ($$1=",$$2 << $$3,")\n"; }' $< |egrep 'PARAMETER|INTEGER' >> $@
+
+@MAINTAINER_MODE_TRUE@fftw3.f03.in: fftw3.h f03api.sh genf03.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03api.sh d f > $@
+
+@MAINTAINER_MODE_TRUE@fftw3l.f03: fftw3.h f03api.sh genf03.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03api.sh l | grep -v parameter > $@
+
+@MAINTAINER_MODE_TRUE@fftw3q.f03: fftw3.h f03api.sh genf03.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03api.sh q | grep -v parameter > $@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/api.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/api.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* internal API definitions */
+#ifndef __API_H__
+#define __API_H__
+
+#ifndef CALLING_FFTW /* defined in hook.c, when calling internal functions */
+#  define COMPILING_FFTW /* used for DLL symbol exporting in fftw3.h */
+#endif
+
+/* When compiling with GNU libtool on Windows, DLL_EXPORT is #defined
+   for compiling the shared-library code.  In this case, we'll #define
+   FFTW_DLL to add dllexport attributes to the specified functions in
+   fftw3.h.
+
+   If we don't specify dllexport explicitly, then libtool
+   automatically exports all symbols.  However, if we specify
+   dllexport explicitly for any functions, then libtool apparently
+   doesn't do any automatic exporting.  (Not documented, grrr, but
+   this is the observed behavior with libtool 1.5.8.)  Thus, using
+   this forces us to correctly dllexport every exported symbol, or
+   linking bench.exe will fail.  This has the advantage of forcing
+   us to mark things correctly, which is necessary for other compilers
+   (such as MS VC++). */
+#ifdef DLL_EXPORT
+#  define FFTW_DLL
+#endif
+
+/* just in case: force <fftw3.h> not to use C99 complex numbers
+   (we need this for IBM xlc because _Complex_I is treated specially
+   and is defined even if <complex.h> is not included) */
+#define FFTW_NO_Complex
+
+#include "fftw3.h"
+#include "ifftw.h"
+#include "rdft.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* the API ``plan'' contains both the kernel plan and problem */
+struct X(plan_s) {
+     plan *pln;
+     problem *prb;
+     int sign;
+};
+
+/* shorthand */
+typedef struct X(plan_s) apiplan;
+
+/* complex type for internal use */
+typedef R C[2];
+
+#define EXTRACT_REIM(sign, c, r, i) X(extract_reim)(sign, (c)[0], r, i)
+
+#define TAINT_UNALIGNED(p, flg) TAINT(p, ((flg) & FFTW_UNALIGNED) != 0)
+
+tensor *X(mktensor_rowmajor)(int rnk, const int *n,
+			     const int *niphys, const int *nophys,
+			     int is, int os);
+
+tensor *X(mktensor_iodims)(int rank, const X(iodim) *dims, int is, int os);
+tensor *X(mktensor_iodims64)(int rank, const X(iodim64) *dims, int is, int os);
+const int *X(rdft2_pad)(int rnk, const int *n, const int *nembed,
+			int inplace, int cmplx, int **nfree);
+
+int X(many_kosherp)(int rnk, const int *n, int howmany);
+int X(guru_kosherp)(int rank, const X(iodim) *dims,
+		    int howmany_rank, const X(iodim) *howmany_dims);
+int X(guru64_kosherp)(int rank, const X(iodim64) *dims,
+		    int howmany_rank, const X(iodim64) *howmany_dims);
+
+/* Note: FFTW_EXTERN is used for "internal" functions used in tests/hook.c */
+
+FFTW_EXTERN printer *X(mkprinter_file)(FILE *f);
+
+FFTW_EXTERN planner *X(the_planner)(void);
+void X(configure_planner)(planner *plnr);
+
+void X(mapflags)(planner *, unsigned);
+
+apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb);
+
+rdft_kind *X(map_r2r_kind)(int rank, const X(r2r_kind) * kind);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif				/* __API_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/apiplan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/apiplan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+static plan *mkplan0(planner *plnr, unsigned flags, 
+		     const problem *prb, int hash_info, 
+		     wisdom_state_t wisdom_state)
+{
+     /* map API flags into FFTW flags */
+     X(mapflags)(plnr, flags);
+
+     plnr->flags.hash_info = hash_info;
+     plnr->wisdom_state = wisdom_state;
+
+     /* create plan */
+     return plnr->adt->mkplan(plnr, prb);
+}
+
+static unsigned force_estimator(unsigned flags)
+{
+     flags &= ~(FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE);
+     return (flags | FFTW_ESTIMATE);
+}
+
+static plan *mkplan(planner *plnr, unsigned flags, 
+		    const problem *prb, int hash_info)
+{
+     plan *pln;
+
+     pln = mkplan0(plnr, flags, prb, hash_info, WISDOM_NORMAL);
+
+     if (plnr->wisdom_state == WISDOM_NORMAL && !pln) {
+	  /* maybe the planner failed because of inconsistent wisdom;
+	     plan again ignoring infeasible wisdom */
+	  pln = mkplan0(plnr, force_estimator(flags), prb, 
+			hash_info, WISDOM_IGNORE_INFEASIBLE);
+     }
+
+     if (plnr->wisdom_state == WISDOM_IS_BOGUS) {
+	  /* if the planner detected a wisdom inconsistency,
+	     forget all wisdom and plan again */
+	  plnr->adt->forget(plnr, FORGET_EVERYTHING);
+
+	  A(!pln);
+	  pln = mkplan0(plnr, flags, prb, hash_info, WISDOM_NORMAL);
+
+	  if (plnr->wisdom_state == WISDOM_IS_BOGUS) {
+	       /* if it still fails, plan without wisdom */
+	       plnr->adt->forget(plnr, FORGET_EVERYTHING);
+
+	       A(!pln);
+	       pln = mkplan0(plnr, force_estimator(flags), 
+			     prb, hash_info, WISDOM_IGNORE_ALL);
+	  }
+     }
+
+     return pln;
+}
+
+apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb)
+{
+     apiplan *p = 0;
+     plan *pln;
+     unsigned flags_used_for_planning;
+     planner *plnr = X(the_planner)();
+     unsigned int pats[] = {FFTW_ESTIMATE, FFTW_MEASURE,
+			    FFTW_PATIENT, FFTW_EXHAUSTIVE};
+     int pat, pat_max;
+     double pcost = 0;
+
+     if (flags & FFTW_WISDOM_ONLY) {
+	  /* Special mode that returns a plan only if wisdom is present,
+	     and returns 0 otherwise.  This is now documented in the manual,
+	     as a way to detect whether wisdom is available for a problem. */
+	  flags_used_for_planning = flags;
+	  pln = mkplan0(plnr, flags, prb, 0, WISDOM_ONLY);
+     } else {
+	  pat_max = flags & FFTW_ESTIMATE ? 0 :
+	       (flags & FFTW_EXHAUSTIVE ? 3 :
+		(flags & FFTW_PATIENT ? 2 : 1));
+	  pat = plnr->timelimit >= 0 ? 0 : pat_max;
+
+	  flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | 
+		     FFTW_PATIENT | FFTW_EXHAUSTIVE);
+
+	  plnr->start_time = X(get_crude_time)();
+	  
+	  /* plan at incrementally increasing patience until we run
+	     out of time */
+	  for (pln = 0, flags_used_for_planning = 0; pat <= pat_max; ++pat) {
+	       plan *pln1;
+	       unsigned tmpflags = flags | pats[pat];
+	       pln1 = mkplan(plnr, tmpflags, prb, 0);
+
+	       if (!pln1) {
+		    /* don't bother continuing if planner failed or timed out */
+		    A(!pln || plnr->timed_out);
+		    break;
+	       }
+
+	       X(plan_destroy_internal)(pln);
+	       pln = pln1;
+	       flags_used_for_planning = tmpflags;
+	       pcost = pln->pcost;
+	  }
+     }
+
+     if (pln) {
+	  /* build apiplan */
+	  p = (apiplan *) MALLOC(sizeof(apiplan), PLANS);
+	  p->prb = prb;
+	  p->sign = sign; /* cache for execute_dft */
+	  
+	  /* re-create plan from wisdom, adding blessing */
+	  p->pln = mkplan(plnr, flags_used_for_planning, prb, BLESSING);
+
+	  /* record pcost from most recent measurement for use in X(cost) */
+	  p->pln->pcost = pcost;
+
+	  if (sizeof(trigreal) > sizeof(R)) {
+	       /* this is probably faster, and we have enough trigreal
+		  bits to maintain accuracy */
+	       X(plan_awake)(p->pln, AWAKE_SQRTN_TABLE);
+	  } else {
+	       /* more accurate */
+	       X(plan_awake)(p->pln, AWAKE_SINCOS);
+	  }
+	  
+	  /* we don't use pln for p->pln, above, since by re-creating the
+	     plan we might use more patient wisdom from a timed-out mkplan */
+	  X(plan_destroy_internal)(pln);
+     } else
+	  X(problem_destroy)(prb);
+     
+     /* discard all information not necessary to reconstruct the plan */
+     plnr->adt->forget(plnr, FORGET_ACCURSED);
+
+#ifdef FFTW_RANDOM_ESTIMATOR
+     X(random_estimate_seed)++; /* subsequent "random" plans are distinct */
+#endif
+     
+     return p;
+}
+
+void X(destroy_plan)(X(plan) p)
+{
+     if (p) {
+          X(plan_awake)(p->pln, SLEEPY);
+          X(plan_destroy_internal)(p->pln);
+          X(problem_destroy)(p->prb);
+          X(ifree)(p);
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/configure.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/configure.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+#include "rdft.h"
+#include "reodft.h"
+
+void X(configure_planner)(planner *plnr)
+{
+     X(dft_conf_standard)(plnr);
+     X(rdft_conf_standard)(plnr);
+     X(reodft_conf_standard)(plnr);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+/* guru interface: requires care in alignment, r - i, etcetera. */
+void X(execute_dft_c2r)(const X(plan) p, C *in, R *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) p->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) p->prb;
+     pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+/* guru interface: requires care in alignment, r - i, etcetera. */
+void X(execute_dft_r2c)(const X(plan) p, R *in, C *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) p->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) p->prb;
+     pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+/* guru interface: requires care in alignment etcetera. */
+void X(execute_dft)(const X(plan) p, C *in, C *out)
+{
+     plan_dft *pln = (plan_dft *) p->pln;
+     if (p->sign == FFT_SIGN)
+	  pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
+     else
+	  pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+/* guru interface: requires care in alignment, etcetera. */
+void X(execute_r2r)(const X(plan) p, R *in, R *out)
+{
+     plan_rdft *pln = (plan_rdft *) p->pln;
+     pln->apply((plan *) pln, in, out);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-split-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-split-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+/* guru interface: requires care in alignment, r - i, etcetera. */
+void X(execute_split_dft_c2r)(const X(plan) p, R *ri, R *ii, R *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) p->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) p->prb;
+     pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-split-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-split-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+/* guru interface: requires care in alignment, r - i, etcetera. */
+void X(execute_split_dft_r2c)(const X(plan) p, R *in, R *ro, R *io)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) p->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) p->prb;
+     pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute-split-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute-split-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+/* guru interface: requires care in alignment, r - i, etcetera. */
+void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, R *ro, R *io)
+{
+     plan_dft *pln = (plan_dft *) p->pln;
+     pln->apply((plan *) pln, ri, ii, ro, io);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/execute.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/execute.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+void X(execute)(const X(plan) p)
+{
+     plan *pln = p->pln;
+     pln->adt->solve(pln, p->prb);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/export-wisdom-to-file.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/export-wisdom-to-file.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+void X(export_wisdom_to_file)(FILE *output_file)
+{
+     printer *p = X(mkprinter_file)(output_file);
+     planner *plnr = X(the_planner)();
+     plnr->adt->exprt(plnr, p);
+     X(printer_destroy)(p);
+}
+
+int X(export_wisdom_to_filename)(const char *filename)
+{
+     FILE *f = fopen(filename, "w");
+     int ret;
+     if (!f) return 0; /* error opening file */
+     X(export_wisdom_to_file)(f);
+     ret = !ferror(f);
+     if (fclose(f)) ret = 0; /* error closing file */
+     return ret;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/export-wisdom-to-string.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/export-wisdom-to-string.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+typedef struct {
+     printer super;
+     int *cnt;
+} P_cnt;
+
+static void putchr_cnt(printer * p_, char c)
+{
+     P_cnt *p = (P_cnt *) p_;
+     UNUSED(c);
+     ++*p->cnt;
+}
+
+static printer *mkprinter_cnt(int *cnt)
+{
+     P_cnt *p = (P_cnt *) X(mkprinter)(sizeof(P_cnt), putchr_cnt, 0);
+     p->cnt = cnt;
+     *cnt = 0;
+     return &p->super;
+}
+
+typedef struct {
+     printer super;
+     char *s;
+} P_str;
+
+static void putchr_str(printer * p_, char c)
+{
+     P_str *p = (P_str *) p_;
+     *p->s++ = c;
+     *p->s = 0;
+}
+
+static printer *mkprinter_str(char *s)
+{
+     P_str *p = (P_str *) X(mkprinter)(sizeof(P_str), putchr_str, 0);
+     p->s = s;
+     *s = 0;
+     return &p->super;
+}
+
+char *X(export_wisdom_to_string)(void)
+{
+     printer *p;
+     planner *plnr = X(the_planner)();
+     int cnt;
+     char *s;
+
+     p = mkprinter_cnt(&cnt);
+     plnr->adt->exprt(plnr, p);
+     X(printer_destroy)(p);
+
+     s = (char *) malloc(sizeof(char) * (cnt + 1));
+     if (s) {
+          p = mkprinter_str(s);
+          plnr->adt->exprt(plnr, p);
+          X(printer_destroy)(p);
+     }
+
+     return s;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/export-wisdom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/export-wisdom.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+typedef struct {
+     printer super;
+     void (*write_char)(char c, void *);
+     void *data;
+} P;
+
+static void putchr_generic(printer * p_, char c)
+{
+     P *p = (P *) p_;
+     (p->write_char)(c, p->data);
+}
+
+void X(export_wisdom)(void (*write_char)(char c, void *), void *data)
+{
+     P *p = (P *) X(mkprinter)(sizeof(P), putchr_generic, 0);
+     planner *plnr = X(the_planner)();
+
+     p->write_char = write_char;
+     p->data = data;
+     plnr->adt->exprt(plnr, (printer *) p);
+     X(printer_destroy)((printer *) p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/f03api.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/f03api.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,42 @@
+#! /bin/sh
+
+# Script to generate Fortran 2003 interface declarations for FFTW from
+# the fftw3.h header file.
+
+# This is designed so that the Fortran caller can do:
+#   use, intrinsic :: iso_c_binding
+#   implicit none
+#   include 'fftw3.f03'
+# and then call the C FFTW functions directly, with type checking.
+
+echo "! Generated automatically.  DO NOT EDIT!"
+echo
+
+# C_FFTW_R2R_KIND is determined by configure and inserted by the Makefile
+# echo "  integer, parameter :: C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@"
+
+# Extract constants
+perl -pe 's/([A-Z0-9_]+)=([+-]?[0-9]+)/\n  integer\(C_INT\), parameter :: \1 = \2\n/g' < fftw3.h | grep 'integer(C_INT)'
+perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n  integer\(C_INT\), parameter :: \1 = \2\n/g' < fftw3.h | grep 'integer(C_INT)'
+perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n  integer\(C_INT\), parameter :: $1 = ",$2 << $3,"\n"; }' < fftw3.h | grep 'integer(C_INT)'
+
+# Extract function declarations
+for p in $*; do
+    if test "$p" = "d"; then p=""; fi
+
+    echo
+    cat <<EOF
+  type, bind(C) :: fftw${p}_iodim
+     integer(C_INT) n, is, os
+  end type fftw${p}_iodim
+  type, bind(C) :: fftw${p}_iodim64
+     integer(C_INTPTR_T) n, is, os
+  end type fftw${p}_iodim64
+EOF
+
+    echo
+    echo "  interface"
+    gcc -D__GNUC__=5 -D__i386__ -E fftw3.h |grep "fftw${p}_plan_dft" |tr ';' '\n' | grep -v "fftw${p}_execute(" | perl genf03.pl
+    echo "  end interface"
+
+done
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/f77api.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/f77api.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+#include "rdft.h"
+
+#include "x77.h"
+
+/* if F77_FUNC is not defined, then we don't know how to mangle identifiers
+   for the Fortran linker, and we must omit the f77 API. */
+#if defined(F77_FUNC) || defined(WINDOWS_F77_MANGLING)
+
+/*-----------------------------------------------------------------------*/
+/* some internal functions used by the f77 api */
+
+/* in fortran, the natural array ordering is column-major, which
+   corresponds to reversing the dimensions relative to C's row-major */
+static int *reverse_n(int rnk, const int *n)
+{
+     int *nrev;
+     int i;
+     A(FINITE_RNK(rnk));
+     nrev = (int *) MALLOC(sizeof(int) * rnk, PROBLEMS);
+     for (i = 0; i < rnk; ++i)
+          nrev[rnk - i - 1] = n[i];
+     return nrev;
+}
+
+/* f77 doesn't have data structures, so we have to pass iodims as
+   parallel arrays */
+static X(iodim) *make_dims(int rnk, const int *n,
+			   const int *is, const int *os)
+{
+     X(iodim) *dims;
+     int i;
+     A(FINITE_RNK(rnk));
+     dims = (X(iodim) *) MALLOC(sizeof(X(iodim)) * rnk, PROBLEMS);
+     for (i = 0; i < rnk; ++i) {
+          dims[i].n = n[i];
+          dims[i].is = is[i];
+          dims[i].os = os[i];
+     }
+     return dims;
+}
+
+typedef struct {
+     void (*f77_write_char)(char *, void *);
+     void *data;
+} write_char_data;
+
+static void write_char(char c, void *d)
+{
+     write_char_data *ad = (write_char_data *) d;
+     ad->f77_write_char(&c, ad->data);
+}
+
+typedef struct {
+     void (*f77_read_char)(int *, void *);
+     void *data;
+} read_char_data;
+
+static int read_char(void *d)
+{
+     read_char_data *ed = (read_char_data *) d;
+     int c;
+     ed->f77_read_char(&c, ed->data);
+     return (c < 0 ? EOF : c);
+}
+
+static X(r2r_kind) *ints2kinds(int rnk, const int *ik)
+{
+     if (!FINITE_RNK(rnk) || rnk == 0)
+	  return 0;
+     else {
+	  int i;
+	  X(r2r_kind) *k;
+
+	  k = (X(r2r_kind) *) MALLOC(sizeof(X(r2r_kind)) * rnk, PROBLEMS);
+	  /* reverse order for Fortran -> C */
+	  for (i = 0; i < rnk; ++i)
+	       k[i] = (X(r2r_kind)) ik[rnk - 1 - i];
+	  return k;
+     }
+}
+
+/*-----------------------------------------------------------------------*/
+
+#define F77(a, A) F77x(x77(a), X77(A))
+
+#ifndef WINDOWS_F77_MANGLING
+
+#if defined(F77_FUNC)
+#  define F77x(a, A) F77_FUNC(a, A)
+#  include "f77funcs.h"
+#endif
+
+/* If identifiers with underscores are mangled differently than those
+   without underscores, then we include *both* mangling versions.  The
+   reason is that the only Fortran compiler that does such differing
+   mangling is currently g77 (which adds an extra underscore to names
+   with underscores), whereas other compilers running on the same
+   machine are likely to use non-underscored mangling.  (I'm sick
+   of users complaining that FFTW works with g77 but not with e.g.
+   pgf77 or ifc on the same machine.)  Note that all FFTW identifiers
+   contain underscores, and configure picks g77 by default. */
+#if defined(F77_FUNC_) && !defined(F77_FUNC_EQUIV)
+#  undef F77x
+#  define F77x(a, A) F77_FUNC_(a, A)
+#  include "f77funcs.h"
+#endif
+
+#else /* WINDOWS_F77_MANGLING */
+
+/* Various mangling conventions common (?) under Windows. */
+
+/* g77 */
+#  define WINDOWS_F77_FUNC(a, A) a ## __
+#  define F77x(a, A) WINDOWS_F77_FUNC(a, A)
+#  include "f77funcs.h"
+
+/* Intel, etc. */
+#  undef WINDOWS_F77_FUNC
+#  define WINDOWS_F77_FUNC(a, A) a ## _
+#  include "f77funcs.h"
+
+/* Digital/Compaq/HP Visual Fortran, Intel Fortran.  stdcall attribute
+   is apparently required to adjust for calling conventions (callee
+   pops stack in stdcall).  See also:
+       http://msdn.microsoft.com/library/en-us/vccore98/html/_core_mixed.2d.language_programming.3a_.overview.asp
+*/
+#  undef WINDOWS_F77_FUNC
+#  if defined(__GNUC__)
+#    define WINDOWS_F77_FUNC(a, A) __attribute__((stdcall)) A
+#  elif defined(_MSC_VER) || defined(_ICC) || defined(_STDCALL_SUPPORTED)
+#    define WINDOWS_F77_FUNC(a, A) __stdcall A
+#  else
+#    define WINDOWS_F77_FUNC(a, A) A /* oh well */
+#  endif
+#  include "f77funcs.h"
+
+#endif /* WINDOWS_F77_MANGLING */
+
+#endif				/* F77_FUNC */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/f77funcs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/f77funcs.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Functions in the FFTW Fortran API, mangled according to the
+   F77(...) macro.  This file is designed to be #included by
+   f77api.c, possibly multiple times in order to support multiple
+   compiler manglings (via redefinition of F77). */
+
+FFTW_VOIDFUNC F77(execute, EXECUTE)(X(plan) * const p)
+{
+     plan *pln = (*p)->pln;
+     pln->adt->solve(pln, (*p)->prb);
+}
+
+FFTW_VOIDFUNC F77(destroy_plan, DESTROY_PLAN)(X(plan) *p)
+{
+     X(destroy_plan)(*p);
+}
+
+FFTW_VOIDFUNC F77(cleanup, CLEANUP)(void)
+{
+     X(cleanup)();
+}
+
+FFTW_VOIDFUNC F77(forget_wisdom, FORGET_WISDOM)(void)
+{
+     X(forget_wisdom)();
+}
+
+FFTW_VOIDFUNC F77(export_wisdom, EXPORT_WISDOM)(void (*f77_write_char)(char *, void *),
+				       void *data)
+{
+     write_char_data ad;
+     ad.f77_write_char = f77_write_char;
+     ad.data = data;
+     X(export_wisdom)(write_char, (void *) &ad);
+}
+
+FFTW_VOIDFUNC F77(import_wisdom, IMPORT_WISDOM)(int *isuccess,
+				       void (*f77_read_char)(int *, void *),
+				       void *data)
+{
+     read_char_data ed;
+     ed.f77_read_char = f77_read_char;
+     ed.data = data;
+     *isuccess = X(import_wisdom)(read_char, (void *) &ed);
+}
+
+FFTW_VOIDFUNC F77(import_system_wisdom, IMPORT_SYSTEM_WISDOM)(int *isuccess)
+{
+     *isuccess = X(import_system_wisdom)();
+}
+
+FFTW_VOIDFUNC F77(print_plan, PRINT_PLAN)(X(plan) * const p)
+{
+     X(print_plan)(*p);
+     fflush(stdout);
+}
+
+FFTW_VOIDFUNC F77(flops,FLOPS)(X(plan) *p, double *add, double *mul, double *fma)
+{
+     X(flops)(*p, add, mul, fma);
+}
+
+FFTW_VOIDFUNC F77(estimate_cost,ESTIMATE_COST)(double *cost, X(plan) * const p)
+{
+     *cost = X(estimate_cost)(*p);
+}
+
+FFTW_VOIDFUNC F77(cost,COST)(double *cost, X(plan) * const p)
+{
+     *cost = X(cost)(*p);
+}
+
+FFTW_VOIDFUNC F77(set_timelimit,SET_TIMELIMIT)(double *t)
+{
+     X(set_timelimit)(*t);
+}
+
+/******************************** DFT ***********************************/
+
+FFTW_VOIDFUNC F77(plan_dft, PLAN_DFT)(X(plan) *p, int *rank, const int *n,
+			     C *in, C *out, int *sign, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     *p = X(plan_dft)(*rank, nrev, in, out, *sign, *flags);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_1d, PLAN_DFT_1D)(X(plan) *p, int *n, C *in, C *out,
+				   int *sign, int *flags)
+{
+     *p = X(plan_dft_1d)(*n, in, out, *sign, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_2d, PLAN_DFT_2D)(X(plan) *p, int *nx, int *ny,
+				   C *in, C *out, int *sign, int *flags)
+{
+     *p = X(plan_dft_2d)(*ny, *nx, in, out, *sign, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_3d, PLAN_DFT_3D)(X(plan) *p, int *nx, int *ny, int *nz,
+				   C *in, C *out,
+				   int *sign, int *flags)
+{
+     *p = X(plan_dft_3d)(*nz, *ny, *nx, in, out, *sign, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_many_dft, PLAN_MANY_DFT)(X(plan) *p, int *rank, const int *n,
+				       int *howmany,
+				       C *in, const int *inembed,
+				       int *istride, int *idist,
+				       C *out, const int *onembed,
+				       int *ostride, int *odist,
+				       int *sign, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     int *inembedrev = reverse_n(*rank, inembed);
+     int *onembedrev = reverse_n(*rank, onembed);
+     *p = X(plan_many_dft)(*rank, nrev, *howmany,
+			   in, inembedrev, *istride, *idist,
+			   out, onembedrev, *ostride, *odist,
+			   *sign, *flags);
+     X(ifree0)(onembedrev);
+     X(ifree0)(inembedrev);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_dft, PLAN_GURU_DFT)(X(plan) *p, int *rank, const int *n,
+				       const int *is, const int *os,
+				       int *howmany_rank, const int *h_n,
+				       const int *h_is, const int *h_os,
+				       C *in, C *out, int *sign, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_dft)(*rank, dims, *howmany_rank, howmany_dims,
+			   in, out, *sign, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_split_dft, PLAN_GURU_SPLIT_DFT)(X(plan) *p, int *rank, const int *n,
+				       const int *is, const int *os,
+				       int *howmany_rank, const int *h_n,
+				       const int *h_is, const int *h_os,
+				       R *ri, R *ii, R *ro, R *io, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_split_dft)(*rank, dims, *howmany_rank, howmany_dims,
+			   ri, ii, ro, io, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(execute_dft, EXECUTE_DFT)(X(plan) * const p, C *in, C *out)
+{
+     plan_dft *pln = (plan_dft *) (*p)->pln;
+     if ((*p)->sign == FFT_SIGN)
+          pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
+     else
+          pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
+}
+
+FFTW_VOIDFUNC F77(execute_split_dft, EXECUTE_SPLIT_DFT)(X(plan) * const p,
+					       R *ri, R *ii, R *ro, R *io)
+{
+     plan_dft *pln = (plan_dft *) (*p)->pln;
+     pln->apply((plan *) pln, ri, ii, ro, io);
+}
+
+/****************************** DFT r2c *********************************/
+
+FFTW_VOIDFUNC F77(plan_dft_r2c, PLAN_DFT_R2C)(X(plan) *p, int *rank, const int *n,
+				     R *in, C *out, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     *p = X(plan_dft_r2c)(*rank, nrev, in, out, *flags);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_r2c_1d, PLAN_DFT_R2C_1D)(X(plan) *p, int *n, R *in, C *out,
+					   int *flags)
+{
+     *p = X(plan_dft_r2c_1d)(*n, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_r2c_2d, PLAN_DFT_R2C_2D)(X(plan) *p, int *nx, int *ny,
+					   R *in, C *out, int *flags)
+{
+     *p = X(plan_dft_r2c_2d)(*ny, *nx, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_r2c_3d, PLAN_DFT_R2C_3D)(X(plan) *p,
+					   int *nx, int *ny, int *nz,
+					   R *in, C *out,
+					   int *flags)
+{
+     *p = X(plan_dft_r2c_3d)(*nz, *ny, *nx, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_many_dft_r2c, PLAN_MANY_DFT_R2C)(
+     X(plan) *p, int *rank, const int *n,
+     int *howmany,
+     R *in, const int *inembed, int *istride, int *idist,
+     C *out, const int *onembed, int *ostride, int *odist,
+     int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     int *inembedrev = reverse_n(*rank, inembed);
+     int *onembedrev = reverse_n(*rank, onembed);
+     *p = X(plan_many_dft_r2c)(*rank, nrev, *howmany,
+			       in, inembedrev, *istride, *idist,
+			       out, onembedrev, *ostride, *odist,
+			       *flags);
+     X(ifree0)(onembedrev);
+     X(ifree0)(inembedrev);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_dft_r2c, PLAN_GURU_DFT_R2C)(
+     X(plan) *p, int *rank, const int *n,
+     const int *is, const int *os,
+     int *howmany_rank, const int *h_n,
+     const int *h_is, const int *h_os,
+     R *in, C *out, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_dft_r2c)(*rank, dims, *howmany_rank, howmany_dims,
+			       in, out, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_split_dft_r2c, PLAN_GURU_SPLIT_DFT_R2C)(
+     X(plan) *p, int *rank, const int *n,
+     const int *is, const int *os,
+     int *howmany_rank, const int *h_n,
+     const int *h_is, const int *h_os,
+     R *in, R *ro, R *io, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_split_dft_r2c)(*rank, dims, *howmany_rank, howmany_dims,
+			       in, ro, io, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(execute_dft_r2c, EXECUTE_DFT_R2C)(X(plan) * const p, R *in, C *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
+     pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
+}
+
+FFTW_VOIDFUNC F77(execute_split_dft_r2c, EXECUTE_SPLIT_DFT_R2C)(X(plan) * const p,
+						       R *in, R *ro, R *io)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
+     pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
+}
+
+/****************************** DFT c2r *********************************/
+
+FFTW_VOIDFUNC F77(plan_dft_c2r, PLAN_DFT_C2R)(X(plan) *p, int *rank, const int *n,
+				     C *in, R *out, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     *p = X(plan_dft_c2r)(*rank, nrev, in, out, *flags);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_c2r_1d, PLAN_DFT_C2R_1D)(X(plan) *p, int *n, C *in, R *out,
+					   int *flags)
+{
+     *p = X(plan_dft_c2r_1d)(*n, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_c2r_2d, PLAN_DFT_C2R_2D)(X(plan) *p, int *nx, int *ny,
+					   C *in, R *out, int *flags)
+{
+     *p = X(plan_dft_c2r_2d)(*ny, *nx, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_dft_c2r_3d, PLAN_DFT_C2R_3D)(X(plan) *p,
+					   int *nx, int *ny, int *nz,
+					   C *in, R *out,
+					   int *flags)
+{
+     *p = X(plan_dft_c2r_3d)(*nz, *ny, *nx, in, out, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_many_dft_c2r, PLAN_MANY_DFT_C2R)(
+     X(plan) *p, int *rank, const int *n,
+     int *howmany,
+     C *in, const int *inembed, int *istride, int *idist,
+     R *out, const int *onembed, int *ostride, int *odist,
+     int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     int *inembedrev = reverse_n(*rank, inembed);
+     int *onembedrev = reverse_n(*rank, onembed);
+     *p = X(plan_many_dft_c2r)(*rank, nrev, *howmany,
+			       in, inembedrev, *istride, *idist,
+			       out, onembedrev, *ostride, *odist,
+			       *flags);
+     X(ifree0)(onembedrev);
+     X(ifree0)(inembedrev);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_dft_c2r, PLAN_GURU_DFT_C2R)(
+     X(plan) *p, int *rank, const int *n,
+     const int *is, const int *os,
+     int *howmany_rank, const int *h_n,
+     const int *h_is, const int *h_os,
+     C *in, R *out, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_dft_c2r)(*rank, dims, *howmany_rank, howmany_dims,
+			       in, out, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_split_dft_c2r, PLAN_GURU_SPLIT_DFT_C2R)(
+     X(plan) *p, int *rank, const int *n,
+     const int *is, const int *os,
+     int *howmany_rank, const int *h_n,
+     const int *h_is, const int *h_os,
+     R *ri, R *ii, R *out, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     *p = X(plan_guru_split_dft_c2r)(*rank, dims, *howmany_rank, howmany_dims,
+			       ri, ii, out, *flags);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(execute_dft_c2r, EXECUTE_DFT_C2R)(X(plan) * const p, C *in, R *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
+     pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
+}
+
+FFTW_VOIDFUNC F77(execute_split_dft_c2r, EXECUTE_SPLIT_DFT_C2R)(X(plan) * const p,
+					   R *ri, R *ii, R *out)
+{
+     plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
+     problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
+     pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
+}
+
+/****************************** r2r *********************************/
+
+FFTW_VOIDFUNC F77(plan_r2r, PLAN_R2R)(X(plan) *p, int *rank, const int *n,
+			     R *in, R *out,
+			     int *kind, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     X(r2r_kind) *k = ints2kinds(*rank, kind);
+     *p = X(plan_r2r)(*rank, nrev, in, out, k, *flags);
+     X(ifree0)(k);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_r2r_1d, PLAN_R2R_1D)(X(plan) *p, int *n, R *in, R *out,
+				   int *kind, int *flags)
+{
+     *p = X(plan_r2r_1d)(*n, in, out, (X(r2r_kind)) *kind, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_r2r_2d, PLAN_R2R_2D)(X(plan) *p, int *nx, int *ny,
+				   R *in, R *out, 
+				   int *kindx, int *kindy, int *flags)
+{
+     *p = X(plan_r2r_2d)(*ny, *nx, in, out,
+			 (X(r2r_kind)) *kindy, (X(r2r_kind)) *kindx, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_r2r_3d, PLAN_R2R_3D)(X(plan) *p,
+				   int *nx, int *ny, int *nz,
+				   R *in, R *out,
+				   int *kindx, int *kindy, int *kindz,
+				   int *flags)
+{
+     *p = X(plan_r2r_3d)(*nz, *ny, *nx, in, out,
+			 (X(r2r_kind)) *kindz, (X(r2r_kind)) *kindy, 
+			 (X(r2r_kind)) *kindx, *flags);
+}
+
+FFTW_VOIDFUNC F77(plan_many_r2r, PLAN_MANY_R2R)(
+     X(plan) *p, int *rank, const int *n,
+     int *howmany,
+     R *in, const int *inembed, int *istride, int *idist,
+     R *out, const int *onembed, int *ostride, int *odist,
+     int *kind, int *flags)
+{
+     int *nrev = reverse_n(*rank, n);
+     int *inembedrev = reverse_n(*rank, inembed);
+     int *onembedrev = reverse_n(*rank, onembed);
+     X(r2r_kind) *k = ints2kinds(*rank, kind);
+     *p = X(plan_many_r2r)(*rank, nrev, *howmany,
+			       in, inembedrev, *istride, *idist,
+			       out, onembedrev, *ostride, *odist,
+			       k, *flags);
+     X(ifree0)(k);
+     X(ifree0)(onembedrev);
+     X(ifree0)(inembedrev);
+     X(ifree0)(nrev);
+}
+
+FFTW_VOIDFUNC F77(plan_guru_r2r, PLAN_GURU_R2R)(
+     X(plan) *p, int *rank, const int *n,
+     const int *is, const int *os,
+     int *howmany_rank, const int *h_n,
+     const int *h_is, const int *h_os,
+     R *in, R *out, int *kind, int *flags)
+{
+     X(iodim) *dims = make_dims(*rank, n, is, os);
+     X(iodim) *howmany_dims = make_dims(*howmany_rank, h_n, h_is, h_os);
+     X(r2r_kind) *k = ints2kinds(*rank, kind);
+     *p = X(plan_guru_r2r)(*rank, dims, *howmany_rank, howmany_dims,
+			       in, out, k, *flags);
+     X(ifree0)(k);
+     X(ifree0)(howmany_dims);
+     X(ifree0)(dims);
+}
+
+FFTW_VOIDFUNC F77(execute_r2r, EXECUTE_R2R)(X(plan) * const p, R *in, R *out)
+{
+     plan_rdft *pln = (plan_rdft *) (*p)->pln;
+     pln->apply((plan *) pln, in, out);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/fftw3.f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/fftw3.f	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,72 @@
+      INTEGER FFTW_R2HC
+      PARAMETER (FFTW_R2HC=0)
+      INTEGER FFTW_HC2R
+      PARAMETER (FFTW_HC2R=1)
+      INTEGER FFTW_DHT
+      PARAMETER (FFTW_DHT=2)
+      INTEGER FFTW_REDFT00
+      PARAMETER (FFTW_REDFT00=3)
+      INTEGER FFTW_REDFT01
+      PARAMETER (FFTW_REDFT01=4)
+      INTEGER FFTW_REDFT10
+      PARAMETER (FFTW_REDFT10=5)
+      INTEGER FFTW_REDFT11
+      PARAMETER (FFTW_REDFT11=6)
+      INTEGER FFTW_RODFT00
+      PARAMETER (FFTW_RODFT00=7)
+      INTEGER FFTW_RODFT01
+      PARAMETER (FFTW_RODFT01=8)
+      INTEGER FFTW_RODFT10
+      PARAMETER (FFTW_RODFT10=9)
+      INTEGER FFTW_RODFT11
+      PARAMETER (FFTW_RODFT11=10)
+      INTEGER FFTW_FORWARD
+      PARAMETER (FFTW_FORWARD=-1)
+      INTEGER FFTW_BACKWARD
+      PARAMETER (FFTW_BACKWARD=+1)
+      INTEGER FFTW_MEASURE
+      PARAMETER (FFTW_MEASURE=0)
+      INTEGER FFTW_DESTROY_INPUT
+      PARAMETER (FFTW_DESTROY_INPUT=1)
+      INTEGER FFTW_UNALIGNED
+      PARAMETER (FFTW_UNALIGNED=2)
+      INTEGER FFTW_CONSERVE_MEMORY
+      PARAMETER (FFTW_CONSERVE_MEMORY=4)
+      INTEGER FFTW_EXHAUSTIVE
+      PARAMETER (FFTW_EXHAUSTIVE=8)
+      INTEGER FFTW_PRESERVE_INPUT
+      PARAMETER (FFTW_PRESERVE_INPUT=16)
+      INTEGER FFTW_PATIENT
+      PARAMETER (FFTW_PATIENT=32)
+      INTEGER FFTW_ESTIMATE
+      PARAMETER (FFTW_ESTIMATE=64)
+      INTEGER FFTW_WISDOM_ONLY
+      PARAMETER (FFTW_WISDOM_ONLY=2097152)
+      INTEGER FFTW_ESTIMATE_PATIENT
+      PARAMETER (FFTW_ESTIMATE_PATIENT=128)
+      INTEGER FFTW_BELIEVE_PCOST
+      PARAMETER (FFTW_BELIEVE_PCOST=256)
+      INTEGER FFTW_NO_DFT_R2HC
+      PARAMETER (FFTW_NO_DFT_R2HC=512)
+      INTEGER FFTW_NO_NONTHREADED
+      PARAMETER (FFTW_NO_NONTHREADED=1024)
+      INTEGER FFTW_NO_BUFFERING
+      PARAMETER (FFTW_NO_BUFFERING=2048)
+      INTEGER FFTW_NO_INDIRECT_OP
+      PARAMETER (FFTW_NO_INDIRECT_OP=4096)
+      INTEGER FFTW_ALLOW_LARGE_GENERIC
+      PARAMETER (FFTW_ALLOW_LARGE_GENERIC=8192)
+      INTEGER FFTW_NO_RANK_SPLITS
+      PARAMETER (FFTW_NO_RANK_SPLITS=16384)
+      INTEGER FFTW_NO_VRANK_SPLITS
+      PARAMETER (FFTW_NO_VRANK_SPLITS=32768)
+      INTEGER FFTW_NO_VRECURSE
+      PARAMETER (FFTW_NO_VRECURSE=65536)
+      INTEGER FFTW_NO_SIMD
+      PARAMETER (FFTW_NO_SIMD=131072)
+      INTEGER FFTW_NO_SLOW
+      PARAMETER (FFTW_NO_SLOW=262144)
+      INTEGER FFTW_NO_FIXED_RADIX_LARGE_N
+      PARAMETER (FFTW_NO_FIXED_RADIX_LARGE_N=524288)
+      INTEGER FFTW_ALLOW_PRUNING
+      PARAMETER (FFTW_ALLOW_PRUNING=1048576)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/fftw3.f03.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/fftw3.f03.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1224 @@
+! Generated automatically.  DO NOT EDIT!
+
+  integer(C_INT), parameter :: FFTW_R2HC = 0
+  integer(C_INT), parameter :: FFTW_HC2R = 1
+  integer(C_INT), parameter :: FFTW_DHT = 2
+  integer(C_INT), parameter :: FFTW_REDFT00 = 3
+  integer(C_INT), parameter :: FFTW_REDFT01 = 4
+  integer(C_INT), parameter :: FFTW_REDFT10 = 5
+  integer(C_INT), parameter :: FFTW_REDFT11 = 6
+  integer(C_INT), parameter :: FFTW_RODFT00 = 7
+  integer(C_INT), parameter :: FFTW_RODFT01 = 8
+  integer(C_INT), parameter :: FFTW_RODFT10 = 9
+  integer(C_INT), parameter :: FFTW_RODFT11 = 10
+  integer(C_INT), parameter :: FFTW_FORWARD = -1
+  integer(C_INT), parameter :: FFTW_BACKWARD = +1
+  integer(C_INT), parameter :: FFTW_MEASURE = 0
+  integer(C_INT), parameter :: FFTW_DESTROY_INPUT = 1
+  integer(C_INT), parameter :: FFTW_UNALIGNED = 2
+  integer(C_INT), parameter :: FFTW_CONSERVE_MEMORY = 4
+  integer(C_INT), parameter :: FFTW_EXHAUSTIVE = 8
+  integer(C_INT), parameter :: FFTW_PRESERVE_INPUT = 16
+  integer(C_INT), parameter :: FFTW_PATIENT = 32
+  integer(C_INT), parameter :: FFTW_ESTIMATE = 64
+  integer(C_INT), parameter :: FFTW_WISDOM_ONLY = 2097152
+  integer(C_INT), parameter :: FFTW_ESTIMATE_PATIENT = 128
+  integer(C_INT), parameter :: FFTW_BELIEVE_PCOST = 256
+  integer(C_INT), parameter :: FFTW_NO_DFT_R2HC = 512
+  integer(C_INT), parameter :: FFTW_NO_NONTHREADED = 1024
+  integer(C_INT), parameter :: FFTW_NO_BUFFERING = 2048
+  integer(C_INT), parameter :: FFTW_NO_INDIRECT_OP = 4096
+  integer(C_INT), parameter :: FFTW_ALLOW_LARGE_GENERIC = 8192
+  integer(C_INT), parameter :: FFTW_NO_RANK_SPLITS = 16384
+  integer(C_INT), parameter :: FFTW_NO_VRANK_SPLITS = 32768
+  integer(C_INT), parameter :: FFTW_NO_VRECURSE = 65536
+  integer(C_INT), parameter :: FFTW_NO_SIMD = 131072
+  integer(C_INT), parameter :: FFTW_NO_SLOW = 262144
+  integer(C_INT), parameter :: FFTW_NO_FIXED_RADIX_LARGE_N = 524288
+  integer(C_INT), parameter :: FFTW_ALLOW_PRUNING = 1048576
+
+  type, bind(C) :: fftw_iodim
+     integer(C_INT) n, is, os
+  end type fftw_iodim
+  type, bind(C) :: fftw_iodim64
+     integer(C_INTPTR_T) n, is, os
+  end type fftw_iodim64
+
+  interface
+    type(C_PTR) function fftw_plan_dft(rank,n,in,out,sign,flags) bind(C, name='fftw_plan_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft
+    
+    type(C_PTR) function fftw_plan_dft_1d(n,in,out,sign,flags) bind(C, name='fftw_plan_dft_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_1d
+    
+    type(C_PTR) function fftw_plan_dft_2d(n0,n1,in,out,sign,flags) bind(C, name='fftw_plan_dft_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_2d
+    
+    type(C_PTR) function fftw_plan_dft_3d(n0,n1,n2,in,out,sign,flags) bind(C, name='fftw_plan_dft_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_3d
+    
+    type(C_PTR) function fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags) &
+                         bind(C, name='fftw_plan_many_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_many_dft
+    
+    type(C_PTR) function fftw_plan_guru_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftw_plan_guru_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_dft
+    
+    type(C_PTR) function fftw_plan_guru_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftw_plan_guru_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_split_dft
+    
+    type(C_PTR) function fftw_plan_guru64_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftw_plan_guru64_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_dft
+    
+    type(C_PTR) function fftw_plan_guru64_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftw_plan_guru64_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_split_dft
+    
+    subroutine fftw_execute_dft(p,in,out) bind(C, name='fftw_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_execute_dft
+    
+    subroutine fftw_execute_split_dft(p,ri,ii,ro,io) bind(C, name='fftw_execute_split_dft')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: ri
+      real(C_DOUBLE), dimension(*), intent(inout) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+    end subroutine fftw_execute_split_dft
+    
+    type(C_PTR) function fftw_plan_many_dft_r2c(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftw_plan_many_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftw_plan_many_dft_r2c
+    
+    type(C_PTR) function fftw_plan_dft_r2c(rank,n,in,out,flags) bind(C, name='fftw_plan_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_r2c
+    
+    type(C_PTR) function fftw_plan_dft_r2c_1d(n,in,out,flags) bind(C, name='fftw_plan_dft_r2c_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_r2c_1d
+    
+    type(C_PTR) function fftw_plan_dft_r2c_2d(n0,n1,in,out,flags) bind(C, name='fftw_plan_dft_r2c_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftw_plan_dft_r2c_3d(n0,n1,n2,in,out,flags) bind(C, name='fftw_plan_dft_r2c_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftw_plan_many_dft_c2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftw_plan_many_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftw_plan_many_dft_c2r
+    
+    type(C_PTR) function fftw_plan_dft_c2r(rank,n,in,out,flags) bind(C, name='fftw_plan_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_c2r
+    
+    type(C_PTR) function fftw_plan_dft_c2r_1d(n,in,out,flags) bind(C, name='fftw_plan_dft_c2r_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_c2r_1d
+    
+    type(C_PTR) function fftw_plan_dft_c2r_2d(n0,n1,in,out,flags) bind(C, name='fftw_plan_dft_c2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftw_plan_dft_c2r_3d(n0,n1,n2,in,out,flags) bind(C, name='fftw_plan_dft_c2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_dft_c2r_3d
+    
+    type(C_PTR) function fftw_plan_guru_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftw_plan_guru_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_dft_r2c
+    
+    type(C_PTR) function fftw_plan_guru_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftw_plan_guru_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_dft_c2r
+    
+    type(C_PTR) function fftw_plan_guru_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftw_plan_guru_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_split_dft_r2c
+    
+    type(C_PTR) function fftw_plan_guru_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftw_plan_guru_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_split_dft_c2r
+    
+    type(C_PTR) function fftw_plan_guru64_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftw_plan_guru64_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_dft_r2c
+    
+    type(C_PTR) function fftw_plan_guru64_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftw_plan_guru64_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_dft_c2r
+    
+    type(C_PTR) function fftw_plan_guru64_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftw_plan_guru64_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_split_dft_r2c
+    
+    type(C_PTR) function fftw_plan_guru64_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftw_plan_guru64_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_split_dft_c2r
+    
+    subroutine fftw_execute_dft_r2c(p,in,out) bind(C, name='fftw_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_execute_dft_r2c
+    
+    subroutine fftw_execute_dft_c2r(p,in,out) bind(C, name='fftw_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_execute_dft_c2r
+    
+    subroutine fftw_execute_split_dft_r2c(p,in,ro,io) bind(C, name='fftw_execute_split_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_DOUBLE), dimension(*), intent(out) :: io
+    end subroutine fftw_execute_split_dft_r2c
+    
+    subroutine fftw_execute_split_dft_c2r(p,ri,ii,out) bind(C, name='fftw_execute_split_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: ri
+      real(C_DOUBLE), dimension(*), intent(inout) :: ii
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_execute_split_dft_c2r
+    
+    type(C_PTR) function fftw_plan_many_r2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,kind,flags) &
+                         bind(C, name='fftw_plan_many_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_plan_many_r2r
+    
+    type(C_PTR) function fftw_plan_r2r(rank,n,in,out,kind,flags) bind(C, name='fftw_plan_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_plan_r2r
+    
+    type(C_PTR) function fftw_plan_r2r_1d(n,in,out,kind,flags) bind(C, name='fftw_plan_r2r_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind
+      integer(C_INT), value :: flags
+    end function fftw_plan_r2r_1d
+    
+    type(C_PTR) function fftw_plan_r2r_2d(n0,n1,in,out,kind0,kind1,flags) bind(C, name='fftw_plan_r2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftw_plan_r2r_2d
+    
+    type(C_PTR) function fftw_plan_r2r_3d(n0,n1,n2,in,out,kind0,kind1,kind2,flags) bind(C, name='fftw_plan_r2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftw_plan_r2r_3d
+    
+    type(C_PTR) function fftw_plan_guru_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftw_plan_guru_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru_r2r
+    
+    type(C_PTR) function fftw_plan_guru64_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftw_plan_guru64_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftw_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftw_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_plan_guru64_r2r
+    
+    subroutine fftw_execute_r2r(p,in,out) bind(C, name='fftw_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_execute_r2r
+    
+    subroutine fftw_destroy_plan(p) bind(C, name='fftw_destroy_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftw_destroy_plan
+    
+    subroutine fftw_forget_wisdom() bind(C, name='fftw_forget_wisdom')
+      import
+    end subroutine fftw_forget_wisdom
+    
+    subroutine fftw_cleanup() bind(C, name='fftw_cleanup')
+      import
+    end subroutine fftw_cleanup
+    
+    subroutine fftw_set_timelimit(t) bind(C, name='fftw_set_timelimit')
+      import
+      real(C_DOUBLE), value :: t
+    end subroutine fftw_set_timelimit
+    
+    subroutine fftw_plan_with_nthreads(nthreads) bind(C, name='fftw_plan_with_nthreads')
+      import
+      integer(C_INT), value :: nthreads
+    end subroutine fftw_plan_with_nthreads
+    
+    integer(C_INT) function fftw_init_threads() bind(C, name='fftw_init_threads')
+      import
+    end function fftw_init_threads
+    
+    subroutine fftw_cleanup_threads() bind(C, name='fftw_cleanup_threads')
+      import
+    end subroutine fftw_cleanup_threads
+    
+    integer(C_INT) function fftw_export_wisdom_to_filename(filename) bind(C, name='fftw_export_wisdom_to_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftw_export_wisdom_to_filename
+    
+    subroutine fftw_export_wisdom_to_file(output_file) bind(C, name='fftw_export_wisdom_to_file')
+      import
+      type(C_PTR), value :: output_file
+    end subroutine fftw_export_wisdom_to_file
+    
+    type(C_PTR) function fftw_export_wisdom_to_string() bind(C, name='fftw_export_wisdom_to_string')
+      import
+    end function fftw_export_wisdom_to_string
+    
+    subroutine fftw_export_wisdom(write_char,data) bind(C, name='fftw_export_wisdom')
+      import
+      type(C_FUNPTR), value :: write_char
+      type(C_PTR), value :: data
+    end subroutine fftw_export_wisdom
+    
+    integer(C_INT) function fftw_import_system_wisdom() bind(C, name='fftw_import_system_wisdom')
+      import
+    end function fftw_import_system_wisdom
+    
+    integer(C_INT) function fftw_import_wisdom_from_filename(filename) bind(C, name='fftw_import_wisdom_from_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftw_import_wisdom_from_filename
+    
+    integer(C_INT) function fftw_import_wisdom_from_file(input_file) bind(C, name='fftw_import_wisdom_from_file')
+      import
+      type(C_PTR), value :: input_file
+    end function fftw_import_wisdom_from_file
+    
+    integer(C_INT) function fftw_import_wisdom_from_string(input_string) bind(C, name='fftw_import_wisdom_from_string')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: input_string
+    end function fftw_import_wisdom_from_string
+    
+    integer(C_INT) function fftw_import_wisdom(read_char,data) bind(C, name='fftw_import_wisdom')
+      import
+      type(C_FUNPTR), value :: read_char
+      type(C_PTR), value :: data
+    end function fftw_import_wisdom
+    
+    subroutine fftw_fprint_plan(p,output_file) bind(C, name='fftw_fprint_plan')
+      import
+      type(C_PTR), value :: p
+      type(C_PTR), value :: output_file
+    end subroutine fftw_fprint_plan
+    
+    subroutine fftw_print_plan(p) bind(C, name='fftw_print_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftw_print_plan
+    
+    type(C_PTR) function fftw_malloc(n) bind(C, name='fftw_malloc')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftw_malloc
+    
+    type(C_PTR) function fftw_alloc_real(n) bind(C, name='fftw_alloc_real')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftw_alloc_real
+    
+    type(C_PTR) function fftw_alloc_complex(n) bind(C, name='fftw_alloc_complex')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftw_alloc_complex
+    
+    subroutine fftw_free(p) bind(C, name='fftw_free')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftw_free
+    
+    subroutine fftw_flops(p,add,mul,fmas) bind(C, name='fftw_flops')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), intent(out) :: add
+      real(C_DOUBLE), intent(out) :: mul
+      real(C_DOUBLE), intent(out) :: fmas
+    end subroutine fftw_flops
+    
+    real(C_DOUBLE) function fftw_estimate_cost(p) bind(C, name='fftw_estimate_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftw_estimate_cost
+    
+    real(C_DOUBLE) function fftw_cost(p) bind(C, name='fftw_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftw_cost
+    
+  end interface
+
+  type, bind(C) :: fftwf_iodim
+     integer(C_INT) n, is, os
+  end type fftwf_iodim
+  type, bind(C) :: fftwf_iodim64
+     integer(C_INTPTR_T) n, is, os
+  end type fftwf_iodim64
+
+  interface
+    type(C_PTR) function fftwf_plan_dft(rank,n,in,out,sign,flags) bind(C, name='fftwf_plan_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft
+    
+    type(C_PTR) function fftwf_plan_dft_1d(n,in,out,sign,flags) bind(C, name='fftwf_plan_dft_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_1d
+    
+    type(C_PTR) function fftwf_plan_dft_2d(n0,n1,in,out,sign,flags) bind(C, name='fftwf_plan_dft_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_2d
+    
+    type(C_PTR) function fftwf_plan_dft_3d(n0,n1,n2,in,out,sign,flags) bind(C, name='fftwf_plan_dft_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_3d
+    
+    type(C_PTR) function fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags) &
+                         bind(C, name='fftwf_plan_many_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_many_dft
+    
+    type(C_PTR) function fftwf_plan_guru_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwf_plan_guru_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_dft
+    
+    type(C_PTR) function fftwf_plan_guru_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwf_plan_guru_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: ri
+      real(C_FLOAT), dimension(*), intent(out) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_split_dft
+    
+    type(C_PTR) function fftwf_plan_guru64_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwf_plan_guru64_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_dft
+    
+    type(C_PTR) function fftwf_plan_guru64_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwf_plan_guru64_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: ri
+      real(C_FLOAT), dimension(*), intent(out) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_split_dft
+    
+    subroutine fftwf_execute_dft(p,in,out) bind(C, name='fftwf_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_execute_dft
+    
+    subroutine fftwf_execute_split_dft(p,ri,ii,ro,io) bind(C, name='fftwf_execute_split_dft')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: ri
+      real(C_FLOAT), dimension(*), intent(inout) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+    end subroutine fftwf_execute_split_dft
+    
+    type(C_PTR) function fftwf_plan_many_dft_r2c(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwf_plan_many_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwf_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_dft_r2c(rank,n,in,out,flags) bind(C, name='fftwf_plan_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_dft_r2c_1d(n,in,out,flags) bind(C, name='fftwf_plan_dft_r2c_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_r2c_1d
+    
+    type(C_PTR) function fftwf_plan_dft_r2c_2d(n0,n1,in,out,flags) bind(C, name='fftwf_plan_dft_r2c_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwf_plan_dft_r2c_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwf_plan_dft_r2c_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwf_plan_many_dft_c2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwf_plan_many_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwf_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_dft_c2r(rank,n,in,out,flags) bind(C, name='fftwf_plan_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_dft_c2r_1d(n,in,out,flags) bind(C, name='fftwf_plan_dft_c2r_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_c2r_1d
+    
+    type(C_PTR) function fftwf_plan_dft_c2r_2d(n0,n1,in,out,flags) bind(C, name='fftwf_plan_dft_c2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwf_plan_dft_c2r_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwf_plan_dft_c2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_dft_c2r_3d
+    
+    type(C_PTR) function fftwf_plan_guru_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwf_plan_guru_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_guru_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwf_plan_guru_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_guru_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwf_plan_guru_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_split_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_guru_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwf_plan_guru_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: ri
+      real(C_FLOAT), dimension(*), intent(out) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_split_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_guru64_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwf_plan_guru64_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_guru64_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwf_plan_guru64_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_guru64_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwf_plan_guru64_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_split_dft_r2c
+    
+    type(C_PTR) function fftwf_plan_guru64_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwf_plan_guru64_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: ri
+      real(C_FLOAT), dimension(*), intent(out) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_split_dft_c2r
+    
+    subroutine fftwf_execute_dft_r2c(p,in,out) bind(C, name='fftwf_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_execute_dft_r2c
+    
+    subroutine fftwf_execute_dft_c2r(p,in,out) bind(C, name='fftwf_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_execute_dft_c2r
+    
+    subroutine fftwf_execute_split_dft_r2c(p,in,ro,io) bind(C, name='fftwf_execute_split_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: ro
+      real(C_FLOAT), dimension(*), intent(out) :: io
+    end subroutine fftwf_execute_split_dft_r2c
+    
+    subroutine fftwf_execute_split_dft_c2r(p,ri,ii,out) bind(C, name='fftwf_execute_split_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: ri
+      real(C_FLOAT), dimension(*), intent(inout) :: ii
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_execute_split_dft_c2r
+    
+    type(C_PTR) function fftwf_plan_many_r2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,kind,flags) &
+                         bind(C, name='fftwf_plan_many_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_plan_many_r2r
+    
+    type(C_PTR) function fftwf_plan_r2r(rank,n,in,out,kind,flags) bind(C, name='fftwf_plan_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_plan_r2r
+    
+    type(C_PTR) function fftwf_plan_r2r_1d(n,in,out,kind,flags) bind(C, name='fftwf_plan_r2r_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_plan_r2r_1d
+    
+    type(C_PTR) function fftwf_plan_r2r_2d(n0,n1,in,out,kind0,kind1,flags) bind(C, name='fftwf_plan_r2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwf_plan_r2r_2d
+    
+    type(C_PTR) function fftwf_plan_r2r_3d(n0,n1,n2,in,out,kind0,kind1,kind2,flags) bind(C, name='fftwf_plan_r2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwf_plan_r2r_3d
+    
+    type(C_PTR) function fftwf_plan_guru_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwf_plan_guru_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru_r2r
+    
+    type(C_PTR) function fftwf_plan_guru64_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwf_plan_guru64_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwf_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_plan_guru64_r2r
+    
+    subroutine fftwf_execute_r2r(p,in,out) bind(C, name='fftwf_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_execute_r2r
+    
+    subroutine fftwf_destroy_plan(p) bind(C, name='fftwf_destroy_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwf_destroy_plan
+    
+    subroutine fftwf_forget_wisdom() bind(C, name='fftwf_forget_wisdom')
+      import
+    end subroutine fftwf_forget_wisdom
+    
+    subroutine fftwf_cleanup() bind(C, name='fftwf_cleanup')
+      import
+    end subroutine fftwf_cleanup
+    
+    subroutine fftwf_set_timelimit(t) bind(C, name='fftwf_set_timelimit')
+      import
+      real(C_DOUBLE), value :: t
+    end subroutine fftwf_set_timelimit
+    
+    subroutine fftwf_plan_with_nthreads(nthreads) bind(C, name='fftwf_plan_with_nthreads')
+      import
+      integer(C_INT), value :: nthreads
+    end subroutine fftwf_plan_with_nthreads
+    
+    integer(C_INT) function fftwf_init_threads() bind(C, name='fftwf_init_threads')
+      import
+    end function fftwf_init_threads
+    
+    subroutine fftwf_cleanup_threads() bind(C, name='fftwf_cleanup_threads')
+      import
+    end subroutine fftwf_cleanup_threads
+    
+    integer(C_INT) function fftwf_export_wisdom_to_filename(filename) bind(C, name='fftwf_export_wisdom_to_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwf_export_wisdom_to_filename
+    
+    subroutine fftwf_export_wisdom_to_file(output_file) bind(C, name='fftwf_export_wisdom_to_file')
+      import
+      type(C_PTR), value :: output_file
+    end subroutine fftwf_export_wisdom_to_file
+    
+    type(C_PTR) function fftwf_export_wisdom_to_string() bind(C, name='fftwf_export_wisdom_to_string')
+      import
+    end function fftwf_export_wisdom_to_string
+    
+    subroutine fftwf_export_wisdom(write_char,data) bind(C, name='fftwf_export_wisdom')
+      import
+      type(C_FUNPTR), value :: write_char
+      type(C_PTR), value :: data
+    end subroutine fftwf_export_wisdom
+    
+    integer(C_INT) function fftwf_import_system_wisdom() bind(C, name='fftwf_import_system_wisdom')
+      import
+    end function fftwf_import_system_wisdom
+    
+    integer(C_INT) function fftwf_import_wisdom_from_filename(filename) bind(C, name='fftwf_import_wisdom_from_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwf_import_wisdom_from_filename
+    
+    integer(C_INT) function fftwf_import_wisdom_from_file(input_file) bind(C, name='fftwf_import_wisdom_from_file')
+      import
+      type(C_PTR), value :: input_file
+    end function fftwf_import_wisdom_from_file
+    
+    integer(C_INT) function fftwf_import_wisdom_from_string(input_string) bind(C, name='fftwf_import_wisdom_from_string')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: input_string
+    end function fftwf_import_wisdom_from_string
+    
+    integer(C_INT) function fftwf_import_wisdom(read_char,data) bind(C, name='fftwf_import_wisdom')
+      import
+      type(C_FUNPTR), value :: read_char
+      type(C_PTR), value :: data
+    end function fftwf_import_wisdom
+    
+    subroutine fftwf_fprint_plan(p,output_file) bind(C, name='fftwf_fprint_plan')
+      import
+      type(C_PTR), value :: p
+      type(C_PTR), value :: output_file
+    end subroutine fftwf_fprint_plan
+    
+    subroutine fftwf_print_plan(p) bind(C, name='fftwf_print_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwf_print_plan
+    
+    type(C_PTR) function fftwf_malloc(n) bind(C, name='fftwf_malloc')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwf_malloc
+    
+    type(C_PTR) function fftwf_alloc_real(n) bind(C, name='fftwf_alloc_real')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwf_alloc_real
+    
+    type(C_PTR) function fftwf_alloc_complex(n) bind(C, name='fftwf_alloc_complex')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwf_alloc_complex
+    
+    subroutine fftwf_free(p) bind(C, name='fftwf_free')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwf_free
+    
+    subroutine fftwf_flops(p,add,mul,fmas) bind(C, name='fftwf_flops')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), intent(out) :: add
+      real(C_DOUBLE), intent(out) :: mul
+      real(C_DOUBLE), intent(out) :: fmas
+    end subroutine fftwf_flops
+    
+    real(C_DOUBLE) function fftwf_estimate_cost(p) bind(C, name='fftwf_estimate_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwf_estimate_cost
+    
+    real(C_DOUBLE) function fftwf_cost(p) bind(C, name='fftwf_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwf_cost
+    
+  end interface
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/fftw3.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/fftw3.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_H
+#define FFTW3_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* If <complex.h> is included, use the C99 complex type.  Otherwise
+   define a type bit-compatible with C99 complex */
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
+#else
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
+#endif
+
+#define FFTW_CONCAT(prefix, name) prefix ## name
+#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
+#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
+#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
+#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
+
+/* IMPORTANT: for Windows compilers, you should add a line
+        #define FFTW_DLL
+   here and in kernel/ifftw.h if you are compiling/using FFTW as a
+   DLL, in order to do the proper importing/exporting, or
+   alternatively compile with -DFFTW_DLL or the equivalent
+   command-line flag.  This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically. */
+#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
+   /* annoying Windows syntax for shared-library declarations */
+#  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
+#    define FFTW_EXTERN extern __declspec(dllexport) 
+#  else /* user is calling FFTW; import symbol */
+#    define FFTW_EXTERN extern __declspec(dllimport) 
+#  endif
+#else
+#  define FFTW_EXTERN extern
+#endif
+
+enum fftw_r2r_kind_do_not_use_me {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+};
+
+struct fftw_iodim_do_not_use_me {
+     int n;                     /* dimension size */
+     int is;			/* input stride */
+     int os;			/* output stride */
+};
+
+#include <stddef.h> /* for ptrdiff_t */
+struct fftw_iodim64_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t is;			/* input stride */
+     ptrdiff_t os;			/* output stride */
+};
+
+typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
+typedef int (*fftw_read_char_func_do_not_use_me)(void *);
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  X: name-mangling macro
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_DEFINE_API(X, R, C)					   \
+									   \
+FFTW_DEFINE_COMPLEX(R, C);						   \
+									   \
+typedef struct X(plan_s) *X(plan);					   \
+									   \
+typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
+typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
+									   \
+typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
+									   \
+typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
+typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
+									   \
+FFTW_EXTERN void X(execute)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
+		    C *in, C *out, int sign, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
+		       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         C *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         C *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         int sign, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
+FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
+                                      R *ro, R *io);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
+                             int howmany,				   \
+                             R *in, const int *inembed,			   \
+                             int istride, int idist,			   \
+                             C *out, const int *onembed,		   \
+                             int ostride, int odist,			   \
+                             unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
+                        R *in, C *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
+			   R *in, C *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   R *in, C *out, unsigned flags);		   \
+									   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
+			     int howmany,				   \
+			     C *in, const int *inembed,			   \
+			     int istride, int idist,			   \
+			     R *out, const int *onembed,		   \
+			     int ostride, int odist,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
+                        C *in, R *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
+			   C *in, R *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   C *in, R *out, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
+FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
+                                          R *in, R *ro, R *io);		   \
+FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
+                                          R *ri, R *ii, R *out);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         R *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         R *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
+                    const X(r2r_kind) *kind, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
+                       X(r2r_kind) kind, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
+                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
+                       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
+                       R *in, R *out, X(r2r_kind) kind0,		   \
+                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
+                       unsigned flags);					   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
+                         int howmany_rank,				   \
+                         const X(iodim) *howmany_dims,			   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
+                         int howmany_rank,				   \
+                         const X(iodim64) *howmany_dims,		   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
+FFTW_EXTERN void X(forget_wisdom)(void);				   \
+FFTW_EXTERN void X(cleanup)(void);					   \
+									   \
+FFTW_EXTERN void X(set_timelimit)(double t);				   \
+									   \
+FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
+FFTW_EXTERN int X(init_threads)(void);					   \
+FFTW_EXTERN void X(cleanup_threads)(void);				   \
+									   \
+FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
+FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
+FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
+FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
+                                  void *data);				   \
+FFTW_EXTERN int X(import_system_wisdom)(void);				   \
+FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
+FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
+FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
+FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
+									   \
+FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
+FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
+									   \
+FFTW_EXTERN void *X(malloc)(size_t n);					   \
+FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
+FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
+FFTW_EXTERN void X(free)(void *p);					   \
+									   \
+FFTW_EXTERN void X(flops)(const X(plan) p,				   \
+                          double *add, double *mul, double *fmas);	   \
+FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
+FFTW_EXTERN double X(cost)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN const char X(version)[];					   \
+FFTW_EXTERN const char X(cc)[];						   \
+FFTW_EXTERN const char X(codelet_optim)[];
+
+
+/* end of FFTW_DEFINE_API macro */
+
+FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
+   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
+ && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
+ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
+#  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+/* note: __float128 is a typedef, which is not supported with the _Complex
+         keyword in gcc, so instead we use this ugly __attribute__ version.
+         However, we can't simply pass the __attribute__ version to
+         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
+         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
+#    undef FFTW_DEFINE_COMPLEX
+#    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
+#  endif
+FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
+#endif
+
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+
+#define FFTW_NO_TIMELIMIT (-1.0)
+
+/* documented flags */
+#define FFTW_MEASURE (0U)
+#define FFTW_DESTROY_INPUT (1U << 0)
+#define FFTW_UNALIGNED (1U << 1)
+#define FFTW_CONSERVE_MEMORY (1U << 2)
+#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
+#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
+#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
+#define FFTW_ESTIMATE (1U << 6)
+#define FFTW_WISDOM_ONLY (1U << 21)
+
+/* undocumented beyond-guru flags */
+#define FFTW_ESTIMATE_PATIENT (1U << 7)
+#define FFTW_BELIEVE_PCOST (1U << 8)
+#define FFTW_NO_DFT_R2HC (1U << 9)
+#define FFTW_NO_NONTHREADED (1U << 10)
+#define FFTW_NO_BUFFERING (1U << 11)
+#define FFTW_NO_INDIRECT_OP (1U << 12)
+#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
+#define FFTW_NO_RANK_SPLITS (1U << 14)
+#define FFTW_NO_VRANK_SPLITS (1U << 15)
+#define FFTW_NO_VRECURSE (1U << 16)
+#define FFTW_NO_SIMD (1U << 17)
+#define FFTW_NO_SLOW (1U << 18)
+#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
+#define FFTW_ALLOW_PRUNING (1U << 20)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_H */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/fftw3l.f03
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/fftw3l.f03	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,595 @@
+! Generated automatically.  DO NOT EDIT!
+
+
+  type, bind(C) :: fftwl_iodim
+     integer(C_INT) n, is, os
+  end type fftwl_iodim
+  type, bind(C) :: fftwl_iodim64
+     integer(C_INTPTR_T) n, is, os
+  end type fftwl_iodim64
+
+  interface
+    type(C_PTR) function fftwl_plan_dft(rank,n,in,out,sign,flags) bind(C, name='fftwl_plan_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft
+    
+    type(C_PTR) function fftwl_plan_dft_1d(n,in,out,sign,flags) bind(C, name='fftwl_plan_dft_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_1d
+    
+    type(C_PTR) function fftwl_plan_dft_2d(n0,n1,in,out,sign,flags) bind(C, name='fftwl_plan_dft_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_2d
+    
+    type(C_PTR) function fftwl_plan_dft_3d(n0,n1,n2,in,out,sign,flags) bind(C, name='fftwl_plan_dft_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_3d
+    
+    type(C_PTR) function fftwl_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags) &
+                         bind(C, name='fftwl_plan_many_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_many_dft
+    
+    type(C_PTR) function fftwl_plan_guru_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwl_plan_guru_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_dft
+    
+    type(C_PTR) function fftwl_plan_guru_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwl_plan_guru_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_split_dft
+    
+    type(C_PTR) function fftwl_plan_guru64_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwl_plan_guru64_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_dft
+    
+    type(C_PTR) function fftwl_plan_guru64_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwl_plan_guru64_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_split_dft
+    
+    subroutine fftwl_execute_dft(p,in,out) bind(C, name='fftwl_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_execute_dft
+    
+    subroutine fftwl_execute_split_dft(p,ri,ii,ro,io) bind(C, name='fftwl_execute_split_dft')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+    end subroutine fftwl_execute_split_dft
+    
+    type(C_PTR) function fftwl_plan_many_dft_r2c(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwl_plan_many_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwl_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_dft_r2c(rank,n,in,out,flags) bind(C, name='fftwl_plan_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_dft_r2c_1d(n,in,out,flags) bind(C, name='fftwl_plan_dft_r2c_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_r2c_1d
+    
+    type(C_PTR) function fftwl_plan_dft_r2c_2d(n0,n1,in,out,flags) bind(C, name='fftwl_plan_dft_r2c_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwl_plan_dft_r2c_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwl_plan_dft_r2c_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwl_plan_many_dft_c2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwl_plan_many_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwl_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_dft_c2r(rank,n,in,out,flags) bind(C, name='fftwl_plan_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_dft_c2r_1d(n,in,out,flags) bind(C, name='fftwl_plan_dft_c2r_1d')
+      import
+      integer(C_INT), value :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_c2r_1d
+    
+    type(C_PTR) function fftwl_plan_dft_c2r_2d(n0,n1,in,out,flags) bind(C, name='fftwl_plan_dft_c2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwl_plan_dft_c2r_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwl_plan_dft_c2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_dft_c2r_3d
+    
+    type(C_PTR) function fftwl_plan_guru_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwl_plan_guru_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_guru_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwl_plan_guru_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_guru_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwl_plan_guru_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_split_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_guru_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwl_plan_guru_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_split_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_guru64_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwl_plan_guru64_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_guru64_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwl_plan_guru64_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_guru64_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwl_plan_guru64_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_split_dft_r2c
+    
+    type(C_PTR) function fftwl_plan_guru64_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwl_plan_guru64_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_split_dft_c2r
+    
+    subroutine fftwl_execute_dft_r2c(p,in,out) bind(C, name='fftwl_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_execute_dft_r2c
+    
+    subroutine fftwl_execute_dft_c2r(p,in,out) bind(C, name='fftwl_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_execute_dft_c2r
+    
+    subroutine fftwl_execute_split_dft_r2c(p,in,ro,io) bind(C, name='fftwl_execute_split_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: ro
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: io
+    end subroutine fftwl_execute_split_dft_r2c
+    
+    subroutine fftwl_execute_split_dft_c2r(p,ri,ii,out) bind(C, name='fftwl_execute_split_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: ri
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: ii
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_execute_split_dft_c2r
+    
+    type(C_PTR) function fftwl_plan_many_r2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,kind,flags) &
+                         bind(C, name='fftwl_plan_many_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_plan_many_r2r
+    
+    type(C_PTR) function fftwl_plan_r2r(rank,n,in,out,kind,flags) bind(C, name='fftwl_plan_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_plan_r2r
+    
+    type(C_PTR) function fftwl_plan_r2r_1d(n,in,out,kind,flags) bind(C, name='fftwl_plan_r2r_1d')
+      import
+      integer(C_INT), value :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_plan_r2r_1d
+    
+    type(C_PTR) function fftwl_plan_r2r_2d(n0,n1,in,out,kind0,kind1,flags) bind(C, name='fftwl_plan_r2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwl_plan_r2r_2d
+    
+    type(C_PTR) function fftwl_plan_r2r_3d(n0,n1,n2,in,out,kind0,kind1,kind2,flags) bind(C, name='fftwl_plan_r2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwl_plan_r2r_3d
+    
+    type(C_PTR) function fftwl_plan_guru_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwl_plan_guru_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru_r2r
+    
+    type(C_PTR) function fftwl_plan_guru64_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwl_plan_guru64_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwl_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_plan_guru64_r2r
+    
+    subroutine fftwl_execute_r2r(p,in,out) bind(C, name='fftwl_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_execute_r2r
+    
+    subroutine fftwl_destroy_plan(p) bind(C, name='fftwl_destroy_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwl_destroy_plan
+    
+    subroutine fftwl_forget_wisdom() bind(C, name='fftwl_forget_wisdom')
+      import
+    end subroutine fftwl_forget_wisdom
+    
+    subroutine fftwl_cleanup() bind(C, name='fftwl_cleanup')
+      import
+    end subroutine fftwl_cleanup
+    
+    subroutine fftwl_set_timelimit(t) bind(C, name='fftwl_set_timelimit')
+      import
+      real(C_DOUBLE), value :: t
+    end subroutine fftwl_set_timelimit
+    
+    subroutine fftwl_plan_with_nthreads(nthreads) bind(C, name='fftwl_plan_with_nthreads')
+      import
+      integer(C_INT), value :: nthreads
+    end subroutine fftwl_plan_with_nthreads
+    
+    integer(C_INT) function fftwl_init_threads() bind(C, name='fftwl_init_threads')
+      import
+    end function fftwl_init_threads
+    
+    subroutine fftwl_cleanup_threads() bind(C, name='fftwl_cleanup_threads')
+      import
+    end subroutine fftwl_cleanup_threads
+    
+    integer(C_INT) function fftwl_export_wisdom_to_filename(filename) bind(C, name='fftwl_export_wisdom_to_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwl_export_wisdom_to_filename
+    
+    subroutine fftwl_export_wisdom_to_file(output_file) bind(C, name='fftwl_export_wisdom_to_file')
+      import
+      type(C_PTR), value :: output_file
+    end subroutine fftwl_export_wisdom_to_file
+    
+    type(C_PTR) function fftwl_export_wisdom_to_string() bind(C, name='fftwl_export_wisdom_to_string')
+      import
+    end function fftwl_export_wisdom_to_string
+    
+    subroutine fftwl_export_wisdom(write_char,data) bind(C, name='fftwl_export_wisdom')
+      import
+      type(C_FUNPTR), value :: write_char
+      type(C_PTR), value :: data
+    end subroutine fftwl_export_wisdom
+    
+    integer(C_INT) function fftwl_import_system_wisdom() bind(C, name='fftwl_import_system_wisdom')
+      import
+    end function fftwl_import_system_wisdom
+    
+    integer(C_INT) function fftwl_import_wisdom_from_filename(filename) bind(C, name='fftwl_import_wisdom_from_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwl_import_wisdom_from_filename
+    
+    integer(C_INT) function fftwl_import_wisdom_from_file(input_file) bind(C, name='fftwl_import_wisdom_from_file')
+      import
+      type(C_PTR), value :: input_file
+    end function fftwl_import_wisdom_from_file
+    
+    integer(C_INT) function fftwl_import_wisdom_from_string(input_string) bind(C, name='fftwl_import_wisdom_from_string')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: input_string
+    end function fftwl_import_wisdom_from_string
+    
+    integer(C_INT) function fftwl_import_wisdom(read_char,data) bind(C, name='fftwl_import_wisdom')
+      import
+      type(C_FUNPTR), value :: read_char
+      type(C_PTR), value :: data
+    end function fftwl_import_wisdom
+    
+    subroutine fftwl_fprint_plan(p,output_file) bind(C, name='fftwl_fprint_plan')
+      import
+      type(C_PTR), value :: p
+      type(C_PTR), value :: output_file
+    end subroutine fftwl_fprint_plan
+    
+    subroutine fftwl_print_plan(p) bind(C, name='fftwl_print_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwl_print_plan
+    
+    type(C_PTR) function fftwl_malloc(n) bind(C, name='fftwl_malloc')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwl_malloc
+    
+    type(C_PTR) function fftwl_alloc_real(n) bind(C, name='fftwl_alloc_real')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwl_alloc_real
+    
+    type(C_PTR) function fftwl_alloc_complex(n) bind(C, name='fftwl_alloc_complex')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwl_alloc_complex
+    
+    subroutine fftwl_free(p) bind(C, name='fftwl_free')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwl_free
+    
+    subroutine fftwl_flops(p,add,mul,fmas) bind(C, name='fftwl_flops')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), intent(out) :: add
+      real(C_DOUBLE), intent(out) :: mul
+      real(C_DOUBLE), intent(out) :: fmas
+    end subroutine fftwl_flops
+    
+    real(C_DOUBLE) function fftwl_estimate_cost(p) bind(C, name='fftwl_estimate_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwl_estimate_cost
+    
+    real(C_DOUBLE) function fftwl_cost(p) bind(C, name='fftwl_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwl_cost
+    
+  end interface
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/fftw3q.f03
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/fftw3q.f03	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,591 @@
+! Generated automatically.  DO NOT EDIT!
+
+
+  type, bind(C) :: fftwq_iodim
+     integer(C_INT) n, is, os
+  end type fftwq_iodim
+  type, bind(C) :: fftwq_iodim64
+     integer(C_INTPTR_T) n, is, os
+  end type fftwq_iodim64
+
+  interface
+    type(C_PTR) function fftwq_plan_dft(rank,n,in,out,sign,flags) bind(C, name='fftwq_plan_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft
+    
+    type(C_PTR) function fftwq_plan_dft_1d(n,in,out,sign,flags) bind(C, name='fftwq_plan_dft_1d')
+      import
+      integer(C_INT), value :: n
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_1d
+    
+    type(C_PTR) function fftwq_plan_dft_2d(n0,n1,in,out,sign,flags) bind(C, name='fftwq_plan_dft_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_2d
+    
+    type(C_PTR) function fftwq_plan_dft_3d(n0,n1,n2,in,out,sign,flags) bind(C, name='fftwq_plan_dft_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_3d
+    
+    type(C_PTR) function fftwq_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags) &
+                         bind(C, name='fftwq_plan_many_dft')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(16), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_many_dft
+    
+    type(C_PTR) function fftwq_plan_guru_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwq_plan_guru_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_dft
+    
+    type(C_PTR) function fftwq_plan_guru_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwq_plan_guru_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: ri
+      real(16), dimension(*), intent(out) :: ii
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_split_dft
+    
+    type(C_PTR) function fftwq_plan_guru64_dft(rank,dims,howmany_rank,howmany_dims,in,out,sign,flags) &
+                         bind(C, name='fftwq_plan_guru64_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_dft
+    
+    type(C_PTR) function fftwq_plan_guru64_split_dft(rank,dims,howmany_rank,howmany_dims,ri,ii,ro,io,flags) &
+                         bind(C, name='fftwq_plan_guru64_split_dft')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: ri
+      real(16), dimension(*), intent(out) :: ii
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_split_dft
+    
+    subroutine fftwq_execute_dft(p,in,out) bind(C, name='fftwq_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(16), dimension(*), intent(inout) :: in
+      complex(16), dimension(*), intent(out) :: out
+    end subroutine fftwq_execute_dft
+    
+    subroutine fftwq_execute_split_dft(p,ri,ii,ro,io) bind(C, name='fftwq_execute_split_dft')
+      import
+      type(C_PTR), value :: p
+      real(16), dimension(*), intent(inout) :: ri
+      real(16), dimension(*), intent(inout) :: ii
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+    end subroutine fftwq_execute_split_dft
+    
+    type(C_PTR) function fftwq_plan_many_dft_r2c(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwq_plan_many_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(16), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwq_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_dft_r2c(rank,n,in,out,flags) bind(C, name='fftwq_plan_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_dft_r2c_1d(n,in,out,flags) bind(C, name='fftwq_plan_dft_r2c_1d')
+      import
+      integer(C_INT), value :: n
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_r2c_1d
+    
+    type(C_PTR) function fftwq_plan_dft_r2c_2d(n0,n1,in,out,flags) bind(C, name='fftwq_plan_dft_r2c_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwq_plan_dft_r2c_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwq_plan_dft_r2c_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwq_plan_many_dft_c2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,flags) &
+                         bind(C, name='fftwq_plan_many_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      complex(16), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_INT), value :: flags
+    end function fftwq_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_dft_c2r(rank,n,in,out,flags) bind(C, name='fftwq_plan_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_dft_c2r_1d(n,in,out,flags) bind(C, name='fftwq_plan_dft_c2r_1d')
+      import
+      integer(C_INT), value :: n
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_c2r_1d
+    
+    type(C_PTR) function fftwq_plan_dft_c2r_2d(n0,n1,in,out,flags) bind(C, name='fftwq_plan_dft_c2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwq_plan_dft_c2r_3d(n0,n1,n2,in,out,flags) bind(C, name='fftwq_plan_dft_c2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_dft_c2r_3d
+    
+    type(C_PTR) function fftwq_plan_guru_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwq_plan_guru_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_guru_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwq_plan_guru_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_guru_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwq_plan_guru_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_split_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_guru_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwq_plan_guru_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: ri
+      real(16), dimension(*), intent(out) :: ii
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_split_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_guru64_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwq_plan_guru64_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      complex(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_guru64_dft_c2r(rank,dims,howmany_rank,howmany_dims,in,out,flags) &
+                         bind(C, name='fftwq_plan_guru64_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      complex(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_guru64_split_dft_r2c(rank,dims,howmany_rank,howmany_dims,in,ro,io,flags) &
+                         bind(C, name='fftwq_plan_guru64_split_dft_r2c')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_split_dft_r2c
+    
+    type(C_PTR) function fftwq_plan_guru64_split_dft_c2r(rank,dims,howmany_rank,howmany_dims,ri,ii,out,flags) &
+                         bind(C, name='fftwq_plan_guru64_split_dft_c2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: ri
+      real(16), dimension(*), intent(out) :: ii
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_split_dft_c2r
+    
+    subroutine fftwq_execute_dft_r2c(p,in,out) bind(C, name='fftwq_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(16), dimension(*), intent(inout) :: in
+      complex(16), dimension(*), intent(out) :: out
+    end subroutine fftwq_execute_dft_r2c
+    
+    subroutine fftwq_execute_dft_c2r(p,in,out) bind(C, name='fftwq_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(16), dimension(*), intent(inout) :: in
+      real(16), dimension(*), intent(out) :: out
+    end subroutine fftwq_execute_dft_c2r
+    
+    subroutine fftwq_execute_split_dft_r2c(p,in,ro,io) bind(C, name='fftwq_execute_split_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(16), dimension(*), intent(inout) :: in
+      real(16), dimension(*), intent(out) :: ro
+      real(16), dimension(*), intent(out) :: io
+    end subroutine fftwq_execute_split_dft_r2c
+    
+    subroutine fftwq_execute_split_dft_c2r(p,ri,ii,out) bind(C, name='fftwq_execute_split_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      real(16), dimension(*), intent(inout) :: ri
+      real(16), dimension(*), intent(inout) :: ii
+      real(16), dimension(*), intent(out) :: out
+    end subroutine fftwq_execute_split_dft_c2r
+    
+    type(C_PTR) function fftwq_plan_many_r2r(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,kind,flags) &
+                         bind(C, name='fftwq_plan_many_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      integer(C_INT), value :: howmany
+      real(16), dimension(*), intent(out) :: in
+      integer(C_INT), dimension(*), intent(in) :: inembed
+      integer(C_INT), value :: istride
+      integer(C_INT), value :: idist
+      real(16), dimension(*), intent(out) :: out
+      integer(C_INT), dimension(*), intent(in) :: onembed
+      integer(C_INT), value :: ostride
+      integer(C_INT), value :: odist
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwq_plan_many_r2r
+    
+    type(C_PTR) function fftwq_plan_r2r(rank,n,in,out,kind,flags) bind(C, name='fftwq_plan_r2r')
+      import
+      integer(C_INT), value :: rank
+      integer(C_INT), dimension(*), intent(in) :: n
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwq_plan_r2r
+    
+    type(C_PTR) function fftwq_plan_r2r_1d(n,in,out,kind,flags) bind(C, name='fftwq_plan_r2r_1d')
+      import
+      integer(C_INT), value :: n
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind
+      integer(C_INT), value :: flags
+    end function fftwq_plan_r2r_1d
+    
+    type(C_PTR) function fftwq_plan_r2r_2d(n0,n1,in,out,kind0,kind1,flags) bind(C, name='fftwq_plan_r2r_2d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwq_plan_r2r_2d
+    
+    type(C_PTR) function fftwq_plan_r2r_3d(n0,n1,n2,in,out,kind0,kind1,kind2,flags) bind(C, name='fftwq_plan_r2r_3d')
+      import
+      integer(C_INT), value :: n0
+      integer(C_INT), value :: n1
+      integer(C_INT), value :: n2
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwq_plan_r2r_3d
+    
+    type(C_PTR) function fftwq_plan_guru_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwq_plan_guru_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru_r2r
+    
+    type(C_PTR) function fftwq_plan_guru64_r2r(rank,dims,howmany_rank,howmany_dims,in,out,kind,flags) &
+                         bind(C, name='fftwq_plan_guru64_r2r')
+      import
+      integer(C_INT), value :: rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: dims
+      integer(C_INT), value :: howmany_rank
+      type(fftwq_iodim64), dimension(*), intent(in) :: howmany_dims
+      real(16), dimension(*), intent(out) :: in
+      real(16), dimension(*), intent(out) :: out
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwq_plan_guru64_r2r
+    
+    subroutine fftwq_execute_r2r(p,in,out) bind(C, name='fftwq_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(16), dimension(*), intent(inout) :: in
+      real(16), dimension(*), intent(out) :: out
+    end subroutine fftwq_execute_r2r
+    
+    subroutine fftwq_destroy_plan(p) bind(C, name='fftwq_destroy_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwq_destroy_plan
+    
+    subroutine fftwq_forget_wisdom() bind(C, name='fftwq_forget_wisdom')
+      import
+    end subroutine fftwq_forget_wisdom
+    
+    subroutine fftwq_cleanup() bind(C, name='fftwq_cleanup')
+      import
+    end subroutine fftwq_cleanup
+    
+    subroutine fftwq_set_timelimit(t) bind(C, name='fftwq_set_timelimit')
+      import
+      real(C_DOUBLE), value :: t
+    end subroutine fftwq_set_timelimit
+    
+    subroutine fftwq_plan_with_nthreads(nthreads) bind(C, name='fftwq_plan_with_nthreads')
+      import
+      integer(C_INT), value :: nthreads
+    end subroutine fftwq_plan_with_nthreads
+    
+    integer(C_INT) function fftwq_init_threads() bind(C, name='fftwq_init_threads')
+      import
+    end function fftwq_init_threads
+    
+    subroutine fftwq_cleanup_threads() bind(C, name='fftwq_cleanup_threads')
+      import
+    end subroutine fftwq_cleanup_threads
+    
+    integer(C_INT) function fftwq_export_wisdom_to_filename(filename) bind(C, name='fftwq_export_wisdom_to_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwq_export_wisdom_to_filename
+    
+    subroutine fftwq_export_wisdom_to_file(output_file) bind(C, name='fftwq_export_wisdom_to_file')
+      import
+      type(C_PTR), value :: output_file
+    end subroutine fftwq_export_wisdom_to_file
+    
+    type(C_PTR) function fftwq_export_wisdom_to_string() bind(C, name='fftwq_export_wisdom_to_string')
+      import
+    end function fftwq_export_wisdom_to_string
+    
+    subroutine fftwq_export_wisdom(write_char,data) bind(C, name='fftwq_export_wisdom')
+      import
+      type(C_FUNPTR), value :: write_char
+      type(C_PTR), value :: data
+    end subroutine fftwq_export_wisdom
+    
+    integer(C_INT) function fftwq_import_system_wisdom() bind(C, name='fftwq_import_system_wisdom')
+      import
+    end function fftwq_import_system_wisdom
+    
+    integer(C_INT) function fftwq_import_wisdom_from_filename(filename) bind(C, name='fftwq_import_wisdom_from_filename')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: filename
+    end function fftwq_import_wisdom_from_filename
+    
+    integer(C_INT) function fftwq_import_wisdom_from_file(input_file) bind(C, name='fftwq_import_wisdom_from_file')
+      import
+      type(C_PTR), value :: input_file
+    end function fftwq_import_wisdom_from_file
+    
+    integer(C_INT) function fftwq_import_wisdom_from_string(input_string) bind(C, name='fftwq_import_wisdom_from_string')
+      import
+      character(C_CHAR), dimension(*), intent(in) :: input_string
+    end function fftwq_import_wisdom_from_string
+    
+    integer(C_INT) function fftwq_import_wisdom(read_char,data) bind(C, name='fftwq_import_wisdom')
+      import
+      type(C_FUNPTR), value :: read_char
+      type(C_PTR), value :: data
+    end function fftwq_import_wisdom
+    
+    subroutine fftwq_fprint_plan(p,output_file) bind(C, name='fftwq_fprint_plan')
+      import
+      type(C_PTR), value :: p
+      type(C_PTR), value :: output_file
+    end subroutine fftwq_fprint_plan
+    
+    subroutine fftwq_print_plan(p) bind(C, name='fftwq_print_plan')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwq_print_plan
+    
+    type(C_PTR) function fftwq_malloc(n) bind(C, name='fftwq_malloc')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwq_malloc
+    
+! Unable to generate Fortran interface for fftwq_alloc_real
+    type(C_PTR) function fftwq_alloc_complex(n) bind(C, name='fftwq_alloc_complex')
+      import
+      integer(C_SIZE_T), value :: n
+    end function fftwq_alloc_complex
+    
+    subroutine fftwq_free(p) bind(C, name='fftwq_free')
+      import
+      type(C_PTR), value :: p
+    end subroutine fftwq_free
+    
+    subroutine fftwq_flops(p,add,mul,fmas) bind(C, name='fftwq_flops')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), intent(out) :: add
+      real(C_DOUBLE), intent(out) :: mul
+      real(C_DOUBLE), intent(out) :: fmas
+    end subroutine fftwq_flops
+    
+    real(C_DOUBLE) function fftwq_estimate_cost(p) bind(C, name='fftwq_estimate_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwq_estimate_cost
+    
+    real(C_DOUBLE) function fftwq_cost(p) bind(C, name='fftwq_cost')
+      import
+      type(C_PTR), value :: p
+    end function fftwq_cost
+    
+  end interface
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/flops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/flops.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+void X(flops)(const X(plan) p, double *add, double *mul, double *fma)
+{
+     planner *plnr = X(the_planner)();
+     opcnt *o = &p->pln->ops;
+     *add = o->add; *mul = o->mul; *fma = o->fma;
+     if (plnr->cost_hook) {
+	  *add = plnr->cost_hook(p->prb, *add, COST_SUM);
+	  *mul = plnr->cost_hook(p->prb, *mul, COST_SUM);
+	  *fma = plnr->cost_hook(p->prb, *fma, COST_SUM);
+     }
+}
+
+double X(estimate_cost)(const X(plan) p)
+{
+     return X(iestimate_cost)(X(the_planner)(), p->pln, p->prb);
+}
+
+double X(cost)(const X(plan) p)
+{
+     return p->pln->pcost;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/forget-wisdom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/forget-wisdom.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+void X(forget_wisdom)(void)
+{
+     planner *plnr = X(the_planner)();
+     plnr->adt->forget(plnr, FORGET_EVERYTHING);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/genf03.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/genf03.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,213 @@
+#!/usr/bin/perl -w
+# Generate Fortran 2003 interfaces from a sequence of C function declarations
+# of the form (one per line):
+#     extern <type> <name>(...args...)
+#     extern <type> <name>(...args...)
+#     ...
+# with no line breaks within a given function.  (It's too much work to
+# write a general parser, since we just have to handle FFTW's header files.)
+
+sub canonicalize_type {
+    my($type);
+    ($type) = @_;
+    $type =~ s/ +/ /g;
+    $type =~ s/^ //;
+    $type =~ s/ $//;
+    $type =~ s/([^\* ])\*/$1 \*/g;
+    return $type;
+}
+
+# C->Fortran map of supported return types
+%return_types = (
+    "int" => "integer(C_INT)",
+    "ptrdiff_t" => "integer(C_INTPTR_T)",
+    "size_t" => "integer(C_SIZE_T)",
+    "double" => "real(C_DOUBLE)",
+    "float" => "real(C_FLOAT)",
+    "long double" => "real(C_LONG_DOUBLE)",
+    "float128__" => "real(16)",
+    "fftw_plan" => "type(C_PTR)",
+    "fftwf_plan" => "type(C_PTR)",
+    "fftwl_plan" => "type(C_PTR)",
+    "fftwq_plan" => "type(C_PTR)",
+    "void *" => "type(C_PTR)",
+    "char *" => "type(C_PTR)",
+    "double *" => "type(C_PTR)",
+    "float *" => "type(C_PTR)",
+    "long double *" => "type(C_PTR)",
+    "float128__ *" => "type(C_PTR)",
+    "fftw_complex *" => "type(C_PTR)",
+    "fftwf_complex *" => "type(C_PTR)",
+    "fftwl_complex *" => "type(C_PTR)",
+    "fftwq_complex *" => "type(C_PTR)",
+    );
+
+# C->Fortran map of supported argument types
+%arg_types = (
+    "int" => "integer(C_INT), value",
+    "unsigned" => "integer(C_INT), value",
+    "size_t" => "integer(C_SIZE_T), value",
+    "ptrdiff_t" => "integer(C_INTPTR_T), value",
+
+    "fftw_r2r_kind" => "integer(C_FFTW_R2R_KIND), value",
+    "fftwf_r2r_kind" => "integer(C_FFTW_R2R_KIND), value",
+    "fftwl_r2r_kind" => "integer(C_FFTW_R2R_KIND), value",
+    "fftwq_r2r_kind" => "integer(C_FFTW_R2R_KIND), value",
+
+    "double" => "real(C_DOUBLE), value",
+    "float" => "real(C_FLOAT), value",
+    "long double" => "real(C_LONG_DOUBLE), value",
+    "__float128" => "real(16), value",
+
+    "fftw_complex" => "complex(C_DOUBLE_COMPLEX), value",
+    "fftwf_complex" => "complex(C_DOUBLE_COMPLEX), value",
+    "fftwl_complex" => "complex(C_LONG_DOUBLE), value",
+    "fftwq_complex" => "complex(16), value",
+
+    "fftw_plan" => "type(C_PTR), value",
+    "fftwf_plan" => "type(C_PTR), value",
+    "fftwl_plan" => "type(C_PTR), value",
+    "fftwq_plan" => "type(C_PTR), value",
+    "const fftw_plan" => "type(C_PTR), value",
+    "const fftwf_plan" => "type(C_PTR), value",
+    "const fftwl_plan" => "type(C_PTR), value",
+    "const fftwq_plan" => "type(C_PTR), value",
+
+    "const int *" => "integer(C_INT), dimension(*), intent(in)",
+    "ptrdiff_t *" => "integer(C_INTPTR_T), intent(out)",
+    "const ptrdiff_t *" => "integer(C_INTPTR_T), dimension(*), intent(in)",
+
+    "const fftw_r2r_kind *" => "integer(C_FFTW_R2R_KIND), dimension(*), intent(in)",
+    "const fftwf_r2r_kind *" => "integer(C_FFTW_R2R_KIND), dimension(*), intent(in)",
+    "const fftwl_r2r_kind *" => "integer(C_FFTW_R2R_KIND), dimension(*), intent(in)",
+    "const fftwq_r2r_kind *" => "integer(C_FFTW_R2R_KIND), dimension(*), intent(in)",
+
+    "double *" => "real(C_DOUBLE), dimension(*), intent(out)",
+    "float *" => "real(C_FLOAT), dimension(*), intent(out)",
+    "long double *" => "real(C_LONG_DOUBLE), dimension(*), intent(out)",
+    "__float128 *" => "real(16), dimension(*), intent(out)",
+
+    "fftw_complex *" => "complex(C_DOUBLE_COMPLEX), dimension(*), intent(out)",
+    "fftwf_complex *" => "complex(C_FLOAT_COMPLEX), dimension(*), intent(out)",
+    "fftwl_complex *" => "complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out)",
+    "fftwq_complex *" => "complex(16), dimension(*), intent(out)",
+
+    "const fftw_iodim *" => "type(fftw_iodim), dimension(*), intent(in)",
+    "const fftwf_iodim *" => "type(fftwf_iodim), dimension(*), intent(in)",
+    "const fftwl_iodim *" => "type(fftwl_iodim), dimension(*), intent(in)",
+    "const fftwq_iodim *" => "type(fftwq_iodim), dimension(*), intent(in)",
+
+    "const fftw_iodim64 *" => "type(fftw_iodim64), dimension(*), intent(in)",
+    "const fftwf_iodim64 *" => "type(fftwf_iodim64), dimension(*), intent(in)",
+    "const fftwl_iodim64 *" => "type(fftwl_iodim64), dimension(*), intent(in)",
+    "const fftwq_iodim64 *" => "type(fftwq_iodim64), dimension(*), intent(in)",
+
+    "void *" => "type(C_PTR), value",
+    "FILE *" => "type(C_PTR), value",
+
+    "const char *" => "character(C_CHAR), dimension(*), intent(in)",
+
+    "fftw_write_char_func" => "type(C_FUNPTR), value",
+    "fftwf_write_char_func" => "type(C_FUNPTR), value",
+    "fftwl_write_char_func" => "type(C_FUNPTR), value",
+    "fftwq_write_char_func" => "type(C_FUNPTR), value",
+    "fftw_read_char_func" => "type(C_FUNPTR), value",
+    "fftwf_read_char_func" => "type(C_FUNPTR), value",
+    "fftwl_read_char_func" => "type(C_FUNPTR), value",
+    "fftwq_read_char_func" => "type(C_FUNPTR), value",
+
+    # Although the MPI standard defines this type as simply "integer",
+    # if we use integer without a 'C_' kind in a bind(C) interface then
+    # gfortran complains.  Instead, since MPI also requires the C type
+    # MPI_Fint to match Fortran integers, we use the size of this type
+    # (extracted by configure and substituted by the Makefile).
+    "MPI_Comm" => "integer(C_MPI_FINT), value"
+    );
+
+while (<>) {
+    next if /^ *$/;
+    if (/^ *extern +([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *\((.*)\) *$/) {
+	$ret = &canonicalize_type($1);
+	$name = $2;
+
+	$args = $3;
+	$args =~ s/^ *void *$//;
+
+	$bad = ($ret ne "void") && !exists($return_types{$ret});	
+	foreach $arg (split(/ *, */, $args)) {
+	    $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+	    $argtype = &canonicalize_type($1);
+	    $bad = 1 if !exists($arg_types{$argtype});
+	}
+	if ($bad) {
+	    print "! Unable to generate Fortran interface for $name\n";
+	    next;
+	}
+
+	# any function taking an MPI_Comm arg needs a C wrapper (grr).
+	if ($args =~ /MPI_Comm/) {
+	    $cname = $name . "_f03";
+	}
+	else {
+	    $cname = $name;
+	}
+
+	# Fortran has a 132-character line-length limit by default (grr)
+	$len = 0;
+
+	print "    "; $len = $len + length("    ");
+	if ($ret eq "void") {
+	    $kind = "subroutine"
+	}
+	else {
+	    print "$return_types{$ret} ";
+	    $len = $len + length("$return_types{$ret} ");
+	    $kind = "function"
+	}
+	print "$kind $name("; $len = $len + length("$kind $name(");
+	$len0 = $len;
+	
+	$argnames = $args;
+	$argnames =~ s/([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) */$2/g;
+	$comma = "";
+	foreach $argname (split(/ *, */, $argnames)) {
+	    if ($len + length("$comma$argname") + 3 > 132) {
+		printf ", &\n%*s", $len0, "";
+		$len = $len0;
+		$comma = "";
+	    }
+	    print "$comma$argname";
+	    $len = $len + length("$comma$argname");
+	    $comma = ",";
+	}
+	print ") "; $len = $len + 2;
+
+	if ($len + length("bind(C, name='$cname')") > 132) {
+	    printf "&\n%*s", $len0 - length("$name("), "";
+	}
+	print "bind(C, name='$cname')\n";
+
+	print "      import\n";
+	foreach $arg (split(/ *, */, $args)) {
+	    $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+	    $argtype = &canonicalize_type($1);
+	    $argname = $2;
+	    $ftype = $arg_types{$argtype};
+
+	    # Various special cases for argument types:
+	    if ($name =~ /_flops$/ && $argtype eq "double *") {
+		$ftype = "real(C_DOUBLE), intent(out)" 
+	    }
+	    if ($name =~ /_execute/ && ($argname eq "ri" ||
+					$argname eq "ii" || 
+					$argname eq "in")) {
+		$ftype =~ s/intent\(out\)/intent(inout)/;
+	    }
+
+	    print "      $ftype :: $argname\n"
+	}
+
+	print "    end $kind $name\n";
+	print "    \n";
+    }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/guru.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/guru.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+#define XGURU(name) X(plan_guru_ ## name)
+#define IODIM X(iodim)
+#define MKTENSOR_IODIMS X(mktensor_iodims)
+#define GURU_KOSHERP X(guru_kosherp)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/guru64.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/guru64.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+#define XGURU(name) X(plan_guru64_ ## name)
+#define IODIM X(iodim64)
+#define MKTENSOR_IODIMS X(mktensor_iodims64)
+#define GURU_KOSHERP X(guru64_kosherp)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/import-system-wisdom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/import-system-wisdom.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+#if defined(FFTW_SINGLE)
+#  define WISDOM_NAME "wisdomf"
+#elif defined(FFTW_LDOUBLE)
+#  define WISDOM_NAME "wisdoml"
+#else
+#  define WISDOM_NAME "wisdom"
+#endif
+
+/* OS-specific configuration-file directory */
+#if defined(__DJGPP__)
+#  define WISDOM_DIR "/dev/env/DJDIR/etc/fftw/"
+#else
+#  define WISDOM_DIR "/etc/fftw/"
+#endif
+
+int X(import_system_wisdom)(void)
+{
+#if defined(__WIN32__) || defined(WIN32) || defined(_WINDOWS)
+     return 0; /* TODO? */
+#else
+
+     FILE *f;
+     f = fopen(WISDOM_DIR WISDOM_NAME, "r");
+     if (f) {
+          int ret = X(import_wisdom_from_file)(f);
+          fclose(f);
+          return ret;
+     } else
+          return 0;
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/import-wisdom-from-file.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/import-wisdom-from-file.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include <stdio.h>
+
+/* getc()/putc() are *unbelievably* slow on linux.  Looks like glibc
+   is grabbing a lock for each call to getc()/putc(), or something
+   like that.  You pay the price for these idiotic posix threads
+   whether you use them or not.
+
+   So, we do our own buffering.  This completely defeats the purpose
+   of having stdio in the first place, of course.
+*/
+  
+#define BUFSZ 256
+
+typedef struct {
+     scanner super;
+     FILE *f;
+     char buf[BUFSZ];
+     char *bufr, *bufw;
+} S;
+
+static int getchr_file(scanner * sc_)
+{
+     S *sc = (S *) sc_;
+
+     if (sc->bufr >= sc->bufw) {
+	  sc->bufr = sc->buf;
+	  sc->bufw = sc->buf + fread(sc->buf, 1, BUFSZ, sc->f);
+	  if (sc->bufr >= sc->bufw)
+	       return EOF;
+     }
+
+     return *(sc->bufr++);
+}
+
+static scanner *mkscanner_file(FILE *f)
+{
+     S *sc = (S *) X(mkscanner)(sizeof(S), getchr_file);
+     sc->f = f;
+     sc->bufr = sc->bufw = sc->buf;
+     return &sc->super;
+}
+
+int X(import_wisdom_from_file)(FILE *input_file)
+{
+     scanner *s = mkscanner_file(input_file);
+     planner *plnr = X(the_planner)();
+     int ret = plnr->adt->imprt(plnr, s);
+     X(scanner_destroy)(s);
+     return ret;
+}
+
+int X(import_wisdom_from_filename)(const char *filename)
+{
+     FILE *f = fopen(filename, "r");
+     int ret;
+     if (!f) return 0; /* error opening file */
+     ret = X(import_wisdom_from_file)(f);
+     if (fclose(f)) ret = 0; /* error closing file */
+     return ret;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/import-wisdom-from-string.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/import-wisdom-from-string.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+typedef struct {
+     scanner super;
+     const char *s;
+} S_str;
+
+static int getchr_str(scanner * sc_)
+{
+     S_str *sc = (S_str *) sc_;
+     if (!*sc->s)
+          return EOF;
+     return *sc->s++;
+}
+
+static scanner *mkscanner_str(const char *s)
+{
+     S_str *sc = (S_str *) X(mkscanner)(sizeof(S_str), getchr_str);
+     sc->s = s;
+     return &sc->super;
+}
+
+int X(import_wisdom_from_string)(const char *input_string)
+{
+     scanner *s = mkscanner_str(input_string);
+     planner *plnr = X(the_planner)();
+     int ret = plnr->adt->imprt(plnr, s);
+     X(scanner_destroy)(s);
+     return ret;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/import-wisdom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/import-wisdom.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+typedef struct {
+     scanner super;
+     int (*read_char)(void *);
+     void *data;
+} S;
+
+static int getchr_generic(scanner * s_)
+{
+     S *s = (S *) s_;
+     return (s->read_char)(s->data);
+}
+
+int X(import_wisdom)(int (*read_char)(void *), void *data)
+{
+     S *s = (S *) X(mkscanner)(sizeof(S), getchr_generic);
+     planner *plnr = X(the_planner)();
+     int ret;
+
+     s->read_char = read_char;
+     s->data = data;
+     ret = plnr->adt->imprt(plnr, (scanner *) s);
+     X(scanner_destroy)((scanner *) s);
+     return ret;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/malloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/malloc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+
+void *X(malloc)(size_t n)
+{
+     return X(kernel_malloc)(n);
+}
+
+void X(free)(void *p)
+{
+     X(kernel_free)(p);
+}
+
+/* The following two routines are mainly for the convenience of
+   the Fortran 2003 API, although C users may find them convienent
+   as well.  The problem is that, although Fortran 2003 has a
+   c_sizeof intrinsic that is equivalent to sizeof, it is broken
+   in some gfortran versions, and in any case is a bit unnatural
+   in a Fortran context.  So we provide routines to allocate real
+   and complex arrays, which are all that are really needed by FFTW. */
+
+R *X(alloc_real)(size_t n)
+{
+     return (R *) X(malloc)(sizeof(R) * n);
+}
+
+C *X(alloc_complex)(size_t n)
+{
+     return (C *) X(malloc)(sizeof(C) * n);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/map-r2r-kind.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/map-r2r-kind.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+rdft_kind *X(map_r2r_kind)(int rank, const X(r2r_kind) * kind)
+{
+     int i;
+     rdft_kind *k;
+
+     A(FINITE_RNK(rank));
+     k = (rdft_kind *) MALLOC(rank * sizeof(rdft_kind), PROBLEMS);
+     for (i = 0; i < rank; ++i) {
+	  rdft_kind m;
+          switch (kind[i]) {
+	      case FFTW_R2HC: m = R2HC; break;
+	      case FFTW_HC2R: m = HC2R; break;
+	      case FFTW_DHT: m = DHT; break;
+	      case FFTW_REDFT00: m = REDFT00; break;
+	      case FFTW_REDFT01: m = REDFT01; break;
+	      case FFTW_REDFT10: m = REDFT10; break;
+	      case FFTW_REDFT11: m = REDFT11; break;
+	      case FFTW_RODFT00: m = RODFT00; break;
+	      case FFTW_RODFT01: m = RODFT01; break;
+	      case FFTW_RODFT10: m = RODFT10; break;
+	      case FFTW_RODFT11: m = RODFT11; break;
+	      default: m = R2HC; A(0);
+          }
+	  k[i] = m;
+     }
+     return k;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mapflags.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mapflags.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include <math.h>
+
+/* a flag operation: x is either a flag, in which case xm == 0, or
+   a mask, in which case xm == x; using this we can compactly code
+   the various bit operations via (flags & x) ^ xm or (flags | x) ^ xm. */
+typedef struct {
+     unsigned x, xm;
+} flagmask;
+
+typedef struct {
+     flagmask flag;
+     flagmask op;
+} flagop;
+
+#define FLAGP(f, msk)(((f) & (msk).x) ^ (msk).xm)
+#define OP(f, msk)(((f) | (msk).x) ^ (msk).xm)
+
+#define YES(x) {x, 0}
+#define NO(x) {x, x}
+#define IMPLIES(predicate, consequence) { predicate, consequence }
+#define EQV(a, b) IMPLIES(YES(a), YES(b)), IMPLIES(NO(a), NO(b))
+#define NEQV(a, b) IMPLIES(YES(a), NO(b)), IMPLIES(NO(a), YES(b))
+
+static void map_flags(unsigned *iflags, unsigned *oflags,
+		      const flagop flagmap[], int nmap)
+{
+     int i;
+     for (i = 0; i < nmap; ++i)
+          if (FLAGP(*iflags, flagmap[i].flag))
+               *oflags = OP(*oflags, flagmap[i].op);
+}
+
+/* encoding of the planner timelimit into a BITS_FOR_TIMELIMIT-bits
+   nonnegative integer, such that we can still view the integer as
+   ``impatience'': higher means *lower* time limit, and 0 is the
+   highest possible value (about 1 year of calendar time) */
+static unsigned timelimit_to_flags(double timelimit)
+{
+     const double tmax = 365 * 24 * 3600;
+     const double tstep = 1.05;
+     const int nsteps = (1 << BITS_FOR_TIMELIMIT);
+     int x;
+     
+     if (timelimit < 0 || timelimit >= tmax)
+	  return 0;
+     if (timelimit <= 1.0e-10)
+	  return nsteps - 1;
+     
+     x = (int) (0.5 + (log(tmax / timelimit) / log(tstep)));
+
+     if (x < 0) x = 0;
+     if (x >= nsteps) x = nsteps - 1;
+     return x;
+}
+
+void X(mapflags)(planner *plnr, unsigned flags)
+{
+     unsigned l, u, t;
+
+     /* map of api flags -> api flags, to implement consistency rules
+        and combination flags */
+     const flagop self_flagmap[] = {
+	  /* in some cases (notably for halfcomplex->real transforms),
+	     DESTROY_INPUT is the default, so we need to support
+	     an inverse flag to disable it.
+
+	     (PRESERVE, DESTROY)   ->   (PRESERVE, DESTROY)
+               (0, 0)                       (1, 0)
+               (0, 1)                       (0, 1)
+               (1, 0)                       (1, 0)
+               (1, 1)                       (1, 0)
+	  */
+	  IMPLIES(YES(FFTW_PRESERVE_INPUT), NO(FFTW_DESTROY_INPUT)),
+	  IMPLIES(NO(FFTW_DESTROY_INPUT), YES(FFTW_PRESERVE_INPUT)),
+
+	  IMPLIES(YES(FFTW_EXHAUSTIVE), YES(FFTW_PATIENT)),
+
+	  IMPLIES(YES(FFTW_ESTIMATE), NO(FFTW_PATIENT)),
+	  IMPLIES(YES(FFTW_ESTIMATE),
+		  YES(FFTW_ESTIMATE_PATIENT 
+		      | FFTW_NO_INDIRECT_OP
+		      | FFTW_ALLOW_PRUNING)),
+
+	  IMPLIES(NO(FFTW_EXHAUSTIVE), 
+		  YES(FFTW_NO_SLOW)),
+
+	  /* a canonical set of fftw2-like impatience flags */
+	  IMPLIES(NO(FFTW_PATIENT),
+		  YES(FFTW_NO_VRECURSE
+		      | FFTW_NO_RANK_SPLITS
+		      | FFTW_NO_VRANK_SPLITS
+		      | FFTW_NO_NONTHREADED
+		      | FFTW_NO_DFT_R2HC
+		      | FFTW_NO_FIXED_RADIX_LARGE_N
+		      | FFTW_BELIEVE_PCOST))
+     };
+
+     /* map of (processed) api flags to internal problem/planner flags */
+     const flagop l_flagmap[] = {
+	  EQV(FFTW_PRESERVE_INPUT, NO_DESTROY_INPUT),
+	  EQV(FFTW_NO_SIMD, NO_SIMD),
+	  EQV(FFTW_CONSERVE_MEMORY, CONSERVE_MEMORY),
+	  EQV(FFTW_NO_BUFFERING, NO_BUFFERING),
+	  NEQV(FFTW_ALLOW_LARGE_GENERIC, NO_LARGE_GENERIC)
+     };
+
+     const flagop u_flagmap[] = {
+	  IMPLIES(YES(FFTW_EXHAUSTIVE), NO(0xFFFFFFFF)),
+	  IMPLIES(NO(FFTW_EXHAUSTIVE), YES(NO_UGLY)),
+
+	  /* the following are undocumented, "beyond-guru" flags that
+	     require some understanding of FFTW internals */
+	  EQV(FFTW_ESTIMATE_PATIENT, ESTIMATE),
+	  EQV(FFTW_ALLOW_PRUNING, ALLOW_PRUNING),
+	  EQV(FFTW_BELIEVE_PCOST, BELIEVE_PCOST),
+	  EQV(FFTW_NO_DFT_R2HC, NO_DFT_R2HC),
+	  EQV(FFTW_NO_NONTHREADED, NO_NONTHREADED),
+	  EQV(FFTW_NO_INDIRECT_OP, NO_INDIRECT_OP),
+	  EQV(FFTW_NO_RANK_SPLITS, NO_RANK_SPLITS),
+	  EQV(FFTW_NO_VRANK_SPLITS, NO_VRANK_SPLITS),
+	  EQV(FFTW_NO_VRECURSE, NO_VRECURSE),
+	  EQV(FFTW_NO_SLOW, NO_SLOW),
+	  EQV(FFTW_NO_FIXED_RADIX_LARGE_N, NO_FIXED_RADIX_LARGE_N)
+     };
+
+     map_flags(&flags, &flags, self_flagmap, NELEM(self_flagmap));
+
+     l = u = 0;
+     map_flags(&flags, &l, l_flagmap, NELEM(l_flagmap));
+     map_flags(&flags, &u, u_flagmap, NELEM(u_flagmap));
+
+     /* enforce l <= u  */
+     PLNR_L(plnr) = l;
+     PLNR_U(plnr) = u | l;
+
+     /* assert that the conversion didn't lose bits */
+     A(PLNR_L(plnr) == l);
+     A(PLNR_U(plnr) == (u | l));
+
+     /* compute flags representation of the timelimit */
+     t = timelimit_to_flags(plnr->timelimit);
+
+     PLNR_TIMELIMIT_IMPATIENCE(plnr) = t;
+     A(PLNR_TIMELIMIT_IMPATIENCE(plnr) == t);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mkprinter-file.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mkprinter-file.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include <stdio.h>
+
+#define BUFSZ 256
+
+typedef struct {
+     printer super;
+     FILE *f;
+     char buf[BUFSZ];
+     char *bufw;
+} P;
+
+static void myflush(P *p)
+{
+     fwrite(p->buf, 1, p->bufw - p->buf, p->f);
+     p->bufw = p->buf;
+}
+
+static void myputchr(printer *p_, char c)
+{
+     P *p = (P *) p_;
+     if (p->bufw >= p->buf + BUFSZ)
+	  myflush(p);
+     *p->bufw++ = c;
+}
+
+static void mycleanup(printer *p_)
+{
+     P *p = (P *) p_;
+     myflush(p);
+}
+
+printer *X(mkprinter_file)(FILE *f)
+{
+     P *p = (P *) X(mkprinter)(sizeof(P), myputchr, mycleanup);
+     p->f = f;
+     p->bufw = p->buf;
+     return &p->super;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mktensor-iodims.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mktensor-iodims.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "mktensor-iodims.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mktensor-iodims.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mktensor-iodims.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+tensor *MKTENSOR_IODIMS(int rank, const IODIM *dims, int is, int os)
+{
+     int i;
+     tensor *x = X(mktensor)(rank);
+
+     if (FINITE_RNK(rank)) {
+          for (i = 0; i < rank; ++i) {
+               x->dims[i].n = dims[i].n;
+               x->dims[i].is = dims[i].is * is;
+               x->dims[i].os = dims[i].os * os;
+          }
+     }
+     return x;
+}
+
+static int iodims_kosherp(int rank, const IODIM *dims, int allow_minfty)
+{
+     int i;
+
+     if (rank < 0) return 0;
+
+     if (allow_minfty) {
+	  if (!FINITE_RNK(rank)) return 1;
+	  for (i = 0; i < rank; ++i)
+	       if (dims[i].n < 0) return 0;
+     } else {
+	  if (!FINITE_RNK(rank)) return 0;
+	  for (i = 0; i < rank; ++i)
+	       if (dims[i].n <= 0) return 0;
+     }
+
+     return 1;
+}
+
+int GURU_KOSHERP(int rank, const IODIM *dims,
+		 int howmany_rank, const IODIM *howmany_dims)
+{
+     return (iodims_kosherp(rank, dims, 0) &&
+	     iodims_kosherp(howmany_rank, howmany_dims, 1));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mktensor-iodims64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mktensor-iodims64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "mktensor-iodims.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/mktensor-rowmajor.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/mktensor-rowmajor.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+tensor *X(mktensor_rowmajor)(int rnk, const int *n,
+			     const int *niphys, const int *nophys,
+			     int is, int os)
+{
+     tensor *x = X(mktensor)(rnk);
+
+     if (FINITE_RNK(rnk) && rnk > 0) {
+          int i;
+
+          A(n && niphys && nophys);
+          x->dims[rnk - 1].is = is;
+          x->dims[rnk - 1].os = os;
+          x->dims[rnk - 1].n = n[rnk - 1];
+          for (i = rnk - 1; i > 0; --i) {
+               x->dims[i - 1].is = x->dims[i].is * niphys[i];
+               x->dims[i - 1].os = x->dims[i].os * nophys[i];
+               x->dims[i - 1].n = n[i - 1];
+          }
+     }
+     return x;
+}
+
+static int rowmajor_kosherp(int rnk, const int *n)
+{
+     int i;
+
+     if (!FINITE_RNK(rnk)) return 0;
+     if (rnk < 0) return 0;
+
+     for (i = 0; i < rnk; ++i)
+	  if (n[i] <= 0) return 0;
+
+     return 1;
+}
+
+int X(many_kosherp)(int rnk, const int *n, int howmany)
+{
+     return (howmany >= 0) && rowmajor_kosherp(rnk, n);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-1d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-1d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, unsigned flags)
+{
+     return X(plan_dft)(1, &n, in, out, sign, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+X(plan) X(plan_dft_2d)(int nx, int ny, C *in, C *out, int sign, unsigned flags)
+{
+     int n[2];
+     n[0] = nx;
+     n[1] = ny;
+     return X(plan_dft)(2, n, in, out, sign, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-3d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-3d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+X(plan) X(plan_dft_3d)(int nx, int ny, int nz,
+		       C *in, C *out, int sign, unsigned flags)
+{
+     int n[3];
+     n[0] = nx;
+     n[1] = ny;
+     n[2] = nz;
+     return X(plan_dft)(3, n, in, out, sign, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-c2r-1d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-c2r-1d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_c2r_1d)(int n, C *in, R *out, unsigned flags)
+{
+     return X(plan_dft_c2r)(1, &n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-c2r-2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-c2r-2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_c2r_2d)(int nx, int ny, C *in, R *out, unsigned flags)
+{
+     int n[2];
+     n[0] = nx;
+     n[1] = ny;
+     return X(plan_dft_c2r)(2, n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-c2r-3d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-c2r-3d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_c2r_3d)(int nx, int ny, int nz,
+			   C *in, R *out, unsigned flags)
+{
+     int n[3];
+     n[0] = nx;
+     n[1] = ny;
+     n[2] = nz;
+     return X(plan_dft_c2r)(3, n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_c2r)(int rank, const int *n, C *in, R *out, unsigned flags)
+{
+     return X(plan_many_dft_c2r)(rank, n, 1,
+				 in, 0, 1, 1, out, 0, 1, 1, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-r2c-1d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-r2c-1d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_r2c_1d)(int n, R *in, C *out, unsigned flags)
+{
+     return X(plan_dft_r2c)(1, &n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-r2c-2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-r2c-2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_r2c_2d)(int nx, int ny, R *in, C *out, unsigned flags)
+{
+     int n[2];
+     n[0] = nx;
+     n[1] = ny;
+     return X(plan_dft_r2c)(2, n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-r2c-3d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-r2c-3d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_r2c_3d)(int nx, int ny, int nz,
+			   R *in, C *out, unsigned flags)
+{
+     int n[3];
+     n[0] = nx;
+     n[1] = ny;
+     n[2] = nz;
+     return X(plan_dft_r2c)(3, n, in, out, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft_r2c)(int rank, const int *n, R *in, C *out, unsigned flags)
+{
+     return X(plan_many_dft_r2c)(rank, n, 1,
+				 in, 0, 1, 1, 
+				 out, 0, 1, 1, 
+				 flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_dft)(int rank, const int *n,
+		    C *in, C *out, int sign, unsigned flags)
+{
+     return X(plan_many_dft)(rank, n, 1,
+			     in, 0, 1, 1, 
+			     out, 0, 1, 1, 
+			     sign, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-dft-c2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft-c2r.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft-c2r.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) XGURU(dft_c2r)(int rank, const IODIM *dims,
+		       int howmany_rank, const IODIM *howmany_dims,
+		       C *in, R *out, unsigned flags)
+{
+     R *ri, *ii;
+
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     EXTRACT_REIM(FFT_SIGN, in, &ri, &ii);
+
+     if (out != ri)
+	  flags |= FFTW_DESTROY_INPUT;
+     return X(mkapiplan)(
+	  0, flags, 
+	  X(mkproblem_rdft2_d_3pointers)(
+	       MKTENSOR_IODIMS(rank, dims, 2, 1),
+	       MKTENSOR_IODIMS(howmany_rank, howmany_dims, 2, 1),
+	       TAINT_UNALIGNED(out, flags),
+	       TAINT_UNALIGNED(ri, flags),
+	       TAINT_UNALIGNED(ii, flags), HC2R));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-dft-r2c.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft-r2c.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft-r2c.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) XGURU(dft_r2c)(int rank, const IODIM *dims,
+		       int howmany_rank,
+		       const IODIM *howmany_dims,
+		       R *in, C *out, unsigned flags)
+{
+     R *ro, *io;
+
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     EXTRACT_REIM(FFT_SIGN, out, &ro, &io);
+
+     return X(mkapiplan)(
+	  0, flags,
+	  X(mkproblem_rdft2_d_3pointers)(
+	       MKTENSOR_IODIMS(rank, dims, 1, 2),
+	       MKTENSOR_IODIMS(howmany_rank, howmany_dims, 1, 2),
+	       TAINT_UNALIGNED(in, flags),
+	       TAINT_UNALIGNED(ro, flags),
+	       TAINT_UNALIGNED(io, flags), R2HC));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-dft.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-dft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-dft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+X(plan) XGURU(dft)(int rank, const IODIM *dims,
+			 int howmany_rank, const IODIM *howmany_dims,
+			 C *in, C *out, int sign, unsigned flags)
+{
+     R *ri, *ii, *ro, *io;
+
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     EXTRACT_REIM(sign, in, &ri, &ii);
+     EXTRACT_REIM(sign, out, &ro, &io);
+
+     return X(mkapiplan)(
+	  sign, flags,
+	  X(mkproblem_dft_d)(MKTENSOR_IODIMS(rank, dims, 2, 2),
+			     MKTENSOR_IODIMS(howmany_rank, howmany_dims,
+						2, 2),
+			     TAINT_UNALIGNED(ri, flags),
+			     TAINT_UNALIGNED(ii, flags), 
+			     TAINT_UNALIGNED(ro, flags),
+			     TAINT_UNALIGNED(io, flags)));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-r2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-r2r.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-r2r.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) XGURU(r2r)(int rank, const IODIM *dims,
+			 int howmany_rank,
+			 const IODIM *howmany_dims,
+			 R *in, R *out,
+			 const X(r2r_kind) * kind, unsigned flags)
+{
+     X(plan) p;
+     rdft_kind *k;
+
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     k = X(map_r2r_kind)(rank, kind);
+     p = X(mkapiplan)(
+	  0, flags,
+	  X(mkproblem_rdft_d)(MKTENSOR_IODIMS(rank, dims, 1, 1),
+			      MKTENSOR_IODIMS(howmany_rank, howmany_dims,
+						 1, 1), 
+			      TAINT_UNALIGNED(in, flags),
+			      TAINT_UNALIGNED(out, flags), k));
+     X(ifree0)(k);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-split-dft-c2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft-c2r.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft-c2r.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) XGURU(split_dft_c2r)(int rank, const IODIM *dims,
+			     int howmany_rank, const IODIM *howmany_dims,
+			     R *ri, R *ii, R *out, unsigned flags)
+{
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     if (out != ri)
+	  flags |= FFTW_DESTROY_INPUT;
+     return X(mkapiplan)(
+	  0, flags, 
+	  X(mkproblem_rdft2_d_3pointers)(
+	       MKTENSOR_IODIMS(rank, dims, 1, 1),
+	       MKTENSOR_IODIMS(howmany_rank, howmany_dims, 1, 1),
+	       TAINT_UNALIGNED(out, flags),
+	       TAINT_UNALIGNED(ri, flags),
+	       TAINT_UNALIGNED(ii, flags), HC2R));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-split-dft-r2c.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft-r2c.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft-r2c.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) XGURU(split_dft_r2c)(int rank, const IODIM *dims,
+			     int howmany_rank,
+			     const IODIM *howmany_dims,
+			     R *in, R *ro, R *io, unsigned flags)
+{
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     return X(mkapiplan)(
+	  0, flags,
+	  X(mkproblem_rdft2_d_3pointers)(
+	       MKTENSOR_IODIMS(rank, dims, 1, 1),
+	       MKTENSOR_IODIMS(howmany_rank, howmany_dims, 1, 1),
+	       TAINT_UNALIGNED(in, flags),
+	       TAINT_UNALIGNED(ro, flags),
+	       TAINT_UNALIGNED(io, flags), R2HC));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru.h"
+#include "plan-guru-split-dft.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru-split-dft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru-split-dft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+X(plan) XGURU(split_dft)(int rank, const IODIM *dims,
+			       int howmany_rank, const IODIM *howmany_dims,
+			       R *ri, R *ii, R *ro, R *io, unsigned flags)
+{
+     if (!GURU_KOSHERP(rank, dims, howmany_rank, howmany_dims)) return 0;
+
+     return X(mkapiplan)(
+	  ii - ri == 1 && io - ro == 1 ? FFT_SIGN : -FFT_SIGN, flags,
+	  X(mkproblem_dft_d)(MKTENSOR_IODIMS(rank, dims, 1, 1),
+			     MKTENSOR_IODIMS(howmany_rank, howmany_dims,
+						1, 1),
+			     TAINT_UNALIGNED(ri, flags),
+			     TAINT_UNALIGNED(ii, flags), 
+			     TAINT_UNALIGNED(ro, flags),
+			     TAINT_UNALIGNED(io, flags)));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-dft-c2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-dft-r2c.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-dft.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-r2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-split-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-split-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-split-dft-c2r.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-split-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-split-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-split-dft-r2c.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-guru64-split-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-guru64-split-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+#include "guru64.h"
+#include "plan-guru-split-dft.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-many-dft-c2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-many-dft-c2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) X(plan_many_dft_c2r)(int rank, const int *n,
+			     int howmany,
+			     C *in, const int *inembed,
+			     int istride, int idist,
+			     R *out, const int *onembed,
+			     int ostride, int odist, unsigned flags)
+{
+     R *ri, *ii;
+     int *nfi, *nfo;
+     int inplace;
+     X(plan) p;
+
+     if (!X(many_kosherp)(rank, n, howmany)) return 0;
+
+     EXTRACT_REIM(FFT_SIGN, in, &ri, &ii);
+     inplace = out == ri;
+
+     if (!inplace)
+	  flags |= FFTW_DESTROY_INPUT;
+     p = X(mkapiplan)(
+	  0, flags,
+	  X(mkproblem_rdft2_d_3pointers)(
+	       X(mktensor_rowmajor)(
+		    rank, n, 
+		    X(rdft2_pad)(rank, n, inembed, inplace, 1, &nfi),
+		    X(rdft2_pad)(rank, n, onembed, inplace, 0, &nfo),
+		    2 * istride, ostride),
+	       X(mktensor_1d)(howmany, 2 * idist, odist),
+	       TAINT_UNALIGNED(out, flags),
+	       TAINT_UNALIGNED(ri, flags), TAINT_UNALIGNED(ii, flags),
+	       HC2R));
+
+     X(ifree0)(nfi);
+     X(ifree0)(nfo);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-many-dft-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-many-dft-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+X(plan) X(plan_many_dft_r2c)(int rank, const int *n,
+			     int howmany,
+			     R *in, const int *inembed,
+			     int istride, int idist,
+			     C *out, const int *onembed,
+			     int ostride, int odist, unsigned flags)
+{
+     R *ro, *io;
+     int *nfi, *nfo;
+     int inplace;
+     X(plan) p;
+
+     if (!X(many_kosherp)(rank, n, howmany)) return 0;
+
+     EXTRACT_REIM(FFT_SIGN, out, &ro, &io);
+     inplace = in == ro;
+
+     p = X(mkapiplan)(
+	  0, flags, 
+	  X(mkproblem_rdft2_d_3pointers)(
+	       X(mktensor_rowmajor)(
+		    rank, n,
+		    X(rdft2_pad)(rank, n, inembed, inplace, 0, &nfi),
+		    X(rdft2_pad)(rank, n, onembed, inplace, 1, &nfo),
+		    istride, 2 * ostride), 
+	       X(mktensor_1d)(howmany, idist, 2 * odist),
+	       TAINT_UNALIGNED(in, flags),
+	       TAINT_UNALIGNED(ro, flags), TAINT_UNALIGNED(io, flags),
+	       R2HC));
+
+     X(ifree0)(nfi);
+     X(ifree0)(nfo);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-many-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-many-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "dft.h"
+
+#define N0(nembed)((nembed) ? (nembed) : n)
+
+X(plan) X(plan_many_dft)(int rank, const int *n,
+			 int howmany,
+			 C *in, const int *inembed,
+			 int istride, int idist,
+			 C *out, const int *onembed,
+			 int ostride, int odist, int sign, unsigned flags)
+{
+     R *ri, *ii, *ro, *io;
+
+     if (!X(many_kosherp)(rank, n, howmany)) return 0;
+
+     EXTRACT_REIM(sign, in, &ri, &ii);
+     EXTRACT_REIM(sign, out, &ro, &io);
+
+     return 
+	  X(mkapiplan)(sign, flags,
+		       X(mkproblem_dft_d)(
+			    X(mktensor_rowmajor)(rank, n, 
+						 N0(inembed), N0(onembed),
+						 2 * istride, 2 * ostride),
+			    X(mktensor_1d)(howmany, 2 * idist, 2 * odist),
+			    TAINT_UNALIGNED(ri, flags),
+			    TAINT_UNALIGNED(ii, flags),
+			    TAINT_UNALIGNED(ro, flags),
+			    TAINT_UNALIGNED(io, flags)));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-many-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-many-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "rdft.h"
+
+#define N0(nembed)((nembed) ? (nembed) : n)
+
+X(plan) X(plan_many_r2r)(int rank, const int *n,
+			 int howmany,
+			 R *in, const int *inembed,
+			 int istride, int idist,
+			 R *out, const int *onembed,
+			 int ostride, int odist,
+			 const X(r2r_kind) * kind, unsigned flags)
+{
+     X(plan) p;
+     rdft_kind *k;
+
+     if (!X(many_kosherp)(rank, n, howmany)) return 0;
+
+     k = X(map_r2r_kind)(rank, kind);
+     p = X(mkapiplan)(
+	  0, flags,
+	  X(mkproblem_rdft_d)(X(mktensor_rowmajor)(rank, n, 
+						   N0(inembed), N0(onembed),
+						   istride, ostride),
+			      X(mktensor_1d)(howmany, idist, odist),
+			      TAINT_UNALIGNED(in, flags), 
+			      TAINT_UNALIGNED(out, flags), k));
+     X(ifree0)(k);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-r2r-1d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-r2r-1d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_r2r_1d)(int n, R *in, R *out, X(r2r_kind) kind, unsigned flags)
+{
+     return X(plan_r2r)(1, &n, in, out, &kind, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-r2r-2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-r2r-2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_r2r_2d)(int nx, int ny, R *in, R *out,
+		       X(r2r_kind) kindx, X(r2r_kind) kindy, unsigned flags)
+{
+     int n[2];
+     X(r2r_kind) kind[2];
+     n[0] = nx;
+     n[1] = ny;
+     kind[0] = kindx;
+     kind[1] = kindy;
+     return X(plan_r2r)(2, n, in, out, kind, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-r2r-3d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-r2r-3d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_r2r_3d)(int nx, int ny, int nz,
+		       R *in, R *out, X(r2r_kind) kindx,
+		       X(r2r_kind) kindy, X(r2r_kind) kindz, unsigned flags)
+{
+     int n[3];
+     X(r2r_kind) kind[3];
+     n[0] = nx;
+     n[1] = ny;
+     n[2] = nz;
+     kind[0] = kindx;
+     kind[1] = kindy;
+     kind[2] = kindz;
+     return X(plan_r2r)(3, n, in, out, kind, flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/plan-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/plan-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,
+		    const X(r2r_kind) * kind, unsigned flags)
+{
+     return X(plan_many_r2r)(rank, n, 1, in, 0, 1, 1, out, 0, 1, 1, kind,
+			     flags);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/print-plan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/print-plan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+void X(fprint_plan)(const X(plan) p, FILE *output_file)
+{
+     printer *pr = X(mkprinter_file)(output_file);
+     plan *pln = p->pln;
+     pln->adt->print(pln, pr);
+     X(printer_destroy)(pr);
+}
+
+void X(print_plan)(const X(plan) p)
+{
+     X(fprint_plan)(p, stdout);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/rdft2-pad.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/rdft2-pad.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <string.h>
+#include "api.h"
+
+const int *X(rdft2_pad)(int rnk, const int *n, const int *nembed,
+			int inplace, int cmplx, int **nfree)
+{
+     A(FINITE_RNK(rnk));
+     *nfree = 0;
+     if (!nembed && rnk > 0) {
+          if (inplace || cmplx) {
+               int *np = (int *) MALLOC(sizeof(int) * rnk, PROBLEMS);
+               memcpy(np, n, sizeof(int) * rnk);
+               np[rnk - 1] = (n[rnk - 1] / 2 + 1) * (1 + !cmplx);
+               nembed = *nfree = np;
+          } else
+               nembed = n;
+     }
+     return nembed;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/the-planner.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/the-planner.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+static planner *plnr = 0;
+
+/* create the planner for the rest of the API */
+planner *X(the_planner)(void)
+{
+     if (!plnr) {
+          plnr = X(mkplanner)();
+          X(configure_planner)(plnr);
+     }
+
+     return plnr;
+}
+
+void X(cleanup)(void)
+{
+     if (plnr) {
+          X(planner_destroy)(plnr);
+          plnr = 0;
+     }
+}
+
+void X(set_timelimit)(double tlim) 
+{
+     /* PLNR is not necessarily initialized when this function is
+	called, so use X(the_planner)() */
+     X(the_planner)()->timelimit = tlim; 
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/version.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/version.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "api.h"
+
+const char X(cc)[] = FFTW_CC;
+
+/* fftw <= 3.2.2 had special compiler flags for codelets, which are
+   not used anymore.  We keep this variable around because it is part
+   of the ABI */
+const char X(codelet_optim)[] = "";
+
+const char X(version)[] = PACKAGE "-" PACKAGE_VERSION
+
+#if HAVE_FMA
+   "-fma"
+#endif
+
+#if HAVE_SSE2
+   "-sse2"
+#endif
+
+#if HAVE_AVX
+   "-avx"
+#endif
+
+#if HAVE_ALTIVEC
+   "-altivec"
+#endif
+
+#if HAVE_NEON
+   "-neon"
+#endif
+
+;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/api/x77.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/api/x77.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Fortran-like (e.g. as in BLAS) type prefixes for F77 interface */
+#if defined(FFTW_SINGLE)
+#  define x77(name) CONCAT(sfftw_, name)
+#  define X77(NAME) CONCAT(SFFTW_, NAME)
+#elif defined(FFTW_LDOUBLE)
+/* FIXME: what is best?  BLAS uses D..._X, apparently.  Ugh. */
+#  define x77(name) CONCAT(lfftw_, name)
+#  define X77(NAME) CONCAT(LFFTW_, NAME)
+#elif defined(FFTW_QUAD)
+#  define x77(name) CONCAT(qfftw_, name)
+#  define X77(NAME) CONCAT(QFFTW_, NAME)
+#else
+#  define x77(name) CONCAT(dfftw_, name)
+#  define X77(NAME) CONCAT(DFFTW_, NAME)
+#endif
+
+/* If F77_FUNC is not defined and the user didn't explicitly specify
+   --disable-fortran, then make our best guess at default wrappers
+   (since F77_FUNC_EQUIV should not be defined in this case, we
+    will use both double-underscored g77 wrappers and single- or
+    non-underscored wrappers).  This saves us from dealing with
+    complaints in the cases where the user failed to specify
+    an F77 compiler or wrapper detection failed for some reason. */
+#if !defined(F77_FUNC) && !defined(DISABLE_FORTRAN)
+#  if (defined(_WIN32) || defined(__WIN32__)) && !defined(WINDOWS_F77_MANGLING)
+#    define WINDOWS_F77_MANGLING 1
+#  endif
+#  if defined(_AIX) || defined(__hpux) || defined(hpux)
+#    define F77_FUNC(a, A) a
+#  elif defined(CRAY) || defined(_CRAY) || defined(_UNICOS)
+#    define F77_FUNC(a, A) A
+#  else
+#    define F77_FUNC(a, A) a ## _
+#  endif
+#  define F77_FUNC_(a, A) a ## __
+#endif
+
+#if defined(WITH_G77_WRAPPERS) && !defined(DISABLE_FORTRAN)
+#  undef F77_FUNC_
+#  define F77_FUNC_(a, A) a ## __
+#  undef F77_FUNC_EQUIV
+#endif
+
+/* annoying Windows syntax for shared-library declarations */
+#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
+#  define FFTW_VOIDFUNC __declspec(dllexport) void
+#else
+#  define FFTW_VOIDFUNC void
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/bootstrap.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/bootstrap.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+#! /bin/sh
+############################################################################
+#
+# NOTE: If you just want to build FFTW, do not use this file.  Just use
+# the ordinary ./configure && make commmands as described in the installation
+# section of the manual.
+#
+# This file is only for users that want to generate their own codelets,
+# as described in the "generating your own code" section of the manual.
+#
+############################################################################
+
+touch ChangeLog
+
+echo "PLEASE IGNORE WARNINGS AND ERRORS"
+
+# paranoia: sometimes autoconf doesn't get things right the first time
+rm -rf autom4te.cache
+autoreconf --verbose --install --symlink --force
+autoreconf --verbose --install --symlink --force
+autoreconf --verbose --install --symlink --force
+
+rm -f config.cache
+
+# --enable-maintainer-mode enables build of genfft and automatic
+# rebuild of codelets whenever genfft changes
+(
+    ./configure --disable-shared --enable-maintainer-mode --enable-threads $*
+)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/compile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/compile	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,343 @@
+#! /bin/sh
+# Wrapper for compilers which do not understand '-c -o'.
+
+scriptversion=2012-03-05.13; # UTC
+
+# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free
+# Software Foundation, Inc.
+# Written by Tom Tromey <tromey@cygnus.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+nl='
+'
+
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent tools from complaining about whitespace usage.
+IFS=" ""	$nl"
+
+file_conv=
+
+# func_file_conv build_file lazy
+# Convert a $build file to $host form and store it in $file
+# Currently only supports Windows hosts. If the determined conversion
+# type is listed in (the comma separated) LAZY, no conversion will
+# take place.
+func_file_conv ()
+{
+  file=$1
+  case $file in
+    / | /[!/]*) # absolute file, and not a UNC file
+      if test -z "$file_conv"; then
+	# lazily determine how to convert abs files
+	case `uname -s` in
+	  MINGW*)
+	    file_conv=mingw
+	    ;;
+	  CYGWIN*)
+	    file_conv=cygwin
+	    ;;
+	  *)
+	    file_conv=wine
+	    ;;
+	esac
+      fi
+      case $file_conv/,$2, in
+	*,$file_conv,*)
+	  ;;
+	mingw/*)
+	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
+	  ;;
+	cygwin/*)
+	  file=`cygpath -m "$file" || echo "$file"`
+	  ;;
+	wine/*)
+	  file=`winepath -w "$file" || echo "$file"`
+	  ;;
+      esac
+      ;;
+  esac
+}
+
+# func_cl_dashL linkdir
+# Make cl look for libraries in LINKDIR
+func_cl_dashL ()
+{
+  func_file_conv "$1"
+  if test -z "$lib_path"; then
+    lib_path=$file
+  else
+    lib_path="$lib_path;$file"
+  fi
+  linker_opts="$linker_opts -LIBPATH:$file"
+}
+
+# func_cl_dashl library
+# Do a library search-path lookup for cl
+func_cl_dashl ()
+{
+  lib=$1
+  found=no
+  save_IFS=$IFS
+  IFS=';'
+  for dir in $lib_path $LIB
+  do
+    IFS=$save_IFS
+    if $shared && test -f "$dir/$lib.dll.lib"; then
+      found=yes
+      lib=$dir/$lib.dll.lib
+      break
+    fi
+    if test -f "$dir/$lib.lib"; then
+      found=yes
+      lib=$dir/$lib.lib
+      break
+    fi
+  done
+  IFS=$save_IFS
+
+  if test "$found" != yes; then
+    lib=$lib.lib
+  fi
+}
+
+# func_cl_wrapper cl arg...
+# Adjust compile command to suit cl
+func_cl_wrapper ()
+{
+  # Assume a capable shell
+  lib_path=
+  shared=:
+  linker_opts=
+  for arg
+  do
+    if test -n "$eat"; then
+      eat=
+    else
+      case $1 in
+	-o)
+	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
+	  eat=1
+	  case $2 in
+	    *.o | *.[oO][bB][jJ])
+	      func_file_conv "$2"
+	      set x "$@" -Fo"$file"
+	      shift
+	      ;;
+	    *)
+	      func_file_conv "$2"
+	      set x "$@" -Fe"$file"
+	      shift
+	      ;;
+	  esac
+	  ;;
+	-I)
+	  eat=1
+	  func_file_conv "$2" mingw
+	  set x "$@" -I"$file"
+	  shift
+	  ;;
+	-I*)
+	  func_file_conv "${1#-I}" mingw
+	  set x "$@" -I"$file"
+	  shift
+	  ;;
+	-l)
+	  eat=1
+	  func_cl_dashl "$2"
+	  set x "$@" "$lib"
+	  shift
+	  ;;
+	-l*)
+	  func_cl_dashl "${1#-l}"
+	  set x "$@" "$lib"
+	  shift
+	  ;;
+	-L)
+	  eat=1
+	  func_cl_dashL "$2"
+	  ;;
+	-L*)
+	  func_cl_dashL "${1#-L}"
+	  ;;
+	-static)
+	  shared=false
+	  ;;
+	-Wl,*)
+	  arg=${1#-Wl,}
+	  save_ifs="$IFS"; IFS=','
+	  for flag in $arg; do
+	    IFS="$save_ifs"
+	    linker_opts="$linker_opts $flag"
+	  done
+	  IFS="$save_ifs"
+	  ;;
+	-Xlinker)
+	  eat=1
+	  linker_opts="$linker_opts $2"
+	  ;;
+	-*)
+	  set x "$@" "$1"
+	  shift
+	  ;;
+	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
+	  func_file_conv "$1"
+	  set x "$@" -Tp"$file"
+	  shift
+	  ;;
+	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
+	  func_file_conv "$1" mingw
+	  set x "$@" "$file"
+	  shift
+	  ;;
+	*)
+	  set x "$@" "$1"
+	  shift
+	  ;;
+      esac
+    fi
+    shift
+  done
+  if test -n "$linker_opts"; then
+    linker_opts="-link$linker_opts"
+  fi
+  exec "$@" $linker_opts
+  exit 1
+}
+
+eat=
+
+case $1 in
+  '')
+     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: compile [--help] [--version] PROGRAM [ARGS]
+
+Wrapper for compilers which do not understand '-c -o'.
+Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
+arguments, and rename the output as expected.
+
+If you are trying to build a whole package this is not the
+right script to run: please start by reading the file 'INSTALL'.
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "compile $scriptversion"
+    exit $?
+    ;;
+  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
+    func_cl_wrapper "$@"      # Doesn't return...
+    ;;
+esac
+
+ofile=
+cfile=
+
+for arg
+do
+  if test -n "$eat"; then
+    eat=
+  else
+    case $1 in
+      -o)
+	# configure might choose to run compile as 'compile cc -o foo foo.c'.
+	# So we strip '-o arg' only if arg is an object.
+	eat=1
+	case $2 in
+	  *.o | *.obj)
+	    ofile=$2
+	    ;;
+	  *)
+	    set x "$@" -o "$2"
+	    shift
+	    ;;
+	esac
+	;;
+      *.c)
+	cfile=$1
+	set x "$@" "$1"
+	shift
+	;;
+      *)
+	set x "$@" "$1"
+	shift
+	;;
+    esac
+  fi
+  shift
+done
+
+if test -z "$ofile" || test -z "$cfile"; then
+  # If no '-o' option was seen then we might have been invoked from a
+  # pattern rule where we don't need one.  That is ok -- this is a
+  # normal compilation that the losing compiler can handle.  If no
+  # '.c' file was seen then we are probably linking.  That is also
+  # ok.
+  exec "$@"
+fi
+
+# Name of file we expect compiler to create.
+cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
+
+# Create the lock directory.
+# Note: use '[/\\:.-]' here to ensure that we don't use the same name
+# that we are using for the .o file.  Also, base the name on the expected
+# object file name, since that is what matters with a parallel build.
+lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
+while true; do
+  if mkdir "$lockdir" >/dev/null 2>&1; then
+    break
+  fi
+  sleep 1
+done
+# FIXME: race condition here if user kills between mkdir and trap.
+trap "rmdir '$lockdir'; exit 1" 1 2 15
+
+# Run the compile.
+"$@"
+ret=$?
+
+if test -f "$cofile"; then
+  test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
+elif test -f "${cofile}bj"; then
+  test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
+fi
+
+rmdir "$lockdir"
+exit $ret
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/config.guess
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/config.guess	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1530 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.
+
+timestamp='2012-02-10'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner.  Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
+#
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    sh5el) machine=sh5le-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep -q __ELF__
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+		os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit ;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+	exit ;;
+    *:ekkoBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+	exit ;;
+    *:SolidBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+	exit ;;
+    macppc:MirBSD:*:*)
+	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    *:MirBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    alpha:OSF1:*:*)
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+	exitcode=$?
+	trap '' 0
+	exit $exitcode ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit ;;
+    21064:Windows_NT:50:3)
+	echo alpha-dec-winnt3.5
+	exit ;;
+    Amiga*:UNIX_System_V:4.0:*)
+	echo m68k-unknown-sysv4
+	exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit ;;
+    *:z/VM:*:*)
+	echo s390-ibm-zvmoe
+	exit ;;
+    *:OS400:*:*)
+	echo powerpc-ibm-os400
+	exit ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	echo arm-acorn-riscix${UNAME_RELEASE}
+	exit ;;
+    arm:riscos:*:*|arm:RISCOS:*:*)
+	echo arm-unknown-riscos
+	exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	echo hppa1.1-hitachi-hiuxmpp
+	exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	if test "`(/bin/universe) 2>/dev/null`" = att ; then
+		echo pyramid-pyramid-sysv3
+	else
+		echo pyramid-pyramid-bsd
+	fi
+	exit ;;
+    NILE*:*:*:dcosx)
+	echo pyramid-pyramid-svr4
+	exit ;;
+    DRS?6000:unix:4.0:6*)
+	echo sparc-icl-nx6
+	exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7; exit ;;
+	esac ;;
+    s390x:SunOS:*:*)
+	echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+	echo i386-pc-auroraux${UNAME_RELEASE}
+	exit ;;
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+	eval $set_cc_for_build
+	SUN_ARCH="i386"
+	# If there is a compiler, see if it is configured for 64-bit objects.
+	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+	# This test works for both compilers.
+	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		grep IS_64BIT_ARCH >/dev/null
+	    then
+		SUN_ARCH="x86_64"
+	    fi
+	fi
+	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:*:*)
+	case "`/usr/bin/arch -k`" in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like `4.1.3-JL'.
+	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+	exit ;;
+    sun3*:SunOS:*:*)
+	echo m68k-sun-sunos${UNAME_RELEASE}
+	exit ;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+	case "`/bin/arch`" in
+	    sun3)
+		echo m68k-sun-sunos${UNAME_RELEASE}
+		;;
+	    sun4)
+		echo sparc-sun-sunos${UNAME_RELEASE}
+		;;
+	esac
+	exit ;;
+    aushp:SunOS:*:*)
+	echo sparc-auspex-sunos${UNAME_RELEASE}
+	exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+	echo m68k-milan-mint${UNAME_RELEASE}
+	exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+	echo m68k-hades-mint${UNAME_RELEASE}
+	exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+	echo m68k-unknown-mint${UNAME_RELEASE}
+	exit ;;
+    m68k:machten:*:*)
+	echo m68k-apple-machten${UNAME_RELEASE}
+	exit ;;
+    powerpc:machten:*:*)
+	echo powerpc-apple-machten${UNAME_RELEASE}
+	exit ;;
+    RISC*:Mach:*:*)
+	echo mips-dec-mach_bsd4.3
+	exit ;;
+    RISC*:ULTRIX:*:*)
+	echo mips-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    VAX*:ULTRIX*:*:*)
+	echo vax-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	echo clipper-intergraph-clix${UNAME_RELEASE}
+	exit ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c &&
+	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`$dummy $dummyarg` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	echo mips-mips-riscos${UNAME_RELEASE}
+	exit ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:Power_UNIX:*:*)
+	echo powerpc-harris-powerunix
+	exit ;;
+    m88k:CX/UX:7*:*)
+	echo m88k-harris-cxux7
+	exit ;;
+    m88k:*:4*:R4*)
+	echo m88k-motorola-sysv4
+	exit ;;
+    m88k:*:3*:R3*)
+	echo m88k-motorola-sysv3
+	exit ;;
+    AViiON:dgux:*:*)
+	# DG/UX returns AViiON for all architectures
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+	then
+	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+	       [ ${TARGET_BINARY_INTERFACE}x = x ]
+	    then
+		echo m88k-dg-dgux${UNAME_RELEASE}
+	    else
+		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
+	fi
+	exit ;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	echo m88k-dolphin-sysv3
+	exit ;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	echo m88k-motorola-sysv3
+	exit ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	echo m88k-tektronix-sysv3
+	exit ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	echo m68k-tektronix-bsd
+	exit ;;
+    *:IRIX*:*:*)
+	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+	exit ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	echo i386-ibm-aix
+	exit ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
+		#include <sys/systemcfg.h>
+
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+		then
+			echo "$SYSTEM_NAME"
+		else
+			echo rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		echo rs6000-ibm-aix3.2.4
+	else
+		echo rs6000-ibm-aix3.2
+	fi
+	exit ;;
+    *:AIX:*:[4567])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:*:*)
+	echo rs6000-ibm-aix
+	exit ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+	echo romp-ibm-bsd4.4
+	exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+	exit ;;                             # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	echo rs6000-bull-bosx
+	exit ;;
+    DPX/2?00:B.O.S.:*:*)
+	echo m68k-bull-sysv3
+	exit ;;
+    9000/[34]??:4.3bsd:1.*:*)
+	echo m68k-hp-bsd
+	exit ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	echo m68k-hp-bsd4.4
+	exit ;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	case "${UNAME_MACHINE}" in
+	    9000/31? )            HP_ARCH=m68000 ;;
+	    9000/[34]?? )         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+		    case "${sc_cpu_version}" in
+		      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+		      532)                      # CPU_PA_RISC2_0
+			case "${sc_kernel_bits}" in
+			  32) HP_ARCH="hppa2.0n" ;;
+			  64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+			esac ;;
+		    esac
+		fi
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^		//' << EOF >$dummy.c
+
+		#define _HPUX_SOURCE
+		#include <stdlib.h>
+		#include <unistd.h>
+
+		int main ()
+		{
+		#if defined(_SC_KERNEL_BITS)
+		    long bits = sysconf(_SC_KERNEL_BITS);
+		#endif
+		    long cpu  = sysconf (_SC_CPU_VERSION);
+
+		    switch (cpu)
+			{
+			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+			case CPU_PA_RISC2_0:
+		#if defined(_SC_KERNEL_BITS)
+			    switch (bits)
+				{
+				case 64: puts ("hppa2.0w"); break;
+				case 32: puts ("hppa2.0n"); break;
+				default: puts ("hppa2.0"); break;
+				} break;
+		#else  /* !defined(_SC_KERNEL_BITS) */
+			    puts ("hppa2.0"); break;
+		#endif
+			default: puts ("hppa1.0"); break;
+			}
+		    exit (0);
+		}
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if [ ${HP_ARCH} = "hppa2.0w" ]
+	then
+	    eval $set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep -q __LP64__
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
+	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+	exit ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit ;;
+    3050*:HI-UX:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	echo unknown-hitachi-hiuxwe2
+	exit ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+	echo hppa1.1-hp-bsd
+	exit ;;
+    9000/8??:4.3bsd:*:*)
+	echo hppa1.0-hp-bsd
+	exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+	echo hppa1.1-hp-osf
+	exit ;;
+    hp8??:OSF1:*:*)
+	echo hppa1.0-hp-osf
+	exit ;;
+    i*86:OSF1:*:*)
+	if [ -x /usr/sbin/sysversion ] ; then
+	    echo ${UNAME_MACHINE}-unknown-osf1mk
+	else
+	    echo ${UNAME_MACHINE}-unknown-osf1
+	fi
+	exit ;;
+    parisc*:Lites*:*:*)
+	echo hppa1.1-hp-lites
+	exit ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	echo c1-convex-bsd
+	exit ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	echo c34-convex-bsd
+	exit ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	echo c38-convex-bsd
+	exit ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	echo c4-convex-bsd
+	exit ;;
+    CRAY*Y-MP:*:*:*)
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*[A-Z]90:*:*:*)
+	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    *:UNICOS/mp:*:*)
+	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    5000:UNIX_System_V:4.*:*)
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+	exit ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:FreeBSD:*:*)
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	case ${UNAME_PROCESSOR} in
+	    amd64)
+		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    *)
+		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	esac
+	exit ;;
+    i*:CYGWIN*:*)
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit ;;
+    *:MINGW*:*)
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit ;;
+    i*:MSYS*:*)
+	echo ${UNAME_MACHINE}-pc-msys
+	exit ;;
+    i*:windows32*:*)
+	# uname -m includes "-pc" on this system.
+	echo ${UNAME_MACHINE}-mingw32
+	exit ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit ;;
+    *:Interix*:*)
+	case ${UNAME_MACHINE} in
+	    x86)
+		echo i586-pc-interix${UNAME_RELEASE}
+		exit ;;
+	    authenticamd | genuineintel | EM64T)
+		echo x86_64-unknown-interix${UNAME_RELEASE}
+		exit ;;
+	    IA64)
+		echo ia64-unknown-interix${UNAME_RELEASE}
+		exit ;;
+	esac ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit ;;
+    8664:Windows_NT:*)
+	echo x86_64-pc-mks
+	exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# UNAME_MACHINE based on the output of uname instead of i386?
+	echo i586-pc-interix
+	exit ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	echo x86_64-unknown-cygwin
+	exit ;;
+    p*:CYGWIN*:*)
+	echo powerpcle-unknown-cygwin
+	exit ;;
+    prep*:SunOS:5.*:*)
+	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    *:GNU:*:*)
+	# the GNU system
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	exit ;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	exit ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit ;;
+    aarch64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    aarch64_be:Linux:*:*)
+	UNAME_MACHINE=aarch64_be
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+	esac
+	objdump --private-headers /bin/sh | grep -q ld.so.1
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
+    arm*:Linux:*:*)
+	eval $set_cc_for_build
+	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_EABI__
+	then
+	    echo ${UNAME_MACHINE}-unknown-linux-gnu
+	else
+	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+		| grep -q __ARM_PCS_VFP
+	    then
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+	    else
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+	    fi
+	fi
+	exit ;;
+    avr32*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    cris:Linux:*:*)
+	echo ${UNAME_MACHINE}-axis-linux-gnu
+	exit ;;
+    crisv32:Linux:*:*)
+	echo ${UNAME_MACHINE}-axis-linux-gnu
+	exit ;;
+    frv:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    hexagon:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    i*86:Linux:*:*)
+	LIBC=gnu
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+	exit ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m32r*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    mips:Linux:*:* | mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef ${UNAME_MACHINE}
+	#undef ${UNAME_MACHINE}el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=${UNAME_MACHINE}el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=${UNAME_MACHINE}
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    or32:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    padre:Linux:*:*)
+	echo sparc-unknown-linux-gnu
+	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
+	esac
+	exit ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
+	exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit ;;
+    sh64*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    tile*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    vax:Linux:*:*)
+	echo ${UNAME_MACHINE}-dec-linux-gnu
+	exit ;;
+    x86_64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    xtensa*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	echo i386-sequent-sysv4
+	exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+	# Unixware is an offshoot of SVR4, but it has its own version
+	# number series starting with 2...
+	# I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+	# Use sysv4.2uw... so that sysv4* matches it.
+	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+	exit ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit ;;
+    i*86:syllable:*:*)
+	echo ${UNAME_MACHINE}-pc-syllable
+	exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+	else
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+	fi
+	exit ;;
+    i*86:*:5:[678]*)
+	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit ;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+	else
+		echo ${UNAME_MACHINE}-pc-sysv32
+	fi
+	exit ;;
+    pc:*:*:*)
+	# Left here for compatibility:
+	# uname -m prints for DJGPP always 'pc', but it prints nothing about
+	# the processor, so we play safe by assuming i586.
+	# Note: whatever this is, it MUST be the same as what config.sub
+	# prints for the "djgpp" host, or else GDB configury will decide that
+	# this is a cross-build.
+	echo i586-pc-msdosdjgpp
+	exit ;;
+    Intel:Mach:3*:*)
+	echo i386-pc-mach3
+	exit ;;
+    paragon:*:*:*)
+	echo i860-intel-osf1
+	exit ;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+	fi
+	exit ;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	echo m68010-convergent-sysv
+	exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit ;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4; exit; } ;;
+    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+	OS_REL='.3'
+	test -r /etc/.relid \
+	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	    && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	echo m68k-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    mc68030:UNIX_System_V:4.*:*)
+	echo m68k-atari-sysv4
+	exit ;;
+    TSUNAMI:LynxOS:2.*:*)
+	echo sparc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    rs6000:LynxOS:2.*:*)
+	echo rs6000-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    SM[BE]S:UNIX_SV:*:*)
+	echo mips-dde-sysv${UNAME_RELEASE}
+	exit ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    RM*:SINIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		echo ${UNAME_MACHINE}-sni-sysv4
+	else
+		echo ns32k-sni-sysv
+	fi
+	exit ;;
+    PENTIUM:*:4.0*:*)	# Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+			# says <Richard.M.Bartel@ccMail.Census.GOV>
+	echo i586-unisys-sysv4
+	exit ;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	echo hppa1.1-stratus-sysv4
+	exit ;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	echo i860-stratus-sysv4
+	exit ;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo ${UNAME_MACHINE}-stratus-vos
+	exit ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit ;;
+    mc68*:A/UX:*:*)
+	echo m68k-apple-aux${UNAME_RELEASE}
+	exit ;;
+    news*:NEWS-OS:6*:*)
+	echo mips-sony-newsos6
+	exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if [ -d /usr/nec ]; then
+		echo mips-nec-sysv${UNAME_RELEASE}
+	else
+		echo mips-unknown-sysv${UNAME_RELEASE}
+	fi
+	exit ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit ;;
+    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
+	echo i586-pc-haiku
+	exit ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-7:SUPER-UX:*:*)
+	echo sx7-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8:SUPER-UX:*:*)
+	echo sx8-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8R:SUPER-UX:*:*)
+	echo sx8r-nec-superux${UNAME_RELEASE}
+	exit ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+	case $UNAME_PROCESSOR in
+	    i386)
+		eval $set_cc_for_build
+		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		      grep IS_64BIT_ARCH >/dev/null
+		  then
+		      UNAME_PROCESSOR="x86_64"
+		  fi
+		fi ;;
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = "x86"; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit ;;
+    NEO-?:NONSTOP_KERNEL:*:*)
+	echo neo-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSE-?:NONSTOP_KERNEL:*:*)
+	echo nse-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "$cputype" = "386"; then
+	    UNAME_MACHINE=i386
+	else
+	    UNAME_MACHINE="$cputype"
+	fi
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit ;;
+    SEI:*:*:SEIUX)
+	echo mips-sei-seiux${UNAME_RELEASE}
+	exit ;;
+    *:DragonFly:*:*)
+	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	exit ;;
+    *:*VMS:*:*)
+	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case "${UNAME_MACHINE}" in
+	    A*) echo alpha-dec-vms ; exit ;;
+	    I*) echo ia64-dec-vms ; exit ;;
+	    V*) echo vax-dec-vms ; exit ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	echo i386-pc-xenix
+	exit ;;
+    i*86:skyos:*:*)
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+	exit ;;
+    i*86:rdos:*:*)
+	echo ${UNAME_MACHINE}-pc-rdos
+	exit ;;
+    i*86:AROS:*:*)
+	echo ${UNAME_MACHINE}-pc-aros
+	exit ;;
+    x86_64:VMkernel:*:*)
+	echo ${UNAME_MACHINE}-unknown-esx
+	exit ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+	"4"
+#else
+	""
+#endif
+	); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac
+fi
+
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+and
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/config.h.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/config.h.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,398 @@
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Define to compile in long-double precision. */
+#undef BENCHFFT_LDOUBLE
+
+/* Define to compile in quad precision. */
+#undef BENCHFFT_QUAD
+
+/* Define to compile in single precision. */
+#undef BENCHFFT_SINGLE
+
+/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+   systems. This function is required for `alloca.c' support on those systems.
+   */
+#undef CRAY_STACKSEG_END
+
+/* Define to 1 if using `alloca.c'. */
+#undef C_ALLOCA
+
+/* Define to disable Fortran wrappers. */
+#undef DISABLE_FORTRAN
+
+/* Define to dummy `main' function (if any) required to link to the Fortran
+   libraries. */
+#undef F77_DUMMY_MAIN
+
+/* Define to a macro mangling the given C identifier (in lower and upper
+   case), which must not contain underscores, for linking with Fortran. */
+#undef F77_FUNC
+
+/* As F77_FUNC, but for C identifiers containing underscores. */
+#undef F77_FUNC_
+
+/* Define if F77_FUNC and F77_FUNC_ are equivalent. */
+#undef F77_FUNC_EQUIV
+
+/* Define if F77 and FC dummy `main' functions are identical. */
+#undef FC_DUMMY_MAIN_EQ_F77
+
+/* C compiler name and flags */
+#undef FFTW_CC
+
+/* Define to enable extra FFTW debugging code. */
+#undef FFTW_DEBUG
+
+/* Define to enable alignment debugging hacks. */
+#undef FFTW_DEBUG_ALIGNMENT
+
+/* Define to enable debugging malloc. */
+#undef FFTW_DEBUG_MALLOC
+
+/* Define to enable the use of alloca(). */
+#undef FFTW_ENABLE_ALLOCA
+
+/* Define to compile in long-double precision. */
+#undef FFTW_LDOUBLE
+
+/* Define to compile in quad precision. */
+#undef FFTW_QUAD
+
+/* Define to enable pseudorandom estimate planning for debugging. */
+#undef FFTW_RANDOM_ESTIMATOR
+
+/* Define to compile in single precision. */
+#undef FFTW_SINGLE
+
+/* Define to 1 if you have the `abort' function. */
+#undef HAVE_ABORT
+
+/* Define to 1 if you have `alloca', as a function or macro. */
+#undef HAVE_ALLOCA
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+   */
+#undef HAVE_ALLOCA_H
+
+/* Define to enable Altivec optimizations. */
+#undef HAVE_ALTIVEC
+
+/* Define to 1 if you have the <altivec.h> header file. */
+#undef HAVE_ALTIVEC_H
+
+/* Define to enable AVX optimizations. */
+#undef HAVE_AVX
+
+/* Define to 1 if you have the `BSDgettimeofday' function. */
+#undef HAVE_BSDGETTIMEOFDAY
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#undef HAVE_CLOCK_GETTIME
+
+/* Define to 1 if you have the `cosl' function. */
+#undef HAVE_COSL
+
+/* Define to 1 if you have the <c_asm.h> header file. */
+#undef HAVE_C_ASM_H
+
+/* Define to 1 if you have the declaration of `cosl', and to 0 if you don't.
+   */
+#undef HAVE_DECL_COSL
+
+/* Define to 1 if you have the declaration of `cosq', and to 0 if you don't.
+   */
+#undef HAVE_DECL_COSQ
+
+/* Define to 1 if you have the declaration of `drand48', and to 0 if you
+   don't. */
+#undef HAVE_DECL_DRAND48
+
+/* Define to 1 if you have the declaration of `memalign', and to 0 if you
+   don't. */
+#undef HAVE_DECL_MEMALIGN
+
+/* Define to 1 if you have the declaration of `posix_memalign', and to 0 if
+   you don't. */
+#undef HAVE_DECL_POSIX_MEMALIGN
+
+/* Define to 1 if you have the declaration of `sinl', and to 0 if you don't.
+   */
+#undef HAVE_DECL_SINL
+
+/* Define to 1 if you have the declaration of `sinq', and to 0 if you don't.
+   */
+#undef HAVE_DECL_SINQ
+
+/* Define to 1 if you have the declaration of `srand48', and to 0 if you
+   don't. */
+#undef HAVE_DECL_SRAND48
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */
+#undef HAVE_DOPRNT
+
+/* Define to 1 if you have the `drand48' function. */
+#undef HAVE_DRAND48
+
+/* Define if you have a machine with fused multiply-add */
+#undef HAVE_FMA
+
+/* Define to 1 if you have the `gethrtime' function. */
+#undef HAVE_GETHRTIME
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#undef HAVE_GETTIMEOFDAY
+
+/* Define to 1 if hrtime_t is defined in <sys/time.h> */
+#undef HAVE_HRTIME_T
+
+/* Define to 1 if you have the <intrinsics.h> header file. */
+#undef HAVE_INTRINSICS_H
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define if the isnan() function/macro is available. */
+#undef HAVE_ISNAN
+
+/* Define to 1 if you have the <libintl.h> header file. */
+#undef HAVE_LIBINTL_H
+
+/* Define to 1 if you have the `m' library (-lm). */
+#undef HAVE_LIBM
+
+/* Define to 1 if you have the `quadmath' library (-lquadmath). */
+#undef HAVE_LIBQUADMATH
+
+/* Define to 1 if you have the <limits.h> header file. */
+#undef HAVE_LIMITS_H
+
+/* Define to 1 if the compiler supports `long double' */
+#undef HAVE_LONG_DOUBLE
+
+/* Define to 1 if you have the `mach_absolute_time' function. */
+#undef HAVE_MACH_ABSOLUTE_TIME
+
+/* Define to 1 if you have the <mach/mach_time.h> header file. */
+#undef HAVE_MACH_MACH_TIME_H
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#undef HAVE_MALLOC_H
+
+/* Define to 1 if you have the `memalign' function. */
+#undef HAVE_MEMALIGN
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have the `memset' function. */
+#undef HAVE_MEMSET
+
+/* Define to enable use of MIPS ZBus cycle-counter. */
+#undef HAVE_MIPS_ZBUS_TIMER
+
+/* Define if you have the MPI library. */
+#undef HAVE_MPI
+
+/* Define to enable ARM NEON optimizations. */
+#undef HAVE_NEON
+
+/* Define if OpenMP is enabled */
+#undef HAVE_OPENMP
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#undef HAVE_POSIX_MEMALIGN
+
+/* Define if you have POSIX threads libraries and header files. */
+#undef HAVE_PTHREAD
+
+/* Define to 1 if you have the `read_real_time' function. */
+#undef HAVE_READ_REAL_TIME
+
+/* Define to 1 if you have the `sinl' function. */
+#undef HAVE_SINL
+
+/* Define to 1 if you have the `snprintf' function. */
+#undef HAVE_SNPRINTF
+
+/* Define to 1 if you have the `sqrt' function. */
+#undef HAVE_SQRT
+
+/* Define to enable SSE/SSE2 optimizations. */
+#undef HAVE_SSE2
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the `sysctl' function. */
+#undef HAVE_SYSCTL
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#undef HAVE_SYS_SYSCTL_H
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#undef HAVE_SYS_TIME_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define to 1 if you have the `tanl' function. */
+#undef HAVE_TANL
+
+/* Define if we have a threads library. */
+#undef HAVE_THREADS
+
+/* Define to 1 if you have the `time_base_to_time' function. */
+#undef HAVE_TIME_BASE_TO_TIME
+
+/* Define to 1 if the system has the type `uintptr_t'. */
+#undef HAVE_UINTPTR_T
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to 1 if you have the `vprintf' function. */
+#undef HAVE_VPRINTF
+
+/* Define to 1 if you have the `_mm_free' function. */
+#undef HAVE__MM_FREE
+
+/* Define to 1 if you have the `_mm_malloc' function. */
+#undef HAVE__MM_MALLOC
+
+/* Define if you have the UNICOS _rtc() intrinsic. */
+#undef HAVE__RTC
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#undef LT_OBJDIR
+
+/* Define to 1 if your C compiler doesn't accept -c and -o together. */
+#undef NO_MINUS_C_MINUS_O
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+#undef PTHREAD_CREATE_JOINABLE
+
+/* The size of `double', as computed by sizeof. */
+#undef SIZEOF_DOUBLE
+
+/* The size of `fftw_r2r_kind', as computed by sizeof. */
+#undef SIZEOF_FFTW_R2R_KIND
+
+/* The size of `float', as computed by sizeof. */
+#undef SIZEOF_FLOAT
+
+/* The size of `int', as computed by sizeof. */
+#undef SIZEOF_INT
+
+/* The size of `long', as computed by sizeof. */
+#undef SIZEOF_LONG
+
+/* The size of `long long', as computed by sizeof. */
+#undef SIZEOF_LONG_LONG
+
+/* The size of `MPI_Fint', as computed by sizeof. */
+#undef SIZEOF_MPI_FINT
+
+/* The size of `ptrdiff_t', as computed by sizeof. */
+#undef SIZEOF_PTRDIFF_T
+
+/* The size of `size_t', as computed by sizeof. */
+#undef SIZEOF_SIZE_T
+
+/* The size of `unsigned int', as computed by sizeof. */
+#undef SIZEOF_UNSIGNED_INT
+
+/* The size of `unsigned long', as computed by sizeof. */
+#undef SIZEOF_UNSIGNED_LONG
+
+/* The size of `unsigned long long', as computed by sizeof. */
+#undef SIZEOF_UNSIGNED_LONG_LONG
+
+/* The size of `void *', as computed by sizeof. */
+#undef SIZEOF_VOID_P
+
+/* If using the C implementation of alloca, define if you know the
+   direction of stack growth for your system; otherwise it will be
+   automatically deduced at runtime.
+	STACK_DIRECTION > 0 => grows toward higher addresses
+	STACK_DIRECTION < 0 => grows toward lower addresses
+	STACK_DIRECTION = 0 => direction of growth unknown */
+#undef STACK_DIRECTION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#undef TIME_WITH_SYS_TIME
+
+/* Define if we have and are using POSIX threads. */
+#undef USING_POSIX_THREADS
+
+/* Version number of package */
+#undef VERSION
+
+/* Use common Windows Fortran mangling styles for the Fortran interfaces. */
+#undef WINDOWS_F77_MANGLING
+
+/* Include g77-compatible wrappers in addition to any other Fortran wrappers.
+   */
+#undef WITH_G77_WRAPPERS
+
+/* Use our own aligned malloc routine; mainly helpful for Windows systems
+   lacking aligned allocation system-library routines. */
+#undef WITH_OUR_MALLOC
+
+/* Use low-precision timers, making planner very slow */
+#undef WITH_SLOW_TIMER
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/config.sub
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/config.sub	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1782 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.
+
+timestamp='2012-04-18'
+
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted GNU ChangeLog entry.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help"
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types.
+       echo $1
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+case $# in
+ 0) echo "$me: missing argument$help" >&2
+    exit 1;;
+ 1) ;;
+ *) echo "$me: too many arguments$help" >&2
+    exit 1;;
+esac
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
+  linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
+  knetbsd*-gnu* | netbsd*-gnu* | \
+  kopensolaris*-gnu* | \
+  storm-chaos* | os2-emx* | rtmk-nova*)
+    os=-$maybe_os
+    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+    ;;
+  android-linux)
+    os=-linux-android
+    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown
+    ;;
+  *)
+    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
+    if [ $basic_machine != $1 ]
+    then os=`echo $1 | sed 's/.*-/-/'`
+    else os=; fi
+    ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work.  We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+	-sun*os*)
+		# Prevent following clause from handling this invalid input.
+		;;
+	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+	-apple | -axis | -knuth | -cray | -microblaze)
+		os=
+		basic_machine=$1
+		;;
+	-bluegene*)
+		os=-cnk
+		;;
+	-sim | -cisco | -oki | -wec | -winbond)
+		os=
+		basic_machine=$1
+		;;
+	-scout)
+		;;
+	-wrs)
+		os=-vxworks
+		basic_machine=$1
+		;;
+	-chorusos*)
+		os=-chorusos
+		basic_machine=$1
+		;;
+	-chorusrdb)
+		os=-chorusrdb
+		basic_machine=$1
+		;;
+	-hiux*)
+		os=-hiuxwe2
+		;;
+	-sco6)
+		os=-sco5v6
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco5)
+		os=-sco3.2v5
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco4)
+		os=-sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2.[4-9]*)
+		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2v[4-9]*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco5v6*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco*)
+		os=-sco3.2v2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-udk*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-isc)
+		os=-isc2.2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-clix*)
+		basic_machine=clipper-intergraph
+		;;
+	-isc*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-lynx*178)
+		os=-lynxos178
+		;;
+	-lynx*5)
+		os=-lynxos5
+		;;
+	-lynx*)
+		os=-lynxos
+		;;
+	-ptx*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+		;;
+	-windowsnt*)
+		os=`echo $os | sed -e 's/windowsnt/winnt/'`
+		;;
+	-psos*)
+		os=-psos
+		;;
+	-mint | -mint[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+	# Recognize the basic CPU types without company name.
+	# Some are omitted here because they have special meanings below.
+	1750a | 580 \
+	| a29k \
+	| aarch64 | aarch64_be \
+	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
+	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
+	| am33_2.0 \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+        | be32 | be64 \
+	| bfin \
+	| c4x | clipper \
+	| d10v | d30v | dlx | dsp16xx \
+	| epiphany \
+	| fido | fr30 | frv \
+	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+	| hexagon \
+	| i370 | i860 | i960 | ia64 \
+	| ip2k | iq2000 \
+	| le32 | le64 \
+	| lm32 \
+	| m32c | m32r | m32rle | m68000 | m68k | m88k \
+	| maxq | mb | microblaze | mcore | mep | metag \
+	| mips | mipsbe | mipseb | mipsel | mipsle \
+	| mips16 \
+	| mips64 | mips64el \
+	| mips64octeon | mips64octeonel \
+	| mips64orion | mips64orionel \
+	| mips64r5900 | mips64r5900el \
+	| mips64vr | mips64vrel \
+	| mips64vr4100 | mips64vr4100el \
+	| mips64vr4300 | mips64vr4300el \
+	| mips64vr5000 | mips64vr5000el \
+	| mips64vr5900 | mips64vr5900el \
+	| mipsisa32 | mipsisa32el \
+	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa64 | mipsisa64el \
+	| mipsisa64r2 | mipsisa64r2el \
+	| mipsisa64sb1 | mipsisa64sb1el \
+	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipstx39 | mipstx39el \
+	| mn10200 | mn10300 \
+	| moxie \
+	| mt \
+	| msp430 \
+	| nds32 | nds32le | nds32be \
+	| nios | nios2 \
+	| ns16k | ns32k \
+	| open8 \
+	| or32 \
+	| pdp10 | pdp11 | pj | pjl \
+	| powerpc | powerpc64 | powerpc64le | powerpcle \
+	| pyramid \
+	| rl78 | rx \
+	| score \
+	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+	| sh64 | sh64le \
+	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
+	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
+	| spu \
+	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
+	| ubicom32 \
+	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
+	| we32k \
+	| x86 | xc16x | xstormy16 | xtensa \
+	| z8k | z80)
+		basic_machine=$basic_machine-unknown
+		;;
+	c54x)
+		basic_machine=tic54x-unknown
+		;;
+	c55x)
+		basic_machine=tic55x-unknown
+		;;
+	c6x)
+		basic_machine=tic6x-unknown
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+		;;
+	ms1)
+		basic_machine=mt-unknown
+		;;
+
+	strongarm | thumb | xscale)
+		basic_machine=arm-unknown
+		;;
+	xgate)
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	xscaleeb)
+		basic_machine=armeb-unknown
+		;;
+
+	xscaleel)
+		basic_machine=armel-unknown
+		;;
+
+	# We use `pc' rather than `unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+	  basic_machine=$basic_machine-pc
+	  ;;
+	# Object if more than one company name word.
+	*-*-*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+	# Recognize the basic CPU types with company name.
+	580-* \
+	| a29k-* \
+	| aarch64-* | aarch64_be-* \
+	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
+	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
+	| avr-* | avr32-* \
+	| be32-* | be64-* \
+	| bfin-* | bs2000-* \
+	| c[123]* | c30-* | [cjt]90-* | c4x-* \
+	| clipper-* | craynv-* | cydra-* \
+	| d10v-* | d30v-* | dlx-* \
+	| elxsi-* \
+	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
+	| h8300-* | h8500-* \
+	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+	| hexagon-* \
+	| i*86-* | i860-* | i960-* | ia64-* \
+	| ip2k-* | iq2000-* \
+	| le32-* | le64-* \
+	| lm32-* \
+	| m32c-* | m32r-* | m32rle-* \
+	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
+	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
+	| mips16-* \
+	| mips64-* | mips64el-* \
+	| mips64octeon-* | mips64octeonel-* \
+	| mips64orion-* | mips64orionel-* \
+	| mips64r5900-* | mips64r5900el-* \
+	| mips64vr-* | mips64vrel-* \
+	| mips64vr4100-* | mips64vr4100el-* \
+	| mips64vr4300-* | mips64vr4300el-* \
+	| mips64vr5000-* | mips64vr5000el-* \
+	| mips64vr5900-* | mips64vr5900el-* \
+	| mipsisa32-* | mipsisa32el-* \
+	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa64-* | mipsisa64el-* \
+	| mipsisa64r2-* | mipsisa64r2el-* \
+	| mipsisa64sb1-* | mipsisa64sb1el-* \
+	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipstx39-* | mipstx39el-* \
+	| mmix-* \
+	| mt-* \
+	| msp430-* \
+	| nds32-* | nds32le-* | nds32be-* \
+	| nios-* | nios2-* \
+	| none-* | np1-* | ns16k-* | ns32k-* \
+	| open8-* \
+	| orion-* \
+	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
+	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
+	| pyramid-* \
+	| rl78-* | romp-* | rs6000-* | rx-* \
+	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
+	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
+	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
+	| sparclite-* \
+	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+	| tahoe-* \
+	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+	| tile*-* \
+	| tron-* \
+	| ubicom32-* \
+	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
+	| vax-* \
+	| we32k-* \
+	| x86-* | x86_64-* | xc16x-* | xps100-* \
+	| xstormy16-* | xtensa*-* \
+	| ymp-* \
+	| z8k-* | z80-*)
+		;;
+	# Recognize the basic CPU types without company name, with glob match.
+	xtensa*)
+		basic_machine=$basic_machine-unknown
+		;;
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	386bsd)
+		basic_machine=i386-unknown
+		os=-bsd
+		;;
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		basic_machine=m68000-att
+		;;
+	3b*)
+		basic_machine=we32k-att
+		;;
+	a29khif)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	abacus)
+		basic_machine=abacus-unknown
+		;;
+	adobe68k)
+		basic_machine=m68010-adobe
+		os=-scout
+		;;
+	alliant | fx80)
+		basic_machine=fx80-alliant
+		;;
+	altos | altos3068)
+		basic_machine=m68k-altos
+		;;
+	am29k)
+		basic_machine=a29k-none
+		os=-bsd
+		;;
+	amd64)
+		basic_machine=x86_64-pc
+		;;
+	amd64-*)
+		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	amdahl)
+		basic_machine=580-amdahl
+		os=-sysv
+		;;
+	amiga | amiga-*)
+		basic_machine=m68k-unknown
+		;;
+	amigaos | amigados)
+		basic_machine=m68k-unknown
+		os=-amigaos
+		;;
+	amigaunix | amix)
+		basic_machine=m68k-unknown
+		os=-sysv4
+		;;
+	apollo68)
+		basic_machine=m68k-apollo
+		os=-sysv
+		;;
+	apollo68bsd)
+		basic_machine=m68k-apollo
+		os=-bsd
+		;;
+	aros)
+		basic_machine=i386-pc
+		os=-aros
+		;;
+	aux)
+		basic_machine=m68k-apple
+		os=-aux
+		;;
+	balance)
+		basic_machine=ns32k-sequent
+		os=-dynix
+		;;
+	blackfin)
+		basic_machine=bfin-unknown
+		os=-linux
+		;;
+	blackfin-*)
+		basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
+	bluegene*)
+		basic_machine=powerpc-ibm
+		os=-cnk
+		;;
+	c54x-*)
+		basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	c55x-*)
+		basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	c6x-*)
+		basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	c90)
+		basic_machine=c90-cray
+		os=-unicos
+		;;
+	cegcc)
+		basic_machine=arm-unknown
+		os=-cegcc
+		;;
+	convex-c1)
+		basic_machine=c1-convex
+		os=-bsd
+		;;
+	convex-c2)
+		basic_machine=c2-convex
+		os=-bsd
+		;;
+	convex-c32)
+		basic_machine=c32-convex
+		os=-bsd
+		;;
+	convex-c34)
+		basic_machine=c34-convex
+		os=-bsd
+		;;
+	convex-c38)
+		basic_machine=c38-convex
+		os=-bsd
+		;;
+	cray | j90)
+		basic_machine=j90-cray
+		os=-unicos
+		;;
+	craynv)
+		basic_machine=craynv-cray
+		os=-unicosmp
+		;;
+	cr16 | cr16-*)
+		basic_machine=cr16-unknown
+		os=-elf
+		;;
+	crds | unos)
+		basic_machine=m68k-crds
+		;;
+	crisv32 | crisv32-* | etraxfs*)
+		basic_machine=crisv32-axis
+		;;
+	cris | cris-* | etrax*)
+		basic_machine=cris-axis
+		;;
+	crx)
+		basic_machine=crx-unknown
+		os=-elf
+		;;
+	da30 | da30-*)
+		basic_machine=m68k-da30
+		;;
+	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+		basic_machine=mips-dec
+		;;
+	decsystem10* | dec10*)
+		basic_machine=pdp10-dec
+		os=-tops10
+		;;
+	decsystem20* | dec20*)
+		basic_machine=pdp10-dec
+		os=-tops20
+		;;
+	delta | 3300 | motorola-3300 | motorola-delta \
+	      | 3300-motorola | delta-motorola)
+		basic_machine=m68k-motorola
+		;;
+	delta88)
+		basic_machine=m88k-motorola
+		os=-sysv3
+		;;
+	dicos)
+		basic_machine=i686-pc
+		os=-dicos
+		;;
+	djgpp)
+		basic_machine=i586-pc
+		os=-msdosdjgpp
+		;;
+	dpx20 | dpx20-*)
+		basic_machine=rs6000-bull
+		os=-bosx
+		;;
+	dpx2* | dpx2*-bull)
+		basic_machine=m68k-bull
+		os=-sysv3
+		;;
+	ebmon29k)
+		basic_machine=a29k-amd
+		os=-ebmon
+		;;
+	elxsi)
+		basic_machine=elxsi-elxsi
+		os=-bsd
+		;;
+	encore | umax | mmax)
+		basic_machine=ns32k-encore
+		;;
+	es1800 | OSE68k | ose68k | ose | OSE)
+		basic_machine=m68k-ericsson
+		os=-ose
+		;;
+	fx2800)
+		basic_machine=i860-alliant
+		;;
+	genix)
+		basic_machine=ns32k-ns
+		;;
+	gmicro)
+		basic_machine=tron-gmicro
+		os=-sysv
+		;;
+	go32)
+		basic_machine=i386-pc
+		os=-go32
+		;;
+	h3050r* | hiux*)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	h8300hms)
+		basic_machine=h8300-hitachi
+		os=-hms
+		;;
+	h8300xray)
+		basic_machine=h8300-hitachi
+		os=-xray
+		;;
+	h8500hms)
+		basic_machine=h8500-hitachi
+		os=-hms
+		;;
+	harris)
+		basic_machine=m88k-harris
+		os=-sysv3
+		;;
+	hp300-*)
+		basic_machine=m68k-hp
+		;;
+	hp300bsd)
+		basic_machine=m68k-hp
+		os=-bsd
+		;;
+	hp300hpux)
+		basic_machine=m68k-hp
+		os=-hpux
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		basic_machine=m68000-hp
+		;;
+	hp9k3[2-9][0-9])
+		basic_machine=m68k-hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hppa-next)
+		os=-nextstep3
+		;;
+	hppaosf)
+		basic_machine=hppa1.1-hp
+		os=-osf
+		;;
+	hppro)
+		basic_machine=hppa1.1-hp
+		os=-proelf
+		;;
+	i370-ibm* | ibm*)
+		basic_machine=i370-ibm
+		;;
+	i*86v32)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv32
+		;;
+	i*86v4*)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv4
+		;;
+	i*86v)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv
+		;;
+	i*86sol2)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-solaris2
+		;;
+	i386mach)
+		basic_machine=i386-mach
+		os=-mach
+		;;
+	i386-vsta | vsta)
+		basic_machine=i386-unknown
+		os=-vsta
+		;;
+	iris | iris4d)
+		basic_machine=mips-sgi
+		case $os in
+		    -irix*)
+			;;
+		    *)
+			os=-irix4
+			;;
+		esac
+		;;
+	isi68 | isi)
+		basic_machine=m68k-isi
+		os=-sysv
+		;;
+	m68knommu)
+		basic_machine=m68k-unknown
+		os=-linux
+		;;
+	m68knommu-*)
+		basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
+	m88k-omron*)
+		basic_machine=m88k-omron
+		;;
+	magnum | m3230)
+		basic_machine=mips-mips
+		os=-sysv
+		;;
+	merlin)
+		basic_machine=ns32k-utek
+		os=-sysv
+		;;
+	microblaze)
+		basic_machine=microblaze-xilinx
+		;;
+	mingw32)
+		basic_machine=i386-pc
+		os=-mingw32
+		;;
+	mingw32ce)
+		basic_machine=arm-unknown
+		os=-mingw32ce
+		;;
+	miniframe)
+		basic_machine=m68000-convergent
+		;;
+	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+	mips3*-*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+		;;
+	mips3*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+		;;
+	monitor)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	morphos)
+		basic_machine=powerpc-unknown
+		os=-morphos
+		;;
+	msdos)
+		basic_machine=i386-pc
+		os=-msdos
+		;;
+	ms1-*)
+		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
+		;;
+	msys)
+		basic_machine=i386-pc
+		os=-msys
+		;;
+	mvs)
+		basic_machine=i370-ibm
+		os=-mvs
+		;;
+	nacl)
+		basic_machine=le32-unknown
+		os=-nacl
+		;;
+	ncr3000)
+		basic_machine=i486-ncr
+		os=-sysv4
+		;;
+	netbsd386)
+		basic_machine=i386-unknown
+		os=-netbsd
+		;;
+	netwinder)
+		basic_machine=armv4l-rebel
+		os=-linux
+		;;
+	news | news700 | news800 | news900)
+		basic_machine=m68k-sony
+		os=-newsos
+		;;
+	news1000)
+		basic_machine=m68030-sony
+		os=-newsos
+		;;
+	news-3600 | risc-news)
+		basic_machine=mips-sony
+		os=-newsos
+		;;
+	necv70)
+		basic_machine=v70-nec
+		os=-sysv
+		;;
+	next | m*-next )
+		basic_machine=m68k-next
+		case $os in
+		    -nextstep* )
+			;;
+		    -ns2*)
+		      os=-nextstep2
+			;;
+		    *)
+		      os=-nextstep3
+			;;
+		esac
+		;;
+	nh3000)
+		basic_machine=m68k-harris
+		os=-cxux
+		;;
+	nh[45]000)
+		basic_machine=m88k-harris
+		os=-cxux
+		;;
+	nindy960)
+		basic_machine=i960-intel
+		os=-nindy
+		;;
+	mon960)
+		basic_machine=i960-intel
+		os=-mon960
+		;;
+	nonstopux)
+		basic_machine=mips-compaq
+		os=-nonstopux
+		;;
+	np1)
+		basic_machine=np1-gould
+		;;
+	neo-tandem)
+		basic_machine=neo-tandem
+		;;
+	nse-tandem)
+		basic_machine=nse-tandem
+		;;
+	nsr-tandem)
+		basic_machine=nsr-tandem
+		;;
+	op50n-* | op60c-*)
+		basic_machine=hppa1.1-oki
+		os=-proelf
+		;;
+	openrisc | openrisc-*)
+		basic_machine=or32-unknown
+		;;
+	os400)
+		basic_machine=powerpc-ibm
+		os=-os400
+		;;
+	OSE68000 | ose68000)
+		basic_machine=m68000-ericsson
+		os=-ose
+		;;
+	os68k)
+		basic_machine=m68k-none
+		os=-os68k
+		;;
+	pa-hitachi)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	paragon)
+		basic_machine=i860-intel
+		os=-osf
+		;;
+	parisc)
+		basic_machine=hppa-unknown
+		os=-linux
+		;;
+	parisc-*)
+		basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
+	pbd)
+		basic_machine=sparc-tti
+		;;
+	pbb)
+		basic_machine=m68k-tti
+		;;
+	pc532 | pc532-*)
+		basic_machine=ns32k-pc532
+		;;
+	pc98)
+		basic_machine=i386-pc
+		;;
+	pc98-*)
+		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentium | p5 | k5 | k6 | nexgen | viac3)
+		basic_machine=i586-pc
+		;;
+	pentiumpro | p6 | 6x86 | athlon | athlon_*)
+		basic_machine=i686-pc
+		;;
+	pentiumii | pentium2 | pentiumiii | pentium3)
+		basic_machine=i686-pc
+		;;
+	pentium4)
+		basic_machine=i786-pc
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentium4-*)
+		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pn)
+		basic_machine=pn-gould
+		;;
+	power)	basic_machine=power-ibm
+		;;
+	ppc | ppcbe)	basic_machine=powerpc-unknown
+		;;
+	ppc-* | ppcbe-*)
+		basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppcle | powerpclittle | ppc-le | powerpc-little)
+		basic_machine=powerpcle-unknown
+		;;
+	ppcle-* | powerpclittle-*)
+		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64)	basic_machine=powerpc64-unknown
+		;;
+	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+		basic_machine=powerpc64le-unknown
+		;;
+	ppc64le-* | powerpc64little-*)
+		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ps2)
+		basic_machine=i386-ibm
+		;;
+	pw32)
+		basic_machine=i586-unknown
+		os=-pw32
+		;;
+	rdos)
+		basic_machine=i386-pc
+		os=-rdos
+		;;
+	rom68k)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	rm[46]00)
+		basic_machine=mips-siemens
+		;;
+	rtpc | rtpc-*)
+		basic_machine=romp-ibm
+		;;
+	s390 | s390-*)
+		basic_machine=s390-ibm
+		;;
+	s390x | s390x-*)
+		basic_machine=s390x-ibm
+		;;
+	sa29200)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	sb1)
+		basic_machine=mipsisa64sb1-unknown
+		;;
+	sb1el)
+		basic_machine=mipsisa64sb1el-unknown
+		;;
+	sde)
+		basic_machine=mipsisa32-sde
+		os=-elf
+		;;
+	sei)
+		basic_machine=mips-sei
+		os=-seiux
+		;;
+	sequent)
+		basic_machine=i386-sequent
+		;;
+	sh)
+		basic_machine=sh-hitachi
+		os=-hms
+		;;
+	sh5el)
+		basic_machine=sh5le-unknown
+		;;
+	sh64)
+		basic_machine=sh64-unknown
+		;;
+	sparclite-wrs | simso-wrs)
+		basic_machine=sparclite-wrs
+		os=-vxworks
+		;;
+	sps7)
+		basic_machine=m68k-bull
+		os=-sysv2
+		;;
+	spur)
+		basic_machine=spur-unknown
+		;;
+	st2000)
+		basic_machine=m68k-tandem
+		;;
+	stratus)
+		basic_machine=i860-stratus
+		os=-sysv4
+		;;
+	strongarm-* | thumb-*)
+		basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	sun2)
+		basic_machine=m68000-sun
+		;;
+	sun2os3)
+		basic_machine=m68000-sun
+		os=-sunos3
+		;;
+	sun2os4)
+		basic_machine=m68000-sun
+		os=-sunos4
+		;;
+	sun3os3)
+		basic_machine=m68k-sun
+		os=-sunos3
+		;;
+	sun3os4)
+		basic_machine=m68k-sun
+		os=-sunos4
+		;;
+	sun4os3)
+		basic_machine=sparc-sun
+		os=-sunos3
+		;;
+	sun4os4)
+		basic_machine=sparc-sun
+		os=-sunos4
+		;;
+	sun4sol2)
+		basic_machine=sparc-sun
+		os=-solaris2
+		;;
+	sun3 | sun3-*)
+		basic_machine=m68k-sun
+		;;
+	sun4)
+		basic_machine=sparc-sun
+		;;
+	sun386 | sun386i | roadrunner)
+		basic_machine=i386-sun
+		;;
+	sv1)
+		basic_machine=sv1-cray
+		os=-unicos
+		;;
+	symmetry)
+		basic_machine=i386-sequent
+		os=-dynix
+		;;
+	t3e)
+		basic_machine=alphaev5-cray
+		os=-unicos
+		;;
+	t90)
+		basic_machine=t90-cray
+		os=-unicos
+		;;
+	tile*)
+		basic_machine=$basic_machine-unknown
+		os=-linux-gnu
+		;;
+	tx39)
+		basic_machine=mipstx39-unknown
+		;;
+	tx39el)
+		basic_machine=mipstx39el-unknown
+		;;
+	toad1)
+		basic_machine=pdp10-xkl
+		os=-tops20
+		;;
+	tower | tower-32)
+		basic_machine=m68k-ncr
+		;;
+	tpf)
+		basic_machine=s390x-ibm
+		os=-tpf
+		;;
+	udi29k)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	ultra3)
+		basic_machine=a29k-nyu
+		os=-sym1
+		;;
+	v810 | necv810)
+		basic_machine=v810-nec
+		os=-none
+		;;
+	vaxv)
+		basic_machine=vax-dec
+		os=-sysv
+		;;
+	vms)
+		basic_machine=vax-dec
+		os=-vms
+		;;
+	vpp*|vx|vx-*)
+		basic_machine=f301-fujitsu
+		;;
+	vxworks960)
+		basic_machine=i960-wrs
+		os=-vxworks
+		;;
+	vxworks68)
+		basic_machine=m68k-wrs
+		os=-vxworks
+		;;
+	vxworks29k)
+		basic_machine=a29k-wrs
+		os=-vxworks
+		;;
+	w65*)
+		basic_machine=w65-wdc
+		os=-none
+		;;
+	w89k-*)
+		basic_machine=hppa1.1-winbond
+		os=-proelf
+		;;
+	xbox)
+		basic_machine=i686-pc
+		os=-mingw32
+		;;
+	xps | xps100)
+		basic_machine=xps100-honeywell
+		;;
+	xscale-* | xscalee[bl]-*)
+		basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'`
+		;;
+	ymp)
+		basic_machine=ymp-cray
+		os=-unicos
+		;;
+	z8k-*-coff)
+		basic_machine=z8k-unknown
+		os=-sim
+		;;
+	z80-*-coff)
+		basic_machine=z80-unknown
+		os=-sim
+		;;
+	none)
+		basic_machine=none-none
+		os=-none
+		;;
+
+# Here we handle the default manufacturer of certain CPU types.  It is in
+# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		basic_machine=hppa1.1-winbond
+		;;
+	op50n)
+		basic_machine=hppa1.1-oki
+		;;
+	op60c)
+		basic_machine=hppa1.1-oki
+		;;
+	romp)
+		basic_machine=romp-ibm
+		;;
+	mmix)
+		basic_machine=mmix-knuth
+		;;
+	rs6000)
+		basic_machine=rs6000-ibm
+		;;
+	vax)
+		basic_machine=vax-dec
+		;;
+	pdp10)
+		# there are many clones, so DEC is not a safe bet
+		basic_machine=pdp10-unknown
+		;;
+	pdp11)
+		basic_machine=pdp11-dec
+		;;
+	we32k)
+		basic_machine=we32k-att
+		;;
+	sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
+		basic_machine=sh-unknown
+		;;
+	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
+		basic_machine=sparc-sun
+		;;
+	cydra)
+		basic_machine=cydra-cydrome
+		;;
+	orion)
+		basic_machine=orion-highlevel
+		;;
+	orion105)
+		basic_machine=clipper-highlevel
+		;;
+	mac | mpw | mac-mpw)
+		basic_machine=m68k-apple
+		;;
+	pmac | pmac-mpw)
+		basic_machine=powerpc-apple
+		;;
+	*-unknown)
+		# Make sure to match an already-canonicalized machine name.
+		;;
+	*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+	*-digital*)
+		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+		;;
+	*-commodore*)
+		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+	# First match some system type aliases
+	# that might get confused with valid system types.
+	# -solaris* is a basic system type, with this one exception.
+	-auroraux)
+		os=-auroraux
+		;;
+	-solaris1 | -solaris1.*)
+		os=`echo $os | sed -e 's|solaris1|sunos4|'`
+		;;
+	-solaris)
+		os=-solaris2
+		;;
+	-svr4*)
+		os=-sysv4
+		;;
+	-unixware*)
+		os=-sysv4.2uw
+		;;
+	-gnu/linux*)
+		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+		;;
+	# First accept the basic system types.
+	# The portable systems comes first.
+	# Each alternative MUST END IN A *, to match a version number.
+	# -sysv* is not here because it comes later, after sysvr4.
+	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
+	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
+	      | -sym* | -kopensolaris* \
+	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+	      | -aos* | -aros* \
+	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
+	      | -openbsd* | -solidbsd* \
+	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
+	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+	      | -chorusos* | -chorusrdb* | -cegcc* \
+	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -mingw32* | -linux-gnu* | -linux-android* \
+	      | -linux-newlib* | -linux-uclibc* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
+	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
+	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
+	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
+	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
+	# Remember, each alternative MUST END IN *, to match a version number.
+		;;
+	-qnx*)
+		case $basic_machine in
+		    x86-* | i*86-*)
+			;;
+		    *)
+			os=-nto$os
+			;;
+		esac
+		;;
+	-nto-qnx*)
+		;;
+	-nto*)
+		os=`echo $os | sed -e 's|nto|nto-qnx|'`
+		;;
+	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
+	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+		;;
+	-mac*)
+		os=`echo $os | sed -e 's|mac|macos|'`
+		;;
+	-linux-dietlibc)
+		os=-linux-dietlibc
+		;;
+	-linux*)
+		os=`echo $os | sed -e 's|linux|linux-gnu|'`
+		;;
+	-sunos5*)
+		os=`echo $os | sed -e 's|sunos5|solaris2|'`
+		;;
+	-sunos6*)
+		os=`echo $os | sed -e 's|sunos6|solaris3|'`
+		;;
+	-opened*)
+		os=-openedition
+		;;
+	-os400*)
+		os=-os400
+		;;
+	-wince*)
+		os=-wince
+		;;
+	-osfrose*)
+		os=-osfrose
+		;;
+	-osf*)
+		os=-osf
+		;;
+	-utek*)
+		os=-bsd
+		;;
+	-dynix*)
+		os=-bsd
+		;;
+	-acis*)
+		os=-aos
+		;;
+	-atheos*)
+		os=-atheos
+		;;
+	-syllable*)
+		os=-syllable
+		;;
+	-386bsd)
+		os=-bsd
+		;;
+	-ctix* | -uts*)
+		os=-sysv
+		;;
+	-nova*)
+		os=-rtmk-nova
+		;;
+	-ns2 )
+		os=-nextstep2
+		;;
+	-nsk*)
+		os=-nsk
+		;;
+	# Preserve the version number of sinix5.
+	-sinix5.*)
+		os=`echo $os | sed -e 's|sinix|sysv|'`
+		;;
+	-sinix*)
+		os=-sysv4
+		;;
+	-tpf*)
+		os=-tpf
+		;;
+	-triton*)
+		os=-sysv3
+		;;
+	-oss*)
+		os=-sysv3
+		;;
+	-svr4)
+		os=-sysv4
+		;;
+	-svr3)
+		os=-sysv3
+		;;
+	-sysvr4)
+		os=-sysv4
+		;;
+	# This must come after -sysvr4.
+	-sysv*)
+		;;
+	-ose*)
+		os=-ose
+		;;
+	-es1800*)
+		os=-ose
+		;;
+	-xenix)
+		os=-xenix
+		;;
+	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+		os=-mint
+		;;
+	-aros*)
+		os=-aros
+		;;
+	-kaos*)
+		os=-kaos
+		;;
+	-zvmoe)
+		os=-zvmoe
+		;;
+	-dicos*)
+		os=-dicos
+		;;
+	-nacl*)
+		;;
+	-none)
+		;;
+	*)
+		# Get rid of the `-' at the beginning of $os.
+		os=`echo $os | sed 's/[^-]*-//'`
+		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+		exit 1
+		;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+	score-*)
+		os=-elf
+		;;
+	spu-*)
+		os=-elf
+		;;
+	*-acorn)
+		os=-riscix1.2
+		;;
+	arm*-rebel)
+		os=-linux
+		;;
+	arm*-semi)
+		os=-aout
+		;;
+	c4x-* | tic4x-*)
+		os=-coff
+		;;
+	hexagon-*)
+		os=-elf
+		;;
+	tic54x-*)
+		os=-coff
+		;;
+	tic55x-*)
+		os=-coff
+		;;
+	tic6x-*)
+		os=-coff
+		;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=-tops20
+		;;
+	pdp11-*)
+		os=-none
+		;;
+	*-dec | vax-*)
+		os=-ultrix4.2
+		;;
+	m68*-apollo)
+		os=-domain
+		;;
+	i386-sun)
+		os=-sunos4.0.2
+		;;
+	m68000-sun)
+		os=-sunos3
+		;;
+	m68*-cisco)
+		os=-aout
+		;;
+	mep-*)
+		os=-elf
+		;;
+	mips*-cisco)
+		os=-elf
+		;;
+	mips*-*)
+		os=-elf
+		;;
+	or32-*)
+		os=-coff
+		;;
+	*-tti)	# must be before sparc entry or we get the wrong os.
+		os=-sysv3
+		;;
+	sparc-* | *-sun)
+		os=-sunos4.1.1
+		;;
+	*-be)
+		os=-beos
+		;;
+	*-haiku)
+		os=-haiku
+		;;
+	*-ibm)
+		os=-aix
+		;;
+	*-knuth)
+		os=-mmixware
+		;;
+	*-wec)
+		os=-proelf
+		;;
+	*-winbond)
+		os=-proelf
+		;;
+	*-oki)
+		os=-proelf
+		;;
+	*-hp)
+		os=-hpux
+		;;
+	*-hitachi)
+		os=-hiux
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=-sysv
+		;;
+	*-cbm)
+		os=-amigaos
+		;;
+	*-dg)
+		os=-dgux
+		;;
+	*-dolphin)
+		os=-sysv3
+		;;
+	m68k-ccur)
+		os=-rtu
+		;;
+	m88k-omron*)
+		os=-luna
+		;;
+	*-next )
+		os=-nextstep
+		;;
+	*-sequent)
+		os=-ptx
+		;;
+	*-crds)
+		os=-unos
+		;;
+	*-ns)
+		os=-genix
+		;;
+	i370-*)
+		os=-mvs
+		;;
+	*-next)
+		os=-nextstep3
+		;;
+	*-gould)
+		os=-sysv
+		;;
+	*-highlevel)
+		os=-bsd
+		;;
+	*-encore)
+		os=-bsd
+		;;
+	*-sgi)
+		os=-irix
+		;;
+	*-siemens)
+		os=-sysv4
+		;;
+	*-masscomp)
+		os=-rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=-uxpv
+		;;
+	*-rom68k)
+		os=-coff
+		;;
+	*-*bug)
+		os=-coff
+		;;
+	*-apple)
+		os=-macos
+		;;
+	*-atari*)
+		os=-mint
+		;;
+	*)
+		os=-none
+		;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+	*-unknown)
+		case $os in
+			-riscix*)
+				vendor=acorn
+				;;
+			-sunos*)
+				vendor=sun
+				;;
+			-cnk*|-aix*)
+				vendor=ibm
+				;;
+			-beos*)
+				vendor=be
+				;;
+			-hpux*)
+				vendor=hp
+				;;
+			-mpeix*)
+				vendor=hp
+				;;
+			-hiux*)
+				vendor=hitachi
+				;;
+			-unos*)
+				vendor=crds
+				;;
+			-dgux*)
+				vendor=dg
+				;;
+			-luna*)
+				vendor=omron
+				;;
+			-genix*)
+				vendor=ns
+				;;
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-os400*)
+				vendor=ibm
+				;;
+			-ptx*)
+				vendor=sequent
+				;;
+			-tpf*)
+				vendor=ibm
+				;;
+			-vxsim* | -vxworks* | -windiss*)
+				vendor=wrs
+				;;
+			-aux*)
+				vendor=apple
+				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+			-vos*)
+				vendor=stratus
+				;;
+		esac
+		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+		;;
+esac
+
+echo $basic_machine$os
+exit
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/configure
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/configure	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22659 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.69 for fftw 3.3.3.
+#
+# Report bugs to <fftw@fftw.org>.
+#
+#
+# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
+#
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='print -r --'
+  as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='printf %s\n'
+  as_echo_n='printf %s'
+else
+  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+    as_echo_n='/usr/ucb/echo -n'
+  else
+    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+    as_echo_n_body='eval
+      arg=$1;
+      case $arg in #(
+      *"$as_nl"*)
+	expr "X$arg" : "X\\(.*\\)$as_nl";
+	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+      esac;
+      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+    '
+    export as_echo_n_body
+    as_echo_n='sh -c $as_echo_n_body as_echo'
+  fi
+  export as_echo_body
+  as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there.  '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+# Use a proper internal environment variable to ensure we don't fall
+  # into an infinite loop, continuously re-executing ourselves.
+  if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then
+    _as_can_reexec=no; export _as_can_reexec;
+    # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+as_fn_exit 255
+  fi
+  # We don't want this to propagate to other subprocesses.
+          { _as_can_reexec=; unset _as_can_reexec;}
+if test "x$CONFIG_SHELL" = x; then
+  as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '\${1+\"\$@\"}'='\"\$@\"'
+  setopt NO_GLOB_SUBST
+else
+  case \`(set -o) 2>/dev/null\` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+"
+  as_required="as_fn_return () { (exit \$1); }
+as_fn_success () { as_fn_return 0; }
+as_fn_failure () { as_fn_return 1; }
+as_fn_ret_success () { return 0; }
+as_fn_ret_failure () { return 1; }
+
+exitcode=0
+as_fn_success || { exitcode=1; echo as_fn_success failed.; }
+as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
+as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
+as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
+if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+
+else
+  exitcode=1; echo positional parameters were not saved.
+fi
+test x\$exitcode = x0 || exit 1
+test -x / || exit 1"
+  as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
+  as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
+  eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
+  test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
+
+  test -n \"\${ZSH_VERSION+set}\${BASH_VERSION+set}\" || (
+    ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+    ECHO=\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO
+    ECHO=\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO
+    PATH=/empty FPATH=/empty; export PATH FPATH
+    test \"X\`printf %s \$ECHO\`\" = \"X\$ECHO\" \\
+      || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1
+test \$(( 1 + 1 )) = 2 || exit 1"
+  if (eval "$as_required") 2>/dev/null; then :
+  as_have_required=yes
+else
+  as_have_required=no
+fi
+  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
+
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  as_found=:
+  case $as_dir in #(
+	 /*)
+	   for as_base in sh bash ksh sh5; do
+	     # Try only shells that exist, to save several forks.
+	     as_shell=$as_dir/$as_base
+	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+		    { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
+  CONFIG_SHELL=$as_shell as_have_required=yes
+		   if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
+  break 2
+fi
+fi
+	   done;;
+       esac
+  as_found=false
+done
+$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+	      { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
+  CONFIG_SHELL=$SHELL as_have_required=yes
+fi; }
+IFS=$as_save_IFS
+
+
+      if test "x$CONFIG_SHELL" != x; then :
+  export CONFIG_SHELL
+             # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
+fi
+
+    if test x$as_have_required = xno; then :
+  $as_echo "$0: This script requires a shell more modern than all"
+  $as_echo "$0: the shells that I found on your system."
+  if test x${ZSH_VERSION+set} = xset ; then
+    $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
+  else
+    $as_echo "$0: Please tell bug-autoconf@gnu.org and fftw@fftw.org
+$0: about your system, including any error possibly output
+$0: before this message. Then install a modern shell, or
+$0: manually run the script under such a shell if you do
+$0: have one."
+  fi
+  exit 1
+fi
+fi
+fi
+SHELL=${CONFIG_SHELL-/bin/sh}
+export SHELL
+# Unset more variables known to interfere with behavior of common tools.
+CLICOLOR_FORCE= GREP_OPTIONS=
+unset CLICOLOR_FORCE GREP_OPTIONS
+
+## --------------------- ##
+## M4sh Shell Functions. ##
+## --------------------- ##
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else
+  as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else
+  as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  }
+fi # as_fn_arith
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  $as_echo "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+  as_lineno_1=$LINENO as_lineno_1a=$LINENO
+  as_lineno_2=$LINENO as_lineno_2a=$LINENO
+  eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
+  test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
+  # Blame Lee E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+
+  # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
+  # already done that, so ensure we don't try to do so again and fall
+  # in an infinite loop.  This has already happened in practice.
+  _as_can_reexec=no; export _as_can_reexec
+  # Don't try to exec as it changes $[0], causing all sort of problems
+  # (the dirname of $[0] is not the place where we might find the
+  # original and so on.  Autoconf is especially sensitive to this).
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+    # In both cases, we have to default to `cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+
+test -n "$DJDIR" || exec 7<&0 </dev/null
+exec 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME='fftw'
+PACKAGE_TARNAME='fftw'
+PACKAGE_VERSION='3.3.3'
+PACKAGE_STRING='fftw 3.3.3'
+PACKAGE_BUGREPORT='fftw@fftw.org'
+PACKAGE_URL=''
+
+ac_unique_file="kernel/ifftw.h"
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+#  include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+ac_subst_vars='am__EXEEXT_FALSE
+am__EXEEXT_TRUE
+LTLIBOBJS
+COMBINED_THREADS_FALSE
+COMBINED_THREADS_TRUE
+SMP_FALSE
+SMP_TRUE
+OPENMP_FALSE
+OPENMP_TRUE
+THREADS_FALSE
+THREADS_TRUE
+THREADLIBS
+PTHREAD_CFLAGS
+PTHREAD_LIBS
+PTHREAD_CC
+acx_pthread_config
+OPENMP_CFLAGS
+FLIBS
+ac_ct_F77
+FFLAGS
+F77
+LIBQUADMATH
+LIBOBJS
+POW_LIB
+ALLOCA
+C_FFTW_R2R_KIND
+STACK_ALIGN_CFLAGS
+NEON_CFLAGS
+ALTIVEC_CFLAGS
+AVX_CFLAGS
+SSE2_CFLAGS
+MPI_FALSE
+MPI_TRUE
+C_MPI_FINT
+MPIRUN
+MPILIBS
+MPICC
+OCAMLBUILD
+CPP
+OTOOL64
+OTOOL
+LIPO
+NMEDIT
+DSYMUTIL
+MANIFEST_TOOL
+RANLIB
+ac_ct_AR
+AR
+NM
+ac_ct_DUMPBIN
+DUMPBIN
+LD
+FGREP
+EGREP
+GREP
+SED
+LIBTOOL
+OBJDUMP
+DLLTOOL
+AS
+LN_S
+am__fastdepCC_FALSE
+am__fastdepCC_TRUE
+CCDEPMODE
+am__nodep
+AMDEPBACKSLASH
+AMDEP_FALSE
+AMDEP_TRUE
+am__quote
+am__include
+DEPDIR
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+PREC_SUFFIX
+HAVE_NEON_FALSE
+HAVE_NEON_TRUE
+HAVE_ALTIVEC_FALSE
+HAVE_ALTIVEC_TRUE
+HAVE_AVX_FALSE
+HAVE_AVX_TRUE
+HAVE_SSE2_FALSE
+HAVE_SSE2_TRUE
+CHECK_PL_OPTS
+PRECISION
+QUAD_FALSE
+QUAD_TRUE
+LDOUBLE_FALSE
+LDOUBLE_TRUE
+SINGLE_FALSE
+SINGLE_TRUE
+host_os
+host_vendor
+host_cpu
+host
+build_os
+build_vendor
+build_cpu
+build
+SHARED_VERSION_INFO
+MAINT
+MAINTAINER_MODE_FALSE
+MAINTAINER_MODE_TRUE
+am__untar
+am__tar
+AMTAR
+am__leading_dot
+SET_MAKE
+AWK
+mkdir_p
+MKDIR_P
+INSTALL_STRIP_PROGRAM
+STRIP
+install_sh
+MAKEINFO
+AUTOHEADER
+AUTOMAKE
+AUTOCONF
+ACLOCAL
+VERSION
+PACKAGE
+CYGPATH_W
+am__isrc
+INSTALL_DATA
+INSTALL_SCRIPT
+INSTALL_PROGRAM
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+enable_maintainer_mode
+enable_shared
+enable_fma
+enable_debug
+enable_debug_malloc
+enable_debug_alignment
+enable_random_estimator
+enable_alloca
+enable_single
+enable_float
+enable_long_double
+enable_quad_precision
+enable_sse
+enable_sse2
+enable_avx
+enable_altivec
+enable_neon
+with_slow_timer
+enable_mips_zbus_timer
+with_our_malloc
+with_our_malloc16
+with_windows_f77_mangling
+with_incoming_stack_boundary
+enable_dependency_tracking
+enable_static
+with_pic
+enable_fast_install
+with_gnu_ld
+with_sysroot
+enable_libtool_lock
+enable_mpi
+enable_fortran
+with_g77_wrappers
+enable_openmp
+enable_threads
+with_combined_threads
+'
+      ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CPP
+MPICC
+F77
+FFLAGS'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+  # If the previous option needs an argument, assign it.
+  if test -n "$ac_prev"; then
+    eval $ac_prev=\$ac_option
+    ac_prev=
+    continue
+  fi
+
+  case $ac_option in
+  *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+  *=)   ac_optarg= ;;
+  *)    ac_optarg=yes ;;
+  esac
+
+  # Accept the important Cygnus configure options, so we can diagnose typos.
+
+  case $ac_dashdash$ac_option in
+  --)
+    ac_dashdash=yes ;;
+
+  -bindir | --bindir | --bindi | --bind | --bin | --bi)
+    ac_prev=bindir ;;
+  -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+    bindir=$ac_optarg ;;
+
+  -build | --build | --buil | --bui | --bu)
+    ac_prev=build_alias ;;
+  -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+    build_alias=$ac_optarg ;;
+
+  -cache-file | --cache-file | --cache-fil | --cache-fi \
+  | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+    ac_prev=cache_file ;;
+  -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+  | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+    cache_file=$ac_optarg ;;
+
+  --config-cache | -C)
+    cache_file=config.cache ;;
+
+  -datadir | --datadir | --datadi | --datad)
+    ac_prev=datadir ;;
+  -datadir=* | --datadir=* | --datadi=* | --datad=*)
+    datadir=$ac_optarg ;;
+
+  -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+  | --dataroo | --dataro | --datar)
+    ac_prev=datarootdir ;;
+  -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+  | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+    datarootdir=$ac_optarg ;;
+
+  -disable-* | --disable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=no ;;
+
+  -docdir | --docdir | --docdi | --doc | --do)
+    ac_prev=docdir ;;
+  -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+    docdir=$ac_optarg ;;
+
+  -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+    ac_prev=dvidir ;;
+  -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+    dvidir=$ac_optarg ;;
+
+  -enable-* | --enable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=\$ac_optarg ;;
+
+  -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+  | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+  | --exec | --exe | --ex)
+    ac_prev=exec_prefix ;;
+  -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+  | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+  | --exec=* | --exe=* | --ex=*)
+    exec_prefix=$ac_optarg ;;
+
+  -gas | --gas | --ga | --g)
+    # Obsolete; use --with-gas.
+    with_gas=yes ;;
+
+  -help | --help | --hel | --he | -h)
+    ac_init_help=long ;;
+  -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+    ac_init_help=recursive ;;
+  -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+    ac_init_help=short ;;
+
+  -host | --host | --hos | --ho)
+    ac_prev=host_alias ;;
+  -host=* | --host=* | --hos=* | --ho=*)
+    host_alias=$ac_optarg ;;
+
+  -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+    ac_prev=htmldir ;;
+  -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+  | --ht=*)
+    htmldir=$ac_optarg ;;
+
+  -includedir | --includedir | --includedi | --included | --include \
+  | --includ | --inclu | --incl | --inc)
+    ac_prev=includedir ;;
+  -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+  | --includ=* | --inclu=* | --incl=* | --inc=*)
+    includedir=$ac_optarg ;;
+
+  -infodir | --infodir | --infodi | --infod | --info | --inf)
+    ac_prev=infodir ;;
+  -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+    infodir=$ac_optarg ;;
+
+  -libdir | --libdir | --libdi | --libd)
+    ac_prev=libdir ;;
+  -libdir=* | --libdir=* | --libdi=* | --libd=*)
+    libdir=$ac_optarg ;;
+
+  -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+  | --libexe | --libex | --libe)
+    ac_prev=libexecdir ;;
+  -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+  | --libexe=* | --libex=* | --libe=*)
+    libexecdir=$ac_optarg ;;
+
+  -localedir | --localedir | --localedi | --localed | --locale)
+    ac_prev=localedir ;;
+  -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+    localedir=$ac_optarg ;;
+
+  -localstatedir | --localstatedir | --localstatedi | --localstated \
+  | --localstate | --localstat | --localsta | --localst | --locals)
+    ac_prev=localstatedir ;;
+  -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+  | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+    localstatedir=$ac_optarg ;;
+
+  -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+    ac_prev=mandir ;;
+  -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+    mandir=$ac_optarg ;;
+
+  -nfp | --nfp | --nf)
+    # Obsolete; use --without-fp.
+    with_fp=no ;;
+
+  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+  | --no-cr | --no-c | -n)
+    no_create=yes ;;
+
+  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+    no_recursion=yes ;;
+
+  -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+  | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+  | --oldin | --oldi | --old | --ol | --o)
+    ac_prev=oldincludedir ;;
+  -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+  | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+  | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+    oldincludedir=$ac_optarg ;;
+
+  -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+    ac_prev=prefix ;;
+  -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+    prefix=$ac_optarg ;;
+
+  -program-prefix | --program-prefix | --program-prefi | --program-pref \
+  | --program-pre | --program-pr | --program-p)
+    ac_prev=program_prefix ;;
+  -program-prefix=* | --program-prefix=* | --program-prefi=* \
+  | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+    program_prefix=$ac_optarg ;;
+
+  -program-suffix | --program-suffix | --program-suffi | --program-suff \
+  | --program-suf | --program-su | --program-s)
+    ac_prev=program_suffix ;;
+  -program-suffix=* | --program-suffix=* | --program-suffi=* \
+  | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+    program_suffix=$ac_optarg ;;
+
+  -program-transform-name | --program-transform-name \
+  | --program-transform-nam | --program-transform-na \
+  | --program-transform-n | --program-transform- \
+  | --program-transform | --program-transfor \
+  | --program-transfo | --program-transf \
+  | --program-trans | --program-tran \
+  | --progr-tra | --program-tr | --program-t)
+    ac_prev=program_transform_name ;;
+  -program-transform-name=* | --program-transform-name=* \
+  | --program-transform-nam=* | --program-transform-na=* \
+  | --program-transform-n=* | --program-transform-=* \
+  | --program-transform=* | --program-transfor=* \
+  | --program-transfo=* | --program-transf=* \
+  | --program-trans=* | --program-tran=* \
+  | --progr-tra=* | --program-tr=* | --program-t=*)
+    program_transform_name=$ac_optarg ;;
+
+  -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+    ac_prev=pdfdir ;;
+  -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+    pdfdir=$ac_optarg ;;
+
+  -psdir | --psdir | --psdi | --psd | --ps)
+    ac_prev=psdir ;;
+  -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+    psdir=$ac_optarg ;;
+
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil)
+    silent=yes ;;
+
+  -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+    ac_prev=sbindir ;;
+  -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+  | --sbi=* | --sb=*)
+    sbindir=$ac_optarg ;;
+
+  -sharedstatedir | --sharedstatedir | --sharedstatedi \
+  | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+  | --sharedst | --shareds | --shared | --share | --shar \
+  | --sha | --sh)
+    ac_prev=sharedstatedir ;;
+  -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+  | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+  | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+  | --sha=* | --sh=*)
+    sharedstatedir=$ac_optarg ;;
+
+  -site | --site | --sit)
+    ac_prev=site ;;
+  -site=* | --site=* | --sit=*)
+    site=$ac_optarg ;;
+
+  -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+    ac_prev=srcdir ;;
+  -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+    srcdir=$ac_optarg ;;
+
+  -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+  | --syscon | --sysco | --sysc | --sys | --sy)
+    ac_prev=sysconfdir ;;
+  -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+  | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+    sysconfdir=$ac_optarg ;;
+
+  -target | --target | --targe | --targ | --tar | --ta | --t)
+    ac_prev=target_alias ;;
+  -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+    target_alias=$ac_optarg ;;
+
+  -v | -verbose | --verbose | --verbos | --verbo | --verb)
+    verbose=yes ;;
+
+  -version | --version | --versio | --versi | --vers | -V)
+    ac_init_version=: ;;
+
+  -with-* | --with-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=\$ac_optarg ;;
+
+  -without-* | --without-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=no ;;
+
+  --x)
+    # Obsolete; use --with-x.
+    with_x=yes ;;
+
+  -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+  | --x-incl | --x-inc | --x-in | --x-i)
+    ac_prev=x_includes ;;
+  -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+  | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+    x_includes=$ac_optarg ;;
+
+  -x-libraries | --x-libraries | --x-librarie | --x-librari \
+  | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+    ac_prev=x_libraries ;;
+  -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+  | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+    x_libraries=$ac_optarg ;;
+
+  -*) as_fn_error $? "unrecognized option: \`$ac_option'
+Try \`$0 --help' for more information"
+    ;;
+
+  *=*)
+    ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+    # Reject names that are not valid shell variable names.
+    case $ac_envvar in #(
+      '' | [0-9]* | *[!_$as_cr_alnum]* )
+      as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
+    esac
+    eval $ac_envvar=\$ac_optarg
+    export $ac_envvar ;;
+
+  *)
+    # FIXME: should be removed in autoconf 3.0.
+    $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+    expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+    : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
+    ;;
+
+  esac
+done
+
+if test -n "$ac_prev"; then
+  ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+  as_fn_error $? "missing argument to $ac_option"
+fi
+
+if test -n "$ac_unrecognized_opts"; then
+  case $enable_option_checking in
+    no) ;;
+    fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+    *)     $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+  esac
+fi
+
+# Check all directory arguments for consistency.
+for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
+		datadir sysconfdir sharedstatedir localstatedir includedir \
+		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+		libdir localedir mandir
+do
+  eval ac_val=\$$ac_var
+  # Remove trailing slashes.
+  case $ac_val in
+    */ )
+      ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+      eval $ac_var=\$ac_val;;
+  esac
+  # Be sure to have absolute directory names.
+  case $ac_val in
+    [\\/$]* | ?:[\\/]* )  continue;;
+    NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+  esac
+  as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+  if test "x$build_alias" = x; then
+    cross_compiling=maybe
+  elif test "x$build_alias" != "x$host_alias"; then
+    cross_compiling=yes
+  fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+  as_fn_error $? "working directory cannot be determined"
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+  as_fn_error $? "pwd does not report name of working directory"
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+  ac_srcdir_defaulted=yes
+  # Try the directory containing this script, then the parent directory.
+  ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_myself" : 'X\(//\)[^/]' \| \
+	 X"$as_myself" : 'X\(//\)$' \| \
+	 X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  srcdir=$ac_confdir
+  if test ! -r "$srcdir/$ac_unique_file"; then
+    srcdir=..
+  fi
+else
+  ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+  test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+  as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+	pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+  srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+  eval ac_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_env_${ac_var}_value=\$${ac_var}
+  eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+  # Omit some internal or obsolete options to make the list less imposing.
+  # This message is too long to be a string in the A/UX 3.1 sh.
+  cat <<_ACEOF
+\`configure' configures fftw 3.3.3 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE.  See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+  -h, --help              display this help and exit
+      --help=short        display options specific to this package
+      --help=recursive    display the short help of all the included packages
+  -V, --version           display version information and exit
+  -q, --quiet, --silent   do not print \`checking ...' messages
+      --cache-file=FILE   cache test results in FILE [disabled]
+  -C, --config-cache      alias for \`--cache-file=config.cache'
+  -n, --no-create         do not create output files
+      --srcdir=DIR        find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+  --prefix=PREFIX         install architecture-independent files in PREFIX
+                          [$ac_default_prefix]
+  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
+                          [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc.  You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+  --bindir=DIR            user executables [EPREFIX/bin]
+  --sbindir=DIR           system admin executables [EPREFIX/sbin]
+  --libexecdir=DIR        program executables [EPREFIX/libexec]
+  --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
+  --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
+  --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --libdir=DIR            object code libraries [EPREFIX/lib]
+  --includedir=DIR        C header files [PREFIX/include]
+  --oldincludedir=DIR     C header files for non-gcc [/usr/include]
+  --datarootdir=DIR       read-only arch.-independent data root [PREFIX/share]
+  --datadir=DIR           read-only architecture-independent data [DATAROOTDIR]
+  --infodir=DIR           info documentation [DATAROOTDIR/info]
+  --localedir=DIR         locale-dependent data [DATAROOTDIR/locale]
+  --mandir=DIR            man documentation [DATAROOTDIR/man]
+  --docdir=DIR            documentation root [DATAROOTDIR/doc/fftw]
+  --htmldir=DIR           html documentation [DOCDIR]
+  --dvidir=DIR            dvi documentation [DOCDIR]
+  --pdfdir=DIR            pdf documentation [DOCDIR]
+  --psdir=DIR             ps documentation [DOCDIR]
+_ACEOF
+
+  cat <<\_ACEOF
+
+Program names:
+  --program-prefix=PREFIX            prepend PREFIX to installed program names
+  --program-suffix=SUFFIX            append SUFFIX to installed program names
+  --program-transform-name=PROGRAM   run sed PROGRAM on installed program names
+
+System types:
+  --build=BUILD     configure for building on BUILD [guessed]
+  --host=HOST       cross-compile to build programs to run on HOST [BUILD]
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+  case $ac_init_help in
+     short | recursive ) echo "Configuration of fftw 3.3.3:";;
+   esac
+  cat <<\_ACEOF
+
+Optional Features:
+  --disable-option-checking  ignore unrecognized --enable/--with options
+  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
+  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --enable-maintainer-mode  enable make rules and dependencies not useful
+			  (and sometimes confusing) to the casual installer
+  --enable-shared[=PKGS]  build shared libraries [default=no]
+  --enable-fma            enable optimizations for machines with fused
+                          multiply-add
+  --enable-debug          compile fftw with extra runtime checks for debugging
+  --enable-debug-malloc   enable malloc debugging version
+  --enable-debug-alignment
+                          enable alignment debugging hacks
+  --enable-random-estimator
+                          enable pseudorandom estimator (debugging hack)
+  --disable-alloca        disable use of the alloca() function (may be broken
+                          on mingw64)
+  --enable-single         compile fftw in single precision
+  --enable-float          synonym for --enable-single
+  --enable-long-double    compile fftw in long-double precision
+  --enable-quad-precision compile fftw in quadruple precision if available
+  --enable-sse            enable SSE optimizations
+  --enable-sse2           enable SSE/SSE2 optimizations
+  --enable-avx            enable AVX optimizations
+  --enable-altivec        enable Altivec optimizations
+  --enable-neon           enable ARM NEON optimizations
+  --enable-mips-zbus-timer
+                          use MIPS ZBus cycle-counter
+  --disable-dependency-tracking  speeds up one-time build
+  --enable-dependency-tracking   do not reject slow dependency extractors
+  --enable-static[=PKGS]  build static libraries [default=yes]
+  --enable-fast-install[=PKGS]
+                          optimize for fast installation [default=yes]
+  --disable-libtool-lock  avoid locking (might break parallel builds)
+  --enable-mpi            compile FFTW MPI library
+  --disable-fortran       don't include Fortran-callable wrappers
+  --enable-openmp         use OpenMP directives for parallelism
+  --enable-threads        compile FFTW SMP threads library
+
+Optional Packages:
+  --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
+  --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
+  --with-slow-timer       use low-precision timers (SLOW)
+  --with-our-malloc       use our aligned malloc (helpful for Win32)
+  --with-our-malloc16     Obsolete alias for --with-our-malloc16
+  --with-windows-f77-mangling
+                          use common Win32 Fortran interface styles
+  --with-incoming-stack-boundary=X
+                          Assume that stack is aligned to (1<<X) bytes
+  --with-pic[=PKGS]       try to use only PIC/non-PIC objects [default=use
+                          both]
+  --with-gnu-ld           assume the C compiler uses GNU ld [default=no]
+  --with-sysroot=DIR Search for dependent libraries within DIR
+                        (or the compiler's sysroot if not specified).
+  --with-g77-wrappers     force inclusion of g77-compatible wrappers in
+                          addition to any other Fortran compiler that is
+                          detected
+  --with-combined-threads combine threads into main libfftw3
+
+Some influential environment variables:
+  CC          C compiler command
+  CFLAGS      C compiler flags
+  LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
+              nonstandard directory <lib dir>
+  LIBS        libraries to pass to the linker, e.g. -l<library>
+  CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+              you have headers in a nonstandard directory <include dir>
+  CPP         C preprocessor
+  MPICC       MPI C compiler command
+  F77         Fortran 77 compiler command
+  FFLAGS      Fortran 77 compiler flags
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <fftw@fftw.org>.
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+  # If there are subdirs, report their specific --help.
+  for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+    test -d "$ac_dir" ||
+      { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+      continue
+    ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+    cd "$ac_dir" || { ac_status=$?; continue; }
+    # Check for guested configure.
+    if test -f "$ac_srcdir/configure.gnu"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+    elif test -f "$ac_srcdir/configure"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure" --help=recursive
+    else
+      $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+    fi || ac_status=$?
+    cd "$ac_pwd" || { ac_status=$?; break; }
+  done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+  cat <<\_ACEOF
+fftw configure 3.3.3
+generated by GNU Autoconf 2.69
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+  exit
+fi
+
+## ------------------------ ##
+## Autoconf initialization. ##
+## ------------------------ ##
+
+# ac_fn_c_try_compile LINENO
+# --------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_compile
+
+# ac_fn_c_try_link LINENO
+# -----------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_link ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext conftest$ac_exeext
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 test -x conftest$ac_exeext
+       }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
+  # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
+  # interfere with the next link command; also delete a directory that is
+  # left behind by Apple's compiler.  We do this before executing the actions.
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_link
+
+# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists and can be compiled using the include files in
+# INCLUDES, setting the cache variable VAR accordingly.
+ac_fn_c_check_header_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_compile
+
+# ac_fn_c_try_cpp LINENO
+# ----------------------
+# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_cpp ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } > conftest.i && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+    ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_cpp
+
+# ac_fn_c_try_run LINENO
+# ----------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_c_try_run ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+       $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+       ac_retval=$ac_status
+fi
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_run
+
+# ac_fn_c_check_func LINENO FUNC VAR
+# ----------------------------------
+# Tests whether FUNC exists, setting the cache variable VAR accordingly
+ac_fn_c_check_func ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
+   For example, HP-UX 11i <limits.h> declares gettimeofday.  */
+#define $2 innocuous_$2
+
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char $2 (); below.
+    Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+    <limits.h> exists even on freestanding compilers.  */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef $2
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $2 ();
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined __stub_$2 || defined __stub___$2
+choke me
+#endif
+
+int
+main ()
+{
+return $2 ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_func
+
+# ac_fn_c_compute_int LINENO EXPR VAR INCLUDES
+# --------------------------------------------
+# Tries to find the compile-time value of EXPR in a program that includes
+# INCLUDES, setting VAR accordingly. Returns whether the value could be
+# computed
+ac_fn_c_compute_int ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if test "$cross_compiling" = yes; then
+    # Depending upon the size, compute the lo and hi bounds.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= 0)];
+test_array [0] = 0;
+return test_array [0];
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_lo=0 ac_mid=0
+  while :; do
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0;
+return test_array [0];
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=$ac_mid; break
+else
+  as_fn_arith $ac_mid + 1 && ac_lo=$as_val
+			if test $ac_lo -le $ac_mid; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) < 0)];
+test_array [0] = 0;
+return test_array [0];
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=-1 ac_mid=-1
+  while :; do
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= $ac_mid)];
+test_array [0] = 0;
+return test_array [0];
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_lo=$ac_mid; break
+else
+  as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val
+			if test $ac_mid -le $ac_hi; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  ac_lo= ac_hi=
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+# Binary search between lo and hi bounds.
+while test "x$ac_lo" != "x$ac_hi"; do
+  as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0;
+return test_array [0];
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=$ac_mid
+else
+  as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+done
+case $ac_lo in #((
+?*) eval "$3=\$ac_lo"; ac_retval=0 ;;
+'') ac_retval=1 ;;
+esac
+  else
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+static long int longval () { return $2; }
+static unsigned long int ulongval () { return $2; }
+#include <stdio.h>
+#include <stdlib.h>
+int
+main ()
+{
+
+  FILE *f = fopen ("conftest.val", "w");
+  if (! f)
+    return 1;
+  if (($2) < 0)
+    {
+      long int i = longval ();
+      if (i != ($2))
+	return 1;
+      fprintf (f, "%ld", i);
+    }
+  else
+    {
+      unsigned long int i = ulongval ();
+      if (i != ($2))
+	return 1;
+      fprintf (f, "%lu", i);
+    }
+  /* Do not output a trailing newline, as this causes \r\n confusion
+     on some platforms.  */
+  return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  echo >>conftest.val; read $3 <conftest.val; ac_retval=0
+else
+  ac_retval=1
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+rm -f conftest.val
+
+  fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_compute_int
+
+# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists, giving a warning if it cannot be compiled using
+# the include files in INCLUDES and setting the cache variable VAR
+# accordingly.
+ac_fn_c_check_header_mongrel ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if eval \${$3+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+else
+  # Is the header compilable?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
+$as_echo_n "checking $2 usability... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_header_compiler=yes
+else
+  ac_header_compiler=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
+$as_echo_n "checking $2 presence... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <$2>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  ac_header_preproc=yes
+else
+  ac_header_preproc=no
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #((
+  yes:no: )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+    ;;
+  no:yes:* )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: $2:     check for missing prerequisite headers?" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+( $as_echo "## ---------------------------- ##
+## Report this to fftw@fftw.org ##
+## ---------------------------- ##"
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  eval "$3=\$ac_header_compiler"
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_mongrel
+
+# ac_fn_c_check_type LINENO TYPE VAR INCLUDES
+# -------------------------------------------
+# Tests whether TYPE exists after having included INCLUDES, setting cache
+# variable VAR accordingly.
+ac_fn_c_check_type ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  eval "$3=no"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+if (sizeof ($2))
+	 return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+if (sizeof (($2)))
+	    return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+  eval "$3=yes"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_type
+
+# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES
+# ---------------------------------------------
+# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR
+# accordingly.
+ac_fn_c_check_decl ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  as_decl_name=`echo $2|sed 's/ *(.*//'`
+  as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
+$as_echo_n "checking whether $as_decl_name is declared... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+#ifndef $as_decl_name
+#ifdef __cplusplus
+  (void) $as_decl_use;
+#else
+  (void) $as_decl_name;
+#endif
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_decl
+
+# ac_fn_f77_try_compile LINENO
+# ----------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_f77_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_f77_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_f77_try_compile
+
+# ac_fn_f77_try_link LINENO
+# -------------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded.
+ac_fn_f77_try_link ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext conftest$ac_exeext
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_f77_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 test -x conftest$ac_exeext
+       }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
+  # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
+  # interfere with the next link command; also delete a directory that is
+  # left behind by Apple's compiler.  We do this before executing the actions.
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_f77_try_link
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by fftw $as_me 3.3.3, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
+
+  $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null     || echo unknown`
+
+/bin/arch              = `(/bin/arch) 2>/dev/null              || echo unknown`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null       || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo      = `(/usr/bin/hostinfo) 2>/dev/null      || echo unknown`
+/bin/machine           = `(/bin/machine) 2>/dev/null           || echo unknown`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null       || echo unknown`
+/bin/universe          = `(/bin/universe) 2>/dev/null          || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    $as_echo "PATH: $as_dir"
+  done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+  for ac_arg
+  do
+    case $ac_arg in
+    -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+    -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+    | -silent | --silent | --silen | --sile | --sil)
+      continue ;;
+    *\'*)
+      ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    case $ac_pass in
+    1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+    2)
+      as_fn_append ac_configure_args1 " '$ac_arg'"
+      if test $ac_must_keep_next = true; then
+	ac_must_keep_next=false # Got value, back to normal.
+      else
+	case $ac_arg in
+	  *=* | --config-cache | -C | -disable-* | --disable-* \
+	  | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+	  | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+	  | -with-* | --with-* | -without-* | --without-* | --x)
+	    case "$ac_configure_args0 " in
+	      "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+	    esac
+	    ;;
+	  -* ) ac_must_keep_next=true ;;
+	esac
+      fi
+      as_fn_append ac_configure_args " '$ac_arg'"
+      ;;
+    esac
+  done
+done
+{ ac_configure_args0=; unset ac_configure_args0;}
+{ ac_configure_args1=; unset ac_configure_args1;}
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log.  We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+  # Save into config.log some information that might help in debugging.
+  {
+    echo
+
+    $as_echo "## ---------------- ##
+## Cache variables. ##
+## ---------------- ##"
+    echo
+    # The following way of writing the cache mishandles newlines in values,
+(
+  for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+  (set) 2>&1 |
+    case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      sed -n \
+	"s/'\''/'\''\\\\'\'''\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+      ;; #(
+    *)
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+)
+    echo
+
+    $as_echo "## ----------------- ##
+## Output variables. ##
+## ----------------- ##"
+    echo
+    for ac_var in $ac_subst_vars
+    do
+      eval ac_val=\$$ac_var
+      case $ac_val in
+      *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+      esac
+      $as_echo "$ac_var='\''$ac_val'\''"
+    done | sort
+    echo
+
+    if test -n "$ac_subst_files"; then
+      $as_echo "## ------------------- ##
+## File substitutions. ##
+## ------------------- ##"
+      echo
+      for ac_var in $ac_subst_files
+      do
+	eval ac_val=\$$ac_var
+	case $ac_val in
+	*\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+	esac
+	$as_echo "$ac_var='\''$ac_val'\''"
+      done | sort
+      echo
+    fi
+
+    if test -s confdefs.h; then
+      $as_echo "## ----------- ##
+## confdefs.h. ##
+## ----------- ##"
+      echo
+      cat confdefs.h
+      echo
+    fi
+    test "$ac_signal" != 0 &&
+      $as_echo "$as_me: caught signal $ac_signal"
+    $as_echo "$as_me: exit $exit_status"
+  } >&5
+  rm -f core *.core core.conftest.* &&
+    rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+    exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+  trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+$as_echo "/* confdefs.h */" > confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_URL "$PACKAGE_URL"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+  # We do not want a PATH search for config.site.
+  case $CONFIG_SITE in #((
+    -*)  ac_site_file1=./$CONFIG_SITE;;
+    */*) ac_site_file1=$CONFIG_SITE;;
+    *)   ac_site_file1=./$CONFIG_SITE;;
+  esac
+elif test "x$prefix" != xNONE; then
+  ac_site_file1=$prefix/share/config.site
+  ac_site_file2=$prefix/etc/config.site
+else
+  ac_site_file1=$ac_default_prefix/share/config.site
+  ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+  test "x$ac_site_file" = xNONE && continue
+  if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+    sed 's/^/| /' "$ac_site_file" >&5
+    . "$ac_site_file" \
+      || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "failed to load site script $ac_site_file
+See \`config.log' for more details" "$LINENO" 5; }
+  fi
+done
+
+if test -r "$cache_file"; then
+  # Some versions of bash will fail to source /dev/null (special files
+  # actually), so we avoid doing that.  DJGPP emulates it as a regular file.
+  if test /dev/null != "$cache_file" && test -f "$cache_file"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+    case $cache_file in
+      [\\/]* | ?:[\\/]* ) . "$cache_file";;
+      *)                      . "./$cache_file";;
+    esac
+  fi
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+  >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+  eval ac_old_set=\$ac_cv_env_${ac_var}_set
+  eval ac_new_set=\$ac_env_${ac_var}_set
+  eval ac_old_val=\$ac_cv_env_${ac_var}_value
+  eval ac_new_val=\$ac_env_${ac_var}_value
+  case $ac_old_set,$ac_new_set in
+    set,)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,set)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,);;
+    *)
+      if test "x$ac_old_val" != "x$ac_new_val"; then
+	# differences in whitespace do not lead to failure.
+	ac_old_val_w=`echo x $ac_old_val`
+	ac_new_val_w=`echo x $ac_new_val`
+	if test "$ac_old_val_w" != "$ac_new_val_w"; then
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+	  ac_cache_corrupted=:
+	else
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+	  eval $ac_var=\$ac_old_val
+	fi
+	{ $as_echo "$as_me:${as_lineno-$LINENO}:   former value:  \`$ac_old_val'" >&5
+$as_echo "$as_me:   former value:  \`$ac_old_val'" >&2;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}:   current value: \`$ac_new_val'" >&5
+$as_echo "$as_me:   current value: \`$ac_new_val'" >&2;}
+      fi;;
+  esac
+  # Pass precious variables to config.status.
+  if test "$ac_new_set" = set; then
+    case $ac_new_val in
+    *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *) ac_arg=$ac_var=$ac_new_val ;;
+    esac
+    case " $ac_configure_args " in
+      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
+      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+    esac
+  fi
+done
+if $ac_cache_corrupted; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+  { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+  as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+# fftw-3.1.x was 4:X:1
+# fftw-3.2.x was 5:X:2
+# fftw-3.3.x was 6:X:3
+SHARED_VERSION_INFO="6:2:3" # CURRENT:REVISION:AGE
+
+am__api_version='1.11'
+
+ac_aux_dir=
+for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
+  if test -f "$ac_dir/install-sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install-sh -c"
+    break
+  elif test -f "$ac_dir/install.sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install.sh -c"
+    break
+  elif test -f "$ac_dir/shtool"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/shtool install -c"
+    break
+  fi
+done
+if test -z "$ac_aux_dir"; then
+  as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5
+fi
+
+# These three variables are undocumented and unsupported,
+# and are intended to be withdrawn in a future Autoconf release.
+# They can cause serious problems if a builder's source tree is in a directory
+# whose full name contains unusual characters.
+ac_config_guess="$SHELL $ac_aux_dir/config.guess"  # Please don't use this var.
+ac_config_sub="$SHELL $ac_aux_dir/config.sub"  # Please don't use this var.
+ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
+
+
+# Find a good install program.  We prefer a C program (faster),
+# so one script is as good as another.  But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AmigaOS /C/install, which installs bootblocks on floppy discs
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# OS/2's system install, which has a completely different semantic
+# ./install, which can be erroneously created by make from ./install.sh.
+# Reject install programs that cannot install multiple files.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5
+$as_echo_n "checking for a BSD-compatible install... " >&6; }
+if test -z "$INSTALL"; then
+if ${ac_cv_path_install+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    # Account for people who put trailing slashes in PATH elements.
+case $as_dir/ in #((
+  ./ | .// | /[cC]/* | \
+  /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \
+  ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \
+  /usr/ucb/* ) ;;
+  *)
+    # OSF1 and SCO ODT 3.0 have their own names for install.
+    # Don't use installbsd from OSF since it installs stuff as root
+    # by default.
+    for ac_prog in ginstall scoinst install; do
+      for ac_exec_ext in '' $ac_executable_extensions; do
+	if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
+	  if test $ac_prog = install &&
+	    grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # AIX install.  It has an incompatible calling convention.
+	    :
+	  elif test $ac_prog = install &&
+	    grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # program-specific install script used by HP pwplus--don't use.
+	    :
+	  else
+	    rm -rf conftest.one conftest.two conftest.dir
+	    echo one > conftest.one
+	    echo two > conftest.two
+	    mkdir conftest.dir
+	    if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" &&
+	      test -s conftest.one && test -s conftest.two &&
+	      test -s conftest.dir/conftest.one &&
+	      test -s conftest.dir/conftest.two
+	    then
+	      ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c"
+	      break 3
+	    fi
+	  fi
+	fi
+      done
+    done
+    ;;
+esac
+
+  done
+IFS=$as_save_IFS
+
+rm -rf conftest.one conftest.two conftest.dir
+
+fi
+  if test "${ac_cv_path_install+set}" = set; then
+    INSTALL=$ac_cv_path_install
+  else
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for INSTALL within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    INSTALL=$ac_install_sh
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5
+$as_echo "$INSTALL" >&6; }
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5
+$as_echo_n "checking whether build environment is sane... " >&6; }
+# Just in case
+sleep 1
+echo timestamp > conftest.file
+# Reject unsafe characters in $srcdir or the absolute working directory
+# name.  Accept space and tab only in the latter.
+am_lf='
+'
+case `pwd` in
+  *[\\\"\#\$\&\'\`$am_lf]*)
+    as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
+esac
+case $srcdir in
+  *[\\\"\#\$\&\'\`$am_lf\ \	]*)
+    as_fn_error $? "unsafe srcdir value: \`$srcdir'" "$LINENO" 5;;
+esac
+
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
+   if test "$*" = "X"; then
+      # -L didn't work.
+      set X `ls -t "$srcdir/configure" conftest.file`
+   fi
+   rm -f conftest.file
+   if test "$*" != "X $srcdir/configure conftest.file" \
+      && test "$*" != "X conftest.file $srcdir/configure"; then
+
+      # If neither matched, then we have a broken ls.  This can happen
+      # if, for instance, CONFIG_SHELL is bash and it inherits a
+      # broken ls alias from the environment.  This has actually
+      # happened.  Such a system could not be considered "sane".
+      as_fn_error $? "ls -t appears to fail.  Make sure there is not a broken
+alias in your environment" "$LINENO" 5
+   fi
+
+   test "$2" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   as_fn_error $? "newly created file is older than distributed files!
+Check your system clock" "$LINENO" 5
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+test "$program_prefix" != NONE &&
+  program_transform_name="s&^&$program_prefix&;$program_transform_name"
+# Use a double $ so make ignores it.
+test "$program_suffix" != NONE &&
+  program_transform_name="s&\$&$program_suffix&;$program_transform_name"
+# Double any \ or $.
+# By default was `s,x,x', remove it if useless.
+ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
+program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
+
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
+
+if test x"${MISSING+set}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
+  *)
+    MISSING="\${SHELL} $am_aux_dir/missing" ;;
+  esac
+fi
+# Use eval to expand $SHELL
+if eval "$MISSING --run true"; then
+  am_missing_run="$MISSING --run "
+else
+  am_missing_run=
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`missing' script is too old or missing" >&5
+$as_echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;}
+fi
+
+if test x"${install_sh}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
+  *)
+    install_sh="\${SHELL} $am_aux_dir/install-sh"
+  esac
+fi
+
+# Installed binaries are usually stripped using `strip' when the user
+# run `make install-strip'.  However `strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the `STRIP' environment variable to overrule this program.
+if test "$cross_compiling" != no; then
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+STRIP=$ac_cv_prog_STRIP
+if test -n "$STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5
+$as_echo "$STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_STRIP"; then
+  ac_ct_STRIP=$STRIP
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_STRIP"; then
+  ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_STRIP="strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP
+if test -n "$ac_ct_STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5
+$as_echo "$ac_ct_STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_STRIP" = x; then
+    STRIP=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    STRIP=$ac_ct_STRIP
+  fi
+else
+  STRIP="$ac_cv_prog_STRIP"
+fi
+
+fi
+INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5
+$as_echo_n "checking for a thread-safe mkdir -p... " >&6; }
+if test -z "$MKDIR_P"; then
+  if ${ac_cv_path_mkdir+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in mkdir gmkdir; do
+	 for ac_exec_ext in '' $ac_executable_extensions; do
+	   as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue
+	   case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
+	     'mkdir (GNU coreutils) '* | \
+	     'mkdir (coreutils) '* | \
+	     'mkdir (fileutils) '4.1*)
+	       ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext
+	       break 3;;
+	   esac
+	 done
+       done
+  done
+IFS=$as_save_IFS
+
+fi
+
+  test -d ./--version && rmdir ./--version
+  if test "${ac_cv_path_mkdir+set}" = set; then
+    MKDIR_P="$ac_cv_path_mkdir -p"
+  else
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for MKDIR_P within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    MKDIR_P="$ac_install_sh -d"
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5
+$as_echo "$MKDIR_P" >&6; }
+
+mkdir_p="$MKDIR_P"
+case $mkdir_p in
+  [\\/$]* | ?:[\\/]*) ;;
+  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
+esac
+
+for ac_prog in gawk mawk nawk awk
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AWK+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$AWK"; then
+  ac_cv_prog_AWK="$AWK" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_AWK="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+AWK=$ac_cv_prog_AWK
+if test -n "$AWK"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5
+$as_echo "$AWK" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$AWK" && break
+done
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5
+$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; }
+set x ${MAKE-make}
+ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'`
+if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat >conftest.make <<\_ACEOF
+SHELL = /bin/sh
+all:
+	@echo '@@@%%%=$(MAKE)=@@@%%%'
+_ACEOF
+# GNU make sometimes prints "make[1]: Entering ...", which would confuse us.
+case `${MAKE-make} -f conftest.make 2>/dev/null` in
+  *@@@%%%=?*=@@@%%%*)
+    eval ac_cv_prog_make_${ac_make}_set=yes;;
+  *)
+    eval ac_cv_prog_make_${ac_make}_set=no;;
+esac
+rm -f conftest.make
+fi
+if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+  SET_MAKE=
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+  SET_MAKE="MAKE=${MAKE-make}"
+fi
+
+rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+
+if test "`cd $srcdir && pwd`" != "`pwd`"; then
+  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
+  # is not polluted with repeated "-I."
+  am__isrc=' -I$(srcdir)'
+  # test to see if srcdir already configured
+  if test -f $srcdir/config.status; then
+    as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5
+  fi
+fi
+
+# test whether we have cygpath
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
+  else
+    CYGPATH_W=echo
+  fi
+fi
+
+
+# Define the identity of the package.
+ PACKAGE='fftw'
+ VERSION='3.3.3'
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE "$PACKAGE"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define VERSION "$VERSION"
+_ACEOF
+
+# Some tools Automake needs.
+
+ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"}
+
+
+AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+
+
+AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"}
+
+
+AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+
+
+MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+# Always define AMTAR for backward compatibility.  Yes, it's still used
+# in the wild :-(  We should find a proper way to deprecate it ...
+AMTAR='$${TAR-tar}'
+
+am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
+
+
+
+
+
+ac_config_headers="$ac_config_headers config.h"
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5
+$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; }
+    # Check whether --enable-maintainer-mode was given.
+if test "${enable_maintainer_mode+set}" = set; then :
+  enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval
+else
+  USE_MAINTAINER_MODE=no
+fi
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5
+$as_echo "$USE_MAINTAINER_MODE" >&6; }
+   if test $USE_MAINTAINER_MODE = yes; then
+  MAINTAINER_MODE_TRUE=
+  MAINTAINER_MODE_FALSE='#'
+else
+  MAINTAINER_MODE_TRUE='#'
+  MAINTAINER_MODE_FALSE=
+fi
+
+  MAINT=$MAINTAINER_MODE_TRUE
+
+
+
+# Check whether --enable-shared was given.
+if test "${enable_shared+set}" = set; then :
+  enableval=$enable_shared; p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_shared=yes ;;
+    no) enable_shared=no ;;
+    *)
+      enable_shared=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_shared=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac
+else
+  enable_shared=no
+fi
+
+
+
+
+
+
+
+
+ # Make sure we can run config.sub.
+$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
+  as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5
+$as_echo_n "checking build system type... " >&6; }
+if ${ac_cv_build+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_build_alias=$build_alias
+test "x$ac_build_alias" = x &&
+  ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"`
+test "x$ac_build_alias" = x &&
+  as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5
+ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` ||
+  as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5
+$as_echo "$ac_cv_build" >&6; }
+case $ac_cv_build in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;;
+esac
+build=$ac_cv_build
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_build
+shift
+build_cpu=$1
+build_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+build_os=$*
+IFS=$ac_save_IFS
+case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5
+$as_echo_n "checking host system type... " >&6; }
+if ${ac_cv_host+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test "x$host_alias" = x; then
+  ac_cv_host=$ac_cv_build
+else
+  ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` ||
+    as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5
+$as_echo "$ac_cv_host" >&6; }
+case $ac_cv_host in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;;
+esac
+host=$ac_cv_host
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_host
+shift
+host_cpu=$1
+host_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+host_os=$*
+IFS=$ac_save_IFS
+case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
+
+
+
+case "${host_cpu}" in
+  powerpc*) have_fma=yes;;
+  ia64*) have_fma=yes;;
+  hppa*) have_fma=yes;;
+  mips64*) have_fma=yes;;
+  *) have_fma=no;;
+esac
+
+# Check whether --enable-fma was given.
+if test "${enable_fma+set}" = set; then :
+  enableval=$enable_fma; have_fma=$enableval
+fi
+
+if test "$have_fma"x = "yes"x; then
+
+$as_echo "#define HAVE_FMA 1" >>confdefs.h
+
+fi
+
+
+# Check whether --enable-debug was given.
+if test "${enable_debug+set}" = set; then :
+  enableval=$enable_debug; ok=$enableval
+else
+  ok=no
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_DEBUG 1" >>confdefs.h
+
+	debug_malloc=yes
+else
+	debug_malloc=no
+fi
+
+# Check whether --enable-debug-malloc was given.
+if test "${enable_debug_malloc+set}" = set; then :
+  enableval=$enable_debug_malloc; ok=$enableval
+else
+  ok=$debug_malloc
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_DEBUG_MALLOC 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-debug-alignment was given.
+if test "${enable_debug_alignment+set}" = set; then :
+  enableval=$enable_debug_alignment; ok=$enableval
+else
+  ok=no
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_DEBUG_ALIGNMENT 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-random-estimator was given.
+if test "${enable_random_estimator+set}" = set; then :
+  enableval=$enable_random_estimator; ok=$enableval
+else
+  ok=no
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_RANDOM_ESTIMATOR 1" >>confdefs.h
+
+	CHECK_PL_OPTS="--estimate"
+fi
+
+# Check whether --enable-alloca was given.
+if test "${enable_alloca+set}" = set; then :
+  enableval=$enable_alloca; ok=$enableval
+else
+  ok=yes
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_ENABLE_ALLOCA 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-single was given.
+if test "${enable_single+set}" = set; then :
+  enableval=$enable_single; ok=$enableval
+else
+  ok=no
+fi
+
+# Check whether --enable-float was given.
+if test "${enable_float+set}" = set; then :
+  enableval=$enable_float; ok=$enableval
+fi
+
+if test "$ok" = "yes"; then
+
+$as_echo "#define FFTW_SINGLE 1" >>confdefs.h
+
+
+$as_echo "#define BENCHFFT_SINGLE 1" >>confdefs.h
+
+	PRECISION=s
+else
+	PRECISION=d
+fi
+ if test "$ok" = "yes"; then
+  SINGLE_TRUE=
+  SINGLE_FALSE='#'
+else
+  SINGLE_TRUE='#'
+  SINGLE_FALSE=
+fi
+
+
+# Check whether --enable-long-double was given.
+if test "${enable_long_double+set}" = set; then :
+  enableval=$enable_long_double; ok=$enableval
+else
+  ok=no
+fi
+
+if test "$ok" = "yes"; then
+	if test "$PRECISION" = "s"; then
+		as_fn_error $? "--enable-single/--enable-long-double conflict" "$LINENO" 5
+	fi
+
+$as_echo "#define FFTW_LDOUBLE 1" >>confdefs.h
+
+
+$as_echo "#define BENCHFFT_LDOUBLE 1" >>confdefs.h
+
+	PRECISION=l
+fi
+ if test "$ok" = "yes"; then
+  LDOUBLE_TRUE=
+  LDOUBLE_FALSE='#'
+else
+  LDOUBLE_TRUE='#'
+  LDOUBLE_FALSE=
+fi
+
+
+# Check whether --enable-quad-precision was given.
+if test "${enable_quad_precision+set}" = set; then :
+  enableval=$enable_quad_precision; ok=$enableval
+else
+  ok=no
+fi
+
+if test "$ok" = "yes"; then
+	if test "$PRECISION" != "d"; then
+		as_fn_error $? "conflicting precisions specified" "$LINENO" 5
+	fi
+
+$as_echo "#define FFTW_QUAD 1" >>confdefs.h
+
+
+$as_echo "#define BENCHFFT_QUAD 1" >>confdefs.h
+
+	PRECISION=q
+fi
+ if test "$ok" = "yes"; then
+  QUAD_TRUE=
+  QUAD_FALSE='#'
+else
+  QUAD_TRUE='#'
+  QUAD_FALSE=
+fi
+
+
+
+
+
+# Check whether --enable-sse was given.
+if test "${enable_sse+set}" = set; then :
+  enableval=$enable_sse; have_sse=$enableval
+else
+  have_sse=no
+fi
+
+if test "$have_sse" = "yes"; then
+	if test "$PRECISION" != "s"; then
+		as_fn_error $? "SSE requires single precision" "$LINENO" 5
+	fi
+fi
+
+# Check whether --enable-sse2 was given.
+if test "${enable_sse2+set}" = set; then :
+  enableval=$enable_sse2; have_sse2=$enableval
+else
+  have_sse2=no
+fi
+
+if test "$have_sse" = "yes"; then have_sse2=yes; fi
+if test "$have_sse2" = "yes"; then
+
+$as_echo "#define HAVE_SSE2 1" >>confdefs.h
+
+	if test "$PRECISION" != "d" -a "$PRECISION" != "s"; then
+		as_fn_error $? "SSE2 requires single or double precision" "$LINENO" 5
+	fi
+fi
+ if test "$have_sse2" = "yes"; then
+  HAVE_SSE2_TRUE=
+  HAVE_SSE2_FALSE='#'
+else
+  HAVE_SSE2_TRUE='#'
+  HAVE_SSE2_FALSE=
+fi
+
+
+# Check whether --enable-avx was given.
+if test "${enable_avx+set}" = set; then :
+  enableval=$enable_avx; have_avx=$enableval
+else
+  have_avx=no
+fi
+
+if test "$have_avx" = "yes"; then
+
+$as_echo "#define HAVE_AVX 1" >>confdefs.h
+
+	if test "$PRECISION" != "d" -a "$PRECISION" != "s"; then
+		as_fn_error $? "AVX requires single or double precision" "$LINENO" 5
+	fi
+fi
+ if test "$have_avx" = "yes"; then
+  HAVE_AVX_TRUE=
+  HAVE_AVX_FALSE='#'
+else
+  HAVE_AVX_TRUE='#'
+  HAVE_AVX_FALSE=
+fi
+
+
+# Check whether --enable-altivec was given.
+if test "${enable_altivec+set}" = set; then :
+  enableval=$enable_altivec; have_altivec=$enableval
+else
+  have_altivec=no
+fi
+
+if test "$have_altivec" = "yes"; then
+
+$as_echo "#define HAVE_ALTIVEC 1" >>confdefs.h
+
+	if test "$PRECISION" != "s"; then
+		as_fn_error $? "Altivec requires single precision" "$LINENO" 5
+	fi
+fi
+ if test "$have_altivec" = "yes"; then
+  HAVE_ALTIVEC_TRUE=
+  HAVE_ALTIVEC_FALSE='#'
+else
+  HAVE_ALTIVEC_TRUE='#'
+  HAVE_ALTIVEC_FALSE=
+fi
+
+
+# Check whether --enable-neon was given.
+if test "${enable_neon+set}" = set; then :
+  enableval=$enable_neon; have_neon=$enableval
+else
+  have_neon=no
+fi
+
+if test "$have_neon" = "yes"; then
+
+$as_echo "#define HAVE_NEON 1" >>confdefs.h
+
+	if test "$PRECISION" != "s"; then
+		as_fn_error $? "NEON requires single precision" "$LINENO" 5
+	fi
+fi
+ if test "$have_neon" = "yes"; then
+  HAVE_NEON_TRUE=
+  HAVE_NEON_FALSE='#'
+else
+  HAVE_NEON_TRUE='#'
+  HAVE_NEON_FALSE=
+fi
+
+
+
+
+# Check whether --with-slow-timer was given.
+if test "${with_slow_timer+set}" = set; then :
+  withval=$with_slow_timer; with_slow_timer=$withval
+else
+  with_slow_timer=no
+fi
+
+if test "$with_slow_timer" = "yes"; then
+
+$as_echo "#define WITH_SLOW_TIMER 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-mips_zbus_timer was given.
+if test "${enable_mips_zbus_timer+set}" = set; then :
+  enableval=$enable_mips_zbus_timer; have_mips_zbus_timer=$enableval
+else
+  have_mips_zbus_timer=no
+fi
+
+if test "$have_mips_zbus_timer" = "yes"; then
+
+$as_echo "#define HAVE_MIPS_ZBUS_TIMER 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-our-malloc was given.
+if test "${with_our_malloc+set}" = set; then :
+  withval=$with_our_malloc; with_our_malloc=$withval
+else
+  with_our_malloc=no
+fi
+
+
+# Check whether --with-our-malloc16 was given.
+if test "${with_our_malloc16+set}" = set; then :
+  withval=$with_our_malloc16; with_our_malloc=$withval
+fi
+
+if test "$with_our_malloc" = "yes"; then
+
+$as_echo "#define WITH_OUR_MALLOC 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-windows-f77-mangling was given.
+if test "${with_windows_f77_mangling+set}" = set; then :
+  withval=$with_windows_f77_mangling; with_windows_f77_mangling=$withval
+else
+  with_windows_f77_mangling=no
+fi
+
+if test "$with_windows_f77_mangling" = "yes"; then
+
+$as_echo "#define WINDOWS_F77_MANGLING 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-incoming-stack-boundary was given.
+if test "${with_incoming_stack_boundary+set}" = set; then :
+  withval=$with_incoming_stack_boundary; with_incoming_stack_boundary=$withval
+else
+  with_incoming_stack_boundary=no
+fi
+
+
+case "$PRECISION" in
+     s) PREC_SUFFIX=f;;
+     d) PREC_SUFFIX=;;
+     l) PREC_SUFFIX=l;;
+     q) PREC_SUFFIX=q;;
+esac
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}gcc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+  ac_ct_CC=$CC
+  # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="gcc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+else
+  CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+          if test -n "$ac_tool_prefix"; then
+    # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}cc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  fi
+fi
+if test -z "$CC"; then
+  # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+  ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+       ac_prog_rejected=yes
+       continue
+     fi
+    ac_cv_prog_CC="cc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+  # We found a bogon in the path, so make sure we never use it.
+  set dummy $ac_cv_prog_CC
+  shift
+  if test $# != 0; then
+    # We chose a different compiler from the bogus one.
+    # However, it has the same basename, so the bogon will be chosen
+    # first if we set CC to just the basename; use the full file name.
+    shift
+    ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+  fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+  if test -n "$ac_tool_prefix"; then
+  for ac_prog in cl.exe
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$CC" && break
+  done
+fi
+if test -z "$CC"; then
+  ac_ct_CC=$CC
+  for ac_prog in cl.exe
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CC" && break
+done
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no acceptable C compiler found in \$PATH
+See \`config.log' for more details" "$LINENO" 5; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and finding out an intuition
+# of exeext.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
+$as_echo_n "checking whether the C compiler works... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+  esac
+done
+rm -f $ac_rmfiles
+
+if { { ac_try="$ac_link_default"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link_default") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile.  We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+	;;
+    [ab].out )
+	# We found the default executable, but exeext='' is most
+	# certainly right.
+	break;;
+    *.* )
+	if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+	then :; else
+	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	fi
+	# We set ac_cv_exeext here because the later test for it is not
+	# safe: cross compilers may not add the suffix if given an `-o'
+	# argument, so we may need to know it at that point already.
+	# Even if this section looks crufty: it has the advantage of
+	# actually working.
+	break;;
+    * )
+	break;;
+  esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+  ac_file=''
+fi
+if test -z "$ac_file"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "C compiler cannot create executables
+See \`config.log' for more details" "$LINENO" 5; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
+$as_echo_n "checking for C compiler default output file name... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+ac_exeext=$ac_cv_exeext
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'.  For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	  break;;
+    * ) break;;
+  esac
+done
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest conftest$ac_cv_exeext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdio.h>
+int
+main ()
+{
+FILE *f = fopen ("conftest.out", "w");
+ return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files="$ac_clean_files conftest.out"
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+if test "$cross_compiling" != yes; then
+  { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+  if { ac_try='./conftest$ac_cv_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+    cross_compiling=no
+  else
+    if test "$cross_compiling" = maybe; then
+	cross_compiling=yes
+    else
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details" "$LINENO" 5; }
+    fi
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if ${ac_cv_objext+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  for ac_file in conftest.o conftest.obj conftest.*; do
+  test -f "$ac_file" || continue;
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+    *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+       break;;
+  esac
+done
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of object files: cannot compile
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if ${ac_cv_c_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+  GCC=yes
+else
+  GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if ${ac_cv_prog_cc_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_c_werror_flag=$ac_c_werror_flag
+   ac_c_werror_flag=yes
+   ac_cv_prog_cc_g=no
+   CFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+else
+  CFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+  ac_c_werror_flag=$ac_save_c_werror_flag
+	 CFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+  CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+  if test "$GCC" = yes; then
+    CFLAGS="-g -O2"
+  else
+    CFLAGS="-g"
+  fi
+else
+  if test "$GCC" = yes; then
+    CFLAGS="-O2"
+  else
+    CFLAGS=
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdio.h>
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+     char **p;
+     int i;
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not '\xHH' hex character constants.
+   These don't provoke an error unfortunately, instead are silently treated
+   as 'x'.  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
+   array size at least.  It's necessary to write '\x00'==0 to get something
+   that's true only with -std.  */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+  x)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+  xno)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c89"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
+
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+DEPDIR="${am__leading_dot}deps"
+
+ac_config_commands="$ac_config_commands depfiles"
+
+
+am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo this is the am__doit target
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5
+$as_echo_n "checking for style of include used by $am_make... " >&6; }
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# Ignore all kinds of additional output from `make'.
+case `$am_make -s -f confmf 2> /dev/null` in #(
+*the\ am__doit\ target*)
+  am__include=include
+  am__quote=
+  _am_result=GNU
+  ;;
+esac
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   case `$am_make -s -f confmf 2> /dev/null` in #(
+   *the\ am__doit\ target*)
+     am__include=.include
+     am__quote="\""
+     _am_result=BSD
+     ;;
+   esac
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5
+$as_echo "$_am_result" >&6; }
+rm -f confinc confmf
+
+# Check whether --enable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then :
+  enableval=$enable_dependency_tracking;
+fi
+
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+  am__nodep='_no'
+fi
+ if test "x$enable_dependency_tracking" != xno; then
+  AMDEP_TRUE=
+  AMDEP_FALSE='#'
+else
+  AMDEP_TRUE='#'
+  AMDEP_FALSE=
+fi
+
+
+
+depcc="$CC"   am_compiler_list=
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CC_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CC_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
+  case " $depcc " in #(
+     *\ -arch\ *\ -arch\ *) am__universal=true ;;
+     esac
+
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok `-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CC_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CC_dependencies_compiler_type=none
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; }
+CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then
+  am__fastdepCC_TRUE=
+  am__fastdepCC_FALSE='#'
+else
+  am__fastdepCC_TRUE='#'
+  am__fastdepCC_FALSE=
+fi
+
+
+if test "x$CC" != xcc; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC and cc understand -c and -o together" >&5
+$as_echo_n "checking whether $CC and cc understand -c and -o together... " >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether cc understands -c and -o together" >&5
+$as_echo_n "checking whether cc understands -c and -o together... " >&6; }
+fi
+set dummy $CC; ac_cc=`$as_echo "$2" |
+		      sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'`
+if eval \${ac_cv_prog_cc_${ac_cc}_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+# Make sure it works both with $CC and with simple cc.
+# We do the test twice because some compilers refuse to overwrite an
+# existing .o file with -o, though they will create one.
+ac_try='$CC -c conftest.$ac_ext -o conftest2.$ac_objext >&5'
+rm -f conftest2.*
+if { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } &&
+   test -f conftest2.$ac_objext && { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; };
+then
+  eval ac_cv_prog_cc_${ac_cc}_c_o=yes
+  if test "x$CC" != xcc; then
+    # Test first that cc exists at all.
+    if { ac_try='cc -c conftest.$ac_ext >&5'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+      ac_try='cc -c conftest.$ac_ext -o conftest2.$ac_objext >&5'
+      rm -f conftest2.*
+      if { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } &&
+	 test -f conftest2.$ac_objext && { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; };
+      then
+	# cc works too.
+	:
+      else
+	# cc exists but doesn't like -o.
+	eval ac_cv_prog_cc_${ac_cc}_c_o=no
+      fi
+    fi
+  fi
+else
+  eval ac_cv_prog_cc_${ac_cc}_c_o=no
+fi
+rm -f core conftest*
+
+fi
+if eval test \$ac_cv_prog_cc_${ac_cc}_c_o = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+$as_echo "#define NO_MINUS_C_MINUS_O 1" >>confdefs.h
+
+fi
+
+# FIXME: we rely on the cache variable name because
+# there is no other way.
+set dummy $CC
+am_cc=`echo $2 | sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'`
+eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o
+if test "$am_t" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler vendor" >&5
+$as_echo_n "checking for C compiler vendor... " >&6; }
+if ${ax_cv_c_compiler_vendor+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ax_cv_c_compiler_vendor=unknown
+  # note: don't check for gcc first since some other compilers define __GNUC__
+  for ventest in intel:__ICC,__ECC,__INTEL_COMPILER ibm:__xlc__,__xlC__,__IBMC__,__IBMCPP__ pathscale:__PATHCC__,__PATHSCALE__ gnu:__GNUC__ sun:__SUNPRO_C,__SUNPRO_CC hp:__HP_cc,__HP_aCC dec:__DECC,__DECCXX,__DECC_VER,__DECCXX_VER borland:__BORLANDC__,__TURBOC__ comeau:__COMO__ cray:_CRAYC kai:__KCC lcc:__LCC__ metrowerks:__MWERKS__ sgi:__sgi,sgi microsoft:_MSC_VER watcom:__WATCOMC__ portland:__PGI; do
+    vencpp="defined("`echo $ventest | cut -d: -f2 | sed 's/,/) || defined(/g'`")"
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+#if !($vencpp)
+      thisisanerror;
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_compiler_vendor=`echo $ventest | cut -d: -f1`; break
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_c_compiler_vendor" >&5
+$as_echo "$ax_cv_c_compiler_vendor" >&6; }
+
+   case $ac_cv_prog_cc_stdc in #(
+  no) :
+    ac_cv_prog_cc_c99=no; ac_cv_prog_cc_c89=no ;; #(
+  *) :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5
+$as_echo_n "checking for $CC option to accept ISO C99... " >&6; }
+if ${ac_cv_prog_cc_c99+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_prog_cc_c99=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <stdio.h>
+
+// Check varargs macros.  These examples are taken from C99 6.10.3.5.
+#define debug(...) fprintf (stderr, __VA_ARGS__)
+#define showlist(...) puts (#__VA_ARGS__)
+#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__))
+static void
+test_varargs_macros (void)
+{
+  int x = 1234;
+  int y = 5678;
+  debug ("Flag");
+  debug ("X = %d\n", x);
+  showlist (The first, second, and third items.);
+  report (x>y, "x is %d but y is %d", x, y);
+}
+
+// Check long long types.
+#define BIG64 18446744073709551615ull
+#define BIG32 4294967295ul
+#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0)
+#if !BIG_OK
+  your preprocessor is broken;
+#endif
+#if BIG_OK
+#else
+  your preprocessor is broken;
+#endif
+static long long int bignum = -9223372036854775807LL;
+static unsigned long long int ubignum = BIG64;
+
+struct incomplete_array
+{
+  int datasize;
+  double data[];
+};
+
+struct named_init {
+  int number;
+  const wchar_t *name;
+  double average;
+};
+
+typedef const char *ccp;
+
+static inline int
+test_restrict (ccp restrict text)
+{
+  // See if C++-style comments work.
+  // Iterate through items via the restricted pointer.
+  // Also check for declarations in for loops.
+  for (unsigned int i = 0; *(text+i) != '\0'; ++i)
+    continue;
+  return 0;
+}
+
+// Check varargs and va_copy.
+static void
+test_varargs (const char *format, ...)
+{
+  va_list args;
+  va_start (args, format);
+  va_list args_copy;
+  va_copy (args_copy, args);
+
+  const char *str;
+  int number;
+  float fnumber;
+
+  while (*format)
+    {
+      switch (*format++)
+	{
+	case 's': // string
+	  str = va_arg (args_copy, const char *);
+	  break;
+	case 'd': // int
+	  number = va_arg (args_copy, int);
+	  break;
+	case 'f': // float
+	  fnumber = va_arg (args_copy, double);
+	  break;
+	default:
+	  break;
+	}
+    }
+  va_end (args_copy);
+  va_end (args);
+}
+
+int
+main ()
+{
+
+  // Check bool.
+  _Bool success = false;
+
+  // Check restrict.
+  if (test_restrict ("String literal") == 0)
+    success = true;
+  char *restrict newvar = "Another string";
+
+  // Check varargs.
+  test_varargs ("s, d' f .", "string", 65, 34.234);
+  test_varargs_macros ();
+
+  // Check flexible array members.
+  struct incomplete_array *ia =
+    malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10));
+  ia->datasize = 10;
+  for (int i = 0; i < ia->datasize; ++i)
+    ia->data[i] = i * 1.234;
+
+  // Check named initializers.
+  struct named_init ni = {
+    .number = 34,
+    .name = L"Test wide string",
+    .average = 543.34343,
+  };
+
+  ni.number = 58;
+
+  int dynamic_array[ni.number];
+  dynamic_array[ni.number - 1] = 543;
+
+  // work around unused variable warnings
+  return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x'
+	  || dynamic_array[ni.number - 1] != 543);
+
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_c99=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c99" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c99" in
+  x)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+  xno)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c99"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5
+$as_echo "$ac_cv_prog_cc_c99" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c99" != xno; then :
+  ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c99
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdio.h>
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+     char **p;
+     int i;
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not '\xHH' hex character constants.
+   These don't provoke an error unfortunately, instead are silently treated
+   as 'x'.  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
+   array size at least.  It's necessary to write '\x00'==0 to get something
+   that's true only with -std.  */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+  x)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+  xno)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c89"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
+  ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c89
+else
+  ac_cv_prog_cc_stdc=no
+fi
+
+fi
+ ;;
+esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO Standard C" >&5
+$as_echo_n "checking for $CC option to accept ISO Standard C... " >&6; }
+  if ${ac_cv_prog_cc_stdc+:} false; then :
+  $as_echo_n "(cached) " >&6
+fi
+
+  case $ac_cv_prog_cc_stdc in #(
+  no) :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;; #(
+  '') :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;; #(
+  *) :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_stdc" >&5
+$as_echo "$ac_cv_prog_cc_stdc" >&6; } ;;
+esac
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ln -s works" >&5
+$as_echo_n "checking whether ln -s works... " >&6; }
+LN_S=$as_ln_s
+if test "$LN_S" = "ln -s"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no, using $LN_S" >&5
+$as_echo "no, using $LN_S" >&6; }
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5
+$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; }
+set x ${MAKE-make}
+ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'`
+if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat >conftest.make <<\_ACEOF
+SHELL = /bin/sh
+all:
+	@echo '@@@%%%=$(MAKE)=@@@%%%'
+_ACEOF
+# GNU make sometimes prints "make[1]: Entering ...", which would confuse us.
+case `${MAKE-make} -f conftest.make 2>/dev/null` in
+  *@@@%%%=?*=@@@%%%*)
+    eval ac_cv_prog_make_${ac_make}_set=yes;;
+  *)
+    eval ac_cv_prog_make_${ac_make}_set=no;;
+esac
+rm -f conftest.make
+fi
+if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+  SET_MAKE=
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+  SET_MAKE="MAKE=${MAKE-make}"
+fi
+
+enable_win32_dll=yes
+
+case $host in
+*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*)
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}as", so it can be a program name with args.
+set dummy ${ac_tool_prefix}as; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AS+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$AS"; then
+  ac_cv_prog_AS="$AS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_AS="${ac_tool_prefix}as"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+AS=$ac_cv_prog_AS
+if test -n "$AS"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AS" >&5
+$as_echo "$AS" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_AS"; then
+  ac_ct_AS=$AS
+  # Extract the first word of "as", so it can be a program name with args.
+set dummy as; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_AS+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_AS"; then
+  ac_cv_prog_ac_ct_AS="$ac_ct_AS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_AS="as"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_AS=$ac_cv_prog_ac_ct_AS
+if test -n "$ac_ct_AS"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AS" >&5
+$as_echo "$ac_ct_AS" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_AS" = x; then
+    AS="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    AS=$ac_ct_AS
+  fi
+else
+  AS="$ac_cv_prog_AS"
+fi
+
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}dlltool", so it can be a program name with args.
+set dummy ${ac_tool_prefix}dlltool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_DLLTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$DLLTOOL"; then
+  ac_cv_prog_DLLTOOL="$DLLTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_DLLTOOL="${ac_tool_prefix}dlltool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+DLLTOOL=$ac_cv_prog_DLLTOOL
+if test -n "$DLLTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DLLTOOL" >&5
+$as_echo "$DLLTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_DLLTOOL"; then
+  ac_ct_DLLTOOL=$DLLTOOL
+  # Extract the first word of "dlltool", so it can be a program name with args.
+set dummy dlltool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_DLLTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_DLLTOOL"; then
+  ac_cv_prog_ac_ct_DLLTOOL="$ac_ct_DLLTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_DLLTOOL="dlltool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_DLLTOOL=$ac_cv_prog_ac_ct_DLLTOOL
+if test -n "$ac_ct_DLLTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_DLLTOOL" >&5
+$as_echo "$ac_ct_DLLTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_DLLTOOL" = x; then
+    DLLTOOL="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    DLLTOOL=$ac_ct_DLLTOOL
+  fi
+else
+  DLLTOOL="$ac_cv_prog_DLLTOOL"
+fi
+
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}objdump", so it can be a program name with args.
+set dummy ${ac_tool_prefix}objdump; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_OBJDUMP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$OBJDUMP"; then
+  ac_cv_prog_OBJDUMP="$OBJDUMP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_OBJDUMP="${ac_tool_prefix}objdump"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+OBJDUMP=$ac_cv_prog_OBJDUMP
+if test -n "$OBJDUMP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OBJDUMP" >&5
+$as_echo "$OBJDUMP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_OBJDUMP"; then
+  ac_ct_OBJDUMP=$OBJDUMP
+  # Extract the first word of "objdump", so it can be a program name with args.
+set dummy objdump; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_OBJDUMP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_OBJDUMP"; then
+  ac_cv_prog_ac_ct_OBJDUMP="$ac_ct_OBJDUMP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_OBJDUMP="objdump"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_OBJDUMP=$ac_cv_prog_ac_ct_OBJDUMP
+if test -n "$ac_ct_OBJDUMP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OBJDUMP" >&5
+$as_echo "$ac_ct_OBJDUMP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_OBJDUMP" = x; then
+    OBJDUMP="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    OBJDUMP=$ac_ct_OBJDUMP
+  fi
+else
+  OBJDUMP="$ac_cv_prog_OBJDUMP"
+fi
+
+  ;;
+esac
+
+test -z "$AS" && AS=as
+
+
+
+
+
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+
+
+
+
+
+test -z "$OBJDUMP" && OBJDUMP=objdump
+
+
+
+
+
+
+
+case `pwd` in
+  *\ * | *\	*)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&5
+$as_echo "$as_me: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&2;} ;;
+esac
+
+
+
+macro_version='2.4.2'
+macro_revision='1.3337'
+
+
+
+
+
+
+
+
+
+
+
+
+
+ltmain="$ac_aux_dir/ltmain.sh"
+
+# Backslashify metacharacters that are still active within
+# double-quoted strings.
+sed_quote_subst='s/\(["`$\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\(["`\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# Sed substitution to delay expansion of an escaped single quote.
+delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g'
+
+# Sed substitution to avoid accidental globbing in evaled expressions
+no_glob_subst='s/\*/\\\*/g'
+
+ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
+ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to print strings" >&5
+$as_echo_n "checking how to print strings... " >&6; }
+# Test print first, because it will be a builtin if present.
+if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \
+   test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then
+  ECHO='print -r --'
+elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then
+  ECHO='printf %s\n'
+else
+  # Use this function as a fallback that always works.
+  func_fallback_echo ()
+  {
+    eval 'cat <<_LTECHO_EOF
+$1
+_LTECHO_EOF'
+  }
+  ECHO='func_fallback_echo'
+fi
+
+# func_echo_all arg...
+# Invoke $ECHO with all args, space-separated.
+func_echo_all ()
+{
+    $ECHO ""
+}
+
+case "$ECHO" in
+  printf*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: printf" >&5
+$as_echo "printf" >&6; } ;;
+  print*) { $as_echo "$as_me:${as_lineno-$LINENO}: result: print -r" >&5
+$as_echo "print -r" >&6; } ;;
+  *) { $as_echo "$as_me:${as_lineno-$LINENO}: result: cat" >&5
+$as_echo "cat" >&6; } ;;
+esac
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a sed that does not truncate output" >&5
+$as_echo_n "checking for a sed that does not truncate output... " >&6; }
+if ${ac_cv_path_SED+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+            ac_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/
+     for ac_i in 1 2 3 4 5 6 7; do
+       ac_script="$ac_script$as_nl$ac_script"
+     done
+     echo "$ac_script" 2>/dev/null | sed 99q >conftest.sed
+     { ac_script=; unset ac_script;}
+     if test -z "$SED"; then
+  ac_path_SED_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in sed gsed; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_SED="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_SED" || continue
+# Check for GNU ac_path_SED and select it if it is found.
+  # Check for GNU $ac_path_SED
+case `"$ac_path_SED" --version 2>&1` in
+*GNU*)
+  ac_cv_path_SED="$ac_path_SED" ac_path_SED_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo '' >> "conftest.nl"
+    "$ac_path_SED" -f conftest.sed < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_SED_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_SED="$ac_path_SED"
+      ac_path_SED_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_SED_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_SED"; then
+    as_fn_error $? "no acceptable sed could be found in \$PATH" "$LINENO" 5
+  fi
+else
+  ac_cv_path_SED=$SED
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_SED" >&5
+$as_echo "$ac_cv_path_SED" >&6; }
+ SED="$ac_cv_path_SED"
+  rm -f conftest.sed
+
+test -z "$SED" && SED=sed
+Xsed="$SED -e 1s/^X//"
+
+
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
+if ${ac_cv_path_GREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$GREP"; then
+  ac_path_GREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in grep ggrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_GREP" || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_GREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_GREP"; then
+    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+$as_echo "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+$as_echo_n "checking for egrep... " >&6; }
+if ${ac_cv_path_EGREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     if test -z "$EGREP"; then
+  ac_path_EGREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in egrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_EGREP" || continue
+# Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_EGREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_EGREP"; then
+    as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+   fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+$as_echo "$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for fgrep" >&5
+$as_echo_n "checking for fgrep... " >&6; }
+if ${ac_cv_path_FGREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if echo 'ab*c' | $GREP -F 'ab*c' >/dev/null 2>&1
+   then ac_cv_path_FGREP="$GREP -F"
+   else
+     if test -z "$FGREP"; then
+  ac_path_FGREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in fgrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_FGREP="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_FGREP" || continue
+# Check for GNU ac_path_FGREP and select it if it is found.
+  # Check for GNU $ac_path_FGREP
+case `"$ac_path_FGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_FGREP="$ac_path_FGREP" ac_path_FGREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'FGREP' >> "conftest.nl"
+    "$ac_path_FGREP" FGREP < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_FGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_FGREP="$ac_path_FGREP"
+      ac_path_FGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_FGREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_FGREP"; then
+    as_fn_error $? "no acceptable fgrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_FGREP=$FGREP
+fi
+
+   fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_FGREP" >&5
+$as_echo "$ac_cv_path_FGREP" >&6; }
+ FGREP="$ac_cv_path_FGREP"
+
+
+test -z "$GREP" && GREP=grep
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Check whether --with-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then :
+  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
+else
+  with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ld used by $CC" >&5
+$as_echo_n "checking for ld used by $CC... " >&6; }
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [\\/]* | ?:[\\/]*)
+      re_direlt='/[^/][^/]*/\.\./'
+      # Canonicalize the pathname of ld
+      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
+      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
+	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU ld" >&5
+$as_echo_n "checking for GNU ld... " >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for non-GNU ld" >&5
+$as_echo_n "checking for non-GNU ld... " >&6; }
+fi
+if ${lt_cv_path_LD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$LD"; then
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some variants of GNU ld only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
+      *GNU* | *'with BFD'*)
+	test "$with_gnu_ld" != no && break
+	;;
+      *)
+	test "$with_gnu_ld" != yes && break
+	;;
+      esac
+    fi
+  done
+  IFS="$lt_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LD" >&5
+$as_echo "$LD" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+test -z "$LD" && as_fn_error $? "no acceptable ld found in \$PATH" "$LINENO" 5
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if the linker ($LD) is GNU ld" >&5
+$as_echo_n "checking if the linker ($LD) is GNU ld... " >&6; }
+if ${lt_cv_prog_gnu_ld+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  # I'd rather use --version here, but apparently some GNU lds only accept -v.
+case `$LD -v 2>&1 </dev/null` in
+*GNU* | *'with BFD'*)
+  lt_cv_prog_gnu_ld=yes
+  ;;
+*)
+  lt_cv_prog_gnu_ld=no
+  ;;
+esac
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_gnu_ld" >&5
+$as_echo "$lt_cv_prog_gnu_ld" >&6; }
+with_gnu_ld=$lt_cv_prog_gnu_ld
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for BSD- or MS-compatible name lister (nm)" >&5
+$as_echo_n "checking for BSD- or MS-compatible name lister (nm)... " >&6; }
+if ${lt_cv_path_NM+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  lt_nm_to_check="${ac_tool_prefix}nm"
+  if test -n "$ac_tool_prefix" && test "$build" = "$host"; then
+    lt_nm_to_check="$lt_nm_to_check nm"
+  fi
+  for lt_tmp_nm in $lt_nm_to_check; do
+    lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+    for ac_dir in $PATH /usr/ccs/bin/elf /usr/ccs/bin /usr/ucb /bin; do
+      IFS="$lt_save_ifs"
+      test -z "$ac_dir" && ac_dir=.
+      tmp_nm="$ac_dir/$lt_tmp_nm"
+      if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext" ; then
+	# Check to see if the nm accepts a BSD-compat flag.
+	# Adding the `sed 1q' prevents false positives on HP-UX, which says:
+	#   nm: unknown option "B" ignored
+	# Tru64's nm complains that /dev/null is an invalid object file
+	case `"$tmp_nm" -B /dev/null 2>&1 | sed '1q'` in
+	*/dev/null* | *'Invalid file or object type'*)
+	  lt_cv_path_NM="$tmp_nm -B"
+	  break
+	  ;;
+	*)
+	  case `"$tmp_nm" -p /dev/null 2>&1 | sed '1q'` in
+	  */dev/null*)
+	    lt_cv_path_NM="$tmp_nm -p"
+	    break
+	    ;;
+	  *)
+	    lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+	    continue # so that we can try to find one that supports BSD flags
+	    ;;
+	  esac
+	  ;;
+	esac
+      fi
+    done
+    IFS="$lt_save_ifs"
+  done
+  : ${lt_cv_path_NM=no}
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_path_NM" >&5
+$as_echo "$lt_cv_path_NM" >&6; }
+if test "$lt_cv_path_NM" != "no"; then
+  NM="$lt_cv_path_NM"
+else
+  # Didn't find any BSD compatible name lister, look for dumpbin.
+  if test -n "$DUMPBIN"; then :
+    # Let the user override the test.
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in dumpbin "link -dump"
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_DUMPBIN+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$DUMPBIN"; then
+  ac_cv_prog_DUMPBIN="$DUMPBIN" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_DUMPBIN="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+DUMPBIN=$ac_cv_prog_DUMPBIN
+if test -n "$DUMPBIN"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DUMPBIN" >&5
+$as_echo "$DUMPBIN" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$DUMPBIN" && break
+  done
+fi
+if test -z "$DUMPBIN"; then
+  ac_ct_DUMPBIN=$DUMPBIN
+  for ac_prog in dumpbin "link -dump"
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_DUMPBIN+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_DUMPBIN"; then
+  ac_cv_prog_ac_ct_DUMPBIN="$ac_ct_DUMPBIN" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_DUMPBIN="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_DUMPBIN=$ac_cv_prog_ac_ct_DUMPBIN
+if test -n "$ac_ct_DUMPBIN"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_DUMPBIN" >&5
+$as_echo "$ac_ct_DUMPBIN" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_DUMPBIN" && break
+done
+
+  if test "x$ac_ct_DUMPBIN" = x; then
+    DUMPBIN=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    DUMPBIN=$ac_ct_DUMPBIN
+  fi
+fi
+
+    case `$DUMPBIN -symbols /dev/null 2>&1 | sed '1q'` in
+    *COFF*)
+      DUMPBIN="$DUMPBIN -symbols"
+      ;;
+    *)
+      DUMPBIN=:
+      ;;
+    esac
+  fi
+
+  if test "$DUMPBIN" != ":"; then
+    NM="$DUMPBIN"
+  fi
+fi
+test -z "$NM" && NM=nm
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the name lister ($NM) interface" >&5
+$as_echo_n "checking the name lister ($NM) interface... " >&6; }
+if ${lt_cv_nm_interface+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_nm_interface="BSD nm"
+  echo "int some_variable = 0;" > conftest.$ac_ext
+  (eval echo "\"\$as_me:$LINENO: $ac_compile\"" >&5)
+  (eval "$ac_compile" 2>conftest.err)
+  cat conftest.err >&5
+  (eval echo "\"\$as_me:$LINENO: $NM \\\"conftest.$ac_objext\\\"\"" >&5)
+  (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out)
+  cat conftest.err >&5
+  (eval echo "\"\$as_me:$LINENO: output\"" >&5)
+  cat conftest.out >&5
+  if $GREP 'External.*some_variable' conftest.out > /dev/null; then
+    lt_cv_nm_interface="MS dumpbin"
+  fi
+  rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_nm_interface" >&5
+$as_echo "$lt_cv_nm_interface" >&6; }
+
+# find the maximum length of command line arguments
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the maximum length of command line arguments" >&5
+$as_echo_n "checking the maximum length of command line arguments... " >&6; }
+if ${lt_cv_sys_max_cmd_len+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+    i=0
+  teststring="ABCD"
+
+  case $build_os in
+  msdosdjgpp*)
+    # On DJGPP, this test can blow up pretty badly due to problems in libc
+    # (any single argument exceeding 2000 bytes causes a buffer overrun
+    # during glob expansion).  Even if it were fixed, the result of this
+    # check would be larger than it should be.
+    lt_cv_sys_max_cmd_len=12288;    # 12K is about right
+    ;;
+
+  gnu*)
+    # Under GNU Hurd, this test is not required because there is
+    # no limit to the length of command line arguments.
+    # Libtool will interpret -1 as no limit whatsoever
+    lt_cv_sys_max_cmd_len=-1;
+    ;;
+
+  cygwin* | mingw* | cegcc*)
+    # On Win9x/ME, this test blows up -- it succeeds, but takes
+    # about 5 minutes as the teststring grows exponentially.
+    # Worse, since 9x/ME are not pre-emptively multitasking,
+    # you end up with a "frozen" computer, even though with patience
+    # the test eventually succeeds (with a max line length of 256k).
+    # Instead, let's just punt: use the minimum linelength reported by
+    # all of the supported platforms: 8192 (on NT/2K/XP).
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  mint*)
+    # On MiNT this can take a long time and run out of memory.
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  amigaos*)
+    # On AmigaOS with pdksh, this test takes hours, literally.
+    # So we just punt and use a minimum line length of 8192.
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  netbsd* | freebsd* | openbsd* | darwin* | dragonfly*)
+    # This has been around since 386BSD, at least.  Likely further.
+    if test -x /sbin/sysctl; then
+      lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
+    elif test -x /usr/sbin/sysctl; then
+      lt_cv_sys_max_cmd_len=`/usr/sbin/sysctl -n kern.argmax`
+    else
+      lt_cv_sys_max_cmd_len=65536	# usable default for all BSDs
+    fi
+    # And add a safety zone
+    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
+    ;;
+
+  interix*)
+    # We know the value 262144 and hardcode it with a safety zone (like BSD)
+    lt_cv_sys_max_cmd_len=196608
+    ;;
+
+  os2*)
+    # The test takes a long time on OS/2.
+    lt_cv_sys_max_cmd_len=8192
+    ;;
+
+  osf*)
+    # Dr. Hans Ekkehard Plesser reports seeing a kernel panic running configure
+    # due to this test when exec_disable_arg_limit is 1 on Tru64. It is not
+    # nice to cause kernel panics so lets avoid the loop below.
+    # First set a reasonable default.
+    lt_cv_sys_max_cmd_len=16384
+    #
+    if test -x /sbin/sysconfig; then
+      case `/sbin/sysconfig -q proc exec_disable_arg_limit` in
+        *1*) lt_cv_sys_max_cmd_len=-1 ;;
+      esac
+    fi
+    ;;
+  sco3.2v5*)
+    lt_cv_sys_max_cmd_len=102400
+    ;;
+  sysv5* | sco5v6* | sysv4.2uw2*)
+    kargmax=`grep ARG_MAX /etc/conf/cf.d/stune 2>/dev/null`
+    if test -n "$kargmax"; then
+      lt_cv_sys_max_cmd_len=`echo $kargmax | sed 's/.*[	 ]//'`
+    else
+      lt_cv_sys_max_cmd_len=32768
+    fi
+    ;;
+  *)
+    lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
+    if test -n "$lt_cv_sys_max_cmd_len"; then
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
+    else
+      # Make teststring a little bigger before we do anything with it.
+      # a 1K string should be a reasonable start.
+      for i in 1 2 3 4 5 6 7 8 ; do
+        teststring=$teststring$teststring
+      done
+      SHELL=${SHELL-${CONFIG_SHELL-/bin/sh}}
+      # If test is not a shell built-in, we'll probably end up computing a
+      # maximum length that is only half of the actual maximum length, but
+      # we can't tell.
+      while { test "X"`env echo "$teststring$teststring" 2>/dev/null` \
+	         = "X$teststring$teststring"; } >/dev/null 2>&1 &&
+	      test $i != 17 # 1/2 MB should be enough
+      do
+        i=`expr $i + 1`
+        teststring=$teststring$teststring
+      done
+      # Only check the string length outside the loop.
+      lt_cv_sys_max_cmd_len=`expr "X$teststring" : ".*" 2>&1`
+      teststring=
+      # Add a significant safety factor because C++ compilers can tack on
+      # massive amounts of additional arguments before passing them to the
+      # linker.  It appears as though 1/2 is a usable value.
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 2`
+    fi
+    ;;
+  esac
+
+fi
+
+if test -n $lt_cv_sys_max_cmd_len ; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_sys_max_cmd_len" >&5
+$as_echo "$lt_cv_sys_max_cmd_len" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+$as_echo "none" >&6; }
+fi
+max_cmd_len=$lt_cv_sys_max_cmd_len
+
+
+
+
+
+
+: ${CP="cp -f"}
+: ${MV="mv -f"}
+: ${RM="rm -f"}
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the shell understands some XSI constructs" >&5
+$as_echo_n "checking whether the shell understands some XSI constructs... " >&6; }
+# Try some XSI features
+xsi_shell=no
+( _lt_dummy="a/b/c"
+  test "${_lt_dummy##*/},${_lt_dummy%/*},${_lt_dummy#??}"${_lt_dummy%"$_lt_dummy"}, \
+      = c,a/b,b/c, \
+    && eval 'test $(( 1 + 1 )) -eq 2 \
+    && test "${#_lt_dummy}" -eq 5' ) >/dev/null 2>&1 \
+  && xsi_shell=yes
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $xsi_shell" >&5
+$as_echo "$xsi_shell" >&6; }
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the shell understands \"+=\"" >&5
+$as_echo_n "checking whether the shell understands \"+=\"... " >&6; }
+lt_shell_append=no
+( foo=bar; set foo baz; eval "$1+=\$2" && test "$foo" = barbaz ) \
+    >/dev/null 2>&1 \
+  && lt_shell_append=yes
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_shell_append" >&5
+$as_echo "$lt_shell_append" >&6; }
+
+
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  lt_unset=unset
+else
+  lt_unset=false
+fi
+
+
+
+
+
+# test EBCDIC or ASCII
+case `echo X|tr X '\101'` in
+ A) # ASCII based system
+    # \n is not interpreted correctly by Solaris 8 /usr/ucb/tr
+  lt_SP2NL='tr \040 \012'
+  lt_NL2SP='tr \015\012 \040\040'
+  ;;
+ *) # EBCDIC based system
+  lt_SP2NL='tr \100 \n'
+  lt_NL2SP='tr \r\n \100\100'
+  ;;
+esac
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to convert $build file names to $host format" >&5
+$as_echo_n "checking how to convert $build file names to $host format... " >&6; }
+if ${lt_cv_to_host_file_cmd+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $host in
+  *-*-mingw* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_host_file_cmd=func_convert_file_msys_to_w32
+        ;;
+      *-*-cygwin* )
+        lt_cv_to_host_file_cmd=func_convert_file_cygwin_to_w32
+        ;;
+      * ) # otherwise, assume *nix
+        lt_cv_to_host_file_cmd=func_convert_file_nix_to_w32
+        ;;
+    esac
+    ;;
+  *-*-cygwin* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_host_file_cmd=func_convert_file_msys_to_cygwin
+        ;;
+      *-*-cygwin* )
+        lt_cv_to_host_file_cmd=func_convert_file_noop
+        ;;
+      * ) # otherwise, assume *nix
+        lt_cv_to_host_file_cmd=func_convert_file_nix_to_cygwin
+        ;;
+    esac
+    ;;
+  * ) # unhandled hosts (and "normal" native builds)
+    lt_cv_to_host_file_cmd=func_convert_file_noop
+    ;;
+esac
+
+fi
+
+to_host_file_cmd=$lt_cv_to_host_file_cmd
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_to_host_file_cmd" >&5
+$as_echo "$lt_cv_to_host_file_cmd" >&6; }
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to convert $build file names to toolchain format" >&5
+$as_echo_n "checking how to convert $build file names to toolchain format... " >&6; }
+if ${lt_cv_to_tool_file_cmd+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  #assume ordinary cross tools, or native build.
+lt_cv_to_tool_file_cmd=func_convert_file_noop
+case $host in
+  *-*-mingw* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_tool_file_cmd=func_convert_file_msys_to_w32
+        ;;
+    esac
+    ;;
+esac
+
+fi
+
+to_tool_file_cmd=$lt_cv_to_tool_file_cmd
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_to_tool_file_cmd" >&5
+$as_echo "$lt_cv_to_tool_file_cmd" >&6; }
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $LD option to reload object files" >&5
+$as_echo_n "checking for $LD option to reload object files... " >&6; }
+if ${lt_cv_ld_reload_flag+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_ld_reload_flag='-r'
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_reload_flag" >&5
+$as_echo "$lt_cv_ld_reload_flag" >&6; }
+reload_flag=$lt_cv_ld_reload_flag
+case $reload_flag in
+"" | " "*) ;;
+*) reload_flag=" $reload_flag" ;;
+esac
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+case $host_os in
+  cygwin* | mingw* | pw32* | cegcc*)
+    if test "$GCC" != yes; then
+      reload_cmds=false
+    fi
+    ;;
+  darwin*)
+    if test "$GCC" = yes; then
+      reload_cmds='$LTCC $LTCFLAGS -nostdlib ${wl}-r -o $output$reload_objs'
+    else
+      reload_cmds='$LD$reload_flag -o $output$reload_objs'
+    fi
+    ;;
+esac
+
+
+
+
+
+
+
+
+
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}objdump", so it can be a program name with args.
+set dummy ${ac_tool_prefix}objdump; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_OBJDUMP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$OBJDUMP"; then
+  ac_cv_prog_OBJDUMP="$OBJDUMP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_OBJDUMP="${ac_tool_prefix}objdump"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+OBJDUMP=$ac_cv_prog_OBJDUMP
+if test -n "$OBJDUMP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OBJDUMP" >&5
+$as_echo "$OBJDUMP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_OBJDUMP"; then
+  ac_ct_OBJDUMP=$OBJDUMP
+  # Extract the first word of "objdump", so it can be a program name with args.
+set dummy objdump; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_OBJDUMP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_OBJDUMP"; then
+  ac_cv_prog_ac_ct_OBJDUMP="$ac_ct_OBJDUMP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_OBJDUMP="objdump"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_OBJDUMP=$ac_cv_prog_ac_ct_OBJDUMP
+if test -n "$ac_ct_OBJDUMP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OBJDUMP" >&5
+$as_echo "$ac_ct_OBJDUMP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_OBJDUMP" = x; then
+    OBJDUMP="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    OBJDUMP=$ac_ct_OBJDUMP
+  fi
+else
+  OBJDUMP="$ac_cv_prog_OBJDUMP"
+fi
+
+test -z "$OBJDUMP" && OBJDUMP=objdump
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to recognize dependent libraries" >&5
+$as_echo_n "checking how to recognize dependent libraries... " >&6; }
+if ${lt_cv_deplibs_check_method+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [[regex]]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given extended regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix[4-9]*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi[45]*)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin*)
+  # func_win32_libid is a shell function defined in ltmain.sh
+  lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
+  lt_cv_file_magic_cmd='func_win32_libid'
+  ;;
+
+mingw* | pw32*)
+  # Base MSYS/MinGW do not provide the 'file' command needed by
+  # func_win32_libid shell function, so use a weaker test based on 'objdump',
+  # unless we find 'file', for example because we are cross-compiling.
+  # func_win32_libid assumes BSD nm, so disallow it if using MS dumpbin.
+  if ( test "$lt_cv_nm_interface" = "BSD nm" && file / ) >/dev/null 2>&1; then
+    lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
+    lt_cv_file_magic_cmd='func_win32_libid'
+  else
+    # Keep this pattern in sync with the one in func_win32_libid.
+    lt_cv_deplibs_check_method='file_magic file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)'
+    lt_cv_file_magic_cmd='$OBJDUMP -f'
+  fi
+  ;;
+
+cegcc*)
+  # use the weaker test based on 'objdump'. See mingw*.
+  lt_cv_deplibs_check_method='file_magic file format pe-arm-.*little(.*architecture: arm)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+freebsd* | dragonfly*)
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD|DragonFly)/i[3-9]86 (compact )?demand paged shared library'
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+haiku*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20* | hpux11*)
+  lt_cv_file_magic_cmd=/usr/bin/file
+  case $host_cpu in
+  ia64*)
+    lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|ELF-[0-9][0-9]) shared object file - IA64'
+    lt_cv_file_magic_test_file=/usr/lib/hpux32/libc.so
+    ;;
+  hppa*64*)
+    lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|ELF[ -][0-9][0-9])(-bit)?( [LM]SB)? shared object( file)?[, -]* PA-RISC [0-9]\.[0-9]'
+    lt_cv_file_magic_test_file=/usr/lib/pa20_64/libc.sl
+    ;;
+  *)
+    lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9]\.[0-9]) shared library'
+    lt_cv_file_magic_test_file=/usr/lib/libc.sl
+    ;;
+  esac
+  ;;
+
+interix[3-9]*)
+  # PIC code is broken on Interix 3.x, that's why |\.a not |_pic\.a here
+  lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so|\.a)$'
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $LD in
+  *-32|*"-32 ") libmagic=32-bit;;
+  *-n32|*"-n32 ") libmagic=N32;;
+  *-64|*"-64 ") libmagic=64-bit;;
+  *) libmagic=never-match;;
+  esac
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+netbsd* | netbsdelf*-gnu)
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
+    lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so\.[0-9]+\.[0-9]+|_pic\.a)$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so|_pic\.a)$'
+  fi
+  ;;
+
+newos6*)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+*nto* | *qnx*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+openbsd*)
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so\.[0-9]+\.[0-9]+|\.so|_pic\.a)$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so\.[0-9]+\.[0-9]+|_pic\.a)$'
+  fi
+  ;;
+
+osf3* | osf4* | osf5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+rdos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.3*)
+  case $host_vendor in
+  motorola)
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  sequent)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'
+    ;;
+  sni)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method="file_magic ELF [0-9][0-9]*-bit [LM]SB dynamic lib"
+    lt_cv_file_magic_test_file=/lib/libc.so
+    ;;
+  siemens)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  pc)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  esac
+  ;;
+
+tpf*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+esac
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_deplibs_check_method" >&5
+$as_echo "$lt_cv_deplibs_check_method" >&6; }
+
+file_magic_glob=
+want_nocaseglob=no
+if test "$build" = "$host"; then
+  case $host_os in
+  mingw* | pw32*)
+    if ( shopt | grep nocaseglob ) >/dev/null 2>&1; then
+      want_nocaseglob=yes
+    else
+      file_magic_glob=`echo aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ | $SED -e "s/\(..\)/s\/[\1]\/[\1]\/g;/g"`
+    fi
+    ;;
+  esac
+fi
+
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}dlltool", so it can be a program name with args.
+set dummy ${ac_tool_prefix}dlltool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_DLLTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$DLLTOOL"; then
+  ac_cv_prog_DLLTOOL="$DLLTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_DLLTOOL="${ac_tool_prefix}dlltool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+DLLTOOL=$ac_cv_prog_DLLTOOL
+if test -n "$DLLTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DLLTOOL" >&5
+$as_echo "$DLLTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_DLLTOOL"; then
+  ac_ct_DLLTOOL=$DLLTOOL
+  # Extract the first word of "dlltool", so it can be a program name with args.
+set dummy dlltool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_DLLTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_DLLTOOL"; then
+  ac_cv_prog_ac_ct_DLLTOOL="$ac_ct_DLLTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_DLLTOOL="dlltool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_DLLTOOL=$ac_cv_prog_ac_ct_DLLTOOL
+if test -n "$ac_ct_DLLTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_DLLTOOL" >&5
+$as_echo "$ac_ct_DLLTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_DLLTOOL" = x; then
+    DLLTOOL="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    DLLTOOL=$ac_ct_DLLTOOL
+  fi
+else
+  DLLTOOL="$ac_cv_prog_DLLTOOL"
+fi
+
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to associate runtime and link libraries" >&5
+$as_echo_n "checking how to associate runtime and link libraries... " >&6; }
+if ${lt_cv_sharedlib_from_linklib_cmd+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_sharedlib_from_linklib_cmd='unknown'
+
+case $host_os in
+cygwin* | mingw* | pw32* | cegcc*)
+  # two different shell functions defined in ltmain.sh
+  # decide which to use based on capabilities of $DLLTOOL
+  case `$DLLTOOL --help 2>&1` in
+  *--identify-strict*)
+    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib
+    ;;
+  *)
+    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib_fallback
+    ;;
+  esac
+  ;;
+*)
+  # fallback: assume linklib IS sharedlib
+  lt_cv_sharedlib_from_linklib_cmd="$ECHO"
+  ;;
+esac
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_sharedlib_from_linklib_cmd" >&5
+$as_echo "$lt_cv_sharedlib_from_linklib_cmd" >&6; }
+sharedlib_from_linklib_cmd=$lt_cv_sharedlib_from_linklib_cmd
+test -z "$sharedlib_from_linklib_cmd" && sharedlib_from_linklib_cmd=$ECHO
+
+
+
+
+
+
+
+if test -n "$ac_tool_prefix"; then
+  for ac_prog in ar
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AR+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$AR"; then
+  ac_cv_prog_AR="$AR" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_AR="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+AR=$ac_cv_prog_AR
+if test -n "$AR"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5
+$as_echo "$AR" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$AR" && break
+  done
+fi
+if test -z "$AR"; then
+  ac_ct_AR=$AR
+  for ac_prog in ar
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_AR+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_AR"; then
+  ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_AR="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_AR=$ac_cv_prog_ac_ct_AR
+if test -n "$ac_ct_AR"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5
+$as_echo "$ac_ct_AR" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_AR" && break
+done
+
+  if test "x$ac_ct_AR" = x; then
+    AR="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    AR=$ac_ct_AR
+  fi
+fi
+
+: ${AR=ar}
+: ${AR_FLAGS=cru}
+
+
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for archiver @FILE support" >&5
+$as_echo_n "checking for archiver @FILE support... " >&6; }
+if ${lt_cv_ar_at_file+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_ar_at_file=no
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  echo conftest.$ac_objext > conftest.lst
+      lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&5'
+      { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$lt_ar_try\""; } >&5
+  (eval $lt_ar_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+      if test "$ac_status" -eq 0; then
+	# Ensure the archiver fails upon bogus file names.
+	rm -f conftest.$ac_objext libconftest.a
+	{ { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$lt_ar_try\""; } >&5
+  (eval $lt_ar_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+	if test "$ac_status" -ne 0; then
+          lt_cv_ar_at_file=@
+        fi
+      fi
+      rm -f conftest.* libconftest.a
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ar_at_file" >&5
+$as_echo "$lt_cv_ar_at_file" >&6; }
+
+if test "x$lt_cv_ar_at_file" = xno; then
+  archiver_list_spec=
+else
+  archiver_list_spec=$lt_cv_ar_at_file
+fi
+
+
+
+
+
+
+
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+STRIP=$ac_cv_prog_STRIP
+if test -n "$STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5
+$as_echo "$STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_STRIP"; then
+  ac_ct_STRIP=$STRIP
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_STRIP"; then
+  ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_STRIP="strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP
+if test -n "$ac_ct_STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5
+$as_echo "$ac_ct_STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_STRIP" = x; then
+    STRIP=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    STRIP=$ac_ct_STRIP
+  fi
+else
+  STRIP="$ac_cv_prog_STRIP"
+fi
+
+test -z "$STRIP" && STRIP=:
+
+
+
+
+
+
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_RANLIB+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+RANLIB=$ac_cv_prog_RANLIB
+if test -n "$RANLIB"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5
+$as_echo "$RANLIB" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_RANLIB"; then
+  ac_ct_RANLIB=$RANLIB
+  # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_RANLIB+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_RANLIB"; then
+  ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_RANLIB="ranlib"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB
+if test -n "$ac_ct_RANLIB"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5
+$as_echo "$ac_ct_RANLIB" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_RANLIB" = x; then
+    RANLIB=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    RANLIB=$ac_ct_RANLIB
+  fi
+else
+  RANLIB="$ac_cv_prog_RANLIB"
+fi
+
+test -z "$RANLIB" && RANLIB=:
+
+
+
+
+
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+  case $host_os in
+  openbsd*)
+    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$tool_oldlib"
+    ;;
+  *)
+    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB \$tool_oldlib"
+    ;;
+  esac
+  old_archive_cmds="$old_archive_cmds~\$RANLIB \$tool_oldlib"
+fi
+
+case $host_os in
+  darwin*)
+    lock_old_archive_extraction=yes ;;
+  *)
+    lock_old_archive_extraction=no ;;
+esac
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# If no C compiler flags were specified, use CFLAGS.
+LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
+
+# Allow CC to be a program name with arguments.
+compiler=$CC
+
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking command to parse $NM output from $compiler object" >&5
+$as_echo_n "checking command to parse $NM output from $compiler object... " >&6; }
+if ${lt_cv_sys_global_symbol_pipe+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+  symcode='[BCDT]'
+  ;;
+cygwin* | mingw* | pw32* | cegcc*)
+  symcode='[ABCDGISTW]'
+  ;;
+hpux*)
+  if test "$host_cpu" = ia64; then
+    symcode='[ABCDEGRST]'
+  fi
+  ;;
+irix* | nonstopux*)
+  symcode='[BCDEGRST]'
+  ;;
+osf*)
+  symcode='[BCDEGQRST]'
+  ;;
+solaris*)
+  symcode='[BDRT]'
+  ;;
+sco3.2v5*)
+  symcode='[DT]'
+  ;;
+sysv4.2uw2*)
+  symcode='[DT]'
+  ;;
+sysv5* | sco5v6* | unixware* | OpenUNIX*)
+  symcode='[ABDT]'
+  ;;
+sysv4)
+  symcode='[DFNSTU]'
+  ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+case `$NM -V 2>&1` in
+*GNU* | *'with BFD'*)
+  symcode='[ABCDGIRSTW]' ;;
+esac
+
+# Transform an extracted symbol line into a proper C declaration.
+# Some systems (esp. on ia64) link data and code symbols differently,
+# so use this general approach.
+lt_cv_sys_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern int \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+
+# Transform an extracted symbol line into symbol name and symbol address
+lt_cv_sys_global_symbol_to_c_name_address="sed -n -e 's/^: \([^ ]*\)[ ]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([^ ]*\) \([^ ]*\)$/  {\"\2\", (void *) \&\2},/p'"
+lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n -e 's/^: \([^ ]*\)[ ]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([^ ]*\) \(lib[^ ]*\)$/  {\"\2\", (void *) \&\2},/p' -e 's/^$symcode* \([^ ]*\) \([^ ]*\)$/  {\"lib\2\", (void *) \&\2},/p'"
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $build_os in
+mingw*)
+  opt_cr=`$ECHO 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+  ;;
+esac
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+  # Transform symcode, sympat, and symprfx into a raw symbol and a C symbol.
+  symxfrm="\\1 $ac_symprfx\\2 \\2"
+
+  # Write the raw and C identifiers.
+  if test "$lt_cv_nm_interface" = "MS dumpbin"; then
+    # Fake it for dumpbin and say T for any non-static function
+    # and D for any global variable.
+    # Also find C++ and __fastcall symbols from MSVC++,
+    # which start with @ or ?.
+    lt_cv_sys_global_symbol_pipe="$AWK '"\
+"     {last_section=section; section=\$ 3};"\
+"     /^COFF SYMBOL TABLE/{for(i in hide) delete hide[i]};"\
+"     /Section length .*#relocs.*(pick any)/{hide[last_section]=1};"\
+"     \$ 0!~/External *\|/{next};"\
+"     / 0+ UNDEF /{next}; / UNDEF \([^|]\)*()/{next};"\
+"     {if(hide[section]) next};"\
+"     {f=0}; \$ 0~/\(\).*\|/{f=1}; {printf f ? \"T \" : \"D \"};"\
+"     {split(\$ 0, a, /\||\r/); split(a[2], s)};"\
+"     s[1]~/^[@?]/{print s[1], s[1]; next};"\
+"     s[1]~prfx {split(s[1],t,\"@\"); print t[1], substr(t[1],length(prfx))}"\
+"     ' prfx=^$ac_symprfx"
+  else
+    lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[	 ]\($symcode$symcode*\)[	 ][	 ]*$ac_symprfx$sympat$opt_cr$/$symxfrm/p'"
+  fi
+  lt_cv_sys_global_symbol_pipe="$lt_cv_sys_global_symbol_pipe | sed '/ __gnu_lto/d'"
+
+  # Check to see that the pipe works correctly.
+  pipe_works=no
+
+  rm -f conftest*
+  cat > conftest.$ac_ext <<_LT_EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(void);
+void nm_test_func(void){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+_LT_EOF
+
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+    # Now try to grab the symbols.
+    nlist=conftest.nm
+    if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist\""; } >&5
+  (eval $NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && test -s "$nlist"; then
+      # Try sorting and uniquifying the output.
+      if sort "$nlist" | uniq > "$nlist"T; then
+	mv -f "$nlist"T "$nlist"
+      else
+	rm -f "$nlist"T
+      fi
+
+      # Make sure that we snagged all the symbols we need.
+      if $GREP ' nm_test_var$' "$nlist" >/dev/null; then
+	if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
+	  cat <<_LT_EOF > conftest.$ac_ext
+/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
+/* DATA imports from DLLs on WIN32 con't be const, because runtime
+   relocations are performed -- see ld's documentation on pseudo-relocs.  */
+# define LT_DLSYM_CONST
+#elif defined(__osf__)
+/* This system does not cope well with relocations in const data.  */
+# define LT_DLSYM_CONST
+#else
+# define LT_DLSYM_CONST const
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+_LT_EOF
+	  # Now generate the symbol file.
+	  eval "$lt_cv_sys_global_symbol_to_cdecl"' < "$nlist" | $GREP -v main >> conftest.$ac_ext'
+
+	  cat <<_LT_EOF >> conftest.$ac_ext
+
+/* The mapping between symbol names and symbols.  */
+LT_DLSYM_CONST struct {
+  const char *name;
+  void       *address;
+}
+lt__PROGRAM__LTX_preloaded_symbols[] =
+{
+  { "@PROGRAM@", (void *) 0 },
+_LT_EOF
+	  $SED "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (void *) \&\2},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext
+	  cat <<\_LT_EOF >> conftest.$ac_ext
+  {0, (void *) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+  return lt__PROGRAM__LTX_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+_LT_EOF
+	  # Now try linking the two files.
+	  mv conftest.$ac_objext conftstm.$ac_objext
+	  lt_globsym_save_LIBS=$LIBS
+	  lt_globsym_save_CFLAGS=$CFLAGS
+	  LIBS="conftstm.$ac_objext"
+	  CFLAGS="$CFLAGS$lt_prog_compiler_no_builtin_flag"
+	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_link\""; } >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && test -s conftest${ac_exeext}; then
+	    pipe_works=yes
+	  fi
+	  LIBS=$lt_globsym_save_LIBS
+	  CFLAGS=$lt_globsym_save_CFLAGS
+	else
+	  echo "cannot find nm_test_func in $nlist" >&5
+	fi
+      else
+	echo "cannot find nm_test_var in $nlist" >&5
+      fi
+    else
+      echo "cannot run $lt_cv_sys_global_symbol_pipe" >&5
+    fi
+  else
+    echo "$progname: failed program was:" >&5
+    cat conftest.$ac_ext >&5
+  fi
+  rm -rf conftest* conftst*
+
+  # Do not use the global_symbol_pipe unless it works.
+  if test "$pipe_works" = yes; then
+    break
+  else
+    lt_cv_sys_global_symbol_pipe=
+  fi
+done
+
+fi
+
+if test -z "$lt_cv_sys_global_symbol_pipe"; then
+  lt_cv_sys_global_symbol_to_cdecl=
+fi
+if test -z "$lt_cv_sys_global_symbol_pipe$lt_cv_sys_global_symbol_to_cdecl"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: failed" >&5
+$as_echo "failed" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: ok" >&5
+$as_echo "ok" >&6; }
+fi
+
+# Response file support.
+if test "$lt_cv_nm_interface" = "MS dumpbin"; then
+  nm_file_list_spec='@'
+elif $NM --help 2>/dev/null | grep '[@]FILE' >/dev/null; then
+  nm_file_list_spec='@'
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for sysroot" >&5
+$as_echo_n "checking for sysroot... " >&6; }
+
+# Check whether --with-sysroot was given.
+if test "${with_sysroot+set}" = set; then :
+  withval=$with_sysroot;
+else
+  with_sysroot=no
+fi
+
+
+lt_sysroot=
+case ${with_sysroot} in #(
+ yes)
+   if test "$GCC" = yes; then
+     lt_sysroot=`$CC --print-sysroot 2>/dev/null`
+   fi
+   ;; #(
+ /*)
+   lt_sysroot=`echo "$with_sysroot" | sed -e "$sed_quote_subst"`
+   ;; #(
+ no|'')
+   ;; #(
+ *)
+   { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_sysroot}" >&5
+$as_echo "${with_sysroot}" >&6; }
+   as_fn_error $? "The sysroot must be an absolute path." "$LINENO" 5
+   ;;
+esac
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${lt_sysroot:-no}" >&5
+$as_echo "${lt_sysroot:-no}" >&6; }
+
+
+
+
+
+# Check whether --enable-libtool-lock was given.
+if test "${enable_libtool_lock+set}" = set; then :
+  enableval=$enable_libtool_lock;
+fi
+
+test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+ia64-*-hpux*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+    case `/usr/bin/file conftest.$ac_objext` in
+      *ELF-32*)
+	HPUX_IA64_MODE="32"
+	;;
+      *ELF-64*)
+	HPUX_IA64_MODE="64"
+	;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '#line '$LINENO' "configure"' > conftest.$ac_ext
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+    if test "$lt_cv_prog_gnu_ld" = yes; then
+      case `/usr/bin/file conftest.$ac_objext` in
+	*32-bit*)
+	  LD="${LD-ld} -melf32bsmip"
+	  ;;
+	*N32*)
+	  LD="${LD-ld} -melf32bmipn32"
+	  ;;
+	*64-bit*)
+	  LD="${LD-ld} -melf64bmip"
+	;;
+      esac
+    else
+      case `/usr/bin/file conftest.$ac_objext` in
+	*32-bit*)
+	  LD="${LD-ld} -32"
+	  ;;
+	*N32*)
+	  LD="${LD-ld} -n32"
+	  ;;
+	*64-bit*)
+	  LD="${LD-ld} -64"
+	  ;;
+      esac
+    fi
+  fi
+  rm -rf conftest*
+  ;;
+
+x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
+s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+    case `/usr/bin/file conftest.o` in
+      *32-bit*)
+	case $host in
+	  x86_64-*kfreebsd*-gnu)
+	    LD="${LD-ld} -m elf_i386_fbsd"
+	    ;;
+	  x86_64-*linux*)
+	    LD="${LD-ld} -m elf_i386"
+	    ;;
+	  ppc64-*linux*|powerpc64-*linux*)
+	    LD="${LD-ld} -m elf32ppclinux"
+	    ;;
+	  s390x-*linux*)
+	    LD="${LD-ld} -m elf_s390"
+	    ;;
+	  sparc64-*linux*)
+	    LD="${LD-ld} -m elf32_sparc"
+	    ;;
+	esac
+	;;
+      *64-bit*)
+	case $host in
+	  x86_64-*kfreebsd*-gnu)
+	    LD="${LD-ld} -m elf_x86_64_fbsd"
+	    ;;
+	  x86_64-*linux*)
+	    LD="${LD-ld} -m elf_x86_64"
+	    ;;
+	  ppc*-*linux*|powerpc*-*linux*)
+	    LD="${LD-ld} -m elf64ppc"
+	    ;;
+	  s390*-*linux*|s390*-*tpf*)
+	    LD="${LD-ld} -m elf64_s390"
+	    ;;
+	  sparc*-*linux*)
+	    LD="${LD-ld} -m elf64_sparc"
+	    ;;
+	esac
+	;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler needs -belf" >&5
+$as_echo_n "checking whether the C compiler needs -belf... " >&6; }
+if ${lt_cv_cc_needs_belf+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  lt_cv_cc_needs_belf=yes
+else
+  lt_cv_cc_needs_belf=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+     ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_cc_needs_belf" >&5
+$as_echo "$lt_cv_cc_needs_belf" >&6; }
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
+*-*solaris*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+    case `/usr/bin/file conftest.o` in
+    *64-bit*)
+      case $lt_cv_prog_gnu_ld in
+      yes*)
+        case $host in
+        i?86-*-solaris*)
+          LD="${LD-ld} -m elf_x86_64"
+          ;;
+        sparc*-*-solaris*)
+          LD="${LD-ld} -m elf64_sparc"
+          ;;
+        esac
+        # GNU ld 2.21 introduced _sol2 emulations.  Use them if available.
+        if ${LD-ld} -V | grep _sol2 >/dev/null 2>&1; then
+          LD="${LD-ld}_sol2"
+        fi
+        ;;
+      *)
+	if ${LD-ld} -64 -r -o conftest2.o conftest.o >/dev/null 2>&1; then
+	  LD="${LD-ld} -64"
+	fi
+	;;
+      esac
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+esac
+
+need_locks="$enable_libtool_lock"
+
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}mt", so it can be a program name with args.
+set dummy ${ac_tool_prefix}mt; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_MANIFEST_TOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$MANIFEST_TOOL"; then
+  ac_cv_prog_MANIFEST_TOOL="$MANIFEST_TOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_MANIFEST_TOOL="${ac_tool_prefix}mt"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+MANIFEST_TOOL=$ac_cv_prog_MANIFEST_TOOL
+if test -n "$MANIFEST_TOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MANIFEST_TOOL" >&5
+$as_echo "$MANIFEST_TOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_MANIFEST_TOOL"; then
+  ac_ct_MANIFEST_TOOL=$MANIFEST_TOOL
+  # Extract the first word of "mt", so it can be a program name with args.
+set dummy mt; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_MANIFEST_TOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_MANIFEST_TOOL"; then
+  ac_cv_prog_ac_ct_MANIFEST_TOOL="$ac_ct_MANIFEST_TOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_MANIFEST_TOOL="mt"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_MANIFEST_TOOL=$ac_cv_prog_ac_ct_MANIFEST_TOOL
+if test -n "$ac_ct_MANIFEST_TOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_MANIFEST_TOOL" >&5
+$as_echo "$ac_ct_MANIFEST_TOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_MANIFEST_TOOL" = x; then
+    MANIFEST_TOOL=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    MANIFEST_TOOL=$ac_ct_MANIFEST_TOOL
+  fi
+else
+  MANIFEST_TOOL="$ac_cv_prog_MANIFEST_TOOL"
+fi
+
+test -z "$MANIFEST_TOOL" && MANIFEST_TOOL=mt
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $MANIFEST_TOOL is a manifest tool" >&5
+$as_echo_n "checking if $MANIFEST_TOOL is a manifest tool... " >&6; }
+if ${lt_cv_path_mainfest_tool+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_path_mainfest_tool=no
+  echo "$as_me:$LINENO: $MANIFEST_TOOL '-?'" >&5
+  $MANIFEST_TOOL '-?' 2>conftest.err > conftest.out
+  cat conftest.err >&5
+  if $GREP 'Manifest Tool' conftest.out > /dev/null; then
+    lt_cv_path_mainfest_tool=yes
+  fi
+  rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_path_mainfest_tool" >&5
+$as_echo "$lt_cv_path_mainfest_tool" >&6; }
+if test "x$lt_cv_path_mainfest_tool" != xyes; then
+  MANIFEST_TOOL=:
+fi
+
+
+
+
+
+
+  case $host_os in
+    rhapsody* | darwin*)
+    if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}dsymutil", so it can be a program name with args.
+set dummy ${ac_tool_prefix}dsymutil; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_DSYMUTIL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$DSYMUTIL"; then
+  ac_cv_prog_DSYMUTIL="$DSYMUTIL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_DSYMUTIL="${ac_tool_prefix}dsymutil"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+DSYMUTIL=$ac_cv_prog_DSYMUTIL
+if test -n "$DSYMUTIL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DSYMUTIL" >&5
+$as_echo "$DSYMUTIL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_DSYMUTIL"; then
+  ac_ct_DSYMUTIL=$DSYMUTIL
+  # Extract the first word of "dsymutil", so it can be a program name with args.
+set dummy dsymutil; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_DSYMUTIL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_DSYMUTIL"; then
+  ac_cv_prog_ac_ct_DSYMUTIL="$ac_ct_DSYMUTIL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_DSYMUTIL="dsymutil"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_DSYMUTIL=$ac_cv_prog_ac_ct_DSYMUTIL
+if test -n "$ac_ct_DSYMUTIL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_DSYMUTIL" >&5
+$as_echo "$ac_ct_DSYMUTIL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_DSYMUTIL" = x; then
+    DSYMUTIL=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    DSYMUTIL=$ac_ct_DSYMUTIL
+  fi
+else
+  DSYMUTIL="$ac_cv_prog_DSYMUTIL"
+fi
+
+    if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}nmedit", so it can be a program name with args.
+set dummy ${ac_tool_prefix}nmedit; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_NMEDIT+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$NMEDIT"; then
+  ac_cv_prog_NMEDIT="$NMEDIT" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_NMEDIT="${ac_tool_prefix}nmedit"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+NMEDIT=$ac_cv_prog_NMEDIT
+if test -n "$NMEDIT"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NMEDIT" >&5
+$as_echo "$NMEDIT" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_NMEDIT"; then
+  ac_ct_NMEDIT=$NMEDIT
+  # Extract the first word of "nmedit", so it can be a program name with args.
+set dummy nmedit; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_NMEDIT+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_NMEDIT"; then
+  ac_cv_prog_ac_ct_NMEDIT="$ac_ct_NMEDIT" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_NMEDIT="nmedit"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_NMEDIT=$ac_cv_prog_ac_ct_NMEDIT
+if test -n "$ac_ct_NMEDIT"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_NMEDIT" >&5
+$as_echo "$ac_ct_NMEDIT" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_NMEDIT" = x; then
+    NMEDIT=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    NMEDIT=$ac_ct_NMEDIT
+  fi
+else
+  NMEDIT="$ac_cv_prog_NMEDIT"
+fi
+
+    if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}lipo", so it can be a program name with args.
+set dummy ${ac_tool_prefix}lipo; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_LIPO+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$LIPO"; then
+  ac_cv_prog_LIPO="$LIPO" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_LIPO="${ac_tool_prefix}lipo"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+LIPO=$ac_cv_prog_LIPO
+if test -n "$LIPO"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIPO" >&5
+$as_echo "$LIPO" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_LIPO"; then
+  ac_ct_LIPO=$LIPO
+  # Extract the first word of "lipo", so it can be a program name with args.
+set dummy lipo; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_LIPO+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_LIPO"; then
+  ac_cv_prog_ac_ct_LIPO="$ac_ct_LIPO" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_LIPO="lipo"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_LIPO=$ac_cv_prog_ac_ct_LIPO
+if test -n "$ac_ct_LIPO"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_LIPO" >&5
+$as_echo "$ac_ct_LIPO" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_LIPO" = x; then
+    LIPO=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    LIPO=$ac_ct_LIPO
+  fi
+else
+  LIPO="$ac_cv_prog_LIPO"
+fi
+
+    if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}otool", so it can be a program name with args.
+set dummy ${ac_tool_prefix}otool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_OTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$OTOOL"; then
+  ac_cv_prog_OTOOL="$OTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_OTOOL="${ac_tool_prefix}otool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+OTOOL=$ac_cv_prog_OTOOL
+if test -n "$OTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL" >&5
+$as_echo "$OTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_OTOOL"; then
+  ac_ct_OTOOL=$OTOOL
+  # Extract the first word of "otool", so it can be a program name with args.
+set dummy otool; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_OTOOL+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_OTOOL"; then
+  ac_cv_prog_ac_ct_OTOOL="$ac_ct_OTOOL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_OTOOL="otool"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_OTOOL=$ac_cv_prog_ac_ct_OTOOL
+if test -n "$ac_ct_OTOOL"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL" >&5
+$as_echo "$ac_ct_OTOOL" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_OTOOL" = x; then
+    OTOOL=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    OTOOL=$ac_ct_OTOOL
+  fi
+else
+  OTOOL="$ac_cv_prog_OTOOL"
+fi
+
+    if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}otool64", so it can be a program name with args.
+set dummy ${ac_tool_prefix}otool64; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_OTOOL64+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$OTOOL64"; then
+  ac_cv_prog_OTOOL64="$OTOOL64" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_OTOOL64="${ac_tool_prefix}otool64"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+OTOOL64=$ac_cv_prog_OTOOL64
+if test -n "$OTOOL64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL64" >&5
+$as_echo "$OTOOL64" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_OTOOL64"; then
+  ac_ct_OTOOL64=$OTOOL64
+  # Extract the first word of "otool64", so it can be a program name with args.
+set dummy otool64; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_OTOOL64+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_OTOOL64"; then
+  ac_cv_prog_ac_ct_OTOOL64="$ac_ct_OTOOL64" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_OTOOL64="otool64"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_OTOOL64=$ac_cv_prog_ac_ct_OTOOL64
+if test -n "$ac_ct_OTOOL64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL64" >&5
+$as_echo "$ac_ct_OTOOL64" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_OTOOL64" = x; then
+    OTOOL64=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    OTOOL64=$ac_ct_OTOOL64
+  fi
+else
+  OTOOL64="$ac_cv_prog_OTOOL64"
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -single_module linker flag" >&5
+$as_echo_n "checking for -single_module linker flag... " >&6; }
+if ${lt_cv_apple_cc_single_mod+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_apple_cc_single_mod=no
+      if test -z "${LT_MULTI_MODULE}"; then
+	# By default we will add the -single_module flag. You can override
+	# by either setting the environment variable LT_MULTI_MODULE
+	# non-empty at configure time, or by adding -multi_module to the
+	# link flags.
+	rm -rf libconftest.dylib*
+	echo "int foo(void){return 1;}" > conftest.c
+	echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
+-dynamiclib -Wl,-single_module conftest.c" >&5
+	$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
+	  -dynamiclib -Wl,-single_module conftest.c 2>conftest.err
+        _lt_result=$?
+	# If there is a non-empty error log, and "single_module"
+	# appears in it, assume the flag caused a linker warning
+        if test -s conftest.err && $GREP single_module conftest.err; then
+	  cat conftest.err >&5
+	# Otherwise, if the output was created with a 0 exit code from
+	# the compiler, it worked.
+	elif test -f libconftest.dylib && test $_lt_result -eq 0; then
+	  lt_cv_apple_cc_single_mod=yes
+	else
+	  cat conftest.err >&5
+	fi
+	rm -rf libconftest.dylib*
+	rm -f conftest.*
+      fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_apple_cc_single_mod" >&5
+$as_echo "$lt_cv_apple_cc_single_mod" >&6; }
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -exported_symbols_list linker flag" >&5
+$as_echo_n "checking for -exported_symbols_list linker flag... " >&6; }
+if ${lt_cv_ld_exported_symbols_list+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_ld_exported_symbols_list=no
+      save_LDFLAGS=$LDFLAGS
+      echo "_main" > conftest.sym
+      LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  lt_cv_ld_exported_symbols_list=yes
+else
+  lt_cv_ld_exported_symbols_list=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+	LDFLAGS="$save_LDFLAGS"
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_exported_symbols_list" >&5
+$as_echo "$lt_cv_ld_exported_symbols_list" >&6; }
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -force_load linker flag" >&5
+$as_echo_n "checking for -force_load linker flag... " >&6; }
+if ${lt_cv_ld_force_load+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_ld_force_load=no
+      cat > conftest.c << _LT_EOF
+int forced_loaded() { return 2;}
+_LT_EOF
+      echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&5
+      $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&5
+      echo "$AR cru libconftest.a conftest.o" >&5
+      $AR cru libconftest.a conftest.o 2>&5
+      echo "$RANLIB libconftest.a" >&5
+      $RANLIB libconftest.a 2>&5
+      cat > conftest.c << _LT_EOF
+int main() { return 0;}
+_LT_EOF
+      echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&5
+      $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err
+      _lt_result=$?
+      if test -s conftest.err && $GREP force_load conftest.err; then
+	cat conftest.err >&5
+      elif test -f conftest && test $_lt_result -eq 0 && $GREP forced_load conftest >/dev/null 2>&1 ; then
+	lt_cv_ld_force_load=yes
+      else
+	cat conftest.err >&5
+      fi
+        rm -f conftest.err libconftest.a conftest conftest.c
+        rm -rf conftest.dSYM
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_force_load" >&5
+$as_echo "$lt_cv_ld_force_load" >&6; }
+    case $host_os in
+    rhapsody* | darwin1.[012])
+      _lt_dar_allow_undefined='${wl}-undefined ${wl}suppress' ;;
+    darwin1.*)
+      _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+    darwin*) # darwin 5.x on
+      # if running on 10.5 or later, the deployment target defaults
+      # to the OS version, if on x86, and 10.4, the deployment
+      # target defaults to 10.4. Don't you love it?
+      case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in
+	10.0,*86*-darwin8*|10.0,*-darwin[91]*)
+	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
+	10.[012]*)
+	  _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+	10.*)
+	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
+      esac
+    ;;
+  esac
+    if test "$lt_cv_apple_cc_single_mod" = "yes"; then
+      _lt_dar_single_mod='$single_module'
+    fi
+    if test "$lt_cv_ld_exported_symbols_list" = "yes"; then
+      _lt_dar_export_syms=' ${wl}-exported_symbols_list,$output_objdir/${libname}-symbols.expsym'
+    else
+      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/${libname}-symbols.expsym ${lib}'
+    fi
+    if test "$DSYMUTIL" != ":" && test "$lt_cv_ld_force_load" = "no"; then
+      _lt_dsymutil='~$DSYMUTIL $lib || :'
+    else
+      _lt_dsymutil=
+    fi
+    ;;
+  esac
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5
+$as_echo_n "checking how to run the C preprocessor... " >&6; }
+# On Suns, sometimes $CPP names a directory.
+if test -n "$CPP" && test -d "$CPP"; then
+  CPP=
+fi
+if test -z "$CPP"; then
+  if ${ac_cv_prog_CPP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+      # Double quotes because CPP needs to be expanded
+    for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp"
+    do
+      ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+  break
+fi
+
+    done
+    ac_cv_prog_CPP=$CPP
+
+fi
+  CPP=$ac_cv_prog_CPP
+else
+  ac_cv_prog_CPP=$CPP
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5
+$as_echo "$CPP" >&6; }
+ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "C preprocessor \"$CPP\" fails sanity check
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
+$as_echo_n "checking for ANSI C header files... " >&6; }
+if ${ac_cv_header_stdc+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_header_stdc=yes
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then :
+  :
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
+$as_echo "$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+$as_echo "#define STDC_HEADERS 1" >>confdefs.h
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+		  inttypes.h stdint.h unistd.h
+do :
+  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
+"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+for ac_header in dlfcn.h
+do :
+  ac_fn_c_check_header_compile "$LINENO" "dlfcn.h" "ac_cv_header_dlfcn_h" "$ac_includes_default
+"
+if test "x$ac_cv_header_dlfcn_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_DLFCN_H 1
+_ACEOF
+
+fi
+
+done
+
+
+
+
+
+# Set options
+
+
+
+        enable_dlopen=no
+
+
+
+
+  # Check whether --enable-static was given.
+if test "${enable_static+set}" = set; then :
+  enableval=$enable_static; p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_static=yes ;;
+    no) enable_static=no ;;
+    *)
+     enable_static=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_static=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac
+else
+  enable_static=yes
+fi
+
+
+
+
+
+
+
+
+
+
+# Check whether --with-pic was given.
+if test "${with_pic+set}" = set; then :
+  withval=$with_pic; lt_p=${PACKAGE-default}
+    case $withval in
+    yes|no) pic_mode=$withval ;;
+    *)
+      pic_mode=default
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for lt_pkg in $withval; do
+	IFS="$lt_save_ifs"
+	if test "X$lt_pkg" = "X$lt_p"; then
+	  pic_mode=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac
+else
+  pic_mode=default
+fi
+
+
+test -z "$pic_mode" && pic_mode=default
+
+
+
+
+
+
+
+  # Check whether --enable-fast-install was given.
+if test "${enable_fast_install+set}" = set; then :
+  enableval=$enable_fast_install; p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_fast_install=yes ;;
+    no) enable_fast_install=no ;;
+    *)
+      enable_fast_install=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_fast_install=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac
+else
+  enable_fast_install=yes
+fi
+
+
+
+
+
+
+
+
+
+
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ltmain"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+test -z "$LN_S" && LN_S="ln -s"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+if test -n "${ZSH_VERSION+set}" ; then
+   setopt NO_GLOB_SUBST
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for objdir" >&5
+$as_echo_n "checking for objdir... " >&6; }
+if ${lt_cv_objdir+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+  lt_cv_objdir=.libs
+else
+  # MS-DOS does not allow filenames that begin with a dot.
+  lt_cv_objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_objdir" >&5
+$as_echo "$lt_cv_objdir" >&6; }
+objdir=$lt_cv_objdir
+
+
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define LT_OBJDIR "$lt_cv_objdir/"
+_ACEOF
+
+
+
+
+case $host_os in
+aix3*)
+  # AIX sometimes has problems with the GCC collect2 program.  For some
+  # reason, if we set the COLLECT_NAMES environment variable, the problems
+  # vanish in a puff of smoke.
+  if test "X${COLLECT_NAMES+set}" != Xset; then
+    COLLECT_NAMES=
+    export COLLECT_NAMES
+  fi
+  ;;
+esac
+
+# Global variables:
+ofile=libtool
+can_build_shared=yes
+
+# All known linkers require a `.a' archive for static linking (except MSVC,
+# which needs '.lib').
+libext=a
+
+with_gnu_ld="$lt_cv_prog_gnu_ld"
+
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+
+# Set sane defaults for various variables
+test -z "$CC" && CC=cc
+test -z "$LTCC" && LTCC=$CC
+test -z "$LTCFLAGS" && LTCFLAGS=$CFLAGS
+test -z "$LD" && LD=ld
+test -z "$ac_objext" && ac_objext=o
+
+for cc_temp in $compiler""; do
+  case $cc_temp in
+    compile | *[\\/]compile | ccache | *[\\/]ccache ) ;;
+    distcc | *[\\/]distcc | purify | *[\\/]purify ) ;;
+    \-*) ;;
+    *) break;;
+  esac
+done
+cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+
+
+# Only perform the check for file, if the check method requires it
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ${ac_tool_prefix}file" >&5
+$as_echo_n "checking for ${ac_tool_prefix}file... " >&6; }
+if ${lt_cv_path_MAGIC_CMD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $MAGIC_CMD in
+[\\/*] |  ?:[\\/]*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+*)
+  lt_save_MAGIC_CMD="$MAGIC_CMD"
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  ac_dummy="/usr/bin$PATH_SEPARATOR$PATH"
+  for ac_dir in $ac_dummy; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/${ac_tool_prefix}file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/${ac_tool_prefix}file"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"`
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    $EGREP "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<_LT_EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+_LT_EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$lt_save_ifs"
+  MAGIC_CMD="$lt_save_MAGIC_CMD"
+  ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MAGIC_CMD" >&5
+$as_echo "$MAGIC_CMD" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+
+
+
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for file" >&5
+$as_echo_n "checking for file... " >&6; }
+if ${lt_cv_path_MAGIC_CMD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $MAGIC_CMD in
+[\\/*] |  ?:[\\/]*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+*)
+  lt_save_MAGIC_CMD="$MAGIC_CMD"
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  ac_dummy="/usr/bin$PATH_SEPARATOR$PATH"
+  for ac_dir in $ac_dummy; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/file"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"`
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    $EGREP "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<_LT_EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+_LT_EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$lt_save_ifs"
+  MAGIC_CMD="$lt_save_MAGIC_CMD"
+  ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MAGIC_CMD" >&5
+$as_echo "$MAGIC_CMD" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  else
+    MAGIC_CMD=:
+  fi
+fi
+
+  fi
+  ;;
+esac
+
+# Use C for the default configuration in the libtool script
+
+lt_save_CC="$CC"
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+# Source file extension for C test sources.
+ac_ext=c
+
+# Object file extension for compiled C test sources.
+objext=o
+objext=$objext
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="int some_variable = 0;"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='int main(){return(0);}'
+
+
+
+
+
+
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# If no C compiler flags were specified, use CFLAGS.
+LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
+
+# Allow CC to be a program name with arguments.
+compiler=$CC
+
+# Save the default compiler, since it gets overwritten when the other
+# tags are being tested, and _LT_TAGVAR(compiler, []) is a NOP.
+compiler_DEFAULT=$CC
+
+# save warnings/boilerplate of simple test code
+ac_outfile=conftest.$ac_objext
+echo "$lt_simple_compile_test_code" >conftest.$ac_ext
+eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_compiler_boilerplate=`cat conftest.err`
+$RM conftest*
+
+ac_outfile=conftest.$ac_objext
+echo "$lt_simple_link_test_code" >conftest.$ac_ext
+eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_linker_boilerplate=`cat conftest.err`
+$RM -r conftest*
+
+
+## CAVEAT EMPTOR:
+## There is no encapsulation within the following macros, do not change
+## the running order or otherwise move them around unless you know exactly
+## what you are doing...
+if test -n "$compiler"; then
+
+lt_prog_compiler_no_builtin_flag=
+
+if test "$GCC" = yes; then
+  case $cc_basename in
+  nvcc*)
+    lt_prog_compiler_no_builtin_flag=' -Xcompiler -fno-builtin' ;;
+  *)
+    lt_prog_compiler_no_builtin_flag=' -fno-builtin' ;;
+  esac
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+$as_echo_n "checking if $compiler supports -fno-rtti -fno-exceptions... " >&6; }
+if ${lt_cv_prog_compiler_rtti_exceptions+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_rtti_exceptions=no
+   ac_outfile=conftest.$ac_objext
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+   lt_compiler_flag="-fno-rtti -fno-exceptions"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   # The option is referenced via a variable to avoid confusing sed.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>conftest.err)
+   ac_status=$?
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s "$ac_outfile"; then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings other than the usual output.
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
+     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_rtti_exceptions=yes
+     fi
+   fi
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_rtti_exceptions" >&5
+$as_echo "$lt_cv_prog_compiler_rtti_exceptions" >&6; }
+
+if test x"$lt_cv_prog_compiler_rtti_exceptions" = xyes; then
+    lt_prog_compiler_no_builtin_flag="$lt_prog_compiler_no_builtin_flag -fno-rtti -fno-exceptions"
+else
+    :
+fi
+
+fi
+
+
+
+
+
+
+  lt_prog_compiler_wl=
+lt_prog_compiler_pic=
+lt_prog_compiler_static=
+
+
+  if test "$GCC" = yes; then
+    lt_prog_compiler_wl='-Wl,'
+    lt_prog_compiler_static='-static'
+
+    case $host_os in
+      aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_prog_compiler_static='-Bstatic'
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            lt_prog_compiler_pic='-fPIC'
+        ;;
+      m68k)
+            # FIXME: we need at least 68020 code to build shared libraries, but
+            # adding the `-m68020' flag to GCC prevents building anything better,
+            # like `-m68040'.
+            lt_prog_compiler_pic='-m68020 -resident32 -malways-restore-a4'
+        ;;
+      esac
+      ;;
+
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      # Although the cygwin gcc ignores -fPIC, still need this for old-style
+      # (--disable-auto-import) libraries
+      lt_prog_compiler_pic='-DDLL_EXPORT'
+      ;;
+
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_prog_compiler_pic='-fno-common'
+      ;;
+
+    haiku*)
+      # PIC is the default for Haiku.
+      # The "-static" flag exists, but is broken.
+      lt_prog_compiler_static=
+      ;;
+
+    hpux*)
+      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
+      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
+      # sets the default TLS model and affects inlining.
+      case $host_cpu in
+      hppa*64*)
+	# +Z the default
+	;;
+      *)
+	lt_prog_compiler_pic='-fPIC'
+	;;
+      esac
+      ;;
+
+    interix[3-9]*)
+      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
+      # Instead, we relocate shared libraries at runtime.
+      ;;
+
+    msdosdjgpp*)
+      # Just because we use GCC doesn't mean we suddenly get shared libraries
+      # on systems that don't support them.
+      lt_prog_compiler_can_build_shared=no
+      enable_shared=no
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      lt_prog_compiler_pic='-fPIC -shared'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	lt_prog_compiler_pic=-Kconform_pic
+      fi
+      ;;
+
+    *)
+      lt_prog_compiler_pic='-fPIC'
+      ;;
+    esac
+
+    case $cc_basename in
+    nvcc*) # Cuda Compiler Driver 2.2
+      lt_prog_compiler_wl='-Xlinker '
+      if test -n "$lt_prog_compiler_pic"; then
+        lt_prog_compiler_pic="-Xcompiler $lt_prog_compiler_pic"
+      fi
+      ;;
+    esac
+  else
+    # PORTME Check for flag to pass linker flags through the system compiler.
+    case $host_os in
+    aix*)
+      lt_prog_compiler_wl='-Wl,'
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_prog_compiler_static='-Bstatic'
+      else
+	lt_prog_compiler_static='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_prog_compiler_pic='-DDLL_EXPORT'
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      lt_prog_compiler_wl='-Wl,'
+      # PIC is the default for IA64 HP-UX and 64-bit HP-UX, but
+      # not for PA HP-UX.
+      case $host_cpu in
+      hppa*64*|ia64*)
+	# +Z the default
+	;;
+      *)
+	lt_prog_compiler_pic='+Z'
+	;;
+      esac
+      # Is there a better lt_prog_compiler_static that works with the bundled CC?
+      lt_prog_compiler_static='${wl}-a ${wl}archive'
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      lt_prog_compiler_wl='-Wl,'
+      # PIC (with -KPIC) is the default.
+      lt_prog_compiler_static='-non_shared'
+      ;;
+
+    linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+      case $cc_basename in
+      # old Intel for x86_64 which still supported -KPIC.
+      ecc*)
+	lt_prog_compiler_wl='-Wl,'
+	lt_prog_compiler_pic='-KPIC'
+	lt_prog_compiler_static='-static'
+        ;;
+      # icc used to be incompatible with GCC.
+      # ICC 10 doesn't accept -KPIC any more.
+      icc* | ifort*)
+	lt_prog_compiler_wl='-Wl,'
+	lt_prog_compiler_pic='-fPIC'
+	lt_prog_compiler_static='-static'
+        ;;
+      # Lahey Fortran 8.1.
+      lf95*)
+	lt_prog_compiler_wl='-Wl,'
+	lt_prog_compiler_pic='--shared'
+	lt_prog_compiler_static='--static'
+	;;
+      nagfor*)
+	# NAG Fortran compiler
+	lt_prog_compiler_wl='-Wl,-Wl,,'
+	lt_prog_compiler_pic='-PIC'
+	lt_prog_compiler_static='-Bstatic'
+	;;
+      pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)
+        # Portland Group compilers (*not* the Pentium gcc compiler,
+	# which looks to be a dead project)
+	lt_prog_compiler_wl='-Wl,'
+	lt_prog_compiler_pic='-fpic'
+	lt_prog_compiler_static='-Bstatic'
+        ;;
+      ccc*)
+        lt_prog_compiler_wl='-Wl,'
+        # All Alpha code is PIC.
+        lt_prog_compiler_static='-non_shared'
+        ;;
+      xl* | bgxl* | bgf* | mpixl*)
+	# IBM XL C 8.0/Fortran 10.1, 11.1 on PPC and BlueGene
+	lt_prog_compiler_wl='-Wl,'
+	lt_prog_compiler_pic='-qpic'
+	lt_prog_compiler_static='-qstaticlink'
+	;;
+      *)
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ Ceres\ Fortran* | *Sun*Fortran*\ [1-7].* | *Sun*Fortran*\ 8.[0-3]*)
+	  # Sun Fortran 8.3 passes all unrecognized flags to the linker
+	  lt_prog_compiler_pic='-KPIC'
+	  lt_prog_compiler_static='-Bstatic'
+	  lt_prog_compiler_wl=''
+	  ;;
+	*Sun\ F* | *Sun*Fortran*)
+	  lt_prog_compiler_pic='-KPIC'
+	  lt_prog_compiler_static='-Bstatic'
+	  lt_prog_compiler_wl='-Qoption ld '
+	  ;;
+	*Sun\ C*)
+	  # Sun C 5.9
+	  lt_prog_compiler_pic='-KPIC'
+	  lt_prog_compiler_static='-Bstatic'
+	  lt_prog_compiler_wl='-Wl,'
+	  ;;
+        *Intel*\ [CF]*Compiler*)
+	  lt_prog_compiler_wl='-Wl,'
+	  lt_prog_compiler_pic='-fPIC'
+	  lt_prog_compiler_static='-static'
+	  ;;
+	*Portland\ Group*)
+	  lt_prog_compiler_wl='-Wl,'
+	  lt_prog_compiler_pic='-fpic'
+	  lt_prog_compiler_static='-Bstatic'
+	  ;;
+	esac
+	;;
+      esac
+      ;;
+
+    newsos6)
+      lt_prog_compiler_pic='-KPIC'
+      lt_prog_compiler_static='-Bstatic'
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      lt_prog_compiler_pic='-fPIC -shared'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      lt_prog_compiler_wl='-Wl,'
+      # All OSF/1 code is PIC.
+      lt_prog_compiler_static='-non_shared'
+      ;;
+
+    rdos*)
+      lt_prog_compiler_static='-non_shared'
+      ;;
+
+    solaris*)
+      lt_prog_compiler_pic='-KPIC'
+      lt_prog_compiler_static='-Bstatic'
+      case $cc_basename in
+      f77* | f90* | f95* | sunf77* | sunf90* | sunf95*)
+	lt_prog_compiler_wl='-Qoption ld ';;
+      *)
+	lt_prog_compiler_wl='-Wl,';;
+      esac
+      ;;
+
+    sunos4*)
+      lt_prog_compiler_wl='-Qoption ld '
+      lt_prog_compiler_pic='-PIC'
+      lt_prog_compiler_static='-Bstatic'
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3*)
+      lt_prog_compiler_wl='-Wl,'
+      lt_prog_compiler_pic='-KPIC'
+      lt_prog_compiler_static='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+	lt_prog_compiler_pic='-Kconform_pic'
+	lt_prog_compiler_static='-Bstatic'
+      fi
+      ;;
+
+    sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
+      lt_prog_compiler_wl='-Wl,'
+      lt_prog_compiler_pic='-KPIC'
+      lt_prog_compiler_static='-Bstatic'
+      ;;
+
+    unicos*)
+      lt_prog_compiler_wl='-Wl,'
+      lt_prog_compiler_can_build_shared=no
+      ;;
+
+    uts4*)
+      lt_prog_compiler_pic='-pic'
+      lt_prog_compiler_static='-Bstatic'
+      ;;
+
+    *)
+      lt_prog_compiler_can_build_shared=no
+      ;;
+    esac
+  fi
+
+case $host_os in
+  # For platforms which do not support PIC, -DPIC is meaningless:
+  *djgpp*)
+    lt_prog_compiler_pic=
+    ;;
+  *)
+    lt_prog_compiler_pic="$lt_prog_compiler_pic -DPIC"
+    ;;
+esac
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $compiler option to produce PIC" >&5
+$as_echo_n "checking for $compiler option to produce PIC... " >&6; }
+if ${lt_cv_prog_compiler_pic+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic=$lt_prog_compiler_pic
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic" >&5
+$as_echo "$lt_cv_prog_compiler_pic" >&6; }
+lt_prog_compiler_pic=$lt_cv_prog_compiler_pic
+
+#
+# Check to make sure the PIC flag actually works.
+#
+if test -n "$lt_prog_compiler_pic"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler PIC flag $lt_prog_compiler_pic works" >&5
+$as_echo_n "checking if $compiler PIC flag $lt_prog_compiler_pic works... " >&6; }
+if ${lt_cv_prog_compiler_pic_works+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic_works=no
+   ac_outfile=conftest.$ac_objext
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+   lt_compiler_flag="$lt_prog_compiler_pic -DPIC"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   # The option is referenced via a variable to avoid confusing sed.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>conftest.err)
+   ac_status=$?
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s "$ac_outfile"; then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings other than the usual output.
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
+     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_pic_works=yes
+     fi
+   fi
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_works" >&5
+$as_echo "$lt_cv_prog_compiler_pic_works" >&6; }
+
+if test x"$lt_cv_prog_compiler_pic_works" = xyes; then
+    case $lt_prog_compiler_pic in
+     "" | " "*) ;;
+     *) lt_prog_compiler_pic=" $lt_prog_compiler_pic" ;;
+     esac
+else
+    lt_prog_compiler_pic=
+     lt_prog_compiler_can_build_shared=no
+fi
+
+fi
+
+
+
+
+
+
+
+
+
+
+
+#
+# Check to make sure the static flag actually works.
+#
+wl=$lt_prog_compiler_wl eval lt_tmp_static_flag=\"$lt_prog_compiler_static\"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler static flag $lt_tmp_static_flag works" >&5
+$as_echo_n "checking if $compiler static flag $lt_tmp_static_flag works... " >&6; }
+if ${lt_cv_prog_compiler_static_works+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_static_works=no
+   save_LDFLAGS="$LDFLAGS"
+   LDFLAGS="$LDFLAGS $lt_tmp_static_flag"
+   echo "$lt_simple_link_test_code" > conftest.$ac_ext
+   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
+     # The linker can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     if test -s conftest.err; then
+       # Append any errors to the config.log.
+       cat conftest.err 1>&5
+       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
+       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+       if diff conftest.exp conftest.er2 >/dev/null; then
+         lt_cv_prog_compiler_static_works=yes
+       fi
+     else
+       lt_cv_prog_compiler_static_works=yes
+     fi
+   fi
+   $RM -r conftest*
+   LDFLAGS="$save_LDFLAGS"
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_static_works" >&5
+$as_echo "$lt_cv_prog_compiler_static_works" >&6; }
+
+if test x"$lt_cv_prog_compiler_static_works" = xyes; then
+    :
+else
+    lt_prog_compiler_static=
+fi
+
+
+
+
+
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o" >&5
+$as_echo "$lt_cv_prog_compiler_c_o" >&6; }
+
+
+
+
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o" >&5
+$as_echo "$lt_cv_prog_compiler_c_o" >&6; }
+
+
+
+
+hard_links="nottested"
+if test "$lt_cv_prog_compiler_c_o" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if we can lock with hard links" >&5
+$as_echo_n "checking if we can lock with hard links... " >&6; }
+  hard_links=yes
+  $RM conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hard_links" >&5
+$as_echo "$hard_links" >&6; }
+  if test "$hard_links" = no; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&5
+$as_echo "$as_me: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2;}
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+
+
+
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
+$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
+
+  runpath_var=
+  allow_undefined_flag=
+  always_export_symbols=no
+  archive_cmds=
+  archive_expsym_cmds=
+  compiler_needs_object=no
+  enable_shared_with_static_runtimes=no
+  export_dynamic_flag_spec=
+  export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+  hardcode_automatic=no
+  hardcode_direct=no
+  hardcode_direct_absolute=no
+  hardcode_libdir_flag_spec=
+  hardcode_libdir_separator=
+  hardcode_minus_L=no
+  hardcode_shlibpath_var=unsupported
+  inherit_rpath=no
+  link_all_deplibs=unknown
+  module_cmds=
+  module_expsym_cmds=
+  old_archive_from_new_cmds=
+  old_archive_from_expsyms_cmds=
+  thread_safe_flag_spec=
+  whole_archive_flag_spec=
+  # include_expsyms should be a list of space-separated symbols to be *always*
+  # included in the symbol list
+  include_expsyms=
+  # exclude_expsyms can be an extended regexp of symbols to exclude
+  # it will be wrapped by ` (' and `)$', so one must not match beginning or
+  # end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+  # as well as any symbol that contains `d'.
+  exclude_expsyms='_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'
+  # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+  # platforms (ab)use it in PIC code, but their linkers get confused if
+  # the symbol is explicitly referenced.  Since portable code cannot
+  # rely on this symbol name, it's probably fine to never include it in
+  # preloaded symbol tables.
+  # Exclude shared library initialization/finalization symbols.
+  extract_expsyms_cmds=
+
+  case $host_os in
+  cygwin* | mingw* | pw32* | cegcc*)
+    # FIXME: the MSVC++ port hasn't been tested in a loooong time
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    if test "$GCC" != yes; then
+      with_gnu_ld=no
+    fi
+    ;;
+  interix*)
+    # we just hope/assume this is gcc and not c89 (= MSVC++)
+    with_gnu_ld=yes
+    ;;
+  openbsd*)
+    with_gnu_ld=no
+    ;;
+  linux* | k*bsd*-gnu | gnu*)
+    link_all_deplibs=no
+    ;;
+  esac
+
+  ld_shlibs=yes
+
+  # On some targets, GNU ld is compatible enough with the native linker
+  # that we're better off using the native interface for both.
+  lt_use_gnu_ld_interface=no
+  if test "$with_gnu_ld" = yes; then
+    case $host_os in
+      aix*)
+	# The AIX port of GNU ld has always aspired to compatibility
+	# with the native linker.  However, as the warning in the GNU ld
+	# block says, versions before 2.19.5* couldn't really create working
+	# shared libraries, regardless of the interface used.
+	case `$LD -v 2>&1` in
+	  *\ \(GNU\ Binutils\)\ 2.19.5*) ;;
+	  *\ \(GNU\ Binutils\)\ 2.[2-9]*) ;;
+	  *\ \(GNU\ Binutils\)\ [3-9]*) ;;
+	  *)
+	    lt_use_gnu_ld_interface=yes
+	    ;;
+	esac
+	;;
+      *)
+	lt_use_gnu_ld_interface=yes
+	;;
+    esac
+  fi
+
+  if test "$lt_use_gnu_ld_interface" = yes; then
+    # If archive_cmds runs LD, not CC, wlarc should be empty
+    wlarc='${wl}'
+
+    # Set some defaults for GNU ld with shared library support. These
+    # are reset later if shared libraries are not supported. Putting them
+    # here allows them to be overridden if necessary.
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    export_dynamic_flag_spec='${wl}--export-dynamic'
+    # ancient GNU ld didn't support --whole-archive et. al.
+    if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then
+      whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+    else
+      whole_archive_flag_spec=
+    fi
+    supports_anon_versioning=no
+    case `$LD -v 2>&1` in
+      *GNU\ gold*) supports_anon_versioning=yes ;;
+      *\ [01].* | *\ 2.[0-9].* | *\ 2.10.*) ;; # catch versions < 2.11
+      *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ...
+      *\ 2.11.92.0.12\ *) supports_anon_versioning=yes ;; # Mandrake 8.2 ...
+      *\ 2.11.*) ;; # other 2.11 versions
+      *) supports_anon_versioning=yes ;;
+    esac
+
+    # See if GNU ld supports shared libraries.
+    case $host_os in
+    aix[3-9]*)
+      # On AIX/PPC, the GNU linker is very broken
+      if test "$host_cpu" != ia64; then
+	ld_shlibs=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.19, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to install binutils
+*** 2.20 or above, or modify your PATH so that a non-GNU linker is found.
+*** You will then need to restart the configuration process.
+
+_LT_EOF
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            archive_expsym_cmds=''
+        ;;
+      m68k)
+            archive_cmds='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            hardcode_libdir_flag_spec='-L$libdir'
+            hardcode_minus_L=yes
+        ;;
+      esac
+      ;;
+
+    beos*)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	allow_undefined_flag=unsupported
+	# Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+	# support --undefined.  This deserves some investigation.  FIXME
+	archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      else
+	ld_shlibs=no
+      fi
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # _LT_TAGVAR(hardcode_libdir_flag_spec, ) is actually meaningless,
+      # as there is no search path for DLLs.
+      hardcode_libdir_flag_spec='-L$libdir'
+      export_dynamic_flag_spec='${wl}--export-all-symbols'
+      allow_undefined_flag=unsupported
+      always_export_symbols=no
+      enable_shared_with_static_runtimes=yes
+      export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1 DATA/;s/^.*[ ]__nm__\([^ ]*\)[ ][^ ]*/\1 DATA/;/^I[ ]/d;/^[AITW][ ]/s/.* //'\'' | sort | uniq > $export_symbols'
+      exclude_expsyms='[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'
+
+      if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
+        archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	# If the export-symbols file already is a .def file (1st line
+	# is EXPORTS), use it as is; otherwise, prepend...
+	archive_expsym_cmds='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	  cp $export_symbols $output_objdir/$soname.def;
+	else
+	  echo EXPORTS > $output_objdir/$soname.def;
+	  cat $export_symbols >> $output_objdir/$soname.def;
+	fi~
+	$CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+      else
+	ld_shlibs=no
+      fi
+      ;;
+
+    haiku*)
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      link_all_deplibs=yes
+      ;;
+
+    interix[3-9]*)
+      hardcode_direct=no
+      hardcode_shlibpath_var=no
+      hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+      export_dynamic_flag_spec='${wl}-E'
+      # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
+      # Instead, shared libraries are loaded at an image base (0x10000000 by
+      # default) and relocated if they conflict, which is a slow very memory
+      # consuming and fragmenting process.  To avoid this, we pick a random,
+      # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
+      # time.  Moving up from 0x10000000 also allows more sbrk(2) space.
+      archive_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      archive_expsym_cmds='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      ;;
+
+    gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu)
+      tmp_diet=no
+      if test "$host_os" = linux-dietlibc; then
+	case $cc_basename in
+	  diet\ *) tmp_diet=yes;;	# linux-dietlibc with static linking (!diet-dyn)
+	esac
+      fi
+      if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \
+	 && test "$tmp_diet" = no
+      then
+	tmp_addflag=' $pic_flag'
+	tmp_sharedflag='-shared'
+	case $cc_basename,$host_cpu in
+        pgcc*)				# Portland Group C compiler
+	  whole_archive_flag_spec='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag'
+	  ;;
+	pgf77* | pgf90* | pgf95* | pgfortran*)
+					# Portland Group f77 and f90 compilers
+	  whole_archive_flag_spec='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag -Mnomain' ;;
+	ecc*,ia64* | icc*,ia64*)	# Intel C compiler on ia64
+	  tmp_addflag=' -i_dynamic' ;;
+	efc*,ia64* | ifort*,ia64*)	# Intel Fortran compiler on ia64
+	  tmp_addflag=' -i_dynamic -nofor_main' ;;
+	ifc* | ifort*)			# Intel Fortran compiler
+	  tmp_addflag=' -nofor_main' ;;
+	lf95*)				# Lahey Fortran 8.1
+	  whole_archive_flag_spec=
+	  tmp_sharedflag='--shared' ;;
+	xl[cC]* | bgxl[cC]* | mpixl[cC]*) # IBM XL C 8.0 on PPC (deal with xlf below)
+	  tmp_sharedflag='-qmkshrobj'
+	  tmp_addflag= ;;
+	nvcc*)	# Cuda Compiler Driver 2.2
+	  whole_archive_flag_spec='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  compiler_needs_object=yes
+	  ;;
+	esac
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ C*)			# Sun C 5.9
+	  whole_archive_flag_spec='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  compiler_needs_object=yes
+	  tmp_sharedflag='-G' ;;
+	*Sun\ F*)			# Sun Fortran 8.3
+	  tmp_sharedflag='-G' ;;
+	esac
+	archive_cmds='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+
+        if test "x$supports_anon_versioning" = xyes; then
+          archive_expsym_cmds='echo "{ global:" > $output_objdir/$libname.ver~
+	    cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	    echo "local: *; };" >> $output_objdir/$libname.ver~
+	    $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+        fi
+
+	case $cc_basename in
+	xlf* | bgf* | bgxlf* | mpixlf*)
+	  # IBM XL Fortran 10.1 on PPC cannot create shared libs itself
+	  whole_archive_flag_spec='--whole-archive$convenience --no-whole-archive'
+	  hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+	  archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib'
+	  if test "x$supports_anon_versioning" = xyes; then
+	    archive_expsym_cmds='echo "{ global:" > $output_objdir/$libname.ver~
+	      cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	      echo "local: *; };" >> $output_objdir/$libname.ver~
+	      $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
+	  fi
+	  ;;
+	esac
+      else
+        ld_shlibs=no
+      fi
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+	wlarc=
+      else
+	archive_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      fi
+      ;;
+
+    solaris*)
+      if $LD -v 2>&1 | $GREP 'BFD 2\.8' > /dev/null; then
+	ld_shlibs=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+      elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	archive_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	ld_shlibs=no
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*)
+      case `$LD -v 2>&1` in
+        *\ [01].* | *\ 2.[0-9].* | *\ 2.1[0-5].*)
+	ld_shlibs=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 can not
+*** reliably create shared libraries on SCO systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.16.91.0.3 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+	;;
+	*)
+	  # For security reasons, it is highly recommended that you always
+	  # use absolute paths for naming shared libraries, and exclude the
+	  # DT_RUNPATH tag from executables and libraries.  But doing so
+	  # requires that you compile everything twice, which is a pain.
+	  if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+	    archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	  else
+	    ld_shlibs=no
+	  fi
+	;;
+      esac
+      ;;
+
+    sunos4*)
+      archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      wlarc=
+      hardcode_direct=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    *)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	archive_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	ld_shlibs=no
+      fi
+      ;;
+    esac
+
+    if test "$ld_shlibs" = no; then
+      runpath_var=
+      hardcode_libdir_flag_spec=
+      export_dynamic_flag_spec=
+      whole_archive_flag_spec=
+    fi
+  else
+    # PORTME fill in a description of your system's linker (not GNU ld)
+    case $host_os in
+    aix3*)
+      allow_undefined_flag=unsupported
+      always_export_symbols=yes
+      archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+      # Note: this linker hardcodes the directories in LIBPATH if there
+      # are no directories specified by -L.
+      hardcode_minus_L=yes
+      if test "$GCC" = yes && test -z "$lt_prog_compiler_static"; then
+	# Neither direct hardcoding nor static linking is supported with a
+	# broken collect2.
+	hardcode_direct=unsupported
+      fi
+      ;;
+
+    aix[4-9]*)
+      if test "$host_cpu" = ia64; then
+	# On IA64, the linker does run time linking by default, so we don't
+	# have to do anything special.
+	aix_use_runtimelinking=no
+	exp_sym_flag='-Bexport'
+	no_entry_flag=""
+      else
+	# If we're using GNU nm, then we don't want the "-C" option.
+	# -C means demangle to AIX nm, but means don't demangle with GNU nm
+	# Also, AIX nm treats weak defined symbols like other global
+	# defined symbols, whereas GNU nm marks them as "W".
+	if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
+	  export_symbols_cmds='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	else
+	  export_symbols_cmds='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	fi
+	aix_use_runtimelinking=no
+
+	# Test if we are trying to use run time linking or normal
+	# AIX style linking. If -brtl is somewhere in LDFLAGS, we
+	# need to do runtime linking.
+	case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*)
+	  for ld_flag in $LDFLAGS; do
+	  if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then
+	    aix_use_runtimelinking=yes
+	    break
+	  fi
+	  done
+	  ;;
+	esac
+
+	exp_sym_flag='-bexport'
+	no_entry_flag='-bnoentry'
+      fi
+
+      # When large executables or shared objects are built, AIX ld can
+      # have problems creating the table of contents.  If linking a library
+      # or program results in "error TOC overflow" add -mminimal-toc to
+      # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+      # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+      archive_cmds=''
+      hardcode_direct=yes
+      hardcode_direct_absolute=yes
+      hardcode_libdir_separator=':'
+      link_all_deplibs=yes
+      file_list_spec='${wl}-f,'
+
+      if test "$GCC" = yes; then
+	case $host_os in aix4.[012]|aix4.[012].*)
+	# We only want to do this on AIX 4.2 and lower, the check
+	# below for broken collect2 doesn't work under 4.3+
+	  collect2name=`${CC} -print-prog-name=collect2`
+	  if test -f "$collect2name" &&
+	   strings "$collect2name" | $GREP resolve_lib_name >/dev/null
+	  then
+	  # We have reworked collect2
+	  :
+	  else
+	  # We have old collect2
+	  hardcode_direct=unsupported
+	  # It fails to find uninstalled libraries when the uninstalled
+	  # path is not listed in the libpath.  Setting hardcode_minus_L
+	  # to unsupported forces relinking
+	  hardcode_minus_L=yes
+	  hardcode_libdir_flag_spec='-L$libdir'
+	  hardcode_libdir_separator=
+	  fi
+	  ;;
+	esac
+	shared_flag='-shared'
+	if test "$aix_use_runtimelinking" = yes; then
+	  shared_flag="$shared_flag "'${wl}-G'
+	fi
+	link_all_deplibs=no
+      else
+	# not using gcc
+	if test "$host_cpu" = ia64; then
+	# VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
+	# chokes on -Wl,-G. The following line is correct:
+	  shared_flag='-G'
+	else
+	  if test "$aix_use_runtimelinking" = yes; then
+	    shared_flag='${wl}-G'
+	  else
+	    shared_flag='${wl}-bM:SRE'
+	  fi
+	fi
+      fi
+
+      export_dynamic_flag_spec='${wl}-bexpall'
+      # It seems that -bexpall does not export symbols beginning with
+      # underscore (_), so it is better to generate a list of symbols to export.
+      always_export_symbols=yes
+      if test "$aix_use_runtimelinking" = yes; then
+	# Warning - without using the other runtime loading flags (-brtl),
+	# -berok will link without error, but may produce a broken library.
+	allow_undefined_flag='-berok'
+        # Determine the default libpath from the value encoded in an
+        # empty executable.
+        if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath_=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath_"; then
+    lt_cv_aix_libpath_=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath_"; then
+    lt_cv_aix_libpath_="/usr/lib:/lib"
+  fi
+
+fi
+
+  aix_libpath=$lt_cv_aix_libpath_
+fi
+
+        hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath"
+        archive_expsym_cmds='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+      else
+	if test "$host_cpu" = ia64; then
+	  hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+	  allow_undefined_flag="-z nodefs"
+	  archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+	else
+	 # Determine the default libpath from the value encoded in an
+	 # empty executable.
+	 if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath_=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath_"; then
+    lt_cv_aix_libpath_=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath_"; then
+    lt_cv_aix_libpath_="/usr/lib:/lib"
+  fi
+
+fi
+
+  aix_libpath=$lt_cv_aix_libpath_
+fi
+
+	 hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath"
+	  # Warning - without using the other run time loading flags,
+	  # -berok will link without error, but may produce a broken library.
+	  no_undefined_flag=' ${wl}-bernotok'
+	  allow_undefined_flag=' ${wl}-berok'
+	  if test "$with_gnu_ld" = yes; then
+	    # We only use this code for GNU lds that support --whole-archive.
+	    whole_archive_flag_spec='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	  else
+	    # Exported symbols can be pulled into shared objects from archives
+	    whole_archive_flag_spec='$convenience'
+	  fi
+	  archive_cmds_need_lc=yes
+	  # This is similar to how AIX traditionally builds its shared libraries.
+	  archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+	fi
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            archive_expsym_cmds=''
+        ;;
+      m68k)
+            archive_cmds='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            hardcode_libdir_flag_spec='-L$libdir'
+            hardcode_minus_L=yes
+        ;;
+      esac
+      ;;
+
+    bsdi[45]*)
+      export_dynamic_flag_spec=-rdynamic
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # When not using gcc, we currently assume that we are using
+      # Microsoft Visual C++.
+      # hardcode_libdir_flag_spec is actually meaningless, as there is
+      # no search path for DLLs.
+      case $cc_basename in
+      cl*)
+	# Native MSVC
+	hardcode_libdir_flag_spec=' '
+	allow_undefined_flag=unsupported
+	always_export_symbols=yes
+	file_list_spec='@'
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	archive_cmds='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
+	archive_expsym_cmds='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	    sed -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
+	  else
+	    sed -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
+	  fi~
+	  $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+	  linknames='
+	# The linker will not automatically build a static lib if we build a DLL.
+	# _LT_TAGVAR(old_archive_from_new_cmds, )='true'
+	enable_shared_with_static_runtimes=yes
+	exclude_expsyms='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
+	export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1,DATA/'\'' | $SED -e '\''/^[AITW][ ]/s/.*[ ]//'\'' | sort | uniq > $export_symbols'
+	# Don't use ranlib
+	old_postinstall_cmds='chmod 644 $oldlib'
+	postlink_cmds='lt_outputfile="@OUTPUT@"~
+	  lt_tool_outputfile="@TOOL_OUTPUT@"~
+	  case $lt_outputfile in
+	    *.exe|*.EXE) ;;
+	    *)
+	      lt_outputfile="$lt_outputfile.exe"
+	      lt_tool_outputfile="$lt_tool_outputfile.exe"
+	      ;;
+	  esac~
+	  if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
+	    $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+	    $RM "$lt_outputfile.manifest";
+	  fi'
+	;;
+      *)
+	# Assume MSVC wrapper
+	hardcode_libdir_flag_spec=' '
+	allow_undefined_flag=unsupported
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	archive_cmds='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames='
+	# The linker will automatically build a .lib file if we build a DLL.
+	old_archive_from_new_cmds='true'
+	# FIXME: Should let the user specify the lib program.
+	old_archive_cmds='lib -OUT:$oldlib$oldobjs$old_deplibs'
+	enable_shared_with_static_runtimes=yes
+	;;
+      esac
+      ;;
+
+    darwin* | rhapsody*)
+
+
+  archive_cmds_need_lc=no
+  hardcode_direct=no
+  hardcode_automatic=yes
+  hardcode_shlibpath_var=unsupported
+  if test "$lt_cv_ld_force_load" = "yes"; then
+    whole_archive_flag_spec='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
+
+  else
+    whole_archive_flag_spec=''
+  fi
+  link_all_deplibs=yes
+  allow_undefined_flag="$_lt_dar_allow_undefined"
+  case $cc_basename in
+     ifort*) _lt_dar_can_shared=yes ;;
+     *) _lt_dar_can_shared=$GCC ;;
+  esac
+  if test "$_lt_dar_can_shared" = "yes"; then
+    output_verbose_link_cmd=func_echo_all
+    archive_cmds="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
+    module_cmds="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
+    archive_expsym_cmds="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
+    module_expsym_cmds="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
+
+  else
+  ld_shlibs=no
+  fi
+
+      ;;
+
+    dgux*)
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_libdir_flag_spec='-L$libdir'
+      hardcode_shlibpath_var=no
+      ;;
+
+    # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+    # support.  Future versions do this automatically, but an explicit c++rt0.o
+    # does not break anything, and helps significantly (at the cost of a little
+    # extra space).
+    freebsd2.2*)
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+      hardcode_libdir_flag_spec='-R$libdir'
+      hardcode_direct=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+    freebsd2.*)
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_direct=yes
+      hardcode_minus_L=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+    freebsd* | dragonfly*)
+      archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+      hardcode_libdir_flag_spec='-R$libdir'
+      hardcode_direct=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    hpux9*)
+      if test "$GCC" = yes; then
+	archive_cmds='$RM $output_objdir/$soname~$CC -shared $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      else
+	archive_cmds='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      fi
+      hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+      hardcode_libdir_separator=:
+      hardcode_direct=yes
+
+      # hardcode_minus_L: Not really in the search PATH,
+      # but as the default location of the library.
+      hardcode_minus_L=yes
+      export_dynamic_flag_spec='${wl}-E'
+      ;;
+
+    hpux10*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	archive_cmds='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      if test "$with_gnu_ld" = no; then
+	hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+	hardcode_libdir_separator=:
+	hardcode_direct=yes
+	hardcode_direct_absolute=yes
+	export_dynamic_flag_spec='${wl}-E'
+	# hardcode_minus_L: Not really in the search PATH,
+	# but as the default location of the library.
+	hardcode_minus_L=yes
+      fi
+      ;;
+
+    hpux11*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	case $host_cpu in
+	hppa*64*)
+	  archive_cmds='$CC -shared ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  archive_cmds='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	  archive_cmds='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	esac
+      else
+	case $host_cpu in
+	hppa*64*)
+	  archive_cmds='$CC -b ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  archive_cmds='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+
+	  # Older versions of the 11.00 compiler do not understand -b yet
+	  # (HP92453-01 A.11.01.20 doesn't, HP92453-01 B.11.X.35175-35176.GP does)
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $CC understands -b" >&5
+$as_echo_n "checking if $CC understands -b... " >&6; }
+if ${lt_cv_prog_compiler__b+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler__b=no
+   save_LDFLAGS="$LDFLAGS"
+   LDFLAGS="$LDFLAGS -b"
+   echo "$lt_simple_link_test_code" > conftest.$ac_ext
+   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
+     # The linker can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     if test -s conftest.err; then
+       # Append any errors to the config.log.
+       cat conftest.err 1>&5
+       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
+       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+       if diff conftest.exp conftest.er2 >/dev/null; then
+         lt_cv_prog_compiler__b=yes
+       fi
+     else
+       lt_cv_prog_compiler__b=yes
+     fi
+   fi
+   $RM -r conftest*
+   LDFLAGS="$save_LDFLAGS"
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler__b" >&5
+$as_echo "$lt_cv_prog_compiler__b" >&6; }
+
+if test x"$lt_cv_prog_compiler__b" = xyes; then
+    archive_cmds='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+else
+    archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
+fi
+
+	  ;;
+	esac
+      fi
+      if test "$with_gnu_ld" = no; then
+	hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+	hardcode_libdir_separator=:
+
+	case $host_cpu in
+	hppa*64*|ia64*)
+	  hardcode_direct=no
+	  hardcode_shlibpath_var=no
+	  ;;
+	*)
+	  hardcode_direct=yes
+	  hardcode_direct_absolute=yes
+	  export_dynamic_flag_spec='${wl}-E'
+
+	  # hardcode_minus_L: Not really in the search PATH,
+	  # but as the default location of the library.
+	  hardcode_minus_L=yes
+	  ;;
+	esac
+      fi
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      if test "$GCC" = yes; then
+	archive_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	# Try to use the -exported_symbol ld option, if it does not
+	# work, assume that -exports_file does not work either and
+	# implicitly export all symbols.
+	# This should be the same for all languages, so no per-tag cache variable.
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $host_os linker accepts -exported_symbol" >&5
+$as_echo_n "checking whether the $host_os linker accepts -exported_symbol... " >&6; }
+if ${lt_cv_irix_exported_symbol+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  save_LDFLAGS="$LDFLAGS"
+	   LDFLAGS="$LDFLAGS -shared ${wl}-exported_symbol ${wl}foo ${wl}-update_registry ${wl}/dev/null"
+	   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+int foo (void) { return 0; }
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  lt_cv_irix_exported_symbol=yes
+else
+  lt_cv_irix_exported_symbol=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+           LDFLAGS="$save_LDFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_irix_exported_symbol" >&5
+$as_echo "$lt_cv_irix_exported_symbol" >&6; }
+	if test "$lt_cv_irix_exported_symbol" = yes; then
+          archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations ${wl}-exports_file ${wl}$export_symbols -o $lib'
+	fi
+      else
+	archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -exports_file $export_symbols -o $lib'
+      fi
+      archive_cmds_need_lc='no'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator=:
+      inherit_rpath=yes
+      link_all_deplibs=yes
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+      else
+	archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+      fi
+      hardcode_libdir_flag_spec='-R$libdir'
+      hardcode_direct=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    newsos6)
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_direct=yes
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator=:
+      hardcode_shlibpath_var=no
+      ;;
+
+    *nto* | *qnx*)
+      ;;
+
+    openbsd*)
+      if test -f /usr/libexec/ld.so; then
+	hardcode_direct=yes
+	hardcode_shlibpath_var=no
+	hardcode_direct_absolute=yes
+	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	  archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags ${wl}-retain-symbols-file,$export_symbols'
+	  hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+	  export_dynamic_flag_spec='${wl}-E'
+	else
+	  case $host_os in
+	   openbsd[01].* | openbsd2.[0-7] | openbsd2.[0-7].*)
+	     archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+	     hardcode_libdir_flag_spec='-R$libdir'
+	     ;;
+	   *)
+	     archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	     hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+	     ;;
+	  esac
+	fi
+      else
+	ld_shlibs=no
+      fi
+      ;;
+
+    os2*)
+      hardcode_libdir_flag_spec='-L$libdir'
+      hardcode_minus_L=yes
+      allow_undefined_flag=unsupported
+      archive_cmds='$ECHO "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~echo DATA >> $output_objdir/$libname.def~echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+      old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+      ;;
+
+    osf3*)
+      if test "$GCC" = yes; then
+	allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+	archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      else
+	allow_undefined_flag=' -expect_unresolved \*'
+	archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+      fi
+      archive_cmds_need_lc='no'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator=:
+      ;;
+
+    osf4* | osf5*)	# as osf3* with the addition of -msym flag
+      if test "$GCC" = yes; then
+	allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+	archive_cmds='$CC -shared${allow_undefined_flag} $pic_flag $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+      else
+	allow_undefined_flag=' -expect_unresolved \*'
+	archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	archive_expsym_cmds='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~
+	$CC -shared${allow_undefined_flag} ${wl}-input ${wl}$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~$RM $lib.exp'
+
+	# Both c and cxx compiler support -rpath directly
+	hardcode_libdir_flag_spec='-rpath $libdir'
+      fi
+      archive_cmds_need_lc='no'
+      hardcode_libdir_separator=:
+      ;;
+
+    solaris*)
+      no_undefined_flag=' -z defs'
+      if test "$GCC" = yes; then
+	wlarc='${wl}'
+	archive_cmds='$CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-M ${wl}$lib.exp ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+      else
+	case `$CC -V 2>&1` in
+	*"Compilers 5.0"*)
+	  wlarc=''
+	  archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  archive_expsym_cmds='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
+	  ;;
+	*)
+	  wlarc='${wl}'
+	  archive_cmds='$CC -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+	  ;;
+	esac
+      fi
+      hardcode_libdir_flag_spec='-R$libdir'
+      hardcode_shlibpath_var=no
+      case $host_os in
+      solaris2.[0-5] | solaris2.[0-5].*) ;;
+      *)
+	# The compiler driver will combine and reorder linker options,
+	# but understands `-z linker_flag'.  GCC discards it without `$wl',
+	# but is careful enough not to reorder.
+	# Supported since Solaris 2.6 (maybe 2.5.1?)
+	if test "$GCC" = yes; then
+	  whole_archive_flag_spec='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+	else
+	  whole_archive_flag_spec='-z allextract$convenience -z defaultextract'
+	fi
+	;;
+      esac
+      link_all_deplibs=yes
+      ;;
+
+    sunos4*)
+      if test "x$host_vendor" = xsequent; then
+	# Use $CC to link under sequent, because it throws in some extra .o
+	# files that make .init and .fini sections work.
+	archive_cmds='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      hardcode_libdir_flag_spec='-L$libdir'
+      hardcode_direct=yes
+      hardcode_minus_L=yes
+      hardcode_shlibpath_var=no
+      ;;
+
+    sysv4)
+      case $host_vendor in
+	sni)
+	  archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  hardcode_direct=yes # is this really true???
+	;;
+	siemens)
+	  ## LD is ld it makes a PLAMLIB
+	  ## CC just makes a GrossModule.
+	  archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+	  reload_cmds='$CC -r -o $output$reload_objs'
+	  hardcode_direct=no
+        ;;
+	motorola)
+	  archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+	;;
+      esac
+      runpath_var='LD_RUN_PATH'
+      hardcode_shlibpath_var=no
+      ;;
+
+    sysv4.3*)
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var=no
+      export_dynamic_flag_spec='-Bexport'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	hardcode_shlibpath_var=no
+	runpath_var=LD_RUN_PATH
+	hardcode_runpath_var=yes
+	ld_shlibs=yes
+      fi
+      ;;
+
+    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*)
+      no_undefined_flag='${wl}-z,text'
+      archive_cmds_need_lc=no
+      hardcode_shlibpath_var=no
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	archive_cmds='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6*)
+      # Note: We can NOT use -z defs as we might desire, because we do not
+      # link with -lc, and that would cause any symbols used from libc to
+      # always be unresolved, which means just about no library would
+      # ever link correctly.  If we're not using GNU ld we use -z text
+      # though, which does catch some bad symbols but isn't as heavy-handed
+      # as -z defs.
+      no_undefined_flag='${wl}-z,text'
+      allow_undefined_flag='${wl}-z,nodefs'
+      archive_cmds_need_lc=no
+      hardcode_shlibpath_var=no
+      hardcode_libdir_flag_spec='${wl}-R,$libdir'
+      hardcode_libdir_separator=':'
+      link_all_deplibs=yes
+      export_dynamic_flag_spec='${wl}-Bexport'
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	archive_cmds='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    uts4*)
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_libdir_flag_spec='-L$libdir'
+      hardcode_shlibpath_var=no
+      ;;
+
+    *)
+      ld_shlibs=no
+      ;;
+    esac
+
+    if test x$host_vendor = xsni; then
+      case $host in
+      sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+	export_dynamic_flag_spec='${wl}-Blargedynsym'
+	;;
+      esac
+    fi
+  fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs" >&5
+$as_echo "$ld_shlibs" >&6; }
+test "$ld_shlibs" = no && can_build_shared=no
+
+with_gnu_ld=$with_gnu_ld
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#
+# Do we need to explicitly link libc?
+#
+case "x$archive_cmds_need_lc" in
+x|xyes)
+  # Assume -lc should be added
+  archive_cmds_need_lc=yes
+
+  if test "$enable_shared" = yes && test "$GCC" = yes; then
+    case $archive_cmds in
+    *'~'*)
+      # FIXME: we may have to deal with multi-command sequences.
+      ;;
+    '$CC '*)
+      # Test whether the compiler implicitly links with -lc since on some
+      # systems, -lgcc has to come before -lc. If gcc already passes -lc
+      # to ld, don't add -lc before -lgcc.
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -lc should be explicitly linked in" >&5
+$as_echo_n "checking whether -lc should be explicitly linked in... " >&6; }
+if ${lt_cv_archive_cmds_need_lc+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  $RM conftest*
+	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+	if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } 2>conftest.err; then
+	  soname=conftest
+	  lib=conftest
+	  libobjs=conftest.$ac_objext
+	  deplibs=
+	  wl=$lt_prog_compiler_wl
+	  pic_flag=$lt_prog_compiler_pic
+	  compiler_flags=-v
+	  linker_flags=-v
+	  verstring=
+	  output_objdir=.
+	  libname=conftest
+	  lt_save_allow_undefined_flag=$allow_undefined_flag
+	  allow_undefined_flag=
+	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$archive_cmds 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1\""; } >&5
+  (eval $archive_cmds 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+	  then
+	    lt_cv_archive_cmds_need_lc=no
+	  else
+	    lt_cv_archive_cmds_need_lc=yes
+	  fi
+	  allow_undefined_flag=$lt_save_allow_undefined_flag
+	else
+	  cat conftest.err 1>&5
+	fi
+	$RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_archive_cmds_need_lc" >&5
+$as_echo "$lt_cv_archive_cmds_need_lc" >&6; }
+      archive_cmds_need_lc=$lt_cv_archive_cmds_need_lc
+      ;;
+    esac
+  fi
+  ;;
+esac
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking dynamic linker characteristics" >&5
+$as_echo_n "checking dynamic linker characteristics... " >&6; }
+
+if test "$GCC" = yes; then
+  case $host_os in
+    darwin*) lt_awk_arg="/^libraries:/,/LR/" ;;
+    *) lt_awk_arg="/^libraries:/" ;;
+  esac
+  case $host_os in
+    mingw* | cegcc*) lt_sed_strip_eq="s,=\([A-Za-z]:\),\1,g" ;;
+    *) lt_sed_strip_eq="s,=/,/,g" ;;
+  esac
+  lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq`
+  case $lt_search_path_spec in
+  *\;*)
+    # if the path contains ";" then we assume it to be the separator
+    # otherwise default to the standard path separator (i.e. ":") - it is
+    # assumed that no part of a normal pathname contains ";" but that should
+    # okay in the real world where ";" in dirpaths is itself problematic.
+    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED 's/;/ /g'`
+    ;;
+  *)
+    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED "s/$PATH_SEPARATOR/ /g"`
+    ;;
+  esac
+  # Ok, now we have the path, separated by spaces, we can step through it
+  # and add multilib dir if necessary.
+  lt_tmp_lt_search_path_spec=
+  lt_multi_os_dir=`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null`
+  for lt_sys_path in $lt_search_path_spec; do
+    if test -d "$lt_sys_path/$lt_multi_os_dir"; then
+      lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path/$lt_multi_os_dir"
+    else
+      test -d "$lt_sys_path" && \
+	lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path"
+    fi
+  done
+  lt_search_path_spec=`$ECHO "$lt_tmp_lt_search_path_spec" | awk '
+BEGIN {RS=" "; FS="/|\n";} {
+  lt_foo="";
+  lt_count=0;
+  for (lt_i = NF; lt_i > 0; lt_i--) {
+    if ($lt_i != "" && $lt_i != ".") {
+      if ($lt_i == "..") {
+        lt_count++;
+      } else {
+        if (lt_count == 0) {
+          lt_foo="/" $lt_i lt_foo;
+        } else {
+          lt_count--;
+        }
+      }
+    }
+  }
+  if (lt_foo != "") { lt_freq[lt_foo]++; }
+  if (lt_freq[lt_foo] == 1) { print lt_foo; }
+}'`
+  # AWK program above erroneously prepends '/' to C:/dos/paths
+  # for these hosts.
+  case $host_os in
+    mingw* | cegcc*) lt_search_path_spec=`$ECHO "$lt_search_path_spec" |\
+      $SED 's,/\([A-Za-z]:\),\1,g'` ;;
+  esac
+  sys_lib_search_path_spec=`$ECHO "$lt_search_path_spec" | $lt_NL2SP`
+else
+  sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+fi
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+shrext_cmds=".so"
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+need_lib_prefix=unknown
+hardcode_into_libs=no
+
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+need_version=unknown
+
+case $host_os in
+aix3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX 3 has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}${shared_ext}$major'
+  ;;
+
+aix[4-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[01] | aix4.[01].*)
+      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	   echo ' yes '
+	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
+	:
+      else
+	can_build_shared=no
+      fi
+      ;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # soname into executable. Probably we can add versioning support to
+    # collect2, so additional links can be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}${shared_ext}$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  ;;
+
+amigaos*)
+  case $host_cpu in
+  powerpc)
+    # Since July 2007 AmigaOS4 officially supports .so libraries.
+    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    ;;
+  m68k)
+    library_names_spec='$libname.ixlibrary $libname.a'
+    # Create ${libname}_ixlibrary.a entries in /sys/libs.
+    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
+    ;;
+  esac
+  ;;
+
+beos*)
+  library_names_spec='${libname}${shared_ext}'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi[45]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32* | cegcc*)
+  version_type=windows
+  shrext_cmds=".dll"
+  need_version=no
+  need_lib_prefix=no
+
+  case $GCC,$cc_basename in
+  yes,*)
+    # gcc
+    library_names_spec='$libname.dll.a'
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname~
+      chmod a+x \$dldir/$dlname~
+      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
+        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
+      fi'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+
+    case $host_os in
+    cygwin*)
+      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
+      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+
+      sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/lib/w32api"
+      ;;
+    mingw* | cegcc*)
+      # MinGW DLLs use traditional 'lib' prefix
+      soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    pw32*)
+      # pw32 DLLs use 'pw' prefix rather than 'lib'
+      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    esac
+    dynamic_linker='Win32 ld.exe'
+    ;;
+
+  *,cl*)
+    # Native MSVC
+    libname_spec='$name'
+    soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+    library_names_spec='${libname}.dll.lib'
+
+    case $build_os in
+    mingw*)
+      sys_lib_search_path_spec=
+      lt_save_ifs=$IFS
+      IFS=';'
+      for lt_path in $LIB
+      do
+        IFS=$lt_save_ifs
+        # Let DOS variable expansion print the short 8.3 style file name.
+        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
+        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
+      done
+      IFS=$lt_save_ifs
+      # Convert to MSYS style.
+      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([a-zA-Z]\\):| /\\1|g' -e 's|^ ||'`
+      ;;
+    cygwin*)
+      # Convert to unix form, then to dos form, then back to unix form
+      # but this time dos style (no spaces!) so that the unix form looks
+      # like /cygdrive/c/PROGRA~1:/cygdr...
+      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
+      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
+      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      ;;
+    *)
+      sys_lib_search_path_spec="$LIB"
+      if $ECHO "$sys_lib_search_path_spec" | $GREP ';[c-zC-Z]:/' >/dev/null; then
+        # It is most probably a Windows format PATH.
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
+      else
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      fi
+      # FIXME: find the short name or the path components, as spaces are
+      # common. (e.g. "Program Files" -> "PROGRA~1")
+      ;;
+    esac
+
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+    dynamic_linker='Win32 link.exe'
+    ;;
+
+  *)
+    # Assume MSVC wrapper
+    library_names_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext} $libname.lib'
+    dynamic_linker='Win32 ld.exe'
+    ;;
+  esac
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
+  soname_spec='${libname}${release}${major}$shared_ext'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
+
+  sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/local/lib"
+  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
+  ;;
+
+dgux*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+freebsd* | dragonfly*)
+  # DragonFly does not have aout.  When/if they implement a new
+  # versioning mechanism, adjust this.
+  if test -x /usr/bin/objformat; then
+    objformat=`/usr/bin/objformat`
+  else
+    case $host_os in
+    freebsd[23].*) objformat=aout ;;
+    *) objformat=elf ;;
+    esac
+  fi
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2.*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  freebsd3.[01]* | freebsdelf3.[01]*)
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  freebsd3.[2-9]* | freebsdelf3.[2-9]* | \
+  freebsd4.[0-5] | freebsdelf4.[0-5] | freebsd4.1.1 | freebsdelf4.1.1)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  *) # from 4.6 on, and DragonFly
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+haiku*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  dynamic_linker="$host_os runtime_loader"
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  case $host_cpu in
+  ia64*)
+    shrext_cmds='.so'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.so"
+    shlibpath_var=LD_LIBRARY_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    if test "X$HPUX_IA64_MODE" = X32; then
+      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
+    else
+      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
+    fi
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  hppa*64*)
+    shrext_cmds='.sl'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  *)
+    shrext_cmds='.sl'
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=SHLIB_PATH
+    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    ;;
+  esac
+  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
+  postinstall_cmds='chmod 555 $lib'
+  # or fails outright, so override atomically:
+  install_override_mode=555
+  ;;
+
+interix[3-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)
+	if test "$lt_cv_prog_gnu_ld" = yes; then
+		version_type=linux # correct to gnu/linux during the next big refactor
+	else
+		version_type=irix
+	fi ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
+      libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
+      libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
+      libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  hardcode_into_libs=yes
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux*oldld* | linux*aout* | linux*coff*)
+  dynamic_linker=no
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+
+  # Some binutils ld are patched to set DT_RUNPATH
+  if ${lt_cv_shlibpath_overrides_runpath+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_shlibpath_overrides_runpath=no
+    save_LDFLAGS=$LDFLAGS
+    save_libdir=$libdir
+    eval "libdir=/foo; wl=\"$lt_prog_compiler_wl\"; \
+	 LDFLAGS=\"\$LDFLAGS $hardcode_libdir_flag_spec\""
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  if  ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null; then :
+  lt_cv_shlibpath_overrides_runpath=yes
+fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+    LDFLAGS=$save_LDFLAGS
+    libdir=$save_libdir
+
+fi
+
+  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
+
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # Append ld.so.conf contents to the search path
+  if test -f /etc/ld.so.conf; then
+    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \$2)); skip = 1; } { if (!skip) print \$0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
+    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
+  fi
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+  ;;
+
+netbsdelf*-gnu)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='NetBSD ld.elf_so'
+  ;;
+
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+*nto* | *qnx*)
+  version_type=qnx
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='ldqnx.so'
+  ;;
+
+openbsd*)
+  version_type=sunos
+  sys_lib_dlsearch_path_spec="/usr/lib"
+  need_lib_prefix=no
+  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
+  case $host_os in
+    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
+    *)				need_version=no  ;;
+  esac
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case $host_os in
+      openbsd2.[89] | openbsd2.[89].*)
+	shlibpath_overrides_runpath=no
+	;;
+      *)
+	shlibpath_overrides_runpath=yes
+	;;
+      esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  ;;
+
+os2*)
+  libname_spec='$name'
+  shrext_cmds=".dll"
+  need_lib_prefix=no
+  library_names_spec='$libname${shared_ext} $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  ;;
+
+rdos*)
+  dynamic_linker=no
+  ;;
+
+solaris*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux # correct to gnu/linux during the next big refactor
+    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
+    soname_spec='$libname${shared_ext}.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  version_type=freebsd-elf
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  if test "$with_gnu_ld" = yes; then
+    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
+  else
+    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
+    case $host_os in
+      sco3.2v5*)
+        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
+	;;
+    esac
+  fi
+  sys_lib_dlsearch_path_spec='/usr/lib'
+  ;;
+
+tpf*)
+  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+uts4*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $dynamic_linker" >&5
+$as_echo "$dynamic_linker" >&6; }
+test "$dynamic_linker" = no && can_build_shared=no
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$GCC" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
+  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
+fi
+if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
+  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to hardcode library paths into programs" >&5
+$as_echo_n "checking how to hardcode library paths into programs... " >&6; }
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" ||
+   test -n "$runpath_var" ||
+   test "X$hardcode_automatic" = "Xyes" ; then
+
+  # We can hardcode non-existent directories.
+  if test "$hardcode_direct" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, )" != no &&
+     test "$hardcode_minus_L" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    hardcode_action=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action=unsupported
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action" >&5
+$as_echo "$hardcode_action" >&6; }
+
+if test "$hardcode_action" = relink ||
+   test "$inherit_rpath" = yes; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
+
+
+
+
+
+
+  if test "x$enable_dlopen" != xyes; then
+  enable_dlopen=unknown
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+else
+  lt_cv_dlopen=no
+  lt_cv_dlopen_libs=
+
+  case $host_os in
+  beos*)
+    lt_cv_dlopen="load_add_on"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+    ;;
+
+  mingw* | pw32* | cegcc*)
+    lt_cv_dlopen="LoadLibrary"
+    lt_cv_dlopen_libs=
+    ;;
+
+  cygwin*)
+    lt_cv_dlopen="dlopen"
+    lt_cv_dlopen_libs=
+    ;;
+
+  darwin*)
+  # if libdl is installed we need to link against it
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlopen in -ldl" >&5
+$as_echo_n "checking for dlopen in -ldl... " >&6; }
+if ${ac_cv_lib_dl_dlopen+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldl  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen ();
+int
+main ()
+{
+return dlopen ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_dl_dlopen=yes
+else
+  ac_cv_lib_dl_dlopen=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dl_dlopen" >&5
+$as_echo "$ac_cv_lib_dl_dlopen" >&6; }
+if test "x$ac_cv_lib_dl_dlopen" = xyes; then :
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+
+    lt_cv_dlopen="dyld"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+
+fi
+
+    ;;
+
+  *)
+    ac_fn_c_check_func "$LINENO" "shl_load" "ac_cv_func_shl_load"
+if test "x$ac_cv_func_shl_load" = xyes; then :
+  lt_cv_dlopen="shl_load"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for shl_load in -ldld" >&5
+$as_echo_n "checking for shl_load in -ldld... " >&6; }
+if ${ac_cv_lib_dld_shl_load+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldld  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load ();
+int
+main ()
+{
+return shl_load ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_dld_shl_load=yes
+else
+  ac_cv_lib_dld_shl_load=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dld_shl_load" >&5
+$as_echo "$ac_cv_lib_dld_shl_load" >&6; }
+if test "x$ac_cv_lib_dld_shl_load" = xyes; then :
+  lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"
+else
+  ac_fn_c_check_func "$LINENO" "dlopen" "ac_cv_func_dlopen"
+if test "x$ac_cv_func_dlopen" = xyes; then :
+  lt_cv_dlopen="dlopen"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlopen in -ldl" >&5
+$as_echo_n "checking for dlopen in -ldl... " >&6; }
+if ${ac_cv_lib_dl_dlopen+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldl  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen ();
+int
+main ()
+{
+return dlopen ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_dl_dlopen=yes
+else
+  ac_cv_lib_dl_dlopen=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dl_dlopen" >&5
+$as_echo "$ac_cv_lib_dl_dlopen" >&6; }
+if test "x$ac_cv_lib_dl_dlopen" = xyes; then :
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlopen in -lsvld" >&5
+$as_echo_n "checking for dlopen in -lsvld... " >&6; }
+if ${ac_cv_lib_svld_dlopen+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lsvld  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen ();
+int
+main ()
+{
+return dlopen ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_svld_dlopen=yes
+else
+  ac_cv_lib_svld_dlopen=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_svld_dlopen" >&5
+$as_echo "$ac_cv_lib_svld_dlopen" >&6; }
+if test "x$ac_cv_lib_svld_dlopen" = xyes; then :
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dld_link in -ldld" >&5
+$as_echo_n "checking for dld_link in -ldld... " >&6; }
+if ${ac_cv_lib_dld_dld_link+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-ldld  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dld_link ();
+int
+main ()
+{
+return dld_link ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_dld_dld_link=yes
+else
+  ac_cv_lib_dld_dld_link=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dld_dld_link" >&5
+$as_echo "$ac_cv_lib_dld_dld_link" >&6; }
+if test "x$ac_cv_lib_dld_dld_link" = xyes; then :
+  lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+    ;;
+  esac
+
+  if test "x$lt_cv_dlopen" != xno; then
+    enable_dlopen=yes
+  else
+    enable_dlopen=no
+  fi
+
+  case $lt_cv_dlopen in
+  dlopen)
+    save_CPPFLAGS="$CPPFLAGS"
+    test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+
+    save_LDFLAGS="$LDFLAGS"
+    wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+
+    save_LIBS="$LIBS"
+    LIBS="$lt_cv_dlopen_libs $LIBS"
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program can dlopen itself" >&5
+$as_echo_n "checking whether a program can dlopen itself... " >&6; }
+if ${lt_cv_dlopen_self+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  	  if test "$cross_compiling" = yes; then :
+  lt_cv_dlopen_self=cross
+else
+  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<_LT_EOF
+#line $LINENO "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+/* When -fvisbility=hidden is used, assume the code has been annotated
+   correspondingly for the symbols needed.  */
+#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
+int fnord () __attribute__((visibility("default")));
+#endif
+
+int fnord () { return 42; }
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
+      else
+        {
+	  if (dlsym( self,"_fnord"))  status = $lt_dlneed_uscore;
+          else puts (dlerror ());
+	}
+      /* dlclose (self); */
+    }
+  else
+    puts (dlerror ());
+
+  return status;
+}
+_LT_EOF
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_link\""; } >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) >&5 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) lt_cv_dlopen_self=yes ;;
+      x$lt_dlneed_uscore) lt_cv_dlopen_self=yes ;;
+      x$lt_dlunknown|x*) lt_cv_dlopen_self=no ;;
+    esac
+  else :
+    # compilation failed
+    lt_cv_dlopen_self=no
+  fi
+fi
+rm -fr conftest*
+
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_dlopen_self" >&5
+$as_echo "$lt_cv_dlopen_self" >&6; }
+
+    if test "x$lt_cv_dlopen_self" = xyes; then
+      wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $lt_prog_compiler_static\"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a statically linked program can dlopen itself" >&5
+$as_echo_n "checking whether a statically linked program can dlopen itself... " >&6; }
+if ${lt_cv_dlopen_self_static+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  	  if test "$cross_compiling" = yes; then :
+  lt_cv_dlopen_self_static=cross
+else
+  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<_LT_EOF
+#line $LINENO "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+/* When -fvisbility=hidden is used, assume the code has been annotated
+   correspondingly for the symbols needed.  */
+#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
+int fnord () __attribute__((visibility("default")));
+#endif
+
+int fnord () { return 42; }
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
+      else
+        {
+	  if (dlsym( self,"_fnord"))  status = $lt_dlneed_uscore;
+          else puts (dlerror ());
+	}
+      /* dlclose (self); */
+    }
+  else
+    puts (dlerror ());
+
+  return status;
+}
+_LT_EOF
+  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_link\""; } >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) >&5 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) lt_cv_dlopen_self_static=yes ;;
+      x$lt_dlneed_uscore) lt_cv_dlopen_self_static=yes ;;
+      x$lt_dlunknown|x*) lt_cv_dlopen_self_static=no ;;
+    esac
+  else :
+    # compilation failed
+    lt_cv_dlopen_self_static=no
+  fi
+fi
+rm -fr conftest*
+
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_dlopen_self_static" >&5
+$as_echo "$lt_cv_dlopen_self_static" >&6; }
+    fi
+
+    CPPFLAGS="$save_CPPFLAGS"
+    LDFLAGS="$save_LDFLAGS"
+    LIBS="$save_LIBS"
+    ;;
+  esac
+
+  case $lt_cv_dlopen_self in
+  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+  *) enable_dlopen_self=unknown ;;
+  esac
+
+  case $lt_cv_dlopen_self_static in
+  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+  *) enable_dlopen_self_static=unknown ;;
+  esac
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+striplib=
+old_striplib=
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether stripping libraries is possible" >&5
+$as_echo_n "checking whether stripping libraries is possible... " >&6; }
+if test -n "$STRIP" && $STRIP -V 2>&1 | $GREP "GNU strip" >/dev/null; then
+  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+# FIXME - insert some real tests, host_os isn't really good enough
+  case $host_os in
+  darwin*)
+    if test -n "$STRIP" ; then
+      striplib="$STRIP -x"
+      old_striplib="$STRIP -S"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+    else
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+    fi
+    ;;
+  *)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+    ;;
+  esac
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+  # Report which library types will actually be built
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if libtool supports shared libraries" >&5
+$as_echo_n "checking if libtool supports shared libraries... " >&6; }
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $can_build_shared" >&5
+$as_echo "$can_build_shared" >&6; }
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build shared libraries" >&5
+$as_echo_n "checking whether to build shared libraries... " >&6; }
+  test "$can_build_shared" = "no" && enable_shared=no
+
+  # On AIX, shared libraries and static libraries use the same namespace, and
+  # are all built from PIC.
+  case $host_os in
+  aix3*)
+    test "$enable_shared" = yes && enable_static=no
+    if test -n "$RANLIB"; then
+      archive_cmds="$archive_cmds~\$RANLIB \$lib"
+      postinstall_cmds='$RANLIB $lib'
+    fi
+    ;;
+
+  aix[4-9]*)
+    if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+      test "$enable_shared" = yes && enable_static=no
+    fi
+    ;;
+  esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_shared" >&5
+$as_echo "$enable_shared" >&6; }
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build static libraries" >&5
+$as_echo_n "checking whether to build static libraries... " >&6; }
+  # Make sure either enable_shared or enable_static is yes.
+  test "$enable_shared" = yes || enable_static=yes
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_static" >&5
+$as_echo "$enable_static" >&6; }
+
+
+
+
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+CC="$lt_save_CC"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+        ac_config_commands="$ac_config_commands libtool"
+
+
+
+
+# Only expand once:
+
+
+
+# Extract the first word of "ocamlbuild", so it can be a program name with args.
+set dummy ocamlbuild; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_OCAMLBUILD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$OCAMLBUILD"; then
+  ac_cv_prog_OCAMLBUILD="$OCAMLBUILD" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_OCAMLBUILD="ocamlbuild"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+OCAMLBUILD=$ac_cv_prog_OCAMLBUILD
+if test -n "$OCAMLBUILD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OCAMLBUILD" >&5
+$as_echo "$OCAMLBUILD" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+
+
+# Check whether --enable-mpi was given.
+if test "${enable_mpi+set}" = set; then :
+  enableval=$enable_mpi; enable_mpi=$enableval
+else
+  enable_mpi=no
+fi
+
+
+if test "$enable_mpi" = "yes"; then
+   if test $PRECISION = q; then
+      as_fn_error $? "quad precision is not supported in MPI" "$LINENO" 5
+   fi
+
+
+
+
+
+	for ac_prog in mpicc hcc mpcc mpcc_r mpxlc cmpicc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_MPICC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$MPICC"; then
+  ac_cv_prog_MPICC="$MPICC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_MPICC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+MPICC=$ac_cv_prog_MPICC
+if test -n "$MPICC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MPICC" >&5
+$as_echo "$MPICC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$MPICC" && break
+done
+test -n "$MPICC" || MPICC="$CC"
+
+	acx_mpi_save_CC="$CC"
+	CC="$MPICC"
+
+
+
+if test x = x"$MPILIBS"; then
+	ac_fn_c_check_func "$LINENO" "MPI_Init" "ac_cv_func_MPI_Init"
+if test "x$ac_cv_func_MPI_Init" = xyes; then :
+  MPILIBS=" "
+fi
+
+fi
+if test x = x"$MPILIBS"; then
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for MPI_Init in -lmpi" >&5
+$as_echo_n "checking for MPI_Init in -lmpi... " >&6; }
+if ${ac_cv_lib_mpi_MPI_Init+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lmpi  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char MPI_Init ();
+int
+main ()
+{
+return MPI_Init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_mpi_MPI_Init=yes
+else
+  ac_cv_lib_mpi_MPI_Init=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpi_MPI_Init" >&5
+$as_echo "$ac_cv_lib_mpi_MPI_Init" >&6; }
+if test "x$ac_cv_lib_mpi_MPI_Init" = xyes; then :
+  MPILIBS="-lmpi"
+fi
+
+fi
+if test x = x"$MPILIBS"; then
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for MPI_Init in -lmpich" >&5
+$as_echo_n "checking for MPI_Init in -lmpich... " >&6; }
+if ${ac_cv_lib_mpich_MPI_Init+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lmpich  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char MPI_Init ();
+int
+main ()
+{
+return MPI_Init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_mpich_MPI_Init=yes
+else
+  ac_cv_lib_mpich_MPI_Init=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpich_MPI_Init" >&5
+$as_echo "$ac_cv_lib_mpich_MPI_Init" >&6; }
+if test "x$ac_cv_lib_mpich_MPI_Init" = xyes; then :
+  MPILIBS="-lmpich"
+fi
+
+fi
+
+if test x != x"$MPILIBS"; then
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpi.h" >&5
+$as_echo_n "checking for mpi.h... " >&6; }
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <mpi.h>
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  MPILIBS=""
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+CC="$acx_mpi_save_CC"
+
+
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x = x"$MPILIBS"; then
+        as_fn_error $? "could not find mpi library for --enable-mpi" "$LINENO" 5
+        :
+else
+
+$as_echo "#define HAVE_MPI 1" >>confdefs.h
+
+        :
+fi
+
+   # Extract the first word of "mpirun", so it can be a program name with args.
+set dummy mpirun; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_MPIRUN+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$MPIRUN"; then
+  ac_cv_prog_MPIRUN="$MPIRUN" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_MPIRUN="mpirun"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+MPIRUN=$ac_cv_prog_MPIRUN
+if test -n "$MPIRUN"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MPIRUN" >&5
+$as_echo "$MPIRUN" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+
+
+   save_CC=$CC
+   CC=$MPICC
+   # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of MPI_Fint" >&5
+$as_echo_n "checking size of MPI_Fint... " >&6; }
+if ${ac_cv_sizeof_MPI_Fint+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (MPI_Fint))" "ac_cv_sizeof_MPI_Fint"        "#include <mpi.h>
+"; then :
+
+else
+  if test "$ac_cv_type_MPI_Fint" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (MPI_Fint)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_MPI_Fint=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_MPI_Fint" >&5
+$as_echo "$ac_cv_sizeof_MPI_Fint" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_MPI_FINT $ac_cv_sizeof_MPI_Fint
+_ACEOF
+
+
+   CC=$save_CC
+   if test 0 = $ac_cv_sizeof_MPI_Fint; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: sizeof(MPI_Fint) test failed" >&5
+$as_echo "$as_me: WARNING: sizeof(MPI_Fint) test failed" >&2;};
+            # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of int" >&5
+$as_echo_n "checking size of int... " >&6; }
+if ${ac_cv_sizeof_int+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (int))" "ac_cv_sizeof_int"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_int" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (int)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_int=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_int" >&5
+$as_echo "$ac_cv_sizeof_int" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_INT $ac_cv_sizeof_int
+_ACEOF
+
+
+      if test 0 = $ac_cv_sizeof_int; then as_fn_error $? "sizeof(int) test failed" "$LINENO" 5; fi
+      ac_cv_sizeof_MPI_Fint=$ac_cv_sizeof_int
+   fi
+   C_MPI_FINT=C_INT`expr $ac_cv_sizeof_MPI_Fint \* 8`_T
+
+fi
+ if test "$enable_mpi" = "yes"; then
+  MPI_TRUE=
+  MPI_FALSE='#'
+else
+  MPI_TRUE='#'
+  MPI_FALSE=
+fi
+
+
+
+
+
+
+
+
+# Try to determine "good" native compiler flags if none specified via CFLAGS
+if test "$ac_test_CFLAGS" != "set"; then
+  CFLAGS=""
+  case $ax_cv_c_compiler_vendor in
+    dec) CFLAGS="-newc -w0 -O5 -ansi_alias -ansi_args -fp_reorder -tune host"
+    	 ;;
+
+    sun) CFLAGS="-native -fast -xO5 -dalign"
+    	 ;;
+
+    hp)  CFLAGS="+Oall +Optrs_ansi +DSnative"
+    	 ;;
+
+    ibm) xlc_opt="-qtune=auto"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $xlc_opt" >&5
+$as_echo_n "checking whether C compiler accepts $xlc_opt... " >&6; }
+ax_save_FLAGS=$CFLAGS
+   CFLAGS="$xlc_opt"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval `$as_echo "ax_cv_c_flags_$xlc_opt" | $as_tr_sh`=yes
+else
+  eval `$as_echo "ax_cv_c_flags_$xlc_opt" | $as_tr_sh`=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   CFLAGS=$ax_save_FLAGS
+eval ax_check_compiler_flags=$`$as_echo "ax_cv_c_flags_$xlc_opt" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="-O3 -qansialias -w $xlc_opt"
+else
+	CFLAGS="-O3 -qansialias -w"
+                echo "******************************************************"
+                echo "*  You seem to have the IBM  C compiler.  It is      *"
+                echo "*  recommended for best performance that you use:    *"
+                echo "*                                                    *"
+                echo "*    CFLAGS=-O3 -qarch=xxx -qtune=xxx -qansialias -w *"
+                echo "*                      ^^^        ^^^                *"
+                echo "*  where xxx is pwr2, pwr3, 604, or whatever kind of *"
+                echo "*  CPU you have.  (Set the CFLAGS environment var.   *"
+                echo "*  and re-run configure.)  For more info, man cc.    *"
+                echo "******************************************************"
+fi
+
+         ;;
+
+    intel) CFLAGS="-O3"
+        # Intel seems to have changed the spelling of this flag recently
+        icc_ansi_alias="unknown"
+	for flag in -ansi-alias -ansi_alias; do
+	   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5
+$as_echo_n "checking whether C compiler accepts $flag... " >&6; }
+ax_save_FLAGS=$CFLAGS
+   CFLAGS="$flag"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval `$as_echo "ax_cv_c_flags_$flag" | $as_tr_sh`=yes
+else
+  eval `$as_echo "ax_cv_c_flags_$flag" | $as_tr_sh`=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   CFLAGS=$ax_save_FLAGS
+eval ax_check_compiler_flags=$`$as_echo "ax_cv_c_flags_$flag" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	icc_ansi_alias=$flag; break
+else
+	:
+fi
+
+	done
+ 	if test "x$icc_ansi_alias" != xunknown; then
+            CFLAGS="$CFLAGS $icc_ansi_alias"
+        fi
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -malign-double" >&5
+$as_echo_n "checking whether C compiler accepts -malign-double... " >&6; }
+if ${ax_cv_c_flags__malign_double+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-malign-double"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__malign_double=yes
+else
+  ax_cv_c_flags__malign_double=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__malign_double
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -malign-double"
+else
+	:
+fi
+
+	# We used to check for architecture flags here, e.g. -xHost etc.,
+	# but these flags are problematic.  On icc-12.0.0, "-mavx -xHost"
+	# overrides -mavx with -xHost, generating SSE2 code instead of AVX
+	# code.  ICC does not seem to support -mtune=host or equivalent
+	# non-ABI changing flag.
+	;;
+
+    gnu)
+     # Default optimization flags for gcc on all systems.
+     # Somehow -O3 does not imply -fomit-frame-pointer on ia32
+     CFLAGS="-O3 -fomit-frame-pointer"
+
+     # tune for the host by default
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mtune=native" >&5
+$as_echo_n "checking whether C compiler accepts -mtune=native... " >&6; }
+if ${ax_cv_c_flags__mtune_native+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-mtune=native"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__mtune_native=yes
+else
+  ax_cv_c_flags__mtune_native=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__mtune_native
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -mtune=native"
+else
+	:
+fi
+
+
+     # -malign-double for x86 systems
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -malign-double" >&5
+$as_echo_n "checking whether C compiler accepts -malign-double... " >&6; }
+if ${ax_cv_c_flags__malign_double+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-malign-double"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__malign_double=yes
+else
+  ax_cv_c_flags__malign_double=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__malign_double
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -malign-double"
+else
+	:
+fi
+
+
+     #  -fstrict-aliasing for gcc-2.95+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fstrict-aliasing" >&5
+$as_echo_n "checking whether C compiler accepts -fstrict-aliasing... " >&6; }
+if ${ax_cv_c_flags__fstrict_aliasing+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-fstrict-aliasing"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__fstrict_aliasing=yes
+else
+  ax_cv_c_flags__fstrict_aliasing=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__fstrict_aliasing
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -fstrict-aliasing"
+else
+	:
+fi
+
+
+     # -fno-schedule-insns is pretty much required on all risc
+     # processors.
+     #
+     # gcc performs one pass of instruction scheduling, then a pass of
+     # register allocation, then another pass of instruction
+     # scheduling.  The first pass reorders instructions in a way that
+     # is pretty much the worst possible for the purposes of register
+     # allocation.  We disable the first pass.
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fno-schedule-insns" >&5
+$as_echo_n "checking whether C compiler accepts -fno-schedule-insns... " >&6; }
+if ${ax_cv_c_flags__fno_schedule_insns+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-fno-schedule-insns"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__fno_schedule_insns=yes
+else
+  ax_cv_c_flags__fno_schedule_insns=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__fno_schedule_insns
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -fno-schedule-insns"
+else
+	:
+fi
+
+
+     # note that we enable "unsafe" fp optimization with other compilers, too
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -ffast-math" >&5
+$as_echo_n "checking whether C compiler accepts -ffast-math... " >&6; }
+if ${ax_cv_c_flags__ffast_math+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-ffast-math"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__ffast_math=yes
+else
+  ax_cv_c_flags__ffast_math=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__ffast_math
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -ffast-math"
+else
+	:
+fi
+
+
+     ;;
+  esac
+
+  if test -z "$CFLAGS"; then
+	echo ""
+	echo "********************************************************"
+        echo "* WARNING: Don't know the best CFLAGS for this system  *"
+        echo "* Use ./configure CFLAGS=... to specify your own flags *"
+	echo "* (otherwise, a default of CFLAGS=-O3 will be used)    *"
+	echo "********************************************************"
+	echo ""
+        CFLAGS="-O3"
+  fi
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $CFLAGS" >&5
+$as_echo_n "checking whether C compiler accepts $CFLAGS... " >&6; }
+ax_save_FLAGS=$CFLAGS
+   CFLAGS="$CFLAGS"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval `$as_echo "ax_cv_c_flags_$CFLAGS" | $as_tr_sh`=yes
+else
+  eval `$as_echo "ax_cv_c_flags_$CFLAGS" | $as_tr_sh`=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   CFLAGS=$ax_save_FLAGS
+eval ax_check_compiler_flags=$`$as_echo "ax_cv_c_flags_$CFLAGS" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	:
+else
+
+	echo ""
+        echo "********************************************************"
+        echo "* WARNING: The guessed CFLAGS don't seem to work with  *"
+        echo "* your compiler.                                       *"
+        echo "* Use ./configure CFLAGS=... to specify your own flags *"
+        echo "********************************************************"
+        echo ""
+        CFLAGS=""
+
+fi
+
+
+fi
+
+
+case "${ax_cv_c_compiler_vendor}" in
+   intel) # Stop icc from defining __GNUC__, except on MacOS where this fails
+        case "${host_os}" in
+            *darwin*) ;; # icc -no-gcc fails to compile some system headers
+            *)
+	        { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -no-gcc" >&5
+$as_echo_n "checking whether C compiler accepts -no-gcc... " >&6; }
+if ${ax_cv_c_flags__no_gcc+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-no-gcc"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__no_gcc=yes
+else
+  ax_cv_c_flags__no_gcc=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__no_gcc
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CC="$CC -no-gcc"
+else
+	:
+fi
+
+               ;;
+        esac
+        ;;
+
+   hp) # must (sometimes) manually increase cpp limits to handle fftw3.h
+         { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -Wp,-H128000" >&5
+$as_echo_n "checking whether C compiler accepts -Wp,-H128000... " >&6; }
+if ${ax_cv_c_flags__Wp+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-Wp,-H128000"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__Wp=yes
+else
+  ax_cv_c_flags__Wp=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__Wp
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CC="$CC -Wp,-H128000"
+else
+	:
+fi
+
+        ;;
+
+   portland) # -Masmkeyword required for asm("") cycle counters
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -Masmkeyword" >&5
+$as_echo_n "checking whether C compiler accepts -Masmkeyword... " >&6; }
+if ${ax_cv_c_flags__Masmkeyword+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-Masmkeyword"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__Masmkeyword=yes
+else
+  ax_cv_c_flags__Masmkeyword=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__Masmkeyword
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CC="$CC -Masmkeyword"
+else
+	:
+fi
+
+        ;;
+esac
+
+case "${ax_cv_c_compiler_vendor}" in
+    gnu|intel)
+	# SSE/SSE2
+	if test "$have_sse2" = "yes" -a "x$SSE2_CFLAGS" = x; then
+	    if test "$PRECISION" = d; then flag=msse2; else flag=msse; fi
+	     { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -$flag" >&5
+$as_echo_n "checking whether C compiler accepts -$flag... " >&6; }
+ax_save_FLAGS=$CFLAGS
+   CFLAGS="-$flag"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval `$as_echo "ax_cv_c_flags_-$flag" | $as_tr_sh`=yes
+else
+  eval `$as_echo "ax_cv_c_flags_-$flag" | $as_tr_sh`=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   CFLAGS=$ax_save_FLAGS
+eval ax_check_compiler_flags=$`$as_echo "ax_cv_c_flags_-$flag" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	SSE2_CFLAGS="-$flag"
+else
+	as_fn_error $? "Need a version of gcc with -$flag" "$LINENO" 5
+fi
+
+	fi
+
+	# AVX
+	if test "$have_avx" = "yes" -a "x$AVX_CFLAGS" = x; then
+	     { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mavx" >&5
+$as_echo_n "checking whether C compiler accepts -mavx... " >&6; }
+if ${ax_cv_c_flags__mavx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-mavx"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__mavx=yes
+else
+  ax_cv_c_flags__mavx=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__mavx
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	AVX_CFLAGS="-mavx"
+else
+	as_fn_error $? "Need a version of gcc with -mavx" "$LINENO" 5
+fi
+
+	fi
+
+	if test "$have_altivec" = "yes" -a "x$ALTIVEC_CFLAGS" = x; then
+	    # -DFAKE__VEC__ is a workaround because gcc-3.3 does not
+	    # #define __VEC__ with -maltivec.
+	     { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -faltivec" >&5
+$as_echo_n "checking whether C compiler accepts -faltivec... " >&6; }
+if ${ax_cv_c_flags__faltivec+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-faltivec"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__faltivec=yes
+else
+  ax_cv_c_flags__faltivec=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__faltivec
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	ALTIVEC_CFLAGS="-faltivec"
+else
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -maltivec -mabi=altivec" >&5
+$as_echo_n "checking whether C compiler accepts -maltivec -mabi=altivec... " >&6; }
+if ${ax_cv_c_flags__maltivec__mabi_altivec+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-maltivec -mabi=altivec"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__maltivec__mabi_altivec=yes
+else
+  ax_cv_c_flags__maltivec__mabi_altivec=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__maltivec__mabi_altivec
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	ALTIVEC_CFLAGS="-maltivec -mabi=altivec -DFAKE__VEC__"
+else
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fvec" >&5
+$as_echo_n "checking whether C compiler accepts -fvec... " >&6; }
+if ${ax_cv_c_flags__fvec+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-fvec"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__fvec=yes
+else
+  ax_cv_c_flags__fvec=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__fvec
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	ALTIVEC_CFLAGS="-fvec"
+else
+	as_fn_error $? "Need a version of gcc with -maltivec" "$LINENO" 5
+fi
+
+fi
+
+fi
+
+	fi
+
+	if test "$have_neon" = "yes" -a "x$NEON_CFLAGS" = x; then
+	     { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mfpu=neon" >&5
+$as_echo_n "checking whether C compiler accepts -mfpu=neon... " >&6; }
+if ${ax_cv_c_flags__mfpu_neon+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-mfpu=neon"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__mfpu_neon=yes
+else
+  ax_cv_c_flags__mfpu_neon=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__mfpu_neon
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	NEON_CFLAGS="-mfpu=neon"
+else
+	as_fn_error $? "Need a version of gcc with -mfpu=neon" "$LINENO" 5
+fi
+
+	fi
+
+														;;
+esac
+
+
+
+
+
+
+if test "$with_incoming_stack_boundary"x != "no"x; then
+   case "${ax_cv_c_compiler_vendor}" in
+      gnu)
+        tentative_flags="-mincoming-stack-boundary=$with_incoming_stack_boundary";
+         { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $tentative_flags" >&5
+$as_echo_n "checking whether C compiler accepts $tentative_flags... " >&6; }
+ax_save_FLAGS=$CFLAGS
+   CFLAGS="$tentative_flags"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval `$as_echo "ax_cv_c_flags_$tentative_flags" | $as_tr_sh`=yes
+else
+  eval `$as_echo "ax_cv_c_flags_$tentative_flags" | $as_tr_sh`=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   CFLAGS=$ax_save_FLAGS
+eval ax_check_compiler_flags=$`$as_echo "ax_cv_c_flags_$tentative_flags" | $as_tr_sh`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	STACK_ALIGN_CFLAGS=$tentative_flags
+else
+	:
+fi
+
+      ;;
+   esac
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
+$as_echo_n "checking for ANSI C header files... " >&6; }
+if ${ac_cv_header_stdc+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_header_stdc=yes
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then :
+  :
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
+$as_echo "$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+$as_echo "#define STDC_HEADERS 1" >>confdefs.h
+
+fi
+
+for ac_header in libintl.h malloc.h stddef.h stdlib.h string.h strings.h sys/time.h unistd.h limits.h c_asm.h intrinsics.h stdint.h mach/mach_time.h sys/sysctl.h
+do :
+  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+save_CFLAGS="$CFLAGS"
+save_CPPFLAGS="$CPPFLAGS"
+CFLAGS="$CFLAGS $ALTIVEC_CFLAGS"
+CPPFLAGS="$CPPFLAGS $ALTIVEC_CFLAGS"
+for ac_header in altivec.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "altivec.h" "ac_cv_header_altivec_h" "$ac_includes_default"
+if test "x$ac_cv_header_altivec_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_ALTIVEC_H 1
+_ACEOF
+
+fi
+
+done
+
+CFLAGS="$save_CFLAGS"
+CPPFLAGS="$save_CPPFLAGS"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5
+$as_echo_n "checking for an ANSI C-conforming const... " >&6; }
+if ${ac_cv_c_const+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+#ifndef __cplusplus
+  /* Ultrix mips cc rejects this sort of thing.  */
+  typedef int charset[2];
+  const charset cs = { 0, 0 };
+  /* SunOS 4.1.1 cc rejects this.  */
+  char const *const *pcpcc;
+  char **ppc;
+  /* NEC SVR4.0.2 mips cc rejects this.  */
+  struct point {int x, y;};
+  static struct point const zero = {0,0};
+  /* AIX XL C 1.02.0.0 rejects this.
+     It does not let you subtract one const X* pointer from another in
+     an arm of an if-expression whose if-part is not a constant
+     expression */
+  const char *g = "string";
+  pcpcc = &g + (g ? g-g : 0);
+  /* HPUX 7.0 cc rejects these. */
+  ++pcpcc;
+  ppc = (char**) pcpcc;
+  pcpcc = (char const *const *) ppc;
+  { /* SCO 3.2v4 cc rejects this sort of thing.  */
+    char tx;
+    char *t = &tx;
+    char const *s = 0 ? (char *) 0 : (char const *) 0;
+
+    *t++ = 0;
+    if (s) return 0;
+  }
+  { /* Someone thinks the Sun supposedly-ANSI compiler will reject this.  */
+    int x[] = {25, 17};
+    const int *foo = &x[0];
+    ++foo;
+  }
+  { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */
+    typedef const int *iptr;
+    iptr p = 0;
+    ++p;
+  }
+  { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying
+       "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
+    struct s { int j; const int *ap[3]; } bx;
+    struct s *b = &bx; b->j = 5;
+  }
+  { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
+    const int foo = 10;
+    if (!foo) return 0;
+  }
+  return !cs[0] && !zero.x;
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_c_const=yes
+else
+  ac_cv_c_const=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_const" >&5
+$as_echo "$ac_cv_c_const" >&6; }
+if test $ac_cv_c_const = no; then
+
+$as_echo "#define const /**/" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inline" >&5
+$as_echo_n "checking for inline... " >&6; }
+if ${ac_cv_c_inline+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_c_inline=no
+for ac_kw in inline __inline__ __inline; do
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifndef __cplusplus
+typedef int foo_t;
+static $ac_kw foo_t static_foo () {return 0; }
+$ac_kw foo_t foo () {return 0; }
+#endif
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_c_inline=$ac_kw
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  test "$ac_cv_c_inline" != no && break
+done
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_inline" >&5
+$as_echo "$ac_cv_c_inline" >&6; }
+
+case $ac_cv_c_inline in
+  inline | yes) ;;
+  *)
+    case $ac_cv_c_inline in
+      no) ac_val=;;
+      *) ac_val=$ac_cv_c_inline;;
+    esac
+    cat >>confdefs.h <<_ACEOF
+#ifndef __cplusplus
+#define inline $ac_val
+#endif
+_ACEOF
+    ;;
+esac
+
+ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default"
+if test "x$ac_cv_type_size_t" = xyes; then :
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define size_t unsigned int
+_ACEOF
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether time.h and sys/time.h may both be included" >&5
+$as_echo_n "checking whether time.h and sys/time.h may both be included... " >&6; }
+if ${ac_cv_header_time+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+
+int
+main ()
+{
+if ((struct tm *) 0)
+return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_header_time=yes
+else
+  ac_cv_header_time=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_time" >&5
+$as_echo "$ac_cv_header_time" >&6; }
+if test $ac_cv_header_time = yes; then
+
+$as_echo "#define TIME_WITH_SYS_TIME 1" >>confdefs.h
+
+fi
+
+ac_fn_c_check_type "$LINENO" "long double" "ac_cv_type_long_double" "$ac_includes_default"
+if test "x$ac_cv_type_long_double" = xyes; then :
+
+$as_echo "#define HAVE_LONG_DOUBLE 1" >>confdefs.h
+
+else
+
+if test $PRECISION = l; then
+    as_fn_error $? "long double is not a supported type with your compiler." "$LINENO" 5
+fi
+
+fi
+
+ac_fn_c_check_type "$LINENO" "hrtime_t" "ac_cv_type_hrtime_t" "
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+"
+if test "x$ac_cv_type_hrtime_t" = xyes; then :
+
+$as_echo "#define HAVE_HRTIME_T 1" >>confdefs.h
+
+fi
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of int" >&5
+$as_echo_n "checking size of int... " >&6; }
+if ${ac_cv_sizeof_int+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (int))" "ac_cv_sizeof_int"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_int" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (int)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_int=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_int" >&5
+$as_echo "$ac_cv_sizeof_int" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_INT $ac_cv_sizeof_int
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned int" >&5
+$as_echo_n "checking size of unsigned int... " >&6; }
+if ${ac_cv_sizeof_unsigned_int+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned int))" "ac_cv_sizeof_unsigned_int"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_unsigned_int" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (unsigned int)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_unsigned_int=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_int" >&5
+$as_echo "$ac_cv_sizeof_unsigned_int" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_UNSIGNED_INT $ac_cv_sizeof_unsigned_int
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of long" >&5
+$as_echo_n "checking size of long... " >&6; }
+if ${ac_cv_sizeof_long+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (long))" "ac_cv_sizeof_long"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_long" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (long)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_long=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_long" >&5
+$as_echo "$ac_cv_sizeof_long" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_LONG $ac_cv_sizeof_long
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long" >&5
+$as_echo_n "checking size of unsigned long... " >&6; }
+if ${ac_cv_sizeof_unsigned_long+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long))" "ac_cv_sizeof_unsigned_long"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_unsigned_long" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (unsigned long)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_unsigned_long=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long" >&5
+$as_echo "$ac_cv_sizeof_unsigned_long" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_UNSIGNED_LONG $ac_cv_sizeof_unsigned_long
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of long long" >&5
+$as_echo_n "checking size of long long... " >&6; }
+if ${ac_cv_sizeof_long_long+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (long long))" "ac_cv_sizeof_long_long"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_long_long" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (long long)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_long_long=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_long_long" >&5
+$as_echo "$ac_cv_sizeof_long_long" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_LONG_LONG $ac_cv_sizeof_long_long
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long long" >&5
+$as_echo_n "checking size of unsigned long long... " >&6; }
+if ${ac_cv_sizeof_unsigned_long_long+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long long))" "ac_cv_sizeof_unsigned_long_long"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_unsigned_long_long" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (unsigned long long)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_unsigned_long_long=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long_long" >&5
+$as_echo "$ac_cv_sizeof_unsigned_long_long" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_UNSIGNED_LONG_LONG $ac_cv_sizeof_unsigned_long_long
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of size_t" >&5
+$as_echo_n "checking size of size_t... " >&6; }
+if ${ac_cv_sizeof_size_t+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (size_t))" "ac_cv_sizeof_size_t"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_size_t" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (size_t)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_size_t=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_size_t" >&5
+$as_echo "$ac_cv_sizeof_size_t" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_SIZE_T $ac_cv_sizeof_size_t
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of ptrdiff_t" >&5
+$as_echo_n "checking size of ptrdiff_t... " >&6; }
+if ${ac_cv_sizeof_ptrdiff_t+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (ptrdiff_t))" "ac_cv_sizeof_ptrdiff_t"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_ptrdiff_t" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (ptrdiff_t)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_ptrdiff_t=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_ptrdiff_t" >&5
+$as_echo "$ac_cv_sizeof_ptrdiff_t" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_PTRDIFF_T $ac_cv_sizeof_ptrdiff_t
+_ACEOF
+
+
+
+ac_fn_c_check_type "$LINENO" "uintptr_t" "ac_cv_type_uintptr_t" "$ac_includes_default
+#ifdef HAVE_STDINT_H
+#  include <stdint.h>
+#endif
+"
+if test "x$ac_cv_type_uintptr_t" = xyes; then :
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_UINTPTR_T 1
+_ACEOF
+
+
+else
+  # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of void *" >&5
+$as_echo_n "checking size of void *... " >&6; }
+if ${ac_cv_sizeof_void_p+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (void *))" "ac_cv_sizeof_void_p"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_void_p" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (void *)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_void_p=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_void_p" >&5
+$as_echo "$ac_cv_sizeof_void_p" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_VOID_P $ac_cv_sizeof_void_p
+_ACEOF
+
+
+fi
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of float" >&5
+$as_echo_n "checking size of float... " >&6; }
+if ${ac_cv_sizeof_float+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (float))" "ac_cv_sizeof_float"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_float" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (float)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_float=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_float" >&5
+$as_echo "$ac_cv_sizeof_float" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_FLOAT $ac_cv_sizeof_float
+_ACEOF
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of double" >&5
+$as_echo_n "checking size of double... " >&6; }
+if ${ac_cv_sizeof_double+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (double))" "ac_cv_sizeof_double"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_double" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (double)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_double=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_double" >&5
+$as_echo "$ac_cv_sizeof_double" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_DOUBLE $ac_cv_sizeof_double
+_ACEOF
+
+
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of fftw_r2r_kind" >&5
+$as_echo_n "checking size of fftw_r2r_kind... " >&6; }
+if ${ac_cv_sizeof_fftw_r2r_kind+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (fftw_r2r_kind))" "ac_cv_sizeof_fftw_r2r_kind"        "typedef enum {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+} fftw_r2r_kind;
+"; then :
+
+else
+  if test "$ac_cv_type_fftw_r2r_kind" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (fftw_r2r_kind)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_fftw_r2r_kind=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_fftw_r2r_kind" >&5
+$as_echo "$ac_cv_sizeof_fftw_r2r_kind" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_FFTW_R2R_KIND $ac_cv_sizeof_fftw_r2r_kind
+_ACEOF
+
+
+if test 0 = $ac_cv_sizeof_fftw_r2r_kind; then as_fn_error $? "sizeof(fftw_r2r_kind) test failed" "$LINENO" 5; fi
+C_FFTW_R2R_KIND=C_INT`expr $ac_cv_sizeof_fftw_r2r_kind \* 8`_T
+
+
+# The Ultrix 4.2 mips builtin alloca declared by alloca.h only works
+# for constant arguments.  Useless!
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5
+$as_echo_n "checking for working alloca.h... " >&6; }
+if ${ac_cv_working_alloca_h+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <alloca.h>
+int
+main ()
+{
+char *p = (char *) alloca (2 * sizeof (int));
+			  if (p) return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_working_alloca_h=yes
+else
+  ac_cv_working_alloca_h=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5
+$as_echo "$ac_cv_working_alloca_h" >&6; }
+if test $ac_cv_working_alloca_h = yes; then
+
+$as_echo "#define HAVE_ALLOCA_H 1" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5
+$as_echo_n "checking for alloca... " >&6; }
+if ${ac_cv_func_alloca_works+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __GNUC__
+# define alloca __builtin_alloca
+#else
+# ifdef _MSC_VER
+#  include <malloc.h>
+#  define alloca _alloca
+# else
+#  ifdef HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   ifdef _AIX
+ #pragma alloca
+#   else
+#    ifndef alloca /* predefined by HP cc +Olibcalls */
+void *alloca (size_t);
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+int
+main ()
+{
+char *p = (char *) alloca (1);
+				    if (p) return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_func_alloca_works=yes
+else
+  ac_cv_func_alloca_works=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5
+$as_echo "$ac_cv_func_alloca_works" >&6; }
+
+if test $ac_cv_func_alloca_works = yes; then
+
+$as_echo "#define HAVE_ALLOCA 1" >>confdefs.h
+
+else
+  # The SVR3 libPW and SVR4 libucb both contain incompatible functions
+# that cause trouble.  Some versions do not even contain alloca or
+# contain a buggy version.  If you still want to use their alloca,
+# use ar to extract alloca.o from them instead of compiling alloca.c.
+
+ALLOCA=\${LIBOBJDIR}alloca.$ac_objext
+
+$as_echo "#define C_ALLOCA 1" >>confdefs.h
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether \`alloca.c' needs Cray hooks" >&5
+$as_echo_n "checking whether \`alloca.c' needs Cray hooks... " >&6; }
+if ${ac_cv_os_cray+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#if defined CRAY && ! defined CRAY2
+webecray
+#else
+wenotbecray
+#endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "webecray" >/dev/null 2>&1; then :
+  ac_cv_os_cray=yes
+else
+  ac_cv_os_cray=no
+fi
+rm -f conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_os_cray" >&5
+$as_echo "$ac_cv_os_cray" >&6; }
+if test $ac_cv_os_cray = yes; then
+  for ac_func in _getb67 GETB67 getb67; do
+    as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define CRAY_STACKSEG_END $ac_func
+_ACEOF
+
+    break
+fi
+
+  done
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5
+$as_echo_n "checking stack direction for C alloca... " >&6; }
+if ${ac_cv_c_stack_direction+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test "$cross_compiling" = yes; then :
+  ac_cv_c_stack_direction=0
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+find_stack_direction (int *addr, int depth)
+{
+  int dir, dummy = 0;
+  if (! addr)
+    addr = &dummy;
+  *addr = addr < &dummy ? 1 : addr == &dummy ? 0 : -1;
+  dir = depth ? find_stack_direction (addr, depth - 1) : 0;
+  return dir + dummy;
+}
+
+int
+main (int argc, char **argv)
+{
+  return find_stack_direction (0, argc + !argv + 20) < 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  ac_cv_c_stack_direction=1
+else
+  ac_cv_c_stack_direction=-1
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5
+$as_echo "$ac_cv_c_stack_direction" >&6; }
+cat >>confdefs.h <<_ACEOF
+#define STACK_DIRECTION $ac_cv_c_stack_direction
+_ACEOF
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working strtod" >&5
+$as_echo_n "checking for working strtod... " >&6; }
+if ${ac_cv_func_strtod+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test "$cross_compiling" = yes; then :
+  ac_cv_func_strtod=no
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+$ac_includes_default
+#ifndef strtod
+double strtod ();
+#endif
+int
+main()
+{
+  {
+    /* Some versions of Linux strtod mis-parse strings with leading '+'.  */
+    char *string = " +69";
+    char *term;
+    double value;
+    value = strtod (string, &term);
+    if (value != 69 || term != (string + 4))
+      return 1;
+  }
+
+  {
+    /* Under Solaris 2.4, strtod returns the wrong value for the
+       terminating character under some conditions.  */
+    char *string = "NaN";
+    char *term;
+    strtod (string, &term);
+    if (term != string && *(term - 1) == 0)
+      return 1;
+  }
+  return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  ac_cv_func_strtod=yes
+else
+  ac_cv_func_strtod=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_strtod" >&5
+$as_echo "$ac_cv_func_strtod" >&6; }
+if test $ac_cv_func_strtod = no; then
+  case " $LIBOBJS " in
+  *" strtod.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS strtod.$ac_objext"
+ ;;
+esac
+
+ac_fn_c_check_func "$LINENO" "pow" "ac_cv_func_pow"
+if test "x$ac_cv_func_pow" = xyes; then :
+
+fi
+
+if test $ac_cv_func_pow = no; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pow in -lm" >&5
+$as_echo_n "checking for pow in -lm... " >&6; }
+if ${ac_cv_lib_m_pow+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lm  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pow ();
+int
+main ()
+{
+return pow ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_m_pow=yes
+else
+  ac_cv_lib_m_pow=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_m_pow" >&5
+$as_echo "$ac_cv_lib_m_pow" >&6; }
+if test "x$ac_cv_lib_m_pow" = xyes; then :
+  POW_LIB=-lm
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot find library containing definition of pow" >&5
+$as_echo "$as_me: WARNING: cannot find library containing definition of pow" >&2;}
+fi
+
+fi
+
+fi
+
+for ac_func in vprintf
+do :
+  ac_fn_c_check_func "$LINENO" "vprintf" "ac_cv_func_vprintf"
+if test "x$ac_cv_func_vprintf" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_VPRINTF 1
+_ACEOF
+
+ac_fn_c_check_func "$LINENO" "_doprnt" "ac_cv_func__doprnt"
+if test "x$ac_cv_func__doprnt" = xyes; then :
+
+$as_echo "#define HAVE_DOPRNT 1" >>confdefs.h
+
+fi
+
+fi
+done
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for sin in -lm" >&5
+$as_echo_n "checking for sin in -lm... " >&6; }
+if ${ac_cv_lib_m_sin+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lm  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char sin ();
+int
+main ()
+{
+return sin ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_m_sin=yes
+else
+  ac_cv_lib_m_sin=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_m_sin" >&5
+$as_echo "$ac_cv_lib_m_sin" >&6; }
+if test "x$ac_cv_lib_m_sin" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBM 1
+_ACEOF
+
+  LIBS="-lm $LIBS"
+
+fi
+
+
+if test $PRECISION = q; then
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using gcc 4.6.0 or later" >&5
+$as_echo_n "checking whether we are using gcc 4.6.0 or later... " >&6; }
+if ${ax_cv_gcc_4_6_0+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+ax_cv_gcc_4_6_0=no
+if test "$GCC" = "yes"; then
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#ifdef __GNUC__
+#  if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 6) \
+   || (__GNUC__ == 4 && __GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ >= 0)
+     yes;
+#  endif
+#endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "yes" >/dev/null 2>&1; then :
+  ax_cv_gcc_4_6_0=yes
+fi
+rm -f conftest*
+
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_4_6_0" >&5
+$as_echo "$ax_cv_gcc_4_6_0" >&6; }
+if test "$ax_cv_gcc_4_6_0" = yes; then
+	:
+else
+	as_fn_error $? "gcc 4.6 or later required for quad precision support" "$LINENO" 5
+fi
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sinq in -lquadmath" >&5
+$as_echo_n "checking for sinq in -lquadmath... " >&6; }
+if ${ac_cv_lib_quadmath_sinq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lquadmath  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char sinq ();
+int
+main ()
+{
+return sinq ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_quadmath_sinq=yes
+else
+  ac_cv_lib_quadmath_sinq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_quadmath_sinq" >&5
+$as_echo "$ac_cv_lib_quadmath_sinq" >&6; }
+if test "x$ac_cv_lib_quadmath_sinq" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBQUADMATH 1
+_ACEOF
+
+  LIBS="-lquadmath $LIBS"
+
+else
+  as_fn_error $? "quad precision requires libquadmath for quad-precision trigonometric routines" "$LINENO" 5
+fi
+
+   LIBQUADMATH=-lquadmath
+fi
+
+
+for ac_func in BSDgettimeofday gettimeofday gethrtime read_real_time time_base_to_time drand48 sqrt memset posix_memalign memalign _mm_malloc _mm_free clock_gettime mach_absolute_time sysctl abort sinl cosl snprintf
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+ac_fn_c_check_decl "$LINENO" "drand48" "ac_cv_have_decl_drand48" "$ac_includes_default"
+if test "x$ac_cv_have_decl_drand48" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_DRAND48 $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "srand48" "ac_cv_have_decl_srand48" "$ac_includes_default"
+if test "x$ac_cv_have_decl_srand48" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_SRAND48 $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "memalign" "ac_cv_have_decl_memalign" "$ac_includes_default"
+if test "x$ac_cv_have_decl_memalign" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_MEMALIGN $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "posix_memalign" "ac_cv_have_decl_posix_memalign" "$ac_includes_default"
+if test "x$ac_cv_have_decl_posix_memalign" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_POSIX_MEMALIGN $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "sinl" "ac_cv_have_decl_sinl" "$ac_includes_default"
+if test "x$ac_cv_have_decl_sinl" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_SINL $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "cosl" "ac_cv_have_decl_cosl" "$ac_includes_default"
+if test "x$ac_cv_have_decl_cosl" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_COSL $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "sinq" "ac_cv_have_decl_sinq" "$ac_includes_default"
+if test "x$ac_cv_have_decl_sinq" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_SINQ $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "cosq" "ac_cv_have_decl_cosq" "$ac_includes_default"
+if test "x$ac_cv_have_decl_cosq" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_COSQ $ac_have_decl
+_ACEOF
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _rtc intrinsic" >&5
+$as_echo_n "checking for _rtc intrinsic... " >&6; }
+rtc_ok=yes
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif
+int
+main ()
+{
+_rtc()
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+$as_echo "#define HAVE__RTC 1" >>confdefs.h
+
+else
+  rtc_ok=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $rtc_ok" >&5
+$as_echo "$rtc_ok" >&6; }
+
+if test "$PRECISION" = "l"; then
+	for ac_func in cosl sinl tanl
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+else
+  as_fn_error $? "long-double precision requires long-double trigonometric routines" "$LINENO" 5
+fi
+done
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for isnan" >&5
+$as_echo_n "checking for isnan... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <math.h>
+
+int
+main ()
+{
+if (!isnan(3.14159)) isnan(2.7183);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ok=yes
+else
+  ok=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+if test "$ok" = "yes"; then
+
+$as_echo "#define HAVE_ISNAN 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ok}" >&5
+$as_echo "${ok}" >&6; }
+
+
+
+ax_gcc_aligns_stack=no
+if test "$GCC" = "yes"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mpreferred-stack-boundary=4" >&5
+$as_echo_n "checking whether C compiler accepts -mpreferred-stack-boundary=4... " >&6; }
+if ${ax_cv_c_flags__mpreferred_stack_boundary_4+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-mpreferred-stack-boundary=4"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__mpreferred_stack_boundary_4=yes
+else
+  ax_cv_c_flags__mpreferred_stack_boundary_4=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__mpreferred_stack_boundary_4
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the stack is at least 8-byte aligned by gcc" >&5
+$as_echo_n "checking whether the stack is at least 8-byte aligned by gcc... " >&6; }
+	save_CFLAGS="$CFLAGS"
+	CFLAGS="-O"
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -malign-double" >&5
+$as_echo_n "checking whether C compiler accepts -malign-double... " >&6; }
+if ${ax_cv_c_flags__malign_double+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+      ax_save_FLAGS=$CFLAGS
+      CFLAGS="-malign-double"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_c_flags__malign_double=yes
+else
+  ax_cv_c_flags__malign_double=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+      CFLAGS=$ax_save_FLAGS
+fi
+
+eval ax_check_compiler_flags=$ax_cv_c_flags__malign_double
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_check_compiler_flags" >&5
+$as_echo "$ax_check_compiler_flags" >&6; }
+if test "x$ax_check_compiler_flags" = xyes; then
+	CFLAGS="$CFLAGS -malign-double"
+else
+	:
+fi
+
+	if test "$cross_compiling" = yes; then :
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using gcc 3.0.0 or later" >&5
+$as_echo_n "checking whether we are using gcc 3.0.0 or later... " >&6; }
+if ${ax_cv_gcc_3_0_0+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+ax_cv_gcc_3_0_0=no
+if test "$GCC" = "yes"; then
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#ifdef __GNUC__
+#  if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ > 0) \
+   || (__GNUC__ == 3 && __GNUC_MINOR__ == 0 && __GNUC_PATCHLEVEL__ >= 0)
+     yes;
+#  endif
+#endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "yes" >/dev/null 2>&1; then :
+  ax_cv_gcc_3_0_0=yes
+fi
+rm -f conftest*
+
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_3_0_0" >&5
+$as_echo "$ax_cv_gcc_3_0_0" >&6; }
+if test "$ax_cv_gcc_3_0_0" = yes; then
+	ax_gcc_stack_align_bug=no
+else
+	ax_gcc_stack_align_bug=yes
+fi
+
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+#       include <stdio.h>
+	struct yuck { int blechh; };
+	int one(void) { return 1; }
+	struct yuck ick(void) { struct yuck y; y.blechh = 3; return y; }
+#       define CHK_ALIGN(x) if ((((long) &(x)) & 0x7)) { fprintf(stderr, "bad alignment of " #x "\n"); exit(1); }
+	void blah(int foo) { double foobar; CHK_ALIGN(foobar); }
+	int main2(void) {double ok1; struct yuck y; double ok2; CHK_ALIGN(ok1);
+                         CHK_ALIGN(ok2); y = ick(); blah(one()); return 0;}
+	int main(void) { if ((((long) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4); return main2(); }
+
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  ax_gcc_aligns_stack=yes; ax_gcc_stack_align_bug=no
+else
+  ax_gcc_stack_align_bug=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+	CFLAGS="$save_CFLAGS"
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_gcc_aligns_stack" >&5
+$as_echo "$ax_gcc_aligns_stack" >&6; }
+
+else
+	:
+fi
+
+fi
+if test "$ax_gcc_aligns_stack" = yes; then
+	:
+else
+	:
+fi
+
+
+if test "${enable_debug}" = "yes"; then
+	CFLAGS="-g"
+fi
+
+if test "$enable_debug" = yes || test "$USE_MAINTAINER_MODE" = yes; then
+if test "$ac_test_CFLAGS" != "set"; then
+	if test $ac_cv_c_compiler_gnu = yes; then
+		CFLAGS="$CFLAGS -Wall -W -Wcast-qual -Wpointer-arith -Wcast-align -pedantic -Wno-long-long -Wshadow -Wbad-function-cast -Wwrite-strings -Wstrict-prototypes -Wredundant-decls -Wnested-externs" # -Wundef -Wconversion -Wmissing-prototypes -Wmissing-declarations
+	fi
+fi
+fi
+
+
+# Check whether --enable-fortran was given.
+if test "${enable_fortran+set}" = set; then :
+  enableval=$enable_fortran; enable_fortran=$enableval
+else
+  enable_fortran=yes
+fi
+
+
+if test "$enable_fortran" = "yes"; then
+        ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+  for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$F77"; then
+  ac_cv_prog_F77="$F77" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_F77="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+F77=$ac_cv_prog_F77
+if test -n "$F77"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $F77" >&5
+$as_echo "$F77" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$F77" && break
+  done
+fi
+if test -z "$F77"; then
+  ac_ct_F77=$F77
+  for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_F77"; then
+  ac_cv_prog_ac_ct_F77="$ac_ct_F77" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_F77="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_F77=$ac_cv_prog_ac_ct_F77
+if test -n "$ac_ct_F77"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_F77" >&5
+$as_echo "$ac_ct_F77" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_F77" && break
+done
+
+  if test "x$ac_ct_F77" = x; then
+    F77=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    F77=$ac_ct_F77
+  fi
+fi
+
+
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+rm -f a.out
+
+# If we don't use `.F' as extension, the preprocessor is not run on the
+# input file.  (Note that this only needs to work for GNU compilers.)
+ac_save_ext=$ac_ext
+ac_ext=F
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran 77 compiler" >&5
+$as_echo_n "checking whether we are using the GNU Fortran 77 compiler... " >&6; }
+if ${ac_cv_f77_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.$ac_ext <<_ACEOF
+      program main
+#ifndef __GNUC__
+       choke me
+#endif
+
+      end
+_ACEOF
+if ac_fn_f77_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_f77_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_compiler_gnu" >&5
+$as_echo "$ac_cv_f77_compiler_gnu" >&6; }
+ac_ext=$ac_save_ext
+ac_test_FFLAGS=${FFLAGS+set}
+ac_save_FFLAGS=$FFLAGS
+FFLAGS=
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $F77 accepts -g" >&5
+$as_echo_n "checking whether $F77 accepts -g... " >&6; }
+if ${ac_cv_prog_f77_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  FFLAGS=-g
+cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+if ac_fn_f77_try_compile "$LINENO"; then :
+  ac_cv_prog_f77_g=yes
+else
+  ac_cv_prog_f77_g=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_g" >&5
+$as_echo "$ac_cv_prog_f77_g" >&6; }
+if test "$ac_test_FFLAGS" = set; then
+  FFLAGS=$ac_save_FFLAGS
+elif test $ac_cv_prog_f77_g = yes; then
+  if test "x$ac_cv_f77_compiler_gnu" = xyes; then
+    FFLAGS="-g -O2"
+  else
+    FFLAGS="-g"
+  fi
+else
+  if test "x$ac_cv_f77_compiler_gnu" = xyes; then
+    FFLAGS="-O2"
+  else
+    FFLAGS=
+  fi
+fi
+
+if test $ac_compiler_gnu = yes; then
+  G77=yes
+else
+  G77=
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+      ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+
+if test -z "$F77" || test "X$F77" = "Xno"; then
+  _lt_disable_F77=yes
+fi
+
+archive_cmds_need_lc_F77=no
+allow_undefined_flag_F77=
+always_export_symbols_F77=no
+archive_expsym_cmds_F77=
+export_dynamic_flag_spec_F77=
+hardcode_direct_F77=no
+hardcode_direct_absolute_F77=no
+hardcode_libdir_flag_spec_F77=
+hardcode_libdir_separator_F77=
+hardcode_minus_L_F77=no
+hardcode_automatic_F77=no
+inherit_rpath_F77=no
+module_cmds_F77=
+module_expsym_cmds_F77=
+link_all_deplibs_F77=unknown
+old_archive_cmds_F77=$old_archive_cmds
+reload_flag_F77=$reload_flag
+reload_cmds_F77=$reload_cmds
+no_undefined_flag_F77=
+whole_archive_flag_spec_F77=
+enable_shared_with_static_runtimes_F77=no
+
+# Source file extension for f77 test sources.
+ac_ext=f
+
+# Object file extension for compiled f77 test sources.
+objext=o
+objext_F77=$objext
+
+# No sense in running all these tests if we already determined that
+# the F77 compiler isn't working.  Some variables (like enable_shared)
+# are currently assumed to apply to all compilers on this platform,
+# and will be corrupted by setting them based on a non-working compiler.
+if test "$_lt_disable_F77" != yes; then
+  # Code to be used in simple compile tests
+  lt_simple_compile_test_code="\
+      subroutine t
+      return
+      end
+"
+
+  # Code to be used in simple link tests
+  lt_simple_link_test_code="\
+      program t
+      end
+"
+
+  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
+
+
+
+
+
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# If no C compiler flags were specified, use CFLAGS.
+LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
+
+# Allow CC to be a program name with arguments.
+compiler=$CC
+
+
+  # save warnings/boilerplate of simple test code
+  ac_outfile=conftest.$ac_objext
+echo "$lt_simple_compile_test_code" >conftest.$ac_ext
+eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_compiler_boilerplate=`cat conftest.err`
+$RM conftest*
+
+  ac_outfile=conftest.$ac_objext
+echo "$lt_simple_link_test_code" >conftest.$ac_ext
+eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_linker_boilerplate=`cat conftest.err`
+$RM -r conftest*
+
+
+  # Allow CC to be a program name with arguments.
+  lt_save_CC="$CC"
+  lt_save_GCC=$GCC
+  lt_save_CFLAGS=$CFLAGS
+  CC=${F77-"f77"}
+  CFLAGS=$FFLAGS
+  compiler=$CC
+  compiler_F77=$CC
+  for cc_temp in $compiler""; do
+  case $cc_temp in
+    compile | *[\\/]compile | ccache | *[\\/]ccache ) ;;
+    distcc | *[\\/]distcc | purify | *[\\/]purify ) ;;
+    \-*) ;;
+    *) break;;
+  esac
+done
+cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+
+  GCC=$G77
+  if test -n "$compiler"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if libtool supports shared libraries" >&5
+$as_echo_n "checking if libtool supports shared libraries... " >&6; }
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $can_build_shared" >&5
+$as_echo "$can_build_shared" >&6; }
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build shared libraries" >&5
+$as_echo_n "checking whether to build shared libraries... " >&6; }
+    test "$can_build_shared" = "no" && enable_shared=no
+
+    # On AIX, shared libraries and static libraries use the same namespace, and
+    # are all built from PIC.
+    case $host_os in
+      aix3*)
+        test "$enable_shared" = yes && enable_static=no
+        if test -n "$RANLIB"; then
+          archive_cmds="$archive_cmds~\$RANLIB \$lib"
+          postinstall_cmds='$RANLIB $lib'
+        fi
+        ;;
+      aix[4-9]*)
+	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+	  test "$enable_shared" = yes && enable_static=no
+	fi
+        ;;
+    esac
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_shared" >&5
+$as_echo "$enable_shared" >&6; }
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build static libraries" >&5
+$as_echo_n "checking whether to build static libraries... " >&6; }
+    # Make sure either enable_shared or enable_static is yes.
+    test "$enable_shared" = yes || enable_static=yes
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_static" >&5
+$as_echo "$enable_static" >&6; }
+
+    GCC_F77="$G77"
+    LD_F77="$LD"
+
+    ## CAVEAT EMPTOR:
+    ## There is no encapsulation within the following macros, do not change
+    ## the running order or otherwise move them around unless you know exactly
+    ## what you are doing...
+    lt_prog_compiler_wl_F77=
+lt_prog_compiler_pic_F77=
+lt_prog_compiler_static_F77=
+
+
+  if test "$GCC" = yes; then
+    lt_prog_compiler_wl_F77='-Wl,'
+    lt_prog_compiler_static_F77='-static'
+
+    case $host_os in
+      aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_prog_compiler_static_F77='-Bstatic'
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            lt_prog_compiler_pic_F77='-fPIC'
+        ;;
+      m68k)
+            # FIXME: we need at least 68020 code to build shared libraries, but
+            # adding the `-m68020' flag to GCC prevents building anything better,
+            # like `-m68040'.
+            lt_prog_compiler_pic_F77='-m68020 -resident32 -malways-restore-a4'
+        ;;
+      esac
+      ;;
+
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      # Although the cygwin gcc ignores -fPIC, still need this for old-style
+      # (--disable-auto-import) libraries
+      lt_prog_compiler_pic_F77='-DDLL_EXPORT'
+      ;;
+
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_prog_compiler_pic_F77='-fno-common'
+      ;;
+
+    haiku*)
+      # PIC is the default for Haiku.
+      # The "-static" flag exists, but is broken.
+      lt_prog_compiler_static_F77=
+      ;;
+
+    hpux*)
+      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
+      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
+      # sets the default TLS model and affects inlining.
+      case $host_cpu in
+      hppa*64*)
+	# +Z the default
+	;;
+      *)
+	lt_prog_compiler_pic_F77='-fPIC'
+	;;
+      esac
+      ;;
+
+    interix[3-9]*)
+      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
+      # Instead, we relocate shared libraries at runtime.
+      ;;
+
+    msdosdjgpp*)
+      # Just because we use GCC doesn't mean we suddenly get shared libraries
+      # on systems that don't support them.
+      lt_prog_compiler_can_build_shared_F77=no
+      enable_shared=no
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      lt_prog_compiler_pic_F77='-fPIC -shared'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	lt_prog_compiler_pic_F77=-Kconform_pic
+      fi
+      ;;
+
+    *)
+      lt_prog_compiler_pic_F77='-fPIC'
+      ;;
+    esac
+
+    case $cc_basename in
+    nvcc*) # Cuda Compiler Driver 2.2
+      lt_prog_compiler_wl_F77='-Xlinker '
+      if test -n "$lt_prog_compiler_pic_F77"; then
+        lt_prog_compiler_pic_F77="-Xcompiler $lt_prog_compiler_pic_F77"
+      fi
+      ;;
+    esac
+  else
+    # PORTME Check for flag to pass linker flags through the system compiler.
+    case $host_os in
+    aix*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_prog_compiler_static_F77='-Bstatic'
+      else
+	lt_prog_compiler_static_F77='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_prog_compiler_pic_F77='-DDLL_EXPORT'
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      # PIC is the default for IA64 HP-UX and 64-bit HP-UX, but
+      # not for PA HP-UX.
+      case $host_cpu in
+      hppa*64*|ia64*)
+	# +Z the default
+	;;
+      *)
+	lt_prog_compiler_pic_F77='+Z'
+	;;
+      esac
+      # Is there a better lt_prog_compiler_static that works with the bundled CC?
+      lt_prog_compiler_static_F77='${wl}-a ${wl}archive'
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      # PIC (with -KPIC) is the default.
+      lt_prog_compiler_static_F77='-non_shared'
+      ;;
+
+    linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+      case $cc_basename in
+      # old Intel for x86_64 which still supported -KPIC.
+      ecc*)
+	lt_prog_compiler_wl_F77='-Wl,'
+	lt_prog_compiler_pic_F77='-KPIC'
+	lt_prog_compiler_static_F77='-static'
+        ;;
+      # icc used to be incompatible with GCC.
+      # ICC 10 doesn't accept -KPIC any more.
+      icc* | ifort*)
+	lt_prog_compiler_wl_F77='-Wl,'
+	lt_prog_compiler_pic_F77='-fPIC'
+	lt_prog_compiler_static_F77='-static'
+        ;;
+      # Lahey Fortran 8.1.
+      lf95*)
+	lt_prog_compiler_wl_F77='-Wl,'
+	lt_prog_compiler_pic_F77='--shared'
+	lt_prog_compiler_static_F77='--static'
+	;;
+      nagfor*)
+	# NAG Fortran compiler
+	lt_prog_compiler_wl_F77='-Wl,-Wl,,'
+	lt_prog_compiler_pic_F77='-PIC'
+	lt_prog_compiler_static_F77='-Bstatic'
+	;;
+      pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)
+        # Portland Group compilers (*not* the Pentium gcc compiler,
+	# which looks to be a dead project)
+	lt_prog_compiler_wl_F77='-Wl,'
+	lt_prog_compiler_pic_F77='-fpic'
+	lt_prog_compiler_static_F77='-Bstatic'
+        ;;
+      ccc*)
+        lt_prog_compiler_wl_F77='-Wl,'
+        # All Alpha code is PIC.
+        lt_prog_compiler_static_F77='-non_shared'
+        ;;
+      xl* | bgxl* | bgf* | mpixl*)
+	# IBM XL C 8.0/Fortran 10.1, 11.1 on PPC and BlueGene
+	lt_prog_compiler_wl_F77='-Wl,'
+	lt_prog_compiler_pic_F77='-qpic'
+	lt_prog_compiler_static_F77='-qstaticlink'
+	;;
+      *)
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ Ceres\ Fortran* | *Sun*Fortran*\ [1-7].* | *Sun*Fortran*\ 8.[0-3]*)
+	  # Sun Fortran 8.3 passes all unrecognized flags to the linker
+	  lt_prog_compiler_pic_F77='-KPIC'
+	  lt_prog_compiler_static_F77='-Bstatic'
+	  lt_prog_compiler_wl_F77=''
+	  ;;
+	*Sun\ F* | *Sun*Fortran*)
+	  lt_prog_compiler_pic_F77='-KPIC'
+	  lt_prog_compiler_static_F77='-Bstatic'
+	  lt_prog_compiler_wl_F77='-Qoption ld '
+	  ;;
+	*Sun\ C*)
+	  # Sun C 5.9
+	  lt_prog_compiler_pic_F77='-KPIC'
+	  lt_prog_compiler_static_F77='-Bstatic'
+	  lt_prog_compiler_wl_F77='-Wl,'
+	  ;;
+        *Intel*\ [CF]*Compiler*)
+	  lt_prog_compiler_wl_F77='-Wl,'
+	  lt_prog_compiler_pic_F77='-fPIC'
+	  lt_prog_compiler_static_F77='-static'
+	  ;;
+	*Portland\ Group*)
+	  lt_prog_compiler_wl_F77='-Wl,'
+	  lt_prog_compiler_pic_F77='-fpic'
+	  lt_prog_compiler_static_F77='-Bstatic'
+	  ;;
+	esac
+	;;
+      esac
+      ;;
+
+    newsos6)
+      lt_prog_compiler_pic_F77='-KPIC'
+      lt_prog_compiler_static_F77='-Bstatic'
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      lt_prog_compiler_pic_F77='-fPIC -shared'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      # All OSF/1 code is PIC.
+      lt_prog_compiler_static_F77='-non_shared'
+      ;;
+
+    rdos*)
+      lt_prog_compiler_static_F77='-non_shared'
+      ;;
+
+    solaris*)
+      lt_prog_compiler_pic_F77='-KPIC'
+      lt_prog_compiler_static_F77='-Bstatic'
+      case $cc_basename in
+      f77* | f90* | f95* | sunf77* | sunf90* | sunf95*)
+	lt_prog_compiler_wl_F77='-Qoption ld ';;
+      *)
+	lt_prog_compiler_wl_F77='-Wl,';;
+      esac
+      ;;
+
+    sunos4*)
+      lt_prog_compiler_wl_F77='-Qoption ld '
+      lt_prog_compiler_pic_F77='-PIC'
+      lt_prog_compiler_static_F77='-Bstatic'
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      lt_prog_compiler_pic_F77='-KPIC'
+      lt_prog_compiler_static_F77='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+	lt_prog_compiler_pic_F77='-Kconform_pic'
+	lt_prog_compiler_static_F77='-Bstatic'
+      fi
+      ;;
+
+    sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      lt_prog_compiler_pic_F77='-KPIC'
+      lt_prog_compiler_static_F77='-Bstatic'
+      ;;
+
+    unicos*)
+      lt_prog_compiler_wl_F77='-Wl,'
+      lt_prog_compiler_can_build_shared_F77=no
+      ;;
+
+    uts4*)
+      lt_prog_compiler_pic_F77='-pic'
+      lt_prog_compiler_static_F77='-Bstatic'
+      ;;
+
+    *)
+      lt_prog_compiler_can_build_shared_F77=no
+      ;;
+    esac
+  fi
+
+case $host_os in
+  # For platforms which do not support PIC, -DPIC is meaningless:
+  *djgpp*)
+    lt_prog_compiler_pic_F77=
+    ;;
+  *)
+    lt_prog_compiler_pic_F77="$lt_prog_compiler_pic_F77"
+    ;;
+esac
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $compiler option to produce PIC" >&5
+$as_echo_n "checking for $compiler option to produce PIC... " >&6; }
+if ${lt_cv_prog_compiler_pic_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic_F77=$lt_prog_compiler_pic_F77
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_F77" >&5
+$as_echo "$lt_cv_prog_compiler_pic_F77" >&6; }
+lt_prog_compiler_pic_F77=$lt_cv_prog_compiler_pic_F77
+
+#
+# Check to make sure the PIC flag actually works.
+#
+if test -n "$lt_prog_compiler_pic_F77"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler PIC flag $lt_prog_compiler_pic_F77 works" >&5
+$as_echo_n "checking if $compiler PIC flag $lt_prog_compiler_pic_F77 works... " >&6; }
+if ${lt_cv_prog_compiler_pic_works_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic_works_F77=no
+   ac_outfile=conftest.$ac_objext
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+   lt_compiler_flag="$lt_prog_compiler_pic_F77"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   # The option is referenced via a variable to avoid confusing sed.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>conftest.err)
+   ac_status=$?
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s "$ac_outfile"; then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings other than the usual output.
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
+     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_pic_works_F77=yes
+     fi
+   fi
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_works_F77" >&5
+$as_echo "$lt_cv_prog_compiler_pic_works_F77" >&6; }
+
+if test x"$lt_cv_prog_compiler_pic_works_F77" = xyes; then
+    case $lt_prog_compiler_pic_F77 in
+     "" | " "*) ;;
+     *) lt_prog_compiler_pic_F77=" $lt_prog_compiler_pic_F77" ;;
+     esac
+else
+    lt_prog_compiler_pic_F77=
+     lt_prog_compiler_can_build_shared_F77=no
+fi
+
+fi
+
+
+
+
+
+#
+# Check to make sure the static flag actually works.
+#
+wl=$lt_prog_compiler_wl_F77 eval lt_tmp_static_flag=\"$lt_prog_compiler_static_F77\"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler static flag $lt_tmp_static_flag works" >&5
+$as_echo_n "checking if $compiler static flag $lt_tmp_static_flag works... " >&6; }
+if ${lt_cv_prog_compiler_static_works_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_static_works_F77=no
+   save_LDFLAGS="$LDFLAGS"
+   LDFLAGS="$LDFLAGS $lt_tmp_static_flag"
+   echo "$lt_simple_link_test_code" > conftest.$ac_ext
+   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
+     # The linker can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     if test -s conftest.err; then
+       # Append any errors to the config.log.
+       cat conftest.err 1>&5
+       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
+       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+       if diff conftest.exp conftest.er2 >/dev/null; then
+         lt_cv_prog_compiler_static_works_F77=yes
+       fi
+     else
+       lt_cv_prog_compiler_static_works_F77=yes
+     fi
+   fi
+   $RM -r conftest*
+   LDFLAGS="$save_LDFLAGS"
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_static_works_F77" >&5
+$as_echo "$lt_cv_prog_compiler_static_works_F77" >&6; }
+
+if test x"$lt_cv_prog_compiler_static_works_F77" = xyes; then
+    :
+else
+    lt_prog_compiler_static_F77=
+fi
+
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o_F77=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o_F77=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_F77" >&5
+$as_echo "$lt_cv_prog_compiler_c_o_F77" >&6; }
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o_F77=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o_F77=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_F77" >&5
+$as_echo "$lt_cv_prog_compiler_c_o_F77" >&6; }
+
+
+
+
+hard_links="nottested"
+if test "$lt_cv_prog_compiler_c_o_F77" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if we can lock with hard links" >&5
+$as_echo_n "checking if we can lock with hard links... " >&6; }
+  hard_links=yes
+  $RM conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hard_links" >&5
+$as_echo "$hard_links" >&6; }
+  if test "$hard_links" = no; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&5
+$as_echo "$as_me: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2;}
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
+$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
+
+  runpath_var=
+  allow_undefined_flag_F77=
+  always_export_symbols_F77=no
+  archive_cmds_F77=
+  archive_expsym_cmds_F77=
+  compiler_needs_object_F77=no
+  enable_shared_with_static_runtimes_F77=no
+  export_dynamic_flag_spec_F77=
+  export_symbols_cmds_F77='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+  hardcode_automatic_F77=no
+  hardcode_direct_F77=no
+  hardcode_direct_absolute_F77=no
+  hardcode_libdir_flag_spec_F77=
+  hardcode_libdir_separator_F77=
+  hardcode_minus_L_F77=no
+  hardcode_shlibpath_var_F77=unsupported
+  inherit_rpath_F77=no
+  link_all_deplibs_F77=unknown
+  module_cmds_F77=
+  module_expsym_cmds_F77=
+  old_archive_from_new_cmds_F77=
+  old_archive_from_expsyms_cmds_F77=
+  thread_safe_flag_spec_F77=
+  whole_archive_flag_spec_F77=
+  # include_expsyms should be a list of space-separated symbols to be *always*
+  # included in the symbol list
+  include_expsyms_F77=
+  # exclude_expsyms can be an extended regexp of symbols to exclude
+  # it will be wrapped by ` (' and `)$', so one must not match beginning or
+  # end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+  # as well as any symbol that contains `d'.
+  exclude_expsyms_F77='_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'
+  # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+  # platforms (ab)use it in PIC code, but their linkers get confused if
+  # the symbol is explicitly referenced.  Since portable code cannot
+  # rely on this symbol name, it's probably fine to never include it in
+  # preloaded symbol tables.
+  # Exclude shared library initialization/finalization symbols.
+  extract_expsyms_cmds=
+
+  case $host_os in
+  cygwin* | mingw* | pw32* | cegcc*)
+    # FIXME: the MSVC++ port hasn't been tested in a loooong time
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    if test "$GCC" != yes; then
+      with_gnu_ld=no
+    fi
+    ;;
+  interix*)
+    # we just hope/assume this is gcc and not c89 (= MSVC++)
+    with_gnu_ld=yes
+    ;;
+  openbsd*)
+    with_gnu_ld=no
+    ;;
+  linux* | k*bsd*-gnu | gnu*)
+    link_all_deplibs_F77=no
+    ;;
+  esac
+
+  ld_shlibs_F77=yes
+
+  # On some targets, GNU ld is compatible enough with the native linker
+  # that we're better off using the native interface for both.
+  lt_use_gnu_ld_interface=no
+  if test "$with_gnu_ld" = yes; then
+    case $host_os in
+      aix*)
+	# The AIX port of GNU ld has always aspired to compatibility
+	# with the native linker.  However, as the warning in the GNU ld
+	# block says, versions before 2.19.5* couldn't really create working
+	# shared libraries, regardless of the interface used.
+	case `$LD -v 2>&1` in
+	  *\ \(GNU\ Binutils\)\ 2.19.5*) ;;
+	  *\ \(GNU\ Binutils\)\ 2.[2-9]*) ;;
+	  *\ \(GNU\ Binutils\)\ [3-9]*) ;;
+	  *)
+	    lt_use_gnu_ld_interface=yes
+	    ;;
+	esac
+	;;
+      *)
+	lt_use_gnu_ld_interface=yes
+	;;
+    esac
+  fi
+
+  if test "$lt_use_gnu_ld_interface" = yes; then
+    # If archive_cmds runs LD, not CC, wlarc should be empty
+    wlarc='${wl}'
+
+    # Set some defaults for GNU ld with shared library support. These
+    # are reset later if shared libraries are not supported. Putting them
+    # here allows them to be overridden if necessary.
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+    export_dynamic_flag_spec_F77='${wl}--export-dynamic'
+    # ancient GNU ld didn't support --whole-archive et. al.
+    if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then
+      whole_archive_flag_spec_F77="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+    else
+      whole_archive_flag_spec_F77=
+    fi
+    supports_anon_versioning=no
+    case `$LD -v 2>&1` in
+      *GNU\ gold*) supports_anon_versioning=yes ;;
+      *\ [01].* | *\ 2.[0-9].* | *\ 2.10.*) ;; # catch versions < 2.11
+      *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ...
+      *\ 2.11.92.0.12\ *) supports_anon_versioning=yes ;; # Mandrake 8.2 ...
+      *\ 2.11.*) ;; # other 2.11 versions
+      *) supports_anon_versioning=yes ;;
+    esac
+
+    # See if GNU ld supports shared libraries.
+    case $host_os in
+    aix[3-9]*)
+      # On AIX/PPC, the GNU linker is very broken
+      if test "$host_cpu" != ia64; then
+	ld_shlibs_F77=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.19, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to install binutils
+*** 2.20 or above, or modify your PATH so that a non-GNU linker is found.
+*** You will then need to restart the configuration process.
+
+_LT_EOF
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            archive_expsym_cmds_F77=''
+        ;;
+      m68k)
+            archive_cmds_F77='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            hardcode_libdir_flag_spec_F77='-L$libdir'
+            hardcode_minus_L_F77=yes
+        ;;
+      esac
+      ;;
+
+    beos*)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	allow_undefined_flag_F77=unsupported
+	# Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+	# support --undefined.  This deserves some investigation.  FIXME
+	archive_cmds_F77='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      else
+	ld_shlibs_F77=no
+      fi
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # _LT_TAGVAR(hardcode_libdir_flag_spec, F77) is actually meaningless,
+      # as there is no search path for DLLs.
+      hardcode_libdir_flag_spec_F77='-L$libdir'
+      export_dynamic_flag_spec_F77='${wl}--export-all-symbols'
+      allow_undefined_flag_F77=unsupported
+      always_export_symbols_F77=no
+      enable_shared_with_static_runtimes_F77=yes
+      export_symbols_cmds_F77='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1 DATA/;s/^.*[ ]__nm__\([^ ]*\)[ ][^ ]*/\1 DATA/;/^I[ ]/d;/^[AITW][ ]/s/.* //'\'' | sort | uniq > $export_symbols'
+      exclude_expsyms_F77='[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'
+
+      if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
+        archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	# If the export-symbols file already is a .def file (1st line
+	# is EXPORTS), use it as is; otherwise, prepend...
+	archive_expsym_cmds_F77='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	  cp $export_symbols $output_objdir/$soname.def;
+	else
+	  echo EXPORTS > $output_objdir/$soname.def;
+	  cat $export_symbols >> $output_objdir/$soname.def;
+	fi~
+	$CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+      else
+	ld_shlibs_F77=no
+      fi
+      ;;
+
+    haiku*)
+      archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      link_all_deplibs_F77=yes
+      ;;
+
+    interix[3-9]*)
+      hardcode_direct_F77=no
+      hardcode_shlibpath_var_F77=no
+      hardcode_libdir_flag_spec_F77='${wl}-rpath,$libdir'
+      export_dynamic_flag_spec_F77='${wl}-E'
+      # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
+      # Instead, shared libraries are loaded at an image base (0x10000000 by
+      # default) and relocated if they conflict, which is a slow very memory
+      # consuming and fragmenting process.  To avoid this, we pick a random,
+      # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
+      # time.  Moving up from 0x10000000 also allows more sbrk(2) space.
+      archive_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      archive_expsym_cmds_F77='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      ;;
+
+    gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu)
+      tmp_diet=no
+      if test "$host_os" = linux-dietlibc; then
+	case $cc_basename in
+	  diet\ *) tmp_diet=yes;;	# linux-dietlibc with static linking (!diet-dyn)
+	esac
+      fi
+      if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \
+	 && test "$tmp_diet" = no
+      then
+	tmp_addflag=' $pic_flag'
+	tmp_sharedflag='-shared'
+	case $cc_basename,$host_cpu in
+        pgcc*)				# Portland Group C compiler
+	  whole_archive_flag_spec_F77='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag'
+	  ;;
+	pgf77* | pgf90* | pgf95* | pgfortran*)
+					# Portland Group f77 and f90 compilers
+	  whole_archive_flag_spec_F77='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag -Mnomain' ;;
+	ecc*,ia64* | icc*,ia64*)	# Intel C compiler on ia64
+	  tmp_addflag=' -i_dynamic' ;;
+	efc*,ia64* | ifort*,ia64*)	# Intel Fortran compiler on ia64
+	  tmp_addflag=' -i_dynamic -nofor_main' ;;
+	ifc* | ifort*)			# Intel Fortran compiler
+	  tmp_addflag=' -nofor_main' ;;
+	lf95*)				# Lahey Fortran 8.1
+	  whole_archive_flag_spec_F77=
+	  tmp_sharedflag='--shared' ;;
+	xl[cC]* | bgxl[cC]* | mpixl[cC]*) # IBM XL C 8.0 on PPC (deal with xlf below)
+	  tmp_sharedflag='-qmkshrobj'
+	  tmp_addflag= ;;
+	nvcc*)	# Cuda Compiler Driver 2.2
+	  whole_archive_flag_spec_F77='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  compiler_needs_object_F77=yes
+	  ;;
+	esac
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ C*)			# Sun C 5.9
+	  whole_archive_flag_spec_F77='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  compiler_needs_object_F77=yes
+	  tmp_sharedflag='-G' ;;
+	*Sun\ F*)			# Sun Fortran 8.3
+	  tmp_sharedflag='-G' ;;
+	esac
+	archive_cmds_F77='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+
+        if test "x$supports_anon_versioning" = xyes; then
+          archive_expsym_cmds_F77='echo "{ global:" > $output_objdir/$libname.ver~
+	    cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	    echo "local: *; };" >> $output_objdir/$libname.ver~
+	    $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+        fi
+
+	case $cc_basename in
+	xlf* | bgf* | bgxlf* | mpixlf*)
+	  # IBM XL Fortran 10.1 on PPC cannot create shared libs itself
+	  whole_archive_flag_spec_F77='--whole-archive$convenience --no-whole-archive'
+	  hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+	  archive_cmds_F77='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib'
+	  if test "x$supports_anon_versioning" = xyes; then
+	    archive_expsym_cmds_F77='echo "{ global:" > $output_objdir/$libname.ver~
+	      cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	      echo "local: *; };" >> $output_objdir/$libname.ver~
+	      $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
+	  fi
+	  ;;
+	esac
+      else
+        ld_shlibs_F77=no
+      fi
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	archive_cmds_F77='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+	wlarc=
+      else
+	archive_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      fi
+      ;;
+
+    solaris*)
+      if $LD -v 2>&1 | $GREP 'BFD 2\.8' > /dev/null; then
+	ld_shlibs_F77=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+      elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	archive_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	ld_shlibs_F77=no
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*)
+      case `$LD -v 2>&1` in
+        *\ [01].* | *\ 2.[0-9].* | *\ 2.1[0-5].*)
+	ld_shlibs_F77=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 can not
+*** reliably create shared libraries on SCO systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.16.91.0.3 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+	;;
+	*)
+	  # For security reasons, it is highly recommended that you always
+	  # use absolute paths for naming shared libraries, and exclude the
+	  # DT_RUNPATH tag from executables and libraries.  But doing so
+	  # requires that you compile everything twice, which is a pain.
+	  if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	    hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+	    archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    archive_expsym_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	  else
+	    ld_shlibs_F77=no
+	  fi
+	;;
+      esac
+      ;;
+
+    sunos4*)
+      archive_cmds_F77='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      wlarc=
+      hardcode_direct_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    *)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	archive_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	archive_expsym_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	ld_shlibs_F77=no
+      fi
+      ;;
+    esac
+
+    if test "$ld_shlibs_F77" = no; then
+      runpath_var=
+      hardcode_libdir_flag_spec_F77=
+      export_dynamic_flag_spec_F77=
+      whole_archive_flag_spec_F77=
+    fi
+  else
+    # PORTME fill in a description of your system's linker (not GNU ld)
+    case $host_os in
+    aix3*)
+      allow_undefined_flag_F77=unsupported
+      always_export_symbols_F77=yes
+      archive_expsym_cmds_F77='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+      # Note: this linker hardcodes the directories in LIBPATH if there
+      # are no directories specified by -L.
+      hardcode_minus_L_F77=yes
+      if test "$GCC" = yes && test -z "$lt_prog_compiler_static"; then
+	# Neither direct hardcoding nor static linking is supported with a
+	# broken collect2.
+	hardcode_direct_F77=unsupported
+      fi
+      ;;
+
+    aix[4-9]*)
+      if test "$host_cpu" = ia64; then
+	# On IA64, the linker does run time linking by default, so we don't
+	# have to do anything special.
+	aix_use_runtimelinking=no
+	exp_sym_flag='-Bexport'
+	no_entry_flag=""
+      else
+	# If we're using GNU nm, then we don't want the "-C" option.
+	# -C means demangle to AIX nm, but means don't demangle with GNU nm
+	# Also, AIX nm treats weak defined symbols like other global
+	# defined symbols, whereas GNU nm marks them as "W".
+	if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
+	  export_symbols_cmds_F77='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	else
+	  export_symbols_cmds_F77='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	fi
+	aix_use_runtimelinking=no
+
+	# Test if we are trying to use run time linking or normal
+	# AIX style linking. If -brtl is somewhere in LDFLAGS, we
+	# need to do runtime linking.
+	case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*)
+	  for ld_flag in $LDFLAGS; do
+	  if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then
+	    aix_use_runtimelinking=yes
+	    break
+	  fi
+	  done
+	  ;;
+	esac
+
+	exp_sym_flag='-bexport'
+	no_entry_flag='-bnoentry'
+      fi
+
+      # When large executables or shared objects are built, AIX ld can
+      # have problems creating the table of contents.  If linking a library
+      # or program results in "error TOC overflow" add -mminimal-toc to
+      # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+      # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+      archive_cmds_F77=''
+      hardcode_direct_F77=yes
+      hardcode_direct_absolute_F77=yes
+      hardcode_libdir_separator_F77=':'
+      link_all_deplibs_F77=yes
+      file_list_spec_F77='${wl}-f,'
+
+      if test "$GCC" = yes; then
+	case $host_os in aix4.[012]|aix4.[012].*)
+	# We only want to do this on AIX 4.2 and lower, the check
+	# below for broken collect2 doesn't work under 4.3+
+	  collect2name=`${CC} -print-prog-name=collect2`
+	  if test -f "$collect2name" &&
+	   strings "$collect2name" | $GREP resolve_lib_name >/dev/null
+	  then
+	  # We have reworked collect2
+	  :
+	  else
+	  # We have old collect2
+	  hardcode_direct_F77=unsupported
+	  # It fails to find uninstalled libraries when the uninstalled
+	  # path is not listed in the libpath.  Setting hardcode_minus_L
+	  # to unsupported forces relinking
+	  hardcode_minus_L_F77=yes
+	  hardcode_libdir_flag_spec_F77='-L$libdir'
+	  hardcode_libdir_separator_F77=
+	  fi
+	  ;;
+	esac
+	shared_flag='-shared'
+	if test "$aix_use_runtimelinking" = yes; then
+	  shared_flag="$shared_flag "'${wl}-G'
+	fi
+	link_all_deplibs_F77=no
+      else
+	# not using gcc
+	if test "$host_cpu" = ia64; then
+	# VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
+	# chokes on -Wl,-G. The following line is correct:
+	  shared_flag='-G'
+	else
+	  if test "$aix_use_runtimelinking" = yes; then
+	    shared_flag='${wl}-G'
+	  else
+	    shared_flag='${wl}-bM:SRE'
+	  fi
+	fi
+      fi
+
+      export_dynamic_flag_spec_F77='${wl}-bexpall'
+      # It seems that -bexpall does not export symbols beginning with
+      # underscore (_), so it is better to generate a list of symbols to export.
+      always_export_symbols_F77=yes
+      if test "$aix_use_runtimelinking" = yes; then
+	# Warning - without using the other runtime loading flags (-brtl),
+	# -berok will link without error, but may produce a broken library.
+	allow_undefined_flag_F77='-berok'
+        # Determine the default libpath from the value encoded in an
+        # empty executable.
+        if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath__F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+if ac_fn_f77_try_link "$LINENO"; then :
+
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath__F77=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath__F77"; then
+    lt_cv_aix_libpath__F77=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath__F77"; then
+    lt_cv_aix_libpath__F77="/usr/lib:/lib"
+  fi
+
+fi
+
+  aix_libpath=$lt_cv_aix_libpath__F77
+fi
+
+        hardcode_libdir_flag_spec_F77='${wl}-blibpath:$libdir:'"$aix_libpath"
+        archive_expsym_cmds_F77='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+      else
+	if test "$host_cpu" = ia64; then
+	  hardcode_libdir_flag_spec_F77='${wl}-R $libdir:/usr/lib:/lib'
+	  allow_undefined_flag_F77="-z nodefs"
+	  archive_expsym_cmds_F77="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+	else
+	 # Determine the default libpath from the value encoded in an
+	 # empty executable.
+	 if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath__F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+if ac_fn_f77_try_link "$LINENO"; then :
+
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath__F77=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath__F77"; then
+    lt_cv_aix_libpath__F77=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath__F77"; then
+    lt_cv_aix_libpath__F77="/usr/lib:/lib"
+  fi
+
+fi
+
+  aix_libpath=$lt_cv_aix_libpath__F77
+fi
+
+	 hardcode_libdir_flag_spec_F77='${wl}-blibpath:$libdir:'"$aix_libpath"
+	  # Warning - without using the other run time loading flags,
+	  # -berok will link without error, but may produce a broken library.
+	  no_undefined_flag_F77=' ${wl}-bernotok'
+	  allow_undefined_flag_F77=' ${wl}-berok'
+	  if test "$with_gnu_ld" = yes; then
+	    # We only use this code for GNU lds that support --whole-archive.
+	    whole_archive_flag_spec_F77='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	  else
+	    # Exported symbols can be pulled into shared objects from archives
+	    whole_archive_flag_spec_F77='$convenience'
+	  fi
+	  archive_cmds_need_lc_F77=yes
+	  # This is similar to how AIX traditionally builds its shared libraries.
+	  archive_expsym_cmds_F77="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+	fi
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            archive_expsym_cmds_F77=''
+        ;;
+      m68k)
+            archive_cmds_F77='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            hardcode_libdir_flag_spec_F77='-L$libdir'
+            hardcode_minus_L_F77=yes
+        ;;
+      esac
+      ;;
+
+    bsdi[45]*)
+      export_dynamic_flag_spec_F77=-rdynamic
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # When not using gcc, we currently assume that we are using
+      # Microsoft Visual C++.
+      # hardcode_libdir_flag_spec is actually meaningless, as there is
+      # no search path for DLLs.
+      case $cc_basename in
+      cl*)
+	# Native MSVC
+	hardcode_libdir_flag_spec_F77=' '
+	allow_undefined_flag_F77=unsupported
+	always_export_symbols_F77=yes
+	file_list_spec_F77='@'
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	archive_cmds_F77='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
+	archive_expsym_cmds_F77='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	    sed -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
+	  else
+	    sed -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
+	  fi~
+	  $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+	  linknames='
+	# The linker will not automatically build a static lib if we build a DLL.
+	# _LT_TAGVAR(old_archive_from_new_cmds, F77)='true'
+	enable_shared_with_static_runtimes_F77=yes
+	exclude_expsyms_F77='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
+	export_symbols_cmds_F77='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1,DATA/'\'' | $SED -e '\''/^[AITW][ ]/s/.*[ ]//'\'' | sort | uniq > $export_symbols'
+	# Don't use ranlib
+	old_postinstall_cmds_F77='chmod 644 $oldlib'
+	postlink_cmds_F77='lt_outputfile="@OUTPUT@"~
+	  lt_tool_outputfile="@TOOL_OUTPUT@"~
+	  case $lt_outputfile in
+	    *.exe|*.EXE) ;;
+	    *)
+	      lt_outputfile="$lt_outputfile.exe"
+	      lt_tool_outputfile="$lt_tool_outputfile.exe"
+	      ;;
+	  esac~
+	  if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
+	    $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+	    $RM "$lt_outputfile.manifest";
+	  fi'
+	;;
+      *)
+	# Assume MSVC wrapper
+	hardcode_libdir_flag_spec_F77=' '
+	allow_undefined_flag_F77=unsupported
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	archive_cmds_F77='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames='
+	# The linker will automatically build a .lib file if we build a DLL.
+	old_archive_from_new_cmds_F77='true'
+	# FIXME: Should let the user specify the lib program.
+	old_archive_cmds_F77='lib -OUT:$oldlib$oldobjs$old_deplibs'
+	enable_shared_with_static_runtimes_F77=yes
+	;;
+      esac
+      ;;
+
+    darwin* | rhapsody*)
+
+
+  archive_cmds_need_lc_F77=no
+  hardcode_direct_F77=no
+  hardcode_automatic_F77=yes
+  hardcode_shlibpath_var_F77=unsupported
+  if test "$lt_cv_ld_force_load" = "yes"; then
+    whole_archive_flag_spec_F77='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
+    compiler_needs_object_F77=yes
+  else
+    whole_archive_flag_spec_F77=''
+  fi
+  link_all_deplibs_F77=yes
+  allow_undefined_flag_F77="$_lt_dar_allow_undefined"
+  case $cc_basename in
+     ifort*) _lt_dar_can_shared=yes ;;
+     *) _lt_dar_can_shared=$GCC ;;
+  esac
+  if test "$_lt_dar_can_shared" = "yes"; then
+    output_verbose_link_cmd=func_echo_all
+    archive_cmds_F77="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
+    module_cmds_F77="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
+    archive_expsym_cmds_F77="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
+    module_expsym_cmds_F77="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
+
+  else
+  ld_shlibs_F77=no
+  fi
+
+      ;;
+
+    dgux*)
+      archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_libdir_flag_spec_F77='-L$libdir'
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+    # support.  Future versions do this automatically, but an explicit c++rt0.o
+    # does not break anything, and helps significantly (at the cost of a little
+    # extra space).
+    freebsd2.2*)
+      archive_cmds_F77='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+      hardcode_libdir_flag_spec_F77='-R$libdir'
+      hardcode_direct_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+    freebsd2.*)
+      archive_cmds_F77='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_direct_F77=yes
+      hardcode_minus_L_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+    freebsd* | dragonfly*)
+      archive_cmds_F77='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+      hardcode_libdir_flag_spec_F77='-R$libdir'
+      hardcode_direct_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    hpux9*)
+      if test "$GCC" = yes; then
+	archive_cmds_F77='$RM $output_objdir/$soname~$CC -shared $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      else
+	archive_cmds_F77='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      fi
+      hardcode_libdir_flag_spec_F77='${wl}+b ${wl}$libdir'
+      hardcode_libdir_separator_F77=:
+      hardcode_direct_F77=yes
+
+      # hardcode_minus_L: Not really in the search PATH,
+      # but as the default location of the library.
+      hardcode_minus_L_F77=yes
+      export_dynamic_flag_spec_F77='${wl}-E'
+      ;;
+
+    hpux10*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	archive_cmds_F77='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds_F77='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      if test "$with_gnu_ld" = no; then
+	hardcode_libdir_flag_spec_F77='${wl}+b ${wl}$libdir'
+	hardcode_libdir_separator_F77=:
+	hardcode_direct_F77=yes
+	hardcode_direct_absolute_F77=yes
+	export_dynamic_flag_spec_F77='${wl}-E'
+	# hardcode_minus_L: Not really in the search PATH,
+	# but as the default location of the library.
+	hardcode_minus_L_F77=yes
+      fi
+      ;;
+
+    hpux11*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	case $host_cpu in
+	hppa*64*)
+	  archive_cmds_F77='$CC -shared ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  archive_cmds_F77='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	  archive_cmds_F77='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	esac
+      else
+	case $host_cpu in
+	hppa*64*)
+	  archive_cmds_F77='$CC -b ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  archive_cmds_F77='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	archive_cmds_F77='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	esac
+      fi
+      if test "$with_gnu_ld" = no; then
+	hardcode_libdir_flag_spec_F77='${wl}+b ${wl}$libdir'
+	hardcode_libdir_separator_F77=:
+
+	case $host_cpu in
+	hppa*64*|ia64*)
+	  hardcode_direct_F77=no
+	  hardcode_shlibpath_var_F77=no
+	  ;;
+	*)
+	  hardcode_direct_F77=yes
+	  hardcode_direct_absolute_F77=yes
+	  export_dynamic_flag_spec_F77='${wl}-E'
+
+	  # hardcode_minus_L: Not really in the search PATH,
+	  # but as the default location of the library.
+	  hardcode_minus_L_F77=yes
+	  ;;
+	esac
+      fi
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      if test "$GCC" = yes; then
+	archive_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	# Try to use the -exported_symbol ld option, if it does not
+	# work, assume that -exports_file does not work either and
+	# implicitly export all symbols.
+	# This should be the same for all languages, so no per-tag cache variable.
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $host_os linker accepts -exported_symbol" >&5
+$as_echo_n "checking whether the $host_os linker accepts -exported_symbol... " >&6; }
+if ${lt_cv_irix_exported_symbol+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  save_LDFLAGS="$LDFLAGS"
+	   LDFLAGS="$LDFLAGS -shared ${wl}-exported_symbol ${wl}foo ${wl}-update_registry ${wl}/dev/null"
+	   cat > conftest.$ac_ext <<_ACEOF
+
+      subroutine foo
+      end
+_ACEOF
+if ac_fn_f77_try_link "$LINENO"; then :
+  lt_cv_irix_exported_symbol=yes
+else
+  lt_cv_irix_exported_symbol=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+           LDFLAGS="$save_LDFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_irix_exported_symbol" >&5
+$as_echo "$lt_cv_irix_exported_symbol" >&6; }
+	if test "$lt_cv_irix_exported_symbol" = yes; then
+          archive_expsym_cmds_F77='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations ${wl}-exports_file ${wl}$export_symbols -o $lib'
+	fi
+      else
+	archive_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	archive_expsym_cmds_F77='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -exports_file $export_symbols -o $lib'
+      fi
+      archive_cmds_need_lc_F77='no'
+      hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator_F77=:
+      inherit_rpath_F77=yes
+      link_all_deplibs_F77=yes
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	archive_cmds_F77='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+      else
+	archive_cmds_F77='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+      fi
+      hardcode_libdir_flag_spec_F77='-R$libdir'
+      hardcode_direct_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    newsos6)
+      archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_direct_F77=yes
+      hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator_F77=:
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    *nto* | *qnx*)
+      ;;
+
+    openbsd*)
+      if test -f /usr/libexec/ld.so; then
+	hardcode_direct_F77=yes
+	hardcode_shlibpath_var_F77=no
+	hardcode_direct_absolute_F77=yes
+	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	  archive_cmds_F77='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds_F77='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags ${wl}-retain-symbols-file,$export_symbols'
+	  hardcode_libdir_flag_spec_F77='${wl}-rpath,$libdir'
+	  export_dynamic_flag_spec_F77='${wl}-E'
+	else
+	  case $host_os in
+	   openbsd[01].* | openbsd2.[0-7] | openbsd2.[0-7].*)
+	     archive_cmds_F77='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+	     hardcode_libdir_flag_spec_F77='-R$libdir'
+	     ;;
+	   *)
+	     archive_cmds_F77='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	     hardcode_libdir_flag_spec_F77='${wl}-rpath,$libdir'
+	     ;;
+	  esac
+	fi
+      else
+	ld_shlibs_F77=no
+      fi
+      ;;
+
+    os2*)
+      hardcode_libdir_flag_spec_F77='-L$libdir'
+      hardcode_minus_L_F77=yes
+      allow_undefined_flag_F77=unsupported
+      archive_cmds_F77='$ECHO "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~echo DATA >> $output_objdir/$libname.def~echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+      old_archive_from_new_cmds_F77='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+      ;;
+
+    osf3*)
+      if test "$GCC" = yes; then
+	allow_undefined_flag_F77=' ${wl}-expect_unresolved ${wl}\*'
+	archive_cmds_F77='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      else
+	allow_undefined_flag_F77=' -expect_unresolved \*'
+	archive_cmds_F77='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+      fi
+      archive_cmds_need_lc_F77='no'
+      hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+      hardcode_libdir_separator_F77=:
+      ;;
+
+    osf4* | osf5*)	# as osf3* with the addition of -msym flag
+      if test "$GCC" = yes; then
+	allow_undefined_flag_F77=' ${wl}-expect_unresolved ${wl}\*'
+	archive_cmds_F77='$CC -shared${allow_undefined_flag} $pic_flag $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	hardcode_libdir_flag_spec_F77='${wl}-rpath ${wl}$libdir'
+      else
+	allow_undefined_flag_F77=' -expect_unresolved \*'
+	archive_cmds_F77='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	archive_expsym_cmds_F77='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~
+	$CC -shared${allow_undefined_flag} ${wl}-input ${wl}$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~$RM $lib.exp'
+
+	# Both c and cxx compiler support -rpath directly
+	hardcode_libdir_flag_spec_F77='-rpath $libdir'
+      fi
+      archive_cmds_need_lc_F77='no'
+      hardcode_libdir_separator_F77=:
+      ;;
+
+    solaris*)
+      no_undefined_flag_F77=' -z defs'
+      if test "$GCC" = yes; then
+	wlarc='${wl}'
+	archive_cmds_F77='$CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds_F77='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-M ${wl}$lib.exp ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+      else
+	case `$CC -V 2>&1` in
+	*"Compilers 5.0"*)
+	  wlarc=''
+	  archive_cmds_F77='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  archive_expsym_cmds_F77='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
+	  ;;
+	*)
+	  wlarc='${wl}'
+	  archive_cmds_F77='$CC -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds_F77='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+	  ;;
+	esac
+      fi
+      hardcode_libdir_flag_spec_F77='-R$libdir'
+      hardcode_shlibpath_var_F77=no
+      case $host_os in
+      solaris2.[0-5] | solaris2.[0-5].*) ;;
+      *)
+	# The compiler driver will combine and reorder linker options,
+	# but understands `-z linker_flag'.  GCC discards it without `$wl',
+	# but is careful enough not to reorder.
+	# Supported since Solaris 2.6 (maybe 2.5.1?)
+	if test "$GCC" = yes; then
+	  whole_archive_flag_spec_F77='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+	else
+	  whole_archive_flag_spec_F77='-z allextract$convenience -z defaultextract'
+	fi
+	;;
+      esac
+      link_all_deplibs_F77=yes
+      ;;
+
+    sunos4*)
+      if test "x$host_vendor" = xsequent; then
+	# Use $CC to link under sequent, because it throws in some extra .o
+	# files that make .init and .fini sections work.
+	archive_cmds_F77='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds_F77='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      hardcode_libdir_flag_spec_F77='-L$libdir'
+      hardcode_direct_F77=yes
+      hardcode_minus_L_F77=yes
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    sysv4)
+      case $host_vendor in
+	sni)
+	  archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  hardcode_direct_F77=yes # is this really true???
+	;;
+	siemens)
+	  ## LD is ld it makes a PLAMLIB
+	  ## CC just makes a GrossModule.
+	  archive_cmds_F77='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+	  reload_cmds_F77='$CC -r -o $output$reload_objs'
+	  hardcode_direct_F77=no
+        ;;
+	motorola)
+	  archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  hardcode_direct_F77=no #Motorola manual says yes, but my tests say they lie
+	;;
+      esac
+      runpath_var='LD_RUN_PATH'
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    sysv4.3*)
+      archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var_F77=no
+      export_dynamic_flag_spec_F77='-Bexport'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	hardcode_shlibpath_var_F77=no
+	runpath_var=LD_RUN_PATH
+	hardcode_runpath_var=yes
+	ld_shlibs_F77=yes
+      fi
+      ;;
+
+    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*)
+      no_undefined_flag_F77='${wl}-z,text'
+      archive_cmds_need_lc_F77=no
+      hardcode_shlibpath_var_F77=no
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	archive_cmds_F77='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds_F77='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds_F77='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds_F77='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6*)
+      # Note: We can NOT use -z defs as we might desire, because we do not
+      # link with -lc, and that would cause any symbols used from libc to
+      # always be unresolved, which means just about no library would
+      # ever link correctly.  If we're not using GNU ld we use -z text
+      # though, which does catch some bad symbols but isn't as heavy-handed
+      # as -z defs.
+      no_undefined_flag_F77='${wl}-z,text'
+      allow_undefined_flag_F77='${wl}-z,nodefs'
+      archive_cmds_need_lc_F77=no
+      hardcode_shlibpath_var_F77=no
+      hardcode_libdir_flag_spec_F77='${wl}-R,$libdir'
+      hardcode_libdir_separator_F77=':'
+      link_all_deplibs_F77=yes
+      export_dynamic_flag_spec_F77='${wl}-Bexport'
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	archive_cmds_F77='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds_F77='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	archive_cmds_F77='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	archive_expsym_cmds_F77='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    uts4*)
+      archive_cmds_F77='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_libdir_flag_spec_F77='-L$libdir'
+      hardcode_shlibpath_var_F77=no
+      ;;
+
+    *)
+      ld_shlibs_F77=no
+      ;;
+    esac
+
+    if test x$host_vendor = xsni; then
+      case $host in
+      sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+	export_dynamic_flag_spec_F77='${wl}-Blargedynsym'
+	;;
+      esac
+    fi
+  fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_F77" >&5
+$as_echo "$ld_shlibs_F77" >&6; }
+test "$ld_shlibs_F77" = no && can_build_shared=no
+
+with_gnu_ld_F77=$with_gnu_ld
+
+
+
+
+
+
+#
+# Do we need to explicitly link libc?
+#
+case "x$archive_cmds_need_lc_F77" in
+x|xyes)
+  # Assume -lc should be added
+  archive_cmds_need_lc_F77=yes
+
+  if test "$enable_shared" = yes && test "$GCC" = yes; then
+    case $archive_cmds_F77 in
+    *'~'*)
+      # FIXME: we may have to deal with multi-command sequences.
+      ;;
+    '$CC '*)
+      # Test whether the compiler implicitly links with -lc since on some
+      # systems, -lgcc has to come before -lc. If gcc already passes -lc
+      # to ld, don't add -lc before -lgcc.
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -lc should be explicitly linked in" >&5
+$as_echo_n "checking whether -lc should be explicitly linked in... " >&6; }
+if ${lt_cv_archive_cmds_need_lc_F77+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  $RM conftest*
+	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+	if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } 2>conftest.err; then
+	  soname=conftest
+	  lib=conftest
+	  libobjs=conftest.$ac_objext
+	  deplibs=
+	  wl=$lt_prog_compiler_wl_F77
+	  pic_flag=$lt_prog_compiler_pic_F77
+	  compiler_flags=-v
+	  linker_flags=-v
+	  verstring=
+	  output_objdir=.
+	  libname=conftest
+	  lt_save_allow_undefined_flag=$allow_undefined_flag_F77
+	  allow_undefined_flag_F77=
+	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$archive_cmds_F77 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1\""; } >&5
+  (eval $archive_cmds_F77 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+	  then
+	    lt_cv_archive_cmds_need_lc_F77=no
+	  else
+	    lt_cv_archive_cmds_need_lc_F77=yes
+	  fi
+	  allow_undefined_flag_F77=$lt_save_allow_undefined_flag
+	else
+	  cat conftest.err 1>&5
+	fi
+	$RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_archive_cmds_need_lc_F77" >&5
+$as_echo "$lt_cv_archive_cmds_need_lc_F77" >&6; }
+      archive_cmds_need_lc_F77=$lt_cv_archive_cmds_need_lc_F77
+      ;;
+    esac
+  fi
+  ;;
+esac
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking dynamic linker characteristics" >&5
+$as_echo_n "checking dynamic linker characteristics... " >&6; }
+
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+shrext_cmds=".so"
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+need_lib_prefix=unknown
+hardcode_into_libs=no
+
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+need_version=unknown
+
+case $host_os in
+aix3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX 3 has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}${shared_ext}$major'
+  ;;
+
+aix[4-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[01] | aix4.[01].*)
+      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	   echo ' yes '
+	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
+	:
+      else
+	can_build_shared=no
+      fi
+      ;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # soname into executable. Probably we can add versioning support to
+    # collect2, so additional links can be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}${shared_ext}$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  ;;
+
+amigaos*)
+  case $host_cpu in
+  powerpc)
+    # Since July 2007 AmigaOS4 officially supports .so libraries.
+    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    ;;
+  m68k)
+    library_names_spec='$libname.ixlibrary $libname.a'
+    # Create ${libname}_ixlibrary.a entries in /sys/libs.
+    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
+    ;;
+  esac
+  ;;
+
+beos*)
+  library_names_spec='${libname}${shared_ext}'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi[45]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32* | cegcc*)
+  version_type=windows
+  shrext_cmds=".dll"
+  need_version=no
+  need_lib_prefix=no
+
+  case $GCC,$cc_basename in
+  yes,*)
+    # gcc
+    library_names_spec='$libname.dll.a'
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname~
+      chmod a+x \$dldir/$dlname~
+      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
+        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
+      fi'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+
+    case $host_os in
+    cygwin*)
+      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
+      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+
+      ;;
+    mingw* | cegcc*)
+      # MinGW DLLs use traditional 'lib' prefix
+      soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    pw32*)
+      # pw32 DLLs use 'pw' prefix rather than 'lib'
+      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    esac
+    dynamic_linker='Win32 ld.exe'
+    ;;
+
+  *,cl*)
+    # Native MSVC
+    libname_spec='$name'
+    soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+    library_names_spec='${libname}.dll.lib'
+
+    case $build_os in
+    mingw*)
+      sys_lib_search_path_spec=
+      lt_save_ifs=$IFS
+      IFS=';'
+      for lt_path in $LIB
+      do
+        IFS=$lt_save_ifs
+        # Let DOS variable expansion print the short 8.3 style file name.
+        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
+        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
+      done
+      IFS=$lt_save_ifs
+      # Convert to MSYS style.
+      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([a-zA-Z]\\):| /\\1|g' -e 's|^ ||'`
+      ;;
+    cygwin*)
+      # Convert to unix form, then to dos form, then back to unix form
+      # but this time dos style (no spaces!) so that the unix form looks
+      # like /cygdrive/c/PROGRA~1:/cygdr...
+      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
+      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
+      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      ;;
+    *)
+      sys_lib_search_path_spec="$LIB"
+      if $ECHO "$sys_lib_search_path_spec" | $GREP ';[c-zC-Z]:/' >/dev/null; then
+        # It is most probably a Windows format PATH.
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
+      else
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      fi
+      # FIXME: find the short name or the path components, as spaces are
+      # common. (e.g. "Program Files" -> "PROGRA~1")
+      ;;
+    esac
+
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+    dynamic_linker='Win32 link.exe'
+    ;;
+
+  *)
+    # Assume MSVC wrapper
+    library_names_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext} $libname.lib'
+    dynamic_linker='Win32 ld.exe'
+    ;;
+  esac
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
+  soname_spec='${libname}${release}${major}$shared_ext'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
+
+  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
+  ;;
+
+dgux*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+freebsd* | dragonfly*)
+  # DragonFly does not have aout.  When/if they implement a new
+  # versioning mechanism, adjust this.
+  if test -x /usr/bin/objformat; then
+    objformat=`/usr/bin/objformat`
+  else
+    case $host_os in
+    freebsd[23].*) objformat=aout ;;
+    *) objformat=elf ;;
+    esac
+  fi
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2.*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  freebsd3.[01]* | freebsdelf3.[01]*)
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  freebsd3.[2-9]* | freebsdelf3.[2-9]* | \
+  freebsd4.[0-5] | freebsdelf4.[0-5] | freebsd4.1.1 | freebsdelf4.1.1)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  *) # from 4.6 on, and DragonFly
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+haiku*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  dynamic_linker="$host_os runtime_loader"
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  case $host_cpu in
+  ia64*)
+    shrext_cmds='.so'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.so"
+    shlibpath_var=LD_LIBRARY_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    if test "X$HPUX_IA64_MODE" = X32; then
+      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
+    else
+      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
+    fi
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  hppa*64*)
+    shrext_cmds='.sl'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  *)
+    shrext_cmds='.sl'
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=SHLIB_PATH
+    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    ;;
+  esac
+  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
+  postinstall_cmds='chmod 555 $lib'
+  # or fails outright, so override atomically:
+  install_override_mode=555
+  ;;
+
+interix[3-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)
+	if test "$lt_cv_prog_gnu_ld" = yes; then
+		version_type=linux # correct to gnu/linux during the next big refactor
+	else
+		version_type=irix
+	fi ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
+      libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
+      libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
+      libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  hardcode_into_libs=yes
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux*oldld* | linux*aout* | linux*coff*)
+  dynamic_linker=no
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+
+  # Some binutils ld are patched to set DT_RUNPATH
+  if ${lt_cv_shlibpath_overrides_runpath+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_shlibpath_overrides_runpath=no
+    save_LDFLAGS=$LDFLAGS
+    save_libdir=$libdir
+    eval "libdir=/foo; wl=\"$lt_prog_compiler_wl_F77\"; \
+	 LDFLAGS=\"\$LDFLAGS $hardcode_libdir_flag_spec_F77\""
+    cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+if ac_fn_f77_try_link "$LINENO"; then :
+  if  ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null; then :
+  lt_cv_shlibpath_overrides_runpath=yes
+fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+    LDFLAGS=$save_LDFLAGS
+    libdir=$save_libdir
+
+fi
+
+  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
+
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # Append ld.so.conf contents to the search path
+  if test -f /etc/ld.so.conf; then
+    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \$2)); skip = 1; } { if (!skip) print \$0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
+    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
+  fi
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+  ;;
+
+netbsdelf*-gnu)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='NetBSD ld.elf_so'
+  ;;
+
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+*nto* | *qnx*)
+  version_type=qnx
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='ldqnx.so'
+  ;;
+
+openbsd*)
+  version_type=sunos
+  sys_lib_dlsearch_path_spec="/usr/lib"
+  need_lib_prefix=no
+  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
+  case $host_os in
+    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
+    *)				need_version=no  ;;
+  esac
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case $host_os in
+      openbsd2.[89] | openbsd2.[89].*)
+	shlibpath_overrides_runpath=no
+	;;
+      *)
+	shlibpath_overrides_runpath=yes
+	;;
+      esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  ;;
+
+os2*)
+  libname_spec='$name'
+  shrext_cmds=".dll"
+  need_lib_prefix=no
+  library_names_spec='$libname${shared_ext} $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  ;;
+
+rdos*)
+  dynamic_linker=no
+  ;;
+
+solaris*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux # correct to gnu/linux during the next big refactor
+    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
+    soname_spec='$libname${shared_ext}.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  version_type=freebsd-elf
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  if test "$with_gnu_ld" = yes; then
+    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
+  else
+    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
+    case $host_os in
+      sco3.2v5*)
+        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
+	;;
+    esac
+  fi
+  sys_lib_dlsearch_path_spec='/usr/lib'
+  ;;
+
+tpf*)
+  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+uts4*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $dynamic_linker" >&5
+$as_echo "$dynamic_linker" >&6; }
+test "$dynamic_linker" = no && can_build_shared=no
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$GCC" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
+  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
+fi
+if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
+  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to hardcode library paths into programs" >&5
+$as_echo_n "checking how to hardcode library paths into programs... " >&6; }
+hardcode_action_F77=
+if test -n "$hardcode_libdir_flag_spec_F77" ||
+   test -n "$runpath_var_F77" ||
+   test "X$hardcode_automatic_F77" = "Xyes" ; then
+
+  # We can hardcode non-existent directories.
+  if test "$hardcode_direct_F77" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, F77)" != no &&
+     test "$hardcode_minus_L_F77" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action_F77=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    hardcode_action_F77=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action_F77=unsupported
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action_F77" >&5
+$as_echo "$hardcode_action_F77" >&6; }
+
+if test "$hardcode_action_F77" = relink ||
+   test "$inherit_rpath_F77" = yes; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
+
+
+
+
+
+
+
+  fi # test -n "$compiler"
+
+  GCC=$lt_save_GCC
+  CC="$lt_save_CC"
+  CFLAGS="$lt_save_CFLAGS"
+fi # test "$_lt_disable_F77" != yes
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+        if test -z "$F77"; then
+                enable_fortran=no
+                { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: *** Couldn't find f77 compiler; using default Fortran wrappers." >&5
+$as_echo "$as_me: WARNING: *** Couldn't find f77 compiler; using default Fortran wrappers." >&2;}
+	else
+		ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $F77" >&5
+$as_echo_n "checking how to get verbose linking output from $F77... " >&6; }
+if ${ac_cv_prog_f77_v+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+if ac_fn_f77_try_compile "$LINENO"; then :
+  ac_cv_prog_f77_v=
+# Try some options frequently used verbose output
+for ac_verb in -v -verbose --verbose -V -\#\#\#; do
+  cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+
+# Compile and link our simple test program by passing a flag (argument
+# 1 to this macro) to the Fortran compiler in order to get
+# "verbose" output that we can then parse for the Fortran linker
+# flags.
+ac_save_FFLAGS=$FFLAGS
+FFLAGS="$FFLAGS $ac_verb"
+eval "set x $ac_link"
+shift
+$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5
+# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH,
+# LIBRARY_PATH; skip all such settings.
+ac_f77_v_output=`eval $ac_link 5>&1 2>&1 |
+  sed '/^Driving:/d; /^Configured with:/d;
+      '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"`
+$as_echo "$ac_f77_v_output" >&5
+FFLAGS=$ac_save_FFLAGS
+
+rm -rf conftest*
+
+# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where
+# /foo, /bar, and /baz are search directories for the Fortran linker.
+# Here, we change these into -L/foo -L/bar -L/baz (and put it first):
+ac_f77_v_output="`echo $ac_f77_v_output |
+	grep 'LPATH is:' |
+	sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output"
+
+# FIXME: we keep getting bitten by quoted arguments; a more general fix
+#        that detects unbalanced quotes in FLIBS should be implemented
+#        and (ugh) tested at some point.
+case $ac_f77_v_output in
+  # With xlf replace commas with spaces,
+  # and remove "-link" and closing parenthesis.
+  *xlfentry*)
+    ac_f77_v_output=`echo $ac_f77_v_output |
+      sed '
+        s/,/ /g
+        s/ -link / /g
+        s/) *$//
+      '
+    ` ;;
+
+  # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted
+  # $LIBS confuse us, and the libraries appear later in the output anyway).
+  *mGLOB_options_string*)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;;
+
+  # Portland Group compiler has singly- or doubly-quoted -cmdline argument
+  # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4.
+  # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2".
+  *-cmdline\ * | *-ignore\ * | *-def\ *)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed "\
+	s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
+	s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
+	s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
+
+  # If we are using fort77 (the f2c wrapper) then filter output and delete quotes.
+  *fort77*f2c*gcc*)
+    ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n '
+        /:[	 ]\+Running[	 ]\{1,\}"gcc"/{
+          /"-c"/d
+          /[.]c"*/d
+          s/^.*"gcc"/"gcc"/
+          s/"//gp
+        }'` ;;
+
+  # If we are using Cray Fortran then delete quotes.
+  *cft90*)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;;
+esac
+
+
+  # look for -l* and *.a constructs in the output
+  for ac_arg in $ac_f77_v_output; do
+     case $ac_arg in
+	[\\/]*.a | ?:[\\/]*.a | -[lLRu]*)
+	  ac_cv_prog_f77_v=$ac_verb
+	  break 2 ;;
+     esac
+  done
+done
+if test -z "$ac_cv_prog_f77_v"; then
+   { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $F77" >&5
+$as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $F77" >&2;}
+fi
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5
+$as_echo "$as_me: WARNING: compilation failed" >&2;}
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_v" >&5
+$as_echo "$ac_cv_prog_f77_v" >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 libraries of $F77" >&5
+$as_echo_n "checking for Fortran 77 libraries of $F77... " >&6; }
+if ${ac_cv_f77_libs+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test "x$FLIBS" != "x"; then
+  ac_cv_f77_libs="$FLIBS" # Let the user override the test.
+else
+
+cat > conftest.$ac_ext <<_ACEOF
+      program main
+
+      end
+_ACEOF
+
+# Compile and link our simple test program by passing a flag (argument
+# 1 to this macro) to the Fortran compiler in order to get
+# "verbose" output that we can then parse for the Fortran linker
+# flags.
+ac_save_FFLAGS=$FFLAGS
+FFLAGS="$FFLAGS $ac_cv_prog_f77_v"
+eval "set x $ac_link"
+shift
+$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5
+# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH,
+# LIBRARY_PATH; skip all such settings.
+ac_f77_v_output=`eval $ac_link 5>&1 2>&1 |
+  sed '/^Driving:/d; /^Configured with:/d;
+      '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"`
+$as_echo "$ac_f77_v_output" >&5
+FFLAGS=$ac_save_FFLAGS
+
+rm -rf conftest*
+
+# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where
+# /foo, /bar, and /baz are search directories for the Fortran linker.
+# Here, we change these into -L/foo -L/bar -L/baz (and put it first):
+ac_f77_v_output="`echo $ac_f77_v_output |
+	grep 'LPATH is:' |
+	sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output"
+
+# FIXME: we keep getting bitten by quoted arguments; a more general fix
+#        that detects unbalanced quotes in FLIBS should be implemented
+#        and (ugh) tested at some point.
+case $ac_f77_v_output in
+  # With xlf replace commas with spaces,
+  # and remove "-link" and closing parenthesis.
+  *xlfentry*)
+    ac_f77_v_output=`echo $ac_f77_v_output |
+      sed '
+        s/,/ /g
+        s/ -link / /g
+        s/) *$//
+      '
+    ` ;;
+
+  # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted
+  # $LIBS confuse us, and the libraries appear later in the output anyway).
+  *mGLOB_options_string*)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;;
+
+  # Portland Group compiler has singly- or doubly-quoted -cmdline argument
+  # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4.
+  # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2".
+  *-cmdline\ * | *-ignore\ * | *-def\ *)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed "\
+	s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
+	s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
+	s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
+
+  # If we are using fort77 (the f2c wrapper) then filter output and delete quotes.
+  *fort77*f2c*gcc*)
+    ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n '
+        /:[	 ]\+Running[	 ]\{1,\}"gcc"/{
+          /"-c"/d
+          /[.]c"*/d
+          s/^.*"gcc"/"gcc"/
+          s/"//gp
+        }'` ;;
+
+  # If we are using Cray Fortran then delete quotes.
+  *cft90*)
+    ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;;
+esac
+
+
+
+ac_cv_f77_libs=
+
+# Save positional arguments (if any)
+ac_save_positional="$@"
+
+set X $ac_f77_v_output
+while test $# != 1; do
+  shift
+  ac_arg=$1
+  case $ac_arg in
+	[\\/]*.a | ?:[\\/]*.a)
+	    ac_exists=false
+  for ac_i in $ac_cv_f77_libs; do
+    if test x"$ac_arg" = x"$ac_i"; then
+      ac_exists=true
+      break
+    fi
+  done
+
+  if test x"$ac_exists" = xtrue; then :
+
+else
+  ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg"
+fi
+	  ;;
+	-bI:*)
+	    ac_exists=false
+  for ac_i in $ac_cv_f77_libs; do
+    if test x"$ac_arg" = x"$ac_i"; then
+      ac_exists=true
+      break
+    fi
+  done
+
+  if test x"$ac_exists" = xtrue; then :
+
+else
+  if test "$ac_compiler_gnu" = yes; then
+  for ac_link_opt in $ac_arg; do
+    ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt"
+  done
+else
+  ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg"
+fi
+fi
+	  ;;
+	  # Ignore these flags.
+	-lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \
+	  |-LANG:=* | -LIST:* | -LNO:* | -link)
+	  ;;
+	-lkernel32)
+	  case $host_os in
+	  *cygwin*) ;;
+	  *) ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg"
+	    ;;
+	  esac
+	  ;;
+	-[LRuYz])
+	  # These flags, when seen by themselves, take an argument.
+	  # We remove the space between option and argument and re-iterate
+	  # unless we find an empty arg or a new option (starting with -)
+	  case $2 in
+	     "" | -*);;
+	     *)
+		ac_arg="$ac_arg$2"
+		shift; shift
+		set X $ac_arg "$@"
+		;;
+	  esac
+	  ;;
+	-YP,*)
+	  for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do
+	      ac_exists=false
+  for ac_i in $ac_cv_f77_libs; do
+    if test x"$ac_j" = x"$ac_i"; then
+      ac_exists=true
+      break
+    fi
+  done
+
+  if test x"$ac_exists" = xtrue; then :
+
+else
+  ac_arg="$ac_arg $ac_j"
+			       ac_cv_f77_libs="$ac_cv_f77_libs $ac_j"
+fi
+	  done
+	  ;;
+	-[lLR]*)
+	    ac_exists=false
+  for ac_i in $ac_cv_f77_libs; do
+    if test x"$ac_arg" = x"$ac_i"; then
+      ac_exists=true
+      break
+    fi
+  done
+
+  if test x"$ac_exists" = xtrue; then :
+
+else
+  ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg"
+fi
+	  ;;
+	-zallextract*| -zdefaultextract)
+	  ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg"
+	  ;;
+	  # Ignore everything else.
+  esac
+done
+# restore positional arguments
+set X $ac_save_positional; shift
+
+# We only consider "LD_RUN_PATH" on Solaris systems.  If this is seen,
+# then we insist that the "run path" must be an absolute path (i.e. it
+# must begin with a "/").
+case `(uname -sr) 2>/dev/null` in
+   "SunOS 5"*)
+      ac_ld_run_path=`$as_echo "$ac_f77_v_output" |
+			sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'`
+      test "x$ac_ld_run_path" != x &&
+	if test "$ac_compiler_gnu" = yes; then
+  for ac_link_opt in $ac_ld_run_path; do
+    ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt"
+  done
+else
+  ac_cv_f77_libs="$ac_cv_f77_libs $ac_ld_run_path"
+fi
+      ;;
+esac
+fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x"
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_libs" >&5
+$as_echo "$ac_cv_f77_libs" >&6; }
+FLIBS="$ac_cv_f77_libs"
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dummy main to link with Fortran 77 libraries" >&5
+$as_echo_n "checking for dummy main to link with Fortran 77 libraries... " >&6; }
+if ${ac_cv_f77_dummy_main+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_f77_dm_save_LIBS=$LIBS
+ LIBS="$LIBS $FLIBS"
+ ac_fortran_dm_var=F77_DUMMY_MAIN
+ ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+ # First, try linking without a dummy main:
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_fortran_dummy_main=none
+else
+  ac_cv_fortran_dummy_main=unknown
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+ if test $ac_cv_fortran_dummy_main = unknown; then
+   for ac_func in MAIN__ MAIN_ __main MAIN _MAIN __MAIN main_ main__ _main; do
+     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#define $ac_fortran_dm_var $ac_func
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_fortran_dummy_main=$ac_func; break
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+   done
+ fi
+ ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+ ac_cv_f77_dummy_main=$ac_cv_fortran_dummy_main
+ rm -rf conftest*
+ LIBS=$ac_f77_dm_save_LIBS
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_dummy_main" >&5
+$as_echo "$ac_cv_f77_dummy_main" >&6; }
+F77_DUMMY_MAIN=$ac_cv_f77_dummy_main
+if test "$F77_DUMMY_MAIN" != unknown; then :
+  if test $F77_DUMMY_MAIN != none; then
+
+cat >>confdefs.h <<_ACEOF
+#define F77_DUMMY_MAIN $F77_DUMMY_MAIN
+_ACEOF
+
+  if test "x$ac_cv_fc_dummy_main" = "x$ac_cv_f77_dummy_main"; then
+
+$as_echo "#define FC_DUMMY_MAIN_EQ_F77 1" >>confdefs.h
+
+  fi
+fi
+else
+  enable_fortran=no
+			{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: *** Couldn't figure out how to link C and Fortran; using default Fortran wrappers." >&5
+$as_echo "$as_me: WARNING: *** Couldn't figure out how to link C and Fortran; using default Fortran wrappers." >&2;}
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+        fi
+else
+
+$as_echo "#define DISABLE_FORTRAN 1" >>confdefs.h
+
+fi
+
+if test "x$enable_fortran" = xyes; then
+        ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 name-mangling scheme" >&5
+$as_echo_n "checking for Fortran 77 name-mangling scheme... " >&6; }
+if ${ac_cv_f77_mangling+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.$ac_ext <<_ACEOF
+      subroutine foobar()
+      return
+      end
+      subroutine foo_bar()
+      return
+      end
+_ACEOF
+if ac_fn_f77_try_compile "$LINENO"; then :
+  mv conftest.$ac_objext cfortran_test.$ac_objext
+
+  ac_save_LIBS=$LIBS
+  LIBS="cfortran_test.$ac_objext $LIBS $FLIBS"
+
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+  ac_success=no
+  for ac_foobar in foobar FOOBAR; do
+    for ac_underscore in "" "_"; do
+      ac_func="$ac_foobar$ac_underscore"
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $ac_func ();
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+return $ac_func ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_success=yes; break 2
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+    done
+  done
+  ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+
+  if test "$ac_success" = "yes"; then
+     case $ac_foobar in
+	foobar)
+	   ac_case=lower
+	   ac_foo_bar=foo_bar
+	   ;;
+	FOOBAR)
+	   ac_case=upper
+	   ac_foo_bar=FOO_BAR
+	   ;;
+     esac
+
+     ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+     ac_success_extra=no
+     for ac_extra in "" "_"; do
+	ac_func="$ac_foo_bar$ac_underscore$ac_extra"
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $ac_func ();
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+return $ac_func ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_success_extra=yes; break
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+     done
+     ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+
+     if test "$ac_success_extra" = "yes"; then
+	ac_cv_f77_mangling="$ac_case case"
+	if test -z "$ac_underscore"; then
+	   ac_cv_f77_mangling="$ac_cv_f77_mangling, no underscore"
+	else
+	   ac_cv_f77_mangling="$ac_cv_f77_mangling, underscore"
+	fi
+	if test -z "$ac_extra"; then
+	   ac_cv_f77_mangling="$ac_cv_f77_mangling, no extra underscore"
+	else
+	   ac_cv_f77_mangling="$ac_cv_f77_mangling, extra underscore"
+	fi
+      else
+	ac_cv_f77_mangling="unknown"
+      fi
+  else
+     ac_cv_f77_mangling="unknown"
+  fi
+
+  LIBS=$ac_save_LIBS
+  rm -rf conftest*
+  rm -f cfortran_test*
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compile a simple Fortran program
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_mangling" >&5
+$as_echo "$ac_cv_f77_mangling" >&6; }
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+case $ac_cv_f77_mangling in
+  "lower case, no underscore, no extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) name" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) name" >>confdefs.h
+ ;;
+  "lower case, no underscore, extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) name" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) name ## _" >>confdefs.h
+ ;;
+  "lower case, underscore, no extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) name ## _" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) name ## _" >>confdefs.h
+ ;;
+  "lower case, underscore, extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) name ## _" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) name ## __" >>confdefs.h
+ ;;
+  "upper case, no underscore, no extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) NAME" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) NAME" >>confdefs.h
+ ;;
+  "upper case, no underscore, extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) NAME" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) NAME ## _" >>confdefs.h
+ ;;
+  "upper case, underscore, no extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) NAME ## _" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) NAME ## _" >>confdefs.h
+ ;;
+  "upper case, underscore, extra underscore")
+	  $as_echo "#define F77_FUNC(name,NAME) NAME ## _" >>confdefs.h
+
+	  $as_echo "#define F77_FUNC_(name,NAME) NAME ## __" >>confdefs.h
+ ;;
+  *)
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unknown Fortran name-mangling scheme" >&5
+$as_echo "$as_me: WARNING: unknown Fortran name-mangling scheme" >&2;}
+	  ;;
+esac
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+	ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+case $ac_cv_f77_mangling in
+  upper*) ac_val="F77FOO" ;;
+  lower*) ac_val="f77foo" ;;
+  *)      ac_val="unknown" ;;
+esac
+case $ac_cv_f77_mangling in *," underscore"*) ac_val="$ac_val"_ ;; esac
+
+f77foo="$ac_val"
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+	ac_ext=f
+ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5'
+ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_f77_compiler_gnu
+case $ac_cv_f77_mangling in
+  upper*) ac_val="F77_FOO" ;;
+  lower*) ac_val="f77_foo" ;;
+  *)      ac_val="unknown" ;;
+esac
+case $ac_cv_f77_mangling in *," underscore"*) ac_val="$ac_val"_ ;; esac
+case $ac_cv_f77_mangling in *," extra underscore"*) ac_val="$ac_val"_ ;; esac
+
+f77_foo="$ac_val"
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+	f77_foo2=`echo $f77foo | sed 's/77/77_/'`
+	if test "$f77_foo" = "$f77_foo2"; then
+
+$as_echo "#define F77_FUNC_EQUIV 1" >>confdefs.h
+
+
+		# Include g77 wrappers by default for GNU systems or gfortran
+		with_g77_wrappers=$ac_cv_f77_compiler_gnu
+		case $host_os in *gnu*) with_g77_wrappers=yes ;; esac
+	fi
+else
+	with_g77_wrappers=no
+fi
+
+
+# Check whether --with-g77-wrappers was given.
+if test "${with_g77_wrappers+set}" = set; then :
+  withval=$with_g77_wrappers; with_g77_wrappers=$withval
+fi
+
+if test "x$with_g77_wrappers" = "xyes"; then
+
+$as_echo "#define WITH_G77_WRAPPERS 1" >>confdefs.h
+
+fi
+
+have_smp="no"
+# Check whether --enable-openmp was given.
+if test "${enable_openmp+set}" = set; then :
+  enableval=$enable_openmp; enable_openmp=$enableval
+else
+  enable_openmp=no
+fi
+
+
+if test "$enable_openmp" = "yes"; then
+
+$as_echo "#define HAVE_OPENMP 1" >>confdefs.h
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for OpenMP flag of C compiler" >&5
+$as_echo_n "checking for OpenMP flag of C compiler... " >&6; }
+if ${ax_cv_c_openmp+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  saveCFLAGS=$CFLAGS
+ax_cv_c_openmp=unknown
+# Flags to try:  -fopenmp (gcc), -openmp (icc), -mp (SGI & PGI),
+#                -xopenmp (Sun), -omp (Tru64), -qsmp=omp (AIX), none
+ax_openmp_flags="-fopenmp -openmp -mp -xopenmp -omp -qsmp=omp none"
+if test "x$OPENMP_CFLAGS" != x; then
+  ax_openmp_flags="$OPENMP_CFLAGS $ax_openmp_flags"
+fi
+for ax_openmp_flag in $ax_openmp_flags; do
+  case $ax_openmp_flag in
+    none) CFLAGS=$saveC ;;
+    *) CFLAGS="$saveCFLAGS $ax_openmp_flag" ;;
+  esac
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char omp_set_num_threads ();
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+return omp_set_num_threads ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ax_cv_c_openmp=$ax_openmp_flag; break
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+done
+CFLAGS=$saveCFLAGS
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_c_openmp" >&5
+$as_echo "$ax_cv_c_openmp" >&6; }
+if test "x$ax_cv_c_openmp" = "xunknown"; then
+  as_fn_error $? "don't know how to enable OpenMP" "$LINENO" 5
+else
+  if test "x$ax_cv_c_openmp" != "xnone"; then
+    OPENMP_CFLAGS=$ax_cv_c_openmp
+  fi
+
+$as_echo "#define HAVE_OPENMP 1" >>confdefs.h
+
+fi
+
+
+fi
+
+# Check whether --enable-threads was given.
+if test "${enable_threads+set}" = set; then :
+  enableval=$enable_threads; enable_threads=$enableval
+else
+  enable_threads=no
+fi
+
+
+if test "$enable_threads" = "yes"; then
+
+$as_echo "#define HAVE_THREADS 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-combined-threads was given.
+if test "${with_combined_threads+set}" = set; then :
+  withval=$with_combined_threads; with_combined_threads=$withval
+else
+  with_combined_threads=no
+fi
+
+
+if test "$with_combined_threads" = yes; then
+   if test "$enable_openmp" = "yes"; then
+      as_fn_error $? "--with-combined-threads incompatible with --enable-openmp" "$LINENO" 5
+   fi
+   if test "$enable_threads" != "yes"; then
+      as_fn_error $? "--with-combined-threads requires --enable-threads" "$LINENO" 5
+   fi
+fi
+
+THREADLIBS=""
+if test "$enable_threads" = "yes"; then
+	# POSIX threads, the default choice:
+	if test -z "$THREADLIBS"; then
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+acx_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on True64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS" >&5
+$as_echo_n "checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS... " >&6; }
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pthread_join ();
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+return pthread_join ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  acx_pthread_ok=yes
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $acx_pthread_ok" >&5
+$as_echo "$acx_pthread_ok" >&6; }
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mt -mthreads pthread --thread-safe pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+#           (where it should come before -mthreads to avoid spurious warnings)
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether pthreads work without any flags" >&5
+$as_echo_n "checking whether pthreads work without any flags... " >&6; }
+                ;;
+
+                -*)
+                { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether pthreads work with $flag" >&5
+$as_echo_n "checking whether pthreads work with $flag... " >&6; }
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+		pthread-config)
+		# Extract the first word of "pthread-config", so it can be a program name with args.
+set dummy pthread-config; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_acx_pthread_config+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$acx_pthread_config"; then
+  ac_cv_prog_acx_pthread_config="$acx_pthread_config" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_acx_pthread_config="yes"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_acx_pthread_config" && ac_cv_prog_acx_pthread_config="no"
+fi
+fi
+acx_pthread_config=$ac_cv_prog_acx_pthread_config
+if test -n "$acx_pthread_config"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $acx_pthread_config" >&5
+$as_echo "$acx_pthread_config" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
+
+                *)
+                { $as_echo "$as_me:${as_lineno-$LINENO}: checking for the pthreads library -l$flag" >&5
+$as_echo_n "checking for the pthreads library -l$flag... " >&6; }
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <pthread.h>
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+pthread_t th; pthread_join(th, (void**) 0);
+                     pthread_attr_init((pthread_attr_t*) 0);
+                     pthread_cleanup_push((void(*)(void *)) 0, (void*) 0);
+                     pthread_create((pthread_t*) 0, (pthread_attr_t*) 0,
+                                    (void*(*)(void *)) 0, (void*) 0);
+                     pthread_cleanup_pop(0);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  acx_pthread_ok=yes
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $acx_pthread_ok" >&5
+$as_echo "$acx_pthread_ok" >&6; }
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for joinable pthread attribute" >&5
+$as_echo_n "checking for joinable pthread attribute... " >&6; }
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <pthread.h>
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+int attr=$attr; return attr;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  attr_name=$attr; break
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+	done
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $attr_name" >&5
+$as_echo "$attr_name" >&6; }
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+
+cat >>confdefs.h <<_ACEOF
+#define PTHREAD_CREATE_JOINABLE $attr_name
+_ACEOF
+
+        fi
+
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if more special flags are required for pthreads" >&5
+$as_echo_n "checking if more special flags are required for pthreads... " >&6; }
+        flag=no
+        case "${host_cpu}-${host_os}" in
+            *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
+            *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
+        esac
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${flag}" >&5
+$as_echo "${flag}" >&6; }
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with xlc_r or cc_r
+	if test x"$GCC" != xyes; then
+          for ac_prog in xlc_r cc_r
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_PTHREAD_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$PTHREAD_CC"; then
+  ac_cv_prog_PTHREAD_CC="$PTHREAD_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_PTHREAD_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+PTHREAD_CC=$ac_cv_prog_PTHREAD_CC
+if test -n "$PTHREAD_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PTHREAD_CC" >&5
+$as_echo "$PTHREAD_CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$PTHREAD_CC" && break
+done
+test -n "$PTHREAD_CC" || PTHREAD_CC="${CC}"
+
+        else
+          PTHREAD_CC=$CC
+	fi
+else
+        PTHREAD_CC="$CC"
+fi
+
+
+
+
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+        THREADLIBS="$PTHREAD_LIBS "
+	                     CC="$PTHREAD_CC"
+
+$as_echo "#define USING_POSIX_THREADS 1" >>confdefs.h
+
+        :
+else
+        acx_pthread_ok=no
+
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+	fi
+
+	if test -z "$THREADLIBS"; then
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Win32 threads" >&5
+$as_echo_n "checking for Win32 threads... " >&6; }
+		cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <windows.h>
+#ifdef F77_DUMMY_MAIN
+
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+
+#endif
+int
+main ()
+{
+_beginthreadex(0,0,0,0,0,0);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  THREADLIBS=" "; { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+	fi
+
+	if test -z "$THREADLIBS"; then
+		as_fn_error $? "couldn't find threads library for --enable-threads" "$LINENO" 5
+	fi
+
+$as_echo "#define HAVE_THREADS 1" >>confdefs.h
+
+fi
+
+ if test "$enable_threads" = "yes"; then
+  THREADS_TRUE=
+  THREADS_FALSE='#'
+else
+  THREADS_TRUE='#'
+  THREADS_FALSE=
+fi
+
+ if test "$enable_openmp" = "yes"; then
+  OPENMP_TRUE=
+  OPENMP_FALSE='#'
+else
+  OPENMP_TRUE='#'
+  OPENMP_FALSE=
+fi
+
+ if test "$enable_threads" = "yes" -o "$enable_openmp" = "yes"; then
+  SMP_TRUE=
+  SMP_FALSE='#'
+else
+  SMP_TRUE='#'
+  SMP_FALSE=
+fi
+
+ if test x"$with_combined_threads" = xyes; then
+  COMBINED_THREADS_TRUE=
+  COMBINED_THREADS_FALSE='#'
+else
+  COMBINED_THREADS_TRUE='#'
+  COMBINED_THREADS_FALSE=
+fi
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a cycle counter is available" >&5
+$as_echo_n "checking whether a cycle counter is available... " >&6; }
+save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS -I$srcdir/kernel"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include "cycle.h"
+#ifndef HAVE_TICK_COUNTER
+#  error No cycle counter
+#endif
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  ok=yes
+else
+  ok=no
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+CPPFLAGS=$save_CPPFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ok" >&5
+$as_echo "$ok" >&6; }
+if test $ok = no && test "x$with_slow_timer" = xno; then
+	echo "***************************************************************"
+	echo "WARNING: No cycle counter found.  FFTW will use ESTIMATE mode  "
+	echo "         for all plans.  See the manual for more information."
+	echo "***************************************************************"
+fi
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define FFTW_CC "$CC $CFLAGS"
+_ACEOF
+
+
+ac_config_files="$ac_config_files Makefile support/Makefile genfft/Makefile kernel/Makefile simd-support/Makefile dft/Makefile dft/scalar/Makefile dft/scalar/codelets/Makefile dft/simd/Makefile dft/simd/common/Makefile dft/simd/sse2/Makefile dft/simd/avx/Makefile dft/simd/altivec/Makefile dft/simd/neon/Makefile rdft/Makefile rdft/scalar/Makefile rdft/scalar/r2cf/Makefile rdft/scalar/r2cb/Makefile rdft/scalar/r2r/Makefile rdft/simd/Makefile rdft/simd/common/Makefile rdft/simd/sse2/Makefile rdft/simd/avx/Makefile rdft/simd/altivec/Makefile rdft/simd/neon/Makefile reodft/Makefile threads/Makefile api/Makefile mpi/Makefile libbench2/Makefile tests/Makefile doc/Makefile doc/FAQ/Makefile tools/Makefile tools/fftw_wisdom.1 tools/fftw-wisdom-to-conf m4/Makefile fftw.pc"
+
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems.  If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+  for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+
+  (set) 2>&1 |
+    case $as_nl`(ac_space=' '; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      # `set' does not quote correctly, so add quotes: double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \.
+      sed -n \
+	"s/'/'\\\\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+      ;; #(
+    *)
+      # `set' quotes correctly as required by POSIX, so do not add quotes.
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+) |
+  sed '
+     /^ac_cv_env_/b end
+     t clear
+     :clear
+     s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+     t end
+     s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+     :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+  if test -w "$cache_file"; then
+    if test "x$cache_file" != "x/dev/null"; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+      if test ! -f "$cache_file" || test -h "$cache_file"; then
+	cat confcache >"$cache_file"
+      else
+        case $cache_file in #(
+        */* | ?:*)
+	  mv -f confcache "$cache_file"$$ &&
+	  mv -f "$cache_file"$$ "$cache_file" ;; #(
+        *)
+	  mv -f confcache "$cache_file" ;;
+	esac
+      fi
+    fi
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+  fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+DEFS=-DHAVE_CONFIG_H
+
+ac_libobjs=
+ac_ltlibobjs=
+U=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+  # 1. Remove the extension, and $U if already installed.
+  ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+  ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+  # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
+  #    will be set to the directory where LIBOBJS objects are built.
+  as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+  as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+ if test -n "$EXEEXT"; then
+  am__EXEEXT_TRUE=
+  am__EXEEXT_FALSE='#'
+else
+  am__EXEEXT_TRUE='#'
+  am__EXEEXT_FALSE=
+fi
+
+if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
+  as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${SINGLE_TRUE}" && test -z "${SINGLE_FALSE}"; then
+  as_fn_error $? "conditional \"SINGLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${LDOUBLE_TRUE}" && test -z "${LDOUBLE_FALSE}"; then
+  as_fn_error $? "conditional \"LDOUBLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${QUAD_TRUE}" && test -z "${QUAD_FALSE}"; then
+  as_fn_error $? "conditional \"QUAD\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE2_TRUE}" && test -z "${HAVE_SSE2_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE2\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_AVX_TRUE}" && test -z "${HAVE_AVX_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_AVX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_ALTIVEC_TRUE}" && test -z "${HAVE_ALTIVEC_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_ALTIVEC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_NEON_TRUE}" && test -z "${HAVE_NEON_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_NEON\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
+  as_fn_error $? "conditional \"AMDEP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
+  as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${MPI_TRUE}" && test -z "${MPI_FALSE}"; then
+  as_fn_error $? "conditional \"MPI\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${THREADS_TRUE}" && test -z "${THREADS_FALSE}"; then
+  as_fn_error $? "conditional \"THREADS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPENMP_TRUE}" && test -z "${OPENMP_FALSE}"; then
+  as_fn_error $? "conditional \"OPENMP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${SMP_TRUE}" && test -z "${SMP_FALSE}"; then
+  as_fn_error $? "conditional \"SMP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${COMBINED_THREADS_TRUE}" && test -z "${COMBINED_THREADS_FALSE}"; then
+  as_fn_error $? "conditional \"COMBINED_THREADS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+
+: "${CONFIG_STATUS=./config.status}"
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+as_write_fail=0
+cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='print -r --'
+  as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='printf %s\n'
+  as_echo_n='printf %s'
+else
+  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+    as_echo_n='/usr/ucb/echo -n'
+  else
+    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+    as_echo_n_body='eval
+      arg=$1;
+      case $arg in #(
+      *"$as_nl"*)
+	expr "X$arg" : "X\\(.*\\)$as_nl";
+	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+      esac;
+      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+    '
+    export as_echo_n_body
+    as_echo_n='sh -c $as_echo_n_body as_echo'
+  fi
+  export as_echo_body
+  as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there.  '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  $as_echo "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else
+  as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else
+  as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  }
+fi # as_fn_arith
+
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+    # In both cases, we have to default to `cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+## ----------------------------------- ##
+## Main body of $CONFIG_STATUS script. ##
+## ----------------------------------- ##
+_ASEOF
+test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by fftw $as_me 3.3.3, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
+
+  CONFIG_FILES    = $CONFIG_FILES
+  CONFIG_HEADERS  = $CONFIG_HEADERS
+  CONFIG_LINKS    = $CONFIG_LINKS
+  CONFIG_COMMANDS = $CONFIG_COMMANDS
+  $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+case $ac_config_headers in *"
+"*) set x $ac_config_headers; shift; ac_config_headers=$*;;
+esac
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+config_headers="$ac_config_headers"
+config_commands="$ac_config_commands"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files and other configuration actions
+from templates according to the current configuration.  Unless the files
+and actions are specified as TAGs, all are instantiated by default.
+
+Usage: $0 [OPTION]... [TAG]...
+
+  -h, --help       print this help, then exit
+  -V, --version    print version number and configuration settings, then exit
+      --config     print configuration, then exit
+  -q, --quiet, --silent
+                   do not print progress messages
+  -d, --debug      don't remove temporary files
+      --recheck    update $as_me by reconfiguring in the same conditions
+      --file=FILE[:TEMPLATE]
+                   instantiate the configuration file FILE
+      --header=FILE[:TEMPLATE]
+                   instantiate the configuration header FILE
+
+Configuration files:
+$config_files
+
+Configuration headers:
+$config_headers
+
+Configuration commands:
+$config_commands
+
+Report bugs to <fftw@fftw.org>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
+ac_cs_version="\\
+fftw config.status 3.3.3
+configured by $0, generated by GNU Autoconf 2.69,
+  with options \\"\$ac_cs_config\\"
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+INSTALL='$INSTALL'
+MKDIR_P='$MKDIR_P'
+AWK='$AWK'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+  case $1 in
+  --*=?*)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+    ac_shift=:
+    ;;
+  --*=)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=
+    ac_shift=:
+    ;;
+  *)
+    ac_option=$1
+    ac_optarg=$2
+    ac_shift=shift
+    ;;
+  esac
+
+  case $ac_option in
+  # Handling of the options.
+  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+    ac_cs_recheck=: ;;
+  --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+    $as_echo "$ac_cs_version"; exit ;;
+  --config | --confi | --conf | --con | --co | --c )
+    $as_echo "$ac_cs_config"; exit ;;
+  --debug | --debu | --deb | --de | --d | -d )
+    debug=: ;;
+  --file | --fil | --fi | --f )
+    $ac_shift
+    case $ac_optarg in
+    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    '') as_fn_error $? "missing file argument" ;;
+    esac
+    as_fn_append CONFIG_FILES " '$ac_optarg'"
+    ac_need_defaults=false;;
+  --header | --heade | --head | --hea )
+    $ac_shift
+    case $ac_optarg in
+    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    as_fn_append CONFIG_HEADERS " '$ac_optarg'"
+    ac_need_defaults=false;;
+  --he | --h)
+    # Conflict between --help and --header
+    as_fn_error $? "ambiguous option: \`$1'
+Try \`$0 --help' for more information.";;
+  --help | --hel | -h )
+    $as_echo "$ac_cs_usage"; exit ;;
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil | --si | --s)
+    ac_cs_silent=: ;;
+
+  # This is an error.
+  -*) as_fn_error $? "unrecognized option: \`$1'
+Try \`$0 --help' for more information." ;;
+
+  *) as_fn_append ac_config_targets " $1"
+     ac_need_defaults=false ;;
+
+  esac
+  shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+  exec 6>/dev/null
+  ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+  set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+  shift
+  \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+  CONFIG_SHELL='$SHELL'
+  export CONFIG_SHELL
+  exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+  echo
+  sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+  $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+#
+# INIT-COMMANDS
+#
+AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"
+
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+sed_quote_subst='$sed_quote_subst'
+double_quote_subst='$double_quote_subst'
+delay_variable_subst='$delay_variable_subst'
+enable_shared='`$ECHO "$enable_shared" | $SED "$delay_single_quote_subst"`'
+AS='`$ECHO "$AS" | $SED "$delay_single_quote_subst"`'
+DLLTOOL='`$ECHO "$DLLTOOL" | $SED "$delay_single_quote_subst"`'
+OBJDUMP='`$ECHO "$OBJDUMP" | $SED "$delay_single_quote_subst"`'
+macro_version='`$ECHO "$macro_version" | $SED "$delay_single_quote_subst"`'
+macro_revision='`$ECHO "$macro_revision" | $SED "$delay_single_quote_subst"`'
+enable_static='`$ECHO "$enable_static" | $SED "$delay_single_quote_subst"`'
+pic_mode='`$ECHO "$pic_mode" | $SED "$delay_single_quote_subst"`'
+enable_fast_install='`$ECHO "$enable_fast_install" | $SED "$delay_single_quote_subst"`'
+SHELL='`$ECHO "$SHELL" | $SED "$delay_single_quote_subst"`'
+ECHO='`$ECHO "$ECHO" | $SED "$delay_single_quote_subst"`'
+PATH_SEPARATOR='`$ECHO "$PATH_SEPARATOR" | $SED "$delay_single_quote_subst"`'
+host_alias='`$ECHO "$host_alias" | $SED "$delay_single_quote_subst"`'
+host='`$ECHO "$host" | $SED "$delay_single_quote_subst"`'
+host_os='`$ECHO "$host_os" | $SED "$delay_single_quote_subst"`'
+build_alias='`$ECHO "$build_alias" | $SED "$delay_single_quote_subst"`'
+build='`$ECHO "$build" | $SED "$delay_single_quote_subst"`'
+build_os='`$ECHO "$build_os" | $SED "$delay_single_quote_subst"`'
+SED='`$ECHO "$SED" | $SED "$delay_single_quote_subst"`'
+Xsed='`$ECHO "$Xsed" | $SED "$delay_single_quote_subst"`'
+GREP='`$ECHO "$GREP" | $SED "$delay_single_quote_subst"`'
+EGREP='`$ECHO "$EGREP" | $SED "$delay_single_quote_subst"`'
+FGREP='`$ECHO "$FGREP" | $SED "$delay_single_quote_subst"`'
+LD='`$ECHO "$LD" | $SED "$delay_single_quote_subst"`'
+NM='`$ECHO "$NM" | $SED "$delay_single_quote_subst"`'
+LN_S='`$ECHO "$LN_S" | $SED "$delay_single_quote_subst"`'
+max_cmd_len='`$ECHO "$max_cmd_len" | $SED "$delay_single_quote_subst"`'
+ac_objext='`$ECHO "$ac_objext" | $SED "$delay_single_quote_subst"`'
+exeext='`$ECHO "$exeext" | $SED "$delay_single_quote_subst"`'
+lt_unset='`$ECHO "$lt_unset" | $SED "$delay_single_quote_subst"`'
+lt_SP2NL='`$ECHO "$lt_SP2NL" | $SED "$delay_single_quote_subst"`'
+lt_NL2SP='`$ECHO "$lt_NL2SP" | $SED "$delay_single_quote_subst"`'
+lt_cv_to_host_file_cmd='`$ECHO "$lt_cv_to_host_file_cmd" | $SED "$delay_single_quote_subst"`'
+lt_cv_to_tool_file_cmd='`$ECHO "$lt_cv_to_tool_file_cmd" | $SED "$delay_single_quote_subst"`'
+reload_flag='`$ECHO "$reload_flag" | $SED "$delay_single_quote_subst"`'
+reload_cmds='`$ECHO "$reload_cmds" | $SED "$delay_single_quote_subst"`'
+deplibs_check_method='`$ECHO "$deplibs_check_method" | $SED "$delay_single_quote_subst"`'
+file_magic_cmd='`$ECHO "$file_magic_cmd" | $SED "$delay_single_quote_subst"`'
+file_magic_glob='`$ECHO "$file_magic_glob" | $SED "$delay_single_quote_subst"`'
+want_nocaseglob='`$ECHO "$want_nocaseglob" | $SED "$delay_single_quote_subst"`'
+sharedlib_from_linklib_cmd='`$ECHO "$sharedlib_from_linklib_cmd" | $SED "$delay_single_quote_subst"`'
+AR='`$ECHO "$AR" | $SED "$delay_single_quote_subst"`'
+AR_FLAGS='`$ECHO "$AR_FLAGS" | $SED "$delay_single_quote_subst"`'
+archiver_list_spec='`$ECHO "$archiver_list_spec" | $SED "$delay_single_quote_subst"`'
+STRIP='`$ECHO "$STRIP" | $SED "$delay_single_quote_subst"`'
+RANLIB='`$ECHO "$RANLIB" | $SED "$delay_single_quote_subst"`'
+old_postinstall_cmds='`$ECHO "$old_postinstall_cmds" | $SED "$delay_single_quote_subst"`'
+old_postuninstall_cmds='`$ECHO "$old_postuninstall_cmds" | $SED "$delay_single_quote_subst"`'
+old_archive_cmds='`$ECHO "$old_archive_cmds" | $SED "$delay_single_quote_subst"`'
+lock_old_archive_extraction='`$ECHO "$lock_old_archive_extraction" | $SED "$delay_single_quote_subst"`'
+CC='`$ECHO "$CC" | $SED "$delay_single_quote_subst"`'
+CFLAGS='`$ECHO "$CFLAGS" | $SED "$delay_single_quote_subst"`'
+compiler='`$ECHO "$compiler" | $SED "$delay_single_quote_subst"`'
+GCC='`$ECHO "$GCC" | $SED "$delay_single_quote_subst"`'
+lt_cv_sys_global_symbol_pipe='`$ECHO "$lt_cv_sys_global_symbol_pipe" | $SED "$delay_single_quote_subst"`'
+lt_cv_sys_global_symbol_to_cdecl='`$ECHO "$lt_cv_sys_global_symbol_to_cdecl" | $SED "$delay_single_quote_subst"`'
+lt_cv_sys_global_symbol_to_c_name_address='`$ECHO "$lt_cv_sys_global_symbol_to_c_name_address" | $SED "$delay_single_quote_subst"`'
+lt_cv_sys_global_symbol_to_c_name_address_lib_prefix='`$ECHO "$lt_cv_sys_global_symbol_to_c_name_address_lib_prefix" | $SED "$delay_single_quote_subst"`'
+nm_file_list_spec='`$ECHO "$nm_file_list_spec" | $SED "$delay_single_quote_subst"`'
+lt_sysroot='`$ECHO "$lt_sysroot" | $SED "$delay_single_quote_subst"`'
+objdir='`$ECHO "$objdir" | $SED "$delay_single_quote_subst"`'
+MAGIC_CMD='`$ECHO "$MAGIC_CMD" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_no_builtin_flag='`$ECHO "$lt_prog_compiler_no_builtin_flag" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_pic='`$ECHO "$lt_prog_compiler_pic" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_wl='`$ECHO "$lt_prog_compiler_wl" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_static='`$ECHO "$lt_prog_compiler_static" | $SED "$delay_single_quote_subst"`'
+lt_cv_prog_compiler_c_o='`$ECHO "$lt_cv_prog_compiler_c_o" | $SED "$delay_single_quote_subst"`'
+need_locks='`$ECHO "$need_locks" | $SED "$delay_single_quote_subst"`'
+MANIFEST_TOOL='`$ECHO "$MANIFEST_TOOL" | $SED "$delay_single_quote_subst"`'
+DSYMUTIL='`$ECHO "$DSYMUTIL" | $SED "$delay_single_quote_subst"`'
+NMEDIT='`$ECHO "$NMEDIT" | $SED "$delay_single_quote_subst"`'
+LIPO='`$ECHO "$LIPO" | $SED "$delay_single_quote_subst"`'
+OTOOL='`$ECHO "$OTOOL" | $SED "$delay_single_quote_subst"`'
+OTOOL64='`$ECHO "$OTOOL64" | $SED "$delay_single_quote_subst"`'
+libext='`$ECHO "$libext" | $SED "$delay_single_quote_subst"`'
+shrext_cmds='`$ECHO "$shrext_cmds" | $SED "$delay_single_quote_subst"`'
+extract_expsyms_cmds='`$ECHO "$extract_expsyms_cmds" | $SED "$delay_single_quote_subst"`'
+archive_cmds_need_lc='`$ECHO "$archive_cmds_need_lc" | $SED "$delay_single_quote_subst"`'
+enable_shared_with_static_runtimes='`$ECHO "$enable_shared_with_static_runtimes" | $SED "$delay_single_quote_subst"`'
+export_dynamic_flag_spec='`$ECHO "$export_dynamic_flag_spec" | $SED "$delay_single_quote_subst"`'
+whole_archive_flag_spec='`$ECHO "$whole_archive_flag_spec" | $SED "$delay_single_quote_subst"`'
+compiler_needs_object='`$ECHO "$compiler_needs_object" | $SED "$delay_single_quote_subst"`'
+old_archive_from_new_cmds='`$ECHO "$old_archive_from_new_cmds" | $SED "$delay_single_quote_subst"`'
+old_archive_from_expsyms_cmds='`$ECHO "$old_archive_from_expsyms_cmds" | $SED "$delay_single_quote_subst"`'
+archive_cmds='`$ECHO "$archive_cmds" | $SED "$delay_single_quote_subst"`'
+archive_expsym_cmds='`$ECHO "$archive_expsym_cmds" | $SED "$delay_single_quote_subst"`'
+module_cmds='`$ECHO "$module_cmds" | $SED "$delay_single_quote_subst"`'
+module_expsym_cmds='`$ECHO "$module_expsym_cmds" | $SED "$delay_single_quote_subst"`'
+with_gnu_ld='`$ECHO "$with_gnu_ld" | $SED "$delay_single_quote_subst"`'
+allow_undefined_flag='`$ECHO "$allow_undefined_flag" | $SED "$delay_single_quote_subst"`'
+no_undefined_flag='`$ECHO "$no_undefined_flag" | $SED "$delay_single_quote_subst"`'
+hardcode_libdir_flag_spec='`$ECHO "$hardcode_libdir_flag_spec" | $SED "$delay_single_quote_subst"`'
+hardcode_libdir_separator='`$ECHO "$hardcode_libdir_separator" | $SED "$delay_single_quote_subst"`'
+hardcode_direct='`$ECHO "$hardcode_direct" | $SED "$delay_single_quote_subst"`'
+hardcode_direct_absolute='`$ECHO "$hardcode_direct_absolute" | $SED "$delay_single_quote_subst"`'
+hardcode_minus_L='`$ECHO "$hardcode_minus_L" | $SED "$delay_single_quote_subst"`'
+hardcode_shlibpath_var='`$ECHO "$hardcode_shlibpath_var" | $SED "$delay_single_quote_subst"`'
+hardcode_automatic='`$ECHO "$hardcode_automatic" | $SED "$delay_single_quote_subst"`'
+inherit_rpath='`$ECHO "$inherit_rpath" | $SED "$delay_single_quote_subst"`'
+link_all_deplibs='`$ECHO "$link_all_deplibs" | $SED "$delay_single_quote_subst"`'
+always_export_symbols='`$ECHO "$always_export_symbols" | $SED "$delay_single_quote_subst"`'
+export_symbols_cmds='`$ECHO "$export_symbols_cmds" | $SED "$delay_single_quote_subst"`'
+exclude_expsyms='`$ECHO "$exclude_expsyms" | $SED "$delay_single_quote_subst"`'
+include_expsyms='`$ECHO "$include_expsyms" | $SED "$delay_single_quote_subst"`'
+prelink_cmds='`$ECHO "$prelink_cmds" | $SED "$delay_single_quote_subst"`'
+postlink_cmds='`$ECHO "$postlink_cmds" | $SED "$delay_single_quote_subst"`'
+file_list_spec='`$ECHO "$file_list_spec" | $SED "$delay_single_quote_subst"`'
+variables_saved_for_relink='`$ECHO "$variables_saved_for_relink" | $SED "$delay_single_quote_subst"`'
+need_lib_prefix='`$ECHO "$need_lib_prefix" | $SED "$delay_single_quote_subst"`'
+need_version='`$ECHO "$need_version" | $SED "$delay_single_quote_subst"`'
+version_type='`$ECHO "$version_type" | $SED "$delay_single_quote_subst"`'
+runpath_var='`$ECHO "$runpath_var" | $SED "$delay_single_quote_subst"`'
+shlibpath_var='`$ECHO "$shlibpath_var" | $SED "$delay_single_quote_subst"`'
+shlibpath_overrides_runpath='`$ECHO "$shlibpath_overrides_runpath" | $SED "$delay_single_quote_subst"`'
+libname_spec='`$ECHO "$libname_spec" | $SED "$delay_single_quote_subst"`'
+library_names_spec='`$ECHO "$library_names_spec" | $SED "$delay_single_quote_subst"`'
+soname_spec='`$ECHO "$soname_spec" | $SED "$delay_single_quote_subst"`'
+install_override_mode='`$ECHO "$install_override_mode" | $SED "$delay_single_quote_subst"`'
+postinstall_cmds='`$ECHO "$postinstall_cmds" | $SED "$delay_single_quote_subst"`'
+postuninstall_cmds='`$ECHO "$postuninstall_cmds" | $SED "$delay_single_quote_subst"`'
+finish_cmds='`$ECHO "$finish_cmds" | $SED "$delay_single_quote_subst"`'
+finish_eval='`$ECHO "$finish_eval" | $SED "$delay_single_quote_subst"`'
+hardcode_into_libs='`$ECHO "$hardcode_into_libs" | $SED "$delay_single_quote_subst"`'
+sys_lib_search_path_spec='`$ECHO "$sys_lib_search_path_spec" | $SED "$delay_single_quote_subst"`'
+sys_lib_dlsearch_path_spec='`$ECHO "$sys_lib_dlsearch_path_spec" | $SED "$delay_single_quote_subst"`'
+hardcode_action='`$ECHO "$hardcode_action" | $SED "$delay_single_quote_subst"`'
+enable_dlopen='`$ECHO "$enable_dlopen" | $SED "$delay_single_quote_subst"`'
+enable_dlopen_self='`$ECHO "$enable_dlopen_self" | $SED "$delay_single_quote_subst"`'
+enable_dlopen_self_static='`$ECHO "$enable_dlopen_self_static" | $SED "$delay_single_quote_subst"`'
+old_striplib='`$ECHO "$old_striplib" | $SED "$delay_single_quote_subst"`'
+striplib='`$ECHO "$striplib" | $SED "$delay_single_quote_subst"`'
+LD_F77='`$ECHO "$LD_F77" | $SED "$delay_single_quote_subst"`'
+reload_flag_F77='`$ECHO "$reload_flag_F77" | $SED "$delay_single_quote_subst"`'
+reload_cmds_F77='`$ECHO "$reload_cmds_F77" | $SED "$delay_single_quote_subst"`'
+old_archive_cmds_F77='`$ECHO "$old_archive_cmds_F77" | $SED "$delay_single_quote_subst"`'
+compiler_F77='`$ECHO "$compiler_F77" | $SED "$delay_single_quote_subst"`'
+GCC_F77='`$ECHO "$GCC_F77" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_no_builtin_flag_F77='`$ECHO "$lt_prog_compiler_no_builtin_flag_F77" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_pic_F77='`$ECHO "$lt_prog_compiler_pic_F77" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_wl_F77='`$ECHO "$lt_prog_compiler_wl_F77" | $SED "$delay_single_quote_subst"`'
+lt_prog_compiler_static_F77='`$ECHO "$lt_prog_compiler_static_F77" | $SED "$delay_single_quote_subst"`'
+lt_cv_prog_compiler_c_o_F77='`$ECHO "$lt_cv_prog_compiler_c_o_F77" | $SED "$delay_single_quote_subst"`'
+archive_cmds_need_lc_F77='`$ECHO "$archive_cmds_need_lc_F77" | $SED "$delay_single_quote_subst"`'
+enable_shared_with_static_runtimes_F77='`$ECHO "$enable_shared_with_static_runtimes_F77" | $SED "$delay_single_quote_subst"`'
+export_dynamic_flag_spec_F77='`$ECHO "$export_dynamic_flag_spec_F77" | $SED "$delay_single_quote_subst"`'
+whole_archive_flag_spec_F77='`$ECHO "$whole_archive_flag_spec_F77" | $SED "$delay_single_quote_subst"`'
+compiler_needs_object_F77='`$ECHO "$compiler_needs_object_F77" | $SED "$delay_single_quote_subst"`'
+old_archive_from_new_cmds_F77='`$ECHO "$old_archive_from_new_cmds_F77" | $SED "$delay_single_quote_subst"`'
+old_archive_from_expsyms_cmds_F77='`$ECHO "$old_archive_from_expsyms_cmds_F77" | $SED "$delay_single_quote_subst"`'
+archive_cmds_F77='`$ECHO "$archive_cmds_F77" | $SED "$delay_single_quote_subst"`'
+archive_expsym_cmds_F77='`$ECHO "$archive_expsym_cmds_F77" | $SED "$delay_single_quote_subst"`'
+module_cmds_F77='`$ECHO "$module_cmds_F77" | $SED "$delay_single_quote_subst"`'
+module_expsym_cmds_F77='`$ECHO "$module_expsym_cmds_F77" | $SED "$delay_single_quote_subst"`'
+with_gnu_ld_F77='`$ECHO "$with_gnu_ld_F77" | $SED "$delay_single_quote_subst"`'
+allow_undefined_flag_F77='`$ECHO "$allow_undefined_flag_F77" | $SED "$delay_single_quote_subst"`'
+no_undefined_flag_F77='`$ECHO "$no_undefined_flag_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_libdir_flag_spec_F77='`$ECHO "$hardcode_libdir_flag_spec_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_libdir_separator_F77='`$ECHO "$hardcode_libdir_separator_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_direct_F77='`$ECHO "$hardcode_direct_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_direct_absolute_F77='`$ECHO "$hardcode_direct_absolute_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_minus_L_F77='`$ECHO "$hardcode_minus_L_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_shlibpath_var_F77='`$ECHO "$hardcode_shlibpath_var_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_automatic_F77='`$ECHO "$hardcode_automatic_F77" | $SED "$delay_single_quote_subst"`'
+inherit_rpath_F77='`$ECHO "$inherit_rpath_F77" | $SED "$delay_single_quote_subst"`'
+link_all_deplibs_F77='`$ECHO "$link_all_deplibs_F77" | $SED "$delay_single_quote_subst"`'
+always_export_symbols_F77='`$ECHO "$always_export_symbols_F77" | $SED "$delay_single_quote_subst"`'
+export_symbols_cmds_F77='`$ECHO "$export_symbols_cmds_F77" | $SED "$delay_single_quote_subst"`'
+exclude_expsyms_F77='`$ECHO "$exclude_expsyms_F77" | $SED "$delay_single_quote_subst"`'
+include_expsyms_F77='`$ECHO "$include_expsyms_F77" | $SED "$delay_single_quote_subst"`'
+prelink_cmds_F77='`$ECHO "$prelink_cmds_F77" | $SED "$delay_single_quote_subst"`'
+postlink_cmds_F77='`$ECHO "$postlink_cmds_F77" | $SED "$delay_single_quote_subst"`'
+file_list_spec_F77='`$ECHO "$file_list_spec_F77" | $SED "$delay_single_quote_subst"`'
+hardcode_action_F77='`$ECHO "$hardcode_action_F77" | $SED "$delay_single_quote_subst"`'
+
+LTCC='$LTCC'
+LTCFLAGS='$LTCFLAGS'
+compiler='$compiler_DEFAULT'
+
+# A function that is used when there is no print builtin or printf.
+func_fallback_echo ()
+{
+  eval 'cat <<_LTECHO_EOF
+\$1
+_LTECHO_EOF'
+}
+
+# Quote evaled strings.
+for var in AS \
+DLLTOOL \
+OBJDUMP \
+SHELL \
+ECHO \
+PATH_SEPARATOR \
+SED \
+GREP \
+EGREP \
+FGREP \
+LD \
+NM \
+LN_S \
+lt_SP2NL \
+lt_NL2SP \
+reload_flag \
+deplibs_check_method \
+file_magic_cmd \
+file_magic_glob \
+want_nocaseglob \
+sharedlib_from_linklib_cmd \
+AR \
+AR_FLAGS \
+archiver_list_spec \
+STRIP \
+RANLIB \
+CC \
+CFLAGS \
+compiler \
+lt_cv_sys_global_symbol_pipe \
+lt_cv_sys_global_symbol_to_cdecl \
+lt_cv_sys_global_symbol_to_c_name_address \
+lt_cv_sys_global_symbol_to_c_name_address_lib_prefix \
+nm_file_list_spec \
+lt_prog_compiler_no_builtin_flag \
+lt_prog_compiler_pic \
+lt_prog_compiler_wl \
+lt_prog_compiler_static \
+lt_cv_prog_compiler_c_o \
+need_locks \
+MANIFEST_TOOL \
+DSYMUTIL \
+NMEDIT \
+LIPO \
+OTOOL \
+OTOOL64 \
+shrext_cmds \
+export_dynamic_flag_spec \
+whole_archive_flag_spec \
+compiler_needs_object \
+with_gnu_ld \
+allow_undefined_flag \
+no_undefined_flag \
+hardcode_libdir_flag_spec \
+hardcode_libdir_separator \
+exclude_expsyms \
+include_expsyms \
+file_list_spec \
+variables_saved_for_relink \
+libname_spec \
+library_names_spec \
+soname_spec \
+install_override_mode \
+finish_eval \
+old_striplib \
+striplib \
+LD_F77 \
+reload_flag_F77 \
+compiler_F77 \
+lt_prog_compiler_no_builtin_flag_F77 \
+lt_prog_compiler_pic_F77 \
+lt_prog_compiler_wl_F77 \
+lt_prog_compiler_static_F77 \
+lt_cv_prog_compiler_c_o_F77 \
+export_dynamic_flag_spec_F77 \
+whole_archive_flag_spec_F77 \
+compiler_needs_object_F77 \
+with_gnu_ld_F77 \
+allow_undefined_flag_F77 \
+no_undefined_flag_F77 \
+hardcode_libdir_flag_spec_F77 \
+hardcode_libdir_separator_F77 \
+exclude_expsyms_F77 \
+include_expsyms_F77 \
+file_list_spec_F77; do
+    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
+    *[\\\\\\\`\\"\\\$]*)
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\""
+      ;;
+    *)
+      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
+      ;;
+    esac
+done
+
+# Double-quote double-evaled strings.
+for var in reload_cmds \
+old_postinstall_cmds \
+old_postuninstall_cmds \
+old_archive_cmds \
+extract_expsyms_cmds \
+old_archive_from_new_cmds \
+old_archive_from_expsyms_cmds \
+archive_cmds \
+archive_expsym_cmds \
+module_cmds \
+module_expsym_cmds \
+export_symbols_cmds \
+prelink_cmds \
+postlink_cmds \
+postinstall_cmds \
+postuninstall_cmds \
+finish_cmds \
+sys_lib_search_path_spec \
+sys_lib_dlsearch_path_spec \
+reload_cmds_F77 \
+old_archive_cmds_F77 \
+old_archive_from_new_cmds_F77 \
+old_archive_from_expsyms_cmds_F77 \
+archive_cmds_F77 \
+archive_expsym_cmds_F77 \
+module_cmds_F77 \
+module_expsym_cmds_F77 \
+export_symbols_cmds_F77 \
+prelink_cmds_F77 \
+postlink_cmds_F77; do
+    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
+    *[\\\\\\\`\\"\\\$]*)
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\""
+      ;;
+    *)
+      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
+      ;;
+    esac
+done
+
+ac_aux_dir='$ac_aux_dir'
+xsi_shell='$xsi_shell'
+lt_shell_append='$lt_shell_append'
+
+# See if we are running on zsh, and set the options which allow our
+# commands through without removal of \ escapes INIT.
+if test -n "\${ZSH_VERSION+set}" ; then
+   setopt NO_GLOB_SUBST
+fi
+
+
+    PACKAGE='$PACKAGE'
+    VERSION='$VERSION'
+    TIMESTAMP='$TIMESTAMP'
+    RM='$RM'
+    ofile='$ofile'
+
+
+
+
+
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+  case $ac_config_target in
+    "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
+    "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
+    "libtool") CONFIG_COMMANDS="$CONFIG_COMMANDS libtool" ;;
+    "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
+    "support/Makefile") CONFIG_FILES="$CONFIG_FILES support/Makefile" ;;
+    "genfft/Makefile") CONFIG_FILES="$CONFIG_FILES genfft/Makefile" ;;
+    "kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
+    "simd-support/Makefile") CONFIG_FILES="$CONFIG_FILES simd-support/Makefile" ;;
+    "dft/Makefile") CONFIG_FILES="$CONFIG_FILES dft/Makefile" ;;
+    "dft/scalar/Makefile") CONFIG_FILES="$CONFIG_FILES dft/scalar/Makefile" ;;
+    "dft/scalar/codelets/Makefile") CONFIG_FILES="$CONFIG_FILES dft/scalar/codelets/Makefile" ;;
+    "dft/simd/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/Makefile" ;;
+    "dft/simd/common/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/common/Makefile" ;;
+    "dft/simd/sse2/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/sse2/Makefile" ;;
+    "dft/simd/avx/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/avx/Makefile" ;;
+    "dft/simd/altivec/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/altivec/Makefile" ;;
+    "dft/simd/neon/Makefile") CONFIG_FILES="$CONFIG_FILES dft/simd/neon/Makefile" ;;
+    "rdft/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/Makefile" ;;
+    "rdft/scalar/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/scalar/Makefile" ;;
+    "rdft/scalar/r2cf/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/scalar/r2cf/Makefile" ;;
+    "rdft/scalar/r2cb/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/scalar/r2cb/Makefile" ;;
+    "rdft/scalar/r2r/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/scalar/r2r/Makefile" ;;
+    "rdft/simd/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/Makefile" ;;
+    "rdft/simd/common/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/common/Makefile" ;;
+    "rdft/simd/sse2/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/sse2/Makefile" ;;
+    "rdft/simd/avx/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/avx/Makefile" ;;
+    "rdft/simd/altivec/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/altivec/Makefile" ;;
+    "rdft/simd/neon/Makefile") CONFIG_FILES="$CONFIG_FILES rdft/simd/neon/Makefile" ;;
+    "reodft/Makefile") CONFIG_FILES="$CONFIG_FILES reodft/Makefile" ;;
+    "threads/Makefile") CONFIG_FILES="$CONFIG_FILES threads/Makefile" ;;
+    "api/Makefile") CONFIG_FILES="$CONFIG_FILES api/Makefile" ;;
+    "mpi/Makefile") CONFIG_FILES="$CONFIG_FILES mpi/Makefile" ;;
+    "libbench2/Makefile") CONFIG_FILES="$CONFIG_FILES libbench2/Makefile" ;;
+    "tests/Makefile") CONFIG_FILES="$CONFIG_FILES tests/Makefile" ;;
+    "doc/Makefile") CONFIG_FILES="$CONFIG_FILES doc/Makefile" ;;
+    "doc/FAQ/Makefile") CONFIG_FILES="$CONFIG_FILES doc/FAQ/Makefile" ;;
+    "tools/Makefile") CONFIG_FILES="$CONFIG_FILES tools/Makefile" ;;
+    "tools/fftw_wisdom.1") CONFIG_FILES="$CONFIG_FILES tools/fftw_wisdom.1" ;;
+    "tools/fftw-wisdom-to-conf") CONFIG_FILES="$CONFIG_FILES tools/fftw-wisdom-to-conf" ;;
+    "m4/Makefile") CONFIG_FILES="$CONFIG_FILES m4/Makefile" ;;
+    "fftw.pc") CONFIG_FILES="$CONFIG_FILES fftw.pc" ;;
+
+  *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
+  esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used.  Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+  test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+  test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers
+  test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands
+fi
+
+# Have a temporary directory for convenience.  Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+  tmp= ac_tmp=
+  trap 'exit_status=$?
+  : "${ac_tmp:=$tmp}"
+  { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
+' 0
+  trap 'as_fn_exit 1' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+  tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+  test -d "$tmp"
+}  ||
+{
+  tmp=./conf$$-$RANDOM
+  (umask 077 && mkdir "$tmp")
+} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+ac_tmp=$tmp
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=`echo X | tr X '\015'`
+# On cygwin, bash can eat \r inside `` if the user requested igncr.
+# But we know of no other shell where ac_cr would be empty at this
+# point, so we can use a bashism as a fallback.
+if test "x$ac_cr" = x; then
+  eval ac_cr=\$\'\\r\'
+fi
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+  ac_cs_awk_cr='\\r'
+else
+  ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$ac_tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+  echo "cat >conf$$subs.awk <<_ACEOF" &&
+  echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+  echo "_ACEOF"
+} >conf$$subs.sh ||
+  as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  . ./conf$$subs.sh ||
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+
+  ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+  if test $ac_delim_n = $ac_delim_num; then
+    break
+  elif $ac_last_try; then
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\)..*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\)..*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+  N
+  s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$ac_tmp/subs1.awk" <<_ACAWK &&
+  for (key in S) S_is_set[key] = 1
+  FS = ""
+
+}
+{
+  line = $ 0
+  nfields = split(line, field, "@")
+  substed = 0
+  len = length(field[1])
+  for (i = 2; i < nfields; i++) {
+    key = field[i]
+    keylen = length(key)
+    if (S_is_set[key]) {
+      value = S[key]
+      line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+      len += length(value) + length(field[++i])
+      substed = 1
+    } else
+      len += 1 + keylen
+  }
+
+  print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+  sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+  cat
+fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \
+  || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
+# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+  ac_vpsub='/^[	 ]*VPATH[	 ]*=[	 ]*/{
+h
+s///
+s/^/:/
+s/[	 ]*$/:/
+s/:\$(srcdir):/:/g
+s/:\${srcdir}:/:/g
+s/:@srcdir@:/:/g
+s/^:*//
+s/:*$//
+x
+s/\(=[	 ]*\).*/\1/
+G
+s/\n//
+s/^[^=]*=[	 ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+# Set up the scripts for CONFIG_HEADERS section.
+# No need to generate them if there are no CONFIG_HEADERS.
+# This happens for instance with `./config.status Makefile'.
+if test -n "$CONFIG_HEADERS"; then
+cat >"$ac_tmp/defines.awk" <<\_ACAWK ||
+BEGIN {
+_ACEOF
+
+# Transform confdefs.h into an awk script `defines.awk', embedded as
+# here-document in config.status, that substitutes the proper values into
+# config.h.in to produce config.h.
+
+# Create a delimiter string that does not exist in confdefs.h, to ease
+# handling of long lines.
+ac_delim='%!_!# '
+for ac_last_try in false false :; do
+  ac_tt=`sed -n "/$ac_delim/p" confdefs.h`
+  if test -z "$ac_tt"; then
+    break
+  elif $ac_last_try; then
+    as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+
+# For the awk script, D is an array of macro values keyed by name,
+# likewise P contains macro parameters if any.  Preserve backslash
+# newline sequences.
+
+ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]*
+sed -n '
+s/.\{148\}/&'"$ac_delim"'/g
+t rset
+:rset
+s/^[	 ]*#[	 ]*define[	 ][	 ]*/ /
+t def
+d
+:def
+s/\\$//
+t bsnl
+s/["\\]/\\&/g
+s/^ \('"$ac_word_re"'\)\(([^()]*)\)[	 ]*\(.*\)/P["\1"]="\2"\
+D["\1"]=" \3"/p
+s/^ \('"$ac_word_re"'\)[	 ]*\(.*\)/D["\1"]=" \2"/p
+d
+:bsnl
+s/["\\]/\\&/g
+s/^ \('"$ac_word_re"'\)\(([^()]*)\)[	 ]*\(.*\)/P["\1"]="\2"\
+D["\1"]=" \3\\\\\\n"\\/p
+t cont
+s/^ \('"$ac_word_re"'\)[	 ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p
+t cont
+d
+:cont
+n
+s/.\{148\}/&'"$ac_delim"'/g
+t clear
+:clear
+s/\\$//
+t bsnlc
+s/["\\]/\\&/g; s/^/"/; s/$/"/p
+d
+:bsnlc
+s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p
+b cont
+' <confdefs.h | sed '
+s/'"$ac_delim"'/"\\\
+"/g' >>$CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+  for (key in D) D_is_set[key] = 1
+  FS = ""
+}
+/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ {
+  line = \$ 0
+  split(line, arg, " ")
+  if (arg[1] == "#") {
+    defundef = arg[2]
+    mac1 = arg[3]
+  } else {
+    defundef = substr(arg[1], 2)
+    mac1 = arg[2]
+  }
+  split(mac1, mac2, "(") #)
+  macro = mac2[1]
+  prefix = substr(line, 1, index(line, defundef) - 1)
+  if (D_is_set[macro]) {
+    # Preserve the white space surrounding the "#".
+    print prefix "define", macro P[macro] D[macro]
+    next
+  } else {
+    # Replace #undef with comments.  This is necessary, for example,
+    # in the case of _POSIX_SOURCE, which is predefined and required
+    # on some systems where configure will not decide to define it.
+    if (defundef == "undef") {
+      print "/*", prefix defundef, macro, "*/"
+      next
+    }
+  }
+}
+{ print }
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+  as_fn_error $? "could not setup config headers machinery" "$LINENO" 5
+fi # test -n "$CONFIG_HEADERS"
+
+
+eval set X "  :F $CONFIG_FILES  :H $CONFIG_HEADERS    :C $CONFIG_COMMANDS"
+shift
+for ac_tag
+do
+  case $ac_tag in
+  :[FHLC]) ac_mode=$ac_tag; continue;;
+  esac
+  case $ac_mode$ac_tag in
+  :[FHL]*:*);;
+  :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;;
+  :[FH]-) ac_tag=-:-;;
+  :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+  esac
+  ac_save_IFS=$IFS
+  IFS=:
+  set x $ac_tag
+  IFS=$ac_save_IFS
+  shift
+  ac_file=$1
+  shift
+
+  case $ac_mode in
+  :L) ac_source=$1;;
+  :[FH])
+    ac_file_inputs=
+    for ac_f
+    do
+      case $ac_f in
+      -) ac_f="$ac_tmp/stdin";;
+      *) # Look for the file first in the build tree, then in the source tree
+	 # (if the path is not absolute).  The absolute path cannot be DOS-style,
+	 # because $ac_f cannot contain `:'.
+	 test -f "$ac_f" ||
+	   case $ac_f in
+	   [\\/$]*) false;;
+	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+	   esac ||
+	   as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;;
+      esac
+      case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+      as_fn_append ac_file_inputs " '$ac_f'"
+    done
+
+    # Let's still pretend it is `configure' which instantiates (i.e., don't
+    # use $as_me), people would be surprised to read:
+    #    /* config.h.  Generated by config.status.  */
+    configure_input='Generated from '`
+	  $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+	`' by configure.'
+    if test x"$ac_file" != x-; then
+      configure_input="$ac_file.  $configure_input"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+    fi
+    # Neutralize special characters interpreted by sed in replacement strings.
+    case $configure_input in #(
+    *\&* | *\|* | *\\* )
+       ac_sed_conf_input=`$as_echo "$configure_input" |
+       sed 's/[\\\\&|]/\\\\&/g'`;; #(
+    *) ac_sed_conf_input=$configure_input;;
+    esac
+
+    case $ac_tag in
+    *:-:* | *:-) cat >"$ac_tmp/stdin" \
+      || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;;
+    esac
+    ;;
+  esac
+
+  ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$ac_file" : 'X\(//\)[^/]' \| \
+	 X"$ac_file" : 'X\(//\)$' \| \
+	 X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  as_dir="$ac_dir"; as_fn_mkdir_p
+  ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+  case $ac_mode in
+  :F)
+  #
+  # CONFIG_FILE
+  #
+
+  case $INSTALL in
+  [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;;
+  *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;;
+  esac
+  ac_MKDIR_P=$MKDIR_P
+  case $MKDIR_P in
+  [\\/$]* | ?:[\\/]* ) ;;
+  */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;;
+  esac
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+ac_sed_dataroot='
+/datarootdir/ {
+  p
+  q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+  ac_datarootdir_hack='
+  s&@datadir@&$datadir&g
+  s&@docdir@&$docdir&g
+  s&@infodir@&$infodir&g
+  s&@localedir@&$localedir&g
+  s&@mandir@&$mandir&g
+  s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+s&@INSTALL@&$ac_INSTALL&;t t
+s&@MKDIR_P@&$ac_MKDIR_P&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \
+  >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+  { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
+  { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' \
+      "$ac_tmp/out"`; test -z "$ac_out"; } &&
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&2;}
+
+  rm -f "$ac_tmp/stdin"
+  case $ac_file in
+  -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";;
+  *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";;
+  esac \
+  || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+ ;;
+  :H)
+  #
+  # CONFIG_HEADER
+  #
+  if test x"$ac_file" != x-; then
+    {
+      $as_echo "/* $configure_input  */" \
+      && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs"
+    } >"$ac_tmp/config.h" \
+      || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+    if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
+$as_echo "$as_me: $ac_file is unchanged" >&6;}
+    else
+      rm -f "$ac_file"
+      mv "$ac_tmp/config.h" "$ac_file" \
+	|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
+    fi
+  else
+    $as_echo "/* $configure_input  */" \
+      && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \
+      || as_fn_error $? "could not create -" "$LINENO" 5
+  fi
+# Compute "$ac_file"'s index in $config_headers.
+_am_arg="$ac_file"
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $_am_arg | $_am_arg:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $_am_arg" >`$as_dirname -- "$_am_arg" ||
+$as_expr X"$_am_arg" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$_am_arg" : 'X\(//\)[^/]' \| \
+	 X"$_am_arg" : 'X\(//\)$' \| \
+	 X"$_am_arg" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$_am_arg" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`/stamp-h$_am_stamp_count
+ ;;
+
+  :C)  { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5
+$as_echo "$as_me: executing $ac_file commands" >&6;}
+ ;;
+  esac
+
+
+  case $ac_file$ac_mode in
+    "depfiles":C) test x"$AMDEP_TRUE" != x"" || {
+  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # are listed without --file.  Let's play safe and only enable the eval
+  # if we detect the quoting.
+  case $CONFIG_FILES in
+  *\'*) eval set x "$CONFIG_FILES" ;;
+  *)   set x $CONFIG_FILES ;;
+  esac
+  shift
+  for mf
+  do
+    # Strip MF so we end up with the name of the file.
+    mf=`echo "$mf" | sed -e 's/:.*$//'`
+    # Check whether this is an Automake generated Makefile or not.
+    # We used to match only the files named `Makefile.in', but
+    # some people rename them; so instead we look at the file content.
+    # Grep'ing the first line is not enough: some people post-process
+    # each Makefile.in and add a new line on top of each file to say so.
+    # Grep'ing the whole file is not good either: AIX grep has a line
+    # limit of 2048, but all sed's we know have understand at least 4000.
+    if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then
+      dirpart=`$as_dirname -- "$mf" ||
+$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$mf" : 'X\(//\)[^/]' \| \
+	 X"$mf" : 'X\(//\)$' \| \
+	 X"$mf" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$mf" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+    else
+      continue
+    fi
+    # Extract the definition of DEPDIR, am__include, and am__quote
+    # from the Makefile without running `make'.
+    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+    test -z "$DEPDIR" && continue
+    am__include=`sed -n 's/^am__include = //p' < "$mf"`
+    test -z "am__include" && continue
+    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+    # When using ansi2knr, U may be empty or an underscore; expand it
+    U=`sed -n 's/^U = //p' < "$mf"`
+    # Find all dependency output files, they are included files with
+    # $(DEPDIR) in their names.  We invoke sed twice because it is the
+    # simplest approach to changing $(DEPDIR) to its actual value in the
+    # expansion.
+    for file in `sed -n "
+      s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+      # Make sure the directory exists.
+      test -f "$dirpart/$file" && continue
+      fdir=`$as_dirname -- "$file" ||
+$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$file" : 'X\(//\)[^/]' \| \
+	 X"$file" : 'X\(//\)$' \| \
+	 X"$file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      as_dir=$dirpart/$fdir; as_fn_mkdir_p
+      # echo "creating $dirpart/$file"
+      echo '# dummy' > "$dirpart/$file"
+    done
+  done
+}
+ ;;
+    "libtool":C)
+
+    # See if we are running on zsh, and set the options which allow our
+    # commands through without removal of \ escapes.
+    if test -n "${ZSH_VERSION+set}" ; then
+      setopt NO_GLOB_SUBST
+    fi
+
+    cfgfile="${ofile}T"
+    trap "$RM \"$cfgfile\"; exit 1" 1 2 15
+    $RM "$cfgfile"
+
+    cat <<_LT_EOF >> "$cfgfile"
+#! $SHELL
+
+# `$ECHO "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $as_me ($PACKAGE$TIMESTAMP) $VERSION
+# Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+# NOTE: Changes made to this file will be lost: look at ltmain.sh.
+#
+#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
+#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+#                 Foundation, Inc.
+#   Written by Gordon Matzigkeit, 1996
+#
+#   This file is part of GNU Libtool.
+#
+# GNU Libtool is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# As a special exception to the GNU General Public License,
+# if you distribute this file as part of a program or library that
+# is built using GNU Libtool, you may include this file under the
+# same distribution terms that you use for the rest of that program.
+#
+# GNU Libtool is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Libtool; see the file COPYING.  If not, a copy
+# can be downloaded from http://www.gnu.org/licenses/gpl.html, or
+# obtained by writing to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+# The names of the tagged configurations supported by this script.
+available_tags="F77 "
+
+# ### BEGIN LIBTOOL CONFIG
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Assembler program.
+AS=$lt_AS
+
+# DLL creation program.
+DLLTOOL=$lt_DLLTOOL
+
+# Object dumper program.
+OBJDUMP=$lt_OBJDUMP
+
+# Which release of libtool.m4 was used?
+macro_version=$macro_version
+macro_revision=$macro_revision
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# What type of objects to build.
+pic_mode=$pic_mode
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# Shell to use when invoking shell scripts.
+SHELL=$lt_SHELL
+
+# An echo program that protects backslashes.
+ECHO=$lt_ECHO
+
+# The PATH separator for the build system.
+PATH_SEPARATOR=$lt_PATH_SEPARATOR
+
+# The host system.
+host_alias=$host_alias
+host=$host
+host_os=$host_os
+
+# The build system.
+build_alias=$build_alias
+build=$build
+build_os=$build_os
+
+# A sed program that does not truncate output.
+SED=$lt_SED
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="\$SED -e 1s/^X//"
+
+# A grep program that handles long lines.
+GREP=$lt_GREP
+
+# An ERE matcher.
+EGREP=$lt_EGREP
+
+# A literal string matcher.
+FGREP=$lt_FGREP
+
+# A BSD- or MS-compatible name lister.
+NM=$lt_NM
+
+# Whether we need soft or hard links.
+LN_S=$lt_LN_S
+
+# What is the maximum length of a command?
+max_cmd_len=$max_cmd_len
+
+# Object file suffix (normally "o").
+objext=$ac_objext
+
+# Executable file suffix (normally "").
+exeext=$exeext
+
+# whether the shell understands "unset".
+lt_unset=$lt_unset
+
+# turn spaces into newlines.
+SP2NL=$lt_lt_SP2NL
+
+# turn newlines into spaces.
+NL2SP=$lt_lt_NL2SP
+
+# convert \$build file names to \$host format.
+to_host_file_cmd=$lt_cv_to_host_file_cmd
+
+# convert \$build files to toolchain format.
+to_tool_file_cmd=$lt_cv_to_tool_file_cmd
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$lt_deplibs_check_method
+
+# Command to use when deplibs_check_method = "file_magic".
+file_magic_cmd=$lt_file_magic_cmd
+
+# How to find potential files when deplibs_check_method = "file_magic".
+file_magic_glob=$lt_file_magic_glob
+
+# Find potential files using nocaseglob when deplibs_check_method = "file_magic".
+want_nocaseglob=$lt_want_nocaseglob
+
+# Command to associate shared and link libraries.
+sharedlib_from_linklib_cmd=$lt_sharedlib_from_linklib_cmd
+
+# The archiver.
+AR=$lt_AR
+
+# Flags to create an archive.
+AR_FLAGS=$lt_AR_FLAGS
+
+# How to feed a file listing to the archiver.
+archiver_list_spec=$lt_archiver_list_spec
+
+# A symbol stripping program.
+STRIP=$lt_STRIP
+
+# Commands used to install an old-style archive.
+RANLIB=$lt_RANLIB
+old_postinstall_cmds=$lt_old_postinstall_cmds
+old_postuninstall_cmds=$lt_old_postuninstall_cmds
+
+# Whether to use a lock for old archive extraction.
+lock_old_archive_extraction=$lock_old_archive_extraction
+
+# A C compiler.
+LTCC=$lt_CC
+
+# LTCC compiler flags.
+LTCFLAGS=$lt_CFLAGS
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$lt_lt_cv_sys_global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration.
+global_symbol_to_cdecl=$lt_lt_cv_sys_global_symbol_to_cdecl
+
+# Transform the output of nm in a C name address pair.
+global_symbol_to_c_name_address=$lt_lt_cv_sys_global_symbol_to_c_name_address
+
+# Transform the output of nm in a C name address pair when lib prefix is needed.
+global_symbol_to_c_name_address_lib_prefix=$lt_lt_cv_sys_global_symbol_to_c_name_address_lib_prefix
+
+# Specify filename containing input files for \$NM.
+nm_file_list_spec=$lt_nm_file_list_spec
+
+# The root where to search for dependent libraries,and in which our libraries should be installed.
+lt_sysroot=$lt_sysroot
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# Used to examine libraries when file_magic_cmd begins with "file".
+MAGIC_CMD=$MAGIC_CMD
+
+# Must we lock files when doing compilation?
+need_locks=$lt_need_locks
+
+# Manifest tool.
+MANIFEST_TOOL=$lt_MANIFEST_TOOL
+
+# Tool to manipulate archived DWARF debug symbol files on Mac OS X.
+DSYMUTIL=$lt_DSYMUTIL
+
+# Tool to change global to local symbols on Mac OS X.
+NMEDIT=$lt_NMEDIT
+
+# Tool to manipulate fat objects and archives on Mac OS X.
+LIPO=$lt_LIPO
+
+# ldd/readelf like tool for Mach-O binaries on Mac OS X.
+OTOOL=$lt_OTOOL
+
+# ldd/readelf like tool for 64 bit Mach-O binaries on Mac OS X 10.4.
+OTOOL64=$lt_OTOOL64
+
+# Old archive suffix (normally "a").
+libext=$libext
+
+# Shared library suffix (normally ".so").
+shrext_cmds=$lt_shrext_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$lt_extract_expsyms_cmds
+
+# Variables whose values should be saved in libtool wrapper scripts and
+# restored at link time.
+variables_saved_for_relink=$lt_variables_saved_for_relink
+
+# Do we need the "lib" prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Library versioning type.
+version_type=$version_type
+
+# Shared library runtime path variable.
+runpath_var=$runpath_var
+
+# Shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# Format of library name prefix.
+libname_spec=$lt_libname_spec
+
+# List of archive names.  First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME
+library_names_spec=$lt_library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$lt_soname_spec
+
+# Permission mode override for installation of shared libraries.
+install_override_mode=$lt_install_override_mode
+
+# Command to use after installation of a shared archive.
+postinstall_cmds=$lt_postinstall_cmds
+
+# Command to use after uninstallation of a shared archive.
+postuninstall_cmds=$lt_postuninstall_cmds
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$lt_finish_cmds
+
+# As "finish_cmds", except a single script fragment to be evaled but
+# not shown.
+finish_eval=$lt_finish_eval
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Compile-time system search path for libraries.
+sys_lib_search_path_spec=$lt_sys_lib_search_path_spec
+
+# Run-time system search path for libraries.
+sys_lib_dlsearch_path_spec=$lt_sys_lib_dlsearch_path_spec
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Commands to strip libraries.
+old_striplib=$lt_old_striplib
+striplib=$lt_striplib
+
+
+# The linker used to build libraries.
+LD=$lt_LD
+
+# How to create reloadable object files.
+reload_flag=$lt_reload_flag
+reload_cmds=$lt_reload_cmds
+
+# Commands used to build an old-style archive.
+old_archive_cmds=$lt_old_archive_cmds
+
+# A language specific compiler.
+CC=$lt_compiler
+
+# Is the compiler the GNU compiler?
+with_gcc=$GCC
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$lt_lt_prog_compiler_no_builtin_flag
+
+# Additional compiler flags for building library objects.
+pic_flag=$lt_lt_prog_compiler_pic
+
+# How to pass a linker flag through the compiler.
+wl=$lt_lt_prog_compiler_wl
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$lt_lt_prog_compiler_static
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$lt_lt_cv_prog_compiler_c_o
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$archive_cmds_need_lc
+
+# Whether or not to disallow shared libs when runtime libs are static.
+allow_libtool_libs_with_static_runtimes=$enable_shared_with_static_runtimes
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$lt_export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$lt_whole_archive_flag_spec
+
+# Whether the compiler copes with passing no objects directly.
+compiler_needs_object=$lt_compiler_needs_object
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$lt_old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$lt_old_archive_from_expsyms_cmds
+
+# Commands used to build a shared archive.
+archive_cmds=$lt_archive_cmds
+archive_expsym_cmds=$lt_archive_expsym_cmds
+
+# Commands used to build a loadable module if different from building
+# a shared archive.
+module_cmds=$lt_module_cmds
+module_expsym_cmds=$lt_module_expsym_cmds
+
+# Whether we are building with GNU ld or not.
+with_gnu_ld=$lt_with_gnu_ld
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$lt_allow_undefined_flag
+
+# Flag that enforces no undefined symbols.
+no_undefined_flag=$lt_no_undefined_flag
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist
+hardcode_libdir_flag_spec=$lt_hardcode_libdir_flag_spec
+
+# Whether we need a single "-rpath" flag with a separated argument.
+hardcode_libdir_separator=$lt_hardcode_libdir_separator
+
+# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
+# DIR into the resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
+# DIR into the resulting binary and the resulting library dependency is
+# "absolute",i.e impossible to change by setting \${shlibpath_var} if the
+# library is relocated.
+hardcode_direct_absolute=$hardcode_direct_absolute
+
+# Set to "yes" if using the -LDIR flag during linking hardcodes DIR
+# into the resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR
+# into the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Set to "yes" if building a shared library automatically hardcodes DIR
+# into the library and all subsequent libraries and executables linked
+# against it.
+hardcode_automatic=$hardcode_automatic
+
+# Set to yes if linker adds runtime paths of dependent libraries
+# to runtime path list.
+inherit_rpath=$inherit_rpath
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Set to "yes" if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$lt_export_symbols_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$lt_exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$lt_include_expsyms
+
+# Commands necessary for linking programs (against libraries) with templates.
+prelink_cmds=$lt_prelink_cmds
+
+# Commands necessary for finishing linking programs.
+postlink_cmds=$lt_postlink_cmds
+
+# Specify filename containing input files.
+file_list_spec=$lt_file_list_spec
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# ### END LIBTOOL CONFIG
+
+_LT_EOF
+
+  case $host_os in
+  aix3*)
+    cat <<\_LT_EOF >> "$cfgfile"
+# AIX sometimes has problems with the GCC collect2 program.  For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+  COLLECT_NAMES=
+  export COLLECT_NAMES
+fi
+_LT_EOF
+    ;;
+  esac
+
+
+ltmain="$ac_aux_dir/ltmain.sh"
+
+
+  # We use sed instead of cat because bash on DJGPP gets confused if
+  # if finds mixed CR/LF and LF-only lines.  Since sed operates in
+  # text mode, it properly converts lines to CR/LF.  This bash problem
+  # is reportedly fixed, but why not run on old versions too?
+  sed '$q' "$ltmain" >> "$cfgfile" \
+     || (rm -f "$cfgfile"; exit 1)
+
+  if test x"$xsi_shell" = xyes; then
+  sed -e '/^func_dirname ()$/,/^} # func_dirname /c\
+func_dirname ()\
+{\
+\    case ${1} in\
+\      */*) func_dirname_result="${1%/*}${2}" ;;\
+\      *  ) func_dirname_result="${3}" ;;\
+\    esac\
+} # Extended-shell func_dirname implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_basename ()$/,/^} # func_basename /c\
+func_basename ()\
+{\
+\    func_basename_result="${1##*/}"\
+} # Extended-shell func_basename implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_dirname_and_basename ()$/,/^} # func_dirname_and_basename /c\
+func_dirname_and_basename ()\
+{\
+\    case ${1} in\
+\      */*) func_dirname_result="${1%/*}${2}" ;;\
+\      *  ) func_dirname_result="${3}" ;;\
+\    esac\
+\    func_basename_result="${1##*/}"\
+} # Extended-shell func_dirname_and_basename implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_stripname ()$/,/^} # func_stripname /c\
+func_stripname ()\
+{\
+\    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are\
+\    # positional parameters, so assign one to ordinary parameter first.\
+\    func_stripname_result=${3}\
+\    func_stripname_result=${func_stripname_result#"${1}"}\
+\    func_stripname_result=${func_stripname_result%"${2}"}\
+} # Extended-shell func_stripname implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_split_long_opt ()$/,/^} # func_split_long_opt /c\
+func_split_long_opt ()\
+{\
+\    func_split_long_opt_name=${1%%=*}\
+\    func_split_long_opt_arg=${1#*=}\
+} # Extended-shell func_split_long_opt implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_split_short_opt ()$/,/^} # func_split_short_opt /c\
+func_split_short_opt ()\
+{\
+\    func_split_short_opt_arg=${1#??}\
+\    func_split_short_opt_name=${1%"$func_split_short_opt_arg"}\
+} # Extended-shell func_split_short_opt implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_lo2o ()$/,/^} # func_lo2o /c\
+func_lo2o ()\
+{\
+\    case ${1} in\
+\      *.lo) func_lo2o_result=${1%.lo}.${objext} ;;\
+\      *)    func_lo2o_result=${1} ;;\
+\    esac\
+} # Extended-shell func_lo2o implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_xform ()$/,/^} # func_xform /c\
+func_xform ()\
+{\
+    func_xform_result=${1%.*}.lo\
+} # Extended-shell func_xform implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_arith ()$/,/^} # func_arith /c\
+func_arith ()\
+{\
+    func_arith_result=$(( $* ))\
+} # Extended-shell func_arith implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_len ()$/,/^} # func_len /c\
+func_len ()\
+{\
+    func_len_result=${#1}\
+} # Extended-shell func_len implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+fi
+
+if test x"$lt_shell_append" = xyes; then
+  sed -e '/^func_append ()$/,/^} # func_append /c\
+func_append ()\
+{\
+    eval "${1}+=\\${2}"\
+} # Extended-shell func_append implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  sed -e '/^func_append_quoted ()$/,/^} # func_append_quoted /c\
+func_append_quoted ()\
+{\
+\    func_quote_for_eval "${2}"\
+\    eval "${1}+=\\\\ \\$func_quote_for_eval_result"\
+} # Extended-shell func_append_quoted implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+
+
+  # Save a `func_append' function call where possible by direct use of '+='
+  sed -e 's%func_append \([a-zA-Z_]\{1,\}\) "%\1+="%g' $cfgfile > $cfgfile.tmp \
+    && mv -f "$cfgfile.tmp" "$cfgfile" \
+      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+  test 0 -eq $? || _lt_function_replace_fail=:
+else
+  # Save a `func_append' function call even when '+=' is not available
+  sed -e 's%func_append \([a-zA-Z_]\{1,\}\) "%\1="$\1%g' $cfgfile > $cfgfile.tmp \
+    && mv -f "$cfgfile.tmp" "$cfgfile" \
+      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+  test 0 -eq $? || _lt_function_replace_fail=:
+fi
+
+if test x"$_lt_function_replace_fail" = x":"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Unable to substitute extended shell functions in $ofile" >&5
+$as_echo "$as_me: WARNING: Unable to substitute extended shell functions in $ofile" >&2;}
+fi
+
+
+   mv -f "$cfgfile" "$ofile" ||
+    (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
+  chmod +x "$ofile"
+
+
+    cat <<_LT_EOF >> "$ofile"
+
+# ### BEGIN LIBTOOL TAG CONFIG: F77
+
+# The linker used to build libraries.
+LD=$lt_LD_F77
+
+# How to create reloadable object files.
+reload_flag=$lt_reload_flag_F77
+reload_cmds=$lt_reload_cmds_F77
+
+# Commands used to build an old-style archive.
+old_archive_cmds=$lt_old_archive_cmds_F77
+
+# A language specific compiler.
+CC=$lt_compiler_F77
+
+# Is the compiler the GNU compiler?
+with_gcc=$GCC_F77
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$lt_lt_prog_compiler_no_builtin_flag_F77
+
+# Additional compiler flags for building library objects.
+pic_flag=$lt_lt_prog_compiler_pic_F77
+
+# How to pass a linker flag through the compiler.
+wl=$lt_lt_prog_compiler_wl_F77
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$lt_lt_prog_compiler_static_F77
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$lt_lt_cv_prog_compiler_c_o_F77
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$archive_cmds_need_lc_F77
+
+# Whether or not to disallow shared libs when runtime libs are static.
+allow_libtool_libs_with_static_runtimes=$enable_shared_with_static_runtimes_F77
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$lt_export_dynamic_flag_spec_F77
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$lt_whole_archive_flag_spec_F77
+
+# Whether the compiler copes with passing no objects directly.
+compiler_needs_object=$lt_compiler_needs_object_F77
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$lt_old_archive_from_new_cmds_F77
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$lt_old_archive_from_expsyms_cmds_F77
+
+# Commands used to build a shared archive.
+archive_cmds=$lt_archive_cmds_F77
+archive_expsym_cmds=$lt_archive_expsym_cmds_F77
+
+# Commands used to build a loadable module if different from building
+# a shared archive.
+module_cmds=$lt_module_cmds_F77
+module_expsym_cmds=$lt_module_expsym_cmds_F77
+
+# Whether we are building with GNU ld or not.
+with_gnu_ld=$lt_with_gnu_ld_F77
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$lt_allow_undefined_flag_F77
+
+# Flag that enforces no undefined symbols.
+no_undefined_flag=$lt_no_undefined_flag_F77
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist
+hardcode_libdir_flag_spec=$lt_hardcode_libdir_flag_spec_F77
+
+# Whether we need a single "-rpath" flag with a separated argument.
+hardcode_libdir_separator=$lt_hardcode_libdir_separator_F77
+
+# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
+# DIR into the resulting binary.
+hardcode_direct=$hardcode_direct_F77
+
+# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
+# DIR into the resulting binary and the resulting library dependency is
+# "absolute",i.e impossible to change by setting \${shlibpath_var} if the
+# library is relocated.
+hardcode_direct_absolute=$hardcode_direct_absolute_F77
+
+# Set to "yes" if using the -LDIR flag during linking hardcodes DIR
+# into the resulting binary.
+hardcode_minus_L=$hardcode_minus_L_F77
+
+# Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR
+# into the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var_F77
+
+# Set to "yes" if building a shared library automatically hardcodes DIR
+# into the library and all subsequent libraries and executables linked
+# against it.
+hardcode_automatic=$hardcode_automatic_F77
+
+# Set to yes if linker adds runtime paths of dependent libraries
+# to runtime path list.
+inherit_rpath=$inherit_rpath_F77
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs_F77
+
+# Set to "yes" if exported symbols are required.
+always_export_symbols=$always_export_symbols_F77
+
+# The commands to list exported symbols.
+export_symbols_cmds=$lt_export_symbols_cmds_F77
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$lt_exclude_expsyms_F77
+
+# Symbols that must always be exported.
+include_expsyms=$lt_include_expsyms_F77
+
+# Commands necessary for linking programs (against libraries) with templates.
+prelink_cmds=$lt_prelink_cmds_F77
+
+# Commands necessary for finishing linking programs.
+postlink_cmds=$lt_postlink_cmds_F77
+
+# Specify filename containing input files.
+file_list_spec=$lt_file_list_spec_F77
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action_F77
+
+# ### END LIBTOOL TAG CONFIG: F77
+_LT_EOF
+
+ ;;
+
+  esac
+done # for ac_tag
+
+
+as_fn_exit 0
+_ACEOF
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+  as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded.  So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status.  When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+  ac_cs_success=:
+  ac_config_status_args=
+  test "$silent" = yes &&
+    ac_config_status_args="$ac_config_status_args --quiet"
+  exec 5>/dev/null
+  $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+  exec 5>>config.log
+  # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+  # would make configure fail if this is the last instruction.
+  $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/configure.ac
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/configure.ac	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,598 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT(fftw, 3.3.3, fftw@fftw.org)
+AC_CONFIG_SRCDIR(kernel/ifftw.h)
+# fftw-3.1.x was 4:X:1
+# fftw-3.2.x was 5:X:2
+# fftw-3.3.x was 6:X:3
+SHARED_VERSION_INFO="6:2:3" # CURRENT:REVISION:AGE
+
+AM_INIT_AUTOMAKE(1.7)
+AM_CONFIG_HEADER(config.h)
+AC_CONFIG_MACRO_DIR([m4])
+AM_MAINTAINER_MODE
+AC_SUBST(SHARED_VERSION_INFO)
+AC_DISABLE_SHARED dnl to hell with shared libraries
+AC_CANONICAL_HOST
+
+dnl configure options
+case "${host_cpu}" in
+  powerpc*) have_fma=yes;;
+  ia64*) have_fma=yes;;
+  hppa*) have_fma=yes;;
+  mips64*) have_fma=yes;;
+  *) have_fma=no;;
+esac
+
+AC_ARG_ENABLE(fma, [AC_HELP_STRING([--enable-fma],[enable optimizations for machines with fused multiply-add])], have_fma=$enableval)
+if test "$have_fma"x = "yes"x; then
+	AC_DEFINE(HAVE_FMA,1,[Define if you have a machine with fused multiply-add])
+fi
+
+
+AC_ARG_ENABLE(debug, [AC_HELP_STRING([--enable-debug],[compile fftw with extra runtime checks for debugging])], ok=$enableval, ok=no)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_DEBUG,1,[Define to enable extra FFTW debugging code.])
+	debug_malloc=yes
+else
+	debug_malloc=no
+fi
+
+AC_ARG_ENABLE(debug-malloc, [AC_HELP_STRING([--enable-debug-malloc],[enable malloc debugging version])], ok=$enableval, ok=$debug_malloc)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_DEBUG_MALLOC,1,[Define to enable debugging malloc.])
+fi
+
+AC_ARG_ENABLE(debug-alignment, [AC_HELP_STRING([--enable-debug-alignment],[enable alignment debugging hacks])], ok=$enableval, ok=no)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_DEBUG_ALIGNMENT,1,[Define to enable alignment debugging hacks.])
+fi
+
+AC_ARG_ENABLE(random-estimator, [AC_HELP_STRING([--enable-random-estimator],[enable pseudorandom estimator (debugging hack)])], ok=$enableval, ok=no)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_RANDOM_ESTIMATOR,1,[Define to enable pseudorandom estimate planning for debugging.])
+	CHECK_PL_OPTS="--estimate"
+fi
+
+AC_ARG_ENABLE(alloca, [AC_HELP_STRING([--disable-alloca],[disable use of the alloca() function (may be broken on mingw64)])], ok=$enableval, ok=yes)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_ENABLE_ALLOCA,1,[Define to enable the use of alloca().])
+fi
+
+AC_ARG_ENABLE(single, [AC_HELP_STRING([--enable-single],[compile fftw in single precision])], ok=$enableval, ok=no)
+AC_ARG_ENABLE(float, [AC_HELP_STRING([--enable-float],[synonym for --enable-single])], ok=$enableval)
+if test "$ok" = "yes"; then
+	AC_DEFINE(FFTW_SINGLE,1,[Define to compile in single precision.])
+	AC_DEFINE(BENCHFFT_SINGLE,1,[Define to compile in single precision.])
+	PRECISION=s
+else
+	PRECISION=d
+fi
+AM_CONDITIONAL(SINGLE, test "$ok" = "yes")
+
+AC_ARG_ENABLE(long-double, [AC_HELP_STRING([--enable-long-double],[compile fftw in long-double precision])], ok=$enableval, ok=no)
+if test "$ok" = "yes"; then
+	if test "$PRECISION" = "s"; then
+		AC_MSG_ERROR([--enable-single/--enable-long-double conflict])
+	fi
+	AC_DEFINE(FFTW_LDOUBLE,1,[Define to compile in long-double precision.])
+	AC_DEFINE(BENCHFFT_LDOUBLE,1,[Define to compile in long-double precision.])
+	PRECISION=l
+fi
+AM_CONDITIONAL(LDOUBLE, test "$ok" = "yes")
+
+AC_ARG_ENABLE(quad-precision, [AC_HELP_STRING([--enable-quad-precision],[compile fftw in quadruple precision if available])], ok=$enableval, ok=no)
+if test "$ok" = "yes"; then
+	if test "$PRECISION" != "d"; then
+		AC_MSG_ERROR([conflicting precisions specified])
+	fi
+	AC_DEFINE(FFTW_QUAD,1,[Define to compile in quad precision.])
+	AC_DEFINE(BENCHFFT_QUAD,1,[Define to compile in quad precision.])
+	PRECISION=q
+fi
+AM_CONDITIONAL(QUAD, test "$ok" = "yes")
+
+AC_SUBST(PRECISION)
+AC_SUBST(CHECK_PL_OPTS)
+
+AC_ARG_ENABLE(sse, [AC_HELP_STRING([--enable-sse],[enable SSE optimizations])], have_sse=$enableval, have_sse=no)
+if test "$have_sse" = "yes"; then
+	if test "$PRECISION" != "s"; then
+		AC_MSG_ERROR([SSE requires single precision])
+	fi
+fi
+
+AC_ARG_ENABLE(sse2, [AC_HELP_STRING([--enable-sse2],[enable SSE/SSE2 optimizations])], have_sse2=$enableval, have_sse2=no)
+if test "$have_sse" = "yes"; then have_sse2=yes; fi
+if test "$have_sse2" = "yes"; then
+	AC_DEFINE(HAVE_SSE2,1,[Define to enable SSE/SSE2 optimizations.])
+	if test "$PRECISION" != "d" -a "$PRECISION" != "s"; then
+		AC_MSG_ERROR([SSE2 requires single or double precision])
+	fi
+fi
+AM_CONDITIONAL(HAVE_SSE2, test "$have_sse2" = "yes")
+
+AC_ARG_ENABLE(avx, [AC_HELP_STRING([--enable-avx],[enable AVX optimizations])], have_avx=$enableval, have_avx=no)
+if test "$have_avx" = "yes"; then
+        AC_DEFINE(HAVE_AVX,1,[Define to enable AVX optimizations.])
+	if test "$PRECISION" != "d" -a "$PRECISION" != "s"; then
+		AC_MSG_ERROR([AVX requires single or double precision])
+	fi
+fi
+AM_CONDITIONAL(HAVE_AVX, test "$have_avx" = "yes")
+
+AC_ARG_ENABLE(altivec, [AC_HELP_STRING([--enable-altivec],[enable Altivec optimizations])], have_altivec=$enableval, have_altivec=no)
+if test "$have_altivec" = "yes"; then
+	AC_DEFINE(HAVE_ALTIVEC,1,[Define to enable Altivec optimizations.])
+	if test "$PRECISION" != "s"; then
+		AC_MSG_ERROR([Altivec requires single precision])
+	fi
+fi
+AM_CONDITIONAL(HAVE_ALTIVEC, test "$have_altivec" = "yes")
+
+AC_ARG_ENABLE(neon, [AC_HELP_STRING([--enable-neon],[enable ARM NEON optimizations])], have_neon=$enableval, have_neon=no)
+if test "$have_neon" = "yes"; then
+	AC_DEFINE(HAVE_NEON,1,[Define to enable ARM NEON optimizations.])
+	if test "$PRECISION" != "s"; then
+		AC_MSG_ERROR([NEON requires single precision])
+	fi
+fi
+AM_CONDITIONAL(HAVE_NEON, test "$have_neon" = "yes")
+
+dnl FIXME:
+dnl AC_ARG_ENABLE(mips-ps, [AC_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no)
+dnl if test "$have_mips_ps" = "yes"; then
+dnl 	AC_DEFINE(HAVE_MIPS_PS,1,[Define to enable MIPS paired-single optimizations.])
+dnl 	if test "$PRECISION" != "s"; then
+dnl 		AC_MSG_ERROR([MIPS paired-single requires single precision])
+dnl 	fi
+dnl fi
+dnl AM_CONDITIONAL(HAVE_MIPS_PS, test "$have_mips_ps" = "yes")
+
+AC_ARG_WITH(slow-timer, [AC_HELP_STRING([--with-slow-timer],[use low-precision timers (SLOW)])], with_slow_timer=$withval, with_slow_timer=no)
+if test "$with_slow_timer" = "yes"; then
+	AC_DEFINE(WITH_SLOW_TIMER,1,[Use low-precision timers, making planner very slow])
+fi
+
+AC_ARG_ENABLE(mips_zbus_timer, [AC_HELP_STRING([--enable-mips-zbus-timer],[use MIPS ZBus cycle-counter])], have_mips_zbus_timer=$enableval, have_mips_zbus_timer=no)
+if test "$have_mips_zbus_timer" = "yes"; then
+	AC_DEFINE(HAVE_MIPS_ZBUS_TIMER,1,[Define to enable use of MIPS ZBus cycle-counter.])
+fi
+
+AC_ARG_WITH(our-malloc, [AC_HELP_STRING([--with-our-malloc],[use our aligned malloc (helpful for Win32)])], with_our_malloc=$withval, with_our_malloc=no)
+AC_ARG_WITH(our-malloc16, [AC_HELP_STRING([--with-our-malloc16],[Obsolete alias for --with-our-malloc16])], with_our_malloc=$withval)
+if test "$with_our_malloc" = "yes"; then
+	AC_DEFINE(WITH_OUR_MALLOC,1,[Use our own aligned malloc routine; mainly helpful for Windows systems lacking aligned allocation system-library routines.])
+fi
+
+AC_ARG_WITH(windows-f77-mangling, [AC_HELP_STRING([--with-windows-f77-mangling],[use common Win32 Fortran interface styles])], with_windows_f77_mangling=$withval, with_windows_f77_mangling=no)
+if test "$with_windows_f77_mangling" = "yes"; then
+	AC_DEFINE(WINDOWS_F77_MANGLING,1,[Use common Windows Fortran mangling styles for the Fortran interfaces.])
+fi
+
+AC_ARG_WITH(incoming-stack-boundary, [AC_HELP_STRING([--with-incoming-stack-boundary=X],[Assume that stack is aligned to (1<<X) bytes])], with_incoming_stack_boundary=$withval, with_incoming_stack_boundary=no)
+
+dnl compute library suffix
+case "$PRECISION" in
+     s) PREC_SUFFIX=f;;
+     d) PREC_SUFFIX=;;
+     l) PREC_SUFFIX=l;;
+     q) PREC_SUFFIX=q;;
+esac
+AC_SUBST(PREC_SUFFIX)
+
+dnl Checks for programs.
+AC_PROG_CC
+AM_PROG_CC_C_O
+AX_COMPILER_VENDOR
+AC_PROG_CC_STDC
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+AC_LIBTOOL_WIN32_DLL
+AC_PROG_LIBTOOL
+
+AC_CHECK_PROG(OCAMLBUILD, ocamlbuild, ocamlbuild)
+
+dnl -----------------------------------------------------------------------
+
+AC_ARG_ENABLE(mpi, [AC_HELP_STRING([--enable-mpi],[compile FFTW MPI library])], enable_mpi=$enableval, enable_mpi=no)
+
+if test "$enable_mpi" = "yes"; then
+   if test $PRECISION = q; then
+      AC_MSG_ERROR([quad precision is not supported in MPI])
+   fi
+   ACX_MPI([],[AC_MSG_ERROR([could not find mpi library for --enable-mpi])])
+   AC_CHECK_PROG(MPIRUN, mpirun, mpirun)
+   AC_SUBST(MPIRUN)
+
+   save_CC=$CC
+   CC=$MPICC
+   AC_CHECK_SIZEOF(MPI_Fint, [], [#include <mpi.h>])
+   CC=$save_CC
+   if test 0 = $ac_cv_sizeof_MPI_Fint; then
+      AC_MSG_WARN([sizeof(MPI_Fint) test failed]);
+      dnl As a backup, assume Fortran integer == C int
+      AC_CHECK_SIZEOF(int) 
+      if test 0 = $ac_cv_sizeof_int; then AC_MSG_ERROR([sizeof(int) test failed]); fi
+      ac_cv_sizeof_MPI_Fint=$ac_cv_sizeof_int
+   fi
+   C_MPI_FINT=C_INT`expr $ac_cv_sizeof_MPI_Fint \* 8`_T
+   AC_SUBST(C_MPI_FINT)
+fi
+AM_CONDITIONAL(MPI, test "$enable_mpi" = "yes")
+
+dnl -----------------------------------------------------------------------
+
+dnl determine CFLAGS first
+AX_CC_MAXOPT
+
+case "${ax_cv_c_compiler_vendor}" in
+   intel) # Stop icc from defining __GNUC__, except on MacOS where this fails
+        case "${host_os}" in
+            *darwin*) ;; # icc -no-gcc fails to compile some system headers
+            *) 
+	       AX_CHECK_COMPILER_FLAGS([-no-gcc], [CC="$CC -no-gcc"])
+               ;;
+        esac
+        ;;
+
+   hp) # must (sometimes) manually increase cpp limits to handle fftw3.h
+        AX_CHECK_COMPILER_FLAGS([-Wp,-H128000],
+        		        [CC="$CC -Wp,-H128000"])
+        ;;
+
+   portland) # -Masmkeyword required for asm("") cycle counters
+	AX_CHECK_COMPILER_FLAGS([-Masmkeyword],
+                                [CC="$CC -Masmkeyword"])
+        ;;
+esac
+
+dnl Determine SIMD CFLAGS at least for gcc and icc
+case "${ax_cv_c_compiler_vendor}" in
+    gnu|intel)
+	# SSE/SSE2
+	if test "$have_sse2" = "yes" -a "x$SSE2_CFLAGS" = x; then
+	    if test "$PRECISION" = d; then flag=msse2; else flag=msse; fi
+	    AX_CHECK_COMPILER_FLAGS(-$flag, [SSE2_CFLAGS="-$flag"],
+		[AC_MSG_ERROR([Need a version of gcc with -$flag])])
+	fi
+
+	# AVX
+	if test "$have_avx" = "yes" -a "x$AVX_CFLAGS" = x; then
+	    AX_CHECK_COMPILER_FLAGS(-mavx, [AVX_CFLAGS="-mavx"],
+		[AC_MSG_ERROR([Need a version of gcc with -mavx])])
+	fi
+
+	if test "$have_altivec" = "yes" -a "x$ALTIVEC_CFLAGS" = x; then
+	    # -DFAKE__VEC__ is a workaround because gcc-3.3 does not
+	    # #define __VEC__ with -maltivec.
+	    AX_CHECK_COMPILER_FLAGS(-faltivec, [ALTIVEC_CFLAGS="-faltivec"],
+		[AX_CHECK_COMPILER_FLAGS(-maltivec -mabi=altivec,
+		    [ALTIVEC_CFLAGS="-maltivec -mabi=altivec -DFAKE__VEC__"],
+		    [AX_CHECK_COMPILER_FLAGS(-fvec, [ALTIVEC_CFLAGS="-fvec"],
+			[AC_MSG_ERROR([Need a version of gcc with -maltivec])])])])
+	fi
+
+	if test "$have_neon" = "yes" -a "x$NEON_CFLAGS" = x; then
+	    AX_CHECK_COMPILER_FLAGS(-mfpu=neon, [NEON_CFLAGS="-mfpu=neon"],
+		[AC_MSG_ERROR([Need a version of gcc with -mfpu=neon])])
+	fi
+
+	dnl FIXME:
+	dnl elif test "$have_mips_ps" = "yes"; then
+	dnl     # Just punt here and use only new 4.2 compiler :(
+	dnl 	# Should add section for older compilers...
+	dnl 	AX_CHECK_COMPILER_FLAGS(-mpaired-single,
+	dnl 	    [SIMD_CFLAGS="-mpaired-single"],
+	dnl 	    #[AC_MSG_ERROR([Need a version of gcc with -mpaired-single])])
+	dnl 	    [AX_CHECK_COMPILER_FLAGS(-march=mips64,
+	dnl 	      [SIMD_CFLAGS="-march=mips64"],
+	dnl 	        [AC_MSG_ERROR(
+	dnl 		 [Need a version of gcc with -mpaired-single or -march=mips64])
+	dnl 		])])
+	dnl fi
+	;;
+esac
+
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(AVX_CFLAGS)
+AC_SUBST(ALTIVEC_CFLAGS)
+AC_SUBST(NEON_CFLAGS)
+
+dnl add stack alignment CFLAGS if so requested
+if test "$with_incoming_stack_boundary"x != "no"x; then
+   case "${ax_cv_c_compiler_vendor}" in
+      gnu)
+        tentative_flags="-mincoming-stack-boundary=$with_incoming_stack_boundary";
+        AX_CHECK_COMPILER_FLAGS($tentative_flags, 
+	          [STACK_ALIGN_CFLAGS=$tentative_flags])
+      ;;
+   esac
+fi
+AC_SUBST(STACK_ALIGN_CFLAGS)
+
+dnl Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([libintl.h malloc.h stddef.h stdlib.h string.h strings.h sys/time.h unistd.h limits.h c_asm.h intrinsics.h stdint.h mach/mach_time.h sys/sysctl.h])
+dnl c_asm.h: Header file for enabling asm() on Digital Unix  
+dnl intrinsics.h: cray unicos
+dnl sys/sysctl.h: MacOS X altivec detection
+
+dnl altivec.h requires $ALTIVEC_CFLAGS
+save_CFLAGS="$CFLAGS"
+save_CPPFLAGS="$CPPFLAGS"
+CFLAGS="$CFLAGS $ALTIVEC_CFLAGS"
+CPPFLAGS="$CPPFLAGS $ALTIVEC_CFLAGS"
+AC_CHECK_HEADERS([altivec.h])
+CFLAGS="$save_CFLAGS"
+CPPFLAGS="$save_CPPFLAGS"
+
+
+dnl Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_HEADER_TIME
+AC_CHECK_TYPE([long double],
+              [AC_DEFINE(HAVE_LONG_DOUBLE, 1, [Define to 1 if the compiler supports `long double'])],
+[
+if test $PRECISION = l; then
+    AC_MSG_ERROR([long double is not a supported type with your compiler.])
+fi
+])
+AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,
+[
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+])
+
+AC_CHECK_SIZEOF(int)
+AC_CHECK_SIZEOF(unsigned int)
+AC_CHECK_SIZEOF(long)
+AC_CHECK_SIZEOF(unsigned long)
+AC_CHECK_SIZEOF(long long)
+AC_CHECK_SIZEOF(unsigned long long)
+AC_CHECK_SIZEOF(size_t)
+AC_CHECK_SIZEOF(ptrdiff_t)
+
+AC_CHECK_TYPES(uintptr_t, [], [AC_CHECK_SIZEOF(void *)], [$ac_includes_default
+#ifdef HAVE_STDINT_H
+#  include <stdint.h>
+#endif])
+
+AC_CHECK_SIZEOF(float)
+AC_CHECK_SIZEOF(double)
+
+dnl Check sizeof fftw_r2r_kind for Fortran interface [it has == sizeof(int)
+dnl for years, but being paranoid].  Note: the definition here must match
+dnl the one in api/fftw3.h!
+AC_CHECK_SIZEOF(fftw_r2r_kind, [], [typedef enum {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+} fftw_r2r_kind;])
+if test 0 = $ac_cv_sizeof_fftw_r2r_kind; then AC_MSG_ERROR([sizeof(fftw_r2r_kind) test failed]); fi
+C_FFTW_R2R_KIND=C_INT`expr $ac_cv_sizeof_fftw_r2r_kind \* 8`_T
+AC_SUBST(C_FFTW_R2R_KIND)
+
+dnl Checks for library functions.
+AC_FUNC_ALLOCA
+AC_FUNC_STRTOD
+AC_FUNC_VPRINTF
+AC_CHECK_LIB(m, sin)
+
+if test $PRECISION = q; then
+   AX_GCC_VERSION(4,6,0,[],[AC_MSG_ERROR([gcc 4.6 or later required for quad precision support])])
+   AC_CHECK_LIB(quadmath, sinq, [], [AC_MSG_ERROR([quad precision requires libquadmath for quad-precision trigonometric routines])])
+   LIBQUADMATH=-lquadmath
+fi
+AC_SUBST(LIBQUADMATH)
+
+AC_CHECK_FUNCS([BSDgettimeofday gettimeofday gethrtime read_real_time time_base_to_time drand48 sqrt memset posix_memalign memalign _mm_malloc _mm_free clock_gettime mach_absolute_time sysctl abort sinl cosl snprintf])
+AC_CHECK_DECLS([drand48, srand48, memalign, posix_memalign, sinl, cosl, sinq, cosq])
+
+dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+AC_MSG_CHECKING([for _rtc intrinsic])
+rtc_ok=yes
+AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+AC_MSG_RESULT($rtc_ok)
+
+if test "$PRECISION" = "l"; then
+	AC_CHECK_FUNCS([cosl sinl tanl], [], [AC_MSG_ERROR([long-double precision requires long-double trigonometric routines])])
+fi
+
+AC_MSG_CHECKING([for isnan])
+AC_TRY_LINK([#include <math.h>
+], if (!isnan(3.14159)) isnan(2.7183);, ok=yes, ok=no)
+if test "$ok" = "yes"; then
+	AC_DEFINE(HAVE_ISNAN,1,[Define if the isnan() function/macro is available.])
+fi
+AC_MSG_RESULT(${ok})
+
+dnl TODO
+AX_GCC_ALIGNS_STACK()
+
+dnl override CFLAGS selection when debugging
+if test "${enable_debug}" = "yes"; then
+	CFLAGS="-g"
+fi
+
+dnl add gcc warnings, in debug/maintainer mode only
+if test "$enable_debug" = yes || test "$USE_MAINTAINER_MODE" = yes; then
+if test "$ac_test_CFLAGS" != "set"; then
+	if test $ac_cv_prog_gcc = yes; then
+		CFLAGS="$CFLAGS -Wall -W -Wcast-qual -Wpointer-arith -Wcast-align -pedantic -Wno-long-long -Wshadow -Wbad-function-cast -Wwrite-strings -Wstrict-prototypes -Wredundant-decls -Wnested-externs" # -Wundef -Wconversion -Wmissing-prototypes -Wmissing-declarations 
+	fi
+fi
+fi
+
+dnl -----------------------------------------------------------------------
+
+AC_ARG_ENABLE(fortran, [AC_HELP_STRING([--disable-fortran],[don't include Fortran-callable wrappers])], enable_fortran=$enableval, enable_fortran=yes)
+
+if test "$enable_fortran" = "yes"; then
+        AC_PROG_F77
+        if test -z "$F77"; then
+                enable_fortran=no
+                AC_MSG_WARN([*** Couldn't find f77 compiler; using default Fortran wrappers.])
+	else
+		AC_F77_DUMMY_MAIN([], [enable_fortran=no
+			AC_MSG_WARN([*** Couldn't figure out how to link C and Fortran; using default Fortran wrappers.])])
+        fi
+else
+	AC_DEFINE([DISABLE_FORTRAN], 1, [Define to disable Fortran wrappers.])
+fi
+
+if test "x$enable_fortran" = xyes; then
+        AC_F77_WRAPPERS
+	AC_F77_FUNC(f77foo)
+	AC_F77_FUNC(f77_foo)
+	f77_foo2=`echo $f77foo | sed 's/77/77_/'`
+	if test "$f77_foo" = "$f77_foo2"; then
+		AC_DEFINE(F77_FUNC_EQUIV, 1, [Define if F77_FUNC and F77_FUNC_ are equivalent.])
+
+		# Include g77 wrappers by default for GNU systems or gfortran
+		with_g77_wrappers=$ac_cv_f77_compiler_gnu
+		case $host_os in *gnu*) with_g77_wrappers=yes ;; esac
+	fi
+else
+	with_g77_wrappers=no
+fi
+
+AC_ARG_WITH(g77-wrappers, [AC_HELP_STRING([--with-g77-wrappers],[force inclusion of g77-compatible wrappers in addition to any other Fortran compiler that is detected])], with_g77_wrappers=$withval)
+if test "x$with_g77_wrappers" = "xyes"; then
+	AC_DEFINE(WITH_G77_WRAPPERS,1,[Include g77-compatible wrappers in addition to any other Fortran wrappers.])
+fi
+
+dnl -----------------------------------------------------------------------
+have_smp="no"
+AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no)
+
+if test "$enable_openmp" = "yes"; then
+   AC_DEFINE(HAVE_OPENMP,1,[Define to enable OpenMP])
+   AX_OPENMP([], [AC_MSG_ERROR([don't know how to enable OpenMP])])
+fi
+
+AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
+
+if test "$enable_threads" = "yes"; then
+   AC_DEFINE(HAVE_THREADS,1,[Define to enable SMP threads])
+fi
+
+AC_ARG_WITH(combined-threads, [AC_HELP_STRING([--with-combined-threads],[combine threads into main libfftw3])], with_combined_threads=$withval, with_combined_threads=no)
+
+if test "$with_combined_threads" = yes; then
+   if test "$enable_openmp" = "yes"; then
+      AC_MSG_ERROR([--with-combined-threads incompatible with --enable-openmp])
+   fi
+   if test "$enable_threads" != "yes"; then
+      AC_MSG_ERROR([--with-combined-threads requires --enable-threads])
+   fi
+fi
+
+dnl Check for threads library...
+THREADLIBS=""
+if test "$enable_threads" = "yes"; then
+	# POSIX threads, the default choice:
+	if test -z "$THREADLIBS"; then
+		ACX_PTHREAD([THREADLIBS="$PTHREAD_LIBS "
+	                     CC="$PTHREAD_CC"
+	                     AC_DEFINE(USING_POSIX_THREADS, 1, [Define if we have and are using POSIX threads.])])
+	fi
+
+	if test -z "$THREADLIBS"; then
+		AC_MSG_CHECKING([for Win32 threads])
+		AC_TRY_LINK([#include <windows.h>],
+			[_beginthreadex(0,0,0,0,0,0);],
+			[THREADLIBS=" "; AC_MSG_RESULT(yes)],
+			[AC_MSG_RESULT(no)])
+	fi
+
+	if test -z "$THREADLIBS"; then
+		AC_MSG_ERROR([couldn't find threads library for --enable-threads])
+	fi
+	AC_DEFINE(HAVE_THREADS, 1, [Define if we have a threads library.])
+fi
+AC_SUBST(THREADLIBS)
+AM_CONDITIONAL(THREADS, test "$enable_threads" = "yes")
+AM_CONDITIONAL(OPENMP, test "$enable_openmp" = "yes")
+AM_CONDITIONAL(SMP, test "$enable_threads" = "yes" -o "$enable_openmp" = "yes")
+AM_CONDITIONAL(COMBINED_THREADS, test x"$with_combined_threads" = xyes)
+
+dnl -----------------------------------------------------------------------
+
+AC_MSG_CHECKING([whether a cycle counter is available])
+save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS -I$srcdir/kernel"
+AC_TRY_CPP([#include "cycle.h"
+#ifndef HAVE_TICK_COUNTER
+#  error No cycle counter
+#endif], [ok=yes], [ok=no])
+CPPFLAGS=$save_CPPFLAGS
+AC_MSG_RESULT($ok)
+if test $ok = no && test "x$with_slow_timer" = xno; then
+	echo "***************************************************************"
+	echo "WARNING: No cycle counter found.  FFTW will use ESTIMATE mode  "
+	echo "         for all plans.  See the manual for more information."
+	echo "***************************************************************"
+fi
+
+dnl -----------------------------------------------------------------------
+
+AC_DEFINE_UNQUOTED(FFTW_CC, "$CC $CFLAGS", [C compiler name and flags])
+
+AC_CONFIG_FILES([
+   Makefile
+   support/Makefile
+   genfft/Makefile
+   kernel/Makefile
+   simd-support/Makefile
+
+   dft/Makefile
+   dft/scalar/Makefile
+   dft/scalar/codelets/Makefile
+   dft/simd/Makefile
+   dft/simd/common/Makefile
+   dft/simd/sse2/Makefile
+   dft/simd/avx/Makefile
+   dft/simd/altivec/Makefile
+   dft/simd/neon/Makefile
+
+   rdft/Makefile
+   rdft/scalar/Makefile
+   rdft/scalar/r2cf/Makefile
+   rdft/scalar/r2cb/Makefile
+   rdft/scalar/r2r/Makefile
+   rdft/simd/Makefile
+   rdft/simd/common/Makefile
+   rdft/simd/sse2/Makefile
+   rdft/simd/avx/Makefile
+   rdft/simd/altivec/Makefile
+   rdft/simd/neon/Makefile
+
+   reodft/Makefile
+
+   threads/Makefile
+
+   api/Makefile
+
+   mpi/Makefile
+
+   libbench2/Makefile
+   tests/Makefile
+   doc/Makefile
+   doc/FAQ/Makefile
+
+   tools/Makefile
+   tools/fftw_wisdom.1
+   tools/fftw-wisdom-to-conf
+
+   m4/Makefile
+
+   fftw.pc
+])
+
+AC_OUTPUT
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/depcomp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/depcomp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,708 @@
+#! /bin/sh
+# depcomp - compile a program generating dependencies as side-effects
+
+scriptversion=2012-03-27.16; # UTC
+
+# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009, 2010,
+# 2011, 2012 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
+
+case $1 in
+  '')
+     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: depcomp [--help] [--version] PROGRAM [ARGS]
+
+Run PROGRAMS ARGS to compile a file, generating dependencies
+as side-effects.
+
+Environment variables:
+  depmode     Dependency tracking mode.
+  source      Source file read by 'PROGRAMS ARGS'.
+  object      Object file output by 'PROGRAMS ARGS'.
+  DEPDIR      directory where to store dependencies.
+  depfile     Dependency file to output.
+  tmpdepfile  Temporary file to use when outputting dependencies.
+  libtool     Whether libtool is used (yes/no).
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "depcomp $scriptversion"
+    exit $?
+    ;;
+esac
+
+# A tabulation character.
+tab='	'
+# A newline character.
+nl='
+'
+
+if test -z "$depmode" || test -z "$source" || test -z "$object"; then
+  echo "depcomp: Variables source, object and depmode must be set" 1>&2
+  exit 1
+fi
+
+# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
+depfile=${depfile-`echo "$object" |
+  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
+tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
+
+rm -f "$tmpdepfile"
+
+# Some modes work just like other modes, but use different flags.  We
+# parameterize here, but still list the modes in the big case below,
+# to make depend.m4 easier to write.  Note that we *cannot* use a case
+# here, because this file can only contain one case statement.
+if test "$depmode" = hp; then
+  # HP compiler uses -M and no extra arg.
+  gccflag=-M
+  depmode=gcc
+fi
+
+if test "$depmode" = dashXmstdout; then
+   # This is just like dashmstdout with a different argument.
+   dashmflag=-xM
+   depmode=dashmstdout
+fi
+
+cygpath_u="cygpath -u -f -"
+if test "$depmode" = msvcmsys; then
+   # This is just like msvisualcpp but w/o cygpath translation.
+   # Just convert the backslash-escaped backslashes to single forward
+   # slashes to satisfy depend.m4
+   cygpath_u='sed s,\\\\,/,g'
+   depmode=msvisualcpp
+fi
+
+if test "$depmode" = msvc7msys; then
+   # This is just like msvc7 but w/o cygpath translation.
+   # Just convert the backslash-escaped backslashes to single forward
+   # slashes to satisfy depend.m4
+   cygpath_u='sed s,\\\\,/,g'
+   depmode=msvc7
+fi
+
+if test "$depmode" = xlc; then
+   # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency informations.
+   gccflag=-qmakedep=gcc,-MF
+   depmode=gcc
+fi
+
+case "$depmode" in
+gcc3)
+## gcc 3 implements dependency tracking that does exactly what
+## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
+## it if -MD -MP comes after the -MF stuff.  Hmm.
+## Unfortunately, FreeBSD c89 acceptance of flags depends upon
+## the command line argument order; so add the flags where they
+## appear in depend2.am.  Note that the slowdown incurred here
+## affects only configure: in makefiles, %FASTDEP% shortcuts this.
+  for arg
+  do
+    case $arg in
+    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
+    *)  set fnord "$@" "$arg" ;;
+    esac
+    shift # fnord
+    shift # $arg
+  done
+  "$@"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  mv "$tmpdepfile" "$depfile"
+  ;;
+
+gcc)
+## There are various ways to get dependency output from gcc.  Here's
+## why we pick this rather obscure method:
+## - Don't want to use -MD because we'd like the dependencies to end
+##   up in a subdir.  Having to rename by hand is ugly.
+##   (We might end up doing this anyway to support other compilers.)
+## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
+##   -MM, not -M (despite what the docs say).
+## - Using -M directly means running the compiler twice (even worse
+##   than renaming).
+  if test -z "$gccflag"; then
+    gccflag=-MD,
+  fi
+  "$@" -Wp,"$gccflag$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+## The second -e expression handles DOS-style file names with drive letters.
+  sed -e 's/^[^:]*: / /' \
+      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+## This next piece of magic avoids the "deleted header file" problem.
+## The problem is that when a header file which appears in a .P file
+## is deleted, the dependency causes make to die (because there is
+## typically no way to rebuild the header).  We avoid this by adding
+## dummy dependencies for each header file.  Too bad gcc doesn't do
+## this for us directly.
+  tr ' ' "$nl" < "$tmpdepfile" |
+## Some versions of gcc put a space before the ':'.  On the theory
+## that the space means something, we add a space to the output as
+## well.  hp depmode also adds that space, but also prefixes the VPATH
+## to the object.  Take care to not repeat it in the output.
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
+      | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+sgi)
+  if test "$libtool" = yes; then
+    "$@" "-Wp,-MDupdate,$tmpdepfile"
+  else
+    "$@" -MDupdate "$tmpdepfile"
+  fi
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+
+  if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
+    echo "$object : \\" > "$depfile"
+
+    # Clip off the initial element (the dependent).  Don't try to be
+    # clever and replace this with sed code, as IRIX sed won't handle
+    # lines with more than a fixed number of characters (4096 in
+    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
+    # the IRIX cc adds comments like '#:fec' to the end of the
+    # dependency line.
+    tr ' ' "$nl" < "$tmpdepfile" \
+    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
+    tr "$nl" ' ' >> "$depfile"
+    echo >> "$depfile"
+
+    # The second pass generates a dummy entry for each header file.
+    tr ' ' "$nl" < "$tmpdepfile" \
+   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
+   >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+xlc)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+aix)
+  # The C for AIX Compiler uses -M and outputs the dependencies
+  # in a .u file.  In older versions, this file always lives in the
+  # current directory.  Also, the AIX compiler puts '$object:' at the
+  # start of each line; $object doesn't have directory information.
+  # Version 6 uses the directory in both cases.
+  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+  test "x$dir" = "x$object" && dir=
+  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  if test "$libtool" = yes; then
+    tmpdepfile1=$dir$base.u
+    tmpdepfile2=$base.u
+    tmpdepfile3=$dir.libs/$base.u
+    "$@" -Wc,-M
+  else
+    tmpdepfile1=$dir$base.u
+    tmpdepfile2=$dir$base.u
+    tmpdepfile3=$dir$base.u
+    "$@" -M
+  fi
+  stat=$?
+
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+    exit $stat
+  fi
+
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  if test -f "$tmpdepfile"; then
+    # Each line is of the form 'foo.o: dependent.h'.
+    # Do two passes, one to just change these to
+    # '$object: dependent.h' and one to simply 'dependent.h:'.
+    sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
+    sed -e 's,^.*\.[a-z]*:['"$tab"' ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+icc)
+  # Intel's C compiler anf tcc (Tiny C Compiler) understand '-MD -MF file'.
+  # However on
+  #    $CC -MD -MF foo.d -c -o sub/foo.o sub/foo.c
+  # ICC 7.0 will fill foo.d with something like
+  #    foo.o: sub/foo.c
+  #    foo.o: sub/foo.h
+  # which is wrong.  We want
+  #    sub/foo.o: sub/foo.c
+  #    sub/foo.o: sub/foo.h
+  #    sub/foo.c:
+  #    sub/foo.h:
+  # ICC 7.1 will output
+  #    foo.o: sub/foo.c sub/foo.h
+  # and will wrap long lines using '\':
+  #    foo.o: sub/foo.c ... \
+  #     sub/foo.h ... \
+  #     ...
+  # tcc 0.9.26 (FIXME still under development at the moment of writing)
+  # will emit a similar output, but also prepend the continuation lines
+  # with horizontal tabulation characters.
+  "$@" -MD -MF "$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  # Each line is of the form 'foo.o: dependent.h',
+  # or 'foo.o: dep1.h dep2.h \', or ' dep3.h dep4.h \'.
+  # Do two passes, one to just change these to
+  # '$object: dependent.h' and one to simply 'dependent.h:'.
+  sed -e "s/^[ $tab][ $tab]*/  /" -e "s,^[^:]*:,$object :," \
+    < "$tmpdepfile" > "$depfile"
+  sed '
+    s/[ '"$tab"'][ '"$tab"']*/ /g
+    s/^ *//
+    s/ *\\*$//
+    s/^[^:]*: *//
+    /^$/d
+    /:$/d
+    s/$/ :/
+  ' < "$tmpdepfile" >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp2)
+  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
+  # compilers, which have integrated preprocessors.  The correct option
+  # to use with these is +Maked; it writes dependencies to a file named
+  # 'foo.d', which lands next to the object file, wherever that
+  # happens to be.
+  # Much of this is similar to the tru64 case; see comments there.
+  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+  test "x$dir" = "x$object" && dir=
+  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  if test "$libtool" = yes; then
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir.libs/$base.d
+    "$@" -Wc,+Maked
+  else
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir$base.d
+    "$@" +Maked
+  fi
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+     rm -f "$tmpdepfile1" "$tmpdepfile2"
+     exit $stat
+  fi
+
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  if test -f "$tmpdepfile"; then
+    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
+    # Add 'dependent.h:' lines.
+    sed -ne '2,${
+	       s/^ *//
+	       s/ \\*$//
+	       s/$/:/
+	       p
+	     }' "$tmpdepfile" >> "$depfile"
+  else
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile" "$tmpdepfile2"
+  ;;
+
+tru64)
+   # The Tru64 compiler uses -MD to generate dependencies as a side
+   # effect.  'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'.
+   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
+   # dependencies in 'foo.d' instead, so we check for that too.
+   # Subdirectories are respected.
+   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+   test "x$dir" = "x$object" && dir=
+   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+
+   if test "$libtool" = yes; then
+      # With Tru64 cc, shared objects can also be used to make a
+      # static library.  This mechanism is used in libtool 1.4 series to
+      # handle both shared and static libraries in a single compilation.
+      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
+      #
+      # With libtool 1.5 this exception was removed, and libtool now
+      # generates 2 separate objects for the 2 libraries.  These two
+      # compilations output dependencies in $dir.libs/$base.o.d and
+      # in $dir$base.o.d.  We have to check for both files, because
+      # one of the two compilations can be disabled.  We should prefer
+      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
+      # automatically cleaned when .libs/ is deleted, while ignoring
+      # the former would cause a distcleancheck panic.
+      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
+      tmpdepfile2=$dir$base.o.d          # libtool 1.5
+      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
+      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
+      "$@" -Wc,-MD
+   else
+      tmpdepfile1=$dir$base.o.d
+      tmpdepfile2=$dir$base.d
+      tmpdepfile3=$dir$base.d
+      tmpdepfile4=$dir$base.d
+      "$@" -MD
+   fi
+
+   stat=$?
+   if test $stat -eq 0; then :
+   else
+      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+      exit $stat
+   fi
+
+   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+   do
+     test -f "$tmpdepfile" && break
+   done
+   if test -f "$tmpdepfile"; then
+      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
+      sed -e 's,^.*\.[a-z]*:['"$tab"' ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
+   else
+      echo "#dummy" > "$depfile"
+   fi
+   rm -f "$tmpdepfile"
+   ;;
+
+msvc7)
+  if test "$libtool" = yes; then
+    showIncludes=-Wc,-showIncludes
+  else
+    showIncludes=-showIncludes
+  fi
+  "$@" $showIncludes > "$tmpdepfile"
+  stat=$?
+  grep -v '^Note: including file: ' "$tmpdepfile"
+  if test "$stat" = 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  # The first sed program below extracts the file names and escapes
+  # backslashes for cygpath.  The second sed program outputs the file
+  # name when reading, but also accumulates all include files in the
+  # hold buffer in order to output them again at the end.  This only
+  # works with sed implementations that can handle large buffers.
+  sed < "$tmpdepfile" -n '
+/^Note: including file:  *\(.*\)/ {
+  s//\1/
+  s/\\/\\\\/g
+  p
+}' | $cygpath_u | sort -u | sed -n '
+s/ /\\ /g
+s/\(.*\)/'"$tab"'\1 \\/p
+s/.\(.*\) \\/\1:/
+H
+$ {
+  s/.*/'"$tab"'/
+  G
+  p
+}' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvc7msys)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+#nosideeffect)
+  # This comment above is used by automake to tell side-effect
+  # dependency tracking mechanisms from slower ones.
+
+dashmstdout)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout, regardless of -o.
+  "$@" || exit $?
+
+  # Remove the call to Libtool.
+  if test "$libtool" = yes; then
+    while test "X$1" != 'X--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+
+  # Remove '-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  test -z "$dashmflag" && dashmflag=-M
+  # Require at least two characters before searching for ':'
+  # in the target name.  This is to cope with DOS-style filenames:
+  # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise.
+  "$@" $dashmflag |
+    sed 's:^['"$tab"' ]*[^:'"$tab"' ][^:][^:]*\:['"$tab"' ]*:'"$object"'\: :' > "$tmpdepfile"
+  rm -f "$depfile"
+  cat < "$tmpdepfile" > "$depfile"
+  tr ' ' "$nl" < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+dashXmstdout)
+  # This case only exists to satisfy depend.m4.  It is never actually
+  # run, as this mode is specially recognized in the preamble.
+  exit 1
+  ;;
+
+makedepend)
+  "$@" || exit $?
+  # Remove any Libtool call
+  if test "$libtool" = yes; then
+    while test "X$1" != 'X--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+  # X makedepend
+  shift
+  cleared=no eat=no
+  for arg
+  do
+    case $cleared in
+    no)
+      set ""; shift
+      cleared=yes ;;
+    esac
+    if test $eat = yes; then
+      eat=no
+      continue
+    fi
+    case "$arg" in
+    -D*|-I*)
+      set fnord "$@" "$arg"; shift ;;
+    # Strip any option that makedepend may not understand.  Remove
+    # the object too, otherwise makedepend will parse it as a source file.
+    -arch)
+      eat=yes ;;
+    -*|$object)
+      ;;
+    *)
+      set fnord "$@" "$arg"; shift ;;
+    esac
+  done
+  obj_suffix=`echo "$object" | sed 's/^.*\././'`
+  touch "$tmpdepfile"
+  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
+  rm -f "$depfile"
+  # makedepend may prepend the VPATH from the source file name to the object.
+  # No need to regex-escape $object, excess matching of '.' is harmless.
+  sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile"
+  sed '1,2d' "$tmpdepfile" | tr ' ' "$nl" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile" "$tmpdepfile".bak
+  ;;
+
+cpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout.
+  "$@" || exit $?
+
+  # Remove the call to Libtool.
+  if test "$libtool" = yes; then
+    while test "X$1" != 'X--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+
+  # Remove '-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  "$@" -E |
+    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
+       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
+    sed '$ s: \\$::' > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  cat < "$tmpdepfile" >> "$depfile"
+  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvisualcpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout.
+  "$@" || exit $?
+
+  # Remove the call to Libtool.
+  if test "$libtool" = yes; then
+    while test "X$1" != 'X--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+
+  IFS=" "
+  for arg
+  do
+    case "$arg" in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
+	set fnord "$@"
+	shift
+	shift
+	;;
+    *)
+	set fnord "$@" "$arg"
+	shift
+	shift
+	;;
+    esac
+  done
+  "$@" -E 2>/dev/null |
+  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile"
+  echo "$tab" >> "$depfile"
+  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvcmsys)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+none)
+  exec "$@"
+  ;;
+
+*)
+  echo "Unknown depmode $depmode" 1>&2
+  exit 1
+  ;;
+esac
+
+exit 0
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel
+SUBDIRS = scalar simd
+
+noinst_LTLIBRARIES = libdft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = codelet-dft.h dft.h
+
+libdft_la_SOURCES = bluestein.c buffered.c conf.c ct.c dftw-direct.c	\
+dftw-directsq.c dftw-generic.c dftw-genericbuf.c direct.c generic.c	\
+indirect.c indirect-transpose.c kdft-dif.c kdft-difsq.c kdft-dit.c	\
+kdft.c nop.c plan.c problem.c rader.c rank-geq2.c solve.c vrank-geq1.c	\
+zero.c codelet-dft.h ct.h dft.h
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,720 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = dft
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_la_LIBADD =
+am_libdft_la_OBJECTS = bluestein.lo buffered.lo conf.lo ct.lo \
+	dftw-direct.lo dftw-directsq.lo dftw-generic.lo \
+	dftw-genericbuf.lo direct.lo generic.lo indirect.lo \
+	indirect-transpose.lo kdft-dif.lo kdft-difsq.lo kdft-dit.lo \
+	kdft.lo nop.lo plan.lo problem.lo rader.lo rank-geq2.lo \
+	solve.lo vrank-geq1.lo zero.lo
+libdft_la_OBJECTS = $(am_libdft_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_la_SOURCES)
+DIST_SOURCES = $(libdft_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel
+SUBDIRS = scalar simd
+noinst_LTLIBRARIES = libdft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = codelet-dft.h dft.h
+libdft_la_SOURCES = bluestein.c buffered.c conf.c ct.c dftw-direct.c	\
+dftw-directsq.c dftw-generic.c dftw-genericbuf.c direct.c generic.c	\
+indirect.c indirect-transpose.c kdft-dif.c kdft-difsq.c kdft-dit.c	\
+kdft.c nop.c plan.c problem.c rader.c rank-geq2.c solve.c vrank-geq1.c	\
+zero.c codelet-dft.h ct.h dft.h
+
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft.la: $(libdft_la_OBJECTS) $(libdft_la_DEPENDENCIES) $(EXTRA_libdft_la_DEPENDENCIES) 
+	$(LINK)  $(libdft_la_OBJECTS) $(libdft_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bluestein.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-direct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-directsq.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-generic.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-genericbuf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect-transpose.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dif.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-difsq.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dit.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zero.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES)
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/bluestein.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/bluestein.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_dft super;
+     INT n;     /* problem size */
+     INT nb;    /* size of convolution */
+     R *w;      /* lambda k . exp(2*pi*i*k^2/(2*n)) */
+     R *W;      /* DFT(w) */
+     plan *cldf;
+     INT is, os;
+} P;
+
+static void bluestein_sequence(enum wakefulness wakefulness, INT n, R *w)
+{
+     INT k, ksq, n2 = 2 * n;
+     triggen *t = X(mktriggen)(wakefulness, n2);
+
+     ksq = 0;
+     for (k = 0; k < n; ++k) {
+	  t->cexp(t, ksq, w+2*k);
+          /* careful with overflow */
+          ksq += 2*k + 1; while (ksq > n2) ksq -= n2;
+     }
+
+     X(triggen_destroy)(t);
+}
+
+static void mktwiddle(enum wakefulness wakefulness, P *p)
+{
+     INT i;
+     INT n = p->n, nb = p->nb;
+     R *w, *W;
+     E nbf = (E)nb;
+
+     p->w = w = (R *) MALLOC(2 * n * sizeof(R), TWIDDLES);
+     p->W = W = (R *) MALLOC(2 * nb * sizeof(R), TWIDDLES);
+
+     bluestein_sequence(wakefulness, n, w);
+
+     for (i = 0; i < nb; ++i)
+          W[2*i] = W[2*i+1] = K(0.0);
+
+     W[0] = w[0] / nbf;
+     W[1] = w[1] / nbf;
+
+     for (i = 1; i < n; ++i) {
+          W[2*i] = W[2*(nb-i)] = w[2*i] / nbf;
+          W[2*i+1] = W[2*(nb-i)+1] = w[2*i+1] / nbf;
+     }
+
+     {
+          plan_dft *cldf = (plan_dft *)p->cldf;
+	  /* cldf must be awake */
+          cldf->apply(p->cldf, W, W+1, W, W+1);
+     }
+}
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT i, n = ego->n, nb = ego->nb, is = ego->is, os = ego->os;
+     R *w = ego->w, *W = ego->W;
+     R *b = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);
+
+     /* multiply input by conjugate bluestein sequence */
+     for (i = 0; i < n; ++i) {
+	  E xr = ri[i*is], xi = ii[i*is];
+          E wr = w[2*i], wi = w[2*i+1];
+          b[2*i] = xr * wr + xi * wi;
+          b[2*i+1] = xi * wr - xr * wi;
+     }
+
+     for (; i < nb; ++i) b[2*i] = b[2*i+1] = K(0.0);
+
+     /* convolution: FFT */
+     {
+          plan_dft *cldf = (plan_dft *)ego->cldf;
+          cldf->apply(ego->cldf, b, b+1, b, b+1);
+     }
+
+     /* convolution: pointwise multiplication */
+     for (i = 0; i < nb; ++i) {
+	  E xr = b[2*i], xi = b[2*i+1];
+          E wr = W[2*i], wi = W[2*i+1];
+          b[2*i] = xi * wr + xr * wi;
+          b[2*i+1] = xr * wr - xi * wi;
+     }
+
+     /* convolution: IFFT by FFT with real/imag input/output swapped */
+     {
+          plan_dft *cldf = (plan_dft *)ego->cldf;
+          cldf->apply(ego->cldf, b, b+1, b, b+1);
+     }
+
+     /* multiply output by conjugate bluestein sequence */
+     for (i = 0; i < n; ++i) {
+	  E xi = b[2*i], xr = b[2*i+1];
+          E wr = w[2*i], wi = w[2*i+1];
+          ro[i*os] = xr * wr + xi * wi;
+          io[i*os] = xi * wr - xr * wi;
+     }
+
+     X(ifree)(b);	  
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cldf, wakefulness);
+
+     switch (wakefulness) {
+	 case SLEEPY:
+	      X(ifree0)(ego->w); ego->w = 0;
+	      X(ifree0)(ego->W); ego->W = 0;
+	      break;
+	 default:
+	      A(!ego->w);
+	      mktwiddle(wakefulness, ego);
+	      break;
+     }
+}
+
+static int applicable(const solver *ego, const problem *p_, 
+		      const planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     UNUSED(ego);
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     /* FIXME: allow other sizes */
+	     && X(is_prime)(p->sz->dims[0].n)
+
+	     /* FIXME: avoid infinite recursion of bluestein with itself.
+		This works because all factors in child problems are 2, 3, 5 */
+	     && p->sz->dims[0].n > 16
+
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > BLUESTEIN_MAX_SLOW)
+	  );
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldf);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *)ego_;
+     p->print(p, "(dft-bluestein-%D/%D%(%p%))",
+              ego->n, ego->nb, ego->cldf);
+}
+
+static INT choose_transform_size(INT minsz)
+{
+     while (!X(factors_into_small_primes)(minsz))
+	  ++minsz;
+     return minsz;
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     P *pln;
+     INT n, nb;
+     plan *cldf = 0;
+     R *buf = (R *) 0;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+	  return (plan *) 0;
+
+     n = p->sz->dims[0].n;
+     nb = choose_transform_size(2 * n - 1);
+     buf = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);
+
+     cldf = X(mkplan_f_d)(plnr, 
+			  X(mkproblem_dft_d)(X(mktensor_1d)(nb, 2, 2),
+					     X(mktensor_1d)(1, 0, 0),
+					     buf, buf+1, 
+					     buf, buf+1),
+			  NO_SLOW, 0, 0);
+     if (!cldf) goto nada;
+
+     X(ifree)(buf);
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     pln->n = n;
+     pln->nb = nb;
+     pln->w = 0;
+     pln->W = 0;
+     pln->cldf = cldf;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+
+     X(ops_add)(&cldf->ops, &cldf->ops, &pln->super.super.ops);
+     pln->super.super.ops.add += 4 * n + 2 * nb;
+     pln->super.super.ops.mul += 8 * n + 4 * nb;
+     pln->super.super.ops.other += 6 * (n + nb);
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(buf);
+     X(plan_destroy_internal)(cldf);
+     return (plan *)0;
+}
+
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(dft_bluestein_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/buffered.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/buffered.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int maxnbuf_ndx;
+} S;
+
+static const INT maxnbufs[] = { 8, 256 };
+
+typedef struct {
+     plan_dft super;
+
+     plan *cld, *cldcpy, *cldrest;
+     INT n, vl, nbuf, bufdist;
+     INT ivs_by_nbuf, ovs_by_nbuf;
+     INT roffset, ioffset;
+} P;
+
+/* transform a vector input with the help of bufs */
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT nbuf = ego->nbuf;
+     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);
+
+     plan_dft *cld = (plan_dft *) ego->cld;
+     plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
+     plan_dft *cldrest;
+     INT i, vl = ego->vl;
+     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
+     INT roffset = ego->roffset, ioffset = ego->ioffset;
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* transform to bufs: */
+          cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
+	  ri += ivs_by_nbuf; ii += ivs_by_nbuf;
+
+          /* copy back */
+          cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
+	  ro += ovs_by_nbuf; io += ovs_by_nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_dft *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, ri, ii, ro, io);
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldcpy, wakefulness);
+     X(plan_awake)(ego->cldrest, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldrest);
+     X(plan_destroy_internal)(ego->cldcpy);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
+              ego->n, ego->nbuf,
+              ego->vl, ego->bufdist % ego->n,
+              ego->cld, ego->cldcpy, ego->cldrest);
+}
+
+static int applicable0(const S *ego, const problem *p_, const planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     const iodim *d = p->sz->dims;
+
+     if (1
+	 && p->vecsz->rnk <= 1
+	 && p->sz->rnk == 1
+	  ) {
+	  INT vl, ivs, ovs;
+	  X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+	  if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
+	       return 0;
+
+	  /* if this solver is redundant, in the sense that a solver
+	     of lower index generates the same plan, then prune this
+	     solver */
+	  if (X(nbuf_redundant)(d[0].n, vl, 
+				ego->maxnbuf_ndx,
+				maxnbufs, NELEM(maxnbufs)))
+	       return 0;
+
+	  /*
+	    In principle, the buffered transforms might be useful
+	    when working out of place.  However, in order to
+	    prevent infinite loops in the planner, we require
+	    that the output stride of the buffered transforms be
+	    greater than 2.
+	  */
+	  if (p->ri != p->ro)
+	       return (d[0].os > 2);
+
+	  /*
+	   * If the problem is in place, the input/output strides must
+	   * be the same or the whole thing must fit in the buffer.
+	   */
+	  if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
+	       return 1;
+
+	  if (/* fits into buffer: */
+	       ((p->vecsz->rnk == 0)
+		||
+		(X(nbuf)(d[0].n, p->vecsz->dims[0].n, 
+			 maxnbufs[ego->maxnbuf_ndx]) 
+		 == p->vecsz->dims[0].n)))
+	       return 1;
+     }
+
+     return 0;
+}
+
+static int applicable(const S *ego, const problem *p_, const planner *plnr)
+{
+     if (NO_BUFFERINGP(plnr)) return 0;
+     if (!applicable0(ego, p_, plnr)) return 0;
+
+     if (NO_UGLYP(plnr)) {
+	  const problem_dft *p = (const problem_dft *) p_;
+	  if (p->ri != p->ro) return 0;
+	  if (X(toobig)(p->sz->dims[0].n)) return 0;
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const S *ego = (const S *)ego_;
+     plan *cld = (plan *) 0;
+     plan *cldcpy = (plan *) 0;
+     plan *cldrest = (plan *) 0;
+     const problem_dft *p = (const problem_dft *) p_;
+     R *bufs = (R *) 0;
+     INT nbuf = 0, bufdist, n, vl;
+     INT ivs, ovs, roffset, ioffset;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          goto nada;
+
+     n = X(tensor_sz)(p->sz);
+
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
+     bufdist = X(bufdist)(n, vl);
+     A(nbuf > 0);
+
+     /* attempt to keep real and imaginary part in the same order,
+	so as to allow optimizations in the the copy plan */
+     roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
+     ioffset = 1 - roffset;
+
+     /* initial allocation for the purpose of planning */
+     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);
+
+     /* allow destruction of input if problem is in place */
+     cld = X(mkplan_f_d)(plnr,
+			 X(mkproblem_dft_d)(
+			      X(mktensor_1d)(n, p->sz->dims[0].is, 2),
+			      X(mktensor_1d)(nbuf, ivs, bufdist * 2),
+			      TAINT(p->ri, ivs * nbuf),
+			      TAINT(p->ii, ivs * nbuf),
+			      bufs + roffset, 
+			      bufs + ioffset),
+			 0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
+     if (!cld)
+          goto nada;
+
+     /* copying back from the buffer is a rank-0 transform: */
+     cldcpy = X(mkplan_d)(plnr,
+			  X(mkproblem_dft_d)(
+			       X(mktensor_0d)(),
+			       X(mktensor_2d)(nbuf, bufdist * 2, ovs,
+					      n, 2, p->sz->dims[0].os),
+			       bufs + roffset, 
+			       bufs + ioffset, 
+			       TAINT(p->ro, ovs * nbuf), 
+			       TAINT(p->io, ovs * nbuf)));
+     if (!cldcpy)
+          goto nada;
+
+     /* deallocate buffers, let apply() allocate them for real */
+     X(ifree)(bufs);
+     bufs = 0;
+
+     /* plan the leftover transforms (cldrest): */
+     {
+	  INT id = ivs * (nbuf * (vl / nbuf));
+	  INT od = ovs * (nbuf * (vl / nbuf));
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_dft_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->ri+id, p->ii+id, p->ro+od, p->io+od));
+     }
+     if (!cldrest)
+          goto nada;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->cldrest = cldrest;
+     pln->n = n;
+     pln->vl = vl;
+     pln->ivs_by_nbuf = ivs * nbuf;
+     pln->ovs_by_nbuf = ovs * nbuf;
+     pln->roffset = roffset;
+     pln->ioffset = ioffset;
+
+     pln->nbuf = nbuf;
+     pln->bufdist = bufdist;
+
+     {
+	  opcnt t;
+	  X(ops_add)(&cld->ops, &cldcpy->ops, &t);
+	  X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(bufs);
+     X(plan_destroy_internal)(cldrest);
+     X(plan_destroy_internal)(cldcpy);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int maxnbuf_ndx)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->maxnbuf_ndx = maxnbuf_ndx;
+     return &(slv->super);
+}
+
+void X(dft_buffered_register)(planner *p)
+{
+     size_t i;
+     for (i = 0; i < NELEM(maxnbufs); ++i)
+	  REGISTER_SOLVER(p, mksolver(i));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/codelet-dft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/codelet-dft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/*
+ * This header file must include every file or define every
+ * type or macro which is required to compile a codelet.
+ */
+
+#ifndef __DFT_CODELET_H__
+#define __DFT_CODELET_H__
+
+#include "ifftw.h"
+
+/**************************************************************
+ * types of codelets
+ **************************************************************/
+
+/* DFT codelets */
+typedef struct kdft_desc_s kdft_desc;
+
+typedef struct {
+     int (*okp)(
+	  const kdft_desc *desc,
+	  const R *ri, const R *ii, const R *ro, const R *io,
+	  INT is, INT os, INT vl, INT ivs, INT ovs,
+	  const planner *plnr);
+     INT vl;
+} kdft_genus;
+
+struct kdft_desc_s {
+     INT sz;    /* size of transform computed */
+     const char *nam;
+     opcnt ops;
+     const kdft_genus *genus;
+     INT is;
+     INT os;
+     INT ivs;
+     INT ovs;
+};
+
+typedef void (*kdft) (const R *ri, const R *ii, R *ro, R *io,
+                      stride is, stride os, INT vl, INT ivs, INT ovs);
+void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc);
+
+
+typedef struct ct_desc_s ct_desc;
+
+typedef struct {
+     int (*okp)(
+	  const struct ct_desc_s *desc,
+	  const R *rio, const R *iio, 
+	  INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+	  const planner *plnr);
+     INT vl;
+} ct_genus;
+
+struct ct_desc_s {
+     INT radix;
+     const char *nam;
+     const tw_instr *tw;
+     const ct_genus *genus;
+     opcnt ops;
+     INT rs;
+     INT vs;
+     INT ms;
+};
+
+typedef void (*kdftw) (R *rioarray, R *iioarray, const R *W,
+		       stride ios, INT mb, INT me, INT ms);
+void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc);
+void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc);
+
+
+typedef void (*kdftwsq) (R *rioarray, R *iioarray,
+			 const R *W, stride is, stride vs,
+			 INT mb, INT me, INT ms);
+void X(kdft_difsq_register)(planner *p, kdftwsq codelet, const ct_desc *desc);
+
+
+extern const solvtab X(solvtab_dft_standard);
+extern const solvtab X(solvtab_dft_sse2);
+extern const solvtab X(solvtab_dft_avx);
+extern const solvtab X(solvtab_dft_altivec);
+extern const solvtab X(solvtab_dft_neon);
+
+#endif				/* __DFT_CODELET_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/conf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/conf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+static const solvtab s =
+{
+     SOLVTAB(X(dft_indirect_register)),
+     SOLVTAB(X(dft_indirect_transpose_register)),
+     SOLVTAB(X(dft_rank_geq2_register)),
+     SOLVTAB(X(dft_vrank_geq1_register)),
+     SOLVTAB(X(dft_buffered_register)),
+     SOLVTAB(X(dft_generic_register)),
+     SOLVTAB(X(dft_rader_register)),
+     SOLVTAB(X(dft_bluestein_register)),
+     SOLVTAB(X(dft_nop_register)),
+     SOLVTAB(X(ct_generic_register)),
+     SOLVTAB(X(ct_genericbuf_register)),
+     SOLVTAB_END
+};
+
+void X(dft_conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+     X(solvtab_exec)(X(solvtab_dft_standard), p);
+#if HAVE_SSE2
+     if (X(have_simd_sse2)())
+	  X(solvtab_exec)(X(solvtab_dft_sse2), p);
+#endif
+#if HAVE_AVX
+     if (X(have_simd_avx)())
+	  X(solvtab_exec)(X(solvtab_dft_avx), p);
+#endif
+#if HAVE_ALTIVEC
+     if (X(have_simd_altivec)())
+	  X(solvtab_exec)(X(solvtab_dft_altivec), p);
+#endif
+#if HAVE_NEON
+     if (X(have_simd_neon)())
+	  X(solvtab_exec)(X(solvtab_dft_neon), p);
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/ct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/ct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+ct_solver *(*X(mksolver_ct_hook))(size_t, INT, int, 
+				  ct_mkinferior, ct_force_vrecursion) = 0;
+
+typedef struct {
+     plan_dft super;
+     plan *cld;
+     plan *cldw;
+     INT r;
+} P;
+
+static void apply_dit(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_dftw *cldw;
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, ri, ii, ro, io);
+
+     cldw = (plan_dftw *) ego->cldw;
+     cldw->apply(ego->cldw, ro, io);
+}
+
+static void apply_dif(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_dftw *cldw;
+
+     cldw = (plan_dftw *) ego->cldw;
+     cldw->apply(ego->cldw, ri, ii);
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, ri, ii, ro, io);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldw, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldw);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dft-ct-%s/%D%(%p%)%(%p%))",
+	      ego->super.apply == apply_dit ? "dit" : "dif",
+	      ego->r, ego->cldw, ego->cld);
+}
+
+static int applicable0(const ct_solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     INT r;
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+
+	     /* DIF destroys the input and we don't like it */
+	     && (ego->dec == DECDIT ||
+		 p->ri == p->ro ||
+		 !NO_DESTROY_INPUTP(plnr))
+
+	     && ((r = X(choose_radix)(ego->r, p->sz->dims[0].n)) > 1)
+	     && p->sz->dims[0].n > r);
+}
+
+
+int X(ct_applicable)(const ct_solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_dft *p;
+
+     if (!applicable0(ego, p_, plnr))
+          return 0;
+
+     p = (const problem_dft *) p_;
+
+     return (0
+	     || ego->dec == DECDIF+TRANSPOSE
+	     || p->vecsz->rnk == 0
+	     || !NO_VRECURSEP(plnr)
+	     || (ego->force_vrecursionp && ego->force_vrecursionp(ego, p))
+	  );
+}
+
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const ct_solver *ego = (const ct_solver *) ego_;
+     const problem_dft *p;
+     P *pln = 0;
+     plan *cld = 0, *cldw = 0;
+     INT n, r, m, v, ivs, ovs;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if ((NO_NONTHREADEDP(plnr)) || !X(ct_applicable)(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_dft *) p_;
+     d = p->sz->dims;
+     n = d[0].n;
+     r = X(choose_radix)(ego->r, n);
+     m = n / r;
+
+     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
+
+     switch (ego->dec) {
+	 case DECDIT:
+	 {
+	      cldw = ego->mkcldw(ego,
+				 r, m * d[0].os, m * d[0].os,
+				 m, d[0].os,
+				 v, ovs, ovs,
+				 0, m,
+				 p->ro, p->io, plnr);
+	      if (!cldw) goto nada;
+
+	      cld = X(mkplan_d)(plnr,
+				X(mkproblem_dft_d)(
+				     X(mktensor_1d)(m, r * d[0].is, d[0].os),
+				     X(mktensor_2d)(r, d[0].is, m * d[0].os,
+						    v, ivs, ovs),
+				     p->ri, p->ii, p->ro, p->io)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_DFT(P, &padt, apply_dit);
+	      break;
+	 }
+	 case DECDIF:
+	 case DECDIF+TRANSPOSE:
+	 {
+	      INT cors, covs; /* cldw ors, ovs */
+	      if (ego->dec == DECDIF+TRANSPOSE) {
+		   cors = ivs;
+		   covs = m * d[0].is;
+		   /* ensure that we generate well-formed dftw subproblems */
+		   /* FIXME: too conservative */
+		   if (!(1
+			 && r == v
+			 && d[0].is == r * cors))
+			goto nada;
+
+		   /* FIXME: allow in-place only for now, like in
+		      fftw-3.[01] */
+		   if (!(1
+			 && p->ri == p->ro
+			 && d[0].is == r * d[0].os
+			 && cors == d[0].os
+			 && covs == ovs
+			    ))
+			goto nada;
+	      } else {
+		   cors = m * d[0].is;
+		   covs = ivs;
+	      }
+
+	      cldw = ego->mkcldw(ego,
+				 r, m * d[0].is, cors,
+				 m, d[0].is,
+				 v, ivs, covs,
+				 0, m,
+				 p->ri, p->ii, plnr);
+	      if (!cldw) goto nada;
+
+	      cld = X(mkplan_d)(plnr,
+				X(mkproblem_dft_d)(
+				     X(mktensor_1d)(m, d[0].is, r * d[0].os),
+				     X(mktensor_2d)(r, cors, d[0].os,
+						    v, covs, ovs),
+				     p->ri, p->ii, p->ro, p->io)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_DFT(P, &padt, apply_dif);
+	      break;
+	 }
+
+	 default: A(0);
+
+     }
+
+     pln->cld = cld;
+     pln->cldw = cldw;
+     pln->r = r;
+     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);
+
+     /* inherit could_prune_now_p attribute from cldw */
+     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldw);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+ct_solver *X(mksolver_ct)(size_t size, INT r, int dec, 
+			  ct_mkinferior mkcldw,
+			  ct_force_vrecursion force_vrecursionp)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     ct_solver *slv = (ct_solver *)X(mksolver)(size, &sadt);
+     slv->r = r;
+     slv->dec = dec;
+     slv->mkcldw = mkcldw;
+     slv->force_vrecursionp = force_vrecursionp;
+     return slv;
+}
+
+plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply)
+{
+     plan_dftw *ego;
+
+     ego = (plan_dftw *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/ct.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/ct.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "dft.h"
+
+typedef void (*dftwapply)(const plan *ego, R *rio, R *iio);
+typedef struct ct_solver_s ct_solver;
+typedef plan *(*ct_mkinferior)(const ct_solver *ego,
+			       INT r, INT irs, INT ors,
+			       INT m, INT ms,
+			       INT v, INT ivs, INT ovs,
+			       INT mstart, INT mcount,
+			       R *rio, R *iio, planner *plnr);
+typedef int (*ct_force_vrecursion)(const ct_solver *ego, 
+				   const problem_dft *p);
+
+typedef struct {
+     plan super;
+     dftwapply apply;
+} plan_dftw;
+
+extern plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply);
+
+#define MKPLAN_DFTW(type, adt, apply) \
+  (type *)X(mkplan_dftw)(sizeof(type), adt, apply)
+
+struct ct_solver_s {
+     solver super;
+     INT r;
+     int dec;
+#    define DECDIF 0
+#    define DECDIT 1
+#    define TRANSPOSE 2
+     ct_mkinferior mkcldw;
+     ct_force_vrecursion force_vrecursionp;
+};
+
+int X(ct_applicable)(const ct_solver *, const problem *, planner *);
+ct_solver *X(mksolver_ct)(size_t size, INT r, int dec, 
+			  ct_mkinferior mkcldw, 
+			  ct_force_vrecursion force_vrecursionp);
+extern ct_solver *(*X(mksolver_ct_hook))(size_t, INT, int, 
+					 ct_mkinferior, ct_force_vrecursion);
+
+void X(regsolver_ct_directw)(planner *plnr,
+     kdftw codelet, const ct_desc *desc, int dec);
+void X(regsolver_ct_directwbuf)(planner *plnr,
+     kdftw codelet, const ct_desc *desc, int dec);
+solver *X(mksolver_ctsq)(kdftwsq codelet, const ct_desc *desc, int dec);
+void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet, 
+			       const ct_desc *desc, int dec);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/dft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/dft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#ifndef __DFT_H__
+#define __DFT_H__
+
+#include "ifftw.h"
+#include "codelet-dft.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     tensor *sz, *vecsz;
+     R *ri, *ii, *ro, *io;
+} problem_dft;
+
+void X(dft_zerotens)(tensor *sz, R *ri, R *ii);
+problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
+				R *ri, R *ii, R *ro, R *io);
+problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
+			    R *ri, R *ii, R *ro, R *io);
+
+/* solve.c: */
+void X(dft_solve)(const plan *ego_, const problem *p_);
+
+/* plan.c: */
+typedef void (*dftapply) (const plan *ego, R *ri, R *ii, R *ro, R *io);
+
+typedef struct {
+     plan super;
+     dftapply apply;
+} plan_dft;
+
+plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply);
+
+#define MKPLAN_DFT(type, adt, apply) \
+  (type *)X(mkplan_dft)(sizeof(type), adt, apply)
+
+/* various solvers */
+solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc);
+solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc);
+
+void X(dft_rank0_register)(planner *p);
+void X(dft_rank_geq2_register)(planner *p);
+void X(dft_indirect_register)(planner *p);
+void X(dft_indirect_transpose_register)(planner *p);
+void X(dft_vrank_geq1_register)(planner *p);
+void X(dft_vrank2_transpose_register)(planner *p);
+void X(dft_vrank3_transpose_register)(planner *p);
+void X(dft_buffered_register)(planner *p);
+void X(dft_generic_register)(planner *p);
+void X(dft_rader_register)(planner *p);
+void X(dft_bluestein_register)(planner *p);
+void X(dft_nop_register)(planner *p);
+void X(ct_generic_register)(planner *p);
+void X(ct_genericbuf_register)(planner *p);
+
+/* configurations */
+void X(dft_conf_standard)(planner *p);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __DFT_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/dftw-direct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/dftw-direct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+typedef struct {
+     ct_solver super;
+     const ct_desc *desc;
+     int bufferedp;
+     kdftw k;
+} S;
+
+typedef struct {
+     plan_dftw super;
+     kdftw k;
+     INT r;
+     stride rs;
+     INT m, ms, v, vs, mb, me, extra_iter;
+     stride brs;
+     twid *td;
+     const S *slv;
+} P;
+
+
+/*************************************************************
+  Nonbuffered code
+ *************************************************************/
+static void apply(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     INT i;
+     ASSERT_ALIGNED_DOUBLE;
+     for (i = 0; i < ego->v; ++i, rio += ego->vs, iio += ego->vs) {
+	  INT  mb = ego->mb, ms = ego->ms;
+	  ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, 
+		 ego->rs, mb, ego->me, ms);
+     }
+}
+
+static void apply_extra_iter(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     INT i, v = ego->v, vs = ego->vs;
+     INT mb = ego->mb, me = ego->me, mm = me - 1, ms = ego->ms;
+     ASSERT_ALIGNED_DOUBLE;
+     for (i = 0; i < v; ++i, rio += vs, iio += vs) {
+	  ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, 
+		 ego->rs, mb, mm, ms);
+	  ego->k(rio + mm*ms, iio + mm*ms, ego->td->W, 
+		 ego->rs, mm, mm+2, 0);
+     }
+}
+
+/*************************************************************
+  Buffered code
+ *************************************************************/
+static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf)
+{
+     INT brs = WS(ego->brs, 1);
+     INT rs = WS(ego->rs, 1);
+     INT ms = ego->ms;
+
+     X(cpy2d_pair_ci)(rA + mb*ms, iA + mb*ms, buf, buf + 1,
+		      ego->r, rs, brs,
+		      me - mb, ms, 2);
+     ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2);
+     X(cpy2d_pair_co)(buf, buf + 1, rA + mb*ms, iA + mb*ms,
+		      ego->r, brs, rs,
+		      me - mb, 2, ms);
+}
+
+/* must be even for SIMD alignment; should not be 2^k to avoid
+   associativity conflicts */
+static INT compute_batchsize(INT radix)
+{
+     /* round up to multiple of 4 */
+     radix += 3;
+     radix &= -4;
+
+     return (radix + 2);
+}
+
+static void apply_buf(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     INT i, j, v = ego->v, r = ego->r;
+     INT batchsz = compute_batchsize(r);
+     R *buf;
+     INT mb = ego->mb, me = ego->me;
+     size_t bufsz = r * batchsz * 2 * sizeof(R);
+
+     BUF_ALLOC(R *, buf, bufsz);
+
+     for (i = 0; i < v; ++i, rio += ego->vs, iio += ego->vs) {
+	  for (j = mb; j + batchsz < me; j += batchsz) 
+	       dobatch(ego, rio, iio, j, j + batchsz, buf);
+
+	  dobatch(ego, rio, iio, j, me, buf);
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+/*************************************************************
+  common code
+ *************************************************************/
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
+		      ego->r * ego->m, ego->r, ego->m + ego->extra_iter);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->brs);
+     X(stride_destroy)(ego->rs);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *slv = ego->slv;
+     const ct_desc *e = slv->desc;
+
+     if (slv->bufferedp)
+	  p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")",
+		   compute_batchsize(ego->r), ego->r,
+		   X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
+     else
+	  p->print(p, "(dftw-direct-%D/%D%v \"%s\")",
+		   ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
+}
+
+static int applicable0(const S *ego,
+		       INT r, INT irs, INT ors,
+		       INT m, INT ms,
+		       INT v, INT ivs, INT ovs,
+		       INT mb, INT me,
+		       R *rio, R *iio,
+		       const planner *plnr, INT *extra_iter)
+{
+     const ct_desc *e = ego->desc;
+     UNUSED(v);
+
+     return (
+	  1
+	  && r == e->radix
+	  && irs == ors /* in-place along R */
+	  && ivs == ovs /* in-place along V */
+
+	  /* check for alignment/vector length restrictions */
+	  && ((*extra_iter = 0,
+	       e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr))
+	      ||
+	      (*extra_iter = 1,
+	       (1
+		/* FIXME: require full array, otherwise some threads
+		   may be extra_iter and other threads won't be.
+		   Generating the proper twiddle factors is a pain in
+		   this case */
+		&& mb == 0 && me == m
+		&& e->genus->okp(e, rio, iio, irs, ivs,
+				 m, mb, me - 1, ms, plnr)
+		&& e->genus->okp(e, rio, iio, irs, ivs,
+				 m, me - 1, me + 1, ms, plnr))))
+
+	  && (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs,
+			    m, mb, me - *extra_iter, ms, plnr))
+
+	  );
+}
+
+static int applicable0_buf(const S *ego,
+			   INT r, INT irs, INT ors,
+			   INT m, INT ms,
+			   INT v, INT ivs, INT ovs,
+			   INT mb, INT me,
+			   R *rio, R *iio,
+			   const planner *plnr)
+{
+     const ct_desc *e = ego->desc;
+     INT batchsz;
+     UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio);
+
+     return (
+	  1
+	  && r == e->radix
+	  && irs == ors /* in-place along R */
+	  && ivs == ovs /* in-place along V */
+
+	  /* check for alignment/vector length restrictions, both for
+	     batchsize and for the remainder */
+	  && (batchsz = compute_batchsize(r), 1)
+	  && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
+			    m, mb, mb + batchsz, 2, plnr))
+	  && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
+			    m, mb, me, 2, plnr))
+	  );
+}
+
+static int applicable(const S *ego,
+		      INT r, INT irs, INT ors,
+		      INT m, INT ms,
+		      INT v, INT ivs, INT ovs,
+		      INT mb, INT me,
+		      R *rio, R *iio,
+		      const planner *plnr, INT *extra_iter)
+{
+     if (ego->bufferedp) {
+	  *extra_iter = 0;
+	  if (!applicable0_buf(ego,
+			       r, irs, ors, m, ms, v, ivs, ovs, mb, me,
+			       rio, iio, plnr))
+	       return 0;
+     } else {
+	  if (!applicable0(ego,
+			   r, irs, ors, m, ms, v, ivs, ovs, mb, me,
+			   rio, iio, plnr, extra_iter))
+	       return 0;
+     }
+
+     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
+				       v, m * r, r))
+	  return 0;
+
+     if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr))
+	  return 0;
+
+     return 1;
+}
+
+static plan *mkcldw(const ct_solver *ego_,
+		    INT r, INT irs, INT ors,
+		    INT m, INT ms,
+		    INT v, INT ivs, INT ovs,
+		    INT mstart, INT mcount,
+		    R *rio, R *iio,
+		    planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const ct_desc *e = ego->desc;
+     INT extra_iter;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     A(mstart >= 0 && mstart + mcount <= m);
+     if (!applicable(ego,
+		     r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
+		     rio, iio, plnr, &extra_iter))
+          return (plan *)0;
+
+     if (ego->bufferedp) {
+	  pln = MKPLAN_DFTW(P, &padt, apply_buf);
+     } else {
+	  pln = MKPLAN_DFTW(P, &padt, extra_iter ? apply_extra_iter : apply);
+     }
+
+     pln->k = ego->k;
+     pln->rs = X(mkstride)(r, irs);
+     pln->td = 0;
+     pln->r = r;
+     pln->m = m;
+     pln->ms = ms;
+     pln->v = v;
+     pln->vs = ivs;
+     pln->mb = mstart;
+     pln->me = mstart + mcount;
+     pln->slv = ego;
+     pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
+     pln->extra_iter = extra_iter;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(v * (mcount/e->genus->vl), &e->ops, &pln->super.super.ops);
+
+     if (ego->bufferedp) {
+	  /* 8 load/stores * N * V */
+	  pln->super.super.ops.other += 8 * r * mcount * v;
+     }
+
+     pln->super.super.could_prune_now_p =
+	  (!ego->bufferedp && r >= 5 && r < 64 && m >= r);
+     return &(pln->super.super);
+}
+
+static void regone(planner *plnr, kdftw codelet,
+		   const ct_desc *desc, int dec, int bufferedp)
+{
+     S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
+     slv->k = codelet;
+     slv->desc = desc;
+     slv->bufferedp = bufferedp;
+     REGISTER_SOLVER(plnr, &(slv->super.super));
+     if (X(mksolver_ct_hook)) {
+	  slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix,
+					 dec, mkcldw, 0);
+	  slv->k = codelet;
+	  slv->desc = desc;
+	  slv->bufferedp = bufferedp;
+	  REGISTER_SOLVER(plnr, &(slv->super.super));
+     }
+}
+
+void X(regsolver_ct_directw)(planner *plnr, kdftw codelet,
+			     const ct_desc *desc, int dec)
+{
+     regone(plnr, codelet, desc, dec, /* bufferedp */ 0);
+     regone(plnr, codelet, desc, dec, /* bufferedp */ 1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/dftw-directsq.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/dftw-directsq.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+typedef struct {
+     ct_solver super;
+     const ct_desc *desc;
+     kdftwsq k;
+} S;
+
+typedef struct {
+     plan_dftw super;
+     kdftwsq k;
+     INT r;
+     stride rs, vs;
+     INT m, ms, v, mb, me;
+     twid *td;
+     const S *slv;
+} P;
+
+
+static void apply(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     INT mb = ego->mb, ms = ego->ms;
+     ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, ego->rs, ego->vs,
+	    mb, ego->me, ms);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
+		      ego->r * ego->m, ego->r, ego->m);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->rs);
+     X(stride_destroy)(ego->vs);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *slv = ego->slv;
+     const ct_desc *e = slv->desc;
+
+     p->print(p, "(dftw-directsq-%D/%D%v \"%s\")",
+	      ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
+}
+
+static int applicable(const S *ego,
+		      INT r, INT irs, INT ors,
+		      INT m, INT ms,
+		      INT v, INT ivs, INT ovs,
+		      INT mb, INT me,
+		      R *rio, R *iio,
+		      const planner *plnr)
+{
+     const ct_desc *e = ego->desc;
+     UNUSED(v);
+
+     return (
+	  1
+	  && r == e->radix
+
+	  /* transpose r, v */
+	  && r == v
+	  && irs == ovs
+	  && ivs == ors
+
+	  /* check for alignment/vector length restrictions */
+	  && e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr)
+
+	  );
+}
+
+static plan *mkcldw(const ct_solver *ego_,
+		    INT r, INT irs, INT ors,
+		    INT m, INT ms,
+		    INT v, INT ivs, INT ovs,
+		    INT mstart, INT mcount,
+		    R *rio, R *iio,
+		    planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const ct_desc *e = ego->desc;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     A(mstart >= 0 && mstart + mcount <= m);
+     if (!applicable(ego,
+		     r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
+		     rio, iio, plnr))
+          return (plan *)0;
+
+     pln = MKPLAN_DFTW(P, &padt, apply);
+
+     pln->k = ego->k;
+     pln->rs = X(mkstride)(r, irs);
+     pln->vs = X(mkstride)(v, ivs);
+     pln->td = 0;
+     pln->r = r;
+     pln->m = m;
+     pln->ms = ms;
+     pln->v = v;
+     pln->mb = mstart;
+     pln->me = mstart + mcount;
+     pln->slv = ego;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(mcount/e->genus->vl, &e->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+static void regone(planner *plnr, kdftwsq codelet,
+		   const ct_desc *desc, int dec)
+{
+     S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
+     slv->k = codelet;
+     slv->desc = desc;
+     REGISTER_SOLVER(plnr, &(slv->super.super));
+     if (X(mksolver_ct_hook)) {
+	  slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix, dec,
+					 mkcldw, 0);
+	  slv->k = codelet;
+	  slv->desc = desc;
+	  REGISTER_SOLVER(plnr, &(slv->super.super));
+     }
+}
+
+void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet,
+			       const ct_desc *desc, int dec)
+{
+     regone(plnr, codelet, desc, dec+TRANSPOSE);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/dftw-generic.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/dftw-generic.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* express a twiddle problem in terms of dft + multiplication by
+   twiddle factors */
+
+#include "ct.h"
+
+typedef ct_solver S;
+
+typedef struct {
+     plan_dftw super;
+
+     INT r, rs, m, mb, me, ms, v, vs;
+
+     plan *cld;
+
+     twid *td;
+
+     const S *slv;
+     int dec;
+} P;
+
+static void mktwiddle(P *ego, enum wakefulness wakefulness)
+{
+     static const tw_instr tw[] = { { TW_FULL, 0, 0 }, { TW_NEXT, 1, 0 } };
+
+     /* note that R and M are swapped, to allow for sequential
+	access both to data and twiddles */
+     X(twiddle_awake)(wakefulness, &ego->td, tw,
+		      ego->r * ego->m, ego->m, ego->r);
+}
+
+static void bytwiddle(const P *ego, R *rio, R *iio)
+{
+     INT iv, ir, im;
+     INT r = ego->r, rs = ego->rs;
+     INT m = ego->m, mb = ego->mb, me = ego->me, ms = ego->ms;
+     INT v = ego->v, vs = ego->vs;
+     const R *W = ego->td->W;
+
+     mb += (mb == 0); /* skip m=0 iteration */
+     for (iv = 0; iv < v; ++iv) {
+	  for (ir = 1; ir < r; ++ir) {
+	       for (im = mb; im < me; ++im) {
+		    R *pr = rio + ms * im + rs * ir;
+		    R *pi = iio + ms * im + rs * ir;
+		    E xr = *pr;
+		    E xi = *pi;
+		    E wr = W[2 * im + (2 * (m-1)) * ir - 2];
+		    E wi = W[2 * im + (2 * (m-1)) * ir - 1];
+		    *pr = xr * wr + xi * wi;
+		    *pi = xi * wr - xr * wi;
+	       }
+	  }
+	  rio += vs;
+	  iio += vs;
+     }
+}
+
+static int applicable(INT irs, INT ors, INT ivs, INT ovs,
+		      const planner *plnr)
+{
+     return (1
+	     && irs == ors
+	     && ivs == ovs
+	     && !NO_SLOWP(plnr)
+	  );
+}
+
+static void apply_dit(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     INT dm = ego->ms * ego->mb;
+
+     bytwiddle(ego, rio, iio);
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, rio + dm, iio + dm, rio + dm, iio + dm);
+}
+
+static void apply_dif(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     INT dm = ego->ms * ego->mb;
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, rio + dm, iio + dm, rio + dm, iio + dm);
+
+     bytwiddle(ego, rio, iio);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     mktwiddle(ego, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dftw-generic-%s-%D-%D%v%(%p%))",
+	      ego->dec == DECDIT ? "dit" : "dif",
+	      ego->r, ego->m, ego->v, ego->cld);
+}
+
+static plan *mkcldw(const ct_solver *ego_,
+		    INT r, INT irs, INT ors,
+		    INT m, INT ms,
+		    INT v, INT ivs, INT ovs,
+		    INT mstart, INT mcount,
+		    R *rio, R *iio,
+		    planner *plnr)
+{
+     const S *ego = (const S *)ego_;
+     P *pln;
+     plan *cld = 0;
+     INT dm = ms * mstart;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     A(mstart >= 0 && mstart + mcount <= m);
+     if (!applicable(irs, ors, ivs, ovs, plnr))
+          return (plan *)0;
+
+     cld = X(mkplan_d)(plnr,
+			X(mkproblem_dft_d)(
+			     X(mktensor_1d)(r, irs, irs),
+			     X(mktensor_2d)(mcount, ms, ms, v, ivs, ivs),
+			     rio + dm, iio + dm, rio + dm, iio + dm)
+			);
+     if (!cld) goto nada;
+
+     pln = MKPLAN_DFTW(P, &padt, ego->dec == DECDIT ? apply_dit : apply_dif);
+     pln->slv = ego;
+     pln->cld = cld;
+     pln->r = r;
+     pln->rs = irs;
+     pln->m = m;
+     pln->ms = ms;
+     pln->v = v;
+     pln->vs = ivs;
+     pln->mb = mstart;
+     pln->me = mstart + mcount;
+     pln->dec = ego->dec;
+     pln->td = 0;
+
+     {
+	  double n0 = (r - 1) * (mcount - 1) * v;
+	  pln->super.super.ops = cld->ops;
+	  pln->super.super.ops.mul += 8 * n0;
+	  pln->super.super.ops.add += 4 * n0;
+	  pln->super.super.ops.other += 8 * n0;
+     }
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static void regsolver(planner *plnr, INT r, int dec)
+{
+     S *slv = (S *)X(mksolver_ct)(sizeof(S), r, dec, mkcldw, 0);
+     REGISTER_SOLVER(plnr, &(slv->super));
+     if (X(mksolver_ct_hook)) {
+	  slv = (S *)X(mksolver_ct_hook)(sizeof(S), r, dec, mkcldw, 0);
+	  REGISTER_SOLVER(plnr, &(slv->super));
+     }
+}
+
+void X(ct_generic_register)(planner *p)
+{
+     regsolver(p, 0, DECDIT);
+     regsolver(p, 0, DECDIF);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/dftw-genericbuf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/dftw-genericbuf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* express a twiddle problem in terms of dft + multiplication by
+   twiddle factors */
+
+#include "ct.h"
+
+typedef struct {
+     ct_solver super;
+     INT batchsz;
+} S;
+
+typedef struct {
+     plan_dftw super;
+
+     INT r, rs, m, ms, v, vs, mb, me;
+     INT batchsz;
+     plan *cld;
+
+     triggen *t;
+     const S *slv;
+} P;
+
+
+#define BATCHDIST(r) ((r) + 16)
+
+/**************************************************************/
+static void bytwiddle(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
+{
+     INT j, k;
+     INT r = ego->r, rs = ego->rs, ms = ego->ms;
+     triggen *t = ego->t;
+     for (j = 0; j < r; ++j) {
+	  for (k = mb; k < me; ++k)
+	       t->rotate(t, j * k,
+			 rio[j * rs + k * ms],
+			 iio[j * rs + k * ms],
+			 &buf[j * 2 + 2 * BATCHDIST(r) * (k - mb) + 0]);
+     }
+}
+
+static int applicable0(const S *ego,
+		       INT r, INT irs, INT ors,
+		       INT m, INT v,
+		       INT mcount)
+{
+     return (1
+	     && v == 1
+	     && irs == ors
+	     && mcount >= ego->batchsz
+	     && mcount % ego->batchsz == 0
+	     && r >= 64 
+	     && m >= r
+	  );
+}
+
+static int applicable(const S *ego,
+		      INT r, INT irs, INT ors,
+		      INT m, INT v,
+		      INT mcount,
+		      const planner *plnr)
+{
+     if (!applicable0(ego, r, irs, ors, m, v, mcount))
+	  return 0;
+     if (NO_UGLYP(plnr) && m * r < 65536)
+	  return 0;
+
+     return 1;
+}
+
+static void dobatch(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
+{
+     plan_dft *cld;
+     INT ms = ego->ms;
+
+     bytwiddle(ego, mb, me, buf, rio, iio);
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, buf, buf + 1, buf, buf + 1);
+     X(cpy2d_pair_co)(buf, buf + 1,
+		      rio + ms * mb, iio + ms * mb,
+		      me-mb, 2 * BATCHDIST(ego->r), ms,
+		      ego->r, 2, ego->rs);
+}
+
+static void apply(const plan *ego_, R *rio, R *iio)
+{
+     const P *ego = (const P *) ego_;
+     R *buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(ego->r) * ego->batchsz,
+			   BUFFERS);
+     INT m;
+
+     for (m = ego->mb; m < ego->me; m += ego->batchsz)
+	  dobatch(ego, m, m + ego->batchsz, buf, rio, iio);
+
+     A(m == ego->me);
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+
+     switch (wakefulness) {
+	 case SLEEPY:
+	      X(triggen_destroy)(ego->t); ego->t = 0;
+	      break;
+	 default:
+	      ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
+	      break;
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dftw-genericbuf/%D-%D-%D%(%p%))",
+	      ego->batchsz, ego->r, ego->m, ego->cld);
+}
+
+static plan *mkcldw(const ct_solver *ego_,
+		    INT r, INT irs, INT ors,
+		    INT m, INT ms,
+		    INT v, INT ivs, INT ovs,
+		    INT mstart, INT mcount,
+		    R *rio, R *iio,
+		    planner *plnr)
+{
+     const S *ego = (const S *)ego_;
+     P *pln;
+     plan *cld = 0;
+     R *buf;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+     
+     UNUSED(ivs); UNUSED(ovs); UNUSED(rio); UNUSED(iio);
+
+     A(mstart >= 0 && mstart + mcount <= m);
+     if (!applicable(ego, r, irs, ors, m, v, mcount, plnr))
+          return (plan *)0;
+
+     buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(r) * ego->batchsz, BUFFERS);
+     cld = X(mkplan_d)(plnr,
+			X(mkproblem_dft_d)(
+			     X(mktensor_1d)(r, 2, 2),
+			     X(mktensor_1d)(ego->batchsz,
+					    2 * BATCHDIST(r),
+					    2 * BATCHDIST(r)),
+			     buf, buf + 1, buf, buf + 1
+			     )
+			);
+     X(ifree)(buf);
+     if (!cld) goto nada;
+
+     pln = MKPLAN_DFTW(P, &padt, apply);
+     pln->slv = ego;
+     pln->cld = cld;
+     pln->r = r;
+     pln->m = m;
+     pln->ms = ms;
+     pln->rs = irs;
+     pln->batchsz = ego->batchsz;
+     pln->mb = mstart;
+     pln->me = mstart + mcount;
+
+     {
+	  double n0 = (r - 1) * (mcount - 1);
+	  pln->super.super.ops = cld->ops;
+	  pln->super.super.ops.mul += 8 * n0;
+	  pln->super.super.ops.add += 4 * n0;
+	  pln->super.super.ops.other += 8 * n0;
+     }
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static void regsolver(planner *plnr, INT r, INT batchsz)
+{
+     S *slv = (S *)X(mksolver_ct)(sizeof(S), r, DECDIT, mkcldw, 0);
+     slv->batchsz = batchsz;
+     REGISTER_SOLVER(plnr, &(slv->super.super));
+
+     if (X(mksolver_ct_hook)) {
+	  slv = (S *)X(mksolver_ct_hook)(sizeof(S), r, DECDIT, mkcldw, 0);
+	  slv->batchsz = batchsz;
+	  REGISTER_SOLVER(plnr, &(slv->super.super));
+     }
+
+}
+
+void X(ct_genericbuf_register)(planner *p)
+{
+     static const INT radices[] = { -1, -2, -4, -8, -16, -32, -64 };
+     static const INT batchsizes[] = { 4, 8, 16, 32, 64 };
+     unsigned i, j;
+
+     for (i = 0; i < sizeof(radices) / sizeof(radices[0]); ++i)
+	  for (j = 0; j < sizeof(batchsizes) / sizeof(batchsizes[0]); ++j)
+	       regsolver(p, radices[i], batchsizes[j]);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/direct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/direct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* direct DFT solver, if we have a codelet */
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     const kdft_desc *desc;
+     kdft k;
+     int bufferedp;
+} S;
+
+typedef struct {
+     plan_dft super;
+
+     stride is, os, bufstride;
+     INT n, vl, ivs, ovs;
+     kdft k;
+     const S *slv;
+} P;
+
+static void dobatch(const P *ego, R *ri, R *ii, R *ro, R *io, 
+		    R *buf, INT batchsz)
+{
+     X(cpy2d_pair_ci)(ri, ii, buf, buf+1,
+		      ego->n, WS(ego->is, 1), WS(ego->bufstride, 1),
+		      batchsz, ego->ivs, 2);
+     
+     if (IABS(WS(ego->os, 1)) < IABS(ego->ovs)) {
+	  /* transform directly to output */
+	  ego->k(buf, buf+1, ro, io, 
+		 ego->bufstride, ego->os, batchsz, 2, ego->ovs);
+     } else {
+	  /* transform to buffer and copy back */
+	  ego->k(buf, buf+1, buf, buf+1, 
+		 ego->bufstride, ego->bufstride, batchsz, 2, 2);
+	  X(cpy2d_pair_co)(buf, buf+1, ro, io,
+			   ego->n, WS(ego->bufstride, 1), WS(ego->os, 1), 
+			   batchsz, 2, ego->ovs);
+     }
+}
+
+static INT compute_batchsize(INT n)
+{
+     /* round up to multiple of 4 */
+     n += 3;
+     n &= -4;
+
+     return (n + 2);
+}
+
+static void apply_buf(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     R *buf;
+     INT vl = ego->vl, n = ego->n, batchsz = compute_batchsize(n);
+     INT i;
+     size_t bufsz = n * batchsz * 2 * sizeof(R);
+
+     BUF_ALLOC(R *, buf, bufsz);
+
+     for (i = 0; i < vl - batchsz; i += batchsz) {
+	  dobatch(ego, ri, ii, ro, io, buf, batchsz);
+	  ri += batchsz * ego->ivs; ii += batchsz * ego->ivs;
+	  ro += batchsz * ego->ovs; io += batchsz * ego->ovs;
+     }
+     dobatch(ego, ri, ii, ro, io, buf, vl - i);
+
+     BUF_FREE(buf, bufsz);
+}
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(ri, ii, ro, io, ego->is, ego->os, ego->vl, ego->ivs, ego->ovs);
+}
+
+static void apply_extra_iter(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT vl = ego->vl;
+
+     ASSERT_ALIGNED_DOUBLE;
+
+     /* for 4-way SIMD when VL is odd: iterate over an
+	even vector length VL, and then execute the last
+	iteration as a 2-vector with vector stride 0. */
+     ego->k(ri, ii, ro, io, ego->is, ego->os, vl - 1, ego->ivs, ego->ovs);
+
+     ego->k(ri + (vl - 1) * ego->ivs, ii + (vl - 1) * ego->ivs,
+	    ro + (vl - 1) * ego->ovs, io + (vl - 1) * ego->ovs,
+	    ego->is, ego->os, 1, 0, 0);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->is);
+     X(stride_destroy)(ego->os);
+     X(stride_destroy)(ego->bufstride);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+     const kdft_desc *d = s->desc;
+
+     if (ego->slv->bufferedp)
+	  p->print(p, "(dft-directbuf/%D-%D%v \"%s\")", 
+		   compute_batchsize(d->sz), d->sz, ego->vl, d->nam);
+     else
+	  p->print(p, "(dft-direct-%D%v \"%s\")", d->sz, ego->vl, d->nam);
+}
+
+static int applicable_buf(const solver *ego_, const problem *p_,
+			  const planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+     const kdft_desc *d = ego->desc;
+     INT vl;
+     INT ivs, ovs;
+     INT batchsz;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk == 1
+	  && p->sz->dims[0].n == d->sz
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  /* UGLY if IS <= IVS */
+	  && !(NO_UGLYP(plnr) &&
+	       X(iabs)(p->sz->dims[0].is) <= X(iabs)(ivs))
+
+	  && (batchsz = compute_batchsize(d->sz), 1)
+	  && (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
+			    2 * batchsz, p->sz->dims[0].os,
+			    batchsz, 2, ovs, plnr))
+	  && (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
+			    2 * batchsz, p->sz->dims[0].os,
+			    vl % batchsz, 2, ovs, plnr))
+
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->ri != p->ro
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+
+	      /* can do it if the problem fits in the buffer, no matter
+		 what the strides are */
+	      || vl <= batchsz
+	       )
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr, int *extra_iterp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+     const kdft_desc *d = ego->desc;
+     INT vl;
+     INT ivs, ovs;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk <= 1
+	  && p->sz->dims[0].n == d->sz
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  && ((*extra_iterp = 0,
+	       (d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
+			      p->sz->dims[0].is, p->sz->dims[0].os,
+			      vl, ivs, ovs, plnr)))
+	      ||
+	      (*extra_iterp = 1,
+	       ((d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
+			       p->sz->dims[0].is, p->sz->dims[0].os,
+			       vl - 1, ivs, ovs, plnr))
+		&&
+		(d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
+			       p->sz->dims[0].is, p->sz->dims[0].os,
+			       2, 0, 0, plnr)))))
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->ri != p->ro
+
+	      /* can always compute one transform */
+	      || vl == 1
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+	       )
+	  );
+}
+
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const problem_dft *p;
+     iodim *d;
+     const kdft_desc *e = ego->desc;
+
+     static const plan_adt padt = {
+	  X(dft_solve), X(null_awake), print, destroy
+     };
+
+     UNUSED(plnr);
+
+     if (ego->bufferedp) {
+	  if (!applicable_buf(ego_, p_, plnr))
+	       return (plan *)0;
+	  pln = MKPLAN_DFT(P, &padt, apply_buf);
+     } else {
+	  int extra_iterp = 0;
+	  if (!applicable(ego_, p_, plnr, &extra_iterp))
+	       return (plan *)0;
+	  pln = MKPLAN_DFT(P, &padt, extra_iterp ? apply_extra_iter : apply);
+     }
+
+     p = (const problem_dft *) p_;
+     d = p->sz->dims;
+     pln->k = ego->k;
+     pln->n = d[0].n;
+     pln->is = X(mkstride)(pln->n, d[0].is);
+     pln->os = X(mkstride)(pln->n, d[0].os);
+     pln->bufstride = X(mkstride)(pln->n, 2 * compute_batchsize(pln->n));
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     pln->slv = ego;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl / e->genus->vl, &e->ops, &pln->super.super.ops);
+
+     if (ego->bufferedp) 
+	  pln->super.super.ops.other += 4 * pln->n * pln->vl;
+
+     pln->super.super.could_prune_now_p = !ego->bufferedp;
+     return &(pln->super.super);
+}
+
+static solver *mksolver(kdft k, const kdft_desc *desc, int bufferedp)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->k = k;
+     slv->desc = desc;
+     slv->bufferedp = bufferedp;
+     return &(slv->super);
+}
+
+solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc)
+{
+     return mksolver(k, desc, 0);
+}
+
+solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc)
+{
+     return mksolver(k, desc, 1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/generic.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/generic.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_dft super;
+     twid *td;
+     INT n, is, os;
+} P;
+
+
+static void cdot(INT n, const E *x, const R *w, 
+		 R *or0, R *oi0, R *or1, R *oi1)
+{
+     INT i;
+
+     E rr = x[0], ri = 0, ir = x[1], ii = 0; 
+     x += 2;
+     for (i = 1; i + i < n; ++i) {
+	  rr += x[0] * w[0];
+	  ir += x[1] * w[0];
+	  ri += x[2] * w[1];
+	  ii += x[3] * w[1];
+	  x += 4; w += 2;
+     }
+     *or0 = rr + ii;
+     *oi0 = ir - ri;
+     *or1 = rr - ii;
+     *oi1 = ir + ri;
+}
+
+static void hartley(INT n, const R *xr, const R *xi, INT xs, E *o,
+		    R *pr, R *pi)
+{
+     INT i;
+     E sr, si;
+     o[0] = sr = xr[0]; o[1] = si = xi[0]; o += 2;
+     for (i = 1; i + i < n; ++i) {
+	  sr += (o[0] = xr[i * xs] + xr[(n - i) * xs]);
+	  si += (o[1] = xi[i * xs] + xi[(n - i) * xs]);
+	  o[2] = xr[i * xs] - xr[(n - i) * xs];
+	  o[3] = xi[i * xs] - xi[(n - i) * xs];
+	  o += 4;
+     }
+     *pr = sr;
+     *pi = si;
+}
+		    
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT i;
+     INT n = ego->n, is = ego->is, os = ego->os;
+     const R *W = ego->td->W;
+     E *buf;
+     size_t bufsz = n * 2 * sizeof(E);
+
+     BUF_ALLOC(E *, buf, bufsz);
+     hartley(n, ri, ii, is, buf, ro, io);
+
+     for (i = 1; i + i < n; ++i) {
+	  cdot(n, buf, W,
+	       ro + i * os, io + i * os,
+	       ro + (n - i) * os, io + (n - i) * os);
+	  W += n - 1;
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr half_tw[] = {
+	  { TW_HALF, 1, 0 },
+	  { TW_NEXT, 1, 0 }
+     };
+
+     X(twiddle_awake)(wakefulness, &ego->td, half_tw, ego->n, ego->n,
+		      (ego->n - 1) / 2);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+
+     p->print(p, "(dft-generic-%D)", ego->n);
+}
+
+static int applicable(const solver *ego, const problem *p_, 
+		      const planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     UNUSED(ego);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && (p->sz->dims[0].n % 2) == 1 
+	     && CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
+	     && X(is_prime)(p->sz->dims[0].n)
+	  );
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_dft *p;
+     P *pln;
+     INT n;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, X(plan_null_destroy)
+     };
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *)0;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     p = (const problem_dft *) p_;
+     pln->n = n = p->sz->dims[0].n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->td = 0;
+
+     pln->super.super.ops.add = (n-1) * 5;
+     pln->super.super.ops.mul = 0;
+     pln->super.super.ops.fma = (n-1) * (n-1) ;
+#if 0 /* these are nice pipelined sequential loads and should cost nothing */
+     pln->super.super.ops.other = (n-1)*(4 + 1 + 2 * (n-1));  /* approximate */
+#endif
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(dft_generic_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/indirect-transpose.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/indirect-transpose.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* solvers/plans for vectors of DFTs corresponding to the columns
+   of a matrix: first transpose the matrix so that the DFTs are
+   contiguous, then do DFTs with transposed output.   In particular,
+   we restrict ourselves to the case of a square transpose (or a
+   sequence thereof). */
+
+#include "dft.h"
+
+typedef solver S;
+
+typedef struct {
+     plan_dft super;
+     INT vl, ivs, ovs;
+     plan *cldtrans, *cld, *cldrest;
+} P;
+
+/* initial transpose is out-of-place from input to output */
+static void apply_op(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT vl = ego->vl, ivs = ego->ivs, ovs = ego->ovs, i;
+
+     for (i = 0; i < vl; ++i) {
+	  {
+	       plan_dft *cldtrans = (plan_dft *) ego->cldtrans;
+	       cldtrans->apply(ego->cldtrans, ri, ii, ro, io);
+	  }
+	  {
+	       plan_dft *cld = (plan_dft *) ego->cld;
+	       cld->apply(ego->cld, ro, io, ro, io);
+	  }
+	  ri += ivs; ii += ivs;
+	  ro += ovs; io += ovs;
+     }
+     {
+	  plan_dft *cldrest = (plan_dft *) ego->cldrest;
+	  cldrest->apply(ego->cldrest, ri, ii, ro, io);
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldrest);
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldtrans);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldtrans, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldrest, wakefulness);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(indirect-transpose%v%(%p%)%(%p%)%(%p%))", 
+	      ego->vl, ego->cldtrans, ego->cld, ego->cldrest);
+}
+
+static int pickdim(const tensor *vs, const tensor *s, int *pdim0, int *pdim1)
+{
+     int dim0, dim1;
+     *pdim0 = *pdim1 = -1;
+     for (dim0 = 0; dim0 < vs->rnk; ++dim0)
+          for (dim1 = 0; dim1 < s->rnk; ++dim1) 
+	       if (vs->dims[dim0].n * X(iabs)(vs->dims[dim0].is) <= X(iabs)(s->dims[dim1].is)
+		   && vs->dims[dim0].n >= s->dims[dim1].n
+		   && (*pdim0 == -1 
+		       || (X(iabs)(vs->dims[dim0].is) <= X(iabs)(vs->dims[*pdim0].is)
+			   && X(iabs)(s->dims[dim1].is) >= X(iabs)(s->dims[*pdim1].is)))) {
+		    *pdim0 = dim0;
+		    *pdim1 = dim1;
+	       }
+     return (*pdim0 != -1 && *pdim1 != -1);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr,
+		       int *pdim0, int *pdim1)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     UNUSED(ego_); UNUSED(plnr);
+
+     return (1
+	     && FINITE_RNK(p->vecsz->rnk) && FINITE_RNK(p->sz->rnk)
+
+	     /* FIXME: can/should we relax this constraint? */
+	     && X(tensor_inplace_strides2)(p->vecsz, p->sz)
+
+	     && pickdim(p->vecsz, p->sz, pdim0, pdim1)
+
+	     /* output should not *already* include the transpose
+		(in which case we duplicate the regular indirect.c) */
+	     && (p->sz->dims[*pdim1].os != p->vecsz->dims[*pdim0].is)
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr,
+		      int *pdim0, int *pdim1)
+{
+     if (!applicable0(ego_, p_, plnr, pdim0, pdim1)) return 0;
+     {
+          const problem_dft *p = (const problem_dft *) p_;
+	  INT u = p->ri == p->ii + 1 || p->ii == p->ri + 1 ? (INT)2 : (INT)1;
+
+	  /* UGLY if does not result in contiguous transforms or
+	     transforms of contiguous vectors (since the latter at
+	     least have efficient transpositions) */
+	  if (NO_UGLYP(plnr)
+	      && p->vecsz->dims[*pdim0].is != u
+	      && !(p->vecsz->rnk == 2
+		   && p->vecsz->dims[1-*pdim0].is == u
+		   && p->vecsz->dims[*pdim0].is
+		      == u * p->vecsz->dims[1-*pdim0].n))
+	       return 0;
+
+	  if (NO_INDIRECT_OP_P(plnr) && p->ri != p->ro) return 0;
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     P *pln;
+     plan *cld = 0, *cldtrans = 0, *cldrest = 0;
+     int pdim0, pdim1;
+     tensor *ts, *tv;
+     INT vl, ivs, ovs;
+     R *rit, *iit, *rot, *iot;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &pdim0, &pdim1))
+          return (plan *) 0;
+
+     vl = p->vecsz->dims[pdim0].n / p->sz->dims[pdim1].n;
+     A(vl >= 1);
+     ivs = p->sz->dims[pdim1].n * p->vecsz->dims[pdim0].is;
+     ovs = p->sz->dims[pdim1].n * p->vecsz->dims[pdim0].os;
+     rit = TAINT(p->ri, vl == 1 ? 0 : ivs);
+     iit = TAINT(p->ii, vl == 1 ? 0 : ivs);
+     rot = TAINT(p->ro, vl == 1 ? 0 : ovs);
+     iot = TAINT(p->io, vl == 1 ? 0 : ovs);
+
+     ts = X(tensor_copy_inplace)(p->sz, INPLACE_IS);
+     ts->dims[pdim1].os = p->vecsz->dims[pdim0].is;
+     tv = X(tensor_copy_inplace)(p->vecsz, INPLACE_IS);
+     tv->dims[pdim0].os = p->sz->dims[pdim1].is;
+     tv->dims[pdim0].n = p->sz->dims[pdim1].n;
+     cldtrans = X(mkplan_d)(plnr, 
+			    X(mkproblem_dft_d)(X(mktensor_0d)(),
+					       X(tensor_append)(tv, ts),
+					       rit, iit, 
+					       rot, iot));
+     X(tensor_destroy2)(ts, tv);
+     if (!cldtrans) goto nada;
+
+     ts = X(tensor_copy)(p->sz);
+     ts->dims[pdim1].is = p->vecsz->dims[pdim0].is;
+     tv = X(tensor_copy)(p->vecsz);
+     tv->dims[pdim0].is = p->sz->dims[pdim1].is;
+     tv->dims[pdim0].n = p->sz->dims[pdim1].n;
+     cld = X(mkplan_d)(plnr, X(mkproblem_dft_d)(ts, tv,
+						rot, iot,
+						rot, iot));
+     if (!cld) goto nada;
+
+     tv = X(tensor_copy)(p->vecsz);
+     tv->dims[pdim0].n -= vl * p->sz->dims[pdim1].n;
+     cldrest = X(mkplan_d)(plnr, X(mkproblem_dft_d)(X(tensor_copy)(p->sz), tv,
+						    p->ri + ivs * vl,
+						    p->ii + ivs * vl,
+						    p->ro + ovs * vl,
+						    p->io + ovs * vl));
+     if (!cldrest) goto nada;
+
+     pln = MKPLAN_DFT(P, &padt, apply_op);
+     pln->cldtrans = cldtrans;
+     pln->cld = cld;
+     pln->cldrest = cldrest;
+     pln->vl = vl;
+     pln->ivs = ivs;
+     pln->ovs = ovs;
+     X(ops_cpy)(&cldrest->ops, &pln->super.super.ops);
+     X(ops_madd2)(vl, &cld->ops, &pln->super.super.ops);
+     X(ops_madd2)(vl, &cldtrans->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldrest);
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldtrans);
+     return (plan *)0;
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return slv;
+}
+
+void X(dft_indirect_transpose_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/indirect.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/indirect.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+/* solvers/plans for vectors of small DFT's that cannot be done
+   in-place directly.  Use a rank-0 plan to rearrange the data
+   before or after the transform.  Can also change an out-of-place
+   plan into a copy + in-place (where the in-place transform
+   is e.g. unit stride). */
+
+/* FIXME: merge with rank-geq2.c(?), since this is just a special case
+   of a rank split where the first/second transform has rank 0. */
+
+#include "dft.h"
+
+typedef problem *(*mkcld_t) (const problem_dft *p);
+
+typedef struct {
+     dftapply apply;
+     problem *(*mkcld)(const problem_dft *p);
+     const char *nam;
+} ndrct_adt;
+
+typedef struct {
+     solver super;
+     const ndrct_adt *adt;
+} S;
+
+typedef struct {
+     plan_dft super;
+     plan *cldcpy, *cld;
+     const S *slv;
+} P;
+
+/*-----------------------------------------------------------------------*/
+/* first rearrange, then transform */
+static void apply_before(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+          plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
+          cldcpy->apply(ego->cldcpy, ri, ii, ro, io);
+     }
+     {
+          plan_dft *cld = (plan_dft *) ego->cld;
+          cld->apply(ego->cld, ro, io, ro, io);
+     }
+}
+
+static problem *mkcld_before(const problem_dft *p)
+{
+     return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
+			       X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
+			       p->ro, p->io, p->ro, p->io);
+}
+
+static const ndrct_adt adt_before =
+{
+     apply_before, mkcld_before, "dft-indirect-before"
+};
+
+/*-----------------------------------------------------------------------*/
+/* first transform, then rearrange */
+
+static void apply_after(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+          plan_dft *cld = (plan_dft *) ego->cld;
+          cld->apply(ego->cld, ri, ii, ri, ii);
+     }
+     {
+          plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
+          cldcpy->apply(ego->cldcpy, ri, ii, ro, io);
+     }
+}
+
+static problem *mkcld_after(const problem_dft *p)
+{
+     return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
+			       X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
+			       p->ri, p->ii, p->ri, p->ii);
+}
+
+static const ndrct_adt adt_after =
+{
+     apply_after, mkcld_after, "dft-indirect-after"
+};
+
+/*-----------------------------------------------------------------------*/
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldcpy);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldcpy, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+     p->print(p, "(%s%(%p%)%(%p%))", s->adt->nam, ego->cld, ego->cldcpy);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+     return (1
+	     && FINITE_RNK(p->vecsz->rnk)
+
+	     /* problem must be a nontrivial transform, not just a copy */
+	     && p->sz->rnk > 0
+
+	     && (0
+
+		 /* problem must be in-place & require some
+		    rearrangement of the data; to prevent
+		    infinite loops with indirect-transpose, we
+		    further require that at least some transform
+		    strides must decrease */
+		 || (p->ri == p->ro
+		     && !X(tensor_inplace_strides2)(p->sz, p->vecsz)
+		     && X(tensor_strides_decrease)(
+			  p->sz, p->vecsz,
+			  ego->adt->apply == apply_after ? 
+			  INPLACE_IS : INPLACE_OS))
+
+		 /* or problem must be out of place, transforming
+		    from stride 1/2 to bigger stride, for apply_after */
+		 || (p->ri != p->ro && ego->adt->apply == apply_after
+		     && !NO_DESTROY_INPUTP(plnr)
+		     && X(tensor_min_istride)(p->sz) <= 2
+		     && X(tensor_min_ostride)(p->sz) > 2)
+			  
+		 /* or problem must be out of place, transforming
+		    to stride 1/2 from bigger stride, for apply_before */
+		 || (p->ri != p->ro && ego->adt->apply == apply_before
+		     && X(tensor_min_ostride)(p->sz) <= 2
+		     && X(tensor_min_istride)(p->sz) > 2)
+		  )
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr)
+{
+     if (!applicable0(ego_, p_, plnr)) return 0;
+     {
+          const problem_dft *p = (const problem_dft *) p_;
+	  if (NO_INDIRECT_OP_P(plnr) && p->ri != p->ro) return 0;
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     const S *ego = (const S *) ego_;
+     P *pln;
+     plan *cld = 0, *cldcpy = 0;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *) 0;
+
+     cldcpy =
+	  X(mkplan_d)(plnr, 
+		      X(mkproblem_dft_d)(X(mktensor_0d)(),
+					 X(tensor_append)(p->vecsz, p->sz),
+					 p->ri, p->ii, p->ro, p->io));
+
+     if (!cldcpy) goto nada;
+
+     cld = X(mkplan_f_d)(plnr, ego->adt->mkcld(p), NO_BUFFERING, 0, 0);
+     if (!cld) goto nada;
+
+     pln = MKPLAN_DFT(P, &padt, ego->adt->apply);
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->slv = ego;
+     X(ops_add)(&cld->ops, &cldcpy->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldcpy);
+     return (plan *)0;
+}
+
+static solver *mksolver(const ndrct_adt *adt)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->adt = adt;
+     return &(slv->super);
+}
+
+void X(dft_indirect_register)(planner *p)
+{
+     unsigned i;
+     static const ndrct_adt *const adts[] = {
+	  &adt_before, &adt_after
+     };
+
+     for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
+          REGISTER_SOLVER(p, mksolver(adts[i]));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/kdft-dif.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/kdft-dif.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc)
+{
+     X(regsolver_ct_directw)(p, codelet, desc, DECDIF);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/kdft-difsq.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/kdft-difsq.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+void X(kdft_difsq_register)(planner *p, kdftwsq k, const ct_desc *desc)
+{
+     X(regsolver_ct_directwsq)(p, k, desc, DECDIF);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/kdft-dit.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/kdft-dit.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct.h"
+
+void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc)
+{
+     X(regsolver_ct_directw)(p, codelet, desc, DECDIT);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/kdft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/kdft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc)
+{
+     REGISTER_SOLVER(p, X(mksolver_dft_direct)(codelet, desc));
+     REGISTER_SOLVER(p, X(mksolver_dft_directbuf)(codelet, desc));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/nop.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/nop.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for vrank -infty DFTs (nothing to do) */
+
+#include "dft.h"
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     UNUSED(ego_);
+     UNUSED(ri);
+     UNUSED(ii);
+     UNUSED(ro);
+     UNUSED(io);
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+
+     UNUSED(ego_);
+
+     return 0
+	  /* case 1 : -infty vector rank */
+	  || (!FINITE_RNK(p->vecsz->rnk))
+
+	  /* case 2 : rank-0 in-place dft */
+	  || (1
+	      && p->sz->rnk == 0
+	      && FINITE_RNK(p->vecsz->rnk)
+	      && p->ro == p->ri
+	      && X(tensor_inplace_strides)(p->vecsz)
+	       );
+}
+
+static void print(const plan *ego, printer *p)
+{
+     UNUSED(ego);
+     p->print(p, "(dft-nop)");
+}
+
+static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
+{
+     static const plan_adt padt = {
+	  X(dft_solve), X(null_awake), print, X(plan_null_destroy)
+     };
+     plan_dft *pln;
+
+     UNUSED(plnr);
+
+     if (!applicable(ego, p))
+          return (plan *) 0;
+     pln = MKPLAN_DFT(plan_dft, &padt, apply);
+     X(ops_zero)(&pln->super.ops);
+
+     return &(pln->super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void X(dft_nop_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/plan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/plan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply)
+{
+     plan_dft *ego;
+
+     ego = (plan_dft *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+#include <stddef.h>
+
+static void destroy(problem *ego_)
+{
+     problem_dft *ego = (problem_dft *) ego_;
+     X(tensor_destroy2)(ego->vecsz, ego->sz);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     X(md5puts)(m, "dft");
+     X(md5int)(m, p->ri == p->ro);
+     X(md5INT)(m, p->ii - p->ri);
+     X(md5INT)(m, p->io - p->ro);
+     X(md5int)(m, X(alignment_of)(p->ri));
+     X(md5int)(m, X(alignment_of)(p->ii));
+     X(md5int)(m, X(alignment_of)(p->ro));
+     X(md5int)(m, X(alignment_of)(p->io));
+     X(tensor_md5)(m, p->sz);
+     X(tensor_md5)(m, p->vecsz);
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_dft *ego = (const problem_dft *) ego_;
+     p->print(p, "(dft %d %d %d %D %D %T %T)", 
+	      ego->ri == ego->ro,
+	      X(alignment_of)(ego->ri),
+	      X(alignment_of)(ego->ro),
+	      (INT)(ego->ii - ego->ri), 
+	      (INT)(ego->io - ego->ro),
+	      ego->sz,
+	      ego->vecsz);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_dft *ego = (const problem_dft *) ego_;
+     tensor *sz = X(tensor_append)(ego->vecsz, ego->sz);
+     X(dft_zerotens)(sz, UNTAINT(ego->ri), UNTAINT(ego->ii));
+     X(tensor_destroy)(sz);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_DFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
+			  R *ri, R *ii, R *ro, R *io)
+{
+     problem_dft *ego;
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(ri) == UNTAINT(ro))
+	  ri = ro = JOIN_TAINT(ri, ro);
+     if (UNTAINT(ii) == UNTAINT(io))
+	  ii = io = JOIN_TAINT(ii, io);
+
+     /* more correctness conditions: */
+     A(TAINTOF(ri) == TAINTOF(ii));
+     A(TAINTOF(ro) == TAINTOF(io));
+
+     A(X(tensor_kosherp)(sz));
+     A(X(tensor_kosherp)(vecsz));
+
+     if (ri == ro || ii == io) {
+	  /* If either real or imag pointers are in place, both must be. */
+	  if (ri != ro || ii != io || !X(tensor_inplace_locations)(sz, vecsz))
+	       return X(mkproblem_unsolvable)();
+     }
+
+     ego = (problem_dft *)X(mkproblem)(sizeof(problem_dft), &padt);
+
+     ego->sz = X(tensor_compress)(sz);
+     ego->vecsz = X(tensor_compress_contiguous)(vecsz);
+     ego->ri = ri;
+     ego->ii = ii;
+     ego->ro = ro;
+     ego->io = io;
+
+     A(FINITE_RNK(ego->sz->rnk));
+     return &(ego->super);
+}
+
+/* Same as X(mkproblem_dft), but also destroy input tensors. */
+problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
+			    R *ri, R *ii, R *ro, R *io)
+{
+     problem *p = X(mkproblem_dft)(sz, vecsz, ri, ii, ro, io);
+     X(tensor_destroy2)(vecsz, sz);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/rader.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/rader.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "dft.h"
+
+/*
+ * Compute transforms of prime sizes using Rader's trick: turn them
+ * into convolutions of size n - 1, which you then perform via a pair
+ * of FFTs.
+ */
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_dft super;
+
+     plan *cld1, *cld2;
+     R *omega;
+     INT n, g, ginv;
+     INT is, os;
+     plan *cld_omega;
+} P;
+
+static rader_tl *omegas = 0;
+
+static R *mkomega(enum wakefulness wakefulness, plan *p_, INT n, INT ginv)
+{
+     plan_dft *p = (plan_dft *) p_;
+     R *omega;
+     INT i, gpower;
+     trigreal scale;
+     triggen *t;
+
+     if ((omega = X(rader_tl_find)(n, n, ginv, omegas)))
+	  return omega;
+
+     omega = (R *)MALLOC(sizeof(R) * (n - 1) * 2, TWIDDLES);
+
+     scale = n - 1.0; /* normalization for convolution */
+
+     t = X(mktriggen)(wakefulness, n);
+     for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) {
+	  trigreal w[2];
+	  t->cexpl(t, gpower, w);
+	  omega[2*i] = w[0] / scale;
+	  omega[2*i+1] = FFT_SIGN * w[1] / scale;
+     }
+     X(triggen_destroy)(t);
+     A(gpower == 1);
+
+     p->apply(p_, omega, omega + 1, omega, omega + 1);
+
+     X(rader_tl_insert)(n, n, ginv, omega, &omegas);
+     return omega;
+}
+
+static void free_omega(R *omega)
+{
+     X(rader_tl_delete)(omega, &omegas);
+}
+
+
+/***************************************************************************/
+
+/* Below, we extensively use the identity that fft(x*)* = ifft(x) in
+   order to share data between forward and backward transforms and to
+   obviate the necessity of having separate forward and backward
+   plans.  (Although we often compute separate plans these days anyway
+   due to the differing strides, etcetera.)
+
+   Of course, since the new FFTW gives us separate pointers to
+   the real and imaginary parts, we could have instead used the
+   fft(r,i) = ifft(i,r) form of this identity, but it was easier to
+   reuse the code from our old version. */
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT is, os;
+     INT k, gpower, g, r;
+     R *buf;
+     R r0 = ri[0], i0 = ii[0];
+
+     r = ego->n; is = ego->is; os = ego->os; g = ego->g; 
+     buf = (R *) MALLOC(sizeof(R) * (r - 1) * 2, BUFFERS);
+
+     /* First, permute the input, storing in buf: */
+     for (gpower = 1, k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, g, r)) {
+	  R rA, iA;
+	  rA = ri[gpower * is];
+	  iA = ii[gpower * is];
+	  buf[2*k] = rA; buf[2*k + 1] = iA;
+     }
+     /* gpower == g^(r-1) mod r == 1 */;
+
+
+     /* compute DFT of buf, storing in output (except DC): */
+     {
+	    plan_dft *cld = (plan_dft *) ego->cld1;
+	    cld->apply(ego->cld1, buf, buf+1, ro+os, io+os);
+     }
+
+     /* set output DC component: */
+     {
+	  ro[0] = r0 + ro[os];
+	  io[0] = i0 + io[os];
+     }
+
+     /* now, multiply by omega: */
+     {
+	  const R *omega = ego->omega;
+	  for (k = 0; k < r - 1; ++k) {
+	       E rB, iB, rW, iW;
+	       rW = omega[2*k];
+	       iW = omega[2*k+1];
+	       rB = ro[(k+1)*os];
+	       iB = io[(k+1)*os];
+	       ro[(k+1)*os] = rW * rB - iW * iB;
+	       io[(k+1)*os] = -(rW * iB + iW * rB);
+	  }
+     }
+     
+     /* this will add input[0] to all of the outputs after the ifft */
+     ro[os] += r0;
+     io[os] -= i0;
+
+     /* inverse FFT: */
+     {
+	    plan_dft *cld = (plan_dft *) ego->cld2;
+	    cld->apply(ego->cld2, ro+os, io+os, buf, buf+1);
+     }
+     
+     /* finally, do inverse permutation to unshuffle the output: */
+     {
+	  INT ginv = ego->ginv;
+	  gpower = 1;
+	  for (k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, ginv, r)) {
+	       ro[gpower * os] = buf[2*k];
+	       io[gpower * os] = -buf[2*k+1];
+	  }
+	  A(gpower == 1);
+     }
+
+
+     X(ifree)(buf);
+}
+
+/***************************************************************************/
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld_omega, wakefulness);
+
+     switch (wakefulness) {
+	 case SLEEPY:
+	      free_omega(ego->omega);
+	      ego->omega = 0;
+	      break;
+	 default:
+	      ego->g = X(find_generator)(ego->n);
+	      ego->ginv = X(power_mod)(ego->g, ego->n - 2, ego->n);
+	      A(MULMOD(ego->g, ego->ginv, ego->n) == 1);
+
+	      ego->omega = mkomega(wakefulness,
+				   ego->cld_omega, ego->n, ego->ginv);
+	      break;
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld_omega);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *)ego_;
+     p->print(p, "(dft-rader-%D%ois=%oos=%(%p%)",
+              ego->n, ego->is, ego->os, ego->cld1);
+     if (ego->cld2 != ego->cld1)
+          p->print(p, "%(%p%)", ego->cld2);
+     if (ego->cld_omega != ego->cld1 && ego->cld_omega != ego->cld2)
+          p->print(p, "%(%p%)", ego->cld_omega);
+     p->putchr(p, ')');
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     UNUSED(ego_);
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
+	     && X(is_prime)(p->sz->dims[0].n)
+
+	     /* proclaim the solver SLOW if p-1 is not easily factorizable.
+		Bluestein should take care of this case. */
+	     && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
+	  );
+}
+
+static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io,
+	       planner *plnr)
+{
+     plan *cld1 = (plan *) 0;
+     plan *cld2 = (plan *) 0;
+     plan *cld_omega = (plan *) 0;
+     R *buf = (R *) 0;
+
+     /* initial allocation for the purpose of planning */
+     buf = (R *) MALLOC(sizeof(R) * (n - 1) * 2, BUFFERS);
+
+     cld1 = X(mkplan_f_d)(plnr, 
+			  X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, os),
+					     X(mktensor_1d)(1, 0, 0),
+					     buf, buf + 1, ro + os, io + os),
+			  NO_SLOW, 0, 0);
+     if (!cld1) goto nada;
+
+     cld2 = X(mkplan_f_d)(plnr, 
+			  X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, os, 2),
+					     X(mktensor_1d)(1, 0, 0),
+					     ro + os, io + os, buf, buf + 1),
+			  NO_SLOW, 0, 0);
+
+     if (!cld2) goto nada;
+
+     /* plan for omega array */
+     cld_omega = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, 2),
+						  X(mktensor_1d)(1, 0, 0),
+						  buf, buf + 1, buf, buf + 1),
+			       NO_SLOW, ESTIMATE, 0);
+     if (!cld_omega) goto nada;
+
+     /* deallocate buffers; let awake() or apply() allocate them for real */
+     X(ifree)(buf);
+     buf = 0;
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld_omega = cld_omega;
+     pln->omega = 0;
+     pln->n = n;
+     pln->is = is;
+     pln->os = os;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     pln->super.super.ops.other += (n - 1) * (4 * 2 + 6) + 6;
+     pln->super.super.ops.add += (n - 1) * 2 + 4;
+     pln->super.super.ops.mul += (n - 1) * 4;
+
+     return 1;
+
+ nada:
+     X(ifree0)(buf);
+     X(plan_destroy_internal)(cld_omega);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return 0;
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     P *pln;
+     INT n;
+     INT is, os;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+	  return (plan *) 0;
+
+     n = p->sz->dims[0].n;
+     is = p->sz->dims[0].is;
+     os = p->sz->dims[0].os;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+     if (!mkP(pln, n, is, os, p->ro, p->io, plnr)) {
+	  X(ifree)(pln);
+	  return (plan *) 0;
+     }
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(dft_rader_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/rank-geq2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/rank-geq2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for DFT of rank >= 2 (multidimensional) */
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int spltrnk;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_dft super;
+
+     plan *cld1, *cld2;
+     const S *solver;
+} P;
+
+/* Compute multi-dimensional DFT by applying the two cld plans
+   (lower-rnk DFTs). */
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld1, *cld2;
+
+     cld1 = (plan_dft *) ego->cld1;
+     cld1->apply(ego->cld1, ri, ii, ro, io);
+
+     cld2 = (plan_dft *) ego->cld2;
+     cld2->apply(ego->cld2, ro, io, ro, io);
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(dft-rank>=2/%d%(%p%)%(%p%))",
+	      s->spltrnk, ego->cld1, ego->cld2);
+}
+
+static int picksplit(const S *ego, const tensor *sz, int *rp)
+{
+     A(sz->rnk > 1); /* cannot split rnk <= 1 */
+     if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
+	  return 0;
+     *rp += 1; /* convert from dim. index to rank */
+     if (*rp >= sz->rnk) /* split must reduce rank */
+	  return 0;
+     return 1;
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *rp)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     const S *ego = (const S *)ego_;
+     return (1
+	     && FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
+	     && p->sz->rnk >= 2
+	     && picksplit(ego, p->sz, rp)
+	  );
+}
+
+/* TODO: revise this. */
+static int applicable(const solver *ego_, const problem *p_, 
+		      const planner *plnr, int *rp)
+{
+     const S *ego = (const S *)ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+
+     if (!applicable0(ego_, p_, rp)) return 0;
+
+     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0])) return 0;
+
+     /* Heuristic: if the vector stride is greater than the transform
+        sz, don't use (prefer to do the vector loop first with a
+        vrank-geq1 plan). */
+     if (NO_UGLYP(plnr))
+	  if (p->vecsz->rnk > 0 &&
+	      X(tensor_min_stride)(p->vecsz) > X(tensor_max_index)(p->sz))
+	       return 0;
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     tensor *sz1, *sz2, *vecszi, *sz2i;
+     int spltrnk;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &spltrnk))
+          return (plan *) 0;
+
+     p = (const problem_dft *) p_;
+     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
+     vecszi = X(tensor_copy_inplace)(p->vecsz, INPLACE_OS);
+     sz2i = X(tensor_copy_inplace)(sz2, INPLACE_OS);
+
+     cld1 = X(mkplan_d)(plnr, 
+			X(mkproblem_dft_d)(X(tensor_copy)(sz2),
+					   X(tensor_append)(p->vecsz, sz1),
+					   p->ri, p->ii, p->ro, p->io));
+     if (!cld1) goto nada;
+
+     cld2 = X(mkplan_d)(plnr, 
+			X(mkproblem_dft_d)(
+			     X(tensor_copy_inplace)(sz1, INPLACE_OS),
+			     X(tensor_append)(vecszi, sz2i),
+			     p->ro, p->io, p->ro, p->io));
+     if (!cld2) goto nada;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+
+     pln->solver = ego;
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     X(tensor_destroy4)(sz1, sz2, vecszi, sz2i);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     X(tensor_destroy4)(sz1, sz2, vecszi, sz2i);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int spltrnk, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->spltrnk = spltrnk;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(dft_rank_geq2_register)(planner *p)
+{
+     int i;
+     static const int buddies[] = { 1, 0, -2 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+
+     /* FIXME:
+
+        Should we try more buddies? 
+
+        Another possible variant is to swap cld1 and cld2 (or rather,
+        to swap their problems; they are not interchangeable because
+        cld2 must be in-place).  In past versions of FFTW, however, I
+        seem to recall that such rearrangements have made little or no
+        difference.
+     */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,6 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft
+SUBDIRS=codelets
+noinst_LTLIBRARIES = libdft_scalar.la
+
+libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,685 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = dft/scalar
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_scalar_la_LIBADD =
+am_libdft_scalar_la_OBJECTS = n.lo t.lo
+libdft_scalar_la_OBJECTS = $(am_libdft_scalar_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_scalar_la_SOURCES)
+DIST_SOURCES = $(libdft_scalar_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft
+SUBDIRS = codelets
+noinst_LTLIBRARIES = libdft_scalar.la
+libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/scalar/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_scalar.la: $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_DEPENDENCIES) $(EXTRA_libdft_scalar_la_DEPENDENCIES) 
+	$(LINK)  $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES)
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,97 @@
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/scalar
+noinst_LTLIBRARIES = libdft_scalar_codelets.la
+
+###########################################################################
+# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
+N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c	\
+n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
+n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
+
+###########################################################################
+# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c	\
+t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
+t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
+
+# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
+# of trig. functions, it partially generates the trig. values on the fly
+# (this is faster for large sizes).
+T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
+     t2_5.c t2_10.c t2_20.c t2_25.c
+
+###########################################################################
+# The F (DIF) codelets are used for a kind of in-place transform algorithm,
+# but the planner seems to never (or hardly ever) use them on the machines
+# we have access to, preferring the Q codelets and the use of buffers
+# for sub-transforms.  So, we comment them out, at least for now.
+
+# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
+F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
+
+# like f1, but partially generates its trig. table on the fly
+F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
+
+###########################################################################
+# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
+# transposed.  This is used for in-place transposes in sizes that are
+# divisible by <r>^2.  These codelets have size ~ <r>^2, so you should
+# probably not use <r> bigger than 8 or so.
+Q1 = q1_2.c q1_4.c q1_8.c  q1_3.c q1_5.c q1_6.c
+
+###########################################################################
+ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
+BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
+
+libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+SOLVTAB_NAME = X(solvtab_dft_standard)
+XRENAME=X
+
+# special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+FLAGS_N1=$(DFT_FLAGS_COMMON)
+FLAGS_T1=$(DFT_FLAGS_COMMON)
+FLAGS_T2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+FLAGS_F1=$(DFT_FLAGS_COMMON)
+FLAGS_F2=$(DFT_FLAGS_COMMON) -twiddle-log3  -precompute-twiddles
+FLAGS_Q1=$(DFT_FLAGS_COMMON) -reload-twiddle
+FLAGS_Q2=$(DFT_FLAGS_COMMON) -twiddle-log3  -precompute-twiddles
+
+n1_%.c:  $(CODELET_DEPS) $(GEN_NOTW)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "n.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t1_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "t.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t2_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "t.h") | $(ADD_DATE) | $(INDENT) >$@
+
+f1_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+f2_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+q1_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "q.h") | $(ADD_DATE) | $(INDENT) >$@
+
+q2_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "q.h") | $(ADD_DATE) | $(INDENT) >$@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,767 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = dft/scalar/codelets
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_scalar_codelets_la_LIBADD =
+am__objects_1 = n1_2.lo n1_3.lo n1_4.lo n1_5.lo n1_6.lo n1_7.lo \
+	n1_8.lo n1_9.lo n1_10.lo n1_11.lo n1_12.lo n1_13.lo n1_14.lo \
+	n1_15.lo n1_16.lo n1_32.lo n1_64.lo n1_20.lo n1_25.lo
+am__objects_2 = t1_2.lo t1_3.lo t1_4.lo t1_5.lo t1_6.lo t1_7.lo \
+	t1_8.lo t1_9.lo t1_10.lo t1_12.lo t1_15.lo t1_16.lo t1_32.lo \
+	t1_64.lo t1_20.lo t1_25.lo
+am__objects_3 = t2_4.lo t2_8.lo t2_16.lo t2_32.lo t2_64.lo t2_5.lo \
+	t2_10.lo t2_20.lo t2_25.lo
+am__objects_4 =
+am__objects_5 = q1_2.lo q1_4.lo q1_8.lo q1_3.lo q1_5.lo q1_6.lo
+am__objects_6 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_4) $(am__objects_5)
+am__objects_7 = codlist.lo
+am__objects_8 = $(am__objects_6) $(am__objects_7)
+am_libdft_scalar_codelets_la_OBJECTS = $(am__objects_8)
+libdft_scalar_codelets_la_OBJECTS =  \
+	$(am_libdft_scalar_codelets_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_scalar_codelets_la_SOURCES)
+DIST_SOURCES = $(libdft_scalar_codelets_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/scalar
+
+noinst_LTLIBRARIES = libdft_scalar_codelets.la
+
+###########################################################################
+# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
+N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c	\
+n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
+n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
+
+
+###########################################################################
+# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c	\
+t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
+t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
+
+
+# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
+# of trig. functions, it partially generates the trig. values on the fly
+# (this is faster for large sizes).
+T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
+     t2_5.c t2_10.c t2_20.c t2_25.c
+
+
+###########################################################################
+# The F (DIF) codelets are used for a kind of in-place transform algorithm,
+# but the planner seems to never (or hardly ever) use them on the machines
+# we have access to, preferring the Q codelets and the use of buffers
+# for sub-transforms.  So, we comment them out, at least for now.
+
+# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
+F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
+
+# like f1, but partially generates its trig. table on the fly
+F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
+
+###########################################################################
+# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
+# transposed.  This is used for in-place transposes in sizes that are
+# divisible by <r>^2.  These codelets have size ~ <r>^2, so you should
+# probably not use <r> bigger than 8 or so.
+Q1 = q1_2.c q1_4.c q1_8.c  q1_3.c q1_5.c q1_6.c
+
+###########################################################################
+ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
+BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
+libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
+SOLVTAB_NAME = X(solvtab_dft_standard)
+XRENAME = X
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@FLAGS_N1 = $(DFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_T1 = $(DFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_T2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_F1 = $(DFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_F2 = $(DFT_FLAGS_COMMON) -twiddle-log3  -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_Q1 = $(DFT_FLAGS_COMMON) -reload-twiddle
+@MAINTAINER_MODE_TRUE@FLAGS_Q2 = $(DFT_FLAGS_COMMON) -twiddle-log3  -precompute-twiddles
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_scalar_codelets.la: $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_DEPENDENCIES) $(EXTRA_libdft_scalar_codelets_la_DEPENDENCIES) 
+	$(LINK)  $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic \
+	maintainer-clean-local mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@n1_%.c:  $(CODELET_DEPS) $(GEN_NOTW)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "n.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "t.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t2_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "t.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@f1_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@f2_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@q1_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "q.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@q2_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "q.h") | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,109 @@
+#include "ifftw.h"
+
+
+extern void X(codelet_n1_2)(planner *);
+extern void X(codelet_n1_3)(planner *);
+extern void X(codelet_n1_4)(planner *);
+extern void X(codelet_n1_5)(planner *);
+extern void X(codelet_n1_6)(planner *);
+extern void X(codelet_n1_7)(planner *);
+extern void X(codelet_n1_8)(planner *);
+extern void X(codelet_n1_9)(planner *);
+extern void X(codelet_n1_10)(planner *);
+extern void X(codelet_n1_11)(planner *);
+extern void X(codelet_n1_12)(planner *);
+extern void X(codelet_n1_13)(planner *);
+extern void X(codelet_n1_14)(planner *);
+extern void X(codelet_n1_15)(planner *);
+extern void X(codelet_n1_16)(planner *);
+extern void X(codelet_n1_32)(planner *);
+extern void X(codelet_n1_64)(planner *);
+extern void X(codelet_n1_20)(planner *);
+extern void X(codelet_n1_25)(planner *);
+extern void X(codelet_t1_2)(planner *);
+extern void X(codelet_t1_3)(planner *);
+extern void X(codelet_t1_4)(planner *);
+extern void X(codelet_t1_5)(planner *);
+extern void X(codelet_t1_6)(planner *);
+extern void X(codelet_t1_7)(planner *);
+extern void X(codelet_t1_8)(planner *);
+extern void X(codelet_t1_9)(planner *);
+extern void X(codelet_t1_10)(planner *);
+extern void X(codelet_t1_12)(planner *);
+extern void X(codelet_t1_15)(planner *);
+extern void X(codelet_t1_16)(planner *);
+extern void X(codelet_t1_32)(planner *);
+extern void X(codelet_t1_64)(planner *);
+extern void X(codelet_t1_20)(planner *);
+extern void X(codelet_t1_25)(planner *);
+extern void X(codelet_t2_4)(planner *);
+extern void X(codelet_t2_8)(planner *);
+extern void X(codelet_t2_16)(planner *);
+extern void X(codelet_t2_32)(planner *);
+extern void X(codelet_t2_64)(planner *);
+extern void X(codelet_t2_5)(planner *);
+extern void X(codelet_t2_10)(planner *);
+extern void X(codelet_t2_20)(planner *);
+extern void X(codelet_t2_25)(planner *);
+extern void X(codelet_q1_2)(planner *);
+extern void X(codelet_q1_4)(planner *);
+extern void X(codelet_q1_8)(planner *);
+extern void X(codelet_q1_3)(planner *);
+extern void X(codelet_q1_5)(planner *);
+extern void X(codelet_q1_6)(planner *);
+
+
+extern const solvtab X(solvtab_dft_standard);
+const solvtab X(solvtab_dft_standard) = {
+   SOLVTAB(X(codelet_n1_2)),
+   SOLVTAB(X(codelet_n1_3)),
+   SOLVTAB(X(codelet_n1_4)),
+   SOLVTAB(X(codelet_n1_5)),
+   SOLVTAB(X(codelet_n1_6)),
+   SOLVTAB(X(codelet_n1_7)),
+   SOLVTAB(X(codelet_n1_8)),
+   SOLVTAB(X(codelet_n1_9)),
+   SOLVTAB(X(codelet_n1_10)),
+   SOLVTAB(X(codelet_n1_11)),
+   SOLVTAB(X(codelet_n1_12)),
+   SOLVTAB(X(codelet_n1_13)),
+   SOLVTAB(X(codelet_n1_14)),
+   SOLVTAB(X(codelet_n1_15)),
+   SOLVTAB(X(codelet_n1_16)),
+   SOLVTAB(X(codelet_n1_32)),
+   SOLVTAB(X(codelet_n1_64)),
+   SOLVTAB(X(codelet_n1_20)),
+   SOLVTAB(X(codelet_n1_25)),
+   SOLVTAB(X(codelet_t1_2)),
+   SOLVTAB(X(codelet_t1_3)),
+   SOLVTAB(X(codelet_t1_4)),
+   SOLVTAB(X(codelet_t1_5)),
+   SOLVTAB(X(codelet_t1_6)),
+   SOLVTAB(X(codelet_t1_7)),
+   SOLVTAB(X(codelet_t1_8)),
+   SOLVTAB(X(codelet_t1_9)),
+   SOLVTAB(X(codelet_t1_10)),
+   SOLVTAB(X(codelet_t1_12)),
+   SOLVTAB(X(codelet_t1_15)),
+   SOLVTAB(X(codelet_t1_16)),
+   SOLVTAB(X(codelet_t1_32)),
+   SOLVTAB(X(codelet_t1_64)),
+   SOLVTAB(X(codelet_t1_20)),
+   SOLVTAB(X(codelet_t1_25)),
+   SOLVTAB(X(codelet_t2_4)),
+   SOLVTAB(X(codelet_t2_8)),
+   SOLVTAB(X(codelet_t2_16)),
+   SOLVTAB(X(codelet_t2_32)),
+   SOLVTAB(X(codelet_t2_64)),
+   SOLVTAB(X(codelet_t2_5)),
+   SOLVTAB(X(codelet_t2_10)),
+   SOLVTAB(X(codelet_t2_20)),
+   SOLVTAB(X(codelet_t2_25)),
+   SOLVTAB(X(codelet_q1_2)),
+   SOLVTAB(X(codelet_q1_4)),
+   SOLVTAB(X(codelet_q1_8)),
+   SOLVTAB(X(codelet_q1_3)),
+   SOLVTAB(X(codelet_q1_5)),
+   SOLVTAB(X(codelet_q1_6)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include n.h */
+
+/*
+ * This function contains 84 FP additions, 36 FP multiplications,
+ * (or, 48 additions, 0 multiplications, 36 fused multiply/add),
+ * 59 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n.h"
+
+static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       E T1g, T1a, T18, T1m, T1k, T1f, T19, T11, T1h, T1l;
+	       {
+		    E Tj, T3, T1b, TN, T1j, TU, T1i, TV, Tq, T10, Ti, Ts, Tw, T15, Tx;
+		    E T13, TG, Ty, TB, TC;
+		    {
+			 E T1, T2, TL, TM;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 5)];
+			 TL = ii[0];
+			 TM = ii[WS(is, 5)];
+			 {
+			      E T7, Tk, T6, To, Tg, T8, Tb, Tc;
+			      {
+				   E T4, T5, Te, Tf;
+				   T4 = ri[WS(is, 2)];
+				   Tj = T1 + T2;
+				   T3 = T1 - T2;
+				   T1b = TL + TM;
+				   TN = TL - TM;
+				   T5 = ri[WS(is, 7)];
+				   Te = ri[WS(is, 6)];
+				   Tf = ri[WS(is, 1)];
+				   T7 = ri[WS(is, 8)];
+				   Tk = T4 + T5;
+				   T6 = T4 - T5;
+				   To = Te + Tf;
+				   Tg = Te - Tf;
+				   T8 = ri[WS(is, 3)];
+				   Tb = ri[WS(is, 4)];
+				   Tc = ri[WS(is, 9)];
+			      }
+			      {
+				   E TE, TF, Tu, Tv;
+				   {
+					E Ta, Th, Tl, T9;
+					Tu = ii[WS(is, 2)];
+					Tl = T7 + T8;
+					T9 = T7 - T8;
+					{
+					     E Tn, Td, Tm, Tp;
+					     Tn = Tb + Tc;
+					     Td = Tb - Tc;
+					     Tm = Tk + Tl;
+					     T1j = Tk - Tl;
+					     Ta = T6 + T9;
+					     TU = T6 - T9;
+					     Tp = Tn + To;
+					     T1i = Tn - To;
+					     Th = Td + Tg;
+					     TV = Td - Tg;
+					     Tq = Tm + Tp;
+					     T10 = Tm - Tp;
+					     Tv = ii[WS(is, 7)];
+					}
+					Ti = Ta + Th;
+					Ts = Ta - Th;
+				   }
+				   TE = ii[WS(is, 6)];
+				   TF = ii[WS(is, 1)];
+				   Tw = Tu - Tv;
+				   T15 = Tu + Tv;
+				   Tx = ii[WS(is, 8)];
+				   T13 = TE + TF;
+				   TG = TE - TF;
+				   Ty = ii[WS(is, 3)];
+				   TB = ii[WS(is, 4)];
+				   TC = ii[WS(is, 9)];
+			      }
+			 }
+		    }
+		    {
+			 E T17, TA, T14, TH, T1e, TQ, TS;
+			 {
+			      E TO, TP, T16, Tz;
+			      ro[WS(os, 5)] = T3 + Ti;
+			      T16 = Tx + Ty;
+			      Tz = Tx - Ty;
+			      {
+				   E T12, TD, T1c, T1d;
+				   T12 = TB + TC;
+				   TD = TB - TC;
+				   T1c = T15 + T16;
+				   T17 = T15 - T16;
+				   TO = Tw + Tz;
+				   TA = Tw - Tz;
+				   T1d = T12 + T13;
+				   T14 = T12 - T13;
+				   TP = TD + TG;
+				   TH = TD - TG;
+				   T1e = T1c + T1d;
+				   T1g = T1c - T1d;
+			      }
+			      ro[0] = Tj + Tq;
+			      TQ = TO + TP;
+			      TS = TO - TP;
+			 }
+			 {
+			      E TK, TI, TY, TW, TR, TJ, Tt, Tr, TZ, TX, TT;
+			      TK = FNMS(KP618033988, TA, TH);
+			      TI = FMA(KP618033988, TH, TA);
+			      io[0] = T1b + T1e;
+			      io[WS(os, 5)] = TN + TQ;
+			      Tr = FNMS(KP250000000, Ti, T3);
+			      TY = FNMS(KP618033988, TU, TV);
+			      TW = FMA(KP618033988, TV, TU);
+			      TR = FNMS(KP250000000, TQ, TN);
+			      TJ = FNMS(KP559016994, Ts, Tr);
+			      Tt = FMA(KP559016994, Ts, Tr);
+			      T1a = FMA(KP618033988, T14, T17);
+			      T18 = FNMS(KP618033988, T17, T14);
+			      ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
+			      ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
+			      ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
+			      ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
+			      TX = FNMS(KP559016994, TS, TR);
+			      TT = FMA(KP559016994, TS, TR);
+			      TZ = FNMS(KP250000000, Tq, Tj);
+			      io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
+			      io[WS(os, 7)] = FMA(KP951056516, TY, TX);
+			      io[WS(os, 9)] = FMA(KP951056516, TW, TT);
+			      io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
+			      T1m = FMA(KP618033988, T1i, T1j);
+			      T1k = FNMS(KP618033988, T1j, T1i);
+			      T1f = FNMS(KP250000000, T1e, T1b);
+			      T19 = FMA(KP559016994, T10, TZ);
+			      T11 = FNMS(KP559016994, T10, TZ);
+			 }
+		    }
+	       }
+	       ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
+	       ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
+	       ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
+	       ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
+	       T1h = FNMS(KP559016994, T1g, T1f);
+	       T1l = FMA(KP559016994, T1g, T1f);
+	       io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
+	       io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
+	       io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
+	       io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 10, "n1_10", {48, 0, 36, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_10) (planner *p) {
+     X(kdft_register) (p, n1_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include n.h */
+
+/*
+ * This function contains 84 FP additions, 24 FP multiplications,
+ * (or, 72 additions, 12 multiplications, 12 fused multiply/add),
+ * 41 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n.h"
+
+static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
+	       E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
+	       {
+		    E T1, T2, TO, TP;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 5)];
+		    T3 = T1 - T2;
+		    Tj = T1 + T2;
+		    TO = ii[0];
+		    TP = ii[WS(is, 5)];
+		    TQ = TO - TP;
+		    T1e = TO + TP;
+	       }
+	       {
+		    E T6, Tk, Tg, To, T9, Tl, Td, Tn;
+		    {
+			 E T4, T5, Te, Tf;
+			 T4 = ri[WS(is, 2)];
+			 T5 = ri[WS(is, 7)];
+			 T6 = T4 - T5;
+			 Tk = T4 + T5;
+			 Te = ri[WS(is, 6)];
+			 Tf = ri[WS(is, 1)];
+			 Tg = Te - Tf;
+			 To = Te + Tf;
+		    }
+		    {
+			 E T7, T8, Tb, Tc;
+			 T7 = ri[WS(is, 8)];
+			 T8 = ri[WS(is, 3)];
+			 T9 = T7 - T8;
+			 Tl = T7 + T8;
+			 Tb = ri[WS(is, 4)];
+			 Tc = ri[WS(is, 9)];
+			 Td = Tb - Tc;
+			 Tn = Tb + Tc;
+		    }
+		    TU = T6 - T9;
+		    TV = Td - Tg;
+		    T1c = Tk - Tl;
+		    T1b = Tn - To;
+		    Tm = Tk + Tl;
+		    Tp = Tn + To;
+		    Tq = Tm + Tp;
+		    Ta = T6 + T9;
+		    Th = Td + Tg;
+		    Ti = Ta + Th;
+	       }
+	       {
+		    E Tw, T15, TG, T13, Tz, T16, TD, T12;
+		    {
+			 E Tu, Tv, TE, TF;
+			 Tu = ii[WS(is, 2)];
+			 Tv = ii[WS(is, 7)];
+			 Tw = Tu - Tv;
+			 T15 = Tu + Tv;
+			 TE = ii[WS(is, 6)];
+			 TF = ii[WS(is, 1)];
+			 TG = TE - TF;
+			 T13 = TE + TF;
+		    }
+		    {
+			 E Tx, Ty, TB, TC;
+			 Tx = ii[WS(is, 8)];
+			 Ty = ii[WS(is, 3)];
+			 Tz = Tx - Ty;
+			 T16 = Tx + Ty;
+			 TB = ii[WS(is, 4)];
+			 TC = ii[WS(is, 9)];
+			 TD = TB - TC;
+			 T12 = TB + TC;
+		    }
+		    TA = Tw - Tz;
+		    TH = TD - TG;
+		    T17 = T15 - T16;
+		    T14 = T12 - T13;
+		    T1f = T15 + T16;
+		    T1g = T12 + T13;
+		    T1h = T1f + T1g;
+		    TL = Tw + Tz;
+		    TM = TD + TG;
+		    TR = TL + TM;
+	       }
+	       ro[WS(os, 5)] = T3 + Ti;
+	       io[WS(os, 5)] = TQ + TR;
+	       ro[0] = Tj + Tq;
+	       io[0] = T1e + T1h;
+	       {
+		    E TI, TK, Tt, TJ, Tr, Ts;
+		    TI = FMA(KP951056516, TA, KP587785252 * TH);
+		    TK = FNMS(KP587785252, TA, KP951056516 * TH);
+		    Tr = KP559016994 * (Ta - Th);
+		    Ts = FNMS(KP250000000, Ti, T3);
+		    Tt = Tr + Ts;
+		    TJ = Ts - Tr;
+		    ro[WS(os, 9)] = Tt - TI;
+		    ro[WS(os, 3)] = TJ + TK;
+		    ro[WS(os, 1)] = Tt + TI;
+		    ro[WS(os, 7)] = TJ - TK;
+	       }
+	       {
+		    E TW, TY, TT, TX, TN, TS;
+		    TW = FMA(KP951056516, TU, KP587785252 * TV);
+		    TY = FNMS(KP587785252, TU, KP951056516 * TV);
+		    TN = KP559016994 * (TL - TM);
+		    TS = FNMS(KP250000000, TR, TQ);
+		    TT = TN + TS;
+		    TX = TS - TN;
+		    io[WS(os, 1)] = TT - TW;
+		    io[WS(os, 7)] = TY + TX;
+		    io[WS(os, 9)] = TW + TT;
+		    io[WS(os, 3)] = TX - TY;
+	       }
+	       {
+		    E T18, T1a, T11, T19, TZ, T10;
+		    T18 = FNMS(KP587785252, T17, KP951056516 * T14);
+		    T1a = FMA(KP951056516, T17, KP587785252 * T14);
+		    TZ = FNMS(KP250000000, Tq, Tj);
+		    T10 = KP559016994 * (Tm - Tp);
+		    T11 = TZ - T10;
+		    T19 = T10 + TZ;
+		    ro[WS(os, 2)] = T11 - T18;
+		    ro[WS(os, 6)] = T19 + T1a;
+		    ro[WS(os, 8)] = T11 + T18;
+		    ro[WS(os, 4)] = T19 - T1a;
+	       }
+	       {
+		    E T1d, T1l, T1k, T1m, T1i, T1j;
+		    T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
+		    T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
+		    T1i = FNMS(KP250000000, T1h, T1e);
+		    T1j = KP559016994 * (T1f - T1g);
+		    T1k = T1i - T1j;
+		    T1m = T1j + T1i;
+		    io[WS(os, 2)] = T1d + T1k;
+		    io[WS(os, 6)] = T1m - T1l;
+		    io[WS(os, 8)] = T1k - T1d;
+		    io[WS(os, 4)] = T1l + T1m;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 10, "n1_10", {72, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_10) (planner *p) {
+     X(kdft_register) (p, n1_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include n.h */
+
+/*
+ * This function contains 140 FP additions, 110 FP multiplications,
+ * (or, 30 additions, 0 multiplications, 110 fused multiply/add),
+ * 84 stack variables, 10 constants, and 44 memory accesses
+ */
+#include "n.h"
+
+static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DK(KP918985947, +0.918985947228994779780736114132655398124909697);
+     DK(KP876768831, +0.876768831002589333891339807079336796764054852);
+     DK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DK(KP778434453, +0.778434453334651800608337670740821884709317477);
+     DK(KP715370323, +0.715370323453429719112414662767260662417897278);
+     DK(KP634356270, +0.634356270682424498893150776899916060542806975);
+     DK(KP342584725, +0.342584725681637509502641509861112333758894680);
+     DK(KP521108558, +0.521108558113202722944698153526659300680427422);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
+	       E T1, TA, T1p, T1y, T19, T1d, T1a, T1e;
+	       {
+		    E T1f, T1u, T4, T1q, Tg, T1t, T7, T1s, Ta, Td, T1r, TP, T1X, T26, Ti;
+		    E TG, T1O, T1w, TY, T1F, T17, To, T1i, T1k, T1h, Tr, T1j, Tu, T1g, Tx;
+		    E T21, TU, TL, TC, T1S, T1J, T1m, T12, T1z, T1b;
+		    T1 = ri[0];
+		    T1f = ii[0];
+		    {
+			 E T1E, T16, Tb, Tc, Tv, Tw;
+			 {
+			      E T2, T3, Te, Tf;
+			      T2 = ri[WS(is, 1)];
+			      T3 = ri[WS(is, 10)];
+			      Te = ri[WS(is, 5)];
+			      Tf = ri[WS(is, 6)];
+			      {
+				   E T5, T6, T8, T9;
+				   T5 = ri[WS(is, 2)];
+				   T1u = T3 - T2;
+				   T4 = T2 + T3;
+				   T1q = Tf - Te;
+				   Tg = Te + Tf;
+				   T6 = ri[WS(is, 9)];
+				   T8 = ri[WS(is, 3)];
+				   T9 = ri[WS(is, 8)];
+				   Tb = ri[WS(is, 4)];
+				   T1t = T6 - T5;
+				   T7 = T5 + T6;
+				   T1s = T9 - T8;
+				   Ta = T8 + T9;
+				   Tc = ri[WS(is, 7)];
+			      }
+			 }
+			 {
+			      E T25, Th, T1W, TO;
+			      T25 = FMA(KP521108558, T1q, T1u);
+			      T1W = FMA(KP521108558, T1s, T1q);
+			      TO = FNMS(KP342584725, T4, Ta);
+			      Th = FNMS(KP342584725, Ta, T7);
+			      Td = Tb + Tc;
+			      T1r = Tc - Tb;
+			      TP = FNMS(KP634356270, TO, Tg);
+			      T1X = FNMS(KP715370323, T1W, T1t);
+			      T26 = FMA(KP715370323, T25, T1r);
+			      {
+				   E TF, T1N, T1v, TX;
+				   TF = FNMS(KP342584725, Td, T4);
+				   Ti = FNMS(KP634356270, Th, Td);
+				   T1N = FNMS(KP521108558, T1t, T1r);
+				   T1v = FNMS(KP521108558, T1u, T1t);
+				   TG = FNMS(KP634356270, TF, T7);
+				   TX = FNMS(KP342584725, T7, Tg);
+				   T1O = FMA(KP715370323, T1N, T1q);
+				   T1w = FNMS(KP715370323, T1v, T1s);
+				   T1E = FMA(KP521108558, T1r, T1s);
+				   TY = FNMS(KP634356270, TX, T4);
+				   T16 = FNMS(KP342584725, Tg, Td);
+			      }
+			 }
+			 {
+			      E Ty, Tz, Tm, Tn;
+			      Tm = ii[WS(is, 3)];
+			      T1F = FMA(KP715370323, T1E, T1u);
+			      Tn = ii[WS(is, 8)];
+			      T17 = FNMS(KP634356270, T16, Ta);
+			      Ty = ii[WS(is, 5)];
+			      Tz = ii[WS(is, 6)];
+			      To = Tm - Tn;
+			      T1i = Tm + Tn;
+			      {
+				   E Tp, Tq, Ts, Tt;
+				   Tp = ii[WS(is, 2)];
+				   T1k = Ty + Tz;
+				   TA = Ty - Tz;
+				   Tq = ii[WS(is, 9)];
+				   Ts = ii[WS(is, 4)];
+				   Tt = ii[WS(is, 7)];
+				   Tv = ii[WS(is, 1)];
+				   T1h = Tp + Tq;
+				   Tr = Tp - Tq;
+				   T1j = Ts + Tt;
+				   Tu = Ts - Tt;
+				   Tw = ii[WS(is, 10)];
+			      }
+			 }
+			 {
+			      E TB, T1R, T20, TK, TT, T1I, T1l;
+			      T20 = FNMS(KP342584725, T1i, T1h);
+			      TK = FMA(KP521108558, To, TA);
+			      TT = FNMS(KP521108558, Tr, Tu);
+			      T1g = Tv + Tw;
+			      Tx = Tv - Tw;
+			      T21 = FNMS(KP634356270, T20, T1j);
+			      TU = FMA(KP715370323, TT, TA);
+			      TL = FNMS(KP715370323, TK, Tr);
+			      TB = FMA(KP521108558, TA, Tx);
+			      T1R = FNMS(KP342584725, T1j, T1g);
+			      T1I = FNMS(KP342584725, T1g, T1i);
+			      T1l = FNMS(KP342584725, T1k, T1j);
+			      TC = FMA(KP715370323, TB, Tu);
+			      T1S = FNMS(KP634356270, T1R, T1h);
+			      T1J = FNMS(KP634356270, T1I, T1k);
+			      T1m = FNMS(KP634356270, T1l, T1i);
+			      T12 = FMA(KP521108558, Tu, To);
+			      T1z = FNMS(KP342584725, T1h, T1k);
+			      T1b = FNMS(KP521108558, Tx, Tr);
+			 }
+		    }
+		    {
+			 E T13, T1A, T1c, T1Z, T1V, TH, TM, Tj, TD;
+			 ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
+			 T13 = FMA(KP715370323, T12, Tx);
+			 T1A = FNMS(KP634356270, T1z, T1g);
+			 T1c = FNMS(KP715370323, T1b, To);
+			 io[0] = T1f + T1g + T1h + T1i + T1j + T1k;
+			 Tj = FNMS(KP778434453, Ti, T4);
+			 TD = FMA(KP830830026, TC, Tr);
+			 {
+			      E TE, T23, T28, Tl, Tk, T22, T27;
+			      T22 = FNMS(KP778434453, T21, T1g);
+			      T27 = FMA(KP830830026, T26, T1t);
+			      Tk = FNMS(KP876768831, Tj, Tg);
+			      TE = FMA(KP918985947, TD, To);
+			      T23 = FNMS(KP876768831, T22, T1k);
+			      T28 = FMA(KP918985947, T27, T1s);
+			      Tl = FNMS(KP959492973, Tk, T1);
+			      {
+				   E T1U, T1T, T24, T1Y;
+				   T1T = FNMS(KP778434453, T1S, T1k);
+				   T24 = FNMS(KP959492973, T23, T1f);
+				   T1Y = FMA(KP830830026, T1X, T1u);
+				   ro[WS(os, 1)] = FMA(KP989821441, TE, Tl);
+				   ro[WS(os, 10)] = FNMS(KP989821441, TE, Tl);
+				   T1U = FNMS(KP876768831, T1T, T1i);
+				   io[WS(os, 10)] = FNMS(KP989821441, T28, T24);
+				   io[WS(os, 1)] = FMA(KP989821441, T28, T24);
+				   T1Z = FNMS(KP918985947, T1Y, T1r);
+				   T1V = FNMS(KP959492973, T1U, T1f);
+			      }
+			      TH = FNMS(KP778434453, TG, Tg);
+			      TM = FMA(KP830830026, TL, Tx);
+			 }
+			 {
+			      E T1M, TZ, T14, T1Q;
+			      {
+				   E TN, TR, TV, TJ, TI, TQ, T1P;
+				   TQ = FNMS(KP778434453, TP, Td);
+				   io[WS(os, 9)] = FMA(KP989821441, T1Z, T1V);
+				   io[WS(os, 2)] = FNMS(KP989821441, T1Z, T1V);
+				   TI = FNMS(KP876768831, TH, Ta);
+				   TN = FNMS(KP918985947, TM, Tu);
+				   TR = FNMS(KP876768831, TQ, T7);
+				   TV = FNMS(KP830830026, TU, To);
+				   TJ = FNMS(KP959492973, TI, T1);
+				   {
+					E T1L, TS, TW, T1K;
+					T1K = FNMS(KP778434453, T1J, T1j);
+					TS = FNMS(KP959492973, TR, T1);
+					TW = FNMS(KP918985947, TV, Tx);
+					ro[WS(os, 9)] = FMA(KP989821441, TN, TJ);
+					ro[WS(os, 2)] = FNMS(KP989821441, TN, TJ);
+					T1L = FNMS(KP876768831, T1K, T1h);
+					ro[WS(os, 3)] = FMA(KP989821441, TW, TS);
+					ro[WS(os, 8)] = FNMS(KP989821441, TW, TS);
+					T1P = FNMS(KP830830026, T1O, T1s);
+					T1M = FNMS(KP959492973, T1L, T1f);
+				   }
+				   TZ = FNMS(KP778434453, TY, Ta);
+				   T14 = FNMS(KP830830026, T13, TA);
+				   T1Q = FNMS(KP918985947, T1P, T1u);
+			      }
+			      {
+				   E T15, T11, T1C, T1G, T1B, T10;
+				   T1B = FNMS(KP778434453, T1A, T1i);
+				   T10 = FNMS(KP876768831, TZ, Td);
+				   T15 = FMA(KP918985947, T14, Tr);
+				   io[WS(os, 8)] = FNMS(KP989821441, T1Q, T1M);
+				   io[WS(os, 3)] = FMA(KP989821441, T1Q, T1M);
+				   T11 = FNMS(KP959492973, T10, T1);
+				   T1C = FNMS(KP876768831, T1B, T1j);
+				   T1G = FNMS(KP830830026, T1F, T1q);
+				   {
+					E T1D, T1H, T1o, T1x, T1n, T18;
+					T1n = FNMS(KP778434453, T1m, T1h);
+					ro[WS(os, 7)] = FMA(KP989821441, T15, T11);
+					ro[WS(os, 4)] = FNMS(KP989821441, T15, T11);
+					T1D = FNMS(KP959492973, T1C, T1f);
+					T1H = FMA(KP918985947, T1G, T1t);
+					T1o = FNMS(KP876768831, T1n, T1g);
+					T1x = FNMS(KP830830026, T1w, T1r);
+					T18 = FNMS(KP778434453, T17, T7);
+					io[WS(os, 7)] = FMA(KP989821441, T1H, T1D);
+					io[WS(os, 4)] = FNMS(KP989821441, T1H, T1D);
+					T1p = FNMS(KP959492973, T1o, T1f);
+					T1y = FNMS(KP918985947, T1x, T1q);
+					T19 = FNMS(KP876768831, T18, T4);
+					T1d = FNMS(KP830830026, T1c, Tu);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       io[WS(os, 6)] = FNMS(KP989821441, T1y, T1p);
+	       io[WS(os, 5)] = FMA(KP989821441, T1y, T1p);
+	       T1a = FNMS(KP959492973, T19, T1);
+	       T1e = FNMS(KP918985947, T1d, TA);
+	       ro[WS(os, 5)] = FMA(KP989821441, T1e, T1a);
+	       ro[WS(os, 6)] = FNMS(KP989821441, T1e, T1a);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 11, "n1_11", {30, 0, 110, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_11) (planner *p) {
+     X(kdft_register) (p, n1_11, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include n.h */
+
+/*
+ * This function contains 140 FP additions, 100 FP multiplications,
+ * (or, 60 additions, 20 multiplications, 80 fused multiply/add),
+ * 41 stack variables, 10 constants, and 44 memory accesses
+ */
+#include "n.h"
+
+static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP654860733, +0.654860733945285064056925072466293553183791199);
+     DK(KP142314838, +0.142314838273285140443792668616369668791051361);
+     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DK(KP415415013, +0.415415013001886425529274149229623203524004910);
+     DK(KP841253532, +0.841253532831181168861811648919367717513292498);
+     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DK(KP909631995, +0.909631995354518371411715383079028460060241051);
+     DK(KP281732556, +0.281732556841429697711417915346616899035777899);
+     DK(KP540640817, +0.540640817455597582107635954318691695431770608);
+     DK(KP755749574, +0.755749574354258283774035843972344420179717445);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
+	       E T1, TM, T4, TG, Tk, TR, Tw, TN, T7, TK, Ta, TH, Tn, TQ, Td;
+	       E TJ, Tq, TO, Tt, TP, Tg, TI;
+	       {
+		    E T2, T3, Ti, Tj;
+		    T1 = ri[0];
+		    TM = ii[0];
+		    T2 = ri[WS(is, 1)];
+		    T3 = ri[WS(is, 10)];
+		    T4 = T2 + T3;
+		    TG = T3 - T2;
+		    Ti = ii[WS(is, 1)];
+		    Tj = ii[WS(is, 10)];
+		    Tk = Ti - Tj;
+		    TR = Ti + Tj;
+		    {
+			 E Tu, Tv, T5, T6;
+			 Tu = ii[WS(is, 2)];
+			 Tv = ii[WS(is, 9)];
+			 Tw = Tu - Tv;
+			 TN = Tu + Tv;
+			 T5 = ri[WS(is, 2)];
+			 T6 = ri[WS(is, 9)];
+			 T7 = T5 + T6;
+			 TK = T6 - T5;
+		    }
+	       }
+	       {
+		    E T8, T9, To, Tp;
+		    T8 = ri[WS(is, 3)];
+		    T9 = ri[WS(is, 8)];
+		    Ta = T8 + T9;
+		    TH = T9 - T8;
+		    {
+			 E Tl, Tm, Tb, Tc;
+			 Tl = ii[WS(is, 3)];
+			 Tm = ii[WS(is, 8)];
+			 Tn = Tl - Tm;
+			 TQ = Tl + Tm;
+			 Tb = ri[WS(is, 4)];
+			 Tc = ri[WS(is, 7)];
+			 Td = Tb + Tc;
+			 TJ = Tc - Tb;
+		    }
+		    To = ii[WS(is, 4)];
+		    Tp = ii[WS(is, 7)];
+		    Tq = To - Tp;
+		    TO = To + Tp;
+		    {
+			 E Tr, Ts, Te, Tf;
+			 Tr = ii[WS(is, 5)];
+			 Ts = ii[WS(is, 6)];
+			 Tt = Tr - Ts;
+			 TP = Tr + Ts;
+			 Te = ri[WS(is, 5)];
+			 Tf = ri[WS(is, 6)];
+			 Tg = Te + Tf;
+			 TI = Tf - Te;
+		    }
+	       }
+	       {
+		    E Tx, Th, TZ, T10;
+		    ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
+		    io[0] = TM + TR + TN + TQ + TO + TP;
+		    Tx = FMA(KP755749574, Tk, KP540640817 * Tn) + FNMS(KP909631995, Tt, KP281732556 * Tq) - (KP989821441 * Tw);
+		    Th = FMA(KP841253532, Ta, T1) + FNMS(KP959492973, Td, KP415415013 * Tg) + FNMA(KP142314838, T7, KP654860733 * T4);
+		    ro[WS(os, 7)] = Th - Tx;
+		    ro[WS(os, 4)] = Th + Tx;
+		    TZ = FMA(KP755749574, TG, KP540640817 * TH) + FNMS(KP909631995, TI, KP281732556 * TJ) - (KP989821441 * TK);
+		    T10 = FMA(KP841253532, TQ, TM) + FNMS(KP959492973, TO, KP415415013 * TP) + FNMA(KP142314838, TN, KP654860733 * TR);
+		    io[WS(os, 4)] = TZ + T10;
+		    io[WS(os, 7)] = T10 - TZ;
+		    {
+			 E TX, TY, Tz, Ty;
+			 TX = FMA(KP909631995, TG, KP755749574 * TK) + FNMA(KP540640817, TI, KP989821441 * TJ) - (KP281732556 * TH);
+			 TY = FMA(KP415415013, TR, TM) + FNMS(KP142314838, TO, KP841253532 * TP) + FNMA(KP959492973, TQ, KP654860733 * TN);
+			 io[WS(os, 2)] = TX + TY;
+			 io[WS(os, 9)] = TY - TX;
+			 Tz = FMA(KP909631995, Tk, KP755749574 * Tw) + FNMA(KP540640817, Tt, KP989821441 * Tq) - (KP281732556 * Tn);
+			 Ty = FMA(KP415415013, T4, T1) + FNMS(KP142314838, Td, KP841253532 * Tg) + FNMA(KP959492973, Ta, KP654860733 * T7);
+			 ro[WS(os, 9)] = Ty - Tz;
+			 ro[WS(os, 2)] = Ty + Tz;
+		    }
+	       }
+	       {
+		    E TB, TA, TT, TU;
+		    TB = FMA(KP540640817, Tk, KP909631995 * Tw) + FMA(KP989821441, Tn, KP755749574 * Tq) + (KP281732556 * Tt);
+		    TA = FMA(KP841253532, T4, T1) + FNMS(KP959492973, Tg, KP415415013 * T7) + FNMA(KP654860733, Td, KP142314838 * Ta);
+		    ro[WS(os, 10)] = TA - TB;
+		    ro[WS(os, 1)] = TA + TB;
+		    {
+			 E TV, TW, TD, TC;
+			 TV = FMA(KP540640817, TG, KP909631995 * TK) + FMA(KP989821441, TH, KP755749574 * TJ) + (KP281732556 * TI);
+			 TW = FMA(KP841253532, TR, TM) + FNMS(KP959492973, TP, KP415415013 * TN) + FNMA(KP654860733, TO, KP142314838 * TQ);
+			 io[WS(os, 1)] = TV + TW;
+			 io[WS(os, 10)] = TW - TV;
+			 TD = FMA(KP989821441, Tk, KP540640817 * Tq) + FNMS(KP909631995, Tn, KP755749574 * Tt) - (KP281732556 * Tw);
+			 TC = FMA(KP415415013, Ta, T1) + FNMS(KP654860733, Tg, KP841253532 * Td) + FNMA(KP959492973, T7, KP142314838 * T4);
+			 ro[WS(os, 8)] = TC - TD;
+			 ro[WS(os, 3)] = TC + TD;
+		    }
+		    TT = FMA(KP989821441, TG, KP540640817 * TJ) + FNMS(KP909631995, TH, KP755749574 * TI) - (KP281732556 * TK);
+		    TU = FMA(KP415415013, TQ, TM) + FNMS(KP654860733, TP, KP841253532 * TO) + FNMA(KP959492973, TN, KP142314838 * TR);
+		    io[WS(os, 3)] = TT + TU;
+		    io[WS(os, 8)] = TU - TT;
+		    {
+			 E TL, TS, TF, TE;
+			 TL = FMA(KP281732556, TG, KP755749574 * TH) + FNMS(KP909631995, TJ, KP989821441 * TI) - (KP540640817 * TK);
+			 TS = FMA(KP841253532, TN, TM) + FNMS(KP142314838, TP, KP415415013 * TO) + FNMA(KP654860733, TQ, KP959492973 * TR);
+			 io[WS(os, 5)] = TL + TS;
+			 io[WS(os, 6)] = TS - TL;
+			 TF = FMA(KP281732556, Tk, KP755749574 * Tn) + FNMS(KP909631995, Tq, KP989821441 * Tt) - (KP540640817 * Tw);
+			 TE = FMA(KP841253532, T7, T1) + FNMS(KP142314838, Tg, KP415415013 * Td) + FNMA(KP654860733, Ta, KP959492973 * T4);
+			 ro[WS(os, 6)] = TE - TF;
+			 ro[WS(os, 5)] = TE + TF;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 11, "n1_11", {60, 20, 80, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_11) (planner *p) {
+     X(kdft_register) (p, n1_11, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include n.h */
+
+/*
+ * This function contains 96 FP additions, 24 FP multiplications,
+ * (or, 72 additions, 0 multiplications, 24 fused multiply/add),
+ * 63 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "n.h"
+
+static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
+	       E TT, TW, TF, T1q, TY, TQ, TX, T1n;
+	       {
+		    E TA, TS, TR, T5, Ts, Tz, TD, TV, TU, Ta, Tx, TC, T1d, Th, TJ;
+		    E TG, Tg, T1u, T1c, T1f, TM, TN, Tk, T1i;
+		    {
+			 E T6, Tt, Tu, Tv, T9;
+			 {
+			      E T1, To, Tp, Tq, T4, T2, T3, T7, T8, Tr;
+			      T1 = ri[0];
+			      T2 = ri[WS(is, 4)];
+			      T3 = ri[WS(is, 8)];
+			      To = ii[0];
+			      Tp = ii[WS(is, 4)];
+			      Tq = ii[WS(is, 8)];
+			      T4 = T2 + T3;
+			      TA = T3 - T2;
+			      T6 = ri[WS(is, 6)];
+			      TS = Tp - Tq;
+			      Tr = Tp + Tq;
+			      TR = FNMS(KP500000000, T4, T1);
+			      T5 = T1 + T4;
+			      T7 = ri[WS(is, 10)];
+			      Ts = To + Tr;
+			      Tz = FNMS(KP500000000, Tr, To);
+			      T8 = ri[WS(is, 2)];
+			      Tt = ii[WS(is, 6)];
+			      Tu = ii[WS(is, 10)];
+			      Tv = ii[WS(is, 2)];
+			      T9 = T7 + T8;
+			      TD = T8 - T7;
+			 }
+			 {
+			      E Tc, T1a, TH, TI, Tf, Td, Te, Tw, Ti, Tj, T1b;
+			      Tc = ri[WS(is, 3)];
+			      TV = Tu - Tv;
+			      Tw = Tu + Tv;
+			      TU = FNMS(KP500000000, T9, T6);
+			      Ta = T6 + T9;
+			      Td = ri[WS(is, 7)];
+			      Tx = Tt + Tw;
+			      TC = FNMS(KP500000000, Tw, Tt);
+			      Te = ri[WS(is, 11)];
+			      T1a = ii[WS(is, 3)];
+			      TH = ii[WS(is, 7)];
+			      TI = ii[WS(is, 11)];
+			      Tf = Td + Te;
+			      T1d = Te - Td;
+			      Th = ri[WS(is, 9)];
+			      TJ = TH - TI;
+			      T1b = TH + TI;
+			      TG = FNMS(KP500000000, Tf, Tc);
+			      Tg = Tc + Tf;
+			      Ti = ri[WS(is, 1)];
+			      T1u = T1a + T1b;
+			      T1c = FNMS(KP500000000, T1b, T1a);
+			      Tj = ri[WS(is, 5)];
+			      T1f = ii[WS(is, 9)];
+			      TM = ii[WS(is, 1)];
+			      TN = ii[WS(is, 5)];
+			      Tk = Ti + Tj;
+			      T1i = Tj - Ti;
+			 }
+		    }
+		    {
+			 E T1t, TO, TL, T1h, T1w, Tb, T1g, Tl;
+			 T1t = T5 - Ta;
+			 Tb = T5 + Ta;
+			 TO = TM - TN;
+			 T1g = TM + TN;
+			 TL = FNMS(KP500000000, Tk, Th);
+			 Tl = Th + Tk;
+			 {
+			      E T1x, Ty, T1v, Tn, Tm, T1y;
+			      T1x = Ts + Tx;
+			      Ty = Ts - Tx;
+			      T1v = T1f + T1g;
+			      T1h = FNMS(KP500000000, T1g, T1f);
+			      Tn = Tg - Tl;
+			      Tm = Tg + Tl;
+			      T1y = T1u + T1v;
+			      T1w = T1u - T1v;
+			      ro[0] = Tb + Tm;
+			      ro[WS(os, 6)] = Tb - Tm;
+			      io[WS(os, 3)] = Tn + Ty;
+			      io[0] = T1x + T1y;
+			      io[WS(os, 6)] = T1x - T1y;
+			      io[WS(os, 9)] = Ty - Tn;
+			 }
+			 {
+			      E TB, TE, T1o, T11, T1p, TK, TP, T15, T1k, T18, T14, T16, T1l, T1m;
+			      {
+				   E T1e, T1j, TZ, T10, T12, T13;
+				   TB = FNMS(KP866025403, TA, Tz);
+				   TZ = FMA(KP866025403, TA, Tz);
+				   T10 = FMA(KP866025403, TD, TC);
+				   TE = FNMS(KP866025403, TD, TC);
+				   T1o = FNMS(KP866025403, T1d, T1c);
+				   T1e = FMA(KP866025403, T1d, T1c);
+				   ro[WS(os, 9)] = T1t + T1w;
+				   ro[WS(os, 3)] = T1t - T1w;
+				   T1l = TZ + T10;
+				   T11 = TZ - T10;
+				   T1j = FMA(KP866025403, T1i, T1h);
+				   T1p = FNMS(KP866025403, T1i, T1h);
+				   TK = FNMS(KP866025403, TJ, TG);
+				   T12 = FMA(KP866025403, TJ, TG);
+				   T13 = FMA(KP866025403, TO, TL);
+				   TP = FNMS(KP866025403, TO, TL);
+				   TT = FNMS(KP866025403, TS, TR);
+				   T15 = FMA(KP866025403, TS, TR);
+				   T1m = T1e + T1j;
+				   T1k = T1e - T1j;
+				   T18 = T12 + T13;
+				   T14 = T12 - T13;
+				   T16 = FMA(KP866025403, TV, TU);
+				   TW = FNMS(KP866025403, TV, TU);
+			      }
+			      io[WS(os, 10)] = T1l - T1m;
+			      io[WS(os, 4)] = T1l + T1m;
+			      io[WS(os, 7)] = T11 + T14;
+			      io[WS(os, 1)] = T11 - T14;
+			      {
+				   E T17, T19, T1r, T1s;
+				   T17 = T15 + T16;
+				   T19 = T15 - T16;
+				   ro[WS(os, 7)] = T19 - T1k;
+				   ro[WS(os, 1)] = T19 + T1k;
+				   ro[WS(os, 4)] = T17 + T18;
+				   ro[WS(os, 10)] = T17 - T18;
+				   T1r = TB + TE;
+				   TF = TB - TE;
+				   T1s = T1o + T1p;
+				   T1q = T1o - T1p;
+				   TY = TK + TP;
+				   TQ = TK - TP;
+				   io[WS(os, 2)] = T1r - T1s;
+				   io[WS(os, 8)] = T1r + T1s;
+			      }
+			 }
+		    }
+	       }
+	       io[WS(os, 11)] = TF + TQ;
+	       io[WS(os, 5)] = TF - TQ;
+	       TX = TT + TW;
+	       T1n = TT - TW;
+	       ro[WS(os, 11)] = T1n - T1q;
+	       ro[WS(os, 5)] = T1n + T1q;
+	       ro[WS(os, 8)] = TX + TY;
+	       ro[WS(os, 2)] = TX - TY;
+	  }
+     }
+}
+
+static const kdft_desc desc = { 12, "n1_12", {72, 0, 24, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_12) (planner *p) {
+     X(kdft_register) (p, n1_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include n.h */
+
+/*
+ * This function contains 96 FP additions, 16 FP multiplications,
+ * (or, 88 additions, 8 multiplications, 8 fused multiply/add),
+ * 43 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "n.h"
+
+static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
+	       E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
+	       E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 4)];
+		    T3 = ri[WS(is, 8)];
+		    T4 = T2 + T3;
+		    T5 = T1 + T4;
+		    TR = FNMS(KP500000000, T4, T1);
+		    TA = KP866025403 * (T3 - T2);
+	       }
+	       {
+		    E To, Tp, Tq, Tr;
+		    To = ii[0];
+		    Tp = ii[WS(is, 4)];
+		    Tq = ii[WS(is, 8)];
+		    Tr = Tp + Tq;
+		    Ts = To + Tr;
+		    TS = KP866025403 * (Tp - Tq);
+		    Tz = FNMS(KP500000000, Tr, To);
+	       }
+	       {
+		    E T6, T7, T8, T9;
+		    T6 = ri[WS(is, 6)];
+		    T7 = ri[WS(is, 10)];
+		    T8 = ri[WS(is, 2)];
+		    T9 = T7 + T8;
+		    Ta = T6 + T9;
+		    TU = FNMS(KP500000000, T9, T6);
+		    TD = KP866025403 * (T8 - T7);
+	       }
+	       {
+		    E Tt, Tu, Tv, Tw;
+		    Tt = ii[WS(is, 6)];
+		    Tu = ii[WS(is, 10)];
+		    Tv = ii[WS(is, 2)];
+		    Tw = Tu + Tv;
+		    Tx = Tt + Tw;
+		    TV = KP866025403 * (Tu - Tv);
+		    TC = FNMS(KP500000000, Tw, Tt);
+	       }
+	       {
+		    E Tc, Td, Te, Tf;
+		    Tc = ri[WS(is, 3)];
+		    Td = ri[WS(is, 7)];
+		    Te = ri[WS(is, 11)];
+		    Tf = Td + Te;
+		    Tg = Tc + Tf;
+		    T1a = KP866025403 * (Te - Td);
+		    TG = FNMS(KP500000000, Tf, Tc);
+	       }
+	       {
+		    E T1b, TH, TI, T1c;
+		    T1b = ii[WS(is, 3)];
+		    TH = ii[WS(is, 7)];
+		    TI = ii[WS(is, 11)];
+		    T1c = TH + TI;
+		    TJ = KP866025403 * (TH - TI);
+		    T1u = T1b + T1c;
+		    T1d = FNMS(KP500000000, T1c, T1b);
+	       }
+	       {
+		    E Th, Ti, Tj, Tk;
+		    Th = ri[WS(is, 9)];
+		    Ti = ri[WS(is, 1)];
+		    Tj = ri[WS(is, 5)];
+		    Tk = Ti + Tj;
+		    Tl = Th + Tk;
+		    T1f = KP866025403 * (Tj - Ti);
+		    TL = FNMS(KP500000000, Tk, Th);
+	       }
+	       {
+		    E T1g, TM, TN, T1h;
+		    T1g = ii[WS(is, 9)];
+		    TM = ii[WS(is, 1)];
+		    TN = ii[WS(is, 5)];
+		    T1h = TM + TN;
+		    TO = KP866025403 * (TM - TN);
+		    T1v = T1g + T1h;
+		    T1i = FNMS(KP500000000, T1h, T1g);
+	       }
+	       {
+		    E Tb, Tm, T1t, T1w;
+		    Tb = T5 + Ta;
+		    Tm = Tg + Tl;
+		    ro[WS(os, 6)] = Tb - Tm;
+		    ro[0] = Tb + Tm;
+		    {
+			 E T1x, T1y, Tn, Ty;
+			 T1x = Ts + Tx;
+			 T1y = T1u + T1v;
+			 io[WS(os, 6)] = T1x - T1y;
+			 io[0] = T1x + T1y;
+			 Tn = Tg - Tl;
+			 Ty = Ts - Tx;
+			 io[WS(os, 3)] = Tn + Ty;
+			 io[WS(os, 9)] = Ty - Tn;
+		    }
+		    T1t = T5 - Ta;
+		    T1w = T1u - T1v;
+		    ro[WS(os, 3)] = T1t - T1w;
+		    ro[WS(os, 9)] = T1t + T1w;
+		    {
+			 E T11, T1l, T1k, T1m, T14, T18, T17, T19;
+			 {
+			      E TZ, T10, T1e, T1j;
+			      TZ = TA + Tz;
+			      T10 = TD + TC;
+			      T11 = TZ - T10;
+			      T1l = TZ + T10;
+			      T1e = T1a + T1d;
+			      T1j = T1f + T1i;
+			      T1k = T1e - T1j;
+			      T1m = T1e + T1j;
+			 }
+			 {
+			      E T12, T13, T15, T16;
+			      T12 = TG + TJ;
+			      T13 = TL + TO;
+			      T14 = T12 - T13;
+			      T18 = T12 + T13;
+			      T15 = TR + TS;
+			      T16 = TU + TV;
+			      T17 = T15 + T16;
+			      T19 = T15 - T16;
+			 }
+			 io[WS(os, 1)] = T11 - T14;
+			 ro[WS(os, 1)] = T19 + T1k;
+			 io[WS(os, 7)] = T11 + T14;
+			 ro[WS(os, 7)] = T19 - T1k;
+			 ro[WS(os, 10)] = T17 - T18;
+			 io[WS(os, 10)] = T1l - T1m;
+			 ro[WS(os, 4)] = T17 + T18;
+			 io[WS(os, 4)] = T1l + T1m;
+		    }
+		    {
+			 E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
+			 {
+			      E TB, TE, T1o, T1p;
+			      TB = Tz - TA;
+			      TE = TC - TD;
+			      TF = TB - TE;
+			      T1r = TB + TE;
+			      T1o = T1d - T1a;
+			      T1p = T1i - T1f;
+			      T1q = T1o - T1p;
+			      T1s = T1o + T1p;
+			 }
+			 {
+			      E TK, TP, TT, TW;
+			      TK = TG - TJ;
+			      TP = TL - TO;
+			      TQ = TK - TP;
+			      TY = TK + TP;
+			      TT = TR - TS;
+			      TW = TU - TV;
+			      TX = TT + TW;
+			      T1n = TT - TW;
+			 }
+			 io[WS(os, 5)] = TF - TQ;
+			 ro[WS(os, 5)] = T1n + T1q;
+			 io[WS(os, 11)] = TF + TQ;
+			 ro[WS(os, 11)] = T1n - T1q;
+			 ro[WS(os, 2)] = TX - TY;
+			 io[WS(os, 2)] = T1r - T1s;
+			 ro[WS(os, 8)] = TX + TY;
+			 io[WS(os, 8)] = T1r + T1s;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 12, "n1_12", {88, 8, 8, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_12) (planner *p) {
+     X(kdft_register) (p, n1_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include n.h */
+
+/*
+ * This function contains 176 FP additions, 114 FP multiplications,
+ * (or, 62 additions, 0 multiplications, 114 fused multiply/add),
+ * 87 stack variables, 25 constants, and 52 memory accesses
+ */
+#include "n.h"
+
+static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP875502302, +0.875502302409147941146295545768755143177842006);
+     DK(KP520028571, +0.520028571888864619117130500499232802493238139);
+     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DK(KP600477271, +0.600477271932665282925769253334763009352012849);
+     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DK(KP516520780, +0.516520780623489722840901288569017135705033622);
+     DK(KP968287244, +0.968287244361984016049539446938120421179794516);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DK(KP581704778, +0.581704778510515730456870384989698884939833902);
+     DK(KP859542535, +0.859542535098774820163672132761689612766401925);
+     DK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DK(KP957805992, +0.957805992594665126462521754605754580515587217);
+     DK(KP522026385, +0.522026385161275033714027226654165028300441940);
+     DK(KP853480001, +0.853480001859823990758994934970528322872359049);
+     DK(KP769338817, +0.769338817572980603471413688209101117038278899);
+     DK(KP612264650, +0.612264650376756543746494474777125408779395514);
+     DK(KP038632954, +0.038632954644348171955506895830342264440241080);
+     DK(KP302775637, +0.302775637731994646559610633735247973125648287);
+     DK(KP514918778, +0.514918778086315755491789696138117261566051239);
+     DK(KP686558370, +0.686558370781754340655719594850823015421401653);
+     DK(KP226109445, +0.226109445035782405468510155372505010481906348);
+     DK(KP301479260, +0.301479260047709873958013540496673347309208464);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
+	       E T2B, T2H, T2I, T2G;
+	       {
+		    E T1, T1P, T2n, T2o, To, TH, T2h, T2k, TE, TB, TF, Tw, T2j, T2c, T1m;
+		    E T1W, T1X, T1c, T19, T1j, T12, T1f, T21, T24, T27, T1U;
+		    T1 = ri[0];
+		    T1P = ii[0];
+		    {
+			 E T2b, Tv, Ts, T2a;
+			 {
+			      E T2d, Tf, Tq, Ty, Tb, Tr, T6, Tx, Ti, Tt, Tu, Tl;
+			      {
+				   E T7, T8, T9, Td, Te;
+				   Td = ri[WS(is, 8)];
+				   Te = ri[WS(is, 5)];
+				   T7 = ri[WS(is, 12)];
+				   T8 = ri[WS(is, 10)];
+				   T9 = ri[WS(is, 4)];
+				   T2d = Td - Te;
+				   Tf = Td + Te;
+				   {
+					E T2, Ta, T3, T4;
+					T2 = ri[WS(is, 1)];
+					Ta = T8 + T9;
+					Tq = T8 - T9;
+					T3 = ri[WS(is, 3)];
+					T4 = ri[WS(is, 9)];
+					{
+					     E Tg, T5, Th, Tj, Tk;
+					     Tg = ri[WS(is, 11)];
+					     Ty = FMS(KP500000000, Ta, T7);
+					     Tb = T7 + Ta;
+					     Tr = T4 - T3;
+					     T5 = T3 + T4;
+					     Th = ri[WS(is, 6)];
+					     Tj = ri[WS(is, 7)];
+					     Tk = ri[WS(is, 2)];
+					     T6 = T2 + T5;
+					     Tx = FNMS(KP500000000, T5, T2);
+					     Ti = Tg + Th;
+					     Tt = Tg - Th;
+					     Tu = Tj - Tk;
+					     Tl = Tj + Tk;
+					}
+				   }
+			      }
+			      {
+				   E Tc, Tm, T2e, T2g;
+				   Tc = T6 + Tb;
+				   T2n = T6 - Tb;
+				   T2b = Ti - Tl;
+				   Tm = Ti + Tl;
+				   T2e = Tt + Tu;
+				   Tv = Tt - Tu;
+				   Ts = Tq - Tr;
+				   T2g = Tr + Tq;
+				   {
+					E Tz, TA, Tn, T2f;
+					Tz = Tx - Ty;
+					T2a = Tx + Ty;
+					TA = FNMS(KP500000000, Tm, Tf);
+					Tn = Tf + Tm;
+					T2f = FNMS(KP500000000, T2e, T2d);
+					T2o = T2d + T2e;
+					To = Tc + Tn;
+					TH = Tc - Tn;
+					T2h = FMA(KP866025403, T2g, T2f);
+					T2k = FNMS(KP866025403, T2g, T2f);
+					TE = Tz - TA;
+					TB = Tz + TA;
+				   }
+			      }
+			 }
+			 {
+			      E T1R, TM, T10, T18, T1l, TX, T1k, T15, TP, T1a, T1b, TS;
+			      {
+				   E T16, TY, TZ, TK, TL;
+				   TK = ii[WS(is, 8)];
+				   TF = Ts - Tv;
+				   Tw = Ts + Tv;
+				   T2j = FNMS(KP866025403, T2b, T2a);
+				   T2c = FMA(KP866025403, T2b, T2a);
+				   TL = ii[WS(is, 5)];
+				   T16 = ii[WS(is, 12)];
+				   TY = ii[WS(is, 10)];
+				   TZ = ii[WS(is, 4)];
+				   T1R = TK + TL;
+				   TM = TK - TL;
+				   {
+					E T13, T17, TV, TW;
+					T13 = ii[WS(is, 1)];
+					T17 = TY + TZ;
+					T10 = TY - TZ;
+					TV = ii[WS(is, 9)];
+					TW = ii[WS(is, 3)];
+					{
+					     E TN, T14, TO, TQ, TR;
+					     TN = ii[WS(is, 11)];
+					     T18 = FMS(KP500000000, T17, T16);
+					     T1l = T16 + T17;
+					     TX = TV - TW;
+					     T14 = TW + TV;
+					     TO = ii[WS(is, 6)];
+					     TQ = ii[WS(is, 7)];
+					     TR = ii[WS(is, 2)];
+					     T1k = T13 + T14;
+					     T15 = FNMS(KP500000000, T14, T13);
+					     TP = TN - TO;
+					     T1a = TN + TO;
+					     T1b = TQ + TR;
+					     TS = TQ - TR;
+					}
+				   }
+			      }
+			      {
+				   E T1Q, T11, TT, T1S;
+				   T1Q = T1k + T1l;
+				   T1m = T1k - T1l;
+				   T11 = TX + T10;
+				   T1W = T10 - TX;
+				   T1X = TP - TS;
+				   TT = TP + TS;
+				   T1S = T1a + T1b;
+				   T1c = T1a - T1b;
+				   {
+					E T1Z, TU, T1T, T20;
+					T19 = T15 + T18;
+					T1Z = T15 - T18;
+					T1j = TM + TT;
+					TU = FNMS(KP500000000, TT, TM);
+					T1T = T1R + T1S;
+					T20 = FNMS(KP500000000, T1S, T1R);
+					T12 = FMA(KP866025403, T11, TU);
+					T1f = FNMS(KP866025403, T11, TU);
+					T21 = T1Z + T20;
+					T24 = T1Z - T20;
+					T27 = T1Q - T1T;
+					T1U = T1Q + T1T;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1g, T1d, T25, T1Y;
+			 ro[0] = T1 + To;
+			 T1g = FNMS(KP866025403, T1c, T19);
+			 T1d = FMA(KP866025403, T1c, T19);
+			 T25 = T1W - T1X;
+			 T1Y = T1W + T1X;
+			 io[0] = T1P + T1U;
+			 {
+			      E T1C, T1B, T1F, T1K;
+			      {
+				   E TC, T1J, T1z, T1w, T1I, T1O, Tp, T1E, T1q, TI, T1o, T1s;
+				   {
+					E TG, T1n, T1G, T1u, T1e, T1h, T1v, T1x, T1y, T1H, T1i;
+					TC = FMA(KP301479260, TB, Tw);
+					T1x = FNMS(KP226109445, Tw, TB);
+					T1y = FMA(KP686558370, TE, TF);
+					TG = FNMS(KP514918778, TF, TE);
+					T1n = FNMS(KP302775637, T1m, T1j);
+					T1G = FMA(KP302775637, T1j, T1m);
+					T1u = FNMS(KP038632954, T12, T1d);
+					T1e = FMA(KP038632954, T1d, T12);
+					T1h = FMA(KP612264650, T1g, T1f);
+					T1v = FNMS(KP612264650, T1f, T1g);
+					T1J = FMA(KP769338817, T1y, T1x);
+					T1z = FNMS(KP769338817, T1y, T1x);
+					T1H = FNMS(KP853480001, T1v, T1u);
+					T1w = FMA(KP853480001, T1v, T1u);
+					T1I = FNMS(KP522026385, T1H, T1G);
+					T1O = FMA(KP957805992, T1G, T1H);
+					Tp = FNMS(KP083333333, To, T1);
+					T1E = FMA(KP853480001, T1h, T1e);
+					T1i = FNMS(KP853480001, T1h, T1e);
+					T1q = FNMS(KP859542535, TG, TH);
+					TI = FMA(KP581704778, TH, TG);
+					T1o = FMA(KP957805992, T1n, T1i);
+					T1s = FNMS(KP522026385, T1i, T1n);
+				   }
+				   {
+					E T1A, T1D, T1t, T1L, T1M;
+					{
+					     E T1p, TD, TJ, T1N, T1r;
+					     T1p = FNMS(KP251768516, TC, Tp);
+					     TD = FMA(KP503537032, TC, Tp);
+					     T1C = FNMS(KP968287244, T1z, T1w);
+					     T1A = FMA(KP968287244, T1z, T1w);
+					     TJ = FMA(KP516520780, TI, TD);
+					     T1N = FNMS(KP516520780, TI, TD);
+					     T1D = FNMS(KP300462606, T1q, T1p);
+					     T1r = FMA(KP300462606, T1q, T1p);
+					     ro[WS(os, 8)] = FNMS(KP600477271, T1O, T1N);
+					     ro[WS(os, 12)] = FMA(KP600477271, T1o, TJ);
+					     ro[WS(os, 1)] = FNMS(KP600477271, T1o, TJ);
+					     T1t = FNMS(KP575140729, T1s, T1r);
+					     T1B = FMA(KP575140729, T1s, T1r);
+					     ro[WS(os, 5)] = FMA(KP600477271, T1O, T1N);
+					}
+					T1L = FNMS(KP520028571, T1E, T1D);
+					T1F = FMA(KP520028571, T1E, T1D);
+					T1K = FMA(KP875502302, T1J, T1I);
+					T1M = FNMS(KP875502302, T1J, T1I);
+					ro[WS(os, 3)] = FMA(KP520028571, T1A, T1t);
+					ro[WS(os, 9)] = FNMS(KP520028571, T1A, T1t);
+					ro[WS(os, 6)] = FMA(KP575140729, T1M, T1L);
+					ro[WS(os, 11)] = FNMS(KP575140729, T1M, T1L);
+				   }
+			      }
+			      {
+				   E T22, T2F, T2N, T2K, T2w, T2A, T1V, T2C, T28, T2y, T2M, T2q;
+				   {
+					E T26, T2v, T2p, T2i, T2s, T2t, T2l, T2D, T2E, T2u, T2m;
+					T2D = FNMS(KP226109445, T1Y, T21);
+					T22 = FMA(KP301479260, T21, T1Y);
+					ro[WS(os, 2)] = FMA(KP575140729, T1K, T1F);
+					ro[WS(os, 7)] = FNMS(KP575140729, T1K, T1F);
+					ro[WS(os, 4)] = FMA(KP520028571, T1C, T1B);
+					ro[WS(os, 10)] = FNMS(KP520028571, T1C, T1B);
+					T26 = FNMS(KP514918778, T25, T24);
+					T2E = FMA(KP686558370, T24, T25);
+					T2v = FNMS(KP302775637, T2n, T2o);
+					T2p = FMA(KP302775637, T2o, T2n);
+					T2i = FNMS(KP038632954, T2h, T2c);
+					T2s = FMA(KP038632954, T2c, T2h);
+					T2t = FMA(KP612264650, T2j, T2k);
+					T2l = FNMS(KP612264650, T2k, T2j);
+					T2F = FNMS(KP769338817, T2E, T2D);
+					T2N = FMA(KP769338817, T2E, T2D);
+					T2K = FMA(KP853480001, T2t, T2s);
+					T2u = FNMS(KP853480001, T2t, T2s);
+					T2w = FMA(KP957805992, T2v, T2u);
+					T2A = FNMS(KP522026385, T2u, T2v);
+					T1V = FNMS(KP083333333, T1U, T1P);
+					T2m = FNMS(KP853480001, T2l, T2i);
+					T2C = FMA(KP853480001, T2l, T2i);
+					T28 = FMA(KP581704778, T27, T26);
+					T2y = FNMS(KP859542535, T26, T27);
+					T2M = FNMS(KP522026385, T2m, T2p);
+					T2q = FMA(KP957805992, T2p, T2m);
+				   }
+				   {
+					E T2O, T2Q, T2z, T2P, T2L;
+					{
+					     E T23, T2x, T2r, T29, T2J;
+					     T23 = FMA(KP503537032, T22, T1V);
+					     T2x = FNMS(KP251768516, T22, T1V);
+					     T2O = FNMS(KP875502302, T2N, T2M);
+					     T2Q = FMA(KP875502302, T2N, T2M);
+					     T2r = FMA(KP516520780, T28, T23);
+					     T29 = FNMS(KP516520780, T28, T23);
+					     T2z = FMA(KP300462606, T2y, T2x);
+					     T2J = FNMS(KP300462606, T2y, T2x);
+					     io[WS(os, 12)] = FNMS(KP600477271, T2w, T2r);
+					     io[WS(os, 1)] = FMA(KP600477271, T2w, T2r);
+					     io[WS(os, 8)] = FMA(KP600477271, T2q, T29);
+					     io[WS(os, 5)] = FNMS(KP600477271, T2q, T29);
+					     T2P = FMA(KP520028571, T2K, T2J);
+					     T2L = FNMS(KP520028571, T2K, T2J);
+					}
+					T2B = FMA(KP575140729, T2A, T2z);
+					T2H = FNMS(KP575140729, T2A, T2z);
+					io[WS(os, 11)] = FMA(KP575140729, T2Q, T2P);
+					io[WS(os, 6)] = FNMS(KP575140729, T2Q, T2P);
+					io[WS(os, 7)] = FMA(KP575140729, T2O, T2L);
+					io[WS(os, 2)] = FNMS(KP575140729, T2O, T2L);
+					T2I = FMA(KP968287244, T2F, T2C);
+					T2G = FNMS(KP968287244, T2F, T2C);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       io[WS(os, 10)] = FMA(KP520028571, T2I, T2H);
+	       io[WS(os, 4)] = FNMS(KP520028571, T2I, T2H);
+	       io[WS(os, 9)] = FMA(KP520028571, T2G, T2B);
+	       io[WS(os, 3)] = FNMS(KP520028571, T2G, T2B);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 13, "n1_13", {62, 0, 114, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_13) (planner *p) {
+     X(kdft_register) (p, n1_13, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include n.h */
+
+/*
+ * This function contains 176 FP additions, 68 FP multiplications,
+ * (or, 138 additions, 30 multiplications, 38 fused multiply/add),
+ * 71 stack variables, 20 constants, and 52 memory accesses
+ */
+#include "n.h"
+
+static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DK(KP075902986, +0.075902986037193865983102897245103540356428373);
+     DK(KP132983124, +0.132983124607418643793760531921092974399165133);
+     DK(KP258260390, +0.258260390311744861420450644284508567852516811);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP300238635, +0.300238635966332641462884626667381504676006424);
+     DK(KP011599105, +0.011599105605768290721655456654083252189827041);
+     DK(KP156891391, +0.156891391051584611046832726756003269660212636);
+     DK(KP256247671, +0.256247671582936600958684654061725059144125175);
+     DK(KP174138601, +0.174138601152135905005660794929264742616964676);
+     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP113854479, +0.113854479055790798974654345867655310534642560);
+     DK(KP265966249, +0.265966249214837287587521063842185948798330267);
+     DK(KP387390585, +0.387390585467617292130675966426762851778775217);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
+	       E T1, T1q, Tt, Tu, To, T22, T20, T24, TF, TH, TA, TI, T1X, T25, T2a;
+	       E T2d, T18, T1n, T2k, T2n, T1l, T1r, T1f, T1o, T2h, T2m;
+	       T1 = ri[0];
+	       T1q = ii[0];
+	       {
+		    E Tf, Tp, Tb, TC, Tx, T6, TB, Tw, Ti, Tq, Tl, Tr, Tm, Ts, Td;
+		    E Te, Tc, Tn;
+		    Td = ri[WS(is, 8)];
+		    Te = ri[WS(is, 5)];
+		    Tf = Td + Te;
+		    Tp = Td - Te;
+		    {
+			 E T7, T8, T9, Ta;
+			 T7 = ri[WS(is, 12)];
+			 T8 = ri[WS(is, 10)];
+			 T9 = ri[WS(is, 4)];
+			 Ta = T8 + T9;
+			 Tb = T7 + Ta;
+			 TC = T8 - T9;
+			 Tx = FNMS(KP500000000, Ta, T7);
+		    }
+		    {
+			 E T2, T3, T4, T5;
+			 T2 = ri[WS(is, 1)];
+			 T3 = ri[WS(is, 3)];
+			 T4 = ri[WS(is, 9)];
+			 T5 = T3 + T4;
+			 T6 = T2 + T5;
+			 TB = T3 - T4;
+			 Tw = FNMS(KP500000000, T5, T2);
+		    }
+		    {
+			 E Tg, Th, Tj, Tk;
+			 Tg = ri[WS(is, 11)];
+			 Th = ri[WS(is, 6)];
+			 Ti = Tg + Th;
+			 Tq = Tg - Th;
+			 Tj = ri[WS(is, 7)];
+			 Tk = ri[WS(is, 2)];
+			 Tl = Tj + Tk;
+			 Tr = Tj - Tk;
+		    }
+		    Tm = Ti + Tl;
+		    Ts = Tq + Tr;
+		    Tt = Tp + Ts;
+		    Tu = T6 - Tb;
+		    Tc = T6 + Tb;
+		    Tn = Tf + Tm;
+		    To = Tc + Tn;
+		    T22 = KP300462606 * (Tc - Tn);
+		    {
+			 E T1Y, T1Z, TD, TE;
+			 T1Y = TB + TC;
+			 T1Z = Tq - Tr;
+			 T20 = T1Y - T1Z;
+			 T24 = T1Y + T1Z;
+			 TD = KP866025403 * (TB - TC);
+			 TE = FNMS(KP500000000, Ts, Tp);
+			 TF = TD - TE;
+			 TH = TD + TE;
+		    }
+		    {
+			 E Ty, Tz, T1V, T1W;
+			 Ty = Tw - Tx;
+			 Tz = KP866025403 * (Ti - Tl);
+			 TA = Ty + Tz;
+			 TI = Ty - Tz;
+			 T1V = Tw + Tx;
+			 T1W = FNMS(KP500000000, Tm, Tf);
+			 T1X = T1V - T1W;
+			 T25 = T1V + T1W;
+		    }
+	       }
+	       {
+		    E TZ, T2b, TV, T1i, T1a, TQ, T1h, T19, T12, T1d, T15, T1c, T16, T2c, TX;
+		    E TY, TW, T17;
+		    TX = ii[WS(is, 8)];
+		    TY = ii[WS(is, 5)];
+		    TZ = TX + TY;
+		    T2b = TX - TY;
+		    {
+			 E TR, TS, TT, TU;
+			 TR = ii[WS(is, 12)];
+			 TS = ii[WS(is, 10)];
+			 TT = ii[WS(is, 4)];
+			 TU = TS + TT;
+			 TV = FNMS(KP500000000, TU, TR);
+			 T1i = TR + TU;
+			 T1a = TS - TT;
+		    }
+		    {
+			 E TM, TN, TO, TP;
+			 TM = ii[WS(is, 1)];
+			 TN = ii[WS(is, 3)];
+			 TO = ii[WS(is, 9)];
+			 TP = TN + TO;
+			 TQ = FNMS(KP500000000, TP, TM);
+			 T1h = TM + TP;
+			 T19 = TN - TO;
+		    }
+		    {
+			 E T10, T11, T13, T14;
+			 T10 = ii[WS(is, 11)];
+			 T11 = ii[WS(is, 6)];
+			 T12 = T10 + T11;
+			 T1d = T10 - T11;
+			 T13 = ii[WS(is, 7)];
+			 T14 = ii[WS(is, 2)];
+			 T15 = T13 + T14;
+			 T1c = T13 - T14;
+		    }
+		    T16 = T12 + T15;
+		    T2c = T1d + T1c;
+		    T2a = T1h - T1i;
+		    T2d = T2b + T2c;
+		    TW = TQ + TV;
+		    T17 = FNMS(KP500000000, T16, TZ);
+		    T18 = TW - T17;
+		    T1n = TW + T17;
+		    {
+			 E T2i, T2j, T1j, T1k;
+			 T2i = TQ - TV;
+			 T2j = KP866025403 * (T15 - T12);
+			 T2k = T2i + T2j;
+			 T2n = T2i - T2j;
+			 T1j = T1h + T1i;
+			 T1k = TZ + T16;
+			 T1l = KP300462606 * (T1j - T1k);
+			 T1r = T1j + T1k;
+		    }
+		    {
+			 E T1b, T1e, T2f, T2g;
+			 T1b = T19 + T1a;
+			 T1e = T1c - T1d;
+			 T1f = T1b + T1e;
+			 T1o = T1e - T1b;
+			 T2f = FNMS(KP500000000, T2c, T2b);
+			 T2g = KP866025403 * (T1a - T19);
+			 T2h = T2f - T2g;
+			 T2m = T2g + T2f;
+		    }
+	       }
+	       ro[0] = T1 + To;
+	       io[0] = T1q + T1r;
+	       {
+		    E T1D, T1N, T1y, T1x, T1E, T1O, Tv, TK, T1J, T1Q, T1m, T1R, T1t, T1I, TG;
+		    E TJ;
+		    {
+			 E T1B, T1C, T1v, T1w;
+			 T1B = FMA(KP387390585, T1f, KP265966249 * T18);
+			 T1C = FMA(KP113854479, T1o, KP503537032 * T1n);
+			 T1D = T1B + T1C;
+			 T1N = T1C - T1B;
+			 T1y = FMA(KP575140729, Tu, KP174138601 * Tt);
+			 T1v = FNMS(KP156891391, TH, KP256247671 * TI);
+			 T1w = FMA(KP011599105, TF, KP300238635 * TA);
+			 T1x = T1v - T1w;
+			 T1E = T1y + T1x;
+			 T1O = KP1_732050807 * (T1v + T1w);
+		    }
+		    Tv = FNMS(KP174138601, Tu, KP575140729 * Tt);
+		    TG = FNMS(KP300238635, TF, KP011599105 * TA);
+		    TJ = FMA(KP256247671, TH, KP156891391 * TI);
+		    TK = TG - TJ;
+		    T1J = KP1_732050807 * (TJ + TG);
+		    T1Q = Tv - TK;
+		    {
+			 E T1g, T1H, T1p, T1s, T1G;
+			 T1g = FNMS(KP132983124, T1f, KP258260390 * T18);
+			 T1H = T1l - T1g;
+			 T1p = FNMS(KP251768516, T1o, KP075902986 * T1n);
+			 T1s = FNMS(KP083333333, T1r, T1q);
+			 T1G = T1s - T1p;
+			 T1m = FMA(KP2_000000000, T1g, T1l);
+			 T1R = T1H + T1G;
+			 T1t = FMA(KP2_000000000, T1p, T1s);
+			 T1I = T1G - T1H;
+		    }
+		    {
+			 E TL, T1u, T1P, T1S;
+			 TL = FMA(KP2_000000000, TK, Tv);
+			 T1u = T1m + T1t;
+			 io[WS(os, 1)] = TL + T1u;
+			 io[WS(os, 12)] = T1u - TL;
+			 {
+			      E T1z, T1A, T1T, T1U;
+			      T1z = FMS(KP2_000000000, T1x, T1y);
+			      T1A = T1t - T1m;
+			      io[WS(os, 5)] = T1z + T1A;
+			      io[WS(os, 8)] = T1A - T1z;
+			      T1T = T1R - T1Q;
+			      T1U = T1O + T1N;
+			      io[WS(os, 4)] = T1T - T1U;
+			      io[WS(os, 10)] = T1U + T1T;
+			 }
+			 T1P = T1N - T1O;
+			 T1S = T1Q + T1R;
+			 io[WS(os, 3)] = T1P + T1S;
+			 io[WS(os, 9)] = T1S - T1P;
+			 {
+			      E T1L, T1M, T1F, T1K;
+			      T1L = T1J + T1I;
+			      T1M = T1E + T1D;
+			      io[WS(os, 6)] = T1L - T1M;
+			      io[WS(os, 11)] = T1M + T1L;
+			      T1F = T1D - T1E;
+			      T1K = T1I - T1J;
+			      io[WS(os, 2)] = T1F + T1K;
+			      io[WS(os, 7)] = T1K - T1F;
+			 }
+		    }
+	       }
+	       {
+		    E T2y, T2I, T2J, T2K, T2B, T2L, T2e, T2p, T2u, T2G, T23, T2F, T28, T2t, T2l;
+		    E T2o;
+		    {
+			 E T2w, T2x, T2z, T2A;
+			 T2w = FMA(KP387390585, T20, KP265966249 * T1X);
+			 T2x = FNMS(KP503537032, T25, KP113854479 * T24);
+			 T2y = T2w + T2x;
+			 T2I = T2w - T2x;
+			 T2J = FMA(KP575140729, T2a, KP174138601 * T2d);
+			 T2z = FNMS(KP300238635, T2n, KP011599105 * T2m);
+			 T2A = FNMS(KP156891391, T2h, KP256247671 * T2k);
+			 T2K = T2z + T2A;
+			 T2B = KP1_732050807 * (T2z - T2A);
+			 T2L = T2J + T2K;
+		    }
+		    T2e = FNMS(KP575140729, T2d, KP174138601 * T2a);
+		    T2l = FMA(KP256247671, T2h, KP156891391 * T2k);
+		    T2o = FMA(KP300238635, T2m, KP011599105 * T2n);
+		    T2p = T2l - T2o;
+		    T2u = T2e - T2p;
+		    T2G = KP1_732050807 * (T2o + T2l);
+		    {
+			 E T21, T2r, T26, T27, T2s;
+			 T21 = FNMS(KP132983124, T20, KP258260390 * T1X);
+			 T2r = T22 - T21;
+			 T26 = FMA(KP251768516, T24, KP075902986 * T25);
+			 T27 = FNMS(KP083333333, To, T1);
+			 T2s = T27 - T26;
+			 T23 = FMA(KP2_000000000, T21, T22);
+			 T2F = T2s - T2r;
+			 T28 = FMA(KP2_000000000, T26, T27);
+			 T2t = T2r + T2s;
+		    }
+		    {
+			 E T29, T2q, T2N, T2O;
+			 T29 = T23 + T28;
+			 T2q = FMA(KP2_000000000, T2p, T2e);
+			 ro[WS(os, 12)] = T29 - T2q;
+			 ro[WS(os, 1)] = T29 + T2q;
+			 {
+			      E T2v, T2C, T2P, T2Q;
+			      T2v = T2t - T2u;
+			      T2C = T2y - T2B;
+			      ro[WS(os, 10)] = T2v - T2C;
+			      ro[WS(os, 4)] = T2v + T2C;
+			      T2P = T28 - T23;
+			      T2Q = FMS(KP2_000000000, T2K, T2J);
+			      ro[WS(os, 5)] = T2P - T2Q;
+			      ro[WS(os, 8)] = T2P + T2Q;
+			 }
+			 T2N = T2F - T2G;
+			 T2O = T2L - T2I;
+			 ro[WS(os, 11)] = T2N - T2O;
+			 ro[WS(os, 6)] = T2N + T2O;
+			 {
+			      E T2H, T2M, T2D, T2E;
+			      T2H = T2F + T2G;
+			      T2M = T2I + T2L;
+			      ro[WS(os, 7)] = T2H - T2M;
+			      ro[WS(os, 2)] = T2H + T2M;
+			      T2D = T2t + T2u;
+			      T2E = T2y + T2B;
+			      ro[WS(os, 3)] = T2D - T2E;
+			      ro[WS(os, 9)] = T2D + T2E;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 13, "n1_13", {138, 30, 38, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_13) (planner *p) {
+     X(kdft_register) (p, n1_13, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include n.h */
+
+/*
+ * This function contains 148 FP additions, 84 FP multiplications,
+ * (or, 64 additions, 0 multiplications, 84 fused multiply/add),
+ * 80 stack variables, 6 constants, and 56 memory accesses
+ */
+#include "n.h"
+
+static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
+	       E Tp, T1L, T24, T1W, T1X, T28, T2a, T1Y, T29, T2b;
+	       {
+		    E T3, T1x, T1b, To, T1i, T1M, Ts, Ta, T1k, Tv, Th, T1j, T1K, Ty, TZ;
+		    E T14, Tz, T1Z, T27, T2c, T1d, TI, T23, T1G, T1D, TW, T1e, T22, T1A, TP;
+		    E T1c, T1n, T1s, T1f, T1P;
+		    {
+			 E T1, T2, T19, T1a;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 7)];
+			 T19 = ii[0];
+			 T1a = ii[WS(is, 7)];
+			 {
+			      E Tq, T6, Tr, T9, Te, Tx, Tn, Tw, Tk, Tf, Tb, Tc;
+			      {
+				   E Tl, Tm, Ti, Tj;
+				   {
+					E T4, T5, T7, T8;
+					T4 = ri[WS(is, 2)];
+					Tp = T1 + T2;
+					T3 = T1 - T2;
+					T1x = T19 + T1a;
+					T1b = T19 - T1a;
+					T5 = ri[WS(is, 9)];
+					T7 = ri[WS(is, 12)];
+					T8 = ri[WS(is, 5)];
+					Tl = ri[WS(is, 8)];
+					Tq = T4 + T5;
+					T6 = T4 - T5;
+					Tr = T7 + T8;
+					T9 = T7 - T8;
+					Tm = ri[WS(is, 1)];
+				   }
+				   Ti = ri[WS(is, 6)];
+				   Tj = ri[WS(is, 13)];
+				   Te = ri[WS(is, 10)];
+				   Tx = Tl + Tm;
+				   Tn = Tl - Tm;
+				   Tw = Ti + Tj;
+				   Tk = Ti - Tj;
+				   Tf = ri[WS(is, 3)];
+				   Tb = ri[WS(is, 4)];
+				   Tc = ri[WS(is, 11)];
+			      }
+			      {
+				   E Tu, Tg, Tt, Td;
+				   To = Tk + Tn;
+				   T1i = Tn - Tk;
+				   Tu = Te + Tf;
+				   Tg = Te - Tf;
+				   Tt = Tb + Tc;
+				   Td = Tb - Tc;
+				   T1M = Tr - Tq;
+				   Ts = Tq + Tr;
+				   Ta = T6 + T9;
+				   T1k = T9 - T6;
+				   T1L = Tt - Tu;
+				   Tv = Tt + Tu;
+				   Th = Td + Tg;
+				   T1j = Tg - Td;
+				   T1K = Tw - Tx;
+				   Ty = Tw + Tx;
+				   TZ = FNMS(KP356895867, Ta, To);
+				   T14 = FNMS(KP356895867, To, Th);
+				   Tz = FNMS(KP356895867, Th, Ta);
+				   T1Z = FNMS(KP356895867, Ts, Ty);
+			      }
+			 }
+			 {
+			      E T1B, TE, T1C, TH, T1F, TV, TJ, T1E, TS, T1z, TO, TK, T1y, TL;
+			      {
+				   E TF, TG, TT, TU, TC, TD;
+				   TC = ii[WS(is, 4)];
+				   TD = ii[WS(is, 11)];
+				   T27 = FNMS(KP356895867, Tv, Ts);
+				   T2c = FNMS(KP356895867, Ty, Tv);
+				   TF = ii[WS(is, 10)];
+				   T1B = TC + TD;
+				   TE = TC - TD;
+				   TG = ii[WS(is, 3)];
+				   TT = ii[WS(is, 8)];
+				   TU = ii[WS(is, 1)];
+				   {
+					E TQ, TR, TM, TN;
+					TQ = ii[WS(is, 6)];
+					T1C = TF + TG;
+					TH = TF - TG;
+					T1F = TT + TU;
+					TV = TT - TU;
+					TR = ii[WS(is, 13)];
+					TM = ii[WS(is, 12)];
+					TN = ii[WS(is, 5)];
+					TJ = ii[WS(is, 2)];
+					T1E = TQ + TR;
+					TS = TQ - TR;
+					T1z = TM + TN;
+					TO = TM - TN;
+					TK = ii[WS(is, 9)];
+				   }
+			      }
+			      T1d = TE + TH;
+			      TI = TE - TH;
+			      T23 = T1F - T1E;
+			      T1G = T1E + T1F;
+			      T1D = T1B + T1C;
+			      T24 = T1C - T1B;
+			      T1y = TJ + TK;
+			      TL = TJ - TK;
+			      TW = TS - TV;
+			      T1e = TS + TV;
+			      T22 = T1y - T1z;
+			      T1A = T1y + T1z;
+			      TP = TL - TO;
+			      T1c = TL + TO;
+			      T1n = FNMS(KP356895867, T1c, T1e);
+			      T1s = FNMS(KP356895867, T1d, T1c);
+			      T1f = FNMS(KP356895867, T1e, T1d);
+			      T1P = FNMS(KP356895867, T1A, T1G);
+			 }
+		    }
+		    {
+			 E T1U, T1H, T11, T12, T1o, T1q;
+			 ro[WS(os, 7)] = T3 + Ta + Th + To;
+			 io[WS(os, 7)] = T1b + T1c + T1d + T1e;
+			 T1U = FNMS(KP356895867, T1D, T1A);
+			 T1H = FNMS(KP356895867, T1G, T1D);
+			 ro[0] = Tp + Ts + Tv + Ty;
+			 io[0] = T1x + T1A + T1D + T1G;
+			 {
+			      E TB, TY, T1u, T1w, T10;
+			      {
+				   E TA, TX, T1t, T1v;
+				   TA = FNMS(KP692021471, Tz, To);
+				   TX = FMA(KP554958132, TW, TP);
+				   T1t = FNMS(KP692021471, T1s, T1e);
+				   T1v = FMA(KP554958132, T1i, T1k);
+				   TB = FNMS(KP900968867, TA, T3);
+				   TY = FMA(KP801937735, TX, TI);
+				   T1u = FNMS(KP900968867, T1t, T1b);
+				   T1w = FMA(KP801937735, T1v, T1j);
+			      }
+			      T10 = FNMS(KP692021471, TZ, Th);
+			      ro[WS(os, 1)] = FMA(KP974927912, TY, TB);
+			      ro[WS(os, 13)] = FNMS(KP974927912, TY, TB);
+			      io[WS(os, 13)] = FNMS(KP974927912, T1w, T1u);
+			      io[WS(os, 1)] = FMA(KP974927912, T1w, T1u);
+			      T11 = FNMS(KP900968867, T10, T3);
+			      T12 = FMA(KP554958132, TI, TW);
+			      T1o = FNMS(KP692021471, T1n, T1d);
+			      T1q = FMA(KP554958132, T1j, T1i);
+			 }
+			 {
+			      E T1J, T1N, T2d, T2f;
+			      {
+				   E T16, T17, T1g, T1l;
+				   {
+					E T13, T1p, T1r, T15;
+					T15 = FNMS(KP692021471, T14, Ta);
+					T13 = FNMS(KP801937735, T12, TP);
+					T1p = FNMS(KP900968867, T1o, T1b);
+					T1r = FNMS(KP801937735, T1q, T1k);
+					T16 = FNMS(KP900968867, T15, T3);
+					ro[WS(os, 9)] = FMA(KP974927912, T13, T11);
+					ro[WS(os, 5)] = FNMS(KP974927912, T13, T11);
+					io[WS(os, 9)] = FMA(KP974927912, T1r, T1p);
+					io[WS(os, 5)] = FNMS(KP974927912, T1r, T1p);
+					T17 = FNMS(KP554958132, TP, TI);
+				   }
+				   T1g = FNMS(KP692021471, T1f, T1c);
+				   T1l = FNMS(KP554958132, T1k, T1j);
+				   {
+					E T18, T1h, T1m, T1I;
+					T1I = FNMS(KP692021471, T1H, T1A);
+					T18 = FNMS(KP801937735, T17, TW);
+					T1h = FNMS(KP900968867, T1g, T1b);
+					T1m = FNMS(KP801937735, T1l, T1i);
+					T1J = FNMS(KP900968867, T1I, T1x);
+					ro[WS(os, 3)] = FMA(KP974927912, T18, T16);
+					ro[WS(os, 11)] = FNMS(KP974927912, T18, T16);
+					io[WS(os, 11)] = FNMS(KP974927912, T1m, T1h);
+					io[WS(os, 3)] = FMA(KP974927912, T1m, T1h);
+					T1N = FMA(KP554958132, T1M, T1L);
+				   }
+				   T2d = FNMS(KP692021471, T2c, Ts);
+				   T2f = FMA(KP554958132, T22, T24);
+			      }
+			      {
+				   E T1R, T1S, T20, T25;
+				   {
+					E T1O, T2e, T2g, T1Q;
+					T1Q = FNMS(KP692021471, T1P, T1D);
+					T1O = FNMS(KP801937735, T1N, T1K);
+					T2e = FNMS(KP900968867, T2d, Tp);
+					T2g = FNMS(KP801937735, T2f, T23);
+					T1R = FNMS(KP900968867, T1Q, T1x);
+					io[WS(os, 10)] = FNMS(KP974927912, T1O, T1J);
+					io[WS(os, 4)] = FMA(KP974927912, T1O, T1J);
+					ro[WS(os, 4)] = FMA(KP974927912, T2g, T2e);
+					ro[WS(os, 10)] = FNMS(KP974927912, T2g, T2e);
+					T1S = FMA(KP554958132, T1L, T1K);
+				   }
+				   T20 = FNMS(KP692021471, T1Z, Tv);
+				   T25 = FMA(KP554958132, T24, T23);
+				   {
+					E T1T, T21, T26, T1V;
+					T1V = FNMS(KP692021471, T1U, T1G);
+					T1T = FMA(KP801937735, T1S, T1M);
+					T21 = FNMS(KP900968867, T20, Tp);
+					T26 = FMA(KP801937735, T25, T22);
+					T1W = FNMS(KP900968867, T1V, T1x);
+					io[WS(os, 12)] = FNMS(KP974927912, T1T, T1R);
+					io[WS(os, 2)] = FMA(KP974927912, T1T, T1R);
+					ro[WS(os, 2)] = FMA(KP974927912, T26, T21);
+					ro[WS(os, 12)] = FNMS(KP974927912, T26, T21);
+					T1X = FNMS(KP554958132, T1K, T1M);
+				   }
+				   T28 = FNMS(KP692021471, T27, Ty);
+				   T2a = FNMS(KP554958132, T23, T22);
+			      }
+			 }
+		    }
+	       }
+	       T1Y = FNMS(KP801937735, T1X, T1L);
+	       T29 = FNMS(KP900968867, T28, Tp);
+	       T2b = FNMS(KP801937735, T2a, T24);
+	       io[WS(os, 8)] = FNMS(KP974927912, T1Y, T1W);
+	       io[WS(os, 6)] = FMA(KP974927912, T1Y, T1W);
+	       ro[WS(os, 6)] = FMA(KP974927912, T2b, T29);
+	       ro[WS(os, 8)] = FNMS(KP974927912, T2b, T29);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 14, "n1_14", {64, 0, 84, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_14) (planner *p) {
+     X(kdft_register) (p, n1_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include n.h */
+
+/*
+ * This function contains 148 FP additions, 72 FP multiplications,
+ * (or, 100 additions, 24 multiplications, 48 fused multiply/add),
+ * 43 stack variables, 6 constants, and 56 memory accesses
+ */
+#include "n.h"
+
+static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
+	       E T3, Tp, T16, T1f, Ta, T1q, Ts, T10, TG, T1z, T19, T1i, Th, T1s, Tv;
+	       E T12, TU, T1B, T17, T1o, To, T1r, Ty, T11, TN, T1A, T18, T1l;
+	       {
+		    E T1, T2, T14, T15;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 7)];
+		    T3 = T1 - T2;
+		    Tp = T1 + T2;
+		    T14 = ii[0];
+		    T15 = ii[WS(is, 7)];
+		    T16 = T14 - T15;
+		    T1f = T14 + T15;
+	       }
+	       {
+		    E T6, Tq, T9, Tr;
+		    {
+			 E T4, T5, T7, T8;
+			 T4 = ri[WS(is, 2)];
+			 T5 = ri[WS(is, 9)];
+			 T6 = T4 - T5;
+			 Tq = T4 + T5;
+			 T7 = ri[WS(is, 12)];
+			 T8 = ri[WS(is, 5)];
+			 T9 = T7 - T8;
+			 Tr = T7 + T8;
+		    }
+		    Ta = T6 + T9;
+		    T1q = Tr - Tq;
+		    Ts = Tq + Tr;
+		    T10 = T9 - T6;
+	       }
+	       {
+		    E TC, T1g, TF, T1h;
+		    {
+			 E TA, TB, TD, TE;
+			 TA = ii[WS(is, 2)];
+			 TB = ii[WS(is, 9)];
+			 TC = TA - TB;
+			 T1g = TA + TB;
+			 TD = ii[WS(is, 12)];
+			 TE = ii[WS(is, 5)];
+			 TF = TD - TE;
+			 T1h = TD + TE;
+		    }
+		    TG = TC - TF;
+		    T1z = T1g - T1h;
+		    T19 = TC + TF;
+		    T1i = T1g + T1h;
+	       }
+	       {
+		    E Td, Tt, Tg, Tu;
+		    {
+			 E Tb, Tc, Te, Tf;
+			 Tb = ri[WS(is, 4)];
+			 Tc = ri[WS(is, 11)];
+			 Td = Tb - Tc;
+			 Tt = Tb + Tc;
+			 Te = ri[WS(is, 10)];
+			 Tf = ri[WS(is, 3)];
+			 Tg = Te - Tf;
+			 Tu = Te + Tf;
+		    }
+		    Th = Td + Tg;
+		    T1s = Tt - Tu;
+		    Tv = Tt + Tu;
+		    T12 = Tg - Td;
+	       }
+	       {
+		    E TQ, T1m, TT, T1n;
+		    {
+			 E TO, TP, TR, TS;
+			 TO = ii[WS(is, 4)];
+			 TP = ii[WS(is, 11)];
+			 TQ = TO - TP;
+			 T1m = TO + TP;
+			 TR = ii[WS(is, 10)];
+			 TS = ii[WS(is, 3)];
+			 TT = TR - TS;
+			 T1n = TR + TS;
+		    }
+		    TU = TQ - TT;
+		    T1B = T1n - T1m;
+		    T17 = TQ + TT;
+		    T1o = T1m + T1n;
+	       }
+	       {
+		    E Tk, Tw, Tn, Tx;
+		    {
+			 E Ti, Tj, Tl, Tm;
+			 Ti = ri[WS(is, 6)];
+			 Tj = ri[WS(is, 13)];
+			 Tk = Ti - Tj;
+			 Tw = Ti + Tj;
+			 Tl = ri[WS(is, 8)];
+			 Tm = ri[WS(is, 1)];
+			 Tn = Tl - Tm;
+			 Tx = Tl + Tm;
+		    }
+		    To = Tk + Tn;
+		    T1r = Tw - Tx;
+		    Ty = Tw + Tx;
+		    T11 = Tn - Tk;
+	       }
+	       {
+		    E TJ, T1j, TM, T1k;
+		    {
+			 E TH, TI, TK, TL;
+			 TH = ii[WS(is, 6)];
+			 TI = ii[WS(is, 13)];
+			 TJ = TH - TI;
+			 T1j = TH + TI;
+			 TK = ii[WS(is, 8)];
+			 TL = ii[WS(is, 1)];
+			 TM = TK - TL;
+			 T1k = TK + TL;
+		    }
+		    TN = TJ - TM;
+		    T1A = T1k - T1j;
+		    T18 = TJ + TM;
+		    T1l = T1j + T1k;
+	       }
+	       ro[WS(os, 7)] = T3 + Ta + Th + To;
+	       io[WS(os, 7)] = T16 + T19 + T17 + T18;
+	       ro[0] = Tp + Ts + Tv + Ty;
+	       io[0] = T1f + T1i + T1o + T1l;
+	       {
+		    E TV, Tz, T1e, T1d;
+		    TV = FNMS(KP781831482, TN, KP974927912 * TG) - (KP433883739 * TU);
+		    Tz = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
+		    ro[WS(os, 5)] = Tz - TV;
+		    ro[WS(os, 9)] = Tz + TV;
+		    T1e = FNMS(KP781831482, T11, KP974927912 * T10) - (KP433883739 * T12);
+		    T1d = FMA(KP623489801, T18, T16) + FNMA(KP900968867, T17, KP222520933 * T19);
+		    io[WS(os, 5)] = T1d - T1e;
+		    io[WS(os, 9)] = T1e + T1d;
+	       }
+	       {
+		    E TX, TW, T1b, T1c;
+		    TX = FMA(KP781831482, TG, KP974927912 * TU) + (KP433883739 * TN);
+		    TW = FMA(KP623489801, Ta, T3) + FNMA(KP900968867, To, KP222520933 * Th);
+		    ro[WS(os, 13)] = TW - TX;
+		    ro[WS(os, 1)] = TW + TX;
+		    T1b = FMA(KP781831482, T10, KP974927912 * T12) + (KP433883739 * T11);
+		    T1c = FMA(KP623489801, T19, T16) + FNMA(KP900968867, T18, KP222520933 * T17);
+		    io[WS(os, 1)] = T1b + T1c;
+		    io[WS(os, 13)] = T1c - T1b;
+	       }
+	       {
+		    E TZ, TY, T13, T1a;
+		    TZ = FMA(KP433883739, TG, KP974927912 * TN) - (KP781831482 * TU);
+		    TY = FMA(KP623489801, Th, T3) + FNMA(KP222520933, To, KP900968867 * Ta);
+		    ro[WS(os, 11)] = TY - TZ;
+		    ro[WS(os, 3)] = TY + TZ;
+		    T13 = FMA(KP433883739, T10, KP974927912 * T11) - (KP781831482 * T12);
+		    T1a = FMA(KP623489801, T17, T16) + FNMA(KP222520933, T18, KP900968867 * T19);
+		    io[WS(os, 3)] = T13 + T1a;
+		    io[WS(os, 11)] = T1a - T13;
+	       }
+	       {
+		    E T1t, T1p, T1C, T1y;
+		    T1t = FNMS(KP433883739, T1r, KP781831482 * T1q) - (KP974927912 * T1s);
+		    T1p = FMA(KP623489801, T1i, T1f) + FNMA(KP900968867, T1l, KP222520933 * T1o);
+		    io[WS(os, 6)] = T1p - T1t;
+		    io[WS(os, 8)] = T1t + T1p;
+		    T1C = FNMS(KP433883739, T1A, KP781831482 * T1z) - (KP974927912 * T1B);
+		    T1y = FMA(KP623489801, Ts, Tp) + FNMA(KP900968867, Ty, KP222520933 * Tv);
+		    ro[WS(os, 6)] = T1y - T1C;
+		    ro[WS(os, 8)] = T1y + T1C;
+	       }
+	       {
+		    E T1v, T1u, T1E, T1D;
+		    T1v = FMA(KP433883739, T1q, KP781831482 * T1s) - (KP974927912 * T1r);
+		    T1u = FMA(KP623489801, T1o, T1f) + FNMA(KP222520933, T1l, KP900968867 * T1i);
+		    io[WS(os, 4)] = T1u - T1v;
+		    io[WS(os, 10)] = T1v + T1u;
+		    T1E = FMA(KP433883739, T1z, KP781831482 * T1B) - (KP974927912 * T1A);
+		    T1D = FMA(KP623489801, Tv, Tp) + FNMA(KP222520933, Ty, KP900968867 * Ts);
+		    ro[WS(os, 4)] = T1D - T1E;
+		    ro[WS(os, 10)] = T1D + T1E;
+	       }
+	       {
+		    E T1w, T1x, T1G, T1F;
+		    T1w = FMA(KP974927912, T1q, KP433883739 * T1s) + (KP781831482 * T1r);
+		    T1x = FMA(KP623489801, T1l, T1f) + FNMA(KP900968867, T1o, KP222520933 * T1i);
+		    io[WS(os, 2)] = T1w + T1x;
+		    io[WS(os, 12)] = T1x - T1w;
+		    T1G = FMA(KP974927912, T1z, KP433883739 * T1B) + (KP781831482 * T1A);
+		    T1F = FMA(KP623489801, Ty, Tp) + FNMA(KP900968867, Tv, KP222520933 * Ts);
+		    ro[WS(os, 12)] = T1F - T1G;
+		    ro[WS(os, 2)] = T1F + T1G;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 14, "n1_14", {100, 24, 48, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_14) (planner *p) {
+     X(kdft_register) (p, n1_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:43 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include n.h */
+
+/*
+ * This function contains 156 FP additions, 84 FP multiplications,
+ * (or, 72 additions, 0 multiplications, 84 fused multiply/add),
+ * 75 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "n.h"
+
+static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
+	       E T1r, T1g, T14, T13;
+	       {
+		    E T5, T2l, Tx, TV, T1z, T1X, T2s, Tr, T24, TT, T2e, T2n, T1Z, T1Q, T1B;
+		    E T11, T1H, TW, T2t, Tg, TX, T25, TI, T2h, T2m, T1Y, T1T, T1A;
+		    {
+			 E T1, T1v, T2, T3, Tu, Tv, TZ, T10;
+			 T1 = ri[0];
+			 T1v = ii[0];
+			 T2 = ri[WS(is, 5)];
+			 T3 = ri[WS(is, 10)];
+			 Tu = ii[WS(is, 5)];
+			 Tv = ii[WS(is, 10)];
+			 {
+			      E T1k, Tm, TM, TJ, Tl, T2c, T1j, T1m, TP, T1p, Tp, TQ;
+			      {
+				   E Th, T1h, TK, TL, Tk, Tn, To, T1i;
+				   {
+					E Ti, Tj, T1y, T4;
+					Th = ri[WS(is, 6)];
+					T1y = T3 - T2;
+					T4 = T2 + T3;
+					{
+					     E T1w, Tw, Tt, T1x;
+					     T1w = Tu + Tv;
+					     Tw = Tu - Tv;
+					     Ti = ri[WS(is, 11)];
+					     T5 = T1 + T4;
+					     Tt = FNMS(KP500000000, T4, T1);
+					     T2l = T1v + T1w;
+					     T1x = FNMS(KP500000000, T1w, T1v);
+					     Tx = FNMS(KP866025403, Tw, Tt);
+					     TV = FMA(KP866025403, Tw, Tt);
+					     T1z = FMA(KP866025403, T1y, T1x);
+					     T1X = FNMS(KP866025403, T1y, T1x);
+					     Tj = ri[WS(is, 1)];
+					}
+					T1h = ii[WS(is, 6)];
+					TK = ii[WS(is, 11)];
+					TL = ii[WS(is, 1)];
+					Tk = Ti + Tj;
+					T1k = Tj - Ti;
+				   }
+				   Tm = ri[WS(is, 9)];
+				   TM = TK - TL;
+				   T1i = TK + TL;
+				   TJ = FNMS(KP500000000, Tk, Th);
+				   Tl = Th + Tk;
+				   Tn = ri[WS(is, 14)];
+				   To = ri[WS(is, 4)];
+				   T2c = T1h + T1i;
+				   T1j = FNMS(KP500000000, T1i, T1h);
+				   T1m = ii[WS(is, 9)];
+				   TP = ii[WS(is, 14)];
+				   T1p = To - Tn;
+				   Tp = Tn + To;
+				   TQ = ii[WS(is, 4)];
+			      }
+			      {
+				   E TN, TS, T1o, T2d;
+				   {
+					E TO, T1n, TR, Tq;
+					TN = FNMS(KP866025403, TM, TJ);
+					TZ = FMA(KP866025403, TM, TJ);
+					TO = FNMS(KP500000000, Tp, Tm);
+					Tq = Tm + Tp;
+					T1n = TP + TQ;
+					TR = TP - TQ;
+					T2s = Tl - Tq;
+					Tr = Tl + Tq;
+					T10 = FMA(KP866025403, TR, TO);
+					TS = FNMS(KP866025403, TR, TO);
+					T1o = FNMS(KP500000000, T1n, T1m);
+					T2d = T1m + T1n;
+				   }
+				   {
+					E T1O, T1l, T1P, T1q;
+					T1O = FNMS(KP866025403, T1k, T1j);
+					T1l = FMA(KP866025403, T1k, T1j);
+					T24 = TN - TS;
+					TT = TN + TS;
+					T1P = FNMS(KP866025403, T1p, T1o);
+					T1q = FMA(KP866025403, T1p, T1o);
+					T2e = T2c - T2d;
+					T2n = T2c + T2d;
+					T1Z = T1O + T1P;
+					T1Q = T1O - T1P;
+					T1r = T1l - T1q;
+					T1B = T1l + T1q;
+				   }
+			      }
+			 }
+			 {
+			      E T19, Tb, TB, Ty, Ta, T2f, T18, T1b, TE, T1e, Te, TF;
+			      {
+				   E T6, T16, Tz, TA, T9, T7, T8, Tc, Td, T17;
+				   T6 = ri[WS(is, 3)];
+				   T7 = ri[WS(is, 8)];
+				   T11 = TZ + T10;
+				   T1H = TZ - T10;
+				   T8 = ri[WS(is, 13)];
+				   T16 = ii[WS(is, 3)];
+				   Tz = ii[WS(is, 8)];
+				   TA = ii[WS(is, 13)];
+				   T9 = T7 + T8;
+				   T19 = T8 - T7;
+				   Tb = ri[WS(is, 12)];
+				   TB = Tz - TA;
+				   T17 = Tz + TA;
+				   Ty = FNMS(KP500000000, T9, T6);
+				   Ta = T6 + T9;
+				   Tc = ri[WS(is, 2)];
+				   Td = ri[WS(is, 7)];
+				   T2f = T16 + T17;
+				   T18 = FNMS(KP500000000, T17, T16);
+				   T1b = ii[WS(is, 12)];
+				   TE = ii[WS(is, 2)];
+				   T1e = Td - Tc;
+				   Te = Tc + Td;
+				   TF = ii[WS(is, 7)];
+			      }
+			      {
+				   E TC, TH, T1d, T2g;
+				   {
+					E TD, T1c, TG, Tf;
+					TC = FNMS(KP866025403, TB, Ty);
+					TW = FMA(KP866025403, TB, Ty);
+					TD = FNMS(KP500000000, Te, Tb);
+					Tf = Tb + Te;
+					T1c = TE + TF;
+					TG = TE - TF;
+					T2t = Ta - Tf;
+					Tg = Ta + Tf;
+					TX = FMA(KP866025403, TG, TD);
+					TH = FNMS(KP866025403, TG, TD);
+					T1d = FNMS(KP500000000, T1c, T1b);
+					T2g = T1b + T1c;
+				   }
+				   {
+					E T1R, T1a, T1S, T1f;
+					T1R = FNMS(KP866025403, T19, T18);
+					T1a = FMA(KP866025403, T19, T18);
+					T25 = TC - TH;
+					TI = TC + TH;
+					T1S = FNMS(KP866025403, T1e, T1d);
+					T1f = FMA(KP866025403, T1e, T1d);
+					T2h = T2f - T2g;
+					T2m = T2f + T2g;
+					T1Y = T1R + T1S;
+					T1T = T1R - T1S;
+					T1g = T1a - T1f;
+					T1A = T1a + T1f;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E TY, T1G, T1M, T1L, T2a, T29, Ts, T22, T21, T20;
+			 T2a = Tg - Tr;
+			 Ts = Tg + Tr;
+			 TY = TW + TX;
+			 T1G = TW - TX;
+			 T29 = FNMS(KP250000000, Ts, T5);
+			 ro[0] = T5 + Ts;
+			 {
+			      E T2q, T2p, T2o, TU;
+			      T2o = T2m + T2n;
+			      T2q = T2m - T2n;
+			      {
+				   E T2k, T2i, T2b, T2j;
+				   T2k = FMA(KP618033988, T2e, T2h);
+				   T2i = FNMS(KP618033988, T2h, T2e);
+				   T2b = FNMS(KP559016994, T2a, T29);
+				   T2j = FMA(KP559016994, T2a, T29);
+				   ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
+				   ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
+				   ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
+				   ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
+				   T2p = FNMS(KP250000000, T2o, T2l);
+			      }
+			      io[0] = T2l + T2o;
+			      TU = TI + TT;
+			      T1M = TI - TT;
+			      {
+				   E T2r, T2v, T2w, T2u;
+				   T2r = FNMS(KP559016994, T2q, T2p);
+				   T2v = FMA(KP559016994, T2q, T2p);
+				   T2w = FMA(KP618033988, T2s, T2t);
+				   T2u = FNMS(KP618033988, T2t, T2s);
+				   io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
+				   io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
+				   io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
+				   io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
+				   T1L = FNMS(KP250000000, TU, Tx);
+			      }
+			      ro[WS(os, 5)] = Tx + TU;
+			 }
+			 T20 = T1Y + T1Z;
+			 T22 = T1Y - T1Z;
+			 {
+			      E T1N, T1V, T1W, T1U;
+			      T1N = FNMS(KP559016994, T1M, T1L);
+			      T1V = FMA(KP559016994, T1M, T1L);
+			      T1W = FMA(KP618033988, T1Q, T1T);
+			      T1U = FNMS(KP618033988, T1T, T1Q);
+			      ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
+			      ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
+			      ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
+			      ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
+			      T21 = FNMS(KP250000000, T20, T1X);
+			 }
+			 io[WS(os, 5)] = T1X + T20;
+			 {
+			      E T1E, T1D, T1C, T12;
+			      T1C = T1A + T1B;
+			      T1E = T1A - T1B;
+			      {
+				   E T23, T27, T28, T26;
+				   T23 = FNMS(KP559016994, T22, T21);
+				   T27 = FMA(KP559016994, T22, T21);
+				   T28 = FMA(KP618033988, T24, T25);
+				   T26 = FNMS(KP618033988, T25, T24);
+				   io[WS(os, 14)] = FMA(KP951056516, T28, T27);
+				   io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
+				   io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
+				   io[WS(os, 2)] = FMA(KP951056516, T26, T23);
+				   T1D = FNMS(KP250000000, T1C, T1z);
+			      }
+			      io[WS(os, 10)] = T1z + T1C;
+			      T12 = TY + T11;
+			      T14 = TY - T11;
+			      {
+				   E T1F, T1J, T1K, T1I;
+				   T1F = FMA(KP559016994, T1E, T1D);
+				   T1J = FNMS(KP559016994, T1E, T1D);
+				   T1K = FNMS(KP618033988, T1G, T1H);
+				   T1I = FMA(KP618033988, T1H, T1G);
+				   io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
+				   io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
+				   io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
+				   io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
+				   T13 = FNMS(KP250000000, T12, TV);
+			      }
+			      ro[WS(os, 10)] = TV + T12;
+			 }
+		    }
+	       }
+	       {
+		    E T1t, T15, T1s, T1u;
+		    T1t = FNMS(KP559016994, T14, T13);
+		    T15 = FMA(KP559016994, T14, T13);
+		    T1s = FMA(KP618033988, T1r, T1g);
+		    T1u = FNMS(KP618033988, T1g, T1r);
+		    ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
+		    ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
+		    ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
+		    ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 15, "n1_15", {72, 0, 84, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_15) (planner *p) {
+     X(kdft_register) (p, n1_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include n.h */
+
+/*
+ * This function contains 156 FP additions, 56 FP multiplications,
+ * (or, 128 additions, 28 multiplications, 28 fused multiply/add),
+ * 69 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "n.h"
+
+static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
+	       E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
+	       E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
+	       E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
+	       {
+		    E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
+		    T1 = ri[0];
+		    T1z = ii[0];
+		    {
+			 E T2, T3, Tu, Tv;
+			 T2 = ri[WS(is, 5)];
+			 T3 = ri[WS(is, 10)];
+			 T4 = T2 + T3;
+			 T1y = KP866025403 * (T3 - T2);
+			 Tu = ii[WS(is, 5)];
+			 Tv = ii[WS(is, 10)];
+			 Tw = KP866025403 * (Tu - Tv);
+			 T1A = Tu + Tv;
+		    }
+		    T5 = T1 + T4;
+		    T2l = T1z + T1A;
+		    Tt = FNMS(KP500000000, T4, T1);
+		    Tx = Tt - Tw;
+		    TV = Tt + Tw;
+		    T1B = FNMS(KP500000000, T1A, T1z);
+		    T1C = T1y + T1B;
+		    T20 = T1B - T1y;
+	       }
+	       {
+		    E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
+		    E T1p;
+		    {
+			 E Ti, Tj, TK, TL;
+			 Th = ri[WS(is, 6)];
+			 Ti = ri[WS(is, 11)];
+			 Tj = ri[WS(is, 1)];
+			 Tk = Ti + Tj;
+			 TJ = FNMS(KP500000000, Tk, Th);
+			 T1h = KP866025403 * (Tj - Ti);
+			 T1i = ii[WS(is, 6)];
+			 TK = ii[WS(is, 11)];
+			 TL = ii[WS(is, 1)];
+			 T1j = TK + TL;
+			 TM = KP866025403 * (TK - TL);
+			 T1k = FNMS(KP500000000, T1j, T1i);
+		    }
+		    {
+			 E Tn, To, TP, TQ;
+			 Tm = ri[WS(is, 9)];
+			 Tn = ri[WS(is, 14)];
+			 To = ri[WS(is, 4)];
+			 Tp = Tn + To;
+			 TO = FNMS(KP500000000, Tp, Tm);
+			 T1m = KP866025403 * (To - Tn);
+			 T1n = ii[WS(is, 9)];
+			 TP = ii[WS(is, 14)];
+			 TQ = ii[WS(is, 4)];
+			 T1o = TP + TQ;
+			 TR = KP866025403 * (TP - TQ);
+			 T1p = FNMS(KP500000000, T1o, T1n);
+		    }
+		    Tl = Th + Tk;
+		    Tq = Tm + Tp;
+		    Tr = Tl + Tq;
+		    TN = TJ - TM;
+		    TS = TO - TR;
+		    TT = TN + TS;
+		    T2c = T1i + T1j;
+		    T2d = T1n + T1o;
+		    T2n = T2c + T2d;
+		    T1O = T1k - T1h;
+		    T1P = T1p - T1m;
+		    T22 = T1O + T1P;
+		    T1l = T1h + T1k;
+		    T1q = T1m + T1p;
+		    T1w = T1l + T1q;
+		    TZ = TJ + TM;
+		    T10 = TO + TR;
+		    T11 = TZ + T10;
+	       }
+	       {
+		    E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
+		    E T1e;
+		    {
+			 E T7, T8, Tz, TA;
+			 T6 = ri[WS(is, 3)];
+			 T7 = ri[WS(is, 8)];
+			 T8 = ri[WS(is, 13)];
+			 T9 = T7 + T8;
+			 Ty = FNMS(KP500000000, T9, T6);
+			 T16 = KP866025403 * (T8 - T7);
+			 T17 = ii[WS(is, 3)];
+			 Tz = ii[WS(is, 8)];
+			 TA = ii[WS(is, 13)];
+			 T18 = Tz + TA;
+			 TB = KP866025403 * (Tz - TA);
+			 T19 = FNMS(KP500000000, T18, T17);
+		    }
+		    {
+			 E Tc, Td, TE, TF;
+			 Tb = ri[WS(is, 12)];
+			 Tc = ri[WS(is, 2)];
+			 Td = ri[WS(is, 7)];
+			 Te = Tc + Td;
+			 TD = FNMS(KP500000000, Te, Tb);
+			 T1b = KP866025403 * (Td - Tc);
+			 T1c = ii[WS(is, 12)];
+			 TE = ii[WS(is, 2)];
+			 TF = ii[WS(is, 7)];
+			 T1d = TE + TF;
+			 TG = KP866025403 * (TE - TF);
+			 T1e = FNMS(KP500000000, T1d, T1c);
+		    }
+		    Ta = T6 + T9;
+		    Tf = Tb + Te;
+		    Tg = Ta + Tf;
+		    TC = Ty - TB;
+		    TH = TD - TG;
+		    TI = TC + TH;
+		    T2f = T17 + T18;
+		    T2g = T1c + T1d;
+		    T2m = T2f + T2g;
+		    T1R = T19 - T16;
+		    T1S = T1e - T1b;
+		    T21 = T1R + T1S;
+		    T1a = T16 + T19;
+		    T1f = T1b + T1e;
+		    T1v = T1a + T1f;
+		    TW = Ty + TB;
+		    TX = TD + TG;
+		    TY = TW + TX;
+	       }
+	       {
+		    E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
+		    T2a = KP559016994 * (Tg - Tr);
+		    Ts = Tg + Tr;
+		    T29 = FNMS(KP250000000, Ts, T5);
+		    T2e = T2c - T2d;
+		    T2h = T2f - T2g;
+		    T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
+		    T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
+		    ro[0] = T5 + Ts;
+		    T2j = T2a + T29;
+		    ro[WS(os, 9)] = T2j - T2k;
+		    ro[WS(os, 6)] = T2j + T2k;
+		    T2b = T29 - T2a;
+		    ro[WS(os, 12)] = T2b - T2i;
+		    ro[WS(os, 3)] = T2b + T2i;
+	       }
+	       {
+		    E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
+		    T2q = KP559016994 * (T2m - T2n);
+		    T2o = T2m + T2n;
+		    T2p = FNMS(KP250000000, T2o, T2l);
+		    T2s = Tl - Tq;
+		    T2t = Ta - Tf;
+		    T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
+		    T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
+		    io[0] = T2l + T2o;
+		    T2v = T2q + T2p;
+		    io[WS(os, 6)] = T2v - T2w;
+		    io[WS(os, 9)] = T2w + T2v;
+		    T2r = T2p - T2q;
+		    io[WS(os, 3)] = T2r - T2u;
+		    io[WS(os, 12)] = T2u + T2r;
+	       }
+	       {
+		    E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
+		    T1M = KP559016994 * (TI - TT);
+		    TU = TI + TT;
+		    T1L = FNMS(KP250000000, TU, Tx);
+		    T1Q = T1O - T1P;
+		    T1T = T1R - T1S;
+		    T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
+		    T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
+		    ro[WS(os, 5)] = Tx + TU;
+		    T1V = T1M + T1L;
+		    ro[WS(os, 14)] = T1V - T1W;
+		    ro[WS(os, 11)] = T1V + T1W;
+		    T1N = T1L - T1M;
+		    ro[WS(os, 2)] = T1N - T1U;
+		    ro[WS(os, 8)] = T1N + T1U;
+	       }
+	       {
+		    E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
+		    T25 = KP559016994 * (T21 - T22);
+		    T23 = T21 + T22;
+		    T24 = FNMS(KP250000000, T23, T20);
+		    T1X = TN - TS;
+		    T1Y = TC - TH;
+		    T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
+		    T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
+		    io[WS(os, 5)] = T20 + T23;
+		    T27 = T25 + T24;
+		    io[WS(os, 11)] = T27 - T28;
+		    io[WS(os, 14)] = T28 + T27;
+		    T26 = T24 - T25;
+		    io[WS(os, 2)] = T1Z + T26;
+		    io[WS(os, 8)] = T26 - T1Z;
+	       }
+	       {
+		    E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
+		    T1x = KP559016994 * (T1v - T1w);
+		    T1D = T1v + T1w;
+		    T1E = FNMS(KP250000000, T1D, T1C);
+		    T1G = TW - TX;
+		    T1H = TZ - T10;
+		    T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
+		    T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
+		    io[WS(os, 10)] = T1C + T1D;
+		    T1K = T1E - T1x;
+		    io[WS(os, 7)] = T1J + T1K;
+		    io[WS(os, 13)] = T1K - T1J;
+		    T1F = T1x + T1E;
+		    io[WS(os, 1)] = T1F - T1I;
+		    io[WS(os, 4)] = T1I + T1F;
+	       }
+	       {
+		    E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
+		    T13 = KP559016994 * (TY - T11);
+		    T12 = TY + T11;
+		    T14 = FNMS(KP250000000, T12, TV);
+		    T1g = T1a - T1f;
+		    T1r = T1l - T1q;
+		    T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
+		    T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
+		    ro[WS(os, 10)] = TV + T12;
+		    T1t = T14 - T13;
+		    ro[WS(os, 7)] = T1t - T1u;
+		    ro[WS(os, 13)] = T1t + T1u;
+		    T15 = T13 + T14;
+		    ro[WS(os, 4)] = T15 - T1s;
+		    ro[WS(os, 1)] = T15 + T1s;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 15, "n1_15", {128, 28, 28, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_15) (planner *p) {
+     X(kdft_register) (p, n1_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:44 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include n.h */
+
+/*
+ * This function contains 144 FP additions, 40 FP multiplications,
+ * (or, 104 additions, 0 multiplications, 40 fused multiply/add),
+ * 82 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "n.h"
+
+static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       E T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q;
+	       {
+		    E T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D;
+		    E T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, Tj, T11, Ti, T1V, TZ, Tk;
+		    E T12, T13;
+		    {
+			 E Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e;
+			 {
+			      E T4, TL, T3, T1k, Ty, T5, Tz, TA;
+			      {
+				   E T1, T2, Tw, Tx;
+				   T1 = ri[0];
+				   T2 = ri[WS(is, 8)];
+				   Tw = ii[0];
+				   Tx = ii[WS(is, 8)];
+				   T4 = ri[WS(is, 4)];
+				   TL = T1 - T2;
+				   T3 = T1 + T2;
+				   T1k = Tw - Tx;
+				   Ty = Tw + Tx;
+				   T5 = ri[WS(is, 12)];
+				   Tz = ii[WS(is, 4)];
+				   TA = ii[WS(is, 12)];
+			      }
+			      {
+				   E Tn, To, T18, T19;
+				   Tn = ri[WS(is, 15)];
+				   {
+					E T1j, T6, TM, TB;
+					T1j = T4 - T5;
+					T6 = T4 + T5;
+					TM = Tz - TA;
+					TB = Tz + TA;
+					T1l = T1j + T1k;
+					T1H = T1k - T1j;
+					T1R = T3 - T6;
+					T7 = T3 + T6;
+					T1x = TL + TM;
+					TN = TL - TM;
+					TC = Ty + TB;
+					T25 = Ty - TB;
+					To = ri[WS(is, 7)];
+				   }
+				   T18 = ii[WS(is, 15)];
+				   T19 = ii[WS(is, 7)];
+				   Tq = ri[WS(is, 3)];
+				   T1c = Tn - To;
+				   Tp = Tn + To;
+				   T20 = T18 + T19;
+				   T1a = T18 - T19;
+				   Tr = ri[WS(is, 11)];
+				   T1d = ii[WS(is, 3)];
+				   T1e = ii[WS(is, 11)];
+			      }
+			 }
+			 {
+			      E Tb, TP, Ta, TO, TF, Tc, TG, TH;
+			      {
+				   E T8, T9, TD, TE;
+				   T8 = ri[WS(is, 2)];
+				   {
+					E T17, Ts, T21, T1f;
+					T17 = Tq - Tr;
+					Ts = Tq + Tr;
+					T21 = T1d + T1e;
+					T1f = T1d - T1e;
+					T1E = T1a - T17;
+					T1b = T17 + T1a;
+					T1Z = Tp - Ts;
+					Tt = Tp + Ts;
+					T2h = T20 + T21;
+					T22 = T20 - T21;
+					T1D = T1c + T1f;
+					T1g = T1c - T1f;
+					T9 = ri[WS(is, 10)];
+				   }
+				   TD = ii[WS(is, 2)];
+				   TE = ii[WS(is, 10)];
+				   Tb = ri[WS(is, 14)];
+				   TP = T8 - T9;
+				   Ta = T8 + T9;
+				   TO = TD - TE;
+				   TF = TD + TE;
+				   Tc = ri[WS(is, 6)];
+				   TG = ii[WS(is, 14)];
+				   TH = ii[WS(is, 6)];
+			      }
+			      {
+				   E TR, Td, TS, TI;
+				   T1n = TP + TO;
+				   TQ = TO - TP;
+				   TR = Tb - Tc;
+				   Td = Tb + Tc;
+				   TS = TG - TH;
+				   TI = TG + TH;
+				   Te = Ta + Td;
+				   T26 = Td - Ta;
+				   TT = TR + TS;
+				   T1m = TR - TS;
+				   TJ = TF + TI;
+				   T1S = TF - TI;
+			      }
+			 }
+			 {
+			      E Tg, Th, TX, TY;
+			      Tg = ri[WS(is, 1)];
+			      Th = ri[WS(is, 9)];
+			      TX = ii[WS(is, 1)];
+			      TY = ii[WS(is, 9)];
+			      Tj = ri[WS(is, 5)];
+			      T11 = Tg - Th;
+			      Ti = Tg + Th;
+			      T1V = TX + TY;
+			      TZ = TX - TY;
+			      Tk = ri[WS(is, 13)];
+			      T12 = ii[WS(is, 5)];
+			      T13 = ii[WS(is, 13)];
+			 }
+		    }
+		    {
+			 E T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i;
+			 {
+			      E Tf, Tu, T2j, T2k, T2g;
+			      T2f = T7 - Te;
+			      Tf = T7 + Te;
+			      {
+				   E TW, Tl, T1W, T14, Tm;
+				   TW = Tj - Tk;
+				   Tl = Tj + Tk;
+				   T1W = T12 + T13;
+				   T14 = T12 - T13;
+				   T1B = TZ - TW;
+				   T10 = TW + TZ;
+				   T1U = Ti - Tl;
+				   Tm = Ti + Tl;
+				   T2g = T1V + T1W;
+				   T1X = T1V - T1W;
+				   T1A = T11 + T14;
+				   T15 = T11 - T14;
+				   Tu = Tm + Tt;
+				   Tv = Tt - Tm;
+			      }
+			      TK = TC - TJ;
+			      T2j = TC + TJ;
+			      T2k = T2g + T2h;
+			      T2i = T2g - T2h;
+			      ro[0] = Tf + Tu;
+			      ro[WS(os, 8)] = Tf - Tu;
+			      io[0] = T2j + T2k;
+			      io[WS(os, 8)] = T2j - T2k;
+			 }
+			 {
+			      E T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, T23;
+			      T29 = T1R - T1S;
+			      T1T = T1R + T1S;
+			      io[WS(os, 12)] = TK - Tv;
+			      io[WS(os, 4)] = Tv + TK;
+			      ro[WS(os, 4)] = T2f + T2i;
+			      ro[WS(os, 12)] = T2f - T2i;
+			      T27 = T25 - T26;
+			      T2d = T26 + T25;
+			      T2a = T1X - T1U;
+			      T1Y = T1U + T1X;
+			      T23 = T1Z - T22;
+			      T2b = T1Z + T22;
+			      T28 = T23 - T1Y;
+			      T24 = T1Y + T23;
+			      {
+				   E T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, T1i;
+				   {
+					E T1o, T2e, T2c, TU, T16, T1h;
+					T1I = TQ + TT;
+					TU = TQ - TT;
+					io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
+					io[WS(os, 6)] = FMA(KP707106781, T28, T27);
+					ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
+					ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
+					T2e = T2a + T2b;
+					T2c = T2a - T2b;
+					TV = FMA(KP707106781, TU, TN);
+					T1v = FNMS(KP707106781, TU, TN);
+					io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
+					io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
+					ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
+					ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
+					T1o = T1m - T1n;
+					T1y = T1n + T1m;
+					T1t = FNMS(KP414213562, T10, T15);
+					T16 = FMA(KP414213562, T15, T10);
+					T1h = FNMS(KP414213562, T1g, T1b);
+					T1s = FMA(KP414213562, T1b, T1g);
+					T1r = FMA(KP707106781, T1o, T1l);
+					T1p = FNMS(KP707106781, T1o, T1l);
+					T1q = T16 + T1h;
+					T1i = T16 - T1h;
+				   }
+				   {
+					E T1w, T1u, T1C, T1F;
+					io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
+					io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
+					ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
+					ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
+					T1w = T1t + T1s;
+					T1u = T1s - T1t;
+					T1z = FMA(KP707106781, T1y, T1x);
+					T1L = FNMS(KP707106781, T1y, T1x);
+					ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
+					ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
+					io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
+					io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
+					T1M = FNMS(KP414213562, T1A, T1B);
+					T1C = FMA(KP414213562, T1B, T1A);
+					T1F = FNMS(KP414213562, T1E, T1D);
+					T1N = FMA(KP414213562, T1D, T1E);
+					T1P = FMA(KP707106781, T1I, T1H);
+					T1J = FNMS(KP707106781, T1I, T1H);
+					T1K = T1F - T1C;
+					T1G = T1C + T1F;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
+	       io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
+	       ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
+	       ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
+	       T1O = T1M - T1N;
+	       T1Q = T1M + T1N;
+	       io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
+	       io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
+	       ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
+	       ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 16, "n1_16", {104, 0, 40, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_16) (planner *p) {
+     X(kdft_register) (p, n1_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include n.h */
+
+/*
+ * This function contains 144 FP additions, 24 FP multiplications,
+ * (or, 136 additions, 16 multiplications, 8 fused multiply/add),
+ * 50 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "n.h"
+
+static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
+	       E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
+	       E T1U, T1A;
+	       {
+		    E T3, TL, Ty, T1k, T6, T1j, TB, TM;
+		    {
+			 E T1, T2, Tw, Tx;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 8)];
+			 T3 = T1 + T2;
+			 TL = T1 - T2;
+			 Tw = ii[0];
+			 Tx = ii[WS(is, 8)];
+			 Ty = Tw + Tx;
+			 T1k = Tw - Tx;
+		    }
+		    {
+			 E T4, T5, Tz, TA;
+			 T4 = ri[WS(is, 4)];
+			 T5 = ri[WS(is, 12)];
+			 T6 = T4 + T5;
+			 T1j = T4 - T5;
+			 Tz = ii[WS(is, 4)];
+			 TA = ii[WS(is, 12)];
+			 TB = Tz + TA;
+			 TM = Tz - TA;
+		    }
+		    T7 = T3 + T6;
+		    T1R = T3 - T6;
+		    T25 = Ty - TB;
+		    TC = Ty + TB;
+		    TN = TL - TM;
+		    T1x = TL + TM;
+		    T1H = T1k - T1j;
+		    T1l = T1j + T1k;
+	       }
+	       {
+		    E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
+		    {
+			 E Tn, To, T1d, T1e;
+			 Tn = ri[WS(is, 15)];
+			 To = ri[WS(is, 7)];
+			 Tp = Tn + To;
+			 T17 = Tn - To;
+			 T1d = ii[WS(is, 15)];
+			 T1e = ii[WS(is, 7)];
+			 T1f = T1d - T1e;
+			 T20 = T1d + T1e;
+		    }
+		    {
+			 E Tq, Tr, T18, T19;
+			 Tq = ri[WS(is, 3)];
+			 Tr = ri[WS(is, 11)];
+			 Ts = Tq + Tr;
+			 T1c = Tq - Tr;
+			 T18 = ii[WS(is, 3)];
+			 T19 = ii[WS(is, 11)];
+			 T1a = T18 - T19;
+			 T21 = T18 + T19;
+		    }
+		    Tt = Tp + Ts;
+		    T22 = T20 - T21;
+		    T2h = T20 + T21;
+		    T1b = T17 - T1a;
+		    T1g = T1c + T1f;
+		    T1E = T1f - T1c;
+		    T1Z = Tp - Ts;
+		    T1D = T17 + T1a;
+	       }
+	       {
+		    E Ta, TP, TF, TO, Td, TR, TI, TS;
+		    {
+			 E T8, T9, TD, TE;
+			 T8 = ri[WS(is, 2)];
+			 T9 = ri[WS(is, 10)];
+			 Ta = T8 + T9;
+			 TP = T8 - T9;
+			 TD = ii[WS(is, 2)];
+			 TE = ii[WS(is, 10)];
+			 TF = TD + TE;
+			 TO = TD - TE;
+		    }
+		    {
+			 E Tb, Tc, TG, TH;
+			 Tb = ri[WS(is, 14)];
+			 Tc = ri[WS(is, 6)];
+			 Td = Tb + Tc;
+			 TR = Tb - Tc;
+			 TG = ii[WS(is, 14)];
+			 TH = ii[WS(is, 6)];
+			 TI = TG + TH;
+			 TS = TG - TH;
+		    }
+		    Te = Ta + Td;
+		    T1S = TF - TI;
+		    T26 = Td - Ta;
+		    TJ = TF + TI;
+		    TQ = TO - TP;
+		    T1m = TR - TS;
+		    T1n = TP + TO;
+		    TT = TR + TS;
+	       }
+	       {
+		    E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
+		    {
+			 E Tg, Th, TX, TY;
+			 Tg = ri[WS(is, 1)];
+			 Th = ri[WS(is, 9)];
+			 Ti = Tg + Th;
+			 T11 = Tg - Th;
+			 TX = ii[WS(is, 1)];
+			 TY = ii[WS(is, 9)];
+			 TZ = TX - TY;
+			 T1V = TX + TY;
+		    }
+		    {
+			 E Tj, Tk, T12, T13;
+			 Tj = ri[WS(is, 5)];
+			 Tk = ri[WS(is, 13)];
+			 Tl = Tj + Tk;
+			 TW = Tj - Tk;
+			 T12 = ii[WS(is, 5)];
+			 T13 = ii[WS(is, 13)];
+			 T14 = T12 - T13;
+			 T1W = T12 + T13;
+		    }
+		    Tm = Ti + Tl;
+		    T1X = T1V - T1W;
+		    T2g = T1V + T1W;
+		    T10 = TW + TZ;
+		    T15 = T11 - T14;
+		    T1B = T11 + T14;
+		    T1U = Ti - Tl;
+		    T1A = TZ - TW;
+	       }
+	       {
+		    E Tf, Tu, T2j, T2k;
+		    Tf = T7 + Te;
+		    Tu = Tm + Tt;
+		    ro[WS(os, 8)] = Tf - Tu;
+		    ro[0] = Tf + Tu;
+		    T2j = TC + TJ;
+		    T2k = T2g + T2h;
+		    io[WS(os, 8)] = T2j - T2k;
+		    io[0] = T2j + T2k;
+	       }
+	       {
+		    E Tv, TK, T2f, T2i;
+		    Tv = Tt - Tm;
+		    TK = TC - TJ;
+		    io[WS(os, 4)] = Tv + TK;
+		    io[WS(os, 12)] = TK - Tv;
+		    T2f = T7 - Te;
+		    T2i = T2g - T2h;
+		    ro[WS(os, 12)] = T2f - T2i;
+		    ro[WS(os, 4)] = T2f + T2i;
+	       }
+	       {
+		    E T1T, T27, T24, T28, T1Y, T23;
+		    T1T = T1R + T1S;
+		    T27 = T25 - T26;
+		    T1Y = T1U + T1X;
+		    T23 = T1Z - T22;
+		    T24 = KP707106781 * (T1Y + T23);
+		    T28 = KP707106781 * (T23 - T1Y);
+		    ro[WS(os, 10)] = T1T - T24;
+		    io[WS(os, 6)] = T27 + T28;
+		    ro[WS(os, 2)] = T1T + T24;
+		    io[WS(os, 14)] = T27 - T28;
+	       }
+	       {
+		    E T29, T2d, T2c, T2e, T2a, T2b;
+		    T29 = T1R - T1S;
+		    T2d = T26 + T25;
+		    T2a = T1X - T1U;
+		    T2b = T1Z + T22;
+		    T2c = KP707106781 * (T2a - T2b);
+		    T2e = KP707106781 * (T2a + T2b);
+		    ro[WS(os, 14)] = T29 - T2c;
+		    io[WS(os, 2)] = T2d + T2e;
+		    ro[WS(os, 6)] = T29 + T2c;
+		    io[WS(os, 10)] = T2d - T2e;
+	       }
+	       {
+		    E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
+		    TU = KP707106781 * (TQ - TT);
+		    TV = TN + TU;
+		    T1r = TN - TU;
+		    T1o = KP707106781 * (T1m - T1n);
+		    T1p = T1l - T1o;
+		    T1v = T1l + T1o;
+		    {
+			 E T16, T1h, T1s, T1t;
+			 T16 = FMA(KP923879532, T10, KP382683432 * T15);
+			 T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
+			 T1i = T16 + T1h;
+			 T1q = T1h - T16;
+			 T1s = FNMS(KP923879532, T15, KP382683432 * T10);
+			 T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
+			 T1u = T1s - T1t;
+			 T1w = T1s + T1t;
+		    }
+		    ro[WS(os, 11)] = TV - T1i;
+		    io[WS(os, 11)] = T1v - T1w;
+		    ro[WS(os, 3)] = TV + T1i;
+		    io[WS(os, 3)] = T1v + T1w;
+		    io[WS(os, 15)] = T1p - T1q;
+		    ro[WS(os, 15)] = T1r - T1u;
+		    io[WS(os, 7)] = T1p + T1q;
+		    ro[WS(os, 7)] = T1r + T1u;
+	       }
+	       {
+		    E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
+		    T1y = KP707106781 * (T1n + T1m);
+		    T1z = T1x + T1y;
+		    T1L = T1x - T1y;
+		    T1I = KP707106781 * (TQ + TT);
+		    T1J = T1H - T1I;
+		    T1P = T1H + T1I;
+		    {
+			 E T1C, T1F, T1M, T1N;
+			 T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
+			 T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
+			 T1G = T1C + T1F;
+			 T1K = T1F - T1C;
+			 T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
+			 T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
+			 T1O = T1M - T1N;
+			 T1Q = T1M + T1N;
+		    }
+		    ro[WS(os, 9)] = T1z - T1G;
+		    io[WS(os, 9)] = T1P - T1Q;
+		    ro[WS(os, 1)] = T1z + T1G;
+		    io[WS(os, 1)] = T1P + T1Q;
+		    io[WS(os, 13)] = T1J - T1K;
+		    ro[WS(os, 13)] = T1L - T1O;
+		    io[WS(os, 5)] = T1J + T1K;
+		    ro[WS(os, 5)] = T1L + T1O;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 16, "n1_16", {136, 16, 8, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_16) (planner *p) {
+     X(kdft_register) (p, n1_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include n.h */
+
+/*
+ * This function contains 4 FP additions, 0 FP multiplications,
+ * (or, 4 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n.h"
+
+static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       E T1, T2, T3, T4;
+	       T1 = ri[0];
+	       T2 = ri[WS(is, 1)];
+	       T3 = ii[0];
+	       T4 = ii[WS(is, 1)];
+	       ro[0] = T1 + T2;
+	       ro[WS(os, 1)] = T1 - T2;
+	       io[0] = T3 + T4;
+	       io[WS(os, 1)] = T3 - T4;
+	  }
+     }
+}
+
+static const kdft_desc desc = { 2, "n1_2", {4, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_2) (planner *p) {
+     X(kdft_register) (p, n1_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include n.h */
+
+/*
+ * This function contains 4 FP additions, 0 FP multiplications,
+ * (or, 4 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n.h"
+
+static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       E T1, T2, T3, T4;
+	       T1 = ri[0];
+	       T2 = ri[WS(is, 1)];
+	       ro[WS(os, 1)] = T1 - T2;
+	       ro[0] = T1 + T2;
+	       T3 = ii[0];
+	       T4 = ii[WS(is, 1)];
+	       io[WS(os, 1)] = T3 - T4;
+	       io[0] = T3 + T4;
+	  }
+     }
+}
+
+static const kdft_desc desc = { 2, "n1_2", {4, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_2) (planner *p) {
+     X(kdft_register) (p, n1_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:46 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include n.h */
+
+/*
+ * This function contains 208 FP additions, 72 FP multiplications,
+ * (or, 136 additions, 0 multiplications, 72 fused multiply/add),
+ * 86 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "n.h"
+
+static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
+	       E T1Y, T1Z, T1W, T1V;
+	       {
+		    E T1d, TP, TD, T7, T3b, T2N, T2f, T1R, T2U, TB, T2P, T2A, T3d, T37, T3j;
+		    E TJ, T2n, T1b, T1T, T1y, T2b, T2h, T1j, T2V, Tm, T2O, T2H, T3c, T34, T1e;
+		    E T1f, T3i, TG, T2m, T10, T1S, T1J, T28, T2g;
+		    {
+			 E T4, T1N, T3, T2L, TN, T5, T1O, T1P, T1h, T1i;
+			 {
+			      E T1, T2, TL, TM;
+			      T1 = ri[0];
+			      T2 = ri[WS(is, 10)];
+			      TL = ii[0];
+			      TM = ii[WS(is, 10)];
+			      T4 = ri[WS(is, 5)];
+			      T1N = T1 - T2;
+			      T3 = T1 + T2;
+			      T2L = TL + TM;
+			      TN = TL - TM;
+			      T5 = ri[WS(is, 15)];
+			      T1O = ii[WS(is, 5)];
+			      T1P = ii[WS(is, 15)];
+			 }
+			 {
+			      E T1o, Tp, T2u, T13, T14, Ts, T2v, T1r, Tx, T1t, Tw, T2x, T18, Ty, T1u;
+			      E T1v;
+			      {
+				   E Tq, Tr, T1p, T1q;
+				   {
+					E Tn, To, T11, T12;
+					Tn = ri[WS(is, 8)];
+					{
+					     E TO, T6, T2M, T1Q;
+					     TO = T4 - T5;
+					     T6 = T4 + T5;
+					     T2M = T1O + T1P;
+					     T1Q = T1O - T1P;
+					     T1d = TO + TN;
+					     TP = TN - TO;
+					     TD = T3 + T6;
+					     T7 = T3 - T6;
+					     T3b = T2L + T2M;
+					     T2N = T2L - T2M;
+					     T2f = T1N + T1Q;
+					     T1R = T1N - T1Q;
+					     To = ri[WS(is, 18)];
+					}
+					T11 = ii[WS(is, 8)];
+					T12 = ii[WS(is, 18)];
+					Tq = ri[WS(is, 13)];
+					T1o = Tn - To;
+					Tp = Tn + To;
+					T2u = T11 + T12;
+					T13 = T11 - T12;
+					Tr = ri[WS(is, 3)];
+					T1p = ii[WS(is, 13)];
+					T1q = ii[WS(is, 3)];
+				   }
+				   {
+					E Tu, Tv, T16, T17;
+					Tu = ri[WS(is, 12)];
+					T14 = Tq - Tr;
+					Ts = Tq + Tr;
+					T2v = T1p + T1q;
+					T1r = T1p - T1q;
+					Tv = ri[WS(is, 2)];
+					T16 = ii[WS(is, 12)];
+					T17 = ii[WS(is, 2)];
+					Tx = ri[WS(is, 17)];
+					T1t = Tu - Tv;
+					Tw = Tu + Tv;
+					T2x = T16 + T17;
+					T18 = T16 - T17;
+					Ty = ri[WS(is, 7)];
+					T1u = ii[WS(is, 17)];
+					T1v = ii[WS(is, 7)];
+				   }
+			      }
+			      {
+				   E TH, T19, T1w, TI;
+				   {
+					E Tt, T2w, T35, TA, T2z, T36, Tz, T2y;
+					TH = Tp + Ts;
+					Tt = Tp - Ts;
+					T19 = Tx - Ty;
+					Tz = Tx + Ty;
+					T2y = T1u + T1v;
+					T1w = T1u - T1v;
+					T2w = T2u - T2v;
+					T35 = T2u + T2v;
+					TI = Tw + Tz;
+					TA = Tw - Tz;
+					T2z = T2x - T2y;
+					T36 = T2x + T2y;
+					T2U = Tt - TA;
+					TB = Tt + TA;
+					T2P = T2w + T2z;
+					T2A = T2w - T2z;
+					T3d = T35 + T36;
+					T37 = T35 - T36;
+				   }
+				   {
+					E T1s, T29, T1x, T2a, T15, T1a;
+					T15 = T13 - T14;
+					T1h = T14 + T13;
+					T1i = T19 + T18;
+					T1a = T18 - T19;
+					T1s = T1o - T1r;
+					T29 = T1o + T1r;
+					T3j = TH - TI;
+					TJ = TH + TI;
+					T1x = T1t - T1w;
+					T2a = T1t + T1w;
+					T2n = T15 - T1a;
+					T1b = T15 + T1a;
+					T1T = T1s + T1x;
+					T1y = T1s - T1x;
+					T2b = T29 - T2a;
+					T2h = T29 + T2a;
+				   }
+			      }
+			 }
+			 {
+			      E Ta, T1z, T2B, TS, TT, Td, T2C, T1C, Ti, T1E, Th, T2E, TX, Tj, T1F;
+			      E T1G;
+			      {
+				   E Tb, Tc, T1A, T1B;
+				   {
+					E TQ, TR, T8, T9;
+					T8 = ri[WS(is, 4)];
+					T9 = ri[WS(is, 14)];
+					T1j = T1h + T1i;
+					T1Y = T1h - T1i;
+					TQ = ii[WS(is, 4)];
+					TR = ii[WS(is, 14)];
+					Ta = T8 + T9;
+					T1z = T8 - T9;
+					Tb = ri[WS(is, 9)];
+					T2B = TQ + TR;
+					TS = TQ - TR;
+					Tc = ri[WS(is, 19)];
+					T1A = ii[WS(is, 9)];
+					T1B = ii[WS(is, 19)];
+				   }
+				   {
+					E Tf, Tg, TV, TW;
+					Tf = ri[WS(is, 16)];
+					TT = Tb - Tc;
+					Td = Tb + Tc;
+					T2C = T1A + T1B;
+					T1C = T1A - T1B;
+					Tg = ri[WS(is, 6)];
+					TV = ii[WS(is, 16)];
+					TW = ii[WS(is, 6)];
+					Ti = ri[WS(is, 1)];
+					T1E = Tf - Tg;
+					Th = Tf + Tg;
+					T2E = TV + TW;
+					TX = TV - TW;
+					Tj = ri[WS(is, 11)];
+					T1F = ii[WS(is, 1)];
+					T1G = ii[WS(is, 11)];
+				   }
+			      }
+			      {
+				   E TE, TY, T1H, TF;
+				   {
+					E Te, T2D, T32, Tl, T2G, T33, Tk, T2F;
+					TE = Ta + Td;
+					Te = Ta - Td;
+					TY = Ti - Tj;
+					Tk = Ti + Tj;
+					T2F = T1F + T1G;
+					T1H = T1F - T1G;
+					T2D = T2B - T2C;
+					T32 = T2B + T2C;
+					TF = Th + Tk;
+					Tl = Th - Tk;
+					T2G = T2E - T2F;
+					T33 = T2E + T2F;
+					T2V = Te - Tl;
+					Tm = Te + Tl;
+					T2O = T2D + T2G;
+					T2H = T2D - T2G;
+					T3c = T32 + T33;
+					T34 = T32 - T33;
+				   }
+				   {
+					E T1D, T26, T1I, T27, TU, TZ;
+					TU = TS - TT;
+					T1e = TT + TS;
+					T1f = TY + TX;
+					TZ = TX - TY;
+					T1D = T1z - T1C;
+					T26 = T1z + T1C;
+					T3i = TE - TF;
+					TG = TE + TF;
+					T1I = T1E - T1H;
+					T27 = T1E + T1H;
+					T2m = TU - TZ;
+					T10 = TU + TZ;
+					T1S = T1D + T1I;
+					T1J = T1D - T1I;
+					T28 = T26 - T27;
+					T2g = T26 + T27;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1g, T3g, T3f, T2S, T2R, T2k, T2j;
+			 {
+			      E T2s, T2r, TC, T2Q;
+			      T2s = Tm - TB;
+			      TC = Tm + TB;
+			      T1g = T1e + T1f;
+			      T1Z = T1e - T1f;
+			      T2r = FNMS(KP250000000, TC, T7);
+			      ro[WS(os, 10)] = T7 + TC;
+			      T2Q = T2O + T2P;
+			      T2S = T2O - T2P;
+			      {
+				   E T2K, T2I, T2t, T2J;
+				   T2K = FMA(KP618033988, T2A, T2H);
+				   T2I = FNMS(KP618033988, T2H, T2A);
+				   T2t = FNMS(KP559016994, T2s, T2r);
+				   T2J = FMA(KP559016994, T2s, T2r);
+				   ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
+				   ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
+				   ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
+				   ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
+				   T2R = FNMS(KP250000000, T2Q, T2N);
+			      }
+			      io[WS(os, 10)] = T2N + T2Q;
+			 }
+			 {
+			      E T30, T2Z, TK, T3e;
+			      TK = TG + TJ;
+			      T30 = TG - TJ;
+			      {
+				   E T2T, T2X, T2Y, T2W;
+				   T2T = FNMS(KP559016994, T2S, T2R);
+				   T2X = FMA(KP559016994, T2S, T2R);
+				   T2Y = FMA(KP618033988, T2U, T2V);
+				   T2W = FNMS(KP618033988, T2V, T2U);
+				   io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
+				   io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
+				   io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
+				   io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
+				   T2Z = FNMS(KP250000000, TK, TD);
+			      }
+			      ro[0] = TD + TK;
+			      T3e = T3c + T3d;
+			      T3g = T3c - T3d;
+			      {
+				   E T31, T39, T3a, T38;
+				   T31 = FMA(KP559016994, T30, T2Z);
+				   T39 = FNMS(KP559016994, T30, T2Z);
+				   T3a = FNMS(KP618033988, T34, T37);
+				   T38 = FMA(KP618033988, T37, T34);
+				   ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
+				   ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
+				   ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
+				   ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
+				   T3f = FNMS(KP250000000, T3e, T3b);
+			      }
+			      io[0] = T3b + T3e;
+			 }
+			 {
+			      E T24, T23, T1c, T2i;
+			      T1c = T10 + T1b;
+			      T24 = T10 - T1b;
+			      {
+				   E T3h, T3l, T3m, T3k;
+				   T3h = FMA(KP559016994, T3g, T3f);
+				   T3l = FNMS(KP559016994, T3g, T3f);
+				   T3m = FNMS(KP618033988, T3i, T3j);
+				   T3k = FMA(KP618033988, T3j, T3i);
+				   io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
+				   io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
+				   io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
+				   io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
+				   T23 = FNMS(KP250000000, T1c, TP);
+			      }
+			      io[WS(os, 5)] = TP + T1c;
+			      T2i = T2g + T2h;
+			      T2k = T2g - T2h;
+			      {
+				   E T25, T2d, T2e, T2c;
+				   T25 = FMA(KP559016994, T24, T23);
+				   T2d = FNMS(KP559016994, T24, T23);
+				   T2e = FNMS(KP618033988, T28, T2b);
+				   T2c = FMA(KP618033988, T2b, T28);
+				   io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
+				   io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
+				   io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
+				   io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
+				   T2j = FNMS(KP250000000, T2i, T2f);
+			      }
+			      ro[WS(os, 5)] = T2f + T2i;
+			 }
+			 {
+			      E T1m, T1l, T1k, T1U;
+			      T1k = T1g + T1j;
+			      T1m = T1g - T1j;
+			      {
+				   E T2l, T2p, T2q, T2o;
+				   T2l = FMA(KP559016994, T2k, T2j);
+				   T2p = FNMS(KP559016994, T2k, T2j);
+				   T2q = FNMS(KP618033988, T2m, T2n);
+				   T2o = FMA(KP618033988, T2n, T2m);
+				   ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
+				   ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
+				   ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
+				   ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
+				   T1l = FNMS(KP250000000, T1k, T1d);
+			      }
+			      io[WS(os, 15)] = T1d + T1k;
+			      T1U = T1S + T1T;
+			      T1W = T1S - T1T;
+			      {
+				   E T1n, T1L, T1M, T1K;
+				   T1n = FNMS(KP559016994, T1m, T1l);
+				   T1L = FMA(KP559016994, T1m, T1l);
+				   T1M = FMA(KP618033988, T1y, T1J);
+				   T1K = FNMS(KP618033988, T1J, T1y);
+				   io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
+				   io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
+				   io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
+				   io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
+				   T1V = FNMS(KP250000000, T1U, T1R);
+			      }
+			      ro[WS(os, 15)] = T1R + T1U;
+			 }
+		    }
+	       }
+	       {
+		    E T21, T1X, T20, T22;
+		    T21 = FMA(KP559016994, T1W, T1V);
+		    T1X = FNMS(KP559016994, T1W, T1V);
+		    T20 = FNMS(KP618033988, T1Z, T1Y);
+		    T22 = FMA(KP618033988, T1Y, T1Z);
+		    ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
+		    ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
+		    ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
+		    ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 20, "n1_20", {136, 0, 72, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_20) (planner *p) {
+     X(kdft_register) (p, n1_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include n.h */
+
+/*
+ * This function contains 208 FP additions, 48 FP multiplications,
+ * (or, 184 additions, 24 multiplications, 24 fused multiply/add),
+ * 81 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "n.h"
+
+static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
+	       E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
+	       E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
+	       E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
+	       E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
+	       {
+		    E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
+		    {
+			 E T1, T2, TL, TM;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 10)];
+			 T3 = T1 + T2;
+			 T1Q = T1 - T2;
+			 TL = ii[0];
+			 TM = ii[WS(is, 10)];
+			 TN = TL - TM;
+			 T2O = TL + TM;
+		    }
+		    {
+			 E T4, T5, T1R, T1S;
+			 T4 = ri[WS(is, 5)];
+			 T5 = ri[WS(is, 15)];
+			 T6 = T4 + T5;
+			 TO = T4 - T5;
+			 T1R = ii[WS(is, 5)];
+			 T1S = ii[WS(is, 15)];
+			 T1T = T1R - T1S;
+			 T2P = T1R + T1S;
+		    }
+		    T7 = T3 - T6;
+		    T2Q = T2O - T2P;
+		    T3h = T2O + T2P;
+		    TD = T3 + T6;
+		    TP = TN - TO;
+		    T1U = T1Q - T1T;
+		    T2l = T1Q + T1T;
+		    T1d = TO + TN;
+	       }
+	       {
+		    E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
+		    E T2y;
+		    {
+			 E Tn, To, T11, T12;
+			 Tn = ri[WS(is, 8)];
+			 To = ri[WS(is, 18)];
+			 Tp = Tn + To;
+			 T1o = Tn - To;
+			 T11 = ii[WS(is, 8)];
+			 T12 = ii[WS(is, 18)];
+			 T13 = T11 - T12;
+			 T2u = T11 + T12;
+		    }
+		    {
+			 E Tq, Tr, T1p, T1q;
+			 Tq = ri[WS(is, 13)];
+			 Tr = ri[WS(is, 3)];
+			 Ts = Tq + Tr;
+			 T14 = Tq - Tr;
+			 T1p = ii[WS(is, 13)];
+			 T1q = ii[WS(is, 3)];
+			 T1r = T1p - T1q;
+			 T2v = T1p + T1q;
+		    }
+		    {
+			 E Tu, Tv, T16, T17;
+			 Tu = ri[WS(is, 12)];
+			 Tv = ri[WS(is, 2)];
+			 Tw = Tu + Tv;
+			 T1t = Tu - Tv;
+			 T16 = ii[WS(is, 12)];
+			 T17 = ii[WS(is, 2)];
+			 T18 = T16 - T17;
+			 T2x = T16 + T17;
+		    }
+		    {
+			 E Tx, Ty, T1u, T1v;
+			 Tx = ri[WS(is, 17)];
+			 Ty = ri[WS(is, 7)];
+			 Tz = Tx + Ty;
+			 T19 = Tx - Ty;
+			 T1u = ii[WS(is, 17)];
+			 T1v = ii[WS(is, 7)];
+			 T1w = T1u - T1v;
+			 T2y = T1u + T1v;
+		    }
+		    Tt = Tp - Ts;
+		    TA = Tw - Tz;
+		    TB = Tt + TA;
+		    T2w = T2u - T2v;
+		    T2z = T2x - T2y;
+		    T2S = T2w + T2z;
+		    T35 = T2u + T2v;
+		    T36 = T2x + T2y;
+		    T3f = T35 + T36;
+		    TH = Tp + Ts;
+		    TI = Tw + Tz;
+		    TJ = TH + TI;
+		    T15 = T13 - T14;
+		    T1a = T18 - T19;
+		    T1b = T15 + T1a;
+		    T1s = T1o - T1r;
+		    T1x = T1t - T1w;
+		    T1W = T1s + T1x;
+		    T29 = T1o + T1r;
+		    T2a = T1t + T1w;
+		    T2j = T29 + T2a;
+		    T1h = T14 + T13;
+		    T1i = T19 + T18;
+		    T1j = T1h + T1i;
+	       }
+	       {
+		    E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
+		    E T2F;
+		    {
+			 E T8, T9, TQ, TR;
+			 T8 = ri[WS(is, 4)];
+			 T9 = ri[WS(is, 14)];
+			 Ta = T8 + T9;
+			 T1z = T8 - T9;
+			 TQ = ii[WS(is, 4)];
+			 TR = ii[WS(is, 14)];
+			 TS = TQ - TR;
+			 T2B = TQ + TR;
+		    }
+		    {
+			 E Tb, Tc, T1A, T1B;
+			 Tb = ri[WS(is, 9)];
+			 Tc = ri[WS(is, 19)];
+			 Td = Tb + Tc;
+			 TT = Tb - Tc;
+			 T1A = ii[WS(is, 9)];
+			 T1B = ii[WS(is, 19)];
+			 T1C = T1A - T1B;
+			 T2C = T1A + T1B;
+		    }
+		    {
+			 E Tf, Tg, TV, TW;
+			 Tf = ri[WS(is, 16)];
+			 Tg = ri[WS(is, 6)];
+			 Th = Tf + Tg;
+			 T1E = Tf - Tg;
+			 TV = ii[WS(is, 16)];
+			 TW = ii[WS(is, 6)];
+			 TX = TV - TW;
+			 T2E = TV + TW;
+		    }
+		    {
+			 E Ti, Tj, T1F, T1G;
+			 Ti = ri[WS(is, 1)];
+			 Tj = ri[WS(is, 11)];
+			 Tk = Ti + Tj;
+			 TY = Ti - Tj;
+			 T1F = ii[WS(is, 1)];
+			 T1G = ii[WS(is, 11)];
+			 T1H = T1F - T1G;
+			 T2F = T1F + T1G;
+		    }
+		    Te = Ta - Td;
+		    Tl = Th - Tk;
+		    Tm = Te + Tl;
+		    T2D = T2B - T2C;
+		    T2G = T2E - T2F;
+		    T2R = T2D + T2G;
+		    T32 = T2B + T2C;
+		    T33 = T2E + T2F;
+		    T3e = T32 + T33;
+		    TE = Ta + Td;
+		    TF = Th + Tk;
+		    TG = TE + TF;
+		    TU = TS - TT;
+		    TZ = TX - TY;
+		    T10 = TU + TZ;
+		    T1D = T1z - T1C;
+		    T1I = T1E - T1H;
+		    T1V = T1D + T1I;
+		    T26 = T1z + T1C;
+		    T27 = T1E + T1H;
+		    T2i = T26 + T27;
+		    T1e = TT + TS;
+		    T1f = TY + TX;
+		    T1g = T1e + T1f;
+	       }
+	       {
+		    E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
+		    T2s = KP559016994 * (Tm - TB);
+		    TC = Tm + TB;
+		    T2r = FNMS(KP250000000, TC, T7);
+		    T2A = T2w - T2z;
+		    T2H = T2D - T2G;
+		    T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
+		    T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
+		    ro[WS(os, 10)] = T7 + TC;
+		    T2J = T2s + T2r;
+		    ro[WS(os, 14)] = T2J - T2K;
+		    ro[WS(os, 6)] = T2J + T2K;
+		    T2t = T2r - T2s;
+		    ro[WS(os, 2)] = T2t - T2I;
+		    ro[WS(os, 18)] = T2t + T2I;
+	       }
+	       {
+		    E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
+		    T2V = KP559016994 * (T2R - T2S);
+		    T2T = T2R + T2S;
+		    T2U = FNMS(KP250000000, T2T, T2Q);
+		    T2L = Tt - TA;
+		    T2M = Te - Tl;
+		    T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
+		    T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
+		    io[WS(os, 10)] = T2Q + T2T;
+		    T2X = T2V + T2U;
+		    io[WS(os, 6)] = T2X - T2Y;
+		    io[WS(os, 14)] = T2Y + T2X;
+		    T2W = T2U - T2V;
+		    io[WS(os, 2)] = T2N + T2W;
+		    io[WS(os, 18)] = T2W - T2N;
+	       }
+	       {
+		    E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
+		    T2Z = KP559016994 * (TG - TJ);
+		    TK = TG + TJ;
+		    T30 = FNMS(KP250000000, TK, TD);
+		    T34 = T32 - T33;
+		    T37 = T35 - T36;
+		    T38 = FMA(KP951056516, T34, KP587785252 * T37);
+		    T3a = FNMS(KP587785252, T34, KP951056516 * T37);
+		    ro[0] = TD + TK;
+		    T39 = T30 - T2Z;
+		    ro[WS(os, 12)] = T39 - T3a;
+		    ro[WS(os, 8)] = T39 + T3a;
+		    T31 = T2Z + T30;
+		    ro[WS(os, 4)] = T31 - T38;
+		    ro[WS(os, 16)] = T31 + T38;
+	       }
+	       {
+		    E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
+		    T3g = KP559016994 * (T3e - T3f);
+		    T3i = T3e + T3f;
+		    T3j = FNMS(KP250000000, T3i, T3h);
+		    T3b = TE - TF;
+		    T3c = TH - TI;
+		    T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
+		    T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
+		    io[0] = T3h + T3i;
+		    T3l = T3j - T3g;
+		    io[WS(os, 8)] = T3l - T3m;
+		    io[WS(os, 12)] = T3m + T3l;
+		    T3k = T3g + T3j;
+		    io[WS(os, 4)] = T3d + T3k;
+		    io[WS(os, 16)] = T3k - T3d;
+	       }
+	       {
+		    E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
+		    T23 = KP559016994 * (T10 - T1b);
+		    T1c = T10 + T1b;
+		    T24 = FNMS(KP250000000, T1c, TP);
+		    T28 = T26 - T27;
+		    T2b = T29 - T2a;
+		    T2c = FMA(KP951056516, T28, KP587785252 * T2b);
+		    T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
+		    io[WS(os, 5)] = TP + T1c;
+		    T2d = T24 - T23;
+		    io[WS(os, 13)] = T2d - T2e;
+		    io[WS(os, 17)] = T2d + T2e;
+		    T25 = T23 + T24;
+		    io[WS(os, 1)] = T25 - T2c;
+		    io[WS(os, 9)] = T25 + T2c;
+	       }
+	       {
+		    E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
+		    T2k = KP559016994 * (T2i - T2j);
+		    T2m = T2i + T2j;
+		    T2n = FNMS(KP250000000, T2m, T2l);
+		    T2f = TU - TZ;
+		    T2g = T15 - T1a;
+		    T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
+		    T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
+		    ro[WS(os, 5)] = T2l + T2m;
+		    T2q = T2n - T2k;
+		    ro[WS(os, 13)] = T2p + T2q;
+		    ro[WS(os, 17)] = T2q - T2p;
+		    T2o = T2k + T2n;
+		    ro[WS(os, 1)] = T2h + T2o;
+		    ro[WS(os, 9)] = T2o - T2h;
+	       }
+	       {
+		    E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
+		    T1m = KP559016994 * (T1g - T1j);
+		    T1k = T1g + T1j;
+		    T1l = FNMS(KP250000000, T1k, T1d);
+		    T1y = T1s - T1x;
+		    T1J = T1D - T1I;
+		    T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
+		    T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
+		    io[WS(os, 15)] = T1d + T1k;
+		    T1L = T1m + T1l;
+		    io[WS(os, 11)] = T1L - T1M;
+		    io[WS(os, 19)] = T1L + T1M;
+		    T1n = T1l - T1m;
+		    io[WS(os, 3)] = T1n - T1K;
+		    io[WS(os, 7)] = T1n + T1K;
+	       }
+	       {
+		    E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
+		    T1Z = KP559016994 * (T1V - T1W);
+		    T1X = T1V + T1W;
+		    T1Y = FNMS(KP250000000, T1X, T1U);
+		    T1N = T1h - T1i;
+		    T1O = T1e - T1f;
+		    T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
+		    T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
+		    ro[WS(os, 15)] = T1U + T1X;
+		    T22 = T1Z + T1Y;
+		    ro[WS(os, 11)] = T21 + T22;
+		    ro[WS(os, 19)] = T22 - T21;
+		    T20 = T1Y - T1Z;
+		    ro[WS(os, 3)] = T1P + T20;
+		    ro[WS(os, 7)] = T20 - T1P;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 20, "n1_20", {184, 24, 24, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_20) (planner *p) {
+     X(kdft_register) (p, n1_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1207 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:46 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 25 -name n1_25 -include n.h */
+
+/*
+ * This function contains 352 FP additions, 268 FP multiplications,
+ * (or, 84 additions, 0 multiplications, 268 fused multiply/add),
+ * 164 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "n.h"
+
+static void n1_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(100, is), MAKE_VOLATILE_STRIDE(100, os)) {
+	       E T3Y, T3U, T3W, T42, T44, T3X, T3R, T3V, T3Z, T43;
+	       {
+		    E T4Q, T1U, T9, T3b, T45, T3e, T46, T1D, T4P, T1R, Ts, T1K, T18, T1E, T4z;
+		    E T5f, T3z, T22, T4s, T5b, T3C, T2o, T3D, T2h, T4p, T5c, T4w, T5e, T3A, T29;
+		    E T2z, T2y, TL, T1L, T1r, T1F, T4a, T57, T3v, T2x, T4k, T55, T3s, T2T, T2D;
+		    E T4c, T3t, T2M, T4h, T54, T1v, T1C, T1Q;
+		    {
+			 E T1, T2, T3, T5, T6;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 5)];
+			 T3 = ri[WS(is, 20)];
+			 T5 = ri[WS(is, 10)];
+			 T6 = ri[WS(is, 15)];
+			 {
+			      E T3a, T3c, T1y, T1z, T1A, T39, T4, T1S, T1B, T3d;
+			      T1v = ii[0];
+			      T4 = T2 + T3;
+			      T1S = T2 - T3;
+			      {
+				   E T7, T1T, T8, T1w, T1x;
+				   T7 = T5 + T6;
+				   T1T = T5 - T6;
+				   T1w = ii[WS(is, 5)];
+				   T1x = ii[WS(is, 20)];
+				   T4Q = FNMS(KP618033988, T1S, T1T);
+				   T1U = FMA(KP618033988, T1T, T1S);
+				   T8 = T4 + T7;
+				   T3a = T4 - T7;
+				   T3c = T1w - T1x;
+				   T1y = T1w + T1x;
+				   T1z = ii[WS(is, 10)];
+				   T1A = ii[WS(is, 15)];
+				   T39 = FNMS(KP250000000, T8, T1);
+				   T9 = T1 + T8;
+			      }
+			      T1B = T1z + T1A;
+			      T3d = T1z - T1A;
+			      T3b = FMA(KP559016994, T3a, T39);
+			      T45 = FNMS(KP559016994, T3a, T39);
+			      T3e = FMA(KP618033988, T3d, T3c);
+			      T46 = FNMS(KP618033988, T3c, T3d);
+			      T1C = T1y + T1B;
+			      T1Q = T1y - T1B;
+			 }
+		    }
+		    {
+			 E T24, T23, T28, T4v;
+			 {
+			      E Ta, TQ, Tj, TZ, T1Z, T20, Th, T26, T27, T1X, TX, T2l, T2m, Tq, T2c;
+			      E T2e, T12, T15, T2f, T1P, TT, TW;
+			      Ta = ri[WS(is, 1)];
+			      T1P = FNMS(KP250000000, T1C, T1v);
+			      T1D = T1v + T1C;
+			      TQ = ii[WS(is, 1)];
+			      Tj = ri[WS(is, 4)];
+			      T4P = FNMS(KP559016994, T1Q, T1P);
+			      T1R = FMA(KP559016994, T1Q, T1P);
+			      TZ = ii[WS(is, 4)];
+			      {
+				   E Tb, Tc, Te, Tf;
+				   Tb = ri[WS(is, 6)];
+				   Tc = ri[WS(is, 21)];
+				   Te = ri[WS(is, 11)];
+				   Tf = ri[WS(is, 16)];
+				   {
+					E TR, Td, Tg, TS, TU, TV;
+					TR = ii[WS(is, 6)];
+					T1Z = Tc - Tb;
+					Td = Tb + Tc;
+					T20 = Tf - Te;
+					Tg = Te + Tf;
+					TS = ii[WS(is, 21)];
+					TU = ii[WS(is, 11)];
+					TV = ii[WS(is, 16)];
+					Th = Td + Tg;
+					T24 = Td - Tg;
+					T26 = TR - TS;
+					TT = TR + TS;
+					TW = TU + TV;
+					T27 = TV - TU;
+				   }
+			      }
+			      {
+				   E Tk, Tl, Tn, To;
+				   Tk = ri[WS(is, 9)];
+				   T1X = TT - TW;
+				   TX = TT + TW;
+				   Tl = ri[WS(is, 24)];
+				   Tn = ri[WS(is, 14)];
+				   To = ri[WS(is, 19)];
+				   {
+					E T10, Tm, Tp, T11, T13, T14;
+					T10 = ii[WS(is, 9)];
+					T2l = Tl - Tk;
+					Tm = Tk + Tl;
+					T2m = To - Tn;
+					Tp = Tn + To;
+					T11 = ii[WS(is, 24)];
+					T13 = ii[WS(is, 14)];
+					T14 = ii[WS(is, 19)];
+					Tq = Tm + Tp;
+					T2c = Tm - Tp;
+					T2e = T11 - T10;
+					T12 = T10 + T11;
+					T15 = T13 + T14;
+					T2f = T14 - T13;
+				   }
+			      }
+			      {
+				   E T2j, T2b, T1W, T21, T4y, T2i;
+				   {
+					E Ti, T16, Tr, TY, T17;
+					T23 = FNMS(KP250000000, Th, Ta);
+					Ti = Ta + Th;
+					T2j = T15 - T12;
+					T16 = T12 + T15;
+					Tr = Tj + Tq;
+					T2b = FMS(KP250000000, Tq, Tj);
+					T1W = FNMS(KP250000000, TX, TQ);
+					TY = TQ + TX;
+					T21 = FMA(KP618033988, T20, T1Z);
+					T4y = FNMS(KP618033988, T1Z, T20);
+					T2i = FNMS(KP250000000, T16, TZ);
+					T17 = TZ + T16;
+					Ts = Ti + Tr;
+					T1K = Ti - Tr;
+					T18 = TY - T17;
+					T1E = TY + T17;
+				   }
+				   {
+					E T2n, T4r, T4x, T1Y;
+					T2n = FMA(KP618033988, T2m, T2l);
+					T4r = FNMS(KP618033988, T2l, T2m);
+					T4x = FNMS(KP559016994, T1X, T1W);
+					T1Y = FMA(KP559016994, T1X, T1W);
+					{
+					     E T4o, T2g, T2d, T4n, T4q, T2k;
+					     T4o = FNMS(KP618033988, T2e, T2f);
+					     T2g = FMA(KP618033988, T2f, T2e);
+					     T4z = FMA(KP951056516, T4y, T4x);
+					     T5f = FNMS(KP951056516, T4y, T4x);
+					     T3z = FNMS(KP951056516, T21, T1Y);
+					     T22 = FMA(KP951056516, T21, T1Y);
+					     T4q = FMA(KP559016994, T2j, T2i);
+					     T2k = FNMS(KP559016994, T2j, T2i);
+					     T4s = FMA(KP951056516, T4r, T4q);
+					     T5b = FNMS(KP951056516, T4r, T4q);
+					     T3C = FNMS(KP951056516, T2n, T2k);
+					     T2o = FMA(KP951056516, T2n, T2k);
+					     T2d = FNMS(KP559016994, T2c, T2b);
+					     T4n = FMA(KP559016994, T2c, T2b);
+					     T28 = FNMS(KP618033988, T27, T26);
+					     T4v = FMA(KP618033988, T26, T27);
+					     T3D = FNMS(KP951056516, T2g, T2d);
+					     T2h = FMA(KP951056516, T2g, T2d);
+					     T4p = FMA(KP951056516, T4o, T4n);
+					     T5c = FNMS(KP951056516, T4o, T4n);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E Tt, T19, TC, T1i, T2u, T2v, TA, T2B, T2C, T2s, T1g, T2J, T2K, TJ, T2O;
+			      E T2Q, T1l, T1o, T2R;
+			      {
+				   E T4u, T25, T1c, T1f;
+				   Tt = ri[WS(is, 2)];
+				   T19 = ii[WS(is, 2)];
+				   TC = ri[WS(is, 3)];
+				   T4u = FNMS(KP559016994, T24, T23);
+				   T25 = FMA(KP559016994, T24, T23);
+				   T1i = ii[WS(is, 3)];
+				   {
+					E Tu, Tv, Tx, Ty;
+					Tu = ri[WS(is, 7)];
+					T4w = FNMS(KP951056516, T4v, T4u);
+					T5e = FMA(KP951056516, T4v, T4u);
+					T3A = FNMS(KP951056516, T28, T25);
+					T29 = FMA(KP951056516, T28, T25);
+					Tv = ri[WS(is, 22)];
+					Tx = ri[WS(is, 12)];
+					Ty = ri[WS(is, 17)];
+					{
+					     E T1a, Tw, Tz, T1b, T1d, T1e;
+					     T1a = ii[WS(is, 7)];
+					     T2u = Tv - Tu;
+					     Tw = Tu + Tv;
+					     T2v = Ty - Tx;
+					     Tz = Tx + Ty;
+					     T1b = ii[WS(is, 22)];
+					     T1d = ii[WS(is, 12)];
+					     T1e = ii[WS(is, 17)];
+					     TA = Tw + Tz;
+					     T2z = Tz - Tw;
+					     T2B = T1b - T1a;
+					     T1c = T1a + T1b;
+					     T1f = T1d + T1e;
+					     T2C = T1d - T1e;
+					}
+				   }
+				   {
+					E TD, TE, TG, TH;
+					TD = ri[WS(is, 8)];
+					T2s = T1f - T1c;
+					T1g = T1c + T1f;
+					TE = ri[WS(is, 23)];
+					TG = ri[WS(is, 13)];
+					TH = ri[WS(is, 18)];
+					{
+					     E T1j, TF, TI, T1k, T1m, T1n;
+					     T1j = ii[WS(is, 8)];
+					     T2J = TD - TE;
+					     TF = TD + TE;
+					     T2K = TG - TH;
+					     TI = TG + TH;
+					     T1k = ii[WS(is, 23)];
+					     T1m = ii[WS(is, 13)];
+					     T1n = ii[WS(is, 18)];
+					     TJ = TF + TI;
+					     T2O = TI - TF;
+					     T2Q = T1k - T1j;
+					     T1l = T1j + T1k;
+					     T1o = T1m + T1n;
+					     T2R = T1n - T1m;
+					}
+				   }
+			      }
+			      {
+				   E T2H, T2N, T2r, T2w, T49, T2G;
+				   {
+					E TB, T1p, TK, T1h, T1q;
+					T2y = FNMS(KP250000000, TA, Tt);
+					TB = Tt + TA;
+					T2H = T1o - T1l;
+					T1p = T1l + T1o;
+					TK = TC + TJ;
+					T2N = FNMS(KP250000000, TJ, TC);
+					T2r = FNMS(KP250000000, T1g, T19);
+					T1h = T19 + T1g;
+					T2w = FMA(KP618033988, T2v, T2u);
+					T49 = FNMS(KP618033988, T2u, T2v);
+					T2G = FNMS(KP250000000, T1p, T1i);
+					T1q = T1i + T1p;
+					TL = TB + TK;
+					T1L = TB - TK;
+					T1r = T1h - T1q;
+					T1F = T1h + T1q;
+				   }
+				   {
+					E T2S, T4j, T48, T2t;
+					T2S = FMA(KP618033988, T2R, T2Q);
+					T4j = FNMS(KP618033988, T2Q, T2R);
+					T48 = FMA(KP559016994, T2s, T2r);
+					T2t = FNMS(KP559016994, T2s, T2r);
+					{
+					     E T4g, T2L, T2I, T4f, T4i, T2P;
+					     T4g = FNMS(KP618033988, T2J, T2K);
+					     T2L = FMA(KP618033988, T2K, T2J);
+					     T4a = FMA(KP951056516, T49, T48);
+					     T57 = FNMS(KP951056516, T49, T48);
+					     T3v = FNMS(KP951056516, T2w, T2t);
+					     T2x = FMA(KP951056516, T2w, T2t);
+					     T4i = FMA(KP559016994, T2O, T2N);
+					     T2P = FNMS(KP559016994, T2O, T2N);
+					     T4k = FNMS(KP951056516, T4j, T4i);
+					     T55 = FMA(KP951056516, T4j, T4i);
+					     T3s = FMA(KP951056516, T2S, T2P);
+					     T2T = FNMS(KP951056516, T2S, T2P);
+					     T2I = FNMS(KP559016994, T2H, T2G);
+					     T4f = FMA(KP559016994, T2H, T2G);
+					     T2D = FNMS(KP618033988, T2C, T2B);
+					     T4c = FMA(KP618033988, T2B, T2C);
+					     T3t = FMA(KP951056516, T2L, T2I);
+					     T2M = FNMS(KP951056516, T2L, T2I);
+					     T4h = FNMS(KP951056516, T4g, T4f);
+					     T54 = FMA(KP951056516, T4g, T4f);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T4d, T58, T3w, T3H, T3r, T3k, T36, T38, T3o, T3q, T3j, T2Z, T37;
+			 {
+			      E T2E, T1s, T1u, TP, T1t;
+			      {
+				   E TM, TO, TN, T4b, T2A;
+				   TM = Ts + TL;
+				   TO = Ts - TL;
+				   T4b = FMA(KP559016994, T2z, T2y);
+				   T2A = FNMS(KP559016994, T2z, T2y);
+				   TN = FNMS(KP250000000, TM, T9);
+				   T4d = FMA(KP951056516, T4c, T4b);
+				   T58 = FNMS(KP951056516, T4c, T4b);
+				   T3w = FMA(KP951056516, T2D, T2A);
+				   T2E = FNMS(KP951056516, T2D, T2A);
+				   T1s = FMA(KP618033988, T1r, T18);
+				   T1u = FNMS(KP618033988, T18, T1r);
+				   ro[0] = T9 + TM;
+				   TP = FMA(KP559016994, TO, TN);
+				   T1t = FNMS(KP559016994, TO, TN);
+			      }
+			      {
+				   E T1J, T1N, T1M, T1O, T1G, T1I, T1H;
+				   T1G = T1E + T1F;
+				   T1I = T1E - T1F;
+				   ro[WS(os, 15)] = FMA(KP951056516, T1u, T1t);
+				   ro[WS(os, 10)] = FNMS(KP951056516, T1u, T1t);
+				   ro[WS(os, 5)] = FMA(KP951056516, T1s, TP);
+				   ro[WS(os, 20)] = FNMS(KP951056516, T1s, TP);
+				   T1H = FNMS(KP250000000, T1G, T1D);
+				   io[0] = T1D + T1G;
+				   T1J = FMA(KP559016994, T1I, T1H);
+				   T1N = FNMS(KP559016994, T1I, T1H);
+				   T1M = FMA(KP618033988, T1L, T1K);
+				   T1O = FNMS(KP618033988, T1K, T1L);
+				   {
+					E T1V, T3f, T3m, T3n, T2W, T2Y, T32, T3g, T3h, T35, T3i, T2X;
+					T3H = FMA(KP951056516, T1U, T1R);
+					T1V = FNMS(KP951056516, T1U, T1R);
+					T3f = FMA(KP951056516, T3e, T3b);
+					T3r = FNMS(KP951056516, T3e, T3b);
+					io[WS(os, 15)] = FNMS(KP951056516, T1O, T1N);
+					io[WS(os, 10)] = FMA(KP951056516, T1O, T1N);
+					io[WS(os, 20)] = FMA(KP951056516, T1M, T1J);
+					io[WS(os, 5)] = FNMS(KP951056516, T1M, T1J);
+					{
+					     E T30, T2a, T2p, T31, T33, T2F, T2U, T34, T2q, T2V;
+					     T30 = FMA(KP256756360, T22, T29);
+					     T2a = FNMS(KP256756360, T29, T22);
+					     T2p = FMA(KP634619297, T2o, T2h);
+					     T31 = FNMS(KP634619297, T2h, T2o);
+					     T33 = FMA(KP549754652, T2x, T2E);
+					     T2F = FNMS(KP549754652, T2E, T2x);
+					     T2U = FNMS(KP939062505, T2T, T2M);
+					     T34 = FMA(KP939062505, T2M, T2T);
+					     T3m = FNMS(KP871714437, T2p, T2a);
+					     T2q = FMA(KP871714437, T2p, T2a);
+					     T3n = FNMS(KP831864738, T2U, T2F);
+					     T2V = FMA(KP831864738, T2U, T2F);
+					     T2W = FMA(KP904730450, T2V, T2q);
+					     T2Y = FNMS(KP904730450, T2V, T2q);
+					     T32 = FNMS(KP871714437, T31, T30);
+					     T3g = FMA(KP871714437, T31, T30);
+					     T3h = FMA(KP831864738, T34, T33);
+					     T35 = FNMS(KP831864738, T34, T33);
+					}
+					io[WS(os, 1)] = FMA(KP968583161, T2W, T1V);
+					T3i = FMA(KP904730450, T3h, T3g);
+					T3k = FNMS(KP904730450, T3h, T3g);
+					T36 = FMA(KP559154169, T35, T32);
+					T38 = FNMS(KP683113946, T32, T35);
+					ro[WS(os, 1)] = FMA(KP968583161, T3i, T3f);
+					T2X = FNMS(KP242145790, T2W, T1V);
+					T3o = FMA(KP559154169, T3n, T3m);
+					T3q = FNMS(KP683113946, T3m, T3n);
+					T3j = FNMS(KP242145790, T3i, T3f);
+					T2Z = FMA(KP541454447, T2Y, T2X);
+					T37 = FNMS(KP541454447, T2Y, T2X);
+				   }
+			      }
+			 }
+			 {
+			      E T47, T4R, T5A, T5w, T5y, T5E, T5G, T5z, T5t, T5x;
+			      {
+				   E T53, T5j, T5u, T5v, T5i, T5D, T5m, T5p, T5C, T3p, T3l, T5s, T5q, T5r;
+				   T47 = FMA(KP951056516, T46, T45);
+				   T53 = FNMS(KP951056516, T46, T45);
+				   T3p = FNMS(KP541454447, T3k, T3j);
+				   T3l = FMA(KP541454447, T3k, T3j);
+				   io[WS(os, 16)] = FNMS(KP833417178, T38, T37);
+				   io[WS(os, 11)] = FMA(KP833417178, T38, T37);
+				   io[WS(os, 21)] = FMA(KP921177326, T36, T2Z);
+				   io[WS(os, 6)] = FNMS(KP921177326, T36, T2Z);
+				   ro[WS(os, 11)] = FNMS(KP833417178, T3q, T3p);
+				   ro[WS(os, 16)] = FMA(KP833417178, T3q, T3p);
+				   ro[WS(os, 21)] = FNMS(KP921177326, T3o, T3l);
+				   ro[WS(os, 6)] = FMA(KP921177326, T3o, T3l);
+				   T5j = FMA(KP951056516, T4Q, T4P);
+				   T4R = FNMS(KP951056516, T4Q, T4P);
+				   {
+					E T5k, T56, T59, T5l, T5n, T5d, T5g, T5o, T5a, T5h;
+					T5k = FNMS(KP062914667, T54, T55);
+					T56 = FMA(KP062914667, T55, T54);
+					T59 = FMA(KP634619297, T58, T57);
+					T5l = FNMS(KP634619297, T57, T58);
+					T5n = FNMS(KP470564281, T5b, T5c);
+					T5d = FMA(KP470564281, T5c, T5b);
+					T5g = FMA(KP549754652, T5f, T5e);
+					T5o = FNMS(KP549754652, T5e, T5f);
+					T5u = FNMS(KP845997307, T59, T56);
+					T5a = FMA(KP845997307, T59, T56);
+					T5v = FNMS(KP968479752, T5g, T5d);
+					T5h = FMA(KP968479752, T5g, T5d);
+					T5i = FMA(KP906616052, T5h, T5a);
+					T5A = FNMS(KP906616052, T5h, T5a);
+					T5D = FNMS(KP845997307, T5l, T5k);
+					T5m = FMA(KP845997307, T5l, T5k);
+					T5p = FMA(KP968479752, T5o, T5n);
+					T5C = FNMS(KP968479752, T5o, T5n);
+				   }
+				   ro[WS(os, 2)] = FMA(KP998026728, T5i, T53);
+				   T5s = FMA(KP906616052, T5p, T5m);
+				   T5q = FNMS(KP906616052, T5p, T5m);
+				   T5w = FNMS(KP560319534, T5v, T5u);
+				   T5y = FMA(KP681693190, T5u, T5v);
+				   T5E = FNMS(KP681693190, T5D, T5C);
+				   T5G = FMA(KP560319534, T5C, T5D);
+				   T5r = FMA(KP249506682, T5q, T5j);
+				   io[WS(os, 2)] = FNMS(KP998026728, T5q, T5j);
+				   T5z = FNMS(KP249506682, T5i, T53);
+				   T5t = FNMS(KP557913902, T5s, T5r);
+				   T5x = FMA(KP557913902, T5s, T5r);
+			      }
+			      {
+				   E T4W, T4M, T4O, T50, T52, T4V, T4F, T4N;
+				   {
+					E T4Y, T4Z, T4C, T4E, T4I, T4T, T4S, T4L, T5F, T5B, T4U, T4D;
+					T5F = FMA(KP557913902, T5A, T5z);
+					T5B = FNMS(KP557913902, T5A, T5z);
+					io[WS(os, 7)] = FMA(KP860541664, T5y, T5x);
+					io[WS(os, 22)] = FNMS(KP860541664, T5y, T5x);
+					io[WS(os, 17)] = FMA(KP949179823, T5w, T5t);
+					io[WS(os, 12)] = FNMS(KP949179823, T5w, T5t);
+					ro[WS(os, 12)] = FNMS(KP949179823, T5G, T5F);
+					ro[WS(os, 17)] = FMA(KP949179823, T5G, T5F);
+					ro[WS(os, 7)] = FNMS(KP860541664, T5E, T5B);
+					ro[WS(os, 22)] = FMA(KP860541664, T5E, T5B);
+					{
+					     E T4J, T4e, T4l, T4K, T4G, T4t, T4A, T4H, T4m, T4B;
+					     T4J = FNMS(KP062914667, T4a, T4d);
+					     T4e = FMA(KP062914667, T4d, T4a);
+					     T4l = FNMS(KP827271945, T4k, T4h);
+					     T4K = FMA(KP827271945, T4h, T4k);
+					     T4G = FNMS(KP126329378, T4p, T4s);
+					     T4t = FMA(KP126329378, T4s, T4p);
+					     T4A = FMA(KP939062505, T4z, T4w);
+					     T4H = FNMS(KP939062505, T4w, T4z);
+					     T4Y = FNMS(KP772036680, T4l, T4e);
+					     T4m = FMA(KP772036680, T4l, T4e);
+					     T4Z = FNMS(KP734762448, T4A, T4t);
+					     T4B = FMA(KP734762448, T4A, T4t);
+					     T4C = FMA(KP994076283, T4B, T4m);
+					     T4E = FNMS(KP994076283, T4B, T4m);
+					     T4I = FMA(KP734762448, T4H, T4G);
+					     T4T = FNMS(KP734762448, T4H, T4G);
+					     T4S = FMA(KP772036680, T4K, T4J);
+					     T4L = FNMS(KP772036680, T4K, T4J);
+					}
+					ro[WS(os, 3)] = FMA(KP998026728, T4C, T47);
+					T4U = FMA(KP994076283, T4T, T4S);
+					T4W = FNMS(KP994076283, T4T, T4S);
+					T4M = FNMS(KP621716863, T4L, T4I);
+					T4O = FMA(KP614372930, T4I, T4L);
+					io[WS(os, 3)] = FNMS(KP998026728, T4U, T4R);
+					T4D = FNMS(KP249506682, T4C, T47);
+					T50 = FMA(KP614372930, T4Z, T4Y);
+					T52 = FNMS(KP621716863, T4Y, T4Z);
+					T4V = FMA(KP249506682, T4U, T4R);
+					T4F = FNMS(KP557913902, T4E, T4D);
+					T4N = FMA(KP557913902, T4E, T4D);
+				   }
+				   {
+					E T3S, T3T, T3G, T41, T3K, T3N, T40, T51, T4X, T3Q, T3O, T3P;
+					T51 = FMA(KP557913902, T4W, T4V);
+					T4X = FNMS(KP557913902, T4W, T4V);
+					ro[WS(os, 18)] = FNMS(KP949179823, T4O, T4N);
+					ro[WS(os, 13)] = FMA(KP949179823, T4O, T4N);
+					ro[WS(os, 8)] = FMA(KP943557151, T4M, T4F);
+					ro[WS(os, 23)] = FNMS(KP943557151, T4M, T4F);
+					io[WS(os, 8)] = FMA(KP943557151, T52, T51);
+					io[WS(os, 23)] = FNMS(KP943557151, T52, T51);
+					io[WS(os, 18)] = FNMS(KP949179823, T50, T4X);
+					io[WS(os, 13)] = FMA(KP949179823, T50, T4X);
+					{
+					     E T3I, T3u, T3x, T3J, T3L, T3B, T3E, T3M, T3y, T3F;
+					     T3I = FMA(KP126329378, T3s, T3t);
+					     T3u = FNMS(KP126329378, T3t, T3s);
+					     T3x = FNMS(KP470564281, T3w, T3v);
+					     T3J = FMA(KP470564281, T3v, T3w);
+					     T3L = FNMS(KP634619297, T3z, T3A);
+					     T3B = FMA(KP634619297, T3A, T3z);
+					     T3E = FNMS(KP827271945, T3D, T3C);
+					     T3M = FMA(KP827271945, T3C, T3D);
+					     T3S = FMA(KP912018591, T3x, T3u);
+					     T3y = FNMS(KP912018591, T3x, T3u);
+					     T3T = FMA(KP912575812, T3E, T3B);
+					     T3F = FNMS(KP912575812, T3E, T3B);
+					     T3G = FNMS(KP851038619, T3F, T3y);
+					     T3Y = FMA(KP851038619, T3F, T3y);
+					     T41 = FNMS(KP912018591, T3J, T3I);
+					     T3K = FMA(KP912018591, T3J, T3I);
+					     T3N = FMA(KP912575812, T3M, T3L);
+					     T40 = FNMS(KP912575812, T3M, T3L);
+					}
+					ro[WS(os, 4)] = FNMS(KP992114701, T3G, T3r);
+					T3Q = FNMS(KP851038619, T3N, T3K);
+					T3O = FMA(KP851038619, T3N, T3K);
+					T3U = FNMS(KP525970792, T3T, T3S);
+					T3W = FMA(KP726211448, T3S, T3T);
+					T42 = FNMS(KP726211448, T41, T40);
+					T44 = FMA(KP525970792, T40, T41);
+					T3P = FMA(KP248028675, T3O, T3H);
+					io[WS(os, 4)] = FNMS(KP992114701, T3O, T3H);
+					T3X = FMA(KP248028675, T3G, T3r);
+					T3R = FNMS(KP554608978, T3Q, T3P);
+					T3V = FMA(KP554608978, T3Q, T3P);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T3Z = FMA(KP554608978, T3Y, T3X);
+	       T43 = FNMS(KP554608978, T3Y, T3X);
+	       io[WS(os, 9)] = FNMS(KP803003575, T3W, T3V);
+	       io[WS(os, 24)] = FMA(KP803003575, T3W, T3V);
+	       io[WS(os, 19)] = FNMS(KP943557151, T3U, T3R);
+	       io[WS(os, 14)] = FMA(KP943557151, T3U, T3R);
+	       ro[WS(os, 14)] = FNMS(KP943557151, T44, T43);
+	       ro[WS(os, 19)] = FMA(KP943557151, T44, T43);
+	       ro[WS(os, 24)] = FMA(KP803003575, T42, T3Z);
+	       ro[WS(os, 9)] = FNMS(KP803003575, T42, T3Z);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 25, "n1_25", {84, 0, 268, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_25) (planner *p) {
+     X(kdft_register) (p, n1_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 25 -name n1_25 -include n.h */
+
+/*
+ * This function contains 352 FP additions, 184 FP multiplications,
+ * (or, 260 additions, 92 multiplications, 92 fused multiply/add),
+ * 101 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "n.h"
+
+static void n1_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(100, is), MAKE_VOLATILE_STRIDE(100, os)) {
+	       E T9, T4u, T2T, TP, T3H, TW, T5y, T3I, T2Q, T4v, Ti, Tr, Ts, T5m, T5n;
+	       E T5v, T18, T4G, T34, T3M, T1G, T4J, T38, T3T, T1v, T4K, T37, T3W, T1j, T4H;
+	       E T35, T3P, TB, TK, TL, T5p, T5q, T5w, T1T, T4N, T3c, T41, T2r, T4Q, T3e;
+	       E T4b, T2g, T4R, T3f, T48, T24, T4O, T3b, T44;
+	       {
+		    E T1, T4, T7, T8, T2S, T2R, TN, TO;
+		    T1 = ri[0];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = ri[WS(is, 5)];
+			 T3 = ri[WS(is, 20)];
+			 T4 = T2 + T3;
+			 T5 = ri[WS(is, 10)];
+			 T6 = ri[WS(is, 15)];
+			 T7 = T5 + T6;
+			 T8 = T4 + T7;
+			 T2S = T5 - T6;
+			 T2R = T2 - T3;
+		    }
+		    T9 = T1 + T8;
+		    T4u = FNMS(KP587785252, T2R, KP951056516 * T2S);
+		    T2T = FMA(KP951056516, T2R, KP587785252 * T2S);
+		    TN = KP559016994 * (T4 - T7);
+		    TO = FNMS(KP250000000, T8, T1);
+		    TP = TN + TO;
+		    T3H = TO - TN;
+	       }
+	       {
+		    E T2N, T2K, T2L, TS, T2O, TV, T2M, T2P;
+		    T2N = ii[0];
+		    {
+			 E TQ, TR, TT, TU;
+			 TQ = ii[WS(is, 5)];
+			 TR = ii[WS(is, 20)];
+			 T2K = TQ + TR;
+			 TT = ii[WS(is, 10)];
+			 TU = ii[WS(is, 15)];
+			 T2L = TT + TU;
+			 TS = TQ - TR;
+			 T2O = T2K + T2L;
+			 TV = TT - TU;
+		    }
+		    TW = FMA(KP951056516, TS, KP587785252 * TV);
+		    T5y = T2N + T2O;
+		    T3I = FNMS(KP587785252, TS, KP951056516 * TV);
+		    T2M = KP559016994 * (T2K - T2L);
+		    T2P = FNMS(KP250000000, T2O, T2N);
+		    T2Q = T2M + T2P;
+		    T4v = T2P - T2M;
+	       }
+	       {
+		    E Ta, T1c, Tj, T1z, Th, T1h, TY, T1g, T13, T1d, T16, T1b, Tq, T1E, T1l;
+		    E T1D, T1q, T1A, T1t, T1y;
+		    Ta = ri[WS(is, 1)];
+		    T1c = ii[WS(is, 1)];
+		    Tj = ri[WS(is, 4)];
+		    T1z = ii[WS(is, 4)];
+		    {
+			 E Tb, Tc, Td, Te, Tf, Tg;
+			 Tb = ri[WS(is, 6)];
+			 Tc = ri[WS(is, 21)];
+			 Td = Tb + Tc;
+			 Te = ri[WS(is, 11)];
+			 Tf = ri[WS(is, 16)];
+			 Tg = Te + Tf;
+			 Th = Td + Tg;
+			 T1h = Te - Tf;
+			 TY = KP559016994 * (Td - Tg);
+			 T1g = Tb - Tc;
+		    }
+		    {
+			 E T11, T12, T19, T14, T15, T1a;
+			 T11 = ii[WS(is, 6)];
+			 T12 = ii[WS(is, 21)];
+			 T19 = T11 + T12;
+			 T14 = ii[WS(is, 11)];
+			 T15 = ii[WS(is, 16)];
+			 T1a = T14 + T15;
+			 T13 = T11 - T12;
+			 T1d = T19 + T1a;
+			 T16 = T14 - T15;
+			 T1b = KP559016994 * (T19 - T1a);
+		    }
+		    {
+			 E Tk, Tl, Tm, Tn, To, Tp;
+			 Tk = ri[WS(is, 9)];
+			 Tl = ri[WS(is, 24)];
+			 Tm = Tk + Tl;
+			 Tn = ri[WS(is, 14)];
+			 To = ri[WS(is, 19)];
+			 Tp = Tn + To;
+			 Tq = Tm + Tp;
+			 T1E = Tn - To;
+			 T1l = KP559016994 * (Tm - Tp);
+			 T1D = Tk - Tl;
+		    }
+		    {
+			 E T1o, T1p, T1w, T1r, T1s, T1x;
+			 T1o = ii[WS(is, 9)];
+			 T1p = ii[WS(is, 24)];
+			 T1w = T1o + T1p;
+			 T1r = ii[WS(is, 14)];
+			 T1s = ii[WS(is, 19)];
+			 T1x = T1r + T1s;
+			 T1q = T1o - T1p;
+			 T1A = T1w + T1x;
+			 T1t = T1r - T1s;
+			 T1y = KP559016994 * (T1w - T1x);
+		    }
+		    Ti = Ta + Th;
+		    Tr = Tj + Tq;
+		    Ts = Ti + Tr;
+		    T5m = T1c + T1d;
+		    T5n = T1z + T1A;
+		    T5v = T5m + T5n;
+		    {
+			 E T17, T3L, T10, T3K, TZ;
+			 T17 = FMA(KP951056516, T13, KP587785252 * T16);
+			 T3L = FNMS(KP587785252, T13, KP951056516 * T16);
+			 TZ = FNMS(KP250000000, Th, Ta);
+			 T10 = TY + TZ;
+			 T3K = TZ - TY;
+			 T18 = T10 + T17;
+			 T4G = T3K + T3L;
+			 T34 = T10 - T17;
+			 T3M = T3K - T3L;
+		    }
+		    {
+			 E T1F, T3R, T1C, T3S, T1B;
+			 T1F = FMA(KP951056516, T1D, KP587785252 * T1E);
+			 T3R = FNMS(KP587785252, T1D, KP951056516 * T1E);
+			 T1B = FNMS(KP250000000, T1A, T1z);
+			 T1C = T1y + T1B;
+			 T3S = T1B - T1y;
+			 T1G = T1C - T1F;
+			 T4J = T3S - T3R;
+			 T38 = T1F + T1C;
+			 T3T = T3R + T3S;
+		    }
+		    {
+			 E T1u, T3V, T1n, T3U, T1m;
+			 T1u = FMA(KP951056516, T1q, KP587785252 * T1t);
+			 T3V = FNMS(KP587785252, T1q, KP951056516 * T1t);
+			 T1m = FNMS(KP250000000, Tq, Tj);
+			 T1n = T1l + T1m;
+			 T3U = T1m - T1l;
+			 T1v = T1n + T1u;
+			 T4K = T3U + T3V;
+			 T37 = T1n - T1u;
+			 T3W = T3U - T3V;
+		    }
+		    {
+			 E T1i, T3N, T1f, T3O, T1e;
+			 T1i = FMA(KP951056516, T1g, KP587785252 * T1h);
+			 T3N = FNMS(KP587785252, T1g, KP951056516 * T1h);
+			 T1e = FNMS(KP250000000, T1d, T1c);
+			 T1f = T1b + T1e;
+			 T3O = T1e - T1b;
+			 T1j = T1f - T1i;
+			 T4H = T3O - T3N;
+			 T35 = T1i + T1f;
+			 T3P = T3N + T3O;
+		    }
+	       }
+	       {
+		    E Tt, T1X, TC, T2k, TA, T22, T1J, T21, T1O, T1Y, T1R, T1W, TJ, T2p, T26;
+		    E T2o, T2b, T2l, T2e, T2j;
+		    Tt = ri[WS(is, 2)];
+		    T1X = ii[WS(is, 2)];
+		    TC = ri[WS(is, 3)];
+		    T2k = ii[WS(is, 3)];
+		    {
+			 E Tu, Tv, Tw, Tx, Ty, Tz;
+			 Tu = ri[WS(is, 7)];
+			 Tv = ri[WS(is, 22)];
+			 Tw = Tu + Tv;
+			 Tx = ri[WS(is, 12)];
+			 Ty = ri[WS(is, 17)];
+			 Tz = Tx + Ty;
+			 TA = Tw + Tz;
+			 T22 = Tx - Ty;
+			 T1J = KP559016994 * (Tw - Tz);
+			 T21 = Tu - Tv;
+		    }
+		    {
+			 E T1M, T1N, T1U, T1P, T1Q, T1V;
+			 T1M = ii[WS(is, 7)];
+			 T1N = ii[WS(is, 22)];
+			 T1U = T1M + T1N;
+			 T1P = ii[WS(is, 12)];
+			 T1Q = ii[WS(is, 17)];
+			 T1V = T1P + T1Q;
+			 T1O = T1M - T1N;
+			 T1Y = T1U + T1V;
+			 T1R = T1P - T1Q;
+			 T1W = KP559016994 * (T1U - T1V);
+		    }
+		    {
+			 E TD, TE, TF, TG, TH, TI;
+			 TD = ri[WS(is, 8)];
+			 TE = ri[WS(is, 23)];
+			 TF = TD + TE;
+			 TG = ri[WS(is, 13)];
+			 TH = ri[WS(is, 18)];
+			 TI = TG + TH;
+			 TJ = TF + TI;
+			 T2p = TG - TH;
+			 T26 = KP559016994 * (TF - TI);
+			 T2o = TD - TE;
+		    }
+		    {
+			 E T29, T2a, T2h, T2c, T2d, T2i;
+			 T29 = ii[WS(is, 8)];
+			 T2a = ii[WS(is, 23)];
+			 T2h = T29 + T2a;
+			 T2c = ii[WS(is, 13)];
+			 T2d = ii[WS(is, 18)];
+			 T2i = T2c + T2d;
+			 T2b = T29 - T2a;
+			 T2l = T2h + T2i;
+			 T2e = T2c - T2d;
+			 T2j = KP559016994 * (T2h - T2i);
+		    }
+		    TB = Tt + TA;
+		    TK = TC + TJ;
+		    TL = TB + TK;
+		    T5p = T1X + T1Y;
+		    T5q = T2k + T2l;
+		    T5w = T5p + T5q;
+		    {
+			 E T1S, T40, T1L, T3Z, T1K;
+			 T1S = FMA(KP951056516, T1O, KP587785252 * T1R);
+			 T40 = FNMS(KP587785252, T1O, KP951056516 * T1R);
+			 T1K = FNMS(KP250000000, TA, Tt);
+			 T1L = T1J + T1K;
+			 T3Z = T1K - T1J;
+			 T1T = T1L + T1S;
+			 T4N = T3Z + T40;
+			 T3c = T1L - T1S;
+			 T41 = T3Z - T40;
+		    }
+		    {
+			 E T2q, T49, T2n, T4a, T2m;
+			 T2q = FMA(KP951056516, T2o, KP587785252 * T2p);
+			 T49 = FNMS(KP587785252, T2o, KP951056516 * T2p);
+			 T2m = FNMS(KP250000000, T2l, T2k);
+			 T2n = T2j + T2m;
+			 T4a = T2m - T2j;
+			 T2r = T2n - T2q;
+			 T4Q = T4a - T49;
+			 T3e = T2q + T2n;
+			 T4b = T49 + T4a;
+		    }
+		    {
+			 E T2f, T47, T28, T46, T27;
+			 T2f = FMA(KP951056516, T2b, KP587785252 * T2e);
+			 T47 = FNMS(KP587785252, T2b, KP951056516 * T2e);
+			 T27 = FNMS(KP250000000, TJ, TC);
+			 T28 = T26 + T27;
+			 T46 = T27 - T26;
+			 T2g = T28 + T2f;
+			 T4R = T46 + T47;
+			 T3f = T28 - T2f;
+			 T48 = T46 - T47;
+		    }
+		    {
+			 E T23, T42, T20, T43, T1Z;
+			 T23 = FMA(KP951056516, T21, KP587785252 * T22);
+			 T42 = FNMS(KP587785252, T21, KP951056516 * T22);
+			 T1Z = FNMS(KP250000000, T1Y, T1X);
+			 T20 = T1W + T1Z;
+			 T43 = T1Z - T1W;
+			 T24 = T20 - T23;
+			 T4O = T43 - T42;
+			 T3b = T23 + T20;
+			 T44 = T42 + T43;
+		    }
+	       }
+	       {
+		    E T5j, TM, T5k, T5s, T5u, T5o, T5r, T5t, T5l;
+		    T5j = KP559016994 * (Ts - TL);
+		    TM = Ts + TL;
+		    T5k = FNMS(KP250000000, TM, T9);
+		    T5o = T5m - T5n;
+		    T5r = T5p - T5q;
+		    T5s = FMA(KP951056516, T5o, KP587785252 * T5r);
+		    T5u = FNMS(KP587785252, T5o, KP951056516 * T5r);
+		    ro[0] = T9 + TM;
+		    T5t = T5k - T5j;
+		    ro[WS(os, 10)] = T5t - T5u;
+		    ro[WS(os, 15)] = T5t + T5u;
+		    T5l = T5j + T5k;
+		    ro[WS(os, 20)] = T5l - T5s;
+		    ro[WS(os, 5)] = T5l + T5s;
+	       }
+	       {
+		    E T5x, T5z, T5A, T5E, T5F, T5C, T5D, T5G, T5B;
+		    T5x = KP559016994 * (T5v - T5w);
+		    T5z = T5v + T5w;
+		    T5A = FNMS(KP250000000, T5z, T5y);
+		    T5C = Ti - Tr;
+		    T5D = TB - TK;
+		    T5E = FMA(KP951056516, T5C, KP587785252 * T5D);
+		    T5F = FNMS(KP587785252, T5C, KP951056516 * T5D);
+		    io[0] = T5y + T5z;
+		    T5G = T5A - T5x;
+		    io[WS(os, 10)] = T5F + T5G;
+		    io[WS(os, 15)] = T5G - T5F;
+		    T5B = T5x + T5A;
+		    io[WS(os, 5)] = T5B - T5E;
+		    io[WS(os, 20)] = T5E + T5B;
+	       }
+	       {
+		    E TX, T2U, T2u, T2Z, T2v, T2Y, T2A, T2V, T2D, T2J;
+		    TX = TP + TW;
+		    T2U = T2Q - T2T;
+		    {
+			 E T1k, T1H, T1I, T25, T2s, T2t;
+			 T1k = FMA(KP968583161, T18, KP248689887 * T1j);
+			 T1H = FMA(KP535826794, T1v, KP844327925 * T1G);
+			 T1I = T1k + T1H;
+			 T25 = FMA(KP876306680, T1T, KP481753674 * T24);
+			 T2s = FMA(KP728968627, T2g, KP684547105 * T2r);
+			 T2t = T25 + T2s;
+			 T2u = T1I + T2t;
+			 T2Z = T25 - T2s;
+			 T2v = KP559016994 * (T1I - T2t);
+			 T2Y = T1k - T1H;
+		    }
+		    {
+			 E T2y, T2z, T2H, T2B, T2C, T2I;
+			 T2y = FNMS(KP248689887, T18, KP968583161 * T1j);
+			 T2z = FNMS(KP844327925, T1v, KP535826794 * T1G);
+			 T2H = T2y + T2z;
+			 T2B = FNMS(KP481753674, T1T, KP876306680 * T24);
+			 T2C = FNMS(KP684547105, T2g, KP728968627 * T2r);
+			 T2I = T2B + T2C;
+			 T2A = T2y - T2z;
+			 T2V = T2H + T2I;
+			 T2D = T2B - T2C;
+			 T2J = KP559016994 * (T2H - T2I);
+		    }
+		    ro[WS(os, 1)] = TX + T2u;
+		    io[WS(os, 1)] = T2U + T2V;
+		    {
+			 E T2E, T2G, T2x, T2F, T2w;
+			 T2E = FMA(KP951056516, T2A, KP587785252 * T2D);
+			 T2G = FNMS(KP587785252, T2A, KP951056516 * T2D);
+			 T2w = FNMS(KP250000000, T2u, TX);
+			 T2x = T2v + T2w;
+			 T2F = T2w - T2v;
+			 ro[WS(os, 21)] = T2x - T2E;
+			 ro[WS(os, 16)] = T2F + T2G;
+			 ro[WS(os, 6)] = T2x + T2E;
+			 ro[WS(os, 11)] = T2F - T2G;
+		    }
+		    {
+			 E T30, T31, T2X, T32, T2W;
+			 T30 = FMA(KP951056516, T2Y, KP587785252 * T2Z);
+			 T31 = FNMS(KP587785252, T2Y, KP951056516 * T2Z);
+			 T2W = FNMS(KP250000000, T2V, T2U);
+			 T2X = T2J + T2W;
+			 T32 = T2W - T2J;
+			 io[WS(os, 6)] = T2X - T30;
+			 io[WS(os, 16)] = T32 - T31;
+			 io[WS(os, 21)] = T30 + T2X;
+			 io[WS(os, 11)] = T31 + T32;
+		    }
+	       }
+	       {
+		    E T4F, T52, T4U, T5b, T56, T57, T51, T5f, T53, T5e;
+		    T4F = T3H + T3I;
+		    T52 = T4v - T4u;
+		    {
+			 E T4I, T4L, T4M, T4P, T4S, T4T;
+			 T4I = FMA(KP728968627, T4G, KP684547105 * T4H);
+			 T4L = FNMS(KP992114701, T4K, KP125333233 * T4J);
+			 T4M = T4I + T4L;
+			 T4P = FMA(KP062790519, T4N, KP998026728 * T4O);
+			 T4S = FNMS(KP637423989, T4R, KP770513242 * T4Q);
+			 T4T = T4P + T4S;
+			 T4U = T4M + T4T;
+			 T5b = KP559016994 * (T4M - T4T);
+			 T56 = T4I - T4L;
+			 T57 = T4P - T4S;
+		    }
+		    {
+			 E T4V, T4W, T4X, T4Y, T4Z, T50;
+			 T4V = FNMS(KP684547105, T4G, KP728968627 * T4H);
+			 T4W = FMA(KP125333233, T4K, KP992114701 * T4J);
+			 T4X = T4V - T4W;
+			 T4Y = FNMS(KP998026728, T4N, KP062790519 * T4O);
+			 T4Z = FMA(KP770513242, T4R, KP637423989 * T4Q);
+			 T50 = T4Y - T4Z;
+			 T51 = KP559016994 * (T4X - T50);
+			 T5f = T4Y + T4Z;
+			 T53 = T4X + T50;
+			 T5e = T4V + T4W;
+		    }
+		    ro[WS(os, 3)] = T4F + T4U;
+		    io[WS(os, 3)] = T52 + T53;
+		    {
+			 E T58, T59, T55, T5a, T54;
+			 T58 = FMA(KP951056516, T56, KP587785252 * T57);
+			 T59 = FNMS(KP587785252, T56, KP951056516 * T57);
+			 T54 = FNMS(KP250000000, T53, T52);
+			 T55 = T51 + T54;
+			 T5a = T54 - T51;
+			 io[WS(os, 8)] = T55 - T58;
+			 io[WS(os, 18)] = T5a - T59;
+			 io[WS(os, 23)] = T58 + T55;
+			 io[WS(os, 13)] = T59 + T5a;
+		    }
+		    {
+			 E T5g, T5i, T5d, T5h, T5c;
+			 T5g = FMA(KP951056516, T5e, KP587785252 * T5f);
+			 T5i = FNMS(KP587785252, T5e, KP951056516 * T5f);
+			 T5c = FNMS(KP250000000, T4U, T4F);
+			 T5d = T5b + T5c;
+			 T5h = T5c - T5b;
+			 ro[WS(os, 23)] = T5d - T5g;
+			 ro[WS(os, 18)] = T5h + T5i;
+			 ro[WS(os, 8)] = T5d + T5g;
+			 ro[WS(os, 13)] = T5h - T5i;
+		    }
+	       }
+	       {
+		    E T3J, T4w, T4e, T4B, T4f, T4A, T4k, T4x, T4n, T4t;
+		    T3J = T3H - T3I;
+		    T4w = T4u + T4v;
+		    {
+			 E T3Q, T3X, T3Y, T45, T4c, T4d;
+			 T3Q = FMA(KP876306680, T3M, KP481753674 * T3P);
+			 T3X = FNMS(KP425779291, T3W, KP904827052 * T3T);
+			 T3Y = T3Q + T3X;
+			 T45 = FMA(KP535826794, T41, KP844327925 * T44);
+			 T4c = FMA(KP062790519, T48, KP998026728 * T4b);
+			 T4d = T45 + T4c;
+			 T4e = T3Y + T4d;
+			 T4B = T45 - T4c;
+			 T4f = KP559016994 * (T3Y - T4d);
+			 T4A = T3Q - T3X;
+		    }
+		    {
+			 E T4i, T4j, T4r, T4l, T4m, T4s;
+			 T4i = FNMS(KP481753674, T3M, KP876306680 * T3P);
+			 T4j = FMA(KP904827052, T3W, KP425779291 * T3T);
+			 T4r = T4i - T4j;
+			 T4l = FNMS(KP844327925, T41, KP535826794 * T44);
+			 T4m = FNMS(KP998026728, T48, KP062790519 * T4b);
+			 T4s = T4l + T4m;
+			 T4k = T4i + T4j;
+			 T4x = T4r + T4s;
+			 T4n = T4l - T4m;
+			 T4t = KP559016994 * (T4r - T4s);
+		    }
+		    ro[WS(os, 2)] = T3J + T4e;
+		    io[WS(os, 2)] = T4w + T4x;
+		    {
+			 E T4o, T4q, T4h, T4p, T4g;
+			 T4o = FMA(KP951056516, T4k, KP587785252 * T4n);
+			 T4q = FNMS(KP587785252, T4k, KP951056516 * T4n);
+			 T4g = FNMS(KP250000000, T4e, T3J);
+			 T4h = T4f + T4g;
+			 T4p = T4g - T4f;
+			 ro[WS(os, 22)] = T4h - T4o;
+			 ro[WS(os, 17)] = T4p + T4q;
+			 ro[WS(os, 7)] = T4h + T4o;
+			 ro[WS(os, 12)] = T4p - T4q;
+		    }
+		    {
+			 E T4C, T4D, T4z, T4E, T4y;
+			 T4C = FMA(KP951056516, T4A, KP587785252 * T4B);
+			 T4D = FNMS(KP587785252, T4A, KP951056516 * T4B);
+			 T4y = FNMS(KP250000000, T4x, T4w);
+			 T4z = T4t + T4y;
+			 T4E = T4y - T4t;
+			 io[WS(os, 7)] = T4z - T4C;
+			 io[WS(os, 17)] = T4E - T4D;
+			 io[WS(os, 22)] = T4C + T4z;
+			 io[WS(os, 12)] = T4D + T4E;
+		    }
+	       }
+	       {
+		    E T33, T3j, T3i, T3z, T3r, T3s, T3q, T3D, T3v, T3C;
+		    T33 = TP - TW;
+		    T3j = T2T + T2Q;
+		    {
+			 E T36, T39, T3a, T3d, T3g, T3h;
+			 T36 = FMA(KP535826794, T34, KP844327925 * T35);
+			 T39 = FMA(KP637423989, T37, KP770513242 * T38);
+			 T3a = T36 - T39;
+			 T3d = FNMS(KP425779291, T3c, KP904827052 * T3b);
+			 T3g = FNMS(KP992114701, T3f, KP125333233 * T3e);
+			 T3h = T3d + T3g;
+			 T3i = T3a + T3h;
+			 T3z = KP559016994 * (T3a - T3h);
+			 T3r = T3d - T3g;
+			 T3s = T36 + T39;
+		    }
+		    {
+			 E T3k, T3l, T3m, T3n, T3o, T3p;
+			 T3k = FNMS(KP844327925, T34, KP535826794 * T35);
+			 T3l = FNMS(KP637423989, T38, KP770513242 * T37);
+			 T3m = T3k + T3l;
+			 T3n = FMA(KP904827052, T3c, KP425779291 * T3b);
+			 T3o = FMA(KP125333233, T3f, KP992114701 * T3e);
+			 T3p = T3n + T3o;
+			 T3q = T3m - T3p;
+			 T3D = T3o - T3n;
+			 T3v = KP559016994 * (T3m + T3p);
+			 T3C = T3k - T3l;
+		    }
+		    ro[WS(os, 4)] = T33 + T3i;
+		    io[WS(os, 4)] = T3j + T3q;
+		    {
+			 E T3t, T3y, T3w, T3x, T3u;
+			 T3t = FNMS(KP587785252, T3s, KP951056516 * T3r);
+			 T3y = FMA(KP951056516, T3s, KP587785252 * T3r);
+			 T3u = FNMS(KP250000000, T3q, T3j);
+			 T3w = T3u - T3v;
+			 T3x = T3u + T3v;
+			 io[WS(os, 14)] = T3t + T3w;
+			 io[WS(os, 24)] = T3y + T3x;
+			 io[WS(os, 19)] = T3w - T3t;
+			 io[WS(os, 9)] = T3x - T3y;
+		    }
+		    {
+			 E T3E, T3G, T3B, T3F, T3A;
+			 T3E = FMA(KP951056516, T3C, KP587785252 * T3D);
+			 T3G = FNMS(KP587785252, T3C, KP951056516 * T3D);
+			 T3A = FNMS(KP250000000, T3i, T33);
+			 T3B = T3z + T3A;
+			 T3F = T3A - T3z;
+			 ro[WS(os, 24)] = T3B - T3E;
+			 ro[WS(os, 19)] = T3F + T3G;
+			 ro[WS(os, 9)] = T3B + T3E;
+			 ro[WS(os, 14)] = T3F - T3G;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 25, "n1_25", {260, 92, 92, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_25) (planner *p) {
+     X(kdft_register) (p, n1_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include n.h */
+
+/*
+ * This function contains 12 FP additions, 6 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n.h"
+
+static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       E T1, T9, T2, T3, T6, T7;
+	       T1 = ri[0];
+	       T9 = ii[0];
+	       T2 = ri[WS(is, 1)];
+	       T3 = ri[WS(is, 2)];
+	       T6 = ii[WS(is, 1)];
+	       T7 = ii[WS(is, 2)];
+	       {
+		    E T4, Tc, T8, Ta, T5, Tb;
+		    T4 = T2 + T3;
+		    Tc = T3 - T2;
+		    T8 = T6 - T7;
+		    Ta = T6 + T7;
+		    T5 = FNMS(KP500000000, T4, T1);
+		    ro[0] = T1 + T4;
+		    Tb = FNMS(KP500000000, Ta, T9);
+		    io[0] = T9 + Ta;
+		    ro[WS(os, 1)] = FMA(KP866025403, T8, T5);
+		    ro[WS(os, 2)] = FNMS(KP866025403, T8, T5);
+		    io[WS(os, 2)] = FNMS(KP866025403, Tc, Tb);
+		    io[WS(os, 1)] = FMA(KP866025403, Tc, Tb);
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 3, "n1_3", {6, 0, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_3) (planner *p) {
+     X(kdft_register) (p, n1_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include n.h */
+
+/*
+ * This function contains 12 FP additions, 4 FP multiplications,
+ * (or, 10 additions, 2 multiplications, 2 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n.h"
+
+static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       E T1, Ta, T4, T9, T8, Tb, T5, Tc;
+	       T1 = ri[0];
+	       Ta = ii[0];
+	       {
+		    E T2, T3, T6, T7;
+		    T2 = ri[WS(is, 1)];
+		    T3 = ri[WS(is, 2)];
+		    T4 = T2 + T3;
+		    T9 = KP866025403 * (T3 - T2);
+		    T6 = ii[WS(is, 1)];
+		    T7 = ii[WS(is, 2)];
+		    T8 = KP866025403 * (T6 - T7);
+		    Tb = T6 + T7;
+	       }
+	       ro[0] = T1 + T4;
+	       io[0] = Ta + Tb;
+	       T5 = FNMS(KP500000000, T4, T1);
+	       ro[WS(os, 2)] = T5 - T8;
+	       ro[WS(os, 1)] = T5 + T8;
+	       Tc = FNMS(KP500000000, Tb, Ta);
+	       io[WS(os, 1)] = T9 + Tc;
+	       io[WS(os, 2)] = Tc - T9;
+	  }
+     }
+}
+
+static const kdft_desc desc = { 3, "n1_3", {10, 2, 2, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_3) (planner *p) {
+     X(kdft_register) (p, n1_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1291 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:45 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name n1_32 -include n.h */
+
+/*
+ * This function contains 372 FP additions, 136 FP multiplications,
+ * (or, 236 additions, 0 multiplications, 136 fused multiply/add),
+ * 136 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "n.h"
+
+static void n1_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       E T3g, T3f, T3n, T3b, T3r, T3l, T3o, T3e, T3h, T3p;
+	       {
+		    E T2T, T3T, T4r, T7, T3t, T1z, T18, T4Z, Te, T50, T4s, T1f, T2W, T3u, T3U;
+		    E T1G, Tm, T1n, T3X, T3y, T2Z, T1O, T53, T4w, Tt, T1u, T3W, T3B, T2Y, T1V;
+		    E T52, T4z, T3O, T2t, T3L, T2K, T5F, TZ, T5I, T5X, T4R, T5k, T3M, T2E, T5j;
+		    E T4W, T3P, T2N, T3H, T22, T3E, T2j, T4H, T4K, T5A, TK, T5D, T5W, T2k, T2l;
+		    E T4G, T5h, T3F, T2d;
+		    {
+			 E Tj, T1L, Ti, T1I, T1j, Tk, T1k, T1l;
+			 {
+			      E T4, T1x, T3, T2R, T14, T5, T15, T16, T1C, T1F;
+			      {
+				   E T1, T2, T12, T13;
+				   T1 = ri[0];
+				   T2 = ri[WS(is, 16)];
+				   T12 = ii[0];
+				   T13 = ii[WS(is, 16)];
+				   T4 = ri[WS(is, 8)];
+				   T1x = T1 - T2;
+				   T3 = T1 + T2;
+				   T2R = T12 - T13;
+				   T14 = T12 + T13;
+				   T5 = ri[WS(is, 24)];
+				   T15 = ii[WS(is, 8)];
+				   T16 = ii[WS(is, 24)];
+			      }
+			      {
+				   E Tb, T1A, Ta, T1B, T1b, Tc, T1c, T1d;
+				   {
+					E T8, T9, T19, T1a;
+					T8 = ri[WS(is, 4)];
+					{
+					     E T2S, T6, T1y, T17;
+					     T2S = T4 - T5;
+					     T6 = T4 + T5;
+					     T1y = T15 - T16;
+					     T17 = T15 + T16;
+					     T2T = T2R - T2S;
+					     T3T = T2S + T2R;
+					     T4r = T3 - T6;
+					     T7 = T3 + T6;
+					     T3t = T1x - T1y;
+					     T1z = T1x + T1y;
+					     T18 = T14 + T17;
+					     T4Z = T14 - T17;
+					     T9 = ri[WS(is, 20)];
+					}
+					T19 = ii[WS(is, 4)];
+					T1a = ii[WS(is, 20)];
+					Tb = ri[WS(is, 28)];
+					T1A = T8 - T9;
+					Ta = T8 + T9;
+					T1B = T19 - T1a;
+					T1b = T19 + T1a;
+					Tc = ri[WS(is, 12)];
+					T1c = ii[WS(is, 28)];
+					T1d = ii[WS(is, 12)];
+				   }
+				   {
+					E T2U, T1D, Td, T1E, T1e, T2V;
+					T1C = T1A + T1B;
+					T2U = T1B - T1A;
+					T1D = Tb - Tc;
+					Td = Tb + Tc;
+					T1E = T1c - T1d;
+					T1e = T1c + T1d;
+					Te = Ta + Td;
+					T50 = Td - Ta;
+					T1F = T1D - T1E;
+					T2V = T1D + T1E;
+					T4s = T1b - T1e;
+					T1f = T1b + T1e;
+					T2W = T2U + T2V;
+					T3u = T2U - T2V;
+				   }
+			      }
+			      {
+				   E Tg, Th, T1h, T1i;
+				   Tg = ri[WS(is, 2)];
+				   T3U = T1F - T1C;
+				   T1G = T1C + T1F;
+				   Th = ri[WS(is, 18)];
+				   T1h = ii[WS(is, 2)];
+				   T1i = ii[WS(is, 18)];
+				   Tj = ri[WS(is, 10)];
+				   T1L = Tg - Th;
+				   Ti = Tg + Th;
+				   T1I = T1h - T1i;
+				   T1j = T1h + T1i;
+				   Tk = ri[WS(is, 26)];
+				   T1k = ii[WS(is, 10)];
+				   T1l = ii[WS(is, 26)];
+			      }
+			 }
+			 {
+			      E Tq, T1S, Tp, T1P, T1q, Tr, T1r, T1s;
+			      {
+				   E Tn, To, T1o, T1p, T1J, Tl;
+				   Tn = ri[WS(is, 30)];
+				   T1J = Tj - Tk;
+				   Tl = Tj + Tk;
+				   {
+					E T1M, T1m, T3w, T1K;
+					T1M = T1k - T1l;
+					T1m = T1k + T1l;
+					T3w = T1J + T1I;
+					T1K = T1I - T1J;
+					{
+					     E T4v, T3x, T1N, T4u;
+					     T4v = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T3x = T1L - T1M;
+					     T1N = T1L + T1M;
+					     T4u = T1j - T1m;
+					     T1n = T1j + T1m;
+					     T3X = FNMS(KP414213562, T3w, T3x);
+					     T3y = FMA(KP414213562, T3x, T3w);
+					     T2Z = FMA(KP414213562, T1K, T1N);
+					     T1O = FNMS(KP414213562, T1N, T1K);
+					     T53 = T4v + T4u;
+					     T4w = T4u - T4v;
+					     To = ri[WS(is, 14)];
+					}
+				   }
+				   T1o = ii[WS(is, 30)];
+				   T1p = ii[WS(is, 14)];
+				   Tq = ri[WS(is, 6)];
+				   T1S = Tn - To;
+				   Tp = Tn + To;
+				   T1P = T1o - T1p;
+				   T1q = T1o + T1p;
+				   Tr = ri[WS(is, 22)];
+				   T1r = ii[WS(is, 6)];
+				   T1s = ii[WS(is, 22)];
+			      }
+			      {
+				   E T4S, T4V, T2L, T2M;
+				   {
+					E T2G, TN, T4N, T2r, T2s, TQ, T4O, T2J, TV, T2x, TU, T4T, T2w, TW, T2A;
+					E T2B;
+					{
+					     E TO, TP, T2H, T2I;
+					     {
+						  E TL, TM, T2p, T2q, T1Q, Ts;
+						  TL = ri[WS(is, 31)];
+						  T1Q = Tq - Tr;
+						  Ts = Tq + Tr;
+						  {
+						       E T1T, T1t, T3z, T1R;
+						       T1T = T1r - T1s;
+						       T1t = T1r + T1s;
+						       T3z = T1Q + T1P;
+						       T1R = T1P - T1Q;
+						       {
+							    E T4x, T3A, T1U, T4y;
+							    T4x = Tp - Ts;
+							    Tt = Tp + Ts;
+							    T3A = T1S - T1T;
+							    T1U = T1S + T1T;
+							    T4y = T1q - T1t;
+							    T1u = T1q + T1t;
+							    T3W = FMA(KP414213562, T3z, T3A);
+							    T3B = FNMS(KP414213562, T3A, T3z);
+							    T2Y = FNMS(KP414213562, T1R, T1U);
+							    T1V = FMA(KP414213562, T1U, T1R);
+							    T52 = T4x - T4y;
+							    T4z = T4x + T4y;
+							    TM = ri[WS(is, 15)];
+						       }
+						  }
+						  T2p = ii[WS(is, 31)];
+						  T2q = ii[WS(is, 15)];
+						  TO = ri[WS(is, 7)];
+						  T2G = TL - TM;
+						  TN = TL + TM;
+						  T4N = T2p + T2q;
+						  T2r = T2p - T2q;
+						  TP = ri[WS(is, 23)];
+						  T2H = ii[WS(is, 7)];
+						  T2I = ii[WS(is, 23)];
+					     }
+					     {
+						  E TS, TT, T2u, T2v;
+						  TS = ri[WS(is, 3)];
+						  T2s = TO - TP;
+						  TQ = TO + TP;
+						  T4O = T2H + T2I;
+						  T2J = T2H - T2I;
+						  TT = ri[WS(is, 19)];
+						  T2u = ii[WS(is, 3)];
+						  T2v = ii[WS(is, 19)];
+						  TV = ri[WS(is, 27)];
+						  T2x = TS - TT;
+						  TU = TS + TT;
+						  T4T = T2u + T2v;
+						  T2w = T2u - T2v;
+						  TW = ri[WS(is, 11)];
+						  T2A = ii[WS(is, 27)];
+						  T2B = ii[WS(is, 11)];
+					     }
+					}
+					{
+					     E T2z, T4U, T2C, TR, TY, T4Q, TX;
+					     T3O = T2s + T2r;
+					     T2t = T2r - T2s;
+					     T2z = TV - TW;
+					     TX = TV + TW;
+					     T4U = T2A + T2B;
+					     T2C = T2A - T2B;
+					     T3L = T2G - T2J;
+					     T2K = T2G + T2J;
+					     T4S = TN - TQ;
+					     TR = TN + TQ;
+					     TY = TU + TX;
+					     T4Q = TX - TU;
+					     {
+						  E T4P, T5G, T5H, T2y, T2D;
+						  T4P = T4N - T4O;
+						  T5G = T4N + T4O;
+						  T5H = T4T + T4U;
+						  T4V = T4T - T4U;
+						  T5F = TR - TY;
+						  TZ = TR + TY;
+						  T5I = T5G - T5H;
+						  T5X = T5G + T5H;
+						  T2L = T2x + T2w;
+						  T2y = T2w - T2x;
+						  T2D = T2z + T2C;
+						  T2M = T2z - T2C;
+						  T4R = T4P - T4Q;
+						  T5k = T4Q + T4P;
+						  T3M = T2D - T2y;
+						  T2E = T2y + T2D;
+					     }
+					}
+				   }
+				   {
+					E T2f, Ty, T4C, T20, T21, TB, T4D, T2i, TG, T26, TF, T4I, T25, TH, T29;
+					E T2a;
+					{
+					     E Tz, TA, T2g, T2h;
+					     {
+						  E Tw, Tx, T1Y, T1Z;
+						  Tw = ri[WS(is, 1)];
+						  T5j = T4S + T4V;
+						  T4W = T4S - T4V;
+						  T3P = T2L - T2M;
+						  T2N = T2L + T2M;
+						  Tx = ri[WS(is, 17)];
+						  T1Y = ii[WS(is, 1)];
+						  T1Z = ii[WS(is, 17)];
+						  Tz = ri[WS(is, 9)];
+						  T2f = Tw - Tx;
+						  Ty = Tw + Tx;
+						  T4C = T1Y + T1Z;
+						  T20 = T1Y - T1Z;
+						  TA = ri[WS(is, 25)];
+						  T2g = ii[WS(is, 9)];
+						  T2h = ii[WS(is, 25)];
+					     }
+					     {
+						  E TD, TE, T23, T24;
+						  TD = ri[WS(is, 5)];
+						  T21 = Tz - TA;
+						  TB = Tz + TA;
+						  T4D = T2g + T2h;
+						  T2i = T2g - T2h;
+						  TE = ri[WS(is, 21)];
+						  T23 = ii[WS(is, 5)];
+						  T24 = ii[WS(is, 21)];
+						  TG = ri[WS(is, 29)];
+						  T26 = TD - TE;
+						  TF = TD + TE;
+						  T4I = T23 + T24;
+						  T25 = T23 - T24;
+						  TH = ri[WS(is, 13)];
+						  T29 = ii[WS(is, 29)];
+						  T2a = ii[WS(is, 13)];
+					     }
+					}
+					{
+					     E T28, T4J, T2b, TC, TJ, T4F, TI;
+					     T3H = T21 + T20;
+					     T22 = T20 - T21;
+					     T28 = TG - TH;
+					     TI = TG + TH;
+					     T4J = T29 + T2a;
+					     T2b = T29 - T2a;
+					     T3E = T2f - T2i;
+					     T2j = T2f + T2i;
+					     T4H = Ty - TB;
+					     TC = Ty + TB;
+					     TJ = TF + TI;
+					     T4F = TI - TF;
+					     {
+						  E T4E, T5B, T5C, T27, T2c;
+						  T4E = T4C - T4D;
+						  T5B = T4C + T4D;
+						  T5C = T4I + T4J;
+						  T4K = T4I - T4J;
+						  T5A = TC - TJ;
+						  TK = TC + TJ;
+						  T5D = T5B - T5C;
+						  T5W = T5B + T5C;
+						  T2k = T26 + T25;
+						  T27 = T25 - T26;
+						  T2c = T28 + T2b;
+						  T2l = T28 - T2b;
+						  T4G = T4E - T4F;
+						  T5h = T4F + T4E;
+						  T3F = T2c - T27;
+						  T2d = T27 + T2c;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3I, T2m, Tv, T60, T11, T10, T5Z, T1w;
+			 {
+			      E T5f, T5w, T5q, T5m, T5v, T5p;
+			      {
+				   E T5d, T5g, T5o, T4B, T5a, T5n, T5e, T56, T4Y, T57, T55;
+				   {
+					E T4X, T4M, T5b, T5c, T51, T54;
+					{
+					     E T4t, T4A, T58, T59, T4L;
+					     T5d = T4r + T4s;
+					     T4t = T4r - T4s;
+					     T5g = T4H + T4K;
+					     T4L = T4H - T4K;
+					     T3I = T2k - T2l;
+					     T2m = T2k + T2l;
+					     T4A = T4w - T4z;
+					     T5o = T4w + T4z;
+					     T4X = FNMS(KP414213562, T4W, T4R);
+					     T58 = FMA(KP414213562, T4R, T4W);
+					     T59 = FNMS(KP414213562, T4G, T4L);
+					     T4M = FMA(KP414213562, T4L, T4G);
+					     T5b = FNMS(KP707106781, T4A, T4t);
+					     T4B = FMA(KP707106781, T4A, T4t);
+					     T5c = T59 + T58;
+					     T5a = T58 - T59;
+					     T5n = T50 + T4Z;
+					     T51 = T4Z - T50;
+					     T54 = T52 - T53;
+					     T5e = T53 + T52;
+					}
+					ro[WS(os, 14)] = FNMS(KP923879532, T5c, T5b);
+					T56 = T4M + T4X;
+					T4Y = T4M - T4X;
+					T57 = FMA(KP707106781, T54, T51);
+					T55 = FNMS(KP707106781, T54, T51);
+					ro[WS(os, 30)] = FMA(KP923879532, T5c, T5b);
+				   }
+				   ro[WS(os, 6)] = FMA(KP923879532, T4Y, T4B);
+				   ro[WS(os, 22)] = FNMS(KP923879532, T4Y, T4B);
+				   io[WS(os, 6)] = FMA(KP923879532, T5a, T57);
+				   io[WS(os, 22)] = FNMS(KP923879532, T5a, T57);
+				   io[WS(os, 30)] = FMA(KP923879532, T56, T55);
+				   io[WS(os, 14)] = FNMS(KP923879532, T56, T55);
+				   {
+					E T5i, T5l, T5r, T5u, T5s, T5t;
+					T5i = FMA(KP414213562, T5h, T5g);
+					T5s = FNMS(KP414213562, T5g, T5h);
+					T5t = FMA(KP414213562, T5j, T5k);
+					T5l = FNMS(KP414213562, T5k, T5j);
+					T5r = FNMS(KP707106781, T5e, T5d);
+					T5f = FMA(KP707106781, T5e, T5d);
+					T5w = T5s + T5t;
+					T5u = T5s - T5t;
+					ro[WS(os, 26)] = FNMS(KP923879532, T5u, T5r);
+					T5q = T5l - T5i;
+					T5m = T5i + T5l;
+					T5v = FMA(KP707106781, T5o, T5n);
+					T5p = FNMS(KP707106781, T5o, T5n);
+					ro[WS(os, 10)] = FMA(KP923879532, T5u, T5r);
+				   }
+			      }
+			      ro[WS(os, 2)] = FMA(KP923879532, T5m, T5f);
+			      ro[WS(os, 18)] = FNMS(KP923879532, T5m, T5f);
+			      io[WS(os, 2)] = FMA(KP923879532, T5w, T5v);
+			      io[WS(os, 18)] = FNMS(KP923879532, T5w, T5v);
+			      io[WS(os, 10)] = FMA(KP923879532, T5q, T5p);
+			      io[WS(os, 26)] = FNMS(KP923879532, T5q, T5p);
+			      {
+				   E Tf, T1v, T5z, T5U, T1g, Tu, T5O, T5K, T5T, T5N, T5V, T5Y;
+				   {
+					E T5E, T5J, T5P, T5S, T5L, T5M;
+					{
+					     E T5x, T5y, T5Q, T5R;
+					     Tf = T7 + Te;
+					     T5x = T7 - Te;
+					     T5y = T1n - T1u;
+					     T1v = T1n + T1u;
+					     T5E = T5A + T5D;
+					     T5Q = T5D - T5A;
+					     T5R = T5F + T5I;
+					     T5J = T5F - T5I;
+					     T5P = T5x - T5y;
+					     T5z = T5x + T5y;
+					     T5U = T5Q + T5R;
+					     T5S = T5Q - T5R;
+					     T1g = T18 + T1f;
+					     T5L = T18 - T1f;
+					     T5M = Tt - Tm;
+					     Tu = Tm + Tt;
+					}
+					ro[WS(os, 28)] = FNMS(KP707106781, T5S, T5P);
+					T5O = T5J - T5E;
+					T5K = T5E + T5J;
+					T5T = T5M + T5L;
+					T5N = T5L - T5M;
+					ro[WS(os, 12)] = FMA(KP707106781, T5S, T5P);
+				   }
+				   ro[WS(os, 4)] = FMA(KP707106781, T5K, T5z);
+				   ro[WS(os, 20)] = FNMS(KP707106781, T5K, T5z);
+				   io[WS(os, 4)] = FMA(KP707106781, T5U, T5T);
+				   io[WS(os, 20)] = FNMS(KP707106781, T5U, T5T);
+				   io[WS(os, 12)] = FMA(KP707106781, T5O, T5N);
+				   io[WS(os, 28)] = FNMS(KP707106781, T5O, T5N);
+				   T5V = Tf - Tu;
+				   Tv = Tf + Tu;
+				   T60 = T5W + T5X;
+				   T5Y = T5W - T5X;
+				   ro[WS(os, 8)] = T5V + T5Y;
+				   T11 = TZ - TK;
+				   T10 = TK + TZ;
+				   T5Z = T1g + T1v;
+				   T1w = T1g - T1v;
+				   ro[WS(os, 24)] = T5V - T5Y;
+			      }
+			 }
+			 ro[0] = Tv + T10;
+			 ro[WS(os, 16)] = Tv - T10;
+			 io[0] = T5Z + T60;
+			 io[WS(os, 16)] = T5Z - T60;
+			 io[WS(os, 24)] = T1w - T11;
+			 io[WS(os, 8)] = T11 + T1w;
+			 {
+			      E T39, T3k, T3j, T3a, T3d, T3c, T47, T4i, T4h, T41, T3D, T48, T4b, T4a, T4e;
+			      E T3N, T45, T3Z, T42, T3K, T3Q, T4d;
+			      {
+				   E T2e, T37, T1X, T33, T31, T2n, T2F, T2O;
+				   {
+					E T1H, T1W, T2X, T30;
+					T39 = FMA(KP707106781, T1G, T1z);
+					T1H = FNMS(KP707106781, T1G, T1z);
+					T1W = T1O - T1V;
+					T3k = T1O + T1V;
+					T3j = FMA(KP707106781, T2W, T2T);
+					T2X = FNMS(KP707106781, T2W, T2T);
+					T30 = T2Y - T2Z;
+					T3a = T2Z + T2Y;
+					T3d = FMA(KP707106781, T2d, T22);
+					T2e = FNMS(KP707106781, T2d, T22);
+					T37 = FNMS(KP923879532, T1W, T1H);
+					T1X = FMA(KP923879532, T1W, T1H);
+					T33 = FMA(KP923879532, T30, T2X);
+					T31 = FNMS(KP923879532, T30, T2X);
+					T2n = FNMS(KP707106781, T2m, T2j);
+					T3c = FMA(KP707106781, T2m, T2j);
+					T3g = FMA(KP707106781, T2E, T2t);
+					T2F = FNMS(KP707106781, T2E, T2t);
+					T2O = FNMS(KP707106781, T2N, T2K);
+					T3f = FMA(KP707106781, T2N, T2K);
+				   }
+				   {
+					E T3V, T3Y, T3G, T3J;
+					{
+					     E T3v, T35, T2o, T34, T2P, T3C;
+					     T47 = FNMS(KP707106781, T3u, T3t);
+					     T3v = FMA(KP707106781, T3u, T3t);
+					     T35 = FNMS(KP668178637, T2e, T2n);
+					     T2o = FMA(KP668178637, T2n, T2e);
+					     T34 = FMA(KP668178637, T2F, T2O);
+					     T2P = FNMS(KP668178637, T2O, T2F);
+					     T3C = T3y - T3B;
+					     T4i = T3y + T3B;
+					     T4h = FNMS(KP707106781, T3U, T3T);
+					     T3V = FMA(KP707106781, T3U, T3T);
+					     {
+						  E T38, T36, T32, T2Q;
+						  T38 = T35 + T34;
+						  T36 = T34 - T35;
+						  T32 = T2o + T2P;
+						  T2Q = T2o - T2P;
+						  T41 = FNMS(KP923879532, T3C, T3v);
+						  T3D = FMA(KP923879532, T3C, T3v);
+						  ro[WS(os, 29)] = FMA(KP831469612, T38, T37);
+						  ro[WS(os, 13)] = FNMS(KP831469612, T38, T37);
+						  io[WS(os, 5)] = FMA(KP831469612, T36, T33);
+						  io[WS(os, 21)] = FNMS(KP831469612, T36, T33);
+						  io[WS(os, 29)] = FMA(KP831469612, T32, T31);
+						  io[WS(os, 13)] = FNMS(KP831469612, T32, T31);
+						  ro[WS(os, 5)] = FMA(KP831469612, T2Q, T1X);
+						  ro[WS(os, 21)] = FNMS(KP831469612, T2Q, T1X);
+						  T3Y = T3W - T3X;
+						  T48 = T3X + T3W;
+					     }
+					}
+					T4b = FMA(KP707106781, T3F, T3E);
+					T3G = FNMS(KP707106781, T3F, T3E);
+					T3J = FNMS(KP707106781, T3I, T3H);
+					T4a = FMA(KP707106781, T3I, T3H);
+					T4e = FMA(KP707106781, T3M, T3L);
+					T3N = FNMS(KP707106781, T3M, T3L);
+					T45 = FMA(KP923879532, T3Y, T3V);
+					T3Z = FNMS(KP923879532, T3Y, T3V);
+					T42 = FNMS(KP668178637, T3G, T3J);
+					T3K = FMA(KP668178637, T3J, T3G);
+					T3Q = FNMS(KP707106781, T3P, T3O);
+					T4d = FMA(KP707106781, T3P, T3O);
+				   }
+			      }
+			      {
+				   E T4p, T49, T4l, T4j, T4n, T4c, T43, T3R, T4m, T4f;
+				   T43 = FMA(KP668178637, T3N, T3Q);
+				   T3R = FNMS(KP668178637, T3Q, T3N);
+				   T4p = FMA(KP923879532, T48, T47);
+				   T49 = FNMS(KP923879532, T48, T47);
+				   {
+					E T44, T46, T40, T3S;
+					T44 = T42 - T43;
+					T46 = T42 + T43;
+					T40 = T3R - T3K;
+					T3S = T3K + T3R;
+					ro[WS(os, 11)] = FMA(KP831469612, T44, T41);
+					ro[WS(os, 27)] = FNMS(KP831469612, T44, T41);
+					io[WS(os, 3)] = FMA(KP831469612, T46, T45);
+					io[WS(os, 19)] = FNMS(KP831469612, T46, T45);
+					io[WS(os, 11)] = FMA(KP831469612, T40, T3Z);
+					io[WS(os, 27)] = FNMS(KP831469612, T40, T3Z);
+					ro[WS(os, 3)] = FMA(KP831469612, T3S, T3D);
+					ro[WS(os, 19)] = FNMS(KP831469612, T3S, T3D);
+				   }
+				   T4l = FNMS(KP923879532, T4i, T4h);
+				   T4j = FMA(KP923879532, T4i, T4h);
+				   T4n = FNMS(KP198912367, T4a, T4b);
+				   T4c = FMA(KP198912367, T4b, T4a);
+				   T4m = FMA(KP198912367, T4d, T4e);
+				   T4f = FNMS(KP198912367, T4e, T4d);
+				   T3n = FNMS(KP923879532, T3a, T39);
+				   T3b = FMA(KP923879532, T3a, T39);
+				   {
+					E T4q, T4o, T4k, T4g;
+					T4q = T4n + T4m;
+					T4o = T4m - T4n;
+					T4k = T4c + T4f;
+					T4g = T4c - T4f;
+					ro[WS(os, 31)] = FMA(KP980785280, T4q, T4p);
+					ro[WS(os, 15)] = FNMS(KP980785280, T4q, T4p);
+					io[WS(os, 7)] = FMA(KP980785280, T4o, T4l);
+					io[WS(os, 23)] = FNMS(KP980785280, T4o, T4l);
+					io[WS(os, 31)] = FMA(KP980785280, T4k, T4j);
+					io[WS(os, 15)] = FNMS(KP980785280, T4k, T4j);
+					ro[WS(os, 7)] = FMA(KP980785280, T4g, T49);
+					ro[WS(os, 23)] = FNMS(KP980785280, T4g, T49);
+				   }
+				   T3r = FMA(KP923879532, T3k, T3j);
+				   T3l = FNMS(KP923879532, T3k, T3j);
+				   T3o = FNMS(KP198912367, T3c, T3d);
+				   T3e = FMA(KP198912367, T3d, T3c);
+			      }
+			 }
+		    }
+	       }
+	       T3h = FNMS(KP198912367, T3g, T3f);
+	       T3p = FMA(KP198912367, T3f, T3g);
+	       {
+		    E T3s, T3q, T3i, T3m;
+		    T3s = T3o + T3p;
+		    T3q = T3o - T3p;
+		    T3i = T3e + T3h;
+		    T3m = T3h - T3e;
+		    ro[WS(os, 9)] = FMA(KP980785280, T3q, T3n);
+		    ro[WS(os, 25)] = FNMS(KP980785280, T3q, T3n);
+		    io[WS(os, 1)] = FMA(KP980785280, T3s, T3r);
+		    io[WS(os, 17)] = FNMS(KP980785280, T3s, T3r);
+		    io[WS(os, 9)] = FMA(KP980785280, T3m, T3l);
+		    io[WS(os, 25)] = FNMS(KP980785280, T3m, T3l);
+		    ro[WS(os, 1)] = FMA(KP980785280, T3i, T3b);
+		    ro[WS(os, 17)] = FNMS(KP980785280, T3i, T3b);
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 32, "n1_32", {236, 0, 136, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_32) (planner *p) {
+     X(kdft_register) (p, n1_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 32 -name n1_32 -include n.h */
+
+/*
+ * This function contains 372 FP additions, 84 FP multiplications,
+ * (or, 340 additions, 52 multiplications, 32 fused multiply/add),
+ * 100 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "n.h"
+
+static void n1_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       E T7, T4r, T4Z, T18, T1z, T3t, T3T, T2T, Te, T1f, T50, T4s, T2W, T3u, T1G;
+	       E T3U, Tm, T1n, T1O, T2Z, T3y, T3X, T4w, T53, Tt, T1u, T1V, T2Y, T3B, T3W;
+	       E T4z, T52, T2t, T3L, T3O, T2K, TR, TY, T5F, T5G, T5H, T5I, T4R, T5j, T2E;
+	       E T3P, T4W, T5k, T2N, T3M, T22, T3E, T3H, T2j, TC, TJ, T5A, T5B, T5C, T5D;
+	       E T4G, T5g, T2d, T3F, T4L, T5h, T2m, T3I;
+	       {
+		    E T3, T1x, T14, T2S, T6, T2R, T17, T1y;
+		    {
+			 E T1, T2, T12, T13;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 16)];
+			 T3 = T1 + T2;
+			 T1x = T1 - T2;
+			 T12 = ii[0];
+			 T13 = ii[WS(is, 16)];
+			 T14 = T12 + T13;
+			 T2S = T12 - T13;
+		    }
+		    {
+			 E T4, T5, T15, T16;
+			 T4 = ri[WS(is, 8)];
+			 T5 = ri[WS(is, 24)];
+			 T6 = T4 + T5;
+			 T2R = T4 - T5;
+			 T15 = ii[WS(is, 8)];
+			 T16 = ii[WS(is, 24)];
+			 T17 = T15 + T16;
+			 T1y = T15 - T16;
+		    }
+		    T7 = T3 + T6;
+		    T4r = T3 - T6;
+		    T4Z = T14 - T17;
+		    T18 = T14 + T17;
+		    T1z = T1x - T1y;
+		    T3t = T1x + T1y;
+		    T3T = T2S - T2R;
+		    T2T = T2R + T2S;
+	       }
+	       {
+		    E Ta, T1B, T1b, T1A, Td, T1D, T1e, T1E;
+		    {
+			 E T8, T9, T19, T1a;
+			 T8 = ri[WS(is, 4)];
+			 T9 = ri[WS(is, 20)];
+			 Ta = T8 + T9;
+			 T1B = T8 - T9;
+			 T19 = ii[WS(is, 4)];
+			 T1a = ii[WS(is, 20)];
+			 T1b = T19 + T1a;
+			 T1A = T19 - T1a;
+		    }
+		    {
+			 E Tb, Tc, T1c, T1d;
+			 Tb = ri[WS(is, 28)];
+			 Tc = ri[WS(is, 12)];
+			 Td = Tb + Tc;
+			 T1D = Tb - Tc;
+			 T1c = ii[WS(is, 28)];
+			 T1d = ii[WS(is, 12)];
+			 T1e = T1c + T1d;
+			 T1E = T1c - T1d;
+		    }
+		    Te = Ta + Td;
+		    T1f = T1b + T1e;
+		    T50 = Td - Ta;
+		    T4s = T1b - T1e;
+		    {
+			 E T2U, T2V, T1C, T1F;
+			 T2U = T1D - T1E;
+			 T2V = T1B + T1A;
+			 T2W = KP707106781 * (T2U - T2V);
+			 T3u = KP707106781 * (T2V + T2U);
+			 T1C = T1A - T1B;
+			 T1F = T1D + T1E;
+			 T1G = KP707106781 * (T1C - T1F);
+			 T3U = KP707106781 * (T1C + T1F);
+		    }
+	       }
+	       {
+		    E Ti, T1L, T1j, T1J, Tl, T1I, T1m, T1M, T1K, T1N;
+		    {
+			 E Tg, Th, T1h, T1i;
+			 Tg = ri[WS(is, 2)];
+			 Th = ri[WS(is, 18)];
+			 Ti = Tg + Th;
+			 T1L = Tg - Th;
+			 T1h = ii[WS(is, 2)];
+			 T1i = ii[WS(is, 18)];
+			 T1j = T1h + T1i;
+			 T1J = T1h - T1i;
+		    }
+		    {
+			 E Tj, Tk, T1k, T1l;
+			 Tj = ri[WS(is, 10)];
+			 Tk = ri[WS(is, 26)];
+			 Tl = Tj + Tk;
+			 T1I = Tj - Tk;
+			 T1k = ii[WS(is, 10)];
+			 T1l = ii[WS(is, 26)];
+			 T1m = T1k + T1l;
+			 T1M = T1k - T1l;
+		    }
+		    Tm = Ti + Tl;
+		    T1n = T1j + T1m;
+		    T1K = T1I + T1J;
+		    T1N = T1L - T1M;
+		    T1O = FNMS(KP923879532, T1N, KP382683432 * T1K);
+		    T2Z = FMA(KP923879532, T1K, KP382683432 * T1N);
+		    {
+			 E T3w, T3x, T4u, T4v;
+			 T3w = T1J - T1I;
+			 T3x = T1L + T1M;
+			 T3y = FNMS(KP382683432, T3x, KP923879532 * T3w);
+			 T3X = FMA(KP382683432, T3w, KP923879532 * T3x);
+			 T4u = T1j - T1m;
+			 T4v = Ti - Tl;
+			 T4w = T4u - T4v;
+			 T53 = T4v + T4u;
+		    }
+	       }
+	       {
+		    E Tp, T1S, T1q, T1Q, Ts, T1P, T1t, T1T, T1R, T1U;
+		    {
+			 E Tn, To, T1o, T1p;
+			 Tn = ri[WS(is, 30)];
+			 To = ri[WS(is, 14)];
+			 Tp = Tn + To;
+			 T1S = Tn - To;
+			 T1o = ii[WS(is, 30)];
+			 T1p = ii[WS(is, 14)];
+			 T1q = T1o + T1p;
+			 T1Q = T1o - T1p;
+		    }
+		    {
+			 E Tq, Tr, T1r, T1s;
+			 Tq = ri[WS(is, 6)];
+			 Tr = ri[WS(is, 22)];
+			 Ts = Tq + Tr;
+			 T1P = Tq - Tr;
+			 T1r = ii[WS(is, 6)];
+			 T1s = ii[WS(is, 22)];
+			 T1t = T1r + T1s;
+			 T1T = T1r - T1s;
+		    }
+		    Tt = Tp + Ts;
+		    T1u = T1q + T1t;
+		    T1R = T1P + T1Q;
+		    T1U = T1S - T1T;
+		    T1V = FMA(KP382683432, T1R, KP923879532 * T1U);
+		    T2Y = FNMS(KP923879532, T1R, KP382683432 * T1U);
+		    {
+			 E T3z, T3A, T4x, T4y;
+			 T3z = T1Q - T1P;
+			 T3A = T1S + T1T;
+			 T3B = FMA(KP923879532, T3z, KP382683432 * T3A);
+			 T3W = FNMS(KP382683432, T3z, KP923879532 * T3A);
+			 T4x = Tp - Ts;
+			 T4y = T1q - T1t;
+			 T4z = T4x + T4y;
+			 T52 = T4x - T4y;
+		    }
+	       }
+	       {
+		    E TN, T2p, T2J, T4S, TQ, T2G, T2s, T4T, TU, T2x, T2w, T4O, TX, T2z, T2C;
+		    E T4P;
+		    {
+			 E TL, TM, T2H, T2I;
+			 TL = ri[WS(is, 31)];
+			 TM = ri[WS(is, 15)];
+			 TN = TL + TM;
+			 T2p = TL - TM;
+			 T2H = ii[WS(is, 31)];
+			 T2I = ii[WS(is, 15)];
+			 T2J = T2H - T2I;
+			 T4S = T2H + T2I;
+		    }
+		    {
+			 E TO, TP, T2q, T2r;
+			 TO = ri[WS(is, 7)];
+			 TP = ri[WS(is, 23)];
+			 TQ = TO + TP;
+			 T2G = TO - TP;
+			 T2q = ii[WS(is, 7)];
+			 T2r = ii[WS(is, 23)];
+			 T2s = T2q - T2r;
+			 T4T = T2q + T2r;
+		    }
+		    {
+			 E TS, TT, T2u, T2v;
+			 TS = ri[WS(is, 3)];
+			 TT = ri[WS(is, 19)];
+			 TU = TS + TT;
+			 T2x = TS - TT;
+			 T2u = ii[WS(is, 3)];
+			 T2v = ii[WS(is, 19)];
+			 T2w = T2u - T2v;
+			 T4O = T2u + T2v;
+		    }
+		    {
+			 E TV, TW, T2A, T2B;
+			 TV = ri[WS(is, 27)];
+			 TW = ri[WS(is, 11)];
+			 TX = TV + TW;
+			 T2z = TV - TW;
+			 T2A = ii[WS(is, 27)];
+			 T2B = ii[WS(is, 11)];
+			 T2C = T2A - T2B;
+			 T4P = T2A + T2B;
+		    }
+		    T2t = T2p - T2s;
+		    T3L = T2p + T2s;
+		    T3O = T2J - T2G;
+		    T2K = T2G + T2J;
+		    TR = TN + TQ;
+		    TY = TU + TX;
+		    T5F = TR - TY;
+		    {
+			 E T4N, T4Q, T2y, T2D;
+			 T5G = T4S + T4T;
+			 T5H = T4O + T4P;
+			 T5I = T5G - T5H;
+			 T4N = TN - TQ;
+			 T4Q = T4O - T4P;
+			 T4R = T4N - T4Q;
+			 T5j = T4N + T4Q;
+			 T2y = T2w - T2x;
+			 T2D = T2z + T2C;
+			 T2E = KP707106781 * (T2y - T2D);
+			 T3P = KP707106781 * (T2y + T2D);
+			 {
+			      E T4U, T4V, T2L, T2M;
+			      T4U = T4S - T4T;
+			      T4V = TX - TU;
+			      T4W = T4U - T4V;
+			      T5k = T4V + T4U;
+			      T2L = T2z - T2C;
+			      T2M = T2x + T2w;
+			      T2N = KP707106781 * (T2L - T2M);
+			      T3M = KP707106781 * (T2M + T2L);
+			 }
+		    }
+	       }
+	       {
+		    E Ty, T2f, T21, T4C, TB, T1Y, T2i, T4D, TF, T28, T2b, T4I, TI, T23, T26;
+		    E T4J;
+		    {
+			 E Tw, Tx, T1Z, T20;
+			 Tw = ri[WS(is, 1)];
+			 Tx = ri[WS(is, 17)];
+			 Ty = Tw + Tx;
+			 T2f = Tw - Tx;
+			 T1Z = ii[WS(is, 1)];
+			 T20 = ii[WS(is, 17)];
+			 T21 = T1Z - T20;
+			 T4C = T1Z + T20;
+		    }
+		    {
+			 E Tz, TA, T2g, T2h;
+			 Tz = ri[WS(is, 9)];
+			 TA = ri[WS(is, 25)];
+			 TB = Tz + TA;
+			 T1Y = Tz - TA;
+			 T2g = ii[WS(is, 9)];
+			 T2h = ii[WS(is, 25)];
+			 T2i = T2g - T2h;
+			 T4D = T2g + T2h;
+		    }
+		    {
+			 E TD, TE, T29, T2a;
+			 TD = ri[WS(is, 5)];
+			 TE = ri[WS(is, 21)];
+			 TF = TD + TE;
+			 T28 = TD - TE;
+			 T29 = ii[WS(is, 5)];
+			 T2a = ii[WS(is, 21)];
+			 T2b = T29 - T2a;
+			 T4I = T29 + T2a;
+		    }
+		    {
+			 E TG, TH, T24, T25;
+			 TG = ri[WS(is, 29)];
+			 TH = ri[WS(is, 13)];
+			 TI = TG + TH;
+			 T23 = TG - TH;
+			 T24 = ii[WS(is, 29)];
+			 T25 = ii[WS(is, 13)];
+			 T26 = T24 - T25;
+			 T4J = T24 + T25;
+		    }
+		    T22 = T1Y + T21;
+		    T3E = T2f + T2i;
+		    T3H = T21 - T1Y;
+		    T2j = T2f - T2i;
+		    TC = Ty + TB;
+		    TJ = TF + TI;
+		    T5A = TC - TJ;
+		    {
+			 E T4E, T4F, T27, T2c;
+			 T5B = T4C + T4D;
+			 T5C = T4I + T4J;
+			 T5D = T5B - T5C;
+			 T4E = T4C - T4D;
+			 T4F = TI - TF;
+			 T4G = T4E - T4F;
+			 T5g = T4F + T4E;
+			 T27 = T23 - T26;
+			 T2c = T28 + T2b;
+			 T2d = KP707106781 * (T27 - T2c);
+			 T3F = KP707106781 * (T2c + T27);
+			 {
+			      E T4H, T4K, T2k, T2l;
+			      T4H = Ty - TB;
+			      T4K = T4I - T4J;
+			      T4L = T4H - T4K;
+			      T5h = T4H + T4K;
+			      T2k = T2b - T28;
+			      T2l = T23 + T26;
+			      T2m = KP707106781 * (T2k - T2l);
+			      T3I = KP707106781 * (T2k + T2l);
+			 }
+		    }
+	       }
+	       {
+		    E T4B, T57, T5a, T5c, T4Y, T56, T55, T5b;
+		    {
+			 E T4t, T4A, T58, T59;
+			 T4t = T4r - T4s;
+			 T4A = KP707106781 * (T4w - T4z);
+			 T4B = T4t + T4A;
+			 T57 = T4t - T4A;
+			 T58 = FNMS(KP923879532, T4L, KP382683432 * T4G);
+			 T59 = FMA(KP382683432, T4W, KP923879532 * T4R);
+			 T5a = T58 - T59;
+			 T5c = T58 + T59;
+		    }
+		    {
+			 E T4M, T4X, T51, T54;
+			 T4M = FMA(KP923879532, T4G, KP382683432 * T4L);
+			 T4X = FNMS(KP923879532, T4W, KP382683432 * T4R);
+			 T4Y = T4M + T4X;
+			 T56 = T4X - T4M;
+			 T51 = T4Z - T50;
+			 T54 = KP707106781 * (T52 - T53);
+			 T55 = T51 - T54;
+			 T5b = T51 + T54;
+		    }
+		    ro[WS(os, 22)] = T4B - T4Y;
+		    io[WS(os, 22)] = T5b - T5c;
+		    ro[WS(os, 6)] = T4B + T4Y;
+		    io[WS(os, 6)] = T5b + T5c;
+		    io[WS(os, 30)] = T55 - T56;
+		    ro[WS(os, 30)] = T57 - T5a;
+		    io[WS(os, 14)] = T55 + T56;
+		    ro[WS(os, 14)] = T57 + T5a;
+	       }
+	       {
+		    E T5f, T5r, T5u, T5w, T5m, T5q, T5p, T5v;
+		    {
+			 E T5d, T5e, T5s, T5t;
+			 T5d = T4r + T4s;
+			 T5e = KP707106781 * (T53 + T52);
+			 T5f = T5d + T5e;
+			 T5r = T5d - T5e;
+			 T5s = FNMS(KP382683432, T5h, KP923879532 * T5g);
+			 T5t = FMA(KP923879532, T5k, KP382683432 * T5j);
+			 T5u = T5s - T5t;
+			 T5w = T5s + T5t;
+		    }
+		    {
+			 E T5i, T5l, T5n, T5o;
+			 T5i = FMA(KP382683432, T5g, KP923879532 * T5h);
+			 T5l = FNMS(KP382683432, T5k, KP923879532 * T5j);
+			 T5m = T5i + T5l;
+			 T5q = T5l - T5i;
+			 T5n = T50 + T4Z;
+			 T5o = KP707106781 * (T4w + T4z);
+			 T5p = T5n - T5o;
+			 T5v = T5n + T5o;
+		    }
+		    ro[WS(os, 18)] = T5f - T5m;
+		    io[WS(os, 18)] = T5v - T5w;
+		    ro[WS(os, 2)] = T5f + T5m;
+		    io[WS(os, 2)] = T5v + T5w;
+		    io[WS(os, 26)] = T5p - T5q;
+		    ro[WS(os, 26)] = T5r - T5u;
+		    io[WS(os, 10)] = T5p + T5q;
+		    ro[WS(os, 10)] = T5r + T5u;
+	       }
+	       {
+		    E T5z, T5P, T5S, T5U, T5K, T5O, T5N, T5T;
+		    {
+			 E T5x, T5y, T5Q, T5R;
+			 T5x = T7 - Te;
+			 T5y = T1n - T1u;
+			 T5z = T5x + T5y;
+			 T5P = T5x - T5y;
+			 T5Q = T5D - T5A;
+			 T5R = T5F + T5I;
+			 T5S = KP707106781 * (T5Q - T5R);
+			 T5U = KP707106781 * (T5Q + T5R);
+		    }
+		    {
+			 E T5E, T5J, T5L, T5M;
+			 T5E = T5A + T5D;
+			 T5J = T5F - T5I;
+			 T5K = KP707106781 * (T5E + T5J);
+			 T5O = KP707106781 * (T5J - T5E);
+			 T5L = T18 - T1f;
+			 T5M = Tt - Tm;
+			 T5N = T5L - T5M;
+			 T5T = T5M + T5L;
+		    }
+		    ro[WS(os, 20)] = T5z - T5K;
+		    io[WS(os, 20)] = T5T - T5U;
+		    ro[WS(os, 4)] = T5z + T5K;
+		    io[WS(os, 4)] = T5T + T5U;
+		    io[WS(os, 28)] = T5N - T5O;
+		    ro[WS(os, 28)] = T5P - T5S;
+		    io[WS(os, 12)] = T5N + T5O;
+		    ro[WS(os, 12)] = T5P + T5S;
+	       }
+	       {
+		    E Tv, T5V, T5Y, T60, T10, T11, T1w, T5Z;
+		    {
+			 E Tf, Tu, T5W, T5X;
+			 Tf = T7 + Te;
+			 Tu = Tm + Tt;
+			 Tv = Tf + Tu;
+			 T5V = Tf - Tu;
+			 T5W = T5B + T5C;
+			 T5X = T5G + T5H;
+			 T5Y = T5W - T5X;
+			 T60 = T5W + T5X;
+		    }
+		    {
+			 E TK, TZ, T1g, T1v;
+			 TK = TC + TJ;
+			 TZ = TR + TY;
+			 T10 = TK + TZ;
+			 T11 = TZ - TK;
+			 T1g = T18 + T1f;
+			 T1v = T1n + T1u;
+			 T1w = T1g - T1v;
+			 T5Z = T1g + T1v;
+		    }
+		    ro[WS(os, 16)] = Tv - T10;
+		    io[WS(os, 16)] = T5Z - T60;
+		    ro[0] = Tv + T10;
+		    io[0] = T5Z + T60;
+		    io[WS(os, 8)] = T11 + T1w;
+		    ro[WS(os, 8)] = T5V + T5Y;
+		    io[WS(os, 24)] = T1w - T11;
+		    ro[WS(os, 24)] = T5V - T5Y;
+	       }
+	       {
+		    E T1X, T33, T31, T37, T2o, T34, T2P, T35;
+		    {
+			 E T1H, T1W, T2X, T30;
+			 T1H = T1z - T1G;
+			 T1W = T1O - T1V;
+			 T1X = T1H + T1W;
+			 T33 = T1H - T1W;
+			 T2X = T2T - T2W;
+			 T30 = T2Y - T2Z;
+			 T31 = T2X - T30;
+			 T37 = T2X + T30;
+		    }
+		    {
+			 E T2e, T2n, T2F, T2O;
+			 T2e = T22 - T2d;
+			 T2n = T2j - T2m;
+			 T2o = FMA(KP980785280, T2e, KP195090322 * T2n);
+			 T34 = FNMS(KP980785280, T2n, KP195090322 * T2e);
+			 T2F = T2t - T2E;
+			 T2O = T2K - T2N;
+			 T2P = FNMS(KP980785280, T2O, KP195090322 * T2F);
+			 T35 = FMA(KP195090322, T2O, KP980785280 * T2F);
+		    }
+		    {
+			 E T2Q, T38, T32, T36;
+			 T2Q = T2o + T2P;
+			 ro[WS(os, 23)] = T1X - T2Q;
+			 ro[WS(os, 7)] = T1X + T2Q;
+			 T38 = T34 + T35;
+			 io[WS(os, 23)] = T37 - T38;
+			 io[WS(os, 7)] = T37 + T38;
+			 T32 = T2P - T2o;
+			 io[WS(os, 31)] = T31 - T32;
+			 io[WS(os, 15)] = T31 + T32;
+			 T36 = T34 - T35;
+			 ro[WS(os, 31)] = T33 - T36;
+			 ro[WS(os, 15)] = T33 + T36;
+		    }
+	       }
+	       {
+		    E T3D, T41, T3Z, T45, T3K, T42, T3R, T43;
+		    {
+			 E T3v, T3C, T3V, T3Y;
+			 T3v = T3t - T3u;
+			 T3C = T3y - T3B;
+			 T3D = T3v + T3C;
+			 T41 = T3v - T3C;
+			 T3V = T3T - T3U;
+			 T3Y = T3W - T3X;
+			 T3Z = T3V - T3Y;
+			 T45 = T3V + T3Y;
+		    }
+		    {
+			 E T3G, T3J, T3N, T3Q;
+			 T3G = T3E - T3F;
+			 T3J = T3H - T3I;
+			 T3K = FMA(KP555570233, T3G, KP831469612 * T3J);
+			 T42 = FNMS(KP831469612, T3G, KP555570233 * T3J);
+			 T3N = T3L - T3M;
+			 T3Q = T3O - T3P;
+			 T3R = FNMS(KP831469612, T3Q, KP555570233 * T3N);
+			 T43 = FMA(KP831469612, T3N, KP555570233 * T3Q);
+		    }
+		    {
+			 E T3S, T46, T40, T44;
+			 T3S = T3K + T3R;
+			 ro[WS(os, 21)] = T3D - T3S;
+			 ro[WS(os, 5)] = T3D + T3S;
+			 T46 = T42 + T43;
+			 io[WS(os, 21)] = T45 - T46;
+			 io[WS(os, 5)] = T45 + T46;
+			 T40 = T3R - T3K;
+			 io[WS(os, 29)] = T3Z - T40;
+			 io[WS(os, 13)] = T3Z + T40;
+			 T44 = T42 - T43;
+			 ro[WS(os, 29)] = T41 - T44;
+			 ro[WS(os, 13)] = T41 + T44;
+		    }
+	       }
+	       {
+		    E T49, T4l, T4j, T4p, T4c, T4m, T4f, T4n;
+		    {
+			 E T47, T48, T4h, T4i;
+			 T47 = T3t + T3u;
+			 T48 = T3X + T3W;
+			 T49 = T47 + T48;
+			 T4l = T47 - T48;
+			 T4h = T3T + T3U;
+			 T4i = T3y + T3B;
+			 T4j = T4h - T4i;
+			 T4p = T4h + T4i;
+		    }
+		    {
+			 E T4a, T4b, T4d, T4e;
+			 T4a = T3E + T3F;
+			 T4b = T3H + T3I;
+			 T4c = FMA(KP980785280, T4a, KP195090322 * T4b);
+			 T4m = FNMS(KP195090322, T4a, KP980785280 * T4b);
+			 T4d = T3L + T3M;
+			 T4e = T3O + T3P;
+			 T4f = FNMS(KP195090322, T4e, KP980785280 * T4d);
+			 T4n = FMA(KP195090322, T4d, KP980785280 * T4e);
+		    }
+		    {
+			 E T4g, T4q, T4k, T4o;
+			 T4g = T4c + T4f;
+			 ro[WS(os, 17)] = T49 - T4g;
+			 ro[WS(os, 1)] = T49 + T4g;
+			 T4q = T4m + T4n;
+			 io[WS(os, 17)] = T4p - T4q;
+			 io[WS(os, 1)] = T4p + T4q;
+			 T4k = T4f - T4c;
+			 io[WS(os, 25)] = T4j - T4k;
+			 io[WS(os, 9)] = T4j + T4k;
+			 T4o = T4m - T4n;
+			 ro[WS(os, 25)] = T4l - T4o;
+			 ro[WS(os, 9)] = T4l + T4o;
+		    }
+	       }
+	       {
+		    E T3b, T3n, T3l, T3r, T3e, T3o, T3h, T3p;
+		    {
+			 E T39, T3a, T3j, T3k;
+			 T39 = T1z + T1G;
+			 T3a = T2Z + T2Y;
+			 T3b = T39 + T3a;
+			 T3n = T39 - T3a;
+			 T3j = T2T + T2W;
+			 T3k = T1O + T1V;
+			 T3l = T3j - T3k;
+			 T3r = T3j + T3k;
+		    }
+		    {
+			 E T3c, T3d, T3f, T3g;
+			 T3c = T22 + T2d;
+			 T3d = T2j + T2m;
+			 T3e = FMA(KP555570233, T3c, KP831469612 * T3d);
+			 T3o = FNMS(KP555570233, T3d, KP831469612 * T3c);
+			 T3f = T2t + T2E;
+			 T3g = T2K + T2N;
+			 T3h = FNMS(KP555570233, T3g, KP831469612 * T3f);
+			 T3p = FMA(KP831469612, T3g, KP555570233 * T3f);
+		    }
+		    {
+			 E T3i, T3s, T3m, T3q;
+			 T3i = T3e + T3h;
+			 ro[WS(os, 19)] = T3b - T3i;
+			 ro[WS(os, 3)] = T3b + T3i;
+			 T3s = T3o + T3p;
+			 io[WS(os, 19)] = T3r - T3s;
+			 io[WS(os, 3)] = T3r + T3s;
+			 T3m = T3h - T3e;
+			 io[WS(os, 27)] = T3l - T3m;
+			 io[WS(os, 11)] = T3l + T3m;
+			 T3q = T3o - T3p;
+			 ro[WS(os, 27)] = T3n - T3q;
+			 ro[WS(os, 11)] = T3n + T3q;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 32, "n1_32", {340, 52, 32, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_32) (planner *p) {
+     X(kdft_register) (p, n1_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include n.h */
+
+/*
+ * This function contains 16 FP additions, 0 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "n.h"
+
+static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T4, Tb, T3, Tf, T9, T5, Tc, Td;
+	       {
+		    E T1, T2, T7, T8;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 2)];
+		    T7 = ii[0];
+		    T8 = ii[WS(is, 2)];
+		    T4 = ri[WS(is, 1)];
+		    Tb = T1 - T2;
+		    T3 = T1 + T2;
+		    Tf = T7 + T8;
+		    T9 = T7 - T8;
+		    T5 = ri[WS(is, 3)];
+		    Tc = ii[WS(is, 1)];
+		    Td = ii[WS(is, 3)];
+	       }
+	       {
+		    E T6, Ta, Te, Tg;
+		    T6 = T4 + T5;
+		    Ta = T4 - T5;
+		    Te = Tc - Td;
+		    Tg = Tc + Td;
+		    io[WS(os, 3)] = Ta + T9;
+		    io[WS(os, 1)] = T9 - Ta;
+		    ro[0] = T3 + T6;
+		    ro[WS(os, 2)] = T3 - T6;
+		    io[0] = Tf + Tg;
+		    io[WS(os, 2)] = Tf - Tg;
+		    ro[WS(os, 3)] = Tb - Te;
+		    ro[WS(os, 1)] = Tb + Te;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 4, "n1_4", {16, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_4) (planner *p) {
+     X(kdft_register) (p, n1_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include n.h */
+
+/*
+ * This function contains 16 FP additions, 0 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "n.h"
+
+static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T3, Tb, T9, Tf, T6, Ta, Te, Tg;
+	       {
+		    E T1, T2, T7, T8;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 2)];
+		    T3 = T1 + T2;
+		    Tb = T1 - T2;
+		    T7 = ii[0];
+		    T8 = ii[WS(is, 2)];
+		    T9 = T7 - T8;
+		    Tf = T7 + T8;
+	       }
+	       {
+		    E T4, T5, Tc, Td;
+		    T4 = ri[WS(is, 1)];
+		    T5 = ri[WS(is, 3)];
+		    T6 = T4 + T5;
+		    Ta = T4 - T5;
+		    Tc = ii[WS(is, 1)];
+		    Td = ii[WS(is, 3)];
+		    Te = Tc - Td;
+		    Tg = Tc + Td;
+	       }
+	       ro[WS(os, 2)] = T3 - T6;
+	       io[WS(os, 2)] = Tf - Tg;
+	       ro[0] = T3 + T6;
+	       io[0] = Tf + Tg;
+	       io[WS(os, 1)] = T9 - Ta;
+	       ro[WS(os, 1)] = Tb + Te;
+	       io[WS(os, 3)] = Ta + T9;
+	       ro[WS(os, 3)] = Tb - Te;
+	  }
+     }
+}
+
+static const kdft_desc desc = { 4, "n1_4", {16, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_4) (planner *p) {
+     X(kdft_register) (p, n1_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include n.h */
+
+/*
+ * This function contains 32 FP additions, 18 FP multiplications,
+ * (or, 14 additions, 0 multiplications, 18 fused multiply/add),
+ * 37 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n.h"
+
+static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       E Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
+	       {
+		    E T1, Tl, Ts, Tt, T8, Ta, Te, Tm, Tn, Th, To, T9;
+		    T1 = ri[0];
+		    Tl = ii[0];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = ri[WS(is, 1)];
+			 T3 = ri[WS(is, 4)];
+			 T5 = ri[WS(is, 2)];
+			 T6 = ri[WS(is, 3)];
+			 {
+			      E Tc, T4, T7, Td, Tf, Tg;
+			      Tc = ii[WS(is, 1)];
+			      Ts = T2 - T3;
+			      T4 = T2 + T3;
+			      Tt = T5 - T6;
+			      T7 = T5 + T6;
+			      Td = ii[WS(is, 4)];
+			      Tf = ii[WS(is, 2)];
+			      Tg = ii[WS(is, 3)];
+			      T8 = T4 + T7;
+			      Ta = T4 - T7;
+			      Te = Tc - Td;
+			      Tm = Tc + Td;
+			      Tn = Tf + Tg;
+			      Th = Tf - Tg;
+			 }
+		    }
+		    ro[0] = T1 + T8;
+		    To = Tm + Tn;
+		    Tq = Tm - Tn;
+		    Ti = FMA(KP618033988, Th, Te);
+		    Tk = FNMS(KP618033988, Te, Th);
+		    io[0] = Tl + To;
+		    T9 = FNMS(KP250000000, T8, T1);
+		    Tu = FMA(KP618033988, Tt, Ts);
+		    Tw = FNMS(KP618033988, Ts, Tt);
+		    Tp = FNMS(KP250000000, To, Tl);
+		    Tb = FMA(KP559016994, Ta, T9);
+		    Tj = FNMS(KP559016994, Ta, T9);
+	       }
+	       Tr = FMA(KP559016994, Tq, Tp);
+	       Tv = FNMS(KP559016994, Tq, Tp);
+	       ro[WS(os, 2)] = FNMS(KP951056516, Tk, Tj);
+	       ro[WS(os, 3)] = FMA(KP951056516, Tk, Tj);
+	       ro[WS(os, 1)] = FMA(KP951056516, Ti, Tb);
+	       ro[WS(os, 4)] = FNMS(KP951056516, Ti, Tb);
+	       io[WS(os, 2)] = FMA(KP951056516, Tw, Tv);
+	       io[WS(os, 3)] = FNMS(KP951056516, Tw, Tv);
+	       io[WS(os, 4)] = FMA(KP951056516, Tu, Tr);
+	       io[WS(os, 1)] = FNMS(KP951056516, Tu, Tr);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 5, "n1_5", {14, 0, 18, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_5) (planner *p) {
+     X(kdft_register) (p, n1_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include n.h */
+
+/*
+ * This function contains 32 FP additions, 12 FP multiplications,
+ * (or, 26 additions, 6 multiplications, 6 fused multiply/add),
+ * 21 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n.h"
+
+static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       E T1, To, T8, Tt, T9, Ts, Te, Tp, Th, Tn;
+	       T1 = ri[0];
+	       To = ii[0];
+	       {
+		    E T2, T3, T4, T5, T6, T7;
+		    T2 = ri[WS(is, 1)];
+		    T3 = ri[WS(is, 4)];
+		    T4 = T2 + T3;
+		    T5 = ri[WS(is, 2)];
+		    T6 = ri[WS(is, 3)];
+		    T7 = T5 + T6;
+		    T8 = T4 + T7;
+		    Tt = T5 - T6;
+		    T9 = KP559016994 * (T4 - T7);
+		    Ts = T2 - T3;
+	       }
+	       {
+		    E Tc, Td, Tl, Tf, Tg, Tm;
+		    Tc = ii[WS(is, 1)];
+		    Td = ii[WS(is, 4)];
+		    Tl = Tc + Td;
+		    Tf = ii[WS(is, 2)];
+		    Tg = ii[WS(is, 3)];
+		    Tm = Tf + Tg;
+		    Te = Tc - Td;
+		    Tp = Tl + Tm;
+		    Th = Tf - Tg;
+		    Tn = KP559016994 * (Tl - Tm);
+	       }
+	       ro[0] = T1 + T8;
+	       io[0] = To + Tp;
+	       {
+		    E Ti, Tk, Tb, Tj, Ta;
+		    Ti = FMA(KP951056516, Te, KP587785252 * Th);
+		    Tk = FNMS(KP587785252, Te, KP951056516 * Th);
+		    Ta = FNMS(KP250000000, T8, T1);
+		    Tb = T9 + Ta;
+		    Tj = Ta - T9;
+		    ro[WS(os, 4)] = Tb - Ti;
+		    ro[WS(os, 3)] = Tj + Tk;
+		    ro[WS(os, 1)] = Tb + Ti;
+		    ro[WS(os, 2)] = Tj - Tk;
+	       }
+	       {
+		    E Tu, Tv, Tr, Tw, Tq;
+		    Tu = FMA(KP951056516, Ts, KP587785252 * Tt);
+		    Tv = FNMS(KP587785252, Ts, KP951056516 * Tt);
+		    Tq = FNMS(KP250000000, Tp, To);
+		    Tr = Tn + Tq;
+		    Tw = Tq - Tn;
+		    io[WS(os, 1)] = Tr - Tu;
+		    io[WS(os, 3)] = Tw - Tv;
+		    io[WS(os, 4)] = Tu + Tr;
+		    io[WS(os, 2)] = Tv + Tw;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 5, "n1_5", {26, 6, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_5) (planner *p) {
+     X(kdft_register) (p, n1_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include n.h */
+
+/*
+ * This function contains 36 FP additions, 12 FP multiplications,
+ * (or, 24 additions, 0 multiplications, 12 fused multiply/add),
+ * 30 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n.h"
+
+static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       E TA, Tz;
+	       {
+		    E Tb, T3, Tx, Tp, Tj, Te, Ts, Ta, Tu, Ti, Tk;
+		    {
+			 E T1, T2, Tn, To;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 3)];
+			 Tn = ii[0];
+			 To = ii[WS(is, 3)];
+			 {
+			      E T4, T5, T7, T8;
+			      T4 = ri[WS(is, 2)];
+			      Tb = T1 + T2;
+			      T3 = T1 - T2;
+			      Tx = Tn + To;
+			      Tp = Tn - To;
+			      T5 = ri[WS(is, 5)];
+			      T7 = ri[WS(is, 4)];
+			      T8 = ri[WS(is, 1)];
+			      {
+				   E Tg, Tc, T6, Td, T9, Th;
+				   Tg = ii[WS(is, 2)];
+				   Tc = T4 + T5;
+				   T6 = T4 - T5;
+				   Td = T7 + T8;
+				   T9 = T7 - T8;
+				   Th = ii[WS(is, 5)];
+				   Tj = ii[WS(is, 4)];
+				   Te = Tc + Td;
+				   TA = Td - Tc;
+				   Ts = T9 - T6;
+				   Ta = T6 + T9;
+				   Tu = Tg + Th;
+				   Ti = Tg - Th;
+				   Tk = ii[WS(is, 1)];
+			      }
+			 }
+		    }
+		    ro[WS(os, 3)] = T3 + Ta;
+		    ro[0] = Tb + Te;
+		    {
+			 E Tf, Tv, Tl, Ty, Tr;
+			 Tf = FNMS(KP500000000, Ta, T3);
+			 Tv = Tj + Tk;
+			 Tl = Tj - Tk;
+			 {
+			      E Tt, Tw, Tq, Tm;
+			      Tt = FNMS(KP500000000, Te, Tb);
+			      Ty = Tu + Tv;
+			      Tw = Tu - Tv;
+			      Tq = Ti + Tl;
+			      Tm = Ti - Tl;
+			      io[0] = Tx + Ty;
+			      ro[WS(os, 1)] = FMA(KP866025403, Tm, Tf);
+			      ro[WS(os, 5)] = FNMS(KP866025403, Tm, Tf);
+			      Tr = FNMS(KP500000000, Tq, Tp);
+			      io[WS(os, 3)] = Tp + Tq;
+			      ro[WS(os, 2)] = FNMS(KP866025403, Tw, Tt);
+			      ro[WS(os, 4)] = FMA(KP866025403, Tw, Tt);
+			 }
+			 io[WS(os, 5)] = FNMS(KP866025403, Ts, Tr);
+			 io[WS(os, 1)] = FMA(KP866025403, Ts, Tr);
+			 Tz = FNMS(KP500000000, Ty, Tx);
+		    }
+	       }
+	       io[WS(os, 4)] = FMA(KP866025403, TA, Tz);
+	       io[WS(os, 2)] = FNMS(KP866025403, TA, Tz);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 6, "n1_6", {24, 0, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_6) (planner *p) {
+     X(kdft_register) (p, n1_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include n.h */
+
+/*
+ * This function contains 36 FP additions, 8 FP multiplications,
+ * (or, 32 additions, 4 multiplications, 4 fused multiply/add),
+ * 23 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n.h"
+
+static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       E T3, Tb, Tq, Tx, T6, Tc, T9, Td, Ta, Te, Ti, Tu, Tl, Tv, Tr;
+	       E Ty;
+	       {
+		    E T1, T2, To, Tp;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 3)];
+		    T3 = T1 - T2;
+		    Tb = T1 + T2;
+		    To = ii[0];
+		    Tp = ii[WS(is, 3)];
+		    Tq = To - Tp;
+		    Tx = To + Tp;
+	       }
+	       {
+		    E T4, T5, T7, T8;
+		    T4 = ri[WS(is, 2)];
+		    T5 = ri[WS(is, 5)];
+		    T6 = T4 - T5;
+		    Tc = T4 + T5;
+		    T7 = ri[WS(is, 4)];
+		    T8 = ri[WS(is, 1)];
+		    T9 = T7 - T8;
+		    Td = T7 + T8;
+	       }
+	       Ta = T6 + T9;
+	       Te = Tc + Td;
+	       {
+		    E Tg, Th, Tj, Tk;
+		    Tg = ii[WS(is, 2)];
+		    Th = ii[WS(is, 5)];
+		    Ti = Tg - Th;
+		    Tu = Tg + Th;
+		    Tj = ii[WS(is, 4)];
+		    Tk = ii[WS(is, 1)];
+		    Tl = Tj - Tk;
+		    Tv = Tj + Tk;
+	       }
+	       Tr = Ti + Tl;
+	       Ty = Tu + Tv;
+	       ro[WS(os, 3)] = T3 + Ta;
+	       io[WS(os, 3)] = Tq + Tr;
+	       ro[0] = Tb + Te;
+	       io[0] = Tx + Ty;
+	       {
+		    E Tf, Tm, Tn, Ts;
+		    Tf = FNMS(KP500000000, Ta, T3);
+		    Tm = KP866025403 * (Ti - Tl);
+		    ro[WS(os, 5)] = Tf - Tm;
+		    ro[WS(os, 1)] = Tf + Tm;
+		    Tn = KP866025403 * (T9 - T6);
+		    Ts = FNMS(KP500000000, Tr, Tq);
+		    io[WS(os, 1)] = Tn + Ts;
+		    io[WS(os, 5)] = Ts - Tn;
+	       }
+	       {
+		    E Tt, Tw, Tz, TA;
+		    Tt = FNMS(KP500000000, Te, Tb);
+		    Tw = KP866025403 * (Tu - Tv);
+		    ro[WS(os, 2)] = Tt - Tw;
+		    ro[WS(os, 4)] = Tt + Tw;
+		    Tz = FNMS(KP500000000, Ty, Tx);
+		    TA = KP866025403 * (Td - Tc);
+		    io[WS(os, 2)] = Tz - TA;
+		    io[WS(os, 4)] = TA + Tz;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 6, "n1_6", {32, 4, 4, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_6) (planner *p) {
+     X(kdft_register) (p, n1_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2981 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:46 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -name n1_64 -include n.h */
+
+/*
+ * This function contains 912 FP additions, 392 FP multiplications,
+ * (or, 520 additions, 0 multiplications, 392 fused multiply/add),
+ * 202 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "n.h"
+
+static void n1_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       E T9b, T9e;
+	       {
+		    E T7B, T37, T5Z, T8F, Td9, Tf, TcB, TbB, T7C, T62, TdH, T2i, Tcb, Tah, T8G;
+		    E T3e, Tu, TdI, Tak, TbC, TbD, Tan, Tda, T2x, T65, T3m, T8I, T7G, T8J, T7J;
+		    E T64, T3t, Tdd, TK, Tce, Tas, Tcf, Tav, Tdc, T2N, T6G, T3G, T9k, T7O, T9l;
+		    E T7R, T6H, T3N, TdA, T1L, Tct, Tbs, Teo, Tdx, T6Y, T5j, T6V, T5Q, T9z, T8y;
+		    E Tcw, Tbb, T9C, T8n, Tdf, TZ, Tch, Taz, Tci, TaC, Tdg, T32, T6J, T3Z, T9n;
+		    E T7V, T9o, T7Y, T6K, T46, Tdp, T1g, Tcm, Tb1, Tej, Tdm, T6R, T4q, T6O, T4X;
+		    E T9s, T8f, Tcp, TaK, T9v, T84, Tdn, T1v, Tcq, Tb4, Tek, Tds, T6P, T4N, T6S;
+		    E T50, T9w, T8i, Tcn, TaV, T9t, T8b, Tdy, T20, Tcx, Tbv, Tep, TdD, T8q, T6W;
+		    E T5G, T6Z, T5T, T8t, T9D, T8B, Tcu, Tbm, T8l, T8m;
+		    {
+			 E T3s, T3p, T3M, T3J;
+			 {
+			      E Taf, T3d, T3a, Tag;
+			      {
+				   E T35, T3, T5Y, T26, T5X, T6, T36, T29, Tb, T39, Ta, T38, T2d, Tc, T2e;
+				   E T2f;
+				   {
+					E T4, T5, T27, T28;
+					{
+					     E T1, T2, T24, T25;
+					     T1 = ri[0];
+					     T2 = ri[WS(is, 32)];
+					     T24 = ii[0];
+					     T25 = ii[WS(is, 32)];
+					     T4 = ri[WS(is, 16)];
+					     T35 = T1 - T2;
+					     T3 = T1 + T2;
+					     T5Y = T24 - T25;
+					     T26 = T24 + T25;
+					     T5 = ri[WS(is, 48)];
+					     T27 = ii[WS(is, 16)];
+					     T28 = ii[WS(is, 48)];
+					}
+					{
+					     E T8, T9, T2b, T2c;
+					     T8 = ri[WS(is, 8)];
+					     T5X = T4 - T5;
+					     T6 = T4 + T5;
+					     T36 = T27 - T28;
+					     T29 = T27 + T28;
+					     T9 = ri[WS(is, 40)];
+					     T2b = ii[WS(is, 8)];
+					     T2c = ii[WS(is, 40)];
+					     Tb = ri[WS(is, 56)];
+					     T39 = T8 - T9;
+					     Ta = T8 + T9;
+					     T38 = T2b - T2c;
+					     T2d = T2b + T2c;
+					     Tc = ri[WS(is, 24)];
+					     T2e = ii[WS(is, 56)];
+					     T2f = ii[WS(is, 24)];
+					}
+				   }
+				   {
+					E T3b, T3c, T2g, T7, Te, Tbz, Td;
+					T7B = T35 + T36;
+					T37 = T35 - T36;
+					T3b = Tb - Tc;
+					Td = Tb + Tc;
+					T3c = T2e - T2f;
+					T2g = T2e + T2f;
+					T5Z = T5X + T5Y;
+					T8F = T5Y - T5X;
+					Taf = T3 - T6;
+					T7 = T3 + T6;
+					Te = Ta + Td;
+					Tbz = Td - Ta;
+					{
+					     E T2a, T60, T61, TbA, T2h;
+					     TbA = T26 - T29;
+					     T2a = T26 + T29;
+					     T3d = T3b + T3c;
+					     T60 = T3b - T3c;
+					     Td9 = T7 - Te;
+					     Tf = T7 + Te;
+					     TcB = TbA - Tbz;
+					     TbB = Tbz + TbA;
+					     T61 = T39 + T38;
+					     T3a = T38 - T39;
+					     T2h = T2d + T2g;
+					     Tag = T2d - T2g;
+					     T7C = T61 + T60;
+					     T62 = T60 - T61;
+					     TdH = T2a - T2h;
+					     T2i = T2a + T2h;
+					}
+				   }
+			      }
+			      {
+				   E T3j, Ti, T3h, T2l, T3g, Tl, T3k, T2o, Tq, T3q, Tp, T3o, T2s, Tr, T2t;
+				   E T2u;
+				   {
+					E Tj, Tk, T2m, T2n;
+					{
+					     E Tg, Th, T2j, T2k;
+					     Tg = ri[WS(is, 4)];
+					     Tcb = Taf - Tag;
+					     Tah = Taf + Tag;
+					     T8G = T3a + T3d;
+					     T3e = T3a - T3d;
+					     Th = ri[WS(is, 36)];
+					     T2j = ii[WS(is, 4)];
+					     T2k = ii[WS(is, 36)];
+					     Tj = ri[WS(is, 20)];
+					     T3j = Tg - Th;
+					     Ti = Tg + Th;
+					     T3h = T2j - T2k;
+					     T2l = T2j + T2k;
+					     Tk = ri[WS(is, 52)];
+					     T2m = ii[WS(is, 20)];
+					     T2n = ii[WS(is, 52)];
+					}
+					{
+					     E Tn, To, T2q, T2r;
+					     Tn = ri[WS(is, 60)];
+					     T3g = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T3k = T2m - T2n;
+					     T2o = T2m + T2n;
+					     To = ri[WS(is, 28)];
+					     T2q = ii[WS(is, 60)];
+					     T2r = ii[WS(is, 28)];
+					     Tq = ri[WS(is, 12)];
+					     T3q = Tn - To;
+					     Tp = Tn + To;
+					     T3o = T2q - T2r;
+					     T2s = T2q + T2r;
+					     Tr = ri[WS(is, 44)];
+					     T2t = ii[WS(is, 12)];
+					     T2u = ii[WS(is, 44)];
+					}
+				   }
+				   {
+					E T3n, T3r, T2p, T2w;
+					{
+					     E Tai, Tm, T2v, Tal, Tt, Taj, Ts, Tam;
+					     Tai = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T3n = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T3r = T2t - T2u;
+					     T2v = T2t + T2u;
+					     Tal = Tp - Ts;
+					     Tt = Tp + Ts;
+					     Taj = T2l - T2o;
+					     T2p = T2l + T2o;
+					     Tam = T2s - T2v;
+					     T2w = T2s + T2v;
+					     Tu = Tm + Tt;
+					     TdI = Tt - Tm;
+					     Tak = Tai + Taj;
+					     TbC = Taj - Tai;
+					     TbD = Tal + Tam;
+					     Tan = Tal - Tam;
+					}
+					{
+					     E T7F, T7E, T3i, T3l, T7H, T7I;
+					     T7F = T3h - T3g;
+					     T3i = T3g + T3h;
+					     T3l = T3j - T3k;
+					     T7E = T3j + T3k;
+					     Tda = T2p - T2w;
+					     T2x = T2p + T2w;
+					     T65 = FNMS(KP414213562, T3i, T3l);
+					     T3m = FMA(KP414213562, T3l, T3i);
+					     T3s = T3q - T3r;
+					     T7H = T3q + T3r;
+					     T7I = T3o - T3n;
+					     T3p = T3n + T3o;
+					     T8I = FNMS(KP414213562, T7E, T7F);
+					     T7G = FMA(KP414213562, T7F, T7E);
+					     T8J = FMA(KP414213562, T7H, T7I);
+					     T7J = FNMS(KP414213562, T7I, T7H);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3H, Ty, T3x, T2B, T3w, TB, T3I, T2E, TI, T2L, T3z, TF, T3E, T3K, T2I;
+			      E T3A;
+			      {
+				   E T2z, T2A, Tz, TA, Tw, Tx, T2C, T2D;
+				   Tw = ri[WS(is, 2)];
+				   Tx = ri[WS(is, 34)];
+				   T2z = ii[WS(is, 2)];
+				   T64 = FMA(KP414213562, T3p, T3s);
+				   T3t = FNMS(KP414213562, T3s, T3p);
+				   T3H = Tw - Tx;
+				   Ty = Tw + Tx;
+				   T2A = ii[WS(is, 34)];
+				   Tz = ri[WS(is, 18)];
+				   TA = ri[WS(is, 50)];
+				   T2C = ii[WS(is, 18)];
+				   T3x = T2z - T2A;
+				   T2B = T2z + T2A;
+				   T3w = Tz - TA;
+				   TB = Tz + TA;
+				   T2D = ii[WS(is, 50)];
+				   {
+					E T2J, T3C, T2K, TG, TH;
+					TG = ri[WS(is, 58)];
+					TH = ri[WS(is, 26)];
+					T2J = ii[WS(is, 58)];
+					T3I = T2C - T2D;
+					T2E = T2C + T2D;
+					T3C = TG - TH;
+					TI = TG + TH;
+					T2K = ii[WS(is, 26)];
+					{
+					     E T2G, T2H, TD, TE, T3D;
+					     TD = ri[WS(is, 10)];
+					     TE = ri[WS(is, 42)];
+					     T3D = T2J - T2K;
+					     T2L = T2J + T2K;
+					     T2G = ii[WS(is, 10)];
+					     T3z = TD - TE;
+					     TF = TD + TE;
+					     T2H = ii[WS(is, 42)];
+					     T3E = T3C - T3D;
+					     T3K = T3C + T3D;
+					     T2I = T2G + T2H;
+					     T3A = T2G - T2H;
+					}
+				   }
+			      }
+			      {
+				   E T3L, T3B, T2F, T2M;
+				   {
+					E Tat, Taq, Tar, TC, TJ, Tau;
+					Tat = Ty - TB;
+					TC = Ty + TB;
+					TJ = TF + TI;
+					Taq = TI - TF;
+					T3L = T3A - T3z;
+					T3B = T3z + T3A;
+					Tdd = TC - TJ;
+					TK = TC + TJ;
+					Tar = T2B - T2E;
+					T2F = T2B + T2E;
+					Tau = T2I - T2L;
+					T2M = T2I + T2L;
+					Tce = Tar - Taq;
+					Tas = Taq + Tar;
+					Tcf = Tat - Tau;
+					Tav = Tat + Tau;
+				   }
+				   {
+					E T7M, T7Q, T7N, T3y, T3F, T7P;
+					T7M = T3x - T3w;
+					T3y = T3w + T3x;
+					T3F = T3B - T3E;
+					T7Q = T3B + T3E;
+					Tdc = T2F - T2M;
+					T2N = T2F + T2M;
+					T6G = FMA(KP707106781, T3F, T3y);
+					T3G = FNMS(KP707106781, T3F, T3y);
+					T7N = T3L + T3K;
+					T3M = T3K - T3L;
+					T3J = T3H - T3I;
+					T7P = T3H + T3I;
+					T9k = FNMS(KP707106781, T7N, T7M);
+					T7O = FMA(KP707106781, T7N, T7M);
+					T9l = FNMS(KP707106781, T7Q, T7P);
+					T7R = FMA(KP707106781, T7Q, T7P);
+				   }
+			      }
+			 }
+			 {
+			      E T5I, T1z, Tb8, T56, T53, T1C, Tb9, T5L, T1J, Tbq, T58, T1G, T5N, T5h, Tbp;
+			      E T5b;
+			      {
+				   E T54, T55, T1A, T1B, T1x, T1y, T5J, T5K;
+				   T1x = ri[WS(is, 63)];
+				   T1y = ri[WS(is, 31)];
+				   T54 = ii[WS(is, 63)];
+				   T6H = FMA(KP707106781, T3M, T3J);
+				   T3N = FNMS(KP707106781, T3M, T3J);
+				   T5I = T1x - T1y;
+				   T1z = T1x + T1y;
+				   T55 = ii[WS(is, 31)];
+				   T1A = ri[WS(is, 15)];
+				   T1B = ri[WS(is, 47)];
+				   T5J = ii[WS(is, 15)];
+				   Tb8 = T54 + T55;
+				   T56 = T54 - T55;
+				   T53 = T1A - T1B;
+				   T1C = T1A + T1B;
+				   T5K = ii[WS(is, 47)];
+				   {
+					E T5e, T5d, T5f, T1H, T1I;
+					T1H = ri[WS(is, 55)];
+					T1I = ri[WS(is, 23)];
+					T5e = ii[WS(is, 55)];
+					Tb9 = T5J + T5K;
+					T5L = T5J - T5K;
+					T5d = T1H - T1I;
+					T1J = T1H + T1I;
+					T5f = ii[WS(is, 23)];
+					{
+					     E T59, T5a, T1E, T1F, T5g;
+					     T1E = ri[WS(is, 7)];
+					     T1F = ri[WS(is, 39)];
+					     T5g = T5e - T5f;
+					     Tbq = T5e + T5f;
+					     T59 = ii[WS(is, 7)];
+					     T58 = T1E - T1F;
+					     T1G = T1E + T1F;
+					     T5a = ii[WS(is, 39)];
+					     T5N = T5d + T5g;
+					     T5h = T5d - T5g;
+					     Tbp = T59 + T5a;
+					     T5b = T59 - T5a;
+					}
+				   }
+			      }
+			      {
+				   E Tb7, T5O, Tba, T57, T5i, T8x, T8w, T5M, T5P;
+				   {
+					E Tbo, T5c, Tbr, Tdw, T1D, T1K, Tdv;
+					Tbo = T1z - T1C;
+					T1D = T1z + T1C;
+					T1K = T1G + T1J;
+					Tb7 = T1J - T1G;
+					T5c = T58 + T5b;
+					T5O = T5b - T58;
+					TdA = T1D - T1K;
+					T1L = T1D + T1K;
+					Tbr = Tbp - Tbq;
+					Tdw = Tbp + Tbq;
+					Tba = Tb8 - Tb9;
+					Tdv = Tb8 + Tb9;
+					T8l = T56 - T53;
+					T57 = T53 + T56;
+					Tct = Tbo - Tbr;
+					Tbs = Tbo + Tbr;
+					Teo = Tdv + Tdw;
+					Tdx = Tdv - Tdw;
+					T5i = T5c - T5h;
+					T8x = T5c + T5h;
+				   }
+				   T8w = T5I + T5L;
+				   T5M = T5I - T5L;
+				   T5P = T5N - T5O;
+				   T8m = T5O + T5N;
+				   T6Y = FMA(KP707106781, T5i, T57);
+				   T5j = FNMS(KP707106781, T5i, T57);
+				   T6V = FMA(KP707106781, T5P, T5M);
+				   T5Q = FNMS(KP707106781, T5P, T5M);
+				   T9z = FNMS(KP707106781, T8x, T8w);
+				   T8y = FMA(KP707106781, T8x, T8w);
+				   Tcw = Tba - Tb7;
+				   Tbb = Tb7 + Tba;
+			      }
+			 }
+		    }
+		    {
+			 E T82, T83, T45, T42, T87, T8a;
+			 {
+			      E T40, TN, T3Q, T2Q, T3P, TQ, T41, T2T, TX, T30, T3S, TU, T3X, T43, T2X;
+			      E T3T;
+			      {
+				   E T2O, T2P, TO, TP, TL, TM, T2R, T2S;
+				   TL = ri[WS(is, 62)];
+				   TM = ri[WS(is, 30)];
+				   T2O = ii[WS(is, 62)];
+				   T9C = FNMS(KP707106781, T8m, T8l);
+				   T8n = FMA(KP707106781, T8m, T8l);
+				   T40 = TL - TM;
+				   TN = TL + TM;
+				   T2P = ii[WS(is, 30)];
+				   TO = ri[WS(is, 14)];
+				   TP = ri[WS(is, 46)];
+				   T2R = ii[WS(is, 14)];
+				   T3Q = T2O - T2P;
+				   T2Q = T2O + T2P;
+				   T3P = TO - TP;
+				   TQ = TO + TP;
+				   T2S = ii[WS(is, 46)];
+				   {
+					E T2Y, T3V, T2Z, TV, TW;
+					TV = ri[WS(is, 54)];
+					TW = ri[WS(is, 22)];
+					T2Y = ii[WS(is, 54)];
+					T41 = T2R - T2S;
+					T2T = T2R + T2S;
+					T3V = TV - TW;
+					TX = TV + TW;
+					T2Z = ii[WS(is, 22)];
+					{
+					     E T2V, T2W, TS, TT, T3W;
+					     TS = ri[WS(is, 6)];
+					     TT = ri[WS(is, 38)];
+					     T3W = T2Y - T2Z;
+					     T30 = T2Y + T2Z;
+					     T2V = ii[WS(is, 6)];
+					     T3S = TS - TT;
+					     TU = TS + TT;
+					     T2W = ii[WS(is, 38)];
+					     T3X = T3V - T3W;
+					     T43 = T3V + T3W;
+					     T2X = T2V + T2W;
+					     T3T = T2V - T2W;
+					}
+				   }
+			      }
+			      {
+				   E T44, T3U, T2U, T31;
+				   {
+					E TaA, Tax, Tay, TR, TY, TaB;
+					TaA = TN - TQ;
+					TR = TN + TQ;
+					TY = TU + TX;
+					Tax = TX - TU;
+					T44 = T3T - T3S;
+					T3U = T3S + T3T;
+					Tdf = TR - TY;
+					TZ = TR + TY;
+					Tay = T2Q - T2T;
+					T2U = T2Q + T2T;
+					TaB = T2X - T30;
+					T31 = T2X + T30;
+					Tch = Tay - Tax;
+					Taz = Tax + Tay;
+					Tci = TaA - TaB;
+					TaC = TaA + TaB;
+				   }
+				   {
+					E T7T, T7X, T7U, T3R, T3Y, T7W;
+					T7T = T3Q - T3P;
+					T3R = T3P + T3Q;
+					T3Y = T3U - T3X;
+					T7X = T3U + T3X;
+					Tdg = T2U - T31;
+					T32 = T2U + T31;
+					T6J = FMA(KP707106781, T3Y, T3R);
+					T3Z = FNMS(KP707106781, T3Y, T3R);
+					T7U = T44 + T43;
+					T45 = T43 - T44;
+					T42 = T40 - T41;
+					T7W = T40 + T41;
+					T9n = FNMS(KP707106781, T7U, T7T);
+					T7V = FMA(KP707106781, T7U, T7T);
+					T9o = FNMS(KP707106781, T7X, T7W);
+					T7Y = FMA(KP707106781, T7X, T7W);
+				   }
+			      }
+			 }
+			 {
+			      E T4P, T14, TaH, T4d, T4a, T17, TaI, T4S, T1e, TaZ, T4f, T1b, T4U, T4o, TaY;
+			      E T4i;
+			      {
+				   E T4b, T4c, T15, T16, T12, T13, T4Q, T4R;
+				   T12 = ri[WS(is, 1)];
+				   T13 = ri[WS(is, 33)];
+				   T4b = ii[WS(is, 1)];
+				   T6K = FMA(KP707106781, T45, T42);
+				   T46 = FNMS(KP707106781, T45, T42);
+				   T4P = T12 - T13;
+				   T14 = T12 + T13;
+				   T4c = ii[WS(is, 33)];
+				   T15 = ri[WS(is, 17)];
+				   T16 = ri[WS(is, 49)];
+				   T4Q = ii[WS(is, 17)];
+				   TaH = T4b + T4c;
+				   T4d = T4b - T4c;
+				   T4a = T15 - T16;
+				   T17 = T15 + T16;
+				   T4R = ii[WS(is, 49)];
+				   {
+					E T4l, T4k, T4m, T1c, T1d;
+					T1c = ri[WS(is, 57)];
+					T1d = ri[WS(is, 25)];
+					T4l = ii[WS(is, 57)];
+					TaI = T4Q + T4R;
+					T4S = T4Q - T4R;
+					T4k = T1c - T1d;
+					T1e = T1c + T1d;
+					T4m = ii[WS(is, 25)];
+					{
+					     E T4g, T4h, T19, T1a, T4n;
+					     T19 = ri[WS(is, 9)];
+					     T1a = ri[WS(is, 41)];
+					     T4n = T4l - T4m;
+					     TaZ = T4l + T4m;
+					     T4g = ii[WS(is, 9)];
+					     T4f = T19 - T1a;
+					     T1b = T19 + T1a;
+					     T4h = ii[WS(is, 41)];
+					     T4U = T4k + T4n;
+					     T4o = T4k - T4n;
+					     TaY = T4g + T4h;
+					     T4i = T4g - T4h;
+					}
+				   }
+			      }
+			      {
+				   E TaG, T4V, TaJ, T4e, T4p, T8e, T8d, T4T, T4W;
+				   {
+					E TaX, T4j, Tb0, Tdl, T18, T1f, Tdk;
+					TaX = T14 - T17;
+					T18 = T14 + T17;
+					T1f = T1b + T1e;
+					TaG = T1e - T1b;
+					T4j = T4f + T4i;
+					T4V = T4i - T4f;
+					Tdp = T18 - T1f;
+					T1g = T18 + T1f;
+					Tb0 = TaY - TaZ;
+					Tdl = TaY + TaZ;
+					TaJ = TaH - TaI;
+					Tdk = TaH + TaI;
+					T82 = T4d - T4a;
+					T4e = T4a + T4d;
+					Tcm = TaX - Tb0;
+					Tb1 = TaX + Tb0;
+					Tej = Tdk + Tdl;
+					Tdm = Tdk - Tdl;
+					T4p = T4j - T4o;
+					T8e = T4j + T4o;
+				   }
+				   T8d = T4P + T4S;
+				   T4T = T4P - T4S;
+				   T4W = T4U - T4V;
+				   T83 = T4V + T4U;
+				   T6R = FMA(KP707106781, T4p, T4e);
+				   T4q = FNMS(KP707106781, T4p, T4e);
+				   T6O = FMA(KP707106781, T4W, T4T);
+				   T4X = FNMS(KP707106781, T4W, T4T);
+				   T9s = FNMS(KP707106781, T8e, T8d);
+				   T8f = FMA(KP707106781, T8e, T8d);
+				   Tcp = TaJ - TaG;
+				   TaK = TaG + TaJ;
+			      }
+			 }
+			 {
+			      E T85, T4L, TaO, T1n, Tdq, TaN, T86, T4G, T4r, T1q, T4s, TaR, T4z, T4w, T1t;
+			      E T4t;
+			      {
+				   E T4C, T1j, T4D, TaL, T4K, T4H, T1m, T4E;
+				   {
+					E T4I, T4J, T1h, T1i, T1k, T1l;
+					T1h = ri[WS(is, 5)];
+					T1i = ri[WS(is, 37)];
+					T4I = ii[WS(is, 5)];
+					T9v = FNMS(KP707106781, T83, T82);
+					T84 = FMA(KP707106781, T83, T82);
+					T4C = T1h - T1i;
+					T1j = T1h + T1i;
+					T4J = ii[WS(is, 37)];
+					T1k = ri[WS(is, 21)];
+					T1l = ri[WS(is, 53)];
+					T4D = ii[WS(is, 21)];
+					TaL = T4I + T4J;
+					T4K = T4I - T4J;
+					T4H = T1k - T1l;
+					T1m = T1k + T1l;
+					T4E = ii[WS(is, 53)];
+				   }
+				   {
+					E T4x, T4y, T1r, T1s;
+					{
+					     E T1o, T4F, TaM, T1p;
+					     T1o = ri[WS(is, 61)];
+					     T85 = T4K - T4H;
+					     T4L = T4H + T4K;
+					     TaO = T1j - T1m;
+					     T1n = T1j + T1m;
+					     T4F = T4D - T4E;
+					     TaM = T4D + T4E;
+					     T1p = ri[WS(is, 29)];
+					     T4x = ii[WS(is, 61)];
+					     Tdq = TaL + TaM;
+					     TaN = TaL - TaM;
+					     T86 = T4C + T4F;
+					     T4G = T4C - T4F;
+					     T4r = T1o - T1p;
+					     T1q = T1o + T1p;
+					     T4y = ii[WS(is, 29)];
+					}
+					T1r = ri[WS(is, 13)];
+					T1s = ri[WS(is, 45)];
+					T4s = ii[WS(is, 13)];
+					TaR = T4x + T4y;
+					T4z = T4x - T4y;
+					T4w = T1r - T1s;
+					T1t = T1r + T1s;
+					T4t = ii[WS(is, 45)];
+				   }
+			      }
+			      {
+				   E T88, TaP, T89, TaU, T4Z, T4B, T4M, T4Y, T8g, T8h;
+				   {
+					E T4A, Tb2, Tdr, T4v, Tb3;
+					{
+					     E TaQ, T1u, T4u, TaS, TaT;
+					     T88 = T4z - T4w;
+					     T4A = T4w + T4z;
+					     TaQ = T1q - T1t;
+					     T1u = T1q + T1t;
+					     T4u = T4s - T4t;
+					     TaS = T4s + T4t;
+					     Tb2 = TaO + TaN;
+					     TaP = TaN - TaO;
+					     Tdr = TaR + TaS;
+					     TaT = TaR - TaS;
+					     T89 = T4r + T4u;
+					     T4v = T4r - T4u;
+					     Tdn = T1u - T1n;
+					     T1v = T1n + T1u;
+					     Tb3 = TaQ - TaT;
+					     TaU = TaQ + TaT;
+					}
+					T4Z = FNMS(KP414213562, T4v, T4A);
+					T4B = FMA(KP414213562, T4A, T4v);
+					Tcq = Tb2 - Tb3;
+					Tb4 = Tb2 + Tb3;
+					Tek = Tdq + Tdr;
+					Tds = Tdq - Tdr;
+					T4M = FNMS(KP414213562, T4L, T4G);
+					T4Y = FMA(KP414213562, T4G, T4L);
+				   }
+				   T87 = FNMS(KP414213562, T86, T85);
+				   T8g = FMA(KP414213562, T85, T86);
+				   T6P = T4M + T4B;
+				   T4N = T4B - T4M;
+				   T6S = T4Y + T4Z;
+				   T50 = T4Y - T4Z;
+				   T8h = FNMS(KP414213562, T88, T89);
+				   T8a = FMA(KP414213562, T89, T88);
+				   T9w = T8g - T8h;
+				   T8i = T8g + T8h;
+				   Tcn = TaU - TaP;
+				   TaV = TaP + TaU;
+			      }
+			 }
+			 {
+			      E T8o, T5E, Tbf, T1S, TdB, Tbe, T8p, T5z, T5k, T1V, T5l, Tbi, T5s, T5p, T1Y;
+			      E T5m;
+			      {
+				   E T5v, T1O, T5w, Tbc, T5D, T5A, T1R, T5x;
+				   {
+					E T5B, T5C, T1M, T1N, T1P, T1Q;
+					T1M = ri[WS(is, 3)];
+					T1N = ri[WS(is, 35)];
+					T5B = ii[WS(is, 3)];
+					T9t = T8a - T87;
+					T8b = T87 + T8a;
+					T5v = T1M - T1N;
+					T1O = T1M + T1N;
+					T5C = ii[WS(is, 35)];
+					T1P = ri[WS(is, 19)];
+					T1Q = ri[WS(is, 51)];
+					T5w = ii[WS(is, 19)];
+					Tbc = T5B + T5C;
+					T5D = T5B - T5C;
+					T5A = T1P - T1Q;
+					T1R = T1P + T1Q;
+					T5x = ii[WS(is, 51)];
+				   }
+				   {
+					E T5q, T5r, T1W, T1X;
+					{
+					     E T1T, T5y, Tbd, T1U;
+					     T1T = ri[WS(is, 59)];
+					     T8o = T5D - T5A;
+					     T5E = T5A + T5D;
+					     Tbf = T1O - T1R;
+					     T1S = T1O + T1R;
+					     T5y = T5w - T5x;
+					     Tbd = T5w + T5x;
+					     T1U = ri[WS(is, 27)];
+					     T5q = ii[WS(is, 59)];
+					     TdB = Tbc + Tbd;
+					     Tbe = Tbc - Tbd;
+					     T8p = T5v + T5y;
+					     T5z = T5v - T5y;
+					     T5k = T1T - T1U;
+					     T1V = T1T + T1U;
+					     T5r = ii[WS(is, 27)];
+					}
+					T1W = ri[WS(is, 11)];
+					T1X = ri[WS(is, 43)];
+					T5l = ii[WS(is, 11)];
+					Tbi = T5q + T5r;
+					T5s = T5q - T5r;
+					T5p = T1W - T1X;
+					T1Y = T1W + T1X;
+					T5m = ii[WS(is, 43)];
+				   }
+			      }
+			      {
+				   E T8r, Tbg, T8s, Tbl, T5S, T5u, T5F, T5R, T8z, T8A;
+				   {
+					E T5t, Tbt, TdC, T5o, Tbu;
+					{
+					     E Tbh, T1Z, T5n, Tbj, Tbk;
+					     T8r = T5s - T5p;
+					     T5t = T5p + T5s;
+					     Tbh = T1V - T1Y;
+					     T1Z = T1V + T1Y;
+					     T5n = T5l - T5m;
+					     Tbj = T5l + T5m;
+					     Tbt = Tbf + Tbe;
+					     Tbg = Tbe - Tbf;
+					     TdC = Tbi + Tbj;
+					     Tbk = Tbi - Tbj;
+					     T8s = T5k + T5n;
+					     T5o = T5k - T5n;
+					     Tdy = T1Z - T1S;
+					     T20 = T1S + T1Z;
+					     Tbu = Tbh - Tbk;
+					     Tbl = Tbh + Tbk;
+					}
+					T5S = FNMS(KP414213562, T5o, T5t);
+					T5u = FMA(KP414213562, T5t, T5o);
+					Tcx = Tbt - Tbu;
+					Tbv = Tbt + Tbu;
+					Tep = TdB + TdC;
+					TdD = TdB - TdC;
+					T5F = FNMS(KP414213562, T5E, T5z);
+					T5R = FMA(KP414213562, T5z, T5E);
+				   }
+				   T8q = FNMS(KP414213562, T8p, T8o);
+				   T8z = FMA(KP414213562, T8o, T8p);
+				   T6W = T5F + T5u;
+				   T5G = T5u - T5F;
+				   T6Z = T5R + T5S;
+				   T5T = T5R - T5S;
+				   T8A = FNMS(KP414213562, T8r, T8s);
+				   T8t = FMA(KP414213562, T8s, T8r);
+				   T9D = T8z - T8A;
+				   T8B = T8z + T8A;
+				   Tcu = Tbl - Tbg;
+				   Tbm = Tbg + Tbl;
+			      }
+			 }
+		    }
+		    {
+			 E T9A, T8u, TbE, Tao, Td7, Td8;
+			 {
+			      E Teq, Ten, Tex, Teh, TeB, Tev, Tey, Tem, Te9, Tec;
+			      {
+				   E Tef, Teu, Tel, T11, Tei, Tet, T2y, TeI, T23, T22, T33, Teg, TeD, TeG, T34;
+				   E TeH;
+				   {
+					E TeE, TeF, Tv, T10, T1w, T21;
+					Tef = Tf - Tu;
+					Tv = Tf + Tu;
+					T10 = TK + TZ;
+					Teu = TZ - TK;
+					Tel = Tej - Tek;
+					TeE = Tej + Tek;
+					T9A = T8t - T8q;
+					T8u = T8q + T8t;
+					TeD = Tv - T10;
+					T11 = Tv + T10;
+					TeF = Teo + Tep;
+					Teq = Teo - Tep;
+					Tei = T1g - T1v;
+					T1w = T1g + T1v;
+					T21 = T1L + T20;
+					Ten = T1L - T20;
+					Tet = T2i - T2x;
+					T2y = T2i + T2x;
+					TeI = TeE + TeF;
+					TeG = TeE - TeF;
+					T23 = T21 - T1w;
+					T22 = T1w + T21;
+					T33 = T2N + T32;
+					Teg = T2N - T32;
+				   }
+				   ro[WS(os, 16)] = TeD + TeG;
+				   ro[WS(os, 48)] = TeD - TeG;
+				   ro[0] = T11 + T22;
+				   ro[WS(os, 32)] = T11 - T22;
+				   T34 = T2y - T33;
+				   TeH = T2y + T33;
+				   io[0] = TeH + TeI;
+				   io[WS(os, 32)] = TeH - TeI;
+				   io[WS(os, 48)] = T34 - T23;
+				   io[WS(os, 16)] = T23 + T34;
+				   Tex = Tef - Teg;
+				   Teh = Tef + Teg;
+				   TeB = Teu + Tet;
+				   Tev = Tet - Teu;
+				   Tey = Tel - Tei;
+				   Tem = Tei + Tel;
+			      }
+			      {
+				   E TdV, Tdb, TdJ, Te5, TdE, Tdz, Te6, Tdi, Teb, Te3, TdZ, TdY, TdW, TdM, TdR;
+				   E Tdu;
+				   {
+					E TdL, Tde, Tdh, TdK, Tez, Ter;
+					TdV = Td9 + Tda;
+					Tdb = Td9 - Tda;
+					TdJ = TdH - TdI;
+					Te5 = TdI + TdH;
+					Tez = Ten + Teq;
+					Ter = Ten - Teq;
+					TdL = Tdd + Tdc;
+					Tde = Tdc - Tdd;
+					{
+					     E TeA, TeC, Tew, Tes;
+					     TeA = Tey - Tez;
+					     TeC = Tey + Tez;
+					     Tew = Ter - Tem;
+					     Tes = Tem + Ter;
+					     ro[WS(os, 24)] = FMA(KP707106781, TeA, Tex);
+					     ro[WS(os, 56)] = FNMS(KP707106781, TeA, Tex);
+					     io[WS(os, 8)] = FMA(KP707106781, TeC, TeB);
+					     io[WS(os, 40)] = FNMS(KP707106781, TeC, TeB);
+					     io[WS(os, 24)] = FMA(KP707106781, Tew, Tev);
+					     io[WS(os, 56)] = FNMS(KP707106781, Tew, Tev);
+					     ro[WS(os, 8)] = FMA(KP707106781, Tes, Teh);
+					     ro[WS(os, 40)] = FNMS(KP707106781, Tes, Teh);
+					     Tdh = Tdf + Tdg;
+					     TdK = Tdf - Tdg;
+					}
+					{
+					     E Te1, Te2, Tdo, Tdt;
+					     TdE = TdA - TdD;
+					     Te1 = TdA + TdD;
+					     Te2 = Tdy + Tdx;
+					     Tdz = Tdx - Tdy;
+					     Te6 = Tde + Tdh;
+					     Tdi = Tde - Tdh;
+					     Teb = FMA(KP414213562, Te1, Te2);
+					     Te3 = FNMS(KP414213562, Te2, Te1);
+					     TdZ = Tdn + Tdm;
+					     Tdo = Tdm - Tdn;
+					     Tdt = Tdp - Tds;
+					     TdY = Tdp + Tds;
+					     TdW = TdL + TdK;
+					     TdM = TdK - TdL;
+					     TdR = FNMS(KP414213562, Tdo, Tdt);
+					     Tdu = FMA(KP414213562, Tdt, Tdo);
+					}
+				   }
+				   {
+					E TdT, Tea, Te0, TdU;
+					{
+					     E Tdj, TdQ, TdF, TdP, TdN, TdS, TdO, TdG;
+					     TdT = FNMS(KP707106781, Tdi, Tdb);
+					     Tdj = FMA(KP707106781, Tdi, Tdb);
+					     Tea = FNMS(KP414213562, TdY, TdZ);
+					     Te0 = FMA(KP414213562, TdZ, TdY);
+					     TdQ = FMA(KP414213562, Tdz, TdE);
+					     TdF = FNMS(KP414213562, TdE, Tdz);
+					     TdP = FMA(KP707106781, TdM, TdJ);
+					     TdN = FNMS(KP707106781, TdM, TdJ);
+					     TdS = TdQ - TdR;
+					     TdU = TdR + TdQ;
+					     TdO = Tdu + TdF;
+					     TdG = Tdu - TdF;
+					     io[WS(os, 12)] = FMA(KP923879532, TdS, TdP);
+					     io[WS(os, 44)] = FNMS(KP923879532, TdS, TdP);
+					     ro[WS(os, 12)] = FMA(KP923879532, TdG, Tdj);
+					     ro[WS(os, 44)] = FNMS(KP923879532, TdG, Tdj);
+					     io[WS(os, 60)] = FMA(KP923879532, TdO, TdN);
+					     io[WS(os, 28)] = FNMS(KP923879532, TdO, TdN);
+					}
+					{
+					     E Te8, Te7, Ted, Tee, TdX, Te4;
+					     Te9 = FNMS(KP707106781, TdW, TdV);
+					     TdX = FMA(KP707106781, TdW, TdV);
+					     Te4 = Te0 + Te3;
+					     Te8 = Te3 - Te0;
+					     Te7 = FNMS(KP707106781, Te6, Te5);
+					     Ted = FMA(KP707106781, Te6, Te5);
+					     ro[WS(os, 60)] = FMA(KP923879532, TdU, TdT);
+					     ro[WS(os, 28)] = FNMS(KP923879532, TdU, TdT);
+					     ro[WS(os, 4)] = FMA(KP923879532, Te4, TdX);
+					     ro[WS(os, 36)] = FNMS(KP923879532, Te4, TdX);
+					     Tee = Tea + Teb;
+					     Tec = Tea - Teb;
+					     io[WS(os, 4)] = FMA(KP923879532, Tee, Ted);
+					     io[WS(os, 36)] = FNMS(KP923879532, Tee, Ted);
+					     io[WS(os, 20)] = FMA(KP923879532, Te8, Te7);
+					     io[WS(os, 52)] = FNMS(KP923879532, Te8, Te7);
+					}
+				   }
+			      }
+			      {
+				   E TcP, Tcd, TcZ, TcD, Tcy, Tcv, TcT, Td0, Tck, Td4, TcX, TcS, TcK, Tcs, TcQ;
+				   E TcG;
+				   {
+					E TcF, Tcg, Tcj, TcE, TcV, TcW, Tcc, TcC, Tco, Tcr;
+					TbE = TbC + TbD;
+					Tcc = TbC - TbD;
+					TcC = Tan - Tak;
+					Tao = Tak + Tan;
+					TcF = FNMS(KP414213562, Tce, Tcf);
+					Tcg = FMA(KP414213562, Tcf, Tce);
+					ro[WS(os, 20)] = FMA(KP923879532, Tec, Te9);
+					ro[WS(os, 52)] = FNMS(KP923879532, Tec, Te9);
+					TcP = FNMS(KP707106781, Tcc, Tcb);
+					Tcd = FMA(KP707106781, Tcc, Tcb);
+					TcZ = FNMS(KP707106781, TcC, TcB);
+					TcD = FMA(KP707106781, TcC, TcB);
+					Tcj = FNMS(KP414213562, Tci, Tch);
+					TcE = FMA(KP414213562, Tch, Tci);
+					Tcy = FNMS(KP707106781, Tcx, Tcw);
+					TcV = FMA(KP707106781, Tcx, Tcw);
+					TcW = FMA(KP707106781, Tcu, Tct);
+					Tcv = FNMS(KP707106781, Tcu, Tct);
+					TcT = FMA(KP707106781, Tcn, Tcm);
+					Tco = FNMS(KP707106781, Tcn, Tcm);
+					Td0 = Tcg + Tcj;
+					Tck = Tcg - Tcj;
+					Td4 = FMA(KP198912367, TcV, TcW);
+					TcX = FNMS(KP198912367, TcW, TcV);
+					Tcr = FNMS(KP707106781, Tcq, Tcp);
+					TcS = FMA(KP707106781, Tcq, Tcp);
+					TcK = FNMS(KP668178637, Tco, Tcr);
+					Tcs = FMA(KP668178637, Tcr, Tco);
+					TcQ = TcF + TcE;
+					TcG = TcE - TcF;
+				   }
+				   {
+					E TcJ, Td5, TcU, TcM;
+					{
+					     E Tcl, TcL, Tcz, TcN, TcH, TcO, TcI, TcA;
+					     TcJ = FNMS(KP923879532, Tck, Tcd);
+					     Tcl = FMA(KP923879532, Tck, Tcd);
+					     Td5 = FNMS(KP198912367, TcS, TcT);
+					     TcU = FMA(KP198912367, TcT, TcS);
+					     TcL = FMA(KP668178637, Tcv, Tcy);
+					     Tcz = FNMS(KP668178637, Tcy, Tcv);
+					     TcN = FMA(KP923879532, TcG, TcD);
+					     TcH = FNMS(KP923879532, TcG, TcD);
+					     TcO = TcK + TcL;
+					     TcM = TcK - TcL;
+					     TcI = Tcz - Tcs;
+					     TcA = Tcs + Tcz;
+					     io[WS(os, 6)] = FMA(KP831469612, TcO, TcN);
+					     io[WS(os, 38)] = FNMS(KP831469612, TcO, TcN);
+					     ro[WS(os, 6)] = FMA(KP831469612, TcA, Tcl);
+					     ro[WS(os, 38)] = FNMS(KP831469612, TcA, Tcl);
+					     io[WS(os, 22)] = FMA(KP831469612, TcI, TcH);
+					     io[WS(os, 54)] = FNMS(KP831469612, TcI, TcH);
+					}
+					{
+					     E Td2, Td1, Td3, Td6, TcR, TcY;
+					     Td7 = FMA(KP923879532, TcQ, TcP);
+					     TcR = FNMS(KP923879532, TcQ, TcP);
+					     TcY = TcU - TcX;
+					     Td2 = TcU + TcX;
+					     Td1 = FMA(KP923879532, Td0, TcZ);
+					     Td3 = FNMS(KP923879532, Td0, TcZ);
+					     ro[WS(os, 22)] = FMA(KP831469612, TcM, TcJ);
+					     ro[WS(os, 54)] = FNMS(KP831469612, TcM, TcJ);
+					     ro[WS(os, 14)] = FMA(KP980785280, TcY, TcR);
+					     ro[WS(os, 46)] = FNMS(KP980785280, TcY, TcR);
+					     Td6 = Td4 - Td5;
+					     Td8 = Td5 + Td4;
+					     io[WS(os, 14)] = FMA(KP980785280, Td6, Td3);
+					     io[WS(os, 46)] = FNMS(KP980785280, Td6, Td3);
+					     io[WS(os, 62)] = FMA(KP980785280, Td2, Td1);
+					     io[WS(os, 30)] = FNMS(KP980785280, Td2, Td1);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3f, T66, T63, T3u, T7z, T7A, Tc5, Tc8;
+			      {
+				   E TbR, Tap, Tc1, TbF, Tbw, Tbn, TbV, Tc2, TaE, Tc7, TbZ, TbU, TbN, Tb6, TbS;
+				   E TbI;
+				   {
+					E TbH, Taw, TaD, TbG, TbX, TbY, TaW, Tb5;
+					TbH = FMA(KP414213562, Tas, Tav);
+					Taw = FNMS(KP414213562, Tav, Tas);
+					ro[WS(os, 62)] = FMA(KP980785280, Td8, Td7);
+					ro[WS(os, 30)] = FNMS(KP980785280, Td8, Td7);
+					TbR = FMA(KP707106781, Tao, Tah);
+					Tap = FNMS(KP707106781, Tao, Tah);
+					Tc1 = FMA(KP707106781, TbE, TbB);
+					TbF = FNMS(KP707106781, TbE, TbB);
+					TaD = FMA(KP414213562, TaC, Taz);
+					TbG = FNMS(KP414213562, Taz, TaC);
+					Tbw = FNMS(KP707106781, Tbv, Tbs);
+					TbX = FMA(KP707106781, Tbv, Tbs);
+					TbY = FMA(KP707106781, Tbm, Tbb);
+					Tbn = FNMS(KP707106781, Tbm, Tbb);
+					TbV = FMA(KP707106781, TaV, TaK);
+					TaW = FNMS(KP707106781, TaV, TaK);
+					Tc2 = Taw + TaD;
+					TaE = Taw - TaD;
+					Tc7 = FMA(KP198912367, TbX, TbY);
+					TbZ = FNMS(KP198912367, TbY, TbX);
+					Tb5 = FNMS(KP707106781, Tb4, Tb1);
+					TbU = FMA(KP707106781, Tb4, Tb1);
+					TbN = FNMS(KP668178637, TaW, Tb5);
+					Tb6 = FMA(KP668178637, Tb5, TaW);
+					TbS = TbH + TbG;
+					TbI = TbG - TbH;
+				   }
+				   {
+					E TbP, Tc6, TbW, TbQ;
+					{
+					     E TaF, TbM, Tbx, TbL, TbJ, TbO, TbK, Tby;
+					     TbP = FNMS(KP923879532, TaE, Tap);
+					     TaF = FMA(KP923879532, TaE, Tap);
+					     Tc6 = FNMS(KP198912367, TbU, TbV);
+					     TbW = FMA(KP198912367, TbV, TbU);
+					     TbM = FMA(KP668178637, Tbn, Tbw);
+					     Tbx = FNMS(KP668178637, Tbw, Tbn);
+					     TbL = FMA(KP923879532, TbI, TbF);
+					     TbJ = FNMS(KP923879532, TbI, TbF);
+					     TbO = TbM - TbN;
+					     TbQ = TbN + TbM;
+					     TbK = Tb6 + Tbx;
+					     Tby = Tb6 - Tbx;
+					     io[WS(os, 10)] = FMA(KP831469612, TbO, TbL);
+					     io[WS(os, 42)] = FNMS(KP831469612, TbO, TbL);
+					     ro[WS(os, 10)] = FMA(KP831469612, Tby, TaF);
+					     ro[WS(os, 42)] = FNMS(KP831469612, Tby, TaF);
+					     io[WS(os, 58)] = FMA(KP831469612, TbK, TbJ);
+					     io[WS(os, 26)] = FNMS(KP831469612, TbK, TbJ);
+					}
+					{
+					     E Tc4, Tc3, Tc9, Tca, TbT, Tc0;
+					     Tc5 = FNMS(KP923879532, TbS, TbR);
+					     TbT = FMA(KP923879532, TbS, TbR);
+					     Tc0 = TbW + TbZ;
+					     Tc4 = TbZ - TbW;
+					     Tc3 = FNMS(KP923879532, Tc2, Tc1);
+					     Tc9 = FMA(KP923879532, Tc2, Tc1);
+					     ro[WS(os, 58)] = FMA(KP831469612, TbQ, TbP);
+					     ro[WS(os, 26)] = FNMS(KP831469612, TbQ, TbP);
+					     ro[WS(os, 2)] = FMA(KP980785280, Tc0, TbT);
+					     ro[WS(os, 34)] = FNMS(KP980785280, Tc0, TbT);
+					     Tca = Tc6 + Tc7;
+					     Tc8 = Tc6 - Tc7;
+					     io[WS(os, 2)] = FMA(KP980785280, Tca, Tc9);
+					     io[WS(os, 34)] = FNMS(KP980785280, Tca, Tc9);
+					     io[WS(os, 18)] = FMA(KP980785280, Tc4, Tc3);
+					     io[WS(os, 50)] = FNMS(KP980785280, Tc4, Tc3);
+					}
+				   }
+			      }
+			      {
+				   E T7h, T6F, T70, T6X, T7x, T7m, T7w, T7p, T7s, T6M, T7c, T6U, T7r, T75, T7i;
+				   E T78;
+				   {
+					E T6T, T6Q, T77, T6I, T6L, T76, T73, T74;
+					{
+					     E T7k, T7l, T6D, T6E, T7n, T7o;
+					     T3f = FMA(KP707106781, T3e, T37);
+					     T6D = FNMS(KP707106781, T3e, T37);
+					     T6E = T65 + T64;
+					     T66 = T64 - T65;
+					     T6T = FNMS(KP923879532, T6S, T6R);
+					     T7k = FMA(KP923879532, T6S, T6R);
+					     ro[WS(os, 18)] = FMA(KP980785280, Tc8, Tc5);
+					     ro[WS(os, 50)] = FNMS(KP980785280, Tc8, Tc5);
+					     T7h = FMA(KP923879532, T6E, T6D);
+					     T6F = FNMS(KP923879532, T6E, T6D);
+					     T7l = FMA(KP923879532, T6P, T6O);
+					     T6Q = FNMS(KP923879532, T6P, T6O);
+					     T70 = FNMS(KP923879532, T6Z, T6Y);
+					     T7n = FMA(KP923879532, T6Z, T6Y);
+					     T7o = FMA(KP923879532, T6W, T6V);
+					     T6X = FNMS(KP923879532, T6W, T6V);
+					     T77 = FNMS(KP198912367, T6G, T6H);
+					     T6I = FMA(KP198912367, T6H, T6G);
+					     T7x = FNMS(KP098491403, T7k, T7l);
+					     T7m = FMA(KP098491403, T7l, T7k);
+					     T7w = FMA(KP098491403, T7n, T7o);
+					     T7p = FNMS(KP098491403, T7o, T7n);
+					     T6L = FNMS(KP198912367, T6K, T6J);
+					     T76 = FMA(KP198912367, T6J, T6K);
+					}
+					T63 = FMA(KP707106781, T62, T5Z);
+					T73 = FNMS(KP707106781, T62, T5Z);
+					T7s = T6I + T6L;
+					T6M = T6I - T6L;
+					T7c = FNMS(KP820678790, T6Q, T6T);
+					T6U = FMA(KP820678790, T6T, T6Q);
+					T74 = T3m + T3t;
+					T3u = T3m - T3t;
+					T7r = FMA(KP923879532, T74, T73);
+					T75 = FNMS(KP923879532, T74, T73);
+					T7i = T77 + T76;
+					T78 = T76 - T77;
+				   }
+				   {
+					E T7b, T6N, T7f, T79, T71, T7d;
+					T7b = FNMS(KP980785280, T6M, T6F);
+					T6N = FMA(KP980785280, T6M, T6F);
+					T7f = FMA(KP980785280, T78, T75);
+					T79 = FNMS(KP980785280, T78, T75);
+					T71 = FNMS(KP820678790, T70, T6X);
+					T7d = FMA(KP820678790, T6X, T70);
+					{
+					     E T7u, T7t, T7v, T7y, T7j, T7q;
+					     T7z = FMA(KP980785280, T7i, T7h);
+					     T7j = FNMS(KP980785280, T7i, T7h);
+					     T7q = T7m - T7p;
+					     T7u = T7m + T7p;
+					     {
+						  E T7g, T7e, T72, T7a;
+						  T7g = T7c + T7d;
+						  T7e = T7c - T7d;
+						  T72 = T6U + T71;
+						  T7a = T71 - T6U;
+						  ro[WS(os, 23)] = FMA(KP773010453, T7e, T7b);
+						  ro[WS(os, 55)] = FNMS(KP773010453, T7e, T7b);
+						  io[WS(os, 7)] = FMA(KP773010453, T7g, T7f);
+						  io[WS(os, 39)] = FNMS(KP773010453, T7g, T7f);
+						  io[WS(os, 23)] = FMA(KP773010453, T7a, T79);
+						  io[WS(os, 55)] = FNMS(KP773010453, T7a, T79);
+						  ro[WS(os, 7)] = FMA(KP773010453, T72, T6N);
+						  ro[WS(os, 39)] = FNMS(KP773010453, T72, T6N);
+						  ro[WS(os, 47)] = FNMS(KP995184726, T7q, T7j);
+						  ro[WS(os, 15)] = FMA(KP995184726, T7q, T7j);
+					     }
+					     T7t = FMA(KP980785280, T7s, T7r);
+					     T7v = FNMS(KP980785280, T7s, T7r);
+					     T7y = T7w - T7x;
+					     T7A = T7x + T7w;
+					     io[WS(os, 15)] = FMA(KP995184726, T7y, T7v);
+					     io[WS(os, 47)] = FNMS(KP995184726, T7y, T7v);
+					     io[WS(os, 63)] = FMA(KP995184726, T7u, T7t);
+					     io[WS(os, 31)] = FNMS(KP995184726, T7u, T7t);
+					}
+				   }
+			      }
+			      {
+				   E T7D, T8K, T8H, T7K, Tad, Tae, T6x, T6A;
+				   {
+					E T9V, T9j, T9E, T9B, Tab, Ta0, Taa, Ta3, Ta6, T9q, T9Q, T9y, Ta5, T9J, T9W;
+					E T9M;
+					{
+					     E T9x, T9u, T9L, T9m, T9p, T9K, T9H, T9I;
+					     {
+						  E T9Y, T9Z, T9h, T9i, Ta1, Ta2;
+						  T7D = FMA(KP707106781, T7C, T7B);
+						  T9h = FNMS(KP707106781, T7C, T7B);
+						  T9i = T8I - T8J;
+						  T8K = T8I + T8J;
+						  T9x = FNMS(KP923879532, T9w, T9v);
+						  T9Y = FMA(KP923879532, T9w, T9v);
+						  ro[WS(os, 63)] = FMA(KP995184726, T7A, T7z);
+						  ro[WS(os, 31)] = FNMS(KP995184726, T7A, T7z);
+						  T9V = FNMS(KP923879532, T9i, T9h);
+						  T9j = FMA(KP923879532, T9i, T9h);
+						  T9Z = FMA(KP923879532, T9t, T9s);
+						  T9u = FNMS(KP923879532, T9t, T9s);
+						  T9E = FNMS(KP923879532, T9D, T9C);
+						  Ta1 = FMA(KP923879532, T9D, T9C);
+						  Ta2 = FMA(KP923879532, T9A, T9z);
+						  T9B = FNMS(KP923879532, T9A, T9z);
+						  T9L = FNMS(KP668178637, T9k, T9l);
+						  T9m = FMA(KP668178637, T9l, T9k);
+						  Tab = FNMS(KP303346683, T9Y, T9Z);
+						  Ta0 = FMA(KP303346683, T9Z, T9Y);
+						  Taa = FMA(KP303346683, Ta1, Ta2);
+						  Ta3 = FNMS(KP303346683, Ta2, Ta1);
+						  T9p = FNMS(KP668178637, T9o, T9n);
+						  T9K = FMA(KP668178637, T9n, T9o);
+					     }
+					     T8H = FMA(KP707106781, T8G, T8F);
+					     T9H = FNMS(KP707106781, T8G, T8F);
+					     Ta6 = T9m + T9p;
+					     T9q = T9m - T9p;
+					     T9Q = FNMS(KP534511135, T9u, T9x);
+					     T9y = FMA(KP534511135, T9x, T9u);
+					     T9I = T7J - T7G;
+					     T7K = T7G + T7J;
+					     Ta5 = FNMS(KP923879532, T9I, T9H);
+					     T9J = FMA(KP923879532, T9I, T9H);
+					     T9W = T9L + T9K;
+					     T9M = T9K - T9L;
+					}
+					{
+					     E T9P, T9r, T9T, T9N, T9F, T9R;
+					     T9P = FNMS(KP831469612, T9q, T9j);
+					     T9r = FMA(KP831469612, T9q, T9j);
+					     T9T = FMA(KP831469612, T9M, T9J);
+					     T9N = FNMS(KP831469612, T9M, T9J);
+					     T9F = FNMS(KP534511135, T9E, T9B);
+					     T9R = FMA(KP534511135, T9B, T9E);
+					     {
+						  E Ta8, Ta7, Ta9, Tac, T9X, Ta4;
+						  Tad = FMA(KP831469612, T9W, T9V);
+						  T9X = FNMS(KP831469612, T9W, T9V);
+						  Ta4 = Ta0 - Ta3;
+						  Ta8 = Ta0 + Ta3;
+						  {
+						       E T9U, T9S, T9G, T9O;
+						       T9U = T9Q + T9R;
+						       T9S = T9Q - T9R;
+						       T9G = T9y + T9F;
+						       T9O = T9F - T9y;
+						       ro[WS(os, 21)] = FMA(KP881921264, T9S, T9P);
+						       ro[WS(os, 53)] = FNMS(KP881921264, T9S, T9P);
+						       io[WS(os, 5)] = FMA(KP881921264, T9U, T9T);
+						       io[WS(os, 37)] = FNMS(KP881921264, T9U, T9T);
+						       io[WS(os, 21)] = FMA(KP881921264, T9O, T9N);
+						       io[WS(os, 53)] = FNMS(KP881921264, T9O, T9N);
+						       ro[WS(os, 5)] = FMA(KP881921264, T9G, T9r);
+						       ro[WS(os, 37)] = FNMS(KP881921264, T9G, T9r);
+						       ro[WS(os, 45)] = FNMS(KP956940335, Ta4, T9X);
+						       ro[WS(os, 13)] = FMA(KP956940335, Ta4, T9X);
+						  }
+						  Ta7 = FMA(KP831469612, Ta6, Ta5);
+						  Ta9 = FNMS(KP831469612, Ta6, Ta5);
+						  Tac = Taa - Tab;
+						  Tae = Tab + Taa;
+						  io[WS(os, 13)] = FMA(KP956940335, Tac, Ta9);
+						  io[WS(os, 45)] = FNMS(KP956940335, Tac, Ta9);
+						  io[WS(os, 61)] = FMA(KP956940335, Ta8, Ta7);
+						  io[WS(os, 29)] = FNMS(KP956940335, Ta8, Ta7);
+					     }
+					}
+				   }
+				   {
+					E T6j, T3v, T5U, T5H, T6y, T6o, T6z, T6r, T6u, T48, T6f, T52, T6t, T67, T6k;
+					E T6a;
+					{
+					     E T51, T4O, T69, T3O, T47, T68;
+					     {
+						  E T6m, T6n, T6p, T6q;
+						  T51 = FNMS(KP923879532, T50, T4X);
+						  T6m = FMA(KP923879532, T50, T4X);
+						  ro[WS(os, 61)] = FMA(KP956940335, Tae, Tad);
+						  ro[WS(os, 29)] = FNMS(KP956940335, Tae, Tad);
+						  T6j = FMA(KP923879532, T3u, T3f);
+						  T3v = FNMS(KP923879532, T3u, T3f);
+						  T6n = FMA(KP923879532, T4N, T4q);
+						  T4O = FNMS(KP923879532, T4N, T4q);
+						  T5U = FNMS(KP923879532, T5T, T5Q);
+						  T6p = FMA(KP923879532, T5T, T5Q);
+						  T6q = FMA(KP923879532, T5G, T5j);
+						  T5H = FNMS(KP923879532, T5G, T5j);
+						  T69 = FMA(KP668178637, T3G, T3N);
+						  T3O = FNMS(KP668178637, T3N, T3G);
+						  T6y = FNMS(KP303346683, T6m, T6n);
+						  T6o = FMA(KP303346683, T6n, T6m);
+						  T6z = FMA(KP303346683, T6p, T6q);
+						  T6r = FNMS(KP303346683, T6q, T6p);
+						  T47 = FMA(KP668178637, T46, T3Z);
+						  T68 = FNMS(KP668178637, T3Z, T46);
+					     }
+					     T6u = T3O + T47;
+					     T48 = T3O - T47;
+					     T6f = FNMS(KP534511135, T4O, T51);
+					     T52 = FMA(KP534511135, T51, T4O);
+					     T6t = FMA(KP923879532, T66, T63);
+					     T67 = FNMS(KP923879532, T66, T63);
+					     T6k = T69 + T68;
+					     T6a = T68 - T69;
+					}
+					{
+					     E T6h, T49, T6d, T6b, T5V, T6e;
+					     T6h = FNMS(KP831469612, T48, T3v);
+					     T49 = FMA(KP831469612, T48, T3v);
+					     T6d = FMA(KP831469612, T6a, T67);
+					     T6b = FNMS(KP831469612, T6a, T67);
+					     T5V = FNMS(KP534511135, T5U, T5H);
+					     T6e = FMA(KP534511135, T5H, T5U);
+					     {
+						  E T6w, T6v, T6B, T6C, T6l, T6s;
+						  T6x = FNMS(KP831469612, T6k, T6j);
+						  T6l = FMA(KP831469612, T6k, T6j);
+						  T6s = T6o + T6r;
+						  T6w = T6r - T6o;
+						  {
+						       E T6g, T6i, T5W, T6c;
+						       T6g = T6e - T6f;
+						       T6i = T6f + T6e;
+						       T5W = T52 - T5V;
+						       T6c = T52 + T5V;
+						       ro[WS(os, 59)] = FMA(KP881921264, T6i, T6h);
+						       ro[WS(os, 27)] = FNMS(KP881921264, T6i, T6h);
+						       io[WS(os, 11)] = FMA(KP881921264, T6g, T6d);
+						       io[WS(os, 43)] = FNMS(KP881921264, T6g, T6d);
+						       io[WS(os, 59)] = FMA(KP881921264, T6c, T6b);
+						       io[WS(os, 27)] = FNMS(KP881921264, T6c, T6b);
+						       ro[WS(os, 11)] = FMA(KP881921264, T5W, T49);
+						       ro[WS(os, 43)] = FNMS(KP881921264, T5W, T49);
+						       ro[WS(os, 35)] = FNMS(KP956940335, T6s, T6l);
+						       ro[WS(os, 3)] = FMA(KP956940335, T6s, T6l);
+						  }
+						  T6v = FNMS(KP831469612, T6u, T6t);
+						  T6B = FMA(KP831469612, T6u, T6t);
+						  T6C = T6y + T6z;
+						  T6A = T6y - T6z;
+						  io[WS(os, 3)] = FMA(KP956940335, T6C, T6B);
+						  io[WS(os, 35)] = FNMS(KP956940335, T6C, T6B);
+						  io[WS(os, 19)] = FMA(KP956940335, T6w, T6v);
+						  io[WS(os, 51)] = FNMS(KP956940335, T6w, T6v);
+					     }
+					}
+				   }
+				   {
+					E T8X, T7L, T8C, T8v, T9c, T92, T9d, T95, T98, T80, T8T, T8k, T97, T8L, T8Y;
+					E T8O;
+					{
+					     E T8j, T8c, T8N, T7S, T7Z, T8M;
+					     {
+						  E T90, T91, T93, T94;
+						  T8j = FNMS(KP923879532, T8i, T8f);
+						  T90 = FMA(KP923879532, T8i, T8f);
+						  ro[WS(os, 19)] = FMA(KP956940335, T6A, T6x);
+						  ro[WS(os, 51)] = FNMS(KP956940335, T6A, T6x);
+						  T8X = FMA(KP923879532, T7K, T7D);
+						  T7L = FNMS(KP923879532, T7K, T7D);
+						  T91 = FMA(KP923879532, T8b, T84);
+						  T8c = FNMS(KP923879532, T8b, T84);
+						  T8C = FNMS(KP923879532, T8B, T8y);
+						  T93 = FMA(KP923879532, T8B, T8y);
+						  T94 = FMA(KP923879532, T8u, T8n);
+						  T8v = FNMS(KP923879532, T8u, T8n);
+						  T8N = FMA(KP198912367, T7O, T7R);
+						  T7S = FNMS(KP198912367, T7R, T7O);
+						  T9c = FNMS(KP098491403, T90, T91);
+						  T92 = FMA(KP098491403, T91, T90);
+						  T9d = FMA(KP098491403, T93, T94);
+						  T95 = FNMS(KP098491403, T94, T93);
+						  T7Z = FMA(KP198912367, T7Y, T7V);
+						  T8M = FNMS(KP198912367, T7V, T7Y);
+					     }
+					     T98 = T7S + T7Z;
+					     T80 = T7S - T7Z;
+					     T8T = FNMS(KP820678790, T8c, T8j);
+					     T8k = FMA(KP820678790, T8j, T8c);
+					     T97 = FMA(KP923879532, T8K, T8H);
+					     T8L = FNMS(KP923879532, T8K, T8H);
+					     T8Y = T8N + T8M;
+					     T8O = T8M - T8N;
+					}
+					{
+					     E T8V, T81, T8R, T8P, T8D, T8S;
+					     T8V = FNMS(KP980785280, T80, T7L);
+					     T81 = FMA(KP980785280, T80, T7L);
+					     T8R = FMA(KP980785280, T8O, T8L);
+					     T8P = FNMS(KP980785280, T8O, T8L);
+					     T8D = FNMS(KP820678790, T8C, T8v);
+					     T8S = FMA(KP820678790, T8v, T8C);
+					     {
+						  E T9a, T99, T9f, T9g, T8Z, T96;
+						  T9b = FNMS(KP980785280, T8Y, T8X);
+						  T8Z = FMA(KP980785280, T8Y, T8X);
+						  T96 = T92 + T95;
+						  T9a = T95 - T92;
+						  {
+						       E T8U, T8W, T8E, T8Q;
+						       T8U = T8S - T8T;
+						       T8W = T8T + T8S;
+						       T8E = T8k - T8D;
+						       T8Q = T8k + T8D;
+						       ro[WS(os, 57)] = FMA(KP773010453, T8W, T8V);
+						       ro[WS(os, 25)] = FNMS(KP773010453, T8W, T8V);
+						       io[WS(os, 9)] = FMA(KP773010453, T8U, T8R);
+						       io[WS(os, 41)] = FNMS(KP773010453, T8U, T8R);
+						       io[WS(os, 57)] = FMA(KP773010453, T8Q, T8P);
+						       io[WS(os, 25)] = FNMS(KP773010453, T8Q, T8P);
+						       ro[WS(os, 9)] = FMA(KP773010453, T8E, T81);
+						       ro[WS(os, 41)] = FNMS(KP773010453, T8E, T81);
+						       ro[WS(os, 33)] = FNMS(KP995184726, T96, T8Z);
+						       ro[WS(os, 1)] = FMA(KP995184726, T96, T8Z);
+						  }
+						  T99 = FNMS(KP980785280, T98, T97);
+						  T9f = FMA(KP980785280, T98, T97);
+						  T9g = T9c + T9d;
+						  T9e = T9c - T9d;
+						  io[WS(os, 1)] = FMA(KP995184726, T9g, T9f);
+						  io[WS(os, 33)] = FNMS(KP995184726, T9g, T9f);
+						  io[WS(os, 17)] = FMA(KP995184726, T9a, T99);
+						  io[WS(os, 49)] = FNMS(KP995184726, T9a, T99);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ro[WS(os, 17)] = FMA(KP995184726, T9e, T9b);
+	       ro[WS(os, 49)] = FNMS(KP995184726, T9e, T9b);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 64, "n1_64", {520, 0, 392, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_64) (planner *p) {
+     X(kdft_register) (p, n1_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 64 -name n1_64 -include n.h */
+
+/*
+ * This function contains 912 FP additions, 248 FP multiplications,
+ * (or, 808 additions, 144 multiplications, 104 fused multiply/add),
+ * 172 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "n.h"
+
+static void n1_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       E T37, T7B, T8F, T5Z, Tf, Td9, TbB, TcB, T62, T7C, T2i, TdH, Tah, Tcb, T3e;
+	       E T8G, Tu, TdI, Tak, TbD, Tan, TbC, T2x, Tda, T3m, T65, T7G, T8J, T7J, T8I;
+	       E T3t, T64, TK, Tdd, Tas, Tce, Tav, Tcf, T2N, Tdc, T3G, T6G, T7O, T9k, T7R;
+	       E T9l, T3N, T6H, T1L, Tdv, Tbs, Tcw, TdC, Teo, T5j, T6V, T5Q, T6Y, T8y, T9C;
+	       E Tbb, Tct, T8n, T9z, TZ, Tdf, Taz, Tch, TaC, Tci, T32, Tdg, T3Z, T6J, T7V;
+	       E T9n, T7Y, T9o, T46, T6K, T1g, Tdp, Tb1, Tcm, Tdm, Tej, T4q, T6R, T4X, T6O;
+	       E T8f, T9s, TaK, Tcp, T84, T9v, T1v, Tdn, Tb4, Tcq, Tds, Tek, T4N, T6P, T50;
+	       E T6S, T8i, T9w, TaV, Tcn, T8b, T9t, T20, TdD, Tbv, Tcu, Tdy, Tep, T5G, T6Z;
+	       E T5T, T6W, T8B, T9A, Tbm, Tcx, T8u, T9D;
+	       {
+		    E T3, T35, T26, T5Y, T6, T5X, T29, T36, Ta, T39, T2d, T38, Td, T3b, T2g;
+		    E T3c;
+		    {
+			 E T1, T2, T24, T25;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 32)];
+			 T3 = T1 + T2;
+			 T35 = T1 - T2;
+			 T24 = ii[0];
+			 T25 = ii[WS(is, 32)];
+			 T26 = T24 + T25;
+			 T5Y = T24 - T25;
+		    }
+		    {
+			 E T4, T5, T27, T28;
+			 T4 = ri[WS(is, 16)];
+			 T5 = ri[WS(is, 48)];
+			 T6 = T4 + T5;
+			 T5X = T4 - T5;
+			 T27 = ii[WS(is, 16)];
+			 T28 = ii[WS(is, 48)];
+			 T29 = T27 + T28;
+			 T36 = T27 - T28;
+		    }
+		    {
+			 E T8, T9, T2b, T2c;
+			 T8 = ri[WS(is, 8)];
+			 T9 = ri[WS(is, 40)];
+			 Ta = T8 + T9;
+			 T39 = T8 - T9;
+			 T2b = ii[WS(is, 8)];
+			 T2c = ii[WS(is, 40)];
+			 T2d = T2b + T2c;
+			 T38 = T2b - T2c;
+		    }
+		    {
+			 E Tb, Tc, T2e, T2f;
+			 Tb = ri[WS(is, 56)];
+			 Tc = ri[WS(is, 24)];
+			 Td = Tb + Tc;
+			 T3b = Tb - Tc;
+			 T2e = ii[WS(is, 56)];
+			 T2f = ii[WS(is, 24)];
+			 T2g = T2e + T2f;
+			 T3c = T2e - T2f;
+		    }
+		    {
+			 E T7, Te, T2a, T2h;
+			 T37 = T35 - T36;
+			 T7B = T35 + T36;
+			 T8F = T5Y - T5X;
+			 T5Z = T5X + T5Y;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 Td9 = T7 - Te;
+			 {
+			      E Tbz, TbA, T60, T61;
+			      Tbz = T26 - T29;
+			      TbA = Td - Ta;
+			      TbB = Tbz - TbA;
+			      TcB = TbA + Tbz;
+			      T60 = T3b - T3c;
+			      T61 = T39 + T38;
+			      T62 = KP707106781 * (T60 - T61);
+			      T7C = KP707106781 * (T61 + T60);
+			 }
+			 T2a = T26 + T29;
+			 T2h = T2d + T2g;
+			 T2i = T2a + T2h;
+			 TdH = T2a - T2h;
+			 {
+			      E Taf, Tag, T3a, T3d;
+			      Taf = T3 - T6;
+			      Tag = T2d - T2g;
+			      Tah = Taf - Tag;
+			      Tcb = Taf + Tag;
+			      T3a = T38 - T39;
+			      T3d = T3b + T3c;
+			      T3e = KP707106781 * (T3a - T3d);
+			      T8G = KP707106781 * (T3a + T3d);
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T3j, T2l, T3h, Tl, T3g, T2o, T3k, Tp, T3q, T2s, T3o, Ts, T3n, T2v;
+		    E T3r;
+		    {
+			 E Tg, Th, T2j, T2k;
+			 Tg = ri[WS(is, 4)];
+			 Th = ri[WS(is, 36)];
+			 Ti = Tg + Th;
+			 T3j = Tg - Th;
+			 T2j = ii[WS(is, 4)];
+			 T2k = ii[WS(is, 36)];
+			 T2l = T2j + T2k;
+			 T3h = T2j - T2k;
+		    }
+		    {
+			 E Tj, Tk, T2m, T2n;
+			 Tj = ri[WS(is, 20)];
+			 Tk = ri[WS(is, 52)];
+			 Tl = Tj + Tk;
+			 T3g = Tj - Tk;
+			 T2m = ii[WS(is, 20)];
+			 T2n = ii[WS(is, 52)];
+			 T2o = T2m + T2n;
+			 T3k = T2m - T2n;
+		    }
+		    {
+			 E Tn, To, T2q, T2r;
+			 Tn = ri[WS(is, 60)];
+			 To = ri[WS(is, 28)];
+			 Tp = Tn + To;
+			 T3q = Tn - To;
+			 T2q = ii[WS(is, 60)];
+			 T2r = ii[WS(is, 28)];
+			 T2s = T2q + T2r;
+			 T3o = T2q - T2r;
+		    }
+		    {
+			 E Tq, Tr, T2t, T2u;
+			 Tq = ri[WS(is, 12)];
+			 Tr = ri[WS(is, 44)];
+			 Ts = Tq + Tr;
+			 T3n = Tq - Tr;
+			 T2t = ii[WS(is, 12)];
+			 T2u = ii[WS(is, 44)];
+			 T2v = T2t + T2u;
+			 T3r = T2t - T2u;
+		    }
+		    {
+			 E Tm, Tt, Tai, Taj;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 TdI = Tt - Tm;
+			 Tai = T2l - T2o;
+			 Taj = Ti - Tl;
+			 Tak = Tai - Taj;
+			 TbD = Taj + Tai;
+		    }
+		    {
+			 E Tal, Tam, T2p, T2w;
+			 Tal = Tp - Ts;
+			 Tam = T2s - T2v;
+			 Tan = Tal + Tam;
+			 TbC = Tal - Tam;
+			 T2p = T2l + T2o;
+			 T2w = T2s + T2v;
+			 T2x = T2p + T2w;
+			 Tda = T2p - T2w;
+		    }
+		    {
+			 E T3i, T3l, T7E, T7F;
+			 T3i = T3g + T3h;
+			 T3l = T3j - T3k;
+			 T3m = FNMS(KP923879532, T3l, KP382683432 * T3i);
+			 T65 = FMA(KP923879532, T3i, KP382683432 * T3l);
+			 T7E = T3h - T3g;
+			 T7F = T3j + T3k;
+			 T7G = FNMS(KP382683432, T7F, KP923879532 * T7E);
+			 T8J = FMA(KP382683432, T7E, KP923879532 * T7F);
+		    }
+		    {
+			 E T7H, T7I, T3p, T3s;
+			 T7H = T3o - T3n;
+			 T7I = T3q + T3r;
+			 T7J = FMA(KP923879532, T7H, KP382683432 * T7I);
+			 T8I = FNMS(KP382683432, T7H, KP923879532 * T7I);
+			 T3p = T3n + T3o;
+			 T3s = T3q - T3r;
+			 T3t = FMA(KP382683432, T3p, KP923879532 * T3s);
+			 T64 = FNMS(KP923879532, T3p, KP382683432 * T3s);
+		    }
+	       }
+	       {
+		    E Ty, T3H, T2B, T3x, TB, T3w, T2E, T3I, TI, T3L, T2L, T3B, TF, T3K, T2I;
+		    E T3E;
+		    {
+			 E Tw, Tx, T2C, T2D;
+			 Tw = ri[WS(is, 2)];
+			 Tx = ri[WS(is, 34)];
+			 Ty = Tw + Tx;
+			 T3H = Tw - Tx;
+			 {
+			      E T2z, T2A, Tz, TA;
+			      T2z = ii[WS(is, 2)];
+			      T2A = ii[WS(is, 34)];
+			      T2B = T2z + T2A;
+			      T3x = T2z - T2A;
+			      Tz = ri[WS(is, 18)];
+			      TA = ri[WS(is, 50)];
+			      TB = Tz + TA;
+			      T3w = Tz - TA;
+			 }
+			 T2C = ii[WS(is, 18)];
+			 T2D = ii[WS(is, 50)];
+			 T2E = T2C + T2D;
+			 T3I = T2C - T2D;
+			 {
+			      E TG, TH, T3z, T2J, T2K, T3A;
+			      TG = ri[WS(is, 58)];
+			      TH = ri[WS(is, 26)];
+			      T3z = TG - TH;
+			      T2J = ii[WS(is, 58)];
+			      T2K = ii[WS(is, 26)];
+			      T3A = T2J - T2K;
+			      TI = TG + TH;
+			      T3L = T3z + T3A;
+			      T2L = T2J + T2K;
+			      T3B = T3z - T3A;
+			 }
+			 {
+			      E TD, TE, T3C, T2G, T2H, T3D;
+			      TD = ri[WS(is, 10)];
+			      TE = ri[WS(is, 42)];
+			      T3C = TD - TE;
+			      T2G = ii[WS(is, 10)];
+			      T2H = ii[WS(is, 42)];
+			      T3D = T2G - T2H;
+			      TF = TD + TE;
+			      T3K = T3D - T3C;
+			      T2I = T2G + T2H;
+			      T3E = T3C + T3D;
+			 }
+		    }
+		    {
+			 E TC, TJ, Taq, Tar;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 Tdd = TC - TJ;
+			 Taq = T2B - T2E;
+			 Tar = TI - TF;
+			 Tas = Taq - Tar;
+			 Tce = Tar + Taq;
+		    }
+		    {
+			 E Tat, Tau, T2F, T2M;
+			 Tat = Ty - TB;
+			 Tau = T2I - T2L;
+			 Tav = Tat - Tau;
+			 Tcf = Tat + Tau;
+			 T2F = T2B + T2E;
+			 T2M = T2I + T2L;
+			 T2N = T2F + T2M;
+			 Tdc = T2F - T2M;
+		    }
+		    {
+			 E T3y, T3F, T7M, T7N;
+			 T3y = T3w + T3x;
+			 T3F = KP707106781 * (T3B - T3E);
+			 T3G = T3y - T3F;
+			 T6G = T3y + T3F;
+			 T7M = T3x - T3w;
+			 T7N = KP707106781 * (T3K + T3L);
+			 T7O = T7M - T7N;
+			 T9k = T7M + T7N;
+		    }
+		    {
+			 E T7P, T7Q, T3J, T3M;
+			 T7P = T3H + T3I;
+			 T7Q = KP707106781 * (T3E + T3B);
+			 T7R = T7P - T7Q;
+			 T9l = T7P + T7Q;
+			 T3J = T3H - T3I;
+			 T3M = KP707106781 * (T3K - T3L);
+			 T3N = T3J - T3M;
+			 T6H = T3J + T3M;
+		    }
+	       }
+	       {
+		    E T1z, T53, T5L, Tbo, T1C, T5I, T56, Tbp, T1J, Tb9, T5h, T5N, T1G, Tb8, T5c;
+		    E T5O;
+		    {
+			 E T1x, T1y, T54, T55;
+			 T1x = ri[WS(is, 63)];
+			 T1y = ri[WS(is, 31)];
+			 T1z = T1x + T1y;
+			 T53 = T1x - T1y;
+			 {
+			      E T5J, T5K, T1A, T1B;
+			      T5J = ii[WS(is, 63)];
+			      T5K = ii[WS(is, 31)];
+			      T5L = T5J - T5K;
+			      Tbo = T5J + T5K;
+			      T1A = ri[WS(is, 15)];
+			      T1B = ri[WS(is, 47)];
+			      T1C = T1A + T1B;
+			      T5I = T1A - T1B;
+			 }
+			 T54 = ii[WS(is, 15)];
+			 T55 = ii[WS(is, 47)];
+			 T56 = T54 - T55;
+			 Tbp = T54 + T55;
+			 {
+			      E T1H, T1I, T5d, T5e, T5f, T5g;
+			      T1H = ri[WS(is, 55)];
+			      T1I = ri[WS(is, 23)];
+			      T5d = T1H - T1I;
+			      T5e = ii[WS(is, 55)];
+			      T5f = ii[WS(is, 23)];
+			      T5g = T5e - T5f;
+			      T1J = T1H + T1I;
+			      Tb9 = T5e + T5f;
+			      T5h = T5d + T5g;
+			      T5N = T5d - T5g;
+			 }
+			 {
+			      E T1E, T1F, T5b, T58, T59, T5a;
+			      T1E = ri[WS(is, 7)];
+			      T1F = ri[WS(is, 39)];
+			      T5b = T1E - T1F;
+			      T58 = ii[WS(is, 7)];
+			      T59 = ii[WS(is, 39)];
+			      T5a = T58 - T59;
+			      T1G = T1E + T1F;
+			      Tb8 = T58 + T59;
+			      T5c = T5a - T5b;
+			      T5O = T5b + T5a;
+			 }
+		    }
+		    {
+			 E T1D, T1K, Tbq, Tbr;
+			 T1D = T1z + T1C;
+			 T1K = T1G + T1J;
+			 T1L = T1D + T1K;
+			 Tdv = T1D - T1K;
+			 Tbq = Tbo - Tbp;
+			 Tbr = T1J - T1G;
+			 Tbs = Tbq - Tbr;
+			 Tcw = Tbr + Tbq;
+		    }
+		    {
+			 E TdA, TdB, T57, T5i;
+			 TdA = Tbo + Tbp;
+			 TdB = Tb8 + Tb9;
+			 TdC = TdA - TdB;
+			 Teo = TdA + TdB;
+			 T57 = T53 - T56;
+			 T5i = KP707106781 * (T5c - T5h);
+			 T5j = T57 - T5i;
+			 T6V = T57 + T5i;
+		    }
+		    {
+			 E T5M, T5P, T8w, T8x;
+			 T5M = T5I + T5L;
+			 T5P = KP707106781 * (T5N - T5O);
+			 T5Q = T5M - T5P;
+			 T6Y = T5M + T5P;
+			 T8w = T5L - T5I;
+			 T8x = KP707106781 * (T5c + T5h);
+			 T8y = T8w - T8x;
+			 T9C = T8w + T8x;
+		    }
+		    {
+			 E Tb7, Tba, T8l, T8m;
+			 Tb7 = T1z - T1C;
+			 Tba = Tb8 - Tb9;
+			 Tbb = Tb7 - Tba;
+			 Tct = Tb7 + Tba;
+			 T8l = T53 + T56;
+			 T8m = KP707106781 * (T5O + T5N);
+			 T8n = T8l - T8m;
+			 T9z = T8l + T8m;
+		    }
+	       }
+	       {
+		    E TN, T40, T2Q, T3Q, TQ, T3P, T2T, T41, TX, T44, T30, T3U, TU, T43, T2X;
+		    E T3X;
+		    {
+			 E TL, TM, T2R, T2S;
+			 TL = ri[WS(is, 62)];
+			 TM = ri[WS(is, 30)];
+			 TN = TL + TM;
+			 T40 = TL - TM;
+			 {
+			      E T2O, T2P, TO, TP;
+			      T2O = ii[WS(is, 62)];
+			      T2P = ii[WS(is, 30)];
+			      T2Q = T2O + T2P;
+			      T3Q = T2O - T2P;
+			      TO = ri[WS(is, 14)];
+			      TP = ri[WS(is, 46)];
+			      TQ = TO + TP;
+			      T3P = TO - TP;
+			 }
+			 T2R = ii[WS(is, 14)];
+			 T2S = ii[WS(is, 46)];
+			 T2T = T2R + T2S;
+			 T41 = T2R - T2S;
+			 {
+			      E TV, TW, T3S, T2Y, T2Z, T3T;
+			      TV = ri[WS(is, 54)];
+			      TW = ri[WS(is, 22)];
+			      T3S = TV - TW;
+			      T2Y = ii[WS(is, 54)];
+			      T2Z = ii[WS(is, 22)];
+			      T3T = T2Y - T2Z;
+			      TX = TV + TW;
+			      T44 = T3S + T3T;
+			      T30 = T2Y + T2Z;
+			      T3U = T3S - T3T;
+			 }
+			 {
+			      E TS, TT, T3V, T2V, T2W, T3W;
+			      TS = ri[WS(is, 6)];
+			      TT = ri[WS(is, 38)];
+			      T3V = TS - TT;
+			      T2V = ii[WS(is, 6)];
+			      T2W = ii[WS(is, 38)];
+			      T3W = T2V - T2W;
+			      TU = TS + TT;
+			      T43 = T3W - T3V;
+			      T2X = T2V + T2W;
+			      T3X = T3V + T3W;
+			 }
+		    }
+		    {
+			 E TR, TY, Tax, Tay;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 Tdf = TR - TY;
+			 Tax = T2Q - T2T;
+			 Tay = TX - TU;
+			 Taz = Tax - Tay;
+			 Tch = Tay + Tax;
+		    }
+		    {
+			 E TaA, TaB, T2U, T31;
+			 TaA = TN - TQ;
+			 TaB = T2X - T30;
+			 TaC = TaA - TaB;
+			 Tci = TaA + TaB;
+			 T2U = T2Q + T2T;
+			 T31 = T2X + T30;
+			 T32 = T2U + T31;
+			 Tdg = T2U - T31;
+		    }
+		    {
+			 E T3R, T3Y, T7T, T7U;
+			 T3R = T3P + T3Q;
+			 T3Y = KP707106781 * (T3U - T3X);
+			 T3Z = T3R - T3Y;
+			 T6J = T3R + T3Y;
+			 T7T = T40 + T41;
+			 T7U = KP707106781 * (T3X + T3U);
+			 T7V = T7T - T7U;
+			 T9n = T7T + T7U;
+		    }
+		    {
+			 E T7W, T7X, T42, T45;
+			 T7W = T3Q - T3P;
+			 T7X = KP707106781 * (T43 + T44);
+			 T7Y = T7W - T7X;
+			 T9o = T7W + T7X;
+			 T42 = T40 - T41;
+			 T45 = KP707106781 * (T43 - T44);
+			 T46 = T42 - T45;
+			 T6K = T42 + T45;
+		    }
+	       }
+	       {
+		    E T14, T4P, T4d, TaG, T17, T4a, T4S, TaH, T1e, TaZ, T4j, T4V, T1b, TaY, T4o;
+		    E T4U;
+		    {
+			 E T12, T13, T4Q, T4R;
+			 T12 = ri[WS(is, 1)];
+			 T13 = ri[WS(is, 33)];
+			 T14 = T12 + T13;
+			 T4P = T12 - T13;
+			 {
+			      E T4b, T4c, T15, T16;
+			      T4b = ii[WS(is, 1)];
+			      T4c = ii[WS(is, 33)];
+			      T4d = T4b - T4c;
+			      TaG = T4b + T4c;
+			      T15 = ri[WS(is, 17)];
+			      T16 = ri[WS(is, 49)];
+			      T17 = T15 + T16;
+			      T4a = T15 - T16;
+			 }
+			 T4Q = ii[WS(is, 17)];
+			 T4R = ii[WS(is, 49)];
+			 T4S = T4Q - T4R;
+			 TaH = T4Q + T4R;
+			 {
+			      E T1c, T1d, T4f, T4g, T4h, T4i;
+			      T1c = ri[WS(is, 57)];
+			      T1d = ri[WS(is, 25)];
+			      T4f = T1c - T1d;
+			      T4g = ii[WS(is, 57)];
+			      T4h = ii[WS(is, 25)];
+			      T4i = T4g - T4h;
+			      T1e = T1c + T1d;
+			      TaZ = T4g + T4h;
+			      T4j = T4f - T4i;
+			      T4V = T4f + T4i;
+			 }
+			 {
+			      E T19, T1a, T4k, T4l, T4m, T4n;
+			      T19 = ri[WS(is, 9)];
+			      T1a = ri[WS(is, 41)];
+			      T4k = T19 - T1a;
+			      T4l = ii[WS(is, 9)];
+			      T4m = ii[WS(is, 41)];
+			      T4n = T4l - T4m;
+			      T1b = T19 + T1a;
+			      TaY = T4l + T4m;
+			      T4o = T4k + T4n;
+			      T4U = T4n - T4k;
+			 }
+		    }
+		    {
+			 E T18, T1f, TaX, Tb0;
+			 T18 = T14 + T17;
+			 T1f = T1b + T1e;
+			 T1g = T18 + T1f;
+			 Tdp = T18 - T1f;
+			 TaX = T14 - T17;
+			 Tb0 = TaY - TaZ;
+			 Tb1 = TaX - Tb0;
+			 Tcm = TaX + Tb0;
+		    }
+		    {
+			 E Tdk, Tdl, T4e, T4p;
+			 Tdk = TaG + TaH;
+			 Tdl = TaY + TaZ;
+			 Tdm = Tdk - Tdl;
+			 Tej = Tdk + Tdl;
+			 T4e = T4a + T4d;
+			 T4p = KP707106781 * (T4j - T4o);
+			 T4q = T4e - T4p;
+			 T6R = T4e + T4p;
+		    }
+		    {
+			 E T4T, T4W, T8d, T8e;
+			 T4T = T4P - T4S;
+			 T4W = KP707106781 * (T4U - T4V);
+			 T4X = T4T - T4W;
+			 T6O = T4T + T4W;
+			 T8d = T4P + T4S;
+			 T8e = KP707106781 * (T4o + T4j);
+			 T8f = T8d - T8e;
+			 T9s = T8d + T8e;
+		    }
+		    {
+			 E TaI, TaJ, T82, T83;
+			 TaI = TaG - TaH;
+			 TaJ = T1e - T1b;
+			 TaK = TaI - TaJ;
+			 Tcp = TaJ + TaI;
+			 T82 = T4d - T4a;
+			 T83 = KP707106781 * (T4U + T4V);
+			 T84 = T82 - T83;
+			 T9v = T82 + T83;
+		    }
+	       }
+	       {
+		    E T1j, TaR, T1m, TaS, T4G, T4L, TaT, TaQ, T89, T88, T1q, TaM, T1t, TaN, T4v;
+		    E T4A, TaO, TaL, T86, T85;
+		    {
+			 E T4H, T4F, T4C, T4K;
+			 {
+			      E T1h, T1i, T4D, T4E;
+			      T1h = ri[WS(is, 5)];
+			      T1i = ri[WS(is, 37)];
+			      T1j = T1h + T1i;
+			      T4H = T1h - T1i;
+			      T4D = ii[WS(is, 5)];
+			      T4E = ii[WS(is, 37)];
+			      T4F = T4D - T4E;
+			      TaR = T4D + T4E;
+			 }
+			 {
+			      E T1k, T1l, T4I, T4J;
+			      T1k = ri[WS(is, 21)];
+			      T1l = ri[WS(is, 53)];
+			      T1m = T1k + T1l;
+			      T4C = T1k - T1l;
+			      T4I = ii[WS(is, 21)];
+			      T4J = ii[WS(is, 53)];
+			      T4K = T4I - T4J;
+			      TaS = T4I + T4J;
+			 }
+			 T4G = T4C + T4F;
+			 T4L = T4H - T4K;
+			 TaT = TaR - TaS;
+			 TaQ = T1j - T1m;
+			 T89 = T4H + T4K;
+			 T88 = T4F - T4C;
+		    }
+		    {
+			 E T4r, T4z, T4w, T4u;
+			 {
+			      E T1o, T1p, T4x, T4y;
+			      T1o = ri[WS(is, 61)];
+			      T1p = ri[WS(is, 29)];
+			      T1q = T1o + T1p;
+			      T4r = T1o - T1p;
+			      T4x = ii[WS(is, 61)];
+			      T4y = ii[WS(is, 29)];
+			      T4z = T4x - T4y;
+			      TaM = T4x + T4y;
+			 }
+			 {
+			      E T1r, T1s, T4s, T4t;
+			      T1r = ri[WS(is, 13)];
+			      T1s = ri[WS(is, 45)];
+			      T1t = T1r + T1s;
+			      T4w = T1r - T1s;
+			      T4s = ii[WS(is, 13)];
+			      T4t = ii[WS(is, 45)];
+			      T4u = T4s - T4t;
+			      TaN = T4s + T4t;
+			 }
+			 T4v = T4r - T4u;
+			 T4A = T4w + T4z;
+			 TaO = TaM - TaN;
+			 TaL = T1q - T1t;
+			 T86 = T4z - T4w;
+			 T85 = T4r + T4u;
+		    }
+		    {
+			 E T1n, T1u, Tb2, Tb3;
+			 T1n = T1j + T1m;
+			 T1u = T1q + T1t;
+			 T1v = T1n + T1u;
+			 Tdn = T1u - T1n;
+			 Tb2 = TaT - TaQ;
+			 Tb3 = TaL + TaO;
+			 Tb4 = KP707106781 * (Tb2 - Tb3);
+			 Tcq = KP707106781 * (Tb2 + Tb3);
+		    }
+		    {
+			 E Tdq, Tdr, T4B, T4M;
+			 Tdq = TaR + TaS;
+			 Tdr = TaM + TaN;
+			 Tds = Tdq - Tdr;
+			 Tek = Tdq + Tdr;
+			 T4B = FNMS(KP923879532, T4A, KP382683432 * T4v);
+			 T4M = FMA(KP923879532, T4G, KP382683432 * T4L);
+			 T4N = T4B - T4M;
+			 T6P = T4M + T4B;
+		    }
+		    {
+			 E T4Y, T4Z, T8g, T8h;
+			 T4Y = FNMS(KP923879532, T4L, KP382683432 * T4G);
+			 T4Z = FMA(KP382683432, T4A, KP923879532 * T4v);
+			 T50 = T4Y - T4Z;
+			 T6S = T4Y + T4Z;
+			 T8g = FNMS(KP382683432, T89, KP923879532 * T88);
+			 T8h = FMA(KP923879532, T86, KP382683432 * T85);
+			 T8i = T8g - T8h;
+			 T9w = T8g + T8h;
+		    }
+		    {
+			 E TaP, TaU, T87, T8a;
+			 TaP = TaL - TaO;
+			 TaU = TaQ + TaT;
+			 TaV = KP707106781 * (TaP - TaU);
+			 Tcn = KP707106781 * (TaU + TaP);
+			 T87 = FNMS(KP382683432, T86, KP923879532 * T85);
+			 T8a = FMA(KP382683432, T88, KP923879532 * T89);
+			 T8b = T87 - T8a;
+			 T9t = T8a + T87;
+		    }
+	       }
+	       {
+		    E T1O, Tbc, T1R, Tbd, T5o, T5t, Tbf, Tbe, T8p, T8o, T1V, Tbi, T1Y, Tbj, T5z;
+		    E T5E, Tbk, Tbh, T8s, T8r;
+		    {
+			 E T5p, T5n, T5k, T5s;
+			 {
+			      E T1M, T1N, T5l, T5m;
+			      T1M = ri[WS(is, 3)];
+			      T1N = ri[WS(is, 35)];
+			      T1O = T1M + T1N;
+			      T5p = T1M - T1N;
+			      T5l = ii[WS(is, 3)];
+			      T5m = ii[WS(is, 35)];
+			      T5n = T5l - T5m;
+			      Tbc = T5l + T5m;
+			 }
+			 {
+			      E T1P, T1Q, T5q, T5r;
+			      T1P = ri[WS(is, 19)];
+			      T1Q = ri[WS(is, 51)];
+			      T1R = T1P + T1Q;
+			      T5k = T1P - T1Q;
+			      T5q = ii[WS(is, 19)];
+			      T5r = ii[WS(is, 51)];
+			      T5s = T5q - T5r;
+			      Tbd = T5q + T5r;
+			 }
+			 T5o = T5k + T5n;
+			 T5t = T5p - T5s;
+			 Tbf = T1O - T1R;
+			 Tbe = Tbc - Tbd;
+			 T8p = T5p + T5s;
+			 T8o = T5n - T5k;
+		    }
+		    {
+			 E T5A, T5y, T5v, T5D;
+			 {
+			      E T1T, T1U, T5w, T5x;
+			      T1T = ri[WS(is, 59)];
+			      T1U = ri[WS(is, 27)];
+			      T1V = T1T + T1U;
+			      T5A = T1T - T1U;
+			      T5w = ii[WS(is, 59)];
+			      T5x = ii[WS(is, 27)];
+			      T5y = T5w - T5x;
+			      Tbi = T5w + T5x;
+			 }
+			 {
+			      E T1W, T1X, T5B, T5C;
+			      T1W = ri[WS(is, 11)];
+			      T1X = ri[WS(is, 43)];
+			      T1Y = T1W + T1X;
+			      T5v = T1W - T1X;
+			      T5B = ii[WS(is, 11)];
+			      T5C = ii[WS(is, 43)];
+			      T5D = T5B - T5C;
+			      Tbj = T5B + T5C;
+			 }
+			 T5z = T5v + T5y;
+			 T5E = T5A - T5D;
+			 Tbk = Tbi - Tbj;
+			 Tbh = T1V - T1Y;
+			 T8s = T5A + T5D;
+			 T8r = T5y - T5v;
+		    }
+		    {
+			 E T1S, T1Z, Tbt, Tbu;
+			 T1S = T1O + T1R;
+			 T1Z = T1V + T1Y;
+			 T20 = T1S + T1Z;
+			 TdD = T1Z - T1S;
+			 Tbt = Tbh - Tbk;
+			 Tbu = Tbf + Tbe;
+			 Tbv = KP707106781 * (Tbt - Tbu);
+			 Tcu = KP707106781 * (Tbu + Tbt);
+		    }
+		    {
+			 E Tdw, Tdx, T5u, T5F;
+			 Tdw = Tbc + Tbd;
+			 Tdx = Tbi + Tbj;
+			 Tdy = Tdw - Tdx;
+			 Tep = Tdw + Tdx;
+			 T5u = FNMS(KP923879532, T5t, KP382683432 * T5o);
+			 T5F = FMA(KP382683432, T5z, KP923879532 * T5E);
+			 T5G = T5u - T5F;
+			 T6Z = T5u + T5F;
+		    }
+		    {
+			 E T5R, T5S, T8z, T8A;
+			 T5R = FNMS(KP923879532, T5z, KP382683432 * T5E);
+			 T5S = FMA(KP923879532, T5o, KP382683432 * T5t);
+			 T5T = T5R - T5S;
+			 T6W = T5S + T5R;
+			 T8z = FNMS(KP382683432, T8r, KP923879532 * T8s);
+			 T8A = FMA(KP382683432, T8o, KP923879532 * T8p);
+			 T8B = T8z - T8A;
+			 T9A = T8A + T8z;
+		    }
+		    {
+			 E Tbg, Tbl, T8q, T8t;
+			 Tbg = Tbe - Tbf;
+			 Tbl = Tbh + Tbk;
+			 Tbm = KP707106781 * (Tbg - Tbl);
+			 Tcx = KP707106781 * (Tbg + Tbl);
+			 T8q = FNMS(KP382683432, T8p, KP923879532 * T8o);
+			 T8t = FMA(KP923879532, T8r, KP382683432 * T8s);
+			 T8u = T8q - T8t;
+			 T9D = T8q + T8t;
+		    }
+	       }
+	       {
+		    E T11, TeD, TeG, TeI, T22, T23, T34, TeH;
+		    {
+			 E Tv, T10, TeE, TeF;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T11 = Tv + T10;
+			 TeD = Tv - T10;
+			 TeE = Tej + Tek;
+			 TeF = Teo + Tep;
+			 TeG = TeE - TeF;
+			 TeI = TeE + TeF;
+		    }
+		    {
+			 E T1w, T21, T2y, T33;
+			 T1w = T1g + T1v;
+			 T21 = T1L + T20;
+			 T22 = T1w + T21;
+			 T23 = T21 - T1w;
+			 T2y = T2i + T2x;
+			 T33 = T2N + T32;
+			 T34 = T2y - T33;
+			 TeH = T2y + T33;
+		    }
+		    ro[WS(os, 32)] = T11 - T22;
+		    io[WS(os, 32)] = TeH - TeI;
+		    ro[0] = T11 + T22;
+		    io[0] = TeH + TeI;
+		    io[WS(os, 16)] = T23 + T34;
+		    ro[WS(os, 16)] = TeD + TeG;
+		    io[WS(os, 48)] = T34 - T23;
+		    ro[WS(os, 48)] = TeD - TeG;
+	       }
+	       {
+		    E Teh, Tex, Tev, TeB, Tem, Tey, Ter, Tez;
+		    {
+			 E Tef, Teg, Tet, Teu;
+			 Tef = Tf - Tu;
+			 Teg = T2N - T32;
+			 Teh = Tef + Teg;
+			 Tex = Tef - Teg;
+			 Tet = T2i - T2x;
+			 Teu = TZ - TK;
+			 Tev = Tet - Teu;
+			 TeB = Teu + Tet;
+		    }
+		    {
+			 E Tei, Tel, Ten, Teq;
+			 Tei = T1g - T1v;
+			 Tel = Tej - Tek;
+			 Tem = Tei + Tel;
+			 Tey = Tel - Tei;
+			 Ten = T1L - T20;
+			 Teq = Teo - Tep;
+			 Ter = Ten - Teq;
+			 Tez = Ten + Teq;
+		    }
+		    {
+			 E Tes, TeC, Tew, TeA;
+			 Tes = KP707106781 * (Tem + Ter);
+			 ro[WS(os, 40)] = Teh - Tes;
+			 ro[WS(os, 8)] = Teh + Tes;
+			 TeC = KP707106781 * (Tey + Tez);
+			 io[WS(os, 40)] = TeB - TeC;
+			 io[WS(os, 8)] = TeB + TeC;
+			 Tew = KP707106781 * (Ter - Tem);
+			 io[WS(os, 56)] = Tev - Tew;
+			 io[WS(os, 24)] = Tev + Tew;
+			 TeA = KP707106781 * (Tey - Tez);
+			 ro[WS(os, 56)] = Tex - TeA;
+			 ro[WS(os, 24)] = Tex + TeA;
+		    }
+	       }
+	       {
+		    E Tdb, TdV, Te5, TdJ, Tdi, Te6, Te3, Teb, TdM, TdW, Tdu, TdQ, Te0, Tea, TdF;
+		    E TdR;
+		    {
+			 E Tde, Tdh, Tdo, Tdt;
+			 Tdb = Td9 - Tda;
+			 TdV = Td9 + Tda;
+			 Te5 = TdI + TdH;
+			 TdJ = TdH - TdI;
+			 Tde = Tdc - Tdd;
+			 Tdh = Tdf + Tdg;
+			 Tdi = KP707106781 * (Tde - Tdh);
+			 Te6 = KP707106781 * (Tde + Tdh);
+			 {
+			      E Te1, Te2, TdK, TdL;
+			      Te1 = Tdv + Tdy;
+			      Te2 = TdD + TdC;
+			      Te3 = FNMS(KP382683432, Te2, KP923879532 * Te1);
+			      Teb = FMA(KP923879532, Te2, KP382683432 * Te1);
+			      TdK = Tdf - Tdg;
+			      TdL = Tdd + Tdc;
+			      TdM = KP707106781 * (TdK - TdL);
+			      TdW = KP707106781 * (TdL + TdK);
+			 }
+			 Tdo = Tdm - Tdn;
+			 Tdt = Tdp - Tds;
+			 Tdu = FMA(KP923879532, Tdo, KP382683432 * Tdt);
+			 TdQ = FNMS(KP923879532, Tdt, KP382683432 * Tdo);
+			 {
+			      E TdY, TdZ, Tdz, TdE;
+			      TdY = Tdn + Tdm;
+			      TdZ = Tdp + Tds;
+			      Te0 = FMA(KP382683432, TdY, KP923879532 * TdZ);
+			      Tea = FNMS(KP382683432, TdZ, KP923879532 * TdY);
+			      Tdz = Tdv - Tdy;
+			      TdE = TdC - TdD;
+			      TdF = FNMS(KP923879532, TdE, KP382683432 * Tdz);
+			      TdR = FMA(KP382683432, TdE, KP923879532 * Tdz);
+			 }
+		    }
+		    {
+			 E Tdj, TdG, TdT, TdU;
+			 Tdj = Tdb + Tdi;
+			 TdG = Tdu + TdF;
+			 ro[WS(os, 44)] = Tdj - TdG;
+			 ro[WS(os, 12)] = Tdj + TdG;
+			 TdT = TdJ + TdM;
+			 TdU = TdQ + TdR;
+			 io[WS(os, 44)] = TdT - TdU;
+			 io[WS(os, 12)] = TdT + TdU;
+		    }
+		    {
+			 E TdN, TdO, TdP, TdS;
+			 TdN = TdJ - TdM;
+			 TdO = TdF - Tdu;
+			 io[WS(os, 60)] = TdN - TdO;
+			 io[WS(os, 28)] = TdN + TdO;
+			 TdP = Tdb - Tdi;
+			 TdS = TdQ - TdR;
+			 ro[WS(os, 60)] = TdP - TdS;
+			 ro[WS(os, 28)] = TdP + TdS;
+		    }
+		    {
+			 E TdX, Te4, Ted, Tee;
+			 TdX = TdV + TdW;
+			 Te4 = Te0 + Te3;
+			 ro[WS(os, 36)] = TdX - Te4;
+			 ro[WS(os, 4)] = TdX + Te4;
+			 Ted = Te5 + Te6;
+			 Tee = Tea + Teb;
+			 io[WS(os, 36)] = Ted - Tee;
+			 io[WS(os, 4)] = Ted + Tee;
+		    }
+		    {
+			 E Te7, Te8, Te9, Tec;
+			 Te7 = Te5 - Te6;
+			 Te8 = Te3 - Te0;
+			 io[WS(os, 52)] = Te7 - Te8;
+			 io[WS(os, 20)] = Te7 + Te8;
+			 Te9 = TdV - TdW;
+			 Tec = Tea - Teb;
+			 ro[WS(os, 52)] = Te9 - Tec;
+			 ro[WS(os, 20)] = Te9 + Tec;
+		    }
+	       }
+	       {
+		    E Tcd, TcP, TcD, TcZ, Tck, Td0, TcX, Td5, Tcs, TcK, TcG, TcQ, TcU, Td4, Tcz;
+		    E TcL, Tcc, TcC;
+		    Tcc = KP707106781 * (TbD + TbC);
+		    Tcd = Tcb - Tcc;
+		    TcP = Tcb + Tcc;
+		    TcC = KP707106781 * (Tak + Tan);
+		    TcD = TcB - TcC;
+		    TcZ = TcB + TcC;
+		    {
+			 E Tcg, Tcj, TcV, TcW;
+			 Tcg = FNMS(KP382683432, Tcf, KP923879532 * Tce);
+			 Tcj = FMA(KP923879532, Tch, KP382683432 * Tci);
+			 Tck = Tcg - Tcj;
+			 Td0 = Tcg + Tcj;
+			 TcV = Tct + Tcu;
+			 TcW = Tcw + Tcx;
+			 TcX = FNMS(KP195090322, TcW, KP980785280 * TcV);
+			 Td5 = FMA(KP195090322, TcV, KP980785280 * TcW);
+		    }
+		    {
+			 E Tco, Tcr, TcE, TcF;
+			 Tco = Tcm - Tcn;
+			 Tcr = Tcp - Tcq;
+			 Tcs = FMA(KP555570233, Tco, KP831469612 * Tcr);
+			 TcK = FNMS(KP831469612, Tco, KP555570233 * Tcr);
+			 TcE = FNMS(KP382683432, Tch, KP923879532 * Tci);
+			 TcF = FMA(KP382683432, Tce, KP923879532 * Tcf);
+			 TcG = TcE - TcF;
+			 TcQ = TcF + TcE;
+		    }
+		    {
+			 E TcS, TcT, Tcv, Tcy;
+			 TcS = Tcm + Tcn;
+			 TcT = Tcp + Tcq;
+			 TcU = FMA(KP980785280, TcS, KP195090322 * TcT);
+			 Td4 = FNMS(KP195090322, TcS, KP980785280 * TcT);
+			 Tcv = Tct - Tcu;
+			 Tcy = Tcw - Tcx;
+			 Tcz = FNMS(KP831469612, Tcy, KP555570233 * Tcv);
+			 TcL = FMA(KP831469612, Tcv, KP555570233 * Tcy);
+		    }
+		    {
+			 E Tcl, TcA, TcN, TcO;
+			 Tcl = Tcd + Tck;
+			 TcA = Tcs + Tcz;
+			 ro[WS(os, 42)] = Tcl - TcA;
+			 ro[WS(os, 10)] = Tcl + TcA;
+			 TcN = TcD + TcG;
+			 TcO = TcK + TcL;
+			 io[WS(os, 42)] = TcN - TcO;
+			 io[WS(os, 10)] = TcN + TcO;
+		    }
+		    {
+			 E TcH, TcI, TcJ, TcM;
+			 TcH = TcD - TcG;
+			 TcI = Tcz - Tcs;
+			 io[WS(os, 58)] = TcH - TcI;
+			 io[WS(os, 26)] = TcH + TcI;
+			 TcJ = Tcd - Tck;
+			 TcM = TcK - TcL;
+			 ro[WS(os, 58)] = TcJ - TcM;
+			 ro[WS(os, 26)] = TcJ + TcM;
+		    }
+		    {
+			 E TcR, TcY, Td7, Td8;
+			 TcR = TcP + TcQ;
+			 TcY = TcU + TcX;
+			 ro[WS(os, 34)] = TcR - TcY;
+			 ro[WS(os, 2)] = TcR + TcY;
+			 Td7 = TcZ + Td0;
+			 Td8 = Td4 + Td5;
+			 io[WS(os, 34)] = Td7 - Td8;
+			 io[WS(os, 2)] = Td7 + Td8;
+		    }
+		    {
+			 E Td1, Td2, Td3, Td6;
+			 Td1 = TcZ - Td0;
+			 Td2 = TcX - TcU;
+			 io[WS(os, 50)] = Td1 - Td2;
+			 io[WS(os, 18)] = Td1 + Td2;
+			 Td3 = TcP - TcQ;
+			 Td6 = Td4 - Td5;
+			 ro[WS(os, 50)] = Td3 - Td6;
+			 ro[WS(os, 18)] = Td3 + Td6;
+		    }
+	       }
+	       {
+		    E Tap, TbR, TbF, Tc1, TaE, Tc2, TbZ, Tc7, Tb6, TbM, TbI, TbS, TbW, Tc6, Tbx;
+		    E TbN, Tao, TbE;
+		    Tao = KP707106781 * (Tak - Tan);
+		    Tap = Tah - Tao;
+		    TbR = Tah + Tao;
+		    TbE = KP707106781 * (TbC - TbD);
+		    TbF = TbB - TbE;
+		    Tc1 = TbB + TbE;
+		    {
+			 E Taw, TaD, TbX, TbY;
+			 Taw = FNMS(KP923879532, Tav, KP382683432 * Tas);
+			 TaD = FMA(KP382683432, Taz, KP923879532 * TaC);
+			 TaE = Taw - TaD;
+			 Tc2 = Taw + TaD;
+			 TbX = Tbb + Tbm;
+			 TbY = Tbs + Tbv;
+			 TbZ = FNMS(KP555570233, TbY, KP831469612 * TbX);
+			 Tc7 = FMA(KP831469612, TbY, KP555570233 * TbX);
+		    }
+		    {
+			 E TaW, Tb5, TbG, TbH;
+			 TaW = TaK - TaV;
+			 Tb5 = Tb1 - Tb4;
+			 Tb6 = FMA(KP980785280, TaW, KP195090322 * Tb5);
+			 TbM = FNMS(KP980785280, Tb5, KP195090322 * TaW);
+			 TbG = FNMS(KP923879532, Taz, KP382683432 * TaC);
+			 TbH = FMA(KP923879532, Tas, KP382683432 * Tav);
+			 TbI = TbG - TbH;
+			 TbS = TbH + TbG;
+		    }
+		    {
+			 E TbU, TbV, Tbn, Tbw;
+			 TbU = TaK + TaV;
+			 TbV = Tb1 + Tb4;
+			 TbW = FMA(KP555570233, TbU, KP831469612 * TbV);
+			 Tc6 = FNMS(KP555570233, TbV, KP831469612 * TbU);
+			 Tbn = Tbb - Tbm;
+			 Tbw = Tbs - Tbv;
+			 Tbx = FNMS(KP980785280, Tbw, KP195090322 * Tbn);
+			 TbN = FMA(KP195090322, Tbw, KP980785280 * Tbn);
+		    }
+		    {
+			 E TaF, Tby, TbP, TbQ;
+			 TaF = Tap + TaE;
+			 Tby = Tb6 + Tbx;
+			 ro[WS(os, 46)] = TaF - Tby;
+			 ro[WS(os, 14)] = TaF + Tby;
+			 TbP = TbF + TbI;
+			 TbQ = TbM + TbN;
+			 io[WS(os, 46)] = TbP - TbQ;
+			 io[WS(os, 14)] = TbP + TbQ;
+		    }
+		    {
+			 E TbJ, TbK, TbL, TbO;
+			 TbJ = TbF - TbI;
+			 TbK = Tbx - Tb6;
+			 io[WS(os, 62)] = TbJ - TbK;
+			 io[WS(os, 30)] = TbJ + TbK;
+			 TbL = Tap - TaE;
+			 TbO = TbM - TbN;
+			 ro[WS(os, 62)] = TbL - TbO;
+			 ro[WS(os, 30)] = TbL + TbO;
+		    }
+		    {
+			 E TbT, Tc0, Tc9, Tca;
+			 TbT = TbR + TbS;
+			 Tc0 = TbW + TbZ;
+			 ro[WS(os, 38)] = TbT - Tc0;
+			 ro[WS(os, 6)] = TbT + Tc0;
+			 Tc9 = Tc1 + Tc2;
+			 Tca = Tc6 + Tc7;
+			 io[WS(os, 38)] = Tc9 - Tca;
+			 io[WS(os, 6)] = Tc9 + Tca;
+		    }
+		    {
+			 E Tc3, Tc4, Tc5, Tc8;
+			 Tc3 = Tc1 - Tc2;
+			 Tc4 = TbZ - TbW;
+			 io[WS(os, 54)] = Tc3 - Tc4;
+			 io[WS(os, 22)] = Tc3 + Tc4;
+			 Tc5 = TbR - TbS;
+			 Tc8 = Tc6 - Tc7;
+			 ro[WS(os, 54)] = Tc5 - Tc8;
+			 ro[WS(os, 22)] = Tc5 + Tc8;
+		    }
+	       }
+	       {
+		    E T6F, T7h, T7m, T7w, T7p, T7x, T6M, T7s, T6U, T7c, T75, T7r, T78, T7i, T71;
+		    E T7d;
+		    {
+			 E T6D, T6E, T7k, T7l;
+			 T6D = T37 + T3e;
+			 T6E = T65 + T64;
+			 T6F = T6D - T6E;
+			 T7h = T6D + T6E;
+			 T7k = T6O + T6P;
+			 T7l = T6R + T6S;
+			 T7m = FMA(KP956940335, T7k, KP290284677 * T7l);
+			 T7w = FNMS(KP290284677, T7k, KP956940335 * T7l);
+		    }
+		    {
+			 E T7n, T7o, T6I, T6L;
+			 T7n = T6V + T6W;
+			 T7o = T6Y + T6Z;
+			 T7p = FNMS(KP290284677, T7o, KP956940335 * T7n);
+			 T7x = FMA(KP290284677, T7n, KP956940335 * T7o);
+			 T6I = FNMS(KP555570233, T6H, KP831469612 * T6G);
+			 T6L = FMA(KP831469612, T6J, KP555570233 * T6K);
+			 T6M = T6I - T6L;
+			 T7s = T6I + T6L;
+		    }
+		    {
+			 E T6Q, T6T, T73, T74;
+			 T6Q = T6O - T6P;
+			 T6T = T6R - T6S;
+			 T6U = FMA(KP471396736, T6Q, KP881921264 * T6T);
+			 T7c = FNMS(KP881921264, T6Q, KP471396736 * T6T);
+			 T73 = T5Z + T62;
+			 T74 = T3m + T3t;
+			 T75 = T73 - T74;
+			 T7r = T73 + T74;
+		    }
+		    {
+			 E T76, T77, T6X, T70;
+			 T76 = FNMS(KP555570233, T6J, KP831469612 * T6K);
+			 T77 = FMA(KP555570233, T6G, KP831469612 * T6H);
+			 T78 = T76 - T77;
+			 T7i = T77 + T76;
+			 T6X = T6V - T6W;
+			 T70 = T6Y - T6Z;
+			 T71 = FNMS(KP881921264, T70, KP471396736 * T6X);
+			 T7d = FMA(KP881921264, T6X, KP471396736 * T70);
+		    }
+		    {
+			 E T6N, T72, T7f, T7g;
+			 T6N = T6F + T6M;
+			 T72 = T6U + T71;
+			 ro[WS(os, 43)] = T6N - T72;
+			 ro[WS(os, 11)] = T6N + T72;
+			 T7f = T75 + T78;
+			 T7g = T7c + T7d;
+			 io[WS(os, 43)] = T7f - T7g;
+			 io[WS(os, 11)] = T7f + T7g;
+		    }
+		    {
+			 E T79, T7a, T7b, T7e;
+			 T79 = T75 - T78;
+			 T7a = T71 - T6U;
+			 io[WS(os, 59)] = T79 - T7a;
+			 io[WS(os, 27)] = T79 + T7a;
+			 T7b = T6F - T6M;
+			 T7e = T7c - T7d;
+			 ro[WS(os, 59)] = T7b - T7e;
+			 ro[WS(os, 27)] = T7b + T7e;
+		    }
+		    {
+			 E T7j, T7q, T7z, T7A;
+			 T7j = T7h + T7i;
+			 T7q = T7m + T7p;
+			 ro[WS(os, 35)] = T7j - T7q;
+			 ro[WS(os, 3)] = T7j + T7q;
+			 T7z = T7r + T7s;
+			 T7A = T7w + T7x;
+			 io[WS(os, 35)] = T7z - T7A;
+			 io[WS(os, 3)] = T7z + T7A;
+		    }
+		    {
+			 E T7t, T7u, T7v, T7y;
+			 T7t = T7r - T7s;
+			 T7u = T7p - T7m;
+			 io[WS(os, 51)] = T7t - T7u;
+			 io[WS(os, 19)] = T7t + T7u;
+			 T7v = T7h - T7i;
+			 T7y = T7w - T7x;
+			 ro[WS(os, 51)] = T7v - T7y;
+			 ro[WS(os, 19)] = T7v + T7y;
+		    }
+	       }
+	       {
+		    E T9j, T9V, Ta0, Taa, Ta3, Tab, T9q, Ta6, T9y, T9Q, T9J, Ta5, T9M, T9W, T9F;
+		    E T9R;
+		    {
+			 E T9h, T9i, T9Y, T9Z;
+			 T9h = T7B + T7C;
+			 T9i = T8J + T8I;
+			 T9j = T9h - T9i;
+			 T9V = T9h + T9i;
+			 T9Y = T9s + T9t;
+			 T9Z = T9v + T9w;
+			 Ta0 = FMA(KP995184726, T9Y, KP098017140 * T9Z);
+			 Taa = FNMS(KP098017140, T9Y, KP995184726 * T9Z);
+		    }
+		    {
+			 E Ta1, Ta2, T9m, T9p;
+			 Ta1 = T9z + T9A;
+			 Ta2 = T9C + T9D;
+			 Ta3 = FNMS(KP098017140, Ta2, KP995184726 * Ta1);
+			 Tab = FMA(KP098017140, Ta1, KP995184726 * Ta2);
+			 T9m = FNMS(KP195090322, T9l, KP980785280 * T9k);
+			 T9p = FMA(KP195090322, T9n, KP980785280 * T9o);
+			 T9q = T9m - T9p;
+			 Ta6 = T9m + T9p;
+		    }
+		    {
+			 E T9u, T9x, T9H, T9I;
+			 T9u = T9s - T9t;
+			 T9x = T9v - T9w;
+			 T9y = FMA(KP634393284, T9u, KP773010453 * T9x);
+			 T9Q = FNMS(KP773010453, T9u, KP634393284 * T9x);
+			 T9H = T8F + T8G;
+			 T9I = T7G + T7J;
+			 T9J = T9H - T9I;
+			 Ta5 = T9H + T9I;
+		    }
+		    {
+			 E T9K, T9L, T9B, T9E;
+			 T9K = FNMS(KP195090322, T9o, KP980785280 * T9n);
+			 T9L = FMA(KP980785280, T9l, KP195090322 * T9k);
+			 T9M = T9K - T9L;
+			 T9W = T9L + T9K;
+			 T9B = T9z - T9A;
+			 T9E = T9C - T9D;
+			 T9F = FNMS(KP773010453, T9E, KP634393284 * T9B);
+			 T9R = FMA(KP773010453, T9B, KP634393284 * T9E);
+		    }
+		    {
+			 E T9r, T9G, T9T, T9U;
+			 T9r = T9j + T9q;
+			 T9G = T9y + T9F;
+			 ro[WS(os, 41)] = T9r - T9G;
+			 ro[WS(os, 9)] = T9r + T9G;
+			 T9T = T9J + T9M;
+			 T9U = T9Q + T9R;
+			 io[WS(os, 41)] = T9T - T9U;
+			 io[WS(os, 9)] = T9T + T9U;
+		    }
+		    {
+			 E T9N, T9O, T9P, T9S;
+			 T9N = T9J - T9M;
+			 T9O = T9F - T9y;
+			 io[WS(os, 57)] = T9N - T9O;
+			 io[WS(os, 25)] = T9N + T9O;
+			 T9P = T9j - T9q;
+			 T9S = T9Q - T9R;
+			 ro[WS(os, 57)] = T9P - T9S;
+			 ro[WS(os, 25)] = T9P + T9S;
+		    }
+		    {
+			 E T9X, Ta4, Tad, Tae;
+			 T9X = T9V + T9W;
+			 Ta4 = Ta0 + Ta3;
+			 ro[WS(os, 33)] = T9X - Ta4;
+			 ro[WS(os, 1)] = T9X + Ta4;
+			 Tad = Ta5 + Ta6;
+			 Tae = Taa + Tab;
+			 io[WS(os, 33)] = Tad - Tae;
+			 io[WS(os, 1)] = Tad + Tae;
+		    }
+		    {
+			 E Ta7, Ta8, Ta9, Tac;
+			 Ta7 = Ta5 - Ta6;
+			 Ta8 = Ta3 - Ta0;
+			 io[WS(os, 49)] = Ta7 - Ta8;
+			 io[WS(os, 17)] = Ta7 + Ta8;
+			 Ta9 = T9V - T9W;
+			 Tac = Taa - Tab;
+			 ro[WS(os, 49)] = Ta9 - Tac;
+			 ro[WS(os, 17)] = Ta9 + Tac;
+		    }
+	       }
+	       {
+		    E T3v, T6j, T6o, T6y, T6r, T6z, T48, T6u, T52, T6e, T67, T6t, T6a, T6k, T5V;
+		    E T6f;
+		    {
+			 E T3f, T3u, T6m, T6n;
+			 T3f = T37 - T3e;
+			 T3u = T3m - T3t;
+			 T3v = T3f - T3u;
+			 T6j = T3f + T3u;
+			 T6m = T4q + T4N;
+			 T6n = T4X + T50;
+			 T6o = FMA(KP634393284, T6m, KP773010453 * T6n);
+			 T6y = FNMS(KP634393284, T6n, KP773010453 * T6m);
+		    }
+		    {
+			 E T6p, T6q, T3O, T47;
+			 T6p = T5j + T5G;
+			 T6q = T5Q + T5T;
+			 T6r = FNMS(KP634393284, T6q, KP773010453 * T6p);
+			 T6z = FMA(KP773010453, T6q, KP634393284 * T6p);
+			 T3O = FNMS(KP980785280, T3N, KP195090322 * T3G);
+			 T47 = FMA(KP195090322, T3Z, KP980785280 * T46);
+			 T48 = T3O - T47;
+			 T6u = T3O + T47;
+		    }
+		    {
+			 E T4O, T51, T63, T66;
+			 T4O = T4q - T4N;
+			 T51 = T4X - T50;
+			 T52 = FMA(KP995184726, T4O, KP098017140 * T51);
+			 T6e = FNMS(KP995184726, T51, KP098017140 * T4O);
+			 T63 = T5Z - T62;
+			 T66 = T64 - T65;
+			 T67 = T63 - T66;
+			 T6t = T63 + T66;
+		    }
+		    {
+			 E T68, T69, T5H, T5U;
+			 T68 = FNMS(KP980785280, T3Z, KP195090322 * T46);
+			 T69 = FMA(KP980785280, T3G, KP195090322 * T3N);
+			 T6a = T68 - T69;
+			 T6k = T69 + T68;
+			 T5H = T5j - T5G;
+			 T5U = T5Q - T5T;
+			 T5V = FNMS(KP995184726, T5U, KP098017140 * T5H);
+			 T6f = FMA(KP098017140, T5U, KP995184726 * T5H);
+		    }
+		    {
+			 E T49, T5W, T6h, T6i;
+			 T49 = T3v + T48;
+			 T5W = T52 + T5V;
+			 ro[WS(os, 47)] = T49 - T5W;
+			 ro[WS(os, 15)] = T49 + T5W;
+			 T6h = T67 + T6a;
+			 T6i = T6e + T6f;
+			 io[WS(os, 47)] = T6h - T6i;
+			 io[WS(os, 15)] = T6h + T6i;
+		    }
+		    {
+			 E T6b, T6c, T6d, T6g;
+			 T6b = T67 - T6a;
+			 T6c = T5V - T52;
+			 io[WS(os, 63)] = T6b - T6c;
+			 io[WS(os, 31)] = T6b + T6c;
+			 T6d = T3v - T48;
+			 T6g = T6e - T6f;
+			 ro[WS(os, 63)] = T6d - T6g;
+			 ro[WS(os, 31)] = T6d + T6g;
+		    }
+		    {
+			 E T6l, T6s, T6B, T6C;
+			 T6l = T6j + T6k;
+			 T6s = T6o + T6r;
+			 ro[WS(os, 39)] = T6l - T6s;
+			 ro[WS(os, 7)] = T6l + T6s;
+			 T6B = T6t + T6u;
+			 T6C = T6y + T6z;
+			 io[WS(os, 39)] = T6B - T6C;
+			 io[WS(os, 7)] = T6B + T6C;
+		    }
+		    {
+			 E T6v, T6w, T6x, T6A;
+			 T6v = T6t - T6u;
+			 T6w = T6r - T6o;
+			 io[WS(os, 55)] = T6v - T6w;
+			 io[WS(os, 23)] = T6v + T6w;
+			 T6x = T6j - T6k;
+			 T6A = T6y - T6z;
+			 ro[WS(os, 55)] = T6x - T6A;
+			 ro[WS(os, 23)] = T6x + T6A;
+		    }
+	       }
+	       {
+		    E T7L, T8X, T92, T9c, T95, T9d, T80, T98, T8k, T8S, T8L, T97, T8O, T8Y, T8D;
+		    E T8T;
+		    {
+			 E T7D, T7K, T90, T91;
+			 T7D = T7B - T7C;
+			 T7K = T7G - T7J;
+			 T7L = T7D - T7K;
+			 T8X = T7D + T7K;
+			 T90 = T84 + T8b;
+			 T91 = T8f + T8i;
+			 T92 = FMA(KP471396736, T90, KP881921264 * T91);
+			 T9c = FNMS(KP471396736, T91, KP881921264 * T90);
+		    }
+		    {
+			 E T93, T94, T7S, T7Z;
+			 T93 = T8n + T8u;
+			 T94 = T8y + T8B;
+			 T95 = FNMS(KP471396736, T94, KP881921264 * T93);
+			 T9d = FMA(KP881921264, T94, KP471396736 * T93);
+			 T7S = FNMS(KP831469612, T7R, KP555570233 * T7O);
+			 T7Z = FMA(KP831469612, T7V, KP555570233 * T7Y);
+			 T80 = T7S - T7Z;
+			 T98 = T7S + T7Z;
+		    }
+		    {
+			 E T8c, T8j, T8H, T8K;
+			 T8c = T84 - T8b;
+			 T8j = T8f - T8i;
+			 T8k = FMA(KP956940335, T8c, KP290284677 * T8j);
+			 T8S = FNMS(KP956940335, T8j, KP290284677 * T8c);
+			 T8H = T8F - T8G;
+			 T8K = T8I - T8J;
+			 T8L = T8H - T8K;
+			 T97 = T8H + T8K;
+		    }
+		    {
+			 E T8M, T8N, T8v, T8C;
+			 T8M = FNMS(KP831469612, T7Y, KP555570233 * T7V);
+			 T8N = FMA(KP555570233, T7R, KP831469612 * T7O);
+			 T8O = T8M - T8N;
+			 T8Y = T8N + T8M;
+			 T8v = T8n - T8u;
+			 T8C = T8y - T8B;
+			 T8D = FNMS(KP956940335, T8C, KP290284677 * T8v);
+			 T8T = FMA(KP290284677, T8C, KP956940335 * T8v);
+		    }
+		    {
+			 E T81, T8E, T8V, T8W;
+			 T81 = T7L + T80;
+			 T8E = T8k + T8D;
+			 ro[WS(os, 45)] = T81 - T8E;
+			 ro[WS(os, 13)] = T81 + T8E;
+			 T8V = T8L + T8O;
+			 T8W = T8S + T8T;
+			 io[WS(os, 45)] = T8V - T8W;
+			 io[WS(os, 13)] = T8V + T8W;
+		    }
+		    {
+			 E T8P, T8Q, T8R, T8U;
+			 T8P = T8L - T8O;
+			 T8Q = T8D - T8k;
+			 io[WS(os, 61)] = T8P - T8Q;
+			 io[WS(os, 29)] = T8P + T8Q;
+			 T8R = T7L - T80;
+			 T8U = T8S - T8T;
+			 ro[WS(os, 61)] = T8R - T8U;
+			 ro[WS(os, 29)] = T8R + T8U;
+		    }
+		    {
+			 E T8Z, T96, T9f, T9g;
+			 T8Z = T8X + T8Y;
+			 T96 = T92 + T95;
+			 ro[WS(os, 37)] = T8Z - T96;
+			 ro[WS(os, 5)] = T8Z + T96;
+			 T9f = T97 + T98;
+			 T9g = T9c + T9d;
+			 io[WS(os, 37)] = T9f - T9g;
+			 io[WS(os, 5)] = T9f + T9g;
+		    }
+		    {
+			 E T99, T9a, T9b, T9e;
+			 T99 = T97 - T98;
+			 T9a = T95 - T92;
+			 io[WS(os, 53)] = T99 - T9a;
+			 io[WS(os, 21)] = T99 + T9a;
+			 T9b = T8X - T8Y;
+			 T9e = T9c - T9d;
+			 ro[WS(os, 53)] = T9b - T9e;
+			 ro[WS(os, 21)] = T9b + T9e;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 64, "n1_64", {808, 144, 104, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_64) (planner *p) {
+     X(kdft_register) (p, n1_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include n.h */
+
+/*
+ * This function contains 60 FP additions, 42 FP multiplications,
+ * (or, 18 additions, 0 multiplications, 42 fused multiply/add),
+ * 51 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n.h"
+
+static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       E Tz, TP, Ty, TK, TN, TE, Tw, TF;
+	       {
+		    E T1, TI, T4, TG, Ta, TT, Tp, TH, T7, Tk, TJ, TO, Tu, Tb, TB;
+		    E Tg, Tl, Th, Ti;
+		    T1 = ri[0];
+		    Tz = ii[0];
+		    {
+			 E T5, T6, Te, Tf;
+			 {
+			      E T2, T3, T8, T9;
+			      T2 = ri[WS(is, 1)];
+			      T3 = ri[WS(is, 6)];
+			      T8 = ri[WS(is, 3)];
+			      T9 = ri[WS(is, 4)];
+			      T5 = ri[WS(is, 2)];
+			      TI = T3 - T2;
+			      T4 = T2 + T3;
+			      TG = T9 - T8;
+			      Ta = T8 + T9;
+			      T6 = ri[WS(is, 5)];
+			 }
+			 Te = ii[WS(is, 2)];
+			 TT = FMA(KP554958132, TG, TI);
+			 Tp = FNMS(KP356895867, T4, Ta);
+			 TH = T6 - T5;
+			 T7 = T5 + T6;
+			 Tf = ii[WS(is, 5)];
+			 Tk = ii[WS(is, 3)];
+			 TJ = FNMS(KP554958132, TI, TH);
+			 TO = FMA(KP554958132, TH, TG);
+			 Tu = FNMS(KP356895867, Ta, T7);
+			 Tb = FNMS(KP356895867, T7, T4);
+			 TB = Te + Tf;
+			 Tg = Te - Tf;
+			 Tl = ii[WS(is, 4)];
+			 Th = ii[WS(is, 1)];
+			 Ti = ii[WS(is, 6)];
+		    }
+		    {
+			 E Tm, TA, Tj, TD, Ts, TL, Tx, TU, To, TR, Td, TM, Tv;
+			 {
+			      E TC, TQ, Tn, Tc;
+			      ro[0] = T1 + T4 + T7 + Ta;
+			      TC = Tk + Tl;
+			      Tm = Tk - Tl;
+			      TA = Th + Ti;
+			      Tj = Th - Ti;
+			      TD = FNMS(KP356895867, TC, TB);
+			      Ts = FMA(KP554958132, Tg, Tm);
+			      TL = FNMS(KP356895867, TA, TC);
+			      TQ = FNMS(KP356895867, TB, TA);
+			      Tx = FNMS(KP554958132, Tj, Tg);
+			      Tn = FMA(KP554958132, Tm, Tj);
+			      io[0] = Tz + TA + TB + TC;
+			      Tc = FNMS(KP692021471, Tb, Ta);
+			      TU = FMA(KP801937735, TT, TH);
+			      To = FMA(KP801937735, Tn, Tg);
+			      TR = FNMS(KP692021471, TQ, TC);
+			      Td = FNMS(KP900968867, Tc, T1);
+			 }
+			 {
+			      E Tt, Tr, TS, Tq;
+			      Tt = FNMS(KP801937735, Ts, Tj);
+			      Tq = FNMS(KP692021471, Tp, T7);
+			      TS = FNMS(KP900968867, TR, Tz);
+			      ro[WS(os, 1)] = FMA(KP974927912, To, Td);
+			      ro[WS(os, 6)] = FNMS(KP974927912, To, Td);
+			      Tr = FNMS(KP900968867, Tq, T1);
+			      io[WS(os, 6)] = FNMS(KP974927912, TU, TS);
+			      io[WS(os, 1)] = FMA(KP974927912, TU, TS);
+			      TP = FNMS(KP801937735, TO, TI);
+			      ro[WS(os, 2)] = FMA(KP974927912, Tt, Tr);
+			      ro[WS(os, 5)] = FNMS(KP974927912, Tt, Tr);
+			      TM = FNMS(KP692021471, TL, TB);
+			 }
+			 Ty = FNMS(KP801937735, Tx, Tm);
+			 Tv = FNMS(KP692021471, Tu, T4);
+			 TK = FNMS(KP801937735, TJ, TG);
+			 TN = FNMS(KP900968867, TM, Tz);
+			 TE = FNMS(KP692021471, TD, TA);
+			 Tw = FNMS(KP900968867, Tv, T1);
+		    }
+	       }
+	       io[WS(os, 5)] = FNMS(KP974927912, TP, TN);
+	       io[WS(os, 2)] = FMA(KP974927912, TP, TN);
+	       TF = FNMS(KP900968867, TE, Tz);
+	       ro[WS(os, 3)] = FMA(KP974927912, Ty, Tw);
+	       ro[WS(os, 4)] = FNMS(KP974927912, Ty, Tw);
+	       io[WS(os, 4)] = FNMS(KP974927912, TK, TF);
+	       io[WS(os, 3)] = FMA(KP974927912, TK, TF);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 7, "n1_7", {18, 0, 42, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_7) (planner *p) {
+     X(kdft_register) (p, n1_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include n.h */
+
+/*
+ * This function contains 60 FP additions, 36 FP multiplications,
+ * (or, 36 additions, 12 multiplications, 24 fused multiply/add),
+ * 25 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n.h"
+
+static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       E T1, Tu, T4, Tq, Te, Tx, T7, Ts, Tk, Tv, Ta, Tr, Th, Tw;
+	       T1 = ri[0];
+	       Tu = ii[0];
+	       {
+		    E T2, T3, Tc, Td;
+		    T2 = ri[WS(is, 1)];
+		    T3 = ri[WS(is, 6)];
+		    T4 = T2 + T3;
+		    Tq = T3 - T2;
+		    Tc = ii[WS(is, 1)];
+		    Td = ii[WS(is, 6)];
+		    Te = Tc - Td;
+		    Tx = Tc + Td;
+	       }
+	       {
+		    E T5, T6, Ti, Tj;
+		    T5 = ri[WS(is, 2)];
+		    T6 = ri[WS(is, 5)];
+		    T7 = T5 + T6;
+		    Ts = T6 - T5;
+		    Ti = ii[WS(is, 2)];
+		    Tj = ii[WS(is, 5)];
+		    Tk = Ti - Tj;
+		    Tv = Ti + Tj;
+	       }
+	       {
+		    E T8, T9, Tf, Tg;
+		    T8 = ri[WS(is, 3)];
+		    T9 = ri[WS(is, 4)];
+		    Ta = T8 + T9;
+		    Tr = T9 - T8;
+		    Tf = ii[WS(is, 3)];
+		    Tg = ii[WS(is, 4)];
+		    Th = Tf - Tg;
+		    Tw = Tf + Tg;
+	       }
+	       ro[0] = T1 + T4 + T7 + Ta;
+	       io[0] = Tu + Tx + Tv + Tw;
+	       {
+		    E Tl, Tb, TB, TC;
+		    Tl = FNMS(KP781831482, Th, KP974927912 * Te) - (KP433883739 * Tk);
+		    Tb = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
+		    ro[WS(os, 5)] = Tb - Tl;
+		    ro[WS(os, 2)] = Tb + Tl;
+		    TB = FNMS(KP781831482, Tr, KP974927912 * Tq) - (KP433883739 * Ts);
+		    TC = FMA(KP623489801, Tw, Tu) + FNMA(KP900968867, Tv, KP222520933 * Tx);
+		    io[WS(os, 2)] = TB + TC;
+		    io[WS(os, 5)] = TC - TB;
+	       }
+	       {
+		    E Tn, Tm, Tz, TA;
+		    Tn = FMA(KP781831482, Te, KP974927912 * Tk) + (KP433883739 * Th);
+		    Tm = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
+		    ro[WS(os, 6)] = Tm - Tn;
+		    ro[WS(os, 1)] = Tm + Tn;
+		    Tz = FMA(KP781831482, Tq, KP974927912 * Ts) + (KP433883739 * Tr);
+		    TA = FMA(KP623489801, Tx, Tu) + FNMA(KP900968867, Tw, KP222520933 * Tv);
+		    io[WS(os, 1)] = Tz + TA;
+		    io[WS(os, 6)] = TA - Tz;
+	       }
+	       {
+		    E Tp, To, Tt, Ty;
+		    Tp = FMA(KP433883739, Te, KP974927912 * Th) - (KP781831482 * Tk);
+		    To = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
+		    ro[WS(os, 4)] = To - Tp;
+		    ro[WS(os, 3)] = To + Tp;
+		    Tt = FMA(KP433883739, Tq, KP974927912 * Tr) - (KP781831482 * Ts);
+		    Ty = FMA(KP623489801, Tv, Tu) + FNMA(KP222520933, Tw, KP900968867 * Tx);
+		    io[WS(os, 3)] = Tt + Ty;
+		    io[WS(os, 4)] = Ty - Tt;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 7, "n1_7", {36, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_7) (planner *p) {
+     X(kdft_register) (p, n1_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */
+
+/*
+ * This function contains 52 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
+ * 36 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "n.h"
+
+static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       E TF, TE, TD, TI;
+	       {
+		    E Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, Tt;
+		    E TM;
+		    {
+			 E T4, T5, Tj, Tk;
+			 {
+			      E T1, T2, Tg, Th;
+			      T1 = ri[0];
+			      T2 = ri[WS(is, 4)];
+			      Tg = ii[0];
+			      Th = ii[WS(is, 4)];
+			      T4 = ri[WS(is, 2)];
+			      Tn = T1 - T2;
+			      T3 = T1 + T2;
+			      TC = Tg - Th;
+			      Ti = Tg + Th;
+			      T5 = ri[WS(is, 6)];
+			 }
+			 Tj = ii[WS(is, 2)];
+			 Tk = ii[WS(is, 6)];
+			 {
+			      E Tb, Tc, Tw, Tx;
+			      Tb = ri[WS(is, 7)];
+			      TB = T4 - T5;
+			      T6 = T4 + T5;
+			      To = Tj - Tk;
+			      Tl = Tj + Tk;
+			      Tc = ri[WS(is, 3)];
+			      Tw = ii[WS(is, 7)];
+			      Tx = ii[WS(is, 3)];
+			      {
+				   E T8, Tv, Ty, T9, Tr, Ts;
+				   T8 = ri[WS(is, 1)];
+				   Td = Tb + Tc;
+				   Tv = Tb - Tc;
+				   TN = Tw + Tx;
+				   Ty = Tw - Tx;
+				   T9 = ri[WS(is, 5)];
+				   Tr = ii[WS(is, 1)];
+				   Ts = ii[WS(is, 5)];
+				   Tz = Tv - Ty;
+				   TH = Tv + Ty;
+				   Ta = T8 + T9;
+				   Tq = T8 - T9;
+				   Tt = Tr - Ts;
+				   TM = Tr + Ts;
+			      }
+			 }
+		    }
+		    {
+			 E TL, TG, Tu, Tf, Tm, TO;
+			 {
+			      E T7, Te, TP, TQ;
+			      TL = T3 - T6;
+			      T7 = T3 + T6;
+			      TG = Tt - Tq;
+			      Tu = Tq + Tt;
+			      Te = Ta + Td;
+			      Tf = Td - Ta;
+			      Tm = Ti - Tl;
+			      TP = Ti + Tl;
+			      TQ = TM + TN;
+			      TO = TM - TN;
+			      ro[0] = T7 + Te;
+			      ro[WS(os, 4)] = T7 - Te;
+			      io[0] = TP + TQ;
+			      io[WS(os, 4)] = TP - TQ;
+			 }
+			 {
+			      E Tp, TA, TJ, TK;
+			      TF = Tn - To;
+			      Tp = Tn + To;
+			      io[WS(os, 6)] = Tm - Tf;
+			      io[WS(os, 2)] = Tf + Tm;
+			      ro[WS(os, 2)] = TL + TO;
+			      ro[WS(os, 6)] = TL - TO;
+			      TA = Tu + Tz;
+			      TE = Tz - Tu;
+			      TD = TB + TC;
+			      TJ = TC - TB;
+			      TK = TG + TH;
+			      TI = TG - TH;
+			      ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
+			      ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
+			      io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
+			      io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
+			 }
+		    }
+	       }
+	       io[WS(os, 3)] = FMA(KP707106781, TE, TD);
+	       io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
+	       ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
+	       ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
+	  }
+     }
+}
+
+static const kdft_desc desc = { 8, "n1_8", {44, 0, 8, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_8) (planner *p) {
+     X(kdft_register) (p, n1_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */
+
+/*
+ * This function contains 52 FP additions, 4 FP multiplications,
+ * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
+ * 28 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "n.h"
+
+static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
+	       E TG;
+	       {
+		    E T1, T2, Tj, Tk;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 4)];
+		    T3 = T1 + T2;
+		    Tn = T1 - T2;
+		    {
+			 E Tg, Th, T4, T5;
+			 Tg = ii[0];
+			 Th = ii[WS(is, 4)];
+			 Ti = Tg + Th;
+			 TC = Tg - Th;
+			 T4 = ri[WS(is, 2)];
+			 T5 = ri[WS(is, 6)];
+			 T6 = T4 + T5;
+			 TB = T4 - T5;
+		    }
+		    Tj = ii[WS(is, 2)];
+		    Tk = ii[WS(is, 6)];
+		    Tl = Tj + Tk;
+		    To = Tj - Tk;
+		    {
+			 E Tb, Tc, Tv, Tw, Tx, Ty;
+			 Tb = ri[WS(is, 7)];
+			 Tc = ri[WS(is, 3)];
+			 Tv = Tb - Tc;
+			 Tw = ii[WS(is, 7)];
+			 Tx = ii[WS(is, 3)];
+			 Ty = Tw - Tx;
+			 Td = Tb + Tc;
+			 TN = Tw + Tx;
+			 Tz = Tv - Ty;
+			 TH = Tv + Ty;
+		    }
+		    {
+			 E T8, T9, Tq, Tr, Ts, Tt;
+			 T8 = ri[WS(is, 1)];
+			 T9 = ri[WS(is, 5)];
+			 Tq = T8 - T9;
+			 Tr = ii[WS(is, 1)];
+			 Ts = ii[WS(is, 5)];
+			 Tt = Tr - Ts;
+			 Ta = T8 + T9;
+			 TM = Tr + Ts;
+			 Tu = Tq + Tt;
+			 TG = Tt - Tq;
+		    }
+	       }
+	       {
+		    E T7, Te, TP, TQ;
+		    T7 = T3 + T6;
+		    Te = Ta + Td;
+		    ro[WS(os, 4)] = T7 - Te;
+		    ro[0] = T7 + Te;
+		    TP = Ti + Tl;
+		    TQ = TM + TN;
+		    io[WS(os, 4)] = TP - TQ;
+		    io[0] = TP + TQ;
+	       }
+	       {
+		    E Tf, Tm, TL, TO;
+		    Tf = Td - Ta;
+		    Tm = Ti - Tl;
+		    io[WS(os, 2)] = Tf + Tm;
+		    io[WS(os, 6)] = Tm - Tf;
+		    TL = T3 - T6;
+		    TO = TM - TN;
+		    ro[WS(os, 6)] = TL - TO;
+		    ro[WS(os, 2)] = TL + TO;
+	       }
+	       {
+		    E Tp, TA, TJ, TK;
+		    Tp = Tn + To;
+		    TA = KP707106781 * (Tu + Tz);
+		    ro[WS(os, 5)] = Tp - TA;
+		    ro[WS(os, 1)] = Tp + TA;
+		    TJ = TC - TB;
+		    TK = KP707106781 * (TG + TH);
+		    io[WS(os, 5)] = TJ - TK;
+		    io[WS(os, 1)] = TJ + TK;
+	       }
+	       {
+		    E TD, TE, TF, TI;
+		    TD = TB + TC;
+		    TE = KP707106781 * (Tz - Tu);
+		    io[WS(os, 7)] = TD - TE;
+		    io[WS(os, 3)] = TD + TE;
+		    TF = Tn - To;
+		    TI = KP707106781 * (TG - TH);
+		    ro[WS(os, 7)] = TF - TI;
+		    ro[WS(os, 3)] = TF + TI;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 8, "n1_8", {52, 4, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_8) (planner *p) {
+     X(kdft_register) (p, n1_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/n1_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/n1_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include n.h */
+
+/*
+ * This function contains 80 FP additions, 56 FP multiplications,
+ * (or, 24 additions, 0 multiplications, 56 fused multiply/add),
+ * 59 stack variables, 10 constants, and 36 memory accesses
+ */
+#include "n.h"
+
+static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP954188894, +0.954188894138671133499268364187245676532219158);
+     DK(KP363970234, +0.363970234266202361351047882776834043890471784);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP492403876, +0.492403876506104029683371512294761506835321626);
+     DK(KP777861913, +0.777861913430206160028177977318626690410586096);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
+	       E T17, TV, T14, TY, T11, T15;
+	       {
+		    E Tm, TM, TL, T5, Tl, T1f, Tb, Tt, Ta, T1c, TI, TX, TF, TW, Tc;
+		    E Td, Tp, Tq;
+		    {
+			 E T1, Th, Ti, Tj, T4, T2, T3;
+			 T1 = ri[0];
+			 T2 = ri[WS(is, 3)];
+			 T3 = ri[WS(is, 6)];
+			 Th = ii[0];
+			 Ti = ii[WS(is, 3)];
+			 Tj = ii[WS(is, 6)];
+			 T4 = T2 + T3;
+			 Tm = T3 - T2;
+			 {
+			      E T6, Tz, T7, T8, TA, TB, Tk;
+			      T6 = ri[WS(is, 1)];
+			      TM = Ti - Tj;
+			      Tk = Ti + Tj;
+			      TL = FNMS(KP500000000, T4, T1);
+			      T5 = T1 + T4;
+			      Tz = ii[WS(is, 1)];
+			      Tl = FNMS(KP500000000, Tk, Th);
+			      T1f = Th + Tk;
+			      T7 = ri[WS(is, 4)];
+			      T8 = ri[WS(is, 7)];
+			      TA = ii[WS(is, 4)];
+			      TB = ii[WS(is, 7)];
+			      {
+				   E TE, T9, TH, TC, TG, TD;
+				   Tb = ri[WS(is, 2)];
+				   TE = T7 - T8;
+				   T9 = T7 + T8;
+				   TH = TB - TA;
+				   TC = TA + TB;
+				   Tt = ii[WS(is, 2)];
+				   Ta = T6 + T9;
+				   TG = FNMS(KP500000000, T9, T6);
+				   T1c = Tz + TC;
+				   TD = FNMS(KP500000000, TC, Tz);
+				   TI = FNMS(KP866025403, TH, TG);
+				   TX = FMA(KP866025403, TH, TG);
+				   TF = FNMS(KP866025403, TE, TD);
+				   TW = FMA(KP866025403, TE, TD);
+				   Tc = ri[WS(is, 5)];
+				   Td = ri[WS(is, 8)];
+				   Tp = ii[WS(is, 5)];
+				   Tq = ii[WS(is, 8)];
+			      }
+			 }
+		    }
+		    {
+			 E Tn, TN, TZ, T10, TO, Ty, TJ, TP;
+			 {
+			      E Tw, Te, Tu, Tr;
+			      T17 = FNMS(KP866025403, Tm, Tl);
+			      Tn = FMA(KP866025403, Tm, Tl);
+			      Tw = Td - Tc;
+			      Te = Tc + Td;
+			      Tu = Tp + Tq;
+			      Tr = Tp - Tq;
+			      TN = FMA(KP866025403, TM, TL);
+			      TV = FNMS(KP866025403, TM, TL);
+			      {
+				   E Tf, To, T1d, Tv;
+				   Tf = Tb + Te;
+				   To = FNMS(KP500000000, Te, Tb);
+				   T1d = Tt + Tu;
+				   Tv = FNMS(KP500000000, Tu, Tt);
+				   {
+					E Ts, Tg, T1i, Tx;
+					Ts = FMA(KP866025403, Tr, To);
+					TZ = FNMS(KP866025403, Tr, To);
+					Tg = Ta + Tf;
+					T1i = Tf - Ta;
+					Tx = FMA(KP866025403, Tw, Tv);
+					T10 = FNMS(KP866025403, Tw, Tv);
+					{
+					     E T1e, T1g, T1b, T1h;
+					     T1e = T1c - T1d;
+					     T1g = T1c + T1d;
+					     ro[0] = T5 + Tg;
+					     T1b = FNMS(KP500000000, Tg, T5);
+					     io[0] = T1f + T1g;
+					     T1h = FNMS(KP500000000, T1g, T1f);
+					     TO = FMA(KP176326980, Ts, Tx);
+					     Ty = FNMS(KP176326980, Tx, Ts);
+					     ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
+					     ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
+					     io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
+					     io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
+					     TJ = FNMS(KP839099631, TI, TF);
+					     TP = FMA(KP839099631, TF, TI);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E TS, TK, TU, TQ, TT, TR;
+			      TS = FMA(KP777861913, TJ, Ty);
+			      TK = FNMS(KP777861913, TJ, Ty);
+			      TU = FNMS(KP777861913, TP, TO);
+			      TQ = FMA(KP777861913, TP, TO);
+			      TT = FMA(KP492403876, TK, Tn);
+			      io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
+			      TR = FNMS(KP492403876, TQ, TN);
+			      ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
+			      io[WS(os, 4)] = FMA(KP852868531, TU, TT);
+			      io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
+			      ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
+			      ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
+			      T14 = FNMS(KP176326980, TW, TX);
+			      TY = FMA(KP176326980, TX, TW);
+			      T11 = FNMS(KP363970234, T10, TZ);
+			      T15 = FMA(KP363970234, TZ, T10);
+			 }
+		    }
+	       }
+	       {
+		    E T12, T1a, T16, T18, T13, T19;
+		    T12 = FNMS(KP954188894, T11, TY);
+		    T1a = FMA(KP954188894, T11, TY);
+		    T16 = FNMS(KP954188894, T15, T14);
+		    T18 = FMA(KP954188894, T15, T14);
+		    T13 = FNMS(KP492403876, T12, TV);
+		    ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
+		    T19 = FMA(KP492403876, T18, T17);
+		    io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
+		    ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
+		    ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
+		    io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
+		    io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 9, "n1_9", {24, 0, 56, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_9) (planner *p) {
+     X(kdft_register) (p, n1_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include n.h */
+
+/*
+ * This function contains 80 FP additions, 40 FP multiplications,
+ * (or, 60 additions, 20 multiplications, 20 fused multiply/add),
+ * 39 stack variables, 8 constants, and 36 memory accesses
+ */
+#include "n.h"
+
+static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
+	       E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
+	       E T10, TG, TZ;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = ri[0];
+		    T2 = ri[WS(is, 3)];
+		    T3 = ri[WS(is, 6)];
+		    T4 = T2 + T3;
+		    T5 = T1 + T4;
+		    TO = KP866025403 * (T3 - T2);
+		    Th = FNMS(KP500000000, T4, T1);
+	       }
+	       {
+		    E TP, Ti, Tj, TQ;
+		    TP = ii[0];
+		    Ti = ii[WS(is, 3)];
+		    Tj = ii[WS(is, 6)];
+		    TQ = Ti + Tj;
+		    Tk = KP866025403 * (Ti - Tj);
+		    T1g = TP + TQ;
+		    TR = FNMS(KP500000000, TQ, TP);
+	       }
+	       {
+		    E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
+		    T6 = ri[WS(is, 1)];
+		    Ts = ii[WS(is, 1)];
+		    {
+			 E T7, T8, Tn, To;
+			 T7 = ri[WS(is, 4)];
+			 T8 = ri[WS(is, 7)];
+			 T9 = T7 + T8;
+			 Tr = KP866025403 * (T8 - T7);
+			 Tn = ii[WS(is, 4)];
+			 To = ii[WS(is, 7)];
+			 Tp = KP866025403 * (Tn - To);
+			 Tt = Tn + To;
+		    }
+		    Ta = T6 + T9;
+		    T1c = Ts + Tt;
+		    Tm = FNMS(KP500000000, T9, T6);
+		    Tq = Tm + Tp;
+		    TW = Tm - Tp;
+		    Tu = FNMS(KP500000000, Tt, Ts);
+		    Tv = Tr + Tu;
+		    TX = Tu - Tr;
+	       }
+	       {
+		    E Tb, TD, Te, TC, TA, TE, Tx, TF;
+		    Tb = ri[WS(is, 2)];
+		    TD = ii[WS(is, 2)];
+		    {
+			 E Tc, Td, Ty, Tz;
+			 Tc = ri[WS(is, 5)];
+			 Td = ri[WS(is, 8)];
+			 Te = Tc + Td;
+			 TC = KP866025403 * (Td - Tc);
+			 Ty = ii[WS(is, 5)];
+			 Tz = ii[WS(is, 8)];
+			 TA = KP866025403 * (Ty - Tz);
+			 TE = Ty + Tz;
+		    }
+		    Tf = Tb + Te;
+		    T1d = TD + TE;
+		    Tx = FNMS(KP500000000, Te, Tb);
+		    TB = Tx + TA;
+		    T10 = Tx - TA;
+		    TF = FNMS(KP500000000, TE, TD);
+		    TG = TC + TF;
+		    TZ = TF - TC;
+	       }
+	       {
+		    E T1e, Tg, T1b, T1f, T1h, T1i;
+		    T1e = KP866025403 * (T1c - T1d);
+		    Tg = Ta + Tf;
+		    T1b = FNMS(KP500000000, Tg, T5);
+		    ro[0] = T5 + Tg;
+		    ro[WS(os, 3)] = T1b + T1e;
+		    ro[WS(os, 6)] = T1b - T1e;
+		    T1f = KP866025403 * (Tf - Ta);
+		    T1h = T1c + T1d;
+		    T1i = FNMS(KP500000000, T1h, T1g);
+		    io[WS(os, 3)] = T1f + T1i;
+		    io[0] = T1g + T1h;
+		    io[WS(os, 6)] = T1i - T1f;
+	       }
+	       {
+		    E Tl, TS, TI, TN, TM, TT, TJ, TU;
+		    Tl = Th + Tk;
+		    TS = TO + TR;
+		    {
+			 E Tw, TH, TK, TL;
+			 Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
+			 TH = FMA(KP173648177, TB, KP984807753 * TG);
+			 TI = Tw + TH;
+			 TN = KP866025403 * (TH - Tw);
+			 TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
+			 TL = FNMS(KP984807753, TB, KP173648177 * TG);
+			 TM = KP866025403 * (TK - TL);
+			 TT = TK + TL;
+		    }
+		    ro[WS(os, 1)] = Tl + TI;
+		    io[WS(os, 1)] = TS + TT;
+		    TJ = FNMS(KP500000000, TI, Tl);
+		    ro[WS(os, 7)] = TJ - TM;
+		    ro[WS(os, 4)] = TJ + TM;
+		    TU = FNMS(KP500000000, TT, TS);
+		    io[WS(os, 4)] = TN + TU;
+		    io[WS(os, 7)] = TU - TN;
+	       }
+	       {
+		    E TV, T14, T12, T13, T17, T1a, T18, T19;
+		    TV = Th - Tk;
+		    T14 = TR - TO;
+		    {
+			 E TY, T11, T15, T16;
+			 TY = FMA(KP173648177, TW, KP984807753 * TX);
+			 T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
+			 T12 = TY + T11;
+			 T13 = KP866025403 * (T11 - TY);
+			 T15 = FNMS(KP984807753, TW, KP173648177 * TX);
+			 T16 = FMA(KP342020143, T10, KP939692620 * TZ);
+			 T17 = T15 - T16;
+			 T1a = KP866025403 * (T15 + T16);
+		    }
+		    ro[WS(os, 2)] = TV + T12;
+		    io[WS(os, 2)] = T14 + T17;
+		    T18 = FNMS(KP500000000, T17, T14);
+		    io[WS(os, 5)] = T13 + T18;
+		    io[WS(os, 8)] = T18 - T13;
+		    T19 = FNMS(KP500000000, T12, TV);
+		    ro[WS(os, 8)] = T19 - T1a;
+		    ro[WS(os, 5)] = T19 + T1a;
+	       }
+	  }
+     }
+}
+
+static const kdft_desc desc = { 9, "n1_9", {60, 20, 20, 0}, &GENUS, 0, 0, 0, 0 };
+
+void X(codelet_n1_9) (planner *p) {
+     X(kdft_register) (p, n1_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:17 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include q.h */
+
+/*
+ * This function contains 12 FP additions, 8 FP multiplications,
+ * (or, 8 additions, 4 multiplications, 4 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "q.h"
+
+static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T9, T6, T5;
+	       {
+		    E T1, T2, T7, T8, Tb, T4, Tc, Th, Ti, Te, Tj, Td, Tg;
+		    T1 = rio[0];
+		    T2 = rio[WS(rs, 1)];
+		    T7 = iio[0];
+		    T8 = iio[WS(rs, 1)];
+		    Tb = rio[WS(vs, 1)];
+		    T4 = T1 - T2;
+		    Tc = rio[WS(vs, 1) + WS(rs, 1)];
+		    T9 = T7 - T8;
+		    Th = iio[WS(vs, 1)];
+		    Ti = iio[WS(vs, 1) + WS(rs, 1)];
+		    Te = Tb - Tc;
+		    rio[0] = T1 + T2;
+		    iio[0] = T7 + T8;
+		    Tj = Th - Ti;
+		    rio[WS(rs, 1)] = Tb + Tc;
+		    iio[WS(rs, 1)] = Th + Ti;
+		    Td = W[0];
+		    Tg = W[1];
+		    {
+			 E T3, Tk, Tf, Ta;
+			 T3 = W[0];
+			 T6 = W[1];
+			 Tk = Td * Tj;
+			 Tf = Td * Te;
+			 Ta = T3 * T9;
+			 T5 = T3 * T4;
+			 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tg, Te, Tk);
+			 rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tg, Tj, Tf);
+			 iio[WS(vs, 1)] = FNMS(T6, T4, Ta);
+		    }
+	       }
+	       rio[WS(vs, 1)] = FMA(T6, T9, T5);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, {8, 4, 4, 0}, 0, 0, 0 };
+
+void X(codelet_q1_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include q.h */
+
+/*
+ * This function contains 12 FP additions, 8 FP multiplications,
+ * (or, 8 additions, 4 multiplications, 4 fused multiply/add),
+ * 17 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "q.h"
+
+static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T1, T2, T4, T6, T7, T8, T9, Ta, Tc, Te, Tf, Tg;
+	       T1 = rio[0];
+	       T2 = rio[WS(rs, 1)];
+	       T4 = T1 - T2;
+	       T6 = iio[0];
+	       T7 = iio[WS(rs, 1)];
+	       T8 = T6 - T7;
+	       T9 = rio[WS(vs, 1)];
+	       Ta = rio[WS(vs, 1) + WS(rs, 1)];
+	       Tc = T9 - Ta;
+	       Te = iio[WS(vs, 1)];
+	       Tf = iio[WS(vs, 1) + WS(rs, 1)];
+	       Tg = Te - Tf;
+	       rio[0] = T1 + T2;
+	       iio[0] = T6 + T7;
+	       rio[WS(rs, 1)] = T9 + Ta;
+	       iio[WS(rs, 1)] = Te + Tf;
+	       {
+		    E Tb, Td, T3, T5;
+		    Tb = W[0];
+		    Td = W[1];
+		    rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tb, Tc, Td * Tg);
+		    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Td, Tc, Tb * Tg);
+		    T3 = W[0];
+		    T5 = W[1];
+		    rio[WS(vs, 1)] = FMA(T3, T4, T5 * T8);
+		    iio[WS(vs, 1)] = FNMS(T5, T4, T3 * T8);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, {8, 4, 4, 0}, 0, 0, 0 };
+
+void X(codelet_q1_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include q.h */
+
+/*
+ * This function contains 48 FP additions, 42 FP multiplications,
+ * (or, 18 additions, 12 multiplications, 30 fused multiply/add),
+ * 56 stack variables, 2 constants, and 36 memory accesses
+ */
+#include "q.h"
+
+static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E Tk, Tn, Tm, To, Tl;
+	       {
+		    E T1, Td, T4, Tg, Tp, T9, Te, T6, Tf, TB, TE, Ts, TZ, Tu, Tx;
+		    E TC, TN, TO, TD, TV, T10, TP, Tq, Tr;
+		    {
+			 E T2, T3, T7, T8;
+			 T1 = rio[0];
+			 T2 = rio[WS(rs, 1)];
+			 T3 = rio[WS(rs, 2)];
+			 Td = iio[0];
+			 T7 = iio[WS(rs, 1)];
+			 T8 = iio[WS(rs, 2)];
+			 T4 = T2 + T3;
+			 Tg = T3 - T2;
+			 Tp = rio[WS(vs, 1)];
+			 T9 = T7 - T8;
+			 Te = T7 + T8;
+			 T6 = FNMS(KP500000000, T4, T1);
+			 Tq = rio[WS(vs, 1) + WS(rs, 1)];
+			 Tr = rio[WS(vs, 1) + WS(rs, 2)];
+			 Tf = FNMS(KP500000000, Te, Td);
+		    }
+		    {
+			 E Tv, Tw, TT, TU;
+			 TB = iio[WS(vs, 1)];
+			 Tv = iio[WS(vs, 1) + WS(rs, 1)];
+			 TE = Tr - Tq;
+			 Ts = Tq + Tr;
+			 Tw = iio[WS(vs, 1) + WS(rs, 2)];
+			 TZ = iio[WS(vs, 2)];
+			 TT = iio[WS(vs, 2) + WS(rs, 1)];
+			 Tu = FNMS(KP500000000, Ts, Tp);
+			 Tx = Tv - Tw;
+			 TC = Tv + Tw;
+			 TU = iio[WS(vs, 2) + WS(rs, 2)];
+			 TN = rio[WS(vs, 2)];
+			 TO = rio[WS(vs, 2) + WS(rs, 1)];
+			 TD = FNMS(KP500000000, TC, TB);
+			 TV = TT - TU;
+			 T10 = TT + TU;
+			 TP = rio[WS(vs, 2) + WS(rs, 2)];
+		    }
+		    {
+			 E T11, T12, TS, TQ;
+			 rio[0] = T1 + T4;
+			 iio[0] = Td + Te;
+			 T11 = FNMS(KP500000000, T10, TZ);
+			 T12 = TP - TO;
+			 TQ = TO + TP;
+			 rio[WS(rs, 1)] = Tp + Ts;
+			 iio[WS(rs, 1)] = TB + TC;
+			 iio[WS(rs, 2)] = TZ + T10;
+			 TS = FNMS(KP500000000, TQ, TN);
+			 rio[WS(rs, 2)] = TN + TQ;
+			 {
+			      E TW, T13, Ty, TI, TL, TF, TH, TK;
+			      {
+				   E Ta, Th, T5, Tc;
+				   Tk = FNMS(KP866025403, T9, T6);
+				   Ta = FMA(KP866025403, T9, T6);
+				   Th = FMA(KP866025403, Tg, Tf);
+				   Tn = FNMS(KP866025403, Tg, Tf);
+				   T5 = W[0];
+				   Tc = W[1];
+				   {
+					E T16, T19, T18, T1a, T17, Ti, Tb, T15;
+					TW = FMA(KP866025403, TV, TS);
+					T16 = FNMS(KP866025403, TV, TS);
+					T19 = FNMS(KP866025403, T12, T11);
+					T13 = FMA(KP866025403, T12, T11);
+					Ti = T5 * Th;
+					Tb = T5 * Ta;
+					T15 = W[2];
+					T18 = W[3];
+					iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
+					rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
+					T1a = T15 * T19;
+					T17 = T15 * T16;
+					Ty = FMA(KP866025403, Tx, Tu);
+					TI = FNMS(KP866025403, Tx, Tu);
+					TL = FNMS(KP866025403, TE, TD);
+					TF = FMA(KP866025403, TE, TD);
+					iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
+					rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
+					TH = W[2];
+					TK = W[3];
+				   }
+			      }
+			      {
+				   E TA, TG, Tz, TM, TJ, Tt;
+				   TM = TH * TL;
+				   TJ = TH * TI;
+				   Tt = W[0];
+				   TA = W[1];
+				   iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
+				   rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
+				   TG = Tt * TF;
+				   Tz = Tt * Ty;
+				   {
+					E TR, TY, T14, TX, Tj;
+					iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
+					rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
+					TR = W[0];
+					TY = W[1];
+					T14 = TR * T13;
+					TX = TR * TW;
+					Tj = W[2];
+					Tm = W[3];
+					iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
+					rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
+					To = Tj * Tn;
+					Tl = Tj * Tk;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
+	       rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {18, 12, 30, 0}, 0, 0, 0 };
+
+void X(codelet_q1_3) (planner *p) {
+     X(kdft_difsq_register) (p, q1_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include q.h */
+
+/*
+ * This function contains 48 FP additions, 36 FP multiplications,
+ * (or, 30 additions, 18 multiplications, 18 fused multiply/add),
+ * 35 stack variables, 2 constants, and 36 memory accesses
+ */
+#include "q.h"
+
+static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
+	       E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;
+	       {
+		    E T2, T3, Tr, Ts;
+		    T1 = rio[0];
+		    T2 = rio[WS(rs, 1)];
+		    T3 = rio[WS(rs, 2)];
+		    T4 = T2 + T3;
+		    T6 = FNMS(KP500000000, T4, T1);
+		    Tc = KP866025403 * (T3 - T2);
+		    {
+			 E T7, T8, Tm, Tn;
+			 Td = iio[0];
+			 T7 = iio[WS(rs, 1)];
+			 T8 = iio[WS(rs, 2)];
+			 Te = T7 + T8;
+			 T9 = KP866025403 * (T7 - T8);
+			 Tf = FNMS(KP500000000, Te, Td);
+			 Tl = rio[WS(vs, 1)];
+			 Tm = rio[WS(vs, 1) + WS(rs, 1)];
+			 Tn = rio[WS(vs, 1) + WS(rs, 2)];
+			 To = Tm + Tn;
+			 Tq = FNMS(KP500000000, To, Tl);
+			 Tw = KP866025403 * (Tn - Tm);
+		    }
+		    Tx = iio[WS(vs, 1)];
+		    Tr = iio[WS(vs, 1) + WS(rs, 1)];
+		    Ts = iio[WS(vs, 1) + WS(rs, 2)];
+		    Ty = Tr + Ts;
+		    Tt = KP866025403 * (Tr - Ts);
+		    Tz = FNMS(KP500000000, Ty, Tx);
+		    {
+			 E TL, TM, TG, TH;
+			 TR = iio[WS(vs, 2)];
+			 TL = iio[WS(vs, 2) + WS(rs, 1)];
+			 TM = iio[WS(vs, 2) + WS(rs, 2)];
+			 TS = TL + TM;
+			 TN = KP866025403 * (TL - TM);
+			 TT = FNMS(KP500000000, TS, TR);
+			 TF = rio[WS(vs, 2)];
+			 TG = rio[WS(vs, 2) + WS(rs, 1)];
+			 TH = rio[WS(vs, 2) + WS(rs, 2)];
+			 TI = TG + TH;
+			 TK = FNMS(KP500000000, TI, TF);
+			 TQ = KP866025403 * (TH - TG);
+		    }
+	       }
+	       rio[0] = T1 + T4;
+	       iio[0] = Td + Te;
+	       rio[WS(rs, 1)] = Tl + To;
+	       iio[WS(rs, 1)] = Tx + Ty;
+	       iio[WS(rs, 2)] = TR + TS;
+	       rio[WS(rs, 2)] = TF + TI;
+	       {
+		    E Ta, Tg, T5, Tb;
+		    Ta = T6 + T9;
+		    Tg = Tc + Tf;
+		    T5 = W[0];
+		    Tb = W[1];
+		    rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
+		    iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
+	       }
+	       {
+		    E TW, TY, TV, TX;
+		    TW = TK - TN;
+		    TY = TT - TQ;
+		    TV = W[2];
+		    TX = W[3];
+		    rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
+		    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
+	       }
+	       {
+		    E TC, TE, TB, TD;
+		    TC = Tq - Tt;
+		    TE = Tz - Tw;
+		    TB = W[2];
+		    TD = W[3];
+		    rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
+		    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
+	       }
+	       {
+		    E Tu, TA, Tp, Tv;
+		    Tu = Tq + Tt;
+		    TA = Tw + Tz;
+		    Tp = W[0];
+		    Tv = W[1];
+		    rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
+		    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
+	       }
+	       {
+		    E TO, TU, TJ, TP;
+		    TO = TK + TN;
+		    TU = TQ + TT;
+		    TJ = W[0];
+		    TP = W[1];
+		    rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
+		    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
+	       }
+	       {
+		    E Ti, Tk, Th, Tj;
+		    Ti = T6 - T9;
+		    Tk = Tf - Tc;
+		    Th = W[2];
+		    Tj = W[3];
+		    rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
+		    iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {30, 18, 18, 0}, 0, 0, 0 };
+
+void X(codelet_q1_3) (planner *p) {
+     X(kdft_difsq_register) (p, q1_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,518 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:17 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include q.h */
+
+/*
+ * This function contains 88 FP additions, 48 FP multiplications,
+ * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
+ * 76 stack variables, 0 constants, and 64 memory accesses
+ */
+#include "q.h"
+
+static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T1X, T1S, T1L, T1Y, T1R;
+	       {
+		    E T3, Tf, Tv, Ti, Tw, Tx, T6, Tm, Tc, Ts, T1T, T1H, T29, T1W, T2a;
+		    E T2b, T1K, T20, T1Q, T26, TN, TB, T13, TQ, T14, T15, TE, TU, TK, T10;
+		    E T1l, T19, T1a, T1h, T1B, T1o, T1C, T1b, T1D, T1e, T1c;
+		    {
+			 E T1I, T1P, T1J, T1M;
+			 {
+			      E Tb, T4, T5, T8;
+			      {
+				   E T1, T2, T9, Ta, Tg, Th;
+				   T1 = rio[0];
+				   T2 = rio[WS(rs, 2)];
+				   T9 = iio[0];
+				   Ta = iio[WS(rs, 2)];
+				   Tg = iio[WS(rs, 1)];
+				   T3 = T1 + T2;
+				   Tf = T1 - T2;
+				   Th = iio[WS(rs, 3)];
+				   Tv = T9 + Ta;
+				   Tb = T9 - Ta;
+				   T4 = rio[WS(rs, 1)];
+				   Ti = Tg - Th;
+				   Tw = Tg + Th;
+				   T5 = rio[WS(rs, 3)];
+			      }
+			      Tx = Tv - Tw;
+			      T8 = T4 - T5;
+			      T6 = T4 + T5;
+			      {
+				   E T1N, T1O, T1F, T1G, T1U, T1V;
+				   T1F = rio[WS(vs, 3)];
+				   T1G = rio[WS(vs, 3) + WS(rs, 2)];
+				   Tm = Tb - T8;
+				   Tc = T8 + Tb;
+				   Ts = T3 - T6;
+				   T1T = T1F - T1G;
+				   T1H = T1F + T1G;
+				   T1N = iio[WS(vs, 3)];
+				   T1O = iio[WS(vs, 3) + WS(rs, 2)];
+				   T1U = iio[WS(vs, 3) + WS(rs, 1)];
+				   T1V = iio[WS(vs, 3) + WS(rs, 3)];
+				   T1I = rio[WS(vs, 3) + WS(rs, 1)];
+				   T1P = T1N - T1O;
+				   T29 = T1N + T1O;
+				   T1W = T1U - T1V;
+				   T2a = T1U + T1V;
+				   T1J = rio[WS(vs, 3) + WS(rs, 3)];
+			      }
+			 }
+			 T2b = T29 - T2a;
+			 T1M = T1I - T1J;
+			 T1K = T1I + T1J;
+			 {
+			      E TC, TJ, TD, TG;
+			      {
+				   E TH, TI, Tz, TA, TO, TP;
+				   Tz = rio[WS(vs, 1)];
+				   TA = rio[WS(vs, 1) + WS(rs, 2)];
+				   T20 = T1P - T1M;
+				   T1Q = T1M + T1P;
+				   T26 = T1H - T1K;
+				   TN = Tz - TA;
+				   TB = Tz + TA;
+				   TH = iio[WS(vs, 1)];
+				   TI = iio[WS(vs, 1) + WS(rs, 2)];
+				   TO = iio[WS(vs, 1) + WS(rs, 1)];
+				   TP = iio[WS(vs, 1) + WS(rs, 3)];
+				   TC = rio[WS(vs, 1) + WS(rs, 1)];
+				   TJ = TH - TI;
+				   T13 = TH + TI;
+				   TQ = TO - TP;
+				   T14 = TO + TP;
+				   TD = rio[WS(vs, 1) + WS(rs, 3)];
+			      }
+			      T15 = T13 - T14;
+			      TG = TC - TD;
+			      TE = TC + TD;
+			      {
+				   E T1f, T1g, T17, T18, T1m, T1n;
+				   T17 = rio[WS(vs, 2)];
+				   T18 = rio[WS(vs, 2) + WS(rs, 2)];
+				   TU = TJ - TG;
+				   TK = TG + TJ;
+				   T10 = TB - TE;
+				   T1l = T17 - T18;
+				   T19 = T17 + T18;
+				   T1f = iio[WS(vs, 2)];
+				   T1g = iio[WS(vs, 2) + WS(rs, 2)];
+				   T1m = iio[WS(vs, 2) + WS(rs, 1)];
+				   T1n = iio[WS(vs, 2) + WS(rs, 3)];
+				   T1a = rio[WS(vs, 2) + WS(rs, 1)];
+				   T1h = T1f - T1g;
+				   T1B = T1f + T1g;
+				   T1o = T1m - T1n;
+				   T1C = T1m + T1n;
+				   T1b = rio[WS(vs, 2) + WS(rs, 3)];
+			      }
+			 }
+		    }
+		    T1D = T1B - T1C;
+		    T1e = T1a - T1b;
+		    T1c = T1a + T1b;
+		    {
+			 E T1s, T1i, T1y, T28, T27, Tr, Tu;
+			 rio[0] = T3 + T6;
+			 iio[0] = Tv + Tw;
+			 T1s = T1h - T1e;
+			 T1i = T1e + T1h;
+			 T1y = T19 - T1c;
+			 rio[WS(rs, 1)] = TB + TE;
+			 iio[WS(rs, 1)] = T13 + T14;
+			 rio[WS(rs, 2)] = T19 + T1c;
+			 iio[WS(rs, 2)] = T1B + T1C;
+			 iio[WS(rs, 3)] = T29 + T2a;
+			 rio[WS(rs, 3)] = T1H + T1K;
+			 Tr = W[2];
+			 Tu = W[3];
+			 {
+			      E T25, Ty, Tt, T2c;
+			      T25 = W[2];
+			      T28 = W[3];
+			      Ty = Tr * Tx;
+			      Tt = Tr * Ts;
+			      T2c = T25 * T2b;
+			      T27 = T25 * T26;
+			      iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
+			      rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
+			      iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
+			 }
+			 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
+			 {
+			      E Tp, T1v, T23, T22, T1Z, TR, TM, TF;
+			      {
+				   E T1A, T1z, TZ, T12;
+				   TZ = W[2];
+				   T12 = W[3];
+				   {
+					E T1x, T16, T11, T1E;
+					T1x = W[2];
+					T1A = W[3];
+					T16 = TZ * T15;
+					T11 = TZ * T10;
+					T1E = T1x * T1D;
+					T1z = T1x * T1y;
+					iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
+					rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
+					iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
+				   }
+				   rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
+				   {
+					E Tj, Te, T7, T1p, T1k, T1j;
+					Tp = Tf + Ti;
+					Tj = Tf - Ti;
+					Te = W[5];
+					T7 = W[4];
+					{
+					     E T1d, T1q, Tk, Td;
+					     T1p = T1l - T1o;
+					     T1v = T1l + T1o;
+					     T1k = W[5];
+					     Tk = Te * Tc;
+					     Td = T7 * Tc;
+					     T1d = W[4];
+					     T1q = T1k * T1i;
+					     rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
+					     iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
+					     T1j = T1d * T1i;
+					     rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
+					}
+					T23 = T1T + T1W;
+					T1X = T1T - T1W;
+					T22 = W[1];
+					iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
+					T1Z = W[0];
+				   }
+			      }
+			      {
+				   E TX, TW, TT, TY, TV, T24, T21;
+				   TX = TN + TQ;
+				   TR = TN - TQ;
+				   T24 = T22 * T20;
+				   TW = W[1];
+				   T21 = T1Z * T20;
+				   TT = W[0];
+				   rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
+				   TY = TW * TU;
+				   iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
+				   TV = TT * TU;
+				   rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
+				   TM = W[5];
+				   iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
+				   TF = W[4];
+			      }
+			      {
+				   E To, Tl, Tq, Tn, TS, TL;
+				   TS = TM * TK;
+				   To = W[1];
+				   TL = TF * TK;
+				   Tl = W[0];
+				   rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
+				   Tq = To * Tm;
+				   iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
+				   Tn = Tl * Tm;
+				   {
+					E T1u, T1r, T1w, T1t;
+					rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
+					T1u = W[1];
+					iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
+					T1r = W[0];
+					T1w = T1u * T1s;
+					T1S = W[5];
+					T1t = T1r * T1s;
+					T1L = W[4];
+					rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
+					T1Y = T1S * T1Q;
+					iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
+					T1R = T1L * T1Q;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
+	       iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
+
+void X(codelet_q1_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include q.h */
+
+/*
+ * This function contains 88 FP additions, 48 FP multiplications,
+ * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
+ * 37 stack variables, 0 constants, and 64 memory accesses
+ */
+#include "q.h"
+
+static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
+	       E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
+	       E T1D, T1N;
+	       {
+		    E T1, T2, T9, Ta;
+		    T1 = rio[0];
+		    T2 = rio[WS(rs, 2)];
+		    T3 = T1 + T2;
+		    Te = T1 - T2;
+		    T9 = iio[0];
+		    Ta = iio[WS(rs, 2)];
+		    Tb = T9 - Ta;
+		    Tq = T9 + Ta;
+	       }
+	       {
+		    E T4, T5, Tf, Tg;
+		    T4 = rio[WS(rs, 1)];
+		    T5 = rio[WS(rs, 3)];
+		    T6 = T4 + T5;
+		    T8 = T4 - T5;
+		    Tf = iio[WS(rs, 1)];
+		    Tg = iio[WS(rs, 3)];
+		    Th = Tf - Tg;
+		    Tr = Tf + Tg;
+	       }
+	       {
+		    E Tt, Tu, TB, TC;
+		    Tt = rio[WS(vs, 1)];
+		    Tu = rio[WS(vs, 1) + WS(rs, 2)];
+		    Tv = Tt + Tu;
+		    TG = Tt - Tu;
+		    TB = iio[WS(vs, 1)];
+		    TC = iio[WS(vs, 1) + WS(rs, 2)];
+		    TD = TB - TC;
+		    TS = TB + TC;
+	       }
+	       {
+		    E Tw, Tx, TH, TI;
+		    Tw = rio[WS(vs, 1) + WS(rs, 1)];
+		    Tx = rio[WS(vs, 1) + WS(rs, 3)];
+		    Ty = Tw + Tx;
+		    TA = Tw - Tx;
+		    TH = iio[WS(vs, 1) + WS(rs, 1)];
+		    TI = iio[WS(vs, 1) + WS(rs, 3)];
+		    TJ = TH - TI;
+		    TT = TH + TI;
+	       }
+	       {
+		    E TV, TW, T13, T14;
+		    TV = rio[WS(vs, 2)];
+		    TW = rio[WS(vs, 2) + WS(rs, 2)];
+		    TX = TV + TW;
+		    T18 = TV - TW;
+		    T13 = iio[WS(vs, 2)];
+		    T14 = iio[WS(vs, 2) + WS(rs, 2)];
+		    T15 = T13 - T14;
+		    T1k = T13 + T14;
+	       }
+	       {
+		    E TY, TZ, T19, T1a;
+		    TY = rio[WS(vs, 2) + WS(rs, 1)];
+		    TZ = rio[WS(vs, 2) + WS(rs, 3)];
+		    T10 = TY + TZ;
+		    T12 = TY - TZ;
+		    T19 = iio[WS(vs, 2) + WS(rs, 1)];
+		    T1a = iio[WS(vs, 2) + WS(rs, 3)];
+		    T1b = T19 - T1a;
+		    T1l = T19 + T1a;
+	       }
+	       {
+		    E T1n, T1o, T1v, T1w;
+		    T1n = rio[WS(vs, 3)];
+		    T1o = rio[WS(vs, 3) + WS(rs, 2)];
+		    T1p = T1n + T1o;
+		    T1A = T1n - T1o;
+		    T1v = iio[WS(vs, 3)];
+		    T1w = iio[WS(vs, 3) + WS(rs, 2)];
+		    T1x = T1v - T1w;
+		    T1M = T1v + T1w;
+	       }
+	       {
+		    E T1q, T1r, T1B, T1C;
+		    T1q = rio[WS(vs, 3) + WS(rs, 1)];
+		    T1r = rio[WS(vs, 3) + WS(rs, 3)];
+		    T1s = T1q + T1r;
+		    T1u = T1q - T1r;
+		    T1B = iio[WS(vs, 3) + WS(rs, 1)];
+		    T1C = iio[WS(vs, 3) + WS(rs, 3)];
+		    T1D = T1B - T1C;
+		    T1N = T1B + T1C;
+	       }
+	       rio[0] = T3 + T6;
+	       iio[0] = Tq + Tr;
+	       rio[WS(rs, 1)] = Tv + Ty;
+	       iio[WS(rs, 1)] = TS + TT;
+	       rio[WS(rs, 2)] = TX + T10;
+	       iio[WS(rs, 2)] = T1k + T1l;
+	       iio[WS(rs, 3)] = T1M + T1N;
+	       rio[WS(rs, 3)] = T1p + T1s;
+	       {
+		    E Tc, Ti, T7, Td;
+		    Tc = T8 + Tb;
+		    Ti = Te - Th;
+		    T7 = W[4];
+		    Td = W[5];
+		    iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
+		    rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
+	       }
+	       {
+		    E T1K, T1O, T1J, T1L;
+		    T1K = T1p - T1s;
+		    T1O = T1M - T1N;
+		    T1J = W[2];
+		    T1L = W[3];
+		    rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
+		    iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
+	       }
+	       {
+		    E Tk, Tm, Tj, Tl;
+		    Tk = Tb - T8;
+		    Tm = Te + Th;
+		    Tj = W[0];
+		    Tl = W[1];
+		    iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
+		    rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
+	       }
+	       {
+		    E To, Ts, Tn, Tp;
+		    To = T3 - T6;
+		    Ts = Tq - Tr;
+		    Tn = W[2];
+		    Tp = W[3];
+		    rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
+		    iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
+	       }
+	       {
+		    E T16, T1c, T11, T17;
+		    T16 = T12 + T15;
+		    T1c = T18 - T1b;
+		    T11 = W[4];
+		    T17 = W[5];
+		    iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
+		    rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
+	       }
+	       {
+		    E T1G, T1I, T1F, T1H;
+		    T1G = T1x - T1u;
+		    T1I = T1A + T1D;
+		    T1F = W[0];
+		    T1H = W[1];
+		    iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
+		    rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
+	       }
+	       {
+		    E TQ, TU, TP, TR;
+		    TQ = Tv - Ty;
+		    TU = TS - TT;
+		    TP = W[2];
+		    TR = W[3];
+		    rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
+		    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
+	       }
+	       {
+		    E T1e, T1g, T1d, T1f;
+		    T1e = T15 - T12;
+		    T1g = T18 + T1b;
+		    T1d = W[0];
+		    T1f = W[1];
+		    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
+		    rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
+	       }
+	       {
+		    E T1i, T1m, T1h, T1j;
+		    T1i = TX - T10;
+		    T1m = T1k - T1l;
+		    T1h = W[2];
+		    T1j = W[3];
+		    rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
+		    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
+	       }
+	       {
+		    E T1y, T1E, T1t, T1z;
+		    T1y = T1u + T1x;
+		    T1E = T1A - T1D;
+		    T1t = W[4];
+		    T1z = W[5];
+		    iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
+		    rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
+	       }
+	       {
+		    E TM, TO, TL, TN;
+		    TM = TD - TA;
+		    TO = TG + TJ;
+		    TL = W[0];
+		    TN = W[1];
+		    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
+		    rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
+	       }
+	       {
+		    E TE, TK, Tz, TF;
+		    TE = TA + TD;
+		    TK = TG - TJ;
+		    Tz = W[4];
+		    TF = W[5];
+		    iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
+		    rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
+
+void X(codelet_q1_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,983 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include q.h */
+
+/*
+ * This function contains 200 FP additions, 170 FP multiplications,
+ * (or, 70 additions, 40 multiplications, 130 fused multiply/add),
+ * 104 stack variables, 4 constants, and 100 memory accesses
+ */
+#include "q.h"
+
+static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T1x, T1w, T1v;
+	       {
+		    E T1, Tn, TM, Tw, Tb, T8, Ta, TV, Tq, Ts, TH, Tj, Tr, T1h, T1q;
+		    E T1G, T12, T15, T1P, T14, T1k, T1m, T1B, T1d, T1l, T2b, T2k, T2A, T1W, T1Z;
+		    E T3Z, T1Y, T2e, T2g, T2v, T27, T2f, T3D, T42, T44, T4j, T3V, T43, T2J, T48;
+		    E T4o, T3K, T3N, T35, T3M, T2V, T3e, T3u, T2Q, T2T, T37, T30, T2S, T2W;
+		    {
+			 E T1Q, T2j, T1V, T1R;
+			 {
+			      E Tp, Ti, Td, Te;
+			      {
+				   E T5, T6, T2, T3, T7, Tv;
+				   T1 = rio[0];
+				   T5 = rio[WS(rs, 2)];
+				   T6 = rio[WS(rs, 3)];
+				   T2 = rio[WS(rs, 1)];
+				   T3 = rio[WS(rs, 4)];
+				   Tn = iio[0];
+				   T7 = T5 + T6;
+				   Tv = T5 - T6;
+				   {
+					E T4, Tu, Tg, Th;
+					T4 = T2 + T3;
+					Tu = T2 - T3;
+					Tg = iio[WS(rs, 2)];
+					Th = iio[WS(rs, 3)];
+					TM = FNMS(KP618033988, Tu, Tv);
+					Tw = FMA(KP618033988, Tv, Tu);
+					Tb = T4 - T7;
+					T8 = T4 + T7;
+					Tp = Tg + Th;
+					Ti = Tg - Th;
+					Ta = FNMS(KP250000000, T8, T1);
+					Td = iio[WS(rs, 1)];
+					Te = iio[WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E TW, T1p, T11, TX;
+				   TV = rio[WS(vs, 1)];
+				   {
+					E TZ, T10, Tf, To;
+					TZ = rio[WS(vs, 1) + WS(rs, 2)];
+					T10 = rio[WS(vs, 1) + WS(rs, 3)];
+					Tf = Td - Te;
+					To = Td + Te;
+					TW = rio[WS(vs, 1) + WS(rs, 1)];
+					T1p = TZ - T10;
+					T11 = TZ + T10;
+					Tq = To + Tp;
+					Ts = To - Tp;
+					TH = FNMS(KP618033988, Tf, Ti);
+					Tj = FMA(KP618033988, Ti, Tf);
+					Tr = FNMS(KP250000000, Tq, Tn);
+					TX = rio[WS(vs, 1) + WS(rs, 4)];
+				   }
+				   {
+					E T17, T1j, T1c, T18;
+					T1h = iio[WS(vs, 1)];
+					{
+					     E T1a, T1b, TY, T1o;
+					     T1a = iio[WS(vs, 1) + WS(rs, 2)];
+					     T1b = iio[WS(vs, 1) + WS(rs, 3)];
+					     TY = TW + TX;
+					     T1o = TW - TX;
+					     T17 = iio[WS(vs, 1) + WS(rs, 1)];
+					     T1j = T1a + T1b;
+					     T1c = T1a - T1b;
+					     T1q = FMA(KP618033988, T1p, T1o);
+					     T1G = FNMS(KP618033988, T1o, T1p);
+					     T12 = TY + T11;
+					     T15 = TY - T11;
+					     T18 = iio[WS(vs, 1) + WS(rs, 4)];
+					}
+					T1P = rio[WS(vs, 2)];
+					T14 = FNMS(KP250000000, T12, TV);
+					{
+					     E T1T, T1i, T19, T1U;
+					     T1T = rio[WS(vs, 2) + WS(rs, 2)];
+					     T1i = T17 + T18;
+					     T19 = T17 - T18;
+					     T1U = rio[WS(vs, 2) + WS(rs, 3)];
+					     T1Q = rio[WS(vs, 2) + WS(rs, 1)];
+					     T1k = T1i + T1j;
+					     T1m = T1i - T1j;
+					     T1B = FNMS(KP618033988, T19, T1c);
+					     T1d = FMA(KP618033988, T1c, T19);
+					     T2j = T1T - T1U;
+					     T1V = T1T + T1U;
+					     T1l = FNMS(KP250000000, T1k, T1h);
+					     T1R = rio[WS(vs, 2) + WS(rs, 4)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3P, T41, T3U, T3Q;
+			      {
+				   E T21, T2d, T26, T22;
+				   T2b = iio[WS(vs, 2)];
+				   {
+					E T24, T25, T1S, T2i;
+					T24 = iio[WS(vs, 2) + WS(rs, 2)];
+					T25 = iio[WS(vs, 2) + WS(rs, 3)];
+					T1S = T1Q + T1R;
+					T2i = T1Q - T1R;
+					T21 = iio[WS(vs, 2) + WS(rs, 1)];
+					T2d = T24 + T25;
+					T26 = T24 - T25;
+					T2k = FMA(KP618033988, T2j, T2i);
+					T2A = FNMS(KP618033988, T2i, T2j);
+					T1W = T1S + T1V;
+					T1Z = T1S - T1V;
+					T22 = iio[WS(vs, 2) + WS(rs, 4)];
+				   }
+				   T3Z = iio[WS(vs, 4)];
+				   T1Y = FNMS(KP250000000, T1W, T1P);
+				   {
+					E T3S, T2c, T23, T3T;
+					T3S = iio[WS(vs, 4) + WS(rs, 2)];
+					T2c = T21 + T22;
+					T23 = T21 - T22;
+					T3T = iio[WS(vs, 4) + WS(rs, 3)];
+					T3P = iio[WS(vs, 4) + WS(rs, 1)];
+					T2e = T2c + T2d;
+					T2g = T2c - T2d;
+					T2v = FNMS(KP618033988, T23, T26);
+					T27 = FMA(KP618033988, T26, T23);
+					T41 = T3S + T3T;
+					T3U = T3S - T3T;
+					T2f = FNMS(KP250000000, T2e, T2b);
+					T3Q = iio[WS(vs, 4) + WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E T3E, T47, T3J, T3F;
+				   T3D = rio[WS(vs, 4)];
+				   {
+					E T3H, T3I, T3R, T40;
+					T3H = rio[WS(vs, 4) + WS(rs, 2)];
+					T3I = rio[WS(vs, 4) + WS(rs, 3)];
+					T3R = T3P - T3Q;
+					T40 = T3P + T3Q;
+					T3E = rio[WS(vs, 4) + WS(rs, 1)];
+					T47 = T3H - T3I;
+					T3J = T3H + T3I;
+					T42 = T40 + T41;
+					T44 = T40 - T41;
+					T4j = FNMS(KP618033988, T3R, T3U);
+					T3V = FMA(KP618033988, T3U, T3R);
+					T43 = FNMS(KP250000000, T42, T3Z);
+					T3F = rio[WS(vs, 4) + WS(rs, 4)];
+				   }
+				   {
+					E T2K, T3d, T2P, T2L;
+					T2J = rio[WS(vs, 3)];
+					{
+					     E T2N, T2O, T3G, T46;
+					     T2N = rio[WS(vs, 3) + WS(rs, 2)];
+					     T2O = rio[WS(vs, 3) + WS(rs, 3)];
+					     T3G = T3E + T3F;
+					     T46 = T3E - T3F;
+					     T2K = rio[WS(vs, 3) + WS(rs, 1)];
+					     T3d = T2N - T2O;
+					     T2P = T2N + T2O;
+					     T48 = FMA(KP618033988, T47, T46);
+					     T4o = FNMS(KP618033988, T46, T47);
+					     T3K = T3G + T3J;
+					     T3N = T3G - T3J;
+					     T2L = rio[WS(vs, 3) + WS(rs, 4)];
+					}
+					T35 = iio[WS(vs, 3)];
+					T3M = FNMS(KP250000000, T3K, T3D);
+					{
+					     E T2Y, T3c, T2M, T2Z;
+					     T2Y = iio[WS(vs, 3) + WS(rs, 2)];
+					     T3c = T2K - T2L;
+					     T2M = T2K + T2L;
+					     T2Z = iio[WS(vs, 3) + WS(rs, 3)];
+					     T2V = iio[WS(vs, 3) + WS(rs, 1)];
+					     T3e = FMA(KP618033988, T3d, T3c);
+					     T3u = FNMS(KP618033988, T3c, T3d);
+					     T2Q = T2M + T2P;
+					     T2T = T2M - T2P;
+					     T37 = T2Y + T2Z;
+					     T30 = T2Y - T2Z;
+					     T2S = FNMS(KP250000000, T2Q, T2J);
+					     T2W = iio[WS(vs, 3) + WS(rs, 4)];
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3a, T31, T3p, T39, T2X, T36, T38;
+			 rio[0] = T1 + T8;
+			 iio[0] = Tn + Tq;
+			 rio[WS(rs, 1)] = TV + T12;
+			 T2X = T2V - T2W;
+			 T36 = T2V + T2W;
+			 iio[WS(rs, 1)] = T1h + T1k;
+			 rio[WS(rs, 2)] = T1P + T1W;
+			 T3a = T36 - T37;
+			 T38 = T36 + T37;
+			 T31 = FMA(KP618033988, T30, T2X);
+			 T3p = FNMS(KP618033988, T2X, T30);
+			 T39 = FNMS(KP250000000, T38, T35);
+			 iio[WS(rs, 2)] = T2b + T2e;
+			 iio[WS(rs, 4)] = T3Z + T42;
+			 rio[WS(rs, 4)] = T3D + T3K;
+			 rio[WS(rs, 3)] = T2J + T2Q;
+			 iio[WS(rs, 3)] = T35 + T38;
+			 {
+			      E T3O, T45, T2r, T2q, T2p, TT, TS, TR;
+			      {
+				   E TG, TL, TD, TC, TB, Tc, Tt;
+				   TG = FNMS(KP559016994, Tb, Ta);
+				   Tc = FMA(KP559016994, Tb, Ta);
+				   Tt = FMA(KP559016994, Ts, Tr);
+				   TL = FNMS(KP559016994, Ts, Tr);
+				   {
+					E T9, Tm, Tk, TA, Tx;
+					T9 = W[0];
+					Tm = W[1];
+					Tk = FMA(KP951056516, Tj, Tc);
+					TA = FNMS(KP951056516, Tj, Tc);
+					Tx = FNMS(KP951056516, Tw, Tt);
+					TD = FMA(KP951056516, Tw, Tt);
+					{
+					     E Tz, Tl, Ty, TE;
+					     Tz = W[6];
+					     Tl = T9 * Tk;
+					     TC = W[7];
+					     Ty = T9 * Tx;
+					     TE = Tz * TD;
+					     TB = Tz * TA;
+					     rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
+					     iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
+					     iio[WS(vs, 4)] = FNMS(TC, TA, TE);
+					}
+				   }
+				   rio[WS(vs, 4)] = FMA(TC, TD, TB);
+				   {
+					E TF, TK, TI, TQ, TN;
+					TF = W[2];
+					TK = W[3];
+					TI = FNMS(KP951056516, TH, TG);
+					TQ = FMA(KP951056516, TH, TG);
+					TN = FMA(KP951056516, TM, TL);
+					TT = FNMS(KP951056516, TM, TL);
+					{
+					     E TP, TJ, TO, TU;
+					     TP = W[4];
+					     TJ = TF * TI;
+					     TS = W[5];
+					     TO = TF * TN;
+					     TU = TP * TT;
+					     TR = TP * TQ;
+					     rio[WS(vs, 2)] = FMA(TK, TN, TJ);
+					     iio[WS(vs, 2)] = FNMS(TK, TI, TO);
+					     iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
+					}
+				   }
+			      }
+			      rio[WS(vs, 3)] = FMA(TS, TT, TR);
+			      {
+				   E T20, T2h, T2H, T2G, T2F, T2u, T2z;
+				   T20 = FMA(KP559016994, T1Z, T1Y);
+				   T2u = FNMS(KP559016994, T1Z, T1Y);
+				   T2z = FNMS(KP559016994, T2g, T2f);
+				   T2h = FMA(KP559016994, T2g, T2f);
+				   {
+					E T2t, T2y, T2w, T2E, T2B;
+					T2t = W[2];
+					T2y = W[3];
+					T2w = FNMS(KP951056516, T2v, T2u);
+					T2E = FMA(KP951056516, T2v, T2u);
+					T2B = FMA(KP951056516, T2A, T2z);
+					T2H = FNMS(KP951056516, T2A, T2z);
+					{
+					     E T2D, T2x, T2C, T2I;
+					     T2D = W[4];
+					     T2x = T2t * T2w;
+					     T2G = W[5];
+					     T2C = T2t * T2B;
+					     T2I = T2D * T2H;
+					     T2F = T2D * T2E;
+					     rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
+					     iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
+					     iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
+					}
+				   }
+				   rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
+				   {
+					E T4v, T4u, T4t, T4i, T4n;
+					T3O = FMA(KP559016994, T3N, T3M);
+					T4i = FNMS(KP559016994, T3N, T3M);
+					T4n = FNMS(KP559016994, T44, T43);
+					T45 = FMA(KP559016994, T44, T43);
+					{
+					     E T4h, T4m, T4k, T4s, T4p;
+					     T4h = W[2];
+					     T4m = W[3];
+					     T4k = FNMS(KP951056516, T4j, T4i);
+					     T4s = FMA(KP951056516, T4j, T4i);
+					     T4p = FMA(KP951056516, T4o, T4n);
+					     T4v = FNMS(KP951056516, T4o, T4n);
+					     {
+						  E T4r, T4l, T4q, T4w;
+						  T4r = W[4];
+						  T4l = T4h * T4k;
+						  T4u = W[5];
+						  T4q = T4h * T4p;
+						  T4w = T4r * T4v;
+						  T4t = T4r * T4s;
+						  rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
+						  iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
+						  iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
+					     }
+					}
+					rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
+					{
+					     E T1X, T2a, T28, T2o, T2l;
+					     T1X = W[0];
+					     T2a = W[1];
+					     T28 = FMA(KP951056516, T27, T20);
+					     T2o = FNMS(KP951056516, T27, T20);
+					     T2l = FNMS(KP951056516, T2k, T2h);
+					     T2r = FMA(KP951056516, T2k, T2h);
+					     {
+						  E T2n, T29, T2m, T2s;
+						  T2n = W[6];
+						  T29 = T1X * T28;
+						  T2q = W[7];
+						  T2m = T1X * T2l;
+						  T2s = T2n * T2r;
+						  T2p = T2n * T2o;
+						  rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
+						  iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
+						  iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
+					     }
+					}
+				   }
+			      }
+			      rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
+			      {
+				   E T3B, T3A, T3z, T4f, T4e, T4d;
+				   {
+					E T3o, T3t, T3l, T3k, T3j, T2U, T3b;
+					T3o = FNMS(KP559016994, T2T, T2S);
+					T2U = FMA(KP559016994, T2T, T2S);
+					T3b = FMA(KP559016994, T3a, T39);
+					T3t = FNMS(KP559016994, T3a, T39);
+					{
+					     E T2R, T34, T32, T3i, T3f;
+					     T2R = W[0];
+					     T34 = W[1];
+					     T32 = FMA(KP951056516, T31, T2U);
+					     T3i = FNMS(KP951056516, T31, T2U);
+					     T3f = FNMS(KP951056516, T3e, T3b);
+					     T3l = FMA(KP951056516, T3e, T3b);
+					     {
+						  E T3h, T33, T3g, T3m;
+						  T3h = W[6];
+						  T33 = T2R * T32;
+						  T3k = W[7];
+						  T3g = T2R * T3f;
+						  T3m = T3h * T3l;
+						  T3j = T3h * T3i;
+						  rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
+						  iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
+						  iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
+					     }
+					}
+					rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
+					{
+					     E T3n, T3s, T3q, T3y, T3v;
+					     T3n = W[2];
+					     T3s = W[3];
+					     T3q = FNMS(KP951056516, T3p, T3o);
+					     T3y = FMA(KP951056516, T3p, T3o);
+					     T3v = FMA(KP951056516, T3u, T3t);
+					     T3B = FNMS(KP951056516, T3u, T3t);
+					     {
+						  E T3x, T3r, T3w, T3C;
+						  T3x = W[4];
+						  T3r = T3n * T3q;
+						  T3A = W[5];
+						  T3w = T3n * T3v;
+						  T3C = T3x * T3B;
+						  T3z = T3x * T3y;
+						  rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
+						  iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
+						  iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
+					     }
+					}
+				   }
+				   rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
+				   {
+					E T3L, T3Y, T3W, T4c, T49;
+					T3L = W[0];
+					T3Y = W[1];
+					T3W = FMA(KP951056516, T3V, T3O);
+					T4c = FNMS(KP951056516, T3V, T3O);
+					T49 = FNMS(KP951056516, T48, T45);
+					T4f = FMA(KP951056516, T48, T45);
+					{
+					     E T4b, T3X, T4a, T4g;
+					     T4b = W[6];
+					     T3X = T3L * T3W;
+					     T4e = W[7];
+					     T4a = T3L * T49;
+					     T4g = T4b * T4f;
+					     T4d = T4b * T4c;
+					     rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
+					     iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
+					     iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
+					}
+				   }
+				   rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
+				   {
+					E T16, T1n, T1N, T1M, T1L, T1A, T1F;
+					T16 = FMA(KP559016994, T15, T14);
+					T1A = FNMS(KP559016994, T15, T14);
+					T1F = FNMS(KP559016994, T1m, T1l);
+					T1n = FMA(KP559016994, T1m, T1l);
+					{
+					     E T1z, T1E, T1C, T1K, T1H;
+					     T1z = W[2];
+					     T1E = W[3];
+					     T1C = FNMS(KP951056516, T1B, T1A);
+					     T1K = FMA(KP951056516, T1B, T1A);
+					     T1H = FMA(KP951056516, T1G, T1F);
+					     T1N = FNMS(KP951056516, T1G, T1F);
+					     {
+						  E T1J, T1D, T1I, T1O;
+						  T1J = W[4];
+						  T1D = T1z * T1C;
+						  T1M = W[5];
+						  T1I = T1z * T1H;
+						  T1O = T1J * T1N;
+						  T1L = T1J * T1K;
+						  rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
+						  iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
+						  iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
+					     }
+					}
+					rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
+					{
+					     E T13, T1g, T1e, T1u, T1r;
+					     T13 = W[0];
+					     T1g = W[1];
+					     T1e = FMA(KP951056516, T1d, T16);
+					     T1u = FNMS(KP951056516, T1d, T16);
+					     T1r = FNMS(KP951056516, T1q, T1n);
+					     T1x = FMA(KP951056516, T1q, T1n);
+					     {
+						  E T1t, T1f, T1s, T1y;
+						  T1t = W[6];
+						  T1f = T13 * T1e;
+						  T1w = W[7];
+						  T1s = T13 * T1r;
+						  T1y = T1t * T1x;
+						  T1v = T1t * T1u;
+						  rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
+						  iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
+						  iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {70, 40, 130, 0}, 0, 0, 0 };
+
+void X(codelet_q1_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include q.h */
+
+/*
+ * This function contains 200 FP additions, 140 FP multiplications,
+ * (or, 130 additions, 70 multiplications, 70 fused multiply/add),
+ * 75 stack variables, 4 constants, and 100 memory accesses
+ */
+#include "q.h"
+
+static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
+	       E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
+	       E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
+	       E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
+	       {
+		    E T7, Tu, T4, Tt;
+		    T1 = rio[0];
+		    {
+			 E T5, T6, T2, T3;
+			 T5 = rio[WS(rs, 2)];
+			 T6 = rio[WS(rs, 3)];
+			 T7 = T5 + T6;
+			 Tu = T5 - T6;
+			 T2 = rio[WS(rs, 1)];
+			 T3 = rio[WS(rs, 4)];
+			 T4 = T2 + T3;
+			 Tt = T2 - T3;
+		    }
+		    Ta = KP559016994 * (T4 - T7);
+		    TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
+		    Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
+		    T8 = T4 + T7;
+		    Tb = FNMS(KP250000000, T8, T1);
+	       }
+	       {
+		    E Ti, Tn, Tf, Tm;
+		    Tp = iio[0];
+		    {
+			 E Tg, Th, Td, Te;
+			 Tg = iio[WS(rs, 2)];
+			 Th = iio[WS(rs, 3)];
+			 Ti = Tg - Th;
+			 Tn = Tg + Th;
+			 Td = iio[WS(rs, 1)];
+			 Te = iio[WS(rs, 4)];
+			 Tf = Td - Te;
+			 Tm = Td + Te;
+		    }
+		    Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
+		    TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
+		    To = KP559016994 * (Tm - Tn);
+		    Tq = Tm + Tn;
+		    Tr = FNMS(KP250000000, Tq, Tp);
+	       }
+	       {
+		    E TT, T1g, TQ, T1f;
+		    TN = rio[WS(vs, 1)];
+		    {
+			 E TR, TS, TO, TP;
+			 TR = rio[WS(vs, 1) + WS(rs, 2)];
+			 TS = rio[WS(vs, 1) + WS(rs, 3)];
+			 TT = TR + TS;
+			 T1g = TR - TS;
+			 TO = rio[WS(vs, 1) + WS(rs, 1)];
+			 TP = rio[WS(vs, 1) + WS(rs, 4)];
+			 TQ = TO + TP;
+			 T1f = TO - TP;
+		    }
+		    TW = KP559016994 * (TQ - TT);
+		    T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
+		    T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
+		    TU = TQ + TT;
+		    TX = FNMS(KP250000000, TU, TN);
+	       }
+	       {
+		    E T14, T19, T11, T18;
+		    T1b = iio[WS(vs, 1)];
+		    {
+			 E T12, T13, TZ, T10;
+			 T12 = iio[WS(vs, 1) + WS(rs, 2)];
+			 T13 = iio[WS(vs, 1) + WS(rs, 3)];
+			 T14 = T12 - T13;
+			 T19 = T12 + T13;
+			 TZ = iio[WS(vs, 1) + WS(rs, 1)];
+			 T10 = iio[WS(vs, 1) + WS(rs, 4)];
+			 T11 = TZ - T10;
+			 T18 = TZ + T10;
+		    }
+		    T15 = FMA(KP951056516, T11, KP587785252 * T14);
+		    T1p = FNMS(KP587785252, T11, KP951056516 * T14);
+		    T1a = KP559016994 * (T18 - T19);
+		    T1c = T18 + T19;
+		    T1d = FNMS(KP250000000, T1c, T1b);
+	       }
+	       {
+		    E T1F, T22, T1C, T21;
+		    T1z = rio[WS(vs, 2)];
+		    {
+			 E T1D, T1E, T1A, T1B;
+			 T1D = rio[WS(vs, 2) + WS(rs, 2)];
+			 T1E = rio[WS(vs, 2) + WS(rs, 3)];
+			 T1F = T1D + T1E;
+			 T22 = T1D - T1E;
+			 T1A = rio[WS(vs, 2) + WS(rs, 1)];
+			 T1B = rio[WS(vs, 2) + WS(rs, 4)];
+			 T1C = T1A + T1B;
+			 T21 = T1A - T1B;
+		    }
+		    T1I = KP559016994 * (T1C - T1F);
+		    T2e = FNMS(KP587785252, T21, KP951056516 * T22);
+		    T23 = FMA(KP951056516, T21, KP587785252 * T22);
+		    T1G = T1C + T1F;
+		    T1J = FNMS(KP250000000, T1G, T1z);
+	       }
+	       {
+		    E T1Q, T1V, T1N, T1U;
+		    T1X = iio[WS(vs, 2)];
+		    {
+			 E T1O, T1P, T1L, T1M;
+			 T1O = iio[WS(vs, 2) + WS(rs, 2)];
+			 T1P = iio[WS(vs, 2) + WS(rs, 3)];
+			 T1Q = T1O - T1P;
+			 T1V = T1O + T1P;
+			 T1L = iio[WS(vs, 2) + WS(rs, 1)];
+			 T1M = iio[WS(vs, 2) + WS(rs, 4)];
+			 T1N = T1L - T1M;
+			 T1U = T1L + T1M;
+		    }
+		    T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
+		    T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
+		    T1W = KP559016994 * (T1U - T1V);
+		    T1Y = T1U + T1V;
+		    T1Z = FNMS(KP250000000, T1Y, T1X);
+	       }
+	       {
+		    E T3o, T3t, T3l, T3s;
+		    T3v = iio[WS(vs, 4)];
+		    {
+			 E T3m, T3n, T3j, T3k;
+			 T3m = iio[WS(vs, 4) + WS(rs, 2)];
+			 T3n = iio[WS(vs, 4) + WS(rs, 3)];
+			 T3o = T3m - T3n;
+			 T3t = T3m + T3n;
+			 T3j = iio[WS(vs, 4) + WS(rs, 1)];
+			 T3k = iio[WS(vs, 4) + WS(rs, 4)];
+			 T3l = T3j - T3k;
+			 T3s = T3j + T3k;
+		    }
+		    T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
+		    T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
+		    T3u = KP559016994 * (T3s - T3t);
+		    T3w = T3s + T3t;
+		    T3x = FNMS(KP250000000, T3w, T3v);
+	       }
+	       {
+		    E T3d, T3A, T3a, T3z;
+		    T37 = rio[WS(vs, 4)];
+		    {
+			 E T3b, T3c, T38, T39;
+			 T3b = rio[WS(vs, 4) + WS(rs, 2)];
+			 T3c = rio[WS(vs, 4) + WS(rs, 3)];
+			 T3d = T3b + T3c;
+			 T3A = T3b - T3c;
+			 T38 = rio[WS(vs, 4) + WS(rs, 1)];
+			 T39 = rio[WS(vs, 4) + WS(rs, 4)];
+			 T3a = T38 + T39;
+			 T3z = T38 - T39;
+		    }
+		    T3g = KP559016994 * (T3a - T3d);
+		    T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
+		    T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
+		    T3e = T3a + T3d;
+		    T3h = FNMS(KP250000000, T3e, T37);
+	       }
+	       {
+		    E T2r, T2O, T2o, T2N;
+		    T2l = rio[WS(vs, 3)];
+		    {
+			 E T2p, T2q, T2m, T2n;
+			 T2p = rio[WS(vs, 3) + WS(rs, 2)];
+			 T2q = rio[WS(vs, 3) + WS(rs, 3)];
+			 T2r = T2p + T2q;
+			 T2O = T2p - T2q;
+			 T2m = rio[WS(vs, 3) + WS(rs, 1)];
+			 T2n = rio[WS(vs, 3) + WS(rs, 4)];
+			 T2o = T2m + T2n;
+			 T2N = T2m - T2n;
+		    }
+		    T2u = KP559016994 * (T2o - T2r);
+		    T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
+		    T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
+		    T2s = T2o + T2r;
+		    T2v = FNMS(KP250000000, T2s, T2l);
+	       }
+	       {
+		    E T2C, T2H, T2z, T2G;
+		    T2J = iio[WS(vs, 3)];
+		    {
+			 E T2A, T2B, T2x, T2y;
+			 T2A = iio[WS(vs, 3) + WS(rs, 2)];
+			 T2B = iio[WS(vs, 3) + WS(rs, 3)];
+			 T2C = T2A - T2B;
+			 T2H = T2A + T2B;
+			 T2x = iio[WS(vs, 3) + WS(rs, 1)];
+			 T2y = iio[WS(vs, 3) + WS(rs, 4)];
+			 T2z = T2x - T2y;
+			 T2G = T2x + T2y;
+		    }
+		    T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
+		    T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
+		    T2I = KP559016994 * (T2G - T2H);
+		    T2K = T2G + T2H;
+		    T2L = FNMS(KP250000000, T2K, T2J);
+	       }
+	       rio[0] = T1 + T8;
+	       iio[0] = Tp + Tq;
+	       rio[WS(rs, 1)] = TN + TU;
+	       iio[WS(rs, 1)] = T1b + T1c;
+	       rio[WS(rs, 2)] = T1z + T1G;
+	       iio[WS(rs, 2)] = T1X + T1Y;
+	       iio[WS(rs, 4)] = T3v + T3w;
+	       rio[WS(rs, 4)] = T37 + T3e;
+	       rio[WS(rs, 3)] = T2l + T2s;
+	       iio[WS(rs, 3)] = T2J + T2K;
+	       {
+		    E Tk, Ty, Tw, TA, Tc, Ts;
+		    Tc = Ta + Tb;
+		    Tk = Tc + Tj;
+		    Ty = Tc - Tj;
+		    Ts = To + Tr;
+		    Tw = Ts - Tv;
+		    TA = Tv + Ts;
+		    {
+			 E T9, Tl, Tx, Tz;
+			 T9 = W[0];
+			 Tl = W[1];
+			 rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
+			 iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
+			 Tx = W[6];
+			 Tz = W[7];
+			 rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
+			 iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
+		    }
+	       }
+	       {
+		    E TE, TK, TI, TM, TC, TH;
+		    TC = Tb - Ta;
+		    TE = TC - TD;
+		    TK = TC + TD;
+		    TH = Tr - To;
+		    TI = TG + TH;
+		    TM = TH - TG;
+		    {
+			 E TB, TF, TJ, TL;
+			 TB = W[2];
+			 TF = W[3];
+			 rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
+			 iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
+			 TJ = W[4];
+			 TL = W[5];
+			 rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
+			 iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
+		    }
+	       }
+	       {
+		    E T2c, T2i, T2g, T2k, T2a, T2f;
+		    T2a = T1J - T1I;
+		    T2c = T2a - T2b;
+		    T2i = T2a + T2b;
+		    T2f = T1Z - T1W;
+		    T2g = T2e + T2f;
+		    T2k = T2f - T2e;
+		    {
+			 E T29, T2d, T2h, T2j;
+			 T29 = W[2];
+			 T2d = W[3];
+			 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
+			 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
+			 T2h = W[4];
+			 T2j = W[5];
+			 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
+			 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
+		    }
+	       }
+	       {
+		    E T3K, T3Q, T3O, T3S, T3I, T3N;
+		    T3I = T3h - T3g;
+		    T3K = T3I - T3J;
+		    T3Q = T3I + T3J;
+		    T3N = T3x - T3u;
+		    T3O = T3M + T3N;
+		    T3S = T3N - T3M;
+		    {
+			 E T3H, T3L, T3P, T3R;
+			 T3H = W[2];
+			 T3L = W[3];
+			 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
+			 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
+			 T3P = W[4];
+			 T3R = W[5];
+			 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
+			 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
+		    }
+	       }
+	       {
+		    E T1S, T26, T24, T28, T1K, T20;
+		    T1K = T1I + T1J;
+		    T1S = T1K + T1R;
+		    T26 = T1K - T1R;
+		    T20 = T1W + T1Z;
+		    T24 = T20 - T23;
+		    T28 = T23 + T20;
+		    {
+			 E T1H, T1T, T25, T27;
+			 T1H = W[0];
+			 T1T = W[1];
+			 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
+			 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
+			 T25 = W[6];
+			 T27 = W[7];
+			 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
+			 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
+		    }
+	       }
+	       {
+		    E T2E, T2S, T2Q, T2U, T2w, T2M;
+		    T2w = T2u + T2v;
+		    T2E = T2w + T2D;
+		    T2S = T2w - T2D;
+		    T2M = T2I + T2L;
+		    T2Q = T2M - T2P;
+		    T2U = T2P + T2M;
+		    {
+			 E T2t, T2F, T2R, T2T;
+			 T2t = W[0];
+			 T2F = W[1];
+			 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
+			 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
+			 T2R = W[6];
+			 T2T = W[7];
+			 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
+			 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
+		    }
+	       }
+	       {
+		    E T2Y, T34, T32, T36, T2W, T31;
+		    T2W = T2v - T2u;
+		    T2Y = T2W - T2X;
+		    T34 = T2W + T2X;
+		    T31 = T2L - T2I;
+		    T32 = T30 + T31;
+		    T36 = T31 - T30;
+		    {
+			 E T2V, T2Z, T33, T35;
+			 T2V = W[2];
+			 T2Z = W[3];
+			 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
+			 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
+			 T33 = W[4];
+			 T35 = W[5];
+			 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
+			 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
+		    }
+	       }
+	       {
+		    E T3q, T3E, T3C, T3G, T3i, T3y;
+		    T3i = T3g + T3h;
+		    T3q = T3i + T3p;
+		    T3E = T3i - T3p;
+		    T3y = T3u + T3x;
+		    T3C = T3y - T3B;
+		    T3G = T3B + T3y;
+		    {
+			 E T3f, T3r, T3D, T3F;
+			 T3f = W[0];
+			 T3r = W[1];
+			 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
+			 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
+			 T3D = W[6];
+			 T3F = W[7];
+			 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
+			 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
+		    }
+	       }
+	       {
+		    E T1q, T1w, T1u, T1y, T1o, T1t;
+		    T1o = TX - TW;
+		    T1q = T1o - T1p;
+		    T1w = T1o + T1p;
+		    T1t = T1d - T1a;
+		    T1u = T1s + T1t;
+		    T1y = T1t - T1s;
+		    {
+			 E T1n, T1r, T1v, T1x;
+			 T1n = W[2];
+			 T1r = W[3];
+			 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
+			 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
+			 T1v = W[4];
+			 T1x = W[5];
+			 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
+			 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
+		    }
+	       }
+	       {
+		    E T16, T1k, T1i, T1m, TY, T1e;
+		    TY = TW + TX;
+		    T16 = TY + T15;
+		    T1k = TY - T15;
+		    T1e = T1a + T1d;
+		    T1i = T1e - T1h;
+		    T1m = T1h + T1e;
+		    {
+			 E TV, T17, T1j, T1l;
+			 TV = W[0];
+			 T17 = W[1];
+			 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
+			 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
+			 T1j = W[6];
+			 T1l = W[7];
+			 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
+			 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {130, 70, 70, 0}, 0, 0, 0 };
+
+void X(codelet_q1_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1313 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:24 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include q.h */
+
+/*
+ * This function contains 276 FP additions, 192 FP multiplications,
+ * (or, 144 additions, 60 multiplications, 132 fused multiply/add),
+ * 129 stack variables, 2 constants, and 144 memory accesses
+ */
+#include "q.h"
+
+static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T4c, T4f, T4e, T4g, T4d;
+	       {
+		    E T3, Tw, Ta, TW, Tg, TG, TM, TT, TU, TP, Tn, T17, TV, TJ, Tv;
+		    E T1A, T1e, T20, T1k, T1K, T1Q, T1X, T1Y, T1T, T1r, T1Z, T1N, T1z, T31, T32;
+		    E T2X, T2v, T2b, T33, T2R, T2D, T2E, T2i, T34, T3f, T2o, T2O, T2U, T3I, T3m;
+		    E T48, T3s, T3S, T3Y, T45, T46, T41, T3z, T4j, T47, T3V, T3H, T4M, T4q, T5c;
+		    E T4w, T4W, T52, T59, T5a, T55, T4D, T5b, T4Z, T4L, T6d, T5r, T6e, T69, T5H;
+		    E T5w, T5n, T6f, T63, T5P, T5s, T5o, T5p;
+		    {
+			 E T2f, T2k, T2g, T2c, T2d;
+			 {
+			      E T1b, T1g, T1c, T18, T19;
+			      {
+				   E T4, Tc, Te, T9, T5;
+				   {
+					E T1, T2, T7, T8;
+					T1 = rio[0];
+					T2 = rio[WS(rs, 3)];
+					T7 = rio[WS(rs, 4)];
+					T8 = rio[WS(rs, 1)];
+					T4 = rio[WS(rs, 2)];
+					Tc = T1 - T2;
+					T3 = T1 + T2;
+					Te = T7 - T8;
+					T9 = T7 + T8;
+					T5 = rio[WS(rs, 5)];
+				   }
+				   {
+					E TN, Tj, Tk, Tl, Tt, Th, Ti;
+					Th = iio[WS(rs, 2)];
+					Ti = iio[WS(rs, 5)];
+					{
+					     E Tr, Ts, Td, T6, Tf;
+					     Tr = iio[0];
+					     Td = T4 - T5;
+					     T6 = T4 + T5;
+					     TN = Th + Ti;
+					     Tj = Th - Ti;
+					     Tf = Td + Te;
+					     Tw = Te - Td;
+					     Ta = T6 + T9;
+					     TW = T9 - T6;
+					     Tg = FNMS(KP500000000, Tf, Tc);
+					     TG = Tc + Tf;
+					     Ts = iio[WS(rs, 3)];
+					     TM = FNMS(KP500000000, Ta, T3);
+					     Tk = iio[WS(rs, 4)];
+					     Tl = iio[WS(rs, 1)];
+					     Tt = Tr - Ts;
+					     TT = Tr + Ts;
+					}
+					{
+					     E T15, TO, Tm, T16, Tu;
+					     T15 = rio[WS(vs, 1)];
+					     TO = Tk + Tl;
+					     Tm = Tk - Tl;
+					     T16 = rio[WS(vs, 1) + WS(rs, 3)];
+					     T1b = rio[WS(vs, 1) + WS(rs, 4)];
+					     TU = TN + TO;
+					     TP = TN - TO;
+					     Tu = Tj + Tm;
+					     Tn = Tj - Tm;
+					     T1g = T15 - T16;
+					     T17 = T15 + T16;
+					     TV = FNMS(KP500000000, TU, TT);
+					     TJ = Tt + Tu;
+					     Tv = FNMS(KP500000000, Tu, Tt);
+					     T1c = rio[WS(vs, 1) + WS(rs, 1)];
+					     T18 = rio[WS(vs, 1) + WS(rs, 2)];
+					     T19 = rio[WS(vs, 1) + WS(rs, 5)];
+					}
+				   }
+			      }
+			      {
+				   E T1v, T1R, T1n, T1w, T1o, T1p;
+				   {
+					E T1l, T1i, T1d, T1h, T1a, T1m, T1j;
+					T1l = iio[WS(vs, 1) + WS(rs, 2)];
+					T1i = T1b - T1c;
+					T1d = T1b + T1c;
+					T1h = T18 - T19;
+					T1a = T18 + T19;
+					T1m = iio[WS(vs, 1) + WS(rs, 5)];
+					T1v = iio[WS(vs, 1)];
+					T1j = T1h + T1i;
+					T1A = T1i - T1h;
+					T1e = T1a + T1d;
+					T20 = T1d - T1a;
+					T1R = T1l + T1m;
+					T1n = T1l - T1m;
+					T1k = FNMS(KP500000000, T1j, T1g);
+					T1K = T1g + T1j;
+					T1Q = FNMS(KP500000000, T1e, T17);
+					T1w = iio[WS(vs, 1) + WS(rs, 3)];
+					T1o = iio[WS(vs, 1) + WS(rs, 4)];
+					T1p = iio[WS(vs, 1) + WS(rs, 1)];
+				   }
+				   {
+					E T2z, T2V, T2r, T2A, T2s, T2t;
+					{
+					     E T2p, T1x, T1S, T1q, T2q, T1y;
+					     T2p = iio[WS(vs, 2) + WS(rs, 2)];
+					     T1X = T1v + T1w;
+					     T1x = T1v - T1w;
+					     T1S = T1o + T1p;
+					     T1q = T1o - T1p;
+					     T2q = iio[WS(vs, 2) + WS(rs, 5)];
+					     T2z = iio[WS(vs, 2)];
+					     T1Y = T1R + T1S;
+					     T1T = T1R - T1S;
+					     T1y = T1n + T1q;
+					     T1r = T1n - T1q;
+					     T2V = T2p + T2q;
+					     T2r = T2p - T2q;
+					     T1Z = FNMS(KP500000000, T1Y, T1X);
+					     T1N = T1x + T1y;
+					     T1z = FNMS(KP500000000, T1y, T1x);
+					     T2A = iio[WS(vs, 2) + WS(rs, 3)];
+					     T2s = iio[WS(vs, 2) + WS(rs, 4)];
+					     T2t = iio[WS(vs, 2) + WS(rs, 1)];
+					}
+					{
+					     E T29, T2B, T2W, T2u, T2a, T2C;
+					     T29 = rio[WS(vs, 2)];
+					     T31 = T2z + T2A;
+					     T2B = T2z - T2A;
+					     T2W = T2s + T2t;
+					     T2u = T2s - T2t;
+					     T2a = rio[WS(vs, 2) + WS(rs, 3)];
+					     T2f = rio[WS(vs, 2) + WS(rs, 4)];
+					     T32 = T2V + T2W;
+					     T2X = T2V - T2W;
+					     T2C = T2r + T2u;
+					     T2v = T2r - T2u;
+					     T2k = T29 - T2a;
+					     T2b = T29 + T2a;
+					     T33 = FNMS(KP500000000, T32, T31);
+					     T2R = T2B + T2C;
+					     T2D = FNMS(KP500000000, T2C, T2B);
+					     T2g = rio[WS(vs, 2) + WS(rs, 1)];
+					     T2c = rio[WS(vs, 2) + WS(rs, 2)];
+					     T2d = rio[WS(vs, 2) + WS(rs, 5)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4n, T4s, T4o, T4k, T4l;
+			      {
+				   E T3j, T3o, T3k, T3g, T3h;
+				   {
+					E T3d, T2m, T2h, T2l, T2e, T3e, T2n;
+					T3d = rio[WS(vs, 3)];
+					T2m = T2f - T2g;
+					T2h = T2f + T2g;
+					T2l = T2c - T2d;
+					T2e = T2c + T2d;
+					T3e = rio[WS(vs, 3) + WS(rs, 3)];
+					T3j = rio[WS(vs, 3) + WS(rs, 4)];
+					T2n = T2l + T2m;
+					T2E = T2m - T2l;
+					T2i = T2e + T2h;
+					T34 = T2h - T2e;
+					T3o = T3d - T3e;
+					T3f = T3d + T3e;
+					T2o = FNMS(KP500000000, T2n, T2k);
+					T2O = T2k + T2n;
+					T2U = FNMS(KP500000000, T2i, T2b);
+					T3k = rio[WS(vs, 3) + WS(rs, 1)];
+					T3g = rio[WS(vs, 3) + WS(rs, 2)];
+					T3h = rio[WS(vs, 3) + WS(rs, 5)];
+				   }
+				   {
+					E T3D, T3Z, T3v, T3E, T3w, T3x;
+					{
+					     E T3t, T3q, T3l, T3p, T3i, T3u, T3r;
+					     T3t = iio[WS(vs, 3) + WS(rs, 2)];
+					     T3q = T3j - T3k;
+					     T3l = T3j + T3k;
+					     T3p = T3g - T3h;
+					     T3i = T3g + T3h;
+					     T3u = iio[WS(vs, 3) + WS(rs, 5)];
+					     T3D = iio[WS(vs, 3)];
+					     T3r = T3p + T3q;
+					     T3I = T3q - T3p;
+					     T3m = T3i + T3l;
+					     T48 = T3l - T3i;
+					     T3Z = T3t + T3u;
+					     T3v = T3t - T3u;
+					     T3s = FNMS(KP500000000, T3r, T3o);
+					     T3S = T3o + T3r;
+					     T3Y = FNMS(KP500000000, T3m, T3f);
+					     T3E = iio[WS(vs, 3) + WS(rs, 3)];
+					     T3w = iio[WS(vs, 3) + WS(rs, 4)];
+					     T3x = iio[WS(vs, 3) + WS(rs, 1)];
+					}
+					{
+					     E T4h, T3F, T40, T3y, T4i, T3G;
+					     T4h = rio[WS(vs, 4)];
+					     T45 = T3D + T3E;
+					     T3F = T3D - T3E;
+					     T40 = T3w + T3x;
+					     T3y = T3w - T3x;
+					     T4i = rio[WS(vs, 4) + WS(rs, 3)];
+					     T4n = rio[WS(vs, 4) + WS(rs, 4)];
+					     T46 = T3Z + T40;
+					     T41 = T3Z - T40;
+					     T3G = T3v + T3y;
+					     T3z = T3v - T3y;
+					     T4s = T4h - T4i;
+					     T4j = T4h + T4i;
+					     T47 = FNMS(KP500000000, T46, T45);
+					     T3V = T3F + T3G;
+					     T3H = FNMS(KP500000000, T3G, T3F);
+					     T4o = rio[WS(vs, 4) + WS(rs, 1)];
+					     T4k = rio[WS(vs, 4) + WS(rs, 2)];
+					     T4l = rio[WS(vs, 4) + WS(rs, 5)];
+					}
+				   }
+			      }
+			      {
+				   E T4H, T53, T4z, T4I, T4A, T4B;
+				   {
+					E T4x, T4u, T4p, T4t, T4m, T4y, T4v;
+					T4x = iio[WS(vs, 4) + WS(rs, 2)];
+					T4u = T4n - T4o;
+					T4p = T4n + T4o;
+					T4t = T4k - T4l;
+					T4m = T4k + T4l;
+					T4y = iio[WS(vs, 4) + WS(rs, 5)];
+					T4H = iio[WS(vs, 4)];
+					T4v = T4t + T4u;
+					T4M = T4u - T4t;
+					T4q = T4m + T4p;
+					T5c = T4p - T4m;
+					T53 = T4x + T4y;
+					T4z = T4x - T4y;
+					T4w = FNMS(KP500000000, T4v, T4s);
+					T4W = T4s + T4v;
+					T52 = FNMS(KP500000000, T4q, T4j);
+					T4I = iio[WS(vs, 4) + WS(rs, 3)];
+					T4A = iio[WS(vs, 4) + WS(rs, 4)];
+					T4B = iio[WS(vs, 4) + WS(rs, 1)];
+				   }
+				   {
+					E T5L, T67, T5D, T5M, T5E, T5F;
+					{
+					     E T5B, T4J, T54, T4C, T5C, T4K;
+					     T5B = iio[WS(vs, 5) + WS(rs, 2)];
+					     T59 = T4H + T4I;
+					     T4J = T4H - T4I;
+					     T54 = T4A + T4B;
+					     T4C = T4A - T4B;
+					     T5C = iio[WS(vs, 5) + WS(rs, 5)];
+					     T5L = iio[WS(vs, 5)];
+					     T5a = T53 + T54;
+					     T55 = T53 - T54;
+					     T4K = T4z + T4C;
+					     T4D = T4z - T4C;
+					     T67 = T5B + T5C;
+					     T5D = T5B - T5C;
+					     T5b = FNMS(KP500000000, T5a, T59);
+					     T4Z = T4J + T4K;
+					     T4L = FNMS(KP500000000, T4K, T4J);
+					     T5M = iio[WS(vs, 5) + WS(rs, 3)];
+					     T5E = iio[WS(vs, 5) + WS(rs, 4)];
+					     T5F = iio[WS(vs, 5) + WS(rs, 1)];
+					}
+					{
+					     E T5l, T5N, T68, T5G, T5m, T5O;
+					     T5l = rio[WS(vs, 5)];
+					     T6d = T5L + T5M;
+					     T5N = T5L - T5M;
+					     T68 = T5E + T5F;
+					     T5G = T5E - T5F;
+					     T5m = rio[WS(vs, 5) + WS(rs, 3)];
+					     T5r = rio[WS(vs, 5) + WS(rs, 4)];
+					     T6e = T67 + T68;
+					     T69 = T67 - T68;
+					     T5O = T5D + T5G;
+					     T5H = T5D - T5G;
+					     T5w = T5l - T5m;
+					     T5n = T5l + T5m;
+					     T6f = FNMS(KP500000000, T6e, T6d);
+					     T63 = T5N + T5O;
+					     T5P = FNMS(KP500000000, T5O, T5N);
+					     T5s = rio[WS(vs, 5) + WS(rs, 1)];
+					     T5o = rio[WS(vs, 5) + WS(rs, 2)];
+					     T5p = rio[WS(vs, 5) + WS(rs, 5)];
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6a, T6h, T5I, T5R, T65, T6c;
+			 {
+			      E T5Q, T5u, T6g, T5A, T60, T66;
+			      {
+				   E T5y, T5t, T5x, T5q, T5z;
+				   rio[0] = T3 + Ta;
+				   T5y = T5r - T5s;
+				   T5t = T5r + T5s;
+				   T5x = T5o - T5p;
+				   T5q = T5o + T5p;
+				   iio[0] = TT + TU;
+				   rio[WS(rs, 1)] = T17 + T1e;
+				   T5z = T5x + T5y;
+				   T5Q = T5y - T5x;
+				   T5u = T5q + T5t;
+				   T6g = T5t - T5q;
+				   T5A = FNMS(KP500000000, T5z, T5w);
+				   T60 = T5w + T5z;
+				   iio[WS(rs, 1)] = T1X + T1Y;
+				   T66 = FNMS(KP500000000, T5u, T5n);
+				   rio[WS(rs, 2)] = T2b + T2i;
+			      }
+			      iio[WS(rs, 2)] = T31 + T32;
+			      iio[WS(rs, 4)] = T59 + T5a;
+			      rio[WS(rs, 4)] = T4j + T4q;
+			      rio[WS(rs, 3)] = T3f + T3m;
+			      iio[WS(rs, 3)] = T45 + T46;
+			      {
+				   E TA, TD, TQ, T10, T13, TX, TZ, T12;
+				   rio[WS(rs, 5)] = T5n + T5u;
+				   iio[WS(rs, 5)] = T6d + T6e;
+				   {
+					E To, Tx, Tb, Tq;
+					TA = FNMS(KP866025403, Tn, Tg);
+					To = FMA(KP866025403, Tn, Tg);
+					Tx = FMA(KP866025403, Tw, Tv);
+					TD = FNMS(KP866025403, Tw, Tv);
+					Tb = W[0];
+					Tq = W[1];
+					{
+					     E TI, TK, TH, Ty, Tp, TF;
+					     Ty = Tb * Tx;
+					     Tp = Tb * To;
+					     TF = W[4];
+					     TI = W[5];
+					     iio[WS(vs, 1)] = FNMS(Tq, To, Ty);
+					     rio[WS(vs, 1)] = FMA(Tq, Tx, Tp);
+					     TK = TF * TJ;
+					     TH = TF * TG;
+					     TQ = FNMS(KP866025403, TP, TM);
+					     T10 = FMA(KP866025403, TP, TM);
+					     T13 = FMA(KP866025403, TW, TV);
+					     TX = FNMS(KP866025403, TW, TV);
+					     iio[WS(vs, 3)] = FNMS(TI, TG, TK);
+					     rio[WS(vs, 3)] = FMA(TI, TJ, TH);
+					     TZ = W[6];
+					     T12 = W[7];
+					}
+				   }
+				   {
+					E TC, TE, TB, TL, TS;
+					{
+					     E T62, T64, T61, T14, T11, T5Z;
+					     T14 = TZ * T13;
+					     T11 = TZ * T10;
+					     T5Z = W[4];
+					     T62 = W[5];
+					     iio[WS(vs, 4)] = FNMS(T12, T10, T14);
+					     rio[WS(vs, 4)] = FMA(T12, T13, T11);
+					     T64 = T5Z * T63;
+					     T61 = T5Z * T60;
+					     {
+						  E T6k, T6n, T6j, T6m, T6o, T6l, Tz;
+						  T6a = FNMS(KP866025403, T69, T66);
+						  T6k = FMA(KP866025403, T69, T66);
+						  T6n = FMA(KP866025403, T6g, T6f);
+						  T6h = FNMS(KP866025403, T6g, T6f);
+						  iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T62, T60, T64);
+						  rio[WS(vs, 3) + WS(rs, 5)] = FMA(T62, T63, T61);
+						  T6j = W[6];
+						  T6m = W[7];
+						  T6o = T6j * T6n;
+						  T6l = T6j * T6k;
+						  Tz = W[8];
+						  TC = W[9];
+						  iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T6m, T6k, T6o);
+						  rio[WS(vs, 4) + WS(rs, 5)] = FMA(T6m, T6n, T6l);
+						  TE = Tz * TD;
+						  TB = Tz * TA;
+					     }
+					}
+					iio[WS(vs, 5)] = FNMS(TC, TA, TE);
+					rio[WS(vs, 5)] = FMA(TC, TD, TB);
+					TL = W[2];
+					TS = W[3];
+					{
+					     E T5U, T5X, T5W, T5Y, T5V, TY, TR, T5T;
+					     T5I = FMA(KP866025403, T5H, T5A);
+					     T5U = FNMS(KP866025403, T5H, T5A);
+					     T5X = FNMS(KP866025403, T5Q, T5P);
+					     T5R = FMA(KP866025403, T5Q, T5P);
+					     TY = TL * TX;
+					     TR = TL * TQ;
+					     T5T = W[8];
+					     T5W = W[9];
+					     iio[WS(vs, 2)] = FNMS(TS, TQ, TY);
+					     rio[WS(vs, 2)] = FMA(TS, TX, TR);
+					     T5Y = T5T * T5X;
+					     T5V = T5T * T5U;
+					     iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T5W, T5U, T5Y);
+					     rio[WS(vs, 5) + WS(rs, 5)] = FMA(T5W, T5X, T5V);
+					     T65 = W[2];
+					     T6c = W[3];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5g, T5j, T5f, T5i;
+			      {
+				   E T1E, T1H, T3M, T3P, T56, T5d, T58, T5e, T57;
+				   {
+					E T1s, T1B, T1f, T1u;
+					{
+					     E T5K, T5S, T5J, T6i, T6b, T5v;
+					     T6i = T65 * T6h;
+					     T6b = T65 * T6a;
+					     T5v = W[0];
+					     T5K = W[1];
+					     iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T6c, T6a, T6i);
+					     rio[WS(vs, 2) + WS(rs, 5)] = FMA(T6c, T6h, T6b);
+					     T5S = T5v * T5R;
+					     T5J = T5v * T5I;
+					     T1E = FNMS(KP866025403, T1r, T1k);
+					     T1s = FMA(KP866025403, T1r, T1k);
+					     T1B = FMA(KP866025403, T1A, T1z);
+					     T1H = FNMS(KP866025403, T1A, T1z);
+					     iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T5K, T5I, T5S);
+					     rio[WS(vs, 1) + WS(rs, 5)] = FMA(T5K, T5R, T5J);
+					     T1f = W[0];
+					     T1u = W[1];
+					}
+					{
+					     E T3U, T3W, T3T, T1C, T1t, T3R;
+					     T1C = T1f * T1B;
+					     T1t = T1f * T1s;
+					     T3R = W[4];
+					     T3U = W[5];
+					     iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1u, T1s, T1C);
+					     rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1u, T1B, T1t);
+					     T3W = T3R * T3V;
+					     T3T = T3R * T3S;
+					     {
+						  E T3A, T3J, T3n, T3C, T3K, T3B, T51;
+						  T3M = FNMS(KP866025403, T3z, T3s);
+						  T3A = FMA(KP866025403, T3z, T3s);
+						  T3J = FMA(KP866025403, T3I, T3H);
+						  T3P = FNMS(KP866025403, T3I, T3H);
+						  iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3U, T3S, T3W);
+						  rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3U, T3V, T3T);
+						  T3n = W[0];
+						  T3C = W[1];
+						  T5g = FMA(KP866025403, T55, T52);
+						  T56 = FNMS(KP866025403, T55, T52);
+						  T5d = FNMS(KP866025403, T5c, T5b);
+						  T5j = FMA(KP866025403, T5c, T5b);
+						  T3K = T3n * T3J;
+						  T3B = T3n * T3A;
+						  T51 = W[2];
+						  T58 = W[3];
+						  iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T3C, T3A, T3K);
+						  rio[WS(vs, 1) + WS(rs, 3)] = FMA(T3C, T3J, T3B);
+						  T5e = T51 * T5d;
+						  T57 = T51 * T56;
+					     }
+					}
+				   }
+				   {
+					E T38, T3b, T3O, T3Q, T3N, T37, T3a;
+					{
+					     E T2Y, T35, T2T, T30, T36, T2Z, T3L;
+					     T38 = FMA(KP866025403, T2X, T2U);
+					     T2Y = FNMS(KP866025403, T2X, T2U);
+					     T35 = FNMS(KP866025403, T34, T33);
+					     T3b = FMA(KP866025403, T34, T33);
+					     iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T58, T56, T5e);
+					     rio[WS(vs, 2) + WS(rs, 4)] = FMA(T58, T5d, T57);
+					     T2T = W[2];
+					     T30 = W[3];
+					     T36 = T2T * T35;
+					     T2Z = T2T * T2Y;
+					     T3L = W[8];
+					     T3O = W[9];
+					     iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T30, T2Y, T36);
+					     rio[WS(vs, 2) + WS(rs, 2)] = FMA(T30, T35, T2Z);
+					     T3Q = T3L * T3P;
+					     T3N = T3L * T3M;
+					}
+					iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3O, T3M, T3Q);
+					rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3O, T3P, T3N);
+					T37 = W[6];
+					T3a = W[7];
+					{
+					     E T1G, T1I, T1F, T3c, T39, T1D;
+					     T3c = T37 * T3b;
+					     T39 = T37 * T38;
+					     T1D = W[8];
+					     T1G = W[9];
+					     iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T3a, T38, T3c);
+					     rio[WS(vs, 4) + WS(rs, 2)] = FMA(T3a, T3b, T39);
+					     T1I = T1D * T1H;
+					     T1F = T1D * T1E;
+					     iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1G, T1E, T1I);
+					     rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1G, T1H, T1F);
+					     T5f = W[6];
+					     T5i = W[7];
+					}
+				   }
+			      }
+			      {
+				   E T4Q, T4T, T2I, T2w, T2F, T2L, T2y, T2G, T2x, T4V, T4Y;
+				   {
+					E T1M, T1O, T1L, T5k, T5h, T1J;
+					T5k = T5f * T5j;
+					T5h = T5f * T5g;
+					T1J = W[4];
+					T1M = W[5];
+					iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T5i, T5g, T5k);
+					rio[WS(vs, 4) + WS(rs, 4)] = FMA(T5i, T5j, T5h);
+					T1O = T1J * T1N;
+					T1L = T1J * T1K;
+					iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
+					rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
+					T4V = W[4];
+					T4Y = W[5];
+				   }
+				   {
+					E T4E, T4N, T4G, T4O, T4F, T50, T4X, T4r;
+					T4Q = FNMS(KP866025403, T4D, T4w);
+					T4E = FMA(KP866025403, T4D, T4w);
+					T4N = FMA(KP866025403, T4M, T4L);
+					T4T = FNMS(KP866025403, T4M, T4L);
+					T50 = T4V * T4Z;
+					T4X = T4V * T4W;
+					T4r = W[0];
+					T4G = W[1];
+					iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4Y, T4W, T50);
+					rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4Y, T4Z, T4X);
+					T4O = T4r * T4N;
+					T4F = T4r * T4E;
+					{
+					     E T2N, T2Q, T2S, T2P, T2j;
+					     iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T4G, T4E, T4O);
+					     rio[WS(vs, 1) + WS(rs, 4)] = FMA(T4G, T4N, T4F);
+					     T2N = W[4];
+					     T2Q = W[5];
+					     T2I = FNMS(KP866025403, T2v, T2o);
+					     T2w = FMA(KP866025403, T2v, T2o);
+					     T2F = FMA(KP866025403, T2E, T2D);
+					     T2L = FNMS(KP866025403, T2E, T2D);
+					     T2S = T2N * T2R;
+					     T2P = T2N * T2O;
+					     T2j = W[0];
+					     T2y = W[1];
+					     iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2Q, T2O, T2S);
+					     rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2Q, T2R, T2P);
+					     T2G = T2j * T2F;
+					     T2x = T2j * T2w;
+					}
+				   }
+				   {
+					E T1U, T21, T2H, T2K;
+					{
+					     E T24, T27, T23, T26;
+					     T1U = FNMS(KP866025403, T1T, T1Q);
+					     T24 = FMA(KP866025403, T1T, T1Q);
+					     T27 = FMA(KP866025403, T20, T1Z);
+					     T21 = FNMS(KP866025403, T20, T1Z);
+					     iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2y, T2w, T2G);
+					     rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2y, T2F, T2x);
+					     T23 = W[6];
+					     T26 = W[7];
+					     {
+						  E T42, T49, T44, T4a, T43, T28, T25, T3X;
+						  T4c = FMA(KP866025403, T41, T3Y);
+						  T42 = FNMS(KP866025403, T41, T3Y);
+						  T49 = FNMS(KP866025403, T48, T47);
+						  T4f = FMA(KP866025403, T48, T47);
+						  T28 = T23 * T27;
+						  T25 = T23 * T24;
+						  T3X = W[2];
+						  T44 = W[3];
+						  iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T26, T24, T28);
+						  rio[WS(vs, 4) + WS(rs, 1)] = FMA(T26, T27, T25);
+						  T4a = T3X * T49;
+						  T43 = T3X * T42;
+						  iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T44, T42, T4a);
+						  rio[WS(vs, 2) + WS(rs, 3)] = FMA(T44, T49, T43);
+						  T2H = W[8];
+						  T2K = W[9];
+					     }
+					}
+					{
+					     E T4S, T4U, T4R, T2M, T2J, T4P;
+					     T2M = T2H * T2L;
+					     T2J = T2H * T2I;
+					     T4P = W[8];
+					     T4S = W[9];
+					     iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2K, T2I, T2M);
+					     rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2K, T2L, T2J);
+					     T4U = T4P * T4T;
+					     T4R = T4P * T4Q;
+					     {
+						  E T1P, T1W, T22, T1V, T4b;
+						  iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4S, T4Q, T4U);
+						  rio[WS(vs, 5) + WS(rs, 4)] = FMA(T4S, T4T, T4R);
+						  T1P = W[2];
+						  T1W = W[3];
+						  T22 = T1P * T21;
+						  T1V = T1P * T1U;
+						  T4b = W[6];
+						  T4e = W[7];
+						  iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1W, T1U, T22);
+						  rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1W, T21, T1V);
+						  T4g = T4b * T4f;
+						  T4d = T4b * T4c;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T4e, T4c, T4g);
+	       rio[WS(vs, 4) + WS(rs, 3)] = FMA(T4e, T4f, T4d);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {144, 60, 132, 0}, 0, 0, 0 };
+
+void X(codelet_q1_6) (planner *p) {
+     X(kdft_difsq_register) (p, q1_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include q.h */
+
+/*
+ * This function contains 276 FP additions, 168 FP multiplications,
+ * (or, 192 additions, 84 multiplications, 84 fused multiply/add),
+ * 85 stack variables, 2 constants, and 144 memory accesses
+ */
+#include "q.h"
+
+static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T3, Tc, Tt, TM, TX, T16, T1n, T1G, T2h, T2A, T1R, T20, T2L, T2U, T3b;
+	       E T3u, T3F, T3O, T45, T4o, T4Z, T5i, T4z, T4I, Ta, TP, Tf, Tq, Tn, TN;
+	       E Tu, TJ, T14, T1J, T19, T1k, T1h, T1H, T1o, T1D, T2b, T2B, T2i, T2x, T1Y;
+	       E T2D, T23, T2e, T2S, T3x, T2X, T38, T35, T3v, T3c, T3r, T3M, T4r, T3R, T42;
+	       E T3Z, T4p, T46, T4l, T4T, T5j, T50, T5f, T4G, T5l, T4L, T4W;
+	       {
+		    E T1, T2, T1l, T1m;
+		    T1 = rio[0];
+		    T2 = rio[WS(rs, 3)];
+		    T3 = T1 + T2;
+		    Tc = T1 - T2;
+		    {
+			 E Tr, Ts, TV, TW;
+			 Tr = iio[0];
+			 Ts = iio[WS(rs, 3)];
+			 Tt = Tr - Ts;
+			 TM = Tr + Ts;
+			 TV = rio[WS(vs, 1)];
+			 TW = rio[WS(vs, 1) + WS(rs, 3)];
+			 TX = TV + TW;
+			 T16 = TV - TW;
+		    }
+		    T1l = iio[WS(vs, 1)];
+		    T1m = iio[WS(vs, 1) + WS(rs, 3)];
+		    T1n = T1l - T1m;
+		    T1G = T1l + T1m;
+		    {
+			 E T2f, T2g, T1P, T1Q;
+			 T2f = iio[WS(vs, 2)];
+			 T2g = iio[WS(vs, 2) + WS(rs, 3)];
+			 T2h = T2f - T2g;
+			 T2A = T2f + T2g;
+			 T1P = rio[WS(vs, 2)];
+			 T1Q = rio[WS(vs, 2) + WS(rs, 3)];
+			 T1R = T1P + T1Q;
+			 T20 = T1P - T1Q;
+		    }
+	       }
+	       {
+		    E T2J, T2K, T43, T44;
+		    T2J = rio[WS(vs, 3)];
+		    T2K = rio[WS(vs, 3) + WS(rs, 3)];
+		    T2L = T2J + T2K;
+		    T2U = T2J - T2K;
+		    {
+			 E T39, T3a, T3D, T3E;
+			 T39 = iio[WS(vs, 3)];
+			 T3a = iio[WS(vs, 3) + WS(rs, 3)];
+			 T3b = T39 - T3a;
+			 T3u = T39 + T3a;
+			 T3D = rio[WS(vs, 4)];
+			 T3E = rio[WS(vs, 4) + WS(rs, 3)];
+			 T3F = T3D + T3E;
+			 T3O = T3D - T3E;
+		    }
+		    T43 = iio[WS(vs, 4)];
+		    T44 = iio[WS(vs, 4) + WS(rs, 3)];
+		    T45 = T43 - T44;
+		    T4o = T43 + T44;
+		    {
+			 E T4X, T4Y, T4x, T4y;
+			 T4X = iio[WS(vs, 5)];
+			 T4Y = iio[WS(vs, 5) + WS(rs, 3)];
+			 T4Z = T4X - T4Y;
+			 T5i = T4X + T4Y;
+			 T4x = rio[WS(vs, 5)];
+			 T4y = rio[WS(vs, 5) + WS(rs, 3)];
+			 T4z = T4x + T4y;
+			 T4I = T4x - T4y;
+		    }
+	       }
+	       {
+		    E T6, Td, T9, Te;
+		    {
+			 E T4, T5, T7, T8;
+			 T4 = rio[WS(rs, 2)];
+			 T5 = rio[WS(rs, 5)];
+			 T6 = T4 + T5;
+			 Td = T4 - T5;
+			 T7 = rio[WS(rs, 4)];
+			 T8 = rio[WS(rs, 1)];
+			 T9 = T7 + T8;
+			 Te = T7 - T8;
+		    }
+		    Ta = T6 + T9;
+		    TP = KP866025403 * (T9 - T6);
+		    Tf = Td + Te;
+		    Tq = KP866025403 * (Te - Td);
+	       }
+	       {
+		    E Tj, TH, Tm, TI;
+		    {
+			 E Th, Ti, Tk, Tl;
+			 Th = iio[WS(rs, 2)];
+			 Ti = iio[WS(rs, 5)];
+			 Tj = Th - Ti;
+			 TH = Th + Ti;
+			 Tk = iio[WS(rs, 4)];
+			 Tl = iio[WS(rs, 1)];
+			 Tm = Tk - Tl;
+			 TI = Tk + Tl;
+		    }
+		    Tn = KP866025403 * (Tj - Tm);
+		    TN = TH + TI;
+		    Tu = Tj + Tm;
+		    TJ = KP866025403 * (TH - TI);
+	       }
+	       {
+		    E T10, T17, T13, T18;
+		    {
+			 E TY, TZ, T11, T12;
+			 TY = rio[WS(vs, 1) + WS(rs, 2)];
+			 TZ = rio[WS(vs, 1) + WS(rs, 5)];
+			 T10 = TY + TZ;
+			 T17 = TY - TZ;
+			 T11 = rio[WS(vs, 1) + WS(rs, 4)];
+			 T12 = rio[WS(vs, 1) + WS(rs, 1)];
+			 T13 = T11 + T12;
+			 T18 = T11 - T12;
+		    }
+		    T14 = T10 + T13;
+		    T1J = KP866025403 * (T13 - T10);
+		    T19 = T17 + T18;
+		    T1k = KP866025403 * (T18 - T17);
+	       }
+	       {
+		    E T1d, T1B, T1g, T1C;
+		    {
+			 E T1b, T1c, T1e, T1f;
+			 T1b = iio[WS(vs, 1) + WS(rs, 2)];
+			 T1c = iio[WS(vs, 1) + WS(rs, 5)];
+			 T1d = T1b - T1c;
+			 T1B = T1b + T1c;
+			 T1e = iio[WS(vs, 1) + WS(rs, 4)];
+			 T1f = iio[WS(vs, 1) + WS(rs, 1)];
+			 T1g = T1e - T1f;
+			 T1C = T1e + T1f;
+		    }
+		    T1h = KP866025403 * (T1d - T1g);
+		    T1H = T1B + T1C;
+		    T1o = T1d + T1g;
+		    T1D = KP866025403 * (T1B - T1C);
+	       }
+	       {
+		    E T27, T2v, T2a, T2w;
+		    {
+			 E T25, T26, T28, T29;
+			 T25 = iio[WS(vs, 2) + WS(rs, 2)];
+			 T26 = iio[WS(vs, 2) + WS(rs, 5)];
+			 T27 = T25 - T26;
+			 T2v = T25 + T26;
+			 T28 = iio[WS(vs, 2) + WS(rs, 4)];
+			 T29 = iio[WS(vs, 2) + WS(rs, 1)];
+			 T2a = T28 - T29;
+			 T2w = T28 + T29;
+		    }
+		    T2b = KP866025403 * (T27 - T2a);
+		    T2B = T2v + T2w;
+		    T2i = T27 + T2a;
+		    T2x = KP866025403 * (T2v - T2w);
+	       }
+	       {
+		    E T1U, T21, T1X, T22;
+		    {
+			 E T1S, T1T, T1V, T1W;
+			 T1S = rio[WS(vs, 2) + WS(rs, 2)];
+			 T1T = rio[WS(vs, 2) + WS(rs, 5)];
+			 T1U = T1S + T1T;
+			 T21 = T1S - T1T;
+			 T1V = rio[WS(vs, 2) + WS(rs, 4)];
+			 T1W = rio[WS(vs, 2) + WS(rs, 1)];
+			 T1X = T1V + T1W;
+			 T22 = T1V - T1W;
+		    }
+		    T1Y = T1U + T1X;
+		    T2D = KP866025403 * (T1X - T1U);
+		    T23 = T21 + T22;
+		    T2e = KP866025403 * (T22 - T21);
+	       }
+	       {
+		    E T2O, T2V, T2R, T2W;
+		    {
+			 E T2M, T2N, T2P, T2Q;
+			 T2M = rio[WS(vs, 3) + WS(rs, 2)];
+			 T2N = rio[WS(vs, 3) + WS(rs, 5)];
+			 T2O = T2M + T2N;
+			 T2V = T2M - T2N;
+			 T2P = rio[WS(vs, 3) + WS(rs, 4)];
+			 T2Q = rio[WS(vs, 3) + WS(rs, 1)];
+			 T2R = T2P + T2Q;
+			 T2W = T2P - T2Q;
+		    }
+		    T2S = T2O + T2R;
+		    T3x = KP866025403 * (T2R - T2O);
+		    T2X = T2V + T2W;
+		    T38 = KP866025403 * (T2W - T2V);
+	       }
+	       {
+		    E T31, T3p, T34, T3q;
+		    {
+			 E T2Z, T30, T32, T33;
+			 T2Z = iio[WS(vs, 3) + WS(rs, 2)];
+			 T30 = iio[WS(vs, 3) + WS(rs, 5)];
+			 T31 = T2Z - T30;
+			 T3p = T2Z + T30;
+			 T32 = iio[WS(vs, 3) + WS(rs, 4)];
+			 T33 = iio[WS(vs, 3) + WS(rs, 1)];
+			 T34 = T32 - T33;
+			 T3q = T32 + T33;
+		    }
+		    T35 = KP866025403 * (T31 - T34);
+		    T3v = T3p + T3q;
+		    T3c = T31 + T34;
+		    T3r = KP866025403 * (T3p - T3q);
+	       }
+	       {
+		    E T3I, T3P, T3L, T3Q;
+		    {
+			 E T3G, T3H, T3J, T3K;
+			 T3G = rio[WS(vs, 4) + WS(rs, 2)];
+			 T3H = rio[WS(vs, 4) + WS(rs, 5)];
+			 T3I = T3G + T3H;
+			 T3P = T3G - T3H;
+			 T3J = rio[WS(vs, 4) + WS(rs, 4)];
+			 T3K = rio[WS(vs, 4) + WS(rs, 1)];
+			 T3L = T3J + T3K;
+			 T3Q = T3J - T3K;
+		    }
+		    T3M = T3I + T3L;
+		    T4r = KP866025403 * (T3L - T3I);
+		    T3R = T3P + T3Q;
+		    T42 = KP866025403 * (T3Q - T3P);
+	       }
+	       {
+		    E T3V, T4j, T3Y, T4k;
+		    {
+			 E T3T, T3U, T3W, T3X;
+			 T3T = iio[WS(vs, 4) + WS(rs, 2)];
+			 T3U = iio[WS(vs, 4) + WS(rs, 5)];
+			 T3V = T3T - T3U;
+			 T4j = T3T + T3U;
+			 T3W = iio[WS(vs, 4) + WS(rs, 4)];
+			 T3X = iio[WS(vs, 4) + WS(rs, 1)];
+			 T3Y = T3W - T3X;
+			 T4k = T3W + T3X;
+		    }
+		    T3Z = KP866025403 * (T3V - T3Y);
+		    T4p = T4j + T4k;
+		    T46 = T3V + T3Y;
+		    T4l = KP866025403 * (T4j - T4k);
+	       }
+	       {
+		    E T4P, T5d, T4S, T5e;
+		    {
+			 E T4N, T4O, T4Q, T4R;
+			 T4N = iio[WS(vs, 5) + WS(rs, 2)];
+			 T4O = iio[WS(vs, 5) + WS(rs, 5)];
+			 T4P = T4N - T4O;
+			 T5d = T4N + T4O;
+			 T4Q = iio[WS(vs, 5) + WS(rs, 4)];
+			 T4R = iio[WS(vs, 5) + WS(rs, 1)];
+			 T4S = T4Q - T4R;
+			 T5e = T4Q + T4R;
+		    }
+		    T4T = KP866025403 * (T4P - T4S);
+		    T5j = T5d + T5e;
+		    T50 = T4P + T4S;
+		    T5f = KP866025403 * (T5d - T5e);
+	       }
+	       {
+		    E T4C, T4J, T4F, T4K;
+		    {
+			 E T4A, T4B, T4D, T4E;
+			 T4A = rio[WS(vs, 5) + WS(rs, 2)];
+			 T4B = rio[WS(vs, 5) + WS(rs, 5)];
+			 T4C = T4A + T4B;
+			 T4J = T4A - T4B;
+			 T4D = rio[WS(vs, 5) + WS(rs, 4)];
+			 T4E = rio[WS(vs, 5) + WS(rs, 1)];
+			 T4F = T4D + T4E;
+			 T4K = T4D - T4E;
+		    }
+		    T4G = T4C + T4F;
+		    T5l = KP866025403 * (T4F - T4C);
+		    T4L = T4J + T4K;
+		    T4W = KP866025403 * (T4K - T4J);
+	       }
+	       rio[0] = T3 + Ta;
+	       iio[0] = TM + TN;
+	       rio[WS(rs, 1)] = TX + T14;
+	       iio[WS(rs, 1)] = T1G + T1H;
+	       rio[WS(rs, 3)] = T2L + T2S;
+	       rio[WS(rs, 2)] = T1R + T1Y;
+	       iio[WS(rs, 2)] = T2A + T2B;
+	       iio[WS(rs, 3)] = T3u + T3v;
+	       iio[WS(rs, 4)] = T4o + T4p;
+	       iio[WS(rs, 5)] = T5i + T5j;
+	       rio[WS(rs, 5)] = T4z + T4G;
+	       rio[WS(rs, 4)] = T3F + T3M;
+	       {
+		    E T1w, T1y, T1v, T1x;
+		    T1w = T16 + T19;
+		    T1y = T1n + T1o;
+		    T1v = W[4];
+		    T1x = W[5];
+		    rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
+		    iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
+	       }
+	       {
+		    E T58, T5a, T57, T59;
+		    T58 = T4I + T4L;
+		    T5a = T4Z + T50;
+		    T57 = W[4];
+		    T59 = W[5];
+		    rio[WS(vs, 3) + WS(rs, 5)] = FMA(T57, T58, T59 * T5a);
+		    iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T59, T58, T57 * T5a);
+	       }
+	       {
+		    E TC, TE, TB, TD;
+		    TC = Tc + Tf;
+		    TE = Tt + Tu;
+		    TB = W[4];
+		    TD = W[5];
+		    rio[WS(vs, 3)] = FMA(TB, TC, TD * TE);
+		    iio[WS(vs, 3)] = FNMS(TD, TC, TB * TE);
+	       }
+	       {
+		    E T4e, T4g, T4d, T4f;
+		    T4e = T3O + T3R;
+		    T4g = T45 + T46;
+		    T4d = W[4];
+		    T4f = W[5];
+		    rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4d, T4e, T4f * T4g);
+		    iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4f, T4e, T4d * T4g);
+	       }
+	       {
+		    E T3k, T3m, T3j, T3l;
+		    T3k = T2U + T2X;
+		    T3m = T3b + T3c;
+		    T3j = W[4];
+		    T3l = W[5];
+		    rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3j, T3k, T3l * T3m);
+		    iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3l, T3k, T3j * T3m);
+	       }
+	       {
+		    E T2q, T2s, T2p, T2r;
+		    T2q = T20 + T23;
+		    T2s = T2h + T2i;
+		    T2p = W[4];
+		    T2r = W[5];
+		    rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2p, T2q, T2r * T2s);
+		    iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2r, T2q, T2p * T2s);
+	       }
+	       {
+		    E T5g, T5o, T5m, T5q, T5c, T5k;
+		    T5c = FNMS(KP500000000, T4G, T4z);
+		    T5g = T5c - T5f;
+		    T5o = T5c + T5f;
+		    T5k = FNMS(KP500000000, T5j, T5i);
+		    T5m = T5k - T5l;
+		    T5q = T5l + T5k;
+		    {
+			 E T5b, T5h, T5n, T5p;
+			 T5b = W[2];
+			 T5h = W[3];
+			 rio[WS(vs, 2) + WS(rs, 5)] = FMA(T5b, T5g, T5h * T5m);
+			 iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T5h, T5g, T5b * T5m);
+			 T5n = W[6];
+			 T5p = W[7];
+			 rio[WS(vs, 4) + WS(rs, 5)] = FMA(T5n, T5o, T5p * T5q);
+			 iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T5p, T5o, T5n * T5q);
+		    }
+	       }
+	       {
+		    E To, Ty, Tw, TA, Tg, Tv;
+		    Tg = FNMS(KP500000000, Tf, Tc);
+		    To = Tg + Tn;
+		    Ty = Tg - Tn;
+		    Tv = FNMS(KP500000000, Tu, Tt);
+		    Tw = Tq + Tv;
+		    TA = Tv - Tq;
+		    {
+			 E Tb, Tp, Tx, Tz;
+			 Tb = W[0];
+			 Tp = W[1];
+			 rio[WS(vs, 1)] = FMA(Tb, To, Tp * Tw);
+			 iio[WS(vs, 1)] = FNMS(Tp, To, Tb * Tw);
+			 Tx = W[8];
+			 Tz = W[9];
+			 rio[WS(vs, 5)] = FMA(Tx, Ty, Tz * TA);
+			 iio[WS(vs, 5)] = FNMS(Tz, Ty, Tx * TA);
+		    }
+	       }
+	       {
+		    E T36, T3g, T3e, T3i, T2Y, T3d;
+		    T2Y = FNMS(KP500000000, T2X, T2U);
+		    T36 = T2Y + T35;
+		    T3g = T2Y - T35;
+		    T3d = FNMS(KP500000000, T3c, T3b);
+		    T3e = T38 + T3d;
+		    T3i = T3d - T38;
+		    {
+			 E T2T, T37, T3f, T3h;
+			 T2T = W[0];
+			 T37 = W[1];
+			 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2T, T36, T37 * T3e);
+			 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T37, T36, T2T * T3e);
+			 T3f = W[8];
+			 T3h = W[9];
+			 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3f, T3g, T3h * T3i);
+			 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3h, T3g, T3f * T3i);
+		    }
+	       }
+	       {
+		    E T2y, T2G, T2E, T2I, T2u, T2C;
+		    T2u = FNMS(KP500000000, T1Y, T1R);
+		    T2y = T2u - T2x;
+		    T2G = T2u + T2x;
+		    T2C = FNMS(KP500000000, T2B, T2A);
+		    T2E = T2C - T2D;
+		    T2I = T2D + T2C;
+		    {
+			 E T2t, T2z, T2F, T2H;
+			 T2t = W[2];
+			 T2z = W[3];
+			 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2t, T2y, T2z * T2E);
+			 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2z, T2y, T2t * T2E);
+			 T2F = W[6];
+			 T2H = W[7];
+			 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2F, T2G, T2H * T2I);
+			 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2H, T2G, T2F * T2I);
+		    }
+	       }
+	       {
+		    E T3s, T3A, T3y, T3C, T3o, T3w;
+		    T3o = FNMS(KP500000000, T2S, T2L);
+		    T3s = T3o - T3r;
+		    T3A = T3o + T3r;
+		    T3w = FNMS(KP500000000, T3v, T3u);
+		    T3y = T3w - T3x;
+		    T3C = T3x + T3w;
+		    {
+			 E T3n, T3t, T3z, T3B;
+			 T3n = W[2];
+			 T3t = W[3];
+			 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3n, T3s, T3t * T3y);
+			 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3t, T3s, T3n * T3y);
+			 T3z = W[6];
+			 T3B = W[7];
+			 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3z, T3A, T3B * T3C);
+			 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3B, T3A, T3z * T3C);
+		    }
+	       }
+	       {
+		    E T1E, T1M, T1K, T1O, T1A, T1I;
+		    T1A = FNMS(KP500000000, T14, TX);
+		    T1E = T1A - T1D;
+		    T1M = T1A + T1D;
+		    T1I = FNMS(KP500000000, T1H, T1G);
+		    T1K = T1I - T1J;
+		    T1O = T1J + T1I;
+		    {
+			 E T1z, T1F, T1L, T1N;
+			 T1z = W[2];
+			 T1F = W[3];
+			 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1z, T1E, T1F * T1K);
+			 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1F, T1E, T1z * T1K);
+			 T1L = W[6];
+			 T1N = W[7];
+			 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1L, T1M, T1N * T1O);
+			 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1N, T1M, T1L * T1O);
+		    }
+	       }
+	       {
+		    E T4m, T4u, T4s, T4w, T4i, T4q;
+		    T4i = FNMS(KP500000000, T3M, T3F);
+		    T4m = T4i - T4l;
+		    T4u = T4i + T4l;
+		    T4q = FNMS(KP500000000, T4p, T4o);
+		    T4s = T4q - T4r;
+		    T4w = T4r + T4q;
+		    {
+			 E T4h, T4n, T4t, T4v;
+			 T4h = W[2];
+			 T4n = W[3];
+			 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4h, T4m, T4n * T4s);
+			 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4n, T4m, T4h * T4s);
+			 T4t = W[6];
+			 T4v = W[7];
+			 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4t, T4u, T4v * T4w);
+			 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4v, T4u, T4t * T4w);
+		    }
+	       }
+	       {
+		    E TK, TS, TQ, TU, TG, TO;
+		    TG = FNMS(KP500000000, Ta, T3);
+		    TK = TG - TJ;
+		    TS = TG + TJ;
+		    TO = FNMS(KP500000000, TN, TM);
+		    TQ = TO - TP;
+		    TU = TP + TO;
+		    {
+			 E TF, TL, TR, TT;
+			 TF = W[2];
+			 TL = W[3];
+			 rio[WS(vs, 2)] = FMA(TF, TK, TL * TQ);
+			 iio[WS(vs, 2)] = FNMS(TL, TK, TF * TQ);
+			 TR = W[6];
+			 TT = W[7];
+			 rio[WS(vs, 4)] = FMA(TR, TS, TT * TU);
+			 iio[WS(vs, 4)] = FNMS(TT, TS, TR * TU);
+		    }
+	       }
+	       {
+		    E T2c, T2m, T2k, T2o, T24, T2j;
+		    T24 = FNMS(KP500000000, T23, T20);
+		    T2c = T24 + T2b;
+		    T2m = T24 - T2b;
+		    T2j = FNMS(KP500000000, T2i, T2h);
+		    T2k = T2e + T2j;
+		    T2o = T2j - T2e;
+		    {
+			 E T1Z, T2d, T2l, T2n;
+			 T1Z = W[0];
+			 T2d = W[1];
+			 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1Z, T2c, T2d * T2k);
+			 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2d, T2c, T1Z * T2k);
+			 T2l = W[8];
+			 T2n = W[9];
+			 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2l, T2m, T2n * T2o);
+			 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2n, T2m, T2l * T2o);
+		    }
+	       }
+	       {
+		    E T40, T4a, T48, T4c, T3S, T47;
+		    T3S = FNMS(KP500000000, T3R, T3O);
+		    T40 = T3S + T3Z;
+		    T4a = T3S - T3Z;
+		    T47 = FNMS(KP500000000, T46, T45);
+		    T48 = T42 + T47;
+		    T4c = T47 - T42;
+		    {
+			 E T3N, T41, T49, T4b;
+			 T3N = W[0];
+			 T41 = W[1];
+			 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3N, T40, T41 * T48);
+			 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T41, T40, T3N * T48);
+			 T49 = W[8];
+			 T4b = W[9];
+			 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T49, T4a, T4b * T4c);
+			 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4b, T4a, T49 * T4c);
+		    }
+	       }
+	       {
+		    E T1i, T1s, T1q, T1u, T1a, T1p;
+		    T1a = FNMS(KP500000000, T19, T16);
+		    T1i = T1a + T1h;
+		    T1s = T1a - T1h;
+		    T1p = FNMS(KP500000000, T1o, T1n);
+		    T1q = T1k + T1p;
+		    T1u = T1p - T1k;
+		    {
+			 E T15, T1j, T1r, T1t;
+			 T15 = W[0];
+			 T1j = W[1];
+			 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T15, T1i, T1j * T1q);
+			 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1j, T1i, T15 * T1q);
+			 T1r = W[8];
+			 T1t = W[9];
+			 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1r, T1s, T1t * T1u);
+			 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1t, T1s, T1r * T1u);
+		    }
+	       }
+	       {
+		    E T4U, T54, T52, T56, T4M, T51;
+		    T4M = FNMS(KP500000000, T4L, T4I);
+		    T4U = T4M + T4T;
+		    T54 = T4M - T4T;
+		    T51 = FNMS(KP500000000, T50, T4Z);
+		    T52 = T4W + T51;
+		    T56 = T51 - T4W;
+		    {
+			 E T4H, T4V, T53, T55;
+			 T4H = W[0];
+			 T4V = W[1];
+			 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T4H, T4U, T4V * T52);
+			 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T4V, T4U, T4H * T52);
+			 T53 = W[8];
+			 T55 = W[9];
+			 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T53, T54, T55 * T56);
+			 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T55, T54, T53 * T56);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {192, 84, 84, 0}, 0, 0, 0 };
+
+void X(codelet_q1_6) (planner *p) {
+     X(kdft_difsq_register) (p, q1_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/q1_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/q1_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2396 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:17 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 8 -name q1_8 -include q.h */
+
+/*
+ * This function contains 528 FP additions, 288 FP multiplications,
+ * (or, 352 additions, 112 multiplications, 176 fused multiply/add),
+ * 190 stack variables, 1 constants, and 256 memory accesses
+ */
+#include "q.h"
+
+static void q1_8(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T9C, T9N, T9l, T9E, T9D, T9O;
+	       {
+		    E TV, Tk, T1d, T7, T18, T1t, TQ, TD, T5t, T4S, T5L, T4F, T5G, T61, T5o;
+		    E T5b, T6Z, T6o, T7h, T6b, T7c, T7x, T6U, T6H, Tbx, TaW, TbP, TaJ, TbK, Tc5;
+		    E Tbs, Tbf, T2r, T1Q, T2J, T1D, T2E, T2Z, T2m, T29, T3X, T3m, T4f, T39, T4a;
+		    E T4v, T3S, T3F, T8v, T7U, T8N, T7H, T8I, T93, T8q, T8d, Ta1, T9q, Taj, T9d;
+		    E Tae, Taz, T9W, T9J, Te, T19, T1u, T1g, Tv, TR, TG, TW, T5H, T4M, T5O;
+		    E T62, T5p, T53, T5u, T5e, T6i, T7d, T7y, T7k, T6z, T6V, T6K, T70, TbL, TaQ;
+		    E TbS, Tc6, Tbt, Tb7, Tby, Tbi, T1K, T2F, T30, T2M, T21, T2n, T2c, T2s, T4b;
+		    E T3g, T4i, T4w, T3T, T3x, T3Y, T3I, T7O, T8J, T94, T8Q, T85, T8r, T8g, T8w;
+		    E Tak, T9r, T9K, T9A, Taf, T9k, Tal, T9u;
+		    {
+			 E T9a, T9F, T99, Tac, T9p, T9b, T9G, T9H;
+			 {
+			      E TaG, Tbb, TaF, TbI, TaV, TaH, Tbc, Tbd;
+			      {
+				   E T4C, T57, T4B, T5E, T4R, T4D, T58, T59;
+				   {
+					E T4, Tz, T3, T16, Tj, T5, TA, TB;
+					{
+					     E T1, T2, Th, Ti;
+					     T1 = rio[0];
+					     T2 = rio[WS(rs, 4)];
+					     Th = iio[0];
+					     Ti = iio[WS(rs, 4)];
+					     T4 = rio[WS(rs, 2)];
+					     Tz = T1 - T2;
+					     T3 = T1 + T2;
+					     T16 = Th + Ti;
+					     Tj = Th - Ti;
+					     T5 = rio[WS(rs, 6)];
+					     TA = iio[WS(rs, 2)];
+					     TB = iio[WS(rs, 6)];
+					}
+					{
+					     E T4z, T4A, T4P, T4Q;
+					     T4z = rio[WS(vs, 3)];
+					     {
+						  E Tg, T6, T17, TC;
+						  Tg = T4 - T5;
+						  T6 = T4 + T5;
+						  T17 = TA + TB;
+						  TC = TA - TB;
+						  TV = Tj - Tg;
+						  Tk = Tg + Tj;
+						  T1d = T3 - T6;
+						  T7 = T3 + T6;
+						  T18 = T16 - T17;
+						  T1t = T16 + T17;
+						  TQ = Tz + TC;
+						  TD = Tz - TC;
+						  T4A = rio[WS(vs, 3) + WS(rs, 4)];
+					     }
+					     T4P = iio[WS(vs, 3)];
+					     T4Q = iio[WS(vs, 3) + WS(rs, 4)];
+					     T4C = rio[WS(vs, 3) + WS(rs, 2)];
+					     T57 = T4z - T4A;
+					     T4B = T4z + T4A;
+					     T5E = T4P + T4Q;
+					     T4R = T4P - T4Q;
+					     T4D = rio[WS(vs, 3) + WS(rs, 6)];
+					     T58 = iio[WS(vs, 3) + WS(rs, 2)];
+					     T59 = iio[WS(vs, 3) + WS(rs, 6)];
+					}
+				   }
+				   {
+					E T68, T6D, T67, T7a, T6n, T69, T6E, T6F;
+					{
+					     E T65, T66, T6l, T6m;
+					     T65 = rio[WS(vs, 4)];
+					     {
+						  E T4O, T4E, T5F, T5a;
+						  T4O = T4C - T4D;
+						  T4E = T4C + T4D;
+						  T5F = T58 + T59;
+						  T5a = T58 - T59;
+						  T5t = T4R - T4O;
+						  T4S = T4O + T4R;
+						  T5L = T4B - T4E;
+						  T4F = T4B + T4E;
+						  T5G = T5E - T5F;
+						  T61 = T5E + T5F;
+						  T5o = T57 + T5a;
+						  T5b = T57 - T5a;
+						  T66 = rio[WS(vs, 4) + WS(rs, 4)];
+					     }
+					     T6l = iio[WS(vs, 4)];
+					     T6m = iio[WS(vs, 4) + WS(rs, 4)];
+					     T68 = rio[WS(vs, 4) + WS(rs, 2)];
+					     T6D = T65 - T66;
+					     T67 = T65 + T66;
+					     T7a = T6l + T6m;
+					     T6n = T6l - T6m;
+					     T69 = rio[WS(vs, 4) + WS(rs, 6)];
+					     T6E = iio[WS(vs, 4) + WS(rs, 2)];
+					     T6F = iio[WS(vs, 4) + WS(rs, 6)];
+					}
+					{
+					     E TaD, TaE, TaT, TaU;
+					     TaD = rio[WS(vs, 7)];
+					     {
+						  E T6k, T6a, T7b, T6G;
+						  T6k = T68 - T69;
+						  T6a = T68 + T69;
+						  T7b = T6E + T6F;
+						  T6G = T6E - T6F;
+						  T6Z = T6n - T6k;
+						  T6o = T6k + T6n;
+						  T7h = T67 - T6a;
+						  T6b = T67 + T6a;
+						  T7c = T7a - T7b;
+						  T7x = T7a + T7b;
+						  T6U = T6D + T6G;
+						  T6H = T6D - T6G;
+						  TaE = rio[WS(vs, 7) + WS(rs, 4)];
+					     }
+					     TaT = iio[WS(vs, 7)];
+					     TaU = iio[WS(vs, 7) + WS(rs, 4)];
+					     TaG = rio[WS(vs, 7) + WS(rs, 2)];
+					     Tbb = TaD - TaE;
+					     TaF = TaD + TaE;
+					     TbI = TaT + TaU;
+					     TaV = TaT - TaU;
+					     TaH = rio[WS(vs, 7) + WS(rs, 6)];
+					     Tbc = iio[WS(vs, 7) + WS(rs, 2)];
+					     Tbd = iio[WS(vs, 7) + WS(rs, 6)];
+					}
+				   }
+			      }
+			      {
+				   E T36, T3B, T35, T48, T3l, T37, T3C, T3D;
+				   {
+					E T1A, T25, T1z, T2C, T1P, T1B, T26, T27;
+					{
+					     E T1x, T1y, T1N, T1O;
+					     T1x = rio[WS(vs, 1)];
+					     {
+						  E TaS, TaI, TbJ, Tbe;
+						  TaS = TaG - TaH;
+						  TaI = TaG + TaH;
+						  TbJ = Tbc + Tbd;
+						  Tbe = Tbc - Tbd;
+						  Tbx = TaV - TaS;
+						  TaW = TaS + TaV;
+						  TbP = TaF - TaI;
+						  TaJ = TaF + TaI;
+						  TbK = TbI - TbJ;
+						  Tc5 = TbI + TbJ;
+						  Tbs = Tbb + Tbe;
+						  Tbf = Tbb - Tbe;
+						  T1y = rio[WS(vs, 1) + WS(rs, 4)];
+					     }
+					     T1N = iio[WS(vs, 1)];
+					     T1O = iio[WS(vs, 1) + WS(rs, 4)];
+					     T1A = rio[WS(vs, 1) + WS(rs, 2)];
+					     T25 = T1x - T1y;
+					     T1z = T1x + T1y;
+					     T2C = T1N + T1O;
+					     T1P = T1N - T1O;
+					     T1B = rio[WS(vs, 1) + WS(rs, 6)];
+					     T26 = iio[WS(vs, 1) + WS(rs, 2)];
+					     T27 = iio[WS(vs, 1) + WS(rs, 6)];
+					}
+					{
+					     E T33, T34, T3j, T3k;
+					     T33 = rio[WS(vs, 2)];
+					     {
+						  E T1M, T1C, T2D, T28;
+						  T1M = T1A - T1B;
+						  T1C = T1A + T1B;
+						  T2D = T26 + T27;
+						  T28 = T26 - T27;
+						  T2r = T1P - T1M;
+						  T1Q = T1M + T1P;
+						  T2J = T1z - T1C;
+						  T1D = T1z + T1C;
+						  T2E = T2C - T2D;
+						  T2Z = T2C + T2D;
+						  T2m = T25 + T28;
+						  T29 = T25 - T28;
+						  T34 = rio[WS(vs, 2) + WS(rs, 4)];
+					     }
+					     T3j = iio[WS(vs, 2)];
+					     T3k = iio[WS(vs, 2) + WS(rs, 4)];
+					     T36 = rio[WS(vs, 2) + WS(rs, 2)];
+					     T3B = T33 - T34;
+					     T35 = T33 + T34;
+					     T48 = T3j + T3k;
+					     T3l = T3j - T3k;
+					     T37 = rio[WS(vs, 2) + WS(rs, 6)];
+					     T3C = iio[WS(vs, 2) + WS(rs, 2)];
+					     T3D = iio[WS(vs, 2) + WS(rs, 6)];
+					}
+				   }
+				   {
+					E T7E, T89, T7D, T8G, T7T, T7F, T8a, T8b;
+					{
+					     E T7B, T7C, T7R, T7S;
+					     T7B = rio[WS(vs, 5)];
+					     {
+						  E T3i, T38, T49, T3E;
+						  T3i = T36 - T37;
+						  T38 = T36 + T37;
+						  T49 = T3C + T3D;
+						  T3E = T3C - T3D;
+						  T3X = T3l - T3i;
+						  T3m = T3i + T3l;
+						  T4f = T35 - T38;
+						  T39 = T35 + T38;
+						  T4a = T48 - T49;
+						  T4v = T48 + T49;
+						  T3S = T3B + T3E;
+						  T3F = T3B - T3E;
+						  T7C = rio[WS(vs, 5) + WS(rs, 4)];
+					     }
+					     T7R = iio[WS(vs, 5)];
+					     T7S = iio[WS(vs, 5) + WS(rs, 4)];
+					     T7E = rio[WS(vs, 5) + WS(rs, 2)];
+					     T89 = T7B - T7C;
+					     T7D = T7B + T7C;
+					     T8G = T7R + T7S;
+					     T7T = T7R - T7S;
+					     T7F = rio[WS(vs, 5) + WS(rs, 6)];
+					     T8a = iio[WS(vs, 5) + WS(rs, 2)];
+					     T8b = iio[WS(vs, 5) + WS(rs, 6)];
+					}
+					{
+					     E T97, T98, T9n, T9o;
+					     T97 = rio[WS(vs, 6)];
+					     {
+						  E T7Q, T7G, T8H, T8c;
+						  T7Q = T7E - T7F;
+						  T7G = T7E + T7F;
+						  T8H = T8a + T8b;
+						  T8c = T8a - T8b;
+						  T8v = T7T - T7Q;
+						  T7U = T7Q + T7T;
+						  T8N = T7D - T7G;
+						  T7H = T7D + T7G;
+						  T8I = T8G - T8H;
+						  T93 = T8G + T8H;
+						  T8q = T89 + T8c;
+						  T8d = T89 - T8c;
+						  T98 = rio[WS(vs, 6) + WS(rs, 4)];
+					     }
+					     T9n = iio[WS(vs, 6)];
+					     T9o = iio[WS(vs, 6) + WS(rs, 4)];
+					     T9a = rio[WS(vs, 6) + WS(rs, 2)];
+					     T9F = T97 - T98;
+					     T99 = T97 + T98;
+					     Tac = T9n + T9o;
+					     T9p = T9n - T9o;
+					     T9b = rio[WS(vs, 6) + WS(rs, 6)];
+					     T9G = iio[WS(vs, 6) + WS(rs, 2)];
+					     T9H = iio[WS(vs, 6) + WS(rs, 6)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E TbQ, TaX, Tbg, Tb6, TbR, Tb0;
+			      {
+				   E T5M, T4T, T5c, T52, T5N, T4W;
+				   {
+					E Tu, TE, TF, Tp;
+					{
+					     E Tb, Tq, Ta, T1e, Tt, Tc, Tm, Tn;
+					     {
+						  E T8, T9, Tr, Ts;
+						  T8 = rio[WS(rs, 1)];
+						  {
+						       E T9m, T9c, Tad, T9I;
+						       T9m = T9a - T9b;
+						       T9c = T9a + T9b;
+						       Tad = T9G + T9H;
+						       T9I = T9G - T9H;
+						       Ta1 = T9p - T9m;
+						       T9q = T9m + T9p;
+						       Taj = T99 - T9c;
+						       T9d = T99 + T9c;
+						       Tae = Tac - Tad;
+						       Taz = Tac + Tad;
+						       T9W = T9F + T9I;
+						       T9J = T9F - T9I;
+						       T9 = rio[WS(rs, 5)];
+						  }
+						  Tr = iio[WS(rs, 1)];
+						  Ts = iio[WS(rs, 5)];
+						  Tb = rio[WS(rs, 7)];
+						  Tq = T8 - T9;
+						  Ta = T8 + T9;
+						  T1e = Tr + Ts;
+						  Tt = Tr - Ts;
+						  Tc = rio[WS(rs, 3)];
+						  Tm = iio[WS(rs, 7)];
+						  Tn = iio[WS(rs, 3)];
+					     }
+					     {
+						  E Tl, Td, T1f, To;
+						  Tu = Tq + Tt;
+						  TE = Tt - Tq;
+						  Tl = Tb - Tc;
+						  Td = Tb + Tc;
+						  T1f = Tm + Tn;
+						  To = Tm - Tn;
+						  Te = Ta + Td;
+						  T19 = Td - Ta;
+						  T1u = T1e + T1f;
+						  T1g = T1e - T1f;
+						  TF = Tl + To;
+						  Tp = Tl - To;
+					     }
+					}
+					{
+					     E T4I, T4Y, T4U, T51, T4L, T4V;
+					     {
+						  E T4Z, T50, T4G, T4H, T4J, T4K;
+						  T4G = rio[WS(vs, 3) + WS(rs, 1)];
+						  T4H = rio[WS(vs, 3) + WS(rs, 5)];
+						  Tv = Tp - Tu;
+						  TR = Tu + Tp;
+						  TG = TE - TF;
+						  TW = TE + TF;
+						  T4I = T4G + T4H;
+						  T4Y = T4G - T4H;
+						  T4Z = iio[WS(vs, 3) + WS(rs, 1)];
+						  T50 = iio[WS(vs, 3) + WS(rs, 5)];
+						  T4J = rio[WS(vs, 3) + WS(rs, 7)];
+						  T4K = rio[WS(vs, 3) + WS(rs, 3)];
+						  T4U = iio[WS(vs, 3) + WS(rs, 7)];
+						  T51 = T4Z - T50;
+						  T5M = T4Z + T50;
+						  T4L = T4J + T4K;
+						  T4T = T4J - T4K;
+						  T4V = iio[WS(vs, 3) + WS(rs, 3)];
+					     }
+					     T5c = T51 - T4Y;
+					     T52 = T4Y + T51;
+					     T5H = T4L - T4I;
+					     T4M = T4I + T4L;
+					     T5N = T4U + T4V;
+					     T4W = T4U - T4V;
+					}
+				   }
+				   {
+					E T7i, T6p, T6y, T6I, T6s, T7j;
+					{
+					     E T6e, T6u, T6q, T6x, T6h, T6r;
+					     {
+						  E T6v, T6w, T6f, T6g;
+						  {
+						       E T4X, T5d, T6c, T6d;
+						       T6c = rio[WS(vs, 4) + WS(rs, 1)];
+						       T6d = rio[WS(vs, 4) + WS(rs, 5)];
+						       T5O = T5M - T5N;
+						       T62 = T5M + T5N;
+						       T4X = T4T - T4W;
+						       T5d = T4T + T4W;
+						       T6e = T6c + T6d;
+						       T6u = T6c - T6d;
+						       T5p = T52 + T4X;
+						       T53 = T4X - T52;
+						       T5u = T5c + T5d;
+						       T5e = T5c - T5d;
+						       T6v = iio[WS(vs, 4) + WS(rs, 1)];
+						       T6w = iio[WS(vs, 4) + WS(rs, 5)];
+						  }
+						  T6f = rio[WS(vs, 4) + WS(rs, 7)];
+						  T6g = rio[WS(vs, 4) + WS(rs, 3)];
+						  T6q = iio[WS(vs, 4) + WS(rs, 7)];
+						  T7i = T6v + T6w;
+						  T6x = T6v - T6w;
+						  T6p = T6f - T6g;
+						  T6h = T6f + T6g;
+						  T6r = iio[WS(vs, 4) + WS(rs, 3)];
+					     }
+					     T6y = T6u + T6x;
+					     T6I = T6x - T6u;
+					     T6i = T6e + T6h;
+					     T7d = T6h - T6e;
+					     T6s = T6q - T6r;
+					     T7j = T6q + T6r;
+					}
+					{
+					     E Tb2, TaM, TaY, Tb5, TaP, TaZ;
+					     {
+						  E Tb3, Tb4, TaN, TaO;
+						  {
+						       E T6J, T6t, TaK, TaL;
+						       TaK = rio[WS(vs, 7) + WS(rs, 1)];
+						       TaL = rio[WS(vs, 7) + WS(rs, 5)];
+						       T7y = T7i + T7j;
+						       T7k = T7i - T7j;
+						       T6J = T6p + T6s;
+						       T6t = T6p - T6s;
+						       Tb2 = TaK - TaL;
+						       TaM = TaK + TaL;
+						       T6z = T6t - T6y;
+						       T6V = T6y + T6t;
+						       T6K = T6I - T6J;
+						       T70 = T6I + T6J;
+						       Tb3 = iio[WS(vs, 7) + WS(rs, 1)];
+						       Tb4 = iio[WS(vs, 7) + WS(rs, 5)];
+						  }
+						  TaN = rio[WS(vs, 7) + WS(rs, 7)];
+						  TaO = rio[WS(vs, 7) + WS(rs, 3)];
+						  TaY = iio[WS(vs, 7) + WS(rs, 7)];
+						  Tb5 = Tb3 - Tb4;
+						  TbQ = Tb3 + Tb4;
+						  TaP = TaN + TaO;
+						  TaX = TaN - TaO;
+						  TaZ = iio[WS(vs, 7) + WS(rs, 3)];
+					     }
+					     Tbg = Tb5 - Tb2;
+					     Tb6 = Tb2 + Tb5;
+					     TbL = TaP - TaM;
+					     TaQ = TaM + TaP;
+					     TbR = TaY + TaZ;
+					     Tb0 = TaY - TaZ;
+					}
+				   }
+			      }
+			      {
+				   E T4g, T3n, T3G, T3w, T4h, T3q;
+				   {
+					E T2K, T1R, T20, T2a, T1U, T2L;
+					{
+					     E T1G, T1W, T1S, T1Z, T1J, T1T;
+					     {
+						  E T1X, T1Y, T1H, T1I;
+						  {
+						       E Tb1, Tbh, T1E, T1F;
+						       T1E = rio[WS(vs, 1) + WS(rs, 1)];
+						       T1F = rio[WS(vs, 1) + WS(rs, 5)];
+						       TbS = TbQ - TbR;
+						       Tc6 = TbQ + TbR;
+						       Tb1 = TaX - Tb0;
+						       Tbh = TaX + Tb0;
+						       T1G = T1E + T1F;
+						       T1W = T1E - T1F;
+						       Tbt = Tb6 + Tb1;
+						       Tb7 = Tb1 - Tb6;
+						       Tby = Tbg + Tbh;
+						       Tbi = Tbg - Tbh;
+						       T1X = iio[WS(vs, 1) + WS(rs, 1)];
+						       T1Y = iio[WS(vs, 1) + WS(rs, 5)];
+						  }
+						  T1H = rio[WS(vs, 1) + WS(rs, 7)];
+						  T1I = rio[WS(vs, 1) + WS(rs, 3)];
+						  T1S = iio[WS(vs, 1) + WS(rs, 7)];
+						  T2K = T1X + T1Y;
+						  T1Z = T1X - T1Y;
+						  T1R = T1H - T1I;
+						  T1J = T1H + T1I;
+						  T1T = iio[WS(vs, 1) + WS(rs, 3)];
+					     }
+					     T20 = T1W + T1Z;
+					     T2a = T1Z - T1W;
+					     T1K = T1G + T1J;
+					     T2F = T1J - T1G;
+					     T1U = T1S - T1T;
+					     T2L = T1S + T1T;
+					}
+					{
+					     E T3s, T3c, T3o, T3v, T3f, T3p;
+					     {
+						  E T3t, T3u, T3d, T3e;
+						  {
+						       E T2b, T1V, T3a, T3b;
+						       T3a = rio[WS(vs, 2) + WS(rs, 1)];
+						       T3b = rio[WS(vs, 2) + WS(rs, 5)];
+						       T30 = T2K + T2L;
+						       T2M = T2K - T2L;
+						       T2b = T1R + T1U;
+						       T1V = T1R - T1U;
+						       T3s = T3a - T3b;
+						       T3c = T3a + T3b;
+						       T21 = T1V - T20;
+						       T2n = T20 + T1V;
+						       T2c = T2a - T2b;
+						       T2s = T2a + T2b;
+						       T3t = iio[WS(vs, 2) + WS(rs, 1)];
+						       T3u = iio[WS(vs, 2) + WS(rs, 5)];
+						  }
+						  T3d = rio[WS(vs, 2) + WS(rs, 7)];
+						  T3e = rio[WS(vs, 2) + WS(rs, 3)];
+						  T3o = iio[WS(vs, 2) + WS(rs, 7)];
+						  T3v = T3t - T3u;
+						  T4g = T3t + T3u;
+						  T3f = T3d + T3e;
+						  T3n = T3d - T3e;
+						  T3p = iio[WS(vs, 2) + WS(rs, 3)];
+					     }
+					     T3G = T3v - T3s;
+					     T3w = T3s + T3v;
+					     T4b = T3f - T3c;
+					     T3g = T3c + T3f;
+					     T4h = T3o + T3p;
+					     T3q = T3o - T3p;
+					}
+				   }
+				   {
+					E T8O, T7V, T84, T8e, T7Y, T8P;
+					{
+					     E T7K, T80, T7W, T83, T7N, T7X;
+					     {
+						  E T81, T82, T7L, T7M;
+						  {
+						       E T3r, T3H, T7I, T7J;
+						       T7I = rio[WS(vs, 5) + WS(rs, 1)];
+						       T7J = rio[WS(vs, 5) + WS(rs, 5)];
+						       T4i = T4g - T4h;
+						       T4w = T4g + T4h;
+						       T3r = T3n - T3q;
+						       T3H = T3n + T3q;
+						       T7K = T7I + T7J;
+						       T80 = T7I - T7J;
+						       T3T = T3w + T3r;
+						       T3x = T3r - T3w;
+						       T3Y = T3G + T3H;
+						       T3I = T3G - T3H;
+						       T81 = iio[WS(vs, 5) + WS(rs, 1)];
+						       T82 = iio[WS(vs, 5) + WS(rs, 5)];
+						  }
+						  T7L = rio[WS(vs, 5) + WS(rs, 7)];
+						  T7M = rio[WS(vs, 5) + WS(rs, 3)];
+						  T7W = iio[WS(vs, 5) + WS(rs, 7)];
+						  T8O = T81 + T82;
+						  T83 = T81 - T82;
+						  T7V = T7L - T7M;
+						  T7N = T7L + T7M;
+						  T7X = iio[WS(vs, 5) + WS(rs, 3)];
+					     }
+					     T84 = T80 + T83;
+					     T8e = T83 - T80;
+					     T7O = T7K + T7N;
+					     T8J = T7N - T7K;
+					     T7Y = T7W - T7X;
+					     T8P = T7W + T7X;
+					}
+					{
+					     E T9w, T9g, T9s, T9z, T9j, T9t;
+					     {
+						  E T9x, T9y, T9h, T9i;
+						  {
+						       E T8f, T7Z, T9e, T9f;
+						       T9e = rio[WS(vs, 6) + WS(rs, 1)];
+						       T9f = rio[WS(vs, 6) + WS(rs, 5)];
+						       T94 = T8O + T8P;
+						       T8Q = T8O - T8P;
+						       T8f = T7V + T7Y;
+						       T7Z = T7V - T7Y;
+						       T9w = T9e - T9f;
+						       T9g = T9e + T9f;
+						       T85 = T7Z - T84;
+						       T8r = T84 + T7Z;
+						       T8g = T8e - T8f;
+						       T8w = T8e + T8f;
+						       T9x = iio[WS(vs, 6) + WS(rs, 1)];
+						       T9y = iio[WS(vs, 6) + WS(rs, 5)];
+						  }
+						  T9h = rio[WS(vs, 6) + WS(rs, 7)];
+						  T9i = rio[WS(vs, 6) + WS(rs, 3)];
+						  T9s = iio[WS(vs, 6) + WS(rs, 7)];
+						  T9z = T9x - T9y;
+						  Tak = T9x + T9y;
+						  T9j = T9h + T9i;
+						  T9r = T9h - T9i;
+						  T9t = iio[WS(vs, 6) + WS(rs, 3)];
+					     }
+					     T9K = T9z - T9w;
+					     T9A = T9w + T9z;
+					     Taf = T9j - T9g;
+					     T9k = T9g + T9j;
+					     Tal = T9s + T9t;
+					     T9u = T9s - T9t;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T9X, T9B, Ta2, T9M, T2T, T2Q, TbT, TbH, TbO, TbN, TbU;
+			 {
+			      E Tam, TaA, T9v, T9L;
+			      rio[0] = T7 + Te;
+			      iio[0] = T1t + T1u;
+			      Tam = Tak - Tal;
+			      TaA = Tak + Tal;
+			      T9v = T9r - T9u;
+			      T9L = T9r + T9u;
+			      rio[WS(rs, 1)] = T1D + T1K;
+			      iio[WS(rs, 1)] = T2Z + T30;
+			      T9X = T9A + T9v;
+			      T9B = T9v - T9A;
+			      Ta2 = T9K + T9L;
+			      T9M = T9K - T9L;
+			      rio[WS(rs, 2)] = T39 + T3g;
+			      iio[WS(rs, 2)] = T4v + T4w;
+			      rio[WS(rs, 3)] = T4F + T4M;
+			      iio[WS(rs, 3)] = T61 + T62;
+			      rio[WS(rs, 4)] = T6b + T6i;
+			      iio[WS(rs, 4)] = T7x + T7y;
+			      rio[WS(rs, 5)] = T7H + T7O;
+			      iio[WS(rs, 5)] = T93 + T94;
+			      rio[WS(rs, 6)] = T9d + T9k;
+			      iio[WS(rs, 6)] = Taz + TaA;
+			      rio[WS(rs, 7)] = TaJ + TaQ;
+			      iio[WS(rs, 7)] = Tc5 + Tc6;
+			      {
+				   E T10, T13, T1h, T1a, Tat, Taq, TbC, TbF, TbE, TbG, TbD;
+				   {
+					E T1q, T1v, T1s, T1w, T1r;
+					{
+					     E T2N, T2B, T2I, T2H, T2O;
+					     {
+						  E TS, TX, TP, TU, T2G, TY, TT;
+						  T10 = FMA(KP707106781, TR, TQ);
+						  TS = FNMS(KP707106781, TR, TQ);
+						  TX = FNMS(KP707106781, TW, TV);
+						  T13 = FMA(KP707106781, TW, TV);
+						  TP = W[8];
+						  TU = W[9];
+						  T2T = T2J + T2M;
+						  T2N = T2J - T2M;
+						  T2G = T2E - T2F;
+						  T2Q = T2F + T2E;
+						  TY = TP * TX;
+						  TT = TP * TS;
+						  T2B = W[10];
+						  T2I = W[11];
+						  iio[WS(vs, 5)] = FNMS(TU, TS, TY);
+						  rio[WS(vs, 5)] = FMA(TU, TX, TT);
+						  T2H = T2B * T2G;
+						  T2O = T2I * T2G;
+					     }
+					     {
+						  E T1n, T1k, T1j, T1m, T1l, T1o, T1p;
+						  T1h = T1d - T1g;
+						  T1n = T1d + T1g;
+						  T1k = T19 + T18;
+						  T1a = T18 - T19;
+						  iio[WS(vs, 6) + WS(rs, 1)] = FNMS(T2I, T2N, T2H);
+						  rio[WS(vs, 6) + WS(rs, 1)] = FMA(T2B, T2N, T2O);
+						  T1j = W[2];
+						  T1m = W[3];
+						  T1q = T7 - Te;
+						  T1v = T1t - T1u;
+						  T1l = T1j * T1k;
+						  T1o = T1m * T1k;
+						  T1p = W[6];
+						  T1s = W[7];
+						  iio[WS(vs, 2)] = FNMS(T1m, T1n, T1l);
+						  rio[WS(vs, 2)] = FMA(T1j, T1n, T1o);
+						  T1w = T1p * T1v;
+						  T1r = T1p * T1q;
+					     }
+					}
+					{
+					     E Tc2, Tc7, Tc4, Tc8, Tc3;
+					     {
+						  E Tan, Tag, Tab, Tai, Tah, Tao, Tc1;
+						  Tat = Taj + Tam;
+						  Tan = Taj - Tam;
+						  Tag = Tae - Taf;
+						  Taq = Taf + Tae;
+						  iio[WS(vs, 4)] = FNMS(T1s, T1q, T1w);
+						  rio[WS(vs, 4)] = FMA(T1s, T1v, T1r);
+						  Tab = W[10];
+						  Tai = W[11];
+						  Tc2 = TaJ - TaQ;
+						  Tc7 = Tc5 - Tc6;
+						  Tah = Tab * Tag;
+						  Tao = Tai * Tag;
+						  Tc1 = W[6];
+						  Tc4 = W[7];
+						  iio[WS(vs, 6) + WS(rs, 6)] = FNMS(Tai, Tan, Tah);
+						  rio[WS(vs, 6) + WS(rs, 6)] = FMA(Tab, Tan, Tao);
+						  Tc8 = Tc1 * Tc7;
+						  Tc3 = Tc1 * Tc2;
+					     }
+					     {
+						  E Tbu, Tbz, Tbr, Tbw, TbA, Tbv, TbB;
+						  TbC = FMA(KP707106781, Tbt, Tbs);
+						  Tbu = FNMS(KP707106781, Tbt, Tbs);
+						  Tbz = FNMS(KP707106781, Tby, Tbx);
+						  TbF = FMA(KP707106781, Tby, Tbx);
+						  iio[WS(vs, 4) + WS(rs, 7)] = FNMS(Tc4, Tc2, Tc8);
+						  rio[WS(vs, 4) + WS(rs, 7)] = FMA(Tc4, Tc7, Tc3);
+						  Tbr = W[8];
+						  Tbw = W[9];
+						  TbA = Tbr * Tbz;
+						  Tbv = Tbr * Tbu;
+						  TbB = W[0];
+						  TbE = W[1];
+						  iio[WS(vs, 5) + WS(rs, 7)] = FNMS(Tbw, Tbu, TbA);
+						  rio[WS(vs, 5) + WS(rs, 7)] = FMA(Tbw, Tbz, Tbv);
+						  TbG = TbB * TbF;
+						  TbD = TbB * TbC;
+					     }
+					}
+				   }
+				   {
+					E T2o, T2t, T2q, T2u, T2p;
+					{
+					     E T2w, T2z, T2y, T2A, T2x;
+					     {
+						  E TZ, T12, T14, T11, T2v;
+						  iio[WS(vs, 1) + WS(rs, 7)] = FNMS(TbE, TbC, TbG);
+						  rio[WS(vs, 1) + WS(rs, 7)] = FMA(TbE, TbF, TbD);
+						  TZ = W[0];
+						  T12 = W[1];
+						  T2o = FNMS(KP707106781, T2n, T2m);
+						  T2w = FMA(KP707106781, T2n, T2m);
+						  T2z = FMA(KP707106781, T2s, T2r);
+						  T2t = FNMS(KP707106781, T2s, T2r);
+						  T14 = TZ * T13;
+						  T11 = TZ * T10;
+						  T2v = W[0];
+						  T2y = W[1];
+						  iio[WS(vs, 1)] = FNMS(T12, T10, T14);
+						  rio[WS(vs, 1)] = FMA(T12, T13, T11);
+						  T2A = T2v * T2z;
+						  T2x = T2v * T2w;
+					     }
+					     {
+						  E T15, T1c, T1b, T1i, T2l;
+						  iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T2y, T2w, T2A);
+						  rio[WS(vs, 1) + WS(rs, 1)] = FMA(T2y, T2z, T2x);
+						  T15 = W[10];
+						  T1c = W[11];
+						  T1b = T15 * T1a;
+						  T1i = T1c * T1a;
+						  T2l = W[8];
+						  T2q = W[9];
+						  iio[WS(vs, 6)] = FNMS(T1c, T1h, T1b);
+						  rio[WS(vs, 6)] = FMA(T15, T1h, T1i);
+						  T2u = T2l * T2t;
+						  T2p = T2l * T2o;
+					     }
+					}
+					{
+					     E TbZ, TbM, TbV, TbY, TbX, Tc0;
+					     {
+						  E Tap, Tas, TbW, Tar, Tau;
+						  iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T2q, T2o, T2u);
+						  rio[WS(vs, 5) + WS(rs, 1)] = FMA(T2q, T2t, T2p);
+						  Tap = W[2];
+						  Tas = W[3];
+						  TbT = TbP - TbS;
+						  TbZ = TbP + TbS;
+						  TbW = TbL + TbK;
+						  TbM = TbK - TbL;
+						  Tar = Tap * Taq;
+						  Tau = Tas * Taq;
+						  TbV = W[2];
+						  TbY = W[3];
+						  iio[WS(vs, 2) + WS(rs, 6)] = FNMS(Tas, Tat, Tar);
+						  rio[WS(vs, 2) + WS(rs, 6)] = FMA(Tap, Tat, Tau);
+						  TbX = TbV * TbW;
+						  Tc0 = TbY * TbW;
+					     }
+					     {
+						  E Taw, TaB, Tav, Tay, TaC, Tax;
+						  Taw = T9d - T9k;
+						  TaB = Taz - TaA;
+						  iio[WS(vs, 2) + WS(rs, 7)] = FNMS(TbY, TbZ, TbX);
+						  rio[WS(vs, 2) + WS(rs, 7)] = FMA(TbV, TbZ, Tc0);
+						  Tav = W[6];
+						  Tay = W[7];
+						  TaC = Tav * TaB;
+						  Tax = Tav * Taw;
+						  TbH = W[10];
+						  TbO = W[11];
+						  iio[WS(vs, 4) + WS(rs, 6)] = FNMS(Tay, Taw, TaC);
+						  rio[WS(vs, 4) + WS(rs, 6)] = FMA(Tay, TaB, Tax);
+						  TbN = TbH * TbM;
+						  TbU = TbO * TbM;
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5q, T5v, T8R, T8K, T90, T95, T92, T96, T91;
+			      {
+				   E T3U, T3Z, T74, T77, T9Y, Ta3, T7l, T7e, T8X, T8T, T8W, T8V, T8Y;
+				   {
+					E T5y, T5B, T5A, T5C, T5z;
+					{
+					     E T5Y, T63, T60, T64, T5Z;
+					     {
+						  E T2P, T2S, T2R, T2U, T5X;
+						  iio[WS(vs, 6) + WS(rs, 7)] = FNMS(TbO, TbT, TbN);
+						  rio[WS(vs, 6) + WS(rs, 7)] = FMA(TbH, TbT, TbU);
+						  T2P = W[2];
+						  T2S = W[3];
+						  T5Y = T4F - T4M;
+						  T63 = T61 - T62;
+						  T2R = T2P * T2Q;
+						  T2U = T2S * T2Q;
+						  T5X = W[6];
+						  T60 = W[7];
+						  iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T2S, T2T, T2R);
+						  rio[WS(vs, 2) + WS(rs, 1)] = FMA(T2P, T2T, T2U);
+						  T64 = T5X * T63;
+						  T5Z = T5X * T5Y;
+					     }
+					     {
+						  E T42, T45, T41, T44, T46, T43, T5x;
+						  T3U = FNMS(KP707106781, T3T, T3S);
+						  T42 = FMA(KP707106781, T3T, T3S);
+						  T45 = FMA(KP707106781, T3Y, T3X);
+						  T3Z = FNMS(KP707106781, T3Y, T3X);
+						  iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T60, T5Y, T64);
+						  rio[WS(vs, 4) + WS(rs, 3)] = FMA(T60, T63, T5Z);
+						  T41 = W[0];
+						  T44 = W[1];
+						  T5q = FNMS(KP707106781, T5p, T5o);
+						  T5y = FMA(KP707106781, T5p, T5o);
+						  T5B = FMA(KP707106781, T5u, T5t);
+						  T5v = FNMS(KP707106781, T5u, T5t);
+						  T46 = T41 * T45;
+						  T43 = T41 * T42;
+						  T5x = W[0];
+						  T5A = W[1];
+						  iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T44, T42, T46);
+						  rio[WS(vs, 1) + WS(rs, 2)] = FMA(T44, T45, T43);
+						  T5C = T5x * T5B;
+						  T5z = T5x * T5y;
+					     }
+					}
+					{
+					     E Ta6, Ta9, Ta8, Taa, Ta7;
+					     {
+						  E T6W, T71, T6T, T6Y, T72, T6X, Ta5;
+						  T74 = FMA(KP707106781, T6V, T6U);
+						  T6W = FNMS(KP707106781, T6V, T6U);
+						  T71 = FNMS(KP707106781, T70, T6Z);
+						  T77 = FMA(KP707106781, T70, T6Z);
+						  iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T5A, T5y, T5C);
+						  rio[WS(vs, 1) + WS(rs, 3)] = FMA(T5A, T5B, T5z);
+						  T6T = W[8];
+						  T6Y = W[9];
+						  T9Y = FNMS(KP707106781, T9X, T9W);
+						  Ta6 = FMA(KP707106781, T9X, T9W);
+						  Ta9 = FMA(KP707106781, Ta2, Ta1);
+						  Ta3 = FNMS(KP707106781, Ta2, Ta1);
+						  T72 = T6T * T71;
+						  T6X = T6T * T6W;
+						  Ta5 = W[0];
+						  Ta8 = W[1];
+						  iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T6Y, T6W, T72);
+						  rio[WS(vs, 5) + WS(rs, 4)] = FMA(T6Y, T71, T6X);
+						  Taa = Ta5 * Ta9;
+						  Ta7 = Ta5 * Ta6;
+					     }
+					     {
+						  E T7r, T7o, T7n, T7q, T8U, T7p, T7s;
+						  T7l = T7h - T7k;
+						  T7r = T7h + T7k;
+						  T7o = T7d + T7c;
+						  T7e = T7c - T7d;
+						  iio[WS(vs, 1) + WS(rs, 6)] = FNMS(Ta8, Ta6, Taa);
+						  rio[WS(vs, 1) + WS(rs, 6)] = FMA(Ta8, Ta9, Ta7);
+						  T7n = W[2];
+						  T7q = W[3];
+						  T8R = T8N - T8Q;
+						  T8X = T8N + T8Q;
+						  T8U = T8J + T8I;
+						  T8K = T8I - T8J;
+						  T7p = T7n * T7o;
+						  T7s = T7q * T7o;
+						  T8T = W[2];
+						  T8W = W[3];
+						  iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T7q, T7r, T7p);
+						  rio[WS(vs, 2) + WS(rs, 4)] = FMA(T7n, T7r, T7s);
+						  T8V = T8T * T8U;
+						  T8Y = T8W * T8U;
+					     }
+					}
+				   }
+				   {
+					E T5P, T5D, T5K, T5J, T5Q, Ta0, Ta4, T9Z;
+					{
+					     E T5V, T5I, T5R, T5U, T5T, T5W;
+					     {
+						  E T2W, T31, T2V, T2Y, T5S, T32, T2X;
+						  T2W = T1D - T1K;
+						  T31 = T2Z - T30;
+						  iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T8W, T8X, T8V);
+						  rio[WS(vs, 2) + WS(rs, 5)] = FMA(T8T, T8X, T8Y);
+						  T2V = W[6];
+						  T2Y = W[7];
+						  T5P = T5L - T5O;
+						  T5V = T5L + T5O;
+						  T5S = T5H + T5G;
+						  T5I = T5G - T5H;
+						  T32 = T2V * T31;
+						  T2X = T2V * T2W;
+						  T5R = W[2];
+						  T5U = W[3];
+						  iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T2Y, T2W, T32);
+						  rio[WS(vs, 4) + WS(rs, 1)] = FMA(T2Y, T31, T2X);
+						  T5T = T5R * T5S;
+						  T5W = T5U * T5S;
+					     }
+					     {
+						  E T3R, T3W, T40, T3V;
+						  iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T5U, T5V, T5T);
+						  rio[WS(vs, 2) + WS(rs, 3)] = FMA(T5R, T5V, T5W);
+						  T3R = W[8];
+						  T3W = W[9];
+						  T40 = T3R * T3Z;
+						  T3V = T3R * T3U;
+						  T5D = W[10];
+						  T5K = W[11];
+						  iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T3W, T3U, T40);
+						  rio[WS(vs, 5) + WS(rs, 2)] = FMA(T3W, T3Z, T3V);
+						  T5J = T5D * T5I;
+						  T5Q = T5K * T5I;
+					     }
+					}
+					{
+					     E T73, T76, T78, T75, T9V;
+					     iio[WS(vs, 6) + WS(rs, 3)] = FNMS(T5K, T5P, T5J);
+					     rio[WS(vs, 6) + WS(rs, 3)] = FMA(T5D, T5P, T5Q);
+					     T73 = W[0];
+					     T76 = W[1];
+					     T78 = T73 * T77;
+					     T75 = T73 * T74;
+					     T9V = W[8];
+					     Ta0 = W[9];
+					     iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T76, T74, T78);
+					     rio[WS(vs, 1) + WS(rs, 4)] = FMA(T76, T77, T75);
+					     Ta4 = T9V * Ta3;
+					     T9Z = T9V * T9Y;
+					}
+					{
+					     E T79, T7g, T7f, T7m, T8Z;
+					     iio[WS(vs, 5) + WS(rs, 6)] = FNMS(Ta0, T9Y, Ta4);
+					     rio[WS(vs, 5) + WS(rs, 6)] = FMA(Ta0, Ta3, T9Z);
+					     T79 = W[10];
+					     T7g = W[11];
+					     T90 = T7H - T7O;
+					     T95 = T93 - T94;
+					     T7f = T79 * T7e;
+					     T7m = T7g * T7e;
+					     T8Z = W[6];
+					     T92 = W[7];
+					     iio[WS(vs, 6) + WS(rs, 4)] = FNMS(T7g, T7l, T7f);
+					     rio[WS(vs, 6) + WS(rs, 4)] = FMA(T79, T7l, T7m);
+					     T96 = T8Z * T95;
+					     T91 = T8Z * T90;
+					}
+				   }
+			      }
+			      {
+				   E T8A, T8D, T8C, T8E, T8B;
+				   {
+					E T4s, T4x, T4u, T4y, T4t;
+					{
+					     E T4p, T4m, T5s, T5w, T5r;
+					     {
+						  E T4j, T4c, T47, T4e, T4d, T4k, T5n;
+						  T4p = T4f + T4i;
+						  T4j = T4f - T4i;
+						  T4c = T4a - T4b;
+						  T4m = T4b + T4a;
+						  iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T92, T90, T96);
+						  rio[WS(vs, 4) + WS(rs, 5)] = FMA(T92, T95, T91);
+						  T47 = W[10];
+						  T4e = W[11];
+						  T4d = T47 * T4c;
+						  T4k = T4e * T4c;
+						  T5n = W[8];
+						  T5s = W[9];
+						  iio[WS(vs, 6) + WS(rs, 2)] = FNMS(T4e, T4j, T4d);
+						  rio[WS(vs, 6) + WS(rs, 2)] = FMA(T47, T4j, T4k);
+						  T5w = T5n * T5v;
+						  T5r = T5n * T5q;
+					     }
+					     {
+						  E T4l, T4o, T4n, T4q, T4r;
+						  iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T5s, T5q, T5w);
+						  rio[WS(vs, 5) + WS(rs, 3)] = FMA(T5s, T5v, T5r);
+						  T4l = W[2];
+						  T4o = W[3];
+						  T4s = T39 - T3g;
+						  T4x = T4v - T4w;
+						  T4n = T4l * T4m;
+						  T4q = T4o * T4m;
+						  T4r = W[6];
+						  T4u = W[7];
+						  iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T4o, T4p, T4n);
+						  rio[WS(vs, 2) + WS(rs, 2)] = FMA(T4l, T4p, T4q);
+						  T4y = T4r * T4x;
+						  T4t = T4r * T4s;
+					     }
+					}
+					{
+					     E T8F, T8M, T8L, T8S;
+					     {
+						  E T7u, T7z, T7t, T7w, T7A, T7v;
+						  T7u = T6b - T6i;
+						  T7z = T7x - T7y;
+						  iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T4u, T4s, T4y);
+						  rio[WS(vs, 4) + WS(rs, 2)] = FMA(T4u, T4x, T4t);
+						  T7t = W[6];
+						  T7w = W[7];
+						  T7A = T7t * T7z;
+						  T7v = T7t * T7u;
+						  T8F = W[10];
+						  T8M = W[11];
+						  iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T7w, T7u, T7A);
+						  rio[WS(vs, 4) + WS(rs, 4)] = FMA(T7w, T7z, T7v);
+						  T8L = T8F * T8K;
+						  T8S = T8M * T8K;
+					     }
+					     {
+						  E T8s, T8x, T8p, T8u, T8y, T8t, T8z;
+						  T8A = FMA(KP707106781, T8r, T8q);
+						  T8s = FNMS(KP707106781, T8r, T8q);
+						  T8x = FNMS(KP707106781, T8w, T8v);
+						  T8D = FMA(KP707106781, T8w, T8v);
+						  iio[WS(vs, 6) + WS(rs, 5)] = FNMS(T8M, T8R, T8L);
+						  rio[WS(vs, 6) + WS(rs, 5)] = FMA(T8F, T8R, T8S);
+						  T8p = W[8];
+						  T8u = W[9];
+						  T8y = T8p * T8x;
+						  T8t = T8p * T8s;
+						  T8z = W[0];
+						  T8C = W[1];
+						  iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T8u, T8s, T8y);
+						  rio[WS(vs, 5) + WS(rs, 5)] = FMA(T8u, T8x, T8t);
+						  T8E = T8z * T8D;
+						  T8B = T8z * T8A;
+					     }
+					}
+				   }
+				   {
+					E T3y, T3J, T3h, T3A, T3z, T3K;
+					{
+					     E T54, T5f, T4N, T56, T55, T5g;
+					     {
+						  E Tw, TH, Tf, Ty, Tx, TI;
+						  {
+						       E TN, TJ, TM, TL, TO, TK;
+						       TK = FMA(KP707106781, Tv, Tk);
+						       Tw = FNMS(KP707106781, Tv, Tk);
+						       iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T8C, T8A, T8E);
+						       rio[WS(vs, 1) + WS(rs, 5)] = FMA(T8C, T8D, T8B);
+						       TH = FNMS(KP707106781, TG, TD);
+						       TN = FMA(KP707106781, TG, TD);
+						       TJ = W[4];
+						       TM = W[5];
+						       Tf = W[12];
+						       TL = TJ * TK;
+						       TO = TM * TK;
+						       Ty = W[13];
+						       Tx = Tf * Tw;
+						       iio[WS(vs, 3)] = FNMS(TM, TN, TL);
+						       rio[WS(vs, 3)] = FMA(TJ, TN, TO);
+						  }
+						  TI = Ty * Tw;
+						  iio[WS(vs, 7)] = FNMS(Ty, TH, Tx);
+						  {
+						       E T5h, T5l, T5k, T5j, T5m, T5i;
+						       T5i = FMA(KP707106781, T53, T4S);
+						       T54 = FNMS(KP707106781, T53, T4S);
+						       rio[WS(vs, 7)] = FMA(Tf, TH, TI);
+						       T5h = W[4];
+						       T5f = FNMS(KP707106781, T5e, T5b);
+						       T5l = FMA(KP707106781, T5e, T5b);
+						       T5k = W[5];
+						       T5j = T5h * T5i;
+						       T4N = W[12];
+						       T5m = T5k * T5i;
+						       T56 = W[13];
+						       iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T5k, T5l, T5j);
+						       T55 = T4N * T54;
+						       rio[WS(vs, 3) + WS(rs, 3)] = FMA(T5h, T5l, T5m);
+						  }
+					     }
+					     T5g = T56 * T54;
+					     {
+						  E T22, T2d, T1L, T24, T23, T2e;
+						  {
+						       E T2j, T2f, T2i, T2h, T2k, T2g;
+						       iio[WS(vs, 7) + WS(rs, 3)] = FNMS(T56, T5f, T55);
+						       T22 = FNMS(KP707106781, T21, T1Q);
+						       T2g = FMA(KP707106781, T21, T1Q);
+						       rio[WS(vs, 7) + WS(rs, 3)] = FMA(T4N, T5f, T5g);
+						       T2d = FNMS(KP707106781, T2c, T29);
+						       T2j = FMA(KP707106781, T2c, T29);
+						       T2f = W[4];
+						       T2i = W[5];
+						       T1L = W[12];
+						       T2h = T2f * T2g;
+						       T2k = T2i * T2g;
+						       T24 = W[13];
+						       T23 = T1L * T22;
+						       iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T2i, T2j, T2h);
+						       rio[WS(vs, 3) + WS(rs, 1)] = FMA(T2f, T2j, T2k);
+						  }
+						  T2e = T24 * T22;
+						  iio[WS(vs, 7) + WS(rs, 1)] = FNMS(T24, T2d, T23);
+						  {
+						       E T3L, T3P, T3O, T3N, T3Q, T3M;
+						       T3M = FMA(KP707106781, T3x, T3m);
+						       T3y = FNMS(KP707106781, T3x, T3m);
+						       rio[WS(vs, 7) + WS(rs, 1)] = FMA(T1L, T2d, T2e);
+						       T3L = W[4];
+						       T3J = FNMS(KP707106781, T3I, T3F);
+						       T3P = FMA(KP707106781, T3I, T3F);
+						       T3O = W[5];
+						       T3N = T3L * T3M;
+						       T3h = W[12];
+						       T3Q = T3O * T3M;
+						       T3A = W[13];
+						       iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T3O, T3P, T3N);
+						       T3z = T3h * T3y;
+						       rio[WS(vs, 3) + WS(rs, 2)] = FMA(T3L, T3P, T3Q);
+						  }
+					     }
+					}
+					T3K = T3A * T3y;
+					{
+					     E Tb8, Tbj, TaR, Tba, Tb9, Tbk;
+					     {
+						  E T6A, T6L, T6j, T6C, T6B, T6M;
+						  {
+						       E T6R, T6N, T6Q, T6P, T6S, T6O;
+						       iio[WS(vs, 7) + WS(rs, 2)] = FNMS(T3A, T3J, T3z);
+						       T6A = FNMS(KP707106781, T6z, T6o);
+						       T6O = FMA(KP707106781, T6z, T6o);
+						       rio[WS(vs, 7) + WS(rs, 2)] = FMA(T3h, T3J, T3K);
+						       T6L = FNMS(KP707106781, T6K, T6H);
+						       T6R = FMA(KP707106781, T6K, T6H);
+						       T6N = W[4];
+						       T6Q = W[5];
+						       T6j = W[12];
+						       T6P = T6N * T6O;
+						       T6S = T6Q * T6O;
+						       T6C = W[13];
+						       T6B = T6j * T6A;
+						       iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T6Q, T6R, T6P);
+						       rio[WS(vs, 3) + WS(rs, 4)] = FMA(T6N, T6R, T6S);
+						  }
+						  T6M = T6C * T6A;
+						  iio[WS(vs, 7) + WS(rs, 4)] = FNMS(T6C, T6L, T6B);
+						  {
+						       E Tbl, Tbp, Tbo, Tbn, Tbq, Tbm;
+						       Tbm = FMA(KP707106781, Tb7, TaW);
+						       Tb8 = FNMS(KP707106781, Tb7, TaW);
+						       rio[WS(vs, 7) + WS(rs, 4)] = FMA(T6j, T6L, T6M);
+						       Tbl = W[4];
+						       Tbj = FNMS(KP707106781, Tbi, Tbf);
+						       Tbp = FMA(KP707106781, Tbi, Tbf);
+						       Tbo = W[5];
+						       Tbn = Tbl * Tbm;
+						       TaR = W[12];
+						       Tbq = Tbo * Tbm;
+						       Tba = W[13];
+						       iio[WS(vs, 3) + WS(rs, 7)] = FNMS(Tbo, Tbp, Tbn);
+						       Tb9 = TaR * Tb8;
+						       rio[WS(vs, 3) + WS(rs, 7)] = FMA(Tbl, Tbp, Tbq);
+						  }
+					     }
+					     Tbk = Tba * Tb8;
+					     {
+						  E T86, T8h, T7P, T88, T87, T8i;
+						  {
+						       E T8n, T8j, T8m, T8l, T8o, T8k;
+						       iio[WS(vs, 7) + WS(rs, 7)] = FNMS(Tba, Tbj, Tb9);
+						       T86 = FNMS(KP707106781, T85, T7U);
+						       T8k = FMA(KP707106781, T85, T7U);
+						       rio[WS(vs, 7) + WS(rs, 7)] = FMA(TaR, Tbj, Tbk);
+						       T8h = FNMS(KP707106781, T8g, T8d);
+						       T8n = FMA(KP707106781, T8g, T8d);
+						       T8j = W[4];
+						       T8m = W[5];
+						       T7P = W[12];
+						       T8l = T8j * T8k;
+						       T8o = T8m * T8k;
+						       T88 = W[13];
+						       T87 = T7P * T86;
+						       iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T8m, T8n, T8l);
+						       rio[WS(vs, 3) + WS(rs, 5)] = FMA(T8j, T8n, T8o);
+						  }
+						  T8i = T88 * T86;
+						  iio[WS(vs, 7) + WS(rs, 5)] = FNMS(T88, T8h, T87);
+						  {
+						       E T9P, T9T, T9S, T9R, T9U, T9Q;
+						       T9Q = FMA(KP707106781, T9B, T9q);
+						       T9C = FNMS(KP707106781, T9B, T9q);
+						       rio[WS(vs, 7) + WS(rs, 5)] = FMA(T7P, T8h, T8i);
+						       T9P = W[4];
+						       T9N = FNMS(KP707106781, T9M, T9J);
+						       T9T = FMA(KP707106781, T9M, T9J);
+						       T9S = W[5];
+						       T9R = T9P * T9Q;
+						       T9l = W[12];
+						       T9U = T9S * T9Q;
+						       T9E = W[13];
+						       iio[WS(vs, 3) + WS(rs, 6)] = FNMS(T9S, T9T, T9R);
+						       T9D = T9l * T9C;
+						       rio[WS(vs, 3) + WS(rs, 6)] = FMA(T9P, T9T, T9U);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T9O = T9E * T9C;
+	       iio[WS(vs, 7) + WS(rs, 6)] = FNMS(T9E, T9N, T9D);
+	       rio[WS(vs, 7) + WS(rs, 6)] = FMA(T9l, T9N, T9O);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "q1_8", twinstr, &GENUS, {352, 112, 176, 0}, 0, 0, 0 };
+
+void X(codelet_q1_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 8 -name q1_8 -include q.h */
+
+/*
+ * This function contains 528 FP additions, 256 FP multiplications,
+ * (or, 416 additions, 144 multiplications, 112 fused multiply/add),
+ * 142 stack variables, 1 constants, and 256 memory accesses
+ */
+#include "q.h"
+
+static void q1_8(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
+	       E T7, T14, T1g, Tk, TC, TQ, T10, TM, T1w, T2p, T2z, T1H, T1M, T1W, T2j;
+	       E T1V, T7R, T8O, T90, T84, T8m, T8A, T8K, T8w, T9g, Ta9, Taj, T9r, T9w, T9G;
+	       E Ta3, T9F, Te, T17, T1h, Tp, Tu, TE, T11, TD, T1p, T2m, T2y, T1C, T1U;
+	       E T28, T2i, T24, T7Y, T8R, T91, T89, T8e, T8o, T8L, T8n, T99, Ta6, Tai, T9m;
+	       E T9E, T9S, Ta2, T9O, T2H, T3E, T3Q, T2U, T3c, T3q, T3A, T3m, T46, T4Z, T59;
+	       E T4h, T4m, T4w, T4T, T4v, T5h, T6e, T6q, T5u, T5M, T60, T6a, T5W, T6G, T7z;
+	       E T7J, T6R, T6W, T76, T7t, T75, T2O, T3H, T3R, T2Z, T34, T3e, T3B, T3d, T3Z;
+	       E T4W, T58, T4c, T4u, T4I, T4S, T4E, T5o, T6h, T6r, T5z, T5E, T5O, T6b, T5N;
+	       E T6z, T7w, T7I, T6M, T74, T7i, T7s, T7e;
+	       {
+		    E T3, Ty, Tj, TY, T6, Tg, TB, TZ;
+		    {
+			 E T1, T2, Th, Ti;
+			 T1 = rio[0];
+			 T2 = rio[WS(rs, 4)];
+			 T3 = T1 + T2;
+			 Ty = T1 - T2;
+			 Th = iio[0];
+			 Ti = iio[WS(rs, 4)];
+			 Tj = Th - Ti;
+			 TY = Th + Ti;
+		    }
+		    {
+			 E T4, T5, Tz, TA;
+			 T4 = rio[WS(rs, 2)];
+			 T5 = rio[WS(rs, 6)];
+			 T6 = T4 + T5;
+			 Tg = T4 - T5;
+			 Tz = iio[WS(rs, 2)];
+			 TA = iio[WS(rs, 6)];
+			 TB = Tz - TA;
+			 TZ = Tz + TA;
+		    }
+		    T7 = T3 + T6;
+		    T14 = T3 - T6;
+		    T1g = TY + TZ;
+		    Tk = Tg + Tj;
+		    TC = Ty - TB;
+		    TQ = Tj - Tg;
+		    T10 = TY - TZ;
+		    TM = Ty + TB;
+	       }
+	       {
+		    E T1s, T1I, T1L, T2n, T1v, T1D, T1G, T2o;
+		    {
+			 E T1q, T1r, T1J, T1K;
+			 T1q = rio[WS(vs, 1) + WS(rs, 1)];
+			 T1r = rio[WS(vs, 1) + WS(rs, 5)];
+			 T1s = T1q + T1r;
+			 T1I = T1q - T1r;
+			 T1J = iio[WS(vs, 1) + WS(rs, 1)];
+			 T1K = iio[WS(vs, 1) + WS(rs, 5)];
+			 T1L = T1J - T1K;
+			 T2n = T1J + T1K;
+		    }
+		    {
+			 E T1t, T1u, T1E, T1F;
+			 T1t = rio[WS(vs, 1) + WS(rs, 7)];
+			 T1u = rio[WS(vs, 1) + WS(rs, 3)];
+			 T1v = T1t + T1u;
+			 T1D = T1t - T1u;
+			 T1E = iio[WS(vs, 1) + WS(rs, 7)];
+			 T1F = iio[WS(vs, 1) + WS(rs, 3)];
+			 T1G = T1E - T1F;
+			 T2o = T1E + T1F;
+		    }
+		    T1w = T1s + T1v;
+		    T2p = T2n - T2o;
+		    T2z = T2n + T2o;
+		    T1H = T1D - T1G;
+		    T1M = T1I + T1L;
+		    T1W = T1D + T1G;
+		    T2j = T1v - T1s;
+		    T1V = T1L - T1I;
+	       }
+	       {
+		    E T7N, T8i, T83, T8I, T7Q, T80, T8l, T8J;
+		    {
+			 E T7L, T7M, T81, T82;
+			 T7L = rio[WS(vs, 6)];
+			 T7M = rio[WS(vs, 6) + WS(rs, 4)];
+			 T7N = T7L + T7M;
+			 T8i = T7L - T7M;
+			 T81 = iio[WS(vs, 6)];
+			 T82 = iio[WS(vs, 6) + WS(rs, 4)];
+			 T83 = T81 - T82;
+			 T8I = T81 + T82;
+		    }
+		    {
+			 E T7O, T7P, T8j, T8k;
+			 T7O = rio[WS(vs, 6) + WS(rs, 2)];
+			 T7P = rio[WS(vs, 6) + WS(rs, 6)];
+			 T7Q = T7O + T7P;
+			 T80 = T7O - T7P;
+			 T8j = iio[WS(vs, 6) + WS(rs, 2)];
+			 T8k = iio[WS(vs, 6) + WS(rs, 6)];
+			 T8l = T8j - T8k;
+			 T8J = T8j + T8k;
+		    }
+		    T7R = T7N + T7Q;
+		    T8O = T7N - T7Q;
+		    T90 = T8I + T8J;
+		    T84 = T80 + T83;
+		    T8m = T8i - T8l;
+		    T8A = T83 - T80;
+		    T8K = T8I - T8J;
+		    T8w = T8i + T8l;
+	       }
+	       {
+		    E T9c, T9s, T9v, Ta7, T9f, T9n, T9q, Ta8;
+		    {
+			 E T9a, T9b, T9t, T9u;
+			 T9a = rio[WS(vs, 7) + WS(rs, 1)];
+			 T9b = rio[WS(vs, 7) + WS(rs, 5)];
+			 T9c = T9a + T9b;
+			 T9s = T9a - T9b;
+			 T9t = iio[WS(vs, 7) + WS(rs, 1)];
+			 T9u = iio[WS(vs, 7) + WS(rs, 5)];
+			 T9v = T9t - T9u;
+			 Ta7 = T9t + T9u;
+		    }
+		    {
+			 E T9d, T9e, T9o, T9p;
+			 T9d = rio[WS(vs, 7) + WS(rs, 7)];
+			 T9e = rio[WS(vs, 7) + WS(rs, 3)];
+			 T9f = T9d + T9e;
+			 T9n = T9d - T9e;
+			 T9o = iio[WS(vs, 7) + WS(rs, 7)];
+			 T9p = iio[WS(vs, 7) + WS(rs, 3)];
+			 T9q = T9o - T9p;
+			 Ta8 = T9o + T9p;
+		    }
+		    T9g = T9c + T9f;
+		    Ta9 = Ta7 - Ta8;
+		    Taj = Ta7 + Ta8;
+		    T9r = T9n - T9q;
+		    T9w = T9s + T9v;
+		    T9G = T9n + T9q;
+		    Ta3 = T9f - T9c;
+		    T9F = T9v - T9s;
+	       }
+	       {
+		    E Ta, Tq, Tt, T15, Td, Tl, To, T16;
+		    {
+			 E T8, T9, Tr, Ts;
+			 T8 = rio[WS(rs, 1)];
+			 T9 = rio[WS(rs, 5)];
+			 Ta = T8 + T9;
+			 Tq = T8 - T9;
+			 Tr = iio[WS(rs, 1)];
+			 Ts = iio[WS(rs, 5)];
+			 Tt = Tr - Ts;
+			 T15 = Tr + Ts;
+		    }
+		    {
+			 E Tb, Tc, Tm, Tn;
+			 Tb = rio[WS(rs, 7)];
+			 Tc = rio[WS(rs, 3)];
+			 Td = Tb + Tc;
+			 Tl = Tb - Tc;
+			 Tm = iio[WS(rs, 7)];
+			 Tn = iio[WS(rs, 3)];
+			 To = Tm - Tn;
+			 T16 = Tm + Tn;
+		    }
+		    Te = Ta + Td;
+		    T17 = T15 - T16;
+		    T1h = T15 + T16;
+		    Tp = Tl - To;
+		    Tu = Tq + Tt;
+		    TE = Tl + To;
+		    T11 = Td - Ta;
+		    TD = Tt - Tq;
+	       }
+	       {
+		    E T1l, T1Q, T1B, T2g, T1o, T1y, T1T, T2h;
+		    {
+			 E T1j, T1k, T1z, T1A;
+			 T1j = rio[WS(vs, 1)];
+			 T1k = rio[WS(vs, 1) + WS(rs, 4)];
+			 T1l = T1j + T1k;
+			 T1Q = T1j - T1k;
+			 T1z = iio[WS(vs, 1)];
+			 T1A = iio[WS(vs, 1) + WS(rs, 4)];
+			 T1B = T1z - T1A;
+			 T2g = T1z + T1A;
+		    }
+		    {
+			 E T1m, T1n, T1R, T1S;
+			 T1m = rio[WS(vs, 1) + WS(rs, 2)];
+			 T1n = rio[WS(vs, 1) + WS(rs, 6)];
+			 T1o = T1m + T1n;
+			 T1y = T1m - T1n;
+			 T1R = iio[WS(vs, 1) + WS(rs, 2)];
+			 T1S = iio[WS(vs, 1) + WS(rs, 6)];
+			 T1T = T1R - T1S;
+			 T2h = T1R + T1S;
+		    }
+		    T1p = T1l + T1o;
+		    T2m = T1l - T1o;
+		    T2y = T2g + T2h;
+		    T1C = T1y + T1B;
+		    T1U = T1Q - T1T;
+		    T28 = T1B - T1y;
+		    T2i = T2g - T2h;
+		    T24 = T1Q + T1T;
+	       }
+	       {
+		    E T7U, T8a, T8d, T8P, T7X, T85, T88, T8Q;
+		    {
+			 E T7S, T7T, T8b, T8c;
+			 T7S = rio[WS(vs, 6) + WS(rs, 1)];
+			 T7T = rio[WS(vs, 6) + WS(rs, 5)];
+			 T7U = T7S + T7T;
+			 T8a = T7S - T7T;
+			 T8b = iio[WS(vs, 6) + WS(rs, 1)];
+			 T8c = iio[WS(vs, 6) + WS(rs, 5)];
+			 T8d = T8b - T8c;
+			 T8P = T8b + T8c;
+		    }
+		    {
+			 E T7V, T7W, T86, T87;
+			 T7V = rio[WS(vs, 6) + WS(rs, 7)];
+			 T7W = rio[WS(vs, 6) + WS(rs, 3)];
+			 T7X = T7V + T7W;
+			 T85 = T7V - T7W;
+			 T86 = iio[WS(vs, 6) + WS(rs, 7)];
+			 T87 = iio[WS(vs, 6) + WS(rs, 3)];
+			 T88 = T86 - T87;
+			 T8Q = T86 + T87;
+		    }
+		    T7Y = T7U + T7X;
+		    T8R = T8P - T8Q;
+		    T91 = T8P + T8Q;
+		    T89 = T85 - T88;
+		    T8e = T8a + T8d;
+		    T8o = T85 + T88;
+		    T8L = T7X - T7U;
+		    T8n = T8d - T8a;
+	       }
+	       {
+		    E T95, T9A, T9l, Ta0, T98, T9i, T9D, Ta1;
+		    {
+			 E T93, T94, T9j, T9k;
+			 T93 = rio[WS(vs, 7)];
+			 T94 = rio[WS(vs, 7) + WS(rs, 4)];
+			 T95 = T93 + T94;
+			 T9A = T93 - T94;
+			 T9j = iio[WS(vs, 7)];
+			 T9k = iio[WS(vs, 7) + WS(rs, 4)];
+			 T9l = T9j - T9k;
+			 Ta0 = T9j + T9k;
+		    }
+		    {
+			 E T96, T97, T9B, T9C;
+			 T96 = rio[WS(vs, 7) + WS(rs, 2)];
+			 T97 = rio[WS(vs, 7) + WS(rs, 6)];
+			 T98 = T96 + T97;
+			 T9i = T96 - T97;
+			 T9B = iio[WS(vs, 7) + WS(rs, 2)];
+			 T9C = iio[WS(vs, 7) + WS(rs, 6)];
+			 T9D = T9B - T9C;
+			 Ta1 = T9B + T9C;
+		    }
+		    T99 = T95 + T98;
+		    Ta6 = T95 - T98;
+		    Tai = Ta0 + Ta1;
+		    T9m = T9i + T9l;
+		    T9E = T9A - T9D;
+		    T9S = T9l - T9i;
+		    Ta2 = Ta0 - Ta1;
+		    T9O = T9A + T9D;
+	       }
+	       {
+		    E T2D, T38, T2T, T3y, T2G, T2Q, T3b, T3z;
+		    {
+			 E T2B, T2C, T2R, T2S;
+			 T2B = rio[WS(vs, 2)];
+			 T2C = rio[WS(vs, 2) + WS(rs, 4)];
+			 T2D = T2B + T2C;
+			 T38 = T2B - T2C;
+			 T2R = iio[WS(vs, 2)];
+			 T2S = iio[WS(vs, 2) + WS(rs, 4)];
+			 T2T = T2R - T2S;
+			 T3y = T2R + T2S;
+		    }
+		    {
+			 E T2E, T2F, T39, T3a;
+			 T2E = rio[WS(vs, 2) + WS(rs, 2)];
+			 T2F = rio[WS(vs, 2) + WS(rs, 6)];
+			 T2G = T2E + T2F;
+			 T2Q = T2E - T2F;
+			 T39 = iio[WS(vs, 2) + WS(rs, 2)];
+			 T3a = iio[WS(vs, 2) + WS(rs, 6)];
+			 T3b = T39 - T3a;
+			 T3z = T39 + T3a;
+		    }
+		    T2H = T2D + T2G;
+		    T3E = T2D - T2G;
+		    T3Q = T3y + T3z;
+		    T2U = T2Q + T2T;
+		    T3c = T38 - T3b;
+		    T3q = T2T - T2Q;
+		    T3A = T3y - T3z;
+		    T3m = T38 + T3b;
+	       }
+	       {
+		    E T42, T4i, T4l, T4X, T45, T4d, T4g, T4Y;
+		    {
+			 E T40, T41, T4j, T4k;
+			 T40 = rio[WS(vs, 3) + WS(rs, 1)];
+			 T41 = rio[WS(vs, 3) + WS(rs, 5)];
+			 T42 = T40 + T41;
+			 T4i = T40 - T41;
+			 T4j = iio[WS(vs, 3) + WS(rs, 1)];
+			 T4k = iio[WS(vs, 3) + WS(rs, 5)];
+			 T4l = T4j - T4k;
+			 T4X = T4j + T4k;
+		    }
+		    {
+			 E T43, T44, T4e, T4f;
+			 T43 = rio[WS(vs, 3) + WS(rs, 7)];
+			 T44 = rio[WS(vs, 3) + WS(rs, 3)];
+			 T45 = T43 + T44;
+			 T4d = T43 - T44;
+			 T4e = iio[WS(vs, 3) + WS(rs, 7)];
+			 T4f = iio[WS(vs, 3) + WS(rs, 3)];
+			 T4g = T4e - T4f;
+			 T4Y = T4e + T4f;
+		    }
+		    T46 = T42 + T45;
+		    T4Z = T4X - T4Y;
+		    T59 = T4X + T4Y;
+		    T4h = T4d - T4g;
+		    T4m = T4i + T4l;
+		    T4w = T4d + T4g;
+		    T4T = T45 - T42;
+		    T4v = T4l - T4i;
+	       }
+	       {
+		    E T5d, T5I, T5t, T68, T5g, T5q, T5L, T69;
+		    {
+			 E T5b, T5c, T5r, T5s;
+			 T5b = rio[WS(vs, 4)];
+			 T5c = rio[WS(vs, 4) + WS(rs, 4)];
+			 T5d = T5b + T5c;
+			 T5I = T5b - T5c;
+			 T5r = iio[WS(vs, 4)];
+			 T5s = iio[WS(vs, 4) + WS(rs, 4)];
+			 T5t = T5r - T5s;
+			 T68 = T5r + T5s;
+		    }
+		    {
+			 E T5e, T5f, T5J, T5K;
+			 T5e = rio[WS(vs, 4) + WS(rs, 2)];
+			 T5f = rio[WS(vs, 4) + WS(rs, 6)];
+			 T5g = T5e + T5f;
+			 T5q = T5e - T5f;
+			 T5J = iio[WS(vs, 4) + WS(rs, 2)];
+			 T5K = iio[WS(vs, 4) + WS(rs, 6)];
+			 T5L = T5J - T5K;
+			 T69 = T5J + T5K;
+		    }
+		    T5h = T5d + T5g;
+		    T6e = T5d - T5g;
+		    T6q = T68 + T69;
+		    T5u = T5q + T5t;
+		    T5M = T5I - T5L;
+		    T60 = T5t - T5q;
+		    T6a = T68 - T69;
+		    T5W = T5I + T5L;
+	       }
+	       {
+		    E T6C, T6S, T6V, T7x, T6F, T6N, T6Q, T7y;
+		    {
+			 E T6A, T6B, T6T, T6U;
+			 T6A = rio[WS(vs, 5) + WS(rs, 1)];
+			 T6B = rio[WS(vs, 5) + WS(rs, 5)];
+			 T6C = T6A + T6B;
+			 T6S = T6A - T6B;
+			 T6T = iio[WS(vs, 5) + WS(rs, 1)];
+			 T6U = iio[WS(vs, 5) + WS(rs, 5)];
+			 T6V = T6T - T6U;
+			 T7x = T6T + T6U;
+		    }
+		    {
+			 E T6D, T6E, T6O, T6P;
+			 T6D = rio[WS(vs, 5) + WS(rs, 7)];
+			 T6E = rio[WS(vs, 5) + WS(rs, 3)];
+			 T6F = T6D + T6E;
+			 T6N = T6D - T6E;
+			 T6O = iio[WS(vs, 5) + WS(rs, 7)];
+			 T6P = iio[WS(vs, 5) + WS(rs, 3)];
+			 T6Q = T6O - T6P;
+			 T7y = T6O + T6P;
+		    }
+		    T6G = T6C + T6F;
+		    T7z = T7x - T7y;
+		    T7J = T7x + T7y;
+		    T6R = T6N - T6Q;
+		    T6W = T6S + T6V;
+		    T76 = T6N + T6Q;
+		    T7t = T6F - T6C;
+		    T75 = T6V - T6S;
+	       }
+	       {
+		    E T2K, T30, T33, T3F, T2N, T2V, T2Y, T3G;
+		    {
+			 E T2I, T2J, T31, T32;
+			 T2I = rio[WS(vs, 2) + WS(rs, 1)];
+			 T2J = rio[WS(vs, 2) + WS(rs, 5)];
+			 T2K = T2I + T2J;
+			 T30 = T2I - T2J;
+			 T31 = iio[WS(vs, 2) + WS(rs, 1)];
+			 T32 = iio[WS(vs, 2) + WS(rs, 5)];
+			 T33 = T31 - T32;
+			 T3F = T31 + T32;
+		    }
+		    {
+			 E T2L, T2M, T2W, T2X;
+			 T2L = rio[WS(vs, 2) + WS(rs, 7)];
+			 T2M = rio[WS(vs, 2) + WS(rs, 3)];
+			 T2N = T2L + T2M;
+			 T2V = T2L - T2M;
+			 T2W = iio[WS(vs, 2) + WS(rs, 7)];
+			 T2X = iio[WS(vs, 2) + WS(rs, 3)];
+			 T2Y = T2W - T2X;
+			 T3G = T2W + T2X;
+		    }
+		    T2O = T2K + T2N;
+		    T3H = T3F - T3G;
+		    T3R = T3F + T3G;
+		    T2Z = T2V - T2Y;
+		    T34 = T30 + T33;
+		    T3e = T2V + T2Y;
+		    T3B = T2N - T2K;
+		    T3d = T33 - T30;
+	       }
+	       {
+		    E T3V, T4q, T4b, T4Q, T3Y, T48, T4t, T4R;
+		    {
+			 E T3T, T3U, T49, T4a;
+			 T3T = rio[WS(vs, 3)];
+			 T3U = rio[WS(vs, 3) + WS(rs, 4)];
+			 T3V = T3T + T3U;
+			 T4q = T3T - T3U;
+			 T49 = iio[WS(vs, 3)];
+			 T4a = iio[WS(vs, 3) + WS(rs, 4)];
+			 T4b = T49 - T4a;
+			 T4Q = T49 + T4a;
+		    }
+		    {
+			 E T3W, T3X, T4r, T4s;
+			 T3W = rio[WS(vs, 3) + WS(rs, 2)];
+			 T3X = rio[WS(vs, 3) + WS(rs, 6)];
+			 T3Y = T3W + T3X;
+			 T48 = T3W - T3X;
+			 T4r = iio[WS(vs, 3) + WS(rs, 2)];
+			 T4s = iio[WS(vs, 3) + WS(rs, 6)];
+			 T4t = T4r - T4s;
+			 T4R = T4r + T4s;
+		    }
+		    T3Z = T3V + T3Y;
+		    T4W = T3V - T3Y;
+		    T58 = T4Q + T4R;
+		    T4c = T48 + T4b;
+		    T4u = T4q - T4t;
+		    T4I = T4b - T48;
+		    T4S = T4Q - T4R;
+		    T4E = T4q + T4t;
+	       }
+	       {
+		    E T5k, T5A, T5D, T6f, T5n, T5v, T5y, T6g;
+		    {
+			 E T5i, T5j, T5B, T5C;
+			 T5i = rio[WS(vs, 4) + WS(rs, 1)];
+			 T5j = rio[WS(vs, 4) + WS(rs, 5)];
+			 T5k = T5i + T5j;
+			 T5A = T5i - T5j;
+			 T5B = iio[WS(vs, 4) + WS(rs, 1)];
+			 T5C = iio[WS(vs, 4) + WS(rs, 5)];
+			 T5D = T5B - T5C;
+			 T6f = T5B + T5C;
+		    }
+		    {
+			 E T5l, T5m, T5w, T5x;
+			 T5l = rio[WS(vs, 4) + WS(rs, 7)];
+			 T5m = rio[WS(vs, 4) + WS(rs, 3)];
+			 T5n = T5l + T5m;
+			 T5v = T5l - T5m;
+			 T5w = iio[WS(vs, 4) + WS(rs, 7)];
+			 T5x = iio[WS(vs, 4) + WS(rs, 3)];
+			 T5y = T5w - T5x;
+			 T6g = T5w + T5x;
+		    }
+		    T5o = T5k + T5n;
+		    T6h = T6f - T6g;
+		    T6r = T6f + T6g;
+		    T5z = T5v - T5y;
+		    T5E = T5A + T5D;
+		    T5O = T5v + T5y;
+		    T6b = T5n - T5k;
+		    T5N = T5D - T5A;
+	       }
+	       {
+		    E T6v, T70, T6L, T7q, T6y, T6I, T73, T7r;
+		    {
+			 E T6t, T6u, T6J, T6K;
+			 T6t = rio[WS(vs, 5)];
+			 T6u = rio[WS(vs, 5) + WS(rs, 4)];
+			 T6v = T6t + T6u;
+			 T70 = T6t - T6u;
+			 T6J = iio[WS(vs, 5)];
+			 T6K = iio[WS(vs, 5) + WS(rs, 4)];
+			 T6L = T6J - T6K;
+			 T7q = T6J + T6K;
+		    }
+		    {
+			 E T6w, T6x, T71, T72;
+			 T6w = rio[WS(vs, 5) + WS(rs, 2)];
+			 T6x = rio[WS(vs, 5) + WS(rs, 6)];
+			 T6y = T6w + T6x;
+			 T6I = T6w - T6x;
+			 T71 = iio[WS(vs, 5) + WS(rs, 2)];
+			 T72 = iio[WS(vs, 5) + WS(rs, 6)];
+			 T73 = T71 - T72;
+			 T7r = T71 + T72;
+		    }
+		    T6z = T6v + T6y;
+		    T7w = T6v - T6y;
+		    T7I = T7q + T7r;
+		    T6M = T6I + T6L;
+		    T74 = T70 - T73;
+		    T7i = T6L - T6I;
+		    T7s = T7q - T7r;
+		    T7e = T70 + T73;
+	       }
+	       rio[0] = T7 + Te;
+	       iio[0] = T1g + T1h;
+	       rio[WS(rs, 1)] = T1p + T1w;
+	       iio[WS(rs, 1)] = T2y + T2z;
+	       rio[WS(rs, 3)] = T3Z + T46;
+	       rio[WS(rs, 2)] = T2H + T2O;
+	       iio[WS(rs, 2)] = T3Q + T3R;
+	       iio[WS(rs, 3)] = T58 + T59;
+	       rio[WS(rs, 6)] = T7R + T7Y;
+	       iio[WS(rs, 6)] = T90 + T91;
+	       iio[WS(rs, 5)] = T7I + T7J;
+	       rio[WS(rs, 5)] = T6z + T6G;
+	       iio[WS(rs, 4)] = T6q + T6r;
+	       rio[WS(rs, 4)] = T5h + T5o;
+	       rio[WS(rs, 7)] = T99 + T9g;
+	       iio[WS(rs, 7)] = Tai + Taj;
+	       {
+		    E T12, T18, TX, T13;
+		    T12 = T10 - T11;
+		    T18 = T14 - T17;
+		    TX = W[10];
+		    T13 = W[11];
+		    iio[WS(vs, 6)] = FNMS(T13, T18, TX * T12);
+		    rio[WS(vs, 6)] = FMA(T13, T12, TX * T18);
+	       }
+	       {
+		    E Tag, Tak, Taf, Tah;
+		    Tag = T99 - T9g;
+		    Tak = Tai - Taj;
+		    Taf = W[6];
+		    Tah = W[7];
+		    rio[WS(vs, 4) + WS(rs, 7)] = FMA(Taf, Tag, Tah * Tak);
+		    iio[WS(vs, 4) + WS(rs, 7)] = FNMS(Tah, Tag, Taf * Tak);
+	       }
+	       {
+		    E T8M, T8S, T8H, T8N;
+		    T8M = T8K - T8L;
+		    T8S = T8O - T8R;
+		    T8H = W[10];
+		    T8N = W[11];
+		    iio[WS(vs, 6) + WS(rs, 6)] = FNMS(T8N, T8S, T8H * T8M);
+		    rio[WS(vs, 6) + WS(rs, 6)] = FMA(T8N, T8M, T8H * T8S);
+	       }
+	       {
+		    E T2k, T2q, T2f, T2l;
+		    T2k = T2i - T2j;
+		    T2q = T2m - T2p;
+		    T2f = W[10];
+		    T2l = W[11];
+		    iio[WS(vs, 6) + WS(rs, 1)] = FNMS(T2l, T2q, T2f * T2k);
+		    rio[WS(vs, 6) + WS(rs, 1)] = FMA(T2l, T2k, T2f * T2q);
+	       }
+	       {
+		    E Ta4, Taa, T9Z, Ta5;
+		    Ta4 = Ta2 - Ta3;
+		    Taa = Ta6 - Ta9;
+		    T9Z = W[10];
+		    Ta5 = W[11];
+		    iio[WS(vs, 6) + WS(rs, 7)] = FNMS(Ta5, Taa, T9Z * Ta4);
+		    rio[WS(vs, 6) + WS(rs, 7)] = FMA(Ta5, Ta4, T9Z * Taa);
+	       }
+	       {
+		    E T8Y, T92, T8X, T8Z;
+		    T8Y = T7R - T7Y;
+		    T92 = T90 - T91;
+		    T8X = W[6];
+		    T8Z = W[7];
+		    rio[WS(vs, 4) + WS(rs, 6)] = FMA(T8X, T8Y, T8Z * T92);
+		    iio[WS(vs, 4) + WS(rs, 6)] = FNMS(T8Z, T8Y, T8X * T92);
+	       }
+	       {
+		    E T2w, T2A, T2v, T2x;
+		    T2w = T1p - T1w;
+		    T2A = T2y - T2z;
+		    T2v = W[6];
+		    T2x = W[7];
+		    rio[WS(vs, 4) + WS(rs, 1)] = FMA(T2v, T2w, T2x * T2A);
+		    iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T2x, T2w, T2v * T2A);
+	       }
+	       {
+		    E Tac, Tae, Tab, Tad;
+		    Tac = Ta3 + Ta2;
+		    Tae = Ta6 + Ta9;
+		    Tab = W[2];
+		    Tad = W[3];
+		    iio[WS(vs, 2) + WS(rs, 7)] = FNMS(Tad, Tae, Tab * Tac);
+		    rio[WS(vs, 2) + WS(rs, 7)] = FMA(Tad, Tac, Tab * Tae);
+	       }
+	       {
+		    E T8U, T8W, T8T, T8V;
+		    T8U = T8L + T8K;
+		    T8W = T8O + T8R;
+		    T8T = W[2];
+		    T8V = W[3];
+		    iio[WS(vs, 2) + WS(rs, 6)] = FNMS(T8V, T8W, T8T * T8U);
+		    rio[WS(vs, 2) + WS(rs, 6)] = FMA(T8V, T8U, T8T * T8W);
+	       }
+	       {
+		    E T1a, T1c, T19, T1b;
+		    T1a = T11 + T10;
+		    T1c = T14 + T17;
+		    T19 = W[2];
+		    T1b = W[3];
+		    iio[WS(vs, 2)] = FNMS(T1b, T1c, T19 * T1a);
+		    rio[WS(vs, 2)] = FMA(T1b, T1a, T19 * T1c);
+	       }
+	       {
+		    E T1e, T1i, T1d, T1f;
+		    T1e = T7 - Te;
+		    T1i = T1g - T1h;
+		    T1d = W[6];
+		    T1f = W[7];
+		    rio[WS(vs, 4)] = FMA(T1d, T1e, T1f * T1i);
+		    iio[WS(vs, 4)] = FNMS(T1f, T1e, T1d * T1i);
+	       }
+	       {
+		    E T2s, T2u, T2r, T2t;
+		    T2s = T2j + T2i;
+		    T2u = T2m + T2p;
+		    T2r = W[2];
+		    T2t = W[3];
+		    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T2t, T2u, T2r * T2s);
+		    rio[WS(vs, 2) + WS(rs, 1)] = FMA(T2t, T2s, T2r * T2u);
+	       }
+	       {
+		    E T3C, T3I, T3x, T3D;
+		    T3C = T3A - T3B;
+		    T3I = T3E - T3H;
+		    T3x = W[10];
+		    T3D = W[11];
+		    iio[WS(vs, 6) + WS(rs, 2)] = FNMS(T3D, T3I, T3x * T3C);
+		    rio[WS(vs, 6) + WS(rs, 2)] = FMA(T3D, T3C, T3x * T3I);
+	       }
+	       {
+		    E T4U, T50, T4P, T4V;
+		    T4U = T4S - T4T;
+		    T50 = T4W - T4Z;
+		    T4P = W[10];
+		    T4V = W[11];
+		    iio[WS(vs, 6) + WS(rs, 3)] = FNMS(T4V, T50, T4P * T4U);
+		    rio[WS(vs, 6) + WS(rs, 3)] = FMA(T4V, T4U, T4P * T50);
+	       }
+	       {
+		    E T56, T5a, T55, T57;
+		    T56 = T3Z - T46;
+		    T5a = T58 - T59;
+		    T55 = W[6];
+		    T57 = W[7];
+		    rio[WS(vs, 4) + WS(rs, 3)] = FMA(T55, T56, T57 * T5a);
+		    iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T57, T56, T55 * T5a);
+	       }
+	       {
+		    E T6o, T6s, T6n, T6p;
+		    T6o = T5h - T5o;
+		    T6s = T6q - T6r;
+		    T6n = W[6];
+		    T6p = W[7];
+		    rio[WS(vs, 4) + WS(rs, 4)] = FMA(T6n, T6o, T6p * T6s);
+		    iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T6p, T6o, T6n * T6s);
+	       }
+	       {
+		    E T7u, T7A, T7p, T7v;
+		    T7u = T7s - T7t;
+		    T7A = T7w - T7z;
+		    T7p = W[10];
+		    T7v = W[11];
+		    iio[WS(vs, 6) + WS(rs, 5)] = FNMS(T7v, T7A, T7p * T7u);
+		    rio[WS(vs, 6) + WS(rs, 5)] = FMA(T7v, T7u, T7p * T7A);
+	       }
+	       {
+		    E T6c, T6i, T67, T6d;
+		    T6c = T6a - T6b;
+		    T6i = T6e - T6h;
+		    T67 = W[10];
+		    T6d = W[11];
+		    iio[WS(vs, 6) + WS(rs, 4)] = FNMS(T6d, T6i, T67 * T6c);
+		    rio[WS(vs, 6) + WS(rs, 4)] = FMA(T6d, T6c, T67 * T6i);
+	       }
+	       {
+		    E T7G, T7K, T7F, T7H;
+		    T7G = T6z - T6G;
+		    T7K = T7I - T7J;
+		    T7F = W[6];
+		    T7H = W[7];
+		    rio[WS(vs, 4) + WS(rs, 5)] = FMA(T7F, T7G, T7H * T7K);
+		    iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T7H, T7G, T7F * T7K);
+	       }
+	       {
+		    E T3O, T3S, T3N, T3P;
+		    T3O = T2H - T2O;
+		    T3S = T3Q - T3R;
+		    T3N = W[6];
+		    T3P = W[7];
+		    rio[WS(vs, 4) + WS(rs, 2)] = FMA(T3N, T3O, T3P * T3S);
+		    iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T3P, T3O, T3N * T3S);
+	       }
+	       {
+		    E T3K, T3M, T3J, T3L;
+		    T3K = T3B + T3A;
+		    T3M = T3E + T3H;
+		    T3J = W[2];
+		    T3L = W[3];
+		    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T3L, T3M, T3J * T3K);
+		    rio[WS(vs, 2) + WS(rs, 2)] = FMA(T3L, T3K, T3J * T3M);
+	       }
+	       {
+		    E T7C, T7E, T7B, T7D;
+		    T7C = T7t + T7s;
+		    T7E = T7w + T7z;
+		    T7B = W[2];
+		    T7D = W[3];
+		    iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T7D, T7E, T7B * T7C);
+		    rio[WS(vs, 2) + WS(rs, 5)] = FMA(T7D, T7C, T7B * T7E);
+	       }
+	       {
+		    E T6k, T6m, T6j, T6l;
+		    T6k = T6b + T6a;
+		    T6m = T6e + T6h;
+		    T6j = W[2];
+		    T6l = W[3];
+		    iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T6l, T6m, T6j * T6k);
+		    rio[WS(vs, 2) + WS(rs, 4)] = FMA(T6l, T6k, T6j * T6m);
+	       }
+	       {
+		    E T52, T54, T51, T53;
+		    T52 = T4T + T4S;
+		    T54 = T4W + T4Z;
+		    T51 = W[2];
+		    T53 = W[3];
+		    iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T53, T54, T51 * T52);
+		    rio[WS(vs, 2) + WS(rs, 3)] = FMA(T53, T52, T51 * T54);
+	       }
+	       {
+		    E T5G, T5S, T5Q, T5U, T5F, T5P;
+		    T5F = KP707106781 * (T5z - T5E);
+		    T5G = T5u - T5F;
+		    T5S = T5u + T5F;
+		    T5P = KP707106781 * (T5N - T5O);
+		    T5Q = T5M - T5P;
+		    T5U = T5M + T5P;
+		    {
+			 E T5p, T5H, T5R, T5T;
+			 T5p = W[12];
+			 T5H = W[13];
+			 iio[WS(vs, 7) + WS(rs, 4)] = FNMS(T5H, T5Q, T5p * T5G);
+			 rio[WS(vs, 7) + WS(rs, 4)] = FMA(T5H, T5G, T5p * T5Q);
+			 T5R = W[4];
+			 T5T = W[5];
+			 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T5T, T5U, T5R * T5S);
+			 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T5T, T5S, T5R * T5U);
+		    }
+	       }
+	       {
+		    E Tw, TI, TG, TK, Tv, TF;
+		    Tv = KP707106781 * (Tp - Tu);
+		    Tw = Tk - Tv;
+		    TI = Tk + Tv;
+		    TF = KP707106781 * (TD - TE);
+		    TG = TC - TF;
+		    TK = TC + TF;
+		    {
+			 E Tf, Tx, TH, TJ;
+			 Tf = W[12];
+			 Tx = W[13];
+			 iio[WS(vs, 7)] = FNMS(Tx, TG, Tf * Tw);
+			 rio[WS(vs, 7)] = FMA(Tx, Tw, Tf * TG);
+			 TH = W[4];
+			 TJ = W[5];
+			 iio[WS(vs, 3)] = FNMS(TJ, TK, TH * TI);
+			 rio[WS(vs, 3)] = FMA(TJ, TI, TH * TK);
+		    }
+	       }
+	       {
+		    E T9Q, T9W, T9U, T9Y, T9P, T9T;
+		    T9P = KP707106781 * (T9w + T9r);
+		    T9Q = T9O - T9P;
+		    T9W = T9O + T9P;
+		    T9T = KP707106781 * (T9F + T9G);
+		    T9U = T9S - T9T;
+		    T9Y = T9S + T9T;
+		    {
+			 E T9N, T9R, T9V, T9X;
+			 T9N = W[8];
+			 T9R = W[9];
+			 rio[WS(vs, 5) + WS(rs, 7)] = FMA(T9N, T9Q, T9R * T9U);
+			 iio[WS(vs, 5) + WS(rs, 7)] = FNMS(T9R, T9Q, T9N * T9U);
+			 T9V = W[0];
+			 T9X = W[1];
+			 rio[WS(vs, 1) + WS(rs, 7)] = FMA(T9V, T9W, T9X * T9Y);
+			 iio[WS(vs, 1) + WS(rs, 7)] = FNMS(T9X, T9W, T9V * T9Y);
+		    }
+	       }
+	       {
+		    E T36, T3i, T3g, T3k, T35, T3f;
+		    T35 = KP707106781 * (T2Z - T34);
+		    T36 = T2U - T35;
+		    T3i = T2U + T35;
+		    T3f = KP707106781 * (T3d - T3e);
+		    T3g = T3c - T3f;
+		    T3k = T3c + T3f;
+		    {
+			 E T2P, T37, T3h, T3j;
+			 T2P = W[12];
+			 T37 = W[13];
+			 iio[WS(vs, 7) + WS(rs, 2)] = FNMS(T37, T3g, T2P * T36);
+			 rio[WS(vs, 7) + WS(rs, 2)] = FMA(T37, T36, T2P * T3g);
+			 T3h = W[4];
+			 T3j = W[5];
+			 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T3j, T3k, T3h * T3i);
+			 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T3j, T3i, T3h * T3k);
+		    }
+	       }
+	       {
+		    E T5Y, T64, T62, T66, T5X, T61;
+		    T5X = KP707106781 * (T5E + T5z);
+		    T5Y = T5W - T5X;
+		    T64 = T5W + T5X;
+		    T61 = KP707106781 * (T5N + T5O);
+		    T62 = T60 - T61;
+		    T66 = T60 + T61;
+		    {
+			 E T5V, T5Z, T63, T65;
+			 T5V = W[8];
+			 T5Z = W[9];
+			 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T5V, T5Y, T5Z * T62);
+			 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T5Z, T5Y, T5V * T62);
+			 T63 = W[0];
+			 T65 = W[1];
+			 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T63, T64, T65 * T66);
+			 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T65, T64, T63 * T66);
+		    }
+	       }
+	       {
+		    E T7g, T7m, T7k, T7o, T7f, T7j;
+		    T7f = KP707106781 * (T6W + T6R);
+		    T7g = T7e - T7f;
+		    T7m = T7e + T7f;
+		    T7j = KP707106781 * (T75 + T76);
+		    T7k = T7i - T7j;
+		    T7o = T7i + T7j;
+		    {
+			 E T7d, T7h, T7l, T7n;
+			 T7d = W[8];
+			 T7h = W[9];
+			 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T7d, T7g, T7h * T7k);
+			 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T7h, T7g, T7d * T7k);
+			 T7l = W[0];
+			 T7n = W[1];
+			 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T7l, T7m, T7n * T7o);
+			 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T7n, T7m, T7l * T7o);
+		    }
+	       }
+	       {
+		    E T8g, T8s, T8q, T8u, T8f, T8p;
+		    T8f = KP707106781 * (T89 - T8e);
+		    T8g = T84 - T8f;
+		    T8s = T84 + T8f;
+		    T8p = KP707106781 * (T8n - T8o);
+		    T8q = T8m - T8p;
+		    T8u = T8m + T8p;
+		    {
+			 E T7Z, T8h, T8r, T8t;
+			 T7Z = W[12];
+			 T8h = W[13];
+			 iio[WS(vs, 7) + WS(rs, 6)] = FNMS(T8h, T8q, T7Z * T8g);
+			 rio[WS(vs, 7) + WS(rs, 6)] = FMA(T8h, T8g, T7Z * T8q);
+			 T8r = W[4];
+			 T8t = W[5];
+			 iio[WS(vs, 3) + WS(rs, 6)] = FNMS(T8t, T8u, T8r * T8s);
+			 rio[WS(vs, 3) + WS(rs, 6)] = FMA(T8t, T8s, T8r * T8u);
+		    }
+	       }
+	       {
+		    E T4G, T4M, T4K, T4O, T4F, T4J;
+		    T4F = KP707106781 * (T4m + T4h);
+		    T4G = T4E - T4F;
+		    T4M = T4E + T4F;
+		    T4J = KP707106781 * (T4v + T4w);
+		    T4K = T4I - T4J;
+		    T4O = T4I + T4J;
+		    {
+			 E T4D, T4H, T4L, T4N;
+			 T4D = W[8];
+			 T4H = W[9];
+			 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T4D, T4G, T4H * T4K);
+			 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T4H, T4G, T4D * T4K);
+			 T4L = W[0];
+			 T4N = W[1];
+			 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T4L, T4M, T4N * T4O);
+			 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T4N, T4M, T4L * T4O);
+		    }
+	       }
+	       {
+		    E TO, TU, TS, TW, TN, TR;
+		    TN = KP707106781 * (Tu + Tp);
+		    TO = TM - TN;
+		    TU = TM + TN;
+		    TR = KP707106781 * (TD + TE);
+		    TS = TQ - TR;
+		    TW = TQ + TR;
+		    {
+			 E TL, TP, TT, TV;
+			 TL = W[8];
+			 TP = W[9];
+			 rio[WS(vs, 5)] = FMA(TL, TO, TP * TS);
+			 iio[WS(vs, 5)] = FNMS(TP, TO, TL * TS);
+			 TT = W[0];
+			 TV = W[1];
+			 rio[WS(vs, 1)] = FMA(TT, TU, TV * TW);
+			 iio[WS(vs, 1)] = FNMS(TV, TU, TT * TW);
+		    }
+	       }
+	       {
+		    E T26, T2c, T2a, T2e, T25, T29;
+		    T25 = KP707106781 * (T1M + T1H);
+		    T26 = T24 - T25;
+		    T2c = T24 + T25;
+		    T29 = KP707106781 * (T1V + T1W);
+		    T2a = T28 - T29;
+		    T2e = T28 + T29;
+		    {
+			 E T23, T27, T2b, T2d;
+			 T23 = W[8];
+			 T27 = W[9];
+			 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T23, T26, T27 * T2a);
+			 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T27, T26, T23 * T2a);
+			 T2b = W[0];
+			 T2d = W[1];
+			 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T2b, T2c, T2d * T2e);
+			 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T2d, T2c, T2b * T2e);
+		    }
+	       }
+	       {
+		    E T9y, T9K, T9I, T9M, T9x, T9H;
+		    T9x = KP707106781 * (T9r - T9w);
+		    T9y = T9m - T9x;
+		    T9K = T9m + T9x;
+		    T9H = KP707106781 * (T9F - T9G);
+		    T9I = T9E - T9H;
+		    T9M = T9E + T9H;
+		    {
+			 E T9h, T9z, T9J, T9L;
+			 T9h = W[12];
+			 T9z = W[13];
+			 iio[WS(vs, 7) + WS(rs, 7)] = FNMS(T9z, T9I, T9h * T9y);
+			 rio[WS(vs, 7) + WS(rs, 7)] = FMA(T9z, T9y, T9h * T9I);
+			 T9J = W[4];
+			 T9L = W[5];
+			 iio[WS(vs, 3) + WS(rs, 7)] = FNMS(T9L, T9M, T9J * T9K);
+			 rio[WS(vs, 3) + WS(rs, 7)] = FMA(T9L, T9K, T9J * T9M);
+		    }
+	       }
+	       {
+		    E T6Y, T7a, T78, T7c, T6X, T77;
+		    T6X = KP707106781 * (T6R - T6W);
+		    T6Y = T6M - T6X;
+		    T7a = T6M + T6X;
+		    T77 = KP707106781 * (T75 - T76);
+		    T78 = T74 - T77;
+		    T7c = T74 + T77;
+		    {
+			 E T6H, T6Z, T79, T7b;
+			 T6H = W[12];
+			 T6Z = W[13];
+			 iio[WS(vs, 7) + WS(rs, 5)] = FNMS(T6Z, T78, T6H * T6Y);
+			 rio[WS(vs, 7) + WS(rs, 5)] = FMA(T6Z, T6Y, T6H * T78);
+			 T79 = W[4];
+			 T7b = W[5];
+			 iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T7b, T7c, T79 * T7a);
+			 rio[WS(vs, 3) + WS(rs, 5)] = FMA(T7b, T7a, T79 * T7c);
+		    }
+	       }
+	       {
+		    E T1O, T20, T1Y, T22, T1N, T1X;
+		    T1N = KP707106781 * (T1H - T1M);
+		    T1O = T1C - T1N;
+		    T20 = T1C + T1N;
+		    T1X = KP707106781 * (T1V - T1W);
+		    T1Y = T1U - T1X;
+		    T22 = T1U + T1X;
+		    {
+			 E T1x, T1P, T1Z, T21;
+			 T1x = W[12];
+			 T1P = W[13];
+			 iio[WS(vs, 7) + WS(rs, 1)] = FNMS(T1P, T1Y, T1x * T1O);
+			 rio[WS(vs, 7) + WS(rs, 1)] = FMA(T1P, T1O, T1x * T1Y);
+			 T1Z = W[4];
+			 T21 = W[5];
+			 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T21, T22, T1Z * T20);
+			 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T21, T20, T1Z * T22);
+		    }
+	       }
+	       {
+		    E T4o, T4A, T4y, T4C, T4n, T4x;
+		    T4n = KP707106781 * (T4h - T4m);
+		    T4o = T4c - T4n;
+		    T4A = T4c + T4n;
+		    T4x = KP707106781 * (T4v - T4w);
+		    T4y = T4u - T4x;
+		    T4C = T4u + T4x;
+		    {
+			 E T47, T4p, T4z, T4B;
+			 T47 = W[12];
+			 T4p = W[13];
+			 iio[WS(vs, 7) + WS(rs, 3)] = FNMS(T4p, T4y, T47 * T4o);
+			 rio[WS(vs, 7) + WS(rs, 3)] = FMA(T4p, T4o, T47 * T4y);
+			 T4z = W[4];
+			 T4B = W[5];
+			 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T4B, T4C, T4z * T4A);
+			 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T4B, T4A, T4z * T4C);
+		    }
+	       }
+	       {
+		    E T3o, T3u, T3s, T3w, T3n, T3r;
+		    T3n = KP707106781 * (T34 + T2Z);
+		    T3o = T3m - T3n;
+		    T3u = T3m + T3n;
+		    T3r = KP707106781 * (T3d + T3e);
+		    T3s = T3q - T3r;
+		    T3w = T3q + T3r;
+		    {
+			 E T3l, T3p, T3t, T3v;
+			 T3l = W[8];
+			 T3p = W[9];
+			 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T3l, T3o, T3p * T3s);
+			 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T3p, T3o, T3l * T3s);
+			 T3t = W[0];
+			 T3v = W[1];
+			 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T3t, T3u, T3v * T3w);
+			 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T3v, T3u, T3t * T3w);
+		    }
+	       }
+	       {
+		    E T8y, T8E, T8C, T8G, T8x, T8B;
+		    T8x = KP707106781 * (T8e + T89);
+		    T8y = T8w - T8x;
+		    T8E = T8w + T8x;
+		    T8B = KP707106781 * (T8n + T8o);
+		    T8C = T8A - T8B;
+		    T8G = T8A + T8B;
+		    {
+			 E T8v, T8z, T8D, T8F;
+			 T8v = W[8];
+			 T8z = W[9];
+			 rio[WS(vs, 5) + WS(rs, 6)] = FMA(T8v, T8y, T8z * T8C);
+			 iio[WS(vs, 5) + WS(rs, 6)] = FNMS(T8z, T8y, T8v * T8C);
+			 T8D = W[0];
+			 T8F = W[1];
+			 rio[WS(vs, 1) + WS(rs, 6)] = FMA(T8D, T8E, T8F * T8G);
+			 iio[WS(vs, 1) + WS(rs, 6)] = FNMS(T8F, T8E, T8D * T8G);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "q1_8", twinstr, &GENUS, {416, 144, 112, 0}, 0, 0, 0 };
+
+void X(codelet_q1_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include t.h */
+
+/*
+ * This function contains 102 FP additions, 72 FP multiplications,
+ * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
+ * 70 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t.h"
+
+static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T1X, T21, T20, T22;
+	       {
+		    E T23, T1U, T8, T12, T1y, T25, T1P, T1H, T1Y, T18, T10, T2b, T1K, T1O, T15;
+		    E T1Z, T2a, Tz, T24, T1n;
+		    {
+			 E T1, T1T, T3, T6, T2, T5;
+			 T1 = ri[0];
+			 T1T = ii[0];
+			 T3 = ri[WS(rs, 5)];
+			 T6 = ii[WS(rs, 5)];
+			 T2 = W[8];
+			 T5 = W[9];
+			 {
+			      E T1w, TY, T1s, T1F, TM, T16, T1u, TS;
+			      {
+				   E TF, T1p, TO, TR, T1r, TL, TN, TQ, T1t, TP;
+				   {
+					E TU, TX, TT, TW;
+					{
+					     E TB, TE, T1R, T4, TA, TD;
+					     TB = ri[WS(rs, 4)];
+					     TE = ii[WS(rs, 4)];
+					     T1R = T2 * T6;
+					     T4 = T2 * T3;
+					     TA = W[6];
+					     TD = W[7];
+					     {
+						  E T1S, T7, T1o, TC;
+						  T1S = FNMS(T5, T3, T1R);
+						  T7 = FMA(T5, T6, T4);
+						  T1o = TA * TE;
+						  TC = TA * TB;
+						  T23 = T1T - T1S;
+						  T1U = T1S + T1T;
+						  T8 = T1 - T7;
+						  T12 = T1 + T7;
+						  TF = FMA(TD, TE, TC);
+						  T1p = FNMS(TD, TB, T1o);
+					     }
+					}
+					TU = ri[WS(rs, 1)];
+					TX = ii[WS(rs, 1)];
+					TT = W[0];
+					TW = W[1];
+					{
+					     E TH, TK, TJ, T1q, TI, T1v, TV, TG;
+					     TH = ri[WS(rs, 9)];
+					     TK = ii[WS(rs, 9)];
+					     T1v = TT * TX;
+					     TV = TT * TU;
+					     TG = W[16];
+					     TJ = W[17];
+					     T1w = FNMS(TW, TU, T1v);
+					     TY = FMA(TW, TX, TV);
+					     T1q = TG * TK;
+					     TI = TG * TH;
+					     TO = ri[WS(rs, 6)];
+					     TR = ii[WS(rs, 6)];
+					     T1r = FNMS(TJ, TH, T1q);
+					     TL = FMA(TJ, TK, TI);
+					     TN = W[10];
+					     TQ = W[11];
+					}
+				   }
+				   T1s = T1p - T1r;
+				   T1F = T1p + T1r;
+				   TM = TF - TL;
+				   T16 = TF + TL;
+				   T1t = TN * TR;
+				   TP = TN * TO;
+				   T1u = FNMS(TQ, TO, T1t);
+				   TS = FMA(TQ, TR, TP);
+			      }
+			      {
+				   E T1e, Te, T1l, Tx, Tn, Tq, Tp, T1g, Tk, T1i, To;
+				   {
+					E Tt, Tw, Tv, T1k, Tu;
+					{
+					     E Ta, Td, T9, Tc, T1d, Tb, Ts;
+					     Ta = ri[WS(rs, 2)];
+					     Td = ii[WS(rs, 2)];
+					     {
+						  E T1G, T1x, TZ, T17;
+						  T1G = T1u + T1w;
+						  T1x = T1u - T1w;
+						  TZ = TS - TY;
+						  T17 = TS + TY;
+						  T1y = T1s - T1x;
+						  T25 = T1s + T1x;
+						  T1P = T1F + T1G;
+						  T1H = T1F - T1G;
+						  T1Y = T16 - T17;
+						  T18 = T16 + T17;
+						  T10 = TM + TZ;
+						  T2b = TM - TZ;
+						  T9 = W[2];
+					     }
+					     Tc = W[3];
+					     Tt = ri[WS(rs, 3)];
+					     Tw = ii[WS(rs, 3)];
+					     T1d = T9 * Td;
+					     Tb = T9 * Ta;
+					     Ts = W[4];
+					     Tv = W[5];
+					     T1e = FNMS(Tc, Ta, T1d);
+					     Te = FMA(Tc, Td, Tb);
+					     T1k = Ts * Tw;
+					     Tu = Ts * Tt;
+					}
+					{
+					     E Tg, Tj, Tf, Ti, T1f, Th, Tm;
+					     Tg = ri[WS(rs, 7)];
+					     Tj = ii[WS(rs, 7)];
+					     T1l = FNMS(Tv, Tt, T1k);
+					     Tx = FMA(Tv, Tw, Tu);
+					     Tf = W[12];
+					     Ti = W[13];
+					     Tn = ri[WS(rs, 8)];
+					     Tq = ii[WS(rs, 8)];
+					     T1f = Tf * Tj;
+					     Th = Tf * Tg;
+					     Tm = W[14];
+					     Tp = W[15];
+					     T1g = FNMS(Ti, Tg, T1f);
+					     Tk = FMA(Ti, Tj, Th);
+					     T1i = Tm * Tq;
+					     To = Tm * Tn;
+					}
+				   }
+				   {
+					E T1h, T1I, Tl, T13, T1j, Tr;
+					T1h = T1e - T1g;
+					T1I = T1e + T1g;
+					Tl = Te - Tk;
+					T13 = Te + Tk;
+					T1j = FNMS(Tp, Tn, T1i);
+					Tr = FMA(Tp, Tq, To);
+					{
+					     E T1m, T1J, T14, Ty;
+					     T1m = T1j - T1l;
+					     T1J = T1j + T1l;
+					     T14 = Tr + Tx;
+					     Ty = Tr - Tx;
+					     T1K = T1I - T1J;
+					     T1O = T1I + T1J;
+					     T15 = T13 + T14;
+					     T1Z = T13 - T14;
+					     T2a = Tl - Ty;
+					     Tz = Tl + Ty;
+					     T24 = T1h + T1m;
+					     T1n = T1h - T1m;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2c, T2e, T29, T2d;
+			 {
+			      E T1b, T11, T26, T28, T27;
+			      T1b = Tz - T10;
+			      T11 = Tz + T10;
+			      T26 = T24 + T25;
+			      T28 = T24 - T25;
+			      {
+				   E T1B, T1z, T1a, T1A, T1c;
+				   T1B = FNMS(KP618033988, T1n, T1y);
+				   T1z = FMA(KP618033988, T1y, T1n);
+				   ri[WS(rs, 5)] = T8 + T11;
+				   T1a = FNMS(KP250000000, T11, T8);
+				   T1A = FNMS(KP559016994, T1b, T1a);
+				   T1c = FMA(KP559016994, T1b, T1a);
+				   T27 = FNMS(KP250000000, T26, T23);
+				   T2c = FMA(KP618033988, T2b, T2a);
+				   T2e = FNMS(KP618033988, T2a, T2b);
+				   ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
+				   ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
+				   ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
+				   ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
+			      }
+			      ii[WS(rs, 5)] = T26 + T23;
+			      T29 = FMA(KP559016994, T28, T27);
+			      T2d = FNMS(KP559016994, T28, T27);
+			 }
+			 {
+			      E T1E, T1M, T1L, T1N, T19, T1D, T1C, T1Q, T1W, T1V;
+			      T19 = T15 + T18;
+			      T1D = T15 - T18;
+			      ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
+			      ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
+			      ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
+			      ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
+			      T1C = FNMS(KP250000000, T19, T12);
+			      ri[0] = T12 + T19;
+			      T1E = FNMS(KP559016994, T1D, T1C);
+			      T1M = FMA(KP559016994, T1D, T1C);
+			      T1L = FNMS(KP618033988, T1K, T1H);
+			      T1N = FMA(KP618033988, T1H, T1K);
+			      T1Q = T1O + T1P;
+			      T1W = T1O - T1P;
+			      ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
+			      ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
+			      ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
+			      ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
+			      T1V = FNMS(KP250000000, T1Q, T1U);
+			      ii[0] = T1Q + T1U;
+			      T1X = FNMS(KP559016994, T1W, T1V);
+			      T21 = FMA(KP559016994, T1W, T1V);
+			      T20 = FNMS(KP618033988, T1Z, T1Y);
+			      T22 = FMA(KP618033988, T1Y, T1Z);
+			 }
+		    }
+	       }
+	       ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
+	       ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
+	       ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
+	       ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {48, 18, 54, 0}, 0, 0, 0 };
+
+void X(codelet_t1_10) (planner *p) {
+     X(kdft_dit_register) (p, t1_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include t.h */
+
+/*
+ * This function contains 102 FP additions, 60 FP multiplications,
+ * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
+ * 45 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t.h"
+
+static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
+	       E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
+	       {
+		    E T1, T1B, T6, T1A;
+		    T1 = ri[0];
+		    T1B = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 5)];
+			 T5 = ii[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T1A = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    T1O = T1B - T1A;
+		    TT = T1 + T6;
+		    T1C = T1A + T1B;
+	       }
+	       {
+		    E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 4)];
+			 Ty = ii[WS(rs, 4)];
+			 Tv = W[6];
+			 Tx = W[7];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T1b = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TM, TO, TL, TN;
+			 TM = ri[WS(rs, 1)];
+			 TO = ii[WS(rs, 1)];
+			 TL = W[0];
+			 TN = W[1];
+			 TP = FMA(TL, TM, TN * TO);
+			 T1f = FNMS(TN, TM, TL * TO);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 9)];
+			 TD = ii[WS(rs, 9)];
+			 TA = W[16];
+			 TC = W[17];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1c = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = ri[WS(rs, 6)];
+			 TJ = ii[WS(rs, 6)];
+			 TG = W[10];
+			 TI = W[11];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T1e = FNMS(TI, TH, TG * TJ);
+		    }
+		    TF = Tz - TE;
+		    TQ = TK - TP;
+		    TR = TF + TQ;
+		    T1o = T1b + T1c;
+		    T1p = T1e + T1f;
+		    T1y = T1o + T1p;
+		    TX = Tz + TE;
+		    TY = TK + TP;
+		    TZ = TX + TY;
+		    T1d = T1b - T1c;
+		    T1g = T1e - T1f;
+		    T1M = T1d + T1g;
+	       }
+	       {
+		    E Tc, T14, Ts, T18, Th, T15, Tn, T17;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 2)];
+			 Tb = ii[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T14 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 3)];
+			 Tr = ii[WS(rs, 3)];
+			 To = W[4];
+			 Tq = W[5];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T18 = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 7)];
+			 Tg = ii[WS(rs, 7)];
+			 Td = W[12];
+			 Tf = W[13];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T15 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = ri[WS(rs, 8)];
+			 Tm = ii[WS(rs, 8)];
+			 Tj = W[14];
+			 Tl = W[15];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T17 = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    Ti = Tc - Th;
+		    Tt = Tn - Ts;
+		    Tu = Ti + Tt;
+		    T1r = T14 + T15;
+		    T1s = T17 + T18;
+		    T1x = T1r + T1s;
+		    TU = Tc + Th;
+		    TV = Tn + Ts;
+		    TW = TU + TV;
+		    T16 = T14 - T15;
+		    T19 = T17 - T18;
+		    T1L = T16 + T19;
+	       }
+	       {
+		    E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
+		    T11 = KP559016994 * (Tu - TR);
+		    TS = Tu + TR;
+		    T12 = FNMS(KP250000000, TS, T7);
+		    T1a = T16 - T19;
+		    T1h = T1d - T1g;
+		    T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
+		    T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
+		    ri[WS(rs, 5)] = T7 + TS;
+		    T1j = T12 - T11;
+		    ri[WS(rs, 7)] = T1j - T1k;
+		    ri[WS(rs, 3)] = T1j + T1k;
+		    T13 = T11 + T12;
+		    ri[WS(rs, 9)] = T13 - T1i;
+		    ri[WS(rs, 1)] = T13 + T1i;
+	       }
+	       {
+		    E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
+		    T1N = KP559016994 * (T1L - T1M);
+		    T1P = T1L + T1M;
+		    T1Q = FNMS(KP250000000, T1P, T1O);
+		    T1S = Ti - Tt;
+		    T1T = TF - TQ;
+		    T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
+		    T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
+		    ii[WS(rs, 5)] = T1P + T1O;
+		    T1V = T1Q - T1N;
+		    ii[WS(rs, 3)] = T1V - T1W;
+		    ii[WS(rs, 7)] = T1W + T1V;
+		    T1R = T1N + T1Q;
+		    ii[WS(rs, 1)] = T1R - T1U;
+		    ii[WS(rs, 9)] = T1U + T1R;
+	       }
+	       {
+		    E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
+		    T1m = KP559016994 * (TW - TZ);
+		    T10 = TW + TZ;
+		    T1l = FNMS(KP250000000, T10, TT);
+		    T1q = T1o - T1p;
+		    T1t = T1r - T1s;
+		    T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
+		    T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
+		    ri[0] = TT + T10;
+		    T1v = T1m + T1l;
+		    ri[WS(rs, 4)] = T1v - T1w;
+		    ri[WS(rs, 6)] = T1v + T1w;
+		    T1n = T1l - T1m;
+		    ri[WS(rs, 2)] = T1n - T1u;
+		    ri[WS(rs, 8)] = T1n + T1u;
+	       }
+	       {
+		    E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
+		    T1H = KP559016994 * (T1x - T1y);
+		    T1z = T1x + T1y;
+		    T1G = FNMS(KP250000000, T1z, T1C);
+		    T1D = TX - TY;
+		    T1E = TU - TV;
+		    T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
+		    T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
+		    ii[0] = T1z + T1C;
+		    T1K = T1H + T1G;
+		    ii[WS(rs, 4)] = T1J + T1K;
+		    ii[WS(rs, 6)] = T1K - T1J;
+		    T1I = T1G - T1H;
+		    ii[WS(rs, 2)] = T1F + T1I;
+		    ii[WS(rs, 8)] = T1I - T1F;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {72, 30, 30, 0}, 0, 0, 0 };
+
+void X(codelet_t1_10) (planner *p) {
+     X(kdft_dit_register) (p, t1_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:50 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include t.h */
+
+/*
+ * This function contains 118 FP additions, 68 FP multiplications,
+ * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
+ * 84 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "t.h"
+
+static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T2B, T2C;
+	       {
+		    E T1, T2i, T2e, Tl, T1Y, T10, T1S, TG, T2f, T1s, T2r, Ty, T1Z, T1H, T21;
+		    E T1d, TI, TL, T2h, T1l, T2o, Te, TJ, T1w, TO, TR, TN, TK, TQ;
+		    {
+			 E TW, TZ, TY, T1X, TX;
+			 T1 = ri[0];
+			 T2i = ii[0];
+			 {
+			      E Th, Tk, Tg, Tj, T2d, Ti, TV;
+			      Th = ri[WS(rs, 6)];
+			      Tk = ii[WS(rs, 6)];
+			      Tg = W[10];
+			      Tj = W[11];
+			      TW = ri[WS(rs, 9)];
+			      TZ = ii[WS(rs, 9)];
+			      T2d = Tg * Tk;
+			      Ti = Tg * Th;
+			      TV = W[16];
+			      TY = W[17];
+			      T2e = FNMS(Tj, Th, T2d);
+			      Tl = FMA(Tj, Tk, Ti);
+			      T1X = TV * TZ;
+			      TX = TV * TW;
+			 }
+			 {
+			      E Tn, Tq, Tt, T1o, To, Tw, Ts, Tp, Tv;
+			      {
+				   E TC, TF, TB, TE, T1R, TD, Tm;
+				   TC = ri[WS(rs, 3)];
+				   TF = ii[WS(rs, 3)];
+				   T1Y = FNMS(TY, TW, T1X);
+				   T10 = FMA(TY, TZ, TX);
+				   TB = W[4];
+				   TE = W[5];
+				   Tn = ri[WS(rs, 10)];
+				   Tq = ii[WS(rs, 10)];
+				   T1R = TB * TF;
+				   TD = TB * TC;
+				   Tm = W[18];
+				   Tt = ri[WS(rs, 2)];
+				   T1S = FNMS(TE, TC, T1R);
+				   TG = FMA(TE, TF, TD);
+				   T1o = Tm * Tq;
+				   To = Tm * Tn;
+				   Tw = ii[WS(rs, 2)];
+				   Ts = W[2];
+				   Tp = W[19];
+				   Tv = W[3];
+			      }
+			      {
+				   E T12, T15, T13, T1D, T18, T1b, T17, T14, T1a;
+				   {
+					E T1p, Tr, T1r, Tx, T1q, Tu, T11;
+					T12 = ri[WS(rs, 1)];
+					T1q = Ts * Tw;
+					Tu = Ts * Tt;
+					T1p = FNMS(Tp, Tn, T1o);
+					Tr = FMA(Tp, Tq, To);
+					T1r = FNMS(Tv, Tt, T1q);
+					Tx = FMA(Tv, Tw, Tu);
+					T15 = ii[WS(rs, 1)];
+					T11 = W[0];
+					T2f = T1p + T1r;
+					T1s = T1p - T1r;
+					T2r = Tx - Tr;
+					Ty = Tr + Tx;
+					T13 = T11 * T12;
+					T1D = T11 * T15;
+				   }
+				   T18 = ri[WS(rs, 5)];
+				   T1b = ii[WS(rs, 5)];
+				   T17 = W[8];
+				   T14 = W[1];
+				   T1a = W[9];
+				   {
+					E T3, T6, T4, T1h, T9, Tc, T8, T5, Tb;
+					{
+					     E T1E, T16, T1G, T1c, T1F, T19, T2;
+					     T3 = ri[WS(rs, 4)];
+					     T1F = T17 * T1b;
+					     T19 = T17 * T18;
+					     T1E = FNMS(T14, T12, T1D);
+					     T16 = FMA(T14, T15, T13);
+					     T1G = FNMS(T1a, T18, T1F);
+					     T1c = FMA(T1a, T1b, T19);
+					     T6 = ii[WS(rs, 4)];
+					     T2 = W[6];
+					     T1Z = T1E + T1G;
+					     T1H = T1E - T1G;
+					     T21 = T1c - T16;
+					     T1d = T16 + T1c;
+					     T4 = T2 * T3;
+					     T1h = T2 * T6;
+					}
+					T9 = ri[WS(rs, 8)];
+					Tc = ii[WS(rs, 8)];
+					T8 = W[14];
+					T5 = W[7];
+					Tb = W[15];
+					{
+					     E T1i, T7, T1k, Td, T1j, Ta, TH;
+					     TI = ri[WS(rs, 7)];
+					     T1j = T8 * Tc;
+					     Ta = T8 * T9;
+					     T1i = FNMS(T5, T3, T1h);
+					     T7 = FMA(T5, T6, T4);
+					     T1k = FNMS(Tb, T9, T1j);
+					     Td = FMA(Tb, Tc, Ta);
+					     TL = ii[WS(rs, 7)];
+					     TH = W[12];
+					     T2h = T1i + T1k;
+					     T1l = T1i - T1k;
+					     T2o = Td - T7;
+					     Te = T7 + Td;
+					     TJ = TH * TI;
+					     T1w = TH * TL;
+					}
+					TO = ri[WS(rs, 11)];
+					TR = ii[WS(rs, 11)];
+					TN = W[20];
+					TK = W[13];
+					TQ = W[21];
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1g, T1n, T2q, T1A, T1V, T28, TA, T2n, T1v, T1C, T1U, T29, T2m, T2k, T2l;
+			 E T1f, T2a, T20;
+			 {
+			      E T2g, T1T, TT, T2j, TU, T1e;
+			      {
+				   E Tf, T1x, TM, T1z, TS, Tz, T1y, TP;
+				   T1g = FNMS(KP500000000, Te, T1);
+				   Tf = T1 + Te;
+				   T1y = TN * TR;
+				   TP = TN * TO;
+				   T1x = FNMS(TK, TI, T1w);
+				   TM = FMA(TK, TL, TJ);
+				   T1z = FNMS(TQ, TO, T1y);
+				   TS = FMA(TQ, TR, TP);
+				   Tz = Tl + Ty;
+				   T1n = FNMS(KP500000000, Ty, Tl);
+				   T2q = FNMS(KP500000000, T2f, T2e);
+				   T2g = T2e + T2f;
+				   T1T = T1x + T1z;
+				   T1A = T1x - T1z;
+				   T1V = TS - TM;
+				   TT = TM + TS;
+				   T28 = Tf - Tz;
+				   TA = Tf + Tz;
+				   T2j = T2h + T2i;
+				   T2n = FNMS(KP500000000, T2h, T2i);
+			      }
+			      T1v = FNMS(KP500000000, TT, TG);
+			      TU = TG + TT;
+			      T1e = T10 + T1d;
+			      T1C = FNMS(KP500000000, T1d, T10);
+			      T1U = FNMS(KP500000000, T1T, T1S);
+			      T29 = T1S + T1T;
+			      T2m = T2j - T2g;
+			      T2k = T2g + T2j;
+			      T2l = TU - T1e;
+			      T1f = TU + T1e;
+			      T2a = T1Y + T1Z;
+			      T20 = FNMS(KP500000000, T1Z, T1Y);
+			 }
+			 {
+			      E T1m, T1K, T2y, T2p, T2x, T2s, T1L, T1t, T1B, T1N, T2c, T2b;
+			      ii[WS(rs, 9)] = T2m - T2l;
+			      ii[WS(rs, 3)] = T2l + T2m;
+			      ri[0] = TA + T1f;
+			      ri[WS(rs, 6)] = TA - T1f;
+			      T2c = T29 + T2a;
+			      T2b = T29 - T2a;
+			      T1m = FNMS(KP866025403, T1l, T1g);
+			      T1K = FMA(KP866025403, T1l, T1g);
+			      ii[0] = T2c + T2k;
+			      ii[WS(rs, 6)] = T2k - T2c;
+			      ri[WS(rs, 9)] = T28 + T2b;
+			      ri[WS(rs, 3)] = T28 - T2b;
+			      T2y = FNMS(KP866025403, T2o, T2n);
+			      T2p = FMA(KP866025403, T2o, T2n);
+			      T2x = FNMS(KP866025403, T2r, T2q);
+			      T2s = FMA(KP866025403, T2r, T2q);
+			      T1L = FMA(KP866025403, T1s, T1n);
+			      T1t = FNMS(KP866025403, T1s, T1n);
+			      T1B = FNMS(KP866025403, T1A, T1v);
+			      T1N = FMA(KP866025403, T1A, T1v);
+			      {
+				   E T24, T27, T1Q, T2u, T23, T2v, T2w, T2t;
+				   {
+					E T1u, T1W, T22, T1O, T1I, T2z, T2A, T25, T26, T1M, T1J, T1P;
+					T24 = T1m - T1t;
+					T1u = T1m + T1t;
+					T25 = FNMS(KP866025403, T1V, T1U);
+					T1W = FMA(KP866025403, T1V, T1U);
+					T26 = FNMS(KP866025403, T21, T20);
+					T22 = FMA(KP866025403, T21, T20);
+					T1O = FMA(KP866025403, T1H, T1C);
+					T1I = FNMS(KP866025403, T1H, T1C);
+					T2z = T2x + T2y;
+					T2B = T2y - T2x;
+					T27 = T25 - T26;
+					T2A = T25 + T26;
+					T1M = T1K + T1L;
+					T1Q = T1K - T1L;
+					T2C = T1B - T1I;
+					T1J = T1B + T1I;
+					T1P = T1N + T1O;
+					T2u = T1N - T1O;
+					ii[WS(rs, 8)] = T2A + T2z;
+					ii[WS(rs, 2)] = T2z - T2A;
+					ri[WS(rs, 8)] = T1u + T1J;
+					ri[WS(rs, 2)] = T1u - T1J;
+					ri[WS(rs, 10)] = T1M - T1P;
+					ri[WS(rs, 4)] = T1M + T1P;
+					T23 = T1W - T22;
+					T2v = T1W + T22;
+					T2w = T2s + T2p;
+					T2t = T2p - T2s;
+				   }
+				   ii[WS(rs, 10)] = T2w - T2v;
+				   ii[WS(rs, 4)] = T2v + T2w;
+				   ri[WS(rs, 1)] = T1Q + T23;
+				   ri[WS(rs, 7)] = T1Q - T23;
+				   ii[WS(rs, 7)] = T2u + T2t;
+				   ii[WS(rs, 1)] = T2t - T2u;
+				   ri[WS(rs, 5)] = T24 + T27;
+				   ri[WS(rs, 11)] = T24 - T27;
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 11)] = T2C + T2B;
+	       ii[WS(rs, 5)] = T2B - T2C;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, {72, 22, 46, 0}, 0, 0, 0 };
+
+void X(codelet_t1_12) (planner *p) {
+     X(kdft_dit_register) (p, t1_12, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include t.h */
+
+/*
+ * This function contains 118 FP additions, 60 FP multiplications,
+ * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "t.h"
+
+static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
+	       E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
+	       E T1A, T1B;
+	       {
+		    E T6, T16, Tb, T17;
+		    T1 = ri[0];
+		    T1W = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 4)];
+			 T5 = ii[WS(rs, 4)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T16 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 8)];
+			 Ta = ii[WS(rs, 8)];
+			 T7 = W[14];
+			 T9 = W[15];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T17 = FNMS(T9, T8, T7 * Ta);
+		    }
+		    T18 = KP866025403 * (T16 - T17);
+		    T21 = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    T15 = FNMS(KP500000000, Tc, T1);
+		    T1V = T16 + T17;
+		    T22 = FNMS(KP500000000, T1V, T1W);
+	       }
+	       {
+		    E T11, T1n, TW, T1m;
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = ri[WS(rs, 9)];
+			 TQ = ii[WS(rs, 9)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1E = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TY, T10, TX, TZ;
+			 TY = ri[WS(rs, 5)];
+			 T10 = ii[WS(rs, 5)];
+			 TX = W[8];
+			 TZ = W[9];
+			 T11 = FMA(TX, TY, TZ * T10);
+			 T1n = FNMS(TZ, TY, TX * T10);
+		    }
+		    {
+			 E TT, TV, TS, TU;
+			 TT = ri[WS(rs, 1)];
+			 TV = ii[WS(rs, 1)];
+			 TS = W[0];
+			 TU = W[1];
+			 TW = FMA(TS, TT, TU * TV);
+			 T1m = FNMS(TU, TT, TS * TV);
+		    }
+		    T1o = KP866025403 * (T1m - T1n);
+		    T1D = KP866025403 * (T11 - TW);
+		    T12 = TW + T11;
+		    T1l = FNMS(KP500000000, T12, TR);
+		    T1F = T1m + T1n;
+		    T1G = FNMS(KP500000000, T1F, T1E);
+	       }
+	       {
+		    E Ts, T1c, Tn, T1b;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = ri[WS(rs, 6)];
+			 Th = ii[WS(rs, 6)];
+			 Te = W[10];
+			 Tg = W[11];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T1S = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 2)];
+			 Tr = ii[WS(rs, 2)];
+			 To = W[2];
+			 Tq = W[3];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1c = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = ri[WS(rs, 10)];
+			 Tm = ii[WS(rs, 10)];
+			 Tj = W[18];
+			 Tl = W[19];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1b = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    T1d = KP866025403 * (T1b - T1c);
+		    T24 = KP866025403 * (Ts - Tn);
+		    Tt = Tn + Ts;
+		    T1a = FNMS(KP500000000, Tt, Ti);
+		    T1T = T1b + T1c;
+		    T25 = FNMS(KP500000000, T1T, T1S);
+	       }
+	       {
+		    E TK, T1i, TF, T1h;
+		    {
+			 E Tx, Tz, Tw, Ty;
+			 Tx = ri[WS(rs, 3)];
+			 Tz = ii[WS(rs, 3)];
+			 Tw = W[4];
+			 Ty = W[5];
+			 TA = FMA(Tw, Tx, Ty * Tz);
+			 T1z = FNMS(Ty, Tx, Tw * Tz);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = ri[WS(rs, 11)];
+			 TJ = ii[WS(rs, 11)];
+			 TG = W[20];
+			 TI = W[21];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T1i = FNMS(TI, TH, TG * TJ);
+		    }
+		    {
+			 E TC, TE, TB, TD;
+			 TC = ri[WS(rs, 7)];
+			 TE = ii[WS(rs, 7)];
+			 TB = W[12];
+			 TD = W[13];
+			 TF = FMA(TB, TC, TD * TE);
+			 T1h = FNMS(TD, TC, TB * TE);
+		    }
+		    T1j = KP866025403 * (T1h - T1i);
+		    T1y = KP866025403 * (TK - TF);
+		    TL = TF + TK;
+		    T1g = FNMS(KP500000000, TL, TA);
+		    T1A = T1h + T1i;
+		    T1B = FNMS(KP500000000, T1A, T1z);
+	       }
+	       {
+		    E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
+		    {
+			 E Td, Tu, T1U, T1X;
+			 Td = T1 + Tc;
+			 Tu = Ti + Tt;
+			 Tv = Td + Tu;
+			 T1N = Td - Tu;
+			 T1U = T1S + T1T;
+			 T1X = T1V + T1W;
+			 T1Y = T1U + T1X;
+			 T20 = T1X - T1U;
+		    }
+		    {
+			 E TM, T13, T1O, T1P;
+			 TM = TA + TL;
+			 T13 = TR + T12;
+			 T14 = TM + T13;
+			 T1Z = TM - T13;
+			 T1O = T1z + T1A;
+			 T1P = T1E + T1F;
+			 T1Q = T1O - T1P;
+			 T1R = T1O + T1P;
+		    }
+		    ri[WS(rs, 6)] = Tv - T14;
+		    ii[WS(rs, 6)] = T1Y - T1R;
+		    ri[0] = Tv + T14;
+		    ii[0] = T1R + T1Y;
+		    ri[WS(rs, 3)] = T1N - T1Q;
+		    ii[WS(rs, 3)] = T1Z + T20;
+		    ri[WS(rs, 9)] = T1N + T1Q;
+		    ii[WS(rs, 9)] = T20 - T1Z;
+	       }
+	       {
+		    E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
+		    {
+			 E T1r, T1s, T23, T26;
+			 T1r = T15 + T18;
+			 T1s = T1a + T1d;
+			 T1t = T1r + T1s;
+			 T1x = T1r - T1s;
+			 T23 = T21 + T22;
+			 T26 = T24 + T25;
+			 T27 = T23 - T26;
+			 T2a = T26 + T23;
+		    }
+		    {
+			 E T1u, T1v, T1C, T1H;
+			 T1u = T1g + T1j;
+			 T1v = T1l + T1o;
+			 T1w = T1u + T1v;
+			 T28 = T1u - T1v;
+			 T1C = T1y + T1B;
+			 T1H = T1D + T1G;
+			 T1I = T1C - T1H;
+			 T29 = T1C + T1H;
+		    }
+		    ri[WS(rs, 10)] = T1t - T1w;
+		    ii[WS(rs, 10)] = T2a - T29;
+		    ri[WS(rs, 4)] = T1t + T1w;
+		    ii[WS(rs, 4)] = T29 + T2a;
+		    ri[WS(rs, 7)] = T1x - T1I;
+		    ii[WS(rs, 7)] = T28 + T27;
+		    ri[WS(rs, 1)] = T1x + T1I;
+		    ii[WS(rs, 1)] = T27 - T28;
+	       }
+	       {
+		    E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
+		    {
+			 E T19, T1e, T2b, T2c;
+			 T19 = T15 - T18;
+			 T1e = T1a - T1d;
+			 T1f = T19 + T1e;
+			 T1J = T19 - T1e;
+			 T2b = T25 - T24;
+			 T2c = T22 - T21;
+			 T2d = T2b + T2c;
+			 T2f = T2c - T2b;
+		    }
+		    {
+			 E T1k, T1p, T1K, T1L;
+			 T1k = T1g - T1j;
+			 T1p = T1l - T1o;
+			 T1q = T1k + T1p;
+			 T2g = T1k - T1p;
+			 T1K = T1B - T1y;
+			 T1L = T1G - T1D;
+			 T1M = T1K - T1L;
+			 T2e = T1K + T1L;
+		    }
+		    ri[WS(rs, 2)] = T1f - T1q;
+		    ii[WS(rs, 2)] = T2d - T2e;
+		    ri[WS(rs, 8)] = T1f + T1q;
+		    ii[WS(rs, 8)] = T2e + T2d;
+		    ri[WS(rs, 11)] = T1J - T1M;
+		    ii[WS(rs, 11)] = T2g + T2f;
+		    ri[WS(rs, 5)] = T1J + T1M;
+		    ii[WS(rs, 5)] = T2f - T2g;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, {88, 30, 30, 0}, 0, 0, 0 };
+
+void X(codelet_t1_12) (planner *p) {
+     X(kdft_dit_register) (p, t1_12, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:50 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include t.h */
+
+/*
+ * This function contains 184 FP additions, 140 FP multiplications,
+ * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
+ * 89 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "t.h"
+
+static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T2d, T2O, T2Q, T2m, T2k, T2l, T2P, T2n;
+	       {
+		    E T1G, T3u, T3k, T3t, T1B, Tf, T37, T1y, T2V, T2M, T2a, T2i, T39, Tz, T2X;
+		    E T2t, T1O, T2e, T3a, TT, T10, T2Y, T2z, T1V, T2f, T2C, T12, T15, T14, T21;
+		    E T1c, T1Y, T13;
+		    {
+			 E T2I, T1k, T1m, T1p, T1o, T28, T1w, T25, T1n;
+			 {
+			      E T1, T3j, T9, Tc, Tb, T1D, T7, T1E, Ta, T1j, T1i, T1h;
+			      T1 = ri[0];
+			      T3j = ii[0];
+			      {
+				   E T3, T6, T2, T5, T1C, T4, T8;
+				   T3 = ri[WS(rs, 5)];
+				   T6 = ii[WS(rs, 5)];
+				   T2 = W[8];
+				   T5 = W[9];
+				   T9 = ri[WS(rs, 10)];
+				   Tc = ii[WS(rs, 10)];
+				   T1C = T2 * T6;
+				   T4 = T2 * T3;
+				   T8 = W[18];
+				   Tb = W[19];
+				   T1D = FNMS(T5, T3, T1C);
+				   T7 = FMA(T5, T6, T4);
+				   T1E = T8 * Tc;
+				   Ta = T8 * T9;
+			      }
+			      {
+				   E T1g, T1F, Td, T1f, T3i, Te, T2H;
+				   T1g = ri[WS(rs, 9)];
+				   T1j = ii[WS(rs, 9)];
+				   T1F = FNMS(Tb, T9, T1E);
+				   Td = FMA(Tb, Tc, Ta);
+				   T1f = W[16];
+				   T1i = W[17];
+				   T1G = T1D - T1F;
+				   T3i = T1D + T1F;
+				   T3u = Td - T7;
+				   Te = T7 + Td;
+				   T2H = T1f * T1j;
+				   T1h = T1f * T1g;
+				   T3k = T3i + T3j;
+				   T3t = FNMS(KP500000000, T3i, T3j);
+				   T1B = FNMS(KP500000000, Te, T1);
+				   Tf = T1 + Te;
+				   T2I = FNMS(T1i, T1g, T2H);
+			      }
+			      T1k = FMA(T1i, T1j, T1h);
+			      {
+				   E T1s, T1v, T1r, T1u, T27, T1t, T1l;
+				   T1s = ri[WS(rs, 4)];
+				   T1v = ii[WS(rs, 4)];
+				   T1r = W[6];
+				   T1u = W[7];
+				   T1m = ri[WS(rs, 14)];
+				   T1p = ii[WS(rs, 14)];
+				   T27 = T1r * T1v;
+				   T1t = T1r * T1s;
+				   T1l = W[26];
+				   T1o = W[27];
+				   T28 = FNMS(T1u, T1s, T27);
+				   T1w = FMA(T1u, T1v, T1t);
+				   T25 = T1l * T1p;
+				   T1n = T1l * T1m;
+			      }
+			 }
+			 {
+			      E Tl, T2p, Tn, Tq, Tp, T1M, Tx, T1J, To;
+			      {
+				   E Th, Tk, T26, T1q, Tg, Tj;
+				   Th = ri[WS(rs, 3)];
+				   Tk = ii[WS(rs, 3)];
+				   T26 = FNMS(T1o, T1m, T25);
+				   T1q = FMA(T1o, T1p, T1n);
+				   Tg = W[4];
+				   Tj = W[5];
+				   {
+					E T29, T2J, T1x, T2L;
+					T29 = T26 - T28;
+					T2J = T26 + T28;
+					T1x = T1q + T1w;
+					T2L = T1w - T1q;
+					{
+					     E T2o, Ti, T2K, T24;
+					     T2o = Tg * Tk;
+					     Ti = Tg * Th;
+					     T2K = FNMS(KP500000000, T2J, T2I);
+					     T37 = T2I + T2J;
+					     T24 = FNMS(KP500000000, T1x, T1k);
+					     T1y = T1k + T1x;
+					     Tl = FMA(Tj, Tk, Ti);
+					     T2V = FNMS(KP866025403, T2L, T2K);
+					     T2M = FMA(KP866025403, T2L, T2K);
+					     T2a = FNMS(KP866025403, T29, T24);
+					     T2i = FMA(KP866025403, T29, T24);
+					     T2p = FNMS(Tj, Th, T2o);
+					}
+				   }
+			      }
+			      {
+				   E Tt, Tw, Ts, Tv, T1L, Tu, Tm;
+				   Tt = ri[WS(rs, 13)];
+				   Tw = ii[WS(rs, 13)];
+				   Ts = W[24];
+				   Tv = W[25];
+				   Tn = ri[WS(rs, 8)];
+				   Tq = ii[WS(rs, 8)];
+				   T1L = Ts * Tw;
+				   Tu = Ts * Tt;
+				   Tm = W[14];
+				   Tp = W[15];
+				   T1M = FNMS(Tv, Tt, T1L);
+				   Tx = FMA(Tv, Tw, Tu);
+				   T1J = Tm * Tq;
+				   To = Tm * Tn;
+			      }
+			      {
+				   E TF, T2v, TH, TK, TJ, T1T, TR, T1Q, TI;
+				   {
+					E TB, TE, T1K, Tr, TA, TD;
+					TB = ri[WS(rs, 12)];
+					TE = ii[WS(rs, 12)];
+					T1K = FNMS(Tp, Tn, T1J);
+					Tr = FMA(Tp, Tq, To);
+					TA = W[22];
+					TD = W[23];
+					{
+					     E T1N, T2q, Ty, T2s;
+					     T1N = T1K - T1M;
+					     T2q = T1K + T1M;
+					     Ty = Tr + Tx;
+					     T2s = Tx - Tr;
+					     {
+						  E T2u, TC, T2r, T1I;
+						  T2u = TA * TE;
+						  TC = TA * TB;
+						  T2r = FNMS(KP500000000, T2q, T2p);
+						  T39 = T2p + T2q;
+						  T1I = FNMS(KP500000000, Ty, Tl);
+						  Tz = Tl + Ty;
+						  TF = FMA(TD, TE, TC);
+						  T2X = FNMS(KP866025403, T2s, T2r);
+						  T2t = FMA(KP866025403, T2s, T2r);
+						  T1O = FNMS(KP866025403, T1N, T1I);
+						  T2e = FMA(KP866025403, T1N, T1I);
+						  T2v = FNMS(TD, TB, T2u);
+					     }
+					}
+				   }
+				   {
+					E TN, TQ, TM, TP, T1S, TO, TG;
+					TN = ri[WS(rs, 7)];
+					TQ = ii[WS(rs, 7)];
+					TM = W[12];
+					TP = W[13];
+					TH = ri[WS(rs, 2)];
+					TK = ii[WS(rs, 2)];
+					T1S = TM * TQ;
+					TO = TM * TN;
+					TG = W[2];
+					TJ = W[3];
+					T1T = FNMS(TP, TN, T1S);
+					TR = FMA(TP, TQ, TO);
+					T1Q = TG * TK;
+					TI = TG * TH;
+				   }
+				   {
+					E TW, TZ, T1R, TL, TV, TY;
+					TW = ri[WS(rs, 6)];
+					TZ = ii[WS(rs, 6)];
+					T1R = FNMS(TJ, TH, T1Q);
+					TL = FMA(TJ, TK, TI);
+					TV = W[10];
+					TY = W[11];
+					{
+					     E T1U, T2w, TS, T2y;
+					     T1U = T1R - T1T;
+					     T2w = T1R + T1T;
+					     TS = TL + TR;
+					     T2y = TR - TL;
+					     {
+						  E T2B, TX, T2x, T1P;
+						  T2B = TV * TZ;
+						  TX = TV * TW;
+						  T2x = FNMS(KP500000000, T2w, T2v);
+						  T3a = T2v + T2w;
+						  T1P = FNMS(KP500000000, TS, TF);
+						  TT = TF + TS;
+						  T10 = FMA(TY, TZ, TX);
+						  T2Y = FNMS(KP866025403, T2y, T2x);
+						  T2z = FMA(KP866025403, T2y, T2x);
+						  T1V = FNMS(KP866025403, T1U, T1P);
+						  T2f = FMA(KP866025403, T1U, T1P);
+						  T2C = FNMS(TY, TW, T2B);
+					     }
+					}
+				   }
+				   {
+					E T18, T1b, T17, T1a, T20, T19, T11;
+					T18 = ri[WS(rs, 1)];
+					T1b = ii[WS(rs, 1)];
+					T17 = W[0];
+					T1a = W[1];
+					T12 = ri[WS(rs, 11)];
+					T15 = ii[WS(rs, 11)];
+					T20 = T17 * T1b;
+					T19 = T17 * T18;
+					T11 = W[20];
+					T14 = W[21];
+					T21 = FNMS(T1a, T18, T20);
+					T1c = FMA(T1a, T1b, T19);
+					T1Y = T11 * T15;
+					T13 = T11 * T12;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2G, T2h, T3J, T3I, T32, T30, T1H, T1W, T3P, T3O, T2b;
+			 {
+			      E T3f, T3b, T1Z, T16, T3p, TU;
+			      T3f = T39 + T3a;
+			      T3b = T39 - T3a;
+			      T1Z = FNMS(T14, T12, T1Y);
+			      T16 = FMA(T14, T15, T13);
+			      T3p = Tz - TT;
+			      TU = Tz + TT;
+			      {
+				   E T3g, T2U, T23, T3c, T3e, T3q, T3s, T1A, T34, T3r, T3n;
+				   {
+					E T22, T1d, T2F, T2E, T36, T2D;
+					T22 = T1Z - T21;
+					T2D = T1Z + T21;
+					T1d = T16 + T1c;
+					T2F = T1c - T16;
+					T2E = FNMS(KP500000000, T2D, T2C);
+					T36 = T2C + T2D;
+					{
+					     E T1e, T1X, T38, T1z, T3o;
+					     T1e = T10 + T1d;
+					     T1X = FNMS(KP500000000, T1d, T10);
+					     T38 = T36 - T37;
+					     T3g = T36 + T37;
+					     T2G = FMA(KP866025403, T2F, T2E);
+					     T2U = FNMS(KP866025403, T2F, T2E);
+					     T1z = T1e + T1y;
+					     T3o = T1e - T1y;
+					     T2h = FMA(KP866025403, T22, T1X);
+					     T23 = FNMS(KP866025403, T22, T1X);
+					     T3c = FNMS(KP618033988, T3b, T38);
+					     T3e = FMA(KP618033988, T38, T3b);
+					     T3q = FNMS(KP618033988, T3p, T3o);
+					     T3s = FMA(KP618033988, T3o, T3p);
+					     T1A = TU + T1z;
+					     T34 = TU - T1z;
+					}
+				   }
+				   {
+					E T2W, T33, T3m, T3h, T2Z, T3d, T35, T3l;
+					T3J = T2U + T2V;
+					T2W = T2U - T2V;
+					ri[0] = Tf + T1A;
+					T33 = FNMS(KP250000000, T1A, Tf);
+					T3m = T3f - T3g;
+					T3h = T3f + T3g;
+					T2Z = T2X - T2Y;
+					T3I = T2X + T2Y;
+					T3d = FMA(KP559016994, T34, T33);
+					T35 = FNMS(KP559016994, T34, T33);
+					ii[0] = T3h + T3k;
+					T3l = FNMS(KP250000000, T3h, T3k);
+					ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
+					ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
+					ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
+					ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
+					T3r = FMA(KP559016994, T3m, T3l);
+					T3n = FNMS(KP559016994, T3m, T3l);
+					T32 = FMA(KP618033988, T2W, T2Z);
+					T30 = FNMS(KP618033988, T2Z, T2W);
+				   }
+				   ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
+				   ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
+				   ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
+				   ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
+				   T2d = FMA(KP866025403, T1G, T1B);
+				   T1H = FNMS(KP866025403, T1G, T1B);
+				   T1W = T1O + T1V;
+				   T3P = T1O - T1V;
+				   T3O = T23 - T2a;
+				   T2b = T23 + T2a;
+			      }
+			 }
+			 {
+			      E T3H, T3v, T2S, T3Q, T3S, T2R, T2c;
+			      T3H = FNMS(KP866025403, T3u, T3t);
+			      T3v = FMA(KP866025403, T3u, T3t);
+			      T2c = T1W + T2b;
+			      T2S = T1W - T2b;
+			      T3Q = FNMS(KP618033988, T3P, T3O);
+			      T3S = FMA(KP618033988, T3O, T3P);
+			      ri[WS(rs, 5)] = T1H + T2c;
+			      T2R = FNMS(KP250000000, T2c, T1H);
+			      {
+				   E T2g, T2j, T3G, T3E, T2A, T2N, T3y, T3A, T3M, T3L, T3z, T3F, T3B;
+				   {
+					E T3C, T3D, T31, T2T, T3K;
+					T2g = T2e + T2f;
+					T3C = T2e - T2f;
+					T3D = T2h - T2i;
+					T2j = T2h + T2i;
+					T31 = FMA(KP559016994, T2S, T2R);
+					T2T = FNMS(KP559016994, T2S, T2R);
+					T3K = T3I + T3J;
+					T3M = T3I - T3J;
+					ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
+					ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
+					ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
+					ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
+					ii[WS(rs, 5)] = T3K + T3H;
+					T3L = FNMS(KP250000000, T3K, T3H);
+					T3G = FNMS(KP618033988, T3C, T3D);
+					T3E = FMA(KP618033988, T3D, T3C);
+				   }
+				   {
+					E T3N, T3R, T3w, T3x;
+					T3N = FNMS(KP559016994, T3M, T3L);
+					T3R = FMA(KP559016994, T3M, T3L);
+					T3w = T2t + T2z;
+					T2A = T2t - T2z;
+					T2N = T2G - T2M;
+					T3x = T2G + T2M;
+					ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
+					ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
+					ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
+					ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
+					T3y = T3w + T3x;
+					T3A = T3w - T3x;
+				   }
+				   ii[WS(rs, 10)] = T3y + T3v;
+				   T3z = FNMS(KP250000000, T3y, T3v);
+				   T2O = FMA(KP618033988, T2N, T2A);
+				   T2Q = FNMS(KP618033988, T2A, T2N);
+				   T3F = FNMS(KP559016994, T3A, T3z);
+				   T3B = FMA(KP559016994, T3A, T3z);
+				   ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
+				   ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
+				   ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
+				   ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
+				   T2m = T2g - T2j;
+				   T2k = T2g + T2j;
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 10)] = T2d + T2k;
+	       T2l = FNMS(KP250000000, T2k, T2d);
+	       T2P = FNMS(KP559016994, T2m, T2l);
+	       T2n = FMA(KP559016994, T2m, T2l);
+	       ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
+	       ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
+	       ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
+	       ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {72, 28, 112, 0}, 0, 0, 0 };
+
+void X(codelet_t1_15) (planner *p) {
+     X(kdft_dit_register) (p, t1_15, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include t.h */
+
+/*
+ * This function contains 184 FP additions, 112 FP multiplications,
+ * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
+ * 65 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "t.h"
+
+static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
+	       E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
+	       E T24, T2v, T1B, T1R;
+	       {
+		    E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
+		    T1 = ri[0];
+		    T2R = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 5)];
+			 T5 = ii[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T1o = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 10)];
+			 Ta = ii[WS(rs, 10)];
+			 T7 = W[18];
+			 T9 = W[19];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T1p = FNMS(T9, T8, T7 * Ta);
+		    }
+		    T1q = KP866025403 * (T1o - T1p);
+		    T34 = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    Td = T1 + Tc;
+		    T1n = FNMS(KP500000000, Tc, T1);
+		    T2Q = T1o + T1p;
+		    T2S = T2Q + T2R;
+		    T35 = FNMS(KP500000000, T2Q, T2R);
+	       }
+	       {
+		    E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
+		    E T2i;
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = ri[WS(rs, 6)];
+			 TQ = ii[WS(rs, 6)];
+			 TN = W[10];
+			 TP = W[11];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T2c = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E T15, T17, T14, T16;
+			 T15 = ri[WS(rs, 9)];
+			 T17 = ii[WS(rs, 9)];
+			 T14 = W[16];
+			 T16 = W[17];
+			 T18 = FMA(T14, T15, T16 * T17);
+			 T2h = FNMS(T16, T15, T14 * T17);
+		    }
+		    {
+			 E TT, TV, TS, TU;
+			 TT = ri[WS(rs, 11)];
+			 TV = ii[WS(rs, 11)];
+			 TS = W[20];
+			 TU = W[21];
+			 TW = FMA(TS, TT, TU * TV);
+			 T1E = FNMS(TU, TT, TS * TV);
+		    }
+		    {
+			 E TY, T10, TX, TZ;
+			 TY = ri[WS(rs, 1)];
+			 T10 = ii[WS(rs, 1)];
+			 TX = W[0];
+			 TZ = W[1];
+			 T11 = FMA(TX, TY, TZ * T10);
+			 T1F = FNMS(TZ, TY, TX * T10);
+		    }
+		    T12 = TW + T11;
+		    T2d = T1E + T1F;
+		    {
+			 E T1a, T1c, T19, T1b;
+			 T1a = ri[WS(rs, 14)];
+			 T1c = ii[WS(rs, 14)];
+			 T19 = W[26];
+			 T1b = W[27];
+			 T1d = FMA(T19, T1a, T1b * T1c);
+			 T1J = FNMS(T1b, T1a, T19 * T1c);
+		    }
+		    {
+			 E T1f, T1h, T1e, T1g;
+			 T1f = ri[WS(rs, 4)];
+			 T1h = ii[WS(rs, 4)];
+			 T1e = W[6];
+			 T1g = W[7];
+			 T1i = FMA(T1e, T1f, T1g * T1h);
+			 T1K = FNMS(T1g, T1f, T1e * T1h);
+		    }
+		    T1j = T1d + T1i;
+		    T2i = T1J + T1K;
+		    {
+			 E T1D, T1G, T2g, T2j;
+			 T13 = TR + T12;
+			 T1k = T18 + T1j;
+			 T1l = T13 + T1k;
+			 T2E = T2c + T2d;
+			 T2F = T2h + T2i;
+			 T2O = T2E + T2F;
+			 T1D = FNMS(KP500000000, T12, TR);
+			 T1G = KP866025403 * (T1E - T1F);
+			 T1H = T1D - T1G;
+			 T1T = T1D + T1G;
+			 T2g = KP866025403 * (T1i - T1d);
+			 T2j = FNMS(KP500000000, T2i, T2h);
+			 T2k = T2g + T2j;
+			 T2t = T2j - T2g;
+			 {
+			      E T2b, T2e, T1I, T1L;
+			      T2b = KP866025403 * (T11 - TW);
+			      T2e = FNMS(KP500000000, T2d, T2c);
+			      T2f = T2b + T2e;
+			      T2s = T2e - T2b;
+			      T1I = FNMS(KP500000000, T1j, T18);
+			      T1L = KP866025403 * (T1J - T1K);
+			      T1M = T1I - T1L;
+			      T1U = T1I + T1L;
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
+		    E T27;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = ri[WS(rs, 3)];
+			 Th = ii[WS(rs, 3)];
+			 Te = W[4];
+			 Tg = W[5];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T21 = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 12)];
+			 Ty = ii[WS(rs, 12)];
+			 Tv = W[22];
+			 Tx = W[23];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T26 = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = ri[WS(rs, 8)];
+			 Tm = ii[WS(rs, 8)];
+			 Tj = W[14];
+			 Tl = W[15];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1t = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 13)];
+			 Tr = ii[WS(rs, 13)];
+			 To = W[24];
+			 Tq = W[25];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1u = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn + Ts;
+		    T22 = T1t + T1u;
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 2)];
+			 TD = ii[WS(rs, 2)];
+			 TA = W[2];
+			 TC = W[3];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1y = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TG, TI, TF, TH;
+			 TG = ri[WS(rs, 7)];
+			 TI = ii[WS(rs, 7)];
+			 TF = W[12];
+			 TH = W[13];
+			 TJ = FMA(TF, TG, TH * TI);
+			 T1z = FNMS(TH, TG, TF * TI);
+		    }
+		    TK = TE + TJ;
+		    T27 = T1y + T1z;
+		    {
+			 E T1s, T1v, T25, T28;
+			 Tu = Ti + Tt;
+			 TL = Tz + TK;
+			 TM = Tu + TL;
+			 T2H = T21 + T22;
+			 T2I = T26 + T27;
+			 T2N = T2H + T2I;
+			 T1s = FNMS(KP500000000, Tt, Ti);
+			 T1v = KP866025403 * (T1t - T1u);
+			 T1w = T1s - T1v;
+			 T1Q = T1s + T1v;
+			 T25 = KP866025403 * (TJ - TE);
+			 T28 = FNMS(KP500000000, T27, T26);
+			 T29 = T25 + T28;
+			 T2w = T28 - T25;
+			 {
+			      E T20, T23, T1x, T1A;
+			      T20 = KP866025403 * (Ts - Tn);
+			      T23 = FNMS(KP500000000, T22, T21);
+			      T24 = T20 + T23;
+			      T2v = T23 - T20;
+			      T1x = FNMS(KP500000000, TK, Tz);
+			      T1A = KP866025403 * (T1y - T1z);
+			      T1B = T1x - T1A;
+			      T1R = T1x + T1A;
+			 }
+		    }
+	       }
+	       {
+		    E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
+		    T2C = KP559016994 * (TM - T1l);
+		    T1m = TM + T1l;
+		    T2B = FNMS(KP250000000, T1m, Td);
+		    T2G = T2E - T2F;
+		    T2J = T2H - T2I;
+		    T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
+		    T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
+		    ri[0] = Td + T1m;
+		    T2L = T2C + T2B;
+		    ri[WS(rs, 9)] = T2L - T2M;
+		    ri[WS(rs, 6)] = T2L + T2M;
+		    T2D = T2B - T2C;
+		    ri[WS(rs, 12)] = T2D - T2K;
+		    ri[WS(rs, 3)] = T2D + T2K;
+	       }
+	       {
+		    E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
+		    T2U = KP559016994 * (T2N - T2O);
+		    T2P = T2N + T2O;
+		    T2T = FNMS(KP250000000, T2P, T2S);
+		    T2W = T13 - T1k;
+		    T2X = Tu - TL;
+		    T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
+		    T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
+		    ii[0] = T2P + T2S;
+		    T2Z = T2U + T2T;
+		    ii[WS(rs, 6)] = T2Z - T30;
+		    ii[WS(rs, 9)] = T30 + T2Z;
+		    T2V = T2T - T2U;
+		    ii[WS(rs, 3)] = T2V - T2Y;
+		    ii[WS(rs, 12)] = T2Y + T2V;
+	       }
+	       {
+		    E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
+		    {
+			 E T2u, T2x, T1C, T1N;
+			 T2u = T2s - T2t;
+			 T2x = T2v - T2w;
+			 T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
+			 T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
+			 T1r = T1n - T1q;
+			 T1C = T1w + T1B;
+			 T1N = T1H + T1M;
+			 T1O = T1C + T1N;
+			 T2p = FNMS(KP250000000, T1O, T1r);
+			 T2q = KP559016994 * (T1C - T1N);
+		    }
+		    ri[WS(rs, 5)] = T1r + T1O;
+		    T2z = T2q + T2p;
+		    ri[WS(rs, 14)] = T2z - T2A;
+		    ri[WS(rs, 11)] = T2z + T2A;
+		    T2r = T2p - T2q;
+		    ri[WS(rs, 2)] = T2r - T2y;
+		    ri[WS(rs, 8)] = T2r + T2y;
+	       }
+	       {
+		    E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
+		    {
+			 E T3f, T3g, T3j, T3k;
+			 T3f = T1H - T1M;
+			 T3g = T1w - T1B;
+			 T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
+			 T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
+			 T3i = T35 - T34;
+			 T3j = T2v + T2w;
+			 T3k = T2s + T2t;
+			 T3l = T3j + T3k;
+			 T3m = FNMS(KP250000000, T3l, T3i);
+			 T3n = KP559016994 * (T3j - T3k);
+		    }
+		    ii[WS(rs, 5)] = T3l + T3i;
+		    T3p = T3n + T3m;
+		    ii[WS(rs, 11)] = T3p - T3q;
+		    ii[WS(rs, 14)] = T3q + T3p;
+		    T3o = T3m - T3n;
+		    ii[WS(rs, 2)] = T3h + T3o;
+		    ii[WS(rs, 8)] = T3o - T3h;
+	       }
+	       {
+		    E T3c, T3d, T36, T37, T33, T38, T3e, T39;
+		    {
+			 E T3a, T3b, T31, T32;
+			 T3a = T1Q - T1R;
+			 T3b = T1T - T1U;
+			 T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
+			 T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
+			 T36 = T34 + T35;
+			 T31 = T24 + T29;
+			 T32 = T2f + T2k;
+			 T37 = T31 + T32;
+			 T33 = KP559016994 * (T31 - T32);
+			 T38 = FNMS(KP250000000, T37, T36);
+		    }
+		    ii[WS(rs, 10)] = T37 + T36;
+		    T3e = T38 - T33;
+		    ii[WS(rs, 7)] = T3d + T3e;
+		    ii[WS(rs, 13)] = T3e - T3d;
+		    T39 = T33 + T38;
+		    ii[WS(rs, 1)] = T39 - T3c;
+		    ii[WS(rs, 4)] = T3c + T39;
+	       }
+	       {
+		    E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
+		    {
+			 E T2a, T2l, T1S, T1V;
+			 T2a = T24 - T29;
+			 T2l = T2f - T2k;
+			 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
+			 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
+			 T1P = T1n + T1q;
+			 T1S = T1Q + T1R;
+			 T1V = T1T + T1U;
+			 T1W = T1S + T1V;
+			 T1X = KP559016994 * (T1S - T1V);
+			 T1Y = FNMS(KP250000000, T1W, T1P);
+		    }
+		    ri[WS(rs, 10)] = T1P + T1W;
+		    T2n = T1Y - T1X;
+		    ri[WS(rs, 7)] = T2n - T2o;
+		    ri[WS(rs, 13)] = T2n + T2o;
+		    T1Z = T1X + T1Y;
+		    ri[WS(rs, 4)] = T1Z - T2m;
+		    ri[WS(rs, 1)] = T1Z + T2m;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {128, 56, 56, 0}, 0, 0, 0 };
+
+void X(codelet_t1_15) (planner *p) {
+     X(kdft_dit_register) (p, t1_15, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:51 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 97 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "t.h"
+
+static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T3G, T3F;
+	       {
+		    E T3z, T3o, T8, T1I, T2o, T35, T2r, T1s, T2w, T36, T2p, T1F, T3k, T1N, T3A;
+		    E Tl, T1T, T2V, T1U, Tz, T29, T30, T2c, T11, TB, TE, T2h, T31, T2a, T1e;
+		    E TC, T1X, TH, TK, TG, TD, TJ;
+		    {
+			 E Ta, Td, Tb, T1J, Tg, Tj, Tf, Tc, Ti;
+			 {
+			      E T1h, T1k, T1n, T2k, T1i, T1q, T1m, T1j, T1p;
+			      {
+				   E T1, T3n, T3, T6, T2, T5;
+				   T1 = ri[0];
+				   T3n = ii[0];
+				   T3 = ri[WS(rs, 8)];
+				   T6 = ii[WS(rs, 8)];
+				   T2 = W[14];
+				   T5 = W[15];
+				   {
+					E T3l, T4, T1g, T3m, T7;
+					T1h = ri[WS(rs, 15)];
+					T1k = ii[WS(rs, 15)];
+					T3l = T2 * T6;
+					T4 = T2 * T3;
+					T1g = W[28];
+					T1n = ri[WS(rs, 7)];
+					T3m = FNMS(T5, T3, T3l);
+					T7 = FMA(T5, T6, T4);
+					T2k = T1g * T1k;
+					T1i = T1g * T1h;
+					T3z = T3n - T3m;
+					T3o = T3m + T3n;
+					T8 = T1 + T7;
+					T1I = T1 - T7;
+					T1q = ii[WS(rs, 7)];
+					T1m = W[12];
+				   }
+				   T1j = W[29];
+				   T1p = W[13];
+			      }
+			      {
+				   E T1u, T1x, T1v, T2s, T1A, T1D, T1z, T1w, T1C;
+				   {
+					E T2l, T1l, T2n, T1r, T2m, T1o, T1t;
+					T1u = ri[WS(rs, 3)];
+					T2m = T1m * T1q;
+					T1o = T1m * T1n;
+					T2l = FNMS(T1j, T1h, T2k);
+					T1l = FMA(T1j, T1k, T1i);
+					T2n = FNMS(T1p, T1n, T2m);
+					T1r = FMA(T1p, T1q, T1o);
+					T1x = ii[WS(rs, 3)];
+					T1t = W[4];
+					T2o = T2l - T2n;
+					T35 = T2l + T2n;
+					T2r = T1l - T1r;
+					T1s = T1l + T1r;
+					T1v = T1t * T1u;
+					T2s = T1t * T1x;
+				   }
+				   T1A = ri[WS(rs, 11)];
+				   T1D = ii[WS(rs, 11)];
+				   T1z = W[20];
+				   T1w = W[5];
+				   T1C = W[21];
+				   {
+					E T2t, T1y, T2v, T1E, T2u, T1B, T9;
+					Ta = ri[WS(rs, 4)];
+					T2u = T1z * T1D;
+					T1B = T1z * T1A;
+					T2t = FNMS(T1w, T1u, T2s);
+					T1y = FMA(T1w, T1x, T1v);
+					T2v = FNMS(T1C, T1A, T2u);
+					T1E = FMA(T1C, T1D, T1B);
+					Td = ii[WS(rs, 4)];
+					T9 = W[6];
+					T2w = T2t - T2v;
+					T36 = T2t + T2v;
+					T2p = T1y - T1E;
+					T1F = T1y + T1E;
+					Tb = T9 * Ta;
+					T1J = T9 * Td;
+				   }
+				   Tg = ri[WS(rs, 12)];
+				   Tj = ii[WS(rs, 12)];
+				   Tf = W[22];
+				   Tc = W[7];
+				   Ti = W[23];
+			      }
+			 }
+			 {
+			      E TQ, TT, TR, T25, TW, TZ, TV, TS, TY;
+			      {
+				   E To, Tr, Tp, T1P, Tu, Tx, Tt, Tq, Tw;
+				   {
+					E T1K, Te, T1M, Tk, T1L, Th, Tn;
+					To = ri[WS(rs, 2)];
+					T1L = Tf * Tj;
+					Th = Tf * Tg;
+					T1K = FNMS(Tc, Ta, T1J);
+					Te = FMA(Tc, Td, Tb);
+					T1M = FNMS(Ti, Tg, T1L);
+					Tk = FMA(Ti, Tj, Th);
+					Tr = ii[WS(rs, 2)];
+					Tn = W[2];
+					T3k = T1K + T1M;
+					T1N = T1K - T1M;
+					T3A = Te - Tk;
+					Tl = Te + Tk;
+					Tp = Tn * To;
+					T1P = Tn * Tr;
+				   }
+				   Tu = ri[WS(rs, 10)];
+				   Tx = ii[WS(rs, 10)];
+				   Tt = W[18];
+				   Tq = W[3];
+				   Tw = W[19];
+				   {
+					E T1Q, Ts, T1S, Ty, T1R, Tv, TP;
+					TQ = ri[WS(rs, 1)];
+					T1R = Tt * Tx;
+					Tv = Tt * Tu;
+					T1Q = FNMS(Tq, To, T1P);
+					Ts = FMA(Tq, Tr, Tp);
+					T1S = FNMS(Tw, Tu, T1R);
+					Ty = FMA(Tw, Tx, Tv);
+					TT = ii[WS(rs, 1)];
+					TP = W[0];
+					T1T = T1Q - T1S;
+					T2V = T1Q + T1S;
+					T1U = Ts - Ty;
+					Tz = Ts + Ty;
+					TR = TP * TQ;
+					T25 = TP * TT;
+				   }
+				   TW = ri[WS(rs, 9)];
+				   TZ = ii[WS(rs, 9)];
+				   TV = W[16];
+				   TS = W[1];
+				   TY = W[17];
+			      }
+			      {
+				   E T13, T16, T14, T2d, T19, T1c, T18, T15, T1b;
+				   {
+					E T26, TU, T28, T10, T27, TX, T12;
+					T13 = ri[WS(rs, 5)];
+					T27 = TV * TZ;
+					TX = TV * TW;
+					T26 = FNMS(TS, TQ, T25);
+					TU = FMA(TS, TT, TR);
+					T28 = FNMS(TY, TW, T27);
+					T10 = FMA(TY, TZ, TX);
+					T16 = ii[WS(rs, 5)];
+					T12 = W[8];
+					T29 = T26 - T28;
+					T30 = T26 + T28;
+					T2c = TU - T10;
+					T11 = TU + T10;
+					T14 = T12 * T13;
+					T2d = T12 * T16;
+				   }
+				   T19 = ri[WS(rs, 13)];
+				   T1c = ii[WS(rs, 13)];
+				   T18 = W[24];
+				   T15 = W[9];
+				   T1b = W[25];
+				   {
+					E T2e, T17, T2g, T1d, T2f, T1a, TA;
+					TB = ri[WS(rs, 14)];
+					T2f = T18 * T1c;
+					T1a = T18 * T19;
+					T2e = FNMS(T15, T13, T2d);
+					T17 = FMA(T15, T16, T14);
+					T2g = FNMS(T1b, T19, T2f);
+					T1d = FMA(T1b, T1c, T1a);
+					TE = ii[WS(rs, 14)];
+					TA = W[26];
+					T2h = T2e - T2g;
+					T31 = T2e + T2g;
+					T2a = T17 - T1d;
+					T1e = T17 + T1d;
+					TC = TA * TB;
+					T1X = TA * TE;
+				   }
+				   TH = ri[WS(rs, 6)];
+				   TK = ii[WS(rs, 6)];
+				   TG = W[10];
+				   TD = W[27];
+				   TJ = W[11];
+			      }
+			 }
+		    }
+		    {
+			 E T2U, T3u, T2Z, T21, T1W, T34, T2X, T3f, T32, T3t, T1H, T3q, T3e, TO, T3g;
+			 E T37, T3r, T3s, T3h, T3i;
+			 {
+			      E Tm, T1Y, TF, T20, TL, T3p, T1Z, TI;
+			      T2U = T8 - Tl;
+			      Tm = T8 + Tl;
+			      T1Z = TG * TK;
+			      TI = TG * TH;
+			      T1Y = FNMS(TD, TB, T1X);
+			      TF = FMA(TD, TE, TC);
+			      T20 = FNMS(TJ, TH, T1Z);
+			      TL = FMA(TJ, TK, TI);
+			      T3p = T3k + T3o;
+			      T3u = T3o - T3k;
+			      {
+				   E T1f, TM, T1G, T3j, T2W, TN;
+				   T2Z = T11 - T1e;
+				   T1f = T11 + T1e;
+				   T21 = T1Y - T20;
+				   T2W = T1Y + T20;
+				   T1W = TF - TL;
+				   TM = TF + TL;
+				   T1G = T1s + T1F;
+				   T34 = T1s - T1F;
+				   T2X = T2V - T2W;
+				   T3j = T2V + T2W;
+				   T3f = T30 + T31;
+				   T32 = T30 - T31;
+				   T3t = TM - Tz;
+				   TN = Tz + TM;
+				   T3r = T1G - T1f;
+				   T1H = T1f + T1G;
+				   T3s = T3p - T3j;
+				   T3q = T3j + T3p;
+				   T3e = Tm - TN;
+				   TO = Tm + TN;
+				   T3g = T35 + T36;
+				   T37 = T35 - T36;
+			      }
+			 }
+			 ii[WS(rs, 12)] = T3s - T3r;
+			 ii[WS(rs, 4)] = T3r + T3s;
+			 ri[0] = TO + T1H;
+			 ri[WS(rs, 8)] = TO - T1H;
+			 T3h = T3f - T3g;
+			 T3i = T3f + T3g;
+			 {
+			      E T3a, T2Y, T3x, T3v, T3b, T33;
+			      ii[0] = T3i + T3q;
+			      ii[WS(rs, 8)] = T3q - T3i;
+			      ri[WS(rs, 4)] = T3e + T3h;
+			      ri[WS(rs, 12)] = T3e - T3h;
+			      T3a = T2U - T2X;
+			      T2Y = T2U + T2X;
+			      T3x = T3u - T3t;
+			      T3v = T3t + T3u;
+			      T3b = T32 - T2Z;
+			      T33 = T2Z + T32;
+			      {
+				   E T2E, T1O, T3B, T3H, T2x, T2q, T3C, T23, T2S, T2O, T2K, T2J, T3I, T2H, T2B;
+				   E T2j;
+				   {
+					E T2F, T1V, T22, T2G, T3c, T38;
+					T2E = T1I + T1N;
+					T1O = T1I - T1N;
+					T3B = T3z - T3A;
+					T3H = T3A + T3z;
+					T3c = T34 + T37;
+					T38 = T34 - T37;
+					T2F = T1U + T1T;
+					T1V = T1T - T1U;
+					{
+					     E T3d, T3w, T3y, T39;
+					     T3d = T3b - T3c;
+					     T3w = T3b + T3c;
+					     T3y = T38 - T33;
+					     T39 = T33 + T38;
+					     ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
+					     ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
+					     ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
+					     ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
+					     ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
+					     ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
+					     ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
+					     ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
+					     T22 = T1W + T21;
+					     T2G = T1W - T21;
+					}
+					{
+					     E T2M, T2N, T2b, T2i;
+					     T2x = T2r - T2w;
+					     T2M = T2r + T2w;
+					     T2N = T2o - T2p;
+					     T2q = T2o + T2p;
+					     T3C = T1V + T22;
+					     T23 = T1V - T22;
+					     T2S = FMA(KP414213562, T2M, T2N);
+					     T2O = FNMS(KP414213562, T2N, T2M);
+					     T2K = T29 - T2a;
+					     T2b = T29 + T2a;
+					     T2i = T2c - T2h;
+					     T2J = T2c + T2h;
+					     T3I = T2G - T2F;
+					     T2H = T2F + T2G;
+					     T2B = FNMS(KP414213562, T2b, T2i);
+					     T2j = FMA(KP414213562, T2i, T2b);
+					}
+				   }
+				   {
+					E T2R, T2L, T3L, T3M;
+					{
+					     E T2A, T24, T2C, T2y, T3J, T3K, T2D, T2z;
+					     T2A = FNMS(KP707106781, T23, T1O);
+					     T24 = FMA(KP707106781, T23, T1O);
+					     T2R = FNMS(KP414213562, T2J, T2K);
+					     T2L = FMA(KP414213562, T2K, T2J);
+					     T2C = FMA(KP414213562, T2q, T2x);
+					     T2y = FNMS(KP414213562, T2x, T2q);
+					     T3J = FMA(KP707106781, T3I, T3H);
+					     T3L = FNMS(KP707106781, T3I, T3H);
+					     T3K = T2C - T2B;
+					     T2D = T2B + T2C;
+					     T3M = T2j + T2y;
+					     T2z = T2j - T2y;
+					     ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
+					     ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
+					     ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
+					     ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
+					     ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
+					     ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
+					}
+					{
+					     E T2Q, T3D, T3E, T2T, T2I, T2P;
+					     T2Q = FNMS(KP707106781, T2H, T2E);
+					     T2I = FMA(KP707106781, T2H, T2E);
+					     T2P = T2L + T2O;
+					     T3G = T2O - T2L;
+					     T3F = FNMS(KP707106781, T3C, T3B);
+					     T3D = FMA(KP707106781, T3C, T3B);
+					     ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
+					     ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
+					     ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
+					     ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
+					     T3E = T2R + T2S;
+					     T2T = T2R - T2S;
+					     ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
+					     ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
+					     ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
+					     ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
+	       ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
+
+void X(codelet_t1_16) (planner *p) {
+     X(kdft_dit_register) (p, t1_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 52 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "t.h"
+
+static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
+	       E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
+	       E T2y, T2z, T1O, T2g, T1T, T2h;
+	       {
+		    E T1, T2T, T6, T2S;
+		    T1 = ri[0];
+		    T2T = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 8)];
+			 T5 = ii[WS(rs, 8)];
+			 T2 = W[14];
+			 T4 = W[15];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T2S = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T37 = T2T - T2S;
+		    T1t = T1 - T6;
+		    T2U = T2S + T2T;
+	       }
+	       {
+		    E Tc, T1u, Th, T1v;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 4)];
+			 Tb = ii[WS(rs, 4)];
+			 T8 = W[6];
+			 Ta = W[7];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T1u = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 12)];
+			 Tg = ii[WS(rs, 12)];
+			 Td = W[22];
+			 Tf = W[23];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T1v = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T38 = Tc - Th;
+		    T1w = T1u - T1v;
+		    T2R = T1u + T1v;
+	       }
+	       {
+		    E To, T1y, Tt, T1z, T1A, T1B;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = ri[WS(rs, 2)];
+			 Tn = ii[WS(rs, 2)];
+			 Tk = W[2];
+			 Tm = W[3];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T1y = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = ri[WS(rs, 10)];
+			 Ts = ii[WS(rs, 10)];
+			 Tp = W[18];
+			 Tr = W[19];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T1z = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T2s = T1y + T1z;
+		    T1A = T1y - T1z;
+		    T1B = To - Tt;
+		    T1C = T1A - T1B;
+		    T2c = T1B + T1A;
+	       }
+	       {
+		    E Tz, T1E, TE, T1F, T1D, T1G;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 14)];
+			 Ty = ii[WS(rs, 14)];
+			 Tv = W[26];
+			 Tx = W[27];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T1E = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 6)];
+			 TD = ii[WS(rs, 6)];
+			 TA = W[10];
+			 TC = W[11];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1F = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T2t = T1E + T1F;
+		    T1D = Tz - TE;
+		    T1G = T1E - T1F;
+		    T1H = T1D + T1G;
+		    T2d = T1D - T1G;
+	       }
+	       {
+		    E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = ri[WS(rs, 15)];
+			 T18 = ii[WS(rs, 15)];
+			 T15 = W[28];
+			 T17 = W[29];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T20 = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = ri[WS(rs, 11)];
+			 T1o = ii[WS(rs, 11)];
+			 T1l = W[20];
+			 T1n = W[21];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T1X = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = ri[WS(rs, 7)];
+			 T1d = ii[WS(rs, 7)];
+			 T1a = W[12];
+			 T1c = W[13];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T21 = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = ri[WS(rs, 3)];
+			 T1j = ii[WS(rs, 3)];
+			 T1g = W[4];
+			 T1i = W[5];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T1W = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    T1f = T19 + T1e;
+		    T1q = T1k + T1p;
+		    T2B = T1f - T1q;
+		    T2C = T20 + T21;
+		    T2D = T1W + T1X;
+		    T2E = T2C - T2D;
+		    {
+			 E T1V, T1Y, T22, T23;
+			 T1V = T19 - T1e;
+			 T1Y = T1W - T1X;
+			 T1Z = T1V - T1Y;
+			 T2j = T1V + T1Y;
+			 T22 = T20 - T21;
+			 T23 = T1k - T1p;
+			 T24 = T22 + T23;
+			 T2k = T22 - T23;
+		    }
+	       }
+	       {
+		    E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = ri[WS(rs, 1)];
+			 TL = ii[WS(rs, 1)];
+			 TI = W[0];
+			 TK = W[1];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T1K = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = ri[WS(rs, 13)];
+			 T11 = ii[WS(rs, 13)];
+			 TY = W[24];
+			 T10 = W[25];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T1R = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = ri[WS(rs, 9)];
+			 TQ = ii[WS(rs, 9)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1L = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = ri[WS(rs, 5)];
+			 TW = ii[WS(rs, 5)];
+			 TT = W[8];
+			 TV = W[9];
+			 TX = FMA(TT, TU, TV * TW);
+			 T1Q = FNMS(TV, TU, TT * TW);
+		    }
+		    TS = TM + TR;
+		    T13 = TX + T12;
+		    T2w = TS - T13;
+		    T2x = T1K + T1L;
+		    T2y = T1Q + T1R;
+		    T2z = T2x - T2y;
+		    {
+			 E T1M, T1N, T1P, T1S;
+			 T1M = T1K - T1L;
+			 T1N = TX - T12;
+			 T1O = T1M + T1N;
+			 T2g = T1M - T1N;
+			 T1P = TM - TR;
+			 T1S = T1Q - T1R;
+			 T1T = T1P - T1S;
+			 T2h = T1P + T1S;
+		    }
+	       }
+	       {
+		    E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
+		    {
+			 E T1x, T1I, T3e, T3f;
+			 T1x = T1t - T1w;
+			 T1I = KP707106781 * (T1C - T1H);
+			 T1J = T1x + T1I;
+			 T27 = T1x - T1I;
+			 T3e = KP707106781 * (T2d - T2c);
+			 T3f = T38 + T37;
+			 T3g = T3e + T3f;
+			 T3i = T3f - T3e;
+		    }
+		    {
+			 E T1U, T25, T28, T29;
+			 T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
+			 T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
+			 T26 = T1U + T25;
+			 T3h = T25 - T1U;
+			 T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
+			 T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
+			 T2a = T28 - T29;
+			 T3d = T28 + T29;
+		    }
+		    ri[WS(rs, 11)] = T1J - T26;
+		    ii[WS(rs, 11)] = T3g - T3d;
+		    ri[WS(rs, 3)] = T1J + T26;
+		    ii[WS(rs, 3)] = T3d + T3g;
+		    ri[WS(rs, 15)] = T27 - T2a;
+		    ii[WS(rs, 15)] = T3i - T3h;
+		    ri[WS(rs, 7)] = T27 + T2a;
+		    ii[WS(rs, 7)] = T3h + T3i;
+	       }
+	       {
+		    E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
+		    {
+			 E T2r, T2u, T30, T31;
+			 T2r = T7 - Ti;
+			 T2u = T2s - T2t;
+			 T2v = T2r + T2u;
+			 T2H = T2r - T2u;
+			 T30 = TF - Tu;
+			 T31 = T2U - T2R;
+			 T32 = T30 + T31;
+			 T34 = T31 - T30;
+		    }
+		    {
+			 E T2A, T2F, T2I, T2J;
+			 T2A = T2w + T2z;
+			 T2F = T2B - T2E;
+			 T2G = KP707106781 * (T2A + T2F);
+			 T33 = KP707106781 * (T2F - T2A);
+			 T2I = T2z - T2w;
+			 T2J = T2B + T2E;
+			 T2K = KP707106781 * (T2I - T2J);
+			 T2Z = KP707106781 * (T2I + T2J);
+		    }
+		    ri[WS(rs, 10)] = T2v - T2G;
+		    ii[WS(rs, 10)] = T32 - T2Z;
+		    ri[WS(rs, 2)] = T2v + T2G;
+		    ii[WS(rs, 2)] = T2Z + T32;
+		    ri[WS(rs, 14)] = T2H - T2K;
+		    ii[WS(rs, 14)] = T34 - T33;
+		    ri[WS(rs, 6)] = T2H + T2K;
+		    ii[WS(rs, 6)] = T33 + T34;
+	       }
+	       {
+		    E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
+		    {
+			 E T2b, T2e, T36, T39;
+			 T2b = T1t + T1w;
+			 T2e = KP707106781 * (T2c + T2d);
+			 T2f = T2b + T2e;
+			 T2n = T2b - T2e;
+			 T36 = KP707106781 * (T1C + T1H);
+			 T39 = T37 - T38;
+			 T3a = T36 + T39;
+			 T3c = T39 - T36;
+		    }
+		    {
+			 E T2i, T2l, T2o, T2p;
+			 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
+			 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
+			 T2m = T2i + T2l;
+			 T3b = T2l - T2i;
+			 T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
+			 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
+			 T2q = T2o - T2p;
+			 T35 = T2o + T2p;
+		    }
+		    ri[WS(rs, 9)] = T2f - T2m;
+		    ii[WS(rs, 9)] = T3a - T35;
+		    ri[WS(rs, 1)] = T2f + T2m;
+		    ii[WS(rs, 1)] = T35 + T3a;
+		    ri[WS(rs, 13)] = T2n - T2q;
+		    ii[WS(rs, 13)] = T3c - T3b;
+		    ri[WS(rs, 5)] = T2n + T2q;
+		    ii[WS(rs, 5)] = T3b + T3c;
+	       }
+	       {
+		    E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
+		    {
+			 E Tj, TG, T2Q, T2V;
+			 Tj = T7 + Ti;
+			 TG = Tu + TF;
+			 TH = Tj + TG;
+			 T2L = Tj - TG;
+			 T2Q = T2s + T2t;
+			 T2V = T2R + T2U;
+			 T2W = T2Q + T2V;
+			 T2Y = T2V - T2Q;
+		    }
+		    {
+			 E T14, T1r, T2M, T2N;
+			 T14 = TS + T13;
+			 T1r = T1f + T1q;
+			 T1s = T14 + T1r;
+			 T2X = T1r - T14;
+			 T2M = T2x + T2y;
+			 T2N = T2C + T2D;
+			 T2O = T2M - T2N;
+			 T2P = T2M + T2N;
+		    }
+		    ri[WS(rs, 8)] = TH - T1s;
+		    ii[WS(rs, 8)] = T2W - T2P;
+		    ri[0] = TH + T1s;
+		    ii[0] = T2P + T2W;
+		    ri[WS(rs, 12)] = T2L - T2O;
+		    ii[WS(rs, 12)] = T2Y - T2X;
+		    ri[WS(rs, 4)] = T2L + T2O;
+		    ii[WS(rs, 4)] = T2X + T2Y;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
+
+void X(codelet_t1_16) (planner *p) {
+     X(kdft_dit_register) (p, t1_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include t.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t.h"
+
+static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T1, Ta, T3, T6, T2, T5;
+	       T1 = ri[0];
+	       Ta = ii[0];
+	       T3 = ri[WS(rs, 1)];
+	       T6 = ii[WS(rs, 1)];
+	       T2 = W[0];
+	       T5 = W[1];
+	       {
+		    E T8, T4, T9, T7;
+		    T8 = T2 * T6;
+		    T4 = T2 * T3;
+		    T9 = FNMS(T5, T3, T8);
+		    T7 = FMA(T5, T6, T4);
+		    ii[0] = T9 + Ta;
+		    ii[WS(rs, 1)] = Ta - T9;
+		    ri[0] = T1 + T7;
+		    ri[WS(rs, 1)] = T1 - T7;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, {4, 2, 2, 0}, 0, 0, 0 };
+
+void X(codelet_t1_2) (planner *p) {
+     X(kdft_dit_register) (p, t1_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include t.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t.h"
+
+static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T1, T8, T6, T7;
+	       T1 = ri[0];
+	       T8 = ii[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = ri[WS(rs, 1)];
+		    T5 = ii[WS(rs, 1)];
+		    T2 = W[0];
+		    T4 = W[1];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    T7 = FNMS(T4, T3, T2 * T5);
+	       }
+	       ri[WS(rs, 1)] = T1 - T6;
+	       ii[WS(rs, 1)] = T8 - T7;
+	       ri[0] = T1 + T6;
+	       ii[0] = T7 + T8;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, {4, 2, 2, 0}, 0, 0, 0 };
+
+void X(codelet_t1_2) (planner *p) {
+     X(kdft_dit_register) (p, t1_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:53 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include t.h */
+
+/*
+ * This function contains 246 FP additions, 148 FP multiplications,
+ * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
+ * 97 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "t.h"
+
+static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T4P, T4Y, T50, T4U, T4S, T4T, T4Z, T4V;
+	       {
+		    E T4N, T4r, T8, T2i, T4n, T2n, T4O, Tl, T2v, T3v, T40, T4b, TN, T2b, T3F;
+		    E T3i, T2R, T3z, T3W, T4f, T27, T2f, T3J, T3a, T2K, T3y, T3T, T4e, T1G, T2e;
+		    E T3I, T33, T2C, T3w, T43, T4c, T1e, T2c, T3G, T3p;
+		    {
+			 E T1, T4q, T3, T6, T2, T5;
+			 T1 = ri[0];
+			 T4q = ii[0];
+			 T3 = ri[WS(rs, 10)];
+			 T6 = ii[WS(rs, 10)];
+			 T2 = W[18];
+			 T5 = W[19];
+			 {
+			      E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T4o, T4, T9, T4p, T7;
+				   Ta = ri[WS(rs, 5)];
+				   Td = ii[WS(rs, 5)];
+				   T4o = T2 * T6;
+				   T4 = T2 * T3;
+				   T9 = W[8];
+				   Tg = ri[WS(rs, 15)];
+				   T4p = FNMS(T5, T3, T4o);
+				   T7 = FMA(T5, T6, T4);
+				   T2j = T9 * Td;
+				   Tb = T9 * Ta;
+				   T4N = T4q - T4p;
+				   T4r = T4p + T4q;
+				   T8 = T1 + T7;
+				   T2i = T1 - T7;
+				   Tj = ii[WS(rs, 15)];
+				   Tf = W[28];
+			      }
+			      Tc = W[9];
+			      Ti = W[29];
+			      {
+				   E T3d, Ts, T2t, TL, TB, TE, TD, T3f, Ty, T2q, TC;
+				   {
+					E TH, TK, TJ, T2s, TI;
+					{
+					     E To, Tr, Tp, T3c, Tq, TG;
+					     {
+						  E T2k, Te, T2m, Tk, T2l, Th, Tn;
+						  To = ri[WS(rs, 4)];
+						  T2l = Tf * Tj;
+						  Th = Tf * Tg;
+						  T2k = FNMS(Tc, Ta, T2j);
+						  Te = FMA(Tc, Td, Tb);
+						  T2m = FNMS(Ti, Tg, T2l);
+						  Tk = FMA(Ti, Tj, Th);
+						  Tr = ii[WS(rs, 4)];
+						  Tn = W[6];
+						  T4n = T2k + T2m;
+						  T2n = T2k - T2m;
+						  T4O = Te - Tk;
+						  Tl = Te + Tk;
+						  Tp = Tn * To;
+						  T3c = Tn * Tr;
+					     }
+					     Tq = W[7];
+					     TH = ri[WS(rs, 19)];
+					     TK = ii[WS(rs, 19)];
+					     TG = W[36];
+					     T3d = FNMS(Tq, To, T3c);
+					     Ts = FMA(Tq, Tr, Tp);
+					     TJ = W[37];
+					     T2s = TG * TK;
+					     TI = TG * TH;
+					}
+					{
+					     E Tu, Tx, Tt, Tw, T3e, Tv, TA;
+					     Tu = ri[WS(rs, 14)];
+					     Tx = ii[WS(rs, 14)];
+					     T2t = FNMS(TJ, TH, T2s);
+					     TL = FMA(TJ, TK, TI);
+					     Tt = W[26];
+					     Tw = W[27];
+					     TB = ri[WS(rs, 9)];
+					     TE = ii[WS(rs, 9)];
+					     T3e = Tt * Tx;
+					     Tv = Tt * Tu;
+					     TA = W[16];
+					     TD = W[17];
+					     T3f = FNMS(Tw, Tu, T3e);
+					     Ty = FMA(Tw, Tx, Tv);
+					     T2q = TA * TE;
+					     TC = TA * TB;
+					}
+				   }
+				   {
+					E T3g, T3Y, Tz, T2p, T2r, TF;
+					T3g = T3d - T3f;
+					T3Y = T3d + T3f;
+					Tz = Ts + Ty;
+					T2p = Ts - Ty;
+					T2r = FNMS(TD, TB, T2q);
+					TF = FMA(TD, TE, TC);
+					{
+					     E T3Z, T2u, T3h, TM;
+					     T3Z = T2r + T2t;
+					     T2u = T2r - T2t;
+					     T3h = TF - TL;
+					     TM = TF + TL;
+					     T2v = T2p - T2u;
+					     T3v = T2p + T2u;
+					     T40 = T3Y - T3Z;
+					     T4b = T3Y + T3Z;
+					     TN = Tz - TM;
+					     T2b = Tz + TM;
+					     T3F = T3g - T3h;
+					     T3i = T3g + T3h;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T35, T1M, T2P, T25, T1V, T1Y, T1X, T37, T1S, T2M, T1W;
+			 {
+			      E T21, T24, T23, T2O, T22;
+			      {
+				   E T1I, T1L, T1H, T1K, T34, T1J, T20;
+				   T1I = ri[WS(rs, 12)];
+				   T1L = ii[WS(rs, 12)];
+				   T1H = W[22];
+				   T1K = W[23];
+				   T21 = ri[WS(rs, 7)];
+				   T24 = ii[WS(rs, 7)];
+				   T34 = T1H * T1L;
+				   T1J = T1H * T1I;
+				   T20 = W[12];
+				   T23 = W[13];
+				   T35 = FNMS(T1K, T1I, T34);
+				   T1M = FMA(T1K, T1L, T1J);
+				   T2O = T20 * T24;
+				   T22 = T20 * T21;
+			      }
+			      {
+				   E T1O, T1R, T1N, T1Q, T36, T1P, T1U;
+				   T1O = ri[WS(rs, 2)];
+				   T1R = ii[WS(rs, 2)];
+				   T2P = FNMS(T23, T21, T2O);
+				   T25 = FMA(T23, T24, T22);
+				   T1N = W[2];
+				   T1Q = W[3];
+				   T1V = ri[WS(rs, 17)];
+				   T1Y = ii[WS(rs, 17)];
+				   T36 = T1N * T1R;
+				   T1P = T1N * T1O;
+				   T1U = W[32];
+				   T1X = W[33];
+				   T37 = FNMS(T1Q, T1O, T36);
+				   T1S = FMA(T1Q, T1R, T1P);
+				   T2M = T1U * T1Y;
+				   T1W = T1U * T1V;
+			      }
+			 }
+			 {
+			      E T38, T3U, T1T, T2L, T2N, T1Z;
+			      T38 = T35 - T37;
+			      T3U = T35 + T37;
+			      T1T = T1M + T1S;
+			      T2L = T1M - T1S;
+			      T2N = FNMS(T1X, T1V, T2M);
+			      T1Z = FMA(T1X, T1Y, T1W);
+			      {
+				   E T3V, T2Q, T39, T26;
+				   T3V = T2N + T2P;
+				   T2Q = T2N - T2P;
+				   T39 = T1Z - T25;
+				   T26 = T1Z + T25;
+				   T2R = T2L - T2Q;
+				   T3z = T2L + T2Q;
+				   T3W = T3U - T3V;
+				   T4f = T3U + T3V;
+				   T27 = T1T - T26;
+				   T2f = T1T + T26;
+				   T3J = T38 - T39;
+				   T3a = T38 + T39;
+			      }
+			 }
+		    }
+		    {
+			 E T2Y, T1l, T2I, T1E, T1u, T1x, T1w, T30, T1r, T2F, T1v;
+			 {
+			      E T1A, T1D, T1C, T2H, T1B;
+			      {
+				   E T1h, T1k, T1g, T1j, T2X, T1i, T1z;
+				   T1h = ri[WS(rs, 8)];
+				   T1k = ii[WS(rs, 8)];
+				   T1g = W[14];
+				   T1j = W[15];
+				   T1A = ri[WS(rs, 3)];
+				   T1D = ii[WS(rs, 3)];
+				   T2X = T1g * T1k;
+				   T1i = T1g * T1h;
+				   T1z = W[4];
+				   T1C = W[5];
+				   T2Y = FNMS(T1j, T1h, T2X);
+				   T1l = FMA(T1j, T1k, T1i);
+				   T2H = T1z * T1D;
+				   T1B = T1z * T1A;
+			      }
+			      {
+				   E T1n, T1q, T1m, T1p, T2Z, T1o, T1t;
+				   T1n = ri[WS(rs, 18)];
+				   T1q = ii[WS(rs, 18)];
+				   T2I = FNMS(T1C, T1A, T2H);
+				   T1E = FMA(T1C, T1D, T1B);
+				   T1m = W[34];
+				   T1p = W[35];
+				   T1u = ri[WS(rs, 13)];
+				   T1x = ii[WS(rs, 13)];
+				   T2Z = T1m * T1q;
+				   T1o = T1m * T1n;
+				   T1t = W[24];
+				   T1w = W[25];
+				   T30 = FNMS(T1p, T1n, T2Z);
+				   T1r = FMA(T1p, T1q, T1o);
+				   T2F = T1t * T1x;
+				   T1v = T1t * T1u;
+			      }
+			 }
+			 {
+			      E T31, T3R, T1s, T2E, T2G, T1y;
+			      T31 = T2Y - T30;
+			      T3R = T2Y + T30;
+			      T1s = T1l + T1r;
+			      T2E = T1l - T1r;
+			      T2G = FNMS(T1w, T1u, T2F);
+			      T1y = FMA(T1w, T1x, T1v);
+			      {
+				   E T3S, T2J, T32, T1F;
+				   T3S = T2G + T2I;
+				   T2J = T2G - T2I;
+				   T32 = T1y - T1E;
+				   T1F = T1y + T1E;
+				   T2K = T2E - T2J;
+				   T3y = T2E + T2J;
+				   T3T = T3R - T3S;
+				   T4e = T3R + T3S;
+				   T1G = T1s - T1F;
+				   T2e = T1s + T1F;
+				   T3I = T31 - T32;
+				   T33 = T31 + T32;
+			      }
+			 }
+		    }
+		    {
+			 E T3k, TT, T2A, T1c, T12, T15, T14, T3m, TZ, T2x, T13;
+			 {
+			      E T18, T1b, T1a, T2z, T19;
+			      {
+				   E TP, TS, TO, TR, T3j, TQ, T17;
+				   TP = ri[WS(rs, 16)];
+				   TS = ii[WS(rs, 16)];
+				   TO = W[30];
+				   TR = W[31];
+				   T18 = ri[WS(rs, 11)];
+				   T1b = ii[WS(rs, 11)];
+				   T3j = TO * TS;
+				   TQ = TO * TP;
+				   T17 = W[20];
+				   T1a = W[21];
+				   T3k = FNMS(TR, TP, T3j);
+				   TT = FMA(TR, TS, TQ);
+				   T2z = T17 * T1b;
+				   T19 = T17 * T18;
+			      }
+			      {
+				   E TV, TY, TU, TX, T3l, TW, T11;
+				   TV = ri[WS(rs, 6)];
+				   TY = ii[WS(rs, 6)];
+				   T2A = FNMS(T1a, T18, T2z);
+				   T1c = FMA(T1a, T1b, T19);
+				   TU = W[10];
+				   TX = W[11];
+				   T12 = ri[WS(rs, 1)];
+				   T15 = ii[WS(rs, 1)];
+				   T3l = TU * TY;
+				   TW = TU * TV;
+				   T11 = W[0];
+				   T14 = W[1];
+				   T3m = FNMS(TX, TV, T3l);
+				   TZ = FMA(TX, TY, TW);
+				   T2x = T11 * T15;
+				   T13 = T11 * T12;
+			      }
+			 }
+			 {
+			      E T3n, T41, T10, T2w, T2y, T16;
+			      T3n = T3k - T3m;
+			      T41 = T3k + T3m;
+			      T10 = TT + TZ;
+			      T2w = TT - TZ;
+			      T2y = FNMS(T14, T12, T2x);
+			      T16 = FMA(T14, T15, T13);
+			      {
+				   E T42, T2B, T3o, T1d;
+				   T42 = T2y + T2A;
+				   T2B = T2y - T2A;
+				   T3o = T16 - T1c;
+				   T1d = T16 + T1c;
+				   T2C = T2w - T2B;
+				   T3w = T2w + T2B;
+				   T43 = T41 - T42;
+				   T4c = T41 + T42;
+				   T1e = T10 - T1d;
+				   T2c = T10 + T1d;
+				   T3G = T3n - T3o;
+				   T3p = T3n + T3o;
+			      }
+			 }
+		    }
+		    {
+			 E T4s, T4k, T4l, T4h, T4j, T49, T4y, T4A, T48;
+			 {
+			      E T4D, T4C, T2a, T47, T45, T4B, T4M, T4K, T46, T3Q;
+			      {
+				   E Tm, T1f, T4J, T4I, T28, T3X, T44, T29, T3P, T3O;
+				   T4D = T3T + T3W;
+				   T3X = T3T - T3W;
+				   T44 = T40 - T43;
+				   T4C = T40 + T43;
+				   T2a = T8 + Tl;
+				   Tm = T8 - Tl;
+				   T1f = TN + T1e;
+				   T4J = TN - T1e;
+				   T4I = T1G - T27;
+				   T28 = T1G + T27;
+				   T47 = FMA(KP618033988, T3X, T44);
+				   T45 = FNMS(KP618033988, T44, T3X);
+				   T29 = T1f + T28;
+				   T3P = T1f - T28;
+				   T4B = T4r - T4n;
+				   T4s = T4n + T4r;
+				   ri[WS(rs, 10)] = Tm + T29;
+				   T3O = FNMS(KP250000000, T29, Tm);
+				   T4M = FMA(KP618033988, T4I, T4J);
+				   T4K = FNMS(KP618033988, T4J, T4I);
+				   T46 = FMA(KP559016994, T3P, T3O);
+				   T3Q = FNMS(KP559016994, T3P, T3O);
+			      }
+			      {
+				   E T2d, T4w, T4x, T2g, T2h;
+				   {
+					E T4d, T4G, T4F, T4g, T4E, T4L, T4H;
+					T4k = T4b + T4c;
+					T4d = T4b - T4c;
+					T4G = T4C - T4D;
+					T4E = T4C + T4D;
+					ri[WS(rs, 18)] = FMA(KP951056516, T45, T3Q);
+					ri[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
+					ri[WS(rs, 6)] = FMA(KP951056516, T47, T46);
+					ri[WS(rs, 14)] = FNMS(KP951056516, T47, T46);
+					ii[WS(rs, 10)] = T4E + T4B;
+					T4F = FNMS(KP250000000, T4E, T4B);
+					T4g = T4e - T4f;
+					T4l = T4e + T4f;
+					T2d = T2b + T2c;
+					T4w = T2b - T2c;
+					T4L = FMA(KP559016994, T4G, T4F);
+					T4H = FNMS(KP559016994, T4G, T4F);
+					T4h = FMA(KP618033988, T4g, T4d);
+					T4j = FNMS(KP618033988, T4d, T4g);
+					ii[WS(rs, 18)] = FNMS(KP951056516, T4K, T4H);
+					ii[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
+					ii[WS(rs, 14)] = FMA(KP951056516, T4M, T4L);
+					ii[WS(rs, 6)] = FNMS(KP951056516, T4M, T4L);
+					T4x = T2e - T2f;
+					T2g = T2e + T2f;
+				   }
+				   T2h = T2d + T2g;
+				   T49 = T2d - T2g;
+				   T4y = FMA(KP618033988, T4x, T4w);
+				   T4A = FNMS(KP618033988, T4w, T4x);
+				   ri[0] = T2a + T2h;
+				   T48 = FNMS(KP250000000, T2h, T2a);
+			      }
+			 }
+			 {
+			      E T3u, T51, T5a, T5c, T56, T54;
+			      {
+				   E T53, T52, T3t, T3r, T2o, T59, T58, T2T, T2V, T4u, T4t, T2U, T3s, T2W;
+				   {
+					E T3b, T3q, T4i, T4a, T4m;
+					T53 = T33 + T3a;
+					T3b = T33 - T3a;
+					T3q = T3i - T3p;
+					T52 = T3i + T3p;
+					T4i = FNMS(KP559016994, T49, T48);
+					T4a = FMA(KP559016994, T49, T48);
+					T4m = T4k + T4l;
+					T4u = T4k - T4l;
+					ri[WS(rs, 16)] = FMA(KP951056516, T4h, T4a);
+					ri[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
+					ri[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
+					ri[WS(rs, 12)] = FNMS(KP951056516, T4j, T4i);
+					ii[0] = T4m + T4s;
+					T4t = FNMS(KP250000000, T4m, T4s);
+					T3t = FMA(KP618033988, T3b, T3q);
+					T3r = FNMS(KP618033988, T3q, T3b);
+				   }
+				   T3u = T2i + T2n;
+				   T2o = T2i - T2n;
+				   {
+					E T4v, T4z, T2D, T2S;
+					T4v = FMA(KP559016994, T4u, T4t);
+					T4z = FNMS(KP559016994, T4u, T4t);
+					T2D = T2v + T2C;
+					T59 = T2v - T2C;
+					T58 = T2K - T2R;
+					T2S = T2K + T2R;
+					ii[WS(rs, 16)] = FNMS(KP951056516, T4y, T4v);
+					ii[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
+					ii[WS(rs, 12)] = FMA(KP951056516, T4A, T4z);
+					ii[WS(rs, 8)] = FNMS(KP951056516, T4A, T4z);
+					T2T = T2D + T2S;
+					T2V = T2D - T2S;
+				   }
+				   ri[WS(rs, 15)] = T2o + T2T;
+				   T2U = FNMS(KP250000000, T2T, T2o);
+				   T51 = T4O + T4N;
+				   T4P = T4N - T4O;
+				   T5a = FNMS(KP618033988, T59, T58);
+				   T5c = FMA(KP618033988, T58, T59);
+				   T3s = FMA(KP559016994, T2V, T2U);
+				   T2W = FNMS(KP559016994, T2V, T2U);
+				   ri[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
+				   ri[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
+				   ri[WS(rs, 19)] = FNMS(KP951056516, T3t, T3s);
+				   ri[WS(rs, 11)] = FMA(KP951056516, T3t, T3s);
+				   T56 = T52 - T53;
+				   T54 = T52 + T53;
+			      }
+			      {
+				   E T4Q, T4R, T3N, T3L, T4W, T4X, T3B, T3D, T3H, T3K, T55, T3C, T3M, T3E;
+				   T4Q = T3F + T3G;
+				   T3H = T3F - T3G;
+				   T3K = T3I - T3J;
+				   T4R = T3I + T3J;
+				   ii[WS(rs, 15)] = T54 + T51;
+				   T55 = FNMS(KP250000000, T54, T51);
+				   T3N = FNMS(KP618033988, T3H, T3K);
+				   T3L = FMA(KP618033988, T3K, T3H);
+				   {
+					E T57, T5b, T3x, T3A;
+					T57 = FNMS(KP559016994, T56, T55);
+					T5b = FMA(KP559016994, T56, T55);
+					T3x = T3v + T3w;
+					T4W = T3v - T3w;
+					T4X = T3y - T3z;
+					T3A = T3y + T3z;
+					ii[WS(rs, 7)] = FMA(KP951056516, T5a, T57);
+					ii[WS(rs, 3)] = FNMS(KP951056516, T5a, T57);
+					ii[WS(rs, 19)] = FMA(KP951056516, T5c, T5b);
+					ii[WS(rs, 11)] = FNMS(KP951056516, T5c, T5b);
+					T3B = T3x + T3A;
+					T3D = T3x - T3A;
+				   }
+				   ri[WS(rs, 5)] = T3u + T3B;
+				   T3C = FNMS(KP250000000, T3B, T3u);
+				   T4Y = FMA(KP618033988, T4X, T4W);
+				   T50 = FNMS(KP618033988, T4W, T4X);
+				   T3M = FNMS(KP559016994, T3D, T3C);
+				   T3E = FMA(KP559016994, T3D, T3C);
+				   ri[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
+				   ri[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
+				   ri[WS(rs, 17)] = FNMS(KP951056516, T3N, T3M);
+				   ri[WS(rs, 13)] = FMA(KP951056516, T3N, T3M);
+				   T4U = T4Q - T4R;
+				   T4S = T4Q + T4R;
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 5)] = T4S + T4P;
+	       T4T = FNMS(KP250000000, T4S, T4P);
+	       T4Z = FNMS(KP559016994, T4U, T4T);
+	       T4V = FMA(KP559016994, T4U, T4T);
+	       ii[WS(rs, 9)] = FMA(KP951056516, T4Y, T4V);
+	       ii[WS(rs, 1)] = FNMS(KP951056516, T4Y, T4V);
+	       ii[WS(rs, 17)] = FMA(KP951056516, T50, T4Z);
+	       ii[WS(rs, 13)] = FNMS(KP951056516, T50, T4Z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {136, 38, 110, 0}, 0, 0, 0 };
+
+void X(codelet_t1_20) (planner *p) {
+     X(kdft_dit_register) (p, t1_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include t.h */
+
+/*
+ * This function contains 246 FP additions, 124 FP multiplications,
+ * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
+ * 85 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "t.h"
+
+static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E Tj, T1R, T4g, T4p, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T44, T3D;
+	       E T3E, T3K, T1V, T1W, T1X, T23, T28, T4r, T2W, T2X, T4c, T33, T34, T35, T2G;
+	       E T2L, T2M, TG, T13, T14, T3p, T3s, T43, T3A, T3B, T3J, T1S, T1T, T1U, T2e;
+	       E T2j, T4q, T2T, T2U, T4b, T30, T31, T32, T2v, T2A, T2B;
+	       {
+		    E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
+		    T1 = ri[0];
+		    T3O = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 10)];
+			 T5 = ii[WS(rs, 10)];
+			 T2 = W[18];
+			 T4 = W[19];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T3N = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 5)];
+			 Tb = ii[WS(rs, 5)];
+			 T8 = W[8];
+			 Ta = W[9];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T2n = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 15)];
+			 Tg = ii[WS(rs, 15)];
+			 Td = W[28];
+			 Tf = W[29];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T2o = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T4e, T4f;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 - Ti;
+			 T1R = T7 + Ti;
+			 T4e = T3O - T3N;
+			 T4f = Tc - Th;
+			 T4g = T4e - T4f;
+			 T4p = T4f + T4e;
+		    }
+		    {
+			 E T2m, T2p, T3M, T3P;
+			 T2m = T1 - T6;
+			 T2p = T2n - T2o;
+			 T2q = T2m - T2p;
+			 T37 = T2m + T2p;
+			 T3M = T2n + T2o;
+			 T3P = T3N + T3O;
+			 T3Q = T3M + T3P;
+			 T42 = T3P - T3M;
+		    }
+	       }
+	       {
+		    E T1f, T3g, T21, T2C, T1N, T3k, T27, T2K, T1q, T3h, T22, T2F, T1C, T3j, T26;
+		    E T2H;
+		    {
+			 E T19, T1Z, T1e, T20;
+			 {
+			      E T16, T18, T15, T17;
+			      T16 = ri[WS(rs, 8)];
+			      T18 = ii[WS(rs, 8)];
+			      T15 = W[14];
+			      T17 = W[15];
+			      T19 = FMA(T15, T16, T17 * T18);
+			      T1Z = FNMS(T17, T16, T15 * T18);
+			 }
+			 {
+			      E T1b, T1d, T1a, T1c;
+			      T1b = ri[WS(rs, 18)];
+			      T1d = ii[WS(rs, 18)];
+			      T1a = W[34];
+			      T1c = W[35];
+			      T1e = FMA(T1a, T1b, T1c * T1d);
+			      T20 = FNMS(T1c, T1b, T1a * T1d);
+			 }
+			 T1f = T19 + T1e;
+			 T3g = T1Z + T20;
+			 T21 = T1Z - T20;
+			 T2C = T19 - T1e;
+		    }
+		    {
+			 E T1H, T2I, T1M, T2J;
+			 {
+			      E T1E, T1G, T1D, T1F;
+			      T1E = ri[WS(rs, 17)];
+			      T1G = ii[WS(rs, 17)];
+			      T1D = W[32];
+			      T1F = W[33];
+			      T1H = FMA(T1D, T1E, T1F * T1G);
+			      T2I = FNMS(T1F, T1E, T1D * T1G);
+			 }
+			 {
+			      E T1J, T1L, T1I, T1K;
+			      T1J = ri[WS(rs, 7)];
+			      T1L = ii[WS(rs, 7)];
+			      T1I = W[12];
+			      T1K = W[13];
+			      T1M = FMA(T1I, T1J, T1K * T1L);
+			      T2J = FNMS(T1K, T1J, T1I * T1L);
+			 }
+			 T1N = T1H + T1M;
+			 T3k = T2I + T2J;
+			 T27 = T1H - T1M;
+			 T2K = T2I - T2J;
+		    }
+		    {
+			 E T1k, T2D, T1p, T2E;
+			 {
+			      E T1h, T1j, T1g, T1i;
+			      T1h = ri[WS(rs, 13)];
+			      T1j = ii[WS(rs, 13)];
+			      T1g = W[24];
+			      T1i = W[25];
+			      T1k = FMA(T1g, T1h, T1i * T1j);
+			      T2D = FNMS(T1i, T1h, T1g * T1j);
+			 }
+			 {
+			      E T1m, T1o, T1l, T1n;
+			      T1m = ri[WS(rs, 3)];
+			      T1o = ii[WS(rs, 3)];
+			      T1l = W[4];
+			      T1n = W[5];
+			      T1p = FMA(T1l, T1m, T1n * T1o);
+			      T2E = FNMS(T1n, T1m, T1l * T1o);
+			 }
+			 T1q = T1k + T1p;
+			 T3h = T2D + T2E;
+			 T22 = T1k - T1p;
+			 T2F = T2D - T2E;
+		    }
+		    {
+			 E T1w, T24, T1B, T25;
+			 {
+			      E T1t, T1v, T1s, T1u;
+			      T1t = ri[WS(rs, 12)];
+			      T1v = ii[WS(rs, 12)];
+			      T1s = W[22];
+			      T1u = W[23];
+			      T1w = FMA(T1s, T1t, T1u * T1v);
+			      T24 = FNMS(T1u, T1t, T1s * T1v);
+			 }
+			 {
+			      E T1y, T1A, T1x, T1z;
+			      T1y = ri[WS(rs, 2)];
+			      T1A = ii[WS(rs, 2)];
+			      T1x = W[2];
+			      T1z = W[3];
+			      T1B = FMA(T1x, T1y, T1z * T1A);
+			      T25 = FNMS(T1z, T1y, T1x * T1A);
+			 }
+			 T1C = T1w + T1B;
+			 T3j = T24 + T25;
+			 T26 = T24 - T25;
+			 T2H = T1w - T1B;
+		    }
+		    T1r = T1f - T1q;
+		    T1O = T1C - T1N;
+		    T1P = T1r + T1O;
+		    T3i = T3g - T3h;
+		    T3l = T3j - T3k;
+		    T44 = T3i + T3l;
+		    T3D = T3g + T3h;
+		    T3E = T3j + T3k;
+		    T3K = T3D + T3E;
+		    T1V = T1f + T1q;
+		    T1W = T1C + T1N;
+		    T1X = T1V + T1W;
+		    T23 = T21 + T22;
+		    T28 = T26 + T27;
+		    T4r = T23 + T28;
+		    T2W = T21 - T22;
+		    T2X = T26 - T27;
+		    T4c = T2W + T2X;
+		    T33 = T2C + T2F;
+		    T34 = T2H + T2K;
+		    T35 = T33 + T34;
+		    T2G = T2C - T2F;
+		    T2L = T2H - T2K;
+		    T2M = T2G + T2L;
+	       }
+	       {
+		    E Tu, T3n, T2c, T2r, T12, T3r, T2i, T2z, TF, T3o, T2d, T2u, TR, T3q, T2h;
+		    E T2w;
+		    {
+			 E To, T2a, Tt, T2b;
+			 {
+			      E Tl, Tn, Tk, Tm;
+			      Tl = ri[WS(rs, 4)];
+			      Tn = ii[WS(rs, 4)];
+			      Tk = W[6];
+			      Tm = W[7];
+			      To = FMA(Tk, Tl, Tm * Tn);
+			      T2a = FNMS(Tm, Tl, Tk * Tn);
+			 }
+			 {
+			      E Tq, Ts, Tp, Tr;
+			      Tq = ri[WS(rs, 14)];
+			      Ts = ii[WS(rs, 14)];
+			      Tp = W[26];
+			      Tr = W[27];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T2b = FNMS(Tr, Tq, Tp * Ts);
+			 }
+			 Tu = To + Tt;
+			 T3n = T2a + T2b;
+			 T2c = T2a - T2b;
+			 T2r = To - Tt;
+		    }
+		    {
+			 E TW, T2x, T11, T2y;
+			 {
+			      E TT, TV, TS, TU;
+			      TT = ri[WS(rs, 1)];
+			      TV = ii[WS(rs, 1)];
+			      TS = W[0];
+			      TU = W[1];
+			      TW = FMA(TS, TT, TU * TV);
+			      T2x = FNMS(TU, TT, TS * TV);
+			 }
+			 {
+			      E TY, T10, TX, TZ;
+			      TY = ri[WS(rs, 11)];
+			      T10 = ii[WS(rs, 11)];
+			      TX = W[20];
+			      TZ = W[21];
+			      T11 = FMA(TX, TY, TZ * T10);
+			      T2y = FNMS(TZ, TY, TX * T10);
+			 }
+			 T12 = TW + T11;
+			 T3r = T2x + T2y;
+			 T2i = TW - T11;
+			 T2z = T2x - T2y;
+		    }
+		    {
+			 E Tz, T2s, TE, T2t;
+			 {
+			      E Tw, Ty, Tv, Tx;
+			      Tw = ri[WS(rs, 9)];
+			      Ty = ii[WS(rs, 9)];
+			      Tv = W[16];
+			      Tx = W[17];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T2s = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 {
+			      E TB, TD, TA, TC;
+			      TB = ri[WS(rs, 19)];
+			      TD = ii[WS(rs, 19)];
+			      TA = W[36];
+			      TC = W[37];
+			      TE = FMA(TA, TB, TC * TD);
+			      T2t = FNMS(TC, TB, TA * TD);
+			 }
+			 TF = Tz + TE;
+			 T3o = T2s + T2t;
+			 T2d = Tz - TE;
+			 T2u = T2s - T2t;
+		    }
+		    {
+			 E TL, T2f, TQ, T2g;
+			 {
+			      E TI, TK, TH, TJ;
+			      TI = ri[WS(rs, 16)];
+			      TK = ii[WS(rs, 16)];
+			      TH = W[30];
+			      TJ = W[31];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T2f = FNMS(TJ, TI, TH * TK);
+			 }
+			 {
+			      E TN, TP, TM, TO;
+			      TN = ri[WS(rs, 6)];
+			      TP = ii[WS(rs, 6)];
+			      TM = W[10];
+			      TO = W[11];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T2g = FNMS(TO, TN, TM * TP);
+			 }
+			 TR = TL + TQ;
+			 T3q = T2f + T2g;
+			 T2h = T2f - T2g;
+			 T2w = TL - TQ;
+		    }
+		    TG = Tu - TF;
+		    T13 = TR - T12;
+		    T14 = TG + T13;
+		    T3p = T3n - T3o;
+		    T3s = T3q - T3r;
+		    T43 = T3p + T3s;
+		    T3A = T3n + T3o;
+		    T3B = T3q + T3r;
+		    T3J = T3A + T3B;
+		    T1S = Tu + TF;
+		    T1T = TR + T12;
+		    T1U = T1S + T1T;
+		    T2e = T2c + T2d;
+		    T2j = T2h + T2i;
+		    T4q = T2e + T2j;
+		    T2T = T2c - T2d;
+		    T2U = T2h - T2i;
+		    T4b = T2T + T2U;
+		    T30 = T2r + T2u;
+		    T31 = T2w + T2z;
+		    T32 = T30 + T31;
+		    T2v = T2r - T2u;
+		    T2A = T2w - T2z;
+		    T2B = T2v + T2A;
+	       }
+	       {
+		    E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
+		    T3e = KP559016994 * (T14 - T1P);
+		    T1Q = T14 + T1P;
+		    T3d = FNMS(KP250000000, T1Q, Tj);
+		    T3m = T3i - T3l;
+		    T3t = T3p - T3s;
+		    T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
+		    T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
+		    ri[WS(rs, 10)] = Tj + T1Q;
+		    T3v = T3e + T3d;
+		    ri[WS(rs, 14)] = T3v - T3w;
+		    ri[WS(rs, 6)] = T3v + T3w;
+		    T3f = T3d - T3e;
+		    ri[WS(rs, 2)] = T3f - T3u;
+		    ri[WS(rs, 18)] = T3f + T3u;
+	       }
+	       {
+		    E T47, T45, T46, T41, T4a, T3Z, T40, T49, T48;
+		    T47 = KP559016994 * (T43 - T44);
+		    T45 = T43 + T44;
+		    T46 = FNMS(KP250000000, T45, T42);
+		    T3Z = T1r - T1O;
+		    T40 = TG - T13;
+		    T41 = FNMS(KP587785252, T40, KP951056516 * T3Z);
+		    T4a = FMA(KP951056516, T40, KP587785252 * T3Z);
+		    ii[WS(rs, 10)] = T45 + T42;
+		    T49 = T47 + T46;
+		    ii[WS(rs, 6)] = T49 - T4a;
+		    ii[WS(rs, 14)] = T4a + T49;
+		    T48 = T46 - T47;
+		    ii[WS(rs, 2)] = T41 + T48;
+		    ii[WS(rs, 18)] = T48 - T41;
+	       }
+	       {
+		    E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
+		    T3x = KP559016994 * (T1U - T1X);
+		    T1Y = T1U + T1X;
+		    T3y = FNMS(KP250000000, T1Y, T1R);
+		    T3C = T3A - T3B;
+		    T3F = T3D - T3E;
+		    T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
+		    T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
+		    ri[0] = T1R + T1Y;
+		    T3H = T3y - T3x;
+		    ri[WS(rs, 12)] = T3H - T3I;
+		    ri[WS(rs, 8)] = T3H + T3I;
+		    T3z = T3x + T3y;
+		    ri[WS(rs, 4)] = T3z - T3G;
+		    ri[WS(rs, 16)] = T3z + T3G;
+	       }
+	       {
+		    E T3U, T3L, T3V, T3T, T3Y, T3R, T3S, T3X, T3W;
+		    T3U = KP559016994 * (T3J - T3K);
+		    T3L = T3J + T3K;
+		    T3V = FNMS(KP250000000, T3L, T3Q);
+		    T3R = T1S - T1T;
+		    T3S = T1V - T1W;
+		    T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
+		    T3Y = FNMS(KP587785252, T3R, KP951056516 * T3S);
+		    ii[0] = T3L + T3Q;
+		    T3X = T3V - T3U;
+		    ii[WS(rs, 8)] = T3X - T3Y;
+		    ii[WS(rs, 12)] = T3Y + T3X;
+		    T3W = T3U + T3V;
+		    ii[WS(rs, 4)] = T3T + T3W;
+		    ii[WS(rs, 16)] = T3W - T3T;
+	       }
+	       {
+		    E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
+		    T2P = KP559016994 * (T2B - T2M);
+		    T2N = T2B + T2M;
+		    T2O = FNMS(KP250000000, T2N, T2q);
+		    T29 = T23 - T28;
+		    T2k = T2e - T2j;
+		    T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
+		    T2R = FMA(KP951056516, T2k, KP587785252 * T29);
+		    ri[WS(rs, 15)] = T2q + T2N;
+		    T2S = T2P + T2O;
+		    ri[WS(rs, 11)] = T2R + T2S;
+		    ri[WS(rs, 19)] = T2S - T2R;
+		    T2Q = T2O - T2P;
+		    ri[WS(rs, 3)] = T2l + T2Q;
+		    ri[WS(rs, 7)] = T2Q - T2l;
+	       }
+	       {
+		    E T4u, T4s, T4t, T4y, T4A, T4w, T4x, T4z, T4v;
+		    T4u = KP559016994 * (T4q - T4r);
+		    T4s = T4q + T4r;
+		    T4t = FNMS(KP250000000, T4s, T4p);
+		    T4w = T2G - T2L;
+		    T4x = T2v - T2A;
+		    T4y = FNMS(KP587785252, T4x, KP951056516 * T4w);
+		    T4A = FMA(KP951056516, T4x, KP587785252 * T4w);
+		    ii[WS(rs, 15)] = T4s + T4p;
+		    T4z = T4u + T4t;
+		    ii[WS(rs, 11)] = T4z - T4A;
+		    ii[WS(rs, 19)] = T4A + T4z;
+		    T4v = T4t - T4u;
+		    ii[WS(rs, 3)] = T4v - T4y;
+		    ii[WS(rs, 7)] = T4y + T4v;
+	       }
+	       {
+		    E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
+		    T36 = KP559016994 * (T32 - T35);
+		    T38 = T32 + T35;
+		    T39 = FNMS(KP250000000, T38, T37);
+		    T2V = T2T - T2U;
+		    T2Y = T2W - T2X;
+		    T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
+		    T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
+		    ri[WS(rs, 5)] = T37 + T38;
+		    T3c = T39 - T36;
+		    ri[WS(rs, 13)] = T3b + T3c;
+		    ri[WS(rs, 17)] = T3c - T3b;
+		    T3a = T36 + T39;
+		    ri[WS(rs, 1)] = T2Z + T3a;
+		    ri[WS(rs, 9)] = T3a - T2Z;
+	       }
+	       {
+		    E T4d, T4h, T4i, T4m, T4o, T4k, T4l, T4n, T4j;
+		    T4d = KP559016994 * (T4b - T4c);
+		    T4h = T4b + T4c;
+		    T4i = FNMS(KP250000000, T4h, T4g);
+		    T4k = T30 - T31;
+		    T4l = T33 - T34;
+		    T4m = FMA(KP951056516, T4k, KP587785252 * T4l);
+		    T4o = FNMS(KP587785252, T4k, KP951056516 * T4l);
+		    ii[WS(rs, 5)] = T4h + T4g;
+		    T4n = T4i - T4d;
+		    ii[WS(rs, 13)] = T4n - T4o;
+		    ii[WS(rs, 17)] = T4o + T4n;
+		    T4j = T4d + T4i;
+		    ii[WS(rs, 1)] = T4j - T4m;
+		    ii[WS(rs, 9)] = T4m + T4j;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {184, 62, 62, 0}, 0, 0, 0 };
+
+void X(codelet_t1_20) (planner *p) {
+     X(kdft_dit_register) (p, t1_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1561 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 25 -name t1_25 -include t.h */
+
+/*
+ * This function contains 400 FP additions, 364 FP multiplications,
+ * (or, 84 additions, 48 multiplications, 316 fused multiply/add),
+ * 181 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "t.h"
+
+static void t1_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 48); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T7I, T6Q, T6O, T7O, T7M, T7H, T6P, T6H, T7J, T7N;
+	       {
+		    E T78, T5G, T3Y, T3M, T7C, T7c, T77, T6Y, Tt, T3L, T5T, T4P, T5Q, T4W, T3G;
+		    E T2G, T5P, T4T, T5S, T4M, T65, T45, T68, T4c, T2Z, T11, T67, T49, T64, T42;
+		    E T5Y, T4r, T61, T4k, T3d, T1z, T60, T4h, T5X, T4o, T3g, T1G, T3q, T4z, T4G;
+		    E T26, T3i, T1M, T3k, T1S;
+		    {
+			 E T3u, T2e, T3E, T4O, T4V, T2E, T3w, T2k, T3y, T2q;
+			 {
+			      E T1, T6X, T3P, T7, T3W, Tq, T9, Tc, Tb, T3U, Tk, T3Q, Ta;
+			      {
+				   E T3, T6, T2, T5;
+				   T1 = ri[0];
+				   T6X = ii[0];
+				   T3 = ri[WS(rs, 5)];
+				   T6 = ii[WS(rs, 5)];
+				   T2 = W[8];
+				   T5 = W[9];
+				   {
+					E Tm, Tp, To, T3V, Tn, T3O, T4, Tl;
+					Tm = ri[WS(rs, 15)];
+					Tp = ii[WS(rs, 15)];
+					T3O = T2 * T6;
+					T4 = T2 * T3;
+					Tl = W[28];
+					To = W[29];
+					T3P = FNMS(T5, T3, T3O);
+					T7 = FMA(T5, T6, T4);
+					T3V = Tl * Tp;
+					Tn = Tl * Tm;
+					{
+					     E Tg, Tj, Tf, Ti, T3T, Th, T8;
+					     Tg = ri[WS(rs, 10)];
+					     Tj = ii[WS(rs, 10)];
+					     T3W = FNMS(To, Tm, T3V);
+					     Tq = FMA(To, Tp, Tn);
+					     Tf = W[18];
+					     Ti = W[19];
+					     T9 = ri[WS(rs, 20)];
+					     Tc = ii[WS(rs, 20)];
+					     T3T = Tf * Tj;
+					     Th = Tf * Tg;
+					     T8 = W[38];
+					     Tb = W[39];
+					     T3U = FNMS(Ti, Tg, T3T);
+					     Tk = FMA(Ti, Tj, Th);
+					     T3Q = T8 * Tc;
+					     Ta = T8 * T9;
+					}
+				   }
+			      }
+			      {
+				   E T6V, T3X, T7b, Tr, T3R, Td;
+				   T6V = T3U + T3W;
+				   T3X = T3U - T3W;
+				   T7b = Tk - Tq;
+				   Tr = Tk + Tq;
+				   T3R = FNMS(Tb, T9, T3Q);
+				   Td = FMA(Tb, Tc, Ta);
+				   {
+					E T3S, T7a, Te, T6W, T6U, Ts;
+					T3S = T3P - T3R;
+					T6U = T3P + T3R;
+					T7a = T7 - Td;
+					Te = T7 + Td;
+					T78 = T6U - T6V;
+					T6W = T6U + T6V;
+					T5G = FNMS(KP618033988, T3S, T3X);
+					T3Y = FMA(KP618033988, T3X, T3S);
+					T3M = Te - Tr;
+					Ts = Te + Tr;
+					T7C = FNMS(KP618033988, T7a, T7b);
+					T7c = FMA(KP618033988, T7b, T7a);
+					T77 = FNMS(KP250000000, T6W, T6X);
+					T6Y = T6W + T6X;
+					Tt = T1 + Ts;
+					T3L = FNMS(KP250000000, Ts, T1);
+				   }
+			      }
+			 }
+			 {
+			      E T2g, T2j, T2m, T3v, T2h, T2p, T2l, T2i, T2o, T3x, T2n;
+			      {
+				   E T2a, T2d, T29, T2c;
+				   T2a = ri[WS(rs, 3)];
+				   T2d = ii[WS(rs, 3)];
+				   T29 = W[4];
+				   T2c = W[5];
+				   {
+					E T2t, T2w, T2z, T3A, T2u, T2C, T2y, T2v, T2B, T3t, T2b, T2s, T2f;
+					T2t = ri[WS(rs, 13)];
+					T2w = ii[WS(rs, 13)];
+					T3t = T29 * T2d;
+					T2b = T29 * T2a;
+					T2s = W[24];
+					T2z = ri[WS(rs, 18)];
+					T3u = FNMS(T2c, T2a, T3t);
+					T2e = FMA(T2c, T2d, T2b);
+					T3A = T2s * T2w;
+					T2u = T2s * T2t;
+					T2C = ii[WS(rs, 18)];
+					T2y = W[34];
+					T2v = W[25];
+					T2B = W[35];
+					{
+					     E T3B, T2x, T3D, T2D, T3C, T2A;
+					     T2g = ri[WS(rs, 8)];
+					     T3C = T2y * T2C;
+					     T2A = T2y * T2z;
+					     T3B = FNMS(T2v, T2t, T3A);
+					     T2x = FMA(T2v, T2w, T2u);
+					     T3D = FNMS(T2B, T2z, T3C);
+					     T2D = FMA(T2B, T2C, T2A);
+					     T2j = ii[WS(rs, 8)];
+					     T2f = W[14];
+					     T3E = T3B + T3D;
+					     T4O = T3D - T3B;
+					     T4V = T2x - T2D;
+					     T2E = T2x + T2D;
+					}
+					T2m = ri[WS(rs, 23)];
+					T3v = T2f * T2j;
+					T2h = T2f * T2g;
+					T2p = ii[WS(rs, 23)];
+					T2l = W[44];
+					T2i = W[15];
+					T2o = W[45];
+				   }
+			      }
+			      T3x = T2l * T2p;
+			      T2n = T2l * T2m;
+			      T3w = FNMS(T2i, T2g, T3v);
+			      T2k = FMA(T2i, T2j, T2h);
+			      T3y = FNMS(T2o, T2m, T3x);
+			      T2q = FMA(T2o, T2p, T2n);
+			 }
+			 {
+			      E T2N, Tz, T2X, T44, T4b, TZ, T2P, TF, T2R, TL;
+			      {
+				   E TB, TE, TH, T2O, TC, TK, TG, TD, TJ, T2Q, TI;
+				   {
+					E Tv, Ty, Tu, Tx;
+					{
+					     E T4S, T4L, T4R, T4K, T4N, T3z;
+					     Tv = ri[WS(rs, 1)];
+					     T4N = T3y - T3w;
+					     T3z = T3w + T3y;
+					     {
+						  E T4U, T2r, T3F, T2F;
+						  T4U = T2k - T2q;
+						  T2r = T2k + T2q;
+						  T5T = FNMS(KP618033988, T4N, T4O);
+						  T4P = FMA(KP618033988, T4O, T4N);
+						  T3F = T3z + T3E;
+						  T4S = T3E - T3z;
+						  T5Q = FNMS(KP618033988, T4U, T4V);
+						  T4W = FMA(KP618033988, T4V, T4U);
+						  T2F = T2r + T2E;
+						  T4L = T2E - T2r;
+						  T3G = T3u + T3F;
+						  T4R = FNMS(KP250000000, T3F, T3u);
+						  T2G = T2e + T2F;
+						  T4K = FNMS(KP250000000, T2F, T2e);
+						  Ty = ii[WS(rs, 1)];
+					     }
+					     T5P = FMA(KP559016994, T4S, T4R);
+					     T4T = FNMS(KP559016994, T4S, T4R);
+					     T5S = FMA(KP559016994, T4L, T4K);
+					     T4M = FNMS(KP559016994, T4L, T4K);
+					     Tu = W[0];
+					}
+					Tx = W[1];
+					{
+					     E TO, TR, TU, T2T, TP, TX, TT, TQ, TW, T2M, Tw, TN, TA;
+					     TO = ri[WS(rs, 11)];
+					     TR = ii[WS(rs, 11)];
+					     T2M = Tu * Ty;
+					     Tw = Tu * Tv;
+					     TN = W[20];
+					     TU = ri[WS(rs, 16)];
+					     T2N = FNMS(Tx, Tv, T2M);
+					     Tz = FMA(Tx, Ty, Tw);
+					     T2T = TN * TR;
+					     TP = TN * TO;
+					     TX = ii[WS(rs, 16)];
+					     TT = W[30];
+					     TQ = W[21];
+					     TW = W[31];
+					     {
+						  E T2U, TS, T2W, TY, T2V, TV;
+						  TB = ri[WS(rs, 6)];
+						  T2V = TT * TX;
+						  TV = TT * TU;
+						  T2U = FNMS(TQ, TO, T2T);
+						  TS = FMA(TQ, TR, TP);
+						  T2W = FNMS(TW, TU, T2V);
+						  TY = FMA(TW, TX, TV);
+						  TE = ii[WS(rs, 6)];
+						  TA = W[10];
+						  T2X = T2U + T2W;
+						  T44 = T2W - T2U;
+						  T4b = TY - TS;
+						  TZ = TS + TY;
+					     }
+					     TH = ri[WS(rs, 21)];
+					     T2O = TA * TE;
+					     TC = TA * TB;
+					     TK = ii[WS(rs, 21)];
+					     TG = W[40];
+					     TD = W[11];
+					     TJ = W[41];
+					}
+				   }
+				   T2Q = TG * TK;
+				   TI = TG * TH;
+				   T2P = FNMS(TD, TB, T2O);
+				   TF = FMA(TD, TE, TC);
+				   T2R = FNMS(TJ, TH, T2Q);
+				   TL = FMA(TJ, TK, TI);
+			      }
+			      {
+				   E T31, T17, T3b, T4q, T4j, T1x, T33, T1d, T35, T1j;
+				   {
+					E T19, T1c, T1f, T32, T1a, T1i, T1e, T1b, T1h, T34, T1g;
+					{
+					     E T13, T16, T12, T15;
+					     {
+						  E T48, T41, T47, T40, T43, T2S;
+						  T13 = ri[WS(rs, 4)];
+						  T43 = T2P - T2R;
+						  T2S = T2P + T2R;
+						  {
+						       E T4a, TM, T2Y, T10;
+						       T4a = TL - TF;
+						       TM = TF + TL;
+						       T65 = FMA(KP618033988, T43, T44);
+						       T45 = FNMS(KP618033988, T44, T43);
+						       T2Y = T2S + T2X;
+						       T48 = T2S - T2X;
+						       T68 = FNMS(KP618033988, T4a, T4b);
+						       T4c = FMA(KP618033988, T4b, T4a);
+						       T10 = TM + TZ;
+						       T41 = TM - TZ;
+						       T2Z = T2N + T2Y;
+						       T47 = FNMS(KP250000000, T2Y, T2N);
+						       T11 = Tz + T10;
+						       T40 = FNMS(KP250000000, T10, Tz);
+						       T16 = ii[WS(rs, 4)];
+						  }
+						  T67 = FNMS(KP559016994, T48, T47);
+						  T49 = FMA(KP559016994, T48, T47);
+						  T64 = FNMS(KP559016994, T41, T40);
+						  T42 = FMA(KP559016994, T41, T40);
+						  T12 = W[6];
+					     }
+					     T15 = W[7];
+					     {
+						  E T1m, T1p, T1s, T37, T1n, T1v, T1r, T1o, T1u, T30, T14, T1l, T18;
+						  T1m = ri[WS(rs, 14)];
+						  T1p = ii[WS(rs, 14)];
+						  T30 = T12 * T16;
+						  T14 = T12 * T13;
+						  T1l = W[26];
+						  T1s = ri[WS(rs, 19)];
+						  T31 = FNMS(T15, T13, T30);
+						  T17 = FMA(T15, T16, T14);
+						  T37 = T1l * T1p;
+						  T1n = T1l * T1m;
+						  T1v = ii[WS(rs, 19)];
+						  T1r = W[36];
+						  T1o = W[27];
+						  T1u = W[37];
+						  {
+						       E T38, T1q, T3a, T1w, T39, T1t;
+						       T19 = ri[WS(rs, 9)];
+						       T39 = T1r * T1v;
+						       T1t = T1r * T1s;
+						       T38 = FNMS(T1o, T1m, T37);
+						       T1q = FMA(T1o, T1p, T1n);
+						       T3a = FNMS(T1u, T1s, T39);
+						       T1w = FMA(T1u, T1v, T1t);
+						       T1c = ii[WS(rs, 9)];
+						       T18 = W[16];
+						       T3b = T38 + T3a;
+						       T4q = T3a - T38;
+						       T4j = T1w - T1q;
+						       T1x = T1q + T1w;
+						  }
+						  T1f = ri[WS(rs, 24)];
+						  T32 = T18 * T1c;
+						  T1a = T18 * T19;
+						  T1i = ii[WS(rs, 24)];
+						  T1e = W[46];
+						  T1b = W[17];
+						  T1h = W[47];
+					     }
+					}
+					T34 = T1e * T1i;
+					T1g = T1e * T1f;
+					T33 = FNMS(T1b, T19, T32);
+					T1d = FMA(T1b, T1c, T1a);
+					T35 = FNMS(T1h, T1f, T34);
+					T1j = FMA(T1h, T1i, T1g);
+				   }
+				   {
+					E T1I, T1L, T1O, T3h, T1J, T1R, T1N, T1K, T1Q, T3j, T1P;
+					{
+					     E T1C, T1F, T1B, T1E;
+					     {
+						  E T4g, T4n, T4f, T4m, T4p, T36;
+						  T1C = ri[WS(rs, 2)];
+						  T4p = T35 - T33;
+						  T36 = T33 + T35;
+						  {
+						       E T4i, T1k, T3c, T1y;
+						       T4i = T1j - T1d;
+						       T1k = T1d + T1j;
+						       T5Y = FNMS(KP618033988, T4p, T4q);
+						       T4r = FMA(KP618033988, T4q, T4p);
+						       T3c = T36 + T3b;
+						       T4g = T3b - T36;
+						       T61 = FNMS(KP618033988, T4i, T4j);
+						       T4k = FMA(KP618033988, T4j, T4i);
+						       T1y = T1k + T1x;
+						       T4n = T1k - T1x;
+						       T3d = T31 + T3c;
+						       T4f = FNMS(KP250000000, T3c, T31);
+						       T1z = T17 + T1y;
+						       T4m = FNMS(KP250000000, T1y, T17);
+						       T1F = ii[WS(rs, 2)];
+						  }
+						  T60 = FMA(KP559016994, T4g, T4f);
+						  T4h = FNMS(KP559016994, T4g, T4f);
+						  T5X = FNMS(KP559016994, T4n, T4m);
+						  T4o = FMA(KP559016994, T4n, T4m);
+						  T1B = W[2];
+					     }
+					     T1E = W[3];
+					     {
+						  E T1V, T1Y, T21, T3m, T1W, T24, T20, T1X, T23, T3f, T1D, T1U, T1H;
+						  T1V = ri[WS(rs, 12)];
+						  T1Y = ii[WS(rs, 12)];
+						  T3f = T1B * T1F;
+						  T1D = T1B * T1C;
+						  T1U = W[22];
+						  T21 = ri[WS(rs, 17)];
+						  T3g = FNMS(T1E, T1C, T3f);
+						  T1G = FMA(T1E, T1F, T1D);
+						  T3m = T1U * T1Y;
+						  T1W = T1U * T1V;
+						  T24 = ii[WS(rs, 17)];
+						  T20 = W[32];
+						  T1X = W[23];
+						  T23 = W[33];
+						  {
+						       E T3n, T1Z, T3p, T25, T3o, T22;
+						       T1I = ri[WS(rs, 7)];
+						       T3o = T20 * T24;
+						       T22 = T20 * T21;
+						       T3n = FNMS(T1X, T1V, T3m);
+						       T1Z = FMA(T1X, T1Y, T1W);
+						       T3p = FNMS(T23, T21, T3o);
+						       T25 = FMA(T23, T24, T22);
+						       T1L = ii[WS(rs, 7)];
+						       T1H = W[12];
+						       T3q = T3n + T3p;
+						       T4z = T3n - T3p;
+						       T4G = T25 - T1Z;
+						       T26 = T1Z + T25;
+						  }
+						  T1O = ri[WS(rs, 22)];
+						  T3h = T1H * T1L;
+						  T1J = T1H * T1I;
+						  T1R = ii[WS(rs, 22)];
+						  T1N = W[42];
+						  T1K = W[13];
+						  T1Q = W[43];
+					     }
+					}
+					T3j = T1N * T1R;
+					T1P = T1N * T1O;
+					T3i = FNMS(T1K, T1I, T3h);
+					T1M = FMA(T1K, T1L, T1J);
+					T3k = FNMS(T1Q, T1O, T3j);
+					T1S = FMA(T1Q, T1R, T1P);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6R, T5M, T4A, T5J, T4H, T6S, T5I, T4E, T5L, T4x, T3K, T3I, T2K, T74, T76;
+			 E T2J;
+			 {
+			      E T1A, T72, T73, T2H, T28, T2I;
+			      {
+				   E T3e, T4D, T4w, T4C, T4v, T3H, T4y, T3l;
+				   T6R = T2Z + T3d;
+				   T3e = T2Z - T3d;
+				   T4y = T3k - T3i;
+				   T3l = T3i + T3k;
+				   {
+					E T4F, T1T, T3r, T27, T3s;
+					T4F = T1S - T1M;
+					T1T = T1M + T1S;
+					T5M = FMA(KP618033988, T4y, T4z);
+					T4A = FNMS(KP618033988, T4z, T4y);
+					T3r = T3l + T3q;
+					T4D = T3q - T3l;
+					T5J = FNMS(KP618033988, T4F, T4G);
+					T4H = FMA(KP618033988, T4G, T4F);
+					T27 = T1T + T26;
+					T4w = T26 - T1T;
+					T3s = T3g + T3r;
+					T4C = FNMS(KP250000000, T3r, T3g);
+					T28 = T1G + T27;
+					T4v = FNMS(KP250000000, T27, T1G);
+					T3H = T3s - T3G;
+					T6S = T3s + T3G;
+				   }
+				   T5I = FMA(KP559016994, T4D, T4C);
+				   T4E = FNMS(KP559016994, T4D, T4C);
+				   T5L = FMA(KP559016994, T4w, T4v);
+				   T4x = FNMS(KP559016994, T4w, T4v);
+				   T3K = FNMS(KP618033988, T3e, T3H);
+				   T3I = FMA(KP618033988, T3H, T3e);
+			      }
+			      T1A = T11 + T1z;
+			      T72 = T11 - T1z;
+			      T73 = T28 - T2G;
+			      T2H = T28 + T2G;
+			      T2I = T1A + T2H;
+			      T2K = T1A - T2H;
+			      T74 = FMA(KP618033988, T73, T72);
+			      T76 = FNMS(KP618033988, T72, T73);
+			      ri[0] = Tt + T2I;
+			      T2J = FNMS(KP250000000, T2I, Tt);
+			 }
+			 {
+			      E T5F, T7B, T7u, T5E, T5C, T7A, T7y, T7t, T5D, T5v;
+			      {
+				   E T3Z, T5d, T7p, T7d, T5m, T5l, T56, T7k, T59, T7l, T5z, T5g, T7g, T7i, T52;
+				   E T50, T5x, T5q, T5A, T5j, T70, T6Z, T3N;
+				   T5F = FNMS(KP559016994, T3M, T3L);
+				   T3N = FMA(KP559016994, T3M, T3L);
+				   {
+					E T79, T3J, T2L, T6T;
+					T79 = FMA(KP559016994, T78, T77);
+					T7B = FNMS(KP559016994, T78, T77);
+					T3J = FNMS(KP559016994, T2K, T2J);
+					T2L = FMA(KP559016994, T2K, T2J);
+					T6T = T6R + T6S;
+					T70 = T6R - T6S;
+					T3Z = FMA(KP951056516, T3Y, T3N);
+					T5d = FNMS(KP951056516, T3Y, T3N);
+					ri[WS(rs, 5)] = FMA(KP951056516, T3I, T2L);
+					ri[WS(rs, 20)] = FNMS(KP951056516, T3I, T2L);
+					ri[WS(rs, 15)] = FMA(KP951056516, T3K, T3J);
+					ri[WS(rs, 10)] = FNMS(KP951056516, T3K, T3J);
+					ii[0] = T6T + T6Y;
+					T6Z = FNMS(KP250000000, T6T, T6Y);
+					T7p = FMA(KP951056516, T7c, T79);
+					T7d = FNMS(KP951056516, T7c, T79);
+				   }
+				   {
+					E T5e, T54, T4e, T5f, T5o, T5p, T5i, T4B, T58, T4Y, T55, T4t, T4I, T5h;
+					{
+					     E T4Q, T4X, T4l, T4s;
+					     {
+						  E T46, T71, T75, T4d;
+						  T5m = FNMS(KP951056516, T45, T42);
+						  T46 = FMA(KP951056516, T45, T42);
+						  T71 = FMA(KP559016994, T70, T6Z);
+						  T75 = FNMS(KP559016994, T70, T6Z);
+						  T4d = FMA(KP951056516, T4c, T49);
+						  T5l = FNMS(KP951056516, T4c, T49);
+						  T5e = FMA(KP951056516, T4P, T4M);
+						  T4Q = FNMS(KP951056516, T4P, T4M);
+						  ii[WS(rs, 20)] = FMA(KP951056516, T74, T71);
+						  ii[WS(rs, 5)] = FNMS(KP951056516, T74, T71);
+						  ii[WS(rs, 15)] = FNMS(KP951056516, T76, T75);
+						  ii[WS(rs, 10)] = FMA(KP951056516, T76, T75);
+						  T54 = FNMS(KP256756360, T46, T4d);
+						  T4e = FMA(KP256756360, T4d, T46);
+						  T4X = FNMS(KP951056516, T4W, T4T);
+						  T5f = FMA(KP951056516, T4W, T4T);
+					     }
+					     T5o = FNMS(KP951056516, T4k, T4h);
+					     T4l = FMA(KP951056516, T4k, T4h);
+					     T4s = FNMS(KP951056516, T4r, T4o);
+					     T5p = FMA(KP951056516, T4r, T4o);
+					     T5i = FMA(KP951056516, T4A, T4x);
+					     T4B = FNMS(KP951056516, T4A, T4x);
+					     T58 = FNMS(KP939062505, T4Q, T4X);
+					     T4Y = FMA(KP939062505, T4X, T4Q);
+					     T55 = FNMS(KP634619297, T4l, T4s);
+					     T4t = FMA(KP634619297, T4s, T4l);
+					     T4I = FMA(KP951056516, T4H, T4E);
+					     T5h = FNMS(KP951056516, T4H, T4E);
+					}
+					{
+					     E T7e, T4u, T57, T4J, T7f, T4Z;
+					     T7e = FNMS(KP871714437, T55, T54);
+					     T56 = FMA(KP871714437, T55, T54);
+					     T4u = FMA(KP871714437, T4t, T4e);
+					     T7k = FNMS(KP871714437, T4t, T4e);
+					     T57 = FNMS(KP549754652, T4B, T4I);
+					     T4J = FMA(KP549754652, T4I, T4B);
+					     T7f = FMA(KP831864738, T58, T57);
+					     T59 = FNMS(KP831864738, T58, T57);
+					     T4Z = FMA(KP831864738, T4Y, T4J);
+					     T7l = FNMS(KP831864738, T4Y, T4J);
+					     T5z = FMA(KP126329378, T5e, T5f);
+					     T5g = FNMS(KP126329378, T5f, T5e);
+					     T7g = FMA(KP904730450, T7f, T7e);
+					     T7i = FNMS(KP904730450, T7f, T7e);
+					     T52 = FNMS(KP904730450, T4Z, T4u);
+					     T50 = FMA(KP904730450, T4Z, T4u);
+					}
+					T5x = FNMS(KP827271945, T5o, T5p);
+					T5q = FMA(KP827271945, T5p, T5o);
+					T5A = FMA(KP470564281, T5h, T5i);
+					T5j = FNMS(KP470564281, T5i, T5h);
+				   }
+				   {
+					E T7q, T5B, T5k, T7x, T5w, T5n;
+					ri[WS(rs, 1)] = FMA(KP968583161, T50, T3Z);
+					T7q = FMA(KP912018591, T5A, T5z);
+					T5B = FNMS(KP912018591, T5A, T5z);
+					T5k = FNMS(KP912018591, T5j, T5g);
+					T7x = FMA(KP912018591, T5j, T5g);
+					T5w = FNMS(KP634619297, T5l, T5m);
+					T5n = FMA(KP634619297, T5m, T5l);
+					ii[WS(rs, 1)] = FMA(KP968583161, T7g, T7d);
+					{
+					     E T5y, T7w, T7s, T5s, T5u, T7o, T7m, T7n, T7j, T5t;
+					     {
+						  E T5c, T5a, T51, T7r, T5r, T53, T5b, T7h;
+						  T5c = FNMS(KP683113946, T56, T59);
+						  T5a = FMA(KP559154169, T59, T56);
+						  T7r = FNMS(KP912575812, T5x, T5w);
+						  T5y = FMA(KP912575812, T5x, T5w);
+						  T5r = FNMS(KP912575812, T5q, T5n);
+						  T7w = FMA(KP912575812, T5q, T5n);
+						  T7s = FMA(KP851038619, T7r, T7q);
+						  T7u = FNMS(KP851038619, T7r, T7q);
+						  T5s = FNMS(KP851038619, T5r, T5k);
+						  T5u = FMA(KP851038619, T5r, T5k);
+						  T51 = FNMS(KP242145790, T50, T3Z);
+						  ii[WS(rs, 4)] = FNMS(KP992114701, T7s, T7p);
+						  ri[WS(rs, 4)] = FNMS(KP992114701, T5s, T5d);
+						  T7o = FNMS(KP683113946, T7k, T7l);
+						  T7m = FMA(KP559154169, T7l, T7k);
+						  T53 = FMA(KP541454447, T52, T51);
+						  T5b = FNMS(KP541454447, T52, T51);
+						  T7h = FNMS(KP242145790, T7g, T7d);
+						  ri[WS(rs, 11)] = FNMS(KP833417178, T5c, T5b);
+						  ri[WS(rs, 16)] = FMA(KP833417178, T5c, T5b);
+						  ri[WS(rs, 21)] = FNMS(KP921177326, T5a, T53);
+						  ri[WS(rs, 6)] = FMA(KP921177326, T5a, T53);
+						  T7n = FNMS(KP541454447, T7i, T7h);
+						  T7j = FMA(KP541454447, T7i, T7h);
+					     }
+					     T5E = FMA(KP525970792, T5y, T5B);
+					     T5C = FNMS(KP726211448, T5B, T5y);
+					     ii[WS(rs, 21)] = FMA(KP921177326, T7m, T7j);
+					     ii[WS(rs, 6)] = FNMS(KP921177326, T7m, T7j);
+					     ii[WS(rs, 11)] = FMA(KP833417178, T7o, T7n);
+					     ii[WS(rs, 16)] = FNMS(KP833417178, T7o, T7n);
+					     T5t = FMA(KP248028675, T5s, T5d);
+					     T7A = FNMS(KP525970792, T7w, T7x);
+					     T7y = FMA(KP726211448, T7x, T7w);
+					     T7t = FMA(KP248028675, T7s, T7p);
+					     T5D = FNMS(KP554608978, T5u, T5t);
+					     T5v = FMA(KP554608978, T5u, T5t);
+					}
+				   }
+			      }
+			      {
+				   E T5H, T6p, T7P, T7D, T6y, T6x, T6l, T7X, T6i, T7W, T6L, T6s, T7S, T7U, T6e;
+				   E T6c, T6J, T6C, T6M, T6v, T7z, T7v;
+				   ri[WS(rs, 14)] = FNMS(KP943557151, T5E, T5D);
+				   ri[WS(rs, 19)] = FMA(KP943557151, T5E, T5D);
+				   ri[WS(rs, 24)] = FMA(KP803003575, T5C, T5v);
+				   ri[WS(rs, 9)] = FNMS(KP803003575, T5C, T5v);
+				   T7z = FNMS(KP554608978, T7u, T7t);
+				   T7v = FMA(KP554608978, T7u, T7t);
+				   T5H = FMA(KP951056516, T5G, T5F);
+				   T6p = FNMS(KP951056516, T5G, T5F);
+				   ii[WS(rs, 14)] = FMA(KP943557151, T7A, T7z);
+				   ii[WS(rs, 19)] = FNMS(KP943557151, T7A, T7z);
+				   ii[WS(rs, 24)] = FMA(KP803003575, T7y, T7v);
+				   ii[WS(rs, 9)] = FNMS(KP803003575, T7y, T7v);
+				   {
+					E T6t, T6u, T6A, T6j, T5O, T6B, T6q, T6r, T5Z, T6h, T6a, T6k, T5V, T62;
+					{
+					     E T66, T69, T5K, T5N, T5R, T5U;
+					     T6t = FNMS(KP951056516, T5J, T5I);
+					     T5K = FMA(KP951056516, T5J, T5I);
+					     T5N = FMA(KP951056516, T5M, T5L);
+					     T6u = FNMS(KP951056516, T5M, T5L);
+					     T6A = FMA(KP951056516, T65, T64);
+					     T66 = FNMS(KP951056516, T65, T64);
+					     T7P = FNMS(KP951056516, T7C, T7B);
+					     T7D = FMA(KP951056516, T7C, T7B);
+					     T6j = FNMS(KP062914667, T5K, T5N);
+					     T5O = FMA(KP062914667, T5N, T5K);
+					     T69 = FMA(KP951056516, T68, T67);
+					     T6B = FNMS(KP951056516, T68, T67);
+					     T6q = FMA(KP951056516, T5Q, T5P);
+					     T5R = FNMS(KP951056516, T5Q, T5P);
+					     T5U = FNMS(KP951056516, T5T, T5S);
+					     T6r = FMA(KP951056516, T5T, T5S);
+					     T6y = FMA(KP951056516, T5Y, T5X);
+					     T5Z = FNMS(KP951056516, T5Y, T5X);
+					     T6h = FNMS(KP939062505, T66, T69);
+					     T6a = FMA(KP939062505, T69, T66);
+					     T6k = FMA(KP827271945, T5R, T5U);
+					     T5V = FNMS(KP827271945, T5U, T5R);
+					     T62 = FMA(KP951056516, T61, T60);
+					     T6x = FNMS(KP951056516, T61, T60);
+					}
+					{
+					     E T7Q, T5W, T6g, T63, T7R, T6b;
+					     T7Q = FMA(KP772036680, T6k, T6j);
+					     T6l = FNMS(KP772036680, T6k, T6j);
+					     T5W = FMA(KP772036680, T5V, T5O);
+					     T7X = FNMS(KP772036680, T5V, T5O);
+					     T6g = FMA(KP126329378, T5Z, T62);
+					     T63 = FNMS(KP126329378, T62, T5Z);
+					     T7R = FNMS(KP734762448, T6h, T6g);
+					     T6i = FMA(KP734762448, T6h, T6g);
+					     T6b = FNMS(KP734762448, T6a, T63);
+					     T7W = FMA(KP734762448, T6a, T63);
+					     T6L = FNMS(KP062914667, T6q, T6r);
+					     T6s = FMA(KP062914667, T6r, T6q);
+					     T7S = FMA(KP994076283, T7R, T7Q);
+					     T7U = FNMS(KP994076283, T7R, T7Q);
+					     T6e = FMA(KP994076283, T6b, T5W);
+					     T6c = FNMS(KP994076283, T6b, T5W);
+					}
+					T6J = FNMS(KP549754652, T6A, T6B);
+					T6C = FMA(KP549754652, T6B, T6A);
+					T6M = FNMS(KP634619297, T6t, T6u);
+					T6v = FMA(KP634619297, T6u, T6t);
+				   }
+				   {
+					E T7E, T6N, T6w, T7L, T6I, T6z;
+					ri[WS(rs, 3)] = FMA(KP998026728, T6c, T5H);
+					T7E = FMA(KP845997307, T6M, T6L);
+					T6N = FNMS(KP845997307, T6M, T6L);
+					T6w = FMA(KP845997307, T6v, T6s);
+					T7L = FNMS(KP845997307, T6v, T6s);
+					T6I = FMA(KP470564281, T6x, T6y);
+					T6z = FNMS(KP470564281, T6y, T6x);
+					ii[WS(rs, 3)] = FNMS(KP998026728, T7S, T7P);
+					{
+					     E T6K, T7K, T7G, T6E, T6G, T80, T7Y, T7Z, T7V, T6F;
+					     {
+						  E T6o, T6m, T6d, T7F, T6D, T6f, T6n, T7T;
+						  T6o = FMA(KP614372930, T6i, T6l);
+						  T6m = FNMS(KP621716863, T6l, T6i);
+						  T7F = FNMS(KP968479752, T6J, T6I);
+						  T6K = FMA(KP968479752, T6J, T6I);
+						  T6D = FMA(KP968479752, T6C, T6z);
+						  T7K = FNMS(KP968479752, T6C, T6z);
+						  T7G = FMA(KP906616052, T7F, T7E);
+						  T7I = FNMS(KP906616052, T7F, T7E);
+						  T6E = FMA(KP906616052, T6D, T6w);
+						  T6G = FNMS(KP906616052, T6D, T6w);
+						  T6d = FNMS(KP249506682, T6c, T5H);
+						  ii[WS(rs, 2)] = FNMS(KP998026728, T7G, T7D);
+						  ri[WS(rs, 2)] = FMA(KP998026728, T6E, T6p);
+						  T80 = FNMS(KP614372930, T7W, T7X);
+						  T7Y = FMA(KP621716863, T7X, T7W);
+						  T6f = FNMS(KP557913902, T6e, T6d);
+						  T6n = FMA(KP557913902, T6e, T6d);
+						  T7T = FMA(KP249506682, T7S, T7P);
+						  ri[WS(rs, 18)] = FNMS(KP949179823, T6o, T6n);
+						  ri[WS(rs, 13)] = FMA(KP949179823, T6o, T6n);
+						  ri[WS(rs, 8)] = FMA(KP943557151, T6m, T6f);
+						  ri[WS(rs, 23)] = FNMS(KP943557151, T6m, T6f);
+						  T7Z = FNMS(KP557913902, T7U, T7T);
+						  T7V = FMA(KP557913902, T7U, T7T);
+					     }
+					     T6Q = FNMS(KP560319534, T6K, T6N);
+					     T6O = FMA(KP681693190, T6N, T6K);
+					     ii[WS(rs, 23)] = FMA(KP943557151, T7Y, T7V);
+					     ii[WS(rs, 8)] = FNMS(KP943557151, T7Y, T7V);
+					     ii[WS(rs, 13)] = FMA(KP949179823, T80, T7Z);
+					     ii[WS(rs, 18)] = FNMS(KP949179823, T80, T7Z);
+					     T6F = FNMS(KP249506682, T6E, T6p);
+					     T7O = FNMS(KP560319534, T7K, T7L);
+					     T7M = FMA(KP681693190, T7L, T7K);
+					     T7H = FMA(KP249506682, T7G, T7D);
+					     T6P = FMA(KP557913902, T6G, T6F);
+					     T6H = FNMS(KP557913902, T6G, T6F);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 12)] = FNMS(KP949179823, T6Q, T6P);
+	       ri[WS(rs, 17)] = FMA(KP949179823, T6Q, T6P);
+	       ri[WS(rs, 7)] = FMA(KP860541664, T6O, T6H);
+	       ri[WS(rs, 22)] = FNMS(KP860541664, T6O, T6H);
+	       T7J = FMA(KP557913902, T7I, T7H);
+	       T7N = FNMS(KP557913902, T7I, T7H);
+	       ii[WS(rs, 12)] = FNMS(KP949179823, T7O, T7N);
+	       ii[WS(rs, 17)] = FMA(KP949179823, T7O, T7N);
+	       ii[WS(rs, 22)] = FNMS(KP860541664, T7M, T7J);
+	       ii[WS(rs, 7)] = FMA(KP860541664, T7M, T7J);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 25, "t1_25", twinstr, &GENUS, {84, 48, 316, 0}, 0, 0, 0 };
+
+void X(codelet_t1_25) (planner *p) {
+     X(kdft_dit_register) (p, t1_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 25 -name t1_25 -include t.h */
+
+/*
+ * This function contains 400 FP additions, 280 FP multiplications,
+ * (or, 260 additions, 140 multiplications, 140 fused multiply/add),
+ * 101 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "t.h"
+
+static void t1_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 48); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T1, T6b, T2l, T6o, To, T2m, T6a, T6p, T6t, T6S, T2u, T4I, T2i, T60, T3O;
+	       E T5D, T4r, T58, T3Z, T5C, T4q, T5b, TS, T5W, T2G, T5s, T4g, T4M, T2R, T5t;
+	       E T4h, T4P, T1l, T5X, T33, T5w, T4j, T4W, T3e, T5v, T4k, T4T, T1P, T5Z, T3r;
+	       E T5z, T4o, T51, T3C, T5A, T4n, T54;
+	       {
+		    E T6, T2o, Tb, T2p, Tc, T68, Th, T2r, Tm, T2s, Tn, T69;
+		    T1 = ri[0];
+		    T6b = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 5)];
+			 T5 = ii[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T2o = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 20)];
+			 Ta = ii[WS(rs, 20)];
+			 T7 = W[38];
+			 T9 = W[39];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T2p = FNMS(T9, T8, T7 * Ta);
+		    }
+		    Tc = T6 + Tb;
+		    T68 = T2o + T2p;
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 10)];
+			 Tg = ii[WS(rs, 10)];
+			 Td = W[18];
+			 Tf = W[19];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T2r = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = ri[WS(rs, 15)];
+			 Tl = ii[WS(rs, 15)];
+			 Ti = W[28];
+			 Tk = W[29];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 T2s = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    Tn = Th + Tm;
+		    T69 = T2r + T2s;
+		    T2l = KP559016994 * (Tc - Tn);
+		    T6o = KP559016994 * (T68 - T69);
+		    To = Tc + Tn;
+		    T2m = FNMS(KP250000000, To, T1);
+		    T6a = T68 + T69;
+		    T6p = FNMS(KP250000000, T6a, T6b);
+		    {
+			 E T6r, T6s, T2q, T2t;
+			 T6r = T6 - Tb;
+			 T6s = Th - Tm;
+			 T6t = FMA(KP951056516, T6r, KP587785252 * T6s);
+			 T6S = FNMS(KP587785252, T6r, KP951056516 * T6s);
+			 T2q = T2o - T2p;
+			 T2t = T2r - T2s;
+			 T2u = FMA(KP951056516, T2q, KP587785252 * T2t);
+			 T4I = FNMS(KP587785252, T2q, KP951056516 * T2t);
+		    }
+	       }
+	       {
+		    E T1U, T3S, T3J, T3M, T3X, T3W, T3P, T3Q, T3T, T25, T2g, T2h;
+		    {
+			 E T1R, T1T, T1Q, T1S;
+			 T1R = ri[WS(rs, 3)];
+			 T1T = ii[WS(rs, 3)];
+			 T1Q = W[4];
+			 T1S = W[5];
+			 T1U = FMA(T1Q, T1R, T1S * T1T);
+			 T3S = FNMS(T1S, T1R, T1Q * T1T);
+		    }
+		    {
+			 E T1Z, T3H, T2f, T3L, T24, T3I, T2a, T3K;
+			 {
+			      E T1W, T1Y, T1V, T1X;
+			      T1W = ri[WS(rs, 8)];
+			      T1Y = ii[WS(rs, 8)];
+			      T1V = W[14];
+			      T1X = W[15];
+			      T1Z = FMA(T1V, T1W, T1X * T1Y);
+			      T3H = FNMS(T1X, T1W, T1V * T1Y);
+			 }
+			 {
+			      E T2c, T2e, T2b, T2d;
+			      T2c = ri[WS(rs, 18)];
+			      T2e = ii[WS(rs, 18)];
+			      T2b = W[34];
+			      T2d = W[35];
+			      T2f = FMA(T2b, T2c, T2d * T2e);
+			      T3L = FNMS(T2d, T2c, T2b * T2e);
+			 }
+			 {
+			      E T21, T23, T20, T22;
+			      T21 = ri[WS(rs, 23)];
+			      T23 = ii[WS(rs, 23)];
+			      T20 = W[44];
+			      T22 = W[45];
+			      T24 = FMA(T20, T21, T22 * T23);
+			      T3I = FNMS(T22, T21, T20 * T23);
+			 }
+			 {
+			      E T27, T29, T26, T28;
+			      T27 = ri[WS(rs, 13)];
+			      T29 = ii[WS(rs, 13)];
+			      T26 = W[24];
+			      T28 = W[25];
+			      T2a = FMA(T26, T27, T28 * T29);
+			      T3K = FNMS(T28, T27, T26 * T29);
+			 }
+			 T3J = T3H - T3I;
+			 T3M = T3K - T3L;
+			 T3X = T2a - T2f;
+			 T3W = T1Z - T24;
+			 T3P = T3H + T3I;
+			 T3Q = T3K + T3L;
+			 T3T = T3P + T3Q;
+			 T25 = T1Z + T24;
+			 T2g = T2a + T2f;
+			 T2h = T25 + T2g;
+		    }
+		    T2i = T1U + T2h;
+		    T60 = T3S + T3T;
+		    {
+			 E T3N, T57, T3G, T56, T3E, T3F;
+			 T3N = FMA(KP951056516, T3J, KP587785252 * T3M);
+			 T57 = FNMS(KP587785252, T3J, KP951056516 * T3M);
+			 T3E = KP559016994 * (T25 - T2g);
+			 T3F = FNMS(KP250000000, T2h, T1U);
+			 T3G = T3E + T3F;
+			 T56 = T3F - T3E;
+			 T3O = T3G + T3N;
+			 T5D = T56 + T57;
+			 T4r = T3G - T3N;
+			 T58 = T56 - T57;
+		    }
+		    {
+			 E T3Y, T59, T3V, T5a, T3R, T3U;
+			 T3Y = FMA(KP951056516, T3W, KP587785252 * T3X);
+			 T59 = FNMS(KP587785252, T3W, KP951056516 * T3X);
+			 T3R = KP559016994 * (T3P - T3Q);
+			 T3U = FNMS(KP250000000, T3T, T3S);
+			 T3V = T3R + T3U;
+			 T5a = T3U - T3R;
+			 T3Z = T3V - T3Y;
+			 T5C = T5a - T59;
+			 T4q = T3Y + T3V;
+			 T5b = T59 + T5a;
+		    }
+	       }
+	       {
+		    E Tu, T2K, T2B, T2E, T2P, T2O, T2H, T2I, T2L, TF, TQ, TR;
+		    {
+			 E Tr, Tt, Tq, Ts;
+			 Tr = ri[WS(rs, 1)];
+			 Tt = ii[WS(rs, 1)];
+			 Tq = W[0];
+			 Ts = W[1];
+			 Tu = FMA(Tq, Tr, Ts * Tt);
+			 T2K = FNMS(Ts, Tr, Tq * Tt);
+		    }
+		    {
+			 E Tz, T2z, TP, T2D, TE, T2A, TK, T2C;
+			 {
+			      E Tw, Ty, Tv, Tx;
+			      Tw = ri[WS(rs, 6)];
+			      Ty = ii[WS(rs, 6)];
+			      Tv = W[10];
+			      Tx = W[11];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T2z = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 {
+			      E TM, TO, TL, TN;
+			      TM = ri[WS(rs, 16)];
+			      TO = ii[WS(rs, 16)];
+			      TL = W[30];
+			      TN = W[31];
+			      TP = FMA(TL, TM, TN * TO);
+			      T2D = FNMS(TN, TM, TL * TO);
+			 }
+			 {
+			      E TB, TD, TA, TC;
+			      TB = ri[WS(rs, 21)];
+			      TD = ii[WS(rs, 21)];
+			      TA = W[40];
+			      TC = W[41];
+			      TE = FMA(TA, TB, TC * TD);
+			      T2A = FNMS(TC, TB, TA * TD);
+			 }
+			 {
+			      E TH, TJ, TG, TI;
+			      TH = ri[WS(rs, 11)];
+			      TJ = ii[WS(rs, 11)];
+			      TG = W[20];
+			      TI = W[21];
+			      TK = FMA(TG, TH, TI * TJ);
+			      T2C = FNMS(TI, TH, TG * TJ);
+			 }
+			 T2B = T2z - T2A;
+			 T2E = T2C - T2D;
+			 T2P = TK - TP;
+			 T2O = Tz - TE;
+			 T2H = T2z + T2A;
+			 T2I = T2C + T2D;
+			 T2L = T2H + T2I;
+			 TF = Tz + TE;
+			 TQ = TK + TP;
+			 TR = TF + TQ;
+		    }
+		    TS = Tu + TR;
+		    T5W = T2K + T2L;
+		    {
+			 E T2F, T4L, T2y, T4K, T2w, T2x;
+			 T2F = FMA(KP951056516, T2B, KP587785252 * T2E);
+			 T4L = FNMS(KP587785252, T2B, KP951056516 * T2E);
+			 T2w = KP559016994 * (TF - TQ);
+			 T2x = FNMS(KP250000000, TR, Tu);
+			 T2y = T2w + T2x;
+			 T4K = T2x - T2w;
+			 T2G = T2y + T2F;
+			 T5s = T4K + T4L;
+			 T4g = T2y - T2F;
+			 T4M = T4K - T4L;
+		    }
+		    {
+			 E T2Q, T4N, T2N, T4O, T2J, T2M;
+			 T2Q = FMA(KP951056516, T2O, KP587785252 * T2P);
+			 T4N = FNMS(KP587785252, T2O, KP951056516 * T2P);
+			 T2J = KP559016994 * (T2H - T2I);
+			 T2M = FNMS(KP250000000, T2L, T2K);
+			 T2N = T2J + T2M;
+			 T4O = T2M - T2J;
+			 T2R = T2N - T2Q;
+			 T5t = T4O - T4N;
+			 T4h = T2Q + T2N;
+			 T4P = T4N + T4O;
+		    }
+	       }
+	       {
+		    E TX, T37, T2Y, T31, T3c, T3b, T34, T35, T38, T18, T1j, T1k;
+		    {
+			 E TU, TW, TT, TV;
+			 TU = ri[WS(rs, 4)];
+			 TW = ii[WS(rs, 4)];
+			 TT = W[6];
+			 TV = W[7];
+			 TX = FMA(TT, TU, TV * TW);
+			 T37 = FNMS(TV, TU, TT * TW);
+		    }
+		    {
+			 E T12, T2W, T1i, T30, T17, T2X, T1d, T2Z;
+			 {
+			      E TZ, T11, TY, T10;
+			      TZ = ri[WS(rs, 9)];
+			      T11 = ii[WS(rs, 9)];
+			      TY = W[16];
+			      T10 = W[17];
+			      T12 = FMA(TY, TZ, T10 * T11);
+			      T2W = FNMS(T10, TZ, TY * T11);
+			 }
+			 {
+			      E T1f, T1h, T1e, T1g;
+			      T1f = ri[WS(rs, 19)];
+			      T1h = ii[WS(rs, 19)];
+			      T1e = W[36];
+			      T1g = W[37];
+			      T1i = FMA(T1e, T1f, T1g * T1h);
+			      T30 = FNMS(T1g, T1f, T1e * T1h);
+			 }
+			 {
+			      E T14, T16, T13, T15;
+			      T14 = ri[WS(rs, 24)];
+			      T16 = ii[WS(rs, 24)];
+			      T13 = W[46];
+			      T15 = W[47];
+			      T17 = FMA(T13, T14, T15 * T16);
+			      T2X = FNMS(T15, T14, T13 * T16);
+			 }
+			 {
+			      E T1a, T1c, T19, T1b;
+			      T1a = ri[WS(rs, 14)];
+			      T1c = ii[WS(rs, 14)];
+			      T19 = W[26];
+			      T1b = W[27];
+			      T1d = FMA(T19, T1a, T1b * T1c);
+			      T2Z = FNMS(T1b, T1a, T19 * T1c);
+			 }
+			 T2Y = T2W - T2X;
+			 T31 = T2Z - T30;
+			 T3c = T1d - T1i;
+			 T3b = T12 - T17;
+			 T34 = T2W + T2X;
+			 T35 = T2Z + T30;
+			 T38 = T34 + T35;
+			 T18 = T12 + T17;
+			 T1j = T1d + T1i;
+			 T1k = T18 + T1j;
+		    }
+		    T1l = TX + T1k;
+		    T5X = T37 + T38;
+		    {
+			 E T32, T4V, T2V, T4U, T2T, T2U;
+			 T32 = FMA(KP951056516, T2Y, KP587785252 * T31);
+			 T4V = FNMS(KP587785252, T2Y, KP951056516 * T31);
+			 T2T = KP559016994 * (T18 - T1j);
+			 T2U = FNMS(KP250000000, T1k, TX);
+			 T2V = T2T + T2U;
+			 T4U = T2U - T2T;
+			 T33 = T2V + T32;
+			 T5w = T4U + T4V;
+			 T4j = T2V - T32;
+			 T4W = T4U - T4V;
+		    }
+		    {
+			 E T3d, T4R, T3a, T4S, T36, T39;
+			 T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
+			 T4R = FNMS(KP587785252, T3b, KP951056516 * T3c);
+			 T36 = KP559016994 * (T34 - T35);
+			 T39 = FNMS(KP250000000, T38, T37);
+			 T3a = T36 + T39;
+			 T4S = T39 - T36;
+			 T3e = T3a - T3d;
+			 T5v = T4S - T4R;
+			 T4k = T3d + T3a;
+			 T4T = T4R + T4S;
+		    }
+	       }
+	       {
+		    E T1r, T3v, T3m, T3p, T3A, T3z, T3s, T3t, T3w, T1C, T1N, T1O;
+		    {
+			 E T1o, T1q, T1n, T1p;
+			 T1o = ri[WS(rs, 2)];
+			 T1q = ii[WS(rs, 2)];
+			 T1n = W[2];
+			 T1p = W[3];
+			 T1r = FMA(T1n, T1o, T1p * T1q);
+			 T3v = FNMS(T1p, T1o, T1n * T1q);
+		    }
+		    {
+			 E T1w, T3k, T1M, T3o, T1B, T3l, T1H, T3n;
+			 {
+			      E T1t, T1v, T1s, T1u;
+			      T1t = ri[WS(rs, 7)];
+			      T1v = ii[WS(rs, 7)];
+			      T1s = W[12];
+			      T1u = W[13];
+			      T1w = FMA(T1s, T1t, T1u * T1v);
+			      T3k = FNMS(T1u, T1t, T1s * T1v);
+			 }
+			 {
+			      E T1J, T1L, T1I, T1K;
+			      T1J = ri[WS(rs, 17)];
+			      T1L = ii[WS(rs, 17)];
+			      T1I = W[32];
+			      T1K = W[33];
+			      T1M = FMA(T1I, T1J, T1K * T1L);
+			      T3o = FNMS(T1K, T1J, T1I * T1L);
+			 }
+			 {
+			      E T1y, T1A, T1x, T1z;
+			      T1y = ri[WS(rs, 22)];
+			      T1A = ii[WS(rs, 22)];
+			      T1x = W[42];
+			      T1z = W[43];
+			      T1B = FMA(T1x, T1y, T1z * T1A);
+			      T3l = FNMS(T1z, T1y, T1x * T1A);
+			 }
+			 {
+			      E T1E, T1G, T1D, T1F;
+			      T1E = ri[WS(rs, 12)];
+			      T1G = ii[WS(rs, 12)];
+			      T1D = W[22];
+			      T1F = W[23];
+			      T1H = FMA(T1D, T1E, T1F * T1G);
+			      T3n = FNMS(T1F, T1E, T1D * T1G);
+			 }
+			 T3m = T3k - T3l;
+			 T3p = T3n - T3o;
+			 T3A = T1H - T1M;
+			 T3z = T1w - T1B;
+			 T3s = T3k + T3l;
+			 T3t = T3n + T3o;
+			 T3w = T3s + T3t;
+			 T1C = T1w + T1B;
+			 T1N = T1H + T1M;
+			 T1O = T1C + T1N;
+		    }
+		    T1P = T1r + T1O;
+		    T5Z = T3v + T3w;
+		    {
+			 E T3q, T50, T3j, T4Z, T3h, T3i;
+			 T3q = FMA(KP951056516, T3m, KP587785252 * T3p);
+			 T50 = FNMS(KP587785252, T3m, KP951056516 * T3p);
+			 T3h = KP559016994 * (T1C - T1N);
+			 T3i = FNMS(KP250000000, T1O, T1r);
+			 T3j = T3h + T3i;
+			 T4Z = T3i - T3h;
+			 T3r = T3j + T3q;
+			 T5z = T4Z + T50;
+			 T4o = T3j - T3q;
+			 T51 = T4Z - T50;
+		    }
+		    {
+			 E T3B, T52, T3y, T53, T3u, T3x;
+			 T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
+			 T52 = FNMS(KP587785252, T3z, KP951056516 * T3A);
+			 T3u = KP559016994 * (T3s - T3t);
+			 T3x = FNMS(KP250000000, T3w, T3v);
+			 T3y = T3u + T3x;
+			 T53 = T3x - T3u;
+			 T3C = T3y - T3B;
+			 T5A = T53 - T52;
+			 T4n = T3B + T3y;
+			 T54 = T52 + T53;
+		    }
+	       }
+	       {
+		    E T62, T64, Tp, T2k, T5T, T5U, T63, T5V;
+		    {
+			 E T5Y, T61, T1m, T2j;
+			 T5Y = T5W - T5X;
+			 T61 = T5Z - T60;
+			 T62 = FMA(KP951056516, T5Y, KP587785252 * T61);
+			 T64 = FNMS(KP587785252, T5Y, KP951056516 * T61);
+			 Tp = T1 + To;
+			 T1m = TS + T1l;
+			 T2j = T1P + T2i;
+			 T2k = T1m + T2j;
+			 T5T = KP559016994 * (T1m - T2j);
+			 T5U = FNMS(KP250000000, T2k, Tp);
+		    }
+		    ri[0] = Tp + T2k;
+		    T63 = T5U - T5T;
+		    ri[WS(rs, 10)] = T63 - T64;
+		    ri[WS(rs, 15)] = T63 + T64;
+		    T5V = T5T + T5U;
+		    ri[WS(rs, 20)] = T5V - T62;
+		    ri[WS(rs, 5)] = T5V + T62;
+	       }
+	       {
+		    E T6i, T6j, T6c, T67, T6d, T6e, T6k, T6f;
+		    {
+			 E T6g, T6h, T65, T66;
+			 T6g = TS - T1l;
+			 T6h = T1P - T2i;
+			 T6i = FMA(KP951056516, T6g, KP587785252 * T6h);
+			 T6j = FNMS(KP587785252, T6g, KP951056516 * T6h);
+			 T6c = T6a + T6b;
+			 T65 = T5W + T5X;
+			 T66 = T5Z + T60;
+			 T67 = T65 + T66;
+			 T6d = KP559016994 * (T65 - T66);
+			 T6e = FNMS(KP250000000, T67, T6c);
+		    }
+		    ii[0] = T67 + T6c;
+		    T6k = T6e - T6d;
+		    ii[WS(rs, 10)] = T6j + T6k;
+		    ii[WS(rs, 15)] = T6k - T6j;
+		    T6f = T6d + T6e;
+		    ii[WS(rs, 5)] = T6f - T6i;
+		    ii[WS(rs, 20)] = T6i + T6f;
+	       }
+	       {
+		    E T2v, T4f, T6u, T6G, T42, T6z, T43, T6y, T4A, T6H, T4D, T6F, T4u, T6L, T4v;
+		    E T6K, T48, T6v, T4b, T6n, T2n, T6q;
+		    T2n = T2l + T2m;
+		    T2v = T2n + T2u;
+		    T4f = T2n - T2u;
+		    T6q = T6o + T6p;
+		    T6u = T6q - T6t;
+		    T6G = T6t + T6q;
+		    {
+			 E T2S, T3f, T3g, T3D, T40, T41;
+			 T2S = FMA(KP968583161, T2G, KP248689887 * T2R);
+			 T3f = FMA(KP535826794, T33, KP844327925 * T3e);
+			 T3g = T2S + T3f;
+			 T3D = FMA(KP876306680, T3r, KP481753674 * T3C);
+			 T40 = FMA(KP728968627, T3O, KP684547105 * T3Z);
+			 T41 = T3D + T40;
+			 T42 = T3g + T41;
+			 T6z = T3D - T40;
+			 T43 = KP559016994 * (T3g - T41);
+			 T6y = T2S - T3f;
+		    }
+		    {
+			 E T4y, T4z, T6D, T4B, T4C, T6E;
+			 T4y = FNMS(KP844327925, T4g, KP535826794 * T4h);
+			 T4z = FNMS(KP637423989, T4k, KP770513242 * T4j);
+			 T6D = T4y + T4z;
+			 T4B = FMA(KP125333233, T4r, KP992114701 * T4q);
+			 T4C = FMA(KP904827052, T4o, KP425779291 * T4n);
+			 T6E = T4C + T4B;
+			 T4A = T4y - T4z;
+			 T6H = KP559016994 * (T6D + T6E);
+			 T4D = T4B - T4C;
+			 T6F = T6D - T6E;
+		    }
+		    {
+			 E T4i, T4l, T4m, T4p, T4s, T4t;
+			 T4i = FMA(KP535826794, T4g, KP844327925 * T4h);
+			 T4l = FMA(KP637423989, T4j, KP770513242 * T4k);
+			 T4m = T4i - T4l;
+			 T4p = FNMS(KP425779291, T4o, KP904827052 * T4n);
+			 T4s = FNMS(KP992114701, T4r, KP125333233 * T4q);
+			 T4t = T4p + T4s;
+			 T4u = T4m + T4t;
+			 T6L = T4p - T4s;
+			 T4v = KP559016994 * (T4m - T4t);
+			 T6K = T4i + T4l;
+		    }
+		    {
+			 E T46, T47, T6l, T49, T4a, T6m;
+			 T46 = FNMS(KP248689887, T2G, KP968583161 * T2R);
+			 T47 = FNMS(KP844327925, T33, KP535826794 * T3e);
+			 T6l = T46 + T47;
+			 T49 = FNMS(KP481753674, T3r, KP876306680 * T3C);
+			 T4a = FNMS(KP684547105, T3O, KP728968627 * T3Z);
+			 T6m = T49 + T4a;
+			 T48 = T46 - T47;
+			 T6v = KP559016994 * (T6l - T6m);
+			 T4b = T49 - T4a;
+			 T6n = T6l + T6m;
+		    }
+		    ri[WS(rs, 1)] = T2v + T42;
+		    ii[WS(rs, 1)] = T6n + T6u;
+		    ri[WS(rs, 4)] = T4f + T4u;
+		    ii[WS(rs, 4)] = T6F + T6G;
+		    {
+			 E T4c, T4e, T45, T4d, T44;
+			 T4c = FMA(KP951056516, T48, KP587785252 * T4b);
+			 T4e = FNMS(KP587785252, T48, KP951056516 * T4b);
+			 T44 = FNMS(KP250000000, T42, T2v);
+			 T45 = T43 + T44;
+			 T4d = T44 - T43;
+			 ri[WS(rs, 21)] = T45 - T4c;
+			 ri[WS(rs, 16)] = T4d + T4e;
+			 ri[WS(rs, 6)] = T45 + T4c;
+			 ri[WS(rs, 11)] = T4d - T4e;
+		    }
+		    {
+			 E T6A, T6B, T6x, T6C, T6w;
+			 T6A = FMA(KP951056516, T6y, KP587785252 * T6z);
+			 T6B = FNMS(KP587785252, T6y, KP951056516 * T6z);
+			 T6w = FNMS(KP250000000, T6n, T6u);
+			 T6x = T6v + T6w;
+			 T6C = T6w - T6v;
+			 ii[WS(rs, 6)] = T6x - T6A;
+			 ii[WS(rs, 16)] = T6C - T6B;
+			 ii[WS(rs, 21)] = T6A + T6x;
+			 ii[WS(rs, 11)] = T6B + T6C;
+		    }
+		    {
+			 E T4E, T4G, T4x, T4F, T4w;
+			 T4E = FMA(KP951056516, T4A, KP587785252 * T4D);
+			 T4G = FNMS(KP587785252, T4A, KP951056516 * T4D);
+			 T4w = FNMS(KP250000000, T4u, T4f);
+			 T4x = T4v + T4w;
+			 T4F = T4w - T4v;
+			 ri[WS(rs, 24)] = T4x - T4E;
+			 ri[WS(rs, 19)] = T4F + T4G;
+			 ri[WS(rs, 9)] = T4x + T4E;
+			 ri[WS(rs, 14)] = T4F - T4G;
+		    }
+		    {
+			 E T6M, T6N, T6J, T6O, T6I;
+			 T6M = FMA(KP951056516, T6K, KP587785252 * T6L);
+			 T6N = FNMS(KP587785252, T6K, KP951056516 * T6L);
+			 T6I = FNMS(KP250000000, T6F, T6G);
+			 T6J = T6H + T6I;
+			 T6O = T6I - T6H;
+			 ii[WS(rs, 9)] = T6J - T6M;
+			 ii[WS(rs, 19)] = T6O - T6N;
+			 ii[WS(rs, 24)] = T6M + T6J;
+			 ii[WS(rs, 14)] = T6N + T6O;
+		    }
+	       }
+	       {
+		    E T4J, T5r, T6U, T76, T5e, T6Z, T5f, T6Y, T5M, T77, T5P, T75, T5G, T7b, T5H;
+		    E T7a, T5k, T6V, T5n, T6R, T4H, T6T;
+		    T4H = T2m - T2l;
+		    T4J = T4H - T4I;
+		    T5r = T4H + T4I;
+		    T6T = T6p - T6o;
+		    T6U = T6S + T6T;
+		    T76 = T6T - T6S;
+		    {
+			 E T4Q, T4X, T4Y, T55, T5c, T5d;
+			 T4Q = FMA(KP876306680, T4M, KP481753674 * T4P);
+			 T4X = FNMS(KP425779291, T4W, KP904827052 * T4T);
+			 T4Y = T4Q + T4X;
+			 T55 = FMA(KP535826794, T51, KP844327925 * T54);
+			 T5c = FMA(KP062790519, T58, KP998026728 * T5b);
+			 T5d = T55 + T5c;
+			 T5e = T4Y + T5d;
+			 T6Z = T55 - T5c;
+			 T5f = KP559016994 * (T4Y - T5d);
+			 T6Y = T4Q - T4X;
+		    }
+		    {
+			 E T5K, T5L, T73, T5N, T5O, T74;
+			 T5K = FNMS(KP684547105, T5s, KP728968627 * T5t);
+			 T5L = FMA(KP125333233, T5w, KP992114701 * T5v);
+			 T73 = T5K - T5L;
+			 T5N = FNMS(KP998026728, T5z, KP062790519 * T5A);
+			 T5O = FMA(KP770513242, T5D, KP637423989 * T5C);
+			 T74 = T5N - T5O;
+			 T5M = T5K + T5L;
+			 T77 = KP559016994 * (T73 - T74);
+			 T5P = T5N + T5O;
+			 T75 = T73 + T74;
+		    }
+		    {
+			 E T5u, T5x, T5y, T5B, T5E, T5F;
+			 T5u = FMA(KP728968627, T5s, KP684547105 * T5t);
+			 T5x = FNMS(KP992114701, T5w, KP125333233 * T5v);
+			 T5y = T5u + T5x;
+			 T5B = FMA(KP062790519, T5z, KP998026728 * T5A);
+			 T5E = FNMS(KP637423989, T5D, KP770513242 * T5C);
+			 T5F = T5B + T5E;
+			 T5G = T5y + T5F;
+			 T7b = T5B - T5E;
+			 T5H = KP559016994 * (T5y - T5F);
+			 T7a = T5u - T5x;
+		    }
+		    {
+			 E T5i, T5j, T6P, T5l, T5m, T6Q;
+			 T5i = FNMS(KP481753674, T4M, KP876306680 * T4P);
+			 T5j = FMA(KP904827052, T4W, KP425779291 * T4T);
+			 T6P = T5i - T5j;
+			 T5l = FNMS(KP844327925, T51, KP535826794 * T54);
+			 T5m = FNMS(KP998026728, T58, KP062790519 * T5b);
+			 T6Q = T5l + T5m;
+			 T5k = T5i + T5j;
+			 T6V = KP559016994 * (T6P - T6Q);
+			 T5n = T5l - T5m;
+			 T6R = T6P + T6Q;
+		    }
+		    ri[WS(rs, 2)] = T4J + T5e;
+		    ii[WS(rs, 2)] = T6R + T6U;
+		    ri[WS(rs, 3)] = T5r + T5G;
+		    ii[WS(rs, 3)] = T75 + T76;
+		    {
+			 E T5o, T5q, T5h, T5p, T5g;
+			 T5o = FMA(KP951056516, T5k, KP587785252 * T5n);
+			 T5q = FNMS(KP587785252, T5k, KP951056516 * T5n);
+			 T5g = FNMS(KP250000000, T5e, T4J);
+			 T5h = T5f + T5g;
+			 T5p = T5g - T5f;
+			 ri[WS(rs, 22)] = T5h - T5o;
+			 ri[WS(rs, 17)] = T5p + T5q;
+			 ri[WS(rs, 7)] = T5h + T5o;
+			 ri[WS(rs, 12)] = T5p - T5q;
+		    }
+		    {
+			 E T70, T71, T6X, T72, T6W;
+			 T70 = FMA(KP951056516, T6Y, KP587785252 * T6Z);
+			 T71 = FNMS(KP587785252, T6Y, KP951056516 * T6Z);
+			 T6W = FNMS(KP250000000, T6R, T6U);
+			 T6X = T6V + T6W;
+			 T72 = T6W - T6V;
+			 ii[WS(rs, 7)] = T6X - T70;
+			 ii[WS(rs, 17)] = T72 - T71;
+			 ii[WS(rs, 22)] = T70 + T6X;
+			 ii[WS(rs, 12)] = T71 + T72;
+		    }
+		    {
+			 E T5Q, T5S, T5J, T5R, T5I;
+			 T5Q = FMA(KP951056516, T5M, KP587785252 * T5P);
+			 T5S = FNMS(KP587785252, T5M, KP951056516 * T5P);
+			 T5I = FNMS(KP250000000, T5G, T5r);
+			 T5J = T5H + T5I;
+			 T5R = T5I - T5H;
+			 ri[WS(rs, 23)] = T5J - T5Q;
+			 ri[WS(rs, 18)] = T5R + T5S;
+			 ri[WS(rs, 8)] = T5J + T5Q;
+			 ri[WS(rs, 13)] = T5R - T5S;
+		    }
+		    {
+			 E T7c, T7d, T79, T7e, T78;
+			 T7c = FMA(KP951056516, T7a, KP587785252 * T7b);
+			 T7d = FNMS(KP587785252, T7a, KP951056516 * T7b);
+			 T78 = FNMS(KP250000000, T75, T76);
+			 T79 = T77 + T78;
+			 T7e = T78 - T77;
+			 ii[WS(rs, 8)] = T79 - T7c;
+			 ii[WS(rs, 18)] = T7e - T7d;
+			 ii[WS(rs, 23)] = T7c + T79;
+			 ii[WS(rs, 13)] = T7d + T7e;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 25, "t1_25", twinstr, &GENUS, {260, 140, 140, 0}, 0, 0, 0 };
+
+void X(codelet_t1_25) (planner *p) {
+     X(kdft_dit_register) (p, t1_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include t.h */
+
+/*
+ * This function contains 16 FP additions, 14 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 10 fused multiply/add),
+ * 21 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t.h"
+
+static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E T1, Tm, T9, Tc, Tb, Th, T7, Ti, Ta, Tj, Td;
+	       T1 = ri[0];
+	       Tm = ii[0];
+	       {
+		    E T3, T6, T2, T5, Tg, T4, T8;
+		    T3 = ri[WS(rs, 1)];
+		    T6 = ii[WS(rs, 1)];
+		    T2 = W[0];
+		    T5 = W[1];
+		    T9 = ri[WS(rs, 2)];
+		    Tc = ii[WS(rs, 2)];
+		    Tg = T2 * T6;
+		    T4 = T2 * T3;
+		    T8 = W[2];
+		    Tb = W[3];
+		    Th = FNMS(T5, T3, Tg);
+		    T7 = FMA(T5, T6, T4);
+		    Ti = T8 * Tc;
+		    Ta = T8 * T9;
+	       }
+	       Tj = FNMS(Tb, T9, Ti);
+	       Td = FMA(Tb, Tc, Ta);
+	       {
+		    E Tk, Te, To, Tn, Tl, Tf;
+		    Tk = Th - Tj;
+		    Tl = Th + Tj;
+		    Te = T7 + Td;
+		    To = Td - T7;
+		    ii[0] = Tl + Tm;
+		    Tn = FNMS(KP500000000, Tl, Tm);
+		    ri[0] = T1 + Te;
+		    Tf = FNMS(KP500000000, Te, T1);
+		    ii[WS(rs, 1)] = FMA(KP866025403, To, Tn);
+		    ii[WS(rs, 2)] = FNMS(KP866025403, To, Tn);
+		    ri[WS(rs, 2)] = FNMS(KP866025403, Tk, Tf);
+		    ri[WS(rs, 1)] = FMA(KP866025403, Tk, Tf);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, {6, 4, 10, 0}, 0, 0, 0 };
+
+void X(codelet_t1_3) (planner *p) {
+     X(kdft_dit_register) (p, t1_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include t.h */
+
+/*
+ * This function contains 16 FP additions, 12 FP multiplications,
+ * (or, 10 additions, 6 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t.h"
+
+static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E T1, Ti, T6, Te, Tb, Tf, Tc, Th;
+	       T1 = ri[0];
+	       Ti = ii[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = ri[WS(rs, 1)];
+		    T5 = ii[WS(rs, 1)];
+		    T2 = W[0];
+		    T4 = W[1];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    Te = FNMS(T4, T3, T2 * T5);
+	       }
+	       {
+		    E T8, Ta, T7, T9;
+		    T8 = ri[WS(rs, 2)];
+		    Ta = ii[WS(rs, 2)];
+		    T7 = W[2];
+		    T9 = W[3];
+		    Tb = FMA(T7, T8, T9 * Ta);
+		    Tf = FNMS(T9, T8, T7 * Ta);
+	       }
+	       Tc = T6 + Tb;
+	       Th = Te + Tf;
+	       ri[0] = T1 + Tc;
+	       ii[0] = Th + Ti;
+	       {
+		    E Td, Tg, Tj, Tk;
+		    Td = FNMS(KP500000000, Tc, T1);
+		    Tg = KP866025403 * (Te - Tf);
+		    ri[WS(rs, 2)] = Td - Tg;
+		    ri[WS(rs, 1)] = Td + Tg;
+		    Tj = KP866025403 * (Tb - T6);
+		    Tk = FNMS(KP500000000, Th, Ti);
+		    ii[WS(rs, 1)] = Tj + Tk;
+		    ii[WS(rs, 2)] = Tk - Tj;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, {10, 6, 6, 0}, 0, 0, 0 };
+
+void X(codelet_t1_3) (planner *p) {
+     X(kdft_dit_register) (p, t1_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1771 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:51 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include t.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 135 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "t.h"
+
+static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T90, T8Z;
+	       {
+		    E T8x, T87, T8, T3w, T83, T3B, T8y, Tl, T6F, Tz, T3J, T5T, T6G, TM, T3Q;
+		    E T5U, T46, T5Y, T7D, T6L, T5X, T3Z, T6M, T1f, T7E, T6R, T60, T4e, T6O, T1G;
+		    E T61, T4l, T78, T7N, T54, T6f, T32, T7b, T6c, T5r, T6X, T7I, T4v, T68, T29;
+		    E T70, T65, T4S, T5s, T5b, T7O, T7e, T79, T3t, T5t, T5i, T4H, T2y, T4A, T71;
+		    E T2m, T4B, T4F, T2s;
+		    {
+			 E T44, T1d, T3X, T6J, T11, T40, T42, T17, T5h, T5c;
+			 {
+			      E Ta, Td, Tg, T3x, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T1, T86, T3, T6, T2, T5;
+				   T1 = ri[0];
+				   T86 = ii[0];
+				   T3 = ri[WS(rs, 16)];
+				   T6 = ii[WS(rs, 16)];
+				   T2 = W[30];
+				   T5 = W[31];
+				   {
+					E T84, T4, T9, T85, T7;
+					Ta = ri[WS(rs, 8)];
+					Td = ii[WS(rs, 8)];
+					T84 = T2 * T6;
+					T4 = T2 * T3;
+					T9 = W[14];
+					Tg = ri[WS(rs, 24)];
+					T85 = FNMS(T5, T3, T84);
+					T7 = FMA(T5, T6, T4);
+					T3x = T9 * Td;
+					Tb = T9 * Ta;
+					T8x = T86 - T85;
+					T87 = T85 + T86;
+					T8 = T1 + T7;
+					T3w = T1 - T7;
+					Tj = ii[WS(rs, 24)];
+					Tf = W[46];
+				   }
+				   Tc = W[15];
+				   Ti = W[47];
+			      }
+			      {
+				   E Tu, Tx, T3F, Ts, Tw, T3G, Tv;
+				   {
+					E To, Tr, Tp, T3E, Tq, Tt;
+					{
+					     E T3y, Te, T3A, Tk, T3z, Th, Tn;
+					     To = ri[WS(rs, 4)];
+					     T3z = Tf * Tj;
+					     Th = Tf * Tg;
+					     T3y = FNMS(Tc, Ta, T3x);
+					     Te = FMA(Tc, Td, Tb);
+					     T3A = FNMS(Ti, Tg, T3z);
+					     Tk = FMA(Ti, Tj, Th);
+					     Tr = ii[WS(rs, 4)];
+					     Tn = W[6];
+					     T83 = T3y + T3A;
+					     T3B = T3y - T3A;
+					     T8y = Te - Tk;
+					     Tl = Te + Tk;
+					     Tp = Tn * To;
+					     T3E = Tn * Tr;
+					}
+					Tq = W[7];
+					Tu = ri[WS(rs, 20)];
+					Tx = ii[WS(rs, 20)];
+					Tt = W[38];
+					T3F = FNMS(Tq, To, T3E);
+					Ts = FMA(Tq, Tr, Tp);
+					Tw = W[39];
+					T3G = Tt * Tx;
+					Tv = Tt * Tu;
+				   }
+				   {
+					E T3M, TF, TH, TK, TG, TJ, TE, TD, TC;
+					{
+					     E TB, T3H, Ty, TA, T3I, T3D, T3L;
+					     TB = ri[WS(rs, 28)];
+					     TE = ii[WS(rs, 28)];
+					     T3H = FNMS(Tw, Tu, T3G);
+					     Ty = FMA(Tw, Tx, Tv);
+					     TA = W[54];
+					     TD = W[55];
+					     T6F = T3F + T3H;
+					     T3I = T3F - T3H;
+					     Tz = Ts + Ty;
+					     T3D = Ts - Ty;
+					     T3L = TA * TE;
+					     TC = TA * TB;
+					     T3J = T3D + T3I;
+					     T5T = T3I - T3D;
+					     T3M = FNMS(TD, TB, T3L);
+					}
+					TF = FMA(TD, TE, TC);
+					TH = ri[WS(rs, 12)];
+					TK = ii[WS(rs, 12)];
+					TG = W[22];
+					TJ = W[23];
+					{
+					     E TU, T3U, T13, T16, T3W, T10, T12, T15, T41, T14;
+					     {
+						  E T19, T1c, T18, T1b, T3P, T3K;
+						  {
+						       E TQ, TT, T3N, TI, TP, TS;
+						       TQ = ri[WS(rs, 2)];
+						       TT = ii[WS(rs, 2)];
+						       T3N = TG * TK;
+						       TI = TG * TH;
+						       TP = W[2];
+						       TS = W[3];
+						       {
+							    E T3O, TL, T3T, TR;
+							    T3O = FNMS(TJ, TH, T3N);
+							    TL = FMA(TJ, TK, TI);
+							    T3T = TP * TT;
+							    TR = TP * TQ;
+							    T6G = T3M + T3O;
+							    T3P = T3M - T3O;
+							    TM = TF + TL;
+							    T3K = TF - TL;
+							    TU = FMA(TS, TT, TR);
+							    T3U = FNMS(TS, TQ, T3T);
+						       }
+						  }
+						  T3Q = T3K - T3P;
+						  T5U = T3K + T3P;
+						  T19 = ri[WS(rs, 26)];
+						  T1c = ii[WS(rs, 26)];
+						  T18 = W[50];
+						  T1b = W[51];
+						  {
+						       E TW, TZ, TY, T3V, TX, T43, T1a, TV;
+						       TW = ri[WS(rs, 18)];
+						       TZ = ii[WS(rs, 18)];
+						       T43 = T18 * T1c;
+						       T1a = T18 * T19;
+						       TV = W[34];
+						       TY = W[35];
+						       T44 = FNMS(T1b, T19, T43);
+						       T1d = FMA(T1b, T1c, T1a);
+						       T3V = TV * TZ;
+						       TX = TV * TW;
+						       T13 = ri[WS(rs, 10)];
+						       T16 = ii[WS(rs, 10)];
+						       T3W = FNMS(TY, TW, T3V);
+						       T10 = FMA(TY, TZ, TX);
+						       T12 = W[18];
+						       T15 = W[19];
+						  }
+					     }
+					     T3X = T3U - T3W;
+					     T6J = T3U + T3W;
+					     T11 = TU + T10;
+					     T40 = TU - T10;
+					     T41 = T12 * T16;
+					     T14 = T12 * T13;
+					     T42 = FNMS(T15, T13, T41);
+					     T17 = FMA(T15, T16, T14);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T49, T1l, T4j, T1E, T1u, T1x, T1w, T4b, T1r, T4g, T1v;
+			      {
+				   E T1A, T1D, T1C, T4i, T1B;
+				   {
+					E T1h, T1k, T1g, T1j, T48, T1i, T1z;
+					T1h = ri[WS(rs, 30)];
+					T1k = ii[WS(rs, 30)];
+					{
+					     E T6K, T45, T1e, T3Y;
+					     T6K = T42 + T44;
+					     T45 = T42 - T44;
+					     T1e = T17 + T1d;
+					     T3Y = T17 - T1d;
+					     T46 = T40 + T45;
+					     T5Y = T40 - T45;
+					     T7D = T6J + T6K;
+					     T6L = T6J - T6K;
+					     T5X = T3X + T3Y;
+					     T3Z = T3X - T3Y;
+					     T6M = T11 - T1e;
+					     T1f = T11 + T1e;
+					     T1g = W[58];
+					}
+					T1j = W[59];
+					T1A = ri[WS(rs, 22)];
+					T1D = ii[WS(rs, 22)];
+					T48 = T1g * T1k;
+					T1i = T1g * T1h;
+					T1z = W[42];
+					T1C = W[43];
+					T49 = FNMS(T1j, T1h, T48);
+					T1l = FMA(T1j, T1k, T1i);
+					T4i = T1z * T1D;
+					T1B = T1z * T1A;
+				   }
+				   {
+					E T1n, T1q, T1m, T1p, T4a, T1o, T1t;
+					T1n = ri[WS(rs, 14)];
+					T1q = ii[WS(rs, 14)];
+					T4j = FNMS(T1C, T1A, T4i);
+					T1E = FMA(T1C, T1D, T1B);
+					T1m = W[26];
+					T1p = W[27];
+					T1u = ri[WS(rs, 6)];
+					T1x = ii[WS(rs, 6)];
+					T4a = T1m * T1q;
+					T1o = T1m * T1n;
+					T1t = W[10];
+					T1w = W[11];
+					T4b = FNMS(T1p, T1n, T4a);
+					T1r = FMA(T1p, T1q, T1o);
+					T4g = T1t * T1x;
+					T1v = T1t * T1u;
+				   }
+			      }
+			      {
+				   E T4c, T6P, T1s, T4f, T4h, T1y;
+				   T4c = T49 - T4b;
+				   T6P = T49 + T4b;
+				   T1s = T1l + T1r;
+				   T4f = T1l - T1r;
+				   T4h = FNMS(T1w, T1u, T4g);
+				   T1y = FMA(T1w, T1x, T1v);
+				   {
+					E T4k, T6Q, T4d, T1F;
+					T4k = T4h - T4j;
+					T6Q = T4h + T4j;
+					T4d = T1y - T1E;
+					T1F = T1y + T1E;
+					T7E = T6P + T6Q;
+					T6R = T6P - T6Q;
+					T60 = T4c + T4d;
+					T4e = T4c - T4d;
+					T6O = T1s - T1F;
+					T1G = T1s + T1F;
+					T61 = T4f - T4k;
+					T4l = T4f + T4k;
+				   }
+			      }
+			 }
+			 {
+			      E T4Z, T2H, T5p, T30, T2Q, T2T, T2S, T51, T2N, T5m, T2R;
+			      {
+				   E T2W, T2Z, T2Y, T5o, T2X;
+				   {
+					E T2D, T2G, T2C, T2F, T4Y, T2E, T2V;
+					T2D = ri[WS(rs, 31)];
+					T2G = ii[WS(rs, 31)];
+					T2C = W[60];
+					T2F = W[61];
+					T2W = ri[WS(rs, 23)];
+					T2Z = ii[WS(rs, 23)];
+					T4Y = T2C * T2G;
+					T2E = T2C * T2D;
+					T2V = W[44];
+					T2Y = W[45];
+					T4Z = FNMS(T2F, T2D, T4Y);
+					T2H = FMA(T2F, T2G, T2E);
+					T5o = T2V * T2Z;
+					T2X = T2V * T2W;
+				   }
+				   {
+					E T2J, T2M, T2I, T2L, T50, T2K, T2P;
+					T2J = ri[WS(rs, 15)];
+					T2M = ii[WS(rs, 15)];
+					T5p = FNMS(T2Y, T2W, T5o);
+					T30 = FMA(T2Y, T2Z, T2X);
+					T2I = W[28];
+					T2L = W[29];
+					T2Q = ri[WS(rs, 7)];
+					T2T = ii[WS(rs, 7)];
+					T50 = T2I * T2M;
+					T2K = T2I * T2J;
+					T2P = W[12];
+					T2S = W[13];
+					T51 = FNMS(T2L, T2J, T50);
+					T2N = FMA(T2L, T2M, T2K);
+					T5m = T2P * T2T;
+					T2R = T2P * T2Q;
+				   }
+			      }
+			      {
+				   E T52, T76, T2O, T5l, T5n, T2U;
+				   T52 = T4Z - T51;
+				   T76 = T4Z + T51;
+				   T2O = T2H + T2N;
+				   T5l = T2H - T2N;
+				   T5n = FNMS(T2S, T2Q, T5m);
+				   T2U = FMA(T2S, T2T, T2R);
+				   {
+					E T5q, T77, T53, T31;
+					T5q = T5n - T5p;
+					T77 = T5n + T5p;
+					T53 = T2U - T30;
+					T31 = T2U + T30;
+					T78 = T76 - T77;
+					T7N = T76 + T77;
+					T54 = T52 - T53;
+					T6f = T52 + T53;
+					T32 = T2O + T31;
+					T7b = T2O - T31;
+					T6c = T5l - T5q;
+					T5r = T5l + T5q;
+				   }
+			      }
+			 }
+			 {
+			      E T4q, T1O, T4Q, T27, T1X, T20, T1Z, T4s, T1U, T4N, T1Y;
+			      {
+				   E T23, T26, T25, T4P, T24;
+				   {
+					E T1K, T1N, T1J, T1M, T4p, T1L, T22;
+					T1K = ri[WS(rs, 1)];
+					T1N = ii[WS(rs, 1)];
+					T1J = W[0];
+					T1M = W[1];
+					T23 = ri[WS(rs, 25)];
+					T26 = ii[WS(rs, 25)];
+					T4p = T1J * T1N;
+					T1L = T1J * T1K;
+					T22 = W[48];
+					T25 = W[49];
+					T4q = FNMS(T1M, T1K, T4p);
+					T1O = FMA(T1M, T1N, T1L);
+					T4P = T22 * T26;
+					T24 = T22 * T23;
+				   }
+				   {
+					E T1Q, T1T, T1P, T1S, T4r, T1R, T1W;
+					T1Q = ri[WS(rs, 17)];
+					T1T = ii[WS(rs, 17)];
+					T4Q = FNMS(T25, T23, T4P);
+					T27 = FMA(T25, T26, T24);
+					T1P = W[32];
+					T1S = W[33];
+					T1X = ri[WS(rs, 9)];
+					T20 = ii[WS(rs, 9)];
+					T4r = T1P * T1T;
+					T1R = T1P * T1Q;
+					T1W = W[16];
+					T1Z = W[17];
+					T4s = FNMS(T1S, T1Q, T4r);
+					T1U = FMA(T1S, T1T, T1R);
+					T4N = T1W * T20;
+					T1Y = T1W * T1X;
+				   }
+			      }
+			      {
+				   E T4t, T6V, T1V, T4M, T4O, T21;
+				   T4t = T4q - T4s;
+				   T6V = T4q + T4s;
+				   T1V = T1O + T1U;
+				   T4M = T1O - T1U;
+				   T4O = FNMS(T1Z, T1X, T4N);
+				   T21 = FMA(T1Z, T20, T1Y);
+				   {
+					E T4R, T6W, T4u, T28;
+					T4R = T4O - T4Q;
+					T6W = T4O + T4Q;
+					T4u = T21 - T27;
+					T28 = T21 + T27;
+					T6X = T6V - T6W;
+					T7I = T6V + T6W;
+					T4v = T4t - T4u;
+					T68 = T4t + T4u;
+					T29 = T1V + T28;
+					T70 = T1V - T28;
+					T65 = T4M - T4R;
+					T4S = T4M + T4R;
+				   }
+			      }
+			 }
+			 {
+			      E T56, T38, T5g, T3r, T3h, T3k, T3j, T58, T3e, T5d, T3i;
+			      {
+				   E T3n, T3q, T3p, T5f, T3o;
+				   {
+					E T34, T37, T33, T36, T55, T35, T3m;
+					T34 = ri[WS(rs, 3)];
+					T37 = ii[WS(rs, 3)];
+					T33 = W[4];
+					T36 = W[5];
+					T3n = ri[WS(rs, 11)];
+					T3q = ii[WS(rs, 11)];
+					T55 = T33 * T37;
+					T35 = T33 * T34;
+					T3m = W[20];
+					T3p = W[21];
+					T56 = FNMS(T36, T34, T55);
+					T38 = FMA(T36, T37, T35);
+					T5f = T3m * T3q;
+					T3o = T3m * T3n;
+				   }
+				   {
+					E T3a, T3d, T39, T3c, T57, T3b, T3g;
+					T3a = ri[WS(rs, 19)];
+					T3d = ii[WS(rs, 19)];
+					T5g = FNMS(T3p, T3n, T5f);
+					T3r = FMA(T3p, T3q, T3o);
+					T39 = W[36];
+					T3c = W[37];
+					T3h = ri[WS(rs, 27)];
+					T3k = ii[WS(rs, 27)];
+					T57 = T39 * T3d;
+					T3b = T39 * T3a;
+					T3g = W[52];
+					T3j = W[53];
+					T58 = FNMS(T3c, T3a, T57);
+					T3e = FMA(T3c, T3d, T3b);
+					T5d = T3g * T3k;
+					T3i = T3g * T3h;
+				   }
+			      }
+			      {
+				   E T59, T7c, T3f, T5a, T5e, T3l, T7d, T3s;
+				   T59 = T56 - T58;
+				   T7c = T56 + T58;
+				   T3f = T38 + T3e;
+				   T5a = T38 - T3e;
+				   T5e = FNMS(T3j, T3h, T5d);
+				   T3l = FMA(T3j, T3k, T3i);
+				   T5h = T5e - T5g;
+				   T7d = T5e + T5g;
+				   T3s = T3l + T3r;
+				   T5c = T3l - T3r;
+				   T5s = T5a + T59;
+				   T5b = T59 - T5a;
+				   T7O = T7c + T7d;
+				   T7e = T7c - T7d;
+				   T79 = T3s - T3f;
+				   T3t = T3f + T3s;
+			      }
+			 }
+			 {
+			      E T4x, T2f, T2o, T2r, T4z, T2l, T2n, T2q, T4E, T2p;
+			      {
+				   E T2u, T2x, T2t, T2w;
+				   {
+					E T2b, T2e, T2d, T4w, T2c, T2a;
+					T2b = ri[WS(rs, 5)];
+					T2e = ii[WS(rs, 5)];
+					T2a = W[8];
+					T5t = T5c - T5h;
+					T5i = T5c + T5h;
+					T2d = W[9];
+					T4w = T2a * T2e;
+					T2c = T2a * T2b;
+					T2u = ri[WS(rs, 13)];
+					T2x = ii[WS(rs, 13)];
+					T4x = FNMS(T2d, T2b, T4w);
+					T2f = FMA(T2d, T2e, T2c);
+					T2t = W[24];
+					T2w = W[25];
+				   }
+				   {
+					E T2h, T2k, T2j, T4y, T2i, T4G, T2v, T2g;
+					T2h = ri[WS(rs, 21)];
+					T2k = ii[WS(rs, 21)];
+					T4G = T2t * T2x;
+					T2v = T2t * T2u;
+					T2g = W[40];
+					T2j = W[41];
+					T4H = FNMS(T2w, T2u, T4G);
+					T2y = FMA(T2w, T2x, T2v);
+					T4y = T2g * T2k;
+					T2i = T2g * T2h;
+					T2o = ri[WS(rs, 29)];
+					T2r = ii[WS(rs, 29)];
+					T4z = FNMS(T2j, T2h, T4y);
+					T2l = FMA(T2j, T2k, T2i);
+					T2n = W[56];
+					T2q = W[57];
+				   }
+			      }
+			      T4A = T4x - T4z;
+			      T71 = T4x + T4z;
+			      T2m = T2f + T2l;
+			      T4B = T2f - T2l;
+			      T4E = T2n * T2r;
+			      T2p = T2n * T2o;
+			      T4F = FNMS(T2q, T2o, T4E);
+			      T2s = FMA(T2q, T2r, T2p);
+			 }
+		    }
+		    {
+			 E T4T, T4C, T4J, T4U, T7y, T8q, T8p, T7B;
+			 {
+			      E T6E, T8j, T73, T6Y, T6H, T8k, T8i, T8h;
+			      {
+				   E T7C, TO, T80, T7Z, T8e, T89, T8d, T1H, T8b, T3v, T7T, T7L, T7U, T7Q, T2A;
+				   E T7K, T7P, T7W, T1I;
+				   {
+					E T7X, T7Y, T7J, T82, T88;
+					{
+					     E Tm, T4I, T72, T4D, T2z, TN;
+					     T6E = T8 - Tl;
+					     Tm = T8 + Tl;
+					     T4T = T4B + T4A;
+					     T4C = T4A - T4B;
+					     T4I = T4F - T4H;
+					     T72 = T4F + T4H;
+					     T4D = T2s - T2y;
+					     T2z = T2s + T2y;
+					     TN = Tz + TM;
+					     T8j = TM - Tz;
+					     T73 = T71 - T72;
+					     T7J = T71 + T72;
+					     T4J = T4D + T4I;
+					     T4U = T4D - T4I;
+					     T2A = T2m + T2z;
+					     T6Y = T2z - T2m;
+					     T7C = Tm - TN;
+					     TO = Tm + TN;
+					}
+					T7K = T7I - T7J;
+					T7X = T7I + T7J;
+					T7Y = T7N + T7O;
+					T7P = T7N - T7O;
+					T6H = T6F - T6G;
+					T82 = T6F + T6G;
+					T88 = T83 + T87;
+					T8k = T87 - T83;
+					T80 = T7X + T7Y;
+					T7Z = T7X - T7Y;
+					T8e = T88 - T82;
+					T89 = T82 + T88;
+				   }
+				   {
+					E T7H, T7M, T2B, T3u;
+					T7H = T29 - T2A;
+					T2B = T29 + T2A;
+					T3u = T32 + T3t;
+					T7M = T32 - T3t;
+					T8d = T1G - T1f;
+					T1H = T1f + T1G;
+					T8b = T3u - T2B;
+					T3v = T2B + T3u;
+					T7T = T7K - T7H;
+					T7L = T7H + T7K;
+					T7U = T7M + T7P;
+					T7Q = T7M - T7P;
+				   }
+				   T7W = TO - T1H;
+				   T1I = TO + T1H;
+				   {
+					E T7S, T8f, T8g, T7V;
+					{
+					     E T7R, T8c, T8a, T7G, T81, T7F;
+					     T8i = T7Q - T7L;
+					     T7R = T7L + T7Q;
+					     T81 = T7D + T7E;
+					     T7F = T7D - T7E;
+					     ri[0] = T1I + T3v;
+					     ri[WS(rs, 16)] = T1I - T3v;
+					     ri[WS(rs, 8)] = T7W + T7Z;
+					     ri[WS(rs, 24)] = T7W - T7Z;
+					     T8c = T89 - T81;
+					     T8a = T81 + T89;
+					     T7G = T7C + T7F;
+					     T7S = T7C - T7F;
+					     T8h = T8e - T8d;
+					     T8f = T8d + T8e;
+					     ii[WS(rs, 24)] = T8c - T8b;
+					     ii[WS(rs, 8)] = T8b + T8c;
+					     ii[WS(rs, 16)] = T8a - T80;
+					     ii[0] = T80 + T8a;
+					     ri[WS(rs, 4)] = FMA(KP707106781, T7R, T7G);
+					     ri[WS(rs, 20)] = FNMS(KP707106781, T7R, T7G);
+					     T8g = T7T + T7U;
+					     T7V = T7T - T7U;
+					}
+					ii[WS(rs, 20)] = FNMS(KP707106781, T8g, T8f);
+					ii[WS(rs, 4)] = FMA(KP707106781, T8g, T8f);
+					ri[WS(rs, 12)] = FMA(KP707106781, T7V, T7S);
+					ri[WS(rs, 28)] = FNMS(KP707106781, T7V, T7S);
+				   }
+			      }
+			      {
+				   E T7f, T7m, T6I, T7a, T7A, T7w, T8r, T8l, T8m, T6T, T7j, T75, T8s, T7p, T7z;
+				   E T7t;
+				   {
+					E T7n, T6N, T6S, T7o, T7u, T7v;
+					T7f = T7b - T7e;
+					T7u = T7b + T7e;
+					ii[WS(rs, 28)] = FNMS(KP707106781, T8i, T8h);
+					ii[WS(rs, 12)] = FMA(KP707106781, T8i, T8h);
+					T7m = T6E + T6H;
+					T6I = T6E - T6H;
+					T7v = T78 + T79;
+					T7a = T78 - T79;
+					T7n = T6M + T6L;
+					T6N = T6L - T6M;
+					T7A = FMA(KP414213562, T7u, T7v);
+					T7w = FNMS(KP414213562, T7v, T7u);
+					T8r = T8k - T8j;
+					T8l = T8j + T8k;
+					T6S = T6O + T6R;
+					T7o = T6O - T6R;
+					{
+					     E T7s, T7r, T6Z, T74;
+					     T7s = T6X + T6Y;
+					     T6Z = T6X - T6Y;
+					     T74 = T70 - T73;
+					     T7r = T70 + T73;
+					     T8m = T6N + T6S;
+					     T6T = T6N - T6S;
+					     T7j = FNMS(KP414213562, T6Z, T74);
+					     T75 = FMA(KP414213562, T74, T6Z);
+					     T8s = T7o - T7n;
+					     T7p = T7n + T7o;
+					     T7z = FNMS(KP414213562, T7r, T7s);
+					     T7t = FMA(KP414213562, T7s, T7r);
+					}
+				   }
+				   {
+					E T7i, T6U, T8t, T8v, T7k, T7g;
+					T7i = FNMS(KP707106781, T6T, T6I);
+					T6U = FMA(KP707106781, T6T, T6I);
+					T8t = FMA(KP707106781, T8s, T8r);
+					T8v = FNMS(KP707106781, T8s, T8r);
+					T7k = FMA(KP414213562, T7a, T7f);
+					T7g = FNMS(KP414213562, T7f, T7a);
+					{
+					     E T7q, T7x, T8n, T8o;
+					     T7y = FNMS(KP707106781, T7p, T7m);
+					     T7q = FMA(KP707106781, T7p, T7m);
+					     {
+						  E T7l, T8u, T8w, T7h;
+						  T7l = T7j + T7k;
+						  T8u = T7k - T7j;
+						  T8w = T75 + T7g;
+						  T7h = T75 - T7g;
+						  ri[WS(rs, 30)] = FMA(KP923879532, T7l, T7i);
+						  ri[WS(rs, 14)] = FNMS(KP923879532, T7l, T7i);
+						  ii[WS(rs, 22)] = FNMS(KP923879532, T8u, T8t);
+						  ii[WS(rs, 6)] = FMA(KP923879532, T8u, T8t);
+						  ii[WS(rs, 30)] = FMA(KP923879532, T8w, T8v);
+						  ii[WS(rs, 14)] = FNMS(KP923879532, T8w, T8v);
+						  ri[WS(rs, 6)] = FMA(KP923879532, T7h, T6U);
+						  ri[WS(rs, 22)] = FNMS(KP923879532, T7h, T6U);
+						  T7x = T7t + T7w;
+						  T8q = T7w - T7t;
+					     }
+					     T8p = FNMS(KP707106781, T8m, T8l);
+					     T8n = FMA(KP707106781, T8m, T8l);
+					     T8o = T7z + T7A;
+					     T7B = T7z - T7A;
+					     ri[WS(rs, 2)] = FMA(KP923879532, T7x, T7q);
+					     ri[WS(rs, 18)] = FNMS(KP923879532, T7x, T7q);
+					     ii[WS(rs, 18)] = FNMS(KP923879532, T8o, T8n);
+					     ii[WS(rs, 2)] = FMA(KP923879532, T8o, T8n);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5S, T8O, T8N, T5V, T6d, T6g, T66, T69, T8G, T8F;
+			      {
+				   E T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T5k, T5L, T5u, T4K, T4V;
+				   {
+					E T5D, T5E, T8z, T8A, T5j;
+					{
+					     E T3C, T3R, T47, T4m;
+					     T5S = T3w - T3B;
+					     T3C = T3w + T3B;
+					     ri[WS(rs, 10)] = FMA(KP923879532, T7B, T7y);
+					     ri[WS(rs, 26)] = FNMS(KP923879532, T7B, T7y);
+					     ii[WS(rs, 26)] = FNMS(KP923879532, T8q, T8p);
+					     ii[WS(rs, 10)] = FMA(KP923879532, T8q, T8p);
+					     T3R = T3J + T3Q;
+					     T8O = T3Q - T3J;
+					     T5D = FMA(KP414213562, T3Z, T46);
+					     T47 = FNMS(KP414213562, T46, T3Z);
+					     T4m = FMA(KP414213562, T4l, T4e);
+					     T5E = FNMS(KP414213562, T4e, T4l);
+					     T8N = T8y + T8x;
+					     T8z = T8x - T8y;
+					     T5C = FMA(KP707106781, T3R, T3C);
+					     T3S = FNMS(KP707106781, T3R, T3C);
+					     T8C = T47 + T4m;
+					     T4n = T47 - T4m;
+					     T8A = T5T + T5U;
+					     T5V = T5T - T5U;
+					}
+					T6d = T5i - T5b;
+					T5j = T5b + T5i;
+					T8H = FNMS(KP707106781, T8A, T8z);
+					T8B = FMA(KP707106781, T8A, T8z);
+					T8I = T5E - T5D;
+					T5F = T5D + T5E;
+					T5k = FNMS(KP707106781, T5j, T54);
+					T5L = FMA(KP707106781, T5j, T54);
+					T5u = T5s + T5t;
+					T6g = T5s - T5t;
+					T66 = T4J - T4C;
+					T4K = T4C + T4J;
+					T4V = T4T + T4U;
+					T69 = T4T - T4U;
+				   }
+				   {
+					E T5M, T5Q, T5J, T5P, T8L, T8M;
+					{
+					     E T5y, T4o, T5A, T5w, T5z, T4X, T8J, T5K, T5v, T8K, T5B, T5x;
+					     T5y = FNMS(KP923879532, T4n, T3S);
+					     T4o = FMA(KP923879532, T4n, T3S);
+					     T5K = FMA(KP707106781, T5u, T5r);
+					     T5v = FNMS(KP707106781, T5u, T5r);
+					     {
+						  E T5I, T4L, T5H, T4W;
+						  T5I = FMA(KP707106781, T4K, T4v);
+						  T4L = FNMS(KP707106781, T4K, T4v);
+						  T5H = FMA(KP707106781, T4V, T4S);
+						  T4W = FNMS(KP707106781, T4V, T4S);
+						  T5M = FNMS(KP198912367, T5L, T5K);
+						  T5Q = FMA(KP198912367, T5K, T5L);
+						  T5A = FMA(KP668178637, T5k, T5v);
+						  T5w = FNMS(KP668178637, T5v, T5k);
+						  T5J = FMA(KP198912367, T5I, T5H);
+						  T5P = FNMS(KP198912367, T5H, T5I);
+						  T5z = FNMS(KP668178637, T4L, T4W);
+						  T4X = FMA(KP668178637, T4W, T4L);
+					     }
+					     T8J = FMA(KP923879532, T8I, T8H);
+					     T8L = FNMS(KP923879532, T8I, T8H);
+					     T8K = T5A - T5z;
+					     T5B = T5z + T5A;
+					     T8M = T4X + T5w;
+					     T5x = T4X - T5w;
+					     ii[WS(rs, 21)] = FNMS(KP831469612, T8K, T8J);
+					     ii[WS(rs, 5)] = FMA(KP831469612, T8K, T8J);
+					     ri[WS(rs, 5)] = FMA(KP831469612, T5x, T4o);
+					     ri[WS(rs, 21)] = FNMS(KP831469612, T5x, T4o);
+					     ri[WS(rs, 29)] = FMA(KP831469612, T5B, T5y);
+					     ri[WS(rs, 13)] = FNMS(KP831469612, T5B, T5y);
+					}
+					{
+					     E T5O, T8D, T8E, T5R, T5G, T5N;
+					     T5O = FNMS(KP923879532, T5F, T5C);
+					     T5G = FMA(KP923879532, T5F, T5C);
+					     T5N = T5J + T5M;
+					     T8G = T5M - T5J;
+					     T8F = FNMS(KP923879532, T8C, T8B);
+					     T8D = FMA(KP923879532, T8C, T8B);
+					     ii[WS(rs, 29)] = FMA(KP831469612, T8M, T8L);
+					     ii[WS(rs, 13)] = FNMS(KP831469612, T8M, T8L);
+					     ri[WS(rs, 1)] = FMA(KP980785280, T5N, T5G);
+					     ri[WS(rs, 17)] = FNMS(KP980785280, T5N, T5G);
+					     T8E = T5P + T5Q;
+					     T5R = T5P - T5Q;
+					     ii[WS(rs, 17)] = FNMS(KP980785280, T8E, T8D);
+					     ii[WS(rs, 1)] = FMA(KP980785280, T8E, T8D);
+					     ri[WS(rs, 9)] = FMA(KP980785280, T5R, T5O);
+					     ri[WS(rs, 25)] = FNMS(KP980785280, T5R, T5O);
+					}
+				   }
+			      }
+			      {
+				   E T6o, T5W, T8W, T63, T8V, T8P, T8Q, T6r, T67, T6u, T6y, T6C, T6m, T6i;
+				   {
+					E T6p, T5Z, T62, T6q;
+					T6p = FNMS(KP414213562, T5X, T5Y);
+					T5Z = FMA(KP414213562, T5Y, T5X);
+					ii[WS(rs, 25)] = FNMS(KP980785280, T8G, T8F);
+					ii[WS(rs, 9)] = FMA(KP980785280, T8G, T8F);
+					T6o = FNMS(KP707106781, T5V, T5S);
+					T5W = FMA(KP707106781, T5V, T5S);
+					T62 = FNMS(KP414213562, T61, T60);
+					T6q = FMA(KP414213562, T60, T61);
+					T8W = T5Z + T62;
+					T63 = T5Z - T62;
+					T8V = FNMS(KP707106781, T8O, T8N);
+					T8P = FMA(KP707106781, T8O, T8N);
+					{
+					     E T6x, T6e, T6w, T6h;
+					     T8Q = T6q - T6p;
+					     T6r = T6p + T6q;
+					     T6x = FMA(KP707106781, T6d, T6c);
+					     T6e = FNMS(KP707106781, T6d, T6c);
+					     T6w = FMA(KP707106781, T6g, T6f);
+					     T6h = FNMS(KP707106781, T6g, T6f);
+					     T67 = FNMS(KP707106781, T66, T65);
+					     T6u = FMA(KP707106781, T66, T65);
+					     T6y = FNMS(KP198912367, T6x, T6w);
+					     T6C = FMA(KP198912367, T6w, T6x);
+					     T6m = FMA(KP668178637, T6e, T6h);
+					     T6i = FNMS(KP668178637, T6h, T6e);
+					}
+				   }
+				   {
+					E T6k, T64, T8R, T8T, T6t, T6a;
+					T6k = FNMS(KP923879532, T63, T5W);
+					T64 = FMA(KP923879532, T63, T5W);
+					T8R = FMA(KP923879532, T8Q, T8P);
+					T8T = FNMS(KP923879532, T8Q, T8P);
+					T6t = FMA(KP707106781, T69, T68);
+					T6a = FNMS(KP707106781, T69, T68);
+					{
+					     E T6A, T8X, T8Y, T6D;
+					     {
+						  E T6s, T6B, T6l, T6b, T6z, T6v;
+						  T6A = FMA(KP923879532, T6r, T6o);
+						  T6s = FNMS(KP923879532, T6r, T6o);
+						  T6v = FMA(KP198912367, T6u, T6t);
+						  T6B = FNMS(KP198912367, T6t, T6u);
+						  T6l = FNMS(KP668178637, T67, T6a);
+						  T6b = FMA(KP668178637, T6a, T67);
+						  T6z = T6v - T6y;
+						  T90 = T6v + T6y;
+						  T8Z = FMA(KP923879532, T8W, T8V);
+						  T8X = FNMS(KP923879532, T8W, T8V);
+						  {
+						       E T6n, T8S, T8U, T6j;
+						       T6n = T6l - T6m;
+						       T8S = T6l + T6m;
+						       T8U = T6i - T6b;
+						       T6j = T6b + T6i;
+						       ri[WS(rs, 7)] = FMA(KP980785280, T6z, T6s);
+						       ri[WS(rs, 23)] = FNMS(KP980785280, T6z, T6s);
+						       ri[WS(rs, 11)] = FMA(KP831469612, T6n, T6k);
+						       ri[WS(rs, 27)] = FNMS(KP831469612, T6n, T6k);
+						       ii[WS(rs, 19)] = FNMS(KP831469612, T8S, T8R);
+						       ii[WS(rs, 3)] = FMA(KP831469612, T8S, T8R);
+						       ii[WS(rs, 27)] = FNMS(KP831469612, T8U, T8T);
+						       ii[WS(rs, 11)] = FMA(KP831469612, T8U, T8T);
+						       ri[WS(rs, 3)] = FMA(KP831469612, T6j, T64);
+						       ri[WS(rs, 19)] = FNMS(KP831469612, T6j, T64);
+						       T8Y = T6C - T6B;
+						       T6D = T6B + T6C;
+						  }
+					     }
+					     ii[WS(rs, 23)] = FNMS(KP980785280, T8Y, T8X);
+					     ii[WS(rs, 7)] = FMA(KP980785280, T8Y, T8X);
+					     ri[WS(rs, 31)] = FMA(KP980785280, T6D, T6A);
+					     ri[WS(rs, 15)] = FNMS(KP980785280, T6D, T6A);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 31)] = FMA(KP980785280, T90, T8Z);
+	       ii[WS(rs, 15)] = FNMS(KP980785280, T90, T8Z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
+
+void X(codelet_t1_32) (planner *p) {
+     X(kdft_dit_register) (p, t1_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include t.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 96 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "t.h"
+
+static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
+	       E T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
+	       E T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
+	       E T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
+	       E T4m, T5h, T4v, T5e;
+	       {
+		    E T1, T76, T6, T75, Tc, T32, Th, T33;
+		    T1 = ri[0];
+		    T76 = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 16)];
+			 T5 = ii[WS(rs, 16)];
+			 T2 = W[30];
+			 T4 = W[31];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T75 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 8)];
+			 Tb = ii[WS(rs, 8)];
+			 T8 = W[14];
+			 Ta = W[15];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T32 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 24)];
+			 Tg = ii[WS(rs, 24)];
+			 Td = W[46];
+			 Tf = W[47];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T33 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T7A, T7B;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 + Ti;
+			 T5F = T7 - Ti;
+			 T7A = T76 - T75;
+			 T7B = Tc - Th;
+			 T7C = T7A - T7B;
+			 T7Q = T7B + T7A;
+		    }
+		    {
+			 E T31, T34, T74, T77;
+			 T31 = T1 - T6;
+			 T34 = T32 - T33;
+			 T35 = T31 - T34;
+			 T4T = T31 + T34;
+			 T74 = T32 + T33;
+			 T77 = T75 + T76;
+			 T78 = T74 + T77;
+			 T7m = T77 - T74;
+		    }
+	       }
+	       {
+		    E T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
+		    {
+			 E T1v, T1x, T1u, T1w;
+			 T1v = ri[WS(rs, 1)];
+			 T1x = ii[WS(rs, 1)];
+			 T1u = W[0];
+			 T1w = W[1];
+			 T1y = FMA(T1u, T1v, T1w * T1x);
+			 T3G = FNMS(T1w, T1v, T1u * T1x);
+		    }
+		    {
+			 E T1L, T1N, T1K, T1M;
+			 T1L = ri[WS(rs, 25)];
+			 T1N = ii[WS(rs, 25)];
+			 T1K = W[48];
+			 T1M = W[49];
+			 T1O = FMA(T1K, T1L, T1M * T1N);
+			 T3Z = FNMS(T1M, T1L, T1K * T1N);
+		    }
+		    {
+			 E T1A, T1C, T1z, T1B;
+			 T1A = ri[WS(rs, 17)];
+			 T1C = ii[WS(rs, 17)];
+			 T1z = W[32];
+			 T1B = W[33];
+			 T1D = FMA(T1z, T1A, T1B * T1C);
+			 T3H = FNMS(T1B, T1A, T1z * T1C);
+		    }
+		    {
+			 E T1G, T1I, T1F, T1H;
+			 T1G = ri[WS(rs, 9)];
+			 T1I = ii[WS(rs, 9)];
+			 T1F = W[16];
+			 T1H = W[17];
+			 T1J = FMA(T1F, T1G, T1H * T1I);
+			 T3Y = FNMS(T1H, T1G, T1F * T1I);
+		    }
+		    {
+			 E T1E, T1P, T5W, T5X;
+			 T1E = T1y + T1D;
+			 T1P = T1J + T1O;
+			 T1Q = T1E + T1P;
+			 T61 = T1E - T1P;
+			 T5W = T3G + T3H;
+			 T5X = T3Y + T3Z;
+			 T5Y = T5W - T5X;
+			 T6J = T5W + T5X;
+		    }
+		    {
+			 E T3I, T3J, T3X, T40;
+			 T3I = T3G - T3H;
+			 T3J = T1J - T1O;
+			 T3K = T3I + T3J;
+			 T59 = T3I - T3J;
+			 T3X = T1y - T1D;
+			 T40 = T3Y - T3Z;
+			 T41 = T3X - T40;
+			 T56 = T3X + T40;
+		    }
+	       }
+	       {
+		    E T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = ri[WS(rs, 31)];
+			 T2i = ii[WS(rs, 31)];
+			 T2f = W[60];
+			 T2h = W[61];
+			 T2j = FMA(T2f, T2g, T2h * T2i);
+			 T4o = FNMS(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2w, T2y, T2v, T2x;
+			 T2w = ri[WS(rs, 23)];
+			 T2y = ii[WS(rs, 23)];
+			 T2v = W[44];
+			 T2x = W[45];
+			 T2z = FMA(T2v, T2w, T2x * T2y);
+			 T49 = FNMS(T2x, T2w, T2v * T2y);
+		    }
+		    {
+			 E T2l, T2n, T2k, T2m;
+			 T2l = ri[WS(rs, 15)];
+			 T2n = ii[WS(rs, 15)];
+			 T2k = W[28];
+			 T2m = W[29];
+			 T2o = FMA(T2k, T2l, T2m * T2n);
+			 T4p = FNMS(T2m, T2l, T2k * T2n);
+		    }
+		    {
+			 E T2r, T2t, T2q, T2s;
+			 T2r = ri[WS(rs, 7)];
+			 T2t = ii[WS(rs, 7)];
+			 T2q = W[12];
+			 T2s = W[13];
+			 T2u = FMA(T2q, T2r, T2s * T2t);
+			 T48 = FNMS(T2s, T2r, T2q * T2t);
+		    }
+		    {
+			 E T2p, T2A, T6c, T6d;
+			 T2p = T2j + T2o;
+			 T2A = T2u + T2z;
+			 T2B = T2p + T2A;
+			 T67 = T2p - T2A;
+			 T6c = T4o + T4p;
+			 T6d = T48 + T49;
+			 T6e = T6c - T6d;
+			 T6O = T6c + T6d;
+		    }
+		    {
+			 E T47, T4a, T4q, T4r;
+			 T47 = T2j - T2o;
+			 T4a = T48 - T49;
+			 T4b = T47 - T4a;
+			 T5d = T47 + T4a;
+			 T4q = T4o - T4p;
+			 T4r = T2u - T2z;
+			 T4s = T4q + T4r;
+			 T5g = T4q - T4r;
+		    }
+	       }
+	       {
+		    E To, T36, TE, T3d, Tt, T37, Tz, T3c;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = ri[WS(rs, 4)];
+			 Tn = ii[WS(rs, 4)];
+			 Tk = W[6];
+			 Tm = W[7];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T36 = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 12)];
+			 TD = ii[WS(rs, 12)];
+			 TA = W[22];
+			 TC = W[23];
+			 TE = FMA(TA, TB, TC * TD);
+			 T3d = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = ri[WS(rs, 20)];
+			 Ts = ii[WS(rs, 20)];
+			 Tp = W[38];
+			 Tr = W[39];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T37 = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 28)];
+			 Ty = ii[WS(rs, 28)];
+			 Tv = W[54];
+			 Tx = W[55];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T3c = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E Tu, TF, T5G, T5H;
+			 Tu = To + Tt;
+			 TF = Tz + TE;
+			 TG = Tu + TF;
+			 T7l = TF - Tu;
+			 T5G = T36 + T37;
+			 T5H = T3c + T3d;
+			 T5I = T5G - T5H;
+			 T73 = T5G + T5H;
+		    }
+		    {
+			 E T38, T39, T3b, T3e;
+			 T38 = T36 - T37;
+			 T39 = To - Tt;
+			 T3a = T38 - T39;
+			 T4U = T39 + T38;
+			 T3b = Tz - TE;
+			 T3e = T3c - T3d;
+			 T3f = T3b + T3e;
+			 T4V = T3b - T3e;
+		    }
+	       }
+	       {
+		    E TM, T3i, T12, T3p, TR, T3j, TX, T3o;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = ri[WS(rs, 2)];
+			 TL = ii[WS(rs, 2)];
+			 TI = W[2];
+			 TK = W[3];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T3i = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = ri[WS(rs, 26)];
+			 T11 = ii[WS(rs, 26)];
+			 TY = W[50];
+			 T10 = W[51];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T3p = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = ri[WS(rs, 18)];
+			 TQ = ii[WS(rs, 18)];
+			 TN = W[34];
+			 TP = W[35];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T3j = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = ri[WS(rs, 10)];
+			 TW = ii[WS(rs, 10)];
+			 TT = W[18];
+			 TV = W[19];
+			 TX = FMA(TT, TU, TV * TW);
+			 T3o = FNMS(TV, TU, TT * TW);
+		    }
+		    {
+			 E TS, T13, T5K, T5L;
+			 TS = TM + TR;
+			 T13 = TX + T12;
+			 T14 = TS + T13;
+			 T5N = TS - T13;
+			 T5K = T3i + T3j;
+			 T5L = T3o + T3p;
+			 T5M = T5K - T5L;
+			 T6E = T5K + T5L;
+		    }
+		    {
+			 E T3k, T3l, T3n, T3q;
+			 T3k = T3i - T3j;
+			 T3l = TX - T12;
+			 T3m = T3k + T3l;
+			 T4Y = T3k - T3l;
+			 T3n = TM - TR;
+			 T3q = T3o - T3p;
+			 T3r = T3n - T3q;
+			 T4Z = T3n + T3q;
+		    }
+	       }
+	       {
+		    E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = ri[WS(rs, 30)];
+			 T18 = ii[WS(rs, 30)];
+			 T15 = W[58];
+			 T17 = W[59];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T3t = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = ri[WS(rs, 22)];
+			 T1o = ii[WS(rs, 22)];
+			 T1l = W[42];
+			 T1n = W[43];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T3A = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = ri[WS(rs, 14)];
+			 T1d = ii[WS(rs, 14)];
+			 T1a = W[26];
+			 T1c = W[27];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T3u = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = ri[WS(rs, 6)];
+			 T1j = ii[WS(rs, 6)];
+			 T1g = W[10];
+			 T1i = W[11];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T3z = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    {
+			 E T1f, T1q, T5Q, T5R;
+			 T1f = T19 + T1e;
+			 T1q = T1k + T1p;
+			 T1r = T1f + T1q;
+			 T5P = T1f - T1q;
+			 T5Q = T3t + T3u;
+			 T5R = T3z + T3A;
+			 T5S = T5Q - T5R;
+			 T6F = T5Q + T5R;
+		    }
+		    {
+			 E T3v, T3w, T3y, T3B;
+			 T3v = T3t - T3u;
+			 T3w = T1k - T1p;
+			 T3x = T3v + T3w;
+			 T51 = T3v - T3w;
+			 T3y = T19 - T1e;
+			 T3B = T3z - T3A;
+			 T3C = T3y - T3B;
+			 T52 = T3y + T3B;
+		    }
+	       }
+	       {
+		    E T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
+		    {
+			 E T1S, T1U, T1R, T1T;
+			 T1S = ri[WS(rs, 5)];
+			 T1U = ii[WS(rs, 5)];
+			 T1R = W[8];
+			 T1T = W[9];
+			 T1V = FMA(T1R, T1S, T1T * T1U);
+			 T3R = FNMS(T1T, T1S, T1R * T1U);
+		    }
+		    {
+			 E T1X, T1Z, T1W, T1Y;
+			 T1X = ri[WS(rs, 21)];
+			 T1Z = ii[WS(rs, 21)];
+			 T1W = W[40];
+			 T1Y = W[41];
+			 T20 = FMA(T1W, T1X, T1Y * T1Z);
+			 T3S = FNMS(T1Y, T1X, T1W * T1Z);
+		    }
+		    T3Q = T1V - T20;
+		    T3T = T3R - T3S;
+		    {
+			 E T23, T25, T22, T24;
+			 T23 = ri[WS(rs, 29)];
+			 T25 = ii[WS(rs, 29)];
+			 T22 = W[56];
+			 T24 = W[57];
+			 T26 = FMA(T22, T23, T24 * T25);
+			 T3M = FNMS(T24, T23, T22 * T25);
+		    }
+		    {
+			 E T28, T2a, T27, T29;
+			 T28 = ri[WS(rs, 13)];
+			 T2a = ii[WS(rs, 13)];
+			 T27 = W[24];
+			 T29 = W[25];
+			 T2b = FMA(T27, T28, T29 * T2a);
+			 T3N = FNMS(T29, T28, T27 * T2a);
+		    }
+		    T3L = T26 - T2b;
+		    T3O = T3M - T3N;
+		    {
+			 E T21, T2c, T62, T63;
+			 T21 = T1V + T20;
+			 T2c = T26 + T2b;
+			 T2d = T21 + T2c;
+			 T5Z = T2c - T21;
+			 T62 = T3R + T3S;
+			 T63 = T3M + T3N;
+			 T64 = T62 - T63;
+			 T6K = T62 + T63;
+		    }
+		    {
+			 E T3P, T3U, T42, T43;
+			 T3P = T3L - T3O;
+			 T3U = T3Q + T3T;
+			 T3V = KP707106781 * (T3P - T3U);
+			 T57 = KP707106781 * (T3U + T3P);
+			 T42 = T3T - T3Q;
+			 T43 = T3L + T3O;
+			 T44 = KP707106781 * (T42 - T43);
+			 T5a = KP707106781 * (T42 + T43);
+		    }
+	       }
+	       {
+		    E T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
+		    {
+			 E T2D, T2F, T2C, T2E;
+			 T2D = ri[WS(rs, 3)];
+			 T2F = ii[WS(rs, 3)];
+			 T2C = W[4];
+			 T2E = W[5];
+			 T2G = FMA(T2C, T2D, T2E * T2F);
+			 T4c = FNMS(T2E, T2D, T2C * T2F);
+		    }
+		    {
+			 E T2I, T2K, T2H, T2J;
+			 T2I = ri[WS(rs, 19)];
+			 T2K = ii[WS(rs, 19)];
+			 T2H = W[36];
+			 T2J = W[37];
+			 T2L = FMA(T2H, T2I, T2J * T2K);
+			 T4d = FNMS(T2J, T2I, T2H * T2K);
+		    }
+		    T4e = T4c - T4d;
+		    T4f = T2G - T2L;
+		    {
+			 E T2O, T2Q, T2N, T2P;
+			 T2O = ri[WS(rs, 27)];
+			 T2Q = ii[WS(rs, 27)];
+			 T2N = W[52];
+			 T2P = W[53];
+			 T2R = FMA(T2N, T2O, T2P * T2Q);
+			 T4i = FNMS(T2P, T2O, T2N * T2Q);
+		    }
+		    {
+			 E T2T, T2V, T2S, T2U;
+			 T2T = ri[WS(rs, 11)];
+			 T2V = ii[WS(rs, 11)];
+			 T2S = W[20];
+			 T2U = W[21];
+			 T2W = FMA(T2S, T2T, T2U * T2V);
+			 T4j = FNMS(T2U, T2T, T2S * T2V);
+		    }
+		    T4h = T2R - T2W;
+		    T4k = T4i - T4j;
+		    {
+			 E T2M, T2X, T68, T69;
+			 T2M = T2G + T2L;
+			 T2X = T2R + T2W;
+			 T2Y = T2M + T2X;
+			 T6f = T2X - T2M;
+			 T68 = T4c + T4d;
+			 T69 = T4i + T4j;
+			 T6a = T68 - T69;
+			 T6P = T68 + T69;
+		    }
+		    {
+			 E T4g, T4l, T4t, T4u;
+			 T4g = T4e - T4f;
+			 T4l = T4h + T4k;
+			 T4m = KP707106781 * (T4g - T4l);
+			 T5h = KP707106781 * (T4g + T4l);
+			 T4t = T4h - T4k;
+			 T4u = T4f + T4e;
+			 T4v = KP707106781 * (T4t - T4u);
+			 T5e = KP707106781 * (T4u + T4t);
+		    }
+	       }
+	       {
+		    E T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
+		    {
+			 E TH, T1s, T72, T79;
+			 TH = Tj + TG;
+			 T1s = T14 + T1r;
+			 T1t = TH + T1s;
+			 T6X = TH - T1s;
+			 T72 = T6E + T6F;
+			 T79 = T73 + T78;
+			 T7a = T72 + T79;
+			 T7c = T79 - T72;
+		    }
+		    {
+			 E T2e, T2Z, T6Y, T6Z;
+			 T2e = T1Q + T2d;
+			 T2Z = T2B + T2Y;
+			 T30 = T2e + T2Z;
+			 T7b = T2Z - T2e;
+			 T6Y = T6J + T6K;
+			 T6Z = T6O + T6P;
+			 T70 = T6Y - T6Z;
+			 T71 = T6Y + T6Z;
+		    }
+		    ri[WS(rs, 16)] = T1t - T30;
+		    ii[WS(rs, 16)] = T7a - T71;
+		    ri[0] = T1t + T30;
+		    ii[0] = T71 + T7a;
+		    ri[WS(rs, 24)] = T6X - T70;
+		    ii[WS(rs, 24)] = T7c - T7b;
+		    ri[WS(rs, 8)] = T6X + T70;
+		    ii[WS(rs, 8)] = T7b + T7c;
+	       }
+	       {
+		    E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
+		    {
+			 E T6D, T6G, T7e, T7f;
+			 T6D = Tj - TG;
+			 T6G = T6E - T6F;
+			 T6H = T6D + T6G;
+			 T6T = T6D - T6G;
+			 T7e = T1r - T14;
+			 T7f = T78 - T73;
+			 T7g = T7e + T7f;
+			 T7i = T7f - T7e;
+		    }
+		    {
+			 E T6I, T6L, T6N, T6Q;
+			 T6I = T1Q - T2d;
+			 T6L = T6J - T6K;
+			 T6M = T6I + T6L;
+			 T6U = T6L - T6I;
+			 T6N = T2B - T2Y;
+			 T6Q = T6O - T6P;
+			 T6R = T6N - T6Q;
+			 T6V = T6N + T6Q;
+		    }
+		    {
+			 E T6S, T7d, T6W, T7h;
+			 T6S = KP707106781 * (T6M + T6R);
+			 ri[WS(rs, 20)] = T6H - T6S;
+			 ri[WS(rs, 4)] = T6H + T6S;
+			 T7d = KP707106781 * (T6U + T6V);
+			 ii[WS(rs, 4)] = T7d + T7g;
+			 ii[WS(rs, 20)] = T7g - T7d;
+			 T6W = KP707106781 * (T6U - T6V);
+			 ri[WS(rs, 28)] = T6T - T6W;
+			 ri[WS(rs, 12)] = T6T + T6W;
+			 T7h = KP707106781 * (T6R - T6M);
+			 ii[WS(rs, 12)] = T7h + T7i;
+			 ii[WS(rs, 28)] = T7i - T7h;
+		    }
+	       }
+	       {
+		    E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
+		    E T6l;
+		    {
+			 E T5O, T5T, T60, T65;
+			 T5J = T5F - T5I;
+			 T7n = T7l + T7m;
+			 T7t = T7m - T7l;
+			 T6n = T5F + T5I;
+			 T5O = T5M - T5N;
+			 T5T = T5P + T5S;
+			 T5U = KP707106781 * (T5O - T5T);
+			 T7k = KP707106781 * (T5O + T5T);
+			 {
+			      E T6v, T6w, T6o, T6p;
+			      T6v = T67 + T6a;
+			      T6w = T6e + T6f;
+			      T6x = FNMS(KP382683432, T6w, KP923879532 * T6v);
+			      T6B = FMA(KP923879532, T6w, KP382683432 * T6v);
+			      T6o = T5N + T5M;
+			      T6p = T5P - T5S;
+			      T6q = KP707106781 * (T6o + T6p);
+			      T7s = KP707106781 * (T6p - T6o);
+			 }
+			 T60 = T5Y - T5Z;
+			 T65 = T61 - T64;
+			 T66 = FMA(KP923879532, T60, KP382683432 * T65);
+			 T6k = FNMS(KP923879532, T65, KP382683432 * T60);
+			 {
+			      E T6s, T6t, T6b, T6g;
+			      T6s = T5Y + T5Z;
+			      T6t = T61 + T64;
+			      T6u = FMA(KP382683432, T6s, KP923879532 * T6t);
+			      T6A = FNMS(KP382683432, T6t, KP923879532 * T6s);
+			      T6b = T67 - T6a;
+			      T6g = T6e - T6f;
+			      T6h = FNMS(KP923879532, T6g, KP382683432 * T6b);
+			      T6l = FMA(KP382683432, T6g, KP923879532 * T6b);
+			 }
+		    }
+		    {
+			 E T5V, T6i, T7r, T7u;
+			 T5V = T5J + T5U;
+			 T6i = T66 + T6h;
+			 ri[WS(rs, 22)] = T5V - T6i;
+			 ri[WS(rs, 6)] = T5V + T6i;
+			 T7r = T6k + T6l;
+			 T7u = T7s + T7t;
+			 ii[WS(rs, 6)] = T7r + T7u;
+			 ii[WS(rs, 22)] = T7u - T7r;
+		    }
+		    {
+			 E T6j, T6m, T7v, T7w;
+			 T6j = T5J - T5U;
+			 T6m = T6k - T6l;
+			 ri[WS(rs, 30)] = T6j - T6m;
+			 ri[WS(rs, 14)] = T6j + T6m;
+			 T7v = T6h - T66;
+			 T7w = T7t - T7s;
+			 ii[WS(rs, 14)] = T7v + T7w;
+			 ii[WS(rs, 30)] = T7w - T7v;
+		    }
+		    {
+			 E T6r, T6y, T7j, T7o;
+			 T6r = T6n + T6q;
+			 T6y = T6u + T6x;
+			 ri[WS(rs, 18)] = T6r - T6y;
+			 ri[WS(rs, 2)] = T6r + T6y;
+			 T7j = T6A + T6B;
+			 T7o = T7k + T7n;
+			 ii[WS(rs, 2)] = T7j + T7o;
+			 ii[WS(rs, 18)] = T7o - T7j;
+		    }
+		    {
+			 E T6z, T6C, T7p, T7q;
+			 T6z = T6n - T6q;
+			 T6C = T6A - T6B;
+			 ri[WS(rs, 26)] = T6z - T6C;
+			 ri[WS(rs, 10)] = T6z + T6C;
+			 T7p = T6x - T6u;
+			 T7q = T7n - T7k;
+			 ii[WS(rs, 10)] = T7p + T7q;
+			 ii[WS(rs, 26)] = T7q - T7p;
+		    }
+	       }
+	       {
+		    E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
+		    E T4B, T3g, T7P;
+		    T3g = KP707106781 * (T3a - T3f);
+		    T3h = T35 - T3g;
+		    T4D = T35 + T3g;
+		    T7P = KP707106781 * (T4V - T4U);
+		    T7R = T7P + T7Q;
+		    T7X = T7Q - T7P;
+		    {
+			 E T3s, T3D, T4L, T4M;
+			 T3s = FNMS(KP923879532, T3r, KP382683432 * T3m);
+			 T3D = FMA(KP382683432, T3x, KP923879532 * T3C);
+			 T3E = T3s - T3D;
+			 T7O = T3s + T3D;
+			 T4L = T4b + T4m;
+			 T4M = T4s + T4v;
+			 T4N = FNMS(KP555570233, T4M, KP831469612 * T4L);
+			 T4R = FMA(KP831469612, T4M, KP555570233 * T4L);
+		    }
+		    {
+			 E T3W, T45, T4E, T4F;
+			 T3W = T3K - T3V;
+			 T45 = T41 - T44;
+			 T46 = FMA(KP980785280, T3W, KP195090322 * T45);
+			 T4A = FNMS(KP980785280, T45, KP195090322 * T3W);
+			 T4E = FMA(KP923879532, T3m, KP382683432 * T3r);
+			 T4F = FNMS(KP923879532, T3x, KP382683432 * T3C);
+			 T4G = T4E + T4F;
+			 T7W = T4F - T4E;
+		    }
+		    {
+			 E T4I, T4J, T4n, T4w;
+			 T4I = T3K + T3V;
+			 T4J = T41 + T44;
+			 T4K = FMA(KP555570233, T4I, KP831469612 * T4J);
+			 T4Q = FNMS(KP555570233, T4J, KP831469612 * T4I);
+			 T4n = T4b - T4m;
+			 T4w = T4s - T4v;
+			 T4x = FNMS(KP980785280, T4w, KP195090322 * T4n);
+			 T4B = FMA(KP195090322, T4w, KP980785280 * T4n);
+		    }
+		    {
+			 E T3F, T4y, T7V, T7Y;
+			 T3F = T3h + T3E;
+			 T4y = T46 + T4x;
+			 ri[WS(rs, 23)] = T3F - T4y;
+			 ri[WS(rs, 7)] = T3F + T4y;
+			 T7V = T4A + T4B;
+			 T7Y = T7W + T7X;
+			 ii[WS(rs, 7)] = T7V + T7Y;
+			 ii[WS(rs, 23)] = T7Y - T7V;
+		    }
+		    {
+			 E T4z, T4C, T7Z, T80;
+			 T4z = T3h - T3E;
+			 T4C = T4A - T4B;
+			 ri[WS(rs, 31)] = T4z - T4C;
+			 ri[WS(rs, 15)] = T4z + T4C;
+			 T7Z = T4x - T46;
+			 T80 = T7X - T7W;
+			 ii[WS(rs, 15)] = T7Z + T80;
+			 ii[WS(rs, 31)] = T80 - T7Z;
+		    }
+		    {
+			 E T4H, T4O, T7N, T7S;
+			 T4H = T4D + T4G;
+			 T4O = T4K + T4N;
+			 ri[WS(rs, 19)] = T4H - T4O;
+			 ri[WS(rs, 3)] = T4H + T4O;
+			 T7N = T4Q + T4R;
+			 T7S = T7O + T7R;
+			 ii[WS(rs, 3)] = T7N + T7S;
+			 ii[WS(rs, 19)] = T7S - T7N;
+		    }
+		    {
+			 E T4P, T4S, T7T, T7U;
+			 T4P = T4D - T4G;
+			 T4S = T4Q - T4R;
+			 ri[WS(rs, 27)] = T4P - T4S;
+			 ri[WS(rs, 11)] = T4P + T4S;
+			 T7T = T4N - T4K;
+			 T7U = T7R - T7O;
+			 ii[WS(rs, 11)] = T7T + T7U;
+			 ii[WS(rs, 27)] = T7U - T7T;
+		    }
+	       }
+	       {
+		    E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
+		    E T5n, T4W, T7z;
+		    T4W = KP707106781 * (T4U + T4V);
+		    T4X = T4T - T4W;
+		    T5p = T4T + T4W;
+		    T7z = KP707106781 * (T3a + T3f);
+		    T7D = T7z + T7C;
+		    T7J = T7C - T7z;
+		    {
+			 E T50, T53, T5x, T5y;
+			 T50 = FNMS(KP382683432, T4Z, KP923879532 * T4Y);
+			 T53 = FMA(KP923879532, T51, KP382683432 * T52);
+			 T54 = T50 - T53;
+			 T7y = T50 + T53;
+			 T5x = T5d + T5e;
+			 T5y = T5g + T5h;
+			 T5z = FNMS(KP195090322, T5y, KP980785280 * T5x);
+			 T5D = FMA(KP195090322, T5x, KP980785280 * T5y);
+		    }
+		    {
+			 E T58, T5b, T5q, T5r;
+			 T58 = T56 - T57;
+			 T5b = T59 - T5a;
+			 T5c = FMA(KP555570233, T58, KP831469612 * T5b);
+			 T5m = FNMS(KP831469612, T58, KP555570233 * T5b);
+			 T5q = FMA(KP382683432, T4Y, KP923879532 * T4Z);
+			 T5r = FNMS(KP382683432, T51, KP923879532 * T52);
+			 T5s = T5q + T5r;
+			 T7I = T5r - T5q;
+		    }
+		    {
+			 E T5u, T5v, T5f, T5i;
+			 T5u = T56 + T57;
+			 T5v = T59 + T5a;
+			 T5w = FMA(KP980785280, T5u, KP195090322 * T5v);
+			 T5C = FNMS(KP195090322, T5u, KP980785280 * T5v);
+			 T5f = T5d - T5e;
+			 T5i = T5g - T5h;
+			 T5j = FNMS(KP831469612, T5i, KP555570233 * T5f);
+			 T5n = FMA(KP831469612, T5f, KP555570233 * T5i);
+		    }
+		    {
+			 E T55, T5k, T7H, T7K;
+			 T55 = T4X + T54;
+			 T5k = T5c + T5j;
+			 ri[WS(rs, 21)] = T55 - T5k;
+			 ri[WS(rs, 5)] = T55 + T5k;
+			 T7H = T5m + T5n;
+			 T7K = T7I + T7J;
+			 ii[WS(rs, 5)] = T7H + T7K;
+			 ii[WS(rs, 21)] = T7K - T7H;
+		    }
+		    {
+			 E T5l, T5o, T7L, T7M;
+			 T5l = T4X - T54;
+			 T5o = T5m - T5n;
+			 ri[WS(rs, 29)] = T5l - T5o;
+			 ri[WS(rs, 13)] = T5l + T5o;
+			 T7L = T5j - T5c;
+			 T7M = T7J - T7I;
+			 ii[WS(rs, 13)] = T7L + T7M;
+			 ii[WS(rs, 29)] = T7M - T7L;
+		    }
+		    {
+			 E T5t, T5A, T7x, T7E;
+			 T5t = T5p + T5s;
+			 T5A = T5w + T5z;
+			 ri[WS(rs, 17)] = T5t - T5A;
+			 ri[WS(rs, 1)] = T5t + T5A;
+			 T7x = T5C + T5D;
+			 T7E = T7y + T7D;
+			 ii[WS(rs, 1)] = T7x + T7E;
+			 ii[WS(rs, 17)] = T7E - T7x;
+		    }
+		    {
+			 E T5B, T5E, T7F, T7G;
+			 T5B = T5p - T5s;
+			 T5E = T5C - T5D;
+			 ri[WS(rs, 25)] = T5B - T5E;
+			 ri[WS(rs, 9)] = T5B + T5E;
+			 T7F = T5z - T5w;
+			 T7G = T7D - T7y;
+			 ii[WS(rs, 9)] = T7F + T7G;
+			 ii[WS(rs, 25)] = T7G - T7F;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
+
+void X(codelet_t1_32) (planner *p) {
+     X(kdft_dit_register) (p, t1_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include t.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 31 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "t.h"
+
+static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E To, Te, Tm, T8, Tw, Tx, Tq, Tk;
+	       {
+		    E T1, Tv, Tu, T7, Tg, Tj, Tf, Ti, Tp, Th;
+		    T1 = ri[0];
+		    Tv = ii[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = ri[WS(rs, 2)];
+			 T6 = ii[WS(rs, 2)];
+			 T2 = W[2];
+			 T5 = W[3];
+			 {
+			      E Ta, Td, Tc, Tn, Tb, Tt, T4, T9;
+			      Ta = ri[WS(rs, 1)];
+			      Td = ii[WS(rs, 1)];
+			      Tt = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[0];
+			      Tc = W[1];
+			      Tu = FNMS(T5, T3, Tt);
+			      T7 = FMA(T5, T6, T4);
+			      Tn = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tg = ri[WS(rs, 3)];
+			      Tj = ii[WS(rs, 3)];
+			      To = FNMS(Tc, Ta, Tn);
+			      Te = FMA(Tc, Td, Tb);
+			      Tf = W[4];
+			      Ti = W[5];
+			 }
+		    }
+		    Tm = T1 - T7;
+		    T8 = T1 + T7;
+		    Tw = Tu + Tv;
+		    Tx = Tv - Tu;
+		    Tp = Tf * Tj;
+		    Th = Tf * Tg;
+		    Tq = FNMS(Ti, Tg, Tp);
+		    Tk = FMA(Ti, Tj, Th);
+	       }
+	       {
+		    E Ts, Tr, Tl, Ty;
+		    Ts = To + Tq;
+		    Tr = To - Tq;
+		    Tl = Te + Tk;
+		    Ty = Te - Tk;
+		    ri[WS(rs, 1)] = Tm + Tr;
+		    ri[WS(rs, 3)] = Tm - Tr;
+		    ii[WS(rs, 2)] = Tw - Ts;
+		    ii[0] = Ts + Tw;
+		    ii[WS(rs, 3)] = Ty + Tx;
+		    ii[WS(rs, 1)] = Tx - Ty;
+		    ri[0] = T8 + Tl;
+		    ri[WS(rs, 2)] = T8 - Tl;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
+
+void X(codelet_t1_4) (planner *p) {
+     X(kdft_dit_register) (p, t1_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include t.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "t.h"
+
+static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T1, Tp, T6, To, Tc, Tk, Th, Tl;
+	       T1 = ri[0];
+	       Tp = ii[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = ri[WS(rs, 2)];
+		    T5 = ii[WS(rs, 2)];
+		    T2 = W[2];
+		    T4 = W[3];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    To = FNMS(T4, T3, T2 * T5);
+	       }
+	       {
+		    E T9, Tb, T8, Ta;
+		    T9 = ri[WS(rs, 1)];
+		    Tb = ii[WS(rs, 1)];
+		    T8 = W[0];
+		    Ta = W[1];
+		    Tc = FMA(T8, T9, Ta * Tb);
+		    Tk = FNMS(Ta, T9, T8 * Tb);
+	       }
+	       {
+		    E Te, Tg, Td, Tf;
+		    Te = ri[WS(rs, 3)];
+		    Tg = ii[WS(rs, 3)];
+		    Td = W[4];
+		    Tf = W[5];
+		    Th = FMA(Td, Te, Tf * Tg);
+		    Tl = FNMS(Tf, Te, Td * Tg);
+	       }
+	       {
+		    E T7, Ti, Tn, Tq;
+		    T7 = T1 + T6;
+		    Ti = Tc + Th;
+		    ri[WS(rs, 2)] = T7 - Ti;
+		    ri[0] = T7 + Ti;
+		    Tn = Tk + Tl;
+		    Tq = To + Tp;
+		    ii[0] = Tn + Tq;
+		    ii[WS(rs, 2)] = Tq - Tn;
+	       }
+	       {
+		    E Tj, Tm, Tr, Ts;
+		    Tj = T1 - T6;
+		    Tm = Tk - Tl;
+		    ri[WS(rs, 3)] = Tj - Tm;
+		    ri[WS(rs, 1)] = Tj + Tm;
+		    Tr = Tp - To;
+		    Ts = Tc - Th;
+		    ii[WS(rs, 1)] = Tr - Ts;
+		    ii[WS(rs, 3)] = Ts + Tr;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
+
+void X(codelet_t1_4) (planner *p) {
+     X(kdft_dit_register) (p, t1_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include t.h */
+
+/*
+ * This function contains 40 FP additions, 34 FP multiplications,
+ * (or, 14 additions, 8 multiplications, 26 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t.h"
+
+static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T1, TM, TJ, TA, TQ, Te, TC, Tk, TE, Tq;
+	       {
+		    E Tg, Tj, Tm, TB, Th, Tp, Tl, Ti, To, TD, Tn;
+		    T1 = ri[0];
+		    TM = ii[0];
+		    {
+			 E T9, Tc, Ty, Ta, Tb, Tx, T7, Tf, Tz, Td;
+			 {
+			      E T3, T6, T8, Tw, T4, T2, T5;
+			      T3 = ri[WS(rs, 1)];
+			      T6 = ii[WS(rs, 1)];
+			      T2 = W[0];
+			      T9 = ri[WS(rs, 4)];
+			      Tc = ii[WS(rs, 4)];
+			      T8 = W[6];
+			      Tw = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[1];
+			      Ty = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[7];
+			      Tx = FNMS(T5, T3, Tw);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 Tg = ri[WS(rs, 2)];
+			 Tz = FNMS(Tb, T9, Ty);
+			 Td = FMA(Tb, Tc, Ta);
+			 Tj = ii[WS(rs, 2)];
+			 Tf = W[2];
+			 TJ = Tx + Tz;
+			 TA = Tx - Tz;
+			 TQ = T7 - Td;
+			 Te = T7 + Td;
+			 Tm = ri[WS(rs, 3)];
+			 TB = Tf * Tj;
+			 Th = Tf * Tg;
+			 Tp = ii[WS(rs, 3)];
+			 Tl = W[4];
+			 Ti = W[3];
+			 To = W[5];
+		    }
+		    TD = Tl * Tp;
+		    Tn = Tl * Tm;
+		    TC = FNMS(Ti, Tg, TB);
+		    Tk = FMA(Ti, Tj, Th);
+		    TE = FNMS(To, Tm, TD);
+		    Tq = FMA(To, Tp, Tn);
+	       }
+	       {
+		    E TG, TI, TO, TS, TU, Tu, TN, Tt, TK, TF;
+		    TK = TC + TE;
+		    TF = TC - TE;
+		    {
+			 E Tr, TR, TL, Ts;
+			 Tr = Tk + Tq;
+			 TR = Tk - Tq;
+			 TG = FMA(KP618033988, TF, TA);
+			 TI = FNMS(KP618033988, TA, TF);
+			 TO = TJ - TK;
+			 TL = TJ + TK;
+			 TS = FMA(KP618033988, TR, TQ);
+			 TU = FNMS(KP618033988, TQ, TR);
+			 Tu = Te - Tr;
+			 Ts = Te + Tr;
+			 ii[0] = TL + TM;
+			 TN = FNMS(KP250000000, TL, TM);
+			 ri[0] = T1 + Ts;
+			 Tt = FNMS(KP250000000, Ts, T1);
+		    }
+		    {
+			 E TT, TP, TH, Tv;
+			 TT = FNMS(KP559016994, TO, TN);
+			 TP = FMA(KP559016994, TO, TN);
+			 TH = FNMS(KP559016994, Tu, Tt);
+			 Tv = FMA(KP559016994, Tu, Tt);
+			 ii[WS(rs, 4)] = FMA(KP951056516, TS, TP);
+			 ii[WS(rs, 1)] = FNMS(KP951056516, TS, TP);
+			 ii[WS(rs, 3)] = FNMS(KP951056516, TU, TT);
+			 ii[WS(rs, 2)] = FMA(KP951056516, TU, TT);
+			 ri[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
+			 ri[WS(rs, 4)] = FNMS(KP951056516, TG, Tv);
+			 ri[WS(rs, 3)] = FMA(KP951056516, TI, TH);
+			 ri[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, {14, 8, 26, 0}, 0, 0, 0 };
+
+void X(codelet_t1_5) (planner *p) {
+     X(kdft_dit_register) (p, t1_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include t.h */
+
+/*
+ * This function contains 40 FP additions, 28 FP multiplications,
+ * (or, 26 additions, 14 multiplications, 14 fused multiply/add),
+ * 29 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t.h"
+
+static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T1, TE, Tu, Tx, TJ, TI, TB, TC, TD, Tc, Tn, To;
+	       T1 = ri[0];
+	       TE = ii[0];
+	       {
+		    E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 1)];
+			 T5 = ii[WS(rs, 1)];
+			 T2 = W[0];
+			 T4 = W[1];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 Ts = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = ri[WS(rs, 3)];
+			 Tl = ii[WS(rs, 3)];
+			 Ti = W[4];
+			 Tk = W[5];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 Tw = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 4)];
+			 Ta = ii[WS(rs, 4)];
+			 T7 = W[6];
+			 T9 = W[7];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 Tt = FNMS(T9, T8, T7 * Ta);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 2)];
+			 Tg = ii[WS(rs, 2)];
+			 Td = W[2];
+			 Tf = W[3];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 Tv = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Tu = Ts - Tt;
+		    Tx = Tv - Tw;
+		    TJ = Th - Tm;
+		    TI = T6 - Tb;
+		    TB = Ts + Tt;
+		    TC = Tv + Tw;
+		    TD = TB + TC;
+		    Tc = T6 + Tb;
+		    Tn = Th + Tm;
+		    To = Tc + Tn;
+	       }
+	       ri[0] = T1 + To;
+	       ii[0] = TD + TE;
+	       {
+		    E Ty, TA, Tr, Tz, Tp, Tq;
+		    Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
+		    TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
+		    Tp = KP559016994 * (Tc - Tn);
+		    Tq = FNMS(KP250000000, To, T1);
+		    Tr = Tp + Tq;
+		    Tz = Tq - Tp;
+		    ri[WS(rs, 4)] = Tr - Ty;
+		    ri[WS(rs, 3)] = Tz + TA;
+		    ri[WS(rs, 1)] = Tr + Ty;
+		    ri[WS(rs, 2)] = Tz - TA;
+	       }
+	       {
+		    E TK, TL, TH, TM, TF, TG;
+		    TK = FMA(KP951056516, TI, KP587785252 * TJ);
+		    TL = FNMS(KP587785252, TI, KP951056516 * TJ);
+		    TF = KP559016994 * (TB - TC);
+		    TG = FNMS(KP250000000, TD, TE);
+		    TH = TF + TG;
+		    TM = TG - TF;
+		    ii[WS(rs, 1)] = TH - TK;
+		    ii[WS(rs, 3)] = TM - TL;
+		    ii[WS(rs, 4)] = TK + TH;
+		    ii[WS(rs, 2)] = TL + TM;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, {26, 14, 14, 0}, 0, 0, 0 };
+
+void X(codelet_t1_5) (planner *p) {
+     X(kdft_dit_register) (p, t1_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:48 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include t.h */
+
+/*
+ * This function contains 46 FP additions, 32 FP multiplications,
+ * (or, 24 additions, 10 multiplications, 22 fused multiply/add),
+ * 47 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t.h"
+
+static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E TY, TU, T10, TZ;
+	       {
+		    E T1, TX, TW, T7, Tn, Tq, TJ, TR, TB, Tl, To, TK, Tt, Tw, Ts;
+		    E Tp, Tv;
+		    T1 = ri[0];
+		    TX = ii[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = ri[WS(rs, 3)];
+			 T6 = ii[WS(rs, 3)];
+			 T2 = W[4];
+			 T5 = W[5];
+			 {
+			      E Ta, Td, Tg, TF, Tb, Tj, Tf, Tc, Ti, TV, T4, T9;
+			      Ta = ri[WS(rs, 2)];
+			      Td = ii[WS(rs, 2)];
+			      TV = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[2];
+			      Tg = ri[WS(rs, 5)];
+			      TW = FNMS(T5, T3, TV);
+			      T7 = FMA(T5, T6, T4);
+			      TF = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tj = ii[WS(rs, 5)];
+			      Tf = W[8];
+			      Tc = W[3];
+			      Ti = W[9];
+			      {
+				   E TG, Te, TI, Tk, TH, Th, Tm;
+				   Tn = ri[WS(rs, 4)];
+				   TH = Tf * Tj;
+				   Th = Tf * Tg;
+				   TG = FNMS(Tc, Ta, TF);
+				   Te = FMA(Tc, Td, Tb);
+				   TI = FNMS(Ti, Tg, TH);
+				   Tk = FMA(Ti, Tj, Th);
+				   Tq = ii[WS(rs, 4)];
+				   Tm = W[6];
+				   TJ = TG - TI;
+				   TR = TG + TI;
+				   TB = Te + Tk;
+				   Tl = Te - Tk;
+				   To = Tm * Tn;
+				   TK = Tm * Tq;
+			      }
+			      Tt = ri[WS(rs, 1)];
+			      Tw = ii[WS(rs, 1)];
+			      Ts = W[0];
+			      Tp = W[7];
+			      Tv = W[1];
+			 }
+		    }
+		    {
+			 E TA, T8, TL, Tr, TN, Tx, T11, TM, Tu;
+			 TA = T1 + T7;
+			 T8 = T1 - T7;
+			 TM = Ts * Tw;
+			 Tu = Ts * Tt;
+			 TL = FNMS(Tp, Tn, TK);
+			 Tr = FMA(Tp, Tq, To);
+			 TN = FNMS(Tv, Tt, TM);
+			 Tx = FMA(Tv, Tw, Tu);
+			 T11 = TX - TW;
+			 TY = TW + TX;
+			 {
+			      E TP, TT, TD, TE, TQ, Tz, T14, T13;
+			      {
+				   E TO, TS, TC, Ty, T12;
+				   TO = TL - TN;
+				   TS = TL + TN;
+				   TC = Tr + Tx;
+				   Ty = Tr - Tx;
+				   T12 = TJ + TO;
+				   TP = TJ - TO;
+				   TT = TR - TS;
+				   TU = TR + TS;
+				   Tz = Tl + Ty;
+				   T14 = Ty - Tl;
+				   ii[WS(rs, 3)] = T12 + T11;
+				   T13 = FNMS(KP500000000, T12, T11);
+				   T10 = TC - TB;
+				   TD = TB + TC;
+			      }
+			      ri[WS(rs, 3)] = T8 + Tz;
+			      TE = FNMS(KP500000000, Tz, T8);
+			      ii[WS(rs, 5)] = FNMS(KP866025403, T14, T13);
+			      ii[WS(rs, 1)] = FMA(KP866025403, T14, T13);
+			      TQ = FNMS(KP500000000, TD, TA);
+			      ri[WS(rs, 5)] = FNMS(KP866025403, TP, TE);
+			      ri[WS(rs, 1)] = FMA(KP866025403, TP, TE);
+			      ri[0] = TA + TD;
+			      ri[WS(rs, 4)] = FMA(KP866025403, TT, TQ);
+			      ri[WS(rs, 2)] = FNMS(KP866025403, TT, TQ);
+			 }
+		    }
+	       }
+	       ii[0] = TU + TY;
+	       TZ = FNMS(KP500000000, TU, TY);
+	       ii[WS(rs, 2)] = FNMS(KP866025403, T10, TZ);
+	       ii[WS(rs, 4)] = FMA(KP866025403, T10, TZ);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, {24, 10, 22, 0}, 0, 0, 0 };
+
+void X(codelet_t1_6) (planner *p) {
+     X(kdft_dit_register) (p, t1_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include t.h */
+
+/*
+ * This function contains 46 FP additions, 28 FP multiplications,
+ * (or, 32 additions, 14 multiplications, 14 fused multiply/add),
+ * 23 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t.h"
+
+static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
+	       {
+		    E T1, TN, T6, TM;
+		    T1 = ri[0];
+		    TN = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 3)];
+			 T5 = ii[WS(rs, 3)];
+			 T2 = W[4];
+			 T4 = W[5];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TM = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    TS = TN - TM;
+		    Tv = T1 + T6;
+		    TO = TM + TN;
+	       }
+	       {
+		    E Tn, TD, Ts, TE;
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = ri[WS(rs, 4)];
+			 Tm = ii[WS(rs, 4)];
+			 Tj = W[6];
+			 Tl = W[7];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 TD = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 1)];
+			 Tr = ii[WS(rs, 1)];
+			 To = W[0];
+			 Tq = W[1];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TE = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn - Ts;
+		    TJ = TD + TE;
+		    Tx = Tn + Ts;
+		    TF = TD - TE;
+	       }
+	       {
+		    E Tc, TA, Th, TB;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 2)];
+			 Tb = ii[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TA = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 5)];
+			 Tg = ii[WS(rs, 5)];
+			 Td = W[8];
+			 Tf = W[9];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TB = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc - Th;
+		    TI = TA + TB;
+		    Tw = Tc + Th;
+		    TC = TA - TB;
+	       }
+	       {
+		    E TG, Tu, Tz, TR, TT, TU;
+		    TG = KP866025403 * (TC - TF);
+		    Tu = Ti + Tt;
+		    Tz = FNMS(KP500000000, Tu, T7);
+		    ri[WS(rs, 3)] = T7 + Tu;
+		    ri[WS(rs, 1)] = Tz + TG;
+		    ri[WS(rs, 5)] = Tz - TG;
+		    TR = KP866025403 * (Tt - Ti);
+		    TT = TC + TF;
+		    TU = FNMS(KP500000000, TT, TS);
+		    ii[WS(rs, 1)] = TR + TU;
+		    ii[WS(rs, 3)] = TT + TS;
+		    ii[WS(rs, 5)] = TU - TR;
+	       }
+	       {
+		    E TK, Ty, TH, TQ, TL, TP;
+		    TK = KP866025403 * (TI - TJ);
+		    Ty = Tw + Tx;
+		    TH = FNMS(KP500000000, Ty, Tv);
+		    ri[0] = Tv + Ty;
+		    ri[WS(rs, 4)] = TH + TK;
+		    ri[WS(rs, 2)] = TH - TK;
+		    TQ = KP866025403 * (Tx - Tw);
+		    TL = TI + TJ;
+		    TP = FNMS(KP500000000, TL, TO);
+		    ii[0] = TL + TO;
+		    ii[WS(rs, 4)] = TQ + TP;
+		    ii[WS(rs, 2)] = TP - TQ;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, {32, 14, 14, 0}, 0, 0, 0 };
+
+void X(codelet_t1_6) (planner *p) {
+     X(kdft_dit_register) (p, t1_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3975 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -name t1_64 -include t.h */
+
+/*
+ * This function contains 1038 FP additions, 644 FP multiplications,
+ * (or, 520 additions, 126 multiplications, 518 fused multiply/add),
+ * 228 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "t.h"
+
+static void t1_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 126); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E TeI, Tkk, Tkj, TeL;
+	       {
+		    E TiV, Tjm, T7e, TcA, TjR, Tkl, Tm, TeM, TeZ, Ths, T7Q, TcJ, T1G, TeW, TcI;
+		    E T7X, Tf5, Thv, T87, TcN, T29, Tf8, TcQ, T8u, TfU, ThS, Taq, Tdm, T5K, Tg9;
+		    E Tdx, Tbj, TcB, T7l, TiP, TeP, Tjl, TN, TcC, T7s, T7I, TcF, TeU, Thr, T7B;
+		    E TcG, T1f, TeR, Tfg, ThB, T8G, TcU, T32, Tfj, TcX, T93, Tft, ThH, T9h, Td3;
+		    E T3X, TfI, Tde, Taa, Thw, Tfb, Tf6, T2A, T8x, TcO, T8m, TcR, Tfm, ThC, T3t;
+		    E Tfh, T96, TcV, T8V, TcY, ThI, TfL, Tfu, T4o, Tad, Td4, T9w, Tdf, Tgc, ThT;
+		    E T6b, TfV, Tbm, Tdn, TaF, Tdy, ThN, T4Q, TfN, TfA, Taf, Ta1, Td8, Tdh, ThO;
+		    E T5h, TfO, TfF, Tag, T9M, Tdb, Tdi, ThY, T6D, Tge, Tg1, Tbo, Tba, Tdr, TdA;
+		    E TaN, Tdt, Tg5, ThZ, Tg2, T74, Tds, TaU;
+		    {
+			 E T7a, Te, T78, T8, TjP, TiU, T7c, Tk;
+			 {
+			      E T1, TiT, TiS, T7, Tg, Tj, Tf, Ti, T7b, Th;
+			      T1 = ri[0];
+			      TiT = ii[0];
+			      {
+				   E T3, T6, T2, T5;
+				   T3 = ri[WS(rs, 32)];
+				   T6 = ii[WS(rs, 32)];
+				   T2 = W[62];
+				   T5 = W[63];
+				   {
+					E Ta, Td, Tc, T79, Tb, TiR, T4, T9;
+					Ta = ri[WS(rs, 16)];
+					Td = ii[WS(rs, 16)];
+					TiR = T2 * T6;
+					T4 = T2 * T3;
+					T9 = W[30];
+					Tc = W[31];
+					TiS = FNMS(T5, T3, TiR);
+					T7 = FMA(T5, T6, T4);
+					T79 = T9 * Td;
+					Tb = T9 * Ta;
+					Tg = ri[WS(rs, 48)];
+					Tj = ii[WS(rs, 48)];
+					T7a = FNMS(Tc, Ta, T79);
+					Te = FMA(Tc, Td, Tb);
+					Tf = W[94];
+					Ti = W[95];
+				   }
+			      }
+			      T78 = T1 - T7;
+			      T8 = T1 + T7;
+			      TjP = TiT - TiS;
+			      TiU = TiS + TiT;
+			      T7b = Tf * Tj;
+			      Th = Tf * Tg;
+			      T7c = FNMS(Ti, Tg, T7b);
+			      Tk = FMA(Ti, Tj, Th);
+			 }
+			 {
+			      E T7L, T1l, T7V, T1E, T1u, T1x, T1w, T7N, T1r, T7S, T1v;
+			      {
+				   E T1A, T1D, T1C, T7U, T1B;
+				   {
+					E T1h, T1k, T1g, T1j, T7K, T1i, T1z;
+					T1h = ri[WS(rs, 60)];
+					T1k = ii[WS(rs, 60)];
+					{
+					     E T7d, TiQ, Tl, TjQ;
+					     T7d = T7a - T7c;
+					     TiQ = T7a + T7c;
+					     Tl = Te + Tk;
+					     TjQ = Te - Tk;
+					     TiV = TiQ + TiU;
+					     Tjm = TiU - TiQ;
+					     T7e = T78 - T7d;
+					     TcA = T78 + T7d;
+					     TjR = TjP - TjQ;
+					     Tkl = TjQ + TjP;
+					     Tm = T8 + Tl;
+					     TeM = T8 - Tl;
+					     T1g = W[118];
+					}
+					T1j = W[119];
+					T1A = ri[WS(rs, 44)];
+					T1D = ii[WS(rs, 44)];
+					T7K = T1g * T1k;
+					T1i = T1g * T1h;
+					T1z = W[86];
+					T1C = W[87];
+					T7L = FNMS(T1j, T1h, T7K);
+					T1l = FMA(T1j, T1k, T1i);
+					T7U = T1z * T1D;
+					T1B = T1z * T1A;
+				   }
+				   {
+					E T1n, T1q, T1m, T1p, T7M, T1o, T1t;
+					T1n = ri[WS(rs, 28)];
+					T1q = ii[WS(rs, 28)];
+					T7V = FNMS(T1C, T1A, T7U);
+					T1E = FMA(T1C, T1D, T1B);
+					T1m = W[54];
+					T1p = W[55];
+					T1u = ri[WS(rs, 12)];
+					T1x = ii[WS(rs, 12)];
+					T7M = T1m * T1q;
+					T1o = T1m * T1n;
+					T1t = W[22];
+					T1w = W[23];
+					T7N = FNMS(T1p, T1n, T7M);
+					T1r = FMA(T1p, T1q, T1o);
+					T7S = T1t * T1x;
+					T1v = T1t * T1u;
+				   }
+			      }
+			      {
+				   E T7O, TeX, T1s, T7R, T7T, T1y;
+				   T7O = T7L - T7N;
+				   TeX = T7L + T7N;
+				   T1s = T1l + T1r;
+				   T7R = T1l - T1r;
+				   T7T = FNMS(T1w, T1u, T7S);
+				   T1y = FMA(T1w, T1x, T1v);
+				   {
+					E T7W, TeY, T7P, T1F;
+					T7W = T7T - T7V;
+					TeY = T7T + T7V;
+					T7P = T1y - T1E;
+					T1F = T1y + T1E;
+					TeZ = TeX - TeY;
+					Ths = TeX + TeY;
+					T7Q = T7O + T7P;
+					TcJ = T7O - T7P;
+					T1G = T1s + T1F;
+					TeW = T1s - T1F;
+					TcI = T7R + T7W;
+					T7X = T7R - T7W;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T82, T1O, T8s, T27, T1X, T20, T1Z, T84, T1U, T8p, T1Y;
+			 {
+			      E T23, T26, T25, T8r, T24;
+			      {
+				   E T1K, T1N, T1J, T1M, T81, T1L, T22;
+				   T1K = ri[WS(rs, 2)];
+				   T1N = ii[WS(rs, 2)];
+				   T1J = W[2];
+				   T1M = W[3];
+				   T23 = ri[WS(rs, 50)];
+				   T26 = ii[WS(rs, 50)];
+				   T81 = T1J * T1N;
+				   T1L = T1J * T1K;
+				   T22 = W[98];
+				   T25 = W[99];
+				   T82 = FNMS(T1M, T1K, T81);
+				   T1O = FMA(T1M, T1N, T1L);
+				   T8r = T22 * T26;
+				   T24 = T22 * T23;
+			      }
+			      {
+				   E T1Q, T1T, T1P, T1S, T83, T1R, T1W;
+				   T1Q = ri[WS(rs, 34)];
+				   T1T = ii[WS(rs, 34)];
+				   T8s = FNMS(T25, T23, T8r);
+				   T27 = FMA(T25, T26, T24);
+				   T1P = W[66];
+				   T1S = W[67];
+				   T1X = ri[WS(rs, 18)];
+				   T20 = ii[WS(rs, 18)];
+				   T83 = T1P * T1T;
+				   T1R = T1P * T1Q;
+				   T1W = W[34];
+				   T1Z = W[35];
+				   T84 = FNMS(T1S, T1Q, T83);
+				   T1U = FMA(T1S, T1T, T1R);
+				   T8p = T1W * T20;
+				   T1Y = T1W * T1X;
+			      }
+			 }
+			 {
+			      E T85, Tf3, T1V, T8o, T8q, T21;
+			      T85 = T82 - T84;
+			      Tf3 = T82 + T84;
+			      T1V = T1O + T1U;
+			      T8o = T1O - T1U;
+			      T8q = FNMS(T1Z, T1X, T8p);
+			      T21 = FMA(T1Z, T20, T1Y);
+			      {
+				   E T8t, Tf4, T86, T28;
+				   T8t = T8q - T8s;
+				   Tf4 = T8q + T8s;
+				   T86 = T21 - T27;
+				   T28 = T21 + T27;
+				   Tf5 = Tf3 - Tf4;
+				   Thv = Tf3 + Tf4;
+				   T87 = T85 + T86;
+				   TcN = T85 - T86;
+				   T29 = T1V + T28;
+				   Tf8 = T1V - T28;
+				   TcQ = T8o + T8t;
+				   T8u = T8o - T8t;
+			      }
+			 }
+		    }
+		    {
+			 E Tal, T5p, Tbh, T5I, T5y, T5B, T5A, Tan, T5v, Tbe, T5z;
+			 {
+			      E T5E, T5H, T5G, Tbg, T5F;
+			      {
+				   E T5l, T5o, T5k, T5n, Tak, T5m, T5D;
+				   T5l = ri[WS(rs, 63)];
+				   T5o = ii[WS(rs, 63)];
+				   T5k = W[124];
+				   T5n = W[125];
+				   T5E = ri[WS(rs, 47)];
+				   T5H = ii[WS(rs, 47)];
+				   Tak = T5k * T5o;
+				   T5m = T5k * T5l;
+				   T5D = W[92];
+				   T5G = W[93];
+				   Tal = FNMS(T5n, T5l, Tak);
+				   T5p = FMA(T5n, T5o, T5m);
+				   Tbg = T5D * T5H;
+				   T5F = T5D * T5E;
+			      }
+			      {
+				   E T5r, T5u, T5q, T5t, Tam, T5s, T5x;
+				   T5r = ri[WS(rs, 31)];
+				   T5u = ii[WS(rs, 31)];
+				   Tbh = FNMS(T5G, T5E, Tbg);
+				   T5I = FMA(T5G, T5H, T5F);
+				   T5q = W[60];
+				   T5t = W[61];
+				   T5y = ri[WS(rs, 15)];
+				   T5B = ii[WS(rs, 15)];
+				   Tam = T5q * T5u;
+				   T5s = T5q * T5r;
+				   T5x = W[28];
+				   T5A = W[29];
+				   Tan = FNMS(T5t, T5r, Tam);
+				   T5v = FMA(T5t, T5u, T5s);
+				   Tbe = T5x * T5B;
+				   T5z = T5x * T5y;
+			      }
+			 }
+			 {
+			      E Tao, TfS, T5w, Tbd, Tbf, T5C;
+			      Tao = Tal - Tan;
+			      TfS = Tal + Tan;
+			      T5w = T5p + T5v;
+			      Tbd = T5p - T5v;
+			      Tbf = FNMS(T5A, T5y, Tbe);
+			      T5C = FMA(T5A, T5B, T5z);
+			      {
+				   E Tbi, TfT, Tap, T5J;
+				   Tbi = Tbf - Tbh;
+				   TfT = Tbf + Tbh;
+				   Tap = T5C - T5I;
+				   T5J = T5C + T5I;
+				   TfU = TfS - TfT;
+				   ThS = TfS + TfT;
+				   Taq = Tao + Tap;
+				   Tdm = Tao - Tap;
+				   T5K = T5w + T5J;
+				   Tg9 = T5w - T5J;
+				   Tdx = Tbd + Tbi;
+				   Tbj = Tbd - Tbi;
+			      }
+			 }
+		    }
+		    {
+			 E T7G, T1d, T7z, TeS, T11, T7C, T7E, T17, T7r, T7m;
+			 {
+			      E T7g, Ts, T7q, TL, TB, TE, TD, T7i, Ty, T7n, TC;
+			      {
+				   E TH, TK, TJ, T7p, TI;
+				   {
+					E To, Tr, Tn, Tq, T7f, Tp, TG;
+					To = ri[WS(rs, 8)];
+					Tr = ii[WS(rs, 8)];
+					Tn = W[14];
+					Tq = W[15];
+					TH = ri[WS(rs, 24)];
+					TK = ii[WS(rs, 24)];
+					T7f = Tn * Tr;
+					Tp = Tn * To;
+					TG = W[46];
+					TJ = W[47];
+					T7g = FNMS(Tq, To, T7f);
+					Ts = FMA(Tq, Tr, Tp);
+					T7p = TG * TK;
+					TI = TG * TH;
+				   }
+				   {
+					E Tu, Tx, Tt, Tw, T7h, Tv, TA;
+					Tu = ri[WS(rs, 40)];
+					Tx = ii[WS(rs, 40)];
+					T7q = FNMS(TJ, TH, T7p);
+					TL = FMA(TJ, TK, TI);
+					Tt = W[78];
+					Tw = W[79];
+					TB = ri[WS(rs, 56)];
+					TE = ii[WS(rs, 56)];
+					T7h = Tt * Tx;
+					Tv = Tt * Tu;
+					TA = W[110];
+					TD = W[111];
+					T7i = FNMS(Tw, Tu, T7h);
+					Ty = FMA(Tw, Tx, Tv);
+					T7n = TA * TE;
+					TC = TA * TB;
+				   }
+			      }
+			      {
+				   E T7j, TeN, Tz, T7k, T7o, TF, TeO, TM;
+				   T7j = T7g - T7i;
+				   TeN = T7g + T7i;
+				   Tz = Ts + Ty;
+				   T7k = Ts - Ty;
+				   T7o = FNMS(TD, TB, T7n);
+				   TF = FMA(TD, TE, TC);
+				   T7r = T7o - T7q;
+				   TeO = T7o + T7q;
+				   TM = TF + TL;
+				   T7m = TF - TL;
+				   TcB = T7k + T7j;
+				   T7l = T7j - T7k;
+				   TiP = TeN + TeO;
+				   TeP = TeN - TeO;
+				   Tjl = TM - Tz;
+				   TN = Tz + TM;
+			      }
+			 }
+			 {
+			      E T7w, TU, T13, T16, T7y, T10, T12, T15, T7D, T14;
+			      {
+				   E T19, T1c, T18, T1b;
+				   {
+					E TQ, TT, TS, T7v, TR, TP;
+					TQ = ri[WS(rs, 4)];
+					TT = ii[WS(rs, 4)];
+					TP = W[6];
+					TcC = T7m - T7r;
+					T7s = T7m + T7r;
+					TS = W[7];
+					T7v = TP * TT;
+					TR = TP * TQ;
+					T19 = ri[WS(rs, 52)];
+					T1c = ii[WS(rs, 52)];
+					T7w = FNMS(TS, TQ, T7v);
+					TU = FMA(TS, TT, TR);
+					T18 = W[102];
+					T1b = W[103];
+				   }
+				   {
+					E TW, TZ, TY, T7x, TX, T7F, T1a, TV;
+					TW = ri[WS(rs, 36)];
+					TZ = ii[WS(rs, 36)];
+					T7F = T18 * T1c;
+					T1a = T18 * T19;
+					TV = W[70];
+					TY = W[71];
+					T7G = FNMS(T1b, T19, T7F);
+					T1d = FMA(T1b, T1c, T1a);
+					T7x = TV * TZ;
+					TX = TV * TW;
+					T13 = ri[WS(rs, 20)];
+					T16 = ii[WS(rs, 20)];
+					T7y = FNMS(TY, TW, T7x);
+					T10 = FMA(TY, TZ, TX);
+					T12 = W[38];
+					T15 = W[39];
+				   }
+			      }
+			      T7z = T7w - T7y;
+			      TeS = T7w + T7y;
+			      T11 = TU + T10;
+			      T7C = TU - T10;
+			      T7D = T12 * T16;
+			      T14 = T12 * T13;
+			      T7E = FNMS(T15, T13, T7D);
+			      T17 = FMA(T15, T16, T14);
+			 }
+			 {
+			      E T8B, T2H, T91, T30, T2Q, T2T, T2S, T8D, T2N, T8Y, T2R;
+			      {
+				   E T2W, T2Z, T2Y, T90, T2X;
+				   {
+					E T2D, T2G, T2C, T2F, T8A, T2E, T2V;
+					T2D = ri[WS(rs, 62)];
+					T2G = ii[WS(rs, 62)];
+					{
+					     E TeT, T7H, T1e, T7A;
+					     TeT = T7E + T7G;
+					     T7H = T7E - T7G;
+					     T1e = T17 + T1d;
+					     T7A = T17 - T1d;
+					     T7I = T7C - T7H;
+					     TcF = T7C + T7H;
+					     TeU = TeS - TeT;
+					     Thr = TeS + TeT;
+					     T7B = T7z + T7A;
+					     TcG = T7z - T7A;
+					     T1f = T11 + T1e;
+					     TeR = T11 - T1e;
+					     T2C = W[122];
+					}
+					T2F = W[123];
+					T2W = ri[WS(rs, 46)];
+					T2Z = ii[WS(rs, 46)];
+					T8A = T2C * T2G;
+					T2E = T2C * T2D;
+					T2V = W[90];
+					T2Y = W[91];
+					T8B = FNMS(T2F, T2D, T8A);
+					T2H = FMA(T2F, T2G, T2E);
+					T90 = T2V * T2Z;
+					T2X = T2V * T2W;
+				   }
+				   {
+					E T2J, T2M, T2I, T2L, T8C, T2K, T2P;
+					T2J = ri[WS(rs, 30)];
+					T2M = ii[WS(rs, 30)];
+					T91 = FNMS(T2Y, T2W, T90);
+					T30 = FMA(T2Y, T2Z, T2X);
+					T2I = W[58];
+					T2L = W[59];
+					T2Q = ri[WS(rs, 14)];
+					T2T = ii[WS(rs, 14)];
+					T8C = T2I * T2M;
+					T2K = T2I * T2J;
+					T2P = W[26];
+					T2S = W[27];
+					T8D = FNMS(T2L, T2J, T8C);
+					T2N = FMA(T2L, T2M, T2K);
+					T8Y = T2P * T2T;
+					T2R = T2P * T2Q;
+				   }
+			      }
+			      {
+				   E T8E, Tfe, T2O, T8X, T8Z, T2U;
+				   T8E = T8B - T8D;
+				   Tfe = T8B + T8D;
+				   T2O = T2H + T2N;
+				   T8X = T2H - T2N;
+				   T8Z = FNMS(T2S, T2Q, T8Y);
+				   T2U = FMA(T2S, T2T, T2R);
+				   {
+					E T92, Tff, T8F, T31;
+					T92 = T8Z - T91;
+					Tff = T8Z + T91;
+					T8F = T2U - T30;
+					T31 = T2U + T30;
+					Tfg = Tfe - Tff;
+					ThB = Tfe + Tff;
+					T8G = T8E + T8F;
+					TcU = T8E - T8F;
+					T32 = T2O + T31;
+					Tfj = T2O - T31;
+					TcX = T8X + T92;
+					T93 = T8X - T92;
+				   }
+			      }
+			 }
+			 {
+			      E T9c, T3C, Ta8, T3V, T3L, T3O, T3N, T9e, T3I, Ta5, T3M;
+			      {
+				   E T3R, T3U, T3T, Ta7, T3S;
+				   {
+					E T3y, T3B, T3x, T3A, T9b, T3z, T3Q;
+					T3y = ri[WS(rs, 1)];
+					T3B = ii[WS(rs, 1)];
+					T3x = W[0];
+					T3A = W[1];
+					T3R = ri[WS(rs, 49)];
+					T3U = ii[WS(rs, 49)];
+					T9b = T3x * T3B;
+					T3z = T3x * T3y;
+					T3Q = W[96];
+					T3T = W[97];
+					T9c = FNMS(T3A, T3y, T9b);
+					T3C = FMA(T3A, T3B, T3z);
+					Ta7 = T3Q * T3U;
+					T3S = T3Q * T3R;
+				   }
+				   {
+					E T3E, T3H, T3D, T3G, T9d, T3F, T3K;
+					T3E = ri[WS(rs, 33)];
+					T3H = ii[WS(rs, 33)];
+					Ta8 = FNMS(T3T, T3R, Ta7);
+					T3V = FMA(T3T, T3U, T3S);
+					T3D = W[64];
+					T3G = W[65];
+					T3L = ri[WS(rs, 17)];
+					T3O = ii[WS(rs, 17)];
+					T9d = T3D * T3H;
+					T3F = T3D * T3E;
+					T3K = W[32];
+					T3N = W[33];
+					T9e = FNMS(T3G, T3E, T9d);
+					T3I = FMA(T3G, T3H, T3F);
+					Ta5 = T3K * T3O;
+					T3M = T3K * T3L;
+				   }
+			      }
+			      {
+				   E T9f, Tfr, T3J, Ta4, Ta6, T3P;
+				   T9f = T9c - T9e;
+				   Tfr = T9c + T9e;
+				   T3J = T3C + T3I;
+				   Ta4 = T3C - T3I;
+				   Ta6 = FNMS(T3N, T3L, Ta5);
+				   T3P = FMA(T3N, T3O, T3M);
+				   {
+					E Ta9, Tfs, T9g, T3W;
+					Ta9 = Ta6 - Ta8;
+					Tfs = Ta6 + Ta8;
+					T9g = T3P - T3V;
+					T3W = T3P + T3V;
+					Tft = Tfr - Tfs;
+					ThH = Tfr + Tfs;
+					T9h = T9f + T9g;
+					Td3 = T9f - T9g;
+					T3X = T3J + T3W;
+					TfI = T3J - T3W;
+					Tde = Ta4 + Ta9;
+					Taa = Ta4 - Ta9;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E TaC, T69, Taw, Tga, T5X, Tar, TaA, T63;
+			 {
+			      E T8S, T3r, T8M, Tfk, T3f, T8H, T8Q, T3l;
+			      {
+				   E T8k, T8f, T8w, T8e;
+				   {
+					E T8a, T2f, T8j, T2y, T2o, T2r, T2q, T8c, T2l, T8g, T2p;
+					{
+					     E T2u, T2x, T2w, T8i, T2v;
+					     {
+						  E T2b, T2e, T2a, T2d, T89, T2c, T2t;
+						  T2b = ri[WS(rs, 10)];
+						  T2e = ii[WS(rs, 10)];
+						  T2a = W[18];
+						  T2d = W[19];
+						  T2u = ri[WS(rs, 26)];
+						  T2x = ii[WS(rs, 26)];
+						  T89 = T2a * T2e;
+						  T2c = T2a * T2b;
+						  T2t = W[50];
+						  T2w = W[51];
+						  T8a = FNMS(T2d, T2b, T89);
+						  T2f = FMA(T2d, T2e, T2c);
+						  T8i = T2t * T2x;
+						  T2v = T2t * T2u;
+					     }
+					     {
+						  E T2h, T2k, T2g, T2j, T8b, T2i, T2n;
+						  T2h = ri[WS(rs, 42)];
+						  T2k = ii[WS(rs, 42)];
+						  T8j = FNMS(T2w, T2u, T8i);
+						  T2y = FMA(T2w, T2x, T2v);
+						  T2g = W[82];
+						  T2j = W[83];
+						  T2o = ri[WS(rs, 58)];
+						  T2r = ii[WS(rs, 58)];
+						  T8b = T2g * T2k;
+						  T2i = T2g * T2h;
+						  T2n = W[114];
+						  T2q = W[115];
+						  T8c = FNMS(T2j, T2h, T8b);
+						  T2l = FMA(T2j, T2k, T2i);
+						  T8g = T2n * T2r;
+						  T2p = T2n * T2o;
+					     }
+					}
+					{
+					     E T8d, Tf9, T2m, T88, T8h, T2s, Tfa, T2z;
+					     T8d = T8a - T8c;
+					     Tf9 = T8a + T8c;
+					     T2m = T2f + T2l;
+					     T88 = T2f - T2l;
+					     T8h = FNMS(T2q, T2o, T8g);
+					     T2s = FMA(T2q, T2r, T2p);
+					     T8k = T8h - T8j;
+					     Tfa = T8h + T8j;
+					     T2z = T2s + T2y;
+					     T8f = T2s - T2y;
+					     T8w = T8d - T88;
+					     T8e = T88 + T8d;
+					     Thw = Tf9 + Tfa;
+					     Tfb = Tf9 - Tfa;
+					     Tf6 = T2z - T2m;
+					     T2A = T2m + T2z;
+					}
+				   }
+				   {
+					E T38, T8J, T3h, T3k, T8L, T3e, T3g, T3j, T8P, T3i;
+					{
+					     E T3n, T3q, T3m, T3p;
+					     {
+						  E T34, T37, T33, T8v, T8l, T36, T8I, T35;
+						  T34 = ri[WS(rs, 6)];
+						  T37 = ii[WS(rs, 6)];
+						  T33 = W[10];
+						  T8v = T8f + T8k;
+						  T8l = T8f - T8k;
+						  T36 = W[11];
+						  T8I = T33 * T37;
+						  T35 = T33 * T34;
+						  T8x = T8v - T8w;
+						  TcO = T8w + T8v;
+						  T8m = T8e - T8l;
+						  TcR = T8e + T8l;
+						  T38 = FMA(T36, T37, T35);
+						  T8J = FNMS(T36, T34, T8I);
+					     }
+					     T3n = ri[WS(rs, 22)];
+					     T3q = ii[WS(rs, 22)];
+					     T3m = W[42];
+					     T3p = W[43];
+					     {
+						  E T3a, T3d, T3c, T8K, T3b, T8R, T3o, T39;
+						  T3a = ri[WS(rs, 38)];
+						  T3d = ii[WS(rs, 38)];
+						  T8R = T3m * T3q;
+						  T3o = T3m * T3n;
+						  T39 = W[74];
+						  T3c = W[75];
+						  T8S = FNMS(T3p, T3n, T8R);
+						  T3r = FMA(T3p, T3q, T3o);
+						  T8K = T39 * T3d;
+						  T3b = T39 * T3a;
+						  T3h = ri[WS(rs, 54)];
+						  T3k = ii[WS(rs, 54)];
+						  T8L = FNMS(T3c, T3a, T8K);
+						  T3e = FMA(T3c, T3d, T3b);
+						  T3g = W[106];
+						  T3j = W[107];
+					     }
+					}
+					T8M = T8J - T8L;
+					Tfk = T8J + T8L;
+					T3f = T38 + T3e;
+					T8H = T38 - T3e;
+					T8P = T3g * T3k;
+					T3i = T3g * T3h;
+					T8Q = FNMS(T3j, T3h, T8P);
+					T3l = FMA(T3j, T3k, T3i);
+				   }
+			      }
+			      {
+				   E T9u, T9p, Tac, T9o;
+				   {
+					E T9k, T43, T9t, T4m, T4c, T4f, T4e, T9m, T49, T9q, T4d;
+					{
+					     E T4i, T4l, T4k, T9s, T4j;
+					     {
+						  E T3Z, T42, T3Y, T41, T9j, T40, T4h;
+						  {
+						       E T95, T8N, T8T, Tfl, T8O, T3s, T8U, T94;
+						       T3Z = ri[WS(rs, 9)];
+						       T95 = T8M - T8H;
+						       T8N = T8H + T8M;
+						       T8T = T8Q - T8S;
+						       Tfl = T8Q + T8S;
+						       T8O = T3l - T3r;
+						       T3s = T3l + T3r;
+						       T42 = ii[WS(rs, 9)];
+						       Tfm = Tfk - Tfl;
+						       ThC = Tfk + Tfl;
+						       T8U = T8O - T8T;
+						       T94 = T8O + T8T;
+						       T3t = T3f + T3s;
+						       Tfh = T3s - T3f;
+						       T96 = T94 - T95;
+						       TcV = T95 + T94;
+						       T8V = T8N - T8U;
+						       TcY = T8N + T8U;
+						       T3Y = W[16];
+						  }
+						  T41 = W[17];
+						  T4i = ri[WS(rs, 25)];
+						  T4l = ii[WS(rs, 25)];
+						  T9j = T3Y * T42;
+						  T40 = T3Y * T3Z;
+						  T4h = W[48];
+						  T4k = W[49];
+						  T9k = FNMS(T41, T3Z, T9j);
+						  T43 = FMA(T41, T42, T40);
+						  T9s = T4h * T4l;
+						  T4j = T4h * T4i;
+					     }
+					     {
+						  E T45, T48, T44, T47, T9l, T46, T4b;
+						  T45 = ri[WS(rs, 41)];
+						  T48 = ii[WS(rs, 41)];
+						  T9t = FNMS(T4k, T4i, T9s);
+						  T4m = FMA(T4k, T4l, T4j);
+						  T44 = W[80];
+						  T47 = W[81];
+						  T4c = ri[WS(rs, 57)];
+						  T4f = ii[WS(rs, 57)];
+						  T9l = T44 * T48;
+						  T46 = T44 * T45;
+						  T4b = W[112];
+						  T4e = W[113];
+						  T9m = FNMS(T47, T45, T9l);
+						  T49 = FMA(T47, T48, T46);
+						  T9q = T4b * T4f;
+						  T4d = T4b * T4c;
+					     }
+					}
+					{
+					     E T9n, TfJ, T4a, T9i, T9r, T4g, TfK, T4n;
+					     T9n = T9k - T9m;
+					     TfJ = T9k + T9m;
+					     T4a = T43 + T49;
+					     T9i = T43 - T49;
+					     T9r = FNMS(T4e, T4c, T9q);
+					     T4g = FMA(T4e, T4f, T4d);
+					     T9u = T9r - T9t;
+					     TfK = T9r + T9t;
+					     T4n = T4g + T4m;
+					     T9p = T4g - T4m;
+					     Tac = T9n - T9i;
+					     T9o = T9i + T9n;
+					     ThI = TfJ + TfK;
+					     TfL = TfJ - TfK;
+					     Tfu = T4n - T4a;
+					     T4o = T4a + T4n;
+					}
+				   }
+				   {
+					E T5Q, Tat, T5Z, T62, Tav, T5W, T5Y, T61, Taz, T60;
+					{
+					     E T65, T68, T64, T67;
+					     {
+						  E T5M, T5P, T5L, Tab, T9v, T5O, Tas, T5N;
+						  T5M = ri[WS(rs, 7)];
+						  T5P = ii[WS(rs, 7)];
+						  T5L = W[12];
+						  Tab = T9p + T9u;
+						  T9v = T9p - T9u;
+						  T5O = W[13];
+						  Tas = T5L * T5P;
+						  T5N = T5L * T5M;
+						  Tad = Tab - Tac;
+						  Td4 = Tac + Tab;
+						  T9w = T9o - T9v;
+						  Tdf = T9o + T9v;
+						  T5Q = FMA(T5O, T5P, T5N);
+						  Tat = FNMS(T5O, T5M, Tas);
+					     }
+					     T65 = ri[WS(rs, 23)];
+					     T68 = ii[WS(rs, 23)];
+					     T64 = W[44];
+					     T67 = W[45];
+					     {
+						  E T5S, T5V, T5U, Tau, T5T, TaB, T66, T5R;
+						  T5S = ri[WS(rs, 39)];
+						  T5V = ii[WS(rs, 39)];
+						  TaB = T64 * T68;
+						  T66 = T64 * T65;
+						  T5R = W[76];
+						  T5U = W[77];
+						  TaC = FNMS(T67, T65, TaB);
+						  T69 = FMA(T67, T68, T66);
+						  Tau = T5R * T5V;
+						  T5T = T5R * T5S;
+						  T5Z = ri[WS(rs, 55)];
+						  T62 = ii[WS(rs, 55)];
+						  Tav = FNMS(T5U, T5S, Tau);
+						  T5W = FMA(T5U, T5V, T5T);
+						  T5Y = W[108];
+						  T61 = W[109];
+					     }
+					}
+					Taw = Tat - Tav;
+					Tga = Tat + Tav;
+					T5X = T5Q + T5W;
+					Tar = T5Q - T5W;
+					Taz = T5Y * T62;
+					T60 = T5Y * T5Z;
+					TaA = FNMS(T61, T5Z, Taz);
+					T63 = FMA(T61, T62, T60);
+				   }
+			      }
+			 }
+			 {
+			      E T9E, Tda, TfE, TfB, Td9, T9L;
+			      {
+				   E T9T, Td7, Tfy, Tfz, Td6, Ta0;
+				   {
+					E T9V, T4v, T9R, T4O, T4E, T4H, T4G, T9X, T4B, T9O, T4F;
+					{
+					     E T4K, T4N, T4M, T9Q, T4L;
+					     {
+						  E T4r, T4u, T4q, T4t, T9U, T4s, T4J;
+						  {
+						       E Tbl, Tax, TaD, Tgb, Tay, T6a, TaE, Tbk;
+						       T4r = ri[WS(rs, 5)];
+						       Tbl = Taw - Tar;
+						       Tax = Tar + Taw;
+						       TaD = TaA - TaC;
+						       Tgb = TaA + TaC;
+						       Tay = T63 - T69;
+						       T6a = T63 + T69;
+						       T4u = ii[WS(rs, 5)];
+						       Tgc = Tga - Tgb;
+						       ThT = Tga + Tgb;
+						       TaE = Tay - TaD;
+						       Tbk = Tay + TaD;
+						       T6b = T5X + T6a;
+						       TfV = T6a - T5X;
+						       Tbm = Tbk - Tbl;
+						       Tdn = Tbl + Tbk;
+						       TaF = Tax - TaE;
+						       Tdy = Tax + TaE;
+						       T4q = W[8];
+						  }
+						  T4t = W[9];
+						  T4K = ri[WS(rs, 53)];
+						  T4N = ii[WS(rs, 53)];
+						  T9U = T4q * T4u;
+						  T4s = T4q * T4r;
+						  T4J = W[104];
+						  T4M = W[105];
+						  T9V = FNMS(T4t, T4r, T9U);
+						  T4v = FMA(T4t, T4u, T4s);
+						  T9Q = T4J * T4N;
+						  T4L = T4J * T4K;
+					     }
+					     {
+						  E T4x, T4A, T4w, T4z, T9W, T4y, T4D;
+						  T4x = ri[WS(rs, 37)];
+						  T4A = ii[WS(rs, 37)];
+						  T9R = FNMS(T4M, T4K, T9Q);
+						  T4O = FMA(T4M, T4N, T4L);
+						  T4w = W[72];
+						  T4z = W[73];
+						  T4E = ri[WS(rs, 21)];
+						  T4H = ii[WS(rs, 21)];
+						  T9W = T4w * T4A;
+						  T4y = T4w * T4x;
+						  T4D = W[40];
+						  T4G = W[41];
+						  T9X = FNMS(T4z, T4x, T9W);
+						  T4B = FMA(T4z, T4A, T4y);
+						  T9O = T4D * T4H;
+						  T4F = T4D * T4E;
+					     }
+					}
+					{
+					     E T9Y, Tfw, T4C, T9N, T9P, T4I;
+					     T9Y = T9V - T9X;
+					     Tfw = T9V + T9X;
+					     T4C = T4v + T4B;
+					     T9N = T4v - T4B;
+					     T9P = FNMS(T4G, T4E, T9O);
+					     T4I = FMA(T4G, T4H, T4F);
+					     {
+						  E Tfx, T9S, T9Z, T4P;
+						  Tfx = T9P + T9R;
+						  T9S = T9P - T9R;
+						  T9Z = T4I - T4O;
+						  T4P = T4I + T4O;
+						  T9T = T9N - T9S;
+						  Td7 = T9N + T9S;
+						  Tfy = Tfw - Tfx;
+						  ThN = Tfw + Tfx;
+						  Tfz = T4C - T4P;
+						  T4Q = T4C + T4P;
+						  Td6 = T9Y - T9Z;
+						  Ta0 = T9Y + T9Z;
+					     }
+					}
+				   }
+				   {
+					E T9G, T4W, T9C, T5f, T55, T58, T57, T9I, T52, T9z, T56;
+					{
+					     E T5b, T5e, T5d, T9B, T5c;
+					     {
+						  E T4S, T4V, T4R, T4U, T9F, T4T, T5a;
+						  T4S = ri[WS(rs, 61)];
+						  TfN = Tfz + Tfy;
+						  TfA = Tfy - Tfz;
+						  Taf = FMA(KP414213562, T9T, Ta0);
+						  Ta1 = FNMS(KP414213562, Ta0, T9T);
+						  Td8 = FNMS(KP414213562, Td7, Td6);
+						  Tdh = FMA(KP414213562, Td6, Td7);
+						  T4V = ii[WS(rs, 61)];
+						  T4R = W[120];
+						  T4U = W[121];
+						  T5b = ri[WS(rs, 45)];
+						  T5e = ii[WS(rs, 45)];
+						  T9F = T4R * T4V;
+						  T4T = T4R * T4S;
+						  T5a = W[88];
+						  T5d = W[89];
+						  T9G = FNMS(T4U, T4S, T9F);
+						  T4W = FMA(T4U, T4V, T4T);
+						  T9B = T5a * T5e;
+						  T5c = T5a * T5b;
+					     }
+					     {
+						  E T4Y, T51, T4X, T50, T9H, T4Z, T54;
+						  T4Y = ri[WS(rs, 29)];
+						  T51 = ii[WS(rs, 29)];
+						  T9C = FNMS(T5d, T5b, T9B);
+						  T5f = FMA(T5d, T5e, T5c);
+						  T4X = W[56];
+						  T50 = W[57];
+						  T55 = ri[WS(rs, 13)];
+						  T58 = ii[WS(rs, 13)];
+						  T9H = T4X * T51;
+						  T4Z = T4X * T4Y;
+						  T54 = W[24];
+						  T57 = W[25];
+						  T9I = FNMS(T50, T4Y, T9H);
+						  T52 = FMA(T50, T51, T4Z);
+						  T9z = T54 * T58;
+						  T56 = T54 * T55;
+					     }
+					}
+					{
+					     E T9J, TfC, T53, T9y, T9A, T59;
+					     T9J = T9G - T9I;
+					     TfC = T9G + T9I;
+					     T53 = T4W + T52;
+					     T9y = T4W - T52;
+					     T9A = FNMS(T57, T55, T9z);
+					     T59 = FMA(T57, T58, T56);
+					     {
+						  E TfD, T9D, T9K, T5g;
+						  TfD = T9A + T9C;
+						  T9D = T9A - T9C;
+						  T9K = T59 - T5f;
+						  T5g = T59 + T5f;
+						  T9E = T9y - T9D;
+						  Tda = T9y + T9D;
+						  TfE = TfC - TfD;
+						  ThO = TfC + TfD;
+						  TfB = T53 - T5g;
+						  T5h = T53 + T5g;
+						  Td9 = T9J - T9K;
+						  T9L = T9J + T9K;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E Tb2, Tdq, TfZ, Tg0, Tdp, Tb9;
+				   {
+					E Tb4, T6i, Tb0, T6B, T6r, T6u, T6t, Tb6, T6o, TaX, T6s;
+					{
+					     E T6x, T6A, T6z, TaZ, T6y;
+					     {
+						  E T6e, T6h, T6d, T6g, Tb3, T6f, T6w;
+						  T6e = ri[WS(rs, 3)];
+						  TfO = TfB - TfE;
+						  TfF = TfB + TfE;
+						  Tag = FNMS(KP414213562, T9E, T9L);
+						  T9M = FMA(KP414213562, T9L, T9E);
+						  Tdb = FMA(KP414213562, Tda, Td9);
+						  Tdi = FNMS(KP414213562, Td9, Tda);
+						  T6h = ii[WS(rs, 3)];
+						  T6d = W[4];
+						  T6g = W[5];
+						  T6x = ri[WS(rs, 51)];
+						  T6A = ii[WS(rs, 51)];
+						  Tb3 = T6d * T6h;
+						  T6f = T6d * T6e;
+						  T6w = W[100];
+						  T6z = W[101];
+						  Tb4 = FNMS(T6g, T6e, Tb3);
+						  T6i = FMA(T6g, T6h, T6f);
+						  TaZ = T6w * T6A;
+						  T6y = T6w * T6x;
+					     }
+					     {
+						  E T6k, T6n, T6j, T6m, Tb5, T6l, T6q;
+						  T6k = ri[WS(rs, 35)];
+						  T6n = ii[WS(rs, 35)];
+						  Tb0 = FNMS(T6z, T6x, TaZ);
+						  T6B = FMA(T6z, T6A, T6y);
+						  T6j = W[68];
+						  T6m = W[69];
+						  T6r = ri[WS(rs, 19)];
+						  T6u = ii[WS(rs, 19)];
+						  Tb5 = T6j * T6n;
+						  T6l = T6j * T6k;
+						  T6q = W[36];
+						  T6t = W[37];
+						  Tb6 = FNMS(T6m, T6k, Tb5);
+						  T6o = FMA(T6m, T6n, T6l);
+						  TaX = T6q * T6u;
+						  T6s = T6q * T6r;
+					     }
+					}
+					{
+					     E Tb7, TfX, T6p, TaW, TaY, T6v;
+					     Tb7 = Tb4 - Tb6;
+					     TfX = Tb4 + Tb6;
+					     T6p = T6i + T6o;
+					     TaW = T6i - T6o;
+					     TaY = FNMS(T6t, T6r, TaX);
+					     T6v = FMA(T6t, T6u, T6s);
+					     {
+						  E TfY, Tb1, Tb8, T6C;
+						  TfY = TaY + Tb0;
+						  Tb1 = TaY - Tb0;
+						  Tb8 = T6v - T6B;
+						  T6C = T6v + T6B;
+						  Tb2 = TaW - Tb1;
+						  Tdq = TaW + Tb1;
+						  TfZ = TfX - TfY;
+						  ThY = TfX + TfY;
+						  Tg0 = T6p - T6C;
+						  T6D = T6p + T6C;
+						  Tdp = Tb7 - Tb8;
+						  Tb9 = Tb7 + Tb8;
+					     }
+					}
+				   }
+				   {
+					E TaP, T6J, TaL, T72, T6S, T6V, T6U, TaR, T6P, TaI, T6T;
+					{
+					     E T6Y, T71, T70, TaK, T6Z;
+					     {
+						  E T6F, T6I, T6E, T6H, TaO, T6G, T6X;
+						  T6F = ri[WS(rs, 59)];
+						  Tge = Tg0 + TfZ;
+						  Tg1 = TfZ - Tg0;
+						  Tbo = FMA(KP414213562, Tb2, Tb9);
+						  Tba = FNMS(KP414213562, Tb9, Tb2);
+						  Tdr = FNMS(KP414213562, Tdq, Tdp);
+						  TdA = FMA(KP414213562, Tdp, Tdq);
+						  T6I = ii[WS(rs, 59)];
+						  T6E = W[116];
+						  T6H = W[117];
+						  T6Y = ri[WS(rs, 43)];
+						  T71 = ii[WS(rs, 43)];
+						  TaO = T6E * T6I;
+						  T6G = T6E * T6F;
+						  T6X = W[84];
+						  T70 = W[85];
+						  TaP = FNMS(T6H, T6F, TaO);
+						  T6J = FMA(T6H, T6I, T6G);
+						  TaK = T6X * T71;
+						  T6Z = T6X * T6Y;
+					     }
+					     {
+						  E T6L, T6O, T6K, T6N, TaQ, T6M, T6R;
+						  T6L = ri[WS(rs, 27)];
+						  T6O = ii[WS(rs, 27)];
+						  TaL = FNMS(T70, T6Y, TaK);
+						  T72 = FMA(T70, T71, T6Z);
+						  T6K = W[52];
+						  T6N = W[53];
+						  T6S = ri[WS(rs, 11)];
+						  T6V = ii[WS(rs, 11)];
+						  TaQ = T6K * T6O;
+						  T6M = T6K * T6L;
+						  T6R = W[20];
+						  T6U = W[21];
+						  TaR = FNMS(T6N, T6L, TaQ);
+						  T6P = FMA(T6N, T6O, T6M);
+						  TaI = T6R * T6V;
+						  T6T = T6R * T6S;
+					     }
+					}
+					{
+					     E TaS, Tg3, T6Q, TaH, TaJ, T6W;
+					     TaS = TaP - TaR;
+					     Tg3 = TaP + TaR;
+					     T6Q = T6J + T6P;
+					     TaH = T6J - T6P;
+					     TaJ = FNMS(T6U, T6S, TaI);
+					     T6W = FMA(T6U, T6V, T6T);
+					     {
+						  E Tg4, TaM, TaT, T73;
+						  Tg4 = TaJ + TaL;
+						  TaM = TaJ - TaL;
+						  TaT = T6W - T72;
+						  T73 = T6W + T72;
+						  TaN = TaH - TaM;
+						  Tdt = TaH + TaM;
+						  Tg5 = Tg3 - Tg4;
+						  ThZ = Tg3 + Tg4;
+						  Tg2 = T6Q - T73;
+						  T74 = T6Q + T73;
+						  Tds = TaS - TaT;
+						  TaU = TaS + TaT;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E Tgf, Tg6, Tbp, TaV, Tdu, TdB, Tje, Tjd, TjO, TjN;
+			 {
+			      E Thq, Tj7, Thy, ThA, Tht, Tj8, Thx, ThD, ThX, ThV, ThU, Ti0, ThM, ThK, ThJ;
+			      E ThP, TiI, TiZ, TiL, Tj0;
+			      {
+				   E Tio, T1I, Tj1, T3v, Tj2, TiX, TiN, Tir, T76, TiK, TiC, TiG, T5j, Tit, Tiw;
+				   E TiJ;
+				   {
+					E TiO, TiW, Tip, Tiq;
+					{
+					     E TO, T1H, T2B, T3u;
+					     Thq = Tm - TN;
+					     TO = Tm + TN;
+					     Tgf = Tg2 - Tg5;
+					     Tg6 = Tg2 + Tg5;
+					     Tbp = FNMS(KP414213562, TaN, TaU);
+					     TaV = FMA(KP414213562, TaU, TaN);
+					     Tdu = FMA(KP414213562, Tdt, Tds);
+					     TdB = FNMS(KP414213562, Tds, Tdt);
+					     T1H = T1f + T1G;
+					     Tj7 = T1G - T1f;
+					     Thy = T29 - T2A;
+					     T2B = T29 + T2A;
+					     T3u = T32 + T3t;
+					     ThA = T32 - T3t;
+					     Tht = Thr - Ths;
+					     TiO = Thr + Ths;
+					     Tio = TO - T1H;
+					     T1I = TO + T1H;
+					     Tj1 = T3u - T2B;
+					     T3v = T2B + T3u;
+					     TiW = TiP + TiV;
+					     Tj8 = TiV - TiP;
+					}
+					Thx = Thv - Thw;
+					Tip = Thv + Thw;
+					Tiq = ThB + ThC;
+					ThD = ThB - ThC;
+					{
+					     E T6c, T75, Tiz, TiA;
+					     ThX = T5K - T6b;
+					     T6c = T5K + T6b;
+					     Tj2 = TiW - TiO;
+					     TiX = TiO + TiW;
+					     TiN = Tip + Tiq;
+					     Tir = Tip - Tiq;
+					     T75 = T6D + T74;
+					     ThV = T74 - T6D;
+					     ThU = ThS - ThT;
+					     Tiz = ThS + ThT;
+					     TiA = ThY + ThZ;
+					     Ti0 = ThY - ThZ;
+					     {
+						  E T4p, Tiy, TiB, T5i, Tiu, Tiv;
+						  ThM = T3X - T4o;
+						  T4p = T3X + T4o;
+						  T76 = T6c + T75;
+						  Tiy = T6c - T75;
+						  TiK = Tiz + TiA;
+						  TiB = Tiz - TiA;
+						  T5i = T4Q + T5h;
+						  ThK = T5h - T4Q;
+						  ThJ = ThH - ThI;
+						  Tiu = ThH + ThI;
+						  Tiv = ThN + ThO;
+						  ThP = ThN - ThO;
+						  TiC = Tiy - TiB;
+						  TiG = Tiy + TiB;
+						  T5j = T4p + T5i;
+						  Tit = T4p - T5i;
+						  Tiw = Tiu - Tiv;
+						  TiJ = Tiu + Tiv;
+					     }
+					}
+				   }
+				   {
+					E TiE, Tis, TiD, Tj6, Tj5, Tj3, Tj4, TiH;
+					{
+					     E T3w, TiF, Tix, T77, TiM, TiY;
+					     TiI = T1I - T3v;
+					     T3w = T1I + T3v;
+					     TiF = Tiw - Tit;
+					     Tix = Tit + Tiw;
+					     T77 = T5j + T76;
+					     TiZ = T76 - T5j;
+					     TiL = TiJ - TiK;
+					     TiM = TiJ + TiK;
+					     TiY = TiN + TiX;
+					     Tj0 = TiX - TiN;
+					     TiE = Tio - Tir;
+					     Tis = Tio + Tir;
+					     ri[0] = T3w + T77;
+					     ri[WS(rs, 32)] = T3w - T77;
+					     ii[WS(rs, 32)] = TiY - TiM;
+					     ii[0] = TiM + TiY;
+					     TiD = Tix + TiC;
+					     Tj6 = TiC - Tix;
+					     Tj5 = Tj2 - Tj1;
+					     Tj3 = Tj1 + Tj2;
+					     Tj4 = TiF + TiG;
+					     TiH = TiF - TiG;
+					}
+					ri[WS(rs, 8)] = FMA(KP707106781, TiD, Tis);
+					ri[WS(rs, 40)] = FNMS(KP707106781, TiD, Tis);
+					ii[WS(rs, 40)] = FNMS(KP707106781, Tj4, Tj3);
+					ii[WS(rs, 8)] = FMA(KP707106781, Tj4, Tj3);
+					ri[WS(rs, 24)] = FMA(KP707106781, TiH, TiE);
+					ri[WS(rs, 56)] = FNMS(KP707106781, TiH, TiE);
+					ii[WS(rs, 56)] = FNMS(KP707106781, Tj6, Tj5);
+					ii[WS(rs, 24)] = FMA(KP707106781, Tj6, Tj5);
+				   }
+			      }
+			      {
+				   E Ti8, Thu, Tjf, Tj9, Tib, Tjg, Tja, ThF, Tih, ThW, Tif, Til, Ti5, ThR;
+				   ri[WS(rs, 16)] = TiI + TiL;
+				   ri[WS(rs, 48)] = TiI - TiL;
+				   ii[WS(rs, 48)] = Tj0 - TiZ;
+				   ii[WS(rs, 16)] = TiZ + Tj0;
+				   Ti8 = Thq + Tht;
+				   Thu = Thq - Tht;
+				   Tjf = Tj8 - Tj7;
+				   Tj9 = Tj7 + Tj8;
+				   {
+					E Tie, ThL, Tid, ThQ;
+					{
+					     E Ti9, Thz, Tia, ThE;
+					     Ti9 = Thy + Thx;
+					     Thz = Thx - Thy;
+					     Tia = ThA - ThD;
+					     ThE = ThA + ThD;
+					     Tib = Ti9 + Tia;
+					     Tjg = Tia - Ti9;
+					     Tja = Thz + ThE;
+					     ThF = Thz - ThE;
+					     Tie = ThJ + ThK;
+					     ThL = ThJ - ThK;
+					}
+					Tid = ThM + ThP;
+					ThQ = ThM - ThP;
+					Tih = ThU + ThV;
+					ThW = ThU - ThV;
+					Tif = FMA(KP414213562, Tie, Tid);
+					Til = FNMS(KP414213562, Tid, Tie);
+					Ti5 = FNMS(KP414213562, ThL, ThQ);
+					ThR = FMA(KP414213562, ThQ, ThL);
+				   }
+				   {
+					E Ti4, ThG, Tjh, Tjj, Tig, Ti1;
+					Ti4 = FNMS(KP707106781, ThF, Thu);
+					ThG = FMA(KP707106781, ThF, Thu);
+					Tjh = FMA(KP707106781, Tjg, Tjf);
+					Tjj = FNMS(KP707106781, Tjg, Tjf);
+					Tig = ThX + Ti0;
+					Ti1 = ThX - Ti0;
+					{
+					     E Tik, Tjb, Tjc, Tin;
+					     {
+						  E Tic, Tim, Ti6, Ti2, Tij, Tii;
+						  Tik = FNMS(KP707106781, Tib, Ti8);
+						  Tic = FMA(KP707106781, Tib, Ti8);
+						  Tii = FNMS(KP414213562, Tih, Tig);
+						  Tim = FMA(KP414213562, Tig, Tih);
+						  Ti6 = FMA(KP414213562, ThW, Ti1);
+						  Ti2 = FNMS(KP414213562, Ti1, ThW);
+						  Tij = Tif + Tii;
+						  Tje = Tii - Tif;
+						  Tjd = FNMS(KP707106781, Tja, Tj9);
+						  Tjb = FMA(KP707106781, Tja, Tj9);
+						  {
+						       E Ti7, Tji, Tjk, Ti3;
+						       Ti7 = Ti5 + Ti6;
+						       Tji = Ti6 - Ti5;
+						       Tjk = ThR + Ti2;
+						       Ti3 = ThR - Ti2;
+						       ri[WS(rs, 4)] = FMA(KP923879532, Tij, Tic);
+						       ri[WS(rs, 36)] = FNMS(KP923879532, Tij, Tic);
+						       ri[WS(rs, 60)] = FMA(KP923879532, Ti7, Ti4);
+						       ri[WS(rs, 28)] = FNMS(KP923879532, Ti7, Ti4);
+						       ii[WS(rs, 44)] = FNMS(KP923879532, Tji, Tjh);
+						       ii[WS(rs, 12)] = FMA(KP923879532, Tji, Tjh);
+						       ii[WS(rs, 60)] = FMA(KP923879532, Tjk, Tjj);
+						       ii[WS(rs, 28)] = FNMS(KP923879532, Tjk, Tjj);
+						       ri[WS(rs, 12)] = FMA(KP923879532, Ti3, ThG);
+						       ri[WS(rs, 44)] = FNMS(KP923879532, Ti3, ThG);
+						       Tjc = Til + Tim;
+						       Tin = Til - Tim;
+						  }
+					     }
+					     ii[WS(rs, 36)] = FNMS(KP923879532, Tjc, Tjb);
+					     ii[WS(rs, 4)] = FMA(KP923879532, Tjc, Tjb);
+					     ri[WS(rs, 20)] = FMA(KP923879532, Tin, Tik);
+					     ri[WS(rs, 52)] = FNMS(KP923879532, Tin, Tik);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E TjD, TjJ, Tgo, Tf2, Tjp, Tjv, Tha, TgI, Tgd, Tgr, Tjw, Tjq, Tfp, Tgg, Thk;
+			      E Tho, Th8, Th4, Tgv, TgB, Tgl, TfR, TjE, Thd, TjK, TgP, Tgx, Tg8, Thh, Thn;
+			      E Th7, TgX;
+			      {
+				   E TgJ, TgK, TgM, TgN, Tg7, TfW, Th1, Thj, Th0, Th2;
+				   {
+					E TgE, TeQ, TjB, Tjn, TgF, TgG, TjC, Tf1, TeV, Tf0;
+					TgE = TeM - TeP;
+					TeQ = TeM + TeP;
+					TjB = Tjm - Tjl;
+					Tjn = Tjl + Tjm;
+					TgF = TeU - TeR;
+					TeV = TeR + TeU;
+					ii[WS(rs, 52)] = FNMS(KP923879532, Tje, Tjd);
+					ii[WS(rs, 20)] = FMA(KP923879532, Tje, Tjd);
+					Tf0 = TeW - TeZ;
+					TgG = TeW + TeZ;
+					TjC = Tf0 - TeV;
+					Tf1 = TeV + Tf0;
+					{
+					     E Tfi, Tgp, Tfd, Tfn;
+					     {
+						  E Tf7, Tjo, TgH, Tfc;
+						  TgJ = Tf5 - Tf6;
+						  Tf7 = Tf5 + Tf6;
+						  TjD = FMA(KP707106781, TjC, TjB);
+						  TjJ = FNMS(KP707106781, TjC, TjB);
+						  Tgo = FMA(KP707106781, Tf1, TeQ);
+						  Tf2 = FNMS(KP707106781, Tf1, TeQ);
+						  Tjo = TgF + TgG;
+						  TgH = TgF - TgG;
+						  Tfc = Tf8 + Tfb;
+						  TgK = Tf8 - Tfb;
+						  TgM = Tfg - Tfh;
+						  Tfi = Tfg + Tfh;
+						  Tjp = FMA(KP707106781, Tjo, Tjn);
+						  Tjv = FNMS(KP707106781, Tjo, Tjn);
+						  Tha = FNMS(KP707106781, TgH, TgE);
+						  TgI = FMA(KP707106781, TgH, TgE);
+						  Tgp = FMA(KP414213562, Tf7, Tfc);
+						  Tfd = FNMS(KP414213562, Tfc, Tf7);
+						  Tfn = Tfj + Tfm;
+						  TgN = Tfj - Tfm;
+					     }
+					     {
+						  E TgY, TgZ, Tgq, Tfo;
+						  Tgd = Tg9 + Tgc;
+						  TgY = Tg9 - Tgc;
+						  TgZ = Tg6 - Tg1;
+						  Tg7 = Tg1 + Tg6;
+						  TfW = TfU + TfV;
+						  Th1 = TfU - TfV;
+						  Tgq = FNMS(KP414213562, Tfi, Tfn);
+						  Tfo = FMA(KP414213562, Tfn, Tfi);
+						  Thj = FMA(KP707106781, TgZ, TgY);
+						  Th0 = FNMS(KP707106781, TgZ, TgY);
+						  Tgr = Tgp + Tgq;
+						  Tjw = Tgq - Tgp;
+						  Tjq = Tfd + Tfo;
+						  Tfp = Tfd - Tfo;
+						  Th2 = Tge - Tgf;
+						  Tgg = Tge + Tgf;
+					     }
+					}
+				   }
+				   {
+					E TgU, TgS, TgR, TgV, Thb, TgL;
+					{
+					     E TfM, Tgu, TfH, TfP, Tgt, TfQ;
+					     {
+						  E Tfv, TfG, Thi, Th3;
+						  TgU = Tft - Tfu;
+						  Tfv = Tft + Tfu;
+						  TfG = TfA + TfF;
+						  TgS = TfF - TfA;
+						  TgR = TfI - TfL;
+						  TfM = TfI + TfL;
+						  Thi = FMA(KP707106781, Th2, Th1);
+						  Th3 = FNMS(KP707106781, Th2, Th1);
+						  Tgu = FMA(KP707106781, TfG, Tfv);
+						  TfH = FNMS(KP707106781, TfG, Tfv);
+						  Thk = FNMS(KP198912367, Thj, Thi);
+						  Tho = FMA(KP198912367, Thi, Thj);
+						  Th8 = FMA(KP668178637, Th0, Th3);
+						  Th4 = FNMS(KP668178637, Th3, Th0);
+						  TfP = TfN + TfO;
+						  TgV = TfN - TfO;
+					     }
+					     Tgt = FMA(KP707106781, TfP, TfM);
+					     TfQ = FNMS(KP707106781, TfP, TfM);
+					     Thb = FNMS(KP414213562, TgJ, TgK);
+					     TgL = FMA(KP414213562, TgK, TgJ);
+					     Tgv = FMA(KP198912367, Tgu, Tgt);
+					     TgB = FNMS(KP198912367, Tgt, Tgu);
+					     Tgl = FNMS(KP668178637, TfH, TfQ);
+					     TfR = FMA(KP668178637, TfQ, TfH);
+					}
+					{
+					     E Thg, TgT, Thc, TgO, Thf, TgW;
+					     Thc = FMA(KP414213562, TgM, TgN);
+					     TgO = FNMS(KP414213562, TgN, TgM);
+					     Thg = FMA(KP707106781, TgS, TgR);
+					     TgT = FNMS(KP707106781, TgS, TgR);
+					     TjE = Thc - Thb;
+					     Thd = Thb + Thc;
+					     TjK = TgL + TgO;
+					     TgP = TgL - TgO;
+					     Thf = FMA(KP707106781, TgV, TgU);
+					     TgW = FNMS(KP707106781, TgV, TgU);
+					     Tgx = FMA(KP707106781, Tg7, TfW);
+					     Tg8 = FNMS(KP707106781, Tg7, TfW);
+					     Thh = FMA(KP198912367, Thg, Thf);
+					     Thn = FNMS(KP198912367, Thf, Thg);
+					     Th7 = FNMS(KP668178637, TgT, TgW);
+					     TgX = FMA(KP668178637, TgW, TgT);
+					}
+				   }
+			      }
+			      {
+				   E Tju, Tjt, TjI, TjH;
+				   {
+					E Tgk, Tfq, Tjx, Tjz, Tgw, Tgh;
+					Tgk = FNMS(KP923879532, Tfp, Tf2);
+					Tfq = FMA(KP923879532, Tfp, Tf2);
+					Tjx = FMA(KP923879532, Tjw, Tjv);
+					Tjz = FNMS(KP923879532, Tjw, Tjv);
+					Tgw = FMA(KP707106781, Tgg, Tgd);
+					Tgh = FNMS(KP707106781, Tgg, Tgd);
+					{
+					     E TgA, Tjr, Tjs, TgD;
+					     {
+						  E Tgs, TgC, Tgm, Tgi, Tgz, Tgy;
+						  TgA = FNMS(KP923879532, Tgr, Tgo);
+						  Tgs = FMA(KP923879532, Tgr, Tgo);
+						  Tgy = FNMS(KP198912367, Tgx, Tgw);
+						  TgC = FMA(KP198912367, Tgw, Tgx);
+						  Tgm = FMA(KP668178637, Tg8, Tgh);
+						  Tgi = FNMS(KP668178637, Tgh, Tg8);
+						  Tgz = Tgv + Tgy;
+						  Tju = Tgy - Tgv;
+						  Tjt = FNMS(KP923879532, Tjq, Tjp);
+						  Tjr = FMA(KP923879532, Tjq, Tjp);
+						  {
+						       E Tgn, Tjy, TjA, Tgj;
+						       Tgn = Tgl + Tgm;
+						       Tjy = Tgm - Tgl;
+						       TjA = TfR + Tgi;
+						       Tgj = TfR - Tgi;
+						       ri[WS(rs, 2)] = FMA(KP980785280, Tgz, Tgs);
+						       ri[WS(rs, 34)] = FNMS(KP980785280, Tgz, Tgs);
+						       ri[WS(rs, 58)] = FMA(KP831469612, Tgn, Tgk);
+						       ri[WS(rs, 26)] = FNMS(KP831469612, Tgn, Tgk);
+						       ii[WS(rs, 42)] = FNMS(KP831469612, Tjy, Tjx);
+						       ii[WS(rs, 10)] = FMA(KP831469612, Tjy, Tjx);
+						       ii[WS(rs, 58)] = FMA(KP831469612, TjA, Tjz);
+						       ii[WS(rs, 26)] = FNMS(KP831469612, TjA, Tjz);
+						       ri[WS(rs, 10)] = FMA(KP831469612, Tgj, Tfq);
+						       ri[WS(rs, 42)] = FNMS(KP831469612, Tgj, Tfq);
+						       Tjs = TgB + TgC;
+						       TgD = TgB - TgC;
+						  }
+					     }
+					     ii[WS(rs, 34)] = FNMS(KP980785280, Tjs, Tjr);
+					     ii[WS(rs, 2)] = FMA(KP980785280, Tjs, Tjr);
+					     ri[WS(rs, 18)] = FMA(KP980785280, TgD, TgA);
+					     ri[WS(rs, 50)] = FNMS(KP980785280, TgD, TgA);
+					}
+				   }
+				   {
+					E Th6, TjF, TjG, Th9, TgQ, Th5;
+					Th6 = FNMS(KP923879532, TgP, TgI);
+					TgQ = FMA(KP923879532, TgP, TgI);
+					Th5 = TgX + Th4;
+					TjI = Th4 - TgX;
+					TjH = FNMS(KP923879532, TjE, TjD);
+					TjF = FMA(KP923879532, TjE, TjD);
+					ii[WS(rs, 50)] = FNMS(KP980785280, Tju, Tjt);
+					ii[WS(rs, 18)] = FMA(KP980785280, Tju, Tjt);
+					ri[WS(rs, 6)] = FMA(KP831469612, Th5, TgQ);
+					ri[WS(rs, 38)] = FNMS(KP831469612, Th5, TgQ);
+					TjG = Th7 + Th8;
+					Th9 = Th7 - Th8;
+					ii[WS(rs, 38)] = FNMS(KP831469612, TjG, TjF);
+					ii[WS(rs, 6)] = FMA(KP831469612, TjG, TjF);
+					ri[WS(rs, 22)] = FMA(KP831469612, Th9, Th6);
+					ri[WS(rs, 54)] = FNMS(KP831469612, Th9, Th6);
+				   }
+				   {
+					E Thm, TjL, TjM, Thp, The, Thl;
+					Thm = FMA(KP923879532, Thd, Tha);
+					The = FNMS(KP923879532, Thd, Tha);
+					Thl = Thh - Thk;
+					TjO = Thh + Thk;
+					TjN = FMA(KP923879532, TjK, TjJ);
+					TjL = FNMS(KP923879532, TjK, TjJ);
+					ii[WS(rs, 54)] = FNMS(KP831469612, TjI, TjH);
+					ii[WS(rs, 22)] = FMA(KP831469612, TjI, TjH);
+					ri[WS(rs, 14)] = FMA(KP980785280, Thl, The);
+					ri[WS(rs, 46)] = FNMS(KP980785280, Thl, The);
+					TjM = Tho - Thn;
+					Thp = Thn + Tho;
+					ii[WS(rs, 46)] = FNMS(KP980785280, TjM, TjL);
+					ii[WS(rs, 14)] = FMA(KP980785280, TjM, TjL);
+					ri[WS(rs, 62)] = FMA(KP980785280, Thp, Thm);
+					ri[WS(rs, 30)] = FNMS(KP980785280, Thp, Thm);
+				   }
+			      }
+			 }
+			 {
+			      E TjS, TcD, Tcw, TkO, TkN, Tcz;
+			      {
+				   E TbB, Tkw, Tkq, T99, TbF, TbL, Tbv, Taj, Tcu, Tcy, Tci, Tce, Tcr, Tcx, Tch;
+				   E Tc7, TkE, Tcn, TkK, TbZ, TbP, T7J, TbO, T7u, TkB, Tkn, TbI, TbM, Tbw, Tbs;
+				   E T7Y, TbQ;
+				   {
+					E TbT, TbU, TbW, TbX, Tc4, Tc2, Tc1, Tc5, Tbn, Tbb, TaG, Tcb, Tct, Tca, Tcc;
+					E Tbq, Tcl, TbV;
+					{
+					     E T8W, Tbz, T8z, T97, T8n, T8y;
+					     TbT = FMA(KP707106781, T8m, T87);
+					     T8n = FNMS(KP707106781, T8m, T87);
+					     T8y = FNMS(KP707106781, T8x, T8u);
+					     TbU = FMA(KP707106781, T8x, T8u);
+					     TbW = FMA(KP707106781, T8V, T8G);
+					     T8W = FNMS(KP707106781, T8V, T8G);
+					     ii[WS(rs, 62)] = FMA(KP980785280, TjO, TjN);
+					     ii[WS(rs, 30)] = FNMS(KP980785280, TjO, TjN);
+					     Tbz = FMA(KP668178637, T8n, T8y);
+					     T8z = FNMS(KP668178637, T8y, T8n);
+					     T97 = FNMS(KP707106781, T96, T93);
+					     TbX = FMA(KP707106781, T96, T93);
+					     {
+						  E Tae, TbE, Ta3, Tah;
+						  {
+						       E T9x, Ta2, TbA, T98;
+						       Tc4 = FMA(KP707106781, T9w, T9h);
+						       T9x = FNMS(KP707106781, T9w, T9h);
+						       Ta2 = T9M - Ta1;
+						       Tc2 = Ta1 + T9M;
+						       Tc1 = FMA(KP707106781, Tad, Taa);
+						       Tae = FNMS(KP707106781, Tad, Taa);
+						       TbA = FNMS(KP668178637, T8W, T97);
+						       T98 = FMA(KP668178637, T97, T8W);
+						       TbE = FMA(KP923879532, Ta2, T9x);
+						       Ta3 = FNMS(KP923879532, Ta2, T9x);
+						       TbB = Tbz + TbA;
+						       Tkw = TbA - Tbz;
+						       Tkq = T8z + T98;
+						       T99 = T8z - T98;
+						       Tah = Taf - Tag;
+						       Tc5 = Taf + Tag;
+						  }
+						  {
+						       E Tc8, Tc9, TbD, Tai;
+						       Tbn = FNMS(KP707106781, Tbm, Tbj);
+						       Tc8 = FMA(KP707106781, Tbm, Tbj);
+						       Tc9 = Tba + TaV;
+						       Tbb = TaV - Tba;
+						       TaG = FNMS(KP707106781, TaF, Taq);
+						       Tcb = FMA(KP707106781, TaF, Taq);
+						       TbD = FMA(KP923879532, Tah, Tae);
+						       Tai = FNMS(KP923879532, Tah, Tae);
+						       Tct = FMA(KP923879532, Tc9, Tc8);
+						       Tca = FNMS(KP923879532, Tc9, Tc8);
+						       TbF = FMA(KP303346683, TbE, TbD);
+						       TbL = FNMS(KP303346683, TbD, TbE);
+						       Tbv = FNMS(KP534511135, Ta3, Tai);
+						       Taj = FMA(KP534511135, Tai, Ta3);
+						       Tcc = Tbo + Tbp;
+						       Tbq = Tbo - Tbp;
+						  }
+					     }
+					}
+					{
+					     E Tcq, Tc3, Tcs, Tcd, Tcp, Tc6;
+					     Tcs = FMA(KP923879532, Tcc, Tcb);
+					     Tcd = FNMS(KP923879532, Tcc, Tcb);
+					     Tcq = FMA(KP923879532, Tc2, Tc1);
+					     Tc3 = FNMS(KP923879532, Tc2, Tc1);
+					     Tcu = FNMS(KP098491403, Tct, Tcs);
+					     Tcy = FMA(KP098491403, Tcs, Tct);
+					     Tci = FMA(KP820678790, Tca, Tcd);
+					     Tce = FNMS(KP820678790, Tcd, Tca);
+					     Tcp = FMA(KP923879532, Tc5, Tc4);
+					     Tc6 = FNMS(KP923879532, Tc5, Tc4);
+					     Tcl = FNMS(KP198912367, TbT, TbU);
+					     TbV = FMA(KP198912367, TbU, TbT);
+					     Tcr = FMA(KP098491403, Tcq, Tcp);
+					     Tcx = FNMS(KP098491403, Tcp, Tcq);
+					     Tch = FNMS(KP820678790, Tc3, Tc6);
+					     Tc7 = FMA(KP820678790, Tc6, Tc3);
+					}
+					{
+					     E TbH, Tbc, Tcm, TbY;
+					     Tcm = FMA(KP198912367, TbW, TbX);
+					     TbY = FNMS(KP198912367, TbX, TbW);
+					     TbH = FMA(KP923879532, Tbb, TaG);
+					     Tbc = FNMS(KP923879532, Tbb, TaG);
+					     TkE = Tcm - Tcl;
+					     Tcn = Tcl + Tcm;
+					     TkK = TbV + TbY;
+					     TbZ = TbV - TbY;
+					     {
+						  E T7t, Tkm, TbG, Tbr;
+						  TjS = T7l + T7s;
+						  T7t = T7l - T7s;
+						  Tkm = TcC - TcB;
+						  TcD = TcB + TcC;
+						  TbP = FNMS(KP414213562, T7B, T7I);
+						  T7J = FMA(KP414213562, T7I, T7B);
+						  TbG = FMA(KP923879532, Tbq, Tbn);
+						  Tbr = FNMS(KP923879532, Tbq, Tbn);
+						  TbO = FNMS(KP707106781, T7t, T7e);
+						  T7u = FMA(KP707106781, T7t, T7e);
+						  TkB = FNMS(KP707106781, Tkm, Tkl);
+						  Tkn = FMA(KP707106781, Tkm, Tkl);
+						  TbI = FNMS(KP303346683, TbH, TbG);
+						  TbM = FMA(KP303346683, TbG, TbH);
+						  Tbw = FMA(KP534511135, Tbc, Tbr);
+						  Tbs = FNMS(KP534511135, Tbr, Tbc);
+						  T7Y = FNMS(KP414213562, T7X, T7Q);
+						  TbQ = FMA(KP414213562, T7Q, T7X);
+					     }
+					}
+				   }
+				   {
+					E TkJ, TkD, Tck, TbS, TbK, Tku, Tkt, TbN;
+					{
+					     E TkA, Tby, Tkp, Tbu, Tkz, Tbx;
+					     {
+						  E Tbt, T9a, Tkx, Tky, Tkv;
+						  TkA = Taj + Tbs;
+						  Tbt = Taj - Tbs;
+						  {
+						       E TkC, T7Z, Tko, TbR, T80;
+						       TkC = T7J + T7Y;
+						       T7Z = T7J - T7Y;
+						       Tko = TbQ - TbP;
+						       TbR = TbP + TbQ;
+						       TkJ = FMA(KP923879532, TkC, TkB);
+						       TkD = FNMS(KP923879532, TkC, TkB);
+						       Tby = FMA(KP923879532, T7Z, T7u);
+						       T80 = FNMS(KP923879532, T7Z, T7u);
+						       Tkv = FNMS(KP923879532, Tko, Tkn);
+						       Tkp = FMA(KP923879532, Tko, Tkn);
+						       Tck = FMA(KP923879532, TbR, TbO);
+						       TbS = FNMS(KP923879532, TbR, TbO);
+						       T9a = FMA(KP831469612, T99, T80);
+						       Tbu = FNMS(KP831469612, T99, T80);
+						  }
+						  Tkz = FNMS(KP831469612, Tkw, Tkv);
+						  Tkx = FMA(KP831469612, Tkw, Tkv);
+						  Tky = Tbw - Tbv;
+						  Tbx = Tbv + Tbw;
+						  ri[WS(rs, 11)] = FMA(KP881921264, Tbt, T9a);
+						  ri[WS(rs, 43)] = FNMS(KP881921264, Tbt, T9a);
+						  ii[WS(rs, 43)] = FNMS(KP881921264, Tky, Tkx);
+						  ii[WS(rs, 11)] = FMA(KP881921264, Tky, Tkx);
+					     }
+					     {
+						  E TbC, TbJ, Tkr, Tks;
+						  TbK = FNMS(KP831469612, TbB, Tby);
+						  TbC = FMA(KP831469612, TbB, Tby);
+						  ri[WS(rs, 59)] = FMA(KP881921264, Tbx, Tbu);
+						  ri[WS(rs, 27)] = FNMS(KP881921264, Tbx, Tbu);
+						  ii[WS(rs, 59)] = FMA(KP881921264, TkA, Tkz);
+						  ii[WS(rs, 27)] = FNMS(KP881921264, TkA, Tkz);
+						  TbJ = TbF + TbI;
+						  Tku = TbI - TbF;
+						  Tkt = FNMS(KP831469612, Tkq, Tkp);
+						  Tkr = FMA(KP831469612, Tkq, Tkp);
+						  Tks = TbL + TbM;
+						  TbN = TbL - TbM;
+						  ri[WS(rs, 3)] = FMA(KP956940335, TbJ, TbC);
+						  ri[WS(rs, 35)] = FNMS(KP956940335, TbJ, TbC);
+						  ii[WS(rs, 35)] = FNMS(KP956940335, Tks, Tkr);
+						  ii[WS(rs, 3)] = FMA(KP956940335, Tks, Tkr);
+					     }
+					}
+					{
+					     E Tcg, TkI, TkH, Tcj;
+					     {
+						  E Tc0, Tcf, TkF, TkG;
+						  Tcg = FNMS(KP980785280, TbZ, TbS);
+						  Tc0 = FMA(KP980785280, TbZ, TbS);
+						  ri[WS(rs, 19)] = FMA(KP956940335, TbN, TbK);
+						  ri[WS(rs, 51)] = FNMS(KP956940335, TbN, TbK);
+						  ii[WS(rs, 51)] = FNMS(KP956940335, Tku, Tkt);
+						  ii[WS(rs, 19)] = FMA(KP956940335, Tku, Tkt);
+						  Tcf = Tc7 + Tce;
+						  TkI = Tce - Tc7;
+						  TkH = FNMS(KP980785280, TkE, TkD);
+						  TkF = FMA(KP980785280, TkE, TkD);
+						  TkG = Tch + Tci;
+						  Tcj = Tch - Tci;
+						  ri[WS(rs, 7)] = FMA(KP773010453, Tcf, Tc0);
+						  ri[WS(rs, 39)] = FNMS(KP773010453, Tcf, Tc0);
+						  ii[WS(rs, 39)] = FNMS(KP773010453, TkG, TkF);
+						  ii[WS(rs, 7)] = FMA(KP773010453, TkG, TkF);
+					     }
+					     {
+						  E Tco, Tcv, TkL, TkM;
+						  Tcw = FMA(KP980785280, Tcn, Tck);
+						  Tco = FNMS(KP980785280, Tcn, Tck);
+						  ri[WS(rs, 23)] = FMA(KP773010453, Tcj, Tcg);
+						  ri[WS(rs, 55)] = FNMS(KP773010453, Tcj, Tcg);
+						  ii[WS(rs, 55)] = FNMS(KP773010453, TkI, TkH);
+						  ii[WS(rs, 23)] = FMA(KP773010453, TkI, TkH);
+						  Tcv = Tcr - Tcu;
+						  TkO = Tcr + Tcu;
+						  TkN = FMA(KP980785280, TkK, TkJ);
+						  TkL = FNMS(KP980785280, TkK, TkJ);
+						  TkM = Tcy - Tcx;
+						  Tcz = Tcx + Tcy;
+						  ri[WS(rs, 15)] = FMA(KP995184726, Tcv, Tco);
+						  ri[WS(rs, 47)] = FNMS(KP995184726, Tcv, Tco);
+						  ii[WS(rs, 47)] = FNMS(KP995184726, TkM, TkL);
+						  ii[WS(rs, 15)] = FMA(KP995184726, TkM, TkL);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E TdN, Tk2, TjW, Td1, TdR, TdX, TdH, Tdl, TeG, TeK, Teu, Teq, TeD, TeJ, Tet;
+				   E Tej, Tka, Tez, Tkg, Teb, Te1, TcH, Te0, TcE, Tk7, TjT, TdU, TdY, TdI, TdE;
+				   E TcK, Te2;
+				   {
+					E Te5, Te6, Te8, Te9, Teg, Tee, Ted, Teh, Tdz, Tdv, Tdo, Ten, TeF, Tem, Teo;
+					E TdC, Tex, Te7;
+					{
+					     E TcP, TcS, TcW, TcZ;
+					     Te5 = FNMS(KP707106781, TcO, TcN);
+					     TcP = FMA(KP707106781, TcO, TcN);
+					     ri[WS(rs, 63)] = FMA(KP995184726, Tcz, Tcw);
+					     ri[WS(rs, 31)] = FNMS(KP995184726, Tcz, Tcw);
+					     ii[WS(rs, 63)] = FMA(KP995184726, TkO, TkN);
+					     ii[WS(rs, 31)] = FNMS(KP995184726, TkO, TkN);
+					     TcS = FMA(KP707106781, TcR, TcQ);
+					     Te6 = FNMS(KP707106781, TcR, TcQ);
+					     Te8 = FNMS(KP707106781, TcV, TcU);
+					     TcW = FMA(KP707106781, TcV, TcU);
+					     TcZ = FMA(KP707106781, TcY, TcX);
+					     Te9 = FNMS(KP707106781, TcY, TcX);
+					     {
+						  E Tdg, TdQ, Tdd, Tdj;
+						  {
+						       E Td5, TdL, TcT, TdM, Td0, Tdc;
+						       Teg = FNMS(KP707106781, Td4, Td3);
+						       Td5 = FMA(KP707106781, Td4, Td3);
+						       TdL = FMA(KP198912367, TcP, TcS);
+						       TcT = FNMS(KP198912367, TcS, TcP);
+						       TdM = FNMS(KP198912367, TcW, TcZ);
+						       Td0 = FMA(KP198912367, TcZ, TcW);
+						       Tdc = Td8 + Tdb;
+						       Tee = Tdb - Td8;
+						       Ted = FNMS(KP707106781, Tdf, Tde);
+						       Tdg = FMA(KP707106781, Tdf, Tde);
+						       TdN = TdL + TdM;
+						       Tk2 = TdM - TdL;
+						       TjW = TcT + Td0;
+						       Td1 = TcT - Td0;
+						       TdQ = FMA(KP923879532, Tdc, Td5);
+						       Tdd = FNMS(KP923879532, Tdc, Td5);
+						       Tdj = Tdh + Tdi;
+						       Teh = Tdh - Tdi;
+						  }
+						  {
+						       E Tek, Tel, TdP, Tdk;
+						       Tdz = FMA(KP707106781, Tdy, Tdx);
+						       Tek = FNMS(KP707106781, Tdy, Tdx);
+						       Tel = Tdu - Tdr;
+						       Tdv = Tdr + Tdu;
+						       Tdo = FMA(KP707106781, Tdn, Tdm);
+						       Ten = FNMS(KP707106781, Tdn, Tdm);
+						       TdP = FMA(KP923879532, Tdj, Tdg);
+						       Tdk = FNMS(KP923879532, Tdj, Tdg);
+						       TeF = FMA(KP923879532, Tel, Tek);
+						       Tem = FNMS(KP923879532, Tel, Tek);
+						       TdR = FMA(KP098491403, TdQ, TdP);
+						       TdX = FNMS(KP098491403, TdP, TdQ);
+						       TdH = FNMS(KP820678790, Tdd, Tdk);
+						       Tdl = FMA(KP820678790, Tdk, Tdd);
+						       Teo = TdA - TdB;
+						       TdC = TdA + TdB;
+						  }
+					     }
+					}
+					{
+					     E TeC, Tef, TeE, Tep, TeB, Tei;
+					     TeE = FMA(KP923879532, Teo, Ten);
+					     Tep = FNMS(KP923879532, Teo, Ten);
+					     TeC = FMA(KP923879532, Tee, Ted);
+					     Tef = FNMS(KP923879532, Tee, Ted);
+					     TeG = FNMS(KP303346683, TeF, TeE);
+					     TeK = FMA(KP303346683, TeE, TeF);
+					     Teu = FMA(KP534511135, Tem, Tep);
+					     Teq = FNMS(KP534511135, Tep, Tem);
+					     TeB = FMA(KP923879532, Teh, Teg);
+					     Tei = FNMS(KP923879532, Teh, Teg);
+					     Tex = FNMS(KP668178637, Te5, Te6);
+					     Te7 = FMA(KP668178637, Te6, Te5);
+					     TeD = FMA(KP303346683, TeC, TeB);
+					     TeJ = FNMS(KP303346683, TeB, TeC);
+					     Tet = FNMS(KP534511135, Tef, Tei);
+					     Tej = FMA(KP534511135, Tei, Tef);
+					}
+					{
+					     E TdT, Tdw, Tey, Tea, TdS, TdD;
+					     Tey = FMA(KP668178637, Te8, Te9);
+					     Tea = FNMS(KP668178637, Te9, Te8);
+					     TdT = FMA(KP923879532, Tdv, Tdo);
+					     Tdw = FNMS(KP923879532, Tdv, Tdo);
+					     Tka = Tey - Tex;
+					     Tez = Tex + Tey;
+					     Tkg = Te7 + Tea;
+					     Teb = Te7 - Tea;
+					     Te1 = FNMS(KP414213562, TcF, TcG);
+					     TcH = FMA(KP414213562, TcG, TcF);
+					     TdS = FMA(KP923879532, TdC, Tdz);
+					     TdD = FNMS(KP923879532, TdC, Tdz);
+					     Te0 = FNMS(KP707106781, TcD, TcA);
+					     TcE = FMA(KP707106781, TcD, TcA);
+					     Tk7 = FNMS(KP707106781, TjS, TjR);
+					     TjT = FMA(KP707106781, TjS, TjR);
+					     TdU = FNMS(KP098491403, TdT, TdS);
+					     TdY = FMA(KP098491403, TdS, TdT);
+					     TdI = FMA(KP820678790, Tdw, TdD);
+					     TdE = FNMS(KP820678790, TdD, Tdw);
+					     TcK = FNMS(KP414213562, TcJ, TcI);
+					     Te2 = FMA(KP414213562, TcI, TcJ);
+					}
+				   }
+				   {
+					E Tkf, Tk9, Tew, Te4, TdW, Tk0, TjZ, TdZ;
+					{
+					     E Tk6, TdK, TjV, TdG, Tk5, TdJ;
+					     {
+						  E TdF, Td2, Tk3, Tk4, Tk1;
+						  Tk6 = Tdl + TdE;
+						  TdF = Tdl - TdE;
+						  {
+						       E Tk8, TcL, TjU, Te3, TcM;
+						       Tk8 = TcK - TcH;
+						       TcL = TcH + TcK;
+						       TjU = Te1 + Te2;
+						       Te3 = Te1 - Te2;
+						       Tkf = FNMS(KP923879532, Tk8, Tk7);
+						       Tk9 = FMA(KP923879532, Tk8, Tk7);
+						       TdK = FMA(KP923879532, TcL, TcE);
+						       TcM = FNMS(KP923879532, TcL, TcE);
+						       Tk1 = FNMS(KP923879532, TjU, TjT);
+						       TjV = FMA(KP923879532, TjU, TjT);
+						       Tew = FNMS(KP923879532, Te3, Te0);
+						       Te4 = FMA(KP923879532, Te3, Te0);
+						       Td2 = FMA(KP980785280, Td1, TcM);
+						       TdG = FNMS(KP980785280, Td1, TcM);
+						  }
+						  Tk5 = FNMS(KP980785280, Tk2, Tk1);
+						  Tk3 = FMA(KP980785280, Tk2, Tk1);
+						  Tk4 = TdI - TdH;
+						  TdJ = TdH + TdI;
+						  ri[WS(rs, 9)] = FMA(KP773010453, TdF, Td2);
+						  ri[WS(rs, 41)] = FNMS(KP773010453, TdF, Td2);
+						  ii[WS(rs, 41)] = FNMS(KP773010453, Tk4, Tk3);
+						  ii[WS(rs, 9)] = FMA(KP773010453, Tk4, Tk3);
+					     }
+					     {
+						  E TdO, TdV, TjX, TjY;
+						  TdW = FNMS(KP980785280, TdN, TdK);
+						  TdO = FMA(KP980785280, TdN, TdK);
+						  ri[WS(rs, 57)] = FMA(KP773010453, TdJ, TdG);
+						  ri[WS(rs, 25)] = FNMS(KP773010453, TdJ, TdG);
+						  ii[WS(rs, 57)] = FMA(KP773010453, Tk6, Tk5);
+						  ii[WS(rs, 25)] = FNMS(KP773010453, Tk6, Tk5);
+						  TdV = TdR + TdU;
+						  Tk0 = TdU - TdR;
+						  TjZ = FNMS(KP980785280, TjW, TjV);
+						  TjX = FMA(KP980785280, TjW, TjV);
+						  TjY = TdX + TdY;
+						  TdZ = TdX - TdY;
+						  ri[WS(rs, 1)] = FMA(KP995184726, TdV, TdO);
+						  ri[WS(rs, 33)] = FNMS(KP995184726, TdV, TdO);
+						  ii[WS(rs, 33)] = FNMS(KP995184726, TjY, TjX);
+						  ii[WS(rs, 1)] = FMA(KP995184726, TjY, TjX);
+					     }
+					}
+					{
+					     E Tes, Tke, Tkd, Tev;
+					     {
+						  E Tec, Ter, Tkb, Tkc;
+						  Tes = FNMS(KP831469612, Teb, Te4);
+						  Tec = FMA(KP831469612, Teb, Te4);
+						  ri[WS(rs, 17)] = FMA(KP995184726, TdZ, TdW);
+						  ri[WS(rs, 49)] = FNMS(KP995184726, TdZ, TdW);
+						  ii[WS(rs, 49)] = FNMS(KP995184726, Tk0, TjZ);
+						  ii[WS(rs, 17)] = FMA(KP995184726, Tk0, TjZ);
+						  Ter = Tej + Teq;
+						  Tke = Teq - Tej;
+						  Tkd = FNMS(KP831469612, Tka, Tk9);
+						  Tkb = FMA(KP831469612, Tka, Tk9);
+						  Tkc = Tet + Teu;
+						  Tev = Tet - Teu;
+						  ri[WS(rs, 5)] = FMA(KP881921264, Ter, Tec);
+						  ri[WS(rs, 37)] = FNMS(KP881921264, Ter, Tec);
+						  ii[WS(rs, 37)] = FNMS(KP881921264, Tkc, Tkb);
+						  ii[WS(rs, 5)] = FMA(KP881921264, Tkc, Tkb);
+					     }
+					     {
+						  E TeA, TeH, Tkh, Tki;
+						  TeI = FMA(KP831469612, Tez, Tew);
+						  TeA = FNMS(KP831469612, Tez, Tew);
+						  ri[WS(rs, 21)] = FMA(KP881921264, Tev, Tes);
+						  ri[WS(rs, 53)] = FNMS(KP881921264, Tev, Tes);
+						  ii[WS(rs, 53)] = FNMS(KP881921264, Tke, Tkd);
+						  ii[WS(rs, 21)] = FMA(KP881921264, Tke, Tkd);
+						  TeH = TeD - TeG;
+						  Tkk = TeD + TeG;
+						  Tkj = FMA(KP831469612, Tkg, Tkf);
+						  Tkh = FNMS(KP831469612, Tkg, Tkf);
+						  Tki = TeK - TeJ;
+						  TeL = TeJ + TeK;
+						  ri[WS(rs, 13)] = FMA(KP956940335, TeH, TeA);
+						  ri[WS(rs, 45)] = FNMS(KP956940335, TeH, TeA);
+						  ii[WS(rs, 45)] = FNMS(KP956940335, Tki, Tkh);
+						  ii[WS(rs, 13)] = FMA(KP956940335, Tki, Tkh);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 61)] = FMA(KP956940335, TeL, TeI);
+	       ri[WS(rs, 29)] = FNMS(KP956940335, TeL, TeI);
+	       ii[WS(rs, 61)] = FMA(KP956940335, Tkk, Tkj);
+	       ii[WS(rs, 29)] = FNMS(KP956940335, Tkk, Tkj);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 64, "t1_64", twinstr, &GENUS, {520, 126, 518, 0}, 0, 0, 0 };
+
+void X(codelet_t1_64) (planner *p) {
+     X(kdft_dit_register) (p, t1_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 64 -name t1_64 -include t.h */
+
+/*
+ * This function contains 1038 FP additions, 500 FP multiplications,
+ * (or, 808 additions, 270 multiplications, 230 fused multiply/add),
+ * 176 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "t.h"
+
+static void t1_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 126); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tj, TcL, ThT, Tin, T6b, Taz, TgT, Thn, TG, Thm, TcO, TgO, T6m, ThQ, TaC;
+	       E Tim, T14, Tfq, T6y, T9O, TaG, Tc0, TcU, TeE, T1r, Tfr, T6J, T9P, TaJ, Tc1;
+	       E TcZ, TeF, T1Q, T2d, Tfx, Tfu, Tfv, Tfw, T6Q, TaM, Tdb, TeJ, T71, TaQ, T7a;
+	       E TaN, Td6, TeI, T77, TaP, T2B, T2Y, Tfz, TfA, TfB, TfC, T7h, TaW, Tdm, TeM;
+	       E T7s, TaU, T7B, TaX, Tdh, TeL, T7y, TaT, T5j, TfR, Tec, Tf0, TfY, Tgy, T8D;
+	       E Tbl, T8O, Tbx, T9l, Tbm, TdV, TeX, T9i, Tbw, T3M, TfL, TdL, TeQ, TfI, Tgt;
+	       E T7K, Tb2, T7V, Tbe, T8s, Tb3, Tdu, TeT, T8p, Tbd, T4x, TfJ, TdE, TdM, TfO;
+	       E Tgu, T87, T8v, T8i, T8u, Tba, Tbg, Tdz, TdN, Tb7, Tbh, T64, TfZ, Te5, Ted;
+	       E TfU, Tgz, T90, T9o, T9b, T9n, Tbt, Tbz, Te0, Tee, Tbq, TbA;
+	       {
+		    E T1, TgR, T6, TgQ, Tc, T68, Th, T69;
+		    T1 = ri[0];
+		    TgR = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 32)];
+			 T5 = ii[WS(rs, 32)];
+			 T2 = W[62];
+			 T4 = W[63];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TgQ = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 16)];
+			 Tb = ii[WS(rs, 16)];
+			 T8 = W[30];
+			 Ta = W[31];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T68 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 48)];
+			 Tg = ii[WS(rs, 48)];
+			 Td = W[94];
+			 Tf = W[95];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T69 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, ThR, ThS;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 + Ti;
+			 TcL = T7 - Ti;
+			 ThR = TgR - TgQ;
+			 ThS = Tc - Th;
+			 ThT = ThR - ThS;
+			 Tin = ThS + ThR;
+		    }
+		    {
+			 E T67, T6a, TgP, TgS;
+			 T67 = T1 - T6;
+			 T6a = T68 - T69;
+			 T6b = T67 - T6a;
+			 Taz = T67 + T6a;
+			 TgP = T68 + T69;
+			 TgS = TgQ + TgR;
+			 TgT = TgP + TgS;
+			 Thn = TgS - TgP;
+		    }
+	       }
+	       {
+		    E To, T6c, Tt, T6d, T6e, T6f, Tz, T6i, TE, T6j, T6h, T6k;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = ri[WS(rs, 8)];
+			 Tn = ii[WS(rs, 8)];
+			 Tk = W[14];
+			 Tm = W[15];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T6c = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = ri[WS(rs, 40)];
+			 Ts = ii[WS(rs, 40)];
+			 Tp = W[78];
+			 Tr = W[79];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T6d = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    T6e = T6c - T6d;
+		    T6f = To - Tt;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 56)];
+			 Ty = ii[WS(rs, 56)];
+			 Tv = W[110];
+			 Tx = W[111];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T6i = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 24)];
+			 TD = ii[WS(rs, 24)];
+			 TA = W[46];
+			 TC = W[47];
+			 TE = FMA(TA, TB, TC * TD);
+			 T6j = FNMS(TC, TB, TA * TD);
+		    }
+		    T6h = Tz - TE;
+		    T6k = T6i - T6j;
+		    {
+			 E Tu, TF, TcM, TcN;
+			 Tu = To + Tt;
+			 TF = Tz + TE;
+			 TG = Tu + TF;
+			 Thm = TF - Tu;
+			 TcM = T6c + T6d;
+			 TcN = T6i + T6j;
+			 TcO = TcM - TcN;
+			 TgO = TcM + TcN;
+		    }
+		    {
+			 E T6g, T6l, TaA, TaB;
+			 T6g = T6e - T6f;
+			 T6l = T6h + T6k;
+			 T6m = KP707106781 * (T6g - T6l);
+			 ThQ = KP707106781 * (T6g + T6l);
+			 TaA = T6f + T6e;
+			 TaB = T6h - T6k;
+			 TaC = KP707106781 * (TaA + TaB);
+			 Tim = KP707106781 * (TaB - TaA);
+		    }
+	       }
+	       {
+		    E TS, TcQ, T6q, T6t, T13, TcR, T6r, T6w, T6s, T6x;
+		    {
+			 E TM, T6o, TR, T6p;
+			 {
+			      E TJ, TL, TI, TK;
+			      TJ = ri[WS(rs, 4)];
+			      TL = ii[WS(rs, 4)];
+			      TI = W[6];
+			      TK = W[7];
+			      TM = FMA(TI, TJ, TK * TL);
+			      T6o = FNMS(TK, TJ, TI * TL);
+			 }
+			 {
+			      E TO, TQ, TN, TP;
+			      TO = ri[WS(rs, 36)];
+			      TQ = ii[WS(rs, 36)];
+			      TN = W[70];
+			      TP = W[71];
+			      TR = FMA(TN, TO, TP * TQ);
+			      T6p = FNMS(TP, TO, TN * TQ);
+			 }
+			 TS = TM + TR;
+			 TcQ = T6o + T6p;
+			 T6q = T6o - T6p;
+			 T6t = TM - TR;
+		    }
+		    {
+			 E TX, T6u, T12, T6v;
+			 {
+			      E TU, TW, TT, TV;
+			      TU = ri[WS(rs, 20)];
+			      TW = ii[WS(rs, 20)];
+			      TT = W[38];
+			      TV = W[39];
+			      TX = FMA(TT, TU, TV * TW);
+			      T6u = FNMS(TV, TU, TT * TW);
+			 }
+			 {
+			      E TZ, T11, TY, T10;
+			      TZ = ri[WS(rs, 52)];
+			      T11 = ii[WS(rs, 52)];
+			      TY = W[102];
+			      T10 = W[103];
+			      T12 = FMA(TY, TZ, T10 * T11);
+			      T6v = FNMS(T10, TZ, TY * T11);
+			 }
+			 T13 = TX + T12;
+			 TcR = T6u + T6v;
+			 T6r = TX - T12;
+			 T6w = T6u - T6v;
+		    }
+		    T14 = TS + T13;
+		    Tfq = TcQ + TcR;
+		    T6s = T6q + T6r;
+		    T6x = T6t - T6w;
+		    T6y = FNMS(KP923879532, T6x, KP382683432 * T6s);
+		    T9O = FMA(KP923879532, T6s, KP382683432 * T6x);
+		    {
+			 E TaE, TaF, TcS, TcT;
+			 TaE = T6q - T6r;
+			 TaF = T6t + T6w;
+			 TaG = FNMS(KP382683432, TaF, KP923879532 * TaE);
+			 Tc0 = FMA(KP382683432, TaE, KP923879532 * TaF);
+			 TcS = TcQ - TcR;
+			 TcT = TS - T13;
+			 TcU = TcS - TcT;
+			 TeE = TcT + TcS;
+		    }
+	       }
+	       {
+		    E T1f, TcW, T6B, T6E, T1q, TcX, T6C, T6H, T6D, T6I;
+		    {
+			 E T19, T6z, T1e, T6A;
+			 {
+			      E T16, T18, T15, T17;
+			      T16 = ri[WS(rs, 60)];
+			      T18 = ii[WS(rs, 60)];
+			      T15 = W[118];
+			      T17 = W[119];
+			      T19 = FMA(T15, T16, T17 * T18);
+			      T6z = FNMS(T17, T16, T15 * T18);
+			 }
+			 {
+			      E T1b, T1d, T1a, T1c;
+			      T1b = ri[WS(rs, 28)];
+			      T1d = ii[WS(rs, 28)];
+			      T1a = W[54];
+			      T1c = W[55];
+			      T1e = FMA(T1a, T1b, T1c * T1d);
+			      T6A = FNMS(T1c, T1b, T1a * T1d);
+			 }
+			 T1f = T19 + T1e;
+			 TcW = T6z + T6A;
+			 T6B = T6z - T6A;
+			 T6E = T19 - T1e;
+		    }
+		    {
+			 E T1k, T6F, T1p, T6G;
+			 {
+			      E T1h, T1j, T1g, T1i;
+			      T1h = ri[WS(rs, 12)];
+			      T1j = ii[WS(rs, 12)];
+			      T1g = W[22];
+			      T1i = W[23];
+			      T1k = FMA(T1g, T1h, T1i * T1j);
+			      T6F = FNMS(T1i, T1h, T1g * T1j);
+			 }
+			 {
+			      E T1m, T1o, T1l, T1n;
+			      T1m = ri[WS(rs, 44)];
+			      T1o = ii[WS(rs, 44)];
+			      T1l = W[86];
+			      T1n = W[87];
+			      T1p = FMA(T1l, T1m, T1n * T1o);
+			      T6G = FNMS(T1n, T1m, T1l * T1o);
+			 }
+			 T1q = T1k + T1p;
+			 TcX = T6F + T6G;
+			 T6C = T1k - T1p;
+			 T6H = T6F - T6G;
+		    }
+		    T1r = T1f + T1q;
+		    Tfr = TcW + TcX;
+		    T6D = T6B + T6C;
+		    T6I = T6E - T6H;
+		    T6J = FMA(KP382683432, T6D, KP923879532 * T6I);
+		    T9P = FNMS(KP923879532, T6D, KP382683432 * T6I);
+		    {
+			 E TaH, TaI, TcV, TcY;
+			 TaH = T6B - T6C;
+			 TaI = T6E + T6H;
+			 TaJ = FMA(KP923879532, TaH, KP382683432 * TaI);
+			 Tc1 = FNMS(KP382683432, TaH, KP923879532 * TaI);
+			 TcV = T1f - T1q;
+			 TcY = TcW - TcX;
+			 TcZ = TcV + TcY;
+			 TeF = TcV - TcY;
+		    }
+	       }
+	       {
+		    E T1y, T6M, T1D, T6N, T1E, Td2, T1J, T74, T1O, T75, T1P, Td3, T21, Td8, T6W;
+		    E T6Z, T2c, Td9, T6R, T6U;
+		    {
+			 E T1v, T1x, T1u, T1w;
+			 T1v = ri[WS(rs, 2)];
+			 T1x = ii[WS(rs, 2)];
+			 T1u = W[2];
+			 T1w = W[3];
+			 T1y = FMA(T1u, T1v, T1w * T1x);
+			 T6M = FNMS(T1w, T1v, T1u * T1x);
+		    }
+		    {
+			 E T1A, T1C, T1z, T1B;
+			 T1A = ri[WS(rs, 34)];
+			 T1C = ii[WS(rs, 34)];
+			 T1z = W[66];
+			 T1B = W[67];
+			 T1D = FMA(T1z, T1A, T1B * T1C);
+			 T6N = FNMS(T1B, T1A, T1z * T1C);
+		    }
+		    T1E = T1y + T1D;
+		    Td2 = T6M + T6N;
+		    {
+			 E T1G, T1I, T1F, T1H;
+			 T1G = ri[WS(rs, 18)];
+			 T1I = ii[WS(rs, 18)];
+			 T1F = W[34];
+			 T1H = W[35];
+			 T1J = FMA(T1F, T1G, T1H * T1I);
+			 T74 = FNMS(T1H, T1G, T1F * T1I);
+		    }
+		    {
+			 E T1L, T1N, T1K, T1M;
+			 T1L = ri[WS(rs, 50)];
+			 T1N = ii[WS(rs, 50)];
+			 T1K = W[98];
+			 T1M = W[99];
+			 T1O = FMA(T1K, T1L, T1M * T1N);
+			 T75 = FNMS(T1M, T1L, T1K * T1N);
+		    }
+		    T1P = T1J + T1O;
+		    Td3 = T74 + T75;
+		    {
+			 E T1V, T6X, T20, T6Y;
+			 {
+			      E T1S, T1U, T1R, T1T;
+			      T1S = ri[WS(rs, 10)];
+			      T1U = ii[WS(rs, 10)];
+			      T1R = W[18];
+			      T1T = W[19];
+			      T1V = FMA(T1R, T1S, T1T * T1U);
+			      T6X = FNMS(T1T, T1S, T1R * T1U);
+			 }
+			 {
+			      E T1X, T1Z, T1W, T1Y;
+			      T1X = ri[WS(rs, 42)];
+			      T1Z = ii[WS(rs, 42)];
+			      T1W = W[82];
+			      T1Y = W[83];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T6Y = FNMS(T1Y, T1X, T1W * T1Z);
+			 }
+			 T21 = T1V + T20;
+			 Td8 = T6X + T6Y;
+			 T6W = T1V - T20;
+			 T6Z = T6X - T6Y;
+		    }
+		    {
+			 E T26, T6S, T2b, T6T;
+			 {
+			      E T23, T25, T22, T24;
+			      T23 = ri[WS(rs, 58)];
+			      T25 = ii[WS(rs, 58)];
+			      T22 = W[114];
+			      T24 = W[115];
+			      T26 = FMA(T22, T23, T24 * T25);
+			      T6S = FNMS(T24, T23, T22 * T25);
+			 }
+			 {
+			      E T28, T2a, T27, T29;
+			      T28 = ri[WS(rs, 26)];
+			      T2a = ii[WS(rs, 26)];
+			      T27 = W[50];
+			      T29 = W[51];
+			      T2b = FMA(T27, T28, T29 * T2a);
+			      T6T = FNMS(T29, T28, T27 * T2a);
+			 }
+			 T2c = T26 + T2b;
+			 Td9 = T6S + T6T;
+			 T6R = T26 - T2b;
+			 T6U = T6S - T6T;
+		    }
+		    T1Q = T1E + T1P;
+		    T2d = T21 + T2c;
+		    Tfx = T1Q - T2d;
+		    Tfu = Td2 + Td3;
+		    Tfv = Td8 + Td9;
+		    Tfw = Tfu - Tfv;
+		    {
+			 E T6O, T6P, Td7, Tda;
+			 T6O = T6M - T6N;
+			 T6P = T1J - T1O;
+			 T6Q = T6O + T6P;
+			 TaM = T6O - T6P;
+			 Td7 = T1E - T1P;
+			 Tda = Td8 - Td9;
+			 Tdb = Td7 - Tda;
+			 TeJ = Td7 + Tda;
+		    }
+		    {
+			 E T6V, T70, T78, T79;
+			 T6V = T6R - T6U;
+			 T70 = T6W + T6Z;
+			 T71 = KP707106781 * (T6V - T70);
+			 TaQ = KP707106781 * (T70 + T6V);
+			 T78 = T6Z - T6W;
+			 T79 = T6R + T6U;
+			 T7a = KP707106781 * (T78 - T79);
+			 TaN = KP707106781 * (T78 + T79);
+		    }
+		    {
+			 E Td4, Td5, T73, T76;
+			 Td4 = Td2 - Td3;
+			 Td5 = T2c - T21;
+			 Td6 = Td4 - Td5;
+			 TeI = Td4 + Td5;
+			 T73 = T1y - T1D;
+			 T76 = T74 - T75;
+			 T77 = T73 - T76;
+			 TaP = T73 + T76;
+		    }
+	       }
+	       {
+		    E T2j, T7d, T2o, T7e, T2p, Tdd, T2u, T7v, T2z, T7w, T2A, Tde, T2M, Tdj, T7n;
+		    E T7q, T2X, Tdk, T7i, T7l;
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = ri[WS(rs, 62)];
+			 T2i = ii[WS(rs, 62)];
+			 T2f = W[122];
+			 T2h = W[123];
+			 T2j = FMA(T2f, T2g, T2h * T2i);
+			 T7d = FNMS(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2l, T2n, T2k, T2m;
+			 T2l = ri[WS(rs, 30)];
+			 T2n = ii[WS(rs, 30)];
+			 T2k = W[58];
+			 T2m = W[59];
+			 T2o = FMA(T2k, T2l, T2m * T2n);
+			 T7e = FNMS(T2m, T2l, T2k * T2n);
+		    }
+		    T2p = T2j + T2o;
+		    Tdd = T7d + T7e;
+		    {
+			 E T2r, T2t, T2q, T2s;
+			 T2r = ri[WS(rs, 14)];
+			 T2t = ii[WS(rs, 14)];
+			 T2q = W[26];
+			 T2s = W[27];
+			 T2u = FMA(T2q, T2r, T2s * T2t);
+			 T7v = FNMS(T2s, T2r, T2q * T2t);
+		    }
+		    {
+			 E T2w, T2y, T2v, T2x;
+			 T2w = ri[WS(rs, 46)];
+			 T2y = ii[WS(rs, 46)];
+			 T2v = W[90];
+			 T2x = W[91];
+			 T2z = FMA(T2v, T2w, T2x * T2y);
+			 T7w = FNMS(T2x, T2w, T2v * T2y);
+		    }
+		    T2A = T2u + T2z;
+		    Tde = T7v + T7w;
+		    {
+			 E T2G, T7o, T2L, T7p;
+			 {
+			      E T2D, T2F, T2C, T2E;
+			      T2D = ri[WS(rs, 6)];
+			      T2F = ii[WS(rs, 6)];
+			      T2C = W[10];
+			      T2E = W[11];
+			      T2G = FMA(T2C, T2D, T2E * T2F);
+			      T7o = FNMS(T2E, T2D, T2C * T2F);
+			 }
+			 {
+			      E T2I, T2K, T2H, T2J;
+			      T2I = ri[WS(rs, 38)];
+			      T2K = ii[WS(rs, 38)];
+			      T2H = W[74];
+			      T2J = W[75];
+			      T2L = FMA(T2H, T2I, T2J * T2K);
+			      T7p = FNMS(T2J, T2I, T2H * T2K);
+			 }
+			 T2M = T2G + T2L;
+			 Tdj = T7o + T7p;
+			 T7n = T2G - T2L;
+			 T7q = T7o - T7p;
+		    }
+		    {
+			 E T2R, T7j, T2W, T7k;
+			 {
+			      E T2O, T2Q, T2N, T2P;
+			      T2O = ri[WS(rs, 54)];
+			      T2Q = ii[WS(rs, 54)];
+			      T2N = W[106];
+			      T2P = W[107];
+			      T2R = FMA(T2N, T2O, T2P * T2Q);
+			      T7j = FNMS(T2P, T2O, T2N * T2Q);
+			 }
+			 {
+			      E T2T, T2V, T2S, T2U;
+			      T2T = ri[WS(rs, 22)];
+			      T2V = ii[WS(rs, 22)];
+			      T2S = W[42];
+			      T2U = W[43];
+			      T2W = FMA(T2S, T2T, T2U * T2V);
+			      T7k = FNMS(T2U, T2T, T2S * T2V);
+			 }
+			 T2X = T2R + T2W;
+			 Tdk = T7j + T7k;
+			 T7i = T2R - T2W;
+			 T7l = T7j - T7k;
+		    }
+		    T2B = T2p + T2A;
+		    T2Y = T2M + T2X;
+		    Tfz = T2B - T2Y;
+		    TfA = Tdd + Tde;
+		    TfB = Tdj + Tdk;
+		    TfC = TfA - TfB;
+		    {
+			 E T7f, T7g, Tdi, Tdl;
+			 T7f = T7d - T7e;
+			 T7g = T2u - T2z;
+			 T7h = T7f + T7g;
+			 TaW = T7f - T7g;
+			 Tdi = T2p - T2A;
+			 Tdl = Tdj - Tdk;
+			 Tdm = Tdi - Tdl;
+			 TeM = Tdi + Tdl;
+		    }
+		    {
+			 E T7m, T7r, T7z, T7A;
+			 T7m = T7i - T7l;
+			 T7r = T7n + T7q;
+			 T7s = KP707106781 * (T7m - T7r);
+			 TaU = KP707106781 * (T7r + T7m);
+			 T7z = T7q - T7n;
+			 T7A = T7i + T7l;
+			 T7B = KP707106781 * (T7z - T7A);
+			 TaX = KP707106781 * (T7z + T7A);
+		    }
+		    {
+			 E Tdf, Tdg, T7u, T7x;
+			 Tdf = Tdd - Tde;
+			 Tdg = T2X - T2M;
+			 Tdh = Tdf - Tdg;
+			 TeL = Tdf + Tdg;
+			 T7u = T2j - T2o;
+			 T7x = T7v - T7w;
+			 T7y = T7u - T7x;
+			 TaT = T7u + T7x;
+		    }
+	       }
+	       {
+		    E T4D, T9e, T4I, T9f, T4J, Te8, T4O, T8A, T4T, T8B, T4U, Te9, T56, TdS, T8G;
+		    E T8H, T5h, TdT, T8J, T8M;
+		    {
+			 E T4A, T4C, T4z, T4B;
+			 T4A = ri[WS(rs, 63)];
+			 T4C = ii[WS(rs, 63)];
+			 T4z = W[124];
+			 T4B = W[125];
+			 T4D = FMA(T4z, T4A, T4B * T4C);
+			 T9e = FNMS(T4B, T4A, T4z * T4C);
+		    }
+		    {
+			 E T4F, T4H, T4E, T4G;
+			 T4F = ri[WS(rs, 31)];
+			 T4H = ii[WS(rs, 31)];
+			 T4E = W[60];
+			 T4G = W[61];
+			 T4I = FMA(T4E, T4F, T4G * T4H);
+			 T9f = FNMS(T4G, T4F, T4E * T4H);
+		    }
+		    T4J = T4D + T4I;
+		    Te8 = T9e + T9f;
+		    {
+			 E T4L, T4N, T4K, T4M;
+			 T4L = ri[WS(rs, 15)];
+			 T4N = ii[WS(rs, 15)];
+			 T4K = W[28];
+			 T4M = W[29];
+			 T4O = FMA(T4K, T4L, T4M * T4N);
+			 T8A = FNMS(T4M, T4L, T4K * T4N);
+		    }
+		    {
+			 E T4Q, T4S, T4P, T4R;
+			 T4Q = ri[WS(rs, 47)];
+			 T4S = ii[WS(rs, 47)];
+			 T4P = W[92];
+			 T4R = W[93];
+			 T4T = FMA(T4P, T4Q, T4R * T4S);
+			 T8B = FNMS(T4R, T4Q, T4P * T4S);
+		    }
+		    T4U = T4O + T4T;
+		    Te9 = T8A + T8B;
+		    {
+			 E T50, T8E, T55, T8F;
+			 {
+			      E T4X, T4Z, T4W, T4Y;
+			      T4X = ri[WS(rs, 7)];
+			      T4Z = ii[WS(rs, 7)];
+			      T4W = W[12];
+			      T4Y = W[13];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T8E = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 {
+			      E T52, T54, T51, T53;
+			      T52 = ri[WS(rs, 39)];
+			      T54 = ii[WS(rs, 39)];
+			      T51 = W[76];
+			      T53 = W[77];
+			      T55 = FMA(T51, T52, T53 * T54);
+			      T8F = FNMS(T53, T52, T51 * T54);
+			 }
+			 T56 = T50 + T55;
+			 TdS = T8E + T8F;
+			 T8G = T8E - T8F;
+			 T8H = T50 - T55;
+		    }
+		    {
+			 E T5b, T8K, T5g, T8L;
+			 {
+			      E T58, T5a, T57, T59;
+			      T58 = ri[WS(rs, 55)];
+			      T5a = ii[WS(rs, 55)];
+			      T57 = W[108];
+			      T59 = W[109];
+			      T5b = FMA(T57, T58, T59 * T5a);
+			      T8K = FNMS(T59, T58, T57 * T5a);
+			 }
+			 {
+			      E T5d, T5f, T5c, T5e;
+			      T5d = ri[WS(rs, 23)];
+			      T5f = ii[WS(rs, 23)];
+			      T5c = W[44];
+			      T5e = W[45];
+			      T5g = FMA(T5c, T5d, T5e * T5f);
+			      T8L = FNMS(T5e, T5d, T5c * T5f);
+			 }
+			 T5h = T5b + T5g;
+			 TdT = T8K + T8L;
+			 T8J = T5b - T5g;
+			 T8M = T8K - T8L;
+		    }
+		    {
+			 E T4V, T5i, Tea, Teb;
+			 T4V = T4J + T4U;
+			 T5i = T56 + T5h;
+			 T5j = T4V + T5i;
+			 TfR = T4V - T5i;
+			 Tea = Te8 - Te9;
+			 Teb = T5h - T56;
+			 Tec = Tea - Teb;
+			 Tf0 = Tea + Teb;
+		    }
+		    {
+			 E TfW, TfX, T8z, T8C;
+			 TfW = Te8 + Te9;
+			 TfX = TdS + TdT;
+			 TfY = TfW - TfX;
+			 Tgy = TfW + TfX;
+			 T8z = T4D - T4I;
+			 T8C = T8A - T8B;
+			 T8D = T8z - T8C;
+			 Tbl = T8z + T8C;
+		    }
+		    {
+			 E T8I, T8N, T9j, T9k;
+			 T8I = T8G - T8H;
+			 T8N = T8J + T8M;
+			 T8O = KP707106781 * (T8I - T8N);
+			 Tbx = KP707106781 * (T8I + T8N);
+			 T9j = T8J - T8M;
+			 T9k = T8H + T8G;
+			 T9l = KP707106781 * (T9j - T9k);
+			 Tbm = KP707106781 * (T9k + T9j);
+		    }
+		    {
+			 E TdR, TdU, T9g, T9h;
+			 TdR = T4J - T4U;
+			 TdU = TdS - TdT;
+			 TdV = TdR - TdU;
+			 TeX = TdR + TdU;
+			 T9g = T9e - T9f;
+			 T9h = T4O - T4T;
+			 T9i = T9g + T9h;
+			 Tbw = T9g - T9h;
+		    }
+	       }
+	       {
+		    E T36, T7G, T3b, T7H, T3c, Tdq, T3h, T8m, T3m, T8n, T3n, Tdr, T3z, TdI, T7Q;
+		    E T7T, T3K, TdJ, T7L, T7O;
+		    {
+			 E T33, T35, T32, T34;
+			 T33 = ri[WS(rs, 1)];
+			 T35 = ii[WS(rs, 1)];
+			 T32 = W[0];
+			 T34 = W[1];
+			 T36 = FMA(T32, T33, T34 * T35);
+			 T7G = FNMS(T34, T33, T32 * T35);
+		    }
+		    {
+			 E T38, T3a, T37, T39;
+			 T38 = ri[WS(rs, 33)];
+			 T3a = ii[WS(rs, 33)];
+			 T37 = W[64];
+			 T39 = W[65];
+			 T3b = FMA(T37, T38, T39 * T3a);
+			 T7H = FNMS(T39, T38, T37 * T3a);
+		    }
+		    T3c = T36 + T3b;
+		    Tdq = T7G + T7H;
+		    {
+			 E T3e, T3g, T3d, T3f;
+			 T3e = ri[WS(rs, 17)];
+			 T3g = ii[WS(rs, 17)];
+			 T3d = W[32];
+			 T3f = W[33];
+			 T3h = FMA(T3d, T3e, T3f * T3g);
+			 T8m = FNMS(T3f, T3e, T3d * T3g);
+		    }
+		    {
+			 E T3j, T3l, T3i, T3k;
+			 T3j = ri[WS(rs, 49)];
+			 T3l = ii[WS(rs, 49)];
+			 T3i = W[96];
+			 T3k = W[97];
+			 T3m = FMA(T3i, T3j, T3k * T3l);
+			 T8n = FNMS(T3k, T3j, T3i * T3l);
+		    }
+		    T3n = T3h + T3m;
+		    Tdr = T8m + T8n;
+		    {
+			 E T3t, T7R, T3y, T7S;
+			 {
+			      E T3q, T3s, T3p, T3r;
+			      T3q = ri[WS(rs, 9)];
+			      T3s = ii[WS(rs, 9)];
+			      T3p = W[16];
+			      T3r = W[17];
+			      T3t = FMA(T3p, T3q, T3r * T3s);
+			      T7R = FNMS(T3r, T3q, T3p * T3s);
+			 }
+			 {
+			      E T3v, T3x, T3u, T3w;
+			      T3v = ri[WS(rs, 41)];
+			      T3x = ii[WS(rs, 41)];
+			      T3u = W[80];
+			      T3w = W[81];
+			      T3y = FMA(T3u, T3v, T3w * T3x);
+			      T7S = FNMS(T3w, T3v, T3u * T3x);
+			 }
+			 T3z = T3t + T3y;
+			 TdI = T7R + T7S;
+			 T7Q = T3t - T3y;
+			 T7T = T7R - T7S;
+		    }
+		    {
+			 E T3E, T7M, T3J, T7N;
+			 {
+			      E T3B, T3D, T3A, T3C;
+			      T3B = ri[WS(rs, 57)];
+			      T3D = ii[WS(rs, 57)];
+			      T3A = W[112];
+			      T3C = W[113];
+			      T3E = FMA(T3A, T3B, T3C * T3D);
+			      T7M = FNMS(T3C, T3B, T3A * T3D);
+			 }
+			 {
+			      E T3G, T3I, T3F, T3H;
+			      T3G = ri[WS(rs, 25)];
+			      T3I = ii[WS(rs, 25)];
+			      T3F = W[48];
+			      T3H = W[49];
+			      T3J = FMA(T3F, T3G, T3H * T3I);
+			      T7N = FNMS(T3H, T3G, T3F * T3I);
+			 }
+			 T3K = T3E + T3J;
+			 TdJ = T7M + T7N;
+			 T7L = T3E - T3J;
+			 T7O = T7M - T7N;
+		    }
+		    {
+			 E T3o, T3L, TdH, TdK;
+			 T3o = T3c + T3n;
+			 T3L = T3z + T3K;
+			 T3M = T3o + T3L;
+			 TfL = T3o - T3L;
+			 TdH = T3c - T3n;
+			 TdK = TdI - TdJ;
+			 TdL = TdH - TdK;
+			 TeQ = TdH + TdK;
+		    }
+		    {
+			 E TfG, TfH, T7I, T7J;
+			 TfG = Tdq + Tdr;
+			 TfH = TdI + TdJ;
+			 TfI = TfG - TfH;
+			 Tgt = TfG + TfH;
+			 T7I = T7G - T7H;
+			 T7J = T3h - T3m;
+			 T7K = T7I + T7J;
+			 Tb2 = T7I - T7J;
+		    }
+		    {
+			 E T7P, T7U, T8q, T8r;
+			 T7P = T7L - T7O;
+			 T7U = T7Q + T7T;
+			 T7V = KP707106781 * (T7P - T7U);
+			 Tbe = KP707106781 * (T7U + T7P);
+			 T8q = T7T - T7Q;
+			 T8r = T7L + T7O;
+			 T8s = KP707106781 * (T8q - T8r);
+			 Tb3 = KP707106781 * (T8q + T8r);
+		    }
+		    {
+			 E Tds, Tdt, T8l, T8o;
+			 Tds = Tdq - Tdr;
+			 Tdt = T3K - T3z;
+			 Tdu = Tds - Tdt;
+			 TeT = Tds + Tdt;
+			 T8l = T36 - T3b;
+			 T8o = T8m - T8n;
+			 T8p = T8l - T8o;
+			 Tbd = T8l + T8o;
+		    }
+	       }
+	       {
+		    E T3X, TdB, T8a, T8d, T4v, Tdx, T80, T85, T48, TdC, T8b, T8g, T4k, Tdw, T7X;
+		    E T84;
+		    {
+			 E T3R, T88, T3W, T89;
+			 {
+			      E T3O, T3Q, T3N, T3P;
+			      T3O = ri[WS(rs, 5)];
+			      T3Q = ii[WS(rs, 5)];
+			      T3N = W[8];
+			      T3P = W[9];
+			      T3R = FMA(T3N, T3O, T3P * T3Q);
+			      T88 = FNMS(T3P, T3O, T3N * T3Q);
+			 }
+			 {
+			      E T3T, T3V, T3S, T3U;
+			      T3T = ri[WS(rs, 37)];
+			      T3V = ii[WS(rs, 37)];
+			      T3S = W[72];
+			      T3U = W[73];
+			      T3W = FMA(T3S, T3T, T3U * T3V);
+			      T89 = FNMS(T3U, T3T, T3S * T3V);
+			 }
+			 T3X = T3R + T3W;
+			 TdB = T88 + T89;
+			 T8a = T88 - T89;
+			 T8d = T3R - T3W;
+		    }
+		    {
+			 E T4p, T7Y, T4u, T7Z;
+			 {
+			      E T4m, T4o, T4l, T4n;
+			      T4m = ri[WS(rs, 13)];
+			      T4o = ii[WS(rs, 13)];
+			      T4l = W[24];
+			      T4n = W[25];
+			      T4p = FMA(T4l, T4m, T4n * T4o);
+			      T7Y = FNMS(T4n, T4m, T4l * T4o);
+			 }
+			 {
+			      E T4r, T4t, T4q, T4s;
+			      T4r = ri[WS(rs, 45)];
+			      T4t = ii[WS(rs, 45)];
+			      T4q = W[88];
+			      T4s = W[89];
+			      T4u = FMA(T4q, T4r, T4s * T4t);
+			      T7Z = FNMS(T4s, T4r, T4q * T4t);
+			 }
+			 T4v = T4p + T4u;
+			 Tdx = T7Y + T7Z;
+			 T80 = T7Y - T7Z;
+			 T85 = T4p - T4u;
+		    }
+		    {
+			 E T42, T8e, T47, T8f;
+			 {
+			      E T3Z, T41, T3Y, T40;
+			      T3Z = ri[WS(rs, 21)];
+			      T41 = ii[WS(rs, 21)];
+			      T3Y = W[40];
+			      T40 = W[41];
+			      T42 = FMA(T3Y, T3Z, T40 * T41);
+			      T8e = FNMS(T40, T3Z, T3Y * T41);
+			 }
+			 {
+			      E T44, T46, T43, T45;
+			      T44 = ri[WS(rs, 53)];
+			      T46 = ii[WS(rs, 53)];
+			      T43 = W[104];
+			      T45 = W[105];
+			      T47 = FMA(T43, T44, T45 * T46);
+			      T8f = FNMS(T45, T44, T43 * T46);
+			 }
+			 T48 = T42 + T47;
+			 TdC = T8e + T8f;
+			 T8b = T42 - T47;
+			 T8g = T8e - T8f;
+		    }
+		    {
+			 E T4e, T82, T4j, T83;
+			 {
+			      E T4b, T4d, T4a, T4c;
+			      T4b = ri[WS(rs, 61)];
+			      T4d = ii[WS(rs, 61)];
+			      T4a = W[120];
+			      T4c = W[121];
+			      T4e = FMA(T4a, T4b, T4c * T4d);
+			      T82 = FNMS(T4c, T4b, T4a * T4d);
+			 }
+			 {
+			      E T4g, T4i, T4f, T4h;
+			      T4g = ri[WS(rs, 29)];
+			      T4i = ii[WS(rs, 29)];
+			      T4f = W[56];
+			      T4h = W[57];
+			      T4j = FMA(T4f, T4g, T4h * T4i);
+			      T83 = FNMS(T4h, T4g, T4f * T4i);
+			 }
+			 T4k = T4e + T4j;
+			 Tdw = T82 + T83;
+			 T7X = T4e - T4j;
+			 T84 = T82 - T83;
+		    }
+		    {
+			 E T49, T4w, TdA, TdD;
+			 T49 = T3X + T48;
+			 T4w = T4k + T4v;
+			 T4x = T49 + T4w;
+			 TfJ = T4w - T49;
+			 TdA = T3X - T48;
+			 TdD = TdB - TdC;
+			 TdE = TdA + TdD;
+			 TdM = TdD - TdA;
+		    }
+		    {
+			 E TfM, TfN, T81, T86;
+			 TfM = TdB + TdC;
+			 TfN = Tdw + Tdx;
+			 TfO = TfM - TfN;
+			 Tgu = TfM + TfN;
+			 T81 = T7X - T80;
+			 T86 = T84 + T85;
+			 T87 = FNMS(KP923879532, T86, KP382683432 * T81);
+			 T8v = FMA(KP382683432, T86, KP923879532 * T81);
+		    }
+		    {
+			 E T8c, T8h, Tb8, Tb9;
+			 T8c = T8a + T8b;
+			 T8h = T8d - T8g;
+			 T8i = FMA(KP923879532, T8c, KP382683432 * T8h);
+			 T8u = FNMS(KP923879532, T8h, KP382683432 * T8c);
+			 Tb8 = T8a - T8b;
+			 Tb9 = T8d + T8g;
+			 Tba = FMA(KP382683432, Tb8, KP923879532 * Tb9);
+			 Tbg = FNMS(KP382683432, Tb9, KP923879532 * Tb8);
+		    }
+		    {
+			 E Tdv, Tdy, Tb5, Tb6;
+			 Tdv = T4k - T4v;
+			 Tdy = Tdw - Tdx;
+			 Tdz = Tdv - Tdy;
+			 TdN = Tdv + Tdy;
+			 Tb5 = T7X + T80;
+			 Tb6 = T84 - T85;
+			 Tb7 = FNMS(KP382683432, Tb6, KP923879532 * Tb5);
+			 Tbh = FMA(KP923879532, Tb6, KP382683432 * Tb5);
+		    }
+	       }
+	       {
+		    E T5u, TdW, T8S, T8V, T62, Te3, T94, T99, T5F, TdX, T8T, T8Y, T5R, Te2, T93;
+		    E T96;
+		    {
+			 E T5o, T8Q, T5t, T8R;
+			 {
+			      E T5l, T5n, T5k, T5m;
+			      T5l = ri[WS(rs, 3)];
+			      T5n = ii[WS(rs, 3)];
+			      T5k = W[4];
+			      T5m = W[5];
+			      T5o = FMA(T5k, T5l, T5m * T5n);
+			      T8Q = FNMS(T5m, T5l, T5k * T5n);
+			 }
+			 {
+			      E T5q, T5s, T5p, T5r;
+			      T5q = ri[WS(rs, 35)];
+			      T5s = ii[WS(rs, 35)];
+			      T5p = W[68];
+			      T5r = W[69];
+			      T5t = FMA(T5p, T5q, T5r * T5s);
+			      T8R = FNMS(T5r, T5q, T5p * T5s);
+			 }
+			 T5u = T5o + T5t;
+			 TdW = T8Q + T8R;
+			 T8S = T8Q - T8R;
+			 T8V = T5o - T5t;
+		    }
+		    {
+			 E T5W, T97, T61, T98;
+			 {
+			      E T5T, T5V, T5S, T5U;
+			      T5T = ri[WS(rs, 11)];
+			      T5V = ii[WS(rs, 11)];
+			      T5S = W[20];
+			      T5U = W[21];
+			      T5W = FMA(T5S, T5T, T5U * T5V);
+			      T97 = FNMS(T5U, T5T, T5S * T5V);
+			 }
+			 {
+			      E T5Y, T60, T5X, T5Z;
+			      T5Y = ri[WS(rs, 43)];
+			      T60 = ii[WS(rs, 43)];
+			      T5X = W[84];
+			      T5Z = W[85];
+			      T61 = FMA(T5X, T5Y, T5Z * T60);
+			      T98 = FNMS(T5Z, T5Y, T5X * T60);
+			 }
+			 T62 = T5W + T61;
+			 Te3 = T97 + T98;
+			 T94 = T5W - T61;
+			 T99 = T97 - T98;
+		    }
+		    {
+			 E T5z, T8W, T5E, T8X;
+			 {
+			      E T5w, T5y, T5v, T5x;
+			      T5w = ri[WS(rs, 19)];
+			      T5y = ii[WS(rs, 19)];
+			      T5v = W[36];
+			      T5x = W[37];
+			      T5z = FMA(T5v, T5w, T5x * T5y);
+			      T8W = FNMS(T5x, T5w, T5v * T5y);
+			 }
+			 {
+			      E T5B, T5D, T5A, T5C;
+			      T5B = ri[WS(rs, 51)];
+			      T5D = ii[WS(rs, 51)];
+			      T5A = W[100];
+			      T5C = W[101];
+			      T5E = FMA(T5A, T5B, T5C * T5D);
+			      T8X = FNMS(T5C, T5B, T5A * T5D);
+			 }
+			 T5F = T5z + T5E;
+			 TdX = T8W + T8X;
+			 T8T = T5z - T5E;
+			 T8Y = T8W - T8X;
+		    }
+		    {
+			 E T5L, T91, T5Q, T92;
+			 {
+			      E T5I, T5K, T5H, T5J;
+			      T5I = ri[WS(rs, 59)];
+			      T5K = ii[WS(rs, 59)];
+			      T5H = W[116];
+			      T5J = W[117];
+			      T5L = FMA(T5H, T5I, T5J * T5K);
+			      T91 = FNMS(T5J, T5I, T5H * T5K);
+			 }
+			 {
+			      E T5N, T5P, T5M, T5O;
+			      T5N = ri[WS(rs, 27)];
+			      T5P = ii[WS(rs, 27)];
+			      T5M = W[52];
+			      T5O = W[53];
+			      T5Q = FMA(T5M, T5N, T5O * T5P);
+			      T92 = FNMS(T5O, T5N, T5M * T5P);
+			 }
+			 T5R = T5L + T5Q;
+			 Te2 = T91 + T92;
+			 T93 = T91 - T92;
+			 T96 = T5L - T5Q;
+		    }
+		    {
+			 E T5G, T63, Te1, Te4;
+			 T5G = T5u + T5F;
+			 T63 = T5R + T62;
+			 T64 = T5G + T63;
+			 TfZ = T63 - T5G;
+			 Te1 = T5R - T62;
+			 Te4 = Te2 - Te3;
+			 Te5 = Te1 + Te4;
+			 Ted = Te1 - Te4;
+		    }
+		    {
+			 E TfS, TfT, T8U, T8Z;
+			 TfS = TdW + TdX;
+			 TfT = Te2 + Te3;
+			 TfU = TfS - TfT;
+			 Tgz = TfS + TfT;
+			 T8U = T8S + T8T;
+			 T8Z = T8V - T8Y;
+			 T90 = FNMS(KP923879532, T8Z, KP382683432 * T8U);
+			 T9o = FMA(KP923879532, T8U, KP382683432 * T8Z);
+		    }
+		    {
+			 E T95, T9a, Tbr, Tbs;
+			 T95 = T93 + T94;
+			 T9a = T96 - T99;
+			 T9b = FMA(KP382683432, T95, KP923879532 * T9a);
+			 T9n = FNMS(KP923879532, T95, KP382683432 * T9a);
+			 Tbr = T93 - T94;
+			 Tbs = T96 + T99;
+			 Tbt = FMA(KP923879532, Tbr, KP382683432 * Tbs);
+			 Tbz = FNMS(KP382683432, Tbr, KP923879532 * Tbs);
+		    }
+		    {
+			 E TdY, TdZ, Tbo, Tbp;
+			 TdY = TdW - TdX;
+			 TdZ = T5u - T5F;
+			 Te0 = TdY - TdZ;
+			 Tee = TdZ + TdY;
+			 Tbo = T8S - T8T;
+			 Tbp = T8V + T8Y;
+			 Tbq = FNMS(KP382683432, Tbp, KP923879532 * Tbo);
+			 TbA = FMA(KP382683432, Tbo, KP923879532 * Tbp);
+		    }
+	       }
+	       {
+		    E T1t, Tgn, TgK, TgL, TgV, Th1, T30, Th0, T66, TgX, Tgw, TgE, TgB, TgF, Tgq;
+		    E TgM;
+		    {
+			 E TH, T1s, TgI, TgJ;
+			 TH = Tj + TG;
+			 T1s = T14 + T1r;
+			 T1t = TH + T1s;
+			 Tgn = TH - T1s;
+			 TgI = Tgt + Tgu;
+			 TgJ = Tgy + Tgz;
+			 TgK = TgI - TgJ;
+			 TgL = TgI + TgJ;
+		    }
+		    {
+			 E TgN, TgU, T2e, T2Z;
+			 TgN = Tfq + Tfr;
+			 TgU = TgO + TgT;
+			 TgV = TgN + TgU;
+			 Th1 = TgU - TgN;
+			 T2e = T1Q + T2d;
+			 T2Z = T2B + T2Y;
+			 T30 = T2e + T2Z;
+			 Th0 = T2Z - T2e;
+		    }
+		    {
+			 E T4y, T65, Tgs, Tgv;
+			 T4y = T3M + T4x;
+			 T65 = T5j + T64;
+			 T66 = T4y + T65;
+			 TgX = T65 - T4y;
+			 Tgs = T3M - T4x;
+			 Tgv = Tgt - Tgu;
+			 Tgw = Tgs + Tgv;
+			 TgE = Tgv - Tgs;
+		    }
+		    {
+			 E Tgx, TgA, Tgo, Tgp;
+			 Tgx = T5j - T64;
+			 TgA = Tgy - Tgz;
+			 TgB = Tgx - TgA;
+			 TgF = Tgx + TgA;
+			 Tgo = Tfu + Tfv;
+			 Tgp = TfA + TfB;
+			 Tgq = Tgo - Tgp;
+			 TgM = Tgo + Tgp;
+		    }
+		    {
+			 E T31, TgW, TgH, TgY;
+			 T31 = T1t + T30;
+			 ri[WS(rs, 32)] = T31 - T66;
+			 ri[0] = T31 + T66;
+			 TgW = TgM + TgV;
+			 ii[0] = TgL + TgW;
+			 ii[WS(rs, 32)] = TgW - TgL;
+			 TgH = T1t - T30;
+			 ri[WS(rs, 48)] = TgH - TgK;
+			 ri[WS(rs, 16)] = TgH + TgK;
+			 TgY = TgV - TgM;
+			 ii[WS(rs, 16)] = TgX + TgY;
+			 ii[WS(rs, 48)] = TgY - TgX;
+		    }
+		    {
+			 E Tgr, TgC, TgZ, Th2;
+			 Tgr = Tgn + Tgq;
+			 TgC = KP707106781 * (Tgw + TgB);
+			 ri[WS(rs, 40)] = Tgr - TgC;
+			 ri[WS(rs, 8)] = Tgr + TgC;
+			 TgZ = KP707106781 * (TgE + TgF);
+			 Th2 = Th0 + Th1;
+			 ii[WS(rs, 8)] = TgZ + Th2;
+			 ii[WS(rs, 40)] = Th2 - TgZ;
+		    }
+		    {
+			 E TgD, TgG, Th3, Th4;
+			 TgD = Tgn - Tgq;
+			 TgG = KP707106781 * (TgE - TgF);
+			 ri[WS(rs, 56)] = TgD - TgG;
+			 ri[WS(rs, 24)] = TgD + TgG;
+			 Th3 = KP707106781 * (TgB - Tgw);
+			 Th4 = Th1 - Th0;
+			 ii[WS(rs, 24)] = Th3 + Th4;
+			 ii[WS(rs, 56)] = Th4 - Th3;
+		    }
+	       }
+	       {
+		    E Tft, Tg7, Tgh, Tgl, Th9, Thf, TfE, Th6, TfQ, Tg4, Tga, The, Tge, Tgk, Tg1;
+		    E Tg5;
+		    {
+			 E Tfp, Tfs, Tgf, Tgg;
+			 Tfp = Tj - TG;
+			 Tfs = Tfq - Tfr;
+			 Tft = Tfp - Tfs;
+			 Tg7 = Tfp + Tfs;
+			 Tgf = TfR + TfU;
+			 Tgg = TfY + TfZ;
+			 Tgh = FNMS(KP382683432, Tgg, KP923879532 * Tgf);
+			 Tgl = FMA(KP923879532, Tgg, KP382683432 * Tgf);
+		    }
+		    {
+			 E Th7, Th8, Tfy, TfD;
+			 Th7 = T1r - T14;
+			 Th8 = TgT - TgO;
+			 Th9 = Th7 + Th8;
+			 Thf = Th8 - Th7;
+			 Tfy = Tfw - Tfx;
+			 TfD = Tfz + TfC;
+			 TfE = KP707106781 * (Tfy - TfD);
+			 Th6 = KP707106781 * (Tfy + TfD);
+		    }
+		    {
+			 E TfK, TfP, Tg8, Tg9;
+			 TfK = TfI - TfJ;
+			 TfP = TfL - TfO;
+			 TfQ = FMA(KP923879532, TfK, KP382683432 * TfP);
+			 Tg4 = FNMS(KP923879532, TfP, KP382683432 * TfK);
+			 Tg8 = Tfx + Tfw;
+			 Tg9 = Tfz - TfC;
+			 Tga = KP707106781 * (Tg8 + Tg9);
+			 The = KP707106781 * (Tg9 - Tg8);
+		    }
+		    {
+			 E Tgc, Tgd, TfV, Tg0;
+			 Tgc = TfI + TfJ;
+			 Tgd = TfL + TfO;
+			 Tge = FMA(KP382683432, Tgc, KP923879532 * Tgd);
+			 Tgk = FNMS(KP382683432, Tgd, KP923879532 * Tgc);
+			 TfV = TfR - TfU;
+			 Tg0 = TfY - TfZ;
+			 Tg1 = FNMS(KP923879532, Tg0, KP382683432 * TfV);
+			 Tg5 = FMA(KP382683432, Tg0, KP923879532 * TfV);
+		    }
+		    {
+			 E TfF, Tg2, Thd, Thg;
+			 TfF = Tft + TfE;
+			 Tg2 = TfQ + Tg1;
+			 ri[WS(rs, 44)] = TfF - Tg2;
+			 ri[WS(rs, 12)] = TfF + Tg2;
+			 Thd = Tg4 + Tg5;
+			 Thg = The + Thf;
+			 ii[WS(rs, 12)] = Thd + Thg;
+			 ii[WS(rs, 44)] = Thg - Thd;
+		    }
+		    {
+			 E Tg3, Tg6, Thh, Thi;
+			 Tg3 = Tft - TfE;
+			 Tg6 = Tg4 - Tg5;
+			 ri[WS(rs, 60)] = Tg3 - Tg6;
+			 ri[WS(rs, 28)] = Tg3 + Tg6;
+			 Thh = Tg1 - TfQ;
+			 Thi = Thf - The;
+			 ii[WS(rs, 28)] = Thh + Thi;
+			 ii[WS(rs, 60)] = Thi - Thh;
+		    }
+		    {
+			 E Tgb, Tgi, Th5, Tha;
+			 Tgb = Tg7 + Tga;
+			 Tgi = Tge + Tgh;
+			 ri[WS(rs, 36)] = Tgb - Tgi;
+			 ri[WS(rs, 4)] = Tgb + Tgi;
+			 Th5 = Tgk + Tgl;
+			 Tha = Th6 + Th9;
+			 ii[WS(rs, 4)] = Th5 + Tha;
+			 ii[WS(rs, 36)] = Tha - Th5;
+		    }
+		    {
+			 E Tgj, Tgm, Thb, Thc;
+			 Tgj = Tg7 - Tga;
+			 Tgm = Tgk - Tgl;
+			 ri[WS(rs, 52)] = Tgj - Tgm;
+			 ri[WS(rs, 20)] = Tgj + Tgm;
+			 Thb = Tgh - Tge;
+			 Thc = Th9 - Th6;
+			 ii[WS(rs, 20)] = Thb + Thc;
+			 ii[WS(rs, 52)] = Thc - Thb;
+		    }
+	       }
+	       {
+		    E Td1, Ten, Tdo, ThA, ThD, ThJ, Teq, ThI, Teh, TeB, Tel, Tex, TdQ, TeA, Tek;
+		    E Teu;
+		    {
+			 E TcP, Td0, Teo, Tep;
+			 TcP = TcL - TcO;
+			 Td0 = KP707106781 * (TcU - TcZ);
+			 Td1 = TcP - Td0;
+			 Ten = TcP + Td0;
+			 {
+			      E Tdc, Tdn, ThB, ThC;
+			      Tdc = FNMS(KP923879532, Tdb, KP382683432 * Td6);
+			      Tdn = FMA(KP382683432, Tdh, KP923879532 * Tdm);
+			      Tdo = Tdc - Tdn;
+			      ThA = Tdc + Tdn;
+			      ThB = KP707106781 * (TeF - TeE);
+			      ThC = Thn - Thm;
+			      ThD = ThB + ThC;
+			      ThJ = ThC - ThB;
+			 }
+			 Teo = FMA(KP923879532, Td6, KP382683432 * Tdb);
+			 Tep = FNMS(KP923879532, Tdh, KP382683432 * Tdm);
+			 Teq = Teo + Tep;
+			 ThI = Tep - Teo;
+			 {
+			      E Te7, Tev, Teg, Tew, Te6, Tef;
+			      Te6 = KP707106781 * (Te0 - Te5);
+			      Te7 = TdV - Te6;
+			      Tev = TdV + Te6;
+			      Tef = KP707106781 * (Ted - Tee);
+			      Teg = Tec - Tef;
+			      Tew = Tec + Tef;
+			      Teh = FNMS(KP980785280, Teg, KP195090322 * Te7);
+			      TeB = FMA(KP831469612, Tew, KP555570233 * Tev);
+			      Tel = FMA(KP195090322, Teg, KP980785280 * Te7);
+			      Tex = FNMS(KP555570233, Tew, KP831469612 * Tev);
+			 }
+			 {
+			      E TdG, Tes, TdP, Tet, TdF, TdO;
+			      TdF = KP707106781 * (Tdz - TdE);
+			      TdG = Tdu - TdF;
+			      Tes = Tdu + TdF;
+			      TdO = KP707106781 * (TdM - TdN);
+			      TdP = TdL - TdO;
+			      Tet = TdL + TdO;
+			      TdQ = FMA(KP980785280, TdG, KP195090322 * TdP);
+			      TeA = FNMS(KP555570233, Tet, KP831469612 * Tes);
+			      Tek = FNMS(KP980785280, TdP, KP195090322 * TdG);
+			      Teu = FMA(KP555570233, Tes, KP831469612 * Tet);
+			 }
+		    }
+		    {
+			 E Tdp, Tei, ThH, ThK;
+			 Tdp = Td1 + Tdo;
+			 Tei = TdQ + Teh;
+			 ri[WS(rs, 46)] = Tdp - Tei;
+			 ri[WS(rs, 14)] = Tdp + Tei;
+			 ThH = Tek + Tel;
+			 ThK = ThI + ThJ;
+			 ii[WS(rs, 14)] = ThH + ThK;
+			 ii[WS(rs, 46)] = ThK - ThH;
+		    }
+		    {
+			 E Tej, Tem, ThL, ThM;
+			 Tej = Td1 - Tdo;
+			 Tem = Tek - Tel;
+			 ri[WS(rs, 62)] = Tej - Tem;
+			 ri[WS(rs, 30)] = Tej + Tem;
+			 ThL = Teh - TdQ;
+			 ThM = ThJ - ThI;
+			 ii[WS(rs, 30)] = ThL + ThM;
+			 ii[WS(rs, 62)] = ThM - ThL;
+		    }
+		    {
+			 E Ter, Tey, Thz, ThE;
+			 Ter = Ten + Teq;
+			 Tey = Teu + Tex;
+			 ri[WS(rs, 38)] = Ter - Tey;
+			 ri[WS(rs, 6)] = Ter + Tey;
+			 Thz = TeA + TeB;
+			 ThE = ThA + ThD;
+			 ii[WS(rs, 6)] = Thz + ThE;
+			 ii[WS(rs, 38)] = ThE - Thz;
+		    }
+		    {
+			 E Tez, TeC, ThF, ThG;
+			 Tez = Ten - Teq;
+			 TeC = TeA - TeB;
+			 ri[WS(rs, 54)] = Tez - TeC;
+			 ri[WS(rs, 22)] = Tez + TeC;
+			 ThF = Tex - Teu;
+			 ThG = ThD - ThA;
+			 ii[WS(rs, 22)] = ThF + ThG;
+			 ii[WS(rs, 54)] = ThG - ThF;
+		    }
+	       }
+	       {
+		    E TeH, Tf9, TeO, Thk, Thp, Thv, Tfc, Thu, Tf3, Tfn, Tf7, Tfj, TeW, Tfm, Tf6;
+		    E Tfg;
+		    {
+			 E TeD, TeG, Tfa, Tfb;
+			 TeD = TcL + TcO;
+			 TeG = KP707106781 * (TeE + TeF);
+			 TeH = TeD - TeG;
+			 Tf9 = TeD + TeG;
+			 {
+			      E TeK, TeN, Thl, Tho;
+			      TeK = FNMS(KP382683432, TeJ, KP923879532 * TeI);
+			      TeN = FMA(KP923879532, TeL, KP382683432 * TeM);
+			      TeO = TeK - TeN;
+			      Thk = TeK + TeN;
+			      Thl = KP707106781 * (TcU + TcZ);
+			      Tho = Thm + Thn;
+			      Thp = Thl + Tho;
+			      Thv = Tho - Thl;
+			 }
+			 Tfa = FMA(KP382683432, TeI, KP923879532 * TeJ);
+			 Tfb = FNMS(KP382683432, TeL, KP923879532 * TeM);
+			 Tfc = Tfa + Tfb;
+			 Thu = Tfb - Tfa;
+			 {
+			      E TeZ, Tfh, Tf2, Tfi, TeY, Tf1;
+			      TeY = KP707106781 * (Tee + Ted);
+			      TeZ = TeX - TeY;
+			      Tfh = TeX + TeY;
+			      Tf1 = KP707106781 * (Te0 + Te5);
+			      Tf2 = Tf0 - Tf1;
+			      Tfi = Tf0 + Tf1;
+			      Tf3 = FNMS(KP831469612, Tf2, KP555570233 * TeZ);
+			      Tfn = FMA(KP195090322, Tfh, KP980785280 * Tfi);
+			      Tf7 = FMA(KP831469612, TeZ, KP555570233 * Tf2);
+			      Tfj = FNMS(KP195090322, Tfi, KP980785280 * Tfh);
+			 }
+			 {
+			      E TeS, Tfe, TeV, Tff, TeR, TeU;
+			      TeR = KP707106781 * (TdE + Tdz);
+			      TeS = TeQ - TeR;
+			      Tfe = TeQ + TeR;
+			      TeU = KP707106781 * (TdM + TdN);
+			      TeV = TeT - TeU;
+			      Tff = TeT + TeU;
+			      TeW = FMA(KP555570233, TeS, KP831469612 * TeV);
+			      Tfm = FNMS(KP195090322, Tfe, KP980785280 * Tff);
+			      Tf6 = FNMS(KP831469612, TeS, KP555570233 * TeV);
+			      Tfg = FMA(KP980785280, Tfe, KP195090322 * Tff);
+			 }
+		    }
+		    {
+			 E TeP, Tf4, Tht, Thw;
+			 TeP = TeH + TeO;
+			 Tf4 = TeW + Tf3;
+			 ri[WS(rs, 42)] = TeP - Tf4;
+			 ri[WS(rs, 10)] = TeP + Tf4;
+			 Tht = Tf6 + Tf7;
+			 Thw = Thu + Thv;
+			 ii[WS(rs, 10)] = Tht + Thw;
+			 ii[WS(rs, 42)] = Thw - Tht;
+		    }
+		    {
+			 E Tf5, Tf8, Thx, Thy;
+			 Tf5 = TeH - TeO;
+			 Tf8 = Tf6 - Tf7;
+			 ri[WS(rs, 58)] = Tf5 - Tf8;
+			 ri[WS(rs, 26)] = Tf5 + Tf8;
+			 Thx = Tf3 - TeW;
+			 Thy = Thv - Thu;
+			 ii[WS(rs, 26)] = Thx + Thy;
+			 ii[WS(rs, 58)] = Thy - Thx;
+		    }
+		    {
+			 E Tfd, Tfk, Thj, Thq;
+			 Tfd = Tf9 + Tfc;
+			 Tfk = Tfg + Tfj;
+			 ri[WS(rs, 34)] = Tfd - Tfk;
+			 ri[WS(rs, 2)] = Tfd + Tfk;
+			 Thj = Tfm + Tfn;
+			 Thq = Thk + Thp;
+			 ii[WS(rs, 2)] = Thj + Thq;
+			 ii[WS(rs, 34)] = Thq - Thj;
+		    }
+		    {
+			 E Tfl, Tfo, Thr, Ths;
+			 Tfl = Tf9 - Tfc;
+			 Tfo = Tfm - Tfn;
+			 ri[WS(rs, 50)] = Tfl - Tfo;
+			 ri[WS(rs, 18)] = Tfl + Tfo;
+			 Thr = Tfj - Tfg;
+			 Ths = Thp - Thk;
+			 ii[WS(rs, 18)] = Thr + Ths;
+			 ii[WS(rs, 50)] = Ths - Thr;
+		    }
+	       }
+	       {
+		    E T6L, T9x, TiD, TiJ, T7E, TiI, T9A, TiA, T8y, T9K, T9u, T9E, T9r, T9L, T9v;
+		    E T9H;
+		    {
+			 E T6n, T6K, TiB, TiC;
+			 T6n = T6b - T6m;
+			 T6K = T6y - T6J;
+			 T6L = T6n - T6K;
+			 T9x = T6n + T6K;
+			 TiB = T9P - T9O;
+			 TiC = Tin - Tim;
+			 TiD = TiB + TiC;
+			 TiJ = TiC - TiB;
+		    }
+		    {
+			 E T7c, T9y, T7D, T9z;
+			 {
+			      E T72, T7b, T7t, T7C;
+			      T72 = T6Q - T71;
+			      T7b = T77 - T7a;
+			      T7c = FNMS(KP980785280, T7b, KP195090322 * T72);
+			      T9y = FMA(KP980785280, T72, KP195090322 * T7b);
+			      T7t = T7h - T7s;
+			      T7C = T7y - T7B;
+			      T7D = FMA(KP195090322, T7t, KP980785280 * T7C);
+			      T9z = FNMS(KP980785280, T7t, KP195090322 * T7C);
+			 }
+			 T7E = T7c - T7D;
+			 TiI = T9z - T9y;
+			 T9A = T9y + T9z;
+			 TiA = T7c + T7D;
+		    }
+		    {
+			 E T8k, T9C, T8x, T9D;
+			 {
+			      E T7W, T8j, T8t, T8w;
+			      T7W = T7K - T7V;
+			      T8j = T87 - T8i;
+			      T8k = T7W - T8j;
+			      T9C = T7W + T8j;
+			      T8t = T8p - T8s;
+			      T8w = T8u - T8v;
+			      T8x = T8t - T8w;
+			      T9D = T8t + T8w;
+			 }
+			 T8y = FMA(KP995184726, T8k, KP098017140 * T8x);
+			 T9K = FNMS(KP634393284, T9D, KP773010453 * T9C);
+			 T9u = FNMS(KP995184726, T8x, KP098017140 * T8k);
+			 T9E = FMA(KP634393284, T9C, KP773010453 * T9D);
+		    }
+		    {
+			 E T9d, T9F, T9q, T9G;
+			 {
+			      E T8P, T9c, T9m, T9p;
+			      T8P = T8D - T8O;
+			      T9c = T90 - T9b;
+			      T9d = T8P - T9c;
+			      T9F = T8P + T9c;
+			      T9m = T9i - T9l;
+			      T9p = T9n - T9o;
+			      T9q = T9m - T9p;
+			      T9G = T9m + T9p;
+			 }
+			 T9r = FNMS(KP995184726, T9q, KP098017140 * T9d);
+			 T9L = FMA(KP773010453, T9G, KP634393284 * T9F);
+			 T9v = FMA(KP098017140, T9q, KP995184726 * T9d);
+			 T9H = FNMS(KP634393284, T9G, KP773010453 * T9F);
+		    }
+		    {
+			 E T7F, T9s, TiH, TiK;
+			 T7F = T6L + T7E;
+			 T9s = T8y + T9r;
+			 ri[WS(rs, 47)] = T7F - T9s;
+			 ri[WS(rs, 15)] = T7F + T9s;
+			 TiH = T9u + T9v;
+			 TiK = TiI + TiJ;
+			 ii[WS(rs, 15)] = TiH + TiK;
+			 ii[WS(rs, 47)] = TiK - TiH;
+		    }
+		    {
+			 E T9t, T9w, TiL, TiM;
+			 T9t = T6L - T7E;
+			 T9w = T9u - T9v;
+			 ri[WS(rs, 63)] = T9t - T9w;
+			 ri[WS(rs, 31)] = T9t + T9w;
+			 TiL = T9r - T8y;
+			 TiM = TiJ - TiI;
+			 ii[WS(rs, 31)] = TiL + TiM;
+			 ii[WS(rs, 63)] = TiM - TiL;
+		    }
+		    {
+			 E T9B, T9I, Tiz, TiE;
+			 T9B = T9x + T9A;
+			 T9I = T9E + T9H;
+			 ri[WS(rs, 39)] = T9B - T9I;
+			 ri[WS(rs, 7)] = T9B + T9I;
+			 Tiz = T9K + T9L;
+			 TiE = TiA + TiD;
+			 ii[WS(rs, 7)] = Tiz + TiE;
+			 ii[WS(rs, 39)] = TiE - Tiz;
+		    }
+		    {
+			 E T9J, T9M, TiF, TiG;
+			 T9J = T9x - T9A;
+			 T9M = T9K - T9L;
+			 ri[WS(rs, 55)] = T9J - T9M;
+			 ri[WS(rs, 23)] = T9J + T9M;
+			 TiF = T9H - T9E;
+			 TiG = TiD - TiA;
+			 ii[WS(rs, 23)] = TiF + TiG;
+			 ii[WS(rs, 55)] = TiG - TiF;
+		    }
+	       }
+	       {
+		    E TaL, TbJ, Ti9, Tif, Tb0, Tie, TbM, Ti6, Tbk, TbW, TbG, TbQ, TbD, TbX, TbH;
+		    E TbT;
+		    {
+			 E TaD, TaK, Ti7, Ti8;
+			 TaD = Taz - TaC;
+			 TaK = TaG - TaJ;
+			 TaL = TaD - TaK;
+			 TbJ = TaD + TaK;
+			 Ti7 = Tc1 - Tc0;
+			 Ti8 = ThT - ThQ;
+			 Ti9 = Ti7 + Ti8;
+			 Tif = Ti8 - Ti7;
+		    }
+		    {
+			 E TaS, TbK, TaZ, TbL;
+			 {
+			      E TaO, TaR, TaV, TaY;
+			      TaO = TaM - TaN;
+			      TaR = TaP - TaQ;
+			      TaS = FNMS(KP831469612, TaR, KP555570233 * TaO);
+			      TbK = FMA(KP555570233, TaR, KP831469612 * TaO);
+			      TaV = TaT - TaU;
+			      TaY = TaW - TaX;
+			      TaZ = FMA(KP831469612, TaV, KP555570233 * TaY);
+			      TbL = FNMS(KP831469612, TaY, KP555570233 * TaV);
+			 }
+			 Tb0 = TaS - TaZ;
+			 Tie = TbL - TbK;
+			 TbM = TbK + TbL;
+			 Ti6 = TaS + TaZ;
+		    }
+		    {
+			 E Tbc, TbO, Tbj, TbP;
+			 {
+			      E Tb4, Tbb, Tbf, Tbi;
+			      Tb4 = Tb2 - Tb3;
+			      Tbb = Tb7 - Tba;
+			      Tbc = Tb4 - Tbb;
+			      TbO = Tb4 + Tbb;
+			      Tbf = Tbd - Tbe;
+			      Tbi = Tbg - Tbh;
+			      Tbj = Tbf - Tbi;
+			      TbP = Tbf + Tbi;
+			 }
+			 Tbk = FMA(KP956940335, Tbc, KP290284677 * Tbj);
+			 TbW = FNMS(KP471396736, TbP, KP881921264 * TbO);
+			 TbG = FNMS(KP956940335, Tbj, KP290284677 * Tbc);
+			 TbQ = FMA(KP471396736, TbO, KP881921264 * TbP);
+		    }
+		    {
+			 E Tbv, TbR, TbC, TbS;
+			 {
+			      E Tbn, Tbu, Tby, TbB;
+			      Tbn = Tbl - Tbm;
+			      Tbu = Tbq - Tbt;
+			      Tbv = Tbn - Tbu;
+			      TbR = Tbn + Tbu;
+			      Tby = Tbw - Tbx;
+			      TbB = Tbz - TbA;
+			      TbC = Tby - TbB;
+			      TbS = Tby + TbB;
+			 }
+			 TbD = FNMS(KP956940335, TbC, KP290284677 * Tbv);
+			 TbX = FMA(KP881921264, TbS, KP471396736 * TbR);
+			 TbH = FMA(KP290284677, TbC, KP956940335 * Tbv);
+			 TbT = FNMS(KP471396736, TbS, KP881921264 * TbR);
+		    }
+		    {
+			 E Tb1, TbE, Tid, Tig;
+			 Tb1 = TaL + Tb0;
+			 TbE = Tbk + TbD;
+			 ri[WS(rs, 45)] = Tb1 - TbE;
+			 ri[WS(rs, 13)] = Tb1 + TbE;
+			 Tid = TbG + TbH;
+			 Tig = Tie + Tif;
+			 ii[WS(rs, 13)] = Tid + Tig;
+			 ii[WS(rs, 45)] = Tig - Tid;
+		    }
+		    {
+			 E TbF, TbI, Tih, Tii;
+			 TbF = TaL - Tb0;
+			 TbI = TbG - TbH;
+			 ri[WS(rs, 61)] = TbF - TbI;
+			 ri[WS(rs, 29)] = TbF + TbI;
+			 Tih = TbD - Tbk;
+			 Tii = Tif - Tie;
+			 ii[WS(rs, 29)] = Tih + Tii;
+			 ii[WS(rs, 61)] = Tii - Tih;
+		    }
+		    {
+			 E TbN, TbU, Ti5, Tia;
+			 TbN = TbJ + TbM;
+			 TbU = TbQ + TbT;
+			 ri[WS(rs, 37)] = TbN - TbU;
+			 ri[WS(rs, 5)] = TbN + TbU;
+			 Ti5 = TbW + TbX;
+			 Tia = Ti6 + Ti9;
+			 ii[WS(rs, 5)] = Ti5 + Tia;
+			 ii[WS(rs, 37)] = Tia - Ti5;
+		    }
+		    {
+			 E TbV, TbY, Tib, Tic;
+			 TbV = TbJ - TbM;
+			 TbY = TbW - TbX;
+			 ri[WS(rs, 53)] = TbV - TbY;
+			 ri[WS(rs, 21)] = TbV + TbY;
+			 Tib = TbT - TbQ;
+			 Tic = Ti9 - Ti6;
+			 ii[WS(rs, 21)] = Tib + Tic;
+			 ii[WS(rs, 53)] = Tic - Tib;
+		    }
+	       }
+	       {
+		    E Tc3, Tcv, ThV, Ti1, Tca, Ti0, Tcy, ThO, Tci, TcI, Tcs, TcC, Tcp, TcJ, Tct;
+		    E TcF;
+		    {
+			 E TbZ, Tc2, ThP, ThU;
+			 TbZ = Taz + TaC;
+			 Tc2 = Tc0 + Tc1;
+			 Tc3 = TbZ - Tc2;
+			 Tcv = TbZ + Tc2;
+			 ThP = TaG + TaJ;
+			 ThU = ThQ + ThT;
+			 ThV = ThP + ThU;
+			 Ti1 = ThU - ThP;
+		    }
+		    {
+			 E Tc6, Tcw, Tc9, Tcx;
+			 {
+			      E Tc4, Tc5, Tc7, Tc8;
+			      Tc4 = TaM + TaN;
+			      Tc5 = TaP + TaQ;
+			      Tc6 = FNMS(KP195090322, Tc5, KP980785280 * Tc4);
+			      Tcw = FMA(KP980785280, Tc5, KP195090322 * Tc4);
+			      Tc7 = TaT + TaU;
+			      Tc8 = TaW + TaX;
+			      Tc9 = FMA(KP195090322, Tc7, KP980785280 * Tc8);
+			      Tcx = FNMS(KP195090322, Tc8, KP980785280 * Tc7);
+			 }
+			 Tca = Tc6 - Tc9;
+			 Ti0 = Tcx - Tcw;
+			 Tcy = Tcw + Tcx;
+			 ThO = Tc6 + Tc9;
+		    }
+		    {
+			 E Tce, TcA, Tch, TcB;
+			 {
+			      E Tcc, Tcd, Tcf, Tcg;
+			      Tcc = Tbd + Tbe;
+			      Tcd = Tba + Tb7;
+			      Tce = Tcc - Tcd;
+			      TcA = Tcc + Tcd;
+			      Tcf = Tb2 + Tb3;
+			      Tcg = Tbg + Tbh;
+			      Tch = Tcf - Tcg;
+			      TcB = Tcf + Tcg;
+			 }
+			 Tci = FMA(KP634393284, Tce, KP773010453 * Tch);
+			 TcI = FNMS(KP098017140, TcA, KP995184726 * TcB);
+			 Tcs = FNMS(KP773010453, Tce, KP634393284 * Tch);
+			 TcC = FMA(KP995184726, TcA, KP098017140 * TcB);
+		    }
+		    {
+			 E Tcl, TcD, Tco, TcE;
+			 {
+			      E Tcj, Tck, Tcm, Tcn;
+			      Tcj = Tbl + Tbm;
+			      Tck = TbA + Tbz;
+			      Tcl = Tcj - Tck;
+			      TcD = Tcj + Tck;
+			      Tcm = Tbw + Tbx;
+			      Tcn = Tbq + Tbt;
+			      Tco = Tcm - Tcn;
+			      TcE = Tcm + Tcn;
+			 }
+			 Tcp = FNMS(KP773010453, Tco, KP634393284 * Tcl);
+			 TcJ = FMA(KP098017140, TcD, KP995184726 * TcE);
+			 Tct = FMA(KP773010453, Tcl, KP634393284 * Tco);
+			 TcF = FNMS(KP098017140, TcE, KP995184726 * TcD);
+		    }
+		    {
+			 E Tcb, Tcq, ThZ, Ti2;
+			 Tcb = Tc3 + Tca;
+			 Tcq = Tci + Tcp;
+			 ri[WS(rs, 41)] = Tcb - Tcq;
+			 ri[WS(rs, 9)] = Tcb + Tcq;
+			 ThZ = Tcs + Tct;
+			 Ti2 = Ti0 + Ti1;
+			 ii[WS(rs, 9)] = ThZ + Ti2;
+			 ii[WS(rs, 41)] = Ti2 - ThZ;
+		    }
+		    {
+			 E Tcr, Tcu, Ti3, Ti4;
+			 Tcr = Tc3 - Tca;
+			 Tcu = Tcs - Tct;
+			 ri[WS(rs, 57)] = Tcr - Tcu;
+			 ri[WS(rs, 25)] = Tcr + Tcu;
+			 Ti3 = Tcp - Tci;
+			 Ti4 = Ti1 - Ti0;
+			 ii[WS(rs, 25)] = Ti3 + Ti4;
+			 ii[WS(rs, 57)] = Ti4 - Ti3;
+		    }
+		    {
+			 E Tcz, TcG, ThN, ThW;
+			 Tcz = Tcv + Tcy;
+			 TcG = TcC + TcF;
+			 ri[WS(rs, 33)] = Tcz - TcG;
+			 ri[WS(rs, 1)] = Tcz + TcG;
+			 ThN = TcI + TcJ;
+			 ThW = ThO + ThV;
+			 ii[WS(rs, 1)] = ThN + ThW;
+			 ii[WS(rs, 33)] = ThW - ThN;
+		    }
+		    {
+			 E TcH, TcK, ThX, ThY;
+			 TcH = Tcv - Tcy;
+			 TcK = TcI - TcJ;
+			 ri[WS(rs, 49)] = TcH - TcK;
+			 ri[WS(rs, 17)] = TcH + TcK;
+			 ThX = TcF - TcC;
+			 ThY = ThV - ThO;
+			 ii[WS(rs, 17)] = ThX + ThY;
+			 ii[WS(rs, 49)] = ThY - ThX;
+		    }
+	       }
+	       {
+		    E T9R, Taj, Tip, Tiv, T9Y, Tiu, Tam, Tik, Ta6, Taw, Tag, Taq, Tad, Tax, Tah;
+		    E Tat;
+		    {
+			 E T9N, T9Q, Til, Tio;
+			 T9N = T6b + T6m;
+			 T9Q = T9O + T9P;
+			 T9R = T9N - T9Q;
+			 Taj = T9N + T9Q;
+			 Til = T6y + T6J;
+			 Tio = Tim + Tin;
+			 Tip = Til + Tio;
+			 Tiv = Tio - Til;
+		    }
+		    {
+			 E T9U, Tak, T9X, Tal;
+			 {
+			      E T9S, T9T, T9V, T9W;
+			      T9S = T6Q + T71;
+			      T9T = T77 + T7a;
+			      T9U = FNMS(KP555570233, T9T, KP831469612 * T9S);
+			      Tak = FMA(KP555570233, T9S, KP831469612 * T9T);
+			      T9V = T7h + T7s;
+			      T9W = T7y + T7B;
+			      T9X = FMA(KP831469612, T9V, KP555570233 * T9W);
+			      Tal = FNMS(KP555570233, T9V, KP831469612 * T9W);
+			 }
+			 T9Y = T9U - T9X;
+			 Tiu = Tal - Tak;
+			 Tam = Tak + Tal;
+			 Tik = T9U + T9X;
+		    }
+		    {
+			 E Ta2, Tao, Ta5, Tap;
+			 {
+			      E Ta0, Ta1, Ta3, Ta4;
+			      Ta0 = T8p + T8s;
+			      Ta1 = T8i + T87;
+			      Ta2 = Ta0 - Ta1;
+			      Tao = Ta0 + Ta1;
+			      Ta3 = T7K + T7V;
+			      Ta4 = T8u + T8v;
+			      Ta5 = Ta3 - Ta4;
+			      Tap = Ta3 + Ta4;
+			 }
+			 Ta6 = FMA(KP471396736, Ta2, KP881921264 * Ta5);
+			 Taw = FNMS(KP290284677, Tao, KP956940335 * Tap);
+			 Tag = FNMS(KP881921264, Ta2, KP471396736 * Ta5);
+			 Taq = FMA(KP956940335, Tao, KP290284677 * Tap);
+		    }
+		    {
+			 E Ta9, Tar, Tac, Tas;
+			 {
+			      E Ta7, Ta8, Taa, Tab;
+			      Ta7 = T8D + T8O;
+			      Ta8 = T9o + T9n;
+			      Ta9 = Ta7 - Ta8;
+			      Tar = Ta7 + Ta8;
+			      Taa = T9i + T9l;
+			      Tab = T90 + T9b;
+			      Tac = Taa - Tab;
+			      Tas = Taa + Tab;
+			 }
+			 Tad = FNMS(KP881921264, Tac, KP471396736 * Ta9);
+			 Tax = FMA(KP290284677, Tar, KP956940335 * Tas);
+			 Tah = FMA(KP881921264, Ta9, KP471396736 * Tac);
+			 Tat = FNMS(KP290284677, Tas, KP956940335 * Tar);
+		    }
+		    {
+			 E T9Z, Tae, Tit, Tiw;
+			 T9Z = T9R + T9Y;
+			 Tae = Ta6 + Tad;
+			 ri[WS(rs, 43)] = T9Z - Tae;
+			 ri[WS(rs, 11)] = T9Z + Tae;
+			 Tit = Tag + Tah;
+			 Tiw = Tiu + Tiv;
+			 ii[WS(rs, 11)] = Tit + Tiw;
+			 ii[WS(rs, 43)] = Tiw - Tit;
+		    }
+		    {
+			 E Taf, Tai, Tix, Tiy;
+			 Taf = T9R - T9Y;
+			 Tai = Tag - Tah;
+			 ri[WS(rs, 59)] = Taf - Tai;
+			 ri[WS(rs, 27)] = Taf + Tai;
+			 Tix = Tad - Ta6;
+			 Tiy = Tiv - Tiu;
+			 ii[WS(rs, 27)] = Tix + Tiy;
+			 ii[WS(rs, 59)] = Tiy - Tix;
+		    }
+		    {
+			 E Tan, Tau, Tij, Tiq;
+			 Tan = Taj + Tam;
+			 Tau = Taq + Tat;
+			 ri[WS(rs, 35)] = Tan - Tau;
+			 ri[WS(rs, 3)] = Tan + Tau;
+			 Tij = Taw + Tax;
+			 Tiq = Tik + Tip;
+			 ii[WS(rs, 3)] = Tij + Tiq;
+			 ii[WS(rs, 35)] = Tiq - Tij;
+		    }
+		    {
+			 E Tav, Tay, Tir, Tis;
+			 Tav = Taj - Tam;
+			 Tay = Taw - Tax;
+			 ri[WS(rs, 51)] = Tav - Tay;
+			 ri[WS(rs, 19)] = Tav + Tay;
+			 Tir = Tat - Taq;
+			 Tis = Tip - Tik;
+			 ii[WS(rs, 19)] = Tir + Tis;
+			 ii[WS(rs, 51)] = Tis - Tir;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 64, "t1_64", twinstr, &GENUS, {808, 270, 230, 0}, 0, 0, 0 };
+
+void X(codelet_t1_64) (planner *p) {
+     X(kdft_dit_register) (p, t1_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:48 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include t.h */
+
+/*
+ * This function contains 72 FP additions, 66 FP multiplications,
+ * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
+ * 66 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "t.h"
+
+static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1c, T19, T1i, T18, T16, T1q, T1t, T1r, T1u, T1s;
+	       {
+		    E T1, TR, T1h, Te, Tt, Tw, T1a, TM, T1g, Tr, Tu, TS, Tz, TC, Ty;
+		    E Tv, TB;
+		    T1 = ri[0];
+		    T1c = ii[0];
+		    {
+			 E T9, Tc, TP, Ta, Tb, TO, T7;
+			 {
+			      E T3, T6, T8, TN, T4, T2, T5;
+			      T3 = ri[WS(rs, 1)];
+			      T6 = ii[WS(rs, 1)];
+			      T2 = W[0];
+			      T9 = ri[WS(rs, 6)];
+			      Tc = ii[WS(rs, 6)];
+			      T8 = W[10];
+			      TN = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[1];
+			      TP = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[11];
+			      TO = FNMS(T5, T3, TN);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 {
+			      E Tg, Tj, Th, TI, Tm, Tp, Tl, Ti, To, TQ, Td, Tf;
+			      Tg = ri[WS(rs, 2)];
+			      TQ = FNMS(Tb, T9, TP);
+			      Td = FMA(Tb, Tc, Ta);
+			      Tj = ii[WS(rs, 2)];
+			      Tf = W[2];
+			      T19 = TO + TQ;
+			      TR = TO - TQ;
+			      T1h = Td - T7;
+			      Te = T7 + Td;
+			      Th = Tf * Tg;
+			      TI = Tf * Tj;
+			      Tm = ri[WS(rs, 5)];
+			      Tp = ii[WS(rs, 5)];
+			      Tl = W[8];
+			      Ti = W[3];
+			      To = W[9];
+			      {
+				   E TJ, Tk, TL, Tq, TK, Tn, Ts;
+				   Tt = ri[WS(rs, 3)];
+				   TK = Tl * Tp;
+				   Tn = Tl * Tm;
+				   TJ = FNMS(Ti, Tg, TI);
+				   Tk = FMA(Ti, Tj, Th);
+				   TL = FNMS(To, Tm, TK);
+				   Tq = FMA(To, Tp, Tn);
+				   Tw = ii[WS(rs, 3)];
+				   Ts = W[4];
+				   T1a = TJ + TL;
+				   TM = TJ - TL;
+				   T1g = Tq - Tk;
+				   Tr = Tk + Tq;
+				   Tu = Ts * Tt;
+				   TS = Ts * Tw;
+			      }
+			      Tz = ri[WS(rs, 4)];
+			      TC = ii[WS(rs, 4)];
+			      Ty = W[6];
+			      Tv = W[5];
+			      TB = W[7];
+			 }
+		    }
+		    {
+			 E TF, TT, Tx, TV, TD, T1d, TU, TA;
+			 TF = FNMS(KP356895867, Tr, Te);
+			 TU = Ty * TC;
+			 TA = Ty * Tz;
+			 TT = FNMS(Tv, Tt, TS);
+			 Tx = FMA(Tv, Tw, Tu);
+			 TV = FNMS(TB, Tz, TU);
+			 TD = FMA(TB, TC, TA);
+			 T1d = FNMS(KP356895867, T1a, T19);
+			 {
+			      E T1b, T15, T17, TW;
+			      T17 = FNMS(KP554958132, TR, TM);
+			      T1b = TT + TV;
+			      TW = TT - TV;
+			      {
+				   E TE, T1l, T1e, T12;
+				   T1i = TD - Tx;
+				   TE = Tx + TD;
+				   T1l = FNMS(KP356895867, T19, T1b);
+				   T1e = FNMS(KP692021471, T1d, T1b);
+				   ii[0] = T19 + T1a + T1b + T1c;
+				   T12 = FMA(KP554958132, TM, TW);
+				   {
+					E TX, T1o, T1j, T14;
+					TX = FMA(KP554958132, TW, TR);
+					T1o = FMA(KP554958132, T1g, T1i);
+					T1j = FMA(KP554958132, T1i, T1h);
+					T14 = FNMS(KP356895867, TE, Tr);
+					{
+					     E TZ, TG, T1m, T1f;
+					     TZ = FNMS(KP356895867, Te, TE);
+					     TG = FNMS(KP692021471, TF, TE);
+					     ri[0] = T1 + Te + Tr + TE;
+					     T1m = FNMS(KP692021471, T1l, T1a);
+					     T1f = FNMS(KP900968867, T1e, T1c);
+					     {
+						  E T13, TY, T1p, T1k;
+						  T13 = FNMS(KP801937735, T12, TR);
+						  TY = FMA(KP801937735, TX, TM);
+						  T1p = FNMS(KP801937735, T1o, T1h);
+						  T1k = FMA(KP801937735, T1j, T1g);
+						  T15 = FNMS(KP692021471, T14, Te);
+						  {
+						       E T10, TH, T1n, T11;
+						       T10 = FNMS(KP692021471, TZ, Tr);
+						       TH = FNMS(KP900968867, TG, T1);
+						       T1n = FNMS(KP900968867, T1m, T1c);
+						       ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
+						       ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
+						       T11 = FNMS(KP900968867, T10, T1);
+						       ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
+						       ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
+						       ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
+						       ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
+						       ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
+						       ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
+						       T18 = FNMS(KP801937735, T17, TW);
+						  }
+					     }
+					}
+				   }
+			      }
+			      T16 = FNMS(KP900968867, T15, T1);
+			      T1q = FNMS(KP356895867, T1b, T1a);
+			      T1t = FNMS(KP554958132, T1h, T1g);
+			 }
+		    }
+	       }
+	       ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
+	       ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
+	       T1r = FNMS(KP692021471, T1q, T19);
+	       T1u = FNMS(KP801937735, T1t, T1i);
+	       T1s = FNMS(KP900968867, T1r, T1c);
+	       ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
+	       ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {18, 12, 54, 0}, 0, 0, 0 };
+
+void X(codelet_t1_7) (planner *p) {
+     X(kdft_dit_register) (p, t1_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include t.h */
+
+/*
+ * This function contains 72 FP additions, 60 FP multiplications,
+ * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
+ * 29 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "t.h"
+
+static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
+	       T1 = ri[0];
+	       TR = ii[0];
+	       {
+		    E T6, TA, Tb, TB;
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 1)];
+			 T5 = ii[WS(rs, 1)];
+			 T2 = W[0];
+			 T4 = W[1];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TA = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 6)];
+			 Ta = ii[WS(rs, 6)];
+			 T7 = W[10];
+			 T9 = W[11];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 TB = FNMS(T9, T8, T7 * Ta);
+		    }
+		    Tc = T6 + Tb;
+		    TS = Tb - T6;
+		    TC = TA - TB;
+		    TO = TA + TB;
+	       }
+	       {
+		    E Th, TG, Tm, TH;
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 2)];
+			 Tg = ii[WS(rs, 2)];
+			 Td = W[2];
+			 Tf = W[3];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TG = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = ri[WS(rs, 5)];
+			 Tl = ii[WS(rs, 5)];
+			 Ti = W[8];
+			 Tk = W[9];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 TH = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    Tn = Th + Tm;
+		    TT = Tm - Th;
+		    TI = TG - TH;
+		    TP = TG + TH;
+	       }
+	       {
+		    E Ts, TD, Tx, TE;
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 3)];
+			 Tr = ii[WS(rs, 3)];
+			 To = W[4];
+			 Tq = W[5];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TD = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Tu, Tw, Tt, Tv;
+			 Tu = ri[WS(rs, 4)];
+			 Tw = ii[WS(rs, 4)];
+			 Tt = W[6];
+			 Tv = W[7];
+			 Tx = FMA(Tt, Tu, Tv * Tw);
+			 TE = FNMS(Tv, Tu, Tt * Tw);
+		    }
+		    Ty = Ts + Tx;
+		    TU = Tx - Ts;
+		    TF = TD - TE;
+		    TQ = TD + TE;
+	       }
+	       ri[0] = T1 + Tc + Tn + Ty;
+	       ii[0] = TO + TP + TQ + TR;
+	       {
+		    E TJ, Tz, TX, TY;
+		    TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
+		    Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
+		    ri[WS(rs, 5)] = Tz - TJ;
+		    ri[WS(rs, 2)] = Tz + TJ;
+		    TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
+		    TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
+		    ii[WS(rs, 2)] = TX + TY;
+		    ii[WS(rs, 5)] = TY - TX;
+	       }
+	       {
+		    E TL, TK, TV, TW;
+		    TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
+		    TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
+		    ri[WS(rs, 6)] = TK - TL;
+		    ri[WS(rs, 1)] = TK + TL;
+		    TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
+		    TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
+		    ii[WS(rs, 1)] = TV + TW;
+		    ii[WS(rs, 6)] = TW - TV;
+	       }
+	       {
+		    E TN, TM, TZ, T10;
+		    TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
+		    TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
+		    ri[WS(rs, 4)] = TM - TN;
+		    ri[WS(rs, 3)] = TM + TN;
+		    TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
+		    T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
+		    ii[WS(rs, 3)] = TZ + T10;
+		    ii[WS(rs, 4)] = T10 - TZ;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {36, 24, 36, 0}, 0, 0, 0 };
+
+void X(codelet_t1_7) (planner *p) {
+     X(kdft_dit_register) (p, t1_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include t.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 61 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "t.h"
+
+static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T1g, T1f, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
+	       {
+		    E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
+		    E Tp, Tx, Tt, Tq, Tw;
+		    {
+			 E T3, T6, T2, T5;
+			 T1 = ri[0];
+			 T1m = ii[0];
+			 T3 = ri[WS(rs, 4)];
+			 T6 = ii[WS(rs, 4)];
+			 T2 = W[6];
+			 T5 = W[7];
+			 {
+			      E Ta, Td, T9, Tc;
+			      {
+				   E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
+				   Tg = ri[WS(rs, 6)];
+				   Tj = ii[WS(rs, 6)];
+				   T1k = T2 * T6;
+				   T4 = T2 * T3;
+				   Tf = W[10];
+				   Ti = W[11];
+				   T1l = FNMS(T5, T3, T1k);
+				   T7 = FMA(T5, T6, T4);
+				   TR = Tf * Tj;
+				   Th = Tf * Tg;
+				   Ta = ri[WS(rs, 2)];
+				   Td = ii[WS(rs, 2)];
+				   TS = FNMS(Ti, Tg, TR);
+				   Tk = FMA(Ti, Tj, Th);
+				   T9 = W[2];
+				   Tc = W[3];
+			      }
+			      {
+				   E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
+				   TB = ri[WS(rs, 7)];
+				   TE = ii[WS(rs, 7)];
+				   TP = T9 * Td;
+				   Tb = T9 * Ta;
+				   TA = W[12];
+				   TH = ri[WS(rs, 3)];
+				   TQ = FNMS(Tc, Ta, TP);
+				   Te = FMA(Tc, Td, Tb);
+				   T13 = TA * TE;
+				   TC = TA * TB;
+				   TK = ii[WS(rs, 3)];
+				   TG = W[4];
+				   TD = W[13];
+				   TJ = W[5];
+				   {
+					E T14, TF, T16, TL, T15, TI;
+					To = ri[WS(rs, 1)];
+					T15 = TG * TK;
+					TI = TG * TH;
+					T14 = FNMS(TD, TB, T13);
+					TF = FMA(TD, TE, TC);
+					T16 = FNMS(TJ, TH, T15);
+					TL = FMA(TJ, TK, TI);
+					Tr = ii[WS(rs, 1)];
+					Tn = W[0];
+					T17 = T14 - T16;
+					T1g = T14 + T16;
+					TM = TF + TL;
+					T12 = TF - TL;
+				   }
+				   Tu = ri[WS(rs, 5)];
+				   TW = Tn * Tr;
+				   Tp = Tn * To;
+				   Tx = ii[WS(rs, 5)];
+				   Tt = W[8];
+				   Tq = W[1];
+				   Tw = W[9];
+			      }
+			 }
+		    }
+		    {
+			 E T8, T1j, T1n, Tz, T1a, TU, Tl, T1b, T1c, T1v, T1t, T1w, T19, T1u, T1d;
+			 {
+			      E T1r, T10, TV, T1s, T11, T18;
+			      {
+				   E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
+				   T8 = T1 + T7;
+				   TO = T1 - T7;
+				   TY = Tt * Tx;
+				   Tv = Tt * Tu;
+				   TX = FNMS(Tq, To, TW);
+				   Ts = FMA(Tq, Tr, Tp);
+				   TZ = FNMS(Tw, Tu, TY);
+				   Ty = FMA(Tw, Tx, Tv);
+				   TT = TQ - TS;
+				   T1j = TQ + TS;
+				   T1n = T1l + T1m;
+				   T1r = T1m - T1l;
+				   T10 = TX - TZ;
+				   T1f = TX + TZ;
+				   Tz = Ts + Ty;
+				   TV = Ts - Ty;
+				   T1a = TO - TT;
+				   TU = TO + TT;
+				   T1s = Te - Tk;
+				   Tl = Te + Tk;
+			      }
+			      T1b = T10 - TV;
+			      T11 = TV + T10;
+			      T18 = T12 - T17;
+			      T1c = T12 + T17;
+			      T1v = T1s + T1r;
+			      T1t = T1r - T1s;
+			      T1w = T18 - T11;
+			      T19 = T11 + T18;
+			 }
+			 ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
+			 ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
+			 ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
+			 ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
+			 T1u = T1b + T1c;
+			 T1d = T1b - T1c;
+			 ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
+			 ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
+			 ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
+			 ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
+			 T1e = T8 - Tl;
+			 Tm = T8 + Tl;
+			 T1q = T1n - T1j;
+			 T1o = T1j + T1n;
+			 T1p = TM - Tz;
+			 TN = Tz + TM;
+		    }
+	       }
+	       ii[WS(rs, 2)] = T1p + T1q;
+	       ii[WS(rs, 6)] = T1q - T1p;
+	       ri[0] = Tm + TN;
+	       ri[WS(rs, 4)] = Tm - TN;
+	       T1h = T1f - T1g;
+	       T1i = T1f + T1g;
+	       ii[0] = T1i + T1o;
+	       ii[WS(rs, 4)] = T1o - T1i;
+	       ri[WS(rs, 2)] = T1e + T1h;
+	       ri[WS(rs, 6)] = T1e - T1h;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, {44, 14, 22, 0}, 0, 0, 0 };
+
+void X(codelet_t1_8) (planner *p) {
+     X(kdft_dit_register) (p, t1_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include t.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 28 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "t.h"
+
+static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
+	       E TP;
+	       {
+		    E T1, T18, T6, T17;
+		    T1 = ri[0];
+		    T18 = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 4)];
+			 T5 = ii[WS(rs, 4)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T17 = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T1e = T18 - T17;
+		    TH = T1 - T6;
+		    T19 = T17 + T18;
+	       }
+	       {
+		    E Tz, TS, TE, TT;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 7)];
+			 Ty = ii[WS(rs, 7)];
+			 Tv = W[12];
+			 Tx = W[13];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 TS = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 3)];
+			 TD = ii[WS(rs, 3)];
+			 TA = W[4];
+			 TC = W[5];
+			 TE = FMA(TA, TB, TC * TD);
+			 TT = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T13 = TS + TT;
+		    TR = Tz - TE;
+		    TU = TS - TT;
+	       }
+	       {
+		    E Tc, TI, Th, TJ;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = ri[WS(rs, 2)];
+			 Tb = ii[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TI = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = ri[WS(rs, 6)];
+			 Tg = ii[WS(rs, 6)];
+			 Td = W[10];
+			 Tf = W[11];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TJ = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T1f = Tc - Th;
+		    TK = TI - TJ;
+		    T16 = TI + TJ;
+	       }
+	       {
+		    E To, TN, Tt, TO;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = ri[WS(rs, 1)];
+			 Tn = ii[WS(rs, 1)];
+			 Tk = W[0];
+			 Tm = W[1];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 TN = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = ri[WS(rs, 5)];
+			 Ts = ii[WS(rs, 5)];
+			 Tp = W[8];
+			 Tr = W[9];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 TO = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T12 = TN + TO;
+		    TM = To - Tt;
+		    TP = TN - TO;
+	       }
+	       {
+		    E Tj, TG, T1b, T1c;
+		    Tj = T7 + Ti;
+		    TG = Tu + TF;
+		    ri[WS(rs, 4)] = Tj - TG;
+		    ri[0] = Tj + TG;
+		    {
+			 E T15, T1a, T11, T14;
+			 T15 = T12 + T13;
+			 T1a = T16 + T19;
+			 ii[0] = T15 + T1a;
+			 ii[WS(rs, 4)] = T1a - T15;
+			 T11 = T7 - Ti;
+			 T14 = T12 - T13;
+			 ri[WS(rs, 6)] = T11 - T14;
+			 ri[WS(rs, 2)] = T11 + T14;
+		    }
+		    T1b = TF - Tu;
+		    T1c = T19 - T16;
+		    ii[WS(rs, 2)] = T1b + T1c;
+		    ii[WS(rs, 6)] = T1c - T1b;
+		    {
+			 E TX, T1g, T10, T1d, TY, TZ;
+			 TX = TH - TK;
+			 T1g = T1e - T1f;
+			 TY = TP - TM;
+			 TZ = TR + TU;
+			 T10 = KP707106781 * (TY - TZ);
+			 T1d = KP707106781 * (TY + TZ);
+			 ri[WS(rs, 7)] = TX - T10;
+			 ii[WS(rs, 5)] = T1g - T1d;
+			 ri[WS(rs, 3)] = TX + T10;
+			 ii[WS(rs, 1)] = T1d + T1g;
+		    }
+		    {
+			 E TL, T1i, TW, T1h, TQ, TV;
+			 TL = TH + TK;
+			 T1i = T1f + T1e;
+			 TQ = TM + TP;
+			 TV = TR - TU;
+			 TW = KP707106781 * (TQ + TV);
+			 T1h = KP707106781 * (TV - TQ);
+			 ri[WS(rs, 5)] = TL - TW;
+			 ii[WS(rs, 7)] = T1i - T1h;
+			 ri[WS(rs, 1)] = TL + TW;
+			 ii[WS(rs, 3)] = T1h + T1i;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, {52, 18, 14, 0}, 0, 0, 0 };
+
+void X(codelet_t1_8) (planner *p) {
+     X(kdft_dit_register) (p, t1_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t1_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t1_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include t.h */
+
+/*
+ * This function contains 96 FP additions, 88 FP multiplications,
+ * (or, 24 additions, 16 multiplications, 72 fused multiply/add),
+ * 72 stack variables, 10 constants, and 36 memory accesses
+ */
+#include "t.h"
+
+static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP954188894, +0.954188894138671133499268364187245676532219158);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP363970234, +0.363970234266202361351047882776834043890471784);
+     DK(KP492403876, +0.492403876506104029683371512294761506835321626);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP777861913, +0.777861913430206160028177977318626690410586096);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T1K, T24, T1H, T23;
+	       {
+		    E T1, T1R, T1Q, T10, T1W, Te, TB, T1l, T1r, T1q, T1M, TE, T1g, Tz, T12;
+		    E TC, TH, TK, T17, TR, TG, TJ, TD;
+		    T1 = ri[0];
+		    T1R = ii[0];
+		    {
+			 E T9, Tc, TY, Ta, Tb, TX, T7;
+			 {
+			      E T3, T6, T8, TW, T4, T2, T5;
+			      T3 = ri[WS(rs, 3)];
+			      T6 = ii[WS(rs, 3)];
+			      T2 = W[4];
+			      T9 = ri[WS(rs, 6)];
+			      Tc = ii[WS(rs, 6)];
+			      T8 = W[10];
+			      TW = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[5];
+			      TY = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[11];
+			      TX = FNMS(T5, T3, TW);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 {
+			      E Th, Tk, Ti, T1n, Tn, Tq, Tp, T1i, Tx, T1j, To, Tj, TZ, Td, Tg;
+			      E TA, Tl, Ty;
+			      Th = ri[WS(rs, 1)];
+			      TZ = FNMS(Tb, T9, TY);
+			      Td = FMA(Tb, Tc, Ta);
+			      Tk = ii[WS(rs, 1)];
+			      Tg = W[0];
+			      T1Q = TX + TZ;
+			      T10 = TX - TZ;
+			      T1W = Td - T7;
+			      Te = T7 + Td;
+			      Ti = Tg * Th;
+			      T1n = Tg * Tk;
+			      {
+				   E Tt, Tw, Ts, Tv, T1h, Tu, Tm;
+				   Tt = ri[WS(rs, 7)];
+				   Tw = ii[WS(rs, 7)];
+				   Ts = W[12];
+				   Tv = W[13];
+				   Tn = ri[WS(rs, 4)];
+				   Tq = ii[WS(rs, 4)];
+				   T1h = Ts * Tw;
+				   Tu = Ts * Tt;
+				   Tm = W[6];
+				   Tp = W[7];
+				   T1i = FNMS(Tv, Tt, T1h);
+				   Tx = FMA(Tv, Tw, Tu);
+				   T1j = Tm * Tq;
+				   To = Tm * Tn;
+			      }
+			      Tj = W[1];
+			      TB = ri[WS(rs, 2)];
+			      {
+				   E T1k, Tr, T1o, T1p;
+				   T1k = FNMS(Tp, Tn, T1j);
+				   Tr = FMA(Tp, Tq, To);
+				   T1o = FNMS(Tj, Th, T1n);
+				   Tl = FMA(Tj, Tk, Ti);
+				   T1p = T1k + T1i;
+				   T1l = T1i - T1k;
+				   Ty = Tr + Tx;
+				   T1r = Tr - Tx;
+				   T1q = FNMS(KP500000000, T1p, T1o);
+				   T1M = T1o + T1p;
+				   TE = ii[WS(rs, 2)];
+			      }
+			      T1g = FNMS(KP500000000, Ty, Tl);
+			      Tz = Tl + Ty;
+			      TA = W[2];
+			      {
+				   E TN, TQ, TP, T16, TO, TM;
+				   TN = ri[WS(rs, 8)];
+				   TQ = ii[WS(rs, 8)];
+				   TM = W[14];
+				   T12 = TA * TE;
+				   TC = TA * TB;
+				   TP = W[15];
+				   T16 = TM * TQ;
+				   TO = TM * TN;
+				   TH = ri[WS(rs, 5)];
+				   TK = ii[WS(rs, 5)];
+				   T17 = FNMS(TP, TN, T16);
+				   TR = FMA(TP, TQ, TO);
+				   TG = W[8];
+				   TJ = W[9];
+			      }
+			      TD = W[3];
+			 }
+		    }
+		    {
+			 E TV, Tf, T1S, T1V, T1d, T1a, T19, T1N, TT, T1c;
+			 {
+			      E T13, TF, T15, TL, T14, TI, TS, T18;
+			      TV = FNMS(KP500000000, Te, T1);
+			      Tf = T1 + Te;
+			      T14 = TG * TK;
+			      TI = TG * TH;
+			      T13 = FNMS(TD, TB, T12);
+			      TF = FMA(TD, TE, TC);
+			      T15 = FNMS(TJ, TH, T14);
+			      TL = FMA(TJ, TK, TI);
+			      T1S = T1Q + T1R;
+			      T1V = FNMS(KP500000000, T1Q, T1R);
+			      T18 = T15 + T17;
+			      T1d = T15 - T17;
+			      TS = TL + TR;
+			      T1a = TR - TL;
+			      T19 = FNMS(KP500000000, T18, T13);
+			      T1N = T13 + T18;
+			      TT = TF + TS;
+			      T1c = FNMS(KP500000000, TS, TF);
+			 }
+			 {
+			      E T11, T1z, T1E, T1D, T21, T1X, T1I, T1C, T1Y, T1y, T20, T1u, T1U, TU;
+			      T1U = TT - Tz;
+			      TU = Tz + TT;
+			      {
+				   E T1P, T1O, T1L, T1T;
+				   T1P = T1M + T1N;
+				   T1O = T1M - T1N;
+				   T11 = FMA(KP866025403, T10, TV);
+				   T1z = FNMS(KP866025403, T10, TV);
+				   T1L = FNMS(KP500000000, TU, Tf);
+				   ri[0] = Tf + TU;
+				   T1T = FNMS(KP500000000, T1P, T1S);
+				   ii[0] = T1P + T1S;
+				   ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
+				   ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L);
+				   ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T);
+				   ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T);
+			      }
+			      {
+				   E T1B, T1m, T1w, T1f, T1s, T1A, T1b, T1e, T1x, T1t;
+				   T1E = FNMS(KP866025403, T1a, T19);
+				   T1b = FMA(KP866025403, T1a, T19);
+				   T1e = FMA(KP866025403, T1d, T1c);
+				   T1D = FNMS(KP866025403, T1d, T1c);
+				   T1B = FMA(KP866025403, T1l, T1g);
+				   T1m = FNMS(KP866025403, T1l, T1g);
+				   T21 = FNMS(KP866025403, T1W, T1V);
+				   T1X = FMA(KP866025403, T1W, T1V);
+				   T1w = FNMS(KP176326980, T1b, T1e);
+				   T1f = FMA(KP176326980, T1e, T1b);
+				   T1s = FNMS(KP866025403, T1r, T1q);
+				   T1A = FMA(KP866025403, T1r, T1q);
+				   T1x = FNMS(KP839099631, T1m, T1s);
+				   T1t = FMA(KP839099631, T1s, T1m);
+				   T1I = FNMS(KP176326980, T1A, T1B);
+				   T1C = FMA(KP176326980, T1B, T1A);
+				   T1Y = FNMS(KP777861913, T1x, T1w);
+				   T1y = FMA(KP777861913, T1x, T1w);
+				   T20 = FNMS(KP777861913, T1t, T1f);
+				   T1u = FMA(KP777861913, T1t, T1f);
+			      }
+			      {
+				   E T22, T1G, T1Z, T1F, T1J, T1v;
+				   ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X);
+				   T1v = FNMS(KP492403876, T1u, T11);
+				   ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11);
+				   T1F = FNMS(KP363970234, T1E, T1D);
+				   T1J = FMA(KP363970234, T1D, T1E);
+				   ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v);
+				   ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v);
+				   T1K = FNMS(KP954188894, T1J, T1I);
+				   T22 = FMA(KP954188894, T1J, T1I);
+				   T1G = FNMS(KP954188894, T1F, T1C);
+				   T24 = FMA(KP954188894, T1F, T1C);
+				   T1Z = FMA(KP492403876, T1Y, T1X);
+				   ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21);
+				   ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z);
+				   T1H = FNMS(KP492403876, T1G, T1z);
+				   ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z);
+				   ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z);
+				   T23 = FMA(KP492403876, T22, T21);
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H);
+	       ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H);
+	       ii[WS(rs, 8)] = FMA(KP852868531, T24, T23);
+	       ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, {24, 16, 72, 0}, 0, 0, 0 };
+
+void X(codelet_t1_9) (planner *p) {
+     X(kdft_dit_register) (p, t1_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include t.h */
+
+/*
+ * This function contains 96 FP additions, 72 FP multiplications,
+ * (or, 60 additions, 36 multiplications, 36 fused multiply/add),
+ * 41 stack variables, 8 constants, and 36 memory accesses
+ */
+#include "t.h"
+
+static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
+	       E T1w, TW, T1k, T11, T1l;
+	       {
+		    E T6, TO, Tb, TP;
+		    T1 = ri[0];
+		    T1B = ii[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = ri[WS(rs, 3)];
+			 T5 = ii[WS(rs, 3)];
+			 T2 = W[4];
+			 T4 = W[5];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TO = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = ri[WS(rs, 6)];
+			 Ta = ii[WS(rs, 6)];
+			 T7 = W[10];
+			 T9 = W[11];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 TP = FNMS(T9, T8, T7 * Ta);
+		    }
+		    TQ = KP866025403 * (TO - TP);
+		    T1G = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    TN = FNMS(KP500000000, Tc, T1);
+		    T1A = TO + TP;
+		    T1H = FNMS(KP500000000, T1A, T1B);
+	       }
+	       {
+		    E Tz, T19, TE, T14, TJ, T15, TK, T1a;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = ri[WS(rs, 2)];
+			 Ty = ii[WS(rs, 2)];
+			 Tv = W[2];
+			 Tx = W[3];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T19 = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = ri[WS(rs, 5)];
+			 TD = ii[WS(rs, 5)];
+			 TA = W[8];
+			 TC = W[9];
+			 TE = FMA(TA, TB, TC * TD);
+			 T14 = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TG, TI, TF, TH;
+			 TG = ri[WS(rs, 8)];
+			 TI = ii[WS(rs, 8)];
+			 TF = W[14];
+			 TH = W[15];
+			 TJ = FMA(TF, TG, TH * TI);
+			 T15 = FNMS(TH, TG, TF * TI);
+		    }
+		    TK = TE + TJ;
+		    T1a = T14 + T15;
+		    TL = Tz + TK;
+		    T1x = T19 + T1a;
+		    {
+			 E T13, T16, T18, T1b;
+			 T13 = FNMS(KP500000000, TK, Tz);
+			 T16 = KP866025403 * (T14 - T15);
+			 T17 = T13 + T16;
+			 T1o = T13 - T16;
+			 T18 = KP866025403 * (TJ - TE);
+			 T1b = FNMS(KP500000000, T1a, T19);
+			 T1c = T18 + T1b;
+			 T1n = T1b - T18;
+		    }
+	       }
+	       {
+		    E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = ri[WS(rs, 1)];
+			 Th = ii[WS(rs, 1)];
+			 Te = W[0];
+			 Tg = W[1];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 TY = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = ri[WS(rs, 4)];
+			 Tm = ii[WS(rs, 4)];
+			 Tj = W[6];
+			 Tl = W[7];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 TT = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = ri[WS(rs, 7)];
+			 Tr = ii[WS(rs, 7)];
+			 To = W[12];
+			 Tq = W[13];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TU = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn + Ts;
+		    TZ = TT + TU;
+		    Tu = Ti + Tt;
+		    T1w = TY + TZ;
+		    {
+			 E TS, TV, TX, T10;
+			 TS = FNMS(KP500000000, Tt, Ti);
+			 TV = KP866025403 * (TT - TU);
+			 TW = TS + TV;
+			 T1k = TS - TV;
+			 TX = KP866025403 * (Ts - Tn);
+			 T10 = FNMS(KP500000000, TZ, TY);
+			 T11 = TX + T10;
+			 T1l = T10 - TX;
+		    }
+	       }
+	       {
+		    E T1y, Td, TM, T1v;
+		    T1y = KP866025403 * (T1w - T1x);
+		    Td = T1 + Tc;
+		    TM = Tu + TL;
+		    T1v = FNMS(KP500000000, TM, Td);
+		    ri[0] = Td + TM;
+		    ri[WS(rs, 3)] = T1v + T1y;
+		    ri[WS(rs, 6)] = T1v - T1y;
+	       }
+	       {
+		    E T1D, T1z, T1C, T1E;
+		    T1D = KP866025403 * (TL - Tu);
+		    T1z = T1w + T1x;
+		    T1C = T1A + T1B;
+		    T1E = FNMS(KP500000000, T1z, T1C);
+		    ii[0] = T1z + T1C;
+		    ii[WS(rs, 6)] = T1E - T1D;
+		    ii[WS(rs, 3)] = T1D + T1E;
+	       }
+	       {
+		    E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
+		    TR = TN + TQ;
+		    T1I = T1G + T1H;
+		    {
+			 E T12, T1d, T1g, T1h;
+			 T12 = FMA(KP766044443, TW, KP642787609 * T11);
+			 T1d = FMA(KP173648177, T17, KP984807753 * T1c);
+			 T1e = T12 + T1d;
+			 T1J = KP866025403 * (T1d - T12);
+			 T1g = FNMS(KP642787609, TW, KP766044443 * T11);
+			 T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
+			 T1i = KP866025403 * (T1g - T1h);
+			 T1F = T1g + T1h;
+		    }
+		    ri[WS(rs, 1)] = TR + T1e;
+		    ii[WS(rs, 1)] = T1F + T1I;
+		    T1f = FNMS(KP500000000, T1e, TR);
+		    ri[WS(rs, 7)] = T1f - T1i;
+		    ri[WS(rs, 4)] = T1f + T1i;
+		    T1K = FNMS(KP500000000, T1F, T1I);
+		    ii[WS(rs, 4)] = T1J + T1K;
+		    ii[WS(rs, 7)] = T1K - T1J;
+	       }
+	       {
+		    E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O;
+		    T1j = TN - TQ;
+		    T1M = T1H - T1G;
+		    {
+			 E T1m, T1p, T1s, T1t;
+			 T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
+			 T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
+			 T1q = T1m + T1p;
+			 T1N = KP866025403 * (T1p - T1m);
+			 T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
+			 T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
+			 T1u = KP866025403 * (T1s + T1t);
+			 T1L = T1s - T1t;
+		    }
+		    ri[WS(rs, 2)] = T1j + T1q;
+		    ii[WS(rs, 2)] = T1L + T1M;
+		    T1r = FNMS(KP500000000, T1q, T1j);
+		    ri[WS(rs, 8)] = T1r - T1u;
+		    ri[WS(rs, 5)] = T1r + T1u;
+		    T1O = FNMS(KP500000000, T1L, T1M);
+		    ii[WS(rs, 5)] = T1N + T1O;
+		    ii[WS(rs, 8)] = T1O - T1N;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 0, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, {60, 36, 36, 0}, 0, 0, 0 };
+
+void X(codelet_t1_9) (planner *p) {
+     X(kdft_dit_register) (p, t1_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include t.h */
+
+/*
+ * This function contains 114 FP additions, 94 FP multiplications,
+ * (or, 48 additions, 28 multiplications, 66 fused multiply/add),
+ * 85 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t.h"
+
+static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T27, T2b, T2a, T2c;
+	       {
+		    E T2, T3, T8, Tc, T5, T4, TX, T11, TE, T6, TB, TA;
+		    T2 = W[0];
+		    T3 = W[2];
+		    T8 = W[4];
+		    Tc = W[5];
+		    T5 = W[1];
+		    T4 = T2 * T3;
+		    TX = T3 * T8;
+		    TA = T2 * T8;
+		    T11 = T3 * Tc;
+		    TE = T2 * Tc;
+		    T6 = W[3];
+		    TB = FMA(T5, Tc, TA);
+		    {
+			 E T2d, T24, T1c, Tk, T1i, T28, T2l, T1a, T2f, T1I, T1R, T1Z, TL, T1v, T1d;
+			 E Tz, T1S, T1r, TH, T1t;
+			 {
+			      E T1, TF, TY, T12, Tl, T7, T23, To, Tb, Te, Ti, Th, Td, Tw, Ts;
+			      E Ta;
+			      T1 = ri[0];
+			      TF = FNMS(T5, T8, TE);
+			      TY = FMA(T6, Tc, TX);
+			      T12 = FNMS(T6, T8, T11);
+			      Tl = FMA(T5, T6, T4);
+			      T7 = FNMS(T5, T6, T4);
+			      Ta = T2 * T6;
+			      T23 = ii[0];
+			      {
+				   E Tg, T9, Tv, Tr;
+				   Tg = T7 * Tc;
+				   T9 = T7 * T8;
+				   Tv = Tl * Tc;
+				   Tr = Tl * T8;
+				   To = FNMS(T5, T3, Ta);
+				   Tb = FMA(T5, T3, Ta);
+				   Te = ri[WS(rs, 5)];
+				   Ti = ii[WS(rs, 5)];
+				   Th = FNMS(Tb, T8, Tg);
+				   Td = FMA(Tb, Tc, T9);
+				   Tw = FNMS(To, T8, Tv);
+				   Ts = FMA(To, Tc, Tr);
+			      }
+			      {
+				   E T18, T1G, T1g, TW, T1P, T1C, T14, T1E;
+				   {
+					E TR, T1z, TV, T1B, TZ, T13, T15, T17, T10, T1D;
+					{
+					     E TO, TQ, TP, T22, Tj, T1y, T21, Tf;
+					     TO = ri[WS(rs, 4)];
+					     T21 = Td * Ti;
+					     Tf = Td * Te;
+					     TQ = ii[WS(rs, 4)];
+					     TP = T7 * TO;
+					     T22 = FNMS(Th, Te, T21);
+					     Tj = FMA(Th, Ti, Tf);
+					     T1y = T7 * TQ;
+					     TR = FMA(Tb, TQ, TP);
+					     T2d = T23 - T22;
+					     T24 = T22 + T23;
+					     T1c = T1 + Tj;
+					     Tk = T1 - Tj;
+					     T1z = FNMS(Tb, TO, T1y);
+					}
+					T15 = ri[WS(rs, 1)];
+					T17 = ii[WS(rs, 1)];
+					{
+					     E TS, TU, T16, T1F, TT, T1A;
+					     TS = ri[WS(rs, 9)];
+					     TU = ii[WS(rs, 9)];
+					     T16 = T2 * T15;
+					     T1F = T2 * T17;
+					     TT = T8 * TS;
+					     T1A = T8 * TU;
+					     T18 = FMA(T5, T17, T16);
+					     T1G = FNMS(T5, T15, T1F);
+					     TV = FMA(Tc, TU, TT);
+					     T1B = FNMS(Tc, TS, T1A);
+					}
+					TZ = ri[WS(rs, 6)];
+					T13 = ii[WS(rs, 6)];
+					T1g = TR + TV;
+					TW = TR - TV;
+					T1P = T1z + T1B;
+					T1C = T1z - T1B;
+					T10 = TY * TZ;
+					T1D = TY * T13;
+					T14 = FMA(T12, T13, T10);
+					T1E = FNMS(T12, TZ, T1D);
+				   }
+				   {
+					E Tq, T1o, Ty, TC, TG, T1q, TD, T1s;
+					{
+					     E TI, TK, Tt, T1p;
+					     {
+						  E Tm, T1n, Tp, Tn;
+						  Tm = ri[WS(rs, 2)];
+						  Tp = ii[WS(rs, 2)];
+						  {
+						       E T19, T1h, T1Q, T1H;
+						       T19 = T14 - T18;
+						       T1h = T14 + T18;
+						       T1Q = T1E + T1G;
+						       T1H = T1E - T1G;
+						       Tn = Tl * Tm;
+						       T1i = T1g + T1h;
+						       T28 = T1g - T1h;
+						       T2l = TW - T19;
+						       T1a = TW + T19;
+						       T2f = T1C + T1H;
+						       T1I = T1C - T1H;
+						       T1R = T1P - T1Q;
+						       T1Z = T1P + T1Q;
+						       T1n = Tl * Tp;
+						  }
+						  Tq = FMA(To, Tp, Tn);
+						  TI = ri[WS(rs, 3)];
+						  TK = ii[WS(rs, 3)];
+						  T1o = FNMS(To, Tm, T1n);
+					     }
+					     {
+						  E Tx, Tu, TJ, T1u;
+						  Tt = ri[WS(rs, 7)];
+						  TJ = T3 * TI;
+						  T1u = T3 * TK;
+						  Tx = ii[WS(rs, 7)];
+						  Tu = Ts * Tt;
+						  TL = FMA(T6, TK, TJ);
+						  T1v = FNMS(T6, TI, T1u);
+						  T1p = Ts * Tx;
+						  Ty = FMA(Tw, Tx, Tu);
+					     }
+					     TC = ri[WS(rs, 8)];
+					     TG = ii[WS(rs, 8)];
+					     T1q = FNMS(Tw, Tt, T1p);
+					}
+					T1d = Tq + Ty;
+					Tz = Tq - Ty;
+					TD = TB * TC;
+					T1s = TB * TG;
+					T1S = T1o + T1q;
+					T1r = T1o - T1q;
+					TH = FMA(TF, TG, TD);
+					T1t = FNMS(TF, TC, T1s);
+				   }
+			      }
+			 }
+			 {
+			      E T1f, T29, T1Y, T1U, T2j, T2n, T2m, T2o;
+			      {
+				   E T2k, T2e, T1l, T1L, T1J, T1k, T1b, T1e, TM;
+				   T1e = TH + TL;
+				   TM = TH - TL;
+				   {
+					E T1w, T1T, TN, T1x;
+					T1w = T1t - T1v;
+					T1T = T1t + T1v;
+					T1f = T1d + T1e;
+					T29 = T1d - T1e;
+					T2k = Tz - TM;
+					TN = Tz + TM;
+					T1x = T1r - T1w;
+					T2e = T1r + T1w;
+					T1Y = T1S + T1T;
+					T1U = T1S - T1T;
+					T1l = TN - T1a;
+					T1b = TN + T1a;
+					T1L = FNMS(KP618033988, T1x, T1I);
+					T1J = FMA(KP618033988, T1I, T1x);
+				   }
+				   T1k = FNMS(KP250000000, T1b, Tk);
+				   ri[WS(rs, 5)] = Tk + T1b;
+				   {
+					E T2g, T2i, T2h, T1K, T1m;
+					T2g = T2e + T2f;
+					T2i = T2e - T2f;
+					T1K = FNMS(KP559016994, T1l, T1k);
+					T1m = FMA(KP559016994, T1l, T1k);
+					T2h = FNMS(KP250000000, T2g, T2d);
+					ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m);
+					ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m);
+					ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K);
+					ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K);
+					ii[WS(rs, 5)] = T2g + T2d;
+					T2j = FMA(KP559016994, T2i, T2h);
+					T2n = FNMS(KP559016994, T2i, T2h);
+					T2m = FMA(KP618033988, T2l, T2k);
+					T2o = FNMS(KP618033988, T2k, T2l);
+				   }
+			      }
+			      {
+				   E T1O, T1W, T1V, T1X, T1j, T1N, T1M, T20, T26, T25;
+				   T1j = T1f + T1i;
+				   T1N = T1f - T1i;
+				   ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n);
+				   ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n);
+				   ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j);
+				   ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j);
+				   T1M = FNMS(KP250000000, T1j, T1c);
+				   ri[0] = T1c + T1j;
+				   T1O = FNMS(KP559016994, T1N, T1M);
+				   T1W = FMA(KP559016994, T1N, T1M);
+				   T1V = FNMS(KP618033988, T1U, T1R);
+				   T1X = FMA(KP618033988, T1R, T1U);
+				   T20 = T1Y + T1Z;
+				   T26 = T1Y - T1Z;
+				   ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W);
+				   ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W);
+				   ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O);
+				   ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O);
+				   T25 = FNMS(KP250000000, T20, T24);
+				   ii[0] = T20 + T24;
+				   T27 = FNMS(KP559016994, T26, T25);
+				   T2b = FMA(KP559016994, T26, T25);
+				   T2a = FNMS(KP618033988, T29, T28);
+				   T2c = FMA(KP618033988, T28, T29);
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b);
+	       ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b);
+	       ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27);
+	       ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, {48, 28, 66, 0}, 0, 0, 0 };
+
+void X(codelet_t2_10) (planner *p) {
+     X(kdft_dit_register) (p, t2_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include t.h */
+
+/*
+ * This function contains 114 FP additions, 80 FP multiplications,
+ * (or, 76 additions, 42 multiplications, 38 fused multiply/add),
+ * 63 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t.h"
+
+static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp;
+	       E Tv, Tx, Tr;
+	       {
+		    E T4, Tb, T7, Ta;
+		    T2 = W[0];
+		    T5 = W[1];
+		    T3 = W[2];
+		    T6 = W[3];
+		    T4 = T2 * T3;
+		    Tb = T5 * T3;
+		    T7 = T5 * T6;
+		    Ta = T2 * T6;
+		    T8 = T4 - T7;
+		    Tm = Ta - Tb;
+		    Tc = Ta + Tb;
+		    Tk = T4 + T7;
+		    T9 = W[4];
+		    Td = W[5];
+		    Te = FMA(T8, T9, Tc * Td);
+		    TM = FMA(T3, T9, T6 * Td);
+		    TO = FNMS(T6, T9, T3 * Td);
+		    Tg = FNMS(Tc, T9, T8 * Td);
+		    Tp = FMA(Tk, T9, Tm * Td);
+		    Tv = FMA(T2, T9, T5 * Td);
+		    Tx = FNMS(T5, T9, T2 * Td);
+		    Tr = FNMS(Tm, T9, Tk * Td);
+	       }
+	       {
+		    E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k;
+		    E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P;
+		    {
+			 E T1, T1F, Ti, T1E, Tf, Th;
+			 T1 = ri[0];
+			 T1F = ii[0];
+			 Tf = ri[WS(rs, 5)];
+			 Th = ii[WS(rs, 5)];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T1E = FNMS(Tg, Tf, Te * Th);
+			 Tj = T1 - Ti;
+			 T1S = T1F - T1E;
+			 TX = T1 + Ti;
+			 T1G = T1E + T1F;
+		    }
+		    {
+			 E TH, T1f, TT, T1j, TK, T1g, TQ, T1i;
+			 {
+			      E TF, TG, TR, TS;
+			      TF = ri[WS(rs, 4)];
+			      TG = ii[WS(rs, 4)];
+			      TH = FMA(T8, TF, Tc * TG);
+			      T1f = FNMS(Tc, TF, T8 * TG);
+			      TR = ri[WS(rs, 1)];
+			      TS = ii[WS(rs, 1)];
+			      TT = FMA(T2, TR, T5 * TS);
+			      T1j = FNMS(T5, TR, T2 * TS);
+			 }
+			 {
+			      E TI, TJ, TN, TP;
+			      TI = ri[WS(rs, 9)];
+			      TJ = ii[WS(rs, 9)];
+			      TK = FMA(T9, TI, Td * TJ);
+			      T1g = FNMS(Td, TI, T9 * TJ);
+			      TN = ri[WS(rs, 6)];
+			      TP = ii[WS(rs, 6)];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T1i = FNMS(TO, TN, TM * TP);
+			 }
+			 TL = TH - TK;
+			 TU = TQ - TT;
+			 TV = TL + TU;
+			 T1s = T1f + T1g;
+			 T1t = T1i + T1j;
+			 T1C = T1s + T1t;
+			 T11 = TH + TK;
+			 T12 = TQ + TT;
+			 T13 = T11 + T12;
+			 T1h = T1f - T1g;
+			 T1k = T1i - T1j;
+			 T1Q = T1h + T1k;
+		    }
+		    {
+			 E To, T18, TC, T1c, Tt, T19, Tz, T1b;
+			 {
+			      E Tl, Tn, TA, TB;
+			      Tl = ri[WS(rs, 2)];
+			      Tn = ii[WS(rs, 2)];
+			      To = FMA(Tk, Tl, Tm * Tn);
+			      T18 = FNMS(Tm, Tl, Tk * Tn);
+			      TA = ri[WS(rs, 3)];
+			      TB = ii[WS(rs, 3)];
+			      TC = FMA(T3, TA, T6 * TB);
+			      T1c = FNMS(T6, TA, T3 * TB);
+			 }
+			 {
+			      E Tq, Ts, Tw, Ty;
+			      Tq = ri[WS(rs, 7)];
+			      Ts = ii[WS(rs, 7)];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T19 = FNMS(Tr, Tq, Tp * Ts);
+			      Tw = ri[WS(rs, 8)];
+			      Ty = ii[WS(rs, 8)];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T1b = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 Tu = To - Tt;
+			 TD = Tz - TC;
+			 TE = Tu + TD;
+			 T1v = T18 + T19;
+			 T1w = T1b + T1c;
+			 T1B = T1v + T1w;
+			 TY = To + Tt;
+			 TZ = Tz + TC;
+			 T10 = TY + TZ;
+			 T1a = T18 - T19;
+			 T1d = T1b - T1c;
+			 T1P = T1a + T1d;
+		    }
+		    {
+			 E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17;
+			 T15 = KP559016994 * (TE - TV);
+			 TW = TE + TV;
+			 T16 = FNMS(KP250000000, TW, Tj);
+			 T1e = T1a - T1d;
+			 T1l = T1h - T1k;
+			 T1m = FMA(KP951056516, T1e, KP587785252 * T1l);
+			 T1o = FNMS(KP587785252, T1e, KP951056516 * T1l);
+			 ri[WS(rs, 5)] = Tj + TW;
+			 T1n = T16 - T15;
+			 ri[WS(rs, 7)] = T1n - T1o;
+			 ri[WS(rs, 3)] = T1n + T1o;
+			 T17 = T15 + T16;
+			 ri[WS(rs, 9)] = T17 - T1m;
+			 ri[WS(rs, 1)] = T17 + T1m;
+		    }
+		    {
+			 E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V;
+			 T1R = KP559016994 * (T1P - T1Q);
+			 T1T = T1P + T1Q;
+			 T1U = FNMS(KP250000000, T1T, T1S);
+			 T1W = Tu - TD;
+			 T1X = TL - TU;
+			 T1Y = FMA(KP951056516, T1W, KP587785252 * T1X);
+			 T20 = FNMS(KP587785252, T1W, KP951056516 * T1X);
+			 ii[WS(rs, 5)] = T1T + T1S;
+			 T1Z = T1U - T1R;
+			 ii[WS(rs, 3)] = T1Z - T20;
+			 ii[WS(rs, 7)] = T20 + T1Z;
+			 T1V = T1R + T1U;
+			 ii[WS(rs, 1)] = T1V - T1Y;
+			 ii[WS(rs, 9)] = T1Y + T1V;
+		    }
+		    {
+			 E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r;
+			 T1q = KP559016994 * (T10 - T13);
+			 T14 = T10 + T13;
+			 T1p = FNMS(KP250000000, T14, TX);
+			 T1u = T1s - T1t;
+			 T1x = T1v - T1w;
+			 T1y = FNMS(KP587785252, T1x, KP951056516 * T1u);
+			 T1A = FMA(KP951056516, T1x, KP587785252 * T1u);
+			 ri[0] = TX + T14;
+			 T1z = T1q + T1p;
+			 ri[WS(rs, 4)] = T1z - T1A;
+			 ri[WS(rs, 6)] = T1z + T1A;
+			 T1r = T1p - T1q;
+			 ri[WS(rs, 2)] = T1r - T1y;
+			 ri[WS(rs, 8)] = T1r + T1y;
+		    }
+		    {
+			 E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M;
+			 T1L = KP559016994 * (T1B - T1C);
+			 T1D = T1B + T1C;
+			 T1K = FNMS(KP250000000, T1D, T1G);
+			 T1H = T11 - T12;
+			 T1I = TY - TZ;
+			 T1J = FNMS(KP587785252, T1I, KP951056516 * T1H);
+			 T1N = FMA(KP951056516, T1I, KP587785252 * T1H);
+			 ii[0] = T1D + T1G;
+			 T1O = T1L + T1K;
+			 ii[WS(rs, 4)] = T1N + T1O;
+			 ii[WS(rs, 6)] = T1O - T1N;
+			 T1M = T1K - T1L;
+			 ii[WS(rs, 2)] = T1J + T1M;
+			 ii[WS(rs, 8)] = T1M - T1J;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, {76, 42, 38, 0}, 0, 0, 0 };
+
+void X(codelet_t2_10) (planner *p) {
+     X(kdft_dit_register) (p, t2_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,827 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:00 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include t.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 100 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "t.h"
+
+static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T3S, T3R;
+	       {
+		    E T2, Tf, TM, TO, T3, Tg, TN, TS, T4, Tp, T6, T5, Th;
+		    T2 = W[0];
+		    Tf = W[2];
+		    TM = W[6];
+		    TO = W[7];
+		    T3 = W[4];
+		    Tg = T2 * Tf;
+		    TN = T2 * TM;
+		    TS = T2 * TO;
+		    T4 = T2 * T3;
+		    Tp = Tf * T3;
+		    T6 = W[5];
+		    T5 = W[1];
+		    Th = W[3];
+		    {
+			 E TZ, Te, T1U, T3A, T3L, T2D, T1G, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M;
+			 E T1Z, T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, TX;
+			 E T10, TV, T2a, TY, T2b;
+			 {
+			      E TF, TP, TT, Tq, TW, Tz, Tu, TI, TC, T1m, T1f, T1p, T1j, Tr, Ts;
+			      E Tv, To, T1W;
+			      {
+				   E Ti, Tm, T1L, T1O, T1D, T1A, T1x, T2y, T1F, T2x;
+				   {
+					E T1, T7, Tb, T3z, T8, T1z, T9, Tc;
+					{
+					     E T1i, T1e, T1C, T1y, Tt, Ta, Tl;
+					     T1 = ri[0];
+					     Tt = Tf * T6;
+					     Ta = T2 * T6;
+					     T7 = FMA(T5, T6, T4);
+					     TF = FNMS(T5, T6, T4);
+					     TP = FMA(T5, TO, TN);
+					     TT = FNMS(T5, TM, TS);
+					     Tq = FNMS(Th, T6, Tp);
+					     TW = FMA(Th, T6, Tp);
+					     Tz = FMA(T5, Th, Tg);
+					     Ti = FNMS(T5, Th, Tg);
+					     Tl = T2 * Th;
+					     Tu = FMA(Th, T3, Tt);
+					     TZ = FNMS(Th, T3, Tt);
+					     TI = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     T1i = Ti * T6;
+					     T1e = Ti * T3;
+					     T1C = Tz * T6;
+					     T1y = Tz * T3;
+					     Tm = FMA(T5, Tf, Tl);
+					     TC = FNMS(T5, Tf, Tl);
+					     T3z = ii[0];
+					     T8 = ri[WS(rs, 8)];
+					     T1m = FNMS(Tm, T6, T1e);
+					     T1f = FMA(Tm, T6, T1e);
+					     T1p = FMA(Tm, T3, T1i);
+					     T1j = FNMS(Tm, T3, T1i);
+					     T1L = FNMS(TC, T6, T1y);
+					     T1z = FMA(TC, T6, T1y);
+					     T1O = FMA(TC, T3, T1C);
+					     T1D = FNMS(TC, T3, T1C);
+					     T9 = T7 * T8;
+					     Tc = ii[WS(rs, 8)];
+					}
+					{
+					     E T1u, T1w, T1v, T2w, T3y, T1B, T1E, Td, T3x;
+					     T1u = ri[WS(rs, 15)];
+					     T1w = ii[WS(rs, 15)];
+					     T1A = ri[WS(rs, 7)];
+					     Td = FMA(Tb, Tc, T9);
+					     T3x = T7 * Tc;
+					     T1v = TM * T1u;
+					     T2w = TM * T1w;
+					     Te = T1 + Td;
+					     T1U = T1 - Td;
+					     T3y = FNMS(Tb, T8, T3x);
+					     T1B = T1z * T1A;
+					     T1E = ii[WS(rs, 7)];
+					     T1x = FMA(TO, T1w, T1v);
+					     T3A = T3y + T3z;
+					     T3L = T3z - T3y;
+					     T2y = T1z * T1E;
+					     T1F = FMA(T1D, T1E, T1B);
+					     T2x = FNMS(TO, T1u, T2w);
+					}
+				   }
+				   {
+					E T1H, T1I, T1J, T1M, T1P, T2z;
+					T1H = ri[WS(rs, 3)];
+					T2z = FNMS(T1D, T1A, T2y);
+					T2D = T1x - T1F;
+					T1G = T1x + T1F;
+					T1I = Tf * T1H;
+					T2A = T2x - T2z;
+					T3h = T2x + T2z;
+					T1J = ii[WS(rs, 3)];
+					T1M = ri[WS(rs, 11)];
+					T1P = ii[WS(rs, 11)];
+					{
+					     E Tj, Tk, Tn, T1V;
+					     {
+						  E T1K, T2F, T1Q, T2H, T2E, T1N, T2G;
+						  Tj = ri[WS(rs, 4)];
+						  T1K = FMA(Th, T1J, T1I);
+						  T2E = Tf * T1J;
+						  T1N = T1L * T1M;
+						  T2G = T1L * T1P;
+						  Tk = Ti * Tj;
+						  T2F = FNMS(Th, T1H, T2E);
+						  T1Q = FMA(T1O, T1P, T1N);
+						  T2H = FNMS(T1O, T1M, T2G);
+						  Tn = ii[WS(rs, 4)];
+						  Tr = ri[WS(rs, 12)];
+						  T1R = T1K + T1Q;
+						  T2B = T1K - T1Q;
+						  T2I = T2F - T2H;
+						  T3i = T2F + T2H;
+						  T1V = Ti * Tn;
+						  Ts = Tq * Tr;
+						  Tv = ii[WS(rs, 12)];
+					     }
+					     To = FMA(Tm, Tn, Tk);
+					     T1W = FNMS(Tm, Tj, T1V);
+					}
+				   }
+			      }
+			      {
+				   E T19, T1b, T18, T2i, T1a, T2j;
+				   {
+					E TE, T22, TK, T24;
+					{
+					     E TA, TD, TB, T21, TG, TJ, TH, T23, T1Y, Tw, T1X;
+					     TA = ri[WS(rs, 2)];
+					     Tw = FMA(Tu, Tv, Ts);
+					     T1X = Tq * Tv;
+					     TD = ii[WS(rs, 2)];
+					     TB = Tz * TA;
+					     Tx = To + Tw;
+					     T3M = To - Tw;
+					     T1Y = FNMS(Tu, Tr, T1X);
+					     T21 = Tz * TD;
+					     TG = ri[WS(rs, 10)];
+					     TJ = ii[WS(rs, 10)];
+					     T1Z = T1W - T1Y;
+					     T3w = T1W + T1Y;
+					     TH = TF * TG;
+					     T23 = TF * TJ;
+					     TE = FMA(TC, TD, TB);
+					     T22 = FNMS(TC, TA, T21);
+					     TK = FMA(TI, TJ, TH);
+					     T24 = FNMS(TI, TG, T23);
+					}
+					{
+					     E T15, T17, T16, T2h;
+					     T15 = ri[WS(rs, 1)];
+					     T17 = ii[WS(rs, 1)];
+					     TL = TE + TK;
+					     T26 = TE - TK;
+					     T25 = T22 - T24;
+					     T37 = T22 + T24;
+					     T16 = T2 * T15;
+					     T2h = T2 * T17;
+					     T19 = ri[WS(rs, 9)];
+					     T1b = ii[WS(rs, 9)];
+					     T18 = FMA(T5, T17, T16);
+					     T2i = FNMS(T5, T15, T2h);
+					     T1a = T3 * T19;
+					     T2j = T3 * T1b;
+					}
+				   }
+				   {
+					E T1n, T1q, T1l, T2q, T1o, T2r;
+					{
+					     E T1g, T1k, T1h, T2p, T1c, T2k;
+					     T1g = ri[WS(rs, 5)];
+					     T1k = ii[WS(rs, 5)];
+					     T1c = FMA(T6, T1b, T1a);
+					     T2k = FNMS(T6, T19, T2j);
+					     T1h = T1f * T1g;
+					     T2p = T1f * T1k;
+					     T1d = T18 + T1c;
+					     T2o = T18 - T1c;
+					     T2l = T2i - T2k;
+					     T3c = T2i + T2k;
+					     T1n = ri[WS(rs, 13)];
+					     T1q = ii[WS(rs, 13)];
+					     T1l = FMA(T1j, T1k, T1h);
+					     T2q = FNMS(T1j, T1g, T2p);
+					     T1o = T1m * T1n;
+					     T2r = T1m * T1q;
+					}
+					{
+					     E TQ, TU, TR, T29, T1r, T2s;
+					     TQ = ri[WS(rs, 14)];
+					     TU = ii[WS(rs, 14)];
+					     T1r = FMA(T1p, T1q, T1o);
+					     T2s = FNMS(T1p, T1n, T2r);
+					     TR = TP * TQ;
+					     T29 = TP * TU;
+					     T1s = T1l + T1r;
+					     T2m = T1l - T1r;
+					     T2t = T2q - T2s;
+					     T3d = T2q + T2s;
+					     TX = ri[WS(rs, 6)];
+					     T10 = ii[WS(rs, 6)];
+					     TV = FMA(TT, TU, TR);
+					     T2a = FNMS(TT, TQ, T29);
+					     TY = TW * TX;
+					     T2b = TW * T10;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T36, T3G, T3b, T3g, T28, T2d, T3F, T39, T3e, T3q, T3C, T3j, T3u, T3t;
+			      {
+				   E T3D, T1T, T3r, T14, T3E, T3s;
+				   {
+					E Ty, T3B, T11, T2c, T13, T3v;
+					T36 = Te - Tx;
+					Ty = Te + Tx;
+					T3B = T3w + T3A;
+					T3G = T3A - T3w;
+					T11 = FMA(TZ, T10, TY);
+					T2c = FNMS(TZ, TX, T2b);
+					{
+					     E T1t, T1S, T12, T38;
+					     T3b = T1d - T1s;
+					     T1t = T1d + T1s;
+					     T1S = T1G + T1R;
+					     T3g = T1G - T1R;
+					     T12 = TV + T11;
+					     T28 = TV - T11;
+					     T2d = T2a - T2c;
+					     T38 = T2a + T2c;
+					     T3D = T1S - T1t;
+					     T1T = T1t + T1S;
+					     T13 = TL + T12;
+					     T3F = T12 - TL;
+					     T39 = T37 - T38;
+					     T3v = T37 + T38;
+					}
+					T3e = T3c - T3d;
+					T3r = T3c + T3d;
+					T3q = Ty - T13;
+					T14 = Ty + T13;
+					T3E = T3B - T3v;
+					T3C = T3v + T3B;
+					T3s = T3h + T3i;
+					T3j = T3h - T3i;
+				   }
+				   ri[WS(rs, 8)] = T14 - T1T;
+				   ri[0] = T14 + T1T;
+				   ii[WS(rs, 12)] = T3E - T3D;
+				   T3u = T3r + T3s;
+				   T3t = T3r - T3s;
+				   ii[WS(rs, 4)] = T3D + T3E;
+			      }
+			      {
+				   E T3m, T3a, T3J, T3H;
+				   ii[0] = T3u + T3C;
+				   ii[WS(rs, 8)] = T3C - T3u;
+				   ri[WS(rs, 4)] = T3q + T3t;
+				   ri[WS(rs, 12)] = T3q - T3t;
+				   T3m = T36 - T39;
+				   T3a = T36 + T39;
+				   T3J = T3G - T3F;
+				   T3H = T3F + T3G;
+				   {
+					E T2Q, T20, T3N, T3T, T2J, T2C, T3O, T2f, T34, T30, T2W, T2V, T3U, T2T, T2N;
+					E T2v;
+					{
+					     E T2R, T27, T2e, T2S;
+					     {
+						  E T3n, T3f, T3o, T3k;
+						  T2Q = T1U + T1Z;
+						  T20 = T1U - T1Z;
+						  T3n = T3e - T3b;
+						  T3f = T3b + T3e;
+						  T3o = T3g + T3j;
+						  T3k = T3g - T3j;
+						  T3N = T3L - T3M;
+						  T3T = T3M + T3L;
+						  {
+						       E T3p, T3I, T3K, T3l;
+						       T3p = T3n - T3o;
+						       T3I = T3n + T3o;
+						       T3K = T3k - T3f;
+						       T3l = T3f + T3k;
+						       ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
+						       ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
+						       ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
+						       ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
+						       ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
+						       ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
+						       ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
+						       ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
+						       T2R = T26 + T25;
+						       T27 = T25 - T26;
+						       T2e = T28 + T2d;
+						       T2S = T28 - T2d;
+						  }
+					     }
+					     {
+						  E T2Y, T2Z, T2n, T2u;
+						  T2J = T2D - T2I;
+						  T2Y = T2D + T2I;
+						  T2Z = T2A - T2B;
+						  T2C = T2A + T2B;
+						  T3O = T27 + T2e;
+						  T2f = T27 - T2e;
+						  T34 = FMA(KP414213562, T2Y, T2Z);
+						  T30 = FNMS(KP414213562, T2Z, T2Y);
+						  T2W = T2l - T2m;
+						  T2n = T2l + T2m;
+						  T2u = T2o - T2t;
+						  T2V = T2o + T2t;
+						  T3U = T2S - T2R;
+						  T2T = T2R + T2S;
+						  T2N = FNMS(KP414213562, T2n, T2u);
+						  T2v = FMA(KP414213562, T2u, T2n);
+					     }
+					}
+					{
+					     E T33, T2X, T3X, T3Y;
+					     {
+						  E T2M, T2g, T2O, T2K, T3V, T3W, T2P, T2L;
+						  T2M = FNMS(KP707106781, T2f, T20);
+						  T2g = FMA(KP707106781, T2f, T20);
+						  T33 = FNMS(KP414213562, T2V, T2W);
+						  T2X = FMA(KP414213562, T2W, T2V);
+						  T2O = FMA(KP414213562, T2C, T2J);
+						  T2K = FNMS(KP414213562, T2J, T2C);
+						  T3V = FMA(KP707106781, T3U, T3T);
+						  T3X = FNMS(KP707106781, T3U, T3T);
+						  T3W = T2O - T2N;
+						  T2P = T2N + T2O;
+						  T3Y = T2v + T2K;
+						  T2L = T2v - T2K;
+						  ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
+						  ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
+						  ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
+						  ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
+						  ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
+						  ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
+					     }
+					     {
+						  E T32, T3P, T3Q, T35, T2U, T31;
+						  T32 = FNMS(KP707106781, T2T, T2Q);
+						  T2U = FMA(KP707106781, T2T, T2Q);
+						  T31 = T2X + T30;
+						  T3S = T30 - T2X;
+						  T3R = FNMS(KP707106781, T3O, T3N);
+						  T3P = FMA(KP707106781, T3O, T3N);
+						  ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
+						  ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
+						  ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
+						  ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
+						  T3Q = T33 + T34;
+						  T35 = T33 - T34;
+						  ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
+						  ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
+						  ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
+						  ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
+	       ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
+
+void X(codelet_t2_16) (planner *p) {
+     X(kdft_dit_register) (p, t2_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include t.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 82 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "t.h"
+
+static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
+	       E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
+	       {
+		    E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 TE = Tm - Tn;
+			 TC = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 Tv = Tg * T6;
+			 Ta = T2 * T6;
+			 Ts = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 Tw = Ti * T3;
+			 Tb = T5 * T3;
+			 Tr = Tg * T3;
+		    }
+		    T8 = T4 + T7;
+		    TW = Tv - Tw;
+		    TJ = Ta + Tb;
+		    Tt = Tr - Ts;
+		    TU = Tr + Ts;
+		    Tc = Ta - Tb;
+		    Tx = Tv + Tw;
+		    TH = T4 - T7;
+		    TN = W[6];
+		    TO = W[7];
+		    TP = FMA(T2, TN, T5 * TO);
+		    TR = FNMS(T5, TN, T2 * TO);
+		    {
+			 E T1d, T1e, T19, T1a;
+			 T1d = Tk * T6;
+			 T1e = To * T3;
+			 T1f = T1d - T1e;
+			 T1k = T1d + T1e;
+			 T19 = Tk * T3;
+			 T1a = To * T6;
+			 T1b = T19 + T1a;
+			 T1i = T19 - T1a;
+		    }
+		    {
+			 E T1w, T1x, T1s, T1t;
+			 T1w = TC * T6;
+			 T1x = TE * T3;
+			 T1y = T1w - T1x;
+			 T1H = T1w + T1x;
+			 T1s = TC * T3;
+			 T1t = TE * T6;
+			 T1u = T1s + T1t;
+			 T1F = T1s - T1t;
+		    }
+	       }
+	       {
+		    E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
+		    E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
+		    E T2S, T2T, T28, T2A, T2d, T2B;
+		    {
+			 E T1, T3d, Te, T3c, T9, Td;
+			 T1 = ri[0];
+			 T3d = ii[0];
+			 T9 = ri[WS(rs, 8)];
+			 Td = ii[WS(rs, 8)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T3c = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T3r = T3d - T3c;
+			 T1N = T1 - Te;
+			 T3e = T3c + T3d;
+		    }
+		    {
+			 E Tq, T1O, Tz, T1P;
+			 {
+			      E Tl, Tp, Tu, Ty;
+			      Tl = ri[WS(rs, 4)];
+			      Tp = ii[WS(rs, 4)];
+			      Tq = FMA(Tk, Tl, To * Tp);
+			      T1O = FNMS(To, Tl, Tk * Tp);
+			      Tu = ri[WS(rs, 12)];
+			      Ty = ii[WS(rs, 12)];
+			      Tz = FMA(Tt, Tu, Tx * Ty);
+			      T1P = FNMS(Tx, Tu, Tt * Ty);
+			 }
+			 TA = Tq + Tz;
+			 T3s = Tq - Tz;
+			 T1Q = T1O - T1P;
+			 T3b = T1O + T1P;
+		    }
+		    {
+			 E TG, T1S, TL, T1T, T1U, T1V;
+			 {
+			      E TD, TF, TI, TK;
+			      TD = ri[WS(rs, 2)];
+			      TF = ii[WS(rs, 2)];
+			      TG = FMA(TC, TD, TE * TF);
+			      T1S = FNMS(TE, TD, TC * TF);
+			      TI = ri[WS(rs, 10)];
+			      TK = ii[WS(rs, 10)];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T1T = FNMS(TJ, TI, TH * TK);
+			 }
+			 TM = TG + TL;
+			 T2M = T1S + T1T;
+			 T1U = T1S - T1T;
+			 T1V = TG - TL;
+			 T1W = T1U - T1V;
+			 T2w = T1V + T1U;
+		    }
+		    {
+			 E TT, T1Y, TY, T1Z, T1X, T20;
+			 {
+			      E TQ, TS, TV, TX;
+			      TQ = ri[WS(rs, 14)];
+			      TS = ii[WS(rs, 14)];
+			      TT = FMA(TP, TQ, TR * TS);
+			      T1Y = FNMS(TR, TQ, TP * TS);
+			      TV = ri[WS(rs, 6)];
+			      TX = ii[WS(rs, 6)];
+			      TY = FMA(TU, TV, TW * TX);
+			      T1Z = FNMS(TW, TV, TU * TX);
+			 }
+			 TZ = TT + TY;
+			 T2N = T1Y + T1Z;
+			 T1X = TT - TY;
+			 T20 = T1Y - T1Z;
+			 T21 = T1X + T20;
+			 T2x = T1X - T20;
+		    }
+		    {
+			 E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
+			 {
+			      E T1p, T1q, T1G, T1I;
+			      T1p = ri[WS(rs, 15)];
+			      T1q = ii[WS(rs, 15)];
+			      T1r = FMA(TN, T1p, TO * T1q);
+			      T2k = FNMS(TO, T1p, TN * T1q);
+			      T1G = ri[WS(rs, 11)];
+			      T1I = ii[WS(rs, 11)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T2h = FNMS(T1H, T1G, T1F * T1I);
+			 }
+			 {
+			      E T1v, T1z, T1C, T1D;
+			      T1v = ri[WS(rs, 7)];
+			      T1z = ii[WS(rs, 7)];
+			      T1A = FMA(T1u, T1v, T1y * T1z);
+			      T2l = FNMS(T1y, T1v, T1u * T1z);
+			      T1C = ri[WS(rs, 3)];
+			      T1D = ii[WS(rs, 3)];
+			      T1E = FMA(Tg, T1C, Ti * T1D);
+			      T2g = FNMS(Ti, T1C, Tg * T1D);
+			 }
+			 T1B = T1r + T1A;
+			 T1K = T1E + T1J;
+			 T2V = T1B - T1K;
+			 T2W = T2k + T2l;
+			 T2X = T2g + T2h;
+			 T2Y = T2W - T2X;
+			 {
+			      E T2f, T2i, T2m, T2n;
+			      T2f = T1r - T1A;
+			      T2i = T2g - T2h;
+			      T2j = T2f - T2i;
+			      T2D = T2f + T2i;
+			      T2m = T2k - T2l;
+			      T2n = T1E - T1J;
+			      T2o = T2m + T2n;
+			      T2E = T2m - T2n;
+			 }
+		    }
+		    {
+			 E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
+			 {
+			      E T12, T13, T1j, T1l;
+			      T12 = ri[WS(rs, 1)];
+			      T13 = ii[WS(rs, 1)];
+			      T14 = FMA(T2, T12, T5 * T13);
+			      T24 = FNMS(T5, T12, T2 * T13);
+			      T1j = ri[WS(rs, 13)];
+			      T1l = ii[WS(rs, 13)];
+			      T1m = FMA(T1i, T1j, T1k * T1l);
+			      T2b = FNMS(T1k, T1j, T1i * T1l);
+			 }
+			 {
+			      E T15, T16, T1c, T1g;
+			      T15 = ri[WS(rs, 9)];
+			      T16 = ii[WS(rs, 9)];
+			      T17 = FMA(T3, T15, T6 * T16);
+			      T25 = FNMS(T6, T15, T3 * T16);
+			      T1c = ri[WS(rs, 5)];
+			      T1g = ii[WS(rs, 5)];
+			      T1h = FMA(T1b, T1c, T1f * T1g);
+			      T2a = FNMS(T1f, T1c, T1b * T1g);
+			 }
+			 T18 = T14 + T17;
+			 T1n = T1h + T1m;
+			 T2Q = T18 - T1n;
+			 T2R = T24 + T25;
+			 T2S = T2a + T2b;
+			 T2T = T2R - T2S;
+			 {
+			      E T26, T27, T29, T2c;
+			      T26 = T24 - T25;
+			      T27 = T1h - T1m;
+			      T28 = T26 + T27;
+			      T2A = T26 - T27;
+			      T29 = T14 - T17;
+			      T2c = T2a - T2b;
+			      T2d = T29 - T2c;
+			      T2B = T29 + T2c;
+			 }
+		    }
+		    {
+			 E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
+			 {
+			      E T1R, T22, T3y, T3z;
+			      T1R = T1N - T1Q;
+			      T22 = KP707106781 * (T1W - T21);
+			      T23 = T1R + T22;
+			      T2r = T1R - T22;
+			      T3y = KP707106781 * (T2x - T2w);
+			      T3z = T3s + T3r;
+			      T3A = T3y + T3z;
+			      T3C = T3z - T3y;
+			 }
+			 {
+			      E T2e, T2p, T2s, T2t;
+			      T2e = FMA(KP923879532, T28, KP382683432 * T2d);
+			      T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
+			      T2q = T2e + T2p;
+			      T3B = T2p - T2e;
+			      T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
+			      T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
+			      T2u = T2s - T2t;
+			      T3x = T2s + T2t;
+			 }
+			 ri[WS(rs, 11)] = T23 - T2q;
+			 ii[WS(rs, 11)] = T3A - T3x;
+			 ri[WS(rs, 3)] = T23 + T2q;
+			 ii[WS(rs, 3)] = T3x + T3A;
+			 ri[WS(rs, 15)] = T2r - T2u;
+			 ii[WS(rs, 15)] = T3C - T3B;
+			 ri[WS(rs, 7)] = T2r + T2u;
+			 ii[WS(rs, 7)] = T3B + T3C;
+		    }
+		    {
+			 E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
+			 {
+			      E T2L, T2O, T3k, T3l;
+			      T2L = Tf - TA;
+			      T2O = T2M - T2N;
+			      T2P = T2L + T2O;
+			      T31 = T2L - T2O;
+			      T3k = TZ - TM;
+			      T3l = T3e - T3b;
+			      T3m = T3k + T3l;
+			      T3o = T3l - T3k;
+			 }
+			 {
+			      E T2U, T2Z, T32, T33;
+			      T2U = T2Q + T2T;
+			      T2Z = T2V - T2Y;
+			      T30 = KP707106781 * (T2U + T2Z);
+			      T3n = KP707106781 * (T2Z - T2U);
+			      T32 = T2T - T2Q;
+			      T33 = T2V + T2Y;
+			      T34 = KP707106781 * (T32 - T33);
+			      T3j = KP707106781 * (T32 + T33);
+			 }
+			 ri[WS(rs, 10)] = T2P - T30;
+			 ii[WS(rs, 10)] = T3m - T3j;
+			 ri[WS(rs, 2)] = T2P + T30;
+			 ii[WS(rs, 2)] = T3j + T3m;
+			 ri[WS(rs, 14)] = T31 - T34;
+			 ii[WS(rs, 14)] = T3o - T3n;
+			 ri[WS(rs, 6)] = T31 + T34;
+			 ii[WS(rs, 6)] = T3n + T3o;
+		    }
+		    {
+			 E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
+			 {
+			      E T2v, T2y, T3q, T3t;
+			      T2v = T1N + T1Q;
+			      T2y = KP707106781 * (T2w + T2x);
+			      T2z = T2v + T2y;
+			      T2H = T2v - T2y;
+			      T3q = KP707106781 * (T1W + T21);
+			      T3t = T3r - T3s;
+			      T3u = T3q + T3t;
+			      T3w = T3t - T3q;
+			 }
+			 {
+			      E T2C, T2F, T2I, T2J;
+			      T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
+			      T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
+			      T2G = T2C + T2F;
+			      T3v = T2F - T2C;
+			      T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
+			      T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
+			      T2K = T2I - T2J;
+			      T3p = T2I + T2J;
+			 }
+			 ri[WS(rs, 9)] = T2z - T2G;
+			 ii[WS(rs, 9)] = T3u - T3p;
+			 ri[WS(rs, 1)] = T2z + T2G;
+			 ii[WS(rs, 1)] = T3p + T3u;
+			 ri[WS(rs, 13)] = T2H - T2K;
+			 ii[WS(rs, 13)] = T3w - T3v;
+			 ri[WS(rs, 5)] = T2H + T2K;
+			 ii[WS(rs, 5)] = T3v + T3w;
+		    }
+		    {
+			 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
+			 {
+			      E TB, T10, T3a, T3f;
+			      TB = Tf + TA;
+			      T10 = TM + TZ;
+			      T11 = TB + T10;
+			      T35 = TB - T10;
+			      T3a = T2M + T2N;
+			      T3f = T3b + T3e;
+			      T3g = T3a + T3f;
+			      T3i = T3f - T3a;
+			 }
+			 {
+			      E T1o, T1L, T36, T37;
+			      T1o = T18 + T1n;
+			      T1L = T1B + T1K;
+			      T1M = T1o + T1L;
+			      T3h = T1L - T1o;
+			      T36 = T2R + T2S;
+			      T37 = T2W + T2X;
+			      T38 = T36 - T37;
+			      T39 = T36 + T37;
+			 }
+			 ri[WS(rs, 8)] = T11 - T1M;
+			 ii[WS(rs, 8)] = T3g - T39;
+			 ri[0] = T11 + T1M;
+			 ii[0] = T39 + T3g;
+			 ri[WS(rs, 12)] = T35 - T38;
+			 ii[WS(rs, 12)] = T3i - T3h;
+			 ri[WS(rs, 4)] = T35 + T38;
+			 ii[WS(rs, 4)] = T3h + T3i;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
+
+void X(codelet_t2_16) (planner *p) {
+     X(kdft_dit_register) (p, t2_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include t.h */
+
+/*
+ * This function contains 276 FP additions, 198 FP multiplications,
+ * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
+ * 142 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "t.h"
+
+static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T59, T5i, T5k, T5e, T5c, T5d, T5j, T5f;
+	       {
+		    E T2, Th, Tf, T6, T5, Tl, T1p, T1n, Ti, T3, Tt, Tv, T24, T1f, T1D;
+		    E Tb, T1P, Tm, T21, T1b, T7, T1A, Tw, T1H, T13, TA, T1L, T17, T1S, Tq;
+		    E T1o, T2g, T1t, T2c, TO, TK;
+		    {
+			 E T1e, Ta, Tk, Tg;
+			 T2 = W[0];
+			 Th = W[3];
+			 Tf = W[2];
+			 T6 = W[5];
+			 T5 = W[1];
+			 Tk = T2 * Th;
+			 Tg = T2 * Tf;
+			 T1e = Tf * T6;
+			 Ta = T2 * T6;
+			 Tl = FMA(T5, Tf, Tk);
+			 T1p = FNMS(T5, Tf, Tk);
+			 T1n = FMA(T5, Th, Tg);
+			 Ti = FNMS(T5, Th, Tg);
+			 T3 = W[4];
+			 Tt = W[6];
+			 Tv = W[7];
+			 {
+			      E Tp, Tj, TN, TJ;
+			      Tp = Ti * T6;
+			      T24 = FMA(Th, T3, T1e);
+			      T1f = FNMS(Th, T3, T1e);
+			      T1D = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      Tj = Ti * T3;
+			      {
+				   E T1a, T4, Tu, T1G;
+				   T1a = Tf * T3;
+				   T4 = T2 * T3;
+				   Tu = Ti * Tt;
+				   T1G = T2 * Tt;
+				   {
+					E T12, Tz, T1K, T16;
+					T12 = Tf * Tt;
+					Tz = Ti * Tv;
+					T1K = T2 * Tv;
+					T16 = Tf * Tv;
+					T1P = FNMS(Tl, T6, Tj);
+					Tm = FMA(Tl, T6, Tj);
+					T21 = FNMS(Th, T6, T1a);
+					T1b = FMA(Th, T6, T1a);
+					T7 = FNMS(T5, T6, T4);
+					T1A = FMA(T5, T6, T4);
+					Tw = FMA(Tl, Tv, Tu);
+					T1H = FMA(T5, Tv, T1G);
+					T13 = FMA(Th, Tv, T12);
+					TA = FNMS(Tl, Tt, Tz);
+					T1L = FNMS(T5, Tt, T1K);
+					T17 = FNMS(Th, Tt, T16);
+					T1S = FMA(Tl, T3, Tp);
+					Tq = FNMS(Tl, T3, Tp);
+				   }
+			      }
+			      T1o = T1n * T3;
+			      T2g = T1n * Tv;
+			      TN = Tm * Tv;
+			      TJ = Tm * Tt;
+			      T1t = T1n * T6;
+			      T2c = T1n * Tt;
+			      TO = FNMS(Tq, Tt, TN);
+			      TK = FMA(Tq, Tv, TJ);
+			 }
+		    }
+		    {
+			 E Te, T2C, T4L, T57, T58, TD, T2H, T4H, T3C, T3Z, T11, T2v, T2P, T3P, T4k;
+			 E T4v, T3u, T43, T2r, T2z, T3b, T3T, T4g, T4z, T3n, T42, T20, T2y, T34, T3S;
+			 E T4d, T4y, T1c, T19, T1d, T3E, T1w, T2U, T1g, T1j, T1l;
+			 {
+			      E T2d, T2h, T2k, T1q, T1u, T2n, TL, TI, TM, T3x, TZ, T2N, TP, TS, TU;
+			      {
+				   E T1, T4K, T8, T9, Tc;
+				   T1 = ri[0];
+				   T4K = ii[0];
+				   T8 = ri[WS(rs, 10)];
+				   T2d = FMA(T1p, Tv, T2c);
+				   T2h = FNMS(T1p, Tt, T2g);
+				   T2k = FMA(T1p, T6, T1o);
+				   T1q = FNMS(T1p, T6, T1o);
+				   T1u = FMA(T1p, T3, T1t);
+				   T2n = FNMS(T1p, T3, T1t);
+				   T9 = T7 * T8;
+				   Tc = ii[WS(rs, 10)];
+				   {
+					E Tx, Ts, T2F, TC, T2E;
+					{
+					     E Tn, Tr, To, T2D, T4J, Ty, TB, Td, T4I;
+					     Tn = ri[WS(rs, 5)];
+					     Tr = ii[WS(rs, 5)];
+					     Tx = ri[WS(rs, 15)];
+					     Td = FMA(Tb, Tc, T9);
+					     T4I = T7 * Tc;
+					     To = Tm * Tn;
+					     T2D = Tm * Tr;
+					     Te = T1 + Td;
+					     T2C = T1 - Td;
+					     T4J = FNMS(Tb, T8, T4I);
+					     Ty = Tw * Tx;
+					     TB = ii[WS(rs, 15)];
+					     Ts = FMA(Tq, Tr, To);
+					     T4L = T4J + T4K;
+					     T57 = T4K - T4J;
+					     T2F = Tw * TB;
+					     TC = FMA(TA, TB, Ty);
+					     T2E = FNMS(Tq, Tn, T2D);
+					}
+					{
+					     E TF, TG, TH, TW, TY, T2G, T3w, TX, T2M;
+					     TF = ri[WS(rs, 4)];
+					     T2G = FNMS(TA, Tx, T2F);
+					     T58 = Ts - TC;
+					     TD = Ts + TC;
+					     TG = Ti * TF;
+					     T2H = T2E - T2G;
+					     T4H = T2E + T2G;
+					     TH = ii[WS(rs, 4)];
+					     TW = ri[WS(rs, 19)];
+					     TY = ii[WS(rs, 19)];
+					     TL = ri[WS(rs, 14)];
+					     TI = FMA(Tl, TH, TG);
+					     T3w = Ti * TH;
+					     TX = Tt * TW;
+					     T2M = Tt * TY;
+					     TM = TK * TL;
+					     T3x = FNMS(Tl, TF, T3w);
+					     TZ = FMA(Tv, TY, TX);
+					     T2N = FNMS(Tv, TW, T2M);
+					     TP = ii[WS(rs, 14)];
+					     TS = ri[WS(rs, 9)];
+					     TU = ii[WS(rs, 9)];
+					}
+				   }
+			      }
+			      {
+				   E T27, T26, T28, T3p, T2p, T39, T29, T2e, T2i;
+				   {
+					E T22, T23, T25, T2l, T2o, T3o, T2m, T38;
+					{
+					     E TR, T2J, T3z, TV, T2L, T4i, T3A;
+					     T22 = ri[WS(rs, 12)];
+					     {
+						  E TQ, T3y, TT, T2K;
+						  TQ = FMA(TO, TP, TM);
+						  T3y = TK * TP;
+						  TT = T3 * TS;
+						  T2K = T3 * TU;
+						  TR = TI + TQ;
+						  T2J = TI - TQ;
+						  T3z = FNMS(TO, TL, T3y);
+						  TV = FMA(T6, TU, TT);
+						  T2L = FNMS(T6, TS, T2K);
+						  T23 = T21 * T22;
+					     }
+					     T4i = T3x + T3z;
+					     T3A = T3x - T3z;
+					     {
+						  E T10, T3B, T4j, T2O;
+						  T10 = TV + TZ;
+						  T3B = TV - TZ;
+						  T4j = T2L + T2N;
+						  T2O = T2L - T2N;
+						  T3C = T3A + T3B;
+						  T3Z = T3A - T3B;
+						  T11 = TR - T10;
+						  T2v = TR + T10;
+						  T2P = T2J - T2O;
+						  T3P = T2J + T2O;
+						  T4k = T4i - T4j;
+						  T4v = T4i + T4j;
+						  T25 = ii[WS(rs, 12)];
+					     }
+					}
+					T2l = ri[WS(rs, 7)];
+					T2o = ii[WS(rs, 7)];
+					T27 = ri[WS(rs, 2)];
+					T26 = FMA(T24, T25, T23);
+					T3o = T21 * T25;
+					T2m = T2k * T2l;
+					T38 = T2k * T2o;
+					T28 = T1n * T27;
+					T3p = FNMS(T24, T22, T3o);
+					T2p = FMA(T2n, T2o, T2m);
+					T39 = FNMS(T2n, T2l, T38);
+					T29 = ii[WS(rs, 2)];
+					T2e = ri[WS(rs, 17)];
+					T2i = ii[WS(rs, 17)];
+				   }
+				   {
+					E T1I, T1F, T1J, T3i, T1Y, T32, T1M, T1Q, T1T;
+					{
+					     E T1B, T1C, T1E, T1V, T1X, T3h, T1W, T31;
+					     {
+						  E T2b, T35, T3r, T2j, T37, T4e, T3s;
+						  T1B = ri[WS(rs, 8)];
+						  {
+						       E T2a, T3q, T2f, T36;
+						       T2a = FMA(T1p, T29, T28);
+						       T3q = T1n * T29;
+						       T2f = T2d * T2e;
+						       T36 = T2d * T2i;
+						       T2b = T26 + T2a;
+						       T35 = T26 - T2a;
+						       T3r = FNMS(T1p, T27, T3q);
+						       T2j = FMA(T2h, T2i, T2f);
+						       T37 = FNMS(T2h, T2e, T36);
+						       T1C = T1A * T1B;
+						  }
+						  T4e = T3p + T3r;
+						  T3s = T3p - T3r;
+						  {
+						       E T2q, T3t, T4f, T3a;
+						       T2q = T2j + T2p;
+						       T3t = T2j - T2p;
+						       T4f = T37 + T39;
+						       T3a = T37 - T39;
+						       T3u = T3s + T3t;
+						       T43 = T3s - T3t;
+						       T2r = T2b - T2q;
+						       T2z = T2b + T2q;
+						       T3b = T35 - T3a;
+						       T3T = T35 + T3a;
+						       T4g = T4e - T4f;
+						       T4z = T4e + T4f;
+						       T1E = ii[WS(rs, 8)];
+						  }
+					     }
+					     T1V = ri[WS(rs, 3)];
+					     T1X = ii[WS(rs, 3)];
+					     T1I = ri[WS(rs, 18)];
+					     T1F = FMA(T1D, T1E, T1C);
+					     T3h = T1A * T1E;
+					     T1W = Tf * T1V;
+					     T31 = Tf * T1X;
+					     T1J = T1H * T1I;
+					     T3i = FNMS(T1D, T1B, T3h);
+					     T1Y = FMA(Th, T1X, T1W);
+					     T32 = FNMS(Th, T1V, T31);
+					     T1M = ii[WS(rs, 18)];
+					     T1Q = ri[WS(rs, 13)];
+					     T1T = ii[WS(rs, 13)];
+					}
+					{
+					     E T14, T15, T18, T1r, T1v, T3D, T1s, T2T;
+					     {
+						  E T1O, T2Y, T3k, T1U, T30, T4b, T3l;
+						  T14 = ri[WS(rs, 16)];
+						  {
+						       E T1N, T3j, T1R, T2Z;
+						       T1N = FMA(T1L, T1M, T1J);
+						       T3j = T1H * T1M;
+						       T1R = T1P * T1Q;
+						       T2Z = T1P * T1T;
+						       T1O = T1F + T1N;
+						       T2Y = T1F - T1N;
+						       T3k = FNMS(T1L, T1I, T3j);
+						       T1U = FMA(T1S, T1T, T1R);
+						       T30 = FNMS(T1S, T1Q, T2Z);
+						       T15 = T13 * T14;
+						  }
+						  T4b = T3i + T3k;
+						  T3l = T3i - T3k;
+						  {
+						       E T1Z, T3m, T4c, T33;
+						       T1Z = T1U + T1Y;
+						       T3m = T1U - T1Y;
+						       T4c = T30 + T32;
+						       T33 = T30 - T32;
+						       T3n = T3l + T3m;
+						       T42 = T3l - T3m;
+						       T20 = T1O - T1Z;
+						       T2y = T1O + T1Z;
+						       T34 = T2Y - T33;
+						       T3S = T2Y + T33;
+						       T4d = T4b - T4c;
+						       T4y = T4b + T4c;
+						       T18 = ii[WS(rs, 16)];
+						  }
+					     }
+					     T1r = ri[WS(rs, 11)];
+					     T1v = ii[WS(rs, 11)];
+					     T1c = ri[WS(rs, 6)];
+					     T19 = FMA(T17, T18, T15);
+					     T3D = T13 * T18;
+					     T1s = T1q * T1r;
+					     T2T = T1q * T1v;
+					     T1d = T1b * T1c;
+					     T3E = FNMS(T17, T14, T3D);
+					     T1w = FMA(T1u, T1v, T1s);
+					     T2U = FNMS(T1u, T1r, T2T);
+					     T1g = ii[WS(rs, 6)];
+					     T1j = ri[WS(rs, 1)];
+					     T1l = ii[WS(rs, 1)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3J, T40, T2W, T3Q, T4M, T4E, T4F, T4U, T4S;
+			      {
+				   E T4X, T2u, T2w, T4w, T4W, T4r, T4p, T54, T56, T4V, T4a, T4q;
+				   {
+					E T4h, TE, T4n, T53, T1z, T2s, T52;
+					{
+					     E T1i, T2Q, T3G, T1m, T2S, T4l, T3H;
+					     T4h = T4d - T4g;
+					     T4X = T4d + T4g;
+					     {
+						  E T1h, T3F, T1k, T2R;
+						  T1h = FMA(T1f, T1g, T1d);
+						  T3F = T1b * T1g;
+						  T1k = T2 * T1j;
+						  T2R = T2 * T1l;
+						  T1i = T19 + T1h;
+						  T2Q = T19 - T1h;
+						  T3G = FNMS(T1f, T1c, T3F);
+						  T1m = FMA(T5, T1l, T1k);
+						  T2S = FNMS(T5, T1j, T2R);
+					     }
+					     TE = Te - TD;
+					     T2u = Te + TD;
+					     T4l = T3E + T3G;
+					     T3H = T3E - T3G;
+					     {
+						  E T1x, T3I, T4m, T2V, T1y;
+						  T1x = T1m + T1w;
+						  T3I = T1m - T1w;
+						  T4m = T2S + T2U;
+						  T2V = T2S - T2U;
+						  T3J = T3H + T3I;
+						  T40 = T3H - T3I;
+						  T1y = T1i - T1x;
+						  T2w = T1i + T1x;
+						  T2W = T2Q - T2V;
+						  T3Q = T2Q + T2V;
+						  T4n = T4l - T4m;
+						  T4w = T4l + T4m;
+						  T53 = T11 - T1y;
+						  T1z = T11 + T1y;
+						  T2s = T20 + T2r;
+						  T52 = T20 - T2r;
+					     }
+					}
+					{
+					     E T49, T48, T4o, T2t;
+					     T4o = T4k - T4n;
+					     T4W = T4k + T4n;
+					     T49 = T1z - T2s;
+					     T2t = T1z + T2s;
+					     T4r = FMA(KP618033988, T4h, T4o);
+					     T4p = FNMS(KP618033988, T4o, T4h);
+					     T54 = FNMS(KP618033988, T53, T52);
+					     T56 = FMA(KP618033988, T52, T53);
+					     ri[WS(rs, 10)] = TE + T2t;
+					     T48 = FNMS(KP250000000, T2t, TE);
+					     T4V = T4L - T4H;
+					     T4M = T4H + T4L;
+					     T4a = FNMS(KP559016994, T49, T48);
+					     T4q = FMA(KP559016994, T49, T48);
+					}
+				   }
+				   {
+					E T2x, T4Q, T4B, T4D, T4R, T2A, T51, T55;
+					{
+					     E T4x, T50, T4Y, T4A, T4Z;
+					     T4E = T4v + T4w;
+					     T4x = T4v - T4w;
+					     ri[WS(rs, 18)] = FMA(KP951056516, T4p, T4a);
+					     ri[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a);
+					     ri[WS(rs, 6)] = FMA(KP951056516, T4r, T4q);
+					     ri[WS(rs, 14)] = FNMS(KP951056516, T4r, T4q);
+					     T50 = T4W - T4X;
+					     T4Y = T4W + T4X;
+					     T4A = T4y - T4z;
+					     T4F = T4y + T4z;
+					     T2x = T2v + T2w;
+					     T4Q = T2v - T2w;
+					     ii[WS(rs, 10)] = T4Y + T4V;
+					     T4Z = FNMS(KP250000000, T4Y, T4V);
+					     T4B = FMA(KP618033988, T4A, T4x);
+					     T4D = FNMS(KP618033988, T4x, T4A);
+					     T4R = T2y - T2z;
+					     T2A = T2y + T2z;
+					     T51 = FNMS(KP559016994, T50, T4Z);
+					     T55 = FMA(KP559016994, T50, T4Z);
+					}
+					{
+					     E T4t, T4s, T2B, T4u, T4C;
+					     T2B = T2x + T2A;
+					     T4t = T2x - T2A;
+					     ii[WS(rs, 18)] = FNMS(KP951056516, T54, T51);
+					     ii[WS(rs, 2)] = FMA(KP951056516, T54, T51);
+					     ii[WS(rs, 14)] = FMA(KP951056516, T56, T55);
+					     ii[WS(rs, 6)] = FNMS(KP951056516, T56, T55);
+					     ri[0] = T2u + T2B;
+					     T4s = FNMS(KP250000000, T2B, T2u);
+					     T4u = FMA(KP559016994, T4t, T4s);
+					     T4C = FNMS(KP559016994, T4t, T4s);
+					     T4U = FNMS(KP618033988, T4Q, T4R);
+					     T4S = FMA(KP618033988, T4R, T4Q);
+					     ri[WS(rs, 16)] = FMA(KP951056516, T4B, T4u);
+					     ri[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u);
+					     ri[WS(rs, 8)] = FMA(KP951056516, T4D, T4C);
+					     ri[WS(rs, 12)] = FNMS(KP951056516, T4D, T4C);
+					}
+				   }
+			      }
+			      {
+				   E T3O, T5u, T5w, T5l, T5q, T5o;
+				   {
+					E T5n, T5m, T2I, T4O, T3N, T3L, T2X, T5t, T4N, T5s, T3c, T3v, T3K, T4G;
+					T5n = T3n + T3u;
+					T3v = T3n - T3u;
+					T3K = T3C - T3J;
+					T5m = T3C + T3J;
+					T3O = T2C + T2H;
+					T2I = T2C - T2H;
+					T4O = T4E - T4F;
+					T4G = T4E + T4F;
+					T3N = FMA(KP618033988, T3v, T3K);
+					T3L = FNMS(KP618033988, T3K, T3v);
+					T2X = T2P + T2W;
+					T5t = T2P - T2W;
+					ii[0] = T4G + T4M;
+					T4N = FNMS(KP250000000, T4G, T4M);
+					T5s = T34 - T3b;
+					T3c = T34 + T3b;
+					{
+					     E T3f, T3e, T4P, T4T, T3d, T3M, T3g;
+					     T4P = FMA(KP559016994, T4O, T4N);
+					     T4T = FNMS(KP559016994, T4O, T4N);
+					     T3f = T2X - T3c;
+					     T3d = T2X + T3c;
+					     ii[WS(rs, 16)] = FNMS(KP951056516, T4S, T4P);
+					     ii[WS(rs, 4)] = FMA(KP951056516, T4S, T4P);
+					     ii[WS(rs, 12)] = FMA(KP951056516, T4U, T4T);
+					     ii[WS(rs, 8)] = FNMS(KP951056516, T4U, T4T);
+					     ri[WS(rs, 15)] = T2I + T3d;
+					     T3e = FNMS(KP250000000, T3d, T2I);
+					     T5u = FNMS(KP618033988, T5t, T5s);
+					     T5w = FMA(KP618033988, T5s, T5t);
+					     T5l = T58 + T57;
+					     T59 = T57 - T58;
+					     T3M = FMA(KP559016994, T3f, T3e);
+					     T3g = FNMS(KP559016994, T3f, T3e);
+					     ri[WS(rs, 7)] = FNMS(KP951056516, T3L, T3g);
+					     ri[WS(rs, 3)] = FMA(KP951056516, T3L, T3g);
+					     ri[WS(rs, 19)] = FNMS(KP951056516, T3N, T3M);
+					     ri[WS(rs, 11)] = FMA(KP951056516, T3N, T3M);
+					     T5q = T5m - T5n;
+					     T5o = T5m + T5n;
+					}
+				   }
+				   {
+					E T5a, T5b, T47, T45, T5g, T5h, T3V, T3X, T41, T44, T5p, T3W, T46, T3Y;
+					T5a = T3Z + T40;
+					T41 = T3Z - T40;
+					T44 = T42 - T43;
+					T5b = T42 + T43;
+					ii[WS(rs, 15)] = T5o + T5l;
+					T5p = FNMS(KP250000000, T5o, T5l);
+					T47 = FNMS(KP618033988, T41, T44);
+					T45 = FMA(KP618033988, T44, T41);
+					{
+					     E T5r, T5v, T3R, T3U;
+					     T5r = FNMS(KP559016994, T5q, T5p);
+					     T5v = FMA(KP559016994, T5q, T5p);
+					     T3R = T3P + T3Q;
+					     T5g = T3P - T3Q;
+					     T5h = T3S - T3T;
+					     T3U = T3S + T3T;
+					     ii[WS(rs, 7)] = FMA(KP951056516, T5u, T5r);
+					     ii[WS(rs, 3)] = FNMS(KP951056516, T5u, T5r);
+					     ii[WS(rs, 19)] = FMA(KP951056516, T5w, T5v);
+					     ii[WS(rs, 11)] = FNMS(KP951056516, T5w, T5v);
+					     T3V = T3R + T3U;
+					     T3X = T3R - T3U;
+					}
+					ri[WS(rs, 5)] = T3O + T3V;
+					T3W = FNMS(KP250000000, T3V, T3O);
+					T5i = FMA(KP618033988, T5h, T5g);
+					T5k = FNMS(KP618033988, T5g, T5h);
+					T46 = FNMS(KP559016994, T3X, T3W);
+					T3Y = FMA(KP559016994, T3X, T3W);
+					ri[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
+					ri[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
+					ri[WS(rs, 17)] = FNMS(KP951056516, T47, T46);
+					ri[WS(rs, 13)] = FMA(KP951056516, T47, T46);
+					T5e = T5a - T5b;
+					T5c = T5a + T5b;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 5)] = T5c + T59;
+	       T5d = FNMS(KP250000000, T5c, T59);
+	       T5j = FNMS(KP559016994, T5e, T5d);
+	       T5f = FMA(KP559016994, T5e, T5d);
+	       ii[WS(rs, 9)] = FMA(KP951056516, T5i, T5f);
+	       ii[WS(rs, 1)] = FNMS(KP951056516, T5i, T5f);
+	       ii[WS(rs, 17)] = FMA(KP951056516, T5k, T5j);
+	       ii[WS(rs, 13)] = FNMS(KP951056516, T5k, T5j);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {136, 58, 140, 0}, 0, 0, 0 };
+
+void X(codelet_t2_20) (planner *p) {
+     X(kdft_dit_register) (p, t2_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include t.h */
+
+/*
+ * This function contains 276 FP additions, 164 FP multiplications,
+ * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
+ * 123 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "t.h"
+
+static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
+	       E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
+	       E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
+	       {
+		    E T7, T16, Ta, T13, T4, T17, Tb, T12;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 T1h = Tm - Tn;
+			 T1f = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 T16 = Tg * T6;
+			 Ta = T2 * T6;
+			 T13 = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 T17 = Ti * T3;
+			 Tb = T5 * T3;
+			 T12 = Tg * T3;
+		    }
+		    T8 = T4 - T7;
+		    T14 = T12 + T13;
+		    T1Q = T16 + T17;
+		    Tc = Ta + Tb;
+		    T1O = T12 - T13;
+		    T1v = Ta - Tb;
+		    T18 = T16 - T17;
+		    T1t = T4 + T7;
+		    {
+			 E T1l, T1m, T1g, T1i;
+			 T1l = T1f * T6;
+			 T1m = T1h * T3;
+			 T1n = T1l + T1m;
+			 T24 = T1l - T1m;
+			 T1g = T1f * T3;
+			 T1i = T1h * T6;
+			 T1j = T1g - T1i;
+			 T22 = T1g + T1i;
+			 {
+			      E Tl, Tp, Ts, Tt;
+			      Tl = Tk * T3;
+			      Tp = To * T6;
+			      Tq = Tl + Tp;
+			      Ts = Tk * T6;
+			      Tt = To * T3;
+			      Tu = Ts - Tt;
+			      T1E = Tl - Tp;
+			      T1G = Ts + Tt;
+			      Tx = W[6];
+			      Ty = W[7];
+			      Tz = FMA(Tk, Tx, To * Ty);
+			      TJ = FMA(Tq, Tx, Tu * Ty);
+			      T1Z = FNMS(T1h, Tx, T1f * Ty);
+			      TB = FNMS(To, Tx, Tk * Ty);
+			      T1X = FMA(T1f, Tx, T1h * Ty);
+			      T1A = FNMS(T5, Tx, T2 * Ty);
+			      TZ = FNMS(Ti, Tx, Tg * Ty);
+			      TL = FNMS(Tu, Tx, Tq * Ty);
+			      T1y = FMA(T2, Tx, T5 * Ty);
+			      TX = FMA(Tg, Tx, Ti * Ty);
+			 }
+		    }
+	       }
+	       {
+		    E TF, T2b, T4A, T4J, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T4o, T3X;
+		    E T3Y, T44, T2f, T2g, T2h, T2n, T2s, T4L, T3g, T3h, T4w, T3n, T3o, T3p, T30;
+		    E T35, T36, TW, T1r, T1s, T3J, T3M, T4n, T3U, T3V, T43, T2c, T2d, T2e, T2y;
+		    E T2D, T4K, T3d, T3e, T4v, T3k, T3l, T3m, T2P, T2U, T2V;
+		    {
+			 E T1, T48, Te, T47, Tw, T2H, TD, T2I, T9, Td;
+			 T1 = ri[0];
+			 T48 = ii[0];
+			 T9 = ri[WS(rs, 10)];
+			 Td = ii[WS(rs, 10)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T47 = FNMS(Tc, T9, T8 * Td);
+			 {
+			      E Tr, Tv, TA, TC;
+			      Tr = ri[WS(rs, 5)];
+			      Tv = ii[WS(rs, 5)];
+			      Tw = FMA(Tq, Tr, Tu * Tv);
+			      T2H = FNMS(Tu, Tr, Tq * Tv);
+			      TA = ri[WS(rs, 15)];
+			      TC = ii[WS(rs, 15)];
+			      TD = FMA(Tz, TA, TB * TC);
+			      T2I = FNMS(TB, TA, Tz * TC);
+			 }
+			 {
+			      E Tf, TE, T4y, T4z;
+			      Tf = T1 + Te;
+			      TE = Tw + TD;
+			      TF = Tf - TE;
+			      T2b = Tf + TE;
+			      T4y = T48 - T47;
+			      T4z = Tw - TD;
+			      T4A = T4y - T4z;
+			      T4J = T4z + T4y;
+			 }
+			 {
+			      E T2G, T2J, T46, T49;
+			      T2G = T1 - Te;
+			      T2J = T2H - T2I;
+			      T2K = T2G - T2J;
+			      T3r = T2G + T2J;
+			      T46 = T2H + T2I;
+			      T49 = T47 + T48;
+			      T4a = T46 + T49;
+			      T4m = T49 - T46;
+			 }
+		    }
+		    {
+			 E T1D, T3A, T2l, T2W, T27, T3E, T2r, T34, T1M, T3B, T2m, T2Z, T1W, T3D, T2q;
+			 E T31;
+			 {
+			      E T1x, T2j, T1C, T2k;
+			      {
+				   E T1u, T1w, T1z, T1B;
+				   T1u = ri[WS(rs, 8)];
+				   T1w = ii[WS(rs, 8)];
+				   T1x = FMA(T1t, T1u, T1v * T1w);
+				   T2j = FNMS(T1v, T1u, T1t * T1w);
+				   T1z = ri[WS(rs, 18)];
+				   T1B = ii[WS(rs, 18)];
+				   T1C = FMA(T1y, T1z, T1A * T1B);
+				   T2k = FNMS(T1A, T1z, T1y * T1B);
+			      }
+			      T1D = T1x + T1C;
+			      T3A = T2j + T2k;
+			      T2l = T2j - T2k;
+			      T2W = T1x - T1C;
+			 }
+			 {
+			      E T21, T32, T26, T33;
+			      {
+				   E T1Y, T20, T23, T25;
+				   T1Y = ri[WS(rs, 17)];
+				   T20 = ii[WS(rs, 17)];
+				   T21 = FMA(T1X, T1Y, T1Z * T20);
+				   T32 = FNMS(T1Z, T1Y, T1X * T20);
+				   T23 = ri[WS(rs, 7)];
+				   T25 = ii[WS(rs, 7)];
+				   T26 = FMA(T22, T23, T24 * T25);
+				   T33 = FNMS(T24, T23, T22 * T25);
+			      }
+			      T27 = T21 + T26;
+			      T3E = T32 + T33;
+			      T2r = T21 - T26;
+			      T34 = T32 - T33;
+			 }
+			 {
+			      E T1I, T2X, T1L, T2Y;
+			      {
+				   E T1F, T1H, T1J, T1K;
+				   T1F = ri[WS(rs, 13)];
+				   T1H = ii[WS(rs, 13)];
+				   T1I = FMA(T1E, T1F, T1G * T1H);
+				   T2X = FNMS(T1G, T1F, T1E * T1H);
+				   T1J = ri[WS(rs, 3)];
+				   T1K = ii[WS(rs, 3)];
+				   T1L = FMA(Tg, T1J, Ti * T1K);
+				   T2Y = FNMS(Ti, T1J, Tg * T1K);
+			      }
+			      T1M = T1I + T1L;
+			      T3B = T2X + T2Y;
+			      T2m = T1I - T1L;
+			      T2Z = T2X - T2Y;
+			 }
+			 {
+			      E T1S, T2o, T1V, T2p;
+			      {
+				   E T1P, T1R, T1T, T1U;
+				   T1P = ri[WS(rs, 12)];
+				   T1R = ii[WS(rs, 12)];
+				   T1S = FMA(T1O, T1P, T1Q * T1R);
+				   T2o = FNMS(T1Q, T1P, T1O * T1R);
+				   T1T = ri[WS(rs, 2)];
+				   T1U = ii[WS(rs, 2)];
+				   T1V = FMA(T1f, T1T, T1h * T1U);
+				   T2p = FNMS(T1h, T1T, T1f * T1U);
+			      }
+			      T1W = T1S + T1V;
+			      T3D = T2o + T2p;
+			      T2q = T2o - T2p;
+			      T31 = T1S - T1V;
+			 }
+			 T1N = T1D - T1M;
+			 T28 = T1W - T27;
+			 T29 = T1N + T28;
+			 T3C = T3A - T3B;
+			 T3F = T3D - T3E;
+			 T4o = T3C + T3F;
+			 T3X = T3A + T3B;
+			 T3Y = T3D + T3E;
+			 T44 = T3X + T3Y;
+			 T2f = T1D + T1M;
+			 T2g = T1W + T27;
+			 T2h = T2f + T2g;
+			 T2n = T2l + T2m;
+			 T2s = T2q + T2r;
+			 T4L = T2n + T2s;
+			 T3g = T2l - T2m;
+			 T3h = T2q - T2r;
+			 T4w = T3g + T3h;
+			 T3n = T2W + T2Z;
+			 T3o = T31 + T34;
+			 T3p = T3n + T3o;
+			 T30 = T2W - T2Z;
+			 T35 = T31 - T34;
+			 T36 = T30 + T35;
+		    }
+		    {
+			 E TO, T3H, T2w, T2L, T1q, T3L, T2C, T2T, TV, T3I, T2x, T2O, T1b, T3K, T2B;
+			 E T2Q;
+			 {
+			      E TI, T2u, TN, T2v;
+			      {
+				   E TG, TH, TK, TM;
+				   TG = ri[WS(rs, 4)];
+				   TH = ii[WS(rs, 4)];
+				   TI = FMA(Tk, TG, To * TH);
+				   T2u = FNMS(To, TG, Tk * TH);
+				   TK = ri[WS(rs, 14)];
+				   TM = ii[WS(rs, 14)];
+				   TN = FMA(TJ, TK, TL * TM);
+				   T2v = FNMS(TL, TK, TJ * TM);
+			      }
+			      TO = TI + TN;
+			      T3H = T2u + T2v;
+			      T2w = T2u - T2v;
+			      T2L = TI - TN;
+			 }
+			 {
+			      E T1e, T2R, T1p, T2S;
+			      {
+				   E T1c, T1d, T1k, T1o;
+				   T1c = ri[WS(rs, 1)];
+				   T1d = ii[WS(rs, 1)];
+				   T1e = FMA(T2, T1c, T5 * T1d);
+				   T2R = FNMS(T5, T1c, T2 * T1d);
+				   T1k = ri[WS(rs, 11)];
+				   T1o = ii[WS(rs, 11)];
+				   T1p = FMA(T1j, T1k, T1n * T1o);
+				   T2S = FNMS(T1n, T1k, T1j * T1o);
+			      }
+			      T1q = T1e + T1p;
+			      T3L = T2R + T2S;
+			      T2C = T1e - T1p;
+			      T2T = T2R - T2S;
+			 }
+			 {
+			      E TR, T2M, TU, T2N;
+			      {
+				   E TP, TQ, TS, TT;
+				   TP = ri[WS(rs, 9)];
+				   TQ = ii[WS(rs, 9)];
+				   TR = FMA(T3, TP, T6 * TQ);
+				   T2M = FNMS(T6, TP, T3 * TQ);
+				   TS = ri[WS(rs, 19)];
+				   TT = ii[WS(rs, 19)];
+				   TU = FMA(Tx, TS, Ty * TT);
+				   T2N = FNMS(Ty, TS, Tx * TT);
+			      }
+			      TV = TR + TU;
+			      T3I = T2M + T2N;
+			      T2x = TR - TU;
+			      T2O = T2M - T2N;
+			 }
+			 {
+			      E T11, T2z, T1a, T2A;
+			      {
+				   E TY, T10, T15, T19;
+				   TY = ri[WS(rs, 16)];
+				   T10 = ii[WS(rs, 16)];
+				   T11 = FMA(TX, TY, TZ * T10);
+				   T2z = FNMS(TZ, TY, TX * T10);
+				   T15 = ri[WS(rs, 6)];
+				   T19 = ii[WS(rs, 6)];
+				   T1a = FMA(T14, T15, T18 * T19);
+				   T2A = FNMS(T18, T15, T14 * T19);
+			      }
+			      T1b = T11 + T1a;
+			      T3K = T2z + T2A;
+			      T2B = T2z - T2A;
+			      T2Q = T11 - T1a;
+			 }
+			 TW = TO - TV;
+			 T1r = T1b - T1q;
+			 T1s = TW + T1r;
+			 T3J = T3H - T3I;
+			 T3M = T3K - T3L;
+			 T4n = T3J + T3M;
+			 T3U = T3H + T3I;
+			 T3V = T3K + T3L;
+			 T43 = T3U + T3V;
+			 T2c = TO + TV;
+			 T2d = T1b + T1q;
+			 T2e = T2c + T2d;
+			 T2y = T2w + T2x;
+			 T2D = T2B + T2C;
+			 T4K = T2y + T2D;
+			 T3d = T2w - T2x;
+			 T3e = T2B - T2C;
+			 T4v = T3d + T3e;
+			 T3k = T2L + T2O;
+			 T3l = T2Q + T2T;
+			 T3m = T3k + T3l;
+			 T2P = T2L - T2O;
+			 T2U = T2Q - T2T;
+			 T2V = T2P + T2U;
+		    }
+		    {
+			 E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z;
+			 T3y = KP559016994 * (T1s - T29);
+			 T2a = T1s + T29;
+			 T3x = FNMS(KP250000000, T2a, TF);
+			 T3G = T3C - T3F;
+			 T3N = T3J - T3M;
+			 T3O = FNMS(KP587785252, T3N, KP951056516 * T3G);
+			 T3Q = FMA(KP951056516, T3N, KP587785252 * T3G);
+			 ri[WS(rs, 10)] = TF + T2a;
+			 T3P = T3y + T3x;
+			 ri[WS(rs, 14)] = T3P - T3Q;
+			 ri[WS(rs, 6)] = T3P + T3Q;
+			 T3z = T3x - T3y;
+			 ri[WS(rs, 2)] = T3z - T3O;
+			 ri[WS(rs, 18)] = T3z + T3O;
+		    }
+		    {
+			 E T4r, T4p, T4q, T4l, T4u, T4j, T4k, T4t, T4s;
+			 T4r = KP559016994 * (T4n - T4o);
+			 T4p = T4n + T4o;
+			 T4q = FNMS(KP250000000, T4p, T4m);
+			 T4j = T1N - T28;
+			 T4k = TW - T1r;
+			 T4l = FNMS(KP587785252, T4k, KP951056516 * T4j);
+			 T4u = FMA(KP951056516, T4k, KP587785252 * T4j);
+			 ii[WS(rs, 10)] = T4p + T4m;
+			 T4t = T4r + T4q;
+			 ii[WS(rs, 6)] = T4t - T4u;
+			 ii[WS(rs, 14)] = T4u + T4t;
+			 T4s = T4q - T4r;
+			 ii[WS(rs, 2)] = T4l + T4s;
+			 ii[WS(rs, 18)] = T4s - T4l;
+		    }
+		    {
+			 E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T;
+			 T3R = KP559016994 * (T2e - T2h);
+			 T2i = T2e + T2h;
+			 T3S = FNMS(KP250000000, T2i, T2b);
+			 T3W = T3U - T3V;
+			 T3Z = T3X - T3Y;
+			 T40 = FMA(KP951056516, T3W, KP587785252 * T3Z);
+			 T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z);
+			 ri[0] = T2b + T2i;
+			 T41 = T3S - T3R;
+			 ri[WS(rs, 12)] = T41 - T42;
+			 ri[WS(rs, 8)] = T41 + T42;
+			 T3T = T3R + T3S;
+			 ri[WS(rs, 4)] = T3T - T40;
+			 ri[WS(rs, 16)] = T3T + T40;
+		    }
+		    {
+			 E T4e, T45, T4f, T4d, T4i, T4b, T4c, T4h, T4g;
+			 T4e = KP559016994 * (T43 - T44);
+			 T45 = T43 + T44;
+			 T4f = FNMS(KP250000000, T45, T4a);
+			 T4b = T2c - T2d;
+			 T4c = T2f - T2g;
+			 T4d = FMA(KP951056516, T4b, KP587785252 * T4c);
+			 T4i = FNMS(KP587785252, T4b, KP951056516 * T4c);
+			 ii[0] = T45 + T4a;
+			 T4h = T4f - T4e;
+			 ii[WS(rs, 8)] = T4h - T4i;
+			 ii[WS(rs, 12)] = T4i + T4h;
+			 T4g = T4e + T4f;
+			 ii[WS(rs, 4)] = T4d + T4g;
+			 ii[WS(rs, 16)] = T4g - T4d;
+		    }
+		    {
+			 E T39, T37, T38, T2F, T3b, T2t, T2E, T3c, T3a;
+			 T39 = KP559016994 * (T2V - T36);
+			 T37 = T2V + T36;
+			 T38 = FNMS(KP250000000, T37, T2K);
+			 T2t = T2n - T2s;
+			 T2E = T2y - T2D;
+			 T2F = FNMS(KP587785252, T2E, KP951056516 * T2t);
+			 T3b = FMA(KP951056516, T2E, KP587785252 * T2t);
+			 ri[WS(rs, 15)] = T2K + T37;
+			 T3c = T39 + T38;
+			 ri[WS(rs, 11)] = T3b + T3c;
+			 ri[WS(rs, 19)] = T3c - T3b;
+			 T3a = T38 - T39;
+			 ri[WS(rs, 3)] = T2F + T3a;
+			 ri[WS(rs, 7)] = T3a - T2F;
+		    }
+		    {
+			 E T4O, T4M, T4N, T4S, T4U, T4Q, T4R, T4T, T4P;
+			 T4O = KP559016994 * (T4K - T4L);
+			 T4M = T4K + T4L;
+			 T4N = FNMS(KP250000000, T4M, T4J);
+			 T4Q = T30 - T35;
+			 T4R = T2P - T2U;
+			 T4S = FNMS(KP587785252, T4R, KP951056516 * T4Q);
+			 T4U = FMA(KP951056516, T4R, KP587785252 * T4Q);
+			 ii[WS(rs, 15)] = T4M + T4J;
+			 T4T = T4O + T4N;
+			 ii[WS(rs, 11)] = T4T - T4U;
+			 ii[WS(rs, 19)] = T4U + T4T;
+			 T4P = T4N - T4O;
+			 ii[WS(rs, 3)] = T4P - T4S;
+			 ii[WS(rs, 7)] = T4S + T4P;
+		    }
+		    {
+			 E T3q, T3s, T3t, T3j, T3v, T3f, T3i, T3w, T3u;
+			 T3q = KP559016994 * (T3m - T3p);
+			 T3s = T3m + T3p;
+			 T3t = FNMS(KP250000000, T3s, T3r);
+			 T3f = T3d - T3e;
+			 T3i = T3g - T3h;
+			 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
+			 T3v = FNMS(KP587785252, T3f, KP951056516 * T3i);
+			 ri[WS(rs, 5)] = T3r + T3s;
+			 T3w = T3t - T3q;
+			 ri[WS(rs, 13)] = T3v + T3w;
+			 ri[WS(rs, 17)] = T3w - T3v;
+			 T3u = T3q + T3t;
+			 ri[WS(rs, 1)] = T3j + T3u;
+			 ri[WS(rs, 9)] = T3u - T3j;
+		    }
+		    {
+			 E T4x, T4B, T4C, T4G, T4I, T4E, T4F, T4H, T4D;
+			 T4x = KP559016994 * (T4v - T4w);
+			 T4B = T4v + T4w;
+			 T4C = FNMS(KP250000000, T4B, T4A);
+			 T4E = T3k - T3l;
+			 T4F = T3n - T3o;
+			 T4G = FMA(KP951056516, T4E, KP587785252 * T4F);
+			 T4I = FNMS(KP587785252, T4E, KP951056516 * T4F);
+			 ii[WS(rs, 5)] = T4B + T4A;
+			 T4H = T4C - T4x;
+			 ii[WS(rs, 13)] = T4H - T4I;
+			 ii[WS(rs, 17)] = T4I + T4H;
+			 T4D = T4x + T4C;
+			 ii[WS(rs, 1)] = T4D - T4G;
+			 ii[WS(rs, 9)] = T4G + T4D;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {204, 92, 72, 0}, 0, 0, 0 };
+
+void X(codelet_t2_20) (planner *p) {
+     X(kdft_dit_register) (p, t2_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1619 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:11 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 25 -name t2_25 -include t.h */
+
+/*
+ * This function contains 440 FP additions, 434 FP multiplications,
+ * (or, 84 additions, 78 multiplications, 356 fused multiply/add),
+ * 215 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "t.h"
+
+static void t2_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T8c, T7k, T7i, T8i, T8g, T8b, T7j, T7b, T8d, T8h;
+	       {
+		    E T2, T8, T3, T6, Tk, Tv, TS, T4, Ta, TD, T2L, T10, Tm, T5, Tc;
+		    T2 = W[0];
+		    T8 = W[4];
+		    T3 = W[2];
+		    T6 = W[3];
+		    Tk = W[6];
+		    Tv = T2 * T8;
+		    TS = T3 * T8;
+		    T4 = T2 * T3;
+		    Ta = T2 * T6;
+		    TD = T8 * Tk;
+		    T2L = T2 * Tk;
+		    T10 = T3 * Tk;
+		    Tm = W[7];
+		    T5 = W[1];
+		    Tc = W[5];
+		    {
+			 E T7G, T86, T4s, T6a, T4g, TN, T4f, T7C, T7s, T7B, T5q, T6k, T3a, T5j, T6n;
+			 E T6m, T5g, T4a, T5n, T6j, T6C, T4G, T6z, T4z, T1v, T3t, T6y, T4w, T6B, T4D;
+			 E T6v, T4O, T6s, T4V, T21, T3H, T6r, T4S, T6u, T4L, T26, T3K, T5a, T2A, T3U;
+			 E T53, T2c, T3M, T2k, T3O;
+			 {
+			      E T11, T1b, Tb, T19, T7, T2m, TT, T15, T2Q, TX, T2p, T1g, T2a, T2e, T2i;
+			      E T27, T1c, T1O, T1K, T1q, T1m, T2x, T2t, T1W, T1S, T2G, T3Y, T2N, T5p, T38;
+			      E T48, T5i, T2K, T40, T2S, T41;
+			      {
+				   E T2M, T1j, T1l, T2X, T2U, T35, T31, T7r, T7p, T7o, T2O, T2R;
+				   {
+					E T1, Tj, T4j, TK, T4q, TC, T4o, Tt, T4l;
+					{
+					     E TE, Tw, TI, TA, Th, Tr, Tn, Td, Te, Ti, T14, T2P, TH, Tx, TB;
+					     T1 = ri[0];
+					     T11 = FMA(T6, Tm, T10);
+					     T14 = T3 * Tm;
+					     T2P = T2 * Tm;
+					     TH = T8 * Tm;
+					     T2M = FMA(T5, Tm, T2L);
+					     T1b = FNMS(T5, T3, Ta);
+					     Tb = FMA(T5, T3, Ta);
+					     T19 = FMA(T5, T6, T4);
+					     T7 = FNMS(T5, T6, T4);
+					     T2m = FNMS(T6, Tc, TS);
+					     TT = FMA(T6, Tc, TS);
+					     TE = FMA(Tc, Tm, TD);
+					     T1j = FMA(T5, Tc, Tv);
+					     Tw = FNMS(T5, Tc, Tv);
+					     {
+						  E TW, Tz, T1f, T2d;
+						  TW = T3 * Tc;
+						  Tz = T2 * Tc;
+						  T15 = FNMS(T6, Tk, T14);
+						  T2Q = FNMS(T5, Tk, T2P);
+						  TI = FNMS(Tc, Tk, TH);
+						  T1f = T19 * Tc;
+						  T2d = T19 * Tk;
+						  {
+						       E T2h, T1a, Tg, Tq;
+						       T2h = T19 * Tm;
+						       T1a = T19 * T8;
+						       Tg = T7 * Tc;
+						       Tq = T7 * Tm;
+						       {
+							    E Tl, T9, T1p, T1k;
+							    Tl = T7 * Tk;
+							    T9 = T7 * T8;
+							    T1p = T1j * Tm;
+							    T1k = T1j * Tk;
+							    {
+								 E T34, T30, T1N, T1J;
+								 T34 = TT * Tm;
+								 T30 = TT * Tk;
+								 T1N = Tw * Tm;
+								 T1J = Tw * Tk;
+								 TX = FNMS(T6, T8, TW);
+								 T2p = FMA(T6, T8, TW);
+								 TA = FMA(T5, T8, Tz);
+								 T1l = FNMS(T5, T8, Tz);
+								 T1g = FMA(T1b, T8, T1f);
+								 T2a = FNMS(T1b, T8, T1f);
+								 T2e = FMA(T1b, Tm, T2d);
+								 T2i = FNMS(T1b, Tk, T2h);
+								 T27 = FMA(T1b, Tc, T1a);
+								 T1c = FNMS(T1b, Tc, T1a);
+								 T2X = FMA(Tb, T8, Tg);
+								 Th = FNMS(Tb, T8, Tg);
+								 Tr = FNMS(Tb, Tk, Tq);
+								 Tn = FMA(Tb, Tm, Tl);
+								 Td = FMA(Tb, Tc, T9);
+								 T2U = FNMS(Tb, Tc, T9);
+								 T35 = FNMS(TX, Tk, T34);
+								 T31 = FMA(TX, Tm, T30);
+								 T1O = FNMS(TA, Tk, T1N);
+								 T1K = FMA(TA, Tm, T1J);
+								 T1q = FNMS(T1l, Tk, T1p);
+								 T1m = FMA(T1l, Tm, T1k);
+								 {
+								      E T2w, T2s, T1V, T1R;
+								      T2w = T27 * Tm;
+								      T2s = T27 * Tk;
+								      T1V = Td * Tm;
+								      T1R = Td * Tk;
+								      T2x = FNMS(T2a, Tk, T2w);
+								      T2t = FMA(T2a, Tm, T2s);
+								      T1W = FNMS(Th, Tk, T1V);
+								      T1S = FMA(Th, Tm, T1R);
+								      T7r = ii[0];
+								      Te = ri[WS(rs, 5)];
+								      Ti = ii[WS(rs, 5)];
+								 }
+							    }
+						       }
+						  }
+					     }
+					     {
+						  E TF, TJ, Tf, T4i, TG, T4p;
+						  TF = ri[WS(rs, 15)];
+						  TJ = ii[WS(rs, 15)];
+						  Tf = Td * Te;
+						  T4i = Td * Ti;
+						  TG = TE * TF;
+						  T4p = TE * TJ;
+						  Tj = FMA(Th, Ti, Tf);
+						  T4j = FNMS(Th, Te, T4i);
+						  TK = FMA(TI, TJ, TG);
+						  T4q = FNMS(TI, TF, T4p);
+					     }
+					     Tx = ri[WS(rs, 10)];
+					     TB = ii[WS(rs, 10)];
+					     {
+						  E To, Ts, Ty, T4n, Tp, T4k;
+						  To = ri[WS(rs, 20)];
+						  Ts = ii[WS(rs, 20)];
+						  Ty = Tw * Tx;
+						  T4n = Tw * TB;
+						  Tp = Tn * To;
+						  T4k = Tn * Ts;
+						  TC = FMA(TA, TB, Ty);
+						  T4o = FNMS(TA, Tx, T4n);
+						  Tt = FMA(Tr, Ts, Tp);
+						  T4l = FNMS(Tr, To, T4k);
+					     }
+					}
+					{
+					     E TL, T7F, T4r, Tu, T7E, T4m, TM;
+					     TL = TC + TK;
+					     T7F = TC - TK;
+					     T4r = T4o - T4q;
+					     T7p = T4o + T4q;
+					     Tu = Tj + Tt;
+					     T7E = Tj - Tt;
+					     T4m = T4j - T4l;
+					     T7o = T4j + T4l;
+					     T7G = FMA(KP618033988, T7F, T7E);
+					     T86 = FNMS(KP618033988, T7E, T7F);
+					     T4s = FMA(KP618033988, T4r, T4m);
+					     T6a = FNMS(KP618033988, T4m, T4r);
+					     T4g = Tu - TL;
+					     TM = Tu + TL;
+					     TN = T1 + TM;
+					     T4f = FNMS(KP250000000, TM, T1);
+					}
+				   }
+				   {
+					E T2D, T2F, T7q, T2E, T3X;
+					T2D = ri[WS(rs, 3)];
+					T2F = ii[WS(rs, 3)];
+					T7C = T7o - T7p;
+					T7q = T7o + T7p;
+					T2E = T3 * T2D;
+					T3X = T3 * T2F;
+					{
+					     E T2V, T2W, T2Y, T32, T36;
+					     T2V = ri[WS(rs, 13)];
+					     T7s = T7q + T7r;
+					     T7B = FNMS(KP250000000, T7q, T7r);
+					     T2G = FMA(T6, T2F, T2E);
+					     T3Y = FNMS(T6, T2D, T3X);
+					     T2W = T2U * T2V;
+					     T2Y = ii[WS(rs, 13)];
+					     T32 = ri[WS(rs, 18)];
+					     T36 = ii[WS(rs, 18)];
+					     {
+						  E T2H, T2I, T2J, T3Z;
+						  {
+						       E T2Z, T45, T37, T47, T44, T33, T46;
+						       T2H = ri[WS(rs, 8)];
+						       T2Z = FMA(T2X, T2Y, T2W);
+						       T44 = T2U * T2Y;
+						       T33 = T31 * T32;
+						       T46 = T31 * T36;
+						       T2I = T1j * T2H;
+						       T45 = FNMS(T2X, T2V, T44);
+						       T37 = FMA(T35, T36, T33);
+						       T47 = FNMS(T35, T32, T46);
+						       T2J = ii[WS(rs, 8)];
+						       T2N = ri[WS(rs, 23)];
+						       T5p = T2Z - T37;
+						       T38 = T2Z + T37;
+						       T48 = T45 + T47;
+						       T5i = T47 - T45;
+						       T3Z = T1j * T2J;
+						       T2O = T2M * T2N;
+						       T2R = ii[WS(rs, 23)];
+						  }
+						  T2K = FMA(T1l, T2J, T2I);
+						  T40 = FNMS(T1l, T2H, T3Z);
+					     }
+					}
+				   }
+				   T2S = FMA(T2Q, T2R, T2O);
+				   T41 = T2M * T2R;
+			      }
+			      {
+				   E TR, T3h, T1t, T4F, T3r, T4y, TZ, T3j, T17, T3l;
+				   {
+					E T12, T16, T13, T3k;
+					{
+					     E TO, TP, T5m, T5l, TQ;
+					     {
+						  E T2T, T5o, T42, T5f, T39;
+						  TO = ri[WS(rs, 1)];
+						  T2T = T2K + T2S;
+						  T5o = T2K - T2S;
+						  T42 = FNMS(T2Q, T2N, T41);
+						  TP = T2 * TO;
+						  T5q = FMA(KP618033988, T5p, T5o);
+						  T6k = FNMS(KP618033988, T5o, T5p);
+						  T5f = T38 - T2T;
+						  T39 = T2T + T38;
+						  {
+						       E T43, T5h, T5e, T49;
+						       T43 = T40 + T42;
+						       T5h = T42 - T40;
+						       T5e = FNMS(KP250000000, T39, T2G);
+						       T3a = T2G + T39;
+						       T5j = FMA(KP618033988, T5i, T5h);
+						       T6n = FNMS(KP618033988, T5h, T5i);
+						       T5m = T48 - T43;
+						       T49 = T43 + T48;
+						       T6m = FMA(KP559016994, T5f, T5e);
+						       T5g = FNMS(KP559016994, T5f, T5e);
+						       T5l = FNMS(KP250000000, T49, T3Y);
+						       T4a = T3Y + T49;
+						       TQ = ii[WS(rs, 1)];
+						  }
+					     }
+					     {
+						  E T1n, T1r, T1i, T1o, T3o, T3p;
+						  {
+						       E T1d, T1h, T1e, T3n, T3g;
+						       T1d = ri[WS(rs, 11)];
+						       T1h = ii[WS(rs, 11)];
+						       T5n = FNMS(KP559016994, T5m, T5l);
+						       T6j = FMA(KP559016994, T5m, T5l);
+						       TR = FMA(T5, TQ, TP);
+						       T3g = T2 * TQ;
+						       T1e = T1c * T1d;
+						       T3n = T1c * T1h;
+						       T1n = ri[WS(rs, 16)];
+						       T3h = FNMS(T5, TO, T3g);
+						       T1r = ii[WS(rs, 16)];
+						       T1i = FMA(T1g, T1h, T1e);
+						       T1o = T1m * T1n;
+						       T3o = FNMS(T1g, T1d, T3n);
+						       T3p = T1m * T1r;
+						  }
+						  {
+						       E TU, TY, TV, T3i, T3q, T1s;
+						       TU = ri[WS(rs, 6)];
+						       T1s = FMA(T1q, T1r, T1o);
+						       TY = ii[WS(rs, 6)];
+						       T3q = FNMS(T1q, T1n, T3p);
+						       TV = TT * TU;
+						       T1t = T1i + T1s;
+						       T4F = T1s - T1i;
+						       T3i = TT * TY;
+						       T3r = T3o + T3q;
+						       T4y = T3q - T3o;
+						       T12 = ri[WS(rs, 21)];
+						       T16 = ii[WS(rs, 21)];
+						       TZ = FMA(TX, TY, TV);
+						       T3j = FNMS(TX, TU, T3i);
+						       T13 = T11 * T12;
+						       T3k = T11 * T16;
+						  }
+					     }
+					}
+					T17 = FMA(T15, T16, T13);
+					T3l = FNMS(T15, T12, T3k);
+				   }
+				   {
+					E T1z, T3v, T4N, T1Z, T3F, T4U, T1D, T3x, T1H, T3z;
+					{
+					     E T1E, T1G, T1F, T3y;
+					     {
+						  E T1w, T1y, T1x, T4v, T4C, T4u, T4B, T3u, T18, T4E;
+						  T1w = ri[WS(rs, 4)];
+						  T1y = ii[WS(rs, 4)];
+						  T18 = TZ + T17;
+						  T4E = T17 - TZ;
+						  {
+						       E T3m, T4x, T1u, T3s;
+						       T3m = T3j + T3l;
+						       T4x = T3j - T3l;
+						       T1x = T7 * T1w;
+						       T6C = FNMS(KP618033988, T4E, T4F);
+						       T4G = FMA(KP618033988, T4F, T4E);
+						       T1u = T18 + T1t;
+						       T4v = T18 - T1t;
+						       T6z = FMA(KP618033988, T4x, T4y);
+						       T4z = FNMS(KP618033988, T4y, T4x);
+						       T3s = T3m + T3r;
+						       T4C = T3m - T3r;
+						       T1v = TR + T1u;
+						       T4u = FNMS(KP250000000, T1u, TR);
+						       T3t = T3h + T3s;
+						       T4B = FNMS(KP250000000, T3s, T3h);
+						       T3u = T7 * T1y;
+						  }
+						  T6y = FNMS(KP559016994, T4v, T4u);
+						  T4w = FMA(KP559016994, T4v, T4u);
+						  T6B = FNMS(KP559016994, T4C, T4B);
+						  T4D = FMA(KP559016994, T4C, T4B);
+						  T1z = FMA(Tb, T1y, T1x);
+						  T3v = FNMS(Tb, T1w, T3u);
+					     }
+					     {
+						  E T1Q, T3C, T1Y, T3E;
+						  {
+						       E T1L, T1P, T1T, T1X, T1M, T3B, T1U, T3D;
+						       T1L = ri[WS(rs, 14)];
+						       T1P = ii[WS(rs, 14)];
+						       T1T = ri[WS(rs, 19)];
+						       T1X = ii[WS(rs, 19)];
+						       T1M = T1K * T1L;
+						       T3B = T1K * T1P;
+						       T1U = T1S * T1T;
+						       T3D = T1S * T1X;
+						       T1Q = FMA(T1O, T1P, T1M);
+						       T3C = FNMS(T1O, T1L, T3B);
+						       T1Y = FMA(T1W, T1X, T1U);
+						       T3E = FNMS(T1W, T1T, T3D);
+						  }
+						  {
+						       E T1A, T1C, T1B, T3w;
+						       T1A = ri[WS(rs, 9)];
+						       T1C = ii[WS(rs, 9)];
+						       T4N = T1Y - T1Q;
+						       T1Z = T1Q + T1Y;
+						       T3F = T3C + T3E;
+						       T4U = T3E - T3C;
+						       T1B = T8 * T1A;
+						       T3w = T8 * T1C;
+						       T1E = ri[WS(rs, 24)];
+						       T1G = ii[WS(rs, 24)];
+						       T1D = FMA(Tc, T1C, T1B);
+						       T3x = FNMS(Tc, T1A, T3w);
+						       T1F = Tk * T1E;
+						       T3y = Tk * T1G;
+						  }
+					     }
+					     T1H = FMA(Tm, T1G, T1F);
+					     T3z = FNMS(Tm, T1E, T3y);
+					}
+					{
+					     E T2f, T2j, T2g, T3N;
+					     {
+						  E T23, T25, T24, T4R, T4K, T4Q, T4J, T3J, T1I, T4M;
+						  T23 = ri[WS(rs, 2)];
+						  T25 = ii[WS(rs, 2)];
+						  T1I = T1D + T1H;
+						  T4M = T1H - T1D;
+						  {
+						       E T3A, T4T, T20, T3G;
+						       T3A = T3x + T3z;
+						       T4T = T3z - T3x;
+						       T24 = T19 * T23;
+						       T6v = FNMS(KP618033988, T4M, T4N);
+						       T4O = FMA(KP618033988, T4N, T4M);
+						       T20 = T1I + T1Z;
+						       T4R = T1I - T1Z;
+						       T6s = FNMS(KP618033988, T4T, T4U);
+						       T4V = FMA(KP618033988, T4U, T4T);
+						       T3G = T3A + T3F;
+						       T4K = T3F - T3A;
+						       T21 = T1z + T20;
+						       T4Q = FNMS(KP250000000, T20, T1z);
+						       T3H = T3v + T3G;
+						       T4J = FNMS(KP250000000, T3G, T3v);
+						       T3J = T19 * T25;
+						  }
+						  T6r = FNMS(KP559016994, T4R, T4Q);
+						  T4S = FMA(KP559016994, T4R, T4Q);
+						  T6u = FMA(KP559016994, T4K, T4J);
+						  T4L = FNMS(KP559016994, T4K, T4J);
+						  T26 = FMA(T1b, T25, T24);
+						  T3K = FNMS(T1b, T23, T3J);
+					     }
+					     {
+						  E T2r, T3R, T2z, T3T;
+						  {
+						       E T2n, T2q, T2u, T2y, T2o, T3Q, T2v, T3S;
+						       T2n = ri[WS(rs, 12)];
+						       T2q = ii[WS(rs, 12)];
+						       T2u = ri[WS(rs, 17)];
+						       T2y = ii[WS(rs, 17)];
+						       T2o = T2m * T2n;
+						       T3Q = T2m * T2q;
+						       T2v = T2t * T2u;
+						       T3S = T2t * T2y;
+						       T2r = FMA(T2p, T2q, T2o);
+						       T3R = FNMS(T2p, T2n, T3Q);
+						       T2z = FMA(T2x, T2y, T2v);
+						       T3T = FNMS(T2x, T2u, T3S);
+						  }
+						  {
+						       E T28, T2b, T29, T3L;
+						       T28 = ri[WS(rs, 7)];
+						       T2b = ii[WS(rs, 7)];
+						       T5a = T2z - T2r;
+						       T2A = T2r + T2z;
+						       T3U = T3R + T3T;
+						       T53 = T3R - T3T;
+						       T29 = T27 * T28;
+						       T3L = T27 * T2b;
+						       T2f = ri[WS(rs, 22)];
+						       T2j = ii[WS(rs, 22)];
+						       T2c = FMA(T2a, T2b, T29);
+						       T3M = FNMS(T2a, T28, T3L);
+						       T2g = T2e * T2f;
+						       T3N = T2e * T2j;
+						  }
+					     }
+					     T2k = FMA(T2i, T2j, T2g);
+					     T3O = FNMS(T2i, T2f, T3N);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7l, T5b, T6d, T54, T6g, T51, T6f, T7m, T6c, T58, T4e, T4c, T7A, T7y, T4d;
+			      E T3f;
+			      {
+				   E T7w, T22, T7x, T3b, T3I, T3c, T3e, T3d;
+				   T7l = T3t + T3H;
+				   T3I = T3t - T3H;
+				   {
+					E T2l, T59, T3P, T52;
+					T2l = T2c + T2k;
+					T59 = T2k - T2c;
+					T3P = T3M + T3O;
+					T52 = T3O - T3M;
+					T5b = FMA(KP618033988, T5a, T59);
+					T6d = FNMS(KP618033988, T59, T5a);
+					{
+					     E T50, T2B, T57, T3V;
+					     T50 = T2A - T2l;
+					     T2B = T2l + T2A;
+					     T54 = FNMS(KP618033988, T53, T52);
+					     T6g = FMA(KP618033988, T52, T53);
+					     T57 = T3U - T3P;
+					     T3V = T3P + T3U;
+					     {
+						  E T4Z, T2C, T56, T3W, T4b;
+						  T4Z = FNMS(KP250000000, T2B, T26);
+						  T2C = T26 + T2B;
+						  T56 = FNMS(KP250000000, T3V, T3K);
+						  T3W = T3K + T3V;
+						  T7w = T1v - T21;
+						  T22 = T1v + T21;
+						  T51 = FNMS(KP559016994, T50, T4Z);
+						  T6f = FMA(KP559016994, T50, T4Z);
+						  T4b = T3W - T4a;
+						  T7m = T3W + T4a;
+						  T6c = FMA(KP559016994, T57, T56);
+						  T58 = FNMS(KP559016994, T57, T56);
+						  T7x = T2C - T3a;
+						  T3b = T2C + T3a;
+						  T4e = FNMS(KP618033988, T3I, T4b);
+						  T4c = FMA(KP618033988, T4b, T3I);
+					     }
+					}
+				   }
+				   T3c = T22 + T3b;
+				   T3e = T22 - T3b;
+				   ri[0] = TN + T3c;
+				   T3d = FNMS(KP250000000, T3c, TN);
+				   T7A = FNMS(KP618033988, T7w, T7x);
+				   T7y = FMA(KP618033988, T7x, T7w);
+				   T4d = FNMS(KP559016994, T3e, T3d);
+				   T3f = FMA(KP559016994, T3e, T3d);
+			      }
+			      {
+				   E T69, T85, T7Y, T68, T66, T84, T82, T7X, T67, T5Z;
+				   {
+					E T4t, T5H, T5Q, T7T, T7H, T5P, T5M, T5L, T5A, T7O, T5D, T7P, T7K, T7M, T5u;
+					E T5w, T5K, T63, T61, T5U, T7D, T7z, T7v;
+					{
+					     E T7u, T7t, T4h, T7n;
+					     T69 = FNMS(KP559016994, T4g, T4f);
+					     T4h = FMA(KP559016994, T4g, T4f);
+					     T7u = T7l - T7m;
+					     T7n = T7l + T7m;
+					     ri[WS(rs, 5)] = FMA(KP951056516, T4c, T3f);
+					     ri[WS(rs, 20)] = FNMS(KP951056516, T4c, T3f);
+					     ri[WS(rs, 15)] = FMA(KP951056516, T4e, T4d);
+					     ri[WS(rs, 10)] = FNMS(KP951056516, T4e, T4d);
+					     ii[0] = T7n + T7s;
+					     T7t = FNMS(KP250000000, T7n, T7s);
+					     T4t = FMA(KP951056516, T4s, T4h);
+					     T5H = FNMS(KP951056516, T4s, T4h);
+					     T7D = FMA(KP559016994, T7C, T7B);
+					     T85 = FNMS(KP559016994, T7C, T7B);
+					     T7z = FNMS(KP559016994, T7u, T7t);
+					     T7v = FMA(KP559016994, T7u, T7t);
+					}
+					{
+					     E T5I, T5J, T5S, T4P, T5y, T4I, T5C, T5s, T4W, T5T, T55, T5c;
+					     {
+						  E T4A, T4H, T5k, T5r;
+						  T5Q = FNMS(KP951056516, T4z, T4w);
+						  T4A = FMA(KP951056516, T4z, T4w);
+						  T7T = FMA(KP951056516, T7G, T7D);
+						  T7H = FNMS(KP951056516, T7G, T7D);
+						  ii[WS(rs, 20)] = FMA(KP951056516, T7y, T7v);
+						  ii[WS(rs, 5)] = FNMS(KP951056516, T7y, T7v);
+						  ii[WS(rs, 15)] = FNMS(KP951056516, T7A, T7z);
+						  ii[WS(rs, 10)] = FMA(KP951056516, T7A, T7z);
+						  T4H = FMA(KP951056516, T4G, T4D);
+						  T5P = FNMS(KP951056516, T4G, T4D);
+						  T5I = FMA(KP951056516, T5j, T5g);
+						  T5k = FNMS(KP951056516, T5j, T5g);
+						  T5r = FNMS(KP951056516, T5q, T5n);
+						  T5J = FMA(KP951056516, T5q, T5n);
+						  T5S = FNMS(KP951056516, T4O, T4L);
+						  T4P = FMA(KP951056516, T4O, T4L);
+						  T5y = FNMS(KP256756360, T4A, T4H);
+						  T4I = FMA(KP256756360, T4H, T4A);
+						  T5C = FNMS(KP939062505, T5k, T5r);
+						  T5s = FMA(KP939062505, T5r, T5k);
+						  T4W = FNMS(KP951056516, T4V, T4S);
+						  T5T = FMA(KP951056516, T4V, T4S);
+						  T5M = FMA(KP951056516, T54, T51);
+						  T55 = FNMS(KP951056516, T54, T51);
+						  T5c = FMA(KP951056516, T5b, T58);
+						  T5L = FNMS(KP951056516, T5b, T58);
+					     }
+					     {
+						  E T4Y, T5t, T5z, T4X;
+						  T5z = FNMS(KP634619297, T4P, T4W);
+						  T4X = FMA(KP634619297, T4W, T4P);
+						  {
+						       E T5B, T5d, T7I, T7J;
+						       T5B = FNMS(KP549754652, T55, T5c);
+						       T5d = FMA(KP549754652, T5c, T55);
+						       T7I = FNMS(KP871714437, T5z, T5y);
+						       T5A = FMA(KP871714437, T5z, T5y);
+						       T4Y = FMA(KP871714437, T4X, T4I);
+						       T7O = FNMS(KP871714437, T4X, T4I);
+						       T7J = FMA(KP831864738, T5C, T5B);
+						       T5D = FNMS(KP831864738, T5C, T5B);
+						       T5t = FMA(KP831864738, T5s, T5d);
+						       T7P = FNMS(KP831864738, T5s, T5d);
+						       T7K = FMA(KP904730450, T7J, T7I);
+						       T7M = FNMS(KP904730450, T7J, T7I);
+						  }
+						  T5u = FMA(KP904730450, T5t, T4Y);
+						  T5w = FNMS(KP904730450, T5t, T4Y);
+					     }
+					     T5K = FNMS(KP126329378, T5J, T5I);
+					     T63 = FMA(KP126329378, T5I, T5J);
+					     T61 = FNMS(KP827271945, T5S, T5T);
+					     T5U = FMA(KP827271945, T5T, T5S);
+					}
+					{
+					     E T65, T81, T62, T80, T7W, T5W, T5Y;
+					     {
+						  E T5O, T5V, T64, T5N;
+						  ri[WS(rs, 1)] = FMA(KP968583161, T5u, T4t);
+						  T64 = FMA(KP470564281, T5L, T5M);
+						  T5N = FNMS(KP470564281, T5M, T5L);
+						  {
+						       E T60, T5R, T7U, T7V;
+						       T60 = FNMS(KP634619297, T5P, T5Q);
+						       T5R = FMA(KP634619297, T5Q, T5P);
+						       T7U = FMA(KP912018591, T64, T63);
+						       T65 = FNMS(KP912018591, T64, T63);
+						       T5O = FNMS(KP912018591, T5N, T5K);
+						       T81 = FMA(KP912018591, T5N, T5K);
+						       T7V = FNMS(KP912575812, T61, T60);
+						       T62 = FMA(KP912575812, T61, T60);
+						       T5V = FNMS(KP912575812, T5U, T5R);
+						       T80 = FMA(KP912575812, T5U, T5R);
+						       T7W = FMA(KP851038619, T7V, T7U);
+						       T7Y = FNMS(KP851038619, T7V, T7U);
+						       ii[WS(rs, 1)] = FMA(KP968583161, T7K, T7H);
+						  }
+						  T5W = FNMS(KP851038619, T5V, T5O);
+						  T5Y = FMA(KP851038619, T5V, T5O);
+					     }
+					     {
+						  E T5G, T5E, T7S, T7Q, T7L, T5F, T5x, T5v, T5X, T7R, T7N;
+						  T5G = FNMS(KP683113946, T5A, T5D);
+						  T5E = FMA(KP559154169, T5D, T5A);
+						  ii[WS(rs, 4)] = FNMS(KP992114701, T7W, T7T);
+						  ri[WS(rs, 4)] = FNMS(KP992114701, T5W, T5H);
+						  T5v = FNMS(KP242145790, T5u, T4t);
+						  T7S = FNMS(KP683113946, T7O, T7P);
+						  T7Q = FMA(KP559154169, T7P, T7O);
+						  T7L = FNMS(KP242145790, T7K, T7H);
+						  T5F = FNMS(KP541454447, T5w, T5v);
+						  T5x = FMA(KP541454447, T5w, T5v);
+						  T68 = FMA(KP525970792, T62, T65);
+						  T66 = FNMS(KP726211448, T65, T62);
+						  ri[WS(rs, 11)] = FNMS(KP833417178, T5G, T5F);
+						  ri[WS(rs, 16)] = FMA(KP833417178, T5G, T5F);
+						  ri[WS(rs, 21)] = FNMS(KP921177326, T5E, T5x);
+						  ri[WS(rs, 6)] = FMA(KP921177326, T5E, T5x);
+						  T7R = FNMS(KP541454447, T7M, T7L);
+						  T7N = FMA(KP541454447, T7M, T7L);
+						  T5X = FMA(KP248028675, T5W, T5H);
+						  ii[WS(rs, 11)] = FMA(KP833417178, T7S, T7R);
+						  ii[WS(rs, 16)] = FNMS(KP833417178, T7S, T7R);
+						  ii[WS(rs, 21)] = FMA(KP921177326, T7Q, T7N);
+						  ii[WS(rs, 6)] = FNMS(KP921177326, T7Q, T7N);
+						  T84 = FNMS(KP525970792, T80, T81);
+						  T82 = FMA(KP726211448, T81, T80);
+						  T7X = FMA(KP248028675, T7W, T7T);
+						  T67 = FNMS(KP554608978, T5Y, T5X);
+						  T5Z = FMA(KP554608978, T5Y, T5X);
+					     }
+					}
+				   }
+				   {
+					E T6b, T6T, T8j, T87, T72, T71, T6P, T8r, T6M, T8q, T7f, T6W, T8m, T8o, T6I;
+					E T6G, T7d, T76, T7g, T6Z, T83, T7Z;
+					ri[WS(rs, 14)] = FNMS(KP943557151, T68, T67);
+					ri[WS(rs, 19)] = FMA(KP943557151, T68, T67);
+					ri[WS(rs, 24)] = FMA(KP803003575, T66, T5Z);
+					ri[WS(rs, 9)] = FNMS(KP803003575, T66, T5Z);
+					T83 = FNMS(KP554608978, T7Y, T7X);
+					T7Z = FMA(KP554608978, T7Y, T7X);
+					T6b = FMA(KP951056516, T6a, T69);
+					T6T = FNMS(KP951056516, T6a, T69);
+					ii[WS(rs, 14)] = FMA(KP943557151, T84, T83);
+					ii[WS(rs, 19)] = FNMS(KP943557151, T84, T83);
+					ii[WS(rs, 24)] = FMA(KP803003575, T82, T7Z);
+					ii[WS(rs, 9)] = FNMS(KP803003575, T82, T7Z);
+					{
+					     E T6X, T6Y, T74, T6N, T6i, T75, T6U, T6V, T6t, T6L, T6E, T6O, T6p, T6w;
+					     {
+						  E T6A, T6D, T6e, T6h, T6l, T6o;
+						  T6X = FNMS(KP951056516, T6d, T6c);
+						  T6e = FMA(KP951056516, T6d, T6c);
+						  T6h = FMA(KP951056516, T6g, T6f);
+						  T6Y = FNMS(KP951056516, T6g, T6f);
+						  T74 = FMA(KP951056516, T6z, T6y);
+						  T6A = FNMS(KP951056516, T6z, T6y);
+						  T8j = FNMS(KP951056516, T86, T85);
+						  T87 = FMA(KP951056516, T86, T85);
+						  T6N = FNMS(KP062914667, T6e, T6h);
+						  T6i = FMA(KP062914667, T6h, T6e);
+						  T6D = FMA(KP951056516, T6C, T6B);
+						  T75 = FNMS(KP951056516, T6C, T6B);
+						  T6U = FMA(KP951056516, T6k, T6j);
+						  T6l = FNMS(KP951056516, T6k, T6j);
+						  T6o = FNMS(KP951056516, T6n, T6m);
+						  T6V = FMA(KP951056516, T6n, T6m);
+						  T72 = FMA(KP951056516, T6s, T6r);
+						  T6t = FNMS(KP951056516, T6s, T6r);
+						  T6L = FNMS(KP939062505, T6A, T6D);
+						  T6E = FMA(KP939062505, T6D, T6A);
+						  T6O = FMA(KP827271945, T6l, T6o);
+						  T6p = FNMS(KP827271945, T6o, T6l);
+						  T6w = FMA(KP951056516, T6v, T6u);
+						  T71 = FNMS(KP951056516, T6v, T6u);
+					     }
+					     {
+						  E T8k, T6q, T6K, T6x, T8l, T6F;
+						  T8k = FMA(KP772036680, T6O, T6N);
+						  T6P = FNMS(KP772036680, T6O, T6N);
+						  T6q = FMA(KP772036680, T6p, T6i);
+						  T8r = FNMS(KP772036680, T6p, T6i);
+						  T6K = FMA(KP126329378, T6t, T6w);
+						  T6x = FNMS(KP126329378, T6w, T6t);
+						  T8l = FNMS(KP734762448, T6L, T6K);
+						  T6M = FMA(KP734762448, T6L, T6K);
+						  T6F = FNMS(KP734762448, T6E, T6x);
+						  T8q = FMA(KP734762448, T6E, T6x);
+						  T7f = FNMS(KP062914667, T6U, T6V);
+						  T6W = FMA(KP062914667, T6V, T6U);
+						  T8m = FMA(KP994076283, T8l, T8k);
+						  T8o = FNMS(KP994076283, T8l, T8k);
+						  T6I = FMA(KP994076283, T6F, T6q);
+						  T6G = FNMS(KP994076283, T6F, T6q);
+					     }
+					     T7d = FNMS(KP549754652, T74, T75);
+					     T76 = FMA(KP549754652, T75, T74);
+					     T7g = FNMS(KP634619297, T6X, T6Y);
+					     T6Z = FMA(KP634619297, T6Y, T6X);
+					}
+					{
+					     E T88, T7h, T70, T8f, T7c, T73;
+					     ri[WS(rs, 3)] = FMA(KP998026728, T6G, T6b);
+					     T88 = FMA(KP845997307, T7g, T7f);
+					     T7h = FNMS(KP845997307, T7g, T7f);
+					     T70 = FMA(KP845997307, T6Z, T6W);
+					     T8f = FNMS(KP845997307, T6Z, T6W);
+					     T7c = FMA(KP470564281, T71, T72);
+					     T73 = FNMS(KP470564281, T72, T71);
+					     ii[WS(rs, 3)] = FNMS(KP998026728, T8m, T8j);
+					     {
+						  E T7e, T8e, T8a, T78, T7a, T8u, T8s, T8t, T8p, T79;
+						  {
+						       E T6S, T6Q, T6H, T89, T77, T6J, T6R, T8n;
+						       T6S = FMA(KP614372930, T6M, T6P);
+						       T6Q = FNMS(KP621716863, T6P, T6M);
+						       T89 = FNMS(KP968479752, T7d, T7c);
+						       T7e = FMA(KP968479752, T7d, T7c);
+						       T77 = FMA(KP968479752, T76, T73);
+						       T8e = FNMS(KP968479752, T76, T73);
+						       T8a = FMA(KP906616052, T89, T88);
+						       T8c = FNMS(KP906616052, T89, T88);
+						       T78 = FMA(KP906616052, T77, T70);
+						       T7a = FNMS(KP906616052, T77, T70);
+						       T6H = FNMS(KP249506682, T6G, T6b);
+						       ii[WS(rs, 2)] = FNMS(KP998026728, T8a, T87);
+						       ri[WS(rs, 2)] = FMA(KP998026728, T78, T6T);
+						       T8u = FNMS(KP614372930, T8q, T8r);
+						       T8s = FMA(KP621716863, T8r, T8q);
+						       T6J = FNMS(KP557913902, T6I, T6H);
+						       T6R = FMA(KP557913902, T6I, T6H);
+						       T8n = FMA(KP249506682, T8m, T8j);
+						       ri[WS(rs, 18)] = FNMS(KP949179823, T6S, T6R);
+						       ri[WS(rs, 13)] = FMA(KP949179823, T6S, T6R);
+						       ri[WS(rs, 8)] = FMA(KP943557151, T6Q, T6J);
+						       ri[WS(rs, 23)] = FNMS(KP943557151, T6Q, T6J);
+						       T8t = FNMS(KP557913902, T8o, T8n);
+						       T8p = FMA(KP557913902, T8o, T8n);
+						  }
+						  T7k = FNMS(KP560319534, T7e, T7h);
+						  T7i = FMA(KP681693190, T7h, T7e);
+						  ii[WS(rs, 23)] = FMA(KP943557151, T8s, T8p);
+						  ii[WS(rs, 8)] = FNMS(KP943557151, T8s, T8p);
+						  ii[WS(rs, 13)] = FMA(KP949179823, T8u, T8t);
+						  ii[WS(rs, 18)] = FNMS(KP949179823, T8u, T8t);
+						  T79 = FNMS(KP249506682, T78, T6T);
+						  T8i = FNMS(KP560319534, T8e, T8f);
+						  T8g = FMA(KP681693190, T8f, T8e);
+						  T8b = FMA(KP249506682, T8a, T87);
+						  T7j = FMA(KP557913902, T7a, T79);
+						  T7b = FNMS(KP557913902, T7a, T79);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 12)] = FNMS(KP949179823, T7k, T7j);
+	       ri[WS(rs, 17)] = FMA(KP949179823, T7k, T7j);
+	       ri[WS(rs, 7)] = FMA(KP860541664, T7i, T7b);
+	       ri[WS(rs, 22)] = FNMS(KP860541664, T7i, T7b);
+	       T8d = FMA(KP557913902, T8c, T8b);
+	       T8h = FNMS(KP557913902, T8c, T8b);
+	       ii[WS(rs, 12)] = FNMS(KP949179823, T8i, T8h);
+	       ii[WS(rs, 17)] = FMA(KP949179823, T8i, T8h);
+	       ii[WS(rs, 22)] = FNMS(KP860541664, T8g, T8d);
+	       ii[WS(rs, 7)] = FMA(KP860541664, T8g, T8d);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 25, "t2_25", twinstr, &GENUS, {84, 78, 356, 0}, 0, 0, 0 };
+
+void X(codelet_t2_25) (planner *p) {
+     X(kdft_dit_register) (p, t2_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 25 -name t2_25 -include t.h */
+
+/*
+ * This function contains 440 FP additions, 340 FP multiplications,
+ * (or, 280 additions, 180 multiplications, 160 fused multiply/add),
+ * 149 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "t.h"
+
+static void t2_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T2, T5, T3, T6, T8, Td, T16, T14, Te, T9, T21, T23, Tx, TR, T1g;
+	       E TB, T1f, TV, T1Q, Tg, T1S, Tk, T18, T2s, T1c, T2q, Tn, To, Tp, Tr;
+	       E T28, T2x, TY, T2k, T2m, T2v, TG, TE, T10, T1h, T1E, T26, T1B, T1G, T1V;
+	       E T1X, T1z, T1j;
+	       {
+		    E Tw, TT, Tz, TQ, Tv, TU, TA, TP;
+		    {
+			 E T4, Tc, T7, Tb;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 T4 = T2 * T3;
+			 Tc = T5 * T3;
+			 T7 = T5 * T6;
+			 Tb = T2 * T6;
+			 T8 = T4 - T7;
+			 Td = Tb + Tc;
+			 T16 = Tb - Tc;
+			 T14 = T4 + T7;
+			 Te = W[5];
+			 Tw = T5 * Te;
+			 TT = T3 * Te;
+			 Tz = T2 * Te;
+			 TQ = T6 * Te;
+			 T9 = W[4];
+			 Tv = T2 * T9;
+			 TU = T6 * T9;
+			 TA = T5 * T9;
+			 TP = T3 * T9;
+		    }
+		    T21 = TP - TQ;
+		    T23 = TT + TU;
+		    {
+			 E T15, T17, Ta, Tf, T1a, T1b, Ti, Tj;
+			 Tx = Tv - Tw;
+			 TR = TP + TQ;
+			 T1g = Tz - TA;
+			 TB = Tz + TA;
+			 T1f = Tv + Tw;
+			 TV = TT - TU;
+			 T15 = T14 * T9;
+			 T17 = T16 * Te;
+			 T1Q = T15 + T17;
+			 Ta = T8 * T9;
+			 Tf = Td * Te;
+			 Tg = Ta + Tf;
+			 T1a = T14 * Te;
+			 T1b = T16 * T9;
+			 T1S = T1a - T1b;
+			 Ti = T8 * Te;
+			 Tj = Td * T9;
+			 Tk = Ti - Tj;
+			 T18 = T15 - T17;
+			 T2s = Ti + Tj;
+			 T1c = T1a + T1b;
+			 T2q = Ta - Tf;
+			 Tn = W[6];
+			 To = W[7];
+			 Tp = FMA(T8, Tn, Td * To);
+			 Tr = FNMS(Td, Tn, T8 * To);
+			 T28 = FNMS(T1S, Tn, T1Q * To);
+			 T2x = FNMS(TV, Tn, TR * To);
+			 TY = FMA(T3, Tn, T6 * To);
+			 T2k = FMA(T2, Tn, T5 * To);
+			 T2m = FNMS(T5, Tn, T2 * To);
+			 T2v = FMA(TR, Tn, TV * To);
+			 TG = FNMS(Te, Tn, T9 * To);
+			 TE = FMA(T9, Tn, Te * To);
+			 T10 = FNMS(T6, Tn, T3 * To);
+			 T1h = FMA(T1f, Tn, T1g * To);
+			 T1E = FMA(Tg, Tn, Tk * To);
+			 T26 = FMA(T1Q, Tn, T1S * To);
+			 T1B = FNMS(TB, Tn, Tx * To);
+			 T1G = FNMS(Tk, Tn, Tg * To);
+			 T1V = FMA(T14, Tn, T16 * To);
+			 T1X = FNMS(T16, Tn, T14 * To);
+			 T1z = FMA(Tx, Tn, TB * To);
+			 T1j = FNMS(T1g, Tn, T1f * To);
+		    }
+	       }
+	       {
+		    E T1, T6v, T2F, T6I, TK, T2G, T6u, T6J, T6N, T7c, T2O, T52, T2C, T6k, T48;
+		    E T5X, T4L, T5s, T4j, T5W, T4K, T5v, T1o, T6g, T30, T5M, T4A, T56, T3b, T5N;
+		    E T4B, T59, T1L, T6h, T3n, T5Q, T4D, T5g, T3y, T5P, T4E, T5d, T2d, T6j, T3L;
+		    E T5T, T4I, T5l, T3W, T5U, T4H, T5o;
+		    {
+			 E Tm, T2I, Tt, T2J, Tu, T6s, TD, T2L, TI, T2M, TJ, T6t;
+			 T1 = ri[0];
+			 T6v = ii[0];
+			 {
+			      E Th, Tl, Tq, Ts;
+			      Th = ri[WS(rs, 5)];
+			      Tl = ii[WS(rs, 5)];
+			      Tm = FMA(Tg, Th, Tk * Tl);
+			      T2I = FNMS(Tk, Th, Tg * Tl);
+			      Tq = ri[WS(rs, 20)];
+			      Ts = ii[WS(rs, 20)];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T2J = FNMS(Tr, Tq, Tp * Ts);
+			 }
+			 Tu = Tm + Tt;
+			 T6s = T2I + T2J;
+			 {
+			      E Ty, TC, TF, TH;
+			      Ty = ri[WS(rs, 10)];
+			      TC = ii[WS(rs, 10)];
+			      TD = FMA(Tx, Ty, TB * TC);
+			      T2L = FNMS(TB, Ty, Tx * TC);
+			      TF = ri[WS(rs, 15)];
+			      TH = ii[WS(rs, 15)];
+			      TI = FMA(TE, TF, TG * TH);
+			      T2M = FNMS(TG, TF, TE * TH);
+			 }
+			 TJ = TD + TI;
+			 T6t = T2L + T2M;
+			 T2F = KP559016994 * (Tu - TJ);
+			 T6I = KP559016994 * (T6s - T6t);
+			 TK = Tu + TJ;
+			 T2G = FNMS(KP250000000, TK, T1);
+			 T6u = T6s + T6t;
+			 T6J = FNMS(KP250000000, T6u, T6v);
+			 {
+			      E T6L, T6M, T2K, T2N;
+			      T6L = Tm - Tt;
+			      T6M = TD - TI;
+			      T6N = FMA(KP951056516, T6L, KP587785252 * T6M);
+			      T7c = FNMS(KP587785252, T6L, KP951056516 * T6M);
+			      T2K = T2I - T2J;
+			      T2N = T2L - T2M;
+			      T2O = FMA(KP951056516, T2K, KP587785252 * T2N);
+			      T52 = FNMS(KP587785252, T2K, KP951056516 * T2N);
+			 }
+		    }
+		    {
+			 E T2g, T4c, T43, T46, T4h, T4g, T49, T4a, T4d, T2p, T2A, T2B, T2e, T2f;
+			 T2e = ri[WS(rs, 3)];
+			 T2f = ii[WS(rs, 3)];
+			 T2g = FMA(T3, T2e, T6 * T2f);
+			 T4c = FNMS(T6, T2e, T3 * T2f);
+			 {
+			      E T2j, T41, T2z, T45, T2o, T42, T2u, T44;
+			      {
+				   E T2h, T2i, T2w, T2y;
+				   T2h = ri[WS(rs, 8)];
+				   T2i = ii[WS(rs, 8)];
+				   T2j = FMA(T1f, T2h, T1g * T2i);
+				   T41 = FNMS(T1g, T2h, T1f * T2i);
+				   T2w = ri[WS(rs, 18)];
+				   T2y = ii[WS(rs, 18)];
+				   T2z = FMA(T2v, T2w, T2x * T2y);
+				   T45 = FNMS(T2x, T2w, T2v * T2y);
+			      }
+			      {
+				   E T2l, T2n, T2r, T2t;
+				   T2l = ri[WS(rs, 23)];
+				   T2n = ii[WS(rs, 23)];
+				   T2o = FMA(T2k, T2l, T2m * T2n);
+				   T42 = FNMS(T2m, T2l, T2k * T2n);
+				   T2r = ri[WS(rs, 13)];
+				   T2t = ii[WS(rs, 13)];
+				   T2u = FMA(T2q, T2r, T2s * T2t);
+				   T44 = FNMS(T2s, T2r, T2q * T2t);
+			      }
+			      T43 = T41 - T42;
+			      T46 = T44 - T45;
+			      T4h = T2u - T2z;
+			      T4g = T2j - T2o;
+			      T49 = T41 + T42;
+			      T4a = T44 + T45;
+			      T4d = T49 + T4a;
+			      T2p = T2j + T2o;
+			      T2A = T2u + T2z;
+			      T2B = T2p + T2A;
+			 }
+			 T2C = T2g + T2B;
+			 T6k = T4c + T4d;
+			 {
+			      E T47, T5r, T40, T5q, T3Y, T3Z;
+			      T47 = FMA(KP951056516, T43, KP587785252 * T46);
+			      T5r = FNMS(KP587785252, T43, KP951056516 * T46);
+			      T3Y = KP559016994 * (T2p - T2A);
+			      T3Z = FNMS(KP250000000, T2B, T2g);
+			      T40 = T3Y + T3Z;
+			      T5q = T3Z - T3Y;
+			      T48 = T40 + T47;
+			      T5X = T5q + T5r;
+			      T4L = T40 - T47;
+			      T5s = T5q - T5r;
+			 }
+			 {
+			      E T4i, T5t, T4f, T5u, T4b, T4e;
+			      T4i = FMA(KP951056516, T4g, KP587785252 * T4h);
+			      T5t = FNMS(KP587785252, T4g, KP951056516 * T4h);
+			      T4b = KP559016994 * (T49 - T4a);
+			      T4e = FNMS(KP250000000, T4d, T4c);
+			      T4f = T4b + T4e;
+			      T5u = T4e - T4b;
+			      T4j = T4f - T4i;
+			      T5W = T5u - T5t;
+			      T4K = T4i + T4f;
+			      T5v = T5t + T5u;
+			 }
+		    }
+		    {
+			 E TO, T34, T2V, T2Y, T39, T38, T31, T32, T35, T13, T1m, T1n, TM, TN;
+			 TM = ri[WS(rs, 1)];
+			 TN = ii[WS(rs, 1)];
+			 TO = FMA(T2, TM, T5 * TN);
+			 T34 = FNMS(T5, TM, T2 * TN);
+			 {
+			      E TX, T2T, T1l, T2X, T12, T2U, T1e, T2W;
+			      {
+				   E TS, TW, T1i, T1k;
+				   TS = ri[WS(rs, 6)];
+				   TW = ii[WS(rs, 6)];
+				   TX = FMA(TR, TS, TV * TW);
+				   T2T = FNMS(TV, TS, TR * TW);
+				   T1i = ri[WS(rs, 16)];
+				   T1k = ii[WS(rs, 16)];
+				   T1l = FMA(T1h, T1i, T1j * T1k);
+				   T2X = FNMS(T1j, T1i, T1h * T1k);
+			      }
+			      {
+				   E TZ, T11, T19, T1d;
+				   TZ = ri[WS(rs, 21)];
+				   T11 = ii[WS(rs, 21)];
+				   T12 = FMA(TY, TZ, T10 * T11);
+				   T2U = FNMS(T10, TZ, TY * T11);
+				   T19 = ri[WS(rs, 11)];
+				   T1d = ii[WS(rs, 11)];
+				   T1e = FMA(T18, T19, T1c * T1d);
+				   T2W = FNMS(T1c, T19, T18 * T1d);
+			      }
+			      T2V = T2T - T2U;
+			      T2Y = T2W - T2X;
+			      T39 = T1e - T1l;
+			      T38 = TX - T12;
+			      T31 = T2T + T2U;
+			      T32 = T2W + T2X;
+			      T35 = T31 + T32;
+			      T13 = TX + T12;
+			      T1m = T1e + T1l;
+			      T1n = T13 + T1m;
+			 }
+			 T1o = TO + T1n;
+			 T6g = T34 + T35;
+			 {
+			      E T2Z, T55, T2S, T54, T2Q, T2R;
+			      T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
+			      T55 = FNMS(KP587785252, T2V, KP951056516 * T2Y);
+			      T2Q = KP559016994 * (T13 - T1m);
+			      T2R = FNMS(KP250000000, T1n, TO);
+			      T2S = T2Q + T2R;
+			      T54 = T2R - T2Q;
+			      T30 = T2S + T2Z;
+			      T5M = T54 + T55;
+			      T4A = T2S - T2Z;
+			      T56 = T54 - T55;
+			 }
+			 {
+			      E T3a, T57, T37, T58, T33, T36;
+			      T3a = FMA(KP951056516, T38, KP587785252 * T39);
+			      T57 = FNMS(KP587785252, T38, KP951056516 * T39);
+			      T33 = KP559016994 * (T31 - T32);
+			      T36 = FNMS(KP250000000, T35, T34);
+			      T37 = T33 + T36;
+			      T58 = T36 - T33;
+			      T3b = T37 - T3a;
+			      T5N = T58 - T57;
+			      T4B = T3a + T37;
+			      T59 = T57 + T58;
+			 }
+		    }
+		    {
+			 E T1r, T3r, T3i, T3l, T3w, T3v, T3o, T3p, T3s, T1y, T1J, T1K, T1p, T1q;
+			 T1p = ri[WS(rs, 4)];
+			 T1q = ii[WS(rs, 4)];
+			 T1r = FMA(T8, T1p, Td * T1q);
+			 T3r = FNMS(Td, T1p, T8 * T1q);
+			 {
+			      E T1u, T3g, T1I, T3k, T1x, T3h, T1D, T3j;
+			      {
+				   E T1s, T1t, T1F, T1H;
+				   T1s = ri[WS(rs, 9)];
+				   T1t = ii[WS(rs, 9)];
+				   T1u = FMA(T9, T1s, Te * T1t);
+				   T3g = FNMS(Te, T1s, T9 * T1t);
+				   T1F = ri[WS(rs, 19)];
+				   T1H = ii[WS(rs, 19)];
+				   T1I = FMA(T1E, T1F, T1G * T1H);
+				   T3k = FNMS(T1G, T1F, T1E * T1H);
+			      }
+			      {
+				   E T1v, T1w, T1A, T1C;
+				   T1v = ri[WS(rs, 24)];
+				   T1w = ii[WS(rs, 24)];
+				   T1x = FMA(Tn, T1v, To * T1w);
+				   T3h = FNMS(To, T1v, Tn * T1w);
+				   T1A = ri[WS(rs, 14)];
+				   T1C = ii[WS(rs, 14)];
+				   T1D = FMA(T1z, T1A, T1B * T1C);
+				   T3j = FNMS(T1B, T1A, T1z * T1C);
+			      }
+			      T3i = T3g - T3h;
+			      T3l = T3j - T3k;
+			      T3w = T1D - T1I;
+			      T3v = T1u - T1x;
+			      T3o = T3g + T3h;
+			      T3p = T3j + T3k;
+			      T3s = T3o + T3p;
+			      T1y = T1u + T1x;
+			      T1J = T1D + T1I;
+			      T1K = T1y + T1J;
+			 }
+			 T1L = T1r + T1K;
+			 T6h = T3r + T3s;
+			 {
+			      E T3m, T5f, T3f, T5e, T3d, T3e;
+			      T3m = FMA(KP951056516, T3i, KP587785252 * T3l);
+			      T5f = FNMS(KP587785252, T3i, KP951056516 * T3l);
+			      T3d = KP559016994 * (T1y - T1J);
+			      T3e = FNMS(KP250000000, T1K, T1r);
+			      T3f = T3d + T3e;
+			      T5e = T3e - T3d;
+			      T3n = T3f + T3m;
+			      T5Q = T5e + T5f;
+			      T4D = T3f - T3m;
+			      T5g = T5e - T5f;
+			 }
+			 {
+			      E T3x, T5b, T3u, T5c, T3q, T3t;
+			      T3x = FMA(KP951056516, T3v, KP587785252 * T3w);
+			      T5b = FNMS(KP587785252, T3v, KP951056516 * T3w);
+			      T3q = KP559016994 * (T3o - T3p);
+			      T3t = FNMS(KP250000000, T3s, T3r);
+			      T3u = T3q + T3t;
+			      T5c = T3t - T3q;
+			      T3y = T3u - T3x;
+			      T5P = T5c - T5b;
+			      T4E = T3x + T3u;
+			      T5d = T5b + T5c;
+			 }
+		    }
+		    {
+			 E T1P, T3P, T3G, T3J, T3U, T3T, T3M, T3N, T3Q, T20, T2b, T2c, T1N, T1O;
+			 T1N = ri[WS(rs, 2)];
+			 T1O = ii[WS(rs, 2)];
+			 T1P = FMA(T14, T1N, T16 * T1O);
+			 T3P = FNMS(T16, T1N, T14 * T1O);
+			 {
+			      E T1U, T3E, T2a, T3I, T1Z, T3F, T25, T3H;
+			      {
+				   E T1R, T1T, T27, T29;
+				   T1R = ri[WS(rs, 7)];
+				   T1T = ii[WS(rs, 7)];
+				   T1U = FMA(T1Q, T1R, T1S * T1T);
+				   T3E = FNMS(T1S, T1R, T1Q * T1T);
+				   T27 = ri[WS(rs, 17)];
+				   T29 = ii[WS(rs, 17)];
+				   T2a = FMA(T26, T27, T28 * T29);
+				   T3I = FNMS(T28, T27, T26 * T29);
+			      }
+			      {
+				   E T1W, T1Y, T22, T24;
+				   T1W = ri[WS(rs, 22)];
+				   T1Y = ii[WS(rs, 22)];
+				   T1Z = FMA(T1V, T1W, T1X * T1Y);
+				   T3F = FNMS(T1X, T1W, T1V * T1Y);
+				   T22 = ri[WS(rs, 12)];
+				   T24 = ii[WS(rs, 12)];
+				   T25 = FMA(T21, T22, T23 * T24);
+				   T3H = FNMS(T23, T22, T21 * T24);
+			      }
+			      T3G = T3E - T3F;
+			      T3J = T3H - T3I;
+			      T3U = T25 - T2a;
+			      T3T = T1U - T1Z;
+			      T3M = T3E + T3F;
+			      T3N = T3H + T3I;
+			      T3Q = T3M + T3N;
+			      T20 = T1U + T1Z;
+			      T2b = T25 + T2a;
+			      T2c = T20 + T2b;
+			 }
+			 T2d = T1P + T2c;
+			 T6j = T3P + T3Q;
+			 {
+			      E T3K, T5k, T3D, T5j, T3B, T3C;
+			      T3K = FMA(KP951056516, T3G, KP587785252 * T3J);
+			      T5k = FNMS(KP587785252, T3G, KP951056516 * T3J);
+			      T3B = KP559016994 * (T20 - T2b);
+			      T3C = FNMS(KP250000000, T2c, T1P);
+			      T3D = T3B + T3C;
+			      T5j = T3C - T3B;
+			      T3L = T3D + T3K;
+			      T5T = T5j + T5k;
+			      T4I = T3D - T3K;
+			      T5l = T5j - T5k;
+			 }
+			 {
+			      E T3V, T5m, T3S, T5n, T3O, T3R;
+			      T3V = FMA(KP951056516, T3T, KP587785252 * T3U);
+			      T5m = FNMS(KP587785252, T3T, KP951056516 * T3U);
+			      T3O = KP559016994 * (T3M - T3N);
+			      T3R = FNMS(KP250000000, T3Q, T3P);
+			      T3S = T3O + T3R;
+			      T5n = T3R - T3O;
+			      T3W = T3S - T3V;
+			      T5U = T5n - T5m;
+			      T4H = T3V + T3S;
+			      T5o = T5m + T5n;
+			 }
+		    }
+		    {
+			 E T6m, T6o, TL, T2E, T6d, T6e, T6n, T6f;
+			 {
+			      E T6i, T6l, T1M, T2D;
+			      T6i = T6g - T6h;
+			      T6l = T6j - T6k;
+			      T6m = FMA(KP951056516, T6i, KP587785252 * T6l);
+			      T6o = FNMS(KP587785252, T6i, KP951056516 * T6l);
+			      TL = T1 + TK;
+			      T1M = T1o + T1L;
+			      T2D = T2d + T2C;
+			      T2E = T1M + T2D;
+			      T6d = KP559016994 * (T1M - T2D);
+			      T6e = FNMS(KP250000000, T2E, TL);
+			 }
+			 ri[0] = TL + T2E;
+			 T6n = T6e - T6d;
+			 ri[WS(rs, 10)] = T6n - T6o;
+			 ri[WS(rs, 15)] = T6n + T6o;
+			 T6f = T6d + T6e;
+			 ri[WS(rs, 20)] = T6f - T6m;
+			 ri[WS(rs, 5)] = T6f + T6m;
+		    }
+		    {
+			 E T6C, T6D, T6w, T6r, T6x, T6y, T6E, T6z;
+			 {
+			      E T6A, T6B, T6p, T6q;
+			      T6A = T1o - T1L;
+			      T6B = T2d - T2C;
+			      T6C = FMA(KP951056516, T6A, KP587785252 * T6B);
+			      T6D = FNMS(KP587785252, T6A, KP951056516 * T6B);
+			      T6w = T6u + T6v;
+			      T6p = T6g + T6h;
+			      T6q = T6j + T6k;
+			      T6r = T6p + T6q;
+			      T6x = KP559016994 * (T6p - T6q);
+			      T6y = FNMS(KP250000000, T6r, T6w);
+			 }
+			 ii[0] = T6r + T6w;
+			 T6E = T6y - T6x;
+			 ii[WS(rs, 10)] = T6D + T6E;
+			 ii[WS(rs, 15)] = T6E - T6D;
+			 T6z = T6x + T6y;
+			 ii[WS(rs, 5)] = T6z - T6C;
+			 ii[WS(rs, 20)] = T6C + T6z;
+		    }
+		    {
+			 E T2P, T4z, T6O, T70, T4m, T6T, T4n, T6S, T4U, T71, T4X, T6Z, T4O, T75, T4P;
+			 E T74, T4s, T6P, T4v, T6H, T2H, T6K;
+			 T2H = T2F + T2G;
+			 T2P = T2H + T2O;
+			 T4z = T2H - T2O;
+			 T6K = T6I + T6J;
+			 T6O = T6K - T6N;
+			 T70 = T6N + T6K;
+			 {
+			      E T3c, T3z, T3A, T3X, T4k, T4l;
+			      T3c = FMA(KP968583161, T30, KP248689887 * T3b);
+			      T3z = FMA(KP535826794, T3n, KP844327925 * T3y);
+			      T3A = T3c + T3z;
+			      T3X = FMA(KP876306680, T3L, KP481753674 * T3W);
+			      T4k = FMA(KP728968627, T48, KP684547105 * T4j);
+			      T4l = T3X + T4k;
+			      T4m = T3A + T4l;
+			      T6T = T3X - T4k;
+			      T4n = KP559016994 * (T3A - T4l);
+			      T6S = T3c - T3z;
+			 }
+			 {
+			      E T4S, T4T, T6X, T4V, T4W, T6Y;
+			      T4S = FNMS(KP844327925, T4A, KP535826794 * T4B);
+			      T4T = FNMS(KP637423989, T4E, KP770513242 * T4D);
+			      T6X = T4S + T4T;
+			      T4V = FMA(KP125333233, T4L, KP992114701 * T4K);
+			      T4W = FMA(KP904827052, T4I, KP425779291 * T4H);
+			      T6Y = T4W + T4V;
+			      T4U = T4S - T4T;
+			      T71 = KP559016994 * (T6X + T6Y);
+			      T4X = T4V - T4W;
+			      T6Z = T6X - T6Y;
+			 }
+			 {
+			      E T4C, T4F, T4G, T4J, T4M, T4N;
+			      T4C = FMA(KP535826794, T4A, KP844327925 * T4B);
+			      T4F = FMA(KP637423989, T4D, KP770513242 * T4E);
+			      T4G = T4C - T4F;
+			      T4J = FNMS(KP425779291, T4I, KP904827052 * T4H);
+			      T4M = FNMS(KP992114701, T4L, KP125333233 * T4K);
+			      T4N = T4J + T4M;
+			      T4O = T4G + T4N;
+			      T75 = T4J - T4M;
+			      T4P = KP559016994 * (T4G - T4N);
+			      T74 = T4C + T4F;
+			 }
+			 {
+			      E T4q, T4r, T6F, T4t, T4u, T6G;
+			      T4q = FNMS(KP248689887, T30, KP968583161 * T3b);
+			      T4r = FNMS(KP844327925, T3n, KP535826794 * T3y);
+			      T6F = T4q + T4r;
+			      T4t = FNMS(KP481753674, T3L, KP876306680 * T3W);
+			      T4u = FNMS(KP684547105, T48, KP728968627 * T4j);
+			      T6G = T4t + T4u;
+			      T4s = T4q - T4r;
+			      T6P = KP559016994 * (T6F - T6G);
+			      T4v = T4t - T4u;
+			      T6H = T6F + T6G;
+			 }
+			 ri[WS(rs, 1)] = T2P + T4m;
+			 ii[WS(rs, 1)] = T6H + T6O;
+			 ri[WS(rs, 4)] = T4z + T4O;
+			 ii[WS(rs, 4)] = T6Z + T70;
+			 {
+			      E T4w, T4y, T4p, T4x, T4o;
+			      T4w = FMA(KP951056516, T4s, KP587785252 * T4v);
+			      T4y = FNMS(KP587785252, T4s, KP951056516 * T4v);
+			      T4o = FNMS(KP250000000, T4m, T2P);
+			      T4p = T4n + T4o;
+			      T4x = T4o - T4n;
+			      ri[WS(rs, 21)] = T4p - T4w;
+			      ri[WS(rs, 16)] = T4x + T4y;
+			      ri[WS(rs, 6)] = T4p + T4w;
+			      ri[WS(rs, 11)] = T4x - T4y;
+			 }
+			 {
+			      E T6U, T6V, T6R, T6W, T6Q;
+			      T6U = FMA(KP951056516, T6S, KP587785252 * T6T);
+			      T6V = FNMS(KP587785252, T6S, KP951056516 * T6T);
+			      T6Q = FNMS(KP250000000, T6H, T6O);
+			      T6R = T6P + T6Q;
+			      T6W = T6Q - T6P;
+			      ii[WS(rs, 6)] = T6R - T6U;
+			      ii[WS(rs, 16)] = T6W - T6V;
+			      ii[WS(rs, 21)] = T6U + T6R;
+			      ii[WS(rs, 11)] = T6V + T6W;
+			 }
+			 {
+			      E T4Y, T50, T4R, T4Z, T4Q;
+			      T4Y = FMA(KP951056516, T4U, KP587785252 * T4X);
+			      T50 = FNMS(KP587785252, T4U, KP951056516 * T4X);
+			      T4Q = FNMS(KP250000000, T4O, T4z);
+			      T4R = T4P + T4Q;
+			      T4Z = T4Q - T4P;
+			      ri[WS(rs, 24)] = T4R - T4Y;
+			      ri[WS(rs, 19)] = T4Z + T50;
+			      ri[WS(rs, 9)] = T4R + T4Y;
+			      ri[WS(rs, 14)] = T4Z - T50;
+			 }
+			 {
+			      E T76, T77, T73, T78, T72;
+			      T76 = FMA(KP951056516, T74, KP587785252 * T75);
+			      T77 = FNMS(KP587785252, T74, KP951056516 * T75);
+			      T72 = FNMS(KP250000000, T6Z, T70);
+			      T73 = T71 + T72;
+			      T78 = T72 - T71;
+			      ii[WS(rs, 9)] = T73 - T76;
+			      ii[WS(rs, 19)] = T78 - T77;
+			      ii[WS(rs, 24)] = T76 + T73;
+			      ii[WS(rs, 14)] = T77 + T78;
+			 }
+		    }
+		    {
+			 E T53, T5L, T7e, T7q, T5y, T7j, T5z, T7i, T66, T7r, T69, T7p, T60, T7v, T61;
+			 E T7u, T5E, T7f, T5H, T7b, T51, T7d;
+			 T51 = T2G - T2F;
+			 T53 = T51 - T52;
+			 T5L = T51 + T52;
+			 T7d = T6J - T6I;
+			 T7e = T7c + T7d;
+			 T7q = T7d - T7c;
+			 {
+			      E T5a, T5h, T5i, T5p, T5w, T5x;
+			      T5a = FMA(KP876306680, T56, KP481753674 * T59);
+			      T5h = FNMS(KP425779291, T5g, KP904827052 * T5d);
+			      T5i = T5a + T5h;
+			      T5p = FMA(KP535826794, T5l, KP844327925 * T5o);
+			      T5w = FMA(KP062790519, T5s, KP998026728 * T5v);
+			      T5x = T5p + T5w;
+			      T5y = T5i + T5x;
+			      T7j = T5p - T5w;
+			      T5z = KP559016994 * (T5i - T5x);
+			      T7i = T5a - T5h;
+			 }
+			 {
+			      E T64, T65, T7n, T67, T68, T7o;
+			      T64 = FNMS(KP684547105, T5M, KP728968627 * T5N);
+			      T65 = FMA(KP125333233, T5Q, KP992114701 * T5P);
+			      T7n = T64 - T65;
+			      T67 = FNMS(KP998026728, T5T, KP062790519 * T5U);
+			      T68 = FMA(KP770513242, T5X, KP637423989 * T5W);
+			      T7o = T67 - T68;
+			      T66 = T64 + T65;
+			      T7r = KP559016994 * (T7n - T7o);
+			      T69 = T67 + T68;
+			      T7p = T7n + T7o;
+			 }
+			 {
+			      E T5O, T5R, T5S, T5V, T5Y, T5Z;
+			      T5O = FMA(KP728968627, T5M, KP684547105 * T5N);
+			      T5R = FNMS(KP992114701, T5Q, KP125333233 * T5P);
+			      T5S = T5O + T5R;
+			      T5V = FMA(KP062790519, T5T, KP998026728 * T5U);
+			      T5Y = FNMS(KP637423989, T5X, KP770513242 * T5W);
+			      T5Z = T5V + T5Y;
+			      T60 = T5S + T5Z;
+			      T7v = T5V - T5Y;
+			      T61 = KP559016994 * (T5S - T5Z);
+			      T7u = T5O - T5R;
+			 }
+			 {
+			      E T5C, T5D, T79, T5F, T5G, T7a;
+			      T5C = FNMS(KP481753674, T56, KP876306680 * T59);
+			      T5D = FMA(KP904827052, T5g, KP425779291 * T5d);
+			      T79 = T5C - T5D;
+			      T5F = FNMS(KP844327925, T5l, KP535826794 * T5o);
+			      T5G = FNMS(KP998026728, T5s, KP062790519 * T5v);
+			      T7a = T5F + T5G;
+			      T5E = T5C + T5D;
+			      T7f = KP559016994 * (T79 - T7a);
+			      T5H = T5F - T5G;
+			      T7b = T79 + T7a;
+			 }
+			 ri[WS(rs, 2)] = T53 + T5y;
+			 ii[WS(rs, 2)] = T7b + T7e;
+			 ri[WS(rs, 3)] = T5L + T60;
+			 ii[WS(rs, 3)] = T7p + T7q;
+			 {
+			      E T5I, T5K, T5B, T5J, T5A;
+			      T5I = FMA(KP951056516, T5E, KP587785252 * T5H);
+			      T5K = FNMS(KP587785252, T5E, KP951056516 * T5H);
+			      T5A = FNMS(KP250000000, T5y, T53);
+			      T5B = T5z + T5A;
+			      T5J = T5A - T5z;
+			      ri[WS(rs, 22)] = T5B - T5I;
+			      ri[WS(rs, 17)] = T5J + T5K;
+			      ri[WS(rs, 7)] = T5B + T5I;
+			      ri[WS(rs, 12)] = T5J - T5K;
+			 }
+			 {
+			      E T7k, T7l, T7h, T7m, T7g;
+			      T7k = FMA(KP951056516, T7i, KP587785252 * T7j);
+			      T7l = FNMS(KP587785252, T7i, KP951056516 * T7j);
+			      T7g = FNMS(KP250000000, T7b, T7e);
+			      T7h = T7f + T7g;
+			      T7m = T7g - T7f;
+			      ii[WS(rs, 7)] = T7h - T7k;
+			      ii[WS(rs, 17)] = T7m - T7l;
+			      ii[WS(rs, 22)] = T7k + T7h;
+			      ii[WS(rs, 12)] = T7l + T7m;
+			 }
+			 {
+			      E T6a, T6c, T63, T6b, T62;
+			      T6a = FMA(KP951056516, T66, KP587785252 * T69);
+			      T6c = FNMS(KP587785252, T66, KP951056516 * T69);
+			      T62 = FNMS(KP250000000, T60, T5L);
+			      T63 = T61 + T62;
+			      T6b = T62 - T61;
+			      ri[WS(rs, 23)] = T63 - T6a;
+			      ri[WS(rs, 18)] = T6b + T6c;
+			      ri[WS(rs, 8)] = T63 + T6a;
+			      ri[WS(rs, 13)] = T6b - T6c;
+			 }
+			 {
+			      E T7w, T7x, T7t, T7y, T7s;
+			      T7w = FMA(KP951056516, T7u, KP587785252 * T7v);
+			      T7x = FNMS(KP587785252, T7u, KP951056516 * T7v);
+			      T7s = FNMS(KP250000000, T7p, T7q);
+			      T7t = T7r + T7s;
+			      T7y = T7s - T7r;
+			      ii[WS(rs, 8)] = T7t - T7w;
+			      ii[WS(rs, 18)] = T7y - T7x;
+			      ii[WS(rs, 23)] = T7w + T7t;
+			      ii[WS(rs, 13)] = T7x + T7y;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 25, "t2_25", twinstr, &GENUS, {280, 180, 160, 0}, 0, 0, 0 };
+
+void X(codelet_t2_25) (planner *p) {
+     X(kdft_dit_register) (p, t2_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1844 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:00 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include t.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 181 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "t.h"
+
+static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T9A, T9z;
+	       {
+		    E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc;
+		    T2 = W[0];
+		    T8 = W[4];
+		    T3 = W[2];
+		    T6 = W[3];
+		    Te = W[6];
+		    Tr = T2 * T8;
+		    T18 = T3 * T8;
+		    T4 = T2 * T3;
+		    Ta = T2 * T6;
+		    Tz = T3 * Te;
+		    T1n = T8 * Te;
+		    T10 = T2 * Te;
+		    Ti = W[7];
+		    T5 = W[1];
+		    Tc = W[5];
+		    {
+			 E T34, T31, T2X, T2T, Tq, T46, T8H, T97, TH, T98, T4b, T8D, TZ, T7f, T4j;
+			 E T6t, T1g, T7g, T4q, T6u, T4z, T6x, T1J, T7m, T7l, T8d, T6y, T4G, T2k, T7o;
+			 E T7r, T8e, T6A, T4O, T6B, T4V, T6P, T5E, T7L, T3G, T6M, T61, T8n, T7I, T6I;
+			 E T55, T7A, T2N, T6F, T5s, T8i, T7x, T5L, T62, T43, T7J, T5S, T63, T7O, T8o;
+			 E T2U, T2R, T2V, T57, T3a, T5h, T2Y, T32, T35;
+			 {
+			      E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J;
+			      E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c;
+			      {
+				   E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW;
+				   E TS, Ty, T48, TG, T4a;
+				   {
+					E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14;
+					T1 = ri[0];
+					TA = FMA(T6, Ti, Tz);
+					T1K = FNMS(T6, Ti, Tz);
+					T14 = T2 * Ti;
+					{
+					     E T1r, TD, T1c, Tv;
+					     T1r = T8 * Ti;
+					     TD = T3 * Ti;
+					     T11 = FNMS(T5, Ti, T10);
+					     T1C = FMA(T5, Ti, T10);
+					     TM = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     TJ = FNMS(T5, T6, T4);
+					     T7 = FMA(T5, T6, T4);
+					     T1o = FMA(Tc, Ti, T1n);
+					     T23 = FMA(T6, Tc, T18);
+					     T19 = FNMS(T6, Tc, T18);
+					     T1w = FNMS(T5, Tc, Tr);
+					     Ts = FMA(T5, Tc, Tr);
+					     T1c = T3 * Tc;
+					     Tv = T2 * Tc;
+					     T1F = FNMS(T5, Te, T14);
+					     T15 = FMA(T5, Te, T14);
+					     T1s = FNMS(Tc, Te, T1r);
+					     T1N = FMA(T6, Te, TD);
+					     TE = FNMS(T6, Te, TD);
+					     {
+						  E T1T, T3i, T3e, T1Q;
+						  T1T = TJ * Tc;
+						  T3i = TJ * Ti;
+						  T3e = TJ * Te;
+						  T1Q = TJ * T8;
+						  {
+						       E Tg, T2I, T2E, T9;
+						       Tg = T7 * Tc;
+						       T2I = T7 * Ti;
+						       T2E = T7 * Te;
+						       T9 = T7 * T8;
+						       {
+							    E T3q, T3m, T2v, T2r;
+							    T3q = T19 * Ti;
+							    T3m = T19 * Te;
+							    T2v = T1w * Ti;
+							    T2r = T1w * Te;
+							    {
+								 E T2W, T2S, T3P, T3L;
+								 T2W = T23 * Ti;
+								 T2S = T23 * Te;
+								 T3P = Ts * Ti;
+								 T3L = Ts * Te;
+								 T26 = FNMS(T6, T8, T1c);
+								 T1d = FMA(T6, T8, T1c);
+								 T1z = FMA(T5, T8, Tv);
+								 Tw = FNMS(T5, T8, Tv);
+								 T2b = FNMS(TM, T8, T1T);
+								 T1U = FMA(TM, T8, T1T);
+								 T3C = FNMS(TM, Te, T3i);
+								 T3j = FMA(TM, Te, T3i);
+								 T3z = FMA(TM, Ti, T3e);
+								 T3f = FNMS(TM, Ti, T3e);
+								 T1R = FNMS(TM, Tc, T1Q);
+								 T29 = FMA(TM, Tc, T1Q);
+								 TR = FNMS(Tb, T8, Tg);
+								 Th = FMA(Tb, T8, Tg);
+								 T34 = FMA(Tb, Te, T2I);
+								 T2J = FNMS(Tb, Te, T2I);
+								 T31 = FNMS(Tb, Ti, T2E);
+								 T2F = FMA(Tb, Ti, T2E);
+								 Td = FNMS(Tb, Tc, T9);
+								 TP = FMA(Tb, Tc, T9);
+								 T2X = FNMS(T26, Te, T2W);
+								 T2T = FMA(T26, Ti, T2S);
+								 T3r = FNMS(T1d, Te, T3q);
+								 T3n = FMA(T1d, Ti, T3m);
+								 T2w = FNMS(T1z, Te, T2v);
+								 T2s = FMA(T1z, Ti, T2r);
+								 T3Q = FNMS(Tw, Te, T3P);
+								 T3M = FMA(Tw, Ti, T3L);
+								 {
+								      E T1Y, T1S, T2f, T2a;
+								      T1Y = T1R * Ti;
+								      T1S = T1R * Te;
+								      T2f = T29 * Ti;
+								      T2a = T29 * Te;
+								      {
+									   E Tm, Tf, TV, TQ;
+									   Tm = Td * Ti;
+									   Tf = Td * Te;
+									   TV = TP * Ti;
+									   TQ = TP * Te;
+									   T1Z = FNMS(T1U, Te, T1Y);
+									   T1V = FMA(T1U, Ti, T1S);
+									   T2g = FNMS(T2b, Te, T2f);
+									   T2c = FMA(T2b, Ti, T2a);
+									   Tn = FNMS(Th, Te, Tm);
+									   Tj = FMA(Th, Ti, Tf);
+									   TW = FNMS(TR, Te, TV);
+									   TS = FMA(TR, Ti, TQ);
+									   T8G = ii[0];
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					Tk = ri[WS(rs, 16)];
+					To = ii[WS(rs, 16)];
+					{
+					     E Tt, Tx, Tu, T47, TB, TF, TC, T49;
+					     {
+						  E Tl, T8E, Tp, T8F;
+						  Tt = ri[WS(rs, 8)];
+						  Tx = ii[WS(rs, 8)];
+						  Tl = Tj * Tk;
+						  T8E = Tj * To;
+						  Tu = Ts * Tt;
+						  T47 = Ts * Tx;
+						  Tp = FMA(Tn, To, Tl);
+						  T8F = FNMS(Tn, Tk, T8E);
+						  TB = ri[WS(rs, 24)];
+						  TF = ii[WS(rs, 24)];
+						  Tq = T1 + Tp;
+						  T46 = T1 - Tp;
+						  T8H = T8F + T8G;
+						  T97 = T8G - T8F;
+						  TC = TA * TB;
+						  T49 = TA * TF;
+					     }
+					     Ty = FMA(Tw, Tx, Tu);
+					     T48 = FNMS(Tw, Tt, T47);
+					     TG = FMA(TE, TF, TC);
+					     T4a = FNMS(TE, TB, T49);
+					}
+				   }
+				   {
+					E TT, TX, TO, T4f, TU, T4g;
+					{
+					     E TK, TN, TL, T4e;
+					     TK = ri[WS(rs, 4)];
+					     TN = ii[WS(rs, 4)];
+					     TH = Ty + TG;
+					     T98 = Ty - TG;
+					     T4b = T48 - T4a;
+					     T8D = T48 + T4a;
+					     TL = TJ * TK;
+					     T4e = TJ * TN;
+					     TT = ri[WS(rs, 20)];
+					     TX = ii[WS(rs, 20)];
+					     TO = FMA(TM, TN, TL);
+					     T4f = FNMS(TM, TK, T4e);
+					     TU = TS * TT;
+					     T4g = TS * TX;
+					}
+					{
+					     E T17, T4m, T1a, T1e, T4d, T4i;
+					     {
+						  E T12, T16, TY, T4h, T13, T4l;
+						  T12 = ri[WS(rs, 28)];
+						  T16 = ii[WS(rs, 28)];
+						  TY = FMA(TW, TX, TU);
+						  T4h = FNMS(TW, TT, T4g);
+						  T13 = T11 * T12;
+						  T4l = T11 * T16;
+						  TZ = TO + TY;
+						  T4d = TO - TY;
+						  T7f = T4f + T4h;
+						  T4i = T4f - T4h;
+						  T17 = FMA(T15, T16, T13);
+						  T4m = FNMS(T15, T12, T4l);
+					     }
+					     T4j = T4d + T4i;
+					     T6t = T4i - T4d;
+					     T1a = ri[WS(rs, 12)];
+					     T1e = ii[WS(rs, 12)];
+					     {
+						  E T1m, T4u, T1H, T4E, T1x, T1A, T1u, T4w, T1y, T4B;
+						  {
+						       E T1D, T1G, T1E, T4D;
+						       {
+							    E T1f, T4o, T4k, T4p;
+							    {
+								 E T1j, T1l, T1b, T4n, T1k, T4t;
+								 T1j = ri[WS(rs, 2)];
+								 T1l = ii[WS(rs, 2)];
+								 T1b = T19 * T1a;
+								 T4n = T19 * T1e;
+								 T1k = T7 * T1j;
+								 T4t = T7 * T1l;
+								 T1f = FMA(T1d, T1e, T1b);
+								 T4o = FNMS(T1d, T1a, T4n);
+								 T1m = FMA(Tb, T1l, T1k);
+								 T4u = FNMS(Tb, T1j, T4t);
+							    }
+							    T1g = T17 + T1f;
+							    T4k = T17 - T1f;
+							    T7g = T4m + T4o;
+							    T4p = T4m - T4o;
+							    T1D = ri[WS(rs, 26)];
+							    T1G = ii[WS(rs, 26)];
+							    T4q = T4k - T4p;
+							    T6u = T4k + T4p;
+							    T1E = T1C * T1D;
+							    T4D = T1C * T1G;
+						       }
+						       {
+							    E T1p, T1t, T1q, T4v;
+							    T1p = ri[WS(rs, 18)];
+							    T1t = ii[WS(rs, 18)];
+							    T1H = FMA(T1F, T1G, T1E);
+							    T4E = FNMS(T1F, T1D, T4D);
+							    T1q = T1o * T1p;
+							    T4v = T1o * T1t;
+							    T1x = ri[WS(rs, 10)];
+							    T1A = ii[WS(rs, 10)];
+							    T1u = FMA(T1s, T1t, T1q);
+							    T4w = FNMS(T1s, T1p, T4v);
+							    T1y = T1w * T1x;
+							    T4B = T1w * T1A;
+						       }
+						  }
+						  {
+						       E T4A, T1v, T7j, T4x, T1B, T4C;
+						       T4A = T1m - T1u;
+						       T1v = T1m + T1u;
+						       T7j = T4u + T4w;
+						       T4x = T4u - T4w;
+						       T1B = FMA(T1z, T1A, T1y);
+						       T4C = FNMS(T1z, T1x, T4B);
+						       {
+							    E T1I, T4y, T4F, T7k;
+							    T1I = T1B + T1H;
+							    T4y = T1B - T1H;
+							    T4F = T4C - T4E;
+							    T7k = T4C + T4E;
+							    T4z = T4x - T4y;
+							    T6x = T4x + T4y;
+							    T1J = T1v + T1I;
+							    T7m = T1v - T1I;
+							    T7l = T7j - T7k;
+							    T8d = T7j + T7k;
+							    T6y = T4A - T4F;
+							    T4G = T4A + T4F;
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5Z, T3u, T5V, T5C, T7G, T5D, T3F, T5X, T4P, T4U;
+				   {
+					E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R;
+					{
+					     E T1L, T1O, T1W, T20;
+					     T1L = ri[WS(rs, 30)];
+					     T1O = ii[WS(rs, 30)];
+					     {
+						  E T2d, T2h, T1M, T4I, T2e, T4S;
+						  T2d = ri[WS(rs, 22)];
+						  T2h = ii[WS(rs, 22)];
+						  T1M = T1K * T1L;
+						  T4I = T1K * T1O;
+						  T2e = T2c * T2d;
+						  T4S = T2c * T2h;
+						  T1P = FMA(T1N, T1O, T1M);
+						  T4J = FNMS(T1N, T1L, T4I);
+						  T2i = FMA(T2g, T2h, T2e);
+						  T4T = FNMS(T2g, T2d, T4S);
+					     }
+					     T1W = ri[WS(rs, 14)];
+					     T20 = ii[WS(rs, 14)];
+					     {
+						  E T24, T27, T1X, T4K, T25, T4Q;
+						  T24 = ri[WS(rs, 6)];
+						  T27 = ii[WS(rs, 6)];
+						  T1X = T1V * T1W;
+						  T4K = T1V * T20;
+						  T25 = T23 * T24;
+						  T4Q = T23 * T27;
+						  T21 = FMA(T1Z, T20, T1X);
+						  T4L = FNMS(T1Z, T1W, T4K);
+						  T28 = FMA(T26, T27, T25);
+						  T4R = FNMS(T26, T24, T4Q);
+					     }
+					}
+					{
+					     E T22, T7p, T4M, T4N, T2j, T7q;
+					     T4P = T1P - T21;
+					     T22 = T1P + T21;
+					     T7p = T4J + T4L;
+					     T4M = T4J - T4L;
+					     T4N = T28 - T2i;
+					     T2j = T28 + T2i;
+					     T7q = T4R + T4T;
+					     T4U = T4R - T4T;
+					     T2k = T22 + T2j;
+					     T7o = T22 - T2j;
+					     T7r = T7p - T7q;
+					     T8e = T7p + T7q;
+					     T6A = T4M + T4N;
+					     T4O = T4M - T4N;
+					}
+				   }
+				   {
+					E T3l, T5z, T3E, T3v, T3t, T3w, T3x, T5B, T3A, T3B, T3D, T3y, T5W;
+					{
+					     E T3g, T3k, T3h, T5y;
+					     T3g = ri[WS(rs, 31)];
+					     T3k = ii[WS(rs, 31)];
+					     T3A = ri[WS(rs, 23)];
+					     T6B = T4P - T4U;
+					     T4V = T4P + T4U;
+					     T3h = T3f * T3g;
+					     T5y = T3f * T3k;
+					     T3B = T3z * T3A;
+					     T3D = ii[WS(rs, 23)];
+					     T3l = FMA(T3j, T3k, T3h);
+					     T5z = FNMS(T3j, T3g, T5y);
+					}
+					{
+					     E T3o, T5Y, T3s, T3p, T5A;
+					     T3o = ri[WS(rs, 15)];
+					     T3E = FMA(T3C, T3D, T3B);
+					     T5Y = T3z * T3D;
+					     T3s = ii[WS(rs, 15)];
+					     T3p = T3n * T3o;
+					     T3v = ri[WS(rs, 7)];
+					     T5Z = FNMS(T3C, T3A, T5Y);
+					     T5A = T3n * T3s;
+					     T3t = FMA(T3r, T3s, T3p);
+					     T3w = TP * T3v;
+					     T3x = ii[WS(rs, 7)];
+					     T5B = FNMS(T3r, T3o, T5A);
+					}
+					T3u = T3l + T3t;
+					T5V = T3l - T3t;
+					T3y = FMA(TR, T3x, T3w);
+					T5W = TP * T3x;
+					T5C = T5z - T5B;
+					T7G = T5z + T5B;
+					T5D = T3y - T3E;
+					T3F = T3y + T3E;
+					T5X = FNMS(TR, T3v, T5W);
+				   }
+				   {
+					E T2L, T5q, T5m, T2z, T7v, T53, T2D, T5o;
+					{
+					     E T2q, T50, T2y, T2A, T2C, T52, T2B, T5n;
+					     {
+						  E T2G, T2K, T2n, T4Z, T2t, T51;
+						  {
+						       E T2o, T2p, T60, T7H;
+						       T2n = ri[WS(rs, 1)];
+						       T6P = T5C + T5D;
+						       T5E = T5C - T5D;
+						       T7L = T3u - T3F;
+						       T3G = T3u + T3F;
+						       T60 = T5X - T5Z;
+						       T7H = T5X + T5Z;
+						       T2o = T2 * T2n;
+						       T2p = ii[WS(rs, 1)];
+						       T6M = T5V - T60;
+						       T61 = T5V + T60;
+						       T8n = T7G + T7H;
+						       T7I = T7G - T7H;
+						       T4Z = T2 * T2p;
+						       T2q = FMA(T5, T2p, T2o);
+						  }
+						  T2G = ri[WS(rs, 25)];
+						  T2K = ii[WS(rs, 25)];
+						  T50 = FNMS(T5, T2n, T4Z);
+						  {
+						       E T2x, T2u, T2H, T5p;
+						       T2t = ri[WS(rs, 17)];
+						       T2H = T2F * T2G;
+						       T5p = T2F * T2K;
+						       T2x = ii[WS(rs, 17)];
+						       T2u = T2s * T2t;
+						       T2L = FMA(T2J, T2K, T2H);
+						       T5q = FNMS(T2J, T2G, T5p);
+						       T51 = T2s * T2x;
+						       T2y = FMA(T2w, T2x, T2u);
+						  }
+						  T2A = ri[WS(rs, 9)];
+						  T2C = ii[WS(rs, 9)];
+						  T52 = FNMS(T2w, T2t, T51);
+					     }
+					     T5m = T2q - T2y;
+					     T2z = T2q + T2y;
+					     T2B = T8 * T2A;
+					     T5n = T8 * T2C;
+					     T7v = T50 + T52;
+					     T53 = T50 - T52;
+					     T2D = FMA(Tc, T2C, T2B);
+					     T5o = FNMS(Tc, T2A, T5n);
+					}
+					{
+					     E T3N, T3K, T3O, T5G, T41, T5Q, T3R, T3U, T3W;
+					     {
+						  E T3H, T3I, T3J, T3Y, T40, T5F, T3Z, T5P;
+						  T3H = ri[WS(rs, 3)];
+						  {
+						       E T54, T2M, T5r, T7w;
+						       T54 = T2D - T2L;
+						       T2M = T2D + T2L;
+						       T5r = T5o - T5q;
+						       T7w = T5o + T5q;
+						       T6I = T53 + T54;
+						       T55 = T53 - T54;
+						       T7A = T2z - T2M;
+						       T2N = T2z + T2M;
+						       T6F = T5m - T5r;
+						       T5s = T5m + T5r;
+						       T8i = T7v + T7w;
+						       T7x = T7v - T7w;
+						       T3I = T3 * T3H;
+						  }
+						  T3J = ii[WS(rs, 3)];
+						  T3Y = ri[WS(rs, 11)];
+						  T40 = ii[WS(rs, 11)];
+						  T3N = ri[WS(rs, 19)];
+						  T3K = FMA(T6, T3J, T3I);
+						  T5F = T3 * T3J;
+						  T3Z = Td * T3Y;
+						  T5P = Td * T40;
+						  T3O = T3M * T3N;
+						  T5G = FNMS(T6, T3H, T5F);
+						  T41 = FMA(Th, T40, T3Z);
+						  T5Q = FNMS(Th, T3Y, T5P);
+						  T3R = ii[WS(rs, 19)];
+						  T3U = ri[WS(rs, 27)];
+						  T3W = ii[WS(rs, 27)];
+					     }
+					     {
+						  E T2O, T2P, T2Q, T37, T39, T56, T38, T5g;
+						  {
+						       E T3T, T5K, T5I, T3X, T5O, T7M, T5J;
+						       T2O = ri[WS(rs, 5)];
+						       {
+							    E T3S, T5H, T3V, T5N;
+							    T3S = FMA(T3Q, T3R, T3O);
+							    T5H = T3M * T3R;
+							    T3V = Te * T3U;
+							    T5N = Te * T3W;
+							    T3T = T3K + T3S;
+							    T5K = T3K - T3S;
+							    T5I = FNMS(T3Q, T3N, T5H);
+							    T3X = FMA(Ti, T3W, T3V);
+							    T5O = FNMS(Ti, T3U, T5N);
+							    T2P = T29 * T2O;
+						       }
+						       T7M = T5G + T5I;
+						       T5J = T5G - T5I;
+						       {
+							    E T42, T5M, T7N, T5R;
+							    T42 = T3X + T41;
+							    T5M = T3X - T41;
+							    T7N = T5O + T5Q;
+							    T5R = T5O - T5Q;
+							    T5L = T5J - T5K;
+							    T62 = T5K + T5J;
+							    T43 = T3T + T42;
+							    T7J = T42 - T3T;
+							    T5S = T5M + T5R;
+							    T63 = T5M - T5R;
+							    T7O = T7M - T7N;
+							    T8o = T7M + T7N;
+							    T2Q = ii[WS(rs, 5)];
+						       }
+						  }
+						  T37 = ri[WS(rs, 13)];
+						  T39 = ii[WS(rs, 13)];
+						  T2U = ri[WS(rs, 21)];
+						  T2R = FMA(T2b, T2Q, T2P);
+						  T56 = T29 * T2Q;
+						  T38 = T1R * T37;
+						  T5g = T1R * T39;
+						  T2V = T2T * T2U;
+						  T57 = FNMS(T2b, T2O, T56);
+						  T3a = FMA(T1U, T39, T38);
+						  T5h = FNMS(T1U, T37, T5g);
+						  T2Y = ii[WS(rs, 21)];
+						  T32 = ri[WS(rs, 29)];
+						  T35 = ii[WS(rs, 29)];
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5c, T5t, T5j, T5u, T88, T90, T8Z, T8b;
+			      {
+				   E T7e, T8T, T7y, T7D, T7h, T8U, T8S, T8R;
+				   {
+					E T8c, T1i, T8A, T8z, T8O, T8J, T8N, T2l, T8L, T45, T8t, T8l, T8u, T8q, T3c;
+					E T8k, T8p, T8w, T2m;
+					{
+					     E T8x, T8y, T8j, T8C, T8I;
+					     {
+						  E TI, T30, T5b, T59, T36, T5f, T1h, T7B, T5a;
+						  TI = Tq + TH;
+						  T7e = Tq - TH;
+						  {
+						       E T2Z, T58, T33, T5e;
+						       T2Z = FMA(T2X, T2Y, T2V);
+						       T58 = T2T * T2Y;
+						       T33 = T31 * T32;
+						       T5e = T31 * T35;
+						       T30 = T2R + T2Z;
+						       T5b = T2R - T2Z;
+						       T59 = FNMS(T2X, T2U, T58);
+						       T36 = FMA(T34, T35, T33);
+						       T5f = FNMS(T34, T32, T5e);
+						       T1h = TZ + T1g;
+						       T8T = T1g - TZ;
+						  }
+						  T7B = T57 + T59;
+						  T5a = T57 - T59;
+						  {
+						       E T3b, T5d, T7C, T5i;
+						       T3b = T36 + T3a;
+						       T5d = T36 - T3a;
+						       T7C = T5f + T5h;
+						       T5i = T5f - T5h;
+						       T5c = T5a - T5b;
+						       T5t = T5b + T5a;
+						       T3c = T30 + T3b;
+						       T7y = T3b - T30;
+						       T5j = T5d + T5i;
+						       T5u = T5d - T5i;
+						       T7D = T7B - T7C;
+						       T8j = T7B + T7C;
+						       T8c = TI - T1h;
+						       T1i = TI + T1h;
+						  }
+					     }
+					     T8k = T8i - T8j;
+					     T8x = T8i + T8j;
+					     T8y = T8n + T8o;
+					     T8p = T8n - T8o;
+					     T7h = T7f - T7g;
+					     T8C = T7f + T7g;
+					     T8I = T8D + T8H;
+					     T8U = T8H - T8D;
+					     T8A = T8x + T8y;
+					     T8z = T8x - T8y;
+					     T8O = T8I - T8C;
+					     T8J = T8C + T8I;
+					}
+					{
+					     E T8h, T8m, T3d, T44;
+					     T8h = T2N - T3c;
+					     T3d = T2N + T3c;
+					     T44 = T3G + T43;
+					     T8m = T3G - T43;
+					     T8N = T2k - T1J;
+					     T2l = T1J + T2k;
+					     T8L = T44 - T3d;
+					     T45 = T3d + T44;
+					     T8t = T8k - T8h;
+					     T8l = T8h + T8k;
+					     T8u = T8m + T8p;
+					     T8q = T8m - T8p;
+					}
+					T8w = T1i - T2l;
+					T2m = T1i + T2l;
+					{
+					     E T8s, T8P, T8Q, T8v;
+					     {
+						  E T8r, T8M, T8K, T8g, T8B, T8f;
+						  T8S = T8q - T8l;
+						  T8r = T8l + T8q;
+						  T8B = T8d + T8e;
+						  T8f = T8d - T8e;
+						  ri[0] = T2m + T45;
+						  ri[WS(rs, 16)] = T2m - T45;
+						  ri[WS(rs, 8)] = T8w + T8z;
+						  ri[WS(rs, 24)] = T8w - T8z;
+						  T8M = T8J - T8B;
+						  T8K = T8B + T8J;
+						  T8g = T8c + T8f;
+						  T8s = T8c - T8f;
+						  T8R = T8O - T8N;
+						  T8P = T8N + T8O;
+						  ii[WS(rs, 24)] = T8M - T8L;
+						  ii[WS(rs, 8)] = T8L + T8M;
+						  ii[WS(rs, 16)] = T8K - T8A;
+						  ii[0] = T8A + T8K;
+						  ri[WS(rs, 4)] = FMA(KP707106781, T8r, T8g);
+						  ri[WS(rs, 20)] = FNMS(KP707106781, T8r, T8g);
+						  T8Q = T8t + T8u;
+						  T8v = T8t - T8u;
+					     }
+					     ii[WS(rs, 20)] = FNMS(KP707106781, T8Q, T8P);
+					     ii[WS(rs, 4)] = FMA(KP707106781, T8Q, T8P);
+					     ri[WS(rs, 12)] = FMA(KP707106781, T8v, T8s);
+					     ri[WS(rs, 28)] = FNMS(KP707106781, T8v, T8s);
+					}
+				   }
+				   {
+					E T7P, T7W, T7i, T7K, T8a, T86, T91, T8V, T8W, T7t, T7T, T7F, T92, T7Z, T89;
+					E T83;
+					{
+					     E T7X, T7n, T7s, T7Y, T84, T85;
+					     T7P = T7L - T7O;
+					     T84 = T7L + T7O;
+					     ii[WS(rs, 28)] = FNMS(KP707106781, T8S, T8R);
+					     ii[WS(rs, 12)] = FMA(KP707106781, T8S, T8R);
+					     T7W = T7e + T7h;
+					     T7i = T7e - T7h;
+					     T85 = T7I + T7J;
+					     T7K = T7I - T7J;
+					     T7X = T7m + T7l;
+					     T7n = T7l - T7m;
+					     T8a = FMA(KP414213562, T84, T85);
+					     T86 = FNMS(KP414213562, T85, T84);
+					     T91 = T8U - T8T;
+					     T8V = T8T + T8U;
+					     T7s = T7o + T7r;
+					     T7Y = T7o - T7r;
+					     {
+						  E T82, T81, T7z, T7E;
+						  T82 = T7x + T7y;
+						  T7z = T7x - T7y;
+						  T7E = T7A - T7D;
+						  T81 = T7A + T7D;
+						  T8W = T7n + T7s;
+						  T7t = T7n - T7s;
+						  T7T = FNMS(KP414213562, T7z, T7E);
+						  T7F = FMA(KP414213562, T7E, T7z);
+						  T92 = T7Y - T7X;
+						  T7Z = T7X + T7Y;
+						  T89 = FNMS(KP414213562, T81, T82);
+						  T83 = FMA(KP414213562, T82, T81);
+					     }
+					}
+					{
+					     E T7S, T7u, T93, T95, T7U, T7Q;
+					     T7S = FNMS(KP707106781, T7t, T7i);
+					     T7u = FMA(KP707106781, T7t, T7i);
+					     T93 = FMA(KP707106781, T92, T91);
+					     T95 = FNMS(KP707106781, T92, T91);
+					     T7U = FMA(KP414213562, T7K, T7P);
+					     T7Q = FNMS(KP414213562, T7P, T7K);
+					     {
+						  E T80, T87, T8X, T8Y;
+						  T88 = FNMS(KP707106781, T7Z, T7W);
+						  T80 = FMA(KP707106781, T7Z, T7W);
+						  {
+						       E T7V, T94, T96, T7R;
+						       T7V = T7T + T7U;
+						       T94 = T7U - T7T;
+						       T96 = T7F + T7Q;
+						       T7R = T7F - T7Q;
+						       ri[WS(rs, 30)] = FMA(KP923879532, T7V, T7S);
+						       ri[WS(rs, 14)] = FNMS(KP923879532, T7V, T7S);
+						       ii[WS(rs, 22)] = FNMS(KP923879532, T94, T93);
+						       ii[WS(rs, 6)] = FMA(KP923879532, T94, T93);
+						       ii[WS(rs, 30)] = FMA(KP923879532, T96, T95);
+						       ii[WS(rs, 14)] = FNMS(KP923879532, T96, T95);
+						       ri[WS(rs, 6)] = FMA(KP923879532, T7R, T7u);
+						       ri[WS(rs, 22)] = FNMS(KP923879532, T7R, T7u);
+						       T87 = T83 + T86;
+						       T90 = T86 - T83;
+						  }
+						  T8Z = FNMS(KP707106781, T8W, T8V);
+						  T8X = FMA(KP707106781, T8W, T8V);
+						  T8Y = T89 + T8a;
+						  T8b = T89 - T8a;
+						  ri[WS(rs, 2)] = FMA(KP923879532, T87, T80);
+						  ri[WS(rs, 18)] = FNMS(KP923879532, T87, T80);
+						  ii[WS(rs, 18)] = FNMS(KP923879532, T8Y, T8X);
+						  ii[WS(rs, 2)] = FMA(KP923879532, T8Y, T8X);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T6s, T9o, T9n, T6v, T6N, T6Q, T6G, T6J, T9g, T9f;
+				   {
+					E T6c, T4s, T9c, T4X, T9h, T9b, T9i, T6f, T5U, T6l, T64, T5k, T5v;
+					{
+					     E T6d, T6e, T99, T9a, T5T;
+					     {
+						  E T4c, T4r, T4H, T4W;
+						  T6s = T46 - T4b;
+						  T4c = T46 + T4b;
+						  ri[WS(rs, 10)] = FMA(KP923879532, T8b, T88);
+						  ri[WS(rs, 26)] = FNMS(KP923879532, T8b, T88);
+						  ii[WS(rs, 26)] = FNMS(KP923879532, T90, T8Z);
+						  ii[WS(rs, 10)] = FMA(KP923879532, T90, T8Z);
+						  T4r = T4j + T4q;
+						  T9o = T4q - T4j;
+						  T6d = FMA(KP414213562, T4z, T4G);
+						  T4H = FNMS(KP414213562, T4G, T4z);
+						  T4W = FMA(KP414213562, T4V, T4O);
+						  T6e = FNMS(KP414213562, T4O, T4V);
+						  T9n = T98 + T97;
+						  T99 = T97 - T98;
+						  T6c = FMA(KP707106781, T4r, T4c);
+						  T4s = FNMS(KP707106781, T4r, T4c);
+						  T9c = T4H + T4W;
+						  T4X = T4H - T4W;
+						  T9a = T6t + T6u;
+						  T6v = T6t - T6u;
+					     }
+					     T6N = T5S - T5L;
+					     T5T = T5L + T5S;
+					     T9h = FNMS(KP707106781, T9a, T99);
+					     T9b = FMA(KP707106781, T9a, T99);
+					     T9i = T6e - T6d;
+					     T6f = T6d + T6e;
+					     T5U = FNMS(KP707106781, T5T, T5E);
+					     T6l = FMA(KP707106781, T5T, T5E);
+					     T64 = T62 + T63;
+					     T6Q = T62 - T63;
+					     T6G = T5j - T5c;
+					     T5k = T5c + T5j;
+					     T5v = T5t + T5u;
+					     T6J = T5t - T5u;
+					}
+					{
+					     E T6m, T6q, T6j, T6p, T9l, T9m;
+					     {
+						  E T68, T4Y, T6a, T66, T69, T5x, T9j, T6k, T65, T9k, T6b, T67;
+						  T68 = FNMS(KP923879532, T4X, T4s);
+						  T4Y = FMA(KP923879532, T4X, T4s);
+						  T6k = FMA(KP707106781, T64, T61);
+						  T65 = FNMS(KP707106781, T64, T61);
+						  {
+						       E T6i, T5l, T6h, T5w;
+						       T6i = FMA(KP707106781, T5k, T55);
+						       T5l = FNMS(KP707106781, T5k, T55);
+						       T6h = FMA(KP707106781, T5v, T5s);
+						       T5w = FNMS(KP707106781, T5v, T5s);
+						       T6m = FNMS(KP198912367, T6l, T6k);
+						       T6q = FMA(KP198912367, T6k, T6l);
+						       T6a = FMA(KP668178637, T5U, T65);
+						       T66 = FNMS(KP668178637, T65, T5U);
+						       T6j = FMA(KP198912367, T6i, T6h);
+						       T6p = FNMS(KP198912367, T6h, T6i);
+						       T69 = FNMS(KP668178637, T5l, T5w);
+						       T5x = FMA(KP668178637, T5w, T5l);
+						  }
+						  T9j = FMA(KP923879532, T9i, T9h);
+						  T9l = FNMS(KP923879532, T9i, T9h);
+						  T9k = T6a - T69;
+						  T6b = T69 + T6a;
+						  T9m = T5x + T66;
+						  T67 = T5x - T66;
+						  ii[WS(rs, 21)] = FNMS(KP831469612, T9k, T9j);
+						  ii[WS(rs, 5)] = FMA(KP831469612, T9k, T9j);
+						  ri[WS(rs, 5)] = FMA(KP831469612, T67, T4Y);
+						  ri[WS(rs, 21)] = FNMS(KP831469612, T67, T4Y);
+						  ri[WS(rs, 29)] = FMA(KP831469612, T6b, T68);
+						  ri[WS(rs, 13)] = FNMS(KP831469612, T6b, T68);
+					     }
+					     {
+						  E T6o, T9d, T9e, T6r, T6g, T6n;
+						  T6o = FNMS(KP923879532, T6f, T6c);
+						  T6g = FMA(KP923879532, T6f, T6c);
+						  T6n = T6j + T6m;
+						  T9g = T6m - T6j;
+						  T9f = FNMS(KP923879532, T9c, T9b);
+						  T9d = FMA(KP923879532, T9c, T9b);
+						  ii[WS(rs, 29)] = FMA(KP831469612, T9m, T9l);
+						  ii[WS(rs, 13)] = FNMS(KP831469612, T9m, T9l);
+						  ri[WS(rs, 1)] = FMA(KP980785280, T6n, T6g);
+						  ri[WS(rs, 17)] = FNMS(KP980785280, T6n, T6g);
+						  T9e = T6p + T6q;
+						  T6r = T6p - T6q;
+						  ii[WS(rs, 17)] = FNMS(KP980785280, T9e, T9d);
+						  ii[WS(rs, 1)] = FMA(KP980785280, T9e, T9d);
+						  ri[WS(rs, 9)] = FMA(KP980785280, T6r, T6o);
+						  ri[WS(rs, 25)] = FNMS(KP980785280, T6r, T6o);
+					     }
+					}
+				   }
+				   {
+					E T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T6H, T74, T78, T7c, T6W, T6S;
+					{
+					     E T6Z, T6z, T6C, T70;
+					     T6Z = FNMS(KP414213562, T6x, T6y);
+					     T6z = FMA(KP414213562, T6y, T6x);
+					     ii[WS(rs, 25)] = FNMS(KP980785280, T9g, T9f);
+					     ii[WS(rs, 9)] = FMA(KP980785280, T9g, T9f);
+					     T6Y = FNMS(KP707106781, T6v, T6s);
+					     T6w = FMA(KP707106781, T6v, T6s);
+					     T6C = FNMS(KP414213562, T6B, T6A);
+					     T70 = FMA(KP414213562, T6A, T6B);
+					     T9w = T6z + T6C;
+					     T6D = T6z - T6C;
+					     T9v = FNMS(KP707106781, T9o, T9n);
+					     T9p = FMA(KP707106781, T9o, T9n);
+					     {
+						  E T77, T6O, T76, T6R;
+						  T9q = T70 - T6Z;
+						  T71 = T6Z + T70;
+						  T77 = FMA(KP707106781, T6N, T6M);
+						  T6O = FNMS(KP707106781, T6N, T6M);
+						  T76 = FMA(KP707106781, T6Q, T6P);
+						  T6R = FNMS(KP707106781, T6Q, T6P);
+						  T6H = FNMS(KP707106781, T6G, T6F);
+						  T74 = FMA(KP707106781, T6G, T6F);
+						  T78 = FNMS(KP198912367, T77, T76);
+						  T7c = FMA(KP198912367, T76, T77);
+						  T6W = FMA(KP668178637, T6O, T6R);
+						  T6S = FNMS(KP668178637, T6R, T6O);
+					     }
+					}
+					{
+					     E T6U, T6E, T9r, T9t, T73, T6K;
+					     T6U = FNMS(KP923879532, T6D, T6w);
+					     T6E = FMA(KP923879532, T6D, T6w);
+					     T9r = FMA(KP923879532, T9q, T9p);
+					     T9t = FNMS(KP923879532, T9q, T9p);
+					     T73 = FMA(KP707106781, T6J, T6I);
+					     T6K = FNMS(KP707106781, T6J, T6I);
+					     {
+						  E T7a, T9x, T9y, T7d;
+						  {
+						       E T72, T7b, T6V, T6L, T79, T75;
+						       T7a = FMA(KP923879532, T71, T6Y);
+						       T72 = FNMS(KP923879532, T71, T6Y);
+						       T75 = FMA(KP198912367, T74, T73);
+						       T7b = FNMS(KP198912367, T73, T74);
+						       T6V = FNMS(KP668178637, T6H, T6K);
+						       T6L = FMA(KP668178637, T6K, T6H);
+						       T79 = T75 - T78;
+						       T9A = T75 + T78;
+						       T9z = FMA(KP923879532, T9w, T9v);
+						       T9x = FNMS(KP923879532, T9w, T9v);
+						       {
+							    E T6X, T9s, T9u, T6T;
+							    T6X = T6V - T6W;
+							    T9s = T6V + T6W;
+							    T9u = T6S - T6L;
+							    T6T = T6L + T6S;
+							    ri[WS(rs, 7)] = FMA(KP980785280, T79, T72);
+							    ri[WS(rs, 23)] = FNMS(KP980785280, T79, T72);
+							    ri[WS(rs, 11)] = FMA(KP831469612, T6X, T6U);
+							    ri[WS(rs, 27)] = FNMS(KP831469612, T6X, T6U);
+							    ii[WS(rs, 19)] = FNMS(KP831469612, T9s, T9r);
+							    ii[WS(rs, 3)] = FMA(KP831469612, T9s, T9r);
+							    ii[WS(rs, 27)] = FNMS(KP831469612, T9u, T9t);
+							    ii[WS(rs, 11)] = FMA(KP831469612, T9u, T9t);
+							    ri[WS(rs, 3)] = FMA(KP831469612, T6T, T6E);
+							    ri[WS(rs, 19)] = FNMS(KP831469612, T6T, T6E);
+							    T9y = T7c - T7b;
+							    T7d = T7b + T7c;
+						       }
+						  }
+						  ii[WS(rs, 23)] = FNMS(KP980785280, T9y, T9x);
+						  ii[WS(rs, 7)] = FMA(KP980785280, T9y, T9x);
+						  ri[WS(rs, 31)] = FMA(KP980785280, T7d, T7a);
+						  ri[WS(rs, 15)] = FNMS(KP980785280, T7d, T7a);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ii[WS(rs, 31)] = FMA(KP980785280, T9A, T9z);
+	       ii[WS(rs, 15)] = FNMS(KP980785280, T9A, T9z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };
+
+void X(codelet_t2_32) (planner *p) {
+     X(kdft_dit_register) (p, t2_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include t.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 158 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "t.h"
+
+static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
+	       E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
+	       E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
+	       E T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
+	       E T1S, T23;
+	       {
+		    E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
+		    E T10;
+		    {
+			 E T4, Tc, T7, Tb;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 T4 = T2 * T3;
+			 Tc = T5 * T3;
+			 T7 = T5 * T6;
+			 Tb = T2 * T6;
+			 T8 = T4 + T7;
+			 TM = T4 - T7;
+			 TO = Tb + Tc;
+			 Td = Tb - Tc;
+			 T9 = W[4];
+			 Ts = T2 * T9;
+			 T1d = T6 * T9;
+			 Tx = T5 * T9;
+			 T18 = T3 * T9;
+			 Te = W[5];
+			 Tt = T5 * Te;
+			 T1c = T3 * Te;
+			 Tw = T2 * Te;
+			 T19 = T6 * Te;
+			 Th = W[6];
+			 TB = T3 * Th;
+			 T14 = T5 * Th;
+			 TG = T6 * Th;
+			 TZ = T2 * Th;
+			 Tl = W[7];
+			 TC = T6 * Tl;
+			 T13 = T2 * Tl;
+			 TF = T3 * Tl;
+			 T10 = T5 * Tl;
+		    }
+		    TD = TB + TC;
+		    TH = TF - TG;
+		    T1y = TZ + T10;
+		    T1H = TF + TG;
+		    T15 = T13 + T14;
+		    T1A = T13 - T14;
+		    T11 = TZ - T10;
+		    T1F = TB - TC;
+		    T1n = FMA(T9, Th, Te * Tl);
+		    T1p = FNMS(Te, Th, T9 * Tl);
+		    {
+			 E T2o, T2p, T2s, T2t;
+			 T2o = T8 * Th;
+			 T2p = Td * Tl;
+			 T2q = T2o + T2p;
+			 T2I = T2o - T2p;
+			 T2s = T8 * Tl;
+			 T2t = Td * Th;
+			 T2u = T2s - T2t;
+			 T2K = T2s + T2t;
+		    }
+		    {
+			 E T2T, T2U, T2X, T2Y;
+			 T2T = TM * Th;
+			 T2U = TO * Tl;
+			 T2V = T2T - T2U;
+			 T3b = T2T + T2U;
+			 T2X = TM * Tl;
+			 T2Y = TO * Th;
+			 T2Z = T2X + T2Y;
+			 T3d = T2X - T2Y;
+			 Tu = Ts + Tt;
+			 Ty = Tw - Tx;
+			 T3l = FMA(Tu, Th, Ty * Tl);
+			 T3n = FNMS(Ty, Th, Tu * Tl);
+		    }
+		    T1t = Ts - Tt;
+		    T1v = Tw + Tx;
+		    T2f = FMA(T1t, Th, T1v * Tl);
+		    T2h = FNMS(T1v, Th, T1t * Tl);
+		    T1a = T18 - T19;
+		    T1e = T1c + T1d;
+		    T32 = FMA(T1a, Th, T1e * Tl);
+		    T34 = FNMS(T1e, Th, T1a * Tl);
+		    T1W = T18 + T19;
+		    T1Y = T1c - T1d;
+		    T2C = FMA(T1W, Th, T1Y * Tl);
+		    T2E = FNMS(T1Y, Th, T1W * Tl);
+		    {
+			 E Ta, Tf, Ti, Tj;
+			 Ta = T8 * T9;
+			 Tf = Td * Te;
+			 Tg = Ta - Tf;
+			 TR = Ta + Tf;
+			 Ti = T8 * Te;
+			 Tj = Td * T9;
+			 Tk = Ti + Tj;
+			 TS = Ti - Tj;
+		    }
+		    Tm = FMA(Tg, Th, Tk * Tl);
+		    TV = FNMS(TS, Th, TR * Tl);
+		    To = FNMS(Tk, Th, Tg * Tl);
+		    TT = FMA(TR, Th, TS * Tl);
+		    {
+			 E T1K, T1L, T1N, T1O;
+			 T1K = TM * T9;
+			 T1L = TO * Te;
+			 T1M = T1K - T1L;
+			 T21 = T1K + T1L;
+			 T1N = TM * Te;
+			 T1O = TO * T9;
+			 T1P = T1N + T1O;
+			 T22 = T1N - T1O;
+		    }
+		    T1Q = FMA(T1M, Th, T1P * Tl);
+		    T25 = FNMS(T22, Th, T21 * Tl);
+		    T1S = FNMS(T1P, Th, T1M * Tl);
+		    T23 = FMA(T21, Th, T22 * Tl);
+	       }
+	       {
+		    E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
+		    E T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
+		    E T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
+		    E T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
+		    E T4W, T5R, T55, T5O;
+		    {
+			 E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
+			 T1 = ri[0];
+			 T7G = ii[0];
+			 Tn = ri[WS(rs, 16)];
+			 Tp = ii[WS(rs, 16)];
+			 Tq = FMA(Tm, Tn, To * Tp);
+			 T7F = FNMS(To, Tn, Tm * Tp);
+			 {
+			      E Tv, Tz, TE, TI;
+			      Tv = ri[WS(rs, 8)];
+			      Tz = ii[WS(rs, 8)];
+			      TA = FMA(Tu, Tv, Ty * Tz);
+			      T3C = FNMS(Ty, Tv, Tu * Tz);
+			      TE = ri[WS(rs, 24)];
+			      TI = ii[WS(rs, 24)];
+			      TJ = FMA(TD, TE, TH * TI);
+			      T3D = FNMS(TH, TE, TD * TI);
+			 }
+			 {
+			      E Tr, TK, T8a, T8b;
+			      Tr = T1 + Tq;
+			      TK = TA + TJ;
+			      TL = Tr + TK;
+			      T6f = Tr - TK;
+			      T8a = T7G - T7F;
+			      T8b = TA - TJ;
+			      T8c = T8a - T8b;
+			      T8q = T8b + T8a;
+			 }
+			 {
+			      E T3B, T3E, T7E, T7H;
+			      T3B = T1 - Tq;
+			      T3E = T3C - T3D;
+			      T3F = T3B - T3E;
+			      T5t = T3B + T3E;
+			      T7E = T3C + T3D;
+			      T7H = T7F + T7G;
+			      T7I = T7E + T7H;
+			      T7W = T7H - T7E;
+			 }
+		    }
+		    {
+			 E T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
+			 {
+			      E T2c, T2d, T2r, T2v;
+			      T2c = ri[WS(rs, 1)];
+			      T2d = ii[WS(rs, 1)];
+			      T2e = FMA(T2, T2c, T5 * T2d);
+			      T4g = FNMS(T5, T2c, T2 * T2d);
+			      T2r = ri[WS(rs, 25)];
+			      T2v = ii[WS(rs, 25)];
+			      T2w = FMA(T2q, T2r, T2u * T2v);
+			      T4z = FNMS(T2u, T2r, T2q * T2v);
+			 }
+			 {
+			      E T2g, T2i, T2l, T2m;
+			      T2g = ri[WS(rs, 17)];
+			      T2i = ii[WS(rs, 17)];
+			      T2j = FMA(T2f, T2g, T2h * T2i);
+			      T4h = FNMS(T2h, T2g, T2f * T2i);
+			      T2l = ri[WS(rs, 9)];
+			      T2m = ii[WS(rs, 9)];
+			      T2n = FMA(T9, T2l, Te * T2m);
+			      T4y = FNMS(Te, T2l, T9 * T2m);
+			 }
+			 {
+			      E T2k, T2x, T6w, T6x;
+			      T2k = T2e + T2j;
+			      T2x = T2n + T2w;
+			      T2y = T2k + T2x;
+			      T6B = T2k - T2x;
+			      T6w = T4g + T4h;
+			      T6x = T4y + T4z;
+			      T6y = T6w - T6x;
+			      T7j = T6w + T6x;
+			 }
+			 {
+			      E T4i, T4j, T4x, T4A;
+			      T4i = T4g - T4h;
+			      T4j = T2n - T2w;
+			      T4k = T4i + T4j;
+			      T5J = T4i - T4j;
+			      T4x = T2e - T2j;
+			      T4A = T4y - T4z;
+			      T4B = T4x - T4A;
+			      T5G = T4x + T4A;
+			 }
+		    }
+		    {
+			 E T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
+			 {
+			      E T2W, T30, T3c, T3e;
+			      T2W = ri[WS(rs, 31)];
+			      T30 = ii[WS(rs, 31)];
+			      T31 = FMA(T2V, T2W, T2Z * T30);
+			      T4Y = FNMS(T2Z, T2W, T2V * T30);
+			      T3c = ri[WS(rs, 23)];
+			      T3e = ii[WS(rs, 23)];
+			      T3f = FMA(T3b, T3c, T3d * T3e);
+			      T4J = FNMS(T3d, T3c, T3b * T3e);
+			 }
+			 {
+			      E T33, T35, T38, T39;
+			      T33 = ri[WS(rs, 15)];
+			      T35 = ii[WS(rs, 15)];
+			      T36 = FMA(T32, T33, T34 * T35);
+			      T4Z = FNMS(T34, T33, T32 * T35);
+			      T38 = ri[WS(rs, 7)];
+			      T39 = ii[WS(rs, 7)];
+			      T3a = FMA(TR, T38, TS * T39);
+			      T4I = FNMS(TS, T38, TR * T39);
+			 }
+			 {
+			      E T37, T3g, T6M, T6N;
+			      T37 = T31 + T36;
+			      T3g = T3a + T3f;
+			      T3h = T37 + T3g;
+			      T6H = T37 - T3g;
+			      T6M = T4Y + T4Z;
+			      T6N = T4I + T4J;
+			      T6O = T6M - T6N;
+			      T7o = T6M + T6N;
+			 }
+			 {
+			      E T4H, T4K, T50, T51;
+			      T4H = T31 - T36;
+			      T4K = T4I - T4J;
+			      T4L = T4H - T4K;
+			      T5N = T4H + T4K;
+			      T50 = T4Y - T4Z;
+			      T51 = T3a - T3f;
+			      T52 = T50 + T51;
+			      T5Q = T50 - T51;
+			 }
+		    }
+		    {
+			 E TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
+			 {
+			      E TN, TP, T1b, T1f;
+			      TN = ri[WS(rs, 4)];
+			      TP = ii[WS(rs, 4)];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T3G = FNMS(TO, TN, TM * TP);
+			      T1b = ri[WS(rs, 12)];
+			      T1f = ii[WS(rs, 12)];
+			      T1g = FMA(T1a, T1b, T1e * T1f);
+			      T3N = FNMS(T1e, T1b, T1a * T1f);
+			 }
+			 {
+			      E TU, TW, T12, T16;
+			      TU = ri[WS(rs, 20)];
+			      TW = ii[WS(rs, 20)];
+			      TX = FMA(TT, TU, TV * TW);
+			      T3H = FNMS(TV, TU, TT * TW);
+			      T12 = ri[WS(rs, 28)];
+			      T16 = ii[WS(rs, 28)];
+			      T17 = FMA(T11, T12, T15 * T16);
+			      T3M = FNMS(T15, T12, T11 * T16);
+			 }
+			 {
+			      E TY, T1h, T6g, T6h;
+			      TY = TQ + TX;
+			      T1h = T17 + T1g;
+			      T1i = TY + T1h;
+			      T7V = T1h - TY;
+			      T6g = T3G + T3H;
+			      T6h = T3M + T3N;
+			      T6i = T6g - T6h;
+			      T7D = T6g + T6h;
+			 }
+			 {
+			      E T3I, T3J, T3L, T3O;
+			      T3I = T3G - T3H;
+			      T3J = TQ - TX;
+			      T3K = T3I - T3J;
+			      T5u = T3J + T3I;
+			      T3L = T17 - T1g;
+			      T3O = T3M - T3N;
+			      T3P = T3L + T3O;
+			      T5v = T3L - T3O;
+			 }
+		    }
+		    {
+			 E T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
+			 {
+			      E T1k, T1l, T1z, T1B;
+			      T1k = ri[WS(rs, 2)];
+			      T1l = ii[WS(rs, 2)];
+			      T1m = FMA(T8, T1k, Td * T1l);
+			      T3S = FNMS(Td, T1k, T8 * T1l);
+			      T1z = ri[WS(rs, 26)];
+			      T1B = ii[WS(rs, 26)];
+			      T1C = FMA(T1y, T1z, T1A * T1B);
+			      T3Z = FNMS(T1A, T1z, T1y * T1B);
+			 }
+			 {
+			      E T1o, T1q, T1u, T1w;
+			      T1o = ri[WS(rs, 18)];
+			      T1q = ii[WS(rs, 18)];
+			      T1r = FMA(T1n, T1o, T1p * T1q);
+			      T3T = FNMS(T1p, T1o, T1n * T1q);
+			      T1u = ri[WS(rs, 10)];
+			      T1w = ii[WS(rs, 10)];
+			      T1x = FMA(T1t, T1u, T1v * T1w);
+			      T3Y = FNMS(T1v, T1u, T1t * T1w);
+			 }
+			 {
+			      E T1s, T1D, T6k, T6l;
+			      T1s = T1m + T1r;
+			      T1D = T1x + T1C;
+			      T1E = T1s + T1D;
+			      T6n = T1s - T1D;
+			      T6k = T3S + T3T;
+			      T6l = T3Y + T3Z;
+			      T6m = T6k - T6l;
+			      T7e = T6k + T6l;
+			 }
+			 {
+			      E T3U, T3V, T3X, T40;
+			      T3U = T3S - T3T;
+			      T3V = T1x - T1C;
+			      T3W = T3U + T3V;
+			      T5y = T3U - T3V;
+			      T3X = T1m - T1r;
+			      T40 = T3Y - T3Z;
+			      T41 = T3X - T40;
+			      T5z = T3X + T40;
+			 }
+		    }
+		    {
+			 E T1J, T43, T27, T4a, T1U, T44, T20, T49;
+			 {
+			      E T1G, T1I, T24, T26;
+			      T1G = ri[WS(rs, 30)];
+			      T1I = ii[WS(rs, 30)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T43 = FNMS(T1H, T1G, T1F * T1I);
+			      T24 = ri[WS(rs, 22)];
+			      T26 = ii[WS(rs, 22)];
+			      T27 = FMA(T23, T24, T25 * T26);
+			      T4a = FNMS(T25, T24, T23 * T26);
+			 }
+			 {
+			      E T1R, T1T, T1X, T1Z;
+			      T1R = ri[WS(rs, 14)];
+			      T1T = ii[WS(rs, 14)];
+			      T1U = FMA(T1Q, T1R, T1S * T1T);
+			      T44 = FNMS(T1S, T1R, T1Q * T1T);
+			      T1X = ri[WS(rs, 6)];
+			      T1Z = ii[WS(rs, 6)];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T49 = FNMS(T1Y, T1X, T1W * T1Z);
+			 }
+			 {
+			      E T1V, T28, T6q, T6r;
+			      T1V = T1J + T1U;
+			      T28 = T20 + T27;
+			      T29 = T1V + T28;
+			      T6p = T1V - T28;
+			      T6q = T43 + T44;
+			      T6r = T49 + T4a;
+			      T6s = T6q - T6r;
+			      T7f = T6q + T6r;
+			 }
+			 {
+			      E T45, T46, T48, T4b;
+			      T45 = T43 - T44;
+			      T46 = T20 - T27;
+			      T47 = T45 + T46;
+			      T5B = T45 - T46;
+			      T48 = T1J - T1U;
+			      T4b = T49 - T4a;
+			      T4c = T48 - T4b;
+			      T5C = T48 + T4b;
+			 }
+		    }
+		    {
+			 E T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
+			 {
+			      E T2z, T2A, T2D, T2F;
+			      T2z = ri[WS(rs, 5)];
+			      T2A = ii[WS(rs, 5)];
+			      T2B = FMA(T21, T2z, T22 * T2A);
+			      T4r = FNMS(T22, T2z, T21 * T2A);
+			      T2D = ri[WS(rs, 21)];
+			      T2F = ii[WS(rs, 21)];
+			      T2G = FMA(T2C, T2D, T2E * T2F);
+			      T4s = FNMS(T2E, T2D, T2C * T2F);
+			 }
+			 T4q = T2B - T2G;
+			 T4t = T4r - T4s;
+			 {
+			      E T2J, T2L, T2N, T2O;
+			      T2J = ri[WS(rs, 29)];
+			      T2L = ii[WS(rs, 29)];
+			      T2M = FMA(T2I, T2J, T2K * T2L);
+			      T4m = FNMS(T2K, T2J, T2I * T2L);
+			      T2N = ri[WS(rs, 13)];
+			      T2O = ii[WS(rs, 13)];
+			      T2P = FMA(T1M, T2N, T1P * T2O);
+			      T4n = FNMS(T1P, T2N, T1M * T2O);
+			 }
+			 T4l = T2M - T2P;
+			 T4o = T4m - T4n;
+			 {
+			      E T2H, T2Q, T6C, T6D;
+			      T2H = T2B + T2G;
+			      T2Q = T2M + T2P;
+			      T2R = T2H + T2Q;
+			      T6z = T2Q - T2H;
+			      T6C = T4r + T4s;
+			      T6D = T4m + T4n;
+			      T6E = T6C - T6D;
+			      T7k = T6C + T6D;
+			 }
+			 {
+			      E T4p, T4u, T4C, T4D;
+			      T4p = T4l - T4o;
+			      T4u = T4q + T4t;
+			      T4v = KP707106781 * (T4p - T4u);
+			      T5H = KP707106781 * (T4u + T4p);
+			      T4C = T4t - T4q;
+			      T4D = T4l + T4o;
+			      T4E = KP707106781 * (T4C - T4D);
+			      T5K = KP707106781 * (T4C + T4D);
+			 }
+		    }
+		    {
+			 E T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
+			 {
+			      E T3i, T3j, T3m, T3o;
+			      T3i = ri[WS(rs, 3)];
+			      T3j = ii[WS(rs, 3)];
+			      T3k = FMA(T3, T3i, T6 * T3j);
+			      T4M = FNMS(T6, T3i, T3 * T3j);
+			      T3m = ri[WS(rs, 19)];
+			      T3o = ii[WS(rs, 19)];
+			      T3p = FMA(T3l, T3m, T3n * T3o);
+			      T4N = FNMS(T3n, T3m, T3l * T3o);
+			 }
+			 T4O = T4M - T4N;
+			 T4P = T3k - T3p;
+			 {
+			      E T3r, T3s, T3u, T3v;
+			      T3r = ri[WS(rs, 27)];
+			      T3s = ii[WS(rs, 27)];
+			      T3t = FMA(Th, T3r, Tl * T3s);
+			      T4S = FNMS(Tl, T3r, Th * T3s);
+			      T3u = ri[WS(rs, 11)];
+			      T3v = ii[WS(rs, 11)];
+			      T3w = FMA(Tg, T3u, Tk * T3v);
+			      T4T = FNMS(Tk, T3u, Tg * T3v);
+			 }
+			 T4R = T3t - T3w;
+			 T4U = T4S - T4T;
+			 {
+			      E T3q, T3x, T6I, T6J;
+			      T3q = T3k + T3p;
+			      T3x = T3t + T3w;
+			      T3y = T3q + T3x;
+			      T6P = T3x - T3q;
+			      T6I = T4M + T4N;
+			      T6J = T4S + T4T;
+			      T6K = T6I - T6J;
+			      T7p = T6I + T6J;
+			 }
+			 {
+			      E T4Q, T4V, T53, T54;
+			      T4Q = T4O - T4P;
+			      T4V = T4R + T4U;
+			      T4W = KP707106781 * (T4Q - T4V);
+			      T5R = KP707106781 * (T4Q + T4V);
+			      T53 = T4R - T4U;
+			      T54 = T4P + T4O;
+			      T55 = KP707106781 * (T53 - T54);
+			      T5O = KP707106781 * (T54 + T53);
+			 }
+		    }
+		    {
+			 E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
+			 {
+			      E T1j, T2a, T7C, T7J;
+			      T1j = TL + T1i;
+			      T2a = T1E + T29;
+			      T2b = T1j + T2a;
+			      T7x = T1j - T2a;
+			      T7C = T7e + T7f;
+			      T7J = T7D + T7I;
+			      T7K = T7C + T7J;
+			      T7M = T7J - T7C;
+			 }
+			 {
+			      E T2S, T3z, T7y, T7z;
+			      T2S = T2y + T2R;
+			      T3z = T3h + T3y;
+			      T3A = T2S + T3z;
+			      T7L = T3z - T2S;
+			      T7y = T7j + T7k;
+			      T7z = T7o + T7p;
+			      T7A = T7y - T7z;
+			      T7B = T7y + T7z;
+			 }
+			 ri[WS(rs, 16)] = T2b - T3A;
+			 ii[WS(rs, 16)] = T7K - T7B;
+			 ri[0] = T2b + T3A;
+			 ii[0] = T7B + T7K;
+			 ri[WS(rs, 24)] = T7x - T7A;
+			 ii[WS(rs, 24)] = T7M - T7L;
+			 ri[WS(rs, 8)] = T7x + T7A;
+			 ii[WS(rs, 8)] = T7L + T7M;
+		    }
+		    {
+			 E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
+			 {
+			      E T7d, T7g, T7O, T7P;
+			      T7d = TL - T1i;
+			      T7g = T7e - T7f;
+			      T7h = T7d + T7g;
+			      T7t = T7d - T7g;
+			      T7O = T29 - T1E;
+			      T7P = T7I - T7D;
+			      T7Q = T7O + T7P;
+			      T7S = T7P - T7O;
+			 }
+			 {
+			      E T7i, T7l, T7n, T7q;
+			      T7i = T2y - T2R;
+			      T7l = T7j - T7k;
+			      T7m = T7i + T7l;
+			      T7u = T7l - T7i;
+			      T7n = T3h - T3y;
+			      T7q = T7o - T7p;
+			      T7r = T7n - T7q;
+			      T7v = T7n + T7q;
+			 }
+			 {
+			      E T7s, T7N, T7w, T7R;
+			      T7s = KP707106781 * (T7m + T7r);
+			      ri[WS(rs, 20)] = T7h - T7s;
+			      ri[WS(rs, 4)] = T7h + T7s;
+			      T7N = KP707106781 * (T7u + T7v);
+			      ii[WS(rs, 4)] = T7N + T7Q;
+			      ii[WS(rs, 20)] = T7Q - T7N;
+			      T7w = KP707106781 * (T7u - T7v);
+			      ri[WS(rs, 28)] = T7t - T7w;
+			      ri[WS(rs, 12)] = T7t + T7w;
+			      T7R = KP707106781 * (T7r - T7m);
+			      ii[WS(rs, 12)] = T7R + T7S;
+			      ii[WS(rs, 28)] = T7S - T7R;
+			 }
+		    }
+		    {
+			 E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
+			 E T6V;
+			 {
+			      E T6o, T6t, T6A, T6F;
+			      T6j = T6f - T6i;
+			      T7X = T7V + T7W;
+			      T83 = T7W - T7V;
+			      T6X = T6f + T6i;
+			      T6o = T6m - T6n;
+			      T6t = T6p + T6s;
+			      T6u = KP707106781 * (T6o - T6t);
+			      T7U = KP707106781 * (T6o + T6t);
+			      {
+				   E T75, T76, T6Y, T6Z;
+				   T75 = T6H + T6K;
+				   T76 = T6O + T6P;
+				   T77 = FNMS(KP382683432, T76, KP923879532 * T75);
+				   T7b = FMA(KP923879532, T76, KP382683432 * T75);
+				   T6Y = T6n + T6m;
+				   T6Z = T6p - T6s;
+				   T70 = KP707106781 * (T6Y + T6Z);
+				   T82 = KP707106781 * (T6Z - T6Y);
+			      }
+			      T6A = T6y - T6z;
+			      T6F = T6B - T6E;
+			      T6G = FMA(KP923879532, T6A, KP382683432 * T6F);
+			      T6U = FNMS(KP923879532, T6F, KP382683432 * T6A);
+			      {
+				   E T72, T73, T6L, T6Q;
+				   T72 = T6y + T6z;
+				   T73 = T6B + T6E;
+				   T74 = FMA(KP382683432, T72, KP923879532 * T73);
+				   T7a = FNMS(KP382683432, T73, KP923879532 * T72);
+				   T6L = T6H - T6K;
+				   T6Q = T6O - T6P;
+				   T6R = FNMS(KP923879532, T6Q, KP382683432 * T6L);
+				   T6V = FMA(KP382683432, T6Q, KP923879532 * T6L);
+			      }
+			 }
+			 {
+			      E T6v, T6S, T81, T84;
+			      T6v = T6j + T6u;
+			      T6S = T6G + T6R;
+			      ri[WS(rs, 22)] = T6v - T6S;
+			      ri[WS(rs, 6)] = T6v + T6S;
+			      T81 = T6U + T6V;
+			      T84 = T82 + T83;
+			      ii[WS(rs, 6)] = T81 + T84;
+			      ii[WS(rs, 22)] = T84 - T81;
+			 }
+			 {
+			      E T6T, T6W, T85, T86;
+			      T6T = T6j - T6u;
+			      T6W = T6U - T6V;
+			      ri[WS(rs, 30)] = T6T - T6W;
+			      ri[WS(rs, 14)] = T6T + T6W;
+			      T85 = T6R - T6G;
+			      T86 = T83 - T82;
+			      ii[WS(rs, 14)] = T85 + T86;
+			      ii[WS(rs, 30)] = T86 - T85;
+			 }
+			 {
+			      E T71, T78, T7T, T7Y;
+			      T71 = T6X + T70;
+			      T78 = T74 + T77;
+			      ri[WS(rs, 18)] = T71 - T78;
+			      ri[WS(rs, 2)] = T71 + T78;
+			      T7T = T7a + T7b;
+			      T7Y = T7U + T7X;
+			      ii[WS(rs, 2)] = T7T + T7Y;
+			      ii[WS(rs, 18)] = T7Y - T7T;
+			 }
+			 {
+			      E T79, T7c, T7Z, T80;
+			      T79 = T6X - T70;
+			      T7c = T7a - T7b;
+			      ri[WS(rs, 26)] = T79 - T7c;
+			      ri[WS(rs, 10)] = T79 + T7c;
+			      T7Z = T77 - T74;
+			      T80 = T7X - T7U;
+			      ii[WS(rs, 10)] = T7Z + T80;
+			      ii[WS(rs, 26)] = T80 - T7Z;
+			 }
+		    }
+		    {
+			 E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
+			 E T5b, T3Q, T8p;
+			 T3Q = KP707106781 * (T3K - T3P);
+			 T3R = T3F - T3Q;
+			 T5d = T3F + T3Q;
+			 T8p = KP707106781 * (T5v - T5u);
+			 T8r = T8p + T8q;
+			 T8x = T8q - T8p;
+			 {
+			      E T42, T4d, T5l, T5m;
+			      T42 = FNMS(KP923879532, T41, KP382683432 * T3W);
+			      T4d = FMA(KP382683432, T47, KP923879532 * T4c);
+			      T4e = T42 - T4d;
+			      T8o = T42 + T4d;
+			      T5l = T4L + T4W;
+			      T5m = T52 + T55;
+			      T5n = FNMS(KP555570233, T5m, KP831469612 * T5l);
+			      T5r = FMA(KP831469612, T5m, KP555570233 * T5l);
+			 }
+			 {
+			      E T4w, T4F, T5e, T5f;
+			      T4w = T4k - T4v;
+			      T4F = T4B - T4E;
+			      T4G = FMA(KP980785280, T4w, KP195090322 * T4F);
+			      T5a = FNMS(KP980785280, T4F, KP195090322 * T4w);
+			      T5e = FMA(KP923879532, T3W, KP382683432 * T41);
+			      T5f = FNMS(KP923879532, T47, KP382683432 * T4c);
+			      T5g = T5e + T5f;
+			      T8w = T5f - T5e;
+			 }
+			 {
+			      E T5i, T5j, T4X, T56;
+			      T5i = T4k + T4v;
+			      T5j = T4B + T4E;
+			      T5k = FMA(KP555570233, T5i, KP831469612 * T5j);
+			      T5q = FNMS(KP555570233, T5j, KP831469612 * T5i);
+			      T4X = T4L - T4W;
+			      T56 = T52 - T55;
+			      T57 = FNMS(KP980785280, T56, KP195090322 * T4X);
+			      T5b = FMA(KP195090322, T56, KP980785280 * T4X);
+			 }
+			 {
+			      E T4f, T58, T8v, T8y;
+			      T4f = T3R + T4e;
+			      T58 = T4G + T57;
+			      ri[WS(rs, 23)] = T4f - T58;
+			      ri[WS(rs, 7)] = T4f + T58;
+			      T8v = T5a + T5b;
+			      T8y = T8w + T8x;
+			      ii[WS(rs, 7)] = T8v + T8y;
+			      ii[WS(rs, 23)] = T8y - T8v;
+			 }
+			 {
+			      E T59, T5c, T8z, T8A;
+			      T59 = T3R - T4e;
+			      T5c = T5a - T5b;
+			      ri[WS(rs, 31)] = T59 - T5c;
+			      ri[WS(rs, 15)] = T59 + T5c;
+			      T8z = T57 - T4G;
+			      T8A = T8x - T8w;
+			      ii[WS(rs, 15)] = T8z + T8A;
+			      ii[WS(rs, 31)] = T8A - T8z;
+			 }
+			 {
+			      E T5h, T5o, T8n, T8s;
+			      T5h = T5d + T5g;
+			      T5o = T5k + T5n;
+			      ri[WS(rs, 19)] = T5h - T5o;
+			      ri[WS(rs, 3)] = T5h + T5o;
+			      T8n = T5q + T5r;
+			      T8s = T8o + T8r;
+			      ii[WS(rs, 3)] = T8n + T8s;
+			      ii[WS(rs, 19)] = T8s - T8n;
+			 }
+			 {
+			      E T5p, T5s, T8t, T8u;
+			      T5p = T5d - T5g;
+			      T5s = T5q - T5r;
+			      ri[WS(rs, 27)] = T5p - T5s;
+			      ri[WS(rs, 11)] = T5p + T5s;
+			      T8t = T5n - T5k;
+			      T8u = T8r - T8o;
+			      ii[WS(rs, 11)] = T8t + T8u;
+			      ii[WS(rs, 27)] = T8u - T8t;
+			 }
+		    }
+		    {
+			 E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
+			 E T5X, T5w, T89;
+			 T5w = KP707106781 * (T5u + T5v);
+			 T5x = T5t - T5w;
+			 T5Z = T5t + T5w;
+			 T89 = KP707106781 * (T3K + T3P);
+			 T8d = T89 + T8c;
+			 T8j = T8c - T89;
+			 {
+			      E T5A, T5D, T67, T68;
+			      T5A = FNMS(KP382683432, T5z, KP923879532 * T5y);
+			      T5D = FMA(KP923879532, T5B, KP382683432 * T5C);
+			      T5E = T5A - T5D;
+			      T88 = T5A + T5D;
+			      T67 = T5N + T5O;
+			      T68 = T5Q + T5R;
+			      T69 = FNMS(KP195090322, T68, KP980785280 * T67);
+			      T6d = FMA(KP195090322, T67, KP980785280 * T68);
+			 }
+			 {
+			      E T5I, T5L, T60, T61;
+			      T5I = T5G - T5H;
+			      T5L = T5J - T5K;
+			      T5M = FMA(KP555570233, T5I, KP831469612 * T5L);
+			      T5W = FNMS(KP831469612, T5I, KP555570233 * T5L);
+			      T60 = FMA(KP382683432, T5y, KP923879532 * T5z);
+			      T61 = FNMS(KP382683432, T5B, KP923879532 * T5C);
+			      T62 = T60 + T61;
+			      T8i = T61 - T60;
+			 }
+			 {
+			      E T64, T65, T5P, T5S;
+			      T64 = T5G + T5H;
+			      T65 = T5J + T5K;
+			      T66 = FMA(KP980785280, T64, KP195090322 * T65);
+			      T6c = FNMS(KP195090322, T64, KP980785280 * T65);
+			      T5P = T5N - T5O;
+			      T5S = T5Q - T5R;
+			      T5T = FNMS(KP831469612, T5S, KP555570233 * T5P);
+			      T5X = FMA(KP831469612, T5P, KP555570233 * T5S);
+			 }
+			 {
+			      E T5F, T5U, T8h, T8k;
+			      T5F = T5x + T5E;
+			      T5U = T5M + T5T;
+			      ri[WS(rs, 21)] = T5F - T5U;
+			      ri[WS(rs, 5)] = T5F + T5U;
+			      T8h = T5W + T5X;
+			      T8k = T8i + T8j;
+			      ii[WS(rs, 5)] = T8h + T8k;
+			      ii[WS(rs, 21)] = T8k - T8h;
+			 }
+			 {
+			      E T5V, T5Y, T8l, T8m;
+			      T5V = T5x - T5E;
+			      T5Y = T5W - T5X;
+			      ri[WS(rs, 29)] = T5V - T5Y;
+			      ri[WS(rs, 13)] = T5V + T5Y;
+			      T8l = T5T - T5M;
+			      T8m = T8j - T8i;
+			      ii[WS(rs, 13)] = T8l + T8m;
+			      ii[WS(rs, 29)] = T8m - T8l;
+			 }
+			 {
+			      E T63, T6a, T87, T8e;
+			      T63 = T5Z + T62;
+			      T6a = T66 + T69;
+			      ri[WS(rs, 17)] = T63 - T6a;
+			      ri[WS(rs, 1)] = T63 + T6a;
+			      T87 = T6c + T6d;
+			      T8e = T88 + T8d;
+			      ii[WS(rs, 1)] = T87 + T8e;
+			      ii[WS(rs, 17)] = T8e - T87;
+			 }
+			 {
+			      E T6b, T6e, T8f, T8g;
+			      T6b = T5Z - T62;
+			      T6e = T6c - T6d;
+			      ri[WS(rs, 25)] = T6b - T6e;
+			      ri[WS(rs, 9)] = T6b + T6e;
+			      T8f = T69 - T66;
+			      T8g = T8d - T88;
+			      ii[WS(rs, 9)] = T8f + T8g;
+			      ii[WS(rs, 25)] = T8g - T8f;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, {376, 168, 112, 0}, 0, 0, 0 };
+
+void X(codelet_t2_32) (planner *p) {
+     X(kdft_dit_register) (p, t2_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include t.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 33 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "t.h"
+
+static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E Ti, Tq, To, Te, Ty, Tz, Tm, Ts;
+	       {
+		    E T2, T6, T3, T5;
+		    T2 = W[0];
+		    T6 = W[3];
+		    T3 = W[2];
+		    T5 = W[1];
+		    {
+			 E T1, Tx, Td, Tw, Tj, Tl, Ta, T4, Tk, Tr;
+			 T1 = ri[0];
+			 Ta = T2 * T6;
+			 T4 = T2 * T3;
+			 Tx = ii[0];
+			 {
+			      E T8, Tb, T7, Tc;
+			      T8 = ri[WS(rs, 2)];
+			      Tb = FNMS(T5, T3, Ta);
+			      T7 = FMA(T5, T6, T4);
+			      Tc = ii[WS(rs, 2)];
+			      {
+				   E Tf, Th, T9, Tv, Tg, Tp;
+				   Tf = ri[WS(rs, 1)];
+				   Th = ii[WS(rs, 1)];
+				   T9 = T7 * T8;
+				   Tv = T7 * Tc;
+				   Tg = T2 * Tf;
+				   Tp = T2 * Th;
+				   Td = FMA(Tb, Tc, T9);
+				   Tw = FNMS(Tb, T8, Tv);
+				   Ti = FMA(T5, Th, Tg);
+				   Tq = FNMS(T5, Tf, Tp);
+			      }
+			      Tj = ri[WS(rs, 3)];
+			      Tl = ii[WS(rs, 3)];
+			 }
+			 To = T1 - Td;
+			 Te = T1 + Td;
+			 Ty = Tw + Tx;
+			 Tz = Tx - Tw;
+			 Tk = T3 * Tj;
+			 Tr = T3 * Tl;
+			 Tm = FMA(T6, Tl, Tk);
+			 Ts = FNMS(T6, Tj, Tr);
+		    }
+	       }
+	       {
+		    E Tn, TA, Tu, Tt;
+		    Tn = Ti + Tm;
+		    TA = Ti - Tm;
+		    Tu = Tq + Ts;
+		    Tt = Tq - Ts;
+		    ii[WS(rs, 3)] = TA + Tz;
+		    ii[WS(rs, 1)] = Tz - TA;
+		    ri[0] = Te + Tn;
+		    ri[WS(rs, 2)] = Te - Tn;
+		    ri[WS(rs, 1)] = To + Tt;
+		    ri[WS(rs, 3)] = To - Tt;
+		    ii[WS(rs, 2)] = Ty - Tu;
+		    ii[0] = Tu + Ty;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
+
+void X(codelet_t2_4) (planner *p) {
+     X(kdft_dit_register) (p, t2_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include t.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "t.h"
+
+static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T2, T4, T3, T5, T6, T8;
+	       T2 = W[0];
+	       T4 = W[1];
+	       T3 = W[2];
+	       T5 = W[3];
+	       T6 = FMA(T2, T3, T4 * T5);
+	       T8 = FNMS(T4, T3, T2 * T5);
+	       {
+		    E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
+		    T1 = ri[0];
+		    Tp = ii[0];
+		    T7 = ri[WS(rs, 2)];
+		    T9 = ii[WS(rs, 2)];
+		    Ta = FMA(T6, T7, T8 * T9);
+		    To = FNMS(T8, T7, T6 * T9);
+		    {
+			 E Tc, Td, Tf, Tg;
+			 Tc = ri[WS(rs, 1)];
+			 Td = ii[WS(rs, 1)];
+			 Te = FMA(T2, Tc, T4 * Td);
+			 Tk = FNMS(T4, Tc, T2 * Td);
+			 Tf = ri[WS(rs, 3)];
+			 Tg = ii[WS(rs, 3)];
+			 Th = FMA(T3, Tf, T5 * Tg);
+			 Tl = FNMS(T5, Tf, T3 * Tg);
+		    }
+		    {
+			 E Tb, Ti, Tn, Tq;
+			 Tb = T1 + Ta;
+			 Ti = Te + Th;
+			 ri[WS(rs, 2)] = Tb - Ti;
+			 ri[0] = Tb + Ti;
+			 Tn = Tk + Tl;
+			 Tq = To + Tp;
+			 ii[0] = Tn + Tq;
+			 ii[WS(rs, 2)] = Tq - Tn;
+		    }
+		    {
+			 E Tj, Tm, Tr, Ts;
+			 Tj = T1 - Ta;
+			 Tm = Tk - Tl;
+			 ri[WS(rs, 3)] = Tj - Tm;
+			 ri[WS(rs, 1)] = Tj + Tm;
+			 Tr = Tp - To;
+			 Ts = Te - Th;
+			 ii[WS(rs, 1)] = Tr - Ts;
+			 ii[WS(rs, 3)] = Ts + Tr;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
+
+void X(codelet_t2_4) (planner *p) {
+     X(kdft_dit_register) (p, t2_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include t.h */
+
+/*
+ * This function contains 44 FP additions, 40 FP multiplications,
+ * (or, 14 additions, 10 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t.h"
+
+static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E Ta, T1, TO, Tp, TS, Ti, TL, TC, To, TE, Ts, TF, T2, T8, T5;
+	       E TT, Tt, TG;
+	       T2 = W[0];
+	       Ta = W[3];
+	       T8 = W[2];
+	       T5 = W[1];
+	       {
+		    E Tq, Tr, Te, T9;
+		    T1 = ri[0];
+		    Te = T2 * Ta;
+		    T9 = T2 * T8;
+		    TO = ii[0];
+		    {
+			 E T3, Tf, Tm, Tj, Tb, T4, T6, Tc, Tg;
+			 T3 = ri[WS(rs, 1)];
+			 Tf = FMA(T5, T8, Te);
+			 Tm = FNMS(T5, T8, Te);
+			 Tj = FMA(T5, Ta, T9);
+			 Tb = FNMS(T5, Ta, T9);
+			 T4 = T2 * T3;
+			 T6 = ii[WS(rs, 1)];
+			 Tc = ri[WS(rs, 4)];
+			 Tg = ii[WS(rs, 4)];
+			 {
+			      E Tk, Tl, Tn, TD;
+			      {
+				   E T7, Tz, Th, TB, Ty, Td, TA;
+				   Tk = ri[WS(rs, 2)];
+				   T7 = FMA(T5, T6, T4);
+				   Ty = T2 * T6;
+				   Td = Tb * Tc;
+				   TA = Tb * Tg;
+				   Tl = Tj * Tk;
+				   Tz = FNMS(T5, T3, Ty);
+				   Th = FMA(Tf, Tg, Td);
+				   TB = FNMS(Tf, Tc, TA);
+				   Tn = ii[WS(rs, 2)];
+				   Tp = ri[WS(rs, 3)];
+				   TS = T7 - Th;
+				   Ti = T7 + Th;
+				   TL = Tz + TB;
+				   TC = Tz - TB;
+				   TD = Tj * Tn;
+				   Tq = T8 * Tp;
+				   Tr = ii[WS(rs, 3)];
+			      }
+			      To = FMA(Tm, Tn, Tl);
+			      TE = FNMS(Tm, Tk, TD);
+			 }
+		    }
+		    Ts = FMA(Ta, Tr, Tq);
+		    TF = T8 * Tr;
+	       }
+	       TT = To - Ts;
+	       Tt = To + Ts;
+	       TG = FNMS(Ta, Tp, TF);
+	       {
+		    E TU, TW, TV, TR, Tw, Tu;
+		    TU = FMA(KP618033988, TT, TS);
+		    TW = FNMS(KP618033988, TS, TT);
+		    Tw = Ti - Tt;
+		    Tu = Ti + Tt;
+		    {
+			 E TM, TH, Tv, TI, TK;
+			 TM = TE + TG;
+			 TH = TE - TG;
+			 ri[0] = T1 + Tu;
+			 Tv = FNMS(KP250000000, Tu, T1);
+			 TI = FMA(KP618033988, TH, TC);
+			 TK = FNMS(KP618033988, TC, TH);
+			 {
+			      E TQ, TN, TJ, Tx, TP;
+			      TQ = TL - TM;
+			      TN = TL + TM;
+			      TJ = FNMS(KP559016994, Tw, Tv);
+			      Tx = FMA(KP559016994, Tw, Tv);
+			      ii[0] = TN + TO;
+			      TP = FNMS(KP250000000, TN, TO);
+			      ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
+			      ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
+			      ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
+			      ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
+			      TV = FNMS(KP559016994, TQ, TP);
+			      TR = FMA(KP559016994, TQ, TP);
+			 }
+		    }
+		    ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
+		    ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
+		    ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
+		    ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, {14, 10, 30, 0}, 0, 0, 0 };
+
+void X(codelet_t2_5) (planner *p) {
+     X(kdft_dit_register) (p, t2_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include t.h */
+
+/*
+ * This function contains 44 FP additions, 32 FP multiplications,
+ * (or, 30 additions, 18 multiplications, 14 fused multiply/add),
+ * 37 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t.h"
+
+static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
+	       {
+		    E T8, Te, Ta, Td;
+		    T2 = W[0];
+		    T4 = W[1];
+		    T7 = W[2];
+		    T9 = W[3];
+		    T8 = T2 * T7;
+		    Te = T4 * T7;
+		    Ta = T4 * T9;
+		    Td = T2 * T9;
+		    Tb = T8 - Ta;
+		    Tl = Td - Te;
+		    Tf = Td + Te;
+		    Tj = T8 + Ta;
+	       }
+	       {
+		    E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
+		    T1 = ri[0];
+		    TI = ii[0];
+		    {
+			 E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
+			 {
+			      E T3, T5, To, Tp;
+			      T3 = ri[WS(rs, 1)];
+			      T5 = ii[WS(rs, 1)];
+			      T6 = FMA(T2, T3, T4 * T5);
+			      Tw = FNMS(T4, T3, T2 * T5);
+			      To = ri[WS(rs, 3)];
+			      Tp = ii[WS(rs, 3)];
+			      Tq = FMA(T7, To, T9 * Tp);
+			      TA = FNMS(T9, To, T7 * Tp);
+			 }
+			 {
+			      E Tc, Tg, Tk, Tm;
+			      Tc = ri[WS(rs, 4)];
+			      Tg = ii[WS(rs, 4)];
+			      Th = FMA(Tb, Tc, Tf * Tg);
+			      Tx = FNMS(Tf, Tc, Tb * Tg);
+			      Tk = ri[WS(rs, 2)];
+			      Tm = ii[WS(rs, 2)];
+			      Tn = FMA(Tj, Tk, Tl * Tm);
+			      Tz = FNMS(Tl, Tk, Tj * Tm);
+			 }
+			 Ty = Tw - Tx;
+			 TB = Tz - TA;
+			 TN = Tn - Tq;
+			 TM = T6 - Th;
+			 TF = Tw + Tx;
+			 TG = Tz + TA;
+			 TH = TF + TG;
+			 Ti = T6 + Th;
+			 Tr = Tn + Tq;
+			 Ts = Ti + Tr;
+		    }
+		    ri[0] = T1 + Ts;
+		    ii[0] = TH + TI;
+		    {
+			 E TC, TE, Tv, TD, Tt, Tu;
+			 TC = FMA(KP951056516, Ty, KP587785252 * TB);
+			 TE = FNMS(KP587785252, Ty, KP951056516 * TB);
+			 Tt = KP559016994 * (Ti - Tr);
+			 Tu = FNMS(KP250000000, Ts, T1);
+			 Tv = Tt + Tu;
+			 TD = Tu - Tt;
+			 ri[WS(rs, 4)] = Tv - TC;
+			 ri[WS(rs, 3)] = TD + TE;
+			 ri[WS(rs, 1)] = Tv + TC;
+			 ri[WS(rs, 2)] = TD - TE;
+		    }
+		    {
+			 E TO, TP, TL, TQ, TJ, TK;
+			 TO = FMA(KP951056516, TM, KP587785252 * TN);
+			 TP = FNMS(KP587785252, TM, KP951056516 * TN);
+			 TJ = KP559016994 * (TF - TG);
+			 TK = FNMS(KP250000000, TH, TI);
+			 TL = TJ + TK;
+			 TQ = TK - TJ;
+			 ii[WS(rs, 1)] = TL - TO;
+			 ii[WS(rs, 3)] = TQ - TP;
+			 ii[WS(rs, 4)] = TO + TL;
+			 ii[WS(rs, 2)] = TP + TQ;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, {30, 18, 14, 0}, 0, 0, 0 };
+
+void X(codelet_t2_5) (planner *p) {
+     X(kdft_dit_register) (p, t2_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4096 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 64 -name t2_64 -include t.h */
+
+/*
+ * This function contains 1154 FP additions, 840 FP multiplications,
+ * (or, 520 additions, 206 multiplications, 634 fused multiply/add),
+ * 349 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "t.h"
+
+static void t2_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tg0, TlC, TlB, Tg3;
+	       {
+		    E T2, T3, Tc, T8, Te, T5, T6, T14, T3d, T3i, TJ, T7, Tr, T3g, TG;
+		    E T10, T3a, TL, TP, Tb, Tt, T17, Td, Ti, T3N, T3R, T1i, Tu, T1I, T2U;
+		    E T1t, T3U, T5O, T48, T2u, T7B, TK, T79, T3D, T2h, T2l, T3G, T1x, T3X, T2d;
+		    E T1M, T2X, T4B, T4x, T3j, T4T, T29, T5s, T81, T5w, T7X, T7N, T7h, T64, T6a;
+		    E T6e, T7l, T60, T7R, T6h, T5A, T7o, T6J, T6k, T5E, T6N, T7r, T6x, T6t, T7c;
+		    E TO, T2x, T7E, TU, TQ, T2C, T2y, T5R, T4b, T4c, T4g, T4W, T3m, T3r, T3n;
+		    E T1k, Tx, Ty, T4p, T4s, TC, T23, T1Z, T19, Th, T31, T35, T1e, T44, T41;
+		    E T1a, T6W, T70, T55, T59, T3v, T3z, Tf, T1R, T2N, T2Q, T1V, T1p, T1l, Tm;
+		    {
+			 E T1H, T1s, T2g, Tg, Tw, TH, T2t, T47, T3h, T3M, T4w, T28, T3Q, T4A, T2c;
+			 E Ts;
+			 {
+			      E T4, T13, TI, TF, TZ, Ta, T9;
+			      T2 = W[0];
+			      T3 = W[2];
+			      Tc = W[5];
+			      T8 = W[4];
+			      Te = W[6];
+			      T4 = T2 * T3;
+			      T13 = T2 * Tc;
+			      TI = T3 * Tc;
+			      TF = T3 * T8;
+			      T1H = T8 * Te;
+			      TZ = T2 * T8;
+			      T5 = W[1];
+			      T6 = W[3];
+			      T1s = T3 * Te;
+			      T2g = T2 * Te;
+			      T14 = FNMS(T5, T8, T13);
+			      T3d = FMA(T5, T8, T13);
+			      T3i = FNMS(T6, T8, TI);
+			      TJ = FMA(T6, T8, TI);
+			      T7 = FNMS(T5, T6, T4);
+			      Tr = FMA(T5, T6, T4);
+			      Ta = T2 * T6;
+			      Tg = T7 * Tc;
+			      Tw = Tr * Tc;
+			      T3g = FMA(T6, Tc, TF);
+			      TG = FNMS(T6, Tc, TF);
+			      T10 = FMA(T5, Tc, TZ);
+			      T3a = FNMS(T5, Tc, TZ);
+			      TH = TG * Te;
+			      T2t = T10 * Te;
+			      T47 = T3a * Te;
+			      T3h = T3g * Te;
+			      TL = W[8];
+			      TP = W[9];
+			      T9 = T7 * T8;
+			      Tb = FMA(T5, T3, Ta);
+			      Tt = FNMS(T5, T3, Ta);
+			      T3M = T2 * TL;
+			      T4w = T8 * TL;
+			      T28 = T3 * TL;
+			      T3Q = T2 * TP;
+			      T4A = T8 * TP;
+			      T2c = T3 * TP;
+			      T17 = FNMS(Tb, Tc, T9);
+			      Td = FMA(Tb, Tc, T9);
+			      Ts = Tr * T8;
+			      Ti = W[7];
+			 }
+			 {
+			      E T5r, T80, T1L, T2k, T1w, T5z, T2B, T2v;
+			      T3N = FMA(T5, TP, T3M);
+			      T3R = FNMS(T5, TL, T3Q);
+			      T1i = FMA(Tt, Tc, Ts);
+			      Tu = FNMS(Tt, Tc, Ts);
+			      T1I = FNMS(Tc, Ti, T1H);
+			      T2U = FMA(Tc, Ti, T1H);
+			      T1t = FMA(T6, Ti, T1s);
+			      T3U = FNMS(T6, Ti, T1s);
+			      T5O = FNMS(T3d, Ti, T47);
+			      T48 = FMA(T3d, Ti, T47);
+			      T2u = FMA(T14, Ti, T2t);
+			      T7B = FNMS(T14, Ti, T2t);
+			      T1L = T8 * Ti;
+			      T2k = T2 * Ti;
+			      T1w = T3 * Ti;
+			      TK = FMA(TJ, Ti, TH);
+			      T79 = FNMS(TJ, Ti, TH);
+			      T3D = FMA(T5, Ti, T2g);
+			      T2h = FNMS(T5, Ti, T2g);
+			      T2l = FMA(T5, Te, T2k);
+			      T3G = FNMS(T5, Te, T2k);
+			      T1x = FNMS(T6, Te, T1w);
+			      T3X = FMA(T6, Te, T1w);
+			      T2d = FNMS(T6, TL, T2c);
+			      T1M = FMA(Tc, Te, T1L);
+			      T2X = FNMS(Tc, Te, T1L);
+			      T4B = FNMS(Tc, TL, T4A);
+			      T4x = FMA(Tc, TP, T4w);
+			      T3j = FMA(T3i, Ti, T3h);
+			      T4T = FNMS(T3i, Ti, T3h);
+			      T29 = FMA(T6, TP, T28);
+			      T5r = T3g * TL;
+			      T80 = T7 * TP;
+			      {
+				   E T7M, T7g, T63, T5v, T7W;
+				   T5v = T3g * TP;
+				   T7W = T7 * TL;
+				   T5s = FMA(T3i, TP, T5r);
+				   T81 = FNMS(Tb, TL, T80);
+				   T5w = FNMS(T3i, TL, T5v);
+				   T7X = FMA(Tb, TP, T7W);
+				   T7M = TG * TL;
+				   T7g = T10 * TL;
+				   T63 = T3a * TP;
+				   {
+					E T6d, T7k, T69, T5Z, T7Q;
+					T69 = Tr * TL;
+					T7N = FMA(TJ, TP, T7M);
+					T7h = FMA(T14, TP, T7g);
+					T64 = FNMS(T3d, TL, T63);
+					T6a = FMA(Tt, TP, T69);
+					T6d = Tr * TP;
+					T7k = T10 * TP;
+					T5Z = T3a * TL;
+					T7Q = TG * TP;
+					T6e = FNMS(Tt, TL, T6d);
+					T7l = FNMS(T14, TL, T7k);
+					T60 = FMA(T3d, TP, T5Z);
+					T7R = FNMS(TJ, TL, T7Q);
+					T5z = Tr * Te;
+				   }
+			      }
+			      {
+				   E T6I, T5D, T6M, T6s, T6w;
+				   T6I = T7 * Te;
+				   T5D = Tr * Ti;
+				   T6M = T7 * Ti;
+				   T6h = FNMS(Tt, Ti, T5z);
+				   T5A = FMA(Tt, Ti, T5z);
+				   T7o = FMA(Tb, Ti, T6I);
+				   T6J = FNMS(Tb, Ti, T6I);
+				   T6k = FMA(Tt, Te, T5D);
+				   T5E = FNMS(Tt, Te, T5D);
+				   T6N = FMA(Tb, Te, T6M);
+				   T7r = FNMS(Tb, Te, T6M);
+				   T6s = T2U * TL;
+				   T6w = T2U * TP;
+				   {
+					E TN, TT, TM, T2w;
+					TN = TG * Ti;
+					T2w = T10 * Ti;
+					T6x = FNMS(T2X, TL, T6w);
+					T6t = FMA(T2X, TP, T6s);
+					T7c = FMA(TJ, Te, TN);
+					TO = FNMS(TJ, Te, TN);
+					TT = TK * TP;
+					TM = TK * TL;
+					T2x = FNMS(T14, Te, T2w);
+					T7E = FMA(T14, Te, T2w);
+					TU = FNMS(TO, TL, TT);
+					TQ = FMA(TO, TP, TM);
+					T2B = T2u * TP;
+					T2v = T2u * TL;
+				   }
+			      }
+			      {
+				   E T1Y, T22, Tv, TB;
+				   {
+					E T49, T4f, T4a, T3l, T3q, T3k;
+					T4a = T3a * Ti;
+					T2C = FNMS(T2x, TL, T2B);
+					T2y = FMA(T2x, TP, T2v);
+					T5R = FMA(T3d, Te, T4a);
+					T4b = FNMS(T3d, Te, T4a);
+					T49 = T48 * TL;
+					T4f = T48 * TP;
+					T3l = T3g * Ti;
+					T4c = FMA(T4b, TP, T49);
+					T4g = FNMS(T4b, TL, T4f);
+					T4W = FMA(T3i, Te, T3l);
+					T3m = FNMS(T3i, Te, T3l);
+					T1Y = Tu * TL;
+					T3q = T3j * TP;
+					T3k = T3j * TL;
+					T22 = Tu * TP;
+					Tv = Tu * Te;
+					T3r = FNMS(T3m, TL, T3q);
+					T3n = FMA(T3m, TP, T3k);
+					TB = Tu * Ti;
+					T1k = FNMS(Tt, T8, Tw);
+					Tx = FMA(Tt, T8, Tw);
+				   }
+				   {
+					E T30, T34, T18, T1d;
+					T30 = T17 * TL;
+					T34 = T17 * TP;
+					T18 = T17 * Te;
+					Ty = FMA(Tx, Ti, Tv);
+					T4p = FNMS(Tx, Ti, Tv);
+					T4s = FMA(Tx, Te, TB);
+					TC = FNMS(Tx, Te, TB);
+					T23 = FNMS(Tx, TL, T22);
+					T1Z = FMA(Tx, TP, T1Y);
+					T1d = T17 * Ti;
+					T19 = FMA(Tb, T8, Tg);
+					Th = FNMS(Tb, T8, Tg);
+					{
+					     E T1j, T1o, T1Q, T1U;
+					     T1j = T1i * TL;
+					     {
+						  E T6V, T6Z, T54, T58;
+						  T6V = Ty * TL;
+						  T6Z = Ty * TP;
+						  T31 = FMA(T19, TP, T30);
+						  T35 = FNMS(T19, TL, T34);
+						  T1e = FMA(T19, Te, T1d);
+						  T44 = FNMS(T19, Te, T1d);
+						  T41 = FMA(T19, Ti, T18);
+						  T1a = FNMS(T19, Ti, T18);
+						  T6W = FMA(TC, TP, T6V);
+						  T70 = FNMS(TC, TL, T6Z);
+						  T1o = T1i * TP;
+						  T54 = T41 * TL;
+						  T58 = T41 * TP;
+						  T1Q = T1i * Te;
+						  T1U = T1i * Ti;
+						  T55 = FMA(T44, TP, T54);
+						  T59 = FNMS(T44, TL, T58);
+					     }
+					     T3v = Td * TL;
+					     T3z = Td * TP;
+					     Tf = Td * Te;
+					     T1R = FMA(T1k, Ti, T1Q);
+					     T2N = FNMS(T1k, Ti, T1Q);
+					     T2Q = FMA(T1k, Te, T1U);
+					     T1V = FNMS(T1k, Te, T1U);
+					     T1p = FNMS(T1k, TL, T1o);
+					     T1l = FMA(T1k, TP, T1j);
+					     Tm = Td * Ti;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E Tl9, TlD, TY, Tg4, T8w, TdS, TkE, Tkd, T2G, Tge, Tgh, TiK, Te1, T98, Te0;
+			 E T9f, Te5, T9p, Tgq, T39, Te8, T9M, TiN, Tgn, TeE, TbI, Thr, T74, TeP, TcB;
+			 E Tja, Thc, T8D, TdT, T1B, TkD, T8K, TdU, Tg7, Tk7, T8T, TdY, T27, Tg9, T90;
+			 E TdX, Tgc, TiJ, T9Y, Tec, T4k, TgB, Tal, Tef, Tgy, TiT, Taz, Tel, T5d, Th0;
+			 E Tbs, Tew, TgL, TiZ, T3K, Tgo, Tgt, TiO, T9P, Te6, T9E, Te9, T4L, Tgz, TgE;
+			 E TiU, Tao, Ted, Tad, Teg, T5I, TgM, Th3, Tj0, Tbv, Tem, TaO, Tex, T7v, Thd;
+			 E Thu, Tjb, TcE, TeF, TbX, TeQ, T68, Tj5, Tez, Teq, Tbj, Tbx, TgS, Th5, T6B;
+			 E Tj6, TeA, Tet, Tb4, Tby, TgX, Th6, T7V, Tjg, TeS, TeJ, Tcs, TcG, Thj, Thw;
+			 E T84, T83, T85, Tc7, T8k, Tc3, T86, T89, T8b;
+			 {
+			      E T3w, T3A, T4H, T4E, T8e, T8i, T5j, T5n, T4U, T4S, T4V, Tau, T5b, Tbq, T4X;
+			      E T50, T52;
+			      {
+				   E T72, Tcz, Tcv, T6Q, Tha, TbG, T6U, Tcx, T99, T9e;
+				   {
+					E T1, Tkb, Tp, Tka, TR, TV, TE, T8s, TS, T8t;
+					{
+					     E Tn, Tj, T8d, T8h, T5i, T5m;
+					     T1 = ri[0];
+					     T8d = T1R * TL;
+					     T8h = T1R * TP;
+					     T3w = FMA(Th, TP, T3v);
+					     T3A = FNMS(Th, TL, T3z);
+					     Tn = FMA(Th, Te, Tm);
+					     T4H = FNMS(Th, Te, Tm);
+					     T4E = FMA(Th, Ti, Tf);
+					     Tj = FNMS(Th, Ti, Tf);
+					     T8e = FMA(T1V, TP, T8d);
+					     T8i = FNMS(T1V, TL, T8h);
+					     Tkb = ii[0];
+					     T5i = T4E * TL;
+					     T5m = T4E * TP;
+					     {
+						  E Tk, To, Tl, Tk9;
+						  Tk = ri[WS(rs, 32)];
+						  To = ii[WS(rs, 32)];
+						  T5j = FMA(T4H, TP, T5i);
+						  T5n = FNMS(T4H, TL, T5m);
+						  Tl = Tj * Tk;
+						  Tk9 = Tj * To;
+						  {
+						       E Tz, TD, TA, T8r;
+						       Tz = ri[WS(rs, 16)];
+						       TD = ii[WS(rs, 16)];
+						       Tp = FMA(Tn, To, Tl);
+						       Tka = FNMS(Tn, Tk, Tk9);
+						       TA = Ty * Tz;
+						       T8r = Ty * TD;
+						       TR = ri[WS(rs, 48)];
+						       TV = ii[WS(rs, 48)];
+						       TE = FMA(TC, TD, TA);
+						       T8s = FNMS(TC, Tz, T8r);
+						       TS = TQ * TR;
+						       T8t = TQ * TV;
+						  }
+					     }
+					}
+					{
+					     E T8q, Tq, Tl7, Tkc, TW, T8u;
+					     T8q = T1 - Tp;
+					     Tq = T1 + Tp;
+					     Tl7 = Tkb - Tka;
+					     Tkc = Tka + Tkb;
+					     TW = FMA(TU, TV, TS);
+					     T8u = FNMS(TU, TR, T8t);
+					     {
+						  E TX, Tl8, T8v, Tk8;
+						  TX = TE + TW;
+						  Tl8 = TE - TW;
+						  T8v = T8s - T8u;
+						  Tk8 = T8s + T8u;
+						  Tl9 = Tl7 - Tl8;
+						  TlD = Tl8 + Tl7;
+						  TY = Tq + TX;
+						  Tg4 = Tq - TX;
+						  T8w = T8q - T8v;
+						  TdS = T8q + T8v;
+						  TkE = Tkc - Tk8;
+						  Tkd = Tk8 + Tkc;
+					     }
+					}
+				   }
+				   {
+					E T2f, T93, T2E, T9d, T2n, T95, T2s, T9b;
+					{
+					     E T2a, T2e, T2i, T2m;
+					     T2a = ri[WS(rs, 60)];
+					     T2e = ii[WS(rs, 60)];
+					     {
+						  E T2z, T2D, T2b, T92, T2A, T9c;
+						  T2z = ri[WS(rs, 44)];
+						  T2D = ii[WS(rs, 44)];
+						  T2b = T29 * T2a;
+						  T92 = T29 * T2e;
+						  T2A = T2y * T2z;
+						  T9c = T2y * T2D;
+						  T2f = FMA(T2d, T2e, T2b);
+						  T93 = FNMS(T2d, T2a, T92);
+						  T2E = FMA(T2C, T2D, T2A);
+						  T9d = FNMS(T2C, T2z, T9c);
+					     }
+					     T2i = ri[WS(rs, 28)];
+					     T2m = ii[WS(rs, 28)];
+					     {
+						  E T2p, T2r, T2j, T94, T2q, T9a;
+						  T2p = ri[WS(rs, 12)];
+						  T2r = ii[WS(rs, 12)];
+						  T2j = T2h * T2i;
+						  T94 = T2h * T2m;
+						  T2q = TG * T2p;
+						  T9a = TG * T2r;
+						  T2n = FMA(T2l, T2m, T2j);
+						  T95 = FNMS(T2l, T2i, T94);
+						  T2s = FMA(TJ, T2r, T2q);
+						  T9b = FNMS(TJ, T2p, T9a);
+					     }
+					}
+					{
+					     E T2o, Tgf, T96, T97, T2F, Tgg;
+					     T99 = T2f - T2n;
+					     T2o = T2f + T2n;
+					     Tgf = T93 + T95;
+					     T96 = T93 - T95;
+					     T97 = T2s - T2E;
+					     T2F = T2s + T2E;
+					     Tgg = T9b + T9d;
+					     T9e = T9b - T9d;
+					     T2G = T2o + T2F;
+					     Tge = T2o - T2F;
+					     Tgh = Tgf - Tgg;
+					     TiK = Tgf + Tgg;
+					     Te1 = T96 - T97;
+					     T98 = T96 + T97;
+					}
+				   }
+				   {
+					E T9K, T2T, T9G, T9n, Tgl, T9o, T38, T9I;
+					{
+					     E T2M, T9k, T37, T2V, T2S, T2W, T2Y, T9m, T32, T33, T36, T2Z, T9H;
+					     {
+						  E T2J, T2L, T2K, T9j;
+						  T2J = ri[WS(rs, 2)];
+						  T2L = ii[WS(rs, 2)];
+						  T32 = ri[WS(rs, 50)];
+						  Te0 = T99 + T9e;
+						  T9f = T99 - T9e;
+						  T2K = Tr * T2J;
+						  T9j = Tr * T2L;
+						  T33 = T31 * T32;
+						  T36 = ii[WS(rs, 50)];
+						  T2M = FMA(Tt, T2L, T2K);
+						  T9k = FNMS(Tt, T2J, T9j);
+					     }
+					     {
+						  E T2O, T9J, T2R, T2P, T9l;
+						  T2O = ri[WS(rs, 34)];
+						  T37 = FMA(T35, T36, T33);
+						  T9J = T31 * T36;
+						  T2R = ii[WS(rs, 34)];
+						  T2P = T2N * T2O;
+						  T2V = ri[WS(rs, 18)];
+						  T9K = FNMS(T35, T32, T9J);
+						  T9l = T2N * T2R;
+						  T2S = FMA(T2Q, T2R, T2P);
+						  T2W = T2U * T2V;
+						  T2Y = ii[WS(rs, 18)];
+						  T9m = FNMS(T2Q, T2O, T9l);
+					     }
+					     T2T = T2M + T2S;
+					     T9G = T2M - T2S;
+					     T2Z = FMA(T2X, T2Y, T2W);
+					     T9H = T2U * T2Y;
+					     T9n = T9k - T9m;
+					     Tgl = T9k + T9m;
+					     T9o = T2Z - T37;
+					     T38 = T2Z + T37;
+					     T9I = FNMS(T2X, T2V, T9H);
+					}
+					{
+					     E T6H, TbD, T6P, T6R, T6T, TbF, T6S, Tcw;
+					     {
+						  E T6X, T71, T6E, TbC, T6K, TbE;
+						  {
+						       E T6F, T6G, T9L, Tgm;
+						       T6E = ri[WS(rs, 63)];
+						       Te5 = T9n - T9o;
+						       T9p = T9n + T9o;
+						       Tgq = T2T - T38;
+						       T39 = T2T + T38;
+						       T9L = T9I - T9K;
+						       Tgm = T9I + T9K;
+						       T6F = TL * T6E;
+						       T6G = ii[WS(rs, 63)];
+						       Te8 = T9G + T9L;
+						       T9M = T9G - T9L;
+						       TiN = Tgl + Tgm;
+						       Tgn = Tgl - Tgm;
+						       TbC = TL * T6G;
+						       T6H = FMA(TP, T6G, T6F);
+						  }
+						  T6X = ri[WS(rs, 47)];
+						  T71 = ii[WS(rs, 47)];
+						  TbD = FNMS(TP, T6E, TbC);
+						  {
+						       E T6O, T6L, T6Y, Tcy;
+						       T6K = ri[WS(rs, 31)];
+						       T6Y = T6W * T6X;
+						       Tcy = T6W * T71;
+						       T6O = ii[WS(rs, 31)];
+						       T6L = T6J * T6K;
+						       T72 = FMA(T70, T71, T6Y);
+						       Tcz = FNMS(T70, T6X, Tcy);
+						       TbE = T6J * T6O;
+						       T6P = FMA(T6N, T6O, T6L);
+						  }
+						  T6R = ri[WS(rs, 15)];
+						  T6T = ii[WS(rs, 15)];
+						  TbF = FNMS(T6N, T6K, TbE);
+					     }
+					     Tcv = T6H - T6P;
+					     T6Q = T6H + T6P;
+					     T6S = TK * T6R;
+					     Tcw = TK * T6T;
+					     Tha = TbD + TbF;
+					     TbG = TbD - TbF;
+					     T6U = FMA(TO, T6T, T6S);
+					     Tcx = FNMS(TO, T6R, Tcw);
+					}
+				   }
+				   {
+					E T1J, T1G, T1K, T8O, T25, T8Y, T1N, T1S, T1W;
+					{
+					     E T1b, T16, T1c, T8y, T1z, T8I, T1f, T1m, T1q;
+					     {
+						  E T11, T12, T15, T1u, T1y, T8x, T1v, T8H;
+						  T11 = ri[WS(rs, 8)];
+						  {
+						       E TbH, T73, TcA, Thb;
+						       TbH = T6U - T72;
+						       T73 = T6U + T72;
+						       TcA = Tcx - Tcz;
+						       Thb = Tcx + Tcz;
+						       TeE = TbG - TbH;
+						       TbI = TbG + TbH;
+						       Thr = T6Q - T73;
+						       T74 = T6Q + T73;
+						       TeP = Tcv + TcA;
+						       TcB = Tcv - TcA;
+						       Tja = Tha + Thb;
+						       Thc = Tha - Thb;
+						       T12 = T10 * T11;
+						  }
+						  T15 = ii[WS(rs, 8)];
+						  T1u = ri[WS(rs, 24)];
+						  T1y = ii[WS(rs, 24)];
+						  T1b = ri[WS(rs, 40)];
+						  T16 = FMA(T14, T15, T12);
+						  T8x = T10 * T15;
+						  T1v = T1t * T1u;
+						  T8H = T1t * T1y;
+						  T1c = T1a * T1b;
+						  T8y = FNMS(T14, T11, T8x);
+						  T1z = FMA(T1x, T1y, T1v);
+						  T8I = FNMS(T1x, T1u, T8H);
+						  T1f = ii[WS(rs, 40)];
+						  T1m = ri[WS(rs, 56)];
+						  T1q = ii[WS(rs, 56)];
+					     }
+					     {
+						  E T1D, T1E, T1F, T20, T24, T8N, T21, T8X;
+						  {
+						       E T1h, T8C, T8A, T1r, T8G, Tg5, T8B;
+						       T1D = ri[WS(rs, 4)];
+						       {
+							    E T1g, T8z, T1n, T8F;
+							    T1g = FMA(T1e, T1f, T1c);
+							    T8z = T1a * T1f;
+							    T1n = T1l * T1m;
+							    T8F = T1l * T1q;
+							    T1h = T16 + T1g;
+							    T8C = T16 - T1g;
+							    T8A = FNMS(T1e, T1b, T8z);
+							    T1r = FMA(T1p, T1q, T1n);
+							    T8G = FNMS(T1p, T1m, T8F);
+							    T1E = T7 * T1D;
+						       }
+						       Tg5 = T8y + T8A;
+						       T8B = T8y - T8A;
+						       {
+							    E T1A, T8E, Tg6, T8J;
+							    T1A = T1r + T1z;
+							    T8E = T1r - T1z;
+							    Tg6 = T8G + T8I;
+							    T8J = T8G - T8I;
+							    T8D = T8B - T8C;
+							    TdT = T8C + T8B;
+							    T1B = T1h + T1A;
+							    TkD = T1A - T1h;
+							    T8K = T8E + T8J;
+							    TdU = T8E - T8J;
+							    Tg7 = Tg5 - Tg6;
+							    Tk7 = Tg5 + Tg6;
+							    T1F = ii[WS(rs, 4)];
+						       }
+						  }
+						  T20 = ri[WS(rs, 52)];
+						  T24 = ii[WS(rs, 52)];
+						  T1J = ri[WS(rs, 36)];
+						  T1G = FMA(Tb, T1F, T1E);
+						  T8N = T7 * T1F;
+						  T21 = T1Z * T20;
+						  T8X = T1Z * T24;
+						  T1K = T1I * T1J;
+						  T8O = FNMS(Tb, T1D, T8N);
+						  T25 = FMA(T23, T24, T21);
+						  T8Y = FNMS(T23, T20, T8X);
+						  T1N = ii[WS(rs, 36)];
+						  T1S = ri[WS(rs, 20)];
+						  T1W = ii[WS(rs, 20)];
+					     }
+					}
+					{
+					     E T3V, T3T, T3W, T9T, T4i, Taj, T3Y, T42, T45;
+					     {
+						  E T3O, T3P, T3S, T4d, T4h, T9S, T4e, Tai;
+						  {
+						       E T1P, T8U, T8Q, T1X, T8W, Tga, T8R;
+						       T3O = ri[WS(rs, 62)];
+						       {
+							    E T1O, T8P, T1T, T8V;
+							    T1O = FMA(T1M, T1N, T1K);
+							    T8P = T1I * T1N;
+							    T1T = T1R * T1S;
+							    T8V = T1R * T1W;
+							    T1P = T1G + T1O;
+							    T8U = T1G - T1O;
+							    T8Q = FNMS(T1M, T1J, T8P);
+							    T1X = FMA(T1V, T1W, T1T);
+							    T8W = FNMS(T1V, T1S, T8V);
+							    T3P = T3N * T3O;
+						       }
+						       Tga = T8O + T8Q;
+						       T8R = T8O - T8Q;
+						       {
+							    E T26, T8S, Tgb, T8Z;
+							    T26 = T1X + T25;
+							    T8S = T1X - T25;
+							    Tgb = T8W + T8Y;
+							    T8Z = T8W - T8Y;
+							    T8T = T8R + T8S;
+							    TdY = T8R - T8S;
+							    T27 = T1P + T26;
+							    Tg9 = T1P - T26;
+							    T90 = T8U - T8Z;
+							    TdX = T8U + T8Z;
+							    Tgc = Tga - Tgb;
+							    TiJ = Tga + Tgb;
+							    T3S = ii[WS(rs, 62)];
+						       }
+						  }
+						  T4d = ri[WS(rs, 46)];
+						  T4h = ii[WS(rs, 46)];
+						  T3V = ri[WS(rs, 30)];
+						  T3T = FMA(T3R, T3S, T3P);
+						  T9S = T3N * T3S;
+						  T4e = T4c * T4d;
+						  Tai = T4c * T4h;
+						  T3W = T3U * T3V;
+						  T9T = FNMS(T3R, T3O, T9S);
+						  T4i = FMA(T4g, T4h, T4e);
+						  Taj = FNMS(T4g, T4d, Tai);
+						  T3Y = ii[WS(rs, 30)];
+						  T42 = ri[WS(rs, 14)];
+						  T45 = ii[WS(rs, 14)];
+					     }
+					     {
+						  E T4P, T4Q, T4R, T56, T5a, Tat, T57, Tbp;
+						  {
+						       E T40, Taf, T9V, T46, Tah, Tgw, T9W;
+						       T4P = ri[WS(rs, 1)];
+						       {
+							    E T3Z, T9U, T43, Tag;
+							    T3Z = FMA(T3X, T3Y, T3W);
+							    T9U = T3U * T3Y;
+							    T43 = T41 * T42;
+							    Tag = T41 * T45;
+							    T40 = T3T + T3Z;
+							    Taf = T3T - T3Z;
+							    T9V = FNMS(T3X, T3V, T9U);
+							    T46 = FMA(T44, T45, T43);
+							    Tah = FNMS(T44, T42, Tag);
+							    T4Q = T2 * T4P;
+						       }
+						       Tgw = T9T + T9V;
+						       T9W = T9T - T9V;
+						       {
+							    E T4j, T9X, Tgx, Tak;
+							    T4j = T46 + T4i;
+							    T9X = T46 - T4i;
+							    Tgx = Tah + Taj;
+							    Tak = Tah - Taj;
+							    T9Y = T9W + T9X;
+							    Tec = T9W - T9X;
+							    T4k = T40 + T4j;
+							    TgB = T40 - T4j;
+							    Tal = Taf - Tak;
+							    Tef = Taf + Tak;
+							    Tgy = Tgw - Tgx;
+							    TiT = Tgw + Tgx;
+							    T4R = ii[WS(rs, 1)];
+						       }
+						  }
+						  T56 = ri[WS(rs, 49)];
+						  T5a = ii[WS(rs, 49)];
+						  T4U = ri[WS(rs, 33)];
+						  T4S = FMA(T5, T4R, T4Q);
+						  Tat = T2 * T4R;
+						  T57 = T55 * T56;
+						  Tbp = T55 * T5a;
+						  T4V = T4T * T4U;
+						  Tau = FNMS(T5, T4P, Tat);
+						  T5b = FMA(T59, T5a, T57);
+						  Tbq = FNMS(T59, T56, Tbp);
+						  T4X = ii[WS(rs, 33)];
+						  T50 = ri[WS(rs, 17)];
+						  T52 = ii[WS(rs, 17)];
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T7a, T78, T7b, TbL, T7t, TbU, T7d, T7i, T7m;
+				   {
+					E T4q, T4o, T4r, Ta1, T4J, Taa, T4t, T4y, T4C;
+					{
+					     E T3o, T3f, T3p, T9s, T3I, T9B, T3s, T3x, T3B;
+					     {
+						  E T3b, T3c, T3e, T3E, T3H, T9r, T3F, T9A;
+						  {
+						       E T4Z, Tbm, Taw, T53, Tbo, TgJ, Tax;
+						       T3b = ri[WS(rs, 10)];
+						       {
+							    E T4Y, Tav, T51, Tbn;
+							    T4Y = FMA(T4W, T4X, T4V);
+							    Tav = T4T * T4X;
+							    T51 = T48 * T50;
+							    Tbn = T48 * T52;
+							    T4Z = T4S + T4Y;
+							    Tbm = T4S - T4Y;
+							    Taw = FNMS(T4W, T4U, Tav);
+							    T53 = FMA(T4b, T52, T51);
+							    Tbo = FNMS(T4b, T50, Tbn);
+							    T3c = T3a * T3b;
+						       }
+						       TgJ = Tau + Taw;
+						       Tax = Tau - Taw;
+						       {
+							    E T5c, Tay, TgK, Tbr;
+							    T5c = T53 + T5b;
+							    Tay = T53 - T5b;
+							    TgK = Tbo + Tbq;
+							    Tbr = Tbo - Tbq;
+							    Taz = Tax + Tay;
+							    Tel = Tax - Tay;
+							    T5d = T4Z + T5c;
+							    Th0 = T4Z - T5c;
+							    Tbs = Tbm - Tbr;
+							    Tew = Tbm + Tbr;
+							    TgL = TgJ - TgK;
+							    TiZ = TgJ + TgK;
+							    T3e = ii[WS(rs, 10)];
+						       }
+						  }
+						  T3E = ri[WS(rs, 26)];
+						  T3H = ii[WS(rs, 26)];
+						  T3o = ri[WS(rs, 42)];
+						  T3f = FMA(T3d, T3e, T3c);
+						  T9r = T3a * T3e;
+						  T3F = T3D * T3E;
+						  T9A = T3D * T3H;
+						  T3p = T3n * T3o;
+						  T9s = FNMS(T3d, T3b, T9r);
+						  T3I = FMA(T3G, T3H, T3F);
+						  T9B = FNMS(T3G, T3E, T9A);
+						  T3s = ii[WS(rs, 42)];
+						  T3x = ri[WS(rs, 58)];
+						  T3B = ii[WS(rs, 58)];
+					     }
+					     {
+						  E T4l, T4m, T4n, T4F, T4I, Ta0, T4G, Ta9;
+						  {
+						       E T3u, T9q, T9u, T3C, T9z, Tgr, T9v;
+						       T4l = ri[WS(rs, 6)];
+						       {
+							    E T3t, T9t, T3y, T9y;
+							    T3t = FMA(T3r, T3s, T3p);
+							    T9t = T3n * T3s;
+							    T3y = T3w * T3x;
+							    T9y = T3w * T3B;
+							    T3u = T3f + T3t;
+							    T9q = T3f - T3t;
+							    T9u = FNMS(T3r, T3o, T9t);
+							    T3C = FMA(T3A, T3B, T3y);
+							    T9z = FNMS(T3A, T3x, T9y);
+							    T4m = T3g * T4l;
+						       }
+						       Tgr = T9s + T9u;
+						       T9v = T9s - T9u;
+						       {
+							    E T3J, T9x, Tgs, T9C;
+							    T3J = T3C + T3I;
+							    T9x = T3C - T3I;
+							    Tgs = T9z + T9B;
+							    T9C = T9z - T9B;
+							    {
+								 E T9w, T9O, T9D, T9N;
+								 T9w = T9q + T9v;
+								 T9O = T9v - T9q;
+								 T3K = T3u + T3J;
+								 Tgo = T3J - T3u;
+								 T9D = T9x - T9C;
+								 T9N = T9x + T9C;
+								 Tgt = Tgr - Tgs;
+								 TiO = Tgr + Tgs;
+								 T9P = T9N - T9O;
+								 Te6 = T9O + T9N;
+								 T9E = T9w - T9D;
+								 Te9 = T9w + T9D;
+								 T4n = ii[WS(rs, 6)];
+							    }
+						       }
+						  }
+						  T4F = ri[WS(rs, 22)];
+						  T4I = ii[WS(rs, 22)];
+						  T4q = ri[WS(rs, 38)];
+						  T4o = FMA(T3i, T4n, T4m);
+						  Ta0 = T3g * T4n;
+						  T4G = T4E * T4F;
+						  Ta9 = T4E * T4I;
+						  T4r = T4p * T4q;
+						  Ta1 = FNMS(T3i, T4l, Ta0);
+						  T4J = FMA(T4H, T4I, T4G);
+						  Taa = FNMS(T4H, T4F, Ta9);
+						  T4t = ii[WS(rs, 38)];
+						  T4y = ri[WS(rs, 54)];
+						  T4C = ii[WS(rs, 54)];
+					     }
+					}
+					{
+					     E T5k, T5h, T5l, TaC, T5G, TaL, T5o, T5t, T5x;
+					     {
+						  E T5e, T5f, T5g, T5B, T5F, TaB, T5C, TaK;
+						  {
+						       E T4v, T9Z, Ta3, T4D, Ta8, TgC, Ta4;
+						       T5e = ri[WS(rs, 9)];
+						       {
+							    E T4u, Ta2, T4z, Ta7;
+							    T4u = FMA(T4s, T4t, T4r);
+							    Ta2 = T4p * T4t;
+							    T4z = T4x * T4y;
+							    Ta7 = T4x * T4C;
+							    T4v = T4o + T4u;
+							    T9Z = T4o - T4u;
+							    Ta3 = FNMS(T4s, T4q, Ta2);
+							    T4D = FMA(T4B, T4C, T4z);
+							    Ta8 = FNMS(T4B, T4y, Ta7);
+							    T5f = T8 * T5e;
+						       }
+						       TgC = Ta1 + Ta3;
+						       Ta4 = Ta1 - Ta3;
+						       {
+							    E T4K, Ta6, TgD, Tab;
+							    T4K = T4D + T4J;
+							    Ta6 = T4D - T4J;
+							    TgD = Ta8 + Taa;
+							    Tab = Ta8 - Taa;
+							    {
+								 E Ta5, Tan, Tac, Tam;
+								 Ta5 = T9Z + Ta4;
+								 Tan = Ta4 - T9Z;
+								 T4L = T4v + T4K;
+								 Tgz = T4K - T4v;
+								 Tac = Ta6 - Tab;
+								 Tam = Ta6 + Tab;
+								 TgE = TgC - TgD;
+								 TiU = TgC + TgD;
+								 Tao = Tam - Tan;
+								 Ted = Tan + Tam;
+								 Tad = Ta5 - Tac;
+								 Teg = Ta5 + Tac;
+								 T5g = ii[WS(rs, 9)];
+							    }
+						       }
+						  }
+						  T5B = ri[WS(rs, 25)];
+						  T5F = ii[WS(rs, 25)];
+						  T5k = ri[WS(rs, 41)];
+						  T5h = FMA(Tc, T5g, T5f);
+						  TaB = T8 * T5g;
+						  T5C = T5A * T5B;
+						  TaK = T5A * T5F;
+						  T5l = T5j * T5k;
+						  TaC = FNMS(Tc, T5e, TaB);
+						  T5G = FMA(T5E, T5F, T5C);
+						  TaL = FNMS(T5E, T5B, TaK);
+						  T5o = ii[WS(rs, 41)];
+						  T5t = ri[WS(rs, 57)];
+						  T5x = ii[WS(rs, 57)];
+					     }
+					     {
+						  E T75, T76, T77, T7p, T7s, TbK, T7q, TbT;
+						  {
+						       E T5q, TaA, TaE, T5y, TaJ, Th1, TaF;
+						       T75 = ri[WS(rs, 7)];
+						       {
+							    E T5p, TaD, T5u, TaI;
+							    T5p = FMA(T5n, T5o, T5l);
+							    TaD = T5j * T5o;
+							    T5u = T5s * T5t;
+							    TaI = T5s * T5x;
+							    T5q = T5h + T5p;
+							    TaA = T5h - T5p;
+							    TaE = FNMS(T5n, T5k, TaD);
+							    T5y = FMA(T5w, T5x, T5u);
+							    TaJ = FNMS(T5w, T5t, TaI);
+							    T76 = T1i * T75;
+						       }
+						       Th1 = TaC + TaE;
+						       TaF = TaC - TaE;
+						       {
+							    E T5H, TaH, Th2, TaM;
+							    T5H = T5y + T5G;
+							    TaH = T5y - T5G;
+							    Th2 = TaJ + TaL;
+							    TaM = TaJ - TaL;
+							    {
+								 E TaG, Tbu, TaN, Tbt;
+								 TaG = TaA + TaF;
+								 Tbu = TaF - TaA;
+								 T5I = T5q + T5H;
+								 TgM = T5H - T5q;
+								 TaN = TaH - TaM;
+								 Tbt = TaH + TaM;
+								 Th3 = Th1 - Th2;
+								 Tj0 = Th1 + Th2;
+								 Tbv = Tbt - Tbu;
+								 Tem = Tbu + Tbt;
+								 TaO = TaG - TaN;
+								 Tex = TaG + TaN;
+								 T77 = ii[WS(rs, 7)];
+							    }
+						       }
+						  }
+						  T7p = ri[WS(rs, 23)];
+						  T7s = ii[WS(rs, 23)];
+						  T7a = ri[WS(rs, 39)];
+						  T78 = FMA(T1k, T77, T76);
+						  TbK = T1i * T77;
+						  T7q = T7o * T7p;
+						  TbT = T7o * T7s;
+						  T7b = T79 * T7a;
+						  TbL = FNMS(T1k, T75, TbK);
+						  T7t = FMA(T7r, T7s, T7q);
+						  TbU = FNMS(T7r, T7p, TbT);
+						  T7d = ii[WS(rs, 39)];
+						  T7i = ri[WS(rs, 55)];
+						  T7m = ii[WS(rs, 55)];
+					     }
+					}
+				   }
+				   {
+					E T6i, T6g, T6j, TaY, T6z, TaU, T6l, T6o, T6q;
+					{
+					     E T5P, T5N, T5Q, Tbd, T66, Tb9, T5S, T5V, T5X;
+					     {
+						  E T5K, T5L, T5M, T61, T65, Tbc, T62, Tb8;
+						  {
+						       E T7f, TbJ, TbN, T7n, TbS, Ths, TbO;
+						       T5K = ri[WS(rs, 5)];
+						       {
+							    E T7e, TbM, T7j, TbR;
+							    T7e = FMA(T7c, T7d, T7b);
+							    TbM = T79 * T7d;
+							    T7j = T7h * T7i;
+							    TbR = T7h * T7m;
+							    T7f = T78 + T7e;
+							    TbJ = T78 - T7e;
+							    TbN = FNMS(T7c, T7a, TbM);
+							    T7n = FMA(T7l, T7m, T7j);
+							    TbS = FNMS(T7l, T7i, TbR);
+							    T5L = Td * T5K;
+						       }
+						       Ths = TbL + TbN;
+						       TbO = TbL - TbN;
+						       {
+							    E T7u, TbQ, Tht, TbV;
+							    T7u = T7n + T7t;
+							    TbQ = T7n - T7t;
+							    Tht = TbS + TbU;
+							    TbV = TbS - TbU;
+							    {
+								 E TbP, TcD, TbW, TcC;
+								 TbP = TbJ + TbO;
+								 TcD = TbO - TbJ;
+								 T7v = T7f + T7u;
+								 Thd = T7u - T7f;
+								 TbW = TbQ - TbV;
+								 TcC = TbQ + TbV;
+								 Thu = Ths - Tht;
+								 Tjb = Ths + Tht;
+								 TcE = TcC - TcD;
+								 TeF = TcD + TcC;
+								 TbX = TbP - TbW;
+								 TeQ = TbP + TbW;
+								 T5M = ii[WS(rs, 5)];
+							    }
+						       }
+						  }
+						  T61 = ri[WS(rs, 53)];
+						  T65 = ii[WS(rs, 53)];
+						  T5P = ri[WS(rs, 37)];
+						  T5N = FMA(Th, T5M, T5L);
+						  Tbc = Td * T5M;
+						  T62 = T60 * T61;
+						  Tb8 = T60 * T65;
+						  T5Q = T5O * T5P;
+						  Tbd = FNMS(Th, T5K, Tbc);
+						  T66 = FMA(T64, T65, T62);
+						  Tb9 = FNMS(T64, T61, Tb8);
+						  T5S = ii[WS(rs, 37)];
+						  T5V = ri[WS(rs, 21)];
+						  T5X = ii[WS(rs, 21)];
+					     }
+					     {
+						  E T6b, T6c, T6f, T6u, T6y, TaX, T6v, TaT;
+						  {
+						       E T5U, Tb5, Tbf, T5Y, Tb7;
+						       T6b = ri[WS(rs, 61)];
+						       {
+							    E T5T, Tbe, T5W, Tb6;
+							    T5T = FMA(T5R, T5S, T5Q);
+							    Tbe = T5O * T5S;
+							    T5W = T3j * T5V;
+							    Tb6 = T3j * T5X;
+							    T5U = T5N + T5T;
+							    Tb5 = T5N - T5T;
+							    Tbf = FNMS(T5R, T5P, Tbe);
+							    T5Y = FMA(T3m, T5X, T5W);
+							    Tb7 = FNMS(T3m, T5V, Tb6);
+							    T6c = T6a * T6b;
+						       }
+						       {
+							    E TgO, Tbg, T67, Tbh;
+							    TgO = Tbd + Tbf;
+							    Tbg = Tbd - Tbf;
+							    T67 = T5Y + T66;
+							    Tbh = T5Y - T66;
+							    {
+								 E TgP, Tba, Tbi, Teo;
+								 TgP = Tb7 + Tb9;
+								 Tba = Tb7 - Tb9;
+								 Tbi = Tbg + Tbh;
+								 Teo = Tbg - Tbh;
+								 {
+								      E TgR, Tbb, Tep, TgQ;
+								      TgR = T5U - T67;
+								      T68 = T5U + T67;
+								      Tbb = Tb5 - Tba;
+								      Tep = Tb5 + Tba;
+								      TgQ = TgO - TgP;
+								      Tj5 = TgO + TgP;
+								      Tez = FMA(KP414213562, Teo, Tep);
+								      Teq = FNMS(KP414213562, Tep, Teo);
+								      Tbj = FNMS(KP414213562, Tbi, Tbb);
+								      Tbx = FMA(KP414213562, Tbb, Tbi);
+								      TgS = TgQ - TgR;
+								      Th5 = TgR + TgQ;
+								      T6f = ii[WS(rs, 61)];
+								 }
+							    }
+						       }
+						  }
+						  T6u = ri[WS(rs, 45)];
+						  T6y = ii[WS(rs, 45)];
+						  T6i = ri[WS(rs, 29)];
+						  T6g = FMA(T6e, T6f, T6c);
+						  TaX = T6a * T6f;
+						  T6v = T6t * T6u;
+						  TaT = T6t * T6y;
+						  T6j = T6h * T6i;
+						  TaY = FNMS(T6e, T6b, TaX);
+						  T6z = FMA(T6x, T6y, T6v);
+						  TaU = FNMS(T6x, T6u, TaT);
+						  T6l = ii[WS(rs, 29)];
+						  T6o = ri[WS(rs, 13)];
+						  T6q = ii[WS(rs, 13)];
+					     }
+					}
+					{
+					     E T7C, T7A, T7D, Tcm, T7T, Tci, T7F, T7I, T7K;
+					     {
+						  E T7x, T7y, T7z, T7O, T7S, Tcl, T7P, Tch;
+						  {
+						       E T6n, TaQ, Tb0, T6r, TaS;
+						       T7x = ri[WS(rs, 3)];
+						       {
+							    E T6m, TaZ, T6p, TaR;
+							    T6m = FMA(T6k, T6l, T6j);
+							    TaZ = T6h * T6l;
+							    T6p = T17 * T6o;
+							    TaR = T17 * T6q;
+							    T6n = T6g + T6m;
+							    TaQ = T6g - T6m;
+							    Tb0 = FNMS(T6k, T6i, TaZ);
+							    T6r = FMA(T19, T6q, T6p);
+							    TaS = FNMS(T19, T6o, TaR);
+							    T7y = T3 * T7x;
+						       }
+						       {
+							    E TgU, Tb1, T6A, Tb2;
+							    TgU = TaY + Tb0;
+							    Tb1 = TaY - Tb0;
+							    T6A = T6r + T6z;
+							    Tb2 = T6r - T6z;
+							    {
+								 E TgV, TaV, Tb3, Ter;
+								 TgV = TaS + TaU;
+								 TaV = TaS - TaU;
+								 Tb3 = Tb1 + Tb2;
+								 Ter = Tb1 - Tb2;
+								 {
+								      E TgT, TaW, Tes, TgW;
+								      TgT = T6n - T6A;
+								      T6B = T6n + T6A;
+								      TaW = TaQ - TaV;
+								      Tes = TaQ + TaV;
+								      TgW = TgU - TgV;
+								      Tj6 = TgU + TgV;
+								      TeA = FNMS(KP414213562, Ter, Tes);
+								      Tet = FMA(KP414213562, Tes, Ter);
+								      Tb4 = FMA(KP414213562, Tb3, TaW);
+								      Tby = FNMS(KP414213562, TaW, Tb3);
+								      TgX = TgT + TgW;
+								      Th6 = TgT - TgW;
+								      T7z = ii[WS(rs, 3)];
+								 }
+							    }
+						       }
+						  }
+						  T7O = ri[WS(rs, 51)];
+						  T7S = ii[WS(rs, 51)];
+						  T7C = ri[WS(rs, 35)];
+						  T7A = FMA(T6, T7z, T7y);
+						  Tcl = T3 * T7z;
+						  T7P = T7N * T7O;
+						  Tch = T7N * T7S;
+						  T7D = T7B * T7C;
+						  Tcm = FNMS(T6, T7x, Tcl);
+						  T7T = FMA(T7R, T7S, T7P);
+						  Tci = FNMS(T7R, T7O, Tch);
+						  T7F = ii[WS(rs, 35)];
+						  T7I = ri[WS(rs, 19)];
+						  T7K = ii[WS(rs, 19)];
+					     }
+					     {
+						  E T7Y, T7Z, T82, T8f, T8j, Tc6, T8g, Tc2;
+						  {
+						       E T7H, Tce, Tco, T7L, Tcg;
+						       T7Y = ri[WS(rs, 59)];
+						       {
+							    E T7G, Tcn, T7J, Tcf;
+							    T7G = FMA(T7E, T7F, T7D);
+							    Tcn = T7B * T7F;
+							    T7J = T2u * T7I;
+							    Tcf = T2u * T7K;
+							    T7H = T7A + T7G;
+							    Tce = T7A - T7G;
+							    Tco = FNMS(T7E, T7C, Tcn);
+							    T7L = FMA(T2x, T7K, T7J);
+							    Tcg = FNMS(T2x, T7I, Tcf);
+							    T7Z = T7X * T7Y;
+						       }
+						       {
+							    E Thf, Tcp, T7U, Tcq;
+							    Thf = Tcm + Tco;
+							    Tcp = Tcm - Tco;
+							    T7U = T7L + T7T;
+							    Tcq = T7L - T7T;
+							    {
+								 E Thg, Tcj, Tcr, TeH;
+								 Thg = Tcg + Tci;
+								 Tcj = Tcg - Tci;
+								 Tcr = Tcp + Tcq;
+								 TeH = Tcp - Tcq;
+								 {
+								      E Thi, Tck, TeI, Thh;
+								      Thi = T7H - T7U;
+								      T7V = T7H + T7U;
+								      Tck = Tce - Tcj;
+								      TeI = Tce + Tcj;
+								      Thh = Thf - Thg;
+								      Tjg = Thf + Thg;
+								      TeS = FMA(KP414213562, TeH, TeI);
+								      TeJ = FNMS(KP414213562, TeI, TeH);
+								      Tcs = FNMS(KP414213562, Tcr, Tck);
+								      TcG = FMA(KP414213562, Tck, Tcr);
+								      Thj = Thh - Thi;
+								      Thw = Thi + Thh;
+								      T82 = ii[WS(rs, 59)];
+								 }
+							    }
+						       }
+						  }
+						  T8f = ri[WS(rs, 43)];
+						  T8j = ii[WS(rs, 43)];
+						  T84 = ri[WS(rs, 27)];
+						  T83 = FMA(T81, T82, T7Z);
+						  Tc6 = T7X * T82;
+						  T8g = T8e * T8f;
+						  Tc2 = T8e * T8j;
+						  T85 = Te * T84;
+						  Tc7 = FNMS(T81, T7Y, Tc6);
+						  T8k = FMA(T8i, T8j, T8g);
+						  Tc3 = FNMS(T8i, T8f, Tc2);
+						  T86 = ii[WS(rs, 27)];
+						  T89 = ri[WS(rs, 11)];
+						  T8b = ii[WS(rs, 11)];
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E TeT, TeM, Tcd, TcH, Tho, Thx, Tkw, Tkv, Tl6, Tl5;
+			      {
+				   E TiI, Tkp, TiQ, TiS, TiL, Tkq, TiP, TiV, Tjf, Tjd, Tjc, Tji, Tj4, Tj2, Tj1;
+				   E Tj7, Tkh, Tki;
+				   {
+					E TjG, T2I, Tkj, T4N, Tkk, Tkf, Tk5, TjJ, T8o, Tk2, TjL, T6D, TjY, TjU, Tk1;
+					E TjO;
+					{
+					     E T8m, Tjh, T3L, T4M, Tk6, Tke, TjH, TjI;
+					     {
+						  E T1C, T88, TbZ, Tc9, T8c, Tc1, T2H;
+						  T1C = TY + T1B;
+						  TiI = TY - T1B;
+						  {
+						       E T87, Tc8, T8a, Tc0;
+						       T87 = FMA(Ti, T86, T85);
+						       Tc8 = Te * T86;
+						       T8a = Tu * T89;
+						       Tc0 = Tu * T8b;
+						       T88 = T83 + T87;
+						       TbZ = T83 - T87;
+						       Tc9 = FNMS(Ti, T84, Tc8);
+						       T8c = FMA(Tx, T8b, T8a);
+						       Tc1 = FNMS(Tx, T89, Tc0);
+						       T2H = T27 + T2G;
+						       Tkp = T2G - T27;
+						  }
+						  {
+						       E Thl, Tca, T8l, Tcb;
+						       Thl = Tc7 + Tc9;
+						       Tca = Tc7 - Tc9;
+						       T8l = T8c + T8k;
+						       Tcb = T8c - T8k;
+						       {
+							    E Thm, Tc4, Tcc, TeK;
+							    Thm = Tc1 + Tc3;
+							    Tc4 = Tc1 - Tc3;
+							    Tcc = Tca + Tcb;
+							    TeK = Tca - Tcb;
+							    {
+								 E Thk, Tc5, TeL, Thn;
+								 Thk = T88 - T8l;
+								 T8m = T88 + T8l;
+								 Tc5 = TbZ - Tc4;
+								 TeL = TbZ + Tc4;
+								 Thn = Thl - Thm;
+								 Tjh = Thl + Thm;
+								 TeT = FNMS(KP414213562, TeK, TeL);
+								 TeM = FMA(KP414213562, TeL, TeK);
+								 Tcd = FMA(KP414213562, Tcc, Tc5);
+								 TcH = FNMS(KP414213562, Tc5, Tcc);
+								 Tho = Thk + Thn;
+								 Thx = Thk - Thn;
+								 TjG = T1C - T2H;
+								 T2I = T1C + T2H;
+							    }
+						       }
+						  }
+					     }
+					     TiQ = T39 - T3K;
+					     T3L = T39 + T3K;
+					     T4M = T4k + T4L;
+					     TiS = T4k - T4L;
+					     TiL = TiJ - TiK;
+					     Tk6 = TiJ + TiK;
+					     Tke = Tk7 + Tkd;
+					     Tkq = Tkd - Tk7;
+					     TiP = TiN - TiO;
+					     TjH = TiN + TiO;
+					     Tkj = T4M - T3L;
+					     T4N = T3L + T4M;
+					     Tkk = Tke - Tk6;
+					     Tkf = Tk6 + Tke;
+					     TjI = TiT + TiU;
+					     TiV = TiT - TiU;
+					     {
+						  E TjR, TjQ, TjS, T7w, T8n;
+						  Tjf = T74 - T7v;
+						  T7w = T74 + T7v;
+						  T8n = T7V + T8m;
+						  Tjd = T8m - T7V;
+						  Tjc = Tja - Tjb;
+						  TjR = Tja + Tjb;
+						  Tk5 = TjH + TjI;
+						  TjJ = TjH - TjI;
+						  TjQ = T7w - T8n;
+						  T8o = T7w + T8n;
+						  Tji = Tjg - Tjh;
+						  TjS = Tjg + Tjh;
+						  {
+						       E TjM, TjN, T5J, T6C, TjT;
+						       Tj4 = T5d - T5I;
+						       T5J = T5d + T5I;
+						       T6C = T68 + T6B;
+						       Tj2 = T6B - T68;
+						       TjT = TjR - TjS;
+						       Tk2 = TjR + TjS;
+						       Tj1 = TiZ - Tj0;
+						       TjM = TiZ + Tj0;
+						       TjL = T5J - T6C;
+						       T6D = T5J + T6C;
+						       Tj7 = Tj5 - Tj6;
+						       TjN = Tj5 + Tj6;
+						       TjY = TjQ + TjT;
+						       TjU = TjQ - TjT;
+						       Tk1 = TjM + TjN;
+						       TjO = TjM - TjN;
+						  }
+					     }
+					}
+					{
+					     E Tk0, Tk3, TjW, Tko, Tkn, Tkl, Tkm, TjZ;
+					     {
+						  E TjP, TjX, Tk4, Tkg, T4O, T8p, TjK, TjV;
+						  Tk0 = T2I - T4N;
+						  T4O = T2I + T4N;
+						  T8p = T6D + T8o;
+						  Tkh = T8o - T6D;
+						  TjP = TjL + TjO;
+						  TjX = TjO - TjL;
+						  Tk3 = Tk1 - Tk2;
+						  Tk4 = Tk1 + Tk2;
+						  ri[0] = T4O + T8p;
+						  ri[WS(rs, 32)] = T4O - T8p;
+						  Tkg = Tk5 + Tkf;
+						  Tki = Tkf - Tk5;
+						  TjW = TjG - TjJ;
+						  TjK = TjG + TjJ;
+						  TjV = TjP + TjU;
+						  Tko = TjU - TjP;
+						  Tkn = Tkk - Tkj;
+						  Tkl = Tkj + Tkk;
+						  ii[WS(rs, 32)] = Tkg - Tk4;
+						  ii[0] = Tk4 + Tkg;
+						  ri[WS(rs, 8)] = FMA(KP707106781, TjV, TjK);
+						  ri[WS(rs, 40)] = FNMS(KP707106781, TjV, TjK);
+						  Tkm = TjX + TjY;
+						  TjZ = TjX - TjY;
+					     }
+					     ii[WS(rs, 40)] = FNMS(KP707106781, Tkm, Tkl);
+					     ii[WS(rs, 8)] = FMA(KP707106781, Tkm, Tkl);
+					     ri[WS(rs, 24)] = FMA(KP707106781, TjZ, TjW);
+					     ri[WS(rs, 56)] = FNMS(KP707106781, TjZ, TjW);
+					     ii[WS(rs, 56)] = FNMS(KP707106781, Tko, Tkn);
+					     ii[WS(rs, 24)] = FMA(KP707106781, Tko, Tkn);
+					     ri[WS(rs, 16)] = Tk0 + Tk3;
+					     ri[WS(rs, 48)] = Tk0 - Tk3;
+					}
+				   }
+				   {
+					E Tjq, TiM, Tkx, Tkr, Tjt, Tky, Tks, TiX, Tjz, Tje, Tjx, TjD, Tjn, Tj9, Tjr;
+					E TiR;
+					ii[WS(rs, 48)] = Tki - Tkh;
+					ii[WS(rs, 16)] = Tkh + Tki;
+					Tjq = TiI + TiL;
+					TiM = TiI - TiL;
+					Tkx = Tkq - Tkp;
+					Tkr = Tkp + Tkq;
+					Tjr = TiQ + TiP;
+					TiR = TiP - TiQ;
+					{
+					     E Tjw, Tj3, Tjs, TiW, Tjv, Tj8;
+					     Tjs = TiS - TiV;
+					     TiW = TiS + TiV;
+					     Tjw = Tj1 + Tj2;
+					     Tj3 = Tj1 - Tj2;
+					     Tjt = Tjr + Tjs;
+					     Tky = Tjs - Tjr;
+					     Tks = TiR + TiW;
+					     TiX = TiR - TiW;
+					     Tjv = Tj4 + Tj7;
+					     Tj8 = Tj4 - Tj7;
+					     Tjz = Tjc + Tjd;
+					     Tje = Tjc - Tjd;
+					     Tjx = FMA(KP414213562, Tjw, Tjv);
+					     TjD = FNMS(KP414213562, Tjv, Tjw);
+					     Tjn = FNMS(KP414213562, Tj3, Tj8);
+					     Tj9 = FMA(KP414213562, Tj8, Tj3);
+					}
+					{
+					     E Tjm, TiY, Tkz, TkB, Tjy, Tjj;
+					     Tjm = FNMS(KP707106781, TiX, TiM);
+					     TiY = FMA(KP707106781, TiX, TiM);
+					     Tkz = FMA(KP707106781, Tky, Tkx);
+					     TkB = FNMS(KP707106781, Tky, Tkx);
+					     Tjy = Tjf + Tji;
+					     Tjj = Tjf - Tji;
+					     {
+						  E TjC, Tkt, Tku, TjF;
+						  {
+						       E Tju, TjE, Tjo, Tjk, TjB, TjA;
+						       TjC = FNMS(KP707106781, Tjt, Tjq);
+						       Tju = FMA(KP707106781, Tjt, Tjq);
+						       TjA = FNMS(KP414213562, Tjz, Tjy);
+						       TjE = FMA(KP414213562, Tjy, Tjz);
+						       Tjo = FMA(KP414213562, Tje, Tjj);
+						       Tjk = FNMS(KP414213562, Tjj, Tje);
+						       TjB = Tjx + TjA;
+						       Tkw = TjA - Tjx;
+						       Tkv = FNMS(KP707106781, Tks, Tkr);
+						       Tkt = FMA(KP707106781, Tks, Tkr);
+						       {
+							    E Tjp, TkA, TkC, Tjl;
+							    Tjp = Tjn + Tjo;
+							    TkA = Tjo - Tjn;
+							    TkC = Tj9 + Tjk;
+							    Tjl = Tj9 - Tjk;
+							    ri[WS(rs, 4)] = FMA(KP923879532, TjB, Tju);
+							    ri[WS(rs, 36)] = FNMS(KP923879532, TjB, Tju);
+							    ri[WS(rs, 60)] = FMA(KP923879532, Tjp, Tjm);
+							    ri[WS(rs, 28)] = FNMS(KP923879532, Tjp, Tjm);
+							    ii[WS(rs, 44)] = FNMS(KP923879532, TkA, Tkz);
+							    ii[WS(rs, 12)] = FMA(KP923879532, TkA, Tkz);
+							    ii[WS(rs, 60)] = FMA(KP923879532, TkC, TkB);
+							    ii[WS(rs, 28)] = FNMS(KP923879532, TkC, TkB);
+							    ri[WS(rs, 12)] = FMA(KP923879532, Tjl, TiY);
+							    ri[WS(rs, 44)] = FNMS(KP923879532, Tjl, TiY);
+							    Tku = TjD + TjE;
+							    TjF = TjD - TjE;
+						       }
+						  }
+						  ii[WS(rs, 36)] = FNMS(KP923879532, Tku, Tkt);
+						  ii[WS(rs, 4)] = FMA(KP923879532, Tku, Tkt);
+						  ri[WS(rs, 20)] = FMA(KP923879532, TjF, TjC);
+						  ri[WS(rs, 52)] = FNMS(KP923879532, TjF, TjC);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E TkV, Tl1, ThG, Tgk, TkH, TkN, Tis, Ti0, Thv, ThJ, TkO, TkI, TgH, Thy, TiC;
+				   E TiG, Tiq, Tim, ThN, ThT, ThD, Th9, TkW, Tiv, Tl2, Ti7, ThP, Thq, Tiz, TiF;
+				   E Tip, Tif;
+				   {
+					E Ti1, Ti2, Ti4, Ti5, Thp, The, Tij, TiB, Tii, Tik;
+					{
+					     E ThW, Tg8, TkT, TkF, ThX, ThY, TkU, Tgj, Tgd, Tgi;
+					     ThW = Tg4 - Tg7;
+					     Tg8 = Tg4 + Tg7;
+					     TkT = TkE - TkD;
+					     TkF = TkD + TkE;
+					     ThX = Tgc - Tg9;
+					     Tgd = Tg9 + Tgc;
+					     ii[WS(rs, 52)] = FNMS(KP923879532, Tkw, Tkv);
+					     ii[WS(rs, 20)] = FMA(KP923879532, Tkw, Tkv);
+					     Tgi = Tge - Tgh;
+					     ThY = Tge + Tgh;
+					     TkU = Tgi - Tgd;
+					     Tgj = Tgd + Tgi;
+					     {
+						  E TgA, ThH, Tgv, TgF;
+						  {
+						       E Tgp, TkG, ThZ, Tgu;
+						       Ti1 = Tgn - Tgo;
+						       Tgp = Tgn + Tgo;
+						       TkV = FMA(KP707106781, TkU, TkT);
+						       Tl1 = FNMS(KP707106781, TkU, TkT);
+						       ThG = FMA(KP707106781, Tgj, Tg8);
+						       Tgk = FNMS(KP707106781, Tgj, Tg8);
+						       TkG = ThX + ThY;
+						       ThZ = ThX - ThY;
+						       Tgu = Tgq + Tgt;
+						       Ti2 = Tgq - Tgt;
+						       Ti4 = Tgy - Tgz;
+						       TgA = Tgy + Tgz;
+						       TkH = FMA(KP707106781, TkG, TkF);
+						       TkN = FNMS(KP707106781, TkG, TkF);
+						       Tis = FNMS(KP707106781, ThZ, ThW);
+						       Ti0 = FMA(KP707106781, ThZ, ThW);
+						       ThH = FMA(KP414213562, Tgp, Tgu);
+						       Tgv = FNMS(KP414213562, Tgu, Tgp);
+						       TgF = TgB + TgE;
+						       Ti5 = TgB - TgE;
+						  }
+						  {
+						       E Tig, Tih, ThI, TgG;
+						       Thv = Thr + Thu;
+						       Tig = Thr - Thu;
+						       Tih = Tho - Thj;
+						       Thp = Thj + Tho;
+						       The = Thc + Thd;
+						       Tij = Thc - Thd;
+						       ThI = FNMS(KP414213562, TgA, TgF);
+						       TgG = FMA(KP414213562, TgF, TgA);
+						       TiB = FMA(KP707106781, Tih, Tig);
+						       Tii = FNMS(KP707106781, Tih, Tig);
+						       ThJ = ThH + ThI;
+						       TkO = ThI - ThH;
+						       TkI = Tgv + TgG;
+						       TgH = Tgv - TgG;
+						       Tik = Thw - Thx;
+						       Thy = Thw + Thx;
+						  }
+					     }
+					}
+					{
+					     E Tic, Tia, Ti9, Tid, Tit, Ti3;
+					     {
+						  E Th4, ThM, TgZ, Th7, ThL, Th8;
+						  {
+						       E TgN, TgY, TiA, Til;
+						       Tic = TgL - TgM;
+						       TgN = TgL + TgM;
+						       TgY = TgS + TgX;
+						       Tia = TgX - TgS;
+						       Ti9 = Th0 - Th3;
+						       Th4 = Th0 + Th3;
+						       TiA = FMA(KP707106781, Tik, Tij);
+						       Til = FNMS(KP707106781, Tik, Tij);
+						       ThM = FMA(KP707106781, TgY, TgN);
+						       TgZ = FNMS(KP707106781, TgY, TgN);
+						       TiC = FNMS(KP198912367, TiB, TiA);
+						       TiG = FMA(KP198912367, TiA, TiB);
+						       Tiq = FMA(KP668178637, Tii, Til);
+						       Tim = FNMS(KP668178637, Til, Tii);
+						       Th7 = Th5 + Th6;
+						       Tid = Th5 - Th6;
+						  }
+						  ThL = FMA(KP707106781, Th7, Th4);
+						  Th8 = FNMS(KP707106781, Th7, Th4);
+						  Tit = FNMS(KP414213562, Ti1, Ti2);
+						  Ti3 = FMA(KP414213562, Ti2, Ti1);
+						  ThN = FMA(KP198912367, ThM, ThL);
+						  ThT = FNMS(KP198912367, ThL, ThM);
+						  ThD = FNMS(KP668178637, TgZ, Th8);
+						  Th9 = FMA(KP668178637, Th8, TgZ);
+					     }
+					     {
+						  E Tiy, Tib, Tiu, Ti6, Tix, Tie;
+						  Tiu = FMA(KP414213562, Ti4, Ti5);
+						  Ti6 = FNMS(KP414213562, Ti5, Ti4);
+						  Tiy = FMA(KP707106781, Tia, Ti9);
+						  Tib = FNMS(KP707106781, Tia, Ti9);
+						  TkW = Tiu - Tit;
+						  Tiv = Tit + Tiu;
+						  Tl2 = Ti3 + Ti6;
+						  Ti7 = Ti3 - Ti6;
+						  Tix = FMA(KP707106781, Tid, Tic);
+						  Tie = FNMS(KP707106781, Tid, Tic);
+						  ThP = FMA(KP707106781, Thp, The);
+						  Thq = FNMS(KP707106781, Thp, The);
+						  Tiz = FMA(KP198912367, Tiy, Tix);
+						  TiF = FNMS(KP198912367, Tix, Tiy);
+						  Tip = FNMS(KP668178637, Tib, Tie);
+						  Tif = FMA(KP668178637, Tie, Tib);
+					     }
+					}
+				   }
+				   {
+					E TkM, TkL, Tl0, TkZ;
+					{
+					     E ThC, TgI, TkP, TkR, ThO, Thz;
+					     ThC = FNMS(KP923879532, TgH, Tgk);
+					     TgI = FMA(KP923879532, TgH, Tgk);
+					     TkP = FMA(KP923879532, TkO, TkN);
+					     TkR = FNMS(KP923879532, TkO, TkN);
+					     ThO = FMA(KP707106781, Thy, Thv);
+					     Thz = FNMS(KP707106781, Thy, Thv);
+					     {
+						  E ThS, TkJ, TkK, ThV;
+						  {
+						       E ThK, ThU, ThE, ThA, ThR, ThQ;
+						       ThS = FNMS(KP923879532, ThJ, ThG);
+						       ThK = FMA(KP923879532, ThJ, ThG);
+						       ThQ = FNMS(KP198912367, ThP, ThO);
+						       ThU = FMA(KP198912367, ThO, ThP);
+						       ThE = FMA(KP668178637, Thq, Thz);
+						       ThA = FNMS(KP668178637, Thz, Thq);
+						       ThR = ThN + ThQ;
+						       TkM = ThQ - ThN;
+						       TkL = FNMS(KP923879532, TkI, TkH);
+						       TkJ = FMA(KP923879532, TkI, TkH);
+						       {
+							    E ThF, TkQ, TkS, ThB;
+							    ThF = ThD + ThE;
+							    TkQ = ThE - ThD;
+							    TkS = Th9 + ThA;
+							    ThB = Th9 - ThA;
+							    ri[WS(rs, 2)] = FMA(KP980785280, ThR, ThK);
+							    ri[WS(rs, 34)] = FNMS(KP980785280, ThR, ThK);
+							    ri[WS(rs, 58)] = FMA(KP831469612, ThF, ThC);
+							    ri[WS(rs, 26)] = FNMS(KP831469612, ThF, ThC);
+							    ii[WS(rs, 42)] = FNMS(KP831469612, TkQ, TkP);
+							    ii[WS(rs, 10)] = FMA(KP831469612, TkQ, TkP);
+							    ii[WS(rs, 58)] = FMA(KP831469612, TkS, TkR);
+							    ii[WS(rs, 26)] = FNMS(KP831469612, TkS, TkR);
+							    ri[WS(rs, 10)] = FMA(KP831469612, ThB, TgI);
+							    ri[WS(rs, 42)] = FNMS(KP831469612, ThB, TgI);
+							    TkK = ThT + ThU;
+							    ThV = ThT - ThU;
+						       }
+						  }
+						  ii[WS(rs, 34)] = FNMS(KP980785280, TkK, TkJ);
+						  ii[WS(rs, 2)] = FMA(KP980785280, TkK, TkJ);
+						  ri[WS(rs, 18)] = FMA(KP980785280, ThV, ThS);
+						  ri[WS(rs, 50)] = FNMS(KP980785280, ThV, ThS);
+					     }
+					}
+					{
+					     E Tio, TkX, TkY, Tir, Ti8, Tin;
+					     Tio = FNMS(KP923879532, Ti7, Ti0);
+					     Ti8 = FMA(KP923879532, Ti7, Ti0);
+					     Tin = Tif + Tim;
+					     Tl0 = Tim - Tif;
+					     TkZ = FNMS(KP923879532, TkW, TkV);
+					     TkX = FMA(KP923879532, TkW, TkV);
+					     ii[WS(rs, 50)] = FNMS(KP980785280, TkM, TkL);
+					     ii[WS(rs, 18)] = FMA(KP980785280, TkM, TkL);
+					     ri[WS(rs, 6)] = FMA(KP831469612, Tin, Ti8);
+					     ri[WS(rs, 38)] = FNMS(KP831469612, Tin, Ti8);
+					     TkY = Tip + Tiq;
+					     Tir = Tip - Tiq;
+					     ii[WS(rs, 38)] = FNMS(KP831469612, TkY, TkX);
+					     ii[WS(rs, 6)] = FMA(KP831469612, TkY, TkX);
+					     ri[WS(rs, 22)] = FMA(KP831469612, Tir, Tio);
+					     ri[WS(rs, 54)] = FNMS(KP831469612, Tir, Tio);
+					}
+					{
+					     E TiE, Tl3, Tl4, TiH, Tiw, TiD;
+					     TiE = FMA(KP923879532, Tiv, Tis);
+					     Tiw = FNMS(KP923879532, Tiv, Tis);
+					     TiD = Tiz - TiC;
+					     Tl6 = Tiz + TiC;
+					     Tl5 = FMA(KP923879532, Tl2, Tl1);
+					     Tl3 = FNMS(KP923879532, Tl2, Tl1);
+					     ii[WS(rs, 54)] = FNMS(KP831469612, Tl0, TkZ);
+					     ii[WS(rs, 22)] = FMA(KP831469612, Tl0, TkZ);
+					     ri[WS(rs, 14)] = FMA(KP980785280, TiD, Tiw);
+					     ri[WS(rs, 46)] = FNMS(KP980785280, TiD, Tiw);
+					     Tl4 = TiG - TiF;
+					     TiH = TiF + TiG;
+					     ii[WS(rs, 46)] = FNMS(KP980785280, Tl4, Tl3);
+					     ii[WS(rs, 14)] = FMA(KP980785280, Tl4, Tl3);
+					     ri[WS(rs, 62)] = FMA(KP980785280, TiH, TiE);
+					     ri[WS(rs, 30)] = FNMS(KP980785280, TiH, TiE);
+					}
+				   }
+			      }
+			      {
+				   E Tla, TdV, TdO, Tm6, Tm5, TdR;
+				   {
+					E TcT, TlO, TlI, Tar, TcX, Td3, TcN, TbB, TdM, TdQ, TdA, Tdw, TdJ, TdP, Tdz;
+					E Tdp, TlW, TdF, Tm2, Tdh, Td7, T91, Td6, T8M, TlT, TlF, Td0, Td4, TcO, TcK;
+					E T9g, Td8;
+					{
+					     E Tdb, Tdc, Tde, Tdf, Tdm, Tdk, Tdj, Tdn, TcF, Tct, TbY, Tdt, TdL, Tds, Tdu;
+					     E TcI, TdD, Tdd;
+					     {
+						  E Tae, TcR, T9R, Tap, T9F, T9Q;
+						  Tdb = FMA(KP707106781, T9E, T9p);
+						  T9F = FNMS(KP707106781, T9E, T9p);
+						  T9Q = FNMS(KP707106781, T9P, T9M);
+						  Tdc = FMA(KP707106781, T9P, T9M);
+						  Tde = FMA(KP707106781, Tad, T9Y);
+						  Tae = FNMS(KP707106781, Tad, T9Y);
+						  ii[WS(rs, 62)] = FMA(KP980785280, Tl6, Tl5);
+						  ii[WS(rs, 30)] = FNMS(KP980785280, Tl6, Tl5);
+						  TcR = FMA(KP668178637, T9F, T9Q);
+						  T9R = FNMS(KP668178637, T9Q, T9F);
+						  Tap = FNMS(KP707106781, Tao, Tal);
+						  Tdf = FMA(KP707106781, Tao, Tal);
+						  {
+						       E Tbw, TcW, Tbl, Tbz;
+						       {
+							    E TaP, Tbk, TcS, Taq;
+							    Tdm = FMA(KP707106781, TaO, Taz);
+							    TaP = FNMS(KP707106781, TaO, Taz);
+							    Tbk = Tb4 - Tbj;
+							    Tdk = Tbj + Tb4;
+							    Tdj = FMA(KP707106781, Tbv, Tbs);
+							    Tbw = FNMS(KP707106781, Tbv, Tbs);
+							    TcS = FNMS(KP668178637, Tae, Tap);
+							    Taq = FMA(KP668178637, Tap, Tae);
+							    TcW = FMA(KP923879532, Tbk, TaP);
+							    Tbl = FNMS(KP923879532, Tbk, TaP);
+							    TcT = TcR + TcS;
+							    TlO = TcS - TcR;
+							    TlI = T9R + Taq;
+							    Tar = T9R - Taq;
+							    Tbz = Tbx - Tby;
+							    Tdn = Tbx + Tby;
+						       }
+						       {
+							    E Tdq, Tdr, TcV, TbA;
+							    TcF = FNMS(KP707106781, TcE, TcB);
+							    Tdq = FMA(KP707106781, TcE, TcB);
+							    Tdr = Tcs + Tcd;
+							    Tct = Tcd - Tcs;
+							    TbY = FNMS(KP707106781, TbX, TbI);
+							    Tdt = FMA(KP707106781, TbX, TbI);
+							    TcV = FMA(KP923879532, Tbz, Tbw);
+							    TbA = FNMS(KP923879532, Tbz, Tbw);
+							    TdL = FMA(KP923879532, Tdr, Tdq);
+							    Tds = FNMS(KP923879532, Tdr, Tdq);
+							    TcX = FMA(KP303346683, TcW, TcV);
+							    Td3 = FNMS(KP303346683, TcV, TcW);
+							    TcN = FNMS(KP534511135, Tbl, TbA);
+							    TbB = FMA(KP534511135, TbA, Tbl);
+							    Tdu = TcG + TcH;
+							    TcI = TcG - TcH;
+						       }
+						  }
+					     }
+					     {
+						  E TdI, Tdl, TdK, Tdv, TdH, Tdo;
+						  TdK = FMA(KP923879532, Tdu, Tdt);
+						  Tdv = FNMS(KP923879532, Tdu, Tdt);
+						  TdI = FMA(KP923879532, Tdk, Tdj);
+						  Tdl = FNMS(KP923879532, Tdk, Tdj);
+						  TdM = FNMS(KP098491403, TdL, TdK);
+						  TdQ = FMA(KP098491403, TdK, TdL);
+						  TdA = FMA(KP820678790, Tds, Tdv);
+						  Tdw = FNMS(KP820678790, Tdv, Tds);
+						  TdH = FMA(KP923879532, Tdn, Tdm);
+						  Tdo = FNMS(KP923879532, Tdn, Tdm);
+						  TdD = FNMS(KP198912367, Tdb, Tdc);
+						  Tdd = FMA(KP198912367, Tdc, Tdb);
+						  TdJ = FMA(KP098491403, TdI, TdH);
+						  TdP = FNMS(KP098491403, TdH, TdI);
+						  Tdz = FNMS(KP820678790, Tdl, Tdo);
+						  Tdp = FMA(KP820678790, Tdo, Tdl);
+					     }
+					     {
+						  E TcZ, Tcu, TdE, Tdg;
+						  TdE = FMA(KP198912367, Tde, Tdf);
+						  Tdg = FNMS(KP198912367, Tdf, Tde);
+						  TcZ = FMA(KP923879532, Tct, TbY);
+						  Tcu = FNMS(KP923879532, Tct, TbY);
+						  TlW = TdE - TdD;
+						  TdF = TdD + TdE;
+						  Tm2 = Tdd + Tdg;
+						  Tdh = Tdd - Tdg;
+						  {
+						       E T8L, TlE, TcY, TcJ;
+						       Tla = T8D + T8K;
+						       T8L = T8D - T8K;
+						       TlE = TdU - TdT;
+						       TdV = TdT + TdU;
+						       Td7 = FNMS(KP414213562, T8T, T90);
+						       T91 = FMA(KP414213562, T90, T8T);
+						       TcY = FMA(KP923879532, TcI, TcF);
+						       TcJ = FNMS(KP923879532, TcI, TcF);
+						       Td6 = FNMS(KP707106781, T8L, T8w);
+						       T8M = FMA(KP707106781, T8L, T8w);
+						       TlT = FNMS(KP707106781, TlE, TlD);
+						       TlF = FMA(KP707106781, TlE, TlD);
+						       Td0 = FNMS(KP303346683, TcZ, TcY);
+						       Td4 = FMA(KP303346683, TcY, TcZ);
+						       TcO = FMA(KP534511135, Tcu, TcJ);
+						       TcK = FNMS(KP534511135, TcJ, Tcu);
+						       T9g = FNMS(KP414213562, T9f, T98);
+						       Td8 = FMA(KP414213562, T98, T9f);
+						  }
+					     }
+					}
+					{
+					     E Tm1, TlV, TdC, Tda, Td2, TlM, TlL, Td5;
+					     {
+						  E TlS, TcQ, TlH, TcM, TlR, TcP;
+						  {
+						       E TcL, Tas, TlP, TlQ, TlN;
+						       TlS = TbB + TcK;
+						       TcL = TbB - TcK;
+						       {
+							    E TlU, T9h, TlG, Td9, T9i;
+							    TlU = T91 + T9g;
+							    T9h = T91 - T9g;
+							    TlG = Td8 - Td7;
+							    Td9 = Td7 + Td8;
+							    Tm1 = FMA(KP923879532, TlU, TlT);
+							    TlV = FNMS(KP923879532, TlU, TlT);
+							    TcQ = FMA(KP923879532, T9h, T8M);
+							    T9i = FNMS(KP923879532, T9h, T8M);
+							    TlN = FNMS(KP923879532, TlG, TlF);
+							    TlH = FMA(KP923879532, TlG, TlF);
+							    TdC = FMA(KP923879532, Td9, Td6);
+							    Tda = FNMS(KP923879532, Td9, Td6);
+							    Tas = FMA(KP831469612, Tar, T9i);
+							    TcM = FNMS(KP831469612, Tar, T9i);
+						       }
+						       TlR = FNMS(KP831469612, TlO, TlN);
+						       TlP = FMA(KP831469612, TlO, TlN);
+						       TlQ = TcO - TcN;
+						       TcP = TcN + TcO;
+						       ri[WS(rs, 11)] = FMA(KP881921264, TcL, Tas);
+						       ri[WS(rs, 43)] = FNMS(KP881921264, TcL, Tas);
+						       ii[WS(rs, 43)] = FNMS(KP881921264, TlQ, TlP);
+						       ii[WS(rs, 11)] = FMA(KP881921264, TlQ, TlP);
+						  }
+						  {
+						       E TcU, Td1, TlJ, TlK;
+						       Td2 = FNMS(KP831469612, TcT, TcQ);
+						       TcU = FMA(KP831469612, TcT, TcQ);
+						       ri[WS(rs, 59)] = FMA(KP881921264, TcP, TcM);
+						       ri[WS(rs, 27)] = FNMS(KP881921264, TcP, TcM);
+						       ii[WS(rs, 59)] = FMA(KP881921264, TlS, TlR);
+						       ii[WS(rs, 27)] = FNMS(KP881921264, TlS, TlR);
+						       Td1 = TcX + Td0;
+						       TlM = Td0 - TcX;
+						       TlL = FNMS(KP831469612, TlI, TlH);
+						       TlJ = FMA(KP831469612, TlI, TlH);
+						       TlK = Td3 + Td4;
+						       Td5 = Td3 - Td4;
+						       ri[WS(rs, 3)] = FMA(KP956940335, Td1, TcU);
+						       ri[WS(rs, 35)] = FNMS(KP956940335, Td1, TcU);
+						       ii[WS(rs, 35)] = FNMS(KP956940335, TlK, TlJ);
+						       ii[WS(rs, 3)] = FMA(KP956940335, TlK, TlJ);
+						  }
+					     }
+					     {
+						  E Tdy, Tm0, TlZ, TdB;
+						  {
+						       E Tdi, Tdx, TlX, TlY;
+						       Tdy = FNMS(KP980785280, Tdh, Tda);
+						       Tdi = FMA(KP980785280, Tdh, Tda);
+						       ri[WS(rs, 19)] = FMA(KP956940335, Td5, Td2);
+						       ri[WS(rs, 51)] = FNMS(KP956940335, Td5, Td2);
+						       ii[WS(rs, 51)] = FNMS(KP956940335, TlM, TlL);
+						       ii[WS(rs, 19)] = FMA(KP956940335, TlM, TlL);
+						       Tdx = Tdp + Tdw;
+						       Tm0 = Tdw - Tdp;
+						       TlZ = FNMS(KP980785280, TlW, TlV);
+						       TlX = FMA(KP980785280, TlW, TlV);
+						       TlY = Tdz + TdA;
+						       TdB = Tdz - TdA;
+						       ri[WS(rs, 7)] = FMA(KP773010453, Tdx, Tdi);
+						       ri[WS(rs, 39)] = FNMS(KP773010453, Tdx, Tdi);
+						       ii[WS(rs, 39)] = FNMS(KP773010453, TlY, TlX);
+						       ii[WS(rs, 7)] = FMA(KP773010453, TlY, TlX);
+						  }
+						  {
+						       E TdG, TdN, Tm3, Tm4;
+						       TdO = FMA(KP980785280, TdF, TdC);
+						       TdG = FNMS(KP980785280, TdF, TdC);
+						       ri[WS(rs, 23)] = FMA(KP773010453, TdB, Tdy);
+						       ri[WS(rs, 55)] = FNMS(KP773010453, TdB, Tdy);
+						       ii[WS(rs, 55)] = FNMS(KP773010453, Tm0, TlZ);
+						       ii[WS(rs, 23)] = FMA(KP773010453, Tm0, TlZ);
+						       TdN = TdJ - TdM;
+						       Tm6 = TdJ + TdM;
+						       Tm5 = FMA(KP980785280, Tm2, Tm1);
+						       Tm3 = FNMS(KP980785280, Tm2, Tm1);
+						       Tm4 = TdQ - TdP;
+						       TdR = TdP + TdQ;
+						       ri[WS(rs, 15)] = FMA(KP995184726, TdN, TdG);
+						       ri[WS(rs, 47)] = FNMS(KP995184726, TdN, TdG);
+						       ii[WS(rs, 47)] = FNMS(KP995184726, Tm4, Tm3);
+						       ii[WS(rs, 15)] = FMA(KP995184726, Tm4, Tm3);
+						  }
+					     }
+					}
+				   }
+				   {
+					E Tf5, Tlk, Tle, Tej, Tf9, Tff, TeZ, TeD, TfY, Tg2, TfM, TfI, TfV, Tg1, TfL;
+					E TfB, Tls, TfR, Tly, Tft, Tfj, TdZ, Tfi, TdW, Tlp, Tlb, Tfc, Tfg, Tf0, TeW;
+					E Te2, Tfk;
+					{
+					     E Tfn, Tfo, Tfq, Tfr, Tfy, Tfw, Tfv, Tfz, TeR, TeN, TeG, TfF, TfX, TfE, TfG;
+					     E TeU, TfP, Tfp;
+					     {
+						  E Te7, Tea, Tee, Teh;
+						  Tfn = FNMS(KP707106781, Te6, Te5);
+						  Te7 = FMA(KP707106781, Te6, Te5);
+						  ri[WS(rs, 63)] = FMA(KP995184726, TdR, TdO);
+						  ri[WS(rs, 31)] = FNMS(KP995184726, TdR, TdO);
+						  ii[WS(rs, 63)] = FMA(KP995184726, Tm6, Tm5);
+						  ii[WS(rs, 31)] = FNMS(KP995184726, Tm6, Tm5);
+						  Tea = FMA(KP707106781, Te9, Te8);
+						  Tfo = FNMS(KP707106781, Te9, Te8);
+						  Tfq = FNMS(KP707106781, Ted, Tec);
+						  Tee = FMA(KP707106781, Ted, Tec);
+						  Teh = FMA(KP707106781, Teg, Tef);
+						  Tfr = FNMS(KP707106781, Teg, Tef);
+						  {
+						       E Tey, Tf8, Tev, TeB;
+						       {
+							    E Ten, Tf3, Teb, Tf4, Tei, Teu;
+							    Tfy = FNMS(KP707106781, Tem, Tel);
+							    Ten = FMA(KP707106781, Tem, Tel);
+							    Tf3 = FMA(KP198912367, Te7, Tea);
+							    Teb = FNMS(KP198912367, Tea, Te7);
+							    Tf4 = FNMS(KP198912367, Tee, Teh);
+							    Tei = FMA(KP198912367, Teh, Tee);
+							    Teu = Teq + Tet;
+							    Tfw = Tet - Teq;
+							    Tfv = FNMS(KP707106781, Tex, Tew);
+							    Tey = FMA(KP707106781, Tex, Tew);
+							    Tf5 = Tf3 + Tf4;
+							    Tlk = Tf4 - Tf3;
+							    Tle = Teb + Tei;
+							    Tej = Teb - Tei;
+							    Tf8 = FMA(KP923879532, Teu, Ten);
+							    Tev = FNMS(KP923879532, Teu, Ten);
+							    TeB = Tez + TeA;
+							    Tfz = Tez - TeA;
+						       }
+						       {
+							    E TfC, TfD, Tf7, TeC;
+							    TeR = FMA(KP707106781, TeQ, TeP);
+							    TfC = FNMS(KP707106781, TeQ, TeP);
+							    TfD = TeM - TeJ;
+							    TeN = TeJ + TeM;
+							    TeG = FMA(KP707106781, TeF, TeE);
+							    TfF = FNMS(KP707106781, TeF, TeE);
+							    Tf7 = FMA(KP923879532, TeB, Tey);
+							    TeC = FNMS(KP923879532, TeB, Tey);
+							    TfX = FMA(KP923879532, TfD, TfC);
+							    TfE = FNMS(KP923879532, TfD, TfC);
+							    Tf9 = FMA(KP098491403, Tf8, Tf7);
+							    Tff = FNMS(KP098491403, Tf7, Tf8);
+							    TeZ = FNMS(KP820678790, Tev, TeC);
+							    TeD = FMA(KP820678790, TeC, Tev);
+							    TfG = TeS - TeT;
+							    TeU = TeS + TeT;
+						       }
+						  }
+					     }
+					     {
+						  E TfU, Tfx, TfW, TfH, TfT, TfA;
+						  TfW = FMA(KP923879532, TfG, TfF);
+						  TfH = FNMS(KP923879532, TfG, TfF);
+						  TfU = FMA(KP923879532, Tfw, Tfv);
+						  Tfx = FNMS(KP923879532, Tfw, Tfv);
+						  TfY = FNMS(KP303346683, TfX, TfW);
+						  Tg2 = FMA(KP303346683, TfW, TfX);
+						  TfM = FMA(KP534511135, TfE, TfH);
+						  TfI = FNMS(KP534511135, TfH, TfE);
+						  TfT = FMA(KP923879532, Tfz, Tfy);
+						  TfA = FNMS(KP923879532, Tfz, Tfy);
+						  TfP = FNMS(KP668178637, Tfn, Tfo);
+						  Tfp = FMA(KP668178637, Tfo, Tfn);
+						  TfV = FMA(KP303346683, TfU, TfT);
+						  Tg1 = FNMS(KP303346683, TfT, TfU);
+						  TfL = FNMS(KP534511135, Tfx, TfA);
+						  TfB = FMA(KP534511135, TfA, Tfx);
+					     }
+					     {
+						  E Tfb, TeO, TfQ, Tfs, Tfa, TeV;
+						  TfQ = FMA(KP668178637, Tfq, Tfr);
+						  Tfs = FNMS(KP668178637, Tfr, Tfq);
+						  Tfb = FMA(KP923879532, TeN, TeG);
+						  TeO = FNMS(KP923879532, TeN, TeG);
+						  Tls = TfQ - TfP;
+						  TfR = TfP + TfQ;
+						  Tly = Tfp + Tfs;
+						  Tft = Tfp - Tfs;
+						  Tfj = FNMS(KP414213562, TdX, TdY);
+						  TdZ = FMA(KP414213562, TdY, TdX);
+						  Tfa = FMA(KP923879532, TeU, TeR);
+						  TeV = FNMS(KP923879532, TeU, TeR);
+						  Tfi = FNMS(KP707106781, TdV, TdS);
+						  TdW = FMA(KP707106781, TdV, TdS);
+						  Tlp = FNMS(KP707106781, Tla, Tl9);
+						  Tlb = FMA(KP707106781, Tla, Tl9);
+						  Tfc = FNMS(KP098491403, Tfb, Tfa);
+						  Tfg = FMA(KP098491403, Tfa, Tfb);
+						  Tf0 = FMA(KP820678790, TeO, TeV);
+						  TeW = FNMS(KP820678790, TeV, TeO);
+						  Te2 = FNMS(KP414213562, Te1, Te0);
+						  Tfk = FMA(KP414213562, Te0, Te1);
+					     }
+					}
+					{
+					     E Tlx, Tlr, TfO, Tfm, Tfe, Tli, Tlh, Tfh;
+					     {
+						  E Tlo, Tf2, Tld, TeY, Tln, Tf1;
+						  {
+						       E TeX, Tek, Tll, Tlm, Tlj;
+						       Tlo = TeD + TeW;
+						       TeX = TeD - TeW;
+						       {
+							    E Tlq, Te3, Tlc, Tfl, Te4;
+							    Tlq = Te2 - TdZ;
+							    Te3 = TdZ + Te2;
+							    Tlc = Tfj + Tfk;
+							    Tfl = Tfj - Tfk;
+							    Tlx = FNMS(KP923879532, Tlq, Tlp);
+							    Tlr = FMA(KP923879532, Tlq, Tlp);
+							    Tf2 = FMA(KP923879532, Te3, TdW);
+							    Te4 = FNMS(KP923879532, Te3, TdW);
+							    Tlj = FNMS(KP923879532, Tlc, Tlb);
+							    Tld = FMA(KP923879532, Tlc, Tlb);
+							    TfO = FNMS(KP923879532, Tfl, Tfi);
+							    Tfm = FMA(KP923879532, Tfl, Tfi);
+							    Tek = FMA(KP980785280, Tej, Te4);
+							    TeY = FNMS(KP980785280, Tej, Te4);
+						       }
+						       Tln = FNMS(KP980785280, Tlk, Tlj);
+						       Tll = FMA(KP980785280, Tlk, Tlj);
+						       Tlm = Tf0 - TeZ;
+						       Tf1 = TeZ + Tf0;
+						       ri[WS(rs, 9)] = FMA(KP773010453, TeX, Tek);
+						       ri[WS(rs, 41)] = FNMS(KP773010453, TeX, Tek);
+						       ii[WS(rs, 41)] = FNMS(KP773010453, Tlm, Tll);
+						       ii[WS(rs, 9)] = FMA(KP773010453, Tlm, Tll);
+						  }
+						  {
+						       E Tf6, Tfd, Tlf, Tlg;
+						       Tfe = FNMS(KP980785280, Tf5, Tf2);
+						       Tf6 = FMA(KP980785280, Tf5, Tf2);
+						       ri[WS(rs, 57)] = FMA(KP773010453, Tf1, TeY);
+						       ri[WS(rs, 25)] = FNMS(KP773010453, Tf1, TeY);
+						       ii[WS(rs, 57)] = FMA(KP773010453, Tlo, Tln);
+						       ii[WS(rs, 25)] = FNMS(KP773010453, Tlo, Tln);
+						       Tfd = Tf9 + Tfc;
+						       Tli = Tfc - Tf9;
+						       Tlh = FNMS(KP980785280, Tle, Tld);
+						       Tlf = FMA(KP980785280, Tle, Tld);
+						       Tlg = Tff + Tfg;
+						       Tfh = Tff - Tfg;
+						       ri[WS(rs, 1)] = FMA(KP995184726, Tfd, Tf6);
+						       ri[WS(rs, 33)] = FNMS(KP995184726, Tfd, Tf6);
+						       ii[WS(rs, 33)] = FNMS(KP995184726, Tlg, Tlf);
+						       ii[WS(rs, 1)] = FMA(KP995184726, Tlg, Tlf);
+						  }
+					     }
+					     {
+						  E TfK, Tlw, Tlv, TfN;
+						  {
+						       E Tfu, TfJ, Tlt, Tlu;
+						       TfK = FNMS(KP831469612, Tft, Tfm);
+						       Tfu = FMA(KP831469612, Tft, Tfm);
+						       ri[WS(rs, 17)] = FMA(KP995184726, Tfh, Tfe);
+						       ri[WS(rs, 49)] = FNMS(KP995184726, Tfh, Tfe);
+						       ii[WS(rs, 49)] = FNMS(KP995184726, Tli, Tlh);
+						       ii[WS(rs, 17)] = FMA(KP995184726, Tli, Tlh);
+						       TfJ = TfB + TfI;
+						       Tlw = TfI - TfB;
+						       Tlv = FNMS(KP831469612, Tls, Tlr);
+						       Tlt = FMA(KP831469612, Tls, Tlr);
+						       Tlu = TfL + TfM;
+						       TfN = TfL - TfM;
+						       ri[WS(rs, 5)] = FMA(KP881921264, TfJ, Tfu);
+						       ri[WS(rs, 37)] = FNMS(KP881921264, TfJ, Tfu);
+						       ii[WS(rs, 37)] = FNMS(KP881921264, Tlu, Tlt);
+						       ii[WS(rs, 5)] = FMA(KP881921264, Tlu, Tlt);
+						  }
+						  {
+						       E TfS, TfZ, Tlz, TlA;
+						       Tg0 = FMA(KP831469612, TfR, TfO);
+						       TfS = FNMS(KP831469612, TfR, TfO);
+						       ri[WS(rs, 21)] = FMA(KP881921264, TfN, TfK);
+						       ri[WS(rs, 53)] = FNMS(KP881921264, TfN, TfK);
+						       ii[WS(rs, 53)] = FNMS(KP881921264, Tlw, Tlv);
+						       ii[WS(rs, 21)] = FMA(KP881921264, Tlw, Tlv);
+						       TfZ = TfV - TfY;
+						       TlC = TfV + TfY;
+						       TlB = FMA(KP831469612, Tly, Tlx);
+						       Tlz = FNMS(KP831469612, Tly, Tlx);
+						       TlA = Tg2 - Tg1;
+						       Tg3 = Tg1 + Tg2;
+						       ri[WS(rs, 13)] = FMA(KP956940335, TfZ, TfS);
+						       ri[WS(rs, 45)] = FNMS(KP956940335, TfZ, TfS);
+						       ii[WS(rs, 45)] = FNMS(KP956940335, TlA, Tlz);
+						       ii[WS(rs, 13)] = FMA(KP956940335, TlA, Tlz);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ri[WS(rs, 61)] = FMA(KP956940335, Tg3, Tg0);
+	       ri[WS(rs, 29)] = FNMS(KP956940335, Tg3, Tg0);
+	       ii[WS(rs, 61)] = FMA(KP956940335, TlC, TlB);
+	       ii[WS(rs, 29)] = FNMS(KP956940335, TlC, TlB);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 27},
+     {TW_CEXP, 0, 63},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 64, "t2_64", twinstr, &GENUS, {520, 206, 634, 0}, 0, 0, 0 };
+
+void X(codelet_t2_64) (planner *p) {
+     X(kdft_dit_register) (p, t2_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 64 -name t2_64 -include t.h */
+
+/*
+ * This function contains 1154 FP additions, 660 FP multiplications,
+ * (or, 880 additions, 386 multiplications, 274 fused multiply/add),
+ * 302 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "t.h"
+
+static void t2_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T2, T5, T3, T6, Te, T9, TP, T3e, T1e, T39, T3c, TT, T1a, T37, T8;
+	       E Tw, Td, Ty, Tm, Th, T1C, T3K, T1V, T3x, T3I, T1G, T1R, T3v, T2m, T2q;
+	       E T5Y, T6u, T53, T5B, T62, T6w, T57, T5D, T2V, T2X, Tg, TE, T3Y, T3V, T3j;
+	       E Tl, TA, T3g, T1j, T1t, TV, T2C, T2z, T1u, TZ, T1h, To, T1p, T6j, T6H;
+	       E Ts, T1l, T6l, T6F, T2P, T4b, T4x, T5i, T2R, T49, T4z, T5g, TG, T4k, T4m;
+	       E TK, T21, T3O, T3Q, T25, TW, T10, T11, T79, T6X, T5M, T6b, T1v, T30, T69;
+	       E T77, T13, T2F, T2D, T6p, T6O, T1x, T2a, T2f, T6V, T28, T6r, T2h, T6Q, T32;
+	       E T5K, T5w, T4G, T4Q, T3m, T4h, T4I, T5y, T3k, T4f, T41, T4S, T4Y, T3q, T3D;
+	       E T3F, T5r, T3s, T4W, T3Z, T5p;
+	       {
+		    E Ta, Tj, Tx, TC, Tf, Tk, Tz, TD, T1B, T1E, T2o, T2l, T1T, T1Q, T1A;
+		    E T1F, T2p, T2k, T1U, T1P;
+		    {
+			 E T4, T1d, T19, Tb, T1c, T7, Tc, T18, TR, TO, TS, TN;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 Te = W[5];
+			 T9 = W[4];
+			 T4 = T2 * T3;
+			 T1d = T5 * T9;
+			 T19 = T5 * Te;
+			 Tb = T2 * T6;
+			 T1c = T2 * Te;
+			 T7 = T5 * T6;
+			 Tc = T5 * T3;
+			 T18 = T2 * T9;
+			 TR = T3 * Te;
+			 TO = T6 * Te;
+			 TS = T6 * T9;
+			 TN = T3 * T9;
+			 TP = TN - TO;
+			 T3e = TR - TS;
+			 T1e = T1c - T1d;
+			 T39 = T1c + T1d;
+			 T3c = TN + TO;
+			 TT = TR + TS;
+			 T1a = T18 + T19;
+			 T37 = T18 - T19;
+			 T8 = T4 - T7;
+			 Ta = T8 * T9;
+			 Tj = T8 * Te;
+			 Tw = T4 + T7;
+			 Tx = Tw * T9;
+			 TC = Tw * Te;
+			 Td = Tb + Tc;
+			 Tf = Td * Te;
+			 Tk = Td * T9;
+			 Ty = Tb - Tc;
+			 Tz = Ty * Te;
+			 TD = Ty * T9;
+			 Tm = W[7];
+			 T1B = T6 * Tm;
+			 T1E = T3 * Tm;
+			 T2o = T2 * Tm;
+			 T2l = T5 * Tm;
+			 T1T = T9 * Tm;
+			 T1Q = Te * Tm;
+			 Th = W[6];
+			 T1A = T3 * Th;
+			 T1F = T6 * Th;
+			 T2p = T5 * Th;
+			 T2k = T2 * Th;
+			 T1U = Te * Th;
+			 T1P = T9 * Th;
+		    }
+		    T1C = T1A + T1B;
+		    T3K = T1E + T1F;
+		    T1V = T1T + T1U;
+		    T3x = T2o - T2p;
+		    T3I = T1A - T1B;
+		    T1G = T1E - T1F;
+		    T1R = T1P - T1Q;
+		    {
+			 E T5W, T5X, T55, T56;
+			 T3v = T2k + T2l;
+			 T2m = T2k - T2l;
+			 T2q = T2o + T2p;
+			 T5W = T8 * Th;
+			 T5X = Td * Tm;
+			 T5Y = T5W - T5X;
+			 T6u = T5W + T5X;
+			 {
+			      E T51, T52, T60, T61;
+			      T51 = Tw * Th;
+			      T52 = Ty * Tm;
+			      T53 = T51 + T52;
+			      T5B = T51 - T52;
+			      T60 = T8 * Tm;
+			      T61 = Td * Th;
+			      T62 = T60 + T61;
+			      T6w = T60 - T61;
+			 }
+			 T55 = Tw * Tm;
+			 T56 = Ty * Th;
+			 T57 = T55 - T56;
+			 T5D = T55 + T56;
+			 {
+			      E Ti, Tq, TF, TJ, T3W, T3X, T3T, T3U, T3h, T3i, Tn, Tr, TB, TI, T3d;
+			      E T3f, T1k, T1o, T1Z, T23, TQ, TU, T2A, T2B, T2x, T2y, T20, T24, TX, TY;
+			      E T1i, T1n;
+			      T2V = T1P + T1Q;
+			      T2X = T1T - T1U;
+			      Tg = Ta + Tf;
+			      Ti = Tg * Th;
+			      Tq = Tg * Tm;
+			      TE = TC + TD;
+			      TF = TE * Tm;
+			      TJ = TE * Th;
+			      T3W = T37 * Tm;
+			      T3X = T39 * Th;
+			      T3Y = T3W - T3X;
+			      T3T = T37 * Th;
+			      T3U = T39 * Tm;
+			      T3V = T3T + T3U;
+			      T3h = T3c * Tm;
+			      T3i = T3e * Th;
+			      T3j = T3h - T3i;
+			      Tl = Tj - Tk;
+			      Tn = Tl * Tm;
+			      Tr = Tl * Th;
+			      TA = Tx - Tz;
+			      TB = TA * Th;
+			      TI = TA * Tm;
+			      T3d = T3c * Th;
+			      T3f = T3e * Tm;
+			      T3g = T3d + T3f;
+			      T1j = Tj + Tk;
+			      T1k = T1j * Tm;
+			      T1o = T1j * Th;
+			      T1t = Tx + Tz;
+			      T1Z = T1t * Th;
+			      T23 = T1t * Tm;
+			      TQ = TP * Th;
+			      TU = TT * Tm;
+			      TV = TQ + TU;
+			      T2A = T1a * Tm;
+			      T2B = T1e * Th;
+			      T2C = T2A - T2B;
+			      T2x = T1a * Th;
+			      T2y = T1e * Tm;
+			      T2z = T2x + T2y;
+			      T1u = TC - TD;
+			      T20 = T1u * Tm;
+			      T24 = T1u * Th;
+			      TX = TP * Tm;
+			      TY = TT * Th;
+			      TZ = TX - TY;
+			      T1h = Ta - Tf;
+			      T1i = T1h * Th;
+			      T1n = T1h * Tm;
+			      To = Ti - Tn;
+			      T1p = T1n + T1o;
+			      T6j = TQ - TU;
+			      T6H = T2A + T2B;
+			      Ts = Tq + Tr;
+			      T1l = T1i - T1k;
+			      T6l = TX + TY;
+			      T6F = T2x - T2y;
+			      T2P = T1Z - T20;
+			      T4b = TI + TJ;
+			      T4x = T3d - T3f;
+			      T5i = T3W + T3X;
+			      T2R = T23 + T24;
+			      T49 = TB - TF;
+			      T4z = T3h + T3i;
+			      T5g = T3T - T3U;
+			      TG = TB + TF;
+			      T4k = Ti + Tn;
+			      T4m = Tq - Tr;
+			      TK = TI - TJ;
+			      T21 = T1Z + T20;
+			      T3O = T1i + T1k;
+			      T3Q = T1n - T1o;
+			      T25 = T23 - T24;
+			      TW = W[8];
+			      T10 = W[9];
+			      T11 = FMA(TV, TW, TZ * T10);
+			      T79 = FNMS(T25, TW, T21 * T10);
+			      T6X = FNMS(Td, TW, T8 * T10);
+			      T5M = FNMS(T2X, TW, T2V * T10);
+			      T6b = FNMS(TK, TW, TG * T10);
+			      T1v = FMA(T1t, TW, T1u * T10);
+			      T30 = FMA(T1h, TW, T1j * T10);
+			      T69 = FMA(TG, TW, TK * T10);
+			      T77 = FMA(T21, TW, T25 * T10);
+			      T13 = FNMS(TZ, TW, TV * T10);
+			      T2F = FNMS(T2C, TW, T2z * T10);
+			      T2D = FMA(T2z, TW, T2C * T10);
+			      T6p = FMA(T1a, TW, T1e * T10);
+			      T6O = FMA(TP, TW, TT * T10);
+			      T1x = FNMS(T1u, TW, T1t * T10);
+			      T2a = FNMS(TE, TW, TA * T10);
+			      T2f = FMA(T3, TW, T6 * T10);
+			      T6V = FMA(T8, TW, Td * T10);
+			      T28 = FMA(TA, TW, TE * T10);
+			      T6r = FNMS(T1e, TW, T1a * T10);
+			      T2h = FNMS(T6, TW, T3 * T10);
+			      T6Q = FNMS(TT, TW, TP * T10);
+			      T32 = FNMS(T1j, TW, T1h * T10);
+			      T5K = FMA(T2V, TW, T2X * T10);
+			      T5w = FMA(Tw, TW, Ty * T10);
+			      T4G = FMA(T3O, TW, T3Q * T10);
+			      T4Q = FMA(T4k, TW, T4m * T10);
+			      T3m = FNMS(T3j, TW, T3g * T10);
+			      T4h = FNMS(Te, TW, T9 * T10);
+			      T4I = FNMS(T3Q, TW, T3O * T10);
+			      T5y = FNMS(Ty, TW, Tw * T10);
+			      T3k = FMA(T3g, TW, T3j * T10);
+			      T4f = FMA(T9, TW, Te * T10);
+			      T41 = FNMS(T3Y, TW, T3V * T10);
+			      T4S = FNMS(T4m, TW, T4k * T10);
+			      T4Y = FNMS(T3e, TW, T3c * T10);
+			      T3q = FMA(Tg, TW, Tl * T10);
+			      T3D = FMA(T2, TW, T5 * T10);
+			      T3F = FNMS(T5, TW, T2 * T10);
+			      T5r = FNMS(T39, TW, T37 * T10);
+			      T3s = FNMS(Tl, TW, Tg * T10);
+			      T4W = FMA(T3c, TW, T3e * T10);
+			      T3Z = FMA(T3V, TW, T3Y * T10);
+			      T5p = FMA(T37, TW, T39 * T10);
+			 }
+		    }
+	       }
+	       {
+		    E T17, TdV, Tj3, Tjx, T7l, TbJ, Ti3, Tix, T1K, Tiw, TdY, ThY, T7w, Tj0, TbM;
+		    E Tjw, T2e, TgA, T7I, TaY, TbQ, Tda, Te4, TfO, T2J, TgB, T7T, TaZ, TbT, Tdb;
+		    E Te9, TfP, T36, T3B, TgH, TgE, TgF, TgG, T80, TbW, Tel, TfT, T8b, Tc0, T8k;
+		    E TbX, Teg, TfS, T8h, TbZ, T45, T4q, TgJ, TgK, TgL, TgM, T8r, Tc6, Tew, TfW;
+		    E T8C, Tc4, T8L, Tc7, Ter, TfV, T8I, Tc3, T6B, Th1, Tfm, Tga, Th8, ThI, T9N;
+		    E Tcv, T9Y, TcH, Tav, Tcw, Tf5, Tg7, Tas, TcG, T5c, TgV, TeV, Tg0, TgS, ThD;
+		    E T8U, Tcc, T95, Tco, T9C, Tcd, TeE, Tg3, T9z, Tcn, T5R, TgT, TeO, TeW, TgY;
+		    E ThE, T9h, T9F, T9s, T9E, Tck, Tcq, TeJ, TeX, Tch, Tcr, T7e, Th9, Tff, Tfn;
+		    E Th4, ThJ, Taa, Tay, Tal, Tax, TcD, TcJ, Tfa, Tfo, TcA, TcK;
+		    {
+			 E T1, Ti1, Tu, Ti0, TM, T7i, T15, T7j, Tp, Tt;
+			 T1 = ri[0];
+			 Ti1 = ii[0];
+			 Tp = ri[WS(rs, 32)];
+			 Tt = ii[WS(rs, 32)];
+			 Tu = FMA(To, Tp, Ts * Tt);
+			 Ti0 = FNMS(Ts, Tp, To * Tt);
+			 {
+			      E TH, TL, T12, T14;
+			      TH = ri[WS(rs, 16)];
+			      TL = ii[WS(rs, 16)];
+			      TM = FMA(TG, TH, TK * TL);
+			      T7i = FNMS(TK, TH, TG * TL);
+			      T12 = ri[WS(rs, 48)];
+			      T14 = ii[WS(rs, 48)];
+			      T15 = FMA(T11, T12, T13 * T14);
+			      T7j = FNMS(T13, T12, T11 * T14);
+			 }
+			 {
+			      E Tv, T16, Tj1, Tj2;
+			      Tv = T1 + Tu;
+			      T16 = TM + T15;
+			      T17 = Tv + T16;
+			      TdV = Tv - T16;
+			      Tj1 = Ti1 - Ti0;
+			      Tj2 = TM - T15;
+			      Tj3 = Tj1 - Tj2;
+			      Tjx = Tj2 + Tj1;
+			 }
+			 {
+			      E T7h, T7k, ThZ, Ti2;
+			      T7h = T1 - Tu;
+			      T7k = T7i - T7j;
+			      T7l = T7h - T7k;
+			      TbJ = T7h + T7k;
+			      ThZ = T7i + T7j;
+			      Ti2 = Ti0 + Ti1;
+			      Ti3 = ThZ + Ti2;
+			      Tix = Ti2 - ThZ;
+			 }
+		    }
+		    {
+			 E T1g, T7m, T1r, T7n, T7o, T7p, T1z, T7s, T1I, T7t, T7r, T7u;
+			 {
+			      E T1b, T1f, T1m, T1q;
+			      T1b = ri[WS(rs, 8)];
+			      T1f = ii[WS(rs, 8)];
+			      T1g = FMA(T1a, T1b, T1e * T1f);
+			      T7m = FNMS(T1e, T1b, T1a * T1f);
+			      T1m = ri[WS(rs, 40)];
+			      T1q = ii[WS(rs, 40)];
+			      T1r = FMA(T1l, T1m, T1p * T1q);
+			      T7n = FNMS(T1p, T1m, T1l * T1q);
+			 }
+			 T7o = T7m - T7n;
+			 T7p = T1g - T1r;
+			 {
+			      E T1w, T1y, T1D, T1H;
+			      T1w = ri[WS(rs, 56)];
+			      T1y = ii[WS(rs, 56)];
+			      T1z = FMA(T1v, T1w, T1x * T1y);
+			      T7s = FNMS(T1x, T1w, T1v * T1y);
+			      T1D = ri[WS(rs, 24)];
+			      T1H = ii[WS(rs, 24)];
+			      T1I = FMA(T1C, T1D, T1G * T1H);
+			      T7t = FNMS(T1G, T1D, T1C * T1H);
+			 }
+			 T7r = T1z - T1I;
+			 T7u = T7s - T7t;
+			 {
+			      E T1s, T1J, TdW, TdX;
+			      T1s = T1g + T1r;
+			      T1J = T1z + T1I;
+			      T1K = T1s + T1J;
+			      Tiw = T1J - T1s;
+			      TdW = T7m + T7n;
+			      TdX = T7s + T7t;
+			      TdY = TdW - TdX;
+			      ThY = TdW + TdX;
+			 }
+			 {
+			      E T7q, T7v, TbK, TbL;
+			      T7q = T7o - T7p;
+			      T7v = T7r + T7u;
+			      T7w = KP707106781 * (T7q - T7v);
+			      Tj0 = KP707106781 * (T7q + T7v);
+			      TbK = T7p + T7o;
+			      TbL = T7r - T7u;
+			      TbM = KP707106781 * (TbK + TbL);
+			      Tjw = KP707106781 * (TbL - TbK);
+			 }
+		    }
+		    {
+			 E T1Y, Te0, T7A, T7D, T2d, Te1, T7B, T7G, T7C, T7H;
+			 {
+			      E T1O, T7y, T1X, T7z;
+			      {
+				   E T1M, T1N, T1S, T1W;
+				   T1M = ri[WS(rs, 4)];
+				   T1N = ii[WS(rs, 4)];
+				   T1O = FMA(T8, T1M, Td * T1N);
+				   T7y = FNMS(Td, T1M, T8 * T1N);
+				   T1S = ri[WS(rs, 36)];
+				   T1W = ii[WS(rs, 36)];
+				   T1X = FMA(T1R, T1S, T1V * T1W);
+				   T7z = FNMS(T1V, T1S, T1R * T1W);
+			      }
+			      T1Y = T1O + T1X;
+			      Te0 = T7y + T7z;
+			      T7A = T7y - T7z;
+			      T7D = T1O - T1X;
+			 }
+			 {
+			      E T27, T7E, T2c, T7F;
+			      {
+				   E T22, T26, T29, T2b;
+				   T22 = ri[WS(rs, 20)];
+				   T26 = ii[WS(rs, 20)];
+				   T27 = FMA(T21, T22, T25 * T26);
+				   T7E = FNMS(T25, T22, T21 * T26);
+				   T29 = ri[WS(rs, 52)];
+				   T2b = ii[WS(rs, 52)];
+				   T2c = FMA(T28, T29, T2a * T2b);
+				   T7F = FNMS(T2a, T29, T28 * T2b);
+			      }
+			      T2d = T27 + T2c;
+			      Te1 = T7E + T7F;
+			      T7B = T27 - T2c;
+			      T7G = T7E - T7F;
+			 }
+			 T2e = T1Y + T2d;
+			 TgA = Te0 + Te1;
+			 T7C = T7A + T7B;
+			 T7H = T7D - T7G;
+			 T7I = FNMS(KP923879532, T7H, KP382683432 * T7C);
+			 TaY = FMA(KP923879532, T7C, KP382683432 * T7H);
+			 {
+			      E TbO, TbP, Te2, Te3;
+			      TbO = T7A - T7B;
+			      TbP = T7D + T7G;
+			      TbQ = FNMS(KP382683432, TbP, KP923879532 * TbO);
+			      Tda = FMA(KP382683432, TbO, KP923879532 * TbP);
+			      Te2 = Te0 - Te1;
+			      Te3 = T1Y - T2d;
+			      Te4 = Te2 - Te3;
+			      TfO = Te3 + Te2;
+			 }
+		    }
+		    {
+			 E T2t, Te6, T7L, T7O, T2I, Te7, T7M, T7R, T7N, T7S;
+			 {
+			      E T2j, T7J, T2s, T7K;
+			      {
+				   E T2g, T2i, T2n, T2r;
+				   T2g = ri[WS(rs, 60)];
+				   T2i = ii[WS(rs, 60)];
+				   T2j = FMA(T2f, T2g, T2h * T2i);
+				   T7J = FNMS(T2h, T2g, T2f * T2i);
+				   T2n = ri[WS(rs, 28)];
+				   T2r = ii[WS(rs, 28)];
+				   T2s = FMA(T2m, T2n, T2q * T2r);
+				   T7K = FNMS(T2q, T2n, T2m * T2r);
+			      }
+			      T2t = T2j + T2s;
+			      Te6 = T7J + T7K;
+			      T7L = T7J - T7K;
+			      T7O = T2j - T2s;
+			 }
+			 {
+			      E T2w, T7P, T2H, T7Q;
+			      {
+				   E T2u, T2v, T2E, T2G;
+				   T2u = ri[WS(rs, 12)];
+				   T2v = ii[WS(rs, 12)];
+				   T2w = FMA(TP, T2u, TT * T2v);
+				   T7P = FNMS(TT, T2u, TP * T2v);
+				   T2E = ri[WS(rs, 44)];
+				   T2G = ii[WS(rs, 44)];
+				   T2H = FMA(T2D, T2E, T2F * T2G);
+				   T7Q = FNMS(T2F, T2E, T2D * T2G);
+			      }
+			      T2I = T2w + T2H;
+			      Te7 = T7P + T7Q;
+			      T7M = T2w - T2H;
+			      T7R = T7P - T7Q;
+			 }
+			 T2J = T2t + T2I;
+			 TgB = Te6 + Te7;
+			 T7N = T7L + T7M;
+			 T7S = T7O - T7R;
+			 T7T = FMA(KP382683432, T7N, KP923879532 * T7S);
+			 TaZ = FNMS(KP923879532, T7N, KP382683432 * T7S);
+			 {
+			      E TbR, TbS, Te5, Te8;
+			      TbR = T7L - T7M;
+			      TbS = T7O + T7R;
+			      TbT = FMA(KP923879532, TbR, KP382683432 * TbS);
+			      Tdb = FNMS(KP382683432, TbR, KP923879532 * TbS);
+			      Te5 = T2t - T2I;
+			      Te8 = Te6 - Te7;
+			      Te9 = Te5 + Te8;
+			      TfP = Te5 - Te8;
+			 }
+		    }
+		    {
+			 E T2O, T7W, T2T, T7X, T2U, Tec, T2Z, T8e, T34, T8f, T35, Ted, T3p, Tei, T86;
+			 E T89, T3A, Tej, T81, T84;
+			 {
+			      E T2M, T2N, T2Q, T2S;
+			      T2M = ri[WS(rs, 2)];
+			      T2N = ii[WS(rs, 2)];
+			      T2O = FMA(Tw, T2M, Ty * T2N);
+			      T7W = FNMS(Ty, T2M, Tw * T2N);
+			      T2Q = ri[WS(rs, 34)];
+			      T2S = ii[WS(rs, 34)];
+			      T2T = FMA(T2P, T2Q, T2R * T2S);
+			      T7X = FNMS(T2R, T2Q, T2P * T2S);
+			 }
+			 T2U = T2O + T2T;
+			 Tec = T7W + T7X;
+			 {
+			      E T2W, T2Y, T31, T33;
+			      T2W = ri[WS(rs, 18)];
+			      T2Y = ii[WS(rs, 18)];
+			      T2Z = FMA(T2V, T2W, T2X * T2Y);
+			      T8e = FNMS(T2X, T2W, T2V * T2Y);
+			      T31 = ri[WS(rs, 50)];
+			      T33 = ii[WS(rs, 50)];
+			      T34 = FMA(T30, T31, T32 * T33);
+			      T8f = FNMS(T32, T31, T30 * T33);
+			 }
+			 T35 = T2Z + T34;
+			 Ted = T8e + T8f;
+			 {
+			      E T3b, T87, T3o, T88;
+			      {
+				   E T38, T3a, T3l, T3n;
+				   T38 = ri[WS(rs, 10)];
+				   T3a = ii[WS(rs, 10)];
+				   T3b = FMA(T37, T38, T39 * T3a);
+				   T87 = FNMS(T39, T38, T37 * T3a);
+				   T3l = ri[WS(rs, 42)];
+				   T3n = ii[WS(rs, 42)];
+				   T3o = FMA(T3k, T3l, T3m * T3n);
+				   T88 = FNMS(T3m, T3l, T3k * T3n);
+			      }
+			      T3p = T3b + T3o;
+			      Tei = T87 + T88;
+			      T86 = T3b - T3o;
+			      T89 = T87 - T88;
+			 }
+			 {
+			      E T3u, T82, T3z, T83;
+			      {
+				   E T3r, T3t, T3w, T3y;
+				   T3r = ri[WS(rs, 58)];
+				   T3t = ii[WS(rs, 58)];
+				   T3u = FMA(T3q, T3r, T3s * T3t);
+				   T82 = FNMS(T3s, T3r, T3q * T3t);
+				   T3w = ri[WS(rs, 26)];
+				   T3y = ii[WS(rs, 26)];
+				   T3z = FMA(T3v, T3w, T3x * T3y);
+				   T83 = FNMS(T3x, T3w, T3v * T3y);
+			      }
+			      T3A = T3u + T3z;
+			      Tej = T82 + T83;
+			      T81 = T3u - T3z;
+			      T84 = T82 - T83;
+			 }
+			 T36 = T2U + T35;
+			 T3B = T3p + T3A;
+			 TgH = T36 - T3B;
+			 TgE = Tec + Ted;
+			 TgF = Tei + Tej;
+			 TgG = TgE - TgF;
+			 {
+			      E T7Y, T7Z, Teh, Tek;
+			      T7Y = T7W - T7X;
+			      T7Z = T2Z - T34;
+			      T80 = T7Y + T7Z;
+			      TbW = T7Y - T7Z;
+			      Teh = T2U - T35;
+			      Tek = Tei - Tej;
+			      Tel = Teh - Tek;
+			      TfT = Teh + Tek;
+			 }
+			 {
+			      E T85, T8a, T8i, T8j;
+			      T85 = T81 - T84;
+			      T8a = T86 + T89;
+			      T8b = KP707106781 * (T85 - T8a);
+			      Tc0 = KP707106781 * (T8a + T85);
+			      T8i = T89 - T86;
+			      T8j = T81 + T84;
+			      T8k = KP707106781 * (T8i - T8j);
+			      TbX = KP707106781 * (T8i + T8j);
+			 }
+			 {
+			      E Tee, Tef, T8d, T8g;
+			      Tee = Tec - Ted;
+			      Tef = T3A - T3p;
+			      Teg = Tee - Tef;
+			      TfS = Tee + Tef;
+			      T8d = T2O - T2T;
+			      T8g = T8e - T8f;
+			      T8h = T8d - T8g;
+			      TbZ = T8d + T8g;
+			 }
+		    }
+		    {
+			 E T3H, T8n, T3M, T8o, T3N, Ten, T3S, T8F, T43, T8G, T44, Teo, T4e, Tet, T8x;
+			 E T8A, T4p, Teu, T8s, T8v;
+			 {
+			      E T3E, T3G, T3J, T3L;
+			      T3E = ri[WS(rs, 62)];
+			      T3G = ii[WS(rs, 62)];
+			      T3H = FMA(T3D, T3E, T3F * T3G);
+			      T8n = FNMS(T3F, T3E, T3D * T3G);
+			      T3J = ri[WS(rs, 30)];
+			      T3L = ii[WS(rs, 30)];
+			      T3M = FMA(T3I, T3J, T3K * T3L);
+			      T8o = FNMS(T3K, T3J, T3I * T3L);
+			 }
+			 T3N = T3H + T3M;
+			 Ten = T8n + T8o;
+			 {
+			      E T3P, T3R, T40, T42;
+			      T3P = ri[WS(rs, 14)];
+			      T3R = ii[WS(rs, 14)];
+			      T3S = FMA(T3O, T3P, T3Q * T3R);
+			      T8F = FNMS(T3Q, T3P, T3O * T3R);
+			      T40 = ri[WS(rs, 46)];
+			      T42 = ii[WS(rs, 46)];
+			      T43 = FMA(T3Z, T40, T41 * T42);
+			      T8G = FNMS(T41, T40, T3Z * T42);
+			 }
+			 T44 = T3S + T43;
+			 Teo = T8F + T8G;
+			 {
+			      E T48, T8y, T4d, T8z;
+			      {
+				   E T46, T47, T4a, T4c;
+				   T46 = ri[WS(rs, 6)];
+				   T47 = ii[WS(rs, 6)];
+				   T48 = FMA(T3c, T46, T3e * T47);
+				   T8y = FNMS(T3e, T46, T3c * T47);
+				   T4a = ri[WS(rs, 38)];
+				   T4c = ii[WS(rs, 38)];
+				   T4d = FMA(T49, T4a, T4b * T4c);
+				   T8z = FNMS(T4b, T4a, T49 * T4c);
+			      }
+			      T4e = T48 + T4d;
+			      Tet = T8y + T8z;
+			      T8x = T48 - T4d;
+			      T8A = T8y - T8z;
+			 }
+			 {
+			      E T4j, T8t, T4o, T8u;
+			      {
+				   E T4g, T4i, T4l, T4n;
+				   T4g = ri[WS(rs, 54)];
+				   T4i = ii[WS(rs, 54)];
+				   T4j = FMA(T4f, T4g, T4h * T4i);
+				   T8t = FNMS(T4h, T4g, T4f * T4i);
+				   T4l = ri[WS(rs, 22)];
+				   T4n = ii[WS(rs, 22)];
+				   T4o = FMA(T4k, T4l, T4m * T4n);
+				   T8u = FNMS(T4m, T4l, T4k * T4n);
+			      }
+			      T4p = T4j + T4o;
+			      Teu = T8t + T8u;
+			      T8s = T4j - T4o;
+			      T8v = T8t - T8u;
+			 }
+			 T45 = T3N + T44;
+			 T4q = T4e + T4p;
+			 TgJ = T45 - T4q;
+			 TgK = Ten + Teo;
+			 TgL = Tet + Teu;
+			 TgM = TgK - TgL;
+			 {
+			      E T8p, T8q, Tes, Tev;
+			      T8p = T8n - T8o;
+			      T8q = T3S - T43;
+			      T8r = T8p + T8q;
+			      Tc6 = T8p - T8q;
+			      Tes = T3N - T44;
+			      Tev = Tet - Teu;
+			      Tew = Tes - Tev;
+			      TfW = Tes + Tev;
+			 }
+			 {
+			      E T8w, T8B, T8J, T8K;
+			      T8w = T8s - T8v;
+			      T8B = T8x + T8A;
+			      T8C = KP707106781 * (T8w - T8B);
+			      Tc4 = KP707106781 * (T8B + T8w);
+			      T8J = T8A - T8x;
+			      T8K = T8s + T8v;
+			      T8L = KP707106781 * (T8J - T8K);
+			      Tc7 = KP707106781 * (T8J + T8K);
+			 }
+			 {
+			      E Tep, Teq, T8E, T8H;
+			      Tep = Ten - Teo;
+			      Teq = T4p - T4e;
+			      Ter = Tep - Teq;
+			      TfV = Tep + Teq;
+			      T8E = T3H - T3M;
+			      T8H = T8F - T8G;
+			      T8I = T8E - T8H;
+			      Tc3 = T8E + T8H;
+			 }
+		    }
+		    {
+			 E T5V, Tao, T64, Tap, T65, Tfi, T68, T9K, T6d, T9L, T6e, Tfj, T6o, Tf2, T9Q;
+			 E T9R, T6z, Tf3, T9T, T9W;
+			 {
+			      E T5T, T5U, T5Z, T63;
+			      T5T = ri[WS(rs, 63)];
+			      T5U = ii[WS(rs, 63)];
+			      T5V = FMA(TW, T5T, T10 * T5U);
+			      Tao = FNMS(T10, T5T, TW * T5U);
+			      T5Z = ri[WS(rs, 31)];
+			      T63 = ii[WS(rs, 31)];
+			      T64 = FMA(T5Y, T5Z, T62 * T63);
+			      Tap = FNMS(T62, T5Z, T5Y * T63);
+			 }
+			 T65 = T5V + T64;
+			 Tfi = Tao + Tap;
+			 {
+			      E T66, T67, T6a, T6c;
+			      T66 = ri[WS(rs, 15)];
+			      T67 = ii[WS(rs, 15)];
+			      T68 = FMA(TV, T66, TZ * T67);
+			      T9K = FNMS(TZ, T66, TV * T67);
+			      T6a = ri[WS(rs, 47)];
+			      T6c = ii[WS(rs, 47)];
+			      T6d = FMA(T69, T6a, T6b * T6c);
+			      T9L = FNMS(T6b, T6a, T69 * T6c);
+			 }
+			 T6e = T68 + T6d;
+			 Tfj = T9K + T9L;
+			 {
+			      E T6i, T9O, T6n, T9P;
+			      {
+				   E T6g, T6h, T6k, T6m;
+				   T6g = ri[WS(rs, 7)];
+				   T6h = ii[WS(rs, 7)];
+				   T6i = FMA(T1t, T6g, T1u * T6h);
+				   T9O = FNMS(T1u, T6g, T1t * T6h);
+				   T6k = ri[WS(rs, 39)];
+				   T6m = ii[WS(rs, 39)];
+				   T6n = FMA(T6j, T6k, T6l * T6m);
+				   T9P = FNMS(T6l, T6k, T6j * T6m);
+			      }
+			      T6o = T6i + T6n;
+			      Tf2 = T9O + T9P;
+			      T9Q = T9O - T9P;
+			      T9R = T6i - T6n;
+			 }
+			 {
+			      E T6t, T9U, T6y, T9V;
+			      {
+				   E T6q, T6s, T6v, T6x;
+				   T6q = ri[WS(rs, 55)];
+				   T6s = ii[WS(rs, 55)];
+				   T6t = FMA(T6p, T6q, T6r * T6s);
+				   T9U = FNMS(T6r, T6q, T6p * T6s);
+				   T6v = ri[WS(rs, 23)];
+				   T6x = ii[WS(rs, 23)];
+				   T6y = FMA(T6u, T6v, T6w * T6x);
+				   T9V = FNMS(T6w, T6v, T6u * T6x);
+			      }
+			      T6z = T6t + T6y;
+			      Tf3 = T9U + T9V;
+			      T9T = T6t - T6y;
+			      T9W = T9U - T9V;
+			 }
+			 {
+			      E T6f, T6A, Tfk, Tfl;
+			      T6f = T65 + T6e;
+			      T6A = T6o + T6z;
+			      T6B = T6f + T6A;
+			      Th1 = T6f - T6A;
+			      Tfk = Tfi - Tfj;
+			      Tfl = T6z - T6o;
+			      Tfm = Tfk - Tfl;
+			      Tga = Tfk + Tfl;
+			 }
+			 {
+			      E Th6, Th7, T9J, T9M;
+			      Th6 = Tfi + Tfj;
+			      Th7 = Tf2 + Tf3;
+			      Th8 = Th6 - Th7;
+			      ThI = Th6 + Th7;
+			      T9J = T5V - T64;
+			      T9M = T9K - T9L;
+			      T9N = T9J - T9M;
+			      Tcv = T9J + T9M;
+			 }
+			 {
+			      E T9S, T9X, Tat, Tau;
+			      T9S = T9Q - T9R;
+			      T9X = T9T + T9W;
+			      T9Y = KP707106781 * (T9S - T9X);
+			      TcH = KP707106781 * (T9S + T9X);
+			      Tat = T9T - T9W;
+			      Tau = T9R + T9Q;
+			      Tav = KP707106781 * (Tat - Tau);
+			      Tcw = KP707106781 * (Tau + Tat);
+			 }
+			 {
+			      E Tf1, Tf4, Taq, Tar;
+			      Tf1 = T65 - T6e;
+			      Tf4 = Tf2 - Tf3;
+			      Tf5 = Tf1 - Tf4;
+			      Tg7 = Tf1 + Tf4;
+			      Taq = Tao - Tap;
+			      Tar = T68 - T6d;
+			      Tas = Taq + Tar;
+			      TcG = Taq - Tar;
+			 }
+		    }
+		    {
+			 E T4w, T8Q, T4B, T8R, T4C, TeA, T4F, T9w, T4K, T9x, T4L, TeB, T4V, TeS, T90;
+			 E T93, T5a, TeT, T8V, T8Y;
+			 {
+			      E T4u, T4v, T4y, T4A;
+			      T4u = ri[WS(rs, 1)];
+			      T4v = ii[WS(rs, 1)];
+			      T4w = FMA(T2, T4u, T5 * T4v);
+			      T8Q = FNMS(T5, T4u, T2 * T4v);
+			      T4y = ri[WS(rs, 33)];
+			      T4A = ii[WS(rs, 33)];
+			      T4B = FMA(T4x, T4y, T4z * T4A);
+			      T8R = FNMS(T4z, T4y, T4x * T4A);
+			 }
+			 T4C = T4w + T4B;
+			 TeA = T8Q + T8R;
+			 {
+			      E T4D, T4E, T4H, T4J;
+			      T4D = ri[WS(rs, 17)];
+			      T4E = ii[WS(rs, 17)];
+			      T4F = FMA(T3V, T4D, T3Y * T4E);
+			      T9w = FNMS(T3Y, T4D, T3V * T4E);
+			      T4H = ri[WS(rs, 49)];
+			      T4J = ii[WS(rs, 49)];
+			      T4K = FMA(T4G, T4H, T4I * T4J);
+			      T9x = FNMS(T4I, T4H, T4G * T4J);
+			 }
+			 T4L = T4F + T4K;
+			 TeB = T9w + T9x;
+			 {
+			      E T4P, T91, T4U, T92;
+			      {
+				   E T4N, T4O, T4R, T4T;
+				   T4N = ri[WS(rs, 9)];
+				   T4O = ii[WS(rs, 9)];
+				   T4P = FMA(T9, T4N, Te * T4O);
+				   T91 = FNMS(Te, T4N, T9 * T4O);
+				   T4R = ri[WS(rs, 41)];
+				   T4T = ii[WS(rs, 41)];
+				   T4U = FMA(T4Q, T4R, T4S * T4T);
+				   T92 = FNMS(T4S, T4R, T4Q * T4T);
+			      }
+			      T4V = T4P + T4U;
+			      TeS = T91 + T92;
+			      T90 = T4P - T4U;
+			      T93 = T91 - T92;
+			 }
+			 {
+			      E T50, T8W, T59, T8X;
+			      {
+				   E T4X, T4Z, T54, T58;
+				   T4X = ri[WS(rs, 57)];
+				   T4Z = ii[WS(rs, 57)];
+				   T50 = FMA(T4W, T4X, T4Y * T4Z);
+				   T8W = FNMS(T4Y, T4X, T4W * T4Z);
+				   T54 = ri[WS(rs, 25)];
+				   T58 = ii[WS(rs, 25)];
+				   T59 = FMA(T53, T54, T57 * T58);
+				   T8X = FNMS(T57, T54, T53 * T58);
+			      }
+			      T5a = T50 + T59;
+			      TeT = T8W + T8X;
+			      T8V = T50 - T59;
+			      T8Y = T8W - T8X;
+			 }
+			 {
+			      E T4M, T5b, TeR, TeU;
+			      T4M = T4C + T4L;
+			      T5b = T4V + T5a;
+			      T5c = T4M + T5b;
+			      TgV = T4M - T5b;
+			      TeR = T4C - T4L;
+			      TeU = TeS - TeT;
+			      TeV = TeR - TeU;
+			      Tg0 = TeR + TeU;
+			 }
+			 {
+			      E TgQ, TgR, T8S, T8T;
+			      TgQ = TeA + TeB;
+			      TgR = TeS + TeT;
+			      TgS = TgQ - TgR;
+			      ThD = TgQ + TgR;
+			      T8S = T8Q - T8R;
+			      T8T = T4F - T4K;
+			      T8U = T8S + T8T;
+			      Tcc = T8S - T8T;
+			 }
+			 {
+			      E T8Z, T94, T9A, T9B;
+			      T8Z = T8V - T8Y;
+			      T94 = T90 + T93;
+			      T95 = KP707106781 * (T8Z - T94);
+			      Tco = KP707106781 * (T94 + T8Z);
+			      T9A = T93 - T90;
+			      T9B = T8V + T8Y;
+			      T9C = KP707106781 * (T9A - T9B);
+			      Tcd = KP707106781 * (T9A + T9B);
+			 }
+			 {
+			      E TeC, TeD, T9v, T9y;
+			      TeC = TeA - TeB;
+			      TeD = T5a - T4V;
+			      TeE = TeC - TeD;
+			      Tg3 = TeC + TeD;
+			      T9v = T4w - T4B;
+			      T9y = T9w - T9x;
+			      T9z = T9v - T9y;
+			      Tcn = T9v + T9y;
+			 }
+		    }
+		    {
+			 E T5l, TeL, T9k, T9n, T5P, TeH, T9a, T9f, T5u, TeM, T9l, T9q, T5G, TeG, T97;
+			 E T9e;
+			 {
+			      E T5f, T9i, T5k, T9j;
+			      {
+				   E T5d, T5e, T5h, T5j;
+				   T5d = ri[WS(rs, 5)];
+				   T5e = ii[WS(rs, 5)];
+				   T5f = FMA(Tg, T5d, Tl * T5e);
+				   T9i = FNMS(Tl, T5d, Tg * T5e);
+				   T5h = ri[WS(rs, 37)];
+				   T5j = ii[WS(rs, 37)];
+				   T5k = FMA(T5g, T5h, T5i * T5j);
+				   T9j = FNMS(T5i, T5h, T5g * T5j);
+			      }
+			      T5l = T5f + T5k;
+			      TeL = T9i + T9j;
+			      T9k = T9i - T9j;
+			      T9n = T5f - T5k;
+			 }
+			 {
+			      E T5J, T98, T5O, T99;
+			      {
+				   E T5H, T5I, T5L, T5N;
+				   T5H = ri[WS(rs, 13)];
+				   T5I = ii[WS(rs, 13)];
+				   T5J = FMA(T1h, T5H, T1j * T5I);
+				   T98 = FNMS(T1j, T5H, T1h * T5I);
+				   T5L = ri[WS(rs, 45)];
+				   T5N = ii[WS(rs, 45)];
+				   T5O = FMA(T5K, T5L, T5M * T5N);
+				   T99 = FNMS(T5M, T5L, T5K * T5N);
+			      }
+			      T5P = T5J + T5O;
+			      TeH = T98 + T99;
+			      T9a = T98 - T99;
+			      T9f = T5J - T5O;
+			 }
+			 {
+			      E T5o, T9o, T5t, T9p;
+			      {
+				   E T5m, T5n, T5q, T5s;
+				   T5m = ri[WS(rs, 21)];
+				   T5n = ii[WS(rs, 21)];
+				   T5o = FMA(T3g, T5m, T3j * T5n);
+				   T9o = FNMS(T3j, T5m, T3g * T5n);
+				   T5q = ri[WS(rs, 53)];
+				   T5s = ii[WS(rs, 53)];
+				   T5t = FMA(T5p, T5q, T5r * T5s);
+				   T9p = FNMS(T5r, T5q, T5p * T5s);
+			      }
+			      T5u = T5o + T5t;
+			      TeM = T9o + T9p;
+			      T9l = T5o - T5t;
+			      T9q = T9o - T9p;
+			 }
+			 {
+			      E T5A, T9c, T5F, T9d;
+			      {
+				   E T5x, T5z, T5C, T5E;
+				   T5x = ri[WS(rs, 61)];
+				   T5z = ii[WS(rs, 61)];
+				   T5A = FMA(T5w, T5x, T5y * T5z);
+				   T9c = FNMS(T5y, T5x, T5w * T5z);
+				   T5C = ri[WS(rs, 29)];
+				   T5E = ii[WS(rs, 29)];
+				   T5F = FMA(T5B, T5C, T5D * T5E);
+				   T9d = FNMS(T5D, T5C, T5B * T5E);
+			      }
+			      T5G = T5A + T5F;
+			      TeG = T9c + T9d;
+			      T97 = T5A - T5F;
+			      T9e = T9c - T9d;
+			 }
+			 {
+			      E T5v, T5Q, TeK, TeN;
+			      T5v = T5l + T5u;
+			      T5Q = T5G + T5P;
+			      T5R = T5v + T5Q;
+			      TgT = T5Q - T5v;
+			      TeK = T5l - T5u;
+			      TeN = TeL - TeM;
+			      TeO = TeK + TeN;
+			      TeW = TeN - TeK;
+			 }
+			 {
+			      E TgW, TgX, T9b, T9g;
+			      TgW = TeL + TeM;
+			      TgX = TeG + TeH;
+			      TgY = TgW - TgX;
+			      ThE = TgW + TgX;
+			      T9b = T97 - T9a;
+			      T9g = T9e + T9f;
+			      T9h = FNMS(KP923879532, T9g, KP382683432 * T9b);
+			      T9F = FMA(KP382683432, T9g, KP923879532 * T9b);
+			 }
+			 {
+			      E T9m, T9r, Tci, Tcj;
+			      T9m = T9k + T9l;
+			      T9r = T9n - T9q;
+			      T9s = FMA(KP923879532, T9m, KP382683432 * T9r);
+			      T9E = FNMS(KP923879532, T9r, KP382683432 * T9m);
+			      Tci = T9k - T9l;
+			      Tcj = T9n + T9q;
+			      Tck = FMA(KP382683432, Tci, KP923879532 * Tcj);
+			      Tcq = FNMS(KP382683432, Tcj, KP923879532 * Tci);
+			 }
+			 {
+			      E TeF, TeI, Tcf, Tcg;
+			      TeF = T5G - T5P;
+			      TeI = TeG - TeH;
+			      TeJ = TeF - TeI;
+			      TeX = TeF + TeI;
+			      Tcf = T97 + T9a;
+			      Tcg = T9e - T9f;
+			      Tch = FNMS(KP382683432, Tcg, KP923879532 * Tcf);
+			      Tcr = FMA(KP923879532, Tcg, KP382683432 * Tcf);
+			 }
+		    }
+		    {
+			 E T6K, Tf6, Ta2, Ta5, T7c, Tfd, Tae, Taj, T6T, Tf7, Ta3, Ta8, T73, Tfc, Tad;
+			 E Tag;
+			 {
+			      E T6E, Ta0, T6J, Ta1;
+			      {
+				   E T6C, T6D, T6G, T6I;
+				   T6C = ri[WS(rs, 3)];
+				   T6D = ii[WS(rs, 3)];
+				   T6E = FMA(T3, T6C, T6 * T6D);
+				   Ta0 = FNMS(T6, T6C, T3 * T6D);
+				   T6G = ri[WS(rs, 35)];
+				   T6I = ii[WS(rs, 35)];
+				   T6J = FMA(T6F, T6G, T6H * T6I);
+				   Ta1 = FNMS(T6H, T6G, T6F * T6I);
+			      }
+			      T6K = T6E + T6J;
+			      Tf6 = Ta0 + Ta1;
+			      Ta2 = Ta0 - Ta1;
+			      Ta5 = T6E - T6J;
+			 }
+			 {
+			      E T76, Tah, T7b, Tai;
+			      {
+				   E T74, T75, T78, T7a;
+				   T74 = ri[WS(rs, 11)];
+				   T75 = ii[WS(rs, 11)];
+				   T76 = FMA(TA, T74, TE * T75);
+				   Tah = FNMS(TE, T74, TA * T75);
+				   T78 = ri[WS(rs, 43)];
+				   T7a = ii[WS(rs, 43)];
+				   T7b = FMA(T77, T78, T79 * T7a);
+				   Tai = FNMS(T79, T78, T77 * T7a);
+			      }
+			      T7c = T76 + T7b;
+			      Tfd = Tah + Tai;
+			      Tae = T76 - T7b;
+			      Taj = Tah - Tai;
+			 }
+			 {
+			      E T6N, Ta6, T6S, Ta7;
+			      {
+				   E T6L, T6M, T6P, T6R;
+				   T6L = ri[WS(rs, 19)];
+				   T6M = ii[WS(rs, 19)];
+				   T6N = FMA(T2z, T6L, T2C * T6M);
+				   Ta6 = FNMS(T2C, T6L, T2z * T6M);
+				   T6P = ri[WS(rs, 51)];
+				   T6R = ii[WS(rs, 51)];
+				   T6S = FMA(T6O, T6P, T6Q * T6R);
+				   Ta7 = FNMS(T6Q, T6P, T6O * T6R);
+			      }
+			      T6T = T6N + T6S;
+			      Tf7 = Ta6 + Ta7;
+			      Ta3 = T6N - T6S;
+			      Ta8 = Ta6 - Ta7;
+			 }
+			 {
+			      E T6Z, Tab, T72, Tac;
+			      {
+				   E T6W, T6Y, T70, T71;
+				   T6W = ri[WS(rs, 59)];
+				   T6Y = ii[WS(rs, 59)];
+				   T6Z = FMA(T6V, T6W, T6X * T6Y);
+				   Tab = FNMS(T6X, T6W, T6V * T6Y);
+				   T70 = ri[WS(rs, 27)];
+				   T71 = ii[WS(rs, 27)];
+				   T72 = FMA(Th, T70, Tm * T71);
+				   Tac = FNMS(Tm, T70, Th * T71);
+			      }
+			      T73 = T6Z + T72;
+			      Tfc = Tab + Tac;
+			      Tad = Tab - Tac;
+			      Tag = T6Z - T72;
+			 }
+			 {
+			      E T6U, T7d, Tfb, Tfe;
+			      T6U = T6K + T6T;
+			      T7d = T73 + T7c;
+			      T7e = T6U + T7d;
+			      Th9 = T7d - T6U;
+			      Tfb = T73 - T7c;
+			      Tfe = Tfc - Tfd;
+			      Tff = Tfb + Tfe;
+			      Tfn = Tfb - Tfe;
+			 }
+			 {
+			      E Th2, Th3, Ta4, Ta9;
+			      Th2 = Tf6 + Tf7;
+			      Th3 = Tfc + Tfd;
+			      Th4 = Th2 - Th3;
+			      ThJ = Th2 + Th3;
+			      Ta4 = Ta2 + Ta3;
+			      Ta9 = Ta5 - Ta8;
+			      Taa = FNMS(KP923879532, Ta9, KP382683432 * Ta4);
+			      Tay = FMA(KP923879532, Ta4, KP382683432 * Ta9);
+			 }
+			 {
+			      E Taf, Tak, TcB, TcC;
+			      Taf = Tad + Tae;
+			      Tak = Tag - Taj;
+			      Tal = FMA(KP382683432, Taf, KP923879532 * Tak);
+			      Tax = FNMS(KP923879532, Taf, KP382683432 * Tak);
+			      TcB = Tad - Tae;
+			      TcC = Tag + Taj;
+			      TcD = FMA(KP923879532, TcB, KP382683432 * TcC);
+			      TcJ = FNMS(KP382683432, TcB, KP923879532 * TcC);
+			 }
+			 {
+			      E Tf8, Tf9, Tcy, Tcz;
+			      Tf8 = Tf6 - Tf7;
+			      Tf9 = T6K - T6T;
+			      Tfa = Tf8 - Tf9;
+			      Tfo = Tf9 + Tf8;
+			      Tcy = Ta2 - Ta3;
+			      Tcz = Ta5 + Ta8;
+			      TcA = FNMS(KP382683432, Tcz, KP923879532 * Tcy);
+			      TcK = FMA(KP382683432, Tcy, KP923879532 * Tcz);
+			 }
+		    }
+		    {
+			 E T2L, Thx, ThU, ThV, Ti5, Tib, T4s, Tia, T7g, Ti7, ThG, ThO, ThL, ThP, ThA;
+			 E ThW;
+			 {
+			      E T1L, T2K, ThS, ThT;
+			      T1L = T17 + T1K;
+			      T2K = T2e + T2J;
+			      T2L = T1L + T2K;
+			      Thx = T1L - T2K;
+			      ThS = ThD + ThE;
+			      ThT = ThI + ThJ;
+			      ThU = ThS - ThT;
+			      ThV = ThS + ThT;
+			 }
+			 {
+			      E ThX, Ti4, T3C, T4r;
+			      ThX = TgA + TgB;
+			      Ti4 = ThY + Ti3;
+			      Ti5 = ThX + Ti4;
+			      Tib = Ti4 - ThX;
+			      T3C = T36 + T3B;
+			      T4r = T45 + T4q;
+			      T4s = T3C + T4r;
+			      Tia = T4r - T3C;
+			 }
+			 {
+			      E T5S, T7f, ThC, ThF;
+			      T5S = T5c + T5R;
+			      T7f = T6B + T7e;
+			      T7g = T5S + T7f;
+			      Ti7 = T7f - T5S;
+			      ThC = T5c - T5R;
+			      ThF = ThD - ThE;
+			      ThG = ThC + ThF;
+			      ThO = ThF - ThC;
+			 }
+			 {
+			      E ThH, ThK, Thy, Thz;
+			      ThH = T6B - T7e;
+			      ThK = ThI - ThJ;
+			      ThL = ThH - ThK;
+			      ThP = ThH + ThK;
+			      Thy = TgE + TgF;
+			      Thz = TgK + TgL;
+			      ThA = Thy - Thz;
+			      ThW = Thy + Thz;
+			 }
+			 {
+			      E T4t, Ti6, ThR, Ti8;
+			      T4t = T2L + T4s;
+			      ri[WS(rs, 32)] = T4t - T7g;
+			      ri[0] = T4t + T7g;
+			      Ti6 = ThW + Ti5;
+			      ii[0] = ThV + Ti6;
+			      ii[WS(rs, 32)] = Ti6 - ThV;
+			      ThR = T2L - T4s;
+			      ri[WS(rs, 48)] = ThR - ThU;
+			      ri[WS(rs, 16)] = ThR + ThU;
+			      Ti8 = Ti5 - ThW;
+			      ii[WS(rs, 16)] = Ti7 + Ti8;
+			      ii[WS(rs, 48)] = Ti8 - Ti7;
+			 }
+			 {
+			      E ThB, ThM, Ti9, Tic;
+			      ThB = Thx + ThA;
+			      ThM = KP707106781 * (ThG + ThL);
+			      ri[WS(rs, 40)] = ThB - ThM;
+			      ri[WS(rs, 8)] = ThB + ThM;
+			      Ti9 = KP707106781 * (ThO + ThP);
+			      Tic = Tia + Tib;
+			      ii[WS(rs, 8)] = Ti9 + Tic;
+			      ii[WS(rs, 40)] = Tic - Ti9;
+			 }
+			 {
+			      E ThN, ThQ, Tid, Tie;
+			      ThN = Thx - ThA;
+			      ThQ = KP707106781 * (ThO - ThP);
+			      ri[WS(rs, 56)] = ThN - ThQ;
+			      ri[WS(rs, 24)] = ThN + ThQ;
+			      Tid = KP707106781 * (ThL - ThG);
+			      Tie = Tib - Tia;
+			      ii[WS(rs, 24)] = Tid + Tie;
+			      ii[WS(rs, 56)] = Tie - Tid;
+			 }
+		    }
+		    {
+			 E TgD, Thh, Thr, Thv, Tij, Tip, TgO, Tig, Th0, The, Thk, Tio, Tho, Thu, Thb;
+			 E Thf;
+			 {
+			      E Tgz, TgC, Thp, Thq;
+			      Tgz = T17 - T1K;
+			      TgC = TgA - TgB;
+			      TgD = Tgz - TgC;
+			      Thh = Tgz + TgC;
+			      Thp = Th1 + Th4;
+			      Thq = Th8 + Th9;
+			      Thr = FNMS(KP382683432, Thq, KP923879532 * Thp);
+			      Thv = FMA(KP923879532, Thq, KP382683432 * Thp);
+			 }
+			 {
+			      E Tih, Tii, TgI, TgN;
+			      Tih = T2J - T2e;
+			      Tii = Ti3 - ThY;
+			      Tij = Tih + Tii;
+			      Tip = Tii - Tih;
+			      TgI = TgG - TgH;
+			      TgN = TgJ + TgM;
+			      TgO = KP707106781 * (TgI - TgN);
+			      Tig = KP707106781 * (TgI + TgN);
+			 }
+			 {
+			      E TgU, TgZ, Thi, Thj;
+			      TgU = TgS - TgT;
+			      TgZ = TgV - TgY;
+			      Th0 = FMA(KP923879532, TgU, KP382683432 * TgZ);
+			      The = FNMS(KP923879532, TgZ, KP382683432 * TgU);
+			      Thi = TgH + TgG;
+			      Thj = TgJ - TgM;
+			      Thk = KP707106781 * (Thi + Thj);
+			      Tio = KP707106781 * (Thj - Thi);
+			 }
+			 {
+			      E Thm, Thn, Th5, Tha;
+			      Thm = TgS + TgT;
+			      Thn = TgV + TgY;
+			      Tho = FMA(KP382683432, Thm, KP923879532 * Thn);
+			      Thu = FNMS(KP382683432, Thn, KP923879532 * Thm);
+			      Th5 = Th1 - Th4;
+			      Tha = Th8 - Th9;
+			      Thb = FNMS(KP923879532, Tha, KP382683432 * Th5);
+			      Thf = FMA(KP382683432, Tha, KP923879532 * Th5);
+			 }
+			 {
+			      E TgP, Thc, Tin, Tiq;
+			      TgP = TgD + TgO;
+			      Thc = Th0 + Thb;
+			      ri[WS(rs, 44)] = TgP - Thc;
+			      ri[WS(rs, 12)] = TgP + Thc;
+			      Tin = The + Thf;
+			      Tiq = Tio + Tip;
+			      ii[WS(rs, 12)] = Tin + Tiq;
+			      ii[WS(rs, 44)] = Tiq - Tin;
+			 }
+			 {
+			      E Thd, Thg, Tir, Tis;
+			      Thd = TgD - TgO;
+			      Thg = The - Thf;
+			      ri[WS(rs, 60)] = Thd - Thg;
+			      ri[WS(rs, 28)] = Thd + Thg;
+			      Tir = Thb - Th0;
+			      Tis = Tip - Tio;
+			      ii[WS(rs, 28)] = Tir + Tis;
+			      ii[WS(rs, 60)] = Tis - Tir;
+			 }
+			 {
+			      E Thl, Ths, Tif, Tik;
+			      Thl = Thh + Thk;
+			      Ths = Tho + Thr;
+			      ri[WS(rs, 36)] = Thl - Ths;
+			      ri[WS(rs, 4)] = Thl + Ths;
+			      Tif = Thu + Thv;
+			      Tik = Tig + Tij;
+			      ii[WS(rs, 4)] = Tif + Tik;
+			      ii[WS(rs, 36)] = Tik - Tif;
+			 }
+			 {
+			      E Tht, Thw, Til, Tim;
+			      Tht = Thh - Thk;
+			      Thw = Thu - Thv;
+			      ri[WS(rs, 52)] = Tht - Thw;
+			      ri[WS(rs, 20)] = Tht + Thw;
+			      Til = Thr - Tho;
+			      Tim = Tij - Tig;
+			      ii[WS(rs, 20)] = Til + Tim;
+			      ii[WS(rs, 52)] = Tim - Til;
+			 }
+		    }
+		    {
+			 E Teb, Tfx, Tey, TiK, TiN, TiT, TfA, TiS, Tfr, TfL, Tfv, TfH, Tf0, TfK, Tfu;
+			 E TfE;
+			 {
+			      E TdZ, Tea, Tfy, Tfz;
+			      TdZ = TdV - TdY;
+			      Tea = KP707106781 * (Te4 - Te9);
+			      Teb = TdZ - Tea;
+			      Tfx = TdZ + Tea;
+			      {
+				   E Tem, Tex, TiL, TiM;
+				   Tem = FNMS(KP923879532, Tel, KP382683432 * Teg);
+				   Tex = FMA(KP382683432, Ter, KP923879532 * Tew);
+				   Tey = Tem - Tex;
+				   TiK = Tem + Tex;
+				   TiL = KP707106781 * (TfP - TfO);
+				   TiM = Tix - Tiw;
+				   TiN = TiL + TiM;
+				   TiT = TiM - TiL;
+			      }
+			      Tfy = FMA(KP923879532, Teg, KP382683432 * Tel);
+			      Tfz = FNMS(KP923879532, Ter, KP382683432 * Tew);
+			      TfA = Tfy + Tfz;
+			      TiS = Tfz - Tfy;
+			      {
+				   E Tfh, TfF, Tfq, TfG, Tfg, Tfp;
+				   Tfg = KP707106781 * (Tfa - Tff);
+				   Tfh = Tf5 - Tfg;
+				   TfF = Tf5 + Tfg;
+				   Tfp = KP707106781 * (Tfn - Tfo);
+				   Tfq = Tfm - Tfp;
+				   TfG = Tfm + Tfp;
+				   Tfr = FNMS(KP980785280, Tfq, KP195090322 * Tfh);
+				   TfL = FMA(KP831469612, TfG, KP555570233 * TfF);
+				   Tfv = FMA(KP195090322, Tfq, KP980785280 * Tfh);
+				   TfH = FNMS(KP555570233, TfG, KP831469612 * TfF);
+			      }
+			      {
+				   E TeQ, TfC, TeZ, TfD, TeP, TeY;
+				   TeP = KP707106781 * (TeJ - TeO);
+				   TeQ = TeE - TeP;
+				   TfC = TeE + TeP;
+				   TeY = KP707106781 * (TeW - TeX);
+				   TeZ = TeV - TeY;
+				   TfD = TeV + TeY;
+				   Tf0 = FMA(KP980785280, TeQ, KP195090322 * TeZ);
+				   TfK = FNMS(KP555570233, TfD, KP831469612 * TfC);
+				   Tfu = FNMS(KP980785280, TeZ, KP195090322 * TeQ);
+				   TfE = FMA(KP555570233, TfC, KP831469612 * TfD);
+			      }
+			 }
+			 {
+			      E Tez, Tfs, TiR, TiU;
+			      Tez = Teb + Tey;
+			      Tfs = Tf0 + Tfr;
+			      ri[WS(rs, 46)] = Tez - Tfs;
+			      ri[WS(rs, 14)] = Tez + Tfs;
+			      TiR = Tfu + Tfv;
+			      TiU = TiS + TiT;
+			      ii[WS(rs, 14)] = TiR + TiU;
+			      ii[WS(rs, 46)] = TiU - TiR;
+			 }
+			 {
+			      E Tft, Tfw, TiV, TiW;
+			      Tft = Teb - Tey;
+			      Tfw = Tfu - Tfv;
+			      ri[WS(rs, 62)] = Tft - Tfw;
+			      ri[WS(rs, 30)] = Tft + Tfw;
+			      TiV = Tfr - Tf0;
+			      TiW = TiT - TiS;
+			      ii[WS(rs, 30)] = TiV + TiW;
+			      ii[WS(rs, 62)] = TiW - TiV;
+			 }
+			 {
+			      E TfB, TfI, TiJ, TiO;
+			      TfB = Tfx + TfA;
+			      TfI = TfE + TfH;
+			      ri[WS(rs, 38)] = TfB - TfI;
+			      ri[WS(rs, 6)] = TfB + TfI;
+			      TiJ = TfK + TfL;
+			      TiO = TiK + TiN;
+			      ii[WS(rs, 6)] = TiJ + TiO;
+			      ii[WS(rs, 38)] = TiO - TiJ;
+			 }
+			 {
+			      E TfJ, TfM, TiP, TiQ;
+			      TfJ = Tfx - TfA;
+			      TfM = TfK - TfL;
+			      ri[WS(rs, 54)] = TfJ - TfM;
+			      ri[WS(rs, 22)] = TfJ + TfM;
+			      TiP = TfH - TfE;
+			      TiQ = TiN - TiK;
+			      ii[WS(rs, 22)] = TiP + TiQ;
+			      ii[WS(rs, 54)] = TiQ - TiP;
+			 }
+		    }
+		    {
+			 E TfR, Tgj, TfY, Tiu, Tiz, TiF, Tgm, TiE, Tgd, Tgx, Tgh, Tgt, Tg6, Tgw, Tgg;
+			 E Tgq;
+			 {
+			      E TfN, TfQ, Tgk, Tgl;
+			      TfN = TdV + TdY;
+			      TfQ = KP707106781 * (TfO + TfP);
+			      TfR = TfN - TfQ;
+			      Tgj = TfN + TfQ;
+			      {
+				   E TfU, TfX, Tiv, Tiy;
+				   TfU = FNMS(KP382683432, TfT, KP923879532 * TfS);
+				   TfX = FMA(KP923879532, TfV, KP382683432 * TfW);
+				   TfY = TfU - TfX;
+				   Tiu = TfU + TfX;
+				   Tiv = KP707106781 * (Te4 + Te9);
+				   Tiy = Tiw + Tix;
+				   Tiz = Tiv + Tiy;
+				   TiF = Tiy - Tiv;
+			      }
+			      Tgk = FMA(KP382683432, TfS, KP923879532 * TfT);
+			      Tgl = FNMS(KP382683432, TfV, KP923879532 * TfW);
+			      Tgm = Tgk + Tgl;
+			      TiE = Tgl - Tgk;
+			      {
+				   E Tg9, Tgr, Tgc, Tgs, Tg8, Tgb;
+				   Tg8 = KP707106781 * (Tfo + Tfn);
+				   Tg9 = Tg7 - Tg8;
+				   Tgr = Tg7 + Tg8;
+				   Tgb = KP707106781 * (Tfa + Tff);
+				   Tgc = Tga - Tgb;
+				   Tgs = Tga + Tgb;
+				   Tgd = FNMS(KP831469612, Tgc, KP555570233 * Tg9);
+				   Tgx = FMA(KP195090322, Tgr, KP980785280 * Tgs);
+				   Tgh = FMA(KP831469612, Tg9, KP555570233 * Tgc);
+				   Tgt = FNMS(KP195090322, Tgs, KP980785280 * Tgr);
+			      }
+			      {
+				   E Tg2, Tgo, Tg5, Tgp, Tg1, Tg4;
+				   Tg1 = KP707106781 * (TeO + TeJ);
+				   Tg2 = Tg0 - Tg1;
+				   Tgo = Tg0 + Tg1;
+				   Tg4 = KP707106781 * (TeW + TeX);
+				   Tg5 = Tg3 - Tg4;
+				   Tgp = Tg3 + Tg4;
+				   Tg6 = FMA(KP555570233, Tg2, KP831469612 * Tg5);
+				   Tgw = FNMS(KP195090322, Tgo, KP980785280 * Tgp);
+				   Tgg = FNMS(KP831469612, Tg2, KP555570233 * Tg5);
+				   Tgq = FMA(KP980785280, Tgo, KP195090322 * Tgp);
+			      }
+			 }
+			 {
+			      E TfZ, Tge, TiD, TiG;
+			      TfZ = TfR + TfY;
+			      Tge = Tg6 + Tgd;
+			      ri[WS(rs, 42)] = TfZ - Tge;
+			      ri[WS(rs, 10)] = TfZ + Tge;
+			      TiD = Tgg + Tgh;
+			      TiG = TiE + TiF;
+			      ii[WS(rs, 10)] = TiD + TiG;
+			      ii[WS(rs, 42)] = TiG - TiD;
+			 }
+			 {
+			      E Tgf, Tgi, TiH, TiI;
+			      Tgf = TfR - TfY;
+			      Tgi = Tgg - Tgh;
+			      ri[WS(rs, 58)] = Tgf - Tgi;
+			      ri[WS(rs, 26)] = Tgf + Tgi;
+			      TiH = Tgd - Tg6;
+			      TiI = TiF - TiE;
+			      ii[WS(rs, 26)] = TiH + TiI;
+			      ii[WS(rs, 58)] = TiI - TiH;
+			 }
+			 {
+			      E Tgn, Tgu, Tit, TiA;
+			      Tgn = Tgj + Tgm;
+			      Tgu = Tgq + Tgt;
+			      ri[WS(rs, 34)] = Tgn - Tgu;
+			      ri[WS(rs, 2)] = Tgn + Tgu;
+			      Tit = Tgw + Tgx;
+			      TiA = Tiu + Tiz;
+			      ii[WS(rs, 2)] = Tit + TiA;
+			      ii[WS(rs, 34)] = TiA - Tit;
+			 }
+			 {
+			      E Tgv, Tgy, TiB, TiC;
+			      Tgv = Tgj - Tgm;
+			      Tgy = Tgw - Tgx;
+			      ri[WS(rs, 50)] = Tgv - Tgy;
+			      ri[WS(rs, 18)] = Tgv + Tgy;
+			      TiB = Tgt - Tgq;
+			      TiC = Tiz - Tiu;
+			      ii[WS(rs, 18)] = TiB + TiC;
+			      ii[WS(rs, 50)] = TiC - TiB;
+			 }
+		    }
+		    {
+			 E T7V, TaH, TjN, TjT, T8O, TjS, TaK, TjK, T9I, TaU, TaE, TaO, TaB, TaV, TaF;
+			 E TaR;
+			 {
+			      E T7x, T7U, TjL, TjM;
+			      T7x = T7l - T7w;
+			      T7U = T7I - T7T;
+			      T7V = T7x - T7U;
+			      TaH = T7x + T7U;
+			      TjL = TaZ - TaY;
+			      TjM = Tjx - Tjw;
+			      TjN = TjL + TjM;
+			      TjT = TjM - TjL;
+			 }
+			 {
+			      E T8m, TaI, T8N, TaJ;
+			      {
+				   E T8c, T8l, T8D, T8M;
+				   T8c = T80 - T8b;
+				   T8l = T8h - T8k;
+				   T8m = FNMS(KP980785280, T8l, KP195090322 * T8c);
+				   TaI = FMA(KP980785280, T8c, KP195090322 * T8l);
+				   T8D = T8r - T8C;
+				   T8M = T8I - T8L;
+				   T8N = FMA(KP195090322, T8D, KP980785280 * T8M);
+				   TaJ = FNMS(KP980785280, T8D, KP195090322 * T8M);
+			      }
+			      T8O = T8m - T8N;
+			      TjS = TaJ - TaI;
+			      TaK = TaI + TaJ;
+			      TjK = T8m + T8N;
+			 }
+			 {
+			      E T9u, TaM, T9H, TaN;
+			      {
+				   E T96, T9t, T9D, T9G;
+				   T96 = T8U - T95;
+				   T9t = T9h - T9s;
+				   T9u = T96 - T9t;
+				   TaM = T96 + T9t;
+				   T9D = T9z - T9C;
+				   T9G = T9E - T9F;
+				   T9H = T9D - T9G;
+				   TaN = T9D + T9G;
+			      }
+			      T9I = FMA(KP995184726, T9u, KP098017140 * T9H);
+			      TaU = FNMS(KP634393284, TaN, KP773010453 * TaM);
+			      TaE = FNMS(KP995184726, T9H, KP098017140 * T9u);
+			      TaO = FMA(KP634393284, TaM, KP773010453 * TaN);
+			 }
+			 {
+			      E Tan, TaP, TaA, TaQ;
+			      {
+				   E T9Z, Tam, Taw, Taz;
+				   T9Z = T9N - T9Y;
+				   Tam = Taa - Tal;
+				   Tan = T9Z - Tam;
+				   TaP = T9Z + Tam;
+				   Taw = Tas - Tav;
+				   Taz = Tax - Tay;
+				   TaA = Taw - Taz;
+				   TaQ = Taw + Taz;
+			      }
+			      TaB = FNMS(KP995184726, TaA, KP098017140 * Tan);
+			      TaV = FMA(KP773010453, TaQ, KP634393284 * TaP);
+			      TaF = FMA(KP098017140, TaA, KP995184726 * Tan);
+			      TaR = FNMS(KP634393284, TaQ, KP773010453 * TaP);
+			 }
+			 {
+			      E T8P, TaC, TjR, TjU;
+			      T8P = T7V + T8O;
+			      TaC = T9I + TaB;
+			      ri[WS(rs, 47)] = T8P - TaC;
+			      ri[WS(rs, 15)] = T8P + TaC;
+			      TjR = TaE + TaF;
+			      TjU = TjS + TjT;
+			      ii[WS(rs, 15)] = TjR + TjU;
+			      ii[WS(rs, 47)] = TjU - TjR;
+			 }
+			 {
+			      E TaD, TaG, TjV, TjW;
+			      TaD = T7V - T8O;
+			      TaG = TaE - TaF;
+			      ri[WS(rs, 63)] = TaD - TaG;
+			      ri[WS(rs, 31)] = TaD + TaG;
+			      TjV = TaB - T9I;
+			      TjW = TjT - TjS;
+			      ii[WS(rs, 31)] = TjV + TjW;
+			      ii[WS(rs, 63)] = TjW - TjV;
+			 }
+			 {
+			      E TaL, TaS, TjJ, TjO;
+			      TaL = TaH + TaK;
+			      TaS = TaO + TaR;
+			      ri[WS(rs, 39)] = TaL - TaS;
+			      ri[WS(rs, 7)] = TaL + TaS;
+			      TjJ = TaU + TaV;
+			      TjO = TjK + TjN;
+			      ii[WS(rs, 7)] = TjJ + TjO;
+			      ii[WS(rs, 39)] = TjO - TjJ;
+			 }
+			 {
+			      E TaT, TaW, TjP, TjQ;
+			      TaT = TaH - TaK;
+			      TaW = TaU - TaV;
+			      ri[WS(rs, 55)] = TaT - TaW;
+			      ri[WS(rs, 23)] = TaT + TaW;
+			      TjP = TaR - TaO;
+			      TjQ = TjN - TjK;
+			      ii[WS(rs, 23)] = TjP + TjQ;
+			      ii[WS(rs, 55)] = TjQ - TjP;
+			 }
+		    }
+		    {
+			 E TbV, TcT, Tjj, Tjp, Tca, Tjo, TcW, Tjg, Tcu, Td6, TcQ, Td0, TcN, Td7, TcR;
+			 E Td3;
+			 {
+			      E TbN, TbU, Tjh, Tji;
+			      TbN = TbJ - TbM;
+			      TbU = TbQ - TbT;
+			      TbV = TbN - TbU;
+			      TcT = TbN + TbU;
+			      Tjh = Tdb - Tda;
+			      Tji = Tj3 - Tj0;
+			      Tjj = Tjh + Tji;
+			      Tjp = Tji - Tjh;
+			 }
+			 {
+			      E Tc2, TcU, Tc9, TcV;
+			      {
+				   E TbY, Tc1, Tc5, Tc8;
+				   TbY = TbW - TbX;
+				   Tc1 = TbZ - Tc0;
+				   Tc2 = FNMS(KP831469612, Tc1, KP555570233 * TbY);
+				   TcU = FMA(KP555570233, Tc1, KP831469612 * TbY);
+				   Tc5 = Tc3 - Tc4;
+				   Tc8 = Tc6 - Tc7;
+				   Tc9 = FMA(KP831469612, Tc5, KP555570233 * Tc8);
+				   TcV = FNMS(KP831469612, Tc8, KP555570233 * Tc5);
+			      }
+			      Tca = Tc2 - Tc9;
+			      Tjo = TcV - TcU;
+			      TcW = TcU + TcV;
+			      Tjg = Tc2 + Tc9;
+			 }
+			 {
+			      E Tcm, TcY, Tct, TcZ;
+			      {
+				   E Tce, Tcl, Tcp, Tcs;
+				   Tce = Tcc - Tcd;
+				   Tcl = Tch - Tck;
+				   Tcm = Tce - Tcl;
+				   TcY = Tce + Tcl;
+				   Tcp = Tcn - Tco;
+				   Tcs = Tcq - Tcr;
+				   Tct = Tcp - Tcs;
+				   TcZ = Tcp + Tcs;
+			      }
+			      Tcu = FMA(KP956940335, Tcm, KP290284677 * Tct);
+			      Td6 = FNMS(KP471396736, TcZ, KP881921264 * TcY);
+			      TcQ = FNMS(KP956940335, Tct, KP290284677 * Tcm);
+			      Td0 = FMA(KP471396736, TcY, KP881921264 * TcZ);
+			 }
+			 {
+			      E TcF, Td1, TcM, Td2;
+			      {
+				   E Tcx, TcE, TcI, TcL;
+				   Tcx = Tcv - Tcw;
+				   TcE = TcA - TcD;
+				   TcF = Tcx - TcE;
+				   Td1 = Tcx + TcE;
+				   TcI = TcG - TcH;
+				   TcL = TcJ - TcK;
+				   TcM = TcI - TcL;
+				   Td2 = TcI + TcL;
+			      }
+			      TcN = FNMS(KP956940335, TcM, KP290284677 * TcF);
+			      Td7 = FMA(KP881921264, Td2, KP471396736 * Td1);
+			      TcR = FMA(KP290284677, TcM, KP956940335 * TcF);
+			      Td3 = FNMS(KP471396736, Td2, KP881921264 * Td1);
+			 }
+			 {
+			      E Tcb, TcO, Tjn, Tjq;
+			      Tcb = TbV + Tca;
+			      TcO = Tcu + TcN;
+			      ri[WS(rs, 45)] = Tcb - TcO;
+			      ri[WS(rs, 13)] = Tcb + TcO;
+			      Tjn = TcQ + TcR;
+			      Tjq = Tjo + Tjp;
+			      ii[WS(rs, 13)] = Tjn + Tjq;
+			      ii[WS(rs, 45)] = Tjq - Tjn;
+			 }
+			 {
+			      E TcP, TcS, Tjr, Tjs;
+			      TcP = TbV - Tca;
+			      TcS = TcQ - TcR;
+			      ri[WS(rs, 61)] = TcP - TcS;
+			      ri[WS(rs, 29)] = TcP + TcS;
+			      Tjr = TcN - Tcu;
+			      Tjs = Tjp - Tjo;
+			      ii[WS(rs, 29)] = Tjr + Tjs;
+			      ii[WS(rs, 61)] = Tjs - Tjr;
+			 }
+			 {
+			      E TcX, Td4, Tjf, Tjk;
+			      TcX = TcT + TcW;
+			      Td4 = Td0 + Td3;
+			      ri[WS(rs, 37)] = TcX - Td4;
+			      ri[WS(rs, 5)] = TcX + Td4;
+			      Tjf = Td6 + Td7;
+			      Tjk = Tjg + Tjj;
+			      ii[WS(rs, 5)] = Tjf + Tjk;
+			      ii[WS(rs, 37)] = Tjk - Tjf;
+			 }
+			 {
+			      E Td5, Td8, Tjl, Tjm;
+			      Td5 = TcT - TcW;
+			      Td8 = Td6 - Td7;
+			      ri[WS(rs, 53)] = Td5 - Td8;
+			      ri[WS(rs, 21)] = Td5 + Td8;
+			      Tjl = Td3 - Td0;
+			      Tjm = Tjj - Tjg;
+			      ii[WS(rs, 21)] = Tjl + Tjm;
+			      ii[WS(rs, 53)] = Tjm - Tjl;
+			 }
+		    }
+		    {
+			 E Tdd, TdF, Tj5, Tjb, Tdk, Tja, TdI, TiY, Tds, TdS, TdC, TdM, Tdz, TdT, TdD;
+			 E TdP;
+			 {
+			      E Td9, Tdc, TiZ, Tj4;
+			      Td9 = TbJ + TbM;
+			      Tdc = Tda + Tdb;
+			      Tdd = Td9 - Tdc;
+			      TdF = Td9 + Tdc;
+			      TiZ = TbQ + TbT;
+			      Tj4 = Tj0 + Tj3;
+			      Tj5 = TiZ + Tj4;
+			      Tjb = Tj4 - TiZ;
+			 }
+			 {
+			      E Tdg, TdG, Tdj, TdH;
+			      {
+				   E Tde, Tdf, Tdh, Tdi;
+				   Tde = TbW + TbX;
+				   Tdf = TbZ + Tc0;
+				   Tdg = FNMS(KP195090322, Tdf, KP980785280 * Tde);
+				   TdG = FMA(KP980785280, Tdf, KP195090322 * Tde);
+				   Tdh = Tc3 + Tc4;
+				   Tdi = Tc6 + Tc7;
+				   Tdj = FMA(KP195090322, Tdh, KP980785280 * Tdi);
+				   TdH = FNMS(KP195090322, Tdi, KP980785280 * Tdh);
+			      }
+			      Tdk = Tdg - Tdj;
+			      Tja = TdH - TdG;
+			      TdI = TdG + TdH;
+			      TiY = Tdg + Tdj;
+			 }
+			 {
+			      E Tdo, TdK, Tdr, TdL;
+			      {
+				   E Tdm, Tdn, Tdp, Tdq;
+				   Tdm = Tcn + Tco;
+				   Tdn = Tck + Tch;
+				   Tdo = Tdm - Tdn;
+				   TdK = Tdm + Tdn;
+				   Tdp = Tcc + Tcd;
+				   Tdq = Tcq + Tcr;
+				   Tdr = Tdp - Tdq;
+				   TdL = Tdp + Tdq;
+			      }
+			      Tds = FMA(KP634393284, Tdo, KP773010453 * Tdr);
+			      TdS = FNMS(KP098017140, TdK, KP995184726 * TdL);
+			      TdC = FNMS(KP773010453, Tdo, KP634393284 * Tdr);
+			      TdM = FMA(KP995184726, TdK, KP098017140 * TdL);
+			 }
+			 {
+			      E Tdv, TdN, Tdy, TdO;
+			      {
+				   E Tdt, Tdu, Tdw, Tdx;
+				   Tdt = Tcv + Tcw;
+				   Tdu = TcK + TcJ;
+				   Tdv = Tdt - Tdu;
+				   TdN = Tdt + Tdu;
+				   Tdw = TcG + TcH;
+				   Tdx = TcA + TcD;
+				   Tdy = Tdw - Tdx;
+				   TdO = Tdw + Tdx;
+			      }
+			      Tdz = FNMS(KP773010453, Tdy, KP634393284 * Tdv);
+			      TdT = FMA(KP098017140, TdN, KP995184726 * TdO);
+			      TdD = FMA(KP773010453, Tdv, KP634393284 * Tdy);
+			      TdP = FNMS(KP098017140, TdO, KP995184726 * TdN);
+			 }
+			 {
+			      E Tdl, TdA, Tj9, Tjc;
+			      Tdl = Tdd + Tdk;
+			      TdA = Tds + Tdz;
+			      ri[WS(rs, 41)] = Tdl - TdA;
+			      ri[WS(rs, 9)] = Tdl + TdA;
+			      Tj9 = TdC + TdD;
+			      Tjc = Tja + Tjb;
+			      ii[WS(rs, 9)] = Tj9 + Tjc;
+			      ii[WS(rs, 41)] = Tjc - Tj9;
+			 }
+			 {
+			      E TdB, TdE, Tjd, Tje;
+			      TdB = Tdd - Tdk;
+			      TdE = TdC - TdD;
+			      ri[WS(rs, 57)] = TdB - TdE;
+			      ri[WS(rs, 25)] = TdB + TdE;
+			      Tjd = Tdz - Tds;
+			      Tje = Tjb - Tja;
+			      ii[WS(rs, 25)] = Tjd + Tje;
+			      ii[WS(rs, 57)] = Tje - Tjd;
+			 }
+			 {
+			      E TdJ, TdQ, TiX, Tj6;
+			      TdJ = TdF + TdI;
+			      TdQ = TdM + TdP;
+			      ri[WS(rs, 33)] = TdJ - TdQ;
+			      ri[WS(rs, 1)] = TdJ + TdQ;
+			      TiX = TdS + TdT;
+			      Tj6 = TiY + Tj5;
+			      ii[WS(rs, 1)] = TiX + Tj6;
+			      ii[WS(rs, 33)] = Tj6 - TiX;
+			 }
+			 {
+			      E TdR, TdU, Tj7, Tj8;
+			      TdR = TdF - TdI;
+			      TdU = TdS - TdT;
+			      ri[WS(rs, 49)] = TdR - TdU;
+			      ri[WS(rs, 17)] = TdR + TdU;
+			      Tj7 = TdP - TdM;
+			      Tj8 = Tj5 - TiY;
+			      ii[WS(rs, 17)] = Tj7 + Tj8;
+			      ii[WS(rs, 49)] = Tj8 - Tj7;
+			 }
+		    }
+		    {
+			 E Tb1, Tbt, Tjz, TjF, Tb8, TjE, Tbw, Tju, Tbg, TbG, Tbq, TbA, Tbn, TbH, Tbr;
+			 E TbD;
+			 {
+			      E TaX, Tb0, Tjv, Tjy;
+			      TaX = T7l + T7w;
+			      Tb0 = TaY + TaZ;
+			      Tb1 = TaX - Tb0;
+			      Tbt = TaX + Tb0;
+			      Tjv = T7I + T7T;
+			      Tjy = Tjw + Tjx;
+			      Tjz = Tjv + Tjy;
+			      TjF = Tjy - Tjv;
+			 }
+			 {
+			      E Tb4, Tbu, Tb7, Tbv;
+			      {
+				   E Tb2, Tb3, Tb5, Tb6;
+				   Tb2 = T80 + T8b;
+				   Tb3 = T8h + T8k;
+				   Tb4 = FNMS(KP555570233, Tb3, KP831469612 * Tb2);
+				   Tbu = FMA(KP555570233, Tb2, KP831469612 * Tb3);
+				   Tb5 = T8r + T8C;
+				   Tb6 = T8I + T8L;
+				   Tb7 = FMA(KP831469612, Tb5, KP555570233 * Tb6);
+				   Tbv = FNMS(KP555570233, Tb5, KP831469612 * Tb6);
+			      }
+			      Tb8 = Tb4 - Tb7;
+			      TjE = Tbv - Tbu;
+			      Tbw = Tbu + Tbv;
+			      Tju = Tb4 + Tb7;
+			 }
+			 {
+			      E Tbc, Tby, Tbf, Tbz;
+			      {
+				   E Tba, Tbb, Tbd, Tbe;
+				   Tba = T9z + T9C;
+				   Tbb = T9s + T9h;
+				   Tbc = Tba - Tbb;
+				   Tby = Tba + Tbb;
+				   Tbd = T8U + T95;
+				   Tbe = T9E + T9F;
+				   Tbf = Tbd - Tbe;
+				   Tbz = Tbd + Tbe;
+			      }
+			      Tbg = FMA(KP471396736, Tbc, KP881921264 * Tbf);
+			      TbG = FNMS(KP290284677, Tby, KP956940335 * Tbz);
+			      Tbq = FNMS(KP881921264, Tbc, KP471396736 * Tbf);
+			      TbA = FMA(KP956940335, Tby, KP290284677 * Tbz);
+			 }
+			 {
+			      E Tbj, TbB, Tbm, TbC;
+			      {
+				   E Tbh, Tbi, Tbk, Tbl;
+				   Tbh = T9N + T9Y;
+				   Tbi = Tay + Tax;
+				   Tbj = Tbh - Tbi;
+				   TbB = Tbh + Tbi;
+				   Tbk = Tas + Tav;
+				   Tbl = Taa + Tal;
+				   Tbm = Tbk - Tbl;
+				   TbC = Tbk + Tbl;
+			      }
+			      Tbn = FNMS(KP881921264, Tbm, KP471396736 * Tbj);
+			      TbH = FMA(KP290284677, TbB, KP956940335 * TbC);
+			      Tbr = FMA(KP881921264, Tbj, KP471396736 * Tbm);
+			      TbD = FNMS(KP290284677, TbC, KP956940335 * TbB);
+			 }
+			 {
+			      E Tb9, Tbo, TjD, TjG;
+			      Tb9 = Tb1 + Tb8;
+			      Tbo = Tbg + Tbn;
+			      ri[WS(rs, 43)] = Tb9 - Tbo;
+			      ri[WS(rs, 11)] = Tb9 + Tbo;
+			      TjD = Tbq + Tbr;
+			      TjG = TjE + TjF;
+			      ii[WS(rs, 11)] = TjD + TjG;
+			      ii[WS(rs, 43)] = TjG - TjD;
+			 }
+			 {
+			      E Tbp, Tbs, TjH, TjI;
+			      Tbp = Tb1 - Tb8;
+			      Tbs = Tbq - Tbr;
+			      ri[WS(rs, 59)] = Tbp - Tbs;
+			      ri[WS(rs, 27)] = Tbp + Tbs;
+			      TjH = Tbn - Tbg;
+			      TjI = TjF - TjE;
+			      ii[WS(rs, 27)] = TjH + TjI;
+			      ii[WS(rs, 59)] = TjI - TjH;
+			 }
+			 {
+			      E Tbx, TbE, Tjt, TjA;
+			      Tbx = Tbt + Tbw;
+			      TbE = TbA + TbD;
+			      ri[WS(rs, 35)] = Tbx - TbE;
+			      ri[WS(rs, 3)] = Tbx + TbE;
+			      Tjt = TbG + TbH;
+			      TjA = Tju + Tjz;
+			      ii[WS(rs, 3)] = Tjt + TjA;
+			      ii[WS(rs, 35)] = TjA - Tjt;
+			 }
+			 {
+			      E TbF, TbI, TjB, TjC;
+			      TbF = Tbt - Tbw;
+			      TbI = TbG - TbH;
+			      ri[WS(rs, 51)] = TbF - TbI;
+			      ri[WS(rs, 19)] = TbF + TbI;
+			      TjB = TbD - TbA;
+			      TjC = Tjz - Tju;
+			      ii[WS(rs, 19)] = TjB + TjC;
+			      ii[WS(rs, 51)] = TjC - TjB;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 9},
+     {TW_CEXP, 0, 27},
+     {TW_CEXP, 0, 63},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 64, "t2_64", twinstr, &GENUS, {880, 386, 274, 0}, 0, 0, 0 };
+
+void X(codelet_t2_64) (planner *p) {
+     X(kdft_dit_register) (p, t2_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/codelets/t2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/codelets/t2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:35:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include t.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 64 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "t.h"
+
+static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E TS, T1m, TJ, T1l, T1k, Tw, T1w, T1u;
+	       {
+		    E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
+		    T2 = W[0];
+		    T3 = W[2];
+		    Tl = W[4];
+		    Tn = W[5];
+		    T5 = W[1];
+		    T4 = T2 * T3;
+		    Tm = T2 * Tl;
+		    Tr = T2 * Tn;
+		    T6 = W[3];
+		    {
+			 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
+			 E TE, T14;
+			 {
+			      E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
+			      E Tj;
+			      T1 = ri[0];
+			      To = FMA(T5, Tn, Tm);
+			      Ts = FNMS(T5, Tl, Tr);
+			      Tf = FMA(T5, T6, T4);
+			      T7 = FNMS(T5, T6, T4);
+			      Ta = T2 * T6;
+			      T1s = ii[0];
+			      T8 = ri[WS(rs, 4)];
+			      TF = Tf * Tn;
+			      TB = Tf * Tl;
+			      Ti = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      T9 = T7 * T8;
+			      Tc = ii[WS(rs, 4)];
+			      TG = FNMS(Ti, Tl, TF);
+			      TC = FMA(Ti, Tn, TB);
+			      {
+				   E Tp, T1q, Tt, Tq, TX;
+				   Tp = ri[WS(rs, 6)];
+				   Td = FMA(Tb, Tc, T9);
+				   T1q = T7 * Tc;
+				   Tt = ii[WS(rs, 6)];
+				   Tq = To * Tp;
+				   Tg = ri[WS(rs, 2)];
+				   T1r = FNMS(Tb, T8, T1q);
+				   TX = To * Tt;
+				   Tu = FMA(Ts, Tt, Tq);
+				   Th = Tf * Tg;
+				   Tj = ii[WS(rs, 2)];
+				   TY = FNMS(Ts, Tp, TX);
+			      }
+			      {
+				   E TO, TQ, TN, TP, T1a, T1b;
+				   {
+					E TK, TM, TL, T19, TV;
+					TK = ri[WS(rs, 7)];
+					TM = ii[WS(rs, 7)];
+					Tk = FMA(Ti, Tj, Th);
+					TV = Tf * Tj;
+					TL = Tl * TK;
+					T19 = Tl * TM;
+					TO = ri[WS(rs, 3)];
+					TW = FNMS(Ti, Tg, TV);
+					TQ = ii[WS(rs, 3)];
+					TN = FMA(Tn, TM, TL);
+					TP = T3 * TO;
+					T1a = FNMS(Tn, TK, T19);
+					T1b = T3 * TQ;
+				   }
+				   {
+					E Tx, Tz, Ty, T12, T1c, TR;
+					Tx = ri[WS(rs, 1)];
+					TR = FMA(T6, TQ, TP);
+					Tz = ii[WS(rs, 1)];
+					T1c = FNMS(T6, TO, T1b);
+					Ty = T2 * Tx;
+					T18 = TN - TR;
+					TS = TN + TR;
+					T12 = T2 * Tz;
+					T1d = T1a - T1c;
+					T1m = T1a + T1c;
+					TD = ri[WS(rs, 5)];
+					TH = ii[WS(rs, 5)];
+					TA = FMA(T5, Tz, Ty);
+					T13 = FNMS(T5, Tx, T12);
+					TE = TC * TD;
+					T14 = TC * TH;
+				   }
+			      }
+			 }
+			 {
+			      E Te, T1p, T1t, Tv;
+			      {
+				   E T1g, T10, T1z, T1B, T1A, T1j, T1C, T1f;
+				   {
+					E T1x, T11, T16, T1y;
+					{
+					     E TU, TZ, TI, T15;
+					     Te = T1 + Td;
+					     TU = T1 - Td;
+					     TZ = TW - TY;
+					     T1p = TW + TY;
+					     TI = FMA(TG, TH, TE);
+					     T15 = FNMS(TG, TD, T14);
+					     T1t = T1r + T1s;
+					     T1x = T1s - T1r;
+					     T1g = TU - TZ;
+					     T10 = TU + TZ;
+					     T11 = TA - TI;
+					     TJ = TA + TI;
+					     T1l = T13 + T15;
+					     T16 = T13 - T15;
+					     T1y = Tk - Tu;
+					     Tv = Tk + Tu;
+					}
+					{
+					     E T1i, T1e, T17, T1h;
+					     T1i = T18 + T1d;
+					     T1e = T18 - T1d;
+					     T17 = T11 + T16;
+					     T1h = T16 - T11;
+					     T1z = T1x - T1y;
+					     T1B = T1y + T1x;
+					     T1A = T1h + T1i;
+					     T1j = T1h - T1i;
+					     T1C = T1e - T17;
+					     T1f = T17 + T1e;
+					}
+				   }
+				   ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
+				   ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
+				   ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
+				   ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
+				   ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
+				   ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
+				   ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
+				   ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
+			      }
+			      T1k = Te - Tv;
+			      Tw = Te + Tv;
+			      T1w = T1t - T1p;
+			      T1u = T1p + T1t;
+			 }
+		    }
+	       }
+	       {
+		    E TT, T1v, T1n, T1o;
+		    TT = TJ + TS;
+		    T1v = TS - TJ;
+		    T1n = T1l - T1m;
+		    T1o = T1l + T1m;
+		    ii[WS(rs, 2)] = T1v + T1w;
+		    ii[WS(rs, 6)] = T1w - T1v;
+		    ri[0] = Tw + TT;
+		    ri[WS(rs, 4)] = Tw - TT;
+		    ii[0] = T1o + T1u;
+		    ii[WS(rs, 4)] = T1u - T1o;
+		    ri[WS(rs, 2)] = T1k + T1n;
+		    ri[WS(rs, 6)] = T1k - T1n;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };
+
+void X(codelet_t2_8) (planner *p) {
+     X(kdft_dit_register) (p, t2_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include t.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 42 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "t.h"
+
+static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
+	       {
+		    E T4, Tb, T7, Ta;
+		    T2 = W[0];
+		    T5 = W[1];
+		    T3 = W[2];
+		    T6 = W[3];
+		    T4 = T2 * T3;
+		    Tb = T5 * T3;
+		    T7 = T5 * T6;
+		    Ta = T2 * T6;
+		    T8 = T4 - T7;
+		    Tc = Ta + Tb;
+		    Tg = T4 + T7;
+		    Ti = Ta - Tb;
+		    Tl = W[4];
+		    Tm = W[5];
+		    Tn = FMA(T2, Tl, T5 * Tm);
+		    Tz = FNMS(Ti, Tl, Tg * Tm);
+		    Tp = FNMS(T5, Tl, T2 * Tm);
+		    Tx = FMA(Tg, Tl, Ti * Tm);
+	       }
+	       {
+		    E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
+		    E TT;
+		    {
+			 E T1, T1c, Te, T1b, T9, Td;
+			 T1 = ri[0];
+			 T1c = ii[0];
+			 T9 = ri[WS(rs, 4)];
+			 Td = ii[WS(rs, 4)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T1b = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T1i = T1c - T1b;
+			 TL = T1 - Te;
+			 T1d = T1b + T1c;
+		    }
+		    {
+			 E TF, TW, TI, TX;
+			 {
+			      E TD, TE, TG, TH;
+			      TD = ri[WS(rs, 7)];
+			      TE = ii[WS(rs, 7)];
+			      TF = FMA(Tl, TD, Tm * TE);
+			      TW = FNMS(Tm, TD, Tl * TE);
+			      TG = ri[WS(rs, 3)];
+			      TH = ii[WS(rs, 3)];
+			      TI = FMA(T3, TG, T6 * TH);
+			      TX = FNMS(T6, TG, T3 * TH);
+			 }
+			 TJ = TF + TI;
+			 T17 = TW + TX;
+			 TV = TF - TI;
+			 TY = TW - TX;
+		    }
+		    {
+			 E Tk, TM, Tr, TN;
+			 {
+			      E Th, Tj, To, Tq;
+			      Th = ri[WS(rs, 2)];
+			      Tj = ii[WS(rs, 2)];
+			      Tk = FMA(Tg, Th, Ti * Tj);
+			      TM = FNMS(Ti, Th, Tg * Tj);
+			      To = ri[WS(rs, 6)];
+			      Tq = ii[WS(rs, 6)];
+			      Tr = FMA(Tn, To, Tp * Tq);
+			      TN = FNMS(Tp, To, Tn * Tq);
+			 }
+			 Ts = Tk + Tr;
+			 T1j = Tk - Tr;
+			 TO = TM - TN;
+			 T1a = TM + TN;
+		    }
+		    {
+			 E Tw, TR, TB, TS;
+			 {
+			      E Tu, Tv, Ty, TA;
+			      Tu = ri[WS(rs, 1)];
+			      Tv = ii[WS(rs, 1)];
+			      Tw = FMA(T2, Tu, T5 * Tv);
+			      TR = FNMS(T5, Tu, T2 * Tv);
+			      Ty = ri[WS(rs, 5)];
+			      TA = ii[WS(rs, 5)];
+			      TB = FMA(Tx, Ty, Tz * TA);
+			      TS = FNMS(Tz, Ty, Tx * TA);
+			 }
+			 TC = Tw + TB;
+			 T16 = TR + TS;
+			 TQ = Tw - TB;
+			 TT = TR - TS;
+		    }
+		    {
+			 E Tt, TK, T1f, T1g;
+			 Tt = Tf + Ts;
+			 TK = TC + TJ;
+			 ri[WS(rs, 4)] = Tt - TK;
+			 ri[0] = Tt + TK;
+			 {
+			      E T19, T1e, T15, T18;
+			      T19 = T16 + T17;
+			      T1e = T1a + T1d;
+			      ii[0] = T19 + T1e;
+			      ii[WS(rs, 4)] = T1e - T19;
+			      T15 = Tf - Ts;
+			      T18 = T16 - T17;
+			      ri[WS(rs, 6)] = T15 - T18;
+			      ri[WS(rs, 2)] = T15 + T18;
+			 }
+			 T1f = TJ - TC;
+			 T1g = T1d - T1a;
+			 ii[WS(rs, 2)] = T1f + T1g;
+			 ii[WS(rs, 6)] = T1g - T1f;
+			 {
+			      E T11, T1k, T14, T1h, T12, T13;
+			      T11 = TL - TO;
+			      T1k = T1i - T1j;
+			      T12 = TT - TQ;
+			      T13 = TV + TY;
+			      T14 = KP707106781 * (T12 - T13);
+			      T1h = KP707106781 * (T12 + T13);
+			      ri[WS(rs, 7)] = T11 - T14;
+			      ii[WS(rs, 5)] = T1k - T1h;
+			      ri[WS(rs, 3)] = T11 + T14;
+			      ii[WS(rs, 1)] = T1h + T1k;
+			 }
+			 {
+			      E TP, T1m, T10, T1l, TU, TZ;
+			      TP = TL + TO;
+			      T1m = T1j + T1i;
+			      TU = TQ + TT;
+			      TZ = TV - TY;
+			      T10 = KP707106781 * (TU + TZ);
+			      T1l = KP707106781 * (TZ - TU);
+			      ri[WS(rs, 5)] = TP - T10;
+			      ii[WS(rs, 7)] = T1m - T1l;
+			      ri[WS(rs, 1)] = TP + T10;
+			      ii[WS(rs, 3)] = T1l + T1m;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 0, 1},
+     {TW_CEXP, 0, 3},
+     {TW_CEXP, 0, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };
+
+void X(codelet_t2_8) (planner *p) {
+     X(kdft_dit_register) (p, t2_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1 @@
+#include "t.h"  /* same stuff, no need to duplicate */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/n.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/n.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-dft.h"
+#include "n.h"
+
+static int okp(const kdft_desc *d,
+	       const R *ri, const R *ii, 
+	       const R *ro, const R *io,
+	       INT is, INT os, INT vl, INT ivs, INT ovs,
+	       const planner *plnr)
+{
+     UNUSED(ri); UNUSED(ii); UNUSED(ro); UNUSED(io); UNUSED(vl); UNUSED(plnr);
+     return (1
+	     && (!d->is || (d->is == is))
+	     && (!d->os || (d->os == os))
+	     && (!d->ivs || (d->ivs == ivs))
+	     && (!d->ovs || (d->ovs == ovs))
+	  );
+}
+
+const kdft_genus GENUS = { okp, 1 };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/n.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/n.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(dft_n_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/q.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/q.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1 @@
+#include "t.h"  /* same stuff, no need to duplicate */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/t.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/t.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-dft.h"
+#include "t.h"
+
+static int okp(const ct_desc *d,
+	       const R *rio, const R *iio, 
+	       INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+	       const planner *plnr)
+{
+     UNUSED(rio); UNUSED(iio); UNUSED(m); UNUSED(mb); UNUSED(me); UNUSED(plnr);
+     return (1
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+
+const ct_genus GENUS = { okp, 1 };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/scalar/t.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/scalar/t.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(dft_t_genus)
+extern const ct_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+SUBDIRS = common sse2 avx altivec neon
+EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h	\
+t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,619 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = dft/simd
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+SUBDIRS = common sse2 avx altivec neon
+EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h	\
+t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
+
+all: all-recursive
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	ctags ctags-recursive distclean distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs installdirs-am maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(ALTIVEC_CFLAGS)
+SIMD_HEADER=simd-altivec.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_ALTIVEC
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_altivec_codelets.la
+libdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,893 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/dft/simd/codlist.mk \
+	$(top_srcdir)/dft/simd/simd.mk
+subdir = dft/simd/altivec
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_altivec_codelets_la_LIBADD =
+am__libdft_altivec_codelets_la_SOURCES_DIST = n1fv_2.c n1fv_3.c \
+	n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c n1fv_9.c \
+	n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c \
+	n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c \
+	n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c \
+	n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c \
+	n1bv_15.c n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c \
+	n1bv_25.c n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c \
+	n2fv_12.c n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c \
+	n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c \
+	n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c n2sv_4.c \
+	n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c t1fuv_2.c t1fuv_3.c \
+	t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c t1fuv_8.c t1fuv_9.c \
+	t1fuv_10.c t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c \
+	t1fv_7.c t1fv_8.c t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c \
+	t1fv_16.c t1fv_32.c t1fv_64.c t1fv_20.c t1fv_25.c t2fv_2.c \
+	t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c t2fv_5.c \
+	t2fv_10.c t2fv_20.c t2fv_25.c t3fv_4.c t3fv_8.c t3fv_16.c \
+	t3fv_32.c t3fv_5.c t3fv_10.c t3fv_20.c t3fv_25.c t1buv_2.c \
+	t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c t1buv_8.c \
+	t1buv_9.c t1buv_10.c t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c \
+	t1bv_6.c t1bv_7.c t1bv_8.c t1bv_9.c t1bv_10.c t1bv_12.c \
+	t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c t1bv_20.c t1bv_25.c \
+	t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c \
+	t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c t3bv_4.c t3bv_8.c \
+	t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c t3bv_20.c t3bv_25.c \
+	t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c t2sv_4.c \
+	t2sv_8.c t2sv_16.c t2sv_32.c q1fv_2.c q1fv_4.c q1fv_5.c \
+	q1fv_8.c q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c genus.c codlist.c
+am__objects_1 = n1fv_2.lo n1fv_3.lo n1fv_4.lo n1fv_5.lo n1fv_6.lo \
+	n1fv_7.lo n1fv_8.lo n1fv_9.lo n1fv_10.lo n1fv_11.lo n1fv_12.lo \
+	n1fv_13.lo n1fv_14.lo n1fv_15.lo n1fv_16.lo n1fv_32.lo \
+	n1fv_64.lo n1fv_128.lo n1fv_20.lo n1fv_25.lo
+am__objects_2 = n1bv_2.lo n1bv_3.lo n1bv_4.lo n1bv_5.lo n1bv_6.lo \
+	n1bv_7.lo n1bv_8.lo n1bv_9.lo n1bv_10.lo n1bv_11.lo n1bv_12.lo \
+	n1bv_13.lo n1bv_14.lo n1bv_15.lo n1bv_16.lo n1bv_32.lo \
+	n1bv_64.lo n1bv_128.lo n1bv_20.lo n1bv_25.lo
+am__objects_3 = n2fv_2.lo n2fv_4.lo n2fv_6.lo n2fv_8.lo n2fv_10.lo \
+	n2fv_12.lo n2fv_14.lo n2fv_16.lo n2fv_32.lo n2fv_64.lo \
+	n2fv_20.lo
+am__objects_4 = n2bv_2.lo n2bv_4.lo n2bv_6.lo n2bv_8.lo n2bv_10.lo \
+	n2bv_12.lo n2bv_14.lo n2bv_16.lo n2bv_32.lo n2bv_64.lo \
+	n2bv_20.lo
+am__objects_5 = n2sv_4.lo n2sv_8.lo n2sv_16.lo n2sv_32.lo n2sv_64.lo
+am__objects_6 = t1fuv_2.lo t1fuv_3.lo t1fuv_4.lo t1fuv_5.lo t1fuv_6.lo \
+	t1fuv_7.lo t1fuv_8.lo t1fuv_9.lo t1fuv_10.lo
+am__objects_7 = t1fv_2.lo t1fv_3.lo t1fv_4.lo t1fv_5.lo t1fv_6.lo \
+	t1fv_7.lo t1fv_8.lo t1fv_9.lo t1fv_10.lo t1fv_12.lo t1fv_15.lo \
+	t1fv_16.lo t1fv_32.lo t1fv_64.lo t1fv_20.lo t1fv_25.lo
+am__objects_8 = t2fv_2.lo t2fv_4.lo t2fv_8.lo t2fv_16.lo t2fv_32.lo \
+	t2fv_64.lo t2fv_5.lo t2fv_10.lo t2fv_20.lo t2fv_25.lo
+am__objects_9 = t3fv_4.lo t3fv_8.lo t3fv_16.lo t3fv_32.lo t3fv_5.lo \
+	t3fv_10.lo t3fv_20.lo t3fv_25.lo
+am__objects_10 = t1buv_2.lo t1buv_3.lo t1buv_4.lo t1buv_5.lo \
+	t1buv_6.lo t1buv_7.lo t1buv_8.lo t1buv_9.lo t1buv_10.lo
+am__objects_11 = t1bv_2.lo t1bv_3.lo t1bv_4.lo t1bv_5.lo t1bv_6.lo \
+	t1bv_7.lo t1bv_8.lo t1bv_9.lo t1bv_10.lo t1bv_12.lo t1bv_15.lo \
+	t1bv_16.lo t1bv_32.lo t1bv_64.lo t1bv_20.lo t1bv_25.lo
+am__objects_12 = t2bv_2.lo t2bv_4.lo t2bv_8.lo t2bv_16.lo t2bv_32.lo \
+	t2bv_64.lo t2bv_5.lo t2bv_10.lo t2bv_20.lo t2bv_25.lo
+am__objects_13 = t3bv_4.lo t3bv_8.lo t3bv_16.lo t3bv_32.lo t3bv_5.lo \
+	t3bv_10.lo t3bv_20.lo t3bv_25.lo
+am__objects_14 = t1sv_2.lo t1sv_4.lo t1sv_8.lo t1sv_16.lo t1sv_32.lo
+am__objects_15 = t2sv_4.lo t2sv_8.lo t2sv_16.lo t2sv_32.lo
+am__objects_16 = q1fv_2.lo q1fv_4.lo q1fv_5.lo q1fv_8.lo
+am__objects_17 = q1bv_2.lo q1bv_4.lo q1bv_5.lo q1bv_8.lo
+am__objects_18 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8) $(am__objects_9) \
+	$(am__objects_10) $(am__objects_11) $(am__objects_12) \
+	$(am__objects_13) $(am__objects_14) $(am__objects_15) \
+	$(am__objects_16) $(am__objects_17)
+am__objects_19 = $(am__objects_18) genus.lo codlist.lo
+@HAVE_ALTIVEC_TRUE@am__objects_20 = $(am__objects_19)
+@HAVE_ALTIVEC_TRUE@am_libdft_altivec_codelets_la_OBJECTS =  \
+@HAVE_ALTIVEC_TRUE@	$(am__objects_20)
+libdft_altivec_codelets_la_OBJECTS =  \
+	$(am_libdft_altivec_codelets_la_OBJECTS)
+@HAVE_ALTIVEC_TRUE@am_libdft_altivec_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_altivec_codelets_la_SOURCES)
+DIST_SOURCES = $(am__libdft_altivec_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(ALTIVEC_CFLAGS)
+SIMD_HEADER = simd-altivec.h
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
+
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_ALTIVEC_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_ALTIVEC_TRUE@noinst_LTLIBRARIES = libdft_altivec_codelets.la
+@HAVE_ALTIVEC_TRUE@libdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/altivec/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/altivec/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_altivec_codelets.la: $(libdft_altivec_codelets_la_OBJECTS) $(libdft_altivec_codelets_la_DEPENDENCIES) $(EXTRA_libdft_altivec_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_libdft_altivec_codelets_la_rpath) $(libdft_altivec_codelets_la_OBJECTS) $(libdft_altivec_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2sv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2sv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2sv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/n2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/n2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/n2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/q1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/q1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/q1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1buv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1buv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1buv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fuv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fuv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fuv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1sv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1sv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1sv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t1sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t1sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t1sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/altivec/t3fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/altivec/t3fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/t3fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(AVX_CFLAGS)
+SIMD_HEADER=simd-avx.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_AVX
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_avx_codelets.la
+libdft_avx_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,891 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/dft/simd/codlist.mk \
+	$(top_srcdir)/dft/simd/simd.mk
+subdir = dft/simd/avx
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_avx_codelets_la_LIBADD =
+am__libdft_avx_codelets_la_SOURCES_DIST = n1fv_2.c n1fv_3.c n1fv_4.c \
+	n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c n1fv_9.c n1fv_10.c \
+	n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c n1fv_16.c \
+	n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c n1bv_2.c \
+	n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c n1bv_9.c \
+	n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c \
+	n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c \
+	n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c \
+	n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c n2bv_2.c \
+	n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c n2bv_14.c \
+	n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c n2sv_4.c n2sv_8.c \
+	n2sv_16.c n2sv_32.c n2sv_64.c t1fuv_2.c t1fuv_3.c t1fuv_4.c \
+	t1fuv_5.c t1fuv_6.c t1fuv_7.c t1fuv_8.c t1fuv_9.c t1fuv_10.c \
+	t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c \
+	t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c \
+	t1fv_64.c t1fv_20.c t1fv_25.c t2fv_2.c t2fv_4.c t2fv_8.c \
+	t2fv_16.c t2fv_32.c t2fv_64.c t2fv_5.c t2fv_10.c t2fv_20.c \
+	t2fv_25.c t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c \
+	t3fv_10.c t3fv_20.c t3fv_25.c t1buv_2.c t1buv_3.c t1buv_4.c \
+	t1buv_5.c t1buv_6.c t1buv_7.c t1buv_8.c t1buv_9.c t1buv_10.c \
+	t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c \
+	t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c \
+	t1bv_64.c t1bv_20.c t1bv_25.c t2bv_2.c t2bv_4.c t2bv_8.c \
+	t2bv_16.c t2bv_32.c t2bv_64.c t2bv_5.c t2bv_10.c t2bv_20.c \
+	t2bv_25.c t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c \
+	t3bv_10.c t3bv_20.c t3bv_25.c t1sv_2.c t1sv_4.c t1sv_8.c \
+	t1sv_16.c t1sv_32.c t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c \
+	q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c q1bv_2.c q1bv_4.c q1bv_5.c \
+	q1bv_8.c genus.c codlist.c
+am__objects_1 = n1fv_2.lo n1fv_3.lo n1fv_4.lo n1fv_5.lo n1fv_6.lo \
+	n1fv_7.lo n1fv_8.lo n1fv_9.lo n1fv_10.lo n1fv_11.lo n1fv_12.lo \
+	n1fv_13.lo n1fv_14.lo n1fv_15.lo n1fv_16.lo n1fv_32.lo \
+	n1fv_64.lo n1fv_128.lo n1fv_20.lo n1fv_25.lo
+am__objects_2 = n1bv_2.lo n1bv_3.lo n1bv_4.lo n1bv_5.lo n1bv_6.lo \
+	n1bv_7.lo n1bv_8.lo n1bv_9.lo n1bv_10.lo n1bv_11.lo n1bv_12.lo \
+	n1bv_13.lo n1bv_14.lo n1bv_15.lo n1bv_16.lo n1bv_32.lo \
+	n1bv_64.lo n1bv_128.lo n1bv_20.lo n1bv_25.lo
+am__objects_3 = n2fv_2.lo n2fv_4.lo n2fv_6.lo n2fv_8.lo n2fv_10.lo \
+	n2fv_12.lo n2fv_14.lo n2fv_16.lo n2fv_32.lo n2fv_64.lo \
+	n2fv_20.lo
+am__objects_4 = n2bv_2.lo n2bv_4.lo n2bv_6.lo n2bv_8.lo n2bv_10.lo \
+	n2bv_12.lo n2bv_14.lo n2bv_16.lo n2bv_32.lo n2bv_64.lo \
+	n2bv_20.lo
+am__objects_5 = n2sv_4.lo n2sv_8.lo n2sv_16.lo n2sv_32.lo n2sv_64.lo
+am__objects_6 = t1fuv_2.lo t1fuv_3.lo t1fuv_4.lo t1fuv_5.lo t1fuv_6.lo \
+	t1fuv_7.lo t1fuv_8.lo t1fuv_9.lo t1fuv_10.lo
+am__objects_7 = t1fv_2.lo t1fv_3.lo t1fv_4.lo t1fv_5.lo t1fv_6.lo \
+	t1fv_7.lo t1fv_8.lo t1fv_9.lo t1fv_10.lo t1fv_12.lo t1fv_15.lo \
+	t1fv_16.lo t1fv_32.lo t1fv_64.lo t1fv_20.lo t1fv_25.lo
+am__objects_8 = t2fv_2.lo t2fv_4.lo t2fv_8.lo t2fv_16.lo t2fv_32.lo \
+	t2fv_64.lo t2fv_5.lo t2fv_10.lo t2fv_20.lo t2fv_25.lo
+am__objects_9 = t3fv_4.lo t3fv_8.lo t3fv_16.lo t3fv_32.lo t3fv_5.lo \
+	t3fv_10.lo t3fv_20.lo t3fv_25.lo
+am__objects_10 = t1buv_2.lo t1buv_3.lo t1buv_4.lo t1buv_5.lo \
+	t1buv_6.lo t1buv_7.lo t1buv_8.lo t1buv_9.lo t1buv_10.lo
+am__objects_11 = t1bv_2.lo t1bv_3.lo t1bv_4.lo t1bv_5.lo t1bv_6.lo \
+	t1bv_7.lo t1bv_8.lo t1bv_9.lo t1bv_10.lo t1bv_12.lo t1bv_15.lo \
+	t1bv_16.lo t1bv_32.lo t1bv_64.lo t1bv_20.lo t1bv_25.lo
+am__objects_12 = t2bv_2.lo t2bv_4.lo t2bv_8.lo t2bv_16.lo t2bv_32.lo \
+	t2bv_64.lo t2bv_5.lo t2bv_10.lo t2bv_20.lo t2bv_25.lo
+am__objects_13 = t3bv_4.lo t3bv_8.lo t3bv_16.lo t3bv_32.lo t3bv_5.lo \
+	t3bv_10.lo t3bv_20.lo t3bv_25.lo
+am__objects_14 = t1sv_2.lo t1sv_4.lo t1sv_8.lo t1sv_16.lo t1sv_32.lo
+am__objects_15 = t2sv_4.lo t2sv_8.lo t2sv_16.lo t2sv_32.lo
+am__objects_16 = q1fv_2.lo q1fv_4.lo q1fv_5.lo q1fv_8.lo
+am__objects_17 = q1bv_2.lo q1bv_4.lo q1bv_5.lo q1bv_8.lo
+am__objects_18 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8) $(am__objects_9) \
+	$(am__objects_10) $(am__objects_11) $(am__objects_12) \
+	$(am__objects_13) $(am__objects_14) $(am__objects_15) \
+	$(am__objects_16) $(am__objects_17)
+am__objects_19 = $(am__objects_18) genus.lo codlist.lo
+@HAVE_AVX_TRUE@am__objects_20 = $(am__objects_19)
+@HAVE_AVX_TRUE@am_libdft_avx_codelets_la_OBJECTS = $(am__objects_20)
+libdft_avx_codelets_la_OBJECTS = $(am_libdft_avx_codelets_la_OBJECTS)
+@HAVE_AVX_TRUE@am_libdft_avx_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_avx_codelets_la_SOURCES)
+DIST_SOURCES = $(am__libdft_avx_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(AVX_CFLAGS)
+SIMD_HEADER = simd-avx.h
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
+
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_AVX_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_AVX_TRUE@noinst_LTLIBRARIES = libdft_avx_codelets.la
+@HAVE_AVX_TRUE@libdft_avx_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/avx/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/avx/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_avx_codelets.la: $(libdft_avx_codelets_la_OBJECTS) $(libdft_avx_codelets_la_DEPENDENCIES) $(EXTRA_libdft_avx_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_libdft_avx_codelets_la_rpath) $(libdft_avx_codelets_la_OBJECTS) $(libdft_avx_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2sv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2sv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2sv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/n2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/n2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/n2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/q1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/q1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/q1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1buv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1buv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1buv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fuv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fuv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fuv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1sv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1sv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1sv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t1sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t1sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t1sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/avx/t3fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/avx/t3fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/t3fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/codlist.mk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/codlist.mk	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,73 @@
+# include the list of codelets
+
+include $(top_srcdir)/dft/simd/codlist.mk
+
+ALL_CODELETS = $(SIMD_CODELETS)
+BUILT_SOURCES= $(SIMD_CODELETS) $(CODLIST)
+EXTRA_DIST = $(BUILT_SOURCES) genus.c
+INCLUDE_SIMD_HEADER="\#include SIMD_HEADER"
+XRENAME=XSIMD
+SOLVTAB_NAME = XSIMD(solvtab_dft)
+
+# include special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+
+GFLAGS = -simd $(FLAGS_COMMON) -pipeline-latency 8
+FLAGS_T2S=-twiddle-log3 -precompute-twiddles
+FLAGS_T3=-twiddle-log3 -precompute-twiddles -no-generate-bytw
+
+n1fv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -n $* -name n1fv_$* -include "n1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+n2fv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -n $* -name n2fv_$* -with-ostride 2 -include "n2f.h" -store-multiple 2) | $(ADD_DATE) | $(INDENT) >$@
+
+n1bv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -sign 1 -n $* -name n1bv_$* -include "n1b.h") | $(ADD_DATE) | $(INDENT) >$@
+
+n2bv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -sign 1 -n $* -name n2bv_$* -with-ostride 2 -include "n2b.h"  -store-multiple 2) | $(ADD_DATE) | $(INDENT) >$@
+
+n2sv_%.c:  $(CODELET_DEPS) $(GEN_NOTW)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(GFLAGS) -n $* -name n2sv_$* -with-ostride 1 -include "n2s.h" -store-multiple 4) | $(ADD_DATE) | $(INDENT) >$@
+
+t1fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1fv_$* -include "t1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t1fuv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1fuv_$* -include "t1fu.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t2fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t2fv_$* -include "t2f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t3fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) $(FLAGS_T3) -n $* -name t3fv_$* -include "t3f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t1bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1bv_$* -include "t1b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+t1buv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1buv_$* -include "t1bu.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+t2bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t2bv_$* -include "t2b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+t3bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) $(FLAGS_T3) -n $* -name t3bv_$* -include "t3b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+t1sv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(GFLAGS) -n $* -name t1sv_$* -include "ts.h") | $(ADD_DATE) | $(INDENT) >$@
+
+t2sv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(GFLAGS) $(FLAGS_T2S) -n $* -name t2sv_$* -include "ts.h") | $(ADD_DATE) | $(INDENT) >$@
+
+q1fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ_C) $(GFLAGS) -n $* -dif -name q1fv_$* -include "q1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+q1bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ_C)
+	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ_C) $(GFLAGS) -n $* -dif -name q1bv_$* -include "q1b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,650 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# include the list of codelets
+
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/dft/simd/codlist.mk \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = dft/simd/common
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
+
+ALL_CODELETS = $(SIMD_CODELETS)
+BUILT_SOURCES = $(SIMD_CODELETS) $(CODLIST)
+EXTRA_DIST = $(BUILT_SOURCES) genus.c
+INCLUDE_SIMD_HEADER = "\#include SIMD_HEADER"
+XRENAME = XSIMD
+SOLVTAB_NAME = XSIMD(solvtab_dft)
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# include special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@GFLAGS = -simd $(FLAGS_COMMON) -pipeline-latency 8
+@MAINTAINER_MODE_TRUE@FLAGS_T2S = -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_T3 = -twiddle-log3 -precompute-twiddles -no-generate-bytw
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/common/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/common/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	distclean distclean-generic distclean-libtool distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic maintainer-clean-local mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@n1fv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -n $* -name n1fv_$* -include "n1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@n2fv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -n $* -name n2fv_$* -with-ostride 2 -include "n2f.h" -store-multiple 2) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@n1bv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -sign 1 -n $* -name n1bv_$* -include "n1b.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@n2bv_%.c:  $(CODELET_DEPS) $(GEN_NOTW_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW_C) $(GFLAGS) -sign 1 -n $* -name n2bv_$* -with-ostride 2 -include "n2b.h"  -store-multiple 2) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@n2sv_%.c:  $(CODELET_DEPS) $(GEN_NOTW)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(GFLAGS) -n $* -name n2sv_$* -with-ostride 1 -include "n2s.h" -store-multiple 4) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1fv_$* -include "t1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1fuv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1fuv_$* -include "t1fu.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t2fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t2fv_$* -include "t2f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t3fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) $(FLAGS_T3) -n $* -name t3fv_$* -include "t3f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1bv_$* -include "t1b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1buv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t1buv_$* -include "t1bu.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t2bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) -n $* -name t2bv_$* -include "t2b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t3bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE_C) $(GFLAGS) $(FLAGS_T3) -n $* -name t3bv_$* -include "t3b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t1sv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(GFLAGS) -n $* -name t1sv_$* -include "ts.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@t2sv_%.c:  $(CODELET_DEPS) $(GEN_TWIDDLE)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(GFLAGS) $(FLAGS_T2S) -n $* -name t2sv_$* -include "ts.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@q1fv_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ_C) $(GFLAGS) -n $* -dif -name q1fv_$* -include "q1f.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@q1bv_%.c:  $(CODELET_DEPS) $(GEN_TWIDSQ_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ_C) $(GFLAGS) -n $* -dif -name q1bv_$* -include "q1b.h" -sign 1) | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,349 @@
+#include "ifftw.h"
+#include SIMD_HEADER
+
+extern void XSIMD(codelet_n1fv_2)(planner *);
+extern void XSIMD(codelet_n1fv_3)(planner *);
+extern void XSIMD(codelet_n1fv_4)(planner *);
+extern void XSIMD(codelet_n1fv_5)(planner *);
+extern void XSIMD(codelet_n1fv_6)(planner *);
+extern void XSIMD(codelet_n1fv_7)(planner *);
+extern void XSIMD(codelet_n1fv_8)(planner *);
+extern void XSIMD(codelet_n1fv_9)(planner *);
+extern void XSIMD(codelet_n1fv_10)(planner *);
+extern void XSIMD(codelet_n1fv_11)(planner *);
+extern void XSIMD(codelet_n1fv_12)(planner *);
+extern void XSIMD(codelet_n1fv_13)(planner *);
+extern void XSIMD(codelet_n1fv_14)(planner *);
+extern void XSIMD(codelet_n1fv_15)(planner *);
+extern void XSIMD(codelet_n1fv_16)(planner *);
+extern void XSIMD(codelet_n1fv_32)(planner *);
+extern void XSIMD(codelet_n1fv_64)(planner *);
+extern void XSIMD(codelet_n1fv_128)(planner *);
+extern void XSIMD(codelet_n1fv_20)(planner *);
+extern void XSIMD(codelet_n1fv_25)(planner *);
+extern void XSIMD(codelet_n1bv_2)(planner *);
+extern void XSIMD(codelet_n1bv_3)(planner *);
+extern void XSIMD(codelet_n1bv_4)(planner *);
+extern void XSIMD(codelet_n1bv_5)(planner *);
+extern void XSIMD(codelet_n1bv_6)(planner *);
+extern void XSIMD(codelet_n1bv_7)(planner *);
+extern void XSIMD(codelet_n1bv_8)(planner *);
+extern void XSIMD(codelet_n1bv_9)(planner *);
+extern void XSIMD(codelet_n1bv_10)(planner *);
+extern void XSIMD(codelet_n1bv_11)(planner *);
+extern void XSIMD(codelet_n1bv_12)(planner *);
+extern void XSIMD(codelet_n1bv_13)(planner *);
+extern void XSIMD(codelet_n1bv_14)(planner *);
+extern void XSIMD(codelet_n1bv_15)(planner *);
+extern void XSIMD(codelet_n1bv_16)(planner *);
+extern void XSIMD(codelet_n1bv_32)(planner *);
+extern void XSIMD(codelet_n1bv_64)(planner *);
+extern void XSIMD(codelet_n1bv_128)(planner *);
+extern void XSIMD(codelet_n1bv_20)(planner *);
+extern void XSIMD(codelet_n1bv_25)(planner *);
+extern void XSIMD(codelet_n2fv_2)(planner *);
+extern void XSIMD(codelet_n2fv_4)(planner *);
+extern void XSIMD(codelet_n2fv_6)(planner *);
+extern void XSIMD(codelet_n2fv_8)(planner *);
+extern void XSIMD(codelet_n2fv_10)(planner *);
+extern void XSIMD(codelet_n2fv_12)(planner *);
+extern void XSIMD(codelet_n2fv_14)(planner *);
+extern void XSIMD(codelet_n2fv_16)(planner *);
+extern void XSIMD(codelet_n2fv_32)(planner *);
+extern void XSIMD(codelet_n2fv_64)(planner *);
+extern void XSIMD(codelet_n2fv_20)(planner *);
+extern void XSIMD(codelet_n2bv_2)(planner *);
+extern void XSIMD(codelet_n2bv_4)(planner *);
+extern void XSIMD(codelet_n2bv_6)(planner *);
+extern void XSIMD(codelet_n2bv_8)(planner *);
+extern void XSIMD(codelet_n2bv_10)(planner *);
+extern void XSIMD(codelet_n2bv_12)(planner *);
+extern void XSIMD(codelet_n2bv_14)(planner *);
+extern void XSIMD(codelet_n2bv_16)(planner *);
+extern void XSIMD(codelet_n2bv_32)(planner *);
+extern void XSIMD(codelet_n2bv_64)(planner *);
+extern void XSIMD(codelet_n2bv_20)(planner *);
+extern void XSIMD(codelet_n2sv_4)(planner *);
+extern void XSIMD(codelet_n2sv_8)(planner *);
+extern void XSIMD(codelet_n2sv_16)(planner *);
+extern void XSIMD(codelet_n2sv_32)(planner *);
+extern void XSIMD(codelet_n2sv_64)(planner *);
+extern void XSIMD(codelet_t1fuv_2)(planner *);
+extern void XSIMD(codelet_t1fuv_3)(planner *);
+extern void XSIMD(codelet_t1fuv_4)(planner *);
+extern void XSIMD(codelet_t1fuv_5)(planner *);
+extern void XSIMD(codelet_t1fuv_6)(planner *);
+extern void XSIMD(codelet_t1fuv_7)(planner *);
+extern void XSIMD(codelet_t1fuv_8)(planner *);
+extern void XSIMD(codelet_t1fuv_9)(planner *);
+extern void XSIMD(codelet_t1fuv_10)(planner *);
+extern void XSIMD(codelet_t1fv_2)(planner *);
+extern void XSIMD(codelet_t1fv_3)(planner *);
+extern void XSIMD(codelet_t1fv_4)(planner *);
+extern void XSIMD(codelet_t1fv_5)(planner *);
+extern void XSIMD(codelet_t1fv_6)(planner *);
+extern void XSIMD(codelet_t1fv_7)(planner *);
+extern void XSIMD(codelet_t1fv_8)(planner *);
+extern void XSIMD(codelet_t1fv_9)(planner *);
+extern void XSIMD(codelet_t1fv_10)(planner *);
+extern void XSIMD(codelet_t1fv_12)(planner *);
+extern void XSIMD(codelet_t1fv_15)(planner *);
+extern void XSIMD(codelet_t1fv_16)(planner *);
+extern void XSIMD(codelet_t1fv_32)(planner *);
+extern void XSIMD(codelet_t1fv_64)(planner *);
+extern void XSIMD(codelet_t1fv_20)(planner *);
+extern void XSIMD(codelet_t1fv_25)(planner *);
+extern void XSIMD(codelet_t2fv_2)(planner *);
+extern void XSIMD(codelet_t2fv_4)(planner *);
+extern void XSIMD(codelet_t2fv_8)(planner *);
+extern void XSIMD(codelet_t2fv_16)(planner *);
+extern void XSIMD(codelet_t2fv_32)(planner *);
+extern void XSIMD(codelet_t2fv_64)(planner *);
+extern void XSIMD(codelet_t2fv_5)(planner *);
+extern void XSIMD(codelet_t2fv_10)(planner *);
+extern void XSIMD(codelet_t2fv_20)(planner *);
+extern void XSIMD(codelet_t2fv_25)(planner *);
+extern void XSIMD(codelet_t3fv_4)(planner *);
+extern void XSIMD(codelet_t3fv_8)(planner *);
+extern void XSIMD(codelet_t3fv_16)(planner *);
+extern void XSIMD(codelet_t3fv_32)(planner *);
+extern void XSIMD(codelet_t3fv_5)(planner *);
+extern void XSIMD(codelet_t3fv_10)(planner *);
+extern void XSIMD(codelet_t3fv_20)(planner *);
+extern void XSIMD(codelet_t3fv_25)(planner *);
+extern void XSIMD(codelet_t1buv_2)(planner *);
+extern void XSIMD(codelet_t1buv_3)(planner *);
+extern void XSIMD(codelet_t1buv_4)(planner *);
+extern void XSIMD(codelet_t1buv_5)(planner *);
+extern void XSIMD(codelet_t1buv_6)(planner *);
+extern void XSIMD(codelet_t1buv_7)(planner *);
+extern void XSIMD(codelet_t1buv_8)(planner *);
+extern void XSIMD(codelet_t1buv_9)(planner *);
+extern void XSIMD(codelet_t1buv_10)(planner *);
+extern void XSIMD(codelet_t1bv_2)(planner *);
+extern void XSIMD(codelet_t1bv_3)(planner *);
+extern void XSIMD(codelet_t1bv_4)(planner *);
+extern void XSIMD(codelet_t1bv_5)(planner *);
+extern void XSIMD(codelet_t1bv_6)(planner *);
+extern void XSIMD(codelet_t1bv_7)(planner *);
+extern void XSIMD(codelet_t1bv_8)(planner *);
+extern void XSIMD(codelet_t1bv_9)(planner *);
+extern void XSIMD(codelet_t1bv_10)(planner *);
+extern void XSIMD(codelet_t1bv_12)(planner *);
+extern void XSIMD(codelet_t1bv_15)(planner *);
+extern void XSIMD(codelet_t1bv_16)(planner *);
+extern void XSIMD(codelet_t1bv_32)(planner *);
+extern void XSIMD(codelet_t1bv_64)(planner *);
+extern void XSIMD(codelet_t1bv_20)(planner *);
+extern void XSIMD(codelet_t1bv_25)(planner *);
+extern void XSIMD(codelet_t2bv_2)(planner *);
+extern void XSIMD(codelet_t2bv_4)(planner *);
+extern void XSIMD(codelet_t2bv_8)(planner *);
+extern void XSIMD(codelet_t2bv_16)(planner *);
+extern void XSIMD(codelet_t2bv_32)(planner *);
+extern void XSIMD(codelet_t2bv_64)(planner *);
+extern void XSIMD(codelet_t2bv_5)(planner *);
+extern void XSIMD(codelet_t2bv_10)(planner *);
+extern void XSIMD(codelet_t2bv_20)(planner *);
+extern void XSIMD(codelet_t2bv_25)(planner *);
+extern void XSIMD(codelet_t3bv_4)(planner *);
+extern void XSIMD(codelet_t3bv_8)(planner *);
+extern void XSIMD(codelet_t3bv_16)(planner *);
+extern void XSIMD(codelet_t3bv_32)(planner *);
+extern void XSIMD(codelet_t3bv_5)(planner *);
+extern void XSIMD(codelet_t3bv_10)(planner *);
+extern void XSIMD(codelet_t3bv_20)(planner *);
+extern void XSIMD(codelet_t3bv_25)(planner *);
+extern void XSIMD(codelet_t1sv_2)(planner *);
+extern void XSIMD(codelet_t1sv_4)(planner *);
+extern void XSIMD(codelet_t1sv_8)(planner *);
+extern void XSIMD(codelet_t1sv_16)(planner *);
+extern void XSIMD(codelet_t1sv_32)(planner *);
+extern void XSIMD(codelet_t2sv_4)(planner *);
+extern void XSIMD(codelet_t2sv_8)(planner *);
+extern void XSIMD(codelet_t2sv_16)(planner *);
+extern void XSIMD(codelet_t2sv_32)(planner *);
+extern void XSIMD(codelet_q1fv_2)(planner *);
+extern void XSIMD(codelet_q1fv_4)(planner *);
+extern void XSIMD(codelet_q1fv_5)(planner *);
+extern void XSIMD(codelet_q1fv_8)(planner *);
+extern void XSIMD(codelet_q1bv_2)(planner *);
+extern void XSIMD(codelet_q1bv_4)(planner *);
+extern void XSIMD(codelet_q1bv_5)(planner *);
+extern void XSIMD(codelet_q1bv_8)(planner *);
+
+
+extern const solvtab XSIMD(solvtab_dft);
+const solvtab XSIMD(solvtab_dft) = {
+   SOLVTAB(XSIMD(codelet_n1fv_2)),
+   SOLVTAB(XSIMD(codelet_n1fv_3)),
+   SOLVTAB(XSIMD(codelet_n1fv_4)),
+   SOLVTAB(XSIMD(codelet_n1fv_5)),
+   SOLVTAB(XSIMD(codelet_n1fv_6)),
+   SOLVTAB(XSIMD(codelet_n1fv_7)),
+   SOLVTAB(XSIMD(codelet_n1fv_8)),
+   SOLVTAB(XSIMD(codelet_n1fv_9)),
+   SOLVTAB(XSIMD(codelet_n1fv_10)),
+   SOLVTAB(XSIMD(codelet_n1fv_11)),
+   SOLVTAB(XSIMD(codelet_n1fv_12)),
+   SOLVTAB(XSIMD(codelet_n1fv_13)),
+   SOLVTAB(XSIMD(codelet_n1fv_14)),
+   SOLVTAB(XSIMD(codelet_n1fv_15)),
+   SOLVTAB(XSIMD(codelet_n1fv_16)),
+   SOLVTAB(XSIMD(codelet_n1fv_32)),
+   SOLVTAB(XSIMD(codelet_n1fv_64)),
+   SOLVTAB(XSIMD(codelet_n1fv_128)),
+   SOLVTAB(XSIMD(codelet_n1fv_20)),
+   SOLVTAB(XSIMD(codelet_n1fv_25)),
+   SOLVTAB(XSIMD(codelet_n1bv_2)),
+   SOLVTAB(XSIMD(codelet_n1bv_3)),
+   SOLVTAB(XSIMD(codelet_n1bv_4)),
+   SOLVTAB(XSIMD(codelet_n1bv_5)),
+   SOLVTAB(XSIMD(codelet_n1bv_6)),
+   SOLVTAB(XSIMD(codelet_n1bv_7)),
+   SOLVTAB(XSIMD(codelet_n1bv_8)),
+   SOLVTAB(XSIMD(codelet_n1bv_9)),
+   SOLVTAB(XSIMD(codelet_n1bv_10)),
+   SOLVTAB(XSIMD(codelet_n1bv_11)),
+   SOLVTAB(XSIMD(codelet_n1bv_12)),
+   SOLVTAB(XSIMD(codelet_n1bv_13)),
+   SOLVTAB(XSIMD(codelet_n1bv_14)),
+   SOLVTAB(XSIMD(codelet_n1bv_15)),
+   SOLVTAB(XSIMD(codelet_n1bv_16)),
+   SOLVTAB(XSIMD(codelet_n1bv_32)),
+   SOLVTAB(XSIMD(codelet_n1bv_64)),
+   SOLVTAB(XSIMD(codelet_n1bv_128)),
+   SOLVTAB(XSIMD(codelet_n1bv_20)),
+   SOLVTAB(XSIMD(codelet_n1bv_25)),
+   SOLVTAB(XSIMD(codelet_n2fv_2)),
+   SOLVTAB(XSIMD(codelet_n2fv_4)),
+   SOLVTAB(XSIMD(codelet_n2fv_6)),
+   SOLVTAB(XSIMD(codelet_n2fv_8)),
+   SOLVTAB(XSIMD(codelet_n2fv_10)),
+   SOLVTAB(XSIMD(codelet_n2fv_12)),
+   SOLVTAB(XSIMD(codelet_n2fv_14)),
+   SOLVTAB(XSIMD(codelet_n2fv_16)),
+   SOLVTAB(XSIMD(codelet_n2fv_32)),
+   SOLVTAB(XSIMD(codelet_n2fv_64)),
+   SOLVTAB(XSIMD(codelet_n2fv_20)),
+   SOLVTAB(XSIMD(codelet_n2bv_2)),
+   SOLVTAB(XSIMD(codelet_n2bv_4)),
+   SOLVTAB(XSIMD(codelet_n2bv_6)),
+   SOLVTAB(XSIMD(codelet_n2bv_8)),
+   SOLVTAB(XSIMD(codelet_n2bv_10)),
+   SOLVTAB(XSIMD(codelet_n2bv_12)),
+   SOLVTAB(XSIMD(codelet_n2bv_14)),
+   SOLVTAB(XSIMD(codelet_n2bv_16)),
+   SOLVTAB(XSIMD(codelet_n2bv_32)),
+   SOLVTAB(XSIMD(codelet_n2bv_64)),
+   SOLVTAB(XSIMD(codelet_n2bv_20)),
+   SOLVTAB(XSIMD(codelet_n2sv_4)),
+   SOLVTAB(XSIMD(codelet_n2sv_8)),
+   SOLVTAB(XSIMD(codelet_n2sv_16)),
+   SOLVTAB(XSIMD(codelet_n2sv_32)),
+   SOLVTAB(XSIMD(codelet_n2sv_64)),
+   SOLVTAB(XSIMD(codelet_t1fuv_2)),
+   SOLVTAB(XSIMD(codelet_t1fuv_3)),
+   SOLVTAB(XSIMD(codelet_t1fuv_4)),
+   SOLVTAB(XSIMD(codelet_t1fuv_5)),
+   SOLVTAB(XSIMD(codelet_t1fuv_6)),
+   SOLVTAB(XSIMD(codelet_t1fuv_7)),
+   SOLVTAB(XSIMD(codelet_t1fuv_8)),
+   SOLVTAB(XSIMD(codelet_t1fuv_9)),
+   SOLVTAB(XSIMD(codelet_t1fuv_10)),
+   SOLVTAB(XSIMD(codelet_t1fv_2)),
+   SOLVTAB(XSIMD(codelet_t1fv_3)),
+   SOLVTAB(XSIMD(codelet_t1fv_4)),
+   SOLVTAB(XSIMD(codelet_t1fv_5)),
+   SOLVTAB(XSIMD(codelet_t1fv_6)),
+   SOLVTAB(XSIMD(codelet_t1fv_7)),
+   SOLVTAB(XSIMD(codelet_t1fv_8)),
+   SOLVTAB(XSIMD(codelet_t1fv_9)),
+   SOLVTAB(XSIMD(codelet_t1fv_10)),
+   SOLVTAB(XSIMD(codelet_t1fv_12)),
+   SOLVTAB(XSIMD(codelet_t1fv_15)),
+   SOLVTAB(XSIMD(codelet_t1fv_16)),
+   SOLVTAB(XSIMD(codelet_t1fv_32)),
+   SOLVTAB(XSIMD(codelet_t1fv_64)),
+   SOLVTAB(XSIMD(codelet_t1fv_20)),
+   SOLVTAB(XSIMD(codelet_t1fv_25)),
+   SOLVTAB(XSIMD(codelet_t2fv_2)),
+   SOLVTAB(XSIMD(codelet_t2fv_4)),
+   SOLVTAB(XSIMD(codelet_t2fv_8)),
+   SOLVTAB(XSIMD(codelet_t2fv_16)),
+   SOLVTAB(XSIMD(codelet_t2fv_32)),
+   SOLVTAB(XSIMD(codelet_t2fv_64)),
+   SOLVTAB(XSIMD(codelet_t2fv_5)),
+   SOLVTAB(XSIMD(codelet_t2fv_10)),
+   SOLVTAB(XSIMD(codelet_t2fv_20)),
+   SOLVTAB(XSIMD(codelet_t2fv_25)),
+   SOLVTAB(XSIMD(codelet_t3fv_4)),
+   SOLVTAB(XSIMD(codelet_t3fv_8)),
+   SOLVTAB(XSIMD(codelet_t3fv_16)),
+   SOLVTAB(XSIMD(codelet_t3fv_32)),
+   SOLVTAB(XSIMD(codelet_t3fv_5)),
+   SOLVTAB(XSIMD(codelet_t3fv_10)),
+   SOLVTAB(XSIMD(codelet_t3fv_20)),
+   SOLVTAB(XSIMD(codelet_t3fv_25)),
+   SOLVTAB(XSIMD(codelet_t1buv_2)),
+   SOLVTAB(XSIMD(codelet_t1buv_3)),
+   SOLVTAB(XSIMD(codelet_t1buv_4)),
+   SOLVTAB(XSIMD(codelet_t1buv_5)),
+   SOLVTAB(XSIMD(codelet_t1buv_6)),
+   SOLVTAB(XSIMD(codelet_t1buv_7)),
+   SOLVTAB(XSIMD(codelet_t1buv_8)),
+   SOLVTAB(XSIMD(codelet_t1buv_9)),
+   SOLVTAB(XSIMD(codelet_t1buv_10)),
+   SOLVTAB(XSIMD(codelet_t1bv_2)),
+   SOLVTAB(XSIMD(codelet_t1bv_3)),
+   SOLVTAB(XSIMD(codelet_t1bv_4)),
+   SOLVTAB(XSIMD(codelet_t1bv_5)),
+   SOLVTAB(XSIMD(codelet_t1bv_6)),
+   SOLVTAB(XSIMD(codelet_t1bv_7)),
+   SOLVTAB(XSIMD(codelet_t1bv_8)),
+   SOLVTAB(XSIMD(codelet_t1bv_9)),
+   SOLVTAB(XSIMD(codelet_t1bv_10)),
+   SOLVTAB(XSIMD(codelet_t1bv_12)),
+   SOLVTAB(XSIMD(codelet_t1bv_15)),
+   SOLVTAB(XSIMD(codelet_t1bv_16)),
+   SOLVTAB(XSIMD(codelet_t1bv_32)),
+   SOLVTAB(XSIMD(codelet_t1bv_64)),
+   SOLVTAB(XSIMD(codelet_t1bv_20)),
+   SOLVTAB(XSIMD(codelet_t1bv_25)),
+   SOLVTAB(XSIMD(codelet_t2bv_2)),
+   SOLVTAB(XSIMD(codelet_t2bv_4)),
+   SOLVTAB(XSIMD(codelet_t2bv_8)),
+   SOLVTAB(XSIMD(codelet_t2bv_16)),
+   SOLVTAB(XSIMD(codelet_t2bv_32)),
+   SOLVTAB(XSIMD(codelet_t2bv_64)),
+   SOLVTAB(XSIMD(codelet_t2bv_5)),
+   SOLVTAB(XSIMD(codelet_t2bv_10)),
+   SOLVTAB(XSIMD(codelet_t2bv_20)),
+   SOLVTAB(XSIMD(codelet_t2bv_25)),
+   SOLVTAB(XSIMD(codelet_t3bv_4)),
+   SOLVTAB(XSIMD(codelet_t3bv_8)),
+   SOLVTAB(XSIMD(codelet_t3bv_16)),
+   SOLVTAB(XSIMD(codelet_t3bv_32)),
+   SOLVTAB(XSIMD(codelet_t3bv_5)),
+   SOLVTAB(XSIMD(codelet_t3bv_10)),
+   SOLVTAB(XSIMD(codelet_t3bv_20)),
+   SOLVTAB(XSIMD(codelet_t3bv_25)),
+   SOLVTAB(XSIMD(codelet_t1sv_2)),
+   SOLVTAB(XSIMD(codelet_t1sv_4)),
+   SOLVTAB(XSIMD(codelet_t1sv_8)),
+   SOLVTAB(XSIMD(codelet_t1sv_16)),
+   SOLVTAB(XSIMD(codelet_t1sv_32)),
+   SOLVTAB(XSIMD(codelet_t2sv_4)),
+   SOLVTAB(XSIMD(codelet_t2sv_8)),
+   SOLVTAB(XSIMD(codelet_t2sv_16)),
+   SOLVTAB(XSIMD(codelet_t2sv_32)),
+   SOLVTAB(XSIMD(codelet_q1fv_2)),
+   SOLVTAB(XSIMD(codelet_q1fv_4)),
+   SOLVTAB(XSIMD(codelet_q1fv_5)),
+   SOLVTAB(XSIMD(codelet_q1fv_8)),
+   SOLVTAB(XSIMD(codelet_q1bv_2)),
+   SOLVTAB(XSIMD(codelet_q1bv_4)),
+   SOLVTAB(XSIMD(codelet_q1bv_5)),
+   SOLVTAB(XSIMD(codelet_q1bv_8)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-dft.h"
+#include SIMD_HEADER
+
+#define EXTERN_CONST(t, x) extern const t x; const t x
+
+static int n1b_okp(const kdft_desc *d,
+		   const R *ri, const R *ii, const R *ro, const R *io,
+		   INT is, INT os, INT vl, INT ivs, INT ovs, 
+		   const planner *plnr)
+{
+     return (1
+             && ALIGNED(ii)
+             && ALIGNED(io)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(is)
+	     && SIMD_STRIDE_OK(os)
+	     && SIMD_VSTRIDE_OK(ivs)
+	     && SIMD_VSTRIDE_OK(ovs)
+             && ri == ii + 1
+             && ro == io + 1
+             && (vl % VL) == 0
+             && (!d->is || (d->is == is))
+             && (!d->os || (d->os == os))
+             && (!d->ivs || (d->ivs == ivs))
+             && (!d->ovs || (d->ovs == ovs))
+          );
+}
+
+EXTERN_CONST(kdft_genus, XSIMD(dft_n1bsimd_genus)) = { n1b_okp, VL };
+
+static int n1f_okp(const kdft_desc *d,
+		   const R *ri, const R *ii, const R *ro, const R *io,
+		   INT is, INT os, INT vl, INT ivs, INT ovs, 
+		   const planner *plnr)
+{
+     return (1
+             && ALIGNED(ri)
+             && ALIGNED(ro)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(is)
+	     && SIMD_STRIDE_OK(os)
+	     && SIMD_VSTRIDE_OK(ivs)
+	     && SIMD_VSTRIDE_OK(ovs)
+             && ii == ri + 1
+             && io == ro + 1
+             && (vl % VL) == 0
+             && (!d->is || (d->is == is))
+             && (!d->os || (d->os == os))
+             && (!d->ivs || (d->ivs == ivs))
+             && (!d->ovs || (d->ovs == ovs))
+          );
+}
+
+EXTERN_CONST(kdft_genus, XSIMD(dft_n1fsimd_genus)) = { n1f_okp, VL };
+
+static int n2b_okp(const kdft_desc *d,
+		   const R *ri, const R *ii, const R *ro, const R *io,
+		   INT is, INT os, INT vl, INT ivs, INT ovs, 
+		   const planner *plnr)
+{
+     return (1
+             && ALIGNEDA(ii)
+             && ALIGNEDA(io)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OKA(is)
+	     && SIMD_VSTRIDE_OKA(ivs)
+	     && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
+	     && SIMD_STRIDE_OKPAIR(ovs)
+             && ri == ii + 1
+             && ro == io + 1
+             && (vl % VL) == 0
+             && (!d->is || (d->is == is))
+             && (!d->os || (d->os == os))
+             && (!d->ivs || (d->ivs == ivs))
+             && (!d->ovs || (d->ovs == ovs))
+          );
+}
+
+EXTERN_CONST(kdft_genus, XSIMD(dft_n2bsimd_genus)) = { n2b_okp, VL };
+
+static int n2f_okp(const kdft_desc *d,
+		   const R *ri, const R *ii, const R *ro, const R *io,
+		   INT is, INT os, INT vl, INT ivs, INT ovs, 
+		   const planner *plnr)
+{
+     return (1
+             && ALIGNEDA(ri)
+             && ALIGNEDA(ro)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OKA(is)
+	     && SIMD_VSTRIDE_OKA(ivs)
+	     && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
+	     && SIMD_STRIDE_OKPAIR(ovs)
+             && ii == ri + 1
+             && io == ro + 1
+             && (vl % VL) == 0
+             && (!d->is || (d->is == is))
+             && (!d->os || (d->os == os))
+             && (!d->ivs || (d->ivs == ivs))
+             && (!d->ovs || (d->ovs == ovs))
+          );
+}
+
+EXTERN_CONST(kdft_genus, XSIMD(dft_n2fsimd_genus)) = { n2f_okp, VL };
+
+static int n2s_okp(const kdft_desc *d,
+		   const R *ri, const R *ii, const R *ro, const R *io,
+		   INT is, INT os, INT vl, INT ivs, INT ovs, 
+		   const planner *plnr)
+{
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && ALIGNEDA(ri)
+	     && ALIGNEDA(ii)
+	     && ALIGNEDA(ro)
+	     && ALIGNEDA(io)
+	     && SIMD_STRIDE_OKA(is)
+	     && ivs == 1
+	     && os == 1
+	     && SIMD_STRIDE_OKA(ovs)
+	     && (vl % (2 * VL)) == 0
+	     && (!d->is || (d->is == is))
+	     && (!d->os || (d->os == os))
+	     && (!d->ivs || (d->ivs == ivs))
+	     && (!d->ovs || (d->ovs == ovs))
+	  );
+}
+
+EXTERN_CONST(kdft_genus, XSIMD(dft_n2ssimd_genus)) = { n2s_okp, 2 * VL };
+
+static int q1b_okp(const ct_desc *d,
+		   const R *rio, const R *iio, 
+		   INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		   const planner *plnr)
+{
+     return (1
+	     && ALIGNED(iio)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(rs)
+	     && SIMD_STRIDE_OK(vs)
+	     && SIMD_VSTRIDE_OK(ms)
+	     && rio == iio + 1
+	     && (m % VL) == 0
+	     && (mb % VL) == 0
+	     && (me % VL) == 0
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+EXTERN_CONST(ct_genus,  XSIMD(dft_q1bsimd_genus)) = { q1b_okp, VL };
+
+static int q1f_okp(const ct_desc *d,
+		   const R *rio, const R *iio, 
+		   INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		   const planner *plnr)
+{
+     return (1
+	     && ALIGNED(rio)
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(rs)
+	     && SIMD_STRIDE_OK(vs)
+	     && SIMD_VSTRIDE_OK(ms)
+	     && iio == rio + 1
+	     && (m % VL) == 0
+	     && (mb % VL) == 0
+	     && (me % VL) == 0
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+EXTERN_CONST(ct_genus,  XSIMD(dft_q1fsimd_genus)) = { q1f_okp, VL };
+
+static int t_okp_common(const ct_desc *d,
+			const R *rio, const R *iio, 
+			INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+			const planner *plnr)
+{
+     UNUSED(rio); UNUSED(iio);
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OKA(rs)
+	     && SIMD_VSTRIDE_OKA(ms)
+	     && (m % VL) == 0
+	     && (mb % VL) == 0
+	     && (me % VL) == 0
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+
+static int t_okp_commonu(const ct_desc *d,
+			 const R *rio, const R *iio, 
+			 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+			 const planner *plnr)
+{
+     UNUSED(rio); UNUSED(iio); UNUSED(m);
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(rs)
+	     && SIMD_VSTRIDE_OK(ms)
+	     && (mb % VL) == 0
+	     && (me % VL) == 0
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+
+static int t_okp_t1f(const ct_desc *d,
+		     const R *rio, const R *iio, 
+		     INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		     const planner *plnr)
+{
+     return  t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && iio == rio + 1
+	  && ALIGNEDA(rio);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t1fsimd_genus)) = { t_okp_t1f, VL };
+
+static int t_okp_t1fu(const ct_desc *d,
+		      const R *rio, const R *iio, 
+		      INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		      const planner *plnr)
+{
+     return  t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && iio == rio + 1
+	  && ALIGNED(rio);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t1fusimd_genus)) = { t_okp_t1fu, VL };
+
+static int t_okp_t1b(const ct_desc *d,
+		     const R *rio, const R *iio, 
+		     INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		     const planner *plnr)
+{
+     return  t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && rio == iio + 1
+	  && ALIGNEDA(iio);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t1bsimd_genus)) = { t_okp_t1b, VL };
+
+static int t_okp_t1bu(const ct_desc *d,
+		      const R *rio, const R *iio,
+		      INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		      const planner *plnr)
+{									
+     return  t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && rio == iio + 1
+	  && ALIGNED(iio);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t1busimd_genus)) = { t_okp_t1bu, VL };
+
+/* use t2* codelets only when n = m*radix is small, because
+   t2* codelets use ~2n twiddle factors (instead of ~n) */
+static int small_enough(const ct_desc *d, INT m)
+{
+     return m * d->radix <= 16384;
+}
+
+static int t_okp_t2f(const ct_desc *d,
+		     const R *rio, const R *iio, 
+		     INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		     const planner *plnr)
+{
+     return  t_okp_t1f(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && small_enough(d, m);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t2fsimd_genus)) = { t_okp_t2f, VL };
+
+static int t_okp_t2b(const ct_desc *d,
+		     const R *rio, const R *iio, 
+		     INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		     const planner *plnr)
+{
+     return  t_okp_t1b(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
+	  && small_enough(d, m);
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_t2bsimd_genus)) = { t_okp_t2b, VL };
+
+static int ts_okp(const ct_desc *d,
+		  const R *rio, const R *iio, 
+		  INT rs, INT vs, INT m, INT mb, INT me, INT ms,
+		  const planner *plnr)
+{
+     UNUSED(rio);
+     UNUSED(iio);
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && ALIGNEDA(rio)
+	     && ALIGNEDA(iio)
+	     && SIMD_STRIDE_OKA(rs)
+	     && ms == 1
+	     && (m % (2 * VL)) == 0
+	     && (mb % (2 * VL)) == 0
+	     && (me % (2 * VL)) == 0
+	     && (!d->rs || (d->rs == rs))
+	     && (!d->vs || (d->vs == vs))
+	     && (!d->ms || (d->ms == ms))
+	  );
+}
+
+EXTERN_CONST(ct_genus,  XSIMD(dft_tssimd_genus)) = { ts_okp, 2 * VL };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 10 -name n1bv_10 -include n1b.h */
+
+/*
+ * This function contains 42 FP additions, 22 FP multiplications,
+ * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tb, Tr, T3, Ts, T6, Tw, Tg, Tt, T9, Tc, T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T4, T5, Te, Tf, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Tr = VADD(T1, T2);
+		    T3 = VSUB(T1, T2);
+		    Ts = VADD(T4, T5);
+		    T6 = VSUB(T4, T5);
+		    Tw = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    Tt = VADD(T7, T8);
+		    T9 = VSUB(T7, T8);
+		    Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       {
+		    V TD, Tu, Tm, Ta, Td, Tv;
+		    TD = VSUB(Ts, Tt);
+		    Tu = VADD(Ts, Tt);
+		    Tm = VSUB(T6, T9);
+		    Ta = VADD(T6, T9);
+		    Td = VSUB(Tb, Tc);
+		    Tv = VADD(Tb, Tc);
+		    {
+			 V TC, Tx, Tn, Th;
+			 TC = VSUB(Tv, Tw);
+			 Tx = VADD(Tv, Tw);
+			 Tn = VSUB(Td, Tg);
+			 Th = VADD(Td, Tg);
+			 {
+			      V Ty, TA, TE, TG, Ti, Tk, To, Tq, Tz, Tj;
+			      Ty = VADD(Tu, Tx);
+			      TA = VSUB(Tu, Tx);
+			      TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC));
+			      TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD));
+			      Ti = VADD(Ta, Th);
+			      Tk = VSUB(Ta, Th);
+			      To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm));
+			      Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn));
+			      Tz = VFNMS(LDK(KP250000000), Ty, Tr);
+			      ST(&(xo[0]), VADD(Tr, Ty), ovs, &(xo[0]));
+			      Tj = VFNMS(LDK(KP250000000), Ti, T3);
+			      ST(&(xo[WS(os, 5)]), VADD(T3, Ti), ovs, &(xo[WS(os, 1)]));
+			      {
+				   V TB, TF, Tl, Tp;
+				   TB = VFNMS(LDK(KP559016994), TA, Tz);
+				   TF = VFMA(LDK(KP559016994), TA, Tz);
+				   Tl = VFMA(LDK(KP559016994), Tk, Tj);
+				   Tp = VFNMS(LDK(KP559016994), Tk, Tj);
+				   ST(&(xo[WS(os, 4)]), VFNMSI(TG, TF), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 6)]), VFMAI(TG, TF), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 8)]), VFMAI(TE, TB), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 2)]), VFNMSI(TE, TB), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 3)]), VFMAI(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 7)]), VFNMSI(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 9)]), VFNMSI(To, Tl), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 1)]), VFMAI(To, Tl), ovs, &(xo[WS(os, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n1bv_10"), {24, 4, 18, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_10) (planner *p) {
+     X(kdft_register) (p, n1bv_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 10 -name n1bv_10 -include n1b.h */
+
+/*
+ * This function contains 42 FP additions, 12 FP multiplications,
+ * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
+ * 33 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tl, Ty, T7, Te, Tw, Tt, Tz, TA, TB, Tg, Th, Tm, Tj, Tk;
+	       Tj = LD(&(xi[0]), ivs, &(xi[0]));
+	       Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       Tl = VSUB(Tj, Tk);
+	       Ty = VADD(Tj, Tk);
+	       {
+		    V T3, Tr, Td, Tv, T6, Ts, Ta, Tu;
+		    {
+			 V T1, T2, Tb, Tc;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tr = VADD(T1, T2);
+			 Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 Tv = VADD(Tb, Tc);
+		    }
+		    {
+			 V T4, T5, T8, T9;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Ts = VADD(T4, T5);
+			 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 Tu = VADD(T8, T9);
+		    }
+		    T7 = VSUB(T3, T6);
+		    Te = VSUB(Ta, Td);
+		    Tw = VSUB(Tu, Tv);
+		    Tt = VSUB(Tr, Ts);
+		    Tz = VADD(Tr, Ts);
+		    TA = VADD(Tu, Tv);
+		    TB = VADD(Tz, TA);
+		    Tg = VADD(T3, T6);
+		    Th = VADD(Ta, Td);
+		    Tm = VADD(Tg, Th);
+	       }
+	       ST(&(xo[WS(os, 5)]), VADD(Tl, Tm), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Ty, TB), ovs, &(xo[0]));
+	       {
+		    V Tf, Tq, To, Tp, Ti, Tn;
+		    Tf = VBYI(VFMA(LDK(KP951056516), T7, VMUL(LDK(KP587785252), Te)));
+		    Tq = VBYI(VFNMS(LDK(KP951056516), Te, VMUL(LDK(KP587785252), T7)));
+		    Ti = VMUL(LDK(KP559016994), VSUB(Tg, Th));
+		    Tn = VFNMS(LDK(KP250000000), Tm, Tl);
+		    To = VADD(Ti, Tn);
+		    Tp = VSUB(Tn, Ti);
+		    ST(&(xo[WS(os, 1)]), VADD(Tf, To), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VADD(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VSUB(To, Tf), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VSUB(Tp, Tq), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V Tx, TG, TE, TF, TC, TD;
+		    Tx = VBYI(VFNMS(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
+		    TG = VBYI(VFMA(LDK(KP951056516), Tt, VMUL(LDK(KP587785252), Tw)));
+		    TC = VFNMS(LDK(KP250000000), TB, Ty);
+		    TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
+		    TE = VSUB(TC, TD);
+		    TF = VADD(TD, TC);
+		    ST(&(xo[WS(os, 2)]), VADD(Tx, TE), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VADD(TG, TF), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VSUB(TE, Tx), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VSUB(TF, TG), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n1bv_10"), {36, 6, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_10) (planner *p) {
+     X(kdft_register) (p, n1bv_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:00 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include n1b.h */
+
+/*
+ * This function contains 70 FP additions, 60 FP multiplications,
+ * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
+ * 67 stack variables, 11 constants, and 22 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
+     DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
+     DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
+     DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
+     DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
+     DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
+     DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
+     DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
+	       V T1, Tb, T4, Tq, Tg, Tm, T7, Tp, Ta, To, Tc, T11;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T2, T3, Te, Tf;
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T5, T6, T8, T9;
+			 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tq = VSUB(T2, T3);
+			 Tg = VADD(Te, Tf);
+			 Tm = VSUB(Te, Tf);
+			 T7 = VADD(T5, T6);
+			 Tp = VSUB(T5, T6);
+			 Ta = VADD(T8, T9);
+			 To = VSUB(T8, T9);
+			 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    }
+	       }
+	       T11 = VFMA(LDK(KP521108558), Tm, Tq);
+	       {
+		    V TA, TS, TE, TW, Td, Tn, Ts, Tw, Tr, Tv, TT, TF;
+		    Tr = VFNMS(LDK(KP521108558), Tq, Tp);
+		    Tv = VFNMS(LDK(KP342584725), T7, Tg);
+		    TA = VFMA(LDK(KP715370323), To, Tq);
+		    TS = VFMA(LDK(KP521108558), To, Tm);
+		    TE = VFNMS(LDK(KP342584725), T4, Ta);
+		    TW = VFNMS(LDK(KP342584725), Ta, T7);
+		    Td = VADD(Tb, Tc);
+		    Tn = VSUB(Tb, Tc);
+		    Ts = VFNMS(LDK(KP715370323), Tr, To);
+		    Tw = VFNMS(LDK(KP634356270), Tv, T4);
+		    TT = VFNMS(LDK(KP715370323), TS, Tp);
+		    TF = VFNMS(LDK(KP634356270), TE, Tg);
+		    {
+			 V Tu, TV, TD, TL, T14, TP, TZ, Tj, Tz, TI, TB, TJ, TM;
+			 TB = VFMA(LDK(KP372785597), Tn, TA);
+			 TJ = VFNMS(LDK(KP521108558), Tp, Tn);
+			 {
+			      V T12, TN, TX, Th;
+			      T12 = VFMA(LDK(KP715370323), T11, Tn);
+			      ST(&(xo[0]), VADD(Tg, VADD(Td, VADD(Ta, VADD(T7, VADD(T4, T1))))), ovs, &(xo[0]));
+			      TN = VFNMS(LDK(KP342584725), Td, T4);
+			      TX = VFNMS(LDK(KP634356270), TW, Td);
+			      Th = VFNMS(LDK(KP342584725), Tg, Td);
+			      {
+				   V Tt, Tx, TU, TG;
+				   Tt = VFNMS(LDK(KP830830026), Ts, Tn);
+				   Tx = VFNMS(LDK(KP778434453), Tw, Ta);
+				   TU = VFMA(LDK(KP830830026), TT, Tq);
+				   TG = VFNMS(LDK(KP778434453), TF, Td);
+				   {
+					V TC, TK, T13, TO;
+					TC = VFNMS(LDK(KP830830026), TB, Tm);
+					TK = VFMA(LDK(KP715370323), TJ, Tm);
+					T13 = VFMA(LDK(KP830830026), T12, Tp);
+					TO = VFNMS(LDK(KP634356270), TN, T7);
+					{
+					     V TY, Ti, Ty, TH;
+					     TY = VFNMS(LDK(KP778434453), TX, T4);
+					     Ti = VFNMS(LDK(KP634356270), Th, Ta);
+					     Tu = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), Tt, Tm));
+					     Ty = VFNMS(LDK(KP876768831), Tx, Td);
+					     TV = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TU, Tn));
+					     TH = VFNMS(LDK(KP876768831), TG, T7);
+					     TD = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TC, Tp));
+					     TL = VFNMS(LDK(KP830830026), TK, To);
+					     T14 = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), T13, To));
+					     TP = VFNMS(LDK(KP778434453), TO, Tg);
+					     TZ = VFNMS(LDK(KP876768831), TY, Tg);
+					     Tj = VFNMS(LDK(KP778434453), Ti, T7);
+					     Tz = VFNMS(LDK(KP959492973), Ty, T1);
+					     TI = VFNMS(LDK(KP959492973), TH, T1);
+					}
+				   }
+			      }
+			 }
+			 TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tq));
+			 {
+			      V TQ, T10, Tk, TR, Tl;
+			      TQ = VFNMS(LDK(KP876768831), TP, Ta);
+			      T10 = VFNMS(LDK(KP959492973), TZ, T1);
+			      Tk = VFNMS(LDK(KP876768831), Tj, T4);
+			      ST(&(xo[WS(os, 7)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 4)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
+			      TR = VFNMS(LDK(KP959492973), TQ, T1);
+			      ST(&(xo[WS(os, 10)]), VFNMSI(T14, T10), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 1)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
+			      Tl = VFNMS(LDK(KP959492973), Tk, T1);
+			      ST(&(xo[WS(os, 9)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 2)]), VFNMSI(TV, TR), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 6)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 5)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_11) (planner *p) {
+     X(kdft_register) (p, n1bv_11, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include n1b.h */
+
+/*
+ * This function contains 70 FP additions, 50 FP multiplications,
+ * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
+ * 32 stack variables, 10 constants, and 22 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
+     DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
+     DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
+     DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
+     DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
+     DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
+     DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
+     DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
+	       V Th, T3, Tm, Tf, Ti, Tc, Tj, T9, Tk, T6, Tl, Ta, Tb, Ts, Tt;
+	       Th = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T1, T2, Td, Te;
+		    T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    Tm = VADD(T1, T2);
+		    Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = VSUB(Td, Te);
+		    Ti = VADD(Td, Te);
+	       }
+	       Ta = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tb = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Tc = VSUB(Ta, Tb);
+	       Tj = VADD(Ta, Tb);
+	       {
+		    V T7, T8, T4, T5;
+		    T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T9 = VSUB(T7, T8);
+		    Tk = VADD(T7, T8);
+		    T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T6 = VSUB(T4, T5);
+		    Tl = VADD(T4, T5);
+	       }
+	       ST(&(xo[0]), VADD(Th, VADD(Tm, VADD(Ti, VADD(Tl, VADD(Tj, Tk))))), ovs, &(xo[0]));
+	       {
+		    V Tg, Tn, Tu, Tv;
+		    Tg = VBYI(VFMA(LDK(KP281732556), T3, VFMA(LDK(KP755749574), T6, VFNMS(LDK(KP909631995), Tc, VFNMS(LDK(KP540640817), Tf, VMUL(LDK(KP989821441), T9))))));
+		    Tn = VFMA(LDK(KP841253532), Ti, VFMA(LDK(KP415415013), Tj, VFNMS(LDK(KP142314838), Tk, VFNMS(LDK(KP654860733), Tl, VFNMS(LDK(KP959492973), Tm, Th)))));
+		    ST(&(xo[WS(os, 5)]), VADD(Tg, Tn), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VSUB(Tn, Tg), ovs, &(xo[0]));
+		    Tu = VBYI(VFMA(LDK(KP755749574), T3, VFMA(LDK(KP540640817), T6, VFNMS(LDK(KP909631995), T9, VFNMS(LDK(KP989821441), Tf, VMUL(LDK(KP281732556), Tc))))));
+		    Tv = VFMA(LDK(KP841253532), Tl, VFMA(LDK(KP415415013), Tk, VFNMS(LDK(KP959492973), Tj, VFNMS(LDK(KP142314838), Ti, VFNMS(LDK(KP654860733), Tm, Th)))));
+		    ST(&(xo[WS(os, 4)]), VADD(Tu, Tv), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 7)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
+	       }
+	       Ts = VBYI(VFMA(LDK(KP909631995), T3, VFNMS(LDK(KP540640817), T9, VFNMS(LDK(KP989821441), Tc, VFNMS(LDK(KP281732556), T6, VMUL(LDK(KP755749574), Tf))))));
+	       Tt = VFMA(LDK(KP415415013), Tm, VFMA(LDK(KP841253532), Tk, VFNMS(LDK(KP142314838), Tj, VFNMS(LDK(KP959492973), Tl, VFNMS(LDK(KP654860733), Ti, Th)))));
+	       ST(&(xo[WS(os, 2)]), VADD(Ts, Tt), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 9)]), VSUB(Tt, Ts), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V Tq, Tr, To, Tp;
+		    Tq = VBYI(VFMA(LDK(KP540640817), T3, VFMA(LDK(KP909631995), Tf, VFMA(LDK(KP989821441), T6, VFMA(LDK(KP755749574), Tc, VMUL(LDK(KP281732556), T9))))));
+		    Tr = VFMA(LDK(KP841253532), Tm, VFMA(LDK(KP415415013), Ti, VFNMS(LDK(KP959492973), Tk, VFNMS(LDK(KP654860733), Tj, VFNMS(LDK(KP142314838), Tl, Th)))));
+		    ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 10)]), VSUB(Tr, Tq), ovs, &(xo[0]));
+		    To = VBYI(VFMA(LDK(KP989821441), T3, VFMA(LDK(KP540640817), Tc, VFNMS(LDK(KP909631995), T6, VFNMS(LDK(KP281732556), Tf, VMUL(LDK(KP755749574), T9))))));
+		    Tp = VFMA(LDK(KP415415013), Tl, VFMA(LDK(KP841253532), Tj, VFNMS(LDK(KP654860733), Tk, VFNMS(LDK(KP959492973), Ti, VFNMS(LDK(KP142314838), Tm, Th)))));
+		    ST(&(xo[WS(os, 3)]), VADD(To, Tp), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 8)]), VSUB(Tp, To), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_11) (planner *p) {
+     X(kdft_register) (p, n1bv_11, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n1bv_12 -include n1b.h */
+
+/*
+ * This function contains 48 FP additions, 20 FP multiplications,
+ * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
+ * 49 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T1, T6, Tc, Th, Td, Te, Ti, Tz, T4, TA, T9, Tj, Tf, Tw;
+	       {
+		    V T2, T3, T7, T8;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    Tz = VSUB(T2, T3);
+		    T4 = VADD(T2, T3);
+		    TA = VSUB(T7, T8);
+		    T9 = VADD(T7, T8);
+		    Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       Tf = VADD(Td, Te);
+	       Tw = VSUB(Td, Te);
+	       {
+		    V T5, Tp, TJ, TB, Ta, Tq, Tk, Tx, Tg, Ts;
+		    T5 = VADD(T1, T4);
+		    Tp = VFNMS(LDK(KP500000000), T4, T1);
+		    TJ = VSUB(Tz, TA);
+		    TB = VADD(Tz, TA);
+		    Ta = VADD(T6, T9);
+		    Tq = VFNMS(LDK(KP500000000), T9, T6);
+		    Tk = VADD(Ti, Tj);
+		    Tx = VSUB(Tj, Ti);
+		    Tg = VADD(Tc, Tf);
+		    Ts = VFNMS(LDK(KP500000000), Tf, Tc);
+		    {
+			 V Tr, TF, Tb, Tn, TG, Ty, Tl, Tt;
+			 Tr = VADD(Tp, Tq);
+			 TF = VSUB(Tp, Tq);
+			 Tb = VSUB(T5, Ta);
+			 Tn = VADD(T5, Ta);
+			 TG = VADD(Tw, Tx);
+			 Ty = VSUB(Tw, Tx);
+			 Tl = VADD(Th, Tk);
+			 Tt = VFNMS(LDK(KP500000000), Tk, Th);
+			 {
+			      V TC, TE, TH, TL, Tu, TI, Tm, To;
+			      TC = VMUL(LDK(KP866025403), VSUB(Ty, TB));
+			      TE = VMUL(LDK(KP866025403), VADD(TB, Ty));
+			      TH = VFNMS(LDK(KP866025403), TG, TF);
+			      TL = VFMA(LDK(KP866025403), TG, TF);
+			      Tu = VADD(Ts, Tt);
+			      TI = VSUB(Ts, Tt);
+			      Tm = VSUB(Tg, Tl);
+			      To = VADD(Tg, Tl);
+			      {
+				   V TK, TM, Tv, TD;
+				   TK = VFMA(LDK(KP866025403), TJ, TI);
+				   TM = VFNMS(LDK(KP866025403), TJ, TI);
+				   Tv = VSUB(Tr, Tu);
+				   TD = VADD(Tr, Tu);
+				   ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 6)]), VSUB(Tn, To), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 9)]), VFMAI(Tm, Tb), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 3)]), VFNMSI(Tm, Tb), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 5)]), VFMAI(TM, TL), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 7)]), VFNMSI(TM, TL), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 11)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 1)]), VFMAI(TK, TH), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 4)]), VFMAI(TE, TD), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 2)]), VFMAI(TC, Tv), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 10)]), VFNMSI(TC, Tv), ovs, &(xo[0]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n1bv_12"), {30, 2, 18, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_12) (planner *p) {
+     X(kdft_register) (p, n1bv_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n1bv_12 -include n1b.h */
+
+/*
+ * This function contains 48 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
+ * 27 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts;
+	       {
+		    V T1, T6, T4, Tk, T9, Tl;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T2, T3, T7, T8;
+			 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tk = VSUB(T2, T3);
+			 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T9 = VADD(T7, T8);
+			 Tl = VSUB(T7, T8);
+		    }
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    Ta = VFNMS(LDK(KP500000000), T9, T6);
+		    TG = VADD(T6, T9);
+		    TF = VADD(T1, T4);
+		    Ty = VADD(Tk, Tl);
+		    Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl));
+	       }
+	       {
+		    V Tn, Tq, Te, To, Th, Tr;
+		    Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V Tc, Td, Tf, Tg;
+			 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Te = VSUB(Tc, Td);
+			 To = VADD(Tc, Td);
+			 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Th = VSUB(Tf, Tg);
+			 Tr = VADD(Tf, Tg);
+		    }
+		    Ti = VMUL(LDK(KP866025403), VSUB(Te, Th));
+		    Tp = VFNMS(LDK(KP500000000), To, Tn);
+		    TJ = VADD(Tq, Tr);
+		    TI = VADD(Tn, To);
+		    Tx = VADD(Te, Th);
+		    Ts = VFNMS(LDK(KP500000000), Tr, Tq);
+	       }
+	       {
+		    V TH, TK, TL, TM;
+		    TH = VSUB(TF, TG);
+		    TK = VBYI(VSUB(TI, TJ));
+		    ST(&(xo[WS(os, 3)]), VSUB(TH, TK), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VADD(TH, TK), ovs, &(xo[WS(os, 1)]));
+		    TL = VADD(TF, TG);
+		    TM = VADD(TI, TJ);
+		    ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tj, Tv, Tu, Tw, Tb, Tt;
+		    Tb = VSUB(T5, Ta);
+		    Tj = VSUB(Tb, Ti);
+		    Tv = VADD(Tb, Ti);
+		    Tt = VSUB(Tp, Ts);
+		    Tu = VBYI(VADD(Tm, Tt));
+		    Tw = VBYI(VSUB(Tt, Tm));
+		    ST(&(xo[WS(os, 11)]), VSUB(Tj, Tu), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VADD(Tv, Tw), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tj, Tu), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VSUB(Tv, Tw), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V Tz, TD, TC, TE, TA, TB;
+		    Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty)));
+		    TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx)));
+		    TA = VADD(T5, Ta);
+		    TB = VADD(Tp, Ts);
+		    TC = VSUB(TA, TB);
+		    TE = VADD(TA, TB);
+		    ST(&(xo[WS(os, 2)]), VADD(Tz, TC), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VSUB(TE, TD), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VSUB(TC, Tz), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(TD, TE), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n1bv_12"), {44, 4, 4, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_12) (planner *p) {
+     X(kdft_register) (p, n1bv_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3527 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:08 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 128 -name n1bv_128 -include n1b.h */
+
+/*
+ * This function contains 1082 FP additions, 642 FP multiplications,
+ * (or, 440 additions, 0 multiplications, 642 fused multiply/add),
+ * 295 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_128(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DVK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DVK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DVK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DVK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DVK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DVK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DVK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DVK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DVK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DVK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DVK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DVK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DVK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DVK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V T6a, T5J, T6b, T5K, T6B, T6C, T6J, T6A, T6o, T6j, T6r, T68, T6d, T5O, T5R;
+	       V T6e, T6D, T6K;
+	       {
+		    V Tad, TcZ, T6Z, T8T, T4U, Tr, Tfq, TgG, Ted, Tgf, Td0, Tcc, T9k, T84, Tb6;
+		    V Tbt, Td8, TdK, TeK, Tgq, TeV, Tgt, T7q, T94, T3p, T5X, T7B, T97, T2G, T5U;
+		    V TbD, Tc0, Tdf, TdN, Tf5, Tgx, Tfg, TgA, T7J, T9b, T4E, T64, T7U, T9e, T3V;
+		    V T61, Td2, Td3, T85, T72, T4V, TI, Tcd, Tas, TgH, Tek, Tgg, Tft, T86, T75;
+		    V T4W, TZ, TaI, Tcf, Tdo, TdG, Tgi, Tet, Tgj, Teq, T8X, T7a, T5M, T1B, T8W;
+		    V T7d, T5N, T1s, TaX, Tcg, Tdr, TdH, Tgl, TeC, Tgm, Tez, T90, T7h, T5P, T2c;
+		    V T8Z, T7k, T5Q, T23, T3Y, T49, TdL, Tdb, Tbu, Tbl, Tgu, TeR, Tgr, TeY, Tf6;
+		    V TbG, T5V, T3s, T5Y, T3f, T95, T7E, T98, T7x, T4g, T4f, T4q, TbH, T41, TbI;
+		    V T44, T4h, T4j, T4k, Tf9, TbN;
+		    {
+			 V Tu, TF, Ty, TL, TW, Tah, Tx, Tag, Tee, Tz, TM, TN, Teh, Tan, TP;
+			 V TQ;
+			 {
+			      V TeG, T2A, Tbq, TeT, Tbp, TeH, T3m, T2x, Td6, T7o, T2q, T3l, T7z, Tbr, T2D;
+			      V T82, T83;
+			      {
+				   V Ta7, T3, Ta8, T4O, Taa, Tab, Ta, T4P, Te, Tc6, Th, Tc7, Tl, Tc9, Tca;
+				   V To;
+				   {
+					V T1, T2, T4M, T4N;
+					T1 = LD(&(xi[0]), ivs, &(xi[0]));
+					T2 = LD(&(xi[WS(is, 64)]), ivs, &(xi[0]));
+					T4M = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+					T4N = LD(&(xi[WS(is, 96)]), ivs, &(xi[0]));
+					{
+					     V T4, T5, T7, T8;
+					     T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					     T5 = LD(&(xi[WS(is, 80)]), ivs, &(xi[0]));
+					     T7 = LD(&(xi[WS(is, 112)]), ivs, &(xi[0]));
+					     T8 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+					     {
+						  V Tc, T6, T9, Td, Tf, Tg;
+						  Tc = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+						  Ta7 = VADD(T1, T2);
+						  T3 = VSUB(T1, T2);
+						  Ta8 = VADD(T4M, T4N);
+						  T4O = VSUB(T4M, T4N);
+						  Taa = VADD(T4, T5);
+						  T6 = VSUB(T4, T5);
+						  Tab = VADD(T7, T8);
+						  T9 = VSUB(T7, T8);
+						  Td = LD(&(xi[WS(is, 72)]), ivs, &(xi[0]));
+						  Tf = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+						  Tg = LD(&(xi[WS(is, 104)]), ivs, &(xi[0]));
+						  {
+						       V Tj, Tk, Tm, Tn;
+						       Tj = LD(&(xi[WS(is, 120)]), ivs, &(xi[0]));
+						       Tk = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+						       Tm = LD(&(xi[WS(is, 88)]), ivs, &(xi[0]));
+						       Tn = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+						       Ta = VADD(T6, T9);
+						       T4P = VSUB(T6, T9);
+						       Te = VSUB(Tc, Td);
+						       Tc6 = VADD(Tc, Td);
+						       Th = VSUB(Tf, Tg);
+						       Tc7 = VADD(Tf, Tg);
+						       Tl = VSUB(Tj, Tk);
+						       Tc9 = VADD(Tj, Tk);
+						       Tca = VADD(Tn, Tm);
+						       To = VSUB(Tm, Tn);
+						  }
+					     }
+					}
+				   }
+				   {
+					V T6X, Tb, Te9, Ta9, Tc8, Tea, T4R, Ti, Tfo, Tac, Tp, T4S, Tcb, Teb, T4Q;
+					T6X = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					Te9 = VSUB(Ta7, Ta8);
+					Ta9 = VADD(Ta7, Ta8);
+					Tc8 = VADD(Tc6, Tc7);
+					Tea = VSUB(Tc6, Tc7);
+					T4R = VFMA(LDK(KP414213562), Te, Th);
+					Ti = VFNMS(LDK(KP414213562), Th, Te);
+					Tfo = VSUB(Taa, Tab);
+					Tac = VADD(Taa, Tab);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T4S = VFMA(LDK(KP414213562), Tl, To);
+					Tcb = VADD(Tc9, Tca);
+					Teb = VSUB(Tc9, Tca);
+					T4Q = VFMA(LDK(KP707106781), T4P, T4O);
+					T82 = VFNMS(LDK(KP707106781), T4P, T4O);
+					{
+					     V T4T, T6Y, Tq, Tfp, Tec;
+					     T4T = VSUB(T4R, T4S);
+					     T6Y = VADD(T4R, T4S);
+					     T83 = VSUB(Ti, Tp);
+					     Tq = VADD(Ti, Tp);
+					     Tfp = VSUB(Tea, Teb);
+					     Tec = VADD(Tea, Teb);
+					     Tad = VSUB(Ta9, Tac);
+					     TcZ = VADD(Ta9, Tac);
+					     T6Z = VFMA(LDK(KP923879532), T6Y, T6X);
+					     T8T = VFNMS(LDK(KP923879532), T6Y, T6X);
+					     T4U = VFMA(LDK(KP923879532), T4T, T4Q);
+					     T6a = VFNMS(LDK(KP923879532), T4T, T4Q);
+					     Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					     T5J = VFNMS(LDK(KP923879532), Tq, Tb);
+					     Tfq = VFMA(LDK(KP707106781), Tfp, Tfo);
+					     TgG = VFNMS(LDK(KP707106781), Tfp, Tfo);
+					     Ted = VFMA(LDK(KP707106781), Tec, Te9);
+					     Tgf = VFNMS(LDK(KP707106781), Tec, Te9);
+					     Td0 = VADD(Tc8, Tcb);
+					     Tcc = VSUB(Tc8, Tcb);
+					}
+				   }
+			      }
+			      {
+				   V T2i, T3j, Tb2, T2B, Tb5, T3k, T2p, T2C;
+				   {
+					V T2m, Tb0, Tb1, Tb3, T2l, T2n;
+					{
+					     V T2g, T2h, T3h, T3i, T2j, T2k;
+					     T2g = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T2h = LD(&(xi[WS(is, 65)]), ivs, &(xi[WS(is, 1)]));
+					     T3h = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     T3i = LD(&(xi[WS(is, 97)]), ivs, &(xi[WS(is, 1)]));
+					     T2j = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     T2k = LD(&(xi[WS(is, 81)]), ivs, &(xi[WS(is, 1)]));
+					     T2m = LD(&(xi[WS(is, 113)]), ivs, &(xi[WS(is, 1)]));
+					     T9k = VFMA(LDK(KP923879532), T83, T82);
+					     T84 = VFNMS(LDK(KP923879532), T83, T82);
+					     T2i = VSUB(T2g, T2h);
+					     Tb0 = VADD(T2g, T2h);
+					     T3j = VSUB(T3h, T3i);
+					     Tb1 = VADD(T3h, T3i);
+					     Tb3 = VADD(T2j, T2k);
+					     T2l = VSUB(T2j, T2k);
+					     T2n = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					}
+					{
+					     V T2r, T2s, T2u, T2v;
+					     T2r = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+					     T2s = LD(&(xi[WS(is, 73)]), ivs, &(xi[WS(is, 1)]));
+					     T2u = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+					     T2v = LD(&(xi[WS(is, 105)]), ivs, &(xi[WS(is, 1)]));
+					     TeG = VSUB(Tb0, Tb1);
+					     Tb2 = VADD(Tb0, Tb1);
+					     {
+						  V T2y, T2z, Tb4, T2o, Tbn, T2t, Tbo, T2w;
+						  T2y = LD(&(xi[WS(is, 121)]), ivs, &(xi[WS(is, 1)]));
+						  T2z = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  Tb4 = VADD(T2m, T2n);
+						  T2o = VSUB(T2m, T2n);
+						  Tbn = VADD(T2r, T2s);
+						  T2t = VSUB(T2r, T2s);
+						  Tbo = VADD(T2u, T2v);
+						  T2w = VSUB(T2u, T2v);
+						  T2B = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  T2A = VSUB(T2y, T2z);
+						  Tbq = VADD(T2y, T2z);
+						  TeT = VSUB(Tb3, Tb4);
+						  Tb5 = VADD(Tb3, Tb4);
+						  T3k = VSUB(T2l, T2o);
+						  T2p = VADD(T2l, T2o);
+						  Tbp = VADD(Tbn, Tbo);
+						  TeH = VSUB(Tbn, Tbo);
+						  T3m = VFMA(LDK(KP414213562), T2t, T2w);
+						  T2x = VFNMS(LDK(KP414213562), T2w, T2t);
+						  T2C = LD(&(xi[WS(is, 89)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					}
+				   }
+				   Td6 = VADD(Tb2, Tb5);
+				   Tb6 = VSUB(Tb2, Tb5);
+				   T7o = VFNMS(LDK(KP707106781), T2p, T2i);
+				   T2q = VFMA(LDK(KP707106781), T2p, T2i);
+				   T3l = VFMA(LDK(KP707106781), T3k, T3j);
+				   T7z = VFNMS(LDK(KP707106781), T3k, T3j);
+				   Tbr = VADD(T2B, T2C);
+				   T2D = VSUB(T2B, T2C);
+			      }
+			      {
+				   V Tf1, Tfe, Tf2, TbZ, T3M, T4B, Tdd, T3F, T7H, T4A, T7S, TbW, Tf3, T4C, T3T;
+				   {
+					V T3x, T4y, Tbz, T3Q, TbC, T4z, T3E, T3R, T3P, TbU, TbV, T3S;
+					{
+					     V T3y, T3z, T3B, T3C;
+					     {
+						  V T3v, T3w, T4w, T4x;
+						  T3v = LD(&(xi[WS(is, 127)]), ivs, &(xi[WS(is, 1)]));
+						  T3w = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+						  T4w = LD(&(xi[WS(is, 95)]), ivs, &(xi[WS(is, 1)]));
+						  T4x = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+						  T3y = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V Tbs, TeI, T3n, T2E, Tbx;
+						       Tbs = VADD(Tbq, Tbr);
+						       TeI = VSUB(Tbq, Tbr);
+						       T3n = VFNMS(LDK(KP414213562), T2A, T2D);
+						       T2E = VFMA(LDK(KP414213562), T2D, T2A);
+						       T3x = VSUB(T3v, T3w);
+						       Tbx = VADD(T3v, T3w);
+						       {
+							    V Tby, Td7, TeJ, TeU;
+							    T4y = VSUB(T4w, T4x);
+							    Tby = VADD(T4x, T4w);
+							    Td7 = VADD(Tbp, Tbs);
+							    Tbt = VSUB(Tbp, Tbs);
+							    TeJ = VADD(TeH, TeI);
+							    TeU = VSUB(TeH, TeI);
+							    {
+								 V T7p, T3o, T7A, T2F;
+								 T7p = VSUB(T3m, T3n);
+								 T3o = VADD(T3m, T3n);
+								 T7A = VSUB(T2x, T2E);
+								 T2F = VADD(T2x, T2E);
+								 Tbz = VADD(Tbx, Tby);
+								 Tf1 = VSUB(Tbx, Tby);
+								 Td8 = VADD(Td6, Td7);
+								 TdK = VSUB(Td6, Td7);
+								 TeK = VFMA(LDK(KP707106781), TeJ, TeG);
+								 Tgq = VFNMS(LDK(KP707106781), TeJ, TeG);
+								 TeV = VFMA(LDK(KP707106781), TeU, TeT);
+								 Tgt = VFNMS(LDK(KP707106781), TeU, TeT);
+								 T7q = VFMA(LDK(KP923879532), T7p, T7o);
+								 T94 = VFNMS(LDK(KP923879532), T7p, T7o);
+								 T3p = VFMA(LDK(KP923879532), T3o, T3l);
+								 T5X = VFNMS(LDK(KP923879532), T3o, T3l);
+								 T7B = VFNMS(LDK(KP923879532), T7A, T7z);
+								 T97 = VFMA(LDK(KP923879532), T7A, T7z);
+								 T2G = VFMA(LDK(KP923879532), T2F, T2q);
+								 T5U = VFNMS(LDK(KP923879532), T2F, T2q);
+								 T3z = LD(&(xi[WS(is, 79)]), ivs, &(xi[WS(is, 1)]));
+							    }
+						       }
+						  }
+						  T3B = LD(&(xi[WS(is, 111)]), ivs, &(xi[WS(is, 1)]));
+						  T3C = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     {
+						  V T3G, T3H, T3J, T3K;
+						  T3G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  T3H = LD(&(xi[WS(is, 71)]), ivs, &(xi[WS(is, 1)]));
+						  T3J = LD(&(xi[WS(is, 103)]), ivs, &(xi[WS(is, 1)]));
+						  T3K = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3N, T3A, TbA, T3D, TbB, T3I, TbX, T3L, TbY, T3O;
+						       T3N = LD(&(xi[WS(is, 119)]), ivs, &(xi[WS(is, 1)]));
+						       T3A = VSUB(T3y, T3z);
+						       TbA = VADD(T3y, T3z);
+						       T3D = VSUB(T3B, T3C);
+						       TbB = VADD(T3B, T3C);
+						       T3I = VSUB(T3G, T3H);
+						       TbX = VADD(T3G, T3H);
+						       T3L = VSUB(T3J, T3K);
+						       TbY = VADD(T3K, T3J);
+						       T3O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						       T3Q = LD(&(xi[WS(is, 87)]), ivs, &(xi[WS(is, 1)]));
+						       Tfe = VSUB(TbB, TbA);
+						       TbC = VADD(TbA, TbB);
+						       T4z = VSUB(T3D, T3A);
+						       T3E = VADD(T3A, T3D);
+						       T3R = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						       Tf2 = VSUB(TbX, TbY);
+						       TbZ = VADD(TbX, TbY);
+						       T3M = VFMA(LDK(KP414213562), T3L, T3I);
+						       T4B = VFNMS(LDK(KP414213562), T3I, T3L);
+						       T3P = VSUB(T3N, T3O);
+						       TbU = VADD(T3N, T3O);
+						  }
+					     }
+					}
+					Tdd = VADD(Tbz, TbC);
+					TbD = VSUB(Tbz, TbC);
+					TbV = VADD(T3R, T3Q);
+					T3S = VSUB(T3Q, T3R);
+					T3F = VFMA(LDK(KP707106781), T3E, T3x);
+					T7H = VFNMS(LDK(KP707106781), T3E, T3x);
+					T4A = VFMA(LDK(KP707106781), T4z, T4y);
+					T7S = VFNMS(LDK(KP707106781), T4z, T4y);
+					TbW = VADD(TbU, TbV);
+					Tf3 = VSUB(TbU, TbV);
+					T4C = VFMA(LDK(KP414213562), T3P, T3S);
+					T3T = VFNMS(LDK(KP414213562), T3S, T3P);
+				   }
+				   {
+					V TD, Tae, TE, TJ, TK, TU, TV;
+					{
+					     V Ts, Tt, Tde, Tf4, Tff;
+					     Ts = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					     Tt = LD(&(xi[WS(is, 68)]), ivs, &(xi[0]));
+					     TD = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					     Tde = VADD(TbZ, TbW);
+					     Tc0 = VSUB(TbW, TbZ);
+					     Tf4 = VADD(Tf2, Tf3);
+					     Tff = VSUB(Tf3, Tf2);
+					     {
+						  V T7I, T4D, T7T, T3U;
+						  T7I = VSUB(T4C, T4B);
+						  T4D = VADD(T4B, T4C);
+						  T7T = VSUB(T3T, T3M);
+						  T3U = VADD(T3M, T3T);
+						  Tae = VADD(Ts, Tt);
+						  Tu = VSUB(Ts, Tt);
+						  Tdf = VADD(Tdd, Tde);
+						  TdN = VSUB(Tdd, Tde);
+						  Tf5 = VFMA(LDK(KP707106781), Tf4, Tf1);
+						  Tgx = VFNMS(LDK(KP707106781), Tf4, Tf1);
+						  Tfg = VFMA(LDK(KP707106781), Tff, Tfe);
+						  TgA = VFNMS(LDK(KP707106781), Tff, Tfe);
+						  T7J = VFMA(LDK(KP923879532), T7I, T7H);
+						  T9b = VFNMS(LDK(KP923879532), T7I, T7H);
+						  T4E = VFMA(LDK(KP923879532), T4D, T4A);
+						  T64 = VFNMS(LDK(KP923879532), T4D, T4A);
+						  T7U = VFNMS(LDK(KP923879532), T7T, T7S);
+						  T9e = VFMA(LDK(KP923879532), T7T, T7S);
+						  T3V = VFMA(LDK(KP923879532), T3U, T3F);
+						  T61 = VFNMS(LDK(KP923879532), T3U, T3F);
+						  TE = LD(&(xi[WS(is, 100)]), ivs, &(xi[0]));
+					     }
+					}
+					TJ = LD(&(xi[WS(is, 124)]), ivs, &(xi[0]));
+					TK = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					TU = LD(&(xi[WS(is, 92)]), ivs, &(xi[0]));
+					TV = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+					{
+					     V Tal, Tam, Tv, Tw, Taf;
+					     Tv = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 84)]), ivs, &(xi[0]));
+					     Taf = VADD(TD, TE);
+					     TF = VSUB(TD, TE);
+					     Ty = LD(&(xi[WS(is, 116)]), ivs, &(xi[0]));
+					     TL = VSUB(TJ, TK);
+					     Tal = VADD(TJ, TK);
+					     TW = VSUB(TU, TV);
+					     Tam = VADD(TV, TU);
+					     Tah = VADD(Tv, Tw);
+					     Tx = VSUB(Tv, Tw);
+					     Tag = VADD(Tae, Taf);
+					     Tee = VSUB(Tae, Taf);
+					     Tz = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					     TM = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					     TN = LD(&(xi[WS(is, 76)]), ivs, &(xi[0]));
+					     Teh = VSUB(Tal, Tam);
+					     Tan = VADD(Tal, Tam);
+					     TP = LD(&(xi[WS(is, 108)]), ivs, &(xi[0]));
+					     TQ = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V Tev, TeA, Tdp, TaP, Tew, TaV, T1U, T29, T7f, T1N, T28, T7i, Tex, TaS, T21;
+			      V T2a;
+			      {
+				   V Tem, Ter, Ten, TaD, T1j, T1y, TaA, Tdm, T1c, T78, T7b, T1x, TaG, Teo, T1z;
+				   V T1q;
+				   {
+					V T14, T1v, Taw, Taz, T1b, T1w, T1n, T1o, T1m, TaE, TaF, T1p;
+					{
+					     V Tau, Tav, T15, T16, T18, T19;
+					     {
+						  V T12, Tai, TA, Tao, TO, T13;
+						  T12 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+						  Tai = VADD(Ty, Tz);
+						  TA = VSUB(Ty, Tz);
+						  Tao = VADD(TM, TN);
+						  TO = VSUB(TM, TN);
+						  T13 = LD(&(xi[WS(is, 66)]), ivs, &(xi[0]));
+						  {
+						       V T1t, Tap, TR, Taj, Tef, TG, TB, T1u;
+						       T1t = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+						       Tap = VADD(TP, TQ);
+						       TR = VSUB(TP, TQ);
+						       Taj = VADD(Tah, Tai);
+						       Tef = VSUB(Tah, Tai);
+						       TG = VSUB(Tx, TA);
+						       TB = VADD(Tx, TA);
+						       Tau = VADD(T12, T13);
+						       T14 = VSUB(T12, T13);
+						       T1u = LD(&(xi[WS(is, 98)]), ivs, &(xi[0]));
+						       {
+							    V Taq, Tei, TX, TS, Tak;
+							    Taq = VADD(Tao, Tap);
+							    Tei = VSUB(Tap, Tao);
+							    TX = VSUB(TR, TO);
+							    TS = VADD(TO, TR);
+							    Tak = VSUB(Tag, Taj);
+							    Td2 = VADD(Tag, Taj);
+							    {
+								 V Teg, Tfr, T71, TH;
+								 Teg = VFNMS(LDK(KP414213562), Tef, Tee);
+								 Tfr = VFMA(LDK(KP414213562), Tee, Tef);
+								 T71 = VFNMS(LDK(KP707106781), TG, TF);
+								 TH = VFMA(LDK(KP707106781), TG, TF);
+								 {
+								      V T70, TC, Tar, Tej, Tfs;
+								      T70 = VFNMS(LDK(KP707106781), TB, Tu);
+								      TC = VFMA(LDK(KP707106781), TB, Tu);
+								      Tar = VSUB(Tan, Taq);
+								      Td3 = VADD(Tan, Taq);
+								      Tej = VFNMS(LDK(KP414213562), Tei, Teh);
+								      Tfs = VFMA(LDK(KP414213562), Teh, Tei);
+								      {
+									   V T74, TY, T73, TT;
+									   T74 = VFNMS(LDK(KP707106781), TX, TW);
+									   TY = VFMA(LDK(KP707106781), TX, TW);
+									   T73 = VFNMS(LDK(KP707106781), TS, TL);
+									   TT = VFMA(LDK(KP707106781), TS, TL);
+									   T85 = VFNMS(LDK(KP668178637), T70, T71);
+									   T72 = VFMA(LDK(KP668178637), T71, T70);
+									   T4V = VFMA(LDK(KP198912367), TC, TH);
+									   TI = VFNMS(LDK(KP198912367), TH, TC);
+									   Tcd = VSUB(Tak, Tar);
+									   Tas = VADD(Tak, Tar);
+									   TgH = VSUB(Teg, Tej);
+									   Tek = VADD(Teg, Tej);
+									   Tgg = VADD(Tfr, Tfs);
+									   Tft = VSUB(Tfr, Tfs);
+									   T86 = VFNMS(LDK(KP668178637), T73, T74);
+									   T75 = VFMA(LDK(KP668178637), T74, T73);
+									   T4W = VFMA(LDK(KP198912367), TT, TY);
+									   TZ = VFNMS(LDK(KP198912367), TY, TT);
+									   Tav = VADD(T1t, T1u);
+									   T1v = VSUB(T1t, T1u);
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T15 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     T16 = LD(&(xi[WS(is, 82)]), ivs, &(xi[0]));
+					     T18 = LD(&(xi[WS(is, 114)]), ivs, &(xi[0]));
+					     T19 = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     {
+						  V T1d, T1e, T1g, T1h, Tax, T17, Tay, T1a;
+						  T1d = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+						  Taw = VADD(Tau, Tav);
+						  Tem = VSUB(Tau, Tav);
+						  T1e = LD(&(xi[WS(is, 74)]), ivs, &(xi[0]));
+						  T1g = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+						  T1h = LD(&(xi[WS(is, 106)]), ivs, &(xi[0]));
+						  Tax = VADD(T15, T16);
+						  T17 = VSUB(T15, T16);
+						  Tay = VADD(T18, T19);
+						  T1a = VSUB(T18, T19);
+						  {
+						       V T1k, T1f, TaB, T1i, TaC, T1l;
+						       T1k = LD(&(xi[WS(is, 122)]), ivs, &(xi[0]));
+						       T1f = VSUB(T1d, T1e);
+						       TaB = VADD(T1d, T1e);
+						       T1i = VSUB(T1g, T1h);
+						       TaC = VADD(T1g, T1h);
+						       T1l = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+						       Taz = VADD(Tax, Tay);
+						       Ter = VSUB(Tax, Tay);
+						       T1b = VADD(T17, T1a);
+						       T1w = VSUB(T17, T1a);
+						       T1n = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+						       T1o = LD(&(xi[WS(is, 90)]), ivs, &(xi[0]));
+						       Ten = VSUB(TaB, TaC);
+						       TaD = VADD(TaB, TaC);
+						       T1j = VFNMS(LDK(KP414213562), T1i, T1f);
+						       T1y = VFMA(LDK(KP414213562), T1f, T1i);
+						       T1m = VSUB(T1k, T1l);
+						       TaE = VADD(T1k, T1l);
+						  }
+					     }
+					}
+					TaA = VSUB(Taw, Taz);
+					Tdm = VADD(Taw, Taz);
+					TaF = VADD(T1n, T1o);
+					T1p = VSUB(T1n, T1o);
+					T1c = VFMA(LDK(KP707106781), T1b, T14);
+					T78 = VFNMS(LDK(KP707106781), T1b, T14);
+					T7b = VFNMS(LDK(KP707106781), T1w, T1v);
+					T1x = VFMA(LDK(KP707106781), T1w, T1v);
+					TaG = VADD(TaE, TaF);
+					Teo = VSUB(TaE, TaF);
+					T1z = VFNMS(LDK(KP414213562), T1m, T1p);
+					T1q = VFMA(LDK(KP414213562), T1p, T1m);
+				   }
+				   {
+					V T1F, T26, T1Q, TaT, TaL, TaO, T27, T1M, T1Y, T1Z, TaU, T1T, TaQ, T1X, T20;
+					V TaR;
+					{
+					     V T24, TaJ, T25, T1G, T1H, T1J, T1K, T1D, T1E;
+					     T1D = LD(&(xi[WS(is, 126)]), ivs, &(xi[0]));
+					     T1E = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+					     T24 = LD(&(xi[WS(is, 94)]), ivs, &(xi[0]));
+					     {
+						  V TaH, Tdn, Tes, Tep;
+						  TaH = VSUB(TaD, TaG);
+						  Tdn = VADD(TaD, TaG);
+						  Tes = VSUB(Ten, Teo);
+						  Tep = VADD(Ten, Teo);
+						  {
+						       V T79, T1A, T7c, T1r;
+						       T79 = VSUB(T1y, T1z);
+						       T1A = VADD(T1y, T1z);
+						       T7c = VSUB(T1j, T1q);
+						       T1r = VADD(T1j, T1q);
+						       TaJ = VADD(T1D, T1E);
+						       T1F = VSUB(T1D, T1E);
+						       TaI = VFNMS(LDK(KP414213562), TaH, TaA);
+						       Tcf = VFMA(LDK(KP414213562), TaA, TaH);
+						       Tdo = VADD(Tdm, Tdn);
+						       TdG = VSUB(Tdm, Tdn);
+						       Tgi = VFNMS(LDK(KP707106781), Tes, Ter);
+						       Tet = VFMA(LDK(KP707106781), Tes, Ter);
+						       Tgj = VFNMS(LDK(KP707106781), Tep, Tem);
+						       Teq = VFMA(LDK(KP707106781), Tep, Tem);
+						       T8X = VFNMS(LDK(KP923879532), T79, T78);
+						       T7a = VFMA(LDK(KP923879532), T79, T78);
+						       T5M = VFNMS(LDK(KP923879532), T1A, T1x);
+						       T1B = VFMA(LDK(KP923879532), T1A, T1x);
+						       T8W = VFMA(LDK(KP923879532), T7c, T7b);
+						       T7d = VFNMS(LDK(KP923879532), T7c, T7b);
+						       T5N = VFNMS(LDK(KP923879532), T1r, T1c);
+						       T1s = VFMA(LDK(KP923879532), T1r, T1c);
+						       T25 = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+						  }
+					     }
+					     T1G = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					     T1H = LD(&(xi[WS(is, 78)]), ivs, &(xi[0]));
+					     T1J = LD(&(xi[WS(is, 110)]), ivs, &(xi[0]));
+					     T1K = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+					     {
+						  V T1R, T1I, TaM, T1L, TaN, T1S, T1O, T1P, TaK, T1V, T1W;
+						  T1O = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+						  T1P = LD(&(xi[WS(is, 70)]), ivs, &(xi[0]));
+						  T26 = VSUB(T24, T25);
+						  TaK = VADD(T25, T24);
+						  T1R = LD(&(xi[WS(is, 102)]), ivs, &(xi[0]));
+						  T1I = VSUB(T1G, T1H);
+						  TaM = VADD(T1G, T1H);
+						  T1L = VSUB(T1J, T1K);
+						  TaN = VADD(T1J, T1K);
+						  T1Q = VSUB(T1O, T1P);
+						  TaT = VADD(T1O, T1P);
+						  Tev = VSUB(TaJ, TaK);
+						  TaL = VADD(TaJ, TaK);
+						  T1S = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+						  T1V = LD(&(xi[WS(is, 118)]), ivs, &(xi[0]));
+						  T1W = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+						  TeA = VSUB(TaN, TaM);
+						  TaO = VADD(TaM, TaN);
+						  T27 = VSUB(T1L, T1I);
+						  T1M = VADD(T1I, T1L);
+						  T1Y = LD(&(xi[WS(is, 86)]), ivs, &(xi[0]));
+						  T1Z = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+						  TaU = VADD(T1S, T1R);
+						  T1T = VSUB(T1R, T1S);
+						  TaQ = VADD(T1V, T1W);
+						  T1X = VSUB(T1V, T1W);
+					     }
+					}
+					Tdp = VADD(TaL, TaO);
+					TaP = VSUB(TaL, TaO);
+					T20 = VSUB(T1Y, T1Z);
+					TaR = VADD(T1Z, T1Y);
+					Tew = VSUB(TaT, TaU);
+					TaV = VADD(TaT, TaU);
+					T1U = VFMA(LDK(KP414213562), T1T, T1Q);
+					T29 = VFNMS(LDK(KP414213562), T1Q, T1T);
+					T7f = VFNMS(LDK(KP707106781), T1M, T1F);
+					T1N = VFMA(LDK(KP707106781), T1M, T1F);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T7i = VFNMS(LDK(KP707106781), T27, T26);
+					Tex = VSUB(TaQ, TaR);
+					TaS = VADD(TaQ, TaR);
+					T21 = VFNMS(LDK(KP414213562), T20, T1X);
+					T2a = VFMA(LDK(KP414213562), T1X, T20);
+				   }
+			      }
+			      {
+				   V T2J, T2U, T30, T3b, TeL, Tb9, TeO, Tbg, T2M, Tba, T2P, Tbb, T34, Tbh, T33;
+				   V T35;
+				   {
+					V T2H, T2I, T2S, T2T, T2Y, T2Z, T39, T3a;
+					T2H = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+					{
+					     V Tdq, TaW, Tey, TeB;
+					     Tdq = VADD(TaV, TaS);
+					     TaW = VSUB(TaS, TaV);
+					     Tey = VADD(Tew, Tex);
+					     TeB = VSUB(Tex, Tew);
+					     {
+						  V T2b, T7g, T22, T7j;
+						  T2b = VADD(T29, T2a);
+						  T7g = VSUB(T2a, T29);
+						  T22 = VADD(T1U, T21);
+						  T7j = VSUB(T21, T1U);
+						  TaX = VFNMS(LDK(KP414213562), TaW, TaP);
+						  Tcg = VFMA(LDK(KP414213562), TaP, TaW);
+						  Tdr = VADD(Tdp, Tdq);
+						  TdH = VSUB(Tdp, Tdq);
+						  Tgl = VFNMS(LDK(KP707106781), TeB, TeA);
+						  TeC = VFMA(LDK(KP707106781), TeB, TeA);
+						  Tgm = VFNMS(LDK(KP707106781), Tey, Tev);
+						  Tez = VFMA(LDK(KP707106781), Tey, Tev);
+						  T90 = VFNMS(LDK(KP923879532), T7g, T7f);
+						  T7h = VFMA(LDK(KP923879532), T7g, T7f);
+						  T5P = VFNMS(LDK(KP923879532), T2b, T28);
+						  T2c = VFMA(LDK(KP923879532), T2b, T28);
+						  T8Z = VFMA(LDK(KP923879532), T7j, T7i);
+						  T7k = VFNMS(LDK(KP923879532), T7j, T7i);
+						  T5Q = VFNMS(LDK(KP923879532), T22, T1N);
+						  T23 = VFMA(LDK(KP923879532), T22, T1N);
+						  T2I = LD(&(xi[WS(is, 69)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					}
+					T2S = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+					T2T = LD(&(xi[WS(is, 101)]), ivs, &(xi[WS(is, 1)]));
+					T2Y = LD(&(xi[WS(is, 125)]), ivs, &(xi[WS(is, 1)]));
+					T2Z = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+					T39 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+					T3a = LD(&(xi[WS(is, 93)]), ivs, &(xi[WS(is, 1)]));
+					{
+					     V T2K, Tbe, Tbf, T2L, T2N, T2O, Tb7, Tb8, T31, T32;
+					     T2K = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+					     T2J = VSUB(T2H, T2I);
+					     Tb7 = VADD(T2H, T2I);
+					     T2U = VSUB(T2S, T2T);
+					     Tb8 = VADD(T2S, T2T);
+					     T30 = VSUB(T2Y, T2Z);
+					     Tbe = VADD(T2Y, T2Z);
+					     T3b = VSUB(T39, T3a);
+					     Tbf = VADD(T39, T3a);
+					     T2L = LD(&(xi[WS(is, 85)]), ivs, &(xi[WS(is, 1)]));
+					     T2N = LD(&(xi[WS(is, 117)]), ivs, &(xi[WS(is, 1)]));
+					     T2O = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+					     TeL = VSUB(Tb7, Tb8);
+					     Tb9 = VADD(Tb7, Tb8);
+					     T31 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					     T32 = LD(&(xi[WS(is, 77)]), ivs, &(xi[WS(is, 1)]));
+					     TeO = VSUB(Tbe, Tbf);
+					     Tbg = VADD(Tbe, Tbf);
+					     T2M = VSUB(T2K, T2L);
+					     Tba = VADD(T2K, T2L);
+					     T2P = VSUB(T2N, T2O);
+					     Tbb = VADD(T2N, T2O);
+					     T34 = LD(&(xi[WS(is, 109)]), ivs, &(xi[WS(is, 1)]));
+					     Tbh = VADD(T31, T32);
+					     T33 = VSUB(T31, T32);
+					     T35 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+					}
+				   }
+				   {
+					V T4d, T4e, T4o, T4p;
+					{
+					     V T2X, T3q, T7t, T7C, T3r, T3e, T7D, T7w;
+					     {
+						  V T47, TbE, Tbd, Td9, TeW, TeN, T7s, T2W, T7r, T2R, TeP, Tbj, T37, T3c, T48;
+						  {
+						       V T3W, T3X, TeM, Tbc, T2Q, T2V, Tbi, T36;
+						       T3W = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+						       T3X = LD(&(xi[WS(is, 67)]), ivs, &(xi[WS(is, 1)]));
+						       TeM = VSUB(Tba, Tbb);
+						       Tbc = VADD(Tba, Tbb);
+						       T2Q = VADD(T2M, T2P);
+						       T2V = VSUB(T2M, T2P);
+						       T47 = LD(&(xi[WS(is, 99)]), ivs, &(xi[WS(is, 1)]));
+						       Tbi = VADD(T34, T35);
+						       T36 = VSUB(T34, T35);
+						       TbE = VADD(T3W, T3X);
+						       T3Y = VSUB(T3W, T3X);
+						       Tbd = VSUB(Tb9, Tbc);
+						       Td9 = VADD(Tb9, Tbc);
+						       TeW = VFMA(LDK(KP414213562), TeL, TeM);
+						       TeN = VFNMS(LDK(KP414213562), TeM, TeL);
+						       T7s = VFNMS(LDK(KP707106781), T2V, T2U);
+						       T2W = VFMA(LDK(KP707106781), T2V, T2U);
+						       T7r = VFNMS(LDK(KP707106781), T2Q, T2J);
+						       T2R = VFMA(LDK(KP707106781), T2Q, T2J);
+						       TeP = VSUB(Tbh, Tbi);
+						       Tbj = VADD(Tbh, Tbi);
+						       T37 = VADD(T33, T36);
+						       T3c = VSUB(T33, T36);
+						       T48 = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  T2X = VFNMS(LDK(KP198912367), T2W, T2R);
+						  T3q = VFMA(LDK(KP198912367), T2R, T2W);
+						  T7t = VFMA(LDK(KP668178637), T7s, T7r);
+						  T7C = VFNMS(LDK(KP668178637), T7r, T7s);
+						  {
+						       V Tbk, Tda, TeX, TeQ;
+						       Tbk = VSUB(Tbg, Tbj);
+						       Tda = VADD(Tbg, Tbj);
+						       TeX = VFNMS(LDK(KP414213562), TeO, TeP);
+						       TeQ = VFMA(LDK(KP414213562), TeP, TeO);
+						       {
+							    V T7v, T3d, T7u, T38, TbF;
+							    T7v = VFNMS(LDK(KP707106781), T3c, T3b);
+							    T3d = VFMA(LDK(KP707106781), T3c, T3b);
+							    T7u = VFNMS(LDK(KP707106781), T37, T30);
+							    T38 = VFMA(LDK(KP707106781), T37, T30);
+							    T49 = VSUB(T47, T48);
+							    TbF = VADD(T48, T47);
+							    TdL = VSUB(Td9, Tda);
+							    Tdb = VADD(Td9, Tda);
+							    Tbu = VSUB(Tbd, Tbk);
+							    Tbl = VADD(Tbd, Tbk);
+							    Tgu = VSUB(TeN, TeQ);
+							    TeR = VADD(TeN, TeQ);
+							    Tgr = VSUB(TeW, TeX);
+							    TeY = VADD(TeW, TeX);
+							    T3r = VFNMS(LDK(KP198912367), T38, T3d);
+							    T3e = VFMA(LDK(KP198912367), T3d, T38);
+							    T7D = VFMA(LDK(KP668178637), T7u, T7v);
+							    T7w = VFNMS(LDK(KP668178637), T7v, T7u);
+							    Tf6 = VSUB(TbE, TbF);
+							    TbG = VADD(TbE, TbF);
+						       }
+						  }
+					     }
+					     T4d = LD(&(xi[WS(is, 123)]), ivs, &(xi[WS(is, 1)]));
+					     T5V = VSUB(T3q, T3r);
+					     T3s = VADD(T3q, T3r);
+					     T5Y = VSUB(T2X, T3e);
+					     T3f = VADD(T2X, T3e);
+					     T95 = VSUB(T7D, T7C);
+					     T7E = VADD(T7C, T7D);
+					     T98 = VSUB(T7t, T7w);
+					     T7x = VADD(T7t, T7w);
+					     T4e = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+					     T4o = LD(&(xi[WS(is, 91)]), ivs, &(xi[WS(is, 1)]));
+					     T4p = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					}
+					{
+					     V T3Z, T40, T42, T43, TbL, TbM;
+					     T3Z = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					     T40 = LD(&(xi[WS(is, 83)]), ivs, &(xi[WS(is, 1)]));
+					     T42 = LD(&(xi[WS(is, 115)]), ivs, &(xi[WS(is, 1)]));
+					     T43 = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+					     T4g = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+					     T4f = VSUB(T4d, T4e);
+					     TbL = VADD(T4d, T4e);
+					     T4q = VSUB(T4o, T4p);
+					     TbM = VADD(T4p, T4o);
+					     TbH = VADD(T3Z, T40);
+					     T41 = VSUB(T3Z, T40);
+					     TbI = VADD(T42, T43);
+					     T44 = VSUB(T42, T43);
+					     T4h = LD(&(xi[WS(is, 75)]), ivs, &(xi[WS(is, 1)]));
+					     T4j = LD(&(xi[WS(is, 107)]), ivs, &(xi[WS(is, 1)]));
+					     T4k = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+					     Tf9 = VSUB(TbL, TbM);
+					     TbN = VADD(TbL, TbM);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V TgB, Tgy, T62, T4H, T65, T4u, T9c, T7X, T9f, T7Q, Tg0, Tga, TfF, TeF, TfT;
+			 V TfU, TfP, Tg7, TfI, Tfy, Tfz, Tf0, TfA, Tfl, Tg1, TfS;
+			 {
+			      V Tc1, TbS, Tfc, Tfj, TdX, Te5, TdZ, TdR, Te7, Te3, TdU, Te4;
+			      {
+				   V TdF, TdS, Tdx, Td5, TdO, TdE, TdC, Tdt, Tdk;
+				   {
+					V Tdc, TdA, T4F, T4c, T7V, T7M, T4G, T4t, T7W, T7P, TdB, Tdj;
+					{
+					     V Td1, Tdg, TbK, Tf8, Tfh, T4b, T7L, T46, T7K, TbQ, Tfa, T4r, T4m, Td4;
+					     TdF = VSUB(TcZ, Td0);
+					     Td1 = VADD(TcZ, Td0);
+					     {
+						  V TbJ, Tf7, T4a, T45;
+						  TbJ = VADD(TbH, TbI);
+						  Tf7 = VSUB(TbI, TbH);
+						  T4a = VSUB(T44, T41);
+						  T45 = VADD(T41, T44);
+						  {
+						       V TbO, T4i, TbP, T4l;
+						       TbO = VADD(T4g, T4h);
+						       T4i = VSUB(T4g, T4h);
+						       TbP = VADD(T4j, T4k);
+						       T4l = VSUB(T4j, T4k);
+						       Tdg = VADD(TbG, TbJ);
+						       TbK = VSUB(TbG, TbJ);
+						       Tf8 = VFMA(LDK(KP414213562), Tf7, Tf6);
+						       Tfh = VFNMS(LDK(KP414213562), Tf6, Tf7);
+						       T4b = VFMA(LDK(KP707106781), T4a, T49);
+						       T7L = VFNMS(LDK(KP707106781), T4a, T49);
+						       T46 = VFMA(LDK(KP707106781), T45, T3Y);
+						       T7K = VFNMS(LDK(KP707106781), T45, T3Y);
+						       TbQ = VADD(TbO, TbP);
+						       Tfa = VSUB(TbP, TbO);
+						       T4r = VSUB(T4l, T4i);
+						       T4m = VADD(T4i, T4l);
+						       Td4 = VADD(Td2, Td3);
+						       TdS = VSUB(Td2, Td3);
+						  }
+					     }
+					     Tdc = VSUB(Td8, Tdb);
+					     TdA = VADD(Td8, Tdb);
+					     T4F = VFNMS(LDK(KP198912367), T46, T4b);
+					     T4c = VFMA(LDK(KP198912367), T4b, T46);
+					     T7V = VFMA(LDK(KP668178637), T7K, T7L);
+					     T7M = VFNMS(LDK(KP668178637), T7L, T7K);
+					     {
+						  V Tdh, TbR, Tfb, Tfi;
+						  Tdh = VADD(TbN, TbQ);
+						  TbR = VSUB(TbN, TbQ);
+						  Tfb = VFNMS(LDK(KP414213562), Tfa, Tf9);
+						  Tfi = VFMA(LDK(KP414213562), Tf9, Tfa);
+						  {
+						       V T4s, T7O, T4n, T7N, Tdi;
+						       T4s = VFMA(LDK(KP707106781), T4r, T4q);
+						       T7O = VFNMS(LDK(KP707106781), T4r, T4q);
+						       T4n = VFMA(LDK(KP707106781), T4m, T4f);
+						       T7N = VFNMS(LDK(KP707106781), T4m, T4f);
+						       Tdx = VADD(Td1, Td4);
+						       Td5 = VSUB(Td1, Td4);
+						       TdO = VSUB(Tdh, Tdg);
+						       Tdi = VADD(Tdg, Tdh);
+						       Tc1 = VSUB(TbR, TbK);
+						       TbS = VADD(TbK, TbR);
+						       TgB = VSUB(Tfb, Tf8);
+						       Tfc = VADD(Tf8, Tfb);
+						       Tgy = VSUB(Tfi, Tfh);
+						       Tfj = VADD(Tfh, Tfi);
+						       T4G = VFMA(LDK(KP198912367), T4n, T4s);
+						       T4t = VFNMS(LDK(KP198912367), T4s, T4n);
+						       T7W = VFNMS(LDK(KP668178637), T7N, T7O);
+						       T7P = VFMA(LDK(KP668178637), T7O, T7N);
+						       TdB = VADD(Tdf, Tdi);
+						       Tdj = VSUB(Tdf, Tdi);
+						  }
+					     }
+					}
+					T62 = VSUB(T4G, T4F);
+					T4H = VADD(T4F, T4G);
+					T65 = VSUB(T4t, T4c);
+					T4u = VADD(T4c, T4t);
+					T9c = VSUB(T7V, T7W);
+					T7X = VADD(T7V, T7W);
+					T9f = VSUB(T7P, T7M);
+					T7Q = VADD(T7M, T7P);
+					TdE = VADD(TdA, TdB);
+					TdC = VSUB(TdA, TdB);
+					Tdt = VSUB(Tdc, Tdj);
+					Tdk = VADD(Tdc, Tdj);
+				   }
+				   {
+					V TdT, Tdl, Tdv, TdJ, Te1, Te2, TdQ, Tdz, TdD, Tdu, Tdw;
+					{
+					     V TdI, TdM, TdV, TdW, TdP, Tds, Tdy;
+					     TdI = VADD(TdG, TdH);
+					     TdT = VSUB(TdG, TdH);
+					     TdM = VFNMS(LDK(KP414213562), TdL, TdK);
+					     TdV = VFMA(LDK(KP414213562), TdK, TdL);
+					     TdW = VFMA(LDK(KP414213562), TdN, TdO);
+					     TdP = VFNMS(LDK(KP414213562), TdO, TdN);
+					     Tdl = VFNMS(LDK(KP707106781), Tdk, Td5);
+					     Tdv = VFMA(LDK(KP707106781), Tdk, Td5);
+					     Tds = VSUB(Tdo, Tdr);
+					     Tdy = VADD(Tdo, Tdr);
+					     TdJ = VFMA(LDK(KP707106781), TdI, TdF);
+					     Te1 = VFNMS(LDK(KP707106781), TdI, TdF);
+					     TdX = VSUB(TdV, TdW);
+					     Te2 = VADD(TdV, TdW);
+					     Te5 = VSUB(TdM, TdP);
+					     TdQ = VADD(TdM, TdP);
+					     Tdz = VSUB(Tdx, Tdy);
+					     TdD = VADD(Tdx, Tdy);
+					     Tdu = VFNMS(LDK(KP707106781), Tdt, Tds);
+					     Tdw = VFMA(LDK(KP707106781), Tdt, Tds);
+					}
+					TdZ = VFMA(LDK(KP923879532), TdQ, TdJ);
+					TdR = VFNMS(LDK(KP923879532), TdQ, TdJ);
+					Te7 = VFMA(LDK(KP923879532), Te2, Te1);
+					Te3 = VFNMS(LDK(KP923879532), Te2, Te1);
+					ST(&(xo[0]), VADD(TdD, TdE), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 64)]), VSUB(TdD, TdE), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 32)]), VFMAI(TdC, Tdz), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 96)]), VFNMSI(TdC, Tdz), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 112)]), VFNMSI(Tdw, Tdv), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 16)]), VFMAI(Tdw, Tdv), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 80)]), VFMAI(Tdu, Tdl), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 48)]), VFNMSI(Tdu, Tdl), ovs, &(xo[0]));
+					TdU = VFMA(LDK(KP707106781), TdT, TdS);
+					Te4 = VFNMS(LDK(KP707106781), TdT, TdS);
+				   }
+			      }
+			      {
+				   V Tcx, TcJ, TcI, Tcy, TcA, Tbm, Tcp, TaZ, Tcs, Tci, Tbv, TcB, TcD, TbT, Tc2;
+				   V TcE, Tat, TaY;
+				   Tcx = VFNMS(LDK(KP707106781), Tas, Tad);
+				   Tat = VFMA(LDK(KP707106781), Tas, Tad);
+				   TaY = VADD(TaI, TaX);
+				   TcJ = VSUB(TaI, TaX);
+				   {
+					V Tce, Tch, Te8, Te6, TdY, Te0;
+					TcI = VFNMS(LDK(KP707106781), Tcd, Tcc);
+					Tce = VFMA(LDK(KP707106781), Tcd, Tcc);
+					Tch = VSUB(Tcf, Tcg);
+					Tcy = VADD(Tcf, Tcg);
+					Te8 = VFNMS(LDK(KP923879532), Te5, Te4);
+					Te6 = VFMA(LDK(KP923879532), Te5, Te4);
+					TdY = VFNMS(LDK(KP923879532), TdX, TdU);
+					Te0 = VFMA(LDK(KP923879532), TdX, TdU);
+					TcA = VFNMS(LDK(KP707106781), Tbl, Tb6);
+					Tbm = VFMA(LDK(KP707106781), Tbl, Tb6);
+					Tcp = VFNMS(LDK(KP923879532), TaY, Tat);
+					TaZ = VFMA(LDK(KP923879532), TaY, Tat);
+					Tcs = VFNMS(LDK(KP923879532), Tch, Tce);
+					Tci = VFMA(LDK(KP923879532), Tch, Tce);
+					ST(&(xo[WS(os, 88)]), VFNMSI(Te6, Te3), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 40)]), VFMAI(Te6, Te3), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 104)]), VFMAI(Te8, Te7), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 24)]), VFNMSI(Te8, Te7), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 8)]), VFMAI(Te0, TdZ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 120)]), VFNMSI(Te0, TdZ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 72)]), VFMAI(TdY, TdR), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 56)]), VFNMSI(TdY, TdR), ovs, &(xo[0]));
+					Tbv = VFMA(LDK(KP707106781), Tbu, Tbt);
+					TcB = VFNMS(LDK(KP707106781), Tbu, Tbt);
+					TcD = VFNMS(LDK(KP707106781), TbS, TbD);
+					TbT = VFMA(LDK(KP707106781), TbS, TbD);
+					Tc2 = VFMA(LDK(KP707106781), Tc1, Tc0);
+					TcE = VFNMS(LDK(KP707106781), Tc1, Tc0);
+				   }
+				   {
+					V TcR, Tcz, TcU, TcK, Tcq, Tcl, Tct, Tc4;
+					{
+					     V Tcj, Tbw, Tck, Tc3;
+					     Tcj = VFMA(LDK(KP198912367), Tbm, Tbv);
+					     Tbw = VFNMS(LDK(KP198912367), Tbv, Tbm);
+					     Tck = VFMA(LDK(KP198912367), TbT, Tc2);
+					     Tc3 = VFNMS(LDK(KP198912367), Tc2, TbT);
+					     TcR = VFNMS(LDK(KP923879532), Tcy, Tcx);
+					     Tcz = VFMA(LDK(KP923879532), Tcy, Tcx);
+					     TcU = VFMA(LDK(KP923879532), TcJ, TcI);
+					     TcK = VFNMS(LDK(KP923879532), TcJ, TcI);
+					     Tcq = VADD(Tcj, Tck);
+					     Tcl = VSUB(Tcj, Tck);
+					     Tct = VSUB(Tbw, Tc3);
+					     Tc4 = VADD(Tbw, Tc3);
+					}
+					{
+					     V TfN, Tel, TfY, Tfu, Tfv, Tfw, TcT, TcX, TcQ, TcO, TcW, TcY, TcP, TcH, TfZ;
+					     V TeE;
+					     {
+						  V Teu, TcS, TcN, TcV, TcG, TeD;
+						  TfN = VFNMS(LDK(KP923879532), Tek, Ted);
+						  Tel = VFMA(LDK(KP923879532), Tek, Ted);
+						  {
+						       V TcL, TcC, Tcr, Tcv;
+						       TcL = VFNMS(LDK(KP668178637), TcA, TcB);
+						       TcC = VFMA(LDK(KP668178637), TcB, TcA);
+						       Tcr = VFNMS(LDK(KP980785280), Tcq, Tcp);
+						       Tcv = VFMA(LDK(KP980785280), Tcq, Tcp);
+						       {
+							    V Tco, Tcm, Tcu, Tcw;
+							    Tco = VFMA(LDK(KP980785280), Tcl, Tci);
+							    Tcm = VFNMS(LDK(KP980785280), Tcl, Tci);
+							    Tcu = VFMA(LDK(KP980785280), Tct, Tcs);
+							    Tcw = VFNMS(LDK(KP980785280), Tct, Tcs);
+							    {
+								 V Tcn, Tc5, TcM, TcF;
+								 Tcn = VFMA(LDK(KP980785280), Tc4, TaZ);
+								 Tc5 = VFNMS(LDK(KP980785280), Tc4, TaZ);
+								 TcM = VFNMS(LDK(KP668178637), TcD, TcE);
+								 TcF = VFMA(LDK(KP668178637), TcE, TcD);
+								 TfY = VFNMS(LDK(KP923879532), Tft, Tfq);
+								 Tfu = VFMA(LDK(KP923879532), Tft, Tfq);
+								 Tfv = VFMA(LDK(KP198912367), Teq, Tet);
+								 Teu = VFNMS(LDK(KP198912367), Tet, Teq);
+								 ST(&(xo[WS(os, 92)]), VFNMSI(Tcu, Tcr), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 36)]), VFMAI(Tcu, Tcr), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 100)]), VFMAI(Tcw, Tcv), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 28)]), VFNMSI(Tcw, Tcv), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 4)]), VFMAI(Tco, Tcn), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 124)]), VFNMSI(Tco, Tcn), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 68)]), VFMAI(Tcm, Tc5), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 60)]), VFNMSI(Tcm, Tc5), ovs, &(xo[0]));
+								 TcS = VADD(TcL, TcM);
+								 TcN = VSUB(TcL, TcM);
+								 TcV = VSUB(TcC, TcF);
+								 TcG = VADD(TcC, TcF);
+								 TeD = VFNMS(LDK(KP198912367), TeC, Tez);
+								 Tfw = VFMA(LDK(KP198912367), Tez, TeC);
+							    }
+						       }
+						  }
+						  TcT = VFMA(LDK(KP831469612), TcS, TcR);
+						  TcX = VFNMS(LDK(KP831469612), TcS, TcR);
+						  TcQ = VFMA(LDK(KP831469612), TcN, TcK);
+						  TcO = VFNMS(LDK(KP831469612), TcN, TcK);
+						  TcW = VFNMS(LDK(KP831469612), TcV, TcU);
+						  TcY = VFMA(LDK(KP831469612), TcV, TcU);
+						  TcP = VFMA(LDK(KP831469612), TcG, Tcz);
+						  TcH = VFNMS(LDK(KP831469612), TcG, Tcz);
+						  TfZ = VSUB(Teu, TeD);
+						  TeE = VADD(Teu, TeD);
+					     }
+					     {
+						  V TfQ, TeS, TfO, Tfx, TeZ, TfR, Tfd, Tfk;
+						  TfQ = VFNMS(LDK(KP923879532), TeR, TeK);
+						  TeS = VFMA(LDK(KP923879532), TeR, TeK);
+						  ST(&(xo[WS(os, 84)]), VFMAI(TcW, TcT), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 44)]), VFNMSI(TcW, TcT), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 108)]), VFNMSI(TcY, TcX), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 20)]), VFMAI(TcY, TcX), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 116)]), VFMAI(TcQ, TcP), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 12)]), VFNMSI(TcQ, TcP), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 52)]), VFMAI(TcO, TcH), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 76)]), VFNMSI(TcO, TcH), ovs, &(xo[0]));
+						  Tg0 = VFNMS(LDK(KP980785280), TfZ, TfY);
+						  Tga = VFMA(LDK(KP980785280), TfZ, TfY);
+						  TfF = VFNMS(LDK(KP980785280), TeE, Tel);
+						  TeF = VFMA(LDK(KP980785280), TeE, Tel);
+						  TfO = VADD(Tfv, Tfw);
+						  Tfx = VSUB(Tfv, Tfw);
+						  TeZ = VFMA(LDK(KP923879532), TeY, TeV);
+						  TfR = VFNMS(LDK(KP923879532), TeY, TeV);
+						  TfT = VFNMS(LDK(KP923879532), Tfc, Tf5);
+						  Tfd = VFMA(LDK(KP923879532), Tfc, Tf5);
+						  Tfk = VFMA(LDK(KP923879532), Tfj, Tfg);
+						  TfU = VFNMS(LDK(KP923879532), Tfj, Tfg);
+						  TfP = VFMA(LDK(KP980785280), TfO, TfN);
+						  Tg7 = VFNMS(LDK(KP980785280), TfO, TfN);
+						  TfI = VFNMS(LDK(KP980785280), Tfx, Tfu);
+						  Tfy = VFMA(LDK(KP980785280), Tfx, Tfu);
+						  Tfz = VFMA(LDK(KP098491403), TeS, TeZ);
+						  Tf0 = VFNMS(LDK(KP098491403), TeZ, TeS);
+						  TfA = VFMA(LDK(KP098491403), Tfd, Tfk);
+						  Tfl = VFNMS(LDK(KP098491403), Tfk, Tfd);
+						  Tg1 = VFNMS(LDK(KP820678790), TfQ, TfR);
+						  TfS = VFMA(LDK(KP820678790), TfR, TfQ);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T8x, T8y, T8F, T8w, T8k, T8f, T8n, T80, T9l, T76, T87, T8U, T89, T7e, T7l;
+			      V T8a;
+			      {
+				   V The, Tho, TgT, Tgp, Th7, Th8, Thf, Th6, Th3, Thl, TgW, TgM, TgU, TgP, TgX;
+				   V TgE;
+				   {
+					V Th1, TgI, TgJ, TgK;
+					{
+					     V Tgh, Thc, Tgk, TfG, TfB, TfJ, Tfm, Tg2, TfV, Tgn, TfL, TfH;
+					     Th1 = VFMA(LDK(KP923879532), Tgg, Tgf);
+					     Tgh = VFNMS(LDK(KP923879532), Tgg, Tgf);
+					     Thc = VFNMS(LDK(KP923879532), TgH, TgG);
+					     TgI = VFMA(LDK(KP923879532), TgH, TgG);
+					     TgJ = VFMA(LDK(KP668178637), Tgi, Tgj);
+					     Tgk = VFNMS(LDK(KP668178637), Tgj, Tgi);
+					     TfG = VADD(Tfz, TfA);
+					     TfB = VSUB(Tfz, TfA);
+					     TfJ = VSUB(Tf0, Tfl);
+					     Tfm = VADD(Tf0, Tfl);
+					     Tg2 = VFNMS(LDK(KP820678790), TfT, TfU);
+					     TfV = VFMA(LDK(KP820678790), TfU, TfT);
+					     Tgn = VFNMS(LDK(KP668178637), Tgm, Tgl);
+					     TgK = VFMA(LDK(KP668178637), Tgl, Tgm);
+					     TfL = VFMA(LDK(KP995184726), TfG, TfF);
+					     TfH = VFNMS(LDK(KP995184726), TfG, TfF);
+					     {
+						  V TfE, TfC, TfM, TfK;
+						  TfE = VFMA(LDK(KP995184726), TfB, Tfy);
+						  TfC = VFNMS(LDK(KP995184726), TfB, Tfy);
+						  TfM = VFNMS(LDK(KP995184726), TfJ, TfI);
+						  TfK = VFMA(LDK(KP995184726), TfJ, TfI);
+						  {
+						       V TfD, Tfn, Tg8, Tg3;
+						       TfD = VFMA(LDK(KP995184726), Tfm, TeF);
+						       Tfn = VFNMS(LDK(KP995184726), Tfm, TeF);
+						       Tg8 = VADD(Tg1, Tg2);
+						       Tg3 = VSUB(Tg1, Tg2);
+						       {
+							    V Tgb, TfW, Thd, Tgo;
+							    Tgb = VSUB(TfS, TfV);
+							    TfW = VADD(TfS, TfV);
+							    Thd = VSUB(Tgk, Tgn);
+							    Tgo = VADD(Tgk, Tgn);
+							    ST(&(xo[WS(os, 98)]), VFMAI(TfM, TfL), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 30)]), VFNMSI(TfM, TfL), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 94)]), VFNMSI(TfK, TfH), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 34)]), VFMAI(TfK, TfH), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 2)]), VFMAI(TfE, TfD), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 126)]), VFNMSI(TfE, TfD), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 66)]), VFMAI(TfC, Tfn), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 62)]), VFNMSI(TfC, Tfn), ovs, &(xo[0]));
+							    {
+								 V Tgd, Tg9, Tg6, Tg4;
+								 Tgd = VFNMS(LDK(KP773010453), Tg8, Tg7);
+								 Tg9 = VFMA(LDK(KP773010453), Tg8, Tg7);
+								 Tg6 = VFMA(LDK(KP773010453), Tg3, Tg0);
+								 Tg4 = VFNMS(LDK(KP773010453), Tg3, Tg0);
+								 {
+								      V Tge, Tgc, Tg5, TfX;
+								      Tge = VFMA(LDK(KP773010453), Tgb, Tga);
+								      Tgc = VFNMS(LDK(KP773010453), Tgb, Tga);
+								      Tg5 = VFMA(LDK(KP773010453), TfW, TfP);
+								      TfX = VFNMS(LDK(KP773010453), TfW, TfP);
+								      The = VFMA(LDK(KP831469612), Thd, Thc);
+								      Tho = VFNMS(LDK(KP831469612), Thd, Thc);
+								      TgT = VFMA(LDK(KP831469612), Tgo, Tgh);
+								      Tgp = VFNMS(LDK(KP831469612), Tgo, Tgh);
+								      ST(&(xo[WS(os, 110)]), VFNMSI(Tge, Tgd), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 18)]), VFMAI(Tge, Tgd), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 82)]), VFMAI(Tgc, Tg9), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 46)]), VFNMSI(Tgc, Tg9), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 114)]), VFMAI(Tg6, Tg5), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 14)]), VFNMSI(Tg6, Tg5), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 50)]), VFMAI(Tg4, TfX), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 78)]), VFNMSI(Tg4, TfX), ovs, &(xo[0]));
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V Th4, Tgs, Tgv, Th5, Tgz, TgC, Th2, TgL;
+					     Th4 = VFMA(LDK(KP923879532), Tgr, Tgq);
+					     Tgs = VFNMS(LDK(KP923879532), Tgr, Tgq);
+					     Tgv = VFMA(LDK(KP923879532), Tgu, Tgt);
+					     Th5 = VFNMS(LDK(KP923879532), Tgu, Tgt);
+					     Th7 = VFMA(LDK(KP923879532), Tgy, Tgx);
+					     Tgz = VFNMS(LDK(KP923879532), Tgy, Tgx);
+					     TgC = VFMA(LDK(KP923879532), TgB, TgA);
+					     Th8 = VFNMS(LDK(KP923879532), TgB, TgA);
+					     Th2 = VADD(TgJ, TgK);
+					     TgL = VSUB(TgJ, TgK);
+					     {
+						  V TgN, Tgw, TgO, TgD;
+						  TgN = VFMA(LDK(KP534511135), Tgs, Tgv);
+						  Tgw = VFNMS(LDK(KP534511135), Tgv, Tgs);
+						  TgO = VFMA(LDK(KP534511135), Tgz, TgC);
+						  TgD = VFNMS(LDK(KP534511135), TgC, Tgz);
+						  Thf = VFNMS(LDK(KP303346683), Th4, Th5);
+						  Th6 = VFMA(LDK(KP303346683), Th5, Th4);
+						  Th3 = VFMA(LDK(KP831469612), Th2, Th1);
+						  Thl = VFNMS(LDK(KP831469612), Th2, Th1);
+						  TgW = VFNMS(LDK(KP831469612), TgL, TgI);
+						  TgM = VFMA(LDK(KP831469612), TgL, TgI);
+						  TgU = VADD(TgN, TgO);
+						  TgP = VSUB(TgN, TgO);
+						  TgX = VSUB(Tgw, TgD);
+						  TgE = VADD(Tgw, TgD);
+					     }
+					}
+				   }
+				   {
+					V T8u, T8v, T7R, T8d, T7G, Thm, Thh, Thp, Tha, T7Y, Thr, Thn;
+					{
+					     V T7y, T7F, TgZ, TgV;
+					     T8u = VFNMS(LDK(KP831469612), T7x, T7q);
+					     T7y = VFMA(LDK(KP831469612), T7x, T7q);
+					     T7F = VFMA(LDK(KP831469612), T7E, T7B);
+					     T8v = VFNMS(LDK(KP831469612), T7E, T7B);
+					     T8x = VFNMS(LDK(KP831469612), T7Q, T7J);
+					     T7R = VFMA(LDK(KP831469612), T7Q, T7J);
+					     TgZ = VFMA(LDK(KP881921264), TgU, TgT);
+					     TgV = VFNMS(LDK(KP881921264), TgU, TgT);
+					     {
+						  V TgS, TgQ, Th0, TgY;
+						  TgS = VFMA(LDK(KP881921264), TgP, TgM);
+						  TgQ = VFNMS(LDK(KP881921264), TgP, TgM);
+						  Th0 = VFNMS(LDK(KP881921264), TgX, TgW);
+						  TgY = VFMA(LDK(KP881921264), TgX, TgW);
+						  {
+						       V TgR, TgF, Thg, Th9;
+						       TgR = VFMA(LDK(KP881921264), TgE, Tgp);
+						       TgF = VFNMS(LDK(KP881921264), TgE, Tgp);
+						       Thg = VFNMS(LDK(KP303346683), Th7, Th8);
+						       Th9 = VFMA(LDK(KP303346683), Th8, Th7);
+						       T8d = VFNMS(LDK(KP148335987), T7y, T7F);
+						       T7G = VFMA(LDK(KP148335987), T7F, T7y);
+						       ST(&(xo[WS(os, 106)]), VFMAI(Th0, TgZ), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 22)]), VFNMSI(Th0, TgZ), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 86)]), VFNMSI(TgY, TgV), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 42)]), VFMAI(TgY, TgV), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 10)]), VFMAI(TgS, TgR), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 118)]), VFNMSI(TgS, TgR), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 74)]), VFMAI(TgQ, TgF), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 54)]), VFNMSI(TgQ, TgF), ovs, &(xo[0]));
+						       Thm = VADD(Thf, Thg);
+						       Thh = VSUB(Thf, Thg);
+						       Thp = VSUB(Th6, Th9);
+						       Tha = VADD(Th6, Th9);
+						       T7Y = VFMA(LDK(KP831469612), T7X, T7U);
+						       T8y = VFNMS(LDK(KP831469612), T7X, T7U);
+						  }
+					     }
+					}
+					Thr = VFNMS(LDK(KP956940335), Thm, Thl);
+					Thn = VFMA(LDK(KP956940335), Thm, Thl);
+					{
+					     V Thk, Thi, Ths, Thq;
+					     Thk = VFMA(LDK(KP956940335), Thh, The);
+					     Thi = VFNMS(LDK(KP956940335), Thh, The);
+					     Ths = VFMA(LDK(KP956940335), Thp, Tho);
+					     Thq = VFNMS(LDK(KP956940335), Thp, Tho);
+					     {
+						  V Thj, Thb, T8e, T7Z;
+						  Thj = VFMA(LDK(KP956940335), Tha, Th3);
+						  Thb = VFNMS(LDK(KP956940335), Tha, Th3);
+						  T8e = VFNMS(LDK(KP148335987), T7R, T7Y);
+						  T7Z = VFMA(LDK(KP148335987), T7Y, T7R);
+						  T8F = VFMA(LDK(KP741650546), T8u, T8v);
+						  T8w = VFNMS(LDK(KP741650546), T8v, T8u);
+						  ST(&(xo[WS(os, 102)]), VFNMSI(Ths, Thr), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 26)]), VFMAI(Ths, Thr), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 90)]), VFMAI(Thq, Thn), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 38)]), VFNMSI(Thq, Thn), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 122)]), VFMAI(Thk, Thj), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 6)]), VFNMSI(Thk, Thj), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 58)]), VFMAI(Thi, Thb), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 70)]), VFNMSI(Thi, Thb), ovs, &(xo[0]));
+						  T8k = VADD(T8d, T8e);
+						  T8f = VSUB(T8d, T8e);
+						  T8n = VSUB(T7G, T7Z);
+						  T80 = VADD(T7G, T7Z);
+					     }
+					}
+					T9l = VSUB(T72, T75);
+					T76 = VADD(T72, T75);
+					T87 = VSUB(T85, T86);
+					T8U = VADD(T85, T86);
+					T89 = VFNMS(LDK(KP303346683), T7a, T7d);
+					T7e = VFMA(LDK(KP303346683), T7d, T7a);
+					T7l = VFMA(LDK(KP303346683), T7k, T7h);
+					T8a = VFNMS(LDK(KP303346683), T7h, T7k);
+				   }
+			      }
+			      {
+				   V T11, T5h, T5a, T55, T5d, T4K, T5C, T5x, T5F, T5q, T4X, T4Z, T1C, T2d, T50;
+				   {
+					V T5k, T3g, T3t, T5l, T5n, T4v, T4I, T5o, T8G, T8z;
+					T5k = VFNMS(LDK(KP980785280), T3f, T2G);
+					T3g = VFMA(LDK(KP980785280), T3f, T2G);
+					T8G = VFMA(LDK(KP741650546), T8x, T8y);
+					T8z = VFNMS(LDK(KP741650546), T8y, T8x);
+					{
+					     V T8r, T77, T8C, T88;
+					     T8r = VFNMS(LDK(KP831469612), T76, T6Z);
+					     T77 = VFMA(LDK(KP831469612), T76, T6Z);
+					     T8C = VFNMS(LDK(KP831469612), T87, T84);
+					     T88 = VFMA(LDK(KP831469612), T87, T84);
+					     {
+						  V T8D, T7m, T8s, T8b;
+						  T8D = VSUB(T7e, T7l);
+						  T7m = VADD(T7e, T7l);
+						  T8s = VADD(T89, T8a);
+						  T8b = VSUB(T89, T8a);
+						  {
+						       V T8M, T8H, T8P, T8A;
+						       T8M = VADD(T8F, T8G);
+						       T8H = VSUB(T8F, T8G);
+						       T8P = VSUB(T8w, T8z);
+						       T8A = VADD(T8w, T8z);
+						       {
+							    V T8E, T8O, T8j, T7n;
+							    T8E = VFMA(LDK(KP956940335), T8D, T8C);
+							    T8O = VFNMS(LDK(KP956940335), T8D, T8C);
+							    T8j = VFNMS(LDK(KP956940335), T7m, T77);
+							    T7n = VFMA(LDK(KP956940335), T7m, T77);
+							    {
+								 V T8t, T8L, T8m, T8c;
+								 T8t = VFNMS(LDK(KP956940335), T8s, T8r);
+								 T8L = VFMA(LDK(KP956940335), T8s, T8r);
+								 T8m = VFNMS(LDK(KP956940335), T8b, T88);
+								 T8c = VFMA(LDK(KP956940335), T8b, T88);
+								 {
+								      V T8K, T8I, T8S, T8Q;
+								      T8K = VFMA(LDK(KP803207531), T8H, T8E);
+								      T8I = VFNMS(LDK(KP803207531), T8H, T8E);
+								      T8S = VFNMS(LDK(KP803207531), T8P, T8O);
+								      T8Q = VFMA(LDK(KP803207531), T8P, T8O);
+								      {
+									   V T8p, T8l, T8h, T81;
+									   T8p = VFNMS(LDK(KP989176509), T8k, T8j);
+									   T8l = VFMA(LDK(KP989176509), T8k, T8j);
+									   T8h = VFMA(LDK(KP989176509), T80, T7n);
+									   T81 = VFNMS(LDK(KP989176509), T80, T7n);
+									   {
+										V T8J, T8B, T8R, T8N;
+										T8J = VFMA(LDK(KP803207531), T8A, T8t);
+										T8B = VFNMS(LDK(KP803207531), T8A, T8t);
+										T8R = VFMA(LDK(KP803207531), T8M, T8L);
+										T8N = VFNMS(LDK(KP803207531), T8M, T8L);
+										{
+										     V T8q, T8o, T8i, T8g;
+										     T8q = VFMA(LDK(KP989176509), T8n, T8m);
+										     T8o = VFNMS(LDK(KP989176509), T8n, T8m);
+										     T8i = VFMA(LDK(KP989176509), T8f, T8c);
+										     T8g = VFNMS(LDK(KP989176509), T8f, T8c);
+										     ST(&(xo[WS(os, 13)]), VFMAI(T8K, T8J), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 115)]), VFNMSI(T8K, T8J), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 77)]), VFMAI(T8I, T8B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 51)]), VFNMSI(T8I, T8B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 109)]), VFMAI(T8S, T8R), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 19)]), VFNMSI(T8S, T8R), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 83)]), VFNMSI(T8Q, T8N), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 45)]), VFMAI(T8Q, T8N), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 99)]), VFNMSI(T8q, T8p), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 29)]), VFMAI(T8q, T8p), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 93)]), VFMAI(T8o, T8l), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 35)]), VFNMSI(T8o, T8l), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 125)]), VFMAI(T8i, T8h), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 3)]), VFNMSI(T8i, T8h), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 61)]), VFMAI(T8g, T81), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 67)]), VFNMSI(T8g, T81), ovs, &(xo[WS(os, 1)]));
+										     T3t = VFMA(LDK(KP980785280), T3s, T3p);
+										     T5l = VFNMS(LDK(KP980785280), T3s, T3p);
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T5n = VFNMS(LDK(KP980785280), T4u, T3V);
+					T4v = VFMA(LDK(KP980785280), T4u, T3V);
+					T4I = VFMA(LDK(KP980785280), T4H, T4E);
+					T5o = VFNMS(LDK(KP980785280), T4H, T4E);
+					{
+					     V T53, T3u, T54, T4J, T5v, T5m, T5w, T5p, T10;
+					     T6b = VSUB(TI, TZ);
+					     T10 = VADD(TI, TZ);
+					     T53 = VFMA(LDK(KP049126849), T3g, T3t);
+					     T3u = VFNMS(LDK(KP049126849), T3t, T3g);
+					     T54 = VFMA(LDK(KP049126849), T4v, T4I);
+					     T4J = VFNMS(LDK(KP049126849), T4I, T4v);
+					     T5v = VFNMS(LDK(KP906347169), T5k, T5l);
+					     T5m = VFMA(LDK(KP906347169), T5l, T5k);
+					     T5w = VFNMS(LDK(KP906347169), T5n, T5o);
+					     T5p = VFMA(LDK(KP906347169), T5o, T5n);
+					     T11 = VFMA(LDK(KP980785280), T10, Tr);
+					     T5h = VFNMS(LDK(KP980785280), T10, Tr);
+					     T5a = VADD(T53, T54);
+					     T55 = VSUB(T53, T54);
+					     T5d = VSUB(T3u, T4J);
+					     T4K = VADD(T3u, T4J);
+					     T5C = VADD(T5v, T5w);
+					     T5x = VSUB(T5v, T5w);
+					     T5F = VSUB(T5m, T5p);
+					     T5q = VADD(T5m, T5p);
+					     T4X = VSUB(T4V, T4W);
+					     T5K = VADD(T4V, T4W);
+					}
+					T4Z = VFMA(LDK(KP098491403), T1s, T1B);
+					T1C = VFNMS(LDK(KP098491403), T1B, T1s);
+					T2d = VFNMS(LDK(KP098491403), T2c, T23);
+					T50 = VFMA(LDK(KP098491403), T23, T2c);
+				   }
+				   {
+					V T9y, T9t, T9B, T9i, T9n, T9o, T9F, T8V, T9Q, T9m, T9R, T92, Ta0, T9V, Ta3;
+					V T9O;
+					{
+					     V T9I, T9J, T9L, T9d, T5s, T4Y, T5t, T2e, T5i, T51, T9r, T9a, T9g, T9M, T96;
+					     V T99;
+					     T9I = VFMA(LDK(KP831469612), T95, T94);
+					     T96 = VFNMS(LDK(KP831469612), T95, T94);
+					     T99 = VFNMS(LDK(KP831469612), T98, T97);
+					     T9J = VFMA(LDK(KP831469612), T98, T97);
+					     T9L = VFMA(LDK(KP831469612), T9c, T9b);
+					     T9d = VFNMS(LDK(KP831469612), T9c, T9b);
+					     T5s = VFNMS(LDK(KP980785280), T4X, T4U);
+					     T4Y = VFMA(LDK(KP980785280), T4X, T4U);
+					     T5t = VSUB(T1C, T2d);
+					     T2e = VADD(T1C, T2d);
+					     T5i = VADD(T4Z, T50);
+					     T51 = VSUB(T4Z, T50);
+					     T9r = VFNMS(LDK(KP599376933), T96, T99);
+					     T9a = VFMA(LDK(KP599376933), T99, T96);
+					     T9g = VFNMS(LDK(KP831469612), T9f, T9e);
+					     T9M = VFMA(LDK(KP831469612), T9f, T9e);
+					     {
+						  V T5u, T5E, T8Y, T91;
+						  T5u = VFNMS(LDK(KP995184726), T5t, T5s);
+						  T5E = VFMA(LDK(KP995184726), T5t, T5s);
+						  {
+						       V T59, T2f, T5j, T5B;
+						       T59 = VFNMS(LDK(KP995184726), T2e, T11);
+						       T2f = VFMA(LDK(KP995184726), T2e, T11);
+						       T5j = VFMA(LDK(KP995184726), T5i, T5h);
+						       T5B = VFNMS(LDK(KP995184726), T5i, T5h);
+						       {
+							    V T5c, T52, T9s, T9h;
+							    T5c = VFNMS(LDK(KP995184726), T51, T4Y);
+							    T52 = VFMA(LDK(KP995184726), T51, T4Y);
+							    T9s = VFNMS(LDK(KP599376933), T9d, T9g);
+							    T9h = VFMA(LDK(KP599376933), T9g, T9d);
+							    {
+								 V T5A, T5y, T5I, T5G;
+								 T5A = VFMA(LDK(KP740951125), T5x, T5u);
+								 T5y = VFNMS(LDK(KP740951125), T5x, T5u);
+								 T5I = VFMA(LDK(KP740951125), T5F, T5E);
+								 T5G = VFNMS(LDK(KP740951125), T5F, T5E);
+								 {
+								      V T5f, T5b, T57, T4L;
+								      T5f = VFMA(LDK(KP998795456), T5a, T59);
+								      T5b = VFNMS(LDK(KP998795456), T5a, T59);
+								      T57 = VFMA(LDK(KP998795456), T4K, T2f);
+								      T4L = VFNMS(LDK(KP998795456), T4K, T2f);
+								      {
+									   V T5z, T5r, T5H, T5D;
+									   T5z = VFMA(LDK(KP740951125), T5q, T5j);
+									   T5r = VFNMS(LDK(KP740951125), T5q, T5j);
+									   T5H = VFNMS(LDK(KP740951125), T5C, T5B);
+									   T5D = VFMA(LDK(KP740951125), T5C, T5B);
+									   {
+										V T5g, T5e, T58, T56;
+										T5g = VFNMS(LDK(KP998795456), T5d, T5c);
+										T5e = VFMA(LDK(KP998795456), T5d, T5c);
+										T58 = VFMA(LDK(KP998795456), T55, T52);
+										T56 = VFNMS(LDK(KP998795456), T55, T52);
+										T9y = VADD(T9r, T9s);
+										T9t = VSUB(T9r, T9s);
+										T9B = VSUB(T9a, T9h);
+										T9i = VADD(T9a, T9h);
+										ST(&(xo[WS(os, 113)]), VFMAI(T5A, T5z), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 15)]), VFNMSI(T5A, T5z), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 49)]), VFMAI(T5y, T5r), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 79)]), VFNMSI(T5y, T5r), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 111)]), VFNMSI(T5I, T5H), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 17)]), VFMAI(T5I, T5H), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 81)]), VFMAI(T5G, T5D), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 47)]), VFNMSI(T5G, T5D), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 97)]), VFMAI(T5g, T5f), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 31)]), VFNMSI(T5g, T5f), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 95)]), VFNMSI(T5e, T5b), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 33)]), VFMAI(T5e, T5b), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 1)]), VFMAI(T58, T57), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 127)]), VFNMSI(T58, T57), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 65)]), VFMAI(T56, T4L), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 63)]), VFNMSI(T56, T4L), ovs, &(xo[WS(os, 1)]));
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+						  T9n = VFNMS(LDK(KP534511135), T8W, T8X);
+						  T8Y = VFMA(LDK(KP534511135), T8X, T8W);
+						  T91 = VFMA(LDK(KP534511135), T90, T8Z);
+						  T9o = VFNMS(LDK(KP534511135), T8Z, T90);
+						  {
+						       V T9T, T9K, T9U, T9N;
+						       T9T = VFMA(LDK(KP250486960), T9I, T9J);
+						       T9K = VFNMS(LDK(KP250486960), T9J, T9I);
+						       T9U = VFMA(LDK(KP250486960), T9L, T9M);
+						       T9N = VFNMS(LDK(KP250486960), T9M, T9L);
+						       T9F = VFNMS(LDK(KP831469612), T8U, T8T);
+						       T8V = VFMA(LDK(KP831469612), T8U, T8T);
+						       T9Q = VFMA(LDK(KP831469612), T9l, T9k);
+						       T9m = VFNMS(LDK(KP831469612), T9l, T9k);
+						       T9R = VSUB(T8Y, T91);
+						       T92 = VADD(T8Y, T91);
+						       Ta0 = VADD(T9T, T9U);
+						       T9V = VSUB(T9T, T9U);
+						       Ta3 = VSUB(T9K, T9N);
+						       T9O = VADD(T9K, T9N);
+						  }
+					     }
+					}
+					{
+					     V T6y, T6z, T63, T9Y, T9W, Ta6, Ta4, T9D, T9z, T9v, T9j, T6h, T60, T9H, T9Z;
+					     V T9A, T9q, T66, T9X, T9P;
+					     {
+						  V T5W, T9S, Ta2, T9x, T93, T5Z, T9G, T9p;
+						  T6y = VFMA(LDK(KP980785280), T5V, T5U);
+						  T5W = VFNMS(LDK(KP980785280), T5V, T5U);
+						  T9S = VFMA(LDK(KP881921264), T9R, T9Q);
+						  Ta2 = VFNMS(LDK(KP881921264), T9R, T9Q);
+						  T9x = VFNMS(LDK(KP881921264), T92, T8V);
+						  T93 = VFMA(LDK(KP881921264), T92, T8V);
+						  T5Z = VFMA(LDK(KP980785280), T5Y, T5X);
+						  T6z = VFNMS(LDK(KP980785280), T5Y, T5X);
+						  T6B = VFMA(LDK(KP980785280), T62, T61);
+						  T63 = VFNMS(LDK(KP980785280), T62, T61);
+						  T9G = VADD(T9n, T9o);
+						  T9p = VSUB(T9n, T9o);
+						  T9Y = VFMA(LDK(KP970031253), T9V, T9S);
+						  T9W = VFNMS(LDK(KP970031253), T9V, T9S);
+						  Ta6 = VFNMS(LDK(KP970031253), Ta3, Ta2);
+						  Ta4 = VFMA(LDK(KP970031253), Ta3, Ta2);
+						  T9D = VFNMS(LDK(KP857728610), T9y, T9x);
+						  T9z = VFMA(LDK(KP857728610), T9y, T9x);
+						  T9v = VFMA(LDK(KP857728610), T9i, T93);
+						  T9j = VFNMS(LDK(KP857728610), T9i, T93);
+						  T6h = VFMA(LDK(KP472964775), T5W, T5Z);
+						  T60 = VFNMS(LDK(KP472964775), T5Z, T5W);
+						  T9H = VFMA(LDK(KP881921264), T9G, T9F);
+						  T9Z = VFNMS(LDK(KP881921264), T9G, T9F);
+						  T9A = VFMA(LDK(KP881921264), T9p, T9m);
+						  T9q = VFNMS(LDK(KP881921264), T9p, T9m);
+						  T66 = VFMA(LDK(KP980785280), T65, T64);
+						  T6C = VFNMS(LDK(KP980785280), T65, T64);
+					     }
+					     T9X = VFMA(LDK(KP970031253), T9O, T9H);
+					     T9P = VFNMS(LDK(KP970031253), T9O, T9H);
+					     {
+						  V Ta5, Ta1, T9E, T9C;
+						  Ta5 = VFMA(LDK(KP970031253), Ta0, T9Z);
+						  Ta1 = VFNMS(LDK(KP970031253), Ta0, T9Z);
+						  T9E = VFMA(LDK(KP857728610), T9B, T9A);
+						  T9C = VFNMS(LDK(KP857728610), T9B, T9A);
+						  {
+						       V T9w, T9u, T6i, T67;
+						       T9w = VFMA(LDK(KP857728610), T9t, T9q);
+						       T9u = VFNMS(LDK(KP857728610), T9t, T9q);
+						       T6i = VFMA(LDK(KP472964775), T63, T66);
+						       T67 = VFNMS(LDK(KP472964775), T66, T63);
+						       T6J = VFNMS(LDK(KP357805721), T6y, T6z);
+						       T6A = VFMA(LDK(KP357805721), T6z, T6y);
+						       ST(&(xo[WS(os, 5)]), VFMAI(T9Y, T9X), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 123)]), VFNMSI(T9Y, T9X), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 69)]), VFMAI(T9W, T9P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 59)]), VFNMSI(T9W, T9P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 101)]), VFMAI(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 27)]), VFNMSI(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 91)]), VFNMSI(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 37)]), VFMAI(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 107)]), VFNMSI(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 21)]), VFMAI(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 85)]), VFMAI(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 43)]), VFNMSI(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 117)]), VFMAI(T9w, T9v), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 11)]), VFNMSI(T9w, T9v), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 53)]), VFMAI(T9u, T9j), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 75)]), VFNMSI(T9u, T9j), ovs, &(xo[WS(os, 1)]));
+						       T6o = VADD(T6h, T6i);
+						       T6j = VSUB(T6h, T6i);
+						       T6r = VSUB(T60, T67);
+						       T68 = VADD(T60, T67);
+						  }
+					     }
+					     T6d = VFMA(LDK(KP820678790), T5M, T5N);
+					     T5O = VFNMS(LDK(KP820678790), T5N, T5M);
+					     T5R = VFNMS(LDK(KP820678790), T5Q, T5P);
+					     T6e = VFMA(LDK(KP820678790), T5P, T5Q);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T6D = VFMA(LDK(KP357805721), T6C, T6B);
+	       T6K = VFNMS(LDK(KP357805721), T6B, T6C);
+	       {
+		    V T5L, T6v, T6c, T6G;
+		    T5L = VFNMS(LDK(KP980785280), T5K, T5J);
+		    T6v = VFMA(LDK(KP980785280), T5K, T5J);
+		    T6c = VFMA(LDK(KP980785280), T6b, T6a);
+		    T6G = VFNMS(LDK(KP980785280), T6b, T6a);
+		    {
+			 V T5S, T6H, T6f, T6w;
+			 T5S = VADD(T5O, T5R);
+			 T6H = VSUB(T5O, T5R);
+			 T6f = VSUB(T6d, T6e);
+			 T6w = VADD(T6d, T6e);
+			 {
+			      V T6L, T6Q, T6E, T6T;
+			      T6L = VSUB(T6J, T6K);
+			      T6Q = VADD(T6J, T6K);
+			      T6E = VADD(T6A, T6D);
+			      T6T = VSUB(T6A, T6D);
+			      {
+				   V T6S, T6I, T5T, T6n;
+				   T6S = VFNMS(LDK(KP773010453), T6H, T6G);
+				   T6I = VFMA(LDK(KP773010453), T6H, T6G);
+				   T5T = VFNMS(LDK(KP773010453), T5S, T5L);
+				   T6n = VFMA(LDK(KP773010453), T5S, T5L);
+				   {
+					V T6P, T6x, T6g, T6q;
+					T6P = VFNMS(LDK(KP773010453), T6w, T6v);
+					T6x = VFMA(LDK(KP773010453), T6w, T6v);
+					T6g = VFMA(LDK(KP773010453), T6f, T6c);
+					T6q = VFNMS(LDK(KP773010453), T6f, T6c);
+					{
+					     V T6M, T6O, T6U, T6W;
+					     T6M = VFNMS(LDK(KP941544065), T6L, T6I);
+					     T6O = VFMA(LDK(KP941544065), T6L, T6I);
+					     T6U = VFNMS(LDK(KP941544065), T6T, T6S);
+					     T6W = VFMA(LDK(KP941544065), T6T, T6S);
+					     {
+						  V T6p, T6t, T69, T6l;
+						  T6p = VFNMS(LDK(KP903989293), T6o, T6n);
+						  T6t = VFMA(LDK(KP903989293), T6o, T6n);
+						  T69 = VFNMS(LDK(KP903989293), T68, T5T);
+						  T6l = VFMA(LDK(KP903989293), T68, T5T);
+						  {
+						       V T6F, T6N, T6R, T6V;
+						       T6F = VFNMS(LDK(KP941544065), T6E, T6x);
+						       T6N = VFMA(LDK(KP941544065), T6E, T6x);
+						       T6R = VFMA(LDK(KP941544065), T6Q, T6P);
+						       T6V = VFNMS(LDK(KP941544065), T6Q, T6P);
+						       {
+							    V T6s, T6u, T6k, T6m;
+							    T6s = VFMA(LDK(KP903989293), T6r, T6q);
+							    T6u = VFNMS(LDK(KP903989293), T6r, T6q);
+							    T6k = VFNMS(LDK(KP903989293), T6j, T6g);
+							    T6m = VFMA(LDK(KP903989293), T6j, T6g);
+							    ST(&(xo[WS(os, 121)]), VFMAI(T6O, T6N), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 7)]), VFNMSI(T6O, T6N), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 57)]), VFMAI(T6M, T6F), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 71)]), VFNMSI(T6M, T6F), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 103)]), VFNMSI(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 25)]), VFMAI(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 89)]), VFMAI(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 39)]), VFNMSI(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 105)]), VFMAI(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 23)]), VFNMSI(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 87)]), VFNMSI(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 41)]), VFMAI(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 9)]), VFMAI(T6m, T6l), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 119)]), VFNMSI(T6m, T6l), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 73)]), VFMAI(T6k, T69), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 55)]), VFNMSI(T6k, T69), ovs, &(xo[WS(os, 1)]));
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 128, XSIMD_STRING("n1bv_128"), {440, 0, 642, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_128) (planner *p) {
+     X(kdft_register) (p, n1bv_128, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 128 -name n1bv_128 -include n1b.h */
+
+/*
+ * This function contains 1082 FP additions, 330 FP multiplications,
+ * (or, 938 additions, 186 multiplications, 144 fused multiply/add),
+ * 194 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_128(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP146730474, +0.146730474455361751658850129646717819706215317);
+     DVK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DVK(KP595699304, +0.595699304492433343467036528829969889511926338);
+     DVK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DVK(KP049067674, +0.049067674327418014254954976942682658314745363);
+     DVK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DVK(KP671558954, +0.671558954847018400625376850427421803228750632);
+     DVK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DVK(KP514102744, +0.514102744193221726593693838968815772608049120);
+     DVK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DVK(KP242980179, +0.242980179903263889948274162077471118320990783);
+     DVK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DVK(KP427555093, +0.427555093430282094320966856888798534304578629);
+     DVK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DVK(KP336889853, +0.336889853392220050689253212619147570477766780);
+     DVK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V T49, T6e, Tev, TgK, TfA, TgL, T4U, T5J, T7R, T9o, Tah, TdG, Tcw, TdB, T84;
+	       V T8T, Tfk, Tfo, T1G, T64, Tgs, Th6, T2p, T62, T7t, T9c, Tce, Tdm, T7i, T9e;
+	       V Tc8, Tdp, TgF, TgG, T4q, T4V, TeC, Tfx, T4H, T4W, T7X, T86, Tcr, TdH, T7U;
+	       V T85, Taw, TdC, Tf3, Tf7, Tr, T5X, Tgl, Th3, T1a, T5V, T7a, T95, TbD, Tdf;
+	       V T6Z, T97, Tbx, Tdi, Tgy, Tgz, TgA, TaN, Tdv, TeK, Tfu, T2W, T5M, T35, T5N;
+	       V T7F, T8X, TaI, Tdu, T7C, T8W, TgB, TgC, TgD, Tb4, Tdy, TeR, Tfv, T3x, T5P;
+	       V T3G, T5Q, T7M, T90, TaZ, Tdx, T7J, T8Z, Tbm, Tdg, TbG, Tdj, Tgo, Th4, Tf0;
+	       V Tf8, T76, T98, T7d, T94, T10, T5Y, T1d, T5U, TbX, Tdn, Tch, Tdq, Tgv, Th7;
+	       V Tfh, Tfp, T7p, T9f, T7w, T9b, T2f, T65, T2s, T61;
+	       {
+		    V T47, Ta8, T4O, Ta7, T44, Tcu, T4P, Tct, Taa, Tab, T3P, Tac, T4R, Tad, Tae;
+		    V T3W, Taf, T4S;
+		    {
+			 V T45, T46, T4M, T4N;
+			 T45 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T46 = LD(&(xi[WS(is, 96)]), ivs, &(xi[0]));
+			 T47 = VSUB(T45, T46);
+			 Ta8 = VADD(T45, T46);
+			 T4M = LD(&(xi[0]), ivs, &(xi[0]));
+			 T4N = LD(&(xi[WS(is, 64)]), ivs, &(xi[0]));
+			 T4O = VSUB(T4M, T4N);
+			 Ta7 = VADD(T4M, T4N);
+		    }
+		    {
+			 V T3Y, T3Z, T40, T41, T42, T43;
+			 T3Y = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T3Z = LD(&(xi[WS(is, 80)]), ivs, &(xi[0]));
+			 T40 = VSUB(T3Y, T3Z);
+			 T41 = LD(&(xi[WS(is, 112)]), ivs, &(xi[0]));
+			 T42 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T43 = VSUB(T41, T42);
+			 T44 = VMUL(LDK(KP707106781), VSUB(T40, T43));
+			 Tcu = VADD(T41, T42);
+			 T4P = VMUL(LDK(KP707106781), VADD(T40, T43));
+			 Tct = VADD(T3Y, T3Z);
+		    }
+		    {
+			 V T3L, T3O, T3S, T3V;
+			 {
+			      V T3J, T3K, T3M, T3N;
+			      T3J = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T3K = LD(&(xi[WS(is, 72)]), ivs, &(xi[0]));
+			      T3L = VSUB(T3J, T3K);
+			      Taa = VADD(T3J, T3K);
+			      T3M = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      T3N = LD(&(xi[WS(is, 104)]), ivs, &(xi[0]));
+			      T3O = VSUB(T3M, T3N);
+			      Tab = VADD(T3M, T3N);
+			 }
+			 T3P = VFNMS(LDK(KP382683432), T3O, VMUL(LDK(KP923879532), T3L));
+			 Tac = VSUB(Taa, Tab);
+			 T4R = VFMA(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3O));
+			 {
+			      V T3Q, T3R, T3T, T3U;
+			      T3Q = LD(&(xi[WS(is, 120)]), ivs, &(xi[0]));
+			      T3R = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      T3S = VSUB(T3Q, T3R);
+			      Tad = VADD(T3Q, T3R);
+			      T3T = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T3U = LD(&(xi[WS(is, 88)]), ivs, &(xi[0]));
+			      T3V = VSUB(T3T, T3U);
+			      Tae = VADD(T3T, T3U);
+			 }
+			 T3W = VFMA(LDK(KP923879532), T3S, VMUL(LDK(KP382683432), T3V));
+			 Taf = VSUB(Tad, Tae);
+			 T4S = VFNMS(LDK(KP382683432), T3S, VMUL(LDK(KP923879532), T3V));
+		    }
+		    {
+			 V T3X, T48, Tet, Teu;
+			 T3X = VSUB(T3P, T3W);
+			 T48 = VSUB(T44, T47);
+			 T49 = VSUB(T3X, T48);
+			 T6e = VADD(T48, T3X);
+			 Tet = VADD(Ta7, Ta8);
+			 Teu = VADD(Tct, Tcu);
+			 Tev = VSUB(Tet, Teu);
+			 TgK = VADD(Tet, Teu);
+		    }
+		    {
+			 V Tfy, Tfz, T4Q, T4T;
+			 Tfy = VADD(Taa, Tab);
+			 Tfz = VADD(Tad, Tae);
+			 TfA = VSUB(Tfy, Tfz);
+			 TgL = VADD(Tfy, Tfz);
+			 T4Q = VSUB(T4O, T4P);
+			 T4T = VSUB(T4R, T4S);
+			 T4U = VSUB(T4Q, T4T);
+			 T5J = VADD(T4Q, T4T);
+		    }
+		    {
+			 V T7P, T7Q, Ta9, Tag;
+			 T7P = VADD(T4R, T4S);
+			 T7Q = VADD(T47, T44);
+			 T7R = VSUB(T7P, T7Q);
+			 T9o = VADD(T7Q, T7P);
+			 Ta9 = VSUB(Ta7, Ta8);
+			 Tag = VMUL(LDK(KP707106781), VADD(Tac, Taf));
+			 Tah = VSUB(Ta9, Tag);
+			 TdG = VADD(Ta9, Tag);
+		    }
+		    {
+			 V Tcs, Tcv, T82, T83;
+			 Tcs = VMUL(LDK(KP707106781), VSUB(Tac, Taf));
+			 Tcv = VSUB(Tct, Tcu);
+			 Tcw = VSUB(Tcs, Tcv);
+			 TdB = VADD(Tcv, Tcs);
+			 T82 = VADD(T4O, T4P);
+			 T83 = VADD(T3P, T3W);
+			 T84 = VSUB(T82, T83);
+			 T8T = VADD(T82, T83);
+		    }
+	       }
+	       {
+		    V Tca, Tcb, T1i, Tfm, T2n, Tc5, Tc6, T1p, Tfn, T2k, T1x, Tfi, T2h, Tc0, T1E;
+		    V Tfj, T2i, Tc3, T1l, T1o, Tcc, Tcd;
+		    {
+			 V T1g, T1h, T2l, T2m;
+			 T1g = LD(&(xi[WS(is, 127)]), ivs, &(xi[WS(is, 1)]));
+			 T1h = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 Tca = VADD(T1g, T1h);
+			 T2l = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T2m = LD(&(xi[WS(is, 95)]), ivs, &(xi[WS(is, 1)]));
+			 Tcb = VADD(T2l, T2m);
+			 T1i = VSUB(T1g, T1h);
+			 Tfm = VADD(Tca, Tcb);
+			 T2n = VSUB(T2l, T2m);
+		    }
+		    {
+			 V T1j, T1k, T1m, T1n;
+			 T1j = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T1k = LD(&(xi[WS(is, 79)]), ivs, &(xi[WS(is, 1)]));
+			 T1l = VSUB(T1j, T1k);
+			 Tc5 = VADD(T1j, T1k);
+			 T1m = LD(&(xi[WS(is, 111)]), ivs, &(xi[WS(is, 1)]));
+			 T1n = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T1o = VSUB(T1m, T1n);
+			 Tc6 = VADD(T1m, T1n);
+		    }
+		    T1p = VMUL(LDK(KP707106781), VADD(T1l, T1o));
+		    Tfn = VADD(Tc5, Tc6);
+		    T2k = VMUL(LDK(KP707106781), VSUB(T1l, T1o));
+		    {
+			 V T1t, TbY, T1w, TbZ;
+			 {
+			      V T1r, T1s, T1u, T1v;
+			      T1r = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      T1s = LD(&(xi[WS(is, 71)]), ivs, &(xi[WS(is, 1)]));
+			      T1t = VSUB(T1r, T1s);
+			      TbY = VADD(T1r, T1s);
+			      T1u = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			      T1v = LD(&(xi[WS(is, 103)]), ivs, &(xi[WS(is, 1)]));
+			      T1w = VSUB(T1u, T1v);
+			      TbZ = VADD(T1u, T1v);
+			 }
+			 T1x = VFMA(LDK(KP382683432), T1t, VMUL(LDK(KP923879532), T1w));
+			 Tfi = VADD(TbY, TbZ);
+			 T2h = VFNMS(LDK(KP382683432), T1w, VMUL(LDK(KP923879532), T1t));
+			 Tc0 = VSUB(TbY, TbZ);
+		    }
+		    {
+			 V T1A, Tc2, T1D, Tc1;
+			 {
+			      V T1y, T1z, T1B, T1C;
+			      T1y = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			      T1z = LD(&(xi[WS(is, 87)]), ivs, &(xi[WS(is, 1)]));
+			      T1A = VSUB(T1y, T1z);
+			      Tc2 = VADD(T1y, T1z);
+			      T1B = LD(&(xi[WS(is, 119)]), ivs, &(xi[WS(is, 1)]));
+			      T1C = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			      T1D = VSUB(T1B, T1C);
+			      Tc1 = VADD(T1B, T1C);
+			 }
+			 T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A));
+			 Tfj = VADD(Tc1, Tc2);
+			 T2i = VFMA(LDK(KP923879532), T1D, VMUL(LDK(KP382683432), T1A));
+			 Tc3 = VSUB(Tc1, Tc2);
+		    }
+		    Tfk = VSUB(Tfi, Tfj);
+		    Tfo = VSUB(Tfm, Tfn);
+		    {
+			 V T1q, T1F, Tgq, Tgr;
+			 T1q = VSUB(T1i, T1p);
+			 T1F = VSUB(T1x, T1E);
+			 T1G = VSUB(T1q, T1F);
+			 T64 = VADD(T1q, T1F);
+			 Tgq = VADD(Tfm, Tfn);
+			 Tgr = VADD(Tfi, Tfj);
+			 Tgs = VSUB(Tgq, Tgr);
+			 Th6 = VADD(Tgq, Tgr);
+		    }
+		    {
+			 V T2j, T2o, T7r, T7s;
+			 T2j = VSUB(T2h, T2i);
+			 T2o = VSUB(T2k, T2n);
+			 T2p = VSUB(T2j, T2o);
+			 T62 = VADD(T2o, T2j);
+			 T7r = VADD(T1x, T1E);
+			 T7s = VADD(T2n, T2k);
+			 T7t = VSUB(T7r, T7s);
+			 T9c = VADD(T7s, T7r);
+		    }
+		    Tcc = VSUB(Tca, Tcb);
+		    Tcd = VMUL(LDK(KP707106781), VADD(Tc0, Tc3));
+		    Tce = VSUB(Tcc, Tcd);
+		    Tdm = VADD(Tcc, Tcd);
+		    {
+			 V T7g, T7h, Tc4, Tc7;
+			 T7g = VADD(T1i, T1p);
+			 T7h = VADD(T2h, T2i);
+			 T7i = VSUB(T7g, T7h);
+			 T9e = VADD(T7g, T7h);
+			 Tc4 = VMUL(LDK(KP707106781), VSUB(Tc0, Tc3));
+			 Tc7 = VSUB(Tc5, Tc6);
+			 Tc8 = VSUB(Tc4, Tc7);
+			 Tdp = VADD(Tc7, Tc4);
+		    }
+	       }
+	       {
+		    V T4c, Tew, T4o, Tak, T4A, Tez, T4E, Tau, T4j, Tex, T4l, Tan, T4x, TeA, T4F;
+		    V Tar, Tcp, Tcq;
+		    {
+			 V T4a, T4b, Tai, T4m, T4n, Taj;
+			 T4a = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T4b = LD(&(xi[WS(is, 68)]), ivs, &(xi[0]));
+			 Tai = VADD(T4a, T4b);
+			 T4m = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 T4n = LD(&(xi[WS(is, 100)]), ivs, &(xi[0]));
+			 Taj = VADD(T4m, T4n);
+			 T4c = VSUB(T4a, T4b);
+			 Tew = VADD(Tai, Taj);
+			 T4o = VSUB(T4m, T4n);
+			 Tak = VSUB(Tai, Taj);
+		    }
+		    {
+			 V T4y, T4z, Tat, T4C, T4D, Tas;
+			 T4y = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 T4z = LD(&(xi[WS(is, 92)]), ivs, &(xi[0]));
+			 Tat = VADD(T4y, T4z);
+			 T4C = LD(&(xi[WS(is, 124)]), ivs, &(xi[0]));
+			 T4D = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tas = VADD(T4C, T4D);
+			 T4A = VSUB(T4y, T4z);
+			 Tez = VADD(Tas, Tat);
+			 T4E = VSUB(T4C, T4D);
+			 Tau = VSUB(Tas, Tat);
+		    }
+		    {
+			 V T4f, Tal, T4i, Tam;
+			 {
+			      V T4d, T4e, T4g, T4h;
+			      T4d = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      T4e = LD(&(xi[WS(is, 84)]), ivs, &(xi[0]));
+			      T4f = VSUB(T4d, T4e);
+			      Tal = VADD(T4d, T4e);
+			      T4g = LD(&(xi[WS(is, 116)]), ivs, &(xi[0]));
+			      T4h = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			      T4i = VSUB(T4g, T4h);
+			      Tam = VADD(T4g, T4h);
+			 }
+			 T4j = VMUL(LDK(KP707106781), VADD(T4f, T4i));
+			 Tex = VADD(Tal, Tam);
+			 T4l = VMUL(LDK(KP707106781), VSUB(T4f, T4i));
+			 Tan = VSUB(Tal, Tam);
+		    }
+		    {
+			 V T4t, Tap, T4w, Taq;
+			 {
+			      V T4r, T4s, T4u, T4v;
+			      T4r = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      T4s = LD(&(xi[WS(is, 76)]), ivs, &(xi[0]));
+			      T4t = VSUB(T4r, T4s);
+			      Tap = VADD(T4r, T4s);
+			      T4u = LD(&(xi[WS(is, 108)]), ivs, &(xi[0]));
+			      T4v = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			      T4w = VSUB(T4u, T4v);
+			      Taq = VADD(T4u, T4v);
+			 }
+			 T4x = VMUL(LDK(KP707106781), VSUB(T4t, T4w));
+			 TeA = VADD(Tap, Taq);
+			 T4F = VMUL(LDK(KP707106781), VADD(T4t, T4w));
+			 Tar = VSUB(Tap, Taq);
+		    }
+		    TgF = VADD(Tew, Tex);
+		    TgG = VADD(Tez, TeA);
+		    {
+			 V T4k, T4p, Tey, TeB;
+			 T4k = VSUB(T4c, T4j);
+			 T4p = VSUB(T4l, T4o);
+			 T4q = VFNMS(LDK(KP555570233), T4p, VMUL(LDK(KP831469612), T4k));
+			 T4V = VFMA(LDK(KP831469612), T4p, VMUL(LDK(KP555570233), T4k));
+			 Tey = VSUB(Tew, Tex);
+			 TeB = VSUB(Tez, TeA);
+			 TeC = VMUL(LDK(KP707106781), VADD(Tey, TeB));
+			 Tfx = VMUL(LDK(KP707106781), VSUB(Tey, TeB));
+		    }
+		    {
+			 V T4B, T4G, T7V, T7W;
+			 T4B = VSUB(T4x, T4A);
+			 T4G = VSUB(T4E, T4F);
+			 T4H = VFMA(LDK(KP555570233), T4B, VMUL(LDK(KP831469612), T4G));
+			 T4W = VFNMS(LDK(KP555570233), T4G, VMUL(LDK(KP831469612), T4B));
+			 T7V = VADD(T4A, T4x);
+			 T7W = VADD(T4E, T4F);
+			 T7X = VFMA(LDK(KP195090322), T7V, VMUL(LDK(KP980785280), T7W));
+			 T86 = VFNMS(LDK(KP195090322), T7W, VMUL(LDK(KP980785280), T7V));
+		    }
+		    Tcp = VFNMS(LDK(KP382683432), Tan, VMUL(LDK(KP923879532), Tak));
+		    Tcq = VFMA(LDK(KP923879532), Tau, VMUL(LDK(KP382683432), Tar));
+		    Tcr = VSUB(Tcp, Tcq);
+		    TdH = VADD(Tcp, Tcq);
+		    {
+			 V T7S, T7T, Tao, Tav;
+			 T7S = VADD(T4c, T4j);
+			 T7T = VADD(T4o, T4l);
+			 T7U = VFNMS(LDK(KP195090322), T7T, VMUL(LDK(KP980785280), T7S));
+			 T85 = VFMA(LDK(KP980785280), T7T, VMUL(LDK(KP195090322), T7S));
+			 Tao = VFMA(LDK(KP382683432), Tak, VMUL(LDK(KP923879532), Tan));
+			 Tav = VFNMS(LDK(KP382683432), Tau, VMUL(LDK(KP923879532), Tar));
+			 Taw = VSUB(Tao, Tav);
+			 TdC = VADD(Tao, Tav);
+		    }
+	       }
+	       {
+		    V Tbz, TbA, T3, Tf5, T18, Tbu, Tbv, Ta, Tf6, T15, Ti, Tf1, T12, Tbp, Tp;
+		    V Tf2, T13, Tbs, T6, T9, TbB, TbC;
+		    {
+			 V T1, T2, T16, T17;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 65)]), ivs, &(xi[WS(is, 1)]));
+			 Tbz = VADD(T1, T2);
+			 T16 = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T17 = LD(&(xi[WS(is, 97)]), ivs, &(xi[WS(is, 1)]));
+			 TbA = VADD(T16, T17);
+			 T3 = VSUB(T1, T2);
+			 Tf5 = VADD(Tbz, TbA);
+			 T18 = VSUB(T16, T17);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 81)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tbu = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 113)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VSUB(T7, T8);
+			 Tbv = VADD(T7, T8);
+		    }
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tf6 = VADD(Tbu, Tbv);
+		    T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
+		    {
+			 V Te, Tbn, Th, Tbo;
+			 {
+			      V Tc, Td, Tf, Tg;
+			      Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      Td = LD(&(xi[WS(is, 73)]), ivs, &(xi[WS(is, 1)]));
+			      Te = VSUB(Tc, Td);
+			      Tbn = VADD(Tc, Td);
+			      Tf = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			      Tg = LD(&(xi[WS(is, 105)]), ivs, &(xi[WS(is, 1)]));
+			      Th = VSUB(Tf, Tg);
+			      Tbo = VADD(Tf, Tg);
+			 }
+			 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 Tf1 = VADD(Tbn, Tbo);
+			 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 Tbp = VSUB(Tbn, Tbo);
+		    }
+		    {
+			 V Tl, Tbr, To, Tbq;
+			 {
+			      V Tj, Tk, Tm, Tn;
+			      Tj = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			      Tk = LD(&(xi[WS(is, 89)]), ivs, &(xi[WS(is, 1)]));
+			      Tl = VSUB(Tj, Tk);
+			      Tbr = VADD(Tj, Tk);
+			      Tm = LD(&(xi[WS(is, 121)]), ivs, &(xi[WS(is, 1)]));
+			      Tn = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			      To = VSUB(Tm, Tn);
+			      Tbq = VADD(Tm, Tn);
+			 }
+			 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
+			 Tf2 = VADD(Tbq, Tbr);
+			 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 Tbs = VSUB(Tbq, Tbr);
+		    }
+		    Tf3 = VSUB(Tf1, Tf2);
+		    Tf7 = VSUB(Tf5, Tf6);
+		    {
+			 V Tb, Tq, Tgj, Tgk;
+			 Tb = VSUB(T3, Ta);
+			 Tq = VSUB(Ti, Tp);
+			 Tr = VSUB(Tb, Tq);
+			 T5X = VADD(Tb, Tq);
+			 Tgj = VADD(Tf5, Tf6);
+			 Tgk = VADD(Tf1, Tf2);
+			 Tgl = VSUB(Tgj, Tgk);
+			 Th3 = VADD(Tgj, Tgk);
+		    }
+		    {
+			 V T14, T19, T78, T79;
+			 T14 = VSUB(T12, T13);
+			 T19 = VSUB(T15, T18);
+			 T1a = VSUB(T14, T19);
+			 T5V = VADD(T19, T14);
+			 T78 = VADD(Ti, Tp);
+			 T79 = VADD(T18, T15);
+			 T7a = VSUB(T78, T79);
+			 T95 = VADD(T79, T78);
+		    }
+		    TbB = VSUB(Tbz, TbA);
+		    TbC = VMUL(LDK(KP707106781), VADD(Tbp, Tbs));
+		    TbD = VSUB(TbB, TbC);
+		    Tdf = VADD(TbB, TbC);
+		    {
+			 V T6X, T6Y, Tbt, Tbw;
+			 T6X = VADD(T3, Ta);
+			 T6Y = VADD(T12, T13);
+			 T6Z = VSUB(T6X, T6Y);
+			 T97 = VADD(T6X, T6Y);
+			 Tbt = VMUL(LDK(KP707106781), VSUB(Tbp, Tbs));
+			 Tbw = VSUB(Tbu, Tbv);
+			 Tbx = VSUB(Tbt, Tbw);
+			 Tdi = VADD(Tbw, Tbt);
+		    }
+	       }
+	       {
+		    V TaK, TaJ, T2U, TeE, T2Z, TaF, TaG, T2R, TeF, T30, T2C, TeH, T32, TaA, T2J;
+		    V TeI, T33, TaD, T2N, T2Q, TaL, TaM;
+		    {
+			 V T2S, T2T, T2X, T2Y;
+			 T2S = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 T2T = LD(&(xi[WS(is, 98)]), ivs, &(xi[0]));
+			 TaK = VADD(T2S, T2T);
+			 T2X = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2Y = LD(&(xi[WS(is, 66)]), ivs, &(xi[0]));
+			 TaJ = VADD(T2X, T2Y);
+			 T2U = VSUB(T2S, T2T);
+			 TeE = VADD(TaJ, TaK);
+			 T2Z = VSUB(T2X, T2Y);
+		    }
+		    {
+			 V T2L, T2M, T2O, T2P;
+			 T2L = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 T2M = LD(&(xi[WS(is, 82)]), ivs, &(xi[0]));
+			 T2N = VSUB(T2L, T2M);
+			 TaF = VADD(T2L, T2M);
+			 T2O = LD(&(xi[WS(is, 114)]), ivs, &(xi[0]));
+			 T2P = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 T2Q = VSUB(T2O, T2P);
+			 TaG = VADD(T2O, T2P);
+		    }
+		    T2R = VMUL(LDK(KP707106781), VSUB(T2N, T2Q));
+		    TeF = VADD(TaF, TaG);
+		    T30 = VMUL(LDK(KP707106781), VADD(T2N, T2Q));
+		    {
+			 V T2y, Tay, T2B, Taz;
+			 {
+			      V T2w, T2x, T2z, T2A;
+			      T2w = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      T2x = LD(&(xi[WS(is, 74)]), ivs, &(xi[0]));
+			      T2y = VSUB(T2w, T2x);
+			      Tay = VADD(T2w, T2x);
+			      T2z = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      T2A = LD(&(xi[WS(is, 106)]), ivs, &(xi[0]));
+			      T2B = VSUB(T2z, T2A);
+			      Taz = VADD(T2z, T2A);
+			 }
+			 T2C = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2y));
+			 TeH = VADD(Tay, Taz);
+			 T32 = VFMA(LDK(KP382683432), T2y, VMUL(LDK(KP923879532), T2B));
+			 TaA = VSUB(Tay, Taz);
+		    }
+		    {
+			 V T2F, TaB, T2I, TaC;
+			 {
+			      V T2D, T2E, T2G, T2H;
+			      T2D = LD(&(xi[WS(is, 122)]), ivs, &(xi[0]));
+			      T2E = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      T2F = VSUB(T2D, T2E);
+			      TaB = VADD(T2D, T2E);
+			      T2G = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      T2H = LD(&(xi[WS(is, 90)]), ivs, &(xi[0]));
+			      T2I = VSUB(T2G, T2H);
+			      TaC = VADD(T2G, T2H);
+			 }
+			 T2J = VFMA(LDK(KP923879532), T2F, VMUL(LDK(KP382683432), T2I));
+			 TeI = VADD(TaB, TaC);
+			 T33 = VFNMS(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2I));
+			 TaD = VSUB(TaB, TaC);
+		    }
+		    Tgy = VADD(TeE, TeF);
+		    Tgz = VADD(TeH, TeI);
+		    TgA = VSUB(Tgy, Tgz);
+		    TaL = VSUB(TaJ, TaK);
+		    TaM = VMUL(LDK(KP707106781), VADD(TaA, TaD));
+		    TaN = VSUB(TaL, TaM);
+		    Tdv = VADD(TaL, TaM);
+		    {
+			 V TeG, TeJ, T2K, T2V;
+			 TeG = VSUB(TeE, TeF);
+			 TeJ = VSUB(TeH, TeI);
+			 TeK = VFMA(LDK(KP382683432), TeG, VMUL(LDK(KP923879532), TeJ));
+			 Tfu = VFNMS(LDK(KP382683432), TeJ, VMUL(LDK(KP923879532), TeG));
+			 T2K = VSUB(T2C, T2J);
+			 T2V = VSUB(T2R, T2U);
+			 T2W = VSUB(T2K, T2V);
+			 T5M = VADD(T2V, T2K);
+		    }
+		    {
+			 V T31, T34, T7D, T7E;
+			 T31 = VSUB(T2Z, T30);
+			 T34 = VSUB(T32, T33);
+			 T35 = VSUB(T31, T34);
+			 T5N = VADD(T31, T34);
+			 T7D = VADD(T32, T33);
+			 T7E = VADD(T2U, T2R);
+			 T7F = VSUB(T7D, T7E);
+			 T8X = VADD(T7E, T7D);
+		    }
+		    {
+			 V TaE, TaH, T7A, T7B;
+			 TaE = VMUL(LDK(KP707106781), VSUB(TaA, TaD));
+			 TaH = VSUB(TaF, TaG);
+			 TaI = VSUB(TaE, TaH);
+			 Tdu = VADD(TaH, TaE);
+			 T7A = VADD(T2Z, T30);
+			 T7B = VADD(T2C, T2J);
+			 T7C = VSUB(T7A, T7B);
+			 T8W = VADD(T7A, T7B);
+		    }
+	       }
+	       {
+		    V Tb1, Tb0, T3v, TeO, T3A, TaW, TaX, T3s, TeP, T3B, T3d, TeL, T3D, TaR, T3k;
+		    V TeM, T3E, TaU, T3o, T3r, Tb2, Tb3;
+		    {
+			 V T3t, T3u, T3y, T3z;
+			 T3t = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 T3u = LD(&(xi[WS(is, 94)]), ivs, &(xi[0]));
+			 Tb1 = VADD(T3t, T3u);
+			 T3y = LD(&(xi[WS(is, 126)]), ivs, &(xi[0]));
+			 T3z = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 Tb0 = VADD(T3y, T3z);
+			 T3v = VSUB(T3t, T3u);
+			 TeO = VADD(Tb0, Tb1);
+			 T3A = VSUB(T3y, T3z);
+		    }
+		    {
+			 V T3m, T3n, T3p, T3q;
+			 T3m = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T3n = LD(&(xi[WS(is, 78)]), ivs, &(xi[0]));
+			 T3o = VSUB(T3m, T3n);
+			 TaW = VADD(T3m, T3n);
+			 T3p = LD(&(xi[WS(is, 110)]), ivs, &(xi[0]));
+			 T3q = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 T3r = VSUB(T3p, T3q);
+			 TaX = VADD(T3p, T3q);
+		    }
+		    T3s = VMUL(LDK(KP707106781), VSUB(T3o, T3r));
+		    TeP = VADD(TaW, TaX);
+		    T3B = VMUL(LDK(KP707106781), VADD(T3o, T3r));
+		    {
+			 V T39, TaP, T3c, TaQ;
+			 {
+			      V T37, T38, T3a, T3b;
+			      T37 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      T38 = LD(&(xi[WS(is, 70)]), ivs, &(xi[0]));
+			      T39 = VSUB(T37, T38);
+			      TaP = VADD(T37, T38);
+			      T3a = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      T3b = LD(&(xi[WS(is, 102)]), ivs, &(xi[0]));
+			      T3c = VSUB(T3a, T3b);
+			      TaQ = VADD(T3a, T3b);
+			 }
+			 T3d = VFNMS(LDK(KP382683432), T3c, VMUL(LDK(KP923879532), T39));
+			 TeL = VADD(TaP, TaQ);
+			 T3D = VFMA(LDK(KP382683432), T39, VMUL(LDK(KP923879532), T3c));
+			 TaR = VSUB(TaP, TaQ);
+		    }
+		    {
+			 V T3g, TaS, T3j, TaT;
+			 {
+			      V T3e, T3f, T3h, T3i;
+			      T3e = LD(&(xi[WS(is, 118)]), ivs, &(xi[0]));
+			      T3f = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      T3g = VSUB(T3e, T3f);
+			      TaS = VADD(T3e, T3f);
+			      T3h = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      T3i = LD(&(xi[WS(is, 86)]), ivs, &(xi[0]));
+			      T3j = VSUB(T3h, T3i);
+			      TaT = VADD(T3h, T3i);
+			 }
+			 T3k = VFMA(LDK(KP923879532), T3g, VMUL(LDK(KP382683432), T3j));
+			 TeM = VADD(TaS, TaT);
+			 T3E = VFNMS(LDK(KP382683432), T3g, VMUL(LDK(KP923879532), T3j));
+			 TaU = VSUB(TaS, TaT);
+		    }
+		    TgB = VADD(TeO, TeP);
+		    TgC = VADD(TeL, TeM);
+		    TgD = VSUB(TgB, TgC);
+		    Tb2 = VSUB(Tb0, Tb1);
+		    Tb3 = VMUL(LDK(KP707106781), VADD(TaR, TaU));
+		    Tb4 = VSUB(Tb2, Tb3);
+		    Tdy = VADD(Tb2, Tb3);
+		    {
+			 V TeN, TeQ, T3l, T3w;
+			 TeN = VSUB(TeL, TeM);
+			 TeQ = VSUB(TeO, TeP);
+			 TeR = VFNMS(LDK(KP382683432), TeQ, VMUL(LDK(KP923879532), TeN));
+			 Tfv = VFMA(LDK(KP923879532), TeQ, VMUL(LDK(KP382683432), TeN));
+			 T3l = VSUB(T3d, T3k);
+			 T3w = VSUB(T3s, T3v);
+			 T3x = VSUB(T3l, T3w);
+			 T5P = VADD(T3w, T3l);
+		    }
+		    {
+			 V T3C, T3F, T7K, T7L;
+			 T3C = VSUB(T3A, T3B);
+			 T3F = VSUB(T3D, T3E);
+			 T3G = VSUB(T3C, T3F);
+			 T5Q = VADD(T3C, T3F);
+			 T7K = VADD(T3A, T3B);
+			 T7L = VADD(T3d, T3k);
+			 T7M = VSUB(T7K, T7L);
+			 T90 = VADD(T7K, T7L);
+		    }
+		    {
+			 V TaV, TaY, T7H, T7I;
+			 TaV = VMUL(LDK(KP707106781), VSUB(TaR, TaU));
+			 TaY = VSUB(TaW, TaX);
+			 TaZ = VSUB(TaV, TaY);
+			 Tdx = VADD(TaY, TaV);
+			 T7H = VADD(T3D, T3E);
+			 T7I = VADD(T3v, T3s);
+			 T7J = VSUB(T7H, T7I);
+			 T8Z = VADD(T7I, T7H);
+		    }
+	       }
+	       {
+		    V TB, TeU, TF, Tba, TS, TeX, TW, Tbh, Ty, TeV, TG, Tbd, TP, TeY, TX;
+		    V Tbk;
+		    {
+			 V Tz, TA, Tb9, TD, TE, Tb8;
+			 Tz = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			 TA = LD(&(xi[WS(is, 101)]), ivs, &(xi[WS(is, 1)]));
+			 Tb9 = VADD(Tz, TA);
+			 TD = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 TE = LD(&(xi[WS(is, 69)]), ivs, &(xi[WS(is, 1)]));
+			 Tb8 = VADD(TD, TE);
+			 TB = VSUB(Tz, TA);
+			 TeU = VADD(Tb8, Tb9);
+			 TF = VSUB(TD, TE);
+			 Tba = VSUB(Tb8, Tb9);
+		    }
+		    {
+			 V TQ, TR, Tbg, TU, TV, Tbf;
+			 TQ = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 TR = LD(&(xi[WS(is, 93)]), ivs, &(xi[WS(is, 1)]));
+			 Tbg = VADD(TQ, TR);
+			 TU = LD(&(xi[WS(is, 125)]), ivs, &(xi[WS(is, 1)]));
+			 TV = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			 Tbf = VADD(TU, TV);
+			 TS = VSUB(TQ, TR);
+			 TeX = VADD(Tbf, Tbg);
+			 TW = VSUB(TU, TV);
+			 Tbh = VSUB(Tbf, Tbg);
+		    }
+		    {
+			 V Tu, Tbb, Tx, Tbc;
+			 {
+			      V Ts, Tt, Tv, Tw;
+			      Ts = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      Tt = LD(&(xi[WS(is, 85)]), ivs, &(xi[WS(is, 1)]));
+			      Tu = VSUB(Ts, Tt);
+			      Tbb = VADD(Ts, Tt);
+			      Tv = LD(&(xi[WS(is, 117)]), ivs, &(xi[WS(is, 1)]));
+			      Tw = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      Tx = VSUB(Tv, Tw);
+			      Tbc = VADD(Tv, Tw);
+			 }
+			 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
+			 TeV = VADD(Tbb, Tbc);
+			 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
+			 Tbd = VSUB(Tbb, Tbc);
+		    }
+		    {
+			 V TL, Tbi, TO, Tbj;
+			 {
+			      V TJ, TK, TM, TN;
+			      TJ = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      TK = LD(&(xi[WS(is, 77)]), ivs, &(xi[WS(is, 1)]));
+			      TL = VSUB(TJ, TK);
+			      Tbi = VADD(TJ, TK);
+			      TM = LD(&(xi[WS(is, 109)]), ivs, &(xi[WS(is, 1)]));
+			      TN = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      TO = VSUB(TM, TN);
+			      Tbj = VADD(TM, TN);
+			 }
+			 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			 TeY = VADD(Tbi, Tbj);
+			 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
+			 Tbk = VSUB(Tbi, Tbj);
+		    }
+		    {
+			 V Tbe, Tbl, TeW, TeZ;
+			 Tbe = VFNMS(LDK(KP382683432), Tbd, VMUL(LDK(KP923879532), Tba));
+			 Tbl = VFMA(LDK(KP923879532), Tbh, VMUL(LDK(KP382683432), Tbk));
+			 Tbm = VSUB(Tbe, Tbl);
+			 Tdg = VADD(Tbe, Tbl);
+			 {
+			      V TbE, TbF, Tgm, Tgn;
+			      TbE = VFMA(LDK(KP382683432), Tba, VMUL(LDK(KP923879532), Tbd));
+			      TbF = VFNMS(LDK(KP382683432), Tbh, VMUL(LDK(KP923879532), Tbk));
+			      TbG = VSUB(TbE, TbF);
+			      Tdj = VADD(TbE, TbF);
+			      Tgm = VADD(TeU, TeV);
+			      Tgn = VADD(TeX, TeY);
+			      Tgo = VSUB(Tgm, Tgn);
+			      Th4 = VADD(Tgm, Tgn);
+			 }
+			 TeW = VSUB(TeU, TeV);
+			 TeZ = VSUB(TeX, TeY);
+			 Tf0 = VMUL(LDK(KP707106781), VSUB(TeW, TeZ));
+			 Tf8 = VMUL(LDK(KP707106781), VADD(TeW, TeZ));
+			 {
+			      V T72, T7b, T75, T7c;
+			      {
+				   V T70, T71, T73, T74;
+				   T70 = VADD(TB, Ty);
+				   T71 = VADD(TF, TG);
+				   T72 = VFMA(LDK(KP980785280), T70, VMUL(LDK(KP195090322), T71));
+				   T7b = VFNMS(LDK(KP195090322), T70, VMUL(LDK(KP980785280), T71));
+				   T73 = VADD(TS, TP);
+				   T74 = VADD(TW, TX);
+				   T75 = VFNMS(LDK(KP195090322), T74, VMUL(LDK(KP980785280), T73));
+				   T7c = VFMA(LDK(KP195090322), T73, VMUL(LDK(KP980785280), T74));
+			      }
+			      T76 = VSUB(T72, T75);
+			      T98 = VADD(T7b, T7c);
+			      T7d = VSUB(T7b, T7c);
+			      T94 = VADD(T72, T75);
+			 }
+			 {
+			      V TI, T1b, TZ, T1c;
+			      {
+				   V TC, TH, TT, TY;
+				   TC = VSUB(Ty, TB);
+				   TH = VSUB(TF, TG);
+				   TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
+				   T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
+				   TT = VSUB(TP, TS);
+				   TY = VSUB(TW, TX);
+				   TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
+				   T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
+			      }
+			      T10 = VSUB(TI, TZ);
+			      T5Y = VADD(T1b, T1c);
+			      T1d = VSUB(T1b, T1c);
+			      T5U = VADD(TI, TZ);
+			 }
+		    }
+	       }
+	       {
+		    V T1Q, Tfb, T1U, TbL, T27, Tfe, T2b, TbS, T1N, Tfc, T1V, TbO, T24, Tff, T2c;
+		    V TbV;
+		    {
+			 V T1O, T1P, TbK, T1S, T1T, TbJ;
+			 T1O = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			 T1P = LD(&(xi[WS(is, 99)]), ivs, &(xi[WS(is, 1)]));
+			 TbK = VADD(T1O, T1P);
+			 T1S = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T1T = LD(&(xi[WS(is, 67)]), ivs, &(xi[WS(is, 1)]));
+			 TbJ = VADD(T1S, T1T);
+			 T1Q = VSUB(T1O, T1P);
+			 Tfb = VADD(TbJ, TbK);
+			 T1U = VSUB(T1S, T1T);
+			 TbL = VSUB(TbJ, TbK);
+		    }
+		    {
+			 V T25, T26, TbR, T29, T2a, TbQ;
+			 T25 = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 T26 = LD(&(xi[WS(is, 91)]), ivs, &(xi[WS(is, 1)]));
+			 TbR = VADD(T25, T26);
+			 T29 = LD(&(xi[WS(is, 123)]), ivs, &(xi[WS(is, 1)]));
+			 T2a = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			 TbQ = VADD(T29, T2a);
+			 T27 = VSUB(T25, T26);
+			 Tfe = VADD(TbQ, TbR);
+			 T2b = VSUB(T29, T2a);
+			 TbS = VSUB(TbQ, TbR);
+		    }
+		    {
+			 V T1J, TbM, T1M, TbN;
+			 {
+			      V T1H, T1I, T1K, T1L;
+			      T1H = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T1I = LD(&(xi[WS(is, 83)]), ivs, &(xi[WS(is, 1)]));
+			      T1J = VSUB(T1H, T1I);
+			      TbM = VADD(T1H, T1I);
+			      T1K = LD(&(xi[WS(is, 115)]), ivs, &(xi[WS(is, 1)]));
+			      T1L = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T1M = VSUB(T1K, T1L);
+			      TbN = VADD(T1K, T1L);
+			 }
+			 T1N = VMUL(LDK(KP707106781), VSUB(T1J, T1M));
+			 Tfc = VADD(TbM, TbN);
+			 T1V = VMUL(LDK(KP707106781), VADD(T1J, T1M));
+			 TbO = VSUB(TbM, TbN);
+		    }
+		    {
+			 V T20, TbT, T23, TbU;
+			 {
+			      V T1Y, T1Z, T21, T22;
+			      T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T1Z = LD(&(xi[WS(is, 75)]), ivs, &(xi[WS(is, 1)]));
+			      T20 = VSUB(T1Y, T1Z);
+			      TbT = VADD(T1Y, T1Z);
+			      T21 = LD(&(xi[WS(is, 107)]), ivs, &(xi[WS(is, 1)]));
+			      T22 = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T23 = VSUB(T21, T22);
+			      TbU = VADD(T21, T22);
+			 }
+			 T24 = VMUL(LDK(KP707106781), VSUB(T20, T23));
+			 Tff = VADD(TbT, TbU);
+			 T2c = VMUL(LDK(KP707106781), VADD(T20, T23));
+			 TbV = VSUB(TbT, TbU);
+		    }
+		    {
+			 V TbP, TbW, Tfd, Tfg;
+			 TbP = VFNMS(LDK(KP382683432), TbO, VMUL(LDK(KP923879532), TbL));
+			 TbW = VFMA(LDK(KP923879532), TbS, VMUL(LDK(KP382683432), TbV));
+			 TbX = VSUB(TbP, TbW);
+			 Tdn = VADD(TbP, TbW);
+			 {
+			      V Tcf, Tcg, Tgt, Tgu;
+			      Tcf = VFMA(LDK(KP382683432), TbL, VMUL(LDK(KP923879532), TbO));
+			      Tcg = VFNMS(LDK(KP382683432), TbS, VMUL(LDK(KP923879532), TbV));
+			      Tch = VSUB(Tcf, Tcg);
+			      Tdq = VADD(Tcf, Tcg);
+			      Tgt = VADD(Tfb, Tfc);
+			      Tgu = VADD(Tfe, Tff);
+			      Tgv = VSUB(Tgt, Tgu);
+			      Th7 = VADD(Tgt, Tgu);
+			 }
+			 Tfd = VSUB(Tfb, Tfc);
+			 Tfg = VSUB(Tfe, Tff);
+			 Tfh = VMUL(LDK(KP707106781), VSUB(Tfd, Tfg));
+			 Tfp = VMUL(LDK(KP707106781), VADD(Tfd, Tfg));
+			 {
+			      V T7l, T7u, T7o, T7v;
+			      {
+				   V T7j, T7k, T7m, T7n;
+				   T7j = VADD(T1Q, T1N);
+				   T7k = VADD(T1U, T1V);
+				   T7l = VFMA(LDK(KP980785280), T7j, VMUL(LDK(KP195090322), T7k));
+				   T7u = VFNMS(LDK(KP195090322), T7j, VMUL(LDK(KP980785280), T7k));
+				   T7m = VADD(T27, T24);
+				   T7n = VADD(T2b, T2c);
+				   T7o = VFNMS(LDK(KP195090322), T7n, VMUL(LDK(KP980785280), T7m));
+				   T7v = VFMA(LDK(KP195090322), T7m, VMUL(LDK(KP980785280), T7n));
+			      }
+			      T7p = VSUB(T7l, T7o);
+			      T9f = VADD(T7u, T7v);
+			      T7w = VSUB(T7u, T7v);
+			      T9b = VADD(T7l, T7o);
+			 }
+			 {
+			      V T1X, T2q, T2e, T2r;
+			      {
+				   V T1R, T1W, T28, T2d;
+				   T1R = VSUB(T1N, T1Q);
+				   T1W = VSUB(T1U, T1V);
+				   T1X = VFMA(LDK(KP831469612), T1R, VMUL(LDK(KP555570233), T1W));
+				   T2q = VFNMS(LDK(KP555570233), T1R, VMUL(LDK(KP831469612), T1W));
+				   T28 = VSUB(T24, T27);
+				   T2d = VSUB(T2b, T2c);
+				   T2e = VFNMS(LDK(KP555570233), T2d, VMUL(LDK(KP831469612), T28));
+				   T2r = VFMA(LDK(KP555570233), T28, VMUL(LDK(KP831469612), T2d));
+			      }
+			      T2f = VSUB(T1X, T2e);
+			      T65 = VADD(T2q, T2r);
+			      T2s = VSUB(T2q, T2r);
+			      T61 = VADD(T1X, T2e);
+			 }
+		    }
+	       }
+	       {
+		    V Tgx, TgW, TgR, TgZ, TgI, TgY, TgO, TgV;
+		    {
+			 V Tgp, Tgw, TgP, TgQ;
+			 Tgp = VFNMS(LDK(KP382683432), Tgo, VMUL(LDK(KP923879532), Tgl));
+			 Tgw = VFMA(LDK(KP923879532), Tgs, VMUL(LDK(KP382683432), Tgv));
+			 Tgx = VSUB(Tgp, Tgw);
+			 TgW = VADD(Tgp, Tgw);
+			 TgP = VFMA(LDK(KP382683432), Tgl, VMUL(LDK(KP923879532), Tgo));
+			 TgQ = VFNMS(LDK(KP382683432), Tgs, VMUL(LDK(KP923879532), Tgv));
+			 TgR = VSUB(TgP, TgQ);
+			 TgZ = VADD(TgP, TgQ);
+		    }
+		    {
+			 V TgE, TgH, TgM, TgN;
+			 TgE = VMUL(LDK(KP707106781), VSUB(TgA, TgD));
+			 TgH = VSUB(TgF, TgG);
+			 TgI = VSUB(TgE, TgH);
+			 TgY = VADD(TgH, TgE);
+			 TgM = VSUB(TgK, TgL);
+			 TgN = VMUL(LDK(KP707106781), VADD(TgA, TgD));
+			 TgO = VSUB(TgM, TgN);
+			 TgV = VADD(TgM, TgN);
+		    }
+		    {
+			 V TgJ, TgS, Th1, Th2;
+			 TgJ = VBYI(VSUB(Tgx, TgI));
+			 TgS = VSUB(TgO, TgR);
+			 ST(&(xo[WS(os, 40)]), VADD(TgJ, TgS), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 88)]), VSUB(TgS, TgJ), ovs, &(xo[0]));
+			 Th1 = VSUB(TgV, TgW);
+			 Th2 = VBYI(VSUB(TgZ, TgY));
+			 ST(&(xo[WS(os, 72)]), VSUB(Th1, Th2), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 56)]), VADD(Th1, Th2), ovs, &(xo[0]));
+		    }
+		    {
+			 V TgT, TgU, TgX, Th0;
+			 TgT = VBYI(VADD(TgI, Tgx));
+			 TgU = VADD(TgO, TgR);
+			 ST(&(xo[WS(os, 24)]), VADD(TgT, TgU), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 104)]), VSUB(TgU, TgT), ovs, &(xo[0]));
+			 TgX = VADD(TgV, TgW);
+			 Th0 = VBYI(VADD(TgY, TgZ));
+			 ST(&(xo[WS(os, 120)]), VSUB(TgX, Th0), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 8)]), VADD(TgX, Th0), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V Th9, Thh, Thq, Ths, Thc, Thm, Thg, Thl, Thn, Thr;
+		    {
+			 V Th5, Th8, Tho, Thp;
+			 Th5 = VSUB(Th3, Th4);
+			 Th8 = VSUB(Th6, Th7);
+			 Th9 = VMUL(LDK(KP707106781), VSUB(Th5, Th8));
+			 Thh = VMUL(LDK(KP707106781), VADD(Th5, Th8));
+			 Tho = VADD(Th3, Th4);
+			 Thp = VADD(Th6, Th7);
+			 Thq = VBYI(VSUB(Tho, Thp));
+			 Ths = VADD(Tho, Thp);
+		    }
+		    {
+			 V Tha, Thb, The, Thf;
+			 Tha = VADD(Tgy, Tgz);
+			 Thb = VADD(TgB, TgC);
+			 Thc = VSUB(Tha, Thb);
+			 Thm = VADD(Tha, Thb);
+			 The = VADD(TgK, TgL);
+			 Thf = VADD(TgF, TgG);
+			 Thg = VSUB(The, Thf);
+			 Thl = VADD(The, Thf);
+		    }
+		    Thn = VSUB(Thl, Thm);
+		    ST(&(xo[WS(os, 96)]), VSUB(Thn, Thq), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 32)]), VADD(Thn, Thq), ovs, &(xo[0]));
+		    Thr = VADD(Thl, Thm);
+		    ST(&(xo[WS(os, 64)]), VSUB(Thr, Ths), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(Thr, Ths), ovs, &(xo[0]));
+		    {
+			 V Thd, Thi, Thj, Thk;
+			 Thd = VBYI(VSUB(Th9, Thc));
+			 Thi = VSUB(Thg, Thh);
+			 ST(&(xo[WS(os, 48)]), VADD(Thd, Thi), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 80)]), VSUB(Thi, Thd), ovs, &(xo[0]));
+			 Thj = VBYI(VADD(Thc, Th9));
+			 Thk = VADD(Thg, Thh);
+			 ST(&(xo[WS(os, 16)]), VADD(Thj, Thk), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 112)]), VSUB(Thk, Thj), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TeT, TfM, TfC, TfK, Tfs, TfN, TfF, TfJ;
+		    {
+			 V TeD, TeS, Tfw, TfB;
+			 TeD = VSUB(Tev, TeC);
+			 TeS = VSUB(TeK, TeR);
+			 TeT = VSUB(TeD, TeS);
+			 TfM = VADD(TeD, TeS);
+			 Tfw = VSUB(Tfu, Tfv);
+			 TfB = VSUB(Tfx, TfA);
+			 TfC = VSUB(Tfw, TfB);
+			 TfK = VADD(TfB, Tfw);
+			 {
+			      V Tfa, TfD, Tfr, TfE;
+			      {
+				   V Tf4, Tf9, Tfl, Tfq;
+				   Tf4 = VSUB(Tf0, Tf3);
+				   Tf9 = VSUB(Tf7, Tf8);
+				   Tfa = VFMA(LDK(KP831469612), Tf4, VMUL(LDK(KP555570233), Tf9));
+				   TfD = VFNMS(LDK(KP555570233), Tf4, VMUL(LDK(KP831469612), Tf9));
+				   Tfl = VSUB(Tfh, Tfk);
+				   Tfq = VSUB(Tfo, Tfp);
+				   Tfr = VFNMS(LDK(KP555570233), Tfq, VMUL(LDK(KP831469612), Tfl));
+				   TfE = VFMA(LDK(KP555570233), Tfl, VMUL(LDK(KP831469612), Tfq));
+			      }
+			      Tfs = VSUB(Tfa, Tfr);
+			      TfN = VADD(TfD, TfE);
+			      TfF = VSUB(TfD, TfE);
+			      TfJ = VADD(Tfa, Tfr);
+			 }
+		    }
+		    {
+			 V Tft, TfG, TfP, TfQ;
+			 Tft = VADD(TeT, Tfs);
+			 TfG = VBYI(VADD(TfC, TfF));
+			 ST(&(xo[WS(os, 108)]), VSUB(Tft, TfG), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 20)]), VADD(Tft, TfG), ovs, &(xo[0]));
+			 TfP = VBYI(VADD(TfK, TfJ));
+			 TfQ = VADD(TfM, TfN);
+			 ST(&(xo[WS(os, 12)]), VADD(TfP, TfQ), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 116)]), VSUB(TfQ, TfP), ovs, &(xo[0]));
+		    }
+		    {
+			 V TfH, TfI, TfL, TfO;
+			 TfH = VSUB(TeT, Tfs);
+			 TfI = VBYI(VSUB(TfF, TfC));
+			 ST(&(xo[WS(os, 84)]), VSUB(TfH, TfI), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 44)]), VADD(TfH, TfI), ovs, &(xo[0]));
+			 TfL = VBYI(VSUB(TfJ, TfK));
+			 TfO = VSUB(TfM, TfN);
+			 ST(&(xo[WS(os, 52)]), VADD(TfL, TfO), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 76)]), VSUB(TfO, TfL), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TfT, Tge, Tg4, Tgc, Tg0, Tgf, Tg7, Tgb;
+		    {
+			 V TfR, TfS, Tg2, Tg3;
+			 TfR = VADD(Tev, TeC);
+			 TfS = VADD(Tfu, Tfv);
+			 TfT = VSUB(TfR, TfS);
+			 Tge = VADD(TfR, TfS);
+			 Tg2 = VADD(TeK, TeR);
+			 Tg3 = VADD(TfA, Tfx);
+			 Tg4 = VSUB(Tg2, Tg3);
+			 Tgc = VADD(Tg3, Tg2);
+			 {
+			      V TfW, Tg5, TfZ, Tg6;
+			      {
+				   V TfU, TfV, TfX, TfY;
+				   TfU = VADD(Tf3, Tf0);
+				   TfV = VADD(Tf7, Tf8);
+				   TfW = VFMA(LDK(KP980785280), TfU, VMUL(LDK(KP195090322), TfV));
+				   Tg5 = VFNMS(LDK(KP195090322), TfU, VMUL(LDK(KP980785280), TfV));
+				   TfX = VADD(Tfk, Tfh);
+				   TfY = VADD(Tfo, Tfp);
+				   TfZ = VFNMS(LDK(KP195090322), TfY, VMUL(LDK(KP980785280), TfX));
+				   Tg6 = VFMA(LDK(KP195090322), TfX, VMUL(LDK(KP980785280), TfY));
+			      }
+			      Tg0 = VSUB(TfW, TfZ);
+			      Tgf = VADD(Tg5, Tg6);
+			      Tg7 = VSUB(Tg5, Tg6);
+			      Tgb = VADD(TfW, TfZ);
+			 }
+		    }
+		    {
+			 V Tg1, Tg8, Tgh, Tgi;
+			 Tg1 = VADD(TfT, Tg0);
+			 Tg8 = VBYI(VADD(Tg4, Tg7));
+			 ST(&(xo[WS(os, 100)]), VSUB(Tg1, Tg8), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 28)]), VADD(Tg1, Tg8), ovs, &(xo[0]));
+			 Tgh = VBYI(VADD(Tgc, Tgb));
+			 Tgi = VADD(Tge, Tgf);
+			 ST(&(xo[WS(os, 4)]), VADD(Tgh, Tgi), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 124)]), VSUB(Tgi, Tgh), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tg9, Tga, Tgd, Tgg;
+			 Tg9 = VSUB(TfT, Tg0);
+			 Tga = VBYI(VSUB(Tg7, Tg4));
+			 ST(&(xo[WS(os, 92)]), VSUB(Tg9, Tga), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 36)]), VADD(Tg9, Tga), ovs, &(xo[0]));
+			 Tgd = VBYI(VSUB(Tgb, Tgc));
+			 Tgg = VSUB(Tge, Tgf);
+			 ST(&(xo[WS(os, 60)]), VADD(Tgd, Tgg), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 68)]), VSUB(Tgg, Tgd), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V Tb7, Td8, TcI, Td0, Tcy, Tda, TcG, TcP, Tck, TcJ, TcB, TcF, TcW, Tdb, Td3;
+		    V Td7;
+		    {
+			 V Tax, TcZ, Tb6, TcY, TaO, Tb5;
+			 Tax = VSUB(Tah, Taw);
+			 TcZ = VADD(Tcw, Tcr);
+			 TaO = VFMA(LDK(KP831469612), TaI, VMUL(LDK(KP555570233), TaN));
+			 Tb5 = VFNMS(LDK(KP555570233), Tb4, VMUL(LDK(KP831469612), TaZ));
+			 Tb6 = VSUB(TaO, Tb5);
+			 TcY = VADD(TaO, Tb5);
+			 Tb7 = VSUB(Tax, Tb6);
+			 Td8 = VADD(TcZ, TcY);
+			 TcI = VADD(Tax, Tb6);
+			 Td0 = VSUB(TcY, TcZ);
+		    }
+		    {
+			 V Tcx, TcN, Tco, TcO, Tcm, Tcn;
+			 Tcx = VSUB(Tcr, Tcw);
+			 TcN = VADD(Tah, Taw);
+			 Tcm = VFNMS(LDK(KP555570233), TaI, VMUL(LDK(KP831469612), TaN));
+			 Tcn = VFMA(LDK(KP555570233), TaZ, VMUL(LDK(KP831469612), Tb4));
+			 Tco = VSUB(Tcm, Tcn);
+			 TcO = VADD(Tcm, Tcn);
+			 Tcy = VSUB(Tco, Tcx);
+			 Tda = VADD(TcN, TcO);
+			 TcG = VADD(Tcx, Tco);
+			 TcP = VSUB(TcN, TcO);
+		    }
+		    {
+			 V TbI, Tcz, Tcj, TcA;
+			 {
+			      V Tby, TbH, Tc9, Tci;
+			      Tby = VSUB(Tbm, Tbx);
+			      TbH = VSUB(TbD, TbG);
+			      TbI = VFMA(LDK(KP881921264), Tby, VMUL(LDK(KP471396736), TbH));
+			      Tcz = VFNMS(LDK(KP471396736), Tby, VMUL(LDK(KP881921264), TbH));
+			      Tc9 = VSUB(TbX, Tc8);
+			      Tci = VSUB(Tce, Tch);
+			      Tcj = VFNMS(LDK(KP471396736), Tci, VMUL(LDK(KP881921264), Tc9));
+			      TcA = VFMA(LDK(KP471396736), Tc9, VMUL(LDK(KP881921264), Tci));
+			 }
+			 Tck = VSUB(TbI, Tcj);
+			 TcJ = VADD(Tcz, TcA);
+			 TcB = VSUB(Tcz, TcA);
+			 TcF = VADD(TbI, Tcj);
+		    }
+		    {
+			 V TcS, Td1, TcV, Td2;
+			 {
+			      V TcQ, TcR, TcT, TcU;
+			      TcQ = VADD(Tbx, Tbm);
+			      TcR = VADD(TbD, TbG);
+			      TcS = VFMA(LDK(KP956940335), TcQ, VMUL(LDK(KP290284677), TcR));
+			      Td1 = VFNMS(LDK(KP290284677), TcQ, VMUL(LDK(KP956940335), TcR));
+			      TcT = VADD(Tc8, TbX);
+			      TcU = VADD(Tce, Tch);
+			      TcV = VFNMS(LDK(KP290284677), TcU, VMUL(LDK(KP956940335), TcT));
+			      Td2 = VFMA(LDK(KP290284677), TcT, VMUL(LDK(KP956940335), TcU));
+			 }
+			 TcW = VSUB(TcS, TcV);
+			 Tdb = VADD(Td1, Td2);
+			 Td3 = VSUB(Td1, Td2);
+			 Td7 = VADD(TcS, TcV);
+		    }
+		    {
+			 V Tcl, TcC, Td9, Tdc;
+			 Tcl = VADD(Tb7, Tck);
+			 TcC = VBYI(VADD(Tcy, TcB));
+			 ST(&(xo[WS(os, 106)]), VSUB(Tcl, TcC), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 22)]), VADD(Tcl, TcC), ovs, &(xo[0]));
+			 Td9 = VBYI(VSUB(Td7, Td8));
+			 Tdc = VSUB(Tda, Tdb);
+			 ST(&(xo[WS(os, 58)]), VADD(Td9, Tdc), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 70)]), VSUB(Tdc, Td9), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tdd, Tde, TcD, TcE;
+			 Tdd = VBYI(VADD(Td8, Td7));
+			 Tde = VADD(Tda, Tdb);
+			 ST(&(xo[WS(os, 6)]), VADD(Tdd, Tde), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 122)]), VSUB(Tde, Tdd), ovs, &(xo[0]));
+			 TcD = VSUB(Tb7, Tck);
+			 TcE = VBYI(VSUB(TcB, Tcy));
+			 ST(&(xo[WS(os, 86)]), VSUB(TcD, TcE), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 42)]), VADD(TcD, TcE), ovs, &(xo[0]));
+		    }
+		    {
+			 V TcH, TcK, TcX, Td4;
+			 TcH = VBYI(VSUB(TcF, TcG));
+			 TcK = VSUB(TcI, TcJ);
+			 ST(&(xo[WS(os, 54)]), VADD(TcH, TcK), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 74)]), VSUB(TcK, TcH), ovs, &(xo[0]));
+			 TcX = VADD(TcP, TcW);
+			 Td4 = VBYI(VADD(Td0, Td3));
+			 ST(&(xo[WS(os, 102)]), VSUB(TcX, Td4), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 26)]), VADD(TcX, Td4), ovs, &(xo[0]));
+		    }
+		    {
+			 V Td5, Td6, TcL, TcM;
+			 Td5 = VSUB(TcP, TcW);
+			 Td6 = VBYI(VSUB(Td3, Td0));
+			 ST(&(xo[WS(os, 90)]), VSUB(Td5, Td6), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 38)]), VADD(Td5, Td6), ovs, &(xo[0]));
+			 TcL = VBYI(VADD(TcG, TcF));
+			 TcM = VADD(TcI, TcJ);
+			 ST(&(xo[WS(os, 10)]), VADD(TcL, TcM), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 118)]), VSUB(TcM, TcL), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TdE, Tel, TdW, Tee, TdM, Teo, TdT, Tea, Tdt, TdX, TdP, TdU, Te7, Tep, Teh;
+		    V Tem;
+		    {
+			 V TdD, Tec, TdA, Ted, Tdw, Tdz;
+			 TdD = VADD(TdB, TdC);
+			 Tec = VSUB(TdG, TdH);
+			 Tdw = VFMA(LDK(KP980785280), Tdu, VMUL(LDK(KP195090322), Tdv));
+			 Tdz = VFNMS(LDK(KP195090322), Tdy, VMUL(LDK(KP980785280), Tdx));
+			 TdA = VADD(Tdw, Tdz);
+			 Ted = VSUB(Tdw, Tdz);
+			 TdE = VSUB(TdA, TdD);
+			 Tel = VADD(Tec, Ted);
+			 TdW = VADD(TdD, TdA);
+			 Tee = VSUB(Tec, Ted);
+		    }
+		    {
+			 V TdI, Te9, TdL, Te8, TdJ, TdK;
+			 TdI = VADD(TdG, TdH);
+			 Te9 = VSUB(TdC, TdB);
+			 TdJ = VFNMS(LDK(KP195090322), Tdu, VMUL(LDK(KP980785280), Tdv));
+			 TdK = VFMA(LDK(KP195090322), Tdx, VMUL(LDK(KP980785280), Tdy));
+			 TdL = VADD(TdJ, TdK);
+			 Te8 = VSUB(TdJ, TdK);
+			 TdM = VSUB(TdI, TdL);
+			 Teo = VADD(Te9, Te8);
+			 TdT = VADD(TdI, TdL);
+			 Tea = VSUB(Te8, Te9);
+		    }
+		    {
+			 V Tdl, TdN, Tds, TdO;
+			 {
+			      V Tdh, Tdk, Tdo, Tdr;
+			      Tdh = VADD(Tdf, Tdg);
+			      Tdk = VADD(Tdi, Tdj);
+			      Tdl = VFNMS(LDK(KP098017140), Tdk, VMUL(LDK(KP995184726), Tdh));
+			      TdN = VFMA(LDK(KP098017140), Tdh, VMUL(LDK(KP995184726), Tdk));
+			      Tdo = VADD(Tdm, Tdn);
+			      Tdr = VADD(Tdp, Tdq);
+			      Tds = VFMA(LDK(KP995184726), Tdo, VMUL(LDK(KP098017140), Tdr));
+			      TdO = VFNMS(LDK(KP098017140), Tdo, VMUL(LDK(KP995184726), Tdr));
+			 }
+			 Tdt = VSUB(Tdl, Tds);
+			 TdX = VADD(TdN, TdO);
+			 TdP = VSUB(TdN, TdO);
+			 TdU = VADD(Tdl, Tds);
+		    }
+		    {
+			 V Te3, Tef, Te6, Teg;
+			 {
+			      V Te1, Te2, Te4, Te5;
+			      Te1 = VSUB(Tdf, Tdg);
+			      Te2 = VSUB(Tdj, Tdi);
+			      Te3 = VFNMS(LDK(KP634393284), Te2, VMUL(LDK(KP773010453), Te1));
+			      Tef = VFMA(LDK(KP634393284), Te1, VMUL(LDK(KP773010453), Te2));
+			      Te4 = VSUB(Tdm, Tdn);
+			      Te5 = VSUB(Tdq, Tdp);
+			      Te6 = VFMA(LDK(KP773010453), Te4, VMUL(LDK(KP634393284), Te5));
+			      Teg = VFNMS(LDK(KP634393284), Te4, VMUL(LDK(KP773010453), Te5));
+			 }
+			 Te7 = VSUB(Te3, Te6);
+			 Tep = VADD(Tef, Teg);
+			 Teh = VSUB(Tef, Teg);
+			 Tem = VADD(Te3, Te6);
+		    }
+		    {
+			 V TdF, TdQ, Ten, Teq;
+			 TdF = VBYI(VSUB(Tdt, TdE));
+			 TdQ = VSUB(TdM, TdP);
+			 ST(&(xo[WS(os, 34)]), VADD(TdF, TdQ), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 94)]), VSUB(TdQ, TdF), ovs, &(xo[0]));
+			 Ten = VADD(Tel, Tem);
+			 Teq = VBYI(VADD(Teo, Tep));
+			 ST(&(xo[WS(os, 114)]), VSUB(Ten, Teq), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 14)]), VADD(Ten, Teq), ovs, &(xo[0]));
+		    }
+		    {
+			 V Ter, Tes, TdR, TdS;
+			 Ter = VSUB(Tel, Tem);
+			 Tes = VBYI(VSUB(Tep, Teo));
+			 ST(&(xo[WS(os, 78)]), VSUB(Ter, Tes), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 50)]), VADD(Ter, Tes), ovs, &(xo[0]));
+			 TdR = VBYI(VADD(TdE, Tdt));
+			 TdS = VADD(TdM, TdP);
+			 ST(&(xo[WS(os, 30)]), VADD(TdR, TdS), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 98)]), VSUB(TdS, TdR), ovs, &(xo[0]));
+		    }
+		    {
+			 V TdV, TdY, Teb, Tei;
+			 TdV = VADD(TdT, TdU);
+			 TdY = VBYI(VADD(TdW, TdX));
+			 ST(&(xo[WS(os, 126)]), VSUB(TdV, TdY), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 2)]), VADD(TdV, TdY), ovs, &(xo[0]));
+			 Teb = VBYI(VSUB(Te7, Tea));
+			 Tei = VSUB(Tee, Teh);
+			 ST(&(xo[WS(os, 46)]), VADD(Teb, Tei), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 82)]), VSUB(Tei, Teb), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tej, Tek, TdZ, Te0;
+			 Tej = VBYI(VADD(Tea, Te7));
+			 Tek = VADD(Tee, Teh);
+			 ST(&(xo[WS(os, 18)]), VADD(Tej, Tek), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 110)]), VSUB(Tek, Tej), ovs, &(xo[0]));
+			 TdZ = VSUB(TdT, TdU);
+			 Te0 = VBYI(VSUB(TdX, TdW));
+			 ST(&(xo[WS(os, 66)]), VSUB(TdZ, Te0), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 62)]), VADD(TdZ, Te0), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T7z, T8n, T8f, T8k, T8x, T8P, T8H, T8M, T80, T8L, T8O, T8c, T8j, T8A, T8E;
+		    V T8m;
+		    {
+			 V T7f, T8d, T7y, T8e;
+			 {
+			      V T77, T7e, T7q, T7x;
+			      T77 = VADD(T6Z, T76);
+			      T7e = VADD(T7a, T7d);
+			      T7f = VFNMS(LDK(KP336889853), T7e, VMUL(LDK(KP941544065), T77));
+			      T8d = VFMA(LDK(KP336889853), T77, VMUL(LDK(KP941544065), T7e));
+			      T7q = VADD(T7i, T7p);
+			      T7x = VADD(T7t, T7w);
+			      T7y = VFMA(LDK(KP941544065), T7q, VMUL(LDK(KP336889853), T7x));
+			      T8e = VFNMS(LDK(KP336889853), T7q, VMUL(LDK(KP941544065), T7x));
+			 }
+			 T7z = VSUB(T7f, T7y);
+			 T8n = VADD(T8d, T8e);
+			 T8f = VSUB(T8d, T8e);
+			 T8k = VADD(T7f, T7y);
+		    }
+		    {
+			 V T8t, T8F, T8w, T8G;
+			 {
+			      V T8r, T8s, T8u, T8v;
+			      T8r = VSUB(T6Z, T76);
+			      T8s = VSUB(T7d, T7a);
+			      T8t = VFNMS(LDK(KP427555093), T8s, VMUL(LDK(KP903989293), T8r));
+			      T8F = VFMA(LDK(KP427555093), T8r, VMUL(LDK(KP903989293), T8s));
+			      T8u = VSUB(T7i, T7p);
+			      T8v = VSUB(T7w, T7t);
+			      T8w = VFMA(LDK(KP903989293), T8u, VMUL(LDK(KP427555093), T8v));
+			      T8G = VFNMS(LDK(KP427555093), T8u, VMUL(LDK(KP903989293), T8v));
+			 }
+			 T8x = VSUB(T8t, T8w);
+			 T8P = VADD(T8F, T8G);
+			 T8H = VSUB(T8F, T8G);
+			 T8M = VADD(T8t, T8w);
+		    }
+		    {
+			 V T7Z, T8z, T88, T8C, T7O, T8D, T8b, T8y, T7Y, T87;
+			 T7Y = VSUB(T7U, T7X);
+			 T7Z = VADD(T7R, T7Y);
+			 T8z = VSUB(T7Y, T7R);
+			 T87 = VSUB(T85, T86);
+			 T88 = VADD(T84, T87);
+			 T8C = VSUB(T84, T87);
+			 {
+			      V T7G, T7N, T89, T8a;
+			      T7G = VFMA(LDK(KP634393284), T7C, VMUL(LDK(KP773010453), T7F));
+			      T7N = VFNMS(LDK(KP634393284), T7M, VMUL(LDK(KP773010453), T7J));
+			      T7O = VADD(T7G, T7N);
+			      T8D = VSUB(T7G, T7N);
+			      T89 = VFNMS(LDK(KP634393284), T7F, VMUL(LDK(KP773010453), T7C));
+			      T8a = VFMA(LDK(KP773010453), T7M, VMUL(LDK(KP634393284), T7J));
+			      T8b = VADD(T89, T8a);
+			      T8y = VSUB(T89, T8a);
+			 }
+			 T80 = VSUB(T7O, T7Z);
+			 T8L = VADD(T8C, T8D);
+			 T8O = VADD(T8z, T8y);
+			 T8c = VSUB(T88, T8b);
+			 T8j = VADD(T88, T8b);
+			 T8A = VSUB(T8y, T8z);
+			 T8E = VSUB(T8C, T8D);
+			 T8m = VADD(T7Z, T7O);
+		    }
+		    {
+			 V T81, T8g, T8N, T8Q;
+			 T81 = VBYI(VSUB(T7z, T80));
+			 T8g = VSUB(T8c, T8f);
+			 ST(&(xo[WS(os, 39)]), VADD(T81, T8g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 89)]), VSUB(T8g, T81), ovs, &(xo[WS(os, 1)]));
+			 T8N = VADD(T8L, T8M);
+			 T8Q = VBYI(VADD(T8O, T8P));
+			 ST(&(xo[WS(os, 119)]), VSUB(T8N, T8Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VADD(T8N, T8Q), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8R, T8S, T8h, T8i;
+			 T8R = VSUB(T8L, T8M);
+			 T8S = VBYI(VSUB(T8P, T8O));
+			 ST(&(xo[WS(os, 73)]), VSUB(T8R, T8S), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 55)]), VADD(T8R, T8S), ovs, &(xo[WS(os, 1)]));
+			 T8h = VBYI(VADD(T80, T7z));
+			 T8i = VADD(T8c, T8f);
+			 ST(&(xo[WS(os, 25)]), VADD(T8h, T8i), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 103)]), VSUB(T8i, T8h), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8l, T8o, T8B, T8I;
+			 T8l = VADD(T8j, T8k);
+			 T8o = VBYI(VADD(T8m, T8n));
+			 ST(&(xo[WS(os, 121)]), VSUB(T8l, T8o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(T8l, T8o), ovs, &(xo[WS(os, 1)]));
+			 T8B = VBYI(VSUB(T8x, T8A));
+			 T8I = VSUB(T8E, T8H);
+			 ST(&(xo[WS(os, 41)]), VADD(T8B, T8I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 87)]), VSUB(T8I, T8B), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8J, T8K, T8p, T8q;
+			 T8J = VBYI(VADD(T8A, T8x));
+			 T8K = VADD(T8E, T8H);
+			 ST(&(xo[WS(os, 23)]), VADD(T8J, T8K), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 105)]), VSUB(T8K, T8J), ovs, &(xo[WS(os, 1)]));
+			 T8p = VSUB(T8j, T8k);
+			 T8q = VBYI(VSUB(T8n, T8m));
+			 ST(&(xo[WS(os, 71)]), VSUB(T8p, T8q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 57)]), VADD(T8p, T8q), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T2v, T5d, T55, T5a, T5n, T5F, T5x, T5C, T4K, T5B, T5E, T52, T59, T5q, T5u;
+		    V T5c;
+		    {
+			 V T1f, T53, T2u, T54;
+			 {
+			      V T11, T1e, T2g, T2t;
+			      T11 = VADD(Tr, T10);
+			      T1e = VADD(T1a, T1d);
+			      T1f = VFNMS(LDK(KP242980179), T1e, VMUL(LDK(KP970031253), T11));
+			      T53 = VFMA(LDK(KP242980179), T11, VMUL(LDK(KP970031253), T1e));
+			      T2g = VADD(T1G, T2f);
+			      T2t = VADD(T2p, T2s);
+			      T2u = VFMA(LDK(KP970031253), T2g, VMUL(LDK(KP242980179), T2t));
+			      T54 = VFNMS(LDK(KP242980179), T2g, VMUL(LDK(KP970031253), T2t));
+			 }
+			 T2v = VSUB(T1f, T2u);
+			 T5d = VADD(T53, T54);
+			 T55 = VSUB(T53, T54);
+			 T5a = VADD(T1f, T2u);
+		    }
+		    {
+			 V T5j, T5v, T5m, T5w;
+			 {
+			      V T5h, T5i, T5k, T5l;
+			      T5h = VSUB(Tr, T10);
+			      T5i = VSUB(T1d, T1a);
+			      T5j = VFNMS(LDK(KP514102744), T5i, VMUL(LDK(KP857728610), T5h));
+			      T5v = VFMA(LDK(KP514102744), T5h, VMUL(LDK(KP857728610), T5i));
+			      T5k = VSUB(T1G, T2f);
+			      T5l = VSUB(T2s, T2p);
+			      T5m = VFMA(LDK(KP857728610), T5k, VMUL(LDK(KP514102744), T5l));
+			      T5w = VFNMS(LDK(KP514102744), T5k, VMUL(LDK(KP857728610), T5l));
+			 }
+			 T5n = VSUB(T5j, T5m);
+			 T5F = VADD(T5v, T5w);
+			 T5x = VSUB(T5v, T5w);
+			 T5C = VADD(T5j, T5m);
+		    }
+		    {
+			 V T4J, T5p, T4Y, T5s, T3I, T5t, T51, T5o, T4I, T4X;
+			 T4I = VSUB(T4q, T4H);
+			 T4J = VADD(T49, T4I);
+			 T5p = VSUB(T4I, T49);
+			 T4X = VSUB(T4V, T4W);
+			 T4Y = VADD(T4U, T4X);
+			 T5s = VSUB(T4U, T4X);
+			 {
+			      V T36, T3H, T4Z, T50;
+			      T36 = VFMA(LDK(KP881921264), T2W, VMUL(LDK(KP471396736), T35));
+			      T3H = VFNMS(LDK(KP471396736), T3G, VMUL(LDK(KP881921264), T3x));
+			      T3I = VADD(T36, T3H);
+			      T5t = VSUB(T36, T3H);
+			      T4Z = VFNMS(LDK(KP471396736), T2W, VMUL(LDK(KP881921264), T35));
+			      T50 = VFMA(LDK(KP471396736), T3x, VMUL(LDK(KP881921264), T3G));
+			      T51 = VADD(T4Z, T50);
+			      T5o = VSUB(T4Z, T50);
+			 }
+			 T4K = VSUB(T3I, T4J);
+			 T5B = VADD(T5s, T5t);
+			 T5E = VADD(T5p, T5o);
+			 T52 = VSUB(T4Y, T51);
+			 T59 = VADD(T4Y, T51);
+			 T5q = VSUB(T5o, T5p);
+			 T5u = VSUB(T5s, T5t);
+			 T5c = VADD(T4J, T3I);
+		    }
+		    {
+			 V T4L, T56, T5D, T5G;
+			 T4L = VBYI(VSUB(T2v, T4K));
+			 T56 = VSUB(T52, T55);
+			 ST(&(xo[WS(os, 37)]), VADD(T4L, T56), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 91)]), VSUB(T56, T4L), ovs, &(xo[WS(os, 1)]));
+			 T5D = VADD(T5B, T5C);
+			 T5G = VBYI(VADD(T5E, T5F));
+			 ST(&(xo[WS(os, 117)]), VSUB(T5D, T5G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 11)]), VADD(T5D, T5G), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5H, T5I, T57, T58;
+			 T5H = VSUB(T5B, T5C);
+			 T5I = VBYI(VSUB(T5F, T5E));
+			 ST(&(xo[WS(os, 75)]), VSUB(T5H, T5I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 53)]), VADD(T5H, T5I), ovs, &(xo[WS(os, 1)]));
+			 T57 = VBYI(VADD(T4K, T2v));
+			 T58 = VADD(T52, T55);
+			 ST(&(xo[WS(os, 27)]), VADD(T57, T58), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 101)]), VSUB(T58, T57), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5b, T5e, T5r, T5y;
+			 T5b = VADD(T59, T5a);
+			 T5e = VBYI(VADD(T5c, T5d));
+			 ST(&(xo[WS(os, 123)]), VSUB(T5b, T5e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VADD(T5b, T5e), ovs, &(xo[WS(os, 1)]));
+			 T5r = VBYI(VSUB(T5n, T5q));
+			 T5y = VSUB(T5u, T5x);
+			 ST(&(xo[WS(os, 43)]), VADD(T5r, T5y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 85)]), VSUB(T5y, T5r), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5z, T5A, T5f, T5g;
+			 T5z = VBYI(VADD(T5q, T5n));
+			 T5A = VADD(T5u, T5x);
+			 ST(&(xo[WS(os, 21)]), VADD(T5z, T5A), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 107)]), VSUB(T5A, T5z), ovs, &(xo[WS(os, 1)]));
+			 T5f = VSUB(T59, T5a);
+			 T5g = VBYI(VSUB(T5d, T5c));
+			 ST(&(xo[WS(os, 69)]), VSUB(T5f, T5g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 59)]), VADD(T5f, T5g), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T9i, T9B, T9t, T9x, T9O, Ta3, T9V, T9Z, T93, Ta0, Ta2, T9q, T9y, T9H, T9S;
+		    V T9A;
+		    {
+			 V T9a, T9r, T9h, T9s;
+			 {
+			      V T96, T99, T9d, T9g;
+			      T96 = VSUB(T94, T95);
+			      T99 = VSUB(T97, T98);
+			      T9a = VFMA(LDK(KP740951125), T96, VMUL(LDK(KP671558954), T99));
+			      T9r = VFNMS(LDK(KP671558954), T96, VMUL(LDK(KP740951125), T99));
+			      T9d = VSUB(T9b, T9c);
+			      T9g = VSUB(T9e, T9f);
+			      T9h = VFNMS(LDK(KP671558954), T9g, VMUL(LDK(KP740951125), T9d));
+			      T9s = VFMA(LDK(KP671558954), T9d, VMUL(LDK(KP740951125), T9g));
+			 }
+			 T9i = VSUB(T9a, T9h);
+			 T9B = VADD(T9r, T9s);
+			 T9t = VSUB(T9r, T9s);
+			 T9x = VADD(T9a, T9h);
+		    }
+		    {
+			 V T9K, T9T, T9N, T9U;
+			 {
+			      V T9I, T9J, T9L, T9M;
+			      T9I = VADD(T95, T94);
+			      T9J = VADD(T97, T98);
+			      T9K = VFMA(LDK(KP998795456), T9I, VMUL(LDK(KP049067674), T9J));
+			      T9T = VFNMS(LDK(KP049067674), T9I, VMUL(LDK(KP998795456), T9J));
+			      T9L = VADD(T9c, T9b);
+			      T9M = VADD(T9e, T9f);
+			      T9N = VFNMS(LDK(KP049067674), T9M, VMUL(LDK(KP998795456), T9L));
+			      T9U = VFMA(LDK(KP049067674), T9L, VMUL(LDK(KP998795456), T9M));
+			 }
+			 T9O = VSUB(T9K, T9N);
+			 Ta3 = VADD(T9T, T9U);
+			 T9V = VSUB(T9T, T9U);
+			 T9Z = VADD(T9K, T9N);
+		    }
+		    {
+			 V T8V, T9F, T9p, T9R, T92, T9Q, T9m, T9G, T8U, T9n;
+			 T8U = VADD(T7U, T7X);
+			 T8V = VSUB(T8T, T8U);
+			 T9F = VADD(T8T, T8U);
+			 T9n = VADD(T85, T86);
+			 T9p = VSUB(T9n, T9o);
+			 T9R = VADD(T9o, T9n);
+			 {
+			      V T8Y, T91, T9k, T9l;
+			      T8Y = VFMA(LDK(KP098017140), T8W, VMUL(LDK(KP995184726), T8X));
+			      T91 = VFNMS(LDK(KP098017140), T90, VMUL(LDK(KP995184726), T8Z));
+			      T92 = VSUB(T8Y, T91);
+			      T9Q = VADD(T8Y, T91);
+			      T9k = VFNMS(LDK(KP098017140), T8X, VMUL(LDK(KP995184726), T8W));
+			      T9l = VFMA(LDK(KP995184726), T90, VMUL(LDK(KP098017140), T8Z));
+			      T9m = VSUB(T9k, T9l);
+			      T9G = VADD(T9k, T9l);
+			 }
+			 T93 = VSUB(T8V, T92);
+			 Ta0 = VADD(T9R, T9Q);
+			 Ta2 = VADD(T9F, T9G);
+			 T9q = VSUB(T9m, T9p);
+			 T9y = VADD(T9p, T9m);
+			 T9H = VSUB(T9F, T9G);
+			 T9S = VSUB(T9Q, T9R);
+			 T9A = VADD(T8V, T92);
+		    }
+		    {
+			 V T9j, T9u, Ta1, Ta4;
+			 T9j = VADD(T93, T9i);
+			 T9u = VBYI(VADD(T9q, T9t));
+			 ST(&(xo[WS(os, 111)]), VSUB(T9j, T9u), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 17)]), VADD(T9j, T9u), ovs, &(xo[WS(os, 1)]));
+			 Ta1 = VBYI(VSUB(T9Z, Ta0));
+			 Ta4 = VSUB(Ta2, Ta3);
+			 ST(&(xo[WS(os, 63)]), VADD(Ta1, Ta4), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 65)]), VSUB(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V Ta5, Ta6, T9v, T9w;
+			 Ta5 = VBYI(VADD(Ta0, T9Z));
+			 Ta6 = VADD(Ta2, Ta3);
+			 ST(&(xo[WS(os, 1)]), VADD(Ta5, Ta6), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 127)]), VSUB(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+			 T9v = VSUB(T93, T9i);
+			 T9w = VBYI(VSUB(T9t, T9q));
+			 ST(&(xo[WS(os, 81)]), VSUB(T9v, T9w), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 47)]), VADD(T9v, T9w), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T9z, T9C, T9P, T9W;
+			 T9z = VBYI(VSUB(T9x, T9y));
+			 T9C = VSUB(T9A, T9B);
+			 ST(&(xo[WS(os, 49)]), VADD(T9z, T9C), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 79)]), VSUB(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+			 T9P = VADD(T9H, T9O);
+			 T9W = VBYI(VADD(T9S, T9V));
+			 ST(&(xo[WS(os, 97)]), VSUB(T9P, T9W), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VADD(T9P, T9W), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T9X, T9Y, T9D, T9E;
+			 T9X = VSUB(T9H, T9O);
+			 T9Y = VBYI(VSUB(T9V, T9S));
+			 ST(&(xo[WS(os, 95)]), VSUB(T9X, T9Y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 33)]), VADD(T9X, T9Y), ovs, &(xo[WS(os, 1)]));
+			 T9D = VBYI(VADD(T9y, T9x));
+			 T9E = VADD(T9A, T9B);
+			 ST(&(xo[WS(os, 15)]), VADD(T9D, T9E), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 113)]), VSUB(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T68, T6r, T6j, T6n, T6E, T6T, T6L, T6P, T5T, T6Q, T6S, T6g, T6o, T6x, T6I;
+		    V T6q;
+		    {
+			 V T60, T6h, T67, T6i;
+			 {
+			      V T5W, T5Z, T63, T66;
+			      T5W = VSUB(T5U, T5V);
+			      T5Z = VSUB(T5X, T5Y);
+			      T60 = VFMA(LDK(KP803207531), T5W, VMUL(LDK(KP595699304), T5Z));
+			      T6h = VFNMS(LDK(KP595699304), T5W, VMUL(LDK(KP803207531), T5Z));
+			      T63 = VSUB(T61, T62);
+			      T66 = VSUB(T64, T65);
+			      T67 = VFNMS(LDK(KP595699304), T66, VMUL(LDK(KP803207531), T63));
+			      T6i = VFMA(LDK(KP595699304), T63, VMUL(LDK(KP803207531), T66));
+			 }
+			 T68 = VSUB(T60, T67);
+			 T6r = VADD(T6h, T6i);
+			 T6j = VSUB(T6h, T6i);
+			 T6n = VADD(T60, T67);
+		    }
+		    {
+			 V T6A, T6J, T6D, T6K;
+			 {
+			      V T6y, T6z, T6B, T6C;
+			      T6y = VADD(T5V, T5U);
+			      T6z = VADD(T5X, T5Y);
+			      T6A = VFMA(LDK(KP989176509), T6y, VMUL(LDK(KP146730474), T6z));
+			      T6J = VFNMS(LDK(KP146730474), T6y, VMUL(LDK(KP989176509), T6z));
+			      T6B = VADD(T62, T61);
+			      T6C = VADD(T64, T65);
+			      T6D = VFNMS(LDK(KP146730474), T6C, VMUL(LDK(KP989176509), T6B));
+			      T6K = VFMA(LDK(KP146730474), T6B, VMUL(LDK(KP989176509), T6C));
+			 }
+			 T6E = VSUB(T6A, T6D);
+			 T6T = VADD(T6J, T6K);
+			 T6L = VSUB(T6J, T6K);
+			 T6P = VADD(T6A, T6D);
+		    }
+		    {
+			 V T5L, T6v, T6f, T6H, T5S, T6G, T6c, T6w, T5K, T6d;
+			 T5K = VADD(T4q, T4H);
+			 T5L = VSUB(T5J, T5K);
+			 T6v = VADD(T5J, T5K);
+			 T6d = VADD(T4V, T4W);
+			 T6f = VSUB(T6d, T6e);
+			 T6H = VADD(T6e, T6d);
+			 {
+			      V T5O, T5R, T6a, T6b;
+			      T5O = VFMA(LDK(KP956940335), T5M, VMUL(LDK(KP290284677), T5N));
+			      T5R = VFNMS(LDK(KP290284677), T5Q, VMUL(LDK(KP956940335), T5P));
+			      T5S = VSUB(T5O, T5R);
+			      T6G = VADD(T5O, T5R);
+			      T6a = VFNMS(LDK(KP290284677), T5M, VMUL(LDK(KP956940335), T5N));
+			      T6b = VFMA(LDK(KP290284677), T5P, VMUL(LDK(KP956940335), T5Q));
+			      T6c = VSUB(T6a, T6b);
+			      T6w = VADD(T6a, T6b);
+			 }
+			 T5T = VSUB(T5L, T5S);
+			 T6Q = VADD(T6H, T6G);
+			 T6S = VADD(T6v, T6w);
+			 T6g = VSUB(T6c, T6f);
+			 T6o = VADD(T6f, T6c);
+			 T6x = VSUB(T6v, T6w);
+			 T6I = VSUB(T6G, T6H);
+			 T6q = VADD(T5L, T5S);
+		    }
+		    {
+			 V T69, T6k, T6R, T6U;
+			 T69 = VADD(T5T, T68);
+			 T6k = VBYI(VADD(T6g, T6j));
+			 ST(&(xo[WS(os, 109)]), VSUB(T69, T6k), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 19)]), VADD(T69, T6k), ovs, &(xo[WS(os, 1)]));
+			 T6R = VBYI(VSUB(T6P, T6Q));
+			 T6U = VSUB(T6S, T6T);
+			 ST(&(xo[WS(os, 61)]), VADD(T6R, T6U), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 67)]), VSUB(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6V, T6W, T6l, T6m;
+			 T6V = VBYI(VADD(T6Q, T6P));
+			 T6W = VADD(T6S, T6T);
+			 ST(&(xo[WS(os, 3)]), VADD(T6V, T6W), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 125)]), VSUB(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+			 T6l = VSUB(T5T, T68);
+			 T6m = VBYI(VSUB(T6j, T6g));
+			 ST(&(xo[WS(os, 83)]), VSUB(T6l, T6m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 45)]), VADD(T6l, T6m), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6p, T6s, T6F, T6M;
+			 T6p = VBYI(VSUB(T6n, T6o));
+			 T6s = VSUB(T6q, T6r);
+			 ST(&(xo[WS(os, 51)]), VADD(T6p, T6s), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 77)]), VSUB(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+			 T6F = VADD(T6x, T6E);
+			 T6M = VBYI(VADD(T6I, T6L));
+			 ST(&(xo[WS(os, 99)]), VSUB(T6F, T6M), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 29)]), VADD(T6F, T6M), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6N, T6O, T6t, T6u;
+			 T6N = VSUB(T6x, T6E);
+			 T6O = VBYI(VSUB(T6L, T6I));
+			 ST(&(xo[WS(os, 93)]), VSUB(T6N, T6O), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 35)]), VADD(T6N, T6O), ovs, &(xo[WS(os, 1)]));
+			 T6t = VBYI(VADD(T6o, T6n));
+			 T6u = VADD(T6q, T6r);
+			 ST(&(xo[WS(os, 13)]), VADD(T6t, T6u), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 115)]), VSUB(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 128, XSIMD_STRING("n1bv_128"), {938, 186, 144, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_128) (planner *p) {
+     X(kdft_register) (p, n1bv_128, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 13 -name n1bv_13 -include n1b.h */
+
+/*
+ * This function contains 88 FP additions, 63 FP multiplications,
+ * (or, 31 additions, 6 multiplications, 57 fused multiply/add),
+ * 96 stack variables, 23 constants, and 26 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP904176221, +0.904176221990848204433795481776887926501523162);
+     DVK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DVK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DVK(KP516520780, +0.516520780623489722840901288569017135705033622);
+     DVK(KP522026385, +0.522026385161275033714027226654165028300441940);
+     DVK(KP957805992, +0.957805992594665126462521754605754580515587217);
+     DVK(KP600477271, +0.600477271932665282925769253334763009352012849);
+     DVK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DVK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DVK(KP769338817, +0.769338817572980603471413688209101117038278899);
+     DVK(KP859542535, +0.859542535098774820163672132761689612766401925);
+     DVK(KP581704778, +0.581704778510515730456870384989698884939833902);
+     DVK(KP853480001, +0.853480001859823990758994934970528322872359049);
+     DVK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DVK(KP226109445, +0.226109445035782405468510155372505010481906348);
+     DVK(KP301479260, +0.301479260047709873958013540496673347309208464);
+     DVK(KP686558370, +0.686558370781754340655719594850823015421401653);
+     DVK(KP514918778, +0.514918778086315755491789696138117261566051239);
+     DVK(KP038632954, +0.038632954644348171955506895830342264440241080);
+     DVK(KP612264650, +0.612264650376756543746494474777125408779395514);
+     DVK(KP302775637, +0.302775637731994646559610633735247973125648287);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) {
+	       V T1, T7, T2, Tg, Tf, TN, Th, Tq, Ta, Tj, T5, Tr, Tk;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V Td, Te, T8, T9, T3, T4;
+		    Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T4 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = VADD(Td, Te);
+		    TN = VSUB(Td, Te);
+		    Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tq = VSUB(T8, T9);
+		    Ta = VADD(T8, T9);
+		    Tj = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = VADD(T3, T4);
+		    Tr = VSUB(T4, T3);
+		    Tk = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       }
+	       {
+		    V Tt, Ti, Ty, Tb, Ts, TQ, Tx, T6, Tu, Tl;
+		    Tt = VSUB(Tg, Th);
+		    Ti = VADD(Tg, Th);
+		    Ty = VFMS(LDK(KP500000000), Ta, T7);
+		    Tb = VADD(T7, Ta);
+		    Ts = VSUB(Tq, Tr);
+		    TQ = VADD(Tr, Tq);
+		    Tx = VFNMS(LDK(KP500000000), T5, T2);
+		    T6 = VADD(T2, T5);
+		    Tu = VSUB(Tj, Tk);
+		    Tl = VADD(Tj, Tk);
+		    {
+			 V TK, Tz, Tc, TX, Tv, TO, TL, Tm;
+			 TK = VADD(Tx, Ty);
+			 Tz = VSUB(Tx, Ty);
+			 Tc = VADD(T6, Tb);
+			 TX = VSUB(T6, Tb);
+			 Tv = VSUB(Tt, Tu);
+			 TO = VADD(Tt, Tu);
+			 TL = VSUB(Ti, Tl);
+			 Tm = VADD(Ti, Tl);
+			 {
+			      V TF, Tw, TP, TY, TT, TM, TA, Tn;
+			      TF = VSUB(Ts, Tv);
+			      Tw = VADD(Ts, Tv);
+			      TP = VFNMS(LDK(KP500000000), TO, TN);
+			      TY = VADD(TN, TO);
+			      TT = VFNMS(LDK(KP866025403), TL, TK);
+			      TM = VFMA(LDK(KP866025403), TL, TK);
+			      TA = VFNMS(LDK(KP500000000), Tm, Tf);
+			      Tn = VADD(Tf, Tm);
+			      {
+				   V T1f, T1n, TI, T18, T1k, T1c, TD, T17, T10, T1m, T16, T1e, TU, TR;
+				   TU = VFNMS(LDK(KP866025403), TQ, TP);
+				   TR = VFMA(LDK(KP866025403), TQ, TP);
+				   {
+					V TZ, T15, TE, TB;
+					TZ = VFMA(LDK(KP302775637), TY, TX);
+					T15 = VFNMS(LDK(KP302775637), TX, TY);
+					TE = VSUB(Tz, TA);
+					TB = VADD(Tz, TA);
+					{
+					     V TH, To, TV, T13;
+					     TH = VSUB(Tc, Tn);
+					     To = VADD(Tc, Tn);
+					     TV = VFNMS(LDK(KP612264650), TU, TT);
+					     T13 = VFMA(LDK(KP612264650), TT, TU);
+					     {
+						  V TS, T12, TG, T1b;
+						  TS = VFNMS(LDK(KP038632954), TR, TM);
+						  T12 = VFMA(LDK(KP038632954), TM, TR);
+						  TG = VFNMS(LDK(KP514918778), TF, TE);
+						  T1b = VFMA(LDK(KP686558370), TE, TF);
+						  {
+						       V TC, T1a, Tp, TW, T14;
+						       TC = VFMA(LDK(KP301479260), TB, Tw);
+						       T1a = VFNMS(LDK(KP226109445), Tw, TB);
+						       Tp = VFNMS(LDK(KP083333333), To, T1);
+						       ST(&(xo[0]), VADD(T1, To), ovs, &(xo[0]));
+						       T1f = VFMA(LDK(KP853480001), TV, TS);
+						       TW = VFNMS(LDK(KP853480001), TV, TS);
+						       T1n = VFMA(LDK(KP853480001), T13, T12);
+						       T14 = VFNMS(LDK(KP853480001), T13, T12);
+						       TI = VFMA(LDK(KP581704778), TH, TG);
+						       T18 = VFNMS(LDK(KP859542535), TG, TH);
+						       T1k = VFMA(LDK(KP769338817), T1b, T1a);
+						       T1c = VFNMS(LDK(KP769338817), T1b, T1a);
+						       TD = VFMA(LDK(KP503537032), TC, Tp);
+						       T17 = VFNMS(LDK(KP251768516), TC, Tp);
+						       T10 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), TZ, TW));
+						       T1m = VFNMS(LDK(KP522026385), TW, TZ);
+						       T16 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), T15, T14));
+						       T1e = VFNMS(LDK(KP522026385), T14, T15);
+						  }
+					     }
+					}
+				   }
+				   {
+					V T1o, T1q, T1g, T1i, T1d, T1h, T1l, T1p;
+					{
+					     V T11, TJ, T19, T1j;
+					     T11 = VFMA(LDK(KP516520780), TI, TD);
+					     TJ = VFNMS(LDK(KP516520780), TI, TD);
+					     T19 = VFMA(LDK(KP300462606), T18, T17);
+					     T1j = VFNMS(LDK(KP300462606), T18, T17);
+					     T1o = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1n, T1m));
+					     T1q = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1n, T1m));
+					     T1g = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1f, T1e));
+					     T1i = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1f, T1e));
+					     ST(&(xo[WS(os, 12)]), VFMAI(T16, T11), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 1)]), VFNMSI(T16, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 8)]), VFNMSI(T10, TJ), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 5)]), VFMAI(T10, TJ), ovs, &(xo[WS(os, 1)]));
+					     T1d = VFNMS(LDK(KP503537032), T1c, T19);
+					     T1h = VFMA(LDK(KP503537032), T1c, T19);
+					     T1l = VFNMS(LDK(KP503537032), T1k, T1j);
+					     T1p = VFMA(LDK(KP503537032), T1k, T1j);
+					}
+					ST(&(xo[WS(os, 9)]), VFNMSI(T1g, T1d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 4)]), VFMAI(T1g, T1d), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 10)]), VFMAI(T1i, T1h), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 3)]), VFNMSI(T1i, T1h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 7)]), VFNMSI(T1o, T1l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 6)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 11)]), VFNMSI(T1q, T1p), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 2)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 13, XSIMD_STRING("n1bv_13"), {31, 6, 57, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_13) (planner *p) {
+     X(kdft_register) (p, n1bv_13, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 13 -name n1bv_13 -include n1b.h */
+
+/*
+ * This function contains 88 FP additions, 34 FP multiplications,
+ * (or, 69 additions, 15 multiplications, 19 fused multiply/add),
+ * 60 stack variables, 20 constants, and 26 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DVK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DVK(KP075902986, +0.075902986037193865983102897245103540356428373);
+     DVK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DVK(KP132983124, +0.132983124607418643793760531921092974399165133);
+     DVK(KP258260390, +0.258260390311744861420450644284508567852516811);
+     DVK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DVK(KP300238635, +0.300238635966332641462884626667381504676006424);
+     DVK(KP011599105, +0.011599105605768290721655456654083252189827041);
+     DVK(KP256247671, +0.256247671582936600958684654061725059144125175);
+     DVK(KP156891391, +0.156891391051584611046832726756003269660212636);
+     DVK(KP174138601, +0.174138601152135905005660794929264742616964676);
+     DVK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DVK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DVK(KP113854479, +0.113854479055790798974654345867655310534642560);
+     DVK(KP265966249, +0.265966249214837287587521063842185948798330267);
+     DVK(KP387390585, +0.387390585467617292130675966426762851778775217);
+     DVK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) {
+	       V TW, Tb, Tm, Ts, TB, TR, TX, TK, TU, Tz, TC, TN, TT;
+	       TW = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V Te, TH, Ta, Tu, Tp, T5, Tt, To, Th, Tw, Tk, Tx, Tl, TI, Tc;
+		    V Td, Tq, Tr;
+		    Tc = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Td = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Te = VSUB(Tc, Td);
+		    TH = VADD(Tc, Td);
+		    {
+			 V T6, T7, T8, T9;
+			 T6 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T9 = VADD(T7, T8);
+			 Ta = VADD(T6, T9);
+			 Tu = VFNMS(LDK(KP500000000), T9, T6);
+			 Tp = VSUB(T7, T8);
+		    }
+		    {
+			 V T1, T2, T3, T4;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T4 = VADD(T2, T3);
+			 T5 = VADD(T1, T4);
+			 Tt = VFNMS(LDK(KP500000000), T4, T1);
+			 To = VSUB(T2, T3);
+		    }
+		    {
+			 V Tf, Tg, Ti, Tj;
+			 Tf = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 Tw = VADD(Tf, Tg);
+			 Ti = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tk = VSUB(Ti, Tj);
+			 Tx = VADD(Ti, Tj);
+		    }
+		    Tl = VADD(Th, Tk);
+		    TI = VADD(Tw, Tx);
+		    Tb = VSUB(T5, Ta);
+		    Tm = VADD(Te, Tl);
+		    Tq = VMUL(LDK(KP866025403), VSUB(To, Tp));
+		    Tr = VFNMS(LDK(KP500000000), Tl, Te);
+		    Ts = VADD(Tq, Tr);
+		    TB = VSUB(Tq, Tr);
+		    {
+			 V TP, TQ, TG, TJ;
+			 TP = VADD(T5, Ta);
+			 TQ = VADD(TH, TI);
+			 TR = VMUL(LDK(KP300462606), VSUB(TP, TQ));
+			 TX = VADD(TP, TQ);
+			 TG = VADD(Tt, Tu);
+			 TJ = VFNMS(LDK(KP500000000), TI, TH);
+			 TK = VSUB(TG, TJ);
+			 TU = VADD(TG, TJ);
+		    }
+		    {
+			 V Tv, Ty, TL, TM;
+			 Tv = VSUB(Tt, Tu);
+			 Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
+			 Tz = VSUB(Tv, Ty);
+			 TC = VADD(Tv, Ty);
+			 TL = VADD(To, Tp);
+			 TM = VSUB(Th, Tk);
+			 TN = VSUB(TL, TM);
+			 TT = VADD(TL, TM);
+		    }
+	       }
+	       ST(&(xo[0]), VADD(TW, TX), ovs, &(xo[0]));
+	       {
+		    V T1c, T1n, T11, T14, T17, T1k, Tn, TE, T18, T1j, TS, T1m, TZ, T1f, TA;
+		    V TD;
+		    {
+			 V T1a, T1b, T12, T13;
+			 T1a = VFMA(LDK(KP387390585), TN, VMUL(LDK(KP265966249), TK));
+			 T1b = VFNMS(LDK(KP503537032), TU, VMUL(LDK(KP113854479), TT));
+			 T1c = VSUB(T1a, T1b);
+			 T1n = VADD(T1a, T1b);
+			 T11 = VFMA(LDK(KP575140729), Tb, VMUL(LDK(KP174138601), Tm));
+			 T12 = VFNMS(LDK(KP256247671), Tz, VMUL(LDK(KP156891391), Ts));
+			 T13 = VFMA(LDK(KP011599105), TB, VMUL(LDK(KP300238635), TC));
+			 T14 = VADD(T12, T13);
+			 T17 = VSUB(T11, T14);
+			 T1k = VMUL(LDK(KP1_732050807), VSUB(T12, T13));
+		    }
+		    Tn = VFNMS(LDK(KP575140729), Tm, VMUL(LDK(KP174138601), Tb));
+		    TA = VFMA(LDK(KP256247671), Ts, VMUL(LDK(KP156891391), Tz));
+		    TD = VFNMS(LDK(KP011599105), TC, VMUL(LDK(KP300238635), TB));
+		    TE = VADD(TA, TD);
+		    T18 = VMUL(LDK(KP1_732050807), VSUB(TD, TA));
+		    T1j = VSUB(Tn, TE);
+		    {
+			 V TO, T1e, TV, TY, T1d;
+			 TO = VFNMS(LDK(KP132983124), TN, VMUL(LDK(KP258260390), TK));
+			 T1e = VSUB(TR, TO);
+			 TV = VFMA(LDK(KP251768516), TT, VMUL(LDK(KP075902986), TU));
+			 TY = VFNMS(LDK(KP083333333), TX, TW);
+			 T1d = VSUB(TY, TV);
+			 TS = VFMA(LDK(KP2_000000000), TO, TR);
+			 T1m = VADD(T1e, T1d);
+			 TZ = VFMA(LDK(KP2_000000000), TV, TY);
+			 T1f = VSUB(T1d, T1e);
+		    }
+		    {
+			 V TF, T10, T1l, T1o;
+			 TF = VBYI(VFMA(LDK(KP2_000000000), TE, Tn));
+			 T10 = VADD(TS, TZ);
+			 ST(&(xo[WS(os, 1)]), VADD(TF, T10), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 12)]), VSUB(T10, TF), ovs, &(xo[0]));
+			 {
+			      V T15, T16, T1p, T1q;
+			      T15 = VBYI(VFMA(LDK(KP2_000000000), T14, T11));
+			      T16 = VSUB(TZ, TS);
+			      ST(&(xo[WS(os, 5)]), VADD(T15, T16), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 8)]), VSUB(T16, T15), ovs, &(xo[0]));
+			      T1p = VADD(T1n, T1m);
+			      T1q = VBYI(VADD(T1j, T1k));
+			      ST(&(xo[WS(os, 4)]), VSUB(T1p, T1q), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 9)]), VADD(T1q, T1p), ovs, &(xo[WS(os, 1)]));
+			 }
+			 T1l = VBYI(VSUB(T1j, T1k));
+			 T1o = VSUB(T1m, T1n);
+			 ST(&(xo[WS(os, 3)]), VADD(T1l, T1o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 10)]), VSUB(T1o, T1l), ovs, &(xo[0]));
+			 {
+			      V T1h, T1i, T19, T1g;
+			      T1h = VBYI(VADD(T18, T17));
+			      T1i = VSUB(T1f, T1c);
+			      ST(&(xo[WS(os, 6)]), VADD(T1h, T1i), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 7)]), VSUB(T1i, T1h), ovs, &(xo[WS(os, 1)]));
+			      T19 = VBYI(VSUB(T17, T18));
+			      T1g = VADD(T1c, T1f);
+			      ST(&(xo[WS(os, 2)]), VADD(T19, T1g), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 11)]), VSUB(T1g, T19), ovs, &(xo[WS(os, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 13, XSIMD_STRING("n1bv_13"), {69, 15, 19, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_13) (planner *p) {
+     X(kdft_register) (p, n1bv_13, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n1bv_14 -include n1b.h */
+
+/*
+ * This function contains 74 FP additions, 48 FP multiplications,
+ * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
+ * 63 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V TH, T3, TP, Tn, Ta, Tu, TU, TK, TO, Tk, TM, Tg, TL, Td, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
+		    {
+			 V T4, T5, T7, T8, Tl, Tm;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 TH = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TI = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 TJ = VADD(T7, T8);
+			 T9 = VSUB(T7, T8);
+			 TP = VADD(Tl, Tm);
+			 Tn = VSUB(Tl, Tm);
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    }
+		    Ta = VADD(T6, T9);
+		    Tu = VSUB(T6, T9);
+		    TU = VSUB(TI, TJ);
+		    TK = VADD(TI, TJ);
+		    TO = VADD(Ti, Tj);
+		    Tk = VSUB(Ti, Tj);
+		    TM = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    TL = VADD(Tb, Tc);
+		    Td = VSUB(Tb, Tc);
+	       }
+	       {
+		    V T13, TG, TY, T18, TB, Tw, TT, Tz, T11, T16, TE, Tr, TV, TQ;
+		    TV = VSUB(TP, TO);
+		    TQ = VADD(TO, TP);
+		    {
+			 V Ts, To, TW, TN;
+			 Ts = VSUB(Tk, Tn);
+			 To = VADD(Tk, Tn);
+			 TW = VSUB(TM, TL);
+			 TN = VADD(TL, TM);
+			 {
+			      V Tt, Th, TR, T12;
+			      Tt = VSUB(Td, Tg);
+			      Th = VADD(Td, Tg);
+			      TR = VFNMS(LDK(KP356895867), TK, TQ);
+			      T12 = VFNMS(LDK(KP554958132), TV, TU);
+			      {
+				   V Tx, TF, TZ, T14;
+				   Tx = VFNMS(LDK(KP356895867), Ta, To);
+				   TF = VFMA(LDK(KP554958132), Ts, Tu);
+				   ST(&(xo[0]), VADD(TH, VADD(TK, VADD(TN, TQ))), ovs, &(xo[0]));
+				   TZ = VFNMS(LDK(KP356895867), TN, TK);
+				   T14 = VFNMS(LDK(KP356895867), TQ, TN);
+				   {
+					V TX, T17, TC, Tp;
+					TX = VFMA(LDK(KP554958132), TW, TV);
+					T17 = VFMA(LDK(KP554958132), TU, TW);
+					ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
+					TC = VFNMS(LDK(KP356895867), Th, Ta);
+					Tp = VFNMS(LDK(KP356895867), To, Th);
+					{
+					     V TA, Tv, TS, Ty;
+					     TA = VFMA(LDK(KP554958132), Tt, Ts);
+					     Tv = VFNMS(LDK(KP554958132), Tu, Tt);
+					     TS = VFNMS(LDK(KP692021471), TR, TN);
+					     T13 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T12, TW));
+					     Ty = VFNMS(LDK(KP692021471), Tx, Th);
+					     TG = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TF, Tt));
+					     {
+						  V T10, T15, TD, Tq;
+						  T10 = VFNMS(LDK(KP692021471), TZ, TQ);
+						  T15 = VFNMS(LDK(KP692021471), T14, TK);
+						  TY = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TX, TU));
+						  T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
+						  TD = VFNMS(LDK(KP692021471), TC, To);
+						  Tq = VFNMS(LDK(KP692021471), Tp, Ta);
+						  TB = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TA, Tu));
+						  Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
+						  TT = VFNMS(LDK(KP900968867), TS, TH);
+						  Tz = VFNMS(LDK(KP900968867), Ty, T3);
+						  T11 = VFNMS(LDK(KP900968867), T10, TH);
+						  T16 = VFNMS(LDK(KP900968867), T15, TH);
+						  TE = VFNMS(LDK(KP900968867), TD, T3);
+						  Tr = VFNMS(LDK(KP900968867), Tq, T3);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    ST(&(xo[WS(os, 2)]), VFMAI(TY, TT), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 12)]), VFNMSI(TY, TT), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 9)]), VFMAI(TB, Tz), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VFNMSI(TB, Tz), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VFMAI(T13, T11), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VFNMSI(T13, T11), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VFMAI(T18, T16), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VFNMSI(T18, T16), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 13)]), VFNMSI(TG, TE), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VFMAI(TG, TE), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VFNMSI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n1bv_14"), {32, 6, 42, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_14) (planner *p) {
+     X(kdft_register) (p, n1bv_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n1bv_14 -include n1b.h */
+
+/*
+ * This function contains 74 FP additions, 36 FP multiplications,
+ * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
+ * 33 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V Tp, Ty, Tl, TL, Tq, TE, T7, TJ, Ts, TB, Te, TK, Tr, TH, Tn;
+	       V To;
+	       Tn = LD(&(xi[0]), ivs, &(xi[0]));
+	       To = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Tp = VSUB(Tn, To);
+	       Ty = VADD(Tn, To);
+	       {
+		    V Th, TC, Tk, TD;
+		    {
+			 V Tf, Tg, Ti, Tj;
+			 Tf = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Th = VSUB(Tf, Tg);
+			 TC = VADD(Tf, Tg);
+			 Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tj = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VSUB(Ti, Tj);
+			 TD = VADD(Ti, Tj);
+		    }
+		    Tl = VSUB(Th, Tk);
+		    TL = VSUB(TD, TC);
+		    Tq = VADD(Th, Tk);
+		    TE = VADD(TC, TD);
+	       }
+	       {
+		    V T3, Tz, T6, TA;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tz = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TA = VADD(T4, T5);
+		    }
+		    T7 = VSUB(T3, T6);
+		    TJ = VSUB(Tz, TA);
+		    Ts = VADD(T3, T6);
+		    TB = VADD(Tz, TA);
+	       }
+	       {
+		    V Ta, TF, Td, TG;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TF = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TG = VADD(Tb, Tc);
+		    }
+		    Te = VSUB(Ta, Td);
+		    TK = VSUB(TG, TF);
+		    Tr = VADD(Ta, Td);
+		    TH = VADD(TF, TG);
+	       }
+	       ST(&(xo[WS(os, 7)]), VADD(Tp, VADD(Ts, VADD(Tq, Tr))), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Ty, VADD(TB, VADD(TE, TH))), ovs, &(xo[0]));
+	       {
+		    V Tm, Tt, TQ, TP;
+		    Tm = VBYI(VFMA(LDK(KP433883739), T7, VFNMS(LDK(KP781831482), Tl, VMUL(LDK(KP974927912), Te))));
+		    Tt = VFMA(LDK(KP623489801), Tq, VFNMS(LDK(KP222520933), Tr, VFNMS(LDK(KP900968867), Ts, Tp)));
+		    ST(&(xo[WS(os, 3)]), VADD(Tm, Tt), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VSUB(Tt, Tm), ovs, &(xo[WS(os, 1)]));
+		    TQ = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
+		    TP = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TB, Ty)));
+		    ST(&(xo[WS(os, 12)]), VSUB(TP, TQ), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(TP, TQ), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tu, Tv, TM, TI;
+		    Tu = VBYI(VFMA(LDK(KP781831482), T7, VFMA(LDK(KP974927912), Tl, VMUL(LDK(KP433883739), Te))));
+		    Tv = VFMA(LDK(KP623489801), Ts, VFNMS(LDK(KP900968867), Tr, VFNMS(LDK(KP222520933), Tq, Tp)));
+		    ST(&(xo[WS(os, 1)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
+		    TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
+		    TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TE, Ty)));
+		    ST(&(xo[WS(os, 6)]), VSUB(TI, TM), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VADD(TI, TM), ovs, &(xo[0]));
+	       }
+	       {
+		    V TO, TN, Tx, Tw;
+		    TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
+		    TN = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP222520933), TH, VFNMS(LDK(KP900968867), TB, Ty)));
+		    ST(&(xo[WS(os, 4)]), VSUB(TN, TO), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VADD(TN, TO), ovs, &(xo[0]));
+		    Tx = VBYI(VFNMS(LDK(KP781831482), Te, VFNMS(LDK(KP433883739), Tl, VMUL(LDK(KP974927912), T7))));
+		    Tw = VFMA(LDK(KP623489801), Tr, VFNMS(LDK(KP900968867), Tq, VFNMS(LDK(KP222520933), Ts, Tp)));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tw, Tx), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VADD(Tx, Tw), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n1bv_14"), {50, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_14) (planner *p) {
+     X(kdft_register) (p, n1bv_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 15 -name n1bv_15 -include n1b.h */
+
+/*
+ * This function contains 78 FP additions, 49 FP multiplications,
+ * (or, 36 additions, 7 multiplications, 42 fused multiply/add),
+ * 78 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(30, is), MAKE_VOLATILE_STRIDE(30, os)) {
+	       V Tb, TH, Tw, TA, Th, T11, T5, Ti, T12, Ta, Tx, Te, Tq, T16, Tj;
+	       V T1, T2, T3;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+	       {
+		    V T6, T7, T8, Tm, Tn, To;
+		    T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+		    Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    {
+			 V T4, Tc, T9, Td, Tp;
+			 Tb = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 TH = VSUB(T2, T3);
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tw = VSUB(T7, T8);
+			 T9 = VADD(T7, T8);
+			 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tp = VADD(Tn, To);
+			 TA = VSUB(Tn, To);
+			 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T11 = VADD(T1, T4);
+			 T5 = VFNMS(LDK(KP500000000), T4, T1);
+			 Ti = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 T12 = VADD(T6, T9);
+			 Ta = VFNMS(LDK(KP500000000), T9, T6);
+			 Tx = VSUB(Tc, Td);
+			 Te = VADD(Tc, Td);
+			 Tq = VFNMS(LDK(KP500000000), Tp, Tm);
+			 T16 = VADD(Tm, Tp);
+			 Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    }
+	       }
+	       {
+		    V TI, Ty, T13, Tf, Tz, Tk;
+		    TI = VADD(Tw, Tx);
+		    Ty = VSUB(Tw, Tx);
+		    T13 = VADD(Tb, Te);
+		    Tf = VFNMS(LDK(KP500000000), Te, Tb);
+		    Tz = VSUB(Ti, Tj);
+		    Tk = VADD(Ti, Tj);
+		    {
+			 V T1d, T14, Tg, TE, TJ, TB, T15, Tl;
+			 T1d = VSUB(T12, T13);
+			 T14 = VADD(T12, T13);
+			 Tg = VADD(Ta, Tf);
+			 TE = VSUB(Ta, Tf);
+			 TJ = VADD(Tz, TA);
+			 TB = VSUB(Tz, TA);
+			 T15 = VADD(Th, Tk);
+			 Tl = VFNMS(LDK(KP500000000), Tk, Th);
+			 {
+			      V TM, TK, TS, TC, T1c, T17, Tr, TF, TL, T10;
+			      TM = VSUB(TI, TJ);
+			      TK = VADD(TI, TJ);
+			      TS = VFNMS(LDK(KP618033988), Ty, TB);
+			      TC = VFMA(LDK(KP618033988), TB, Ty);
+			      T1c = VSUB(T15, T16);
+			      T17 = VADD(T15, T16);
+			      Tr = VADD(Tl, Tq);
+			      TF = VSUB(Tl, Tq);
+			      TL = VFNMS(LDK(KP250000000), TK, TH);
+			      T10 = VMUL(LDK(KP866025403), VADD(TH, TK));
+			      {
+				   V T1g, T1e, T1a, Tu, Ts, TU, TG, TV, TN, T19, T18, Tt, TZ;
+				   T1g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1c, T1d));
+				   T1e = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1d, T1c));
+				   T18 = VADD(T14, T17);
+				   T1a = VSUB(T14, T17);
+				   Tu = VSUB(Tg, Tr);
+				   Ts = VADD(Tg, Tr);
+				   TU = VFNMS(LDK(KP618033988), TE, TF);
+				   TG = VFMA(LDK(KP618033988), TF, TE);
+				   TV = VFNMS(LDK(KP559016994), TM, TL);
+				   TN = VFMA(LDK(KP559016994), TM, TL);
+				   ST(&(xo[0]), VADD(T11, T18), ovs, &(xo[0]));
+				   T19 = VFNMS(LDK(KP250000000), T18, T11);
+				   Tt = VFNMS(LDK(KP250000000), Ts, T5);
+				   TZ = VADD(T5, Ts);
+				   {
+					V TW, TY, TQ, TO, T1b, T1f, TR, Tv;
+					TW = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), TV, TU));
+					TY = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), TV, TU));
+					TQ = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), TN, TG));
+					TO = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), TN, TG));
+					T1b = VFNMS(LDK(KP559016994), T1a, T19);
+					T1f = VFMA(LDK(KP559016994), T1a, T19);
+					TR = VFNMS(LDK(KP559016994), Tu, Tt);
+					Tv = VFMA(LDK(KP559016994), Tu, Tt);
+					ST(&(xo[WS(os, 10)]), VFMAI(T10, TZ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 5)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
+					{
+					     V TT, TX, TP, TD;
+					     ST(&(xo[WS(os, 12)]), VFNMSI(T1e, T1b), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 3)]), VFMAI(T1e, T1b), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 9)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 6)]), VFMAI(T1g, T1f), ovs, &(xo[0]));
+					     TT = VFNMS(LDK(KP823639103), TS, TR);
+					     TX = VFMA(LDK(KP823639103), TS, TR);
+					     TP = VFMA(LDK(KP823639103), TC, Tv);
+					     TD = VFNMS(LDK(KP823639103), TC, Tv);
+					     ST(&(xo[WS(os, 13)]), VFMAI(TW, TT), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 2)]), VFNMSI(TW, TT), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 8)]), VFMAI(TY, TX), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 7)]), VFNMSI(TY, TX), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 11)]), VFMAI(TQ, TP), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 4)]), VFNMSI(TQ, TP), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 14)]), VFNMSI(TO, TD), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 1)]), VFMAI(TO, TD), ovs, &(xo[WS(os, 1)]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 15, XSIMD_STRING("n1bv_15"), {36, 7, 42, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_15) (planner *p) {
+     X(kdft_register) (p, n1bv_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 15 -name n1bv_15 -include n1b.h */
+
+/*
+ * This function contains 78 FP additions, 25 FP multiplications,
+ * (or, 64 additions, 11 multiplications, 14 fused multiply/add),
+ * 55 stack variables, 10 constants, and 30 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
+     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(30, is), MAKE_VOLATILE_STRIDE(30, os)) {
+	       V Ti, T11, TH, Ts, TL, TM, Tz, TC, TD, TI, T12, T13, T14, T15, T16;
+	       V T17, Tf, Tj, TZ, T10;
+	       {
+		    V TF, Tg, Th, TG;
+		    TF = LD(&(xi[0]), ivs, &(xi[0]));
+		    Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Th = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    TG = VADD(Tg, Th);
+		    Ti = VSUB(Tg, Th);
+		    T11 = VADD(TF, TG);
+		    TH = VFNMS(LDK(KP500000000), TG, TF);
+	       }
+	       {
+		    V Tm, Tn, T3, To, Tw, Tx, Td, Ty, Tp, Tq, T6, Tr, Tt, Tu, Ta;
+		    V Tv, T7, Te;
+		    {
+			 V T1, T2, Tb, Tc;
+			 Tm = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T1 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 To = VFNMS(LDK(KP500000000), Tn, Tm);
+			 Tw = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Tb = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tx = VADD(Tb, Tc);
+			 Td = VSUB(Tb, Tc);
+			 Ty = VFNMS(LDK(KP500000000), Tx, Tw);
+		    }
+		    {
+			 V T4, T5, T8, T9;
+			 Tp = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tq = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 Tr = VFNMS(LDK(KP500000000), Tq, Tp);
+			 Tt = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = VADD(T8, T9);
+			 Ta = VSUB(T8, T9);
+			 Tv = VFNMS(LDK(KP500000000), Tu, Tt);
+		    }
+		    Ts = VSUB(To, Tr);
+		    TL = VSUB(T3, T6);
+		    TM = VSUB(Ta, Td);
+		    Tz = VSUB(Tv, Ty);
+		    TC = VADD(To, Tr);
+		    TD = VADD(Tv, Ty);
+		    TI = VADD(TC, TD);
+		    T12 = VADD(Tm, Tn);
+		    T13 = VADD(Tp, Tq);
+		    T14 = VADD(T12, T13);
+		    T15 = VADD(Tt, Tu);
+		    T16 = VADD(Tw, Tx);
+		    T17 = VADD(T15, T16);
+		    T7 = VADD(T3, T6);
+		    Te = VADD(Ta, Td);
+		    Tf = VMUL(LDK(KP484122918), VSUB(T7, Te));
+		    Tj = VADD(T7, Te);
+	       }
+	       TZ = VADD(TH, TI);
+	       T10 = VBYI(VMUL(LDK(KP866025403), VADD(Ti, Tj)));
+	       ST(&(xo[WS(os, 5)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 10)]), VADD(T10, TZ), ovs, &(xo[0]));
+	       {
+		    V T1a, T18, T19, T1e, T1f, T1c, T1d, T1g, T1b;
+		    T1a = VMUL(LDK(KP559016994), VSUB(T14, T17));
+		    T18 = VADD(T14, T17);
+		    T19 = VFNMS(LDK(KP250000000), T18, T11);
+		    T1c = VSUB(T12, T13);
+		    T1d = VSUB(T15, T16);
+		    T1e = VBYI(VFNMS(LDK(KP951056516), T1d, VMUL(LDK(KP587785252), T1c)));
+		    T1f = VBYI(VFMA(LDK(KP951056516), T1c, VMUL(LDK(KP587785252), T1d)));
+		    ST(&(xo[0]), VADD(T11, T18), ovs, &(xo[0]));
+		    T1g = VADD(T1a, T19);
+		    ST(&(xo[WS(os, 6)]), VADD(T1f, T1g), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 9)]), VSUB(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+		    T1b = VSUB(T19, T1a);
+		    ST(&(xo[WS(os, 3)]), VSUB(T1b, T1e), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 12)]), VADD(T1e, T1b), ovs, &(xo[0]));
+	       }
+	       {
+		    V TA, TN, TU, TS, Tl, TR, TK, TV, Tk, TE, TJ;
+		    TA = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
+		    TN = VFMA(LDK(KP823639103), TL, VMUL(LDK(KP509036960), TM));
+		    TU = VFNMS(LDK(KP823639103), TM, VMUL(LDK(KP509036960), TL));
+		    TS = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
+		    Tk = VFNMS(LDK(KP216506350), Tj, VMUL(LDK(KP866025403), Ti));
+		    Tl = VADD(Tf, Tk);
+		    TR = VSUB(Tf, Tk);
+		    TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
+		    TJ = VFNMS(LDK(KP250000000), TI, TH);
+		    TK = VADD(TE, TJ);
+		    TV = VSUB(TJ, TE);
+		    {
+			 V TB, TO, TX, TY;
+			 TB = VBYI(VADD(Tl, TA));
+			 TO = VSUB(TK, TN);
+			 ST(&(xo[WS(os, 1)]), VADD(TB, TO), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 14)]), VSUB(TO, TB), ovs, &(xo[0]));
+			 TX = VBYI(VSUB(TS, TR));
+			 TY = VSUB(TV, TU);
+			 ST(&(xo[WS(os, 7)]), VADD(TX, TY), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 8)]), VSUB(TY, TX), ovs, &(xo[0]));
+		    }
+		    {
+			 V TP, TQ, TT, TW;
+			 TP = VBYI(VSUB(Tl, TA));
+			 TQ = VADD(TN, TK);
+			 ST(&(xo[WS(os, 4)]), VADD(TP, TQ), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 11)]), VSUB(TQ, TP), ovs, &(xo[WS(os, 1)]));
+			 TT = VBYI(VADD(TR, TS));
+			 TW = VADD(TU, TV);
+			 ST(&(xo[WS(os, 2)]), VADD(TT, TW), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 13)]), VSUB(TW, TT), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 15, XSIMD_STRING("n1bv_15"), {64, 11, 14, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_15) (planner *p) {
+     X(kdft_register) (p, n1bv_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n1bv_16 -include n1b.h */
+
+/*
+ * This function contains 72 FP additions, 34 FP multiplications,
+ * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
+ * 54 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
+	       V T16;
+	       {
+		    V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
+		    {
+			 V T1, T2, T4, T5, To, Tp, Tr, Ts;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tp = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tr = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Ts = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 {
+			      V T8, TI, Tq, TJ, Tt, T9, Tb, Tc, T3, T6;
+			      T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			      TH = VSUB(T1, T2);
+			      T3 = VADD(T1, T2);
+			      TU = VSUB(T4, T5);
+			      T6 = VADD(T4, T5);
+			      TI = VSUB(To, Tp);
+			      Tq = VADD(To, Tp);
+			      TJ = VSUB(Tr, Ts);
+			      Tt = VADD(Tr, Ts);
+			      T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T7 = VSUB(T3, T6);
+			      Tz = VADD(T3, T6);
+			      Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			      TK = VADD(TI, TJ);
+			      TV = VSUB(TI, TJ);
+			      TA = VADD(Tq, Tt);
+			      Tu = VSUB(Tq, Tt);
+			      TM = VSUB(T8, T9);
+			      Ta = VADD(T8, T9);
+			      TN = VSUB(Tb, Tc);
+			      Td = VADD(Tb, Tc);
+			      Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 }
+		    }
+		    TF = VADD(Tz, TA);
+		    TB = VSUB(Tz, TA);
+		    T13 = VFNMS(LDK(KP707106781), TK, TH);
+		    TL = VFMA(LDK(KP707106781), TK, TH);
+		    TO = VFNMS(LDK(KP414213562), TN, TM);
+		    TX = VFMA(LDK(KP414213562), TM, TN);
+		    TC = VADD(Ta, Td);
+		    Te = VSUB(Ta, Td);
+		    TP = VSUB(Tf, Tg);
+		    Th = VADD(Tf, Tg);
+		    TQ = VSUB(Tj, Ti);
+		    Tk = VADD(Ti, Tj);
+		    TW = VFMA(LDK(KP707106781), TV, TU);
+		    T16 = VFNMS(LDK(KP707106781), TV, TU);
+	       }
+	       {
+		    V TY, TR, Tl, TD;
+		    TY = VFMA(LDK(KP414213562), TP, TQ);
+		    TR = VFNMS(LDK(KP414213562), TQ, TP);
+		    Tl = VSUB(Th, Tk);
+		    TD = VADD(Th, Tk);
+		    {
+			 V TS, T17, TZ, T14;
+			 TS = VADD(TO, TR);
+			 T17 = VSUB(TO, TR);
+			 TZ = VSUB(TX, TY);
+			 T14 = VADD(TX, TY);
+			 {
+			      V TE, TG, Tm, Tv;
+			      TE = VSUB(TC, TD);
+			      TG = VADD(TC, TD);
+			      Tm = VADD(Te, Tl);
+			      Tv = VSUB(Te, Tl);
+			      {
+				   V T18, T1a, TT, T11;
+				   T18 = VFMA(LDK(KP923879532), T17, T16);
+				   T1a = VFNMS(LDK(KP923879532), T17, T16);
+				   TT = VFNMS(LDK(KP923879532), TS, TL);
+				   T11 = VFMA(LDK(KP923879532), TS, TL);
+				   {
+					V T15, T19, T10, T12;
+					T15 = VFNMS(LDK(KP923879532), T14, T13);
+					T19 = VFMA(LDK(KP923879532), T14, T13);
+					T10 = VFNMS(LDK(KP923879532), TZ, TW);
+					T12 = VFMA(LDK(KP923879532), TZ, TW);
+					ST(&(xo[0]), VADD(TF, TG), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 8)]), VSUB(TF, TG), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 4)]), VFMAI(TE, TB), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 12)]), VFNMSI(TE, TB), ovs, &(xo[0]));
+					{
+					     V Tw, Ty, Tn, Tx;
+					     Tw = VFNMS(LDK(KP707106781), Tv, Tu);
+					     Ty = VFMA(LDK(KP707106781), Tv, Tu);
+					     Tn = VFNMS(LDK(KP707106781), Tm, T7);
+					     Tx = VFMA(LDK(KP707106781), Tm, T7);
+					     ST(&(xo[WS(os, 3)]), VFNMSI(T1a, T19), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 13)]), VFMAI(T1a, T19), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 11)]), VFNMSI(T18, T15), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 5)]), VFMAI(T18, T15), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 15)]), VFNMSI(T12, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 1)]), VFMAI(T12, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 9)]), VFMAI(T10, TT), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 7)]), VFNMSI(T10, TT), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 2)]), VFMAI(Ty, Tx), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 14)]), VFNMSI(Ty, Tx), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 10)]), VFMAI(Tw, Tn), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 6)]), VFNMSI(Tw, Tn), ovs, &(xo[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n1bv_16"), {38, 0, 34, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_16) (planner *p) {
+     X(kdft_register) (p, n1bv_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n1bv_16 -include n1b.h */
+
+/*
+ * This function contains 72 FP additions, 12 FP multiplications,
+ * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
+ * 30 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V Tp, T13, Tu, TY, Tm, T14, Tv, TU, T7, T16, Tx, TN, Te, T17, Ty;
+	       V TQ;
+	       {
+		    V Tn, To, TX, Ts, Tt, TW;
+		    Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    TX = VADD(Tn, To);
+		    Ts = LD(&(xi[0]), ivs, &(xi[0]));
+		    Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    TW = VADD(Ts, Tt);
+		    Tp = VSUB(Tn, To);
+		    T13 = VADD(TW, TX);
+		    Tu = VSUB(Ts, Tt);
+		    TY = VSUB(TW, TX);
+	       }
+	       {
+		    V Ti, TS, Tl, TT;
+		    {
+			 V Tg, Th, Tj, Tk;
+			 Tg = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Th = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Ti = VSUB(Tg, Th);
+			 TS = VADD(Tg, Th);
+			 Tj = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 TT = VADD(Tj, Tk);
+		    }
+		    Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
+		    T14 = VADD(TS, TT);
+		    Tv = VMUL(LDK(KP707106781), VADD(Ti, Tl));
+		    TU = VSUB(TS, TT);
+	       }
+	       {
+		    V T3, TL, T6, TM;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 TL = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TM = VADD(T4, T5);
+		    }
+		    T7 = VFNMS(LDK(KP382683432), T6, VMUL(LDK(KP923879532), T3));
+		    T16 = VADD(TL, TM);
+		    Tx = VFMA(LDK(KP382683432), T3, VMUL(LDK(KP923879532), T6));
+		    TN = VSUB(TL, TM);
+	       }
+	       {
+		    V Ta, TO, Td, TP;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TO = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TP = VADD(Tb, Tc);
+		    }
+		    Te = VFMA(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), Td));
+		    T17 = VADD(TO, TP);
+		    Ty = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
+		    TQ = VSUB(TO, TP);
+	       }
+	       {
+		    V T15, T18, T19, T1a;
+		    T15 = VSUB(T13, T14);
+		    T18 = VBYI(VSUB(T16, T17));
+		    ST(&(xo[WS(os, 12)]), VSUB(T15, T18), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(T15, T18), ovs, &(xo[0]));
+		    T19 = VADD(T13, T14);
+		    T1a = VADD(T16, T17);
+		    ST(&(xo[WS(os, 8)]), VSUB(T19, T1a), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T19, T1a), ovs, &(xo[0]));
+	       }
+	       {
+		    V TV, T11, T10, T12, TR, TZ;
+		    TR = VMUL(LDK(KP707106781), VSUB(TN, TQ));
+		    TV = VBYI(VSUB(TR, TU));
+		    T11 = VBYI(VADD(TU, TR));
+		    TZ = VMUL(LDK(KP707106781), VADD(TN, TQ));
+		    T10 = VSUB(TY, TZ);
+		    T12 = VADD(TY, TZ);
+		    ST(&(xo[WS(os, 6)]), VADD(TV, T10), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VSUB(T12, T11), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VSUB(T10, TV), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(T11, T12), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tr, TB, TA, TC;
+		    {
+			 V Tf, Tq, Tw, Tz;
+			 Tf = VSUB(T7, Te);
+			 Tq = VSUB(Tm, Tp);
+			 Tr = VBYI(VSUB(Tf, Tq));
+			 TB = VBYI(VADD(Tq, Tf));
+			 Tw = VSUB(Tu, Tv);
+			 Tz = VSUB(Tx, Ty);
+			 TA = VSUB(Tw, Tz);
+			 TC = VADD(Tw, Tz);
+		    }
+		    ST(&(xo[WS(os, 5)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V TF, TJ, TI, TK;
+		    {
+			 V TD, TE, TG, TH;
+			 TD = VADD(Tu, Tv);
+			 TE = VADD(T7, Te);
+			 TF = VADD(TD, TE);
+			 TJ = VSUB(TD, TE);
+			 TG = VADD(Tp, Tm);
+			 TH = VADD(Tx, Ty);
+			 TI = VBYI(VADD(TG, TH));
+			 TK = VBYI(VSUB(TH, TG));
+		    }
+		    ST(&(xo[WS(os, 15)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n1bv_16"), {68, 8, 4, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_16) (planner *p) {
+     X(kdft_register) (p, n1bv_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 2 -name n1bv_2 -include n1b.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       ST(&(xo[0]), VADD(T1, T2), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VSUB(T1, T2), ovs, &(xo[WS(os, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n1bv_2"), {2, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_2) (planner *p) {
+     X(kdft_register) (p, n1bv_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 2 -name n1bv_2 -include n1b.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       ST(&(xo[WS(os, 1)]), VSUB(T1, T2), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(T1, T2), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n1bv_2"), {2, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_2) (planner *p) {
+     X(kdft_register) (p, n1bv_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:14 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include n1b.h */
+
+/*
+ * This function contains 104 FP additions, 50 FP multiplications,
+ * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
+ * 71 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V TS, TA, TN, TV, TK, TU, TR, Tl;
+	       {
+		    V T3, TE, T1r, T13, Ta, TL, Tz, TG, Ts, TF, Th, TM, T1u, T1C, T1n;
+		    V T1a, T1m, T1h, T1x, T1D, Tk, Ti;
+		    {
+			 V T1, T2, TC, TD;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 {
+			      V T14, T6, T1c, Tv, Tm, T1f, Ty, T17, T9, Tn, Tp, T1b, Td, Tq, Te;
+			      V Tf, T15, To;
+			      {
+				   V Tw, Tx, T7, T8, Tb, Tc;
+				   {
+					V T4, T5, Tt, Tu, T11, T12;
+					T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					T3 = VSUB(T1, T2);
+					T11 = VADD(T1, T2);
+					TE = VSUB(TC, TD);
+					T12 = VADD(TC, TD);
+					T14 = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1c = VADD(Tt, Tu);
+					Tv = VSUB(Tt, Tu);
+					Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+					T1r = VADD(T11, T12);
+					T13 = VSUB(T11, T12);
+				   }
+				   Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T1f = VADD(Tw, Tx);
+				   Ty = VSUB(Tw, Tx);
+				   T17 = VADD(T7, T8);
+				   T9 = VSUB(T7, T8);
+				   Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+				   Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   T1b = VADD(Tb, Tc);
+				   Td = VSUB(Tb, Tc);
+				   Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      }
+			      Ta = VADD(T6, T9);
+			      TL = VSUB(T6, T9);
+			      T15 = VADD(Tm, Tn);
+			      To = VSUB(Tm, Tn);
+			      Tz = VSUB(Tv, Ty);
+			      TG = VADD(Tv, Ty);
+			      {
+				   V T1d, T1v, T18, Tr, T1e, Tg, T16, T1s;
+				   T1d = VSUB(T1b, T1c);
+				   T1v = VADD(T1b, T1c);
+				   T18 = VADD(Tp, Tq);
+				   Tr = VSUB(Tp, Tq);
+				   T1e = VADD(Te, Tf);
+				   Tg = VSUB(Te, Tf);
+				   T16 = VSUB(T14, T15);
+				   T1s = VADD(T14, T15);
+				   {
+					V T1t, T19, T1w, T1g;
+					T1t = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					Ts = VSUB(To, Tr);
+					TF = VADD(To, Tr);
+					T1w = VADD(T1e, T1f);
+					T1g = VSUB(T1e, T1f);
+					Th = VADD(Td, Tg);
+					TM = VSUB(Td, Tg);
+					T1u = VADD(T1s, T1t);
+					T1C = VSUB(T1s, T1t);
+					T1n = VSUB(T16, T19);
+					T1a = VADD(T16, T19);
+					T1m = VSUB(T1d, T1g);
+					T1h = VADD(T1d, T1g);
+					T1x = VADD(T1v, T1w);
+					T1D = VSUB(T1v, T1w);
+				   }
+			      }
+			 }
+		    }
+		    Tk = VSUB(Ta, Th);
+		    Ti = VADD(Ta, Th);
+		    {
+			 V TJ, T1k, T1A, TZ, Tj, T1E, T1G, TI, T10, T1j, T1z, T1i, T1y, TH;
+			 TJ = VSUB(TF, TG);
+			 TH = VADD(TF, TG);
+			 T1i = VADD(T1a, T1h);
+			 T1k = VSUB(T1a, T1h);
+			 T1y = VADD(T1u, T1x);
+			 T1A = VSUB(T1u, T1x);
+			 TZ = VADD(T3, Ti);
+			 Tj = VFNMS(LDK(KP250000000), Ti, T3);
+			 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
+			 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
+			 TI = VFNMS(LDK(KP250000000), TH, TE);
+			 T10 = VADD(TE, TH);
+			 T1j = VFNMS(LDK(KP250000000), T1i, T13);
+			 ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
+			 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
+			 ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
+			 {
+			      V T1p, T1l, T1o, T1q, T1F, T1B;
+			      TS = VFNMS(LDK(KP618033988), Ts, Tz);
+			      TA = VFMA(LDK(KP618033988), Tz, Ts);
+			      TN = VFMA(LDK(KP618033988), TM, TL);
+			      TV = VFNMS(LDK(KP618033988), TL, TM);
+			      ST(&(xo[WS(os, 5)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 15)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
+			      T1p = VFMA(LDK(KP559016994), T1k, T1j);
+			      T1l = VFNMS(LDK(KP559016994), T1k, T1j);
+			      T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
+			      T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
+			      T1F = VFNMS(LDK(KP559016994), T1A, T1z);
+			      T1B = VFMA(LDK(KP559016994), T1A, T1z);
+			      ST(&(xo[WS(os, 14)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 6)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 18)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 2)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 16)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 12)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 8)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
+			      TK = VFMA(LDK(KP559016994), TJ, TI);
+			      TU = VFNMS(LDK(KP559016994), TJ, TI);
+			      TR = VFNMS(LDK(KP559016994), Tk, Tj);
+			      Tl = VFMA(LDK(KP559016994), Tk, Tj);
+			 }
+		    }
+	       }
+	       {
+		    V TY, TW, TO, TQ, TB, TP, TX, TT;
+		    TY = VFMA(LDK(KP951056516), TV, TU);
+		    TW = VFNMS(LDK(KP951056516), TV, TU);
+		    TO = VFMA(LDK(KP951056516), TN, TK);
+		    TQ = VFNMS(LDK(KP951056516), TN, TK);
+		    TB = VFNMS(LDK(KP951056516), TA, Tl);
+		    TP = VFMA(LDK(KP951056516), TA, Tl);
+		    TX = VFNMS(LDK(KP951056516), TS, TR);
+		    TT = VFMA(LDK(KP951056516), TS, TR);
+		    ST(&(xo[WS(os, 9)]), VFMAI(TQ, TP), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VFNMSI(TQ, TP), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VFMAI(TO, TB), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 19)]), VFNMSI(TO, TB), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 17)]), VFMAI(TW, TT), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFNMSI(TW, TT), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VFMAI(TY, TX), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VFNMSI(TY, TX), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_20) (planner *p) {
+     X(kdft_register) (p, n1bv_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include n1b.h */
+
+/*
+ * This function contains 104 FP additions, 24 FP multiplications,
+ * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
+ * 53 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
+	       V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI, TZ, T10;
+	       {
+		    V T1, T2, T1g, TF, TG, T1h;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T1g = VADD(T1, T2);
+		    TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+		    T1h = VADD(TF, TG);
+		    T3 = VSUB(T1, T2);
+		    T1y = VADD(T1g, T1h);
+		    TH = VSUB(TF, TG);
+		    T1i = VSUB(T1g, T1h);
+	       }
+	       {
+		    V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
+		    V T1b;
+		    {
+			 V T4, T5, Tt, Tu;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T11 = VADD(T4, T5);
+			 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tv = VSUB(Tt, Tu);
+			 T19 = VADD(Tt, Tu);
+		    }
+		    {
+			 V Tw, Tx, T7, T8;
+			 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Ty = VSUB(Tw, Tx);
+			 T1c = VADD(Tw, Tx);
+			 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T14 = VADD(T7, T8);
+		    }
+		    {
+			 V Tb, Tc, Tm, Tn;
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Td = VSUB(Tb, Tc);
+			 T18 = VADD(Tb, Tc);
+			 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 To = VSUB(Tm, Tn);
+			 T12 = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tp, Tq, Te, Tf;
+			 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tr = VSUB(Tp, Tq);
+			 T15 = VADD(Tp, Tq);
+			 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tg = VSUB(Te, Tf);
+			 T1b = VADD(Te, Tf);
+		    }
+		    Ts = VSUB(To, Tr);
+		    TL = VSUB(T6, T9);
+		    TM = VSUB(Td, Tg);
+		    Tz = VSUB(Tv, Ty);
+		    T13 = VSUB(T11, T12);
+		    T16 = VSUB(T14, T15);
+		    T1j = VADD(T13, T16);
+		    T1u = VADD(T18, T19);
+		    T1v = VADD(T1b, T1c);
+		    T1w = VADD(T1u, T1v);
+		    T1r = VADD(T11, T12);
+		    T1s = VADD(T14, T15);
+		    T1t = VADD(T1r, T1s);
+		    T1a = VSUB(T18, T19);
+		    T1d = VSUB(T1b, T1c);
+		    T1k = VADD(T1a, T1d);
+		    {
+			 V Ta, Th, TC, TD;
+			 Ta = VADD(T6, T9);
+			 Th = VADD(Td, Tg);
+			 Ti = VADD(Ta, Th);
+			 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
+			 TC = VADD(To, Tr);
+			 TD = VADD(Tv, Ty);
+			 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
+			 TI = VADD(TC, TD);
+		    }
+	       }
+	       TZ = VADD(T3, Ti);
+	       T10 = VBYI(VADD(TH, TI));
+	       ST(&(xo[WS(os, 15)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 5)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B;
+		    T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
+		    T1z = VADD(T1t, T1w);
+		    T1A = VFNMS(LDK(KP250000000), T1z, T1y);
+		    T1C = VSUB(T1r, T1s);
+		    T1D = VSUB(T1u, T1v);
+		    T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
+		    T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
+		    ST(&(xo[0]), VADD(T1y, T1z), ovs, &(xo[0]));
+		    T1F = VSUB(T1A, T1x);
+		    ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
+		    T1B = VADD(T1x, T1A);
+		    ST(&(xo[WS(os, 4)]), VSUB(T1B, T1E), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 16)]), VADD(T1E, T1B), ovs, &(xo[0]));
+	       }
+	       {
+		    V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1o;
+		    T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
+		    T1l = VADD(T1j, T1k);
+		    T1m = VFNMS(LDK(KP250000000), T1l, T1i);
+		    T17 = VSUB(T13, T16);
+		    T1e = VSUB(T1a, T1d);
+		    T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
+		    T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
+		    ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
+		    T1q = VADD(T1n, T1m);
+		    ST(&(xo[WS(os, 6)]), VADD(T1p, T1q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VSUB(T1q, T1p), ovs, &(xo[0]));
+		    T1o = VSUB(T1m, T1n);
+		    ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
+	       }
+	       {
+		    V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
+		    TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
+		    TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
+		    TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
+		    TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
+		    TJ = VFNMS(LDK(KP250000000), TI, TH);
+		    TK = VSUB(TE, TJ);
+		    TV = VADD(TE, TJ);
+		    Tj = VFNMS(LDK(KP250000000), Ti, T3);
+		    Tl = VSUB(Tj, Tk);
+		    TR = VADD(Tk, Tj);
+		    {
+			 V TB, TO, TX, TY;
+			 TB = VSUB(Tl, TA);
+			 TO = VBYI(VSUB(TK, TN));
+			 ST(&(xo[WS(os, 17)]), VSUB(TB, TO), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 3)]), VADD(TB, TO), ovs, &(xo[WS(os, 1)]));
+			 TX = VADD(TR, TS);
+			 TY = VBYI(VSUB(TV, TU));
+			 ST(&(xo[WS(os, 11)]), VSUB(TX, TY), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VADD(TX, TY), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V TP, TQ, TT, TW;
+			 TP = VADD(Tl, TA);
+			 TQ = VBYI(VADD(TN, TK));
+			 ST(&(xo[WS(os, 13)]), VSUB(TP, TQ), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(TP, TQ), ovs, &(xo[WS(os, 1)]));
+			 TT = VSUB(TR, TS);
+			 TW = VBYI(VADD(TU, TV));
+			 ST(&(xo[WS(os, 19)]), VSUB(TT, TW), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(TT, TW), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_20) (planner *p) {
+     X(kdft_register) (p, n1bv_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,798 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:14 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 25 -name n1bv_25 -include n1b.h */
+
+/*
+ * This function contains 224 FP additions, 193 FP multiplications,
+ * (or, 43 additions, 12 multiplications, 181 fused multiply/add),
+ * 215 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(50, is), MAKE_VOLATILE_STRIDE(50, os)) {
+	       V T1g, T1k, T1I, T24, T2a, T1G, T1A, T1l, T1B, T1H, T1d;
+	       {
+		    V T2z, T1q, Ta, T9, T3n, Ty, Tl, T2O, T2W, T2l, T2s, TV, T1i, T1K, T1S;
+		    V T3z, T3t, Tk, T3o, Tp, T2g, T2N, T2V, T2o, T2t, T1a, T1j, T1J, T1R, Tz;
+		    V Tt, TA, Tw;
+		    {
+			 V T1, T5, T6, T2, T3;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T6 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 {
+			      V TH, TW, TK, TS, T10, T8, TN, TT, T17, TZ, T11;
+			      TH = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      TW = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      {
+				   V TI, TJ, TL, T7, T1p, T4, T1o, TM, TX, TY;
+				   TI = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+				   TJ = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   TL = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   T7 = VADD(T5, T6);
+				   T1p = VSUB(T5, T6);
+				   T4 = VADD(T2, T3);
+				   T1o = VSUB(T2, T3);
+				   TM = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TX = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   TK = VADD(TI, TJ);
+				   TS = VSUB(TI, TJ);
+				   TY = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+				   T10 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   T2z = VFNMS(LDK(KP618033988), T1o, T1p);
+				   T1q = VFMA(LDK(KP618033988), T1p, T1o);
+				   Ta = VSUB(T4, T7);
+				   T8 = VADD(T4, T7);
+				   TN = VADD(TL, TM);
+				   TT = VSUB(TM, TL);
+				   T17 = VSUB(TX, TY);
+				   TZ = VADD(TX, TY);
+				   T11 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			      }
+			      {
+				   V Tc, T2m, T19, Tn, To, Tr, Tj, T16, T2n, Ts, Tu, Tv;
+				   {
+					V TU, T2j, TO, TQ, T12, T18;
+					Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					T9 = VFNMS(LDK(KP250000000), T8, T1);
+					T3n = VADD(T1, T8);
+					TU = VFNMS(LDK(KP618033988), TT, TS);
+					T2j = VFMA(LDK(KP618033988), TS, TT);
+					TO = VADD(TK, TN);
+					TQ = VSUB(TN, TK);
+					T12 = VADD(T10, T11);
+					T18 = VSUB(T10, T11);
+					Ty = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					{
+					     V T3r, T15, T13, Tf, Ti, T2k, TR, TP, T3s, T14;
+					     {
+						  V Td, Te, Tg, Th;
+						  Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+						  Te = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						  Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						  Th = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+						  TP = VFNMS(LDK(KP250000000), TO, TH);
+						  T3r = VADD(TH, TO);
+						  T2m = VFNMS(LDK(KP618033988), T17, T18);
+						  T19 = VFMA(LDK(KP618033988), T18, T17);
+						  T15 = VSUB(T12, TZ);
+						  T13 = VADD(TZ, T12);
+						  Tf = VADD(Td, Te);
+						  Tn = VSUB(Td, Te);
+						  To = VSUB(Th, Tg);
+						  Ti = VADD(Tg, Th);
+					     }
+					     T2k = VFMA(LDK(KP559016994), TQ, TP);
+					     TR = VFNMS(LDK(KP559016994), TQ, TP);
+					     Tr = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+					     T3s = VADD(TW, T13);
+					     T14 = VFNMS(LDK(KP250000000), T13, TW);
+					     Tj = VADD(Tf, Ti);
+					     Tl = VSUB(Tf, Ti);
+					     T2O = VFNMS(LDK(KP667278218), T2k, T2j);
+					     T2W = VFMA(LDK(KP603558818), T2j, T2k);
+					     T2l = VFMA(LDK(KP066152395), T2k, T2j);
+					     T2s = VFNMS(LDK(KP059835404), T2j, T2k);
+					     TV = VFNMS(LDK(KP522847744), TU, TR);
+					     T1i = VFMA(LDK(KP578046249), TR, TU);
+					     T1K = VFNMS(LDK(KP494780565), TR, TU);
+					     T1S = VFMA(LDK(KP447533225), TU, TR);
+					     T16 = VFNMS(LDK(KP559016994), T15, T14);
+					     T2n = VFMA(LDK(KP559016994), T15, T14);
+					     T3z = VSUB(T3r, T3s);
+					     T3t = VADD(T3r, T3s);
+					     Ts = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+					     Tu = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					     Tv = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					}
+				   }
+				   Tk = VFNMS(LDK(KP250000000), Tj, Tc);
+				   T3o = VADD(Tc, Tj);
+				   Tp = VFNMS(LDK(KP618033988), To, Tn);
+				   T2g = VFMA(LDK(KP618033988), Tn, To);
+				   T2N = VFMA(LDK(KP066152395), T2n, T2m);
+				   T2V = VFNMS(LDK(KP059835404), T2m, T2n);
+				   T2o = VFMA(LDK(KP869845200), T2n, T2m);
+				   T2t = VFNMS(LDK(KP786782374), T2m, T2n);
+				   T1a = VFNMS(LDK(KP893101515), T19, T16);
+				   T1j = VFMA(LDK(KP987388751), T16, T19);
+				   T1J = VFNMS(LDK(KP120146378), T19, T16);
+				   T1R = VFMA(LDK(KP132830569), T16, T19);
+				   Tz = VADD(Ts, Tr);
+				   Tt = VSUB(Tr, Ts);
+				   TA = VADD(Tv, Tu);
+				   Tw = VSUB(Tu, Tv);
+			      }
+			 }
+		    }
+		    {
+			 V T2p, T2I, T2u, T2C, Tx, T2d, T2X, T34, T2P, T3b, T2b, Tb, T2Q, T2Z, T2h;
+			 V T2w, Tq, T1e, T1M, T1U, TE, T2c, T3q, T3y;
+			 T2p = VFNMS(LDK(KP772036680), T2o, T2l);
+			 T2I = VFMA(LDK(KP772036680), T2o, T2l);
+			 T2u = VFMA(LDK(KP772036680), T2t, T2s);
+			 T2C = VFNMS(LDK(KP772036680), T2t, T2s);
+			 {
+			      V TD, TB, Tm, T2f, T3p, TC;
+			      Tx = VFMA(LDK(KP618033988), Tw, Tt);
+			      T2d = VFNMS(LDK(KP618033988), Tt, Tw);
+			      TD = VSUB(Tz, TA);
+			      TB = VADD(Tz, TA);
+			      Tm = VFMA(LDK(KP559016994), Tl, Tk);
+			      T2f = VFNMS(LDK(KP559016994), Tl, Tk);
+			      T2X = VFMA(LDK(KP845997307), T2W, T2V);
+			      T34 = VFNMS(LDK(KP845997307), T2W, T2V);
+			      T2P = VFNMS(LDK(KP845997307), T2O, T2N);
+			      T3b = VFMA(LDK(KP845997307), T2O, T2N);
+			      T2b = VFNMS(LDK(KP559016994), Ta, T9);
+			      Tb = VFMA(LDK(KP559016994), Ta, T9);
+			      T3p = VADD(Ty, TB);
+			      TC = VFMS(LDK(KP250000000), TB, Ty);
+			      T2Q = VFNMS(LDK(KP522847744), T2g, T2f);
+			      T2Z = VFMA(LDK(KP578046249), T2f, T2g);
+			      T2h = VFMA(LDK(KP893101515), T2g, T2f);
+			      T2w = VFNMS(LDK(KP987388751), T2f, T2g);
+			      Tq = VFNMS(LDK(KP244189809), Tp, Tm);
+			      T1e = VFMA(LDK(KP269969613), Tm, Tp);
+			      T1M = VFMA(LDK(KP667278218), Tm, Tp);
+			      T1U = VFNMS(LDK(KP603558818), Tp, Tm);
+			      TE = VFNMS(LDK(KP559016994), TD, TC);
+			      T2c = VFMA(LDK(KP559016994), TD, TC);
+			      T3q = VADD(T3o, T3p);
+			      T3y = VSUB(T3o, T3p);
+			 }
+			 {
+			      V T1Z, T25, T1P, T22, T1X, TG, T1b, T28, T1t, T1y, T1x, T1E, T1Q, T1Y;
+			      {
+				   V T26, T1L, T1T, TF, T1f, T1W, T3m, T3g, T2M, T2G, T39, T3j, T21, T1O, T20;
+				   V T27;
+				   T26 = VFMA(LDK(KP867381224), T1K, T1J);
+				   T1L = VFNMS(LDK(KP867381224), T1K, T1J);
+				   T20 = VFNMS(LDK(KP958953096), T1S, T1R);
+				   T1T = VFMA(LDK(KP958953096), T1S, T1R);
+				   {
+					V T2R, T2Y, T2e, T2v, T1N, T1V;
+					T2R = VFNMS(LDK(KP494780565), T2c, T2d);
+					T2Y = VFMA(LDK(KP447533225), T2d, T2c);
+					T2e = VFMA(LDK(KP120146378), T2d, T2c);
+					T2v = VFNMS(LDK(KP132830569), T2c, T2d);
+					TF = VFNMS(LDK(KP667278218), TE, Tx);
+					T1f = VFMA(LDK(KP603558818), Tx, TE);
+					T1N = VFMA(LDK(KP869845200), TE, Tx);
+					T1V = VFNMS(LDK(KP786782374), Tx, TE);
+					{
+					     V T3A, T3C, T3w, T3u;
+					     T3A = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3z, T3y));
+					     T3C = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3y, T3z));
+					     T3w = VSUB(T3q, T3t);
+					     T3u = VADD(T3q, T3t);
+					     {
+						  V T2B, T2x, T2H, T2i;
+						  T2B = VFMA(LDK(KP734762448), T2w, T2v);
+						  T2x = VFNMS(LDK(KP734762448), T2w, T2v);
+						  T2H = VFNMS(LDK(KP734762448), T2h, T2e);
+						  T2i = VFMA(LDK(KP734762448), T2h, T2e);
+						  {
+						       V T30, T35, T3c, T2S, T3v;
+						       T30 = VFNMS(LDK(KP921078979), T2Z, T2Y);
+						       T35 = VFMA(LDK(KP921078979), T2Z, T2Y);
+						       T3c = VFMA(LDK(KP982009705), T2R, T2Q);
+						       T2S = VFNMS(LDK(KP982009705), T2R, T2Q);
+						       T1W = VFMA(LDK(KP912575812), T1V, T1U);
+						       T1Z = VFNMS(LDK(KP912575812), T1V, T1U);
+						       T1O = VFMA(LDK(KP912575812), T1N, T1M);
+						       T25 = VFNMS(LDK(KP912575812), T1N, T1M);
+						       ST(&(xo[0]), VADD(T3u, T3n), ovs, &(xo[0]));
+						       T3v = VFNMS(LDK(KP250000000), T3u, T3n);
+						       {
+							    V T2y, T2J, T2q, T2D;
+							    T2y = VFMA(LDK(KP945422727), T2x, T2u);
+							    T2J = VFMA(LDK(KP522616830), T2x, T2I);
+							    T2q = VFMA(LDK(KP956723877), T2p, T2i);
+							    T2D = VFNMS(LDK(KP522616830), T2i, T2C);
+							    {
+								 V T3e, T31, T36, T2T;
+								 T3e = VFMA(LDK(KP906616052), T30, T2X);
+								 T31 = VFNMS(LDK(KP906616052), T30, T2X);
+								 T36 = VFNMS(LDK(KP923225144), T2S, T2P);
+								 T2T = VFMA(LDK(KP923225144), T2S, T2P);
+								 {
+								      V T3k, T3d, T3x, T3B;
+								      T3k = VFNMS(LDK(KP669429328), T3b, T3c);
+								      T3d = VFMA(LDK(KP570584518), T3c, T3b);
+								      T3x = VFMA(LDK(KP559016994), T3w, T3v);
+								      T3B = VFNMS(LDK(KP559016994), T3w, T3v);
+								      {
+									   V T2A, T2K, T2r, T2E;
+									   T2A = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2z, T2y));
+									   T2K = VFNMS(LDK(KP690983005), T2J, T2u);
+									   T2r = VFMA(LDK(KP992114701), T2q, T2b);
+									   T2E = VFMA(LDK(KP763932022), T2D, T2p);
+									   {
+										V T32, T3a, T37, T3h;
+										T32 = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2z, T31));
+										T3a = VFMA(LDK(KP262346850), T31, T2z);
+										T37 = VFNMS(LDK(KP997675361), T36, T35);
+										T3h = VFNMS(LDK(KP904508497), T36, T34);
+										{
+										     V T2U, T33, T3l, T3f;
+										     T2U = VFMA(LDK(KP949179823), T2T, T2b);
+										     T33 = VFNMS(LDK(KP237294955), T2T, T2b);
+										     T3l = VFNMS(LDK(KP669429328), T3e, T3k);
+										     T3f = VFMA(LDK(KP618033988), T3e, T3d);
+										     ST(&(xo[WS(os, 20)]), VFNMSI(T3A, T3x), ovs, &(xo[0]));
+										     ST(&(xo[WS(os, 5)]), VFMAI(T3A, T3x), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 15)]), VFMAI(T3C, T3B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 10)]), VFNMSI(T3C, T3B), ovs, &(xo[0]));
+										     {
+											  V T2L, T2F, T38, T3i;
+											  T2L = VFMA(LDK(KP855719849), T2K, T2H);
+											  ST(&(xo[WS(os, 3)]), VFMAI(T2A, T2r), ovs, &(xo[WS(os, 1)]));
+											  ST(&(xo[WS(os, 22)]), VFNMSI(T2A, T2r), ovs, &(xo[0]));
+											  T2F = VFNMS(LDK(KP855719849), T2E, T2B);
+											  T38 = VFMA(LDK(KP560319534), T37, T34);
+											  T3i = VFNMS(LDK(KP681693190), T3h, T35);
+											  ST(&(xo[WS(os, 2)]), VFMAI(T32, T2U), ovs, &(xo[0]));
+											  ST(&(xo[WS(os, 23)]), VFNMSI(T32, T2U), ovs, &(xo[WS(os, 1)]));
+											  T3m = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3l, T3a));
+											  T3g = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3f, T3a));
+											  T2M = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2L, T2z));
+											  T2G = VFMA(LDK(KP897376177), T2F, T2b);
+											  T39 = VFNMS(LDK(KP949179823), T38, T33);
+											  T3j = VFNMS(LDK(KP860541664), T3i, T33);
+											  T21 = VFMA(LDK(KP447417479), T1O, T20);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   T1P = VFNMS(LDK(KP809385824), T1O, T1L);
+				   ST(&(xo[WS(os, 17)]), VFNMSI(T2M, T2G), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 8)]), VFMAI(T2M, T2G), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 13)]), VFMAI(T3g, T39), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 12)]), VFNMSI(T3g, T39), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 18)]), VFMAI(T3m, T3j), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 7)]), VFNMSI(T3m, T3j), ovs, &(xo[WS(os, 1)]));
+				   T22 = VFMA(LDK(KP690983005), T21, T1L);
+				   T27 = VFMA(LDK(KP447417479), T1W, T26);
+				   T1X = VFMA(LDK(KP894834959), T1W, T1T);
+				   {
+					V T1r, T1s, T1v, T1w;
+					T1r = VFNMS(LDK(KP916574801), T1f, T1e);
+					T1g = VFMA(LDK(KP916574801), T1f, T1e);
+					T1k = VFNMS(LDK(KP831864738), T1j, T1i);
+					T1s = VFMA(LDK(KP831864738), T1j, T1i);
+					T1v = VFNMS(LDK(KP829049696), TF, Tq);
+					TG = VFMA(LDK(KP829049696), TF, Tq);
+					T1b = VFMA(LDK(KP831864738), T1a, TV);
+					T1w = VFNMS(LDK(KP831864738), T1a, TV);
+					T28 = VFNMS(LDK(KP763932022), T27, T1T);
+					T1t = VFMA(LDK(KP904730450), T1s, T1r);
+					T1y = VFNMS(LDK(KP904730450), T1s, T1r);
+					T1x = VFMA(LDK(KP559154169), T1w, T1v);
+					T1E = VFNMS(LDK(KP683113946), T1v, T1w);
+				   }
+			      }
+			      T1Q = VFNMS(LDK(KP992114701), T1P, Tb);
+			      T1Y = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T1X, T1q));
+			      {
+				   V T1u, T1F, T1z, T1h, T1c, T23, T29;
+				   T23 = VFNMS(LDK(KP999544308), T22, T1Z);
+				   T29 = VFNMS(LDK(KP999544308), T28, T25);
+				   T1I = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1t, T1q));
+				   T1u = VFNMS(LDK(KP242145790), T1t, T1q);
+				   T1F = VFMA(LDK(KP617882369), T1y, T1E);
+				   T1z = VFMA(LDK(KP559016994), T1y, T1x);
+				   T1h = VFNMS(LDK(KP904730450), T1b, TG);
+				   T1c = VFMA(LDK(KP904730450), T1b, TG);
+				   ST(&(xo[WS(os, 21)]), VFMAI(T1Y, T1Q), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 4)]), VFNMSI(T1Y, T1Q), ovs, &(xo[0]));
+				   T24 = VFNMS(LDK(KP803003575), T23, Tb);
+				   T2a = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T29, T1q));
+				   T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T1F, T1u));
+				   T1A = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1z, T1u));
+				   T1l = VFNMS(LDK(KP904730450), T1k, T1h);
+				   T1B = VADD(T1g, T1h);
+				   T1H = VFMA(LDK(KP968583161), T1c, Tb);
+				   T1d = VFNMS(LDK(KP242145790), T1c, Tb);
+			      }
+			 }
+		    }
+	       }
+	       ST(&(xo[WS(os, 16)]), VFMAI(T2a, T24), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 9)]), VFNMSI(T2a, T24), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V T1m, T1C, T1n, T1D;
+		    T1m = VFNMS(LDK(KP618033988), T1l, T1g);
+		    T1C = VFNMS(LDK(KP683113946), T1B, T1k);
+		    ST(&(xo[WS(os, 24)]), VFNMSI(T1I, T1H), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    T1n = VFNMS(LDK(KP876091699), T1m, T1d);
+		    T1D = VFMA(LDK(KP792626838), T1C, T1d);
+		    ST(&(xo[WS(os, 19)]), VFNMSI(T1A, T1n), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VFMAI(T1A, T1n), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VFNMSI(T1G, T1D), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 11)]), VFMAI(T1G, T1D), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 25, XSIMD_STRING("n1bv_25"), {43, 12, 181, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_25) (planner *p) {
+     X(kdft_register) (p, n1bv_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 25 -name n1bv_25 -include n1b.h */
+
+/*
+ * This function contains 224 FP additions, 140 FP multiplications,
+ * (or, 147 additions, 63 multiplications, 77 fused multiply/add),
+ * 115 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(50, is), MAKE_VOLATILE_STRIDE(50, os)) {
+	       V T1b, T2o, T1v, T1e, T2W, T2P, T2Q, T2U, T11, T27, TY, T26, T12, T2f, T1j;
+	       V T28, TM, T24, TJ, T23, TN, T2e, T1i, T25, T2M, T2N, T2T, Tm, T1W, Tt;
+	       V T1X, Tu, T20, Tw, T1Y, T7, T1U, Te, T1T, Tf, T21, Tx, T1V;
+	       {
+		    V T1c, T1a, T1t, T17, T1r;
+		    T1c = LD(&(xi[0]), ivs, &(xi[0]));
+		    {
+			 V T18, T19, T15, T16;
+			 T18 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T19 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T1a = VADD(T18, T19);
+			 T1t = VSUB(T18, T19);
+			 T15 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T16 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T17 = VADD(T15, T16);
+			 T1r = VSUB(T15, T16);
+		    }
+		    {
+			 V T2n, T1s, T1u, T1d;
+			 T1b = VMUL(LDK(KP559016994), VSUB(T17, T1a));
+			 T2n = VMUL(LDK(KP587785252), T1r);
+			 T2o = VFNMS(LDK(KP951056516), T1t, T2n);
+			 T1s = VMUL(LDK(KP951056516), T1r);
+			 T1u = VMUL(LDK(KP587785252), T1t);
+			 T1v = VADD(T1s, T1u);
+			 T1d = VADD(T17, T1a);
+			 T1e = VFNMS(LDK(KP250000000), T1d, T1c);
+			 T2W = VADD(T1c, T1d);
+		    }
+	       }
+	       {
+		    V TG, TV, TF, TL, TH, TK, TU, T10, TW, TZ, TX, TI;
+		    TG = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    TV = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V Tz, TA, TB, TC, TD, TE;
+			 Tz = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TA = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 TB = VADD(Tz, TA);
+			 TC = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 TD = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 TE = VADD(TC, TD);
+			 TF = VMUL(LDK(KP559016994), VSUB(TB, TE));
+			 TL = VSUB(TC, TD);
+			 TH = VADD(TB, TE);
+			 TK = VSUB(Tz, TA);
+		    }
+		    {
+			 V TO, TP, TQ, TR, TS, TT;
+			 TO = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 TP = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TQ = VADD(TO, TP);
+			 TR = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 TS = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 TT = VADD(TR, TS);
+			 TU = VMUL(LDK(KP559016994), VSUB(TQ, TT));
+			 T10 = VSUB(TR, TS);
+			 TW = VADD(TQ, TT);
+			 TZ = VSUB(TO, TP);
+		    }
+		    T2P = VADD(TG, TH);
+		    T2Q = VADD(TV, TW);
+		    T2U = VADD(T2P, T2Q);
+		    T11 = VFMA(LDK(KP475528258), TZ, VMUL(LDK(KP293892626), T10));
+		    T27 = VFNMS(LDK(KP475528258), T10, VMUL(LDK(KP293892626), TZ));
+		    TX = VFNMS(LDK(KP250000000), TW, TV);
+		    TY = VADD(TU, TX);
+		    T26 = VSUB(TX, TU);
+		    T12 = VFNMS(LDK(KP1_369094211), T11, VMUL(LDK(KP728968627), TY));
+		    T2f = VFMA(LDK(KP125581039), T27, VMUL(LDK(KP998026728), T26));
+		    T1j = VFMA(LDK(KP1_457937254), T11, VMUL(LDK(KP684547105), TY));
+		    T28 = VFNMS(LDK(KP1_996053456), T27, VMUL(LDK(KP062790519), T26));
+		    TM = VFMA(LDK(KP475528258), TK, VMUL(LDK(KP293892626), TL));
+		    T24 = VFNMS(LDK(KP475528258), TL, VMUL(LDK(KP293892626), TK));
+		    TI = VFNMS(LDK(KP250000000), TH, TG);
+		    TJ = VADD(TF, TI);
+		    T23 = VSUB(TI, TF);
+		    TN = VFNMS(LDK(KP963507348), TM, VMUL(LDK(KP876306680), TJ));
+		    T2e = VFMA(LDK(KP1_071653589), T24, VMUL(LDK(KP844327925), T23));
+		    T1i = VFMA(LDK(KP1_752613360), TM, VMUL(LDK(KP481753674), TJ));
+		    T25 = VFNMS(LDK(KP1_688655851), T24, VMUL(LDK(KP535826794), T23));
+	       }
+	       {
+		    V Tb, Tq, T3, Tc, T6, Ta, Ti, Tr, Tl, Tp, Ts, Td;
+		    Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V T1, T2, T8, T4, T5, T9;
+			 T1 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T8 = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VADD(T4, T5);
+			 T3 = VSUB(T1, T2);
+			 Tc = VADD(T8, T9);
+			 T6 = VSUB(T4, T5);
+			 Ta = VMUL(LDK(KP559016994), VSUB(T8, T9));
+		    }
+		    {
+			 V Tg, Th, Tn, Tj, Tk, To;
+			 Tg = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Th = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = VADD(Tg, Th);
+			 Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 To = VADD(Tj, Tk);
+			 Ti = VSUB(Tg, Th);
+			 Tr = VADD(Tn, To);
+			 Tl = VSUB(Tj, Tk);
+			 Tp = VMUL(LDK(KP559016994), VSUB(Tn, To));
+		    }
+		    T2M = VADD(Tq, Tr);
+		    T2N = VADD(Tb, Tc);
+		    T2T = VADD(T2M, T2N);
+		    Tm = VFMA(LDK(KP475528258), Ti, VMUL(LDK(KP293892626), Tl));
+		    T1W = VFNMS(LDK(KP475528258), Tl, VMUL(LDK(KP293892626), Ti));
+		    Ts = VFNMS(LDK(KP250000000), Tr, Tq);
+		    Tt = VADD(Tp, Ts);
+		    T1X = VSUB(Ts, Tp);
+		    Tu = VFMA(LDK(KP1_937166322), Tm, VMUL(LDK(KP248689887), Tt));
+		    T20 = VFNMS(LDK(KP963507348), T1W, VMUL(LDK(KP876306680), T1X));
+		    Tw = VFNMS(LDK(KP497379774), Tm, VMUL(LDK(KP968583161), Tt));
+		    T1Y = VFMA(LDK(KP1_752613360), T1W, VMUL(LDK(KP481753674), T1X));
+		    T7 = VFMA(LDK(KP475528258), T3, VMUL(LDK(KP293892626), T6));
+		    T1U = VFNMS(LDK(KP475528258), T6, VMUL(LDK(KP293892626), T3));
+		    Td = VFNMS(LDK(KP250000000), Tc, Tb);
+		    Te = VADD(Ta, Td);
+		    T1T = VSUB(Td, Ta);
+		    Tf = VFMA(LDK(KP1_071653589), T7, VMUL(LDK(KP844327925), Te));
+		    T21 = VFMA(LDK(KP1_809654104), T1U, VMUL(LDK(KP425779291), T1T));
+		    Tx = VFNMS(LDK(KP1_688655851), T7, VMUL(LDK(KP535826794), Te));
+		    T1V = VFNMS(LDK(KP851558583), T1U, VMUL(LDK(KP904827052), T1T));
+	       }
+	       {
+		    V T2V, T2X, T2Y, T2S, T30, T2O, T2R, T31, T2Z;
+		    T2V = VMUL(LDK(KP559016994), VSUB(T2T, T2U));
+		    T2X = VADD(T2T, T2U);
+		    T2Y = VFNMS(LDK(KP250000000), T2X, T2W);
+		    T2O = VSUB(T2M, T2N);
+		    T2R = VSUB(T2P, T2Q);
+		    T2S = VBYI(VFMA(LDK(KP951056516), T2O, VMUL(LDK(KP587785252), T2R)));
+		    T30 = VBYI(VFNMS(LDK(KP951056516), T2R, VMUL(LDK(KP587785252), T2O)));
+		    ST(&(xo[0]), VADD(T2W, T2X), ovs, &(xo[0]));
+		    T31 = VSUB(T2Y, T2V);
+		    ST(&(xo[WS(os, 10)]), VADD(T30, T31), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 15)]), VSUB(T31, T30), ovs, &(xo[WS(os, 1)]));
+		    T2Z = VADD(T2V, T2Y);
+		    ST(&(xo[WS(os, 5)]), VADD(T2S, T2Z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 20)]), VSUB(T2Z, T2S), ovs, &(xo[0]));
+	       }
+	       {
+		    V T1Z, T2i, T2j, T2g, T2w, T2x, T2y, T2G, T2H, T2I, T2D, T2E, T2F, T2z, T2A;
+		    V T2B, T2p, T2m, T2q, T2b, T2c, T2a, T2d, T2h, T2r;
+		    T1Z = VSUB(T1V, T1Y);
+		    T2i = VADD(T20, T21);
+		    T2j = VSUB(T25, T28);
+		    T2g = VSUB(T2e, T2f);
+		    T2w = VFMA(LDK(KP1_369094211), T1W, VMUL(LDK(KP728968627), T1X));
+		    T2x = VFNMS(LDK(KP992114701), T1T, VMUL(LDK(KP250666467), T1U));
+		    T2y = VADD(T2w, T2x);
+		    T2G = VFNMS(LDK(KP125581039), T24, VMUL(LDK(KP998026728), T23));
+		    T2H = VFMA(LDK(KP1_274847979), T27, VMUL(LDK(KP770513242), T26));
+		    T2I = VADD(T2G, T2H);
+		    T2D = VFNMS(LDK(KP1_457937254), T1W, VMUL(LDK(KP684547105), T1X));
+		    T2E = VFMA(LDK(KP1_984229402), T1U, VMUL(LDK(KP125333233), T1T));
+		    T2F = VADD(T2D, T2E);
+		    T2z = VFMA(LDK(KP1_996053456), T24, VMUL(LDK(KP062790519), T23));
+		    T2A = VFNMS(LDK(KP637423989), T26, VMUL(LDK(KP1_541026485), T27));
+		    T2B = VADD(T2z, T2A);
+		    {
+			 V T2k, T2l, T22, T29;
+			 T2k = VADD(T1Y, T1V);
+			 T2l = VADD(T2e, T2f);
+			 T2p = VADD(T2k, T2l);
+			 T2m = VMUL(LDK(KP559016994), VSUB(T2k, T2l));
+			 T2q = VFNMS(LDK(KP250000000), T2p, T2o);
+			 T2b = VSUB(T1e, T1b);
+			 T22 = VSUB(T20, T21);
+			 T29 = VADD(T25, T28);
+			 T2c = VADD(T22, T29);
+			 T2a = VMUL(LDK(KP559016994), VSUB(T22, T29));
+			 T2d = VFNMS(LDK(KP250000000), T2c, T2b);
+		    }
+		    {
+			 V T2u, T2v, T2C, T2J;
+			 T2u = VADD(T2b, T2c);
+			 T2v = VBYI(VADD(T2o, T2p));
+			 ST(&(xo[WS(os, 23)]), VSUB(T2u, T2v), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 2)]), VADD(T2u, T2v), ovs, &(xo[0]));
+			 T2C = VADD(T2b, VADD(T2y, T2B));
+			 T2J = VBYI(VSUB(VADD(T2F, T2I), T2o));
+			 ST(&(xo[WS(os, 22)]), VSUB(T2C, T2J), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 3)]), VADD(T2C, T2J), ovs, &(xo[WS(os, 1)]));
+		    }
+		    T2h = VFMA(LDK(KP951056516), T1Z, VADD(T2a, VFNMS(LDK(KP587785252), T2g, T2d)));
+		    T2r = VBYI(VADD(VFMA(LDK(KP951056516), T2i, VMUL(LDK(KP587785252), T2j)), VADD(T2m, T2q)));
+		    ST(&(xo[WS(os, 18)]), VSUB(T2h, T2r), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 7)]), VADD(T2h, T2r), ovs, &(xo[WS(os, 1)]));
+		    {
+			 V T2s, T2t, T2K, T2L;
+			 T2s = VFMA(LDK(KP587785252), T1Z, VFMA(LDK(KP951056516), T2g, VSUB(T2d, T2a)));
+			 T2t = VBYI(VADD(VFNMS(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2i)), VSUB(T2q, T2m)));
+			 ST(&(xo[WS(os, 13)]), VSUB(T2s, T2t), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 12)]), VADD(T2s, T2t), ovs, &(xo[0]));
+			 T2K = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T2w, T2x), VFMA(LDK(KP309016994), T2F, VFNMS(LDK(KP809016994), T2I, VMUL(LDK(KP587785252), VSUB(T2z, T2A))))), T2o));
+			 T2L = VFMA(LDK(KP309016994), T2y, VFMA(LDK(KP951056516), VSUB(T2E, T2D), VFMA(LDK(KP587785252), VSUB(T2H, T2G), VFNMS(LDK(KP809016994), T2B, T2b))));
+			 ST(&(xo[WS(os, 8)]), VADD(T2K, T2L), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 17)]), VSUB(T2L, T2K), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V Tv, T1m, T1n, T1k, T1D, T1E, T1F, T1N, T1O, T1P, T1K, T1L, T1M, T1G, T1H;
+		    V T1I, T1w, T1q, T1x, T1f, T1g, T14, T1h, T1l, T1y;
+		    Tv = VSUB(Tf, Tu);
+		    T1m = VSUB(Tw, Tx);
+		    T1n = VSUB(TN, T12);
+		    T1k = VSUB(T1i, T1j);
+		    T1D = VFMA(LDK(KP1_688655851), Tm, VMUL(LDK(KP535826794), Tt));
+		    T1E = VFMA(LDK(KP1_541026485), T7, VMUL(LDK(KP637423989), Te));
+		    T1F = VSUB(T1D, T1E);
+		    T1N = VFMA(LDK(KP851558583), TM, VMUL(LDK(KP904827052), TJ));
+		    T1O = VFMA(LDK(KP1_984229402), T11, VMUL(LDK(KP125333233), TY));
+		    T1P = VADD(T1N, T1O);
+		    T1K = VFNMS(LDK(KP1_071653589), Tm, VMUL(LDK(KP844327925), Tt));
+		    T1L = VFNMS(LDK(KP770513242), Te, VMUL(LDK(KP1_274847979), T7));
+		    T1M = VADD(T1K, T1L);
+		    T1G = VFNMS(LDK(KP425779291), TJ, VMUL(LDK(KP1_809654104), TM));
+		    T1H = VFNMS(LDK(KP992114701), TY, VMUL(LDK(KP250666467), T11));
+		    T1I = VADD(T1G, T1H);
+		    {
+			 V T1o, T1p, Ty, T13;
+			 T1o = VADD(Tu, Tf);
+			 T1p = VADD(T1i, T1j);
+			 T1w = VADD(T1o, T1p);
+			 T1q = VMUL(LDK(KP559016994), VSUB(T1o, T1p));
+			 T1x = VFNMS(LDK(KP250000000), T1w, T1v);
+			 T1f = VADD(T1b, T1e);
+			 Ty = VADD(Tw, Tx);
+			 T13 = VADD(TN, T12);
+			 T1g = VADD(Ty, T13);
+			 T14 = VMUL(LDK(KP559016994), VSUB(Ty, T13));
+			 T1h = VFNMS(LDK(KP250000000), T1g, T1f);
+		    }
+		    {
+			 V T1B, T1C, T1J, T1Q;
+			 T1B = VADD(T1f, T1g);
+			 T1C = VBYI(VADD(T1v, T1w));
+			 ST(&(xo[WS(os, 24)]), VSUB(T1B, T1C), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VADD(T1B, T1C), ovs, &(xo[WS(os, 1)]));
+			 T1J = VADD(T1f, VADD(T1F, T1I));
+			 T1Q = VBYI(VSUB(VADD(T1M, T1P), T1v));
+			 ST(&(xo[WS(os, 21)]), VSUB(T1J, T1Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VADD(T1J, T1Q), ovs, &(xo[0]));
+		    }
+		    T1l = VFMA(LDK(KP951056516), Tv, VADD(T14, VFNMS(LDK(KP587785252), T1k, T1h)));
+		    T1y = VBYI(VADD(VFMA(LDK(KP951056516), T1m, VMUL(LDK(KP587785252), T1n)), VADD(T1q, T1x)));
+		    ST(&(xo[WS(os, 19)]), VSUB(T1l, T1y), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VADD(T1l, T1y), ovs, &(xo[0]));
+		    {
+			 V T1z, T1A, T1R, T1S;
+			 T1z = VFMA(LDK(KP587785252), Tv, VFMA(LDK(KP951056516), T1k, VSUB(T1h, T14)));
+			 T1A = VBYI(VADD(VFNMS(LDK(KP951056516), T1n, VMUL(LDK(KP587785252), T1m)), VSUB(T1x, T1q)));
+			 ST(&(xo[WS(os, 14)]), VSUB(T1z, T1A), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 11)]), VADD(T1z, T1A), ovs, &(xo[WS(os, 1)]));
+			 T1R = VBYI(VSUB(VFMA(LDK(KP309016994), T1M, VFMA(LDK(KP951056516), VADD(T1D, T1E), VFNMS(LDK(KP809016994), T1P, VMUL(LDK(KP587785252), VSUB(T1G, T1H))))), T1v));
+			 T1S = VFMA(LDK(KP951056516), VSUB(T1L, T1K), VFMA(LDK(KP309016994), T1F, VFMA(LDK(KP587785252), VSUB(T1O, T1N), VFNMS(LDK(KP809016994), T1I, T1f))));
+			 ST(&(xo[WS(os, 9)]), VADD(T1R, T1S), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 16)]), VSUB(T1S, T1R), ovs, &(xo[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 25, XSIMD_STRING("n1bv_25"), {147, 63, 77, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_25) (planner *p) {
+     X(kdft_register) (p, n1bv_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 3 -name n1bv_3 -include n1b.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 3 fused multiply/add),
+ * 11 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(6, is), MAKE_VOLATILE_STRIDE(6, os)) {
+	       V T1, T2, T3, T6, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = VMUL(LDK(KP866025403), VSUB(T2, T3));
+	       T4 = VADD(T2, T3);
+	       T5 = VFNMS(LDK(KP500000000), T4, T1);
+	       ST(&(xo[0]), VADD(T1, T4), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 2)]), VFNMSI(T6, T5), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VFMAI(T6, T5), ovs, &(xo[WS(os, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 3, XSIMD_STRING("n1bv_3"), {3, 1, 3, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_3) (planner *p) {
+     X(kdft_register) (p, n1bv_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 3 -name n1bv_3 -include n1b.h */
+
+/*
+ * This function contains 6 FP additions, 2 FP multiplications,
+ * (or, 5 additions, 1 multiplications, 1 fused multiply/add),
+ * 11 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(6, is), MAKE_VOLATILE_STRIDE(6, os)) {
+	       V T4, T3, T5, T1, T2, T6;
+	       T4 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T3 = VBYI(VMUL(LDK(KP866025403), VSUB(T1, T2)));
+	       T5 = VADD(T1, T2);
+	       ST(&(xo[0]), VADD(T4, T5), ovs, &(xo[0]));
+	       T6 = VFNMS(LDK(KP500000000), T5, T4);
+	       ST(&(xo[WS(os, 1)]), VADD(T3, T6), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 2)]), VSUB(T6, T3), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 3, XSIMD_STRING("n1bv_3"), {5, 1, 1, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_3) (planner *p) {
+     X(kdft_register) (p, n1bv_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include n1b.h */
+
+/*
+ * This function contains 186 FP additions, 98 FP multiplications,
+ * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
+ * 104 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T1h, Tr, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z, T1c;
+	       V TZ;
+	       {
+		    V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2O, T2H, T2b, T2t, TY;
+		    V T1w, TT, T1v, T20, T2C, Tj, Te, T2e, To, T2i, T23, T2D, TB, TG, Th;
+		    V T2f, Tk;
+		    {
+			 V TL, TW, TP, TQ, T2F, T27, T28, TO;
+			 {
+			      V T1, T2, T12, T13, T4, T5, T7, T8;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			      T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      {
+				   V TM, T25, T26, TN;
+				   {
+					V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
+					TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					T1R = VADD(T1, T2);
+					T3 = VSUB(T1, T2);
+					T1S = VADD(T12, T13);
+					T14 = VSUB(T12, T13);
+					T1U = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1V = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+					T2x = VSUB(T1R, T1S);
+					T1T = VADD(T1R, T1S);
+					TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					T2K = VSUB(T1U, T1V);
+					T1W = VADD(T1U, T1V);
+					Ta = VADD(T6, T9);
+					T15 = VSUB(T6, T9);
+					T25 = VADD(TJ, TK);
+					TL = VSUB(TJ, TK);
+					T26 = VADD(TV, TU);
+					TW = VSUB(TU, TV);
+					TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					T1p = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					T1A = VFNMS(LDK(KP707106781), T15, T14);
+					T16 = VFMA(LDK(KP707106781), T15, T14);
+					TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   }
+				   T2F = VSUB(T25, T26);
+				   T27 = VADD(T25, T26);
+				   T28 = VADD(TM, TN);
+				   TO = VSUB(TM, TN);
+			      }
+			 }
+			 {
+			      V Ty, T21, Tx, Tz, T1Y, T1Z;
+			      {
+				   V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
+				   Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T29 = VADD(TP, TQ);
+				   TR = VSUB(TP, TQ);
+				   TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+				   Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+				   Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+				   Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+				   T1Y = VADD(Ts, Tt);
+				   Tu = VSUB(Ts, Tt);
+				   {
+					V T2G, T2a, TX, TS;
+					T2G = VSUB(T29, T28);
+					T2a = VADD(T28, T29);
+					TX = VSUB(TR, TO);
+					TS = VADD(TO, TR);
+					T1Z = VADD(TD, TE);
+					TF = VSUB(TD, TE);
+					T21 = VADD(Tv, Tw);
+					Tx = VSUB(Tv, Tw);
+					T2O = VFMA(LDK(KP414213562), T2F, T2G);
+					T2H = VFNMS(LDK(KP414213562), T2G, T2F);
+					T2b = VSUB(T27, T2a);
+					T2t = VADD(T27, T2a);
+					TY = VFMA(LDK(KP707106781), TX, TW);
+					T1w = VFNMS(LDK(KP707106781), TX, TW);
+					TT = VFMA(LDK(KP707106781), TS, TL);
+					T1v = VFNMS(LDK(KP707106781), TS, TL);
+					Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   }
+			      }
+			      T20 = VADD(T1Y, T1Z);
+			      T2C = VSUB(T1Y, T1Z);
+			      {
+				   V Tc, Td, Tm, Tn;
+				   Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+				   Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   {
+					V Tf, TA, T22, Tg;
+					Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					TA = VSUB(Ty, Tz);
+					T22 = VADD(Ty, Tz);
+					Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+					Te = VSUB(Tc, Td);
+					T2e = VADD(Tc, Td);
+					To = VSUB(Tm, Tn);
+					T2i = VADD(Tn, Tm);
+					T23 = VADD(T21, T22);
+					T2D = VSUB(T21, T22);
+					TB = VADD(Tx, TA);
+					TG = VSUB(Tx, TA);
+					Th = VSUB(Tf, Tg);
+					T2f = VADD(Tf, Tg);
+					Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
+			 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
+			 {
+			      V T1X, T2p, T2E, T2N, T2s, T2y, T2g, T17, Ti, T2h, Tl, T2c, T2l, T24;
+			      T1X = VSUB(T1T, T1W);
+			      T2p = VADD(T1T, T1W);
+			      T2E = VFNMS(LDK(KP414213562), T2D, T2C);
+			      T2N = VFMA(LDK(KP414213562), T2C, T2D);
+			      T2s = VADD(T20, T23);
+			      T24 = VSUB(T20, T23);
+			      T1t = VFNMS(LDK(KP707106781), TG, TF);
+			      TH = VFMA(LDK(KP707106781), TG, TF);
+			      T1s = VFNMS(LDK(KP707106781), TB, Tu);
+			      TC = VFMA(LDK(KP707106781), TB, Tu);
+			      T2y = VSUB(T2e, T2f);
+			      T2g = VADD(T2e, T2f);
+			      T17 = VFMA(LDK(KP414213562), Te, Th);
+			      Ti = VFNMS(LDK(KP414213562), Th, Te);
+			      T2h = VADD(Tj, Tk);
+			      Tl = VSUB(Tj, Tk);
+			      T2c = VADD(T24, T2b);
+			      T2l = VSUB(T24, T2b);
+			      {
+				   V T2L, T2A, T2q, T2k;
+				   T2P = VSUB(T2N, T2O);
+				   T2U = VADD(T2N, T2O);
+				   {
+					V T2z, T2j, T18, Tp;
+					T2z = VSUB(T2h, T2i);
+					T2j = VADD(T2h, T2i);
+					T18 = VFMA(LDK(KP414213562), Tl, To);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T2n = VFMA(LDK(KP707106781), T2c, T1X);
+					T2d = VFNMS(LDK(KP707106781), T2c, T1X);
+					T2w = VADD(T2s, T2t);
+					T2u = VSUB(T2s, T2t);
+					T2L = VSUB(T2y, T2z);
+					T2A = VADD(T2y, T2z);
+					T2q = VADD(T2g, T2j);
+					T2k = VSUB(T2g, T2j);
+					T1q = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					T1B = VSUB(Ti, Tp);
+					Tq = VADD(Ti, Tp);
+				   }
+				   T2W = VFNMS(LDK(KP707106781), T2L, T2K);
+				   T2M = VFMA(LDK(KP707106781), T2L, T2K);
+				   T2B = VFMA(LDK(KP707106781), T2A, T2x);
+				   T2T = VFNMS(LDK(KP707106781), T2A, T2x);
+				   T2v = VADD(T2p, T2q);
+				   T2r = VSUB(T2p, T2q);
+				   T2o = VFMA(LDK(KP707106781), T2l, T2k);
+				   T2m = VFNMS(LDK(KP707106781), T2l, T2k);
+				   T2X = VSUB(T2E, T2H);
+				   T2I = VADD(T2E, T2H);
+			      }
+			 }
+			 {
+			      V T2V, T2Z, T2Y, T30, T2R, T2J;
+			      T2V = VFNMS(LDK(KP923879532), T2U, T2T);
+			      T2Z = VFMA(LDK(KP923879532), T2U, T2T);
+			      ST(&(xo[WS(os, 16)]), VSUB(T2v, T2w), ovs, &(xo[0]));
+			      ST(&(xo[0]), VADD(T2v, T2w), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 8)]), VFMAI(T2u, T2r), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 24)]), VFNMSI(T2u, T2r), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
+			      T2Y = VFMA(LDK(KP923879532), T2X, T2W);
+			      T30 = VFNMS(LDK(KP923879532), T2X, T2W);
+			      T2R = VFMA(LDK(KP923879532), T2I, T2B);
+			      T2J = VFNMS(LDK(KP923879532), T2I, T2B);
+			      {
+				   V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
+				   T1J = VFNMS(LDK(KP923879532), T1q, T1p);
+				   T1r = VFMA(LDK(KP923879532), T1q, T1p);
+				   T1C = VFNMS(LDK(KP923879532), T1B, T1A);
+				   T1M = VFMA(LDK(KP923879532), T1B, T1A);
+				   ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
+				   T2S = VFMA(LDK(KP923879532), T2P, T2M);
+				   T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
+				   T1u = VFMA(LDK(KP668178637), T1t, T1s);
+				   T1D = VFNMS(LDK(KP668178637), T1s, T1t);
+				   T1E = VFNMS(LDK(KP668178637), T1v, T1w);
+				   T1x = VFMA(LDK(KP668178637), T1w, T1v);
+				   {
+					V T1K, T1F, T1N, T1y;
+					T1h = VFNMS(LDK(KP923879532), Tq, Tb);
+					Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
+					T1K = VADD(T1D, T1E);
+					T1F = VSUB(T1D, T1E);
+					T1N = VSUB(T1u, T1x);
+					T1y = VADD(T1u, T1x);
+					T1a = VFMA(LDK(KP923879532), T19, T16);
+					T1k = VFNMS(LDK(KP923879532), T19, T16);
+					TI = VFNMS(LDK(KP198912367), TH, TC);
+					T1b = VFMA(LDK(KP198912367), TC, TH);
+					T1L = VFMA(LDK(KP831469612), T1K, T1J);
+					T1P = VFNMS(LDK(KP831469612), T1K, T1J);
+					T1I = VFMA(LDK(KP831469612), T1F, T1C);
+					T1G = VFNMS(LDK(KP831469612), T1F, T1C);
+					T1O = VFNMS(LDK(KP831469612), T1N, T1M);
+					T1Q = VFMA(LDK(KP831469612), T1N, T1M);
+					T1H = VFMA(LDK(KP831469612), T1y, T1r);
+					T1z = VFNMS(LDK(KP831469612), T1y, T1r);
+					T1c = VFMA(LDK(KP198912367), TT, TY);
+					TZ = VFNMS(LDK(KP198912367), TY, TT);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1d, T1i, T10, T1l;
+		    ST(&(xo[WS(os, 21)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 27)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 29)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 19)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
+		    T1d = VSUB(T1b, T1c);
+		    T1i = VADD(T1b, T1c);
+		    T10 = VADD(TI, TZ);
+		    T1l = VSUB(TI, TZ);
+		    {
+			 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
+			 T1n = VFMA(LDK(KP980785280), T1i, T1h);
+			 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
+			 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
+			 T1g = VFMA(LDK(KP980785280), T1d, T1a);
+			 T1o = VFNMS(LDK(KP980785280), T1l, T1k);
+			 T1m = VFMA(LDK(KP980785280), T1l, T1k);
+			 T11 = VFNMS(LDK(KP980785280), T10, Tr);
+			 T1f = VFMA(LDK(KP980785280), T10, Tr);
+			 ST(&(xo[WS(os, 23)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 25)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 17)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 15)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_32) (planner *p) {
+     X(kdft_register) (p, n1bv_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include n1b.h */
+
+/*
+ * This function contains 186 FP additions, 42 FP multiplications,
+ * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
+ * 58 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T2f, T2k, T2N, T2M, T19, T1B, Tb, T1p, TT, T1v, TY, T1w, T2E, T2F, T2G;
+	       V T24, T2o, TC, T1s, TH, T1t, T2B, T2C, T2D, T1X, T2n, T2I, T2J, Tq, T1A;
+	       V T14, T1q, T2c, T2l;
+	       {
+		    V T3, T2i, T18, T2j, T6, T2d, T9, T2e, T15, Ta;
+		    {
+			 V T1, T2, T16, T17;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T2i = VADD(T1, T2);
+			 T16 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T17 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T18 = VSUB(T16, T17);
+			 T2j = VADD(T16, T17);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T2d = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T2e = VADD(T7, T8);
+		    }
+		    T2f = VSUB(T2d, T2e);
+		    T2k = VSUB(T2i, T2j);
+		    T2N = VADD(T2d, T2e);
+		    T2M = VADD(T2i, T2j);
+		    T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
+		    T19 = VSUB(T15, T18);
+		    T1B = VADD(T18, T15);
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tb = VSUB(T3, Ta);
+		    T1p = VADD(T3, Ta);
+	       }
+	       {
+		    V TL, T21, TW, T1Y, TO, T22, TS, T1Z;
+		    {
+			 V TJ, TK, TU, TV;
+			 TJ = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 TK = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 TL = VSUB(TJ, TK);
+			 T21 = VADD(TJ, TK);
+			 TU = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 TV = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 TW = VSUB(TU, TV);
+			 T1Y = VADD(TU, TV);
+		    }
+		    {
+			 V TM, TN, TQ, TR;
+			 TM = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 TN = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 TO = VSUB(TM, TN);
+			 T22 = VADD(TM, TN);
+			 TQ = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TR = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TS = VSUB(TQ, TR);
+			 T1Z = VADD(TQ, TR);
+		    }
+		    {
+			 V TP, TX, T20, T23;
+			 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			 TT = VSUB(TP, TS);
+			 T1v = VADD(TS, TP);
+			 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
+			 TY = VSUB(TW, TX);
+			 T1w = VADD(TW, TX);
+			 T2E = VADD(T1Y, T1Z);
+			 T2F = VADD(T21, T22);
+			 T2G = VSUB(T2E, T2F);
+			 T20 = VSUB(T1Y, T1Z);
+			 T23 = VSUB(T21, T22);
+			 T24 = VFMA(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T23));
+			 T2o = VFNMS(LDK(KP382683432), T20, VMUL(LDK(KP923879532), T23));
+		    }
+	       }
+	       {
+		    V Tu, T1U, TF, T1R, Tx, T1V, TB, T1S;
+		    {
+			 V Ts, Tt, TD, TE;
+			 Ts = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tt = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = VSUB(Ts, Tt);
+			 T1U = VADD(Ts, Tt);
+			 TD = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 TE = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 TF = VSUB(TD, TE);
+			 T1R = VADD(TD, TE);
+		    }
+		    {
+			 V Tv, Tw, Tz, TA;
+			 Tv = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = VSUB(Tv, Tw);
+			 T1V = VADD(Tv, Tw);
+			 Tz = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 TA = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 TB = VSUB(Tz, TA);
+			 T1S = VADD(Tz, TA);
+		    }
+		    {
+			 V Ty, TG, T1T, T1W;
+			 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
+			 TC = VSUB(Ty, TB);
+			 T1s = VADD(TB, Ty);
+			 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
+			 TH = VSUB(TF, TG);
+			 T1t = VADD(TF, TG);
+			 T2B = VADD(T1R, T1S);
+			 T2C = VADD(T1U, T1V);
+			 T2D = VSUB(T2B, T2C);
+			 T1T = VSUB(T1R, T1S);
+			 T1W = VSUB(T1U, T1V);
+			 T1X = VFNMS(LDK(KP382683432), T1W, VMUL(LDK(KP923879532), T1T));
+			 T2n = VFMA(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1W));
+		    }
+	       }
+	       {
+		    V Te, T26, To, T29, Th, T27, Tl, T2a, Ti, Tp;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T26 = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T29 = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T27 = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T2a = VADD(Tj, Tk);
+		    }
+		    T2I = VADD(T26, T27);
+		    T2J = VADD(T29, T2a);
+		    Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+		    Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
+		    Tq = VSUB(Ti, Tp);
+		    T1A = VADD(Ti, Tp);
+		    {
+			 V T12, T13, T28, T2b;
+			 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T14 = VSUB(T12, T13);
+			 T1q = VADD(T12, T13);
+			 T28 = VSUB(T26, T27);
+			 T2b = VSUB(T29, T2a);
+			 T2c = VMUL(LDK(KP707106781), VSUB(T28, T2b));
+			 T2l = VMUL(LDK(KP707106781), VADD(T28, T2b));
+		    }
+	       }
+	       {
+		    V T2L, T2R, T2Q, T2S;
+		    {
+			 V T2H, T2K, T2O, T2P;
+			 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
+			 T2K = VSUB(T2I, T2J);
+			 T2L = VBYI(VSUB(T2H, T2K));
+			 T2R = VBYI(VADD(T2K, T2H));
+			 T2O = VSUB(T2M, T2N);
+			 T2P = VMUL(LDK(KP707106781), VADD(T2D, T2G));
+			 T2Q = VSUB(T2O, T2P);
+			 T2S = VADD(T2O, T2P);
+		    }
+		    ST(&(xo[WS(os, 12)]), VADD(T2L, T2Q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 28)]), VSUB(T2S, T2R), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 20)]), VSUB(T2Q, T2L), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(T2R, T2S), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2h, T2r, T2q, T2s;
+		    {
+			 V T25, T2g, T2m, T2p;
+			 T25 = VSUB(T1X, T24);
+			 T2g = VSUB(T2c, T2f);
+			 T2h = VBYI(VSUB(T25, T2g));
+			 T2r = VBYI(VADD(T2g, T25));
+			 T2m = VSUB(T2k, T2l);
+			 T2p = VSUB(T2n, T2o);
+			 T2q = VSUB(T2m, T2p);
+			 T2s = VADD(T2m, T2p);
+		    }
+		    ST(&(xo[WS(os, 10)]), VADD(T2h, T2q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 26)]), VSUB(T2s, T2r), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 22)]), VSUB(T2q, T2h), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VADD(T2r, T2s), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2V, T2Z, T2Y, T30;
+		    {
+			 V T2T, T2U, T2W, T2X;
+			 T2T = VADD(T2M, T2N);
+			 T2U = VADD(T2I, T2J);
+			 T2V = VSUB(T2T, T2U);
+			 T2Z = VADD(T2T, T2U);
+			 T2W = VADD(T2B, T2C);
+			 T2X = VADD(T2E, T2F);
+			 T2Y = VBYI(VSUB(T2W, T2X));
+			 T30 = VADD(T2W, T2X);
+		    }
+		    ST(&(xo[WS(os, 24)]), VSUB(T2V, T2Y), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T2Z, T30), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VADD(T2V, T2Y), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 16)]), VSUB(T2Z, T30), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2v, T2z, T2y, T2A;
+		    {
+			 V T2t, T2u, T2w, T2x;
+			 T2t = VADD(T2k, T2l);
+			 T2u = VADD(T1X, T24);
+			 T2v = VADD(T2t, T2u);
+			 T2z = VSUB(T2t, T2u);
+			 T2w = VADD(T2f, T2c);
+			 T2x = VADD(T2n, T2o);
+			 T2y = VBYI(VADD(T2w, T2x));
+			 T2A = VBYI(VSUB(T2x, T2w));
+		    }
+		    ST(&(xo[WS(os, 30)]), VSUB(T2v, T2y), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VADD(T2z, T2A), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(T2v, T2y), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 18)]), VSUB(T2z, T2A), ovs, &(xo[0]));
+	       }
+	       {
+		    V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
+		    T1r = VSUB(T1p, T1q);
+		    T1C = VSUB(T1A, T1B);
+		    T1M = VADD(T1p, T1q);
+		    T1K = VADD(T1B, T1A);
+		    {
+			 V T1D, T1E, T1u, T1x;
+			 T1D = VFNMS(LDK(KP195090322), T1s, VMUL(LDK(KP980785280), T1t));
+			 T1E = VFMA(LDK(KP195090322), T1v, VMUL(LDK(KP980785280), T1w));
+			 T1F = VSUB(T1D, T1E);
+			 T1N = VADD(T1D, T1E);
+			 T1u = VFMA(LDK(KP980785280), T1s, VMUL(LDK(KP195090322), T1t));
+			 T1x = VFNMS(LDK(KP195090322), T1w, VMUL(LDK(KP980785280), T1v));
+			 T1y = VSUB(T1u, T1x);
+			 T1J = VADD(T1u, T1x);
+		    }
+		    {
+			 V T1z, T1G, T1P, T1Q;
+			 T1z = VADD(T1r, T1y);
+			 T1G = VBYI(VADD(T1C, T1F));
+			 ST(&(xo[WS(os, 25)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
+			 T1P = VBYI(VADD(T1K, T1J));
+			 T1Q = VADD(T1M, T1N);
+			 ST(&(xo[WS(os, 1)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T1H, T1I, T1L, T1O;
+			 T1H = VSUB(T1r, T1y);
+			 T1I = VBYI(VSUB(T1F, T1C));
+			 ST(&(xo[WS(os, 23)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
+			 T1L = VBYI(VSUB(T1J, T1K));
+			 T1O = VSUB(T1M, T1N);
+			 ST(&(xo[WS(os, 15)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 17)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
+		    Tr = VSUB(Tb, Tq);
+		    T1a = VSUB(T14, T19);
+		    T1k = VADD(Tb, Tq);
+		    T1i = VADD(T19, T14);
+		    {
+			 V T1b, T1c, TI, TZ;
+			 T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
+			 T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
+			 T1d = VSUB(T1b, T1c);
+			 T1l = VADD(T1b, T1c);
+			 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
+			 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
+			 T10 = VSUB(TI, TZ);
+			 T1h = VADD(TI, TZ);
+		    }
+		    {
+			 V T11, T1e, T1n, T1o;
+			 T11 = VADD(Tr, T10);
+			 T1e = VBYI(VADD(T1a, T1d));
+			 ST(&(xo[WS(os, 27)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
+			 T1n = VBYI(VADD(T1i, T1h));
+			 T1o = VADD(T1k, T1l);
+			 ST(&(xo[WS(os, 3)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 29)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T1f, T1g, T1j, T1m;
+			 T1f = VSUB(Tr, T10);
+			 T1g = VBYI(VSUB(T1d, T1a));
+			 ST(&(xo[WS(os, 21)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 11)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
+			 T1j = VBYI(VSUB(T1h, T1i));
+			 T1m = VSUB(T1k, T1l);
+			 ST(&(xo[WS(os, 13)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 19)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_32) (planner *p) {
+     X(kdft_register) (p, n1bv_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 4 -name n1bv_4 -include n1b.h */
+
+/*
+ * This function contains 8 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T1, T2, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, T7, T6, T8;
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    T8 = VADD(T4, T5);
+		    ST(&(xo[WS(os, 2)]), VSUB(T7, T8), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T7, T8), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VFMAI(T6, T3), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFNMSI(T6, T3), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n1bv_4"), {6, 0, 2, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_4) (planner *p) {
+     X(kdft_register) (p, n1bv_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 4 -name n1bv_4 -include n1b.h */
+
+/*
+ * This function contains 8 FP additions, 0 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 0 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T3, T7, T6, T8;
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    T8 = VADD(T4, T5);
+	       }
+	       ST(&(xo[WS(os, 3)]), VSUB(T3, T6), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(T7, T8), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VADD(T3, T6), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 2)]), VSUB(T7, T8), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n1bv_4"), {8, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_4) (planner *p) {
+     X(kdft_register) (p, n1bv_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 5 -name n1bv_5 -include n1b.h */
+
+/*
+ * This function contains 16 FP additions, 11 FP multiplications,
+ * (or, 7 additions, 2 multiplications, 9 fused multiply/add),
+ * 23 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(10, is), MAKE_VOLATILE_STRIDE(10, os)) {
+	       V T1, T2, T3, T5, T6;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Tc, T4, Td, T7;
+		    Tc = VSUB(T2, T3);
+		    T4 = VADD(T2, T3);
+		    Td = VSUB(T5, T6);
+		    T7 = VADD(T5, T6);
+		    {
+			 V Tg, Te, Ta, T8, T9, Tf, Tb;
+			 Tg = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
+			 Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
+			 Ta = VSUB(T4, T7);
+			 T8 = VADD(T4, T7);
+			 T9 = VFNMS(LDK(KP250000000), T8, T1);
+			 ST(&(xo[0]), VADD(T1, T8), ovs, &(xo[0]));
+			 Tf = VFNMS(LDK(KP559016994), Ta, T9);
+			 Tb = VFMA(LDK(KP559016994), Ta, T9);
+			 ST(&(xo[WS(os, 2)]), VFNMSI(Tg, Tf), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 3)]), VFMAI(Tg, Tf), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VFNMSI(Te, Tb), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VFMAI(Te, Tb), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 5, XSIMD_STRING("n1bv_5"), {7, 2, 9, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_5) (planner *p) {
+     X(kdft_register) (p, n1bv_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 5 -name n1bv_5 -include n1b.h */
+
+/*
+ * This function contains 16 FP additions, 6 FP multiplications,
+ * (or, 13 additions, 3 multiplications, 3 fused multiply/add),
+ * 18 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(10, is), MAKE_VOLATILE_STRIDE(10, os)) {
+	       V Tb, T3, Tc, T6, Ta;
+	       Tb = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T1, T2, T8, T4, T5, T9;
+		    T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T8 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = VADD(T4, T5);
+		    T3 = VSUB(T1, T2);
+		    Tc = VADD(T8, T9);
+		    T6 = VSUB(T4, T5);
+		    Ta = VMUL(LDK(KP559016994), VSUB(T8, T9));
+	       }
+	       ST(&(xo[0]), VADD(Tb, Tc), ovs, &(xo[0]));
+	       {
+		    V T7, Tf, Te, Tg, Td;
+		    T7 = VBYI(VFMA(LDK(KP951056516), T3, VMUL(LDK(KP587785252), T6)));
+		    Tf = VBYI(VFNMS(LDK(KP951056516), T6, VMUL(LDK(KP587785252), T3)));
+		    Td = VFNMS(LDK(KP250000000), Tc, Tb);
+		    Te = VADD(Ta, Td);
+		    Tg = VSUB(Td, Ta);
+		    ST(&(xo[WS(os, 1)]), VADD(T7, Te), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VSUB(Tg, Tf), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 4)]), VSUB(Te, T7), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tf, Tg), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 5, XSIMD_STRING("n1bv_5"), {13, 3, 3, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_5) (planner *p) {
+     X(kdft_register) (p, n1bv_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 6 -name n1bv_6 -include n1b.h */
+
+/*
+ * This function contains 18 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 6 fused multiply/add),
+ * 23 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T1, T2, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Td, T6, Te, T9, Tf;
+		    T3 = VSUB(T1, T2);
+		    Td = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+		    {
+			 V Tg, Ti, Ta, Tc, Th, Tb;
+			 Tg = VADD(Te, Tf);
+			 Ti = VMUL(LDK(KP866025403), VSUB(Te, Tf));
+			 Ta = VADD(T6, T9);
+			 Tc = VMUL(LDK(KP866025403), VSUB(T6, T9));
+			 Th = VFNMS(LDK(KP500000000), Tg, Td);
+			 ST(&(xo[0]), VADD(Td, Tg), ovs, &(xo[0]));
+			 Tb = VFNMS(LDK(KP500000000), Ta, T3);
+			 ST(&(xo[WS(os, 3)]), VADD(T3, Ta), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VFMAI(Ti, Th), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 2)]), VFNMSI(Ti, Th), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 5)]), VFNMSI(Tc, Tb), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VFMAI(Tc, Tb), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n1bv_6"), {12, 2, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_6) (planner *p) {
+     X(kdft_register) (p, n1bv_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 6 -name n1bv_6 -include n1b.h */
+
+/*
+ * This function contains 18 FP additions, 4 FP multiplications,
+ * (or, 16 additions, 2 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V Ta, Td, T3, Te, T6, Tf, Tb, Tg, T8, T9;
+	       T8 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T9 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       Ta = VSUB(T8, T9);
+	       Td = VADD(T8, T9);
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = VSUB(T1, T2);
+		    Te = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VSUB(T4, T5);
+		    Tf = VADD(T4, T5);
+	       }
+	       Tb = VADD(T3, T6);
+	       Tg = VADD(Te, Tf);
+	       ST(&(xo[WS(os, 3)]), VADD(Ta, Tb), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Td, Tg), ovs, &(xo[0]));
+	       {
+		    V T7, Tc, Th, Ti;
+		    T7 = VBYI(VMUL(LDK(KP866025403), VSUB(T3, T6)));
+		    Tc = VFNMS(LDK(KP500000000), Tb, Ta);
+		    ST(&(xo[WS(os, 1)]), VADD(T7, Tc), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tc, T7), ovs, &(xo[WS(os, 1)]));
+		    Th = VFNMS(LDK(KP500000000), Tg, Td);
+		    Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Te, Tf)));
+		    ST(&(xo[WS(os, 2)]), VSUB(Th, Ti), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(Ti, Th), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n1bv_6"), {16, 2, 2, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_6) (planner *p) {
+     X(kdft_register) (p, n1bv_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1568 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:05 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include n1b.h */
+
+/*
+ * This function contains 456 FP additions, 258 FP multiplications,
+ * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
+ * 168 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T5T, T5S, T5X, T65, T5Z, T5R, T67, T63, T5U, T64;
+	       {
+		    V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
+		    V Tm, T3A, T3i, T29, TC, T5p, T4o, T6D, T6e, T3l, T3B, TR, T2a, T4x, T5q;
+		    V T6h, T6E, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
+		    V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
+		    V T6m, T6Y, T5L, T4T;
+		    {
+			 V T4g, T4l, T3g, Tu, Tx, T4h, TA, T4i;
+			 {
+			      V T1, T2, T23, T24, T4, T5, T20, T21;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			      T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			      T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      {
+				   V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
+				   {
+					V T8, T43, T3, T45, T25, T5i, T6, T44, T22, T9, Ti, Tj, Tb, Tc;
+					T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T43 = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T45 = VSUB(T23, T24);
+					T25 = VADD(T23, T24);
+					T5i = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T44 = VSUB(T20, T21);
+					T22 = VADD(T20, T21);
+					T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					{
+					     V T2T, T46, T5j, T2U;
+					     T7 = VSUB(T3, T6);
+					     T2T = VADD(T3, T6);
+					     T46 = VADD(T44, T45);
+					     T5j = VSUB(T44, T45);
+					     T26 = VSUB(T22, T25);
+					     T2U = VADD(T22, T25);
+					     Ta = VADD(T8, T9);
+					     T48 = VSUB(T8, T9);
+					     Tk = VADD(Ti, Tj);
+					     T4c = VSUB(Tj, Ti);
+					     T5k = VFMA(LDK(KP707106781), T5j, T5i);
+					     T6A = VFNMS(LDK(KP707106781), T5j, T5i);
+					     T47 = VFMA(LDK(KP707106781), T46, T43);
+					     T69 = VFNMS(LDK(KP707106781), T46, T43);
+					     T2V = VADD(T2T, T2U);
+					     T3z = VSUB(T2T, T2U);
+					     T49 = VSUB(Tb, Tc);
+					     Td = VADD(Tb, Tc);
+					}
+					Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+				   }
+				   {
+					V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
+					V Tp;
+					To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+					Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+					{
+					     V Th, T4b, Tr, Ts;
+					     Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     Te = VSUB(Ta, Td);
+					     T2W = VADD(Ta, Td);
+					     T5l = VFMA(LDK(KP414213562), T48, T49);
+					     T4a = VFNMS(LDK(KP414213562), T49, T48);
+					     Th = VADD(Tf, Tg);
+					     T4b = VSUB(Tf, Tg);
+					     Tq = VADD(To, Tp);
+					     T4g = VSUB(To, Tp);
+					     T4l = VSUB(Tr, Ts);
+					     Tt = VADD(Tr, Ts);
+					     Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+					     T5m = VFMA(LDK(KP414213562), T4b, T4c);
+					     T4d = VFNMS(LDK(KP414213562), T4c, T4b);
+					     Tl = VSUB(Th, Tk);
+					     T2X = VADD(Th, Tk);
+					     Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+					     Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					}
+					T3g = VADD(Tq, Tt);
+					Tu = VSUB(Tq, Tt);
+					Tx = VADD(Tv, Tw);
+					T4h = VSUB(Tv, Tw);
+					T6B = VSUB(T4a, T4d);
+					T4e = VADD(T4a, T4d);
+					T6a = VADD(T5l, T5m);
+					T5n = VSUB(T5l, T5m);
+					T3M = VSUB(T2W, T2X);
+					T2Y = VADD(T2W, T2X);
+					T27 = VSUB(Te, Tl);
+					Tm = VADD(Te, Tl);
+					TA = VADD(Ty, Tz);
+					T4i = VSUB(Ty, Tz);
+				   }
+			      }
+			 }
+			 {
+			      V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3j, TJ, TF, TI;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+				   TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+				   TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+				   TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+				   {
+					V T3h, TB, T4j, T4m;
+					T3h = VADD(Tx, TA);
+					TB = VSUB(Tx, TA);
+					T4j = VADD(T4h, T4i);
+					T4m = VSUB(T4h, T4i);
+					T4p = VSUB(TD, TE);
+					TF = VADD(TD, TE);
+					T4u = VSUB(TH, TG);
+					TI = VADD(TG, TH);
+					T3A = VSUB(T3g, T3h);
+					T3i = VADD(T3g, T3h);
+					T29 = VFMA(LDK(KP414213562), Tu, TB);
+					TC = VFNMS(LDK(KP414213562), TB, Tu);
+					T4k = VFMA(LDK(KP707106781), T4j, T4g);
+					T6d = VFNMS(LDK(KP707106781), T4j, T4g);
+					T4n = VFMA(LDK(KP707106781), T4m, T4l);
+					T6c = VFNMS(LDK(KP707106781), T4m, T4l);
+					TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   }
+				   TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      }
+			      T3j = VADD(TF, TI);
+			      TJ = VSUB(TF, TI);
+			      {
+				   V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
+				   {
+					V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
+					{
+					     V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
+					     T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+					     T5p = VFMA(LDK(KP198912367), T4k, T4n);
+					     T4o = VFNMS(LDK(KP198912367), T4n, T4k);
+					     T6D = VFMA(LDK(KP668178637), T6c, T6d);
+					     T6e = VFNMS(LDK(KP668178637), T6d, T6c);
+					     TM = VADD(TK, TL);
+					     T4r = VSUB(TK, TL);
+					     TP = VADD(TN, TO);
+					     T4q = VSUB(TN, TO);
+					     T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					     T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					     T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
+						  T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						  T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						  T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3k, TQ, T4s, T4v;
+						       T3k = VADD(TP, TM);
+						       TQ = VSUB(TM, TP);
+						       T4s = VADD(T4q, T4r);
+						       T4v = VSUB(T4r, T4q);
+						       T4V = VSUB(T1r, T1s);
+						       T1t = VADD(T1r, T1s);
+						       T58 = VSUB(T1v, T1u);
+						       T1w = VADD(T1u, T1v);
+						       T4X = VSUB(T1O, T1P);
+						       T1Q = VADD(T1O, T1P);
+						       T3l = VADD(T3j, T3k);
+						       T3B = VSUB(T3j, T3k);
+						       TR = VFNMS(LDK(KP414213562), TQ, TJ);
+						       T2a = VFMA(LDK(KP414213562), TJ, TQ);
+						       T6g = VFNMS(LDK(KP707106781), T4s, T4p);
+						       T4t = VFMA(LDK(KP707106781), T4s, T4p);
+						       T6f = VFNMS(LDK(KP707106781), T4v, T4u);
+						       T4w = VFMA(LDK(KP707106781), T4v, T4u);
+						       T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  {
+						       V T4W, T1A, T50, T51, T1D, T1F, T1G;
+						       {
+							    V T1y, T1z, T1B, T1C;
+							    T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+							    T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+							    T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+							    T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+							    T4x = VFNMS(LDK(KP198912367), T4w, T4t);
+							    T5q = VFMA(LDK(KP198912367), T4t, T4w);
+							    T6h = VFNMS(LDK(KP668178637), T6g, T6f);
+							    T6E = VFMA(LDK(KP668178637), T6f, T6g);
+							    T4W = VSUB(T1R, T1S);
+							    T1T = VADD(T1R, T1S);
+							    T1A = VADD(T1y, T1z);
+							    T50 = VSUB(T1y, T1z);
+							    T51 = VSUB(T1C, T1B);
+							    T1D = VADD(T1B, T1C);
+						       }
+						       T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+						       T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+						       T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						       T4Y = VADD(T4W, T4X);
+						       T59 = VSUB(T4X, T4W);
+						       T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+						       T3a = VADD(T1A, T1D);
+						       T1E = VSUB(T1A, T1D);
+						       T52 = VFMA(LDK(KP414213562), T51, T50);
+						       T5b = VFNMS(LDK(KP414213562), T50, T51);
+						       T53 = VSUB(T1F, T1G);
+						       T1H = VADD(T1F, T1G);
+						  }
+					     }
+					}
+					{
+					     V T37, T54, T1K, T38;
+					     T1x = VSUB(T1t, T1w);
+					     T37 = VADD(T1t, T1w);
+					     T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
+					     T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
+					     T54 = VSUB(T1J, T1I);
+					     T1K = VADD(T1I, T1J);
+					     T6u = VFNMS(LDK(KP707106781), T59, T58);
+					     T5a = VFMA(LDK(KP707106781), T59, T58);
+					     T38 = VADD(T1T, T1Q);
+					     T1U = VSUB(T1Q, T1T);
+					     T55 = VFNMS(LDK(KP414213562), T54, T53);
+					     T5c = VFMA(LDK(KP414213562), T53, T54);
+					     T1L = VSUB(T1H, T1K);
+					     T3b = VADD(T1H, T1K);
+					     T39 = VADD(T37, T38);
+					     T3H = VSUB(T37, T38);
+					}
+				   }
+				   {
+					V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
+					V T1d;
+					{
+					     V TU, TV, TX, TY, T56, T6v;
+					     TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T56 = VADD(T52, T55);
+					     T6v = VSUB(T55, T52);
+					     {
+						  V T5d, T6s, T1V, T1M;
+						  T5d = VADD(T5b, T5c);
+						  T6s = VSUB(T5c, T5b);
+						  T1V = VSUB(T1L, T1E);
+						  T1M = VADD(T1E, T1L);
+						  T3I = VSUB(T3b, T3a);
+						  T3c = VADD(T3a, T3b);
+						  T5N = VFNMS(LDK(KP923879532), T56, T4Z);
+						  T57 = VFMA(LDK(KP923879532), T56, T4Z);
+						  T72 = VFNMS(LDK(KP923879532), T6v, T6u);
+						  T6w = VFMA(LDK(KP923879532), T6v, T6u);
+						  T5O = VFNMS(LDK(KP923879532), T5d, T5a);
+						  T5e = VFMA(LDK(KP923879532), T5d, T5a);
+						  T71 = VFMA(LDK(KP923879532), T6s, T6r);
+						  T6t = VFNMS(LDK(KP923879532), T6s, T6r);
+						  T2y = VFNMS(LDK(KP707106781), T1V, T1U);
+						  T1W = VFMA(LDK(KP707106781), T1V, T1U);
+						  T2x = VFNMS(LDK(KP707106781), T1M, T1x);
+						  T1N = VFMA(LDK(KP707106781), T1M, T1x);
+						  TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1h, T1i, T1k, T1l;
+						  T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+						  T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+						  T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T11, T4B, T4C, T12, T14, T15;
+						       T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+						       T4A = VSUB(TU, TV);
+						       TW = VADD(TU, TV);
+						       T4N = VSUB(TX, TY);
+						       TZ = VADD(TX, TY);
+						       T1j = VADD(T1h, T1i);
+						       T4B = VSUB(T1h, T1i);
+						       T1m = VADD(T1k, T1l);
+						       T4C = VSUB(T1k, T1l);
+						       T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+						       T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						       T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+						       {
+							    V T18, T19, T1b, T1c;
+							    T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+							    T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+							    T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+							    T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+							    T4O = VSUB(T4B, T4C);
+							    T4D = VADD(T4B, T4C);
+							    T13 = VADD(T11, T12);
+							    T4F = VSUB(T11, T12);
+							    T16 = VADD(T14, T15);
+							    T4G = VSUB(T14, T15);
+							    T1a = VADD(T18, T19);
+							    T4I = VSUB(T18, T19);
+							    T4J = VSUB(T1b, T1c);
+							    T1d = VADD(T1b, T1c);
+						       }
+						  }
+					     }
+					}
+					{
+					     V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
+					     T30 = VADD(TW, TZ);
+					     T10 = VSUB(TW, TZ);
+					     T6k = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T33 = VADD(T13, T16);
+					     T17 = VSUB(T13, T16);
+					     T6n = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T34 = VADD(T1a, T1d);
+					     T1e = VSUB(T1a, T1d);
+					     T4K = VFMA(LDK(KP414213562), T4J, T4I);
+					     T4R = VFNMS(LDK(KP414213562), T4I, T4J);
+					     T1n = VSUB(T1j, T1m);
+					     T31 = VADD(T1j, T1m);
+					     {
+						  V T1f, T1o, T6o, T4L, T4S, T6l;
+						  T1f = VADD(T17, T1e);
+						  T1o = VSUB(T17, T1e);
+						  T6o = VSUB(T4H, T4K);
+						  T4L = VADD(T4H, T4K);
+						  T4S = VADD(T4Q, T4R);
+						  T6l = VSUB(T4Q, T4R);
+						  T3E = VSUB(T30, T31);
+						  T32 = VADD(T30, T31);
+						  T1p = VFMA(LDK(KP707106781), T1o, T1n);
+						  T2v = VFNMS(LDK(KP707106781), T1o, T1n);
+						  T1g = VFMA(LDK(KP707106781), T1f, T10);
+						  T2u = VFNMS(LDK(KP707106781), T1f, T10);
+						  T4M = VFMA(LDK(KP923879532), T4L, T4E);
+						  T5K = VFNMS(LDK(KP923879532), T4L, T4E);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6n);
+						  T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
+						  T6m = VFNMS(LDK(KP923879532), T6l, T6k);
+						  T6Y = VFMA(LDK(KP923879532), T6l, T6k);
+						  T5L = VFNMS(LDK(KP923879532), T4S, T4P);
+						  T4T = VFMA(LDK(KP923879532), T4S, T4P);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T6b, T6F, T7f, T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
+			 {
+			      V T2Z, T3r, T3s, T3m, T3d, T3v;
+			      T2Z = VSUB(T2V, T2Y);
+			      T3r = VADD(T2V, T2Y);
+			      T3s = VADD(T3i, T3l);
+			      T3m = VSUB(T3i, T3l);
+			      T3d = VSUB(T39, T3c);
+			      T3v = VADD(T39, T3c);
+			      {
+				   V T3x, T3t, T3Q, T3J, T3D, T3V, T3G, T3P, T3u, T36, T3O, T3Y, T6V, T6W;
+				   {
+					V T3N, T3C, T3F, T35;
+					T3N = VSUB(T3A, T3B);
+					T3C = VADD(T3A, T3B);
+					T3F = VSUB(T33, T34);
+					T35 = VADD(T33, T34);
+					T3x = VADD(T3r, T3s);
+					T3t = VSUB(T3r, T3s);
+					T3Q = VFMA(LDK(KP414213562), T3H, T3I);
+					T3J = VFNMS(LDK(KP414213562), T3I, T3H);
+					T3D = VFMA(LDK(KP707106781), T3C, T3z);
+					T3V = VFNMS(LDK(KP707106781), T3C, T3z);
+					T3G = VFNMS(LDK(KP414213562), T3F, T3E);
+					T3P = VFMA(LDK(KP414213562), T3E, T3F);
+					T3u = VADD(T32, T35);
+					T36 = VSUB(T32, T35);
+					T3O = VFMA(LDK(KP707106781), T3N, T3M);
+					T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
+				   }
+				   T6b = VFNMS(LDK(KP923879532), T6a, T69);
+				   T6V = VFMA(LDK(KP923879532), T6a, T69);
+				   T6W = VADD(T6D, T6E);
+				   T6F = VSUB(T6D, T6E);
+				   {
+					V T3R, T3W, T3K, T3Z;
+					T3R = VSUB(T3P, T3Q);
+					T3W = VADD(T3P, T3Q);
+					T3K = VADD(T3G, T3J);
+					T3Z = VSUB(T3G, T3J);
+					{
+					     V T3e, T3n, T3w, T3y;
+					     T3e = VADD(T36, T3d);
+					     T3n = VSUB(T36, T3d);
+					     T3w = VSUB(T3u, T3v);
+					     T3y = VADD(T3u, T3v);
+					     {
+						  V T41, T3X, T3S, T3U;
+						  T41 = VFMA(LDK(KP923879532), T3W, T3V);
+						  T3X = VFNMS(LDK(KP923879532), T3W, T3V);
+						  T3S = VFNMS(LDK(KP923879532), T3R, T3O);
+						  T3U = VFMA(LDK(KP923879532), T3R, T3O);
+						  {
+						       V T42, T40, T3L, T3T;
+						       T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
+						       T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
+						       T3L = VFNMS(LDK(KP923879532), T3K, T3D);
+						       T3T = VFMA(LDK(KP923879532), T3K, T3D);
+						       {
+							    V T3o, T3q, T3f, T3p;
+							    T3o = VFNMS(LDK(KP707106781), T3n, T3m);
+							    T3q = VFMA(LDK(KP707106781), T3n, T3m);
+							    T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
+							    T3p = VFMA(LDK(KP707106781), T3e, T2Z);
+							    ST(&(xo[WS(os, 32)]), VSUB(T3x, T3y), ovs, &(xo[0]));
+							    ST(&(xo[0]), VADD(T3x, T3y), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 16)]), VFMAI(T3w, T3t), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 48)]), VFNMSI(T3w, T3t), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 44)]), VFNMSI(T40, T3X), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 20)]), VFMAI(T40, T3X), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 52)]), VFMAI(T42, T41), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 12)]), VFNMSI(T42, T41), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 4)]), VFMAI(T3U, T3T), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 60)]), VFNMSI(T3U, T3T), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 36)]), VFMAI(T3S, T3L), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 28)]), VFNMSI(T3S, T3L), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 56)]), VFNMSI(T3q, T3p), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 8)]), VFMAI(T3q, T3p), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 40)]), VFMAI(T3o, T3f), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 24)]), VFNMSI(T3o, T3f), ovs, &(xo[0]));
+							    T7f = VFNMS(LDK(KP831469612), T6W, T6V);
+							    T6X = VFMA(LDK(KP831469612), T6W, T6V);
+						       }
+						  }
+					     }
+					}
+				   }
+				   T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
+				   T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
+				   T7a = VFNMS(LDK(KP303346683), T71, T72);
+				   T73 = VFMA(LDK(KP303346683), T72, T71);
+				   T6C = VFMA(LDK(KP923879532), T6B, T6A);
+				   T76 = VFNMS(LDK(KP923879532), T6B, T6A);
+				   T77 = VSUB(T6e, T6h);
+				   T6i = VADD(T6e, T6h);
+			      }
+			 }
+			 {
+			      V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T5r, T5I, T5x, T5h, T5F, T5B;
+			      {
+				   V TT, T2f, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
+				   {
+					V T1q, T2d, T7h, T7l, T2e, T1X, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
+					T2r = VFNMS(LDK(KP707106781), Tm, T7);
+					Tn = VFMA(LDK(KP707106781), Tm, T7);
+					TS = VADD(TC, TR);
+					T2D = VSUB(TC, TR);
+					{
+					     V T7b, T7j, T74, T7i, T78, T7g;
+					     T1q = VFNMS(LDK(KP198912367), T1p, T1g);
+					     T2d = VFMA(LDK(KP198912367), T1g, T1p);
+					     T7g = VADD(T79, T7a);
+					     T7b = VSUB(T79, T7a);
+					     T7j = VSUB(T70, T73);
+					     T74 = VADD(T70, T73);
+					     T7i = VFNMS(LDK(KP831469612), T77, T76);
+					     T78 = VFMA(LDK(KP831469612), T77, T76);
+					     T2j = VFNMS(LDK(KP923879532), TS, Tn);
+					     TT = VFMA(LDK(KP923879532), TS, Tn);
+					     T7h = VFMA(LDK(KP956940335), T7g, T7f);
+					     T7l = VFNMS(LDK(KP956940335), T7g, T7f);
+					     T2e = VFMA(LDK(KP198912367), T1N, T1W);
+					     T1X = VFNMS(LDK(KP198912367), T1W, T1N);
+					     T75 = VFNMS(LDK(KP956940335), T74, T6X);
+					     T7d = VFMA(LDK(KP956940335), T74, T6X);
+					     T7m = VFMA(LDK(KP956940335), T7j, T7i);
+					     T7k = VFNMS(LDK(KP956940335), T7j, T7i);
+					     T7c = VFNMS(LDK(KP956940335), T7b, T78);
+					     T7e = VFMA(LDK(KP956940335), T7b, T78);
+					}
+					T2k = VADD(T2d, T2e);
+					T2f = VSUB(T2d, T2e);
+					ST(&(xo[WS(os, 45)]), VFMAI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 19)]), VFNMSI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 51)]), VFNMSI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 13)]), VFMAI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 61)]), VFMAI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 3)]), VFNMSI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 29)]), VFMAI(T7c, T75), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 35)]), VFNMSI(T7c, T75), ovs, &(xo[WS(os, 1)]));
+					T2n = VSUB(T1q, T1X);
+					T1Y = VADD(T1q, T1X);
+					T2C = VFNMS(LDK(KP707106781), T27, T26);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T2b = VSUB(T29, T2a);
+					T2s = VADD(T29, T2a);
+				   }
+				   T2l = VFNMS(LDK(KP980785280), T2k, T2j);
+				   T2p = VFMA(LDK(KP980785280), T2k, T2j);
+				   {
+					V T5z, T4z, T5A, T5g;
+					{
+					     V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
+					     T5H = VFNMS(LDK(KP923879532), T4e, T47);
+					     T4f = VFMA(LDK(KP923879532), T4e, T47);
+					     T4y = VADD(T4o, T4x);
+					     T5T = VSUB(T4o, T4x);
+					     T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
+					     T2h = VFMA(LDK(KP980785280), T1Y, TT);
+					     T4U = VFNMS(LDK(KP098491403), T4T, T4M);
+					     T5t = VFMA(LDK(KP098491403), T4M, T4T);
+					     T2m = VFNMS(LDK(KP923879532), T2b, T28);
+					     T2c = VFMA(LDK(KP923879532), T2b, T28);
+					     T5u = VFMA(LDK(KP098491403), T57, T5e);
+					     T5f = VFNMS(LDK(KP098491403), T5e, T57);
+					     T5z = VFNMS(LDK(KP980785280), T4y, T4f);
+					     T4z = VFMA(LDK(KP980785280), T4y, T4f);
+					     T5S = VFNMS(LDK(KP923879532), T5n, T5k);
+					     T5o = VFMA(LDK(KP923879532), T5n, T5k);
+					     {
+						  V T2o, T2q, T2i, T2g;
+						  T2o = VFMA(LDK(KP980785280), T2n, T2m);
+						  T2q = VFNMS(LDK(KP980785280), T2n, T2m);
+						  T2i = VFMA(LDK(KP980785280), T2f, T2c);
+						  T2g = VFNMS(LDK(KP980785280), T2f, T2c);
+						  T5A = VADD(T5t, T5u);
+						  T5v = VSUB(T5t, T5u);
+						  T5D = VSUB(T4U, T5f);
+						  T5g = VADD(T4U, T5f);
+						  ST(&(xo[WS(os, 46)]), VFNMSI(T2o, T2l), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 18)]), VFMAI(T2o, T2l), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 50)]), VFMAI(T2q, T2p), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 14)]), VFNMSI(T2q, T2p), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 2)]), VFMAI(T2i, T2h), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 62)]), VFNMSI(T2i, T2h), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 34)]), VFMAI(T2g, T1Z), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 30)]), VFNMSI(T2g, T1Z), ovs, &(xo[0]));
+						  T5r = VSUB(T5p, T5q);
+						  T5I = VADD(T5p, T5q);
+					     }
+					}
+					T5x = VFMA(LDK(KP995184726), T5g, T4z);
+					T5h = VFNMS(LDK(KP995184726), T5g, T4z);
+					T5F = VFMA(LDK(KP995184726), T5A, T5z);
+					T5B = VFNMS(LDK(KP995184726), T5A, T5z);
+				   }
+			      }
+			      {
+				   V T6J, T6R, T6L, T6z, T6T, T6P;
+				   {
+					V T6N, T6j, T6O, T6y;
+					{
+					     V T6q, T6H, T5C, T5s, T6I, T6x;
+					     T6q = VFNMS(LDK(KP534511135), T6p, T6m);
+					     T6H = VFMA(LDK(KP534511135), T6m, T6p);
+					     T5C = VFNMS(LDK(KP980785280), T5r, T5o);
+					     T5s = VFMA(LDK(KP980785280), T5r, T5o);
+					     T6I = VFMA(LDK(KP534511135), T6t, T6w);
+					     T6x = VFNMS(LDK(KP534511135), T6w, T6t);
+					     T6N = VFMA(LDK(KP831469612), T6i, T6b);
+					     T6j = VFNMS(LDK(KP831469612), T6i, T6b);
+					     {
+						  V T5E, T5G, T5y, T5w;
+						  T5E = VFMA(LDK(KP995184726), T5D, T5C);
+						  T5G = VFNMS(LDK(KP995184726), T5D, T5C);
+						  T5y = VFMA(LDK(KP995184726), T5v, T5s);
+						  T5w = VFNMS(LDK(KP995184726), T5v, T5s);
+						  T6O = VADD(T6H, T6I);
+						  T6J = VSUB(T6H, T6I);
+						  T6R = VSUB(T6q, T6x);
+						  T6y = VADD(T6q, T6x);
+						  ST(&(xo[WS(os, 47)]), VFNMSI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 17)]), VFMAI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 49)]), VFMAI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 15)]), VFNMSI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 1)]), VFMAI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 63)]), VFNMSI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 33)]), VFMAI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 31)]), VFNMSI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
+					     }
+					}
+					T6L = VFMA(LDK(KP881921264), T6y, T6j);
+					T6z = VFNMS(LDK(KP881921264), T6y, T6j);
+					T6T = VFMA(LDK(KP881921264), T6O, T6N);
+					T6P = VFNMS(LDK(KP881921264), T6O, T6N);
+				   }
+				   {
+					V T2H, T2P, T2J, T2B, T2R, T2N;
+					{
+					     V T2L, T2t, T2M, T2A;
+					     {
+						  V T2w, T2F, T6Q, T6G, T2G, T2z;
+						  T2w = VFMA(LDK(KP668178637), T2v, T2u);
+						  T2F = VFNMS(LDK(KP668178637), T2u, T2v);
+						  T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
+						  T6G = VFMA(LDK(KP831469612), T6F, T6C);
+						  T2G = VFNMS(LDK(KP668178637), T2x, T2y);
+						  T2z = VFMA(LDK(KP668178637), T2y, T2x);
+						  T2L = VFNMS(LDK(KP923879532), T2s, T2r);
+						  T2t = VFMA(LDK(KP923879532), T2s, T2r);
+						  {
+						       V T6S, T6U, T6M, T6K;
+						       T6S = VFMA(LDK(KP881921264), T6R, T6Q);
+						       T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
+						       T6M = VFMA(LDK(KP881921264), T6J, T6G);
+						       T6K = VFNMS(LDK(KP881921264), T6J, T6G);
+						       T2M = VADD(T2F, T2G);
+						       T2H = VSUB(T2F, T2G);
+						       T2P = VSUB(T2w, T2z);
+						       T2A = VADD(T2w, T2z);
+						       ST(&(xo[WS(os, 43)]), VFNMSI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 21)]), VFMAI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 53)]), VFMAI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 11)]), VFNMSI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 5)]), VFMAI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 59)]), VFNMSI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 37)]), VFMAI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 27)]), VFNMSI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
+						  }
+					     }
+					     T2J = VFMA(LDK(KP831469612), T2A, T2t);
+					     T2B = VFNMS(LDK(KP831469612), T2A, T2t);
+					     T2R = VFNMS(LDK(KP831469612), T2M, T2L);
+					     T2N = VFMA(LDK(KP831469612), T2M, T2L);
+					}
+					{
+					     V T61, T5J, T62, T5Q;
+					     {
+						  V T5M, T5V, T2O, T2E, T5W, T5P;
+						  T5M = VFMA(LDK(KP820678790), T5L, T5K);
+						  T5V = VFNMS(LDK(KP820678790), T5K, T5L);
+						  T2O = VFMA(LDK(KP923879532), T2D, T2C);
+						  T2E = VFNMS(LDK(KP923879532), T2D, T2C);
+						  T5W = VFNMS(LDK(KP820678790), T5N, T5O);
+						  T5P = VFMA(LDK(KP820678790), T5O, T5N);
+						  T61 = VFNMS(LDK(KP980785280), T5I, T5H);
+						  T5J = VFMA(LDK(KP980785280), T5I, T5H);
+						  {
+						       V T2Q, T2S, T2K, T2I;
+						       T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
+						       T2S = VFMA(LDK(KP831469612), T2P, T2O);
+						       T2K = VFMA(LDK(KP831469612), T2H, T2E);
+						       T2I = VFNMS(LDK(KP831469612), T2H, T2E);
+						       T62 = VADD(T5V, T5W);
+						       T5X = VSUB(T5V, T5W);
+						       T65 = VSUB(T5M, T5P);
+						       T5Q = VADD(T5M, T5P);
+						       ST(&(xo[WS(os, 42)]), VFMAI(T2Q, T2N), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 22)]), VFNMSI(T2Q, T2N), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 54)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 10)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 58)]), VFMAI(T2K, T2J), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 6)]), VFNMSI(T2K, T2J), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 26)]), VFMAI(T2I, T2B), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 38)]), VFNMSI(T2I, T2B), ovs, &(xo[0]));
+						  }
+					     }
+					     T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
+					     T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
+					     T67 = VFNMS(LDK(KP773010453), T62, T61);
+					     T63 = VFMA(LDK(KP773010453), T62, T61);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5U = VFNMS(LDK(KP980785280), T5T, T5S);
+	       T64 = VFMA(LDK(KP980785280), T5T, T5S);
+	       {
+		    V T68, T66, T5Y, T60;
+		    T68 = VFMA(LDK(KP773010453), T65, T64);
+		    T66 = VFNMS(LDK(KP773010453), T65, T64);
+		    T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
+		    T60 = VFMA(LDK(KP773010453), T5X, T5U);
+		    ST(&(xo[WS(os, 41)]), VFMAI(T66, T63), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 23)]), VFNMSI(T66, T63), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 55)]), VFNMSI(T68, T67), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VFMAI(T68, T67), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 57)]), VFMAI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VFNMSI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 25)]), VFMAI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 39)]), VFNMSI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {198, 0, 258, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_64) (planner *p) {
+     X(kdft_register) (p, n1bv_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include n1b.h */
+
+/*
+ * This function contains 456 FP additions, 124 FP multiplications,
+ * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
+ * 108 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
+	       V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
+	       V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
+	       V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
+	       V T53, T5P, T3e, T3W;
+	       {
+		    V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
+		    {
+			 V T1, T2, T2n, T2o;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T4n = VADD(T1, T2);
+			 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T2p = VSUB(T2n, T2o);
+			 T4o = VADD(T2n, T2o);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T5s = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T5t = VADD(T7, T8);
+		    }
+		    T4p = VSUB(T4n, T4o);
+		    T5u = VSUB(T5s, T5t);
+		    {
+			 V Ta, T2m, T6E, T6F;
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tb = VSUB(T3, Ta);
+			 T3A = VADD(T3, Ta);
+			 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
+			 T2q = VSUB(T2m, T2p);
+			 T3v = VADD(T2p, T2m);
+			 T6E = VADD(T4n, T4o);
+			 T6F = VADD(T5s, T5t);
+			 T6G = VSUB(T6E, T6F);
+			 T78 = VADD(T6E, T6F);
+		    }
+	       }
+	       {
+		    V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T4q = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T4t = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T4r = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T4u = VADD(Tj, Tk);
+		    }
+		    {
+			 V Ti, Tp, T6z, T6A;
+			 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
+			 Tq = VSUB(Ti, Tp);
+			 T3w = VADD(Ti, Tp);
+			 T6z = VADD(T4q, T4r);
+			 T6A = VADD(T4t, T4u);
+			 T6B = VSUB(T6z, T6A);
+			 T79 = VADD(T6z, T6A);
+		    }
+		    {
+			 V T2j, T2k, T4s, T4v;
+			 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T2l = VSUB(T2j, T2k);
+			 T3B = VADD(T2j, T2k);
+			 T4s = VSUB(T4q, T4r);
+			 T4v = VSUB(T4t, T4u);
+			 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
+			 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
+		    }
+	       }
+	       {
+		    V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
+		    {
+			 V Tz, TA, TD, TE;
+			 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 TB = VSUB(Tz, TA);
+			 T4z = VADD(Tz, TA);
+			 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 TF = VSUB(TD, TE);
+			 T4y = VADD(TD, TE);
+			 {
+			      V Ts, Tt, Tu, Tv, Tw, Tx;
+			      Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      Tu = VSUB(Ts, Tt);
+			      Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      Tx = VSUB(Tv, Tw);
+			      Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
+			      T4C = VADD(Tv, Tw);
+			      TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
+			      T4B = VADD(Ts, Tt);
+			 }
+		    }
+		    {
+			 V TC, TH, T6s, T6t;
+			 TC = VSUB(Ty, TB);
+			 TH = VSUB(TF, TG);
+			 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
+			 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
+			 T6s = VADD(T4y, T4z);
+			 T6t = VADD(T4B, T4C);
+			 T6u = VSUB(T6s, T6t);
+			 T74 = VADD(T6s, T6t);
+		    }
+		    {
+			 V T3o, T3p, T4A, T4D;
+			 T3o = VADD(TB, Ty);
+			 T3p = VADD(TF, TG);
+			 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
+			 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
+			 T4A = VSUB(T4y, T4z);
+			 T4D = VSUB(T4B, T4C);
+			 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
+			 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
+		    }
+	       }
+	       {
+		    V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
+		    {
+			 V TQ, TR, TU, TV;
+			 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 TS = VSUB(TQ, TR);
+			 T4J = VADD(TQ, TR);
+			 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 TW = VSUB(TU, TV);
+			 T4I = VADD(TU, TV);
+			 {
+			      V TJ, TK, TL, TM, TN, TO;
+			      TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      TL = VSUB(TJ, TK);
+			      TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      TO = VSUB(TM, TN);
+			      TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			      T4G = VADD(TM, TN);
+			      TX = VMUL(LDK(KP707106781), VADD(TL, TO));
+			      T4F = VADD(TJ, TK);
+			 }
+		    }
+		    {
+			 V TT, TY, T6v, T6w;
+			 TT = VSUB(TP, TS);
+			 TY = VSUB(TW, TX);
+			 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
+			 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
+			 T6v = VADD(T4I, T4J);
+			 T6w = VADD(T4F, T4G);
+			 T6x = VSUB(T6v, T6w);
+			 T75 = VADD(T6v, T6w);
+		    }
+		    {
+			 V T3r, T3s, T4H, T4K;
+			 T3r = VADD(TS, TP);
+			 T3s = VADD(TW, TX);
+			 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
+			 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
+			 T4H = VSUB(T4F, T4G);
+			 T4K = VSUB(T4I, T4J);
+			 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
+			 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
+		    }
+	       }
+	       {
+		    V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
+		    V T1Q, T5a, T2a;
+		    {
+			 V T1Z, T20, T24, T25;
+			 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T21 = VSUB(T1Z, T20);
+			 T5h = VADD(T1Z, T20);
+			 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T26 = VSUB(T24, T25);
+			 T5g = VADD(T24, T25);
+		    }
+		    {
+			 V T1S, T1T, T1U, T1V, T1W, T1X;
+			 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			 T1U = VSUB(T1S, T1T);
+			 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 T1X = VSUB(T1V, T1W);
+			 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
+			 T5d = VADD(T1V, T1W);
+			 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
+			 T5c = VADD(T1S, T1T);
+		    }
+		    {
+			 V T1F, T1I, T1M, T1P;
+			 {
+			      V T1D, T1E, T1G, T1H;
+			      T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			      T1F = VSUB(T1D, T1E);
+			      T55 = VADD(T1D, T1E);
+			      T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T1I = VSUB(T1G, T1H);
+			      T56 = VADD(T1G, T1H);
+			 }
+			 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
+			 T57 = VSUB(T55, T56);
+			 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
+			 {
+			      V T1K, T1L, T1N, T1O;
+			      T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			      T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			      T1M = VSUB(T1K, T1L);
+			      T58 = VADD(T1K, T1L);
+			      T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T1P = VSUB(T1N, T1O);
+			      T59 = VADD(T1N, T1O);
+			 }
+			 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
+			 T5a = VSUB(T58, T59);
+			 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
+		    }
+		    {
+			 V T1R, T22, T6k, T6l;
+			 T1R = VSUB(T1J, T1Q);
+			 T22 = VSUB(T1Y, T21);
+			 T23 = VSUB(T1R, T22);
+			 T2N = VADD(T22, T1R);
+			 T6k = VADD(T5g, T5h);
+			 T6l = VADD(T5c, T5d);
+			 T6m = VSUB(T6k, T6l);
+			 T70 = VADD(T6k, T6l);
+		    }
+		    {
+			 V T6n, T6o, T28, T2b;
+			 T6n = VADD(T55, T56);
+			 T6o = VADD(T58, T59);
+			 T6p = VSUB(T6n, T6o);
+			 T71 = VADD(T6n, T6o);
+			 T28 = VSUB(T26, T27);
+			 T2b = VSUB(T29, T2a);
+			 T2c = VSUB(T28, T2b);
+			 T2O = VADD(T28, T2b);
+		    }
+		    {
+			 V T3g, T3h, T5b, T5e;
+			 T3g = VADD(T26, T27);
+			 T3h = VADD(T1J, T1Q);
+			 T3i = VADD(T3g, T3h);
+			 T3Y = VSUB(T3g, T3h);
+			 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
+			 T5e = VSUB(T5c, T5d);
+			 T5f = VSUB(T5b, T5e);
+			 T5R = VADD(T5e, T5b);
+		    }
+		    {
+			 V T5i, T5j, T3j, T3k;
+			 T5i = VSUB(T5g, T5h);
+			 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
+			 T5k = VSUB(T5i, T5j);
+			 T5S = VADD(T5i, T5j);
+			 T3j = VADD(T21, T1Y);
+			 T3k = VADD(T29, T2a);
+			 T3l = VADD(T3j, T3k);
+			 T3Z = VSUB(T3k, T3j);
+		    }
+	       }
+	       {
+		    V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
+		    V T1f, T4T, T1z;
+		    {
+			 V T1o, T1p, T1t, T1u;
+			 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T1q = VSUB(T1o, T1p);
+			 T50 = VADD(T1o, T1p);
+			 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T1v = VSUB(T1t, T1u);
+			 T4Z = VADD(T1t, T1u);
+		    }
+		    {
+			 V T1h, T1i, T1j, T1k, T1l, T1m;
+			 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			 T1j = VSUB(T1h, T1i);
+			 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 T1m = VSUB(T1k, T1l);
+			 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
+			 T4W = VADD(T1k, T1l);
+			 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
+			 T4V = VADD(T1h, T1i);
+		    }
+		    {
+			 V T14, T17, T1b, T1e;
+			 {
+			      V T12, T13, T15, T16;
+			      T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			      T14 = VSUB(T12, T13);
+			      T4O = VADD(T12, T13);
+			      T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      T17 = VSUB(T15, T16);
+			      T4P = VADD(T15, T16);
+			 }
+			 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
+			 T4Q = VSUB(T4O, T4P);
+			 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
+			 {
+			      V T19, T1a, T1c, T1d;
+			      T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			      T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			      T1b = VSUB(T19, T1a);
+			      T4R = VADD(T19, T1a);
+			      T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      T1e = VSUB(T1c, T1d);
+			      T4S = VADD(T1c, T1d);
+			 }
+			 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
+			 T4T = VSUB(T4R, T4S);
+			 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
+		    }
+		    {
+			 V T1g, T1r, T6d, T6e;
+			 T1g = VSUB(T18, T1f);
+			 T1r = VSUB(T1n, T1q);
+			 T1s = VSUB(T1g, T1r);
+			 T2K = VADD(T1r, T1g);
+			 T6d = VADD(T4Z, T50);
+			 T6e = VADD(T4V, T4W);
+			 T6f = VSUB(T6d, T6e);
+			 T6X = VADD(T6d, T6e);
+		    }
+		    {
+			 V T6g, T6h, T1x, T1A;
+			 T6g = VADD(T4O, T4P);
+			 T6h = VADD(T4R, T4S);
+			 T6i = VSUB(T6g, T6h);
+			 T6Y = VADD(T6g, T6h);
+			 T1x = VSUB(T1v, T1w);
+			 T1A = VSUB(T1y, T1z);
+			 T1B = VSUB(T1x, T1A);
+			 T2L = VADD(T1x, T1A);
+		    }
+		    {
+			 V T39, T3a, T4U, T4X;
+			 T39 = VADD(T1v, T1w);
+			 T3a = VADD(T18, T1f);
+			 T3b = VADD(T39, T3a);
+			 T3V = VSUB(T39, T3a);
+			 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
+			 T4X = VSUB(T4V, T4W);
+			 T4Y = VSUB(T4U, T4X);
+			 T5O = VADD(T4X, T4U);
+		    }
+		    {
+			 V T51, T52, T3c, T3d;
+			 T51 = VSUB(T4Z, T50);
+			 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
+			 T53 = VSUB(T51, T52);
+			 T5P = VADD(T51, T52);
+			 T3c = VADD(T1q, T1n);
+			 T3d = VADD(T1y, T1z);
+			 T3e = VADD(T3c, T3d);
+			 T3W = VSUB(T3d, T3c);
+		    }
+	       }
+	       {
+		    V T7h, T7l, T7k, T7m;
+		    {
+			 V T7f, T7g, T7i, T7j;
+			 T7f = VADD(T78, T79);
+			 T7g = VADD(T74, T75);
+			 T7h = VSUB(T7f, T7g);
+			 T7l = VADD(T7f, T7g);
+			 T7i = VADD(T6X, T6Y);
+			 T7j = VADD(T70, T71);
+			 T7k = VBYI(VSUB(T7i, T7j));
+			 T7m = VADD(T7i, T7j);
+		    }
+		    ST(&(xo[WS(os, 48)]), VSUB(T7h, T7k), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T7l, T7m), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 16)]), VADD(T7h, T7k), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 32)]), VSUB(T7l, T7m), ovs, &(xo[0]));
+	       }
+	       {
+		    V T76, T7a, T73, T7b, T6Z, T72;
+		    T76 = VSUB(T74, T75);
+		    T7a = VSUB(T78, T79);
+		    T6Z = VSUB(T6X, T6Y);
+		    T72 = VSUB(T70, T71);
+		    T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
+		    T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
+		    {
+			 V T77, T7c, T7d, T7e;
+			 T77 = VBYI(VSUB(T73, T76));
+			 T7c = VSUB(T7a, T7b);
+			 ST(&(xo[WS(os, 24)]), VADD(T77, T7c), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 40)]), VSUB(T7c, T77), ovs, &(xo[0]));
+			 T7d = VBYI(VADD(T76, T73));
+			 T7e = VADD(T7a, T7b);
+			 ST(&(xo[WS(os, 8)]), VADD(T7d, T7e), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 56)]), VSUB(T7e, T7d), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
+		    T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
+		    T6C = VSUB(T6y, T6B);
+		    T6S = VADD(T6B, T6y);
+		    T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
+		    T6I = VSUB(T6G, T6H);
+		    T6P = VADD(T6G, T6H);
+		    {
+			 V T6j, T6q, T6J, T6K;
+			 T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
+			 T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
+			 T6r = VSUB(T6j, T6q);
+			 T6Q = VADD(T6j, T6q);
+			 T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
+			 T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
+			 T6L = VSUB(T6J, T6K);
+			 T6T = VADD(T6J, T6K);
+		    }
+		    {
+			 V T6D, T6M, T6V, T6W;
+			 T6D = VBYI(VSUB(T6r, T6C));
+			 T6M = VSUB(T6I, T6L);
+			 ST(&(xo[WS(os, 20)]), VADD(T6D, T6M), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 44)]), VSUB(T6M, T6D), ovs, &(xo[0]));
+			 T6V = VSUB(T6P, T6Q);
+			 T6W = VBYI(VSUB(T6T, T6S));
+			 ST(&(xo[WS(os, 36)]), VSUB(T6V, T6W), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 28)]), VADD(T6V, T6W), ovs, &(xo[0]));
+		    }
+		    {
+			 V T6N, T6O, T6R, T6U;
+			 T6N = VBYI(VADD(T6C, T6r));
+			 T6O = VADD(T6I, T6L);
+			 ST(&(xo[WS(os, 12)]), VADD(T6N, T6O), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 52)]), VSUB(T6O, T6N), ovs, &(xo[0]));
+			 T6R = VADD(T6P, T6Q);
+			 T6U = VBYI(VADD(T6S, T6T));
+			 ST(&(xo[WS(os, 60)]), VSUB(T6R, T6U), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 4)]), VADD(T6R, T6U), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
+		    {
+			 V T5L, T5M, T5Z, T60;
+			 T5L = VADD(T4p, T4w);
+			 T5M = VADD(T5o, T5p);
+			 T5N = VSUB(T5L, T5M);
+			 T68 = VADD(T5L, T5M);
+			 T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
+			 T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
+			 T61 = VSUB(T5Z, T60);
+			 T69 = VADD(T5Z, T60);
+		    }
+		    {
+			 V T5Q, T5T, T5W, T5X;
+			 T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
+			 T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
+			 T5U = VSUB(T5Q, T5T);
+			 T65 = VADD(T5Q, T5T);
+			 T5W = VADD(T4E, T4L);
+			 T5X = VADD(T5u, T5r);
+			 T5Y = VSUB(T5W, T5X);
+			 T66 = VADD(T5X, T5W);
+		    }
+		    {
+			 V T5V, T62, T6b, T6c;
+			 T5V = VADD(T5N, T5U);
+			 T62 = VBYI(VADD(T5Y, T61));
+			 ST(&(xo[WS(os, 50)]), VSUB(T5V, T62), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 14)]), VADD(T5V, T62), ovs, &(xo[0]));
+			 T6b = VBYI(VADD(T66, T65));
+			 T6c = VADD(T68, T69);
+			 ST(&(xo[WS(os, 2)]), VADD(T6b, T6c), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 62)]), VSUB(T6c, T6b), ovs, &(xo[0]));
+		    }
+		    {
+			 V T63, T64, T67, T6a;
+			 T63 = VSUB(T5N, T5U);
+			 T64 = VBYI(VSUB(T61, T5Y));
+			 ST(&(xo[WS(os, 46)]), VSUB(T63, T64), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 18)]), VADD(T63, T64), ovs, &(xo[0]));
+			 T67 = VBYI(VSUB(T65, T66));
+			 T6a = VSUB(T68, T69);
+			 ST(&(xo[WS(os, 30)]), VADD(T67, T6a), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 34)]), VSUB(T6a, T67), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
+		    {
+			 V Tr, T10, T2t, T2u;
+			 Tr = VSUB(Tb, Tq);
+			 T10 = VSUB(TI, TZ);
+			 T11 = VSUB(Tr, T10);
+			 T2C = VADD(Tr, T10);
+			 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
+			 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
+			 T2v = VSUB(T2t, T2u);
+			 T2D = VADD(T2t, T2u);
+		    }
+		    {
+			 V T1C, T2d, T2i, T2r;
+			 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
+			 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
+			 T2e = VSUB(T1C, T2d);
+			 T2z = VADD(T1C, T2d);
+			 T2i = VSUB(T2g, T2h);
+			 T2r = VSUB(T2l, T2q);
+			 T2s = VSUB(T2i, T2r);
+			 T2A = VADD(T2r, T2i);
+		    }
+		    {
+			 V T2f, T2w, T2F, T2G;
+			 T2f = VADD(T11, T2e);
+			 T2w = VBYI(VADD(T2s, T2v));
+			 ST(&(xo[WS(os, 53)]), VSUB(T2f, T2w), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 11)]), VADD(T2f, T2w), ovs, &(xo[WS(os, 1)]));
+			 T2F = VBYI(VADD(T2A, T2z));
+			 T2G = VADD(T2C, T2D);
+			 ST(&(xo[WS(os, 5)]), VADD(T2F, T2G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 59)]), VSUB(T2G, T2F), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T2x, T2y, T2B, T2E;
+			 T2x = VSUB(T11, T2e);
+			 T2y = VBYI(VSUB(T2v, T2s));
+			 ST(&(xo[WS(os, 43)]), VSUB(T2x, T2y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 21)]), VADD(T2x, T2y), ovs, &(xo[WS(os, 1)]));
+			 T2B = VBYI(VSUB(T2z, T2A));
+			 T2E = VSUB(T2C, T2D);
+			 ST(&(xo[WS(os, 27)]), VADD(T2B, T2E), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 37)]), VSUB(T2E, T2B), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
+		    {
+			 V T3f, T3m, T3H, T3I;
+			 T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
+			 T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
+			 T3n = VSUB(T3f, T3m);
+			 T3O = VADD(T3f, T3m);
+			 T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
+			 T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
+			 T3J = VSUB(T3H, T3I);
+			 T3R = VADD(T3H, T3I);
+		    }
+		    {
+			 V T3u, T3x, T3C, T3F;
+			 T3u = VADD(T3q, T3t);
+			 T3x = VADD(T3v, T3w);
+			 T3y = VSUB(T3u, T3x);
+			 T3Q = VADD(T3x, T3u);
+			 T3C = VADD(T3A, T3B);
+			 T3F = VADD(T3D, T3E);
+			 T3G = VSUB(T3C, T3F);
+			 T3N = VADD(T3C, T3F);
+		    }
+		    {
+			 V T3z, T3K, T3T, T3U;
+			 T3z = VBYI(VSUB(T3n, T3y));
+			 T3K = VSUB(T3G, T3J);
+			 ST(&(xo[WS(os, 17)]), VADD(T3z, T3K), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 47)]), VSUB(T3K, T3z), ovs, &(xo[WS(os, 1)]));
+			 T3T = VSUB(T3N, T3O);
+			 T3U = VBYI(VSUB(T3R, T3Q));
+			 ST(&(xo[WS(os, 33)]), VSUB(T3T, T3U), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VADD(T3T, T3U), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T3L, T3M, T3P, T3S;
+			 T3L = VBYI(VADD(T3y, T3n));
+			 T3M = VADD(T3G, T3J);
+			 ST(&(xo[WS(os, 15)]), VADD(T3L, T3M), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 49)]), VSUB(T3M, T3L), ovs, &(xo[WS(os, 1)]));
+			 T3P = VADD(T3N, T3O);
+			 T3S = VBYI(VADD(T3Q, T3R));
+			 ST(&(xo[WS(os, 63)]), VSUB(T3P, T3S), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(T3P, T3S), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
+		    {
+			 V T4x, T4M, T5x, T5y;
+			 T4x = VSUB(T4p, T4w);
+			 T4M = VSUB(T4E, T4L);
+			 T4N = VSUB(T4x, T4M);
+			 T5G = VADD(T4x, T4M);
+			 T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
+			 T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
+			 T5z = VSUB(T5x, T5y);
+			 T5H = VADD(T5x, T5y);
+		    }
+		    {
+			 V T54, T5l, T5q, T5v;
+			 T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
+			 T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
+			 T5m = VSUB(T54, T5l);
+			 T5D = VADD(T54, T5l);
+			 T5q = VSUB(T5o, T5p);
+			 T5v = VSUB(T5r, T5u);
+			 T5w = VSUB(T5q, T5v);
+			 T5E = VADD(T5v, T5q);
+		    }
+		    {
+			 V T5n, T5A, T5J, T5K;
+			 T5n = VADD(T4N, T5m);
+			 T5A = VBYI(VADD(T5w, T5z));
+			 ST(&(xo[WS(os, 54)]), VSUB(T5n, T5A), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 10)]), VADD(T5n, T5A), ovs, &(xo[0]));
+			 T5J = VBYI(VADD(T5E, T5D));
+			 T5K = VADD(T5G, T5H);
+			 ST(&(xo[WS(os, 6)]), VADD(T5J, T5K), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 58)]), VSUB(T5K, T5J), ovs, &(xo[0]));
+		    }
+		    {
+			 V T5B, T5C, T5F, T5I;
+			 T5B = VSUB(T4N, T5m);
+			 T5C = VBYI(VSUB(T5z, T5w));
+			 ST(&(xo[WS(os, 42)]), VSUB(T5B, T5C), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 22)]), VADD(T5B, T5C), ovs, &(xo[0]));
+			 T5F = VBYI(VSUB(T5D, T5E));
+			 T5I = VSUB(T5G, T5H);
+			 ST(&(xo[WS(os, 26)]), VADD(T5F, T5I), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 38)]), VSUB(T5I, T5F), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
+		    {
+			 V T2H, T2I, T2V, T2W;
+			 T2H = VADD(Tb, Tq);
+			 T2I = VADD(T2g, T2h);
+			 T2J = VSUB(T2H, T2I);
+			 T34 = VADD(T2H, T2I);
+			 T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
+			 T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
+			 T2X = VSUB(T2V, T2W);
+			 T35 = VADD(T2V, T2W);
+		    }
+		    {
+			 V T2M, T2P, T2S, T2T;
+			 T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
+			 T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
+			 T2Q = VSUB(T2M, T2P);
+			 T31 = VADD(T2M, T2P);
+			 T2S = VADD(TI, TZ);
+			 T2T = VADD(T2q, T2l);
+			 T2U = VSUB(T2S, T2T);
+			 T32 = VADD(T2T, T2S);
+		    }
+		    {
+			 V T2R, T2Y, T37, T38;
+			 T2R = VADD(T2J, T2Q);
+			 T2Y = VBYI(VADD(T2U, T2X));
+			 ST(&(xo[WS(os, 51)]), VSUB(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 13)]), VADD(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
+			 T37 = VBYI(VADD(T32, T31));
+			 T38 = VADD(T34, T35);
+			 ST(&(xo[WS(os, 3)]), VADD(T37, T38), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 61)]), VSUB(T38, T37), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T2Z, T30, T33, T36;
+			 T2Z = VSUB(T2J, T2Q);
+			 T30 = VBYI(VSUB(T2X, T2U));
+			 ST(&(xo[WS(os, 45)]), VSUB(T2Z, T30), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 19)]), VADD(T2Z, T30), ovs, &(xo[WS(os, 1)]));
+			 T33 = VBYI(VSUB(T31, T32));
+			 T36 = VSUB(T34, T35);
+			 ST(&(xo[WS(os, 29)]), VADD(T33, T36), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 35)]), VSUB(T36, T33), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
+		    {
+			 V T3X, T40, T49, T4a;
+			 T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
+			 T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
+			 T41 = VSUB(T3X, T40);
+			 T4g = VADD(T3X, T40);
+			 T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
+			 T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
+			 T4b = VSUB(T49, T4a);
+			 T4j = VADD(T49, T4a);
+		    }
+		    {
+			 V T42, T43, T46, T47;
+			 T42 = VSUB(T3D, T3E);
+			 T43 = VSUB(T3w, T3v);
+			 T44 = VSUB(T42, T43);
+			 T4i = VADD(T43, T42);
+			 T46 = VSUB(T3A, T3B);
+			 T47 = VSUB(T3q, T3t);
+			 T48 = VSUB(T46, T47);
+			 T4f = VADD(T46, T47);
+		    }
+		    {
+			 V T45, T4c, T4l, T4m;
+			 T45 = VBYI(VSUB(T41, T44));
+			 T4c = VSUB(T48, T4b);
+			 ST(&(xo[WS(os, 23)]), VADD(T45, T4c), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 41)]), VSUB(T4c, T45), ovs, &(xo[WS(os, 1)]));
+			 T4l = VSUB(T4f, T4g);
+			 T4m = VBYI(VSUB(T4j, T4i));
+			 ST(&(xo[WS(os, 39)]), VSUB(T4l, T4m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 25)]), VADD(T4l, T4m), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T4d, T4e, T4h, T4k;
+			 T4d = VBYI(VADD(T44, T41));
+			 T4e = VADD(T48, T4b);
+			 ST(&(xo[WS(os, 9)]), VADD(T4d, T4e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 55)]), VSUB(T4e, T4d), ovs, &(xo[WS(os, 1)]));
+			 T4h = VADD(T4f, T4g);
+			 T4k = VBYI(VADD(T4i, T4j));
+			 ST(&(xo[WS(os, 57)]), VSUB(T4h, T4k), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(T4h, T4k), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {404, 72, 52, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_64) (planner *p) {
+     X(kdft_register) (p, n1bv_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:58 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 7 -name n1bv_7 -include n1b.h */
+
+/*
+ * This function contains 30 FP additions, 24 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 21 fused multiply/add),
+ * 37 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(14, is), MAKE_VOLATILE_STRIDE(14, os)) {
+	       V T1, T2, T3, T8, T9, T5, T6;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Tg, T4, Te, Ta, Tf, T7;
+		    Tg = VSUB(T2, T3);
+		    T4 = VADD(T2, T3);
+		    Te = VSUB(T8, T9);
+		    Ta = VADD(T8, T9);
+		    Tf = VSUB(T5, T6);
+		    T7 = VADD(T5, T6);
+		    {
+			 V Tr, Tj, Tm, Th, To, Tb;
+			 Tr = VFMA(LDK(KP554958132), Te, Tg);
+			 Tj = VFNMS(LDK(KP356895867), T4, Ta);
+			 Tm = VFMA(LDK(KP554958132), Tf, Te);
+			 Th = VFNMS(LDK(KP554958132), Tg, Tf);
+			 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, Ta))), ovs, &(xo[0]));
+			 To = VFNMS(LDK(KP356895867), T7, T4);
+			 Tb = VFNMS(LDK(KP356895867), Ta, T7);
+			 {
+			      V Ts, Tk, Tn, Ti;
+			      Ts = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Tr, Tf));
+			      Tk = VFNMS(LDK(KP692021471), Tj, T7);
+			      Tn = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tm, Tg));
+			      Ti = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Th, Te));
+			      {
+				   V Tp, Tc, Tl, Tq, Td;
+				   Tp = VFNMS(LDK(KP692021471), To, Ta);
+				   Tc = VFNMS(LDK(KP692021471), Tb, T4);
+				   Tl = VFNMS(LDK(KP900968867), Tk, T1);
+				   Tq = VFNMS(LDK(KP900968867), Tp, T1);
+				   Td = VFNMS(LDK(KP900968867), Tc, T1);
+				   ST(&(xo[WS(os, 5)]), VFNMSI(Tn, Tl), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 2)]), VFMAI(Tn, Tl), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 6)]), VFNMSI(Ts, Tq), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 1)]), VFMAI(Ts, Tq), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 4)]), VFNMSI(Ti, Td), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 3)]), VFMAI(Ti, Td), ovs, &(xo[WS(os, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 7, XSIMD_STRING("n1bv_7"), {9, 3, 21, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_7) (planner *p) {
+     X(kdft_register) (p, n1bv_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 7 -name n1bv_7 -include n1b.h */
+
+/*
+ * This function contains 30 FP additions, 18 FP multiplications,
+ * (or, 18 additions, 6 multiplications, 12 fused multiply/add),
+ * 24 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(14, is), MAKE_VOLATILE_STRIDE(14, os)) {
+	       V Tb, T9, Tc, T3, Te, T6, Td, T7, T8, Ti, Tj;
+	       Tb = LD(&(xi[0]), ivs, &(xi[0]));
+	       T7 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T9 = VSUB(T7, T8);
+	       Tc = VADD(T7, T8);
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T2 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    Te = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T6 = VSUB(T4, T5);
+		    Td = VADD(T4, T5);
+	       }
+	       ST(&(xo[0]), VADD(Tb, VADD(Te, VADD(Tc, Td))), ovs, &(xo[0]));
+	       Ti = VBYI(VFNMS(LDK(KP781831482), T6, VFNMS(LDK(KP433883739), T9, VMUL(LDK(KP974927912), T3))));
+	       Tj = VFMA(LDK(KP623489801), Td, VFNMS(LDK(KP900968867), Tc, VFNMS(LDK(KP222520933), Te, Tb)));
+	       ST(&(xo[WS(os, 2)]), VADD(Ti, Tj), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 5)]), VSUB(Tj, Ti), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V Ta, Tf, Tg, Th;
+		    Ta = VBYI(VFMA(LDK(KP433883739), T3, VFNMS(LDK(KP781831482), T9, VMUL(LDK(KP974927912), T6))));
+		    Tf = VFMA(LDK(KP623489801), Tc, VFNMS(LDK(KP222520933), Td, VFNMS(LDK(KP900968867), Te, Tb)));
+		    ST(&(xo[WS(os, 3)]), VADD(Ta, Tf), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 4)]), VSUB(Tf, Ta), ovs, &(xo[0]));
+		    Tg = VBYI(VFMA(LDK(KP781831482), T3, VFMA(LDK(KP974927912), T9, VMUL(LDK(KP433883739), T6))));
+		    Th = VFMA(LDK(KP623489801), Te, VFNMS(LDK(KP900968867), Td, VFNMS(LDK(KP222520933), Tc, Tb)));
+		    ST(&(xo[WS(os, 1)]), VADD(Tg, Th), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VSUB(Th, Tg), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 7, XSIMD_STRING("n1bv_7"), {18, 6, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_7) (planner *p) {
+     X(kdft_register) (p, n1bv_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 8 -name n1bv_8 -include n1b.h */
+
+/*
+ * This function contains 26 FP additions, 10 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 10 fused multiply/add),
+ * 30 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T1, T2, Tc, Td, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Tj, Te, Tk, T6, Tm, T9, Tn, Tp, Tl;
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    T6 = VSUB(T4, T5);
+		    Tm = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tn = VADD(T7, T8);
+		    Tp = VADD(Tj, Tk);
+		    Tl = VSUB(Tj, Tk);
+		    {
+			 V Tq, To, Ta, Tf;
+			 Tq = VADD(Tm, Tn);
+			 To = VSUB(Tm, Tn);
+			 Ta = VADD(T6, T9);
+			 Tf = VSUB(T6, T9);
+			 {
+			      V Tg, Ti, Tb, Th;
+			      ST(&(xo[WS(os, 2)]), VFMAI(To, Tl), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 6)]), VFNMSI(To, Tl), ovs, &(xo[0]));
+			      ST(&(xo[0]), VADD(Tp, Tq), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VSUB(Tp, Tq), ovs, &(xo[0]));
+			      Tg = VFNMS(LDK(KP707106781), Tf, Te);
+			      Ti = VFMA(LDK(KP707106781), Tf, Te);
+			      Tb = VFNMS(LDK(KP707106781), Ta, T3);
+			      Th = VFMA(LDK(KP707106781), Ta, T3);
+			      ST(&(xo[WS(os, 7)]), VFNMSI(Ti, Th), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 1)]), VFMAI(Ti, Th), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 5)]), VFMAI(Tg, Tb), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 3)]), VFNMSI(Tg, Tb), ovs, &(xo[WS(os, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n1bv_8"), {16, 0, 10, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_8) (planner *p) {
+     X(kdft_register) (p, n1bv_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 8 -name n1bv_8 -include n1b.h */
+
+/*
+ * This function contains 26 FP additions, 2 FP multiplications,
+ * (or, 26 additions, 2 multiplications, 0 fused multiply/add),
+ * 22 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V Ta, Tk, Te, Tj, T7, Tn, Tf, Tm;
+	       {
+		    V T8, T9, Tc, Td;
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T9 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Ta = VSUB(T8, T9);
+		    Tk = VADD(T8, T9);
+		    Tc = LD(&(xi[0]), ivs, &(xi[0]));
+		    Td = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Te = VSUB(Tc, Td);
+		    Tj = VADD(Tc, Td);
+		    {
+			 V T1, T2, T3, T4, T5, T6;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 T4 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 T7 = VMUL(LDK(KP707106781), VSUB(T3, T6));
+			 Tn = VADD(T4, T5);
+			 Tf = VMUL(LDK(KP707106781), VADD(T3, T6));
+			 Tm = VADD(T1, T2);
+		    }
+	       }
+	       {
+		    V Tb, Tg, Tp, Tq;
+		    Tb = VBYI(VSUB(T7, Ta));
+		    Tg = VSUB(Te, Tf);
+		    ST(&(xo[WS(os, 3)]), VADD(Tb, Tg), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tg, Tb), ovs, &(xo[WS(os, 1)]));
+		    Tp = VADD(Tj, Tk);
+		    Tq = VADD(Tm, Tn);
+		    ST(&(xo[WS(os, 4)]), VSUB(Tp, Tq), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(Tp, Tq), ovs, &(xo[0]));
+	       }
+	       {
+		    V Th, Ti, Tl, To;
+		    Th = VBYI(VADD(Ta, T7));
+		    Ti = VADD(Te, Tf);
+		    ST(&(xo[WS(os, 1)]), VADD(Th, Ti), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VSUB(Ti, Th), ovs, &(xo[WS(os, 1)]));
+		    Tl = VSUB(Tj, Tk);
+		    To = VBYI(VSUB(Tm, Tn));
+		    ST(&(xo[WS(os, 6)]), VSUB(Tl, To), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tl, To), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n1bv_8"), {26, 2, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_8) (planner *p) {
+     X(kdft_register) (p, n1bv_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include n1b.h */
+
+/*
+ * This function contains 46 FP additions, 38 FP multiplications,
+ * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
+ * 68 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
+	       V T1, T2, T3, T6, Tf, T7, T8, Tb, Tc, Tp, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+	       Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Tp = VSUB(T2, T3);
+	       T4 = VADD(T2, T3);
+	       {
+		    V Te, T9, Tg, Td, TF, T5;
+		    Te = VSUB(T8, T7);
+		    T9 = VADD(T7, T8);
+		    Tg = VADD(Tb, Tc);
+		    Td = VSUB(Tb, Tc);
+		    TF = VADD(T1, T4);
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    {
+			 V Ta, TH, Th, TG;
+			 Ta = VFNMS(LDK(KP500000000), T9, T6);
+			 TH = VADD(T6, T9);
+			 Th = VFNMS(LDK(KP500000000), Tg, Tf);
+			 TG = VADD(Tf, Tg);
+			 {
+			      V Tr, Tu, Tm, Tv, Ts, Ti, TI, TK;
+			      Tr = VFNMS(LDK(KP152703644), Te, Ta);
+			      Tu = VFMA(LDK(KP203604859), Ta, Te);
+			      Tm = VFNMS(LDK(KP439692620), Td, Ta);
+			      Tv = VFNMS(LDK(KP726681596), Td, Th);
+			      Ts = VFMA(LDK(KP968908795), Th, Td);
+			      Ti = VFNMS(LDK(KP586256827), Th, Te);
+			      TI = VADD(TG, TH);
+			      TK = VMUL(LDK(KP866025403), VSUB(TG, TH));
+			      {
+				   V Tt, TA, Tw, Tz, Tj, TJ, To, TE, Tn;
+				   Tn = VFNMS(LDK(KP420276625), Tm, Te);
+				   Tt = VFNMS(LDK(KP673648177), Ts, Tr);
+				   TA = VFMA(LDK(KP673648177), Ts, Tr);
+				   Tw = VFMA(LDK(KP898197570), Tv, Tu);
+				   Tz = VFNMS(LDK(KP898197570), Tv, Tu);
+				   Tj = VFNMS(LDK(KP347296355), Ti, Td);
+				   ST(&(xo[0]), VADD(TI, TF), ovs, &(xo[0]));
+				   TJ = VFNMS(LDK(KP500000000), TI, TF);
+				   To = VFNMS(LDK(KP826351822), Tn, Th);
+				   TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tp, TA));
+				   {
+					V TB, TD, Tx, Tk, Tq, TC, Ty, Tl;
+					TB = VFMA(LDK(KP666666666), TA, Tz);
+					TD = VFMA(LDK(KP852868531), Tw, T5);
+					Tx = VFNMS(LDK(KP500000000), Tw, Tt);
+					Tk = VFNMS(LDK(KP907603734), Tj, Ta);
+					ST(&(xo[WS(os, 6)]), VFNMSI(TK, TJ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 3)]), VFMAI(TK, TJ), ovs, &(xo[WS(os, 1)]));
+					Tq = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tp, To));
+					TC = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TB, Tp));
+					ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 1)]), VFMAI(TE, TD), ovs, &(xo[WS(os, 1)]));
+					Ty = VFMA(LDK(KP852868531), Tx, T5);
+					Tl = VFNMS(LDK(KP939692620), Tk, T5);
+					ST(&(xo[WS(os, 5)]), VFNMSI(TC, Ty), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 4)]), VFMAI(TC, Ty), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 2)]), VFMAI(Tq, Tl), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 7)]), VFNMSI(Tq, Tl), ovs, &(xo[WS(os, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_9) (planner *p) {
+     X(kdft_register) (p, n1bv_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include n1b.h */
+
+/*
+ * This function contains 46 FP additions, 26 FP multiplications,
+ * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
+ * 41 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "n1b.h"
+
+static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
+	       V T5, Ty, Tm, Ti, Tw, Th, Tj, To, Tb, Tv, Ta, Tc, Tn;
+	       {
+		    V T1, T2, T3, T4;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T4 = VADD(T2, T3);
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    Ty = VADD(T1, T4);
+		    Tm = VMUL(LDK(KP866025403), VSUB(T2, T3));
+	       }
+	       {
+		    V Td, Tg, Te, Tf;
+		    Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Tg = VADD(Te, Tf);
+		    Ti = VSUB(Te, Tf);
+		    Tw = VADD(Td, Tg);
+		    Th = VFNMS(LDK(KP500000000), Tg, Td);
+		    Tj = VFNMS(LDK(KP852868531), Ti, VMUL(LDK(KP173648177), Th));
+		    To = VFMA(LDK(KP150383733), Ti, VMUL(LDK(KP984807753), Th));
+	       }
+	       {
+		    V T6, T9, T7, T8;
+		    T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = VADD(T7, T8);
+		    Tb = VSUB(T7, T8);
+		    Tv = VADD(T6, T9);
+		    Ta = VFNMS(LDK(KP500000000), T9, T6);
+		    Tc = VFNMS(LDK(KP556670399), Tb, VMUL(LDK(KP766044443), Ta));
+		    Tn = VFMA(LDK(KP663413948), Tb, VMUL(LDK(KP642787609), Ta));
+	       }
+	       {
+		    V Tx, Tz, TA, Tt, Tu;
+		    Tx = VBYI(VMUL(LDK(KP866025403), VSUB(Tv, Tw)));
+		    Tz = VADD(Tv, Tw);
+		    TA = VFNMS(LDK(KP500000000), Tz, Ty);
+		    ST(&(xo[WS(os, 3)]), VADD(Tx, TA), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[0]), VADD(Ty, Tz), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VSUB(TA, Tx), ovs, &(xo[0]));
+		    Tt = VFMA(LDK(KP852868531), Tb, VFMA(LDK(KP173648177), Ta, VFMA(LDK(KP296198132), Ti, VFNMS(LDK(KP939692620), Th, T5))));
+		    Tu = VBYI(VSUB(VFMA(LDK(KP984807753), Ta, VFMA(LDK(KP813797681), Ti, VFNMS(LDK(KP150383733), Tb, VMUL(LDK(KP342020143), Th)))), Tm));
+		    ST(&(xo[WS(os, 7)]), VSUB(Tt, Tu), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tt, Tu), ovs, &(xo[0]));
+		    {
+			 V Tl, Ts, Tq, Tr, Tk, Tp;
+			 Tk = VADD(Tc, Tj);
+			 Tl = VADD(T5, Tk);
+			 Ts = VFMA(LDK(KP866025403), VSUB(To, Tn), VFNMS(LDK(KP500000000), Tk, T5));
+			 Tp = VADD(Tn, To);
+			 Tq = VBYI(VADD(Tm, Tp));
+			 Tr = VBYI(VADD(Tm, VFNMS(LDK(KP500000000), Tp, VMUL(LDK(KP866025403), VSUB(Tc, Tj)))));
+			 ST(&(xo[WS(os, 8)]), VSUB(Tl, Tq), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 5)]), VSUB(Ts, Tr), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(Tl, Tq), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VADD(Tr, Ts), ovs, &(xo[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1bv_9) (planner *p) {
+     X(kdft_register) (p, n1bv_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n1fv_10 -include n1f.h */
+
+/*
+ * This function contains 42 FP additions, 22 FP multiplications,
+ * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tb, Tr, T3, Ts, T6, Tw, Tg, Tt, T9, Tc, T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T4, T5, Te, Tf, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Tr = VADD(T1, T2);
+		    T3 = VSUB(T1, T2);
+		    Ts = VADD(T4, T5);
+		    T6 = VSUB(T4, T5);
+		    Tw = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    Tt = VADD(T7, T8);
+		    T9 = VSUB(T7, T8);
+		    Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       {
+		    V TD, Tu, Tm, Ta, Td, Tv;
+		    TD = VSUB(Ts, Tt);
+		    Tu = VADD(Ts, Tt);
+		    Tm = VSUB(T6, T9);
+		    Ta = VADD(T6, T9);
+		    Td = VSUB(Tb, Tc);
+		    Tv = VADD(Tb, Tc);
+		    {
+			 V TC, Tx, Tn, Th;
+			 TC = VSUB(Tv, Tw);
+			 Tx = VADD(Tv, Tw);
+			 Tn = VSUB(Td, Tg);
+			 Th = VADD(Td, Tg);
+			 {
+			      V Ty, TA, TE, TG, Ti, Tk, To, Tq, Tz, Tj;
+			      Ty = VADD(Tu, Tx);
+			      TA = VSUB(Tu, Tx);
+			      TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC));
+			      TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD));
+			      Ti = VADD(Ta, Th);
+			      Tk = VSUB(Ta, Th);
+			      To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm));
+			      Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn));
+			      Tz = VFNMS(LDK(KP250000000), Ty, Tr);
+			      ST(&(xo[0]), VADD(Tr, Ty), ovs, &(xo[0]));
+			      Tj = VFNMS(LDK(KP250000000), Ti, T3);
+			      ST(&(xo[WS(os, 5)]), VADD(T3, Ti), ovs, &(xo[WS(os, 1)]));
+			      {
+				   V TB, TF, Tl, Tp;
+				   TB = VFNMS(LDK(KP559016994), TA, Tz);
+				   TF = VFMA(LDK(KP559016994), TA, Tz);
+				   Tl = VFMA(LDK(KP559016994), Tk, Tj);
+				   Tp = VFNMS(LDK(KP559016994), Tk, Tj);
+				   ST(&(xo[WS(os, 4)]), VFMAI(TG, TF), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 6)]), VFNMSI(TG, TF), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 8)]), VFNMSI(TE, TB), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 2)]), VFMAI(TE, TB), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 3)]), VFNMSI(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 7)]), VFMAI(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 9)]), VFMAI(To, Tl), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 1)]), VFNMSI(To, Tl), ovs, &(xo[WS(os, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n1fv_10"), {24, 4, 18, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_10) (planner *p) {
+     X(kdft_register) (p, n1fv_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n1fv_10 -include n1f.h */
+
+/*
+ * This function contains 42 FP additions, 12 FP multiplications,
+ * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
+ * 33 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Ti, Ty, Tm, Tn, Tw, Tt, Tz, TA, TB, T7, Te, Tj, Tg, Th;
+	       Tg = LD(&(xi[0]), ivs, &(xi[0]));
+	       Th = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       Ti = VSUB(Tg, Th);
+	       Ty = VADD(Tg, Th);
+	       {
+		    V T3, Tu, Td, Ts, T6, Tv, Ta, Tr;
+		    {
+			 V T1, T2, Tb, Tc;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tu = VADD(T1, T2);
+			 Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 Ts = VADD(Tb, Tc);
+		    }
+		    {
+			 V T4, T5, T8, T9;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tv = VADD(T4, T5);
+			 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 Tr = VADD(T8, T9);
+		    }
+		    Tm = VSUB(T3, T6);
+		    Tn = VSUB(Ta, Td);
+		    Tw = VSUB(Tu, Tv);
+		    Tt = VSUB(Tr, Ts);
+		    Tz = VADD(Tu, Tv);
+		    TA = VADD(Tr, Ts);
+		    TB = VADD(Tz, TA);
+		    T7 = VADD(T3, T6);
+		    Te = VADD(Ta, Td);
+		    Tj = VADD(T7, Te);
+	       }
+	       ST(&(xo[WS(os, 5)]), VADD(Ti, Tj), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Ty, TB), ovs, &(xo[0]));
+	       {
+		    V To, Tq, Tl, Tp, Tf, Tk;
+		    To = VBYI(VFMA(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tn)));
+		    Tq = VBYI(VFNMS(LDK(KP587785252), Tm, VMUL(LDK(KP951056516), Tn)));
+		    Tf = VMUL(LDK(KP559016994), VSUB(T7, Te));
+		    Tk = VFNMS(LDK(KP250000000), Tj, Ti);
+		    Tl = VADD(Tf, Tk);
+		    Tp = VSUB(Tk, Tf);
+		    ST(&(xo[WS(os, 1)]), VSUB(Tl, To), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VADD(Tq, Tp), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VADD(To, Tl), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VSUB(Tp, Tq), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V Tx, TF, TE, TG, TC, TD;
+		    Tx = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tt)));
+		    TF = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
+		    TC = VFNMS(LDK(KP250000000), TB, Ty);
+		    TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
+		    TE = VSUB(TC, TD);
+		    TG = VADD(TD, TC);
+		    ST(&(xo[WS(os, 2)]), VADD(Tx, TE), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VSUB(TG, TF), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VSUB(TE, Tx), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(TF, TG), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n1fv_10"), {36, 6, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_10) (planner *p) {
+     X(kdft_register) (p, n1fv_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include n1f.h */
+
+/*
+ * This function contains 70 FP additions, 60 FP multiplications,
+ * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
+ * 67 stack variables, 11 constants, and 22 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
+     DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
+     DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
+     DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
+     DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
+     DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
+     DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
+     DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
+	       V T1, Tb, T4, Tp, Tg, Tq, T7, Tn, Ta, Tm, Tc, Tr;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T2, T3, Te, Tf;
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T5, T6, T8, T9;
+			 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tp = VSUB(T3, T2);
+			 Tg = VADD(Te, Tf);
+			 Tq = VSUB(Tf, Te);
+			 T7 = VADD(T5, T6);
+			 Tn = VSUB(T6, T5);
+			 Ta = VADD(T8, T9);
+			 Tm = VSUB(T9, T8);
+			 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    }
+	       }
+	       Tr = VFMA(LDK(KP521108558), Tq, Tp);
+	       {
+		    V TS, TE, Th, Td, To, T12, TO, TB, T11, TN, TA, TF;
+		    T11 = VFNMS(LDK(KP521108558), Tp, Tn);
+		    TN = VFNMS(LDK(KP342584725), T7, Tg);
+		    TA = VFMA(LDK(KP521108558), Tm, Tq);
+		    TS = VFMA(LDK(KP715370323), Tm, Tp);
+		    TE = VFNMS(LDK(KP342584725), T4, Ta);
+		    Th = VFNMS(LDK(KP342584725), Ta, T7);
+		    Td = VADD(Tb, Tc);
+		    To = VSUB(Tc, Tb);
+		    T12 = VFNMS(LDK(KP715370323), T11, Tm);
+		    TO = VFNMS(LDK(KP634356270), TN, T4);
+		    TB = VFNMS(LDK(KP715370323), TA, Tn);
+		    TF = VFNMS(LDK(KP634356270), TE, Tg);
+		    {
+			 V T14, TD, TV, Tu, TY, Tx, Tk, TR, TI, TM, TJ, TT, Ts;
+			 TJ = VFNMS(LDK(KP521108558), Tn, To);
+			 TT = VFMA(LDK(KP372785597), To, TS);
+			 Ts = VFMA(LDK(KP715370323), Tr, To);
+			 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
+			 {
+			      V TW, Tv, Ti, T13;
+			      TW = VFNMS(LDK(KP342584725), Tg, Td);
+			      Tv = VFNMS(LDK(KP342584725), Td, T4);
+			      Ti = VFNMS(LDK(KP634356270), Th, Td);
+			      T13 = VFNMS(LDK(KP830830026), T12, To);
+			      {
+				   V TP, TC, TG, TK;
+				   TP = VFNMS(LDK(KP778434453), TO, Ta);
+				   TC = VFMA(LDK(KP830830026), TB, Tp);
+				   TG = VFNMS(LDK(KP778434453), TF, Td);
+				   TK = VFMA(LDK(KP715370323), TJ, Tq);
+				   {
+					V TU, Tt, TX, Tw;
+					TU = VFNMS(LDK(KP830830026), TT, Tq);
+					Tt = VFMA(LDK(KP830830026), Ts, Tn);
+					TX = VFNMS(LDK(KP634356270), TW, Ta);
+					Tw = VFNMS(LDK(KP634356270), Tv, T7);
+					{
+					     V Tj, TQ, TH, TL;
+					     Tj = VFNMS(LDK(KP778434453), Ti, T4);
+					     T14 = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), T13, Tq));
+					     TQ = VFNMS(LDK(KP876768831), TP, Td);
+					     TD = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TC, To));
+					     TH = VFNMS(LDK(KP876768831), TG, T7);
+					     TL = VFNMS(LDK(KP830830026), TK, Tm);
+					     TV = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TU, Tn));
+					     Tu = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), Tt, Tm));
+					     TY = VFNMS(LDK(KP778434453), TX, T7);
+					     Tx = VFNMS(LDK(KP778434453), Tw, Tg);
+					     Tk = VFNMS(LDK(KP876768831), Tj, Tg);
+					     TR = VFNMS(LDK(KP959492973), TQ, T1);
+					     TI = VFNMS(LDK(KP959492973), TH, T1);
+					     TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tp));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V TZ, Ty, Tl, T10, Tz;
+			      TZ = VFNMS(LDK(KP876768831), TY, T4);
+			      Ty = VFNMS(LDK(KP876768831), Tx, Ta);
+			      Tl = VFNMS(LDK(KP959492973), Tk, T1);
+			      ST(&(xo[WS(os, 7)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 4)]), VFNMSI(TV, TR), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
+			      T10 = VFNMS(LDK(KP959492973), TZ, T1);
+			      Tz = VFNMS(LDK(KP959492973), Ty, T1);
+			      ST(&(xo[WS(os, 1)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 10)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 5)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 6)]), VFNMSI(T14, T10), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 9)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 2)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_11) (planner *p) {
+     X(kdft_register) (p, n1fv_11, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include n1f.h */
+
+/*
+ * This function contains 70 FP additions, 50 FP multiplications,
+ * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
+ * 32 stack variables, 10 constants, and 22 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
+     DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
+     DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
+     DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
+     DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
+     DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
+     DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
+     DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
+	       V T1, T4, Ti, Tg, Tl, Td, Tk, Ta, Tj, T7, Tm, Tb, Tc, Tt, Ts;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T2, T3, Te, Tf;
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T4 = VADD(T2, T3);
+		    Ti = VSUB(T3, T2);
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tg = VADD(Te, Tf);
+		    Tl = VSUB(Tf, Te);
+	       }
+	       Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Td = VADD(Tb, Tc);
+	       Tk = VSUB(Tc, Tb);
+	       {
+		    V T8, T9, T5, T6;
+		    T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Ta = VADD(T8, T9);
+		    Tj = VSUB(T9, T8);
+		    T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = VADD(T5, T6);
+		    Tm = VSUB(T6, T5);
+	       }
+	       ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
+	       {
+		    V Tn, Th, Tv, Tu;
+		    Tn = VBYI(VFMA(LDK(KP755749574), Ti, VFMA(LDK(KP540640817), Tj, VFNMS(LDK(KP909631995), Tl, VFNMS(LDK(KP989821441), Tm, VMUL(LDK(KP281732556), Tk))))));
+		    Th = VFMA(LDK(KP841253532), Ta, VFMA(LDK(KP415415013), Tg, VFNMS(LDK(KP959492973), Td, VFNMS(LDK(KP142314838), T7, VFNMS(LDK(KP654860733), T4, T1)))));
+		    ST(&(xo[WS(os, 7)]), VSUB(Th, Tn), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 4)]), VADD(Th, Tn), ovs, &(xo[0]));
+		    Tv = VBYI(VFMA(LDK(KP281732556), Ti, VFMA(LDK(KP755749574), Tj, VFNMS(LDK(KP909631995), Tk, VFNMS(LDK(KP540640817), Tm, VMUL(LDK(KP989821441), Tl))))));
+		    Tu = VFMA(LDK(KP841253532), T7, VFMA(LDK(KP415415013), Td, VFNMS(LDK(KP142314838), Tg, VFNMS(LDK(KP654860733), Ta, VFNMS(LDK(KP959492973), T4, T1)))));
+		    ST(&(xo[WS(os, 6)]), VSUB(Tu, Tv), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 5)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
+	       }
+	       Tt = VBYI(VFMA(LDK(KP989821441), Ti, VFMA(LDK(KP540640817), Tk, VFNMS(LDK(KP909631995), Tj, VFNMS(LDK(KP281732556), Tm, VMUL(LDK(KP755749574), Tl))))));
+	       Ts = VFMA(LDK(KP415415013), Ta, VFMA(LDK(KP841253532), Td, VFNMS(LDK(KP654860733), Tg, VFNMS(LDK(KP959492973), T7, VFNMS(LDK(KP142314838), T4, T1)))));
+	       ST(&(xo[WS(os, 8)]), VSUB(Ts, Tt), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 3)]), VADD(Ts, Tt), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V Tr, Tq, Tp, To;
+		    Tr = VBYI(VFMA(LDK(KP540640817), Ti, VFMA(LDK(KP909631995), Tm, VFMA(LDK(KP989821441), Tj, VFMA(LDK(KP755749574), Tk, VMUL(LDK(KP281732556), Tl))))));
+		    Tq = VFMA(LDK(KP841253532), T4, VFMA(LDK(KP415415013), T7, VFNMS(LDK(KP959492973), Tg, VFNMS(LDK(KP654860733), Td, VFNMS(LDK(KP142314838), Ta, T1)))));
+		    ST(&(xo[WS(os, 10)]), VSUB(Tq, Tr), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
+		    Tp = VBYI(VFMA(LDK(KP909631995), Ti, VFNMS(LDK(KP540640817), Tl, VFNMS(LDK(KP989821441), Tk, VFNMS(LDK(KP281732556), Tj, VMUL(LDK(KP755749574), Tm))))));
+		    To = VFMA(LDK(KP415415013), T4, VFMA(LDK(KP841253532), Tg, VFNMS(LDK(KP142314838), Td, VFNMS(LDK(KP959492973), Ta, VFNMS(LDK(KP654860733), T7, T1)))));
+		    ST(&(xo[WS(os, 9)]), VSUB(To, Tp), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 2)]), VADD(To, Tp), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_11) (planner *p) {
+     X(kdft_register) (p, n1fv_11, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include n1f.h */
+
+/*
+ * This function contains 48 FP additions, 20 FP multiplications,
+ * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
+ * 49 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T1, T6, Tk, Tn, Tc, Td, Tf, Tr, T4, Ts, T9, Tg, Te, Tl;
+	       {
+		    V T2, T3, T7, T8;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    Tr = VSUB(T3, T2);
+		    T4 = VADD(T2, T3);
+		    Ts = VSUB(T8, T7);
+		    T9 = VADD(T7, T8);
+		    Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       Te = VSUB(Tc, Td);
+	       Tl = VADD(Td, Tc);
+	       {
+		    V T5, TF, TB, Tt, Ta, TG, Th, To, Tm, TI;
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    TF = VADD(T1, T4);
+		    TB = VADD(Tr, Ts);
+		    Tt = VSUB(Tr, Ts);
+		    Ta = VFNMS(LDK(KP500000000), T9, T6);
+		    TG = VADD(T6, T9);
+		    Th = VSUB(Tf, Tg);
+		    To = VADD(Tf, Tg);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Tk);
+		    TI = VADD(Tk, Tl);
+		    {
+			 V TH, TL, Tb, Tx, TJ, Tp, Ti, TA;
+			 TH = VSUB(TF, TG);
+			 TL = VADD(TF, TG);
+			 Tb = VSUB(T5, Ta);
+			 Tx = VADD(T5, Ta);
+			 TJ = VADD(Tn, To);
+			 Tp = VFNMS(LDK(KP500000000), To, Tn);
+			 Ti = VADD(Te, Th);
+			 TA = VSUB(Te, Th);
+			 {
+			      V Tq, Ty, TK, TM;
+			      Tq = VSUB(Tm, Tp);
+			      Ty = VADD(Tm, Tp);
+			      TK = VSUB(TI, TJ);
+			      TM = VADD(TI, TJ);
+			      {
+				   V TC, TE, Tj, Tv;
+				   TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
+				   TE = VMUL(LDK(KP866025403), VADD(TB, TA));
+				   Tj = VFMA(LDK(KP866025403), Ti, Tb);
+				   Tv = VFNMS(LDK(KP866025403), Ti, Tb);
+				   {
+					V Tz, TD, Tu, Tw;
+					Tz = VSUB(Tx, Ty);
+					TD = VADD(Tx, Ty);
+					Tu = VFNMS(LDK(KP866025403), Tt, Tq);
+					Tw = VFMA(LDK(KP866025403), Tt, Tq);
+					ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 3)]), VFMAI(TK, TH), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 9)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 4)]), VFMAI(TE, TD), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 10)]), VFNMSI(TC, Tz), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 2)]), VFMAI(TC, Tz), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 5)]), VFNMSI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 11)]), VFMAI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 1)]), VFNMSI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {30, 2, 18, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_12) (planner *p) {
+     X(kdft_register) (p, n1fv_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include n1f.h */
+
+/*
+ * This function contains 48 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
+ * 27 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
+	       {
+		    V T1, T6, T4, Tw, T9, Tx;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T2, T3, T7, T8;
+			 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tw = VSUB(T3, T2);
+			 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T9 = VADD(T7, T8);
+			 Tx = VSUB(T8, T7);
+		    }
+		    T5 = VADD(T1, T4);
+		    Ta = VADD(T6, T9);
+		    TJ = VADD(Tw, Tx);
+		    Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
+		    Tq = VFNMS(LDK(KP500000000), T9, T6);
+		    Tp = VFNMS(LDK(KP500000000), T4, T1);
+	       }
+	       {
+		    V Tc, Th, Tf, Ts, Tk, Tt;
+		    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V Td, Te, Ti, Tj;
+			 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tf = VADD(Td, Te);
+			 Ts = VSUB(Te, Td);
+			 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VADD(Ti, Tj);
+			 Tt = VSUB(Tj, Ti);
+		    }
+		    Tg = VADD(Tc, Tf);
+		    Tl = VADD(Th, Tk);
+		    TI = VADD(Ts, Tt);
+		    TA = VFNMS(LDK(KP500000000), Tk, Th);
+		    Tz = VFNMS(LDK(KP500000000), Tf, Tc);
+		    Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
+	       }
+	       {
+		    V Tb, Tm, Tn, To;
+		    Tb = VSUB(T5, Ta);
+		    Tm = VBYI(VSUB(Tg, Tl));
+		    ST(&(xo[WS(os, 9)]), VSUB(Tb, Tm), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VADD(Tb, Tm), ovs, &(xo[WS(os, 1)]));
+		    Tn = VADD(T5, Ta);
+		    To = VADD(Tg, Tl);
+		    ST(&(xo[WS(os, 6)]), VSUB(Tn, To), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tv, TE, TC, TD, Tr, TB;
+		    Tr = VSUB(Tp, Tq);
+		    Tv = VSUB(Tr, Tu);
+		    TE = VADD(Tr, Tu);
+		    TB = VSUB(Tz, TA);
+		    TC = VBYI(VADD(Ty, TB));
+		    TD = VBYI(VSUB(Ty, TB));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tv, TC), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VSUB(TE, TD), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VADD(TC, Tv), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(TD, TE), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V TK, TM, TH, TL, TF, TG;
+		    TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
+		    TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
+		    TF = VADD(Tp, Tq);
+		    TG = VADD(Tz, TA);
+		    TH = VSUB(TF, TG);
+		    TL = VADD(TF, TG);
+		    ST(&(xo[WS(os, 10)]), VSUB(TH, TK), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(TL, TM), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(TH, TK), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VSUB(TL, TM), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {44, 4, 4, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_12) (planner *p) {
+     X(kdft_register) (p, n1fv_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3527 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 128 -name n1fv_128 -include n1f.h */
+
+/*
+ * This function contains 1082 FP additions, 642 FP multiplications,
+ * (or, 440 additions, 0 multiplications, 642 fused multiply/add),
+ * 295 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_128(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DVK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DVK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DVK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DVK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DVK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DVK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DVK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DVK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DVK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DVK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DVK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DVK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DVK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DVK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V T6a, T5J, T6b, T5K, T6B, T6C, T6J, T6A, T6o, T6j, T6r, T68, T6e, T5O, T5R;
+	       V T6d, T6D, T6K;
+	       {
+		    V Tad, TcZ, T6Z, T8T, T4U, Tr, Tfq, TgG, Ted, Tgf, Td0, Tcc, T9k, T84, Tb6;
+		    V Tbt, Td8, TdK, TeK, Tgq, TeV, Tgt, T7q, T94, T3p, T5X, T7B, T97, T2G, T5U;
+		    V TbD, Tc0, Tdf, TdN, Tf5, Tgx, Tfg, TgA, T7J, T9b, T4E, T64, T7U, T9e, T3V;
+		    V T61, Td2, Td3, T85, T72, T4V, TI, Tcd, Tas, TgH, Tek, Tgg, Tft, T86, T75;
+		    V T4W, TZ, TaI, Tcg, Tdr, TdG, Tgi, Tet, Tgj, Teq, T8X, T7a, T5M, T1B, T8W;
+		    V T7d, T5N, T1s, TaX, Tcf, Tdo, TdH, Tgl, TeC, Tgm, Tez, T90, T7h, T5P, T2c;
+		    V T8Z, T7k, T5Q, T23, T3Y, T49, TdL, Tdb, Tbu, Tbl, Tgu, TeR, Tgr, TeY, Tf6;
+		    V TbG, T5V, T3s, T5Y, T3f, T95, T7E, T98, T7x, T4g, T4f, T4q, TbH, T41, TbI;
+		    V T44, T4h, T4j, T4k, Tf9, TbN;
+		    {
+			 V Tu, TF, Ty, TL, TW, Tah, Tx, Tag, Tee, Tz, TM, TN, Teh, Tan, TP;
+			 V TQ;
+			 {
+			      V TeG, T2A, Tbq, TeT, Tbp, TeH, T3m, T2x, Td6, T7o, T2q, T3l, T7z, Tbr, T2D;
+			      V T82, T83;
+			      {
+				   V Ta7, T3, Ta8, T4O, Taa, Tab, Ta, T4P, Te, Tc9, Th, Tca, Tl, Tc6, Tc7;
+				   V To;
+				   {
+					V T1, T2, T4M, T4N;
+					T1 = LD(&(xi[0]), ivs, &(xi[0]));
+					T2 = LD(&(xi[WS(is, 64)]), ivs, &(xi[0]));
+					T4M = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+					T4N = LD(&(xi[WS(is, 96)]), ivs, &(xi[0]));
+					{
+					     V T4, T5, T7, T8;
+					     T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					     T5 = LD(&(xi[WS(is, 80)]), ivs, &(xi[0]));
+					     T7 = LD(&(xi[WS(is, 112)]), ivs, &(xi[0]));
+					     T8 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+					     {
+						  V Tc, T6, T9, Td, Tf, Tg;
+						  Tc = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+						  Ta7 = VADD(T1, T2);
+						  T3 = VSUB(T1, T2);
+						  Ta8 = VADD(T4M, T4N);
+						  T4O = VSUB(T4M, T4N);
+						  Taa = VADD(T4, T5);
+						  T6 = VSUB(T4, T5);
+						  Tab = VADD(T7, T8);
+						  T9 = VSUB(T7, T8);
+						  Td = LD(&(xi[WS(is, 72)]), ivs, &(xi[0]));
+						  Tf = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+						  Tg = LD(&(xi[WS(is, 104)]), ivs, &(xi[0]));
+						  {
+						       V Tj, Tk, Tm, Tn;
+						       Tj = LD(&(xi[WS(is, 120)]), ivs, &(xi[0]));
+						       Tk = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+						       Tm = LD(&(xi[WS(is, 88)]), ivs, &(xi[0]));
+						       Tn = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+						       Ta = VADD(T6, T9);
+						       T4P = VSUB(T9, T6);
+						       Te = VSUB(Tc, Td);
+						       Tc9 = VADD(Tc, Td);
+						       Th = VSUB(Tf, Tg);
+						       Tca = VADD(Tf, Tg);
+						       Tl = VSUB(Tj, Tk);
+						       Tc6 = VADD(Tj, Tk);
+						       Tc7 = VADD(Tn, Tm);
+						       To = VSUB(Tm, Tn);
+						  }
+					     }
+					}
+				   }
+				   {
+					V T6X, Tb, Te9, Ta9, Tcb, Tea, T4R, Ti, Tfo, Tac, Tp, T4S, Tc8, Teb, T4Q;
+					T6X = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					Te9 = VSUB(Ta7, Ta8);
+					Ta9 = VADD(Ta7, Ta8);
+					Tcb = VADD(Tc9, Tca);
+					Tea = VSUB(Tc9, Tca);
+					T4R = VFMA(LDK(KP414213562), Te, Th);
+					Ti = VFNMS(LDK(KP414213562), Th, Te);
+					Tfo = VSUB(Tab, Taa);
+					Tac = VADD(Taa, Tab);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T4S = VFMA(LDK(KP414213562), Tl, To);
+					Tc8 = VADD(Tc6, Tc7);
+					Teb = VSUB(Tc6, Tc7);
+					T4Q = VFNMS(LDK(KP707106781), T4P, T4O);
+					T82 = VFMA(LDK(KP707106781), T4P, T4O);
+					{
+					     V T4T, T6Y, Tq, Tfp, Tec;
+					     T4T = VSUB(T4R, T4S);
+					     T6Y = VADD(T4R, T4S);
+					     T83 = VSUB(Tp, Ti);
+					     Tq = VADD(Ti, Tp);
+					     Tfp = VSUB(Teb, Tea);
+					     Tec = VADD(Tea, Teb);
+					     Tad = VSUB(Ta9, Tac);
+					     TcZ = VADD(Ta9, Tac);
+					     T6Z = VFMA(LDK(KP923879532), T6Y, T6X);
+					     T8T = VFNMS(LDK(KP923879532), T6Y, T6X);
+					     T4U = VFMA(LDK(KP923879532), T4T, T4Q);
+					     T6a = VFNMS(LDK(KP923879532), T4T, T4Q);
+					     Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					     T5J = VFNMS(LDK(KP923879532), Tq, Tb);
+					     Tfq = VFMA(LDK(KP707106781), Tfp, Tfo);
+					     TgG = VFNMS(LDK(KP707106781), Tfp, Tfo);
+					     Ted = VFMA(LDK(KP707106781), Tec, Te9);
+					     Tgf = VFNMS(LDK(KP707106781), Tec, Te9);
+					     Td0 = VADD(Tcb, Tc8);
+					     Tcc = VSUB(Tc8, Tcb);
+					}
+				   }
+			      }
+			      {
+				   V T2i, T3j, Tb2, T2B, Tb5, T3k, T2p, T2C;
+				   {
+					V T2m, Tb0, Tb1, Tb3, T2l, T2n;
+					{
+					     V T2g, T2h, T3h, T3i, T2j, T2k;
+					     T2g = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T2h = LD(&(xi[WS(is, 65)]), ivs, &(xi[WS(is, 1)]));
+					     T3h = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     T3i = LD(&(xi[WS(is, 97)]), ivs, &(xi[WS(is, 1)]));
+					     T2j = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     T2k = LD(&(xi[WS(is, 81)]), ivs, &(xi[WS(is, 1)]));
+					     T2m = LD(&(xi[WS(is, 113)]), ivs, &(xi[WS(is, 1)]));
+					     T9k = VFNMS(LDK(KP923879532), T83, T82);
+					     T84 = VFMA(LDK(KP923879532), T83, T82);
+					     T2i = VSUB(T2g, T2h);
+					     Tb0 = VADD(T2g, T2h);
+					     T3j = VSUB(T3h, T3i);
+					     Tb1 = VADD(T3h, T3i);
+					     Tb3 = VADD(T2j, T2k);
+					     T2l = VSUB(T2j, T2k);
+					     T2n = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					}
+					{
+					     V T2r, T2s, T2u, T2v;
+					     T2r = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+					     T2s = LD(&(xi[WS(is, 73)]), ivs, &(xi[WS(is, 1)]));
+					     T2u = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+					     T2v = LD(&(xi[WS(is, 105)]), ivs, &(xi[WS(is, 1)]));
+					     TeG = VSUB(Tb0, Tb1);
+					     Tb2 = VADD(Tb0, Tb1);
+					     {
+						  V T2y, T2z, Tb4, T2o, Tbn, T2t, Tbo, T2w;
+						  T2y = LD(&(xi[WS(is, 121)]), ivs, &(xi[WS(is, 1)]));
+						  T2z = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  Tb4 = VADD(T2m, T2n);
+						  T2o = VSUB(T2m, T2n);
+						  Tbn = VADD(T2r, T2s);
+						  T2t = VSUB(T2r, T2s);
+						  Tbo = VADD(T2u, T2v);
+						  T2w = VSUB(T2u, T2v);
+						  T2B = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  T2A = VSUB(T2y, T2z);
+						  Tbq = VADD(T2y, T2z);
+						  TeT = VSUB(Tb3, Tb4);
+						  Tb5 = VADD(Tb3, Tb4);
+						  T3k = VSUB(T2l, T2o);
+						  T2p = VADD(T2l, T2o);
+						  Tbp = VADD(Tbn, Tbo);
+						  TeH = VSUB(Tbn, Tbo);
+						  T3m = VFMA(LDK(KP414213562), T2t, T2w);
+						  T2x = VFNMS(LDK(KP414213562), T2w, T2t);
+						  T2C = LD(&(xi[WS(is, 89)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					}
+				   }
+				   Td6 = VADD(Tb2, Tb5);
+				   Tb6 = VSUB(Tb2, Tb5);
+				   T7o = VFNMS(LDK(KP707106781), T2p, T2i);
+				   T2q = VFMA(LDK(KP707106781), T2p, T2i);
+				   T3l = VFMA(LDK(KP707106781), T3k, T3j);
+				   T7z = VFNMS(LDK(KP707106781), T3k, T3j);
+				   Tbr = VADD(T2B, T2C);
+				   T2D = VSUB(T2B, T2C);
+			      }
+			      {
+				   V Tf1, Tfe, Tf2, TbZ, T3M, T4B, Tdd, T3F, T7H, T4A, T7S, TbW, Tf3, T4C, T3T;
+				   {
+					V T3x, T4y, Tbz, T3Q, TbC, T4z, T3E, T3R, T3P, TbU, TbV, T3S;
+					{
+					     V T3y, T3z, T3B, T3C;
+					     {
+						  V T3v, T3w, T4w, T4x;
+						  T3v = LD(&(xi[WS(is, 127)]), ivs, &(xi[WS(is, 1)]));
+						  T3w = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+						  T4w = LD(&(xi[WS(is, 95)]), ivs, &(xi[WS(is, 1)]));
+						  T4x = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+						  T3y = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V Tbs, TeI, T3n, T2E, Tbx;
+						       Tbs = VADD(Tbq, Tbr);
+						       TeI = VSUB(Tbq, Tbr);
+						       T3n = VFNMS(LDK(KP414213562), T2A, T2D);
+						       T2E = VFMA(LDK(KP414213562), T2D, T2A);
+						       T3x = VSUB(T3v, T3w);
+						       Tbx = VADD(T3v, T3w);
+						       {
+							    V Tby, Td7, TeJ, TeU;
+							    T4y = VSUB(T4w, T4x);
+							    Tby = VADD(T4x, T4w);
+							    Td7 = VADD(Tbp, Tbs);
+							    Tbt = VSUB(Tbp, Tbs);
+							    TeJ = VADD(TeH, TeI);
+							    TeU = VSUB(TeH, TeI);
+							    {
+								 V T7p, T3o, T7A, T2F;
+								 T7p = VSUB(T3m, T3n);
+								 T3o = VADD(T3m, T3n);
+								 T7A = VSUB(T2x, T2E);
+								 T2F = VADD(T2x, T2E);
+								 Tbz = VADD(Tbx, Tby);
+								 Tf1 = VSUB(Tbx, Tby);
+								 Td8 = VADD(Td6, Td7);
+								 TdK = VSUB(Td6, Td7);
+								 TeK = VFMA(LDK(KP707106781), TeJ, TeG);
+								 Tgq = VFNMS(LDK(KP707106781), TeJ, TeG);
+								 TeV = VFMA(LDK(KP707106781), TeU, TeT);
+								 Tgt = VFNMS(LDK(KP707106781), TeU, TeT);
+								 T7q = VFMA(LDK(KP923879532), T7p, T7o);
+								 T94 = VFNMS(LDK(KP923879532), T7p, T7o);
+								 T3p = VFMA(LDK(KP923879532), T3o, T3l);
+								 T5X = VFNMS(LDK(KP923879532), T3o, T3l);
+								 T7B = VFNMS(LDK(KP923879532), T7A, T7z);
+								 T97 = VFMA(LDK(KP923879532), T7A, T7z);
+								 T2G = VFMA(LDK(KP923879532), T2F, T2q);
+								 T5U = VFNMS(LDK(KP923879532), T2F, T2q);
+								 T3z = LD(&(xi[WS(is, 79)]), ivs, &(xi[WS(is, 1)]));
+							    }
+						       }
+						  }
+						  T3B = LD(&(xi[WS(is, 111)]), ivs, &(xi[WS(is, 1)]));
+						  T3C = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     {
+						  V T3G, T3H, T3J, T3K;
+						  T3G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  T3H = LD(&(xi[WS(is, 71)]), ivs, &(xi[WS(is, 1)]));
+						  T3J = LD(&(xi[WS(is, 103)]), ivs, &(xi[WS(is, 1)]));
+						  T3K = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3N, T3A, TbA, T3D, TbB, T3I, TbX, T3L, TbY, T3O;
+						       T3N = LD(&(xi[WS(is, 119)]), ivs, &(xi[WS(is, 1)]));
+						       T3A = VSUB(T3y, T3z);
+						       TbA = VADD(T3y, T3z);
+						       T3D = VSUB(T3B, T3C);
+						       TbB = VADD(T3B, T3C);
+						       T3I = VSUB(T3G, T3H);
+						       TbX = VADD(T3G, T3H);
+						       T3L = VSUB(T3J, T3K);
+						       TbY = VADD(T3K, T3J);
+						       T3O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						       T3Q = LD(&(xi[WS(is, 87)]), ivs, &(xi[WS(is, 1)]));
+						       Tfe = VSUB(TbB, TbA);
+						       TbC = VADD(TbA, TbB);
+						       T4z = VSUB(T3D, T3A);
+						       T3E = VADD(T3A, T3D);
+						       T3R = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						       Tf2 = VSUB(TbX, TbY);
+						       TbZ = VADD(TbX, TbY);
+						       T3M = VFMA(LDK(KP414213562), T3L, T3I);
+						       T4B = VFNMS(LDK(KP414213562), T3I, T3L);
+						       T3P = VSUB(T3N, T3O);
+						       TbU = VADD(T3N, T3O);
+						  }
+					     }
+					}
+					Tdd = VADD(Tbz, TbC);
+					TbD = VSUB(Tbz, TbC);
+					TbV = VADD(T3R, T3Q);
+					T3S = VSUB(T3Q, T3R);
+					T3F = VFMA(LDK(KP707106781), T3E, T3x);
+					T7H = VFNMS(LDK(KP707106781), T3E, T3x);
+					T4A = VFMA(LDK(KP707106781), T4z, T4y);
+					T7S = VFNMS(LDK(KP707106781), T4z, T4y);
+					TbW = VADD(TbU, TbV);
+					Tf3 = VSUB(TbU, TbV);
+					T4C = VFMA(LDK(KP414213562), T3P, T3S);
+					T3T = VFNMS(LDK(KP414213562), T3S, T3P);
+				   }
+				   {
+					V TD, Tae, TE, TJ, TK, TU, TV;
+					{
+					     V Ts, Tt, Tde, Tf4, Tff;
+					     Ts = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					     Tt = LD(&(xi[WS(is, 68)]), ivs, &(xi[0]));
+					     TD = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					     Tde = VADD(TbZ, TbW);
+					     Tc0 = VSUB(TbW, TbZ);
+					     Tf4 = VADD(Tf2, Tf3);
+					     Tff = VSUB(Tf3, Tf2);
+					     {
+						  V T7I, T4D, T7T, T3U;
+						  T7I = VSUB(T4C, T4B);
+						  T4D = VADD(T4B, T4C);
+						  T7T = VSUB(T3T, T3M);
+						  T3U = VADD(T3M, T3T);
+						  Tae = VADD(Ts, Tt);
+						  Tu = VSUB(Ts, Tt);
+						  Tdf = VADD(Tdd, Tde);
+						  TdN = VSUB(Tdd, Tde);
+						  Tf5 = VFMA(LDK(KP707106781), Tf4, Tf1);
+						  Tgx = VFNMS(LDK(KP707106781), Tf4, Tf1);
+						  Tfg = VFMA(LDK(KP707106781), Tff, Tfe);
+						  TgA = VFNMS(LDK(KP707106781), Tff, Tfe);
+						  T7J = VFMA(LDK(KP923879532), T7I, T7H);
+						  T9b = VFNMS(LDK(KP923879532), T7I, T7H);
+						  T4E = VFMA(LDK(KP923879532), T4D, T4A);
+						  T64 = VFNMS(LDK(KP923879532), T4D, T4A);
+						  T7U = VFNMS(LDK(KP923879532), T7T, T7S);
+						  T9e = VFMA(LDK(KP923879532), T7T, T7S);
+						  T3V = VFMA(LDK(KP923879532), T3U, T3F);
+						  T61 = VFNMS(LDK(KP923879532), T3U, T3F);
+						  TE = LD(&(xi[WS(is, 100)]), ivs, &(xi[0]));
+					     }
+					}
+					TJ = LD(&(xi[WS(is, 124)]), ivs, &(xi[0]));
+					TK = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					TU = LD(&(xi[WS(is, 92)]), ivs, &(xi[0]));
+					TV = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+					{
+					     V Tal, Tam, Tv, Tw, Taf;
+					     Tv = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 84)]), ivs, &(xi[0]));
+					     Taf = VADD(TD, TE);
+					     TF = VSUB(TD, TE);
+					     Ty = LD(&(xi[WS(is, 116)]), ivs, &(xi[0]));
+					     TL = VSUB(TJ, TK);
+					     Tal = VADD(TJ, TK);
+					     TW = VSUB(TU, TV);
+					     Tam = VADD(TV, TU);
+					     Tah = VADD(Tv, Tw);
+					     Tx = VSUB(Tv, Tw);
+					     Tag = VADD(Tae, Taf);
+					     Tee = VSUB(Tae, Taf);
+					     Tz = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					     TM = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					     TN = LD(&(xi[WS(is, 76)]), ivs, &(xi[0]));
+					     Teh = VSUB(Tal, Tam);
+					     Tan = VADD(Tal, Tam);
+					     TP = LD(&(xi[WS(is, 108)]), ivs, &(xi[0]));
+					     TQ = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V Tev, TeA, Tdm, TaP, Tew, TaV, T1U, T29, T7f, T1N, T28, T7i, Tex, TaS, T21;
+			      V T2a;
+			      {
+				   V Tem, Ter, Ten, TaD, T1j, T1y, TaA, Tdp, T1c, T78, T7b, T1x, TaG, Teo, T1z;
+				   V T1q;
+				   {
+					V T14, T1v, Taw, Taz, T1b, T1w, T1n, T1o, T1m, TaE, TaF, T1p;
+					{
+					     V Tau, Tav, T15, T16, T18, T19;
+					     {
+						  V T12, Tai, TA, Tao, TO, T13;
+						  T12 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+						  Tai = VADD(Ty, Tz);
+						  TA = VSUB(Ty, Tz);
+						  Tao = VADD(TM, TN);
+						  TO = VSUB(TM, TN);
+						  T13 = LD(&(xi[WS(is, 66)]), ivs, &(xi[0]));
+						  {
+						       V T1t, Tap, TR, Taj, Tef, TG, TB, T1u;
+						       T1t = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+						       Tap = VADD(TP, TQ);
+						       TR = VSUB(TP, TQ);
+						       Taj = VADD(Tah, Tai);
+						       Tef = VSUB(Tah, Tai);
+						       TG = VSUB(Tx, TA);
+						       TB = VADD(Tx, TA);
+						       Tau = VADD(T12, T13);
+						       T14 = VSUB(T12, T13);
+						       T1u = LD(&(xi[WS(is, 98)]), ivs, &(xi[0]));
+						       {
+							    V Taq, Tei, TX, TS, Tak;
+							    Taq = VADD(Tao, Tap);
+							    Tei = VSUB(Tap, Tao);
+							    TX = VSUB(TR, TO);
+							    TS = VADD(TO, TR);
+							    Tak = VSUB(Tag, Taj);
+							    Td2 = VADD(Tag, Taj);
+							    {
+								 V Teg, Tfs, T71, TH;
+								 Teg = VFNMS(LDK(KP414213562), Tef, Tee);
+								 Tfs = VFMA(LDK(KP414213562), Tee, Tef);
+								 T71 = VFNMS(LDK(KP707106781), TG, TF);
+								 TH = VFMA(LDK(KP707106781), TG, TF);
+								 {
+								      V T70, TC, Tar, Tej, Tfr;
+								      T70 = VFNMS(LDK(KP707106781), TB, Tu);
+								      TC = VFMA(LDK(KP707106781), TB, Tu);
+								      Tar = VSUB(Tan, Taq);
+								      Td3 = VADD(Tan, Taq);
+								      Tej = VFNMS(LDK(KP414213562), Tei, Teh);
+								      Tfr = VFMA(LDK(KP414213562), Teh, Tei);
+								      {
+									   V T74, TY, T73, TT;
+									   T74 = VFNMS(LDK(KP707106781), TX, TW);
+									   TY = VFMA(LDK(KP707106781), TX, TW);
+									   T73 = VFNMS(LDK(KP707106781), TS, TL);
+									   TT = VFMA(LDK(KP707106781), TS, TL);
+									   T85 = VFNMS(LDK(KP668178637), T70, T71);
+									   T72 = VFMA(LDK(KP668178637), T71, T70);
+									   T4V = VFMA(LDK(KP198912367), TC, TH);
+									   TI = VFNMS(LDK(KP198912367), TH, TC);
+									   Tcd = VSUB(Tar, Tak);
+									   Tas = VADD(Tak, Tar);
+									   TgH = VSUB(Tej, Teg);
+									   Tek = VADD(Teg, Tej);
+									   Tgg = VADD(Tfs, Tfr);
+									   Tft = VSUB(Tfr, Tfs);
+									   T86 = VFNMS(LDK(KP668178637), T73, T74);
+									   T75 = VFMA(LDK(KP668178637), T74, T73);
+									   T4W = VFMA(LDK(KP198912367), TT, TY);
+									   TZ = VFNMS(LDK(KP198912367), TY, TT);
+									   Tav = VADD(T1t, T1u);
+									   T1v = VSUB(T1t, T1u);
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T15 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     T16 = LD(&(xi[WS(is, 82)]), ivs, &(xi[0]));
+					     T18 = LD(&(xi[WS(is, 114)]), ivs, &(xi[0]));
+					     T19 = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     {
+						  V T1d, T1e, T1g, T1h, Tax, T17, Tay, T1a;
+						  T1d = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+						  Taw = VADD(Tau, Tav);
+						  Tem = VSUB(Tau, Tav);
+						  T1e = LD(&(xi[WS(is, 74)]), ivs, &(xi[0]));
+						  T1g = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+						  T1h = LD(&(xi[WS(is, 106)]), ivs, &(xi[0]));
+						  Tax = VADD(T15, T16);
+						  T17 = VSUB(T15, T16);
+						  Tay = VADD(T18, T19);
+						  T1a = VSUB(T18, T19);
+						  {
+						       V T1k, T1f, TaB, T1i, TaC, T1l;
+						       T1k = LD(&(xi[WS(is, 122)]), ivs, &(xi[0]));
+						       T1f = VSUB(T1d, T1e);
+						       TaB = VADD(T1d, T1e);
+						       T1i = VSUB(T1g, T1h);
+						       TaC = VADD(T1g, T1h);
+						       T1l = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+						       Taz = VADD(Tax, Tay);
+						       Ter = VSUB(Tax, Tay);
+						       T1b = VADD(T17, T1a);
+						       T1w = VSUB(T17, T1a);
+						       T1n = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+						       T1o = LD(&(xi[WS(is, 90)]), ivs, &(xi[0]));
+						       Ten = VSUB(TaB, TaC);
+						       TaD = VADD(TaB, TaC);
+						       T1j = VFNMS(LDK(KP414213562), T1i, T1f);
+						       T1y = VFMA(LDK(KP414213562), T1f, T1i);
+						       T1m = VSUB(T1k, T1l);
+						       TaE = VADD(T1k, T1l);
+						  }
+					     }
+					}
+					TaA = VSUB(Taw, Taz);
+					Tdp = VADD(Taw, Taz);
+					TaF = VADD(T1n, T1o);
+					T1p = VSUB(T1n, T1o);
+					T1c = VFMA(LDK(KP707106781), T1b, T14);
+					T78 = VFNMS(LDK(KP707106781), T1b, T14);
+					T7b = VFNMS(LDK(KP707106781), T1w, T1v);
+					T1x = VFMA(LDK(KP707106781), T1w, T1v);
+					TaG = VADD(TaE, TaF);
+					Teo = VSUB(TaE, TaF);
+					T1z = VFNMS(LDK(KP414213562), T1m, T1p);
+					T1q = VFMA(LDK(KP414213562), T1p, T1m);
+				   }
+				   {
+					V T1F, T26, T1Q, TaT, TaL, TaO, T27, T1M, T1Y, T1Z, TaU, T1T, TaQ, T1X, T20;
+					V TaR;
+					{
+					     V T24, TaJ, T25, T1G, T1H, T1J, T1K, T1D, T1E;
+					     T1D = LD(&(xi[WS(is, 126)]), ivs, &(xi[0]));
+					     T1E = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+					     T24 = LD(&(xi[WS(is, 94)]), ivs, &(xi[0]));
+					     {
+						  V TaH, Tdq, Tes, Tep;
+						  TaH = VSUB(TaD, TaG);
+						  Tdq = VADD(TaD, TaG);
+						  Tes = VSUB(Ten, Teo);
+						  Tep = VADD(Ten, Teo);
+						  {
+						       V T79, T1A, T7c, T1r;
+						       T79 = VSUB(T1y, T1z);
+						       T1A = VADD(T1y, T1z);
+						       T7c = VSUB(T1j, T1q);
+						       T1r = VADD(T1j, T1q);
+						       TaJ = VADD(T1D, T1E);
+						       T1F = VSUB(T1D, T1E);
+						       TaI = VFNMS(LDK(KP414213562), TaH, TaA);
+						       Tcg = VFMA(LDK(KP414213562), TaA, TaH);
+						       Tdr = VADD(Tdp, Tdq);
+						       TdG = VSUB(Tdp, Tdq);
+						       Tgi = VFNMS(LDK(KP707106781), Tes, Ter);
+						       Tet = VFMA(LDK(KP707106781), Tes, Ter);
+						       Tgj = VFNMS(LDK(KP707106781), Tep, Tem);
+						       Teq = VFMA(LDK(KP707106781), Tep, Tem);
+						       T8X = VFNMS(LDK(KP923879532), T79, T78);
+						       T7a = VFMA(LDK(KP923879532), T79, T78);
+						       T5M = VFNMS(LDK(KP923879532), T1A, T1x);
+						       T1B = VFMA(LDK(KP923879532), T1A, T1x);
+						       T8W = VFMA(LDK(KP923879532), T7c, T7b);
+						       T7d = VFNMS(LDK(KP923879532), T7c, T7b);
+						       T5N = VFNMS(LDK(KP923879532), T1r, T1c);
+						       T1s = VFMA(LDK(KP923879532), T1r, T1c);
+						       T25 = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+						  }
+					     }
+					     T1G = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					     T1H = LD(&(xi[WS(is, 78)]), ivs, &(xi[0]));
+					     T1J = LD(&(xi[WS(is, 110)]), ivs, &(xi[0]));
+					     T1K = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+					     {
+						  V T1R, T1I, TaM, T1L, TaN, T1S, T1O, T1P, TaK, T1V, T1W;
+						  T1O = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+						  T1P = LD(&(xi[WS(is, 70)]), ivs, &(xi[0]));
+						  T26 = VSUB(T24, T25);
+						  TaK = VADD(T25, T24);
+						  T1R = LD(&(xi[WS(is, 102)]), ivs, &(xi[0]));
+						  T1I = VSUB(T1G, T1H);
+						  TaM = VADD(T1G, T1H);
+						  T1L = VSUB(T1J, T1K);
+						  TaN = VADD(T1J, T1K);
+						  T1Q = VSUB(T1O, T1P);
+						  TaT = VADD(T1O, T1P);
+						  Tev = VSUB(TaJ, TaK);
+						  TaL = VADD(TaJ, TaK);
+						  T1S = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+						  T1V = LD(&(xi[WS(is, 118)]), ivs, &(xi[0]));
+						  T1W = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+						  TeA = VSUB(TaN, TaM);
+						  TaO = VADD(TaM, TaN);
+						  T27 = VSUB(T1L, T1I);
+						  T1M = VADD(T1I, T1L);
+						  T1Y = LD(&(xi[WS(is, 86)]), ivs, &(xi[0]));
+						  T1Z = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+						  TaU = VADD(T1S, T1R);
+						  T1T = VSUB(T1R, T1S);
+						  TaQ = VADD(T1V, T1W);
+						  T1X = VSUB(T1V, T1W);
+					     }
+					}
+					Tdm = VADD(TaL, TaO);
+					TaP = VSUB(TaL, TaO);
+					T20 = VSUB(T1Y, T1Z);
+					TaR = VADD(T1Z, T1Y);
+					Tew = VSUB(TaT, TaU);
+					TaV = VADD(TaT, TaU);
+					T1U = VFMA(LDK(KP414213562), T1T, T1Q);
+					T29 = VFNMS(LDK(KP414213562), T1Q, T1T);
+					T7f = VFNMS(LDK(KP707106781), T1M, T1F);
+					T1N = VFMA(LDK(KP707106781), T1M, T1F);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T7i = VFNMS(LDK(KP707106781), T27, T26);
+					Tex = VSUB(TaQ, TaR);
+					TaS = VADD(TaQ, TaR);
+					T21 = VFNMS(LDK(KP414213562), T20, T1X);
+					T2a = VFMA(LDK(KP414213562), T1X, T20);
+				   }
+			      }
+			      {
+				   V T2J, T2U, T30, T3b, TeL, Tb9, TeO, Tbg, T2M, Tba, T2P, Tbb, T34, Tbh, T33;
+				   V T35;
+				   {
+					V T2H, T2I, T2S, T2T, T2Y, T2Z, T39, T3a;
+					T2H = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+					{
+					     V Tdn, TaW, Tey, TeB;
+					     Tdn = VADD(TaV, TaS);
+					     TaW = VSUB(TaS, TaV);
+					     Tey = VADD(Tew, Tex);
+					     TeB = VSUB(Tex, Tew);
+					     {
+						  V T2b, T7g, T22, T7j;
+						  T2b = VADD(T29, T2a);
+						  T7g = VSUB(T2a, T29);
+						  T22 = VADD(T1U, T21);
+						  T7j = VSUB(T21, T1U);
+						  TaX = VFNMS(LDK(KP414213562), TaW, TaP);
+						  Tcf = VFMA(LDK(KP414213562), TaP, TaW);
+						  Tdo = VADD(Tdm, Tdn);
+						  TdH = VSUB(Tdm, Tdn);
+						  Tgl = VFNMS(LDK(KP707106781), TeB, TeA);
+						  TeC = VFMA(LDK(KP707106781), TeB, TeA);
+						  Tgm = VFNMS(LDK(KP707106781), Tey, Tev);
+						  Tez = VFMA(LDK(KP707106781), Tey, Tev);
+						  T90 = VFNMS(LDK(KP923879532), T7g, T7f);
+						  T7h = VFMA(LDK(KP923879532), T7g, T7f);
+						  T5P = VFNMS(LDK(KP923879532), T2b, T28);
+						  T2c = VFMA(LDK(KP923879532), T2b, T28);
+						  T8Z = VFMA(LDK(KP923879532), T7j, T7i);
+						  T7k = VFNMS(LDK(KP923879532), T7j, T7i);
+						  T5Q = VFNMS(LDK(KP923879532), T22, T1N);
+						  T23 = VFMA(LDK(KP923879532), T22, T1N);
+						  T2I = LD(&(xi[WS(is, 69)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					}
+					T2S = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+					T2T = LD(&(xi[WS(is, 101)]), ivs, &(xi[WS(is, 1)]));
+					T2Y = LD(&(xi[WS(is, 125)]), ivs, &(xi[WS(is, 1)]));
+					T2Z = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+					T39 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+					T3a = LD(&(xi[WS(is, 93)]), ivs, &(xi[WS(is, 1)]));
+					{
+					     V T2K, Tbe, Tbf, T2L, T2N, T2O, Tb7, Tb8, T31, T32;
+					     T2K = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+					     T2J = VSUB(T2H, T2I);
+					     Tb7 = VADD(T2H, T2I);
+					     T2U = VSUB(T2S, T2T);
+					     Tb8 = VADD(T2S, T2T);
+					     T30 = VSUB(T2Y, T2Z);
+					     Tbe = VADD(T2Y, T2Z);
+					     T3b = VSUB(T39, T3a);
+					     Tbf = VADD(T39, T3a);
+					     T2L = LD(&(xi[WS(is, 85)]), ivs, &(xi[WS(is, 1)]));
+					     T2N = LD(&(xi[WS(is, 117)]), ivs, &(xi[WS(is, 1)]));
+					     T2O = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+					     TeL = VSUB(Tb7, Tb8);
+					     Tb9 = VADD(Tb7, Tb8);
+					     T31 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					     T32 = LD(&(xi[WS(is, 77)]), ivs, &(xi[WS(is, 1)]));
+					     TeO = VSUB(Tbe, Tbf);
+					     Tbg = VADD(Tbe, Tbf);
+					     T2M = VSUB(T2K, T2L);
+					     Tba = VADD(T2K, T2L);
+					     T2P = VSUB(T2N, T2O);
+					     Tbb = VADD(T2N, T2O);
+					     T34 = LD(&(xi[WS(is, 109)]), ivs, &(xi[WS(is, 1)]));
+					     Tbh = VADD(T31, T32);
+					     T33 = VSUB(T31, T32);
+					     T35 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+					}
+				   }
+				   {
+					V T4d, T4e, T4o, T4p;
+					{
+					     V T2X, T3q, T7t, T7C, T3r, T3e, T7D, T7w;
+					     {
+						  V T47, TbE, Tbd, Td9, TeW, TeN, T7s, T2W, T7r, T2R, TeP, Tbj, T37, T3c, T48;
+						  {
+						       V T3W, T3X, TeM, Tbc, T2Q, T2V, Tbi, T36;
+						       T3W = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+						       T3X = LD(&(xi[WS(is, 67)]), ivs, &(xi[WS(is, 1)]));
+						       TeM = VSUB(Tba, Tbb);
+						       Tbc = VADD(Tba, Tbb);
+						       T2Q = VADD(T2M, T2P);
+						       T2V = VSUB(T2M, T2P);
+						       T47 = LD(&(xi[WS(is, 99)]), ivs, &(xi[WS(is, 1)]));
+						       Tbi = VADD(T34, T35);
+						       T36 = VSUB(T34, T35);
+						       TbE = VADD(T3W, T3X);
+						       T3Y = VSUB(T3W, T3X);
+						       Tbd = VSUB(Tb9, Tbc);
+						       Td9 = VADD(Tb9, Tbc);
+						       TeW = VFMA(LDK(KP414213562), TeL, TeM);
+						       TeN = VFNMS(LDK(KP414213562), TeM, TeL);
+						       T7s = VFNMS(LDK(KP707106781), T2V, T2U);
+						       T2W = VFMA(LDK(KP707106781), T2V, T2U);
+						       T7r = VFNMS(LDK(KP707106781), T2Q, T2J);
+						       T2R = VFMA(LDK(KP707106781), T2Q, T2J);
+						       TeP = VSUB(Tbh, Tbi);
+						       Tbj = VADD(Tbh, Tbi);
+						       T37 = VADD(T33, T36);
+						       T3c = VSUB(T33, T36);
+						       T48 = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  T2X = VFNMS(LDK(KP198912367), T2W, T2R);
+						  T3q = VFMA(LDK(KP198912367), T2R, T2W);
+						  T7t = VFMA(LDK(KP668178637), T7s, T7r);
+						  T7C = VFNMS(LDK(KP668178637), T7r, T7s);
+						  {
+						       V Tbk, Tda, TeX, TeQ;
+						       Tbk = VSUB(Tbg, Tbj);
+						       Tda = VADD(Tbg, Tbj);
+						       TeX = VFNMS(LDK(KP414213562), TeO, TeP);
+						       TeQ = VFMA(LDK(KP414213562), TeP, TeO);
+						       {
+							    V T7v, T3d, T7u, T38, TbF;
+							    T7v = VFNMS(LDK(KP707106781), T3c, T3b);
+							    T3d = VFMA(LDK(KP707106781), T3c, T3b);
+							    T7u = VFNMS(LDK(KP707106781), T37, T30);
+							    T38 = VFMA(LDK(KP707106781), T37, T30);
+							    T49 = VSUB(T47, T48);
+							    TbF = VADD(T48, T47);
+							    TdL = VSUB(Td9, Tda);
+							    Tdb = VADD(Td9, Tda);
+							    Tbu = VSUB(Tbd, Tbk);
+							    Tbl = VADD(Tbd, Tbk);
+							    Tgu = VSUB(TeN, TeQ);
+							    TeR = VADD(TeN, TeQ);
+							    Tgr = VSUB(TeW, TeX);
+							    TeY = VADD(TeW, TeX);
+							    T3r = VFNMS(LDK(KP198912367), T38, T3d);
+							    T3e = VFMA(LDK(KP198912367), T3d, T38);
+							    T7D = VFMA(LDK(KP668178637), T7u, T7v);
+							    T7w = VFNMS(LDK(KP668178637), T7v, T7u);
+							    Tf6 = VSUB(TbE, TbF);
+							    TbG = VADD(TbE, TbF);
+						       }
+						  }
+					     }
+					     T4d = LD(&(xi[WS(is, 123)]), ivs, &(xi[WS(is, 1)]));
+					     T5V = VSUB(T3q, T3r);
+					     T3s = VADD(T3q, T3r);
+					     T5Y = VSUB(T2X, T3e);
+					     T3f = VADD(T2X, T3e);
+					     T95 = VSUB(T7D, T7C);
+					     T7E = VADD(T7C, T7D);
+					     T98 = VSUB(T7t, T7w);
+					     T7x = VADD(T7t, T7w);
+					     T4e = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+					     T4o = LD(&(xi[WS(is, 91)]), ivs, &(xi[WS(is, 1)]));
+					     T4p = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					}
+					{
+					     V T3Z, T40, T42, T43, TbL, TbM;
+					     T3Z = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					     T40 = LD(&(xi[WS(is, 83)]), ivs, &(xi[WS(is, 1)]));
+					     T42 = LD(&(xi[WS(is, 115)]), ivs, &(xi[WS(is, 1)]));
+					     T43 = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+					     T4g = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+					     T4f = VSUB(T4d, T4e);
+					     TbL = VADD(T4d, T4e);
+					     T4q = VSUB(T4o, T4p);
+					     TbM = VADD(T4p, T4o);
+					     TbH = VADD(T3Z, T40);
+					     T41 = VSUB(T3Z, T40);
+					     TbI = VADD(T42, T43);
+					     T44 = VSUB(T42, T43);
+					     T4h = LD(&(xi[WS(is, 75)]), ivs, &(xi[WS(is, 1)]));
+					     T4j = LD(&(xi[WS(is, 107)]), ivs, &(xi[WS(is, 1)]));
+					     T4k = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+					     Tf9 = VSUB(TbL, TbM);
+					     TbN = VADD(TbL, TbM);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V TgB, Tgy, T62, T4H, T65, T4u, T9c, T7X, T9f, T7Q, Tg0, Tga, TfF, TeF, TfT;
+			 V TfU, TfP, Tg7, TfI, Tfy, TfA, Tf0, Tfz, Tfl, Tg2, TfS;
+			 {
+			      V Tc1, TbS, Tfc, Tfj, TdX, Te5, TdZ, TdR, Te7, Te3, TdU, Te4;
+			      {
+				   V TdF, TdS, Tdx, Td5, TdO, TdE, TdC, Tdt, Tdk;
+				   {
+					V Tdc, TdA, T4F, T4c, T7V, T7M, T4G, T4t, T7W, T7P, TdB, Tdj;
+					{
+					     V Td1, Tdg, TbK, Tf8, Tfh, T4b, T7L, T46, T7K, TbQ, Tfa, T4r, T4m, Td4;
+					     TdF = VSUB(TcZ, Td0);
+					     Td1 = VADD(TcZ, Td0);
+					     {
+						  V TbJ, Tf7, T4a, T45;
+						  TbJ = VADD(TbH, TbI);
+						  Tf7 = VSUB(TbI, TbH);
+						  T4a = VSUB(T44, T41);
+						  T45 = VADD(T41, T44);
+						  {
+						       V TbO, T4i, TbP, T4l;
+						       TbO = VADD(T4g, T4h);
+						       T4i = VSUB(T4g, T4h);
+						       TbP = VADD(T4j, T4k);
+						       T4l = VSUB(T4j, T4k);
+						       Tdg = VADD(TbG, TbJ);
+						       TbK = VSUB(TbG, TbJ);
+						       Tf8 = VFMA(LDK(KP414213562), Tf7, Tf6);
+						       Tfh = VFNMS(LDK(KP414213562), Tf6, Tf7);
+						       T4b = VFMA(LDK(KP707106781), T4a, T49);
+						       T7L = VFNMS(LDK(KP707106781), T4a, T49);
+						       T46 = VFMA(LDK(KP707106781), T45, T3Y);
+						       T7K = VFNMS(LDK(KP707106781), T45, T3Y);
+						       TbQ = VADD(TbO, TbP);
+						       Tfa = VSUB(TbP, TbO);
+						       T4r = VSUB(T4l, T4i);
+						       T4m = VADD(T4i, T4l);
+						       Td4 = VADD(Td2, Td3);
+						       TdS = VSUB(Td3, Td2);
+						  }
+					     }
+					     Tdc = VSUB(Td8, Tdb);
+					     TdA = VADD(Td8, Tdb);
+					     T4F = VFNMS(LDK(KP198912367), T46, T4b);
+					     T4c = VFMA(LDK(KP198912367), T4b, T46);
+					     T7V = VFMA(LDK(KP668178637), T7K, T7L);
+					     T7M = VFNMS(LDK(KP668178637), T7L, T7K);
+					     {
+						  V Tdh, TbR, Tfb, Tfi;
+						  Tdh = VADD(TbN, TbQ);
+						  TbR = VSUB(TbN, TbQ);
+						  Tfb = VFNMS(LDK(KP414213562), Tfa, Tf9);
+						  Tfi = VFMA(LDK(KP414213562), Tf9, Tfa);
+						  {
+						       V T4s, T7O, T4n, T7N, Tdi;
+						       T4s = VFMA(LDK(KP707106781), T4r, T4q);
+						       T7O = VFNMS(LDK(KP707106781), T4r, T4q);
+						       T4n = VFMA(LDK(KP707106781), T4m, T4f);
+						       T7N = VFNMS(LDK(KP707106781), T4m, T4f);
+						       Tdx = VADD(Td1, Td4);
+						       Td5 = VSUB(Td1, Td4);
+						       TdO = VSUB(Tdh, Tdg);
+						       Tdi = VADD(Tdg, Tdh);
+						       Tc1 = VSUB(TbR, TbK);
+						       TbS = VADD(TbK, TbR);
+						       TgB = VSUB(Tfb, Tf8);
+						       Tfc = VADD(Tf8, Tfb);
+						       Tgy = VSUB(Tfi, Tfh);
+						       Tfj = VADD(Tfh, Tfi);
+						       T4G = VFMA(LDK(KP198912367), T4n, T4s);
+						       T4t = VFNMS(LDK(KP198912367), T4s, T4n);
+						       T7W = VFNMS(LDK(KP668178637), T7N, T7O);
+						       T7P = VFMA(LDK(KP668178637), T7O, T7N);
+						       TdB = VADD(Tdf, Tdi);
+						       Tdj = VSUB(Tdf, Tdi);
+						  }
+					     }
+					}
+					T62 = VSUB(T4G, T4F);
+					T4H = VADD(T4F, T4G);
+					T65 = VSUB(T4t, T4c);
+					T4u = VADD(T4c, T4t);
+					T9c = VSUB(T7V, T7W);
+					T7X = VADD(T7V, T7W);
+					T9f = VSUB(T7P, T7M);
+					T7Q = VADD(T7M, T7P);
+					TdE = VSUB(TdB, TdA);
+					TdC = VADD(TdA, TdB);
+					Tdt = VSUB(Tdj, Tdc);
+					Tdk = VADD(Tdc, Tdj);
+				   }
+				   {
+					V TdT, Tdl, Tdv, TdJ, Te1, Te2, TdQ, Tdz, TdD, Tdu, Tdw;
+					{
+					     V TdI, TdP, TdV, TdW, TdM, Tds, Tdy;
+					     TdI = VADD(TdG, TdH);
+					     TdT = VSUB(TdH, TdG);
+					     TdP = VFNMS(LDK(KP414213562), TdO, TdN);
+					     TdV = VFMA(LDK(KP414213562), TdN, TdO);
+					     TdW = VFMA(LDK(KP414213562), TdK, TdL);
+					     TdM = VFNMS(LDK(KP414213562), TdL, TdK);
+					     Tdl = VFNMS(LDK(KP707106781), Tdk, Td5);
+					     Tdv = VFMA(LDK(KP707106781), Tdk, Td5);
+					     Tds = VSUB(Tdo, Tdr);
+					     Tdy = VADD(Tdr, Tdo);
+					     TdJ = VFMA(LDK(KP707106781), TdI, TdF);
+					     Te1 = VFNMS(LDK(KP707106781), TdI, TdF);
+					     TdX = VSUB(TdV, TdW);
+					     Te2 = VADD(TdW, TdV);
+					     Te5 = VSUB(TdP, TdM);
+					     TdQ = VADD(TdM, TdP);
+					     Tdz = VADD(Tdx, Tdy);
+					     TdD = VSUB(Tdx, Tdy);
+					     Tdu = VFNMS(LDK(KP707106781), Tdt, Tds);
+					     Tdw = VFMA(LDK(KP707106781), Tdt, Tds);
+					}
+					TdZ = VFMA(LDK(KP923879532), TdQ, TdJ);
+					TdR = VFNMS(LDK(KP923879532), TdQ, TdJ);
+					Te7 = VFMA(LDK(KP923879532), Te2, Te1);
+					Te3 = VFNMS(LDK(KP923879532), Te2, Te1);
+					ST(&(xo[WS(os, 32)]), VFMAI(TdE, TdD), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 96)]), VFNMSI(TdE, TdD), ovs, &(xo[0]));
+					ST(&(xo[0]), VADD(Tdz, TdC), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 64)]), VSUB(Tdz, TdC), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 16)]), VFMAI(Tdw, Tdv), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 112)]), VFNMSI(Tdw, Tdv), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 80)]), VFMAI(Tdu, Tdl), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 48)]), VFNMSI(Tdu, Tdl), ovs, &(xo[0]));
+					TdU = VFMA(LDK(KP707106781), TdT, TdS);
+					Te4 = VFNMS(LDK(KP707106781), TdT, TdS);
+				   }
+			      }
+			      {
+				   V Tcx, TcJ, TcI, Tcy, TcA, Tbm, Tcp, TaZ, Tcs, Tci, Tbv, TcB, TcD, TbT, Tc2;
+				   V TcE, Tat, TaY;
+				   Tcx = VFNMS(LDK(KP707106781), Tas, Tad);
+				   Tat = VFMA(LDK(KP707106781), Tas, Tad);
+				   TaY = VADD(TaI, TaX);
+				   TcJ = VSUB(TaX, TaI);
+				   {
+					V Tce, Tch, Te8, Te6, TdY, Te0;
+					TcI = VFNMS(LDK(KP707106781), Tcd, Tcc);
+					Tce = VFMA(LDK(KP707106781), Tcd, Tcc);
+					Tch = VSUB(Tcf, Tcg);
+					Tcy = VADD(Tcg, Tcf);
+					Te8 = VFNMS(LDK(KP923879532), Te5, Te4);
+					Te6 = VFMA(LDK(KP923879532), Te5, Te4);
+					TdY = VFNMS(LDK(KP923879532), TdX, TdU);
+					Te0 = VFMA(LDK(KP923879532), TdX, TdU);
+					TcA = VFNMS(LDK(KP707106781), Tbl, Tb6);
+					Tbm = VFMA(LDK(KP707106781), Tbl, Tb6);
+					Tcp = VFNMS(LDK(KP923879532), TaY, Tat);
+					TaZ = VFMA(LDK(KP923879532), TaY, Tat);
+					Tcs = VFNMS(LDK(KP923879532), Tch, Tce);
+					Tci = VFMA(LDK(KP923879532), Tch, Tce);
+					ST(&(xo[WS(os, 88)]), VFNMSI(Te6, Te3), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 40)]), VFMAI(Te6, Te3), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 104)]), VFMAI(Te8, Te7), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 24)]), VFNMSI(Te8, Te7), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 8)]), VFMAI(Te0, TdZ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 120)]), VFNMSI(Te0, TdZ), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 72)]), VFMAI(TdY, TdR), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 56)]), VFNMSI(TdY, TdR), ovs, &(xo[0]));
+					Tbv = VFMA(LDK(KP707106781), Tbu, Tbt);
+					TcB = VFNMS(LDK(KP707106781), Tbu, Tbt);
+					TcD = VFNMS(LDK(KP707106781), TbS, TbD);
+					TbT = VFMA(LDK(KP707106781), TbS, TbD);
+					Tc2 = VFMA(LDK(KP707106781), Tc1, Tc0);
+					TcE = VFNMS(LDK(KP707106781), Tc1, Tc0);
+				   }
+				   {
+					V TcR, Tcz, TcU, TcK, Tcq, Tcl, Tct, Tc4;
+					{
+					     V Tck, Tbw, Tcj, Tc3;
+					     Tck = VFMA(LDK(KP198912367), Tbm, Tbv);
+					     Tbw = VFNMS(LDK(KP198912367), Tbv, Tbm);
+					     Tcj = VFMA(LDK(KP198912367), TbT, Tc2);
+					     Tc3 = VFNMS(LDK(KP198912367), Tc2, TbT);
+					     TcR = VFNMS(LDK(KP923879532), Tcy, Tcx);
+					     Tcz = VFMA(LDK(KP923879532), Tcy, Tcx);
+					     TcU = VFMA(LDK(KP923879532), TcJ, TcI);
+					     TcK = VFNMS(LDK(KP923879532), TcJ, TcI);
+					     Tcq = VADD(Tck, Tcj);
+					     Tcl = VSUB(Tcj, Tck);
+					     Tct = VSUB(Tc3, Tbw);
+					     Tc4 = VADD(Tbw, Tc3);
+					}
+					{
+					     V TfN, Tel, TfY, Tfu, Tfw, Tfv, TcT, TcX, TcQ, TcO, TcW, TcY, TcP, TcH, TfZ;
+					     V TeE;
+					     {
+						  V Teu, TcS, TcN, TcV, TcG, TeD;
+						  TfN = VFNMS(LDK(KP923879532), Tek, Ted);
+						  Tel = VFMA(LDK(KP923879532), Tek, Ted);
+						  {
+						       V TcM, TcC, Tcr, Tcv;
+						       TcM = VFNMS(LDK(KP668178637), TcA, TcB);
+						       TcC = VFMA(LDK(KP668178637), TcB, TcA);
+						       Tcr = VFNMS(LDK(KP980785280), Tcq, Tcp);
+						       Tcv = VFMA(LDK(KP980785280), Tcq, Tcp);
+						       {
+							    V Tco, Tcm, Tcu, Tcw;
+							    Tco = VFMA(LDK(KP980785280), Tcl, Tci);
+							    Tcm = VFNMS(LDK(KP980785280), Tcl, Tci);
+							    Tcu = VFMA(LDK(KP980785280), Tct, Tcs);
+							    Tcw = VFNMS(LDK(KP980785280), Tct, Tcs);
+							    {
+								 V Tcn, Tc5, TcL, TcF;
+								 Tcn = VFMA(LDK(KP980785280), Tc4, TaZ);
+								 Tc5 = VFNMS(LDK(KP980785280), Tc4, TaZ);
+								 TcL = VFNMS(LDK(KP668178637), TcD, TcE);
+								 TcF = VFMA(LDK(KP668178637), TcE, TcD);
+								 TfY = VFNMS(LDK(KP923879532), Tft, Tfq);
+								 Tfu = VFMA(LDK(KP923879532), Tft, Tfq);
+								 Tfw = VFMA(LDK(KP198912367), Teq, Tet);
+								 Teu = VFNMS(LDK(KP198912367), Tet, Teq);
+								 ST(&(xo[WS(os, 92)]), VFNMSI(Tcu, Tcr), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 36)]), VFMAI(Tcu, Tcr), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 100)]), VFMAI(Tcw, Tcv), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 28)]), VFNMSI(Tcw, Tcv), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 4)]), VFMAI(Tco, Tcn), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 124)]), VFNMSI(Tco, Tcn), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 68)]), VFMAI(Tcm, Tc5), ovs, &(xo[0]));
+								 ST(&(xo[WS(os, 60)]), VFNMSI(Tcm, Tc5), ovs, &(xo[0]));
+								 TcS = VADD(TcM, TcL);
+								 TcN = VSUB(TcL, TcM);
+								 TcV = VSUB(TcF, TcC);
+								 TcG = VADD(TcC, TcF);
+								 TeD = VFNMS(LDK(KP198912367), TeC, Tez);
+								 Tfv = VFMA(LDK(KP198912367), Tez, TeC);
+							    }
+						       }
+						  }
+						  TcT = VFMA(LDK(KP831469612), TcS, TcR);
+						  TcX = VFNMS(LDK(KP831469612), TcS, TcR);
+						  TcQ = VFMA(LDK(KP831469612), TcN, TcK);
+						  TcO = VFNMS(LDK(KP831469612), TcN, TcK);
+						  TcW = VFNMS(LDK(KP831469612), TcV, TcU);
+						  TcY = VFMA(LDK(KP831469612), TcV, TcU);
+						  TcP = VFMA(LDK(KP831469612), TcG, Tcz);
+						  TcH = VFNMS(LDK(KP831469612), TcG, Tcz);
+						  TfZ = VSUB(TeD, Teu);
+						  TeE = VADD(Teu, TeD);
+					     }
+					     {
+						  V TfQ, TeS, TfO, Tfx, TeZ, TfR, Tfd, Tfk;
+						  TfQ = VFNMS(LDK(KP923879532), TeR, TeK);
+						  TeS = VFMA(LDK(KP923879532), TeR, TeK);
+						  ST(&(xo[WS(os, 84)]), VFMAI(TcW, TcT), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 44)]), VFNMSI(TcW, TcT), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 108)]), VFNMSI(TcY, TcX), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 20)]), VFMAI(TcY, TcX), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 116)]), VFMAI(TcQ, TcP), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 12)]), VFNMSI(TcQ, TcP), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 52)]), VFMAI(TcO, TcH), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 76)]), VFNMSI(TcO, TcH), ovs, &(xo[0]));
+						  Tg0 = VFNMS(LDK(KP980785280), TfZ, TfY);
+						  Tga = VFMA(LDK(KP980785280), TfZ, TfY);
+						  TfF = VFNMS(LDK(KP980785280), TeE, Tel);
+						  TeF = VFMA(LDK(KP980785280), TeE, Tel);
+						  TfO = VADD(Tfw, Tfv);
+						  Tfx = VSUB(Tfv, Tfw);
+						  TeZ = VFMA(LDK(KP923879532), TeY, TeV);
+						  TfR = VFNMS(LDK(KP923879532), TeY, TeV);
+						  TfT = VFNMS(LDK(KP923879532), Tfc, Tf5);
+						  Tfd = VFMA(LDK(KP923879532), Tfc, Tf5);
+						  Tfk = VFMA(LDK(KP923879532), Tfj, Tfg);
+						  TfU = VFNMS(LDK(KP923879532), Tfj, Tfg);
+						  TfP = VFMA(LDK(KP980785280), TfO, TfN);
+						  Tg7 = VFNMS(LDK(KP980785280), TfO, TfN);
+						  TfI = VFNMS(LDK(KP980785280), Tfx, Tfu);
+						  Tfy = VFMA(LDK(KP980785280), Tfx, Tfu);
+						  TfA = VFMA(LDK(KP098491403), TeS, TeZ);
+						  Tf0 = VFNMS(LDK(KP098491403), TeZ, TeS);
+						  Tfz = VFMA(LDK(KP098491403), Tfd, Tfk);
+						  Tfl = VFNMS(LDK(KP098491403), Tfk, Tfd);
+						  Tg2 = VFNMS(LDK(KP820678790), TfQ, TfR);
+						  TfS = VFMA(LDK(KP820678790), TfR, TfQ);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T8x, T8y, T8F, T8w, T8k, T8f, T8n, T80, T9l, T76, T87, T8U, T89, T7e, T7l;
+			      V T8a;
+			      {
+				   V The, Tho, TgT, Tgp, Th7, Th8, Thg, Th6, Th3, Thl, TgW, TgM, TgU, TgP, TgX;
+				   V TgE;
+				   {
+					V Th1, TgI, TgK, TgJ;
+					{
+					     V Tgh, Thc, Tgk, TfG, TfB, TfJ, Tfm, Tg1, TfV, Tgn, TfL, TfH;
+					     Th1 = VFMA(LDK(KP923879532), Tgg, Tgf);
+					     Tgh = VFNMS(LDK(KP923879532), Tgg, Tgf);
+					     Thc = VFNMS(LDK(KP923879532), TgH, TgG);
+					     TgI = VFMA(LDK(KP923879532), TgH, TgG);
+					     TgK = VFMA(LDK(KP668178637), Tgi, Tgj);
+					     Tgk = VFNMS(LDK(KP668178637), Tgj, Tgi);
+					     TfG = VADD(TfA, Tfz);
+					     TfB = VSUB(Tfz, TfA);
+					     TfJ = VSUB(Tfl, Tf0);
+					     Tfm = VADD(Tf0, Tfl);
+					     Tg1 = VFNMS(LDK(KP820678790), TfT, TfU);
+					     TfV = VFMA(LDK(KP820678790), TfU, TfT);
+					     Tgn = VFNMS(LDK(KP668178637), Tgm, Tgl);
+					     TgJ = VFMA(LDK(KP668178637), Tgl, Tgm);
+					     TfL = VFMA(LDK(KP995184726), TfG, TfF);
+					     TfH = VFNMS(LDK(KP995184726), TfG, TfF);
+					     {
+						  V TfE, TfC, TfM, TfK;
+						  TfE = VFMA(LDK(KP995184726), TfB, Tfy);
+						  TfC = VFNMS(LDK(KP995184726), TfB, Tfy);
+						  TfM = VFNMS(LDK(KP995184726), TfJ, TfI);
+						  TfK = VFMA(LDK(KP995184726), TfJ, TfI);
+						  {
+						       V TfD, Tfn, Tg8, Tg3;
+						       TfD = VFMA(LDK(KP995184726), Tfm, TeF);
+						       Tfn = VFNMS(LDK(KP995184726), Tfm, TeF);
+						       Tg8 = VADD(Tg2, Tg1);
+						       Tg3 = VSUB(Tg1, Tg2);
+						       {
+							    V Tgb, TfW, Thd, Tgo;
+							    Tgb = VSUB(TfV, TfS);
+							    TfW = VADD(TfS, TfV);
+							    Thd = VSUB(Tgn, Tgk);
+							    Tgo = VADD(Tgk, Tgn);
+							    ST(&(xo[WS(os, 98)]), VFMAI(TfM, TfL), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 30)]), VFNMSI(TfM, TfL), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 94)]), VFNMSI(TfK, TfH), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 34)]), VFMAI(TfK, TfH), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 2)]), VFMAI(TfE, TfD), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 126)]), VFNMSI(TfE, TfD), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 66)]), VFMAI(TfC, Tfn), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 62)]), VFNMSI(TfC, Tfn), ovs, &(xo[0]));
+							    {
+								 V Tgd, Tg9, Tg6, Tg4;
+								 Tgd = VFNMS(LDK(KP773010453), Tg8, Tg7);
+								 Tg9 = VFMA(LDK(KP773010453), Tg8, Tg7);
+								 Tg6 = VFMA(LDK(KP773010453), Tg3, Tg0);
+								 Tg4 = VFNMS(LDK(KP773010453), Tg3, Tg0);
+								 {
+								      V Tge, Tgc, Tg5, TfX;
+								      Tge = VFMA(LDK(KP773010453), Tgb, Tga);
+								      Tgc = VFNMS(LDK(KP773010453), Tgb, Tga);
+								      Tg5 = VFMA(LDK(KP773010453), TfW, TfP);
+								      TfX = VFNMS(LDK(KP773010453), TfW, TfP);
+								      The = VFMA(LDK(KP831469612), Thd, Thc);
+								      Tho = VFNMS(LDK(KP831469612), Thd, Thc);
+								      TgT = VFMA(LDK(KP831469612), Tgo, Tgh);
+								      Tgp = VFNMS(LDK(KP831469612), Tgo, Tgh);
+								      ST(&(xo[WS(os, 110)]), VFNMSI(Tge, Tgd), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 18)]), VFMAI(Tge, Tgd), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 82)]), VFMAI(Tgc, Tg9), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 46)]), VFNMSI(Tgc, Tg9), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 114)]), VFMAI(Tg6, Tg5), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 14)]), VFNMSI(Tg6, Tg5), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 50)]), VFMAI(Tg4, TfX), ovs, &(xo[0]));
+								      ST(&(xo[WS(os, 78)]), VFNMSI(Tg4, TfX), ovs, &(xo[0]));
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V Th4, Tgs, Tgv, Th5, Tgz, TgC, Th2, TgL;
+					     Th4 = VFMA(LDK(KP923879532), Tgr, Tgq);
+					     Tgs = VFNMS(LDK(KP923879532), Tgr, Tgq);
+					     Tgv = VFMA(LDK(KP923879532), Tgu, Tgt);
+					     Th5 = VFNMS(LDK(KP923879532), Tgu, Tgt);
+					     Th7 = VFMA(LDK(KP923879532), Tgy, Tgx);
+					     Tgz = VFNMS(LDK(KP923879532), Tgy, Tgx);
+					     TgC = VFMA(LDK(KP923879532), TgB, TgA);
+					     Th8 = VFNMS(LDK(KP923879532), TgB, TgA);
+					     Th2 = VADD(TgK, TgJ);
+					     TgL = VSUB(TgJ, TgK);
+					     {
+						  V TgO, Tgw, TgN, TgD;
+						  TgO = VFMA(LDK(KP534511135), Tgs, Tgv);
+						  Tgw = VFNMS(LDK(KP534511135), Tgv, Tgs);
+						  TgN = VFMA(LDK(KP534511135), Tgz, TgC);
+						  TgD = VFNMS(LDK(KP534511135), TgC, Tgz);
+						  Thg = VFNMS(LDK(KP303346683), Th4, Th5);
+						  Th6 = VFMA(LDK(KP303346683), Th5, Th4);
+						  Th3 = VFMA(LDK(KP831469612), Th2, Th1);
+						  Thl = VFNMS(LDK(KP831469612), Th2, Th1);
+						  TgW = VFNMS(LDK(KP831469612), TgL, TgI);
+						  TgM = VFMA(LDK(KP831469612), TgL, TgI);
+						  TgU = VADD(TgO, TgN);
+						  TgP = VSUB(TgN, TgO);
+						  TgX = VSUB(TgD, Tgw);
+						  TgE = VADD(Tgw, TgD);
+					     }
+					}
+				   }
+				   {
+					V T8u, T8v, T7R, T8d, T7G, Thm, Thh, Thp, Tha, T7Y, Thr, Thn;
+					{
+					     V T7y, T7F, TgZ, TgV;
+					     T8u = VFNMS(LDK(KP831469612), T7x, T7q);
+					     T7y = VFMA(LDK(KP831469612), T7x, T7q);
+					     T7F = VFMA(LDK(KP831469612), T7E, T7B);
+					     T8v = VFNMS(LDK(KP831469612), T7E, T7B);
+					     T8x = VFNMS(LDK(KP831469612), T7Q, T7J);
+					     T7R = VFMA(LDK(KP831469612), T7Q, T7J);
+					     TgZ = VFMA(LDK(KP881921264), TgU, TgT);
+					     TgV = VFNMS(LDK(KP881921264), TgU, TgT);
+					     {
+						  V TgS, TgQ, Th0, TgY;
+						  TgS = VFMA(LDK(KP881921264), TgP, TgM);
+						  TgQ = VFNMS(LDK(KP881921264), TgP, TgM);
+						  Th0 = VFNMS(LDK(KP881921264), TgX, TgW);
+						  TgY = VFMA(LDK(KP881921264), TgX, TgW);
+						  {
+						       V TgR, TgF, Thf, Th9;
+						       TgR = VFMA(LDK(KP881921264), TgE, Tgp);
+						       TgF = VFNMS(LDK(KP881921264), TgE, Tgp);
+						       Thf = VFNMS(LDK(KP303346683), Th7, Th8);
+						       Th9 = VFMA(LDK(KP303346683), Th8, Th7);
+						       T8d = VFNMS(LDK(KP148335987), T7y, T7F);
+						       T7G = VFMA(LDK(KP148335987), T7F, T7y);
+						       ST(&(xo[WS(os, 106)]), VFMAI(Th0, TgZ), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 22)]), VFNMSI(Th0, TgZ), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 86)]), VFNMSI(TgY, TgV), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 42)]), VFMAI(TgY, TgV), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 10)]), VFMAI(TgS, TgR), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 118)]), VFNMSI(TgS, TgR), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 74)]), VFMAI(TgQ, TgF), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 54)]), VFNMSI(TgQ, TgF), ovs, &(xo[0]));
+						       Thm = VADD(Thg, Thf);
+						       Thh = VSUB(Thf, Thg);
+						       Thp = VSUB(Th9, Th6);
+						       Tha = VADD(Th6, Th9);
+						       T7Y = VFMA(LDK(KP831469612), T7X, T7U);
+						       T8y = VFNMS(LDK(KP831469612), T7X, T7U);
+						  }
+					     }
+					}
+					Thr = VFNMS(LDK(KP956940335), Thm, Thl);
+					Thn = VFMA(LDK(KP956940335), Thm, Thl);
+					{
+					     V Thk, Thi, Ths, Thq;
+					     Thk = VFMA(LDK(KP956940335), Thh, The);
+					     Thi = VFNMS(LDK(KP956940335), Thh, The);
+					     Ths = VFMA(LDK(KP956940335), Thp, Tho);
+					     Thq = VFNMS(LDK(KP956940335), Thp, Tho);
+					     {
+						  V Thj, Thb, T8e, T7Z;
+						  Thj = VFMA(LDK(KP956940335), Tha, Th3);
+						  Thb = VFNMS(LDK(KP956940335), Tha, Th3);
+						  T8e = VFNMS(LDK(KP148335987), T7R, T7Y);
+						  T7Z = VFMA(LDK(KP148335987), T7Y, T7R);
+						  T8F = VFMA(LDK(KP741650546), T8u, T8v);
+						  T8w = VFNMS(LDK(KP741650546), T8v, T8u);
+						  ST(&(xo[WS(os, 102)]), VFNMSI(Ths, Thr), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 26)]), VFMAI(Ths, Thr), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 90)]), VFMAI(Thq, Thn), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 38)]), VFNMSI(Thq, Thn), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 122)]), VFMAI(Thk, Thj), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 6)]), VFNMSI(Thk, Thj), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 58)]), VFMAI(Thi, Thb), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 70)]), VFNMSI(Thi, Thb), ovs, &(xo[0]));
+						  T8k = VADD(T8d, T8e);
+						  T8f = VSUB(T8d, T8e);
+						  T8n = VSUB(T7Z, T7G);
+						  T80 = VADD(T7G, T7Z);
+					     }
+					}
+					T9l = VSUB(T75, T72);
+					T76 = VADD(T72, T75);
+					T87 = VSUB(T85, T86);
+					T8U = VADD(T85, T86);
+					T89 = VFNMS(LDK(KP303346683), T7a, T7d);
+					T7e = VFMA(LDK(KP303346683), T7d, T7a);
+					T7l = VFMA(LDK(KP303346683), T7k, T7h);
+					T8a = VFNMS(LDK(KP303346683), T7h, T7k);
+				   }
+			      }
+			      {
+				   V T11, T5h, T5a, T55, T5d, T4K, T5C, T5x, T5F, T5q, T4X, T4Z, T1C, T2d, T50;
+				   {
+					V T5k, T3g, T3t, T5l, T5n, T4v, T4I, T5o, T8G, T8z;
+					T5k = VFNMS(LDK(KP980785280), T3f, T2G);
+					T3g = VFMA(LDK(KP980785280), T3f, T2G);
+					T8G = VFMA(LDK(KP741650546), T8x, T8y);
+					T8z = VFNMS(LDK(KP741650546), T8y, T8x);
+					{
+					     V T8r, T77, T8C, T88;
+					     T8r = VFNMS(LDK(KP831469612), T76, T6Z);
+					     T77 = VFMA(LDK(KP831469612), T76, T6Z);
+					     T8C = VFNMS(LDK(KP831469612), T87, T84);
+					     T88 = VFMA(LDK(KP831469612), T87, T84);
+					     {
+						  V T8D, T7m, T8s, T8b;
+						  T8D = VSUB(T7l, T7e);
+						  T7m = VADD(T7e, T7l);
+						  T8s = VADD(T89, T8a);
+						  T8b = VSUB(T89, T8a);
+						  {
+						       V T8M, T8H, T8P, T8A;
+						       T8M = VADD(T8F, T8G);
+						       T8H = VSUB(T8F, T8G);
+						       T8P = VSUB(T8z, T8w);
+						       T8A = VADD(T8w, T8z);
+						       {
+							    V T8E, T8O, T8j, T7n;
+							    T8E = VFNMS(LDK(KP956940335), T8D, T8C);
+							    T8O = VFMA(LDK(KP956940335), T8D, T8C);
+							    T8j = VFNMS(LDK(KP956940335), T7m, T77);
+							    T7n = VFMA(LDK(KP956940335), T7m, T77);
+							    {
+								 V T8t, T8L, T8m, T8c;
+								 T8t = VFNMS(LDK(KP956940335), T8s, T8r);
+								 T8L = VFMA(LDK(KP956940335), T8s, T8r);
+								 T8m = VFNMS(LDK(KP956940335), T8b, T88);
+								 T8c = VFMA(LDK(KP956940335), T8b, T88);
+								 {
+								      V T8K, T8I, T8S, T8Q;
+								      T8K = VFMA(LDK(KP803207531), T8H, T8E);
+								      T8I = VFNMS(LDK(KP803207531), T8H, T8E);
+								      T8S = VFMA(LDK(KP803207531), T8P, T8O);
+								      T8Q = VFNMS(LDK(KP803207531), T8P, T8O);
+								      {
+									   V T8p, T8l, T8h, T81;
+									   T8p = VFNMS(LDK(KP989176509), T8k, T8j);
+									   T8l = VFMA(LDK(KP989176509), T8k, T8j);
+									   T8h = VFMA(LDK(KP989176509), T80, T7n);
+									   T81 = VFNMS(LDK(KP989176509), T80, T7n);
+									   {
+										V T8J, T8B, T8R, T8N;
+										T8J = VFMA(LDK(KP803207531), T8A, T8t);
+										T8B = VFNMS(LDK(KP803207531), T8A, T8t);
+										T8R = VFMA(LDK(KP803207531), T8M, T8L);
+										T8N = VFNMS(LDK(KP803207531), T8M, T8L);
+										{
+										     V T8q, T8o, T8i, T8g;
+										     T8q = VFNMS(LDK(KP989176509), T8n, T8m);
+										     T8o = VFMA(LDK(KP989176509), T8n, T8m);
+										     T8i = VFMA(LDK(KP989176509), T8f, T8c);
+										     T8g = VFNMS(LDK(KP989176509), T8f, T8c);
+										     ST(&(xo[WS(os, 115)]), VFMAI(T8K, T8J), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 13)]), VFNMSI(T8K, T8J), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 51)]), VFMAI(T8I, T8B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 77)]), VFNMSI(T8I, T8B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 109)]), VFNMSI(T8S, T8R), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 19)]), VFMAI(T8S, T8R), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 83)]), VFMAI(T8Q, T8N), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 45)]), VFNMSI(T8Q, T8N), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 99)]), VFMAI(T8q, T8p), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 29)]), VFNMSI(T8q, T8p), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 93)]), VFNMSI(T8o, T8l), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 35)]), VFMAI(T8o, T8l), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 3)]), VFMAI(T8i, T8h), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 125)]), VFNMSI(T8i, T8h), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 67)]), VFMAI(T8g, T81), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 61)]), VFNMSI(T8g, T81), ovs, &(xo[WS(os, 1)]));
+										     T3t = VFMA(LDK(KP980785280), T3s, T3p);
+										     T5l = VFNMS(LDK(KP980785280), T3s, T3p);
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T5n = VFNMS(LDK(KP980785280), T4u, T3V);
+					T4v = VFMA(LDK(KP980785280), T4u, T3V);
+					T4I = VFMA(LDK(KP980785280), T4H, T4E);
+					T5o = VFNMS(LDK(KP980785280), T4H, T4E);
+					{
+					     V T53, T3u, T54, T4J, T5v, T5m, T5w, T5p, T10;
+					     T6b = VSUB(TZ, TI);
+					     T10 = VADD(TI, TZ);
+					     T53 = VFMA(LDK(KP049126849), T3g, T3t);
+					     T3u = VFNMS(LDK(KP049126849), T3t, T3g);
+					     T54 = VFMA(LDK(KP049126849), T4v, T4I);
+					     T4J = VFNMS(LDK(KP049126849), T4I, T4v);
+					     T5v = VFNMS(LDK(KP906347169), T5k, T5l);
+					     T5m = VFMA(LDK(KP906347169), T5l, T5k);
+					     T5w = VFNMS(LDK(KP906347169), T5n, T5o);
+					     T5p = VFMA(LDK(KP906347169), T5o, T5n);
+					     T11 = VFMA(LDK(KP980785280), T10, Tr);
+					     T5h = VFNMS(LDK(KP980785280), T10, Tr);
+					     T5a = VADD(T53, T54);
+					     T55 = VSUB(T53, T54);
+					     T5d = VSUB(T4J, T3u);
+					     T4K = VADD(T3u, T4J);
+					     T5C = VADD(T5v, T5w);
+					     T5x = VSUB(T5v, T5w);
+					     T5F = VSUB(T5p, T5m);
+					     T5q = VADD(T5m, T5p);
+					     T4X = VSUB(T4V, T4W);
+					     T5K = VADD(T4V, T4W);
+					}
+					T4Z = VFMA(LDK(KP098491403), T1s, T1B);
+					T1C = VFNMS(LDK(KP098491403), T1B, T1s);
+					T2d = VFNMS(LDK(KP098491403), T2c, T23);
+					T50 = VFMA(LDK(KP098491403), T23, T2c);
+				   }
+				   {
+					V T9y, T9t, T9B, T9i, T9o, T9n, T9F, T8V, T9Q, T9m, T9R, T92, Ta0, T9V, Ta3;
+					V T9O;
+					{
+					     V T9I, T9J, T9L, T9d, T5s, T4Y, T5t, T2e, T5i, T51, T9r, T9a, T9g, T9M, T96;
+					     V T99;
+					     T9I = VFMA(LDK(KP831469612), T95, T94);
+					     T96 = VFNMS(LDK(KP831469612), T95, T94);
+					     T99 = VFNMS(LDK(KP831469612), T98, T97);
+					     T9J = VFMA(LDK(KP831469612), T98, T97);
+					     T9L = VFMA(LDK(KP831469612), T9c, T9b);
+					     T9d = VFNMS(LDK(KP831469612), T9c, T9b);
+					     T5s = VFNMS(LDK(KP980785280), T4X, T4U);
+					     T4Y = VFMA(LDK(KP980785280), T4X, T4U);
+					     T5t = VSUB(T2d, T1C);
+					     T2e = VADD(T1C, T2d);
+					     T5i = VADD(T4Z, T50);
+					     T51 = VSUB(T4Z, T50);
+					     T9r = VFNMS(LDK(KP599376933), T96, T99);
+					     T9a = VFMA(LDK(KP599376933), T99, T96);
+					     T9g = VFNMS(LDK(KP831469612), T9f, T9e);
+					     T9M = VFMA(LDK(KP831469612), T9f, T9e);
+					     {
+						  V T5u, T5E, T8Y, T91;
+						  T5u = VFMA(LDK(KP995184726), T5t, T5s);
+						  T5E = VFNMS(LDK(KP995184726), T5t, T5s);
+						  {
+						       V T59, T2f, T5j, T5B;
+						       T59 = VFNMS(LDK(KP995184726), T2e, T11);
+						       T2f = VFMA(LDK(KP995184726), T2e, T11);
+						       T5j = VFMA(LDK(KP995184726), T5i, T5h);
+						       T5B = VFNMS(LDK(KP995184726), T5i, T5h);
+						       {
+							    V T5c, T52, T9s, T9h;
+							    T5c = VFNMS(LDK(KP995184726), T51, T4Y);
+							    T52 = VFMA(LDK(KP995184726), T51, T4Y);
+							    T9s = VFNMS(LDK(KP599376933), T9d, T9g);
+							    T9h = VFMA(LDK(KP599376933), T9g, T9d);
+							    {
+								 V T5A, T5y, T5I, T5G;
+								 T5A = VFMA(LDK(KP740951125), T5x, T5u);
+								 T5y = VFNMS(LDK(KP740951125), T5x, T5u);
+								 T5I = VFNMS(LDK(KP740951125), T5F, T5E);
+								 T5G = VFMA(LDK(KP740951125), T5F, T5E);
+								 {
+								      V T5f, T5b, T57, T4L;
+								      T5f = VFMA(LDK(KP998795456), T5a, T59);
+								      T5b = VFNMS(LDK(KP998795456), T5a, T59);
+								      T57 = VFMA(LDK(KP998795456), T4K, T2f);
+								      T4L = VFNMS(LDK(KP998795456), T4K, T2f);
+								      {
+									   V T5z, T5r, T5H, T5D;
+									   T5z = VFMA(LDK(KP740951125), T5q, T5j);
+									   T5r = VFNMS(LDK(KP740951125), T5q, T5j);
+									   T5H = VFNMS(LDK(KP740951125), T5C, T5B);
+									   T5D = VFMA(LDK(KP740951125), T5C, T5B);
+									   {
+										V T5g, T5e, T58, T56;
+										T5g = VFMA(LDK(KP998795456), T5d, T5c);
+										T5e = VFNMS(LDK(KP998795456), T5d, T5c);
+										T58 = VFMA(LDK(KP998795456), T55, T52);
+										T56 = VFNMS(LDK(KP998795456), T55, T52);
+										T9y = VADD(T9r, T9s);
+										T9t = VSUB(T9r, T9s);
+										T9B = VSUB(T9h, T9a);
+										T9i = VADD(T9a, T9h);
+										ST(&(xo[WS(os, 15)]), VFMAI(T5A, T5z), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 113)]), VFNMSI(T5A, T5z), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 79)]), VFMAI(T5y, T5r), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 49)]), VFNMSI(T5y, T5r), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 111)]), VFMAI(T5I, T5H), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 17)]), VFNMSI(T5I, T5H), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 81)]), VFNMSI(T5G, T5D), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 47)]), VFMAI(T5G, T5D), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 97)]), VFNMSI(T5g, T5f), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 31)]), VFMAI(T5g, T5f), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 95)]), VFMAI(T5e, T5b), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 33)]), VFNMSI(T5e, T5b), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 127)]), VFMAI(T58, T57), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 1)]), VFNMSI(T58, T57), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 63)]), VFMAI(T56, T4L), ovs, &(xo[WS(os, 1)]));
+										ST(&(xo[WS(os, 65)]), VFNMSI(T56, T4L), ovs, &(xo[WS(os, 1)]));
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+						  T9o = VFNMS(LDK(KP534511135), T8W, T8X);
+						  T8Y = VFMA(LDK(KP534511135), T8X, T8W);
+						  T91 = VFMA(LDK(KP534511135), T90, T8Z);
+						  T9n = VFNMS(LDK(KP534511135), T8Z, T90);
+						  {
+						       V T9T, T9K, T9U, T9N;
+						       T9T = VFMA(LDK(KP250486960), T9I, T9J);
+						       T9K = VFNMS(LDK(KP250486960), T9J, T9I);
+						       T9U = VFMA(LDK(KP250486960), T9L, T9M);
+						       T9N = VFNMS(LDK(KP250486960), T9M, T9L);
+						       T9F = VFNMS(LDK(KP831469612), T8U, T8T);
+						       T8V = VFMA(LDK(KP831469612), T8U, T8T);
+						       T9Q = VFNMS(LDK(KP831469612), T9l, T9k);
+						       T9m = VFMA(LDK(KP831469612), T9l, T9k);
+						       T9R = VSUB(T8Y, T91);
+						       T92 = VADD(T8Y, T91);
+						       Ta0 = VADD(T9T, T9U);
+						       T9V = VSUB(T9T, T9U);
+						       Ta3 = VSUB(T9N, T9K);
+						       T9O = VADD(T9K, T9N);
+						  }
+					     }
+					}
+					{
+					     V T6y, T6z, T63, T9Y, T9W, Ta6, Ta4, T9D, T9z, T9v, T9j, T6h, T60, T9H, T9Z;
+					     V T9A, T9q, T66, T9X, T9P;
+					     {
+						  V T5W, T9S, Ta2, T9x, T93, T5Z, T9G, T9p;
+						  T6y = VFMA(LDK(KP980785280), T5V, T5U);
+						  T5W = VFNMS(LDK(KP980785280), T5V, T5U);
+						  T9S = VFMA(LDK(KP881921264), T9R, T9Q);
+						  Ta2 = VFNMS(LDK(KP881921264), T9R, T9Q);
+						  T9x = VFNMS(LDK(KP881921264), T92, T8V);
+						  T93 = VFMA(LDK(KP881921264), T92, T8V);
+						  T5Z = VFMA(LDK(KP980785280), T5Y, T5X);
+						  T6z = VFNMS(LDK(KP980785280), T5Y, T5X);
+						  T6B = VFMA(LDK(KP980785280), T62, T61);
+						  T63 = VFNMS(LDK(KP980785280), T62, T61);
+						  T9G = VADD(T9o, T9n);
+						  T9p = VSUB(T9n, T9o);
+						  T9Y = VFMA(LDK(KP970031253), T9V, T9S);
+						  T9W = VFNMS(LDK(KP970031253), T9V, T9S);
+						  Ta6 = VFMA(LDK(KP970031253), Ta3, Ta2);
+						  Ta4 = VFNMS(LDK(KP970031253), Ta3, Ta2);
+						  T9D = VFNMS(LDK(KP857728610), T9y, T9x);
+						  T9z = VFMA(LDK(KP857728610), T9y, T9x);
+						  T9v = VFMA(LDK(KP857728610), T9i, T93);
+						  T9j = VFNMS(LDK(KP857728610), T9i, T93);
+						  T6h = VFMA(LDK(KP472964775), T5W, T5Z);
+						  T60 = VFNMS(LDK(KP472964775), T5Z, T5W);
+						  T9H = VFMA(LDK(KP881921264), T9G, T9F);
+						  T9Z = VFNMS(LDK(KP881921264), T9G, T9F);
+						  T9A = VFNMS(LDK(KP881921264), T9p, T9m);
+						  T9q = VFMA(LDK(KP881921264), T9p, T9m);
+						  T66 = VFMA(LDK(KP980785280), T65, T64);
+						  T6C = VFNMS(LDK(KP980785280), T65, T64);
+					     }
+					     T9X = VFMA(LDK(KP970031253), T9O, T9H);
+					     T9P = VFNMS(LDK(KP970031253), T9O, T9H);
+					     {
+						  V Ta5, Ta1, T9E, T9C;
+						  Ta5 = VFMA(LDK(KP970031253), Ta0, T9Z);
+						  Ta1 = VFNMS(LDK(KP970031253), Ta0, T9Z);
+						  T9E = VFNMS(LDK(KP857728610), T9B, T9A);
+						  T9C = VFMA(LDK(KP857728610), T9B, T9A);
+						  {
+						       V T9w, T9u, T6i, T67;
+						       T9w = VFMA(LDK(KP857728610), T9t, T9q);
+						       T9u = VFNMS(LDK(KP857728610), T9t, T9q);
+						       T6i = VFMA(LDK(KP472964775), T63, T66);
+						       T67 = VFNMS(LDK(KP472964775), T66, T63);
+						       T6J = VFNMS(LDK(KP357805721), T6y, T6z);
+						       T6A = VFMA(LDK(KP357805721), T6z, T6y);
+						       ST(&(xo[WS(os, 123)]), VFMAI(T9Y, T9X), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 5)]), VFNMSI(T9Y, T9X), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 59)]), VFMAI(T9W, T9P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 69)]), VFNMSI(T9W, T9P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 101)]), VFNMSI(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 27)]), VFMAI(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 91)]), VFMAI(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 37)]), VFNMSI(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 107)]), VFMAI(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 21)]), VFNMSI(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 85)]), VFNMSI(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 43)]), VFMAI(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 11)]), VFMAI(T9w, T9v), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 117)]), VFNMSI(T9w, T9v), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 75)]), VFMAI(T9u, T9j), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 53)]), VFNMSI(T9u, T9j), ovs, &(xo[WS(os, 1)]));
+						       T6o = VADD(T6h, T6i);
+						       T6j = VSUB(T6h, T6i);
+						       T6r = VSUB(T67, T60);
+						       T68 = VADD(T60, T67);
+						  }
+					     }
+					     T6e = VFMA(LDK(KP820678790), T5M, T5N);
+					     T5O = VFNMS(LDK(KP820678790), T5N, T5M);
+					     T5R = VFNMS(LDK(KP820678790), T5Q, T5P);
+					     T6d = VFMA(LDK(KP820678790), T5P, T5Q);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T6D = VFMA(LDK(KP357805721), T6C, T6B);
+	       T6K = VFNMS(LDK(KP357805721), T6B, T6C);
+	       {
+		    V T5L, T6v, T6c, T6G;
+		    T5L = VFNMS(LDK(KP980785280), T5K, T5J);
+		    T6v = VFMA(LDK(KP980785280), T5K, T5J);
+		    T6c = VFNMS(LDK(KP980785280), T6b, T6a);
+		    T6G = VFMA(LDK(KP980785280), T6b, T6a);
+		    {
+			 V T5S, T6H, T6f, T6w;
+			 T5S = VADD(T5O, T5R);
+			 T6H = VSUB(T5O, T5R);
+			 T6f = VSUB(T6d, T6e);
+			 T6w = VADD(T6e, T6d);
+			 {
+			      V T6L, T6Q, T6E, T6T;
+			      T6L = VSUB(T6J, T6K);
+			      T6Q = VADD(T6J, T6K);
+			      T6E = VADD(T6A, T6D);
+			      T6T = VSUB(T6D, T6A);
+			      {
+				   V T6S, T6I, T5T, T6n;
+				   T6S = VFNMS(LDK(KP773010453), T6H, T6G);
+				   T6I = VFMA(LDK(KP773010453), T6H, T6G);
+				   T5T = VFNMS(LDK(KP773010453), T5S, T5L);
+				   T6n = VFMA(LDK(KP773010453), T5S, T5L);
+				   {
+					V T6P, T6x, T6g, T6q;
+					T6P = VFNMS(LDK(KP773010453), T6w, T6v);
+					T6x = VFMA(LDK(KP773010453), T6w, T6v);
+					T6g = VFNMS(LDK(KP773010453), T6f, T6c);
+					T6q = VFMA(LDK(KP773010453), T6f, T6c);
+					{
+					     V T6M, T6O, T6U, T6W;
+					     T6M = VFNMS(LDK(KP941544065), T6L, T6I);
+					     T6O = VFMA(LDK(KP941544065), T6L, T6I);
+					     T6U = VFMA(LDK(KP941544065), T6T, T6S);
+					     T6W = VFNMS(LDK(KP941544065), T6T, T6S);
+					     {
+						  V T6p, T6t, T69, T6l;
+						  T6p = VFNMS(LDK(KP903989293), T6o, T6n);
+						  T6t = VFMA(LDK(KP903989293), T6o, T6n);
+						  T69 = VFNMS(LDK(KP903989293), T68, T5T);
+						  T6l = VFMA(LDK(KP903989293), T68, T5T);
+						  {
+						       V T6F, T6N, T6R, T6V;
+						       T6F = VFNMS(LDK(KP941544065), T6E, T6x);
+						       T6N = VFMA(LDK(KP941544065), T6E, T6x);
+						       T6R = VFMA(LDK(KP941544065), T6Q, T6P);
+						       T6V = VFNMS(LDK(KP941544065), T6Q, T6P);
+						       {
+							    V T6s, T6u, T6k, T6m;
+							    T6s = VFNMS(LDK(KP903989293), T6r, T6q);
+							    T6u = VFMA(LDK(KP903989293), T6r, T6q);
+							    T6k = VFNMS(LDK(KP903989293), T6j, T6g);
+							    T6m = VFMA(LDK(KP903989293), T6j, T6g);
+							    ST(&(xo[WS(os, 7)]), VFMAI(T6O, T6N), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 121)]), VFNMSI(T6O, T6N), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 71)]), VFMAI(T6M, T6F), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 57)]), VFNMSI(T6M, T6F), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 103)]), VFMAI(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 25)]), VFNMSI(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 89)]), VFNMSI(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 39)]), VFMAI(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 105)]), VFNMSI(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 23)]), VFMAI(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 87)]), VFMAI(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 41)]), VFNMSI(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 119)]), VFMAI(T6m, T6l), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 9)]), VFNMSI(T6m, T6l), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 55)]), VFMAI(T6k, T69), ovs, &(xo[WS(os, 1)]));
+							    ST(&(xo[WS(os, 73)]), VFNMSI(T6k, T69), ovs, &(xo[WS(os, 1)]));
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 128, XSIMD_STRING("n1fv_128"), {440, 0, 642, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_128) (planner *p) {
+     X(kdft_register) (p, n1fv_128, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 128 -name n1fv_128 -include n1f.h */
+
+/*
+ * This function contains 1082 FP additions, 330 FP multiplications,
+ * (or, 938 additions, 186 multiplications, 144 fused multiply/add),
+ * 194 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_128(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DVK(KP336889853, +0.336889853392220050689253212619147570477766780);
+     DVK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DVK(KP427555093, +0.427555093430282094320966856888798534304578629);
+     DVK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DVK(KP242980179, +0.242980179903263889948274162077471118320990783);
+     DVK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DVK(KP514102744, +0.514102744193221726593693838968815772608049120);
+     DVK(KP671558954, +0.671558954847018400625376850427421803228750632);
+     DVK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DVK(KP049067674, +0.049067674327418014254954976942682658314745363);
+     DVK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DVK(KP595699304, +0.595699304492433343467036528829969889511926338);
+     DVK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DVK(KP146730474, +0.146730474455361751658850129646717819706215317);
+     DVK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V Tr, T5J, Ted, Tgf, Tfq, TgH, T4U, T6b, T6Z, T8T, Tad, TcZ, Tcc, Td0, T84;
+	       V T9l, Tb6, Tbt, T2G, T5X, TeV, Tgr, T3p, T5V, T7B, T95, TeK, Tgt, T7q, T97;
+	       V Td8, TdK, TbD, Tc0, T3V, T61, Tfg, TgB, T4E, T65, T7U, T9f, Tf5, Tgx, T7J;
+	       V T9b, Tdf, TdN, Td2, Td3, TI, T4V, Tft, Tgg, TZ, T4W, T75, T86, Tek, TgG;
+	       V T72, T85, Tas, Tcd, Tdp, Tdq, TdG, Teq, Tgm, Tet, Tgl, T1s, T5P, T1B, T5Q;
+	       V T7d, T8Z, TaI, Tcf, T7a, T90, Tdm, Tdn, TdH, Tez, Tgi, TeC, Tgj, T23, T5N;
+	       V T2c, T5M, T7k, T8X, TaX, Tcg, T7h, T8W, Tbl, Tbu, Tdb, TdL, TeY, Tgu, TeR;
+	       V Tgq, T7x, T98, T7E, T94, T3f, T5Y, T3s, T5U, TbS, Tc1, Tdi, TdO, Tfj, Tgy;
+	       V Tfc, TgA, T7Q, T9e, T7X, T9c, T4u, T64, T4H, T62;
+	       {
+		    V T3, Ta7, T4P, Ta8, Ta, Tab, T4M, Taa, Tc9, Tca, Ti, Tea, T4S, Tc6, Tc7;
+		    V Tp, Teb, T4R;
+		    {
+			 V T1, T2, T4N, T4O;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 64)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 Ta7 = VADD(T1, T2);
+			 T4N = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T4O = LD(&(xi[WS(is, 96)]), ivs, &(xi[0]));
+			 T4P = VSUB(T4N, T4O);
+			 Ta8 = VADD(T4N, T4O);
+		    }
+		    {
+			 V T4, T5, T6, T7, T8, T9;
+			 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 80)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T7 = LD(&(xi[WS(is, 112)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tab = VADD(T7, T8);
+			 T4M = VMUL(LDK(KP707106781), VSUB(T9, T6));
+			 Taa = VADD(T4, T5);
+		    }
+		    {
+			 V Te, Th, Tl, To;
+			 {
+			      V Tc, Td, Tf, Tg;
+			      Tc = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      Td = LD(&(xi[WS(is, 72)]), ivs, &(xi[0]));
+			      Te = VSUB(Tc, Td);
+			      Tc9 = VADD(Tc, Td);
+			      Tf = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      Tg = LD(&(xi[WS(is, 104)]), ivs, &(xi[0]));
+			      Th = VSUB(Tf, Tg);
+			      Tca = VADD(Tf, Tg);
+			 }
+			 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 Tea = VSUB(Tc9, Tca);
+			 T4S = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 {
+			      V Tj, Tk, Tm, Tn;
+			      Tj = LD(&(xi[WS(is, 120)]), ivs, &(xi[0]));
+			      Tk = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      Tl = VSUB(Tj, Tk);
+			      Tc6 = VADD(Tj, Tk);
+			      Tm = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      Tn = LD(&(xi[WS(is, 88)]), ivs, &(xi[0]));
+			      To = VSUB(Tm, Tn);
+			      Tc7 = VADD(Tm, Tn);
+			 }
+			 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
+			 Teb = VSUB(Tc6, Tc7);
+			 T4R = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+		    }
+		    {
+			 V Tb, Tq, Te9, Tec;
+			 Tb = VADD(T3, Ta);
+			 Tq = VADD(Ti, Tp);
+			 Tr = VADD(Tb, Tq);
+			 T5J = VSUB(Tb, Tq);
+			 Te9 = VSUB(Ta7, Ta8);
+			 Tec = VMUL(LDK(KP707106781), VADD(Tea, Teb));
+			 Ted = VADD(Te9, Tec);
+			 Tgf = VSUB(Te9, Tec);
+		    }
+		    {
+			 V Tfo, Tfp, T4Q, T4T;
+			 Tfo = VSUB(Tab, Taa);
+			 Tfp = VMUL(LDK(KP707106781), VSUB(Teb, Tea));
+			 Tfq = VADD(Tfo, Tfp);
+			 TgH = VSUB(Tfp, Tfo);
+			 T4Q = VSUB(T4M, T4P);
+			 T4T = VSUB(T4R, T4S);
+			 T4U = VADD(T4Q, T4T);
+			 T6b = VSUB(T4T, T4Q);
+		    }
+		    {
+			 V T6X, T6Y, Ta9, Tac;
+			 T6X = VSUB(T3, Ta);
+			 T6Y = VADD(T4S, T4R);
+			 T6Z = VADD(T6X, T6Y);
+			 T8T = VSUB(T6X, T6Y);
+			 Ta9 = VADD(Ta7, Ta8);
+			 Tac = VADD(Taa, Tab);
+			 Tad = VSUB(Ta9, Tac);
+			 TcZ = VADD(Ta9, Tac);
+		    }
+		    {
+			 V Tc8, Tcb, T82, T83;
+			 Tc8 = VADD(Tc6, Tc7);
+			 Tcb = VADD(Tc9, Tca);
+			 Tcc = VSUB(Tc8, Tcb);
+			 Td0 = VADD(Tcb, Tc8);
+			 T82 = VADD(T4P, T4M);
+			 T83 = VSUB(Tp, Ti);
+			 T84 = VADD(T82, T83);
+			 T9l = VSUB(T83, T82);
+		    }
+	       }
+	       {
+		    V Tb0, Tb1, T2i, Tb2, T3k, Tb3, Tb4, T2p, Tb5, T3h, T2x, TeH, T3n, Tbs, T2E;
+		    V TeI, T3m, Tbp, T2l, T2o, TeG, TeJ;
+		    {
+			 V T2g, T2h, T3i, T3j;
+			 T2g = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2h = LD(&(xi[WS(is, 65)]), ivs, &(xi[WS(is, 1)]));
+			 Tb0 = VADD(T2g, T2h);
+			 T3i = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T3j = LD(&(xi[WS(is, 97)]), ivs, &(xi[WS(is, 1)]));
+			 Tb1 = VADD(T3i, T3j);
+			 T2i = VSUB(T2g, T2h);
+			 Tb2 = VADD(Tb0, Tb1);
+			 T3k = VSUB(T3i, T3j);
+		    }
+		    {
+			 V T2j, T2k, T2m, T2n;
+			 T2j = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T2k = LD(&(xi[WS(is, 81)]), ivs, &(xi[WS(is, 1)]));
+			 T2l = VSUB(T2j, T2k);
+			 Tb3 = VADD(T2j, T2k);
+			 T2m = LD(&(xi[WS(is, 113)]), ivs, &(xi[WS(is, 1)]));
+			 T2n = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T2o = VSUB(T2m, T2n);
+			 Tb4 = VADD(T2m, T2n);
+		    }
+		    T2p = VMUL(LDK(KP707106781), VADD(T2l, T2o));
+		    Tb5 = VADD(Tb3, Tb4);
+		    T3h = VMUL(LDK(KP707106781), VSUB(T2o, T2l));
+		    {
+			 V T2t, Tbq, T2w, Tbr;
+			 {
+			      V T2r, T2s, T2u, T2v;
+			      T2r = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      T2s = LD(&(xi[WS(is, 73)]), ivs, &(xi[WS(is, 1)]));
+			      T2t = VSUB(T2r, T2s);
+			      Tbq = VADD(T2r, T2s);
+			      T2u = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			      T2v = LD(&(xi[WS(is, 105)]), ivs, &(xi[WS(is, 1)]));
+			      T2w = VSUB(T2u, T2v);
+			      Tbr = VADD(T2u, T2v);
+			 }
+			 T2x = VFNMS(LDK(KP382683432), T2w, VMUL(LDK(KP923879532), T2t));
+			 TeH = VSUB(Tbq, Tbr);
+			 T3n = VFMA(LDK(KP382683432), T2t, VMUL(LDK(KP923879532), T2w));
+			 Tbs = VADD(Tbq, Tbr);
+		    }
+		    {
+			 V T2A, Tbn, T2D, Tbo;
+			 {
+			      V T2y, T2z, T2B, T2C;
+			      T2y = LD(&(xi[WS(is, 121)]), ivs, &(xi[WS(is, 1)]));
+			      T2z = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			      T2A = VSUB(T2y, T2z);
+			      Tbn = VADD(T2y, T2z);
+			      T2B = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			      T2C = LD(&(xi[WS(is, 89)]), ivs, &(xi[WS(is, 1)]));
+			      T2D = VSUB(T2B, T2C);
+			      Tbo = VADD(T2B, T2C);
+			 }
+			 T2E = VFMA(LDK(KP923879532), T2A, VMUL(LDK(KP382683432), T2D));
+			 TeI = VSUB(Tbn, Tbo);
+			 T3m = VFNMS(LDK(KP923879532), T2D, VMUL(LDK(KP382683432), T2A));
+			 Tbp = VADD(Tbn, Tbo);
+		    }
+		    Tb6 = VSUB(Tb2, Tb5);
+		    Tbt = VSUB(Tbp, Tbs);
+		    {
+			 V T2q, T2F, TeT, TeU;
+			 T2q = VADD(T2i, T2p);
+			 T2F = VADD(T2x, T2E);
+			 T2G = VADD(T2q, T2F);
+			 T5X = VSUB(T2q, T2F);
+			 TeT = VSUB(Tb4, Tb3);
+			 TeU = VMUL(LDK(KP707106781), VSUB(TeI, TeH));
+			 TeV = VADD(TeT, TeU);
+			 Tgr = VSUB(TeU, TeT);
+		    }
+		    {
+			 V T3l, T3o, T7z, T7A;
+			 T3l = VSUB(T3h, T3k);
+			 T3o = VSUB(T3m, T3n);
+			 T3p = VADD(T3l, T3o);
+			 T5V = VSUB(T3o, T3l);
+			 T7z = VADD(T3k, T3h);
+			 T7A = VSUB(T2E, T2x);
+			 T7B = VADD(T7z, T7A);
+			 T95 = VSUB(T7A, T7z);
+		    }
+		    TeG = VSUB(Tb0, Tb1);
+		    TeJ = VMUL(LDK(KP707106781), VADD(TeH, TeI));
+		    TeK = VADD(TeG, TeJ);
+		    Tgt = VSUB(TeG, TeJ);
+		    {
+			 V T7o, T7p, Td6, Td7;
+			 T7o = VSUB(T2i, T2p);
+			 T7p = VADD(T3n, T3m);
+			 T7q = VADD(T7o, T7p);
+			 T97 = VSUB(T7o, T7p);
+			 Td6 = VADD(Tb2, Tb5);
+			 Td7 = VADD(Tbs, Tbp);
+			 Td8 = VADD(Td6, Td7);
+			 TdK = VSUB(Td6, Td7);
+		    }
+	       }
+	       {
+		    V Tbx, Tby, T3x, Tbz, T4z, TbA, TbB, T3E, TbC, T4w, T3M, Tf2, T4C, TbZ, T3T;
+		    V Tf3, T4B, TbW, T3A, T3D, Tf1, Tf4;
+		    {
+			 V T3v, T3w, T4x, T4y;
+			 T3v = LD(&(xi[WS(is, 127)]), ivs, &(xi[WS(is, 1)]));
+			 T3w = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 Tbx = VADD(T3v, T3w);
+			 T4x = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T4y = LD(&(xi[WS(is, 95)]), ivs, &(xi[WS(is, 1)]));
+			 Tby = VADD(T4x, T4y);
+			 T3x = VSUB(T3v, T3w);
+			 Tbz = VADD(Tbx, Tby);
+			 T4z = VSUB(T4x, T4y);
+		    }
+		    {
+			 V T3y, T3z, T3B, T3C;
+			 T3y = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T3z = LD(&(xi[WS(is, 79)]), ivs, &(xi[WS(is, 1)]));
+			 T3A = VSUB(T3y, T3z);
+			 TbA = VADD(T3y, T3z);
+			 T3B = LD(&(xi[WS(is, 111)]), ivs, &(xi[WS(is, 1)]));
+			 T3C = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T3D = VSUB(T3B, T3C);
+			 TbB = VADD(T3B, T3C);
+		    }
+		    T3E = VMUL(LDK(KP707106781), VADD(T3A, T3D));
+		    TbC = VADD(TbA, TbB);
+		    T4w = VMUL(LDK(KP707106781), VSUB(T3D, T3A));
+		    {
+			 V T3I, TbX, T3L, TbY;
+			 {
+			      V T3G, T3H, T3J, T3K;
+			      T3G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      T3H = LD(&(xi[WS(is, 71)]), ivs, &(xi[WS(is, 1)]));
+			      T3I = VSUB(T3G, T3H);
+			      TbX = VADD(T3G, T3H);
+			      T3J = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			      T3K = LD(&(xi[WS(is, 103)]), ivs, &(xi[WS(is, 1)]));
+			      T3L = VSUB(T3J, T3K);
+			      TbY = VADD(T3J, T3K);
+			 }
+			 T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
+			 Tf2 = VSUB(TbX, TbY);
+			 T4C = VFMA(LDK(KP382683432), T3I, VMUL(LDK(KP923879532), T3L));
+			 TbZ = VADD(TbX, TbY);
+		    }
+		    {
+			 V T3P, TbU, T3S, TbV;
+			 {
+			      V T3N, T3O, T3Q, T3R;
+			      T3N = LD(&(xi[WS(is, 119)]), ivs, &(xi[WS(is, 1)]));
+			      T3O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			      T3P = VSUB(T3N, T3O);
+			      TbU = VADD(T3N, T3O);
+			      T3Q = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			      T3R = LD(&(xi[WS(is, 87)]), ivs, &(xi[WS(is, 1)]));
+			      T3S = VSUB(T3Q, T3R);
+			      TbV = VADD(T3Q, T3R);
+			 }
+			 T3T = VFMA(LDK(KP923879532), T3P, VMUL(LDK(KP382683432), T3S));
+			 Tf3 = VSUB(TbU, TbV);
+			 T4B = VFNMS(LDK(KP923879532), T3S, VMUL(LDK(KP382683432), T3P));
+			 TbW = VADD(TbU, TbV);
+		    }
+		    TbD = VSUB(Tbz, TbC);
+		    Tc0 = VSUB(TbW, TbZ);
+		    {
+			 V T3F, T3U, Tfe, Tff;
+			 T3F = VADD(T3x, T3E);
+			 T3U = VADD(T3M, T3T);
+			 T3V = VADD(T3F, T3U);
+			 T61 = VSUB(T3F, T3U);
+			 Tfe = VSUB(TbB, TbA);
+			 Tff = VMUL(LDK(KP707106781), VSUB(Tf3, Tf2));
+			 Tfg = VADD(Tfe, Tff);
+			 TgB = VSUB(Tff, Tfe);
+		    }
+		    {
+			 V T4A, T4D, T7S, T7T;
+			 T4A = VSUB(T4w, T4z);
+			 T4D = VSUB(T4B, T4C);
+			 T4E = VADD(T4A, T4D);
+			 T65 = VSUB(T4D, T4A);
+			 T7S = VADD(T4z, T4w);
+			 T7T = VSUB(T3T, T3M);
+			 T7U = VADD(T7S, T7T);
+			 T9f = VSUB(T7T, T7S);
+		    }
+		    Tf1 = VSUB(Tbx, Tby);
+		    Tf4 = VMUL(LDK(KP707106781), VADD(Tf2, Tf3));
+		    Tf5 = VADD(Tf1, Tf4);
+		    Tgx = VSUB(Tf1, Tf4);
+		    {
+			 V T7H, T7I, Tdd, Tde;
+			 T7H = VSUB(T3x, T3E);
+			 T7I = VADD(T4C, T4B);
+			 T7J = VADD(T7H, T7I);
+			 T9b = VSUB(T7H, T7I);
+			 Tdd = VADD(Tbz, TbC);
+			 Tde = VADD(TbZ, TbW);
+			 Tdf = VADD(Tdd, Tde);
+			 TdN = VSUB(Tdd, Tde);
+		    }
+	       }
+	       {
+		    V Tu, Tee, TG, Tag, TL, Teh, TX, Tan, TB, Tef, TD, Taj, TS, Tei, TU;
+		    V Taq, Teg, Tej;
+		    {
+			 V Ts, Tt, Tae, TE, TF, Taf;
+			 Ts = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tt = LD(&(xi[WS(is, 68)]), ivs, &(xi[0]));
+			 Tae = VADD(Ts, Tt);
+			 TE = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 TF = LD(&(xi[WS(is, 100)]), ivs, &(xi[0]));
+			 Taf = VADD(TE, TF);
+			 Tu = VSUB(Ts, Tt);
+			 Tee = VSUB(Tae, Taf);
+			 TG = VSUB(TE, TF);
+			 Tag = VADD(Tae, Taf);
+		    }
+		    {
+			 V TJ, TK, Tal, TV, TW, Tam;
+			 TJ = LD(&(xi[WS(is, 124)]), ivs, &(xi[0]));
+			 TK = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tal = VADD(TJ, TK);
+			 TV = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 TW = LD(&(xi[WS(is, 92)]), ivs, &(xi[0]));
+			 Tam = VADD(TV, TW);
+			 TL = VSUB(TJ, TK);
+			 Teh = VSUB(Tal, Tam);
+			 TX = VSUB(TV, TW);
+			 Tan = VADD(Tal, Tam);
+		    }
+		    {
+			 V Tx, Tah, TA, Tai;
+			 {
+			      V Tv, Tw, Ty, Tz;
+			      Tv = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      Tw = LD(&(xi[WS(is, 84)]), ivs, &(xi[0]));
+			      Tx = VSUB(Tv, Tw);
+			      Tah = VADD(Tv, Tw);
+			      Ty = LD(&(xi[WS(is, 116)]), ivs, &(xi[0]));
+			      Tz = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			      TA = VSUB(Ty, Tz);
+			      Tai = VADD(Ty, Tz);
+			 }
+			 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
+			 Tef = VSUB(Tai, Tah);
+			 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
+			 Taj = VADD(Tah, Tai);
+		    }
+		    {
+			 V TO, Tao, TR, Tap;
+			 {
+			      V TM, TN, TP, TQ;
+			      TM = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      TN = LD(&(xi[WS(is, 76)]), ivs, &(xi[0]));
+			      TO = VSUB(TM, TN);
+			      Tao = VADD(TM, TN);
+			      TP = LD(&(xi[WS(is, 108)]), ivs, &(xi[0]));
+			      TQ = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			      TR = VSUB(TP, TQ);
+			      Tap = VADD(TP, TQ);
+			 }
+			 TS = VMUL(LDK(KP707106781), VADD(TO, TR));
+			 Tei = VSUB(Tap, Tao);
+			 TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
+			 Taq = VADD(Tao, Tap);
+		    }
+		    Td2 = VADD(Tag, Taj);
+		    Td3 = VADD(Tan, Taq);
+		    {
+			 V TC, TH, Tfr, Tfs;
+			 TC = VADD(Tu, TB);
+			 TH = VSUB(TD, TG);
+			 TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
+			 T4V = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
+			 Tfr = VFNMS(LDK(KP382683432), Tee, VMUL(LDK(KP923879532), Tef));
+			 Tfs = VFMA(LDK(KP382683432), Teh, VMUL(LDK(KP923879532), Tei));
+			 Tft = VADD(Tfr, Tfs);
+			 Tgg = VSUB(Tfs, Tfr);
+		    }
+		    {
+			 V TT, TY, T73, T74;
+			 TT = VADD(TL, TS);
+			 TY = VSUB(TU, TX);
+			 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
+			 T4W = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
+			 T73 = VSUB(TL, TS);
+			 T74 = VADD(TX, TU);
+			 T75 = VFNMS(LDK(KP555570233), T74, VMUL(LDK(KP831469612), T73));
+			 T86 = VFMA(LDK(KP555570233), T73, VMUL(LDK(KP831469612), T74));
+		    }
+		    Teg = VFMA(LDK(KP923879532), Tee, VMUL(LDK(KP382683432), Tef));
+		    Tej = VFNMS(LDK(KP382683432), Tei, VMUL(LDK(KP923879532), Teh));
+		    Tek = VADD(Teg, Tej);
+		    TgG = VSUB(Tej, Teg);
+		    {
+			 V T70, T71, Tak, Tar;
+			 T70 = VSUB(Tu, TB);
+			 T71 = VADD(TG, TD);
+			 T72 = VFMA(LDK(KP831469612), T70, VMUL(LDK(KP555570233), T71));
+			 T85 = VFNMS(LDK(KP555570233), T70, VMUL(LDK(KP831469612), T71));
+			 Tak = VSUB(Tag, Taj);
+			 Tar = VSUB(Tan, Taq);
+			 Tas = VMUL(LDK(KP707106781), VADD(Tak, Tar));
+			 Tcd = VMUL(LDK(KP707106781), VSUB(Tar, Tak));
+		    }
+	       }
+	       {
+		    V Tav, Tau, T1b, Taw, T1v, Tay, Tax, T18, Taz, T1w, T1j, Teo, T1z, TaD, T1q;
+		    V Ten, T1y, TaG, T14, T17, Tem, Tep;
+		    {
+			 V T19, T1a, T1t, T1u;
+			 T19 = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 T1a = LD(&(xi[WS(is, 98)]), ivs, &(xi[0]));
+			 Tav = VADD(T19, T1a);
+			 T1t = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T1u = LD(&(xi[WS(is, 66)]), ivs, &(xi[0]));
+			 Tau = VADD(T1t, T1u);
+			 T1b = VSUB(T19, T1a);
+			 Taw = VADD(Tau, Tav);
+			 T1v = VSUB(T1t, T1u);
+		    }
+		    {
+			 V T12, T13, T15, T16;
+			 T12 = LD(&(xi[WS(is, 114)]), ivs, &(xi[0]));
+			 T13 = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 T14 = VSUB(T12, T13);
+			 Tay = VADD(T12, T13);
+			 T15 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 T16 = LD(&(xi[WS(is, 82)]), ivs, &(xi[0]));
+			 T17 = VSUB(T15, T16);
+			 Tax = VADD(T15, T16);
+		    }
+		    T18 = VMUL(LDK(KP707106781), VSUB(T14, T17));
+		    Taz = VADD(Tax, Tay);
+		    T1w = VMUL(LDK(KP707106781), VADD(T17, T14));
+		    {
+			 V T1f, TaB, T1i, TaC;
+			 {
+			      V T1d, T1e, T1g, T1h;
+			      T1d = LD(&(xi[WS(is, 122)]), ivs, &(xi[0]));
+			      T1e = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      T1f = VSUB(T1d, T1e);
+			      TaB = VADD(T1d, T1e);
+			      T1g = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      T1h = LD(&(xi[WS(is, 90)]), ivs, &(xi[0]));
+			      T1i = VSUB(T1g, T1h);
+			      TaC = VADD(T1g, T1h);
+			 }
+			 T1j = VFNMS(LDK(KP923879532), T1i, VMUL(LDK(KP382683432), T1f));
+			 Teo = VSUB(TaB, TaC);
+			 T1z = VFMA(LDK(KP923879532), T1f, VMUL(LDK(KP382683432), T1i));
+			 TaD = VADD(TaB, TaC);
+		    }
+		    {
+			 V T1m, TaE, T1p, TaF;
+			 {
+			      V T1k, T1l, T1n, T1o;
+			      T1k = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      T1l = LD(&(xi[WS(is, 74)]), ivs, &(xi[0]));
+			      T1m = VSUB(T1k, T1l);
+			      TaE = VADD(T1k, T1l);
+			      T1n = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      T1o = LD(&(xi[WS(is, 106)]), ivs, &(xi[0]));
+			      T1p = VSUB(T1n, T1o);
+			      TaF = VADD(T1n, T1o);
+			 }
+			 T1q = VFMA(LDK(KP382683432), T1m, VMUL(LDK(KP923879532), T1p));
+			 Ten = VSUB(TaE, TaF);
+			 T1y = VFNMS(LDK(KP382683432), T1p, VMUL(LDK(KP923879532), T1m));
+			 TaG = VADD(TaE, TaF);
+		    }
+		    Tdp = VADD(Taw, Taz);
+		    Tdq = VADD(TaG, TaD);
+		    TdG = VSUB(Tdp, Tdq);
+		    Tem = VSUB(Tau, Tav);
+		    Tep = VMUL(LDK(KP707106781), VADD(Ten, Teo));
+		    Teq = VADD(Tem, Tep);
+		    Tgm = VSUB(Tem, Tep);
+		    {
+			 V Ter, Tes, T1c, T1r;
+			 Ter = VSUB(Tay, Tax);
+			 Tes = VMUL(LDK(KP707106781), VSUB(Teo, Ten));
+			 Tet = VADD(Ter, Tes);
+			 Tgl = VSUB(Tes, Ter);
+			 T1c = VSUB(T18, T1b);
+			 T1r = VSUB(T1j, T1q);
+			 T1s = VADD(T1c, T1r);
+			 T5P = VSUB(T1r, T1c);
+		    }
+		    {
+			 V T1x, T1A, T7b, T7c;
+			 T1x = VADD(T1v, T1w);
+			 T1A = VADD(T1y, T1z);
+			 T1B = VADD(T1x, T1A);
+			 T5Q = VSUB(T1x, T1A);
+			 T7b = VADD(T1b, T18);
+			 T7c = VSUB(T1z, T1y);
+			 T7d = VADD(T7b, T7c);
+			 T8Z = VSUB(T7c, T7b);
+		    }
+		    {
+			 V TaA, TaH, T78, T79;
+			 TaA = VSUB(Taw, Taz);
+			 TaH = VSUB(TaD, TaG);
+			 TaI = VFMA(LDK(KP923879532), TaA, VMUL(LDK(KP382683432), TaH));
+			 Tcf = VFNMS(LDK(KP382683432), TaA, VMUL(LDK(KP923879532), TaH));
+			 T78 = VSUB(T1v, T1w);
+			 T79 = VADD(T1q, T1j);
+			 T7a = VADD(T78, T79);
+			 T90 = VSUB(T78, T79);
+		    }
+	       }
+	       {
+		    V TaJ, TaK, T1F, TaL, T27, TaM, TaN, T1M, TaO, T24, T1U, Tew, T2a, TaV, T21;
+		    V Tex, T29, TaS, T1I, T1L, Tev, Tey;
+		    {
+			 V T1D, T1E, T25, T26;
+			 T1D = LD(&(xi[WS(is, 126)]), ivs, &(xi[0]));
+			 T1E = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 TaJ = VADD(T1D, T1E);
+			 T25 = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 T26 = LD(&(xi[WS(is, 94)]), ivs, &(xi[0]));
+			 TaK = VADD(T25, T26);
+			 T1F = VSUB(T1D, T1E);
+			 TaL = VADD(TaJ, TaK);
+			 T27 = VSUB(T25, T26);
+		    }
+		    {
+			 V T1G, T1H, T1J, T1K;
+			 T1G = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T1H = LD(&(xi[WS(is, 78)]), ivs, &(xi[0]));
+			 T1I = VSUB(T1G, T1H);
+			 TaM = VADD(T1G, T1H);
+			 T1J = LD(&(xi[WS(is, 110)]), ivs, &(xi[0]));
+			 T1K = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 T1L = VSUB(T1J, T1K);
+			 TaN = VADD(T1J, T1K);
+		    }
+		    T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
+		    TaO = VADD(TaM, TaN);
+		    T24 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
+		    {
+			 V T1Q, TaT, T1T, TaU;
+			 {
+			      V T1O, T1P, T1R, T1S;
+			      T1O = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      T1P = LD(&(xi[WS(is, 70)]), ivs, &(xi[0]));
+			      T1Q = VSUB(T1O, T1P);
+			      TaT = VADD(T1O, T1P);
+			      T1R = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      T1S = LD(&(xi[WS(is, 102)]), ivs, &(xi[0]));
+			      T1T = VSUB(T1R, T1S);
+			      TaU = VADD(T1R, T1S);
+			 }
+			 T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
+			 Tew = VSUB(TaT, TaU);
+			 T2a = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
+			 TaV = VADD(TaT, TaU);
+		    }
+		    {
+			 V T1X, TaQ, T20, TaR;
+			 {
+			      V T1V, T1W, T1Y, T1Z;
+			      T1V = LD(&(xi[WS(is, 118)]), ivs, &(xi[0]));
+			      T1W = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      T1X = VSUB(T1V, T1W);
+			      TaQ = VADD(T1V, T1W);
+			      T1Y = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      T1Z = LD(&(xi[WS(is, 86)]), ivs, &(xi[0]));
+			      T20 = VSUB(T1Y, T1Z);
+			      TaR = VADD(T1Y, T1Z);
+			 }
+			 T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
+			 Tex = VSUB(TaQ, TaR);
+			 T29 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
+			 TaS = VADD(TaQ, TaR);
+		    }
+		    Tdm = VADD(TaL, TaO);
+		    Tdn = VADD(TaV, TaS);
+		    TdH = VSUB(Tdm, Tdn);
+		    Tev = VSUB(TaJ, TaK);
+		    Tey = VMUL(LDK(KP707106781), VADD(Tew, Tex));
+		    Tez = VADD(Tev, Tey);
+		    Tgi = VSUB(Tev, Tey);
+		    {
+			 V TeA, TeB, T1N, T22;
+			 TeA = VSUB(TaN, TaM);
+			 TeB = VMUL(LDK(KP707106781), VSUB(Tex, Tew));
+			 TeC = VADD(TeA, TeB);
+			 Tgj = VSUB(TeB, TeA);
+			 T1N = VADD(T1F, T1M);
+			 T22 = VADD(T1U, T21);
+			 T23 = VADD(T1N, T22);
+			 T5N = VSUB(T1N, T22);
+		    }
+		    {
+			 V T28, T2b, T7i, T7j;
+			 T28 = VSUB(T24, T27);
+			 T2b = VSUB(T29, T2a);
+			 T2c = VADD(T28, T2b);
+			 T5M = VSUB(T2b, T28);
+			 T7i = VADD(T27, T24);
+			 T7j = VSUB(T21, T1U);
+			 T7k = VADD(T7i, T7j);
+			 T8X = VSUB(T7j, T7i);
+		    }
+		    {
+			 V TaP, TaW, T7f, T7g;
+			 TaP = VSUB(TaL, TaO);
+			 TaW = VSUB(TaS, TaV);
+			 TaX = VFNMS(LDK(KP382683432), TaW, VMUL(LDK(KP923879532), TaP));
+			 Tcg = VFMA(LDK(KP382683432), TaP, VMUL(LDK(KP923879532), TaW));
+			 T7f = VSUB(T1F, T1M);
+			 T7g = VADD(T2a, T29);
+			 T7h = VADD(T7f, T7g);
+			 T8W = VSUB(T7f, T7g);
+		    }
+	       }
+	       {
+		    V T2J, TeL, T2V, Tb9, T30, TeO, T3c, Tbg, T2Q, TeM, T2S, Tbc, T37, TeP, T39;
+		    V Tbj;
+		    {
+			 V T2H, T2I, Tb7, T2T, T2U, Tb8;
+			 T2H = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T2I = LD(&(xi[WS(is, 69)]), ivs, &(xi[WS(is, 1)]));
+			 Tb7 = VADD(T2H, T2I);
+			 T2T = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			 T2U = LD(&(xi[WS(is, 101)]), ivs, &(xi[WS(is, 1)]));
+			 Tb8 = VADD(T2T, T2U);
+			 T2J = VSUB(T2H, T2I);
+			 TeL = VSUB(Tb7, Tb8);
+			 T2V = VSUB(T2T, T2U);
+			 Tb9 = VADD(Tb7, Tb8);
+		    }
+		    {
+			 V T2Y, T2Z, Tbe, T3a, T3b, Tbf;
+			 T2Y = LD(&(xi[WS(is, 125)]), ivs, &(xi[WS(is, 1)]));
+			 T2Z = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			 Tbe = VADD(T2Y, T2Z);
+			 T3a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 T3b = LD(&(xi[WS(is, 93)]), ivs, &(xi[WS(is, 1)]));
+			 Tbf = VADD(T3a, T3b);
+			 T30 = VSUB(T2Y, T2Z);
+			 TeO = VSUB(Tbe, Tbf);
+			 T3c = VSUB(T3a, T3b);
+			 Tbg = VADD(Tbe, Tbf);
+		    }
+		    {
+			 V T2M, Tba, T2P, Tbb;
+			 {
+			      V T2K, T2L, T2N, T2O;
+			      T2K = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      T2L = LD(&(xi[WS(is, 85)]), ivs, &(xi[WS(is, 1)]));
+			      T2M = VSUB(T2K, T2L);
+			      Tba = VADD(T2K, T2L);
+			      T2N = LD(&(xi[WS(is, 117)]), ivs, &(xi[WS(is, 1)]));
+			      T2O = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      T2P = VSUB(T2N, T2O);
+			      Tbb = VADD(T2N, T2O);
+			 }
+			 T2Q = VMUL(LDK(KP707106781), VADD(T2M, T2P));
+			 TeM = VSUB(Tbb, Tba);
+			 T2S = VMUL(LDK(KP707106781), VSUB(T2P, T2M));
+			 Tbc = VADD(Tba, Tbb);
+		    }
+		    {
+			 V T33, Tbh, T36, Tbi;
+			 {
+			      V T31, T32, T34, T35;
+			      T31 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T32 = LD(&(xi[WS(is, 77)]), ivs, &(xi[WS(is, 1)]));
+			      T33 = VSUB(T31, T32);
+			      Tbh = VADD(T31, T32);
+			      T34 = LD(&(xi[WS(is, 109)]), ivs, &(xi[WS(is, 1)]));
+			      T35 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      T36 = VSUB(T34, T35);
+			      Tbi = VADD(T34, T35);
+			 }
+			 T37 = VMUL(LDK(KP707106781), VADD(T33, T36));
+			 TeP = VSUB(Tbi, Tbh);
+			 T39 = VMUL(LDK(KP707106781), VSUB(T36, T33));
+			 Tbj = VADD(Tbh, Tbi);
+		    }
+		    {
+			 V Tbd, Tbk, TeN, TeQ;
+			 Tbd = VSUB(Tb9, Tbc);
+			 Tbk = VSUB(Tbg, Tbj);
+			 Tbl = VMUL(LDK(KP707106781), VADD(Tbd, Tbk));
+			 Tbu = VMUL(LDK(KP707106781), VSUB(Tbk, Tbd));
+			 {
+			      V Td9, Tda, TeW, TeX;
+			      Td9 = VADD(Tb9, Tbc);
+			      Tda = VADD(Tbg, Tbj);
+			      Tdb = VADD(Td9, Tda);
+			      TdL = VSUB(Tda, Td9);
+			      TeW = VFNMS(LDK(KP382683432), TeL, VMUL(LDK(KP923879532), TeM));
+			      TeX = VFMA(LDK(KP382683432), TeO, VMUL(LDK(KP923879532), TeP));
+			      TeY = VADD(TeW, TeX);
+			      Tgu = VSUB(TeX, TeW);
+			 }
+			 TeN = VFMA(LDK(KP923879532), TeL, VMUL(LDK(KP382683432), TeM));
+			 TeQ = VFNMS(LDK(KP382683432), TeP, VMUL(LDK(KP923879532), TeO));
+			 TeR = VADD(TeN, TeQ);
+			 Tgq = VSUB(TeQ, TeN);
+			 {
+			      V T7t, T7C, T7w, T7D;
+			      {
+				   V T7r, T7s, T7u, T7v;
+				   T7r = VSUB(T2J, T2Q);
+				   T7s = VADD(T2V, T2S);
+				   T7t = VFMA(LDK(KP831469612), T7r, VMUL(LDK(KP555570233), T7s));
+				   T7C = VFNMS(LDK(KP555570233), T7r, VMUL(LDK(KP831469612), T7s));
+				   T7u = VSUB(T30, T37);
+				   T7v = VADD(T3c, T39);
+				   T7w = VFNMS(LDK(KP555570233), T7v, VMUL(LDK(KP831469612), T7u));
+				   T7D = VFMA(LDK(KP555570233), T7u, VMUL(LDK(KP831469612), T7v));
+			      }
+			      T7x = VADD(T7t, T7w);
+			      T98 = VSUB(T7D, T7C);
+			      T7E = VADD(T7C, T7D);
+			      T94 = VSUB(T7w, T7t);
+			 }
+			 {
+			      V T2X, T3q, T3e, T3r;
+			      {
+				   V T2R, T2W, T38, T3d;
+				   T2R = VADD(T2J, T2Q);
+				   T2W = VSUB(T2S, T2V);
+				   T2X = VFMA(LDK(KP980785280), T2R, VMUL(LDK(KP195090322), T2W));
+				   T3q = VFNMS(LDK(KP195090322), T2R, VMUL(LDK(KP980785280), T2W));
+				   T38 = VADD(T30, T37);
+				   T3d = VSUB(T39, T3c);
+				   T3e = VFNMS(LDK(KP195090322), T3d, VMUL(LDK(KP980785280), T38));
+				   T3r = VFMA(LDK(KP195090322), T38, VMUL(LDK(KP980785280), T3d));
+			      }
+			      T3f = VADD(T2X, T3e);
+			      T5Y = VSUB(T3r, T3q);
+			      T3s = VADD(T3q, T3r);
+			      T5U = VSUB(T3e, T2X);
+			 }
+		    }
+	       }
+	       {
+		    V T3Y, Tf6, T4a, TbG, T4f, Tf9, T4r, TbN, T45, Tf7, T47, TbJ, T4m, Tfa, T4o;
+		    V TbQ;
+		    {
+			 V T3W, T3X, TbE, T48, T49, TbF;
+			 T3W = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T3X = LD(&(xi[WS(is, 67)]), ivs, &(xi[WS(is, 1)]));
+			 TbE = VADD(T3W, T3X);
+			 T48 = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			 T49 = LD(&(xi[WS(is, 99)]), ivs, &(xi[WS(is, 1)]));
+			 TbF = VADD(T48, T49);
+			 T3Y = VSUB(T3W, T3X);
+			 Tf6 = VSUB(TbE, TbF);
+			 T4a = VSUB(T48, T49);
+			 TbG = VADD(TbE, TbF);
+		    }
+		    {
+			 V T4d, T4e, TbL, T4p, T4q, TbM;
+			 T4d = LD(&(xi[WS(is, 123)]), ivs, &(xi[WS(is, 1)]));
+			 T4e = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			 TbL = VADD(T4d, T4e);
+			 T4p = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 T4q = LD(&(xi[WS(is, 91)]), ivs, &(xi[WS(is, 1)]));
+			 TbM = VADD(T4p, T4q);
+			 T4f = VSUB(T4d, T4e);
+			 Tf9 = VSUB(TbL, TbM);
+			 T4r = VSUB(T4p, T4q);
+			 TbN = VADD(TbL, TbM);
+		    }
+		    {
+			 V T41, TbH, T44, TbI;
+			 {
+			      V T3Z, T40, T42, T43;
+			      T3Z = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T40 = LD(&(xi[WS(is, 83)]), ivs, &(xi[WS(is, 1)]));
+			      T41 = VSUB(T3Z, T40);
+			      TbH = VADD(T3Z, T40);
+			      T42 = LD(&(xi[WS(is, 115)]), ivs, &(xi[WS(is, 1)]));
+			      T43 = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T44 = VSUB(T42, T43);
+			      TbI = VADD(T42, T43);
+			 }
+			 T45 = VMUL(LDK(KP707106781), VADD(T41, T44));
+			 Tf7 = VSUB(TbI, TbH);
+			 T47 = VMUL(LDK(KP707106781), VSUB(T44, T41));
+			 TbJ = VADD(TbH, TbI);
+		    }
+		    {
+			 V T4i, TbO, T4l, TbP;
+			 {
+			      V T4g, T4h, T4j, T4k;
+			      T4g = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T4h = LD(&(xi[WS(is, 75)]), ivs, &(xi[WS(is, 1)]));
+			      T4i = VSUB(T4g, T4h);
+			      TbO = VADD(T4g, T4h);
+			      T4j = LD(&(xi[WS(is, 107)]), ivs, &(xi[WS(is, 1)]));
+			      T4k = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T4l = VSUB(T4j, T4k);
+			      TbP = VADD(T4j, T4k);
+			 }
+			 T4m = VMUL(LDK(KP707106781), VADD(T4i, T4l));
+			 Tfa = VSUB(TbP, TbO);
+			 T4o = VMUL(LDK(KP707106781), VSUB(T4l, T4i));
+			 TbQ = VADD(TbO, TbP);
+		    }
+		    {
+			 V TbK, TbR, Tf8, Tfb;
+			 TbK = VSUB(TbG, TbJ);
+			 TbR = VSUB(TbN, TbQ);
+			 TbS = VMUL(LDK(KP707106781), VADD(TbK, TbR));
+			 Tc1 = VMUL(LDK(KP707106781), VSUB(TbR, TbK));
+			 {
+			      V Tdg, Tdh, Tfh, Tfi;
+			      Tdg = VADD(TbG, TbJ);
+			      Tdh = VADD(TbN, TbQ);
+			      Tdi = VADD(Tdg, Tdh);
+			      TdO = VSUB(Tdh, Tdg);
+			      Tfh = VFNMS(LDK(KP382683432), Tf6, VMUL(LDK(KP923879532), Tf7));
+			      Tfi = VFMA(LDK(KP382683432), Tf9, VMUL(LDK(KP923879532), Tfa));
+			      Tfj = VADD(Tfh, Tfi);
+			      Tgy = VSUB(Tfi, Tfh);
+			 }
+			 Tf8 = VFMA(LDK(KP923879532), Tf6, VMUL(LDK(KP382683432), Tf7));
+			 Tfb = VFNMS(LDK(KP382683432), Tfa, VMUL(LDK(KP923879532), Tf9));
+			 Tfc = VADD(Tf8, Tfb);
+			 TgA = VSUB(Tfb, Tf8);
+			 {
+			      V T7M, T7V, T7P, T7W;
+			      {
+				   V T7K, T7L, T7N, T7O;
+				   T7K = VSUB(T3Y, T45);
+				   T7L = VADD(T4a, T47);
+				   T7M = VFMA(LDK(KP831469612), T7K, VMUL(LDK(KP555570233), T7L));
+				   T7V = VFNMS(LDK(KP555570233), T7K, VMUL(LDK(KP831469612), T7L));
+				   T7N = VSUB(T4f, T4m);
+				   T7O = VADD(T4r, T4o);
+				   T7P = VFNMS(LDK(KP555570233), T7O, VMUL(LDK(KP831469612), T7N));
+				   T7W = VFMA(LDK(KP555570233), T7N, VMUL(LDK(KP831469612), T7O));
+			      }
+			      T7Q = VADD(T7M, T7P);
+			      T9e = VSUB(T7P, T7M);
+			      T7X = VADD(T7V, T7W);
+			      T9c = VSUB(T7W, T7V);
+			 }
+			 {
+			      V T4c, T4F, T4t, T4G;
+			      {
+				   V T46, T4b, T4n, T4s;
+				   T46 = VADD(T3Y, T45);
+				   T4b = VSUB(T47, T4a);
+				   T4c = VFMA(LDK(KP980785280), T46, VMUL(LDK(KP195090322), T4b));
+				   T4F = VFNMS(LDK(KP195090322), T46, VMUL(LDK(KP980785280), T4b));
+				   T4n = VADD(T4f, T4m);
+				   T4s = VSUB(T4o, T4r);
+				   T4t = VFNMS(LDK(KP195090322), T4s, VMUL(LDK(KP980785280), T4n));
+				   T4G = VFMA(LDK(KP195090322), T4n, VMUL(LDK(KP980785280), T4s));
+			      }
+			      T4u = VADD(T4c, T4t);
+			      T64 = VSUB(T4t, T4c);
+			      T4H = VADD(T4F, T4G);
+			      T62 = VSUB(T4G, T4F);
+			 }
+		    }
+	       }
+	       {
+		    V Td5, Tdx, TdC, TdE, Tdk, Tdt, Tds, Tdy, Tdz, TdD;
+		    {
+			 V Td1, Td4, TdA, TdB;
+			 Td1 = VADD(TcZ, Td0);
+			 Td4 = VADD(Td2, Td3);
+			 Td5 = VSUB(Td1, Td4);
+			 Tdx = VADD(Td1, Td4);
+			 TdA = VADD(Td8, Tdb);
+			 TdB = VADD(Tdf, Tdi);
+			 TdC = VADD(TdA, TdB);
+			 TdE = VBYI(VSUB(TdB, TdA));
+		    }
+		    {
+			 V Tdc, Tdj, Tdo, Tdr;
+			 Tdc = VSUB(Td8, Tdb);
+			 Tdj = VSUB(Tdf, Tdi);
+			 Tdk = VMUL(LDK(KP707106781), VADD(Tdc, Tdj));
+			 Tdt = VMUL(LDK(KP707106781), VSUB(Tdj, Tdc));
+			 Tdo = VADD(Tdm, Tdn);
+			 Tdr = VADD(Tdp, Tdq);
+			 Tds = VSUB(Tdo, Tdr);
+			 Tdy = VADD(Tdr, Tdo);
+		    }
+		    Tdz = VADD(Tdx, Tdy);
+		    ST(&(xo[WS(os, 64)]), VSUB(Tdz, TdC), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(Tdz, TdC), ovs, &(xo[0]));
+		    TdD = VSUB(Tdx, Tdy);
+		    ST(&(xo[WS(os, 96)]), VSUB(TdD, TdE), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 32)]), VADD(TdD, TdE), ovs, &(xo[0]));
+		    {
+			 V Tdl, Tdu, Tdv, Tdw;
+			 Tdl = VADD(Td5, Tdk);
+			 Tdu = VBYI(VADD(Tds, Tdt));
+			 ST(&(xo[WS(os, 112)]), VSUB(Tdl, Tdu), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 16)]), VADD(Tdl, Tdu), ovs, &(xo[0]));
+			 Tdv = VSUB(Td5, Tdk);
+			 Tdw = VBYI(VSUB(Tdt, Tds));
+			 ST(&(xo[WS(os, 80)]), VSUB(Tdv, Tdw), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 48)]), VADD(Tdv, Tdw), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TdJ, Te4, TdX, Te5, TdQ, Te1, TdU, Te2;
+		    {
+			 V TdF, TdI, TdV, TdW;
+			 TdF = VSUB(TcZ, Td0);
+			 TdI = VMUL(LDK(KP707106781), VADD(TdG, TdH));
+			 TdJ = VADD(TdF, TdI);
+			 Te4 = VSUB(TdF, TdI);
+			 TdV = VFNMS(LDK(KP382683432), TdK, VMUL(LDK(KP923879532), TdL));
+			 TdW = VFMA(LDK(KP382683432), TdN, VMUL(LDK(KP923879532), TdO));
+			 TdX = VADD(TdV, TdW);
+			 Te5 = VSUB(TdW, TdV);
+		    }
+		    {
+			 V TdM, TdP, TdS, TdT;
+			 TdM = VFMA(LDK(KP923879532), TdK, VMUL(LDK(KP382683432), TdL));
+			 TdP = VFNMS(LDK(KP382683432), TdO, VMUL(LDK(KP923879532), TdN));
+			 TdQ = VADD(TdM, TdP);
+			 Te1 = VSUB(TdP, TdM);
+			 TdS = VSUB(Td3, Td2);
+			 TdT = VMUL(LDK(KP707106781), VSUB(TdH, TdG));
+			 TdU = VADD(TdS, TdT);
+			 Te2 = VSUB(TdT, TdS);
+		    }
+		    {
+			 V TdR, TdY, Te7, Te8;
+			 TdR = VADD(TdJ, TdQ);
+			 TdY = VBYI(VADD(TdU, TdX));
+			 ST(&(xo[WS(os, 120)]), VSUB(TdR, TdY), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 8)]), VADD(TdR, TdY), ovs, &(xo[0]));
+			 Te7 = VBYI(VADD(Te2, Te1));
+			 Te8 = VADD(Te4, Te5);
+			 ST(&(xo[WS(os, 24)]), VADD(Te7, Te8), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 104)]), VSUB(Te8, Te7), ovs, &(xo[0]));
+		    }
+		    {
+			 V TdZ, Te0, Te3, Te6;
+			 TdZ = VSUB(TdJ, TdQ);
+			 Te0 = VBYI(VSUB(TdX, TdU));
+			 ST(&(xo[WS(os, 72)]), VSUB(TdZ, Te0), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 56)]), VADD(TdZ, Te0), ovs, &(xo[0]));
+			 Te3 = VBYI(VSUB(Te1, Te2));
+			 Te6 = VSUB(Te4, Te5);
+			 ST(&(xo[WS(os, 40)]), VADD(Te3, Te6), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 88)]), VSUB(Te6, Te3), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TaZ, Tcs, Tci, Tcq, Tc4, Tct, Tcl, Tcp;
+		    {
+			 V Tat, TaY, Tce, Tch;
+			 Tat = VADD(Tad, Tas);
+			 TaY = VADD(TaI, TaX);
+			 TaZ = VADD(Tat, TaY);
+			 Tcs = VSUB(Tat, TaY);
+			 Tce = VADD(Tcc, Tcd);
+			 Tch = VADD(Tcf, Tcg);
+			 Tci = VADD(Tce, Tch);
+			 Tcq = VSUB(Tch, Tce);
+			 {
+			      V Tbw, Tcj, Tc3, Tck;
+			      {
+				   V Tbm, Tbv, TbT, Tc2;
+				   Tbm = VADD(Tb6, Tbl);
+				   Tbv = VADD(Tbt, Tbu);
+				   Tbw = VFMA(LDK(KP980785280), Tbm, VMUL(LDK(KP195090322), Tbv));
+				   Tcj = VFNMS(LDK(KP195090322), Tbm, VMUL(LDK(KP980785280), Tbv));
+				   TbT = VADD(TbD, TbS);
+				   Tc2 = VADD(Tc0, Tc1);
+				   Tc3 = VFNMS(LDK(KP195090322), Tc2, VMUL(LDK(KP980785280), TbT));
+				   Tck = VFMA(LDK(KP195090322), TbT, VMUL(LDK(KP980785280), Tc2));
+			      }
+			      Tc4 = VADD(Tbw, Tc3);
+			      Tct = VSUB(Tck, Tcj);
+			      Tcl = VADD(Tcj, Tck);
+			      Tcp = VSUB(Tc3, Tbw);
+			 }
+		    }
+		    {
+			 V Tc5, Tcm, Tcv, Tcw;
+			 Tc5 = VADD(TaZ, Tc4);
+			 Tcm = VBYI(VADD(Tci, Tcl));
+			 ST(&(xo[WS(os, 124)]), VSUB(Tc5, Tcm), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 4)]), VADD(Tc5, Tcm), ovs, &(xo[0]));
+			 Tcv = VBYI(VADD(Tcq, Tcp));
+			 Tcw = VADD(Tcs, Tct);
+			 ST(&(xo[WS(os, 28)]), VADD(Tcv, Tcw), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 100)]), VSUB(Tcw, Tcv), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tcn, Tco, Tcr, Tcu;
+			 Tcn = VSUB(TaZ, Tc4);
+			 Tco = VBYI(VSUB(Tcl, Tci));
+			 ST(&(xo[WS(os, 68)]), VSUB(Tcn, Tco), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 60)]), VADD(Tcn, Tco), ovs, &(xo[0]));
+			 Tcr = VBYI(VSUB(Tcp, Tcq));
+			 Tcu = VSUB(Tcs, Tct);
+			 ST(&(xo[WS(os, 36)]), VADD(Tcr, Tcu), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 92)]), VSUB(Tcu, Tcr), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V Tcz, TcU, TcK, TcS, TcG, TcV, TcN, TcR;
+		    {
+			 V Tcx, Tcy, TcI, TcJ;
+			 Tcx = VSUB(Tad, Tas);
+			 Tcy = VSUB(Tcg, Tcf);
+			 Tcz = VADD(Tcx, Tcy);
+			 TcU = VSUB(Tcx, Tcy);
+			 TcI = VSUB(Tcd, Tcc);
+			 TcJ = VSUB(TaX, TaI);
+			 TcK = VADD(TcI, TcJ);
+			 TcS = VSUB(TcJ, TcI);
+			 {
+			      V TcC, TcL, TcF, TcM;
+			      {
+				   V TcA, TcB, TcD, TcE;
+				   TcA = VSUB(Tb6, Tbl);
+				   TcB = VSUB(Tbu, Tbt);
+				   TcC = VFMA(LDK(KP831469612), TcA, VMUL(LDK(KP555570233), TcB));
+				   TcL = VFNMS(LDK(KP555570233), TcA, VMUL(LDK(KP831469612), TcB));
+				   TcD = VSUB(TbD, TbS);
+				   TcE = VSUB(Tc1, Tc0);
+				   TcF = VFNMS(LDK(KP555570233), TcE, VMUL(LDK(KP831469612), TcD));
+				   TcM = VFMA(LDK(KP555570233), TcD, VMUL(LDK(KP831469612), TcE));
+			      }
+			      TcG = VADD(TcC, TcF);
+			      TcV = VSUB(TcM, TcL);
+			      TcN = VADD(TcL, TcM);
+			      TcR = VSUB(TcF, TcC);
+			 }
+		    }
+		    {
+			 V TcH, TcO, TcX, TcY;
+			 TcH = VADD(Tcz, TcG);
+			 TcO = VBYI(VADD(TcK, TcN));
+			 ST(&(xo[WS(os, 116)]), VSUB(TcH, TcO), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 12)]), VADD(TcH, TcO), ovs, &(xo[0]));
+			 TcX = VBYI(VADD(TcS, TcR));
+			 TcY = VADD(TcU, TcV);
+			 ST(&(xo[WS(os, 20)]), VADD(TcX, TcY), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 108)]), VSUB(TcY, TcX), ovs, &(xo[0]));
+		    }
+		    {
+			 V TcP, TcQ, TcT, TcW;
+			 TcP = VSUB(Tcz, TcG);
+			 TcQ = VBYI(VSUB(TcN, TcK));
+			 ST(&(xo[WS(os, 76)]), VSUB(TcP, TcQ), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 52)]), VADD(TcP, TcQ), ovs, &(xo[0]));
+			 TcT = VBYI(VSUB(TcR, TcS));
+			 TcW = VSUB(TcU, TcV);
+			 ST(&(xo[WS(os, 44)]), VADD(TcT, TcW), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 84)]), VSUB(TcW, TcT), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V TeF, Tg8, TfI, Tg0, Tfy, Tga, TfG, TfP, Tfm, TfJ, TfB, TfF, TfW, Tgb, Tg3;
+		    V Tg7;
+		    {
+			 V Tel, TfY, TeE, TfZ, Teu, TeD;
+			 Tel = VADD(Ted, Tek);
+			 TfY = VSUB(Tft, Tfq);
+			 Teu = VFMA(LDK(KP980785280), Teq, VMUL(LDK(KP195090322), Tet));
+			 TeD = VFNMS(LDK(KP195090322), TeC, VMUL(LDK(KP980785280), Tez));
+			 TeE = VADD(Teu, TeD);
+			 TfZ = VSUB(TeD, Teu);
+			 TeF = VADD(Tel, TeE);
+			 Tg8 = VSUB(TfZ, TfY);
+			 TfI = VSUB(Tel, TeE);
+			 Tg0 = VADD(TfY, TfZ);
+		    }
+		    {
+			 V Tfu, TfN, Tfx, TfO, Tfv, Tfw;
+			 Tfu = VADD(Tfq, Tft);
+			 TfN = VSUB(Ted, Tek);
+			 Tfv = VFNMS(LDK(KP195090322), Teq, VMUL(LDK(KP980785280), Tet));
+			 Tfw = VFMA(LDK(KP195090322), Tez, VMUL(LDK(KP980785280), TeC));
+			 Tfx = VADD(Tfv, Tfw);
+			 TfO = VSUB(Tfw, Tfv);
+			 Tfy = VADD(Tfu, Tfx);
+			 Tga = VSUB(TfN, TfO);
+			 TfG = VSUB(Tfx, Tfu);
+			 TfP = VADD(TfN, TfO);
+		    }
+		    {
+			 V Tf0, Tfz, Tfl, TfA;
+			 {
+			      V TeS, TeZ, Tfd, Tfk;
+			      TeS = VADD(TeK, TeR);
+			      TeZ = VADD(TeV, TeY);
+			      Tf0 = VFMA(LDK(KP995184726), TeS, VMUL(LDK(KP098017140), TeZ));
+			      Tfz = VFNMS(LDK(KP098017140), TeS, VMUL(LDK(KP995184726), TeZ));
+			      Tfd = VADD(Tf5, Tfc);
+			      Tfk = VADD(Tfg, Tfj);
+			      Tfl = VFNMS(LDK(KP098017140), Tfk, VMUL(LDK(KP995184726), Tfd));
+			      TfA = VFMA(LDK(KP098017140), Tfd, VMUL(LDK(KP995184726), Tfk));
+			 }
+			 Tfm = VADD(Tf0, Tfl);
+			 TfJ = VSUB(TfA, Tfz);
+			 TfB = VADD(Tfz, TfA);
+			 TfF = VSUB(Tfl, Tf0);
+		    }
+		    {
+			 V TfS, Tg1, TfV, Tg2;
+			 {
+			      V TfQ, TfR, TfT, TfU;
+			      TfQ = VSUB(TeK, TeR);
+			      TfR = VSUB(TeY, TeV);
+			      TfS = VFMA(LDK(KP773010453), TfQ, VMUL(LDK(KP634393284), TfR));
+			      Tg1 = VFNMS(LDK(KP634393284), TfQ, VMUL(LDK(KP773010453), TfR));
+			      TfT = VSUB(Tf5, Tfc);
+			      TfU = VSUB(Tfj, Tfg);
+			      TfV = VFNMS(LDK(KP634393284), TfU, VMUL(LDK(KP773010453), TfT));
+			      Tg2 = VFMA(LDK(KP634393284), TfT, VMUL(LDK(KP773010453), TfU));
+			 }
+			 TfW = VADD(TfS, TfV);
+			 Tgb = VSUB(Tg2, Tg1);
+			 Tg3 = VADD(Tg1, Tg2);
+			 Tg7 = VSUB(TfV, TfS);
+		    }
+		    {
+			 V Tfn, TfC, Tg9, Tgc;
+			 Tfn = VADD(TeF, Tfm);
+			 TfC = VBYI(VADD(Tfy, TfB));
+			 ST(&(xo[WS(os, 126)]), VSUB(Tfn, TfC), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 2)]), VADD(Tfn, TfC), ovs, &(xo[0]));
+			 Tg9 = VBYI(VSUB(Tg7, Tg8));
+			 Tgc = VSUB(Tga, Tgb);
+			 ST(&(xo[WS(os, 46)]), VADD(Tg9, Tgc), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 82)]), VSUB(Tgc, Tg9), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tgd, Tge, TfD, TfE;
+			 Tgd = VBYI(VADD(Tg8, Tg7));
+			 Tge = VADD(Tga, Tgb);
+			 ST(&(xo[WS(os, 18)]), VADD(Tgd, Tge), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 110)]), VSUB(Tge, Tgd), ovs, &(xo[0]));
+			 TfD = VSUB(TeF, Tfm);
+			 TfE = VBYI(VSUB(TfB, Tfy));
+			 ST(&(xo[WS(os, 66)]), VSUB(TfD, TfE), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 62)]), VADD(TfD, TfE), ovs, &(xo[0]));
+		    }
+		    {
+			 V TfH, TfK, TfX, Tg4;
+			 TfH = VBYI(VSUB(TfF, TfG));
+			 TfK = VSUB(TfI, TfJ);
+			 ST(&(xo[WS(os, 34)]), VADD(TfH, TfK), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 94)]), VSUB(TfK, TfH), ovs, &(xo[0]));
+			 TfX = VADD(TfP, TfW);
+			 Tg4 = VBYI(VADD(Tg0, Tg3));
+			 ST(&(xo[WS(os, 114)]), VSUB(TfX, Tg4), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 14)]), VADD(TfX, Tg4), ovs, &(xo[0]));
+		    }
+		    {
+			 V Tg5, Tg6, TfL, TfM;
+			 Tg5 = VSUB(TfP, TfW);
+			 Tg6 = VBYI(VSUB(Tg3, Tg0));
+			 ST(&(xo[WS(os, 78)]), VSUB(Tg5, Tg6), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 50)]), VADD(Tg5, Tg6), ovs, &(xo[0]));
+			 TfL = VBYI(VADD(TfG, TfF));
+			 TfM = VADD(TfI, TfJ);
+			 ST(&(xo[WS(os, 30)]), VADD(TfL, TfM), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 98)]), VSUB(TfM, TfL), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V Tgp, Thm, TgW, The, TgM, Tho, TgU, Th3, TgE, TgX, TgP, TgT, Tha, Thp, Thh;
+		    V Thl;
+		    {
+			 V Tgh, Thc, Tgo, Thd, Tgk, Tgn;
+			 Tgh = VSUB(Tgf, Tgg);
+			 Thc = VADD(TgH, TgG);
+			 Tgk = VFMA(LDK(KP555570233), Tgi, VMUL(LDK(KP831469612), Tgj));
+			 Tgn = VFNMS(LDK(KP555570233), Tgm, VMUL(LDK(KP831469612), Tgl));
+			 Tgo = VSUB(Tgk, Tgn);
+			 Thd = VADD(Tgn, Tgk);
+			 Tgp = VADD(Tgh, Tgo);
+			 Thm = VSUB(Thd, Thc);
+			 TgW = VSUB(Tgh, Tgo);
+			 The = VADD(Thc, Thd);
+		    }
+		    {
+			 V TgI, Th1, TgL, Th2, TgJ, TgK;
+			 TgI = VSUB(TgG, TgH);
+			 Th1 = VADD(Tgf, Tgg);
+			 TgJ = VFNMS(LDK(KP555570233), Tgj, VMUL(LDK(KP831469612), Tgi));
+			 TgK = VFMA(LDK(KP831469612), Tgm, VMUL(LDK(KP555570233), Tgl));
+			 TgL = VSUB(TgJ, TgK);
+			 Th2 = VADD(TgK, TgJ);
+			 TgM = VADD(TgI, TgL);
+			 Tho = VSUB(Th1, Th2);
+			 TgU = VSUB(TgL, TgI);
+			 Th3 = VADD(Th1, Th2);
+		    }
+		    {
+			 V Tgw, TgN, TgD, TgO;
+			 {
+			      V Tgs, Tgv, Tgz, TgC;
+			      Tgs = VSUB(Tgq, Tgr);
+			      Tgv = VSUB(Tgt, Tgu);
+			      Tgw = VFMA(LDK(KP471396736), Tgs, VMUL(LDK(KP881921264), Tgv));
+			      TgN = VFNMS(LDK(KP471396736), Tgv, VMUL(LDK(KP881921264), Tgs));
+			      Tgz = VSUB(Tgx, Tgy);
+			      TgC = VSUB(TgA, TgB);
+			      TgD = VFNMS(LDK(KP471396736), TgC, VMUL(LDK(KP881921264), Tgz));
+			      TgO = VFMA(LDK(KP881921264), TgC, VMUL(LDK(KP471396736), Tgz));
+			 }
+			 TgE = VADD(Tgw, TgD);
+			 TgX = VSUB(TgO, TgN);
+			 TgP = VADD(TgN, TgO);
+			 TgT = VSUB(TgD, Tgw);
+		    }
+		    {
+			 V Th6, Thf, Th9, Thg;
+			 {
+			      V Th4, Th5, Th7, Th8;
+			      Th4 = VADD(Tgr, Tgq);
+			      Th5 = VADD(Tgt, Tgu);
+			      Th6 = VFMA(LDK(KP290284677), Th4, VMUL(LDK(KP956940335), Th5));
+			      Thf = VFNMS(LDK(KP290284677), Th5, VMUL(LDK(KP956940335), Th4));
+			      Th7 = VADD(Tgx, Tgy);
+			      Th8 = VADD(TgB, TgA);
+			      Th9 = VFNMS(LDK(KP290284677), Th8, VMUL(LDK(KP956940335), Th7));
+			      Thg = VFMA(LDK(KP956940335), Th8, VMUL(LDK(KP290284677), Th7));
+			 }
+			 Tha = VADD(Th6, Th9);
+			 Thp = VSUB(Thg, Thf);
+			 Thh = VADD(Thf, Thg);
+			 Thl = VSUB(Th9, Th6);
+		    }
+		    {
+			 V TgF, TgQ, Thn, Thq;
+			 TgF = VADD(Tgp, TgE);
+			 TgQ = VBYI(VADD(TgM, TgP));
+			 ST(&(xo[WS(os, 118)]), VSUB(TgF, TgQ), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 10)]), VADD(TgF, TgQ), ovs, &(xo[0]));
+			 Thn = VBYI(VSUB(Thl, Thm));
+			 Thq = VSUB(Tho, Thp);
+			 ST(&(xo[WS(os, 38)]), VADD(Thn, Thq), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 90)]), VSUB(Thq, Thn), ovs, &(xo[0]));
+		    }
+		    {
+			 V Thr, Ths, TgR, TgS;
+			 Thr = VBYI(VADD(Thm, Thl));
+			 Ths = VADD(Tho, Thp);
+			 ST(&(xo[WS(os, 26)]), VADD(Thr, Ths), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 102)]), VSUB(Ths, Thr), ovs, &(xo[0]));
+			 TgR = VSUB(Tgp, TgE);
+			 TgS = VBYI(VSUB(TgP, TgM));
+			 ST(&(xo[WS(os, 74)]), VSUB(TgR, TgS), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 54)]), VADD(TgR, TgS), ovs, &(xo[0]));
+		    }
+		    {
+			 V TgV, TgY, Thb, Thi;
+			 TgV = VBYI(VSUB(TgT, TgU));
+			 TgY = VSUB(TgW, TgX);
+			 ST(&(xo[WS(os, 42)]), VADD(TgV, TgY), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 86)]), VSUB(TgY, TgV), ovs, &(xo[0]));
+			 Thb = VADD(Th3, Tha);
+			 Thi = VBYI(VADD(The, Thh));
+			 ST(&(xo[WS(os, 122)]), VSUB(Thb, Thi), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 6)]), VADD(Thb, Thi), ovs, &(xo[0]));
+		    }
+		    {
+			 V Thj, Thk, TgZ, Th0;
+			 Thj = VSUB(Th3, Tha);
+			 Thk = VBYI(VSUB(Thh, The));
+			 ST(&(xo[WS(os, 70)]), VSUB(Thj, Thk), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 58)]), VADD(Thj, Thk), ovs, &(xo[0]));
+			 TgZ = VBYI(VADD(TgU, TgT));
+			 Th0 = VADD(TgW, TgX);
+			 ST(&(xo[WS(os, 22)]), VADD(TgZ, Th0), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 106)]), VSUB(Th0, TgZ), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T80, T8n, T8f, T8j, T8A, T8P, T8H, T8L, T7n, T8M, T8O, T8c, T8k, T8t, T8E;
+		    V T8m;
+		    {
+			 V T7G, T8d, T7Z, T8e;
+			 {
+			      V T7y, T7F, T7R, T7Y;
+			      T7y = VADD(T7q, T7x);
+			      T7F = VADD(T7B, T7E);
+			      T7G = VFMA(LDK(KP989176509), T7y, VMUL(LDK(KP146730474), T7F));
+			      T8d = VFNMS(LDK(KP146730474), T7y, VMUL(LDK(KP989176509), T7F));
+			      T7R = VADD(T7J, T7Q);
+			      T7Y = VADD(T7U, T7X);
+			      T7Z = VFNMS(LDK(KP146730474), T7Y, VMUL(LDK(KP989176509), T7R));
+			      T8e = VFMA(LDK(KP146730474), T7R, VMUL(LDK(KP989176509), T7Y));
+			 }
+			 T80 = VADD(T7G, T7Z);
+			 T8n = VSUB(T8e, T8d);
+			 T8f = VADD(T8d, T8e);
+			 T8j = VSUB(T7Z, T7G);
+		    }
+		    {
+			 V T8w, T8F, T8z, T8G;
+			 {
+			      V T8u, T8v, T8x, T8y;
+			      T8u = VSUB(T7q, T7x);
+			      T8v = VSUB(T7E, T7B);
+			      T8w = VFMA(LDK(KP803207531), T8u, VMUL(LDK(KP595699304), T8v));
+			      T8F = VFNMS(LDK(KP595699304), T8u, VMUL(LDK(KP803207531), T8v));
+			      T8x = VSUB(T7J, T7Q);
+			      T8y = VSUB(T7X, T7U);
+			      T8z = VFNMS(LDK(KP595699304), T8y, VMUL(LDK(KP803207531), T8x));
+			      T8G = VFMA(LDK(KP595699304), T8x, VMUL(LDK(KP803207531), T8y));
+			 }
+			 T8A = VADD(T8w, T8z);
+			 T8P = VSUB(T8G, T8F);
+			 T8H = VADD(T8F, T8G);
+			 T8L = VSUB(T8z, T8w);
+		    }
+		    {
+			 V T77, T8r, T88, T8C, T7m, T8D, T8b, T8s, T76, T87;
+			 T76 = VADD(T72, T75);
+			 T77 = VADD(T6Z, T76);
+			 T8r = VSUB(T6Z, T76);
+			 T87 = VADD(T85, T86);
+			 T88 = VADD(T84, T87);
+			 T8C = VSUB(T87, T84);
+			 {
+			      V T7e, T7l, T89, T8a;
+			      T7e = VFMA(LDK(KP956940335), T7a, VMUL(LDK(KP290284677), T7d));
+			      T7l = VFNMS(LDK(KP290284677), T7k, VMUL(LDK(KP956940335), T7h));
+			      T7m = VADD(T7e, T7l);
+			      T8D = VSUB(T7l, T7e);
+			      T89 = VFNMS(LDK(KP290284677), T7a, VMUL(LDK(KP956940335), T7d));
+			      T8a = VFMA(LDK(KP290284677), T7h, VMUL(LDK(KP956940335), T7k));
+			      T8b = VADD(T89, T8a);
+			      T8s = VSUB(T8a, T89);
+			 }
+			 T7n = VADD(T77, T7m);
+			 T8M = VSUB(T8D, T8C);
+			 T8O = VSUB(T8r, T8s);
+			 T8c = VADD(T88, T8b);
+			 T8k = VSUB(T8b, T88);
+			 T8t = VADD(T8r, T8s);
+			 T8E = VADD(T8C, T8D);
+			 T8m = VSUB(T77, T7m);
+		    }
+		    {
+			 V T81, T8g, T8N, T8Q;
+			 T81 = VADD(T7n, T80);
+			 T8g = VBYI(VADD(T8c, T8f));
+			 ST(&(xo[WS(os, 125)]), VSUB(T81, T8g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 3)]), VADD(T81, T8g), ovs, &(xo[WS(os, 1)]));
+			 T8N = VBYI(VSUB(T8L, T8M));
+			 T8Q = VSUB(T8O, T8P);
+			 ST(&(xo[WS(os, 45)]), VADD(T8N, T8Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 83)]), VSUB(T8Q, T8N), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8R, T8S, T8h, T8i;
+			 T8R = VBYI(VADD(T8M, T8L));
+			 T8S = VADD(T8O, T8P);
+			 ST(&(xo[WS(os, 19)]), VADD(T8R, T8S), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 109)]), VSUB(T8S, T8R), ovs, &(xo[WS(os, 1)]));
+			 T8h = VSUB(T7n, T80);
+			 T8i = VBYI(VSUB(T8f, T8c));
+			 ST(&(xo[WS(os, 67)]), VSUB(T8h, T8i), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 61)]), VADD(T8h, T8i), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8l, T8o, T8B, T8I;
+			 T8l = VBYI(VSUB(T8j, T8k));
+			 T8o = VSUB(T8m, T8n);
+			 ST(&(xo[WS(os, 35)]), VADD(T8l, T8o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 93)]), VSUB(T8o, T8l), ovs, &(xo[WS(os, 1)]));
+			 T8B = VADD(T8t, T8A);
+			 T8I = VBYI(VADD(T8E, T8H));
+			 ST(&(xo[WS(os, 115)]), VSUB(T8B, T8I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 13)]), VADD(T8B, T8I), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T8J, T8K, T8p, T8q;
+			 T8J = VSUB(T8t, T8A);
+			 T8K = VBYI(VSUB(T8H, T8E));
+			 ST(&(xo[WS(os, 77)]), VSUB(T8J, T8K), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 51)]), VADD(T8J, T8K), ovs, &(xo[WS(os, 1)]));
+			 T8p = VBYI(VADD(T8k, T8j));
+			 T8q = VADD(T8m, T8n);
+			 ST(&(xo[WS(os, 29)]), VADD(T8p, T8q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 99)]), VSUB(T8q, T8p), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T4K, T5d, T55, T59, T5q, T5F, T5x, T5B, T2f, T5C, T5E, T52, T5a, T5j, T5u;
+		    V T5c;
+		    {
+			 V T3u, T53, T4J, T54;
+			 {
+			      V T3g, T3t, T4v, T4I;
+			      T3g = VADD(T2G, T3f);
+			      T3t = VADD(T3p, T3s);
+			      T3u = VFMA(LDK(KP998795456), T3g, VMUL(LDK(KP049067674), T3t));
+			      T53 = VFNMS(LDK(KP049067674), T3g, VMUL(LDK(KP998795456), T3t));
+			      T4v = VADD(T3V, T4u);
+			      T4I = VADD(T4E, T4H);
+			      T4J = VFNMS(LDK(KP049067674), T4I, VMUL(LDK(KP998795456), T4v));
+			      T54 = VFMA(LDK(KP049067674), T4v, VMUL(LDK(KP998795456), T4I));
+			 }
+			 T4K = VADD(T3u, T4J);
+			 T5d = VSUB(T54, T53);
+			 T55 = VADD(T53, T54);
+			 T59 = VSUB(T4J, T3u);
+		    }
+		    {
+			 V T5m, T5v, T5p, T5w;
+			 {
+			      V T5k, T5l, T5n, T5o;
+			      T5k = VSUB(T2G, T3f);
+			      T5l = VSUB(T3s, T3p);
+			      T5m = VFMA(LDK(KP740951125), T5k, VMUL(LDK(KP671558954), T5l));
+			      T5v = VFNMS(LDK(KP671558954), T5k, VMUL(LDK(KP740951125), T5l));
+			      T5n = VSUB(T3V, T4u);
+			      T5o = VSUB(T4H, T4E);
+			      T5p = VFNMS(LDK(KP671558954), T5o, VMUL(LDK(KP740951125), T5n));
+			      T5w = VFMA(LDK(KP671558954), T5n, VMUL(LDK(KP740951125), T5o));
+			 }
+			 T5q = VADD(T5m, T5p);
+			 T5F = VSUB(T5w, T5v);
+			 T5x = VADD(T5v, T5w);
+			 T5B = VSUB(T5p, T5m);
+		    }
+		    {
+			 V T11, T5h, T4Y, T5s, T2e, T5t, T51, T5i, T10, T4X;
+			 T10 = VADD(TI, TZ);
+			 T11 = VADD(Tr, T10);
+			 T5h = VSUB(Tr, T10);
+			 T4X = VADD(T4V, T4W);
+			 T4Y = VADD(T4U, T4X);
+			 T5s = VSUB(T4X, T4U);
+			 {
+			      V T1C, T2d, T4Z, T50;
+			      T1C = VFMA(LDK(KP098017140), T1s, VMUL(LDK(KP995184726), T1B));
+			      T2d = VFNMS(LDK(KP098017140), T2c, VMUL(LDK(KP995184726), T23));
+			      T2e = VADD(T1C, T2d);
+			      T5t = VSUB(T2d, T1C);
+			      T4Z = VFNMS(LDK(KP098017140), T1B, VMUL(LDK(KP995184726), T1s));
+			      T50 = VFMA(LDK(KP995184726), T2c, VMUL(LDK(KP098017140), T23));
+			      T51 = VADD(T4Z, T50);
+			      T5i = VSUB(T50, T4Z);
+			 }
+			 T2f = VADD(T11, T2e);
+			 T5C = VSUB(T5t, T5s);
+			 T5E = VSUB(T5h, T5i);
+			 T52 = VADD(T4Y, T51);
+			 T5a = VSUB(T51, T4Y);
+			 T5j = VADD(T5h, T5i);
+			 T5u = VADD(T5s, T5t);
+			 T5c = VSUB(T11, T2e);
+		    }
+		    {
+			 V T4L, T56, T5D, T5G;
+			 T4L = VADD(T2f, T4K);
+			 T56 = VBYI(VADD(T52, T55));
+			 ST(&(xo[WS(os, 127)]), VSUB(T4L, T56), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(T4L, T56), ovs, &(xo[WS(os, 1)]));
+			 T5D = VBYI(VSUB(T5B, T5C));
+			 T5G = VSUB(T5E, T5F);
+			 ST(&(xo[WS(os, 47)]), VADD(T5D, T5G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 81)]), VSUB(T5G, T5D), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5H, T5I, T57, T58;
+			 T5H = VBYI(VADD(T5C, T5B));
+			 T5I = VADD(T5E, T5F);
+			 ST(&(xo[WS(os, 17)]), VADD(T5H, T5I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 111)]), VSUB(T5I, T5H), ovs, &(xo[WS(os, 1)]));
+			 T57 = VSUB(T2f, T4K);
+			 T58 = VBYI(VSUB(T55, T52));
+			 ST(&(xo[WS(os, 65)]), VSUB(T57, T58), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 63)]), VADD(T57, T58), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5b, T5e, T5r, T5y;
+			 T5b = VBYI(VSUB(T59, T5a));
+			 T5e = VSUB(T5c, T5d);
+			 ST(&(xo[WS(os, 33)]), VADD(T5b, T5e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 95)]), VSUB(T5e, T5b), ovs, &(xo[WS(os, 1)]));
+			 T5r = VADD(T5j, T5q);
+			 T5y = VBYI(VADD(T5u, T5x));
+			 ST(&(xo[WS(os, 113)]), VSUB(T5r, T5y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 15)]), VADD(T5r, T5y), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T5z, T5A, T5f, T5g;
+			 T5z = VSUB(T5j, T5q);
+			 T5A = VBYI(VSUB(T5x, T5u));
+			 ST(&(xo[WS(os, 79)]), VSUB(T5z, T5A), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 49)]), VADD(T5z, T5A), ovs, &(xo[WS(os, 1)]));
+			 T5f = VBYI(VADD(T5a, T59));
+			 T5g = VADD(T5c, T5d);
+			 ST(&(xo[WS(os, 31)]), VADD(T5f, T5g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 97)]), VSUB(T5g, T5f), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T9i, T9B, T9t, T9x, T9O, Ta3, T9V, T9Z, T93, Ta0, Ta2, T9q, T9y, T9H, T9S;
+		    V T9A;
+		    {
+			 V T9a, T9r, T9h, T9s;
+			 {
+			      V T96, T99, T9d, T9g;
+			      T96 = VSUB(T94, T95);
+			      T99 = VSUB(T97, T98);
+			      T9a = VFMA(LDK(KP514102744), T96, VMUL(LDK(KP857728610), T99));
+			      T9r = VFNMS(LDK(KP514102744), T99, VMUL(LDK(KP857728610), T96));
+			      T9d = VSUB(T9b, T9c);
+			      T9g = VSUB(T9e, T9f);
+			      T9h = VFNMS(LDK(KP514102744), T9g, VMUL(LDK(KP857728610), T9d));
+			      T9s = VFMA(LDK(KP857728610), T9g, VMUL(LDK(KP514102744), T9d));
+			 }
+			 T9i = VADD(T9a, T9h);
+			 T9B = VSUB(T9s, T9r);
+			 T9t = VADD(T9r, T9s);
+			 T9x = VSUB(T9h, T9a);
+		    }
+		    {
+			 V T9K, T9T, T9N, T9U;
+			 {
+			      V T9I, T9J, T9L, T9M;
+			      T9I = VADD(T95, T94);
+			      T9J = VADD(T97, T98);
+			      T9K = VFMA(LDK(KP242980179), T9I, VMUL(LDK(KP970031253), T9J));
+			      T9T = VFNMS(LDK(KP242980179), T9J, VMUL(LDK(KP970031253), T9I));
+			      T9L = VADD(T9b, T9c);
+			      T9M = VADD(T9f, T9e);
+			      T9N = VFNMS(LDK(KP242980179), T9M, VMUL(LDK(KP970031253), T9L));
+			      T9U = VFMA(LDK(KP970031253), T9M, VMUL(LDK(KP242980179), T9L));
+			 }
+			 T9O = VADD(T9K, T9N);
+			 Ta3 = VSUB(T9U, T9T);
+			 T9V = VADD(T9T, T9U);
+			 T9Z = VSUB(T9N, T9K);
+		    }
+		    {
+			 V T8V, T9F, T9m, T9Q, T92, T9R, T9p, T9G, T8U, T9k;
+			 T8U = VSUB(T86, T85);
+			 T8V = VSUB(T8T, T8U);
+			 T9F = VADD(T8T, T8U);
+			 T9k = VSUB(T75, T72);
+			 T9m = VSUB(T9k, T9l);
+			 T9Q = VADD(T9l, T9k);
+			 {
+			      V T8Y, T91, T9n, T9o;
+			      T8Y = VFMA(LDK(KP471396736), T8W, VMUL(LDK(KP881921264), T8X));
+			      T91 = VFNMS(LDK(KP471396736), T90, VMUL(LDK(KP881921264), T8Z));
+			      T92 = VSUB(T8Y, T91);
+			      T9R = VADD(T91, T8Y);
+			      T9n = VFNMS(LDK(KP471396736), T8X, VMUL(LDK(KP881921264), T8W));
+			      T9o = VFMA(LDK(KP881921264), T90, VMUL(LDK(KP471396736), T8Z));
+			      T9p = VSUB(T9n, T9o);
+			      T9G = VADD(T9o, T9n);
+			 }
+			 T93 = VADD(T8V, T92);
+			 Ta0 = VSUB(T9R, T9Q);
+			 Ta2 = VSUB(T9F, T9G);
+			 T9q = VADD(T9m, T9p);
+			 T9y = VSUB(T9p, T9m);
+			 T9H = VADD(T9F, T9G);
+			 T9S = VADD(T9Q, T9R);
+			 T9A = VSUB(T8V, T92);
+		    }
+		    {
+			 V T9j, T9u, Ta1, Ta4;
+			 T9j = VADD(T93, T9i);
+			 T9u = VBYI(VADD(T9q, T9t));
+			 ST(&(xo[WS(os, 117)]), VSUB(T9j, T9u), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 11)]), VADD(T9j, T9u), ovs, &(xo[WS(os, 1)]));
+			 Ta1 = VBYI(VSUB(T9Z, Ta0));
+			 Ta4 = VSUB(Ta2, Ta3);
+			 ST(&(xo[WS(os, 37)]), VADD(Ta1, Ta4), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 91)]), VSUB(Ta4, Ta1), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V Ta5, Ta6, T9v, T9w;
+			 Ta5 = VBYI(VADD(Ta0, T9Z));
+			 Ta6 = VADD(Ta2, Ta3);
+			 ST(&(xo[WS(os, 27)]), VADD(Ta5, Ta6), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 101)]), VSUB(Ta6, Ta5), ovs, &(xo[WS(os, 1)]));
+			 T9v = VSUB(T93, T9i);
+			 T9w = VBYI(VSUB(T9t, T9q));
+			 ST(&(xo[WS(os, 75)]), VSUB(T9v, T9w), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 53)]), VADD(T9v, T9w), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T9z, T9C, T9P, T9W;
+			 T9z = VBYI(VSUB(T9x, T9y));
+			 T9C = VSUB(T9A, T9B);
+			 ST(&(xo[WS(os, 43)]), VADD(T9z, T9C), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 85)]), VSUB(T9C, T9z), ovs, &(xo[WS(os, 1)]));
+			 T9P = VADD(T9H, T9O);
+			 T9W = VBYI(VADD(T9S, T9V));
+			 ST(&(xo[WS(os, 123)]), VSUB(T9P, T9W), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VADD(T9P, T9W), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T9X, T9Y, T9D, T9E;
+			 T9X = VSUB(T9H, T9O);
+			 T9Y = VBYI(VSUB(T9V, T9S));
+			 ST(&(xo[WS(os, 69)]), VSUB(T9X, T9Y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 59)]), VADD(T9X, T9Y), ovs, &(xo[WS(os, 1)]));
+			 T9D = VBYI(VADD(T9y, T9x));
+			 T9E = VADD(T9A, T9B);
+			 ST(&(xo[WS(os, 21)]), VADD(T9D, T9E), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 107)]), VSUB(T9E, T9D), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T68, T6r, T6j, T6n, T6E, T6T, T6L, T6P, T5T, T6Q, T6S, T6g, T6o, T6x, T6I;
+		    V T6q;
+		    {
+			 V T60, T6h, T67, T6i;
+			 {
+			      V T5W, T5Z, T63, T66;
+			      T5W = VSUB(T5U, T5V);
+			      T5Z = VSUB(T5X, T5Y);
+			      T60 = VFMA(LDK(KP427555093), T5W, VMUL(LDK(KP903989293), T5Z));
+			      T6h = VFNMS(LDK(KP427555093), T5Z, VMUL(LDK(KP903989293), T5W));
+			      T63 = VSUB(T61, T62);
+			      T66 = VSUB(T64, T65);
+			      T67 = VFNMS(LDK(KP427555093), T66, VMUL(LDK(KP903989293), T63));
+			      T6i = VFMA(LDK(KP903989293), T66, VMUL(LDK(KP427555093), T63));
+			 }
+			 T68 = VADD(T60, T67);
+			 T6r = VSUB(T6i, T6h);
+			 T6j = VADD(T6h, T6i);
+			 T6n = VSUB(T67, T60);
+		    }
+		    {
+			 V T6A, T6J, T6D, T6K;
+			 {
+			      V T6y, T6z, T6B, T6C;
+			      T6y = VADD(T5V, T5U);
+			      T6z = VADD(T5X, T5Y);
+			      T6A = VFMA(LDK(KP336889853), T6y, VMUL(LDK(KP941544065), T6z));
+			      T6J = VFNMS(LDK(KP336889853), T6z, VMUL(LDK(KP941544065), T6y));
+			      T6B = VADD(T61, T62);
+			      T6C = VADD(T65, T64);
+			      T6D = VFNMS(LDK(KP336889853), T6C, VMUL(LDK(KP941544065), T6B));
+			      T6K = VFMA(LDK(KP941544065), T6C, VMUL(LDK(KP336889853), T6B));
+			 }
+			 T6E = VADD(T6A, T6D);
+			 T6T = VSUB(T6K, T6J);
+			 T6L = VADD(T6J, T6K);
+			 T6P = VSUB(T6D, T6A);
+		    }
+		    {
+			 V T5L, T6v, T6c, T6G, T5S, T6H, T6f, T6w, T5K, T6a;
+			 T5K = VSUB(T4W, T4V);
+			 T5L = VSUB(T5J, T5K);
+			 T6v = VADD(T5J, T5K);
+			 T6a = VSUB(TZ, TI);
+			 T6c = VSUB(T6a, T6b);
+			 T6G = VADD(T6b, T6a);
+			 {
+			      V T5O, T5R, T6d, T6e;
+			      T5O = VFMA(LDK(KP773010453), T5M, VMUL(LDK(KP634393284), T5N));
+			      T5R = VFNMS(LDK(KP634393284), T5Q, VMUL(LDK(KP773010453), T5P));
+			      T5S = VSUB(T5O, T5R);
+			      T6H = VADD(T5R, T5O);
+			      T6d = VFNMS(LDK(KP634393284), T5M, VMUL(LDK(KP773010453), T5N));
+			      T6e = VFMA(LDK(KP634393284), T5P, VMUL(LDK(KP773010453), T5Q));
+			      T6f = VSUB(T6d, T6e);
+			      T6w = VADD(T6e, T6d);
+			 }
+			 T5T = VADD(T5L, T5S);
+			 T6Q = VSUB(T6H, T6G);
+			 T6S = VSUB(T6v, T6w);
+			 T6g = VADD(T6c, T6f);
+			 T6o = VSUB(T6f, T6c);
+			 T6x = VADD(T6v, T6w);
+			 T6I = VADD(T6G, T6H);
+			 T6q = VSUB(T5L, T5S);
+		    }
+		    {
+			 V T69, T6k, T6R, T6U;
+			 T69 = VADD(T5T, T68);
+			 T6k = VBYI(VADD(T6g, T6j));
+			 ST(&(xo[WS(os, 119)]), VSUB(T69, T6k), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VADD(T69, T6k), ovs, &(xo[WS(os, 1)]));
+			 T6R = VBYI(VSUB(T6P, T6Q));
+			 T6U = VSUB(T6S, T6T);
+			 ST(&(xo[WS(os, 39)]), VADD(T6R, T6U), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 89)]), VSUB(T6U, T6R), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6V, T6W, T6l, T6m;
+			 T6V = VBYI(VADD(T6Q, T6P));
+			 T6W = VADD(T6S, T6T);
+			 ST(&(xo[WS(os, 25)]), VADD(T6V, T6W), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 103)]), VSUB(T6W, T6V), ovs, &(xo[WS(os, 1)]));
+			 T6l = VSUB(T5T, T68);
+			 T6m = VBYI(VSUB(T6j, T6g));
+			 ST(&(xo[WS(os, 73)]), VSUB(T6l, T6m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 55)]), VADD(T6l, T6m), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6p, T6s, T6F, T6M;
+			 T6p = VBYI(VSUB(T6n, T6o));
+			 T6s = VSUB(T6q, T6r);
+			 ST(&(xo[WS(os, 41)]), VADD(T6p, T6s), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 87)]), VSUB(T6s, T6p), ovs, &(xo[WS(os, 1)]));
+			 T6F = VADD(T6x, T6E);
+			 T6M = VBYI(VADD(T6I, T6L));
+			 ST(&(xo[WS(os, 121)]), VSUB(T6F, T6M), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(T6F, T6M), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T6N, T6O, T6t, T6u;
+			 T6N = VSUB(T6x, T6E);
+			 T6O = VBYI(VSUB(T6L, T6I));
+			 ST(&(xo[WS(os, 71)]), VSUB(T6N, T6O), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 57)]), VADD(T6N, T6O), ovs, &(xo[WS(os, 1)]));
+			 T6t = VBYI(VADD(T6o, T6n));
+			 T6u = VADD(T6q, T6r);
+			 ST(&(xo[WS(os, 23)]), VADD(T6t, T6u), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 105)]), VSUB(T6u, T6t), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 128, XSIMD_STRING("n1fv_128"), {938, 186, 144, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_128) (planner *p) {
+     X(kdft_register) (p, n1fv_128, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 13 -name n1fv_13 -include n1f.h */
+
+/*
+ * This function contains 88 FP additions, 63 FP multiplications,
+ * (or, 31 additions, 6 multiplications, 57 fused multiply/add),
+ * 96 stack variables, 23 constants, and 26 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP904176221, +0.904176221990848204433795481776887926501523162);
+     DVK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DVK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DVK(KP516520780, +0.516520780623489722840901288569017135705033622);
+     DVK(KP522026385, +0.522026385161275033714027226654165028300441940);
+     DVK(KP957805992, +0.957805992594665126462521754605754580515587217);
+     DVK(KP600477271, +0.600477271932665282925769253334763009352012849);
+     DVK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DVK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DVK(KP769338817, +0.769338817572980603471413688209101117038278899);
+     DVK(KP859542535, +0.859542535098774820163672132761689612766401925);
+     DVK(KP581704778, +0.581704778510515730456870384989698884939833902);
+     DVK(KP853480001, +0.853480001859823990758994934970528322872359049);
+     DVK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DVK(KP226109445, +0.226109445035782405468510155372505010481906348);
+     DVK(KP301479260, +0.301479260047709873958013540496673347309208464);
+     DVK(KP686558370, +0.686558370781754340655719594850823015421401653);
+     DVK(KP514918778, +0.514918778086315755491789696138117261566051239);
+     DVK(KP038632954, +0.038632954644348171955506895830342264440241080);
+     DVK(KP612264650, +0.612264650376756543746494474777125408779395514);
+     DVK(KP302775637, +0.302775637731994646559610633735247973125648287);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) {
+	       V T1, T7, T2, Tg, Tf, TN, Th, Tq, Ta, Tj, T5, Tr, Tk;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V Td, Te, T8, T9, T3, T4;
+		    Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T4 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = VADD(Td, Te);
+		    TN = VSUB(Td, Te);
+		    Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tq = VSUB(T8, T9);
+		    Ta = VADD(T8, T9);
+		    Tj = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = VADD(T3, T4);
+		    Tr = VSUB(T4, T3);
+		    Tk = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       }
+	       {
+		    V Tt, Ti, Ty, Tb, Ts, TQ, Tx, T6, Tu, Tl;
+		    Tt = VSUB(Tg, Th);
+		    Ti = VADD(Tg, Th);
+		    Ty = VFMS(LDK(KP500000000), Ta, T7);
+		    Tb = VADD(T7, Ta);
+		    Ts = VSUB(Tq, Tr);
+		    TQ = VADD(Tr, Tq);
+		    Tx = VFNMS(LDK(KP500000000), T5, T2);
+		    T6 = VADD(T2, T5);
+		    Tu = VSUB(Tj, Tk);
+		    Tl = VADD(Tj, Tk);
+		    {
+			 V TK, Tz, Tc, TX, Tv, TO, TL, Tm;
+			 TK = VADD(Tx, Ty);
+			 Tz = VSUB(Tx, Ty);
+			 Tc = VADD(T6, Tb);
+			 TX = VSUB(T6, Tb);
+			 Tv = VSUB(Tt, Tu);
+			 TO = VADD(Tt, Tu);
+			 TL = VSUB(Ti, Tl);
+			 Tm = VADD(Ti, Tl);
+			 {
+			      V TF, Tw, TP, TY, TT, TM, TA, Tn;
+			      TF = VSUB(Ts, Tv);
+			      Tw = VADD(Ts, Tv);
+			      TP = VFNMS(LDK(KP500000000), TO, TN);
+			      TY = VADD(TN, TO);
+			      TT = VFNMS(LDK(KP866025403), TL, TK);
+			      TM = VFMA(LDK(KP866025403), TL, TK);
+			      TA = VFNMS(LDK(KP500000000), Tm, Tf);
+			      Tn = VADD(Tf, Tm);
+			      {
+				   V T1f, T1n, TI, T18, T1k, T1c, TD, T17, T10, T1m, T16, T1e, TU, TR;
+				   TU = VFNMS(LDK(KP866025403), TQ, TP);
+				   TR = VFMA(LDK(KP866025403), TQ, TP);
+				   {
+					V TZ, T15, TE, TB;
+					TZ = VFMA(LDK(KP302775637), TY, TX);
+					T15 = VFNMS(LDK(KP302775637), TX, TY);
+					TE = VSUB(Tz, TA);
+					TB = VADD(Tz, TA);
+					{
+					     V TH, To, TV, T13;
+					     TH = VSUB(Tc, Tn);
+					     To = VADD(Tc, Tn);
+					     TV = VFNMS(LDK(KP612264650), TU, TT);
+					     T13 = VFMA(LDK(KP612264650), TT, TU);
+					     {
+						  V TS, T12, TG, T1b;
+						  TS = VFNMS(LDK(KP038632954), TR, TM);
+						  T12 = VFMA(LDK(KP038632954), TM, TR);
+						  TG = VFNMS(LDK(KP514918778), TF, TE);
+						  T1b = VFMA(LDK(KP686558370), TE, TF);
+						  {
+						       V TC, T1a, Tp, TW, T14;
+						       TC = VFMA(LDK(KP301479260), TB, Tw);
+						       T1a = VFNMS(LDK(KP226109445), Tw, TB);
+						       Tp = VFNMS(LDK(KP083333333), To, T1);
+						       ST(&(xo[0]), VADD(T1, To), ovs, &(xo[0]));
+						       T1f = VFMA(LDK(KP853480001), TV, TS);
+						       TW = VFNMS(LDK(KP853480001), TV, TS);
+						       T1n = VFMA(LDK(KP853480001), T13, T12);
+						       T14 = VFNMS(LDK(KP853480001), T13, T12);
+						       TI = VFMA(LDK(KP581704778), TH, TG);
+						       T18 = VFNMS(LDK(KP859542535), TG, TH);
+						       T1k = VFMA(LDK(KP769338817), T1b, T1a);
+						       T1c = VFNMS(LDK(KP769338817), T1b, T1a);
+						       TD = VFMA(LDK(KP503537032), TC, Tp);
+						       T17 = VFNMS(LDK(KP251768516), TC, Tp);
+						       T10 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), TZ, TW));
+						       T1m = VFNMS(LDK(KP522026385), TW, TZ);
+						       T16 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), T15, T14));
+						       T1e = VFNMS(LDK(KP522026385), T14, T15);
+						  }
+					     }
+					}
+				   }
+				   {
+					V T1o, T1q, T1g, T1i, T1d, T1h, T1l, T1p;
+					{
+					     V T11, TJ, T19, T1j;
+					     T11 = VFMA(LDK(KP516520780), TI, TD);
+					     TJ = VFNMS(LDK(KP516520780), TI, TD);
+					     T19 = VFMA(LDK(KP300462606), T18, T17);
+					     T1j = VFNMS(LDK(KP300462606), T18, T17);
+					     T1o = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1n, T1m));
+					     T1q = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1n, T1m));
+					     T1g = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1f, T1e));
+					     T1i = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1f, T1e));
+					     ST(&(xo[WS(os, 12)]), VFNMSI(T16, T11), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 1)]), VFMAI(T16, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 8)]), VFMAI(T10, TJ), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 5)]), VFNMSI(T10, TJ), ovs, &(xo[WS(os, 1)]));
+					     T1d = VFNMS(LDK(KP503537032), T1c, T19);
+					     T1h = VFMA(LDK(KP503537032), T1c, T19);
+					     T1l = VFNMS(LDK(KP503537032), T1k, T1j);
+					     T1p = VFMA(LDK(KP503537032), T1k, T1j);
+					}
+					ST(&(xo[WS(os, 9)]), VFMAI(T1g, T1d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 4)]), VFNMSI(T1g, T1d), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 10)]), VFNMSI(T1i, T1h), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 3)]), VFMAI(T1i, T1h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 7)]), VFMAI(T1o, T1l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 6)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 11)]), VFMAI(T1q, T1p), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 2)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 13, XSIMD_STRING("n1fv_13"), {31, 6, 57, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_13) (planner *p) {
+     X(kdft_register) (p, n1fv_13, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 13 -name n1fv_13 -include n1f.h */
+
+/*
+ * This function contains 88 FP additions, 34 FP multiplications,
+ * (or, 69 additions, 15 multiplications, 19 fused multiply/add),
+ * 60 stack variables, 20 constants, and 26 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DVK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DVK(KP075902986, +0.075902986037193865983102897245103540356428373);
+     DVK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DVK(KP132983124, +0.132983124607418643793760531921092974399165133);
+     DVK(KP258260390, +0.258260390311744861420450644284508567852516811);
+     DVK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DVK(KP300238635, +0.300238635966332641462884626667381504676006424);
+     DVK(KP011599105, +0.011599105605768290721655456654083252189827041);
+     DVK(KP156891391, +0.156891391051584611046832726756003269660212636);
+     DVK(KP256247671, +0.256247671582936600958684654061725059144125175);
+     DVK(KP174138601, +0.174138601152135905005660794929264742616964676);
+     DVK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DVK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DVK(KP113854479, +0.113854479055790798974654345867655310534642560);
+     DVK(KP265966249, +0.265966249214837287587521063842185948798330267);
+     DVK(KP387390585, +0.387390585467617292130675966426762851778775217);
+     DVK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) {
+	       V TW, Tb, Tm, Tu, TC, TR, TX, TK, TU, Tz, TB, TN, TT;
+	       TW = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T3, TH, Tl, Tw, Tp, Tg, Tv, To, T6, Tr, T9, Ts, Ta, TI, T1;
+		    V T2, Tq, Tt;
+		    T1 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = VSUB(T1, T2);
+		    TH = VADD(T1, T2);
+		    {
+			 V Th, Ti, Tj, Tk;
+			 Th = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tj = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tk = VADD(Ti, Tj);
+			 Tl = VADD(Th, Tk);
+			 Tw = VSUB(Ti, Tj);
+			 Tp = VFNMS(LDK(KP500000000), Tk, Th);
+		    }
+		    {
+			 V Tc, Td, Te, Tf;
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Tf = VADD(Td, Te);
+			 Tg = VADD(Tc, Tf);
+			 Tv = VSUB(Td, Te);
+			 To = VFNMS(LDK(KP500000000), Tf, Tc);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 Tr = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 Ts = VADD(T7, T8);
+		    }
+		    Ta = VADD(T6, T9);
+		    TI = VADD(Tr, Ts);
+		    Tb = VADD(T3, Ta);
+		    Tm = VSUB(Tg, Tl);
+		    Tq = VSUB(To, Tp);
+		    Tt = VMUL(LDK(KP866025403), VSUB(Tr, Ts));
+		    Tu = VADD(Tq, Tt);
+		    TC = VSUB(Tq, Tt);
+		    {
+			 V TP, TQ, TG, TJ;
+			 TP = VADD(Tg, Tl);
+			 TQ = VADD(TH, TI);
+			 TR = VMUL(LDK(KP300462606), VSUB(TP, TQ));
+			 TX = VADD(TP, TQ);
+			 TG = VADD(To, Tp);
+			 TJ = VFNMS(LDK(KP500000000), TI, TH);
+			 TK = VSUB(TG, TJ);
+			 TU = VADD(TG, TJ);
+		    }
+		    {
+			 V Tx, Ty, TL, TM;
+			 Tx = VMUL(LDK(KP866025403), VSUB(Tv, Tw));
+			 Ty = VFNMS(LDK(KP500000000), Ta, T3);
+			 Tz = VSUB(Tx, Ty);
+			 TB = VADD(Tx, Ty);
+			 TL = VADD(Tv, Tw);
+			 TM = VSUB(T6, T9);
+			 TN = VSUB(TL, TM);
+			 TT = VADD(TL, TM);
+		    }
+	       }
+	       ST(&(xo[0]), VADD(TW, TX), ovs, &(xo[0]));
+	       {
+		    V T19, T1n, T14, T13, T1f, T1k, Tn, TE, T1e, T1j, TS, T1m, TZ, T1c, TA;
+		    V TD;
+		    {
+			 V T17, T18, T11, T12;
+			 T17 = VFMA(LDK(KP387390585), TN, VMUL(LDK(KP265966249), TK));
+			 T18 = VFNMS(LDK(KP503537032), TU, VMUL(LDK(KP113854479), TT));
+			 T19 = VSUB(T17, T18);
+			 T1n = VADD(T17, T18);
+			 T14 = VFMA(LDK(KP575140729), Tm, VMUL(LDK(KP174138601), Tb));
+			 T11 = VFNMS(LDK(KP156891391), TB, VMUL(LDK(KP256247671), TC));
+			 T12 = VFMA(LDK(KP011599105), Tz, VMUL(LDK(KP300238635), Tu));
+			 T13 = VSUB(T11, T12);
+			 T1f = VADD(T14, T13);
+			 T1k = VMUL(LDK(KP1_732050807), VADD(T11, T12));
+		    }
+		    Tn = VFNMS(LDK(KP174138601), Tm, VMUL(LDK(KP575140729), Tb));
+		    TA = VFNMS(LDK(KP300238635), Tz, VMUL(LDK(KP011599105), Tu));
+		    TD = VFMA(LDK(KP256247671), TB, VMUL(LDK(KP156891391), TC));
+		    TE = VSUB(TA, TD);
+		    T1e = VMUL(LDK(KP1_732050807), VADD(TD, TA));
+		    T1j = VSUB(Tn, TE);
+		    {
+			 V TO, T1b, TV, TY, T1a;
+			 TO = VFNMS(LDK(KP132983124), TN, VMUL(LDK(KP258260390), TK));
+			 T1b = VSUB(TR, TO);
+			 TV = VFMA(LDK(KP251768516), TT, VMUL(LDK(KP075902986), TU));
+			 TY = VFNMS(LDK(KP083333333), TX, TW);
+			 T1a = VSUB(TY, TV);
+			 TS = VFMA(LDK(KP2_000000000), TO, TR);
+			 T1m = VADD(T1b, T1a);
+			 TZ = VFMA(LDK(KP2_000000000), TV, TY);
+			 T1c = VSUB(T1a, T1b);
+		    }
+		    {
+			 V TF, T10, T1l, T1o;
+			 TF = VBYI(VFMA(LDK(KP2_000000000), TE, Tn));
+			 T10 = VADD(TS, TZ);
+			 ST(&(xo[WS(os, 1)]), VADD(TF, T10), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 12)]), VSUB(T10, TF), ovs, &(xo[0]));
+			 {
+			      V T15, T16, T1p, T1q;
+			      T15 = VBYI(VFMS(LDK(KP2_000000000), T13, T14));
+			      T16 = VSUB(TZ, TS);
+			      ST(&(xo[WS(os, 5)]), VADD(T15, T16), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 8)]), VSUB(T16, T15), ovs, &(xo[0]));
+			      T1p = VADD(T1n, T1m);
+			      T1q = VBYI(VADD(T1j, T1k));
+			      ST(&(xo[WS(os, 4)]), VSUB(T1p, T1q), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 9)]), VADD(T1q, T1p), ovs, &(xo[WS(os, 1)]));
+			 }
+			 T1l = VBYI(VSUB(T1j, T1k));
+			 T1o = VSUB(T1m, T1n);
+			 ST(&(xo[WS(os, 3)]), VADD(T1l, T1o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 10)]), VSUB(T1o, T1l), ovs, &(xo[0]));
+			 {
+			      V T1h, T1i, T1d, T1g;
+			      T1h = VBYI(VSUB(T1e, T1f));
+			      T1i = VSUB(T1c, T19);
+			      ST(&(xo[WS(os, 6)]), VADD(T1h, T1i), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 7)]), VSUB(T1i, T1h), ovs, &(xo[WS(os, 1)]));
+			      T1d = VADD(T19, T1c);
+			      T1g = VBYI(VADD(T1e, T1f));
+			      ST(&(xo[WS(os, 2)]), VSUB(T1d, T1g), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 11)]), VADD(T1g, T1d), ovs, &(xo[WS(os, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 13, XSIMD_STRING("n1fv_13"), {69, 15, 19, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_13) (planner *p) {
+     X(kdft_register) (p, n1fv_13, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n1fv_14 -include n1f.h */
+
+/*
+ * This function contains 74 FP additions, 48 FP multiplications,
+ * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
+ * 63 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V TH, T3, TP, Tn, Ta, Ts, TW, TK, TO, Tk, TM, Tg, TL, Td, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
+		    {
+			 V T4, T5, T7, T8, Tl, Tm;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 TH = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TI = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 TJ = VADD(T7, T8);
+			 T9 = VSUB(T7, T8);
+			 TP = VADD(Tl, Tm);
+			 Tn = VSUB(Tl, Tm);
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    }
+		    Ta = VADD(T6, T9);
+		    Ts = VSUB(T9, T6);
+		    TW = VSUB(TJ, TI);
+		    TK = VADD(TI, TJ);
+		    TO = VADD(Ti, Tj);
+		    Tk = VSUB(Ti, Tj);
+		    TM = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    TL = VADD(Tb, Tc);
+		    Td = VSUB(Tb, Tc);
+	       }
+	       {
+		    V T18, TB, T13, TY, TG, Tw, T11, Tr, T16, TT, Tz, TE, TU, TQ;
+		    TU = VSUB(TO, TP);
+		    TQ = VADD(TO, TP);
+		    {
+			 V Tt, To, TV, TN;
+			 Tt = VSUB(Tn, Tk);
+			 To = VADD(Tk, Tn);
+			 TV = VSUB(TL, TM);
+			 TN = VADD(TL, TM);
+			 {
+			      V Tu, Th, TZ, T17;
+			      Tu = VSUB(Tg, Td);
+			      Th = VADD(Td, Tg);
+			      TZ = VFNMS(LDK(KP356895867), TK, TQ);
+			      T17 = VFNMS(LDK(KP554958132), TU, TW);
+			      {
+				   V Tp, TA, T14, TR;
+				   Tp = VFNMS(LDK(KP356895867), Ta, To);
+				   TA = VFMA(LDK(KP554958132), Tt, Ts);
+				   ST(&(xo[0]), VADD(TH, VADD(TK, VADD(TN, TQ))), ovs, &(xo[0]));
+				   T14 = VFNMS(LDK(KP356895867), TN, TK);
+				   TR = VFNMS(LDK(KP356895867), TQ, TN);
+				   {
+					V T12, TX, Tx, TC;
+					T12 = VFMA(LDK(KP554958132), TV, TU);
+					TX = VFMA(LDK(KP554958132), TW, TV);
+					ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
+					Tx = VFNMS(LDK(KP356895867), Th, Ta);
+					TC = VFNMS(LDK(KP356895867), To, Th);
+					{
+					     V TF, Tv, T10, Tq;
+					     TF = VFNMS(LDK(KP554958132), Ts, Tu);
+					     Tv = VFMA(LDK(KP554958132), Tu, Tt);
+					     T10 = VFNMS(LDK(KP692021471), TZ, TN);
+					     T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
+					     Tq = VFNMS(LDK(KP692021471), Tp, Th);
+					     TB = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TA, Tu));
+					     {
+						  V T15, TS, Ty, TD;
+						  T15 = VFNMS(LDK(KP692021471), T14, TQ);
+						  TS = VFNMS(LDK(KP692021471), TR, TK);
+						  T13 = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), T12, TW));
+						  TY = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TX, TU));
+						  Ty = VFNMS(LDK(KP692021471), Tx, To);
+						  TD = VFNMS(LDK(KP692021471), TC, Ta);
+						  TG = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TF, Tt));
+						  Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
+						  T11 = VFNMS(LDK(KP900968867), T10, TH);
+						  Tr = VFNMS(LDK(KP900968867), Tq, T3);
+						  T16 = VFNMS(LDK(KP900968867), T15, TH);
+						  TT = VFNMS(LDK(KP900968867), TS, TH);
+						  Tz = VFNMS(LDK(KP900968867), Ty, T3);
+						  TE = VFNMS(LDK(KP900968867), TD, T3);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    ST(&(xo[WS(os, 12)]), VFNMSI(T13, T11), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VFMAI(T13, T11), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 9)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VFNMSI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 8)]), VFNMSI(T18, T16), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VFMAI(T18, T16), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VFNMSI(TY, TT), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VFMAI(TY, TT), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VFMAI(TB, Tz), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VFNMSI(TB, Tz), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFMAI(TG, TE), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VFNMSI(TG, TE), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n1fv_14"), {32, 6, 42, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_14) (planner *p) {
+     X(kdft_register) (p, n1fv_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n1fv_14 -include n1f.h */
+
+/*
+ * This function contains 74 FP additions, 36 FP multiplications,
+ * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
+ * 33 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V T3, Ty, To, TK, Tr, TE, Ta, TJ, Tq, TB, Th, TL, Ts, TH, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       Ty = VADD(T1, T2);
+	       {
+		    V Tk, TC, Tn, TD;
+		    {
+			 V Ti, Tj, Tl, Tm;
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VSUB(Ti, Tj);
+			 TC = VADD(Ti, Tj);
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = VSUB(Tl, Tm);
+			 TD = VADD(Tl, Tm);
+		    }
+		    To = VADD(Tk, Tn);
+		    TK = VSUB(TC, TD);
+		    Tr = VSUB(Tn, Tk);
+		    TE = VADD(TC, TD);
+	       }
+	       {
+		    V T6, Tz, T9, TA;
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tz = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VSUB(T7, T8);
+			 TA = VADD(T7, T8);
+		    }
+		    Ta = VADD(T6, T9);
+		    TJ = VSUB(TA, Tz);
+		    Tq = VSUB(T9, T6);
+		    TB = VADD(Tz, TA);
+	       }
+	       {
+		    V Td, TF, Tg, TG;
+		    {
+			 V Tb, Tc, Te, Tf;
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TF = VADD(Tb, Tc);
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = VSUB(Te, Tf);
+			 TG = VADD(Te, Tf);
+		    }
+		    Th = VADD(Td, Tg);
+		    TL = VSUB(TF, TG);
+		    Ts = VSUB(Tg, Td);
+		    TH = VADD(TF, TG);
+	       }
+	       ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Ty, VADD(TB, VADD(TH, TE))), ovs, &(xo[0]));
+	       {
+		    V Tt, Tp, TP, TQ;
+		    Tt = VBYI(VFNMS(LDK(KP781831482), Tr, VFNMS(LDK(KP433883739), Ts, VMUL(LDK(KP974927912), Tq))));
+		    Tp = VFMA(LDK(KP623489801), To, VFNMS(LDK(KP900968867), Th, VFNMS(LDK(KP222520933), Ta, T3)));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tp, Tt), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VADD(Tp, Tt), ovs, &(xo[WS(os, 1)]));
+		    TP = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
+		    TQ = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TB, Ty)));
+		    ST(&(xo[WS(os, 2)]), VADD(TP, TQ), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 12)]), VSUB(TQ, TP), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tv, Tu, TM, TI;
+		    Tv = VBYI(VFMA(LDK(KP781831482), Tq, VFMA(LDK(KP974927912), Ts, VMUL(LDK(KP433883739), Tr))));
+		    Tu = VFMA(LDK(KP623489801), Ta, VFNMS(LDK(KP900968867), To, VFNMS(LDK(KP222520933), Th, T3)));
+		    ST(&(xo[WS(os, 13)]), VSUB(Tu, Tv), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
+		    TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
+		    TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TH, Ty)));
+		    ST(&(xo[WS(os, 6)]), VSUB(TI, TM), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VADD(TM, TI), ovs, &(xo[0]));
+	       }
+	       {
+		    V TO, TN, Tx, Tw;
+		    TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
+		    TN = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP222520933), TE, VFNMS(LDK(KP900968867), TB, Ty)));
+		    ST(&(xo[WS(os, 4)]), VSUB(TN, TO), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VADD(TO, TN), ovs, &(xo[0]));
+		    Tx = VBYI(VFMA(LDK(KP433883739), Tq, VFNMS(LDK(KP781831482), Ts, VMUL(LDK(KP974927912), Tr))));
+		    Tw = VFMA(LDK(KP623489801), Th, VFNMS(LDK(KP222520933), To, VFNMS(LDK(KP900968867), Ta, T3)));
+		    ST(&(xo[WS(os, 11)]), VSUB(Tw, Tx), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VADD(Tw, Tx), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n1fv_14"), {50, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_14) (planner *p) {
+     X(kdft_register) (p, n1fv_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name n1fv_15 -include n1f.h */
+
+/*
+ * This function contains 78 FP additions, 49 FP multiplications,
+ * (or, 36 additions, 7 multiplications, 42 fused multiply/add),
+ * 78 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(30, is), MAKE_VOLATILE_STRIDE(30, os)) {
+	       V Tb, TX, TM, TQ, Th, TB, T5, Ti, Ta, TC, TN, Te, TG, Tq, Tj;
+	       V T1, T2, T3;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+	       {
+		    V T6, T7, T8, Tm, Tn, To;
+		    T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+		    Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    {
+			 V T4, Tc, T9, Td, Tp;
+			 Tb = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 TX = VSUB(T3, T2);
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 TM = VSUB(T8, T7);
+			 T9 = VADD(T7, T8);
+			 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tp = VADD(Tn, To);
+			 TQ = VSUB(To, Tn);
+			 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 TB = VFNMS(LDK(KP500000000), T4, T1);
+			 T5 = VADD(T1, T4);
+			 Ti = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VADD(T6, T9);
+			 TC = VFNMS(LDK(KP500000000), T9, T6);
+			 TN = VSUB(Td, Tc);
+			 Te = VADD(Tc, Td);
+			 TG = VFNMS(LDK(KP500000000), Tp, Tm);
+			 Tq = VADD(Tm, Tp);
+			 Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    }
+	       }
+	       {
+		    V TY, TO, Tf, TD, TP, Tk;
+		    TY = VADD(TM, TN);
+		    TO = VSUB(TM, TN);
+		    Tf = VADD(Tb, Te);
+		    TD = VFNMS(LDK(KP500000000), Te, Tb);
+		    TP = VSUB(Tj, Ti);
+		    Tk = VADD(Ti, Tj);
+		    {
+			 V Tx, Tg, TE, TU, TZ, TR, Tl, TF;
+			 Tx = VSUB(Ta, Tf);
+			 Tg = VADD(Ta, Tf);
+			 TE = VADD(TC, TD);
+			 TU = VSUB(TC, TD);
+			 TZ = VADD(TP, TQ);
+			 TR = VSUB(TP, TQ);
+			 Tl = VADD(Th, Tk);
+			 TF = VFNMS(LDK(KP500000000), Tk, Th);
+			 {
+			      V T12, T10, T18, TS, Tw, Tr, TH, TV, T11, T1g;
+			      T12 = VSUB(TY, TZ);
+			      T10 = VADD(TY, TZ);
+			      T18 = VFNMS(LDK(KP618033988), TO, TR);
+			      TS = VFMA(LDK(KP618033988), TR, TO);
+			      Tw = VSUB(Tl, Tq);
+			      Tr = VADD(Tl, Tq);
+			      TH = VADD(TF, TG);
+			      TV = VSUB(TF, TG);
+			      T11 = VFNMS(LDK(KP250000000), T10, TX);
+			      T1g = VMUL(LDK(KP866025403), VADD(TX, T10));
+			      {
+				   V TA, Ty, Tu, TK, TI, T1a, TW, T1b, T13, Tt, Ts, TJ, T1f;
+				   TA = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tx));
+				   Ty = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tx, Tw));
+				   Ts = VADD(Tg, Tr);
+				   Tu = VSUB(Tg, Tr);
+				   TK = VSUB(TE, TH);
+				   TI = VADD(TE, TH);
+				   T1a = VFNMS(LDK(KP618033988), TU, TV);
+				   TW = VFMA(LDK(KP618033988), TV, TU);
+				   T1b = VFNMS(LDK(KP559016994), T12, T11);
+				   T13 = VFMA(LDK(KP559016994), T12, T11);
+				   ST(&(xo[0]), VADD(T5, Ts), ovs, &(xo[0]));
+				   Tt = VFNMS(LDK(KP250000000), Ts, T5);
+				   TJ = VFNMS(LDK(KP250000000), TI, TB);
+				   T1f = VADD(TB, TI);
+				   {
+					V T1c, T1e, T16, T14, Tv, Tz, T17, TL;
+					T1c = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1b, T1a));
+					T1e = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1b, T1a));
+					T16 = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T13, TW));
+					T14 = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T13, TW));
+					Tv = VFNMS(LDK(KP559016994), Tu, Tt);
+					Tz = VFMA(LDK(KP559016994), Tu, Tt);
+					T17 = VFNMS(LDK(KP559016994), TK, TJ);
+					TL = VFMA(LDK(KP559016994), TK, TJ);
+					ST(&(xo[WS(os, 10)]), VFMAI(T1g, T1f), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 5)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+					{
+					     V T19, T1d, T15, TT;
+					     ST(&(xo[WS(os, 12)]), VFMAI(Ty, Tv), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 3)]), VFNMSI(Ty, Tv), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 9)]), VFMAI(TA, Tz), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 6)]), VFNMSI(TA, Tz), ovs, &(xo[0]));
+					     T19 = VFMA(LDK(KP823639103), T18, T17);
+					     T1d = VFNMS(LDK(KP823639103), T18, T17);
+					     T15 = VFNMS(LDK(KP823639103), TS, TL);
+					     TT = VFMA(LDK(KP823639103), TS, TL);
+					     ST(&(xo[WS(os, 2)]), VFMAI(T1c, T19), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 13)]), VFNMSI(T1c, T19), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 7)]), VFMAI(T1e, T1d), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 8)]), VFNMSI(T1e, T1d), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 4)]), VFMAI(T16, T15), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 11)]), VFNMSI(T16, T15), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 14)]), VFMAI(T14, TT), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 1)]), VFNMSI(T14, TT), ovs, &(xo[WS(os, 1)]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 15, XSIMD_STRING("n1fv_15"), {36, 7, 42, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_15) (planner *p) {
+     X(kdft_register) (p, n1fv_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name n1fv_15 -include n1f.h */
+
+/*
+ * This function contains 78 FP additions, 25 FP multiplications,
+ * (or, 64 additions, 11 multiplications, 14 fused multiply/add),
+ * 55 stack variables, 10 constants, and 30 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
+     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(30, is), MAKE_VOLATILE_STRIDE(30, os)) {
+	       V T5, T10, TB, TO, TU, TV, TR, Ta, Tf, Tg, Tl, Tq, Tr, TE, TH;
+	       V TI, TZ, T11, T1f, T1g;
+	       {
+		    V T1, T2, T3, T4;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T4 = VADD(T2, T3);
+		    T5 = VADD(T1, T4);
+		    T10 = VSUB(T3, T2);
+		    TB = VFNMS(LDK(KP500000000), T4, T1);
+	       }
+	       {
+		    V T6, T9, TC, TP, Tm, Tp, TG, TN, Tb, Te, TD, TQ, Th, Tk, TF;
+		    V TM, TX, TY;
+		    {
+			 V T7, T8, Tn, To;
+			 T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VADD(T7, T8);
+			 TC = VFNMS(LDK(KP500000000), T9, T6);
+			 TP = VSUB(T8, T7);
+			 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 To = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tp = VADD(Tn, To);
+			 TG = VFNMS(LDK(KP500000000), Tp, Tm);
+			 TN = VSUB(To, Tn);
+		    }
+		    {
+			 V Tc, Td, Ti, Tj;
+			 Tb = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Te = VADD(Tc, Td);
+			 TD = VFNMS(LDK(KP500000000), Te, Tb);
+			 TQ = VSUB(Td, Tc);
+			 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Ti = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VADD(Ti, Tj);
+			 TF = VFNMS(LDK(KP500000000), Tk, Th);
+			 TM = VSUB(Tj, Ti);
+		    }
+		    TO = VSUB(TM, TN);
+		    TU = VSUB(TF, TG);
+		    TV = VSUB(TC, TD);
+		    TR = VSUB(TP, TQ);
+		    Ta = VADD(T6, T9);
+		    Tf = VADD(Tb, Te);
+		    Tg = VADD(Ta, Tf);
+		    Tl = VADD(Th, Tk);
+		    Tq = VADD(Tm, Tp);
+		    Tr = VADD(Tl, Tq);
+		    TE = VADD(TC, TD);
+		    TH = VADD(TF, TG);
+		    TI = VADD(TE, TH);
+		    TX = VADD(TP, TQ);
+		    TY = VADD(TM, TN);
+		    TZ = VMUL(LDK(KP484122918), VSUB(TX, TY));
+		    T11 = VADD(TX, TY);
+	       }
+	       T1f = VADD(TB, TI);
+	       T1g = VBYI(VMUL(LDK(KP866025403), VADD(T10, T11)));
+	       ST(&(xo[WS(os, 5)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 10)]), VADD(T1f, T1g), ovs, &(xo[0]));
+	       {
+		    V Tu, Ts, Tt, Ty, TA, Tw, Tx, Tz, Tv;
+		    Tu = VMUL(LDK(KP559016994), VSUB(Tg, Tr));
+		    Ts = VADD(Tg, Tr);
+		    Tt = VFNMS(LDK(KP250000000), Ts, T5);
+		    Tw = VSUB(Tl, Tq);
+		    Tx = VSUB(Ta, Tf);
+		    Ty = VBYI(VFNMS(LDK(KP587785252), Tx, VMUL(LDK(KP951056516), Tw)));
+		    TA = VBYI(VFMA(LDK(KP951056516), Tx, VMUL(LDK(KP587785252), Tw)));
+		    ST(&(xo[0]), VADD(T5, Ts), ovs, &(xo[0]));
+		    Tz = VADD(Tu, Tt);
+		    ST(&(xo[WS(os, 6)]), VSUB(Tz, TA), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 9)]), VADD(TA, Tz), ovs, &(xo[WS(os, 1)]));
+		    Tv = VSUB(Tt, Tu);
+		    ST(&(xo[WS(os, 3)]), VSUB(Tv, Ty), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 12)]), VADD(Ty, Tv), ovs, &(xo[0]));
+	       }
+	       {
+		    V TS, TW, T1b, T18, T13, T1a, TL, T17, T12, TJ, TK;
+		    TS = VFNMS(LDK(KP509036960), TR, VMUL(LDK(KP823639103), TO));
+		    TW = VFNMS(LDK(KP587785252), TV, VMUL(LDK(KP951056516), TU));
+		    T1b = VFMA(LDK(KP951056516), TV, VMUL(LDK(KP587785252), TU));
+		    T18 = VFMA(LDK(KP823639103), TR, VMUL(LDK(KP509036960), TO));
+		    T12 = VFNMS(LDK(KP216506350), T11, VMUL(LDK(KP866025403), T10));
+		    T13 = VSUB(TZ, T12);
+		    T1a = VADD(TZ, T12);
+		    TJ = VFNMS(LDK(KP250000000), TI, TB);
+		    TK = VMUL(LDK(KP559016994), VSUB(TE, TH));
+		    TL = VSUB(TJ, TK);
+		    T17 = VADD(TK, TJ);
+		    {
+			 V TT, T14, T1d, T1e;
+			 TT = VSUB(TL, TS);
+			 T14 = VBYI(VSUB(TW, T13));
+			 ST(&(xo[WS(os, 8)]), VSUB(TT, T14), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 7)]), VADD(TT, T14), ovs, &(xo[WS(os, 1)]));
+			 T1d = VSUB(T17, T18);
+			 T1e = VBYI(VADD(T1b, T1a));
+			 ST(&(xo[WS(os, 11)]), VSUB(T1d, T1e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VADD(T1d, T1e), ovs, &(xo[0]));
+		    }
+		    {
+			 V T15, T16, T19, T1c;
+			 T15 = VADD(TL, TS);
+			 T16 = VBYI(VADD(TW, T13));
+			 ST(&(xo[WS(os, 13)]), VSUB(T15, T16), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 2)]), VADD(T15, T16), ovs, &(xo[0]));
+			 T19 = VADD(T17, T18);
+			 T1c = VBYI(VSUB(T1a, T1b));
+			 ST(&(xo[WS(os, 14)]), VSUB(T19, T1c), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VADD(T19, T1c), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 15, XSIMD_STRING("n1fv_15"), {64, 11, 14, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_15) (planner *p) {
+     X(kdft_register) (p, n1fv_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:53 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include n1f.h */
+
+/*
+ * This function contains 72 FP additions, 34 FP multiplications,
+ * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
+ * 54 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
+	       V T16;
+	       {
+		    V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
+		    {
+			 V T1, T2, T4, T5, To, Tp, Tr, Ts;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 To = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tp = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tr = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 {
+			      V T8, TJ, Tq, TI, Tt, T9, Tb, Tc, T3, T6;
+			      T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			      TH = VSUB(T1, T2);
+			      T3 = VADD(T1, T2);
+			      TU = VSUB(T4, T5);
+			      T6 = VADD(T4, T5);
+			      TJ = VSUB(To, Tp);
+			      Tq = VADD(To, Tp);
+			      TI = VSUB(Tr, Ts);
+			      Tt = VADD(Tr, Ts);
+			      T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T7 = VSUB(T3, T6);
+			      Tz = VADD(T3, T6);
+			      Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			      TK = VADD(TI, TJ);
+			      TV = VSUB(TJ, TI);
+			      TA = VADD(Tt, Tq);
+			      Tu = VSUB(Tq, Tt);
+			      TM = VSUB(T8, T9);
+			      Ta = VADD(T8, T9);
+			      TN = VSUB(Tb, Tc);
+			      Td = VADD(Tb, Tc);
+			      Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 }
+		    }
+		    TF = VSUB(Tz, TA);
+		    TB = VADD(Tz, TA);
+		    T13 = VFNMS(LDK(KP707106781), TK, TH);
+		    TL = VFMA(LDK(KP707106781), TK, TH);
+		    TO = VFNMS(LDK(KP414213562), TN, TM);
+		    TX = VFMA(LDK(KP414213562), TM, TN);
+		    TC = VADD(Ta, Td);
+		    Te = VSUB(Ta, Td);
+		    TP = VSUB(Tf, Tg);
+		    Th = VADD(Tf, Tg);
+		    TQ = VSUB(Tj, Ti);
+		    Tk = VADD(Ti, Tj);
+		    TW = VFNMS(LDK(KP707106781), TV, TU);
+		    T16 = VFMA(LDK(KP707106781), TV, TU);
+	       }
+	       {
+		    V TY, TR, Tl, TD;
+		    TY = VFMA(LDK(KP414213562), TP, TQ);
+		    TR = VFNMS(LDK(KP414213562), TQ, TP);
+		    Tl = VSUB(Th, Tk);
+		    TD = VADD(Th, Tk);
+		    {
+			 V TS, T17, TZ, T14;
+			 TS = VADD(TO, TR);
+			 T17 = VSUB(TR, TO);
+			 TZ = VSUB(TX, TY);
+			 T14 = VADD(TX, TY);
+			 {
+			      V TE, TG, Tm, Tv;
+			      TE = VADD(TC, TD);
+			      TG = VSUB(TD, TC);
+			      Tm = VADD(Te, Tl);
+			      Tv = VSUB(Tl, Te);
+			      {
+				   V T18, T1a, TT, T11;
+				   T18 = VFNMS(LDK(KP923879532), T17, T16);
+				   T1a = VFMA(LDK(KP923879532), T17, T16);
+				   TT = VFNMS(LDK(KP923879532), TS, TL);
+				   T11 = VFMA(LDK(KP923879532), TS, TL);
+				   {
+					V T15, T19, T10, T12;
+					T15 = VFNMS(LDK(KP923879532), T14, T13);
+					T19 = VFMA(LDK(KP923879532), T14, T13);
+					T10 = VFNMS(LDK(KP923879532), TZ, TW);
+					T12 = VFMA(LDK(KP923879532), TZ, TW);
+					ST(&(xo[WS(os, 4)]), VFMAI(TG, TF), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 12)]), VFNMSI(TG, TF), ovs, &(xo[0]));
+					ST(&(xo[0]), VADD(TB, TE), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 8)]), VSUB(TB, TE), ovs, &(xo[0]));
+					{
+					     V Tw, Ty, Tn, Tx;
+					     Tw = VFNMS(LDK(KP707106781), Tv, Tu);
+					     Ty = VFMA(LDK(KP707106781), Tv, Tu);
+					     Tn = VFNMS(LDK(KP707106781), Tm, T7);
+					     Tx = VFMA(LDK(KP707106781), Tm, T7);
+					     ST(&(xo[WS(os, 3)]), VFMAI(T1a, T19), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 13)]), VFNMSI(T1a, T19), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 11)]), VFMAI(T18, T15), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 5)]), VFNMSI(T18, T15), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 1)]), VFNMSI(T12, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 15)]), VFMAI(T12, T11), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 7)]), VFMAI(T10, TT), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 9)]), VFNMSI(T10, TT), ovs, &(xo[WS(os, 1)]));
+					     ST(&(xo[WS(os, 14)]), VFNMSI(Ty, Tx), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 2)]), VFMAI(Ty, Tx), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 10)]), VFMAI(Tw, Tn), ovs, &(xo[0]));
+					     ST(&(xo[WS(os, 6)]), VFNMSI(Tw, Tn), ovs, &(xo[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {38, 0, 34, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_16) (planner *p) {
+     X(kdft_register) (p, n1fv_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include n1f.h */
+
+/*
+ * This function contains 72 FP additions, 12 FP multiplications,
+ * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
+ * 30 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx;
+	       V TQ;
+	       {
+		    V Tn, To, TM, Ts, Tt, TL;
+		    Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    TM = VADD(Tn, To);
+		    Ts = LD(&(xi[0]), ivs, &(xi[0]));
+		    Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    TL = VADD(Ts, Tt);
+		    Tp = VSUB(Tn, To);
+		    T13 = VADD(TL, TM);
+		    Tu = VSUB(Ts, Tt);
+		    TN = VSUB(TL, TM);
+	       }
+	       {
+		    V Ti, TW, Tl, TX;
+		    {
+			 V Tg, Th, Tj, Tk;
+			 Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Ti = VSUB(Tg, Th);
+			 TW = VADD(Tg, Th);
+			 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 TX = VADD(Tj, Tk);
+		    }
+		    Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
+		    T14 = VADD(TX, TW);
+		    Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
+		    TY = VSUB(TW, TX);
+	       }
+	       {
+		    V T3, TR, T6, TS;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 TR = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TS = VADD(T4, T5);
+		    }
+		    T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
+		    T17 = VADD(TR, TS);
+		    Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
+		    TT = VSUB(TR, TS);
+	       }
+	       {
+		    V Ta, TO, Td, TP;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TO = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TP = VADD(Tb, Tc);
+		    }
+		    Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
+		    T16 = VADD(TO, TP);
+		    Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
+		    TQ = VSUB(TO, TP);
+	       }
+	       {
+		    V T15, T18, T19, T1a;
+		    T15 = VADD(T13, T14);
+		    T18 = VADD(T16, T17);
+		    ST(&(xo[WS(os, 8)]), VSUB(T15, T18), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T15, T18), ovs, &(xo[0]));
+		    T19 = VSUB(T13, T14);
+		    T1a = VBYI(VSUB(T17, T16));
+		    ST(&(xo[WS(os, 12)]), VSUB(T19, T1a), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(T19, T1a), ovs, &(xo[0]));
+	       }
+	       {
+		    V TV, T11, T10, T12, TU, TZ;
+		    TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
+		    TV = VADD(TN, TU);
+		    T11 = VSUB(TN, TU);
+		    TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
+		    T10 = VBYI(VADD(TY, TZ));
+		    T12 = VBYI(VSUB(TZ, TY));
+		    ST(&(xo[WS(os, 14)]), VSUB(TV, T10), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VADD(T11, T12), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(TV, T10), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 10)]), VSUB(T11, T12), ovs, &(xo[0]));
+	       }
+	       {
+		    V Tr, TB, TA, TC;
+		    {
+			 V Tf, Tq, Tw, Tz;
+			 Tf = VSUB(T7, Te);
+			 Tq = VSUB(Tm, Tp);
+			 Tr = VBYI(VSUB(Tf, Tq));
+			 TB = VBYI(VADD(Tq, Tf));
+			 Tw = VADD(Tu, Tv);
+			 Tz = VADD(Tx, Ty);
+			 TA = VSUB(Tw, Tz);
+			 TC = VADD(Tw, Tz);
+		    }
+		    ST(&(xo[WS(os, 7)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 15)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
+	       }
+	       {
+		    V TF, TJ, TI, TK;
+		    {
+			 V TD, TE, TG, TH;
+			 TD = VSUB(Tu, Tv);
+			 TE = VADD(Te, T7);
+			 TF = VADD(TD, TE);
+			 TJ = VSUB(TD, TE);
+			 TG = VADD(Tp, Tm);
+			 TH = VSUB(Ty, Tx);
+			 TI = VBYI(VADD(TG, TH));
+			 TK = VBYI(VSUB(TH, TG));
+		    }
+		    ST(&(xo[WS(os, 13)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {68, 8, 4, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_16) (planner *p) {
+     X(kdft_register) (p, n1fv_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name n1fv_2 -include n1f.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       ST(&(xo[0]), VADD(T1, T2), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VSUB(T1, T2), ovs, &(xo[WS(os, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n1fv_2"), {2, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_2) (planner *p) {
+     X(kdft_register) (p, n1fv_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name n1fv_2 -include n1f.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       ST(&(xo[WS(os, 1)]), VSUB(T1, T2), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(T1, T2), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n1fv_2"), {2, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_2) (planner *p) {
+     X(kdft_register) (p, n1fv_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */
+
+/*
+ * This function contains 104 FP additions, 50 FP multiplications,
+ * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
+ * 71 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V TU, TI, TP, TX, TM, TW, TT, TF;
+	       {
+		    V T3, Tm, T1r, T13, Ta, TN, TH, TA, TG, Tt, Th, TO, T1u, T1C, T1n;
+		    V T1a, T1m, T1h, T1x, T1D, TE, Ti;
+		    {
+			 V T1, T2, Tk, Tl;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 {
+			      V T14, T6, T1c, Tw, Tn, T1f, Tz, T17, T9, To, Tq, T1b, Td, Tr, Te;
+			      V Tf, T15, Tp;
+			      {
+				   V Tx, Ty, T7, T8, Tb, Tc;
+				   {
+					V T4, T5, Tu, Tv, T11, T12;
+					T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					T3 = VSUB(T1, T2);
+					T11 = VADD(T1, T2);
+					Tm = VSUB(Tk, Tl);
+					T12 = VADD(Tk, Tl);
+					T14 = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1c = VADD(Tu, Tv);
+					Tw = VSUB(Tu, Tv);
+					Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+					T1r = VADD(T11, T12);
+					T13 = VSUB(T11, T12);
+				   }
+				   Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T1f = VADD(Tx, Ty);
+				   Tz = VSUB(Tx, Ty);
+				   T17 = VADD(T7, T8);
+				   T9 = VSUB(T7, T8);
+				   To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+				   Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   T1b = VADD(Tb, Tc);
+				   Td = VSUB(Tb, Tc);
+				   Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      }
+			      Ta = VADD(T6, T9);
+			      TN = VSUB(T6, T9);
+			      T15 = VADD(Tn, To);
+			      Tp = VSUB(Tn, To);
+			      TH = VSUB(Tz, Tw);
+			      TA = VADD(Tw, Tz);
+			      {
+				   V T1d, T1v, T18, Ts, T1e, Tg, T16, T1s;
+				   T1d = VSUB(T1b, T1c);
+				   T1v = VADD(T1b, T1c);
+				   T18 = VADD(Tq, Tr);
+				   Ts = VSUB(Tq, Tr);
+				   T1e = VADD(Te, Tf);
+				   Tg = VSUB(Te, Tf);
+				   T16 = VSUB(T14, T15);
+				   T1s = VADD(T14, T15);
+				   {
+					V T1t, T19, T1w, T1g;
+					T1t = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					TG = VSUB(Ts, Tp);
+					Tt = VADD(Tp, Ts);
+					T1w = VADD(T1e, T1f);
+					T1g = VSUB(T1e, T1f);
+					Th = VADD(Td, Tg);
+					TO = VSUB(Td, Tg);
+					T1u = VADD(T1s, T1t);
+					T1C = VSUB(T1s, T1t);
+					T1n = VSUB(T16, T19);
+					T1a = VADD(T16, T19);
+					T1m = VSUB(T1d, T1g);
+					T1h = VADD(T1d, T1g);
+					T1x = VADD(T1v, T1w);
+					T1D = VSUB(T1v, T1w);
+				   }
+			      }
+			 }
+		    }
+		    TE = VSUB(Ta, Th);
+		    Ti = VADD(Ta, Th);
+		    {
+			 V TL, T1k, T1A, Tj, TD, T1E, T1G, TK, TC, T1j, T1z, T1i, T1y, TB;
+			 TL = VSUB(TA, Tt);
+			 TB = VADD(Tt, TA);
+			 T1i = VADD(T1a, T1h);
+			 T1k = VSUB(T1a, T1h);
+			 T1y = VADD(T1u, T1x);
+			 T1A = VSUB(T1u, T1x);
+			 Tj = VADD(T3, Ti);
+			 TD = VFNMS(LDK(KP250000000), Ti, T3);
+			 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
+			 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
+			 TK = VFNMS(LDK(KP250000000), TB, Tm);
+			 TC = VADD(Tm, TB);
+			 T1j = VFNMS(LDK(KP250000000), T1i, T13);
+			 ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
+			 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
+			 ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
+			 {
+			      V T1p, T1l, T1o, T1q, T1F, T1B;
+			      TU = VFNMS(LDK(KP618033988), TG, TH);
+			      TI = VFMA(LDK(KP618033988), TH, TG);
+			      TP = VFMA(LDK(KP618033988), TO, TN);
+			      TX = VFNMS(LDK(KP618033988), TN, TO);
+			      ST(&(xo[WS(os, 15)]), VFMAI(TC, Tj), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 5)]), VFNMSI(TC, Tj), ovs, &(xo[WS(os, 1)]));
+			      T1p = VFMA(LDK(KP559016994), T1k, T1j);
+			      T1l = VFNMS(LDK(KP559016994), T1k, T1j);
+			      T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
+			      T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
+			      T1F = VFNMS(LDK(KP559016994), T1A, T1z);
+			      T1B = VFMA(LDK(KP559016994), T1A, T1z);
+			      ST(&(xo[WS(os, 14)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 6)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 18)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 2)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 16)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 12)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 8)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
+			      TM = VFNMS(LDK(KP559016994), TL, TK);
+			      TW = VFMA(LDK(KP559016994), TL, TK);
+			      TT = VFNMS(LDK(KP559016994), TE, TD);
+			      TF = VFMA(LDK(KP559016994), TE, TD);
+			 }
+		    }
+	       }
+	       {
+		    V T10, TY, TQ, TS, TJ, TR, TZ, TV;
+		    T10 = VFMA(LDK(KP951056516), TX, TW);
+		    TY = VFNMS(LDK(KP951056516), TX, TW);
+		    TQ = VFMA(LDK(KP951056516), TP, TM);
+		    TS = VFNMS(LDK(KP951056516), TP, TM);
+		    TJ = VFMA(LDK(KP951056516), TI, TF);
+		    TR = VFNMS(LDK(KP951056516), TI, TF);
+		    TZ = VFMA(LDK(KP951056516), TU, TT);
+		    TV = VFNMS(LDK(KP951056516), TU, TT);
+		    ST(&(xo[WS(os, 11)]), VFMAI(TS, TR), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VFNMSI(TS, TR), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 19)]), VFMAI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VFNMSI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFMAI(TY, TV), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 17)]), VFNMSI(TY, TV), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_20) (planner *p) {
+     X(kdft_register) (p, n1fv_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */
+
+/*
+ * This function contains 104 FP additions, 24 FP multiplications,
+ * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
+ * 53 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
+	       V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL, Tj, TC;
+	       {
+		    V T1, T2, T1g, Tk, Tl, T1h;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T1g = VADD(T1, T2);
+		    Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+		    T1h = VADD(Tk, Tl);
+		    T3 = VSUB(T1, T2);
+		    T1B = VADD(T1g, T1h);
+		    Tm = VSUB(Tk, Tl);
+		    T1i = VSUB(T1g, T1h);
+	       }
+	       {
+		    V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
+		    V T14;
+		    {
+			 V T4, T5, Tu, Tv;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T18 = VADD(T4, T5);
+			 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = VSUB(Tu, Tv);
+			 T12 = VADD(Tu, Tv);
+		    }
+		    {
+			 V Tx, Ty, T7, T8;
+			 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tz = VSUB(Tx, Ty);
+			 T15 = VADD(Tx, Ty);
+			 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T1b = VADD(T7, T8);
+		    }
+		    {
+			 V Tb, Tc, Tn, To;
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Td = VSUB(Tb, Tc);
+			 T11 = VADD(Tb, Tc);
+			 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 Tp = VSUB(Tn, To);
+			 T19 = VADD(Tn, To);
+		    }
+		    {
+			 V Tq, Tr, Te, Tf;
+			 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Ts = VSUB(Tq, Tr);
+			 T1c = VADD(Tq, Tr);
+			 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tg = VSUB(Te, Tf);
+			 T14 = VADD(Te, Tf);
+		    }
+		    TG = VSUB(Ts, Tp);
+		    TN = VSUB(T6, T9);
+		    TO = VSUB(Td, Tg);
+		    TH = VSUB(Tz, Tw);
+		    T13 = VSUB(T11, T12);
+		    T16 = VSUB(T14, T15);
+		    T1k = VADD(T13, T16);
+		    T1u = VADD(T11, T12);
+		    T1v = VADD(T14, T15);
+		    T1z = VADD(T1u, T1v);
+		    T1r = VADD(T18, T19);
+		    T1s = VADD(T1b, T1c);
+		    T1y = VADD(T1r, T1s);
+		    T1a = VSUB(T18, T19);
+		    T1d = VSUB(T1b, T1c);
+		    T1j = VADD(T1a, T1d);
+		    {
+			 V Ta, Th, Tt, TA;
+			 Ta = VADD(T6, T9);
+			 Th = VADD(Td, Tg);
+			 Ti = VADD(Ta, Th);
+			 TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
+			 Tt = VADD(Tp, Ts);
+			 TA = VADD(Tw, Tz);
+			 TB = VADD(Tt, TA);
+			 TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
+		    }
+	       }
+	       Tj = VADD(T3, Ti);
+	       TC = VBYI(VADD(Tm, TB));
+	       ST(&(xo[WS(os, 5)]), VSUB(Tj, TC), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 15)]), VADD(Tj, TC), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E;
+		    T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
+		    T1C = VADD(T1y, T1z);
+		    T1D = VFNMS(LDK(KP250000000), T1C, T1B);
+		    T1t = VSUB(T1r, T1s);
+		    T1w = VSUB(T1u, T1v);
+		    T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
+		    T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
+		    ST(&(xo[0]), VADD(T1B, T1C), ovs, &(xo[0]));
+		    T1F = VSUB(T1D, T1A);
+		    ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
+		    T1E = VADD(T1A, T1D);
+		    ST(&(xo[WS(os, 4)]), VADD(T1x, T1E), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 16)]), VSUB(T1E, T1x), ovs, &(xo[0]));
+	       }
+	       {
+		    V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1o;
+		    T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
+		    T1l = VADD(T1j, T1k);
+		    T1m = VFNMS(LDK(KP250000000), T1l, T1i);
+		    T17 = VSUB(T13, T16);
+		    T1e = VSUB(T1a, T1d);
+		    T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
+		    T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
+		    ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
+		    T1p = VADD(T1n, T1m);
+		    ST(&(xo[WS(os, 6)]), VSUB(T1p, T1q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VADD(T1q, T1p), ovs, &(xo[0]));
+		    T1o = VSUB(T1m, T1n);
+		    ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
+	       }
+	       {
+		    V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
+		    TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
+		    TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
+		    TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
+		    TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
+		    TK = VFMS(LDK(KP250000000), TB, Tm);
+		    TM = VADD(TK, TL);
+		    TW = VSUB(TL, TK);
+		    TE = VFNMS(LDK(KP250000000), Ti, T3);
+		    TF = VADD(TD, TE);
+		    TT = VSUB(TE, TD);
+		    {
+			 V TJ, TQ, TZ, T10;
+			 TJ = VADD(TF, TI);
+			 TQ = VBYI(VSUB(TM, TP));
+			 ST(&(xo[WS(os, 19)]), VSUB(TJ, TQ), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(TJ, TQ), ovs, &(xo[WS(os, 1)]));
+			 TZ = VADD(TT, TU);
+			 T10 = VBYI(VADD(TX, TW));
+			 ST(&(xo[WS(os, 13)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V TR, TS, TV, TY;
+			 TR = VSUB(TF, TI);
+			 TS = VBYI(VADD(TP, TM));
+			 ST(&(xo[WS(os, 11)]), VSUB(TR, TS), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VADD(TR, TS), ovs, &(xo[WS(os, 1)]));
+			 TV = VSUB(TT, TU);
+			 TY = VBYI(VSUB(TW, TX));
+			 ST(&(xo[WS(os, 17)]), VSUB(TV, TY), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 3)]), VADD(TV, TY), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_20) (planner *p) {
+     X(kdft_register) (p, n1fv_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:55 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name n1fv_25 -include n1f.h */
+
+/*
+ * This function contains 224 FP additions, 193 FP multiplications,
+ * (or, 43 additions, 12 multiplications, 181 fused multiply/add),
+ * 215 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(50, is), MAKE_VOLATILE_STRIDE(50, os)) {
+	       V T1g, T1k, T1I, T24, T2a, T1G, T1A, T1l, T1B, T1H, T1d;
+	       {
+		    V T2z, T1q, Ta, T9, T3n, Ty, Tl, T2O, T2W, T2l, T2s, TV, T1i, T1K, T1S;
+		    V T3z, T3t, Tk, T3o, Tp, T2g, T2N, T2V, T2o, T2t, T1a, T1j, T1J, T1R, Tz;
+		    V Tt, TA, Tw;
+		    {
+			 V T1, T5, T6, T2, T3;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T6 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 {
+			      V TH, TW, TK, TS, T10, T8, TN, TT, T17, TZ, T11;
+			      TH = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      TW = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      {
+				   V TI, TJ, TL, T7, T1p, T4, T1o, TM, TX, TY;
+				   TI = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+				   TJ = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   TL = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   T7 = VADD(T5, T6);
+				   T1p = VSUB(T5, T6);
+				   T4 = VADD(T2, T3);
+				   T1o = VSUB(T2, T3);
+				   TM = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TX = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   TK = VADD(TI, TJ);
+				   TS = VSUB(TI, TJ);
+				   TY = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+				   T10 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   T2z = VFNMS(LDK(KP618033988), T1o, T1p);
+				   T1q = VFMA(LDK(KP618033988), T1p, T1o);
+				   Ta = VSUB(T4, T7);
+				   T8 = VADD(T4, T7);
+				   TN = VADD(TL, TM);
+				   TT = VSUB(TM, TL);
+				   T17 = VSUB(TX, TY);
+				   TZ = VADD(TX, TY);
+				   T11 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			      }
+			      {
+				   V Tc, T2m, T19, Tn, To, Tr, Tj, T16, T2n, Ts, Tu, Tv;
+				   {
+					V TU, T2j, TO, TQ, T12, T18;
+					Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					T9 = VFNMS(LDK(KP250000000), T8, T1);
+					T3n = VADD(T1, T8);
+					TU = VFNMS(LDK(KP618033988), TT, TS);
+					T2j = VFMA(LDK(KP618033988), TS, TT);
+					TO = VADD(TK, TN);
+					TQ = VSUB(TN, TK);
+					T12 = VADD(T10, T11);
+					T18 = VSUB(T10, T11);
+					Ty = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					{
+					     V T3r, T15, T13, Tf, Ti, T2k, TR, TP, T3s, T14;
+					     {
+						  V Td, Te, Tg, Th;
+						  Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+						  Te = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						  Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						  Th = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+						  TP = VFNMS(LDK(KP250000000), TO, TH);
+						  T3r = VADD(TH, TO);
+						  T2m = VFNMS(LDK(KP618033988), T17, T18);
+						  T19 = VFMA(LDK(KP618033988), T18, T17);
+						  T15 = VSUB(T12, TZ);
+						  T13 = VADD(TZ, T12);
+						  Tf = VADD(Td, Te);
+						  Tn = VSUB(Td, Te);
+						  To = VSUB(Th, Tg);
+						  Ti = VADD(Tg, Th);
+					     }
+					     T2k = VFMA(LDK(KP559016994), TQ, TP);
+					     TR = VFNMS(LDK(KP559016994), TQ, TP);
+					     Tr = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+					     T3s = VADD(TW, T13);
+					     T14 = VFNMS(LDK(KP250000000), T13, TW);
+					     Tj = VADD(Tf, Ti);
+					     Tl = VSUB(Tf, Ti);
+					     T2O = VFNMS(LDK(KP667278218), T2k, T2j);
+					     T2W = VFMA(LDK(KP603558818), T2j, T2k);
+					     T2l = VFMA(LDK(KP066152395), T2k, T2j);
+					     T2s = VFNMS(LDK(KP059835404), T2j, T2k);
+					     TV = VFNMS(LDK(KP522847744), TU, TR);
+					     T1i = VFMA(LDK(KP578046249), TR, TU);
+					     T1K = VFNMS(LDK(KP494780565), TR, TU);
+					     T1S = VFMA(LDK(KP447533225), TU, TR);
+					     T16 = VFNMS(LDK(KP559016994), T15, T14);
+					     T2n = VFMA(LDK(KP559016994), T15, T14);
+					     T3z = VSUB(T3r, T3s);
+					     T3t = VADD(T3r, T3s);
+					     Ts = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+					     Tu = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					     Tv = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					}
+				   }
+				   Tk = VFNMS(LDK(KP250000000), Tj, Tc);
+				   T3o = VADD(Tc, Tj);
+				   Tp = VFNMS(LDK(KP618033988), To, Tn);
+				   T2g = VFMA(LDK(KP618033988), Tn, To);
+				   T2N = VFMA(LDK(KP066152395), T2n, T2m);
+				   T2V = VFNMS(LDK(KP059835404), T2m, T2n);
+				   T2o = VFMA(LDK(KP869845200), T2n, T2m);
+				   T2t = VFNMS(LDK(KP786782374), T2m, T2n);
+				   T1a = VFNMS(LDK(KP893101515), T19, T16);
+				   T1j = VFMA(LDK(KP987388751), T16, T19);
+				   T1J = VFNMS(LDK(KP120146378), T19, T16);
+				   T1R = VFMA(LDK(KP132830569), T16, T19);
+				   Tz = VADD(Ts, Tr);
+				   Tt = VSUB(Tr, Ts);
+				   TA = VADD(Tv, Tu);
+				   Tw = VSUB(Tu, Tv);
+			      }
+			 }
+		    }
+		    {
+			 V T2p, T2I, T2u, T2C, Tx, T2d, T2X, T34, T2P, T3b, T2b, Tb, T2Q, T2Z, T2h;
+			 V T2w, Tq, T1e, T1M, T1U, TE, T2c, T3q, T3y;
+			 T2p = VFNMS(LDK(KP772036680), T2o, T2l);
+			 T2I = VFMA(LDK(KP772036680), T2o, T2l);
+			 T2u = VFMA(LDK(KP772036680), T2t, T2s);
+			 T2C = VFNMS(LDK(KP772036680), T2t, T2s);
+			 {
+			      V TD, TB, Tm, T2f, T3p, TC;
+			      Tx = VFMA(LDK(KP618033988), Tw, Tt);
+			      T2d = VFNMS(LDK(KP618033988), Tt, Tw);
+			      TD = VSUB(Tz, TA);
+			      TB = VADD(Tz, TA);
+			      Tm = VFMA(LDK(KP559016994), Tl, Tk);
+			      T2f = VFNMS(LDK(KP559016994), Tl, Tk);
+			      T2X = VFMA(LDK(KP845997307), T2W, T2V);
+			      T34 = VFNMS(LDK(KP845997307), T2W, T2V);
+			      T2P = VFNMS(LDK(KP845997307), T2O, T2N);
+			      T3b = VFMA(LDK(KP845997307), T2O, T2N);
+			      T2b = VFNMS(LDK(KP559016994), Ta, T9);
+			      Tb = VFMA(LDK(KP559016994), Ta, T9);
+			      T3p = VADD(Ty, TB);
+			      TC = VFMS(LDK(KP250000000), TB, Ty);
+			      T2Q = VFNMS(LDK(KP522847744), T2g, T2f);
+			      T2Z = VFMA(LDK(KP578046249), T2f, T2g);
+			      T2h = VFMA(LDK(KP893101515), T2g, T2f);
+			      T2w = VFNMS(LDK(KP987388751), T2f, T2g);
+			      Tq = VFNMS(LDK(KP244189809), Tp, Tm);
+			      T1e = VFMA(LDK(KP269969613), Tm, Tp);
+			      T1M = VFMA(LDK(KP667278218), Tm, Tp);
+			      T1U = VFNMS(LDK(KP603558818), Tp, Tm);
+			      TE = VFNMS(LDK(KP559016994), TD, TC);
+			      T2c = VFMA(LDK(KP559016994), TD, TC);
+			      T3q = VADD(T3o, T3p);
+			      T3y = VSUB(T3o, T3p);
+			 }
+			 {
+			      V T1Z, T25, T1P, T22, T1X, TG, T1b, T28, T1t, T1y, T1x, T1E, T1Q, T1Y;
+			      {
+				   V T26, T1L, T1T, TF, T1f, T1W, T3m, T3g, T2M, T2G, T39, T3j, T21, T1O, T20;
+				   V T27;
+				   T26 = VFMA(LDK(KP867381224), T1K, T1J);
+				   T1L = VFNMS(LDK(KP867381224), T1K, T1J);
+				   T20 = VFNMS(LDK(KP958953096), T1S, T1R);
+				   T1T = VFMA(LDK(KP958953096), T1S, T1R);
+				   {
+					V T2R, T2Y, T2e, T2v, T1N, T1V;
+					T2R = VFNMS(LDK(KP494780565), T2c, T2d);
+					T2Y = VFMA(LDK(KP447533225), T2d, T2c);
+					T2e = VFMA(LDK(KP120146378), T2d, T2c);
+					T2v = VFNMS(LDK(KP132830569), T2c, T2d);
+					TF = VFNMS(LDK(KP667278218), TE, Tx);
+					T1f = VFMA(LDK(KP603558818), Tx, TE);
+					T1N = VFMA(LDK(KP869845200), TE, Tx);
+					T1V = VFNMS(LDK(KP786782374), Tx, TE);
+					{
+					     V T3A, T3C, T3w, T3u;
+					     T3A = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3z, T3y));
+					     T3C = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3y, T3z));
+					     T3w = VSUB(T3q, T3t);
+					     T3u = VADD(T3q, T3t);
+					     {
+						  V T2B, T2x, T2H, T2i;
+						  T2B = VFMA(LDK(KP734762448), T2w, T2v);
+						  T2x = VFNMS(LDK(KP734762448), T2w, T2v);
+						  T2H = VFNMS(LDK(KP734762448), T2h, T2e);
+						  T2i = VFMA(LDK(KP734762448), T2h, T2e);
+						  {
+						       V T30, T35, T3c, T2S, T3v;
+						       T30 = VFNMS(LDK(KP921078979), T2Z, T2Y);
+						       T35 = VFMA(LDK(KP921078979), T2Z, T2Y);
+						       T3c = VFMA(LDK(KP982009705), T2R, T2Q);
+						       T2S = VFNMS(LDK(KP982009705), T2R, T2Q);
+						       T1W = VFMA(LDK(KP912575812), T1V, T1U);
+						       T1Z = VFNMS(LDK(KP912575812), T1V, T1U);
+						       T1O = VFMA(LDK(KP912575812), T1N, T1M);
+						       T25 = VFNMS(LDK(KP912575812), T1N, T1M);
+						       ST(&(xo[0]), VADD(T3u, T3n), ovs, &(xo[0]));
+						       T3v = VFNMS(LDK(KP250000000), T3u, T3n);
+						       {
+							    V T2y, T2J, T2q, T2D;
+							    T2y = VFMA(LDK(KP945422727), T2x, T2u);
+							    T2J = VFMA(LDK(KP522616830), T2x, T2I);
+							    T2q = VFMA(LDK(KP956723877), T2p, T2i);
+							    T2D = VFNMS(LDK(KP522616830), T2i, T2C);
+							    {
+								 V T3e, T31, T36, T2T;
+								 T3e = VFMA(LDK(KP906616052), T30, T2X);
+								 T31 = VFNMS(LDK(KP906616052), T30, T2X);
+								 T36 = VFNMS(LDK(KP923225144), T2S, T2P);
+								 T2T = VFMA(LDK(KP923225144), T2S, T2P);
+								 {
+								      V T3k, T3d, T3x, T3B;
+								      T3k = VFNMS(LDK(KP669429328), T3b, T3c);
+								      T3d = VFMA(LDK(KP570584518), T3c, T3b);
+								      T3x = VFMA(LDK(KP559016994), T3w, T3v);
+								      T3B = VFNMS(LDK(KP559016994), T3w, T3v);
+								      {
+									   V T2A, T2K, T2r, T2E;
+									   T2A = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2z, T2y));
+									   T2K = VFNMS(LDK(KP690983005), T2J, T2u);
+									   T2r = VFMA(LDK(KP992114701), T2q, T2b);
+									   T2E = VFMA(LDK(KP763932022), T2D, T2p);
+									   {
+										V T32, T3a, T37, T3h;
+										T32 = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2z, T31));
+										T3a = VFMA(LDK(KP262346850), T31, T2z);
+										T37 = VFNMS(LDK(KP997675361), T36, T35);
+										T3h = VFNMS(LDK(KP904508497), T36, T34);
+										{
+										     V T2U, T33, T3l, T3f;
+										     T2U = VFMA(LDK(KP949179823), T2T, T2b);
+										     T33 = VFNMS(LDK(KP237294955), T2T, T2b);
+										     T3l = VFNMS(LDK(KP669429328), T3e, T3k);
+										     T3f = VFMA(LDK(KP618033988), T3e, T3d);
+										     ST(&(xo[WS(os, 20)]), VFMAI(T3A, T3x), ovs, &(xo[0]));
+										     ST(&(xo[WS(os, 5)]), VFNMSI(T3A, T3x), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 15)]), VFNMSI(T3C, T3B), ovs, &(xo[WS(os, 1)]));
+										     ST(&(xo[WS(os, 10)]), VFMAI(T3C, T3B), ovs, &(xo[0]));
+										     {
+											  V T2L, T2F, T38, T3i;
+											  T2L = VFMA(LDK(KP855719849), T2K, T2H);
+											  ST(&(xo[WS(os, 22)]), VFMAI(T2A, T2r), ovs, &(xo[0]));
+											  ST(&(xo[WS(os, 3)]), VFNMSI(T2A, T2r), ovs, &(xo[WS(os, 1)]));
+											  T2F = VFNMS(LDK(KP855719849), T2E, T2B);
+											  T38 = VFMA(LDK(KP560319534), T37, T34);
+											  T3i = VFNMS(LDK(KP681693190), T3h, T35);
+											  ST(&(xo[WS(os, 23)]), VFMAI(T32, T2U), ovs, &(xo[WS(os, 1)]));
+											  ST(&(xo[WS(os, 2)]), VFNMSI(T32, T2U), ovs, &(xo[0]));
+											  T3m = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3l, T3a));
+											  T3g = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3f, T3a));
+											  T2M = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2L, T2z));
+											  T2G = VFMA(LDK(KP897376177), T2F, T2b);
+											  T39 = VFNMS(LDK(KP949179823), T38, T33);
+											  T3j = VFNMS(LDK(KP860541664), T3i, T33);
+											  T21 = VFMA(LDK(KP447417479), T1O, T20);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   T1P = VFNMS(LDK(KP809385824), T1O, T1L);
+				   ST(&(xo[WS(os, 17)]), VFMAI(T2M, T2G), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 8)]), VFNMSI(T2M, T2G), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 12)]), VFMAI(T3g, T39), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 13)]), VFNMSI(T3g, T39), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 7)]), VFMAI(T3m, T3j), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 18)]), VFNMSI(T3m, T3j), ovs, &(xo[0]));
+				   T22 = VFMA(LDK(KP690983005), T21, T1L);
+				   T27 = VFMA(LDK(KP447417479), T1W, T26);
+				   T1X = VFMA(LDK(KP894834959), T1W, T1T);
+				   {
+					V T1r, T1s, T1v, T1w;
+					T1r = VFNMS(LDK(KP916574801), T1f, T1e);
+					T1g = VFMA(LDK(KP916574801), T1f, T1e);
+					T1k = VFNMS(LDK(KP831864738), T1j, T1i);
+					T1s = VFMA(LDK(KP831864738), T1j, T1i);
+					T1v = VFNMS(LDK(KP829049696), TF, Tq);
+					TG = VFMA(LDK(KP829049696), TF, Tq);
+					T1b = VFMA(LDK(KP831864738), T1a, TV);
+					T1w = VFNMS(LDK(KP831864738), T1a, TV);
+					T28 = VFNMS(LDK(KP763932022), T27, T1T);
+					T1t = VFMA(LDK(KP904730450), T1s, T1r);
+					T1y = VFNMS(LDK(KP904730450), T1s, T1r);
+					T1x = VFMA(LDK(KP559154169), T1w, T1v);
+					T1E = VFNMS(LDK(KP683113946), T1v, T1w);
+				   }
+			      }
+			      T1Q = VFNMS(LDK(KP992114701), T1P, Tb);
+			      T1Y = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T1X, T1q));
+			      {
+				   V T1u, T1F, T1z, T1h, T1c, T23, T29;
+				   T23 = VFNMS(LDK(KP999544308), T22, T1Z);
+				   T29 = VFNMS(LDK(KP999544308), T28, T25);
+				   T1I = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1t, T1q));
+				   T1u = VFNMS(LDK(KP242145790), T1t, T1q);
+				   T1F = VFMA(LDK(KP617882369), T1y, T1E);
+				   T1z = VFMA(LDK(KP559016994), T1y, T1x);
+				   T1h = VFNMS(LDK(KP904730450), T1b, TG);
+				   T1c = VFMA(LDK(KP904730450), T1b, TG);
+				   ST(&(xo[WS(os, 21)]), VFNMSI(T1Y, T1Q), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 4)]), VFMAI(T1Y, T1Q), ovs, &(xo[0]));
+				   T24 = VFNMS(LDK(KP803003575), T23, Tb);
+				   T2a = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T29, T1q));
+				   T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T1F, T1u));
+				   T1A = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1z, T1u));
+				   T1l = VFNMS(LDK(KP904730450), T1k, T1h);
+				   T1B = VADD(T1g, T1h);
+				   T1H = VFMA(LDK(KP968583161), T1c, Tb);
+				   T1d = VFNMS(LDK(KP242145790), T1c, Tb);
+			      }
+			 }
+		    }
+	       }
+	       ST(&(xo[WS(os, 9)]), VFMAI(T2a, T24), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 16)]), VFNMSI(T2a, T24), ovs, &(xo[0]));
+	       {
+		    V T1m, T1C, T1n, T1D;
+		    T1m = VFNMS(LDK(KP618033988), T1l, T1g);
+		    T1C = VFNMS(LDK(KP683113946), T1B, T1k);
+		    ST(&(xo[WS(os, 24)]), VFMAI(T1I, T1H), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    T1n = VFNMS(LDK(KP876091699), T1m, T1d);
+		    T1D = VFMA(LDK(KP792626838), T1C, T1d);
+		    ST(&(xo[WS(os, 19)]), VFMAI(T1A, T1n), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VFNMSI(T1A, T1n), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VFMAI(T1G, T1D), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 11)]), VFNMSI(T1G, T1D), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 25, XSIMD_STRING("n1fv_25"), {43, 12, 181, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_25) (planner *p) {
+     X(kdft_register) (p, n1fv_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name n1fv_25 -include n1f.h */
+
+/*
+ * This function contains 224 FP additions, 140 FP multiplications,
+ * (or, 146 additions, 62 multiplications, 78 fused multiply/add),
+ * 115 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_25(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(50, is), MAKE_VOLATILE_STRIDE(50, os)) {
+	       V T7, T1g, T26, Ta, T2R, T2N, T2O, T2P, T19, T1Y, T16, T1Z, T1a, T2v, T1l;
+	       V T2m, TU, T21, TR, T22, TV, T2u, T1k, T2l, T2K, T2L, T2M, TE, T1R, TB;
+	       V T1S, TF, T2r, T1i, T2j, Tp, T1U, Tm, T1V, Tq, T2s, T1h, T2i;
+	       {
+		    V T8, T6, T1f, T3, T1e, T25, T9;
+		    T8 = LD(&(xi[0]), ivs, &(xi[0]));
+		    {
+			 V T4, T5, T1, T2;
+			 T4 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VADD(T4, T5);
+			 T1f = VSUB(T4, T5);
+			 T1 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T3 = VADD(T1, T2);
+			 T1e = VSUB(T1, T2);
+		    }
+		    T7 = VMUL(LDK(KP559016994), VSUB(T3, T6));
+		    T1g = VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T1f));
+		    T25 = VMUL(LDK(KP951056516), T1f);
+		    T26 = VFNMS(LDK(KP587785252), T1e, T25);
+		    T9 = VADD(T3, T6);
+		    Ta = VFNMS(LDK(KP250000000), T9, T8);
+		    T2R = VADD(T8, T9);
+	       }
+	       {
+		    V TO, T13, TN, TT, TP, TS, T12, T18, T14, T17, T15, TQ;
+		    TO = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T13 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V TH, TI, TJ, TK, TL, TM;
+			 TH = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TI = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 TJ = VADD(TH, TI);
+			 TK = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 TL = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 TM = VADD(TK, TL);
+			 TN = VMUL(LDK(KP559016994), VSUB(TJ, TM));
+			 TT = VSUB(TK, TL);
+			 TP = VADD(TJ, TM);
+			 TS = VSUB(TH, TI);
+		    }
+		    {
+			 V TW, TX, TY, TZ, T10, T11;
+			 TW = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 TX = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TY = VADD(TW, TX);
+			 TZ = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 T10 = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 T11 = VADD(TZ, T10);
+			 T12 = VMUL(LDK(KP559016994), VSUB(TY, T11));
+			 T18 = VSUB(TZ, T10);
+			 T14 = VADD(TY, T11);
+			 T17 = VSUB(TW, TX);
+		    }
+		    T2N = VADD(TO, TP);
+		    T2O = VADD(T13, T14);
+		    T2P = VADD(T2N, T2O);
+		    T19 = VFMA(LDK(KP475528258), T17, VMUL(LDK(KP293892626), T18));
+		    T1Y = VFNMS(LDK(KP293892626), T17, VMUL(LDK(KP475528258), T18));
+		    T15 = VFNMS(LDK(KP250000000), T14, T13);
+		    T16 = VADD(T12, T15);
+		    T1Z = VSUB(T15, T12);
+		    T1a = VFNMS(LDK(KP1_369094211), T19, VMUL(LDK(KP728968627), T16));
+		    T2v = VFMA(LDK(KP1_996053456), T1Y, VMUL(LDK(KP062790519), T1Z));
+		    T1l = VFMA(LDK(KP1_457937254), T19, VMUL(LDK(KP684547105), T16));
+		    T2m = VFNMS(LDK(KP998026728), T1Z, VMUL(LDK(KP125581039), T1Y));
+		    TU = VFMA(LDK(KP475528258), TS, VMUL(LDK(KP293892626), TT));
+		    T21 = VFNMS(LDK(KP293892626), TS, VMUL(LDK(KP475528258), TT));
+		    TQ = VFNMS(LDK(KP250000000), TP, TO);
+		    TR = VADD(TN, TQ);
+		    T22 = VSUB(TQ, TN);
+		    TV = VFNMS(LDK(KP963507348), TU, VMUL(LDK(KP876306680), TR));
+		    T2u = VFMA(LDK(KP1_688655851), T21, VMUL(LDK(KP535826794), T22));
+		    T1k = VFMA(LDK(KP1_752613360), TU, VMUL(LDK(KP481753674), TR));
+		    T2l = VFNMS(LDK(KP844327925), T22, VMUL(LDK(KP1_071653589), T21));
+	       }
+	       {
+		    V Tj, Ty, Ti, To, Tk, Tn, Tx, TD, Tz, TC, TA, Tl;
+		    Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    Ty = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    {
+			 V Tc, Td, Te, Tf, Tg, Th;
+			 Tc = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Te = VADD(Tc, Td);
+			 Tf = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 Th = VADD(Tf, Tg);
+			 Ti = VMUL(LDK(KP559016994), VSUB(Te, Th));
+			 To = VSUB(Tf, Tg);
+			 Tk = VADD(Te, Th);
+			 Tn = VSUB(Tc, Td);
+		    }
+		    {
+			 V Tr, Ts, Tt, Tu, Tv, Tw;
+			 Tr = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ts = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 Tt = VADD(Tr, Ts);
+			 Tu = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tv = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = VADD(Tu, Tv);
+			 Tx = VMUL(LDK(KP559016994), VSUB(Tt, Tw));
+			 TD = VSUB(Tu, Tv);
+			 Tz = VADD(Tt, Tw);
+			 TC = VSUB(Tr, Ts);
+		    }
+		    T2K = VADD(Tj, Tk);
+		    T2L = VADD(Ty, Tz);
+		    T2M = VADD(T2K, T2L);
+		    TE = VFMA(LDK(KP475528258), TC, VMUL(LDK(KP293892626), TD));
+		    T1R = VFNMS(LDK(KP293892626), TC, VMUL(LDK(KP475528258), TD));
+		    TA = VFNMS(LDK(KP250000000), Tz, Ty);
+		    TB = VADD(Tx, TA);
+		    T1S = VSUB(TA, Tx);
+		    TF = VFNMS(LDK(KP1_688655851), TE, VMUL(LDK(KP535826794), TB));
+		    T2r = VFNMS(LDK(KP425779291), T1S, VMUL(LDK(KP1_809654104), T1R));
+		    T1i = VFMA(LDK(KP1_071653589), TE, VMUL(LDK(KP844327925), TB));
+		    T2j = VFMA(LDK(KP851558583), T1R, VMUL(LDK(KP904827052), T1S));
+		    Tp = VFMA(LDK(KP475528258), Tn, VMUL(LDK(KP293892626), To));
+		    T1U = VFNMS(LDK(KP293892626), Tn, VMUL(LDK(KP475528258), To));
+		    Tl = VFNMS(LDK(KP250000000), Tk, Tj);
+		    Tm = VADD(Ti, Tl);
+		    T1V = VSUB(Tl, Ti);
+		    Tq = VFNMS(LDK(KP497379774), Tp, VMUL(LDK(KP968583161), Tm));
+		    T2s = VFMA(LDK(KP963507348), T1U, VMUL(LDK(KP876306680), T1V));
+		    T1h = VFMA(LDK(KP1_937166322), Tp, VMUL(LDK(KP248689887), Tm));
+		    T2i = VFNMS(LDK(KP481753674), T1V, VMUL(LDK(KP1_752613360), T1U));
+	       }
+	       {
+		    V T2Q, T2S, T2T, T2X, T2Y, T2V, T2W, T2Z, T2U;
+		    T2Q = VMUL(LDK(KP559016994), VSUB(T2M, T2P));
+		    T2S = VADD(T2M, T2P);
+		    T2T = VFNMS(LDK(KP250000000), T2S, T2R);
+		    T2V = VSUB(T2K, T2L);
+		    T2W = VSUB(T2N, T2O);
+		    T2X = VBYI(VFMA(LDK(KP951056516), T2V, VMUL(LDK(KP587785252), T2W)));
+		    T2Y = VBYI(VFNMS(LDK(KP587785252), T2V, VMUL(LDK(KP951056516), T2W)));
+		    ST(&(xo[0]), VADD(T2R, T2S), ovs, &(xo[0]));
+		    T2Z = VSUB(T2T, T2Q);
+		    ST(&(xo[WS(os, 10)]), VADD(T2Y, T2Z), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 15)]), VSUB(T2Z, T2Y), ovs, &(xo[WS(os, 1)]));
+		    T2U = VADD(T2Q, T2T);
+		    ST(&(xo[WS(os, 5)]), VSUB(T2U, T2X), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 20)]), VADD(T2X, T2U), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2t, T2y, T2z, T2w, T1T, T1W, T1X, T2c, T2d, T2e, T29, T2a, T2b, T20, T23;
+		    V T24, T2p, T2o, T2q, T28, T2D, T2C, T2E, T2x, T2F;
+		    T2t = VSUB(T2r, T2s);
+		    T2y = VADD(T2i, T2j);
+		    T2z = VSUB(T2l, T2m);
+		    T2w = VSUB(T2u, T2v);
+		    T1T = VFNMS(LDK(KP125333233), T1S, VMUL(LDK(KP1_984229402), T1R));
+		    T1W = VFMA(LDK(KP1_457937254), T1U, VMUL(LDK(KP684547105), T1V));
+		    T1X = VSUB(T1T, T1W);
+		    T2c = VFNMS(LDK(KP1_996053456), T21, VMUL(LDK(KP062790519), T22));
+		    T2d = VFMA(LDK(KP1_541026485), T1Y, VMUL(LDK(KP637423989), T1Z));
+		    T2e = VSUB(T2c, T2d);
+		    T29 = VFNMS(LDK(KP1_369094211), T1U, VMUL(LDK(KP728968627), T1V));
+		    T2a = VFMA(LDK(KP250666467), T1R, VMUL(LDK(KP992114701), T1S));
+		    T2b = VSUB(T29, T2a);
+		    T20 = VFNMS(LDK(KP770513242), T1Z, VMUL(LDK(KP1_274847979), T1Y));
+		    T23 = VFMA(LDK(KP125581039), T21, VMUL(LDK(KP998026728), T22));
+		    T24 = VSUB(T20, T23);
+		    {
+			 V T2k, T2n, T2A, T2B;
+			 T2k = VSUB(T2i, T2j);
+			 T2n = VADD(T2l, T2m);
+			 T2p = VADD(T2k, T2n);
+			 T2o = VMUL(LDK(KP559016994), VSUB(T2k, T2n));
+			 T2q = VFNMS(LDK(KP250000000), T2p, T26);
+			 T28 = VSUB(Ta, T7);
+			 T2A = VADD(T2s, T2r);
+			 T2B = VADD(T2u, T2v);
+			 T2D = VADD(T2A, T2B);
+			 T2C = VMUL(LDK(KP559016994), VSUB(T2A, T2B));
+			 T2E = VFNMS(LDK(KP250000000), T2D, T28);
+		    }
+		    {
+			 V T2I, T2J, T27, T2f;
+			 T2I = VBYI(VADD(T26, T2p));
+			 T2J = VADD(T28, T2D);
+			 ST(&(xo[WS(os, 2)]), VADD(T2I, T2J), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 23)]), VSUB(T2J, T2I), ovs, &(xo[WS(os, 1)]));
+			 T27 = VBYI(VSUB(VADD(T1X, T24), T26));
+			 T2f = VADD(T28, VADD(T2b, T2e));
+			 ST(&(xo[WS(os, 3)]), VADD(T27, T2f), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 22)]), VSUB(T2f, T27), ovs, &(xo[0]));
+		    }
+		    T2x = VBYI(VADD(T2o, VADD(T2q, VFNMS(LDK(KP587785252), T2w, VMUL(LDK(KP951056516), T2t)))));
+		    T2F = VFMA(LDK(KP951056516), T2y, VFMA(LDK(KP587785252), T2z, VADD(T2C, T2E)));
+		    ST(&(xo[WS(os, 7)]), VADD(T2x, T2F), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 18)]), VSUB(T2F, T2x), ovs, &(xo[0]));
+		    {
+			 V T2G, T2H, T2g, T2h;
+			 T2G = VBYI(VADD(T2q, VSUB(VFMA(LDK(KP587785252), T2t, VMUL(LDK(KP951056516), T2w)), T2o)));
+			 T2H = VFMA(LDK(KP587785252), T2y, VSUB(VFNMS(LDK(KP951056516), T2z, T2E), T2C));
+			 ST(&(xo[WS(os, 12)]), VADD(T2G, T2H), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 13)]), VSUB(T2H, T2G), ovs, &(xo[WS(os, 1)]));
+			 T2g = VFMA(LDK(KP309016994), T2b, VFNMS(LDK(KP809016994), T2e, VFNMS(LDK(KP587785252), VADD(T23, T20), VFNMS(LDK(KP951056516), VADD(T1W, T1T), T28))));
+			 T2h = VBYI(VSUB(VFNMS(LDK(KP587785252), VADD(T2c, T2d), VFNMS(LDK(KP809016994), T24, VFNMS(LDK(KP951056516), VADD(T29, T2a), VMUL(LDK(KP309016994), T1X)))), T26));
+			 ST(&(xo[WS(os, 17)]), VSUB(T2g, T2h), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 8)]), VADD(T2g, T2h), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T1p, T1u, T1w, T1q, T1B, T1C, T1D, T1L, T1M, T1N, T1I, T1J, T1K, T1E, T1F;
+		    V T1G, T1n, T1r, T1s, Tb, T1c, T1v, T1x, T1t, T1y;
+		    T1p = VSUB(TF, Tq);
+		    T1u = VSUB(T1i, T1h);
+		    T1w = VSUB(T1l, T1k);
+		    T1q = VSUB(TV, T1a);
+		    T1B = VFMA(LDK(KP1_688655851), Tp, VMUL(LDK(KP535826794), Tm));
+		    T1C = VFMA(LDK(KP1_541026485), TE, VMUL(LDK(KP637423989), TB));
+		    T1D = VSUB(T1B, T1C);
+		    T1L = VFMA(LDK(KP851558583), TU, VMUL(LDK(KP904827052), TR));
+		    T1M = VFMA(LDK(KP1_984229402), T19, VMUL(LDK(KP125333233), T16));
+		    T1N = VADD(T1L, T1M);
+		    T1I = VFNMS(LDK(KP844327925), Tm, VMUL(LDK(KP1_071653589), Tp));
+		    T1J = VFNMS(LDK(KP1_274847979), TE, VMUL(LDK(KP770513242), TB));
+		    T1K = VADD(T1I, T1J);
+		    T1E = VFNMS(LDK(KP425779291), TR, VMUL(LDK(KP1_809654104), TU));
+		    T1F = VFNMS(LDK(KP992114701), T16, VMUL(LDK(KP250666467), T19));
+		    T1G = VADD(T1E, T1F);
+		    {
+			 V T1j, T1m, TG, T1b;
+			 T1j = VADD(T1h, T1i);
+			 T1m = VADD(T1k, T1l);
+			 T1n = VADD(T1j, T1m);
+			 T1r = VFMS(LDK(KP250000000), T1n, T1g);
+			 T1s = VMUL(LDK(KP559016994), VSUB(T1m, T1j));
+			 Tb = VADD(T7, Ta);
+			 TG = VADD(Tq, TF);
+			 T1b = VADD(TV, T1a);
+			 T1c = VADD(TG, T1b);
+			 T1v = VFNMS(LDK(KP250000000), T1c, Tb);
+			 T1x = VMUL(LDK(KP559016994), VSUB(TG, T1b));
+		    }
+		    {
+			 V T1d, T1o, T1H, T1O;
+			 T1d = VADD(Tb, T1c);
+			 T1o = VBYI(VADD(T1g, T1n));
+			 ST(&(xo[WS(os, 1)]), VSUB(T1d, T1o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 24)]), VADD(T1d, T1o), ovs, &(xo[0]));
+			 T1H = VADD(Tb, VADD(T1D, T1G));
+			 T1O = VBYI(VADD(T1g, VSUB(T1K, T1N)));
+			 ST(&(xo[WS(os, 21)]), VSUB(T1H, T1O), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VADD(T1H, T1O), ovs, &(xo[0]));
+		    }
+		    T1t = VBYI(VADD(VFMA(LDK(KP587785252), T1p, VMUL(LDK(KP951056516), T1q)), VSUB(T1r, T1s)));
+		    T1y = VFMA(LDK(KP587785252), T1u, VFNMS(LDK(KP951056516), T1w, VSUB(T1v, T1x)));
+		    ST(&(xo[WS(os, 11)]), VADD(T1t, T1y), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 14)]), VSUB(T1y, T1t), ovs, &(xo[0]));
+		    {
+			 V T1z, T1A, T1P, T1Q;
+			 T1z = VBYI(VADD(VFNMS(LDK(KP587785252), T1q, VMUL(LDK(KP951056516), T1p)), VADD(T1r, T1s)));
+			 T1A = VFMA(LDK(KP951056516), T1u, VADD(T1x, VFMA(LDK(KP587785252), T1w, T1v)));
+			 ST(&(xo[WS(os, 6)]), VADD(T1z, T1A), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 19)]), VSUB(T1A, T1z), ovs, &(xo[WS(os, 1)]));
+			 T1P = VBYI(VADD(T1g, VFMA(LDK(KP309016994), T1K, VFMA(LDK(KP587785252), VSUB(T1F, T1E), VFNMS(LDK(KP951056516), VADD(T1B, T1C), VMUL(LDK(KP809016994), T1N))))));
+			 T1Q = VFMA(LDK(KP309016994), T1D, VFMA(LDK(KP951056516), VSUB(T1I, T1J), VFMA(LDK(KP587785252), VSUB(T1M, T1L), VFNMS(LDK(KP809016994), T1G, Tb))));
+			 ST(&(xo[WS(os, 9)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 16)]), VSUB(T1Q, T1P), ovs, &(xo[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 25, XSIMD_STRING("n1fv_25"), {146, 62, 78, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_25) (planner *p) {
+     X(kdft_register) (p, n1fv_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name n1fv_3 -include n1f.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 3 fused multiply/add),
+ * 11 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(6, is), MAKE_VOLATILE_STRIDE(6, os)) {
+	       V T1, T2, T3, T6, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = VMUL(LDK(KP866025403), VSUB(T3, T2));
+	       T4 = VADD(T2, T3);
+	       T5 = VFNMS(LDK(KP500000000), T4, T1);
+	       ST(&(xo[0]), VADD(T1, T4), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VFMAI(T6, T5), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 2)]), VFNMSI(T6, T5), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 3, XSIMD_STRING("n1fv_3"), {3, 1, 3, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_3) (planner *p) {
+     X(kdft_register) (p, n1fv_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name n1fv_3 -include n1f.h */
+
+/*
+ * This function contains 6 FP additions, 2 FP multiplications,
+ * (or, 5 additions, 1 multiplications, 1 fused multiply/add),
+ * 11 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(6, is), MAKE_VOLATILE_STRIDE(6, os)) {
+	       V T1, T4, T6, T2, T3, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T4 = VADD(T2, T3);
+	       T6 = VBYI(VMUL(LDK(KP866025403), VSUB(T3, T2)));
+	       ST(&(xo[0]), VADD(T1, T4), ovs, &(xo[0]));
+	       T5 = VFNMS(LDK(KP500000000), T4, T1);
+	       ST(&(xo[WS(os, 2)]), VSUB(T5, T6), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 1)]), VADD(T5, T6), ovs, &(xo[WS(os, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 3, XSIMD_STRING("n1fv_3"), {5, 1, 1, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_3) (planner *p) {
+     X(kdft_register) (p, n1fv_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include n1f.h */
+
+/*
+ * This function contains 186 FP additions, 98 FP multiplications,
+ * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
+ * 104 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T1h, Tr, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z, T1c;
+	       V TZ;
+	       {
+		    V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2N, T2H, T2b, T2t, TY;
+		    V T1w, TT, T1v, T20, T2C, Tj, Te, T2h, To, T2f, T23, T2D, TB, TG, Th;
+		    V T2i, Tk;
+		    {
+			 V TL, TW, TP, TQ, T2F, T27, T28, TO;
+			 {
+			      V T1, T2, T12, T13, T4, T5, T7, T8;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			      T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      {
+				   V TM, T25, T26, TN;
+				   {
+					V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
+					TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					T1R = VADD(T1, T2);
+					T3 = VSUB(T1, T2);
+					T1S = VADD(T12, T13);
+					T14 = VSUB(T12, T13);
+					T1U = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1V = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+					T2x = VSUB(T1R, T1S);
+					T1T = VADD(T1R, T1S);
+					TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					T2K = VSUB(T1V, T1U);
+					T1W = VADD(T1U, T1V);
+					Ta = VADD(T6, T9);
+					T15 = VSUB(T9, T6);
+					T25 = VADD(TJ, TK);
+					TL = VSUB(TJ, TK);
+					T26 = VADD(TV, TU);
+					TW = VSUB(TU, TV);
+					TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					T1p = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					T1A = VFMA(LDK(KP707106781), T15, T14);
+					T16 = VFNMS(LDK(KP707106781), T15, T14);
+					TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   }
+				   T2F = VSUB(T25, T26);
+				   T27 = VADD(T25, T26);
+				   T28 = VADD(TM, TN);
+				   TO = VSUB(TM, TN);
+			      }
+			 }
+			 {
+			      V Ty, T21, Tx, Tz, T1Y, T1Z;
+			      {
+				   V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
+				   Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T29 = VADD(TP, TQ);
+				   TR = VSUB(TP, TQ);
+				   TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+				   Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+				   Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+				   Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+				   T1Y = VADD(Ts, Tt);
+				   Tu = VSUB(Ts, Tt);
+				   {
+					V T2G, T2a, TX, TS;
+					T2G = VSUB(T29, T28);
+					T2a = VADD(T28, T29);
+					TX = VSUB(TR, TO);
+					TS = VADD(TO, TR);
+					T1Z = VADD(TD, TE);
+					TF = VSUB(TD, TE);
+					T21 = VADD(Tv, Tw);
+					Tx = VSUB(Tv, Tw);
+					T2N = VFMA(LDK(KP414213562), T2F, T2G);
+					T2H = VFNMS(LDK(KP414213562), T2G, T2F);
+					T2b = VSUB(T27, T2a);
+					T2t = VADD(T27, T2a);
+					TY = VFMA(LDK(KP707106781), TX, TW);
+					T1w = VFNMS(LDK(KP707106781), TX, TW);
+					TT = VFMA(LDK(KP707106781), TS, TL);
+					T1v = VFNMS(LDK(KP707106781), TS, TL);
+					Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   }
+			      }
+			      T20 = VADD(T1Y, T1Z);
+			      T2C = VSUB(T1Y, T1Z);
+			      {
+				   V Tc, Td, Tm, Tn;
+				   Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+				   Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   {
+					V Tf, TA, T22, Tg;
+					Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					TA = VSUB(Ty, Tz);
+					T22 = VADD(Ty, Tz);
+					Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+					Te = VSUB(Tc, Td);
+					T2h = VADD(Tc, Td);
+					To = VSUB(Tm, Tn);
+					T2f = VADD(Tn, Tm);
+					T23 = VADD(T21, T22);
+					T2D = VSUB(T21, T22);
+					TB = VADD(Tx, TA);
+					TG = VSUB(Tx, TA);
+					Th = VSUB(Tf, Tg);
+					T2i = VADD(Tf, Tg);
+					Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
+			 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
+			 {
+			      V T1X, T2p, T2E, T2O, T2s, T2y, T2j, T17, Ti, T2e, Tl, T2c, T2l, T24;
+			      T1X = VSUB(T1T, T1W);
+			      T2p = VADD(T1T, T1W);
+			      T2E = VFNMS(LDK(KP414213562), T2D, T2C);
+			      T2O = VFMA(LDK(KP414213562), T2C, T2D);
+			      T2s = VADD(T20, T23);
+			      T24 = VSUB(T20, T23);
+			      T1t = VFNMS(LDK(KP707106781), TG, TF);
+			      TH = VFMA(LDK(KP707106781), TG, TF);
+			      T1s = VFNMS(LDK(KP707106781), TB, Tu);
+			      TC = VFMA(LDK(KP707106781), TB, Tu);
+			      T2y = VSUB(T2h, T2i);
+			      T2j = VADD(T2h, T2i);
+			      T17 = VFMA(LDK(KP414213562), Te, Th);
+			      Ti = VFNMS(LDK(KP414213562), Th, Te);
+			      T2e = VADD(Tj, Tk);
+			      Tl = VSUB(Tj, Tk);
+			      T2c = VADD(T24, T2b);
+			      T2l = VSUB(T2b, T24);
+			      {
+				   V T2L, T2A, T2q, T2k;
+				   T2P = VSUB(T2N, T2O);
+				   T2U = VADD(T2O, T2N);
+				   {
+					V T2z, T2g, T18, Tp;
+					T2z = VSUB(T2e, T2f);
+					T2g = VADD(T2e, T2f);
+					T18 = VFMA(LDK(KP414213562), Tl, To);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T2n = VFMA(LDK(KP707106781), T2c, T1X);
+					T2d = VFNMS(LDK(KP707106781), T2c, T1X);
+					T2w = VSUB(T2t, T2s);
+					T2u = VADD(T2s, T2t);
+					T2L = VSUB(T2z, T2y);
+					T2A = VADD(T2y, T2z);
+					T2q = VADD(T2j, T2g);
+					T2k = VSUB(T2g, T2j);
+					T1q = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					T1B = VSUB(Tp, Ti);
+					Tq = VADD(Ti, Tp);
+				   }
+				   T2W = VFNMS(LDK(KP707106781), T2L, T2K);
+				   T2M = VFMA(LDK(KP707106781), T2L, T2K);
+				   T2B = VFMA(LDK(KP707106781), T2A, T2x);
+				   T2T = VFNMS(LDK(KP707106781), T2A, T2x);
+				   T2v = VSUB(T2p, T2q);
+				   T2r = VADD(T2p, T2q);
+				   T2o = VFMA(LDK(KP707106781), T2l, T2k);
+				   T2m = VFNMS(LDK(KP707106781), T2l, T2k);
+				   T2X = VSUB(T2H, T2E);
+				   T2I = VADD(T2E, T2H);
+			      }
+			 }
+			 {
+			      V T2V, T2Z, T2Y, T30, T2R, T2J;
+			      T2V = VFNMS(LDK(KP923879532), T2U, T2T);
+			      T2Z = VFMA(LDK(KP923879532), T2U, T2T);
+			      ST(&(xo[WS(os, 24)]), VFNMSI(T2w, T2v), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 8)]), VFMAI(T2w, T2v), ovs, &(xo[0]));
+			      ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
+			      T2Y = VFMA(LDK(KP923879532), T2X, T2W);
+			      T30 = VFNMS(LDK(KP923879532), T2X, T2W);
+			      T2R = VFMA(LDK(KP923879532), T2I, T2B);
+			      T2J = VFNMS(LDK(KP923879532), T2I, T2B);
+			      {
+				   V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
+				   T1J = VFNMS(LDK(KP923879532), T1q, T1p);
+				   T1r = VFMA(LDK(KP923879532), T1q, T1p);
+				   T1C = VFMA(LDK(KP923879532), T1B, T1A);
+				   T1M = VFNMS(LDK(KP923879532), T1B, T1A);
+				   ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
+				   T2S = VFMA(LDK(KP923879532), T2P, T2M);
+				   T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
+				   T1u = VFMA(LDK(KP668178637), T1t, T1s);
+				   T1D = VFNMS(LDK(KP668178637), T1s, T1t);
+				   T1E = VFNMS(LDK(KP668178637), T1v, T1w);
+				   T1x = VFMA(LDK(KP668178637), T1w, T1v);
+				   {
+					V T1K, T1F, T1N, T1y;
+					T1h = VFNMS(LDK(KP923879532), Tq, Tb);
+					Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
+					T1K = VADD(T1D, T1E);
+					T1F = VSUB(T1D, T1E);
+					T1N = VSUB(T1x, T1u);
+					T1y = VADD(T1u, T1x);
+					T1a = VFMA(LDK(KP923879532), T19, T16);
+					T1k = VFNMS(LDK(KP923879532), T19, T16);
+					TI = VFNMS(LDK(KP198912367), TH, TC);
+					T1b = VFMA(LDK(KP198912367), TC, TH);
+					T1L = VFMA(LDK(KP831469612), T1K, T1J);
+					T1P = VFNMS(LDK(KP831469612), T1K, T1J);
+					T1I = VFMA(LDK(KP831469612), T1F, T1C);
+					T1G = VFNMS(LDK(KP831469612), T1F, T1C);
+					T1O = VFMA(LDK(KP831469612), T1N, T1M);
+					T1Q = VFNMS(LDK(KP831469612), T1N, T1M);
+					T1H = VFMA(LDK(KP831469612), T1y, T1r);
+					T1z = VFNMS(LDK(KP831469612), T1y, T1r);
+					T1c = VFMA(LDK(KP198912367), TT, TY);
+					TZ = VFNMS(LDK(KP198912367), TY, TT);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1d, T1i, T10, T1l;
+		    ST(&(xo[WS(os, 21)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 11)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 27)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 5)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 29)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 19)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 13)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
+		    T1d = VSUB(T1b, T1c);
+		    T1i = VADD(T1b, T1c);
+		    T10 = VADD(TI, TZ);
+		    T1l = VSUB(TZ, TI);
+		    {
+			 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
+			 T1n = VFMA(LDK(KP980785280), T1i, T1h);
+			 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
+			 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
+			 T1g = VFMA(LDK(KP980785280), T1d, T1a);
+			 T1o = VFMA(LDK(KP980785280), T1l, T1k);
+			 T1m = VFNMS(LDK(KP980785280), T1l, T1k);
+			 T11 = VFNMS(LDK(KP980785280), T10, Tr);
+			 T1f = VFMA(LDK(KP980785280), T10, Tr);
+			 ST(&(xo[WS(os, 23)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 9)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 25)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 15)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 17)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_32) (planner *p) {
+     X(kdft_register) (p, n1fv_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include n1f.h */
+
+/*
+ * This function contains 186 FP additions, 42 FP multiplications,
+ * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
+ * 58 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
+	       V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
+	       V T19, T1q, T2A, T2L;
+	       {
+		    V T3, T1R, T15, T1S, T6, T1U, T9, T1V, T12, Ta;
+		    {
+			 V T1, T2, T13, T14;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T1R = VADD(T1, T2);
+			 T13 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T14 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T15 = VSUB(T13, T14);
+			 T1S = VADD(T13, T14);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T1U = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T1V = VADD(T7, T8);
+		    }
+		    T1T = VADD(T1R, T1S);
+		    T1W = VADD(T1U, T1V);
+		    T2K = VSUB(T1V, T1U);
+		    T2x = VSUB(T1R, T1S);
+		    T12 = VMUL(LDK(KP707106781), VSUB(T9, T6));
+		    T16 = VSUB(T12, T15);
+		    T1A = VADD(T15, T12);
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tb = VADD(T3, Ta);
+		    T1p = VSUB(T3, Ta);
+	       }
+	       {
+		    V TL, T25, TX, T26, TO, T28, TR, T29;
+		    {
+			 V TJ, TK, TV, TW;
+			 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 TL = VSUB(TJ, TK);
+			 T25 = VADD(TJ, TK);
+			 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TW = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TX = VSUB(TV, TW);
+			 T26 = VADD(TV, TW);
+		    }
+		    {
+			 V TM, TN, TP, TQ;
+			 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 TO = VSUB(TM, TN);
+			 T28 = VADD(TM, TN);
+			 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 TR = VSUB(TP, TQ);
+			 T29 = VADD(TP, TQ);
+		    }
+		    {
+			 V TS, TU, T2F, T2G;
+			 TS = VMUL(LDK(KP707106781), VADD(TO, TR));
+			 TT = VADD(TL, TS);
+			 T1v = VSUB(TL, TS);
+			 TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
+			 TY = VSUB(TU, TX);
+			 T1w = VADD(TX, TU);
+			 T27 = VADD(T25, T26);
+			 T2a = VADD(T28, T29);
+			 T2b = VSUB(T27, T2a);
+			 T2F = VSUB(T25, T26);
+			 T2G = VSUB(T29, T28);
+			 T2H = VFNMS(LDK(KP382683432), T2G, VMUL(LDK(KP923879532), T2F));
+			 T2O = VFMA(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2G));
+		    }
+	       }
+	       {
+		    V Tu, T1Y, TG, T1Z, Tx, T21, TA, T22;
+		    {
+			 V Ts, Tt, TE, TF;
+			 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = VSUB(Ts, Tt);
+			 T1Y = VADD(Ts, Tt);
+			 TE = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 TF = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 TG = VSUB(TE, TF);
+			 T1Z = VADD(TE, TF);
+		    }
+		    {
+			 V Tv, Tw, Ty, Tz;
+			 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = VSUB(Tv, Tw);
+			 T21 = VADD(Tv, Tw);
+			 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 TA = VSUB(Ty, Tz);
+			 T22 = VADD(Ty, Tz);
+		    }
+		    {
+			 V TB, TD, T2C, T2D;
+			 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
+			 TC = VADD(Tu, TB);
+			 T1s = VSUB(Tu, TB);
+			 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
+			 TH = VSUB(TD, TG);
+			 T1t = VADD(TG, TD);
+			 T20 = VADD(T1Y, T1Z);
+			 T23 = VADD(T21, T22);
+			 T24 = VSUB(T20, T23);
+			 T2C = VSUB(T1Y, T1Z);
+			 T2D = VSUB(T22, T21);
+			 T2E = VFMA(LDK(KP923879532), T2C, VMUL(LDK(KP382683432), T2D));
+			 T2N = VFNMS(LDK(KP382683432), T2C, VMUL(LDK(KP923879532), T2D));
+		    }
+	       }
+	       {
+		    V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T2h = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T2f = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T2i = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T2e = VADD(Tj, Tk);
+		    }
+		    T2g = VADD(T2e, T2f);
+		    T2j = VADD(T2h, T2i);
+		    Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+		    Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
+		    Tq = VADD(Ti, Tp);
+		    T1B = VSUB(Tp, Ti);
+		    {
+			 V T17, T18, T2y, T2z;
+			 T17 = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T18 = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 T19 = VSUB(T17, T18);
+			 T1q = VADD(T18, T17);
+			 T2y = VSUB(T2h, T2i);
+			 T2z = VSUB(T2e, T2f);
+			 T2A = VMUL(LDK(KP707106781), VADD(T2y, T2z));
+			 T2L = VMUL(LDK(KP707106781), VSUB(T2z, T2y));
+		    }
+	       }
+	       {
+		    V T2d, T2n, T2m, T2o;
+		    {
+			 V T1X, T2c, T2k, T2l;
+			 T1X = VSUB(T1T, T1W);
+			 T2c = VMUL(LDK(KP707106781), VADD(T24, T2b));
+			 T2d = VADD(T1X, T2c);
+			 T2n = VSUB(T1X, T2c);
+			 T2k = VSUB(T2g, T2j);
+			 T2l = VMUL(LDK(KP707106781), VSUB(T2b, T24));
+			 T2m = VBYI(VADD(T2k, T2l));
+			 T2o = VBYI(VSUB(T2l, T2k));
+		    }
+		    ST(&(xo[WS(os, 28)]), VSUB(T2d, T2m), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 12)]), VADD(T2n, T2o), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(T2d, T2m), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 20)]), VSUB(T2n, T2o), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2r, T2v, T2u, T2w;
+		    {
+			 V T2p, T2q, T2s, T2t;
+			 T2p = VADD(T1T, T1W);
+			 T2q = VADD(T2j, T2g);
+			 T2r = VADD(T2p, T2q);
+			 T2v = VSUB(T2p, T2q);
+			 T2s = VADD(T20, T23);
+			 T2t = VADD(T27, T2a);
+			 T2u = VADD(T2s, T2t);
+			 T2w = VBYI(VSUB(T2t, T2s));
+		    }
+		    ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 8)]), VADD(T2v, T2w), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 24)]), VSUB(T2v, T2w), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2V, T2Z, T2Y, T30;
+		    {
+			 V T2T, T2U, T2W, T2X;
+			 T2T = VSUB(T2H, T2E);
+			 T2U = VSUB(T2L, T2K);
+			 T2V = VBYI(VSUB(T2T, T2U));
+			 T2Z = VBYI(VADD(T2U, T2T));
+			 T2W = VSUB(T2x, T2A);
+			 T2X = VSUB(T2O, T2N);
+			 T2Y = VSUB(T2W, T2X);
+			 T30 = VADD(T2W, T2X);
+		    }
+		    ST(&(xo[WS(os, 10)]), VADD(T2V, T2Y), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 26)]), VSUB(T30, T2Z), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 22)]), VSUB(T2Y, T2V), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 6)]), VADD(T2Z, T30), ovs, &(xo[0]));
+	       }
+	       {
+		    V T2J, T2R, T2Q, T2S;
+		    {
+			 V T2B, T2I, T2M, T2P;
+			 T2B = VADD(T2x, T2A);
+			 T2I = VADD(T2E, T2H);
+			 T2J = VADD(T2B, T2I);
+			 T2R = VSUB(T2B, T2I);
+			 T2M = VADD(T2K, T2L);
+			 T2P = VADD(T2N, T2O);
+			 T2Q = VBYI(VADD(T2M, T2P));
+			 T2S = VBYI(VSUB(T2P, T2M));
+		    }
+		    ST(&(xo[WS(os, 30)]), VSUB(T2J, T2Q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 14)]), VADD(T2R, T2S), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(T2J, T2Q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 18)]), VSUB(T2R, T2S), ovs, &(xo[0]));
+	       }
+	       {
+		    V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
+		    T1r = VADD(T1p, T1q);
+		    T1C = VADD(T1A, T1B);
+		    T1M = VSUB(T1p, T1q);
+		    T1K = VSUB(T1B, T1A);
+		    {
+			 V T1D, T1E, T1u, T1x;
+			 T1D = VFNMS(LDK(KP555570233), T1s, VMUL(LDK(KP831469612), T1t));
+			 T1E = VFMA(LDK(KP555570233), T1v, VMUL(LDK(KP831469612), T1w));
+			 T1F = VADD(T1D, T1E);
+			 T1N = VSUB(T1E, T1D);
+			 T1u = VFMA(LDK(KP831469612), T1s, VMUL(LDK(KP555570233), T1t));
+			 T1x = VFNMS(LDK(KP555570233), T1w, VMUL(LDK(KP831469612), T1v));
+			 T1y = VADD(T1u, T1x);
+			 T1J = VSUB(T1x, T1u);
+		    }
+		    {
+			 V T1z, T1G, T1P, T1Q;
+			 T1z = VADD(T1r, T1y);
+			 T1G = VBYI(VADD(T1C, T1F));
+			 ST(&(xo[WS(os, 29)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 3)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
+			 T1P = VBYI(VADD(T1K, T1J));
+			 T1Q = VADD(T1M, T1N);
+			 ST(&(xo[WS(os, 5)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 27)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T1H, T1I, T1L, T1O;
+			 T1H = VSUB(T1r, T1y);
+			 T1I = VBYI(VSUB(T1F, T1C));
+			 ST(&(xo[WS(os, 19)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 13)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
+			 T1L = VBYI(VSUB(T1J, T1K));
+			 T1O = VSUB(T1M, T1N);
+			 ST(&(xo[WS(os, 11)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 21)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
+		    Tr = VADD(Tb, Tq);
+		    T1a = VADD(T16, T19);
+		    T1k = VSUB(Tb, Tq);
+		    T1i = VSUB(T19, T16);
+		    {
+			 V T1b, T1c, TI, TZ;
+			 T1b = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
+			 T1c = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
+			 T1d = VADD(T1b, T1c);
+			 T1l = VSUB(T1c, T1b);
+			 TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
+			 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
+			 T10 = VADD(TI, TZ);
+			 T1h = VSUB(TZ, TI);
+		    }
+		    {
+			 V T11, T1e, T1n, T1o;
+			 T11 = VADD(Tr, T10);
+			 T1e = VBYI(VADD(T1a, T1d));
+			 ST(&(xo[WS(os, 31)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
+			 T1n = VBYI(VADD(T1i, T1h));
+			 T1o = VADD(T1k, T1l);
+			 ST(&(xo[WS(os, 7)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 25)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T1f, T1g, T1j, T1m;
+			 T1f = VSUB(Tr, T10);
+			 T1g = VBYI(VSUB(T1d, T1a));
+			 ST(&(xo[WS(os, 17)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 15)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
+			 T1j = VBYI(VSUB(T1h, T1i));
+			 T1m = VSUB(T1k, T1l);
+			 ST(&(xo[WS(os, 9)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 23)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_32) (planner *p) {
+     X(kdft_register) (p, n1fv_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n1fv_4 -include n1f.h */
+
+/*
+ * This function contains 8 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T1, T2, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, T7, T6, T8;
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    T8 = VADD(T4, T5);
+		    ST(&(xo[WS(os, 2)]), VSUB(T7, T8), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T7, T8), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 3)]), VFMAI(T6, T3), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VFNMSI(T6, T3), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n1fv_4"), {6, 0, 2, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_4) (planner *p) {
+     X(kdft_register) (p, n1fv_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n1fv_4 -include n1f.h */
+
+/*
+ * This function contains 8 FP additions, 0 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 0 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T3, T7, T6, T8;
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    T8 = VADD(T4, T5);
+	       }
+	       ST(&(xo[WS(os, 1)]), VSUB(T3, T6), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(T7, T8), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 3)]), VADD(T3, T6), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[WS(os, 2)]), VSUB(T7, T8), ovs, &(xo[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n1fv_4"), {8, 0, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_4) (planner *p) {
+     X(kdft_register) (p, n1fv_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name n1fv_5 -include n1f.h */
+
+/*
+ * This function contains 16 FP additions, 11 FP multiplications,
+ * (or, 7 additions, 2 multiplications, 9 fused multiply/add),
+ * 23 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(10, is), MAKE_VOLATILE_STRIDE(10, os)) {
+	       V T1, T2, T3, T5, T6;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Tc, T4, Td, T7;
+		    Tc = VSUB(T2, T3);
+		    T4 = VADD(T2, T3);
+		    Td = VSUB(T5, T6);
+		    T7 = VADD(T5, T6);
+		    {
+			 V Tg, Te, Ta, T8, T9, Tf, Tb;
+			 Tg = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
+			 Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
+			 Ta = VSUB(T4, T7);
+			 T8 = VADD(T4, T7);
+			 T9 = VFNMS(LDK(KP250000000), T8, T1);
+			 ST(&(xo[0]), VADD(T1, T8), ovs, &(xo[0]));
+			 Tf = VFNMS(LDK(KP559016994), Ta, T9);
+			 Tb = VFMA(LDK(KP559016994), Ta, T9);
+			 ST(&(xo[WS(os, 2)]), VFMAI(Tg, Tf), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 3)]), VFNMSI(Tg, Tf), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VFMAI(Te, Tb), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VFNMSI(Te, Tb), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 5, XSIMD_STRING("n1fv_5"), {7, 2, 9, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_5) (planner *p) {
+     X(kdft_register) (p, n1fv_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name n1fv_5 -include n1f.h */
+
+/*
+ * This function contains 16 FP additions, 6 FP multiplications,
+ * (or, 13 additions, 3 multiplications, 3 fused multiply/add),
+ * 18 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(10, is), MAKE_VOLATILE_STRIDE(10, os)) {
+	       V T8, T7, Td, T9, Tc;
+	       T8 = LD(&(xi[0]), ivs, &(xi[0]));
+	       {
+		    V T1, T2, T3, T4, T5, T6;
+		    T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VADD(T4, T5);
+		    T7 = VMUL(LDK(KP559016994), VSUB(T3, T6));
+		    Td = VSUB(T4, T5);
+		    T9 = VADD(T3, T6);
+		    Tc = VSUB(T1, T2);
+	       }
+	       ST(&(xo[0]), VADD(T8, T9), ovs, &(xo[0]));
+	       {
+		    V Te, Tf, Tb, Tg, Ta;
+		    Te = VBYI(VFMA(LDK(KP951056516), Tc, VMUL(LDK(KP587785252), Td)));
+		    Tf = VBYI(VFNMS(LDK(KP587785252), Tc, VMUL(LDK(KP951056516), Td)));
+		    Ta = VFNMS(LDK(KP250000000), T9, T8);
+		    Tb = VADD(T7, Ta);
+		    Tg = VSUB(Ta, T7);
+		    ST(&(xo[WS(os, 1)]), VSUB(Tb, Te), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VSUB(Tg, Tf), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 4)]), VADD(Te, Tb), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tf, Tg), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 5, XSIMD_STRING("n1fv_5"), {13, 3, 3, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_5) (planner *p) {
+     X(kdft_register) (p, n1fv_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name n1fv_6 -include n1f.h */
+
+/*
+ * This function contains 18 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 6 fused multiply/add),
+ * 23 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T1, T2, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Td, T6, Te, T9, Tf;
+		    T3 = VSUB(T1, T2);
+		    Td = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+		    {
+			 V Tg, Ti, Ta, Tc, Th, Tb;
+			 Tg = VADD(Te, Tf);
+			 Ti = VMUL(LDK(KP866025403), VSUB(Tf, Te));
+			 Ta = VADD(T6, T9);
+			 Tc = VMUL(LDK(KP866025403), VSUB(T9, T6));
+			 Th = VFNMS(LDK(KP500000000), Tg, Td);
+			 ST(&(xo[0]), VADD(Td, Tg), ovs, &(xo[0]));
+			 Tb = VFNMS(LDK(KP500000000), Ta, T3);
+			 ST(&(xo[WS(os, 3)]), VADD(T3, Ta), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 4)]), VFMAI(Ti, Th), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 2)]), VFNMSI(Ti, Th), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VFMAI(Tc, Tb), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VFNMSI(Tc, Tb), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n1fv_6"), {12, 2, 6, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_6) (planner *p) {
+     X(kdft_register) (p, n1fv_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name n1fv_6 -include n1f.h */
+
+/*
+ * This function contains 18 FP additions, 4 FP multiplications,
+ * (or, 16 additions, 2 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T3, Td, T6, Te, T9, Tf, Ta, Tg, T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       Td = VADD(T1, T2);
+	       {
+		    V T4, T5, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+	       }
+	       Ta = VADD(T6, T9);
+	       Tg = VADD(Te, Tf);
+	       ST(&(xo[WS(os, 3)]), VADD(T3, Ta), ovs, &(xo[WS(os, 1)]));
+	       ST(&(xo[0]), VADD(Td, Tg), ovs, &(xo[0]));
+	       {
+		    V Tb, Tc, Th, Ti;
+		    Tb = VFNMS(LDK(KP500000000), Ta, T3);
+		    Tc = VBYI(VMUL(LDK(KP866025403), VSUB(T9, T6)));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tb, Tc), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tb, Tc), ovs, &(xo[WS(os, 1)]));
+		    Th = VFNMS(LDK(KP500000000), Tg, Td);
+		    Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Tf, Te)));
+		    ST(&(xo[WS(os, 2)]), VSUB(Th, Ti), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 4)]), VADD(Th, Ti), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n1fv_6"), {16, 2, 2, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_6) (planner *p) {
+     X(kdft_register) (p, n1fv_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1568 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n1fv_64 -include n1f.h */
+
+/*
+ * This function contains 456 FP additions, 258 FP multiplications,
+ * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
+ * 168 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T5T, T5S, T5X, T65, T5Z, T5R, T67, T63, T5U, T64;
+	       {
+		    V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
+		    V Tm, T3A, T3l, T2a, TC, T5p, T4o, T6E, T6e, T3i, T3B, TR, T29, T4x, T5q;
+		    V T6h, T6D, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
+		    V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
+		    V T6m, T6Y, T5L, T4T;
+		    {
+			 V T4g, T4l, T3j, Tu, Tx, T4h, TA, T4i;
+			 {
+			      V T1, T2, T23, T24, T4, T5, T20, T21;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			      T23 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T24 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			      T20 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      T21 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      {
+				   V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
+				   {
+					V T8, T43, T3, T44, T25, T5i, T6, T45, T22, T9, Ti, Tj, Tb, Tc;
+					T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T43 = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T44 = VSUB(T23, T24);
+					T25 = VADD(T23, T24);
+					T5i = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T45 = VSUB(T20, T21);
+					T22 = VADD(T20, T21);
+					T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					{
+					     V T2T, T46, T5j, T2U;
+					     T7 = VSUB(T3, T6);
+					     T2T = VADD(T3, T6);
+					     T46 = VADD(T44, T45);
+					     T5j = VSUB(T45, T44);
+					     T26 = VSUB(T22, T25);
+					     T2U = VADD(T25, T22);
+					     Ta = VADD(T8, T9);
+					     T48 = VSUB(T8, T9);
+					     Tk = VADD(Ti, Tj);
+					     T4c = VSUB(Tj, Ti);
+					     T5k = VFNMS(LDK(KP707106781), T5j, T5i);
+					     T6A = VFMA(LDK(KP707106781), T5j, T5i);
+					     T47 = VFMA(LDK(KP707106781), T46, T43);
+					     T69 = VFNMS(LDK(KP707106781), T46, T43);
+					     T2V = VADD(T2T, T2U);
+					     T3z = VSUB(T2T, T2U);
+					     T49 = VSUB(Tb, Tc);
+					     Td = VADD(Tb, Tc);
+					}
+					Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+				   }
+				   {
+					V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
+					V Tp;
+					To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+					Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+					{
+					     V Th, T4b, Tr, Ts;
+					     Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     Te = VSUB(Ta, Td);
+					     T2W = VADD(Ta, Td);
+					     T5l = VFMA(LDK(KP414213562), T48, T49);
+					     T4a = VFNMS(LDK(KP414213562), T49, T48);
+					     Th = VADD(Tf, Tg);
+					     T4b = VSUB(Tf, Tg);
+					     Tq = VADD(To, Tp);
+					     T4g = VSUB(To, Tp);
+					     T4l = VSUB(Tr, Ts);
+					     Tt = VADD(Tr, Ts);
+					     Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+					     T5m = VFMA(LDK(KP414213562), T4b, T4c);
+					     T4d = VFNMS(LDK(KP414213562), T4c, T4b);
+					     Tl = VSUB(Th, Tk);
+					     T2X = VADD(Th, Tk);
+					     Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+					     Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					}
+					T3j = VADD(Tq, Tt);
+					Tu = VSUB(Tq, Tt);
+					Tx = VADD(Tv, Tw);
+					T4h = VSUB(Tv, Tw);
+					T6B = VSUB(T4d, T4a);
+					T4e = VADD(T4a, T4d);
+					T6a = VADD(T5l, T5m);
+					T5n = VSUB(T5l, T5m);
+					T3M = VSUB(T2X, T2W);
+					T2Y = VADD(T2W, T2X);
+					T27 = VSUB(Tl, Te);
+					Tm = VADD(Te, Tl);
+					TA = VADD(Ty, Tz);
+					T4i = VSUB(Ty, Tz);
+				   }
+			      }
+			 }
+			 {
+			      V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3g, TJ, TF, TI;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+				   TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+				   TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+				   TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+				   {
+					V T3k, TB, T4j, T4m;
+					T3k = VADD(Tx, TA);
+					TB = VSUB(Tx, TA);
+					T4j = VADD(T4h, T4i);
+					T4m = VSUB(T4h, T4i);
+					T4p = VSUB(TD, TE);
+					TF = VADD(TD, TE);
+					T4u = VSUB(TH, TG);
+					TI = VADD(TG, TH);
+					T3A = VSUB(T3j, T3k);
+					T3l = VADD(T3j, T3k);
+					T2a = VFMA(LDK(KP414213562), Tu, TB);
+					TC = VFNMS(LDK(KP414213562), TB, Tu);
+					T4k = VFMA(LDK(KP707106781), T4j, T4g);
+					T6d = VFNMS(LDK(KP707106781), T4j, T4g);
+					T4n = VFMA(LDK(KP707106781), T4m, T4l);
+					T6c = VFNMS(LDK(KP707106781), T4m, T4l);
+					TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   }
+				   TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      }
+			      T3g = VADD(TF, TI);
+			      TJ = VSUB(TF, TI);
+			      {
+				   V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
+				   {
+					V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
+					{
+					     V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
+					     T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+					     T5p = VFMA(LDK(KP198912367), T4k, T4n);
+					     T4o = VFNMS(LDK(KP198912367), T4n, T4k);
+					     T6E = VFMA(LDK(KP668178637), T6c, T6d);
+					     T6e = VFNMS(LDK(KP668178637), T6d, T6c);
+					     TM = VADD(TK, TL);
+					     T4r = VSUB(TK, TL);
+					     TP = VADD(TN, TO);
+					     T4q = VSUB(TN, TO);
+					     T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					     T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					     T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
+						  T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						  T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						  T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3h, TQ, T4s, T4v;
+						       T3h = VADD(TP, TM);
+						       TQ = VSUB(TM, TP);
+						       T4s = VADD(T4q, T4r);
+						       T4v = VSUB(T4r, T4q);
+						       T4V = VSUB(T1r, T1s);
+						       T1t = VADD(T1r, T1s);
+						       T58 = VSUB(T1v, T1u);
+						       T1w = VADD(T1u, T1v);
+						       T4X = VSUB(T1O, T1P);
+						       T1Q = VADD(T1O, T1P);
+						       T3i = VADD(T3g, T3h);
+						       T3B = VSUB(T3g, T3h);
+						       TR = VFNMS(LDK(KP414213562), TQ, TJ);
+						       T29 = VFMA(LDK(KP414213562), TJ, TQ);
+						       T6g = VFNMS(LDK(KP707106781), T4s, T4p);
+						       T4t = VFMA(LDK(KP707106781), T4s, T4p);
+						       T6f = VFNMS(LDK(KP707106781), T4v, T4u);
+						       T4w = VFMA(LDK(KP707106781), T4v, T4u);
+						       T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  {
+						       V T4W, T1A, T50, T51, T1D, T1F, T1G;
+						       {
+							    V T1y, T1z, T1B, T1C;
+							    T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+							    T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+							    T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+							    T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+							    T4x = VFNMS(LDK(KP198912367), T4w, T4t);
+							    T5q = VFMA(LDK(KP198912367), T4t, T4w);
+							    T6h = VFNMS(LDK(KP668178637), T6g, T6f);
+							    T6D = VFMA(LDK(KP668178637), T6f, T6g);
+							    T4W = VSUB(T1R, T1S);
+							    T1T = VADD(T1R, T1S);
+							    T1A = VADD(T1y, T1z);
+							    T50 = VSUB(T1y, T1z);
+							    T51 = VSUB(T1C, T1B);
+							    T1D = VADD(T1B, T1C);
+						       }
+						       T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+						       T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+						       T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						       T4Y = VADD(T4W, T4X);
+						       T59 = VSUB(T4X, T4W);
+						       T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+						       T3a = VADD(T1A, T1D);
+						       T1E = VSUB(T1A, T1D);
+						       T52 = VFMA(LDK(KP414213562), T51, T50);
+						       T5b = VFNMS(LDK(KP414213562), T50, T51);
+						       T53 = VSUB(T1F, T1G);
+						       T1H = VADD(T1F, T1G);
+						  }
+					     }
+					}
+					{
+					     V T37, T54, T1K, T38;
+					     T1x = VSUB(T1t, T1w);
+					     T37 = VADD(T1t, T1w);
+					     T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
+					     T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
+					     T54 = VSUB(T1J, T1I);
+					     T1K = VADD(T1I, T1J);
+					     T6u = VFNMS(LDK(KP707106781), T59, T58);
+					     T5a = VFMA(LDK(KP707106781), T59, T58);
+					     T38 = VADD(T1T, T1Q);
+					     T1U = VSUB(T1Q, T1T);
+					     T55 = VFNMS(LDK(KP414213562), T54, T53);
+					     T5c = VFMA(LDK(KP414213562), T53, T54);
+					     T1L = VSUB(T1H, T1K);
+					     T3b = VADD(T1H, T1K);
+					     T39 = VADD(T37, T38);
+					     T3H = VSUB(T37, T38);
+					}
+				   }
+				   {
+					V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
+					V T1d;
+					{
+					     V TU, TV, TX, TY, T56, T6v;
+					     TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T56 = VADD(T52, T55);
+					     T6v = VSUB(T55, T52);
+					     {
+						  V T5d, T6s, T1V, T1M;
+						  T5d = VADD(T5b, T5c);
+						  T6s = VSUB(T5c, T5b);
+						  T1V = VSUB(T1L, T1E);
+						  T1M = VADD(T1E, T1L);
+						  T3I = VSUB(T3b, T3a);
+						  T3c = VADD(T3a, T3b);
+						  T5N = VFNMS(LDK(KP923879532), T56, T4Z);
+						  T57 = VFMA(LDK(KP923879532), T56, T4Z);
+						  T72 = VFNMS(LDK(KP923879532), T6v, T6u);
+						  T6w = VFMA(LDK(KP923879532), T6v, T6u);
+						  T5O = VFNMS(LDK(KP923879532), T5d, T5a);
+						  T5e = VFMA(LDK(KP923879532), T5d, T5a);
+						  T71 = VFMA(LDK(KP923879532), T6s, T6r);
+						  T6t = VFNMS(LDK(KP923879532), T6s, T6r);
+						  T2y = VFNMS(LDK(KP707106781), T1V, T1U);
+						  T1W = VFMA(LDK(KP707106781), T1V, T1U);
+						  T2x = VFNMS(LDK(KP707106781), T1M, T1x);
+						  T1N = VFMA(LDK(KP707106781), T1M, T1x);
+						  TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1h, T1i, T1k, T1l;
+						  T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+						  T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+						  T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T11, T4B, T4C, T12, T14, T15;
+						       T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+						       T4A = VSUB(TU, TV);
+						       TW = VADD(TU, TV);
+						       T4N = VSUB(TX, TY);
+						       TZ = VADD(TX, TY);
+						       T1j = VADD(T1h, T1i);
+						       T4B = VSUB(T1h, T1i);
+						       T1m = VADD(T1k, T1l);
+						       T4C = VSUB(T1k, T1l);
+						       T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+						       T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						       T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+						       {
+							    V T18, T19, T1b, T1c;
+							    T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+							    T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+							    T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+							    T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+							    T4O = VSUB(T4B, T4C);
+							    T4D = VADD(T4B, T4C);
+							    T13 = VADD(T11, T12);
+							    T4F = VSUB(T11, T12);
+							    T16 = VADD(T14, T15);
+							    T4G = VSUB(T14, T15);
+							    T1a = VADD(T18, T19);
+							    T4I = VSUB(T18, T19);
+							    T4J = VSUB(T1b, T1c);
+							    T1d = VADD(T1b, T1c);
+						       }
+						  }
+					     }
+					}
+					{
+					     V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
+					     T30 = VADD(TW, TZ);
+					     T10 = VSUB(TW, TZ);
+					     T6k = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T33 = VADD(T13, T16);
+					     T17 = VSUB(T13, T16);
+					     T6n = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T34 = VADD(T1a, T1d);
+					     T1e = VSUB(T1a, T1d);
+					     T4K = VFMA(LDK(KP414213562), T4J, T4I);
+					     T4R = VFNMS(LDK(KP414213562), T4I, T4J);
+					     T1n = VSUB(T1j, T1m);
+					     T31 = VADD(T1j, T1m);
+					     {
+						  V T1f, T1o, T6o, T4L, T4S, T6l;
+						  T1f = VADD(T17, T1e);
+						  T1o = VSUB(T17, T1e);
+						  T6o = VSUB(T4H, T4K);
+						  T4L = VADD(T4H, T4K);
+						  T4S = VADD(T4Q, T4R);
+						  T6l = VSUB(T4Q, T4R);
+						  T3E = VSUB(T30, T31);
+						  T32 = VADD(T30, T31);
+						  T1p = VFMA(LDK(KP707106781), T1o, T1n);
+						  T2v = VFNMS(LDK(KP707106781), T1o, T1n);
+						  T1g = VFMA(LDK(KP707106781), T1f, T10);
+						  T2u = VFNMS(LDK(KP707106781), T1f, T10);
+						  T4M = VFMA(LDK(KP923879532), T4L, T4E);
+						  T5K = VFNMS(LDK(KP923879532), T4L, T4E);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6n);
+						  T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
+						  T6m = VFNMS(LDK(KP923879532), T6l, T6k);
+						  T6Y = VFMA(LDK(KP923879532), T6l, T6k);
+						  T5L = VFNMS(LDK(KP923879532), T4S, T4P);
+						  T4T = VFMA(LDK(KP923879532), T4S, T4P);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T6b, T6F, T7f, T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
+			 {
+			      V T2Z, T3r, T3s, T3m, T3d, T3v;
+			      T2Z = VSUB(T2V, T2Y);
+			      T3r = VADD(T2V, T2Y);
+			      T3s = VADD(T3l, T3i);
+			      T3m = VSUB(T3i, T3l);
+			      T3d = VSUB(T39, T3c);
+			      T3v = VADD(T39, T3c);
+			      {
+				   V T3x, T3t, T3P, T3J, T3D, T3V, T3Q, T3G, T36, T3u, T3Y, T3O, T6V, T6W;
+				   {
+					V T3N, T3C, T3F, T35;
+					T3N = VSUB(T3B, T3A);
+					T3C = VADD(T3A, T3B);
+					T3F = VSUB(T33, T34);
+					T35 = VADD(T33, T34);
+					T3x = VSUB(T3r, T3s);
+					T3t = VADD(T3r, T3s);
+					T3P = VFMA(LDK(KP414213562), T3H, T3I);
+					T3J = VFNMS(LDK(KP414213562), T3I, T3H);
+					T3D = VFMA(LDK(KP707106781), T3C, T3z);
+					T3V = VFNMS(LDK(KP707106781), T3C, T3z);
+					T3Q = VFMA(LDK(KP414213562), T3E, T3F);
+					T3G = VFNMS(LDK(KP414213562), T3F, T3E);
+					T36 = VSUB(T32, T35);
+					T3u = VADD(T32, T35);
+					T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
+					T3O = VFMA(LDK(KP707106781), T3N, T3M);
+				   }
+				   T6b = VFNMS(LDK(KP923879532), T6a, T69);
+				   T6V = VFMA(LDK(KP923879532), T6a, T69);
+				   T6W = VADD(T6E, T6D);
+				   T6F = VSUB(T6D, T6E);
+				   {
+					V T3K, T3Z, T3e, T3n;
+					T3K = VADD(T3G, T3J);
+					T3Z = VSUB(T3J, T3G);
+					T3e = VADD(T36, T3d);
+					T3n = VSUB(T3d, T36);
+					{
+					     V T3w, T3y, T3R, T3W;
+					     T3w = VADD(T3u, T3v);
+					     T3y = VSUB(T3v, T3u);
+					     T3R = VSUB(T3P, T3Q);
+					     T3W = VADD(T3Q, T3P);
+					     {
+						  V T42, T40, T3L, T3T;
+						  T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
+						  T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
+						  T3L = VFNMS(LDK(KP923879532), T3K, T3D);
+						  T3T = VFMA(LDK(KP923879532), T3K, T3D);
+						  {
+						       V T3o, T3q, T3f, T3p;
+						       T3o = VFNMS(LDK(KP707106781), T3n, T3m);
+						       T3q = VFMA(LDK(KP707106781), T3n, T3m);
+						       T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
+						       T3p = VFMA(LDK(KP707106781), T3e, T2Z);
+						       ST(&(xo[WS(os, 48)]), VFNMSI(T3y, T3x), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 16)]), VFMAI(T3y, T3x), ovs, &(xo[0]));
+						       ST(&(xo[0]), VADD(T3t, T3w), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 32)]), VSUB(T3t, T3w), ovs, &(xo[0]));
+						       {
+							    V T41, T3X, T3S, T3U;
+							    T41 = VFMA(LDK(KP923879532), T3W, T3V);
+							    T3X = VFNMS(LDK(KP923879532), T3W, T3V);
+							    T3S = VFNMS(LDK(KP923879532), T3R, T3O);
+							    T3U = VFMA(LDK(KP923879532), T3R, T3O);
+							    ST(&(xo[WS(os, 8)]), VFMAI(T3q, T3p), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 56)]), VFNMSI(T3q, T3p), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 40)]), VFMAI(T3o, T3f), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 24)]), VFNMSI(T3o, T3f), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 44)]), VFNMSI(T40, T3X), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 20)]), VFMAI(T40, T3X), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 52)]), VFMAI(T42, T41), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 12)]), VFNMSI(T42, T41), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 4)]), VFMAI(T3U, T3T), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 60)]), VFNMSI(T3U, T3T), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 36)]), VFMAI(T3S, T3L), ovs, &(xo[0]));
+							    ST(&(xo[WS(os, 28)]), VFNMSI(T3S, T3L), ovs, &(xo[0]));
+							    T7f = VFNMS(LDK(KP831469612), T6W, T6V);
+							    T6X = VFMA(LDK(KP831469612), T6W, T6V);
+						       }
+						  }
+					     }
+					}
+				   }
+				   T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
+				   T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
+				   T7a = VFNMS(LDK(KP303346683), T71, T72);
+				   T73 = VFMA(LDK(KP303346683), T72, T71);
+				   T6C = VFNMS(LDK(KP923879532), T6B, T6A);
+				   T76 = VFMA(LDK(KP923879532), T6B, T6A);
+				   T77 = VSUB(T6e, T6h);
+				   T6i = VADD(T6e, T6h);
+			      }
+			 }
+			 {
+			      V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T5r, T5I, T5x, T5h, T5F, T5B;
+			      {
+				   V TT, T2f, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
+				   {
+					V T1X, T2d, T7h, T7l, T2e, T1q, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
+					T2r = VFNMS(LDK(KP707106781), Tm, T7);
+					Tn = VFMA(LDK(KP707106781), Tm, T7);
+					TS = VADD(TC, TR);
+					T2D = VSUB(TR, TC);
+					{
+					     V T7b, T7j, T74, T7i, T78, T7g;
+					     T1X = VFNMS(LDK(KP198912367), T1W, T1N);
+					     T2d = VFMA(LDK(KP198912367), T1N, T1W);
+					     T7g = VADD(T79, T7a);
+					     T7b = VSUB(T79, T7a);
+					     T7j = VSUB(T73, T70);
+					     T74 = VADD(T70, T73);
+					     T7i = VFNMS(LDK(KP831469612), T77, T76);
+					     T78 = VFMA(LDK(KP831469612), T77, T76);
+					     T2j = VFNMS(LDK(KP923879532), TS, Tn);
+					     TT = VFMA(LDK(KP923879532), TS, Tn);
+					     T7h = VFMA(LDK(KP956940335), T7g, T7f);
+					     T7l = VFNMS(LDK(KP956940335), T7g, T7f);
+					     T2e = VFMA(LDK(KP198912367), T1g, T1p);
+					     T1q = VFNMS(LDK(KP198912367), T1p, T1g);
+					     T75 = VFNMS(LDK(KP956940335), T74, T6X);
+					     T7d = VFMA(LDK(KP956940335), T74, T6X);
+					     T7m = VFNMS(LDK(KP956940335), T7j, T7i);
+					     T7k = VFMA(LDK(KP956940335), T7j, T7i);
+					     T7c = VFNMS(LDK(KP956940335), T7b, T78);
+					     T7e = VFMA(LDK(KP956940335), T7b, T78);
+					}
+					T2k = VADD(T2e, T2d);
+					T2f = VSUB(T2d, T2e);
+					ST(&(xo[WS(os, 45)]), VFNMSI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 19)]), VFMAI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 51)]), VFMAI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 13)]), VFNMSI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 3)]), VFMAI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 61)]), VFNMSI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 35)]), VFMAI(T7c, T75), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 29)]), VFNMSI(T7c, T75), ovs, &(xo[WS(os, 1)]));
+					T2n = VSUB(T1X, T1q);
+					T1Y = VADD(T1q, T1X);
+					T2C = VFNMS(LDK(KP707106781), T27, T26);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T2b = VSUB(T29, T2a);
+					T2s = VADD(T2a, T29);
+				   }
+				   T2l = VFNMS(LDK(KP980785280), T2k, T2j);
+				   T2p = VFMA(LDK(KP980785280), T2k, T2j);
+				   {
+					V T5z, T4z, T5A, T5g;
+					{
+					     V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
+					     T5H = VFNMS(LDK(KP923879532), T4e, T47);
+					     T4f = VFMA(LDK(KP923879532), T4e, T47);
+					     T4y = VADD(T4o, T4x);
+					     T5T = VSUB(T4x, T4o);
+					     T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
+					     T2h = VFMA(LDK(KP980785280), T1Y, TT);
+					     T4U = VFNMS(LDK(KP098491403), T4T, T4M);
+					     T5t = VFMA(LDK(KP098491403), T4M, T4T);
+					     T2m = VFNMS(LDK(KP923879532), T2b, T28);
+					     T2c = VFMA(LDK(KP923879532), T2b, T28);
+					     T5u = VFMA(LDK(KP098491403), T57, T5e);
+					     T5f = VFNMS(LDK(KP098491403), T5e, T57);
+					     T5z = VFNMS(LDK(KP980785280), T4y, T4f);
+					     T4z = VFMA(LDK(KP980785280), T4y, T4f);
+					     T5S = VFNMS(LDK(KP923879532), T5n, T5k);
+					     T5o = VFMA(LDK(KP923879532), T5n, T5k);
+					     {
+						  V T2o, T2q, T2i, T2g;
+						  T2o = VFMA(LDK(KP980785280), T2n, T2m);
+						  T2q = VFNMS(LDK(KP980785280), T2n, T2m);
+						  T2i = VFMA(LDK(KP980785280), T2f, T2c);
+						  T2g = VFNMS(LDK(KP980785280), T2f, T2c);
+						  T5A = VADD(T5t, T5u);
+						  T5v = VSUB(T5t, T5u);
+						  T5D = VSUB(T5f, T4U);
+						  T5g = VADD(T4U, T5f);
+						  ST(&(xo[WS(os, 46)]), VFNMSI(T2o, T2l), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 18)]), VFMAI(T2o, T2l), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 50)]), VFMAI(T2q, T2p), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 14)]), VFNMSI(T2q, T2p), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 2)]), VFMAI(T2i, T2h), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 62)]), VFNMSI(T2i, T2h), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 34)]), VFMAI(T2g, T1Z), ovs, &(xo[0]));
+						  ST(&(xo[WS(os, 30)]), VFNMSI(T2g, T1Z), ovs, &(xo[0]));
+						  T5r = VSUB(T5p, T5q);
+						  T5I = VADD(T5p, T5q);
+					     }
+					}
+					T5x = VFMA(LDK(KP995184726), T5g, T4z);
+					T5h = VFNMS(LDK(KP995184726), T5g, T4z);
+					T5F = VFMA(LDK(KP995184726), T5A, T5z);
+					T5B = VFNMS(LDK(KP995184726), T5A, T5z);
+				   }
+			      }
+			      {
+				   V T6J, T6R, T6L, T6z, T6T, T6P;
+				   {
+					V T6N, T6j, T6O, T6y;
+					{
+					     V T6q, T6H, T5C, T5s, T6I, T6x;
+					     T6q = VFNMS(LDK(KP534511135), T6p, T6m);
+					     T6H = VFMA(LDK(KP534511135), T6m, T6p);
+					     T5C = VFNMS(LDK(KP980785280), T5r, T5o);
+					     T5s = VFMA(LDK(KP980785280), T5r, T5o);
+					     T6I = VFMA(LDK(KP534511135), T6t, T6w);
+					     T6x = VFNMS(LDK(KP534511135), T6w, T6t);
+					     T6N = VFMA(LDK(KP831469612), T6i, T6b);
+					     T6j = VFNMS(LDK(KP831469612), T6i, T6b);
+					     {
+						  V T5E, T5G, T5y, T5w;
+						  T5E = VFNMS(LDK(KP995184726), T5D, T5C);
+						  T5G = VFMA(LDK(KP995184726), T5D, T5C);
+						  T5y = VFMA(LDK(KP995184726), T5v, T5s);
+						  T5w = VFNMS(LDK(KP995184726), T5v, T5s);
+						  T6O = VADD(T6H, T6I);
+						  T6J = VSUB(T6H, T6I);
+						  T6R = VSUB(T6x, T6q);
+						  T6y = VADD(T6q, T6x);
+						  ST(&(xo[WS(os, 47)]), VFMAI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 17)]), VFNMSI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 49)]), VFNMSI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 15)]), VFMAI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 63)]), VFMAI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 1)]), VFNMSI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 31)]), VFMAI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
+						  ST(&(xo[WS(os, 33)]), VFNMSI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
+					     }
+					}
+					T6L = VFMA(LDK(KP881921264), T6y, T6j);
+					T6z = VFNMS(LDK(KP881921264), T6y, T6j);
+					T6T = VFMA(LDK(KP881921264), T6O, T6N);
+					T6P = VFNMS(LDK(KP881921264), T6O, T6N);
+				   }
+				   {
+					V T2H, T2P, T2J, T2B, T2R, T2N;
+					{
+					     V T2L, T2t, T2M, T2A;
+					     {
+						  V T2z, T2F, T6Q, T6G, T2G, T2w;
+						  T2z = VFMA(LDK(KP668178637), T2y, T2x);
+						  T2F = VFNMS(LDK(KP668178637), T2x, T2y);
+						  T6Q = VFMA(LDK(KP831469612), T6F, T6C);
+						  T6G = VFNMS(LDK(KP831469612), T6F, T6C);
+						  T2G = VFNMS(LDK(KP668178637), T2u, T2v);
+						  T2w = VFMA(LDK(KP668178637), T2v, T2u);
+						  T2L = VFNMS(LDK(KP923879532), T2s, T2r);
+						  T2t = VFMA(LDK(KP923879532), T2s, T2r);
+						  {
+						       V T6S, T6U, T6M, T6K;
+						       T6S = VFNMS(LDK(KP881921264), T6R, T6Q);
+						       T6U = VFMA(LDK(KP881921264), T6R, T6Q);
+						       T6M = VFMA(LDK(KP881921264), T6J, T6G);
+						       T6K = VFNMS(LDK(KP881921264), T6J, T6G);
+						       T2M = VADD(T2G, T2F);
+						       T2H = VSUB(T2F, T2G);
+						       T2P = VSUB(T2z, T2w);
+						       T2A = VADD(T2w, T2z);
+						       ST(&(xo[WS(os, 43)]), VFMAI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 21)]), VFNMSI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 53)]), VFNMSI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 11)]), VFMAI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 59)]), VFMAI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 5)]), VFNMSI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 27)]), VFMAI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
+						       ST(&(xo[WS(os, 37)]), VFNMSI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
+						  }
+					     }
+					     T2J = VFMA(LDK(KP831469612), T2A, T2t);
+					     T2B = VFNMS(LDK(KP831469612), T2A, T2t);
+					     T2R = VFNMS(LDK(KP831469612), T2M, T2L);
+					     T2N = VFMA(LDK(KP831469612), T2M, T2L);
+					}
+					{
+					     V T61, T5J, T62, T5Q;
+					     {
+						  V T5M, T5V, T2O, T2E, T5W, T5P;
+						  T5M = VFMA(LDK(KP820678790), T5L, T5K);
+						  T5V = VFNMS(LDK(KP820678790), T5K, T5L);
+						  T2O = VFMA(LDK(KP923879532), T2D, T2C);
+						  T2E = VFNMS(LDK(KP923879532), T2D, T2C);
+						  T5W = VFNMS(LDK(KP820678790), T5N, T5O);
+						  T5P = VFMA(LDK(KP820678790), T5O, T5N);
+						  T61 = VFNMS(LDK(KP980785280), T5I, T5H);
+						  T5J = VFMA(LDK(KP980785280), T5I, T5H);
+						  {
+						       V T2Q, T2S, T2K, T2I;
+						       T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
+						       T2S = VFMA(LDK(KP831469612), T2P, T2O);
+						       T2K = VFMA(LDK(KP831469612), T2H, T2E);
+						       T2I = VFNMS(LDK(KP831469612), T2H, T2E);
+						       T62 = VADD(T5V, T5W);
+						       T5X = VSUB(T5V, T5W);
+						       T65 = VSUB(T5P, T5M);
+						       T5Q = VADD(T5M, T5P);
+						       ST(&(xo[WS(os, 42)]), VFMAI(T2Q, T2N), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 22)]), VFNMSI(T2Q, T2N), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 54)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 10)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 58)]), VFMAI(T2K, T2J), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 6)]), VFNMSI(T2K, T2J), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 26)]), VFMAI(T2I, T2B), ovs, &(xo[0]));
+						       ST(&(xo[WS(os, 38)]), VFNMSI(T2I, T2B), ovs, &(xo[0]));
+						  }
+					     }
+					     T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
+					     T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
+					     T67 = VFNMS(LDK(KP773010453), T62, T61);
+					     T63 = VFMA(LDK(KP773010453), T62, T61);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5U = VFMA(LDK(KP980785280), T5T, T5S);
+	       T64 = VFNMS(LDK(KP980785280), T5T, T5S);
+	       {
+		    V T68, T66, T5Y, T60;
+		    T68 = VFNMS(LDK(KP773010453), T65, T64);
+		    T66 = VFMA(LDK(KP773010453), T65, T64);
+		    T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
+		    T60 = VFMA(LDK(KP773010453), T5X, T5U);
+		    ST(&(xo[WS(os, 41)]), VFNMSI(T66, T63), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 23)]), VFMAI(T66, T63), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 55)]), VFMAI(T68, T67), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 9)]), VFNMSI(T68, T67), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 7)]), VFMAI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 57)]), VFNMSI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 39)]), VFMAI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 25)]), VFNMSI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n1fv_64"), {198, 0, 258, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_64) (planner *p) {
+     X(kdft_register) (p, n1fv_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n1fv_64 -include n1f.h */
+
+/*
+ * This function contains 456 FP additions, 124 FP multiplications,
+ * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
+ * 108 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T4p, T5q, Tb, T39, T2n, T3A, T6f, T6T, Tq, T3B, T6i, T76, T2i, T3a, T4w;
+	       V T5r, TI, T2p, T6C, T6V, T3h, T3E, T4L, T5u, TZ, T2q, T6F, T6U, T3e, T3D;
+	       V T4E, T5t, T23, T2N, T6t, T71, T6w, T72, T2c, T2O, T3t, T41, T5f, T5R, T5k;
+	       V T5S, T3w, T42, T1s, T2K, T6m, T6Y, T6p, T6Z, T1B, T2L, T3m, T3Y, T4Y, T5O;
+	       V T53, T5P, T3p, T3Z;
+	       {
+		    V T3, T4n, T2m, T4o, T6, T5p, T9, T5o;
+		    {
+			 V T1, T2, T2k, T2l;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T4n = VADD(T1, T2);
+			 T2k = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T2l = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T2m = VSUB(T2k, T2l);
+			 T4o = VADD(T2k, T2l);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T5p = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T5o = VADD(T7, T8);
+		    }
+		    T4p = VSUB(T4n, T4o);
+		    T5q = VSUB(T5o, T5p);
+		    {
+			 V Ta, T2j, T6d, T6e;
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tb = VADD(T3, Ta);
+			 T39 = VSUB(T3, Ta);
+			 T2j = VMUL(LDK(KP707106781), VSUB(T9, T6));
+			 T2n = VSUB(T2j, T2m);
+			 T3A = VADD(T2m, T2j);
+			 T6d = VADD(T4n, T4o);
+			 T6e = VADD(T5p, T5o);
+			 T6f = VADD(T6d, T6e);
+			 T6T = VSUB(T6d, T6e);
+		    }
+	       }
+	       {
+		    V Te, T4q, To, T4u, Th, T4r, Tl, T4t;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T4q = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T4u = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T4r = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T4t = VADD(Tj, Tk);
+		    }
+		    {
+			 V Ti, Tp, T6g, T6h;
+			 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
+			 Tq = VADD(Ti, Tp);
+			 T3B = VSUB(Tp, Ti);
+			 T6g = VADD(T4q, T4r);
+			 T6h = VADD(T4t, T4u);
+			 T6i = VADD(T6g, T6h);
+			 T76 = VSUB(T6h, T6g);
+		    }
+		    {
+			 V T2g, T2h, T4s, T4v;
+			 T2g = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T2h = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 T2i = VSUB(T2g, T2h);
+			 T3a = VADD(T2h, T2g);
+			 T4s = VSUB(T4q, T4r);
+			 T4v = VSUB(T4t, T4u);
+			 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
+			 T5r = VMUL(LDK(KP707106781), VSUB(T4v, T4s));
+		    }
+	       }
+	       {
+		    V Tu, T4F, TG, T4G, TB, T4J, TD, T4I;
+		    {
+			 V Ts, Tt, TE, TF;
+			 Ts = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 Tt = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tu = VSUB(Ts, Tt);
+			 T4F = VADD(Ts, Tt);
+			 TE = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 TF = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 TG = VSUB(TE, TF);
+			 T4G = VADD(TE, TF);
+			 {
+			      V Tv, Tw, Tx, Ty, Tz, TA;
+			      Tv = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      Tw = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      Tx = VSUB(Tv, Tw);
+			      Ty = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      Tz = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      TA = VSUB(Ty, Tz);
+			      TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
+			      T4J = VADD(Tv, Tw);
+			      TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
+			      T4I = VADD(Ty, Tz);
+			 }
+		    }
+		    {
+			 V TC, TH, T6A, T6B;
+			 TC = VADD(Tu, TB);
+			 TH = VSUB(TD, TG);
+			 TI = VFMA(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
+			 T2p = VFNMS(LDK(KP195090322), TH, VMUL(LDK(KP980785280), TC));
+			 T6A = VADD(T4F, T4G);
+			 T6B = VADD(T4J, T4I);
+			 T6C = VADD(T6A, T6B);
+			 T6V = VSUB(T6A, T6B);
+		    }
+		    {
+			 V T3f, T3g, T4H, T4K;
+			 T3f = VSUB(Tu, TB);
+			 T3g = VADD(TG, TD);
+			 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
+			 T3E = VFMA(LDK(KP555570233), T3f, VMUL(LDK(KP831469612), T3g));
+			 T4H = VSUB(T4F, T4G);
+			 T4K = VSUB(T4I, T4J);
+			 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
+			 T5u = VFMA(LDK(KP382683432), T4H, VMUL(LDK(KP923879532), T4K));
+		    }
+	       }
+	       {
+		    V TS, T4z, TW, T4y, TP, T4C, TX, T4B;
+		    {
+			 V TQ, TR, TU, TV;
+			 TQ = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 TR = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 TS = VSUB(TQ, TR);
+			 T4z = VADD(TQ, TR);
+			 TU = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 TV = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 TW = VSUB(TU, TV);
+			 T4y = VADD(TU, TV);
+			 {
+			      V TJ, TK, TL, TM, TN, TO;
+			      TJ = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      TK = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      TL = VSUB(TJ, TK);
+			      TM = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      TN = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      TO = VSUB(TM, TN);
+			      TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			      T4C = VADD(TM, TN);
+			      TX = VMUL(LDK(KP707106781), VADD(TO, TL));
+			      T4B = VADD(TJ, TK);
+			 }
+		    }
+		    {
+			 V TT, TY, T6D, T6E;
+			 TT = VSUB(TP, TS);
+			 TY = VADD(TW, TX);
+			 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
+			 T2q = VFMA(LDK(KP980785280), TY, VMUL(LDK(KP195090322), TT));
+			 T6D = VADD(T4y, T4z);
+			 T6E = VADD(T4C, T4B);
+			 T6F = VADD(T6D, T6E);
+			 T6U = VSUB(T6D, T6E);
+		    }
+		    {
+			 V T3c, T3d, T4A, T4D;
+			 T3c = VSUB(TW, TX);
+			 T3d = VADD(TS, TP);
+			 T3e = VFMA(LDK(KP831469612), T3c, VMUL(LDK(KP555570233), T3d));
+			 T3D = VFNMS(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
+			 T4A = VSUB(T4y, T4z);
+			 T4D = VSUB(T4B, T4C);
+			 T4E = VFMA(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4D));
+			 T5t = VFNMS(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
+		    }
+	       }
+	       {
+		    V T1F, T55, T2a, T56, T1M, T5h, T27, T5g, T58, T59, T1U, T5a, T25, T5b, T5c;
+		    V T21, T5d, T24;
+		    {
+			 V T1D, T1E, T28, T29;
+			 T1D = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 T1E = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T1F = VSUB(T1D, T1E);
+			 T55 = VADD(T1D, T1E);
+			 T28 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T29 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T2a = VSUB(T28, T29);
+			 T56 = VADD(T28, T29);
+		    }
+		    {
+			 V T1G, T1H, T1I, T1J, T1K, T1L;
+			 T1G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T1H = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			 T1I = VSUB(T1G, T1H);
+			 T1J = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			 T1K = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 T1L = VSUB(T1J, T1K);
+			 T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
+			 T5h = VADD(T1G, T1H);
+			 T27 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
+			 T5g = VADD(T1J, T1K);
+		    }
+		    {
+			 V T1Q, T1T, T1X, T20;
+			 {
+			      V T1O, T1P, T1R, T1S;
+			      T1O = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      T1P = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			      T1Q = VSUB(T1O, T1P);
+			      T58 = VADD(T1O, T1P);
+			      T1R = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T1S = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T1T = VSUB(T1R, T1S);
+			      T59 = VADD(T1R, T1S);
+			 }
+			 T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
+			 T5a = VSUB(T58, T59);
+			 T25 = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
+			 {
+			      V T1V, T1W, T1Y, T1Z;
+			      T1V = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			      T1W = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			      T1X = VSUB(T1V, T1W);
+			      T5b = VADD(T1V, T1W);
+			      T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T1Z = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T20 = VSUB(T1Y, T1Z);
+			      T5c = VADD(T1Y, T1Z);
+			 }
+			 T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
+			 T5d = VSUB(T5b, T5c);
+			 T24 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
+		    }
+		    {
+			 V T1N, T22, T6r, T6s;
+			 T1N = VADD(T1F, T1M);
+			 T22 = VADD(T1U, T21);
+			 T23 = VSUB(T1N, T22);
+			 T2N = VADD(T1N, T22);
+			 T6r = VADD(T55, T56);
+			 T6s = VADD(T5h, T5g);
+			 T6t = VADD(T6r, T6s);
+			 T71 = VSUB(T6r, T6s);
+		    }
+		    {
+			 V T6u, T6v, T26, T2b;
+			 T6u = VADD(T58, T59);
+			 T6v = VADD(T5b, T5c);
+			 T6w = VADD(T6u, T6v);
+			 T72 = VSUB(T6v, T6u);
+			 T26 = VSUB(T24, T25);
+			 T2b = VSUB(T27, T2a);
+			 T2c = VSUB(T26, T2b);
+			 T2O = VADD(T2b, T26);
+		    }
+		    {
+			 V T3r, T3s, T57, T5e;
+			 T3r = VSUB(T1F, T1M);
+			 T3s = VADD(T25, T24);
+			 T3t = VADD(T3r, T3s);
+			 T41 = VSUB(T3r, T3s);
+			 T57 = VSUB(T55, T56);
+			 T5e = VMUL(LDK(KP707106781), VADD(T5a, T5d));
+			 T5f = VADD(T57, T5e);
+			 T5R = VSUB(T57, T5e);
+		    }
+		    {
+			 V T5i, T5j, T3u, T3v;
+			 T5i = VSUB(T5g, T5h);
+			 T5j = VMUL(LDK(KP707106781), VSUB(T5d, T5a));
+			 T5k = VADD(T5i, T5j);
+			 T5S = VSUB(T5j, T5i);
+			 T3u = VADD(T2a, T27);
+			 T3v = VSUB(T21, T1U);
+			 T3w = VADD(T3u, T3v);
+			 T42 = VSUB(T3v, T3u);
+		    }
+	       }
+	       {
+		    V T1q, T4P, T1v, T4O, T1n, T50, T1w, T4Z, T4U, T4V, T18, T4W, T1z, T4R, T4S;
+		    V T1f, T4T, T1y;
+		    {
+			 V T1o, T1p, T1t, T1u;
+			 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T1q = VSUB(T1o, T1p);
+			 T4P = VADD(T1o, T1p);
+			 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T1v = VSUB(T1t, T1u);
+			 T4O = VADD(T1t, T1u);
+		    }
+		    {
+			 V T1h, T1i, T1j, T1k, T1l, T1m;
+			 T1h = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			 T1i = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 T1j = VSUB(T1h, T1i);
+			 T1k = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T1l = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			 T1m = VSUB(T1k, T1l);
+			 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
+			 T50 = VADD(T1k, T1l);
+			 T1w = VMUL(LDK(KP707106781), VADD(T1m, T1j));
+			 T4Z = VADD(T1h, T1i);
+		    }
+		    {
+			 V T14, T17, T1b, T1e;
+			 {
+			      V T12, T13, T15, T16;
+			      T12 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			      T13 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			      T14 = VSUB(T12, T13);
+			      T4U = VADD(T12, T13);
+			      T15 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T16 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      T17 = VSUB(T15, T16);
+			      T4V = VADD(T15, T16);
+			 }
+			 T18 = VFNMS(LDK(KP923879532), T17, VMUL(LDK(KP382683432), T14));
+			 T4W = VSUB(T4U, T4V);
+			 T1z = VFMA(LDK(KP923879532), T14, VMUL(LDK(KP382683432), T17));
+			 {
+			      V T19, T1a, T1c, T1d;
+			      T19 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      T1a = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			      T1b = VSUB(T19, T1a);
+			      T4R = VADD(T19, T1a);
+			      T1c = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      T1d = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      T1e = VSUB(T1c, T1d);
+			      T4S = VADD(T1c, T1d);
+			 }
+			 T1f = VFMA(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
+			 T4T = VSUB(T4R, T4S);
+			 T1y = VFNMS(LDK(KP382683432), T1e, VMUL(LDK(KP923879532), T1b));
+		    }
+		    {
+			 V T1g, T1r, T6k, T6l;
+			 T1g = VSUB(T18, T1f);
+			 T1r = VSUB(T1n, T1q);
+			 T1s = VSUB(T1g, T1r);
+			 T2K = VADD(T1r, T1g);
+			 T6k = VADD(T4O, T4P);
+			 T6l = VADD(T50, T4Z);
+			 T6m = VADD(T6k, T6l);
+			 T6Y = VSUB(T6k, T6l);
+		    }
+		    {
+			 V T6n, T6o, T1x, T1A;
+			 T6n = VADD(T4R, T4S);
+			 T6o = VADD(T4U, T4V);
+			 T6p = VADD(T6n, T6o);
+			 T6Z = VSUB(T6o, T6n);
+			 T1x = VADD(T1v, T1w);
+			 T1A = VADD(T1y, T1z);
+			 T1B = VSUB(T1x, T1A);
+			 T2L = VADD(T1x, T1A);
+		    }
+		    {
+			 V T3k, T3l, T4Q, T4X;
+			 T3k = VSUB(T1v, T1w);
+			 T3l = VADD(T1f, T18);
+			 T3m = VADD(T3k, T3l);
+			 T3Y = VSUB(T3k, T3l);
+			 T4Q = VSUB(T4O, T4P);
+			 T4X = VMUL(LDK(KP707106781), VADD(T4T, T4W));
+			 T4Y = VADD(T4Q, T4X);
+			 T5O = VSUB(T4Q, T4X);
+		    }
+		    {
+			 V T51, T52, T3n, T3o;
+			 T51 = VSUB(T4Z, T50);
+			 T52 = VMUL(LDK(KP707106781), VSUB(T4W, T4T));
+			 T53 = VADD(T51, T52);
+			 T5P = VSUB(T52, T51);
+			 T3n = VADD(T1q, T1n);
+			 T3o = VSUB(T1z, T1y);
+			 T3p = VADD(T3n, T3o);
+			 T3Z = VSUB(T3o, T3n);
+		    }
+	       }
+	       {
+		    V T6N, T6R, T6Q, T6S;
+		    {
+			 V T6L, T6M, T6O, T6P;
+			 T6L = VADD(T6f, T6i);
+			 T6M = VADD(T6F, T6C);
+			 T6N = VADD(T6L, T6M);
+			 T6R = VSUB(T6L, T6M);
+			 T6O = VADD(T6m, T6p);
+			 T6P = VADD(T6t, T6w);
+			 T6Q = VADD(T6O, T6P);
+			 T6S = VBYI(VSUB(T6P, T6O));
+		    }
+		    ST(&(xo[WS(os, 32)]), VSUB(T6N, T6Q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 16)]), VADD(T6R, T6S), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(T6N, T6Q), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 48)]), VSUB(T6R, T6S), ovs, &(xo[0]));
+	       }
+	       {
+		    V T6j, T6G, T6y, T6H, T6q, T6x;
+		    T6j = VSUB(T6f, T6i);
+		    T6G = VSUB(T6C, T6F);
+		    T6q = VSUB(T6m, T6p);
+		    T6x = VSUB(T6t, T6w);
+		    T6y = VMUL(LDK(KP707106781), VADD(T6q, T6x));
+		    T6H = VMUL(LDK(KP707106781), VSUB(T6x, T6q));
+		    {
+			 V T6z, T6I, T6J, T6K;
+			 T6z = VADD(T6j, T6y);
+			 T6I = VBYI(VADD(T6G, T6H));
+			 ST(&(xo[WS(os, 56)]), VSUB(T6z, T6I), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 8)]), VADD(T6z, T6I), ovs, &(xo[0]));
+			 T6J = VSUB(T6j, T6y);
+			 T6K = VBYI(VSUB(T6H, T6G));
+			 ST(&(xo[WS(os, 40)]), VSUB(T6J, T6K), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 24)]), VADD(T6J, T6K), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T6X, T7i, T78, T7g, T74, T7f, T7b, T7j, T6W, T77;
+		    T6W = VMUL(LDK(KP707106781), VADD(T6U, T6V));
+		    T6X = VADD(T6T, T6W);
+		    T7i = VSUB(T6T, T6W);
+		    T77 = VMUL(LDK(KP707106781), VSUB(T6V, T6U));
+		    T78 = VADD(T76, T77);
+		    T7g = VSUB(T77, T76);
+		    {
+			 V T70, T73, T79, T7a;
+			 T70 = VFMA(LDK(KP923879532), T6Y, VMUL(LDK(KP382683432), T6Z));
+			 T73 = VFNMS(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T71));
+			 T74 = VADD(T70, T73);
+			 T7f = VSUB(T73, T70);
+			 T79 = VFNMS(LDK(KP382683432), T6Y, VMUL(LDK(KP923879532), T6Z));
+			 T7a = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T72));
+			 T7b = VADD(T79, T7a);
+			 T7j = VSUB(T7a, T79);
+		    }
+		    {
+			 V T75, T7c, T7l, T7m;
+			 T75 = VADD(T6X, T74);
+			 T7c = VBYI(VADD(T78, T7b));
+			 ST(&(xo[WS(os, 60)]), VSUB(T75, T7c), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 4)]), VADD(T75, T7c), ovs, &(xo[0]));
+			 T7l = VBYI(VADD(T7g, T7f));
+			 T7m = VADD(T7i, T7j);
+			 ST(&(xo[WS(os, 12)]), VADD(T7l, T7m), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 52)]), VSUB(T7m, T7l), ovs, &(xo[0]));
+		    }
+		    {
+			 V T7d, T7e, T7h, T7k;
+			 T7d = VSUB(T6X, T74);
+			 T7e = VBYI(VSUB(T7b, T78));
+			 ST(&(xo[WS(os, 36)]), VSUB(T7d, T7e), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 28)]), VADD(T7d, T7e), ovs, &(xo[0]));
+			 T7h = VBYI(VSUB(T7f, T7g));
+			 T7k = VSUB(T7i, T7j);
+			 ST(&(xo[WS(os, 20)]), VADD(T7h, T7k), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 44)]), VSUB(T7k, T7h), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
+		    {
+			 V T5L, T5M, T5Z, T60;
+			 T5L = VSUB(T4p, T4w);
+			 T5M = VSUB(T5u, T5t);
+			 T5N = VADD(T5L, T5M);
+			 T68 = VSUB(T5L, T5M);
+			 T5Z = VFNMS(LDK(KP555570233), T5O, VMUL(LDK(KP831469612), T5P));
+			 T60 = VFMA(LDK(KP555570233), T5R, VMUL(LDK(KP831469612), T5S));
+			 T61 = VADD(T5Z, T60);
+			 T69 = VSUB(T60, T5Z);
+		    }
+		    {
+			 V T5Q, T5T, T5W, T5X;
+			 T5Q = VFMA(LDK(KP831469612), T5O, VMUL(LDK(KP555570233), T5P));
+			 T5T = VFNMS(LDK(KP555570233), T5S, VMUL(LDK(KP831469612), T5R));
+			 T5U = VADD(T5Q, T5T);
+			 T65 = VSUB(T5T, T5Q);
+			 T5W = VSUB(T5r, T5q);
+			 T5X = VSUB(T4L, T4E);
+			 T5Y = VADD(T5W, T5X);
+			 T66 = VSUB(T5X, T5W);
+		    }
+		    {
+			 V T5V, T62, T6b, T6c;
+			 T5V = VADD(T5N, T5U);
+			 T62 = VBYI(VADD(T5Y, T61));
+			 ST(&(xo[WS(os, 58)]), VSUB(T5V, T62), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 6)]), VADD(T5V, T62), ovs, &(xo[0]));
+			 T6b = VBYI(VADD(T66, T65));
+			 T6c = VADD(T68, T69);
+			 ST(&(xo[WS(os, 10)]), VADD(T6b, T6c), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 54)]), VSUB(T6c, T6b), ovs, &(xo[0]));
+		    }
+		    {
+			 V T63, T64, T67, T6a;
+			 T63 = VSUB(T5N, T5U);
+			 T64 = VBYI(VSUB(T61, T5Y));
+			 ST(&(xo[WS(os, 38)]), VSUB(T63, T64), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 26)]), VADD(T63, T64), ovs, &(xo[0]));
+			 T67 = VBYI(VSUB(T65, T66));
+			 T6a = VSUB(T68, T69);
+			 ST(&(xo[WS(os, 22)]), VADD(T67, T6a), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 42)]), VSUB(T6a, T67), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
+		    {
+			 V Tr, T10, T2t, T2u;
+			 Tr = VSUB(Tb, Tq);
+			 T10 = VSUB(TI, TZ);
+			 T11 = VADD(Tr, T10);
+			 T2C = VSUB(Tr, T10);
+			 T2t = VFNMS(LDK(KP634393284), T1B, VMUL(LDK(KP773010453), T1s));
+			 T2u = VFMA(LDK(KP773010453), T2c, VMUL(LDK(KP634393284), T23));
+			 T2v = VADD(T2t, T2u);
+			 T2D = VSUB(T2u, T2t);
+		    }
+		    {
+			 V T1C, T2d, T2o, T2r;
+			 T1C = VFMA(LDK(KP634393284), T1s, VMUL(LDK(KP773010453), T1B));
+			 T2d = VFNMS(LDK(KP634393284), T2c, VMUL(LDK(KP773010453), T23));
+			 T2e = VADD(T1C, T2d);
+			 T2z = VSUB(T2d, T1C);
+			 T2o = VSUB(T2i, T2n);
+			 T2r = VSUB(T2p, T2q);
+			 T2s = VADD(T2o, T2r);
+			 T2A = VSUB(T2r, T2o);
+		    }
+		    {
+			 V T2f, T2w, T2F, T2G;
+			 T2f = VADD(T11, T2e);
+			 T2w = VBYI(VADD(T2s, T2v));
+			 ST(&(xo[WS(os, 57)]), VSUB(T2f, T2w), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 7)]), VADD(T2f, T2w), ovs, &(xo[WS(os, 1)]));
+			 T2F = VBYI(VADD(T2A, T2z));
+			 T2G = VADD(T2C, T2D);
+			 ST(&(xo[WS(os, 9)]), VADD(T2F, T2G), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 55)]), VSUB(T2G, T2F), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T2x, T2y, T2B, T2E;
+			 T2x = VSUB(T11, T2e);
+			 T2y = VBYI(VSUB(T2v, T2s));
+			 ST(&(xo[WS(os, 39)]), VSUB(T2x, T2y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 25)]), VADD(T2x, T2y), ovs, &(xo[WS(os, 1)]));
+			 T2B = VBYI(VSUB(T2z, T2A));
+			 T2E = VSUB(T2C, T2D);
+			 ST(&(xo[WS(os, 23)]), VADD(T2B, T2E), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 41)]), VSUB(T2E, T2B), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T3j, T3Q, T3J, T3R, T3y, T3N, T3G, T3O;
+		    {
+			 V T3b, T3i, T3H, T3I;
+			 T3b = VADD(T39, T3a);
+			 T3i = VADD(T3e, T3h);
+			 T3j = VADD(T3b, T3i);
+			 T3Q = VSUB(T3b, T3i);
+			 T3H = VFNMS(LDK(KP290284677), T3m, VMUL(LDK(KP956940335), T3p));
+			 T3I = VFMA(LDK(KP290284677), T3t, VMUL(LDK(KP956940335), T3w));
+			 T3J = VADD(T3H, T3I);
+			 T3R = VSUB(T3I, T3H);
+		    }
+		    {
+			 V T3q, T3x, T3C, T3F;
+			 T3q = VFMA(LDK(KP956940335), T3m, VMUL(LDK(KP290284677), T3p));
+			 T3x = VFNMS(LDK(KP290284677), T3w, VMUL(LDK(KP956940335), T3t));
+			 T3y = VADD(T3q, T3x);
+			 T3N = VSUB(T3x, T3q);
+			 T3C = VADD(T3A, T3B);
+			 T3F = VADD(T3D, T3E);
+			 T3G = VADD(T3C, T3F);
+			 T3O = VSUB(T3F, T3C);
+		    }
+		    {
+			 V T3z, T3K, T3T, T3U;
+			 T3z = VADD(T3j, T3y);
+			 T3K = VBYI(VADD(T3G, T3J));
+			 ST(&(xo[WS(os, 61)]), VSUB(T3z, T3K), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 3)]), VADD(T3z, T3K), ovs, &(xo[WS(os, 1)]));
+			 T3T = VBYI(VADD(T3O, T3N));
+			 T3U = VADD(T3Q, T3R);
+			 ST(&(xo[WS(os, 13)]), VADD(T3T, T3U), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 51)]), VSUB(T3U, T3T), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T3L, T3M, T3P, T3S;
+			 T3L = VSUB(T3j, T3y);
+			 T3M = VBYI(VSUB(T3J, T3G));
+			 ST(&(xo[WS(os, 35)]), VSUB(T3L, T3M), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 29)]), VADD(T3L, T3M), ovs, &(xo[WS(os, 1)]));
+			 T3P = VBYI(VSUB(T3N, T3O));
+			 T3S = VSUB(T3Q, T3R);
+			 ST(&(xo[WS(os, 19)]), VADD(T3P, T3S), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 45)]), VSUB(T3S, T3P), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
+		    {
+			 V T4x, T4M, T5x, T5y;
+			 T4x = VADD(T4p, T4w);
+			 T4M = VADD(T4E, T4L);
+			 T4N = VADD(T4x, T4M);
+			 T5G = VSUB(T4x, T4M);
+			 T5x = VFNMS(LDK(KP195090322), T4Y, VMUL(LDK(KP980785280), T53));
+			 T5y = VFMA(LDK(KP195090322), T5f, VMUL(LDK(KP980785280), T5k));
+			 T5z = VADD(T5x, T5y);
+			 T5H = VSUB(T5y, T5x);
+		    }
+		    {
+			 V T54, T5l, T5s, T5v;
+			 T54 = VFMA(LDK(KP980785280), T4Y, VMUL(LDK(KP195090322), T53));
+			 T5l = VFNMS(LDK(KP195090322), T5k, VMUL(LDK(KP980785280), T5f));
+			 T5m = VADD(T54, T5l);
+			 T5D = VSUB(T5l, T54);
+			 T5s = VADD(T5q, T5r);
+			 T5v = VADD(T5t, T5u);
+			 T5w = VADD(T5s, T5v);
+			 T5E = VSUB(T5v, T5s);
+		    }
+		    {
+			 V T5n, T5A, T5J, T5K;
+			 T5n = VADD(T4N, T5m);
+			 T5A = VBYI(VADD(T5w, T5z));
+			 ST(&(xo[WS(os, 62)]), VSUB(T5n, T5A), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 2)]), VADD(T5n, T5A), ovs, &(xo[0]));
+			 T5J = VBYI(VADD(T5E, T5D));
+			 T5K = VADD(T5G, T5H);
+			 ST(&(xo[WS(os, 14)]), VADD(T5J, T5K), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 50)]), VSUB(T5K, T5J), ovs, &(xo[0]));
+		    }
+		    {
+			 V T5B, T5C, T5F, T5I;
+			 T5B = VSUB(T4N, T5m);
+			 T5C = VBYI(VSUB(T5z, T5w));
+			 ST(&(xo[WS(os, 34)]), VSUB(T5B, T5C), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 30)]), VADD(T5B, T5C), ovs, &(xo[0]));
+			 T5F = VBYI(VSUB(T5D, T5E));
+			 T5I = VSUB(T5G, T5H);
+			 ST(&(xo[WS(os, 18)]), VADD(T5F, T5I), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 46)]), VSUB(T5I, T5F), ovs, &(xo[0]));
+		    }
+	       }
+	       {
+		    V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
+		    {
+			 V T2H, T2I, T2V, T2W;
+			 T2H = VADD(Tb, Tq);
+			 T2I = VADD(T2q, T2p);
+			 T2J = VADD(T2H, T2I);
+			 T34 = VSUB(T2H, T2I);
+			 T2V = VFNMS(LDK(KP098017140), T2L, VMUL(LDK(KP995184726), T2K));
+			 T2W = VFMA(LDK(KP995184726), T2O, VMUL(LDK(KP098017140), T2N));
+			 T2X = VADD(T2V, T2W);
+			 T35 = VSUB(T2W, T2V);
+		    }
+		    {
+			 V T2M, T2P, T2S, T2T;
+			 T2M = VFMA(LDK(KP098017140), T2K, VMUL(LDK(KP995184726), T2L));
+			 T2P = VFNMS(LDK(KP098017140), T2O, VMUL(LDK(KP995184726), T2N));
+			 T2Q = VADD(T2M, T2P);
+			 T31 = VSUB(T2P, T2M);
+			 T2S = VADD(T2n, T2i);
+			 T2T = VADD(TZ, TI);
+			 T2U = VADD(T2S, T2T);
+			 T32 = VSUB(T2T, T2S);
+		    }
+		    {
+			 V T2R, T2Y, T37, T38;
+			 T2R = VADD(T2J, T2Q);
+			 T2Y = VBYI(VADD(T2U, T2X));
+			 ST(&(xo[WS(os, 63)]), VSUB(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 1)]), VADD(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
+			 T37 = VBYI(VADD(T32, T31));
+			 T38 = VADD(T34, T35);
+			 ST(&(xo[WS(os, 15)]), VADD(T37, T38), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 49)]), VSUB(T38, T37), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T2Z, T30, T33, T36;
+			 T2Z = VSUB(T2J, T2Q);
+			 T30 = VBYI(VSUB(T2X, T2U));
+			 ST(&(xo[WS(os, 33)]), VSUB(T2Z, T30), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 31)]), VADD(T2Z, T30), ovs, &(xo[WS(os, 1)]));
+			 T33 = VBYI(VSUB(T31, T32));
+			 T36 = VSUB(T34, T35);
+			 ST(&(xo[WS(os, 17)]), VADD(T33, T36), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 47)]), VSUB(T36, T33), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	       {
+		    V T3X, T4i, T4b, T4j, T44, T4f, T48, T4g;
+		    {
+			 V T3V, T3W, T49, T4a;
+			 T3V = VSUB(T39, T3a);
+			 T3W = VSUB(T3E, T3D);
+			 T3X = VADD(T3V, T3W);
+			 T4i = VSUB(T3V, T3W);
+			 T49 = VFNMS(LDK(KP471396736), T3Y, VMUL(LDK(KP881921264), T3Z));
+			 T4a = VFMA(LDK(KP471396736), T41, VMUL(LDK(KP881921264), T42));
+			 T4b = VADD(T49, T4a);
+			 T4j = VSUB(T4a, T49);
+		    }
+		    {
+			 V T40, T43, T46, T47;
+			 T40 = VFMA(LDK(KP881921264), T3Y, VMUL(LDK(KP471396736), T3Z));
+			 T43 = VFNMS(LDK(KP471396736), T42, VMUL(LDK(KP881921264), T41));
+			 T44 = VADD(T40, T43);
+			 T4f = VSUB(T43, T40);
+			 T46 = VSUB(T3B, T3A);
+			 T47 = VSUB(T3h, T3e);
+			 T48 = VADD(T46, T47);
+			 T4g = VSUB(T47, T46);
+		    }
+		    {
+			 V T45, T4c, T4l, T4m;
+			 T45 = VADD(T3X, T44);
+			 T4c = VBYI(VADD(T48, T4b));
+			 ST(&(xo[WS(os, 59)]), VSUB(T45, T4c), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VADD(T45, T4c), ovs, &(xo[WS(os, 1)]));
+			 T4l = VBYI(VADD(T4g, T4f));
+			 T4m = VADD(T4i, T4j);
+			 ST(&(xo[WS(os, 11)]), VADD(T4l, T4m), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 53)]), VSUB(T4m, T4l), ovs, &(xo[WS(os, 1)]));
+		    }
+		    {
+			 V T4d, T4e, T4h, T4k;
+			 T4d = VSUB(T3X, T44);
+			 T4e = VBYI(VSUB(T4b, T48));
+			 ST(&(xo[WS(os, 37)]), VSUB(T4d, T4e), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 27)]), VADD(T4d, T4e), ovs, &(xo[WS(os, 1)]));
+			 T4h = VBYI(VSUB(T4f, T4g));
+			 T4k = VSUB(T4i, T4j);
+			 ST(&(xo[WS(os, 21)]), VADD(T4h, T4k), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 43)]), VSUB(T4k, T4h), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n1fv_64"), {404, 72, 52, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_64) (planner *p) {
+     X(kdft_register) (p, n1fv_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name n1fv_7 -include n1f.h */
+
+/*
+ * This function contains 30 FP additions, 24 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 21 fused multiply/add),
+ * 37 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(14, is), MAKE_VOLATILE_STRIDE(14, os)) {
+	       V T1, T2, T3, T8, T9, T5, T6;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Te, T4, Tf, Ta, Tg, T7;
+		    Te = VSUB(T3, T2);
+		    T4 = VADD(T2, T3);
+		    Tf = VSUB(T9, T8);
+		    Ta = VADD(T8, T9);
+		    Tg = VSUB(T6, T5);
+		    T7 = VADD(T5, T6);
+		    {
+			 V Tm, Tb, Tr, Th, Tj, To;
+			 Tm = VFMA(LDK(KP554958132), Tf, Te);
+			 Tb = VFNMS(LDK(KP356895867), T4, Ta);
+			 Tr = VFNMS(LDK(KP554958132), Te, Tg);
+			 Th = VFMA(LDK(KP554958132), Tg, Tf);
+			 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, Ta))), ovs, &(xo[0]));
+			 Tj = VFNMS(LDK(KP356895867), T7, T4);
+			 To = VFNMS(LDK(KP356895867), Ta, T7);
+			 {
+			      V Tn, Tc, Ts, Ti;
+			      Tn = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Tm, Tg));
+			      Tc = VFNMS(LDK(KP692021471), Tb, T7);
+			      Ts = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tr, Tf));
+			      Ti = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Th, Te));
+			      {
+				   V Tk, Tp, Td, Tl, Tq;
+				   Tk = VFNMS(LDK(KP692021471), Tj, Ta);
+				   Tp = VFNMS(LDK(KP692021471), To, T4);
+				   Td = VFNMS(LDK(KP900968867), Tc, T1);
+				   Tl = VFNMS(LDK(KP900968867), Tk, T1);
+				   Tq = VFNMS(LDK(KP900968867), Tp, T1);
+				   ST(&(xo[WS(os, 2)]), VFMAI(Ti, Td), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 5)]), VFNMSI(Ti, Td), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 1)]), VFMAI(Tn, Tl), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 6)]), VFNMSI(Tn, Tl), ovs, &(xo[0]));
+				   ST(&(xo[WS(os, 3)]), VFMAI(Ts, Tq), ovs, &(xo[WS(os, 1)]));
+				   ST(&(xo[WS(os, 4)]), VFNMSI(Ts, Tq), ovs, &(xo[0]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 7, XSIMD_STRING("n1fv_7"), {9, 3, 21, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_7) (planner *p) {
+     X(kdft_register) (p, n1fv_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name n1fv_7 -include n1f.h */
+
+/*
+ * This function contains 30 FP additions, 18 FP multiplications,
+ * (or, 18 additions, 6 multiplications, 12 fused multiply/add),
+ * 24 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(14, is), MAKE_VOLATILE_STRIDE(14, os)) {
+	       V T1, Ta, Td, T4, Tc, T7, Te, T8, T9, Tj, Ti;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Ta = VADD(T8, T9);
+	       Td = VSUB(T9, T8);
+	       {
+		    V T2, T3, T5, T6;
+		    T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T4 = VADD(T2, T3);
+		    Tc = VSUB(T3, T2);
+		    T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = VADD(T5, T6);
+		    Te = VSUB(T6, T5);
+	       }
+	       ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, Ta))), ovs, &(xo[0]));
+	       Tj = VBYI(VFMA(LDK(KP433883739), Tc, VFNMS(LDK(KP781831482), Te, VMUL(LDK(KP974927912), Td))));
+	       Ti = VFMA(LDK(KP623489801), T7, VFNMS(LDK(KP222520933), Ta, VFNMS(LDK(KP900968867), T4, T1)));
+	       ST(&(xo[WS(os, 4)]), VSUB(Ti, Tj), ovs, &(xo[0]));
+	       ST(&(xo[WS(os, 3)]), VADD(Ti, Tj), ovs, &(xo[WS(os, 1)]));
+	       {
+		    V Tf, Tb, Th, Tg;
+		    Tf = VBYI(VFNMS(LDK(KP781831482), Td, VFNMS(LDK(KP433883739), Te, VMUL(LDK(KP974927912), Tc))));
+		    Tb = VFMA(LDK(KP623489801), Ta, VFNMS(LDK(KP900968867), T7, VFNMS(LDK(KP222520933), T4, T1)));
+		    ST(&(xo[WS(os, 5)]), VSUB(Tb, Tf), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tb, Tf), ovs, &(xo[0]));
+		    Th = VBYI(VFMA(LDK(KP781831482), Tc, VFMA(LDK(KP974927912), Te, VMUL(LDK(KP433883739), Td))));
+		    Tg = VFMA(LDK(KP623489801), T4, VFNMS(LDK(KP900968867), Ta, VFNMS(LDK(KP222520933), T7, T1)));
+		    ST(&(xo[WS(os, 6)]), VSUB(Tg, Th), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tg, Th), ovs, &(xo[WS(os, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 7, XSIMD_STRING("n1fv_7"), {18, 6, 12, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_7) (planner *p) {
+     X(kdft_register) (p, n1fv_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n1fv_8 -include n1f.h */
+
+/*
+ * This function contains 26 FP additions, 10 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 10 fused multiply/add),
+ * 30 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T1, T2, Tc, Td, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Tj, Te, Tk, T6, Tm, T9, Tn, Tp, Tl;
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    T6 = VSUB(T4, T5);
+		    Tm = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tn = VADD(T7, T8);
+		    Tp = VSUB(Tj, Tk);
+		    Tl = VADD(Tj, Tk);
+		    {
+			 V Tq, To, Ta, Tf;
+			 Tq = VSUB(Tn, Tm);
+			 To = VADD(Tm, Tn);
+			 Ta = VADD(T6, T9);
+			 Tf = VSUB(T9, T6);
+			 {
+			      V Tg, Ti, Tb, Th;
+			      ST(&(xo[0]), VADD(Tl, To), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 4)]), VSUB(Tl, To), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 2)]), VFMAI(Tq, Tp), ovs, &(xo[0]));
+			      ST(&(xo[WS(os, 6)]), VFNMSI(Tq, Tp), ovs, &(xo[0]));
+			      Tg = VFNMS(LDK(KP707106781), Tf, Te);
+			      Ti = VFMA(LDK(KP707106781), Tf, Te);
+			      Tb = VFMA(LDK(KP707106781), Ta, T3);
+			      Th = VFNMS(LDK(KP707106781), Ta, T3);
+			      ST(&(xo[WS(os, 3)]), VFMAI(Ti, Th), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 5)]), VFNMSI(Ti, Th), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 7)]), VFMAI(Tg, Tb), ovs, &(xo[WS(os, 1)]));
+			      ST(&(xo[WS(os, 1)]), VFNMSI(Tg, Tb), ovs, &(xo[WS(os, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n1fv_8"), {16, 0, 10, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_8) (planner *p) {
+     X(kdft_register) (p, n1fv_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n1fv_8 -include n1f.h */
+
+/*
+ * This function contains 26 FP additions, 2 FP multiplications,
+ * (or, 26 additions, 2 multiplications, 0 fused multiply/add),
+ * 22 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T3, Tj, Tf, Tk, Ta, Tn, Tc, Tm;
+	       {
+		    V T1, T2, Td, Te;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = VSUB(Td, Te);
+		    Tk = VADD(Td, Te);
+		    {
+			 V T4, T5, T6, T7, T8, T9;
+			 T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VSUB(T7, T8);
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tn = VADD(T7, T8);
+			 Tc = VMUL(LDK(KP707106781), VSUB(T9, T6));
+			 Tm = VADD(T4, T5);
+		    }
+	       }
+	       {
+		    V Tb, Tg, Tp, Tq;
+		    Tb = VADD(T3, Ta);
+		    Tg = VBYI(VSUB(Tc, Tf));
+		    ST(&(xo[WS(os, 7)]), VSUB(Tb, Tg), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 1)]), VADD(Tb, Tg), ovs, &(xo[WS(os, 1)]));
+		    Tp = VSUB(Tj, Tk);
+		    Tq = VBYI(VSUB(Tn, Tm));
+		    ST(&(xo[WS(os, 6)]), VSUB(Tp, Tq), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tp, Tq), ovs, &(xo[0]));
+	       }
+	       {
+		    V Th, Ti, Tl, To;
+		    Th = VSUB(T3, Ta);
+		    Ti = VBYI(VADD(Tf, Tc));
+		    ST(&(xo[WS(os, 5)]), VSUB(Th, Ti), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 3)]), VADD(Th, Ti), ovs, &(xo[WS(os, 1)]));
+		    Tl = VADD(Tj, Tk);
+		    To = VADD(Tm, Tn);
+		    ST(&(xo[WS(os, 4)]), VSUB(Tl, To), ovs, &(xo[0]));
+		    ST(&(xo[0]), VADD(Tl, To), ovs, &(xo[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n1fv_8"), {26, 2, 0, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_8) (planner *p) {
+     X(kdft_register) (p, n1fv_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:36:52 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include n1f.h */
+
+/*
+ * This function contains 46 FP additions, 38 FP multiplications,
+ * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
+ * 68 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
+	       V T1, T2, T3, T6, Tb, T7, T8, Tc, Td, Tv, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+	       Tv = VSUB(T3, T2);
+	       T4 = VADD(T2, T3);
+	       {
+		    V Tl, T9, Tm, Te, Tj, T5;
+		    Tl = VSUB(T7, T8);
+		    T9 = VADD(T7, T8);
+		    Tm = VSUB(Td, Tc);
+		    Te = VADD(Tc, Td);
+		    Tj = VFNMS(LDK(KP500000000), T4, T1);
+		    T5 = VADD(T1, T4);
+		    {
+			 V Tn, Ta, Tk, Tf;
+			 Tn = VFNMS(LDK(KP500000000), T9, T6);
+			 Ta = VADD(T6, T9);
+			 Tk = VFNMS(LDK(KP500000000), Te, Tb);
+			 Tf = VADD(Tb, Te);
+			 {
+			      V Ty, TC, To, TB, Tx, Ts, Tg, Ti;
+			      Ty = VFNMS(LDK(KP726681596), Tl, Tn);
+			      TC = VFMA(LDK(KP968908795), Tn, Tl);
+			      To = VFNMS(LDK(KP586256827), Tn, Tm);
+			      TB = VFNMS(LDK(KP152703644), Tm, Tk);
+			      Tx = VFMA(LDK(KP203604859), Tk, Tm);
+			      Ts = VFNMS(LDK(KP439692620), Tl, Tk);
+			      Tg = VADD(Ta, Tf);
+			      Ti = VMUL(LDK(KP866025403), VSUB(Tf, Ta));
+			      {
+				   V Tz, TI, TF, TD, Tt, Th, Tq, Tp;
+				   Tp = VFNMS(LDK(KP347296355), To, Tl);
+				   Tz = VFMA(LDK(KP898197570), Ty, Tx);
+				   TI = VFNMS(LDK(KP898197570), Ty, Tx);
+				   TF = VFNMS(LDK(KP673648177), TC, TB);
+				   TD = VFMA(LDK(KP673648177), TC, TB);
+				   Tt = VFNMS(LDK(KP420276625), Ts, Tm);
+				   ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
+				   Th = VFNMS(LDK(KP500000000), Tg, T5);
+				   Tq = VFNMS(LDK(KP907603734), Tp, Tk);
+				   {
+					V TA, TJ, TE, TG, Tu, Tr, TK, TH, Tw;
+					TA = VFMA(LDK(KP852868531), Tz, Tj);
+					TJ = VFMA(LDK(KP666666666), TD, TI);
+					TE = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tv, TD));
+					TG = VFNMS(LDK(KP500000000), Tz, TF);
+					Tu = VFNMS(LDK(KP826351822), Tt, Tn);
+					ST(&(xo[WS(os, 6)]), VFNMSI(Ti, Th), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 3)]), VFMAI(Ti, Th), ovs, &(xo[WS(os, 1)]));
+					Tr = VFNMS(LDK(KP939692620), Tq, Tj);
+					TK = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TJ, Tv));
+					ST(&(xo[WS(os, 8)]), VFMAI(TE, TA), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 1)]), VFNMSI(TE, TA), ovs, &(xo[WS(os, 1)]));
+					TH = VFMA(LDK(KP852868531), TG, Tj);
+					Tw = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tv, Tu));
+					ST(&(xo[WS(os, 4)]), VFMAI(TK, TH), ovs, &(xo[0]));
+					ST(&(xo[WS(os, 5)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+					ST(&(xo[WS(os, 2)]), VFNMSI(Tw, Tr), ovs, &(xo[0]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_9) (planner *p) {
+     X(kdft_register) (p, n1fv_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include n1f.h */
+
+/*
+ * This function contains 46 FP additions, 26 FP multiplications,
+ * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
+ * 41 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "n1f.h"
+
+static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
+	       V T5, Ts, Tj, To, Tf, Tn, Tp, Tu, Tl, Ta, Tk, Tm, Tt;
+	       {
+		    V T1, T2, T3, T4;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T4 = VADD(T2, T3);
+		    T5 = VADD(T1, T4);
+		    Ts = VMUL(LDK(KP866025403), VSUB(T3, T2));
+		    Tj = VFNMS(LDK(KP500000000), T4, T1);
+	       }
+	       {
+		    V Tb, Te, Tc, Td;
+		    Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    Te = VADD(Tc, Td);
+		    To = VSUB(Td, Tc);
+		    Tf = VADD(Tb, Te);
+		    Tn = VFNMS(LDK(KP500000000), Te, Tb);
+		    Tp = VFMA(LDK(KP173648177), Tn, VMUL(LDK(KP852868531), To));
+		    Tu = VFNMS(LDK(KP984807753), Tn, VMUL(LDK(KP150383733), To));
+	       }
+	       {
+		    V T6, T9, T7, T8;
+		    T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = VADD(T7, T8);
+		    Tl = VSUB(T8, T7);
+		    Ta = VADD(T6, T9);
+		    Tk = VFNMS(LDK(KP500000000), T9, T6);
+		    Tm = VFMA(LDK(KP766044443), Tk, VMUL(LDK(KP556670399), Tl));
+		    Tt = VFNMS(LDK(KP642787609), Tk, VMUL(LDK(KP663413948), Tl));
+	       }
+	       {
+		    V Ti, Tg, Th, Tz, TA;
+		    Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Tf, Ta)));
+		    Tg = VADD(Ta, Tf);
+		    Th = VFNMS(LDK(KP500000000), Tg, T5);
+		    ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
+		    ST(&(xo[WS(os, 3)]), VADD(Th, Ti), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 6)]), VSUB(Th, Ti), ovs, &(xo[0]));
+		    Tz = VFMA(LDK(KP173648177), Tk, VFNMS(LDK(KP296198132), To, VFNMS(LDK(KP939692620), Tn, VFNMS(LDK(KP852868531), Tl, Tj))));
+		    TA = VBYI(VSUB(VFNMS(LDK(KP342020143), Tn, VFNMS(LDK(KP150383733), Tl, VFNMS(LDK(KP984807753), Tk, VMUL(LDK(KP813797681), To)))), Ts));
+		    ST(&(xo[WS(os, 7)]), VSUB(Tz, TA), ovs, &(xo[WS(os, 1)]));
+		    ST(&(xo[WS(os, 2)]), VADD(Tz, TA), ovs, &(xo[0]));
+		    {
+			 V Tr, Tx, Tw, Ty, Tq, Tv;
+			 Tq = VADD(Tm, Tp);
+			 Tr = VADD(Tj, Tq);
+			 Tx = VFMA(LDK(KP866025403), VSUB(Tt, Tu), VFNMS(LDK(KP500000000), Tq, Tj));
+			 Tv = VADD(Tt, Tu);
+			 Tw = VBYI(VADD(Ts, Tv));
+			 Ty = VBYI(VADD(Ts, VFNMS(LDK(KP500000000), Tv, VMUL(LDK(KP866025403), VSUB(Tp, Tm)))));
+			 ST(&(xo[WS(os, 8)]), VSUB(Tr, Tw), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 4)]), VADD(Tx, Ty), ovs, &(xo[0]));
+			 ST(&(xo[WS(os, 1)]), VADD(Tw, Tr), ovs, &(xo[WS(os, 1)]));
+			 ST(&(xo[WS(os, 5)]), VSUB(Tx, Ty), ovs, &(xo[WS(os, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
+
+void XSIMD(codelet_n1fv_9) (planner *p) {
+     X(kdft_register) (p, n1fv_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:29 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 10 -name n2bv_10 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 42 FP additions, 22 FP multiplications,
+ * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
+ * 53 stack variables, 4 constants, and 25 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tb, Tr, T3, Ts, T6, Tw, Tg, Tt, T9, Tc, T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T4, T5, Te, Tf, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Tr = VADD(T1, T2);
+		    T3 = VSUB(T1, T2);
+		    Ts = VADD(T4, T5);
+		    T6 = VSUB(T4, T5);
+		    Tw = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    Tt = VADD(T7, T8);
+		    T9 = VSUB(T7, T8);
+		    Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       {
+		    V TD, Tu, Tm, Ta, Td, Tv;
+		    TD = VSUB(Ts, Tt);
+		    Tu = VADD(Ts, Tt);
+		    Tm = VSUB(T6, T9);
+		    Ta = VADD(T6, T9);
+		    Td = VSUB(Tb, Tc);
+		    Tv = VADD(Tb, Tc);
+		    {
+			 V TC, Tx, Tn, Th;
+			 TC = VSUB(Tv, Tw);
+			 Tx = VADD(Tv, Tw);
+			 Tn = VSUB(Td, Tg);
+			 Th = VADD(Td, Tg);
+			 {
+			      V Ty, TA, TE, TG, Ti, Tk, To, Tq;
+			      Ty = VADD(Tu, Tx);
+			      TA = VSUB(Tu, Tx);
+			      TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC));
+			      TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD));
+			      Ti = VADD(Ta, Th);
+			      Tk = VSUB(Ta, Th);
+			      To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm));
+			      Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn));
+			      {
+				   V Tz, TH, Tj, TI;
+				   Tz = VFNMS(LDK(KP250000000), Ty, Tr);
+				   TH = VADD(Tr, Ty);
+				   STM2(&(xo[0]), TH, ovs, &(xo[0]));
+				   Tj = VFNMS(LDK(KP250000000), Ti, T3);
+				   TI = VADD(T3, Ti);
+				   STM2(&(xo[10]), TI, ovs, &(xo[2]));
+				   {
+					V TB, TF, Tl, Tp;
+					TB = VFNMS(LDK(KP559016994), TA, Tz);
+					TF = VFMA(LDK(KP559016994), TA, Tz);
+					Tl = VFMA(LDK(KP559016994), Tk, Tj);
+					Tp = VFNMS(LDK(KP559016994), Tk, Tj);
+					{
+					     V TJ, TK, TL, TM;
+					     TJ = VFNMSI(TG, TF);
+					     STM2(&(xo[8]), TJ, ovs, &(xo[0]));
+					     STN2(&(xo[8]), TJ, TI, ovs);
+					     TK = VFMAI(TG, TF);
+					     STM2(&(xo[12]), TK, ovs, &(xo[0]));
+					     TL = VFMAI(TE, TB);
+					     STM2(&(xo[16]), TL, ovs, &(xo[0]));
+					     TM = VFNMSI(TE, TB);
+					     STM2(&(xo[4]), TM, ovs, &(xo[0]));
+					     {
+						  V TN, TO, TP, TQ;
+						  TN = VFMAI(Tq, Tp);
+						  STM2(&(xo[6]), TN, ovs, &(xo[2]));
+						  STN2(&(xo[4]), TM, TN, ovs);
+						  TO = VFNMSI(Tq, Tp);
+						  STM2(&(xo[14]), TO, ovs, &(xo[2]));
+						  STN2(&(xo[12]), TK, TO, ovs);
+						  TP = VFNMSI(To, Tl);
+						  STM2(&(xo[18]), TP, ovs, &(xo[2]));
+						  STN2(&(xo[16]), TL, TP, ovs);
+						  TQ = VFMAI(To, Tl);
+						  STM2(&(xo[2]), TQ, ovs, &(xo[2]));
+						  STN2(&(xo[0]), TH, TQ, ovs);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n2bv_10"), {24, 4, 18, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_10) (planner *p) {
+     X(kdft_register) (p, n2bv_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 10 -name n2bv_10 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 42 FP additions, 12 FP multiplications,
+ * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
+ * 36 stack variables, 4 constants, and 25 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tl, Ty, T7, Te, Tw, Tt, Tz, TA, TB, Tg, Th, Tm, Tj, Tk;
+	       Tj = LD(&(xi[0]), ivs, &(xi[0]));
+	       Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       Tl = VSUB(Tj, Tk);
+	       Ty = VADD(Tj, Tk);
+	       {
+		    V T3, Tr, Td, Tv, T6, Ts, Ta, Tu;
+		    {
+			 V T1, T2, Tb, Tc;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tr = VADD(T1, T2);
+			 Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 Tv = VADD(Tb, Tc);
+		    }
+		    {
+			 V T4, T5, T8, T9;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Ts = VADD(T4, T5);
+			 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 Tu = VADD(T8, T9);
+		    }
+		    T7 = VSUB(T3, T6);
+		    Te = VSUB(Ta, Td);
+		    Tw = VSUB(Tu, Tv);
+		    Tt = VSUB(Tr, Ts);
+		    Tz = VADD(Tr, Ts);
+		    TA = VADD(Tu, Tv);
+		    TB = VADD(Tz, TA);
+		    Tg = VADD(T3, T6);
+		    Th = VADD(Ta, Td);
+		    Tm = VADD(Tg, Th);
+	       }
+	       {
+		    V TH, TI, TK, TL, TM;
+		    TH = VADD(Tl, Tm);
+		    STM2(&(xo[10]), TH, ovs, &(xo[2]));
+		    TI = VADD(Ty, TB);
+		    STM2(&(xo[0]), TI, ovs, &(xo[0]));
+		    {
+			 V Tf, Tq, To, Tp, Ti, Tn, TJ;
+			 Tf = VBYI(VFMA(LDK(KP951056516), T7, VMUL(LDK(KP587785252), Te)));
+			 Tq = VBYI(VFNMS(LDK(KP951056516), Te, VMUL(LDK(KP587785252), T7)));
+			 Ti = VMUL(LDK(KP559016994), VSUB(Tg, Th));
+			 Tn = VFNMS(LDK(KP250000000), Tm, Tl);
+			 To = VADD(Ti, Tn);
+			 Tp = VSUB(Tn, Ti);
+			 TJ = VADD(Tf, To);
+			 STM2(&(xo[2]), TJ, ovs, &(xo[2]));
+			 STN2(&(xo[0]), TI, TJ, ovs);
+			 TK = VADD(Tq, Tp);
+			 STM2(&(xo[14]), TK, ovs, &(xo[2]));
+			 TL = VSUB(To, Tf);
+			 STM2(&(xo[18]), TL, ovs, &(xo[2]));
+			 TM = VSUB(Tp, Tq);
+			 STM2(&(xo[6]), TM, ovs, &(xo[2]));
+		    }
+		    {
+			 V Tx, TG, TE, TF, TC, TD;
+			 Tx = VBYI(VFNMS(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
+			 TG = VBYI(VFMA(LDK(KP951056516), Tt, VMUL(LDK(KP587785252), Tw)));
+			 TC = VFNMS(LDK(KP250000000), TB, Ty);
+			 TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
+			 TE = VSUB(TC, TD);
+			 TF = VADD(TD, TC);
+			 {
+			      V TN, TO, TP, TQ;
+			      TN = VADD(Tx, TE);
+			      STM2(&(xo[4]), TN, ovs, &(xo[0]));
+			      STN2(&(xo[4]), TN, TM, ovs);
+			      TO = VADD(TG, TF);
+			      STM2(&(xo[12]), TO, ovs, &(xo[0]));
+			      STN2(&(xo[12]), TO, TK, ovs);
+			      TP = VSUB(TE, Tx);
+			      STM2(&(xo[16]), TP, ovs, &(xo[0]));
+			      STN2(&(xo[16]), TP, TL, ovs);
+			      TQ = VSUB(TF, TG);
+			      STM2(&(xo[8]), TQ, ovs, &(xo[0]));
+			      STN2(&(xo[8]), TQ, TH, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n2bv_10"), {36, 6, 6, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_10) (planner *p) {
+     X(kdft_register) (p, n2bv_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:30 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 48 FP additions, 20 FP multiplications,
+ * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
+ * 61 stack variables, 2 constants, and 30 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T1, T6, Tc, Th, Td, Te, Ti, Tz, T4, TA, T9, Tj, Tf, Tw;
+	       {
+		    V T2, T3, T7, T8;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    Tz = VSUB(T2, T3);
+		    T4 = VADD(T2, T3);
+		    TA = VSUB(T7, T8);
+		    T9 = VADD(T7, T8);
+		    Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       Tf = VADD(Td, Te);
+	       Tw = VSUB(Td, Te);
+	       {
+		    V T5, Tp, TJ, TB, Ta, Tq, Tk, Tx, Tg, Ts;
+		    T5 = VADD(T1, T4);
+		    Tp = VFNMS(LDK(KP500000000), T4, T1);
+		    TJ = VSUB(Tz, TA);
+		    TB = VADD(Tz, TA);
+		    Ta = VADD(T6, T9);
+		    Tq = VFNMS(LDK(KP500000000), T9, T6);
+		    Tk = VADD(Ti, Tj);
+		    Tx = VSUB(Tj, Ti);
+		    Tg = VADD(Tc, Tf);
+		    Ts = VFNMS(LDK(KP500000000), Tf, Tc);
+		    {
+			 V Tr, TF, Tb, Tn, TG, Ty, Tl, Tt;
+			 Tr = VADD(Tp, Tq);
+			 TF = VSUB(Tp, Tq);
+			 Tb = VSUB(T5, Ta);
+			 Tn = VADD(T5, Ta);
+			 TG = VADD(Tw, Tx);
+			 Ty = VSUB(Tw, Tx);
+			 Tl = VADD(Th, Tk);
+			 Tt = VFNMS(LDK(KP500000000), Tk, Th);
+			 {
+			      V TC, TE, TH, TL, Tu, TI, Tm, To;
+			      TC = VMUL(LDK(KP866025403), VSUB(Ty, TB));
+			      TE = VMUL(LDK(KP866025403), VADD(TB, Ty));
+			      TH = VFNMS(LDK(KP866025403), TG, TF);
+			      TL = VFMA(LDK(KP866025403), TG, TF);
+			      Tu = VADD(Ts, Tt);
+			      TI = VSUB(Ts, Tt);
+			      Tm = VSUB(Tg, Tl);
+			      To = VADD(Tg, Tl);
+			      {
+				   V TK, TM, Tv, TD;
+				   TK = VFMA(LDK(KP866025403), TJ, TI);
+				   TM = VFNMS(LDK(KP866025403), TJ, TI);
+				   Tv = VSUB(Tr, Tu);
+				   TD = VADD(Tr, Tu);
+				   {
+					V TN, TO, TP, TQ;
+					TN = VADD(Tn, To);
+					STM2(&(xo[0]), TN, ovs, &(xo[0]));
+					TO = VSUB(Tn, To);
+					STM2(&(xo[12]), TO, ovs, &(xo[0]));
+					TP = VFMAI(Tm, Tb);
+					STM2(&(xo[18]), TP, ovs, &(xo[2]));
+					TQ = VFNMSI(Tm, Tb);
+					STM2(&(xo[6]), TQ, ovs, &(xo[2]));
+					{
+					     V TR, TS, TT, TU;
+					     TR = VFMAI(TM, TL);
+					     STM2(&(xo[10]), TR, ovs, &(xo[2]));
+					     TS = VFNMSI(TM, TL);
+					     STM2(&(xo[14]), TS, ovs, &(xo[2]));
+					     STN2(&(xo[12]), TO, TS, ovs);
+					     TT = VFNMSI(TK, TH);
+					     STM2(&(xo[22]), TT, ovs, &(xo[2]));
+					     TU = VFMAI(TK, TH);
+					     STM2(&(xo[2]), TU, ovs, &(xo[2]));
+					     STN2(&(xo[0]), TN, TU, ovs);
+					     {
+						  V TV, TW, TX, TY;
+						  TV = VFNMSI(TE, TD);
+						  STM2(&(xo[16]), TV, ovs, &(xo[0]));
+						  STN2(&(xo[16]), TV, TP, ovs);
+						  TW = VFMAI(TE, TD);
+						  STM2(&(xo[8]), TW, ovs, &(xo[0]));
+						  STN2(&(xo[8]), TW, TR, ovs);
+						  TX = VFMAI(TC, Tv);
+						  STM2(&(xo[4]), TX, ovs, &(xo[0]));
+						  STN2(&(xo[4]), TX, TQ, ovs);
+						  TY = VFNMSI(TC, Tv);
+						  STM2(&(xo[20]), TY, ovs, &(xo[0]));
+						  STN2(&(xo[20]), TY, TT, ovs);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_12) (planner *p) {
+     X(kdft_register) (p, n2bv_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 48 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
+ * 33 stack variables, 2 constants, and 30 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts;
+	       {
+		    V T1, T6, T4, Tk, T9, Tl;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T2, T3, T7, T8;
+			 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tk = VSUB(T2, T3);
+			 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T9 = VADD(T7, T8);
+			 Tl = VSUB(T7, T8);
+		    }
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    Ta = VFNMS(LDK(KP500000000), T9, T6);
+		    TG = VADD(T6, T9);
+		    TF = VADD(T1, T4);
+		    Ty = VADD(Tk, Tl);
+		    Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl));
+	       }
+	       {
+		    V Tn, Tq, Te, To, Th, Tr;
+		    Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V Tc, Td, Tf, Tg;
+			 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Te = VSUB(Tc, Td);
+			 To = VADD(Tc, Td);
+			 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Th = VSUB(Tf, Tg);
+			 Tr = VADD(Tf, Tg);
+		    }
+		    Ti = VMUL(LDK(KP866025403), VSUB(Te, Th));
+		    Tp = VFNMS(LDK(KP500000000), To, Tn);
+		    TJ = VADD(Tq, Tr);
+		    TI = VADD(Tn, To);
+		    Tx = VADD(Te, Th);
+		    Ts = VFNMS(LDK(KP500000000), Tr, Tq);
+	       }
+	       {
+		    V TN, TO, TP, TQ, TR, TS;
+		    {
+			 V TH, TK, TL, TM;
+			 TH = VSUB(TF, TG);
+			 TK = VBYI(VSUB(TI, TJ));
+			 TN = VSUB(TH, TK);
+			 STM2(&(xo[6]), TN, ovs, &(xo[2]));
+			 TO = VADD(TH, TK);
+			 STM2(&(xo[18]), TO, ovs, &(xo[2]));
+			 TL = VADD(TF, TG);
+			 TM = VADD(TI, TJ);
+			 TP = VSUB(TL, TM);
+			 STM2(&(xo[12]), TP, ovs, &(xo[0]));
+			 TQ = VADD(TL, TM);
+			 STM2(&(xo[0]), TQ, ovs, &(xo[0]));
+		    }
+		    {
+			 V Tj, Tv, Tu, Tw, Tb, Tt, TT, TU;
+			 Tb = VSUB(T5, Ta);
+			 Tj = VSUB(Tb, Ti);
+			 Tv = VADD(Tb, Ti);
+			 Tt = VSUB(Tp, Ts);
+			 Tu = VBYI(VADD(Tm, Tt));
+			 Tw = VBYI(VSUB(Tt, Tm));
+			 TR = VSUB(Tj, Tu);
+			 STM2(&(xo[22]), TR, ovs, &(xo[2]));
+			 TS = VADD(Tv, Tw);
+			 STM2(&(xo[10]), TS, ovs, &(xo[2]));
+			 TT = VADD(Tj, Tu);
+			 STM2(&(xo[2]), TT, ovs, &(xo[2]));
+			 STN2(&(xo[0]), TQ, TT, ovs);
+			 TU = VSUB(Tv, Tw);
+			 STM2(&(xo[14]), TU, ovs, &(xo[2]));
+			 STN2(&(xo[12]), TP, TU, ovs);
+		    }
+		    {
+			 V Tz, TD, TC, TE, TA, TB;
+			 Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty)));
+			 TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx)));
+			 TA = VADD(T5, Ta);
+			 TB = VADD(Tp, Ts);
+			 TC = VSUB(TA, TB);
+			 TE = VADD(TA, TB);
+			 {
+			      V TV, TW, TX, TY;
+			      TV = VADD(Tz, TC);
+			      STM2(&(xo[4]), TV, ovs, &(xo[0]));
+			      STN2(&(xo[4]), TV, TN, ovs);
+			      TW = VSUB(TE, TD);
+			      STM2(&(xo[16]), TW, ovs, &(xo[0]));
+			      STN2(&(xo[16]), TW, TO, ovs);
+			      TX = VSUB(TC, Tz);
+			      STM2(&(xo[20]), TX, ovs, &(xo[0]));
+			      STN2(&(xo[20]), TX, TR, ovs);
+			      TY = VADD(TD, TE);
+			      STM2(&(xo[8]), TY, ovs, &(xo[0]));
+			      STN2(&(xo[8]), TY, TS, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_12) (planner *p) {
+     X(kdft_register) (p, n2bv_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:30 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n2bv_14 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 74 FP additions, 48 FP multiplications,
+ * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
+ * 65 stack variables, 6 constants, and 35 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V TH, T3, TP, Tn, Ta, Tu, TU, TK, TO, Tk, TM, Tg, TL, Td, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
+		    {
+			 V T4, T5, T7, T8, Tl, Tm;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 TH = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TI = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 TJ = VADD(T7, T8);
+			 T9 = VSUB(T7, T8);
+			 TP = VADD(Tl, Tm);
+			 Tn = VSUB(Tl, Tm);
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    }
+		    Ta = VADD(T6, T9);
+		    Tu = VSUB(T6, T9);
+		    TU = VSUB(TI, TJ);
+		    TK = VADD(TI, TJ);
+		    TO = VADD(Ti, Tj);
+		    Tk = VSUB(Ti, Tj);
+		    TM = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    TL = VADD(Tb, Tc);
+		    Td = VSUB(Tb, Tc);
+	       }
+	       {
+		    V T19, T1a, T13, TG, TY, T18, TB, Tw, TT, Tz, T11, T16, TE, Tr, TV;
+		    V TQ;
+		    TV = VSUB(TP, TO);
+		    TQ = VADD(TO, TP);
+		    {
+			 V Ts, To, TW, TN;
+			 Ts = VSUB(Tk, Tn);
+			 To = VADD(Tk, Tn);
+			 TW = VSUB(TM, TL);
+			 TN = VADD(TL, TM);
+			 {
+			      V Tt, Th, TR, T12;
+			      Tt = VSUB(Td, Tg);
+			      Th = VADD(Td, Tg);
+			      TR = VFNMS(LDK(KP356895867), TK, TQ);
+			      T12 = VFNMS(LDK(KP554958132), TV, TU);
+			      {
+				   V Tx, TF, TZ, T14;
+				   Tx = VFNMS(LDK(KP356895867), Ta, To);
+				   TF = VFMA(LDK(KP554958132), Ts, Tu);
+				   T19 = VADD(TH, VADD(TK, VADD(TN, TQ)));
+				   STM2(&(xo[0]), T19, ovs, &(xo[0]));
+				   TZ = VFNMS(LDK(KP356895867), TN, TK);
+				   T14 = VFNMS(LDK(KP356895867), TQ, TN);
+				   {
+					V TX, T17, TC, Tp;
+					TX = VFMA(LDK(KP554958132), TW, TV);
+					T17 = VFMA(LDK(KP554958132), TU, TW);
+					T1a = VADD(T3, VADD(Ta, VADD(Th, To)));
+					STM2(&(xo[14]), T1a, ovs, &(xo[2]));
+					TC = VFNMS(LDK(KP356895867), Th, Ta);
+					Tp = VFNMS(LDK(KP356895867), To, Th);
+					{
+					     V TA, Tv, TS, Ty;
+					     TA = VFMA(LDK(KP554958132), Tt, Ts);
+					     Tv = VFNMS(LDK(KP554958132), Tu, Tt);
+					     TS = VFNMS(LDK(KP692021471), TR, TN);
+					     T13 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T12, TW));
+					     Ty = VFNMS(LDK(KP692021471), Tx, Th);
+					     TG = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TF, Tt));
+					     {
+						  V T10, T15, TD, Tq;
+						  T10 = VFNMS(LDK(KP692021471), TZ, TQ);
+						  T15 = VFNMS(LDK(KP692021471), T14, TK);
+						  TY = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TX, TU));
+						  T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
+						  TD = VFNMS(LDK(KP692021471), TC, To);
+						  Tq = VFNMS(LDK(KP692021471), Tp, Ta);
+						  TB = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TA, Tu));
+						  Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
+						  TT = VFNMS(LDK(KP900968867), TS, TH);
+						  Tz = VFNMS(LDK(KP900968867), Ty, T3);
+						  T11 = VFNMS(LDK(KP900968867), T10, TH);
+						  T16 = VFNMS(LDK(KP900968867), T15, TH);
+						  TE = VFNMS(LDK(KP900968867), TD, T3);
+						  Tr = VFNMS(LDK(KP900968867), Tq, T3);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1b, T1c, T1d, T1e;
+			 T1b = VFMAI(TY, TT);
+			 STM2(&(xo[4]), T1b, ovs, &(xo[0]));
+			 T1c = VFNMSI(TY, TT);
+			 STM2(&(xo[24]), T1c, ovs, &(xo[0]));
+			 T1d = VFMAI(TB, Tz);
+			 STM2(&(xo[18]), T1d, ovs, &(xo[2]));
+			 T1e = VFNMSI(TB, Tz);
+			 STM2(&(xo[10]), T1e, ovs, &(xo[2]));
+			 {
+			      V T1f, T1g, T1h, T1i;
+			      T1f = VFMAI(T13, T11);
+			      STM2(&(xo[12]), T1f, ovs, &(xo[0]));
+			      STN2(&(xo[12]), T1f, T1a, ovs);
+			      T1g = VFNMSI(T13, T11);
+			      STM2(&(xo[16]), T1g, ovs, &(xo[0]));
+			      STN2(&(xo[16]), T1g, T1d, ovs);
+			      T1h = VFMAI(T18, T16);
+			      STM2(&(xo[8]), T1h, ovs, &(xo[0]));
+			      STN2(&(xo[8]), T1h, T1e, ovs);
+			      T1i = VFNMSI(T18, T16);
+			      STM2(&(xo[20]), T1i, ovs, &(xo[0]));
+			      {
+				   V T1j, T1k, T1l, T1m;
+				   T1j = VFNMSI(TG, TE);
+				   STM2(&(xo[26]), T1j, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1c, T1j, ovs);
+				   T1k = VFMAI(TG, TE);
+				   STM2(&(xo[2]), T1k, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T19, T1k, ovs);
+				   T1l = VFNMSI(Tw, Tr);
+				   STM2(&(xo[22]), T1l, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1i, T1l, ovs);
+				   T1m = VFMAI(Tw, Tr);
+				   STM2(&(xo[6]), T1m, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1b, T1m, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n2bv_14"), {32, 6, 42, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_14) (planner *p) {
+     X(kdft_register) (p, n2bv_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n2bv_14 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 74 FP additions, 36 FP multiplications,
+ * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
+ * 41 stack variables, 6 constants, and 35 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V Tp, Ty, Tl, TL, Tq, TE, T7, TJ, Ts, TB, Te, TK, Tr, TH, Tn;
+	       V To;
+	       Tn = LD(&(xi[0]), ivs, &(xi[0]));
+	       To = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       Tp = VSUB(Tn, To);
+	       Ty = VADD(Tn, To);
+	       {
+		    V Th, TC, Tk, TD;
+		    {
+			 V Tf, Tg, Ti, Tj;
+			 Tf = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Th = VSUB(Tf, Tg);
+			 TC = VADD(Tf, Tg);
+			 Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tj = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VSUB(Ti, Tj);
+			 TD = VADD(Ti, Tj);
+		    }
+		    Tl = VSUB(Th, Tk);
+		    TL = VSUB(TD, TC);
+		    Tq = VADD(Th, Tk);
+		    TE = VADD(TC, TD);
+	       }
+	       {
+		    V T3, Tz, T6, TA;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tz = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TA = VADD(T4, T5);
+		    }
+		    T7 = VSUB(T3, T6);
+		    TJ = VSUB(Tz, TA);
+		    Ts = VADD(T3, T6);
+		    TB = VADD(Tz, TA);
+	       }
+	       {
+		    V Ta, TF, Td, TG;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TF = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TG = VADD(Tb, Tc);
+		    }
+		    Te = VSUB(Ta, Td);
+		    TK = VSUB(TG, TF);
+		    Tr = VADD(Ta, Td);
+		    TH = VADD(TF, TG);
+	       }
+	       {
+		    V TR, TS, TU, TV;
+		    TR = VADD(Tp, VADD(Ts, VADD(Tq, Tr)));
+		    STM2(&(xo[14]), TR, ovs, &(xo[2]));
+		    TS = VADD(Ty, VADD(TB, VADD(TE, TH)));
+		    STM2(&(xo[0]), TS, ovs, &(xo[0]));
+		    {
+			 V TT, Tm, Tt, TQ, TP, TW;
+			 Tm = VBYI(VFMA(LDK(KP433883739), T7, VFNMS(LDK(KP781831482), Tl, VMUL(LDK(KP974927912), Te))));
+			 Tt = VFMA(LDK(KP623489801), Tq, VFNMS(LDK(KP222520933), Tr, VFNMS(LDK(KP900968867), Ts, Tp)));
+			 TT = VADD(Tm, Tt);
+			 STM2(&(xo[6]), TT, ovs, &(xo[2]));
+			 TU = VSUB(Tt, Tm);
+			 STM2(&(xo[22]), TU, ovs, &(xo[2]));
+			 TQ = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
+			 TP = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TB, Ty)));
+			 TV = VSUB(TP, TQ);
+			 STM2(&(xo[24]), TV, ovs, &(xo[0]));
+			 TW = VADD(TP, TQ);
+			 STM2(&(xo[4]), TW, ovs, &(xo[0]));
+			 STN2(&(xo[4]), TW, TT, ovs);
+		    }
+		    {
+			 V T10, TM, TI, TZ;
+			 {
+			      V Tu, Tv, TX, TY;
+			      Tu = VBYI(VFMA(LDK(KP781831482), T7, VFMA(LDK(KP974927912), Tl, VMUL(LDK(KP433883739), Te))));
+			      Tv = VFMA(LDK(KP623489801), Ts, VFNMS(LDK(KP900968867), Tr, VFNMS(LDK(KP222520933), Tq, Tp)));
+			      TX = VADD(Tu, Tv);
+			      STM2(&(xo[2]), TX, ovs, &(xo[2]));
+			      STN2(&(xo[0]), TS, TX, ovs);
+			      TY = VSUB(Tv, Tu);
+			      STM2(&(xo[26]), TY, ovs, &(xo[2]));
+			      STN2(&(xo[24]), TV, TY, ovs);
+			 }
+			 TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
+			 TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TE, Ty)));
+			 TZ = VSUB(TI, TM);
+			 STM2(&(xo[12]), TZ, ovs, &(xo[0]));
+			 STN2(&(xo[12]), TZ, TR, ovs);
+			 T10 = VADD(TI, TM);
+			 STM2(&(xo[16]), T10, ovs, &(xo[0]));
+			 {
+			      V T11, TO, TN, T12;
+			      TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
+			      TN = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP222520933), TH, VFNMS(LDK(KP900968867), TB, Ty)));
+			      T11 = VSUB(TN, TO);
+			      STM2(&(xo[8]), T11, ovs, &(xo[0]));
+			      T12 = VADD(TN, TO);
+			      STM2(&(xo[20]), T12, ovs, &(xo[0]));
+			      STN2(&(xo[20]), T12, TU, ovs);
+			      {
+				   V Tx, Tw, T13, T14;
+				   Tx = VBYI(VFNMS(LDK(KP781831482), Te, VFNMS(LDK(KP433883739), Tl, VMUL(LDK(KP974927912), T7))));
+				   Tw = VFMA(LDK(KP623489801), Tr, VFNMS(LDK(KP900968867), Tq, VFNMS(LDK(KP222520933), Ts, Tp)));
+				   T13 = VSUB(Tw, Tx);
+				   STM2(&(xo[10]), T13, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T11, T13, ovs);
+				   T14 = VADD(Tx, Tw);
+				   STM2(&(xo[18]), T14, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T10, T14, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n2bv_14"), {50, 12, 24, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_14) (planner *p) {
+     X(kdft_register) (p, n2bv_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:31 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n2bv_16 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 72 FP additions, 34 FP multiplications,
+ * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
+ * 62 stack variables, 3 constants, and 40 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
+	       V T16;
+	       {
+		    V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
+		    {
+			 V T1, T2, T4, T5, To, Tp, Tr, Ts;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tp = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tr = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Ts = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 {
+			      V T8, TI, Tq, TJ, Tt, T9, Tb, Tc, T3, T6;
+			      T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			      TH = VSUB(T1, T2);
+			      T3 = VADD(T1, T2);
+			      TU = VSUB(T4, T5);
+			      T6 = VADD(T4, T5);
+			      TI = VSUB(To, Tp);
+			      Tq = VADD(To, Tp);
+			      TJ = VSUB(Tr, Ts);
+			      Tt = VADD(Tr, Ts);
+			      T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T7 = VSUB(T3, T6);
+			      Tz = VADD(T3, T6);
+			      Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			      TK = VADD(TI, TJ);
+			      TV = VSUB(TI, TJ);
+			      TA = VADD(Tq, Tt);
+			      Tu = VSUB(Tq, Tt);
+			      TM = VSUB(T8, T9);
+			      Ta = VADD(T8, T9);
+			      TN = VSUB(Tb, Tc);
+			      Td = VADD(Tb, Tc);
+			      Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 }
+		    }
+		    TF = VADD(Tz, TA);
+		    TB = VSUB(Tz, TA);
+		    T13 = VFNMS(LDK(KP707106781), TK, TH);
+		    TL = VFMA(LDK(KP707106781), TK, TH);
+		    TO = VFNMS(LDK(KP414213562), TN, TM);
+		    TX = VFMA(LDK(KP414213562), TM, TN);
+		    TC = VADD(Ta, Td);
+		    Te = VSUB(Ta, Td);
+		    TP = VSUB(Tf, Tg);
+		    Th = VADD(Tf, Tg);
+		    TQ = VSUB(Tj, Ti);
+		    Tk = VADD(Ti, Tj);
+		    TW = VFMA(LDK(KP707106781), TV, TU);
+		    T16 = VFNMS(LDK(KP707106781), TV, TU);
+	       }
+	       {
+		    V TY, TR, Tl, TD;
+		    TY = VFMA(LDK(KP414213562), TP, TQ);
+		    TR = VFNMS(LDK(KP414213562), TQ, TP);
+		    Tl = VSUB(Th, Tk);
+		    TD = VADD(Th, Tk);
+		    {
+			 V TS, T17, TZ, T14;
+			 TS = VADD(TO, TR);
+			 T17 = VSUB(TO, TR);
+			 TZ = VSUB(TX, TY);
+			 T14 = VADD(TX, TY);
+			 {
+			      V TE, TG, Tm, Tv;
+			      TE = VSUB(TC, TD);
+			      TG = VADD(TC, TD);
+			      Tm = VADD(Te, Tl);
+			      Tv = VSUB(Te, Tl);
+			      {
+				   V T18, T1a, TT, T11;
+				   T18 = VFMA(LDK(KP923879532), T17, T16);
+				   T1a = VFNMS(LDK(KP923879532), T17, T16);
+				   TT = VFNMS(LDK(KP923879532), TS, TL);
+				   T11 = VFMA(LDK(KP923879532), TS, TL);
+				   {
+					V T15, T19, T10, T12;
+					T15 = VFNMS(LDK(KP923879532), T14, T13);
+					T19 = VFMA(LDK(KP923879532), T14, T13);
+					T10 = VFNMS(LDK(KP923879532), TZ, TW);
+					T12 = VFMA(LDK(KP923879532), TZ, TW);
+					{
+					     V T1b, T1c, T1d, T1e;
+					     T1b = VADD(TF, TG);
+					     STM2(&(xo[0]), T1b, ovs, &(xo[0]));
+					     T1c = VSUB(TF, TG);
+					     STM2(&(xo[16]), T1c, ovs, &(xo[0]));
+					     T1d = VFMAI(TE, TB);
+					     STM2(&(xo[8]), T1d, ovs, &(xo[0]));
+					     T1e = VFNMSI(TE, TB);
+					     STM2(&(xo[24]), T1e, ovs, &(xo[0]));
+					     {
+						  V Tw, Ty, Tn, Tx;
+						  Tw = VFNMS(LDK(KP707106781), Tv, Tu);
+						  Ty = VFMA(LDK(KP707106781), Tv, Tu);
+						  Tn = VFNMS(LDK(KP707106781), Tm, T7);
+						  Tx = VFMA(LDK(KP707106781), Tm, T7);
+						  {
+						       V T1f, T1g, T1h, T1i;
+						       T1f = VFNMSI(T1a, T19);
+						       STM2(&(xo[6]), T1f, ovs, &(xo[2]));
+						       T1g = VFMAI(T1a, T19);
+						       STM2(&(xo[26]), T1g, ovs, &(xo[2]));
+						       STN2(&(xo[24]), T1e, T1g, ovs);
+						       T1h = VFNMSI(T18, T15);
+						       STM2(&(xo[22]), T1h, ovs, &(xo[2]));
+						       T1i = VFMAI(T18, T15);
+						       STM2(&(xo[10]), T1i, ovs, &(xo[2]));
+						       STN2(&(xo[8]), T1d, T1i, ovs);
+						       {
+							    V T1j, T1k, T1l, T1m;
+							    T1j = VFNMSI(T12, T11);
+							    STM2(&(xo[30]), T1j, ovs, &(xo[2]));
+							    T1k = VFMAI(T12, T11);
+							    STM2(&(xo[2]), T1k, ovs, &(xo[2]));
+							    STN2(&(xo[0]), T1b, T1k, ovs);
+							    T1l = VFMAI(T10, TT);
+							    STM2(&(xo[18]), T1l, ovs, &(xo[2]));
+							    STN2(&(xo[16]), T1c, T1l, ovs);
+							    T1m = VFNMSI(T10, TT);
+							    STM2(&(xo[14]), T1m, ovs, &(xo[2]));
+							    {
+								 V T1n, T1o, T1p, T1q;
+								 T1n = VFMAI(Ty, Tx);
+								 STM2(&(xo[4]), T1n, ovs, &(xo[0]));
+								 STN2(&(xo[4]), T1n, T1f, ovs);
+								 T1o = VFNMSI(Ty, Tx);
+								 STM2(&(xo[28]), T1o, ovs, &(xo[0]));
+								 STN2(&(xo[28]), T1o, T1j, ovs);
+								 T1p = VFMAI(Tw, Tn);
+								 STM2(&(xo[20]), T1p, ovs, &(xo[0]));
+								 STN2(&(xo[20]), T1p, T1h, ovs);
+								 T1q = VFNMSI(Tw, Tn);
+								 STM2(&(xo[12]), T1q, ovs, &(xo[0]));
+								 STN2(&(xo[12]), T1q, T1m, ovs);
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2bv_16"), {38, 0, 34, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_16) (planner *p) {
+     X(kdft_register) (p, n2bv_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n2bv_16 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 72 FP additions, 12 FP multiplications,
+ * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
+ * 38 stack variables, 3 constants, and 40 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V Tp, T13, Tu, TY, Tm, T14, Tv, TU, T7, T16, Tx, TN, Te, T17, Ty;
+	       V TQ;
+	       {
+		    V Tn, To, TX, Ts, Tt, TW;
+		    Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    TX = VADD(Tn, To);
+		    Ts = LD(&(xi[0]), ivs, &(xi[0]));
+		    Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    TW = VADD(Ts, Tt);
+		    Tp = VSUB(Tn, To);
+		    T13 = VADD(TW, TX);
+		    Tu = VSUB(Ts, Tt);
+		    TY = VSUB(TW, TX);
+	       }
+	       {
+		    V Ti, TS, Tl, TT;
+		    {
+			 V Tg, Th, Tj, Tk;
+			 Tg = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Th = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Ti = VSUB(Tg, Th);
+			 TS = VADD(Tg, Th);
+			 Tj = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 TT = VADD(Tj, Tk);
+		    }
+		    Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
+		    T14 = VADD(TS, TT);
+		    Tv = VMUL(LDK(KP707106781), VADD(Ti, Tl));
+		    TU = VSUB(TS, TT);
+	       }
+	       {
+		    V T3, TL, T6, TM;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 TL = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TM = VADD(T4, T5);
+		    }
+		    T7 = VFNMS(LDK(KP382683432), T6, VMUL(LDK(KP923879532), T3));
+		    T16 = VADD(TL, TM);
+		    Tx = VFMA(LDK(KP382683432), T3, VMUL(LDK(KP923879532), T6));
+		    TN = VSUB(TL, TM);
+	       }
+	       {
+		    V Ta, TO, Td, TP;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TO = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TP = VADD(Tb, Tc);
+		    }
+		    Te = VFMA(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), Td));
+		    T17 = VADD(TO, TP);
+		    Ty = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
+		    TQ = VSUB(TO, TP);
+	       }
+	       {
+		    V T1b, T1c, T1d, T1e;
+		    {
+			 V T15, T18, T19, T1a;
+			 T15 = VSUB(T13, T14);
+			 T18 = VBYI(VSUB(T16, T17));
+			 T1b = VSUB(T15, T18);
+			 STM2(&(xo[24]), T1b, ovs, &(xo[0]));
+			 T1c = VADD(T15, T18);
+			 STM2(&(xo[8]), T1c, ovs, &(xo[0]));
+			 T19 = VADD(T13, T14);
+			 T1a = VADD(T16, T17);
+			 T1d = VSUB(T19, T1a);
+			 STM2(&(xo[16]), T1d, ovs, &(xo[0]));
+			 T1e = VADD(T19, T1a);
+			 STM2(&(xo[0]), T1e, ovs, &(xo[0]));
+		    }
+		    {
+			 V T1f, T1g, T1h, T1i;
+			 {
+			      V TV, T11, T10, T12, TR, TZ;
+			      TR = VMUL(LDK(KP707106781), VSUB(TN, TQ));
+			      TV = VBYI(VSUB(TR, TU));
+			      T11 = VBYI(VADD(TU, TR));
+			      TZ = VMUL(LDK(KP707106781), VADD(TN, TQ));
+			      T10 = VSUB(TY, TZ);
+			      T12 = VADD(TY, TZ);
+			      T1f = VADD(TV, T10);
+			      STM2(&(xo[12]), T1f, ovs, &(xo[0]));
+			      T1g = VSUB(T12, T11);
+			      STM2(&(xo[28]), T1g, ovs, &(xo[0]));
+			      T1h = VSUB(T10, TV);
+			      STM2(&(xo[20]), T1h, ovs, &(xo[0]));
+			      T1i = VADD(T11, T12);
+			      STM2(&(xo[4]), T1i, ovs, &(xo[0]));
+			 }
+			 {
+			      V Tr, TB, TA, TC;
+			      {
+				   V Tf, Tq, Tw, Tz;
+				   Tf = VSUB(T7, Te);
+				   Tq = VSUB(Tm, Tp);
+				   Tr = VBYI(VSUB(Tf, Tq));
+				   TB = VBYI(VADD(Tq, Tf));
+				   Tw = VSUB(Tu, Tv);
+				   Tz = VSUB(Tx, Ty);
+				   TA = VSUB(Tw, Tz);
+				   TC = VADD(Tw, Tz);
+			      }
+			      {
+				   V T1j, T1k, T1l, T1m;
+				   T1j = VADD(Tr, TA);
+				   STM2(&(xo[10]), T1j, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T1c, T1j, ovs);
+				   T1k = VSUB(TC, TB);
+				   STM2(&(xo[26]), T1k, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1b, T1k, ovs);
+				   T1l = VSUB(TA, Tr);
+				   STM2(&(xo[22]), T1l, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1h, T1l, ovs);
+				   T1m = VADD(TB, TC);
+				   STM2(&(xo[6]), T1m, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1i, T1m, ovs);
+			      }
+			 }
+			 {
+			      V TF, TJ, TI, TK;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = VADD(Tu, Tv);
+				   TE = VADD(T7, Te);
+				   TF = VADD(TD, TE);
+				   TJ = VSUB(TD, TE);
+				   TG = VADD(Tp, Tm);
+				   TH = VADD(Tx, Ty);
+				   TI = VBYI(VADD(TG, TH));
+				   TK = VBYI(VSUB(TH, TG));
+			      }
+			      {
+				   V T1n, T1o, T1p, T1q;
+				   T1n = VSUB(TF, TI);
+				   STM2(&(xo[30]), T1n, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T1g, T1n, ovs);
+				   T1o = VADD(TJ, TK);
+				   STM2(&(xo[14]), T1o, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T1f, T1o, ovs);
+				   T1p = VADD(TF, TI);
+				   STM2(&(xo[2]), T1p, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T1e, T1p, ovs);
+				   T1q = VSUB(TJ, TK);
+				   STM2(&(xo[18]), T1q, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T1d, T1q, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2bv_16"), {68, 8, 4, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_16) (planner *p) {
+     X(kdft_register) (p, n2bv_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:29 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 2 -name n2bv_2 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 5 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2, T3, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VADD(T1, T2);
+	       STM2(&(xo[0]), T3, ovs, &(xo[0]));
+	       T4 = VSUB(T1, T2);
+	       STM2(&(xo[2]), T4, ovs, &(xo[2]));
+	       STN2(&(xo[0]), T3, T4, ovs);
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n2bv_2"), {2, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_2) (planner *p) {
+     X(kdft_register) (p, n2bv_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 2 -name n2bv_2 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 5 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2, T3, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       STM2(&(xo[2]), T3, ovs, &(xo[2]));
+	       T4 = VADD(T1, T2);
+	       STM2(&(xo[0]), T4, ovs, &(xo[0]));
+	       STN2(&(xo[0]), T4, T3, ovs);
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n2bv_2"), {2, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_2) (planner *p) {
+     X(kdft_register) (p, n2bv_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:46 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 104 FP additions, 50 FP multiplications,
+ * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
+ * 79 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T1H, T1I, TS, TA, TN, TV, T1M, T1N, T1O, T1P, T1R, T1S, TK, TU, TR;
+	       V Tl;
+	       {
+		    V T3, TE, T1r, T13, Ta, TL, Tz, TG, Ts, TF, Th, TM, T1u, T1C, T1n;
+		    V T1a, T1m, T1h, T1x, T1D, Tk, Ti;
+		    {
+			 V T1, T2, TC, TD;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 {
+			      V T14, T6, T1c, Tv, Tm, T1f, Ty, T17, T9, Tn, Tp, T1b, Td, Tq, Te;
+			      V Tf, T15, To;
+			      {
+				   V Tw, Tx, T7, T8, Tb, Tc;
+				   {
+					V T4, T5, Tt, Tu, T11, T12;
+					T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					T3 = VSUB(T1, T2);
+					T11 = VADD(T1, T2);
+					TE = VSUB(TC, TD);
+					T12 = VADD(TC, TD);
+					T14 = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1c = VADD(Tt, Tu);
+					Tv = VSUB(Tt, Tu);
+					Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+					T1r = VADD(T11, T12);
+					T13 = VSUB(T11, T12);
+				   }
+				   Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T1f = VADD(Tw, Tx);
+				   Ty = VSUB(Tw, Tx);
+				   T17 = VADD(T7, T8);
+				   T9 = VSUB(T7, T8);
+				   Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+				   Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   T1b = VADD(Tb, Tc);
+				   Td = VSUB(Tb, Tc);
+				   Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      }
+			      Ta = VADD(T6, T9);
+			      TL = VSUB(T6, T9);
+			      T15 = VADD(Tm, Tn);
+			      To = VSUB(Tm, Tn);
+			      Tz = VSUB(Tv, Ty);
+			      TG = VADD(Tv, Ty);
+			      {
+				   V T1d, T1v, T18, Tr, T1e, Tg, T16, T1s;
+				   T1d = VSUB(T1b, T1c);
+				   T1v = VADD(T1b, T1c);
+				   T18 = VADD(Tp, Tq);
+				   Tr = VSUB(Tp, Tq);
+				   T1e = VADD(Te, Tf);
+				   Tg = VSUB(Te, Tf);
+				   T16 = VSUB(T14, T15);
+				   T1s = VADD(T14, T15);
+				   {
+					V T1t, T19, T1w, T1g;
+					T1t = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					Ts = VSUB(To, Tr);
+					TF = VADD(To, Tr);
+					T1w = VADD(T1e, T1f);
+					T1g = VSUB(T1e, T1f);
+					Th = VADD(Td, Tg);
+					TM = VSUB(Td, Tg);
+					T1u = VADD(T1s, T1t);
+					T1C = VSUB(T1s, T1t);
+					T1n = VSUB(T16, T19);
+					T1a = VADD(T16, T19);
+					T1m = VSUB(T1d, T1g);
+					T1h = VADD(T1d, T1g);
+					T1x = VADD(T1v, T1w);
+					T1D = VSUB(T1v, T1w);
+				   }
+			      }
+			 }
+		    }
+		    Tk = VSUB(Ta, Th);
+		    Ti = VADD(Ta, Th);
+		    {
+			 V TJ, T1k, T1A, TZ, Tj, T1E, T1G, TI, T10, T1j, T1z, T1i, T1y, TH;
+			 TJ = VSUB(TF, TG);
+			 TH = VADD(TF, TG);
+			 T1i = VADD(T1a, T1h);
+			 T1k = VSUB(T1a, T1h);
+			 T1y = VADD(T1u, T1x);
+			 T1A = VSUB(T1u, T1x);
+			 TZ = VADD(T3, Ti);
+			 Tj = VFNMS(LDK(KP250000000), Ti, T3);
+			 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
+			 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
+			 TI = VFNMS(LDK(KP250000000), TH, TE);
+			 T10 = VADD(TE, TH);
+			 T1j = VFNMS(LDK(KP250000000), T1i, T13);
+			 T1H = VADD(T1r, T1y);
+			 STM2(&(xo[0]), T1H, ovs, &(xo[0]));
+			 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
+			 T1I = VADD(T13, T1i);
+			 STM2(&(xo[20]), T1I, ovs, &(xo[0]));
+			 {
+			      V T1J, T1K, T1p, T1l, T1o, T1q, T1F, T1B, T1L, T1Q;
+			      TS = VFNMS(LDK(KP618033988), Ts, Tz);
+			      TA = VFMA(LDK(KP618033988), Tz, Ts);
+			      TN = VFMA(LDK(KP618033988), TM, TL);
+			      TV = VFNMS(LDK(KP618033988), TL, TM);
+			      T1J = VFMAI(T10, TZ);
+			      STM2(&(xo[10]), T1J, ovs, &(xo[2]));
+			      T1K = VFNMSI(T10, TZ);
+			      STM2(&(xo[30]), T1K, ovs, &(xo[2]));
+			      T1p = VFMA(LDK(KP559016994), T1k, T1j);
+			      T1l = VFNMS(LDK(KP559016994), T1k, T1j);
+			      T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
+			      T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
+			      T1F = VFNMS(LDK(KP559016994), T1A, T1z);
+			      T1B = VFMA(LDK(KP559016994), T1A, T1z);
+			      T1L = VFNMSI(T1q, T1p);
+			      STM2(&(xo[28]), T1L, ovs, &(xo[0]));
+			      STN2(&(xo[28]), T1L, T1K, ovs);
+			      T1M = VFMAI(T1q, T1p);
+			      STM2(&(xo[12]), T1M, ovs, &(xo[0]));
+			      T1N = VFMAI(T1o, T1l);
+			      STM2(&(xo[36]), T1N, ovs, &(xo[0]));
+			      T1O = VFNMSI(T1o, T1l);
+			      STM2(&(xo[4]), T1O, ovs, &(xo[0]));
+			      T1P = VFMAI(T1E, T1B);
+			      STM2(&(xo[32]), T1P, ovs, &(xo[0]));
+			      T1Q = VFNMSI(T1E, T1B);
+			      STM2(&(xo[8]), T1Q, ovs, &(xo[0]));
+			      STN2(&(xo[8]), T1Q, T1J, ovs);
+			      T1R = VFNMSI(T1G, T1F);
+			      STM2(&(xo[24]), T1R, ovs, &(xo[0]));
+			      T1S = VFMAI(T1G, T1F);
+			      STM2(&(xo[16]), T1S, ovs, &(xo[0]));
+			      TK = VFMA(LDK(KP559016994), TJ, TI);
+			      TU = VFNMS(LDK(KP559016994), TJ, TI);
+			      TR = VFNMS(LDK(KP559016994), Tk, Tj);
+			      Tl = VFMA(LDK(KP559016994), Tk, Tj);
+			 }
+		    }
+	       }
+	       {
+		    V TY, TW, TO, TQ, TB, TP, TX, TT;
+		    TY = VFMA(LDK(KP951056516), TV, TU);
+		    TW = VFNMS(LDK(KP951056516), TV, TU);
+		    TO = VFMA(LDK(KP951056516), TN, TK);
+		    TQ = VFNMS(LDK(KP951056516), TN, TK);
+		    TB = VFNMS(LDK(KP951056516), TA, Tl);
+		    TP = VFMA(LDK(KP951056516), TA, Tl);
+		    TX = VFNMS(LDK(KP951056516), TS, TR);
+		    TT = VFMA(LDK(KP951056516), TS, TR);
+		    {
+			 V T1T, T1U, T1V, T1W;
+			 T1T = VFMAI(TQ, TP);
+			 STM2(&(xo[18]), T1T, ovs, &(xo[2]));
+			 STN2(&(xo[16]), T1S, T1T, ovs);
+			 T1U = VFNMSI(TQ, TP);
+			 STM2(&(xo[22]), T1U, ovs, &(xo[2]));
+			 STN2(&(xo[20]), T1I, T1U, ovs);
+			 T1V = VFMAI(TO, TB);
+			 STM2(&(xo[2]), T1V, ovs, &(xo[2]));
+			 STN2(&(xo[0]), T1H, T1V, ovs);
+			 T1W = VFNMSI(TO, TB);
+			 STM2(&(xo[38]), T1W, ovs, &(xo[2]));
+			 STN2(&(xo[36]), T1N, T1W, ovs);
+			 {
+			      V T1X, T1Y, T1Z, T20;
+			      T1X = VFMAI(TW, TT);
+			      STM2(&(xo[34]), T1X, ovs, &(xo[2]));
+			      STN2(&(xo[32]), T1P, T1X, ovs);
+			      T1Y = VFNMSI(TW, TT);
+			      STM2(&(xo[6]), T1Y, ovs, &(xo[2]));
+			      STN2(&(xo[4]), T1O, T1Y, ovs);
+			      T1Z = VFMAI(TY, TX);
+			      STM2(&(xo[26]), T1Z, ovs, &(xo[2]));
+			      STN2(&(xo[24]), T1R, T1Z, ovs);
+			      T20 = VFNMSI(TY, TX);
+			      STM2(&(xo[14]), T20, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T1M, T20, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_20) (planner *p) {
+     X(kdft_register) (p, n2bv_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 104 FP additions, 24 FP multiplications,
+ * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
+ * 57 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
+	       V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI;
+	       {
+		    V T1, T2, T1g, TF, TG, T1h;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T1g = VADD(T1, T2);
+		    TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+		    T1h = VADD(TF, TG);
+		    T3 = VSUB(T1, T2);
+		    T1y = VADD(T1g, T1h);
+		    TH = VSUB(TF, TG);
+		    T1i = VSUB(T1g, T1h);
+	       }
+	       {
+		    V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
+		    V T1b;
+		    {
+			 V T4, T5, Tt, Tu;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T11 = VADD(T4, T5);
+			 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tv = VSUB(Tt, Tu);
+			 T19 = VADD(Tt, Tu);
+		    }
+		    {
+			 V Tw, Tx, T7, T8;
+			 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Ty = VSUB(Tw, Tx);
+			 T1c = VADD(Tw, Tx);
+			 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T14 = VADD(T7, T8);
+		    }
+		    {
+			 V Tb, Tc, Tm, Tn;
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Td = VSUB(Tb, Tc);
+			 T18 = VADD(Tb, Tc);
+			 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 To = VSUB(Tm, Tn);
+			 T12 = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tp, Tq, Te, Tf;
+			 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tr = VSUB(Tp, Tq);
+			 T15 = VADD(Tp, Tq);
+			 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tg = VSUB(Te, Tf);
+			 T1b = VADD(Te, Tf);
+		    }
+		    Ts = VSUB(To, Tr);
+		    TL = VSUB(T6, T9);
+		    TM = VSUB(Td, Tg);
+		    Tz = VSUB(Tv, Ty);
+		    T13 = VSUB(T11, T12);
+		    T16 = VSUB(T14, T15);
+		    T1j = VADD(T13, T16);
+		    T1u = VADD(T18, T19);
+		    T1v = VADD(T1b, T1c);
+		    T1w = VADD(T1u, T1v);
+		    T1r = VADD(T11, T12);
+		    T1s = VADD(T14, T15);
+		    T1t = VADD(T1r, T1s);
+		    T1a = VSUB(T18, T19);
+		    T1d = VSUB(T1b, T1c);
+		    T1k = VADD(T1a, T1d);
+		    {
+			 V Ta, Th, TC, TD;
+			 Ta = VADD(T6, T9);
+			 Th = VADD(Td, Tg);
+			 Ti = VADD(Ta, Th);
+			 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
+			 TC = VADD(To, Tr);
+			 TD = VADD(Tv, Ty);
+			 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
+			 TI = VADD(TC, TD);
+		    }
+	       }
+	       {
+		    V T1H, T1J, T1K, T1L, T1N, T1I, TZ, T10;
+		    TZ = VADD(T3, Ti);
+		    T10 = VBYI(VADD(TH, TI));
+		    T1H = VSUB(TZ, T10);
+		    STM2(&(xo[30]), T1H, ovs, &(xo[2]));
+		    T1I = VADD(TZ, T10);
+		    STM2(&(xo[10]), T1I, ovs, &(xo[2]));
+		    {
+			 V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B, T1M;
+			 T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
+			 T1z = VADD(T1t, T1w);
+			 T1A = VFNMS(LDK(KP250000000), T1z, T1y);
+			 T1C = VSUB(T1r, T1s);
+			 T1D = VSUB(T1u, T1v);
+			 T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
+			 T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
+			 T1J = VADD(T1y, T1z);
+			 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
+			 T1F = VSUB(T1A, T1x);
+			 T1K = VSUB(T1F, T1G);
+			 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
+			 T1L = VADD(T1G, T1F);
+			 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
+			 T1B = VADD(T1x, T1A);
+			 T1M = VSUB(T1B, T1E);
+			 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
+			 STN2(&(xo[8]), T1M, T1I, ovs);
+			 T1N = VADD(T1E, T1B);
+			 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
+		    }
+		    {
+			 V T1O, T1P, T1R, T1S;
+			 {
+			      V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1Q, T1o;
+			      T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
+			      T1l = VADD(T1j, T1k);
+			      T1m = VFNMS(LDK(KP250000000), T1l, T1i);
+			      T17 = VSUB(T13, T16);
+			      T1e = VSUB(T1a, T1d);
+			      T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
+			      T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
+			      T1O = VADD(T1i, T1l);
+			      STM2(&(xo[20]), T1O, ovs, &(xo[0]));
+			      T1q = VADD(T1n, T1m);
+			      T1P = VADD(T1p, T1q);
+			      STM2(&(xo[12]), T1P, ovs, &(xo[0]));
+			      T1Q = VSUB(T1q, T1p);
+			      STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
+			      STN2(&(xo[28]), T1Q, T1H, ovs);
+			      T1o = VSUB(T1m, T1n);
+			      T1R = VADD(T1f, T1o);
+			      STM2(&(xo[4]), T1R, ovs, &(xo[0]));
+			      T1S = VSUB(T1o, T1f);
+			      STM2(&(xo[36]), T1S, ovs, &(xo[0]));
+			 }
+			 {
+			      V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
+			      TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
+			      TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
+			      TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
+			      TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
+			      TJ = VFNMS(LDK(KP250000000), TI, TH);
+			      TK = VSUB(TE, TJ);
+			      TV = VADD(TE, TJ);
+			      Tj = VFNMS(LDK(KP250000000), Ti, T3);
+			      Tl = VSUB(Tj, Tk);
+			      TR = VADD(Tk, Tj);
+			      {
+				   V TB, TO, T1T, T1U;
+				   TB = VSUB(Tl, TA);
+				   TO = VBYI(VSUB(TK, TN));
+				   T1T = VSUB(TB, TO);
+				   STM2(&(xo[34]), T1T, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T1N, T1T, ovs);
+				   T1U = VADD(TB, TO);
+				   STM2(&(xo[6]), T1U, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1R, T1U, ovs);
+			      }
+			      {
+				   V TX, TY, T1V, T1W;
+				   TX = VADD(TR, TS);
+				   TY = VBYI(VSUB(TV, TU));
+				   T1V = VSUB(TX, TY);
+				   STM2(&(xo[22]), T1V, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1O, T1V, ovs);
+				   T1W = VADD(TX, TY);
+				   STM2(&(xo[18]), T1W, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T1K, T1W, ovs);
+			      }
+			      {
+				   V TP, TQ, T1X, T1Y;
+				   TP = VADD(Tl, TA);
+				   TQ = VBYI(VADD(TN, TK));
+				   T1X = VSUB(TP, TQ);
+				   STM2(&(xo[26]), T1X, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1L, T1X, ovs);
+				   T1Y = VADD(TP, TQ);
+				   STM2(&(xo[14]), T1Y, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T1P, T1Y, ovs);
+			      }
+			      {
+				   V TT, TW, T1Z, T20;
+				   TT = VSUB(TR, TS);
+				   TW = VBYI(VADD(TU, TV));
+				   T1Z = VSUB(TT, TW);
+				   STM2(&(xo[38]), T1Z, ovs, &(xo[2]));
+				   STN2(&(xo[36]), T1S, T1Z, ovs);
+				   T20 = VADD(TT, TW);
+				   STM2(&(xo[2]), T20, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T1J, T20, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_20) (planner *p) {
+     X(kdft_register) (p, n2bv_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:32 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n2bv_32 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 186 FP additions, 98 FP multiplications,
+ * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
+ * 120 stack variables, 7 constants, and 80 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c, T1h, Tr, T3d;
+	       V T3e, T3f, T3g, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z;
+	       V T1c, TZ;
+	       {
+		    V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2O, T2H, T2b, T2t, TY;
+		    V T1w, TT, T1v, T20, T2C, Tj, Te, T2e, To, T2i, T23, T2D, TB, TG, Th;
+		    V T2f, Tk;
+		    {
+			 V TL, TW, TP, TQ, T2F, T27, T28, TO;
+			 {
+			      V T1, T2, T12, T13, T4, T5, T7, T8;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			      T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      {
+				   V TM, T25, T26, TN;
+				   {
+					V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
+					TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					T1R = VADD(T1, T2);
+					T3 = VSUB(T1, T2);
+					T1S = VADD(T12, T13);
+					T14 = VSUB(T12, T13);
+					T1U = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1V = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+					T2x = VSUB(T1R, T1S);
+					T1T = VADD(T1R, T1S);
+					TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					T2K = VSUB(T1U, T1V);
+					T1W = VADD(T1U, T1V);
+					Ta = VADD(T6, T9);
+					T15 = VSUB(T6, T9);
+					T25 = VADD(TJ, TK);
+					TL = VSUB(TJ, TK);
+					T26 = VADD(TV, TU);
+					TW = VSUB(TU, TV);
+					TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					T1p = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					T1A = VFNMS(LDK(KP707106781), T15, T14);
+					T16 = VFMA(LDK(KP707106781), T15, T14);
+					TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   }
+				   T2F = VSUB(T25, T26);
+				   T27 = VADD(T25, T26);
+				   T28 = VADD(TM, TN);
+				   TO = VSUB(TM, TN);
+			      }
+			 }
+			 {
+			      V Ty, T21, Tx, Tz, T1Y, T1Z;
+			      {
+				   V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
+				   Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T29 = VADD(TP, TQ);
+				   TR = VSUB(TP, TQ);
+				   TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+				   Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+				   Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+				   Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+				   T1Y = VADD(Ts, Tt);
+				   Tu = VSUB(Ts, Tt);
+				   {
+					V T2G, T2a, TX, TS;
+					T2G = VSUB(T29, T28);
+					T2a = VADD(T28, T29);
+					TX = VSUB(TR, TO);
+					TS = VADD(TO, TR);
+					T1Z = VADD(TD, TE);
+					TF = VSUB(TD, TE);
+					T21 = VADD(Tv, Tw);
+					Tx = VSUB(Tv, Tw);
+					T2O = VFMA(LDK(KP414213562), T2F, T2G);
+					T2H = VFNMS(LDK(KP414213562), T2G, T2F);
+					T2b = VSUB(T27, T2a);
+					T2t = VADD(T27, T2a);
+					TY = VFMA(LDK(KP707106781), TX, TW);
+					T1w = VFNMS(LDK(KP707106781), TX, TW);
+					TT = VFMA(LDK(KP707106781), TS, TL);
+					T1v = VFNMS(LDK(KP707106781), TS, TL);
+					Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   }
+			      }
+			      T20 = VADD(T1Y, T1Z);
+			      T2C = VSUB(T1Y, T1Z);
+			      {
+				   V Tc, Td, Tm, Tn;
+				   Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+				   Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   {
+					V Tf, TA, T22, Tg;
+					Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					TA = VSUB(Ty, Tz);
+					T22 = VADD(Ty, Tz);
+					Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+					Te = VSUB(Tc, Td);
+					T2e = VADD(Tc, Td);
+					To = VSUB(Tm, Tn);
+					T2i = VADD(Tn, Tm);
+					T23 = VADD(T21, T22);
+					T2D = VSUB(T21, T22);
+					TB = VADD(Tx, TA);
+					TG = VSUB(Tx, TA);
+					Th = VSUB(Tf, Tg);
+					T2f = VADD(Tf, Tg);
+					Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
+			 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
+			 {
+			      V T1X, T2p, T2E, T2N, T2s, T2y, T2g, T17, Ti, T2h, Tl, T2c, T2l, T24;
+			      T1X = VSUB(T1T, T1W);
+			      T2p = VADD(T1T, T1W);
+			      T2E = VFNMS(LDK(KP414213562), T2D, T2C);
+			      T2N = VFMA(LDK(KP414213562), T2C, T2D);
+			      T2s = VADD(T20, T23);
+			      T24 = VSUB(T20, T23);
+			      T1t = VFNMS(LDK(KP707106781), TG, TF);
+			      TH = VFMA(LDK(KP707106781), TG, TF);
+			      T1s = VFNMS(LDK(KP707106781), TB, Tu);
+			      TC = VFMA(LDK(KP707106781), TB, Tu);
+			      T2y = VSUB(T2e, T2f);
+			      T2g = VADD(T2e, T2f);
+			      T17 = VFMA(LDK(KP414213562), Te, Th);
+			      Ti = VFNMS(LDK(KP414213562), Th, Te);
+			      T2h = VADD(Tj, Tk);
+			      Tl = VSUB(Tj, Tk);
+			      T2c = VADD(T24, T2b);
+			      T2l = VSUB(T24, T2b);
+			      {
+				   V T2L, T2A, T2q, T2k;
+				   T2P = VSUB(T2N, T2O);
+				   T2U = VADD(T2N, T2O);
+				   {
+					V T2z, T2j, T18, Tp;
+					T2z = VSUB(T2h, T2i);
+					T2j = VADD(T2h, T2i);
+					T18 = VFMA(LDK(KP414213562), Tl, To);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T2n = VFMA(LDK(KP707106781), T2c, T1X);
+					T2d = VFNMS(LDK(KP707106781), T2c, T1X);
+					T2w = VADD(T2s, T2t);
+					T2u = VSUB(T2s, T2t);
+					T2L = VSUB(T2y, T2z);
+					T2A = VADD(T2y, T2z);
+					T2q = VADD(T2g, T2j);
+					T2k = VSUB(T2g, T2j);
+					T1q = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					T1B = VSUB(Ti, Tp);
+					Tq = VADD(Ti, Tp);
+				   }
+				   T2W = VFNMS(LDK(KP707106781), T2L, T2K);
+				   T2M = VFMA(LDK(KP707106781), T2L, T2K);
+				   T2B = VFMA(LDK(KP707106781), T2A, T2x);
+				   T2T = VFNMS(LDK(KP707106781), T2A, T2x);
+				   T2v = VADD(T2p, T2q);
+				   T2r = VSUB(T2p, T2q);
+				   T2o = VFMA(LDK(KP707106781), T2l, T2k);
+				   T2m = VFNMS(LDK(KP707106781), T2l, T2k);
+				   T2X = VSUB(T2E, T2H);
+				   T2I = VADD(T2E, T2H);
+			      }
+			 }
+			 {
+			      V T2V, T2Z, T2Y, T30, T2R, T2J;
+			      T2V = VFNMS(LDK(KP923879532), T2U, T2T);
+			      T2Z = VFMA(LDK(KP923879532), T2U, T2T);
+			      T31 = VSUB(T2v, T2w);
+			      STM2(&(xo[32]), T31, ovs, &(xo[0]));
+			      T32 = VADD(T2v, T2w);
+			      STM2(&(xo[0]), T32, ovs, &(xo[0]));
+			      T33 = VFMAI(T2u, T2r);
+			      STM2(&(xo[16]), T33, ovs, &(xo[0]));
+			      T34 = VFNMSI(T2u, T2r);
+			      STM2(&(xo[48]), T34, ovs, &(xo[0]));
+			      T35 = VFMAI(T2o, T2n);
+			      STM2(&(xo[8]), T35, ovs, &(xo[0]));
+			      T36 = VFNMSI(T2o, T2n);
+			      STM2(&(xo[56]), T36, ovs, &(xo[0]));
+			      T37 = VFMAI(T2m, T2d);
+			      STM2(&(xo[40]), T37, ovs, &(xo[0]));
+			      T38 = VFNMSI(T2m, T2d);
+			      STM2(&(xo[24]), T38, ovs, &(xo[0]));
+			      T2Y = VFMA(LDK(KP923879532), T2X, T2W);
+			      T30 = VFNMS(LDK(KP923879532), T2X, T2W);
+			      T2R = VFMA(LDK(KP923879532), T2I, T2B);
+			      T2J = VFNMS(LDK(KP923879532), T2I, T2B);
+			      {
+				   V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
+				   T1J = VFNMS(LDK(KP923879532), T1q, T1p);
+				   T1r = VFMA(LDK(KP923879532), T1q, T1p);
+				   T1C = VFNMS(LDK(KP923879532), T1B, T1A);
+				   T1M = VFMA(LDK(KP923879532), T1B, T1A);
+				   T39 = VFNMSI(T30, T2Z);
+				   STM2(&(xo[12]), T39, ovs, &(xo[0]));
+				   T3a = VFMAI(T30, T2Z);
+				   STM2(&(xo[52]), T3a, ovs, &(xo[0]));
+				   T3b = VFNMSI(T2Y, T2V);
+				   STM2(&(xo[44]), T3b, ovs, &(xo[0]));
+				   T3c = VFMAI(T2Y, T2V);
+				   STM2(&(xo[20]), T3c, ovs, &(xo[0]));
+				   T2S = VFMA(LDK(KP923879532), T2P, T2M);
+				   T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
+				   T1u = VFMA(LDK(KP668178637), T1t, T1s);
+				   T1D = VFNMS(LDK(KP668178637), T1s, T1t);
+				   T1E = VFNMS(LDK(KP668178637), T1v, T1w);
+				   T1x = VFMA(LDK(KP668178637), T1w, T1v);
+				   {
+					V T1K, T1F, T1N, T1y;
+					T1h = VFNMS(LDK(KP923879532), Tq, Tb);
+					Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					T3d = VFNMSI(T2S, T2R);
+					STM2(&(xo[60]), T3d, ovs, &(xo[0]));
+					T3e = VFMAI(T2S, T2R);
+					STM2(&(xo[4]), T3e, ovs, &(xo[0]));
+					T3f = VFMAI(T2Q, T2J);
+					STM2(&(xo[36]), T3f, ovs, &(xo[0]));
+					T3g = VFNMSI(T2Q, T2J);
+					STM2(&(xo[28]), T3g, ovs, &(xo[0]));
+					T1K = VADD(T1D, T1E);
+					T1F = VSUB(T1D, T1E);
+					T1N = VSUB(T1u, T1x);
+					T1y = VADD(T1u, T1x);
+					T1a = VFMA(LDK(KP923879532), T19, T16);
+					T1k = VFNMS(LDK(KP923879532), T19, T16);
+					TI = VFNMS(LDK(KP198912367), TH, TC);
+					T1b = VFMA(LDK(KP198912367), TC, TH);
+					T1L = VFMA(LDK(KP831469612), T1K, T1J);
+					T1P = VFNMS(LDK(KP831469612), T1K, T1J);
+					T1I = VFMA(LDK(KP831469612), T1F, T1C);
+					T1G = VFNMS(LDK(KP831469612), T1F, T1C);
+					T1O = VFNMS(LDK(KP831469612), T1N, T1M);
+					T1Q = VFMA(LDK(KP831469612), T1N, T1M);
+					T1H = VFMA(LDK(KP831469612), T1y, T1r);
+					T1z = VFNMS(LDK(KP831469612), T1y, T1r);
+					T1c = VFMA(LDK(KP198912367), TT, TY);
+					TZ = VFNMS(LDK(KP198912367), TY, TT);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1d, T1i, T10, T1l;
+		    {
+			 V T3h, T3i, T3j, T3k;
+			 T3h = VFMAI(T1O, T1L);
+			 STM2(&(xo[42]), T3h, ovs, &(xo[2]));
+			 STN2(&(xo[40]), T37, T3h, ovs);
+			 T3i = VFNMSI(T1O, T1L);
+			 STM2(&(xo[22]), T3i, ovs, &(xo[2]));
+			 STN2(&(xo[20]), T3c, T3i, ovs);
+			 T3j = VFNMSI(T1Q, T1P);
+			 STM2(&(xo[54]), T3j, ovs, &(xo[2]));
+			 STN2(&(xo[52]), T3a, T3j, ovs);
+			 T3k = VFMAI(T1Q, T1P);
+			 STM2(&(xo[10]), T3k, ovs, &(xo[2]));
+			 STN2(&(xo[8]), T35, T3k, ovs);
+			 {
+			      V T3l, T3m, T3n, T3o;
+			      T3l = VFMAI(T1I, T1H);
+			      STM2(&(xo[58]), T3l, ovs, &(xo[2]));
+			      STN2(&(xo[56]), T36, T3l, ovs);
+			      T3m = VFNMSI(T1I, T1H);
+			      STM2(&(xo[6]), T3m, ovs, &(xo[2]));
+			      STN2(&(xo[4]), T3e, T3m, ovs);
+			      T3n = VFMAI(T1G, T1z);
+			      STM2(&(xo[26]), T3n, ovs, &(xo[2]));
+			      STN2(&(xo[24]), T38, T3n, ovs);
+			      T3o = VFNMSI(T1G, T1z);
+			      STM2(&(xo[38]), T3o, ovs, &(xo[2]));
+			      STN2(&(xo[36]), T3f, T3o, ovs);
+			      T1d = VSUB(T1b, T1c);
+			      T1i = VADD(T1b, T1c);
+			      T10 = VADD(TI, TZ);
+			      T1l = VSUB(TI, TZ);
+			 }
+		    }
+		    {
+			 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
+			 T1n = VFMA(LDK(KP980785280), T1i, T1h);
+			 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
+			 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
+			 T1g = VFMA(LDK(KP980785280), T1d, T1a);
+			 T1o = VFNMS(LDK(KP980785280), T1l, T1k);
+			 T1m = VFMA(LDK(KP980785280), T1l, T1k);
+			 T11 = VFNMS(LDK(KP980785280), T10, Tr);
+			 T1f = VFMA(LDK(KP980785280), T10, Tr);
+			 {
+			      V T3p, T3q, T3r, T3s;
+			      T3p = VFNMSI(T1m, T1j);
+			      STM2(&(xo[46]), T3p, ovs, &(xo[2]));
+			      STN2(&(xo[44]), T3b, T3p, ovs);
+			      T3q = VFMAI(T1m, T1j);
+			      STM2(&(xo[18]), T3q, ovs, &(xo[2]));
+			      STN2(&(xo[16]), T33, T3q, ovs);
+			      T3r = VFMAI(T1o, T1n);
+			      STM2(&(xo[50]), T3r, ovs, &(xo[2]));
+			      STN2(&(xo[48]), T34, T3r, ovs);
+			      T3s = VFNMSI(T1o, T1n);
+			      STM2(&(xo[14]), T3s, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T39, T3s, ovs);
+			      {
+				   V T3t, T3u, T3v, T3w;
+				   T3t = VFMAI(T1g, T1f);
+				   STM2(&(xo[2]), T3t, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T32, T3t, ovs);
+				   T3u = VFNMSI(T1g, T1f);
+				   STM2(&(xo[62]), T3u, ovs, &(xo[2]));
+				   STN2(&(xo[60]), T3d, T3u, ovs);
+				   T3v = VFMAI(T1e, T11);
+				   STM2(&(xo[34]), T3v, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T31, T3v, ovs);
+				   T3w = VFNMSI(T1e, T11);
+				   STM2(&(xo[30]), T3w, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T3g, T3w, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2bv_32"), {88, 0, 98, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_32) (planner *p) {
+     X(kdft_register) (p, n2bv_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n2bv_32 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 186 FP additions, 42 FP multiplications,
+ * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
+ * 72 stack variables, 7 constants, and 80 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T2f, T2k, T2N, T2M, T19, T1B, Tb, T1p, TT, T1v, TY, T1w, T2E, T2F, T2G;
+	       V T24, T2o, TC, T1s, TH, T1t, T2B, T2C, T2D, T1X, T2n, T2I, T2J, Tq, T1A;
+	       V T14, T1q, T2c, T2l;
+	       {
+		    V T3, T2i, T18, T2j, T6, T2d, T9, T2e, T15, Ta;
+		    {
+			 V T1, T2, T16, T17;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T2i = VADD(T1, T2);
+			 T16 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T17 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T18 = VSUB(T16, T17);
+			 T2j = VADD(T16, T17);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T2d = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T2e = VADD(T7, T8);
+		    }
+		    T2f = VSUB(T2d, T2e);
+		    T2k = VSUB(T2i, T2j);
+		    T2N = VADD(T2d, T2e);
+		    T2M = VADD(T2i, T2j);
+		    T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
+		    T19 = VSUB(T15, T18);
+		    T1B = VADD(T18, T15);
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tb = VSUB(T3, Ta);
+		    T1p = VADD(T3, Ta);
+	       }
+	       {
+		    V TL, T21, TW, T1Y, TO, T22, TS, T1Z;
+		    {
+			 V TJ, TK, TU, TV;
+			 TJ = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 TK = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 TL = VSUB(TJ, TK);
+			 T21 = VADD(TJ, TK);
+			 TU = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 TV = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 TW = VSUB(TU, TV);
+			 T1Y = VADD(TU, TV);
+		    }
+		    {
+			 V TM, TN, TQ, TR;
+			 TM = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 TN = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 TO = VSUB(TM, TN);
+			 T22 = VADD(TM, TN);
+			 TQ = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TR = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TS = VSUB(TQ, TR);
+			 T1Z = VADD(TQ, TR);
+		    }
+		    {
+			 V TP, TX, T20, T23;
+			 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			 TT = VSUB(TP, TS);
+			 T1v = VADD(TS, TP);
+			 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
+			 TY = VSUB(TW, TX);
+			 T1w = VADD(TW, TX);
+			 T2E = VADD(T1Y, T1Z);
+			 T2F = VADD(T21, T22);
+			 T2G = VSUB(T2E, T2F);
+			 T20 = VSUB(T1Y, T1Z);
+			 T23 = VSUB(T21, T22);
+			 T24 = VFMA(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T23));
+			 T2o = VFNMS(LDK(KP382683432), T20, VMUL(LDK(KP923879532), T23));
+		    }
+	       }
+	       {
+		    V Tu, T1U, TF, T1R, Tx, T1V, TB, T1S;
+		    {
+			 V Ts, Tt, TD, TE;
+			 Ts = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tt = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = VSUB(Ts, Tt);
+			 T1U = VADD(Ts, Tt);
+			 TD = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 TE = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 TF = VSUB(TD, TE);
+			 T1R = VADD(TD, TE);
+		    }
+		    {
+			 V Tv, Tw, Tz, TA;
+			 Tv = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = VSUB(Tv, Tw);
+			 T1V = VADD(Tv, Tw);
+			 Tz = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 TA = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 TB = VSUB(Tz, TA);
+			 T1S = VADD(Tz, TA);
+		    }
+		    {
+			 V Ty, TG, T1T, T1W;
+			 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
+			 TC = VSUB(Ty, TB);
+			 T1s = VADD(TB, Ty);
+			 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
+			 TH = VSUB(TF, TG);
+			 T1t = VADD(TF, TG);
+			 T2B = VADD(T1R, T1S);
+			 T2C = VADD(T1U, T1V);
+			 T2D = VSUB(T2B, T2C);
+			 T1T = VSUB(T1R, T1S);
+			 T1W = VSUB(T1U, T1V);
+			 T1X = VFNMS(LDK(KP382683432), T1W, VMUL(LDK(KP923879532), T1T));
+			 T2n = VFMA(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1W));
+		    }
+	       }
+	       {
+		    V Te, T26, To, T29, Th, T27, Tl, T2a, Ti, Tp;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T26 = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T29 = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T27 = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T2a = VADD(Tj, Tk);
+		    }
+		    T2I = VADD(T26, T27);
+		    T2J = VADD(T29, T2a);
+		    Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+		    Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
+		    Tq = VSUB(Ti, Tp);
+		    T1A = VADD(Ti, Tp);
+		    {
+			 V T12, T13, T28, T2b;
+			 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T14 = VSUB(T12, T13);
+			 T1q = VADD(T12, T13);
+			 T28 = VSUB(T26, T27);
+			 T2b = VSUB(T29, T2a);
+			 T2c = VMUL(LDK(KP707106781), VSUB(T28, T2b));
+			 T2l = VMUL(LDK(KP707106781), VADD(T28, T2b));
+		    }
+	       }
+	       {
+		    V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c;
+		    {
+			 V T2L, T2R, T2Q, T2S;
+			 {
+			      V T2H, T2K, T2O, T2P;
+			      T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
+			      T2K = VSUB(T2I, T2J);
+			      T2L = VBYI(VSUB(T2H, T2K));
+			      T2R = VBYI(VADD(T2K, T2H));
+			      T2O = VSUB(T2M, T2N);
+			      T2P = VMUL(LDK(KP707106781), VADD(T2D, T2G));
+			      T2Q = VSUB(T2O, T2P);
+			      T2S = VADD(T2O, T2P);
+			 }
+			 T31 = VADD(T2L, T2Q);
+			 STM2(&(xo[24]), T31, ovs, &(xo[0]));
+			 T32 = VSUB(T2S, T2R);
+			 STM2(&(xo[56]), T32, ovs, &(xo[0]));
+			 T33 = VSUB(T2Q, T2L);
+			 STM2(&(xo[40]), T33, ovs, &(xo[0]));
+			 T34 = VADD(T2R, T2S);
+			 STM2(&(xo[8]), T34, ovs, &(xo[0]));
+		    }
+		    {
+			 V T2h, T2r, T2q, T2s;
+			 {
+			      V T25, T2g, T2m, T2p;
+			      T25 = VSUB(T1X, T24);
+			      T2g = VSUB(T2c, T2f);
+			      T2h = VBYI(VSUB(T25, T2g));
+			      T2r = VBYI(VADD(T2g, T25));
+			      T2m = VSUB(T2k, T2l);
+			      T2p = VSUB(T2n, T2o);
+			      T2q = VSUB(T2m, T2p);
+			      T2s = VADD(T2m, T2p);
+			 }
+			 T35 = VADD(T2h, T2q);
+			 STM2(&(xo[20]), T35, ovs, &(xo[0]));
+			 T36 = VSUB(T2s, T2r);
+			 STM2(&(xo[52]), T36, ovs, &(xo[0]));
+			 T37 = VSUB(T2q, T2h);
+			 STM2(&(xo[44]), T37, ovs, &(xo[0]));
+			 T38 = VADD(T2r, T2s);
+			 STM2(&(xo[12]), T38, ovs, &(xo[0]));
+		    }
+		    {
+			 V T2V, T2Z, T2Y, T30;
+			 {
+			      V T2T, T2U, T2W, T2X;
+			      T2T = VADD(T2M, T2N);
+			      T2U = VADD(T2I, T2J);
+			      T2V = VSUB(T2T, T2U);
+			      T2Z = VADD(T2T, T2U);
+			      T2W = VADD(T2B, T2C);
+			      T2X = VADD(T2E, T2F);
+			      T2Y = VBYI(VSUB(T2W, T2X));
+			      T30 = VADD(T2W, T2X);
+			 }
+			 T39 = VSUB(T2V, T2Y);
+			 STM2(&(xo[48]), T39, ovs, &(xo[0]));
+			 T3a = VADD(T2Z, T30);
+			 STM2(&(xo[0]), T3a, ovs, &(xo[0]));
+			 T3b = VADD(T2V, T2Y);
+			 STM2(&(xo[16]), T3b, ovs, &(xo[0]));
+			 T3c = VSUB(T2Z, T30);
+			 STM2(&(xo[32]), T3c, ovs, &(xo[0]));
+		    }
+		    {
+			 V T3d, T3e, T3f, T3g;
+			 {
+			      V T2v, T2z, T2y, T2A;
+			      {
+				   V T2t, T2u, T2w, T2x;
+				   T2t = VADD(T2k, T2l);
+				   T2u = VADD(T1X, T24);
+				   T2v = VADD(T2t, T2u);
+				   T2z = VSUB(T2t, T2u);
+				   T2w = VADD(T2f, T2c);
+				   T2x = VADD(T2n, T2o);
+				   T2y = VBYI(VADD(T2w, T2x));
+				   T2A = VBYI(VSUB(T2x, T2w));
+			      }
+			      T3d = VSUB(T2v, T2y);
+			      STM2(&(xo[60]), T3d, ovs, &(xo[0]));
+			      T3e = VADD(T2z, T2A);
+			      STM2(&(xo[28]), T3e, ovs, &(xo[0]));
+			      T3f = VADD(T2v, T2y);
+			      STM2(&(xo[4]), T3f, ovs, &(xo[0]));
+			      T3g = VSUB(T2z, T2A);
+			      STM2(&(xo[36]), T3g, ovs, &(xo[0]));
+			 }
+			 {
+			      V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
+			      T1r = VSUB(T1p, T1q);
+			      T1C = VSUB(T1A, T1B);
+			      T1M = VADD(T1p, T1q);
+			      T1K = VADD(T1B, T1A);
+			      {
+				   V T1D, T1E, T1u, T1x;
+				   T1D = VFNMS(LDK(KP195090322), T1s, VMUL(LDK(KP980785280), T1t));
+				   T1E = VFMA(LDK(KP195090322), T1v, VMUL(LDK(KP980785280), T1w));
+				   T1F = VSUB(T1D, T1E);
+				   T1N = VADD(T1D, T1E);
+				   T1u = VFMA(LDK(KP980785280), T1s, VMUL(LDK(KP195090322), T1t));
+				   T1x = VFNMS(LDK(KP195090322), T1w, VMUL(LDK(KP980785280), T1v));
+				   T1y = VSUB(T1u, T1x);
+				   T1J = VADD(T1u, T1x);
+			      }
+			      {
+				   V T1z, T1G, T3h, T3i;
+				   T1z = VADD(T1r, T1y);
+				   T1G = VBYI(VADD(T1C, T1F));
+				   T3h = VSUB(T1z, T1G);
+				   STM2(&(xo[50]), T3h, ovs, &(xo[2]));
+				   STN2(&(xo[48]), T39, T3h, ovs);
+				   T3i = VADD(T1z, T1G);
+				   STM2(&(xo[14]), T3i, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T38, T3i, ovs);
+			      }
+			      {
+				   V T1P, T1Q, T3j, T3k;
+				   T1P = VBYI(VADD(T1K, T1J));
+				   T1Q = VADD(T1M, T1N);
+				   T3j = VADD(T1P, T1Q);
+				   STM2(&(xo[2]), T3j, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T3a, T3j, ovs);
+				   T3k = VSUB(T1Q, T1P);
+				   STM2(&(xo[62]), T3k, ovs, &(xo[2]));
+				   STN2(&(xo[60]), T3d, T3k, ovs);
+			      }
+			      {
+				   V T1H, T1I, T3l, T3m;
+				   T1H = VSUB(T1r, T1y);
+				   T1I = VBYI(VSUB(T1F, T1C));
+				   T3l = VSUB(T1H, T1I);
+				   STM2(&(xo[46]), T3l, ovs, &(xo[2]));
+				   STN2(&(xo[44]), T37, T3l, ovs);
+				   T3m = VADD(T1H, T1I);
+				   STM2(&(xo[18]), T3m, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T3b, T3m, ovs);
+			      }
+			      {
+				   V T1L, T1O, T3n, T3o;
+				   T1L = VBYI(VSUB(T1J, T1K));
+				   T1O = VSUB(T1M, T1N);
+				   T3n = VADD(T1L, T1O);
+				   STM2(&(xo[30]), T3n, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T3e, T3n, ovs);
+				   T3o = VSUB(T1O, T1L);
+				   STM2(&(xo[34]), T3o, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T3c, T3o, ovs);
+			      }
+			 }
+			 {
+			      V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
+			      Tr = VSUB(Tb, Tq);
+			      T1a = VSUB(T14, T19);
+			      T1k = VADD(Tb, Tq);
+			      T1i = VADD(T19, T14);
+			      {
+				   V T1b, T1c, TI, TZ;
+				   T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
+				   T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
+				   T1d = VSUB(T1b, T1c);
+				   T1l = VADD(T1b, T1c);
+				   TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
+				   TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
+				   T10 = VSUB(TI, TZ);
+				   T1h = VADD(TI, TZ);
+			      }
+			      {
+				   V T11, T1e, T3p, T3q;
+				   T11 = VADD(Tr, T10);
+				   T1e = VBYI(VADD(T1a, T1d));
+				   T3p = VSUB(T11, T1e);
+				   STM2(&(xo[54]), T3p, ovs, &(xo[2]));
+				   STN2(&(xo[52]), T36, T3p, ovs);
+				   T3q = VADD(T11, T1e);
+				   STM2(&(xo[10]), T3q, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T34, T3q, ovs);
+			      }
+			      {
+				   V T1n, T1o, T3r, T3s;
+				   T1n = VBYI(VADD(T1i, T1h));
+				   T1o = VADD(T1k, T1l);
+				   T3r = VADD(T1n, T1o);
+				   STM2(&(xo[6]), T3r, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T3f, T3r, ovs);
+				   T3s = VSUB(T1o, T1n);
+				   STM2(&(xo[58]), T3s, ovs, &(xo[2]));
+				   STN2(&(xo[56]), T32, T3s, ovs);
+			      }
+			      {
+				   V T1f, T1g, T3t, T3u;
+				   T1f = VSUB(Tr, T10);
+				   T1g = VBYI(VSUB(T1d, T1a));
+				   T3t = VSUB(T1f, T1g);
+				   STM2(&(xo[42]), T3t, ovs, &(xo[2]));
+				   STN2(&(xo[40]), T33, T3t, ovs);
+				   T3u = VADD(T1f, T1g);
+				   STM2(&(xo[22]), T3u, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T35, T3u, ovs);
+			      }
+			      {
+				   V T1j, T1m, T3v, T3w;
+				   T1j = VBYI(VSUB(T1h, T1i));
+				   T1m = VSUB(T1k, T1l);
+				   T3v = VADD(T1j, T1m);
+				   STM2(&(xo[26]), T3v, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T31, T3v, ovs);
+				   T3w = VSUB(T1m, T1j);
+				   STM2(&(xo[38]), T3w, ovs, &(xo[2]));
+				   STN2(&(xo[36]), T3g, T3w, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2bv_32"), {170, 26, 16, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_32) (planner *p) {
+     X(kdft_register) (p, n2bv_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:29 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 4 -name n2bv_4 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 8 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 2 fused multiply/add),
+ * 15 stack variables, 0 constants, and 10 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T1, T2, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, T7, T6, T8;
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    T8 = VADD(T4, T5);
+		    {
+			 V T9, Ta, Tb, Tc;
+			 T9 = VSUB(T7, T8);
+			 STM2(&(xo[4]), T9, ovs, &(xo[0]));
+			 Ta = VADD(T7, T8);
+			 STM2(&(xo[0]), Ta, ovs, &(xo[0]));
+			 Tb = VFMAI(T6, T3);
+			 STM2(&(xo[2]), Tb, ovs, &(xo[2]));
+			 STN2(&(xo[0]), Ta, Tb, ovs);
+			 Tc = VFNMSI(T6, T3);
+			 STM2(&(xo[6]), Tc, ovs, &(xo[2]));
+			 STN2(&(xo[4]), T9, Tc, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2bv_4"), {6, 0, 2, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_4) (planner *p) {
+     X(kdft_register) (p, n2bv_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 4 -name n2bv_4 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 8 FP additions, 0 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 0 fused multiply/add),
+ * 11 stack variables, 0 constants, and 10 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T3, T7, T6, T8;
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    T8 = VADD(T4, T5);
+	       }
+	       {
+		    V T9, Ta, Tb, Tc;
+		    T9 = VSUB(T3, T6);
+		    STM2(&(xo[6]), T9, ovs, &(xo[2]));
+		    Ta = VADD(T7, T8);
+		    STM2(&(xo[0]), Ta, ovs, &(xo[0]));
+		    Tb = VADD(T3, T6);
+		    STM2(&(xo[2]), Tb, ovs, &(xo[2]));
+		    STN2(&(xo[0]), Ta, Tb, ovs);
+		    Tc = VSUB(T7, T8);
+		    STM2(&(xo[4]), Tc, ovs, &(xo[0]));
+		    STN2(&(xo[4]), Tc, T9, ovs);
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2bv_4"), {8, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_4) (planner *p) {
+     X(kdft_register) (p, n2bv_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:29 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 6 -name n2bv_6 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 18 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 6 fused multiply/add),
+ * 29 stack variables, 2 constants, and 15 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T1, T2, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Td, T6, Te, T9, Tf;
+		    T3 = VSUB(T1, T2);
+		    Td = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+		    {
+			 V Tg, Ti, Ta, Tc;
+			 Tg = VADD(Te, Tf);
+			 Ti = VMUL(LDK(KP866025403), VSUB(Te, Tf));
+			 Ta = VADD(T6, T9);
+			 Tc = VMUL(LDK(KP866025403), VSUB(T6, T9));
+			 {
+			      V Th, Tj, Tb, Tk;
+			      Th = VFNMS(LDK(KP500000000), Tg, Td);
+			      Tj = VADD(Td, Tg);
+			      STM2(&(xo[0]), Tj, ovs, &(xo[0]));
+			      Tb = VFNMS(LDK(KP500000000), Ta, T3);
+			      Tk = VADD(T3, Ta);
+			      STM2(&(xo[6]), Tk, ovs, &(xo[2]));
+			      {
+				   V Tl, Tm, Tn, To;
+				   Tl = VFMAI(Ti, Th);
+				   STM2(&(xo[8]), Tl, ovs, &(xo[0]));
+				   Tm = VFNMSI(Ti, Th);
+				   STM2(&(xo[4]), Tm, ovs, &(xo[0]));
+				   STN2(&(xo[4]), Tm, Tk, ovs);
+				   Tn = VFNMSI(Tc, Tb);
+				   STM2(&(xo[10]), Tn, ovs, &(xo[2]));
+				   STN2(&(xo[8]), Tl, Tn, ovs);
+				   To = VFMAI(Tc, Tb);
+				   STM2(&(xo[2]), To, ovs, &(xo[2]));
+				   STN2(&(xo[0]), Tj, To, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n2bv_6"), {12, 2, 6, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_6) (planner *p) {
+     X(kdft_register) (p, n2bv_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 6 -name n2bv_6 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 18 FP additions, 4 FP multiplications,
+ * (or, 16 additions, 2 multiplications, 2 fused multiply/add),
+ * 25 stack variables, 2 constants, and 15 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V Ta, Td, T3, Te, T6, Tf, Tb, Tg, T8, T9, Tj, Tk;
+	       T8 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T9 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       Ta = VSUB(T8, T9);
+	       Td = VADD(T8, T9);
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T3 = VSUB(T1, T2);
+		    Te = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VSUB(T4, T5);
+		    Tf = VADD(T4, T5);
+	       }
+	       Tb = VADD(T3, T6);
+	       Tg = VADD(Te, Tf);
+	       Tj = VADD(Ta, Tb);
+	       STM2(&(xo[6]), Tj, ovs, &(xo[2]));
+	       Tk = VADD(Td, Tg);
+	       STM2(&(xo[0]), Tk, ovs, &(xo[0]));
+	       {
+		    V Tm, T7, Tc, Tl;
+		    T7 = VBYI(VMUL(LDK(KP866025403), VSUB(T3, T6)));
+		    Tc = VFNMS(LDK(KP500000000), Tb, Ta);
+		    Tl = VADD(T7, Tc);
+		    STM2(&(xo[2]), Tl, ovs, &(xo[2]));
+		    STN2(&(xo[0]), Tk, Tl, ovs);
+		    Tm = VSUB(Tc, T7);
+		    STM2(&(xo[10]), Tm, ovs, &(xo[2]));
+		    {
+			 V Th, Ti, Tn, To;
+			 Th = VFNMS(LDK(KP500000000), Tg, Td);
+			 Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Te, Tf)));
+			 Tn = VSUB(Th, Ti);
+			 STM2(&(xo[4]), Tn, ovs, &(xo[0]));
+			 STN2(&(xo[4]), Tn, Tj, ovs);
+			 To = VADD(Ti, Th);
+			 STM2(&(xo[8]), To, ovs, &(xo[0]));
+			 STN2(&(xo[8]), To, Tm, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n2bv_6"), {16, 2, 2, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_6) (planner *p) {
+     X(kdft_register) (p, n2bv_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1815 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:36 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 456 FP additions, 258 FP multiplications,
+ * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
+ * 178 stack variables, 15 constants, and 160 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T7z, T7A, T7B, T7C, T5T, T5S, T5X, T65, T8a, T8b, T8e, T8g, T5Z, T5R, T67;
+	       V T63, T5U, T64;
+	       {
+		    V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
+		    V Tm, T3A, T3i, T29, TC, T5p, T4o, T6D, T6e, T3l, T3B, TR, T2a, T4x, T5q;
+		    V T6h, T6E, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
+		    V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
+		    V T6m, T6Y, T5L, T4T;
+		    {
+			 V T4g, T4l, T3g, Tu, Tx, T4h, TA, T4i;
+			 {
+			      V T1, T2, T23, T24, T4, T5, T20, T21;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			      T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			      T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      {
+				   V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
+				   {
+					V T8, T43, T3, T45, T25, T5i, T6, T44, T22, T9, Ti, Tj, Tb, Tc;
+					T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T43 = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T45 = VSUB(T23, T24);
+					T25 = VADD(T23, T24);
+					T5i = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T44 = VSUB(T20, T21);
+					T22 = VADD(T20, T21);
+					T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					{
+					     V T2T, T46, T5j, T2U;
+					     T7 = VSUB(T3, T6);
+					     T2T = VADD(T3, T6);
+					     T46 = VADD(T44, T45);
+					     T5j = VSUB(T44, T45);
+					     T26 = VSUB(T22, T25);
+					     T2U = VADD(T22, T25);
+					     Ta = VADD(T8, T9);
+					     T48 = VSUB(T8, T9);
+					     Tk = VADD(Ti, Tj);
+					     T4c = VSUB(Tj, Ti);
+					     T5k = VFMA(LDK(KP707106781), T5j, T5i);
+					     T6A = VFNMS(LDK(KP707106781), T5j, T5i);
+					     T47 = VFMA(LDK(KP707106781), T46, T43);
+					     T69 = VFNMS(LDK(KP707106781), T46, T43);
+					     T2V = VADD(T2T, T2U);
+					     T3z = VSUB(T2T, T2U);
+					     T49 = VSUB(Tb, Tc);
+					     Td = VADD(Tb, Tc);
+					}
+					Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+				   }
+				   {
+					V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
+					V Tp;
+					To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+					Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+					{
+					     V Th, T4b, Tr, Ts;
+					     Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     Te = VSUB(Ta, Td);
+					     T2W = VADD(Ta, Td);
+					     T5l = VFMA(LDK(KP414213562), T48, T49);
+					     T4a = VFNMS(LDK(KP414213562), T49, T48);
+					     Th = VADD(Tf, Tg);
+					     T4b = VSUB(Tf, Tg);
+					     Tq = VADD(To, Tp);
+					     T4g = VSUB(To, Tp);
+					     T4l = VSUB(Tr, Ts);
+					     Tt = VADD(Tr, Ts);
+					     Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+					     T5m = VFMA(LDK(KP414213562), T4b, T4c);
+					     T4d = VFNMS(LDK(KP414213562), T4c, T4b);
+					     Tl = VSUB(Th, Tk);
+					     T2X = VADD(Th, Tk);
+					     Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+					     Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					}
+					T3g = VADD(Tq, Tt);
+					Tu = VSUB(Tq, Tt);
+					Tx = VADD(Tv, Tw);
+					T4h = VSUB(Tv, Tw);
+					T6B = VSUB(T4a, T4d);
+					T4e = VADD(T4a, T4d);
+					T6a = VADD(T5l, T5m);
+					T5n = VSUB(T5l, T5m);
+					T3M = VSUB(T2W, T2X);
+					T2Y = VADD(T2W, T2X);
+					T27 = VSUB(Te, Tl);
+					Tm = VADD(Te, Tl);
+					TA = VADD(Ty, Tz);
+					T4i = VSUB(Ty, Tz);
+				   }
+			      }
+			 }
+			 {
+			      V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3j, TJ, TF, TI;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+				   TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+				   TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+				   TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+				   {
+					V T3h, TB, T4j, T4m;
+					T3h = VADD(Tx, TA);
+					TB = VSUB(Tx, TA);
+					T4j = VADD(T4h, T4i);
+					T4m = VSUB(T4h, T4i);
+					T4p = VSUB(TD, TE);
+					TF = VADD(TD, TE);
+					T4u = VSUB(TH, TG);
+					TI = VADD(TG, TH);
+					T3A = VSUB(T3g, T3h);
+					T3i = VADD(T3g, T3h);
+					T29 = VFMA(LDK(KP414213562), Tu, TB);
+					TC = VFNMS(LDK(KP414213562), TB, Tu);
+					T4k = VFMA(LDK(KP707106781), T4j, T4g);
+					T6d = VFNMS(LDK(KP707106781), T4j, T4g);
+					T4n = VFMA(LDK(KP707106781), T4m, T4l);
+					T6c = VFNMS(LDK(KP707106781), T4m, T4l);
+					TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   }
+				   TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      }
+			      T3j = VADD(TF, TI);
+			      TJ = VSUB(TF, TI);
+			      {
+				   V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
+				   {
+					V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
+					{
+					     V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
+					     T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+					     T5p = VFMA(LDK(KP198912367), T4k, T4n);
+					     T4o = VFNMS(LDK(KP198912367), T4n, T4k);
+					     T6D = VFMA(LDK(KP668178637), T6c, T6d);
+					     T6e = VFNMS(LDK(KP668178637), T6d, T6c);
+					     TM = VADD(TK, TL);
+					     T4r = VSUB(TK, TL);
+					     TP = VADD(TN, TO);
+					     T4q = VSUB(TN, TO);
+					     T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					     T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					     T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
+						  T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						  T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						  T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3k, TQ, T4s, T4v;
+						       T3k = VADD(TP, TM);
+						       TQ = VSUB(TM, TP);
+						       T4s = VADD(T4q, T4r);
+						       T4v = VSUB(T4r, T4q);
+						       T4V = VSUB(T1r, T1s);
+						       T1t = VADD(T1r, T1s);
+						       T58 = VSUB(T1v, T1u);
+						       T1w = VADD(T1u, T1v);
+						       T4X = VSUB(T1O, T1P);
+						       T1Q = VADD(T1O, T1P);
+						       T3l = VADD(T3j, T3k);
+						       T3B = VSUB(T3j, T3k);
+						       TR = VFNMS(LDK(KP414213562), TQ, TJ);
+						       T2a = VFMA(LDK(KP414213562), TJ, TQ);
+						       T6g = VFNMS(LDK(KP707106781), T4s, T4p);
+						       T4t = VFMA(LDK(KP707106781), T4s, T4p);
+						       T6f = VFNMS(LDK(KP707106781), T4v, T4u);
+						       T4w = VFMA(LDK(KP707106781), T4v, T4u);
+						       T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  {
+						       V T4W, T1A, T50, T51, T1D, T1F, T1G;
+						       {
+							    V T1y, T1z, T1B, T1C;
+							    T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+							    T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+							    T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+							    T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+							    T4x = VFNMS(LDK(KP198912367), T4w, T4t);
+							    T5q = VFMA(LDK(KP198912367), T4t, T4w);
+							    T6h = VFNMS(LDK(KP668178637), T6g, T6f);
+							    T6E = VFMA(LDK(KP668178637), T6f, T6g);
+							    T4W = VSUB(T1R, T1S);
+							    T1T = VADD(T1R, T1S);
+							    T1A = VADD(T1y, T1z);
+							    T50 = VSUB(T1y, T1z);
+							    T51 = VSUB(T1C, T1B);
+							    T1D = VADD(T1B, T1C);
+						       }
+						       T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+						       T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+						       T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						       T4Y = VADD(T4W, T4X);
+						       T59 = VSUB(T4X, T4W);
+						       T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+						       T3a = VADD(T1A, T1D);
+						       T1E = VSUB(T1A, T1D);
+						       T52 = VFMA(LDK(KP414213562), T51, T50);
+						       T5b = VFNMS(LDK(KP414213562), T50, T51);
+						       T53 = VSUB(T1F, T1G);
+						       T1H = VADD(T1F, T1G);
+						  }
+					     }
+					}
+					{
+					     V T37, T54, T1K, T38;
+					     T1x = VSUB(T1t, T1w);
+					     T37 = VADD(T1t, T1w);
+					     T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
+					     T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
+					     T54 = VSUB(T1J, T1I);
+					     T1K = VADD(T1I, T1J);
+					     T6u = VFNMS(LDK(KP707106781), T59, T58);
+					     T5a = VFMA(LDK(KP707106781), T59, T58);
+					     T38 = VADD(T1T, T1Q);
+					     T1U = VSUB(T1Q, T1T);
+					     T55 = VFNMS(LDK(KP414213562), T54, T53);
+					     T5c = VFMA(LDK(KP414213562), T53, T54);
+					     T1L = VSUB(T1H, T1K);
+					     T3b = VADD(T1H, T1K);
+					     T39 = VADD(T37, T38);
+					     T3H = VSUB(T37, T38);
+					}
+				   }
+				   {
+					V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
+					V T1d;
+					{
+					     V TU, TV, TX, TY, T56, T6v;
+					     TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T56 = VADD(T52, T55);
+					     T6v = VSUB(T55, T52);
+					     {
+						  V T5d, T6s, T1V, T1M;
+						  T5d = VADD(T5b, T5c);
+						  T6s = VSUB(T5c, T5b);
+						  T1V = VSUB(T1L, T1E);
+						  T1M = VADD(T1E, T1L);
+						  T3I = VSUB(T3b, T3a);
+						  T3c = VADD(T3a, T3b);
+						  T5N = VFNMS(LDK(KP923879532), T56, T4Z);
+						  T57 = VFMA(LDK(KP923879532), T56, T4Z);
+						  T72 = VFNMS(LDK(KP923879532), T6v, T6u);
+						  T6w = VFMA(LDK(KP923879532), T6v, T6u);
+						  T5O = VFNMS(LDK(KP923879532), T5d, T5a);
+						  T5e = VFMA(LDK(KP923879532), T5d, T5a);
+						  T71 = VFMA(LDK(KP923879532), T6s, T6r);
+						  T6t = VFNMS(LDK(KP923879532), T6s, T6r);
+						  T2y = VFNMS(LDK(KP707106781), T1V, T1U);
+						  T1W = VFMA(LDK(KP707106781), T1V, T1U);
+						  T2x = VFNMS(LDK(KP707106781), T1M, T1x);
+						  T1N = VFMA(LDK(KP707106781), T1M, T1x);
+						  TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1h, T1i, T1k, T1l;
+						  T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+						  T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+						  T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T11, T4B, T4C, T12, T14, T15;
+						       T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+						       T4A = VSUB(TU, TV);
+						       TW = VADD(TU, TV);
+						       T4N = VSUB(TX, TY);
+						       TZ = VADD(TX, TY);
+						       T1j = VADD(T1h, T1i);
+						       T4B = VSUB(T1h, T1i);
+						       T1m = VADD(T1k, T1l);
+						       T4C = VSUB(T1k, T1l);
+						       T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+						       T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						       T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+						       {
+							    V T18, T19, T1b, T1c;
+							    T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+							    T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+							    T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+							    T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+							    T4O = VSUB(T4B, T4C);
+							    T4D = VADD(T4B, T4C);
+							    T13 = VADD(T11, T12);
+							    T4F = VSUB(T11, T12);
+							    T16 = VADD(T14, T15);
+							    T4G = VSUB(T14, T15);
+							    T1a = VADD(T18, T19);
+							    T4I = VSUB(T18, T19);
+							    T4J = VSUB(T1b, T1c);
+							    T1d = VADD(T1b, T1c);
+						       }
+						  }
+					     }
+					}
+					{
+					     V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
+					     T30 = VADD(TW, TZ);
+					     T10 = VSUB(TW, TZ);
+					     T6k = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T33 = VADD(T13, T16);
+					     T17 = VSUB(T13, T16);
+					     T6n = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T34 = VADD(T1a, T1d);
+					     T1e = VSUB(T1a, T1d);
+					     T4K = VFMA(LDK(KP414213562), T4J, T4I);
+					     T4R = VFNMS(LDK(KP414213562), T4I, T4J);
+					     T1n = VSUB(T1j, T1m);
+					     T31 = VADD(T1j, T1m);
+					     {
+						  V T1f, T1o, T6o, T4L, T4S, T6l;
+						  T1f = VADD(T17, T1e);
+						  T1o = VSUB(T17, T1e);
+						  T6o = VSUB(T4H, T4K);
+						  T4L = VADD(T4H, T4K);
+						  T4S = VADD(T4Q, T4R);
+						  T6l = VSUB(T4Q, T4R);
+						  T3E = VSUB(T30, T31);
+						  T32 = VADD(T30, T31);
+						  T1p = VFMA(LDK(KP707106781), T1o, T1n);
+						  T2v = VFNMS(LDK(KP707106781), T1o, T1n);
+						  T1g = VFMA(LDK(KP707106781), T1f, T10);
+						  T2u = VFNMS(LDK(KP707106781), T1f, T10);
+						  T4M = VFMA(LDK(KP923879532), T4L, T4E);
+						  T5K = VFNMS(LDK(KP923879532), T4L, T4E);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6n);
+						  T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
+						  T6m = VFNMS(LDK(KP923879532), T6l, T6k);
+						  T6Y = VFMA(LDK(KP923879532), T6l, T6k);
+						  T5L = VFNMS(LDK(KP923879532), T4S, T4P);
+						  T4T = VFMA(LDK(KP923879532), T4S, T4P);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T6b, T6F, T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7f;
+			 V T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
+			 {
+			      V T2Z, T3r, T3s, T3m, T3d, T3v;
+			      T2Z = VSUB(T2V, T2Y);
+			      T3r = VADD(T2V, T2Y);
+			      T3s = VADD(T3i, T3l);
+			      T3m = VSUB(T3i, T3l);
+			      T3d = VSUB(T39, T3c);
+			      T3v = VADD(T39, T3c);
+			      {
+				   V T3x, T3t, T3Q, T3J, T3D, T3V, T3G, T3P, T3u, T36, T3O, T3Y, T6V, T6W;
+				   {
+					V T3N, T3C, T3F, T35;
+					T3N = VSUB(T3A, T3B);
+					T3C = VADD(T3A, T3B);
+					T3F = VSUB(T33, T34);
+					T35 = VADD(T33, T34);
+					T3x = VADD(T3r, T3s);
+					T3t = VSUB(T3r, T3s);
+					T3Q = VFMA(LDK(KP414213562), T3H, T3I);
+					T3J = VFNMS(LDK(KP414213562), T3I, T3H);
+					T3D = VFMA(LDK(KP707106781), T3C, T3z);
+					T3V = VFNMS(LDK(KP707106781), T3C, T3z);
+					T3G = VFNMS(LDK(KP414213562), T3F, T3E);
+					T3P = VFMA(LDK(KP414213562), T3E, T3F);
+					T3u = VADD(T32, T35);
+					T36 = VSUB(T32, T35);
+					T3O = VFMA(LDK(KP707106781), T3N, T3M);
+					T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
+				   }
+				   T6b = VFNMS(LDK(KP923879532), T6a, T69);
+				   T6V = VFMA(LDK(KP923879532), T6a, T69);
+				   T6W = VADD(T6D, T6E);
+				   T6F = VSUB(T6D, T6E);
+				   {
+					V T3R, T3W, T3K, T3Z;
+					T3R = VSUB(T3P, T3Q);
+					T3W = VADD(T3P, T3Q);
+					T3K = VADD(T3G, T3J);
+					T3Z = VSUB(T3G, T3J);
+					{
+					     V T3e, T3n, T3w, T3y;
+					     T3e = VADD(T36, T3d);
+					     T3n = VSUB(T36, T3d);
+					     T3w = VSUB(T3u, T3v);
+					     T3y = VADD(T3u, T3v);
+					     {
+						  V T41, T3X, T3S, T3U;
+						  T41 = VFMA(LDK(KP923879532), T3W, T3V);
+						  T3X = VFNMS(LDK(KP923879532), T3W, T3V);
+						  T3S = VFNMS(LDK(KP923879532), T3R, T3O);
+						  T3U = VFMA(LDK(KP923879532), T3R, T3O);
+						  {
+						       V T42, T40, T3L, T3T;
+						       T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
+						       T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
+						       T3L = VFNMS(LDK(KP923879532), T3K, T3D);
+						       T3T = VFMA(LDK(KP923879532), T3K, T3D);
+						       {
+							    V T3o, T3q, T3f, T3p;
+							    T3o = VFNMS(LDK(KP707106781), T3n, T3m);
+							    T3q = VFMA(LDK(KP707106781), T3n, T3m);
+							    T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
+							    T3p = VFMA(LDK(KP707106781), T3e, T2Z);
+							    T7n = VSUB(T3x, T3y);
+							    STM2(&(xo[64]), T7n, ovs, &(xo[0]));
+							    T7o = VADD(T3x, T3y);
+							    STM2(&(xo[0]), T7o, ovs, &(xo[0]));
+							    T7p = VFMAI(T3w, T3t);
+							    STM2(&(xo[32]), T7p, ovs, &(xo[0]));
+							    T7q = VFNMSI(T3w, T3t);
+							    STM2(&(xo[96]), T7q, ovs, &(xo[0]));
+							    T7r = VFNMSI(T40, T3X);
+							    STM2(&(xo[88]), T7r, ovs, &(xo[0]));
+							    T7s = VFMAI(T40, T3X);
+							    STM2(&(xo[40]), T7s, ovs, &(xo[0]));
+							    T7t = VFMAI(T42, T41);
+							    STM2(&(xo[104]), T7t, ovs, &(xo[0]));
+							    T7u = VFNMSI(T42, T41);
+							    STM2(&(xo[24]), T7u, ovs, &(xo[0]));
+							    T7v = VFMAI(T3U, T3T);
+							    STM2(&(xo[8]), T7v, ovs, &(xo[0]));
+							    T7w = VFNMSI(T3U, T3T);
+							    STM2(&(xo[120]), T7w, ovs, &(xo[0]));
+							    T7x = VFMAI(T3S, T3L);
+							    STM2(&(xo[72]), T7x, ovs, &(xo[0]));
+							    T7y = VFNMSI(T3S, T3L);
+							    STM2(&(xo[56]), T7y, ovs, &(xo[0]));
+							    T7z = VFNMSI(T3q, T3p);
+							    STM2(&(xo[112]), T7z, ovs, &(xo[0]));
+							    T7A = VFMAI(T3q, T3p);
+							    STM2(&(xo[16]), T7A, ovs, &(xo[0]));
+							    T7B = VFMAI(T3o, T3f);
+							    STM2(&(xo[80]), T7B, ovs, &(xo[0]));
+							    T7C = VFNMSI(T3o, T3f);
+							    STM2(&(xo[48]), T7C, ovs, &(xo[0]));
+							    T7f = VFNMS(LDK(KP831469612), T6W, T6V);
+							    T6X = VFMA(LDK(KP831469612), T6W, T6V);
+						       }
+						  }
+					     }
+					}
+				   }
+				   T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
+				   T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
+				   T7a = VFNMS(LDK(KP303346683), T71, T72);
+				   T73 = VFMA(LDK(KP303346683), T72, T71);
+				   T6C = VFMA(LDK(KP923879532), T6B, T6A);
+				   T76 = VFNMS(LDK(KP923879532), T6B, T6A);
+				   T77 = VSUB(T6e, T6h);
+				   T6i = VADD(T6e, T6h);
+			      }
+			 }
+			 {
+			      V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T7L, T7O, T7Q, T7S, T5r, T5I, T5x;
+			      V T5h, T5F, T5B;
+			      {
+				   V TT, T2f, T7E, T7F, T7I, T7K, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
+				   {
+					V T1q, T2d, T7h, T7l, T2e, T1X, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
+					T2r = VFNMS(LDK(KP707106781), Tm, T7);
+					Tn = VFMA(LDK(KP707106781), Tm, T7);
+					TS = VADD(TC, TR);
+					T2D = VSUB(TC, TR);
+					{
+					     V T7b, T7j, T74, T7i, T78, T7g;
+					     T1q = VFNMS(LDK(KP198912367), T1p, T1g);
+					     T2d = VFMA(LDK(KP198912367), T1g, T1p);
+					     T7g = VADD(T79, T7a);
+					     T7b = VSUB(T79, T7a);
+					     T7j = VSUB(T70, T73);
+					     T74 = VADD(T70, T73);
+					     T7i = VFNMS(LDK(KP831469612), T77, T76);
+					     T78 = VFMA(LDK(KP831469612), T77, T76);
+					     T2j = VFNMS(LDK(KP923879532), TS, Tn);
+					     TT = VFMA(LDK(KP923879532), TS, Tn);
+					     T7h = VFMA(LDK(KP956940335), T7g, T7f);
+					     T7l = VFNMS(LDK(KP956940335), T7g, T7f);
+					     T2e = VFMA(LDK(KP198912367), T1N, T1W);
+					     T1X = VFNMS(LDK(KP198912367), T1W, T1N);
+					     T75 = VFNMS(LDK(KP956940335), T74, T6X);
+					     T7d = VFMA(LDK(KP956940335), T74, T6X);
+					     T7m = VFMA(LDK(KP956940335), T7j, T7i);
+					     T7k = VFNMS(LDK(KP956940335), T7j, T7i);
+					     T7c = VFNMS(LDK(KP956940335), T7b, T78);
+					     T7e = VFMA(LDK(KP956940335), T7b, T78);
+					}
+					T2k = VADD(T2d, T2e);
+					T2f = VSUB(T2d, T2e);
+					{
+					     V T7D, T7G, T7H, T7J;
+					     T7D = VFMAI(T7k, T7h);
+					     STM2(&(xo[90]), T7D, ovs, &(xo[2]));
+					     STN2(&(xo[88]), T7r, T7D, ovs);
+					     T7E = VFNMSI(T7k, T7h);
+					     STM2(&(xo[38]), T7E, ovs, &(xo[2]));
+					     T7F = VFNMSI(T7m, T7l);
+					     STM2(&(xo[102]), T7F, ovs, &(xo[2]));
+					     T7G = VFMAI(T7m, T7l);
+					     STM2(&(xo[26]), T7G, ovs, &(xo[2]));
+					     STN2(&(xo[24]), T7u, T7G, ovs);
+					     T7H = VFMAI(T7e, T7d);
+					     STM2(&(xo[122]), T7H, ovs, &(xo[2]));
+					     STN2(&(xo[120]), T7w, T7H, ovs);
+					     T7I = VFNMSI(T7e, T7d);
+					     STM2(&(xo[6]), T7I, ovs, &(xo[2]));
+					     T7J = VFMAI(T7c, T75);
+					     STM2(&(xo[58]), T7J, ovs, &(xo[2]));
+					     STN2(&(xo[56]), T7y, T7J, ovs);
+					     T7K = VFNMSI(T7c, T75);
+					     STM2(&(xo[70]), T7K, ovs, &(xo[2]));
+					     T2n = VSUB(T1q, T1X);
+					     T1Y = VADD(T1q, T1X);
+					}
+					T2C = VFNMS(LDK(KP707106781), T27, T26);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T2b = VSUB(T29, T2a);
+					T2s = VADD(T29, T2a);
+				   }
+				   T2l = VFNMS(LDK(KP980785280), T2k, T2j);
+				   T2p = VFMA(LDK(KP980785280), T2k, T2j);
+				   {
+					V T5z, T4z, T5A, T5g;
+					{
+					     V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
+					     T5H = VFNMS(LDK(KP923879532), T4e, T47);
+					     T4f = VFMA(LDK(KP923879532), T4e, T47);
+					     T4y = VADD(T4o, T4x);
+					     T5T = VSUB(T4o, T4x);
+					     T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
+					     T2h = VFMA(LDK(KP980785280), T1Y, TT);
+					     T4U = VFNMS(LDK(KP098491403), T4T, T4M);
+					     T5t = VFMA(LDK(KP098491403), T4M, T4T);
+					     T2m = VFNMS(LDK(KP923879532), T2b, T28);
+					     T2c = VFMA(LDK(KP923879532), T2b, T28);
+					     T5u = VFMA(LDK(KP098491403), T57, T5e);
+					     T5f = VFNMS(LDK(KP098491403), T5e, T57);
+					     T5z = VFNMS(LDK(KP980785280), T4y, T4f);
+					     T4z = VFMA(LDK(KP980785280), T4y, T4f);
+					     T5S = VFNMS(LDK(KP923879532), T5n, T5k);
+					     T5o = VFMA(LDK(KP923879532), T5n, T5k);
+					     {
+						  V T2o, T2q, T2i, T2g;
+						  T2o = VFMA(LDK(KP980785280), T2n, T2m);
+						  T2q = VFNMS(LDK(KP980785280), T2n, T2m);
+						  T2i = VFMA(LDK(KP980785280), T2f, T2c);
+						  T2g = VFNMS(LDK(KP980785280), T2f, T2c);
+						  T5A = VADD(T5t, T5u);
+						  T5v = VSUB(T5t, T5u);
+						  T5D = VSUB(T4U, T5f);
+						  T5g = VADD(T4U, T5f);
+						  T7L = VFNMSI(T2o, T2l);
+						  STM2(&(xo[92]), T7L, ovs, &(xo[0]));
+						  {
+						       V T7M, T7N, T7P, T7R;
+						       T7M = VFMAI(T2o, T2l);
+						       STM2(&(xo[36]), T7M, ovs, &(xo[0]));
+						       STN2(&(xo[36]), T7M, T7E, ovs);
+						       T7N = VFMAI(T2q, T2p);
+						       STM2(&(xo[100]), T7N, ovs, &(xo[0]));
+						       STN2(&(xo[100]), T7N, T7F, ovs);
+						       T7O = VFNMSI(T2q, T2p);
+						       STM2(&(xo[28]), T7O, ovs, &(xo[0]));
+						       T7P = VFMAI(T2i, T2h);
+						       STM2(&(xo[4]), T7P, ovs, &(xo[0]));
+						       STN2(&(xo[4]), T7P, T7I, ovs);
+						       T7Q = VFNMSI(T2i, T2h);
+						       STM2(&(xo[124]), T7Q, ovs, &(xo[0]));
+						       T7R = VFMAI(T2g, T1Z);
+						       STM2(&(xo[68]), T7R, ovs, &(xo[0]));
+						       STN2(&(xo[68]), T7R, T7K, ovs);
+						       T7S = VFNMSI(T2g, T1Z);
+						       STM2(&(xo[60]), T7S, ovs, &(xo[0]));
+						       T5r = VSUB(T5p, T5q);
+						       T5I = VADD(T5p, T5q);
+						  }
+					     }
+					}
+					T5x = VFMA(LDK(KP995184726), T5g, T4z);
+					T5h = VFNMS(LDK(KP995184726), T5g, T4z);
+					T5F = VFMA(LDK(KP995184726), T5A, T5z);
+					T5B = VFNMS(LDK(KP995184726), T5A, T5z);
+				   }
+			      }
+			      {
+				   V T6J, T6R, T6L, T6z, T6T, T6P;
+				   {
+					V T6N, T6j, T6O, T6y;
+					{
+					     V T6q, T6H, T5C, T5s, T6I, T6x;
+					     T6q = VFNMS(LDK(KP534511135), T6p, T6m);
+					     T6H = VFMA(LDK(KP534511135), T6m, T6p);
+					     T5C = VFNMS(LDK(KP980785280), T5r, T5o);
+					     T5s = VFMA(LDK(KP980785280), T5r, T5o);
+					     T6I = VFMA(LDK(KP534511135), T6t, T6w);
+					     T6x = VFNMS(LDK(KP534511135), T6w, T6t);
+					     T6N = VFMA(LDK(KP831469612), T6i, T6b);
+					     T6j = VFNMS(LDK(KP831469612), T6i, T6b);
+					     {
+						  V T5E, T5G, T5y, T5w;
+						  T5E = VFMA(LDK(KP995184726), T5D, T5C);
+						  T5G = VFNMS(LDK(KP995184726), T5D, T5C);
+						  T5y = VFMA(LDK(KP995184726), T5v, T5s);
+						  T5w = VFNMS(LDK(KP995184726), T5v, T5s);
+						  T6O = VADD(T6H, T6I);
+						  T6J = VSUB(T6H, T6I);
+						  T6R = VSUB(T6q, T6x);
+						  T6y = VADD(T6q, T6x);
+						  {
+						       V T7T, T7U, T7V, T7W;
+						       T7T = VFNMSI(T5E, T5B);
+						       STM2(&(xo[94]), T7T, ovs, &(xo[2]));
+						       STN2(&(xo[92]), T7L, T7T, ovs);
+						       T7U = VFMAI(T5E, T5B);
+						       STM2(&(xo[34]), T7U, ovs, &(xo[2]));
+						       STN2(&(xo[32]), T7p, T7U, ovs);
+						       T7V = VFMAI(T5G, T5F);
+						       STM2(&(xo[98]), T7V, ovs, &(xo[2]));
+						       STN2(&(xo[96]), T7q, T7V, ovs);
+						       T7W = VFNMSI(T5G, T5F);
+						       STM2(&(xo[30]), T7W, ovs, &(xo[2]));
+						       STN2(&(xo[28]), T7O, T7W, ovs);
+						       {
+							    V T7X, T7Y, T7Z, T80;
+							    T7X = VFMAI(T5y, T5x);
+							    STM2(&(xo[2]), T7X, ovs, &(xo[2]));
+							    STN2(&(xo[0]), T7o, T7X, ovs);
+							    T7Y = VFNMSI(T5y, T5x);
+							    STM2(&(xo[126]), T7Y, ovs, &(xo[2]));
+							    STN2(&(xo[124]), T7Q, T7Y, ovs);
+							    T7Z = VFMAI(T5w, T5h);
+							    STM2(&(xo[66]), T7Z, ovs, &(xo[2]));
+							    STN2(&(xo[64]), T7n, T7Z, ovs);
+							    T80 = VFNMSI(T5w, T5h);
+							    STM2(&(xo[62]), T80, ovs, &(xo[2]));
+							    STN2(&(xo[60]), T7S, T80, ovs);
+						       }
+						  }
+					     }
+					}
+					T6L = VFMA(LDK(KP881921264), T6y, T6j);
+					T6z = VFNMS(LDK(KP881921264), T6y, T6j);
+					T6T = VFMA(LDK(KP881921264), T6O, T6N);
+					T6P = VFNMS(LDK(KP881921264), T6O, T6N);
+				   }
+				   {
+					V T2H, T2P, T81, T84, T86, T88, T2J, T2B, T2R, T2N;
+					{
+					     V T2L, T2t, T2M, T2A;
+					     {
+						  V T2w, T2F, T6Q, T6G, T2G, T2z;
+						  T2w = VFMA(LDK(KP668178637), T2v, T2u);
+						  T2F = VFNMS(LDK(KP668178637), T2u, T2v);
+						  T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
+						  T6G = VFMA(LDK(KP831469612), T6F, T6C);
+						  T2G = VFNMS(LDK(KP668178637), T2x, T2y);
+						  T2z = VFMA(LDK(KP668178637), T2y, T2x);
+						  T2L = VFNMS(LDK(KP923879532), T2s, T2r);
+						  T2t = VFMA(LDK(KP923879532), T2s, T2r);
+						  {
+						       V T6S, T6U, T6M, T6K;
+						       T6S = VFMA(LDK(KP881921264), T6R, T6Q);
+						       T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
+						       T6M = VFMA(LDK(KP881921264), T6J, T6G);
+						       T6K = VFNMS(LDK(KP881921264), T6J, T6G);
+						       T2M = VADD(T2F, T2G);
+						       T2H = VSUB(T2F, T2G);
+						       T2P = VSUB(T2w, T2z);
+						       T2A = VADD(T2w, T2z);
+						       T81 = VFNMSI(T6S, T6P);
+						       STM2(&(xo[86]), T81, ovs, &(xo[2]));
+						       {
+							    V T82, T83, T85, T87;
+							    T82 = VFMAI(T6S, T6P);
+							    STM2(&(xo[42]), T82, ovs, &(xo[2]));
+							    STN2(&(xo[40]), T7s, T82, ovs);
+							    T83 = VFMAI(T6U, T6T);
+							    STM2(&(xo[106]), T83, ovs, &(xo[2]));
+							    STN2(&(xo[104]), T7t, T83, ovs);
+							    T84 = VFNMSI(T6U, T6T);
+							    STM2(&(xo[22]), T84, ovs, &(xo[2]));
+							    T85 = VFMAI(T6M, T6L);
+							    STM2(&(xo[10]), T85, ovs, &(xo[2]));
+							    STN2(&(xo[8]), T7v, T85, ovs);
+							    T86 = VFNMSI(T6M, T6L);
+							    STM2(&(xo[118]), T86, ovs, &(xo[2]));
+							    T87 = VFMAI(T6K, T6z);
+							    STM2(&(xo[74]), T87, ovs, &(xo[2]));
+							    STN2(&(xo[72]), T7x, T87, ovs);
+							    T88 = VFNMSI(T6K, T6z);
+							    STM2(&(xo[54]), T88, ovs, &(xo[2]));
+						       }
+						  }
+					     }
+					     T2J = VFMA(LDK(KP831469612), T2A, T2t);
+					     T2B = VFNMS(LDK(KP831469612), T2A, T2t);
+					     T2R = VFNMS(LDK(KP831469612), T2M, T2L);
+					     T2N = VFMA(LDK(KP831469612), T2M, T2L);
+					}
+					{
+					     V T61, T5J, T62, T5Q;
+					     {
+						  V T5M, T5V, T2O, T2E, T5W, T5P;
+						  T5M = VFMA(LDK(KP820678790), T5L, T5K);
+						  T5V = VFNMS(LDK(KP820678790), T5K, T5L);
+						  T2O = VFMA(LDK(KP923879532), T2D, T2C);
+						  T2E = VFNMS(LDK(KP923879532), T2D, T2C);
+						  T5W = VFNMS(LDK(KP820678790), T5N, T5O);
+						  T5P = VFMA(LDK(KP820678790), T5O, T5N);
+						  T61 = VFNMS(LDK(KP980785280), T5I, T5H);
+						  T5J = VFMA(LDK(KP980785280), T5I, T5H);
+						  {
+						       V T2Q, T2S, T2K, T2I;
+						       T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
+						       T2S = VFMA(LDK(KP831469612), T2P, T2O);
+						       T2K = VFMA(LDK(KP831469612), T2H, T2E);
+						       T2I = VFNMS(LDK(KP831469612), T2H, T2E);
+						       T62 = VADD(T5V, T5W);
+						       T5X = VSUB(T5V, T5W);
+						       T65 = VSUB(T5M, T5P);
+						       T5Q = VADD(T5M, T5P);
+						       {
+							    V T89, T8c, T8d, T8f;
+							    T89 = VFMAI(T2Q, T2N);
+							    STM2(&(xo[84]), T89, ovs, &(xo[0]));
+							    STN2(&(xo[84]), T89, T81, ovs);
+							    T8a = VFNMSI(T2Q, T2N);
+							    STM2(&(xo[44]), T8a, ovs, &(xo[0]));
+							    T8b = VFNMSI(T2S, T2R);
+							    STM2(&(xo[108]), T8b, ovs, &(xo[0]));
+							    T8c = VFMAI(T2S, T2R);
+							    STM2(&(xo[20]), T8c, ovs, &(xo[0]));
+							    STN2(&(xo[20]), T8c, T84, ovs);
+							    T8d = VFMAI(T2K, T2J);
+							    STM2(&(xo[116]), T8d, ovs, &(xo[0]));
+							    STN2(&(xo[116]), T8d, T86, ovs);
+							    T8e = VFNMSI(T2K, T2J);
+							    STM2(&(xo[12]), T8e, ovs, &(xo[0]));
+							    T8f = VFMAI(T2I, T2B);
+							    STM2(&(xo[52]), T8f, ovs, &(xo[0]));
+							    STN2(&(xo[52]), T8f, T88, ovs);
+							    T8g = VFNMSI(T2I, T2B);
+							    STM2(&(xo[76]), T8g, ovs, &(xo[0]));
+						       }
+						  }
+					     }
+					     T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
+					     T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
+					     T67 = VFNMS(LDK(KP773010453), T62, T61);
+					     T63 = VFMA(LDK(KP773010453), T62, T61);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5U = VFNMS(LDK(KP980785280), T5T, T5S);
+	       T64 = VFMA(LDK(KP980785280), T5T, T5S);
+	       {
+		    V T68, T66, T5Y, T60;
+		    T68 = VFMA(LDK(KP773010453), T65, T64);
+		    T66 = VFNMS(LDK(KP773010453), T65, T64);
+		    T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
+		    T60 = VFMA(LDK(KP773010453), T5X, T5U);
+		    {
+			 V T8h, T8i, T8j, T8k;
+			 T8h = VFMAI(T66, T63);
+			 STM2(&(xo[82]), T8h, ovs, &(xo[2]));
+			 STN2(&(xo[80]), T7B, T8h, ovs);
+			 T8i = VFNMSI(T66, T63);
+			 STM2(&(xo[46]), T8i, ovs, &(xo[2]));
+			 STN2(&(xo[44]), T8a, T8i, ovs);
+			 T8j = VFNMSI(T68, T67);
+			 STM2(&(xo[110]), T8j, ovs, &(xo[2]));
+			 STN2(&(xo[108]), T8b, T8j, ovs);
+			 T8k = VFMAI(T68, T67);
+			 STM2(&(xo[18]), T8k, ovs, &(xo[2]));
+			 STN2(&(xo[16]), T7A, T8k, ovs);
+			 {
+			      V T8l, T8m, T8n, T8o;
+			      T8l = VFMAI(T60, T5Z);
+			      STM2(&(xo[114]), T8l, ovs, &(xo[2]));
+			      STN2(&(xo[112]), T7z, T8l, ovs);
+			      T8m = VFNMSI(T60, T5Z);
+			      STM2(&(xo[14]), T8m, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T8e, T8m, ovs);
+			      T8n = VFMAI(T5Y, T5R);
+			      STM2(&(xo[50]), T8n, ovs, &(xo[2]));
+			      STN2(&(xo[48]), T7C, T8n, ovs);
+			      T8o = VFNMSI(T5Y, T5R);
+			      STM2(&(xo[78]), T8o, ovs, &(xo[2]));
+			      STN2(&(xo[76]), T8g, T8o, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_64) (planner *p) {
+     X(kdft_register) (p, n2bv_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 456 FP additions, 124 FP multiplications,
+ * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
+ * 128 stack variables, 15 constants, and 160 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
+	       V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
+	       V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
+	       V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
+	       V T53, T5P, T3e, T3W;
+	       {
+		    V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
+		    {
+			 V T1, T2, T2n, T2o;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T4n = VADD(T1, T2);
+			 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T2p = VSUB(T2n, T2o);
+			 T4o = VADD(T2n, T2o);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T5s = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T5t = VADD(T7, T8);
+		    }
+		    T4p = VSUB(T4n, T4o);
+		    T5u = VSUB(T5s, T5t);
+		    {
+			 V Ta, T2m, T6E, T6F;
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tb = VSUB(T3, Ta);
+			 T3A = VADD(T3, Ta);
+			 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
+			 T2q = VSUB(T2m, T2p);
+			 T3v = VADD(T2p, T2m);
+			 T6E = VADD(T4n, T4o);
+			 T6F = VADD(T5s, T5t);
+			 T6G = VSUB(T6E, T6F);
+			 T78 = VADD(T6E, T6F);
+		    }
+	       }
+	       {
+		    V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T4q = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T4t = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T4r = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T4u = VADD(Tj, Tk);
+		    }
+		    {
+			 V Ti, Tp, T6z, T6A;
+			 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
+			 Tq = VSUB(Ti, Tp);
+			 T3w = VADD(Ti, Tp);
+			 T6z = VADD(T4q, T4r);
+			 T6A = VADD(T4t, T4u);
+			 T6B = VSUB(T6z, T6A);
+			 T79 = VADD(T6z, T6A);
+		    }
+		    {
+			 V T2j, T2k, T4s, T4v;
+			 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T2l = VSUB(T2j, T2k);
+			 T3B = VADD(T2j, T2k);
+			 T4s = VSUB(T4q, T4r);
+			 T4v = VSUB(T4t, T4u);
+			 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
+			 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
+		    }
+	       }
+	       {
+		    V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
+		    {
+			 V Tz, TA, TD, TE;
+			 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 TB = VSUB(Tz, TA);
+			 T4z = VADD(Tz, TA);
+			 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 TF = VSUB(TD, TE);
+			 T4y = VADD(TD, TE);
+			 {
+			      V Ts, Tt, Tu, Tv, Tw, Tx;
+			      Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      Tu = VSUB(Ts, Tt);
+			      Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      Tx = VSUB(Tv, Tw);
+			      Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
+			      T4C = VADD(Tv, Tw);
+			      TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
+			      T4B = VADD(Ts, Tt);
+			 }
+		    }
+		    {
+			 V TC, TH, T6s, T6t;
+			 TC = VSUB(Ty, TB);
+			 TH = VSUB(TF, TG);
+			 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
+			 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
+			 T6s = VADD(T4y, T4z);
+			 T6t = VADD(T4B, T4C);
+			 T6u = VSUB(T6s, T6t);
+			 T74 = VADD(T6s, T6t);
+		    }
+		    {
+			 V T3o, T3p, T4A, T4D;
+			 T3o = VADD(TB, Ty);
+			 T3p = VADD(TF, TG);
+			 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
+			 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
+			 T4A = VSUB(T4y, T4z);
+			 T4D = VSUB(T4B, T4C);
+			 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
+			 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
+		    }
+	       }
+	       {
+		    V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
+		    {
+			 V TQ, TR, TU, TV;
+			 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 TS = VSUB(TQ, TR);
+			 T4J = VADD(TQ, TR);
+			 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 TW = VSUB(TU, TV);
+			 T4I = VADD(TU, TV);
+			 {
+			      V TJ, TK, TL, TM, TN, TO;
+			      TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      TL = VSUB(TJ, TK);
+			      TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      TO = VSUB(TM, TN);
+			      TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			      T4G = VADD(TM, TN);
+			      TX = VMUL(LDK(KP707106781), VADD(TL, TO));
+			      T4F = VADD(TJ, TK);
+			 }
+		    }
+		    {
+			 V TT, TY, T6v, T6w;
+			 TT = VSUB(TP, TS);
+			 TY = VSUB(TW, TX);
+			 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
+			 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
+			 T6v = VADD(T4I, T4J);
+			 T6w = VADD(T4F, T4G);
+			 T6x = VSUB(T6v, T6w);
+			 T75 = VADD(T6v, T6w);
+		    }
+		    {
+			 V T3r, T3s, T4H, T4K;
+			 T3r = VADD(TS, TP);
+			 T3s = VADD(TW, TX);
+			 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
+			 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
+			 T4H = VSUB(T4F, T4G);
+			 T4K = VSUB(T4I, T4J);
+			 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
+			 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
+		    }
+	       }
+	       {
+		    V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
+		    V T1Q, T5a, T2a;
+		    {
+			 V T1Z, T20, T24, T25;
+			 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T21 = VSUB(T1Z, T20);
+			 T5h = VADD(T1Z, T20);
+			 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T26 = VSUB(T24, T25);
+			 T5g = VADD(T24, T25);
+		    }
+		    {
+			 V T1S, T1T, T1U, T1V, T1W, T1X;
+			 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			 T1U = VSUB(T1S, T1T);
+			 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 T1X = VSUB(T1V, T1W);
+			 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
+			 T5d = VADD(T1V, T1W);
+			 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
+			 T5c = VADD(T1S, T1T);
+		    }
+		    {
+			 V T1F, T1I, T1M, T1P;
+			 {
+			      V T1D, T1E, T1G, T1H;
+			      T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			      T1F = VSUB(T1D, T1E);
+			      T55 = VADD(T1D, T1E);
+			      T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T1I = VSUB(T1G, T1H);
+			      T56 = VADD(T1G, T1H);
+			 }
+			 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
+			 T57 = VSUB(T55, T56);
+			 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
+			 {
+			      V T1K, T1L, T1N, T1O;
+			      T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			      T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			      T1M = VSUB(T1K, T1L);
+			      T58 = VADD(T1K, T1L);
+			      T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T1P = VSUB(T1N, T1O);
+			      T59 = VADD(T1N, T1O);
+			 }
+			 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
+			 T5a = VSUB(T58, T59);
+			 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
+		    }
+		    {
+			 V T1R, T22, T6k, T6l;
+			 T1R = VSUB(T1J, T1Q);
+			 T22 = VSUB(T1Y, T21);
+			 T23 = VSUB(T1R, T22);
+			 T2N = VADD(T22, T1R);
+			 T6k = VADD(T5g, T5h);
+			 T6l = VADD(T5c, T5d);
+			 T6m = VSUB(T6k, T6l);
+			 T70 = VADD(T6k, T6l);
+		    }
+		    {
+			 V T6n, T6o, T28, T2b;
+			 T6n = VADD(T55, T56);
+			 T6o = VADD(T58, T59);
+			 T6p = VSUB(T6n, T6o);
+			 T71 = VADD(T6n, T6o);
+			 T28 = VSUB(T26, T27);
+			 T2b = VSUB(T29, T2a);
+			 T2c = VSUB(T28, T2b);
+			 T2O = VADD(T28, T2b);
+		    }
+		    {
+			 V T3g, T3h, T5b, T5e;
+			 T3g = VADD(T26, T27);
+			 T3h = VADD(T1J, T1Q);
+			 T3i = VADD(T3g, T3h);
+			 T3Y = VSUB(T3g, T3h);
+			 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
+			 T5e = VSUB(T5c, T5d);
+			 T5f = VSUB(T5b, T5e);
+			 T5R = VADD(T5e, T5b);
+		    }
+		    {
+			 V T5i, T5j, T3j, T3k;
+			 T5i = VSUB(T5g, T5h);
+			 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
+			 T5k = VSUB(T5i, T5j);
+			 T5S = VADD(T5i, T5j);
+			 T3j = VADD(T21, T1Y);
+			 T3k = VADD(T29, T2a);
+			 T3l = VADD(T3j, T3k);
+			 T3Z = VSUB(T3k, T3j);
+		    }
+	       }
+	       {
+		    V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
+		    V T1f, T4T, T1z;
+		    {
+			 V T1o, T1p, T1t, T1u;
+			 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T1q = VSUB(T1o, T1p);
+			 T50 = VADD(T1o, T1p);
+			 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T1v = VSUB(T1t, T1u);
+			 T4Z = VADD(T1t, T1u);
+		    }
+		    {
+			 V T1h, T1i, T1j, T1k, T1l, T1m;
+			 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			 T1j = VSUB(T1h, T1i);
+			 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 T1m = VSUB(T1k, T1l);
+			 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
+			 T4W = VADD(T1k, T1l);
+			 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
+			 T4V = VADD(T1h, T1i);
+		    }
+		    {
+			 V T14, T17, T1b, T1e;
+			 {
+			      V T12, T13, T15, T16;
+			      T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			      T14 = VSUB(T12, T13);
+			      T4O = VADD(T12, T13);
+			      T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      T17 = VSUB(T15, T16);
+			      T4P = VADD(T15, T16);
+			 }
+			 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
+			 T4Q = VSUB(T4O, T4P);
+			 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
+			 {
+			      V T19, T1a, T1c, T1d;
+			      T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			      T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			      T1b = VSUB(T19, T1a);
+			      T4R = VADD(T19, T1a);
+			      T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      T1e = VSUB(T1c, T1d);
+			      T4S = VADD(T1c, T1d);
+			 }
+			 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
+			 T4T = VSUB(T4R, T4S);
+			 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
+		    }
+		    {
+			 V T1g, T1r, T6d, T6e;
+			 T1g = VSUB(T18, T1f);
+			 T1r = VSUB(T1n, T1q);
+			 T1s = VSUB(T1g, T1r);
+			 T2K = VADD(T1r, T1g);
+			 T6d = VADD(T4Z, T50);
+			 T6e = VADD(T4V, T4W);
+			 T6f = VSUB(T6d, T6e);
+			 T6X = VADD(T6d, T6e);
+		    }
+		    {
+			 V T6g, T6h, T1x, T1A;
+			 T6g = VADD(T4O, T4P);
+			 T6h = VADD(T4R, T4S);
+			 T6i = VSUB(T6g, T6h);
+			 T6Y = VADD(T6g, T6h);
+			 T1x = VSUB(T1v, T1w);
+			 T1A = VSUB(T1y, T1z);
+			 T1B = VSUB(T1x, T1A);
+			 T2L = VADD(T1x, T1A);
+		    }
+		    {
+			 V T39, T3a, T4U, T4X;
+			 T39 = VADD(T1v, T1w);
+			 T3a = VADD(T18, T1f);
+			 T3b = VADD(T39, T3a);
+			 T3V = VSUB(T39, T3a);
+			 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
+			 T4X = VSUB(T4V, T4W);
+			 T4Y = VSUB(T4U, T4X);
+			 T5O = VADD(T4X, T4U);
+		    }
+		    {
+			 V T51, T52, T3c, T3d;
+			 T51 = VSUB(T4Z, T50);
+			 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
+			 T53 = VSUB(T51, T52);
+			 T5P = VADD(T51, T52);
+			 T3c = VADD(T1q, T1n);
+			 T3d = VADD(T1y, T1z);
+			 T3e = VADD(T3c, T3d);
+			 T3W = VSUB(T3d, T3c);
+		    }
+	       }
+	       {
+		    V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
+		    V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
+		    {
+			 V T7h, T7l, T7k, T7m;
+			 {
+			      V T7f, T7g, T7i, T7j;
+			      T7f = VADD(T78, T79);
+			      T7g = VADD(T74, T75);
+			      T7h = VSUB(T7f, T7g);
+			      T7l = VADD(T7f, T7g);
+			      T7i = VADD(T6X, T6Y);
+			      T7j = VADD(T70, T71);
+			      T7k = VBYI(VSUB(T7i, T7j));
+			      T7m = VADD(T7i, T7j);
+			 }
+			 T7n = VSUB(T7h, T7k);
+			 STM2(&(xo[96]), T7n, ovs, &(xo[0]));
+			 T7o = VADD(T7l, T7m);
+			 STM2(&(xo[0]), T7o, ovs, &(xo[0]));
+			 T7p = VADD(T7h, T7k);
+			 STM2(&(xo[32]), T7p, ovs, &(xo[0]));
+			 T7q = VSUB(T7l, T7m);
+			 STM2(&(xo[64]), T7q, ovs, &(xo[0]));
+		    }
+		    {
+			 V T76, T7a, T73, T7b, T6Z, T72;
+			 T76 = VSUB(T74, T75);
+			 T7a = VSUB(T78, T79);
+			 T6Z = VSUB(T6X, T6Y);
+			 T72 = VSUB(T70, T71);
+			 T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
+			 T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
+			 {
+			      V T77, T7c, T7d, T7e;
+			      T77 = VBYI(VSUB(T73, T76));
+			      T7c = VSUB(T7a, T7b);
+			      T7r = VADD(T77, T7c);
+			      STM2(&(xo[48]), T7r, ovs, &(xo[0]));
+			      T7s = VSUB(T7c, T77);
+			      STM2(&(xo[80]), T7s, ovs, &(xo[0]));
+			      T7d = VBYI(VADD(T76, T73));
+			      T7e = VADD(T7a, T7b);
+			      T7t = VADD(T7d, T7e);
+			      STM2(&(xo[16]), T7t, ovs, &(xo[0]));
+			      T7u = VSUB(T7e, T7d);
+			      STM2(&(xo[112]), T7u, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
+			 T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
+			 T6C = VSUB(T6y, T6B);
+			 T6S = VADD(T6B, T6y);
+			 T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
+			 T6I = VSUB(T6G, T6H);
+			 T6P = VADD(T6G, T6H);
+			 {
+			      V T6j, T6q, T6J, T6K;
+			      T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
+			      T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
+			      T6r = VSUB(T6j, T6q);
+			      T6Q = VADD(T6j, T6q);
+			      T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
+			      T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
+			      T6L = VSUB(T6J, T6K);
+			      T6T = VADD(T6J, T6K);
+			 }
+			 {
+			      V T6D, T6M, T6V, T6W;
+			      T6D = VBYI(VSUB(T6r, T6C));
+			      T6M = VSUB(T6I, T6L);
+			      T7v = VADD(T6D, T6M);
+			      STM2(&(xo[40]), T7v, ovs, &(xo[0]));
+			      T7w = VSUB(T6M, T6D);
+			      STM2(&(xo[88]), T7w, ovs, &(xo[0]));
+			      T6V = VSUB(T6P, T6Q);
+			      T6W = VBYI(VSUB(T6T, T6S));
+			      T7x = VSUB(T6V, T6W);
+			      STM2(&(xo[72]), T7x, ovs, &(xo[0]));
+			      T7y = VADD(T6V, T6W);
+			      STM2(&(xo[56]), T7y, ovs, &(xo[0]));
+			 }
+			 {
+			      V T6N, T6O, T6R, T6U;
+			      T6N = VBYI(VADD(T6C, T6r));
+			      T6O = VADD(T6I, T6L);
+			      T7z = VADD(T6N, T6O);
+			      STM2(&(xo[24]), T7z, ovs, &(xo[0]));
+			      T7A = VSUB(T6O, T6N);
+			      STM2(&(xo[104]), T7A, ovs, &(xo[0]));
+			      T6R = VADD(T6P, T6Q);
+			      T6U = VBYI(VADD(T6S, T6T));
+			      T7B = VSUB(T6R, T6U);
+			      STM2(&(xo[120]), T7B, ovs, &(xo[0]));
+			      T7C = VADD(T6R, T6U);
+			      STM2(&(xo[8]), T7C, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
+			 {
+			      V T5L, T5M, T5Z, T60;
+			      T5L = VADD(T4p, T4w);
+			      T5M = VADD(T5o, T5p);
+			      T5N = VSUB(T5L, T5M);
+			      T68 = VADD(T5L, T5M);
+			      T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
+			      T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
+			      T61 = VSUB(T5Z, T60);
+			      T69 = VADD(T5Z, T60);
+			 }
+			 {
+			      V T5Q, T5T, T5W, T5X;
+			      T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
+			      T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
+			      T5U = VSUB(T5Q, T5T);
+			      T65 = VADD(T5Q, T5T);
+			      T5W = VADD(T4E, T4L);
+			      T5X = VADD(T5u, T5r);
+			      T5Y = VSUB(T5W, T5X);
+			      T66 = VADD(T5X, T5W);
+			 }
+			 {
+			      V T5V, T62, T6b, T6c;
+			      T5V = VADD(T5N, T5U);
+			      T62 = VBYI(VADD(T5Y, T61));
+			      T7D = VSUB(T5V, T62);
+			      STM2(&(xo[100]), T7D, ovs, &(xo[0]));
+			      T7E = VADD(T5V, T62);
+			      STM2(&(xo[28]), T7E, ovs, &(xo[0]));
+			      T6b = VBYI(VADD(T66, T65));
+			      T6c = VADD(T68, T69);
+			      T7F = VADD(T6b, T6c);
+			      STM2(&(xo[4]), T7F, ovs, &(xo[0]));
+			      T7G = VSUB(T6c, T6b);
+			      STM2(&(xo[124]), T7G, ovs, &(xo[0]));
+			 }
+			 {
+			      V T63, T64, T67, T6a;
+			      T63 = VSUB(T5N, T5U);
+			      T64 = VBYI(VSUB(T61, T5Y));
+			      T7H = VSUB(T63, T64);
+			      STM2(&(xo[92]), T7H, ovs, &(xo[0]));
+			      T7I = VADD(T63, T64);
+			      STM2(&(xo[36]), T7I, ovs, &(xo[0]));
+			      T67 = VBYI(VSUB(T65, T66));
+			      T6a = VSUB(T68, T69);
+			      T7J = VADD(T67, T6a);
+			      STM2(&(xo[60]), T7J, ovs, &(xo[0]));
+			      T7K = VSUB(T6a, T67);
+			      STM2(&(xo[68]), T7K, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T7M, T7O, T7P, T7R;
+			 {
+			      V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
+			      {
+				   V Tr, T10, T2t, T2u;
+				   Tr = VSUB(Tb, Tq);
+				   T10 = VSUB(TI, TZ);
+				   T11 = VSUB(Tr, T10);
+				   T2C = VADD(Tr, T10);
+				   T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
+				   T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
+				   T2v = VSUB(T2t, T2u);
+				   T2D = VADD(T2t, T2u);
+			      }
+			      {
+				   V T1C, T2d, T2i, T2r;
+				   T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
+				   T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
+				   T2e = VSUB(T1C, T2d);
+				   T2z = VADD(T1C, T2d);
+				   T2i = VSUB(T2g, T2h);
+				   T2r = VSUB(T2l, T2q);
+				   T2s = VSUB(T2i, T2r);
+				   T2A = VADD(T2r, T2i);
+			      }
+			      {
+				   V T2f, T2w, T7L, T2F, T2G, T7N;
+				   T2f = VADD(T11, T2e);
+				   T2w = VBYI(VADD(T2s, T2v));
+				   T7L = VSUB(T2f, T2w);
+				   STM2(&(xo[106]), T7L, ovs, &(xo[2]));
+				   STN2(&(xo[104]), T7A, T7L, ovs);
+				   T7M = VADD(T2f, T2w);
+				   STM2(&(xo[22]), T7M, ovs, &(xo[2]));
+				   T2F = VBYI(VADD(T2A, T2z));
+				   T2G = VADD(T2C, T2D);
+				   T7N = VADD(T2F, T2G);
+				   STM2(&(xo[10]), T7N, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T7C, T7N, ovs);
+				   T7O = VSUB(T2G, T2F);
+				   STM2(&(xo[118]), T7O, ovs, &(xo[2]));
+			      }
+			      {
+				   V T2x, T2y, T7Q, T2B, T2E, T7S;
+				   T2x = VSUB(T11, T2e);
+				   T2y = VBYI(VSUB(T2v, T2s));
+				   T7P = VSUB(T2x, T2y);
+				   STM2(&(xo[86]), T7P, ovs, &(xo[2]));
+				   T7Q = VADD(T2x, T2y);
+				   STM2(&(xo[42]), T7Q, ovs, &(xo[2]));
+				   STN2(&(xo[40]), T7v, T7Q, ovs);
+				   T2B = VBYI(VSUB(T2z, T2A));
+				   T2E = VSUB(T2C, T2D);
+				   T7R = VADD(T2B, T2E);
+				   STM2(&(xo[54]), T7R, ovs, &(xo[2]));
+				   T7S = VSUB(T2E, T2B);
+				   STM2(&(xo[74]), T7S, ovs, &(xo[2]));
+				   STN2(&(xo[72]), T7x, T7S, ovs);
+			      }
+			 }
+			 {
+			      V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
+			      {
+				   V T3f, T3m, T3H, T3I;
+				   T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
+				   T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
+				   T3n = VSUB(T3f, T3m);
+				   T3O = VADD(T3f, T3m);
+				   T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
+				   T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
+				   T3J = VSUB(T3H, T3I);
+				   T3R = VADD(T3H, T3I);
+			      }
+			      {
+				   V T3u, T3x, T3C, T3F;
+				   T3u = VADD(T3q, T3t);
+				   T3x = VADD(T3v, T3w);
+				   T3y = VSUB(T3u, T3x);
+				   T3Q = VADD(T3x, T3u);
+				   T3C = VADD(T3A, T3B);
+				   T3F = VADD(T3D, T3E);
+				   T3G = VSUB(T3C, T3F);
+				   T3N = VADD(T3C, T3F);
+			      }
+			      {
+				   V T3z, T3K, T7T, T7U;
+				   T3z = VBYI(VSUB(T3n, T3y));
+				   T3K = VSUB(T3G, T3J);
+				   T7T = VADD(T3z, T3K);
+				   STM2(&(xo[34]), T7T, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T7p, T7T, ovs);
+				   T7U = VSUB(T3K, T3z);
+				   STM2(&(xo[94]), T7U, ovs, &(xo[2]));
+				   STN2(&(xo[92]), T7H, T7U, ovs);
+			      }
+			      {
+				   V T3T, T3U, T7V, T7W;
+				   T3T = VSUB(T3N, T3O);
+				   T3U = VBYI(VSUB(T3R, T3Q));
+				   T7V = VSUB(T3T, T3U);
+				   STM2(&(xo[66]), T7V, ovs, &(xo[2]));
+				   STN2(&(xo[64]), T7q, T7V, ovs);
+				   T7W = VADD(T3T, T3U);
+				   STM2(&(xo[62]), T7W, ovs, &(xo[2]));
+				   STN2(&(xo[60]), T7J, T7W, ovs);
+			      }
+			      {
+				   V T3L, T3M, T7X, T7Y;
+				   T3L = VBYI(VADD(T3y, T3n));
+				   T3M = VADD(T3G, T3J);
+				   T7X = VADD(T3L, T3M);
+				   STM2(&(xo[30]), T7X, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T7E, T7X, ovs);
+				   T7Y = VSUB(T3M, T3L);
+				   STM2(&(xo[98]), T7Y, ovs, &(xo[2]));
+				   STN2(&(xo[96]), T7n, T7Y, ovs);
+			      }
+			      {
+				   V T3P, T3S, T7Z, T80;
+				   T3P = VADD(T3N, T3O);
+				   T3S = VBYI(VADD(T3Q, T3R));
+				   T7Z = VSUB(T3P, T3S);
+				   STM2(&(xo[126]), T7Z, ovs, &(xo[2]));
+				   STN2(&(xo[124]), T7G, T7Z, ovs);
+				   T80 = VADD(T3P, T3S);
+				   STM2(&(xo[2]), T80, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T7o, T80, ovs);
+			      }
+			 }
+			 {
+			      V T81, T83, T86, T88;
+			      {
+				   V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
+				   {
+					V T4x, T4M, T5x, T5y;
+					T4x = VSUB(T4p, T4w);
+					T4M = VSUB(T4E, T4L);
+					T4N = VSUB(T4x, T4M);
+					T5G = VADD(T4x, T4M);
+					T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
+					T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
+					T5z = VSUB(T5x, T5y);
+					T5H = VADD(T5x, T5y);
+				   }
+				   {
+					V T54, T5l, T5q, T5v;
+					T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
+					T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
+					T5m = VSUB(T54, T5l);
+					T5D = VADD(T54, T5l);
+					T5q = VSUB(T5o, T5p);
+					T5v = VSUB(T5r, T5u);
+					T5w = VSUB(T5q, T5v);
+					T5E = VADD(T5v, T5q);
+				   }
+				   {
+					V T5n, T5A, T82, T5J, T5K, T84;
+					T5n = VADD(T4N, T5m);
+					T5A = VBYI(VADD(T5w, T5z));
+					T81 = VSUB(T5n, T5A);
+					STM2(&(xo[108]), T81, ovs, &(xo[0]));
+					T82 = VADD(T5n, T5A);
+					STM2(&(xo[20]), T82, ovs, &(xo[0]));
+					STN2(&(xo[20]), T82, T7M, ovs);
+					T5J = VBYI(VADD(T5E, T5D));
+					T5K = VADD(T5G, T5H);
+					T83 = VADD(T5J, T5K);
+					STM2(&(xo[12]), T83, ovs, &(xo[0]));
+					T84 = VSUB(T5K, T5J);
+					STM2(&(xo[116]), T84, ovs, &(xo[0]));
+					STN2(&(xo[116]), T84, T7O, ovs);
+				   }
+				   {
+					V T5B, T5C, T85, T5F, T5I, T87;
+					T5B = VSUB(T4N, T5m);
+					T5C = VBYI(VSUB(T5z, T5w));
+					T85 = VSUB(T5B, T5C);
+					STM2(&(xo[84]), T85, ovs, &(xo[0]));
+					STN2(&(xo[84]), T85, T7P, ovs);
+					T86 = VADD(T5B, T5C);
+					STM2(&(xo[44]), T86, ovs, &(xo[0]));
+					T5F = VBYI(VSUB(T5D, T5E));
+					T5I = VSUB(T5G, T5H);
+					T87 = VADD(T5F, T5I);
+					STM2(&(xo[52]), T87, ovs, &(xo[0]));
+					STN2(&(xo[52]), T87, T7R, ovs);
+					T88 = VSUB(T5I, T5F);
+					STM2(&(xo[76]), T88, ovs, &(xo[0]));
+				   }
+			      }
+			      {
+				   V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
+				   {
+					V T2H, T2I, T2V, T2W;
+					T2H = VADD(Tb, Tq);
+					T2I = VADD(T2g, T2h);
+					T2J = VSUB(T2H, T2I);
+					T34 = VADD(T2H, T2I);
+					T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
+					T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
+					T2X = VSUB(T2V, T2W);
+					T35 = VADD(T2V, T2W);
+				   }
+				   {
+					V T2M, T2P, T2S, T2T;
+					T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
+					T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
+					T2Q = VSUB(T2M, T2P);
+					T31 = VADD(T2M, T2P);
+					T2S = VADD(TI, TZ);
+					T2T = VADD(T2q, T2l);
+					T2U = VSUB(T2S, T2T);
+					T32 = VADD(T2T, T2S);
+				   }
+				   {
+					V T2R, T2Y, T89, T8a;
+					T2R = VADD(T2J, T2Q);
+					T2Y = VBYI(VADD(T2U, T2X));
+					T89 = VSUB(T2R, T2Y);
+					STM2(&(xo[102]), T89, ovs, &(xo[2]));
+					STN2(&(xo[100]), T7D, T89, ovs);
+					T8a = VADD(T2R, T2Y);
+					STM2(&(xo[26]), T8a, ovs, &(xo[2]));
+					STN2(&(xo[24]), T7z, T8a, ovs);
+				   }
+				   {
+					V T37, T38, T8b, T8c;
+					T37 = VBYI(VADD(T32, T31));
+					T38 = VADD(T34, T35);
+					T8b = VADD(T37, T38);
+					STM2(&(xo[6]), T8b, ovs, &(xo[2]));
+					STN2(&(xo[4]), T7F, T8b, ovs);
+					T8c = VSUB(T38, T37);
+					STM2(&(xo[122]), T8c, ovs, &(xo[2]));
+					STN2(&(xo[120]), T7B, T8c, ovs);
+				   }
+				   {
+					V T2Z, T30, T8d, T8e;
+					T2Z = VSUB(T2J, T2Q);
+					T30 = VBYI(VSUB(T2X, T2U));
+					T8d = VSUB(T2Z, T30);
+					STM2(&(xo[90]), T8d, ovs, &(xo[2]));
+					STN2(&(xo[88]), T7w, T8d, ovs);
+					T8e = VADD(T2Z, T30);
+					STM2(&(xo[38]), T8e, ovs, &(xo[2]));
+					STN2(&(xo[36]), T7I, T8e, ovs);
+				   }
+				   {
+					V T33, T36, T8f, T8g;
+					T33 = VBYI(VSUB(T31, T32));
+					T36 = VSUB(T34, T35);
+					T8f = VADD(T33, T36);
+					STM2(&(xo[58]), T8f, ovs, &(xo[2]));
+					STN2(&(xo[56]), T7y, T8f, ovs);
+					T8g = VSUB(T36, T33);
+					STM2(&(xo[70]), T8g, ovs, &(xo[2]));
+					STN2(&(xo[68]), T7K, T8g, ovs);
+				   }
+			      }
+			      {
+				   V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
+				   {
+					V T3X, T40, T49, T4a;
+					T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
+					T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
+					T41 = VSUB(T3X, T40);
+					T4g = VADD(T3X, T40);
+					T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
+					T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
+					T4b = VSUB(T49, T4a);
+					T4j = VADD(T49, T4a);
+				   }
+				   {
+					V T42, T43, T46, T47;
+					T42 = VSUB(T3D, T3E);
+					T43 = VSUB(T3w, T3v);
+					T44 = VSUB(T42, T43);
+					T4i = VADD(T43, T42);
+					T46 = VSUB(T3A, T3B);
+					T47 = VSUB(T3q, T3t);
+					T48 = VSUB(T46, T47);
+					T4f = VADD(T46, T47);
+				   }
+				   {
+					V T45, T4c, T8h, T8i;
+					T45 = VBYI(VSUB(T41, T44));
+					T4c = VSUB(T48, T4b);
+					T8h = VADD(T45, T4c);
+					STM2(&(xo[46]), T8h, ovs, &(xo[2]));
+					STN2(&(xo[44]), T86, T8h, ovs);
+					T8i = VSUB(T4c, T45);
+					STM2(&(xo[82]), T8i, ovs, &(xo[2]));
+					STN2(&(xo[80]), T7s, T8i, ovs);
+				   }
+				   {
+					V T4l, T4m, T8j, T8k;
+					T4l = VSUB(T4f, T4g);
+					T4m = VBYI(VSUB(T4j, T4i));
+					T8j = VSUB(T4l, T4m);
+					STM2(&(xo[78]), T8j, ovs, &(xo[2]));
+					STN2(&(xo[76]), T88, T8j, ovs);
+					T8k = VADD(T4l, T4m);
+					STM2(&(xo[50]), T8k, ovs, &(xo[2]));
+					STN2(&(xo[48]), T7r, T8k, ovs);
+				   }
+				   {
+					V T4d, T4e, T8l, T8m;
+					T4d = VBYI(VADD(T44, T41));
+					T4e = VADD(T48, T4b);
+					T8l = VADD(T4d, T4e);
+					STM2(&(xo[18]), T8l, ovs, &(xo[2]));
+					STN2(&(xo[16]), T7t, T8l, ovs);
+					T8m = VSUB(T4e, T4d);
+					STM2(&(xo[110]), T8m, ovs, &(xo[2]));
+					STN2(&(xo[108]), T81, T8m, ovs);
+				   }
+				   {
+					V T4h, T4k, T8n, T8o;
+					T4h = VADD(T4f, T4g);
+					T4k = VBYI(VADD(T4i, T4j));
+					T8n = VSUB(T4h, T4k);
+					STM2(&(xo[114]), T8n, ovs, &(xo[2]));
+					STN2(&(xo[112]), T7u, T8n, ovs);
+					T8o = VADD(T4h, T4k);
+					STM2(&(xo[14]), T8o, ovs, &(xo[2]));
+					STN2(&(xo[12]), T83, T8o, ovs);
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_64) (planner *p) {
+     X(kdft_register) (p, n2bv_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:29 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 8 -name n2bv_8 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 26 FP additions, 10 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 10 fused multiply/add),
+ * 38 stack variables, 1 constants, and 20 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T1, T2, Tc, Td, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Tj, Te, Tk, T6, Tm, T9, Tn, Tp, Tl;
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    T6 = VSUB(T4, T5);
+		    Tm = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tn = VADD(T7, T8);
+		    Tp = VADD(Tj, Tk);
+		    Tl = VSUB(Tj, Tk);
+		    {
+			 V Tq, To, Ta, Tf;
+			 Tq = VADD(Tm, Tn);
+			 To = VSUB(Tm, Tn);
+			 Ta = VADD(T6, T9);
+			 Tf = VSUB(T6, T9);
+			 {
+			      V Tr, Ts, Tt, Tu, Tg, Ti, Tb, Th;
+			      Tr = VFMAI(To, Tl);
+			      STM2(&(xo[4]), Tr, ovs, &(xo[0]));
+			      Ts = VFNMSI(To, Tl);
+			      STM2(&(xo[12]), Ts, ovs, &(xo[0]));
+			      Tt = VADD(Tp, Tq);
+			      STM2(&(xo[0]), Tt, ovs, &(xo[0]));
+			      Tu = VSUB(Tp, Tq);
+			      STM2(&(xo[8]), Tu, ovs, &(xo[0]));
+			      Tg = VFNMS(LDK(KP707106781), Tf, Te);
+			      Ti = VFMA(LDK(KP707106781), Tf, Te);
+			      Tb = VFNMS(LDK(KP707106781), Ta, T3);
+			      Th = VFMA(LDK(KP707106781), Ta, T3);
+			      {
+				   V Tv, Tw, Tx, Ty;
+				   Tv = VFNMSI(Ti, Th);
+				   STM2(&(xo[14]), Tv, ovs, &(xo[2]));
+				   STN2(&(xo[12]), Ts, Tv, ovs);
+				   Tw = VFMAI(Ti, Th);
+				   STM2(&(xo[2]), Tw, ovs, &(xo[2]));
+				   STN2(&(xo[0]), Tt, Tw, ovs);
+				   Tx = VFMAI(Tg, Tb);
+				   STM2(&(xo[10]), Tx, ovs, &(xo[2]));
+				   STN2(&(xo[8]), Tu, Tx, ovs);
+				   Ty = VFNMSI(Tg, Tb);
+				   STM2(&(xo[6]), Ty, ovs, &(xo[2]));
+				   STN2(&(xo[4]), Tr, Ty, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2bv_8"), {16, 0, 10, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_8) (planner *p) {
+     X(kdft_register) (p, n2bv_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 8 -name n2bv_8 -with-ostride 2 -include n2b.h -store-multiple 2 */
+
+/*
+ * This function contains 26 FP additions, 2 FP multiplications,
+ * (or, 26 additions, 2 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 20 memory accesses
+ */
+#include "n2b.h"
+
+static void n2bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ii;
+	  xo = io;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V Ta, Tk, Te, Tj, T7, Tn, Tf, Tm, Tr, Tu;
+	       {
+		    V T8, T9, Tc, Td;
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T9 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Ta = VSUB(T8, T9);
+		    Tk = VADD(T8, T9);
+		    Tc = LD(&(xi[0]), ivs, &(xi[0]));
+		    Td = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Te = VSUB(Tc, Td);
+		    Tj = VADD(Tc, Td);
+		    {
+			 V T1, T2, T3, T4, T5, T6;
+			 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 T4 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 T7 = VMUL(LDK(KP707106781), VSUB(T3, T6));
+			 Tn = VADD(T4, T5);
+			 Tf = VMUL(LDK(KP707106781), VADD(T3, T6));
+			 Tm = VADD(T1, T2);
+		    }
+	       }
+	       {
+		    V Ts, Tb, Tg, Tp, Tq, Tt;
+		    Tb = VBYI(VSUB(T7, Ta));
+		    Tg = VSUB(Te, Tf);
+		    Tr = VADD(Tb, Tg);
+		    STM2(&(xo[6]), Tr, ovs, &(xo[2]));
+		    Ts = VSUB(Tg, Tb);
+		    STM2(&(xo[10]), Ts, ovs, &(xo[2]));
+		    Tp = VADD(Tj, Tk);
+		    Tq = VADD(Tm, Tn);
+		    Tt = VSUB(Tp, Tq);
+		    STM2(&(xo[8]), Tt, ovs, &(xo[0]));
+		    STN2(&(xo[8]), Tt, Ts, ovs);
+		    Tu = VADD(Tp, Tq);
+		    STM2(&(xo[0]), Tu, ovs, &(xo[0]));
+	       }
+	       {
+		    V Tw, Th, Ti, Tv;
+		    Th = VBYI(VADD(Ta, T7));
+		    Ti = VADD(Te, Tf);
+		    Tv = VADD(Th, Ti);
+		    STM2(&(xo[2]), Tv, ovs, &(xo[2]));
+		    STN2(&(xo[0]), Tu, Tv, ovs);
+		    Tw = VSUB(Ti, Th);
+		    STM2(&(xo[14]), Tw, ovs, &(xo[2]));
+		    {
+			 V Tl, To, Tx, Ty;
+			 Tl = VSUB(Tj, Tk);
+			 To = VBYI(VSUB(Tm, Tn));
+			 Tx = VSUB(Tl, To);
+			 STM2(&(xo[12]), Tx, ovs, &(xo[0]));
+			 STN2(&(xo[12]), Tx, Tw, ovs);
+			 Ty = VADD(Tl, To);
+			 STM2(&(xo[4]), Ty, ovs, &(xo[0]));
+			 STN2(&(xo[4]), Ty, Tr, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2bv_8"), {26, 2, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2bv_8) (planner *p) {
+     X(kdft_register) (p, n2bv_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 42 FP additions, 22 FP multiplications,
+ * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
+ * 53 stack variables, 4 constants, and 25 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Tb, Tr, T3, Ts, T6, Tw, Tg, Tt, T9, Tc, T1, T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T4, T5, Te, Tf, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    Tr = VADD(T1, T2);
+		    T3 = VSUB(T1, T2);
+		    Ts = VADD(T4, T5);
+		    T6 = VSUB(T4, T5);
+		    Tw = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    Tt = VADD(T7, T8);
+		    T9 = VSUB(T7, T8);
+		    Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       {
+		    V TD, Tu, Tm, Ta, Td, Tv;
+		    TD = VSUB(Ts, Tt);
+		    Tu = VADD(Ts, Tt);
+		    Tm = VSUB(T6, T9);
+		    Ta = VADD(T6, T9);
+		    Td = VSUB(Tb, Tc);
+		    Tv = VADD(Tb, Tc);
+		    {
+			 V TC, Tx, Tn, Th;
+			 TC = VSUB(Tv, Tw);
+			 Tx = VADD(Tv, Tw);
+			 Tn = VSUB(Td, Tg);
+			 Th = VADD(Td, Tg);
+			 {
+			      V Ty, TA, TE, TG, Ti, Tk, To, Tq;
+			      Ty = VADD(Tu, Tx);
+			      TA = VSUB(Tu, Tx);
+			      TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC));
+			      TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD));
+			      Ti = VADD(Ta, Th);
+			      Tk = VSUB(Ta, Th);
+			      To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm));
+			      Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn));
+			      {
+				   V Tz, TH, Tj, TI;
+				   Tz = VFNMS(LDK(KP250000000), Ty, Tr);
+				   TH = VADD(Tr, Ty);
+				   STM2(&(xo[0]), TH, ovs, &(xo[0]));
+				   Tj = VFNMS(LDK(KP250000000), Ti, T3);
+				   TI = VADD(T3, Ti);
+				   STM2(&(xo[10]), TI, ovs, &(xo[2]));
+				   {
+					V TB, TF, Tl, Tp;
+					TB = VFNMS(LDK(KP559016994), TA, Tz);
+					TF = VFMA(LDK(KP559016994), TA, Tz);
+					Tl = VFMA(LDK(KP559016994), Tk, Tj);
+					Tp = VFNMS(LDK(KP559016994), Tk, Tj);
+					{
+					     V TJ, TK, TL, TM;
+					     TJ = VFMAI(TG, TF);
+					     STM2(&(xo[8]), TJ, ovs, &(xo[0]));
+					     STN2(&(xo[8]), TJ, TI, ovs);
+					     TK = VFNMSI(TG, TF);
+					     STM2(&(xo[12]), TK, ovs, &(xo[0]));
+					     TL = VFNMSI(TE, TB);
+					     STM2(&(xo[16]), TL, ovs, &(xo[0]));
+					     TM = VFMAI(TE, TB);
+					     STM2(&(xo[4]), TM, ovs, &(xo[0]));
+					     {
+						  V TN, TO, TP, TQ;
+						  TN = VFNMSI(Tq, Tp);
+						  STM2(&(xo[6]), TN, ovs, &(xo[2]));
+						  STN2(&(xo[4]), TM, TN, ovs);
+						  TO = VFMAI(Tq, Tp);
+						  STM2(&(xo[14]), TO, ovs, &(xo[2]));
+						  STN2(&(xo[12]), TK, TO, ovs);
+						  TP = VFMAI(To, Tl);
+						  STM2(&(xo[18]), TP, ovs, &(xo[2]));
+						  STN2(&(xo[16]), TL, TP, ovs);
+						  TQ = VFNMSI(To, Tl);
+						  STM2(&(xo[2]), TQ, ovs, &(xo[2]));
+						  STN2(&(xo[0]), TH, TQ, ovs);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n2fv_10"), {24, 4, 18, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_10) (planner *p) {
+     X(kdft_register) (p, n2fv_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 42 FP additions, 12 FP multiplications,
+ * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
+ * 36 stack variables, 4 constants, and 25 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
+	       V Ti, Ty, Tm, Tn, Tw, Tt, Tz, TA, TB, T7, Te, Tj, Tg, Th;
+	       Tg = LD(&(xi[0]), ivs, &(xi[0]));
+	       Th = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       Ti = VSUB(Tg, Th);
+	       Ty = VADD(Tg, Th);
+	       {
+		    V T3, Tu, Td, Ts, T6, Tv, Ta, Tr;
+		    {
+			 V T1, T2, Tb, Tc;
+			 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tu = VADD(T1, T2);
+			 Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 Ts = VADD(Tb, Tc);
+		    }
+		    {
+			 V T4, T5, T8, T9;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tv = VADD(T4, T5);
+			 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 Tr = VADD(T8, T9);
+		    }
+		    Tm = VSUB(T3, T6);
+		    Tn = VSUB(Ta, Td);
+		    Tw = VSUB(Tu, Tv);
+		    Tt = VSUB(Tr, Ts);
+		    Tz = VADD(Tu, Tv);
+		    TA = VADD(Tr, Ts);
+		    TB = VADD(Tz, TA);
+		    T7 = VADD(T3, T6);
+		    Te = VADD(Ta, Td);
+		    Tj = VADD(T7, Te);
+	       }
+	       {
+		    V TH, TI, TK, TL, TM;
+		    TH = VADD(Ti, Tj);
+		    STM2(&(xo[10]), TH, ovs, &(xo[2]));
+		    TI = VADD(Ty, TB);
+		    STM2(&(xo[0]), TI, ovs, &(xo[0]));
+		    {
+			 V To, Tq, Tl, Tp, Tf, Tk, TJ;
+			 To = VBYI(VFMA(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tn)));
+			 Tq = VBYI(VFNMS(LDK(KP587785252), Tm, VMUL(LDK(KP951056516), Tn)));
+			 Tf = VMUL(LDK(KP559016994), VSUB(T7, Te));
+			 Tk = VFNMS(LDK(KP250000000), Tj, Ti);
+			 Tl = VADD(Tf, Tk);
+			 Tp = VSUB(Tk, Tf);
+			 TJ = VSUB(Tl, To);
+			 STM2(&(xo[2]), TJ, ovs, &(xo[2]));
+			 STN2(&(xo[0]), TI, TJ, ovs);
+			 TK = VADD(Tq, Tp);
+			 STM2(&(xo[14]), TK, ovs, &(xo[2]));
+			 TL = VADD(To, Tl);
+			 STM2(&(xo[18]), TL, ovs, &(xo[2]));
+			 TM = VSUB(Tp, Tq);
+			 STM2(&(xo[6]), TM, ovs, &(xo[2]));
+		    }
+		    {
+			 V Tx, TF, TE, TG, TC, TD;
+			 Tx = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tt)));
+			 TF = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
+			 TC = VFNMS(LDK(KP250000000), TB, Ty);
+			 TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
+			 TE = VSUB(TC, TD);
+			 TG = VADD(TD, TC);
+			 {
+			      V TN, TO, TP, TQ;
+			      TN = VADD(Tx, TE);
+			      STM2(&(xo[4]), TN, ovs, &(xo[0]));
+			      STN2(&(xo[4]), TN, TM, ovs);
+			      TO = VSUB(TG, TF);
+			      STM2(&(xo[12]), TO, ovs, &(xo[0]));
+			      STN2(&(xo[12]), TO, TK, ovs);
+			      TP = VSUB(TE, Tx);
+			      STM2(&(xo[16]), TP, ovs, &(xo[0]));
+			      STN2(&(xo[16]), TP, TL, ovs);
+			      TQ = VADD(TF, TG);
+			      STM2(&(xo[8]), TQ, ovs, &(xo[0]));
+			      STN2(&(xo[8]), TQ, TH, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 10, XSIMD_STRING("n2fv_10"), {36, 6, 6, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_10) (planner *p) {
+     X(kdft_register) (p, n2fv_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 48 FP additions, 20 FP multiplications,
+ * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
+ * 61 stack variables, 2 constants, and 30 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T1, T6, Tk, Tn, Tc, Td, Tf, Tr, T4, Ts, T9, Tg, Te, Tl;
+	       {
+		    V T2, T3, T7, T8;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    Tr = VSUB(T3, T2);
+		    T4 = VADD(T2, T3);
+		    Ts = VSUB(T8, T7);
+		    T9 = VADD(T7, T8);
+		    Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       }
+	       Te = VSUB(Tc, Td);
+	       Tl = VADD(Td, Tc);
+	       {
+		    V T5, TF, TB, Tt, Ta, TG, Th, To, Tm, TI;
+		    T5 = VFNMS(LDK(KP500000000), T4, T1);
+		    TF = VADD(T1, T4);
+		    TB = VADD(Tr, Ts);
+		    Tt = VSUB(Tr, Ts);
+		    Ta = VFNMS(LDK(KP500000000), T9, T6);
+		    TG = VADD(T6, T9);
+		    Th = VSUB(Tf, Tg);
+		    To = VADD(Tf, Tg);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Tk);
+		    TI = VADD(Tk, Tl);
+		    {
+			 V TH, TL, Tb, Tx, TJ, Tp, Ti, TA;
+			 TH = VSUB(TF, TG);
+			 TL = VADD(TF, TG);
+			 Tb = VSUB(T5, Ta);
+			 Tx = VADD(T5, Ta);
+			 TJ = VADD(Tn, To);
+			 Tp = VFNMS(LDK(KP500000000), To, Tn);
+			 Ti = VADD(Te, Th);
+			 TA = VSUB(Te, Th);
+			 {
+			      V Tq, Ty, TK, TM;
+			      Tq = VSUB(Tm, Tp);
+			      Ty = VADD(Tm, Tp);
+			      TK = VSUB(TI, TJ);
+			      TM = VADD(TI, TJ);
+			      {
+				   V TC, TE, Tj, Tv;
+				   TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
+				   TE = VMUL(LDK(KP866025403), VADD(TB, TA));
+				   Tj = VFMA(LDK(KP866025403), Ti, Tb);
+				   Tv = VFNMS(LDK(KP866025403), Ti, Tb);
+				   {
+					V Tz, TD, Tu, Tw;
+					Tz = VSUB(Tx, Ty);
+					TD = VADD(Tx, Ty);
+					Tu = VFNMS(LDK(KP866025403), Tt, Tq);
+					Tw = VFMA(LDK(KP866025403), Tt, Tq);
+					{
+					     V TN, TO, TP, TQ;
+					     TN = VADD(TL, TM);
+					     STM2(&(xo[0]), TN, ovs, &(xo[0]));
+					     TO = VSUB(TL, TM);
+					     STM2(&(xo[12]), TO, ovs, &(xo[0]));
+					     TP = VFMAI(TK, TH);
+					     STM2(&(xo[6]), TP, ovs, &(xo[2]));
+					     TQ = VFNMSI(TK, TH);
+					     STM2(&(xo[18]), TQ, ovs, &(xo[2]));
+					     {
+						  V TR, TS, TT, TU;
+						  TR = VFMAI(TE, TD);
+						  STM2(&(xo[8]), TR, ovs, &(xo[0]));
+						  TS = VFNMSI(TE, TD);
+						  STM2(&(xo[16]), TS, ovs, &(xo[0]));
+						  STN2(&(xo[16]), TS, TQ, ovs);
+						  TT = VFNMSI(TC, Tz);
+						  STM2(&(xo[20]), TT, ovs, &(xo[0]));
+						  TU = VFMAI(TC, Tz);
+						  STM2(&(xo[4]), TU, ovs, &(xo[0]));
+						  STN2(&(xo[4]), TU, TP, ovs);
+						  {
+						       V TV, TW, TX, TY;
+						       TV = VFNMSI(Tw, Tv);
+						       STM2(&(xo[10]), TV, ovs, &(xo[2]));
+						       STN2(&(xo[8]), TR, TV, ovs);
+						       TW = VFMAI(Tw, Tv);
+						       STM2(&(xo[14]), TW, ovs, &(xo[2]));
+						       STN2(&(xo[12]), TO, TW, ovs);
+						       TX = VFMAI(Tu, Tj);
+						       STM2(&(xo[22]), TX, ovs, &(xo[2]));
+						       STN2(&(xo[20]), TT, TX, ovs);
+						       TY = VFNMSI(Tu, Tj);
+						       STM2(&(xo[2]), TY, ovs, &(xo[2]));
+						       STN2(&(xo[0]), TN, TY, ovs);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_12) (planner *p) {
+     X(kdft_register) (p, n2fv_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 48 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
+ * 33 stack variables, 2 constants, and 30 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
+	       V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
+	       {
+		    V T1, T6, T4, Tw, T9, Tx;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    {
+			 V T2, T3, T7, T8;
+			 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = VADD(T2, T3);
+			 Tw = VSUB(T3, T2);
+			 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T9 = VADD(T7, T8);
+			 Tx = VSUB(T8, T7);
+		    }
+		    T5 = VADD(T1, T4);
+		    Ta = VADD(T6, T9);
+		    TJ = VADD(Tw, Tx);
+		    Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
+		    Tq = VFNMS(LDK(KP500000000), T9, T6);
+		    Tp = VFNMS(LDK(KP500000000), T4, T1);
+	       }
+	       {
+		    V Tc, Th, Tf, Ts, Tk, Tt;
+		    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+		    {
+			 V Td, Te, Ti, Tj;
+			 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Tf = VADD(Td, Te);
+			 Ts = VSUB(Te, Td);
+			 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VADD(Ti, Tj);
+			 Tt = VSUB(Tj, Ti);
+		    }
+		    Tg = VADD(Tc, Tf);
+		    Tl = VADD(Th, Tk);
+		    TI = VADD(Ts, Tt);
+		    TA = VFNMS(LDK(KP500000000), Tk, Th);
+		    Tz = VFNMS(LDK(KP500000000), Tf, Tc);
+		    Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
+	       }
+	       {
+		    V TN, TO, TP, TQ, TR, TS;
+		    {
+			 V Tb, Tm, Tn, To;
+			 Tb = VSUB(T5, Ta);
+			 Tm = VBYI(VSUB(Tg, Tl));
+			 TN = VSUB(Tb, Tm);
+			 STM2(&(xo[18]), TN, ovs, &(xo[2]));
+			 TO = VADD(Tb, Tm);
+			 STM2(&(xo[6]), TO, ovs, &(xo[2]));
+			 Tn = VADD(T5, Ta);
+			 To = VADD(Tg, Tl);
+			 TP = VSUB(Tn, To);
+			 STM2(&(xo[12]), TP, ovs, &(xo[0]));
+			 TQ = VADD(Tn, To);
+			 STM2(&(xo[0]), TQ, ovs, &(xo[0]));
+		    }
+		    {
+			 V Tv, TE, TC, TD, Tr, TB, TT, TU;
+			 Tr = VSUB(Tp, Tq);
+			 Tv = VSUB(Tr, Tu);
+			 TE = VADD(Tr, Tu);
+			 TB = VSUB(Tz, TA);
+			 TC = VBYI(VADD(Ty, TB));
+			 TD = VBYI(VSUB(Ty, TB));
+			 TR = VSUB(Tv, TC);
+			 STM2(&(xo[10]), TR, ovs, &(xo[2]));
+			 TS = VSUB(TE, TD);
+			 STM2(&(xo[22]), TS, ovs, &(xo[2]));
+			 TT = VADD(TC, Tv);
+			 STM2(&(xo[14]), TT, ovs, &(xo[2]));
+			 STN2(&(xo[12]), TP, TT, ovs);
+			 TU = VADD(TD, TE);
+			 STM2(&(xo[2]), TU, ovs, &(xo[2]));
+			 STN2(&(xo[0]), TQ, TU, ovs);
+		    }
+		    {
+			 V TK, TM, TH, TL, TF, TG;
+			 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
+			 TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
+			 TF = VADD(Tp, Tq);
+			 TG = VADD(Tz, TA);
+			 TH = VSUB(TF, TG);
+			 TL = VADD(TF, TG);
+			 {
+			      V TV, TW, TX, TY;
+			      TV = VSUB(TH, TK);
+			      STM2(&(xo[20]), TV, ovs, &(xo[0]));
+			      STN2(&(xo[20]), TV, TS, ovs);
+			      TW = VADD(TL, TM);
+			      STM2(&(xo[8]), TW, ovs, &(xo[0]));
+			      STN2(&(xo[8]), TW, TR, ovs);
+			      TX = VADD(TH, TK);
+			      STM2(&(xo[4]), TX, ovs, &(xo[0]));
+			      STN2(&(xo[4]), TX, TO, ovs);
+			      TY = VSUB(TL, TM);
+			      STM2(&(xo[16]), TY, ovs, &(xo[0]));
+			      STN2(&(xo[16]), TY, TN, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_12) (planner *p) {
+     X(kdft_register) (p, n2fv_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n2fv_14 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 74 FP additions, 48 FP multiplications,
+ * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
+ * 65 stack variables, 6 constants, and 35 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V TH, T3, TP, Tn, Ta, Ts, TW, TK, TO, Tk, TM, Tg, TL, Td, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
+		    {
+			 V T4, T5, T7, T8, Tl, Tm;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 TH = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TI = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 TJ = VADD(T7, T8);
+			 T9 = VSUB(T7, T8);
+			 TP = VADD(Tl, Tm);
+			 Tn = VSUB(Tl, Tm);
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+		    }
+		    Ta = VADD(T6, T9);
+		    Ts = VSUB(T9, T6);
+		    TW = VSUB(TJ, TI);
+		    TK = VADD(TI, TJ);
+		    TO = VADD(Ti, Tj);
+		    Tk = VSUB(Ti, Tj);
+		    TM = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    TL = VADD(Tb, Tc);
+		    Td = VSUB(Tb, Tc);
+	       }
+	       {
+		    V T19, T1a, T18, TB, T13, TY, TG, Tw, T11, Tr, T16, TT, Tz, TE, TU;
+		    V TQ;
+		    TU = VSUB(TO, TP);
+		    TQ = VADD(TO, TP);
+		    {
+			 V Tt, To, TV, TN;
+			 Tt = VSUB(Tn, Tk);
+			 To = VADD(Tk, Tn);
+			 TV = VSUB(TL, TM);
+			 TN = VADD(TL, TM);
+			 {
+			      V Tu, Th, TZ, T17;
+			      Tu = VSUB(Tg, Td);
+			      Th = VADD(Td, Tg);
+			      TZ = VFNMS(LDK(KP356895867), TK, TQ);
+			      T17 = VFNMS(LDK(KP554958132), TU, TW);
+			      {
+				   V Tp, TA, T14, TR;
+				   Tp = VFNMS(LDK(KP356895867), Ta, To);
+				   TA = VFMA(LDK(KP554958132), Tt, Ts);
+				   T19 = VADD(TH, VADD(TK, VADD(TN, TQ)));
+				   STM2(&(xo[0]), T19, ovs, &(xo[0]));
+				   T14 = VFNMS(LDK(KP356895867), TN, TK);
+				   TR = VFNMS(LDK(KP356895867), TQ, TN);
+				   {
+					V T12, TX, Tx, TC;
+					T12 = VFMA(LDK(KP554958132), TV, TU);
+					TX = VFMA(LDK(KP554958132), TW, TV);
+					T1a = VADD(T3, VADD(Ta, VADD(Th, To)));
+					STM2(&(xo[14]), T1a, ovs, &(xo[2]));
+					Tx = VFNMS(LDK(KP356895867), Th, Ta);
+					TC = VFNMS(LDK(KP356895867), To, Th);
+					{
+					     V TF, Tv, T10, Tq;
+					     TF = VFNMS(LDK(KP554958132), Ts, Tu);
+					     Tv = VFMA(LDK(KP554958132), Tu, Tt);
+					     T10 = VFNMS(LDK(KP692021471), TZ, TN);
+					     T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
+					     Tq = VFNMS(LDK(KP692021471), Tp, Th);
+					     TB = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TA, Tu));
+					     {
+						  V T15, TS, Ty, TD;
+						  T15 = VFNMS(LDK(KP692021471), T14, TQ);
+						  TS = VFNMS(LDK(KP692021471), TR, TK);
+						  T13 = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), T12, TW));
+						  TY = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TX, TU));
+						  Ty = VFNMS(LDK(KP692021471), Tx, To);
+						  TD = VFNMS(LDK(KP692021471), TC, Ta);
+						  TG = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TF, Tt));
+						  Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
+						  T11 = VFNMS(LDK(KP900968867), T10, TH);
+						  Tr = VFNMS(LDK(KP900968867), Tq, T3);
+						  T16 = VFNMS(LDK(KP900968867), T15, TH);
+						  TT = VFNMS(LDK(KP900968867), TS, TH);
+						  Tz = VFNMS(LDK(KP900968867), Ty, T3);
+						  TE = VFNMS(LDK(KP900968867), TD, T3);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1b, T1c, T1d, T1e;
+			 T1b = VFNMSI(T13, T11);
+			 STM2(&(xo[24]), T1b, ovs, &(xo[0]));
+			 T1c = VFMAI(T13, T11);
+			 STM2(&(xo[4]), T1c, ovs, &(xo[0]));
+			 T1d = VFMAI(Tw, Tr);
+			 STM2(&(xo[18]), T1d, ovs, &(xo[2]));
+			 T1e = VFNMSI(Tw, Tr);
+			 STM2(&(xo[10]), T1e, ovs, &(xo[2]));
+			 {
+			      V T1f, T1g, T1h, T1i;
+			      T1f = VFNMSI(T18, T16);
+			      STM2(&(xo[16]), T1f, ovs, &(xo[0]));
+			      STN2(&(xo[16]), T1f, T1d, ovs);
+			      T1g = VFMAI(T18, T16);
+			      STM2(&(xo[12]), T1g, ovs, &(xo[0]));
+			      STN2(&(xo[12]), T1g, T1a, ovs);
+			      T1h = VFNMSI(TY, TT);
+			      STM2(&(xo[20]), T1h, ovs, &(xo[0]));
+			      T1i = VFMAI(TY, TT);
+			      STM2(&(xo[8]), T1i, ovs, &(xo[0]));
+			      STN2(&(xo[8]), T1i, T1e, ovs);
+			      {
+				   V T1j, T1k, T1l, T1m;
+				   T1j = VFMAI(TB, Tz);
+				   STM2(&(xo[2]), T1j, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T19, T1j, ovs);
+				   T1k = VFNMSI(TB, Tz);
+				   STM2(&(xo[26]), T1k, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1b, T1k, ovs);
+				   T1l = VFMAI(TG, TE);
+				   STM2(&(xo[6]), T1l, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1c, T1l, ovs);
+				   T1m = VFNMSI(TG, TE);
+				   STM2(&(xo[22]), T1m, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1h, T1m, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n2fv_14"), {32, 6, 42, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_14) (planner *p) {
+     X(kdft_register) (p, n2fv_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n2fv_14 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 74 FP additions, 36 FP multiplications,
+ * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
+ * 39 stack variables, 6 constants, and 35 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
+	       V T3, Ty, To, TK, Tr, TE, Ta, TJ, Tq, TB, Th, TL, Ts, TH, T1;
+	       V T2;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       Ty = VADD(T1, T2);
+	       {
+		    V Tk, TC, Tn, TD;
+		    {
+			 V Ti, Tj, Tl, Tm;
+			 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tk = VSUB(Ti, Tj);
+			 TC = VADD(Ti, Tj);
+			 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tn = VSUB(Tl, Tm);
+			 TD = VADD(Tl, Tm);
+		    }
+		    To = VADD(Tk, Tn);
+		    TK = VSUB(TC, TD);
+		    Tr = VSUB(Tn, Tk);
+		    TE = VADD(TC, TD);
+	       }
+	       {
+		    V T6, Tz, T9, TA;
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tz = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VSUB(T7, T8);
+			 TA = VADD(T7, T8);
+		    }
+		    Ta = VADD(T6, T9);
+		    TJ = VSUB(TA, Tz);
+		    Tq = VSUB(T9, T6);
+		    TB = VADD(Tz, TA);
+	       }
+	       {
+		    V Td, TF, Tg, TG;
+		    {
+			 V Tb, Tc, Te, Tf;
+			 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TF = VADD(Tb, Tc);
+			 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tg = VSUB(Te, Tf);
+			 TG = VADD(Te, Tf);
+		    }
+		    Th = VADD(Td, Tg);
+		    TL = VSUB(TF, TG);
+		    Ts = VSUB(Tg, Td);
+		    TH = VADD(TF, TG);
+	       }
+	       {
+		    V TR, TS, TT, TU, TV, TW;
+		    TR = VADD(T3, VADD(Ta, VADD(Th, To)));
+		    STM2(&(xo[14]), TR, ovs, &(xo[2]));
+		    TS = VADD(Ty, VADD(TB, VADD(TH, TE)));
+		    STM2(&(xo[0]), TS, ovs, &(xo[0]));
+		    {
+			 V Tt, Tp, TP, TQ;
+			 Tt = VBYI(VFNMS(LDK(KP781831482), Tr, VFNMS(LDK(KP433883739), Ts, VMUL(LDK(KP974927912), Tq))));
+			 Tp = VFMA(LDK(KP623489801), To, VFNMS(LDK(KP900968867), Th, VFNMS(LDK(KP222520933), Ta, T3)));
+			 TT = VSUB(Tp, Tt);
+			 STM2(&(xo[10]), TT, ovs, &(xo[2]));
+			 TU = VADD(Tp, Tt);
+			 STM2(&(xo[18]), TU, ovs, &(xo[2]));
+			 TP = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
+			 TQ = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TB, Ty)));
+			 TV = VADD(TP, TQ);
+			 STM2(&(xo[4]), TV, ovs, &(xo[0]));
+			 TW = VSUB(TQ, TP);
+			 STM2(&(xo[24]), TW, ovs, &(xo[0]));
+		    }
+		    {
+			 V Tv, Tu, TX, TY;
+			 Tv = VBYI(VFMA(LDK(KP781831482), Tq, VFMA(LDK(KP974927912), Ts, VMUL(LDK(KP433883739), Tr))));
+			 Tu = VFMA(LDK(KP623489801), Ta, VFNMS(LDK(KP900968867), To, VFNMS(LDK(KP222520933), Th, T3)));
+			 TX = VSUB(Tu, Tv);
+			 STM2(&(xo[26]), TX, ovs, &(xo[2]));
+			 STN2(&(xo[24]), TW, TX, ovs);
+			 TY = VADD(Tu, Tv);
+			 STM2(&(xo[2]), TY, ovs, &(xo[2]));
+			 STN2(&(xo[0]), TS, TY, ovs);
+		    }
+		    {
+			 V TM, TI, TZ, T10;
+			 TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
+			 TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TH, Ty)));
+			 TZ = VSUB(TI, TM);
+			 STM2(&(xo[12]), TZ, ovs, &(xo[0]));
+			 STN2(&(xo[12]), TZ, TR, ovs);
+			 T10 = VADD(TM, TI);
+			 STM2(&(xo[16]), T10, ovs, &(xo[0]));
+			 STN2(&(xo[16]), T10, TU, ovs);
+		    }
+		    {
+			 V T12, TO, TN, T11;
+			 TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
+			 TN = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP222520933), TE, VFNMS(LDK(KP900968867), TB, Ty)));
+			 T11 = VSUB(TN, TO);
+			 STM2(&(xo[8]), T11, ovs, &(xo[0]));
+			 STN2(&(xo[8]), T11, TT, ovs);
+			 T12 = VADD(TO, TN);
+			 STM2(&(xo[20]), T12, ovs, &(xo[0]));
+			 {
+			      V Tx, Tw, T13, T14;
+			      Tx = VBYI(VFMA(LDK(KP433883739), Tq, VFNMS(LDK(KP781831482), Ts, VMUL(LDK(KP974927912), Tr))));
+			      Tw = VFMA(LDK(KP623489801), Th, VFNMS(LDK(KP222520933), To, VFNMS(LDK(KP900968867), Ta, T3)));
+			      T13 = VSUB(Tw, Tx);
+			      STM2(&(xo[22]), T13, ovs, &(xo[2]));
+			      STN2(&(xo[20]), T12, T13, ovs);
+			      T14 = VADD(Tw, Tx);
+			      STM2(&(xo[6]), T14, ovs, &(xo[2]));
+			      STN2(&(xo[4]), TV, T14, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 14, XSIMD_STRING("n2fv_14"), {50, 12, 24, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_14) (planner *p) {
+     X(kdft_register) (p, n2fv_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2fv_16 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 72 FP additions, 34 FP multiplications,
+ * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
+ * 62 stack variables, 3 constants, and 40 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
+	       V T16;
+	       {
+		    V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
+		    {
+			 V T1, T2, T4, T5, To, Tp, Tr, Ts;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 To = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tp = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tr = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 {
+			      V T8, TJ, Tq, TI, Tt, T9, Tb, Tc, T3, T6;
+			      T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			      TH = VSUB(T1, T2);
+			      T3 = VADD(T1, T2);
+			      TU = VSUB(T4, T5);
+			      T6 = VADD(T4, T5);
+			      TJ = VSUB(To, Tp);
+			      Tq = VADD(To, Tp);
+			      TI = VSUB(Tr, Ts);
+			      Tt = VADD(Tr, Ts);
+			      T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			      Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T7 = VSUB(T3, T6);
+			      Tz = VADD(T3, T6);
+			      Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			      TK = VADD(TI, TJ);
+			      TV = VSUB(TJ, TI);
+			      TA = VADD(Tt, Tq);
+			      Tu = VSUB(Tq, Tt);
+			      TM = VSUB(T8, T9);
+			      Ta = VADD(T8, T9);
+			      TN = VSUB(Tb, Tc);
+			      Td = VADD(Tb, Tc);
+			      Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			      Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 }
+		    }
+		    TF = VSUB(Tz, TA);
+		    TB = VADD(Tz, TA);
+		    T13 = VFNMS(LDK(KP707106781), TK, TH);
+		    TL = VFMA(LDK(KP707106781), TK, TH);
+		    TO = VFNMS(LDK(KP414213562), TN, TM);
+		    TX = VFMA(LDK(KP414213562), TM, TN);
+		    TC = VADD(Ta, Td);
+		    Te = VSUB(Ta, Td);
+		    TP = VSUB(Tf, Tg);
+		    Th = VADD(Tf, Tg);
+		    TQ = VSUB(Tj, Ti);
+		    Tk = VADD(Ti, Tj);
+		    TW = VFNMS(LDK(KP707106781), TV, TU);
+		    T16 = VFMA(LDK(KP707106781), TV, TU);
+	       }
+	       {
+		    V TY, TR, Tl, TD;
+		    TY = VFMA(LDK(KP414213562), TP, TQ);
+		    TR = VFNMS(LDK(KP414213562), TQ, TP);
+		    Tl = VSUB(Th, Tk);
+		    TD = VADD(Th, Tk);
+		    {
+			 V TS, T17, TZ, T14;
+			 TS = VADD(TO, TR);
+			 T17 = VSUB(TR, TO);
+			 TZ = VSUB(TX, TY);
+			 T14 = VADD(TX, TY);
+			 {
+			      V TE, TG, Tm, Tv;
+			      TE = VADD(TC, TD);
+			      TG = VSUB(TD, TC);
+			      Tm = VADD(Te, Tl);
+			      Tv = VSUB(Tl, Te);
+			      {
+				   V T18, T1a, TT, T11;
+				   T18 = VFNMS(LDK(KP923879532), T17, T16);
+				   T1a = VFMA(LDK(KP923879532), T17, T16);
+				   TT = VFNMS(LDK(KP923879532), TS, TL);
+				   T11 = VFMA(LDK(KP923879532), TS, TL);
+				   {
+					V T15, T19, T10, T12;
+					T15 = VFNMS(LDK(KP923879532), T14, T13);
+					T19 = VFMA(LDK(KP923879532), T14, T13);
+					T10 = VFNMS(LDK(KP923879532), TZ, TW);
+					T12 = VFMA(LDK(KP923879532), TZ, TW);
+					{
+					     V T1b, T1c, T1d, T1e;
+					     T1b = VFMAI(TG, TF);
+					     STM2(&(xo[8]), T1b, ovs, &(xo[0]));
+					     T1c = VFNMSI(TG, TF);
+					     STM2(&(xo[24]), T1c, ovs, &(xo[0]));
+					     T1d = VADD(TB, TE);
+					     STM2(&(xo[0]), T1d, ovs, &(xo[0]));
+					     T1e = VSUB(TB, TE);
+					     STM2(&(xo[16]), T1e, ovs, &(xo[0]));
+					     {
+						  V Tw, Ty, Tn, Tx;
+						  Tw = VFNMS(LDK(KP707106781), Tv, Tu);
+						  Ty = VFMA(LDK(KP707106781), Tv, Tu);
+						  Tn = VFNMS(LDK(KP707106781), Tm, T7);
+						  Tx = VFMA(LDK(KP707106781), Tm, T7);
+						  {
+						       V T1f, T1g, T1h, T1i;
+						       T1f = VFMAI(T1a, T19);
+						       STM2(&(xo[6]), T1f, ovs, &(xo[2]));
+						       T1g = VFNMSI(T1a, T19);
+						       STM2(&(xo[26]), T1g, ovs, &(xo[2]));
+						       STN2(&(xo[24]), T1c, T1g, ovs);
+						       T1h = VFMAI(T18, T15);
+						       STM2(&(xo[22]), T1h, ovs, &(xo[2]));
+						       T1i = VFNMSI(T18, T15);
+						       STM2(&(xo[10]), T1i, ovs, &(xo[2]));
+						       STN2(&(xo[8]), T1b, T1i, ovs);
+						       {
+							    V T1j, T1k, T1l, T1m;
+							    T1j = VFNMSI(T12, T11);
+							    STM2(&(xo[2]), T1j, ovs, &(xo[2]));
+							    STN2(&(xo[0]), T1d, T1j, ovs);
+							    T1k = VFMAI(T12, T11);
+							    STM2(&(xo[30]), T1k, ovs, &(xo[2]));
+							    T1l = VFMAI(T10, TT);
+							    STM2(&(xo[14]), T1l, ovs, &(xo[2]));
+							    T1m = VFNMSI(T10, TT);
+							    STM2(&(xo[18]), T1m, ovs, &(xo[2]));
+							    STN2(&(xo[16]), T1e, T1m, ovs);
+							    {
+								 V T1n, T1o, T1p, T1q;
+								 T1n = VFNMSI(Ty, Tx);
+								 STM2(&(xo[28]), T1n, ovs, &(xo[0]));
+								 STN2(&(xo[28]), T1n, T1k, ovs);
+								 T1o = VFMAI(Ty, Tx);
+								 STM2(&(xo[4]), T1o, ovs, &(xo[0]));
+								 STN2(&(xo[4]), T1o, T1f, ovs);
+								 T1p = VFMAI(Tw, Tn);
+								 STM2(&(xo[20]), T1p, ovs, &(xo[0]));
+								 STN2(&(xo[20]), T1p, T1h, ovs);
+								 T1q = VFNMSI(Tw, Tn);
+								 STM2(&(xo[12]), T1q, ovs, &(xo[0]));
+								 STN2(&(xo[12]), T1q, T1l, ovs);
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2fv_16"), {38, 0, 34, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_16) (planner *p) {
+     X(kdft_register) (p, n2fv_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2fv_16 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 72 FP additions, 12 FP multiplications,
+ * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
+ * 38 stack variables, 3 constants, and 40 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx;
+	       V TQ;
+	       {
+		    V Tn, To, TM, Ts, Tt, TL;
+		    Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+		    TM = VADD(Tn, To);
+		    Ts = LD(&(xi[0]), ivs, &(xi[0]));
+		    Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+		    TL = VADD(Ts, Tt);
+		    Tp = VSUB(Tn, To);
+		    T13 = VADD(TL, TM);
+		    Tu = VSUB(Ts, Tt);
+		    TN = VSUB(TL, TM);
+	       }
+	       {
+		    V Ti, TW, Tl, TX;
+		    {
+			 V Tg, Th, Tj, Tk;
+			 Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Ti = VSUB(Tg, Th);
+			 TW = VADD(Tg, Th);
+			 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 TX = VADD(Tj, Tk);
+		    }
+		    Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
+		    T14 = VADD(TX, TW);
+		    Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
+		    TY = VSUB(TW, TX);
+	       }
+	       {
+		    V T3, TR, T6, TS;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T3 = VSUB(T1, T2);
+			 TR = VADD(T1, T2);
+			 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 TS = VADD(T4, T5);
+		    }
+		    T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
+		    T17 = VADD(TR, TS);
+		    Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
+		    TT = VSUB(TR, TS);
+	       }
+	       {
+		    V Ta, TO, Td, TP;
+		    {
+			 V T8, T9, Tb, Tc;
+			 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 Ta = VSUB(T8, T9);
+			 TO = VADD(T8, T9);
+			 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Td = VSUB(Tb, Tc);
+			 TP = VADD(Tb, Tc);
+		    }
+		    Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
+		    T16 = VADD(TO, TP);
+		    Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
+		    TQ = VSUB(TO, TP);
+	       }
+	       {
+		    V T1b, T1c, T1d, T1e;
+		    {
+			 V T15, T18, T19, T1a;
+			 T15 = VADD(T13, T14);
+			 T18 = VADD(T16, T17);
+			 T1b = VSUB(T15, T18);
+			 STM2(&(xo[16]), T1b, ovs, &(xo[0]));
+			 T1c = VADD(T15, T18);
+			 STM2(&(xo[0]), T1c, ovs, &(xo[0]));
+			 T19 = VSUB(T13, T14);
+			 T1a = VBYI(VSUB(T17, T16));
+			 T1d = VSUB(T19, T1a);
+			 STM2(&(xo[24]), T1d, ovs, &(xo[0]));
+			 T1e = VADD(T19, T1a);
+			 STM2(&(xo[8]), T1e, ovs, &(xo[0]));
+		    }
+		    {
+			 V T1f, T1g, T1h, T1i;
+			 {
+			      V TV, T11, T10, T12, TU, TZ;
+			      TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
+			      TV = VADD(TN, TU);
+			      T11 = VSUB(TN, TU);
+			      TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
+			      T10 = VBYI(VADD(TY, TZ));
+			      T12 = VBYI(VSUB(TZ, TY));
+			      T1f = VSUB(TV, T10);
+			      STM2(&(xo[28]), T1f, ovs, &(xo[0]));
+			      T1g = VADD(T11, T12);
+			      STM2(&(xo[12]), T1g, ovs, &(xo[0]));
+			      T1h = VADD(TV, T10);
+			      STM2(&(xo[4]), T1h, ovs, &(xo[0]));
+			      T1i = VSUB(T11, T12);
+			      STM2(&(xo[20]), T1i, ovs, &(xo[0]));
+			 }
+			 {
+			      V Tr, TB, TA, TC;
+			      {
+				   V Tf, Tq, Tw, Tz;
+				   Tf = VSUB(T7, Te);
+				   Tq = VSUB(Tm, Tp);
+				   Tr = VBYI(VSUB(Tf, Tq));
+				   TB = VBYI(VADD(Tq, Tf));
+				   Tw = VADD(Tu, Tv);
+				   Tz = VADD(Tx, Ty);
+				   TA = VSUB(Tw, Tz);
+				   TC = VADD(Tw, Tz);
+			      }
+			      {
+				   V T1j, T1k, T1l, T1m;
+				   T1j = VADD(Tr, TA);
+				   STM2(&(xo[14]), T1j, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T1g, T1j, ovs);
+				   T1k = VSUB(TC, TB);
+				   STM2(&(xo[30]), T1k, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T1f, T1k, ovs);
+				   T1l = VSUB(TA, Tr);
+				   STM2(&(xo[18]), T1l, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T1b, T1l, ovs);
+				   T1m = VADD(TB, TC);
+				   STM2(&(xo[2]), T1m, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T1c, T1m, ovs);
+			      }
+			 }
+			 {
+			      V TF, TJ, TI, TK;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = VSUB(Tu, Tv);
+				   TE = VADD(Te, T7);
+				   TF = VADD(TD, TE);
+				   TJ = VSUB(TD, TE);
+				   TG = VADD(Tp, Tm);
+				   TH = VSUB(Ty, Tx);
+				   TI = VBYI(VADD(TG, TH));
+				   TK = VBYI(VSUB(TH, TG));
+			      }
+			      {
+				   V T1n, T1o, T1p, T1q;
+				   T1n = VSUB(TF, TI);
+				   STM2(&(xo[26]), T1n, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1d, T1n, ovs);
+				   T1o = VADD(TJ, TK);
+				   STM2(&(xo[10]), T1o, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T1e, T1o, ovs);
+				   T1p = VADD(TF, TI);
+				   STM2(&(xo[6]), T1p, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1h, T1p, ovs);
+				   T1q = VSUB(TJ, TK);
+				   STM2(&(xo[22]), T1q, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1i, T1q, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2fv_16"), {68, 8, 4, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_16) (planner *p) {
+     X(kdft_register) (p, n2fv_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:21 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name n2fv_2 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 5 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2, T3, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VADD(T1, T2);
+	       STM2(&(xo[0]), T3, ovs, &(xo[0]));
+	       T4 = VSUB(T1, T2);
+	       STM2(&(xo[2]), T4, ovs, &(xo[2]));
+	       STN2(&(xo[0]), T3, T4, ovs);
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n2fv_2"), {2, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_2) (planner *p) {
+     X(kdft_register) (p, n2fv_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name n2fv_2 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 5 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(4, is), MAKE_VOLATILE_STRIDE(4, os)) {
+	       V T1, T2, T3, T4;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       STM2(&(xo[2]), T3, ovs, &(xo[2]));
+	       T4 = VADD(T1, T2);
+	       STM2(&(xo[0]), T4, ovs, &(xo[0]));
+	       STN2(&(xo[0]), T4, T3, ovs);
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 2, XSIMD_STRING("n2fv_2"), {2, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_2) (planner *p) {
+     X(kdft_register) (p, n2fv_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:28 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 104 FP additions, 50 FP multiplications,
+ * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
+ * 79 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T1H, T1I, TU, TI, TP, TX, T1M, T1N, T1O, T1P, T1R, T1S, TM, TW, TT;
+	       V TF;
+	       {
+		    V T3, Tm, T1r, T13, Ta, TN, TH, TA, TG, Tt, Th, TO, T1u, T1C, T1n;
+		    V T1a, T1m, T1h, T1x, T1D, TE, Ti;
+		    {
+			 V T1, T2, Tk, Tl;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 {
+			      V T14, T6, T1c, Tw, Tn, T1f, Tz, T17, T9, To, Tq, T1b, Td, Tr, Te;
+			      V Tf, T15, Tp;
+			      {
+				   V Tx, Ty, T7, T8, Tb, Tc;
+				   {
+					V T4, T5, Tu, Tv, T11, T12;
+					T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+					Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+					Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					T3 = VSUB(T1, T2);
+					T11 = VADD(T1, T2);
+					Tm = VSUB(Tk, Tl);
+					T12 = VADD(Tk, Tl);
+					T14 = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1c = VADD(Tu, Tv);
+					Tw = VSUB(Tu, Tv);
+					Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+					T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+					T1r = VADD(T11, T12);
+					T13 = VSUB(T11, T12);
+				   }
+				   Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+				   Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T1f = VADD(Tx, Ty);
+				   Tz = VSUB(Tx, Ty);
+				   T17 = VADD(T7, T8);
+				   T9 = VSUB(T7, T8);
+				   To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+				   Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   T1b = VADD(Tb, Tc);
+				   Td = VSUB(Tb, Tc);
+				   Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+				   Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			      }
+			      Ta = VADD(T6, T9);
+			      TN = VSUB(T6, T9);
+			      T15 = VADD(Tn, To);
+			      Tp = VSUB(Tn, To);
+			      TH = VSUB(Tz, Tw);
+			      TA = VADD(Tw, Tz);
+			      {
+				   V T1d, T1v, T18, Ts, T1e, Tg, T16, T1s;
+				   T1d = VSUB(T1b, T1c);
+				   T1v = VADD(T1b, T1c);
+				   T18 = VADD(Tq, Tr);
+				   Ts = VSUB(Tq, Tr);
+				   T1e = VADD(Te, Tf);
+				   Tg = VSUB(Te, Tf);
+				   T16 = VSUB(T14, T15);
+				   T1s = VADD(T14, T15);
+				   {
+					V T1t, T19, T1w, T1g;
+					T1t = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					TG = VSUB(Ts, Tp);
+					Tt = VADD(Tp, Ts);
+					T1w = VADD(T1e, T1f);
+					T1g = VSUB(T1e, T1f);
+					Th = VADD(Td, Tg);
+					TO = VSUB(Td, Tg);
+					T1u = VADD(T1s, T1t);
+					T1C = VSUB(T1s, T1t);
+					T1n = VSUB(T16, T19);
+					T1a = VADD(T16, T19);
+					T1m = VSUB(T1d, T1g);
+					T1h = VADD(T1d, T1g);
+					T1x = VADD(T1v, T1w);
+					T1D = VSUB(T1v, T1w);
+				   }
+			      }
+			 }
+		    }
+		    TE = VSUB(Ta, Th);
+		    Ti = VADD(Ta, Th);
+		    {
+			 V TL, T1k, T1A, Tj, TD, T1E, T1G, TK, TC, T1j, T1z, T1i, T1y, TB;
+			 TL = VSUB(TA, Tt);
+			 TB = VADD(Tt, TA);
+			 T1i = VADD(T1a, T1h);
+			 T1k = VSUB(T1a, T1h);
+			 T1y = VADD(T1u, T1x);
+			 T1A = VSUB(T1u, T1x);
+			 Tj = VADD(T3, Ti);
+			 TD = VFNMS(LDK(KP250000000), Ti, T3);
+			 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
+			 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
+			 TK = VFNMS(LDK(KP250000000), TB, Tm);
+			 TC = VADD(Tm, TB);
+			 T1j = VFNMS(LDK(KP250000000), T1i, T13);
+			 T1H = VADD(T1r, T1y);
+			 STM2(&(xo[0]), T1H, ovs, &(xo[0]));
+			 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
+			 T1I = VADD(T13, T1i);
+			 STM2(&(xo[20]), T1I, ovs, &(xo[0]));
+			 {
+			      V T1J, T1K, T1p, T1l, T1o, T1q, T1F, T1B, T1L, T1Q;
+			      TU = VFNMS(LDK(KP618033988), TG, TH);
+			      TI = VFMA(LDK(KP618033988), TH, TG);
+			      TP = VFMA(LDK(KP618033988), TO, TN);
+			      TX = VFNMS(LDK(KP618033988), TN, TO);
+			      T1J = VFMAI(TC, Tj);
+			      STM2(&(xo[30]), T1J, ovs, &(xo[2]));
+			      T1K = VFNMSI(TC, Tj);
+			      STM2(&(xo[10]), T1K, ovs, &(xo[2]));
+			      T1p = VFMA(LDK(KP559016994), T1k, T1j);
+			      T1l = VFNMS(LDK(KP559016994), T1k, T1j);
+			      T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
+			      T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
+			      T1F = VFNMS(LDK(KP559016994), T1A, T1z);
+			      T1B = VFMA(LDK(KP559016994), T1A, T1z);
+			      T1L = VFMAI(T1q, T1p);
+			      STM2(&(xo[28]), T1L, ovs, &(xo[0]));
+			      STN2(&(xo[28]), T1L, T1J, ovs);
+			      T1M = VFNMSI(T1q, T1p);
+			      STM2(&(xo[12]), T1M, ovs, &(xo[0]));
+			      T1N = VFNMSI(T1o, T1l);
+			      STM2(&(xo[36]), T1N, ovs, &(xo[0]));
+			      T1O = VFMAI(T1o, T1l);
+			      STM2(&(xo[4]), T1O, ovs, &(xo[0]));
+			      T1P = VFNMSI(T1E, T1B);
+			      STM2(&(xo[32]), T1P, ovs, &(xo[0]));
+			      T1Q = VFMAI(T1E, T1B);
+			      STM2(&(xo[8]), T1Q, ovs, &(xo[0]));
+			      STN2(&(xo[8]), T1Q, T1K, ovs);
+			      T1R = VFMAI(T1G, T1F);
+			      STM2(&(xo[24]), T1R, ovs, &(xo[0]));
+			      T1S = VFNMSI(T1G, T1F);
+			      STM2(&(xo[16]), T1S, ovs, &(xo[0]));
+			      TM = VFNMS(LDK(KP559016994), TL, TK);
+			      TW = VFMA(LDK(KP559016994), TL, TK);
+			      TT = VFNMS(LDK(KP559016994), TE, TD);
+			      TF = VFMA(LDK(KP559016994), TE, TD);
+			 }
+		    }
+	       }
+	       {
+		    V T10, TY, TQ, TS, TJ, TR, TZ, TV;
+		    T10 = VFMA(LDK(KP951056516), TX, TW);
+		    TY = VFNMS(LDK(KP951056516), TX, TW);
+		    TQ = VFMA(LDK(KP951056516), TP, TM);
+		    TS = VFNMS(LDK(KP951056516), TP, TM);
+		    TJ = VFMA(LDK(KP951056516), TI, TF);
+		    TR = VFNMS(LDK(KP951056516), TI, TF);
+		    TZ = VFMA(LDK(KP951056516), TU, TT);
+		    TV = VFNMS(LDK(KP951056516), TU, TT);
+		    {
+			 V T1T, T1U, T1V, T1W;
+			 T1T = VFMAI(TS, TR);
+			 STM2(&(xo[22]), T1T, ovs, &(xo[2]));
+			 STN2(&(xo[20]), T1I, T1T, ovs);
+			 T1U = VFNMSI(TS, TR);
+			 STM2(&(xo[18]), T1U, ovs, &(xo[2]));
+			 STN2(&(xo[16]), T1S, T1U, ovs);
+			 T1V = VFMAI(TQ, TJ);
+			 STM2(&(xo[38]), T1V, ovs, &(xo[2]));
+			 STN2(&(xo[36]), T1N, T1V, ovs);
+			 T1W = VFNMSI(TQ, TJ);
+			 STM2(&(xo[2]), T1W, ovs, &(xo[2]));
+			 STN2(&(xo[0]), T1H, T1W, ovs);
+			 {
+			      V T1X, T1Y, T1Z, T20;
+			      T1X = VFMAI(TY, TV);
+			      STM2(&(xo[6]), T1X, ovs, &(xo[2]));
+			      STN2(&(xo[4]), T1O, T1X, ovs);
+			      T1Y = VFNMSI(TY, TV);
+			      STM2(&(xo[34]), T1Y, ovs, &(xo[2]));
+			      STN2(&(xo[32]), T1P, T1Y, ovs);
+			      T1Z = VFMAI(T10, TZ);
+			      STM2(&(xo[14]), T1Z, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T1M, T1Z, ovs);
+			      T20 = VFNMSI(T10, TZ);
+			      STM2(&(xo[26]), T20, ovs, &(xo[2]));
+			      STN2(&(xo[24]), T1R, T20, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_20) (planner *p) {
+     X(kdft_register) (p, n2fv_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 104 FP additions, 24 FP multiplications,
+ * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
+ * 57 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
+	       V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
+	       V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL;
+	       {
+		    V T1, T2, T1g, Tk, Tl, T1h;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+		    T1g = VADD(T1, T2);
+		    Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+		    T1h = VADD(Tk, Tl);
+		    T3 = VSUB(T1, T2);
+		    T1B = VADD(T1g, T1h);
+		    Tm = VSUB(Tk, Tl);
+		    T1i = VSUB(T1g, T1h);
+	       }
+	       {
+		    V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
+		    V T14;
+		    {
+			 V T4, T5, Tu, Tv;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T18 = VADD(T4, T5);
+			 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = VSUB(Tu, Tv);
+			 T12 = VADD(Tu, Tv);
+		    }
+		    {
+			 V Tx, Ty, T7, T8;
+			 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 Tz = VSUB(Tx, Ty);
+			 T15 = VADD(Tx, Ty);
+			 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T1b = VADD(T7, T8);
+		    }
+		    {
+			 V Tb, Tc, Tn, To;
+			 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Td = VSUB(Tb, Tc);
+			 T11 = VADD(Tb, Tc);
+			 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 Tp = VSUB(Tn, To);
+			 T19 = VADD(Tn, To);
+		    }
+		    {
+			 V Tq, Tr, Te, Tf;
+			 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 Ts = VSUB(Tq, Tr);
+			 T1c = VADD(Tq, Tr);
+			 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Tg = VSUB(Te, Tf);
+			 T14 = VADD(Te, Tf);
+		    }
+		    TG = VSUB(Ts, Tp);
+		    TN = VSUB(T6, T9);
+		    TO = VSUB(Td, Tg);
+		    TH = VSUB(Tz, Tw);
+		    T13 = VSUB(T11, T12);
+		    T16 = VSUB(T14, T15);
+		    T1k = VADD(T13, T16);
+		    T1u = VADD(T11, T12);
+		    T1v = VADD(T14, T15);
+		    T1z = VADD(T1u, T1v);
+		    T1r = VADD(T18, T19);
+		    T1s = VADD(T1b, T1c);
+		    T1y = VADD(T1r, T1s);
+		    T1a = VSUB(T18, T19);
+		    T1d = VSUB(T1b, T1c);
+		    T1j = VADD(T1a, T1d);
+		    {
+			 V Ta, Th, Tt, TA;
+			 Ta = VADD(T6, T9);
+			 Th = VADD(Td, Tg);
+			 Ti = VADD(Ta, Th);
+			 TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
+			 Tt = VADD(Tp, Ts);
+			 TA = VADD(Tw, Tz);
+			 TB = VADD(Tt, TA);
+			 TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
+		    }
+	       }
+	       {
+		    V T1I, T1J, T1K, T1L, T1N, T1H, Tj, TC;
+		    Tj = VADD(T3, Ti);
+		    TC = VBYI(VADD(Tm, TB));
+		    T1H = VSUB(Tj, TC);
+		    STM2(&(xo[10]), T1H, ovs, &(xo[2]));
+		    T1I = VADD(Tj, TC);
+		    STM2(&(xo[30]), T1I, ovs, &(xo[2]));
+		    {
+			 V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E, T1M;
+			 T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
+			 T1C = VADD(T1y, T1z);
+			 T1D = VFNMS(LDK(KP250000000), T1C, T1B);
+			 T1t = VSUB(T1r, T1s);
+			 T1w = VSUB(T1u, T1v);
+			 T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
+			 T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
+			 T1J = VADD(T1B, T1C);
+			 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
+			 T1F = VSUB(T1D, T1A);
+			 T1K = VSUB(T1F, T1G);
+			 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
+			 T1L = VADD(T1G, T1F);
+			 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
+			 T1E = VADD(T1A, T1D);
+			 T1M = VADD(T1x, T1E);
+			 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
+			 STN2(&(xo[8]), T1M, T1H, ovs);
+			 T1N = VSUB(T1E, T1x);
+			 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
+		    }
+		    {
+			 V T1O, T1P, T1R, T1S;
+			 {
+			      V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1Q, T1o;
+			      T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
+			      T1l = VADD(T1j, T1k);
+			      T1m = VFNMS(LDK(KP250000000), T1l, T1i);
+			      T17 = VSUB(T13, T16);
+			      T1e = VSUB(T1a, T1d);
+			      T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
+			      T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
+			      T1O = VADD(T1i, T1l);
+			      STM2(&(xo[20]), T1O, ovs, &(xo[0]));
+			      T1p = VADD(T1n, T1m);
+			      T1P = VSUB(T1p, T1q);
+			      STM2(&(xo[12]), T1P, ovs, &(xo[0]));
+			      T1Q = VADD(T1q, T1p);
+			      STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
+			      STN2(&(xo[28]), T1Q, T1I, ovs);
+			      T1o = VSUB(T1m, T1n);
+			      T1R = VADD(T1f, T1o);
+			      STM2(&(xo[4]), T1R, ovs, &(xo[0]));
+			      T1S = VSUB(T1o, T1f);
+			      STM2(&(xo[36]), T1S, ovs, &(xo[0]));
+			 }
+			 {
+			      V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
+			      TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
+			      TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
+			      TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
+			      TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
+			      TK = VFMS(LDK(KP250000000), TB, Tm);
+			      TM = VADD(TK, TL);
+			      TW = VSUB(TL, TK);
+			      TE = VFNMS(LDK(KP250000000), Ti, T3);
+			      TF = VADD(TD, TE);
+			      TT = VSUB(TE, TD);
+			      {
+				   V TJ, TQ, T1T, T1U;
+				   TJ = VADD(TF, TI);
+				   TQ = VBYI(VSUB(TM, TP));
+				   T1T = VSUB(TJ, TQ);
+				   STM2(&(xo[38]), T1T, ovs, &(xo[2]));
+				   STN2(&(xo[36]), T1S, T1T, ovs);
+				   T1U = VADD(TJ, TQ);
+				   STM2(&(xo[2]), T1U, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T1J, T1U, ovs);
+			      }
+			      {
+				   V TZ, T10, T1V, T1W;
+				   TZ = VADD(TT, TU);
+				   T10 = VBYI(VADD(TX, TW));
+				   T1V = VSUB(TZ, T10);
+				   STM2(&(xo[26]), T1V, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T1L, T1V, ovs);
+				   T1W = VADD(TZ, T10);
+				   STM2(&(xo[14]), T1W, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T1P, T1W, ovs);
+			      }
+			      {
+				   V TR, TS, T1X, T1Y;
+				   TR = VSUB(TF, TI);
+				   TS = VBYI(VADD(TP, TM));
+				   T1X = VSUB(TR, TS);
+				   STM2(&(xo[22]), T1X, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T1O, T1X, ovs);
+				   T1Y = VADD(TR, TS);
+				   STM2(&(xo[18]), T1Y, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T1K, T1Y, ovs);
+			      }
+			      {
+				   V TV, TY, T1Z, T20;
+				   TV = VSUB(TT, TU);
+				   TY = VBYI(VSUB(TW, TX));
+				   T1Z = VSUB(TV, TY);
+				   STM2(&(xo[34]), T1Z, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T1N, T1Z, ovs);
+				   T20 = VADD(TV, TY);
+				   STM2(&(xo[6]), T20, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T1R, T20, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_20) (planner *p) {
+     X(kdft_register) (p, n2fv_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:24 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2fv_32 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 186 FP additions, 98 FP multiplications,
+ * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
+ * 120 stack variables, 7 constants, and 80 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c, T1h, Tr, T3d;
+	       V T3e, T3f, T3g, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z;
+	       V T1c, TZ;
+	       {
+		    V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2N, T2H, T2b, T2t, TY;
+		    V T1w, TT, T1v, T20, T2C, Tj, Te, T2h, To, T2f, T23, T2D, TB, TG, Th;
+		    V T2i, Tk;
+		    {
+			 V TL, TW, TP, TQ, T2F, T27, T28, TO;
+			 {
+			      V T1, T2, T12, T13, T4, T5, T7, T8;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			      T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			      T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			      {
+				   V TM, T25, T26, TN;
+				   {
+					V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
+					TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					T1R = VADD(T1, T2);
+					T3 = VSUB(T1, T2);
+					T1S = VADD(T12, T13);
+					T14 = VSUB(T12, T13);
+					T1U = VADD(T4, T5);
+					T6 = VSUB(T4, T5);
+					T1V = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+					T2x = VSUB(T1R, T1S);
+					T1T = VADD(T1R, T1S);
+					TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+					TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+					T2K = VSUB(T1V, T1U);
+					T1W = VADD(T1U, T1V);
+					Ta = VADD(T6, T9);
+					T15 = VSUB(T9, T6);
+					T25 = VADD(TJ, TK);
+					TL = VSUB(TJ, TK);
+					T26 = VADD(TV, TU);
+					TW = VSUB(TU, TV);
+					TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+					TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+					T1p = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					T1A = VFMA(LDK(KP707106781), T15, T14);
+					T16 = VFNMS(LDK(KP707106781), T15, T14);
+					TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+				   }
+				   T2F = VSUB(T25, T26);
+				   T27 = VADD(T25, T26);
+				   T28 = VADD(TM, TN);
+				   TO = VSUB(TM, TN);
+			      }
+			 }
+			 {
+			      V Ty, T21, Tx, Tz, T1Y, T1Z;
+			      {
+				   V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
+				   Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+				   Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+				   TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+				   T29 = VADD(TP, TQ);
+				   TR = VSUB(TP, TQ);
+				   TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+				   Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+				   Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+				   Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+				   T1Y = VADD(Ts, Tt);
+				   Tu = VSUB(Ts, Tt);
+				   {
+					V T2G, T2a, TX, TS;
+					T2G = VSUB(T29, T28);
+					T2a = VADD(T28, T29);
+					TX = VSUB(TR, TO);
+					TS = VADD(TO, TR);
+					T1Z = VADD(TD, TE);
+					TF = VSUB(TD, TE);
+					T21 = VADD(Tv, Tw);
+					Tx = VSUB(Tv, Tw);
+					T2N = VFMA(LDK(KP414213562), T2F, T2G);
+					T2H = VFNMS(LDK(KP414213562), T2G, T2F);
+					T2b = VSUB(T27, T2a);
+					T2t = VADD(T27, T2a);
+					TY = VFMA(LDK(KP707106781), TX, TW);
+					T1w = VFNMS(LDK(KP707106781), TX, TW);
+					TT = VFMA(LDK(KP707106781), TS, TL);
+					T1v = VFNMS(LDK(KP707106781), TS, TL);
+					Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+				   }
+			      }
+			      T20 = VADD(T1Y, T1Z);
+			      T2C = VSUB(T1Y, T1Z);
+			      {
+				   V Tc, Td, Tm, Tn;
+				   Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+				   Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+				   Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   {
+					V Tf, TA, T22, Tg;
+					Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					TA = VSUB(Ty, Tz);
+					T22 = VADD(Ty, Tz);
+					Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+					Te = VSUB(Tc, Td);
+					T2h = VADD(Tc, Td);
+					To = VSUB(Tm, Tn);
+					T2f = VADD(Tn, Tm);
+					T23 = VADD(T21, T22);
+					T2D = VSUB(T21, T22);
+					TB = VADD(Tx, TA);
+					TG = VSUB(Tx, TA);
+					Th = VSUB(Tf, Tg);
+					T2i = VADD(Tf, Tg);
+					Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
+			 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
+			 {
+			      V T1X, T2p, T2E, T2O, T2s, T2y, T2j, T17, Ti, T2e, Tl, T2c, T2l, T24;
+			      T1X = VSUB(T1T, T1W);
+			      T2p = VADD(T1T, T1W);
+			      T2E = VFNMS(LDK(KP414213562), T2D, T2C);
+			      T2O = VFMA(LDK(KP414213562), T2C, T2D);
+			      T2s = VADD(T20, T23);
+			      T24 = VSUB(T20, T23);
+			      T1t = VFNMS(LDK(KP707106781), TG, TF);
+			      TH = VFMA(LDK(KP707106781), TG, TF);
+			      T1s = VFNMS(LDK(KP707106781), TB, Tu);
+			      TC = VFMA(LDK(KP707106781), TB, Tu);
+			      T2y = VSUB(T2h, T2i);
+			      T2j = VADD(T2h, T2i);
+			      T17 = VFMA(LDK(KP414213562), Te, Th);
+			      Ti = VFNMS(LDK(KP414213562), Th, Te);
+			      T2e = VADD(Tj, Tk);
+			      Tl = VSUB(Tj, Tk);
+			      T2c = VADD(T24, T2b);
+			      T2l = VSUB(T2b, T24);
+			      {
+				   V T2L, T2A, T2q, T2k;
+				   T2P = VSUB(T2N, T2O);
+				   T2U = VADD(T2O, T2N);
+				   {
+					V T2z, T2g, T18, Tp;
+					T2z = VSUB(T2e, T2f);
+					T2g = VADD(T2e, T2f);
+					T18 = VFMA(LDK(KP414213562), Tl, To);
+					Tp = VFNMS(LDK(KP414213562), To, Tl);
+					T2n = VFMA(LDK(KP707106781), T2c, T1X);
+					T2d = VFNMS(LDK(KP707106781), T2c, T1X);
+					T2w = VSUB(T2t, T2s);
+					T2u = VADD(T2s, T2t);
+					T2L = VSUB(T2z, T2y);
+					T2A = VADD(T2y, T2z);
+					T2q = VADD(T2j, T2g);
+					T2k = VSUB(T2g, T2j);
+					T1q = VADD(T17, T18);
+					T19 = VSUB(T17, T18);
+					T1B = VSUB(Tp, Ti);
+					Tq = VADD(Ti, Tp);
+				   }
+				   T2W = VFNMS(LDK(KP707106781), T2L, T2K);
+				   T2M = VFMA(LDK(KP707106781), T2L, T2K);
+				   T2B = VFMA(LDK(KP707106781), T2A, T2x);
+				   T2T = VFNMS(LDK(KP707106781), T2A, T2x);
+				   T2v = VSUB(T2p, T2q);
+				   T2r = VADD(T2p, T2q);
+				   T2o = VFMA(LDK(KP707106781), T2l, T2k);
+				   T2m = VFNMS(LDK(KP707106781), T2l, T2k);
+				   T2X = VSUB(T2H, T2E);
+				   T2I = VADD(T2E, T2H);
+			      }
+			 }
+			 {
+			      V T2V, T2Z, T2Y, T30, T2R, T2J;
+			      T2V = VFNMS(LDK(KP923879532), T2U, T2T);
+			      T2Z = VFMA(LDK(KP923879532), T2U, T2T);
+			      T31 = VFNMSI(T2w, T2v);
+			      STM2(&(xo[48]), T31, ovs, &(xo[0]));
+			      T32 = VFMAI(T2w, T2v);
+			      STM2(&(xo[16]), T32, ovs, &(xo[0]));
+			      T33 = VADD(T2r, T2u);
+			      STM2(&(xo[0]), T33, ovs, &(xo[0]));
+			      T34 = VSUB(T2r, T2u);
+			      STM2(&(xo[32]), T34, ovs, &(xo[0]));
+			      T35 = VFNMSI(T2o, T2n);
+			      STM2(&(xo[56]), T35, ovs, &(xo[0]));
+			      T36 = VFMAI(T2o, T2n);
+			      STM2(&(xo[8]), T36, ovs, &(xo[0]));
+			      T37 = VFMAI(T2m, T2d);
+			      STM2(&(xo[40]), T37, ovs, &(xo[0]));
+			      T38 = VFNMSI(T2m, T2d);
+			      STM2(&(xo[24]), T38, ovs, &(xo[0]));
+			      T2Y = VFMA(LDK(KP923879532), T2X, T2W);
+			      T30 = VFNMS(LDK(KP923879532), T2X, T2W);
+			      T2R = VFMA(LDK(KP923879532), T2I, T2B);
+			      T2J = VFNMS(LDK(KP923879532), T2I, T2B);
+			      {
+				   V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
+				   T1J = VFNMS(LDK(KP923879532), T1q, T1p);
+				   T1r = VFMA(LDK(KP923879532), T1q, T1p);
+				   T1C = VFMA(LDK(KP923879532), T1B, T1A);
+				   T1M = VFNMS(LDK(KP923879532), T1B, T1A);
+				   T39 = VFNMSI(T30, T2Z);
+				   STM2(&(xo[12]), T39, ovs, &(xo[0]));
+				   T3a = VFMAI(T30, T2Z);
+				   STM2(&(xo[52]), T3a, ovs, &(xo[0]));
+				   T3b = VFNMSI(T2Y, T2V);
+				   STM2(&(xo[44]), T3b, ovs, &(xo[0]));
+				   T3c = VFMAI(T2Y, T2V);
+				   STM2(&(xo[20]), T3c, ovs, &(xo[0]));
+				   T2S = VFMA(LDK(KP923879532), T2P, T2M);
+				   T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
+				   T1u = VFMA(LDK(KP668178637), T1t, T1s);
+				   T1D = VFNMS(LDK(KP668178637), T1s, T1t);
+				   T1E = VFNMS(LDK(KP668178637), T1v, T1w);
+				   T1x = VFMA(LDK(KP668178637), T1w, T1v);
+				   {
+					V T1K, T1F, T1N, T1y;
+					T1h = VFNMS(LDK(KP923879532), Tq, Tb);
+					Tr = VFMA(LDK(KP923879532), Tq, Tb);
+					T3d = VFNMSI(T2S, T2R);
+					STM2(&(xo[60]), T3d, ovs, &(xo[0]));
+					T3e = VFMAI(T2S, T2R);
+					STM2(&(xo[4]), T3e, ovs, &(xo[0]));
+					T3f = VFMAI(T2Q, T2J);
+					STM2(&(xo[36]), T3f, ovs, &(xo[0]));
+					T3g = VFNMSI(T2Q, T2J);
+					STM2(&(xo[28]), T3g, ovs, &(xo[0]));
+					T1K = VADD(T1D, T1E);
+					T1F = VSUB(T1D, T1E);
+					T1N = VSUB(T1x, T1u);
+					T1y = VADD(T1u, T1x);
+					T1a = VFMA(LDK(KP923879532), T19, T16);
+					T1k = VFNMS(LDK(KP923879532), T19, T16);
+					TI = VFNMS(LDK(KP198912367), TH, TC);
+					T1b = VFMA(LDK(KP198912367), TC, TH);
+					T1L = VFMA(LDK(KP831469612), T1K, T1J);
+					T1P = VFNMS(LDK(KP831469612), T1K, T1J);
+					T1I = VFMA(LDK(KP831469612), T1F, T1C);
+					T1G = VFNMS(LDK(KP831469612), T1F, T1C);
+					T1O = VFMA(LDK(KP831469612), T1N, T1M);
+					T1Q = VFNMS(LDK(KP831469612), T1N, T1M);
+					T1H = VFMA(LDK(KP831469612), T1y, T1r);
+					T1z = VFNMS(LDK(KP831469612), T1y, T1r);
+					T1c = VFMA(LDK(KP198912367), TT, TY);
+					TZ = VFNMS(LDK(KP198912367), TY, TT);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1d, T1i, T10, T1l;
+		    {
+			 V T3h, T3i, T3j, T3k;
+			 T3h = VFNMSI(T1O, T1L);
+			 STM2(&(xo[42]), T3h, ovs, &(xo[2]));
+			 STN2(&(xo[40]), T37, T3h, ovs);
+			 T3i = VFMAI(T1O, T1L);
+			 STM2(&(xo[22]), T3i, ovs, &(xo[2]));
+			 STN2(&(xo[20]), T3c, T3i, ovs);
+			 T3j = VFMAI(T1Q, T1P);
+			 STM2(&(xo[54]), T3j, ovs, &(xo[2]));
+			 STN2(&(xo[52]), T3a, T3j, ovs);
+			 T3k = VFNMSI(T1Q, T1P);
+			 STM2(&(xo[10]), T3k, ovs, &(xo[2]));
+			 STN2(&(xo[8]), T36, T3k, ovs);
+			 {
+			      V T3l, T3m, T3n, T3o;
+			      T3l = VFMAI(T1I, T1H);
+			      STM2(&(xo[6]), T3l, ovs, &(xo[2]));
+			      STN2(&(xo[4]), T3e, T3l, ovs);
+			      T3m = VFNMSI(T1I, T1H);
+			      STM2(&(xo[58]), T3m, ovs, &(xo[2]));
+			      STN2(&(xo[56]), T35, T3m, ovs);
+			      T3n = VFMAI(T1G, T1z);
+			      STM2(&(xo[38]), T3n, ovs, &(xo[2]));
+			      STN2(&(xo[36]), T3f, T3n, ovs);
+			      T3o = VFNMSI(T1G, T1z);
+			      STM2(&(xo[26]), T3o, ovs, &(xo[2]));
+			      STN2(&(xo[24]), T38, T3o, ovs);
+			      T1d = VSUB(T1b, T1c);
+			      T1i = VADD(T1b, T1c);
+			      T10 = VADD(TI, TZ);
+			      T1l = VSUB(TZ, TI);
+			 }
+		    }
+		    {
+			 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
+			 T1n = VFMA(LDK(KP980785280), T1i, T1h);
+			 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
+			 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
+			 T1g = VFMA(LDK(KP980785280), T1d, T1a);
+			 T1o = VFMA(LDK(KP980785280), T1l, T1k);
+			 T1m = VFNMS(LDK(KP980785280), T1l, T1k);
+			 T11 = VFNMS(LDK(KP980785280), T10, Tr);
+			 T1f = VFMA(LDK(KP980785280), T10, Tr);
+			 {
+			      V T3p, T3q, T3r, T3s;
+			      T3p = VFMAI(T1m, T1j);
+			      STM2(&(xo[46]), T3p, ovs, &(xo[2]));
+			      STN2(&(xo[44]), T3b, T3p, ovs);
+			      T3q = VFNMSI(T1m, T1j);
+			      STM2(&(xo[18]), T3q, ovs, &(xo[2]));
+			      STN2(&(xo[16]), T32, T3q, ovs);
+			      T3r = VFNMSI(T1o, T1n);
+			      STM2(&(xo[50]), T3r, ovs, &(xo[2]));
+			      STN2(&(xo[48]), T31, T3r, ovs);
+			      T3s = VFMAI(T1o, T1n);
+			      STM2(&(xo[14]), T3s, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T39, T3s, ovs);
+			      {
+				   V T3t, T3u, T3v, T3w;
+				   T3t = VFMAI(T1g, T1f);
+				   STM2(&(xo[62]), T3t, ovs, &(xo[2]));
+				   STN2(&(xo[60]), T3d, T3t, ovs);
+				   T3u = VFNMSI(T1g, T1f);
+				   STM2(&(xo[2]), T3u, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T33, T3u, ovs);
+				   T3v = VFMAI(T1e, T11);
+				   STM2(&(xo[30]), T3v, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T3g, T3v, ovs);
+				   T3w = VFNMSI(T1e, T11);
+				   STM2(&(xo[34]), T3w, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T34, T3w, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2fv_32"), {88, 0, 98, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_32) (planner *p) {
+     X(kdft_register) (p, n2fv_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2fv_32 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 186 FP additions, 42 FP multiplications,
+ * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
+ * 72 stack variables, 7 constants, and 80 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
+	       V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
+	       V T19, T1q, T2A, T2L;
+	       {
+		    V T3, T1R, T15, T1S, T6, T1U, T9, T1V, T12, Ta;
+		    {
+			 V T1, T2, T13, T14;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T1R = VADD(T1, T2);
+			 T13 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T14 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T15 = VSUB(T13, T14);
+			 T1S = VADD(T13, T14);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T1U = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T1V = VADD(T7, T8);
+		    }
+		    T1T = VADD(T1R, T1S);
+		    T1W = VADD(T1U, T1V);
+		    T2K = VSUB(T1V, T1U);
+		    T2x = VSUB(T1R, T1S);
+		    T12 = VMUL(LDK(KP707106781), VSUB(T9, T6));
+		    T16 = VSUB(T12, T15);
+		    T1A = VADD(T15, T12);
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tb = VADD(T3, Ta);
+		    T1p = VSUB(T3, Ta);
+	       }
+	       {
+		    V TL, T25, TX, T26, TO, T28, TR, T29;
+		    {
+			 V TJ, TK, TV, TW;
+			 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 TL = VSUB(TJ, TK);
+			 T25 = VADD(TJ, TK);
+			 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 TW = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 TX = VSUB(TV, TW);
+			 T26 = VADD(TV, TW);
+		    }
+		    {
+			 V TM, TN, TP, TQ;
+			 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			 TO = VSUB(TM, TN);
+			 T28 = VADD(TM, TN);
+			 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			 TR = VSUB(TP, TQ);
+			 T29 = VADD(TP, TQ);
+		    }
+		    {
+			 V TS, TU, T2F, T2G;
+			 TS = VMUL(LDK(KP707106781), VADD(TO, TR));
+			 TT = VADD(TL, TS);
+			 T1v = VSUB(TL, TS);
+			 TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
+			 TY = VSUB(TU, TX);
+			 T1w = VADD(TX, TU);
+			 T27 = VADD(T25, T26);
+			 T2a = VADD(T28, T29);
+			 T2b = VSUB(T27, T2a);
+			 T2F = VSUB(T25, T26);
+			 T2G = VSUB(T29, T28);
+			 T2H = VFNMS(LDK(KP382683432), T2G, VMUL(LDK(KP923879532), T2F));
+			 T2O = VFMA(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2G));
+		    }
+	       }
+	       {
+		    V Tu, T1Y, TG, T1Z, Tx, T21, TA, T22;
+		    {
+			 V Ts, Tt, TE, TF;
+			 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 Tu = VSUB(Ts, Tt);
+			 T1Y = VADD(Ts, Tt);
+			 TE = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 TF = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 TG = VSUB(TE, TF);
+			 T1Z = VADD(TE, TF);
+		    }
+		    {
+			 V Tv, Tw, Ty, Tz;
+			 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			 Tx = VSUB(Tv, Tw);
+			 T21 = VADD(Tv, Tw);
+			 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			 TA = VSUB(Ty, Tz);
+			 T22 = VADD(Ty, Tz);
+		    }
+		    {
+			 V TB, TD, T2C, T2D;
+			 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
+			 TC = VADD(Tu, TB);
+			 T1s = VSUB(Tu, TB);
+			 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
+			 TH = VSUB(TD, TG);
+			 T1t = VADD(TG, TD);
+			 T20 = VADD(T1Y, T1Z);
+			 T23 = VADD(T21, T22);
+			 T24 = VSUB(T20, T23);
+			 T2C = VSUB(T1Y, T1Z);
+			 T2D = VSUB(T22, T21);
+			 T2E = VFMA(LDK(KP923879532), T2C, VMUL(LDK(KP382683432), T2D));
+			 T2N = VFNMS(LDK(KP382683432), T2C, VMUL(LDK(KP923879532), T2D));
+		    }
+	       }
+	       {
+		    V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T2h = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T2f = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T2i = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T2e = VADD(Tj, Tk);
+		    }
+		    T2g = VADD(T2e, T2f);
+		    T2j = VADD(T2h, T2i);
+		    Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+		    Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
+		    Tq = VADD(Ti, Tp);
+		    T1B = VSUB(Tp, Ti);
+		    {
+			 V T17, T18, T2y, T2z;
+			 T17 = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T18 = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 T19 = VSUB(T17, T18);
+			 T1q = VADD(T18, T17);
+			 T2y = VSUB(T2h, T2i);
+			 T2z = VSUB(T2e, T2f);
+			 T2A = VMUL(LDK(KP707106781), VADD(T2y, T2z));
+			 T2L = VMUL(LDK(KP707106781), VSUB(T2z, T2y));
+		    }
+	       }
+	       {
+		    V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c;
+		    {
+			 V T2d, T2n, T2m, T2o;
+			 {
+			      V T1X, T2c, T2k, T2l;
+			      T1X = VSUB(T1T, T1W);
+			      T2c = VMUL(LDK(KP707106781), VADD(T24, T2b));
+			      T2d = VADD(T1X, T2c);
+			      T2n = VSUB(T1X, T2c);
+			      T2k = VSUB(T2g, T2j);
+			      T2l = VMUL(LDK(KP707106781), VSUB(T2b, T24));
+			      T2m = VBYI(VADD(T2k, T2l));
+			      T2o = VBYI(VSUB(T2l, T2k));
+			 }
+			 T31 = VSUB(T2d, T2m);
+			 STM2(&(xo[56]), T31, ovs, &(xo[0]));
+			 T32 = VADD(T2n, T2o);
+			 STM2(&(xo[24]), T32, ovs, &(xo[0]));
+			 T33 = VADD(T2d, T2m);
+			 STM2(&(xo[8]), T33, ovs, &(xo[0]));
+			 T34 = VSUB(T2n, T2o);
+			 STM2(&(xo[40]), T34, ovs, &(xo[0]));
+		    }
+		    {
+			 V T2r, T2v, T2u, T2w;
+			 {
+			      V T2p, T2q, T2s, T2t;
+			      T2p = VADD(T1T, T1W);
+			      T2q = VADD(T2j, T2g);
+			      T2r = VADD(T2p, T2q);
+			      T2v = VSUB(T2p, T2q);
+			      T2s = VADD(T20, T23);
+			      T2t = VADD(T27, T2a);
+			      T2u = VADD(T2s, T2t);
+			      T2w = VBYI(VSUB(T2t, T2s));
+			 }
+			 T35 = VSUB(T2r, T2u);
+			 STM2(&(xo[32]), T35, ovs, &(xo[0]));
+			 T36 = VADD(T2v, T2w);
+			 STM2(&(xo[16]), T36, ovs, &(xo[0]));
+			 T37 = VADD(T2r, T2u);
+			 STM2(&(xo[0]), T37, ovs, &(xo[0]));
+			 T38 = VSUB(T2v, T2w);
+			 STM2(&(xo[48]), T38, ovs, &(xo[0]));
+		    }
+		    {
+			 V T2V, T2Z, T2Y, T30;
+			 {
+			      V T2T, T2U, T2W, T2X;
+			      T2T = VSUB(T2H, T2E);
+			      T2U = VSUB(T2L, T2K);
+			      T2V = VBYI(VSUB(T2T, T2U));
+			      T2Z = VBYI(VADD(T2U, T2T));
+			      T2W = VSUB(T2x, T2A);
+			      T2X = VSUB(T2O, T2N);
+			      T2Y = VSUB(T2W, T2X);
+			      T30 = VADD(T2W, T2X);
+			 }
+			 T39 = VADD(T2V, T2Y);
+			 STM2(&(xo[20]), T39, ovs, &(xo[0]));
+			 T3a = VSUB(T30, T2Z);
+			 STM2(&(xo[52]), T3a, ovs, &(xo[0]));
+			 T3b = VSUB(T2Y, T2V);
+			 STM2(&(xo[44]), T3b, ovs, &(xo[0]));
+			 T3c = VADD(T2Z, T30);
+			 STM2(&(xo[12]), T3c, ovs, &(xo[0]));
+		    }
+		    {
+			 V T3d, T3e, T3f, T3g;
+			 {
+			      V T2J, T2R, T2Q, T2S;
+			      {
+				   V T2B, T2I, T2M, T2P;
+				   T2B = VADD(T2x, T2A);
+				   T2I = VADD(T2E, T2H);
+				   T2J = VADD(T2B, T2I);
+				   T2R = VSUB(T2B, T2I);
+				   T2M = VADD(T2K, T2L);
+				   T2P = VADD(T2N, T2O);
+				   T2Q = VBYI(VADD(T2M, T2P));
+				   T2S = VBYI(VSUB(T2P, T2M));
+			      }
+			      T3d = VSUB(T2J, T2Q);
+			      STM2(&(xo[60]), T3d, ovs, &(xo[0]));
+			      T3e = VADD(T2R, T2S);
+			      STM2(&(xo[28]), T3e, ovs, &(xo[0]));
+			      T3f = VADD(T2J, T2Q);
+			      STM2(&(xo[4]), T3f, ovs, &(xo[0]));
+			      T3g = VSUB(T2R, T2S);
+			      STM2(&(xo[36]), T3g, ovs, &(xo[0]));
+			 }
+			 {
+			      V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
+			      T1r = VADD(T1p, T1q);
+			      T1C = VADD(T1A, T1B);
+			      T1M = VSUB(T1p, T1q);
+			      T1K = VSUB(T1B, T1A);
+			      {
+				   V T1D, T1E, T1u, T1x;
+				   T1D = VFNMS(LDK(KP555570233), T1s, VMUL(LDK(KP831469612), T1t));
+				   T1E = VFMA(LDK(KP555570233), T1v, VMUL(LDK(KP831469612), T1w));
+				   T1F = VADD(T1D, T1E);
+				   T1N = VSUB(T1E, T1D);
+				   T1u = VFMA(LDK(KP831469612), T1s, VMUL(LDK(KP555570233), T1t));
+				   T1x = VFNMS(LDK(KP555570233), T1w, VMUL(LDK(KP831469612), T1v));
+				   T1y = VADD(T1u, T1x);
+				   T1J = VSUB(T1x, T1u);
+			      }
+			      {
+				   V T1z, T1G, T3h, T3i;
+				   T1z = VADD(T1r, T1y);
+				   T1G = VBYI(VADD(T1C, T1F));
+				   T3h = VSUB(T1z, T1G);
+				   STM2(&(xo[58]), T3h, ovs, &(xo[2]));
+				   STN2(&(xo[56]), T31, T3h, ovs);
+				   T3i = VADD(T1z, T1G);
+				   STM2(&(xo[6]), T3i, ovs, &(xo[2]));
+				   STN2(&(xo[4]), T3f, T3i, ovs);
+			      }
+			      {
+				   V T1P, T1Q, T3j, T3k;
+				   T1P = VBYI(VADD(T1K, T1J));
+				   T1Q = VADD(T1M, T1N);
+				   T3j = VADD(T1P, T1Q);
+				   STM2(&(xo[10]), T3j, ovs, &(xo[2]));
+				   STN2(&(xo[8]), T33, T3j, ovs);
+				   T3k = VSUB(T1Q, T1P);
+				   STM2(&(xo[54]), T3k, ovs, &(xo[2]));
+				   STN2(&(xo[52]), T3a, T3k, ovs);
+			      }
+			      {
+				   V T1H, T1I, T3l, T3m;
+				   T1H = VSUB(T1r, T1y);
+				   T1I = VBYI(VSUB(T1F, T1C));
+				   T3l = VSUB(T1H, T1I);
+				   STM2(&(xo[38]), T3l, ovs, &(xo[2]));
+				   STN2(&(xo[36]), T3g, T3l, ovs);
+				   T3m = VADD(T1H, T1I);
+				   STM2(&(xo[26]), T3m, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T32, T3m, ovs);
+			      }
+			      {
+				   V T1L, T1O, T3n, T3o;
+				   T1L = VBYI(VSUB(T1J, T1K));
+				   T1O = VSUB(T1M, T1N);
+				   T3n = VADD(T1L, T1O);
+				   STM2(&(xo[22]), T3n, ovs, &(xo[2]));
+				   STN2(&(xo[20]), T39, T3n, ovs);
+				   T3o = VSUB(T1O, T1L);
+				   STM2(&(xo[42]), T3o, ovs, &(xo[2]));
+				   STN2(&(xo[40]), T34, T3o, ovs);
+			      }
+			 }
+			 {
+			      V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
+			      Tr = VADD(Tb, Tq);
+			      T1a = VADD(T16, T19);
+			      T1k = VSUB(Tb, Tq);
+			      T1i = VSUB(T19, T16);
+			      {
+				   V T1b, T1c, TI, TZ;
+				   T1b = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
+				   T1c = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
+				   T1d = VADD(T1b, T1c);
+				   T1l = VSUB(T1c, T1b);
+				   TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
+				   TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
+				   T10 = VADD(TI, TZ);
+				   T1h = VSUB(TZ, TI);
+			      }
+			      {
+				   V T11, T1e, T3p, T3q;
+				   T11 = VADD(Tr, T10);
+				   T1e = VBYI(VADD(T1a, T1d));
+				   T3p = VSUB(T11, T1e);
+				   STM2(&(xo[62]), T3p, ovs, &(xo[2]));
+				   STN2(&(xo[60]), T3d, T3p, ovs);
+				   T3q = VADD(T11, T1e);
+				   STM2(&(xo[2]), T3q, ovs, &(xo[2]));
+				   STN2(&(xo[0]), T37, T3q, ovs);
+			      }
+			      {
+				   V T1n, T1o, T3r, T3s;
+				   T1n = VBYI(VADD(T1i, T1h));
+				   T1o = VADD(T1k, T1l);
+				   T3r = VADD(T1n, T1o);
+				   STM2(&(xo[14]), T3r, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T3c, T3r, ovs);
+				   T3s = VSUB(T1o, T1n);
+				   STM2(&(xo[50]), T3s, ovs, &(xo[2]));
+				   STN2(&(xo[48]), T38, T3s, ovs);
+			      }
+			      {
+				   V T1f, T1g, T3t, T3u;
+				   T1f = VSUB(Tr, T10);
+				   T1g = VBYI(VSUB(T1d, T1a));
+				   T3t = VSUB(T1f, T1g);
+				   STM2(&(xo[34]), T3t, ovs, &(xo[2]));
+				   STN2(&(xo[32]), T35, T3t, ovs);
+				   T3u = VADD(T1f, T1g);
+				   STM2(&(xo[30]), T3u, ovs, &(xo[2]));
+				   STN2(&(xo[28]), T3e, T3u, ovs);
+			      }
+			      {
+				   V T1j, T1m, T3v, T3w;
+				   T1j = VBYI(VSUB(T1h, T1i));
+				   T1m = VSUB(T1k, T1l);
+				   T3v = VADD(T1j, T1m);
+				   STM2(&(xo[18]), T3v, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T36, T3v, ovs);
+				   T3w = VSUB(T1m, T1j);
+				   STM2(&(xo[46]), T3w, ovs, &(xo[2]));
+				   STN2(&(xo[44]), T3b, T3w, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2fv_32"), {170, 26, 16, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_32) (planner *p) {
+     X(kdft_register) (p, n2fv_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:21 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n2fv_4 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 8 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 2 fused multiply/add),
+ * 15 stack variables, 0 constants, and 10 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T1, T2, T4, T5;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, T7, T6, T8;
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    T8 = VADD(T4, T5);
+		    {
+			 V T9, Ta, Tb, Tc;
+			 T9 = VSUB(T7, T8);
+			 STM2(&(xo[4]), T9, ovs, &(xo[0]));
+			 Ta = VADD(T7, T8);
+			 STM2(&(xo[0]), Ta, ovs, &(xo[0]));
+			 Tb = VFMAI(T6, T3);
+			 STM2(&(xo[6]), Tb, ovs, &(xo[2]));
+			 STN2(&(xo[4]), T9, Tb, ovs);
+			 Tc = VFNMSI(T6, T3);
+			 STM2(&(xo[2]), Tc, ovs, &(xo[2]));
+			 STN2(&(xo[0]), Ta, Tc, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2fv_4"), {6, 0, 2, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_4) (planner *p) {
+     X(kdft_register) (p, n2fv_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n2fv_4 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 8 FP additions, 0 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 0 fused multiply/add),
+ * 11 stack variables, 0 constants, and 10 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
+	       V T3, T7, T6, T8;
+	       {
+		    V T1, T2, T4, T5;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    T7 = VADD(T1, T2);
+		    T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    T8 = VADD(T4, T5);
+	       }
+	       {
+		    V T9, Ta, Tb, Tc;
+		    T9 = VSUB(T3, T6);
+		    STM2(&(xo[2]), T9, ovs, &(xo[2]));
+		    Ta = VADD(T7, T8);
+		    STM2(&(xo[0]), Ta, ovs, &(xo[0]));
+		    STN2(&(xo[0]), Ta, T9, ovs);
+		    Tb = VADD(T3, T6);
+		    STM2(&(xo[6]), Tb, ovs, &(xo[2]));
+		    Tc = VSUB(T7, T8);
+		    STM2(&(xo[4]), Tc, ovs, &(xo[0]));
+		    STN2(&(xo[4]), Tc, Tb, ovs);
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2fv_4"), {8, 0, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_4) (planner *p) {
+     X(kdft_register) (p, n2fv_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:21 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name n2fv_6 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 18 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 6 fused multiply/add),
+ * 29 stack variables, 2 constants, and 15 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T1, T2, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Td, T6, Te, T9, Tf;
+		    T3 = VSUB(T1, T2);
+		    Td = VADD(T1, T2);
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+		    {
+			 V Tg, Ti, Ta, Tc;
+			 Tg = VADD(Te, Tf);
+			 Ti = VMUL(LDK(KP866025403), VSUB(Tf, Te));
+			 Ta = VADD(T6, T9);
+			 Tc = VMUL(LDK(KP866025403), VSUB(T9, T6));
+			 {
+			      V Th, Tj, Tb, Tk;
+			      Th = VFNMS(LDK(KP500000000), Tg, Td);
+			      Tj = VADD(Td, Tg);
+			      STM2(&(xo[0]), Tj, ovs, &(xo[0]));
+			      Tb = VFNMS(LDK(KP500000000), Ta, T3);
+			      Tk = VADD(T3, Ta);
+			      STM2(&(xo[6]), Tk, ovs, &(xo[2]));
+			      {
+				   V Tl, Tm, Tn, To;
+				   Tl = VFMAI(Ti, Th);
+				   STM2(&(xo[8]), Tl, ovs, &(xo[0]));
+				   Tm = VFNMSI(Ti, Th);
+				   STM2(&(xo[4]), Tm, ovs, &(xo[0]));
+				   STN2(&(xo[4]), Tm, Tk, ovs);
+				   Tn = VFMAI(Tc, Tb);
+				   STM2(&(xo[2]), Tn, ovs, &(xo[2]));
+				   STN2(&(xo[0]), Tj, Tn, ovs);
+				   To = VFNMSI(Tc, Tb);
+				   STM2(&(xo[10]), To, ovs, &(xo[2]));
+				   STN2(&(xo[8]), Tl, To, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n2fv_6"), {12, 2, 6, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_6) (planner *p) {
+     X(kdft_register) (p, n2fv_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name n2fv_6 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 18 FP additions, 4 FP multiplications,
+ * (or, 16 additions, 2 multiplications, 2 fused multiply/add),
+ * 25 stack variables, 2 constants, and 15 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
+	       V T3, Td, T6, Te, T9, Tf, Ta, Tg, T1, T2, Tj, Tk;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       T3 = VSUB(T1, T2);
+	       Td = VADD(T1, T2);
+	       {
+		    V T4, T5, T7, T8;
+		    T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+		    T6 = VSUB(T4, T5);
+		    Te = VADD(T4, T5);
+		    T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+	       }
+	       Ta = VADD(T6, T9);
+	       Tg = VADD(Te, Tf);
+	       Tj = VADD(T3, Ta);
+	       STM2(&(xo[6]), Tj, ovs, &(xo[2]));
+	       Tk = VADD(Td, Tg);
+	       STM2(&(xo[0]), Tk, ovs, &(xo[0]));
+	       {
+		    V Tl, Tb, Tc, Tm;
+		    Tb = VFNMS(LDK(KP500000000), Ta, T3);
+		    Tc = VBYI(VMUL(LDK(KP866025403), VSUB(T9, T6)));
+		    Tl = VSUB(Tb, Tc);
+		    STM2(&(xo[10]), Tl, ovs, &(xo[2]));
+		    Tm = VADD(Tb, Tc);
+		    STM2(&(xo[2]), Tm, ovs, &(xo[2]));
+		    STN2(&(xo[0]), Tk, Tm, ovs);
+		    {
+			 V Th, Ti, Tn, To;
+			 Th = VFNMS(LDK(KP500000000), Tg, Td);
+			 Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Tf, Te)));
+			 Tn = VSUB(Th, Ti);
+			 STM2(&(xo[4]), Tn, ovs, &(xo[0]));
+			 STN2(&(xo[4]), Tn, Tj, ovs);
+			 To = VADD(Th, Ti);
+			 STM2(&(xo[8]), To, ovs, &(xo[0]));
+			 STN2(&(xo[8]), To, Tl, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 6, XSIMD_STRING("n2fv_6"), {16, 2, 2, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_6) (planner *p) {
+     X(kdft_register) (p, n2fv_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1815 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:25 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 456 FP additions, 258 FP multiplications,
+ * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
+ * 178 stack variables, 15 constants, and 160 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T7r, T7s, T7t, T7u, T5T, T5S, T5X, T65, T8a, T8b, T8e, T8g, T5Z, T5R, T67;
+	       V T63, T5U, T64;
+	       {
+		    V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
+		    V Tm, T3A, T3l, T2a, TC, T5p, T4o, T6E, T6e, T3i, T3B, TR, T29, T4x, T5q;
+		    V T6h, T6D, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
+		    V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
+		    V T6m, T6Y, T5L, T4T;
+		    {
+			 V T4g, T4l, T3j, Tu, Tx, T4h, TA, T4i;
+			 {
+			      V T1, T2, T23, T24, T4, T5, T20, T21;
+			      T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			      T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			      T23 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			      T24 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			      T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			      T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			      T20 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			      T21 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			      {
+				   V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
+				   {
+					V T8, T43, T3, T44, T25, T5i, T6, T45, T22, T9, Ti, Tj, Tb, Tc;
+					T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+					T43 = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T44 = VSUB(T23, T24);
+					T25 = VADD(T23, T24);
+					T5i = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T45 = VSUB(T20, T21);
+					T22 = VADD(T20, T21);
+					T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+					Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+					Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+					Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+					Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+					{
+					     V T2T, T46, T5j, T2U;
+					     T7 = VSUB(T3, T6);
+					     T2T = VADD(T3, T6);
+					     T46 = VADD(T44, T45);
+					     T5j = VSUB(T45, T44);
+					     T26 = VSUB(T22, T25);
+					     T2U = VADD(T25, T22);
+					     Ta = VADD(T8, T9);
+					     T48 = VSUB(T8, T9);
+					     Tk = VADD(Ti, Tj);
+					     T4c = VSUB(Tj, Ti);
+					     T5k = VFNMS(LDK(KP707106781), T5j, T5i);
+					     T6A = VFMA(LDK(KP707106781), T5j, T5i);
+					     T47 = VFMA(LDK(KP707106781), T46, T43);
+					     T69 = VFNMS(LDK(KP707106781), T46, T43);
+					     T2V = VADD(T2T, T2U);
+					     T3z = VSUB(T2T, T2U);
+					     T49 = VSUB(Tb, Tc);
+					     Td = VADD(Tb, Tc);
+					}
+					Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+					Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+				   }
+				   {
+					V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
+					V Tp;
+					To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+					Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+					{
+					     V Th, T4b, Tr, Ts;
+					     Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+					     Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+					     Te = VSUB(Ta, Td);
+					     T2W = VADD(Ta, Td);
+					     T5l = VFMA(LDK(KP414213562), T48, T49);
+					     T4a = VFNMS(LDK(KP414213562), T49, T48);
+					     Th = VADD(Tf, Tg);
+					     T4b = VSUB(Tf, Tg);
+					     Tq = VADD(To, Tp);
+					     T4g = VSUB(To, Tp);
+					     T4l = VSUB(Tr, Ts);
+					     Tt = VADD(Tr, Ts);
+					     Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+					     Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+					     T5m = VFMA(LDK(KP414213562), T4b, T4c);
+					     T4d = VFNMS(LDK(KP414213562), T4c, T4b);
+					     Tl = VSUB(Th, Tk);
+					     T2X = VADD(Th, Tk);
+					     Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+					     Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+					}
+					T3j = VADD(Tq, Tt);
+					Tu = VSUB(Tq, Tt);
+					Tx = VADD(Tv, Tw);
+					T4h = VSUB(Tv, Tw);
+					T6B = VSUB(T4d, T4a);
+					T4e = VADD(T4a, T4d);
+					T6a = VADD(T5l, T5m);
+					T5n = VSUB(T5l, T5m);
+					T3M = VSUB(T2X, T2W);
+					T2Y = VADD(T2W, T2X);
+					T27 = VSUB(Tl, Te);
+					Tm = VADD(Te, Tl);
+					TA = VADD(Ty, Tz);
+					T4i = VSUB(Ty, Tz);
+				   }
+			      }
+			 }
+			 {
+			      V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3g, TJ, TF, TI;
+			      {
+				   V TD, TE, TG, TH;
+				   TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+				   TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+				   TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+				   TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+				   TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+				   {
+					V T3k, TB, T4j, T4m;
+					T3k = VADD(Tx, TA);
+					TB = VSUB(Tx, TA);
+					T4j = VADD(T4h, T4i);
+					T4m = VSUB(T4h, T4i);
+					T4p = VSUB(TD, TE);
+					TF = VADD(TD, TE);
+					T4u = VSUB(TH, TG);
+					TI = VADD(TG, TH);
+					T3A = VSUB(T3j, T3k);
+					T3l = VADD(T3j, T3k);
+					T2a = VFMA(LDK(KP414213562), Tu, TB);
+					TC = VFNMS(LDK(KP414213562), TB, Tu);
+					T4k = VFMA(LDK(KP707106781), T4j, T4g);
+					T6d = VFNMS(LDK(KP707106781), T4j, T4g);
+					T4n = VFMA(LDK(KP707106781), T4m, T4l);
+					T6c = VFNMS(LDK(KP707106781), T4m, T4l);
+					TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+				   }
+				   TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+				   TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      }
+			      T3g = VADD(TF, TI);
+			      TJ = VSUB(TF, TI);
+			      {
+				   V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
+				   {
+					V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
+					{
+					     V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
+					     T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+					     T5p = VFMA(LDK(KP198912367), T4k, T4n);
+					     T4o = VFNMS(LDK(KP198912367), T4n, T4k);
+					     T6E = VFMA(LDK(KP668178637), T6c, T6d);
+					     T6e = VFNMS(LDK(KP668178637), T6d, T6c);
+					     TM = VADD(TK, TL);
+					     T4r = VSUB(TK, TL);
+					     TP = VADD(TN, TO);
+					     T4q = VSUB(TN, TO);
+					     T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+					     T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+					     T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
+						  T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+						  T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+						  T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T3h, TQ, T4s, T4v;
+						       T3h = VADD(TP, TM);
+						       TQ = VSUB(TM, TP);
+						       T4s = VADD(T4q, T4r);
+						       T4v = VSUB(T4r, T4q);
+						       T4V = VSUB(T1r, T1s);
+						       T1t = VADD(T1r, T1s);
+						       T58 = VSUB(T1v, T1u);
+						       T1w = VADD(T1u, T1v);
+						       T4X = VSUB(T1O, T1P);
+						       T1Q = VADD(T1O, T1P);
+						       T3i = VADD(T3g, T3h);
+						       T3B = VSUB(T3g, T3h);
+						       TR = VFNMS(LDK(KP414213562), TQ, TJ);
+						       T29 = VFMA(LDK(KP414213562), TJ, TQ);
+						       T6g = VFNMS(LDK(KP707106781), T4s, T4p);
+						       T4t = VFMA(LDK(KP707106781), T4s, T4p);
+						       T6f = VFNMS(LDK(KP707106781), T4v, T4u);
+						       T4w = VFMA(LDK(KP707106781), T4v, T4u);
+						       T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+						  }
+						  {
+						       V T4W, T1A, T50, T51, T1D, T1F, T1G;
+						       {
+							    V T1y, T1z, T1B, T1C;
+							    T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+							    T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+							    T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+							    T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+							    T4x = VFNMS(LDK(KP198912367), T4w, T4t);
+							    T5q = VFMA(LDK(KP198912367), T4t, T4w);
+							    T6h = VFNMS(LDK(KP668178637), T6g, T6f);
+							    T6D = VFMA(LDK(KP668178637), T6f, T6g);
+							    T4W = VSUB(T1R, T1S);
+							    T1T = VADD(T1R, T1S);
+							    T1A = VADD(T1y, T1z);
+							    T50 = VSUB(T1y, T1z);
+							    T51 = VSUB(T1C, T1B);
+							    T1D = VADD(T1B, T1C);
+						       }
+						       T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+						       T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+						       T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+						       T4Y = VADD(T4W, T4X);
+						       T59 = VSUB(T4X, T4W);
+						       T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+						       T3a = VADD(T1A, T1D);
+						       T1E = VSUB(T1A, T1D);
+						       T52 = VFMA(LDK(KP414213562), T51, T50);
+						       T5b = VFNMS(LDK(KP414213562), T50, T51);
+						       T53 = VSUB(T1F, T1G);
+						       T1H = VADD(T1F, T1G);
+						  }
+					     }
+					}
+					{
+					     V T37, T54, T1K, T38;
+					     T1x = VSUB(T1t, T1w);
+					     T37 = VADD(T1t, T1w);
+					     T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
+					     T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
+					     T54 = VSUB(T1J, T1I);
+					     T1K = VADD(T1I, T1J);
+					     T6u = VFNMS(LDK(KP707106781), T59, T58);
+					     T5a = VFMA(LDK(KP707106781), T59, T58);
+					     T38 = VADD(T1T, T1Q);
+					     T1U = VSUB(T1Q, T1T);
+					     T55 = VFNMS(LDK(KP414213562), T54, T53);
+					     T5c = VFMA(LDK(KP414213562), T53, T54);
+					     T1L = VSUB(T1H, T1K);
+					     T3b = VADD(T1H, T1K);
+					     T39 = VADD(T37, T38);
+					     T3H = VSUB(T37, T38);
+					}
+				   }
+				   {
+					V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
+					V T1d;
+					{
+					     V TU, TV, TX, TY, T56, T6v;
+					     TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+					     T56 = VADD(T52, T55);
+					     T6v = VSUB(T55, T52);
+					     {
+						  V T5d, T6s, T1V, T1M;
+						  T5d = VADD(T5b, T5c);
+						  T6s = VSUB(T5c, T5b);
+						  T1V = VSUB(T1L, T1E);
+						  T1M = VADD(T1E, T1L);
+						  T3I = VSUB(T3b, T3a);
+						  T3c = VADD(T3a, T3b);
+						  T5N = VFNMS(LDK(KP923879532), T56, T4Z);
+						  T57 = VFMA(LDK(KP923879532), T56, T4Z);
+						  T72 = VFNMS(LDK(KP923879532), T6v, T6u);
+						  T6w = VFMA(LDK(KP923879532), T6v, T6u);
+						  T5O = VFNMS(LDK(KP923879532), T5d, T5a);
+						  T5e = VFMA(LDK(KP923879532), T5d, T5a);
+						  T71 = VFMA(LDK(KP923879532), T6s, T6r);
+						  T6t = VFNMS(LDK(KP923879532), T6s, T6r);
+						  T2y = VFNMS(LDK(KP707106781), T1V, T1U);
+						  T1W = VFMA(LDK(KP707106781), T1V, T1U);
+						  T2x = VFNMS(LDK(KP707106781), T1M, T1x);
+						  T1N = VFMA(LDK(KP707106781), T1M, T1x);
+						  TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+					     }
+					     TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+					     TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+					     {
+						  V T1h, T1i, T1k, T1l;
+						  T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+						  T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+						  T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+						  T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+						  {
+						       V T11, T4B, T4C, T12, T14, T15;
+						       T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+						       T4A = VSUB(TU, TV);
+						       TW = VADD(TU, TV);
+						       T4N = VSUB(TX, TY);
+						       TZ = VADD(TX, TY);
+						       T1j = VADD(T1h, T1i);
+						       T4B = VSUB(T1h, T1i);
+						       T1m = VADD(T1k, T1l);
+						       T4C = VSUB(T1k, T1l);
+						       T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+						       T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+						       T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+						       {
+							    V T18, T19, T1b, T1c;
+							    T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+							    T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+							    T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+							    T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+							    T4O = VSUB(T4B, T4C);
+							    T4D = VADD(T4B, T4C);
+							    T13 = VADD(T11, T12);
+							    T4F = VSUB(T11, T12);
+							    T16 = VADD(T14, T15);
+							    T4G = VSUB(T14, T15);
+							    T1a = VADD(T18, T19);
+							    T4I = VSUB(T18, T19);
+							    T4J = VSUB(T1b, T1c);
+							    T1d = VADD(T1b, T1c);
+						       }
+						  }
+					     }
+					}
+					{
+					     V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
+					     T30 = VADD(TW, TZ);
+					     T10 = VSUB(TW, TZ);
+					     T6k = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T33 = VADD(T13, T16);
+					     T17 = VSUB(T13, T16);
+					     T6n = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T34 = VADD(T1a, T1d);
+					     T1e = VSUB(T1a, T1d);
+					     T4K = VFMA(LDK(KP414213562), T4J, T4I);
+					     T4R = VFNMS(LDK(KP414213562), T4I, T4J);
+					     T1n = VSUB(T1j, T1m);
+					     T31 = VADD(T1j, T1m);
+					     {
+						  V T1f, T1o, T6o, T4L, T4S, T6l;
+						  T1f = VADD(T17, T1e);
+						  T1o = VSUB(T17, T1e);
+						  T6o = VSUB(T4H, T4K);
+						  T4L = VADD(T4H, T4K);
+						  T4S = VADD(T4Q, T4R);
+						  T6l = VSUB(T4Q, T4R);
+						  T3E = VSUB(T30, T31);
+						  T32 = VADD(T30, T31);
+						  T1p = VFMA(LDK(KP707106781), T1o, T1n);
+						  T2v = VFNMS(LDK(KP707106781), T1o, T1n);
+						  T1g = VFMA(LDK(KP707106781), T1f, T10);
+						  T2u = VFNMS(LDK(KP707106781), T1f, T10);
+						  T4M = VFMA(LDK(KP923879532), T4L, T4E);
+						  T5K = VFNMS(LDK(KP923879532), T4L, T4E);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6n);
+						  T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
+						  T6m = VFNMS(LDK(KP923879532), T6l, T6k);
+						  T6Y = VFMA(LDK(KP923879532), T6l, T6k);
+						  T5L = VFNMS(LDK(KP923879532), T4S, T4P);
+						  T4T = VFMA(LDK(KP923879532), T4S, T4P);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T6b, T6F, T7n, T7o, T7p, T7q, T7v, T7w, T7x, T7y, T7z, T7A, T7B, T7C, T7f;
+			 V T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
+			 {
+			      V T2Z, T3r, T3s, T3m, T3d, T3v;
+			      T2Z = VSUB(T2V, T2Y);
+			      T3r = VADD(T2V, T2Y);
+			      T3s = VADD(T3l, T3i);
+			      T3m = VSUB(T3i, T3l);
+			      T3d = VSUB(T39, T3c);
+			      T3v = VADD(T39, T3c);
+			      {
+				   V T3x, T3t, T3P, T3J, T3D, T3V, T3Q, T3G, T36, T3u, T3Y, T3O, T6V, T6W;
+				   {
+					V T3N, T3C, T3F, T35;
+					T3N = VSUB(T3B, T3A);
+					T3C = VADD(T3A, T3B);
+					T3F = VSUB(T33, T34);
+					T35 = VADD(T33, T34);
+					T3x = VSUB(T3r, T3s);
+					T3t = VADD(T3r, T3s);
+					T3P = VFMA(LDK(KP414213562), T3H, T3I);
+					T3J = VFNMS(LDK(KP414213562), T3I, T3H);
+					T3D = VFMA(LDK(KP707106781), T3C, T3z);
+					T3V = VFNMS(LDK(KP707106781), T3C, T3z);
+					T3Q = VFMA(LDK(KP414213562), T3E, T3F);
+					T3G = VFNMS(LDK(KP414213562), T3F, T3E);
+					T36 = VSUB(T32, T35);
+					T3u = VADD(T32, T35);
+					T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
+					T3O = VFMA(LDK(KP707106781), T3N, T3M);
+				   }
+				   T6b = VFNMS(LDK(KP923879532), T6a, T69);
+				   T6V = VFMA(LDK(KP923879532), T6a, T69);
+				   T6W = VADD(T6E, T6D);
+				   T6F = VSUB(T6D, T6E);
+				   {
+					V T3K, T3Z, T3e, T3n;
+					T3K = VADD(T3G, T3J);
+					T3Z = VSUB(T3J, T3G);
+					T3e = VADD(T36, T3d);
+					T3n = VSUB(T3d, T36);
+					{
+					     V T3w, T3y, T3R, T3W;
+					     T3w = VADD(T3u, T3v);
+					     T3y = VSUB(T3v, T3u);
+					     T3R = VSUB(T3P, T3Q);
+					     T3W = VADD(T3Q, T3P);
+					     {
+						  V T42, T40, T3L, T3T;
+						  T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
+						  T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
+						  T3L = VFNMS(LDK(KP923879532), T3K, T3D);
+						  T3T = VFMA(LDK(KP923879532), T3K, T3D);
+						  {
+						       V T3o, T3q, T3f, T3p;
+						       T3o = VFNMS(LDK(KP707106781), T3n, T3m);
+						       T3q = VFMA(LDK(KP707106781), T3n, T3m);
+						       T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
+						       T3p = VFMA(LDK(KP707106781), T3e, T2Z);
+						       T7n = VFNMSI(T3y, T3x);
+						       STM2(&(xo[96]), T7n, ovs, &(xo[0]));
+						       T7o = VFMAI(T3y, T3x);
+						       STM2(&(xo[32]), T7o, ovs, &(xo[0]));
+						       T7p = VADD(T3t, T3w);
+						       STM2(&(xo[0]), T7p, ovs, &(xo[0]));
+						       T7q = VSUB(T3t, T3w);
+						       STM2(&(xo[64]), T7q, ovs, &(xo[0]));
+						       {
+							    V T41, T3X, T3S, T3U;
+							    T41 = VFMA(LDK(KP923879532), T3W, T3V);
+							    T3X = VFNMS(LDK(KP923879532), T3W, T3V);
+							    T3S = VFNMS(LDK(KP923879532), T3R, T3O);
+							    T3U = VFMA(LDK(KP923879532), T3R, T3O);
+							    T7r = VFMAI(T3q, T3p);
+							    STM2(&(xo[16]), T7r, ovs, &(xo[0]));
+							    T7s = VFNMSI(T3q, T3p);
+							    STM2(&(xo[112]), T7s, ovs, &(xo[0]));
+							    T7t = VFMAI(T3o, T3f);
+							    STM2(&(xo[80]), T7t, ovs, &(xo[0]));
+							    T7u = VFNMSI(T3o, T3f);
+							    STM2(&(xo[48]), T7u, ovs, &(xo[0]));
+							    T7v = VFNMSI(T40, T3X);
+							    STM2(&(xo[88]), T7v, ovs, &(xo[0]));
+							    T7w = VFMAI(T40, T3X);
+							    STM2(&(xo[40]), T7w, ovs, &(xo[0]));
+							    T7x = VFMAI(T42, T41);
+							    STM2(&(xo[104]), T7x, ovs, &(xo[0]));
+							    T7y = VFNMSI(T42, T41);
+							    STM2(&(xo[24]), T7y, ovs, &(xo[0]));
+							    T7z = VFMAI(T3U, T3T);
+							    STM2(&(xo[8]), T7z, ovs, &(xo[0]));
+							    T7A = VFNMSI(T3U, T3T);
+							    STM2(&(xo[120]), T7A, ovs, &(xo[0]));
+							    T7B = VFMAI(T3S, T3L);
+							    STM2(&(xo[72]), T7B, ovs, &(xo[0]));
+							    T7C = VFNMSI(T3S, T3L);
+							    STM2(&(xo[56]), T7C, ovs, &(xo[0]));
+							    T7f = VFNMS(LDK(KP831469612), T6W, T6V);
+							    T6X = VFMA(LDK(KP831469612), T6W, T6V);
+						       }
+						  }
+					     }
+					}
+				   }
+				   T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
+				   T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
+				   T7a = VFNMS(LDK(KP303346683), T71, T72);
+				   T73 = VFMA(LDK(KP303346683), T72, T71);
+				   T6C = VFNMS(LDK(KP923879532), T6B, T6A);
+				   T76 = VFMA(LDK(KP923879532), T6B, T6A);
+				   T77 = VSUB(T6e, T6h);
+				   T6i = VADD(T6e, T6h);
+			      }
+			 }
+			 {
+			      V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T7L, T7O, T7Q, T7S, T5r, T5I, T5x;
+			      V T5h, T5F, T5B;
+			      {
+				   V TT, T2f, T7E, T7F, T7H, T7J, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
+				   {
+					V T1X, T2d, T7h, T7l, T2e, T1q, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
+					T2r = VFNMS(LDK(KP707106781), Tm, T7);
+					Tn = VFMA(LDK(KP707106781), Tm, T7);
+					TS = VADD(TC, TR);
+					T2D = VSUB(TR, TC);
+					{
+					     V T7b, T7j, T74, T7i, T78, T7g;
+					     T1X = VFNMS(LDK(KP198912367), T1W, T1N);
+					     T2d = VFMA(LDK(KP198912367), T1N, T1W);
+					     T7g = VADD(T79, T7a);
+					     T7b = VSUB(T79, T7a);
+					     T7j = VSUB(T73, T70);
+					     T74 = VADD(T70, T73);
+					     T7i = VFNMS(LDK(KP831469612), T77, T76);
+					     T78 = VFMA(LDK(KP831469612), T77, T76);
+					     T2j = VFNMS(LDK(KP923879532), TS, Tn);
+					     TT = VFMA(LDK(KP923879532), TS, Tn);
+					     T7h = VFMA(LDK(KP956940335), T7g, T7f);
+					     T7l = VFNMS(LDK(KP956940335), T7g, T7f);
+					     T2e = VFMA(LDK(KP198912367), T1g, T1p);
+					     T1q = VFNMS(LDK(KP198912367), T1p, T1g);
+					     T75 = VFNMS(LDK(KP956940335), T74, T6X);
+					     T7d = VFMA(LDK(KP956940335), T74, T6X);
+					     T7m = VFNMS(LDK(KP956940335), T7j, T7i);
+					     T7k = VFMA(LDK(KP956940335), T7j, T7i);
+					     T7c = VFNMS(LDK(KP956940335), T7b, T78);
+					     T7e = VFMA(LDK(KP956940335), T7b, T78);
+					}
+					T2k = VADD(T2e, T2d);
+					T2f = VSUB(T2d, T2e);
+					{
+					     V T7D, T7G, T7I, T7K;
+					     T7D = VFNMSI(T7k, T7h);
+					     STM2(&(xo[90]), T7D, ovs, &(xo[2]));
+					     STN2(&(xo[88]), T7v, T7D, ovs);
+					     T7E = VFMAI(T7k, T7h);
+					     STM2(&(xo[38]), T7E, ovs, &(xo[2]));
+					     T7F = VFMAI(T7m, T7l);
+					     STM2(&(xo[102]), T7F, ovs, &(xo[2]));
+					     T7G = VFNMSI(T7m, T7l);
+					     STM2(&(xo[26]), T7G, ovs, &(xo[2]));
+					     STN2(&(xo[24]), T7y, T7G, ovs);
+					     T7H = VFMAI(T7e, T7d);
+					     STM2(&(xo[6]), T7H, ovs, &(xo[2]));
+					     T7I = VFNMSI(T7e, T7d);
+					     STM2(&(xo[122]), T7I, ovs, &(xo[2]));
+					     STN2(&(xo[120]), T7A, T7I, ovs);
+					     T7J = VFMAI(T7c, T75);
+					     STM2(&(xo[70]), T7J, ovs, &(xo[2]));
+					     T7K = VFNMSI(T7c, T75);
+					     STM2(&(xo[58]), T7K, ovs, &(xo[2]));
+					     STN2(&(xo[56]), T7C, T7K, ovs);
+					     T2n = VSUB(T1X, T1q);
+					     T1Y = VADD(T1q, T1X);
+					}
+					T2C = VFNMS(LDK(KP707106781), T27, T26);
+					T28 = VFMA(LDK(KP707106781), T27, T26);
+					T2b = VSUB(T29, T2a);
+					T2s = VADD(T2a, T29);
+				   }
+				   T2l = VFNMS(LDK(KP980785280), T2k, T2j);
+				   T2p = VFMA(LDK(KP980785280), T2k, T2j);
+				   {
+					V T5z, T4z, T5A, T5g;
+					{
+					     V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
+					     T5H = VFNMS(LDK(KP923879532), T4e, T47);
+					     T4f = VFMA(LDK(KP923879532), T4e, T47);
+					     T4y = VADD(T4o, T4x);
+					     T5T = VSUB(T4x, T4o);
+					     T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
+					     T2h = VFMA(LDK(KP980785280), T1Y, TT);
+					     T4U = VFNMS(LDK(KP098491403), T4T, T4M);
+					     T5t = VFMA(LDK(KP098491403), T4M, T4T);
+					     T2m = VFNMS(LDK(KP923879532), T2b, T28);
+					     T2c = VFMA(LDK(KP923879532), T2b, T28);
+					     T5u = VFMA(LDK(KP098491403), T57, T5e);
+					     T5f = VFNMS(LDK(KP098491403), T5e, T57);
+					     T5z = VFNMS(LDK(KP980785280), T4y, T4f);
+					     T4z = VFMA(LDK(KP980785280), T4y, T4f);
+					     T5S = VFNMS(LDK(KP923879532), T5n, T5k);
+					     T5o = VFMA(LDK(KP923879532), T5n, T5k);
+					     {
+						  V T2o, T2q, T2i, T2g;
+						  T2o = VFMA(LDK(KP980785280), T2n, T2m);
+						  T2q = VFNMS(LDK(KP980785280), T2n, T2m);
+						  T2i = VFMA(LDK(KP980785280), T2f, T2c);
+						  T2g = VFNMS(LDK(KP980785280), T2f, T2c);
+						  T5A = VADD(T5t, T5u);
+						  T5v = VSUB(T5t, T5u);
+						  T5D = VSUB(T5f, T4U);
+						  T5g = VADD(T4U, T5f);
+						  T7L = VFNMSI(T2o, T2l);
+						  STM2(&(xo[92]), T7L, ovs, &(xo[0]));
+						  {
+						       V T7M, T7N, T7P, T7R;
+						       T7M = VFMAI(T2o, T2l);
+						       STM2(&(xo[36]), T7M, ovs, &(xo[0]));
+						       STN2(&(xo[36]), T7M, T7E, ovs);
+						       T7N = VFMAI(T2q, T2p);
+						       STM2(&(xo[100]), T7N, ovs, &(xo[0]));
+						       STN2(&(xo[100]), T7N, T7F, ovs);
+						       T7O = VFNMSI(T2q, T2p);
+						       STM2(&(xo[28]), T7O, ovs, &(xo[0]));
+						       T7P = VFMAI(T2i, T2h);
+						       STM2(&(xo[4]), T7P, ovs, &(xo[0]));
+						       STN2(&(xo[4]), T7P, T7H, ovs);
+						       T7Q = VFNMSI(T2i, T2h);
+						       STM2(&(xo[124]), T7Q, ovs, &(xo[0]));
+						       T7R = VFMAI(T2g, T1Z);
+						       STM2(&(xo[68]), T7R, ovs, &(xo[0]));
+						       STN2(&(xo[68]), T7R, T7J, ovs);
+						       T7S = VFNMSI(T2g, T1Z);
+						       STM2(&(xo[60]), T7S, ovs, &(xo[0]));
+						       T5r = VSUB(T5p, T5q);
+						       T5I = VADD(T5p, T5q);
+						  }
+					     }
+					}
+					T5x = VFMA(LDK(KP995184726), T5g, T4z);
+					T5h = VFNMS(LDK(KP995184726), T5g, T4z);
+					T5F = VFMA(LDK(KP995184726), T5A, T5z);
+					T5B = VFNMS(LDK(KP995184726), T5A, T5z);
+				   }
+			      }
+			      {
+				   V T6J, T6R, T6L, T6z, T6T, T6P;
+				   {
+					V T6N, T6j, T6O, T6y;
+					{
+					     V T6q, T6H, T5C, T5s, T6I, T6x;
+					     T6q = VFNMS(LDK(KP534511135), T6p, T6m);
+					     T6H = VFMA(LDK(KP534511135), T6m, T6p);
+					     T5C = VFNMS(LDK(KP980785280), T5r, T5o);
+					     T5s = VFMA(LDK(KP980785280), T5r, T5o);
+					     T6I = VFMA(LDK(KP534511135), T6t, T6w);
+					     T6x = VFNMS(LDK(KP534511135), T6w, T6t);
+					     T6N = VFMA(LDK(KP831469612), T6i, T6b);
+					     T6j = VFNMS(LDK(KP831469612), T6i, T6b);
+					     {
+						  V T5E, T5G, T5y, T5w;
+						  T5E = VFNMS(LDK(KP995184726), T5D, T5C);
+						  T5G = VFMA(LDK(KP995184726), T5D, T5C);
+						  T5y = VFMA(LDK(KP995184726), T5v, T5s);
+						  T5w = VFNMS(LDK(KP995184726), T5v, T5s);
+						  T6O = VADD(T6H, T6I);
+						  T6J = VSUB(T6H, T6I);
+						  T6R = VSUB(T6x, T6q);
+						  T6y = VADD(T6q, T6x);
+						  {
+						       V T7T, T7U, T7V, T7W;
+						       T7T = VFMAI(T5E, T5B);
+						       STM2(&(xo[94]), T7T, ovs, &(xo[2]));
+						       STN2(&(xo[92]), T7L, T7T, ovs);
+						       T7U = VFNMSI(T5E, T5B);
+						       STM2(&(xo[34]), T7U, ovs, &(xo[2]));
+						       STN2(&(xo[32]), T7o, T7U, ovs);
+						       T7V = VFNMSI(T5G, T5F);
+						       STM2(&(xo[98]), T7V, ovs, &(xo[2]));
+						       STN2(&(xo[96]), T7n, T7V, ovs);
+						       T7W = VFMAI(T5G, T5F);
+						       STM2(&(xo[30]), T7W, ovs, &(xo[2]));
+						       STN2(&(xo[28]), T7O, T7W, ovs);
+						       {
+							    V T7X, T7Y, T7Z, T80;
+							    T7X = VFMAI(T5y, T5x);
+							    STM2(&(xo[126]), T7X, ovs, &(xo[2]));
+							    STN2(&(xo[124]), T7Q, T7X, ovs);
+							    T7Y = VFNMSI(T5y, T5x);
+							    STM2(&(xo[2]), T7Y, ovs, &(xo[2]));
+							    STN2(&(xo[0]), T7p, T7Y, ovs);
+							    T7Z = VFMAI(T5w, T5h);
+							    STM2(&(xo[62]), T7Z, ovs, &(xo[2]));
+							    STN2(&(xo[60]), T7S, T7Z, ovs);
+							    T80 = VFNMSI(T5w, T5h);
+							    STM2(&(xo[66]), T80, ovs, &(xo[2]));
+							    STN2(&(xo[64]), T7q, T80, ovs);
+						       }
+						  }
+					     }
+					}
+					T6L = VFMA(LDK(KP881921264), T6y, T6j);
+					T6z = VFNMS(LDK(KP881921264), T6y, T6j);
+					T6T = VFMA(LDK(KP881921264), T6O, T6N);
+					T6P = VFNMS(LDK(KP881921264), T6O, T6N);
+				   }
+				   {
+					V T2H, T2P, T81, T84, T85, T87, T2J, T2B, T2R, T2N;
+					{
+					     V T2L, T2t, T2M, T2A;
+					     {
+						  V T2z, T2F, T6Q, T6G, T2G, T2w;
+						  T2z = VFMA(LDK(KP668178637), T2y, T2x);
+						  T2F = VFNMS(LDK(KP668178637), T2x, T2y);
+						  T6Q = VFMA(LDK(KP831469612), T6F, T6C);
+						  T6G = VFNMS(LDK(KP831469612), T6F, T6C);
+						  T2G = VFNMS(LDK(KP668178637), T2u, T2v);
+						  T2w = VFMA(LDK(KP668178637), T2v, T2u);
+						  T2L = VFNMS(LDK(KP923879532), T2s, T2r);
+						  T2t = VFMA(LDK(KP923879532), T2s, T2r);
+						  {
+						       V T6S, T6U, T6M, T6K;
+						       T6S = VFNMS(LDK(KP881921264), T6R, T6Q);
+						       T6U = VFMA(LDK(KP881921264), T6R, T6Q);
+						       T6M = VFMA(LDK(KP881921264), T6J, T6G);
+						       T6K = VFNMS(LDK(KP881921264), T6J, T6G);
+						       T2M = VADD(T2G, T2F);
+						       T2H = VSUB(T2F, T2G);
+						       T2P = VSUB(T2z, T2w);
+						       T2A = VADD(T2w, T2z);
+						       T81 = VFMAI(T6S, T6P);
+						       STM2(&(xo[86]), T81, ovs, &(xo[2]));
+						       {
+							    V T82, T83, T86, T88;
+							    T82 = VFNMSI(T6S, T6P);
+							    STM2(&(xo[42]), T82, ovs, &(xo[2]));
+							    STN2(&(xo[40]), T7w, T82, ovs);
+							    T83 = VFNMSI(T6U, T6T);
+							    STM2(&(xo[106]), T83, ovs, &(xo[2]));
+							    STN2(&(xo[104]), T7x, T83, ovs);
+							    T84 = VFMAI(T6U, T6T);
+							    STM2(&(xo[22]), T84, ovs, &(xo[2]));
+							    T85 = VFMAI(T6M, T6L);
+							    STM2(&(xo[118]), T85, ovs, &(xo[2]));
+							    T86 = VFNMSI(T6M, T6L);
+							    STM2(&(xo[10]), T86, ovs, &(xo[2]));
+							    STN2(&(xo[8]), T7z, T86, ovs);
+							    T87 = VFMAI(T6K, T6z);
+							    STM2(&(xo[54]), T87, ovs, &(xo[2]));
+							    T88 = VFNMSI(T6K, T6z);
+							    STM2(&(xo[74]), T88, ovs, &(xo[2]));
+							    STN2(&(xo[72]), T7B, T88, ovs);
+						       }
+						  }
+					     }
+					     T2J = VFMA(LDK(KP831469612), T2A, T2t);
+					     T2B = VFNMS(LDK(KP831469612), T2A, T2t);
+					     T2R = VFNMS(LDK(KP831469612), T2M, T2L);
+					     T2N = VFMA(LDK(KP831469612), T2M, T2L);
+					}
+					{
+					     V T61, T5J, T62, T5Q;
+					     {
+						  V T5M, T5V, T2O, T2E, T5W, T5P;
+						  T5M = VFMA(LDK(KP820678790), T5L, T5K);
+						  T5V = VFNMS(LDK(KP820678790), T5K, T5L);
+						  T2O = VFMA(LDK(KP923879532), T2D, T2C);
+						  T2E = VFNMS(LDK(KP923879532), T2D, T2C);
+						  T5W = VFNMS(LDK(KP820678790), T5N, T5O);
+						  T5P = VFMA(LDK(KP820678790), T5O, T5N);
+						  T61 = VFNMS(LDK(KP980785280), T5I, T5H);
+						  T5J = VFMA(LDK(KP980785280), T5I, T5H);
+						  {
+						       V T2Q, T2S, T2K, T2I;
+						       T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
+						       T2S = VFMA(LDK(KP831469612), T2P, T2O);
+						       T2K = VFMA(LDK(KP831469612), T2H, T2E);
+						       T2I = VFNMS(LDK(KP831469612), T2H, T2E);
+						       T62 = VADD(T5V, T5W);
+						       T5X = VSUB(T5V, T5W);
+						       T65 = VSUB(T5P, T5M);
+						       T5Q = VADD(T5M, T5P);
+						       {
+							    V T89, T8c, T8d, T8f;
+							    T89 = VFMAI(T2Q, T2N);
+							    STM2(&(xo[84]), T89, ovs, &(xo[0]));
+							    STN2(&(xo[84]), T89, T81, ovs);
+							    T8a = VFNMSI(T2Q, T2N);
+							    STM2(&(xo[44]), T8a, ovs, &(xo[0]));
+							    T8b = VFNMSI(T2S, T2R);
+							    STM2(&(xo[108]), T8b, ovs, &(xo[0]));
+							    T8c = VFMAI(T2S, T2R);
+							    STM2(&(xo[20]), T8c, ovs, &(xo[0]));
+							    STN2(&(xo[20]), T8c, T84, ovs);
+							    T8d = VFMAI(T2K, T2J);
+							    STM2(&(xo[116]), T8d, ovs, &(xo[0]));
+							    STN2(&(xo[116]), T8d, T85, ovs);
+							    T8e = VFNMSI(T2K, T2J);
+							    STM2(&(xo[12]), T8e, ovs, &(xo[0]));
+							    T8f = VFMAI(T2I, T2B);
+							    STM2(&(xo[52]), T8f, ovs, &(xo[0]));
+							    STN2(&(xo[52]), T8f, T87, ovs);
+							    T8g = VFNMSI(T2I, T2B);
+							    STM2(&(xo[76]), T8g, ovs, &(xo[0]));
+						       }
+						  }
+					     }
+					     T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
+					     T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
+					     T67 = VFNMS(LDK(KP773010453), T62, T61);
+					     T63 = VFMA(LDK(KP773010453), T62, T61);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5U = VFMA(LDK(KP980785280), T5T, T5S);
+	       T64 = VFNMS(LDK(KP980785280), T5T, T5S);
+	       {
+		    V T68, T66, T5Y, T60;
+		    T68 = VFNMS(LDK(KP773010453), T65, T64);
+		    T66 = VFMA(LDK(KP773010453), T65, T64);
+		    T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
+		    T60 = VFMA(LDK(KP773010453), T5X, T5U);
+		    {
+			 V T8h, T8i, T8j, T8k;
+			 T8h = VFNMSI(T66, T63);
+			 STM2(&(xo[82]), T8h, ovs, &(xo[2]));
+			 STN2(&(xo[80]), T7t, T8h, ovs);
+			 T8i = VFMAI(T66, T63);
+			 STM2(&(xo[46]), T8i, ovs, &(xo[2]));
+			 STN2(&(xo[44]), T8a, T8i, ovs);
+			 T8j = VFMAI(T68, T67);
+			 STM2(&(xo[110]), T8j, ovs, &(xo[2]));
+			 STN2(&(xo[108]), T8b, T8j, ovs);
+			 T8k = VFNMSI(T68, T67);
+			 STM2(&(xo[18]), T8k, ovs, &(xo[2]));
+			 STN2(&(xo[16]), T7r, T8k, ovs);
+			 {
+			      V T8l, T8m, T8n, T8o;
+			      T8l = VFMAI(T60, T5Z);
+			      STM2(&(xo[14]), T8l, ovs, &(xo[2]));
+			      STN2(&(xo[12]), T8e, T8l, ovs);
+			      T8m = VFNMSI(T60, T5Z);
+			      STM2(&(xo[114]), T8m, ovs, &(xo[2]));
+			      STN2(&(xo[112]), T7s, T8m, ovs);
+			      T8n = VFMAI(T5Y, T5R);
+			      STM2(&(xo[78]), T8n, ovs, &(xo[2]));
+			      STN2(&(xo[76]), T8g, T8n, ovs);
+			      T8o = VFNMSI(T5Y, T5R);
+			      STM2(&(xo[50]), T8o, ovs, &(xo[2]));
+			      STN2(&(xo[48]), T7u, T8o, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_64) (planner *p) {
+     X(kdft_register) (p, n2fv_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 456 FP additions, 124 FP multiplications,
+ * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
+ * 128 stack variables, 15 constants, and 160 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T4p, T5q, Tb, T39, T2n, T3A, T6f, T6T, Tq, T3B, T6i, T76, T2i, T3a, T4w;
+	       V T5r, TI, T2p, T6C, T6V, T3h, T3E, T4L, T5u, TZ, T2q, T6F, T6U, T3e, T3D;
+	       V T4E, T5t, T23, T2N, T6t, T71, T6w, T72, T2c, T2O, T3t, T41, T5f, T5R, T5k;
+	       V T5S, T3w, T42, T1s, T2K, T6m, T6Y, T6p, T6Z, T1B, T2L, T3m, T3Y, T4Y, T5O;
+	       V T53, T5P, T3p, T3Z;
+	       {
+		    V T3, T4n, T2m, T4o, T6, T5p, T9, T5o;
+		    {
+			 V T1, T2, T2k, T2l;
+			 T1 = LD(&(xi[0]), ivs, &(xi[0]));
+			 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
+			 T3 = VSUB(T1, T2);
+			 T4n = VADD(T1, T2);
+			 T2k = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
+			 T2l = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
+			 T2m = VSUB(T2k, T2l);
+			 T4o = VADD(T2k, T2l);
+		    }
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
+			 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
+			 T6 = VSUB(T4, T5);
+			 T5p = VADD(T4, T5);
+			 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
+			 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
+			 T9 = VSUB(T7, T8);
+			 T5o = VADD(T7, T8);
+		    }
+		    T4p = VSUB(T4n, T4o);
+		    T5q = VSUB(T5o, T5p);
+		    {
+			 V Ta, T2j, T6d, T6e;
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tb = VADD(T3, Ta);
+			 T39 = VSUB(T3, Ta);
+			 T2j = VMUL(LDK(KP707106781), VSUB(T9, T6));
+			 T2n = VSUB(T2j, T2m);
+			 T3A = VADD(T2m, T2j);
+			 T6d = VADD(T4n, T4o);
+			 T6e = VADD(T5p, T5o);
+			 T6f = VADD(T6d, T6e);
+			 T6T = VSUB(T6d, T6e);
+		    }
+	       }
+	       {
+		    V Te, T4q, To, T4u, Th, T4r, Tl, T4t;
+		    {
+			 V Tc, Td, Tm, Tn;
+			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+			 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
+			 Te = VSUB(Tc, Td);
+			 T4q = VADD(Tc, Td);
+			 Tm = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
+			 Tn = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
+			 To = VSUB(Tm, Tn);
+			 T4u = VADD(Tm, Tn);
+		    }
+		    {
+			 V Tf, Tg, Tj, Tk;
+			 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
+			 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
+			 Th = VSUB(Tf, Tg);
+			 T4r = VADD(Tf, Tg);
+			 Tj = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
+			 Tk = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
+			 Tl = VSUB(Tj, Tk);
+			 T4t = VADD(Tj, Tk);
+		    }
+		    {
+			 V Ti, Tp, T6g, T6h;
+			 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
+			 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
+			 Tq = VADD(Ti, Tp);
+			 T3B = VSUB(Tp, Ti);
+			 T6g = VADD(T4q, T4r);
+			 T6h = VADD(T4t, T4u);
+			 T6i = VADD(T6g, T6h);
+			 T76 = VSUB(T6h, T6g);
+		    }
+		    {
+			 V T2g, T2h, T4s, T4v;
+			 T2g = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
+			 T2h = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
+			 T2i = VSUB(T2g, T2h);
+			 T3a = VADD(T2h, T2g);
+			 T4s = VSUB(T4q, T4r);
+			 T4v = VSUB(T4t, T4u);
+			 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
+			 T5r = VMUL(LDK(KP707106781), VSUB(T4v, T4s));
+		    }
+	       }
+	       {
+		    V Tu, T4F, TG, T4G, TB, T4J, TD, T4I;
+		    {
+			 V Ts, Tt, TE, TF;
+			 Ts = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
+			 Tt = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
+			 Tu = VSUB(Ts, Tt);
+			 T4F = VADD(Ts, Tt);
+			 TE = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
+			 TF = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
+			 TG = VSUB(TE, TF);
+			 T4G = VADD(TE, TF);
+			 {
+			      V Tv, Tw, Tx, Ty, Tz, TA;
+			      Tv = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+			      Tw = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
+			      Tx = VSUB(Tv, Tw);
+			      Ty = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
+			      Tz = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
+			      TA = VSUB(Ty, Tz);
+			      TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
+			      T4J = VADD(Tv, Tw);
+			      TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
+			      T4I = VADD(Ty, Tz);
+			 }
+		    }
+		    {
+			 V TC, TH, T6A, T6B;
+			 TC = VADD(Tu, TB);
+			 TH = VSUB(TD, TG);
+			 TI = VFMA(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
+			 T2p = VFNMS(LDK(KP195090322), TH, VMUL(LDK(KP980785280), TC));
+			 T6A = VADD(T4F, T4G);
+			 T6B = VADD(T4J, T4I);
+			 T6C = VADD(T6A, T6B);
+			 T6V = VSUB(T6A, T6B);
+		    }
+		    {
+			 V T3f, T3g, T4H, T4K;
+			 T3f = VSUB(Tu, TB);
+			 T3g = VADD(TG, TD);
+			 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
+			 T3E = VFMA(LDK(KP555570233), T3f, VMUL(LDK(KP831469612), T3g));
+			 T4H = VSUB(T4F, T4G);
+			 T4K = VSUB(T4I, T4J);
+			 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
+			 T5u = VFMA(LDK(KP382683432), T4H, VMUL(LDK(KP923879532), T4K));
+		    }
+	       }
+	       {
+		    V TS, T4z, TW, T4y, TP, T4C, TX, T4B;
+		    {
+			 V TQ, TR, TU, TV;
+			 TQ = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
+			 TR = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
+			 TS = VSUB(TQ, TR);
+			 T4z = VADD(TQ, TR);
+			 TU = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+			 TV = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
+			 TW = VSUB(TU, TV);
+			 T4y = VADD(TU, TV);
+			 {
+			      V TJ, TK, TL, TM, TN, TO;
+			      TJ = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
+			      TK = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
+			      TL = VSUB(TJ, TK);
+			      TM = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
+			      TN = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
+			      TO = VSUB(TM, TN);
+			      TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
+			      T4C = VADD(TM, TN);
+			      TX = VMUL(LDK(KP707106781), VADD(TO, TL));
+			      T4B = VADD(TJ, TK);
+			 }
+		    }
+		    {
+			 V TT, TY, T6D, T6E;
+			 TT = VSUB(TP, TS);
+			 TY = VADD(TW, TX);
+			 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
+			 T2q = VFMA(LDK(KP980785280), TY, VMUL(LDK(KP195090322), TT));
+			 T6D = VADD(T4y, T4z);
+			 T6E = VADD(T4C, T4B);
+			 T6F = VADD(T6D, T6E);
+			 T6U = VSUB(T6D, T6E);
+		    }
+		    {
+			 V T3c, T3d, T4A, T4D;
+			 T3c = VSUB(TW, TX);
+			 T3d = VADD(TS, TP);
+			 T3e = VFMA(LDK(KP831469612), T3c, VMUL(LDK(KP555570233), T3d));
+			 T3D = VFNMS(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
+			 T4A = VSUB(T4y, T4z);
+			 T4D = VSUB(T4B, T4C);
+			 T4E = VFMA(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4D));
+			 T5t = VFNMS(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
+		    }
+	       }
+	       {
+		    V T1F, T55, T2a, T56, T1M, T5h, T27, T5g, T58, T59, T1U, T5a, T25, T5b, T5c;
+		    V T21, T5d, T24;
+		    {
+			 V T1D, T1E, T28, T29;
+			 T1D = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
+			 T1E = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
+			 T1F = VSUB(T1D, T1E);
+			 T55 = VADD(T1D, T1E);
+			 T28 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
+			 T29 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
+			 T2a = VSUB(T28, T29);
+			 T56 = VADD(T28, T29);
+		    }
+		    {
+			 V T1G, T1H, T1I, T1J, T1K, T1L;
+			 T1G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T1H = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
+			 T1I = VSUB(T1G, T1H);
+			 T1J = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
+			 T1K = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
+			 T1L = VSUB(T1J, T1K);
+			 T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
+			 T5h = VADD(T1G, T1H);
+			 T27 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
+			 T5g = VADD(T1J, T1K);
+		    }
+		    {
+			 V T1Q, T1T, T1X, T20;
+			 {
+			      V T1O, T1P, T1R, T1S;
+			      T1O = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			      T1P = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
+			      T1Q = VSUB(T1O, T1P);
+			      T58 = VADD(T1O, T1P);
+			      T1R = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
+			      T1S = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
+			      T1T = VSUB(T1R, T1S);
+			      T59 = VADD(T1R, T1S);
+			 }
+			 T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
+			 T5a = VSUB(T58, T59);
+			 T25 = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
+			 {
+			      V T1V, T1W, T1Y, T1Z;
+			      T1V = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
+			      T1W = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
+			      T1X = VSUB(T1V, T1W);
+			      T5b = VADD(T1V, T1W);
+			      T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
+			      T1Z = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
+			      T20 = VSUB(T1Y, T1Z);
+			      T5c = VADD(T1Y, T1Z);
+			 }
+			 T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
+			 T5d = VSUB(T5b, T5c);
+			 T24 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
+		    }
+		    {
+			 V T1N, T22, T6r, T6s;
+			 T1N = VADD(T1F, T1M);
+			 T22 = VADD(T1U, T21);
+			 T23 = VSUB(T1N, T22);
+			 T2N = VADD(T1N, T22);
+			 T6r = VADD(T55, T56);
+			 T6s = VADD(T5h, T5g);
+			 T6t = VADD(T6r, T6s);
+			 T71 = VSUB(T6r, T6s);
+		    }
+		    {
+			 V T6u, T6v, T26, T2b;
+			 T6u = VADD(T58, T59);
+			 T6v = VADD(T5b, T5c);
+			 T6w = VADD(T6u, T6v);
+			 T72 = VSUB(T6v, T6u);
+			 T26 = VSUB(T24, T25);
+			 T2b = VSUB(T27, T2a);
+			 T2c = VSUB(T26, T2b);
+			 T2O = VADD(T2b, T26);
+		    }
+		    {
+			 V T3r, T3s, T57, T5e;
+			 T3r = VSUB(T1F, T1M);
+			 T3s = VADD(T25, T24);
+			 T3t = VADD(T3r, T3s);
+			 T41 = VSUB(T3r, T3s);
+			 T57 = VSUB(T55, T56);
+			 T5e = VMUL(LDK(KP707106781), VADD(T5a, T5d));
+			 T5f = VADD(T57, T5e);
+			 T5R = VSUB(T57, T5e);
+		    }
+		    {
+			 V T5i, T5j, T3u, T3v;
+			 T5i = VSUB(T5g, T5h);
+			 T5j = VMUL(LDK(KP707106781), VSUB(T5d, T5a));
+			 T5k = VADD(T5i, T5j);
+			 T5S = VSUB(T5j, T5i);
+			 T3u = VADD(T2a, T27);
+			 T3v = VSUB(T21, T1U);
+			 T3w = VADD(T3u, T3v);
+			 T42 = VSUB(T3v, T3u);
+		    }
+	       }
+	       {
+		    V T1q, T4P, T1v, T4O, T1n, T50, T1w, T4Z, T4U, T4V, T18, T4W, T1z, T4R, T4S;
+		    V T1f, T4T, T1y;
+		    {
+			 V T1o, T1p, T1t, T1u;
+			 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
+			 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
+			 T1q = VSUB(T1o, T1p);
+			 T4P = VADD(T1o, T1p);
+			 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
+			 T1v = VSUB(T1t, T1u);
+			 T4O = VADD(T1t, T1u);
+		    }
+		    {
+			 V T1h, T1i, T1j, T1k, T1l, T1m;
+			 T1h = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
+			 T1i = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
+			 T1j = VSUB(T1h, T1i);
+			 T1k = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
+			 T1l = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
+			 T1m = VSUB(T1k, T1l);
+			 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
+			 T50 = VADD(T1k, T1l);
+			 T1w = VMUL(LDK(KP707106781), VADD(T1m, T1j));
+			 T4Z = VADD(T1h, T1i);
+		    }
+		    {
+			 V T14, T17, T1b, T1e;
+			 {
+			      V T12, T13, T15, T16;
+			      T12 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
+			      T13 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
+			      T14 = VSUB(T12, T13);
+			      T4U = VADD(T12, T13);
+			      T15 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
+			      T16 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
+			      T17 = VSUB(T15, T16);
+			      T4V = VADD(T15, T16);
+			 }
+			 T18 = VFNMS(LDK(KP923879532), T17, VMUL(LDK(KP382683432), T14));
+			 T4W = VSUB(T4U, T4V);
+			 T1z = VFMA(LDK(KP923879532), T14, VMUL(LDK(KP382683432), T17));
+			 {
+			      V T19, T1a, T1c, T1d;
+			      T19 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			      T1a = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
+			      T1b = VSUB(T19, T1a);
+			      T4R = VADD(T19, T1a);
+			      T1c = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
+			      T1d = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
+			      T1e = VSUB(T1c, T1d);
+			      T4S = VADD(T1c, T1d);
+			 }
+			 T1f = VFMA(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
+			 T4T = VSUB(T4R, T4S);
+			 T1y = VFNMS(LDK(KP382683432), T1e, VMUL(LDK(KP923879532), T1b));
+		    }
+		    {
+			 V T1g, T1r, T6k, T6l;
+			 T1g = VSUB(T18, T1f);
+			 T1r = VSUB(T1n, T1q);
+			 T1s = VSUB(T1g, T1r);
+			 T2K = VADD(T1r, T1g);
+			 T6k = VADD(T4O, T4P);
+			 T6l = VADD(T50, T4Z);
+			 T6m = VADD(T6k, T6l);
+			 T6Y = VSUB(T6k, T6l);
+		    }
+		    {
+			 V T6n, T6o, T1x, T1A;
+			 T6n = VADD(T4R, T4S);
+			 T6o = VADD(T4U, T4V);
+			 T6p = VADD(T6n, T6o);
+			 T6Z = VSUB(T6o, T6n);
+			 T1x = VADD(T1v, T1w);
+			 T1A = VADD(T1y, T1z);
+			 T1B = VSUB(T1x, T1A);
+			 T2L = VADD(T1x, T1A);
+		    }
+		    {
+			 V T3k, T3l, T4Q, T4X;
+			 T3k = VSUB(T1v, T1w);
+			 T3l = VADD(T1f, T18);
+			 T3m = VADD(T3k, T3l);
+			 T3Y = VSUB(T3k, T3l);
+			 T4Q = VSUB(T4O, T4P);
+			 T4X = VMUL(LDK(KP707106781), VADD(T4T, T4W));
+			 T4Y = VADD(T4Q, T4X);
+			 T5O = VSUB(T4Q, T4X);
+		    }
+		    {
+			 V T51, T52, T3n, T3o;
+			 T51 = VSUB(T4Z, T50);
+			 T52 = VMUL(LDK(KP707106781), VSUB(T4W, T4T));
+			 T53 = VADD(T51, T52);
+			 T5P = VSUB(T52, T51);
+			 T3n = VADD(T1q, T1n);
+			 T3o = VSUB(T1z, T1y);
+			 T3p = VADD(T3n, T3o);
+			 T3Z = VSUB(T3o, T3n);
+		    }
+	       }
+	       {
+		    V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
+		    V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
+		    {
+			 V T6N, T6R, T6Q, T6S;
+			 {
+			      V T6L, T6M, T6O, T6P;
+			      T6L = VADD(T6f, T6i);
+			      T6M = VADD(T6F, T6C);
+			      T6N = VADD(T6L, T6M);
+			      T6R = VSUB(T6L, T6M);
+			      T6O = VADD(T6m, T6p);
+			      T6P = VADD(T6t, T6w);
+			      T6Q = VADD(T6O, T6P);
+			      T6S = VBYI(VSUB(T6P, T6O));
+			 }
+			 T7n = VSUB(T6N, T6Q);
+			 STM2(&(xo[64]), T7n, ovs, &(xo[0]));
+			 T7o = VADD(T6R, T6S);
+			 STM2(&(xo[32]), T7o, ovs, &(xo[0]));
+			 T7p = VADD(T6N, T6Q);
+			 STM2(&(xo[0]), T7p, ovs, &(xo[0]));
+			 T7q = VSUB(T6R, T6S);
+			 STM2(&(xo[96]), T7q, ovs, &(xo[0]));
+		    }
+		    {
+			 V T6j, T6G, T6y, T6H, T6q, T6x;
+			 T6j = VSUB(T6f, T6i);
+			 T6G = VSUB(T6C, T6F);
+			 T6q = VSUB(T6m, T6p);
+			 T6x = VSUB(T6t, T6w);
+			 T6y = VMUL(LDK(KP707106781), VADD(T6q, T6x));
+			 T6H = VMUL(LDK(KP707106781), VSUB(T6x, T6q));
+			 {
+			      V T6z, T6I, T6J, T6K;
+			      T6z = VADD(T6j, T6y);
+			      T6I = VBYI(VADD(T6G, T6H));
+			      T7r = VSUB(T6z, T6I);
+			      STM2(&(xo[112]), T7r, ovs, &(xo[0]));
+			      T7s = VADD(T6z, T6I);
+			      STM2(&(xo[16]), T7s, ovs, &(xo[0]));
+			      T6J = VSUB(T6j, T6y);
+			      T6K = VBYI(VSUB(T6H, T6G));
+			      T7t = VSUB(T6J, T6K);
+			      STM2(&(xo[80]), T7t, ovs, &(xo[0]));
+			      T7u = VADD(T6J, T6K);
+			      STM2(&(xo[48]), T7u, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T6X, T7i, T78, T7g, T74, T7f, T7b, T7j, T6W, T77;
+			 T6W = VMUL(LDK(KP707106781), VADD(T6U, T6V));
+			 T6X = VADD(T6T, T6W);
+			 T7i = VSUB(T6T, T6W);
+			 T77 = VMUL(LDK(KP707106781), VSUB(T6V, T6U));
+			 T78 = VADD(T76, T77);
+			 T7g = VSUB(T77, T76);
+			 {
+			      V T70, T73, T79, T7a;
+			      T70 = VFMA(LDK(KP923879532), T6Y, VMUL(LDK(KP382683432), T6Z));
+			      T73 = VFNMS(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T71));
+			      T74 = VADD(T70, T73);
+			      T7f = VSUB(T73, T70);
+			      T79 = VFNMS(LDK(KP382683432), T6Y, VMUL(LDK(KP923879532), T6Z));
+			      T7a = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T72));
+			      T7b = VADD(T79, T7a);
+			      T7j = VSUB(T7a, T79);
+			 }
+			 {
+			      V T75, T7c, T7l, T7m;
+			      T75 = VADD(T6X, T74);
+			      T7c = VBYI(VADD(T78, T7b));
+			      T7v = VSUB(T75, T7c);
+			      STM2(&(xo[120]), T7v, ovs, &(xo[0]));
+			      T7w = VADD(T75, T7c);
+			      STM2(&(xo[8]), T7w, ovs, &(xo[0]));
+			      T7l = VBYI(VADD(T7g, T7f));
+			      T7m = VADD(T7i, T7j);
+			      T7x = VADD(T7l, T7m);
+			      STM2(&(xo[24]), T7x, ovs, &(xo[0]));
+			      T7y = VSUB(T7m, T7l);
+			      STM2(&(xo[104]), T7y, ovs, &(xo[0]));
+			 }
+			 {
+			      V T7d, T7e, T7h, T7k;
+			      T7d = VSUB(T6X, T74);
+			      T7e = VBYI(VSUB(T7b, T78));
+			      T7z = VSUB(T7d, T7e);
+			      STM2(&(xo[72]), T7z, ovs, &(xo[0]));
+			      T7A = VADD(T7d, T7e);
+			      STM2(&(xo[56]), T7A, ovs, &(xo[0]));
+			      T7h = VBYI(VSUB(T7f, T7g));
+			      T7k = VSUB(T7i, T7j);
+			      T7B = VADD(T7h, T7k);
+			      STM2(&(xo[40]), T7B, ovs, &(xo[0]));
+			      T7C = VSUB(T7k, T7h);
+			      STM2(&(xo[88]), T7C, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
+			 {
+			      V T5L, T5M, T5Z, T60;
+			      T5L = VSUB(T4p, T4w);
+			      T5M = VSUB(T5u, T5t);
+			      T5N = VADD(T5L, T5M);
+			      T68 = VSUB(T5L, T5M);
+			      T5Z = VFNMS(LDK(KP555570233), T5O, VMUL(LDK(KP831469612), T5P));
+			      T60 = VFMA(LDK(KP555570233), T5R, VMUL(LDK(KP831469612), T5S));
+			      T61 = VADD(T5Z, T60);
+			      T69 = VSUB(T60, T5Z);
+			 }
+			 {
+			      V T5Q, T5T, T5W, T5X;
+			      T5Q = VFMA(LDK(KP831469612), T5O, VMUL(LDK(KP555570233), T5P));
+			      T5T = VFNMS(LDK(KP555570233), T5S, VMUL(LDK(KP831469612), T5R));
+			      T5U = VADD(T5Q, T5T);
+			      T65 = VSUB(T5T, T5Q);
+			      T5W = VSUB(T5r, T5q);
+			      T5X = VSUB(T4L, T4E);
+			      T5Y = VADD(T5W, T5X);
+			      T66 = VSUB(T5X, T5W);
+			 }
+			 {
+			      V T5V, T62, T6b, T6c;
+			      T5V = VADD(T5N, T5U);
+			      T62 = VBYI(VADD(T5Y, T61));
+			      T7D = VSUB(T5V, T62);
+			      STM2(&(xo[116]), T7D, ovs, &(xo[0]));
+			      T7E = VADD(T5V, T62);
+			      STM2(&(xo[12]), T7E, ovs, &(xo[0]));
+			      T6b = VBYI(VADD(T66, T65));
+			      T6c = VADD(T68, T69);
+			      T7F = VADD(T6b, T6c);
+			      STM2(&(xo[20]), T7F, ovs, &(xo[0]));
+			      T7G = VSUB(T6c, T6b);
+			      STM2(&(xo[108]), T7G, ovs, &(xo[0]));
+			 }
+			 {
+			      V T63, T64, T67, T6a;
+			      T63 = VSUB(T5N, T5U);
+			      T64 = VBYI(VSUB(T61, T5Y));
+			      T7H = VSUB(T63, T64);
+			      STM2(&(xo[76]), T7H, ovs, &(xo[0]));
+			      T7I = VADD(T63, T64);
+			      STM2(&(xo[52]), T7I, ovs, &(xo[0]));
+			      T67 = VBYI(VSUB(T65, T66));
+			      T6a = VSUB(T68, T69);
+			      T7J = VADD(T67, T6a);
+			      STM2(&(xo[44]), T7J, ovs, &(xo[0]));
+			      T7K = VSUB(T6a, T67);
+			      STM2(&(xo[84]), T7K, ovs, &(xo[0]));
+			 }
+		    }
+		    {
+			 V T7U, T7W, T7X, T7Z;
+			 {
+			      V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
+			      {
+				   V Tr, T10, T2t, T2u;
+				   Tr = VSUB(Tb, Tq);
+				   T10 = VSUB(TI, TZ);
+				   T11 = VADD(Tr, T10);
+				   T2C = VSUB(Tr, T10);
+				   T2t = VFNMS(LDK(KP634393284), T1B, VMUL(LDK(KP773010453), T1s));
+				   T2u = VFMA(LDK(KP773010453), T2c, VMUL(LDK(KP634393284), T23));
+				   T2v = VADD(T2t, T2u);
+				   T2D = VSUB(T2u, T2t);
+			      }
+			      {
+				   V T1C, T2d, T2o, T2r;
+				   T1C = VFMA(LDK(KP634393284), T1s, VMUL(LDK(KP773010453), T1B));
+				   T2d = VFNMS(LDK(KP634393284), T2c, VMUL(LDK(KP773010453), T23));
+				   T2e = VADD(T1C, T2d);
+				   T2z = VSUB(T2d, T1C);
+				   T2o = VSUB(T2i, T2n);
+				   T2r = VSUB(T2p, T2q);
+				   T2s = VADD(T2o, T2r);
+				   T2A = VSUB(T2r, T2o);
+			      }
+			      {
+				   V T2f, T2w, T7L, T7M;
+				   T2f = VADD(T11, T2e);
+				   T2w = VBYI(VADD(T2s, T2v));
+				   T7L = VSUB(T2f, T2w);
+				   STM2(&(xo[114]), T7L, ovs, &(xo[2]));
+				   STN2(&(xo[112]), T7r, T7L, ovs);
+				   T7M = VADD(T2f, T2w);
+				   STM2(&(xo[14]), T7M, ovs, &(xo[2]));
+				   STN2(&(xo[12]), T7E, T7M, ovs);
+			      }
+			      {
+				   V T2F, T2G, T7N, T7O;
+				   T2F = VBYI(VADD(T2A, T2z));
+				   T2G = VADD(T2C, T2D);
+				   T7N = VADD(T2F, T2G);
+				   STM2(&(xo[18]), T7N, ovs, &(xo[2]));
+				   STN2(&(xo[16]), T7s, T7N, ovs);
+				   T7O = VSUB(T2G, T2F);
+				   STM2(&(xo[110]), T7O, ovs, &(xo[2]));
+				   STN2(&(xo[108]), T7G, T7O, ovs);
+			      }
+			      {
+				   V T2x, T2y, T7P, T7Q;
+				   T2x = VSUB(T11, T2e);
+				   T2y = VBYI(VSUB(T2v, T2s));
+				   T7P = VSUB(T2x, T2y);
+				   STM2(&(xo[78]), T7P, ovs, &(xo[2]));
+				   STN2(&(xo[76]), T7H, T7P, ovs);
+				   T7Q = VADD(T2x, T2y);
+				   STM2(&(xo[50]), T7Q, ovs, &(xo[2]));
+				   STN2(&(xo[48]), T7u, T7Q, ovs);
+			      }
+			      {
+				   V T2B, T2E, T7R, T7S;
+				   T2B = VBYI(VSUB(T2z, T2A));
+				   T2E = VSUB(T2C, T2D);
+				   T7R = VADD(T2B, T2E);
+				   STM2(&(xo[46]), T7R, ovs, &(xo[2]));
+				   STN2(&(xo[44]), T7J, T7R, ovs);
+				   T7S = VSUB(T2E, T2B);
+				   STM2(&(xo[82]), T7S, ovs, &(xo[2]));
+				   STN2(&(xo[80]), T7t, T7S, ovs);
+			      }
+			 }
+			 {
+			      V T3j, T3Q, T3J, T3R, T3y, T3N, T3G, T3O;
+			      {
+				   V T3b, T3i, T3H, T3I;
+				   T3b = VADD(T39, T3a);
+				   T3i = VADD(T3e, T3h);
+				   T3j = VADD(T3b, T3i);
+				   T3Q = VSUB(T3b, T3i);
+				   T3H = VFNMS(LDK(KP290284677), T3m, VMUL(LDK(KP956940335), T3p));
+				   T3I = VFMA(LDK(KP290284677), T3t, VMUL(LDK(KP956940335), T3w));
+				   T3J = VADD(T3H, T3I);
+				   T3R = VSUB(T3I, T3H);
+			      }
+			      {
+				   V T3q, T3x, T3C, T3F;
+				   T3q = VFMA(LDK(KP956940335), T3m, VMUL(LDK(KP290284677), T3p));
+				   T3x = VFNMS(LDK(KP290284677), T3w, VMUL(LDK(KP956940335), T3t));
+				   T3y = VADD(T3q, T3x);
+				   T3N = VSUB(T3x, T3q);
+				   T3C = VADD(T3A, T3B);
+				   T3F = VADD(T3D, T3E);
+				   T3G = VADD(T3C, T3F);
+				   T3O = VSUB(T3F, T3C);
+			      }
+			      {
+				   V T3z, T3K, T7T, T3T, T3U, T7V;
+				   T3z = VADD(T3j, T3y);
+				   T3K = VBYI(VADD(T3G, T3J));
+				   T7T = VSUB(T3z, T3K);
+				   STM2(&(xo[122]), T7T, ovs, &(xo[2]));
+				   STN2(&(xo[120]), T7v, T7T, ovs);
+				   T7U = VADD(T3z, T3K);
+				   STM2(&(xo[6]), T7U, ovs, &(xo[2]));
+				   T3T = VBYI(VADD(T3O, T3N));
+				   T3U = VADD(T3Q, T3R);
+				   T7V = VADD(T3T, T3U);
+				   STM2(&(xo[26]), T7V, ovs, &(xo[2]));
+				   STN2(&(xo[24]), T7x, T7V, ovs);
+				   T7W = VSUB(T3U, T3T);
+				   STM2(&(xo[102]), T7W, ovs, &(xo[2]));
+			      }
+			      {
+				   V T3L, T3M, T7Y, T3P, T3S, T80;
+				   T3L = VSUB(T3j, T3y);
+				   T3M = VBYI(VSUB(T3J, T3G));
+				   T7X = VSUB(T3L, T3M);
+				   STM2(&(xo[70]), T7X, ovs, &(xo[2]));
+				   T7Y = VADD(T3L, T3M);
+				   STM2(&(xo[58]), T7Y, ovs, &(xo[2]));
+				   STN2(&(xo[56]), T7A, T7Y, ovs);
+				   T3P = VBYI(VSUB(T3N, T3O));
+				   T3S = VSUB(T3Q, T3R);
+				   T7Z = VADD(T3P, T3S);
+				   STM2(&(xo[38]), T7Z, ovs, &(xo[2]));
+				   T80 = VSUB(T3S, T3P);
+				   STM2(&(xo[90]), T80, ovs, &(xo[2]));
+				   STN2(&(xo[88]), T7C, T80, ovs);
+			      }
+			 }
+			 {
+			      V T81, T83, T86, T88;
+			      {
+				   V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
+				   {
+					V T4x, T4M, T5x, T5y;
+					T4x = VADD(T4p, T4w);
+					T4M = VADD(T4E, T4L);
+					T4N = VADD(T4x, T4M);
+					T5G = VSUB(T4x, T4M);
+					T5x = VFNMS(LDK(KP195090322), T4Y, VMUL(LDK(KP980785280), T53));
+					T5y = VFMA(LDK(KP195090322), T5f, VMUL(LDK(KP980785280), T5k));
+					T5z = VADD(T5x, T5y);
+					T5H = VSUB(T5y, T5x);
+				   }
+				   {
+					V T54, T5l, T5s, T5v;
+					T54 = VFMA(LDK(KP980785280), T4Y, VMUL(LDK(KP195090322), T53));
+					T5l = VFNMS(LDK(KP195090322), T5k, VMUL(LDK(KP980785280), T5f));
+					T5m = VADD(T54, T5l);
+					T5D = VSUB(T5l, T54);
+					T5s = VADD(T5q, T5r);
+					T5v = VADD(T5t, T5u);
+					T5w = VADD(T5s, T5v);
+					T5E = VSUB(T5v, T5s);
+				   }
+				   {
+					V T5n, T5A, T82, T5J, T5K, T84;
+					T5n = VADD(T4N, T5m);
+					T5A = VBYI(VADD(T5w, T5z));
+					T81 = VSUB(T5n, T5A);
+					STM2(&(xo[124]), T81, ovs, &(xo[0]));
+					T82 = VADD(T5n, T5A);
+					STM2(&(xo[4]), T82, ovs, &(xo[0]));
+					STN2(&(xo[4]), T82, T7U, ovs);
+					T5J = VBYI(VADD(T5E, T5D));
+					T5K = VADD(T5G, T5H);
+					T83 = VADD(T5J, T5K);
+					STM2(&(xo[28]), T83, ovs, &(xo[0]));
+					T84 = VSUB(T5K, T5J);
+					STM2(&(xo[100]), T84, ovs, &(xo[0]));
+					STN2(&(xo[100]), T84, T7W, ovs);
+				   }
+				   {
+					V T5B, T5C, T85, T5F, T5I, T87;
+					T5B = VSUB(T4N, T5m);
+					T5C = VBYI(VSUB(T5z, T5w));
+					T85 = VSUB(T5B, T5C);
+					STM2(&(xo[68]), T85, ovs, &(xo[0]));
+					STN2(&(xo[68]), T85, T7X, ovs);
+					T86 = VADD(T5B, T5C);
+					STM2(&(xo[60]), T86, ovs, &(xo[0]));
+					T5F = VBYI(VSUB(T5D, T5E));
+					T5I = VSUB(T5G, T5H);
+					T87 = VADD(T5F, T5I);
+					STM2(&(xo[36]), T87, ovs, &(xo[0]));
+					STN2(&(xo[36]), T87, T7Z, ovs);
+					T88 = VSUB(T5I, T5F);
+					STM2(&(xo[92]), T88, ovs, &(xo[0]));
+				   }
+			      }
+			      {
+				   V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
+				   {
+					V T2H, T2I, T2V, T2W;
+					T2H = VADD(Tb, Tq);
+					T2I = VADD(T2q, T2p);
+					T2J = VADD(T2H, T2I);
+					T34 = VSUB(T2H, T2I);
+					T2V = VFNMS(LDK(KP098017140), T2L, VMUL(LDK(KP995184726), T2K));
+					T2W = VFMA(LDK(KP995184726), T2O, VMUL(LDK(KP098017140), T2N));
+					T2X = VADD(T2V, T2W);
+					T35 = VSUB(T2W, T2V);
+				   }
+				   {
+					V T2M, T2P, T2S, T2T;
+					T2M = VFMA(LDK(KP098017140), T2K, VMUL(LDK(KP995184726), T2L));
+					T2P = VFNMS(LDK(KP098017140), T2O, VMUL(LDK(KP995184726), T2N));
+					T2Q = VADD(T2M, T2P);
+					T31 = VSUB(T2P, T2M);
+					T2S = VADD(T2n, T2i);
+					T2T = VADD(TZ, TI);
+					T2U = VADD(T2S, T2T);
+					T32 = VSUB(T2T, T2S);
+				   }
+				   {
+					V T2R, T2Y, T89, T8a;
+					T2R = VADD(T2J, T2Q);
+					T2Y = VBYI(VADD(T2U, T2X));
+					T89 = VSUB(T2R, T2Y);
+					STM2(&(xo[126]), T89, ovs, &(xo[2]));
+					STN2(&(xo[124]), T81, T89, ovs);
+					T8a = VADD(T2R, T2Y);
+					STM2(&(xo[2]), T8a, ovs, &(xo[2]));
+					STN2(&(xo[0]), T7p, T8a, ovs);
+				   }
+				   {
+					V T37, T38, T8b, T8c;
+					T37 = VBYI(VADD(T32, T31));
+					T38 = VADD(T34, T35);
+					T8b = VADD(T37, T38);
+					STM2(&(xo[30]), T8b, ovs, &(xo[2]));
+					STN2(&(xo[28]), T83, T8b, ovs);
+					T8c = VSUB(T38, T37);
+					STM2(&(xo[98]), T8c, ovs, &(xo[2]));
+					STN2(&(xo[96]), T7q, T8c, ovs);
+				   }
+				   {
+					V T2Z, T30, T8d, T8e;
+					T2Z = VSUB(T2J, T2Q);
+					T30 = VBYI(VSUB(T2X, T2U));
+					T8d = VSUB(T2Z, T30);
+					STM2(&(xo[66]), T8d, ovs, &(xo[2]));
+					STN2(&(xo[64]), T7n, T8d, ovs);
+					T8e = VADD(T2Z, T30);
+					STM2(&(xo[62]), T8e, ovs, &(xo[2]));
+					STN2(&(xo[60]), T86, T8e, ovs);
+				   }
+				   {
+					V T33, T36, T8f, T8g;
+					T33 = VBYI(VSUB(T31, T32));
+					T36 = VSUB(T34, T35);
+					T8f = VADD(T33, T36);
+					STM2(&(xo[34]), T8f, ovs, &(xo[2]));
+					STN2(&(xo[32]), T7o, T8f, ovs);
+					T8g = VSUB(T36, T33);
+					STM2(&(xo[94]), T8g, ovs, &(xo[2]));
+					STN2(&(xo[92]), T88, T8g, ovs);
+				   }
+			      }
+			      {
+				   V T3X, T4i, T4b, T4j, T44, T4f, T48, T4g;
+				   {
+					V T3V, T3W, T49, T4a;
+					T3V = VSUB(T39, T3a);
+					T3W = VSUB(T3E, T3D);
+					T3X = VADD(T3V, T3W);
+					T4i = VSUB(T3V, T3W);
+					T49 = VFNMS(LDK(KP471396736), T3Y, VMUL(LDK(KP881921264), T3Z));
+					T4a = VFMA(LDK(KP471396736), T41, VMUL(LDK(KP881921264), T42));
+					T4b = VADD(T49, T4a);
+					T4j = VSUB(T4a, T49);
+				   }
+				   {
+					V T40, T43, T46, T47;
+					T40 = VFMA(LDK(KP881921264), T3Y, VMUL(LDK(KP471396736), T3Z));
+					T43 = VFNMS(LDK(KP471396736), T42, VMUL(LDK(KP881921264), T41));
+					T44 = VADD(T40, T43);
+					T4f = VSUB(T43, T40);
+					T46 = VSUB(T3B, T3A);
+					T47 = VSUB(T3h, T3e);
+					T48 = VADD(T46, T47);
+					T4g = VSUB(T47, T46);
+				   }
+				   {
+					V T45, T4c, T8h, T8i;
+					T45 = VADD(T3X, T44);
+					T4c = VBYI(VADD(T48, T4b));
+					T8h = VSUB(T45, T4c);
+					STM2(&(xo[118]), T8h, ovs, &(xo[2]));
+					STN2(&(xo[116]), T7D, T8h, ovs);
+					T8i = VADD(T45, T4c);
+					STM2(&(xo[10]), T8i, ovs, &(xo[2]));
+					STN2(&(xo[8]), T7w, T8i, ovs);
+				   }
+				   {
+					V T4l, T4m, T8j, T8k;
+					T4l = VBYI(VADD(T4g, T4f));
+					T4m = VADD(T4i, T4j);
+					T8j = VADD(T4l, T4m);
+					STM2(&(xo[22]), T8j, ovs, &(xo[2]));
+					STN2(&(xo[20]), T7F, T8j, ovs);
+					T8k = VSUB(T4m, T4l);
+					STM2(&(xo[106]), T8k, ovs, &(xo[2]));
+					STN2(&(xo[104]), T7y, T8k, ovs);
+				   }
+				   {
+					V T4d, T4e, T8l, T8m;
+					T4d = VSUB(T3X, T44);
+					T4e = VBYI(VSUB(T4b, T48));
+					T8l = VSUB(T4d, T4e);
+					STM2(&(xo[74]), T8l, ovs, &(xo[2]));
+					STN2(&(xo[72]), T7z, T8l, ovs);
+					T8m = VADD(T4d, T4e);
+					STM2(&(xo[54]), T8m, ovs, &(xo[2]));
+					STN2(&(xo[52]), T7I, T8m, ovs);
+				   }
+				   {
+					V T4h, T4k, T8n, T8o;
+					T4h = VBYI(VSUB(T4f, T4g));
+					T4k = VSUB(T4i, T4j);
+					T8n = VADD(T4h, T4k);
+					STM2(&(xo[42]), T8n, ovs, &(xo[2]));
+					STN2(&(xo[40]), T7B, T8n, ovs);
+					T8o = VSUB(T4k, T4h);
+					STM2(&(xo[86]), T8o, ovs, &(xo[2]));
+					STN2(&(xo[84]), T7K, T8o, ovs);
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_64) (planner *p) {
+     X(kdft_register) (p, n2fv_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2fv_8 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 26 FP additions, 10 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 10 fused multiply/add),
+ * 38 stack variables, 1 constants, and 20 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T1, T2, Tc, Td, T4, T5, T7, T8;
+	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
+	       T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+	       Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+	       Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+	       T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+	       T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+	       T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+	       T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+	       {
+		    V T3, Tj, Te, Tk, T6, Tm, T9, Tn, Tp, Tl;
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    T6 = VSUB(T4, T5);
+		    Tm = VADD(T4, T5);
+		    T9 = VSUB(T7, T8);
+		    Tn = VADD(T7, T8);
+		    Tp = VSUB(Tj, Tk);
+		    Tl = VADD(Tj, Tk);
+		    {
+			 V Tq, To, Ta, Tf;
+			 Tq = VSUB(Tn, Tm);
+			 To = VADD(Tm, Tn);
+			 Ta = VADD(T6, T9);
+			 Tf = VSUB(T9, T6);
+			 {
+			      V Tr, Ts, Tt, Tu, Tg, Ti, Tb, Th;
+			      Tr = VADD(Tl, To);
+			      STM2(&(xo[0]), Tr, ovs, &(xo[0]));
+			      Ts = VSUB(Tl, To);
+			      STM2(&(xo[8]), Ts, ovs, &(xo[0]));
+			      Tt = VFMAI(Tq, Tp);
+			      STM2(&(xo[4]), Tt, ovs, &(xo[0]));
+			      Tu = VFNMSI(Tq, Tp);
+			      STM2(&(xo[12]), Tu, ovs, &(xo[0]));
+			      Tg = VFNMS(LDK(KP707106781), Tf, Te);
+			      Ti = VFMA(LDK(KP707106781), Tf, Te);
+			      Tb = VFMA(LDK(KP707106781), Ta, T3);
+			      Th = VFNMS(LDK(KP707106781), Ta, T3);
+			      {
+				   V Tv, Tw, Tx, Ty;
+				   Tv = VFMAI(Ti, Th);
+				   STM2(&(xo[6]), Tv, ovs, &(xo[2]));
+				   STN2(&(xo[4]), Tt, Tv, ovs);
+				   Tw = VFNMSI(Ti, Th);
+				   STM2(&(xo[10]), Tw, ovs, &(xo[2]));
+				   STN2(&(xo[8]), Ts, Tw, ovs);
+				   Tx = VFMAI(Tg, Tb);
+				   STM2(&(xo[14]), Tx, ovs, &(xo[2]));
+				   STN2(&(xo[12]), Tu, Tx, ovs);
+				   Ty = VFNMSI(Tg, Tb);
+				   STM2(&(xo[2]), Ty, ovs, &(xo[2]));
+				   STN2(&(xo[0]), Tr, Ty, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2fv_8"), {16, 0, 10, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_8) (planner *p) {
+     X(kdft_register) (p, n2fv_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2fv_8 -with-ostride 2 -include n2f.h -store-multiple 2 */
+
+/*
+ * This function contains 26 FP additions, 2 FP multiplications,
+ * (or, 26 additions, 2 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 20 memory accesses
+ */
+#include "n2f.h"
+
+static void n2fv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  const R *xi;
+	  R *xo;
+	  xi = ri;
+	  xo = ro;
+	  for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T3, Tj, Tf, Tk, Ta, Tn, Tc, Tm, Ts, Tu;
+	       {
+		    V T1, T2, Td, Te;
+		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
+		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
+		    T3 = VSUB(T1, T2);
+		    Tj = VADD(T1, T2);
+		    Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
+		    Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
+		    Tf = VSUB(Td, Te);
+		    Tk = VADD(Td, Te);
+		    {
+			 V T4, T5, T6, T7, T8, T9;
+			 T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
+			 T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
+			 T6 = VSUB(T4, T5);
+			 T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
+			 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
+			 T9 = VSUB(T7, T8);
+			 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+			 Tn = VADD(T7, T8);
+			 Tc = VMUL(LDK(KP707106781), VSUB(T9, T6));
+			 Tm = VADD(T4, T5);
+		    }
+	       }
+	       {
+		    V Tr, Tb, Tg, Tp, Tq, Tt;
+		    Tb = VADD(T3, Ta);
+		    Tg = VBYI(VSUB(Tc, Tf));
+		    Tr = VSUB(Tb, Tg);
+		    STM2(&(xo[14]), Tr, ovs, &(xo[2]));
+		    Ts = VADD(Tb, Tg);
+		    STM2(&(xo[2]), Ts, ovs, &(xo[2]));
+		    Tp = VSUB(Tj, Tk);
+		    Tq = VBYI(VSUB(Tn, Tm));
+		    Tt = VSUB(Tp, Tq);
+		    STM2(&(xo[12]), Tt, ovs, &(xo[0]));
+		    STN2(&(xo[12]), Tt, Tr, ovs);
+		    Tu = VADD(Tp, Tq);
+		    STM2(&(xo[4]), Tu, ovs, &(xo[0]));
+	       }
+	       {
+		    V Tv, Th, Ti, Tw;
+		    Th = VSUB(T3, Ta);
+		    Ti = VBYI(VADD(Tf, Tc));
+		    Tv = VSUB(Th, Ti);
+		    STM2(&(xo[10]), Tv, ovs, &(xo[2]));
+		    Tw = VADD(Th, Ti);
+		    STM2(&(xo[6]), Tw, ovs, &(xo[2]));
+		    STN2(&(xo[4]), Tu, Tw, ovs);
+		    {
+			 V Tl, To, Tx, Ty;
+			 Tl = VADD(Tj, Tk);
+			 To = VADD(Tm, Tn);
+			 Tx = VSUB(Tl, To);
+			 STM2(&(xo[8]), Tx, ovs, &(xo[0]));
+			 STN2(&(xo[8]), Tx, Tv, ovs);
+			 Ty = VADD(Tl, To);
+			 STM2(&(xo[0]), Ty, ovs, &(xo[0]));
+			 STN2(&(xo[0]), Ty, Ts, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2fv_8"), {26, 2, 0, 0}, &GENUS, 0, 2, 0, 0 };
+
+void XSIMD(codelet_n2fv_8) (planner *p) {
+     X(kdft_register) (p, n2fv_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:48 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 144 FP additions, 40 FP multiplications,
+ * (or, 104 additions, 0 multiplications, 40 fused multiply/add),
+ * 110 stack variables, 3 constants, and 72 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T2p, T2q, T2r, T2s, T2x, T2y, T2z, T2A, T1M, T1N, T1L, T1P, T2F, T2G, T2H;
+	       V T2I, T1O, T1Q;
+	       {
+		    V T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D;
+		    V T1g, T1n, TQ, T11, Ti, Te, T26, T1m, TT, T1S, TJ, TZ, T1V, TW, Tl;
+		    V T12, T13;
+		    {
+			 V Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e;
+			 {
+			      V T1, T2, Tw, Tx, T4, T5, Tz, TA;
+			      T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			      T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+			      Tw = LD(&(ii[0]), ivs, &(ii[0]));
+			      Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+			      T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+			      T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+			      Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			      TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+			      {
+				   V Tn, TL, T3, T1k, Ty, T1j, T6, TM, TB, To, T18, T19;
+				   Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+				   TL = VSUB(T1, T2);
+				   T3 = VADD(T1, T2);
+				   T1k = VSUB(Tw, Tx);
+				   Ty = VADD(Tw, Tx);
+				   T1j = VSUB(T4, T5);
+				   T6 = VADD(T4, T5);
+				   TM = VSUB(Tz, TA);
+				   TB = VADD(Tz, TA);
+				   To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+				   T18 = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+				   T19 = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+				   Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+				   T1l = VADD(T1j, T1k);
+				   T1H = VSUB(T1k, T1j);
+				   T1R = VSUB(T3, T6);
+				   T7 = VADD(T3, T6);
+				   T1x = VADD(TL, TM);
+				   TN = VSUB(TL, TM);
+				   TC = VADD(Ty, TB);
+				   T25 = VSUB(Ty, TB);
+				   T1c = VSUB(Tn, To);
+				   Tp = VADD(Tn, To);
+				   T20 = VADD(T18, T19);
+				   T1a = VSUB(T18, T19);
+				   Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+				   T1d = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+				   T1e = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+			      }
+			 }
+			 {
+			      V Tb, Ta, TF, Tc, TG, TH, TP, TO;
+			      {
+				   V T8, T9, TD, TE;
+				   T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+				   T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+				   TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+				   TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+				   Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+				   {
+					V T17, Ts, T21, T1f;
+					T17 = VSUB(Tq, Tr);
+					Ts = VADD(Tq, Tr);
+					T21 = VADD(T1d, T1e);
+					T1f = VSUB(T1d, T1e);
+					TP = VSUB(T8, T9);
+					Ta = VADD(T8, T9);
+					TO = VSUB(TD, TE);
+					TF = VADD(TD, TE);
+					T1E = VSUB(T1a, T17);
+					T1b = VADD(T17, T1a);
+					T1Z = VSUB(Tp, Ts);
+					Tt = VADD(Tp, Ts);
+					T2h = VADD(T20, T21);
+					T22 = VSUB(T20, T21);
+					T1D = VADD(T1c, T1f);
+					T1g = VSUB(T1c, T1f);
+					Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+				   }
+				   TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+				   TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+			      }
+			      T1n = VADD(TP, TO);
+			      TQ = VSUB(TO, TP);
+			      {
+				   V Tg, Th, TX, TR, Td, TS, TI, TY, Tj, Tk;
+				   Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+				   Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+				   TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+				   TR = VSUB(Tb, Tc);
+				   Td = VADD(Tb, Tc);
+				   TS = VSUB(TG, TH);
+				   TI = VADD(TG, TH);
+				   TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+				   Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+				   T11 = VSUB(Tg, Th);
+				   Ti = VADD(Tg, Th);
+				   Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+				   Te = VADD(Ta, Td);
+				   T26 = VSUB(Td, Ta);
+				   T1m = VSUB(TR, TS);
+				   TT = VADD(TR, TS);
+				   T1S = VSUB(TF, TI);
+				   TJ = VADD(TF, TI);
+				   TZ = VSUB(TX, TY);
+				   T1V = VADD(TX, TY);
+				   TW = VSUB(Tj, Tk);
+				   Tl = VADD(Tj, Tk);
+				   T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+				   T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+			      }
+			 }
+		    }
+		    {
+			 V T2f, Tf, T2j, TK, Tm, T1U, T10, T1B, T14, T1W;
+			 T2f = VSUB(T7, Te);
+			 Tf = VADD(T7, Te);
+			 T2j = VADD(TC, TJ);
+			 TK = VSUB(TC, TJ);
+			 Tm = VADD(Ti, Tl);
+			 T1U = VSUB(Ti, Tl);
+			 T10 = VADD(TW, TZ);
+			 T1B = VSUB(TZ, TW);
+			 T14 = VSUB(T12, T13);
+			 T1W = VADD(T12, T13);
+			 {
+			      V T29, T1T, T27, T2d, T2b, T23, T15, T1A, T2l, T2m, T2n, T2o, T2i, T2k, T1Y;
+			      V T2a;
+			      {
+				   V Tv, Tu, T1X, T2g;
+				   T29 = VSUB(T1R, T1S);
+				   T1T = VADD(T1R, T1S);
+				   T27 = VSUB(T25, T26);
+				   T2d = VADD(T26, T25);
+				   T2b = VADD(T1Z, T22);
+				   T23 = VSUB(T1Z, T22);
+				   Tv = VSUB(Tt, Tm);
+				   Tu = VADD(Tm, Tt);
+				   T1X = VSUB(T1V, T1W);
+				   T2g = VADD(T1V, T1W);
+				   T15 = VSUB(T11, T14);
+				   T1A = VADD(T11, T14);
+				   T2l = VSUB(TK, Tv);
+				   STM4(&(io[12]), T2l, ovs, &(io[0]));
+				   T2m = VADD(Tv, TK);
+				   STM4(&(io[4]), T2m, ovs, &(io[0]));
+				   T2n = VADD(Tf, Tu);
+				   STM4(&(ro[0]), T2n, ovs, &(ro[0]));
+				   T2o = VSUB(Tf, Tu);
+				   STM4(&(ro[8]), T2o, ovs, &(ro[0]));
+				   T2i = VSUB(T2g, T2h);
+				   T2k = VADD(T2g, T2h);
+				   T1Y = VADD(T1U, T1X);
+				   T2a = VSUB(T1X, T1U);
+			      }
+			      {
+				   V T1I, T1y, T1t, T16, T1v, TV, T1r, T1p, T2t, T2u, T2v, T2w, T1h, T1s, TU;
+				   V T1o;
+				   T1I = VADD(TQ, TT);
+				   TU = VSUB(TQ, TT);
+				   T1o = VSUB(T1m, T1n);
+				   T1y = VADD(T1n, T1m);
+				   T1t = VFNMS(LDK(KP414213562), T10, T15);
+				   T16 = VFMA(LDK(KP414213562), T15, T10);
+				   T2p = VADD(T2f, T2i);
+				   STM4(&(ro[4]), T2p, ovs, &(ro[0]));
+				   T2q = VSUB(T2f, T2i);
+				   STM4(&(ro[12]), T2q, ovs, &(ro[0]));
+				   T2r = VADD(T2j, T2k);
+				   STM4(&(io[0]), T2r, ovs, &(io[0]));
+				   T2s = VSUB(T2j, T2k);
+				   STM4(&(io[8]), T2s, ovs, &(io[0]));
+				   {
+					V T28, T24, T2e, T2c;
+					T28 = VSUB(T23, T1Y);
+					T24 = VADD(T1Y, T23);
+					T2e = VADD(T2a, T2b);
+					T2c = VSUB(T2a, T2b);
+					T1v = VFNMS(LDK(KP707106781), TU, TN);
+					TV = VFMA(LDK(KP707106781), TU, TN);
+					T1r = VFMA(LDK(KP707106781), T1o, T1l);
+					T1p = VFNMS(LDK(KP707106781), T1o, T1l);
+					T2t = VFNMS(LDK(KP707106781), T28, T27);
+					STM4(&(io[14]), T2t, ovs, &(io[0]));
+					T2u = VFMA(LDK(KP707106781), T28, T27);
+					STM4(&(io[6]), T2u, ovs, &(io[0]));
+					T2v = VFMA(LDK(KP707106781), T24, T1T);
+					STM4(&(ro[2]), T2v, ovs, &(ro[0]));
+					T2w = VFNMS(LDK(KP707106781), T24, T1T);
+					STM4(&(ro[10]), T2w, ovs, &(ro[0]));
+					T2x = VFNMS(LDK(KP707106781), T2e, T2d);
+					STM4(&(io[10]), T2x, ovs, &(io[0]));
+					T2y = VFMA(LDK(KP707106781), T2e, T2d);
+					STM4(&(io[2]), T2y, ovs, &(io[0]));
+					T2z = VFMA(LDK(KP707106781), T2c, T29);
+					STM4(&(ro[6]), T2z, ovs, &(ro[0]));
+					T2A = VFNMS(LDK(KP707106781), T2c, T29);
+					STM4(&(ro[14]), T2A, ovs, &(ro[0]));
+					T1h = VFNMS(LDK(KP414213562), T1g, T1b);
+					T1s = VFMA(LDK(KP414213562), T1b, T1g);
+				   }
+				   {
+					V T1z, T1J, T1K, T1G, T2B, T2C, T2D, T2E, T1C, T1F;
+					T1M = VFNMS(LDK(KP414213562), T1A, T1B);
+					T1C = VFMA(LDK(KP414213562), T1B, T1A);
+					T1F = VFNMS(LDK(KP414213562), T1E, T1D);
+					T1N = VFMA(LDK(KP414213562), T1D, T1E);
+					{
+					     V T1q, T1i, T1w, T1u;
+					     T1q = VADD(T16, T1h);
+					     T1i = VSUB(T16, T1h);
+					     T1w = VADD(T1t, T1s);
+					     T1u = VSUB(T1s, T1t);
+					     T1L = VFNMS(LDK(KP707106781), T1y, T1x);
+					     T1z = VFMA(LDK(KP707106781), T1y, T1x);
+					     T1P = VFMA(LDK(KP707106781), T1I, T1H);
+					     T1J = VFNMS(LDK(KP707106781), T1I, T1H);
+					     T1K = VSUB(T1F, T1C);
+					     T1G = VADD(T1C, T1F);
+					     T2B = VFMA(LDK(KP923879532), T1q, T1p);
+					     STM4(&(io[15]), T2B, ovs, &(io[1]));
+					     T2C = VFNMS(LDK(KP923879532), T1q, T1p);
+					     STM4(&(io[7]), T2C, ovs, &(io[1]));
+					     T2D = VFMA(LDK(KP923879532), T1i, TV);
+					     STM4(&(ro[3]), T2D, ovs, &(ro[1]));
+					     T2E = VFNMS(LDK(KP923879532), T1i, TV);
+					     STM4(&(ro[11]), T2E, ovs, &(ro[1]));
+					     T2F = VFMA(LDK(KP923879532), T1w, T1v);
+					     STM4(&(ro[15]), T2F, ovs, &(ro[1]));
+					     T2G = VFNMS(LDK(KP923879532), T1w, T1v);
+					     STM4(&(ro[7]), T2G, ovs, &(ro[1]));
+					     T2H = VFMA(LDK(KP923879532), T1u, T1r);
+					     STM4(&(io[3]), T2H, ovs, &(io[1]));
+					     T2I = VFNMS(LDK(KP923879532), T1u, T1r);
+					     STM4(&(io[11]), T2I, ovs, &(io[1]));
+					}
+					{
+					     V T2J, T2K, T2L, T2M;
+					     T2J = VFNMS(LDK(KP923879532), T1G, T1z);
+					     STM4(&(ro[9]), T2J, ovs, &(ro[1]));
+					     STN4(&(ro[8]), T2o, T2J, T2w, T2E, ovs);
+					     T2K = VFMA(LDK(KP923879532), T1G, T1z);
+					     STM4(&(ro[1]), T2K, ovs, &(ro[1]));
+					     STN4(&(ro[0]), T2n, T2K, T2v, T2D, ovs);
+					     T2L = VFNMS(LDK(KP923879532), T1K, T1J);
+					     STM4(&(io[13]), T2L, ovs, &(io[1]));
+					     STN4(&(io[12]), T2l, T2L, T2t, T2B, ovs);
+					     T2M = VFMA(LDK(KP923879532), T1K, T1J);
+					     STM4(&(io[5]), T2M, ovs, &(io[1]));
+					     STN4(&(io[4]), T2m, T2M, T2u, T2C, ovs);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T1O = VSUB(T1M, T1N);
+	       T1Q = VADD(T1M, T1N);
+	       {
+		    V T2N, T2O, T2P, T2Q;
+		    T2N = VFMA(LDK(KP923879532), T1Q, T1P);
+		    STM4(&(io[1]), T2N, ovs, &(io[1]));
+		    STN4(&(io[0]), T2r, T2N, T2y, T2H, ovs);
+		    T2O = VFNMS(LDK(KP923879532), T1Q, T1P);
+		    STM4(&(io[9]), T2O, ovs, &(io[1]));
+		    STN4(&(io[8]), T2s, T2O, T2x, T2I, ovs);
+		    T2P = VFMA(LDK(KP923879532), T1O, T1L);
+		    STM4(&(ro[5]), T2P, ovs, &(ro[1]));
+		    STN4(&(ro[4]), T2p, T2P, T2z, T2G, ovs);
+		    T2Q = VFNMS(LDK(KP923879532), T1O, T1L);
+		    STM4(&(ro[13]), T2Q, ovs, &(ro[1]));
+		    STN4(&(ro[12]), T2q, T2Q, T2A, T2F, ovs);
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {104, 0, 40, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_16) (planner *p) {
+     X(kdft_register) (p, n2sv_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 144 FP additions, 24 FP multiplications,
+ * (or, 136 additions, 16 multiplications, 8 fused multiply/add),
+ * 74 stack variables, 3 constants, and 72 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
+	       V T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
+	       V T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
+	       V T1U, T1A;
+	       {
+		    V T3, TL, Ty, T1k, T6, T1j, TB, TM;
+		    {
+			 V T1, T2, Tw, Tx;
+			 T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			 T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+			 T3 = VADD(T1, T2);
+			 TL = VSUB(T1, T2);
+			 Tw = LD(&(ii[0]), ivs, &(ii[0]));
+			 Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+			 Ty = VADD(Tw, Tx);
+			 T1k = VSUB(Tw, Tx);
+		    }
+		    {
+			 V T4, T5, Tz, TA;
+			 T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+			 T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+			 T6 = VADD(T4, T5);
+			 T1j = VSUB(T4, T5);
+			 Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			 TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+			 TB = VADD(Tz, TA);
+			 TM = VSUB(Tz, TA);
+		    }
+		    T7 = VADD(T3, T6);
+		    T1R = VSUB(T3, T6);
+		    T25 = VSUB(Ty, TB);
+		    TC = VADD(Ty, TB);
+		    TN = VSUB(TL, TM);
+		    T1x = VADD(TL, TM);
+		    T1H = VSUB(T1k, T1j);
+		    T1l = VADD(T1j, T1k);
+	       }
+	       {
+		    V Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
+		    {
+			 V Tn, To, T1d, T1e;
+			 Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+			 To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+			 Tp = VADD(Tn, To);
+			 T17 = VSUB(Tn, To);
+			 T1d = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+			 T1e = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+			 T1f = VSUB(T1d, T1e);
+			 T20 = VADD(T1d, T1e);
+		    }
+		    {
+			 V Tq, Tr, T18, T19;
+			 Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+			 Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+			 Ts = VADD(Tq, Tr);
+			 T1c = VSUB(Tq, Tr);
+			 T18 = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+			 T19 = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+			 T1a = VSUB(T18, T19);
+			 T21 = VADD(T18, T19);
+		    }
+		    Tt = VADD(Tp, Ts);
+		    T22 = VSUB(T20, T21);
+		    T2h = VADD(T20, T21);
+		    T1b = VSUB(T17, T1a);
+		    T1g = VADD(T1c, T1f);
+		    T1E = VSUB(T1f, T1c);
+		    T1Z = VSUB(Tp, Ts);
+		    T1D = VADD(T17, T1a);
+	       }
+	       {
+		    V Ta, TP, TF, TO, Td, TR, TI, TS;
+		    {
+			 V T8, T9, TD, TE;
+			 T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+			 T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+			 Ta = VADD(T8, T9);
+			 TP = VSUB(T8, T9);
+			 TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+			 TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+			 TF = VADD(TD, TE);
+			 TO = VSUB(TD, TE);
+		    }
+		    {
+			 V Tb, Tc, TG, TH;
+			 Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+			 Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+			 Td = VADD(Tb, Tc);
+			 TR = VSUB(Tb, Tc);
+			 TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+			 TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+			 TI = VADD(TG, TH);
+			 TS = VSUB(TG, TH);
+		    }
+		    Te = VADD(Ta, Td);
+		    T1S = VSUB(TF, TI);
+		    T26 = VSUB(Td, Ta);
+		    TJ = VADD(TF, TI);
+		    TQ = VSUB(TO, TP);
+		    T1m = VSUB(TR, TS);
+		    T1n = VADD(TP, TO);
+		    TT = VADD(TR, TS);
+	       }
+	       {
+		    V Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
+		    {
+			 V Tg, Th, TX, TY;
+			 Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+			 Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+			 Ti = VADD(Tg, Th);
+			 T11 = VSUB(Tg, Th);
+			 TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+			 TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+			 TZ = VSUB(TX, TY);
+			 T1V = VADD(TX, TY);
+		    }
+		    {
+			 V Tj, Tk, T12, T13;
+			 Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+			 Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+			 Tl = VADD(Tj, Tk);
+			 TW = VSUB(Tj, Tk);
+			 T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+			 T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+			 T14 = VSUB(T12, T13);
+			 T1W = VADD(T12, T13);
+		    }
+		    Tm = VADD(Ti, Tl);
+		    T1X = VSUB(T1V, T1W);
+		    T2g = VADD(T1V, T1W);
+		    T10 = VADD(TW, TZ);
+		    T15 = VSUB(T11, T14);
+		    T1B = VADD(T11, T14);
+		    T1U = VSUB(Ti, Tl);
+		    T1A = VSUB(TZ, TW);
+	       }
+	       {
+		    V T2l, T2m, T2n, T2o, T2p, T2q, T2r, T2s;
+		    {
+			 V Tf, Tu, T2j, T2k;
+			 Tf = VADD(T7, Te);
+			 Tu = VADD(Tm, Tt);
+			 T2l = VSUB(Tf, Tu);
+			 STM4(&(ro[8]), T2l, ovs, &(ro[0]));
+			 T2m = VADD(Tf, Tu);
+			 STM4(&(ro[0]), T2m, ovs, &(ro[0]));
+			 T2j = VADD(TC, TJ);
+			 T2k = VADD(T2g, T2h);
+			 T2n = VSUB(T2j, T2k);
+			 STM4(&(io[8]), T2n, ovs, &(io[0]));
+			 T2o = VADD(T2j, T2k);
+			 STM4(&(io[0]), T2o, ovs, &(io[0]));
+		    }
+		    {
+			 V Tv, TK, T2f, T2i;
+			 Tv = VSUB(Tt, Tm);
+			 TK = VSUB(TC, TJ);
+			 T2p = VADD(Tv, TK);
+			 STM4(&(io[4]), T2p, ovs, &(io[0]));
+			 T2q = VSUB(TK, Tv);
+			 STM4(&(io[12]), T2q, ovs, &(io[0]));
+			 T2f = VSUB(T7, Te);
+			 T2i = VSUB(T2g, T2h);
+			 T2r = VSUB(T2f, T2i);
+			 STM4(&(ro[12]), T2r, ovs, &(ro[0]));
+			 T2s = VADD(T2f, T2i);
+			 STM4(&(ro[4]), T2s, ovs, &(ro[0]));
+		    }
+		    {
+			 V T2t, T2u, T2v, T2w, T2x, T2y, T2z, T2A;
+			 {
+			      V T1T, T27, T24, T28, T1Y, T23;
+			      T1T = VADD(T1R, T1S);
+			      T27 = VSUB(T25, T26);
+			      T1Y = VADD(T1U, T1X);
+			      T23 = VSUB(T1Z, T22);
+			      T24 = VMUL(LDK(KP707106781), VADD(T1Y, T23));
+			      T28 = VMUL(LDK(KP707106781), VSUB(T23, T1Y));
+			      T2t = VSUB(T1T, T24);
+			      STM4(&(ro[10]), T2t, ovs, &(ro[0]));
+			      T2u = VADD(T27, T28);
+			      STM4(&(io[6]), T2u, ovs, &(io[0]));
+			      T2v = VADD(T1T, T24);
+			      STM4(&(ro[2]), T2v, ovs, &(ro[0]));
+			      T2w = VSUB(T27, T28);
+			      STM4(&(io[14]), T2w, ovs, &(io[0]));
+			 }
+			 {
+			      V T29, T2d, T2c, T2e, T2a, T2b;
+			      T29 = VSUB(T1R, T1S);
+			      T2d = VADD(T26, T25);
+			      T2a = VSUB(T1X, T1U);
+			      T2b = VADD(T1Z, T22);
+			      T2c = VMUL(LDK(KP707106781), VSUB(T2a, T2b));
+			      T2e = VMUL(LDK(KP707106781), VADD(T2a, T2b));
+			      T2x = VSUB(T29, T2c);
+			      STM4(&(ro[14]), T2x, ovs, &(ro[0]));
+			      T2y = VADD(T2d, T2e);
+			      STM4(&(io[2]), T2y, ovs, &(io[0]));
+			      T2z = VADD(T29, T2c);
+			      STM4(&(ro[6]), T2z, ovs, &(ro[0]));
+			      T2A = VSUB(T2d, T2e);
+			      STM4(&(io[10]), T2A, ovs, &(io[0]));
+			 }
+			 {
+			      V T2B, T2C, T2D, T2E, T2F, T2G, T2H, T2I;
+			      {
+				   V TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
+				   TU = VMUL(LDK(KP707106781), VSUB(TQ, TT));
+				   TV = VADD(TN, TU);
+				   T1r = VSUB(TN, TU);
+				   T1o = VMUL(LDK(KP707106781), VSUB(T1m, T1n));
+				   T1p = VSUB(T1l, T1o);
+				   T1v = VADD(T1l, T1o);
+				   {
+					V T16, T1h, T1s, T1t;
+					T16 = VFMA(LDK(KP923879532), T10, VMUL(LDK(KP382683432), T15));
+					T1h = VFNMS(LDK(KP923879532), T1g, VMUL(LDK(KP382683432), T1b));
+					T1i = VADD(T16, T1h);
+					T1q = VSUB(T1h, T16);
+					T1s = VFNMS(LDK(KP923879532), T15, VMUL(LDK(KP382683432), T10));
+					T1t = VFMA(LDK(KP382683432), T1g, VMUL(LDK(KP923879532), T1b));
+					T1u = VSUB(T1s, T1t);
+					T1w = VADD(T1s, T1t);
+				   }
+				   T2B = VSUB(TV, T1i);
+				   STM4(&(ro[11]), T2B, ovs, &(ro[1]));
+				   T2C = VSUB(T1v, T1w);
+				   STM4(&(io[11]), T2C, ovs, &(io[1]));
+				   T2D = VADD(TV, T1i);
+				   STM4(&(ro[3]), T2D, ovs, &(ro[1]));
+				   T2E = VADD(T1v, T1w);
+				   STM4(&(io[3]), T2E, ovs, &(io[1]));
+				   T2F = VSUB(T1p, T1q);
+				   STM4(&(io[15]), T2F, ovs, &(io[1]));
+				   T2G = VSUB(T1r, T1u);
+				   STM4(&(ro[15]), T2G, ovs, &(ro[1]));
+				   T2H = VADD(T1p, T1q);
+				   STM4(&(io[7]), T2H, ovs, &(io[1]));
+				   T2I = VADD(T1r, T1u);
+				   STM4(&(ro[7]), T2I, ovs, &(ro[1]));
+			      }
+			      {
+				   V T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
+				   T1y = VMUL(LDK(KP707106781), VADD(T1n, T1m));
+				   T1z = VADD(T1x, T1y);
+				   T1L = VSUB(T1x, T1y);
+				   T1I = VMUL(LDK(KP707106781), VADD(TQ, TT));
+				   T1J = VSUB(T1H, T1I);
+				   T1P = VADD(T1H, T1I);
+				   {
+					V T1C, T1F, T1M, T1N;
+					T1C = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1B));
+					T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1D));
+					T1G = VADD(T1C, T1F);
+					T1K = VSUB(T1F, T1C);
+					T1M = VFNMS(LDK(KP382683432), T1B, VMUL(LDK(KP923879532), T1A));
+					T1N = VFMA(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1D));
+					T1O = VSUB(T1M, T1N);
+					T1Q = VADD(T1M, T1N);
+				   }
+				   {
+					V T2J, T2K, T2L, T2M;
+					T2J = VSUB(T1z, T1G);
+					STM4(&(ro[9]), T2J, ovs, &(ro[1]));
+					STN4(&(ro[8]), T2l, T2J, T2t, T2B, ovs);
+					T2K = VSUB(T1P, T1Q);
+					STM4(&(io[9]), T2K, ovs, &(io[1]));
+					STN4(&(io[8]), T2n, T2K, T2A, T2C, ovs);
+					T2L = VADD(T1z, T1G);
+					STM4(&(ro[1]), T2L, ovs, &(ro[1]));
+					STN4(&(ro[0]), T2m, T2L, T2v, T2D, ovs);
+					T2M = VADD(T1P, T1Q);
+					STM4(&(io[1]), T2M, ovs, &(io[1]));
+					STN4(&(io[0]), T2o, T2M, T2y, T2E, ovs);
+				   }
+				   {
+					V T2N, T2O, T2P, T2Q;
+					T2N = VSUB(T1J, T1K);
+					STM4(&(io[13]), T2N, ovs, &(io[1]));
+					STN4(&(io[12]), T2q, T2N, T2w, T2F, ovs);
+					T2O = VSUB(T1L, T1O);
+					STM4(&(ro[13]), T2O, ovs, &(ro[1]));
+					STN4(&(ro[12]), T2r, T2O, T2x, T2G, ovs);
+					T2P = VADD(T1J, T1K);
+					STM4(&(io[5]), T2P, ovs, &(io[1]));
+					STN4(&(io[4]), T2p, T2P, T2u, T2H, ovs);
+					T2Q = VADD(T1L, T1O);
+					STM4(&(ro[5]), T2Q, ovs, &(ro[1]));
+					STN4(&(ro[4]), T2s, T2Q, T2z, T2I, ovs);
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {136, 16, 8, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_16) (planner *p) {
+     X(kdft_register) (p, n2sv_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1453 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 372 FP additions, 136 FP multiplications,
+ * (or, 236 additions, 0 multiplications, 136 fused multiply/add),
+ * 194 stack variables, 7 constants, and 144 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
+	       V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
+	       V T6v, T6w, T3g, T3f, T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E, T4p, T49, T4l;
+	       V T4j, T6F, T6G, T6H, T6I, T6J, T6K, T6L, T6M, T3n, T3b, T3r, T3l, T3o, T3e;
+	       V T4q, T4o, T4k, T4g, T3h, T3p;
+	       {
+		    V T2T, T3T, T4r, T7, T3t, T1z, T18, T4Z, Te, T50, T1f, T4s, T1G, T3U, T2W;
+		    V T3u, Tm, T1n, T3X, T3y, T2Z, T1O, T53, T4w, Tt, T1u, T3W, T3B, T2Y, T1V;
+		    V T52, T4z, T3O, T2t, T3L, T2K, TZ, T5F, T4R, T5k, T5j, T4W, T5I, T5X, T2E;
+		    V T3M, T2N, T3P, T3H, T22, T3E, T2j, T4G, T5h, TK, T5A, T5D, T5W, T2d, T3F;
+		    V T4L, T5g, T3I, T2m;
+		    {
+			 V T1L, T1j, T1k, T1l, T4v, T1K, T3w;
+			 {
+			      V T1, T2, T12, T13, T4, T5, T15, T16;
+			      T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			      T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
+			      T12 = LD(&(ii[0]), ivs, &(ii[0]));
+			      T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
+			      T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+			      T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
+			      T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+			      T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
+			      {
+				   V Tb, T1A, Ta, T1B, T1b, Tc, T1c, T1d;
+				   {
+					V T8, T1x, T3, T2R, T14, T2S, T6, T1y, T17, T9, T19, T1a;
+					T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+					T1x = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T2R = VSUB(T12, T13);
+					T14 = VADD(T12, T13);
+					T2S = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T1y = VSUB(T15, T16);
+					T17 = VADD(T15, T16);
+					T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
+					T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+					T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
+					Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
+					T2T = VSUB(T2R, T2S);
+					T3T = VADD(T2S, T2R);
+					T4r = VSUB(T3, T6);
+					T7 = VADD(T3, T6);
+					T3t = VSUB(T1x, T1y);
+					T1z = VADD(T1x, T1y);
+					T18 = VADD(T14, T17);
+					T4Z = VSUB(T14, T17);
+					T1A = VSUB(T8, T9);
+					Ta = VADD(T8, T9);
+					T1B = VSUB(T19, T1a);
+					T1b = VADD(T19, T1a);
+					Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+					T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
+					T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+				   }
+				   {
+					V Ti, T1I, T1J, Tl;
+					{
+					     V T1h, T1C, T2U, T1D, Td, T1E, T1e, T1i, Tg, Th;
+					     Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+					     Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
+					     T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+					     T1C = VADD(T1A, T1B);
+					     T2U = VSUB(T1B, T1A);
+					     T1D = VSUB(Tb, Tc);
+					     Td = VADD(Tb, Tc);
+					     T1E = VSUB(T1c, T1d);
+					     T1e = VADD(T1c, T1d);
+					     T1L = VSUB(Tg, Th);
+					     Ti = VADD(Tg, Th);
+					     T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
+					     {
+						  V T2V, T1F, Tj, Tk;
+						  Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+						  Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
+						  Te = VADD(Ta, Td);
+						  T50 = VSUB(Td, Ta);
+						  T2V = VADD(T1D, T1E);
+						  T1F = VSUB(T1D, T1E);
+						  T1f = VADD(T1b, T1e);
+						  T4s = VSUB(T1b, T1e);
+						  T1j = VADD(T1h, T1i);
+						  T1I = VSUB(T1h, T1i);
+						  T1J = VSUB(Tj, Tk);
+						  Tl = VADD(Tj, Tk);
+						  T1G = VADD(T1C, T1F);
+						  T3U = VSUB(T1F, T1C);
+						  T2W = VADD(T2U, T2V);
+						  T3u = VSUB(T2U, T2V);
+						  T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+						  T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
+					     }
+					}
+					T4v = VSUB(Ti, Tl);
+					Tm = VADD(Ti, Tl);
+					T1K = VSUB(T1I, T1J);
+					T3w = VADD(T1J, T1I);
+				   }
+			      }
+			 }
+			 {
+			      V T1r, T1S, T1q, T1s, T4x, T1R, T3z;
+			      {
+				   V Tp, T1P, T1Q, Ts;
+				   {
+					V Tn, To, T1o, T1M, T1m, T1p;
+					Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
+					To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+					T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
+					T1M = VSUB(T1k, T1l);
+					T1m = VADD(T1k, T1l);
+					T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+					{
+					     V Tq, Tr, T3x, T1N, T4u;
+					     Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+					     Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
+					     T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+					     T1S = VSUB(Tn, To);
+					     Tp = VADD(Tn, To);
+					     T3x = VSUB(T1L, T1M);
+					     T1N = VADD(T1L, T1M);
+					     T4u = VSUB(T1j, T1m);
+					     T1n = VADD(T1j, T1m);
+					     T1P = VSUB(T1o, T1p);
+					     T1q = VADD(T1o, T1p);
+					     T1Q = VSUB(Tq, Tr);
+					     Ts = VADD(Tq, Tr);
+					     T3X = VFNMS(LDK(KP414213562), T3w, T3x);
+					     T3y = VFMA(LDK(KP414213562), T3x, T3w);
+					     T2Z = VFMA(LDK(KP414213562), T1K, T1N);
+					     T1O = VFNMS(LDK(KP414213562), T1N, T1K);
+					     T53 = VADD(T4v, T4u);
+					     T4w = VSUB(T4u, T4v);
+					     T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
+					}
+				   }
+				   T4x = VSUB(Tp, Ts);
+				   Tt = VADD(Tp, Ts);
+				   T1R = VSUB(T1P, T1Q);
+				   T3z = VADD(T1Q, T1P);
+			      }
+			      {
+				   V T4S, T5G, T2y, T2L, T4V, T5H, T2D, T2M;
+				   {
+					V T2G, TN, T4N, T2r, T2s, TQ, T2A, T4O, T2J, T2x, TU, T4T, T2w, T2z, TX;
+					V T2B, T2H, T2I, TR;
+					{
+					     V TL, TM, T2p, T1T, T1t, T2q;
+					     TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
+					     TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+					     T2p = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
+					     T1T = VSUB(T1r, T1s);
+					     T1t = VADD(T1r, T1s);
+					     T2q = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+					     {
+						  V TO, TP, T3A, T1U, T4y;
+						  TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+						  TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
+						  T2H = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+						  T2G = VSUB(TL, TM);
+						  TN = VADD(TL, TM);
+						  T3A = VSUB(T1S, T1T);
+						  T1U = VADD(T1S, T1T);
+						  T4y = VSUB(T1q, T1t);
+						  T1u = VADD(T1q, T1t);
+						  T4N = VADD(T2p, T2q);
+						  T2r = VSUB(T2p, T2q);
+						  T2s = VSUB(TO, TP);
+						  TQ = VADD(TO, TP);
+						  T3W = VFMA(LDK(KP414213562), T3z, T3A);
+						  T3B = VFNMS(LDK(KP414213562), T3A, T3z);
+						  T2Y = VFNMS(LDK(KP414213562), T1R, T1U);
+						  T1V = VFMA(LDK(KP414213562), T1U, T1R);
+						  T52 = VSUB(T4x, T4y);
+						  T4z = VADD(T4x, T4y);
+						  T2I = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
+					     }
+					}
+					{
+					     V TS, TT, T2u, T2v, TV, TW;
+					     TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+					     TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
+					     T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+					     T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
+					     TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
+					     TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+					     T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
+					     T4O = VADD(T2H, T2I);
+					     T2J = VSUB(T2H, T2I);
+					     T2x = VSUB(TS, TT);
+					     TU = VADD(TS, TT);
+					     T4T = VADD(T2u, T2v);
+					     T2w = VSUB(T2u, T2v);
+					     T2z = VSUB(TV, TW);
+					     TX = VADD(TV, TW);
+					     T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+					}
+					T3O = VADD(T2s, T2r);
+					T2t = VSUB(T2r, T2s);
+					T3L = VSUB(T2G, T2J);
+					T2K = VADD(T2G, T2J);
+					T4S = VSUB(TN, TQ);
+					TR = VADD(TN, TQ);
+					{
+					     V T4P, T4Q, TY, T4U, T2C;
+					     T5G = VADD(T4N, T4O);
+					     T4P = VSUB(T4N, T4O);
+					     T4Q = VSUB(TX, TU);
+					     TY = VADD(TU, TX);
+					     T4U = VADD(T2A, T2B);
+					     T2C = VSUB(T2A, T2B);
+					     T2y = VSUB(T2w, T2x);
+					     T2L = VADD(T2x, T2w);
+					     TZ = VADD(TR, TY);
+					     T5F = VSUB(TR, TY);
+					     T4V = VSUB(T4T, T4U);
+					     T5H = VADD(T4T, T4U);
+					     T2D = VADD(T2z, T2C);
+					     T2M = VSUB(T2z, T2C);
+					     T4R = VSUB(T4P, T4Q);
+					     T5k = VADD(T4Q, T4P);
+					}
+				   }
+				   {
+					V T2f, Ty, T23, T4C, T20, T21, TB, T4D, T2i, T26, TF, T24, TG, TH, T29;
+					V T2a;
+					{
+					     V T1Y, T1Z, Tz, TA, T2g, T2h, Tw, Tx, TD, TE;
+					     Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+					     Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
+					     T5j = VADD(T4S, T4V);
+					     T4W = VSUB(T4S, T4V);
+					     T5I = VSUB(T5G, T5H);
+					     T5X = VADD(T5G, T5H);
+					     T2E = VADD(T2y, T2D);
+					     T3M = VSUB(T2D, T2y);
+					     T2N = VADD(T2L, T2M);
+					     T3P = VSUB(T2L, T2M);
+					     T2f = VSUB(Tw, Tx);
+					     Ty = VADD(Tw, Tx);
+					     T1Y = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+					     T1Z = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
+					     Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+					     TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
+					     T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+					     T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
+					     TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+					     TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
+					     T23 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+					     T4C = VADD(T1Y, T1Z);
+					     T20 = VSUB(T1Y, T1Z);
+					     T21 = VSUB(Tz, TA);
+					     TB = VADD(Tz, TA);
+					     T4D = VADD(T2g, T2h);
+					     T2i = VSUB(T2g, T2h);
+					     T26 = VSUB(TD, TE);
+					     TF = VADD(TD, TE);
+					     T24 = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
+					     TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
+					     TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+					     T29 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
+					     T2a = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+					}
+					{
+					     V T4I, T25, T28, TI, T4J, T2b, T4H, TC, T5B, T4E;
+					     T3H = VADD(T21, T20);
+					     T22 = VSUB(T20, T21);
+					     T3E = VSUB(T2f, T2i);
+					     T2j = VADD(T2f, T2i);
+					     T4I = VADD(T23, T24);
+					     T25 = VSUB(T23, T24);
+					     T28 = VSUB(TG, TH);
+					     TI = VADD(TG, TH);
+					     T4J = VADD(T29, T2a);
+					     T2b = VSUB(T29, T2a);
+					     T4H = VSUB(Ty, TB);
+					     TC = VADD(Ty, TB);
+					     T5B = VADD(T4C, T4D);
+					     T4E = VSUB(T4C, T4D);
+					     {
+						  V T27, T2k, TJ, T4F, T4K, T5C, T2c, T2l;
+						  T27 = VSUB(T25, T26);
+						  T2k = VADD(T26, T25);
+						  TJ = VADD(TF, TI);
+						  T4F = VSUB(TI, TF);
+						  T4K = VSUB(T4I, T4J);
+						  T5C = VADD(T4I, T4J);
+						  T2c = VADD(T28, T2b);
+						  T2l = VSUB(T28, T2b);
+						  T4G = VSUB(T4E, T4F);
+						  T5h = VADD(T4F, T4E);
+						  TK = VADD(TC, TJ);
+						  T5A = VSUB(TC, TJ);
+						  T5D = VSUB(T5B, T5C);
+						  T5W = VADD(T5B, T5C);
+						  T2d = VADD(T27, T2c);
+						  T3F = VSUB(T2c, T27);
+						  T4L = VSUB(T4H, T4K);
+						  T5g = VADD(T4H, T4K);
+						  T3I = VSUB(T2k, T2l);
+						  T2m = VADD(T2k, T2l);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T1v, T1g, T5V, Tv, T60, T5Y, T11, T10;
+			 {
+			      V T5o, T5n, T5i, T5r, T5f, T5l, T5w, T5u;
+			      {
+				   V T5d, T4t, T4A, T4X, T58, T51, T4M, T59, T54, T5e, T5b, T4B;
+				   T5d = VADD(T4r, T4s);
+				   T4t = VSUB(T4r, T4s);
+				   T4A = VSUB(T4w, T4z);
+				   T5o = VADD(T4w, T4z);
+				   T4X = VFNMS(LDK(KP414213562), T4W, T4R);
+				   T58 = VFMA(LDK(KP414213562), T4R, T4W);
+				   T5n = VADD(T50, T4Z);
+				   T51 = VSUB(T4Z, T50);
+				   T4M = VFMA(LDK(KP414213562), T4L, T4G);
+				   T59 = VFNMS(LDK(KP414213562), T4G, T4L);
+				   T54 = VSUB(T52, T53);
+				   T5e = VADD(T53, T52);
+				   T5b = VFNMS(LDK(KP707106781), T4A, T4t);
+				   T4B = VFMA(LDK(KP707106781), T4A, T4t);
+				   {
+					V T5s, T56, T4Y, T5c, T5a, T57, T55, T5t;
+					T5i = VFMA(LDK(KP414213562), T5h, T5g);
+					T5s = VFNMS(LDK(KP414213562), T5g, T5h);
+					T56 = VADD(T4M, T4X);
+					T4Y = VSUB(T4M, T4X);
+					T5c = VADD(T59, T58);
+					T5a = VSUB(T58, T59);
+					T57 = VFMA(LDK(KP707106781), T54, T51);
+					T55 = VFNMS(LDK(KP707106781), T54, T51);
+					T5r = VFNMS(LDK(KP707106781), T5e, T5d);
+					T5f = VFMA(LDK(KP707106781), T5e, T5d);
+					T5t = VFMA(LDK(KP414213562), T5j, T5k);
+					T5l = VFNMS(LDK(KP414213562), T5k, T5j);
+					T61 = VFMA(LDK(KP923879532), T4Y, T4B);
+					STM4(&(ro[6]), T61, ovs, &(ro[0]));
+					T62 = VFNMS(LDK(KP923879532), T4Y, T4B);
+					STM4(&(ro[22]), T62, ovs, &(ro[0]));
+					T63 = VFMA(LDK(KP923879532), T5c, T5b);
+					STM4(&(ro[30]), T63, ovs, &(ro[0]));
+					T64 = VFNMS(LDK(KP923879532), T5c, T5b);
+					STM4(&(ro[14]), T64, ovs, &(ro[0]));
+					T65 = VFMA(LDK(KP923879532), T5a, T57);
+					STM4(&(io[6]), T65, ovs, &(io[0]));
+					T66 = VFNMS(LDK(KP923879532), T5a, T57);
+					STM4(&(io[22]), T66, ovs, &(io[0]));
+					T67 = VFMA(LDK(KP923879532), T56, T55);
+					STM4(&(io[30]), T67, ovs, &(io[0]));
+					T68 = VFNMS(LDK(KP923879532), T56, T55);
+					STM4(&(io[14]), T68, ovs, &(io[0]));
+					T5w = VADD(T5s, T5t);
+					T5u = VSUB(T5s, T5t);
+				   }
+			      }
+			      {
+				   V Tf, T5P, T5z, T5S, T5U, T5O, T5K, T5L, T5M, Tu, T5T, T5N;
+				   {
+					V T5E, T5Q, T5q, T5m, T5v, T5p, T5R, T5J, T5x, T5y;
+					Tf = VADD(T7, Te);
+					T5x = VSUB(T7, Te);
+					T5y = VSUB(T1n, T1u);
+					T1v = VADD(T1n, T1u);
+					T69 = VFMA(LDK(KP923879532), T5u, T5r);
+					STM4(&(ro[10]), T69, ovs, &(ro[0]));
+					T6a = VFNMS(LDK(KP923879532), T5u, T5r);
+					STM4(&(ro[26]), T6a, ovs, &(ro[0]));
+					T5E = VADD(T5A, T5D);
+					T5Q = VSUB(T5D, T5A);
+					T5q = VSUB(T5l, T5i);
+					T5m = VADD(T5i, T5l);
+					T5v = VFMA(LDK(KP707106781), T5o, T5n);
+					T5p = VFNMS(LDK(KP707106781), T5o, T5n);
+					T5P = VSUB(T5x, T5y);
+					T5z = VADD(T5x, T5y);
+					T5R = VADD(T5F, T5I);
+					T5J = VSUB(T5F, T5I);
+					T6b = VFMA(LDK(KP923879532), T5m, T5f);
+					STM4(&(ro[2]), T6b, ovs, &(ro[0]));
+					T6c = VFNMS(LDK(KP923879532), T5m, T5f);
+					STM4(&(ro[18]), T6c, ovs, &(ro[0]));
+					T6d = VFMA(LDK(KP923879532), T5w, T5v);
+					STM4(&(io[2]), T6d, ovs, &(io[0]));
+					T6e = VFNMS(LDK(KP923879532), T5w, T5v);
+					STM4(&(io[18]), T6e, ovs, &(io[0]));
+					T6f = VFMA(LDK(KP923879532), T5q, T5p);
+					STM4(&(io[10]), T6f, ovs, &(io[0]));
+					T6g = VFNMS(LDK(KP923879532), T5q, T5p);
+					STM4(&(io[26]), T6g, ovs, &(io[0]));
+					T5S = VSUB(T5Q, T5R);
+					T5U = VADD(T5Q, T5R);
+					T5O = VSUB(T5J, T5E);
+					T5K = VADD(T5E, T5J);
+					T1g = VADD(T18, T1f);
+					T5L = VSUB(T18, T1f);
+					T5M = VSUB(Tt, Tm);
+					Tu = VADD(Tm, Tt);
+				   }
+				   T6h = VFMA(LDK(KP707106781), T5S, T5P);
+				   STM4(&(ro[12]), T6h, ovs, &(ro[0]));
+				   T6i = VFNMS(LDK(KP707106781), T5S, T5P);
+				   STM4(&(ro[28]), T6i, ovs, &(ro[0]));
+				   T6j = VFMA(LDK(KP707106781), T5K, T5z);
+				   STM4(&(ro[4]), T6j, ovs, &(ro[0]));
+				   T6k = VFNMS(LDK(KP707106781), T5K, T5z);
+				   STM4(&(ro[20]), T6k, ovs, &(ro[0]));
+				   T5T = VADD(T5M, T5L);
+				   T5N = VSUB(T5L, T5M);
+				   T5V = VSUB(Tf, Tu);
+				   Tv = VADD(Tf, Tu);
+				   T6l = VFMA(LDK(KP707106781), T5U, T5T);
+				   STM4(&(io[4]), T6l, ovs, &(io[0]));
+				   T6m = VFNMS(LDK(KP707106781), T5U, T5T);
+				   STM4(&(io[20]), T6m, ovs, &(io[0]));
+				   T6n = VFMA(LDK(KP707106781), T5O, T5N);
+				   STM4(&(io[12]), T6n, ovs, &(io[0]));
+				   T6o = VFNMS(LDK(KP707106781), T5O, T5N);
+				   STM4(&(io[28]), T6o, ovs, &(io[0]));
+				   T60 = VADD(T5W, T5X);
+				   T5Y = VSUB(T5W, T5X);
+				   T11 = VSUB(TZ, TK);
+				   T10 = VADD(TK, TZ);
+			      }
+			 }
+			 {
+			      V T39, T3k, T3j, T3a, T1X, T37, T33, T31, T3d, T3c, T47, T4i, T4h, T48, T4b;
+			      V T4a, T4e, T3N, T41, T3D, T45, T3Z, T38, T36, T32, T2Q, T42, T3K, T3Q, T4d;
+			      {
+				   V T2e, T2n, T2F, T2O, T1w, T5Z;
+				   {
+					V T1H, T1W, T2X, T30;
+					T39 = VFMA(LDK(KP707106781), T1G, T1z);
+					T1H = VFNMS(LDK(KP707106781), T1G, T1z);
+					T1W = VSUB(T1O, T1V);
+					T3k = VADD(T1O, T1V);
+					T3j = VFMA(LDK(KP707106781), T2W, T2T);
+					T2X = VFNMS(LDK(KP707106781), T2W, T2T);
+					T30 = VSUB(T2Y, T2Z);
+					T3a = VADD(T2Z, T2Y);
+					T6p = VSUB(T5V, T5Y);
+					STM4(&(ro[24]), T6p, ovs, &(ro[0]));
+					T6q = VADD(T5V, T5Y);
+					STM4(&(ro[8]), T6q, ovs, &(ro[0]));
+					T6r = VADD(Tv, T10);
+					STM4(&(ro[0]), T6r, ovs, &(ro[0]));
+					T6s = VSUB(Tv, T10);
+					STM4(&(ro[16]), T6s, ovs, &(ro[0]));
+					T1w = VSUB(T1g, T1v);
+					T5Z = VADD(T1g, T1v);
+					T1X = VFMA(LDK(KP923879532), T1W, T1H);
+					T37 = VFNMS(LDK(KP923879532), T1W, T1H);
+					T33 = VFMA(LDK(KP923879532), T30, T2X);
+					T31 = VFNMS(LDK(KP923879532), T30, T2X);
+				   }
+				   T3d = VFMA(LDK(KP707106781), T2d, T22);
+				   T2e = VFNMS(LDK(KP707106781), T2d, T22);
+				   T2n = VFNMS(LDK(KP707106781), T2m, T2j);
+				   T3c = VFMA(LDK(KP707106781), T2m, T2j);
+				   T6t = VADD(T5Z, T60);
+				   STM4(&(io[0]), T6t, ovs, &(io[0]));
+				   T6u = VSUB(T5Z, T60);
+				   STM4(&(io[16]), T6u, ovs, &(io[0]));
+				   T6v = VSUB(T1w, T11);
+				   STM4(&(io[24]), T6v, ovs, &(io[0]));
+				   T6w = VADD(T11, T1w);
+				   STM4(&(io[8]), T6w, ovs, &(io[0]));
+				   T3g = VFMA(LDK(KP707106781), T2E, T2t);
+				   T2F = VFNMS(LDK(KP707106781), T2E, T2t);
+				   T2O = VFNMS(LDK(KP707106781), T2N, T2K);
+				   T3f = VFMA(LDK(KP707106781), T2N, T2K);
+				   {
+					V T3v, T35, T2o, T3C, T3V, T3Y;
+					T47 = VFNMS(LDK(KP707106781), T3u, T3t);
+					T3v = VFMA(LDK(KP707106781), T3u, T3t);
+					T35 = VFNMS(LDK(KP668178637), T2e, T2n);
+					T2o = VFMA(LDK(KP668178637), T2n, T2e);
+					T3C = VSUB(T3y, T3B);
+					T4i = VADD(T3y, T3B);
+					T4h = VFNMS(LDK(KP707106781), T3U, T3T);
+					T3V = VFMA(LDK(KP707106781), T3U, T3T);
+					T3Y = VSUB(T3W, T3X);
+					T48 = VADD(T3X, T3W);
+					{
+					     V T3G, T34, T2P, T3J;
+					     T4b = VFMA(LDK(KP707106781), T3F, T3E);
+					     T3G = VFNMS(LDK(KP707106781), T3F, T3E);
+					     T34 = VFMA(LDK(KP668178637), T2F, T2O);
+					     T2P = VFNMS(LDK(KP668178637), T2O, T2F);
+					     T3J = VFNMS(LDK(KP707106781), T3I, T3H);
+					     T4a = VFMA(LDK(KP707106781), T3I, T3H);
+					     T4e = VFMA(LDK(KP707106781), T3M, T3L);
+					     T3N = VFNMS(LDK(KP707106781), T3M, T3L);
+					     T41 = VFNMS(LDK(KP923879532), T3C, T3v);
+					     T3D = VFMA(LDK(KP923879532), T3C, T3v);
+					     T45 = VFMA(LDK(KP923879532), T3Y, T3V);
+					     T3Z = VFNMS(LDK(KP923879532), T3Y, T3V);
+					     T38 = VADD(T35, T34);
+					     T36 = VSUB(T34, T35);
+					     T32 = VADD(T2o, T2P);
+					     T2Q = VSUB(T2o, T2P);
+					     T42 = VFNMS(LDK(KP668178637), T3G, T3J);
+					     T3K = VFMA(LDK(KP668178637), T3J, T3G);
+					     T3Q = VFNMS(LDK(KP707106781), T3P, T3O);
+					     T4d = VFMA(LDK(KP707106781), T3P, T3O);
+					}
+				   }
+			      }
+			      {
+				   V T4n, T4c, T43, T3R, T4m, T4f;
+				   T6x = VFMA(LDK(KP831469612), T38, T37);
+				   STM4(&(ro[29]), T6x, ovs, &(ro[1]));
+				   T6y = VFNMS(LDK(KP831469612), T38, T37);
+				   STM4(&(ro[13]), T6y, ovs, &(ro[1]));
+				   T6z = VFMA(LDK(KP831469612), T36, T33);
+				   STM4(&(io[5]), T6z, ovs, &(io[1]));
+				   T6A = VFNMS(LDK(KP831469612), T36, T33);
+				   STM4(&(io[21]), T6A, ovs, &(io[1]));
+				   T6B = VFMA(LDK(KP831469612), T32, T31);
+				   STM4(&(io[29]), T6B, ovs, &(io[1]));
+				   T6C = VFNMS(LDK(KP831469612), T32, T31);
+				   STM4(&(io[13]), T6C, ovs, &(io[1]));
+				   T6D = VFMA(LDK(KP831469612), T2Q, T1X);
+				   STM4(&(ro[5]), T6D, ovs, &(ro[1]));
+				   T6E = VFNMS(LDK(KP831469612), T2Q, T1X);
+				   STM4(&(ro[21]), T6E, ovs, &(ro[1]));
+				   T43 = VFMA(LDK(KP668178637), T3N, T3Q);
+				   T3R = VFNMS(LDK(KP668178637), T3Q, T3N);
+				   {
+					V T44, T46, T40, T3S;
+					T44 = VSUB(T42, T43);
+					T46 = VADD(T42, T43);
+					T40 = VSUB(T3R, T3K);
+					T3S = VADD(T3K, T3R);
+					T4p = VFMA(LDK(KP923879532), T48, T47);
+					T49 = VFNMS(LDK(KP923879532), T48, T47);
+					T4l = VFNMS(LDK(KP923879532), T4i, T4h);
+					T4j = VFMA(LDK(KP923879532), T4i, T4h);
+					T4n = VFNMS(LDK(KP198912367), T4a, T4b);
+					T4c = VFMA(LDK(KP198912367), T4b, T4a);
+					T6F = VFMA(LDK(KP831469612), T44, T41);
+					STM4(&(ro[11]), T6F, ovs, &(ro[1]));
+					T6G = VFNMS(LDK(KP831469612), T44, T41);
+					STM4(&(ro[27]), T6G, ovs, &(ro[1]));
+					T6H = VFMA(LDK(KP831469612), T46, T45);
+					STM4(&(io[3]), T6H, ovs, &(io[1]));
+					T6I = VFNMS(LDK(KP831469612), T46, T45);
+					STM4(&(io[19]), T6I, ovs, &(io[1]));
+					T6J = VFMA(LDK(KP831469612), T40, T3Z);
+					STM4(&(io[11]), T6J, ovs, &(io[1]));
+					T6K = VFNMS(LDK(KP831469612), T40, T3Z);
+					STM4(&(io[27]), T6K, ovs, &(io[1]));
+					T6L = VFMA(LDK(KP831469612), T3S, T3D);
+					STM4(&(ro[3]), T6L, ovs, &(ro[1]));
+					T6M = VFNMS(LDK(KP831469612), T3S, T3D);
+					STM4(&(ro[19]), T6M, ovs, &(ro[1]));
+				   }
+				   T4m = VFMA(LDK(KP198912367), T4d, T4e);
+				   T4f = VFNMS(LDK(KP198912367), T4e, T4d);
+				   T3n = VFNMS(LDK(KP923879532), T3a, T39);
+				   T3b = VFMA(LDK(KP923879532), T3a, T39);
+				   T3r = VFMA(LDK(KP923879532), T3k, T3j);
+				   T3l = VFNMS(LDK(KP923879532), T3k, T3j);
+				   T3o = VFNMS(LDK(KP198912367), T3c, T3d);
+				   T3e = VFMA(LDK(KP198912367), T3d, T3c);
+				   T4q = VADD(T4n, T4m);
+				   T4o = VSUB(T4m, T4n);
+				   T4k = VADD(T4c, T4f);
+				   T4g = VSUB(T4c, T4f);
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T6N, T6O, T6P, T6Q;
+		    T6N = VFMA(LDK(KP980785280), T4q, T4p);
+		    STM4(&(ro[31]), T6N, ovs, &(ro[1]));
+		    STN4(&(ro[28]), T6i, T6x, T63, T6N, ovs);
+		    T6O = VFNMS(LDK(KP980785280), T4q, T4p);
+		    STM4(&(ro[15]), T6O, ovs, &(ro[1]));
+		    STN4(&(ro[12]), T6h, T6y, T64, T6O, ovs);
+		    T6P = VFMA(LDK(KP980785280), T4o, T4l);
+		    STM4(&(io[7]), T6P, ovs, &(io[1]));
+		    STN4(&(io[4]), T6l, T6z, T65, T6P, ovs);
+		    T6Q = VFNMS(LDK(KP980785280), T4o, T4l);
+		    STM4(&(io[23]), T6Q, ovs, &(io[1]));
+		    STN4(&(io[20]), T6m, T6A, T66, T6Q, ovs);
+		    {
+			 V T6R, T6S, T6T, T6U;
+			 T6R = VFMA(LDK(KP980785280), T4k, T4j);
+			 STM4(&(io[31]), T6R, ovs, &(io[1]));
+			 STN4(&(io[28]), T6o, T6B, T67, T6R, ovs);
+			 T6S = VFNMS(LDK(KP980785280), T4k, T4j);
+			 STM4(&(io[15]), T6S, ovs, &(io[1]));
+			 STN4(&(io[12]), T6n, T6C, T68, T6S, ovs);
+			 T6T = VFMA(LDK(KP980785280), T4g, T49);
+			 STM4(&(ro[7]), T6T, ovs, &(ro[1]));
+			 STN4(&(ro[4]), T6j, T6D, T61, T6T, ovs);
+			 T6U = VFNMS(LDK(KP980785280), T4g, T49);
+			 STM4(&(ro[23]), T6U, ovs, &(ro[1]));
+			 STN4(&(ro[20]), T6k, T6E, T62, T6U, ovs);
+			 T3h = VFNMS(LDK(KP198912367), T3g, T3f);
+			 T3p = VFMA(LDK(KP198912367), T3f, T3g);
+		    }
+	       }
+	       {
+		    V T3s, T3q, T3i, T3m;
+		    T3s = VADD(T3o, T3p);
+		    T3q = VSUB(T3o, T3p);
+		    T3i = VADD(T3e, T3h);
+		    T3m = VSUB(T3h, T3e);
+		    {
+			 V T6V, T6W, T6X, T6Y;
+			 T6V = VFMA(LDK(KP980785280), T3q, T3n);
+			 STM4(&(ro[9]), T6V, ovs, &(ro[1]));
+			 STN4(&(ro[8]), T6q, T6V, T69, T6F, ovs);
+			 T6W = VFNMS(LDK(KP980785280), T3q, T3n);
+			 STM4(&(ro[25]), T6W, ovs, &(ro[1]));
+			 STN4(&(ro[24]), T6p, T6W, T6a, T6G, ovs);
+			 T6X = VFMA(LDK(KP980785280), T3s, T3r);
+			 STM4(&(io[1]), T6X, ovs, &(io[1]));
+			 STN4(&(io[0]), T6t, T6X, T6d, T6H, ovs);
+			 T6Y = VFNMS(LDK(KP980785280), T3s, T3r);
+			 STM4(&(io[17]), T6Y, ovs, &(io[1]));
+			 STN4(&(io[16]), T6u, T6Y, T6e, T6I, ovs);
+			 {
+			      V T6Z, T70, T71, T72;
+			      T6Z = VFMA(LDK(KP980785280), T3m, T3l);
+			      STM4(&(io[9]), T6Z, ovs, &(io[1]));
+			      STN4(&(io[8]), T6w, T6Z, T6f, T6J, ovs);
+			      T70 = VFNMS(LDK(KP980785280), T3m, T3l);
+			      STM4(&(io[25]), T70, ovs, &(io[1]));
+			      STN4(&(io[24]), T6v, T70, T6g, T6K, ovs);
+			      T71 = VFMA(LDK(KP980785280), T3i, T3b);
+			      STM4(&(ro[1]), T71, ovs, &(ro[1]));
+			      STN4(&(ro[0]), T6r, T71, T6b, T6L, ovs);
+			      T72 = VFNMS(LDK(KP980785280), T3i, T3b);
+			      STM4(&(ro[17]), T72, ovs, &(ro[1]));
+			      STN4(&(ro[16]), T6s, T72, T6c, T6M, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {236, 0, 136, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_32) (planner *p) {
+     X(kdft_register) (p, n2sv_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 372 FP additions, 84 FP multiplications,
+ * (or, 340 additions, 52 multiplications, 32 fused multiply/add),
+ * 130 stack variables, 7 constants, and 144 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
+	       V T7, T4r, T4Z, T18, T1z, T3t, T3T, T2T, Te, T1f, T50, T4s, T2W, T3u, T1G;
+	       V T3U, Tm, T1n, T1O, T2Z, T3y, T3X, T4w, T53, Tt, T1u, T1V, T2Y, T3B, T3W;
+	       V T4z, T52, T2t, T3L, T3O, T2K, TR, TY, T5F, T5G, T5H, T5I, T4R, T5j, T2E;
+	       V T3P, T4W, T5k, T2N, T3M, T22, T3E, T3H, T2j, TC, TJ, T5A, T5B, T5C, T5D;
+	       V T4G, T5g, T2d, T3F, T4L, T5h, T2m, T3I;
+	       {
+		    V T3, T1x, T14, T2S, T6, T2R, T17, T1y;
+		    {
+			 V T1, T2, T12, T13;
+			 T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			 T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
+			 T3 = VADD(T1, T2);
+			 T1x = VSUB(T1, T2);
+			 T12 = LD(&(ii[0]), ivs, &(ii[0]));
+			 T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
+			 T14 = VADD(T12, T13);
+			 T2S = VSUB(T12, T13);
+		    }
+		    {
+			 V T4, T5, T15, T16;
+			 T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+			 T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
+			 T6 = VADD(T4, T5);
+			 T2R = VSUB(T4, T5);
+			 T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+			 T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
+			 T17 = VADD(T15, T16);
+			 T1y = VSUB(T15, T16);
+		    }
+		    T7 = VADD(T3, T6);
+		    T4r = VSUB(T3, T6);
+		    T4Z = VSUB(T14, T17);
+		    T18 = VADD(T14, T17);
+		    T1z = VSUB(T1x, T1y);
+		    T3t = VADD(T1x, T1y);
+		    T3T = VSUB(T2S, T2R);
+		    T2T = VADD(T2R, T2S);
+	       }
+	       {
+		    V Ta, T1B, T1b, T1A, Td, T1D, T1e, T1E;
+		    {
+			 V T8, T9, T19, T1a;
+			 T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+			 T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
+			 Ta = VADD(T8, T9);
+			 T1B = VSUB(T8, T9);
+			 T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			 T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
+			 T1b = VADD(T19, T1a);
+			 T1A = VSUB(T19, T1a);
+		    }
+		    {
+			 V Tb, Tc, T1c, T1d;
+			 Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
+			 Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+			 Td = VADD(Tb, Tc);
+			 T1D = VSUB(Tb, Tc);
+			 T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
+			 T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+			 T1e = VADD(T1c, T1d);
+			 T1E = VSUB(T1c, T1d);
+		    }
+		    Te = VADD(Ta, Td);
+		    T1f = VADD(T1b, T1e);
+		    T50 = VSUB(Td, Ta);
+		    T4s = VSUB(T1b, T1e);
+		    {
+			 V T2U, T2V, T1C, T1F;
+			 T2U = VSUB(T1D, T1E);
+			 T2V = VADD(T1B, T1A);
+			 T2W = VMUL(LDK(KP707106781), VSUB(T2U, T2V));
+			 T3u = VMUL(LDK(KP707106781), VADD(T2V, T2U));
+			 T1C = VSUB(T1A, T1B);
+			 T1F = VADD(T1D, T1E);
+			 T1G = VMUL(LDK(KP707106781), VSUB(T1C, T1F));
+			 T3U = VMUL(LDK(KP707106781), VADD(T1C, T1F));
+		    }
+	       }
+	       {
+		    V Ti, T1L, T1j, T1J, Tl, T1I, T1m, T1M, T1K, T1N;
+		    {
+			 V Tg, Th, T1h, T1i;
+			 Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+			 Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
+			 Ti = VADD(Tg, Th);
+			 T1L = VSUB(Tg, Th);
+			 T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+			 T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
+			 T1j = VADD(T1h, T1i);
+			 T1J = VSUB(T1h, T1i);
+		    }
+		    {
+			 V Tj, Tk, T1k, T1l;
+			 Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+			 Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
+			 Tl = VADD(Tj, Tk);
+			 T1I = VSUB(Tj, Tk);
+			 T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+			 T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
+			 T1m = VADD(T1k, T1l);
+			 T1M = VSUB(T1k, T1l);
+		    }
+		    Tm = VADD(Ti, Tl);
+		    T1n = VADD(T1j, T1m);
+		    T1K = VADD(T1I, T1J);
+		    T1N = VSUB(T1L, T1M);
+		    T1O = VFNMS(LDK(KP923879532), T1N, VMUL(LDK(KP382683432), T1K));
+		    T2Z = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1N));
+		    {
+			 V T3w, T3x, T4u, T4v;
+			 T3w = VSUB(T1J, T1I);
+			 T3x = VADD(T1L, T1M);
+			 T3y = VFNMS(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3w));
+			 T3X = VFMA(LDK(KP382683432), T3w, VMUL(LDK(KP923879532), T3x));
+			 T4u = VSUB(T1j, T1m);
+			 T4v = VSUB(Ti, Tl);
+			 T4w = VSUB(T4u, T4v);
+			 T53 = VADD(T4v, T4u);
+		    }
+	       }
+	       {
+		    V Tp, T1S, T1q, T1Q, Ts, T1P, T1t, T1T, T1R, T1U;
+		    {
+			 V Tn, To, T1o, T1p;
+			 Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
+			 To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+			 Tp = VADD(Tn, To);
+			 T1S = VSUB(Tn, To);
+			 T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
+			 T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+			 T1q = VADD(T1o, T1p);
+			 T1Q = VSUB(T1o, T1p);
+		    }
+		    {
+			 V Tq, Tr, T1r, T1s;
+			 Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+			 Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
+			 Ts = VADD(Tq, Tr);
+			 T1P = VSUB(Tq, Tr);
+			 T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+			 T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
+			 T1t = VADD(T1r, T1s);
+			 T1T = VSUB(T1r, T1s);
+		    }
+		    Tt = VADD(Tp, Ts);
+		    T1u = VADD(T1q, T1t);
+		    T1R = VADD(T1P, T1Q);
+		    T1U = VSUB(T1S, T1T);
+		    T1V = VFMA(LDK(KP382683432), T1R, VMUL(LDK(KP923879532), T1U));
+		    T2Y = VFNMS(LDK(KP923879532), T1R, VMUL(LDK(KP382683432), T1U));
+		    {
+			 V T3z, T3A, T4x, T4y;
+			 T3z = VSUB(T1Q, T1P);
+			 T3A = VADD(T1S, T1T);
+			 T3B = VFMA(LDK(KP923879532), T3z, VMUL(LDK(KP382683432), T3A));
+			 T3W = VFNMS(LDK(KP382683432), T3z, VMUL(LDK(KP923879532), T3A));
+			 T4x = VSUB(Tp, Ts);
+			 T4y = VSUB(T1q, T1t);
+			 T4z = VADD(T4x, T4y);
+			 T52 = VSUB(T4x, T4y);
+		    }
+	       }
+	       {
+		    V TN, T2p, T2J, T4S, TQ, T2G, T2s, T4T, TU, T2x, T2w, T4O, TX, T2z, T2C;
+		    V T4P;
+		    {
+			 V TL, TM, T2H, T2I;
+			 TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
+			 TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+			 TN = VADD(TL, TM);
+			 T2p = VSUB(TL, TM);
+			 T2H = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
+			 T2I = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+			 T2J = VSUB(T2H, T2I);
+			 T4S = VADD(T2H, T2I);
+		    }
+		    {
+			 V TO, TP, T2q, T2r;
+			 TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+			 TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
+			 TQ = VADD(TO, TP);
+			 T2G = VSUB(TO, TP);
+			 T2q = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+			 T2r = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
+			 T2s = VSUB(T2q, T2r);
+			 T4T = VADD(T2q, T2r);
+		    }
+		    {
+			 V TS, TT, T2u, T2v;
+			 TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+			 TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
+			 TU = VADD(TS, TT);
+			 T2x = VSUB(TS, TT);
+			 T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+			 T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
+			 T2w = VSUB(T2u, T2v);
+			 T4O = VADD(T2u, T2v);
+		    }
+		    {
+			 V TV, TW, T2A, T2B;
+			 TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
+			 TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+			 TX = VADD(TV, TW);
+			 T2z = VSUB(TV, TW);
+			 T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
+			 T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+			 T2C = VSUB(T2A, T2B);
+			 T4P = VADD(T2A, T2B);
+		    }
+		    T2t = VSUB(T2p, T2s);
+		    T3L = VADD(T2p, T2s);
+		    T3O = VSUB(T2J, T2G);
+		    T2K = VADD(T2G, T2J);
+		    TR = VADD(TN, TQ);
+		    TY = VADD(TU, TX);
+		    T5F = VSUB(TR, TY);
+		    {
+			 V T4N, T4Q, T2y, T2D;
+			 T5G = VADD(T4S, T4T);
+			 T5H = VADD(T4O, T4P);
+			 T5I = VSUB(T5G, T5H);
+			 T4N = VSUB(TN, TQ);
+			 T4Q = VSUB(T4O, T4P);
+			 T4R = VSUB(T4N, T4Q);
+			 T5j = VADD(T4N, T4Q);
+			 T2y = VSUB(T2w, T2x);
+			 T2D = VADD(T2z, T2C);
+			 T2E = VMUL(LDK(KP707106781), VSUB(T2y, T2D));
+			 T3P = VMUL(LDK(KP707106781), VADD(T2y, T2D));
+			 {
+			      V T4U, T4V, T2L, T2M;
+			      T4U = VSUB(T4S, T4T);
+			      T4V = VSUB(TX, TU);
+			      T4W = VSUB(T4U, T4V);
+			      T5k = VADD(T4V, T4U);
+			      T2L = VSUB(T2z, T2C);
+			      T2M = VADD(T2x, T2w);
+			      T2N = VMUL(LDK(KP707106781), VSUB(T2L, T2M));
+			      T3M = VMUL(LDK(KP707106781), VADD(T2M, T2L));
+			 }
+		    }
+	       }
+	       {
+		    V Ty, T2f, T21, T4C, TB, T1Y, T2i, T4D, TF, T28, T2b, T4I, TI, T23, T26;
+		    V T4J;
+		    {
+			 V Tw, Tx, T1Z, T20;
+			 Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+			 Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
+			 Ty = VADD(Tw, Tx);
+			 T2f = VSUB(Tw, Tx);
+			 T1Z = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+			 T20 = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
+			 T21 = VSUB(T1Z, T20);
+			 T4C = VADD(T1Z, T20);
+		    }
+		    {
+			 V Tz, TA, T2g, T2h;
+			 Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+			 TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
+			 TB = VADD(Tz, TA);
+			 T1Y = VSUB(Tz, TA);
+			 T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+			 T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
+			 T2i = VSUB(T2g, T2h);
+			 T4D = VADD(T2g, T2h);
+		    }
+		    {
+			 V TD, TE, T29, T2a;
+			 TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+			 TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
+			 TF = VADD(TD, TE);
+			 T28 = VSUB(TD, TE);
+			 T29 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+			 T2a = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
+			 T2b = VSUB(T29, T2a);
+			 T4I = VADD(T29, T2a);
+		    }
+		    {
+			 V TG, TH, T24, T25;
+			 TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
+			 TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+			 TI = VADD(TG, TH);
+			 T23 = VSUB(TG, TH);
+			 T24 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
+			 T25 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+			 T26 = VSUB(T24, T25);
+			 T4J = VADD(T24, T25);
+		    }
+		    T22 = VADD(T1Y, T21);
+		    T3E = VADD(T2f, T2i);
+		    T3H = VSUB(T21, T1Y);
+		    T2j = VSUB(T2f, T2i);
+		    TC = VADD(Ty, TB);
+		    TJ = VADD(TF, TI);
+		    T5A = VSUB(TC, TJ);
+		    {
+			 V T4E, T4F, T27, T2c;
+			 T5B = VADD(T4C, T4D);
+			 T5C = VADD(T4I, T4J);
+			 T5D = VSUB(T5B, T5C);
+			 T4E = VSUB(T4C, T4D);
+			 T4F = VSUB(TI, TF);
+			 T4G = VSUB(T4E, T4F);
+			 T5g = VADD(T4F, T4E);
+			 T27 = VSUB(T23, T26);
+			 T2c = VADD(T28, T2b);
+			 T2d = VMUL(LDK(KP707106781), VSUB(T27, T2c));
+			 T3F = VMUL(LDK(KP707106781), VADD(T2c, T27));
+			 {
+			      V T4H, T4K, T2k, T2l;
+			      T4H = VSUB(Ty, TB);
+			      T4K = VSUB(T4I, T4J);
+			      T4L = VSUB(T4H, T4K);
+			      T5h = VADD(T4H, T4K);
+			      T2k = VSUB(T2b, T28);
+			      T2l = VADD(T23, T26);
+			      T2m = VMUL(LDK(KP707106781), VSUB(T2k, T2l));
+			      T3I = VMUL(LDK(KP707106781), VADD(T2k, T2l));
+			 }
+		    }
+	       }
+	       {
+		    V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
+		    V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
+		    V T6v, T6w;
+		    {
+			 V T4B, T57, T5a, T5c, T4Y, T56, T55, T5b;
+			 {
+			      V T4t, T4A, T58, T59;
+			      T4t = VSUB(T4r, T4s);
+			      T4A = VMUL(LDK(KP707106781), VSUB(T4w, T4z));
+			      T4B = VADD(T4t, T4A);
+			      T57 = VSUB(T4t, T4A);
+			      T58 = VFNMS(LDK(KP923879532), T4L, VMUL(LDK(KP382683432), T4G));
+			      T59 = VFMA(LDK(KP382683432), T4W, VMUL(LDK(KP923879532), T4R));
+			      T5a = VSUB(T58, T59);
+			      T5c = VADD(T58, T59);
+			 }
+			 {
+			      V T4M, T4X, T51, T54;
+			      T4M = VFMA(LDK(KP923879532), T4G, VMUL(LDK(KP382683432), T4L));
+			      T4X = VFNMS(LDK(KP923879532), T4W, VMUL(LDK(KP382683432), T4R));
+			      T4Y = VADD(T4M, T4X);
+			      T56 = VSUB(T4X, T4M);
+			      T51 = VSUB(T4Z, T50);
+			      T54 = VMUL(LDK(KP707106781), VSUB(T52, T53));
+			      T55 = VSUB(T51, T54);
+			      T5b = VADD(T51, T54);
+			 }
+			 T61 = VSUB(T4B, T4Y);
+			 STM4(&(ro[22]), T61, ovs, &(ro[0]));
+			 T62 = VSUB(T5b, T5c);
+			 STM4(&(io[22]), T62, ovs, &(io[0]));
+			 T63 = VADD(T4B, T4Y);
+			 STM4(&(ro[6]), T63, ovs, &(ro[0]));
+			 T64 = VADD(T5b, T5c);
+			 STM4(&(io[6]), T64, ovs, &(io[0]));
+			 T65 = VSUB(T55, T56);
+			 STM4(&(io[30]), T65, ovs, &(io[0]));
+			 T66 = VSUB(T57, T5a);
+			 STM4(&(ro[30]), T66, ovs, &(ro[0]));
+			 T67 = VADD(T55, T56);
+			 STM4(&(io[14]), T67, ovs, &(io[0]));
+			 T68 = VADD(T57, T5a);
+			 STM4(&(ro[14]), T68, ovs, &(ro[0]));
+		    }
+		    {
+			 V T5f, T5r, T5u, T5w, T5m, T5q, T5p, T5v;
+			 {
+			      V T5d, T5e, T5s, T5t;
+			      T5d = VADD(T4r, T4s);
+			      T5e = VMUL(LDK(KP707106781), VADD(T53, T52));
+			      T5f = VADD(T5d, T5e);
+			      T5r = VSUB(T5d, T5e);
+			      T5s = VFNMS(LDK(KP382683432), T5h, VMUL(LDK(KP923879532), T5g));
+			      T5t = VFMA(LDK(KP923879532), T5k, VMUL(LDK(KP382683432), T5j));
+			      T5u = VSUB(T5s, T5t);
+			      T5w = VADD(T5s, T5t);
+			 }
+			 {
+			      V T5i, T5l, T5n, T5o;
+			      T5i = VFMA(LDK(KP382683432), T5g, VMUL(LDK(KP923879532), T5h));
+			      T5l = VFNMS(LDK(KP382683432), T5k, VMUL(LDK(KP923879532), T5j));
+			      T5m = VADD(T5i, T5l);
+			      T5q = VSUB(T5l, T5i);
+			      T5n = VADD(T50, T4Z);
+			      T5o = VMUL(LDK(KP707106781), VADD(T4w, T4z));
+			      T5p = VSUB(T5n, T5o);
+			      T5v = VADD(T5n, T5o);
+			 }
+			 T69 = VSUB(T5f, T5m);
+			 STM4(&(ro[18]), T69, ovs, &(ro[0]));
+			 T6a = VSUB(T5v, T5w);
+			 STM4(&(io[18]), T6a, ovs, &(io[0]));
+			 T6b = VADD(T5f, T5m);
+			 STM4(&(ro[2]), T6b, ovs, &(ro[0]));
+			 T6c = VADD(T5v, T5w);
+			 STM4(&(io[2]), T6c, ovs, &(io[0]));
+			 T6d = VSUB(T5p, T5q);
+			 STM4(&(io[26]), T6d, ovs, &(io[0]));
+			 T6e = VSUB(T5r, T5u);
+			 STM4(&(ro[26]), T6e, ovs, &(ro[0]));
+			 T6f = VADD(T5p, T5q);
+			 STM4(&(io[10]), T6f, ovs, &(io[0]));
+			 T6g = VADD(T5r, T5u);
+			 STM4(&(ro[10]), T6g, ovs, &(ro[0]));
+		    }
+		    {
+			 V T5z, T5P, T5S, T5U, T5K, T5O, T5N, T5T;
+			 {
+			      V T5x, T5y, T5Q, T5R;
+			      T5x = VSUB(T7, Te);
+			      T5y = VSUB(T1n, T1u);
+			      T5z = VADD(T5x, T5y);
+			      T5P = VSUB(T5x, T5y);
+			      T5Q = VSUB(T5D, T5A);
+			      T5R = VADD(T5F, T5I);
+			      T5S = VMUL(LDK(KP707106781), VSUB(T5Q, T5R));
+			      T5U = VMUL(LDK(KP707106781), VADD(T5Q, T5R));
+			 }
+			 {
+			      V T5E, T5J, T5L, T5M;
+			      T5E = VADD(T5A, T5D);
+			      T5J = VSUB(T5F, T5I);
+			      T5K = VMUL(LDK(KP707106781), VADD(T5E, T5J));
+			      T5O = VMUL(LDK(KP707106781), VSUB(T5J, T5E));
+			      T5L = VSUB(T18, T1f);
+			      T5M = VSUB(Tt, Tm);
+			      T5N = VSUB(T5L, T5M);
+			      T5T = VADD(T5M, T5L);
+			 }
+			 T6h = VSUB(T5z, T5K);
+			 STM4(&(ro[20]), T6h, ovs, &(ro[0]));
+			 T6i = VSUB(T5T, T5U);
+			 STM4(&(io[20]), T6i, ovs, &(io[0]));
+			 T6j = VADD(T5z, T5K);
+			 STM4(&(ro[4]), T6j, ovs, &(ro[0]));
+			 T6k = VADD(T5T, T5U);
+			 STM4(&(io[4]), T6k, ovs, &(io[0]));
+			 T6l = VSUB(T5N, T5O);
+			 STM4(&(io[28]), T6l, ovs, &(io[0]));
+			 T6m = VSUB(T5P, T5S);
+			 STM4(&(ro[28]), T6m, ovs, &(ro[0]));
+			 T6n = VADD(T5N, T5O);
+			 STM4(&(io[12]), T6n, ovs, &(io[0]));
+			 T6o = VADD(T5P, T5S);
+			 STM4(&(ro[12]), T6o, ovs, &(ro[0]));
+		    }
+		    {
+			 V Tv, T5V, T5Y, T60, T10, T11, T1w, T5Z;
+			 {
+			      V Tf, Tu, T5W, T5X;
+			      Tf = VADD(T7, Te);
+			      Tu = VADD(Tm, Tt);
+			      Tv = VADD(Tf, Tu);
+			      T5V = VSUB(Tf, Tu);
+			      T5W = VADD(T5B, T5C);
+			      T5X = VADD(T5G, T5H);
+			      T5Y = VSUB(T5W, T5X);
+			      T60 = VADD(T5W, T5X);
+			 }
+			 {
+			      V TK, TZ, T1g, T1v;
+			      TK = VADD(TC, TJ);
+			      TZ = VADD(TR, TY);
+			      T10 = VADD(TK, TZ);
+			      T11 = VSUB(TZ, TK);
+			      T1g = VADD(T18, T1f);
+			      T1v = VADD(T1n, T1u);
+			      T1w = VSUB(T1g, T1v);
+			      T5Z = VADD(T1g, T1v);
+			 }
+			 T6p = VSUB(Tv, T10);
+			 STM4(&(ro[16]), T6p, ovs, &(ro[0]));
+			 T6q = VSUB(T5Z, T60);
+			 STM4(&(io[16]), T6q, ovs, &(io[0]));
+			 T6r = VADD(Tv, T10);
+			 STM4(&(ro[0]), T6r, ovs, &(ro[0]));
+			 T6s = VADD(T5Z, T60);
+			 STM4(&(io[0]), T6s, ovs, &(io[0]));
+			 T6t = VADD(T11, T1w);
+			 STM4(&(io[8]), T6t, ovs, &(io[0]));
+			 T6u = VADD(T5V, T5Y);
+			 STM4(&(ro[8]), T6u, ovs, &(ro[0]));
+			 T6v = VSUB(T1w, T11);
+			 STM4(&(io[24]), T6v, ovs, &(io[0]));
+			 T6w = VSUB(T5V, T5Y);
+			 STM4(&(ro[24]), T6w, ovs, &(ro[0]));
+		    }
+		    {
+			 V T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E;
+			 {
+			      V T1X, T33, T31, T37, T2o, T34, T2P, T35;
+			      {
+				   V T1H, T1W, T2X, T30;
+				   T1H = VSUB(T1z, T1G);
+				   T1W = VSUB(T1O, T1V);
+				   T1X = VADD(T1H, T1W);
+				   T33 = VSUB(T1H, T1W);
+				   T2X = VSUB(T2T, T2W);
+				   T30 = VSUB(T2Y, T2Z);
+				   T31 = VSUB(T2X, T30);
+				   T37 = VADD(T2X, T30);
+			      }
+			      {
+				   V T2e, T2n, T2F, T2O;
+				   T2e = VSUB(T22, T2d);
+				   T2n = VSUB(T2j, T2m);
+				   T2o = VFMA(LDK(KP980785280), T2e, VMUL(LDK(KP195090322), T2n));
+				   T34 = VFNMS(LDK(KP980785280), T2n, VMUL(LDK(KP195090322), T2e));
+				   T2F = VSUB(T2t, T2E);
+				   T2O = VSUB(T2K, T2N);
+				   T2P = VFNMS(LDK(KP980785280), T2O, VMUL(LDK(KP195090322), T2F));
+				   T35 = VFMA(LDK(KP195090322), T2O, VMUL(LDK(KP980785280), T2F));
+			      }
+			      {
+				   V T2Q, T38, T32, T36;
+				   T2Q = VADD(T2o, T2P);
+				   T6x = VSUB(T1X, T2Q);
+				   STM4(&(ro[23]), T6x, ovs, &(ro[1]));
+				   T6y = VADD(T1X, T2Q);
+				   STM4(&(ro[7]), T6y, ovs, &(ro[1]));
+				   T38 = VADD(T34, T35);
+				   T6z = VSUB(T37, T38);
+				   STM4(&(io[23]), T6z, ovs, &(io[1]));
+				   T6A = VADD(T37, T38);
+				   STM4(&(io[7]), T6A, ovs, &(io[1]));
+				   T32 = VSUB(T2P, T2o);
+				   T6B = VSUB(T31, T32);
+				   STM4(&(io[31]), T6B, ovs, &(io[1]));
+				   T6C = VADD(T31, T32);
+				   STM4(&(io[15]), T6C, ovs, &(io[1]));
+				   T36 = VSUB(T34, T35);
+				   T6D = VSUB(T33, T36);
+				   STM4(&(ro[31]), T6D, ovs, &(ro[1]));
+				   T6E = VADD(T33, T36);
+				   STM4(&(ro[15]), T6E, ovs, &(ro[1]));
+			      }
+			 }
+			 {
+			      V T3D, T41, T3Z, T45, T3K, T42, T3R, T43;
+			      {
+				   V T3v, T3C, T3V, T3Y;
+				   T3v = VSUB(T3t, T3u);
+				   T3C = VSUB(T3y, T3B);
+				   T3D = VADD(T3v, T3C);
+				   T41 = VSUB(T3v, T3C);
+				   T3V = VSUB(T3T, T3U);
+				   T3Y = VSUB(T3W, T3X);
+				   T3Z = VSUB(T3V, T3Y);
+				   T45 = VADD(T3V, T3Y);
+			      }
+			      {
+				   V T3G, T3J, T3N, T3Q;
+				   T3G = VSUB(T3E, T3F);
+				   T3J = VSUB(T3H, T3I);
+				   T3K = VFMA(LDK(KP555570233), T3G, VMUL(LDK(KP831469612), T3J));
+				   T42 = VFNMS(LDK(KP831469612), T3G, VMUL(LDK(KP555570233), T3J));
+				   T3N = VSUB(T3L, T3M);
+				   T3Q = VSUB(T3O, T3P);
+				   T3R = VFNMS(LDK(KP831469612), T3Q, VMUL(LDK(KP555570233), T3N));
+				   T43 = VFMA(LDK(KP831469612), T3N, VMUL(LDK(KP555570233), T3Q));
+			      }
+			      {
+				   V T3S, T6F, T6G, T46, T6H, T6I;
+				   T3S = VADD(T3K, T3R);
+				   T6F = VSUB(T3D, T3S);
+				   STM4(&(ro[21]), T6F, ovs, &(ro[1]));
+				   STN4(&(ro[20]), T6h, T6F, T61, T6x, ovs);
+				   T6G = VADD(T3D, T3S);
+				   STM4(&(ro[5]), T6G, ovs, &(ro[1]));
+				   STN4(&(ro[4]), T6j, T6G, T63, T6y, ovs);
+				   T46 = VADD(T42, T43);
+				   T6H = VSUB(T45, T46);
+				   STM4(&(io[21]), T6H, ovs, &(io[1]));
+				   STN4(&(io[20]), T6i, T6H, T62, T6z, ovs);
+				   T6I = VADD(T45, T46);
+				   STM4(&(io[5]), T6I, ovs, &(io[1]));
+				   STN4(&(io[4]), T6k, T6I, T64, T6A, ovs);
+			      }
+			      {
+				   V T40, T6J, T6K, T44, T6L, T6M;
+				   T40 = VSUB(T3R, T3K);
+				   T6J = VSUB(T3Z, T40);
+				   STM4(&(io[29]), T6J, ovs, &(io[1]));
+				   STN4(&(io[28]), T6l, T6J, T65, T6B, ovs);
+				   T6K = VADD(T3Z, T40);
+				   STM4(&(io[13]), T6K, ovs, &(io[1]));
+				   STN4(&(io[12]), T6n, T6K, T67, T6C, ovs);
+				   T44 = VSUB(T42, T43);
+				   T6L = VSUB(T41, T44);
+				   STM4(&(ro[29]), T6L, ovs, &(ro[1]));
+				   STN4(&(ro[28]), T6m, T6L, T66, T6D, ovs);
+				   T6M = VADD(T41, T44);
+				   STM4(&(ro[13]), T6M, ovs, &(ro[1]));
+				   STN4(&(ro[12]), T6o, T6M, T68, T6E, ovs);
+			      }
+			 }
+		    }
+		    {
+			 V T6N, T6O, T6P, T6Q, T6R, T6S, T6T, T6U;
+			 {
+			      V T49, T4l, T4j, T4p, T4c, T4m, T4f, T4n;
+			      {
+				   V T47, T48, T4h, T4i;
+				   T47 = VADD(T3t, T3u);
+				   T48 = VADD(T3X, T3W);
+				   T49 = VADD(T47, T48);
+				   T4l = VSUB(T47, T48);
+				   T4h = VADD(T3T, T3U);
+				   T4i = VADD(T3y, T3B);
+				   T4j = VSUB(T4h, T4i);
+				   T4p = VADD(T4h, T4i);
+			      }
+			      {
+				   V T4a, T4b, T4d, T4e;
+				   T4a = VADD(T3E, T3F);
+				   T4b = VADD(T3H, T3I);
+				   T4c = VFMA(LDK(KP980785280), T4a, VMUL(LDK(KP195090322), T4b));
+				   T4m = VFNMS(LDK(KP195090322), T4a, VMUL(LDK(KP980785280), T4b));
+				   T4d = VADD(T3L, T3M);
+				   T4e = VADD(T3O, T3P);
+				   T4f = VFNMS(LDK(KP195090322), T4e, VMUL(LDK(KP980785280), T4d));
+				   T4n = VFMA(LDK(KP195090322), T4d, VMUL(LDK(KP980785280), T4e));
+			      }
+			      {
+				   V T4g, T4q, T4k, T4o;
+				   T4g = VADD(T4c, T4f);
+				   T6N = VSUB(T49, T4g);
+				   STM4(&(ro[17]), T6N, ovs, &(ro[1]));
+				   T6O = VADD(T49, T4g);
+				   STM4(&(ro[1]), T6O, ovs, &(ro[1]));
+				   T4q = VADD(T4m, T4n);
+				   T6P = VSUB(T4p, T4q);
+				   STM4(&(io[17]), T6P, ovs, &(io[1]));
+				   T6Q = VADD(T4p, T4q);
+				   STM4(&(io[1]), T6Q, ovs, &(io[1]));
+				   T4k = VSUB(T4f, T4c);
+				   T6R = VSUB(T4j, T4k);
+				   STM4(&(io[25]), T6R, ovs, &(io[1]));
+				   T6S = VADD(T4j, T4k);
+				   STM4(&(io[9]), T6S, ovs, &(io[1]));
+				   T4o = VSUB(T4m, T4n);
+				   T6T = VSUB(T4l, T4o);
+				   STM4(&(ro[25]), T6T, ovs, &(ro[1]));
+				   T6U = VADD(T4l, T4o);
+				   STM4(&(ro[9]), T6U, ovs, &(ro[1]));
+			      }
+			 }
+			 {
+			      V T3b, T3n, T3l, T3r, T3e, T3o, T3h, T3p;
+			      {
+				   V T39, T3a, T3j, T3k;
+				   T39 = VADD(T1z, T1G);
+				   T3a = VADD(T2Z, T2Y);
+				   T3b = VADD(T39, T3a);
+				   T3n = VSUB(T39, T3a);
+				   T3j = VADD(T2T, T2W);
+				   T3k = VADD(T1O, T1V);
+				   T3l = VSUB(T3j, T3k);
+				   T3r = VADD(T3j, T3k);
+			      }
+			      {
+				   V T3c, T3d, T3f, T3g;
+				   T3c = VADD(T22, T2d);
+				   T3d = VADD(T2j, T2m);
+				   T3e = VFMA(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
+				   T3o = VFNMS(LDK(KP555570233), T3d, VMUL(LDK(KP831469612), T3c));
+				   T3f = VADD(T2t, T2E);
+				   T3g = VADD(T2K, T2N);
+				   T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
+				   T3p = VFMA(LDK(KP831469612), T3g, VMUL(LDK(KP555570233), T3f));
+			      }
+			      {
+				   V T3i, T6V, T6W, T3s, T6X, T6Y;
+				   T3i = VADD(T3e, T3h);
+				   T6V = VSUB(T3b, T3i);
+				   STM4(&(ro[19]), T6V, ovs, &(ro[1]));
+				   STN4(&(ro[16]), T6p, T6N, T69, T6V, ovs);
+				   T6W = VADD(T3b, T3i);
+				   STM4(&(ro[3]), T6W, ovs, &(ro[1]));
+				   STN4(&(ro[0]), T6r, T6O, T6b, T6W, ovs);
+				   T3s = VADD(T3o, T3p);
+				   T6X = VSUB(T3r, T3s);
+				   STM4(&(io[19]), T6X, ovs, &(io[1]));
+				   STN4(&(io[16]), T6q, T6P, T6a, T6X, ovs);
+				   T6Y = VADD(T3r, T3s);
+				   STM4(&(io[3]), T6Y, ovs, &(io[1]));
+				   STN4(&(io[0]), T6s, T6Q, T6c, T6Y, ovs);
+			      }
+			      {
+				   V T3m, T6Z, T70, T3q, T71, T72;
+				   T3m = VSUB(T3h, T3e);
+				   T6Z = VSUB(T3l, T3m);
+				   STM4(&(io[27]), T6Z, ovs, &(io[1]));
+				   STN4(&(io[24]), T6v, T6R, T6d, T6Z, ovs);
+				   T70 = VADD(T3l, T3m);
+				   STM4(&(io[11]), T70, ovs, &(io[1]));
+				   STN4(&(io[8]), T6t, T6S, T6f, T70, ovs);
+				   T3q = VSUB(T3o, T3p);
+				   T71 = VSUB(T3n, T3q);
+				   STM4(&(ro[27]), T71, ovs, &(ro[1]));
+				   STN4(&(ro[24]), T6w, T6T, T6e, T71, ovs);
+				   T72 = VADD(T3n, T3q);
+				   STM4(&(ro[11]), T72, ovs, &(ro[1]));
+				   STN4(&(ro[8]), T6u, T6U, T6g, T72, ovs);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {340, 52, 32, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_32) (planner *p) {
+     X(kdft_register) (p, n2sv_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n2sv_4 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 16 FP additions, 0 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 0 fused multiply/add),
+ * 25 stack variables, 0 constants, and 18 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T1, T2, T7, T8, T4, T5, Tc, Td;
+	       T1 = LD(&(ri[0]), ivs, &(ri[0]));
+	       T2 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+	       T7 = LD(&(ii[0]), ivs, &(ii[0]));
+	       T8 = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+	       T4 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+	       T5 = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+	       Tc = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+	       Td = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+	       {
+		    V T3, Tb, T9, Tf, T6, Ta, Te, Tg;
+		    T3 = VADD(T1, T2);
+		    Tb = VSUB(T1, T2);
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+		    T6 = VADD(T4, T5);
+		    Ta = VSUB(T4, T5);
+		    Te = VSUB(Tc, Td);
+		    Tg = VADD(Tc, Td);
+		    {
+			 V Th, Ti, Tj, Tk;
+			 Th = VADD(Ta, T9);
+			 STM4(&(io[3]), Th, ovs, &(io[1]));
+			 Ti = VSUB(T9, Ta);
+			 STM4(&(io[1]), Ti, ovs, &(io[1]));
+			 Tj = VADD(T3, T6);
+			 STM4(&(ro[0]), Tj, ovs, &(ro[0]));
+			 Tk = VSUB(T3, T6);
+			 STM4(&(ro[2]), Tk, ovs, &(ro[0]));
+			 {
+			      V Tl, Tm, Tn, To;
+			      Tl = VADD(Tf, Tg);
+			      STM4(&(io[0]), Tl, ovs, &(io[0]));
+			      Tm = VSUB(Tf, Tg);
+			      STM4(&(io[2]), Tm, ovs, &(io[0]));
+			      STN4(&(io[0]), Tl, Ti, Tm, Th, ovs);
+			      Tn = VSUB(Tb, Te);
+			      STM4(&(ro[3]), Tn, ovs, &(ro[1]));
+			      To = VADD(Tb, Te);
+			      STM4(&(ro[1]), To, ovs, &(ro[1]));
+			      STN4(&(ro[0]), Tj, To, Tk, Tn, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2sv_4"), {16, 0, 0, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_4) (planner *p) {
+     X(kdft_register) (p, n2sv_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name n2sv_4 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 16 FP additions, 0 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 0 fused multiply/add),
+ * 17 stack variables, 0 constants, and 18 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       V T3, Tb, T9, Tf, T6, Ta, Te, Tg;
+	       {
+		    V T1, T2, T7, T8;
+		    T1 = LD(&(ri[0]), ivs, &(ri[0]));
+		    T2 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+		    T3 = VADD(T1, T2);
+		    Tb = VSUB(T1, T2);
+		    T7 = LD(&(ii[0]), ivs, &(ii[0]));
+		    T8 = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+		    T9 = VSUB(T7, T8);
+		    Tf = VADD(T7, T8);
+	       }
+	       {
+		    V T4, T5, Tc, Td;
+		    T4 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+		    T5 = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+		    T6 = VADD(T4, T5);
+		    Ta = VSUB(T4, T5);
+		    Tc = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+		    Td = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+		    Te = VSUB(Tc, Td);
+		    Tg = VADD(Tc, Td);
+	       }
+	       {
+		    V Th, Ti, Tj, Tk;
+		    Th = VSUB(T3, T6);
+		    STM4(&(ro[2]), Th, ovs, &(ro[0]));
+		    Ti = VSUB(Tf, Tg);
+		    STM4(&(io[2]), Ti, ovs, &(io[0]));
+		    Tj = VADD(T3, T6);
+		    STM4(&(ro[0]), Tj, ovs, &(ro[0]));
+		    Tk = VADD(Tf, Tg);
+		    STM4(&(io[0]), Tk, ovs, &(io[0]));
+		    {
+			 V Tl, Tm, Tn, To;
+			 Tl = VSUB(T9, Ta);
+			 STM4(&(io[1]), Tl, ovs, &(io[1]));
+			 Tm = VADD(Tb, Te);
+			 STM4(&(ro[1]), Tm, ovs, &(ro[1]));
+			 Tn = VADD(Ta, T9);
+			 STM4(&(io[3]), Tn, ovs, &(io[1]));
+			 STN4(&(io[0]), Tk, Tl, Ti, Tn, ovs);
+			 To = VSUB(Tb, Te);
+			 STM4(&(ro[3]), To, ovs, &(ro[1]));
+			 STN4(&(ro[0]), Tj, Tm, Th, To, ovs);
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 4, XSIMD_STRING("n2sv_4"), {16, 0, 0, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_4) (planner *p) {
+     X(kdft_register) (p, n2sv_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2sv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2sv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3303 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:57 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2sv_64 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 912 FP additions, 392 FP multiplications,
+ * (or, 520 additions, 0 multiplications, 392 fused multiply/add),
+ * 310 stack variables, 15 constants, and 288 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V TeJ, TeK, TeP, TeQ, TfH, TfI, TfJ, TfK, Tgj, Tgk, Tgv, Tgw, T9a, T99, T9e;
+	       V T9b;
+	       {
+		    V T7B, T37, T5Z, T8F, TbB, TcB, Tf, Td9, T62, T7C, T2i, TdH, Tcb, Tah, T8G;
+		    V T3e, Tak, TbC, T65, T3m, TdI, Tu, Tda, T2x, TbD, Tan, T8I, T7G, T8J, T7J;
+		    V T64, T3t, Tas, Tce, TK, Tdd, Tav, Tcf, Tdc, T2N, T3G, T6G, T9k, T7O, T9l;
+		    V T7R, T6H, T3N, T1L, TdA, Tdx, Teo, Tbs, Tct, T5Q, T6V, T8y, T9z, T5j, T6Y;
+		    V Tbb, Tcw, T8n, T9C, Tch, Taz, Tdf, TZ, Tdg, T32, Tci, TaC, T6J, T3Z, T9n;
+		    V T7V, T9o, T7Y, T6K, T46, Tdp, T1g, Tej, Tdm, Tcm, Tb1, Tcp, TaK, T6O, T4X;
+		    V T9s, T8f, T6R, T4q, T9v, T84, Tdn, T1v, Tek, Tds, Tcn, TaV, Tcq, Tb4, T9t;
+		    V T8b, T9w, T8i, T6S, T50, T6P, T4N, T5k, T1V, T1S, TdB, Tbi, T5s, Tbt, Tbg;
+		    V T5F, T5R, T5p, T1Y, Tbj, T5n, T8z, T8q;
+		    {
+			 V Tba, T57, T8l, Tb7, T5M, T8w, T8m, T5P, T8x, T5i;
+			 {
+			      V T2p, T7F, T7E, Tal, T2w, Tam, T3s, T7H, T7I, T3p, T3d, T3a;
+			      {
+				   V T8, T35, T3, T5Y, T26, T5X, T6, T36, T29, T9, T2b, T2c, Tb, Tc, T2e;
+				   V T2f;
+				   {
+					V T1, T2, T24, T25, T4, T5, T27, T28;
+					T1 = LD(&(ri[0]), ivs, &(ri[0]));
+					T2 = LD(&(ri[WS(is, 32)]), ivs, &(ri[0]));
+					T24 = LD(&(ii[0]), ivs, &(ii[0]));
+					T25 = LD(&(ii[WS(is, 32)]), ivs, &(ii[0]));
+					T4 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
+					T5 = LD(&(ri[WS(is, 48)]), ivs, &(ri[0]));
+					T27 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
+					T28 = LD(&(ii[WS(is, 48)]), ivs, &(ii[0]));
+					T8 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+					T35 = VSUB(T1, T2);
+					T3 = VADD(T1, T2);
+					T5Y = VSUB(T24, T25);
+					T26 = VADD(T24, T25);
+					T5X = VSUB(T4, T5);
+					T6 = VADD(T4, T5);
+					T36 = VSUB(T27, T28);
+					T29 = VADD(T27, T28);
+					T9 = LD(&(ri[WS(is, 40)]), ivs, &(ri[0]));
+					T2b = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+					T2c = LD(&(ii[WS(is, 40)]), ivs, &(ii[0]));
+					Tb = LD(&(ri[WS(is, 56)]), ivs, &(ri[0]));
+					Tc = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
+					T2e = LD(&(ii[WS(is, 56)]), ivs, &(ii[0]));
+					T2f = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
+				   }
+				   {
+					V T39, Ta, T38, T2d, T3b, Td, T3c, T2g, Taf, T7;
+					T7B = VADD(T35, T36);
+					T37 = VSUB(T35, T36);
+					T39 = VSUB(T8, T9);
+					Ta = VADD(T8, T9);
+					T38 = VSUB(T2b, T2c);
+					T2d = VADD(T2b, T2c);
+					T3b = VSUB(Tb, Tc);
+					Td = VADD(Tb, Tc);
+					T3c = VSUB(T2e, T2f);
+					T2g = VADD(T2e, T2f);
+					T5Z = VADD(T5X, T5Y);
+					T8F = VSUB(T5Y, T5X);
+					Taf = VSUB(T3, T6);
+					T7 = VADD(T3, T6);
+					{
+					     V TbA, T2a, Te, Tbz, T60, T61, T2h, Tag;
+					     TbA = VSUB(T26, T29);
+					     T2a = VADD(T26, T29);
+					     Te = VADD(Ta, Td);
+					     Tbz = VSUB(Td, Ta);
+					     T3d = VADD(T3b, T3c);
+					     T60 = VSUB(T3b, T3c);
+					     T61 = VADD(T39, T38);
+					     T3a = VSUB(T38, T39);
+					     T2h = VADD(T2d, T2g);
+					     Tag = VSUB(T2d, T2g);
+					     TbB = VADD(Tbz, TbA);
+					     TcB = VSUB(TbA, Tbz);
+					     Tf = VADD(T7, Te);
+					     Td9 = VSUB(T7, Te);
+					     T62 = VSUB(T60, T61);
+					     T7C = VADD(T61, T60);
+					     T2i = VADD(T2a, T2h);
+					     TdH = VSUB(T2a, T2h);
+					     Tcb = VSUB(Taf, Tag);
+					     Tah = VADD(Taf, Tag);
+					}
+				   }
+			      }
+			      {
+				   V T3j, Ti, T3h, T2l, T3g, Tl, T2t, T3k, T2o, T3q, Tp, T3o, T2s, T3n, Ts;
+				   V T2u, T2m, T2n;
+				   {
+					V Tg, Th, T2j, T2k, Tj, Tk;
+					Tg = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+					Th = LD(&(ri[WS(is, 36)]), ivs, &(ri[0]));
+					T2j = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+					T2k = LD(&(ii[WS(is, 36)]), ivs, &(ii[0]));
+					Tj = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
+					Tk = LD(&(ri[WS(is, 52)]), ivs, &(ri[0]));
+					T2m = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
+					T8G = VADD(T3a, T3d);
+					T3e = VSUB(T3a, T3d);
+					T3j = VSUB(Tg, Th);
+					Ti = VADD(Tg, Th);
+					T3h = VSUB(T2j, T2k);
+					T2l = VADD(T2j, T2k);
+					T3g = VSUB(Tj, Tk);
+					Tl = VADD(Tj, Tk);
+					T2n = LD(&(ii[WS(is, 52)]), ivs, &(ii[0]));
+				   }
+				   {
+					V Tn, To, T2q, T2r, Tq, Tr;
+					Tn = LD(&(ri[WS(is, 60)]), ivs, &(ri[0]));
+					To = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
+					T2q = LD(&(ii[WS(is, 60)]), ivs, &(ii[0]));
+					T2r = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
+					Tq = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+					Tr = LD(&(ri[WS(is, 44)]), ivs, &(ri[0]));
+					T2t = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+					T3k = VSUB(T2m, T2n);
+					T2o = VADD(T2m, T2n);
+					T3q = VSUB(Tn, To);
+					Tp = VADD(Tn, To);
+					T3o = VSUB(T2q, T2r);
+					T2s = VADD(T2q, T2r);
+					T3n = VSUB(Tq, Tr);
+					Ts = VADD(Tq, Tr);
+					T2u = LD(&(ii[WS(is, 44)]), ivs, &(ii[0]));
+				   }
+				   {
+					V Tai, Tm, Taj, T3r;
+					Tai = VSUB(Ti, Tl);
+					Tm = VADD(Ti, Tl);
+					T2p = VADD(T2l, T2o);
+					Taj = VSUB(T2l, T2o);
+					{
+					     V T3i, T3l, Tt, T2v;
+					     T7F = VSUB(T3h, T3g);
+					     T3i = VADD(T3g, T3h);
+					     T3l = VSUB(T3j, T3k);
+					     T7E = VADD(T3j, T3k);
+					     Tt = VADD(Tp, Ts);
+					     Tal = VSUB(Tp, Ts);
+					     T2v = VADD(T2t, T2u);
+					     T3r = VSUB(T2t, T2u);
+					     Tak = VADD(Tai, Taj);
+					     TbC = VSUB(Taj, Tai);
+					     T65 = VFNMS(LDK(KP414213562), T3i, T3l);
+					     T3m = VFMA(LDK(KP414213562), T3l, T3i);
+					     TdI = VSUB(Tt, Tm);
+					     Tu = VADD(Tm, Tt);
+					     T2w = VADD(T2s, T2v);
+					     Tam = VSUB(T2s, T2v);
+					}
+					T3s = VSUB(T3q, T3r);
+					T7H = VADD(T3q, T3r);
+					T7I = VSUB(T3o, T3n);
+					T3p = VADD(T3n, T3o);
+				   }
+			      }
+			      {
+				   V T7M, T7Q, T7N, T3M, T3J, T7P;
+				   {
+					V TG, T3H, Ty, T3x, T2B, T3w, TB, T3I, T2E, TH, T2J, T2K, TD, TE, T2G;
+					V T2H;
+					{
+					     V Tw, Tx, T2z, T2A, Tz, TA, T2C, T2D;
+					     Tw = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+					     Tda = VSUB(T2p, T2w);
+					     T2x = VADD(T2p, T2w);
+					     TbD = VADD(Tal, Tam);
+					     Tan = VSUB(Tal, Tam);
+					     T8I = VFNMS(LDK(KP414213562), T7E, T7F);
+					     T7G = VFMA(LDK(KP414213562), T7F, T7E);
+					     T8J = VFMA(LDK(KP414213562), T7H, T7I);
+					     T7J = VFNMS(LDK(KP414213562), T7I, T7H);
+					     T64 = VFMA(LDK(KP414213562), T3p, T3s);
+					     T3t = VFNMS(LDK(KP414213562), T3s, T3p);
+					     Tx = LD(&(ri[WS(is, 34)]), ivs, &(ri[0]));
+					     T2z = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+					     T2A = LD(&(ii[WS(is, 34)]), ivs, &(ii[0]));
+					     Tz = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
+					     TA = LD(&(ri[WS(is, 50)]), ivs, &(ri[0]));
+					     T2C = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
+					     T2D = LD(&(ii[WS(is, 50)]), ivs, &(ii[0]));
+					     TG = LD(&(ri[WS(is, 58)]), ivs, &(ri[0]));
+					     T3H = VSUB(Tw, Tx);
+					     Ty = VADD(Tw, Tx);
+					     T3x = VSUB(T2z, T2A);
+					     T2B = VADD(T2z, T2A);
+					     T3w = VSUB(Tz, TA);
+					     TB = VADD(Tz, TA);
+					     T3I = VSUB(T2C, T2D);
+					     T2E = VADD(T2C, T2D);
+					     TH = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
+					     T2J = LD(&(ii[WS(is, 58)]), ivs, &(ii[0]));
+					     T2K = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
+					     TD = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+					     TE = LD(&(ri[WS(is, 42)]), ivs, &(ri[0]));
+					     T2G = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+					     T2H = LD(&(ii[WS(is, 42)]), ivs, &(ii[0]));
+					}
+					{
+					     V Tat, TC, Tar, T2F, T3K, T3E, TJ, Taq, T2M, Tau, T3B, T3L, T3y, T3F;
+					     {
+						  V TI, T3C, T2L, T3D, TF, T3z, T2I, T3A;
+						  Tat = VSUB(Ty, TB);
+						  TC = VADD(Ty, TB);
+						  TI = VADD(TG, TH);
+						  T3C = VSUB(TG, TH);
+						  T2L = VADD(T2J, T2K);
+						  T3D = VSUB(T2J, T2K);
+						  TF = VADD(TD, TE);
+						  T3z = VSUB(TD, TE);
+						  T2I = VADD(T2G, T2H);
+						  T3A = VSUB(T2G, T2H);
+						  Tar = VSUB(T2B, T2E);
+						  T2F = VADD(T2B, T2E);
+						  T3K = VADD(T3C, T3D);
+						  T3E = VSUB(T3C, T3D);
+						  TJ = VADD(TF, TI);
+						  Taq = VSUB(TI, TF);
+						  T2M = VADD(T2I, T2L);
+						  Tau = VSUB(T2I, T2L);
+						  T3B = VADD(T3z, T3A);
+						  T3L = VSUB(T3A, T3z);
+					     }
+					     T7M = VSUB(T3x, T3w);
+					     T3y = VADD(T3w, T3x);
+					     Tas = VADD(Taq, Tar);
+					     Tce = VSUB(Tar, Taq);
+					     TK = VADD(TC, TJ);
+					     Tdd = VSUB(TC, TJ);
+					     Tav = VADD(Tat, Tau);
+					     Tcf = VSUB(Tat, Tau);
+					     T7Q = VADD(T3B, T3E);
+					     T3F = VSUB(T3B, T3E);
+					     Tdc = VSUB(T2F, T2M);
+					     T2N = VADD(T2F, T2M);
+					     T7N = VADD(T3L, T3K);
+					     T3M = VSUB(T3K, T3L);
+					     T3J = VSUB(T3H, T3I);
+					     T7P = VADD(T3H, T3I);
+					     T3G = VFNMS(LDK(KP707106781), T3F, T3y);
+					     T6G = VFMA(LDK(KP707106781), T3F, T3y);
+					}
+				   }
+				   {
+					V T1H, T5I, T1z, Tb8, T56, T53, T1C, Tb9, T5L, T1I, T5e, T5f, T1E, T1F, T59;
+					V T5a;
+					{
+					     V T1x, T1y, T54, T55, T1A, T1B, T5J, T5K;
+					     T1x = LD(&(ri[WS(is, 63)]), ivs, &(ri[WS(is, 1)]));
+					     T9k = VFNMS(LDK(KP707106781), T7N, T7M);
+					     T7O = VFMA(LDK(KP707106781), T7N, T7M);
+					     T9l = VFNMS(LDK(KP707106781), T7Q, T7P);
+					     T7R = VFMA(LDK(KP707106781), T7Q, T7P);
+					     T6H = VFMA(LDK(KP707106781), T3M, T3J);
+					     T3N = VFNMS(LDK(KP707106781), T3M, T3J);
+					     T1y = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
+					     T54 = LD(&(ii[WS(is, 63)]), ivs, &(ii[WS(is, 1)]));
+					     T55 = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
+					     T1A = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+					     T1B = LD(&(ri[WS(is, 47)]), ivs, &(ri[WS(is, 1)]));
+					     T5J = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+					     T5K = LD(&(ii[WS(is, 47)]), ivs, &(ii[WS(is, 1)]));
+					     T1H = LD(&(ri[WS(is, 55)]), ivs, &(ri[WS(is, 1)]));
+					     T5I = VSUB(T1x, T1y);
+					     T1z = VADD(T1x, T1y);
+					     Tb8 = VADD(T54, T55);
+					     T56 = VSUB(T54, T55);
+					     T53 = VSUB(T1A, T1B);
+					     T1C = VADD(T1A, T1B);
+					     Tb9 = VADD(T5J, T5K);
+					     T5L = VSUB(T5J, T5K);
+					     T1I = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
+					     T5e = LD(&(ii[WS(is, 55)]), ivs, &(ii[WS(is, 1)]));
+					     T5f = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
+					     T1E = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+					     T1F = LD(&(ri[WS(is, 39)]), ivs, &(ri[WS(is, 1)]));
+					     T59 = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+					     T5a = LD(&(ii[WS(is, 39)]), ivs, &(ii[WS(is, 1)]));
+					}
+					{
+					     V Tbo, T1D, Tdv, T5h, T5N, T1K, Tdw, Tbr, T5O, T5c;
+					     {
+						  V T1J, T5d, Tbq, T5g, T1G, T58, Tbp, T5b;
+						  Tbo = VSUB(T1z, T1C);
+						  T1D = VADD(T1z, T1C);
+						  T1J = VADD(T1H, T1I);
+						  T5d = VSUB(T1H, T1I);
+						  Tbq = VADD(T5e, T5f);
+						  T5g = VSUB(T5e, T5f);
+						  T1G = VADD(T1E, T1F);
+						  T58 = VSUB(T1E, T1F);
+						  Tbp = VADD(T59, T5a);
+						  T5b = VSUB(T59, T5a);
+						  Tba = VSUB(Tb8, Tb9);
+						  Tdv = VADD(Tb8, Tb9);
+						  T57 = VADD(T53, T56);
+						  T8l = VSUB(T56, T53);
+						  T5h = VSUB(T5d, T5g);
+						  T5N = VADD(T5d, T5g);
+						  Tb7 = VSUB(T1J, T1G);
+						  T1K = VADD(T1G, T1J);
+						  Tdw = VADD(Tbp, Tbq);
+						  Tbr = VSUB(Tbp, Tbq);
+						  T5O = VSUB(T5b, T58);
+						  T5c = VADD(T58, T5b);
+					     }
+					     T5M = VSUB(T5I, T5L);
+					     T8w = VADD(T5I, T5L);
+					     T1L = VADD(T1D, T1K);
+					     TdA = VSUB(T1D, T1K);
+					     Tdx = VSUB(Tdv, Tdw);
+					     Teo = VADD(Tdv, Tdw);
+					     Tbs = VADD(Tbo, Tbr);
+					     Tct = VSUB(Tbo, Tbr);
+					     T8m = VADD(T5O, T5N);
+					     T5P = VSUB(T5N, T5O);
+					     T8x = VADD(T5c, T5h);
+					     T5i = VSUB(T5c, T5h);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T4e, T82, T8d, T4T, T4W, T83, T4p, T8e;
+			      {
+				   V T7T, T3R, T42, T7W, T3Y, T7X, T45, T7U;
+				   {
+					V T40, TN, T2Y, T3Q, T2Q, T3P, TQ, T41, T2T, T3V, TX, T2Z, TS, TT, T2V;
+					V T2W;
+					{
+					     V T2O, T2P, TO, TP, TL, TM;
+					     TL = LD(&(ri[WS(is, 62)]), ivs, &(ri[0]));
+					     TM = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
+					     T5Q = VFNMS(LDK(KP707106781), T5P, T5M);
+					     T6V = VFMA(LDK(KP707106781), T5P, T5M);
+					     T8y = VFMA(LDK(KP707106781), T8x, T8w);
+					     T9z = VFNMS(LDK(KP707106781), T8x, T8w);
+					     T5j = VFNMS(LDK(KP707106781), T5i, T57);
+					     T6Y = VFMA(LDK(KP707106781), T5i, T57);
+					     Tbb = VADD(Tb7, Tba);
+					     Tcw = VSUB(Tba, Tb7);
+					     T8n = VFMA(LDK(KP707106781), T8m, T8l);
+					     T9C = VFNMS(LDK(KP707106781), T8m, T8l);
+					     T40 = VSUB(TL, TM);
+					     TN = VADD(TL, TM);
+					     T2O = LD(&(ii[WS(is, 62)]), ivs, &(ii[0]));
+					     T2P = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
+					     TO = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+					     TP = LD(&(ri[WS(is, 46)]), ivs, &(ri[0]));
+					     {
+						  V T2R, T2S, TV, TW;
+						  T2R = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+						  T2S = LD(&(ii[WS(is, 46)]), ivs, &(ii[0]));
+						  TV = LD(&(ri[WS(is, 54)]), ivs, &(ri[0]));
+						  TW = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
+						  T2Y = LD(&(ii[WS(is, 54)]), ivs, &(ii[0]));
+						  T3Q = VSUB(T2O, T2P);
+						  T2Q = VADD(T2O, T2P);
+						  T3P = VSUB(TO, TP);
+						  TQ = VADD(TO, TP);
+						  T41 = VSUB(T2R, T2S);
+						  T2T = VADD(T2R, T2S);
+						  T3V = VSUB(TV, TW);
+						  TX = VADD(TV, TW);
+						  T2Z = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
+						  TS = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+						  TT = LD(&(ri[WS(is, 38)]), ivs, &(ri[0]));
+						  T2V = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+						  T2W = LD(&(ii[WS(is, 38)]), ivs, &(ii[0]));
+					     }
+					}
+					{
+					     V TaA, TR, Tay, T2U, T3W, T30, TU, T3S, T2X, T3T;
+					     TaA = VSUB(TN, TQ);
+					     TR = VADD(TN, TQ);
+					     Tay = VSUB(T2Q, T2T);
+					     T2U = VADD(T2Q, T2T);
+					     T3W = VSUB(T2Y, T2Z);
+					     T30 = VADD(T2Y, T2Z);
+					     TU = VADD(TS, TT);
+					     T3S = VSUB(TS, TT);
+					     T2X = VADD(T2V, T2W);
+					     T3T = VSUB(T2V, T2W);
+					     {
+						  V T3X, T43, Tax, TY, T31, TaB, T3U, T44;
+						  T7T = VSUB(T3Q, T3P);
+						  T3R = VADD(T3P, T3Q);
+						  T3X = VSUB(T3V, T3W);
+						  T43 = VADD(T3V, T3W);
+						  Tax = VSUB(TX, TU);
+						  TY = VADD(TU, TX);
+						  T31 = VADD(T2X, T30);
+						  TaB = VSUB(T2X, T30);
+						  T3U = VADD(T3S, T3T);
+						  T44 = VSUB(T3T, T3S);
+						  T42 = VSUB(T40, T41);
+						  T7W = VADD(T40, T41);
+						  Tch = VSUB(Tay, Tax);
+						  Taz = VADD(Tax, Tay);
+						  Tdf = VSUB(TR, TY);
+						  TZ = VADD(TR, TY);
+						  Tdg = VSUB(T2U, T31);
+						  T32 = VADD(T2U, T31);
+						  Tci = VSUB(TaA, TaB);
+						  TaC = VADD(TaA, TaB);
+						  T3Y = VSUB(T3U, T3X);
+						  T7X = VADD(T3U, T3X);
+						  T45 = VSUB(T43, T44);
+						  T7U = VADD(T44, T43);
+					     }
+					}
+				   }
+				   {
+					V T4P, T14, T4l, TaH, T4d, T4a, T17, TaI, T4S, T4k, T1e, T4m, T19, T1a, T4g;
+					V T4h;
+					{
+					     V T4b, T4c, T15, T16, T12, T13;
+					     T12 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+					     T13 = LD(&(ri[WS(is, 33)]), ivs, &(ri[WS(is, 1)]));
+					     T4b = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+					     T6J = VFMA(LDK(KP707106781), T3Y, T3R);
+					     T3Z = VFNMS(LDK(KP707106781), T3Y, T3R);
+					     T9n = VFNMS(LDK(KP707106781), T7U, T7T);
+					     T7V = VFMA(LDK(KP707106781), T7U, T7T);
+					     T9o = VFNMS(LDK(KP707106781), T7X, T7W);
+					     T7Y = VFMA(LDK(KP707106781), T7X, T7W);
+					     T6K = VFMA(LDK(KP707106781), T45, T42);
+					     T46 = VFNMS(LDK(KP707106781), T45, T42);
+					     T4P = VSUB(T12, T13);
+					     T14 = VADD(T12, T13);
+					     T4c = LD(&(ii[WS(is, 33)]), ivs, &(ii[WS(is, 1)]));
+					     T15 = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
+					     T16 = LD(&(ri[WS(is, 49)]), ivs, &(ri[WS(is, 1)]));
+					     {
+						  V T4Q, T4R, T1c, T1d;
+						  T4Q = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
+						  T4R = LD(&(ii[WS(is, 49)]), ivs, &(ii[WS(is, 1)]));
+						  T1c = LD(&(ri[WS(is, 57)]), ivs, &(ri[WS(is, 1)]));
+						  T1d = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
+						  T4l = LD(&(ii[WS(is, 57)]), ivs, &(ii[WS(is, 1)]));
+						  TaH = VADD(T4b, T4c);
+						  T4d = VSUB(T4b, T4c);
+						  T4a = VSUB(T15, T16);
+						  T17 = VADD(T15, T16);
+						  TaI = VADD(T4Q, T4R);
+						  T4S = VSUB(T4Q, T4R);
+						  T4k = VSUB(T1c, T1d);
+						  T1e = VADD(T1c, T1d);
+						  T4m = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
+						  T19 = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+						  T1a = LD(&(ri[WS(is, 41)]), ivs, &(ri[WS(is, 1)]));
+						  T4g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+						  T4h = LD(&(ii[WS(is, 41)]), ivs, &(ii[WS(is, 1)]));
+					     }
+					}
+					{
+					     V TaX, T18, T4n, TaZ, TaJ, Tdk, T1b, T4f, TaY, T4i;
+					     TaX = VSUB(T14, T17);
+					     T18 = VADD(T14, T17);
+					     T4n = VSUB(T4l, T4m);
+					     TaZ = VADD(T4l, T4m);
+					     TaJ = VSUB(TaH, TaI);
+					     Tdk = VADD(TaH, TaI);
+					     T1b = VADD(T19, T1a);
+					     T4f = VSUB(T19, T1a);
+					     TaY = VADD(T4g, T4h);
+					     T4i = VSUB(T4g, T4h);
+					     T4e = VADD(T4a, T4d);
+					     T82 = VSUB(T4d, T4a);
+					     {
+						  V T4U, T4o, T1f, TaG, Tdl, Tb0, T4V, T4j;
+						  T8d = VADD(T4P, T4S);
+						  T4T = VSUB(T4P, T4S);
+						  T4U = VADD(T4k, T4n);
+						  T4o = VSUB(T4k, T4n);
+						  T1f = VADD(T1b, T1e);
+						  TaG = VSUB(T1e, T1b);
+						  Tdl = VADD(TaY, TaZ);
+						  Tb0 = VSUB(TaY, TaZ);
+						  T4V = VSUB(T4i, T4f);
+						  T4j = VADD(T4f, T4i);
+						  Tdp = VSUB(T18, T1f);
+						  T1g = VADD(T18, T1f);
+						  Tej = VADD(Tdk, Tdl);
+						  Tdm = VSUB(Tdk, Tdl);
+						  Tcm = VSUB(TaX, Tb0);
+						  Tb1 = VADD(TaX, Tb0);
+						  T4W = VSUB(T4U, T4V);
+						  T83 = VADD(T4V, T4U);
+						  T4p = VSUB(T4j, T4o);
+						  T8e = VADD(T4j, T4o);
+						  Tcp = VSUB(TaJ, TaG);
+						  TaK = VADD(TaG, TaJ);
+					     }
+					}
+				   }
+			      }
+			      {
+				   V T1n, Tdq, T4r, T1q, TaR, T4z, Tb2, TaP, T4M, T4Y, T4w, T1t, TaS, T4u, T8g;
+				   V T87;
+				   {
+					V T1r, T85, T4L, TaO, TaN, T86, T4G, T1s, T4s, T4t;
+					{
+					     V T1h, T1i, T4I, T4J, T1k, T1l, T4D, T4E;
+					     T1h = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+					     T6O = VFMA(LDK(KP707106781), T4W, T4T);
+					     T4X = VFNMS(LDK(KP707106781), T4W, T4T);
+					     T9s = VFNMS(LDK(KP707106781), T8e, T8d);
+					     T8f = VFMA(LDK(KP707106781), T8e, T8d);
+					     T6R = VFMA(LDK(KP707106781), T4p, T4e);
+					     T4q = VFNMS(LDK(KP707106781), T4p, T4e);
+					     T9v = VFNMS(LDK(KP707106781), T83, T82);
+					     T84 = VFMA(LDK(KP707106781), T83, T82);
+					     T1i = LD(&(ri[WS(is, 37)]), ivs, &(ri[WS(is, 1)]));
+					     T4I = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+					     T4J = LD(&(ii[WS(is, 37)]), ivs, &(ii[WS(is, 1)]));
+					     T1k = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
+					     T1l = LD(&(ri[WS(is, 53)]), ivs, &(ri[WS(is, 1)]));
+					     T4D = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
+					     T4E = LD(&(ii[WS(is, 53)]), ivs, &(ii[WS(is, 1)]));
+					     {
+						  V T1o, T4C, T1j, TaL, T4K, T4H, T1m, TaM, T4F, T1p, T4x, T4y;
+						  T1o = LD(&(ri[WS(is, 61)]), ivs, &(ri[WS(is, 1)]));
+						  T4C = VSUB(T1h, T1i);
+						  T1j = VADD(T1h, T1i);
+						  TaL = VADD(T4I, T4J);
+						  T4K = VSUB(T4I, T4J);
+						  T4H = VSUB(T1k, T1l);
+						  T1m = VADD(T1k, T1l);
+						  TaM = VADD(T4D, T4E);
+						  T4F = VSUB(T4D, T4E);
+						  T1p = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
+						  T4x = LD(&(ii[WS(is, 61)]), ivs, &(ii[WS(is, 1)]));
+						  T4y = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
+						  T1r = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+						  T85 = VSUB(T4K, T4H);
+						  T4L = VADD(T4H, T4K);
+						  TaO = VSUB(T1j, T1m);
+						  T1n = VADD(T1j, T1m);
+						  Tdq = VADD(TaL, TaM);
+						  TaN = VSUB(TaL, TaM);
+						  T86 = VADD(T4C, T4F);
+						  T4G = VSUB(T4C, T4F);
+						  T4r = VSUB(T1o, T1p);
+						  T1q = VADD(T1o, T1p);
+						  TaR = VADD(T4x, T4y);
+						  T4z = VSUB(T4x, T4y);
+						  T1s = LD(&(ri[WS(is, 45)]), ivs, &(ri[WS(is, 1)]));
+						  T4s = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+						  T4t = LD(&(ii[WS(is, 45)]), ivs, &(ii[WS(is, 1)]));
+					     }
+					}
+					Tb2 = VADD(TaO, TaN);
+					TaP = VSUB(TaN, TaO);
+					T4M = VFNMS(LDK(KP414213562), T4L, T4G);
+					T4Y = VFMA(LDK(KP414213562), T4G, T4L);
+					T4w = VSUB(T1r, T1s);
+					T1t = VADD(T1r, T1s);
+					TaS = VADD(T4s, T4t);
+					T4u = VSUB(T4s, T4t);
+					T8g = VFMA(LDK(KP414213562), T85, T86);
+					T87 = VFNMS(LDK(KP414213562), T86, T85);
+				   }
+				   {
+					V T1W, T8o, T5E, Tbf, Tbe, T8p, T5z, T1X, T5l, T5m;
+					{
+					     V T5B, T5v, T1O, T5C, T1P, T1Q, T5w, T5x;
+					     {
+						  V T1M, T88, T4A, T1u, TaQ, Tdr, TaT, T89, T4v, T1N, TaU, Tb3;
+						  T1M = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+						  T88 = VSUB(T4z, T4w);
+						  T4A = VADD(T4w, T4z);
+						  T1u = VADD(T1q, T1t);
+						  TaQ = VSUB(T1q, T1t);
+						  Tdr = VADD(TaR, TaS);
+						  TaT = VSUB(TaR, TaS);
+						  T89 = VADD(T4r, T4u);
+						  T4v = VSUB(T4r, T4u);
+						  T1N = LD(&(ri[WS(is, 35)]), ivs, &(ri[WS(is, 1)]));
+						  T5B = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+						  Tdn = VSUB(T1u, T1n);
+						  T1v = VADD(T1n, T1u);
+						  Tek = VADD(Tdq, Tdr);
+						  Tds = VSUB(Tdq, Tdr);
+						  TaU = VADD(TaQ, TaT);
+						  Tb3 = VSUB(TaQ, TaT);
+						  {
+						       V T8a, T8h, T4Z, T4B;
+						       T8a = VFMA(LDK(KP414213562), T89, T88);
+						       T8h = VFNMS(LDK(KP414213562), T88, T89);
+						       T4Z = VFNMS(LDK(KP414213562), T4v, T4A);
+						       T4B = VFMA(LDK(KP414213562), T4A, T4v);
+						       T5v = VSUB(T1M, T1N);
+						       T1O = VADD(T1M, T1N);
+						       Tcn = VSUB(TaU, TaP);
+						       TaV = VADD(TaP, TaU);
+						       Tcq = VSUB(Tb2, Tb3);
+						       Tb4 = VADD(Tb2, Tb3);
+						       T9t = VSUB(T8a, T87);
+						       T8b = VADD(T87, T8a);
+						       T9w = VSUB(T8g, T8h);
+						       T8i = VADD(T8g, T8h);
+						       T6S = VADD(T4Y, T4Z);
+						       T50 = VSUB(T4Y, T4Z);
+						       T6P = VADD(T4M, T4B);
+						       T4N = VSUB(T4B, T4M);
+						       T5C = LD(&(ii[WS(is, 35)]), ivs, &(ii[WS(is, 1)]));
+						  }
+					     }
+					     T1P = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
+					     T1Q = LD(&(ri[WS(is, 51)]), ivs, &(ri[WS(is, 1)]));
+					     T5w = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
+					     T5x = LD(&(ii[WS(is, 51)]), ivs, &(ii[WS(is, 1)]));
+					     {
+						  V T5q, Tbc, T5D, T5A, T1R, Tbd, T5y, T5r, T1T, T1U;
+						  T1T = LD(&(ri[WS(is, 59)]), ivs, &(ri[WS(is, 1)]));
+						  T1U = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
+						  T5q = LD(&(ii[WS(is, 59)]), ivs, &(ii[WS(is, 1)]));
+						  Tbc = VADD(T5B, T5C);
+						  T5D = VSUB(T5B, T5C);
+						  T5A = VSUB(T1P, T1Q);
+						  T1R = VADD(T1P, T1Q);
+						  Tbd = VADD(T5w, T5x);
+						  T5y = VSUB(T5w, T5x);
+						  T5k = VSUB(T1T, T1U);
+						  T1V = VADD(T1T, T1U);
+						  T5r = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
+						  T1W = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+						  T8o = VSUB(T5D, T5A);
+						  T5E = VADD(T5A, T5D);
+						  Tbf = VSUB(T1O, T1R);
+						  T1S = VADD(T1O, T1R);
+						  TdB = VADD(Tbc, Tbd);
+						  Tbe = VSUB(Tbc, Tbd);
+						  T8p = VADD(T5v, T5y);
+						  T5z = VSUB(T5v, T5y);
+						  Tbi = VADD(T5q, T5r);
+						  T5s = VSUB(T5q, T5r);
+						  T1X = LD(&(ri[WS(is, 43)]), ivs, &(ri[WS(is, 1)]));
+						  T5l = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+						  T5m = LD(&(ii[WS(is, 43)]), ivs, &(ii[WS(is, 1)]));
+					     }
+					}
+					Tbt = VADD(Tbf, Tbe);
+					Tbg = VSUB(Tbe, Tbf);
+					T5F = VFNMS(LDK(KP414213562), T5E, T5z);
+					T5R = VFMA(LDK(KP414213562), T5z, T5E);
+					T5p = VSUB(T1W, T1X);
+					T1Y = VADD(T1W, T1X);
+					Tbj = VADD(T5l, T5m);
+					T5n = VSUB(T5l, T5m);
+					T8z = VFMA(LDK(KP414213562), T8o, T8p);
+					T8q = VFNMS(LDK(KP414213562), T8p, T8o);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V Tbm, Tbv, T9A, T8u, T9D, T8B, T6Z, T5T, T6W, T5G, TeL, TeM, TeN, TeO, TeR;
+			 V TeS, TeT, TeU, TeV, TeW, TeX, TeY, TeZ, Tf0, Tf1, Tf2, Tf3, Tf4, Tf5, Tf6;
+			 V Tf7, Tf8, Tf9, Tfa, Tfb, Tfc, TbE, Tao, Tfd, Tfe, Td7, Td8, Tff, Tfg, Tfh;
+			 V Tfi, Tfj, Tfk, Tfl, Tfm, Tfn, Tfo, Tfp, Tfq, Tfr, Tfs;
+			 {
+			      V Tel, Tdy, TdD, Tcu, Tcx, Teq, Tei, Ten, Tex, Teh, TeB, Tev, Te9, Tec;
+			      {
+				   V Tef, Teu, TeE, TeD, T11, TeF, T1w, T21, Tet, T2y, T33, Teg, T20;
+				   {
+					V Tv, T8r, T5t, T1Z, Tbh, TdC, Tbk, T8s, T5o, T10, Tep, Tbl, Tbu;
+					Tef = VSUB(Tf, Tu);
+					Tv = VADD(Tf, Tu);
+					T8r = VSUB(T5s, T5p);
+					T5t = VADD(T5p, T5s);
+					T1Z = VADD(T1V, T1Y);
+					Tbh = VSUB(T1V, T1Y);
+					TdC = VADD(Tbi, Tbj);
+					Tbk = VSUB(Tbi, Tbj);
+					T8s = VADD(T5k, T5n);
+					T5o = VSUB(T5k, T5n);
+					T10 = VADD(TK, TZ);
+					Teu = VSUB(TZ, TK);
+					Tel = VSUB(Tej, Tek);
+					TeE = VADD(Tej, Tek);
+					Tdy = VSUB(T1Z, T1S);
+					T20 = VADD(T1S, T1Z);
+					Tep = VADD(TdB, TdC);
+					TdD = VSUB(TdB, TdC);
+					Tbl = VADD(Tbh, Tbk);
+					Tbu = VSUB(Tbh, Tbk);
+					{
+					     V T8t, T8A, T5S, T5u;
+					     T8t = VFMA(LDK(KP414213562), T8s, T8r);
+					     T8A = VFNMS(LDK(KP414213562), T8r, T8s);
+					     T5S = VFNMS(LDK(KP414213562), T5o, T5t);
+					     T5u = VFMA(LDK(KP414213562), T5t, T5o);
+					     TeD = VSUB(Tv, T10);
+					     T11 = VADD(Tv, T10);
+					     Tcu = VSUB(Tbl, Tbg);
+					     Tbm = VADD(Tbg, Tbl);
+					     Tcx = VSUB(Tbt, Tbu);
+					     Tbv = VADD(Tbt, Tbu);
+					     T9A = VSUB(T8t, T8q);
+					     T8u = VADD(T8q, T8t);
+					     T9D = VSUB(T8z, T8A);
+					     T8B = VADD(T8z, T8A);
+					     T6Z = VADD(T5R, T5S);
+					     T5T = VSUB(T5R, T5S);
+					     T6W = VADD(T5F, T5u);
+					     T5G = VSUB(T5u, T5F);
+					     TeF = VADD(Teo, Tep);
+					     Teq = VSUB(Teo, Tep);
+					}
+				   }
+				   Tei = VSUB(T1g, T1v);
+				   T1w = VADD(T1g, T1v);
+				   T21 = VADD(T1L, T20);
+				   Ten = VSUB(T1L, T20);
+				   Tet = VSUB(T2i, T2x);
+				   T2y = VADD(T2i, T2x);
+				   T33 = VADD(T2N, T32);
+				   Teg = VSUB(T2N, T32);
+				   {
+					V TeI, TeG, T23, T22, TeH, T34;
+					TeI = VADD(TeE, TeF);
+					TeG = VSUB(TeE, TeF);
+					T23 = VSUB(T21, T1w);
+					T22 = VADD(T1w, T21);
+					TeH = VADD(T2y, T33);
+					T34 = VSUB(T2y, T33);
+					Tex = VSUB(Tef, Teg);
+					Teh = VADD(Tef, Teg);
+					TeJ = VSUB(TeD, TeG);
+					STM4(&(ro[48]), TeJ, ovs, &(ro[0]));
+					TeK = VADD(TeD, TeG);
+					STM4(&(ro[16]), TeK, ovs, &(ro[0]));
+					TeL = VADD(T11, T22);
+					STM4(&(ro[0]), TeL, ovs, &(ro[0]));
+					TeM = VSUB(T11, T22);
+					STM4(&(ro[32]), TeM, ovs, &(ro[0]));
+					TeN = VADD(TeH, TeI);
+					STM4(&(io[0]), TeN, ovs, &(io[0]));
+					TeO = VSUB(TeH, TeI);
+					STM4(&(io[32]), TeO, ovs, &(io[0]));
+					TeP = VSUB(T34, T23);
+					STM4(&(io[48]), TeP, ovs, &(io[0]));
+					TeQ = VADD(T23, T34);
+					STM4(&(io[16]), TeQ, ovs, &(io[0]));
+					TeB = VADD(Teu, Tet);
+					Tev = VSUB(Tet, Teu);
+				   }
+			      }
+			      {
+				   V TdV, Tdb, TdJ, Te5, TdE, Tdz, TdZ, Tdo, Te6, Tdi, Teb, Te3, TdW, TdM, Tdt;
+				   V TdY;
+				   {
+					V TdL, Tde, Tey, Tem, Tez, Ter, Tdh, TdK, Te1, Te2;
+					TdV = VADD(Td9, Tda);
+					Tdb = VSUB(Td9, Tda);
+					TdJ = VSUB(TdH, TdI);
+					Te5 = VADD(TdI, TdH);
+					TdL = VADD(Tdd, Tdc);
+					Tde = VSUB(Tdc, Tdd);
+					Tey = VSUB(Tel, Tei);
+					Tem = VADD(Tei, Tel);
+					Tez = VADD(Ten, Teq);
+					Ter = VSUB(Ten, Teq);
+					Tdh = VADD(Tdf, Tdg);
+					TdK = VSUB(Tdf, Tdg);
+					TdE = VSUB(TdA, TdD);
+					Te1 = VADD(TdA, TdD);
+					Te2 = VADD(Tdy, Tdx);
+					Tdz = VSUB(Tdx, Tdy);
+					TdZ = VADD(Tdn, Tdm);
+					Tdo = VSUB(Tdm, Tdn);
+					{
+					     V TeA, TeC, Tew, Tes;
+					     TeA = VSUB(Tey, Tez);
+					     TeC = VADD(Tey, Tez);
+					     Tew = VSUB(Ter, Tem);
+					     Tes = VADD(Tem, Ter);
+					     Te6 = VADD(Tde, Tdh);
+					     Tdi = VSUB(Tde, Tdh);
+					     Teb = VFMA(LDK(KP414213562), Te1, Te2);
+					     Te3 = VFNMS(LDK(KP414213562), Te2, Te1);
+					     TdW = VADD(TdL, TdK);
+					     TdM = VSUB(TdK, TdL);
+					     TeR = VFMA(LDK(KP707106781), TeA, Tex);
+					     STM4(&(ro[24]), TeR, ovs, &(ro[0]));
+					     TeS = VFNMS(LDK(KP707106781), TeA, Tex);
+					     STM4(&(ro[56]), TeS, ovs, &(ro[0]));
+					     TeT = VFMA(LDK(KP707106781), TeC, TeB);
+					     STM4(&(io[8]), TeT, ovs, &(io[0]));
+					     TeU = VFNMS(LDK(KP707106781), TeC, TeB);
+					     STM4(&(io[40]), TeU, ovs, &(io[0]));
+					     TeV = VFMA(LDK(KP707106781), Tew, Tev);
+					     STM4(&(io[24]), TeV, ovs, &(io[0]));
+					     TeW = VFNMS(LDK(KP707106781), Tew, Tev);
+					     STM4(&(io[56]), TeW, ovs, &(io[0]));
+					     TeX = VFMA(LDK(KP707106781), Tes, Teh);
+					     STM4(&(ro[8]), TeX, ovs, &(ro[0]));
+					     TeY = VFNMS(LDK(KP707106781), Tes, Teh);
+					     STM4(&(ro[40]), TeY, ovs, &(ro[0]));
+					     Tdt = VSUB(Tdp, Tds);
+					     TdY = VADD(Tdp, Tds);
+					}
+				   }
+				   {
+					V TdT, Tdj, TdP, TdN, TdR, Tdu, Tea, Te0, TdQ, TdF, TdX, Ted, Te7;
+					TdT = VFNMS(LDK(KP707106781), Tdi, Tdb);
+					Tdj = VFMA(LDK(KP707106781), Tdi, Tdb);
+					TdP = VFMA(LDK(KP707106781), TdM, TdJ);
+					TdN = VFNMS(LDK(KP707106781), TdM, TdJ);
+					TdR = VFNMS(LDK(KP414213562), Tdo, Tdt);
+					Tdu = VFMA(LDK(KP414213562), Tdt, Tdo);
+					Tea = VFNMS(LDK(KP414213562), TdY, TdZ);
+					Te0 = VFMA(LDK(KP414213562), TdZ, TdY);
+					TdQ = VFMA(LDK(KP414213562), Tdz, TdE);
+					TdF = VFNMS(LDK(KP414213562), TdE, Tdz);
+					Te9 = VFNMS(LDK(KP707106781), TdW, TdV);
+					TdX = VFMA(LDK(KP707106781), TdW, TdV);
+					Ted = VFMA(LDK(KP707106781), Te6, Te5);
+					Te7 = VFNMS(LDK(KP707106781), Te6, Te5);
+					{
+					     V Tee, Te8, Te4, TdU, TdS, TdO, TdG;
+					     Tee = VADD(Tea, Teb);
+					     Tec = VSUB(Tea, Teb);
+					     Te8 = VSUB(Te3, Te0);
+					     Te4 = VADD(Te0, Te3);
+					     TdU = VADD(TdR, TdQ);
+					     TdS = VSUB(TdQ, TdR);
+					     TdO = VADD(Tdu, TdF);
+					     TdG = VSUB(Tdu, TdF);
+					     TeZ = VFMA(LDK(KP923879532), Tee, Ted);
+					     STM4(&(io[4]), TeZ, ovs, &(io[0]));
+					     Tf0 = VFNMS(LDK(KP923879532), Tee, Ted);
+					     STM4(&(io[36]), Tf0, ovs, &(io[0]));
+					     Tf1 = VFMA(LDK(KP923879532), Te4, TdX);
+					     STM4(&(ro[4]), Tf1, ovs, &(ro[0]));
+					     Tf2 = VFNMS(LDK(KP923879532), Te4, TdX);
+					     STM4(&(ro[36]), Tf2, ovs, &(ro[0]));
+					     Tf3 = VFMA(LDK(KP923879532), TdU, TdT);
+					     STM4(&(ro[60]), Tf3, ovs, &(ro[0]));
+					     Tf4 = VFNMS(LDK(KP923879532), TdU, TdT);
+					     STM4(&(ro[28]), Tf4, ovs, &(ro[0]));
+					     Tf5 = VFMA(LDK(KP923879532), TdS, TdP);
+					     STM4(&(io[12]), Tf5, ovs, &(io[0]));
+					     Tf6 = VFNMS(LDK(KP923879532), TdS, TdP);
+					     STM4(&(io[44]), Tf6, ovs, &(io[0]));
+					     Tf7 = VFMA(LDK(KP923879532), TdO, TdN);
+					     STM4(&(io[60]), Tf7, ovs, &(io[0]));
+					     Tf8 = VFNMS(LDK(KP923879532), TdO, TdN);
+					     STM4(&(io[28]), Tf8, ovs, &(io[0]));
+					     Tf9 = VFMA(LDK(KP923879532), TdG, Tdj);
+					     STM4(&(ro[12]), Tf9, ovs, &(ro[0]));
+					     Tfa = VFNMS(LDK(KP923879532), TdG, Tdj);
+					     STM4(&(ro[44]), Tfa, ovs, &(ro[0]));
+					     Tfb = VFMA(LDK(KP923879532), Te8, Te7);
+					     STM4(&(io[20]), Tfb, ovs, &(io[0]));
+					     Tfc = VFNMS(LDK(KP923879532), Te8, Te7);
+					     STM4(&(io[52]), Tfc, ovs, &(io[0]));
+					}
+				   }
+			      }
+			      {
+				   V TcF, TcE, Tcy, Tcv, TcT, Tco, TcP, Tcd, TcZ, TcD, Td0, Tck, Td4, TcX, Tcr;
+				   V TcS;
+				   {
+					V Tcc, TcC, Tcg, Tcj, TcV, TcW;
+					TbE = VADD(TbC, TbD);
+					Tcc = VSUB(TbC, TbD);
+					TcC = VSUB(Tan, Tak);
+					Tao = VADD(Tak, Tan);
+					TcF = VFNMS(LDK(KP414213562), Tce, Tcf);
+					Tcg = VFMA(LDK(KP414213562), Tcf, Tce);
+					Tcj = VFNMS(LDK(KP414213562), Tci, Tch);
+					TcE = VFMA(LDK(KP414213562), Tch, Tci);
+					Tcy = VFNMS(LDK(KP707106781), Tcx, Tcw);
+					TcV = VFMA(LDK(KP707106781), Tcx, Tcw);
+					TcW = VFMA(LDK(KP707106781), Tcu, Tct);
+					Tcv = VFNMS(LDK(KP707106781), Tcu, Tct);
+					TcT = VFMA(LDK(KP707106781), Tcn, Tcm);
+					Tco = VFNMS(LDK(KP707106781), Tcn, Tcm);
+					Tfd = VFMA(LDK(KP923879532), Tec, Te9);
+					STM4(&(ro[20]), Tfd, ovs, &(ro[0]));
+					Tfe = VFNMS(LDK(KP923879532), Tec, Te9);
+					STM4(&(ro[52]), Tfe, ovs, &(ro[0]));
+					TcP = VFNMS(LDK(KP707106781), Tcc, Tcb);
+					Tcd = VFMA(LDK(KP707106781), Tcc, Tcb);
+					TcZ = VFNMS(LDK(KP707106781), TcC, TcB);
+					TcD = VFMA(LDK(KP707106781), TcC, TcB);
+					Td0 = VADD(Tcg, Tcj);
+					Tck = VSUB(Tcg, Tcj);
+					Td4 = VFMA(LDK(KP198912367), TcV, TcW);
+					TcX = VFNMS(LDK(KP198912367), TcW, TcV);
+					Tcr = VFNMS(LDK(KP707106781), Tcq, Tcp);
+					TcS = VFMA(LDK(KP707106781), Tcq, Tcp);
+				   }
+				   {
+					V TcJ, Tcl, TcK, Tcs, TcQ, TcG, Td5, TcU, TcL, Tcz;
+					TcJ = VFNMS(LDK(KP923879532), Tck, Tcd);
+					Tcl = VFMA(LDK(KP923879532), Tck, Tcd);
+					TcK = VFNMS(LDK(KP668178637), Tco, Tcr);
+					Tcs = VFMA(LDK(KP668178637), Tcr, Tco);
+					TcQ = VADD(TcF, TcE);
+					TcG = VSUB(TcE, TcF);
+					Td5 = VFNMS(LDK(KP198912367), TcS, TcT);
+					TcU = VFMA(LDK(KP198912367), TcT, TcS);
+					TcL = VFMA(LDK(KP668178637), Tcv, Tcy);
+					Tcz = VFNMS(LDK(KP668178637), Tcy, Tcv);
+					{
+					     V Td1, Td3, TcR, TcN, TcH, Td2, TcY, TcM, TcO, TcI, TcA, Td6;
+					     Td1 = VFMA(LDK(KP923879532), Td0, TcZ);
+					     Td3 = VFNMS(LDK(KP923879532), Td0, TcZ);
+					     TcR = VFNMS(LDK(KP923879532), TcQ, TcP);
+					     Td7 = VFMA(LDK(KP923879532), TcQ, TcP);
+					     TcN = VFMA(LDK(KP923879532), TcG, TcD);
+					     TcH = VFNMS(LDK(KP923879532), TcG, TcD);
+					     Td2 = VADD(TcU, TcX);
+					     TcY = VSUB(TcU, TcX);
+					     TcM = VSUB(TcK, TcL);
+					     TcO = VADD(TcK, TcL);
+					     TcI = VSUB(Tcz, Tcs);
+					     TcA = VADD(Tcs, Tcz);
+					     Td6 = VSUB(Td4, Td5);
+					     Td8 = VADD(Td5, Td4);
+					     Tff = VFMA(LDK(KP980785280), TcY, TcR);
+					     STM4(&(ro[14]), Tff, ovs, &(ro[0]));
+					     Tfg = VFNMS(LDK(KP980785280), TcY, TcR);
+					     STM4(&(ro[46]), Tfg, ovs, &(ro[0]));
+					     Tfh = VFMA(LDK(KP831469612), TcM, TcJ);
+					     STM4(&(ro[22]), Tfh, ovs, &(ro[0]));
+					     Tfi = VFNMS(LDK(KP831469612), TcM, TcJ);
+					     STM4(&(ro[54]), Tfi, ovs, &(ro[0]));
+					     Tfj = VFMA(LDK(KP831469612), TcO, TcN);
+					     STM4(&(io[6]), Tfj, ovs, &(io[0]));
+					     Tfk = VFNMS(LDK(KP831469612), TcO, TcN);
+					     STM4(&(io[38]), Tfk, ovs, &(io[0]));
+					     Tfl = VFMA(LDK(KP831469612), TcI, TcH);
+					     STM4(&(io[22]), Tfl, ovs, &(io[0]));
+					     Tfm = VFNMS(LDK(KP831469612), TcI, TcH);
+					     STM4(&(io[54]), Tfm, ovs, &(io[0]));
+					     Tfn = VFMA(LDK(KP831469612), TcA, Tcl);
+					     STM4(&(ro[6]), Tfn, ovs, &(ro[0]));
+					     Tfo = VFNMS(LDK(KP831469612), TcA, Tcl);
+					     STM4(&(ro[38]), Tfo, ovs, &(ro[0]));
+					     Tfp = VFMA(LDK(KP980785280), Td6, Td3);
+					     STM4(&(io[14]), Tfp, ovs, &(io[0]));
+					     Tfq = VFNMS(LDK(KP980785280), Td6, Td3);
+					     STM4(&(io[46]), Tfq, ovs, &(io[0]));
+					     Tfr = VFNMS(LDK(KP980785280), Td2, Td1);
+					     STM4(&(io[30]), Tfr, ovs, &(io[0]));
+					     Tfs = VFMA(LDK(KP980785280), Td2, Td1);
+					     STM4(&(io[62]), Tfs, ovs, &(io[0]));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V Tft, Tfu, Tfv, Tfw, Tfx, Tfy, Tfz, TfA, TfB, TfC, TfD, TfE, TfF, TfG, T3f;
+			      V T66, T63, T3u, TfL, TfM, TfN, TfO, TfP, TfQ, TfR, TfS, TfT, TfU, TfV, TfW;
+			      V TfX, TfY, TfZ, Tg0, Tc5, Tc8;
+			      {
+				   V TbH, TbG, Tbw, Tbn, TbV, TaW, TbR, Tap, Tc1, TbF, Tc2, TaE, Tc7, TbZ, Tb5;
+				   V TbU;
+				   {
+					V Taw, TaD, TbX, TbY;
+					TbH = VFMA(LDK(KP414213562), Tas, Tav);
+					Taw = VFNMS(LDK(KP414213562), Tav, Tas);
+					TaD = VFMA(LDK(KP414213562), TaC, Taz);
+					TbG = VFNMS(LDK(KP414213562), Taz, TaC);
+					Tbw = VFNMS(LDK(KP707106781), Tbv, Tbs);
+					TbX = VFMA(LDK(KP707106781), Tbv, Tbs);
+					TbY = VFMA(LDK(KP707106781), Tbm, Tbb);
+					Tbn = VFNMS(LDK(KP707106781), Tbm, Tbb);
+					TbV = VFMA(LDK(KP707106781), TaV, TaK);
+					TaW = VFNMS(LDK(KP707106781), TaV, TaK);
+					Tft = VFMA(LDK(KP980785280), Td8, Td7);
+					STM4(&(ro[62]), Tft, ovs, &(ro[0]));
+					Tfu = VFNMS(LDK(KP980785280), Td8, Td7);
+					STM4(&(ro[30]), Tfu, ovs, &(ro[0]));
+					TbR = VFMA(LDK(KP707106781), Tao, Tah);
+					Tap = VFNMS(LDK(KP707106781), Tao, Tah);
+					Tc1 = VFMA(LDK(KP707106781), TbE, TbB);
+					TbF = VFNMS(LDK(KP707106781), TbE, TbB);
+					Tc2 = VADD(Taw, TaD);
+					TaE = VSUB(Taw, TaD);
+					Tc7 = VFMA(LDK(KP198912367), TbX, TbY);
+					TbZ = VFNMS(LDK(KP198912367), TbY, TbX);
+					Tb5 = VFNMS(LDK(KP707106781), Tb4, Tb1);
+					TbU = VFMA(LDK(KP707106781), Tb4, Tb1);
+				   }
+				   {
+					V TbP, TaF, TbN, Tb6, TbS, TbI, Tc6, TbW, TbM, Tbx;
+					TbP = VFNMS(LDK(KP923879532), TaE, Tap);
+					TaF = VFMA(LDK(KP923879532), TaE, Tap);
+					TbN = VFNMS(LDK(KP668178637), TaW, Tb5);
+					Tb6 = VFMA(LDK(KP668178637), Tb5, TaW);
+					TbS = VADD(TbH, TbG);
+					TbI = VSUB(TbG, TbH);
+					Tc6 = VFNMS(LDK(KP198912367), TbU, TbV);
+					TbW = VFMA(LDK(KP198912367), TbV, TbU);
+					TbM = VFMA(LDK(KP668178637), Tbn, Tbw);
+					Tbx = VFNMS(LDK(KP668178637), Tbw, Tbn);
+					{
+					     V Tc3, Tc9, TbT, TbL, TbJ, Tc4, Tc0, TbQ, TbO, TbK, Tby, Tca;
+					     Tc3 = VFNMS(LDK(KP923879532), Tc2, Tc1);
+					     Tc9 = VFMA(LDK(KP923879532), Tc2, Tc1);
+					     TbT = VFMA(LDK(KP923879532), TbS, TbR);
+					     Tc5 = VFNMS(LDK(KP923879532), TbS, TbR);
+					     TbL = VFMA(LDK(KP923879532), TbI, TbF);
+					     TbJ = VFNMS(LDK(KP923879532), TbI, TbF);
+					     Tc4 = VSUB(TbZ, TbW);
+					     Tc0 = VADD(TbW, TbZ);
+					     TbQ = VADD(TbN, TbM);
+					     TbO = VSUB(TbM, TbN);
+					     TbK = VADD(Tb6, Tbx);
+					     Tby = VSUB(Tb6, Tbx);
+					     Tca = VADD(Tc6, Tc7);
+					     Tc8 = VSUB(Tc6, Tc7);
+					     Tfv = VFMA(LDK(KP980785280), Tc0, TbT);
+					     STM4(&(ro[2]), Tfv, ovs, &(ro[0]));
+					     Tfw = VFNMS(LDK(KP980785280), Tc0, TbT);
+					     STM4(&(ro[34]), Tfw, ovs, &(ro[0]));
+					     Tfx = VFMA(LDK(KP831469612), TbQ, TbP);
+					     STM4(&(ro[58]), Tfx, ovs, &(ro[0]));
+					     Tfy = VFNMS(LDK(KP831469612), TbQ, TbP);
+					     STM4(&(ro[26]), Tfy, ovs, &(ro[0]));
+					     Tfz = VFMA(LDK(KP831469612), TbO, TbL);
+					     STM4(&(io[10]), Tfz, ovs, &(io[0]));
+					     TfA = VFNMS(LDK(KP831469612), TbO, TbL);
+					     STM4(&(io[42]), TfA, ovs, &(io[0]));
+					     TfB = VFMA(LDK(KP831469612), TbK, TbJ);
+					     STM4(&(io[58]), TfB, ovs, &(io[0]));
+					     TfC = VFNMS(LDK(KP831469612), TbK, TbJ);
+					     STM4(&(io[26]), TfC, ovs, &(io[0]));
+					     TfD = VFMA(LDK(KP831469612), Tby, TaF);
+					     STM4(&(ro[10]), TfD, ovs, &(ro[0]));
+					     TfE = VFNMS(LDK(KP831469612), Tby, TaF);
+					     STM4(&(ro[42]), TfE, ovs, &(ro[0]));
+					     TfF = VFMA(LDK(KP980785280), Tca, Tc9);
+					     STM4(&(io[2]), TfF, ovs, &(io[0]));
+					     TfG = VFNMS(LDK(KP980785280), Tca, Tc9);
+					     STM4(&(io[34]), TfG, ovs, &(io[0]));
+					     TfH = VFNMS(LDK(KP980785280), Tc4, Tc3);
+					     STM4(&(io[50]), TfH, ovs, &(io[0]));
+					     TfI = VFMA(LDK(KP980785280), Tc4, Tc3);
+					     STM4(&(io[18]), TfI, ovs, &(io[0]));
+					}
+				   }
+			      }
+			      {
+				   V T70, T6X, T7h, T6F, T7x, T7m, T7w, T7p, T7s, T6M, T7c, T6U, T7r, T75, T7i;
+				   V T78, T7b, T6N;
+				   {
+					V T6T, T6Q, T77, T6I, T6L, T76, T73, T74;
+					{
+					     V T6D, T6E, T7k, T7l, T7n, T7o;
+					     T3f = VFMA(LDK(KP707106781), T3e, T37);
+					     T6D = VFNMS(LDK(KP707106781), T3e, T37);
+					     T6E = VADD(T65, T64);
+					     T66 = VSUB(T64, T65);
+					     T6T = VFNMS(LDK(KP923879532), T6S, T6R);
+					     T7k = VFMA(LDK(KP923879532), T6S, T6R);
+					     T7l = VFMA(LDK(KP923879532), T6P, T6O);
+					     T6Q = VFNMS(LDK(KP923879532), T6P, T6O);
+					     T70 = VFNMS(LDK(KP923879532), T6Z, T6Y);
+					     T7n = VFMA(LDK(KP923879532), T6Z, T6Y);
+					     T7o = VFMA(LDK(KP923879532), T6W, T6V);
+					     T6X = VFNMS(LDK(KP923879532), T6W, T6V);
+					     T77 = VFNMS(LDK(KP198912367), T6G, T6H);
+					     T6I = VFMA(LDK(KP198912367), T6H, T6G);
+					     TfJ = VFMA(LDK(KP980785280), Tc8, Tc5);
+					     STM4(&(ro[18]), TfJ, ovs, &(ro[0]));
+					     TfK = VFNMS(LDK(KP980785280), Tc8, Tc5);
+					     STM4(&(ro[50]), TfK, ovs, &(ro[0]));
+					     T7h = VFMA(LDK(KP923879532), T6E, T6D);
+					     T6F = VFNMS(LDK(KP923879532), T6E, T6D);
+					     T7x = VFNMS(LDK(KP098491403), T7k, T7l);
+					     T7m = VFMA(LDK(KP098491403), T7l, T7k);
+					     T7w = VFMA(LDK(KP098491403), T7n, T7o);
+					     T7p = VFNMS(LDK(KP098491403), T7o, T7n);
+					     T6L = VFNMS(LDK(KP198912367), T6K, T6J);
+					     T76 = VFMA(LDK(KP198912367), T6J, T6K);
+					}
+					T63 = VFMA(LDK(KP707106781), T62, T5Z);
+					T73 = VFNMS(LDK(KP707106781), T62, T5Z);
+					T74 = VADD(T3m, T3t);
+					T3u = VSUB(T3m, T3t);
+					T7s = VADD(T6I, T6L);
+					T6M = VSUB(T6I, T6L);
+					T7c = VFNMS(LDK(KP820678790), T6Q, T6T);
+					T6U = VFMA(LDK(KP820678790), T6T, T6Q);
+					T7r = VFMA(LDK(KP923879532), T74, T73);
+					T75 = VFNMS(LDK(KP923879532), T74, T73);
+					T7i = VADD(T77, T76);
+					T78 = VSUB(T76, T77);
+				   }
+				   T7b = VFNMS(LDK(KP980785280), T6M, T6F);
+				   T6N = VFMA(LDK(KP980785280), T6M, T6F);
+				   {
+					V T7u, T7q, T7v, T7t, T7A, T7y, T7j, T7z, T7f, T79, T71, T7d;
+					T7u = VADD(T7m, T7p);
+					T7q = VSUB(T7m, T7p);
+					T7v = VFNMS(LDK(KP980785280), T7s, T7r);
+					T7t = VFMA(LDK(KP980785280), T7s, T7r);
+					T7A = VADD(T7x, T7w);
+					T7y = VSUB(T7w, T7x);
+					T7j = VFNMS(LDK(KP980785280), T7i, T7h);
+					T7z = VFMA(LDK(KP980785280), T7i, T7h);
+					T7f = VFMA(LDK(KP980785280), T78, T75);
+					T79 = VFNMS(LDK(KP980785280), T78, T75);
+					T71 = VFNMS(LDK(KP820678790), T70, T6X);
+					T7d = VFMA(LDK(KP820678790), T6X, T70);
+					{
+					     V T7g, T7e, T72, T7a;
+					     TfL = VFMA(LDK(KP995184726), T7y, T7v);
+					     STM4(&(io[15]), TfL, ovs, &(io[1]));
+					     TfM = VFNMS(LDK(KP995184726), T7y, T7v);
+					     STM4(&(io[47]), TfM, ovs, &(io[1]));
+					     TfN = VFMA(LDK(KP995184726), T7q, T7j);
+					     STM4(&(ro[15]), TfN, ovs, &(ro[1]));
+					     TfO = VFNMS(LDK(KP995184726), T7q, T7j);
+					     STM4(&(ro[47]), TfO, ovs, &(ro[1]));
+					     T7g = VADD(T7c, T7d);
+					     T7e = VSUB(T7c, T7d);
+					     T72 = VADD(T6U, T71);
+					     T7a = VSUB(T71, T6U);
+					     TfP = VFNMS(LDK(KP995184726), T7u, T7t);
+					     STM4(&(io[31]), TfP, ovs, &(io[1]));
+					     TfQ = VFMA(LDK(KP995184726), T7u, T7t);
+					     STM4(&(io[63]), TfQ, ovs, &(io[1]));
+					     TfR = VFMA(LDK(KP773010453), T7e, T7b);
+					     STM4(&(ro[23]), TfR, ovs, &(ro[1]));
+					     TfS = VFNMS(LDK(KP773010453), T7e, T7b);
+					     STM4(&(ro[55]), TfS, ovs, &(ro[1]));
+					     TfT = VFMA(LDK(KP773010453), T7g, T7f);
+					     STM4(&(io[7]), TfT, ovs, &(io[1]));
+					     TfU = VFNMS(LDK(KP773010453), T7g, T7f);
+					     STM4(&(io[39]), TfU, ovs, &(io[1]));
+					     TfV = VFMA(LDK(KP773010453), T7a, T79);
+					     STM4(&(io[23]), TfV, ovs, &(io[1]));
+					     TfW = VFNMS(LDK(KP773010453), T7a, T79);
+					     STM4(&(io[55]), TfW, ovs, &(io[1]));
+					     TfX = VFMA(LDK(KP773010453), T72, T6N);
+					     STM4(&(ro[7]), TfX, ovs, &(ro[1]));
+					     TfY = VFNMS(LDK(KP773010453), T72, T6N);
+					     STM4(&(ro[39]), TfY, ovs, &(ro[1]));
+					     TfZ = VFNMS(LDK(KP995184726), T7A, T7z);
+					     STM4(&(ro[31]), TfZ, ovs, &(ro[1]));
+					     Tg0 = VFMA(LDK(KP995184726), T7A, T7z);
+					     STM4(&(ro[63]), Tg0, ovs, &(ro[1]));
+					}
+				   }
+			      }
+			      {
+				   V T7D, T8K, T8H, T7K, Ta8, Ta7, Tae, Tad;
+				   {
+					V T9x, T9u, T9E, T9B, T9L, T9K, T9V, T9j, Tab, Ta0, Taa, Ta3, Ta6, T9q, T9H;
+					V T9I;
+					{
+					     V T9h, T9i, T9Y, T9Z, Ta1, Ta2, T9m, T9p;
+					     T7D = VFMA(LDK(KP707106781), T7C, T7B);
+					     T9h = VFNMS(LDK(KP707106781), T7C, T7B);
+					     T9i = VSUB(T8I, T8J);
+					     T8K = VADD(T8I, T8J);
+					     T9x = VFNMS(LDK(KP923879532), T9w, T9v);
+					     T9Y = VFMA(LDK(KP923879532), T9w, T9v);
+					     T9Z = VFMA(LDK(KP923879532), T9t, T9s);
+					     T9u = VFNMS(LDK(KP923879532), T9t, T9s);
+					     T9E = VFNMS(LDK(KP923879532), T9D, T9C);
+					     Ta1 = VFMA(LDK(KP923879532), T9D, T9C);
+					     Ta2 = VFMA(LDK(KP923879532), T9A, T9z);
+					     T9B = VFNMS(LDK(KP923879532), T9A, T9z);
+					     T9L = VFNMS(LDK(KP668178637), T9k, T9l);
+					     T9m = VFMA(LDK(KP668178637), T9l, T9k);
+					     T9p = VFNMS(LDK(KP668178637), T9o, T9n);
+					     T9K = VFMA(LDK(KP668178637), T9n, T9o);
+					     T9V = VFNMS(LDK(KP923879532), T9i, T9h);
+					     T9j = VFMA(LDK(KP923879532), T9i, T9h);
+					     Tab = VFNMS(LDK(KP303346683), T9Y, T9Z);
+					     Ta0 = VFMA(LDK(KP303346683), T9Z, T9Y);
+					     Taa = VFMA(LDK(KP303346683), Ta1, Ta2);
+					     Ta3 = VFNMS(LDK(KP303346683), Ta2, Ta1);
+					     Ta6 = VADD(T9m, T9p);
+					     T9q = VSUB(T9m, T9p);
+					     T8H = VFMA(LDK(KP707106781), T8G, T8F);
+					     T9H = VFNMS(LDK(KP707106781), T8G, T8F);
+					     T9I = VSUB(T7J, T7G);
+					     T7K = VADD(T7G, T7J);
+					}
+					{
+					     V T9P, T9r, T9Q, T9y, Ta5, T9J, T9W, T9M, T9R, T9F;
+					     T9P = VFNMS(LDK(KP831469612), T9q, T9j);
+					     T9r = VFMA(LDK(KP831469612), T9q, T9j);
+					     T9Q = VFNMS(LDK(KP534511135), T9u, T9x);
+					     T9y = VFMA(LDK(KP534511135), T9x, T9u);
+					     Ta5 = VFNMS(LDK(KP923879532), T9I, T9H);
+					     T9J = VFMA(LDK(KP923879532), T9I, T9H);
+					     T9W = VADD(T9L, T9K);
+					     T9M = VSUB(T9K, T9L);
+					     T9R = VFMA(LDK(KP534511135), T9B, T9E);
+					     T9F = VFNMS(LDK(KP534511135), T9E, T9B);
+					     {
+						  V T9T, T9N, T9U, T9S, T9G, T9O;
+						  {
+						       V Ta4, Ta9, Tac, T9X;
+						       Ta8 = VADD(Ta0, Ta3);
+						       Ta4 = VSUB(Ta0, Ta3);
+						       Ta9 = VFNMS(LDK(KP831469612), Ta6, Ta5);
+						       Ta7 = VFMA(LDK(KP831469612), Ta6, Ta5);
+						       Tae = VADD(Tab, Taa);
+						       Tac = VSUB(Taa, Tab);
+						       T9X = VFNMS(LDK(KP831469612), T9W, T9V);
+						       Tad = VFMA(LDK(KP831469612), T9W, T9V);
+						       T9T = VFMA(LDK(KP831469612), T9M, T9J);
+						       T9N = VFNMS(LDK(KP831469612), T9M, T9J);
+						       T9U = VADD(T9Q, T9R);
+						       T9S = VSUB(T9Q, T9R);
+						       T9G = VADD(T9y, T9F);
+						       T9O = VSUB(T9F, T9y);
+						       {
+							    V Tg1, Tg2, Tg3, Tg4;
+							    Tg1 = VFNMS(LDK(KP956940335), Tac, Ta9);
+							    STM4(&(io[45]), Tg1, ovs, &(io[1]));
+							    STN4(&(io[44]), Tf6, Tg1, Tfq, TfM, ovs);
+							    Tg2 = VFMA(LDK(KP956940335), Ta4, T9X);
+							    STM4(&(ro[13]), Tg2, ovs, &(ro[1]));
+							    STN4(&(ro[12]), Tf9, Tg2, Tff, TfN, ovs);
+							    Tg3 = VFNMS(LDK(KP956940335), Ta4, T9X);
+							    STM4(&(ro[45]), Tg3, ovs, &(ro[1]));
+							    STN4(&(ro[44]), Tfa, Tg3, Tfg, TfO, ovs);
+							    Tg4 = VFMA(LDK(KP956940335), Tac, Ta9);
+							    STM4(&(io[13]), Tg4, ovs, &(io[1]));
+							    STN4(&(io[12]), Tf5, Tg4, Tfp, TfL, ovs);
+						       }
+						  }
+						  {
+						       V Tg5, Tg6, Tg7, Tg8;
+						       Tg5 = VFMA(LDK(KP881921264), T9S, T9P);
+						       STM4(&(ro[21]), Tg5, ovs, &(ro[1]));
+						       STN4(&(ro[20]), Tfd, Tg5, Tfh, TfR, ovs);
+						       Tg6 = VFNMS(LDK(KP881921264), T9S, T9P);
+						       STM4(&(ro[53]), Tg6, ovs, &(ro[1]));
+						       STN4(&(ro[52]), Tfe, Tg6, Tfi, TfS, ovs);
+						       Tg7 = VFMA(LDK(KP881921264), T9U, T9T);
+						       STM4(&(io[5]), Tg7, ovs, &(io[1]));
+						       STN4(&(io[4]), TeZ, Tg7, Tfj, TfT, ovs);
+						       Tg8 = VFNMS(LDK(KP881921264), T9U, T9T);
+						       STM4(&(io[37]), Tg8, ovs, &(io[1]));
+						       STN4(&(io[36]), Tf0, Tg8, Tfk, TfU, ovs);
+						       {
+							    V Tg9, Tga, Tgb, Tgc;
+							    Tg9 = VFMA(LDK(KP881921264), T9O, T9N);
+							    STM4(&(io[21]), Tg9, ovs, &(io[1]));
+							    STN4(&(io[20]), Tfb, Tg9, Tfl, TfV, ovs);
+							    Tga = VFNMS(LDK(KP881921264), T9O, T9N);
+							    STM4(&(io[53]), Tga, ovs, &(io[1]));
+							    STN4(&(io[52]), Tfc, Tga, Tfm, TfW, ovs);
+							    Tgb = VFMA(LDK(KP881921264), T9G, T9r);
+							    STM4(&(ro[5]), Tgb, ovs, &(ro[1]));
+							    STN4(&(ro[4]), Tf1, Tgb, Tfn, TfX, ovs);
+							    Tgc = VFNMS(LDK(KP881921264), T9G, T9r);
+							    STM4(&(ro[37]), Tgc, ovs, &(ro[1]));
+							    STN4(&(ro[36]), Tf2, Tgc, Tfo, TfY, ovs);
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					V Tgh, Tgi, Tgl, Tgm, Tgn, Tgo, Tgp, Tgq, Tgr, Tgs, Tgt, Tgu;
+					{
+					     V T5U, T6j, T3v, T6y, T6o, T5H, T69, T68, T6z, T6r, T6u, T48, T6f, T52, T6t;
+					     V T67, T6h, T49;
+					     {
+						  V T51, T4O, T6p, T6q, T3O, T47, T6m, T6n;
+						  T51 = VFNMS(LDK(KP923879532), T50, T4X);
+						  T6m = VFMA(LDK(KP923879532), T50, T4X);
+						  T6n = VFMA(LDK(KP923879532), T4N, T4q);
+						  T4O = VFNMS(LDK(KP923879532), T4N, T4q);
+						  T5U = VFNMS(LDK(KP923879532), T5T, T5Q);
+						  T6p = VFMA(LDK(KP923879532), T5T, T5Q);
+						  {
+						       V Tgd, Tge, Tgf, Tgg;
+						       Tgd = VFMA(LDK(KP956940335), Ta8, Ta7);
+						       STM4(&(io[61]), Tgd, ovs, &(io[1]));
+						       STN4(&(io[60]), Tf7, Tgd, Tfs, TfQ, ovs);
+						       Tge = VFNMS(LDK(KP956940335), Ta8, Ta7);
+						       STM4(&(io[29]), Tge, ovs, &(io[1]));
+						       STN4(&(io[28]), Tf8, Tge, Tfr, TfP, ovs);
+						       Tgf = VFMA(LDK(KP956940335), Tae, Tad);
+						       STM4(&(ro[61]), Tgf, ovs, &(ro[1]));
+						       STN4(&(ro[60]), Tf3, Tgf, Tft, Tg0, ovs);
+						       Tgg = VFNMS(LDK(KP956940335), Tae, Tad);
+						       STM4(&(ro[29]), Tgg, ovs, &(ro[1]));
+						       STN4(&(ro[28]), Tf4, Tgg, Tfu, TfZ, ovs);
+						       T6j = VFMA(LDK(KP923879532), T3u, T3f);
+						       T3v = VFNMS(LDK(KP923879532), T3u, T3f);
+						       T6y = VFNMS(LDK(KP303346683), T6m, T6n);
+						       T6o = VFMA(LDK(KP303346683), T6n, T6m);
+						       T6q = VFMA(LDK(KP923879532), T5G, T5j);
+						       T5H = VFNMS(LDK(KP923879532), T5G, T5j);
+						  }
+						  T69 = VFMA(LDK(KP668178637), T3G, T3N);
+						  T3O = VFNMS(LDK(KP668178637), T3N, T3G);
+						  T47 = VFMA(LDK(KP668178637), T46, T3Z);
+						  T68 = VFNMS(LDK(KP668178637), T3Z, T46);
+						  T6z = VFMA(LDK(KP303346683), T6p, T6q);
+						  T6r = VFNMS(LDK(KP303346683), T6q, T6p);
+						  T6u = VADD(T3O, T47);
+						  T48 = VSUB(T3O, T47);
+						  T6f = VFNMS(LDK(KP534511135), T4O, T51);
+						  T52 = VFMA(LDK(KP534511135), T51, T4O);
+						  T6t = VFMA(LDK(KP923879532), T66, T63);
+						  T67 = VFNMS(LDK(KP923879532), T66, T63);
+					     }
+					     T6h = VFNMS(LDK(KP831469612), T48, T3v);
+					     T49 = VFMA(LDK(KP831469612), T48, T3v);
+					     {
+						  V T6w, T6s, T6B, T6v, T6A, T6C, T6k, T6a, T6e, T5V;
+						  T6w = VSUB(T6r, T6o);
+						  T6s = VADD(T6o, T6r);
+						  T6B = VFMA(LDK(KP831469612), T6u, T6t);
+						  T6v = VFNMS(LDK(KP831469612), T6u, T6t);
+						  T6A = VSUB(T6y, T6z);
+						  T6C = VADD(T6y, T6z);
+						  T6k = VADD(T69, T68);
+						  T6a = VSUB(T68, T69);
+						  T6e = VFMA(LDK(KP534511135), T5H, T5U);
+						  T5V = VFNMS(LDK(KP534511135), T5U, T5H);
+						  Tgh = VFMA(LDK(KP956940335), T6C, T6B);
+						  STM4(&(io[3]), Tgh, ovs, &(io[1]));
+						  Tgi = VFNMS(LDK(KP956940335), T6C, T6B);
+						  STM4(&(io[35]), Tgi, ovs, &(io[1]));
+						  {
+						       V T6l, T6x, T6d, T6b;
+						       T6l = VFMA(LDK(KP831469612), T6k, T6j);
+						       T6x = VFNMS(LDK(KP831469612), T6k, T6j);
+						       T6d = VFMA(LDK(KP831469612), T6a, T67);
+						       T6b = VFNMS(LDK(KP831469612), T6a, T67);
+						       {
+							    V T6g, T6i, T5W, T6c;
+							    T6g = VSUB(T6e, T6f);
+							    T6i = VADD(T6f, T6e);
+							    T5W = VSUB(T52, T5V);
+							    T6c = VADD(T52, T5V);
+							    Tgj = VFMA(LDK(KP956940335), T6w, T6v);
+							    STM4(&(io[19]), Tgj, ovs, &(io[1]));
+							    Tgk = VFNMS(LDK(KP956940335), T6w, T6v);
+							    STM4(&(io[51]), Tgk, ovs, &(io[1]));
+							    Tgl = VFMA(LDK(KP956940335), T6s, T6l);
+							    STM4(&(ro[3]), Tgl, ovs, &(ro[1]));
+							    Tgm = VFNMS(LDK(KP956940335), T6s, T6l);
+							    STM4(&(ro[35]), Tgm, ovs, &(ro[1]));
+							    Tgn = VFMA(LDK(KP881921264), T6i, T6h);
+							    STM4(&(ro[59]), Tgn, ovs, &(ro[1]));
+							    Tgo = VFNMS(LDK(KP881921264), T6i, T6h);
+							    STM4(&(ro[27]), Tgo, ovs, &(ro[1]));
+							    Tgp = VFMA(LDK(KP881921264), T6g, T6d);
+							    STM4(&(io[11]), Tgp, ovs, &(io[1]));
+							    Tgq = VFNMS(LDK(KP881921264), T6g, T6d);
+							    STM4(&(io[43]), Tgq, ovs, &(io[1]));
+							    Tgr = VFMA(LDK(KP881921264), T6c, T6b);
+							    STM4(&(io[59]), Tgr, ovs, &(io[1]));
+							    Tgs = VFNMS(LDK(KP881921264), T6c, T6b);
+							    STM4(&(io[27]), Tgs, ovs, &(io[1]));
+							    Tgt = VFMA(LDK(KP881921264), T5W, T49);
+							    STM4(&(ro[11]), Tgt, ovs, &(ro[1]));
+							    Tgu = VFNMS(LDK(KP881921264), T5W, T49);
+							    STM4(&(ro[43]), Tgu, ovs, &(ro[1]));
+							    Tgv = VFNMS(LDK(KP956940335), T6A, T6x);
+							    STM4(&(ro[51]), Tgv, ovs, &(ro[1]));
+							    Tgw = VFMA(LDK(KP956940335), T6A, T6x);
+							    STM4(&(ro[19]), Tgw, ovs, &(ro[1]));
+						       }
+						  }
+					     }
+					}
+					{
+					     V T8j, T8c, T8C, T8v, T8N, T8M, T8X, T7L, T9c, T92, T9d, T95, T98, T80;
+					     {
+						  V T90, T91, T93, T94, T7S, T7Z;
+						  T8j = VFNMS(LDK(KP923879532), T8i, T8f);
+						  T90 = VFMA(LDK(KP923879532), T8i, T8f);
+						  T91 = VFMA(LDK(KP923879532), T8b, T84);
+						  T8c = VFNMS(LDK(KP923879532), T8b, T84);
+						  T8C = VFNMS(LDK(KP923879532), T8B, T8y);
+						  T93 = VFMA(LDK(KP923879532), T8B, T8y);
+						  T94 = VFMA(LDK(KP923879532), T8u, T8n);
+						  T8v = VFNMS(LDK(KP923879532), T8u, T8n);
+						  T8N = VFMA(LDK(KP198912367), T7O, T7R);
+						  T7S = VFNMS(LDK(KP198912367), T7R, T7O);
+						  T7Z = VFMA(LDK(KP198912367), T7Y, T7V);
+						  T8M = VFNMS(LDK(KP198912367), T7V, T7Y);
+						  T8X = VFMA(LDK(KP923879532), T7K, T7D);
+						  T7L = VFNMS(LDK(KP923879532), T7K, T7D);
+						  T9c = VFNMS(LDK(KP098491403), T90, T91);
+						  T92 = VFMA(LDK(KP098491403), T91, T90);
+						  T9d = VFMA(LDK(KP098491403), T93, T94);
+						  T95 = VFNMS(LDK(KP098491403), T94, T93);
+						  T98 = VADD(T7S, T7Z);
+						  T80 = VSUB(T7S, T7Z);
+					     }
+					     {
+						  V T8V, T81, T8T, T8k, T97, T8L, T8Y, T8O, T8S, T8D;
+						  T8V = VFNMS(LDK(KP980785280), T80, T7L);
+						  T81 = VFMA(LDK(KP980785280), T80, T7L);
+						  T8T = VFNMS(LDK(KP820678790), T8c, T8j);
+						  T8k = VFMA(LDK(KP820678790), T8j, T8c);
+						  T97 = VFMA(LDK(KP923879532), T8K, T8H);
+						  T8L = VFNMS(LDK(KP923879532), T8K, T8H);
+						  T8Y = VADD(T8N, T8M);
+						  T8O = VSUB(T8M, T8N);
+						  T8S = VFMA(LDK(KP820678790), T8v, T8C);
+						  T8D = VFNMS(LDK(KP820678790), T8C, T8v);
+						  {
+						       V T8R, T8P, T8U, T8W, T8E, T8Q;
+						       {
+							    V T96, T9f, T9g, T8Z;
+							    T9a = VSUB(T95, T92);
+							    T96 = VADD(T92, T95);
+							    T9f = VFMA(LDK(KP980785280), T98, T97);
+							    T99 = VFNMS(LDK(KP980785280), T98, T97);
+							    T9e = VSUB(T9c, T9d);
+							    T9g = VADD(T9c, T9d);
+							    T8Z = VFMA(LDK(KP980785280), T8Y, T8X);
+							    T9b = VFNMS(LDK(KP980785280), T8Y, T8X);
+							    T8R = VFMA(LDK(KP980785280), T8O, T8L);
+							    T8P = VFNMS(LDK(KP980785280), T8O, T8L);
+							    T8U = VSUB(T8S, T8T);
+							    T8W = VADD(T8T, T8S);
+							    T8E = VSUB(T8k, T8D);
+							    T8Q = VADD(T8k, T8D);
+							    {
+								 V Tgx, Tgy, Tgz, TgA;
+								 Tgx = VFNMS(LDK(KP995184726), T9g, T9f);
+								 STM4(&(io[33]), Tgx, ovs, &(io[1]));
+								 STN4(&(io[32]), TeO, Tgx, TfG, Tgi, ovs);
+								 Tgy = VFMA(LDK(KP995184726), T96, T8Z);
+								 STM4(&(ro[1]), Tgy, ovs, &(ro[1]));
+								 STN4(&(ro[0]), TeL, Tgy, Tfv, Tgl, ovs);
+								 Tgz = VFNMS(LDK(KP995184726), T96, T8Z);
+								 STM4(&(ro[33]), Tgz, ovs, &(ro[1]));
+								 STN4(&(ro[32]), TeM, Tgz, Tfw, Tgm, ovs);
+								 TgA = VFMA(LDK(KP995184726), T9g, T9f);
+								 STM4(&(io[1]), TgA, ovs, &(io[1]));
+								 STN4(&(io[0]), TeN, TgA, TfF, Tgh, ovs);
+							    }
+						       }
+						       {
+							    V TgB, TgC, TgD, TgE;
+							    TgB = VFMA(LDK(KP773010453), T8W, T8V);
+							    STM4(&(ro[57]), TgB, ovs, &(ro[1]));
+							    STN4(&(ro[56]), TeS, TgB, Tfx, Tgn, ovs);
+							    TgC = VFNMS(LDK(KP773010453), T8W, T8V);
+							    STM4(&(ro[25]), TgC, ovs, &(ro[1]));
+							    STN4(&(ro[24]), TeR, TgC, Tfy, Tgo, ovs);
+							    TgD = VFMA(LDK(KP773010453), T8U, T8R);
+							    STM4(&(io[9]), TgD, ovs, &(io[1]));
+							    STN4(&(io[8]), TeT, TgD, Tfz, Tgp, ovs);
+							    TgE = VFNMS(LDK(KP773010453), T8U, T8R);
+							    STM4(&(io[41]), TgE, ovs, &(io[1]));
+							    STN4(&(io[40]), TeU, TgE, TfA, Tgq, ovs);
+							    {
+								 V TgF, TgG, TgH, TgI;
+								 TgF = VFMA(LDK(KP773010453), T8Q, T8P);
+								 STM4(&(io[57]), TgF, ovs, &(io[1]));
+								 STN4(&(io[56]), TeW, TgF, TfB, Tgr, ovs);
+								 TgG = VFNMS(LDK(KP773010453), T8Q, T8P);
+								 STM4(&(io[25]), TgG, ovs, &(io[1]));
+								 STN4(&(io[24]), TeV, TgG, TfC, Tgs, ovs);
+								 TgH = VFMA(LDK(KP773010453), T8E, T81);
+								 STM4(&(ro[9]), TgH, ovs, &(ro[1]));
+								 STN4(&(ro[8]), TeX, TgH, TfD, Tgt, ovs);
+								 TgI = VFNMS(LDK(KP773010453), T8E, T81);
+								 STM4(&(ro[41]), TgI, ovs, &(ro[1]));
+								 STN4(&(ro[40]), TeY, TgI, TfE, Tgu, ovs);
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V TgJ, TgK, TgL, TgM;
+		    TgJ = VFMA(LDK(KP995184726), T9a, T99);
+		    STM4(&(io[17]), TgJ, ovs, &(io[1]));
+		    STN4(&(io[16]), TeQ, TgJ, TfI, Tgj, ovs);
+		    TgK = VFNMS(LDK(KP995184726), T9a, T99);
+		    STM4(&(io[49]), TgK, ovs, &(io[1]));
+		    STN4(&(io[48]), TeP, TgK, TfH, Tgk, ovs);
+		    TgL = VFMA(LDK(KP995184726), T9e, T9b);
+		    STM4(&(ro[17]), TgL, ovs, &(ro[1]));
+		    STN4(&(ro[16]), TeK, TgL, TfJ, Tgw, ovs);
+		    TgM = VFNMS(LDK(KP995184726), T9e, T9b);
+		    STM4(&(ro[49]), TgM, ovs, &(ro[1]));
+		    STN4(&(ro[48]), TeJ, TgM, TfK, Tgv, ovs);
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2sv_64"), {520, 0, 392, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_64) (planner *p) {
+     X(kdft_register) (p, n2sv_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2sv_64 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 912 FP additions, 248 FP multiplications,
+ * (or, 808 additions, 144 multiplications, 104 fused multiply/add),
+ * 260 stack variables, 15 constants, and 288 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(256, is), MAKE_VOLATILE_STRIDE(256, os)) {
+	       V T37, T7B, T8F, T5Z, Tf, Td9, TbB, TcB, T62, T7C, T2i, TdH, Tah, Tcb, T3e;
+	       V T8G, Tu, TdI, Tak, TbD, Tan, TbC, T2x, Tda, T3m, T65, T7G, T8J, T7J, T8I;
+	       V T3t, T64, TK, Tdd, Tas, Tce, Tav, Tcf, T2N, Tdc, T3G, T6G, T7O, T9k, T7R;
+	       V T9l, T3N, T6H, T1L, Tdv, Tbs, Tcw, TdC, Teo, T5j, T6V, T5Q, T6Y, T8y, T9C;
+	       V Tbb, Tct, T8n, T9z, TZ, Tdf, Taz, Tch, TaC, Tci, T32, Tdg, T3Z, T6J, T7V;
+	       V T9n, T7Y, T9o, T46, T6K, T1g, Tdp, Tb1, Tcm, Tdm, Tej, T4q, T6R, T4X, T6O;
+	       V T8f, T9s, TaK, Tcp, T84, T9v, T1v, Tdn, Tb4, Tcq, Tds, Tek, T4N, T6P, T50;
+	       V T6S, T8i, T9w, TaV, Tcn, T8b, T9t, T20, TdD, Tbv, Tcu, Tdy, Tep, T5G, T6Z;
+	       V T5T, T6W, T8B, T9A, Tbm, Tcx, T8u, T9D;
+	       {
+		    V T3, T35, T26, T5Y, T6, T5X, T29, T36, Ta, T39, T2d, T38, Td, T3b, T2g;
+		    V T3c;
+		    {
+			 V T1, T2, T24, T25;
+			 T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			 T2 = LD(&(ri[WS(is, 32)]), ivs, &(ri[0]));
+			 T3 = VADD(T1, T2);
+			 T35 = VSUB(T1, T2);
+			 T24 = LD(&(ii[0]), ivs, &(ii[0]));
+			 T25 = LD(&(ii[WS(is, 32)]), ivs, &(ii[0]));
+			 T26 = VADD(T24, T25);
+			 T5Y = VSUB(T24, T25);
+		    }
+		    {
+			 V T4, T5, T27, T28;
+			 T4 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
+			 T5 = LD(&(ri[WS(is, 48)]), ivs, &(ri[0]));
+			 T6 = VADD(T4, T5);
+			 T5X = VSUB(T4, T5);
+			 T27 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
+			 T28 = LD(&(ii[WS(is, 48)]), ivs, &(ii[0]));
+			 T29 = VADD(T27, T28);
+			 T36 = VSUB(T27, T28);
+		    }
+		    {
+			 V T8, T9, T2b, T2c;
+			 T8 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
+			 T9 = LD(&(ri[WS(is, 40)]), ivs, &(ri[0]));
+			 Ta = VADD(T8, T9);
+			 T39 = VSUB(T8, T9);
+			 T2b = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
+			 T2c = LD(&(ii[WS(is, 40)]), ivs, &(ii[0]));
+			 T2d = VADD(T2b, T2c);
+			 T38 = VSUB(T2b, T2c);
+		    }
+		    {
+			 V Tb, Tc, T2e, T2f;
+			 Tb = LD(&(ri[WS(is, 56)]), ivs, &(ri[0]));
+			 Tc = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
+			 Td = VADD(Tb, Tc);
+			 T3b = VSUB(Tb, Tc);
+			 T2e = LD(&(ii[WS(is, 56)]), ivs, &(ii[0]));
+			 T2f = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
+			 T2g = VADD(T2e, T2f);
+			 T3c = VSUB(T2e, T2f);
+		    }
+		    {
+			 V T7, Te, T2a, T2h;
+			 T37 = VSUB(T35, T36);
+			 T7B = VADD(T35, T36);
+			 T8F = VSUB(T5Y, T5X);
+			 T5Z = VADD(T5X, T5Y);
+			 T7 = VADD(T3, T6);
+			 Te = VADD(Ta, Td);
+			 Tf = VADD(T7, Te);
+			 Td9 = VSUB(T7, Te);
+			 {
+			      V Tbz, TbA, T60, T61;
+			      Tbz = VSUB(T26, T29);
+			      TbA = VSUB(Td, Ta);
+			      TbB = VSUB(Tbz, TbA);
+			      TcB = VADD(TbA, Tbz);
+			      T60 = VSUB(T3b, T3c);
+			      T61 = VADD(T39, T38);
+			      T62 = VMUL(LDK(KP707106781), VSUB(T60, T61));
+			      T7C = VMUL(LDK(KP707106781), VADD(T61, T60));
+			 }
+			 T2a = VADD(T26, T29);
+			 T2h = VADD(T2d, T2g);
+			 T2i = VADD(T2a, T2h);
+			 TdH = VSUB(T2a, T2h);
+			 {
+			      V Taf, Tag, T3a, T3d;
+			      Taf = VSUB(T3, T6);
+			      Tag = VSUB(T2d, T2g);
+			      Tah = VSUB(Taf, Tag);
+			      Tcb = VADD(Taf, Tag);
+			      T3a = VSUB(T38, T39);
+			      T3d = VADD(T3b, T3c);
+			      T3e = VMUL(LDK(KP707106781), VSUB(T3a, T3d));
+			      T8G = VMUL(LDK(KP707106781), VADD(T3a, T3d));
+			 }
+		    }
+	       }
+	       {
+		    V Ti, T3j, T2l, T3h, Tl, T3g, T2o, T3k, Tp, T3q, T2s, T3o, Ts, T3n, T2v;
+		    V T3r;
+		    {
+			 V Tg, Th, T2j, T2k;
+			 Tg = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+			 Th = LD(&(ri[WS(is, 36)]), ivs, &(ri[0]));
+			 Ti = VADD(Tg, Th);
+			 T3j = VSUB(Tg, Th);
+			 T2j = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			 T2k = LD(&(ii[WS(is, 36)]), ivs, &(ii[0]));
+			 T2l = VADD(T2j, T2k);
+			 T3h = VSUB(T2j, T2k);
+		    }
+		    {
+			 V Tj, Tk, T2m, T2n;
+			 Tj = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
+			 Tk = LD(&(ri[WS(is, 52)]), ivs, &(ri[0]));
+			 Tl = VADD(Tj, Tk);
+			 T3g = VSUB(Tj, Tk);
+			 T2m = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
+			 T2n = LD(&(ii[WS(is, 52)]), ivs, &(ii[0]));
+			 T2o = VADD(T2m, T2n);
+			 T3k = VSUB(T2m, T2n);
+		    }
+		    {
+			 V Tn, To, T2q, T2r;
+			 Tn = LD(&(ri[WS(is, 60)]), ivs, &(ri[0]));
+			 To = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
+			 Tp = VADD(Tn, To);
+			 T3q = VSUB(Tn, To);
+			 T2q = LD(&(ii[WS(is, 60)]), ivs, &(ii[0]));
+			 T2r = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
+			 T2s = VADD(T2q, T2r);
+			 T3o = VSUB(T2q, T2r);
+		    }
+		    {
+			 V Tq, Tr, T2t, T2u;
+			 Tq = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
+			 Tr = LD(&(ri[WS(is, 44)]), ivs, &(ri[0]));
+			 Ts = VADD(Tq, Tr);
+			 T3n = VSUB(Tq, Tr);
+			 T2t = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
+			 T2u = LD(&(ii[WS(is, 44)]), ivs, &(ii[0]));
+			 T2v = VADD(T2t, T2u);
+			 T3r = VSUB(T2t, T2u);
+		    }
+		    {
+			 V Tm, Tt, Tai, Taj;
+			 Tm = VADD(Ti, Tl);
+			 Tt = VADD(Tp, Ts);
+			 Tu = VADD(Tm, Tt);
+			 TdI = VSUB(Tt, Tm);
+			 Tai = VSUB(T2l, T2o);
+			 Taj = VSUB(Ti, Tl);
+			 Tak = VSUB(Tai, Taj);
+			 TbD = VADD(Taj, Tai);
+		    }
+		    {
+			 V Tal, Tam, T2p, T2w;
+			 Tal = VSUB(Tp, Ts);
+			 Tam = VSUB(T2s, T2v);
+			 Tan = VADD(Tal, Tam);
+			 TbC = VSUB(Tal, Tam);
+			 T2p = VADD(T2l, T2o);
+			 T2w = VADD(T2s, T2v);
+			 T2x = VADD(T2p, T2w);
+			 Tda = VSUB(T2p, T2w);
+		    }
+		    {
+			 V T3i, T3l, T7E, T7F;
+			 T3i = VADD(T3g, T3h);
+			 T3l = VSUB(T3j, T3k);
+			 T3m = VFNMS(LDK(KP923879532), T3l, VMUL(LDK(KP382683432), T3i));
+			 T65 = VFMA(LDK(KP923879532), T3i, VMUL(LDK(KP382683432), T3l));
+			 T7E = VSUB(T3h, T3g);
+			 T7F = VADD(T3j, T3k);
+			 T7G = VFNMS(LDK(KP382683432), T7F, VMUL(LDK(KP923879532), T7E));
+			 T8J = VFMA(LDK(KP382683432), T7E, VMUL(LDK(KP923879532), T7F));
+		    }
+		    {
+			 V T7H, T7I, T3p, T3s;
+			 T7H = VSUB(T3o, T3n);
+			 T7I = VADD(T3q, T3r);
+			 T7J = VFMA(LDK(KP923879532), T7H, VMUL(LDK(KP382683432), T7I));
+			 T8I = VFNMS(LDK(KP382683432), T7H, VMUL(LDK(KP923879532), T7I));
+			 T3p = VADD(T3n, T3o);
+			 T3s = VSUB(T3q, T3r);
+			 T3t = VFMA(LDK(KP382683432), T3p, VMUL(LDK(KP923879532), T3s));
+			 T64 = VFNMS(LDK(KP923879532), T3p, VMUL(LDK(KP382683432), T3s));
+		    }
+	       }
+	       {
+		    V Ty, T3H, T2B, T3x, TB, T3w, T2E, T3I, TI, T3L, T2L, T3B, TF, T3K, T2I;
+		    V T3E;
+		    {
+			 V Tw, Tx, T2C, T2D;
+			 Tw = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+			 Tx = LD(&(ri[WS(is, 34)]), ivs, &(ri[0]));
+			 Ty = VADD(Tw, Tx);
+			 T3H = VSUB(Tw, Tx);
+			 {
+			      V T2z, T2A, Tz, TA;
+			      T2z = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+			      T2A = LD(&(ii[WS(is, 34)]), ivs, &(ii[0]));
+			      T2B = VADD(T2z, T2A);
+			      T3x = VSUB(T2z, T2A);
+			      Tz = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
+			      TA = LD(&(ri[WS(is, 50)]), ivs, &(ri[0]));
+			      TB = VADD(Tz, TA);
+			      T3w = VSUB(Tz, TA);
+			 }
+			 T2C = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
+			 T2D = LD(&(ii[WS(is, 50)]), ivs, &(ii[0]));
+			 T2E = VADD(T2C, T2D);
+			 T3I = VSUB(T2C, T2D);
+			 {
+			      V TG, TH, T3z, T2J, T2K, T3A;
+			      TG = LD(&(ri[WS(is, 58)]), ivs, &(ri[0]));
+			      TH = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
+			      T3z = VSUB(TG, TH);
+			      T2J = LD(&(ii[WS(is, 58)]), ivs, &(ii[0]));
+			      T2K = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
+			      T3A = VSUB(T2J, T2K);
+			      TI = VADD(TG, TH);
+			      T3L = VADD(T3z, T3A);
+			      T2L = VADD(T2J, T2K);
+			      T3B = VSUB(T3z, T3A);
+			 }
+			 {
+			      V TD, TE, T3C, T2G, T2H, T3D;
+			      TD = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
+			      TE = LD(&(ri[WS(is, 42)]), ivs, &(ri[0]));
+			      T3C = VSUB(TD, TE);
+			      T2G = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
+			      T2H = LD(&(ii[WS(is, 42)]), ivs, &(ii[0]));
+			      T3D = VSUB(T2G, T2H);
+			      TF = VADD(TD, TE);
+			      T3K = VSUB(T3D, T3C);
+			      T2I = VADD(T2G, T2H);
+			      T3E = VADD(T3C, T3D);
+			 }
+		    }
+		    {
+			 V TC, TJ, Taq, Tar;
+			 TC = VADD(Ty, TB);
+			 TJ = VADD(TF, TI);
+			 TK = VADD(TC, TJ);
+			 Tdd = VSUB(TC, TJ);
+			 Taq = VSUB(T2B, T2E);
+			 Tar = VSUB(TI, TF);
+			 Tas = VSUB(Taq, Tar);
+			 Tce = VADD(Tar, Taq);
+		    }
+		    {
+			 V Tat, Tau, T2F, T2M;
+			 Tat = VSUB(Ty, TB);
+			 Tau = VSUB(T2I, T2L);
+			 Tav = VSUB(Tat, Tau);
+			 Tcf = VADD(Tat, Tau);
+			 T2F = VADD(T2B, T2E);
+			 T2M = VADD(T2I, T2L);
+			 T2N = VADD(T2F, T2M);
+			 Tdc = VSUB(T2F, T2M);
+		    }
+		    {
+			 V T3y, T3F, T7M, T7N;
+			 T3y = VADD(T3w, T3x);
+			 T3F = VMUL(LDK(KP707106781), VSUB(T3B, T3E));
+			 T3G = VSUB(T3y, T3F);
+			 T6G = VADD(T3y, T3F);
+			 T7M = VSUB(T3x, T3w);
+			 T7N = VMUL(LDK(KP707106781), VADD(T3K, T3L));
+			 T7O = VSUB(T7M, T7N);
+			 T9k = VADD(T7M, T7N);
+		    }
+		    {
+			 V T7P, T7Q, T3J, T3M;
+			 T7P = VADD(T3H, T3I);
+			 T7Q = VMUL(LDK(KP707106781), VADD(T3E, T3B));
+			 T7R = VSUB(T7P, T7Q);
+			 T9l = VADD(T7P, T7Q);
+			 T3J = VSUB(T3H, T3I);
+			 T3M = VMUL(LDK(KP707106781), VSUB(T3K, T3L));
+			 T3N = VSUB(T3J, T3M);
+			 T6H = VADD(T3J, T3M);
+		    }
+	       }
+	       {
+		    V T1z, T53, T5L, Tbo, T1C, T5I, T56, Tbp, T1J, Tb9, T5h, T5N, T1G, Tb8, T5c;
+		    V T5O;
+		    {
+			 V T1x, T1y, T54, T55;
+			 T1x = LD(&(ri[WS(is, 63)]), ivs, &(ri[WS(is, 1)]));
+			 T1y = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
+			 T1z = VADD(T1x, T1y);
+			 T53 = VSUB(T1x, T1y);
+			 {
+			      V T5J, T5K, T1A, T1B;
+			      T5J = LD(&(ii[WS(is, 63)]), ivs, &(ii[WS(is, 1)]));
+			      T5K = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
+			      T5L = VSUB(T5J, T5K);
+			      Tbo = VADD(T5J, T5K);
+			      T1A = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
+			      T1B = LD(&(ri[WS(is, 47)]), ivs, &(ri[WS(is, 1)]));
+			      T1C = VADD(T1A, T1B);
+			      T5I = VSUB(T1A, T1B);
+			 }
+			 T54 = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
+			 T55 = LD(&(ii[WS(is, 47)]), ivs, &(ii[WS(is, 1)]));
+			 T56 = VSUB(T54, T55);
+			 Tbp = VADD(T54, T55);
+			 {
+			      V T1H, T1I, T5d, T5e, T5f, T5g;
+			      T1H = LD(&(ri[WS(is, 55)]), ivs, &(ri[WS(is, 1)]));
+			      T1I = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
+			      T5d = VSUB(T1H, T1I);
+			      T5e = LD(&(ii[WS(is, 55)]), ivs, &(ii[WS(is, 1)]));
+			      T5f = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
+			      T5g = VSUB(T5e, T5f);
+			      T1J = VADD(T1H, T1I);
+			      Tb9 = VADD(T5e, T5f);
+			      T5h = VADD(T5d, T5g);
+			      T5N = VSUB(T5d, T5g);
+			 }
+			 {
+			      V T1E, T1F, T5b, T58, T59, T5a;
+			      T1E = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+			      T1F = LD(&(ri[WS(is, 39)]), ivs, &(ri[WS(is, 1)]));
+			      T5b = VSUB(T1E, T1F);
+			      T58 = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+			      T59 = LD(&(ii[WS(is, 39)]), ivs, &(ii[WS(is, 1)]));
+			      T5a = VSUB(T58, T59);
+			      T1G = VADD(T1E, T1F);
+			      Tb8 = VADD(T58, T59);
+			      T5c = VSUB(T5a, T5b);
+			      T5O = VADD(T5b, T5a);
+			 }
+		    }
+		    {
+			 V T1D, T1K, Tbq, Tbr;
+			 T1D = VADD(T1z, T1C);
+			 T1K = VADD(T1G, T1J);
+			 T1L = VADD(T1D, T1K);
+			 Tdv = VSUB(T1D, T1K);
+			 Tbq = VSUB(Tbo, Tbp);
+			 Tbr = VSUB(T1J, T1G);
+			 Tbs = VSUB(Tbq, Tbr);
+			 Tcw = VADD(Tbr, Tbq);
+		    }
+		    {
+			 V TdA, TdB, T57, T5i;
+			 TdA = VADD(Tbo, Tbp);
+			 TdB = VADD(Tb8, Tb9);
+			 TdC = VSUB(TdA, TdB);
+			 Teo = VADD(TdA, TdB);
+			 T57 = VSUB(T53, T56);
+			 T5i = VMUL(LDK(KP707106781), VSUB(T5c, T5h));
+			 T5j = VSUB(T57, T5i);
+			 T6V = VADD(T57, T5i);
+		    }
+		    {
+			 V T5M, T5P, T8w, T8x;
+			 T5M = VADD(T5I, T5L);
+			 T5P = VMUL(LDK(KP707106781), VSUB(T5N, T5O));
+			 T5Q = VSUB(T5M, T5P);
+			 T6Y = VADD(T5M, T5P);
+			 T8w = VSUB(T5L, T5I);
+			 T8x = VMUL(LDK(KP707106781), VADD(T5c, T5h));
+			 T8y = VSUB(T8w, T8x);
+			 T9C = VADD(T8w, T8x);
+		    }
+		    {
+			 V Tb7, Tba, T8l, T8m;
+			 Tb7 = VSUB(T1z, T1C);
+			 Tba = VSUB(Tb8, Tb9);
+			 Tbb = VSUB(Tb7, Tba);
+			 Tct = VADD(Tb7, Tba);
+			 T8l = VADD(T53, T56);
+			 T8m = VMUL(LDK(KP707106781), VADD(T5O, T5N));
+			 T8n = VSUB(T8l, T8m);
+			 T9z = VADD(T8l, T8m);
+		    }
+	       }
+	       {
+		    V TN, T40, T2Q, T3Q, TQ, T3P, T2T, T41, TX, T44, T30, T3U, TU, T43, T2X;
+		    V T3X;
+		    {
+			 V TL, TM, T2R, T2S;
+			 TL = LD(&(ri[WS(is, 62)]), ivs, &(ri[0]));
+			 TM = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
+			 TN = VADD(TL, TM);
+			 T40 = VSUB(TL, TM);
+			 {
+			      V T2O, T2P, TO, TP;
+			      T2O = LD(&(ii[WS(is, 62)]), ivs, &(ii[0]));
+			      T2P = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
+			      T2Q = VADD(T2O, T2P);
+			      T3Q = VSUB(T2O, T2P);
+			      TO = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
+			      TP = LD(&(ri[WS(is, 46)]), ivs, &(ri[0]));
+			      TQ = VADD(TO, TP);
+			      T3P = VSUB(TO, TP);
+			 }
+			 T2R = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
+			 T2S = LD(&(ii[WS(is, 46)]), ivs, &(ii[0]));
+			 T2T = VADD(T2R, T2S);
+			 T41 = VSUB(T2R, T2S);
+			 {
+			      V TV, TW, T3S, T2Y, T2Z, T3T;
+			      TV = LD(&(ri[WS(is, 54)]), ivs, &(ri[0]));
+			      TW = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
+			      T3S = VSUB(TV, TW);
+			      T2Y = LD(&(ii[WS(is, 54)]), ivs, &(ii[0]));
+			      T2Z = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
+			      T3T = VSUB(T2Y, T2Z);
+			      TX = VADD(TV, TW);
+			      T44 = VADD(T3S, T3T);
+			      T30 = VADD(T2Y, T2Z);
+			      T3U = VSUB(T3S, T3T);
+			 }
+			 {
+			      V TS, TT, T3V, T2V, T2W, T3W;
+			      TS = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+			      TT = LD(&(ri[WS(is, 38)]), ivs, &(ri[0]));
+			      T3V = VSUB(TS, TT);
+			      T2V = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+			      T2W = LD(&(ii[WS(is, 38)]), ivs, &(ii[0]));
+			      T3W = VSUB(T2V, T2W);
+			      TU = VADD(TS, TT);
+			      T43 = VSUB(T3W, T3V);
+			      T2X = VADD(T2V, T2W);
+			      T3X = VADD(T3V, T3W);
+			 }
+		    }
+		    {
+			 V TR, TY, Tax, Tay;
+			 TR = VADD(TN, TQ);
+			 TY = VADD(TU, TX);
+			 TZ = VADD(TR, TY);
+			 Tdf = VSUB(TR, TY);
+			 Tax = VSUB(T2Q, T2T);
+			 Tay = VSUB(TX, TU);
+			 Taz = VSUB(Tax, Tay);
+			 Tch = VADD(Tay, Tax);
+		    }
+		    {
+			 V TaA, TaB, T2U, T31;
+			 TaA = VSUB(TN, TQ);
+			 TaB = VSUB(T2X, T30);
+			 TaC = VSUB(TaA, TaB);
+			 Tci = VADD(TaA, TaB);
+			 T2U = VADD(T2Q, T2T);
+			 T31 = VADD(T2X, T30);
+			 T32 = VADD(T2U, T31);
+			 Tdg = VSUB(T2U, T31);
+		    }
+		    {
+			 V T3R, T3Y, T7T, T7U;
+			 T3R = VADD(T3P, T3Q);
+			 T3Y = VMUL(LDK(KP707106781), VSUB(T3U, T3X));
+			 T3Z = VSUB(T3R, T3Y);
+			 T6J = VADD(T3R, T3Y);
+			 T7T = VADD(T40, T41);
+			 T7U = VMUL(LDK(KP707106781), VADD(T3X, T3U));
+			 T7V = VSUB(T7T, T7U);
+			 T9n = VADD(T7T, T7U);
+		    }
+		    {
+			 V T7W, T7X, T42, T45;
+			 T7W = VSUB(T3Q, T3P);
+			 T7X = VMUL(LDK(KP707106781), VADD(T43, T44));
+			 T7Y = VSUB(T7W, T7X);
+			 T9o = VADD(T7W, T7X);
+			 T42 = VSUB(T40, T41);
+			 T45 = VMUL(LDK(KP707106781), VSUB(T43, T44));
+			 T46 = VSUB(T42, T45);
+			 T6K = VADD(T42, T45);
+		    }
+	       }
+	       {
+		    V T14, T4P, T4d, TaG, T17, T4a, T4S, TaH, T1e, TaZ, T4j, T4V, T1b, TaY, T4o;
+		    V T4U;
+		    {
+			 V T12, T13, T4Q, T4R;
+			 T12 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+			 T13 = LD(&(ri[WS(is, 33)]), ivs, &(ri[WS(is, 1)]));
+			 T14 = VADD(T12, T13);
+			 T4P = VSUB(T12, T13);
+			 {
+			      V T4b, T4c, T15, T16;
+			      T4b = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+			      T4c = LD(&(ii[WS(is, 33)]), ivs, &(ii[WS(is, 1)]));
+			      T4d = VSUB(T4b, T4c);
+			      TaG = VADD(T4b, T4c);
+			      T15 = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
+			      T16 = LD(&(ri[WS(is, 49)]), ivs, &(ri[WS(is, 1)]));
+			      T17 = VADD(T15, T16);
+			      T4a = VSUB(T15, T16);
+			 }
+			 T4Q = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
+			 T4R = LD(&(ii[WS(is, 49)]), ivs, &(ii[WS(is, 1)]));
+			 T4S = VSUB(T4Q, T4R);
+			 TaH = VADD(T4Q, T4R);
+			 {
+			      V T1c, T1d, T4f, T4g, T4h, T4i;
+			      T1c = LD(&(ri[WS(is, 57)]), ivs, &(ri[WS(is, 1)]));
+			      T1d = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
+			      T4f = VSUB(T1c, T1d);
+			      T4g = LD(&(ii[WS(is, 57)]), ivs, &(ii[WS(is, 1)]));
+			      T4h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
+			      T4i = VSUB(T4g, T4h);
+			      T1e = VADD(T1c, T1d);
+			      TaZ = VADD(T4g, T4h);
+			      T4j = VSUB(T4f, T4i);
+			      T4V = VADD(T4f, T4i);
+			 }
+			 {
+			      V T19, T1a, T4k, T4l, T4m, T4n;
+			      T19 = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
+			      T1a = LD(&(ri[WS(is, 41)]), ivs, &(ri[WS(is, 1)]));
+			      T4k = VSUB(T19, T1a);
+			      T4l = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
+			      T4m = LD(&(ii[WS(is, 41)]), ivs, &(ii[WS(is, 1)]));
+			      T4n = VSUB(T4l, T4m);
+			      T1b = VADD(T19, T1a);
+			      TaY = VADD(T4l, T4m);
+			      T4o = VADD(T4k, T4n);
+			      T4U = VSUB(T4n, T4k);
+			 }
+		    }
+		    {
+			 V T18, T1f, TaX, Tb0;
+			 T18 = VADD(T14, T17);
+			 T1f = VADD(T1b, T1e);
+			 T1g = VADD(T18, T1f);
+			 Tdp = VSUB(T18, T1f);
+			 TaX = VSUB(T14, T17);
+			 Tb0 = VSUB(TaY, TaZ);
+			 Tb1 = VSUB(TaX, Tb0);
+			 Tcm = VADD(TaX, Tb0);
+		    }
+		    {
+			 V Tdk, Tdl, T4e, T4p;
+			 Tdk = VADD(TaG, TaH);
+			 Tdl = VADD(TaY, TaZ);
+			 Tdm = VSUB(Tdk, Tdl);
+			 Tej = VADD(Tdk, Tdl);
+			 T4e = VADD(T4a, T4d);
+			 T4p = VMUL(LDK(KP707106781), VSUB(T4j, T4o));
+			 T4q = VSUB(T4e, T4p);
+			 T6R = VADD(T4e, T4p);
+		    }
+		    {
+			 V T4T, T4W, T8d, T8e;
+			 T4T = VSUB(T4P, T4S);
+			 T4W = VMUL(LDK(KP707106781), VSUB(T4U, T4V));
+			 T4X = VSUB(T4T, T4W);
+			 T6O = VADD(T4T, T4W);
+			 T8d = VADD(T4P, T4S);
+			 T8e = VMUL(LDK(KP707106781), VADD(T4o, T4j));
+			 T8f = VSUB(T8d, T8e);
+			 T9s = VADD(T8d, T8e);
+		    }
+		    {
+			 V TaI, TaJ, T82, T83;
+			 TaI = VSUB(TaG, TaH);
+			 TaJ = VSUB(T1e, T1b);
+			 TaK = VSUB(TaI, TaJ);
+			 Tcp = VADD(TaJ, TaI);
+			 T82 = VSUB(T4d, T4a);
+			 T83 = VMUL(LDK(KP707106781), VADD(T4U, T4V));
+			 T84 = VSUB(T82, T83);
+			 T9v = VADD(T82, T83);
+		    }
+	       }
+	       {
+		    V T1j, TaR, T1m, TaS, T4G, T4L, TaT, TaQ, T89, T88, T1q, TaM, T1t, TaN, T4v;
+		    V T4A, TaO, TaL, T86, T85;
+		    {
+			 V T4H, T4F, T4C, T4K;
+			 {
+			      V T1h, T1i, T4D, T4E;
+			      T1h = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+			      T1i = LD(&(ri[WS(is, 37)]), ivs, &(ri[WS(is, 1)]));
+			      T1j = VADD(T1h, T1i);
+			      T4H = VSUB(T1h, T1i);
+			      T4D = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+			      T4E = LD(&(ii[WS(is, 37)]), ivs, &(ii[WS(is, 1)]));
+			      T4F = VSUB(T4D, T4E);
+			      TaR = VADD(T4D, T4E);
+			 }
+			 {
+			      V T1k, T1l, T4I, T4J;
+			      T1k = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
+			      T1l = LD(&(ri[WS(is, 53)]), ivs, &(ri[WS(is, 1)]));
+			      T1m = VADD(T1k, T1l);
+			      T4C = VSUB(T1k, T1l);
+			      T4I = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
+			      T4J = LD(&(ii[WS(is, 53)]), ivs, &(ii[WS(is, 1)]));
+			      T4K = VSUB(T4I, T4J);
+			      TaS = VADD(T4I, T4J);
+			 }
+			 T4G = VADD(T4C, T4F);
+			 T4L = VSUB(T4H, T4K);
+			 TaT = VSUB(TaR, TaS);
+			 TaQ = VSUB(T1j, T1m);
+			 T89 = VADD(T4H, T4K);
+			 T88 = VSUB(T4F, T4C);
+		    }
+		    {
+			 V T4r, T4z, T4w, T4u;
+			 {
+			      V T1o, T1p, T4x, T4y;
+			      T1o = LD(&(ri[WS(is, 61)]), ivs, &(ri[WS(is, 1)]));
+			      T1p = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
+			      T1q = VADD(T1o, T1p);
+			      T4r = VSUB(T1o, T1p);
+			      T4x = LD(&(ii[WS(is, 61)]), ivs, &(ii[WS(is, 1)]));
+			      T4y = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
+			      T4z = VSUB(T4x, T4y);
+			      TaM = VADD(T4x, T4y);
+			 }
+			 {
+			      V T1r, T1s, T4s, T4t;
+			      T1r = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
+			      T1s = LD(&(ri[WS(is, 45)]), ivs, &(ri[WS(is, 1)]));
+			      T1t = VADD(T1r, T1s);
+			      T4w = VSUB(T1r, T1s);
+			      T4s = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
+			      T4t = LD(&(ii[WS(is, 45)]), ivs, &(ii[WS(is, 1)]));
+			      T4u = VSUB(T4s, T4t);
+			      TaN = VADD(T4s, T4t);
+			 }
+			 T4v = VSUB(T4r, T4u);
+			 T4A = VADD(T4w, T4z);
+			 TaO = VSUB(TaM, TaN);
+			 TaL = VSUB(T1q, T1t);
+			 T86 = VSUB(T4z, T4w);
+			 T85 = VADD(T4r, T4u);
+		    }
+		    {
+			 V T1n, T1u, Tb2, Tb3;
+			 T1n = VADD(T1j, T1m);
+			 T1u = VADD(T1q, T1t);
+			 T1v = VADD(T1n, T1u);
+			 Tdn = VSUB(T1u, T1n);
+			 Tb2 = VSUB(TaT, TaQ);
+			 Tb3 = VADD(TaL, TaO);
+			 Tb4 = VMUL(LDK(KP707106781), VSUB(Tb2, Tb3));
+			 Tcq = VMUL(LDK(KP707106781), VADD(Tb2, Tb3));
+		    }
+		    {
+			 V Tdq, Tdr, T4B, T4M;
+			 Tdq = VADD(TaR, TaS);
+			 Tdr = VADD(TaM, TaN);
+			 Tds = VSUB(Tdq, Tdr);
+			 Tek = VADD(Tdq, Tdr);
+			 T4B = VFNMS(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4v));
+			 T4M = VFMA(LDK(KP923879532), T4G, VMUL(LDK(KP382683432), T4L));
+			 T4N = VSUB(T4B, T4M);
+			 T6P = VADD(T4M, T4B);
+		    }
+		    {
+			 V T4Y, T4Z, T8g, T8h;
+			 T4Y = VFNMS(LDK(KP923879532), T4L, VMUL(LDK(KP382683432), T4G));
+			 T4Z = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4v));
+			 T50 = VSUB(T4Y, T4Z);
+			 T6S = VADD(T4Y, T4Z);
+			 T8g = VFNMS(LDK(KP382683432), T89, VMUL(LDK(KP923879532), T88));
+			 T8h = VFMA(LDK(KP923879532), T86, VMUL(LDK(KP382683432), T85));
+			 T8i = VSUB(T8g, T8h);
+			 T9w = VADD(T8g, T8h);
+		    }
+		    {
+			 V TaP, TaU, T87, T8a;
+			 TaP = VSUB(TaL, TaO);
+			 TaU = VADD(TaQ, TaT);
+			 TaV = VMUL(LDK(KP707106781), VSUB(TaP, TaU));
+			 Tcn = VMUL(LDK(KP707106781), VADD(TaU, TaP));
+			 T87 = VFNMS(LDK(KP382683432), T86, VMUL(LDK(KP923879532), T85));
+			 T8a = VFMA(LDK(KP382683432), T88, VMUL(LDK(KP923879532), T89));
+			 T8b = VSUB(T87, T8a);
+			 T9t = VADD(T8a, T87);
+		    }
+	       }
+	       {
+		    V T1O, Tbc, T1R, Tbd, T5o, T5t, Tbf, Tbe, T8p, T8o, T1V, Tbi, T1Y, Tbj, T5z;
+		    V T5E, Tbk, Tbh, T8s, T8r;
+		    {
+			 V T5p, T5n, T5k, T5s;
+			 {
+			      V T1M, T1N, T5l, T5m;
+			      T1M = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+			      T1N = LD(&(ri[WS(is, 35)]), ivs, &(ri[WS(is, 1)]));
+			      T1O = VADD(T1M, T1N);
+			      T5p = VSUB(T1M, T1N);
+			      T5l = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+			      T5m = LD(&(ii[WS(is, 35)]), ivs, &(ii[WS(is, 1)]));
+			      T5n = VSUB(T5l, T5m);
+			      Tbc = VADD(T5l, T5m);
+			 }
+			 {
+			      V T1P, T1Q, T5q, T5r;
+			      T1P = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
+			      T1Q = LD(&(ri[WS(is, 51)]), ivs, &(ri[WS(is, 1)]));
+			      T1R = VADD(T1P, T1Q);
+			      T5k = VSUB(T1P, T1Q);
+			      T5q = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
+			      T5r = LD(&(ii[WS(is, 51)]), ivs, &(ii[WS(is, 1)]));
+			      T5s = VSUB(T5q, T5r);
+			      Tbd = VADD(T5q, T5r);
+			 }
+			 T5o = VADD(T5k, T5n);
+			 T5t = VSUB(T5p, T5s);
+			 Tbf = VSUB(T1O, T1R);
+			 Tbe = VSUB(Tbc, Tbd);
+			 T8p = VADD(T5p, T5s);
+			 T8o = VSUB(T5n, T5k);
+		    }
+		    {
+			 V T5A, T5y, T5v, T5D;
+			 {
+			      V T1T, T1U, T5w, T5x;
+			      T1T = LD(&(ri[WS(is, 59)]), ivs, &(ri[WS(is, 1)]));
+			      T1U = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
+			      T1V = VADD(T1T, T1U);
+			      T5A = VSUB(T1T, T1U);
+			      T5w = LD(&(ii[WS(is, 59)]), ivs, &(ii[WS(is, 1)]));
+			      T5x = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
+			      T5y = VSUB(T5w, T5x);
+			      Tbi = VADD(T5w, T5x);
+			 }
+			 {
+			      V T1W, T1X, T5B, T5C;
+			      T1W = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
+			      T1X = LD(&(ri[WS(is, 43)]), ivs, &(ri[WS(is, 1)]));
+			      T1Y = VADD(T1W, T1X);
+			      T5v = VSUB(T1W, T1X);
+			      T5B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
+			      T5C = LD(&(ii[WS(is, 43)]), ivs, &(ii[WS(is, 1)]));
+			      T5D = VSUB(T5B, T5C);
+			      Tbj = VADD(T5B, T5C);
+			 }
+			 T5z = VADD(T5v, T5y);
+			 T5E = VSUB(T5A, T5D);
+			 Tbk = VSUB(Tbi, Tbj);
+			 Tbh = VSUB(T1V, T1Y);
+			 T8s = VADD(T5A, T5D);
+			 T8r = VSUB(T5y, T5v);
+		    }
+		    {
+			 V T1S, T1Z, Tbt, Tbu;
+			 T1S = VADD(T1O, T1R);
+			 T1Z = VADD(T1V, T1Y);
+			 T20 = VADD(T1S, T1Z);
+			 TdD = VSUB(T1Z, T1S);
+			 Tbt = VSUB(Tbh, Tbk);
+			 Tbu = VADD(Tbf, Tbe);
+			 Tbv = VMUL(LDK(KP707106781), VSUB(Tbt, Tbu));
+			 Tcu = VMUL(LDK(KP707106781), VADD(Tbu, Tbt));
+		    }
+		    {
+			 V Tdw, Tdx, T5u, T5F;
+			 Tdw = VADD(Tbc, Tbd);
+			 Tdx = VADD(Tbi, Tbj);
+			 Tdy = VSUB(Tdw, Tdx);
+			 Tep = VADD(Tdw, Tdx);
+			 T5u = VFNMS(LDK(KP923879532), T5t, VMUL(LDK(KP382683432), T5o));
+			 T5F = VFMA(LDK(KP382683432), T5z, VMUL(LDK(KP923879532), T5E));
+			 T5G = VSUB(T5u, T5F);
+			 T6Z = VADD(T5u, T5F);
+		    }
+		    {
+			 V T5R, T5S, T8z, T8A;
+			 T5R = VFNMS(LDK(KP923879532), T5z, VMUL(LDK(KP382683432), T5E));
+			 T5S = VFMA(LDK(KP923879532), T5o, VMUL(LDK(KP382683432), T5t));
+			 T5T = VSUB(T5R, T5S);
+			 T6W = VADD(T5S, T5R);
+			 T8z = VFNMS(LDK(KP382683432), T8r, VMUL(LDK(KP923879532), T8s));
+			 T8A = VFMA(LDK(KP382683432), T8o, VMUL(LDK(KP923879532), T8p));
+			 T8B = VSUB(T8z, T8A);
+			 T9A = VADD(T8A, T8z);
+		    }
+		    {
+			 V Tbg, Tbl, T8q, T8t;
+			 Tbg = VSUB(Tbe, Tbf);
+			 Tbl = VADD(Tbh, Tbk);
+			 Tbm = VMUL(LDK(KP707106781), VSUB(Tbg, Tbl));
+			 Tcx = VMUL(LDK(KP707106781), VADD(Tbg, Tbl));
+			 T8q = VFNMS(LDK(KP382683432), T8p, VMUL(LDK(KP923879532), T8o));
+			 T8t = VFMA(LDK(KP923879532), T8r, VMUL(LDK(KP382683432), T8s));
+			 T8u = VSUB(T8q, T8t);
+			 T9D = VADD(T8q, T8t);
+		    }
+	       }
+	       {
+		    V TeJ, TeK, TeL, TeM, TeN, TeO, TeP, TeQ, TeR, TeS, TeT, TeU, TeV, TeW, TeX;
+		    V TeY, TeZ, Tf0, Tf1, Tf2, Tf3, Tf4, Tf5, Tf6, Tf7, Tf8, Tf9, Tfa, Tfb, Tfc;
+		    V Tfd, Tfe, Tff, Tfg, Tfh, Tfi, Tfj, Tfk, Tfl, Tfm, Tfn, Tfo, Tfp, Tfq, Tfr;
+		    V Tfs, Tft, Tfu;
+		    {
+			 V T11, TeD, TeG, TeI, T22, T23, T34, TeH;
+			 {
+			      V Tv, T10, TeE, TeF;
+			      Tv = VADD(Tf, Tu);
+			      T10 = VADD(TK, TZ);
+			      T11 = VADD(Tv, T10);
+			      TeD = VSUB(Tv, T10);
+			      TeE = VADD(Tej, Tek);
+			      TeF = VADD(Teo, Tep);
+			      TeG = VSUB(TeE, TeF);
+			      TeI = VADD(TeE, TeF);
+			 }
+			 {
+			      V T1w, T21, T2y, T33;
+			      T1w = VADD(T1g, T1v);
+			      T21 = VADD(T1L, T20);
+			      T22 = VADD(T1w, T21);
+			      T23 = VSUB(T21, T1w);
+			      T2y = VADD(T2i, T2x);
+			      T33 = VADD(T2N, T32);
+			      T34 = VSUB(T2y, T33);
+			      TeH = VADD(T2y, T33);
+			 }
+			 TeJ = VSUB(T11, T22);
+			 STM4(&(ro[32]), TeJ, ovs, &(ro[0]));
+			 TeK = VSUB(TeH, TeI);
+			 STM4(&(io[32]), TeK, ovs, &(io[0]));
+			 TeL = VADD(T11, T22);
+			 STM4(&(ro[0]), TeL, ovs, &(ro[0]));
+			 TeM = VADD(TeH, TeI);
+			 STM4(&(io[0]), TeM, ovs, &(io[0]));
+			 TeN = VADD(T23, T34);
+			 STM4(&(io[16]), TeN, ovs, &(io[0]));
+			 TeO = VADD(TeD, TeG);
+			 STM4(&(ro[16]), TeO, ovs, &(ro[0]));
+			 TeP = VSUB(T34, T23);
+			 STM4(&(io[48]), TeP, ovs, &(io[0]));
+			 TeQ = VSUB(TeD, TeG);
+			 STM4(&(ro[48]), TeQ, ovs, &(ro[0]));
+		    }
+		    {
+			 V Teh, Tex, Tev, TeB, Tem, Tey, Ter, Tez;
+			 {
+			      V Tef, Teg, Tet, Teu;
+			      Tef = VSUB(Tf, Tu);
+			      Teg = VSUB(T2N, T32);
+			      Teh = VADD(Tef, Teg);
+			      Tex = VSUB(Tef, Teg);
+			      Tet = VSUB(T2i, T2x);
+			      Teu = VSUB(TZ, TK);
+			      Tev = VSUB(Tet, Teu);
+			      TeB = VADD(Teu, Tet);
+			 }
+			 {
+			      V Tei, Tel, Ten, Teq;
+			      Tei = VSUB(T1g, T1v);
+			      Tel = VSUB(Tej, Tek);
+			      Tem = VADD(Tei, Tel);
+			      Tey = VSUB(Tel, Tei);
+			      Ten = VSUB(T1L, T20);
+			      Teq = VSUB(Teo, Tep);
+			      Ter = VSUB(Ten, Teq);
+			      Tez = VADD(Ten, Teq);
+			 }
+			 {
+			      V Tes, TeC, Tew, TeA;
+			      Tes = VMUL(LDK(KP707106781), VADD(Tem, Ter));
+			      TeR = VSUB(Teh, Tes);
+			      STM4(&(ro[40]), TeR, ovs, &(ro[0]));
+			      TeS = VADD(Teh, Tes);
+			      STM4(&(ro[8]), TeS, ovs, &(ro[0]));
+			      TeC = VMUL(LDK(KP707106781), VADD(Tey, Tez));
+			      TeT = VSUB(TeB, TeC);
+			      STM4(&(io[40]), TeT, ovs, &(io[0]));
+			      TeU = VADD(TeB, TeC);
+			      STM4(&(io[8]), TeU, ovs, &(io[0]));
+			      Tew = VMUL(LDK(KP707106781), VSUB(Ter, Tem));
+			      TeV = VSUB(Tev, Tew);
+			      STM4(&(io[56]), TeV, ovs, &(io[0]));
+			      TeW = VADD(Tev, Tew);
+			      STM4(&(io[24]), TeW, ovs, &(io[0]));
+			      TeA = VMUL(LDK(KP707106781), VSUB(Tey, Tez));
+			      TeX = VSUB(Tex, TeA);
+			      STM4(&(ro[56]), TeX, ovs, &(ro[0]));
+			      TeY = VADD(Tex, TeA);
+			      STM4(&(ro[24]), TeY, ovs, &(ro[0]));
+			 }
+		    }
+		    {
+			 V Tdb, TdV, Te5, TdJ, Tdi, Te6, Te3, Teb, TdM, TdW, Tdu, TdQ, Te0, Tea, TdF;
+			 V TdR;
+			 {
+			      V Tde, Tdh, Tdo, Tdt;
+			      Tdb = VSUB(Td9, Tda);
+			      TdV = VADD(Td9, Tda);
+			      Te5 = VADD(TdI, TdH);
+			      TdJ = VSUB(TdH, TdI);
+			      Tde = VSUB(Tdc, Tdd);
+			      Tdh = VADD(Tdf, Tdg);
+			      Tdi = VMUL(LDK(KP707106781), VSUB(Tde, Tdh));
+			      Te6 = VMUL(LDK(KP707106781), VADD(Tde, Tdh));
+			      {
+				   V Te1, Te2, TdK, TdL;
+				   Te1 = VADD(Tdv, Tdy);
+				   Te2 = VADD(TdD, TdC);
+				   Te3 = VFNMS(LDK(KP382683432), Te2, VMUL(LDK(KP923879532), Te1));
+				   Teb = VFMA(LDK(KP923879532), Te2, VMUL(LDK(KP382683432), Te1));
+				   TdK = VSUB(Tdf, Tdg);
+				   TdL = VADD(Tdd, Tdc);
+				   TdM = VMUL(LDK(KP707106781), VSUB(TdK, TdL));
+				   TdW = VMUL(LDK(KP707106781), VADD(TdL, TdK));
+			      }
+			      Tdo = VSUB(Tdm, Tdn);
+			      Tdt = VSUB(Tdp, Tds);
+			      Tdu = VFMA(LDK(KP923879532), Tdo, VMUL(LDK(KP382683432), Tdt));
+			      TdQ = VFNMS(LDK(KP923879532), Tdt, VMUL(LDK(KP382683432), Tdo));
+			      {
+				   V TdY, TdZ, Tdz, TdE;
+				   TdY = VADD(Tdn, Tdm);
+				   TdZ = VADD(Tdp, Tds);
+				   Te0 = VFMA(LDK(KP382683432), TdY, VMUL(LDK(KP923879532), TdZ));
+				   Tea = VFNMS(LDK(KP382683432), TdZ, VMUL(LDK(KP923879532), TdY));
+				   Tdz = VSUB(Tdv, Tdy);
+				   TdE = VSUB(TdC, TdD);
+				   TdF = VFNMS(LDK(KP923879532), TdE, VMUL(LDK(KP382683432), Tdz));
+				   TdR = VFMA(LDK(KP382683432), TdE, VMUL(LDK(KP923879532), Tdz));
+			      }
+			 }
+			 {
+			      V Tdj, TdG, TdT, TdU;
+			      Tdj = VADD(Tdb, Tdi);
+			      TdG = VADD(Tdu, TdF);
+			      TeZ = VSUB(Tdj, TdG);
+			      STM4(&(ro[44]), TeZ, ovs, &(ro[0]));
+			      Tf0 = VADD(Tdj, TdG);
+			      STM4(&(ro[12]), Tf0, ovs, &(ro[0]));
+			      TdT = VADD(TdJ, TdM);
+			      TdU = VADD(TdQ, TdR);
+			      Tf1 = VSUB(TdT, TdU);
+			      STM4(&(io[44]), Tf1, ovs, &(io[0]));
+			      Tf2 = VADD(TdT, TdU);
+			      STM4(&(io[12]), Tf2, ovs, &(io[0]));
+			 }
+			 {
+			      V TdN, TdO, TdP, TdS;
+			      TdN = VSUB(TdJ, TdM);
+			      TdO = VSUB(TdF, Tdu);
+			      Tf3 = VSUB(TdN, TdO);
+			      STM4(&(io[60]), Tf3, ovs, &(io[0]));
+			      Tf4 = VADD(TdN, TdO);
+			      STM4(&(io[28]), Tf4, ovs, &(io[0]));
+			      TdP = VSUB(Tdb, Tdi);
+			      TdS = VSUB(TdQ, TdR);
+			      Tf5 = VSUB(TdP, TdS);
+			      STM4(&(ro[60]), Tf5, ovs, &(ro[0]));
+			      Tf6 = VADD(TdP, TdS);
+			      STM4(&(ro[28]), Tf6, ovs, &(ro[0]));
+			 }
+			 {
+			      V TdX, Te4, Ted, Tee;
+			      TdX = VADD(TdV, TdW);
+			      Te4 = VADD(Te0, Te3);
+			      Tf7 = VSUB(TdX, Te4);
+			      STM4(&(ro[36]), Tf7, ovs, &(ro[0]));
+			      Tf8 = VADD(TdX, Te4);
+			      STM4(&(ro[4]), Tf8, ovs, &(ro[0]));
+			      Ted = VADD(Te5, Te6);
+			      Tee = VADD(Tea, Teb);
+			      Tf9 = VSUB(Ted, Tee);
+			      STM4(&(io[36]), Tf9, ovs, &(io[0]));
+			      Tfa = VADD(Ted, Tee);
+			      STM4(&(io[4]), Tfa, ovs, &(io[0]));
+			 }
+			 {
+			      V Te7, Te8, Te9, Tec;
+			      Te7 = VSUB(Te5, Te6);
+			      Te8 = VSUB(Te3, Te0);
+			      Tfb = VSUB(Te7, Te8);
+			      STM4(&(io[52]), Tfb, ovs, &(io[0]));
+			      Tfc = VADD(Te7, Te8);
+			      STM4(&(io[20]), Tfc, ovs, &(io[0]));
+			      Te9 = VSUB(TdV, TdW);
+			      Tec = VSUB(Tea, Teb);
+			      Tfd = VSUB(Te9, Tec);
+			      STM4(&(ro[52]), Tfd, ovs, &(ro[0]));
+			      Tfe = VADD(Te9, Tec);
+			      STM4(&(ro[20]), Tfe, ovs, &(ro[0]));
+			 }
+		    }
+		    {
+			 V Tcd, TcP, TcD, TcZ, Tck, Td0, TcX, Td5, Tcs, TcK, TcG, TcQ, TcU, Td4, Tcz;
+			 V TcL, Tcc, TcC;
+			 Tcc = VMUL(LDK(KP707106781), VADD(TbD, TbC));
+			 Tcd = VSUB(Tcb, Tcc);
+			 TcP = VADD(Tcb, Tcc);
+			 TcC = VMUL(LDK(KP707106781), VADD(Tak, Tan));
+			 TcD = VSUB(TcB, TcC);
+			 TcZ = VADD(TcB, TcC);
+			 {
+			      V Tcg, Tcj, TcV, TcW;
+			      Tcg = VFNMS(LDK(KP382683432), Tcf, VMUL(LDK(KP923879532), Tce));
+			      Tcj = VFMA(LDK(KP923879532), Tch, VMUL(LDK(KP382683432), Tci));
+			      Tck = VSUB(Tcg, Tcj);
+			      Td0 = VADD(Tcg, Tcj);
+			      TcV = VADD(Tct, Tcu);
+			      TcW = VADD(Tcw, Tcx);
+			      TcX = VFNMS(LDK(KP195090322), TcW, VMUL(LDK(KP980785280), TcV));
+			      Td5 = VFMA(LDK(KP195090322), TcV, VMUL(LDK(KP980785280), TcW));
+			 }
+			 {
+			      V Tco, Tcr, TcE, TcF;
+			      Tco = VSUB(Tcm, Tcn);
+			      Tcr = VSUB(Tcp, Tcq);
+			      Tcs = VFMA(LDK(KP555570233), Tco, VMUL(LDK(KP831469612), Tcr));
+			      TcK = VFNMS(LDK(KP831469612), Tco, VMUL(LDK(KP555570233), Tcr));
+			      TcE = VFNMS(LDK(KP382683432), Tch, VMUL(LDK(KP923879532), Tci));
+			      TcF = VFMA(LDK(KP382683432), Tce, VMUL(LDK(KP923879532), Tcf));
+			      TcG = VSUB(TcE, TcF);
+			      TcQ = VADD(TcF, TcE);
+			 }
+			 {
+			      V TcS, TcT, Tcv, Tcy;
+			      TcS = VADD(Tcm, Tcn);
+			      TcT = VADD(Tcp, Tcq);
+			      TcU = VFMA(LDK(KP980785280), TcS, VMUL(LDK(KP195090322), TcT));
+			      Td4 = VFNMS(LDK(KP195090322), TcS, VMUL(LDK(KP980785280), TcT));
+			      Tcv = VSUB(Tct, Tcu);
+			      Tcy = VSUB(Tcw, Tcx);
+			      Tcz = VFNMS(LDK(KP831469612), Tcy, VMUL(LDK(KP555570233), Tcv));
+			      TcL = VFMA(LDK(KP831469612), Tcv, VMUL(LDK(KP555570233), Tcy));
+			 }
+			 {
+			      V Tcl, TcA, TcN, TcO;
+			      Tcl = VADD(Tcd, Tck);
+			      TcA = VADD(Tcs, Tcz);
+			      Tff = VSUB(Tcl, TcA);
+			      STM4(&(ro[42]), Tff, ovs, &(ro[0]));
+			      Tfg = VADD(Tcl, TcA);
+			      STM4(&(ro[10]), Tfg, ovs, &(ro[0]));
+			      TcN = VADD(TcD, TcG);
+			      TcO = VADD(TcK, TcL);
+			      Tfh = VSUB(TcN, TcO);
+			      STM4(&(io[42]), Tfh, ovs, &(io[0]));
+			      Tfi = VADD(TcN, TcO);
+			      STM4(&(io[10]), Tfi, ovs, &(io[0]));
+			 }
+			 {
+			      V TcH, TcI, TcJ, TcM;
+			      TcH = VSUB(TcD, TcG);
+			      TcI = VSUB(Tcz, Tcs);
+			      Tfj = VSUB(TcH, TcI);
+			      STM4(&(io[58]), Tfj, ovs, &(io[0]));
+			      Tfk = VADD(TcH, TcI);
+			      STM4(&(io[26]), Tfk, ovs, &(io[0]));
+			      TcJ = VSUB(Tcd, Tck);
+			      TcM = VSUB(TcK, TcL);
+			      Tfl = VSUB(TcJ, TcM);
+			      STM4(&(ro[58]), Tfl, ovs, &(ro[0]));
+			      Tfm = VADD(TcJ, TcM);
+			      STM4(&(ro[26]), Tfm, ovs, &(ro[0]));
+			 }
+			 {
+			      V TcR, TcY, Td7, Td8;
+			      TcR = VADD(TcP, TcQ);
+			      TcY = VADD(TcU, TcX);
+			      Tfn = VSUB(TcR, TcY);
+			      STM4(&(ro[34]), Tfn, ovs, &(ro[0]));
+			      Tfo = VADD(TcR, TcY);
+			      STM4(&(ro[2]), Tfo, ovs, &(ro[0]));
+			      Td7 = VADD(TcZ, Td0);
+			      Td8 = VADD(Td4, Td5);
+			      Tfp = VSUB(Td7, Td8);
+			      STM4(&(io[34]), Tfp, ovs, &(io[0]));
+			      Tfq = VADD(Td7, Td8);
+			      STM4(&(io[2]), Tfq, ovs, &(io[0]));
+			 }
+			 {
+			      V Td1, Td2, Td3, Td6;
+			      Td1 = VSUB(TcZ, Td0);
+			      Td2 = VSUB(TcX, TcU);
+			      Tfr = VSUB(Td1, Td2);
+			      STM4(&(io[50]), Tfr, ovs, &(io[0]));
+			      Tfs = VADD(Td1, Td2);
+			      STM4(&(io[18]), Tfs, ovs, &(io[0]));
+			      Td3 = VSUB(TcP, TcQ);
+			      Td6 = VSUB(Td4, Td5);
+			      Tft = VSUB(Td3, Td6);
+			      STM4(&(ro[50]), Tft, ovs, &(ro[0]));
+			      Tfu = VADD(Td3, Td6);
+			      STM4(&(ro[18]), Tfu, ovs, &(ro[0]));
+			 }
+		    }
+		    {
+			 V Tfv, Tfw, Tfx, Tfy, Tfz, TfA, TfB, TfC, TfD, TfE, TfF, TfG, TfH, TfI, TfJ;
+			 V TfK, TfL, TfM, TfN, TfO, TfP, TfQ, TfR, TfS, TfT, TfU, TfV, TfW, TfX, TfY;
+			 V TfZ, Tg0;
+			 {
+			      V Tap, TbR, TbF, Tc1, TaE, Tc2, TbZ, Tc7, Tb6, TbM, TbI, TbS, TbW, Tc6, Tbx;
+			      V TbN, Tao, TbE;
+			      Tao = VMUL(LDK(KP707106781), VSUB(Tak, Tan));
+			      Tap = VSUB(Tah, Tao);
+			      TbR = VADD(Tah, Tao);
+			      TbE = VMUL(LDK(KP707106781), VSUB(TbC, TbD));
+			      TbF = VSUB(TbB, TbE);
+			      Tc1 = VADD(TbB, TbE);
+			      {
+				   V Taw, TaD, TbX, TbY;
+				   Taw = VFNMS(LDK(KP923879532), Tav, VMUL(LDK(KP382683432), Tas));
+				   TaD = VFMA(LDK(KP382683432), Taz, VMUL(LDK(KP923879532), TaC));
+				   TaE = VSUB(Taw, TaD);
+				   Tc2 = VADD(Taw, TaD);
+				   TbX = VADD(Tbb, Tbm);
+				   TbY = VADD(Tbs, Tbv);
+				   TbZ = VFNMS(LDK(KP555570233), TbY, VMUL(LDK(KP831469612), TbX));
+				   Tc7 = VFMA(LDK(KP831469612), TbY, VMUL(LDK(KP555570233), TbX));
+			      }
+			      {
+				   V TaW, Tb5, TbG, TbH;
+				   TaW = VSUB(TaK, TaV);
+				   Tb5 = VSUB(Tb1, Tb4);
+				   Tb6 = VFMA(LDK(KP980785280), TaW, VMUL(LDK(KP195090322), Tb5));
+				   TbM = VFNMS(LDK(KP980785280), Tb5, VMUL(LDK(KP195090322), TaW));
+				   TbG = VFNMS(LDK(KP923879532), Taz, VMUL(LDK(KP382683432), TaC));
+				   TbH = VFMA(LDK(KP923879532), Tas, VMUL(LDK(KP382683432), Tav));
+				   TbI = VSUB(TbG, TbH);
+				   TbS = VADD(TbH, TbG);
+			      }
+			      {
+				   V TbU, TbV, Tbn, Tbw;
+				   TbU = VADD(TaK, TaV);
+				   TbV = VADD(Tb1, Tb4);
+				   TbW = VFMA(LDK(KP555570233), TbU, VMUL(LDK(KP831469612), TbV));
+				   Tc6 = VFNMS(LDK(KP555570233), TbV, VMUL(LDK(KP831469612), TbU));
+				   Tbn = VSUB(Tbb, Tbm);
+				   Tbw = VSUB(Tbs, Tbv);
+				   Tbx = VFNMS(LDK(KP980785280), Tbw, VMUL(LDK(KP195090322), Tbn));
+				   TbN = VFMA(LDK(KP195090322), Tbw, VMUL(LDK(KP980785280), Tbn));
+			      }
+			      {
+				   V TaF, Tby, TbP, TbQ;
+				   TaF = VADD(Tap, TaE);
+				   Tby = VADD(Tb6, Tbx);
+				   Tfv = VSUB(TaF, Tby);
+				   STM4(&(ro[46]), Tfv, ovs, &(ro[0]));
+				   Tfw = VADD(TaF, Tby);
+				   STM4(&(ro[14]), Tfw, ovs, &(ro[0]));
+				   TbP = VADD(TbF, TbI);
+				   TbQ = VADD(TbM, TbN);
+				   Tfx = VSUB(TbP, TbQ);
+				   STM4(&(io[46]), Tfx, ovs, &(io[0]));
+				   Tfy = VADD(TbP, TbQ);
+				   STM4(&(io[14]), Tfy, ovs, &(io[0]));
+			      }
+			      {
+				   V TbJ, TbK, TbL, TbO;
+				   TbJ = VSUB(TbF, TbI);
+				   TbK = VSUB(Tbx, Tb6);
+				   Tfz = VSUB(TbJ, TbK);
+				   STM4(&(io[62]), Tfz, ovs, &(io[0]));
+				   TfA = VADD(TbJ, TbK);
+				   STM4(&(io[30]), TfA, ovs, &(io[0]));
+				   TbL = VSUB(Tap, TaE);
+				   TbO = VSUB(TbM, TbN);
+				   TfB = VSUB(TbL, TbO);
+				   STM4(&(ro[62]), TfB, ovs, &(ro[0]));
+				   TfC = VADD(TbL, TbO);
+				   STM4(&(ro[30]), TfC, ovs, &(ro[0]));
+			      }
+			      {
+				   V TbT, Tc0, Tc9, Tca;
+				   TbT = VADD(TbR, TbS);
+				   Tc0 = VADD(TbW, TbZ);
+				   TfD = VSUB(TbT, Tc0);
+				   STM4(&(ro[38]), TfD, ovs, &(ro[0]));
+				   TfE = VADD(TbT, Tc0);
+				   STM4(&(ro[6]), TfE, ovs, &(ro[0]));
+				   Tc9 = VADD(Tc1, Tc2);
+				   Tca = VADD(Tc6, Tc7);
+				   TfF = VSUB(Tc9, Tca);
+				   STM4(&(io[38]), TfF, ovs, &(io[0]));
+				   TfG = VADD(Tc9, Tca);
+				   STM4(&(io[6]), TfG, ovs, &(io[0]));
+			      }
+			      {
+				   V Tc3, Tc4, Tc5, Tc8;
+				   Tc3 = VSUB(Tc1, Tc2);
+				   Tc4 = VSUB(TbZ, TbW);
+				   TfH = VSUB(Tc3, Tc4);
+				   STM4(&(io[54]), TfH, ovs, &(io[0]));
+				   TfI = VADD(Tc3, Tc4);
+				   STM4(&(io[22]), TfI, ovs, &(io[0]));
+				   Tc5 = VSUB(TbR, TbS);
+				   Tc8 = VSUB(Tc6, Tc7);
+				   TfJ = VSUB(Tc5, Tc8);
+				   STM4(&(ro[54]), TfJ, ovs, &(ro[0]));
+				   TfK = VADD(Tc5, Tc8);
+				   STM4(&(ro[22]), TfK, ovs, &(ro[0]));
+			      }
+			 }
+			 {
+			      V T6F, T7h, T7m, T7w, T7p, T7x, T6M, T7s, T6U, T7c, T75, T7r, T78, T7i, T71;
+			      V T7d;
+			      {
+				   V T6D, T6E, T7k, T7l;
+				   T6D = VADD(T37, T3e);
+				   T6E = VADD(T65, T64);
+				   T6F = VSUB(T6D, T6E);
+				   T7h = VADD(T6D, T6E);
+				   T7k = VADD(T6O, T6P);
+				   T7l = VADD(T6R, T6S);
+				   T7m = VFMA(LDK(KP956940335), T7k, VMUL(LDK(KP290284677), T7l));
+				   T7w = VFNMS(LDK(KP290284677), T7k, VMUL(LDK(KP956940335), T7l));
+			      }
+			      {
+				   V T7n, T7o, T6I, T6L;
+				   T7n = VADD(T6V, T6W);
+				   T7o = VADD(T6Y, T6Z);
+				   T7p = VFNMS(LDK(KP290284677), T7o, VMUL(LDK(KP956940335), T7n));
+				   T7x = VFMA(LDK(KP290284677), T7n, VMUL(LDK(KP956940335), T7o));
+				   T6I = VFNMS(LDK(KP555570233), T6H, VMUL(LDK(KP831469612), T6G));
+				   T6L = VFMA(LDK(KP831469612), T6J, VMUL(LDK(KP555570233), T6K));
+				   T6M = VSUB(T6I, T6L);
+				   T7s = VADD(T6I, T6L);
+			      }
+			      {
+				   V T6Q, T6T, T73, T74;
+				   T6Q = VSUB(T6O, T6P);
+				   T6T = VSUB(T6R, T6S);
+				   T6U = VFMA(LDK(KP471396736), T6Q, VMUL(LDK(KP881921264), T6T));
+				   T7c = VFNMS(LDK(KP881921264), T6Q, VMUL(LDK(KP471396736), T6T));
+				   T73 = VADD(T5Z, T62);
+				   T74 = VADD(T3m, T3t);
+				   T75 = VSUB(T73, T74);
+				   T7r = VADD(T73, T74);
+			      }
+			      {
+				   V T76, T77, T6X, T70;
+				   T76 = VFNMS(LDK(KP555570233), T6J, VMUL(LDK(KP831469612), T6K));
+				   T77 = VFMA(LDK(KP555570233), T6G, VMUL(LDK(KP831469612), T6H));
+				   T78 = VSUB(T76, T77);
+				   T7i = VADD(T77, T76);
+				   T6X = VSUB(T6V, T6W);
+				   T70 = VSUB(T6Y, T6Z);
+				   T71 = VFNMS(LDK(KP881921264), T70, VMUL(LDK(KP471396736), T6X));
+				   T7d = VFMA(LDK(KP881921264), T6X, VMUL(LDK(KP471396736), T70));
+			      }
+			      {
+				   V T6N, T72, T7f, T7g;
+				   T6N = VADD(T6F, T6M);
+				   T72 = VADD(T6U, T71);
+				   TfL = VSUB(T6N, T72);
+				   STM4(&(ro[43]), TfL, ovs, &(ro[1]));
+				   TfM = VADD(T6N, T72);
+				   STM4(&(ro[11]), TfM, ovs, &(ro[1]));
+				   T7f = VADD(T75, T78);
+				   T7g = VADD(T7c, T7d);
+				   TfN = VSUB(T7f, T7g);
+				   STM4(&(io[43]), TfN, ovs, &(io[1]));
+				   TfO = VADD(T7f, T7g);
+				   STM4(&(io[11]), TfO, ovs, &(io[1]));
+			      }
+			      {
+				   V T79, T7a, T7b, T7e;
+				   T79 = VSUB(T75, T78);
+				   T7a = VSUB(T71, T6U);
+				   TfP = VSUB(T79, T7a);
+				   STM4(&(io[59]), TfP, ovs, &(io[1]));
+				   TfQ = VADD(T79, T7a);
+				   STM4(&(io[27]), TfQ, ovs, &(io[1]));
+				   T7b = VSUB(T6F, T6M);
+				   T7e = VSUB(T7c, T7d);
+				   TfR = VSUB(T7b, T7e);
+				   STM4(&(ro[59]), TfR, ovs, &(ro[1]));
+				   TfS = VADD(T7b, T7e);
+				   STM4(&(ro[27]), TfS, ovs, &(ro[1]));
+			      }
+			      {
+				   V T7j, T7q, T7z, T7A;
+				   T7j = VADD(T7h, T7i);
+				   T7q = VADD(T7m, T7p);
+				   TfT = VSUB(T7j, T7q);
+				   STM4(&(ro[35]), TfT, ovs, &(ro[1]));
+				   TfU = VADD(T7j, T7q);
+				   STM4(&(ro[3]), TfU, ovs, &(ro[1]));
+				   T7z = VADD(T7r, T7s);
+				   T7A = VADD(T7w, T7x);
+				   TfV = VSUB(T7z, T7A);
+				   STM4(&(io[35]), TfV, ovs, &(io[1]));
+				   TfW = VADD(T7z, T7A);
+				   STM4(&(io[3]), TfW, ovs, &(io[1]));
+			      }
+			      {
+				   V T7t, T7u, T7v, T7y;
+				   T7t = VSUB(T7r, T7s);
+				   T7u = VSUB(T7p, T7m);
+				   TfX = VSUB(T7t, T7u);
+				   STM4(&(io[51]), TfX, ovs, &(io[1]));
+				   TfY = VADD(T7t, T7u);
+				   STM4(&(io[19]), TfY, ovs, &(io[1]));
+				   T7v = VSUB(T7h, T7i);
+				   T7y = VSUB(T7w, T7x);
+				   TfZ = VSUB(T7v, T7y);
+				   STM4(&(ro[51]), TfZ, ovs, &(ro[1]));
+				   Tg0 = VADD(T7v, T7y);
+				   STM4(&(ro[19]), Tg0, ovs, &(ro[1]));
+			      }
+			 }
+			 {
+			      V T9j, T9V, Ta0, Taa, Ta3, Tab, T9q, Ta6, T9y, T9Q, T9J, Ta5, T9M, T9W, T9F;
+			      V T9R;
+			      {
+				   V T9h, T9i, T9Y, T9Z;
+				   T9h = VADD(T7B, T7C);
+				   T9i = VADD(T8J, T8I);
+				   T9j = VSUB(T9h, T9i);
+				   T9V = VADD(T9h, T9i);
+				   T9Y = VADD(T9s, T9t);
+				   T9Z = VADD(T9v, T9w);
+				   Ta0 = VFMA(LDK(KP995184726), T9Y, VMUL(LDK(KP098017140), T9Z));
+				   Taa = VFNMS(LDK(KP098017140), T9Y, VMUL(LDK(KP995184726), T9Z));
+			      }
+			      {
+				   V Ta1, Ta2, T9m, T9p;
+				   Ta1 = VADD(T9z, T9A);
+				   Ta2 = VADD(T9C, T9D);
+				   Ta3 = VFNMS(LDK(KP098017140), Ta2, VMUL(LDK(KP995184726), Ta1));
+				   Tab = VFMA(LDK(KP098017140), Ta1, VMUL(LDK(KP995184726), Ta2));
+				   T9m = VFNMS(LDK(KP195090322), T9l, VMUL(LDK(KP980785280), T9k));
+				   T9p = VFMA(LDK(KP195090322), T9n, VMUL(LDK(KP980785280), T9o));
+				   T9q = VSUB(T9m, T9p);
+				   Ta6 = VADD(T9m, T9p);
+			      }
+			      {
+				   V T9u, T9x, T9H, T9I;
+				   T9u = VSUB(T9s, T9t);
+				   T9x = VSUB(T9v, T9w);
+				   T9y = VFMA(LDK(KP634393284), T9u, VMUL(LDK(KP773010453), T9x));
+				   T9Q = VFNMS(LDK(KP773010453), T9u, VMUL(LDK(KP634393284), T9x));
+				   T9H = VADD(T8F, T8G);
+				   T9I = VADD(T7G, T7J);
+				   T9J = VSUB(T9H, T9I);
+				   Ta5 = VADD(T9H, T9I);
+			      }
+			      {
+				   V T9K, T9L, T9B, T9E;
+				   T9K = VFNMS(LDK(KP195090322), T9o, VMUL(LDK(KP980785280), T9n));
+				   T9L = VFMA(LDK(KP980785280), T9l, VMUL(LDK(KP195090322), T9k));
+				   T9M = VSUB(T9K, T9L);
+				   T9W = VADD(T9L, T9K);
+				   T9B = VSUB(T9z, T9A);
+				   T9E = VSUB(T9C, T9D);
+				   T9F = VFNMS(LDK(KP773010453), T9E, VMUL(LDK(KP634393284), T9B));
+				   T9R = VFMA(LDK(KP773010453), T9B, VMUL(LDK(KP634393284), T9E));
+			      }
+			      {
+				   V T9r, T9G, Tg1, Tg2;
+				   T9r = VADD(T9j, T9q);
+				   T9G = VADD(T9y, T9F);
+				   Tg1 = VSUB(T9r, T9G);
+				   STM4(&(ro[41]), Tg1, ovs, &(ro[1]));
+				   STN4(&(ro[40]), TeR, Tg1, Tff, TfL, ovs);
+				   Tg2 = VADD(T9r, T9G);
+				   STM4(&(ro[9]), Tg2, ovs, &(ro[1]));
+				   STN4(&(ro[8]), TeS, Tg2, Tfg, TfM, ovs);
+			      }
+			      {
+				   V T9T, T9U, Tg3, Tg4;
+				   T9T = VADD(T9J, T9M);
+				   T9U = VADD(T9Q, T9R);
+				   Tg3 = VSUB(T9T, T9U);
+				   STM4(&(io[41]), Tg3, ovs, &(io[1]));
+				   STN4(&(io[40]), TeT, Tg3, Tfh, TfN, ovs);
+				   Tg4 = VADD(T9T, T9U);
+				   STM4(&(io[9]), Tg4, ovs, &(io[1]));
+				   STN4(&(io[8]), TeU, Tg4, Tfi, TfO, ovs);
+			      }
+			      {
+				   V T9N, T9O, Tg5, Tg6;
+				   T9N = VSUB(T9J, T9M);
+				   T9O = VSUB(T9F, T9y);
+				   Tg5 = VSUB(T9N, T9O);
+				   STM4(&(io[57]), Tg5, ovs, &(io[1]));
+				   STN4(&(io[56]), TeV, Tg5, Tfj, TfP, ovs);
+				   Tg6 = VADD(T9N, T9O);
+				   STM4(&(io[25]), Tg6, ovs, &(io[1]));
+				   STN4(&(io[24]), TeW, Tg6, Tfk, TfQ, ovs);
+			      }
+			      {
+				   V T9P, T9S, Tg7, Tg8;
+				   T9P = VSUB(T9j, T9q);
+				   T9S = VSUB(T9Q, T9R);
+				   Tg7 = VSUB(T9P, T9S);
+				   STM4(&(ro[57]), Tg7, ovs, &(ro[1]));
+				   STN4(&(ro[56]), TeX, Tg7, Tfl, TfR, ovs);
+				   Tg8 = VADD(T9P, T9S);
+				   STM4(&(ro[25]), Tg8, ovs, &(ro[1]));
+				   STN4(&(ro[24]), TeY, Tg8, Tfm, TfS, ovs);
+			      }
+			      {
+				   V T9X, Ta4, Tg9, Tga;
+				   T9X = VADD(T9V, T9W);
+				   Ta4 = VADD(Ta0, Ta3);
+				   Tg9 = VSUB(T9X, Ta4);
+				   STM4(&(ro[33]), Tg9, ovs, &(ro[1]));
+				   STN4(&(ro[32]), TeJ, Tg9, Tfn, TfT, ovs);
+				   Tga = VADD(T9X, Ta4);
+				   STM4(&(ro[1]), Tga, ovs, &(ro[1]));
+				   STN4(&(ro[0]), TeL, Tga, Tfo, TfU, ovs);
+			      }
+			      {
+				   V Tad, Tae, Tgb, Tgc;
+				   Tad = VADD(Ta5, Ta6);
+				   Tae = VADD(Taa, Tab);
+				   Tgb = VSUB(Tad, Tae);
+				   STM4(&(io[33]), Tgb, ovs, &(io[1]));
+				   STN4(&(io[32]), TeK, Tgb, Tfp, TfV, ovs);
+				   Tgc = VADD(Tad, Tae);
+				   STM4(&(io[1]), Tgc, ovs, &(io[1]));
+				   STN4(&(io[0]), TeM, Tgc, Tfq, TfW, ovs);
+			      }
+			      {
+				   V Ta7, Ta8, Tgd, Tge;
+				   Ta7 = VSUB(Ta5, Ta6);
+				   Ta8 = VSUB(Ta3, Ta0);
+				   Tgd = VSUB(Ta7, Ta8);
+				   STM4(&(io[49]), Tgd, ovs, &(io[1]));
+				   STN4(&(io[48]), TeP, Tgd, Tfr, TfX, ovs);
+				   Tge = VADD(Ta7, Ta8);
+				   STM4(&(io[17]), Tge, ovs, &(io[1]));
+				   STN4(&(io[16]), TeN, Tge, Tfs, TfY, ovs);
+			      }
+			      {
+				   V Ta9, Tac, Tgf, Tgg;
+				   Ta9 = VSUB(T9V, T9W);
+				   Tac = VSUB(Taa, Tab);
+				   Tgf = VSUB(Ta9, Tac);
+				   STM4(&(ro[49]), Tgf, ovs, &(ro[1]));
+				   STN4(&(ro[48]), TeQ, Tgf, Tft, TfZ, ovs);
+				   Tgg = VADD(Ta9, Tac);
+				   STM4(&(ro[17]), Tgg, ovs, &(ro[1]));
+				   STN4(&(ro[16]), TeO, Tgg, Tfu, Tg0, ovs);
+			      }
+			 }
+			 {
+			      V Tgh, Tgi, Tgj, Tgk, Tgl, Tgm, Tgn, Tgo, Tgp, Tgq, Tgr, Tgs, Tgt, Tgu, Tgv;
+			      V Tgw;
+			      {
+				   V T3v, T6j, T6o, T6y, T6r, T6z, T48, T6u, T52, T6e, T67, T6t, T6a, T6k, T5V;
+				   V T6f;
+				   {
+					V T3f, T3u, T6m, T6n;
+					T3f = VSUB(T37, T3e);
+					T3u = VSUB(T3m, T3t);
+					T3v = VSUB(T3f, T3u);
+					T6j = VADD(T3f, T3u);
+					T6m = VADD(T4q, T4N);
+					T6n = VADD(T4X, T50);
+					T6o = VFMA(LDK(KP634393284), T6m, VMUL(LDK(KP773010453), T6n));
+					T6y = VFNMS(LDK(KP634393284), T6n, VMUL(LDK(KP773010453), T6m));
+				   }
+				   {
+					V T6p, T6q, T3O, T47;
+					T6p = VADD(T5j, T5G);
+					T6q = VADD(T5Q, T5T);
+					T6r = VFNMS(LDK(KP634393284), T6q, VMUL(LDK(KP773010453), T6p));
+					T6z = VFMA(LDK(KP773010453), T6q, VMUL(LDK(KP634393284), T6p));
+					T3O = VFNMS(LDK(KP980785280), T3N, VMUL(LDK(KP195090322), T3G));
+					T47 = VFMA(LDK(KP195090322), T3Z, VMUL(LDK(KP980785280), T46));
+					T48 = VSUB(T3O, T47);
+					T6u = VADD(T3O, T47);
+				   }
+				   {
+					V T4O, T51, T63, T66;
+					T4O = VSUB(T4q, T4N);
+					T51 = VSUB(T4X, T50);
+					T52 = VFMA(LDK(KP995184726), T4O, VMUL(LDK(KP098017140), T51));
+					T6e = VFNMS(LDK(KP995184726), T51, VMUL(LDK(KP098017140), T4O));
+					T63 = VSUB(T5Z, T62);
+					T66 = VSUB(T64, T65);
+					T67 = VSUB(T63, T66);
+					T6t = VADD(T63, T66);
+				   }
+				   {
+					V T68, T69, T5H, T5U;
+					T68 = VFNMS(LDK(KP980785280), T3Z, VMUL(LDK(KP195090322), T46));
+					T69 = VFMA(LDK(KP980785280), T3G, VMUL(LDK(KP195090322), T3N));
+					T6a = VSUB(T68, T69);
+					T6k = VADD(T69, T68);
+					T5H = VSUB(T5j, T5G);
+					T5U = VSUB(T5Q, T5T);
+					T5V = VFNMS(LDK(KP995184726), T5U, VMUL(LDK(KP098017140), T5H));
+					T6f = VFMA(LDK(KP098017140), T5U, VMUL(LDK(KP995184726), T5H));
+				   }
+				   {
+					V T49, T5W, T6h, T6i;
+					T49 = VADD(T3v, T48);
+					T5W = VADD(T52, T5V);
+					Tgh = VSUB(T49, T5W);
+					STM4(&(ro[47]), Tgh, ovs, &(ro[1]));
+					Tgi = VADD(T49, T5W);
+					STM4(&(ro[15]), Tgi, ovs, &(ro[1]));
+					T6h = VADD(T67, T6a);
+					T6i = VADD(T6e, T6f);
+					Tgj = VSUB(T6h, T6i);
+					STM4(&(io[47]), Tgj, ovs, &(io[1]));
+					Tgk = VADD(T6h, T6i);
+					STM4(&(io[15]), Tgk, ovs, &(io[1]));
+				   }
+				   {
+					V T6b, T6c, T6d, T6g;
+					T6b = VSUB(T67, T6a);
+					T6c = VSUB(T5V, T52);
+					Tgl = VSUB(T6b, T6c);
+					STM4(&(io[63]), Tgl, ovs, &(io[1]));
+					Tgm = VADD(T6b, T6c);
+					STM4(&(io[31]), Tgm, ovs, &(io[1]));
+					T6d = VSUB(T3v, T48);
+					T6g = VSUB(T6e, T6f);
+					Tgn = VSUB(T6d, T6g);
+					STM4(&(ro[63]), Tgn, ovs, &(ro[1]));
+					Tgo = VADD(T6d, T6g);
+					STM4(&(ro[31]), Tgo, ovs, &(ro[1]));
+				   }
+				   {
+					V T6l, T6s, T6B, T6C;
+					T6l = VADD(T6j, T6k);
+					T6s = VADD(T6o, T6r);
+					Tgp = VSUB(T6l, T6s);
+					STM4(&(ro[39]), Tgp, ovs, &(ro[1]));
+					Tgq = VADD(T6l, T6s);
+					STM4(&(ro[7]), Tgq, ovs, &(ro[1]));
+					T6B = VADD(T6t, T6u);
+					T6C = VADD(T6y, T6z);
+					Tgr = VSUB(T6B, T6C);
+					STM4(&(io[39]), Tgr, ovs, &(io[1]));
+					Tgs = VADD(T6B, T6C);
+					STM4(&(io[7]), Tgs, ovs, &(io[1]));
+				   }
+				   {
+					V T6v, T6w, T6x, T6A;
+					T6v = VSUB(T6t, T6u);
+					T6w = VSUB(T6r, T6o);
+					Tgt = VSUB(T6v, T6w);
+					STM4(&(io[55]), Tgt, ovs, &(io[1]));
+					Tgu = VADD(T6v, T6w);
+					STM4(&(io[23]), Tgu, ovs, &(io[1]));
+					T6x = VSUB(T6j, T6k);
+					T6A = VSUB(T6y, T6z);
+					Tgv = VSUB(T6x, T6A);
+					STM4(&(ro[55]), Tgv, ovs, &(ro[1]));
+					Tgw = VADD(T6x, T6A);
+					STM4(&(ro[23]), Tgw, ovs, &(ro[1]));
+				   }
+			      }
+			      {
+				   V T7L, T8X, T92, T9c, T95, T9d, T80, T98, T8k, T8S, T8L, T97, T8O, T8Y, T8D;
+				   V T8T;
+				   {
+					V T7D, T7K, T90, T91;
+					T7D = VSUB(T7B, T7C);
+					T7K = VSUB(T7G, T7J);
+					T7L = VSUB(T7D, T7K);
+					T8X = VADD(T7D, T7K);
+					T90 = VADD(T84, T8b);
+					T91 = VADD(T8f, T8i);
+					T92 = VFMA(LDK(KP471396736), T90, VMUL(LDK(KP881921264), T91));
+					T9c = VFNMS(LDK(KP471396736), T91, VMUL(LDK(KP881921264), T90));
+				   }
+				   {
+					V T93, T94, T7S, T7Z;
+					T93 = VADD(T8n, T8u);
+					T94 = VADD(T8y, T8B);
+					T95 = VFNMS(LDK(KP471396736), T94, VMUL(LDK(KP881921264), T93));
+					T9d = VFMA(LDK(KP881921264), T94, VMUL(LDK(KP471396736), T93));
+					T7S = VFNMS(LDK(KP831469612), T7R, VMUL(LDK(KP555570233), T7O));
+					T7Z = VFMA(LDK(KP831469612), T7V, VMUL(LDK(KP555570233), T7Y));
+					T80 = VSUB(T7S, T7Z);
+					T98 = VADD(T7S, T7Z);
+				   }
+				   {
+					V T8c, T8j, T8H, T8K;
+					T8c = VSUB(T84, T8b);
+					T8j = VSUB(T8f, T8i);
+					T8k = VFMA(LDK(KP956940335), T8c, VMUL(LDK(KP290284677), T8j));
+					T8S = VFNMS(LDK(KP956940335), T8j, VMUL(LDK(KP290284677), T8c));
+					T8H = VSUB(T8F, T8G);
+					T8K = VSUB(T8I, T8J);
+					T8L = VSUB(T8H, T8K);
+					T97 = VADD(T8H, T8K);
+				   }
+				   {
+					V T8M, T8N, T8v, T8C;
+					T8M = VFNMS(LDK(KP831469612), T7Y, VMUL(LDK(KP555570233), T7V));
+					T8N = VFMA(LDK(KP555570233), T7R, VMUL(LDK(KP831469612), T7O));
+					T8O = VSUB(T8M, T8N);
+					T8Y = VADD(T8N, T8M);
+					T8v = VSUB(T8n, T8u);
+					T8C = VSUB(T8y, T8B);
+					T8D = VFNMS(LDK(KP956940335), T8C, VMUL(LDK(KP290284677), T8v));
+					T8T = VFMA(LDK(KP290284677), T8C, VMUL(LDK(KP956940335), T8v));
+				   }
+				   {
+					V T81, T8E, Tgx, Tgy;
+					T81 = VADD(T7L, T80);
+					T8E = VADD(T8k, T8D);
+					Tgx = VSUB(T81, T8E);
+					STM4(&(ro[45]), Tgx, ovs, &(ro[1]));
+					STN4(&(ro[44]), TeZ, Tgx, Tfv, Tgh, ovs);
+					Tgy = VADD(T81, T8E);
+					STM4(&(ro[13]), Tgy, ovs, &(ro[1]));
+					STN4(&(ro[12]), Tf0, Tgy, Tfw, Tgi, ovs);
+				   }
+				   {
+					V T8V, T8W, Tgz, TgA;
+					T8V = VADD(T8L, T8O);
+					T8W = VADD(T8S, T8T);
+					Tgz = VSUB(T8V, T8W);
+					STM4(&(io[45]), Tgz, ovs, &(io[1]));
+					STN4(&(io[44]), Tf1, Tgz, Tfx, Tgj, ovs);
+					TgA = VADD(T8V, T8W);
+					STM4(&(io[13]), TgA, ovs, &(io[1]));
+					STN4(&(io[12]), Tf2, TgA, Tfy, Tgk, ovs);
+				   }
+				   {
+					V T8P, T8Q, TgB, TgC;
+					T8P = VSUB(T8L, T8O);
+					T8Q = VSUB(T8D, T8k);
+					TgB = VSUB(T8P, T8Q);
+					STM4(&(io[61]), TgB, ovs, &(io[1]));
+					STN4(&(io[60]), Tf3, TgB, Tfz, Tgl, ovs);
+					TgC = VADD(T8P, T8Q);
+					STM4(&(io[29]), TgC, ovs, &(io[1]));
+					STN4(&(io[28]), Tf4, TgC, TfA, Tgm, ovs);
+				   }
+				   {
+					V T8R, T8U, TgD, TgE;
+					T8R = VSUB(T7L, T80);
+					T8U = VSUB(T8S, T8T);
+					TgD = VSUB(T8R, T8U);
+					STM4(&(ro[61]), TgD, ovs, &(ro[1]));
+					STN4(&(ro[60]), Tf5, TgD, TfB, Tgn, ovs);
+					TgE = VADD(T8R, T8U);
+					STM4(&(ro[29]), TgE, ovs, &(ro[1]));
+					STN4(&(ro[28]), Tf6, TgE, TfC, Tgo, ovs);
+				   }
+				   {
+					V T8Z, T96, TgF, TgG;
+					T8Z = VADD(T8X, T8Y);
+					T96 = VADD(T92, T95);
+					TgF = VSUB(T8Z, T96);
+					STM4(&(ro[37]), TgF, ovs, &(ro[1]));
+					STN4(&(ro[36]), Tf7, TgF, TfD, Tgp, ovs);
+					TgG = VADD(T8Z, T96);
+					STM4(&(ro[5]), TgG, ovs, &(ro[1]));
+					STN4(&(ro[4]), Tf8, TgG, TfE, Tgq, ovs);
+				   }
+				   {
+					V T9f, T9g, TgH, TgI;
+					T9f = VADD(T97, T98);
+					T9g = VADD(T9c, T9d);
+					TgH = VSUB(T9f, T9g);
+					STM4(&(io[37]), TgH, ovs, &(io[1]));
+					STN4(&(io[36]), Tf9, TgH, TfF, Tgr, ovs);
+					TgI = VADD(T9f, T9g);
+					STM4(&(io[5]), TgI, ovs, &(io[1]));
+					STN4(&(io[4]), Tfa, TgI, TfG, Tgs, ovs);
+				   }
+				   {
+					V T99, T9a, TgJ, TgK;
+					T99 = VSUB(T97, T98);
+					T9a = VSUB(T95, T92);
+					TgJ = VSUB(T99, T9a);
+					STM4(&(io[53]), TgJ, ovs, &(io[1]));
+					STN4(&(io[52]), Tfb, TgJ, TfH, Tgt, ovs);
+					TgK = VADD(T99, T9a);
+					STM4(&(io[21]), TgK, ovs, &(io[1]));
+					STN4(&(io[20]), Tfc, TgK, TfI, Tgu, ovs);
+				   }
+				   {
+					V T9b, T9e, TgL, TgM;
+					T9b = VSUB(T8X, T8Y);
+					T9e = VSUB(T9c, T9d);
+					TgL = VSUB(T9b, T9e);
+					STM4(&(ro[53]), TgL, ovs, &(ro[1]));
+					STN4(&(ro[52]), Tfd, TgL, TfJ, Tgv, ovs);
+					TgM = VADD(T9b, T9e);
+					STM4(&(ro[21]), TgM, ovs, &(ro[1]));
+					STN4(&(ro[20]), Tfe, TgM, TfK, Tgw, ovs);
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 64, XSIMD_STRING("n2sv_64"), {808, 144, 104, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_64) (planner *p) {
+     X(kdft_register) (p, n2sv_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/n2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/n2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:47 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 52 FP additions, 8 FP multiplications,
+ * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
+ * 58 stack variables, 1 constants, and 36 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V TF, TJ, TD, TR, TS, TT, TU, TV, TW, TE, TX, TY, TK, TI, TZ;
+	       V T10, T11, T12;
+	       {
+		    V Tb, Tn, T3, TC, Ti, TB, T6, To, Tl, Tc, Tw, Tx, T8, T9, Tr;
+		    V Ts;
+		    {
+			 V T1, T2, Tg, Th, T4, T5, Tj, Tk;
+			 T1 = LD(&(ri[0]), ivs, &(ri[0]));
+			 T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+			 Tg = LD(&(ii[0]), ivs, &(ii[0]));
+			 Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			 T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+			 T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+			 Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+			 Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+			 Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+			 Tn = VSUB(T1, T2);
+			 T3 = VADD(T1, T2);
+			 TC = VSUB(Tg, Th);
+			 Ti = VADD(Tg, Th);
+			 TB = VSUB(T4, T5);
+			 T6 = VADD(T4, T5);
+			 To = VSUB(Tj, Tk);
+			 Tl = VADD(Tj, Tk);
+			 Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+			 Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+			 Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+			 T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+			 T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+			 Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+			 Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+		    }
+		    {
+			 V TL, T7, TP, Tm, Tz, TH, Te, Tf, TO, TQ, TG, Tu, Tp, TA;
+			 {
+			      V Td, Tv, TN, Ty, Ta, Tq, TM, Tt;
+			      TL = VSUB(T3, T6);
+			      T7 = VADD(T3, T6);
+			      Td = VADD(Tb, Tc);
+			      Tv = VSUB(Tb, Tc);
+			      TN = VADD(Tw, Tx);
+			      Ty = VSUB(Tw, Tx);
+			      Ta = VADD(T8, T9);
+			      Tq = VSUB(T8, T9);
+			      TM = VADD(Tr, Ts);
+			      Tt = VSUB(Tr, Ts);
+			      TP = VADD(Ti, Tl);
+			      Tm = VSUB(Ti, Tl);
+			      Tz = VSUB(Tv, Ty);
+			      TH = VADD(Tv, Ty);
+			      Te = VADD(Ta, Td);
+			      Tf = VSUB(Td, Ta);
+			      TO = VSUB(TM, TN);
+			      TQ = VADD(TM, TN);
+			      TG = VSUB(Tt, Tq);
+			      Tu = VADD(Tq, Tt);
+			 }
+			 TF = VSUB(Tn, To);
+			 Tp = VADD(Tn, To);
+			 TJ = VSUB(TC, TB);
+			 TD = VADD(TB, TC);
+			 TR = VSUB(Tm, Tf);
+			 STM4(&(io[6]), TR, ovs, &(io[0]));
+			 TS = VADD(Tf, Tm);
+			 STM4(&(io[2]), TS, ovs, &(io[0]));
+			 TT = VADD(T7, Te);
+			 STM4(&(ro[0]), TT, ovs, &(ro[0]));
+			 TU = VSUB(T7, Te);
+			 STM4(&(ro[4]), TU, ovs, &(ro[0]));
+			 TV = VADD(TP, TQ);
+			 STM4(&(io[0]), TV, ovs, &(io[0]));
+			 TW = VSUB(TP, TQ);
+			 STM4(&(io[4]), TW, ovs, &(io[0]));
+			 TE = VSUB(Tz, Tu);
+			 TA = VADD(Tu, Tz);
+			 TX = VADD(TL, TO);
+			 STM4(&(ro[2]), TX, ovs, &(ro[0]));
+			 TY = VSUB(TL, TO);
+			 STM4(&(ro[6]), TY, ovs, &(ro[0]));
+			 TK = VADD(TG, TH);
+			 TI = VSUB(TG, TH);
+			 TZ = VFMA(LDK(KP707106781), TA, Tp);
+			 STM4(&(ro[1]), TZ, ovs, &(ro[1]));
+			 T10 = VFNMS(LDK(KP707106781), TA, Tp);
+			 STM4(&(ro[5]), T10, ovs, &(ro[1]));
+		    }
+	       }
+	       T11 = VFMA(LDK(KP707106781), TK, TJ);
+	       STM4(&(io[1]), T11, ovs, &(io[1]));
+	       T12 = VFNMS(LDK(KP707106781), TK, TJ);
+	       STM4(&(io[5]), T12, ovs, &(io[1]));
+	       {
+		    V T13, T14, T15, T16;
+		    T13 = VFMA(LDK(KP707106781), TE, TD);
+		    STM4(&(io[3]), T13, ovs, &(io[1]));
+		    STN4(&(io[0]), TV, T11, TS, T13, ovs);
+		    T14 = VFNMS(LDK(KP707106781), TE, TD);
+		    STM4(&(io[7]), T14, ovs, &(io[1]));
+		    STN4(&(io[4]), TW, T12, TR, T14, ovs);
+		    T15 = VFMA(LDK(KP707106781), TI, TF);
+		    STM4(&(ro[3]), T15, ovs, &(ro[1]));
+		    STN4(&(ro[0]), TT, TZ, TX, T15, ovs);
+		    T16 = VFNMS(LDK(KP707106781), TI, TF);
+		    STM4(&(ro[7]), T16, ovs, &(ro[1]));
+		    STN4(&(ro[4]), TU, T10, TY, T16, ovs);
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {44, 0, 8, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_8) (planner *p) {
+     X(kdft_register) (p, n2sv_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include n2s.h -store-multiple 4 */
+
+/*
+ * This function contains 52 FP additions, 4 FP multiplications,
+ * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
+ * 34 stack variables, 1 constants, and 36 memory accesses
+ */
+#include "n2s.h"
+
+static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
+	       V T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
+	       V TG;
+	       {
+		    V T1, T2, Tj, Tk;
+		    T1 = LD(&(ri[0]), ivs, &(ri[0]));
+		    T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
+		    T3 = VADD(T1, T2);
+		    Tn = VSUB(T1, T2);
+		    {
+			 V Tg, Th, T4, T5;
+			 Tg = LD(&(ii[0]), ivs, &(ii[0]));
+			 Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
+			 Ti = VADD(Tg, Th);
+			 TC = VSUB(Tg, Th);
+			 T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
+			 T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
+			 T6 = VADD(T4, T5);
+			 TB = VSUB(T4, T5);
+		    }
+		    Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
+		    Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
+		    Tl = VADD(Tj, Tk);
+		    To = VSUB(Tj, Tk);
+		    {
+			 V Tb, Tc, Tv, Tw, Tx, Ty;
+			 Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
+			 Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
+			 Tv = VSUB(Tb, Tc);
+			 Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
+			 Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
+			 Ty = VSUB(Tw, Tx);
+			 Td = VADD(Tb, Tc);
+			 TN = VADD(Tw, Tx);
+			 Tz = VSUB(Tv, Ty);
+			 TH = VADD(Tv, Ty);
+		    }
+		    {
+			 V T8, T9, Tq, Tr, Ts, Tt;
+			 T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
+			 T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
+			 Tq = VSUB(T8, T9);
+			 Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
+			 Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
+			 Tt = VSUB(Tr, Ts);
+			 Ta = VADD(T8, T9);
+			 TM = VADD(Tr, Ts);
+			 Tu = VADD(Tq, Tt);
+			 TG = VSUB(Tt, Tq);
+		    }
+	       }
+	       {
+		    V TR, TS, TT, TU, TV, TW, TX, TY;
+		    {
+			 V T7, Te, TP, TQ;
+			 T7 = VADD(T3, T6);
+			 Te = VADD(Ta, Td);
+			 TR = VSUB(T7, Te);
+			 STM4(&(ro[4]), TR, ovs, &(ro[0]));
+			 TS = VADD(T7, Te);
+			 STM4(&(ro[0]), TS, ovs, &(ro[0]));
+			 TP = VADD(Ti, Tl);
+			 TQ = VADD(TM, TN);
+			 TT = VSUB(TP, TQ);
+			 STM4(&(io[4]), TT, ovs, &(io[0]));
+			 TU = VADD(TP, TQ);
+			 STM4(&(io[0]), TU, ovs, &(io[0]));
+		    }
+		    {
+			 V Tf, Tm, TL, TO;
+			 Tf = VSUB(Td, Ta);
+			 Tm = VSUB(Ti, Tl);
+			 TV = VADD(Tf, Tm);
+			 STM4(&(io[2]), TV, ovs, &(io[0]));
+			 TW = VSUB(Tm, Tf);
+			 STM4(&(io[6]), TW, ovs, &(io[0]));
+			 TL = VSUB(T3, T6);
+			 TO = VSUB(TM, TN);
+			 TX = VSUB(TL, TO);
+			 STM4(&(ro[6]), TX, ovs, &(ro[0]));
+			 TY = VADD(TL, TO);
+			 STM4(&(ro[2]), TY, ovs, &(ro[0]));
+		    }
+		    {
+			 V TZ, T10, T11, T12;
+			 {
+			      V Tp, TA, TJ, TK;
+			      Tp = VADD(Tn, To);
+			      TA = VMUL(LDK(KP707106781), VADD(Tu, Tz));
+			      TZ = VSUB(Tp, TA);
+			      STM4(&(ro[5]), TZ, ovs, &(ro[1]));
+			      T10 = VADD(Tp, TA);
+			      STM4(&(ro[1]), T10, ovs, &(ro[1]));
+			      TJ = VSUB(TC, TB);
+			      TK = VMUL(LDK(KP707106781), VADD(TG, TH));
+			      T11 = VSUB(TJ, TK);
+			      STM4(&(io[5]), T11, ovs, &(io[1]));
+			      T12 = VADD(TJ, TK);
+			      STM4(&(io[1]), T12, ovs, &(io[1]));
+			 }
+			 {
+			      V TD, TE, T13, T14;
+			      TD = VADD(TB, TC);
+			      TE = VMUL(LDK(KP707106781), VSUB(Tz, Tu));
+			      T13 = VSUB(TD, TE);
+			      STM4(&(io[7]), T13, ovs, &(io[1]));
+			      STN4(&(io[4]), TT, T11, TW, T13, ovs);
+			      T14 = VADD(TD, TE);
+			      STM4(&(io[3]), T14, ovs, &(io[1]));
+			      STN4(&(io[0]), TU, T12, TV, T14, ovs);
+			 }
+			 {
+			      V TF, TI, T15, T16;
+			      TF = VSUB(Tn, To);
+			      TI = VMUL(LDK(KP707106781), VSUB(TG, TH));
+			      T15 = VSUB(TF, TI);
+			      STM4(&(ro[7]), T15, ovs, &(ro[1]));
+			      STN4(&(ro[4]), TR, TZ, TX, T15, ovs);
+			      T16 = VADD(TF, TI);
+			      STM4(&(ro[3]), T16, ovs, &(ro[1]));
+			      STN4(&(ro[0]), TS, T10, TY, T16, ovs);
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {52, 4, 0, 0}, &GENUS, 0, 1, 0, 0 };
+
+void XSIMD(codelet_n2sv_8) (planner *p) {
+     X(kdft_register) (p, n2sv_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:33 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -dif -name q1bv_2 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 8 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_2(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(4, vs)) {
+	       V T1, T2, T4, T5, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+	       T5 = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T2), ms, &(x[0]));
+	       T3 = BYTW(&(W[0]), VSUB(T1, T2));
+	       ST(&(x[WS(rs, 1)]), VADD(T4, T5), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTW(&(W[0]), VSUB(T4, T5));
+	       ST(&(x[WS(vs, 1)]), T3, ms, &(x[WS(vs, 1)]));
+	       ST(&(x[WS(vs, 1) + WS(rs, 1)]), T6, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("q1bv_2"), twinstr, &GENUS, {6, 4, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -dif -name q1bv_2 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 8 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_2(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(4, vs)) {
+	       V T1, T2, T3, T4, T5, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), VSUB(T1, T2));
+	       T4 = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+	       T5 = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       T6 = BYTW(&(W[0]), VSUB(T4, T5));
+	       ST(&(x[WS(vs, 1)]), T3, ms, &(x[WS(vs, 1)]));
+	       ST(&(x[WS(vs, 1) + WS(rs, 1)]), T6, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T2), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(T4, T5), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("q1bv_2"), twinstr, &GENUS, {6, 4, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:33 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1bv_4 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 44 FP additions, 32 FP multiplications,
+ * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
+ * 38 stack variables, 0 constants, and 32 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
+	       V Tb, Tm, Tx, TI;
+	       {
+		    V Tc, T9, T3, TG, TA, TH, TD, Ta, T6, Td, Tn, To, Tq, Tr, Tf;
+		    V Tg;
+		    {
+			 V T1, T2, Ty, Tz, TB, TC, T4, T5;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+			 Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+			 TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+			 T9 = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TG = VADD(Ty, Tz);
+			 TA = VSUB(Ty, Tz);
+			 TH = VADD(TB, TC);
+			 TD = VSUB(TB, TC);
+			 Ta = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+			 Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+			 To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+			 Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    }
+		    {
+			 V Tk, Te, Tv, Tp, Tw, Ts, Tl, Th, T7, TE, Tu, TF;
+			 ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
+			 Tk = VADD(Tc, Td);
+			 Te = VSUB(Tc, Td);
+			 Tv = VADD(Tn, To);
+			 Tp = VSUB(Tn, To);
+			 Tw = VADD(Tq, Tr);
+			 Ts = VSUB(Tq, Tr);
+			 Tl = VADD(Tf, Tg);
+			 Th = VSUB(Tf, Tg);
+			 ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTW(&(W[TWVL * 4]), VFNMSI(T6, T3));
+			 TE = BYTW(&(W[TWVL * 4]), VFNMSI(TD, TA));
+			 {
+			      V Tt, Ti, Tj, T8;
+			      T8 = BYTW(&(W[0]), VFMAI(T6, T3));
+			      ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
+			      Tt = BYTW(&(W[TWVL * 4]), VFNMSI(Ts, Tp));
+			      ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
+			      Ti = BYTW(&(W[TWVL * 4]), VFNMSI(Th, Te));
+			      Tj = BYTW(&(W[0]), VFMAI(Th, Te));
+			      ST(&(x[WS(vs, 3)]), T7, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1)]), T8, ms, &(x[WS(vs, 1)]));
+			      Tu = BYTW(&(W[0]), VFMAI(Ts, Tp));
+			      ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 3)]));
+			      TF = BYTW(&(W[0]), VFMAI(TD, TA));
+			      ST(&(x[WS(vs, 3) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 }
+			 Tb = BYTW(&(W[TWVL * 2]), VSUB(T9, Ta));
+			 Tm = BYTW(&(W[TWVL * 2]), VSUB(Tk, Tl));
+			 Tx = BYTW(&(W[TWVL * 2]), VSUB(Tv, Tw));
+			 ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 1)]));
+			 TI = BYTW(&(W[TWVL * 2]), VSUB(TG, TH));
+			 ST(&(x[WS(vs, 1) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    }
+	       }
+	       ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("q1bv_4"), twinstr, &GENUS, {36, 24, 8, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1bv_4 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 44 FP additions, 24 FP multiplications,
+ * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
+ * 22 stack variables, 0 constants, and 32 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
+	       V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
+	       V Tl;
+	       {
+		    V T1, T2, Ty, Tz;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T3 = VSUB(T1, T2);
+		    T9 = VADD(T1, T2);
+		    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+		    TA = VSUB(Ty, Tz);
+		    TG = VADD(Ty, Tz);
+	       }
+	       {
+		    V TB, TC, T4, T5;
+		    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TD = VBYI(VSUB(TB, TC));
+		    TH = VADD(TB, TC);
+		    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    Ta = VADD(T4, T5);
+	       }
+	       {
+		    V Tc, Td, Tn, To;
+		    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+		    Tp = VSUB(Tn, To);
+		    Tv = VADD(Tn, To);
+	       }
+	       {
+		    V Tq, Tr, Tf, Tg;
+		    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Ts = VBYI(VSUB(Tq, Tr));
+		    Tw = VADD(Tq, Tr);
+		    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Th = VBYI(VSUB(Tf, Tg));
+		    Tl = VADD(Tf, Tg);
+	       }
+	       ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T7, Ti, Tt, TE;
+		    T7 = BYTW(&(W[TWVL * 4]), VSUB(T3, T6));
+		    ST(&(x[WS(vs, 3)]), T7, ms, &(x[WS(vs, 3)]));
+		    Ti = BYTW(&(W[TWVL * 4]), VSUB(Te, Th));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    Tt = BYTW(&(W[TWVL * 4]), VSUB(Tp, Ts));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 3)]));
+		    TE = BYTW(&(W[TWVL * 4]), VSUB(TA, TD));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V T8, Tj, Tu, TF;
+		    T8 = BYTW(&(W[0]), VADD(T3, T6));
+		    ST(&(x[WS(vs, 1)]), T8, ms, &(x[WS(vs, 1)]));
+		    Tj = BYTW(&(W[0]), VADD(Te, Th));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Tu = BYTW(&(W[0]), VADD(Tp, Ts));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 1)]));
+		    TF = BYTW(&(W[0]), VADD(TA, TD));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V Tb, Tm, Tx, TI;
+		    Tb = BYTW(&(W[TWVL * 2]), VSUB(T9, Ta));
+		    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
+		    Tm = BYTW(&(W[TWVL * 2]), VSUB(Tk, Tl));
+		    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Tx = BYTW(&(W[TWVL * 2]), VSUB(Tv, Tw));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+		    TI = BYTW(&(W[TWVL * 2]), VSUB(TG, TH));
+		    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("q1bv_4"), twinstr, &GENUS, {44, 24, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:33 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1bv_5 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 100 FP additions, 95 FP multiplications,
+ * (or, 55 additions, 50 multiplications, 45 fused multiply/add),
+ * 69 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
+	       V Te, T1w, Ty, TS, TW, Tb, T1t, Tv, T1g, T1c, TP, TV, T1f, T19, TY;
+	       V TX;
+	       {
+		    V T1, T1j, Tl, Ti, Ta, T8, T1A, T1q, T1s, T9, TF, T1r, TZ, TR, TL;
+		    V TC, Ts, Tu, TQ, TI, T15, T1b, T10, T11, Tt;
+		    {
+			 V T1n, T1o, T1k, T1l, T7, Td, T4, Tc;
+			 {
+			      V T5, T6, T2, T3;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      T1j = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+			      T1n = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+			      T1o = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      T1k = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      T1l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+			      T7 = VADD(T5, T6);
+			      Td = VSUB(T5, T6);
+			      T4 = VADD(T2, T3);
+			      Tc = VSUB(T2, T3);
+			 }
+			 {
+			      V Tm, Tn, Tr, Tx, T1v, T1p;
+			      Tl = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+			      T1v = VSUB(T1n, T1o);
+			      T1p = VADD(T1n, T1o);
+			      {
+				   V T1u, T1m, Tp, Tq;
+				   T1u = VSUB(T1k, T1l);
+				   T1m = VADD(T1k, T1l);
+				   Tp = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+				   Ti = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
+				   Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
+				   Ta = VSUB(T4, T7);
+				   T8 = VADD(T4, T7);
+				   Tq = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+				   T1w = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1v, T1u));
+				   T1A = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1u, T1v));
+				   T1q = VADD(T1m, T1p);
+				   T1s = VSUB(T1m, T1p);
+				   Tm = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+				   T9 = VFNMS(LDK(KP250000000), T8, T1);
+				   Tn = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+				   Tr = VADD(Tp, Tq);
+				   Tx = VSUB(Tp, Tq);
+			      }
+			      {
+				   V TJ, TK, TG, Tw, To, TH, T13, T14;
+				   TF = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+				   T1r = VFNMS(LDK(KP250000000), T1q, T1j);
+				   TJ = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+				   TK = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   TG = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   Tw = VSUB(Tm, Tn);
+				   To = VADD(Tm, Tn);
+				   TH = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+				   TZ = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+				   T13 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+				   T14 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   TR = VSUB(TJ, TK);
+				   TL = VADD(TJ, TK);
+				   Ty = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tx, Tw));
+				   TC = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tw, Tx));
+				   Ts = VADD(To, Tr);
+				   Tu = VSUB(To, Tr);
+				   TQ = VSUB(TG, TH);
+				   TI = VADD(TG, TH);
+				   T15 = VADD(T13, T14);
+				   T1b = VSUB(T13, T14);
+				   T10 = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   T11 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+				   Tt = VFNMS(LDK(KP250000000), Ts, Tl);
+			      }
+			 }
+		    }
+		    {
+			 V TO, T12, T1a, Th, T1z, TN, TM, T18, T17;
+			 ST(&(x[0]), VADD(T1, T8), ms, &(x[0]));
+			 TS = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TR, TQ));
+			 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TQ, TR));
+			 TM = VADD(TI, TL);
+			 TO = VSUB(TI, TL);
+			 ST(&(x[WS(rs, 4)]), VADD(T1j, T1q), ms, &(x[0]));
+			 T12 = VADD(T10, T11);
+			 T1a = VSUB(T10, T11);
+			 ST(&(x[WS(rs, 1)]), VADD(Tl, Ts), ms, &(x[WS(rs, 1)]));
+			 Th = VFNMS(LDK(KP559016994), Ta, T9);
+			 Tb = VFMA(LDK(KP559016994), Ta, T9);
+			 T1t = VFMA(LDK(KP559016994), T1s, T1r);
+			 T1z = VFNMS(LDK(KP559016994), T1s, T1r);
+			 ST(&(x[WS(rs, 2)]), VADD(TF, TM), ms, &(x[0]));
+			 TN = VFNMS(LDK(KP250000000), TM, TF);
+			 {
+			      V T16, Tk, Tj, T1C, T1B, TD, TE, TB;
+			      TB = VFNMS(LDK(KP559016994), Tu, Tt);
+			      Tv = VFMA(LDK(KP559016994), Tu, Tt);
+			      T1g = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1a, T1b));
+			      T1c = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1b, T1a));
+			      T18 = VSUB(T12, T15);
+			      T16 = VADD(T12, T15);
+			      Tk = BYTW(&(W[TWVL * 4]), VFMAI(Ti, Th));
+			      Tj = BYTW(&(W[TWVL * 2]), VFNMSI(Ti, Th));
+			      T1C = BYTW(&(W[TWVL * 4]), VFMAI(T1A, T1z));
+			      T1B = BYTW(&(W[TWVL * 2]), VFNMSI(T1A, T1z));
+			      TD = BYTW(&(W[TWVL * 2]), VFNMSI(TC, TB));
+			      TE = BYTW(&(W[TWVL * 4]), VFMAI(TC, TB));
+			      ST(&(x[WS(rs, 3)]), VADD(TZ, T16), ms, &(x[WS(rs, 1)]));
+			      T17 = VFNMS(LDK(KP250000000), T16, TZ);
+			      ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 }
+			 TP = VFMA(LDK(KP559016994), TO, TN);
+			 TV = VFNMS(LDK(KP559016994), TO, TN);
+			 T1f = VFNMS(LDK(KP559016994), T18, T17);
+			 T19 = VFMA(LDK(KP559016994), T18, T17);
+		    }
+	       }
+	       TY = BYTW(&(W[TWVL * 4]), VFMAI(TW, TV));
+	       TX = BYTW(&(W[TWVL * 2]), VFNMSI(TW, TV));
+	       {
+		    V T1i, T1h, TU, TT;
+		    T1i = BYTW(&(W[TWVL * 4]), VFMAI(T1g, T1f));
+		    T1h = BYTW(&(W[TWVL * 2]), VFNMSI(T1g, T1f));
+		    TU = BYTW(&(W[TWVL * 6]), VFNMSI(TS, TP));
+		    TT = BYTW(&(W[0]), VFMAI(TS, TP));
+		    {
+			 V Tg, Tf, TA, Tz;
+			 Tg = BYTW(&(W[TWVL * 6]), VFNMSI(Te, Tb));
+			 Tf = BYTW(&(W[0]), VFMAI(Te, Tb));
+			 TA = BYTW(&(W[TWVL * 6]), VFNMSI(Ty, Tv));
+			 Tz = BYTW(&(W[0]), VFMAI(Ty, Tv));
+			 {
+			      V T1e, T1d, T1y, T1x;
+			      T1e = BYTW(&(W[TWVL * 6]), VFNMSI(T1c, T19));
+			      T1d = BYTW(&(W[0]), VFMAI(T1c, T19));
+			      T1y = BYTW(&(W[TWVL * 6]), VFNMSI(T1w, T1t));
+			      T1x = BYTW(&(W[0]), VFMAI(T1w, T1t));
+			      ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
+			      ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("q1bv_5"), twinstr, &GENUS, {55, 50, 45, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1bv_5 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 100 FP additions, 70 FP multiplications,
+ * (or, 85 additions, 55 multiplications, 15 fused multiply/add),
+ * 44 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
+	       V Tb, T7, Th, Ta, Tc, Td, T1t, T1p, T1z, T1s, T1u, T1v, Tv, Tr, TB;
+	       V Tu, Tw, Tx, TP, TL, TV, TO, TQ, TR, T19, T15, T1f, T18, T1a, T1b;
+	       {
+		    V T6, T9, T3, T8;
+		    Tb = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T4, T5, T1, T2;
+			 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T6 = VSUB(T4, T5);
+			 T9 = VADD(T4, T5);
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T3 = VSUB(T1, T2);
+			 T8 = VADD(T1, T2);
+		    }
+		    T7 = VBYI(VFMA(LDK(KP951056516), T3, VMUL(LDK(KP587785252), T6)));
+		    Th = VBYI(VFNMS(LDK(KP951056516), T6, VMUL(LDK(KP587785252), T3)));
+		    Ta = VMUL(LDK(KP559016994), VSUB(T8, T9));
+		    Tc = VADD(T8, T9);
+		    Td = VFNMS(LDK(KP250000000), Tc, Tb);
+	       }
+	       {
+		    V T1o, T1r, T1l, T1q;
+		    T1t = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+		    {
+			 V T1m, T1n, T1j, T1k;
+			 T1m = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+			 T1n = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T1o = VSUB(T1m, T1n);
+			 T1r = VADD(T1m, T1n);
+			 T1j = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T1k = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+			 T1l = VSUB(T1j, T1k);
+			 T1q = VADD(T1j, T1k);
+		    }
+		    T1p = VBYI(VFMA(LDK(KP951056516), T1l, VMUL(LDK(KP587785252), T1o)));
+		    T1z = VBYI(VFNMS(LDK(KP951056516), T1o, VMUL(LDK(KP587785252), T1l)));
+		    T1s = VMUL(LDK(KP559016994), VSUB(T1q, T1r));
+		    T1u = VADD(T1q, T1r);
+		    T1v = VFNMS(LDK(KP250000000), T1u, T1t);
+	       }
+	       {
+		    V Tq, Tt, Tn, Ts;
+		    Tv = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    {
+			 V To, Tp, Tl, Tm;
+			 To = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+			 Tp = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tq = VSUB(To, Tp);
+			 Tt = VADD(To, Tp);
+			 Tl = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tm = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+			 Tn = VSUB(Tl, Tm);
+			 Ts = VADD(Tl, Tm);
+		    }
+		    Tr = VBYI(VFMA(LDK(KP951056516), Tn, VMUL(LDK(KP587785252), Tq)));
+		    TB = VBYI(VFNMS(LDK(KP951056516), Tq, VMUL(LDK(KP587785252), Tn)));
+		    Tu = VMUL(LDK(KP559016994), VSUB(Ts, Tt));
+		    Tw = VADD(Ts, Tt);
+		    Tx = VFNMS(LDK(KP250000000), Tw, Tv);
+	       }
+	       {
+		    V TK, TN, TH, TM;
+		    TP = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    {
+			 V TI, TJ, TF, TG;
+			 TI = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+			 TJ = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 TK = VSUB(TI, TJ);
+			 TN = VADD(TI, TJ);
+			 TF = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 TG = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+			 TH = VSUB(TF, TG);
+			 TM = VADD(TF, TG);
+		    }
+		    TL = VBYI(VFMA(LDK(KP951056516), TH, VMUL(LDK(KP587785252), TK)));
+		    TV = VBYI(VFNMS(LDK(KP951056516), TK, VMUL(LDK(KP587785252), TH)));
+		    TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
+		    TQ = VADD(TM, TN);
+		    TR = VFNMS(LDK(KP250000000), TQ, TP);
+	       }
+	       {
+		    V T14, T17, T11, T16;
+		    T19 = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    {
+			 V T12, T13, TZ, T10;
+			 T12 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+			 T13 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T14 = VSUB(T12, T13);
+			 T17 = VADD(T12, T13);
+			 TZ = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T10 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+			 T11 = VSUB(TZ, T10);
+			 T16 = VADD(TZ, T10);
+		    }
+		    T15 = VBYI(VFMA(LDK(KP951056516), T11, VMUL(LDK(KP587785252), T14)));
+		    T1f = VBYI(VFNMS(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T11)));
+		    T18 = VMUL(LDK(KP559016994), VSUB(T16, T17));
+		    T1a = VADD(T16, T17);
+		    T1b = VFNMS(LDK(KP250000000), T1a, T19);
+	       }
+	       ST(&(x[0]), VADD(Tb, Tc), ms, &(x[0]));
+	       ST(&(x[WS(rs, 4)]), VADD(T1t, T1u), ms, &(x[0]));
+	       ST(&(x[WS(rs, 2)]), VADD(TP, TQ), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(T19, T1a), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 1)]), VADD(Tv, Tw), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tj, Tk, Ti, T1B, T1C, T1A;
+		    Ti = VSUB(Td, Ta);
+		    Tj = BYTW(&(W[TWVL * 2]), VADD(Th, Ti));
+		    Tk = BYTW(&(W[TWVL * 4]), VSUB(Ti, Th));
+		    ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
+		    T1A = VSUB(T1v, T1s);
+		    T1B = BYTW(&(W[TWVL * 2]), VADD(T1z, T1A));
+		    T1C = BYTW(&(W[TWVL * 4]), VSUB(T1A, T1z));
+		    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V T1h, T1i, T1g, TD, TE, TC;
+		    T1g = VSUB(T1b, T18);
+		    T1h = BYTW(&(W[TWVL * 2]), VADD(T1f, T1g));
+		    T1i = BYTW(&(W[TWVL * 4]), VSUB(T1g, T1f));
+		    ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TC = VSUB(Tx, Tu);
+		    TD = BYTW(&(W[TWVL * 2]), VADD(TB, TC));
+		    TE = BYTW(&(W[TWVL * 4]), VSUB(TC, TB));
+		    ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V TX, TY, TW, TT, TU, TS;
+		    TW = VSUB(TR, TO);
+		    TX = BYTW(&(W[TWVL * 2]), VADD(TV, TW));
+		    TY = BYTW(&(W[TWVL * 4]), VSUB(TW, TV));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
+		    TS = VADD(TO, TR);
+		    TT = BYTW(&(W[0]), VADD(TL, TS));
+		    TU = BYTW(&(W[TWVL * 6]), VSUB(TS, TL));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
+	       }
+	       {
+		    V Tf, Tg, Te, Tz, TA, Ty;
+		    Te = VADD(Ta, Td);
+		    Tf = BYTW(&(W[0]), VADD(T7, Te));
+		    Tg = BYTW(&(W[TWVL * 6]), VSUB(Te, T7));
+		    ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
+		    Ty = VADD(Tu, Tx);
+		    Tz = BYTW(&(W[0]), VADD(Tr, Ty));
+		    TA = BYTW(&(W[TWVL * 6]), VSUB(Ty, Tr));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1d, T1e, T1c, T1x, T1y, T1w;
+		    T1c = VADD(T18, T1b);
+		    T1d = BYTW(&(W[0]), VADD(T15, T1c));
+		    T1e = BYTW(&(W[TWVL * 6]), VSUB(T1c, T15));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T1w = VADD(T1s, T1v);
+		    T1x = BYTW(&(W[0]), VADD(T1p, T1w));
+		    T1y = BYTW(&(W[TWVL * 6]), VSUB(T1w, T1p));
+		    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("q1bv_5"), twinstr, &GENUS, {85, 55, 15, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,994 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:33 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 264 FP additions, 192 FP multiplications,
+ * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
+ * 121 stack variables, 1 constants, and 128 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
+	       V T42, T43, T1U, T1V, T2Y, T2Z, TT, TS, T45, T44;
+	       {
+		    V T3, Te, T1E, T1P, Tv, Tp, T26, T20, T2b, T2m, T3M, T2x, T2D, T3X, TA;
+		    V TL, T48, T4e, T17, T12, TW, T1i, T2I, T1z, T1t, T2T, T3f, T3q, T34, T3a;
+		    V T3H, T3B, Ts, Tw, Tf, Ta, T23, T27, T1Q, T1L, T2A, T2E, T2n, T2i, T4b;
+		    V T4f, T3Y, T3T, TZ, T13, TM, TH, T35, T2L, T3j, T1w, T1A, T1j, T1e, T36;
+		    V T2O, T3C, T3i, T3k;
+		    {
+			 V T3d, T32, T3e, T3o, T3p, T33;
+			 {
+			      V T2v, T2w, T3V, T46, T3W;
+			      {
+				   V T1, T2, Tc, Td, T1C, T1D, T1N, T1O;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+				   T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+				   T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+				   T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
+				   {
+					V T29, T1Y, T1Z, T2a, T2k, T2l, Tn, To, T3K, T3L;
+					T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+					T3 = VSUB(T1, T2);
+					Tn = VADD(T1, T2);
+					Te = VSUB(Tc, Td);
+					To = VADD(Tc, Td);
+					T1E = VSUB(T1C, T1D);
+					T1Y = VADD(T1C, T1D);
+					T1P = VSUB(T1N, T1O);
+					T1Z = VADD(T1N, T1O);
+					T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+					T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+					T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
+					Tv = VADD(Tn, To);
+					Tp = VSUB(Tn, To);
+					T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
+					T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
+					T26 = VADD(T1Y, T1Z);
+					T20 = VSUB(T1Y, T1Z);
+					T2v = VADD(T29, T2a);
+					T2b = VSUB(T29, T2a);
+					T2w = VADD(T2k, T2l);
+					T2m = VSUB(T2k, T2l);
+					T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
+					T46 = VADD(T3K, T3L);
+					T3M = VSUB(T3K, T3L);
+					T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
+				   }
+			      }
+			      {
+				   V T15, TU, T16, T1g, TV, T1h;
+				   {
+					V Ty, Tz, TJ, TK, T47;
+					Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+					Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+					TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+					T2x = VSUB(T2v, T2w);
+					T2D = VADD(T2v, T2w);
+					TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
+					T47 = VADD(T3V, T3W);
+					T3X = VSUB(T3V, T3W);
+					T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+					TA = VSUB(Ty, Tz);
+					TU = VADD(Ty, Tz);
+					T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+					T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+					TL = VSUB(TJ, TK);
+					TV = VADD(TJ, TK);
+					T48 = VSUB(T46, T47);
+					T4e = VADD(T46, T47);
+					T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
+				   }
+				   {
+					V T2G, T1r, T2H, T2R, T1s, T2S;
+					T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
+					T17 = VSUB(T15, T16);
+					T1r = VADD(T15, T16);
+					T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
+					T12 = VADD(TU, TV);
+					TW = VSUB(TU, TV);
+					T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
+					T1i = VSUB(T1g, T1h);
+					T1s = VADD(T1g, T1h);
+					T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
+					T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
+					T2I = VSUB(T2G, T2H);
+					T32 = VADD(T2G, T2H);
+					T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
+					T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
+					T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
+					T1z = VADD(T1r, T1s);
+					T1t = VSUB(T1r, T1s);
+					T33 = VADD(T2R, T2S);
+					T2T = VSUB(T2R, T2S);
+				   }
+			      }
+			 }
+			 {
+			      V T2y, T2e, T3Q, T2z, T2h, T49, T3P, T3R;
+			      {
+				   V T6, Tq, T1I, Tr, T9, T21, T1H, T1J;
+				   {
+					V T4, T3z, T3A, T5, T7, T8, T1F, T1G;
+					T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3f = VSUB(T3d, T3e);
+					T3z = VADD(T3d, T3e);
+					T3q = VSUB(T3o, T3p);
+					T3A = VADD(T3o, T3p);
+					T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T34 = VSUB(T32, T33);
+					T3a = VADD(T32, T33);
+					T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					T3H = VADD(T3z, T3A);
+					T3B = VSUB(T3z, T3A);
+					T6 = VSUB(T4, T5);
+					Tq = VADD(T4, T5);
+					T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					Tr = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					T21 = VADD(T1F, T1G);
+					T1H = VSUB(T1F, T1G);
+					T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   }
+				   {
+					V T2f, T22, T1K, T2g, T2c, T2d, T3N, T3O;
+					T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					Ts = VSUB(Tq, Tr);
+					Tw = VADD(Tq, Tr);
+					Tf = VSUB(T6, T9);
+					Ta = VADD(T6, T9);
+					T22 = VADD(T1I, T1J);
+					T1K = VSUB(T1I, T1J);
+					T2y = VADD(T2c, T2d);
+					T2e = VSUB(T2c, T2d);
+					T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T23 = VSUB(T21, T22);
+					T27 = VADD(T21, T22);
+					T1Q = VSUB(T1H, T1K);
+					T1L = VADD(T1H, T1K);
+					T2z = VADD(T2f, T2g);
+					T2h = VSUB(T2f, T2g);
+					T49 = VADD(T3N, T3O);
+					T3P = VSUB(T3N, T3O);
+					T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+				   }
+			      }
+			      {
+				   V TX, TD, T1b, TY, TG, T1u, T1a, T1c;
+				   {
+					V TE, T4a, T3S, TF, TB, TC, T18, T19;
+					TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					T2A = VSUB(T2y, T2z);
+					T2E = VADD(T2y, T2z);
+					T2n = VSUB(T2e, T2h);
+					T2i = VADD(T2e, T2h);
+					T4a = VADD(T3Q, T3R);
+					T3S = VSUB(T3Q, T3R);
+					TX = VADD(TB, TC);
+					TD = VSUB(TB, TC);
+					TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T4b = VSUB(T49, T4a);
+					T4f = VADD(T49, T4a);
+					T3Y = VSUB(T3P, T3S);
+					T3T = VADD(T3P, T3S);
+					TY = VADD(TE, TF);
+					TG = VSUB(TE, TF);
+					T1u = VADD(T18, T19);
+					T1a = VSUB(T18, T19);
+					T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   }
+				   {
+					V T2M, T1v, T1d, T2N, T2J, T2K, T3g, T3h;
+					T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					TZ = VSUB(TX, TY);
+					T13 = VADD(TX, TY);
+					TM = VSUB(TD, TG);
+					TH = VADD(TD, TG);
+					T1v = VADD(T1b, T1c);
+					T1d = VSUB(T1b, T1c);
+					T35 = VADD(T2J, T2K);
+					T2L = VSUB(T2J, T2K);
+					T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T1w = VSUB(T1u, T1v);
+					T1A = VADD(T1u, T1v);
+					T1j = VSUB(T1a, T1d);
+					T1e = VADD(T1a, T1d);
+					T36 = VADD(T2M, T2N);
+					T2O = VSUB(T2M, T2N);
+					T3C = VADD(T3g, T3h);
+					T3i = VSUB(T3g, T3h);
+					T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T3b, T2U, T2P, T3I, T3r, T3m, T11, T25, T39, T4d;
+			 {
+			      V T37, T3E, T2B, T24;
+			      {
+				   V T3D, T3l, Tt, T4c;
+				   ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
+				   ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
+				   T37 = VSUB(T35, T36);
+				   T3b = VADD(T35, T36);
+				   T2U = VSUB(T2L, T2O);
+				   T2P = VADD(T2L, T2O);
+				   T3D = VADD(T3j, T3k);
+				   T3l = VSUB(T3j, T3k);
+				   ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
+				   Tt = BYTW(&(W[TWVL * 10]), VFNMSI(Ts, Tp));
+				   T4c = BYTW(&(W[TWVL * 10]), VFNMSI(T4b, T48));
+				   T3E = VSUB(T3C, T3D);
+				   T3I = VADD(T3C, T3D);
+				   T3r = VSUB(T3i, T3l);
+				   T3m = VADD(T3i, T3l);
+				   T2B = BYTW(&(W[TWVL * 10]), VFNMSI(T2A, T2x));
+				   T24 = BYTW(&(W[TWVL * 10]), VFNMSI(T23, T20));
+				   ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			      }
+			      {
+				   V T38, T1y, Tu, T10, T1x, T3F, T2C, T3G;
+				   T10 = BYTW(&(W[TWVL * 10]), VFNMSI(TZ, TW));
+				   ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
+				   T1x = BYTW(&(W[TWVL * 10]), VFNMSI(T1w, T1t));
+				   T3F = BYTW(&(W[TWVL * 10]), VFNMSI(T3E, T3B));
+				   ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   T38 = BYTW(&(W[TWVL * 10]), VFNMSI(T37, T34));
+				   T1y = BYTW(&(W[TWVL * 2]), VFMAI(T1w, T1t));
+				   ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   Tu = BYTW(&(W[TWVL * 2]), VFMAI(Ts, Tp));
+				   ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
+				   T2C = BYTW(&(W[TWVL * 2]), VFMAI(T2A, T2x));
+				   T3G = BYTW(&(W[TWVL * 2]), VFMAI(T3E, T3B));
+				   ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
+				   T11 = BYTW(&(W[TWVL * 2]), VFMAI(TZ, TW));
+				   ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
+				   T25 = BYTW(&(W[TWVL * 2]), VFMAI(T23, T20));
+				   T39 = BYTW(&(W[TWVL * 2]), VFMAI(T37, T34));
+				   ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
+				   ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
+				   T4d = BYTW(&(W[TWVL * 2]), VFMAI(T4b, T48));
+			      }
+			 }
+			 {
+			      V Tj, Tk, T2r, T2j, T2o, T2s, Ti, Th, T1M, T1R, T41, T40;
+			      {
+				   V T3c, T4g, T3J, T2F, Tx, T1B;
+				   Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
+				   ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
+				   ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
+				   T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
+				   ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
+				   T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
+				   ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
+				   T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
+				   {
+					V T14, Tb, Tg, T28, T3U, T3Z;
+					T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
+					ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
+					Tj = VFMA(LDK(KP707106781), Ta, T3);
+					Tb = VFNMS(LDK(KP707106781), Ta, T3);
+					ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
+					Tk = VFMA(LDK(KP707106781), Tf, Te);
+					Tg = VFNMS(LDK(KP707106781), Tf, Te);
+					ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
+					ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T3U = VFNMS(LDK(KP707106781), T3T, T3M);
+					T42 = VFMA(LDK(KP707106781), T3T, T3M);
+					T43 = VFMA(LDK(KP707106781), T3Y, T3X);
+					T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
+					ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T2r = VFMA(LDK(KP707106781), T2i, T2b);
+					T2j = VFNMS(LDK(KP707106781), T2i, T2b);
+					T2o = VFNMS(LDK(KP707106781), T2n, T2m);
+					T2s = VFMA(LDK(KP707106781), T2n, T2m);
+					Ti = BYTW(&(W[TWVL * 8]), VFMAI(Tg, Tb));
+					Th = BYTW(&(W[TWVL * 4]), VFNMSI(Tg, Tb));
+					T1U = VFMA(LDK(KP707106781), T1L, T1E);
+					T1M = VFNMS(LDK(KP707106781), T1L, T1E);
+					T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
+					T1V = VFMA(LDK(KP707106781), T1Q, T1P);
+					T41 = BYTW(&(W[TWVL * 8]), VFMAI(T3Z, T3U));
+					T40 = BYTW(&(W[TWVL * 4]), VFNMSI(T3Z, T3U));
+				   }
+			      }
+			      {
+				   V TQ, TR, T1n, T1o, T3v, T3w;
+				   {
+					V TI, TN, T1f, T1k, T3n, T3s;
+					{
+					     V T1T, T1S, T2q, T2p;
+					     TQ = VFMA(LDK(KP707106781), TH, TA);
+					     TI = VFNMS(LDK(KP707106781), TH, TA);
+					     T2q = BYTW(&(W[TWVL * 8]), VFMAI(T2o, T2j));
+					     T2p = BYTW(&(W[TWVL * 4]), VFNMSI(T2o, T2j));
+					     ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
+					     ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
+					     T1T = BYTW(&(W[TWVL * 8]), VFMAI(T1R, T1M));
+					     T1S = BYTW(&(W[TWVL * 4]), VFNMSI(T1R, T1M));
+					     ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
+					     ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
+					     TN = VFNMS(LDK(KP707106781), TM, TL);
+					     TR = VFMA(LDK(KP707106781), TM, TL);
+					     T1n = VFMA(LDK(KP707106781), T1e, T17);
+					     T1f = VFNMS(LDK(KP707106781), T1e, T17);
+					     ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					     T1k = VFNMS(LDK(KP707106781), T1j, T1i);
+					     T1o = VFMA(LDK(KP707106781), T1j, T1i);
+					     T3v = VFMA(LDK(KP707106781), T3m, T3f);
+					     T3n = VFNMS(LDK(KP707106781), T3m, T3f);
+					     T3s = VFNMS(LDK(KP707106781), T3r, T3q);
+					     T3w = VFMA(LDK(KP707106781), T3r, T3q);
+					}
+					{
+					     V T2Q, TP, TO, T2V, T2X, T2W;
+					     T2Y = VFMA(LDK(KP707106781), T2P, T2I);
+					     T2Q = VFNMS(LDK(KP707106781), T2P, T2I);
+					     TP = BYTW(&(W[TWVL * 8]), VFMAI(TN, TI));
+					     TO = BYTW(&(W[TWVL * 4]), VFNMSI(TN, TI));
+					     T2V = VFNMS(LDK(KP707106781), T2U, T2T);
+					     T2Z = VFMA(LDK(KP707106781), T2U, T2T);
+					     {
+						  V T1m, T1l, T3u, T3t;
+						  T1m = BYTW(&(W[TWVL * 8]), VFMAI(T1k, T1f));
+						  T1l = BYTW(&(W[TWVL * 4]), VFNMSI(T1k, T1f));
+						  T3u = BYTW(&(W[TWVL * 8]), VFMAI(T3s, T3n));
+						  T3t = BYTW(&(W[TWVL * 4]), VFNMSI(T3s, T3n));
+						  ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+						  ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+						  T2X = BYTW(&(W[TWVL * 8]), VFMAI(T2V, T2Q));
+						  T2W = BYTW(&(W[TWVL * 4]), VFNMSI(T2V, T2Q));
+						  ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
+						  ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
+						  ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
+						  ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
+					     }
+					     ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					}
+				   }
+				   {
+					V T3y, T3x, T1q, T1p;
+					T1q = BYTW(&(W[TWVL * 12]), VFNMSI(T1o, T1n));
+					T1p = BYTW(&(W[0]), VFMAI(T1o, T1n));
+					{
+					     V Tm, Tl, T2u, T2t;
+					     Tm = BYTW(&(W[TWVL * 12]), VFNMSI(Tk, Tj));
+					     Tl = BYTW(&(W[0]), VFMAI(Tk, Tj));
+					     T2u = BYTW(&(W[TWVL * 12]), VFNMSI(T2s, T2r));
+					     T2t = BYTW(&(W[0]), VFMAI(T2s, T2r));
+					     ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
+					     T3y = BYTW(&(W[TWVL * 12]), VFNMSI(T3w, T3v));
+					     T3x = BYTW(&(W[0]), VFMAI(T3w, T3v));
+					     ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
+					     ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
+					}
+					ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
+					ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
+					TT = BYTW(&(W[TWVL * 12]), VFNMSI(TR, TQ));
+					TS = BYTW(&(W[0]), VFMAI(TR, TQ));
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1X, T1W, T31, T30;
+		    T1X = BYTW(&(W[TWVL * 12]), VFNMSI(T1V, T1U));
+		    T1W = BYTW(&(W[0]), VFMAI(T1V, T1U));
+		    T31 = BYTW(&(W[TWVL * 12]), VFNMSI(T2Z, T2Y));
+		    T30 = BYTW(&(W[0]), VFMAI(T2Z, T2Y));
+		    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    T45 = BYTW(&(W[TWVL * 12]), VFNMSI(T43, T42));
+		    T44 = BYTW(&(W[0]), VFMAI(T43, T42));
+		    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+	       ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include q1b.h -sign 1 */
+
+/*
+ * This function contains 264 FP additions, 128 FP multiplications,
+ * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
+ * 77 stack variables, 1 constants, and 128 memory accesses
+ */
+#include "q1b.h"
+
+static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
+	       V Ta, Tv, Te, Tp, T1L, T26, T1P, T20, T2i, T2D, T2m, T2x, T3T, T4e, T3X;
+	       V T48, TH, T12, TL, TW, T1e, T1z, T1i, T1t, T2P, T3a, T2T, T34, T3m, T3H;
+	       V T3q, T3B, T7, Tw, Tf, Ts, T1I, T27, T1Q, T23, T2f, T2E, T2n, T2A, T3Q;
+	       V T4f, T3Y, T4b, TE, T13, TM, TZ, T1b, T1A, T1j, T1w, T2M, T3b, T2U, T37;
+	       V T3j, T3I, T3r, T3E, T28, T14;
+	       {
+		    V T8, T9, To, Tc, Td, Tn;
+		    T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T9 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    To = VADD(T8, T9);
+		    Tc = LD(&(x[0]), ms, &(x[0]));
+		    Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = VADD(Tc, Td);
+		    Ta = VSUB(T8, T9);
+		    Tv = VADD(Tn, To);
+		    Te = VSUB(Tc, Td);
+		    Tp = VSUB(Tn, To);
+	       }
+	       {
+		    V T1J, T1K, T1Z, T1N, T1O, T1Y;
+		    T1J = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+		    T1K = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
+		    T1Z = VADD(T1J, T1K);
+		    T1N = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    T1O = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+		    T1Y = VADD(T1N, T1O);
+		    T1L = VSUB(T1J, T1K);
+		    T26 = VADD(T1Y, T1Z);
+		    T1P = VSUB(T1N, T1O);
+		    T20 = VSUB(T1Y, T1Z);
+	       }
+	       {
+		    V T2g, T2h, T2w, T2k, T2l, T2v;
+		    T2g = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+		    T2h = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
+		    T2w = VADD(T2g, T2h);
+		    T2k = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+		    T2l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+		    T2v = VADD(T2k, T2l);
+		    T2i = VSUB(T2g, T2h);
+		    T2D = VADD(T2v, T2w);
+		    T2m = VSUB(T2k, T2l);
+		    T2x = VSUB(T2v, T2w);
+	       }
+	       {
+		    V T3R, T3S, T47, T3V, T3W, T46;
+		    T3R = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
+		    T3S = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
+		    T47 = VADD(T3R, T3S);
+		    T3V = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
+		    T3W = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
+		    T46 = VADD(T3V, T3W);
+		    T3T = VSUB(T3R, T3S);
+		    T4e = VADD(T46, T47);
+		    T3X = VSUB(T3V, T3W);
+		    T48 = VSUB(T46, T47);
+	       }
+	       {
+		    V TF, TG, TV, TJ, TK, TU;
+		    TF = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+		    TG = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
+		    TV = VADD(TF, TG);
+		    TJ = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    TK = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+		    TU = VADD(TJ, TK);
+		    TH = VSUB(TF, TG);
+		    T12 = VADD(TU, TV);
+		    TL = VSUB(TJ, TK);
+		    TW = VSUB(TU, TV);
+	       }
+	       {
+		    V T1c, T1d, T1s, T1g, T1h, T1r;
+		    T1c = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+		    T1d = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
+		    T1s = VADD(T1c, T1d);
+		    T1g = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    T1h = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+		    T1r = VADD(T1g, T1h);
+		    T1e = VSUB(T1c, T1d);
+		    T1z = VADD(T1r, T1s);
+		    T1i = VSUB(T1g, T1h);
+		    T1t = VSUB(T1r, T1s);
+	       }
+	       {
+		    V T2N, T2O, T33, T2R, T2S, T32;
+		    T2N = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
+		    T2O = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
+		    T33 = VADD(T2N, T2O);
+		    T2R = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
+		    T2S = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
+		    T32 = VADD(T2R, T2S);
+		    T2P = VSUB(T2N, T2O);
+		    T3a = VADD(T32, T33);
+		    T2T = VSUB(T2R, T2S);
+		    T34 = VSUB(T32, T33);
+	       }
+	       {
+		    V T3k, T3l, T3A, T3o, T3p, T3z;
+		    T3k = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
+		    T3l = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
+		    T3A = VADD(T3k, T3l);
+		    T3o = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
+		    T3p = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
+		    T3z = VADD(T3o, T3p);
+		    T3m = VSUB(T3k, T3l);
+		    T3H = VADD(T3z, T3A);
+		    T3q = VSUB(T3o, T3p);
+		    T3B = VSUB(T3z, T3A);
+	       }
+	       {
+		    V T3, Tq, T6, Tr;
+		    {
+			 V T1, T2, T4, T5;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T3 = VSUB(T1, T2);
+			 Tq = VADD(T1, T2);
+			 T4 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tr = VADD(T4, T5);
+		    }
+		    T7 = VMUL(LDK(KP707106781), VSUB(T3, T6));
+		    Tw = VADD(Tq, Tr);
+		    Tf = VMUL(LDK(KP707106781), VADD(T3, T6));
+		    Ts = VBYI(VSUB(Tq, Tr));
+	       }
+	       {
+		    V T1E, T21, T1H, T22;
+		    {
+			 V T1C, T1D, T1F, T1G;
+			 T1C = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1D = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1E = VSUB(T1C, T1D);
+			 T21 = VADD(T1C, T1D);
+			 T1F = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1G = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1H = VSUB(T1F, T1G);
+			 T22 = VADD(T1F, T1G);
+		    }
+		    T1I = VMUL(LDK(KP707106781), VSUB(T1E, T1H));
+		    T27 = VADD(T21, T22);
+		    T1Q = VMUL(LDK(KP707106781), VADD(T1E, T1H));
+		    T23 = VBYI(VSUB(T21, T22));
+	       }
+	       {
+		    V T2b, T2y, T2e, T2z;
+		    {
+			 V T29, T2a, T2c, T2d;
+			 T29 = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2a = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2b = VSUB(T29, T2a);
+			 T2y = VADD(T29, T2a);
+			 T2c = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2d = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2e = VSUB(T2c, T2d);
+			 T2z = VADD(T2c, T2d);
+		    }
+		    T2f = VMUL(LDK(KP707106781), VSUB(T2b, T2e));
+		    T2E = VADD(T2y, T2z);
+		    T2n = VMUL(LDK(KP707106781), VADD(T2b, T2e));
+		    T2A = VBYI(VSUB(T2y, T2z));
+	       }
+	       {
+		    V T3M, T49, T3P, T4a;
+		    {
+			 V T3K, T3L, T3N, T3O;
+			 T3K = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3L = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3M = VSUB(T3K, T3L);
+			 T49 = VADD(T3K, T3L);
+			 T3N = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3O = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3P = VSUB(T3N, T3O);
+			 T4a = VADD(T3N, T3O);
+		    }
+		    T3Q = VMUL(LDK(KP707106781), VSUB(T3M, T3P));
+		    T4f = VADD(T49, T4a);
+		    T3Y = VMUL(LDK(KP707106781), VADD(T3M, T3P));
+		    T4b = VBYI(VSUB(T49, T4a));
+	       }
+	       {
+		    V TA, TX, TD, TY;
+		    {
+			 V Ty, Tz, TB, TC;
+			 Ty = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tz = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TA = VSUB(Ty, Tz);
+			 TX = VADD(Ty, Tz);
+			 TB = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TC = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TD = VSUB(TB, TC);
+			 TY = VADD(TB, TC);
+		    }
+		    TE = VMUL(LDK(KP707106781), VSUB(TA, TD));
+		    T13 = VADD(TX, TY);
+		    TM = VMUL(LDK(KP707106781), VADD(TA, TD));
+		    TZ = VBYI(VSUB(TX, TY));
+	       }
+	       {
+		    V T17, T1u, T1a, T1v;
+		    {
+			 V T15, T16, T18, T19;
+			 T15 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T16 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T17 = VSUB(T15, T16);
+			 T1u = VADD(T15, T16);
+			 T18 = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T19 = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T1a = VSUB(T18, T19);
+			 T1v = VADD(T18, T19);
+		    }
+		    T1b = VMUL(LDK(KP707106781), VSUB(T17, T1a));
+		    T1A = VADD(T1u, T1v);
+		    T1j = VMUL(LDK(KP707106781), VADD(T17, T1a));
+		    T1w = VBYI(VSUB(T1u, T1v));
+	       }
+	       {
+		    V T2I, T35, T2L, T36;
+		    {
+			 V T2G, T2H, T2J, T2K;
+			 T2G = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2H = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2I = VSUB(T2G, T2H);
+			 T35 = VADD(T2G, T2H);
+			 T2J = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2K = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2L = VSUB(T2J, T2K);
+			 T36 = VADD(T2J, T2K);
+		    }
+		    T2M = VMUL(LDK(KP707106781), VSUB(T2I, T2L));
+		    T3b = VADD(T35, T36);
+		    T2U = VMUL(LDK(KP707106781), VADD(T2I, T2L));
+		    T37 = VBYI(VSUB(T35, T36));
+	       }
+	       {
+		    V T3f, T3C, T3i, T3D;
+		    {
+			 V T3d, T3e, T3g, T3h;
+			 T3d = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3e = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3f = VSUB(T3d, T3e);
+			 T3C = VADD(T3d, T3e);
+			 T3g = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3h = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3i = VSUB(T3g, T3h);
+			 T3D = VADD(T3g, T3h);
+		    }
+		    T3j = VMUL(LDK(KP707106781), VSUB(T3f, T3i));
+		    T3I = VADD(T3C, T3D);
+		    T3r = VMUL(LDK(KP707106781), VADD(T3f, T3i));
+		    T3E = VBYI(VSUB(T3C, T3D));
+	       }
+	       ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
+	       ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
+	       ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
+	       ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
+	       {
+		    V Tt, T4c, T2B, T24;
+		    ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
+		    Tt = BYTW(&(W[TWVL * 10]), VSUB(Tp, Ts));
+		    ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
+		    T4c = BYTW(&(W[TWVL * 10]), VSUB(T48, T4b));
+		    ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T2B = BYTW(&(W[TWVL * 10]), VSUB(T2x, T2A));
+		    ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
+		    T24 = BYTW(&(W[TWVL * 10]), VSUB(T20, T23));
+		    ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+	       }
+	       {
+		    V T10, T1x, T3F, T38, T1y, Tu;
+		    T10 = BYTW(&(W[TWVL * 10]), VSUB(TW, TZ));
+		    ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T1x = BYTW(&(W[TWVL * 10]), VSUB(T1t, T1w));
+		    ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
+		    T3F = BYTW(&(W[TWVL * 10]), VSUB(T3B, T3E));
+		    ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
+		    T38 = BYTW(&(W[TWVL * 10]), VSUB(T34, T37));
+		    ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T1y = BYTW(&(W[TWVL * 2]), VADD(T1t, T1w));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
+		    Tu = BYTW(&(W[TWVL * 2]), VADD(Tp, Ts));
+		    ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
+	       }
+	       {
+		    V T2C, T3G, T11, T25, T39, T4d;
+		    T2C = BYTW(&(W[TWVL * 2]), VADD(T2x, T2A));
+		    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
+		    T3G = BYTW(&(W[TWVL * 2]), VADD(T3B, T3E));
+		    ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
+		    T11 = BYTW(&(W[TWVL * 2]), VADD(TW, TZ));
+		    ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    T25 = BYTW(&(W[TWVL * 2]), VADD(T20, T23));
+		    ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    T39 = BYTW(&(W[TWVL * 2]), VADD(T34, T37));
+		    ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    T4d = BYTW(&(W[TWVL * 2]), VADD(T48, T4b));
+		    ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       }
+	       {
+		    V Tx, T1B, T3c, T4g, T3J, T2F;
+		    Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
+		    ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
+		    T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
+		    ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
+		    T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
+		    ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
+		    ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
+		    ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
+		    T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
+		    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
+	       }
+	       T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
+	       ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+	       T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
+	       ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+	       {
+		    V Th, Ti, Tb, Tg;
+		    Tb = VBYI(VSUB(T7, Ta));
+		    Tg = VSUB(Te, Tf);
+		    Th = BYTW(&(W[TWVL * 4]), VADD(Tb, Tg));
+		    Ti = BYTW(&(W[TWVL * 8]), VSUB(Tg, Tb));
+		    ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
+		    ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
+	       }
+	       {
+		    V T40, T41, T3U, T3Z;
+		    T3U = VBYI(VSUB(T3Q, T3T));
+		    T3Z = VSUB(T3X, T3Y);
+		    T40 = BYTW(&(W[TWVL * 4]), VADD(T3U, T3Z));
+		    T41 = BYTW(&(W[TWVL * 8]), VSUB(T3Z, T3U));
+		    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+	       }
+	       {
+		    V T2p, T2q, T2j, T2o;
+		    T2j = VBYI(VSUB(T2f, T2i));
+		    T2o = VSUB(T2m, T2n);
+		    T2p = BYTW(&(W[TWVL * 4]), VADD(T2j, T2o));
+		    T2q = BYTW(&(W[TWVL * 8]), VSUB(T2o, T2j));
+		    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
+	       }
+	       {
+		    V T1S, T1T, T1M, T1R;
+		    T1M = VBYI(VSUB(T1I, T1L));
+		    T1R = VSUB(T1P, T1Q);
+		    T1S = BYTW(&(W[TWVL * 4]), VADD(T1M, T1R));
+		    T1T = BYTW(&(W[TWVL * 8]), VSUB(T1R, T1M));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+	       }
+	       {
+		    V TO, TP, TI, TN;
+		    TI = VBYI(VSUB(TE, TH));
+		    TN = VSUB(TL, TM);
+		    TO = BYTW(&(W[TWVL * 4]), VADD(TI, TN));
+		    TP = BYTW(&(W[TWVL * 8]), VSUB(TN, TI));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1l, T1m, T1f, T1k;
+		    T1f = VBYI(VSUB(T1b, T1e));
+		    T1k = VSUB(T1i, T1j);
+		    T1l = BYTW(&(W[TWVL * 4]), VADD(T1f, T1k));
+		    T1m = BYTW(&(W[TWVL * 8]), VSUB(T1k, T1f));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
+	       }
+	       {
+		    V T3t, T3u, T3n, T3s;
+		    T3n = VBYI(VSUB(T3j, T3m));
+		    T3s = VSUB(T3q, T3r);
+		    T3t = BYTW(&(W[TWVL * 4]), VADD(T3n, T3s));
+		    T3u = BYTW(&(W[TWVL * 8]), VSUB(T3s, T3n));
+		    ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
+	       }
+	       {
+		    V T2W, T2X, T2Q, T2V;
+		    T2Q = VBYI(VSUB(T2M, T2P));
+		    T2V = VSUB(T2T, T2U);
+		    T2W = BYTW(&(W[TWVL * 4]), VADD(T2Q, T2V));
+		    T2X = BYTW(&(W[TWVL * 8]), VSUB(T2V, T2Q));
+		    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1p, T1q, T1n, T1o;
+		    T1n = VBYI(VADD(T1e, T1b));
+		    T1o = VADD(T1i, T1j);
+		    T1p = BYTW(&(W[0]), VADD(T1n, T1o));
+		    T1q = BYTW(&(W[TWVL * 12]), VSUB(T1o, T1n));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
+	       }
+	       {
+		    V Tl, Tm, Tj, Tk;
+		    Tj = VBYI(VADD(Ta, T7));
+		    Tk = VADD(Te, Tf);
+		    Tl = BYTW(&(W[0]), VADD(Tj, Tk));
+		    Tm = BYTW(&(W[TWVL * 12]), VSUB(Tk, Tj));
+		    ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
+	       }
+	       {
+		    V T2t, T2u, T2r, T2s;
+		    T2r = VBYI(VADD(T2i, T2f));
+		    T2s = VADD(T2m, T2n);
+		    T2t = BYTW(&(W[0]), VADD(T2r, T2s));
+		    T2u = BYTW(&(W[TWVL * 12]), VSUB(T2s, T2r));
+		    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
+	       }
+	       {
+		    V T3x, T3y, T3v, T3w;
+		    T3v = VBYI(VADD(T3m, T3j));
+		    T3w = VADD(T3q, T3r);
+		    T3x = BYTW(&(W[0]), VADD(T3v, T3w));
+		    T3y = BYTW(&(W[TWVL * 12]), VSUB(T3w, T3v));
+		    ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
+	       }
+	       {
+		    V TS, TT, TQ, TR;
+		    TQ = VBYI(VADD(TH, TE));
+		    TR = VADD(TL, TM);
+		    TS = BYTW(&(W[0]), VADD(TQ, TR));
+		    TT = BYTW(&(W[TWVL * 12]), VSUB(TR, TQ));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1W, T1X, T1U, T1V;
+		    T1U = VBYI(VADD(T1L, T1I));
+		    T1V = VADD(T1P, T1Q);
+		    T1W = BYTW(&(W[0]), VADD(T1U, T1V));
+		    T1X = BYTW(&(W[TWVL * 12]), VSUB(T1V, T1U));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+	       }
+	       {
+		    V T30, T31, T2Y, T2Z;
+		    T2Y = VBYI(VADD(T2P, T2M));
+		    T2Z = VADD(T2T, T2U);
+		    T30 = BYTW(&(W[0]), VADD(T2Y, T2Z));
+		    T31 = BYTW(&(W[TWVL * 12]), VSUB(T2Z, T2Y));
+		    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+	       }
+	       {
+		    V T44, T45, T42, T43;
+		    T42 = VBYI(VADD(T3T, T3Q));
+		    T43 = VADD(T3X, T3Y);
+		    T44 = BYTW(&(W[0]), VADD(T42, T43));
+		    T45 = BYTW(&(W[TWVL * 12]), VSUB(T43, T42));
+		    ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1bv_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1bv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:30 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -dif -name q1fv_2 -include q1f.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 8 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_2(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(4, vs)) {
+	       V T1, T2, T4, T5, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+	       T5 = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T2), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[0]), VSUB(T1, T2));
+	       ST(&(x[WS(rs, 1)]), VADD(T4, T5), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTWJ(&(W[0]), VSUB(T4, T5));
+	       ST(&(x[WS(vs, 1)]), T3, ms, &(x[WS(vs, 1)]));
+	       ST(&(x[WS(vs, 1) + WS(rs, 1)]), T6, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("q1fv_2"), twinstr, &GENUS, {6, 4, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -dif -name q1fv_2 -include q1f.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 8 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_2(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(4, vs)) {
+	       V T1, T2, T3, T4, T5, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), VSUB(T1, T2));
+	       T4 = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+	       T5 = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       T6 = BYTWJ(&(W[0]), VSUB(T4, T5));
+	       ST(&(x[WS(vs, 1)]), T3, ms, &(x[WS(vs, 1)]));
+	       ST(&(x[WS(vs, 1) + WS(rs, 1)]), T6, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T2), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(T4, T5), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("q1fv_2"), twinstr, &GENUS, {6, 4, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_2) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:31 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include q1f.h */
+
+/*
+ * This function contains 44 FP additions, 32 FP multiplications,
+ * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
+ * 38 stack variables, 0 constants, and 32 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
+	       V Tb, Tm, Tx, TI;
+	       {
+		    V Tc, T9, T3, TG, TA, TH, TD, Ta, T6, Td, Tn, To, Tq, Tr, Tf;
+		    V Tg;
+		    {
+			 V T1, T2, Ty, Tz, TB, TC, T4, T5;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+			 Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+			 TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+			 T9 = VADD(T1, T2);
+			 T3 = VSUB(T1, T2);
+			 TG = VADD(Ty, Tz);
+			 TA = VSUB(Ty, Tz);
+			 TH = VADD(TB, TC);
+			 TD = VSUB(TB, TC);
+			 Ta = VADD(T4, T5);
+			 T6 = VSUB(T4, T5);
+			 Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+			 Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+			 To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+			 Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    }
+		    {
+			 V Tk, Te, Tv, Tp, Tw, Ts, Tl, Th, T7, TE, Tu, TF;
+			 ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
+			 Tk = VADD(Tc, Td);
+			 Te = VSUB(Tc, Td);
+			 Tv = VADD(Tn, To);
+			 Tp = VSUB(Tn, To);
+			 Tw = VADD(Tq, Tr);
+			 Ts = VSUB(Tq, Tr);
+			 Tl = VADD(Tf, Tg);
+			 Th = VSUB(Tf, Tg);
+			 ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTWJ(&(W[0]), VFNMSI(T6, T3));
+			 TE = BYTWJ(&(W[0]), VFNMSI(TD, TA));
+			 {
+			      V Tt, Ti, Tj, T8;
+			      T8 = BYTWJ(&(W[TWVL * 4]), VFMAI(T6, T3));
+			      ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
+			      Tt = BYTWJ(&(W[0]), VFNMSI(Ts, Tp));
+			      ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
+			      Ti = BYTWJ(&(W[0]), VFNMSI(Th, Te));
+			      Tj = BYTWJ(&(W[TWVL * 4]), VFMAI(Th, Te));
+			      ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
+			      Tu = BYTWJ(&(W[TWVL * 4]), VFMAI(Ts, Tp));
+			      ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
+			      TF = BYTWJ(&(W[TWVL * 4]), VFMAI(TD, TA));
+			      ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 }
+			 Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
+			 Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
+			 Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
+			 ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
+			 TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
+			 ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    }
+	       }
+	       ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+	       ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {36, 24, 8, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include q1f.h */
+
+/*
+ * This function contains 44 FP additions, 24 FP multiplications,
+ * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
+ * 22 stack variables, 0 constants, and 32 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
+	       V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
+	       V Tl;
+	       {
+		    V T1, T2, Ty, Tz;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T3 = VSUB(T1, T2);
+		    T9 = VADD(T1, T2);
+		    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+		    TA = VSUB(Ty, Tz);
+		    TG = VADD(Ty, Tz);
+	       }
+	       {
+		    V TB, TC, T4, T5;
+		    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TD = VBYI(VSUB(TB, TC));
+		    TH = VADD(TB, TC);
+		    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T6 = VBYI(VSUB(T4, T5));
+		    Ta = VADD(T4, T5);
+	       }
+	       {
+		    V Tc, Td, Tn, To;
+		    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+		    Te = VSUB(Tc, Td);
+		    Tk = VADD(Tc, Td);
+		    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+		    Tp = VSUB(Tn, To);
+		    Tv = VADD(Tn, To);
+	       }
+	       {
+		    V Tq, Tr, Tf, Tg;
+		    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Ts = VBYI(VSUB(Tq, Tr));
+		    Tw = VADD(Tq, Tr);
+		    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Th = VBYI(VSUB(Tf, Tg));
+		    Tl = VADD(Tf, Tg);
+	       }
+	       ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T7, Ti, Tt, TE;
+		    T7 = BYTWJ(&(W[0]), VSUB(T3, T6));
+		    ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
+		    Ti = BYTWJ(&(W[0]), VSUB(Te, Th));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    Tt = BYTWJ(&(W[0]), VSUB(Tp, Ts));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
+		    TE = BYTWJ(&(W[0]), VSUB(TA, TD));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V T8, Tj, Tu, TF;
+		    T8 = BYTWJ(&(W[TWVL * 4]), VADD(T3, T6));
+		    ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
+		    Tj = BYTWJ(&(W[TWVL * 4]), VADD(Te, Th));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    Tu = BYTWJ(&(W[TWVL * 4]), VADD(Tp, Ts));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
+		    TF = BYTWJ(&(W[TWVL * 4]), VADD(TA, TD));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V Tb, Tm, Tx, TI;
+		    Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
+		    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
+		    Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
+		    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+		    TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
+		    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {44, 24, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_4) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:31 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1fv_5 -include q1f.h */
+
+/*
+ * This function contains 100 FP additions, 95 FP multiplications,
+ * (or, 55 additions, 50 multiplications, 45 fused multiply/add),
+ * 69 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
+	       V Te, T1w, Ty, TS, TW, Tb, T1t, Tv, T1g, T1c, TP, TV, T1f, T19, TY;
+	       V TX;
+	       {
+		    V T1, T1j, Tl, Ti, Ta, T8, T1A, T1q, T1s, T9, TF, T1r, TZ, TR, TL;
+		    V TC, Ts, Tu, TQ, TI, T15, T1b, T10, T11, Tt;
+		    {
+			 V T1n, T1o, T1k, T1l, T7, Td, T4, Tc;
+			 {
+			      V T5, T6, T2, T3;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      T1j = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+			      T1n = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+			      T1o = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      T1k = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      T1l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+			      T7 = VADD(T5, T6);
+			      Td = VSUB(T5, T6);
+			      T4 = VADD(T2, T3);
+			      Tc = VSUB(T2, T3);
+			 }
+			 {
+			      V Tm, Tn, Tr, Tx, T1v, T1p;
+			      Tl = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+			      T1v = VSUB(T1n, T1o);
+			      T1p = VADD(T1n, T1o);
+			      {
+				   V T1u, T1m, Tp, Tq;
+				   T1u = VSUB(T1k, T1l);
+				   T1m = VADD(T1k, T1l);
+				   Tp = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+				   Ti = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
+				   Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
+				   Ta = VSUB(T4, T7);
+				   T8 = VADD(T4, T7);
+				   Tq = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+				   T1w = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1v, T1u));
+				   T1A = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1u, T1v));
+				   T1q = VADD(T1m, T1p);
+				   T1s = VSUB(T1m, T1p);
+				   Tm = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+				   T9 = VFNMS(LDK(KP250000000), T8, T1);
+				   Tn = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+				   Tr = VADD(Tp, Tq);
+				   Tx = VSUB(Tp, Tq);
+			      }
+			      {
+				   V TJ, TK, TG, Tw, To, TH, T13, T14;
+				   TF = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+				   T1r = VFNMS(LDK(KP250000000), T1q, T1j);
+				   TJ = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+				   TK = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   TG = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   Tw = VSUB(Tm, Tn);
+				   To = VADD(Tm, Tn);
+				   TH = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+				   TZ = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+				   T13 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+				   T14 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   TR = VSUB(TJ, TK);
+				   TL = VADD(TJ, TK);
+				   Ty = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tx, Tw));
+				   TC = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tw, Tx));
+				   Ts = VADD(To, Tr);
+				   Tu = VSUB(To, Tr);
+				   TQ = VSUB(TG, TH);
+				   TI = VADD(TG, TH);
+				   T15 = VADD(T13, T14);
+				   T1b = VSUB(T13, T14);
+				   T10 = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   T11 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+				   Tt = VFNMS(LDK(KP250000000), Ts, Tl);
+			      }
+			 }
+		    }
+		    {
+			 V TO, T12, T1a, Th, T1z, TN, TM, T18, T17;
+			 ST(&(x[0]), VADD(T1, T8), ms, &(x[0]));
+			 TS = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TR, TQ));
+			 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TQ, TR));
+			 TM = VADD(TI, TL);
+			 TO = VSUB(TI, TL);
+			 ST(&(x[WS(rs, 4)]), VADD(T1j, T1q), ms, &(x[0]));
+			 T12 = VADD(T10, T11);
+			 T1a = VSUB(T10, T11);
+			 ST(&(x[WS(rs, 1)]), VADD(Tl, Ts), ms, &(x[WS(rs, 1)]));
+			 Th = VFNMS(LDK(KP559016994), Ta, T9);
+			 Tb = VFMA(LDK(KP559016994), Ta, T9);
+			 T1t = VFMA(LDK(KP559016994), T1s, T1r);
+			 T1z = VFNMS(LDK(KP559016994), T1s, T1r);
+			 ST(&(x[WS(rs, 2)]), VADD(TF, TM), ms, &(x[0]));
+			 TN = VFNMS(LDK(KP250000000), TM, TF);
+			 {
+			      V T16, Tk, Tj, T1C, T1B, TD, TE, TB;
+			      TB = VFNMS(LDK(KP559016994), Tu, Tt);
+			      Tv = VFMA(LDK(KP559016994), Tu, Tt);
+			      T1g = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1a, T1b));
+			      T1c = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1b, T1a));
+			      T18 = VSUB(T12, T15);
+			      T16 = VADD(T12, T15);
+			      Tk = BYTWJ(&(W[TWVL * 4]), VFNMSI(Ti, Th));
+			      Tj = BYTWJ(&(W[TWVL * 2]), VFMAI(Ti, Th));
+			      T1C = BYTWJ(&(W[TWVL * 4]), VFNMSI(T1A, T1z));
+			      T1B = BYTWJ(&(W[TWVL * 2]), VFMAI(T1A, T1z));
+			      TD = BYTWJ(&(W[TWVL * 2]), VFMAI(TC, TB));
+			      TE = BYTWJ(&(W[TWVL * 4]), VFNMSI(TC, TB));
+			      ST(&(x[WS(rs, 3)]), VADD(TZ, T16), ms, &(x[WS(rs, 1)]));
+			      T17 = VFNMS(LDK(KP250000000), T16, TZ);
+			      ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 }
+			 TP = VFMA(LDK(KP559016994), TO, TN);
+			 TV = VFNMS(LDK(KP559016994), TO, TN);
+			 T1f = VFNMS(LDK(KP559016994), T18, T17);
+			 T19 = VFMA(LDK(KP559016994), T18, T17);
+		    }
+	       }
+	       TY = BYTWJ(&(W[TWVL * 4]), VFNMSI(TW, TV));
+	       TX = BYTWJ(&(W[TWVL * 2]), VFMAI(TW, TV));
+	       {
+		    V T1i, T1h, TU, TT;
+		    T1i = BYTWJ(&(W[TWVL * 4]), VFNMSI(T1g, T1f));
+		    T1h = BYTWJ(&(W[TWVL * 2]), VFMAI(T1g, T1f));
+		    TU = BYTWJ(&(W[TWVL * 6]), VFMAI(TS, TP));
+		    TT = BYTWJ(&(W[0]), VFNMSI(TS, TP));
+		    {
+			 V Tg, Tf, TA, Tz;
+			 Tg = BYTWJ(&(W[TWVL * 6]), VFMAI(Te, Tb));
+			 Tf = BYTWJ(&(W[0]), VFNMSI(Te, Tb));
+			 TA = BYTWJ(&(W[TWVL * 6]), VFMAI(Ty, Tv));
+			 Tz = BYTWJ(&(W[0]), VFNMSI(Ty, Tv));
+			 {
+			      V T1e, T1d, T1y, T1x;
+			      T1e = BYTWJ(&(W[TWVL * 6]), VFMAI(T1c, T19));
+			      T1d = BYTWJ(&(W[0]), VFNMSI(T1c, T19));
+			      T1y = BYTWJ(&(W[TWVL * 6]), VFMAI(T1w, T1t));
+			      T1x = BYTWJ(&(W[0]), VFNMSI(T1w, T1t));
+			      ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
+			      ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
+			      ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			      ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
+			      ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("q1fv_5"), twinstr, &GENUS, {55, 50, 45, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1fv_5 -include q1f.h */
+
+/*
+ * This function contains 100 FP additions, 70 FP multiplications,
+ * (or, 85 additions, 55 multiplications, 15 fused multiply/add),
+ * 44 stack variables, 4 constants, and 50 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
+	       V T8, T7, Th, Te, T9, Ta, T1q, T1p, T1z, T1w, T1r, T1s, Ts, Tr, TB;
+	       V Ty, Tt, Tu, TM, TL, TV, TS, TN, TO, T16, T15, T1f, T1c, T17, T18;
+	       {
+		    V T6, Td, T3, Tc;
+		    T8 = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T4, T5, T1, T2;
+			 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T6 = VADD(T4, T5);
+			 Td = VSUB(T4, T5);
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T3 = VADD(T1, T2);
+			 Tc = VSUB(T1, T2);
+		    }
+		    T7 = VMUL(LDK(KP559016994), VSUB(T3, T6));
+		    Th = VBYI(VFNMS(LDK(KP587785252), Tc, VMUL(LDK(KP951056516), Td)));
+		    Te = VBYI(VFMA(LDK(KP951056516), Tc, VMUL(LDK(KP587785252), Td)));
+		    T9 = VADD(T3, T6);
+		    Ta = VFNMS(LDK(KP250000000), T9, T8);
+	       }
+	       {
+		    V T1o, T1v, T1l, T1u;
+		    T1q = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+		    {
+			 V T1m, T1n, T1j, T1k;
+			 T1m = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+			 T1n = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T1o = VADD(T1m, T1n);
+			 T1v = VSUB(T1m, T1n);
+			 T1j = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T1k = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+			 T1l = VADD(T1j, T1k);
+			 T1u = VSUB(T1j, T1k);
+		    }
+		    T1p = VMUL(LDK(KP559016994), VSUB(T1l, T1o));
+		    T1z = VBYI(VFNMS(LDK(KP587785252), T1u, VMUL(LDK(KP951056516), T1v)));
+		    T1w = VBYI(VFMA(LDK(KP951056516), T1u, VMUL(LDK(KP587785252), T1v)));
+		    T1r = VADD(T1l, T1o);
+		    T1s = VFNMS(LDK(KP250000000), T1r, T1q);
+	       }
+	       {
+		    V Tq, Tx, Tn, Tw;
+		    Ts = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    {
+			 V To, Tp, Tl, Tm;
+			 To = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+			 Tp = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tq = VADD(To, Tp);
+			 Tx = VSUB(To, Tp);
+			 Tl = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 Tm = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+			 Tn = VADD(Tl, Tm);
+			 Tw = VSUB(Tl, Tm);
+		    }
+		    Tr = VMUL(LDK(KP559016994), VSUB(Tn, Tq));
+		    TB = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tx)));
+		    Ty = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tx)));
+		    Tt = VADD(Tn, Tq);
+		    Tu = VFNMS(LDK(KP250000000), Tt, Ts);
+	       }
+	       {
+		    V TK, TR, TH, TQ;
+		    TM = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    {
+			 V TI, TJ, TF, TG;
+			 TI = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+			 TJ = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 TK = VADD(TI, TJ);
+			 TR = VSUB(TI, TJ);
+			 TF = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 TG = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+			 TH = VADD(TF, TG);
+			 TQ = VSUB(TF, TG);
+		    }
+		    TL = VMUL(LDK(KP559016994), VSUB(TH, TK));
+		    TV = VBYI(VFNMS(LDK(KP587785252), TQ, VMUL(LDK(KP951056516), TR)));
+		    TS = VBYI(VFMA(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TR)));
+		    TN = VADD(TH, TK);
+		    TO = VFNMS(LDK(KP250000000), TN, TM);
+	       }
+	       {
+		    V T14, T1b, T11, T1a;
+		    T16 = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    {
+			 V T12, T13, TZ, T10;
+			 T12 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+			 T13 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T14 = VADD(T12, T13);
+			 T1b = VSUB(T12, T13);
+			 TZ = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T10 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+			 T11 = VADD(TZ, T10);
+			 T1a = VSUB(TZ, T10);
+		    }
+		    T15 = VMUL(LDK(KP559016994), VSUB(T11, T14));
+		    T1f = VBYI(VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1b)));
+		    T1c = VBYI(VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1b)));
+		    T17 = VADD(T11, T14);
+		    T18 = VFNMS(LDK(KP250000000), T17, T16);
+	       }
+	       ST(&(x[0]), VADD(T8, T9), ms, &(x[0]));
+	       ST(&(x[WS(rs, 4)]), VADD(T1q, T1r), ms, &(x[0]));
+	       ST(&(x[WS(rs, 2)]), VADD(TM, TN), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(T16, T17), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 1)]), VADD(Ts, Tt), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tj, Tk, Ti, T1B, T1C, T1A;
+		    Ti = VSUB(Ta, T7);
+		    Tj = BYTWJ(&(W[TWVL * 2]), VADD(Th, Ti));
+		    Tk = BYTWJ(&(W[TWVL * 4]), VSUB(Ti, Th));
+		    ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
+		    T1A = VSUB(T1s, T1p);
+		    T1B = BYTWJ(&(W[TWVL * 2]), VADD(T1z, T1A));
+		    T1C = BYTWJ(&(W[TWVL * 4]), VSUB(T1A, T1z));
+		    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V T1h, T1i, T1g, TD, TE, TC;
+		    T1g = VSUB(T18, T15);
+		    T1h = BYTWJ(&(W[TWVL * 2]), VADD(T1f, T1g));
+		    T1i = BYTWJ(&(W[TWVL * 4]), VSUB(T1g, T1f));
+		    ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    TC = VSUB(Tu, Tr);
+		    TD = BYTWJ(&(W[TWVL * 2]), VADD(TB, TC));
+		    TE = BYTWJ(&(W[TWVL * 4]), VSUB(TC, TB));
+		    ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V TX, TY, TW, TT, TU, TP;
+		    TW = VSUB(TO, TL);
+		    TX = BYTWJ(&(W[TWVL * 2]), VADD(TV, TW));
+		    TY = BYTWJ(&(W[TWVL * 4]), VSUB(TW, TV));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
+		    TP = VADD(TL, TO);
+		    TT = BYTWJ(&(W[0]), VSUB(TP, TS));
+		    TU = BYTWJ(&(W[TWVL * 6]), VADD(TS, TP));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
+	       }
+	       {
+		    V Tf, Tg, Tb, Tz, TA, Tv;
+		    Tb = VADD(T7, Ta);
+		    Tf = BYTWJ(&(W[0]), VSUB(Tb, Te));
+		    Tg = BYTWJ(&(W[TWVL * 6]), VADD(Te, Tb));
+		    ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
+		    Tv = VADD(Tr, Tu);
+		    Tz = BYTWJ(&(W[0]), VSUB(Tv, Ty));
+		    TA = BYTWJ(&(W[TWVL * 6]), VADD(Ty, Tv));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1d, T1e, T19, T1x, T1y, T1t;
+		    T19 = VADD(T15, T18);
+		    T1d = BYTWJ(&(W[0]), VSUB(T19, T1c));
+		    T1e = BYTWJ(&(W[TWVL * 6]), VADD(T1c, T19));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T1t = VADD(T1p, T1s);
+		    T1x = BYTWJ(&(W[0]), VSUB(T1t, T1w));
+		    T1y = BYTWJ(&(W[TWVL * 6]), VADD(T1w, T1t));
+		    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
+		    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("q1fv_5"), twinstr, &GENUS, {85, 55, 15, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_5) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/q1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/q1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:31 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include q1f.h */
+
+/*
+ * This function contains 264 FP additions, 192 FP multiplications,
+ * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
+ * 117 stack variables, 1 constants, and 128 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
+	       V T42, T43, T1U, T1V, T2Y, T2Z, TT, TS;
+	       {
+		    V T3, Te, T1E, T1P, Tu, Tp, T25, T20, T2b, T2m, T3M, T2x, T2C, T3X, TA;
+		    V TL, T48, T4d, T17, T11, TW, T1i, T2I, T1y, T1t, T2T, T3f, T3q, T34, T39;
+		    V T3G, T3B, Ts, Tv, Tf, Ta, T23, T26, T1Q, T1L, T2A, T2D, T2n, T2i, T4b;
+		    V T4e, T3Y, T3T, TZ, T12, TM, TH, T35, T2L, T3j, T1w, T1z, T1j, T1e, T36;
+		    V T2O, T3C, T3i, T3k;
+		    {
+			 V T3d, T32, T3e, T3o, T3p, T33;
+			 {
+			      V T2v, T2w, T3V, T46, T3W;
+			      {
+				   V T1, T2, Tc, Td, T1C, T1D, T1N, T1O;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+				   T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+				   T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+				   T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
+				   {
+					V T29, T1Y, T1Z, T2a, T2k, T2l, Tn, To, T3K, T3L;
+					T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+					T3 = VSUB(T1, T2);
+					Tn = VADD(T1, T2);
+					Te = VSUB(Tc, Td);
+					To = VADD(Tc, Td);
+					T1E = VSUB(T1C, T1D);
+					T1Y = VADD(T1C, T1D);
+					T1P = VSUB(T1N, T1O);
+					T1Z = VADD(T1N, T1O);
+					T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+					T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+					T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
+					Tu = VSUB(Tn, To);
+					Tp = VADD(Tn, To);
+					T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
+					T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
+					T25 = VSUB(T1Y, T1Z);
+					T20 = VADD(T1Y, T1Z);
+					T2v = VADD(T29, T2a);
+					T2b = VSUB(T29, T2a);
+					T2w = VADD(T2k, T2l);
+					T2m = VSUB(T2k, T2l);
+					T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
+					T46 = VADD(T3K, T3L);
+					T3M = VSUB(T3K, T3L);
+					T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
+				   }
+			      }
+			      {
+				   V T15, TU, T16, T1g, TV, T1h;
+				   {
+					V Ty, Tz, TJ, TK, T47;
+					Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+					Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+					TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+					T2x = VADD(T2v, T2w);
+					T2C = VSUB(T2v, T2w);
+					TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
+					T47 = VADD(T3V, T3W);
+					T3X = VSUB(T3V, T3W);
+					T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+					TA = VSUB(Ty, Tz);
+					TU = VADD(Ty, Tz);
+					T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+					T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+					TL = VSUB(TJ, TK);
+					TV = VADD(TJ, TK);
+					T48 = VADD(T46, T47);
+					T4d = VSUB(T46, T47);
+					T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
+				   }
+				   {
+					V T2G, T1r, T2H, T2R, T1s, T2S;
+					T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
+					T17 = VSUB(T15, T16);
+					T1r = VADD(T15, T16);
+					T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
+					T11 = VSUB(TU, TV);
+					TW = VADD(TU, TV);
+					T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
+					T1i = VSUB(T1g, T1h);
+					T1s = VADD(T1g, T1h);
+					T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
+					T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
+					T2I = VSUB(T2G, T2H);
+					T32 = VADD(T2G, T2H);
+					T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
+					T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
+					T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
+					T1y = VSUB(T1r, T1s);
+					T1t = VADD(T1r, T1s);
+					T33 = VADD(T2R, T2S);
+					T2T = VSUB(T2R, T2S);
+				   }
+			      }
+			 }
+			 {
+			      V T2y, T2e, T3Q, T2z, T2h, T49, T3P, T3R;
+			      {
+				   V T6, Tq, T1I, Tr, T9, T21, T1H, T1J;
+				   {
+					V T4, T3z, T3A, T5, T7, T8, T1F, T1G;
+					T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3f = VSUB(T3d, T3e);
+					T3z = VADD(T3d, T3e);
+					T3q = VSUB(T3o, T3p);
+					T3A = VADD(T3o, T3p);
+					T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T34 = VADD(T32, T33);
+					T39 = VSUB(T32, T33);
+					T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					T3G = VSUB(T3z, T3A);
+					T3B = VADD(T3z, T3A);
+					T6 = VSUB(T4, T5);
+					Tq = VADD(T4, T5);
+					T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+					Tr = VADD(T7, T8);
+					T9 = VSUB(T7, T8);
+					T21 = VADD(T1F, T1G);
+					T1H = VSUB(T1F, T1G);
+					T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+				   }
+				   {
+					V T2f, T22, T1K, T2g, T2c, T2d, T3N, T3O;
+					T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					Ts = VADD(Tq, Tr);
+					Tv = VSUB(Tr, Tq);
+					Tf = VSUB(T9, T6);
+					Ta = VADD(T6, T9);
+					T22 = VADD(T1I, T1J);
+					T1K = VSUB(T1I, T1J);
+					T2y = VADD(T2c, T2d);
+					T2e = VSUB(T2c, T2d);
+					T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+					T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					T23 = VADD(T21, T22);
+					T26 = VSUB(T22, T21);
+					T1Q = VSUB(T1K, T1H);
+					T1L = VADD(T1H, T1K);
+					T2z = VADD(T2f, T2g);
+					T2h = VSUB(T2f, T2g);
+					T49 = VADD(T3N, T3O);
+					T3P = VSUB(T3N, T3O);
+					T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+				   }
+			      }
+			      {
+				   V TX, TD, T1b, TY, TG, T1u, T1a, T1c;
+				   {
+					V TE, T4a, T3S, TF, TB, TC, T18, T19;
+					TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					T2A = VADD(T2y, T2z);
+					T2D = VSUB(T2z, T2y);
+					T2n = VSUB(T2h, T2e);
+					T2i = VADD(T2e, T2h);
+					T4a = VADD(T3Q, T3R);
+					T3S = VSUB(T3Q, T3R);
+					TX = VADD(TB, TC);
+					TD = VSUB(TB, TC);
+					TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T4b = VADD(T49, T4a);
+					T4e = VSUB(T4a, T49);
+					T3Y = VSUB(T3S, T3P);
+					T3T = VADD(T3P, T3S);
+					TY = VADD(TE, TF);
+					TG = VSUB(TE, TF);
+					T1u = VADD(T18, T19);
+					T1a = VSUB(T18, T19);
+					T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+				   }
+				   {
+					V T2M, T1v, T1d, T2N, T2J, T2K, T3g, T3h;
+					T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					TZ = VADD(TX, TY);
+					T12 = VSUB(TY, TX);
+					TM = VSUB(TG, TD);
+					TH = VADD(TD, TG);
+					T1v = VADD(T1b, T1c);
+					T1d = VSUB(T1b, T1c);
+					T35 = VADD(T2J, T2K);
+					T2L = VSUB(T2J, T2K);
+					T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+					T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+					T1w = VADD(T1u, T1v);
+					T1z = VSUB(T1v, T1u);
+					T1j = VSUB(T1d, T1a);
+					T1e = VADD(T1a, T1d);
+					T36 = VADD(T2M, T2N);
+					T2O = VSUB(T2M, T2N);
+					T3C = VADD(T3g, T3h);
+					T3i = VSUB(T3g, T3h);
+					T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T3a, T2U, T2P, T3H, T3r, T3m, T13, T27, T3b, T4f;
+			 {
+			      V T37, T3E, T2B, T24;
+			      {
+				   V T3D, T3l, Tt, T4c;
+				   ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
+				   ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
+				   T37 = VADD(T35, T36);
+				   T3a = VSUB(T36, T35);
+				   T2U = VSUB(T2O, T2L);
+				   T2P = VADD(T2L, T2O);
+				   T3D = VADD(T3j, T3k);
+				   T3l = VSUB(T3j, T3k);
+				   ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
+				   Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
+				   T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
+				   T3E = VADD(T3C, T3D);
+				   T3H = VSUB(T3D, T3C);
+				   T3r = VSUB(T3l, T3i);
+				   T3m = VADD(T3i, T3l);
+				   T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
+				   T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
+				   ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
+				   ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+				   ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
+			      }
+			      {
+				   V T38, T1A, Tw, T10, T1x, T3F, T2E, T3I;
+				   T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
+				   T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
+				   T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
+				   ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
+				   ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+				   T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
+				   T1A = BYTWJ(&(W[TWVL * 10]), VFNMSI(T1z, T1y));
+				   Tw = BYTWJ(&(W[TWVL * 10]), VFNMSI(Tv, Tu));
+				   ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
+				   ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
+				   T2E = BYTWJ(&(W[TWVL * 10]), VFNMSI(T2D, T2C));
+				   T3I = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3H, T3G));
+				   ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
+				   ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
+				   T13 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T12, T11));
+				   T27 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T26, T25));
+				   T3b = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3a, T39));
+				   ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
+				   T4f = BYTWJ(&(W[TWVL * 10]), VFNMSI(T4e, T4d));
+			      }
+			 }
+			 {
+			      V Tj, Tk, T2r, T2j, Ti, Th, T2o, T2s, T1M, T1R, T41, T40;
+			      {
+				   V T3c, T4g, T3J, T2F, Tx, T1B;
+				   Tx = BYTWJ(&(W[TWVL * 2]), VFMAI(Tv, Tu));
+				   T1B = BYTWJ(&(W[TWVL * 2]), VFMAI(T1z, T1y));
+				   ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   T3c = BYTWJ(&(W[TWVL * 2]), VFMAI(T3a, T39));
+				   T4g = BYTWJ(&(W[TWVL * 2]), VFMAI(T4e, T4d));
+				   ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+				   ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+				   ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
+				   T3J = BYTWJ(&(W[TWVL * 2]), VFMAI(T3H, T3G));
+				   T2F = BYTWJ(&(W[TWVL * 2]), VFMAI(T2D, T2C));
+				   {
+					V T14, Tb, Tg, T28, T3U, T3Z;
+					T28 = BYTWJ(&(W[TWVL * 2]), VFMAI(T26, T25));
+					ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T14 = BYTWJ(&(W[TWVL * 2]), VFMAI(T12, T11));
+					Tj = VFNMS(LDK(KP707106781), Ta, T3);
+					Tb = VFMA(LDK(KP707106781), Ta, T3);
+					Tg = VFNMS(LDK(KP707106781), Tf, Te);
+					Tk = VFMA(LDK(KP707106781), Tf, Te);
+					ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
+					ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
+					ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T3U = VFMA(LDK(KP707106781), T3T, T3M);
+					T42 = VFNMS(LDK(KP707106781), T3T, T3M);
+					T43 = VFMA(LDK(KP707106781), T3Y, T3X);
+					T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
+					ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+					T2r = VFNMS(LDK(KP707106781), T2i, T2b);
+					T2j = VFMA(LDK(KP707106781), T2i, T2b);
+					Ti = BYTWJ(&(W[TWVL * 12]), VFMAI(Tg, Tb));
+					Th = BYTWJ(&(W[0]), VFNMSI(Tg, Tb));
+					T2o = VFNMS(LDK(KP707106781), T2n, T2m);
+					T2s = VFMA(LDK(KP707106781), T2n, T2m);
+					T1U = VFNMS(LDK(KP707106781), T1L, T1E);
+					T1M = VFMA(LDK(KP707106781), T1L, T1E);
+					T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
+					T1V = VFMA(LDK(KP707106781), T1Q, T1P);
+					T41 = BYTWJ(&(W[TWVL * 12]), VFMAI(T3Z, T3U));
+					T40 = BYTWJ(&(W[0]), VFNMSI(T3Z, T3U));
+				   }
+			      }
+			      {
+				   V TQ, TR, T1n, T1o, T3v, T3w;
+				   {
+					V T1f, T1k, T3n, TP, TO, T3s, T2Q, T2V;
+					{
+					     V TI, T2q, T2p, T1T, T1S, TN;
+					     TQ = VFNMS(LDK(KP707106781), TH, TA);
+					     TI = VFMA(LDK(KP707106781), TH, TA);
+					     ST(&(x[WS(vs, 7)]), Ti, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1)]), Th, ms, &(x[WS(vs, 1)]));
+					     T2q = BYTWJ(&(W[TWVL * 12]), VFMAI(T2o, T2j));
+					     T2p = BYTWJ(&(W[0]), VFNMSI(T2o, T2j));
+					     T1T = BYTWJ(&(W[TWVL * 12]), VFMAI(T1R, T1M));
+					     T1S = BYTWJ(&(W[0]), VFNMSI(T1R, T1M));
+					     ST(&(x[WS(vs, 7) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					     TN = VFNMS(LDK(KP707106781), TM, TL);
+					     TR = VFMA(LDK(KP707106781), TM, TL);
+					     T1n = VFNMS(LDK(KP707106781), T1e, T17);
+					     T1f = VFMA(LDK(KP707106781), T1e, T17);
+					     ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 1)]));
+					     ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					     T1k = VFNMS(LDK(KP707106781), T1j, T1i);
+					     T1o = VFMA(LDK(KP707106781), T1j, T1i);
+					     T3v = VFNMS(LDK(KP707106781), T3m, T3f);
+					     T3n = VFMA(LDK(KP707106781), T3m, T3f);
+					     TP = BYTWJ(&(W[TWVL * 12]), VFMAI(TN, TI));
+					     TO = BYTWJ(&(W[0]), VFNMSI(TN, TI));
+					     T3s = VFNMS(LDK(KP707106781), T3r, T3q);
+					     T3w = VFMA(LDK(KP707106781), T3r, T3q);
+					}
+					T2Y = VFNMS(LDK(KP707106781), T2P, T2I);
+					T2Q = VFMA(LDK(KP707106781), T2P, T2I);
+					T2V = VFNMS(LDK(KP707106781), T2U, T2T);
+					T2Z = VFMA(LDK(KP707106781), T2U, T2T);
+					{
+					     V T3u, T3t, T2X, T2W, T1m, T1l;
+					     T1m = BYTWJ(&(W[TWVL * 12]), VFMAI(T1k, T1f));
+					     T1l = BYTWJ(&(W[0]), VFNMSI(T1k, T1f));
+					     ST(&(x[WS(vs, 7) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					     T3u = BYTWJ(&(W[TWVL * 12]), VFMAI(T3s, T3n));
+					     T3t = BYTWJ(&(W[0]), VFNMSI(T3s, T3n));
+					     T2X = BYTWJ(&(W[TWVL * 12]), VFMAI(T2V, T2Q));
+					     T2W = BYTWJ(&(W[0]), VFNMSI(T2V, T2Q));
+					     ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 1)]));
+					     ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 7)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 1)]));
+					     ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+					     ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+					}
+				   }
+				   {
+					V T2u, T2t, T3y, T3x;
+					{
+					     V T1q, T1p, Tm, Tl;
+					     T1q = BYTWJ(&(W[TWVL * 4]), VFMAI(T1o, T1n));
+					     T1p = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1o, T1n));
+					     Tm = BYTWJ(&(W[TWVL * 4]), VFMAI(Tk, Tj));
+					     Tl = BYTWJ(&(W[TWVL * 8]), VFNMSI(Tk, Tj));
+					     ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
+					     ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
+					     T2u = BYTWJ(&(W[TWVL * 4]), VFMAI(T2s, T2r));
+					     T2t = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2s, T2r));
+					     T3y = BYTWJ(&(W[TWVL * 4]), VFMAI(T3w, T3v));
+					     T3x = BYTWJ(&(W[TWVL * 8]), VFNMSI(T3w, T3v));
+					     ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
+					     ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
+					}
+					ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
+					ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
+					ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
+					ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
+					TT = BYTWJ(&(W[TWVL * 4]), VFMAI(TR, TQ));
+					TS = BYTWJ(&(W[TWVL * 8]), VFNMSI(TR, TQ));
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T31, T30, T45, T44, T1X, T1W;
+		    T1X = BYTWJ(&(W[TWVL * 4]), VFMAI(T1V, T1U));
+		    T1W = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1V, T1U));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    T31 = BYTWJ(&(W[TWVL * 4]), VFMAI(T2Z, T2Y));
+		    T30 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2Z, T2Y));
+		    T45 = BYTWJ(&(W[TWVL * 4]), VFMAI(T43, T42));
+		    T44 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T43, T42));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include q1f.h */
+
+/*
+ * This function contains 264 FP additions, 128 FP multiplications,
+ * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
+ * 77 stack variables, 1 constants, and 128 memory accesses
+ */
+#include "q1f.h"
+
+static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
+	       V T3, Tu, Tf, Tp, T1E, T25, T1Q, T20, T2b, T2C, T2n, T2x, T3M, T4d, T3Y;
+	       V T48, TA, T11, TM, TW, T17, T1y, T1j, T1t, T2I, T39, T2U, T34, T3f, T3G;
+	       V T3r, T3B, Ta, Tv, Tc, Ts, T1L, T26, T1N, T23, T2i, T2D, T2k, T2A, T3T;
+	       V T4e, T3V, T4b, TH, T12, TJ, TZ, T1e, T1z, T1g, T1w, T2P, T3a, T2R, T37;
+	       V T3m, T3H, T3o, T3E, T28, T14;
+	       {
+		    V T1, T2, Tn, Td, Te, To;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = VADD(T1, T2);
+		    Td = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    To = VADD(Td, Te);
+		    T3 = VSUB(T1, T2);
+		    Tu = VSUB(Tn, To);
+		    Tf = VSUB(Td, Te);
+		    Tp = VADD(Tn, To);
+	       }
+	       {
+		    V T1C, T1D, T1Y, T1O, T1P, T1Z;
+		    T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
+		    T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
+		    T1Y = VADD(T1C, T1D);
+		    T1O = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
+		    T1P = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
+		    T1Z = VADD(T1O, T1P);
+		    T1E = VSUB(T1C, T1D);
+		    T25 = VSUB(T1Y, T1Z);
+		    T1Q = VSUB(T1O, T1P);
+		    T20 = VADD(T1Y, T1Z);
+	       }
+	       {
+		    V T29, T2a, T2v, T2l, T2m, T2w;
+		    T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
+		    T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
+		    T2v = VADD(T29, T2a);
+		    T2l = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
+		    T2m = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
+		    T2w = VADD(T2l, T2m);
+		    T2b = VSUB(T29, T2a);
+		    T2C = VSUB(T2v, T2w);
+		    T2n = VSUB(T2l, T2m);
+		    T2x = VADD(T2v, T2w);
+	       }
+	       {
+		    V T3K, T3L, T46, T3W, T3X, T47;
+		    T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
+		    T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
+		    T46 = VADD(T3K, T3L);
+		    T3W = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
+		    T3X = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
+		    T47 = VADD(T3W, T3X);
+		    T3M = VSUB(T3K, T3L);
+		    T4d = VSUB(T46, T47);
+		    T3Y = VSUB(T3W, T3X);
+		    T48 = VADD(T46, T47);
+	       }
+	       {
+		    V Ty, Tz, TU, TK, TL, TV;
+		    Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
+		    Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
+		    TU = VADD(Ty, Tz);
+		    TK = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
+		    TL = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
+		    TV = VADD(TK, TL);
+		    TA = VSUB(Ty, Tz);
+		    T11 = VSUB(TU, TV);
+		    TM = VSUB(TK, TL);
+		    TW = VADD(TU, TV);
+	       }
+	       {
+		    V T15, T16, T1r, T1h, T1i, T1s;
+		    T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
+		    T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
+		    T1r = VADD(T15, T16);
+		    T1h = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
+		    T1i = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
+		    T1s = VADD(T1h, T1i);
+		    T17 = VSUB(T15, T16);
+		    T1y = VSUB(T1r, T1s);
+		    T1j = VSUB(T1h, T1i);
+		    T1t = VADD(T1r, T1s);
+	       }
+	       {
+		    V T2G, T2H, T32, T2S, T2T, T33;
+		    T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
+		    T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
+		    T32 = VADD(T2G, T2H);
+		    T2S = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
+		    T2T = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
+		    T33 = VADD(T2S, T2T);
+		    T2I = VSUB(T2G, T2H);
+		    T39 = VSUB(T32, T33);
+		    T2U = VSUB(T2S, T2T);
+		    T34 = VADD(T32, T33);
+	       }
+	       {
+		    V T3d, T3e, T3z, T3p, T3q, T3A;
+		    T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
+		    T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
+		    T3z = VADD(T3d, T3e);
+		    T3p = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
+		    T3q = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
+		    T3A = VADD(T3p, T3q);
+		    T3f = VSUB(T3d, T3e);
+		    T3G = VSUB(T3z, T3A);
+		    T3r = VSUB(T3p, T3q);
+		    T3B = VADD(T3z, T3A);
+	       }
+	       {
+		    V T6, Tq, T9, Tr;
+		    {
+			 V T4, T5, T7, T8;
+			 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T6 = VSUB(T4, T5);
+			 Tq = VADD(T4, T5);
+			 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = VSUB(T7, T8);
+			 Tr = VADD(T7, T8);
+		    }
+		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
+		    Tv = VBYI(VSUB(Tr, Tq));
+		    Tc = VMUL(LDK(KP707106781), VSUB(T9, T6));
+		    Ts = VADD(Tq, Tr);
+	       }
+	       {
+		    V T1H, T21, T1K, T22;
+		    {
+			 V T1F, T1G, T1I, T1J;
+			 T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1H = VSUB(T1F, T1G);
+			 T21 = VADD(T1F, T1G);
+			 T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+			 T1K = VSUB(T1I, T1J);
+			 T22 = VADD(T1I, T1J);
+		    }
+		    T1L = VMUL(LDK(KP707106781), VADD(T1H, T1K));
+		    T26 = VBYI(VSUB(T22, T21));
+		    T1N = VMUL(LDK(KP707106781), VSUB(T1K, T1H));
+		    T23 = VADD(T21, T22);
+	       }
+	       {
+		    V T2e, T2y, T2h, T2z;
+		    {
+			 V T2c, T2d, T2f, T2g;
+			 T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2e = VSUB(T2c, T2d);
+			 T2y = VADD(T2c, T2d);
+			 T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+			 T2h = VSUB(T2f, T2g);
+			 T2z = VADD(T2f, T2g);
+		    }
+		    T2i = VMUL(LDK(KP707106781), VADD(T2e, T2h));
+		    T2D = VBYI(VSUB(T2z, T2y));
+		    T2k = VMUL(LDK(KP707106781), VSUB(T2h, T2e));
+		    T2A = VADD(T2y, T2z);
+	       }
+	       {
+		    V T3P, T49, T3S, T4a;
+		    {
+			 V T3N, T3O, T3Q, T3R;
+			 T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3P = VSUB(T3N, T3O);
+			 T49 = VADD(T3N, T3O);
+			 T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+			 T3S = VSUB(T3Q, T3R);
+			 T4a = VADD(T3Q, T3R);
+		    }
+		    T3T = VMUL(LDK(KP707106781), VADD(T3P, T3S));
+		    T4e = VBYI(VSUB(T4a, T49));
+		    T3V = VMUL(LDK(KP707106781), VSUB(T3S, T3P));
+		    T4b = VADD(T49, T4a);
+	       }
+	       {
+		    V TD, TX, TG, TY;
+		    {
+			 V TB, TC, TE, TF;
+			 TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TD = VSUB(TB, TC);
+			 TX = VADD(TB, TC);
+			 TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+			 TG = VSUB(TE, TF);
+			 TY = VADD(TE, TF);
+		    }
+		    TH = VMUL(LDK(KP707106781), VADD(TD, TG));
+		    T12 = VBYI(VSUB(TY, TX));
+		    TJ = VMUL(LDK(KP707106781), VSUB(TG, TD));
+		    TZ = VADD(TX, TY);
+	       }
+	       {
+		    V T1a, T1u, T1d, T1v;
+		    {
+			 V T18, T19, T1b, T1c;
+			 T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T1a = VSUB(T18, T19);
+			 T1u = VADD(T18, T19);
+			 T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+			 T1d = VSUB(T1b, T1c);
+			 T1v = VADD(T1b, T1c);
+		    }
+		    T1e = VMUL(LDK(KP707106781), VADD(T1a, T1d));
+		    T1z = VBYI(VSUB(T1v, T1u));
+		    T1g = VMUL(LDK(KP707106781), VSUB(T1d, T1a));
+		    T1w = VADD(T1u, T1v);
+	       }
+	       {
+		    V T2L, T35, T2O, T36;
+		    {
+			 V T2J, T2K, T2M, T2N;
+			 T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2L = VSUB(T2J, T2K);
+			 T35 = VADD(T2J, T2K);
+			 T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+			 T2O = VSUB(T2M, T2N);
+			 T36 = VADD(T2M, T2N);
+		    }
+		    T2P = VMUL(LDK(KP707106781), VADD(T2L, T2O));
+		    T3a = VBYI(VSUB(T36, T35));
+		    T2R = VMUL(LDK(KP707106781), VSUB(T2O, T2L));
+		    T37 = VADD(T35, T36);
+	       }
+	       {
+		    V T3i, T3C, T3l, T3D;
+		    {
+			 V T3g, T3h, T3j, T3k;
+			 T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3i = VSUB(T3g, T3h);
+			 T3C = VADD(T3g, T3h);
+			 T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+			 T3l = VSUB(T3j, T3k);
+			 T3D = VADD(T3j, T3k);
+		    }
+		    T3m = VMUL(LDK(KP707106781), VADD(T3i, T3l));
+		    T3H = VBYI(VSUB(T3D, T3C));
+		    T3o = VMUL(LDK(KP707106781), VSUB(T3l, T3i));
+		    T3E = VADD(T3C, T3D);
+	       }
+	       ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
+	       ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
+	       ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
+	       ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
+	       {
+		    V Tt, T4c, T2B, T24;
+		    ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
+		    Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
+		    ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
+		    T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
+		    ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
+		    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
+		    T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
+		    ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+	       }
+	       {
+		    V T10, T1x, T3F, T38, T1A, Tw;
+		    T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
+		    ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
+		    ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
+		    T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
+		    ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
+		    T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
+		    ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
+		    T1A = BYTWJ(&(W[TWVL * 10]), VSUB(T1y, T1z));
+		    ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
+		    Tw = BYTWJ(&(W[TWVL * 10]), VSUB(Tu, Tv));
+		    ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
+	       }
+	       {
+		    V T2E, T3I, T13, T27, T3b, T4f;
+		    T2E = BYTWJ(&(W[TWVL * 10]), VSUB(T2C, T2D));
+		    ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
+		    T3I = BYTWJ(&(W[TWVL * 10]), VSUB(T3G, T3H));
+		    ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
+		    T13 = BYTWJ(&(W[TWVL * 10]), VSUB(T11, T12));
+		    ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T27 = BYTWJ(&(W[TWVL * 10]), VSUB(T25, T26));
+		    ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T3b = BYTWJ(&(W[TWVL * 10]), VSUB(T39, T3a));
+		    ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+		    T4f = BYTWJ(&(W[TWVL * 10]), VSUB(T4d, T4e));
+		    ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
+	       }
+	       {
+		    V Tx, T1B, T3c, T4g, T3J, T2F;
+		    Tx = BYTWJ(&(W[TWVL * 2]), VADD(Tu, Tv));
+		    ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
+		    T1B = BYTWJ(&(W[TWVL * 2]), VADD(T1y, T1z));
+		    ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
+		    T3c = BYTWJ(&(W[TWVL * 2]), VADD(T39, T3a));
+		    ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    T4g = BYTWJ(&(W[TWVL * 2]), VADD(T4d, T4e));
+		    ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+		    T3J = BYTWJ(&(W[TWVL * 2]), VADD(T3G, T3H));
+		    ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
+		    T2F = BYTWJ(&(W[TWVL * 2]), VADD(T2C, T2D));
+		    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
+	       }
+	       T28 = BYTWJ(&(W[TWVL * 2]), VADD(T25, T26));
+	       ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       T14 = BYTWJ(&(W[TWVL * 2]), VADD(T11, T12));
+	       ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
+	       {
+		    V Th, Ti, Tb, Tg;
+		    Tb = VADD(T3, Ta);
+		    Tg = VBYI(VSUB(Tc, Tf));
+		    Th = BYTWJ(&(W[TWVL * 12]), VSUB(Tb, Tg));
+		    Ti = BYTWJ(&(W[0]), VADD(Tb, Tg));
+		    ST(&(x[WS(vs, 7)]), Th, ms, &(x[WS(vs, 7)]));
+		    ST(&(x[WS(vs, 1)]), Ti, ms, &(x[WS(vs, 1)]));
+	       }
+	       {
+		    V T40, T41, T3U, T3Z;
+		    T3U = VADD(T3M, T3T);
+		    T3Z = VBYI(VSUB(T3V, T3Y));
+		    T40 = BYTWJ(&(W[TWVL * 12]), VSUB(T3U, T3Z));
+		    T41 = BYTWJ(&(W[0]), VADD(T3U, T3Z));
+		    ST(&(x[WS(vs, 7) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V T2p, T2q, T2j, T2o;
+		    T2j = VADD(T2b, T2i);
+		    T2o = VBYI(VSUB(T2k, T2n));
+		    T2p = BYTWJ(&(W[TWVL * 12]), VSUB(T2j, T2o));
+		    T2q = BYTWJ(&(W[0]), VADD(T2j, T2o));
+		    ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 7)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 1)]));
+	       }
+	       {
+		    V T1S, T1T, T1M, T1R;
+		    T1M = VADD(T1E, T1L);
+		    T1R = VBYI(VSUB(T1N, T1Q));
+		    T1S = BYTWJ(&(W[TWVL * 12]), VSUB(T1M, T1R));
+		    T1T = BYTWJ(&(W[0]), VADD(T1M, T1R));
+		    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V TO, TP, TI, TN;
+		    TI = VADD(TA, TH);
+		    TN = VBYI(VSUB(TJ, TM));
+		    TO = BYTWJ(&(W[TWVL * 12]), VSUB(TI, TN));
+		    TP = BYTWJ(&(W[0]), VADD(TI, TN));
+		    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1l, T1m, T1f, T1k;
+		    T1f = VADD(T17, T1e);
+		    T1k = VBYI(VSUB(T1g, T1j));
+		    T1l = BYTWJ(&(W[TWVL * 12]), VSUB(T1f, T1k));
+		    T1m = BYTWJ(&(W[0]), VADD(T1f, T1k));
+		    ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 7)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 1)]));
+	       }
+	       {
+		    V T3t, T3u, T3n, T3s;
+		    T3n = VADD(T3f, T3m);
+		    T3s = VBYI(VSUB(T3o, T3r));
+		    T3t = BYTWJ(&(W[TWVL * 12]), VSUB(T3n, T3s));
+		    T3u = BYTWJ(&(W[0]), VADD(T3n, T3s));
+		    ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 7)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 1)]));
+	       }
+	       {
+		    V T2W, T2X, T2Q, T2V;
+		    T2Q = VADD(T2I, T2P);
+		    T2V = VBYI(VSUB(T2R, T2U));
+		    T2W = BYTWJ(&(W[TWVL * 12]), VSUB(T2Q, T2V));
+		    T2X = BYTWJ(&(W[0]), VADD(T2Q, T2V));
+		    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1p, T1q, T1n, T1o;
+		    T1n = VSUB(T17, T1e);
+		    T1o = VBYI(VADD(T1j, T1g));
+		    T1p = BYTWJ(&(W[TWVL * 8]), VSUB(T1n, T1o));
+		    T1q = BYTWJ(&(W[TWVL * 4]), VADD(T1n, T1o));
+		    ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V Tl, Tm, Tj, Tk;
+		    Tj = VSUB(T3, Ta);
+		    Tk = VBYI(VADD(Tf, Tc));
+		    Tl = BYTWJ(&(W[TWVL * 8]), VSUB(Tj, Tk));
+		    Tm = BYTWJ(&(W[TWVL * 4]), VADD(Tj, Tk));
+		    ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
+		    ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V T2t, T2u, T2r, T2s;
+		    T2r = VSUB(T2b, T2i);
+		    T2s = VBYI(VADD(T2n, T2k));
+		    T2t = BYTWJ(&(W[TWVL * 8]), VSUB(T2r, T2s));
+		    T2u = BYTWJ(&(W[TWVL * 4]), VADD(T2r, T2s));
+		    ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V T3x, T3y, T3v, T3w;
+		    T3v = VSUB(T3f, T3m);
+		    T3w = VBYI(VADD(T3r, T3o));
+		    T3x = BYTWJ(&(W[TWVL * 8]), VSUB(T3v, T3w));
+		    T3y = BYTWJ(&(W[TWVL * 4]), VADD(T3v, T3w));
+		    ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
+	       }
+	       {
+		    V TS, TT, TQ, TR;
+		    TQ = VSUB(TA, TH);
+		    TR = VBYI(VADD(TM, TJ));
+		    TS = BYTWJ(&(W[TWVL * 8]), VSUB(TQ, TR));
+		    TT = BYTWJ(&(W[TWVL * 4]), VADD(TQ, TR));
+		    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V T1W, T1X, T1U, T1V;
+		    T1U = VSUB(T1E, T1L);
+		    T1V = VBYI(VADD(T1Q, T1N));
+		    T1W = BYTWJ(&(W[TWVL * 8]), VSUB(T1U, T1V));
+		    T1X = BYTWJ(&(W[TWVL * 4]), VADD(T1U, T1V));
+		    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V T30, T31, T2Y, T2Z;
+		    T2Y = VSUB(T2I, T2P);
+		    T2Z = VBYI(VADD(T2U, T2R));
+		    T30 = BYTWJ(&(W[TWVL * 8]), VSUB(T2Y, T2Z));
+		    T31 = BYTWJ(&(W[TWVL * 4]), VADD(T2Y, T2Z));
+		    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	       {
+		    V T44, T45, T42, T43;
+		    T42 = VSUB(T3M, T3T);
+		    T43 = VBYI(VADD(T3Y, T3V));
+		    T44 = BYTWJ(&(W[TWVL * 8]), VSUB(T42, T43));
+		    T45 = BYTWJ(&(W[TWVL * 4]), VADD(T42, T43));
+		    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
+		    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_q1fv_8) (planner *p) {
+     X(kdft_difsq_register) (p, q1fv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1buv_10 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTW(&(W[TWVL * 8]), T2);
+			      Th = BYTW(&(W[TWVL * 6]), Tg);
+			      To = BYTW(&(W[0]), Tn);
+			      Tj = BYTW(&(W[TWVL * 16]), Ti);
+			      Tm = BYTW(&(W[TWVL * 10]), Tl);
+			      T6 = BYTW(&(W[TWVL * 2]), T5);
+			      Td = BYTW(&(W[TWVL * 4]), Tc);
+			      T8 = BYTW(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTW(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1buv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1buv_10 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tu, TH, Tg, Tl, Tp, TD, TE, TJ, T5, Ta, To, TA, TB, TI, Tr;
+	       V Tt, Ts;
+	       Tr = LD(&(x[0]), ms, &(x[0]));
+	       Ts = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tt = BYTW(&(W[TWVL * 8]), Ts);
+	       Tu = VSUB(Tr, Tt);
+	       TH = VADD(Tr, Tt);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTW(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tp = VADD(Tg, Tl);
+		    TD = VADD(Td, Tf);
+		    TE = VADD(Ti, Tk);
+		    TJ = VADD(TD, TE);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTW(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    To = VADD(T5, Ta);
+		    TA = VADD(T2, T4);
+		    TB = VADD(T7, T9);
+		    TI = VADD(TA, TB);
+	       }
+	       {
+		    V Tq, Tv, Tw, Tn, Tz, Tb, Tm, Ty, Tx;
+		    Tq = VMUL(LDK(KP559016994), VSUB(To, Tp));
+		    Tv = VADD(To, Tp);
+		    Tw = VFNMS(LDK(KP250000000), Tv, Tu);
+		    Tb = VSUB(T5, Ta);
+		    Tm = VSUB(Tg, Tl);
+		    Tn = VBYI(VFMA(LDK(KP951056516), Tb, VMUL(LDK(KP587785252), Tm)));
+		    Tz = VBYI(VFNMS(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tb)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tu, Tv), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tw, Tq);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tx = VADD(Tq, Tw);
+		    ST(&(x[WS(rs, 1)]), VADD(Tn, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(Tx, Tn), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TP, TC, TF, TO, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    TP = VBYI(VFMA(LDK(KP951056516), TC, VMUL(LDK(KP587785252), TF)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TO = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VSUB(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VADD(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1buv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1buv_2 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1buv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1buv_2 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1buv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1buv_3 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 8 FP additions, 8 FP multiplications,
+ * (or, 5 additions, 5 multiplications, 3 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T2, T4;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, T8, T6, T7;
+		    T3 = BYTW(&(W[0]), T2);
+		    T5 = BYTW(&(W[TWVL * 2]), T4);
+		    T8 = VMUL(LDK(KP866025403), VSUB(T3, T5));
+		    T6 = VADD(T3, T5);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VFNMSI(T8, T7), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T8, T7), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1buv_3"), twinstr, &GENUS, {5, 5, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1buv_3 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 8 FP additions, 6 FP multiplications,
+ * (or, 7 additions, 5 multiplications, 1 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T6, T2, T4, T7, T1, T3, T5, T8;
+	       T6 = LD(&(x[0]), ms, &(x[0]));
+	       T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T2 = BYTW(&(W[0]), T1);
+	       T3 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T4 = BYTW(&(W[TWVL * 2]), T3);
+	       T7 = VADD(T2, T4);
+	       ST(&(x[0]), VADD(T6, T7), ms, &(x[0]));
+	       T5 = VBYI(VMUL(LDK(KP866025403), VSUB(T2, T4)));
+	       T8 = VFNMS(LDK(KP500000000), T7, T6);
+	       ST(&(x[WS(rs, 1)]), VADD(T5, T8), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 2)]), VSUB(T8, T5), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1buv_3"), twinstr, &GENUS, {7, 5, 1, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1buv_4 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1buv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1buv_4 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 3)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1buv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1buv_5 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTW(&(W[0]), T2);
+		    Ta = BYTW(&(W[TWVL * 4]), T9);
+		    T5 = BYTW(&(W[TWVL * 6]), T4);
+		    T8 = BYTW(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFMAI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFNMSI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1buv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1buv_5 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tf, T5, Ta, Tc, Td, Tg;
+	       Tf = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTW(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 2]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tc = VADD(T2, T4);
+		    Td = VADD(T7, T9);
+		    Tg = VADD(Tc, Td);
+	       }
+	       ST(&(x[0]), VADD(Tf, Tg), ms, &(x[0]));
+	       {
+		    V Tb, Tj, Ti, Tk, Te, Th;
+		    Tb = VBYI(VFMA(LDK(KP951056516), T5, VMUL(LDK(KP587785252), Ta)));
+		    Tj = VBYI(VFNMS(LDK(KP951056516), Ta, VMUL(LDK(KP587785252), T5)));
+		    Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
+		    Th = VFNMS(LDK(KP250000000), Tg, Tf);
+		    Ti = VADD(Te, Th);
+		    Tk = VSUB(Th, Te);
+		    ST(&(x[WS(rs, 1)]), VADD(Tb, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VSUB(Ti, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1buv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1buv_6 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 23 FP additions, 18 FP multiplications,
+ * (or, 17 additions, 12 multiplications, 6 fused multiply/add),
+ * 27 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T1, T2, Ta, Tc, T5, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Tb, Td, T6, T8;
+		    T3 = BYTW(&(W[TWVL * 4]), T2);
+		    Tb = BYTW(&(W[TWVL * 6]), Ta);
+		    Td = BYTW(&(W[0]), Tc);
+		    T6 = BYTW(&(W[TWVL * 2]), T5);
+		    T8 = BYTW(&(W[TWVL * 8]), T7);
+		    {
+			 V Ti, T4, Tk, Te, Tj, T9;
+			 Ti = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tk = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tj = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 {
+			      V Tl, Tn, Tf, Th, Tm, Tg;
+			      Tl = VADD(Tj, Tk);
+			      Tn = VMUL(LDK(KP866025403), VSUB(Tj, Tk));
+			      Tf = VADD(T9, Te);
+			      Th = VMUL(LDK(KP866025403), VSUB(T9, Te));
+			      ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+			      Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+			      ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+			      Tg = VFNMS(LDK(KP500000000), Tf, T4);
+			      ST(&(x[WS(rs, 4)]), VFMAI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFNMSI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1buv_6"), twinstr, &GENUS, {17, 12, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1buv_6 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 23 FP additions, 14 FP multiplications,
+ * (or, 21 additions, 12 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V Tf, Ti, Ta, Tk, T5, Tj, Tc, Te, Td;
+	       Tc = LD(&(x[0]), ms, &(x[0]));
+	       Td = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Te = BYTW(&(W[TWVL * 4]), Td);
+	       Tf = VSUB(Tc, Te);
+	       Ti = VADD(Tc, Te);
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T7 = BYTW(&(W[TWVL * 6]), T6);
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[0]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tk = VADD(T7, T9);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T2 = BYTW(&(W[TWVL * 2]), T1);
+		    T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T4 = BYTW(&(W[TWVL * 8]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tj = VADD(T2, T4);
+	       }
+	       {
+		    V Tb, Tg, Th, Tn, Tl, Tm;
+		    Tb = VBYI(VMUL(LDK(KP866025403), VSUB(T5, Ta)));
+		    Tg = VADD(T5, Ta);
+		    Th = VFNMS(LDK(KP500000000), Tg, Tf);
+		    ST(&(x[WS(rs, 1)]), VADD(Tb, Th), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(Tf, Tg), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VSUB(Th, Tb), ms, &(x[WS(rs, 1)]));
+		    Tn = VBYI(VMUL(LDK(KP866025403), VSUB(Tj, Tk)));
+		    Tl = VADD(Tj, Tk);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+		    ST(&(x[WS(rs, 2)]), VSUB(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(Tn, Tm), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1buv_6"), twinstr, &GENUS, {21, 12, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1buv_7 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 36 FP additions, 36 FP multiplications,
+ * (or, 15 additions, 15 multiplications, 21 fused multiply/add),
+ * 42 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, T2, T4, Te, Tc, T9, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, Tf, Td, Ta, T8;
+		    T3 = BYTW(&(W[0]), T2);
+		    T5 = BYTW(&(W[TWVL * 10]), T4);
+		    Tf = BYTW(&(W[TWVL * 6]), Te);
+		    Td = BYTW(&(W[TWVL * 4]), Tc);
+		    Ta = BYTW(&(W[TWVL * 8]), T9);
+		    T8 = BYTW(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tm, Tg, Tk, Tb, Tl;
+			 T6 = VADD(T3, T5);
+			 Tm = VSUB(T3, T5);
+			 Tg = VADD(Td, Tf);
+			 Tk = VSUB(Td, Tf);
+			 Tb = VADD(T8, Ta);
+			 Tl = VSUB(T8, Ta);
+			 {
+			      V Tp, Tx, Tu, Th, Ts, Tn, Tq, Ty;
+			      Tp = VFNMS(LDK(KP356895867), T6, Tg);
+			      Tx = VFMA(LDK(KP554958132), Tk, Tm);
+			      ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+			      Tu = VFNMS(LDK(KP356895867), Tb, T6);
+			      Th = VFNMS(LDK(KP356895867), Tg, Tb);
+			      Ts = VFMA(LDK(KP554958132), Tl, Tk);
+			      Tn = VFNMS(LDK(KP554958132), Tm, Tl);
+			      Tq = VFNMS(LDK(KP692021471), Tp, Tb);
+			      Ty = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Tx, Tl));
+			      {
+				   V Tv, Ti, Tt, To, Tr, Tw, Tj;
+				   Tv = VFNMS(LDK(KP692021471), Tu, Tg);
+				   Ti = VFNMS(LDK(KP692021471), Th, T6);
+				   Tt = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Ts, Tm));
+				   To = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tn, Tk));
+				   Tr = VFNMS(LDK(KP900968867), Tq, T1);
+				   Tw = VFNMS(LDK(KP900968867), Tv, T1);
+				   Tj = VFNMS(LDK(KP900968867), Ti, T1);
+				   ST(&(x[WS(rs, 5)]), VFNMSI(Tt, Tr), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tt, Tr), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Ty, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Ty, Tw), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(To, Tj), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(To, Tj), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1buv_7"), twinstr, &GENUS, {15, 15, 21, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1buv_7 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 36 FP additions, 30 FP multiplications,
+ * (or, 24 additions, 18 multiplications, 12 fused multiply/add),
+ * 21 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V Th, Tf, Ti, T5, Tk, Ta, Tj, To, Tp;
+	       Th = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V Tc, Te, Tb, Td;
+		    Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tc = BYTW(&(W[TWVL * 2]), Tb);
+		    Td = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Te = BYTW(&(W[TWVL * 8]), Td);
+		    Tf = VSUB(Tc, Te);
+		    Ti = VADD(Tc, Te);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T2 = BYTW(&(W[0]), T1);
+		    T3 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T4 = BYTW(&(W[TWVL * 10]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tk = VADD(T2, T4);
+	       }
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T7 = BYTW(&(W[TWVL * 4]), T6);
+		    T8 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T9 = BYTW(&(W[TWVL * 6]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tj = VADD(T7, T9);
+	       }
+	       ST(&(x[0]), VADD(Th, VADD(Tk, VADD(Ti, Tj))), ms, &(x[0]));
+	       To = VBYI(VFNMS(LDK(KP781831482), Ta, VFNMS(LDK(KP433883739), Tf, VMUL(LDK(KP974927912), T5))));
+	       Tp = VFMA(LDK(KP623489801), Tj, VFNMS(LDK(KP900968867), Ti, VFNMS(LDK(KP222520933), Tk, Th)));
+	       ST(&(x[WS(rs, 2)]), VADD(To, Tp), ms, &(x[0]));
+	       ST(&(x[WS(rs, 5)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tl, Tm, Tn;
+		    Tg = VBYI(VFMA(LDK(KP433883739), T5, VFNMS(LDK(KP781831482), Tf, VMUL(LDK(KP974927912), Ta))));
+		    Tl = VFMA(LDK(KP623489801), Ti, VFNMS(LDK(KP222520933), Tj, VFNMS(LDK(KP900968867), Tk, Th)));
+		    ST(&(x[WS(rs, 3)]), VADD(Tg, Tl), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VSUB(Tl, Tg), ms, &(x[0]));
+		    Tm = VBYI(VFMA(LDK(KP781831482), T5, VFMA(LDK(KP974927912), Tf, VMUL(LDK(KP433883739), Ta))));
+		    Tn = VFMA(LDK(KP623489801), Tk, VFNMS(LDK(KP900968867), Tj, VFNMS(LDK(KP222520933), Ti, Th)));
+		    ST(&(x[WS(rs, 1)]), VADD(Tm, Tn), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tn, Tm), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1buv_7"), twinstr, &GENUS, {24, 18, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1buv_8 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTW(&(W[TWVL * 6]), T2);
+		    Ti = BYTW(&(W[TWVL * 2]), Th);
+		    Tk = BYTW(&(W[TWVL * 10]), Tj);
+		    T6 = BYTW(&(W[0]), T5);
+		    T8 = BYTW(&(W[TWVL * 8]), T7);
+		    Tb = BYTW(&(W[TWVL * 12]), Ta);
+		    Td = BYTW(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VADD(Tq, Tr);
+			 Ts = VSUB(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VADD(Tt, Tu);
+			      Tv = VSUB(Tt, Tu);
+			      Tm = VSUB(T9, Te);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tv, Ts), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tv, Ts), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFMA(LDK(KP707106781), Tf, T4);
+				   Tg = VFNMS(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1buv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1buv_8 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V Tl, Tq, Tg, Tr, T5, Tt, Ta, Tu, Ti, Tk, Tj;
+	       Ti = LD(&(x[0]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tk = BYTW(&(W[TWVL * 6]), Tj);
+	       Tl = VSUB(Ti, Tk);
+	       Tq = VADD(Ti, Tk);
+	       {
+		    V Td, Tf, Tc, Te;
+		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Td = BYTW(&(W[TWVL * 2]), Tc);
+		    Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tf = BYTW(&(W[TWVL * 10]), Te);
+		    Tg = VSUB(Td, Tf);
+		    Tr = VADD(Td, Tf);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T2 = BYTW(&(W[0]), T1);
+		    T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T4 = BYTW(&(W[TWVL * 8]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tt = VADD(T2, T4);
+	       }
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    T7 = BYTW(&(W[TWVL * 12]), T6);
+		    T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[TWVL * 4]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tu = VADD(T7, T9);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VSUB(Tq, Tr);
+		    Tv = VBYI(VSUB(Tt, Tu));
+		    ST(&(x[WS(rs, 6)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VADD(Tq, Tr);
+		    Tx = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Th, To, Tn, Tp, Tb, Tm;
+			 Tb = VMUL(LDK(KP707106781), VSUB(T5, Ta));
+			 Th = VBYI(VSUB(Tb, Tg));
+			 To = VBYI(VADD(Tg, Tb));
+			 Tm = VMUL(LDK(KP707106781), VADD(T5, Ta));
+			 Tn = VSUB(Tl, Tm);
+			 Tp = VADD(Tl, Tm);
+			 ST(&(x[WS(rs, 3)]), VADD(Th, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(Tn, Th), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1buv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1buv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1buv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1buv_9 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 54 FP additions, 54 FP multiplications,
+ * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
+ * 67 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T3, T5, T9, Tn, Tb, Td, Th, Tj, Tx, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T4, T8, Tm;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V Ta, Tc, Tg, Ti;
+			 Ta = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T3 = BYTW(&(W[TWVL * 4]), T2);
+			 T5 = BYTW(&(W[TWVL * 10]), T4);
+			 T9 = BYTW(&(W[TWVL * 2]), T8);
+			 Tn = BYTW(&(W[0]), Tm);
+			 Tb = BYTW(&(W[TWVL * 8]), Ta);
+			 Td = BYTW(&(W[TWVL * 14]), Tc);
+			 Th = BYTW(&(W[TWVL * 6]), Tg);
+			 Tj = BYTW(&(W[TWVL * 12]), Ti);
+		    }
+	       }
+	       Tx = VSUB(T3, T5);
+	       T6 = VADD(T3, T5);
+	       {
+		    V Tl, Te, Tk, To, T7, TN;
+		    Tl = VSUB(Td, Tb);
+		    Te = VADD(Tb, Td);
+		    Tk = VSUB(Th, Tj);
+		    To = VADD(Th, Tj);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    TN = VADD(T1, T6);
+		    {
+			 V Tf, TP, Tp, TO;
+			 Tf = VFNMS(LDK(KP500000000), Te, T9);
+			 TP = VADD(T9, Te);
+			 Tp = VFNMS(LDK(KP500000000), To, Tn);
+			 TO = VADD(Tn, To);
+			 {
+			      V Tz, TC, Tu, TD, TA, Tq, TQ, TS;
+			      Tz = VFNMS(LDK(KP152703644), Tl, Tf);
+			      TC = VFMA(LDK(KP203604859), Tf, Tl);
+			      Tu = VFNMS(LDK(KP439692620), Tk, Tf);
+			      TD = VFNMS(LDK(KP726681596), Tk, Tp);
+			      TA = VFMA(LDK(KP968908795), Tp, Tk);
+			      Tq = VFNMS(LDK(KP586256827), Tp, Tl);
+			      TQ = VADD(TO, TP);
+			      TS = VMUL(LDK(KP866025403), VSUB(TO, TP));
+			      {
+				   V TI, TB, TH, TE, Tr, TR, Tw, Tv;
+				   Tv = VFNMS(LDK(KP420276625), Tu, Tl);
+				   TI = VFMA(LDK(KP673648177), TA, Tz);
+				   TB = VFNMS(LDK(KP673648177), TA, Tz);
+				   TH = VFNMS(LDK(KP898197570), TD, TC);
+				   TE = VFMA(LDK(KP898197570), TD, TC);
+				   Tr = VFNMS(LDK(KP347296355), Tq, Tk);
+				   ST(&(x[0]), VADD(TQ, TN), ms, &(x[0]));
+				   TR = VFNMS(LDK(KP500000000), TQ, TN);
+				   Tw = VFNMS(LDK(KP826351822), Tv, Tp);
+				   {
+					V TM, TL, TF, TJ, Ts, Ty, TG, TK, Tt;
+					TM = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tx, TI));
+					TL = VFMA(LDK(KP852868531), TE, T7);
+					TF = VFNMS(LDK(KP500000000), TE, TB);
+					TJ = VFMA(LDK(KP666666666), TI, TH);
+					Ts = VFNMS(LDK(KP907603734), Tr, Tf);
+					ST(&(x[WS(rs, 6)]), VFNMSI(TS, TR), ms, &(x[0]));
+					ST(&(x[WS(rs, 3)]), VFMAI(TS, TR), ms, &(x[WS(rs, 1)]));
+					Ty = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tx, Tw));
+					ST(&(x[WS(rs, 8)]), VFNMSI(TM, TL), ms, &(x[0]));
+					ST(&(x[WS(rs, 1)]), VFMAI(TM, TL), ms, &(x[WS(rs, 1)]));
+					TG = VFMA(LDK(KP852868531), TF, T7);
+					TK = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TJ, Tx));
+					Tt = VFNMS(LDK(KP939692620), Ts, T7);
+					ST(&(x[WS(rs, 5)]), VFNMSI(TK, TG), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TK, TG), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(Ty, Tt), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(Ty, Tt), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1buv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1buv_9 -include t1bu.h -sign 1 */
+
+/*
+ * This function contains 54 FP additions, 42 FP multiplications,
+ * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
+ * 38 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "t1bu.h"
+
+static void t1buv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T6, Tu, Tg, Tf, TD, Tq, Tp, TE;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTW(&(W[TWVL * 4]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTW(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    Tu = VMUL(LDK(KP866025403), VSUB(T3, T5));
+	       }
+	       {
+		    V T9, Td, Tb, T8, Tc, Ta, Te;
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[0]), T8);
+		    Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTW(&(W[TWVL * 12]), Tc);
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTW(&(W[TWVL * 6]), Ta);
+		    Tg = VSUB(Tb, Td);
+		    Te = VADD(Tb, Td);
+		    Tf = VFNMS(LDK(KP500000000), Te, T9);
+		    TD = VADD(T9, Te);
+	       }
+	       {
+		    V Tj, Tn, Tl, Ti, Tm, Tk, To;
+		    Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tj = BYTW(&(W[TWVL * 2]), Ti);
+		    Tm = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tn = BYTW(&(W[TWVL * 14]), Tm);
+		    Tk = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tl = BYTW(&(W[TWVL * 8]), Tk);
+		    Tq = VSUB(Tl, Tn);
+		    To = VADD(Tl, Tn);
+		    Tp = VFNMS(LDK(KP500000000), To, Tj);
+		    TE = VADD(Tj, To);
+	       }
+	       {
+		    V TF, TG, TH, TI;
+		    TF = VBYI(VMUL(LDK(KP866025403), VSUB(TD, TE)));
+		    TG = VADD(T1, T6);
+		    TH = VADD(TD, TE);
+		    TI = VFNMS(LDK(KP500000000), TH, TG);
+		    ST(&(x[WS(rs, 3)]), VADD(TF, TI), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[0]), VADD(TG, TH), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TI, TF), ms, &(x[0]));
+	       }
+	       {
+		    V TC, Tv, Tw, Tx, Th, Tr, Ts, T7, TB;
+		    TC = VBYI(VSUB(VFMA(LDK(KP984807753), Tf, VFMA(LDK(KP813797681), Tq, VFNMS(LDK(KP150383733), Tg, VMUL(LDK(KP342020143), Tp)))), Tu));
+		    Tv = VFMA(LDK(KP663413948), Tg, VMUL(LDK(KP642787609), Tf));
+		    Tw = VFMA(LDK(KP150383733), Tq, VMUL(LDK(KP984807753), Tp));
+		    Tx = VADD(Tv, Tw);
+		    Th = VFNMS(LDK(KP556670399), Tg, VMUL(LDK(KP766044443), Tf));
+		    Tr = VFNMS(LDK(KP852868531), Tq, VMUL(LDK(KP173648177), Tp));
+		    Ts = VADD(Th, Tr);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    TB = VFMA(LDK(KP852868531), Tg, VFMA(LDK(KP173648177), Tf, VFMA(LDK(KP296198132), Tq, VFNMS(LDK(KP939692620), Tp, T7))));
+		    ST(&(x[WS(rs, 7)]), VSUB(TB, TC), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(TB, TC), ms, &(x[0]));
+		    {
+			 V Tt, Ty, Tz, TA;
+			 Tt = VADD(T7, Ts);
+			 Ty = VBYI(VADD(Tu, Tx));
+			 ST(&(x[WS(rs, 8)]), VSUB(Tt, Ty), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(Tt, Ty), ms, &(x[WS(rs, 1)]));
+			 Tz = VBYI(VADD(Tu, VFNMS(LDK(KP500000000), Tx, VMUL(LDK(KP866025403), VSUB(Th, Tr)))));
+			 TA = VFMA(LDK(KP866025403), VSUB(Tw, Tv), VFNMS(LDK(KP500000000), Ts, T7));
+			 ST(&(x[WS(rs, 4)]), VADD(Tz, TA), ms, &(x[0]));
+			 ST(&(x[WS(rs, 5)]), VSUB(TA, Tz), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1buv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1buv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1buv_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1bv_10 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTW(&(W[TWVL * 8]), T2);
+			      Th = BYTW(&(W[TWVL * 6]), Tg);
+			      To = BYTW(&(W[0]), Tn);
+			      Tj = BYTW(&(W[TWVL * 16]), Ti);
+			      Tm = BYTW(&(W[TWVL * 10]), Tl);
+			      T6 = BYTW(&(W[TWVL * 2]), T5);
+			      Td = BYTW(&(W[TWVL * 4]), Tc);
+			      T8 = BYTW(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTW(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1bv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1bv_10 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tu, TH, Tg, Tl, Tp, TD, TE, TJ, T5, Ta, To, TA, TB, TI, Tr;
+	       V Tt, Ts;
+	       Tr = LD(&(x[0]), ms, &(x[0]));
+	       Ts = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tt = BYTW(&(W[TWVL * 8]), Ts);
+	       Tu = VSUB(Tr, Tt);
+	       TH = VADD(Tr, Tt);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTW(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tp = VADD(Tg, Tl);
+		    TD = VADD(Td, Tf);
+		    TE = VADD(Ti, Tk);
+		    TJ = VADD(TD, TE);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTW(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    To = VADD(T5, Ta);
+		    TA = VADD(T2, T4);
+		    TB = VADD(T7, T9);
+		    TI = VADD(TA, TB);
+	       }
+	       {
+		    V Tq, Tv, Tw, Tn, Tz, Tb, Tm, Ty, Tx;
+		    Tq = VMUL(LDK(KP559016994), VSUB(To, Tp));
+		    Tv = VADD(To, Tp);
+		    Tw = VFNMS(LDK(KP250000000), Tv, Tu);
+		    Tb = VSUB(T5, Ta);
+		    Tm = VSUB(Tg, Tl);
+		    Tn = VBYI(VFMA(LDK(KP951056516), Tb, VMUL(LDK(KP587785252), Tm)));
+		    Tz = VBYI(VFNMS(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tb)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tu, Tv), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tw, Tq);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tx = VADD(Tq, Tw);
+		    ST(&(x[WS(rs, 1)]), VADD(Tn, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(Tx, Tn), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TP, TC, TF, TO, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    TP = VBYI(VFMA(LDK(KP951056516), TC, VMUL(LDK(KP587785252), TF)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TO = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VSUB(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VADD(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1bv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 59 FP additions, 42 FP multiplications,
+ * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
+ * 41 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
+	       V TI, Ti, TA, T7, Tm, TE, Tw, Tk, Tf, TB, TU, TM;
+	       {
+		    V T9, TK, Tj, TL, Te;
+		    {
+			 V T1, T4, T2, Tp, Tt, Tr;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T5, T3, Tq, Tu, Ts, Td, Tb, T8, Tc, Ta;
+			      T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T5 = BYTW(&(W[TWVL * 14]), T4);
+			      T3 = BYTW(&(W[TWVL * 6]), T2);
+			      Tq = BYTW(&(W[TWVL * 16]), Tp);
+			      Tu = BYTW(&(W[TWVL * 8]), Tt);
+			      Ts = BYTW(&(W[0]), Tr);
+			      T9 = BYTW(&(W[TWVL * 10]), T8);
+			      Td = BYTW(&(W[TWVL * 2]), Tc);
+			      Tb = BYTW(&(W[TWVL * 18]), Ta);
+			      {
+				   V Th, T6, Tl, Tv;
+				   Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   TK = VSUB(T3, T5);
+				   T6 = VADD(T3, T5);
+				   Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   Tv = VADD(Ts, Tu);
+				   TI = VSUB(Tu, Ts);
+				   Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TL = VSUB(Tb, Td);
+				   Te = VADD(Tb, Td);
+				   Ti = BYTW(&(W[TWVL * 4]), Th);
+				   TA = VFNMS(LDK(KP500000000), T6, T1);
+				   T7 = VADD(T1, T6);
+				   Tm = BYTW(&(W[TWVL * 20]), Tl);
+				   TE = VFNMS(LDK(KP500000000), Tv, Tq);
+				   Tw = VADD(Tq, Tv);
+			      }
+			 }
+		    }
+		    Tk = BYTW(&(W[TWVL * 12]), Tj);
+		    Tf = VADD(T9, Te);
+		    TB = VFNMS(LDK(KP500000000), Te, T9);
+		    TU = VSUB(TK, TL);
+		    TM = VADD(TK, TL);
+	       }
+	       {
+		    V Tn, TH, TC, TQ, Ty, Tg;
+		    Tn = VADD(Tk, Tm);
+		    TH = VSUB(Tk, Tm);
+		    TC = VADD(TA, TB);
+		    TQ = VSUB(TA, TB);
+		    Ty = VADD(T7, Tf);
+		    Tg = VSUB(T7, Tf);
+		    {
+			 V To, TD, TJ, TR;
+			 To = VADD(Ti, Tn);
+			 TD = VFNMS(LDK(KP500000000), Tn, Ti);
+			 TJ = VSUB(TH, TI);
+			 TR = VADD(TH, TI);
+			 {
+			      V TP, TN, TW, TS, TO, TG, TX, TV;
+			      {
+				   V Tz, Tx, TF, TT;
+				   Tz = VADD(To, Tw);
+				   Tx = VSUB(To, Tw);
+				   TF = VADD(TD, TE);
+				   TT = VSUB(TD, TE);
+				   TP = VMUL(LDK(KP866025403), VADD(TM, TJ));
+				   TN = VMUL(LDK(KP866025403), VSUB(TJ, TM));
+				   TW = VFMA(LDK(KP866025403), TR, TQ);
+				   TS = VFNMS(LDK(KP866025403), TR, TQ);
+				   ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
+				   ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tx, Tg), ms, &(x[WS(rs, 1)]));
+				   TO = VADD(TC, TF);
+				   TG = VSUB(TC, TF);
+				   TX = VFNMS(LDK(KP866025403), TU, TT);
+				   TV = VFMA(LDK(KP866025403), TU, TT);
+			      }
+			      ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VFMAI(TN, TG), ms, &(x[0]));
+			      ST(&(x[WS(rs, 10)]), VFNMSI(TN, TG), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFMAI(TX, TW), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFNMSI(TX, TW), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 11)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_12) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_12, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 59 FP additions, 30 FP multiplications,
+ * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
+ * 28 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
+	       V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty;
+	       V Tz;
+	       {
+		    V T5, T3, T4, T2;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T5 = BYTW(&(W[TWVL * 14]), T4);
+		    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 6]), T2);
+		    Tt = VSUB(T3, T5);
+		    T6 = VADD(T3, T5);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+	       }
+	       {
+		    V Tn, Tp, Tm, TA, To;
+		    Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Tn = BYTW(&(W[0]), Tm);
+		    TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    TB = BYTW(&(W[TWVL * 16]), TA);
+		    To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tp = BYTW(&(W[TWVL * 8]), To);
+		    Tq = VSUB(Tn, Tp);
+		    TC = VADD(Tn, Tp);
+		    TD = VFNMS(LDK(KP500000000), TC, TB);
+	       }
+	       {
+		    V Td, Tb, T8, Tc, Ta;
+		    T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T9 = BYTW(&(W[TWVL * 10]), T8);
+		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Td = BYTW(&(W[TWVL * 2]), Tc);
+		    Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    Tb = BYTW(&(W[TWVL * 18]), Ta);
+		    Tu = VSUB(Tb, Td);
+		    Te = VADD(Tb, Td);
+		    Tf = VFNMS(LDK(KP500000000), Te, T9);
+	       }
+	       {
+		    V Ti, Tk, Th, Tw, Tj;
+		    Th = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Ti = BYTW(&(W[TWVL * 12]), Th);
+		    Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Tx = BYTW(&(W[TWVL * 4]), Tw);
+		    Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+		    Tk = BYTW(&(W[TWVL * 20]), Tj);
+		    Tl = VSUB(Ti, Tk);
+		    Ty = VADD(Ti, Tk);
+		    Tz = VFNMS(LDK(KP500000000), Ty, Tx);
+	       }
+	       {
+		    V Ts, TG, TF, TH;
+		    {
+			 V Tg, Tr, Tv, TE;
+			 Tg = VSUB(T7, Tf);
+			 Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq));
+			 Ts = VSUB(Tg, Tr);
+			 TG = VADD(Tg, Tr);
+			 Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu));
+			 TE = VSUB(Tz, TD);
+			 TF = VBYI(VADD(Tv, TE));
+			 TH = VBYI(VSUB(TE, Tv));
+		    }
+		    ST(&(x[WS(rs, 11)]), VSUB(Ts, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(Ts, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VSUB(TG, TH), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TS, TW, TV, TX;
+		    {
+			 V TQ, TR, TT, TU;
+			 TQ = VADD(T1, T6);
+			 TR = VADD(T9, Te);
+			 TS = VSUB(TQ, TR);
+			 TW = VADD(TQ, TR);
+			 TT = VADD(Tx, Ty);
+			 TU = VADD(TB, TC);
+			 TV = VBYI(VSUB(TT, TU));
+			 TX = VADD(TT, TU);
+		    }
+		    ST(&(x[WS(rs, 3)]), VSUB(TS, TV), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VADD(TS, TV), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
+	       }
+	       {
+		    V TK, TO, TN, TP;
+		    {
+			 V TI, TJ, TL, TM;
+			 TI = VADD(Tl, Tq);
+			 TJ = VADD(Tt, Tu);
+			 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
+			 TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
+			 TL = VADD(T7, Tf);
+			 TM = VADD(Tz, TD);
+			 TN = VSUB(TL, TM);
+			 TP = VADD(TL, TM);
+		    }
+		    ST(&(x[WS(rs, 2)]), VADD(TK, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TP, TO), ms, &(x[0]));
+		    ST(&(x[WS(rs, 10)]), VSUB(TN, TK), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_12) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_12, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1bv_15 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 92 FP additions, 77 FP multiplications,
+ * (or, 50 additions, 35 multiplications, 42 fused multiply/add),
+ * 81 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
+	       V Tq, Ty, Th, TV, TK, Ts, T1f, T7, Tu, TA, TC, Tj, Tk, T1g, Tf;
+	       {
+		    V T1, T4, T2, T9, Te;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T8, Tp, Tx, Tg;
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tb, Td, Tr, T6, Tt, Tz, TB, Ti;
+			      {
+				   V T5, T3, Ta, Tc;
+				   Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   T5 = BYTW(&(W[TWVL * 18]), T4);
+				   T3 = BYTW(&(W[TWVL * 8]), T2);
+				   T9 = BYTW(&(W[TWVL * 4]), T8);
+				   Tq = BYTW(&(W[TWVL * 10]), Tp);
+				   Ty = BYTW(&(W[TWVL * 16]), Tx);
+				   Th = BYTW(&(W[TWVL * 22]), Tg);
+				   Tb = BYTW(&(W[TWVL * 14]), Ta);
+				   Td = BYTW(&(W[TWVL * 24]), Tc);
+				   Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   TV = VSUB(T3, T5);
+				   T6 = VADD(T3, T5);
+				   Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Te = VADD(Tb, Td);
+			      TK = VSUB(Tb, Td);
+			      Ts = BYTW(&(W[TWVL * 20]), Tr);
+			      T1f = VADD(T1, T6);
+			      T7 = VFNMS(LDK(KP500000000), T6, T1);
+			      Tu = BYTW(&(W[0]), Tt);
+			      TA = BYTW(&(W[TWVL * 26]), Tz);
+			      TC = BYTW(&(W[TWVL * 6]), TB);
+			      Tj = BYTW(&(W[TWVL * 2]), Ti);
+			      Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+		    T1g = VADD(T9, Te);
+		    Tf = VFNMS(LDK(KP500000000), Te, T9);
+	       }
+	       {
+		    V Tv, TN, TD, TO, Tl;
+		    Tv = VADD(Ts, Tu);
+		    TN = VSUB(Ts, Tu);
+		    TD = VADD(TA, TC);
+		    TO = VSUB(TA, TC);
+		    Tl = BYTW(&(W[TWVL * 12]), Tk);
+		    {
+			 V Tw, T1j, TX, TP, TE, T1k, TL, Tm;
+			 Tw = VFNMS(LDK(KP500000000), Tv, Tq);
+			 T1j = VADD(Tq, Tv);
+			 TX = VADD(TN, TO);
+			 TP = VSUB(TN, TO);
+			 TE = VFNMS(LDK(KP500000000), TD, Ty);
+			 T1k = VADD(Ty, TD);
+			 TL = VSUB(Tj, Tl);
+			 Tm = VADD(Tj, Tl);
+			 {
+			      V TT, TF, T1q, T1l, TW, TM, T1h, Tn;
+			      TT = VSUB(Tw, TE);
+			      TF = VADD(Tw, TE);
+			      T1q = VSUB(T1j, T1k);
+			      T1l = VADD(T1j, T1k);
+			      TW = VADD(TK, TL);
+			      TM = VSUB(TK, TL);
+			      T1h = VADD(Th, Tm);
+			      Tn = VFNMS(LDK(KP500000000), Tm, Th);
+			      {
+				   V T10, TY, T16, TQ, T1r, T1i, TS, To, TZ, T1e;
+				   T10 = VSUB(TW, TX);
+				   TY = VADD(TW, TX);
+				   T16 = VFNMS(LDK(KP618033988), TM, TP);
+				   TQ = VFMA(LDK(KP618033988), TP, TM);
+				   T1r = VSUB(T1g, T1h);
+				   T1i = VADD(T1g, T1h);
+				   TS = VSUB(Tf, Tn);
+				   To = VADD(Tf, Tn);
+				   TZ = VFNMS(LDK(KP250000000), TY, TV);
+				   T1e = VMUL(LDK(KP866025403), VADD(TV, TY));
+				   {
+					V T1u, T1s, T1o, T18, TU, TG, TI, T19, T11, T1n, T1m;
+					T1u = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1q, T1r));
+					T1s = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1r, T1q));
+					T1m = VADD(T1i, T1l);
+					T1o = VSUB(T1i, T1l);
+					T18 = VFNMS(LDK(KP618033988), TS, TT);
+					TU = VFMA(LDK(KP618033988), TT, TS);
+					TG = VADD(To, TF);
+					TI = VSUB(To, TF);
+					T19 = VFNMS(LDK(KP559016994), T10, TZ);
+					T11 = VFMA(LDK(KP559016994), T10, TZ);
+					ST(&(x[0]), VADD(T1f, T1m), ms, &(x[0]));
+					T1n = VFNMS(LDK(KP250000000), T1m, T1f);
+					{
+					     V T1a, T1c, T14, T12, T1p, T1t, T15, TJ, T1d, TH;
+					     T1d = VADD(T7, TG);
+					     TH = VFNMS(LDK(KP250000000), TG, T7);
+					     T1a = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T19, T18));
+					     T1c = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T19, T18));
+					     T14 = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T11, TU));
+					     T12 = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T11, TU));
+					     T1p = VFNMS(LDK(KP559016994), T1o, T1n);
+					     T1t = VFMA(LDK(KP559016994), T1o, T1n);
+					     ST(&(x[WS(rs, 10)]), VFMAI(T1e, T1d), ms, &(x[0]));
+					     ST(&(x[WS(rs, 5)]), VFNMSI(T1e, T1d), ms, &(x[WS(rs, 1)]));
+					     T15 = VFNMS(LDK(KP559016994), TI, TH);
+					     TJ = VFMA(LDK(KP559016994), TI, TH);
+					     {
+						  V T17, T1b, T13, TR;
+						  ST(&(x[WS(rs, 12)]), VFNMSI(T1s, T1p), ms, &(x[0]));
+						  ST(&(x[WS(rs, 3)]), VFMAI(T1s, T1p), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 6)]), VFMAI(T1u, T1t), ms, &(x[0]));
+						  T17 = VFNMS(LDK(KP823639103), T16, T15);
+						  T1b = VFMA(LDK(KP823639103), T16, T15);
+						  T13 = VFMA(LDK(KP823639103), TQ, TJ);
+						  TR = VFNMS(LDK(KP823639103), TQ, TJ);
+						  ST(&(x[WS(rs, 13)]), VFMAI(T1a, T17), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 2)]), VFNMSI(T1a, T17), ms, &(x[0]));
+						  ST(&(x[WS(rs, 8)]), VFMAI(T1c, T1b), ms, &(x[0]));
+						  ST(&(x[WS(rs, 7)]), VFNMSI(T1c, T1b), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 11)]), VFMAI(T14, T13), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 4)]), VFNMSI(T14, T13), ms, &(x[0]));
+						  ST(&(x[WS(rs, 14)]), VFNMSI(T12, TR), ms, &(x[0]));
+						  ST(&(x[WS(rs, 1)]), VFMAI(T12, TR), ms, &(x[WS(rs, 1)]));
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 15, XSIMD_STRING("t1bv_15"), twinstr, &GENUS, {50, 35, 42, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_15) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_15, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1bv_15 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 92 FP additions, 53 FP multiplications,
+ * (or, 78 additions, 39 multiplications, 14 fused multiply/add),
+ * 52 stack variables, 10 constants, and 30 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
+     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
+	       V Ts, TV, T1f, TZ, T10, Tb, Tm, Tt, T1j, T1k, T1l, TI, TM, TR, Tz;
+	       V TD, TQ, T1g, T1h, T1i;
+	       {
+		    V TT, Tr, Tp, Tq, To, TU;
+		    TT = LD(&(x[0]), ms, &(x[0]));
+		    Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    Tr = BYTW(&(W[TWVL * 18]), Tq);
+		    To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tp = BYTW(&(W[TWVL * 8]), To);
+		    Ts = VSUB(Tp, Tr);
+		    TU = VADD(Tp, Tr);
+		    TV = VFNMS(LDK(KP500000000), TU, TT);
+		    T1f = VADD(TT, TU);
+	       }
+	       {
+		    V Tx, TG, TK, TB, T5, Ty, Tg, TH, Tl, TL, Ta, TC;
+		    {
+			 V Tw, TF, TJ, TA;
+			 Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tx = BYTW(&(W[TWVL * 4]), Tw);
+			 TF = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 TG = BYTW(&(W[TWVL * 10]), TF);
+			 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TK = BYTW(&(W[TWVL * 16]), TJ);
+			 TA = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 TB = BYTW(&(W[TWVL * 22]), TA);
+		    }
+		    {
+			 V T2, T4, T1, T3;
+			 T1 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T2 = BYTW(&(W[TWVL * 14]), T1);
+			 T3 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 24]), T3);
+			 T5 = VSUB(T2, T4);
+			 Ty = VADD(T2, T4);
+		    }
+		    {
+			 V Td, Tf, Tc, Te;
+			 Tc = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTW(&(W[TWVL * 20]), Tc);
+			 Te = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[0]), Te);
+			 Tg = VSUB(Td, Tf);
+			 TH = VADD(Td, Tf);
+		    }
+		    {
+			 V Ti, Tk, Th, Tj;
+			 Th = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 26]), Th);
+			 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tk = BYTW(&(W[TWVL * 6]), Tj);
+			 Tl = VSUB(Ti, Tk);
+			 TL = VADD(Ti, Tk);
+		    }
+		    {
+			 V T7, T9, T6, T8;
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 2]), T6);
+			 T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 12]), T8);
+			 Ta = VSUB(T7, T9);
+			 TC = VADD(T7, T9);
+		    }
+		    TZ = VSUB(T5, Ta);
+		    T10 = VSUB(Tg, Tl);
+		    Tb = VADD(T5, Ta);
+		    Tm = VADD(Tg, Tl);
+		    Tt = VADD(Tb, Tm);
+		    T1j = VADD(TG, TH);
+		    T1k = VADD(TK, TL);
+		    T1l = VADD(T1j, T1k);
+		    TI = VFNMS(LDK(KP500000000), TH, TG);
+		    TM = VFNMS(LDK(KP500000000), TL, TK);
+		    TR = VADD(TI, TM);
+		    Tz = VFNMS(LDK(KP500000000), Ty, Tx);
+		    TD = VFNMS(LDK(KP500000000), TC, TB);
+		    TQ = VADD(Tz, TD);
+		    T1g = VADD(Tx, Ty);
+		    T1h = VADD(TB, TC);
+		    T1i = VADD(T1g, T1h);
+	       }
+	       {
+		    V T1o, T1m, T1n, T1s, T1t, T1q, T1r, T1u, T1p;
+		    T1o = VMUL(LDK(KP559016994), VSUB(T1i, T1l));
+		    T1m = VADD(T1i, T1l);
+		    T1n = VFNMS(LDK(KP250000000), T1m, T1f);
+		    T1q = VSUB(T1g, T1h);
+		    T1r = VSUB(T1j, T1k);
+		    T1s = VBYI(VFNMS(LDK(KP951056516), T1r, VMUL(LDK(KP587785252), T1q)));
+		    T1t = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1r)));
+		    ST(&(x[0]), VADD(T1f, T1m), ms, &(x[0]));
+		    T1u = VADD(T1o, T1n);
+		    ST(&(x[WS(rs, 6)]), VADD(T1t, T1u), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VSUB(T1u, T1t), ms, &(x[WS(rs, 1)]));
+		    T1p = VSUB(T1n, T1o);
+		    ST(&(x[WS(rs, 3)]), VSUB(T1p, T1s), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 12)]), VADD(T1s, T1p), ms, &(x[0]));
+	       }
+	       {
+		    V T11, T18, T1e, TO, T16, Tv, T15, TY, T1d, T19, TE, TN;
+		    T11 = VFMA(LDK(KP823639103), TZ, VMUL(LDK(KP509036960), T10));
+		    T18 = VFNMS(LDK(KP823639103), T10, VMUL(LDK(KP509036960), TZ));
+		    T1e = VBYI(VMUL(LDK(KP866025403), VADD(Ts, Tt)));
+		    TE = VSUB(Tz, TD);
+		    TN = VSUB(TI, TM);
+		    TO = VFMA(LDK(KP951056516), TE, VMUL(LDK(KP587785252), TN));
+		    T16 = VFNMS(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TE));
+		    {
+			 V Tn, Tu, TS, TW, TX;
+			 Tn = VMUL(LDK(KP484122918), VSUB(Tb, Tm));
+			 Tu = VFNMS(LDK(KP216506350), Tt, VMUL(LDK(KP866025403), Ts));
+			 Tv = VADD(Tn, Tu);
+			 T15 = VSUB(Tn, Tu);
+			 TS = VMUL(LDK(KP559016994), VSUB(TQ, TR));
+			 TW = VADD(TQ, TR);
+			 TX = VFNMS(LDK(KP250000000), TW, TV);
+			 TY = VADD(TS, TX);
+			 T1d = VADD(TV, TW);
+			 T19 = VSUB(TX, TS);
+		    }
+		    {
+			 V TP, T12, T1b, T1c;
+			 ST(&(x[WS(rs, 5)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 10)]), VADD(T1e, T1d), ms, &(x[0]));
+			 TP = VBYI(VADD(Tv, TO));
+			 T12 = VSUB(TY, T11);
+			 ST(&(x[WS(rs, 1)]), VADD(TP, T12), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T12, TP), ms, &(x[0]));
+			 T1b = VBYI(VSUB(T16, T15));
+			 T1c = VSUB(T19, T18);
+			 ST(&(x[WS(rs, 7)]), VADD(T1b, T1c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 8)]), VSUB(T1c, T1b), ms, &(x[0]));
+			 {
+			      V T17, T1a, T13, T14;
+			      T17 = VBYI(VADD(T15, T16));
+			      T1a = VADD(T18, T19);
+			      ST(&(x[WS(rs, 2)]), VADD(T17, T1a), ms, &(x[0]));
+			      ST(&(x[WS(rs, 13)]), VSUB(T1a, T17), ms, &(x[WS(rs, 1)]));
+			      T13 = VBYI(VSUB(Tv, TO));
+			      T14 = VADD(T11, TY);
+			      ST(&(x[WS(rs, 4)]), VADD(T13, T14), ms, &(x[0]));
+			      ST(&(x[WS(rs, 11)]), VSUB(T14, T13), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 15, XSIMD_STRING("t1bv_15"), twinstr, &GENUS, {78, 39, 14, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_15) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_15, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1bv_16 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 87 FP additions, 64 FP multiplications,
+ * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
+ * 61 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
+	       V Tv;
+	       {
+		    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
+		    V Tp;
+		    {
+			 V T1, T2, T5, T7;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tz, TG, TB, TE;
+			      Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      TG = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TE = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      {
+				   V Ti, TX, TY, Td, Tg, Tm, Tt, To;
+				   {
+					V T3, T6, T8, TA, TH, TC, TF, Tb;
+					Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3 = BYTW(&(W[TWVL * 14]), T2);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					T8 = BYTW(&(W[TWVL * 22]), T7);
+					TA = BYTW(&(W[TWVL * 2]), Tz);
+					TH = BYTW(&(W[TWVL * 10]), TG);
+					TC = BYTW(&(W[TWVL * 18]), TB);
+					TF = BYTW(&(W[TWVL * 26]), TE);
+					Tc = BYTW(&(W[0]), Tb);
+					TW = VSUB(T1, T3);
+					T4 = VADD(T1, T3);
+					T19 = VSUB(T6, T8);
+					T9 = VADD(T6, T8);
+					Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					TD = VADD(TA, TC);
+					TX = VSUB(TA, TC);
+					TI = VADD(TF, TH);
+					TY = VSUB(TF, TH);
+				   }
+				   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+				   Tj = BYTW(&(W[TWVL * 24]), Ti);
+				   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TZ = VADD(TX, TY);
+				   T1a = VSUB(TX, TY);
+				   Te = BYTW(&(W[TWVL * 16]), Td);
+				   Th = BYTW(&(W[TWVL * 8]), Tg);
+				   Tn = BYTW(&(W[TWVL * 28]), Tm);
+				   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   Tu = BYTW(&(W[TWVL * 20]), Tt);
+				   Tp = BYTW(&(W[TWVL * 12]), To);
+			      }
+			 }
+		    }
+		    {
+			 V Tf, T11, Tk, T12, Ts;
+			 TO = VADD(T4, T9);
+			 Ta = VSUB(T4, T9);
+			 TJ = VSUB(TD, TI);
+			 TP = VADD(TD, TI);
+			 Tf = VADD(Tc, Te);
+			 T11 = VSUB(Tc, Te);
+			 Tk = VADD(Th, Tj);
+			 T12 = VSUB(Th, Tj);
+			 Ts = BYTW(&(W[TWVL * 4]), Tr);
+			 T14 = VSUB(Tn, Tp);
+			 Tq = VADD(Tn, Tp);
+			 T1i = VFNMS(LDK(KP707106781), TZ, TW);
+			 T10 = VFMA(LDK(KP707106781), TZ, TW);
+			 T1b = VFMA(LDK(KP707106781), T1a, T19);
+			 T1l = VFNMS(LDK(KP707106781), T1a, T19);
+			 T13 = VFNMS(LDK(KP414213562), T12, T11);
+			 T1c = VFMA(LDK(KP414213562), T11, T12);
+			 TR = VADD(Tf, Tk);
+			 Tl = VSUB(Tf, Tk);
+			 T15 = VSUB(Tu, Ts);
+			 Tv = VADD(Ts, Tu);
+		    }
+	       }
+	       {
+		    V T1d, T16, TS, Tw, TU, TQ;
+		    T1d = VFMA(LDK(KP414213562), T14, T15);
+		    T16 = VFNMS(LDK(KP414213562), T15, T14);
+		    TS = VADD(Tq, Tv);
+		    Tw = VSUB(Tq, Tv);
+		    TU = VADD(TO, TP);
+		    TQ = VSUB(TO, TP);
+		    {
+			 V T1e, T1j, T17, T1m;
+			 T1e = VSUB(T1c, T1d);
+			 T1j = VADD(T1c, T1d);
+			 T17 = VADD(T13, T16);
+			 T1m = VSUB(T13, T16);
+			 {
+			      V TV, TT, TK, Tx;
+			      TV = VADD(TR, TS);
+			      TT = VSUB(TR, TS);
+			      TK = VSUB(Tl, Tw);
+			      Tx = VADD(Tl, Tw);
+			      {
+				   V T1h, T1f, T1o, T1k;
+				   T1h = VFMA(LDK(KP923879532), T1e, T1b);
+				   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
+				   T1o = VFMA(LDK(KP923879532), T1j, T1i);
+				   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
+				   {
+					V T1g, T18, T1p, T1n;
+					T1g = VFMA(LDK(KP923879532), T17, T10);
+					T18 = VFNMS(LDK(KP923879532), T17, T10);
+					T1p = VFNMS(LDK(KP923879532), T1m, T1l);
+					T1n = VFMA(LDK(KP923879532), T1m, T1l);
+					ST(&(x[WS(rs, 8)]), VSUB(TU, TV), ms, &(x[0]));
+					ST(&(x[0]), VADD(TU, TV), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TT, TQ), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(TT, TQ), ms, &(x[0]));
+					{
+					     V TN, TL, TM, Ty;
+					     TN = VFMA(LDK(KP707106781), TK, TJ);
+					     TL = VFNMS(LDK(KP707106781), TK, TJ);
+					     TM = VFMA(LDK(KP707106781), Tx, Ta);
+					     Ty = VFNMS(LDK(KP707106781), Tx, Ta);
+					     ST(&(x[WS(rs, 15)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 1)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1bv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1bv_16 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 87 FP additions, 42 FP multiplications,
+ * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
+ * 36 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TJ, T1b, TD, T1c, T17, T18, Ty, TK, T10, T11, T12, Tb, TM, T13, T14;
+	       V T15, Tm, TN, TG, TI, TH;
+	       TG = LD(&(x[0]), ms, &(x[0]));
+	       TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+	       TI = BYTW(&(W[TWVL * 14]), TH);
+	       TJ = VSUB(TG, TI);
+	       T1b = VADD(TG, TI);
+	       {
+		    V TA, TC, Tz, TB;
+		    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TA = BYTW(&(W[TWVL * 6]), Tz);
+		    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+		    TC = BYTW(&(W[TWVL * 22]), TB);
+		    TD = VSUB(TA, TC);
+		    T1c = VADD(TA, TC);
+	       }
+	       {
+		    V Tp, Tw, Tr, Tu, Ts, Tx;
+		    {
+			 V To, Tv, Tq, Tt;
+			 To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 2]), To);
+			 Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tw = BYTW(&(W[TWVL * 10]), Tv);
+			 Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tr = BYTW(&(W[TWVL * 18]), Tq);
+			 Tt = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tu = BYTW(&(W[TWVL * 26]), Tt);
+		    }
+		    T17 = VADD(Tp, Tr);
+		    T18 = VADD(Tu, Tw);
+		    Ts = VSUB(Tp, Tr);
+		    Tx = VSUB(Tu, Tw);
+		    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
+		    TK = VMUL(LDK(KP707106781), VADD(Ts, Tx));
+	       }
+	       {
+		    V T2, T9, T4, T7, T5, Ta;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 24]), T8);
+			 T3 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 16]), T3);
+			 T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTW(&(W[TWVL * 8]), T6);
+		    }
+		    T10 = VADD(T2, T4);
+		    T11 = VADD(T7, T9);
+		    T12 = VSUB(T10, T11);
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), T5));
+		    TM = VFMA(LDK(KP382683432), T5, VMUL(LDK(KP923879532), Ta));
+	       }
+	       {
+		    V Td, Tk, Tf, Ti, Tg, Tl;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTW(&(W[TWVL * 28]), Tc);
+			 Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTW(&(W[TWVL * 20]), Tj);
+			 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[TWVL * 12]), Te);
+			 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Ti = BYTW(&(W[TWVL * 4]), Th);
+		    }
+		    T13 = VADD(Td, Tf);
+		    T14 = VADD(Ti, Tk);
+		    T15 = VSUB(T13, T14);
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VFMA(LDK(KP923879532), Tg, VMUL(LDK(KP382683432), Tl));
+		    TN = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
+	       }
+	       {
+		    V T1a, T1g, T1f, T1h;
+		    {
+			 V T16, T19, T1d, T1e;
+			 T16 = VMUL(LDK(KP707106781), VSUB(T12, T15));
+			 T19 = VSUB(T17, T18);
+			 T1a = VBYI(VSUB(T16, T19));
+			 T1g = VBYI(VADD(T19, T16));
+			 T1d = VSUB(T1b, T1c);
+			 T1e = VMUL(LDK(KP707106781), VADD(T12, T15));
+			 T1f = VSUB(T1d, T1e);
+			 T1h = VADD(T1d, T1e);
+		    }
+		    ST(&(x[WS(rs, 6)]), VADD(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VSUB(T1h, T1g), ms, &(x[0]));
+		    ST(&(x[WS(rs, 10)]), VSUB(T1f, T1a), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(T1g, T1h), ms, &(x[0]));
+	       }
+	       {
+		    V T1k, T1o, T1n, T1p;
+		    {
+			 V T1i, T1j, T1l, T1m;
+			 T1i = VADD(T1b, T1c);
+			 T1j = VADD(T17, T18);
+			 T1k = VSUB(T1i, T1j);
+			 T1o = VADD(T1i, T1j);
+			 T1l = VADD(T10, T11);
+			 T1m = VADD(T13, T14);
+			 T1n = VBYI(VSUB(T1l, T1m));
+			 T1p = VADD(T1l, T1m);
+		    }
+		    ST(&(x[WS(rs, 12)]), VSUB(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T1o, T1p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(T1o, T1p), ms, &(x[0]));
+	       }
+	       {
+		    V TF, TQ, TP, TR;
+		    {
+			 V Tn, TE, TL, TO;
+			 Tn = VSUB(Tb, Tm);
+			 TE = VSUB(Ty, TD);
+			 TF = VBYI(VSUB(Tn, TE));
+			 TQ = VBYI(VADD(TE, Tn));
+			 TL = VSUB(TJ, TK);
+			 TO = VSUB(TM, TN);
+			 TP = VSUB(TL, TO);
+			 TR = VADD(TL, TO);
+		    }
+		    ST(&(x[WS(rs, 5)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TU, TY, TX, TZ;
+		    {
+			 V TS, TT, TV, TW;
+			 TS = VADD(TJ, TK);
+			 TT = VADD(Tb, Tm);
+			 TU = VADD(TS, TT);
+			 TY = VSUB(TS, TT);
+			 TV = VADD(TD, Ty);
+			 TW = VADD(TM, TN);
+			 TX = VBYI(VADD(TV, TW));
+			 TZ = VBYI(VSUB(TW, TV));
+		    }
+		    ST(&(x[WS(rs, 15)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1bv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1bv_2 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1bv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1bv_2 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1bv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:05 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1bv_20 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 123 FP additions, 88 FP multiplications,
+ * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
+ * 68 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, TX, T1m, T1K, T1y, Tk, Tf, T14, TQ, TZ, T1O, T1w, T1L, T1p, T1M;
+	       V T1s, TF, TY, T1x, Tp;
+	       {
+		    V T1, TV, T2, TT;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T9, T1n, TK, T1v, TP, Te, T1q, T1u, TB, TD, Tm, T1o, Tz, Tn, T1r;
+			 V TE, To;
+			 {
+			      V TM, TO, Ta, Tc;
+			      {
+				   V T5, T7, TG, TI, T1k, T1l;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   {
+					V TW, T3, TU, T6, T8, TH, TJ, TL, TN;
+					TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					TW = BYTW(&(W[TWVL * 28]), TV);
+					T3 = BYTW(&(W[TWVL * 18]), T2);
+					TU = BYTW(&(W[TWVL * 8]), TT);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					T8 = BYTW(&(W[TWVL * 26]), T7);
+					TH = BYTW(&(W[TWVL * 24]), TG);
+					TJ = BYTW(&(W[TWVL * 4]), TI);
+					TM = BYTW(&(W[TWVL * 32]), TL);
+					TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T4 = VSUB(T1, T3);
+					T1k = VADD(T1, T3);
+					TX = VSUB(TU, TW);
+					T1l = VADD(TU, TW);
+					T9 = VSUB(T6, T8);
+					T1n = VADD(T6, T8);
+					TK = VSUB(TH, TJ);
+					T1v = VADD(TH, TJ);
+					TO = BYTW(&(W[TWVL * 12]), TN);
+				   }
+				   Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1m = VSUB(T1k, T1l);
+				   T1K = VADD(T1k, T1l);
+				   Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      }
+			      {
+				   V Tb, Tx, Td, Th, Tj, Tw, Tg, Ti, Tv;
+				   Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+				   Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   TP = VSUB(TM, TO);
+				   T1y = VADD(TM, TO);
+				   Tb = BYTW(&(W[TWVL * 30]), Ta);
+				   Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   Td = BYTW(&(W[TWVL * 10]), Tc);
+				   Th = BYTW(&(W[TWVL * 14]), Tg);
+				   Tj = BYTW(&(W[TWVL * 34]), Ti);
+				   Tw = BYTW(&(W[TWVL * 16]), Tv);
+				   {
+					V TA, TC, Ty, Tl;
+					TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					Ty = BYTW(&(W[TWVL * 36]), Tx);
+					Te = VSUB(Tb, Td);
+					T1q = VADD(Tb, Td);
+					Tk = VSUB(Th, Tj);
+					T1u = VADD(Th, Tj);
+					TB = BYTW(&(W[0]), TA);
+					TD = BYTW(&(W[TWVL * 20]), TC);
+					Tm = BYTW(&(W[TWVL * 22]), Tl);
+					T1o = VADD(Tw, Ty);
+					Tz = VSUB(Tw, Ty);
+					Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 Tf = VADD(T9, Te);
+			 T14 = VSUB(T9, Te);
+			 TQ = VSUB(TK, TP);
+			 TZ = VADD(TK, TP);
+			 T1r = VADD(TB, TD);
+			 TE = VSUB(TB, TD);
+			 T1O = VADD(T1u, T1v);
+			 T1w = VSUB(T1u, T1v);
+			 To = BYTW(&(W[TWVL * 2]), Tn);
+			 T1L = VADD(T1n, T1o);
+			 T1p = VSUB(T1n, T1o);
+			 T1M = VADD(T1q, T1r);
+			 T1s = VSUB(T1q, T1r);
+			 TF = VSUB(Tz, TE);
+			 TY = VADD(Tz, TE);
+			 T1x = VADD(Tm, To);
+			 Tp = VSUB(Tm, To);
+		    }
+	       }
+	       {
+		    V T1V, T1N, T12, T1b, TR, T1G, T1t, T1z, T1P, Tq, T15, T11, T1j, T10;
+		    T1V = VSUB(T1L, T1M);
+		    T1N = VADD(T1L, T1M);
+		    T12 = VSUB(TY, TZ);
+		    T10 = VADD(TY, TZ);
+		    T1b = VFNMS(LDK(KP618033988), TF, TQ);
+		    TR = VFMA(LDK(KP618033988), TQ, TF);
+		    T1G = VSUB(T1p, T1s);
+		    T1t = VADD(T1p, T1s);
+		    T1z = VSUB(T1x, T1y);
+		    T1P = VADD(T1x, T1y);
+		    Tq = VADD(Tk, Tp);
+		    T15 = VSUB(Tk, Tp);
+		    T11 = VFNMS(LDK(KP250000000), T10, TX);
+		    T1j = VADD(TX, T10);
+		    {
+			 V T1J, T1H, T1D, T1Z, T1X, T1T, T1f, T1h, T19, T17, T1C, T1S, T1a, Tu, T1F;
+			 V T1A;
+			 T1F = VSUB(T1w, T1z);
+			 T1A = VADD(T1w, T1z);
+			 {
+			      V T1W, T1Q, Tt, Tr;
+			      T1W = VSUB(T1O, T1P);
+			      T1Q = VADD(T1O, T1P);
+			      Tt = VSUB(Tf, Tq);
+			      Tr = VADD(Tf, Tq);
+			      {
+				   V T1e, T16, T1d, T13;
+				   T1e = VFNMS(LDK(KP618033988), T14, T15);
+				   T16 = VFMA(LDK(KP618033988), T15, T14);
+				   T1d = VFNMS(LDK(KP559016994), T12, T11);
+				   T13 = VFMA(LDK(KP559016994), T12, T11);
+				   T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
+				   T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
+				   {
+					V T1B, T1R, Ts, T1i;
+					T1B = VADD(T1t, T1A);
+					T1D = VSUB(T1t, T1A);
+					T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
+					T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
+					T1R = VADD(T1N, T1Q);
+					T1T = VSUB(T1N, T1Q);
+					Ts = VFNMS(LDK(KP250000000), Tr, T4);
+					T1i = VADD(T4, Tr);
+					T1f = VFNMS(LDK(KP951056516), T1e, T1d);
+					T1h = VFMA(LDK(KP951056516), T1e, T1d);
+					T19 = VFNMS(LDK(KP951056516), T16, T13);
+					T17 = VFMA(LDK(KP951056516), T16, T13);
+					ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
+					T1C = VFNMS(LDK(KP250000000), T1B, T1m);
+					ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
+					T1S = VFNMS(LDK(KP250000000), T1R, T1K);
+					T1a = VFNMS(LDK(KP559016994), Tt, Ts);
+					Tu = VFMA(LDK(KP559016994), Tt, Ts);
+					ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+			 {
+			      V T1E, T1I, T1U, T1Y;
+			      T1E = VFNMS(LDK(KP559016994), T1D, T1C);
+			      T1I = VFMA(LDK(KP559016994), T1D, T1C);
+			      T1U = VFMA(LDK(KP559016994), T1T, T1S);
+			      T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
+			      {
+				   V T1c, T1g, T18, TS;
+				   T1c = VFMA(LDK(KP951056516), T1b, T1a);
+				   T1g = VFNMS(LDK(KP951056516), T1b, T1a);
+				   T18 = VFMA(LDK(KP951056516), TR, Tu);
+				   TS = VFNMS(LDK(KP951056516), TR, Tu);
+				   ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t1bv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1bv_20 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 123 FP additions, 62 FP multiplications,
+ * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
+ * 54 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
+	       V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
+	       {
+		    V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    TZ = BYTW(&(W[TWVL * 28]), TY);
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 18]), T2);
+		    TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    TX = BYTW(&(W[TWVL * 8]), TW);
+		    T4 = VSUB(T1, T3);
+		    T10 = VSUB(TX, TZ);
+		    T1z = VADD(T1, T3);
+		    T1A = VADD(TX, TZ);
+		    T1B = VSUB(T1z, T1A);
+		    T1R = VADD(T1z, T1A);
+	       }
+	       {
+		    V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
+		    V T1u;
+		    {
+			 V T6, T8, T5, T7;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 6]), T5);
+			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 26]), T7);
+			 T9 = VSUB(T6, T8);
+			 T1k = VADD(T6, T8);
+		    }
+		    {
+			 V TH, TJ, TG, TI;
+			 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 TH = BYTW(&(W[TWVL * 24]), TG);
+			 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 TJ = BYTW(&(W[TWVL * 4]), TI);
+			 TK = VSUB(TH, TJ);
+			 T1s = VADD(TH, TJ);
+		    }
+		    {
+			 V TM, TO, TL, TN;
+			 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TM = BYTW(&(W[TWVL * 32]), TL);
+			 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TO = BYTW(&(W[TWVL * 12]), TN);
+			 TP = VSUB(TM, TO);
+			 T1v = VADD(TM, TO);
+		    }
+		    {
+			 V Tb, Td, Ta, Tc;
+			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 30]), Ta);
+			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 10]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T1n = VADD(Tb, Td);
+		    }
+		    {
+			 V Th, Tj, Tg, Ti;
+			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Th = BYTW(&(W[TWVL * 14]), Tg);
+			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tj = BYTW(&(W[TWVL * 34]), Ti);
+			 Tk = VSUB(Th, Tj);
+			 T1r = VADD(Th, Tj);
+		    }
+		    {
+			 V Tw, Ty, Tv, Tx;
+			 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tw = BYTW(&(W[TWVL * 16]), Tv);
+			 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 Ty = BYTW(&(W[TWVL * 36]), Tx);
+			 Tz = VSUB(Tw, Ty);
+			 T1l = VADD(Tw, Ty);
+		    }
+		    {
+			 V TB, TD, TA, TC;
+			 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTW(&(W[0]), TA);
+			 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 TD = BYTW(&(W[TWVL * 20]), TC);
+			 TE = VSUB(TB, TD);
+			 T1o = VADD(TB, TD);
+		    }
+		    {
+			 V Tm, To, Tl, Tn;
+			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tm = BYTW(&(W[TWVL * 22]), Tl);
+			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 To = BYTW(&(W[TWVL * 2]), Tn);
+			 Tp = VSUB(Tm, To);
+			 T1u = VADD(Tm, To);
+		    }
+		    TF = VSUB(Tz, TE);
+		    T14 = VSUB(T9, Te);
+		    T15 = VSUB(Tk, Tp);
+		    TQ = VSUB(TK, TP);
+		    Tf = VADD(T9, Te);
+		    Tq = VADD(Tk, Tp);
+		    Tr = VADD(Tf, Tq);
+		    T1N = VADD(T1r, T1s);
+		    T1O = VADD(T1u, T1v);
+		    T1P = VADD(T1N, T1O);
+		    T1t = VSUB(T1r, T1s);
+		    T1w = VSUB(T1u, T1v);
+		    T1D = VADD(T1t, T1w);
+		    TT = VADD(Tz, TE);
+		    TU = VADD(TK, TP);
+		    T11 = VADD(TT, TU);
+		    T1K = VADD(T1k, T1l);
+		    T1L = VADD(T1n, T1o);
+		    T1M = VADD(T1K, T1L);
+		    T1m = VSUB(T1k, T1l);
+		    T1p = VSUB(T1n, T1o);
+		    T1C = VADD(T1m, T1p);
+	       }
+	       T1i = VADD(T4, Tr);
+	       T1j = VBYI(VADD(T10, T11));
+	       ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
+		    T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
+		    T1S = VADD(T1M, T1P);
+		    T1T = VFNMS(LDK(KP250000000), T1S, T1R);
+		    T1V = VSUB(T1K, T1L);
+		    T1W = VSUB(T1N, T1O);
+		    T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
+		    T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
+		    ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
+		    T1Y = VSUB(T1T, T1Q);
+		    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
+		    T1U = VADD(T1Q, T1T);
+		    ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
+	       }
+	       {
+		    V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
+		    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
+		    T1E = VADD(T1C, T1D);
+		    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
+		    T1q = VSUB(T1m, T1p);
+		    T1x = VSUB(T1t, T1w);
+		    T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
+		    T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
+		    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
+		    T1J = VADD(T1G, T1F);
+		    ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
+		    T1H = VSUB(T1F, T1G);
+		    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
+		    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
+	       }
+	       {
+		    V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
+		    TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
+		    T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
+		    T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
+		    T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
+		    {
+			 V TV, T12, Ts, Tt;
+			 TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
+			 T12 = VFNMS(LDK(KP250000000), T11, T10);
+			 T13 = VSUB(TV, T12);
+			 T1e = VADD(TV, T12);
+			 Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			 Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
+			 Tu = VSUB(Ts, Tt);
+			 T1a = VADD(Tt, Ts);
+		    }
+		    {
+			 V TS, T17, T1g, T1h;
+			 TS = VSUB(Tu, TR);
+			 T17 = VBYI(VSUB(T13, T16));
+			 ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
+			 T1g = VADD(T1a, T1b);
+			 T1h = VBYI(VSUB(T1e, T1d));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T18, T19, T1c, T1f;
+			 T18 = VADD(Tu, TR);
+			 T19 = VBYI(VADD(T16, T13));
+			 ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
+			 T1c = VSUB(T1a, T1b);
+			 T1f = VBYI(VADD(T1d, T1e));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t1bv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,934 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:06 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t1bv_25 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 248 FP additions, 241 FP multiplications,
+ * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
+ * 208 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T25, T1B, T2y, T1K, T2s, T23, T1S, T26, T20, T1X;
+	       {
+		    V T1O, T2X, Te, T3L, Td, T3Q, T3j, T3b, T2R, T2M, T2f, T27, T1y, T1H, T3M;
+		    V TW, TR, TK, T2B, T3n, T3e, T2U, T2F, T2i, T2a, Tz, T1C, T3N, TQ, T11;
+		    V T1b, T1c, T16;
+		    {
+			 V T1, T1g, T1i, T1p, T1k, T1m, Tb, T1N, T6, T1M;
+			 {
+			      V T7, T9, T2, T4, T1f, T1h, T1o;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      {
+				   V T8, Ta, T3, T5, T1j;
+				   T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+				   T8 = BYTW(&(W[TWVL * 18]), T7);
+				   Ta = BYTW(&(W[TWVL * 28]), T9);
+				   T3 = BYTW(&(W[TWVL * 8]), T2);
+				   T5 = BYTW(&(W[TWVL * 38]), T4);
+				   T1g = BYTW(&(W[TWVL * 4]), T1f);
+				   T1i = BYTW(&(W[TWVL * 14]), T1h);
+				   T1p = BYTW(&(W[TWVL * 34]), T1o);
+				   T1k = BYTW(&(W[TWVL * 44]), T1j);
+				   T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   Tb = VADD(T8, Ta);
+				   T1N = VSUB(T8, Ta);
+				   T6 = VADD(T3, T5);
+				   T1M = VSUB(T3, T5);
+			      }
+			 }
+			 {
+			      V T1v, T1l, Th, Tj, T1w, T1q, Tq, Tk, Tn, Tg;
+			      Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V Tc, Ti, T1n, Tp;
+				   Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1v = VSUB(T1i, T1k);
+				   T1l = VADD(T1i, T1k);
+				   T1n = BYTW(&(W[TWVL * 24]), T1m);
+				   Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1O = VFMA(LDK(KP618033988), T1N, T1M);
+				   T2X = VFNMS(LDK(KP618033988), T1M, T1N);
+				   Te = VSUB(T6, Tb);
+				   Tc = VADD(T6, Tb);
+				   Th = BYTW(&(W[0]), Tg);
+				   Tj = BYTW(&(W[TWVL * 10]), Ti);
+				   T1w = VSUB(T1n, T1p);
+				   T1q = VADD(T1n, T1p);
+				   Tq = BYTW(&(W[TWVL * 30]), Tp);
+				   Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T3L = VADD(T1, Tc);
+				   Td = VFNMS(LDK(KP250000000), Tc, T1);
+				   Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      {
+				   V T1x, T2K, TM, TB, Tw, Tm, Tx, Tr, TI, T2L, T1u, TD, TF, TL;
+				   TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   {
+					V T1t, Tl, To, TH, T1s, T1r, TA, TC;
+					TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					T1r = VADD(T1l, T1q);
+					T1t = VSUB(T1q, T1l);
+					T1x = VFMA(LDK(KP618033988), T1w, T1v);
+					T2K = VFNMS(LDK(KP618033988), T1v, T1w);
+					Tl = BYTW(&(W[TWVL * 40]), Tk);
+					To = BYTW(&(W[TWVL * 20]), Tn);
+					TM = BYTW(&(W[TWVL * 6]), TL);
+					TB = BYTW(&(W[TWVL * 46]), TA);
+					TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T1s = VFNMS(LDK(KP250000000), T1r, T1g);
+					T3Q = VADD(T1g, T1r);
+					TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					Tw = VSUB(Tj, Tl);
+					Tm = VADD(Tj, Tl);
+					Tx = VSUB(Tq, To);
+					Tr = VADD(To, Tq);
+					TI = BYTW(&(W[TWVL * 26]), TH);
+					T2L = VFMA(LDK(KP559016994), T1t, T1s);
+					T1u = VFNMS(LDK(KP559016994), T1t, T1s);
+					TD = BYTW(&(W[TWVL * 16]), TC);
+					TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tu, Ty, T2E, TE, TN, TG, Tt, TV, Ts;
+					TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					Ts = VADD(Tm, Tr);
+					Tu = VSUB(Tm, Tr);
+					Ty = VFNMS(LDK(KP618033988), Tx, Tw);
+					T2E = VFMA(LDK(KP618033988), Tw, Tx);
+					T3j = VFNMS(LDK(KP059835404), T2K, T2L);
+					T3b = VFMA(LDK(KP066152395), T2L, T2K);
+					T2R = VFNMS(LDK(KP786782374), T2K, T2L);
+					T2M = VFMA(LDK(KP869845200), T2L, T2K);
+					T2f = VFMA(LDK(KP132830569), T1u, T1x);
+					T27 = VFNMS(LDK(KP120146378), T1x, T1u);
+					T1y = VFNMS(LDK(KP893101515), T1x, T1u);
+					T1H = VFMA(LDK(KP987388751), T1u, T1x);
+					TE = VSUB(TB, TD);
+					TN = VADD(TD, TB);
+					TG = BYTW(&(W[TWVL * 36]), TF);
+					Tt = VFNMS(LDK(KP250000000), Ts, Th);
+					T3M = VADD(Th, Ts);
+					TW = BYTW(&(W[TWVL * 2]), TV);
+					{
+					     V TJ, TO, Tv, T2D, TY, T15, T10, T13, TP;
+					     {
+						  V TX, T14, TZ, T12;
+						  TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+						  TJ = VSUB(TG, TI);
+						  TO = VADD(TI, TG);
+						  Tv = VFMA(LDK(KP559016994), Tu, Tt);
+						  T2D = VFNMS(LDK(KP559016994), Tu, Tt);
+						  TY = BYTW(&(W[TWVL * 12]), TX);
+						  T15 = BYTW(&(W[TWVL * 32]), T14);
+						  T10 = BYTW(&(W[TWVL * 42]), TZ);
+						  T13 = BYTW(&(W[TWVL * 22]), T12);
+					     }
+					     TP = VADD(TN, TO);
+					     TR = VSUB(TN, TO);
+					     TK = VFMA(LDK(KP618033988), TJ, TE);
+					     T2B = VFNMS(LDK(KP618033988), TE, TJ);
+					     T3n = VFMA(LDK(KP578046249), T2D, T2E);
+					     T3e = VFNMS(LDK(KP522847744), T2E, T2D);
+					     T2U = VFNMS(LDK(KP987388751), T2D, T2E);
+					     T2F = VFMA(LDK(KP893101515), T2E, T2D);
+					     T2i = VFNMS(LDK(KP603558818), Ty, Tv);
+					     T2a = VFMA(LDK(KP667278218), Tv, Ty);
+					     Tz = VFNMS(LDK(KP244189809), Ty, Tv);
+					     T1C = VFMA(LDK(KP269969613), Tv, Ty);
+					     T3N = VADD(TM, TP);
+					     TQ = VFMS(LDK(KP250000000), TP, TM);
+					     T11 = VADD(TY, T10);
+					     T1b = VSUB(TY, T10);
+					     T1c = VSUB(T15, T13);
+					     T16 = VADD(T13, T15);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2z, Tf, T3W, T3O, T1d, T2H, T3m, T2j, T2b, TT, T1D, T2G, T35, T2V, T2Z;
+			 V T3A, T3g, T2I, T1a, T3R, T3X;
+			 T2z = VFNMS(LDK(KP559016994), Te, Td);
+			 Tf = VFMA(LDK(KP559016994), Te, Td);
+			 {
+			      V TS, T2A, T17, T19;
+			      TS = VFNMS(LDK(KP559016994), TR, TQ);
+			      T2A = VFMA(LDK(KP559016994), TR, TQ);
+			      T3W = VSUB(T3M, T3N);
+			      T3O = VADD(T3M, T3N);
+			      T1d = VFNMS(LDK(KP618033988), T1c, T1b);
+			      T2H = VFMA(LDK(KP618033988), T1b, T1c);
+			      T17 = VADD(T11, T16);
+			      T19 = VSUB(T16, T11);
+			      {
+				   V T3f, T2T, T2C, T18, T3P;
+				   T3m = VFMA(LDK(KP447533225), T2B, T2A);
+				   T3f = VFNMS(LDK(KP494780565), T2A, T2B);
+				   T2T = VFNMS(LDK(KP132830569), T2A, T2B);
+				   T2C = VFMA(LDK(KP120146378), T2B, T2A);
+				   T2j = VFNMS(LDK(KP786782374), TK, TS);
+				   T2b = VFMA(LDK(KP869845200), TS, TK);
+				   TT = VFNMS(LDK(KP667278218), TS, TK);
+				   T1D = VFMA(LDK(KP603558818), TK, TS);
+				   T18 = VFNMS(LDK(KP250000000), T17, TW);
+				   T3P = VADD(TW, T17);
+				   T2G = VFMA(LDK(KP734762448), T2F, T2C);
+				   T35 = VFNMS(LDK(KP734762448), T2F, T2C);
+				   T2V = VFNMS(LDK(KP734762448), T2U, T2T);
+				   T2Z = VFMA(LDK(KP734762448), T2U, T2T);
+				   T3A = VFMA(LDK(KP982009705), T3f, T3e);
+				   T3g = VFNMS(LDK(KP982009705), T3f, T3e);
+				   T2I = VFMA(LDK(KP559016994), T19, T18);
+				   T1a = VFNMS(LDK(KP559016994), T19, T18);
+				   T3R = VADD(T3P, T3Q);
+				   T3X = VSUB(T3P, T3Q);
+			      }
+			 }
+			 {
+			      V T2n, T2t, T1V, T22, T2l, T2d, T1Q, T1I, T2w, T1A, T1F, T2q;
+			      {
+				   V T2k, T1G, T28, T2g, T3K, T3E, T3a, T34, T3x, T3H, T2c, TU, T1T, T1U, T1z;
+				   V T3o, T3t;
+				   T2n = VFNMS(LDK(KP912575812), T2j, T2i);
+				   T2k = VFMA(LDK(KP912575812), T2j, T2i);
+				   T3o = VFNMS(LDK(KP921078979), T3n, T3m);
+				   T3t = VFMA(LDK(KP921078979), T3n, T3m);
+				   {
+					V T3c, T2Q, T2J, T3k, T1e;
+					T3c = VFNMS(LDK(KP667278218), T2I, T2H);
+					T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
+					T2J = VFMA(LDK(KP066152395), T2I, T2H);
+					T3k = VFMA(LDK(KP603558818), T2H, T2I);
+					T1G = VFMA(LDK(KP578046249), T1a, T1d);
+					T1e = VFNMS(LDK(KP522847744), T1d, T1a);
+					T28 = VFNMS(LDK(KP494780565), T1a, T1d);
+					T2g = VFMA(LDK(KP447533225), T1d, T1a);
+					{
+					     V T3U, T3S, T40, T3Y;
+					     T3U = VSUB(T3O, T3R);
+					     T3S = VADD(T3O, T3R);
+					     T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
+					     T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
+					     {
+						  V T3s, T3l, T2N, T36;
+						  T3s = VFNMS(LDK(KP845997307), T3k, T3j);
+						  T3l = VFMA(LDK(KP845997307), T3k, T3j);
+						  T2N = VFNMS(LDK(KP772036680), T2M, T2J);
+						  T36 = VFMA(LDK(KP772036680), T2M, T2J);
+						  {
+						       V T30, T2S, T3d, T3z, T3T;
+						       T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
+						       T2S = VFMA(LDK(KP772036680), T2R, T2Q);
+						       T3d = VFNMS(LDK(KP845997307), T3c, T3b);
+						       T3z = VFMA(LDK(KP845997307), T3c, T3b);
+						       ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
+						       T3T = VFNMS(LDK(KP250000000), T3S, T3L);
+						       {
+							    V T3C, T3p, T2O, T37;
+							    T3C = VFMA(LDK(KP906616052), T3o, T3l);
+							    T3p = VFNMS(LDK(KP906616052), T3o, T3l);
+							    T2O = VFMA(LDK(KP956723877), T2N, T2G);
+							    T37 = VFMA(LDK(KP522616830), T2V, T36);
+							    {
+								 V T31, T2W, T3u, T3h;
+								 T31 = VFNMS(LDK(KP522616830), T2G, T30);
+								 T2W = VFMA(LDK(KP945422727), T2V, T2S);
+								 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
+								 T3h = VFMA(LDK(KP923225144), T3g, T3d);
+								 {
+								      V T3I, T3B, T3V, T3Z;
+								      T3I = VFNMS(LDK(KP669429328), T3z, T3A);
+								      T3B = VFMA(LDK(KP570584518), T3A, T3z);
+								      T3V = VFMA(LDK(KP559016994), T3U, T3T);
+								      T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
+								      {
+									   V T3y, T3q, T2P, T38;
+									   T3y = VFMA(LDK(KP262346850), T3p, T2X);
+									   T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
+									   T2P = VFMA(LDK(KP992114701), T2O, T2z);
+									   T38 = VFNMS(LDK(KP690983005), T37, T2S);
+									   {
+										V T32, T2Y, T3v, T3F;
+										T32 = VFMA(LDK(KP763932022), T31, T2N);
+										T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
+										T3v = VFNMS(LDK(KP997675361), T3u, T3t);
+										T3F = VFNMS(LDK(KP904508497), T3u, T3s);
+										{
+										     V T3i, T3r, T3J, T3D;
+										     T3i = VFMA(LDK(KP949179823), T3h, T2z);
+										     T3r = VFNMS(LDK(KP237294955), T3h, T2z);
+										     T3J = VFNMS(LDK(KP669429328), T3C, T3I);
+										     T3D = VFMA(LDK(KP618033988), T3C, T3B);
+										     ST(&(x[WS(rs, 20)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
+										     ST(&(x[WS(rs, 5)]), VFMAI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 15)]), VFMAI(T40, T3Z), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 10)]), VFNMSI(T40, T3Z), ms, &(x[0]));
+										     {
+											  V T39, T33, T3w, T3G;
+											  T39 = VFMA(LDK(KP855719849), T38, T35);
+											  T33 = VFNMS(LDK(KP855719849), T32, T2Z);
+											  ST(&(x[WS(rs, 3)]), VFMAI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
+											  ST(&(x[WS(rs, 22)]), VFNMSI(T2Y, T2P), ms, &(x[0]));
+											  T3w = VFMA(LDK(KP560319534), T3v, T3s);
+											  T3G = VFNMS(LDK(KP681693190), T3F, T3t);
+											  ST(&(x[WS(rs, 2)]), VFMAI(T3q, T3i), ms, &(x[0]));
+											  ST(&(x[WS(rs, 23)]), VFNMSI(T3q, T3i), ms, &(x[WS(rs, 1)]));
+											  T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
+											  T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
+											  T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
+											  T34 = VFMA(LDK(KP897376177), T33, T2z);
+											  T3x = VFNMS(LDK(KP949179823), T3w, T3r);
+											  T3H = VFNMS(LDK(KP860541664), T3G, T3r);
+											  T2t = VFNMS(LDK(KP912575812), T2b, T2a);
+											  T2c = VFMA(LDK(KP912575812), T2b, T2a);
+											  TU = VFMA(LDK(KP829049696), TT, Tz);
+											  T1T = VFNMS(LDK(KP829049696), TT, Tz);
+											  T1U = VFNMS(LDK(KP831864738), T1y, T1e);
+											  T1z = VFMA(LDK(KP831864738), T1y, T1e);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					V T2o, T2h, T29, T2u, T2v, T2p;
+					T2o = VFNMS(LDK(KP958953096), T2g, T2f);
+					T2h = VFMA(LDK(KP958953096), T2g, T2f);
+					ST(&(x[WS(rs, 17)]), VFNMSI(T3a, T34), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 8)]), VFMAI(T3a, T34), ms, &(x[0]));
+					ST(&(x[WS(rs, 13)]), VFMAI(T3E, T3x), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(T3E, T3x), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(T3K, T3H), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3K, T3H), ms, &(x[0]));
+					T1V = VFMA(LDK(KP559154169), T1U, T1T);
+					T22 = VFNMS(LDK(KP683113946), T1T, T1U);
+					T29 = VFNMS(LDK(KP867381224), T28, T27);
+					T2u = VFMA(LDK(KP867381224), T28, T27);
+					T2l = VFMA(LDK(KP894834959), T2k, T2h);
+					T2v = VFMA(LDK(KP447417479), T2k, T2u);
+					T2d = VFNMS(LDK(KP809385824), T2c, T29);
+					T2p = VFMA(LDK(KP447417479), T2c, T2o);
+					T1Q = VFMA(LDK(KP831864738), T1H, T1G);
+					T1I = VFNMS(LDK(KP831864738), T1H, T1G);
+					T2w = VFNMS(LDK(KP763932022), T2v, T2h);
+					T1A = VFMA(LDK(KP904730450), T1z, TU);
+					T1F = VFNMS(LDK(KP904730450), T1z, TU);
+					T2q = VFMA(LDK(KP690983005), T2p, T29);
+				   }
+			      }
+			      {
+				   V T2e, T1E, T1P, T2m;
+				   T2e = VFNMS(LDK(KP992114701), T2d, Tf);
+				   T1E = VFMA(LDK(KP916574801), T1D, T1C);
+				   T1P = VFNMS(LDK(KP916574801), T1D, T1C);
+				   T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
+				   {
+					V T1J, T2r, T1R, T1W, T1Z, T2x;
+					T2x = VFNMS(LDK(KP999544308), T2w, T2t);
+					T1J = VFNMS(LDK(KP904730450), T1I, T1F);
+					T25 = VFMA(LDK(KP968583161), T1A, Tf);
+					T1B = VFNMS(LDK(KP242145790), T1A, Tf);
+					T2r = VFNMS(LDK(KP999544308), T2q, T2n);
+					T1R = VFMA(LDK(KP904730450), T1Q, T1P);
+					T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
+					T1Z = VADD(T1E, T1F);
+					ST(&(x[WS(rs, 21)]), VFMAI(T2m, T2e), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFNMSI(T2m, T2e), ms, &(x[0]));
+					T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
+					T1K = VFNMS(LDK(KP618033988), T1J, T1E);
+					T2s = VFNMS(LDK(KP803003575), T2r, Tf);
+					T23 = VFMA(LDK(KP617882369), T1W, T22);
+					T1S = VFNMS(LDK(KP242145790), T1R, T1O);
+					T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
+					T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
+					T1X = VFMA(LDK(KP559016994), T1W, T1V);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1L, T24, T21, T1Y;
+		    T1L = VFNMS(LDK(KP876091699), T1K, T1B);
+		    ST(&(x[WS(rs, 16)]), VFMAI(T2y, T2s), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VFNMSI(T2y, T2s), ms, &(x[WS(rs, 1)]));
+		    T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
+		    ST(&(x[WS(rs, 24)]), VFNMSI(T26, T25), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T26, T25), ms, &(x[WS(rs, 1)]));
+		    T21 = VFMA(LDK(KP792626838), T20, T1B);
+		    T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
+		    ST(&(x[WS(rs, 11)]), VFMAI(T24, T21), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFNMSI(T24, T21), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFMAI(T1Y, T1L), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t1bv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t1bv_25 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 248 FP additions, 188 FP multiplications,
+ * (or, 171 additions, 111 multiplications, 77 fused multiply/add),
+ * 100 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T1A, T1z, T1R, T1S, T1B, T1C, T1Q, T2L, T1l, T2v, T1i, T3e, T2u, Tb, T2i;
+	       V Tj, T3b, T2h, Tv, T2k, TD, T3a, T2l, T11, T2s, TY, T3d, T2r;
+	       {
+		    V T1v, T1x, T1y, T1q, T1s, T1t, T1P;
+		    T1A = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T1u, T1w, T1p, T1r;
+			 T1u = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T1v = BYTW(&(W[TWVL * 18]), T1u);
+			 T1w = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T1x = BYTW(&(W[TWVL * 28]), T1w);
+			 T1y = VADD(T1v, T1x);
+			 T1p = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1q = BYTW(&(W[TWVL * 8]), T1p);
+			 T1r = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T1s = BYTW(&(W[TWVL * 38]), T1r);
+			 T1t = VADD(T1q, T1s);
+		    }
+		    T1z = VMUL(LDK(KP559016994), VSUB(T1t, T1y));
+		    T1R = VSUB(T1v, T1x);
+		    T1S = VMUL(LDK(KP587785252), T1R);
+		    T1B = VADD(T1t, T1y);
+		    T1C = VFNMS(LDK(KP250000000), T1B, T1A);
+		    T1P = VSUB(T1q, T1s);
+		    T1Q = VMUL(LDK(KP951056516), T1P);
+		    T2L = VMUL(LDK(KP587785252), T1P);
+	       }
+	       {
+		    V T1f, T19, T1b, T1c, T14, T16, T17, T1e;
+		    T1e = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T1f = BYTW(&(W[TWVL * 4]), T1e);
+		    {
+			 V T18, T1a, T13, T15;
+			 T18 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T19 = BYTW(&(W[TWVL * 24]), T18);
+			 T1a = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1b = BYTW(&(W[TWVL * 34]), T1a);
+			 T1c = VADD(T19, T1b);
+			 T13 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T14 = BYTW(&(W[TWVL * 14]), T13);
+			 T15 = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T16 = BYTW(&(W[TWVL * 44]), T15);
+			 T17 = VADD(T14, T16);
+		    }
+		    {
+			 V T1j, T1k, T1d, T1g, T1h;
+			 T1j = VSUB(T14, T16);
+			 T1k = VSUB(T19, T1b);
+			 T1l = VFMA(LDK(KP475528258), T1j, VMUL(LDK(KP293892626), T1k));
+			 T2v = VFNMS(LDK(KP475528258), T1k, VMUL(LDK(KP293892626), T1j));
+			 T1d = VMUL(LDK(KP559016994), VSUB(T17, T1c));
+			 T1g = VADD(T17, T1c);
+			 T1h = VFNMS(LDK(KP250000000), T1g, T1f);
+			 T1i = VADD(T1d, T1h);
+			 T3e = VADD(T1f, T1g);
+			 T2u = VSUB(T1h, T1d);
+		    }
+	       }
+	       {
+		    V Tg, T7, T9, Td, T2, T4, Tc, Tf;
+		    Tf = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tg = BYTW(&(W[TWVL * 6]), Tf);
+		    {
+			 V T6, T8, T1, T3;
+			 T6 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 26]), T6);
+			 T8 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 36]), T8);
+			 Td = VADD(T7, T9);
+			 T1 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[TWVL * 16]), T1);
+			 T3 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 T4 = BYTW(&(W[TWVL * 46]), T3);
+			 Tc = VADD(T2, T4);
+		    }
+		    {
+			 V T5, Ta, Te, Th, Ti;
+			 T5 = VSUB(T2, T4);
+			 Ta = VSUB(T7, T9);
+			 Tb = VFMA(LDK(KP475528258), T5, VMUL(LDK(KP293892626), Ta));
+			 T2i = VFNMS(LDK(KP475528258), Ta, VMUL(LDK(KP293892626), T5));
+			 Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
+			 Th = VADD(Tc, Td);
+			 Ti = VFNMS(LDK(KP250000000), Th, Tg);
+			 Tj = VADD(Te, Ti);
+			 T3b = VADD(Tg, Th);
+			 T2h = VSUB(Ti, Te);
+		    }
+	       }
+	       {
+		    V TA, Tr, Tt, Tx, Tm, To, Tw, Tz;
+		    Tz = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    TA = BYTW(&(W[0]), Tz);
+		    {
+			 V Tq, Ts, Tl, Tn;
+			 Tq = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tr = BYTW(&(W[TWVL * 20]), Tq);
+			 Ts = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 30]), Ts);
+			 Tx = VADD(Tr, Tt);
+			 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tm = BYTW(&(W[TWVL * 10]), Tl);
+			 Tn = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 To = BYTW(&(W[TWVL * 40]), Tn);
+			 Tw = VADD(Tm, To);
+		    }
+		    {
+			 V Tp, Tu, Ty, TB, TC;
+			 Tp = VSUB(Tm, To);
+			 Tu = VSUB(Tr, Tt);
+			 Tv = VFMA(LDK(KP475528258), Tp, VMUL(LDK(KP293892626), Tu));
+			 T2k = VFNMS(LDK(KP475528258), Tu, VMUL(LDK(KP293892626), Tp));
+			 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
+			 TB = VADD(Tw, Tx);
+			 TC = VFNMS(LDK(KP250000000), TB, TA);
+			 TD = VADD(Ty, TC);
+			 T3a = VADD(TA, TB);
+			 T2l = VSUB(TC, Ty);
+		    }
+	       }
+	       {
+		    V TV, TP, TR, TS, TK, TM, TN, TU;
+		    TU = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    TV = BYTW(&(W[TWVL * 2]), TU);
+		    {
+			 V TO, TQ, TJ, TL;
+			 TO = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 TP = BYTW(&(W[TWVL * 22]), TO);
+			 TQ = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTW(&(W[TWVL * 32]), TQ);
+			 TS = VADD(TP, TR);
+			 TJ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TK = BYTW(&(W[TWVL * 12]), TJ);
+			 TL = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TM = BYTW(&(W[TWVL * 42]), TL);
+			 TN = VADD(TK, TM);
+		    }
+		    {
+			 V TZ, T10, TT, TW, TX;
+			 TZ = VSUB(TK, TM);
+			 T10 = VSUB(TP, TR);
+			 T11 = VFMA(LDK(KP475528258), TZ, VMUL(LDK(KP293892626), T10));
+			 T2s = VFNMS(LDK(KP475528258), T10, VMUL(LDK(KP293892626), TZ));
+			 TT = VMUL(LDK(KP559016994), VSUB(TN, TS));
+			 TW = VADD(TN, TS);
+			 TX = VFNMS(LDK(KP250000000), TW, TV);
+			 TY = VADD(TT, TX);
+			 T3d = VADD(TV, TW);
+			 T2r = VSUB(TX, TT);
+		    }
+	       }
+	       {
+		    V T3g, T3o, T3k, T3l, T3j, T3m, T3p, T3n;
+		    {
+			 V T3c, T3f, T3h, T3i;
+			 T3c = VSUB(T3a, T3b);
+			 T3f = VSUB(T3d, T3e);
+			 T3g = VBYI(VFMA(LDK(KP951056516), T3c, VMUL(LDK(KP587785252), T3f)));
+			 T3o = VBYI(VFNMS(LDK(KP951056516), T3f, VMUL(LDK(KP587785252), T3c)));
+			 T3k = VADD(T1A, T1B);
+			 T3h = VADD(T3a, T3b);
+			 T3i = VADD(T3d, T3e);
+			 T3l = VADD(T3h, T3i);
+			 T3j = VMUL(LDK(KP559016994), VSUB(T3h, T3i));
+			 T3m = VFNMS(LDK(KP250000000), T3l, T3k);
+		    }
+		    ST(&(x[0]), VADD(T3k, T3l), ms, &(x[0]));
+		    T3p = VSUB(T3m, T3j);
+		    ST(&(x[WS(rs, 10)]), VADD(T3o, T3p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 15)]), VSUB(T3p, T3o), ms, &(x[WS(rs, 1)]));
+		    T3n = VADD(T3j, T3m);
+		    ST(&(x[WS(rs, 5)]), VADD(T3g, T3n), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 20)]), VSUB(T3n, T3g), ms, &(x[0]));
+	       }
+	       {
+		    V T2z, T2M, T2U, T2V, T2W, T34, T35, T36, T2X, T2Y, T2Z, T31, T32, T33, T2n;
+		    V T2N, T2E, T2K, T2y, T2H, T2A, T2G, T38, T39;
+		    T2z = VSUB(T1C, T1z);
+		    T2M = VFNMS(LDK(KP951056516), T1R, T2L);
+		    T2U = VFMA(LDK(KP1_369094211), T2k, VMUL(LDK(KP728968627), T2l));
+		    T2V = VFNMS(LDK(KP992114701), T2h, VMUL(LDK(KP250666467), T2i));
+		    T2W = VADD(T2U, T2V);
+		    T34 = VFNMS(LDK(KP125581039), T2s, VMUL(LDK(KP998026728), T2r));
+		    T35 = VFMA(LDK(KP1_274847979), T2v, VMUL(LDK(KP770513242), T2u));
+		    T36 = VADD(T34, T35);
+		    T2X = VFMA(LDK(KP1_996053456), T2s, VMUL(LDK(KP062790519), T2r));
+		    T2Y = VFNMS(LDK(KP637423989), T2u, VMUL(LDK(KP1_541026485), T2v));
+		    T2Z = VADD(T2X, T2Y);
+		    T31 = VFNMS(LDK(KP1_457937254), T2k, VMUL(LDK(KP684547105), T2l));
+		    T32 = VFMA(LDK(KP1_984229402), T2i, VMUL(LDK(KP125333233), T2h));
+		    T33 = VADD(T31, T32);
+		    {
+			 V T2j, T2m, T2I, T2C, T2D, T2J;
+			 T2j = VFNMS(LDK(KP851558583), T2i, VMUL(LDK(KP904827052), T2h));
+			 T2m = VFMA(LDK(KP1_752613360), T2k, VMUL(LDK(KP481753674), T2l));
+			 T2I = VADD(T2m, T2j);
+			 T2C = VFMA(LDK(KP1_071653589), T2s, VMUL(LDK(KP844327925), T2r));
+			 T2D = VFMA(LDK(KP125581039), T2v, VMUL(LDK(KP998026728), T2u));
+			 T2J = VADD(T2C, T2D);
+			 T2n = VSUB(T2j, T2m);
+			 T2N = VADD(T2I, T2J);
+			 T2E = VSUB(T2C, T2D);
+			 T2K = VMUL(LDK(KP559016994), VSUB(T2I, T2J));
+		    }
+		    {
+			 V T2o, T2p, T2q, T2t, T2w, T2x;
+			 T2o = VFNMS(LDK(KP963507348), T2k, VMUL(LDK(KP876306680), T2l));
+			 T2p = VFMA(LDK(KP1_809654104), T2i, VMUL(LDK(KP425779291), T2h));
+			 T2q = VSUB(T2o, T2p);
+			 T2t = VFNMS(LDK(KP1_688655851), T2s, VMUL(LDK(KP535826794), T2r));
+			 T2w = VFNMS(LDK(KP1_996053456), T2v, VMUL(LDK(KP062790519), T2u));
+			 T2x = VADD(T2t, T2w);
+			 T2y = VMUL(LDK(KP559016994), VSUB(T2q, T2x));
+			 T2H = VSUB(T2t, T2w);
+			 T2A = VADD(T2q, T2x);
+			 T2G = VADD(T2o, T2p);
+		    }
+		    {
+			 V T2S, T2T, T30, T37;
+			 T2S = VADD(T2z, T2A);
+			 T2T = VBYI(VADD(T2M, T2N));
+			 ST(&(x[WS(rs, 23)]), VSUB(T2S, T2T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 2)]), VADD(T2S, T2T), ms, &(x[0]));
+			 T30 = VADD(T2z, VADD(T2W, T2Z));
+			 T37 = VBYI(VSUB(VADD(T33, T36), T2M));
+			 ST(&(x[WS(rs, 22)]), VSUB(T30, T37), ms, &(x[0]));
+			 ST(&(x[WS(rs, 3)]), VADD(T30, T37), ms, &(x[WS(rs, 1)]));
+		    }
+		    T38 = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T2U, T2V), VFMA(LDK(KP309016994), T33, VFNMS(LDK(KP809016994), T36, VMUL(LDK(KP587785252), VSUB(T2X, T2Y))))), T2M));
+		    T39 = VFMA(LDK(KP309016994), T2W, VFMA(LDK(KP951056516), VSUB(T32, T31), VFMA(LDK(KP587785252), VSUB(T35, T34), VFNMS(LDK(KP809016994), T2Z, T2z))));
+		    ST(&(x[WS(rs, 8)]), VADD(T38, T39), ms, &(x[0]));
+		    ST(&(x[WS(rs, 17)]), VSUB(T39, T38), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T2F, T2Q, T2P, T2R, T2B, T2O;
+			 T2B = VFNMS(LDK(KP250000000), T2A, T2z);
+			 T2F = VFMA(LDK(KP951056516), T2n, VADD(T2y, VFNMS(LDK(KP587785252), T2E, T2B)));
+			 T2Q = VFMA(LDK(KP587785252), T2n, VFMA(LDK(KP951056516), T2E, VSUB(T2B, T2y)));
+			 T2O = VFNMS(LDK(KP250000000), T2N, T2M);
+			 T2P = VBYI(VADD(VFMA(LDK(KP951056516), T2G, VMUL(LDK(KP587785252), T2H)), VADD(T2K, T2O)));
+			 T2R = VBYI(VADD(VFNMS(LDK(KP951056516), T2H, VMUL(LDK(KP587785252), T2G)), VSUB(T2O, T2K)));
+			 ST(&(x[WS(rs, 18)]), VSUB(T2F, T2P), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2Q, T2R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 7)]), VADD(T2F, T2P), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VSUB(T2Q, T2R), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1D, T1T, T21, T22, T23, T2b, T2c, T2d, T24, T25, T26, T28, T29, T2a, TF;
+		    V T1U, T1I, T1O, T1o, T1L, T1E, T1K, T2f, T2g;
+		    T1D = VADD(T1z, T1C);
+		    T1T = VADD(T1Q, T1S);
+		    T21 = VFMA(LDK(KP1_688655851), Tv, VMUL(LDK(KP535826794), TD));
+		    T22 = VFMA(LDK(KP1_541026485), Tb, VMUL(LDK(KP637423989), Tj));
+		    T23 = VSUB(T21, T22);
+		    T2b = VFMA(LDK(KP851558583), T11, VMUL(LDK(KP904827052), TY));
+		    T2c = VFMA(LDK(KP1_984229402), T1l, VMUL(LDK(KP125333233), T1i));
+		    T2d = VADD(T2b, T2c);
+		    T24 = VFNMS(LDK(KP425779291), TY, VMUL(LDK(KP1_809654104), T11));
+		    T25 = VFNMS(LDK(KP992114701), T1i, VMUL(LDK(KP250666467), T1l));
+		    T26 = VADD(T24, T25);
+		    T28 = VFNMS(LDK(KP1_071653589), Tv, VMUL(LDK(KP844327925), TD));
+		    T29 = VFNMS(LDK(KP770513242), Tj, VMUL(LDK(KP1_274847979), Tb));
+		    T2a = VADD(T28, T29);
+		    {
+			 V Tk, TE, T1M, T1G, T1H, T1N;
+			 Tk = VFMA(LDK(KP1_071653589), Tb, VMUL(LDK(KP844327925), Tj));
+			 TE = VFMA(LDK(KP1_937166322), Tv, VMUL(LDK(KP248689887), TD));
+			 T1M = VADD(TE, Tk);
+			 T1G = VFMA(LDK(KP1_752613360), T11, VMUL(LDK(KP481753674), TY));
+			 T1H = VFMA(LDK(KP1_457937254), T1l, VMUL(LDK(KP684547105), T1i));
+			 T1N = VADD(T1G, T1H);
+			 TF = VSUB(Tk, TE);
+			 T1U = VADD(T1M, T1N);
+			 T1I = VSUB(T1G, T1H);
+			 T1O = VMUL(LDK(KP559016994), VSUB(T1M, T1N));
+		    }
+		    {
+			 V TG, TH, TI, T12, T1m, T1n;
+			 TG = VFNMS(LDK(KP497379774), Tv, VMUL(LDK(KP968583161), TD));
+			 TH = VFNMS(LDK(KP1_688655851), Tb, VMUL(LDK(KP535826794), Tj));
+			 TI = VADD(TG, TH);
+			 T12 = VFNMS(LDK(KP963507348), T11, VMUL(LDK(KP876306680), TY));
+			 T1m = VFNMS(LDK(KP1_369094211), T1l, VMUL(LDK(KP728968627), T1i));
+			 T1n = VADD(T12, T1m);
+			 T1o = VMUL(LDK(KP559016994), VSUB(TI, T1n));
+			 T1L = VSUB(T12, T1m);
+			 T1E = VADD(TI, T1n);
+			 T1K = VSUB(TG, TH);
+		    }
+		    {
+			 V T1Z, T20, T27, T2e;
+			 T1Z = VADD(T1D, T1E);
+			 T20 = VBYI(VADD(T1T, T1U));
+			 ST(&(x[WS(rs, 24)]), VSUB(T1Z, T20), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1Z, T20), ms, &(x[WS(rs, 1)]));
+			 T27 = VADD(T1D, VADD(T23, T26));
+			 T2e = VBYI(VSUB(VADD(T2a, T2d), T1T));
+			 ST(&(x[WS(rs, 21)]), VSUB(T27, T2e), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(T27, T2e), ms, &(x[0]));
+		    }
+		    T2f = VBYI(VSUB(VFMA(LDK(KP309016994), T2a, VFMA(LDK(KP951056516), VADD(T21, T22), VFNMS(LDK(KP809016994), T2d, VMUL(LDK(KP587785252), VSUB(T24, T25))))), T1T));
+		    T2g = VFMA(LDK(KP951056516), VSUB(T29, T28), VFMA(LDK(KP309016994), T23, VFMA(LDK(KP587785252), VSUB(T2c, T2b), VFNMS(LDK(KP809016994), T26, T1D))));
+		    ST(&(x[WS(rs, 9)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T2g, T2f), ms, &(x[0]));
+		    {
+			 V T1J, T1X, T1W, T1Y, T1F, T1V;
+			 T1F = VFNMS(LDK(KP250000000), T1E, T1D);
+			 T1J = VFMA(LDK(KP951056516), TF, VADD(T1o, VFNMS(LDK(KP587785252), T1I, T1F)));
+			 T1X = VFMA(LDK(KP587785252), TF, VFMA(LDK(KP951056516), T1I, VSUB(T1F, T1o)));
+			 T1V = VFNMS(LDK(KP250000000), T1U, T1T);
+			 T1W = VBYI(VADD(VFMA(LDK(KP951056516), T1K, VMUL(LDK(KP587785252), T1L)), VADD(T1O, T1V)));
+			 T1Y = VBYI(VADD(VFNMS(LDK(KP951056516), T1L, VMUL(LDK(KP587785252), T1K)), VSUB(T1V, T1O)));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1J, T1W), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T1X, T1Y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 6)]), VADD(T1J, T1W), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T1X, T1Y), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t1bv_25"), twinstr, &GENUS, {171, 111, 77, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1bv_3 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 8 FP additions, 8 FP multiplications,
+ * (or, 5 additions, 5 multiplications, 3 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T2, T4;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, T8, T6, T7;
+		    T3 = BYTW(&(W[0]), T2);
+		    T5 = BYTW(&(W[TWVL * 2]), T4);
+		    T8 = VMUL(LDK(KP866025403), VSUB(T3, T5));
+		    T6 = VADD(T3, T5);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VFNMSI(T8, T7), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T8, T7), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1bv_3"), twinstr, &GENUS, {5, 5, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1bv_3 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 8 FP additions, 6 FP multiplications,
+ * (or, 7 additions, 5 multiplications, 1 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T6, T2, T4, T7, T1, T3, T5, T8;
+	       T6 = LD(&(x[0]), ms, &(x[0]));
+	       T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T2 = BYTW(&(W[0]), T1);
+	       T3 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T4 = BYTW(&(W[TWVL * 2]), T3);
+	       T7 = VADD(T2, T4);
+	       ST(&(x[0]), VADD(T6, T7), ms, &(x[0]));
+	       T5 = VBYI(VMUL(LDK(KP866025403), VSUB(T2, T4)));
+	       T8 = VFNMS(LDK(KP500000000), T7, T6);
+	       ST(&(x[WS(rs, 1)]), VADD(T5, T8), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 2)]), VSUB(T8, T5), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1bv_3"), twinstr, &GENUS, {7, 5, 1, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,865 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1bv_32 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 217 FP additions, 160 FP multiplications,
+ * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
+ * 104 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T26, T25, T2a, T2i, T24, T2c, T2g, T2k, T2h, T27;
+	       {
+		    V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2O, T1D, TC, T33, T2L, T1C;
+		    V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
+		    V T10, T2u;
+		    {
+			 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
+			 {
+			      V T1, T1x, T2, T1v;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V T5, Tc, T7, Ta, T2m, T2n;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   {
+					V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
+					Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T1y = BYTW(&(W[TWVL * 46]), T1x);
+					T3 = BYTW(&(W[TWVL * 30]), T2);
+					T1w = BYTW(&(W[TWVL * 14]), T1v);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					Td = BYTW(&(W[TWVL * 22]), Tc);
+					T8 = BYTW(&(W[TWVL * 38]), T7);
+					Tb = BYTW(&(W[TWVL * 54]), Ta);
+					Tt = BYTW(&(W[TWVL * 58]), Ts);
+					Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T4 = VSUB(T1, T3);
+					T2m = VADD(T1, T3);
+					T1z = VSUB(T1w, T1y);
+					T2n = VADD(T1w, T1y);
+					T9 = VSUB(T6, T8);
+					T2p = VADD(T6, T8);
+					Te = VSUB(Tb, Td);
+					T2q = VADD(Tb, Td);
+					TA = BYTW(&(W[TWVL * 10]), Tz);
+				   }
+				   Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   T2o = VADD(T2m, T2n);
+				   T32 = VSUB(T2m, T2n);
+				   Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      }
+			 }
+			 {
+			      V Tv, To, Ty, Ti, Tj, Tm, Th;
+			      Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T2r = VADD(T2p, T2q);
+			      T3f = VSUB(T2p, T2q);
+			      Tf = VADD(T9, Te);
+			      T1A = VSUB(T9, Te);
+			      Tv = BYTW(&(W[TWVL * 26]), Tu);
+			      To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      Ty = BYTW(&(W[TWVL * 42]), Tx);
+			      Ti = BYTW(&(W[TWVL * 2]), Th);
+			      Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      {
+				   V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
+				   {
+					V T15, T17, T1o, T1m;
+					{
+					     V Tw, T2M, Tp, T2N, TB, Tk, Tn, T1n, T14, T16;
+					     T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					     T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     Tw = VSUB(Tt, Tv);
+					     T2M = VADD(Tt, Tv);
+					     Tp = BYTW(&(W[TWVL * 50]), To);
+					     T2N = VADD(TA, Ty);
+					     TB = VSUB(Ty, TA);
+					     Tk = BYTW(&(W[TWVL * 34]), Tj);
+					     Tn = BYTW(&(W[TWVL * 18]), Tm);
+					     T15 = BYTW(&(W[TWVL * 60]), T14);
+					     T17 = BYTW(&(W[TWVL * 28]), T16);
+					     T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     {
+						  V T2J, Tl, T2K, Tq, T1l;
+						  T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						  T34 = VSUB(T2M, T2N);
+						  T2O = VADD(T2M, T2N);
+						  T1D = VFMA(LDK(KP414213562), Tw, TB);
+						  TC = VFNMS(LDK(KP414213562), TB, Tw);
+						  T2J = VADD(Ti, Tk);
+						  Tl = VSUB(Ti, Tk);
+						  T2K = VADD(Tn, Tp);
+						  Tq = VSUB(Tn, Tp);
+						  T1o = BYTW(&(W[TWVL * 12]), T1n);
+						  T1m = BYTW(&(W[TWVL * 44]), T1l);
+						  {
+						       V T1e, T1g, T19, T1b;
+						       T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						       T33 = VSUB(T2J, T2K);
+						       T2L = VADD(T2J, T2K);
+						       T1C = VFMA(LDK(KP414213562), Tl, Tq);
+						       Tr = VFNMS(LDK(KP414213562), Tq, Tl);
+						       T1f = BYTW(&(W[TWVL * 52]), T1e);
+						       T1h = BYTW(&(W[TWVL * 20]), T1g);
+						       T1a = BYTW(&(W[TWVL * 4]), T19);
+						       T1c = BYTW(&(W[TWVL * 36]), T1b);
+						  }
+					     }
+					}
+					T18 = VSUB(T15, T17);
+					T2A = VADD(T15, T17);
+					T2B = VADD(T1o, T1m);
+					T1p = VSUB(T1m, T1o);
+				   }
+				   {
+					V TG, TI, TZ, TX;
+					{
+					     V T1i, T2E, T1d, T2D, TH, TY, TF;
+					     TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1i = VSUB(T1f, T1h);
+					     T2E = VADD(T1f, T1h);
+					     T1d = VSUB(T1a, T1c);
+					     T2D = VADD(T1a, T1c);
+					     TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					     T2C = VADD(T2A, T2B);
+					     T3a = VSUB(T2A, T2B);
+					     TG = BYTW(&(W[0]), TF);
+					     {
+						  V TW, T1j, T1q, TP, TR, TK;
+						  TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  T2F = VADD(T2D, T2E);
+						  T3b = VSUB(T2E, T2D);
+						  T1j = VADD(T1d, T1i);
+						  T1q = VSUB(T1i, T1d);
+						  TI = BYTW(&(W[TWVL * 32]), TH);
+						  TZ = BYTW(&(W[TWVL * 48]), TY);
+						  TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						  TX = BYTW(&(W[TWVL * 16]), TW);
+						  TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T1r = VFMA(LDK(KP707106781), T1q, T1p);
+						  T21 = VFNMS(LDK(KP707106781), T1q, T1p);
+						  T1k = VFMA(LDK(KP707106781), T1j, T18);
+						  T20 = VFNMS(LDK(KP707106781), T1j, T18);
+						  TQ = BYTW(&(W[TWVL * 56]), TP);
+						  TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+						  TS = BYTW(&(W[TWVL * 24]), TR);
+						  TL = BYTW(&(W[TWVL * 8]), TK);
+					     }
+					}
+					T2t = VADD(TG, TI);
+					TJ = VSUB(TG, TI);
+					T10 = VSUB(TX, TZ);
+					T2u = VADD(TX, TZ);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
+			 T2s = VSUB(T2o, T2r);
+			 T2U = VADD(T2o, T2r);
+			 TN = BYTW(&(W[TWVL * 40]), TM);
+			 TT = VSUB(TQ, TS);
+			 T2x = VADD(TQ, TS);
+			 T2P = VSUB(T2L, T2O);
+			 T2V = VADD(T2L, T2O);
+			 T2Y = VADD(T2C, T2F);
+			 T2G = VSUB(T2C, T2F);
+			 T37 = VSUB(T2t, T2u);
+			 T2v = VADD(T2t, T2u);
+			 T2w = VADD(TL, TN);
+			 TO = VSUB(TL, TN);
+			 T2W = VSUB(T2U, T2V);
+			 T30 = VADD(T2U, T2V);
+			 {
+			      V T1Y, T12, T1X, TV, T3n, T3t, T3m, T3q;
+			      {
+				   V T3o, T36, T3r, T3h, T3k, T3p, T3d, T3s, T2H, T2Q, T2Z, T31;
+				   {
+					V T35, T3g, T38, T2y, T11, TU, T3c, T3j;
+					T35 = VADD(T33, T34);
+					T3g = VSUB(T33, T34);
+					T38 = VSUB(T2w, T2x);
+					T2y = VADD(T2w, T2x);
+					T11 = VSUB(TO, TT);
+					TU = VADD(TO, TT);
+					T3c = VFNMS(LDK(KP414213562), T3b, T3a);
+					T3j = VFMA(LDK(KP414213562), T3a, T3b);
+					T3o = VFNMS(LDK(KP707106781), T35, T32);
+					T36 = VFMA(LDK(KP707106781), T35, T32);
+					T3r = VFNMS(LDK(KP707106781), T3g, T3f);
+					T3h = VFMA(LDK(KP707106781), T3g, T3f);
+					{
+					     V T3i, T39, T2z, T2X;
+					     T3i = VFMA(LDK(KP414213562), T37, T38);
+					     T39 = VFNMS(LDK(KP414213562), T38, T37);
+					     T2z = VSUB(T2v, T2y);
+					     T2X = VADD(T2v, T2y);
+					     T1Y = VFNMS(LDK(KP707106781), T11, T10);
+					     T12 = VFMA(LDK(KP707106781), T11, T10);
+					     T1X = VFNMS(LDK(KP707106781), TU, TJ);
+					     TV = VFMA(LDK(KP707106781), TU, TJ);
+					     T3k = VSUB(T3i, T3j);
+					     T3p = VADD(T3i, T3j);
+					     T3d = VADD(T39, T3c);
+					     T3s = VSUB(T39, T3c);
+					     T2H = VADD(T2z, T2G);
+					     T2Q = VSUB(T2z, T2G);
+					     T2Z = VSUB(T2X, T2Y);
+					     T31 = VADD(T2X, T2Y);
+					}
+				   }
+				   {
+					V T3v, T3u, T3l, T3e;
+					T3l = VFNMS(LDK(KP923879532), T3k, T3h);
+					T3n = VFMA(LDK(KP923879532), T3k, T3h);
+					T3t = VFMA(LDK(KP923879532), T3s, T3r);
+					T3v = VFNMS(LDK(KP923879532), T3s, T3r);
+					T3e = VFNMS(LDK(KP923879532), T3d, T36);
+					T3m = VFMA(LDK(KP923879532), T3d, T36);
+					{
+					     V T2R, T2T, T2I, T2S;
+					     T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
+					     T2T = VFMA(LDK(KP707106781), T2Q, T2P);
+					     T2I = VFNMS(LDK(KP707106781), T2H, T2s);
+					     T2S = VFMA(LDK(KP707106781), T2H, T2s);
+					     ST(&(x[WS(rs, 16)]), VSUB(T30, T31), ms, &(x[0]));
+					     ST(&(x[0]), VADD(T30, T31), ms, &(x[0]));
+					     ST(&(x[WS(rs, 8)]), VFMAI(T2Z, T2W), ms, &(x[0]));
+					     ST(&(x[WS(rs, 24)]), VFNMSI(T2Z, T2W), ms, &(x[0]));
+					     T3q = VFNMS(LDK(KP923879532), T3p, T3o);
+					     T3u = VFMA(LDK(KP923879532), T3p, T3o);
+					     ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
+					     ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
+					     ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
+					     ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
+					     ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
+					}
+					ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T1U, T13, T1s, TE, T1M, T1I, T1N, T1B, T1V, T1E;
+				   {
+					V Tg, TD, T1G, T1H;
+					Tg = VFMA(LDK(KP707106781), Tf, T4);
+					T1U = VFNMS(LDK(KP707106781), Tf, T4);
+					T26 = VSUB(Tr, TC);
+					TD = VADD(Tr, TC);
+					T1G = VFMA(LDK(KP198912367), TV, T12);
+					T13 = VFNMS(LDK(KP198912367), T12, TV);
+					T1s = VFNMS(LDK(KP198912367), T1r, T1k);
+					T1H = VFMA(LDK(KP198912367), T1k, T1r);
+					ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
+					TE = VFMA(LDK(KP923879532), TD, Tg);
+					T1M = VFNMS(LDK(KP923879532), TD, Tg);
+					T1I = VSUB(T1G, T1H);
+					T1N = VADD(T1G, T1H);
+					T1B = VFMA(LDK(KP707106781), T1A, T1z);
+					T25 = VFNMS(LDK(KP707106781), T1A, T1z);
+					T1V = VADD(T1C, T1D);
+					T1E = VSUB(T1C, T1D);
+				   }
+				   {
+					V T1W, T2e, T2f, T23;
+					{
+					     V T28, T1Z, T1S, T1O, T1t, T1Q, T1F, T1P, T22, T29;
+					     T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
+					     T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
+					     T1S = VFMA(LDK(KP980785280), T1N, T1M);
+					     T1O = VFNMS(LDK(KP980785280), T1N, T1M);
+					     T1t = VADD(T13, T1s);
+					     T1Q = VSUB(T13, T1s);
+					     T1F = VFMA(LDK(KP923879532), T1E, T1B);
+					     T1P = VFNMS(LDK(KP923879532), T1E, T1B);
+					     T1W = VFMA(LDK(KP923879532), T1V, T1U);
+					     T2e = VFNMS(LDK(KP923879532), T1V, T1U);
+					     T22 = VFMA(LDK(KP668178637), T21, T20);
+					     T29 = VFNMS(LDK(KP668178637), T20, T21);
+					     {
+						  V T1K, T1u, T1R, T1T, T1L, T1J;
+						  T1K = VFMA(LDK(KP980785280), T1t, TE);
+						  T1u = VFNMS(LDK(KP980785280), T1t, TE);
+						  T1R = VFMA(LDK(KP980785280), T1Q, T1P);
+						  T1T = VFNMS(LDK(KP980785280), T1Q, T1P);
+						  T1L = VFMA(LDK(KP980785280), T1I, T1F);
+						  T1J = VFNMS(LDK(KP980785280), T1I, T1F);
+						  T2f = VADD(T28, T29);
+						  T2a = VSUB(T28, T29);
+						  T23 = VADD(T1Z, T22);
+						  T2i = VSUB(T1Z, T22);
+						  ST(&(x[WS(rs, 23)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 9)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 25)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 7)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 1)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 31)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 17)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 15)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+					     }
+					}
+					T24 = VFNMS(LDK(KP831469612), T23, T1W);
+					T2c = VFMA(LDK(KP831469612), T23, T1W);
+					T2g = VFMA(LDK(KP831469612), T2f, T2e);
+					T2k = VFNMS(LDK(KP831469612), T2f, T2e);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T2h = VFMA(LDK(KP923879532), T26, T25);
+	       T27 = VFNMS(LDK(KP923879532), T26, T25);
+	       {
+		    V T2j, T2l, T2d, T2b;
+		    T2j = VFNMS(LDK(KP831469612), T2i, T2h);
+		    T2l = VFMA(LDK(KP831469612), T2i, T2h);
+		    T2d = VFMA(LDK(KP831469612), T2a, T27);
+		    T2b = VFNMS(LDK(KP831469612), T2a, T27);
+		    ST(&(x[WS(rs, 21)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 27)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 29)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1bv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1bv_32 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 217 FP additions, 104 FP multiplications,
+ * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
+ * 59 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D;
+	       V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y;
+	       V T2o, T36;
+	       {
+		    V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T1B = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+		    T1C = BYTW(&(W[TWVL * 46]), T1B);
+		    T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 30]), T2);
+		    T1z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T1A = BYTW(&(W[TWVL * 14]), T1z);
+		    T4 = VSUB(T1, T3);
+		    T1D = VSUB(T1A, T1C);
+		    T2N = VADD(T1, T3);
+		    T2O = VADD(T1A, T1C);
+		    T2P = VSUB(T2N, T2O);
+		    T3h = VADD(T2N, T2O);
+	       }
+	       {
+		    V T6, Td, T8, Tb;
+		    {
+			 V T5, Tc, T7, Ta;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 6]), T5);
+			 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 22]), Tc);
+			 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 38]), T7);
+			 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 54]), Ta);
+		    }
+		    {
+			 V T9, Te, T2I, T2J;
+			 T9 = VSUB(T6, T8);
+			 Te = VSUB(Tb, Td);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 T1y = VMUL(LDK(KP707106781), VSUB(T9, Te));
+			 T2I = VADD(T6, T8);
+			 T2J = VADD(Tb, Td);
+			 T2K = VSUB(T2I, T2J);
+			 T3i = VADD(T2I, T2J);
+		    }
+	       }
+	       {
+		    V Tt, TA, Tv, Ty;
+		    {
+			 V Ts, Tz, Tu, Tx;
+			 Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 10]), Ts);
+			 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TA = BYTW(&(W[TWVL * 26]), Tz);
+			 Tu = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 Tv = BYTW(&(W[TWVL * 42]), Tu);
+			 Tx = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 Ty = BYTW(&(W[TWVL * 58]), Tx);
+		    }
+		    {
+			 V Tw, TB, T2E, T2F;
+			 Tw = VSUB(Tt, Tv);
+			 TB = VSUB(Ty, TA);
+			 TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
+			 T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T2E = VADD(Ty, TA);
+			 T2F = VADD(Tt, Tv);
+			 T2G = VSUB(T2E, T2F);
+			 T3e = VADD(T2E, T2F);
+		    }
+	       }
+	       {
+		    V Ti, Tp, Tk, Tn;
+		    {
+			 V Th, To, Tj, Tm;
+			 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 2]), Th);
+			 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 50]), To);
+			 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tk = BYTW(&(W[TWVL * 34]), Tj);
+			 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tn = BYTW(&(W[TWVL * 18]), Tm);
+		    }
+		    {
+			 V Tl, Tq, T2B, T2C;
+			 Tl = VSUB(Ti, Tk);
+			 Tq = VSUB(Tn, Tp);
+			 Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T2B = VADD(Ti, Tk);
+			 T2C = VADD(Tn, Tp);
+			 T2D = VSUB(T2B, T2C);
+			 T3d = VADD(T2B, T2C);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18;
+		    {
+			 V T1f, T1h, T1n, T1l;
+			 T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T1g = BYTW(&(W[TWVL * 12]), T1f);
+			 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1i = BYTW(&(W[TWVL * 44]), T1h);
+			 T1n = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T1o = BYTW(&(W[TWVL * 28]), T1n);
+			 T1l = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T1m = BYTW(&(W[TWVL * 60]), T1l);
+			 {
+			      V T19, T1b, T14, T16;
+			      T19 = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			      T1a = BYTW(&(W[TWVL * 52]), T19);
+			      T1b = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      T1c = BYTW(&(W[TWVL * 20]), T1b);
+			      T1d = VSUB(T1a, T1c);
+			      T14 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T15 = BYTW(&(W[TWVL * 4]), T14);
+			      T16 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      T17 = BYTW(&(W[TWVL * 36]), T16);
+			      T18 = VSUB(T15, T17);
+			 }
+		    }
+		    {
+			 V T1e, T1j, T2w, T2x;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T20 = VADD(T1j, T1e);
+			 T2w = VADD(T15, T17);
+			 T2x = VADD(T1a, T1c);
+			 T2y = VSUB(T2w, T2x);
+			 T3a = VADD(T2w, T2x);
+		    }
+		    {
+			 V T1p, T1q, T2t, T2u;
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
+			 T1r = VSUB(T1p, T1q);
+			 T21 = VADD(T1p, T1q);
+			 T2t = VADD(T1m, T1o);
+			 T2u = VADD(T1g, T1i);
+			 T2v = VSUB(T2t, T2u);
+			 T39 = VADD(T2t, T2u);
+		    }
+	       }
+	       {
+		    V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ;
+		    {
+			 V TQ, TS, TY, TW;
+			 TQ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTW(&(W[TWVL * 16]), TQ);
+			 TS = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 TT = BYTW(&(W[TWVL * 48]), TS);
+			 TY = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TZ = BYTW(&(W[TWVL * 32]), TY);
+			 TW = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TX = BYTW(&(W[0]), TW);
+			 {
+			      V TK, TM, TF, TH;
+			      TK = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			      TL = BYTW(&(W[TWVL * 56]), TK);
+			      TM = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      TN = BYTW(&(W[TWVL * 24]), TM);
+			      TO = VSUB(TL, TN);
+			      TF = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      TG = BYTW(&(W[TWVL * 8]), TF);
+			      TH = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      TI = BYTW(&(W[TWVL * 40]), TH);
+			      TJ = VSUB(TG, TI);
+			 }
+		    }
+		    {
+			 V TP, TU, T2p, T2q;
+			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
+			 TU = VSUB(TR, TT);
+			 TV = VSUB(TP, TU);
+			 T1X = VADD(TU, TP);
+			 T2p = VADD(TG, TI);
+			 T2q = VADD(TL, TN);
+			 T2r = VSUB(T2p, T2q);
+			 T37 = VADD(T2p, T2q);
+		    }
+		    {
+			 V T10, T11, T2m, T2n;
+			 T10 = VSUB(TX, TZ);
+			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
+			 T12 = VSUB(T10, T11);
+			 T1Y = VADD(T10, T11);
+			 T2m = VADD(TX, TZ);
+			 T2n = VADD(TR, TT);
+			 T2o = VSUB(T2m, T2n);
+			 T36 = VADD(T2m, T2n);
+		    }
+	       }
+	       {
+		    V T3q, T3u, T3t, T3v;
+		    {
+			 V T3o, T3p, T3r, T3s;
+			 T3o = VADD(T3h, T3i);
+			 T3p = VADD(T3d, T3e);
+			 T3q = VSUB(T3o, T3p);
+			 T3u = VADD(T3o, T3p);
+			 T3r = VADD(T36, T37);
+			 T3s = VADD(T39, T3a);
+			 T3t = VBYI(VSUB(T3r, T3s));
+			 T3v = VADD(T3r, T3s);
+		    }
+		    ST(&(x[WS(rs, 24)]), VSUB(T3q, T3t), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T3u, T3v), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VADD(T3q, T3t), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T3u, T3v), ms, &(x[0]));
+	       }
+	       {
+		    V T3f, T3j, T3c, T3k, T38, T3b;
+		    T3f = VSUB(T3d, T3e);
+		    T3j = VSUB(T3h, T3i);
+		    T38 = VSUB(T36, T37);
+		    T3b = VSUB(T39, T3a);
+		    T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
+		    T3k = VMUL(LDK(KP707106781), VADD(T38, T3b));
+		    {
+			 V T3g, T3l, T3m, T3n;
+			 T3g = VBYI(VSUB(T3c, T3f));
+			 T3l = VSUB(T3j, T3k);
+			 ST(&(x[WS(rs, 12)]), VADD(T3g, T3l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 20)]), VSUB(T3l, T3g), ms, &(x[0]));
+			 T3m = VBYI(VADD(T3f, T3c));
+			 T3n = VADD(T3j, T3k);
+			 ST(&(x[WS(rs, 4)]), VADD(T3m, T3n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VSUB(T3n, T3m), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q;
+		    T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
+		    T2L = VSUB(T2H, T2K);
+		    T31 = VADD(T2K, T2H);
+		    T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G));
+		    T2R = VSUB(T2P, T2Q);
+		    T2Y = VADD(T2P, T2Q);
+		    {
+			 V T2s, T2z, T2S, T2T;
+			 T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o));
+			 T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y));
+			 T2A = VSUB(T2s, T2z);
+			 T2Z = VADD(T2s, T2z);
+			 T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r));
+			 T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y));
+			 T2U = VSUB(T2S, T2T);
+			 T32 = VADD(T2S, T2T);
+		    }
+		    {
+			 V T2M, T2V, T34, T35;
+			 T2M = VBYI(VSUB(T2A, T2L));
+			 T2V = VSUB(T2R, T2U);
+			 ST(&(x[WS(rs, 10)]), VADD(T2M, T2V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T2V, T2M), ms, &(x[0]));
+			 T34 = VSUB(T2Y, T2Z);
+			 T35 = VBYI(VSUB(T32, T31));
+			 ST(&(x[WS(rs, 18)]), VSUB(T34, T35), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T34, T35), ms, &(x[0]));
+		    }
+		    {
+			 V T2W, T2X, T30, T33;
+			 T2W = VBYI(VADD(T2L, T2A));
+			 T2X = VADD(T2R, T2U);
+			 ST(&(x[WS(rs, 6)]), VADD(T2W, T2X), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VSUB(T2X, T2W), ms, &(x[0]));
+			 T30 = VADD(T2Y, T2Z);
+			 T33 = VBYI(VADD(T31, T32));
+			 ST(&(x[WS(rs, 30)]), VSUB(T30, T33), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T30, T33), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
+		    {
+			 V Tg, TD, T1G, T1H;
+			 Tg = VSUB(T4, Tf);
+			 TD = VSUB(Tr, TC);
+			 TE = VSUB(Tg, TD);
+			 T1P = VADD(Tg, TD);
+			 T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
+			 T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
+			 T1I = VSUB(T1G, T1H);
+			 T1Q = VADD(T1G, T1H);
+		    }
+		    {
+			 V T13, T1s, T1x, T1E;
+			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
+			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T1M = VADD(T13, T1s);
+			 T1x = VSUB(T1v, T1w);
+			 T1E = VSUB(T1y, T1D);
+			 T1F = VSUB(T1x, T1E);
+			 T1N = VADD(T1E, T1x);
+		    }
+		    {
+			 V T1u, T1J, T1S, T1T;
+			 T1u = VADD(TE, T1t);
+			 T1J = VBYI(VADD(T1F, T1I));
+			 ST(&(x[WS(rs, 27)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 T1S = VBYI(VADD(T1N, T1M));
+			 T1T = VADD(T1P, T1Q);
+			 ST(&(x[WS(rs, 3)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1K, T1L, T1O, T1R;
+			 T1K = VSUB(TE, T1t);
+			 T1L = VBYI(VSUB(T1I, T1F));
+			 ST(&(x[WS(rs, 21)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 T1O = VBYI(VSUB(T1M, T1N));
+			 T1R = VSUB(T1P, T1Q);
+			 ST(&(x[WS(rs, 13)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
+		    {
+			 V T1U, T1V, T28, T29;
+			 T1U = VADD(T4, Tf);
+			 T1V = VADD(T1v, T1w);
+			 T1W = VSUB(T1U, T1V);
+			 T2h = VADD(T1U, T1V);
+			 T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y));
+			 T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21));
+			 T2a = VSUB(T28, T29);
+			 T2i = VADD(T28, T29);
+		    }
+		    {
+			 V T1Z, T22, T25, T26;
+			 T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y));
+			 T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20));
+			 T23 = VSUB(T1Z, T22);
+			 T2e = VADD(T1Z, T22);
+			 T25 = VADD(Tr, TC);
+			 T26 = VADD(T1D, T1y);
+			 T27 = VSUB(T25, T26);
+			 T2f = VADD(T26, T25);
+		    }
+		    {
+			 V T24, T2b, T2k, T2l;
+			 T24 = VADD(T1W, T23);
+			 T2b = VBYI(VADD(T27, T2a));
+			 ST(&(x[WS(rs, 25)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 T2k = VBYI(VADD(T2f, T2e));
+			 T2l = VADD(T2h, T2i);
+			 ST(&(x[WS(rs, 1)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T2c, T2d, T2g, T2j;
+			 T2c = VSUB(T1W, T23);
+			 T2d = VBYI(VSUB(T2a, T27));
+			 ST(&(x[WS(rs, 23)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 T2g = VBYI(VSUB(T2e, T2f));
+			 T2j = VSUB(T2h, T2i);
+			 ST(&(x[WS(rs, 15)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1bv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1bv_4 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1bv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1bv_4 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 3)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1bv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1bv_5 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTW(&(W[0]), T2);
+		    Ta = BYTW(&(W[TWVL * 4]), T9);
+		    T5 = BYTW(&(W[TWVL * 6]), T4);
+		    T8 = BYTW(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFMAI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFNMSI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1bv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1bv_5 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tf, T5, Ta, Tc, Td, Tg;
+	       Tf = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTW(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 2]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tc = VADD(T2, T4);
+		    Td = VADD(T7, T9);
+		    Tg = VADD(Tc, Td);
+	       }
+	       ST(&(x[0]), VADD(Tf, Tg), ms, &(x[0]));
+	       {
+		    V Tb, Tj, Ti, Tk, Te, Th;
+		    Tb = VBYI(VFMA(LDK(KP951056516), T5, VMUL(LDK(KP587785252), Ta)));
+		    Tj = VBYI(VFNMS(LDK(KP951056516), Ta, VMUL(LDK(KP587785252), T5)));
+		    Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
+		    Th = VFNMS(LDK(KP250000000), Tg, Tf);
+		    Ti = VADD(Te, Th);
+		    Tk = VSUB(Th, Te);
+		    ST(&(x[WS(rs, 1)]), VADD(Tb, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VSUB(Ti, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1bv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1bv_6 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 23 FP additions, 18 FP multiplications,
+ * (or, 17 additions, 12 multiplications, 6 fused multiply/add),
+ * 27 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T1, T2, Ta, Tc, T5, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Tb, Td, T6, T8;
+		    T3 = BYTW(&(W[TWVL * 4]), T2);
+		    Tb = BYTW(&(W[TWVL * 6]), Ta);
+		    Td = BYTW(&(W[0]), Tc);
+		    T6 = BYTW(&(W[TWVL * 2]), T5);
+		    T8 = BYTW(&(W[TWVL * 8]), T7);
+		    {
+			 V Ti, T4, Tk, Te, Tj, T9;
+			 Ti = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tk = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tj = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 {
+			      V Tl, Tn, Tf, Th, Tm, Tg;
+			      Tl = VADD(Tj, Tk);
+			      Tn = VMUL(LDK(KP866025403), VSUB(Tj, Tk));
+			      Tf = VADD(T9, Te);
+			      Th = VMUL(LDK(KP866025403), VSUB(T9, Te));
+			      ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+			      Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+			      ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+			      Tg = VFNMS(LDK(KP500000000), Tf, T4);
+			      ST(&(x[WS(rs, 4)]), VFMAI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFNMSI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1bv_6"), twinstr, &GENUS, {17, 12, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1bv_6 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 23 FP additions, 14 FP multiplications,
+ * (or, 21 additions, 12 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V Tf, Ti, Ta, Tk, T5, Tj, Tc, Te, Td;
+	       Tc = LD(&(x[0]), ms, &(x[0]));
+	       Td = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Te = BYTW(&(W[TWVL * 4]), Td);
+	       Tf = VSUB(Tc, Te);
+	       Ti = VADD(Tc, Te);
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T7 = BYTW(&(W[TWVL * 6]), T6);
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[0]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tk = VADD(T7, T9);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T2 = BYTW(&(W[TWVL * 2]), T1);
+		    T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T4 = BYTW(&(W[TWVL * 8]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tj = VADD(T2, T4);
+	       }
+	       {
+		    V Tb, Tg, Th, Tn, Tl, Tm;
+		    Tb = VBYI(VMUL(LDK(KP866025403), VSUB(T5, Ta)));
+		    Tg = VADD(T5, Ta);
+		    Th = VFNMS(LDK(KP500000000), Tg, Tf);
+		    ST(&(x[WS(rs, 1)]), VADD(Tb, Th), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(Tf, Tg), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VSUB(Th, Tb), ms, &(x[WS(rs, 1)]));
+		    Tn = VBYI(VMUL(LDK(KP866025403), VSUB(Tj, Tk)));
+		    Tl = VADD(Tj, Tk);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+		    ST(&(x[WS(rs, 2)]), VSUB(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(Tn, Tm), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1bv_6"), twinstr, &GENUS, {21, 12, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1877 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:05 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1bv_64 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 519 FP additions, 384 FP multiplications,
+ * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
+ * 187 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
+	       V T6E;
+	       {
+		    V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
+		    V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3b;
+		    V T4m, T4C, T7e, T5l, T7d, T5o, T3a, TV, T4B, T4j, T3X, T3Y, T6o, T7b, T5f;
+		    V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
+		    V T1S, T2q, T2u, T2w;
+		    {
+			 V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
+			 {
+			      V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
+			      {
+				   V T1, T2, T7, T5, T32, T34, T2X, T2Z;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+				   T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T32 = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+				   T34 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+				   T2X = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   T2Z = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+				   {
+					V T1m, T54, T6j, T36, T56, T31, T55, T1n, T1q, T1s, T4, T9;
+					{
+					     V T3, T8, T6, T33, T35, T2Y, T30, T1l;
+					     T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T3 = BYTW(&(W[TWVL * 62]), T2);
+					     T8 = BYTW(&(W[TWVL * 94]), T7);
+					     T6 = BYTW(&(W[TWVL * 30]), T5);
+					     T33 = BYTW(&(W[TWVL * 110]), T32);
+					     T35 = BYTW(&(W[TWVL * 46]), T34);
+					     T2Y = BYTW(&(W[TWVL * 14]), T2X);
+					     T30 = BYTW(&(W[TWVL * 78]), T2Z);
+					     T1m = BYTW(&(W[0]), T1l);
+					     T54 = VSUB(T1, T3);
+					     T4 = VADD(T1, T3);
+					     T6j = VSUB(T6, T8);
+					     T9 = VADD(T6, T8);
+					     T36 = VADD(T33, T35);
+					     T56 = VSUB(T33, T35);
+					     T31 = VADD(T2Y, T30);
+					     T55 = VSUB(T2Y, T30);
+					     T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+					}
+					T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+					Ta = VSUB(T4, T9);
+					T3U = VADD(T4, T9);
+					{
+					     V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
+					     T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     T3V = VADD(T31, T36);
+					     T37 = VSUB(T31, T36);
+					     T57 = VADD(T55, T56);
+					     T6k = VSUB(T55, T56);
+					     T1o = BYTW(&(W[TWVL * 64]), T1n);
+					     T1r = BYTW(&(W[TWVL * 32]), T1q);
+					     T1t = BYTW(&(W[TWVL * 96]), T1s);
+					     T1V = BYTW(&(W[TWVL * 16]), T1U);
+					     T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+					     T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+					     T7a = VFNMS(LDK(KP707106781), T57, T54);
+					     T58 = VFMA(LDK(KP707106781), T57, T54);
+					     T7B = VFNMS(LDK(KP707106781), T6k, T6j);
+					     T6l = VFMA(LDK(KP707106781), T6k, T6j);
+					     T1p = VADD(T1m, T1o);
+					     T5B = VSUB(T1m, T1o);
+					     T5O = VSUB(T1r, T1t);
+					     T1u = VADD(T1r, T1t);
+					     T1X = BYTW(&(W[TWVL * 80]), T1W);
+					     T20 = BYTW(&(W[TWVL * 112]), T1Z);
+					     T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			      {
+				   V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
+				   {
+					V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
+					T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+					T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					T1v = VSUB(T1p, T1u);
+					T41 = VADD(T1p, T1u);
+					T1Y = VADD(T1V, T1X);
+					T5C = VSUB(T1V, T1X);
+					T22 = BYTW(&(W[TWVL * 48]), T21);
+					T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T29 = BYTW(&(W[TWVL * 124]), T28);
+					T2b = BYTW(&(W[TWVL * 60]), T2a);
+					T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+					T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+					T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T23, T5D, T2e, T2g, T2I, T2K, T2M;
+					     T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T23 = VADD(T20, T22);
+					     T5D = VSUB(T20, T22);
+					     T2e = BYTW(&(W[TWVL * 28]), T2d);
+					     T2c = VADD(T29, T2b);
+					     T5W = VSUB(T29, T2b);
+					     T2g = BYTW(&(W[TWVL * 92]), T2f);
+					     T2I = BYTW(&(W[TWVL * 108]), T2H);
+					     T2K = BYTW(&(W[TWVL * 44]), T2J);
+					     T2N = BYTW(&(W[TWVL * 12]), T2M);
+					     {
+						  V T5E, T5P, T42, T2O;
+						  T5E = VADD(T5C, T5D);
+						  T5P = VSUB(T5C, T5D);
+						  T24 = VSUB(T1Y, T23);
+						  T42 = VADD(T1Y, T23);
+						  T69 = VSUB(T2g, T2e);
+						  T2h = VADD(T2e, T2g);
+						  T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+						  T2L = VADD(T2I, T2K);
+						  T5Y = VSUB(T2I, T2K);
+						  T5Q = VFMA(LDK(KP707106781), T5P, T5O);
+						  T7o = VFNMS(LDK(KP707106781), T5P, T5O);
+						  T5F = VFMA(LDK(KP707106781), T5E, T5B);
+						  T7l = VFNMS(LDK(KP707106781), T5E, T5B);
+						  T43 = VADD(T41, T42);
+						  T4F = VSUB(T41, T42);
+						  T2P = BYTW(&(W[TWVL * 76]), T2O);
+					     }
+					}
+				   }
+				   T2i = VSUB(T2c, T2h);
+				   T48 = VADD(T2c, T2h);
+				   {
+					V TW, TY, T11, T2Q, T5X, T13;
+					TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+					TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T2Q = VADD(T2N, T2P);
+					T5X = VSUB(T2N, T2P);
+					T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+					{
+					     V T12, T5Z, T6a, T49, T14, T18, T1a;
+					     {
+						  V T17, T19, TX, TZ;
+						  T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+						  T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  TX = BYTW(&(W[TWVL * 122]), TW);
+						  TZ = BYTW(&(W[TWVL * 58]), TY);
+						  T12 = BYTW(&(W[TWVL * 26]), T11);
+						  T5Z = VADD(T5X, T5Y);
+						  T6a = VSUB(T5Y, T5X);
+						  T2R = VSUB(T2L, T2Q);
+						  T49 = VADD(T2Q, T2L);
+						  T14 = BYTW(&(W[TWVL * 90]), T13);
+						  T18 = BYTW(&(W[TWVL * 106]), T17);
+						  T5q = VSUB(TX, TZ);
+						  T10 = VADD(TX, TZ);
+						  T1a = BYTW(&(W[TWVL * 42]), T19);
+					     }
+					     T6b = VFMA(LDK(KP707106781), T6a, T69);
+					     T7v = VFNMS(LDK(KP707106781), T6a, T69);
+					     T60 = VFMA(LDK(KP707106781), T5Z, T5W);
+					     T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
+					     T4a = VADD(T48, T49);
+					     T4I = VSUB(T48, T49);
+					     T5v = VSUB(T14, T12);
+					     T15 = VADD(T12, T14);
+					     T1b = VADD(T18, T1a);
+					     T5s = VSUB(T18, T1a);
+					}
+					T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 {
+			      V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
+			      {
+				   V T5h, TQ, T5m, T5i, TO, TS, TJ, T4h, TD, TI;
+				   {
+					V T4k, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
+					Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					T4k = VADD(T10, T15);
+					T16 = VSUB(T10, T15);
+					TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+					T1d = BYTW(&(W[TWVL * 10]), T1c);
+					T1f = BYTW(&(W[TWVL * 74]), T1e);
+					TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+					TA = BYTW(&(W[TWVL * 2]), Tz);
+					TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+					TC = BYTW(&(W[TWVL * 66]), TB);
+					{
+					     V T1g, T5r, TF, TH, TL, TN, TP;
+					     TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+					     T1g = VADD(T1d, T1f);
+					     T5r = VSUB(T1d, T1f);
+					     TF = BYTW(&(W[TWVL * 34]), TE);
+					     TH = BYTW(&(W[TWVL * 98]), TG);
+					     TL = BYTW(&(W[TWVL * 18]), TK);
+					     TN = BYTW(&(W[TWVL * 82]), TM);
+					     T5h = VSUB(TA, TC);
+					     TD = VADD(TA, TC);
+					     TQ = BYTW(&(W[TWVL * 114]), TP);
+					     {
+						  V T5w, T5t, T4l, T1h, TR;
+						  T5w = VSUB(T5s, T5r);
+						  T5t = VADD(T5r, T5s);
+						  T4l = VADD(T1g, T1b);
+						  T1h = VSUB(T1b, T1g);
+						  T5m = VSUB(TF, TH);
+						  TI = VADD(TF, TH);
+						  T5i = VSUB(TL, TN);
+						  TO = VADD(TL, TN);
+						  TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+						  T5u = VFMA(LDK(KP707106781), T5t, T5q);
+						  T7h = VFNMS(LDK(KP707106781), T5t, T5q);
+						  T5x = VFMA(LDK(KP707106781), T5w, T5v);
+						  T7g = VFNMS(LDK(KP707106781), T5w, T5v);
+						  T1i = VFNMS(LDK(KP414213562), T1h, T16);
+						  T3b = VFMA(LDK(KP414213562), T16, T1h);
+						  T4m = VADD(T4k, T4l);
+						  T4C = VSUB(T4k, T4l);
+						  TS = BYTW(&(W[TWVL * 50]), TR);
+					     }
+					}
+				   }
+				   TJ = VSUB(TD, TI);
+				   T4h = VADD(TD, TI);
+				   {
+					V Tb, Td, Tr, T5j, TT, Tt, Tg;
+					Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+					Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					T5j = VSUB(TQ, TS);
+					TT = VADD(TQ, TS);
+					Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+					Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					{
+					     V Ti, Tc, Te, Ts;
+					     Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+					     Tc = BYTW(&(W[TWVL * 6]), Tb);
+					     Te = BYTW(&(W[TWVL * 70]), Td);
+					     Ts = BYTW(&(W[TWVL * 22]), Tr);
+					     {
+						  V T5k, T5n, TU, T4i, Tu;
+						  T5k = VADD(T5i, T5j);
+						  T5n = VSUB(T5i, T5j);
+						  TU = VSUB(TO, TT);
+						  T4i = VADD(TO, TT);
+						  Tu = BYTW(&(W[TWVL * 86]), Tt);
+						  Th = BYTW(&(W[TWVL * 38]), Tg);
+						  T59 = VSUB(Tc, Te);
+						  Tf = VADD(Tc, Te);
+						  T7e = VFNMS(LDK(KP707106781), T5k, T5h);
+						  T5l = VFMA(LDK(KP707106781), T5k, T5h);
+						  T7d = VFNMS(LDK(KP707106781), T5n, T5m);
+						  T5o = VFMA(LDK(KP707106781), T5n, T5m);
+						  T3a = VFMA(LDK(KP414213562), TJ, TU);
+						  TV = VFNMS(LDK(KP414213562), TU, TJ);
+						  T4B = VSUB(T4h, T4i);
+						  T4j = VADD(T4h, T4i);
+						  Tv = VADD(Ts, Tu);
+						  T5d = VSUB(Tu, Ts);
+						  Tj = BYTW(&(W[TWVL * 102]), Ti);
+					     }
+					}
+					Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+					To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
+				   {
+					V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
+					T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+					T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+					T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V Tk, T5a, Tn, Tp;
+					     Tk = VADD(Th, Tj);
+					     T5a = VSUB(Th, Tj);
+					     Tn = BYTW(&(W[TWVL * 118]), Tm);
+					     Tp = BYTW(&(W[TWVL * 54]), To);
+					     {
+						  V T1x, T1z, T1N, T1P;
+						  T1x = BYTW(&(W[TWVL * 8]), T1w);
+						  T1z = BYTW(&(W[TWVL * 72]), T1y);
+						  T1N = BYTW(&(W[TWVL * 24]), T1M);
+						  T1P = BYTW(&(W[TWVL * 88]), T1O);
+						  T5b = VFNMS(LDK(KP414213562), T5a, T59);
+						  T6m = VFMA(LDK(KP414213562), T59, T5a);
+						  T3X = VADD(Tf, Tk);
+						  Tl = VSUB(Tf, Tk);
+						  Tq = VADD(Tn, Tp);
+						  T5c = VSUB(Tn, Tp);
+						  T1A = VADD(T1x, T1z);
+						  T5G = VSUB(T1x, T1z);
+						  T1Q = VADD(T1N, T1P);
+						  T5K = VSUB(T1N, T1P);
+						  T1C = BYTW(&(W[TWVL * 40]), T1B);
+					     }
+					}
+					T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+					T5e = VFNMS(LDK(KP414213562), T5d, T5c);
+					T6n = VFMA(LDK(KP414213562), T5c, T5d);
+					T3Y = VADD(Tq, Tv);
+					Tw = VSUB(Tq, Tv);
+					T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+					T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
+					T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1E = BYTW(&(W[TWVL * 104]), T1D);
+					T6o = VSUB(T6m, T6n);
+					T7b = VADD(T6m, T6n);
+					T5f = VADD(T5b, T5e);
+					T7C = VSUB(T5b, T5e);
+					Tx = VADD(Tl, Tw);
+					T38 = VSUB(Tl, Tw);
+					T1I = BYTW(&(W[TWVL * 120]), T1H);
+					T1K = BYTW(&(W[TWVL * 56]), T1J);
+					T1F = VADD(T1C, T1E);
+					T5H = VSUB(T1C, T1E);
+					T2k = BYTW(&(W[TWVL * 4]), T2j);
+					T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+					T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
+					     T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     T5J = VSUB(T1I, T1K);
+					     T1L = VADD(T1I, T1K);
+					     T5I = VFNMS(LDK(KP414213562), T5H, T5G);
+					     T5R = VFMA(LDK(KP414213562), T5G, T5H);
+					     T44 = VADD(T1A, T1F);
+					     T1G = VSUB(T1A, T1F);
+					     T2m = BYTW(&(W[TWVL * 68]), T2l);
+					     T2A = BYTW(&(W[TWVL * 20]), T2z);
+					     T2C = BYTW(&(W[TWVL * 84]), T2B);
+					     T5S = VFNMS(LDK(KP414213562), T5J, T5K);
+					     T5L = VFMA(LDK(KP414213562), T5K, T5J);
+					     T1R = VSUB(T1L, T1Q);
+					     T45 = VADD(T1L, T1Q);
+					     T2p = BYTW(&(W[TWVL * 36]), T2o);
+					     T61 = VSUB(T2k, T2m);
+					     T2n = VADD(T2k, T2m);
+					     T65 = VSUB(T2C, T2A);
+					     T2D = VADD(T2A, T2C);
+					     T7p = VSUB(T5I, T5L);
+					     T5M = VADD(T5I, T5L);
+					     T7m = VSUB(T5R, T5S);
+					     T5T = VADD(T5R, T5S);
+					     T4G = VSUB(T44, T45);
+					     T46 = VADD(T44, T45);
+					     T25 = VSUB(T1G, T1R);
+					     T1S = VADD(T1G, T1R);
+					     T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+					}
+					T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+					T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
+			 V T26, T2G, T3y, T3z, T2T;
+			 {
+			      V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
+			      T4A = VSUB(T3U, T3V);
+			      T3W = VADD(T3U, T3V);
+			      T3Z = VADD(T3X, T3Y);
+			      T4N = VSUB(T3X, T3Y);
+			      T47 = VSUB(T43, T46);
+			      T4v = VADD(T43, T46);
+			      T2r = BYTW(&(W[TWVL * 100]), T2q);
+			      T2v = BYTW(&(W[TWVL * 116]), T2u);
+			      T2x = BYTW(&(W[TWVL * 52]), T2w);
+			      T4s = VADD(T3W, T3Z);
+			      T40 = VSUB(T3W, T3Z);
+			      {
+				   V T4O, T4n, T4Q, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
+				   {
+					V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
+					{
+					     V T4D, T62, T2s, T64, T2y, T4t;
+					     T4O = VSUB(T4B, T4C);
+					     T4D = VADD(T4B, T4C);
+					     T62 = VSUB(T2r, T2p);
+					     T2s = VADD(T2p, T2r);
+					     T64 = VSUB(T2v, T2x);
+					     T2y = VADD(T2v, T2x);
+					     T4t = VADD(T4j, T4m);
+					     T4n = VSUB(T4j, T4m);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4W = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T6c = VFNMS(LDK(KP414213562), T61, T62);
+					     T63 = VFMA(LDK(KP414213562), T62, T61);
+					     T2t = VSUB(T2n, T2s);
+					     T4b = VADD(T2n, T2s);
+					     T6d = VFMA(LDK(KP414213562), T64, T65);
+					     T66 = VFNMS(LDK(KP414213562), T65, T64);
+					     T2E = VSUB(T2y, T2D);
+					     T4c = VADD(T2y, T2D);
+					     T4u = VSUB(T4s, T4t);
+					     T4y = VADD(T4s, T4t);
+					}
+					T67 = VADD(T63, T66);
+					T7w = VSUB(T66, T63);
+					T6e = VADD(T6c, T6d);
+					T7t = VSUB(T6d, T6c);
+					T4d = VADD(T4b, T4c);
+					T4J = VSUB(T4c, T4b);
+					T2F = VADD(T2t, T2E);
+					T2S = VSUB(T2E, T2t);
+				   }
+				   {
+					V Ty, T1j, T4R, T4K;
+					Ty = VFMA(LDK(KP707106781), Tx, Ta);
+					T3s = VFNMS(LDK(KP707106781), Tx, Ta);
+					T3E = VSUB(TV, T1i);
+					T1j = VADD(TV, T1i);
+					T39 = VFMA(LDK(KP707106781), T38, T37);
+					T3D = VFNMS(LDK(KP707106781), T38, T37);
+					T4R = VFMA(LDK(KP414213562), T4I, T4J);
+					T4K = VFNMS(LDK(KP414213562), T4J, T4I);
+					{
+					     V T4w, T4e, T4P, T4Z;
+					     T4w = VADD(T4a, T4d);
+					     T4e = VSUB(T4a, T4d);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T1k = VFMA(LDK(KP923879532), T1j, Ty);
+					     T3k = VFNMS(LDK(KP923879532), T1j, Ty);
+					     {
+						  V T4L, T50, T4S, T4X;
+						  T4L = VADD(T4H, T4K);
+						  T50 = VSUB(T4H, T4K);
+						  T4S = VSUB(T4Q, T4R);
+						  T4X = VADD(T4Q, T4R);
+						  {
+						       V T4f, T4o, T4x, T4z;
+						       T4f = VADD(T47, T4e);
+						       T4o = VSUB(T47, T4e);
+						       T4x = VSUB(T4v, T4w);
+						       T4z = VADD(T4v, T4w);
+						       {
+							    V T53, T51, T4M, T4U;
+							    T53 = VFNMS(LDK(KP923879532), T50, T4Z);
+							    T51 = VFMA(LDK(KP923879532), T50, T4Z);
+							    T4M = VFNMS(LDK(KP923879532), T4L, T4E);
+							    T4U = VFMA(LDK(KP923879532), T4L, T4E);
+							    {
+								 V T52, T4Y, T4T, T4V;
+								 T52 = VFMA(LDK(KP923879532), T4X, T4W);
+								 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
+								 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
+								 T4V = VFMA(LDK(KP923879532), T4S, T4P);
+								 {
+								      V T4p, T4r, T4g, T4q;
+								      T4p = VFNMS(LDK(KP707106781), T4o, T4n);
+								      T4r = VFMA(LDK(KP707106781), T4o, T4n);
+								      T4g = VFNMS(LDK(KP707106781), T4f, T40);
+								      T4q = VFMA(LDK(KP707106781), T4f, T40);
+								      ST(&(x[0]), VADD(T4y, T4z), ms, &(x[0]));
+								      ST(&(x[WS(rs, 32)]), VSUB(T4y, T4z), ms, &(x[0]));
+								      ST(&(x[WS(rs, 16)]), VFMAI(T4x, T4u), ms, &(x[0]));
+								      ST(&(x[WS(rs, 48)]), VFNMSI(T4x, T4u), ms, &(x[0]));
+								      ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
+								      ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
+								      T3t = VADD(T3a, T3b);
+								      T3c = VSUB(T3a, T3b);
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T1T = VFMA(LDK(KP707106781), T1S, T1v);
+					T3v = VFNMS(LDK(KP707106781), T1S, T1v);
+					T3w = VFNMS(LDK(KP707106781), T25, T24);
+					T26 = VFMA(LDK(KP707106781), T25, T24);
+					T2G = VFMA(LDK(KP707106781), T2F, T2i);
+					T3y = VFNMS(LDK(KP707106781), T2F, T2i);
+					T3z = VFNMS(LDK(KP707106781), T2S, T2R);
+					T2T = VFMA(LDK(KP707106781), T2S, T2R);
+				   }
+			      }
+			 }
+			 {
+			      V T3u, T3M, T3F, T3P, T3x, T3G, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
+			      {
+				   V T3d, T3n, T27, T3e, T2U, T3f;
+				   T3d = VFMA(LDK(KP923879532), T3c, T39);
+				   T3n = VFNMS(LDK(KP923879532), T3c, T39);
+				   T27 = VFNMS(LDK(KP198912367), T26, T1T);
+				   T3e = VFMA(LDK(KP198912367), T1T, T26);
+				   T2U = VFNMS(LDK(KP198912367), T2T, T2G);
+				   T3f = VFMA(LDK(KP198912367), T2G, T2T);
+				   T3u = VFMA(LDK(KP923879532), T3t, T3s);
+				   T3M = VFNMS(LDK(KP923879532), T3t, T3s);
+				   {
+					V T3g, T3l, T2V, T3o;
+					T3g = VSUB(T3e, T3f);
+					T3l = VADD(T3e, T3f);
+					T2V = VADD(T27, T2U);
+					T3o = VSUB(T27, T2U);
+					T3F = VFNMS(LDK(KP923879532), T3E, T3D);
+					T3P = VFMA(LDK(KP923879532), T3E, T3D);
+					T3x = VFMA(LDK(KP668178637), T3w, T3v);
+					T3G = VFNMS(LDK(KP668178637), T3v, T3w);
+					T3q = VFMA(LDK(KP980785280), T3l, T3k);
+					T3m = VFNMS(LDK(KP980785280), T3l, T3k);
+					T3h = VFNMS(LDK(KP980785280), T3g, T3d);
+					T3j = VFMA(LDK(KP980785280), T3g, T3d);
+					T3r = VFNMS(LDK(KP980785280), T3o, T3n);
+					T3p = VFMA(LDK(KP980785280), T3o, T3n);
+					T2W = VFNMS(LDK(KP980785280), T2V, T1k);
+					T3i = VFMA(LDK(KP980785280), T2V, T1k);
+				   }
+			      }
+			      {
+				   V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
+				   V T7x;
+				   {
+					V T7c, T7W, T7D, T87, T7f, T7E, T3A, T3H, T7F, T7i;
+					T7c = VFNMS(LDK(KP923879532), T7b, T7a);
+					T7W = VFMA(LDK(KP923879532), T7b, T7a);
+					T7D = VFMA(LDK(KP923879532), T7C, T7B);
+					T87 = VFNMS(LDK(KP923879532), T7C, T7B);
+					T7f = VFNMS(LDK(KP668178637), T7e, T7d);
+					T7E = VFMA(LDK(KP668178637), T7d, T7e);
+					ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
+					T3A = VFMA(LDK(KP668178637), T3z, T3y);
+					T3H = VFNMS(LDK(KP668178637), T3y, T3z);
+					T7F = VFMA(LDK(KP668178637), T7g, T7h);
+					T7i = VFNMS(LDK(KP668178637), T7h, T7g);
+					T7n = VFNMS(LDK(KP923879532), T7m, T7l);
+					T7Z = VFMA(LDK(KP923879532), T7m, T7l);
+					{
+					     V T3I, T3N, T3B, T3Q;
+					     T3I = VSUB(T3G, T3H);
+					     T3N = VADD(T3G, T3H);
+					     T3B = VADD(T3x, T3A);
+					     T3Q = VSUB(T3x, T3A);
+					     {
+						  V T7j, T88, T7G, T7X;
+						  T7j = VADD(T7f, T7i);
+						  T88 = VSUB(T7f, T7i);
+						  T7G = VSUB(T7E, T7F);
+						  T7X = VADD(T7E, T7F);
+						  {
+						       V T3S, T3O, T3J, T3L;
+						       T3S = VFNMS(LDK(KP831469612), T3N, T3M);
+						       T3O = VFMA(LDK(KP831469612), T3N, T3M);
+						       T3J = VFNMS(LDK(KP831469612), T3I, T3F);
+						       T3L = VFMA(LDK(KP831469612), T3I, T3F);
+						       {
+							    V T3T, T3R, T3C, T3K;
+							    T3T = VFMA(LDK(KP831469612), T3Q, T3P);
+							    T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
+							    T3C = VFNMS(LDK(KP831469612), T3B, T3u);
+							    T3K = VFMA(LDK(KP831469612), T3B, T3u);
+							    T8j = VFNMS(LDK(KP831469612), T88, T87);
+							    T89 = VFMA(LDK(KP831469612), T88, T87);
+							    T7k = VFNMS(LDK(KP831469612), T7j, T7c);
+							    T7O = VFMA(LDK(KP831469612), T7j, T7c);
+							    T8g = VFNMS(LDK(KP831469612), T7X, T7W);
+							    T7Y = VFMA(LDK(KP831469612), T7X, T7W);
+							    T7H = VFMA(LDK(KP831469612), T7G, T7D);
+							    T7R = VFNMS(LDK(KP831469612), T7G, T7D);
+							    ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
+							    ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
+							    T80 = VFNMS(LDK(KP923879532), T7p, T7o);
+							    T7q = VFMA(LDK(KP923879532), T7p, T7o);
+						       }
+						  }
+					     }
+					}
+					T7u = VFNMS(LDK(KP923879532), T7t, T7s);
+					T82 = VFMA(LDK(KP923879532), T7t, T7s);
+					T83 = VFNMS(LDK(KP923879532), T7w, T7v);
+					T7x = VFMA(LDK(KP923879532), T7w, T7v);
+				   }
+				   {
+					V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
+					T5g = VFMA(LDK(KP923879532), T5f, T58);
+					T6I = VFNMS(LDK(KP923879532), T5f, T58);
+					{
+					     V T7r, T7I, T7y, T7J;
+					     T7r = VFNMS(LDK(KP534511135), T7q, T7n);
+					     T7I = VFMA(LDK(KP534511135), T7n, T7q);
+					     T7y = VFNMS(LDK(KP534511135), T7x, T7u);
+					     T7J = VFMA(LDK(KP534511135), T7u, T7x);
+					     {
+						  V T81, T8a, T84, T8b;
+						  T81 = VFMA(LDK(KP303346683), T80, T7Z);
+						  T8a = VFNMS(LDK(KP303346683), T7Z, T80);
+						  T84 = VFMA(LDK(KP303346683), T83, T82);
+						  T8b = VFNMS(LDK(KP303346683), T82, T83);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6l);
+						  T6T = VFNMS(LDK(KP923879532), T6o, T6l);
+						  T5p = VFNMS(LDK(KP198912367), T5o, T5l);
+						  T6q = VFMA(LDK(KP198912367), T5l, T5o);
+						  {
+						       V T7K, T7P, T7z, T7S;
+						       T7K = VSUB(T7I, T7J);
+						       T7P = VADD(T7I, T7J);
+						       T7z = VADD(T7r, T7y);
+						       T7S = VSUB(T7r, T7y);
+						       {
+							    V T8c, T8h, T85, T8k;
+							    T8c = VSUB(T8a, T8b);
+							    T8h = VADD(T8a, T8b);
+							    T85 = VADD(T81, T84);
+							    T8k = VSUB(T81, T84);
+							    {
+								 V T7Q, T7U, T7L, T7N;
+								 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
+								 T7U = VFMA(LDK(KP881921264), T7P, T7O);
+								 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
+								 T7N = VFMA(LDK(KP881921264), T7K, T7H);
+								 {
+								      V T7T, T7V, T7A, T7M;
+								      T7T = VFMA(LDK(KP881921264), T7S, T7R);
+								      T7V = VFNMS(LDK(KP881921264), T7S, T7R);
+								      T7A = VFNMS(LDK(KP881921264), T7z, T7k);
+								      T7M = VFMA(LDK(KP881921264), T7z, T7k);
+								      {
+									   V T8i, T8m, T8d, T8f;
+									   T8i = VFMA(LDK(KP956940335), T8h, T8g);
+									   T8m = VFNMS(LDK(KP956940335), T8h, T8g);
+									   T8d = VFNMS(LDK(KP956940335), T8c, T89);
+									   T8f = VFMA(LDK(KP956940335), T8c, T89);
+									   {
+										V T8l, T8n, T86, T8e;
+										T8l = VFNMS(LDK(KP956940335), T8k, T8j);
+										T8n = VFMA(LDK(KP956940335), T8k, T8j);
+										T86 = VFNMS(LDK(KP956940335), T85, T7Y);
+										T8e = VFMA(LDK(KP956940335), T85, T7Y);
+										ST(&(x[WS(rs, 53)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 11)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 43)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 21)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 5)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 59)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 37)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 27)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 51)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 13)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 45)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 19)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 61)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 3)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 29)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 35)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										T6r = VFMA(LDK(KP198912367), T5u, T5x);
+										T5y = VFNMS(LDK(KP198912367), T5x, T5u);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V T5N, T5U, T68, T5z, T6U, T6f;
+					     T5N = VFMA(LDK(KP923879532), T5M, T5F);
+					     T6L = VFNMS(LDK(KP923879532), T5M, T5F);
+					     T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
+					     T5U = VFMA(LDK(KP923879532), T5T, T5Q);
+					     T68 = VFMA(LDK(KP923879532), T67, T60);
+					     T6O = VFNMS(LDK(KP923879532), T67, T60);
+					     T5z = VADD(T5p, T5y);
+					     T6U = VSUB(T5p, T5y);
+					     T6P = VFNMS(LDK(KP923879532), T6e, T6b);
+					     T6f = VFMA(LDK(KP923879532), T6e, T6b);
+					     {
+						  V T5V, T6u, T6g, T6v, T6s, T6J;
+						  T6s = VSUB(T6q, T6r);
+						  T6J = VADD(T6q, T6r);
+						  T5V = VFNMS(LDK(KP098491403), T5U, T5N);
+						  T6u = VFMA(LDK(KP098491403), T5N, T5U);
+						  T75 = VFMA(LDK(KP980785280), T6U, T6T);
+						  T6V = VFNMS(LDK(KP980785280), T6U, T6T);
+						  T5A = VFMA(LDK(KP980785280), T5z, T5g);
+						  T6A = VFNMS(LDK(KP980785280), T5z, T5g);
+						  T6g = VFNMS(LDK(KP098491403), T6f, T68);
+						  T6v = VFMA(LDK(KP098491403), T68, T6f);
+						  T72 = VFNMS(LDK(KP980785280), T6J, T6I);
+						  T6K = VFMA(LDK(KP980785280), T6J, T6I);
+						  T6t = VFMA(LDK(KP980785280), T6s, T6p);
+						  T6D = VFNMS(LDK(KP980785280), T6s, T6p);
+						  T6w = VSUB(T6u, T6v);
+						  T6B = VADD(T6u, T6v);
+						  T6h = VADD(T5V, T6g);
+						  T6E = VSUB(T5V, T6g);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
+		    T6W = VFNMS(LDK(KP820678790), T6L, T6M);
+		    T6N = VFMA(LDK(KP820678790), T6M, T6L);
+		    T6G = VFMA(LDK(KP995184726), T6B, T6A);
+		    T6C = VFNMS(LDK(KP995184726), T6B, T6A);
+		    T6z = VFMA(LDK(KP995184726), T6w, T6t);
+		    T6x = VFNMS(LDK(KP995184726), T6w, T6t);
+		    T6H = VFNMS(LDK(KP995184726), T6E, T6D);
+		    T6F = VFMA(LDK(KP995184726), T6E, T6D);
+		    T6y = VFMA(LDK(KP995184726), T6h, T5A);
+		    T6i = VFNMS(LDK(KP995184726), T6h, T5A);
+		    T6X = VFNMS(LDK(KP820678790), T6O, T6P);
+		    T6Q = VFMA(LDK(KP820678790), T6P, T6O);
+		    {
+			 V T73, T6Y, T76, T6R;
+			 ST(&(x[WS(rs, 49)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 63)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 33)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 T73 = VADD(T6W, T6X);
+			 T6Y = VSUB(T6W, T6X);
+			 T76 = VSUB(T6N, T6Q);
+			 T6R = VADD(T6N, T6Q);
+			 {
+			      V T78, T74, T71, T6Z, T79, T77, T70, T6S;
+			      T78 = VFNMS(LDK(KP773010453), T73, T72);
+			      T74 = VFMA(LDK(KP773010453), T73, T72);
+			      T71 = VFMA(LDK(KP773010453), T6Y, T6V);
+			      T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
+			      T79 = VFMA(LDK(KP773010453), T76, T75);
+			      T77 = VFNMS(LDK(KP773010453), T76, T75);
+			      T70 = VFMA(LDK(KP773010453), T6R, T6K);
+			      T6S = VFNMS(LDK(KP773010453), T6R, T6K);
+			      ST(&(x[WS(rs, 55)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 41)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 23)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 57)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 25)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 39)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t1bv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_64) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1bv_64 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 519 FP additions, 250 FP multiplications,
+ * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
+ * 107 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tg, T4B, T6v, T7G, T3r, T4w, T5q, T7F, T5Y, T62, T28, T4d, T2g, T4a, T7g;
+	       V T7Y, T6f, T6j, T2Z, T4k, T37, T4h, T7n, T81, T7w, T7x, T7y, T5M, T6q, T1k;
+	       V T4s, T1r, T4t, T7t, T7u, T7v, T5F, T6p, TV, T4p, T12, T4q, T7A, T7B, TD;
+	       V T4x, T3k, T4C, T5x, T6s, T1R, T4b, T7j, T7Z, T2j, T4e, T5V, T63, T2I, T4i;
+	       V T7q, T82, T3a, T4l, T6c, T6k;
+	       {
+		    V T1, T3, T3p, T3n, Tb, Td, Te, T6, T8, T9, T2, T3o, T3m;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 62]), T2);
+		    T3o = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+		    T3p = BYTW(&(W[TWVL * 94]), T3o);
+		    T3m = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3n = BYTW(&(W[TWVL * 30]), T3m);
+		    {
+			 V Ta, Tc, T5, T7;
+			 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 110]), Ta);
+			 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 46]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 14]), T5);
+			 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 78]), T7);
+			 T9 = VSUB(T6, T8);
+		    }
+		    {
+			 V T4, Tf, T6t, T6u;
+			 T4 = VSUB(T1, T3);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VSUB(T4, Tf);
+			 T4B = VADD(T4, Tf);
+			 T6t = VADD(T6, T8);
+			 T6u = VADD(Tb, Td);
+			 T6v = VSUB(T6t, T6u);
+			 T7G = VADD(T6t, T6u);
+		    }
+		    {
+			 V T3l, T3q, T5o, T5p;
+			 T3l = VMUL(LDK(KP707106781), VSUB(T9, Te));
+			 T3q = VSUB(T3n, T3p);
+			 T3r = VSUB(T3l, T3q);
+			 T4w = VADD(T3q, T3l);
+			 T5o = VADD(T1, T3);
+			 T5p = VADD(T3n, T3p);
+			 T5q = VSUB(T5o, T5p);
+			 T7F = VADD(T5o, T5p);
+		    }
+	       }
+	       {
+		    V T24, T26, T61, T2b, T2d, T60, T1W, T5W, T21, T5X, T22, T27;
+		    {
+			 V T23, T25, T2a, T2c;
+			 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T24 = BYTW(&(W[TWVL * 32]), T23);
+			 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+			 T26 = BYTW(&(W[TWVL * 96]), T25);
+			 T61 = VADD(T24, T26);
+			 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2b = BYTW(&(W[0]), T2a);
+			 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+			 T2d = BYTW(&(W[TWVL * 64]), T2c);
+			 T60 = VADD(T2b, T2d);
+		    }
+		    {
+			 V T1T, T1V, T1S, T1U;
+			 T1S = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T1T = BYTW(&(W[TWVL * 16]), T1S);
+			 T1U = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+			 T1V = BYTW(&(W[TWVL * 80]), T1U);
+			 T1W = VSUB(T1T, T1V);
+			 T5W = VADD(T1T, T1V);
+		    }
+		    {
+			 V T1Y, T20, T1X, T1Z;
+			 T1X = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+			 T1Y = BYTW(&(W[TWVL * 112]), T1X);
+			 T1Z = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T20 = BYTW(&(W[TWVL * 48]), T1Z);
+			 T21 = VSUB(T1Y, T20);
+			 T5X = VADD(T1Y, T20);
+		    }
+		    T5Y = VSUB(T5W, T5X);
+		    T62 = VSUB(T60, T61);
+		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
+		    T27 = VSUB(T24, T26);
+		    T28 = VSUB(T22, T27);
+		    T4d = VADD(T27, T22);
+		    {
+			 V T2e, T2f, T7e, T7f;
+			 T2e = VSUB(T2b, T2d);
+			 T2f = VMUL(LDK(KP707106781), VADD(T1W, T21));
+			 T2g = VSUB(T2e, T2f);
+			 T4a = VADD(T2e, T2f);
+			 T7e = VADD(T60, T61);
+			 T7f = VADD(T5W, T5X);
+			 T7g = VSUB(T7e, T7f);
+			 T7Y = VADD(T7e, T7f);
+		    }
+	       }
+	       {
+		    V T2V, T2X, T6i, T32, T34, T6h, T2N, T6d, T2S, T6e, T2T, T2Y;
+		    {
+			 V T2U, T2W, T31, T33;
+			 T2U = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T2V = BYTW(&(W[TWVL * 28]), T2U);
+			 T2W = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+			 T2X = BYTW(&(W[TWVL * 92]), T2W);
+			 T6i = VADD(T2V, T2X);
+			 T31 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+			 T32 = BYTW(&(W[TWVL * 124]), T31);
+			 T33 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T34 = BYTW(&(W[TWVL * 60]), T33);
+			 T6h = VADD(T32, T34);
+		    }
+		    {
+			 V T2K, T2M, T2J, T2L;
+			 T2J = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T2K = BYTW(&(W[TWVL * 12]), T2J);
+			 T2L = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+			 T2M = BYTW(&(W[TWVL * 76]), T2L);
+			 T2N = VSUB(T2K, T2M);
+			 T6d = VADD(T2K, T2M);
+		    }
+		    {
+			 V T2P, T2R, T2O, T2Q;
+			 T2O = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+			 T2P = BYTW(&(W[TWVL * 108]), T2O);
+			 T2Q = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T2R = BYTW(&(W[TWVL * 44]), T2Q);
+			 T2S = VSUB(T2P, T2R);
+			 T6e = VADD(T2P, T2R);
+		    }
+		    T6f = VSUB(T6d, T6e);
+		    T6j = VSUB(T6h, T6i);
+		    T2T = VMUL(LDK(KP707106781), VSUB(T2N, T2S));
+		    T2Y = VSUB(T2V, T2X);
+		    T2Z = VSUB(T2T, T2Y);
+		    T4k = VADD(T2Y, T2T);
+		    {
+			 V T35, T36, T7l, T7m;
+			 T35 = VSUB(T32, T34);
+			 T36 = VMUL(LDK(KP707106781), VADD(T2N, T2S));
+			 T37 = VSUB(T35, T36);
+			 T4h = VADD(T35, T36);
+			 T7l = VADD(T6h, T6i);
+			 T7m = VADD(T6d, T6e);
+			 T7n = VSUB(T7l, T7m);
+			 T81 = VADD(T7l, T7m);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T5K, T1m, T1o, T5J, T18, T5G, T1d, T5H, T5I, T5L;
+		    {
+			 V T1f, T1h, T1l, T1n;
+			 T1f = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T1g = BYTW(&(W[TWVL * 26]), T1f);
+			 T1h = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+			 T1i = BYTW(&(W[TWVL * 90]), T1h);
+			 T5K = VADD(T1g, T1i);
+			 T1l = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+			 T1m = BYTW(&(W[TWVL * 122]), T1l);
+			 T1n = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 T1o = BYTW(&(W[TWVL * 58]), T1n);
+			 T5J = VADD(T1m, T1o);
+		    }
+		    {
+			 V T15, T17, T14, T16;
+			 T14 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 T15 = BYTW(&(W[TWVL * 10]), T14);
+			 T16 = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+			 T17 = BYTW(&(W[TWVL * 74]), T16);
+			 T18 = VSUB(T15, T17);
+			 T5G = VADD(T15, T17);
+		    }
+		    {
+			 V T1a, T1c, T19, T1b;
+			 T19 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+			 T1a = BYTW(&(W[TWVL * 106]), T19);
+			 T1b = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 T1c = BYTW(&(W[TWVL * 42]), T1b);
+			 T1d = VSUB(T1a, T1c);
+			 T5H = VADD(T1a, T1c);
+		    }
+		    T7w = VADD(T5J, T5K);
+		    T7x = VADD(T5G, T5H);
+		    T7y = VSUB(T7w, T7x);
+		    T5I = VSUB(T5G, T5H);
+		    T5L = VSUB(T5J, T5K);
+		    T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
+		    T6q = VFMA(LDK(KP923879532), T5L, VMUL(LDK(KP382683432), T5I));
+		    {
+			 V T1e, T1j, T1p, T1q;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T4s = VADD(T1j, T1e);
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
+			 T1r = VSUB(T1p, T1q);
+			 T4t = VADD(T1p, T1q);
+		    }
+	       }
+	       {
+		    V TR, TT, T5A, TX, TZ, T5z, TJ, T5C, TO, T5D, T5B, T5E;
+		    {
+			 V TQ, TS, TW, TY;
+			 TQ = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 TR = BYTW(&(W[TWVL * 34]), TQ);
+			 TS = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+			 TT = BYTW(&(W[TWVL * 98]), TS);
+			 T5A = VADD(TR, TT);
+			 TW = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 TX = BYTW(&(W[TWVL * 2]), TW);
+			 TY = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+			 TZ = BYTW(&(W[TWVL * 66]), TY);
+			 T5z = VADD(TX, TZ);
+		    }
+		    {
+			 V TG, TI, TF, TH;
+			 TF = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 TG = BYTW(&(W[TWVL * 18]), TF);
+			 TH = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+			 TI = BYTW(&(W[TWVL * 82]), TH);
+			 TJ = VSUB(TG, TI);
+			 T5C = VADD(TG, TI);
+		    }
+		    {
+			 V TL, TN, TK, TM;
+			 TK = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+			 TL = BYTW(&(W[TWVL * 114]), TK);
+			 TM = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 TN = BYTW(&(W[TWVL * 50]), TM);
+			 TO = VSUB(TL, TN);
+			 T5D = VADD(TL, TN);
+		    }
+		    T7t = VADD(T5z, T5A);
+		    T7u = VADD(T5C, T5D);
+		    T7v = VSUB(T7t, T7u);
+		    T5B = VSUB(T5z, T5A);
+		    T5E = VSUB(T5C, T5D);
+		    T5F = VFMA(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
+		    T6p = VFNMS(LDK(KP382683432), T5E, VMUL(LDK(KP923879532), T5B));
+		    {
+			 V TP, TU, T10, T11;
+			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
+			 TU = VSUB(TR, TT);
+			 TV = VSUB(TP, TU);
+			 T4p = VADD(TU, TP);
+			 T10 = VSUB(TX, TZ);
+			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
+			 T12 = VSUB(T10, T11);
+			 T4q = VADD(T10, T11);
+		    }
+	       }
+	       {
+		    V Tl, T5r, TB, T5u, Tq, T5s, Tw, T5v, Tr, TC;
+		    {
+			 V Ti, Tk, Th, Tj;
+			 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 6]), Th);
+			 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+			 Tk = BYTW(&(W[TWVL * 70]), Tj);
+			 Tl = VSUB(Ti, Tk);
+			 T5r = VADD(Ti, Tk);
+		    }
+		    {
+			 V Ty, TA, Tx, Tz;
+			 Tx = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+			 Ty = BYTW(&(W[TWVL * 118]), Tx);
+			 Tz = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 TA = BYTW(&(W[TWVL * 54]), Tz);
+			 TB = VSUB(Ty, TA);
+			 T5u = VADD(Ty, TA);
+		    }
+		    {
+			 V Tn, Tp, Tm, To;
+			 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 Tn = BYTW(&(W[TWVL * 38]), Tm);
+			 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 102]), To);
+			 Tq = VSUB(Tn, Tp);
+			 T5s = VADD(Tn, Tp);
+		    }
+		    {
+			 V Tt, Tv, Ts, Tu;
+			 Ts = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 22]), Ts);
+			 Tu = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+			 Tv = BYTW(&(W[TWVL * 86]), Tu);
+			 Tw = VSUB(Tt, Tv);
+			 T5v = VADD(Tt, Tv);
+		    }
+		    T7A = VADD(T5r, T5s);
+		    T7B = VADD(T5u, T5v);
+		    Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+		    TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
+		    TD = VSUB(Tr, TC);
+		    T4x = VADD(Tr, TC);
+		    {
+			 V T3i, T3j, T5t, T5w;
+			 T3i = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T3j = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T3k = VSUB(T3i, T3j);
+			 T4C = VADD(T3i, T3j);
+			 T5t = VSUB(T5r, T5s);
+			 T5w = VSUB(T5u, T5v);
+			 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
+			 T6s = VMUL(LDK(KP707106781), VSUB(T5t, T5w));
+		    }
+	       }
+	       {
+		    V T1z, T5P, T1P, T5T, T1E, T5Q, T1K, T5S;
+		    {
+			 V T1w, T1y, T1v, T1x;
+			 T1v = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1w = BYTW(&(W[TWVL * 8]), T1v);
+			 T1x = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+			 T1y = BYTW(&(W[TWVL * 72]), T1x);
+			 T1z = VSUB(T1w, T1y);
+			 T5P = VADD(T1w, T1y);
+		    }
+		    {
+			 V T1M, T1O, T1L, T1N;
+			 T1L = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1M = BYTW(&(W[TWVL * 24]), T1L);
+			 T1N = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+			 T1O = BYTW(&(W[TWVL * 88]), T1N);
+			 T1P = VSUB(T1M, T1O);
+			 T5T = VADD(T1M, T1O);
+		    }
+		    {
+			 V T1B, T1D, T1A, T1C;
+			 T1A = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 T1B = BYTW(&(W[TWVL * 40]), T1A);
+			 T1C = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+			 T1D = BYTW(&(W[TWVL * 104]), T1C);
+			 T1E = VSUB(T1B, T1D);
+			 T5Q = VADD(T1B, T1D);
+		    }
+		    {
+			 V T1H, T1J, T1G, T1I;
+			 T1G = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+			 T1H = BYTW(&(W[TWVL * 120]), T1G);
+			 T1I = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			 T1J = BYTW(&(W[TWVL * 56]), T1I);
+			 T1K = VSUB(T1H, T1J);
+			 T5S = VADD(T1H, T1J);
+		    }
+		    {
+			 V T1F, T1Q, T7h, T7i;
+			 T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1z));
+			 T1Q = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1P));
+			 T1R = VSUB(T1F, T1Q);
+			 T4b = VADD(T1F, T1Q);
+			 T7h = VADD(T5P, T5Q);
+			 T7i = VADD(T5S, T5T);
+			 T7j = VSUB(T7h, T7i);
+			 T7Z = VADD(T7h, T7i);
+		    }
+		    {
+			 V T2h, T2i, T5R, T5U;
+			 T2h = VFMA(LDK(KP382683432), T1z, VMUL(LDK(KP923879532), T1E));
+			 T2i = VFNMS(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
+			 T2j = VSUB(T2h, T2i);
+			 T4e = VADD(T2h, T2i);
+			 T5R = VSUB(T5P, T5Q);
+			 T5U = VSUB(T5S, T5T);
+			 T5V = VMUL(LDK(KP707106781), VSUB(T5R, T5U));
+			 T63 = VMUL(LDK(KP707106781), VADD(T5R, T5U));
+		    }
+	       }
+	       {
+		    V T2q, T66, T2G, T6a, T2v, T67, T2B, T69;
+		    {
+			 V T2n, T2p, T2m, T2o;
+			 T2m = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T2n = BYTW(&(W[TWVL * 4]), T2m);
+			 T2o = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+			 T2p = BYTW(&(W[TWVL * 68]), T2o);
+			 T2q = VSUB(T2n, T2p);
+			 T66 = VADD(T2n, T2p);
+		    }
+		    {
+			 V T2D, T2F, T2C, T2E;
+			 T2C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T2D = BYTW(&(W[TWVL * 20]), T2C);
+			 T2E = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+			 T2F = BYTW(&(W[TWVL * 84]), T2E);
+			 T2G = VSUB(T2D, T2F);
+			 T6a = VADD(T2D, T2F);
+		    }
+		    {
+			 V T2s, T2u, T2r, T2t;
+			 T2r = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T2s = BYTW(&(W[TWVL * 36]), T2r);
+			 T2t = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+			 T2u = BYTW(&(W[TWVL * 100]), T2t);
+			 T2v = VSUB(T2s, T2u);
+			 T67 = VADD(T2s, T2u);
+		    }
+		    {
+			 V T2y, T2A, T2x, T2z;
+			 T2x = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+			 T2y = BYTW(&(W[TWVL * 116]), T2x);
+			 T2z = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			 T2A = BYTW(&(W[TWVL * 52]), T2z);
+			 T2B = VSUB(T2y, T2A);
+			 T69 = VADD(T2y, T2A);
+		    }
+		    {
+			 V T2w, T2H, T7o, T7p;
+			 T2w = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2q));
+			 T2H = VFMA(LDK(KP923879532), T2B, VMUL(LDK(KP382683432), T2G));
+			 T2I = VSUB(T2w, T2H);
+			 T4i = VADD(T2w, T2H);
+			 T7o = VADD(T66, T67);
+			 T7p = VADD(T69, T6a);
+			 T7q = VSUB(T7o, T7p);
+			 T82 = VADD(T7o, T7p);
+		    }
+		    {
+			 V T38, T39, T68, T6b;
+			 T38 = VFMA(LDK(KP382683432), T2q, VMUL(LDK(KP923879532), T2v));
+			 T39 = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2G));
+			 T3a = VSUB(T38, T39);
+			 T4l = VADD(T38, T39);
+			 T68 = VSUB(T66, T67);
+			 T6b = VSUB(T69, T6a);
+			 T6c = VMUL(LDK(KP707106781), VSUB(T68, T6b));
+			 T6k = VMUL(LDK(KP707106781), VADD(T68, T6b));
+		    }
+	       }
+	       {
+		    V T7s, T7R, T7M, T7U, T7D, T7T, T7J, T7Q;
+		    {
+			 V T7k, T7r, T7K, T7L;
+			 T7k = VFNMS(LDK(KP382683432), T7j, VMUL(LDK(KP923879532), T7g));
+			 T7r = VFMA(LDK(KP923879532), T7n, VMUL(LDK(KP382683432), T7q));
+			 T7s = VSUB(T7k, T7r);
+			 T7R = VADD(T7k, T7r);
+			 T7K = VFMA(LDK(KP382683432), T7g, VMUL(LDK(KP923879532), T7j));
+			 T7L = VFNMS(LDK(KP382683432), T7n, VMUL(LDK(KP923879532), T7q));
+			 T7M = VSUB(T7K, T7L);
+			 T7U = VADD(T7K, T7L);
+		    }
+		    {
+			 V T7z, T7C, T7H, T7I;
+			 T7z = VMUL(LDK(KP707106781), VSUB(T7v, T7y));
+			 T7C = VSUB(T7A, T7B);
+			 T7D = VSUB(T7z, T7C);
+			 T7T = VADD(T7C, T7z);
+			 T7H = VSUB(T7F, T7G);
+			 T7I = VMUL(LDK(KP707106781), VADD(T7v, T7y));
+			 T7J = VSUB(T7H, T7I);
+			 T7Q = VADD(T7H, T7I);
+		    }
+		    {
+			 V T7E, T7N, T7W, T7X;
+			 T7E = VBYI(VSUB(T7s, T7D));
+			 T7N = VSUB(T7J, T7M);
+			 ST(&(x[WS(rs, 20)]), VADD(T7E, T7N), ms, &(x[0]));
+			 ST(&(x[WS(rs, 44)]), VSUB(T7N, T7E), ms, &(x[0]));
+			 T7W = VSUB(T7Q, T7R);
+			 T7X = VBYI(VSUB(T7U, T7T));
+			 ST(&(x[WS(rs, 36)]), VSUB(T7W, T7X), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VADD(T7W, T7X), ms, &(x[0]));
+		    }
+		    {
+			 V T7O, T7P, T7S, T7V;
+			 T7O = VBYI(VADD(T7D, T7s));
+			 T7P = VADD(T7J, T7M);
+			 ST(&(x[WS(rs, 12)]), VADD(T7O, T7P), ms, &(x[0]));
+			 ST(&(x[WS(rs, 52)]), VSUB(T7P, T7O), ms, &(x[0]));
+			 T7S = VADD(T7Q, T7R);
+			 T7V = VBYI(VADD(T7T, T7U));
+			 ST(&(x[WS(rs, 60)]), VSUB(T7S, T7V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T7S, T7V), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T84, T8c, T8l, T8n, T87, T8h, T8b, T8g, T8i, T8m;
+		    {
+			 V T80, T83, T8j, T8k;
+			 T80 = VSUB(T7Y, T7Z);
+			 T83 = VSUB(T81, T82);
+			 T84 = VMUL(LDK(KP707106781), VSUB(T80, T83));
+			 T8c = VMUL(LDK(KP707106781), VADD(T80, T83));
+			 T8j = VADD(T7Y, T7Z);
+			 T8k = VADD(T81, T82);
+			 T8l = VBYI(VSUB(T8j, T8k));
+			 T8n = VADD(T8j, T8k);
+		    }
+		    {
+			 V T85, T86, T89, T8a;
+			 T85 = VADD(T7t, T7u);
+			 T86 = VADD(T7w, T7x);
+			 T87 = VSUB(T85, T86);
+			 T8h = VADD(T85, T86);
+			 T89 = VADD(T7F, T7G);
+			 T8a = VADD(T7A, T7B);
+			 T8b = VSUB(T89, T8a);
+			 T8g = VADD(T89, T8a);
+		    }
+		    T8i = VSUB(T8g, T8h);
+		    ST(&(x[WS(rs, 48)]), VSUB(T8i, T8l), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T8i, T8l), ms, &(x[0]));
+		    T8m = VADD(T8g, T8h);
+		    ST(&(x[WS(rs, 32)]), VSUB(T8m, T8n), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T8m, T8n), ms, &(x[0]));
+		    {
+			 V T88, T8d, T8e, T8f;
+			 T88 = VBYI(VSUB(T84, T87));
+			 T8d = VSUB(T8b, T8c);
+			 ST(&(x[WS(rs, 24)]), VADD(T88, T8d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 40)]), VSUB(T8d, T88), ms, &(x[0]));
+			 T8e = VBYI(VADD(T87, T84));
+			 T8f = VADD(T8b, T8c);
+			 ST(&(x[WS(rs, 8)]), VADD(T8e, T8f), ms, &(x[0]));
+			 ST(&(x[WS(rs, 56)]), VSUB(T8f, T8e), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
+		    {
+			 V T5y, T5N, T6r, T6w;
+			 T5y = VSUB(T5q, T5x);
+			 T5N = VSUB(T5F, T5M);
+			 T5O = VSUB(T5y, T5N);
+			 T6H = VADD(T5y, T5N);
+			 T6r = VSUB(T6p, T6q);
+			 T6w = VSUB(T6s, T6v);
+			 T6x = VSUB(T6r, T6w);
+			 T6F = VADD(T6w, T6r);
+			 {
+			      V T65, T6y, T6m, T6z;
+			      {
+				   V T5Z, T64, T6g, T6l;
+				   T5Z = VSUB(T5V, T5Y);
+				   T64 = VSUB(T62, T63);
+				   T65 = VFMA(LDK(KP831469612), T5Z, VMUL(LDK(KP555570233), T64));
+				   T6y = VFNMS(LDK(KP555570233), T5Z, VMUL(LDK(KP831469612), T64));
+				   T6g = VSUB(T6c, T6f);
+				   T6l = VSUB(T6j, T6k);
+				   T6m = VFNMS(LDK(KP555570233), T6l, VMUL(LDK(KP831469612), T6g));
+				   T6z = VFMA(LDK(KP555570233), T6g, VMUL(LDK(KP831469612), T6l));
+			      }
+			      T6n = VSUB(T65, T6m);
+			      T6I = VADD(T6y, T6z);
+			      T6A = VSUB(T6y, T6z);
+			      T6E = VADD(T65, T6m);
+			 }
+		    }
+		    {
+			 V T6o, T6B, T6K, T6L;
+			 T6o = VADD(T5O, T6n);
+			 T6B = VBYI(VADD(T6x, T6A));
+			 ST(&(x[WS(rs, 54)]), VSUB(T6o, T6B), ms, &(x[0]));
+			 ST(&(x[WS(rs, 10)]), VADD(T6o, T6B), ms, &(x[0]));
+			 T6K = VBYI(VADD(T6F, T6E));
+			 T6L = VADD(T6H, T6I);
+			 ST(&(x[WS(rs, 6)]), VADD(T6K, T6L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 58)]), VSUB(T6L, T6K), ms, &(x[0]));
+		    }
+		    {
+			 V T6C, T6D, T6G, T6J;
+			 T6C = VSUB(T5O, T6n);
+			 T6D = VBYI(VSUB(T6A, T6x));
+			 ST(&(x[WS(rs, 42)]), VSUB(T6C, T6D), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VADD(T6C, T6D), ms, &(x[0]));
+			 T6G = VBYI(VSUB(T6E, T6F));
+			 T6J = VSUB(T6H, T6I);
+			 ST(&(x[WS(rs, 26)]), VADD(T6G, T6J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 38)]), VSUB(T6J, T6G), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
+		    {
+			 V T6M, T6N, T6X, T6Y;
+			 T6M = VADD(T5q, T5x);
+			 T6N = VADD(T6p, T6q);
+			 T6O = VSUB(T6M, T6N);
+			 T79 = VADD(T6M, T6N);
+			 T6X = VADD(T5F, T5M);
+			 T6Y = VADD(T6v, T6s);
+			 T6Z = VSUB(T6X, T6Y);
+			 T77 = VADD(T6Y, T6X);
+			 {
+			      V T6R, T70, T6U, T71;
+			      {
+				   V T6P, T6Q, T6S, T6T;
+				   T6P = VADD(T5Y, T5V);
+				   T6Q = VADD(T62, T63);
+				   T6R = VFMA(LDK(KP980785280), T6P, VMUL(LDK(KP195090322), T6Q));
+				   T70 = VFNMS(LDK(KP195090322), T6P, VMUL(LDK(KP980785280), T6Q));
+				   T6S = VADD(T6f, T6c);
+				   T6T = VADD(T6j, T6k);
+				   T6U = VFNMS(LDK(KP195090322), T6T, VMUL(LDK(KP980785280), T6S));
+				   T71 = VFMA(LDK(KP195090322), T6S, VMUL(LDK(KP980785280), T6T));
+			      }
+			      T6V = VSUB(T6R, T6U);
+			      T7a = VADD(T70, T71);
+			      T72 = VSUB(T70, T71);
+			      T76 = VADD(T6R, T6U);
+			 }
+		    }
+		    {
+			 V T6W, T73, T7c, T7d;
+			 T6W = VADD(T6O, T6V);
+			 T73 = VBYI(VADD(T6Z, T72));
+			 ST(&(x[WS(rs, 50)]), VSUB(T6W, T73), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T6W, T73), ms, &(x[0]));
+			 T7c = VBYI(VADD(T77, T76));
+			 T7d = VADD(T79, T7a);
+			 ST(&(x[WS(rs, 2)]), VADD(T7c, T7d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 62)]), VSUB(T7d, T7c), ms, &(x[0]));
+		    }
+		    {
+			 V T74, T75, T78, T7b;
+			 T74 = VSUB(T6O, T6V);
+			 T75 = VBYI(VSUB(T72, T6Z));
+			 ST(&(x[WS(rs, 46)]), VSUB(T74, T75), ms, &(x[0]));
+			 ST(&(x[WS(rs, 18)]), VADD(T74, T75), ms, &(x[0]));
+			 T78 = VBYI(VSUB(T76, T77));
+			 T7b = VSUB(T79, T7a);
+			 ST(&(x[WS(rs, 30)]), VADD(T78, T7b), ms, &(x[0]));
+			 ST(&(x[WS(rs, 34)]), VSUB(T7b, T78), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T4z, T5g, T4R, T59, T4H, T5j, T4O, T55, T4o, T4S, T4K, T4P, T52, T5k, T5c;
+		    V T5h;
+		    {
+			 V T4y, T57, T4v, T58, T4r, T4u;
+			 T4y = VADD(T4w, T4x);
+			 T57 = VSUB(T4B, T4C);
+			 T4r = VFMA(LDK(KP980785280), T4p, VMUL(LDK(KP195090322), T4q));
+			 T4u = VFNMS(LDK(KP195090322), T4t, VMUL(LDK(KP980785280), T4s));
+			 T4v = VADD(T4r, T4u);
+			 T58 = VSUB(T4r, T4u);
+			 T4z = VSUB(T4v, T4y);
+			 T5g = VADD(T57, T58);
+			 T4R = VADD(T4y, T4v);
+			 T59 = VSUB(T57, T58);
+		    }
+		    {
+			 V T4D, T54, T4G, T53, T4E, T4F;
+			 T4D = VADD(T4B, T4C);
+			 T54 = VSUB(T4x, T4w);
+			 T4E = VFNMS(LDK(KP195090322), T4p, VMUL(LDK(KP980785280), T4q));
+			 T4F = VFMA(LDK(KP195090322), T4s, VMUL(LDK(KP980785280), T4t));
+			 T4G = VADD(T4E, T4F);
+			 T53 = VSUB(T4E, T4F);
+			 T4H = VSUB(T4D, T4G);
+			 T5j = VADD(T54, T53);
+			 T4O = VADD(T4D, T4G);
+			 T55 = VSUB(T53, T54);
+		    }
+		    {
+			 V T4g, T4I, T4n, T4J;
+			 {
+			      V T4c, T4f, T4j, T4m;
+			      T4c = VADD(T4a, T4b);
+			      T4f = VADD(T4d, T4e);
+			      T4g = VFNMS(LDK(KP098017140), T4f, VMUL(LDK(KP995184726), T4c));
+			      T4I = VFMA(LDK(KP098017140), T4c, VMUL(LDK(KP995184726), T4f));
+			      T4j = VADD(T4h, T4i);
+			      T4m = VADD(T4k, T4l);
+			      T4n = VFMA(LDK(KP995184726), T4j, VMUL(LDK(KP098017140), T4m));
+			      T4J = VFNMS(LDK(KP098017140), T4j, VMUL(LDK(KP995184726), T4m));
+			 }
+			 T4o = VSUB(T4g, T4n);
+			 T4S = VADD(T4I, T4J);
+			 T4K = VSUB(T4I, T4J);
+			 T4P = VADD(T4g, T4n);
+		    }
+		    {
+			 V T4Y, T5a, T51, T5b;
+			 {
+			      V T4W, T4X, T4Z, T50;
+			      T4W = VSUB(T4a, T4b);
+			      T4X = VSUB(T4e, T4d);
+			      T4Y = VFNMS(LDK(KP634393284), T4X, VMUL(LDK(KP773010453), T4W));
+			      T5a = VFMA(LDK(KP634393284), T4W, VMUL(LDK(KP773010453), T4X));
+			      T4Z = VSUB(T4h, T4i);
+			      T50 = VSUB(T4l, T4k);
+			      T51 = VFMA(LDK(KP773010453), T4Z, VMUL(LDK(KP634393284), T50));
+			      T5b = VFNMS(LDK(KP634393284), T4Z, VMUL(LDK(KP773010453), T50));
+			 }
+			 T52 = VSUB(T4Y, T51);
+			 T5k = VADD(T5a, T5b);
+			 T5c = VSUB(T5a, T5b);
+			 T5h = VADD(T4Y, T51);
+		    }
+		    {
+			 V T4A, T4L, T5i, T5l;
+			 T4A = VBYI(VSUB(T4o, T4z));
+			 T4L = VSUB(T4H, T4K);
+			 ST(&(x[WS(rs, 17)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VSUB(T4L, T4A), ms, &(x[WS(rs, 1)]));
+			 T5i = VADD(T5g, T5h);
+			 T5l = VBYI(VADD(T5j, T5k));
+			 ST(&(x[WS(rs, 57)]), VSUB(T5i, T5l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5m, T5n, T4M, T4N;
+			 T5m = VSUB(T5g, T5h);
+			 T5n = VBYI(VSUB(T5k, T5j));
+			 ST(&(x[WS(rs, 39)]), VSUB(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 T4M = VBYI(VADD(T4z, T4o));
+			 T4N = VADD(T4H, T4K);
+			 ST(&(x[WS(rs, 15)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 49)]), VSUB(T4N, T4M), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T4Q, T4T, T56, T5d;
+			 T4Q = VADD(T4O, T4P);
+			 T4T = VBYI(VADD(T4R, T4S));
+			 ST(&(x[WS(rs, 63)]), VSUB(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 T56 = VBYI(VSUB(T52, T55));
+			 T5d = VSUB(T59, T5c);
+			 ST(&(x[WS(rs, 23)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 41)]), VSUB(T5d, T56), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5e, T5f, T4U, T4V;
+			 T5e = VBYI(VADD(T55, T52));
+			 T5f = VADD(T59, T5c);
+			 ST(&(x[WS(rs, 9)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 55)]), VSUB(T5f, T5e), ms, &(x[WS(rs, 1)]));
+			 T4U = VSUB(T4O, T4P);
+			 T4V = VBYI(VSUB(T4S, T4R));
+			 ST(&(x[WS(rs, 33)]), VSUB(T4U, T4V), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
+		    V T42;
+		    {
+			 V TE, T3U, T1t, T3T, T13, T1s;
+			 TE = VSUB(Tg, TD);
+			 T3U = VADD(T3r, T3k);
+			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
+			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T3T = VADD(T13, T1s);
+			 T1u = VSUB(TE, T1t);
+			 T43 = VADD(T3U, T3T);
+			 T3D = VADD(TE, T1t);
+			 T3V = VSUB(T3T, T3U);
+		    }
+		    {
+			 V T3s, T3I, T3h, T3J, T3f, T3g;
+			 T3s = VSUB(T3k, T3r);
+			 T3I = VADD(Tg, TD);
+			 T3f = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
+			 T3g = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
+			 T3h = VSUB(T3f, T3g);
+			 T3J = VADD(T3f, T3g);
+			 T3t = VSUB(T3h, T3s);
+			 T45 = VADD(T3I, T3J);
+			 T3B = VADD(T3s, T3h);
+			 T3K = VSUB(T3I, T3J);
+		    }
+		    {
+			 V T2l, T3u, T3c, T3v;
+			 {
+			      V T29, T2k, T30, T3b;
+			      T29 = VSUB(T1R, T28);
+			      T2k = VSUB(T2g, T2j);
+			      T2l = VFMA(LDK(KP881921264), T29, VMUL(LDK(KP471396736), T2k));
+			      T3u = VFNMS(LDK(KP471396736), T29, VMUL(LDK(KP881921264), T2k));
+			      T30 = VSUB(T2I, T2Z);
+			      T3b = VSUB(T37, T3a);
+			      T3c = VFNMS(LDK(KP471396736), T3b, VMUL(LDK(KP881921264), T30));
+			      T3v = VFMA(LDK(KP471396736), T30, VMUL(LDK(KP881921264), T3b));
+			 }
+			 T3d = VSUB(T2l, T3c);
+			 T3E = VADD(T3u, T3v);
+			 T3w = VSUB(T3u, T3v);
+			 T3A = VADD(T2l, T3c);
+		    }
+		    {
+			 V T3N, T3W, T3Q, T3X;
+			 {
+			      V T3L, T3M, T3O, T3P;
+			      T3L = VADD(T28, T1R);
+			      T3M = VADD(T2g, T2j);
+			      T3N = VFMA(LDK(KP956940335), T3L, VMUL(LDK(KP290284677), T3M));
+			      T3W = VFNMS(LDK(KP290284677), T3L, VMUL(LDK(KP956940335), T3M));
+			      T3O = VADD(T2Z, T2I);
+			      T3P = VADD(T37, T3a);
+			      T3Q = VFNMS(LDK(KP290284677), T3P, VMUL(LDK(KP956940335), T3O));
+			      T3X = VFMA(LDK(KP290284677), T3O, VMUL(LDK(KP956940335), T3P));
+			 }
+			 T3R = VSUB(T3N, T3Q);
+			 T46 = VADD(T3W, T3X);
+			 T3Y = VSUB(T3W, T3X);
+			 T42 = VADD(T3N, T3Q);
+		    }
+		    {
+			 V T3e, T3x, T44, T47;
+			 T3e = VADD(T1u, T3d);
+			 T3x = VBYI(VADD(T3t, T3w));
+			 ST(&(x[WS(rs, 53)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 T44 = VBYI(VSUB(T42, T43));
+			 T47 = VSUB(T45, T46);
+			 ST(&(x[WS(rs, 29)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 35)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T48, T49, T3y, T3z;
+			 T48 = VBYI(VADD(T43, T42));
+			 T49 = VADD(T45, T46);
+			 ST(&(x[WS(rs, 3)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 61)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
+			 T3y = VSUB(T1u, T3d);
+			 T3z = VBYI(VSUB(T3w, T3t));
+			 ST(&(x[WS(rs, 43)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 21)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T3C, T3F, T3S, T3Z;
+			 T3C = VBYI(VSUB(T3A, T3B));
+			 T3F = VSUB(T3D, T3E);
+			 ST(&(x[WS(rs, 27)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 37)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
+			 T3S = VADD(T3K, T3R);
+			 T3Z = VBYI(VADD(T3V, T3Y));
+			 ST(&(x[WS(rs, 51)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T40, T41, T3G, T3H;
+			 T40 = VSUB(T3K, T3R);
+			 T41 = VBYI(VSUB(T3Y, T3V));
+			 ST(&(x[WS(rs, 45)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
+			 T3G = VBYI(VADD(T3B, T3A));
+			 T3H = VADD(T3D, T3E);
+			 ST(&(x[WS(rs, 5)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 59)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t1bv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_64) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1bv_7 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 36 FP additions, 36 FP multiplications,
+ * (or, 15 additions, 15 multiplications, 21 fused multiply/add),
+ * 42 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, T2, T4, Te, Tc, T9, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, Tf, Td, Ta, T8;
+		    T3 = BYTW(&(W[0]), T2);
+		    T5 = BYTW(&(W[TWVL * 10]), T4);
+		    Tf = BYTW(&(W[TWVL * 6]), Te);
+		    Td = BYTW(&(W[TWVL * 4]), Tc);
+		    Ta = BYTW(&(W[TWVL * 8]), T9);
+		    T8 = BYTW(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tm, Tg, Tk, Tb, Tl;
+			 T6 = VADD(T3, T5);
+			 Tm = VSUB(T3, T5);
+			 Tg = VADD(Td, Tf);
+			 Tk = VSUB(Td, Tf);
+			 Tb = VADD(T8, Ta);
+			 Tl = VSUB(T8, Ta);
+			 {
+			      V Tp, Tx, Tu, Th, Ts, Tn, Tq, Ty;
+			      Tp = VFNMS(LDK(KP356895867), T6, Tg);
+			      Tx = VFMA(LDK(KP554958132), Tk, Tm);
+			      ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+			      Tu = VFNMS(LDK(KP356895867), Tb, T6);
+			      Th = VFNMS(LDK(KP356895867), Tg, Tb);
+			      Ts = VFMA(LDK(KP554958132), Tl, Tk);
+			      Tn = VFNMS(LDK(KP554958132), Tm, Tl);
+			      Tq = VFNMS(LDK(KP692021471), Tp, Tb);
+			      Ty = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Tx, Tl));
+			      {
+				   V Tv, Ti, Tt, To, Tr, Tw, Tj;
+				   Tv = VFNMS(LDK(KP692021471), Tu, Tg);
+				   Ti = VFNMS(LDK(KP692021471), Th, T6);
+				   Tt = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Ts, Tm));
+				   To = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tn, Tk));
+				   Tr = VFNMS(LDK(KP900968867), Tq, T1);
+				   Tw = VFNMS(LDK(KP900968867), Tv, T1);
+				   Tj = VFNMS(LDK(KP900968867), Ti, T1);
+				   ST(&(x[WS(rs, 5)]), VFNMSI(Tt, Tr), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tt, Tr), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Ty, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Ty, Tw), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(To, Tj), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(To, Tj), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1bv_7"), twinstr, &GENUS, {15, 15, 21, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1bv_7 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 36 FP additions, 30 FP multiplications,
+ * (or, 24 additions, 18 multiplications, 12 fused multiply/add),
+ * 21 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V Th, Tf, Ti, T5, Tk, Ta, Tj, To, Tp;
+	       Th = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V Tc, Te, Tb, Td;
+		    Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tc = BYTW(&(W[TWVL * 2]), Tb);
+		    Td = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Te = BYTW(&(W[TWVL * 8]), Td);
+		    Tf = VSUB(Tc, Te);
+		    Ti = VADD(Tc, Te);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T2 = BYTW(&(W[0]), T1);
+		    T3 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T4 = BYTW(&(W[TWVL * 10]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tk = VADD(T2, T4);
+	       }
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T7 = BYTW(&(W[TWVL * 4]), T6);
+		    T8 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T9 = BYTW(&(W[TWVL * 6]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tj = VADD(T7, T9);
+	       }
+	       ST(&(x[0]), VADD(Th, VADD(Tk, VADD(Ti, Tj))), ms, &(x[0]));
+	       To = VBYI(VFNMS(LDK(KP781831482), Ta, VFNMS(LDK(KP433883739), Tf, VMUL(LDK(KP974927912), T5))));
+	       Tp = VFMA(LDK(KP623489801), Tj, VFNMS(LDK(KP900968867), Ti, VFNMS(LDK(KP222520933), Tk, Th)));
+	       ST(&(x[WS(rs, 2)]), VADD(To, Tp), ms, &(x[0]));
+	       ST(&(x[WS(rs, 5)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tl, Tm, Tn;
+		    Tg = VBYI(VFMA(LDK(KP433883739), T5, VFNMS(LDK(KP781831482), Tf, VMUL(LDK(KP974927912), Ta))));
+		    Tl = VFMA(LDK(KP623489801), Ti, VFNMS(LDK(KP222520933), Tj, VFNMS(LDK(KP900968867), Tk, Th)));
+		    ST(&(x[WS(rs, 3)]), VADD(Tg, Tl), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VSUB(Tl, Tg), ms, &(x[0]));
+		    Tm = VBYI(VFMA(LDK(KP781831482), T5, VFMA(LDK(KP974927912), Tf, VMUL(LDK(KP433883739), Ta))));
+		    Tn = VFMA(LDK(KP623489801), Tk, VFNMS(LDK(KP900968867), Tj, VFNMS(LDK(KP222520933), Ti, Th)));
+		    ST(&(x[WS(rs, 1)]), VADD(Tm, Tn), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tn, Tm), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1bv_7"), twinstr, &GENUS, {24, 18, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1bv_8 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTW(&(W[TWVL * 6]), T2);
+		    Ti = BYTW(&(W[TWVL * 2]), Th);
+		    Tk = BYTW(&(W[TWVL * 10]), Tj);
+		    T6 = BYTW(&(W[0]), T5);
+		    T8 = BYTW(&(W[TWVL * 8]), T7);
+		    Tb = BYTW(&(W[TWVL * 12]), Ta);
+		    Td = BYTW(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VADD(Tq, Tr);
+			 Ts = VSUB(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VADD(Tt, Tu);
+			      Tv = VSUB(Tt, Tu);
+			      Tm = VSUB(T9, Te);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tv, Ts), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tv, Ts), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFMA(LDK(KP707106781), Tf, T4);
+				   Tg = VFNMS(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1bv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1bv_8 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V Tl, Tq, Tg, Tr, T5, Tt, Ta, Tu, Ti, Tk, Tj;
+	       Ti = LD(&(x[0]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tk = BYTW(&(W[TWVL * 6]), Tj);
+	       Tl = VSUB(Ti, Tk);
+	       Tq = VADD(Ti, Tk);
+	       {
+		    V Td, Tf, Tc, Te;
+		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Td = BYTW(&(W[TWVL * 2]), Tc);
+		    Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tf = BYTW(&(W[TWVL * 10]), Te);
+		    Tg = VSUB(Td, Tf);
+		    Tr = VADD(Td, Tf);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T2 = BYTW(&(W[0]), T1);
+		    T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T4 = BYTW(&(W[TWVL * 8]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tt = VADD(T2, T4);
+	       }
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    T7 = BYTW(&(W[TWVL * 12]), T6);
+		    T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[TWVL * 4]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tu = VADD(T7, T9);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VSUB(Tq, Tr);
+		    Tv = VBYI(VSUB(Tt, Tu));
+		    ST(&(x[WS(rs, 6)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VADD(Tq, Tr);
+		    Tx = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Th, To, Tn, Tp, Tb, Tm;
+			 Tb = VMUL(LDK(KP707106781), VSUB(T5, Ta));
+			 Th = VBYI(VSUB(Tb, Tg));
+			 To = VBYI(VADD(Tg, Tb));
+			 Tm = VMUL(LDK(KP707106781), VADD(T5, Ta));
+			 Tn = VSUB(Tl, Tm);
+			 Tp = VADD(Tl, Tm);
+			 ST(&(x[WS(rs, 3)]), VADD(Th, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(Tn, Th), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1bv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 54 FP additions, 54 FP multiplications,
+ * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
+ * 67 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T3, T5, T9, Tn, Tb, Td, Th, Tj, Tx, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T4, T8, Tm;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V Ta, Tc, Tg, Ti;
+			 Ta = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T3 = BYTW(&(W[TWVL * 4]), T2);
+			 T5 = BYTW(&(W[TWVL * 10]), T4);
+			 T9 = BYTW(&(W[TWVL * 2]), T8);
+			 Tn = BYTW(&(W[0]), Tm);
+			 Tb = BYTW(&(W[TWVL * 8]), Ta);
+			 Td = BYTW(&(W[TWVL * 14]), Tc);
+			 Th = BYTW(&(W[TWVL * 6]), Tg);
+			 Tj = BYTW(&(W[TWVL * 12]), Ti);
+		    }
+	       }
+	       Tx = VSUB(T3, T5);
+	       T6 = VADD(T3, T5);
+	       {
+		    V Tl, Te, Tk, To, T7, TN;
+		    Tl = VSUB(Td, Tb);
+		    Te = VADD(Tb, Td);
+		    Tk = VSUB(Th, Tj);
+		    To = VADD(Th, Tj);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    TN = VADD(T1, T6);
+		    {
+			 V Tf, TP, Tp, TO;
+			 Tf = VFNMS(LDK(KP500000000), Te, T9);
+			 TP = VADD(T9, Te);
+			 Tp = VFNMS(LDK(KP500000000), To, Tn);
+			 TO = VADD(Tn, To);
+			 {
+			      V Tz, TC, Tu, TD, TA, Tq, TQ, TS;
+			      Tz = VFNMS(LDK(KP152703644), Tl, Tf);
+			      TC = VFMA(LDK(KP203604859), Tf, Tl);
+			      Tu = VFNMS(LDK(KP439692620), Tk, Tf);
+			      TD = VFNMS(LDK(KP726681596), Tk, Tp);
+			      TA = VFMA(LDK(KP968908795), Tp, Tk);
+			      Tq = VFNMS(LDK(KP586256827), Tp, Tl);
+			      TQ = VADD(TO, TP);
+			      TS = VMUL(LDK(KP866025403), VSUB(TO, TP));
+			      {
+				   V TI, TB, TH, TE, Tr, TR, Tw, Tv;
+				   Tv = VFNMS(LDK(KP420276625), Tu, Tl);
+				   TI = VFMA(LDK(KP673648177), TA, Tz);
+				   TB = VFNMS(LDK(KP673648177), TA, Tz);
+				   TH = VFNMS(LDK(KP898197570), TD, TC);
+				   TE = VFMA(LDK(KP898197570), TD, TC);
+				   Tr = VFNMS(LDK(KP347296355), Tq, Tk);
+				   ST(&(x[0]), VADD(TQ, TN), ms, &(x[0]));
+				   TR = VFNMS(LDK(KP500000000), TQ, TN);
+				   Tw = VFNMS(LDK(KP826351822), Tv, Tp);
+				   {
+					V TM, TL, TF, TJ, Ts, Ty, TG, TK, Tt;
+					TM = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tx, TI));
+					TL = VFMA(LDK(KP852868531), TE, T7);
+					TF = VFNMS(LDK(KP500000000), TE, TB);
+					TJ = VFMA(LDK(KP666666666), TI, TH);
+					Ts = VFNMS(LDK(KP907603734), Tr, Tf);
+					ST(&(x[WS(rs, 6)]), VFNMSI(TS, TR), ms, &(x[0]));
+					ST(&(x[WS(rs, 3)]), VFMAI(TS, TR), ms, &(x[WS(rs, 1)]));
+					Ty = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tx, Tw));
+					ST(&(x[WS(rs, 8)]), VFNMSI(TM, TL), ms, &(x[0]));
+					ST(&(x[WS(rs, 1)]), VFMAI(TM, TL), ms, &(x[WS(rs, 1)]));
+					TG = VFMA(LDK(KP852868531), TF, T7);
+					TK = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TJ, Tx));
+					Tt = VFNMS(LDK(KP939692620), Ts, T7);
+					ST(&(x[WS(rs, 5)]), VFNMSI(TK, TG), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TK, TG), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(Ty, Tt), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(Ty, Tt), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include t1b.h -sign 1 */
+
+/*
+ * This function contains 54 FP additions, 42 FP multiplications,
+ * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
+ * 38 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "t1b.h"
+
+static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T6, Tu, Tg, Tf, TD, Tq, Tp, TE;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTW(&(W[TWVL * 4]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTW(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    Tu = VMUL(LDK(KP866025403), VSUB(T3, T5));
+	       }
+	       {
+		    V T9, Td, Tb, T8, Tc, Ta, Te;
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[0]), T8);
+		    Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTW(&(W[TWVL * 12]), Tc);
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTW(&(W[TWVL * 6]), Ta);
+		    Tg = VSUB(Tb, Td);
+		    Te = VADD(Tb, Td);
+		    Tf = VFNMS(LDK(KP500000000), Te, T9);
+		    TD = VADD(T9, Te);
+	       }
+	       {
+		    V Tj, Tn, Tl, Ti, Tm, Tk, To;
+		    Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tj = BYTW(&(W[TWVL * 2]), Ti);
+		    Tm = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tn = BYTW(&(W[TWVL * 14]), Tm);
+		    Tk = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tl = BYTW(&(W[TWVL * 8]), Tk);
+		    Tq = VSUB(Tl, Tn);
+		    To = VADD(Tl, Tn);
+		    Tp = VFNMS(LDK(KP500000000), To, Tj);
+		    TE = VADD(Tj, To);
+	       }
+	       {
+		    V TF, TG, TH, TI;
+		    TF = VBYI(VMUL(LDK(KP866025403), VSUB(TD, TE)));
+		    TG = VADD(T1, T6);
+		    TH = VADD(TD, TE);
+		    TI = VFNMS(LDK(KP500000000), TH, TG);
+		    ST(&(x[WS(rs, 3)]), VADD(TF, TI), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[0]), VADD(TG, TH), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TI, TF), ms, &(x[0]));
+	       }
+	       {
+		    V TC, Tv, Tw, Tx, Th, Tr, Ts, T7, TB;
+		    TC = VBYI(VSUB(VFMA(LDK(KP984807753), Tf, VFMA(LDK(KP813797681), Tq, VFNMS(LDK(KP150383733), Tg, VMUL(LDK(KP342020143), Tp)))), Tu));
+		    Tv = VFMA(LDK(KP663413948), Tg, VMUL(LDK(KP642787609), Tf));
+		    Tw = VFMA(LDK(KP150383733), Tq, VMUL(LDK(KP984807753), Tp));
+		    Tx = VADD(Tv, Tw);
+		    Th = VFNMS(LDK(KP556670399), Tg, VMUL(LDK(KP766044443), Tf));
+		    Tr = VFNMS(LDK(KP852868531), Tq, VMUL(LDK(KP173648177), Tp));
+		    Ts = VADD(Th, Tr);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    TB = VFMA(LDK(KP852868531), Tg, VFMA(LDK(KP173648177), Tf, VFMA(LDK(KP296198132), Tq, VFNMS(LDK(KP939692620), Tp, T7))));
+		    ST(&(x[WS(rs, 7)]), VSUB(TB, TC), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(TB, TC), ms, &(x[0]));
+		    {
+			 V Tt, Ty, Tz, TA;
+			 Tt = VADD(T7, Ts);
+			 Ty = VBYI(VADD(Tu, Tx));
+			 ST(&(x[WS(rs, 8)]), VSUB(Tt, Ty), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(Tt, Ty), ms, &(x[WS(rs, 1)]));
+			 Tz = VBYI(VADD(Tu, VFNMS(LDK(KP500000000), Tx, VMUL(LDK(KP866025403), VSUB(Th, Tr)))));
+			 TA = VFMA(LDK(KP866025403), VSUB(Tw, Tv), VFNMS(LDK(KP500000000), Ts, T7));
+			 ST(&(x[WS(rs, 4)]), VADD(Tz, TA), ms, &(x[0]));
+			 ST(&(x[WS(rs, 5)]), VSUB(TA, Tz), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1bv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1bv_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fuv_10 -include t1fu.h */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTWJ(&(W[TWVL * 8]), T2);
+			      Th = BYTWJ(&(W[TWVL * 6]), Tg);
+			      To = BYTWJ(&(W[0]), Tn);
+			      Tj = BYTWJ(&(W[TWVL * 16]), Ti);
+			      Tm = BYTWJ(&(W[TWVL * 10]), Tl);
+			      T6 = BYTWJ(&(W[TWVL * 2]), T5);
+			      Td = BYTWJ(&(W[TWVL * 4]), Tc);
+			      T8 = BYTWJ(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTWJ(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1fuv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fuv_10 -include t1fu.h */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tr, TH, Tg, Tl, Tm, TA, TB, TJ, T5, Ta, Tb, TD, TE, TI, To;
+	       V Tq, Tp;
+	       To = LD(&(x[0]), ms, &(x[0]));
+	       Tp = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tq = BYTWJ(&(W[TWVL * 8]), Tp);
+	       Tr = VSUB(To, Tq);
+	       TH = VADD(To, Tq);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTWJ(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTWJ(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VADD(Tg, Tl);
+		    TA = VADD(Td, Tf);
+		    TB = VADD(Ti, Tk);
+		    TJ = VADD(TA, TB);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTWJ(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTWJ(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VADD(T5, Ta);
+		    TD = VADD(T2, T4);
+		    TE = VADD(T7, T9);
+		    TI = VADD(TD, TE);
+	       }
+	       {
+		    V Tn, Ts, Tt, Tx, Tz, Tv, Tw, Ty, Tu;
+		    Tn = VMUL(LDK(KP559016994), VSUB(Tb, Tm));
+		    Ts = VADD(Tb, Tm);
+		    Tt = VFNMS(LDK(KP250000000), Ts, Tr);
+		    Tv = VSUB(T5, Ta);
+		    Tw = VSUB(Tg, Tl);
+		    Tx = VBYI(VFMA(LDK(KP951056516), Tv, VMUL(LDK(KP587785252), Tw)));
+		    Tz = VBYI(VFNMS(LDK(KP587785252), Tv, VMUL(LDK(KP951056516), Tw)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tr, Ts), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tt, Tn);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tu = VADD(Tn, Tt);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tu, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VADD(Tx, Tu), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TO, TC, TF, TP, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP587785252), TF, VMUL(LDK(KP951056516), TC)));
+		    TO = VBYI(VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TP = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1fuv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1fuv_2 -include t1fu.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1fuv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1fuv_2 -include t1fu.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1fuv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1fuv_3 -include t1fu.h */
+
+/*
+ * This function contains 8 FP additions, 8 FP multiplications,
+ * (or, 5 additions, 5 multiplications, 3 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T2, T4;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, T8, T6, T7;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T5 = BYTWJ(&(W[TWVL * 2]), T4);
+		    T8 = VMUL(LDK(KP866025403), VSUB(T5, T3));
+		    T6 = VADD(T3, T5);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T8, T7), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VFNMSI(T8, T7), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1fuv_3"), twinstr, &GENUS, {5, 5, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1fuv_3 -include t1fu.h */
+
+/*
+ * This function contains 8 FP additions, 6 FP multiplications,
+ * (or, 7 additions, 5 multiplications, 1 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T3, T5, T6, T2, T4, T7, T8;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = BYTWJ(&(W[TWVL * 2]), T4);
+	       T6 = VADD(T3, T5);
+	       ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+	       T7 = VFNMS(LDK(KP500000000), T6, T1);
+	       T8 = VBYI(VMUL(LDK(KP866025403), VSUB(T5, T3)));
+	       ST(&(x[WS(rs, 2)]), VSUB(T7, T8), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(T7, T8), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1fuv_3"), twinstr, &GENUS, {7, 5, 1, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1fuv_4 -include t1fu.h */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1fuv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1fuv_4 -include t1fu.h */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 1)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1fuv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1fuv_5 -include t1fu.h */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    Ta = BYTWJ(&(W[TWVL * 4]), T9);
+		    T5 = BYTWJ(&(W[TWVL * 6]), T4);
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFMAI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFNMSI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFNMSI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1fuv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1fuv_5 -include t1fu.h */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tc, Tg, Th, T5, Ta, Td;
+	       Tc = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTWJ(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 2]), T6);
+		    }
+		    Tg = VSUB(T2, T4);
+		    Th = VSUB(T7, T9);
+		    T5 = VADD(T2, T4);
+		    Ta = VADD(T7, T9);
+		    Td = VADD(T5, Ta);
+	       }
+	       ST(&(x[0]), VADD(Tc, Td), ms, &(x[0]));
+	       {
+		    V Ti, Tj, Tf, Tk, Tb, Te;
+		    Ti = VBYI(VFMA(LDK(KP951056516), Tg, VMUL(LDK(KP587785252), Th)));
+		    Tj = VBYI(VFNMS(LDK(KP587785252), Tg, VMUL(LDK(KP951056516), Th)));
+		    Tb = VMUL(LDK(KP559016994), VSUB(T5, Ta));
+		    Te = VFNMS(LDK(KP250000000), Td, Tc);
+		    Tf = VADD(Tb, Te);
+		    Tk = VSUB(Te, Tb);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tf, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VADD(Ti, Tf), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1fuv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1fuv_6 -include t1fu.h */
+
+/*
+ * This function contains 23 FP additions, 18 FP multiplications,
+ * (or, 17 additions, 12 multiplications, 6 fused multiply/add),
+ * 27 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T1, T2, Ta, Tc, T5, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Tb, Td, T6, T8;
+		    T3 = BYTWJ(&(W[TWVL * 4]), T2);
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Td = BYTWJ(&(W[0]), Tc);
+		    T6 = BYTWJ(&(W[TWVL * 2]), T5);
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    {
+			 V Ti, T4, Tk, Te, Tj, T9;
+			 Ti = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tk = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tj = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 {
+			      V Tl, Tn, Tf, Th, Tm, Tg;
+			      Tl = VADD(Tj, Tk);
+			      Tn = VMUL(LDK(KP866025403), VSUB(Tk, Tj));
+			      Tf = VADD(T9, Te);
+			      Th = VMUL(LDK(KP866025403), VSUB(Te, T9));
+			      ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+			      Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+			      ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+			      Tg = VFNMS(LDK(KP500000000), Tf, T4);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFNMSI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1fuv_6"), twinstr, &GENUS, {17, 12, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1fuv_6 -include t1fu.h */
+
+/*
+ * This function contains 23 FP additions, 14 FP multiplications,
+ * (or, 21 additions, 12 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T4, Ti, Te, Tk, T9, Tj, T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[TWVL * 4]), T2);
+	       T4 = VSUB(T1, T3);
+	       Ti = VADD(T1, T3);
+	       {
+		    V Tb, Td, Ta, Tc;
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[0]), Tc);
+		    Te = VSUB(Tb, Td);
+		    Tk = VADD(Tb, Td);
+	       }
+	       {
+		    V T6, T8, T5, T7;
+		    T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T6 = BYTWJ(&(W[TWVL * 2]), T5);
+		    T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    T9 = VSUB(T6, T8);
+		    Tj = VADD(T6, T8);
+	       }
+	       {
+		    V Th, Tf, Tg, Tn, Tl, Tm;
+		    Th = VBYI(VMUL(LDK(KP866025403), VSUB(Te, T9)));
+		    Tf = VADD(T9, Te);
+		    Tg = VFNMS(LDK(KP500000000), Tf, T4);
+		    ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(Tg, Th), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VSUB(Tg, Th), ms, &(x[WS(rs, 1)]));
+		    Tn = VBYI(VMUL(LDK(KP866025403), VSUB(Tk, Tj)));
+		    Tl = VADD(Tj, Tk);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+		    ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Tm, Tn), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1fuv_6"), twinstr, &GENUS, {21, 12, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:37:59 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fuv_7 -include t1fu.h */
+
+/*
+ * This function contains 36 FP additions, 36 FP multiplications,
+ * (or, 15 additions, 15 multiplications, 21 fused multiply/add),
+ * 42 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, T2, T4, Te, Tc, T9, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, Tf, Td, Ta, T8;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    Tf = BYTWJ(&(W[TWVL * 6]), Te);
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Ta = BYTWJ(&(W[TWVL * 8]), T9);
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tk, Tg, Tl, Tb, Tm;
+			 T6 = VADD(T3, T5);
+			 Tk = VSUB(T5, T3);
+			 Tg = VADD(Td, Tf);
+			 Tl = VSUB(Tf, Td);
+			 Tb = VADD(T8, Ta);
+			 Tm = VSUB(Ta, T8);
+			 {
+			      V Th, Ts, Tp, Tu, Tn, Tx, Ti, Tt;
+			      Th = VFNMS(LDK(KP356895867), T6, Tg);
+			      Ts = VFMA(LDK(KP554958132), Tl, Tk);
+			      ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+			      Tp = VFNMS(LDK(KP356895867), Tb, T6);
+			      Tu = VFNMS(LDK(KP356895867), Tg, Tb);
+			      Tn = VFMA(LDK(KP554958132), Tm, Tl);
+			      Tx = VFNMS(LDK(KP554958132), Tk, Tm);
+			      Ti = VFNMS(LDK(KP692021471), Th, Tb);
+			      Tt = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Ts, Tm));
+			      {
+				   V Tq, Tv, To, Ty, Tj, Tr, Tw;
+				   Tq = VFNMS(LDK(KP692021471), Tp, Tg);
+				   Tv = VFNMS(LDK(KP692021471), Tu, T6);
+				   To = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tn, Tk));
+				   Ty = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tx, Tl));
+				   Tj = VFNMS(LDK(KP900968867), Ti, T1);
+				   Tr = VFNMS(LDK(KP900968867), Tq, T1);
+				   Tw = VFNMS(LDK(KP900968867), Tv, T1);
+				   ST(&(x[WS(rs, 2)]), VFMAI(To, Tj), ms, &(x[0]));
+				   ST(&(x[WS(rs, 5)]), VFNMSI(To, Tj), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tt, Tr), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tt, Tr), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Ty, Tw), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(Ty, Tw), ms, &(x[0]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1fuv_7"), twinstr, &GENUS, {15, 15, 21, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fuv_7 -include t1fu.h */
+
+/*
+ * This function contains 36 FP additions, 30 FP multiplications,
+ * (or, 24 additions, 18 multiplications, 12 fused multiply/add),
+ * 21 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, Tg, Tj, T6, Ti, Tb, Tk, Tp, To;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V Td, Tf, Tc, Te;
+		    Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tf = BYTWJ(&(W[TWVL * 6]), Te);
+		    Tg = VADD(Td, Tf);
+		    Tj = VSUB(Tf, Td);
+	       }
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    Ti = VSUB(T5, T3);
+	       }
+	       {
+		    V T8, Ta, T7, T9;
+		    T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Ta = BYTWJ(&(W[TWVL * 8]), T9);
+		    Tb = VADD(T8, Ta);
+		    Tk = VSUB(Ta, T8);
+	       }
+	       ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+	       Tp = VBYI(VFMA(LDK(KP433883739), Ti, VFNMS(LDK(KP781831482), Tk, VMUL(LDK(KP974927912), Tj))));
+	       To = VFMA(LDK(KP623489801), Tb, VFNMS(LDK(KP222520933), Tg, VFNMS(LDK(KP900968867), T6, T1)));
+	       ST(&(x[WS(rs, 4)]), VSUB(To, Tp), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tl, Th, Tn, Tm;
+		    Tl = VBYI(VFNMS(LDK(KP781831482), Tj, VFNMS(LDK(KP433883739), Tk, VMUL(LDK(KP974927912), Ti))));
+		    Th = VFMA(LDK(KP623489801), Tg, VFNMS(LDK(KP900968867), Tb, VFNMS(LDK(KP222520933), T6, T1)));
+		    ST(&(x[WS(rs, 5)]), VSUB(Th, Tl), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(Th, Tl), ms, &(x[0]));
+		    Tn = VBYI(VFMA(LDK(KP781831482), Ti, VFMA(LDK(KP974927912), Tk, VMUL(LDK(KP433883739), Tj))));
+		    Tm = VFMA(LDK(KP623489801), T6, VFNMS(LDK(KP900968867), Tg, VFNMS(LDK(KP222520933), Tb, T1)));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VADD(Tm, Tn), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1fuv_7"), twinstr, &GENUS, {24, 18, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:00 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fuv_8 -include t1fu.h */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTWJ(&(W[TWVL * 6]), T2);
+		    Ti = BYTWJ(&(W[TWVL * 2]), Th);
+		    Tk = BYTWJ(&(W[TWVL * 10]), Tj);
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VSUB(Tq, Tr);
+			 Ts = VADD(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VSUB(Tu, Tt);
+			      Tv = VADD(Tt, Tu);
+			      Tm = VSUB(Te, T9);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFNMS(LDK(KP707106781), Tf, T4);
+				   Tg = VFMA(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 5)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1fuv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fuv_8 -include t1fu.h */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T4, Tq, Tm, Tr, T9, Tt, Te, Tu, T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 6]), T2);
+	       T4 = VSUB(T1, T3);
+	       Tq = VADD(T1, T3);
+	       {
+		    V Tj, Tl, Ti, Tk;
+		    Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tj = BYTWJ(&(W[TWVL * 2]), Ti);
+		    Tk = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tl = BYTWJ(&(W[TWVL * 10]), Tk);
+		    Tm = VSUB(Tj, Tl);
+		    Tr = VADD(Tj, Tl);
+	       }
+	       {
+		    V T6, T8, T5, T7;
+		    T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    T9 = VSUB(T6, T8);
+		    Tt = VADD(T6, T8);
+	       }
+	       {
+		    V Tb, Td, Ta, Tc;
+		    Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Te = VSUB(Tb, Td);
+		    Tu = VADD(Tb, Td);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VADD(Tq, Tr);
+		    Tv = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VSUB(Tq, Tr);
+		    Tx = VBYI(VSUB(Tu, Tt));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Tg, To, Tn, Tp, Tf, Th;
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VADD(T4, Tf);
+			 To = VSUB(T4, Tf);
+			 Th = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 Tn = VBYI(VSUB(Th, Tm));
+			 Tp = VBYI(VADD(Tm, Th));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1fuv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fuv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fuv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:00 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fuv_9 -include t1fu.h */
+
+/*
+ * This function contains 54 FP additions, 54 FP multiplications,
+ * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
+ * 67 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T3, T5, T9, Th, Tb, Td, Tj, Tl, TD, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T4, T8, Tg;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    {
+			 V Ta, Tc, Ti, Tk;
+			 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T3 = BYTWJ(&(W[TWVL * 4]), T2);
+			 T5 = BYTWJ(&(W[TWVL * 10]), T4);
+			 T9 = BYTWJ(&(W[0]), T8);
+			 Th = BYTWJ(&(W[TWVL * 2]), Tg);
+			 Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+			 Td = BYTWJ(&(W[TWVL * 12]), Tc);
+			 Tj = BYTWJ(&(W[TWVL * 8]), Ti);
+			 Tl = BYTWJ(&(W[TWVL * 14]), Tk);
+		    }
+	       }
+	       TD = VSUB(T5, T3);
+	       T6 = VADD(T3, T5);
+	       {
+		    V Tt, Te, Tu, Tm, Tr, T7;
+		    Tt = VSUB(Tb, Td);
+		    Te = VADD(Tb, Td);
+		    Tu = VSUB(Tl, Tj);
+		    Tm = VADD(Tj, Tl);
+		    Tr = VFNMS(LDK(KP500000000), T6, T1);
+		    T7 = VADD(T1, T6);
+		    {
+			 V Tv, Tf, Ts, Tn;
+			 Tv = VFNMS(LDK(KP500000000), Te, T9);
+			 Tf = VADD(T9, Te);
+			 Ts = VFNMS(LDK(KP500000000), Tm, Th);
+			 Tn = VADD(Th, Tm);
+			 {
+			      V TG, TK, Tw, TJ, TF, TA, To, Tq;
+			      TG = VFNMS(LDK(KP726681596), Tt, Tv);
+			      TK = VFMA(LDK(KP968908795), Tv, Tt);
+			      Tw = VFNMS(LDK(KP586256827), Tv, Tu);
+			      TJ = VFNMS(LDK(KP152703644), Tu, Ts);
+			      TF = VFMA(LDK(KP203604859), Ts, Tu);
+			      TA = VFNMS(LDK(KP439692620), Tt, Ts);
+			      To = VADD(Tf, Tn);
+			      Tq = VMUL(LDK(KP866025403), VSUB(Tn, Tf));
+			      {
+				   V TQ, TH, TL, TN, TB, Tp, Ty, TI, Tx;
+				   Tx = VFNMS(LDK(KP347296355), Tw, Tt);
+				   TQ = VFNMS(LDK(KP898197570), TG, TF);
+				   TH = VFMA(LDK(KP898197570), TG, TF);
+				   TL = VFMA(LDK(KP673648177), TK, TJ);
+				   TN = VFNMS(LDK(KP673648177), TK, TJ);
+				   TB = VFNMS(LDK(KP420276625), TA, Tu);
+				   ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
+				   Tp = VFNMS(LDK(KP500000000), To, T7);
+				   Ty = VFNMS(LDK(KP907603734), Tx, Ts);
+				   TI = VFMA(LDK(KP852868531), TH, Tr);
+				   {
+					V TO, TR, TM, TC, Tz, TP, TS, TE;
+					TO = VFNMS(LDK(KP500000000), TH, TN);
+					TR = VFMA(LDK(KP666666666), TL, TQ);
+					TM = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), TD, TL));
+					TC = VFNMS(LDK(KP826351822), TB, Tv);
+					ST(&(x[WS(rs, 6)]), VFNMSI(Tq, Tp), ms, &(x[0]));
+					ST(&(x[WS(rs, 3)]), VFMAI(Tq, Tp), ms, &(x[WS(rs, 1)]));
+					Tz = VFNMS(LDK(KP939692620), Ty, Tr);
+					TP = VFMA(LDK(KP852868531), TO, Tr);
+					TS = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TR, TD));
+					ST(&(x[WS(rs, 8)]), VFMAI(TM, TI), ms, &(x[0]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(TM, TI), ms, &(x[WS(rs, 1)]));
+					TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), TD, TC));
+					ST(&(x[WS(rs, 4)]), VFMAI(TS, TP), ms, &(x[0]));
+					ST(&(x[WS(rs, 5)]), VFNMSI(TS, TP), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(TE, Tz), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 2)]), VFNMSI(TE, Tz), ms, &(x[0]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1fuv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fuv_9 -include t1fu.h */
+
+/*
+ * This function contains 54 FP additions, 42 FP multiplications,
+ * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
+ * 38 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "t1fu.h"
+
+static void t1fuv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T6, TA, Tt, Tf, Ts, Tw, Tn, Tv;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTWJ(&(W[TWVL * 4]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    TA = VMUL(LDK(KP866025403), VSUB(T5, T3));
+	       }
+	       {
+		    V T9, Td, Tb, T8, Tc, Ta, Te;
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTWJ(&(W[0]), T8);
+		    Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 12]), Tc);
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Tt = VSUB(Td, Tb);
+		    Te = VADD(Tb, Td);
+		    Tf = VADD(T9, Te);
+		    Ts = VFNMS(LDK(KP500000000), Te, T9);
+	       }
+	       {
+		    V Th, Tl, Tj, Tg, Tk, Ti, Tm;
+		    Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Th = BYTWJ(&(W[TWVL * 2]), Tg);
+		    Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tl = BYTWJ(&(W[TWVL * 14]), Tk);
+		    Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tj = BYTWJ(&(W[TWVL * 8]), Ti);
+		    Tw = VSUB(Tl, Tj);
+		    Tm = VADD(Tj, Tl);
+		    Tn = VADD(Th, Tm);
+		    Tv = VFNMS(LDK(KP500000000), Tm, Th);
+	       }
+	       {
+		    V Tq, T7, To, Tp;
+		    Tq = VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Tf)));
+		    T7 = VADD(T1, T6);
+		    To = VADD(Tf, Tn);
+		    Tp = VFNMS(LDK(KP500000000), To, T7);
+		    ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VADD(Tp, Tq), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tp, Tq), ms, &(x[0]));
+	       }
+	       {
+		    V TI, TB, TC, TD, Tu, Tx, Ty, Tr, TH;
+		    TI = VBYI(VSUB(VFNMS(LDK(KP342020143), Tv, VFNMS(LDK(KP150383733), Tt, VFNMS(LDK(KP984807753), Ts, VMUL(LDK(KP813797681), Tw)))), TA));
+		    TB = VFNMS(LDK(KP642787609), Ts, VMUL(LDK(KP663413948), Tt));
+		    TC = VFNMS(LDK(KP984807753), Tv, VMUL(LDK(KP150383733), Tw));
+		    TD = VADD(TB, TC);
+		    Tu = VFMA(LDK(KP766044443), Ts, VMUL(LDK(KP556670399), Tt));
+		    Tx = VFMA(LDK(KP173648177), Tv, VMUL(LDK(KP852868531), Tw));
+		    Ty = VADD(Tu, Tx);
+		    Tr = VFNMS(LDK(KP500000000), T6, T1);
+		    TH = VFMA(LDK(KP173648177), Ts, VFNMS(LDK(KP296198132), Tw, VFNMS(LDK(KP939692620), Tv, VFNMS(LDK(KP852868531), Tt, Tr))));
+		    ST(&(x[WS(rs, 7)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(TH, TI), ms, &(x[0]));
+		    {
+			 V Tz, TE, TF, TG;
+			 Tz = VADD(Tr, Ty);
+			 TE = VBYI(VADD(TA, TD));
+			 ST(&(x[WS(rs, 8)]), VSUB(Tz, TE), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(TE, Tz), ms, &(x[WS(rs, 1)]));
+			 TF = VFMA(LDK(KP866025403), VSUB(TB, TC), VFNMS(LDK(KP500000000), Ty, Tr));
+			 TG = VBYI(VADD(TA, VFNMS(LDK(KP500000000), TD, VMUL(LDK(KP866025403), VSUB(Tx, Tu)))));
+			 ST(&(x[WS(rs, 5)]), VSUB(TF, TG), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(TF, TG), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1fuv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fuv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1fuv_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include t1f.h */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTWJ(&(W[TWVL * 8]), T2);
+			      Th = BYTWJ(&(W[TWVL * 6]), Tg);
+			      To = BYTWJ(&(W[0]), Tn);
+			      Tj = BYTWJ(&(W[TWVL * 16]), Ti);
+			      Tm = BYTWJ(&(W[TWVL * 10]), Tl);
+			      T6 = BYTWJ(&(W[TWVL * 2]), T5);
+			      Td = BYTWJ(&(W[TWVL * 4]), Tc);
+			      T8 = BYTWJ(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTWJ(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include t1f.h */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tr, TH, Tg, Tl, Tm, TA, TB, TJ, T5, Ta, Tb, TD, TE, TI, To;
+	       V Tq, Tp;
+	       To = LD(&(x[0]), ms, &(x[0]));
+	       Tp = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tq = BYTWJ(&(W[TWVL * 8]), Tp);
+	       Tr = VSUB(To, Tq);
+	       TH = VADD(To, Tq);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTWJ(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTWJ(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VADD(Tg, Tl);
+		    TA = VADD(Td, Tf);
+		    TB = VADD(Ti, Tk);
+		    TJ = VADD(TA, TB);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTWJ(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTWJ(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VADD(T5, Ta);
+		    TD = VADD(T2, T4);
+		    TE = VADD(T7, T9);
+		    TI = VADD(TD, TE);
+	       }
+	       {
+		    V Tn, Ts, Tt, Tx, Tz, Tv, Tw, Ty, Tu;
+		    Tn = VMUL(LDK(KP559016994), VSUB(Tb, Tm));
+		    Ts = VADD(Tb, Tm);
+		    Tt = VFNMS(LDK(KP250000000), Ts, Tr);
+		    Tv = VSUB(T5, Ta);
+		    Tw = VSUB(Tg, Tl);
+		    Tx = VBYI(VFMA(LDK(KP951056516), Tv, VMUL(LDK(KP587785252), Tw)));
+		    Tz = VBYI(VFNMS(LDK(KP587785252), Tv, VMUL(LDK(KP951056516), Tw)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tr, Ts), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tt, Tn);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tu = VADD(Tn, Tt);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tu, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VADD(Tx, Tu), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TO, TC, TF, TP, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP587785252), TF, VMUL(LDK(KP951056516), TC)));
+		    TO = VBYI(VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TP = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:03 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include t1f.h */
+
+/*
+ * This function contains 59 FP additions, 42 FP multiplications,
+ * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
+ * 41 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
+	       V Tq, Ti, T7, TQ, Tu, TA, TU, Tk, TR, Tf, TE, TM;
+	       {
+		    V T9, TC, Tj, TD, Te;
+		    {
+			 V T1, T4, T2, Tm, Tx, To;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T5, T3, Tn, Ty, Tp, Td, Tb, T8, Tc, Ta;
+			      T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T5 = BYTWJ(&(W[TWVL * 14]), T4);
+			      T3 = BYTWJ(&(W[TWVL * 6]), T2);
+			      Tn = BYTWJ(&(W[0]), Tm);
+			      Ty = BYTWJ(&(W[TWVL * 16]), Tx);
+			      Tp = BYTWJ(&(W[TWVL * 8]), To);
+			      T9 = BYTWJ(&(W[TWVL * 10]), T8);
+			      Td = BYTWJ(&(W[TWVL * 2]), Tc);
+			      Tb = BYTWJ(&(W[TWVL * 18]), Ta);
+			      {
+				   V Th, T6, Tt, Tz;
+				   Th = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   TC = VSUB(T5, T3);
+				   T6 = VADD(T3, T5);
+				   Tt = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   Tz = VADD(Tn, Tp);
+				   Tq = VSUB(Tn, Tp);
+				   Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TD = VSUB(Td, Tb);
+				   Te = VADD(Tb, Td);
+				   Ti = BYTWJ(&(W[TWVL * 20]), Th);
+				   T7 = VFNMS(LDK(KP500000000), T6, T1);
+				   TQ = VADD(T1, T6);
+				   Tu = BYTWJ(&(W[TWVL * 4]), Tt);
+				   TA = VFNMS(LDK(KP500000000), Tz, Ty);
+				   TU = VADD(Ty, Tz);
+			      }
+			 }
+		    }
+		    Tk = BYTWJ(&(W[TWVL * 12]), Tj);
+		    TR = VADD(T9, Te);
+		    Tf = VFNMS(LDK(KP500000000), Te, T9);
+		    TE = VSUB(TC, TD);
+		    TM = VADD(TC, TD);
+	       }
+	       {
+		    V Tv, Tl, TI, Tg, TW, TS;
+		    Tv = VADD(Tk, Ti);
+		    Tl = VSUB(Ti, Tk);
+		    TI = VADD(T7, Tf);
+		    Tg = VSUB(T7, Tf);
+		    TW = VADD(TQ, TR);
+		    TS = VSUB(TQ, TR);
+		    {
+			 V TT, Tw, TL, Tr;
+			 TT = VADD(Tu, Tv);
+			 Tw = VFNMS(LDK(KP500000000), Tv, Tu);
+			 TL = VSUB(Tl, Tq);
+			 Tr = VADD(Tl, Tq);
+			 {
+			      V TP, TN, TG, Ts, TO, TK, TH, TF;
+			      {
+				   V TX, TV, TJ, TB;
+				   TX = VADD(TT, TU);
+				   TV = VSUB(TT, TU);
+				   TJ = VADD(Tw, TA);
+				   TB = VSUB(Tw, TA);
+				   TP = VMUL(LDK(KP866025403), VADD(TM, TL));
+				   TN = VMUL(LDK(KP866025403), VSUB(TL, TM));
+				   TG = VFNMS(LDK(KP866025403), Tr, Tg);
+				   Ts = VFMA(LDK(KP866025403), Tr, Tg);
+				   ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
+				   ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
+				   TO = VADD(TI, TJ);
+				   TK = VSUB(TI, TJ);
+				   TH = VFMA(LDK(KP866025403), TE, TB);
+				   TF = VFNMS(LDK(KP866025403), TE, TB);
+			      }
+			      ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
+			      ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
+			      ST(&(x[WS(rs, 10)]), VFNMSI(TN, TK), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFNMSI(TH, TG), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFMAI(TH, TG), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 11)]), VFMAI(TF, Ts), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFNMSI(TF, Ts), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_12) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_12, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include t1f.h */
+
+/*
+ * This function contains 59 FP additions, 30 FP multiplications,
+ * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
+ * 28 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
+	       V T1, TH, T6, TA, Tq, TE, Tv, TL, T9, TI, Te, TB, Ti, TD, Tn;
+	       V TK;
+	       {
+		    V T5, T3, T4, T2;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 14]), T4);
+		    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 6]), T2);
+		    TH = VSUB(T5, T3);
+		    T6 = VADD(T3, T5);
+		    TA = VFNMS(LDK(KP500000000), T6, T1);
+	       }
+	       {
+		    V Tu, Ts, Tp, Tt, Tr;
+		    Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tq = BYTWJ(&(W[TWVL * 16]), Tp);
+		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tu = BYTWJ(&(W[TWVL * 8]), Tt);
+		    Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ts = BYTWJ(&(W[0]), Tr);
+		    TE = VSUB(Tu, Ts);
+		    Tv = VADD(Ts, Tu);
+		    TL = VFNMS(LDK(KP500000000), Tv, Tq);
+	       }
+	       {
+		    V Td, Tb, T8, Tc, Ta;
+		    T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T9 = BYTWJ(&(W[TWVL * 10]), T8);
+		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Td = BYTWJ(&(W[TWVL * 2]), Tc);
+		    Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    Tb = BYTWJ(&(W[TWVL * 18]), Ta);
+		    TI = VSUB(Td, Tb);
+		    Te = VADD(Tb, Td);
+		    TB = VFNMS(LDK(KP500000000), Te, T9);
+	       }
+	       {
+		    V Tm, Tk, Th, Tl, Tj;
+		    Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Ti = BYTWJ(&(W[TWVL * 4]), Th);
+		    Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+		    Tm = BYTWJ(&(W[TWVL * 20]), Tl);
+		    Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Tk = BYTWJ(&(W[TWVL * 12]), Tj);
+		    TD = VSUB(Tm, Tk);
+		    Tn = VADD(Tk, Tm);
+		    TK = VFNMS(LDK(KP500000000), Tn, Ti);
+	       }
+	       {
+		    V Tg, Ty, Tx, Tz;
+		    {
+			 V T7, Tf, To, Tw;
+			 T7 = VADD(T1, T6);
+			 Tf = VADD(T9, Te);
+			 Tg = VSUB(T7, Tf);
+			 Ty = VADD(T7, Tf);
+			 To = VADD(Ti, Tn);
+			 Tw = VADD(Tq, Tv);
+			 Tx = VBYI(VSUB(To, Tw));
+			 Tz = VADD(To, Tw);
+		    }
+		    ST(&(x[WS(rs, 9)]), VSUB(Tg, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VADD(Tg, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
+	       }
+	       {
+		    V TS, TW, TV, TX;
+		    {
+			 V TQ, TR, TT, TU;
+			 TQ = VADD(TA, TB);
+			 TR = VADD(TK, TL);
+			 TS = VSUB(TQ, TR);
+			 TW = VADD(TQ, TR);
+			 TT = VADD(TD, TE);
+			 TU = VADD(TH, TI);
+			 TV = VBYI(VMUL(LDK(KP866025403), VSUB(TT, TU)));
+			 TX = VBYI(VMUL(LDK(KP866025403), VADD(TU, TT)));
+		    }
+		    ST(&(x[WS(rs, 10)]), VSUB(TS, TV), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(TW, TX), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(TS, TV), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TW, TX), ms, &(x[0]));
+	       }
+	       {
+		    V TG, TP, TN, TO;
+		    {
+			 V TC, TF, TJ, TM;
+			 TC = VSUB(TA, TB);
+			 TF = VMUL(LDK(KP866025403), VSUB(TD, TE));
+			 TG = VSUB(TC, TF);
+			 TP = VADD(TC, TF);
+			 TJ = VMUL(LDK(KP866025403), VSUB(TH, TI));
+			 TM = VSUB(TK, TL);
+			 TN = VBYI(VADD(TJ, TM));
+			 TO = VBYI(VSUB(TJ, TM));
+		    }
+		    ST(&(x[WS(rs, 5)]), VSUB(TG, TN), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VSUB(TP, TO), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(TN, TG), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TO, TP), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_12) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_12, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:04 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include t1f.h */
+
+/*
+ * This function contains 92 FP additions, 77 FP multiplications,
+ * (or, 50 additions, 35 multiplications, 42 fused multiply/add),
+ * 81 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
+	       V Tq, Ty, Th, T1b, T10, Ts, TP, T7, Tu, TA, TC, Tj, Tk, TQ, Tf;
+	       {
+		    V T1, T4, T2, T9, Te;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T8, Tp, Tx, Tg;
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tb, Td, Tr, T6, Tt, Tz, TB, Ti;
+			      {
+				   V T5, T3, Ta, Tc;
+				   Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   T5 = BYTWJ(&(W[TWVL * 18]), T4);
+				   T3 = BYTWJ(&(W[TWVL * 8]), T2);
+				   T9 = BYTWJ(&(W[TWVL * 4]), T8);
+				   Tq = BYTWJ(&(W[TWVL * 10]), Tp);
+				   Ty = BYTWJ(&(W[TWVL * 16]), Tx);
+				   Th = BYTWJ(&(W[TWVL * 22]), Tg);
+				   Tb = BYTWJ(&(W[TWVL * 14]), Ta);
+				   Td = BYTWJ(&(W[TWVL * 24]), Tc);
+				   Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   T1b = VSUB(T5, T3);
+				   T6 = VADD(T3, T5);
+				   Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Te = VADD(Tb, Td);
+			      T10 = VSUB(Td, Tb);
+			      Ts = BYTWJ(&(W[TWVL * 20]), Tr);
+			      TP = VFNMS(LDK(KP500000000), T6, T1);
+			      T7 = VADD(T1, T6);
+			      Tu = BYTWJ(&(W[0]), Tt);
+			      TA = BYTWJ(&(W[TWVL * 26]), Tz);
+			      TC = BYTWJ(&(W[TWVL * 6]), TB);
+			      Tj = BYTWJ(&(W[TWVL * 2]), Ti);
+			      Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+		    TQ = VFNMS(LDK(KP500000000), Te, T9);
+		    Tf = VADD(T9, Te);
+	       }
+	       {
+		    V Tv, T13, TD, T14, Tl;
+		    Tv = VADD(Ts, Tu);
+		    T13 = VSUB(Tu, Ts);
+		    TD = VADD(TA, TC);
+		    T14 = VSUB(TC, TA);
+		    Tl = BYTWJ(&(W[TWVL * 12]), Tk);
+		    {
+			 V TT, Tw, T1d, T15, TU, TE, T11, Tm;
+			 TT = VFNMS(LDK(KP500000000), Tv, Tq);
+			 Tw = VADD(Tq, Tv);
+			 T1d = VADD(T13, T14);
+			 T15 = VSUB(T13, T14);
+			 TU = VFNMS(LDK(KP500000000), TD, Ty);
+			 TE = VADD(Ty, TD);
+			 T11 = VSUB(Tl, Tj);
+			 Tm = VADD(Tj, Tl);
+			 {
+			      V T19, TV, TK, TF, T1c, T12, TR, Tn;
+			      T19 = VSUB(TT, TU);
+			      TV = VADD(TT, TU);
+			      TK = VSUB(Tw, TE);
+			      TF = VADD(Tw, TE);
+			      T1c = VADD(T10, T11);
+			      T12 = VSUB(T10, T11);
+			      TR = VFNMS(LDK(KP500000000), Tm, Th);
+			      Tn = VADD(Th, Tm);
+			      {
+				   V T1g, T1e, T1m, T16, T18, TS, TL, To, T1f, T1u;
+				   T1g = VSUB(T1c, T1d);
+				   T1e = VADD(T1c, T1d);
+				   T1m = VFNMS(LDK(KP618033988), T12, T15);
+				   T16 = VFMA(LDK(KP618033988), T15, T12);
+				   T18 = VSUB(TQ, TR);
+				   TS = VADD(TQ, TR);
+				   TL = VSUB(Tf, Tn);
+				   To = VADD(Tf, Tn);
+				   T1f = VFNMS(LDK(KP250000000), T1e, T1b);
+				   T1u = VMUL(LDK(KP866025403), VADD(T1b, T1e));
+				   {
+					V T1o, T1a, TY, TO, TM, TG, TI, T1p, T1h, T1t, TX, TW;
+					T1o = VFNMS(LDK(KP618033988), T18, T19);
+					T1a = VFMA(LDK(KP618033988), T19, T18);
+					TW = VADD(TS, TV);
+					TY = VSUB(TS, TV);
+					TO = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TK, TL));
+					TM = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TL, TK));
+					TG = VADD(To, TF);
+					TI = VSUB(To, TF);
+					T1p = VFNMS(LDK(KP559016994), T1g, T1f);
+					T1h = VFMA(LDK(KP559016994), T1g, T1f);
+					T1t = VADD(TP, TW);
+					TX = VFNMS(LDK(KP250000000), TW, TP);
+					{
+					     V T1q, T1s, T1k, T1i, T1l, TZ, TJ, TN, TH;
+					     ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
+					     TH = VFNMS(LDK(KP250000000), TG, T7);
+					     T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1p, T1o));
+					     T1s = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1p, T1o));
+					     T1k = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1h, T1a));
+					     T1i = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1h, T1a));
+					     ST(&(x[WS(rs, 10)]), VFMAI(T1u, T1t), ms, &(x[0]));
+					     ST(&(x[WS(rs, 5)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
+					     T1l = VFNMS(LDK(KP559016994), TY, TX);
+					     TZ = VFMA(LDK(KP559016994), TY, TX);
+					     TJ = VFNMS(LDK(KP559016994), TI, TH);
+					     TN = VFMA(LDK(KP559016994), TI, TH);
+					     {
+						  V T1n, T1r, T1j, T17;
+						  T1n = VFMA(LDK(KP823639103), T1m, T1l);
+						  T1r = VFNMS(LDK(KP823639103), T1m, T1l);
+						  T1j = VFNMS(LDK(KP823639103), T16, TZ);
+						  T17 = VFMA(LDK(KP823639103), T16, TZ);
+						  ST(&(x[WS(rs, 12)]), VFMAI(TM, TJ), ms, &(x[0]));
+						  ST(&(x[WS(rs, 3)]), VFNMSI(TM, TJ), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 9)]), VFMAI(TO, TN), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 6)]), VFNMSI(TO, TN), ms, &(x[0]));
+						  ST(&(x[WS(rs, 2)]), VFMAI(T1q, T1n), ms, &(x[0]));
+						  ST(&(x[WS(rs, 13)]), VFNMSI(T1q, T1n), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 7)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 8)]), VFNMSI(T1s, T1r), ms, &(x[0]));
+						  ST(&(x[WS(rs, 4)]), VFMAI(T1k, T1j), ms, &(x[0]));
+						  ST(&(x[WS(rs, 11)]), VFNMSI(T1k, T1j), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 14)]), VFMAI(T1i, T17), ms, &(x[0]));
+						  ST(&(x[WS(rs, 1)]), VFNMSI(T1i, T17), ms, &(x[WS(rs, 1)]));
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {50, 35, 42, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_15) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_15, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include t1f.h */
+
+/*
+ * This function contains 92 FP additions, 53 FP multiplications,
+ * (or, 78 additions, 39 multiplications, 14 fused multiply/add),
+ * 52 stack variables, 10 constants, and 30 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
+     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
+     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
+	       V T1e, T7, TP, T12, T15, Tf, Tn, To, T1b, T1c, T1f, TQ, TR, TS, Tw;
+	       V TE, TF, TT, TU, TV;
+	       {
+		    V T1, T5, T3, T4, T2, T6;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 18]), T4);
+		    T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTWJ(&(W[TWVL * 8]), T2);
+		    T1e = VSUB(T5, T3);
+		    T6 = VADD(T3, T5);
+		    T7 = VADD(T1, T6);
+		    TP = VFNMS(LDK(KP500000000), T6, T1);
+	       }
+	       {
+		    V T9, Tq, Ty, Th, Te, T13, Tv, T10, TD, T11, Tm, T14;
+		    {
+			 V T8, Tp, Tx, Tg;
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tq = BYTWJ(&(W[TWVL * 10]), Tp);
+			 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Ty = BYTWJ(&(W[TWVL * 16]), Tx);
+			 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Th = BYTWJ(&(W[TWVL * 22]), Tg);
+		    }
+		    {
+			 V Tb, Td, Ta, Tc;
+			 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 14]), Ta);
+			 Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTWJ(&(W[TWVL * 24]), Tc);
+			 Te = VADD(Tb, Td);
+			 T13 = VSUB(Td, Tb);
+		    }
+		    {
+			 V Ts, Tu, Tr, Tt;
+			 Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Ts = BYTWJ(&(W[TWVL * 20]), Tr);
+			 Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tu = BYTWJ(&(W[0]), Tt);
+			 Tv = VADD(Ts, Tu);
+			 T10 = VSUB(Tu, Ts);
+		    }
+		    {
+			 V TA, TC, Tz, TB;
+			 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TA = BYTWJ(&(W[TWVL * 26]), Tz);
+			 TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 TC = BYTWJ(&(W[TWVL * 6]), TB);
+			 TD = VADD(TA, TC);
+			 T11 = VSUB(TC, TA);
+		    }
+		    {
+			 V Tj, Tl, Ti, Tk;
+			 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tj = BYTWJ(&(W[TWVL * 2]), Ti);
+			 Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Tl = BYTWJ(&(W[TWVL * 12]), Tk);
+			 Tm = VADD(Tj, Tl);
+			 T14 = VSUB(Tl, Tj);
+		    }
+		    T12 = VSUB(T10, T11);
+		    T15 = VSUB(T13, T14);
+		    Tf = VADD(T9, Te);
+		    Tn = VADD(Th, Tm);
+		    To = VADD(Tf, Tn);
+		    T1b = VADD(T13, T14);
+		    T1c = VADD(T10, T11);
+		    T1f = VADD(T1b, T1c);
+		    TQ = VFNMS(LDK(KP500000000), Te, T9);
+		    TR = VFNMS(LDK(KP500000000), Tm, Th);
+		    TS = VADD(TQ, TR);
+		    Tw = VADD(Tq, Tv);
+		    TE = VADD(Ty, TD);
+		    TF = VADD(Tw, TE);
+		    TT = VFNMS(LDK(KP500000000), Tv, Tq);
+		    TU = VFNMS(LDK(KP500000000), TD, Ty);
+		    TV = VADD(TT, TU);
+	       }
+	       {
+		    V TI, TG, TH, TM, TO, TK, TL, TN, TJ;
+		    TI = VMUL(LDK(KP559016994), VSUB(To, TF));
+		    TG = VADD(To, TF);
+		    TH = VFNMS(LDK(KP250000000), TG, T7);
+		    TK = VSUB(Tw, TE);
+		    TL = VSUB(Tf, Tn);
+		    TM = VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TK)));
+		    TO = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TK)));
+		    ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
+		    TN = VADD(TI, TH);
+		    ST(&(x[WS(rs, 6)]), VSUB(TN, TO), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VADD(TO, TN), ms, &(x[WS(rs, 1)]));
+		    TJ = VSUB(TH, TI);
+		    ST(&(x[WS(rs, 3)]), VSUB(TJ, TM), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 12)]), VADD(TM, TJ), ms, &(x[0]));
+	       }
+	       {
+		    V T16, T1m, T1u, T1h, T1o, T1a, T1p, TZ, T1t, T1l, T1d, T1g;
+		    T16 = VFNMS(LDK(KP509036960), T15, VMUL(LDK(KP823639103), T12));
+		    T1m = VFMA(LDK(KP823639103), T15, VMUL(LDK(KP509036960), T12));
+		    T1u = VBYI(VMUL(LDK(KP866025403), VADD(T1e, T1f)));
+		    T1d = VMUL(LDK(KP484122918), VSUB(T1b, T1c));
+		    T1g = VFNMS(LDK(KP216506350), T1f, VMUL(LDK(KP866025403), T1e));
+		    T1h = VSUB(T1d, T1g);
+		    T1o = VADD(T1d, T1g);
+		    {
+			 V T18, T19, TY, TW, TX;
+			 T18 = VSUB(TT, TU);
+			 T19 = VSUB(TQ, TR);
+			 T1a = VFNMS(LDK(KP587785252), T19, VMUL(LDK(KP951056516), T18));
+			 T1p = VFMA(LDK(KP951056516), T19, VMUL(LDK(KP587785252), T18));
+			 TY = VMUL(LDK(KP559016994), VSUB(TS, TV));
+			 TW = VADD(TS, TV);
+			 TX = VFNMS(LDK(KP250000000), TW, TP);
+			 TZ = VSUB(TX, TY);
+			 T1t = VADD(TP, TW);
+			 T1l = VADD(TY, TX);
+		    }
+		    {
+			 V T17, T1i, T1r, T1s;
+			 ST(&(x[WS(rs, 5)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 10)]), VADD(T1t, T1u), ms, &(x[0]));
+			 T17 = VSUB(TZ, T16);
+			 T1i = VBYI(VSUB(T1a, T1h));
+			 ST(&(x[WS(rs, 8)]), VSUB(T17, T1i), ms, &(x[0]));
+			 ST(&(x[WS(rs, 7)]), VADD(T17, T1i), ms, &(x[WS(rs, 1)]));
+			 T1r = VSUB(T1l, T1m);
+			 T1s = VBYI(VADD(T1p, T1o));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(T1r, T1s), ms, &(x[0]));
+			 {
+			      V T1n, T1q, T1j, T1k;
+			      T1n = VADD(T1l, T1m);
+			      T1q = VBYI(VSUB(T1o, T1p));
+			      ST(&(x[WS(rs, 14)]), VSUB(T1n, T1q), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VADD(T1n, T1q), ms, &(x[WS(rs, 1)]));
+			      T1j = VADD(TZ, T16);
+			      T1k = VBYI(VADD(T1a, T1h));
+			      ST(&(x[WS(rs, 13)]), VSUB(T1j, T1k), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 2)]), VADD(T1j, T1k), ms, &(x[0]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {78, 39, 14, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_15) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_15, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:05 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1fv_16 -include t1f.h */
+
+/*
+ * This function contains 87 FP additions, 64 FP multiplications,
+ * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
+ * 61 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
+	       V Tv;
+	       {
+		    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
+		    V Tp;
+		    {
+			 V T1, T2, T5, T7;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tz, TG, TB, TE;
+			      Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TG = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TE = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      {
+				   V Ti, TY, TX, Td, Tg, Tm, Tt, To;
+				   {
+					V T3, T6, T8, TA, TH, TC, TF, Tb;
+					Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3 = BYTWJ(&(W[TWVL * 14]), T2);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					T8 = BYTWJ(&(W[TWVL * 22]), T7);
+					TA = BYTWJ(&(W[TWVL * 26]), Tz);
+					TH = BYTWJ(&(W[TWVL * 18]), TG);
+					TC = BYTWJ(&(W[TWVL * 10]), TB);
+					TF = BYTWJ(&(W[TWVL * 2]), TE);
+					Tc = BYTWJ(&(W[0]), Tb);
+					TW = VSUB(T1, T3);
+					T4 = VADD(T1, T3);
+					T19 = VSUB(T6, T8);
+					T9 = VADD(T6, T8);
+					Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					TD = VADD(TA, TC);
+					TY = VSUB(TA, TC);
+					TI = VADD(TF, TH);
+					TX = VSUB(TF, TH);
+				   }
+				   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+				   Tj = BYTWJ(&(W[TWVL * 24]), Ti);
+				   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TZ = VADD(TX, TY);
+				   T1a = VSUB(TY, TX);
+				   Te = BYTWJ(&(W[TWVL * 16]), Td);
+				   Th = BYTWJ(&(W[TWVL * 8]), Tg);
+				   Tn = BYTWJ(&(W[TWVL * 28]), Tm);
+				   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   Tu = BYTWJ(&(W[TWVL * 20]), Tt);
+				   Tp = BYTWJ(&(W[TWVL * 12]), To);
+			      }
+			 }
+		    }
+		    {
+			 V Tf, T11, Tk, T12, Ts;
+			 TO = VADD(T4, T9);
+			 Ta = VSUB(T4, T9);
+			 TJ = VSUB(TD, TI);
+			 TP = VADD(TI, TD);
+			 Tf = VADD(Tc, Te);
+			 T11 = VSUB(Tc, Te);
+			 Tk = VADD(Th, Tj);
+			 T12 = VSUB(Th, Tj);
+			 Ts = BYTWJ(&(W[TWVL * 4]), Tr);
+			 T14 = VSUB(Tn, Tp);
+			 Tq = VADD(Tn, Tp);
+			 T1i = VFNMS(LDK(KP707106781), TZ, TW);
+			 T10 = VFMA(LDK(KP707106781), TZ, TW);
+			 T1b = VFNMS(LDK(KP707106781), T1a, T19);
+			 T1l = VFMA(LDK(KP707106781), T1a, T19);
+			 T13 = VFNMS(LDK(KP414213562), T12, T11);
+			 T1c = VFMA(LDK(KP414213562), T11, T12);
+			 TR = VADD(Tf, Tk);
+			 Tl = VSUB(Tf, Tk);
+			 T15 = VSUB(Tu, Ts);
+			 Tv = VADD(Ts, Tu);
+		    }
+	       }
+	       {
+		    V T1d, T16, TS, Tw, TU, TQ;
+		    T1d = VFMA(LDK(KP414213562), T14, T15);
+		    T16 = VFNMS(LDK(KP414213562), T15, T14);
+		    TS = VADD(Tq, Tv);
+		    Tw = VSUB(Tq, Tv);
+		    TU = VSUB(TO, TP);
+		    TQ = VADD(TO, TP);
+		    {
+			 V T1e, T1j, T17, T1m;
+			 T1e = VSUB(T1c, T1d);
+			 T1j = VADD(T1c, T1d);
+			 T17 = VADD(T13, T16);
+			 T1m = VSUB(T16, T13);
+			 {
+			      V TV, TT, TK, Tx;
+			      TV = VSUB(TS, TR);
+			      TT = VADD(TR, TS);
+			      TK = VSUB(Tw, Tl);
+			      Tx = VADD(Tl, Tw);
+			      {
+				   V T1h, T1f, T1o, T1k;
+				   T1h = VFMA(LDK(KP923879532), T1e, T1b);
+				   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
+				   T1o = VFMA(LDK(KP923879532), T1j, T1i);
+				   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
+				   {
+					V T1g, T18, T1p, T1n;
+					T1g = VFMA(LDK(KP923879532), T17, T10);
+					T18 = VFNMS(LDK(KP923879532), T17, T10);
+					T1p = VFMA(LDK(KP923879532), T1m, T1l);
+					T1n = VFNMS(LDK(KP923879532), T1m, T1l);
+					ST(&(x[WS(rs, 12)]), VFNMSI(TV, TU), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TV, TU), ms, &(x[0]));
+					ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 8)]), VSUB(TQ, TT), ms, &(x[0]));
+					{
+					     V TN, TL, TM, Ty;
+					     TN = VFMA(LDK(KP707106781), TK, TJ);
+					     TL = VFNMS(LDK(KP707106781), TK, TJ);
+					     TM = VFMA(LDK(KP707106781), Tx, Ta);
+					     Ty = VFNMS(LDK(KP707106781), Tx, Ta);
+					     ST(&(x[WS(rs, 1)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 15)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1fv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1fv_16 -include t1f.h */
+
+/*
+ * This function contains 87 FP additions, 42 FP multiplications,
+ * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
+ * 36 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TJ, T10, TD, T11, T1b, T1c, Ty, TK, T16, T17, T18, Tb, TN, T13, T14;
+	       V T15, Tm, TM, TG, TI, TH;
+	       TG = LD(&(x[0]), ms, &(x[0]));
+	       TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+	       TI = BYTWJ(&(W[TWVL * 14]), TH);
+	       TJ = VSUB(TG, TI);
+	       T10 = VADD(TG, TI);
+	       {
+		    V TA, TC, Tz, TB;
+		    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TA = BYTWJ(&(W[TWVL * 6]), Tz);
+		    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+		    TC = BYTWJ(&(W[TWVL * 22]), TB);
+		    TD = VSUB(TA, TC);
+		    T11 = VADD(TA, TC);
+	       }
+	       {
+		    V Tp, Tw, Tr, Tu, Ts, Tx;
+		    {
+			 V To, Tv, Tq, Tt;
+			 To = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 26]), To);
+			 Tv = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tw = BYTWJ(&(W[TWVL * 18]), Tv);
+			 Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tr = BYTWJ(&(W[TWVL * 10]), Tq);
+			 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tu = BYTWJ(&(W[TWVL * 2]), Tt);
+		    }
+		    T1b = VADD(Tp, Tr);
+		    T1c = VADD(Tu, Tw);
+		    Ts = VSUB(Tp, Tr);
+		    Tx = VSUB(Tu, Tw);
+		    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
+		    TK = VMUL(LDK(KP707106781), VADD(Tx, Ts));
+	       }
+	       {
+		    V T2, T9, T4, T7, T5, Ta;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[TWVL * 28]), T1);
+			 T8 = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 20]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTWJ(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTWJ(&(W[TWVL * 4]), T6);
+		    }
+		    T16 = VADD(T2, T4);
+		    T17 = VADD(T7, T9);
+		    T18 = VSUB(T16, T17);
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VFNMS(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), T5));
+		    TN = VFMA(LDK(KP923879532), T5, VMUL(LDK(KP382683432), Ta));
+	       }
+	       {
+		    V Td, Tk, Tf, Ti, Tg, Tl;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTWJ(&(W[0]), Tc);
+			 Tj = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTWJ(&(W[TWVL * 24]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTWJ(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Ti = BYTWJ(&(W[TWVL * 8]), Th);
+		    }
+		    T13 = VADD(Td, Tf);
+		    T14 = VADD(Ti, Tk);
+		    T15 = VSUB(T13, T14);
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VFMA(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
+		    TM = VFNMS(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tg));
+	       }
+	       {
+		    V T1a, T1g, T1f, T1h;
+		    {
+			 V T12, T19, T1d, T1e;
+			 T12 = VSUB(T10, T11);
+			 T19 = VMUL(LDK(KP707106781), VADD(T15, T18));
+			 T1a = VADD(T12, T19);
+			 T1g = VSUB(T12, T19);
+			 T1d = VSUB(T1b, T1c);
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T15));
+			 T1f = VBYI(VADD(T1d, T1e));
+			 T1h = VBYI(VSUB(T1e, T1d));
+		    }
+		    ST(&(x[WS(rs, 14)]), VSUB(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VADD(T1g, T1h), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 10)]), VSUB(T1g, T1h), ms, &(x[0]));
+	       }
+	       {
+		    V T1k, T1o, T1n, T1p;
+		    {
+			 V T1i, T1j, T1l, T1m;
+			 T1i = VADD(T10, T11);
+			 T1j = VADD(T1c, T1b);
+			 T1k = VADD(T1i, T1j);
+			 T1o = VSUB(T1i, T1j);
+			 T1l = VADD(T13, T14);
+			 T1m = VADD(T16, T17);
+			 T1n = VADD(T1l, T1m);
+			 T1p = VBYI(VSUB(T1m, T1l));
+		    }
+		    ST(&(x[WS(rs, 8)]), VSUB(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(T1o, T1p), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VSUB(T1o, T1p), ms, &(x[0]));
+	       }
+	       {
+		    V TF, TQ, TP, TR;
+		    {
+			 V Tn, TE, TL, TO;
+			 Tn = VSUB(Tb, Tm);
+			 TE = VSUB(Ty, TD);
+			 TF = VBYI(VSUB(Tn, TE));
+			 TQ = VBYI(VADD(TE, Tn));
+			 TL = VADD(TJ, TK);
+			 TO = VADD(TM, TN);
+			 TP = VSUB(TL, TO);
+			 TR = VADD(TL, TO);
+		    }
+		    ST(&(x[WS(rs, 7)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 15)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TU, TY, TX, TZ;
+		    {
+			 V TS, TT, TV, TW;
+			 TS = VSUB(TJ, TK);
+			 TT = VADD(Tm, Tb);
+			 TU = VADD(TS, TT);
+			 TY = VSUB(TS, TT);
+			 TV = VADD(TD, Ty);
+			 TW = VSUB(TN, TM);
+			 TX = VBYI(VADD(TV, TW));
+			 TZ = VBYI(VSUB(TW, TV));
+		    }
+		    ST(&(x[WS(rs, 13)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1fv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1fv_2 -include t1f.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1fv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1fv_2 -include t1f.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1fv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:32 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1fv_20 -include t1f.h */
+
+/*
+ * This function contains 123 FP additions, 88 FP multiplications,
+ * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
+ * 68 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, Tx, T1m, T1K, T1y, Tk, Tf, T16, T10, TT, T1O, T1w, T1L, T1p, T1M;
+	       V T1s, TZ, TI, T1x, Tp;
+	       {
+		    V T1, Tv, T2, Tt;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T9, T1n, TN, T1v, TS, Te, T1q, T1u, TE, TG, Tm, T1o, TC, Tn, T1r;
+			 V TH, To;
+			 {
+			      V TP, TR, Ta, Tc;
+			      {
+				   V T5, T7, TJ, TL, T1k, T1l;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   {
+					V Tw, T3, Tu, T6, T8, TK, TM, TO, TQ;
+					TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					Tw = BYTWJ(&(W[TWVL * 28]), Tv);
+					T3 = BYTWJ(&(W[TWVL * 18]), T2);
+					Tu = BYTWJ(&(W[TWVL * 8]), Tt);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					T8 = BYTWJ(&(W[TWVL * 26]), T7);
+					TK = BYTWJ(&(W[TWVL * 24]), TJ);
+					TM = BYTWJ(&(W[TWVL * 4]), TL);
+					TP = BYTWJ(&(W[TWVL * 32]), TO);
+					TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T4 = VSUB(T1, T3);
+					T1k = VADD(T1, T3);
+					Tx = VSUB(Tu, Tw);
+					T1l = VADD(Tu, Tw);
+					T9 = VSUB(T6, T8);
+					T1n = VADD(T6, T8);
+					TN = VSUB(TK, TM);
+					T1v = VADD(TK, TM);
+					TR = BYTWJ(&(W[TWVL * 12]), TQ);
+				   }
+				   Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1m = VSUB(T1k, T1l);
+				   T1K = VADD(T1k, T1l);
+				   Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      }
+			      {
+				   V Tb, TA, Td, Th, Tj, Tz, Tg, Ti, Ty;
+				   Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+				   Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   TS = VSUB(TP, TR);
+				   T1y = VADD(TP, TR);
+				   Tb = BYTWJ(&(W[TWVL * 30]), Ta);
+				   TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   Td = BYTWJ(&(W[TWVL * 10]), Tc);
+				   Th = BYTWJ(&(W[TWVL * 14]), Tg);
+				   Tj = BYTWJ(&(W[TWVL * 34]), Ti);
+				   Tz = BYTWJ(&(W[TWVL * 16]), Ty);
+				   {
+					V TD, TF, TB, Tl;
+					TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					TB = BYTWJ(&(W[TWVL * 36]), TA);
+					Te = VSUB(Tb, Td);
+					T1q = VADD(Tb, Td);
+					Tk = VSUB(Th, Tj);
+					T1u = VADD(Th, Tj);
+					TE = BYTWJ(&(W[0]), TD);
+					TG = BYTWJ(&(W[TWVL * 20]), TF);
+					Tm = BYTWJ(&(W[TWVL * 22]), Tl);
+					T1o = VADD(Tz, TB);
+					TC = VSUB(Tz, TB);
+					Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 Tf = VADD(T9, Te);
+			 T16 = VSUB(T9, Te);
+			 T10 = VSUB(TS, TN);
+			 TT = VADD(TN, TS);
+			 T1r = VADD(TE, TG);
+			 TH = VSUB(TE, TG);
+			 T1O = VADD(T1u, T1v);
+			 T1w = VSUB(T1u, T1v);
+			 To = BYTWJ(&(W[TWVL * 2]), Tn);
+			 T1L = VADD(T1n, T1o);
+			 T1p = VSUB(T1n, T1o);
+			 T1M = VADD(T1q, T1r);
+			 T1s = VSUB(T1q, T1r);
+			 TZ = VSUB(TH, TC);
+			 TI = VADD(TC, TH);
+			 T1x = VADD(Tm, To);
+			 Tp = VSUB(Tm, To);
+		    }
+	       }
+	       {
+		    V T1V, T1N, T14, T1d, T11, T1G, T1t, T1z, T1P, Tq, T17, T13, TV, TU;
+		    T1V = VSUB(T1L, T1M);
+		    T1N = VADD(T1L, T1M);
+		    T14 = VSUB(TT, TI);
+		    TU = VADD(TI, TT);
+		    T1d = VFNMS(LDK(KP618033988), TZ, T10);
+		    T11 = VFMA(LDK(KP618033988), T10, TZ);
+		    T1G = VSUB(T1p, T1s);
+		    T1t = VADD(T1p, T1s);
+		    T1z = VSUB(T1x, T1y);
+		    T1P = VADD(T1x, T1y);
+		    Tq = VADD(Tk, Tp);
+		    T17 = VSUB(Tk, Tp);
+		    T13 = VFNMS(LDK(KP250000000), TU, Tx);
+		    TV = VADD(Tx, TU);
+		    {
+			 V T1J, T1H, T1D, T1Z, T1X, T1T, T1h, T1j, T1b, T19, T1C, T1S, T1c, TY, T1F;
+			 V T1A;
+			 T1F = VSUB(T1w, T1z);
+			 T1A = VADD(T1w, T1z);
+			 {
+			      V T1W, T1Q, TX, Tr;
+			      T1W = VSUB(T1O, T1P);
+			      T1Q = VADD(T1O, T1P);
+			      TX = VSUB(Tf, Tq);
+			      Tr = VADD(Tf, Tq);
+			      {
+				   V T1g, T18, T1f, T15;
+				   T1g = VFNMS(LDK(KP618033988), T16, T17);
+				   T18 = VFMA(LDK(KP618033988), T17, T16);
+				   T1f = VFMA(LDK(KP559016994), T14, T13);
+				   T15 = VFNMS(LDK(KP559016994), T14, T13);
+				   T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
+				   T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
+				   {
+					V T1B, T1R, TW, Ts;
+					T1B = VADD(T1t, T1A);
+					T1D = VSUB(T1t, T1A);
+					T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
+					T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
+					T1R = VADD(T1N, T1Q);
+					T1T = VSUB(T1N, T1Q);
+					TW = VFNMS(LDK(KP250000000), Tr, T4);
+					Ts = VADD(T4, Tr);
+					T1h = VFNMS(LDK(KP951056516), T1g, T1f);
+					T1j = VFMA(LDK(KP951056516), T1g, T1f);
+					T1b = VFNMS(LDK(KP951056516), T18, T15);
+					T19 = VFMA(LDK(KP951056516), T18, T15);
+					ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
+					T1C = VFNMS(LDK(KP250000000), T1B, T1m);
+					ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
+					T1S = VFNMS(LDK(KP250000000), T1R, T1K);
+					T1c = VFNMS(LDK(KP559016994), TX, TW);
+					TY = VFMA(LDK(KP559016994), TX, TW);
+					ST(&(x[WS(rs, 15)]), VFMAI(TV, Ts), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 5)]), VFNMSI(TV, Ts), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+			 {
+			      V T1E, T1I, T1U, T1Y;
+			      T1E = VFNMS(LDK(KP559016994), T1D, T1C);
+			      T1I = VFMA(LDK(KP559016994), T1D, T1C);
+			      T1U = VFMA(LDK(KP559016994), T1T, T1S);
+			      T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
+			      {
+				   V T1e, T1i, T1a, T12;
+				   T1e = VFNMS(LDK(KP951056516), T1d, T1c);
+				   T1i = VFMA(LDK(KP951056516), T1d, T1c);
+				   T1a = VFNMS(LDK(KP951056516), T11, TY);
+				   T12 = VFMA(LDK(KP951056516), T11, TY);
+				   ST(&(x[WS(rs, 18)]), VFNMSI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 14)]), VFMAI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 16)]), VFNMSI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFMAI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 12)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 8)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(T1h, T1e), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 17)]), VFNMSI(T1h, T1e), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 13)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 11)]), VFMAI(T1b, T1a), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(T1b, T1a), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 19)]), VFMAI(T19, T12), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(T19, T12), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t1fv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1fv_20 -include t1f.h */
+
+/*
+ * This function contains 123 FP additions, 62 FP multiplications,
+ * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
+ * 54 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, Tx, T1B, T1U, TZ, T16, T17, T10, Tf, Tq, Tr, T1N, T1O, T1S, T1t;
+	       V T1w, T1C, TI, TT, TU, T1K, T1L, T1R, T1m, T1p, T1D, Ts, TV;
+	       {
+		    V T1, Tw, T3, Tu, Tv, T2, Tt, T1z, T1A;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    Tw = BYTWJ(&(W[TWVL * 28]), Tv);
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 18]), T2);
+		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tu = BYTWJ(&(W[TWVL * 8]), Tt);
+		    T4 = VSUB(T1, T3);
+		    Tx = VSUB(Tu, Tw);
+		    T1z = VADD(T1, T3);
+		    T1A = VADD(Tu, Tw);
+		    T1B = VSUB(T1z, T1A);
+		    T1U = VADD(T1z, T1A);
+	       }
+	       {
+		    V T9, T1r, TN, T1l, TS, T1o, Te, T1u, Tk, T1k, TC, T1s, TH, T1v, Tp;
+		    V T1n;
+		    {
+			 V T6, T8, T5, T7;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 6]), T5);
+			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 26]), T7);
+			 T9 = VSUB(T6, T8);
+			 T1r = VADD(T6, T8);
+		    }
+		    {
+			 V TK, TM, TJ, TL;
+			 TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 TK = BYTWJ(&(W[TWVL * 24]), TJ);
+			 TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 TM = BYTWJ(&(W[TWVL * 4]), TL);
+			 TN = VSUB(TK, TM);
+			 T1l = VADD(TK, TM);
+		    }
+		    {
+			 V TP, TR, TO, TQ;
+			 TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TP = BYTWJ(&(W[TWVL * 32]), TO);
+			 TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTWJ(&(W[TWVL * 12]), TQ);
+			 TS = VSUB(TP, TR);
+			 T1o = VADD(TP, TR);
+		    }
+		    {
+			 V Tb, Td, Ta, Tc;
+			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 30]), Ta);
+			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 10]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T1u = VADD(Tb, Td);
+		    }
+		    {
+			 V Th, Tj, Tg, Ti;
+			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Th = BYTWJ(&(W[TWVL * 14]), Tg);
+			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tj = BYTWJ(&(W[TWVL * 34]), Ti);
+			 Tk = VSUB(Th, Tj);
+			 T1k = VADD(Th, Tj);
+		    }
+		    {
+			 V Tz, TB, Ty, TA;
+			 Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tz = BYTWJ(&(W[TWVL * 16]), Ty);
+			 TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTWJ(&(W[TWVL * 36]), TA);
+			 TC = VSUB(Tz, TB);
+			 T1s = VADD(Tz, TB);
+		    }
+		    {
+			 V TE, TG, TD, TF;
+			 TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TE = BYTWJ(&(W[0]), TD);
+			 TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 TG = BYTWJ(&(W[TWVL * 20]), TF);
+			 TH = VSUB(TE, TG);
+			 T1v = VADD(TE, TG);
+		    }
+		    {
+			 V Tm, To, Tl, Tn;
+			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tm = BYTWJ(&(W[TWVL * 22]), Tl);
+			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 To = BYTWJ(&(W[TWVL * 2]), Tn);
+			 Tp = VSUB(Tm, To);
+			 T1n = VADD(Tm, To);
+		    }
+		    TZ = VSUB(TH, TC);
+		    T16 = VSUB(T9, Te);
+		    T17 = VSUB(Tk, Tp);
+		    T10 = VSUB(TS, TN);
+		    Tf = VADD(T9, Te);
+		    Tq = VADD(Tk, Tp);
+		    Tr = VADD(Tf, Tq);
+		    T1N = VADD(T1k, T1l);
+		    T1O = VADD(T1n, T1o);
+		    T1S = VADD(T1N, T1O);
+		    T1t = VSUB(T1r, T1s);
+		    T1w = VSUB(T1u, T1v);
+		    T1C = VADD(T1t, T1w);
+		    TI = VADD(TC, TH);
+		    TT = VADD(TN, TS);
+		    TU = VADD(TI, TT);
+		    T1K = VADD(T1r, T1s);
+		    T1L = VADD(T1u, T1v);
+		    T1R = VADD(T1K, T1L);
+		    T1m = VSUB(T1k, T1l);
+		    T1p = VSUB(T1n, T1o);
+		    T1D = VADD(T1m, T1p);
+	       }
+	       Ts = VADD(T4, Tr);
+	       TV = VBYI(VADD(Tx, TU));
+	       ST(&(x[WS(rs, 5)]), VSUB(Ts, TV), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 15)]), VADD(Ts, TV), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T1T, T1V, T1W, T1Q, T1Z, T1M, T1P, T1Y, T1X;
+		    T1T = VMUL(LDK(KP559016994), VSUB(T1R, T1S));
+		    T1V = VADD(T1R, T1S);
+		    T1W = VFNMS(LDK(KP250000000), T1V, T1U);
+		    T1M = VSUB(T1K, T1L);
+		    T1P = VSUB(T1N, T1O);
+		    T1Q = VBYI(VFMA(LDK(KP951056516), T1M, VMUL(LDK(KP587785252), T1P)));
+		    T1Z = VBYI(VFNMS(LDK(KP587785252), T1M, VMUL(LDK(KP951056516), T1P)));
+		    ST(&(x[0]), VADD(T1U, T1V), ms, &(x[0]));
+		    T1Y = VSUB(T1W, T1T);
+		    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
+		    T1X = VADD(T1T, T1W);
+		    ST(&(x[WS(rs, 4)]), VADD(T1Q, T1X), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T1X, T1Q), ms, &(x[0]));
+	       }
+	       {
+		    V T1G, T1E, T1F, T1y, T1J, T1q, T1x, T1I, T1H;
+		    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
+		    T1E = VADD(T1C, T1D);
+		    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
+		    T1q = VSUB(T1m, T1p);
+		    T1x = VSUB(T1t, T1w);
+		    T1y = VBYI(VFNMS(LDK(KP587785252), T1x, VMUL(LDK(KP951056516), T1q)));
+		    T1J = VBYI(VFMA(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
+		    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
+		    T1I = VADD(T1G, T1F);
+		    ST(&(x[WS(rs, 6)]), VSUB(T1I, T1J), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VADD(T1J, T1I), ms, &(x[0]));
+		    T1H = VSUB(T1F, T1G);
+		    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
+		    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
+	       }
+	       {
+		    V T11, T18, T1g, T1d, T15, T1f, TY, T1c;
+		    T11 = VFMA(LDK(KP951056516), TZ, VMUL(LDK(KP587785252), T10));
+		    T18 = VFMA(LDK(KP951056516), T16, VMUL(LDK(KP587785252), T17));
+		    T1g = VFNMS(LDK(KP587785252), T16, VMUL(LDK(KP951056516), T17));
+		    T1d = VFNMS(LDK(KP587785252), TZ, VMUL(LDK(KP951056516), T10));
+		    {
+			 V T13, T14, TW, TX;
+			 T13 = VFMS(LDK(KP250000000), TU, Tx);
+			 T14 = VMUL(LDK(KP559016994), VSUB(TT, TI));
+			 T15 = VADD(T13, T14);
+			 T1f = VSUB(T14, T13);
+			 TW = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
+			 TX = VFNMS(LDK(KP250000000), Tr, T4);
+			 TY = VADD(TW, TX);
+			 T1c = VSUB(TX, TW);
+		    }
+		    {
+			 V T12, T19, T1i, T1j;
+			 T12 = VADD(TY, T11);
+			 T19 = VBYI(VSUB(T15, T18));
+			 ST(&(x[WS(rs, 19)]), VSUB(T12, T19), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T12, T19), ms, &(x[WS(rs, 1)]));
+			 T1i = VADD(T1c, T1d);
+			 T1j = VBYI(VADD(T1g, T1f));
+			 ST(&(x[WS(rs, 13)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1a, T1b, T1e, T1h;
+			 T1a = VSUB(TY, T11);
+			 T1b = VBYI(VADD(T18, T15));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1a, T1b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T1a, T1b), ms, &(x[WS(rs, 1)]));
+			 T1e = VSUB(T1c, T1d);
+			 T1h = VBYI(VSUB(T1f, T1g));
+			 ST(&(x[WS(rs, 17)]), VSUB(T1e, T1h), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T1e, T1h), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t1fv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,932 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:33 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t1fv_25 -include t1f.h */
+
+/*
+ * This function contains 248 FP additions, 241 FP multiplications,
+ * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
+ * 208 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T25, T1B, T2y, T1K, T2s, T23, T1S, T26, T20, T1X;
+	       {
+		    V T1O, T2X, Te, T3L, Td, T3Q, T3j, T3b, T2R, T2M, T2f, T27, T1y, T1H, T3M;
+		    V TW, TR, TK, T2B, T3n, T3e, T2U, T2F, T2i, T2a, Tz, T1C, T3N, TQ, T11;
+		    V T1b, T1c, T16;
+		    {
+			 V T1, T1g, T1i, T1p, T1k, T1m, Tb, T1N, T6, T1M;
+			 {
+			      V T7, T9, T2, T4, T1f, T1h, T1o;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      {
+				   V T8, Ta, T3, T5, T1j;
+				   T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+				   T8 = BYTWJ(&(W[TWVL * 18]), T7);
+				   Ta = BYTWJ(&(W[TWVL * 28]), T9);
+				   T3 = BYTWJ(&(W[TWVL * 8]), T2);
+				   T5 = BYTWJ(&(W[TWVL * 38]), T4);
+				   T1g = BYTWJ(&(W[TWVL * 4]), T1f);
+				   T1i = BYTWJ(&(W[TWVL * 14]), T1h);
+				   T1p = BYTWJ(&(W[TWVL * 34]), T1o);
+				   T1k = BYTWJ(&(W[TWVL * 44]), T1j);
+				   T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   Tb = VADD(T8, Ta);
+				   T1N = VSUB(T8, Ta);
+				   T6 = VADD(T3, T5);
+				   T1M = VSUB(T3, T5);
+			      }
+			 }
+			 {
+			      V T1v, T1l, Th, Tj, T1w, T1q, Tq, Tk, Tn, Tg;
+			      Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V Tc, Ti, T1n, Tp;
+				   Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1v = VSUB(T1i, T1k);
+				   T1l = VADD(T1i, T1k);
+				   T1n = BYTWJ(&(W[TWVL * 24]), T1m);
+				   Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1O = VFMA(LDK(KP618033988), T1N, T1M);
+				   T2X = VFNMS(LDK(KP618033988), T1M, T1N);
+				   Te = VSUB(T6, Tb);
+				   Tc = VADD(T6, Tb);
+				   Th = BYTWJ(&(W[0]), Tg);
+				   Tj = BYTWJ(&(W[TWVL * 10]), Ti);
+				   T1w = VSUB(T1n, T1p);
+				   T1q = VADD(T1n, T1p);
+				   Tq = BYTWJ(&(W[TWVL * 30]), Tp);
+				   Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T3L = VADD(T1, Tc);
+				   Td = VFNMS(LDK(KP250000000), Tc, T1);
+				   Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      {
+				   V T1x, T2K, TM, TB, Tw, Tm, Tx, Tr, TI, T2L, T1u, TD, TF, TL;
+				   TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   {
+					V T1t, Tl, To, TH, T1s, T1r, TA, TC;
+					TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					T1r = VADD(T1l, T1q);
+					T1t = VSUB(T1q, T1l);
+					T1x = VFMA(LDK(KP618033988), T1w, T1v);
+					T2K = VFNMS(LDK(KP618033988), T1v, T1w);
+					Tl = BYTWJ(&(W[TWVL * 40]), Tk);
+					To = BYTWJ(&(W[TWVL * 20]), Tn);
+					TM = BYTWJ(&(W[TWVL * 6]), TL);
+					TB = BYTWJ(&(W[TWVL * 46]), TA);
+					TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T1s = VFNMS(LDK(KP250000000), T1r, T1g);
+					T3Q = VADD(T1g, T1r);
+					TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					Tw = VSUB(Tj, Tl);
+					Tm = VADD(Tj, Tl);
+					Tx = VSUB(Tq, To);
+					Tr = VADD(To, Tq);
+					TI = BYTWJ(&(W[TWVL * 26]), TH);
+					T2L = VFMA(LDK(KP559016994), T1t, T1s);
+					T1u = VFNMS(LDK(KP559016994), T1t, T1s);
+					TD = BYTWJ(&(W[TWVL * 16]), TC);
+					TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tu, Ty, T2E, TE, TN, TG, Tt, TV, Ts;
+					TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					Ts = VADD(Tm, Tr);
+					Tu = VSUB(Tm, Tr);
+					Ty = VFNMS(LDK(KP618033988), Tx, Tw);
+					T2E = VFMA(LDK(KP618033988), Tw, Tx);
+					T3j = VFNMS(LDK(KP059835404), T2K, T2L);
+					T3b = VFMA(LDK(KP066152395), T2L, T2K);
+					T2R = VFNMS(LDK(KP786782374), T2K, T2L);
+					T2M = VFMA(LDK(KP869845200), T2L, T2K);
+					T2f = VFMA(LDK(KP132830569), T1u, T1x);
+					T27 = VFNMS(LDK(KP120146378), T1x, T1u);
+					T1y = VFNMS(LDK(KP893101515), T1x, T1u);
+					T1H = VFMA(LDK(KP987388751), T1u, T1x);
+					TE = VSUB(TB, TD);
+					TN = VADD(TD, TB);
+					TG = BYTWJ(&(W[TWVL * 36]), TF);
+					Tt = VFNMS(LDK(KP250000000), Ts, Th);
+					T3M = VADD(Th, Ts);
+					TW = BYTWJ(&(W[TWVL * 2]), TV);
+					{
+					     V TJ, TO, Tv, T2D, TY, T15, T10, T13, TP;
+					     {
+						  V TX, T14, TZ, T12;
+						  TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+						  TJ = VSUB(TG, TI);
+						  TO = VADD(TI, TG);
+						  Tv = VFMA(LDK(KP559016994), Tu, Tt);
+						  T2D = VFNMS(LDK(KP559016994), Tu, Tt);
+						  TY = BYTWJ(&(W[TWVL * 12]), TX);
+						  T15 = BYTWJ(&(W[TWVL * 32]), T14);
+						  T10 = BYTWJ(&(W[TWVL * 42]), TZ);
+						  T13 = BYTWJ(&(W[TWVL * 22]), T12);
+					     }
+					     TP = VADD(TN, TO);
+					     TR = VSUB(TN, TO);
+					     TK = VFMA(LDK(KP618033988), TJ, TE);
+					     T2B = VFNMS(LDK(KP618033988), TE, TJ);
+					     T3n = VFMA(LDK(KP578046249), T2D, T2E);
+					     T3e = VFNMS(LDK(KP522847744), T2E, T2D);
+					     T2U = VFNMS(LDK(KP987388751), T2D, T2E);
+					     T2F = VFMA(LDK(KP893101515), T2E, T2D);
+					     T2i = VFNMS(LDK(KP603558818), Ty, Tv);
+					     T2a = VFMA(LDK(KP667278218), Tv, Ty);
+					     Tz = VFNMS(LDK(KP244189809), Ty, Tv);
+					     T1C = VFMA(LDK(KP269969613), Tv, Ty);
+					     T3N = VADD(TM, TP);
+					     TQ = VFMS(LDK(KP250000000), TP, TM);
+					     T11 = VADD(TY, T10);
+					     T1b = VSUB(TY, T10);
+					     T1c = VSUB(T15, T13);
+					     T16 = VADD(T13, T15);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2z, Tf, T3W, T3O, T1d, T2H, T3m, T2j, T2b, TT, T1D, T2G, T35, T2V, T2Z;
+			 V T3A, T3g, T2I, T1a, T3R, T3X;
+			 T2z = VFNMS(LDK(KP559016994), Te, Td);
+			 Tf = VFMA(LDK(KP559016994), Te, Td);
+			 {
+			      V TS, T2A, T17, T19;
+			      TS = VFNMS(LDK(KP559016994), TR, TQ);
+			      T2A = VFMA(LDK(KP559016994), TR, TQ);
+			      T3W = VSUB(T3M, T3N);
+			      T3O = VADD(T3M, T3N);
+			      T1d = VFNMS(LDK(KP618033988), T1c, T1b);
+			      T2H = VFMA(LDK(KP618033988), T1b, T1c);
+			      T17 = VADD(T11, T16);
+			      T19 = VSUB(T16, T11);
+			      {
+				   V T3f, T2T, T2C, T18, T3P;
+				   T3m = VFMA(LDK(KP447533225), T2B, T2A);
+				   T3f = VFNMS(LDK(KP494780565), T2A, T2B);
+				   T2T = VFNMS(LDK(KP132830569), T2A, T2B);
+				   T2C = VFMA(LDK(KP120146378), T2B, T2A);
+				   T2j = VFNMS(LDK(KP786782374), TK, TS);
+				   T2b = VFMA(LDK(KP869845200), TS, TK);
+				   TT = VFNMS(LDK(KP667278218), TS, TK);
+				   T1D = VFMA(LDK(KP603558818), TK, TS);
+				   T18 = VFNMS(LDK(KP250000000), T17, TW);
+				   T3P = VADD(TW, T17);
+				   T2G = VFMA(LDK(KP734762448), T2F, T2C);
+				   T35 = VFNMS(LDK(KP734762448), T2F, T2C);
+				   T2V = VFNMS(LDK(KP734762448), T2U, T2T);
+				   T2Z = VFMA(LDK(KP734762448), T2U, T2T);
+				   T3A = VFMA(LDK(KP982009705), T3f, T3e);
+				   T3g = VFNMS(LDK(KP982009705), T3f, T3e);
+				   T2I = VFMA(LDK(KP559016994), T19, T18);
+				   T1a = VFNMS(LDK(KP559016994), T19, T18);
+				   T3R = VADD(T3P, T3Q);
+				   T3X = VSUB(T3P, T3Q);
+			      }
+			 }
+			 {
+			      V T2n, T2t, T1V, T22, T2l, T2d, T1Q, T1I, T2w, T1A, T1F, T2q;
+			      {
+				   V T2k, T1G, T28, T2g, T3K, T3E, T3a, T34, T3x, T3H, T2c, TU, T1T, T1U, T1z;
+				   V T3o, T3t;
+				   T2n = VFNMS(LDK(KP912575812), T2j, T2i);
+				   T2k = VFMA(LDK(KP912575812), T2j, T2i);
+				   T3o = VFNMS(LDK(KP921078979), T3n, T3m);
+				   T3t = VFMA(LDK(KP921078979), T3n, T3m);
+				   {
+					V T3c, T2Q, T2J, T3k, T1e;
+					T3c = VFNMS(LDK(KP667278218), T2I, T2H);
+					T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
+					T2J = VFMA(LDK(KP066152395), T2I, T2H);
+					T3k = VFMA(LDK(KP603558818), T2H, T2I);
+					T1G = VFMA(LDK(KP578046249), T1a, T1d);
+					T1e = VFNMS(LDK(KP522847744), T1d, T1a);
+					T28 = VFNMS(LDK(KP494780565), T1a, T1d);
+					T2g = VFMA(LDK(KP447533225), T1d, T1a);
+					{
+					     V T3U, T3S, T40, T3Y;
+					     T3U = VSUB(T3O, T3R);
+					     T3S = VADD(T3O, T3R);
+					     T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
+					     T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
+					     {
+						  V T3s, T3l, T2N, T36;
+						  T3s = VFNMS(LDK(KP845997307), T3k, T3j);
+						  T3l = VFMA(LDK(KP845997307), T3k, T3j);
+						  T2N = VFNMS(LDK(KP772036680), T2M, T2J);
+						  T36 = VFMA(LDK(KP772036680), T2M, T2J);
+						  {
+						       V T30, T2S, T3d, T3z, T3T;
+						       T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
+						       T2S = VFMA(LDK(KP772036680), T2R, T2Q);
+						       T3d = VFNMS(LDK(KP845997307), T3c, T3b);
+						       T3z = VFMA(LDK(KP845997307), T3c, T3b);
+						       ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
+						       T3T = VFNMS(LDK(KP250000000), T3S, T3L);
+						       {
+							    V T3C, T3p, T2O, T37;
+							    T3C = VFMA(LDK(KP906616052), T3o, T3l);
+							    T3p = VFNMS(LDK(KP906616052), T3o, T3l);
+							    T2O = VFMA(LDK(KP956723877), T2N, T2G);
+							    T37 = VFMA(LDK(KP522616830), T2V, T36);
+							    {
+								 V T31, T2W, T3u, T3h;
+								 T31 = VFNMS(LDK(KP522616830), T2G, T30);
+								 T2W = VFMA(LDK(KP945422727), T2V, T2S);
+								 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
+								 T3h = VFMA(LDK(KP923225144), T3g, T3d);
+								 {
+								      V T3I, T3B, T3V, T3Z;
+								      T3I = VFNMS(LDK(KP669429328), T3z, T3A);
+								      T3B = VFMA(LDK(KP570584518), T3A, T3z);
+								      T3V = VFMA(LDK(KP559016994), T3U, T3T);
+								      T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
+								      {
+									   V T3y, T3q, T2P, T38;
+									   T3y = VFMA(LDK(KP262346850), T3p, T2X);
+									   T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
+									   T2P = VFMA(LDK(KP992114701), T2O, T2z);
+									   T38 = VFNMS(LDK(KP690983005), T37, T2S);
+									   {
+										V T32, T2Y, T3v, T3F;
+										T32 = VFMA(LDK(KP763932022), T31, T2N);
+										T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
+										T3v = VFNMS(LDK(KP997675361), T3u, T3t);
+										T3F = VFNMS(LDK(KP904508497), T3u, T3s);
+										{
+										     V T3i, T3r, T3J, T3D;
+										     T3i = VFMA(LDK(KP949179823), T3h, T2z);
+										     T3r = VFNMS(LDK(KP237294955), T3h, T2z);
+										     T3J = VFNMS(LDK(KP669429328), T3C, T3I);
+										     T3D = VFMA(LDK(KP618033988), T3C, T3B);
+										     ST(&(x[WS(rs, 20)]), VFMAI(T3Y, T3V), ms, &(x[0]));
+										     ST(&(x[WS(rs, 5)]), VFNMSI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 15)]), VFNMSI(T40, T3Z), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 10)]), VFMAI(T40, T3Z), ms, &(x[0]));
+										     {
+											  V T39, T33, T3w, T3G;
+											  T39 = VFMA(LDK(KP855719849), T38, T35);
+											  T33 = VFNMS(LDK(KP855719849), T32, T2Z);
+											  ST(&(x[WS(rs, 22)]), VFMAI(T2Y, T2P), ms, &(x[0]));
+											  ST(&(x[WS(rs, 3)]), VFNMSI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
+											  T3w = VFMA(LDK(KP560319534), T3v, T3s);
+											  T3G = VFNMS(LDK(KP681693190), T3F, T3t);
+											  ST(&(x[WS(rs, 23)]), VFMAI(T3q, T3i), ms, &(x[WS(rs, 1)]));
+											  ST(&(x[WS(rs, 2)]), VFNMSI(T3q, T3i), ms, &(x[0]));
+											  T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
+											  T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
+											  T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
+											  T34 = VFMA(LDK(KP897376177), T33, T2z);
+											  T3x = VFNMS(LDK(KP949179823), T3w, T3r);
+											  T3H = VFNMS(LDK(KP860541664), T3G, T3r);
+											  T2t = VFNMS(LDK(KP912575812), T2b, T2a);
+											  T2c = VFMA(LDK(KP912575812), T2b, T2a);
+											  TU = VFMA(LDK(KP829049696), TT, Tz);
+											  T1T = VFNMS(LDK(KP829049696), TT, Tz);
+											  T1U = VFNMS(LDK(KP831864738), T1y, T1e);
+											  T1z = VFMA(LDK(KP831864738), T1y, T1e);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					V T2o, T2h, T29, T2u, T2v, T2p;
+					T2o = VFNMS(LDK(KP958953096), T2g, T2f);
+					T2h = VFMA(LDK(KP958953096), T2g, T2f);
+					ST(&(x[WS(rs, 17)]), VFMAI(T3a, T34), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 8)]), VFNMSI(T3a, T34), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFMAI(T3E, T3x), ms, &(x[0]));
+					ST(&(x[WS(rs, 13)]), VFNMSI(T3E, T3x), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 18)]), VFNMSI(T3K, T3H), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFMAI(T3K, T3H), ms, &(x[WS(rs, 1)]));
+					T1V = VFMA(LDK(KP559154169), T1U, T1T);
+					T22 = VFNMS(LDK(KP683113946), T1T, T1U);
+					T29 = VFNMS(LDK(KP867381224), T28, T27);
+					T2u = VFMA(LDK(KP867381224), T28, T27);
+					T2l = VFMA(LDK(KP894834959), T2k, T2h);
+					T2v = VFMA(LDK(KP447417479), T2k, T2u);
+					T2d = VFNMS(LDK(KP809385824), T2c, T29);
+					T2p = VFMA(LDK(KP447417479), T2c, T2o);
+					T1Q = VFMA(LDK(KP831864738), T1H, T1G);
+					T1I = VFNMS(LDK(KP831864738), T1H, T1G);
+					T2w = VFNMS(LDK(KP763932022), T2v, T2h);
+					T1A = VFMA(LDK(KP904730450), T1z, TU);
+					T1F = VFNMS(LDK(KP904730450), T1z, TU);
+					T2q = VFMA(LDK(KP690983005), T2p, T29);
+				   }
+			      }
+			      {
+				   V T2e, T1E, T1P, T2m;
+				   T2e = VFNMS(LDK(KP992114701), T2d, Tf);
+				   T1E = VFMA(LDK(KP916574801), T1D, T1C);
+				   T1P = VFNMS(LDK(KP916574801), T1D, T1C);
+				   T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
+				   {
+					V T1J, T2r, T1R, T1W, T1Z, T2x;
+					T2x = VFNMS(LDK(KP999544308), T2w, T2t);
+					T1J = VFNMS(LDK(KP904730450), T1I, T1F);
+					T25 = VFMA(LDK(KP968583161), T1A, Tf);
+					T1B = VFNMS(LDK(KP242145790), T1A, Tf);
+					T2r = VFNMS(LDK(KP999544308), T2q, T2n);
+					T1R = VFMA(LDK(KP904730450), T1Q, T1P);
+					T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
+					T1Z = VADD(T1E, T1F);
+					ST(&(x[WS(rs, 21)]), VFNMSI(T2m, T2e), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFMAI(T2m, T2e), ms, &(x[0]));
+					T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
+					T1K = VFNMS(LDK(KP618033988), T1J, T1E);
+					T2s = VFNMS(LDK(KP803003575), T2r, Tf);
+					T23 = VFMA(LDK(KP617882369), T1W, T22);
+					T1S = VFNMS(LDK(KP242145790), T1R, T1O);
+					T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
+					T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
+					T1X = VFMA(LDK(KP559016994), T1W, T1V);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1L, T24, T21, T1Y;
+		    T1L = VFNMS(LDK(KP876091699), T1K, T1B);
+		    ST(&(x[WS(rs, 9)]), VFMAI(T2y, T2s), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VFNMSI(T2y, T2s), ms, &(x[0]));
+		    T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
+		    ST(&(x[WS(rs, 24)]), VFMAI(T26, T25), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T26, T25), ms, &(x[WS(rs, 1)]));
+		    T21 = VFMA(LDK(KP792626838), T20, T1B);
+		    T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T24, T21), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFMAI(T24, T21), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFMAI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFNMSI(T1Y, T1L), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t1fv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t1fv_25 -include t1f.h */
+
+/*
+ * This function contains 248 FP additions, 188 FP multiplications,
+ * (or, 170 additions, 110 multiplications, 78 fused multiply/add),
+ * 99 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V Tc, Tb, Td, Te, T1C, T2t, T1E, T1x, T2m, T1u, T3c, T2n, Ty, T2i, Tv;
+	       V T38, T2j, TS, T2f, TP, T39, T2g, T1d, T2p, T1a, T3b, T2q;
+	       {
+		    V T7, T9, Ta, T2, T4, T5, T1D;
+		    Tc = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T6, T8, T1, T3;
+			 T6 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 18]), T6);
+			 T8 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 28]), T8);
+			 Ta = VADD(T7, T9);
+			 T1 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[TWVL * 8]), T1);
+			 T3 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T4 = BYTWJ(&(W[TWVL * 38]), T3);
+			 T5 = VADD(T2, T4);
+		    }
+		    Tb = VMUL(LDK(KP559016994), VSUB(T5, Ta));
+		    Td = VADD(T5, Ta);
+		    Te = VFNMS(LDK(KP250000000), Td, Tc);
+		    T1C = VSUB(T2, T4);
+		    T1D = VSUB(T7, T9);
+		    T2t = VMUL(LDK(KP951056516), T1D);
+		    T1E = VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D));
+	       }
+	       {
+		    V T1r, T1l, T1n, T1o, T1g, T1i, T1j, T1q;
+		    T1q = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T1r = BYTWJ(&(W[TWVL * 4]), T1q);
+		    {
+			 V T1k, T1m, T1f, T1h;
+			 T1k = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1l = BYTWJ(&(W[TWVL * 24]), T1k);
+			 T1m = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1n = BYTWJ(&(W[TWVL * 34]), T1m);
+			 T1o = VADD(T1l, T1n);
+			 T1f = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T1g = BYTWJ(&(W[TWVL * 14]), T1f);
+			 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1i = BYTWJ(&(W[TWVL * 44]), T1h);
+			 T1j = VADD(T1g, T1i);
+		    }
+		    {
+			 V T1v, T1w, T1p, T1s, T1t;
+			 T1v = VSUB(T1g, T1i);
+			 T1w = VSUB(T1l, T1n);
+			 T1x = VFMA(LDK(KP475528258), T1v, VMUL(LDK(KP293892626), T1w));
+			 T2m = VFNMS(LDK(KP293892626), T1v, VMUL(LDK(KP475528258), T1w));
+			 T1p = VMUL(LDK(KP559016994), VSUB(T1j, T1o));
+			 T1s = VADD(T1j, T1o);
+			 T1t = VFNMS(LDK(KP250000000), T1s, T1r);
+			 T1u = VADD(T1p, T1t);
+			 T3c = VADD(T1r, T1s);
+			 T2n = VSUB(T1t, T1p);
+		    }
+	       }
+	       {
+		    V Ts, Tm, To, Tp, Th, Tj, Tk, Tr;
+		    Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ts = BYTWJ(&(W[0]), Tr);
+		    {
+			 V Tl, Tn, Tg, Ti;
+			 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tm = BYTWJ(&(W[TWVL * 20]), Tl);
+			 Tn = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 To = BYTWJ(&(W[TWVL * 30]), Tn);
+			 Tp = VADD(Tm, To);
+			 Tg = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Th = BYTWJ(&(W[TWVL * 10]), Tg);
+			 Ti = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 Tj = BYTWJ(&(W[TWVL * 40]), Ti);
+			 Tk = VADD(Th, Tj);
+		    }
+		    {
+			 V Tw, Tx, Tq, Tt, Tu;
+			 Tw = VSUB(Th, Tj);
+			 Tx = VSUB(Tm, To);
+			 Ty = VFMA(LDK(KP475528258), Tw, VMUL(LDK(KP293892626), Tx));
+			 T2i = VFNMS(LDK(KP293892626), Tw, VMUL(LDK(KP475528258), Tx));
+			 Tq = VMUL(LDK(KP559016994), VSUB(Tk, Tp));
+			 Tt = VADD(Tk, Tp);
+			 Tu = VFNMS(LDK(KP250000000), Tt, Ts);
+			 Tv = VADD(Tq, Tu);
+			 T38 = VADD(Ts, Tt);
+			 T2j = VSUB(Tu, Tq);
+		    }
+	       }
+	       {
+		    V TM, TG, TI, TJ, TB, TD, TE, TL;
+		    TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TM = BYTWJ(&(W[TWVL * 6]), TL);
+		    {
+			 V TF, TH, TA, TC;
+			 TF = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TG = BYTWJ(&(W[TWVL * 26]), TF);
+			 TH = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 TI = BYTWJ(&(W[TWVL * 36]), TH);
+			 TJ = VADD(TG, TI);
+			 TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTWJ(&(W[TWVL * 16]), TA);
+			 TC = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 TD = BYTWJ(&(W[TWVL * 46]), TC);
+			 TE = VADD(TB, TD);
+		    }
+		    {
+			 V TQ, TR, TK, TN, TO;
+			 TQ = VSUB(TB, TD);
+			 TR = VSUB(TG, TI);
+			 TS = VFMA(LDK(KP475528258), TQ, VMUL(LDK(KP293892626), TR));
+			 T2f = VFNMS(LDK(KP293892626), TQ, VMUL(LDK(KP475528258), TR));
+			 TK = VMUL(LDK(KP559016994), VSUB(TE, TJ));
+			 TN = VADD(TE, TJ);
+			 TO = VFNMS(LDK(KP250000000), TN, TM);
+			 TP = VADD(TK, TO);
+			 T39 = VADD(TM, TN);
+			 T2g = VSUB(TO, TK);
+		    }
+	       }
+	       {
+		    V T17, T11, T13, T14, TW, TY, TZ, T16;
+		    T16 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T17 = BYTWJ(&(W[TWVL * 2]), T16);
+		    {
+			 V T10, T12, TV, TX;
+			 T10 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 T11 = BYTWJ(&(W[TWVL * 22]), T10);
+			 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T13 = BYTWJ(&(W[TWVL * 32]), T12);
+			 T14 = VADD(T11, T13);
+			 TV = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TW = BYTWJ(&(W[TWVL * 12]), TV);
+			 TX = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TY = BYTWJ(&(W[TWVL * 42]), TX);
+			 TZ = VADD(TW, TY);
+		    }
+		    {
+			 V T1b, T1c, T15, T18, T19;
+			 T1b = VSUB(TW, TY);
+			 T1c = VSUB(T11, T13);
+			 T1d = VFMA(LDK(KP475528258), T1b, VMUL(LDK(KP293892626), T1c));
+			 T2p = VFNMS(LDK(KP293892626), T1b, VMUL(LDK(KP475528258), T1c));
+			 T15 = VMUL(LDK(KP559016994), VSUB(TZ, T14));
+			 T18 = VADD(TZ, T14);
+			 T19 = VFNMS(LDK(KP250000000), T18, T17);
+			 T1a = VADD(T15, T19);
+			 T3b = VADD(T17, T18);
+			 T2q = VSUB(T19, T15);
+		    }
+	       }
+	       {
+		    V T3l, T3m, T3f, T3g, T3e, T3h, T3n, T3i;
+		    {
+			 V T3j, T3k, T3a, T3d;
+			 T3j = VSUB(T38, T39);
+			 T3k = VSUB(T3b, T3c);
+			 T3l = VBYI(VFMA(LDK(KP951056516), T3j, VMUL(LDK(KP587785252), T3k)));
+			 T3m = VBYI(VFNMS(LDK(KP587785252), T3j, VMUL(LDK(KP951056516), T3k)));
+			 T3f = VADD(Tc, Td);
+			 T3a = VADD(T38, T39);
+			 T3d = VADD(T3b, T3c);
+			 T3g = VADD(T3a, T3d);
+			 T3e = VMUL(LDK(KP559016994), VSUB(T3a, T3d));
+			 T3h = VFNMS(LDK(KP250000000), T3g, T3f);
+		    }
+		    ST(&(x[0]), VADD(T3f, T3g), ms, &(x[0]));
+		    T3n = VSUB(T3h, T3e);
+		    ST(&(x[WS(rs, 10)]), VADD(T3m, T3n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 15)]), VSUB(T3n, T3m), ms, &(x[WS(rs, 1)]));
+		    T3i = VADD(T3e, T3h);
+		    ST(&(x[WS(rs, 5)]), VSUB(T3i, T3l), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 20)]), VADD(T3l, T3i), ms, &(x[0]));
+	       }
+	       {
+		    V Tf, T1Z, T20, T21, T29, T2a, T2b, T26, T27, T28, T22, T23, T24, T1L, T1U;
+		    V T1Q, T1S, T1A, T1V, T1N, T1O, T2d, T2e;
+		    Tf = VADD(Tb, Te);
+		    T1Z = VFMA(LDK(KP1_688655851), Ty, VMUL(LDK(KP535826794), Tv));
+		    T20 = VFMA(LDK(KP1_541026485), TS, VMUL(LDK(KP637423989), TP));
+		    T21 = VSUB(T1Z, T20);
+		    T29 = VFMA(LDK(KP851558583), T1d, VMUL(LDK(KP904827052), T1a));
+		    T2a = VFMA(LDK(KP1_984229402), T1x, VMUL(LDK(KP125333233), T1u));
+		    T2b = VADD(T29, T2a);
+		    T26 = VFNMS(LDK(KP844327925), Tv, VMUL(LDK(KP1_071653589), Ty));
+		    T27 = VFNMS(LDK(KP1_274847979), TS, VMUL(LDK(KP770513242), TP));
+		    T28 = VADD(T26, T27);
+		    T22 = VFNMS(LDK(KP425779291), T1a, VMUL(LDK(KP1_809654104), T1d));
+		    T23 = VFNMS(LDK(KP992114701), T1u, VMUL(LDK(KP250666467), T1x));
+		    T24 = VADD(T22, T23);
+		    {
+			 V T1F, T1G, T1H, T1I, T1J, T1K;
+			 T1F = VFMA(LDK(KP1_937166322), Ty, VMUL(LDK(KP248689887), Tv));
+			 T1G = VFMA(LDK(KP1_071653589), TS, VMUL(LDK(KP844327925), TP));
+			 T1H = VADD(T1F, T1G);
+			 T1I = VFMA(LDK(KP1_752613360), T1d, VMUL(LDK(KP481753674), T1a));
+			 T1J = VFMA(LDK(KP1_457937254), T1x, VMUL(LDK(KP684547105), T1u));
+			 T1K = VADD(T1I, T1J);
+			 T1L = VADD(T1H, T1K);
+			 T1U = VSUB(T1J, T1I);
+			 T1Q = VMUL(LDK(KP559016994), VSUB(T1K, T1H));
+			 T1S = VSUB(T1G, T1F);
+		    }
+		    {
+			 V Tz, TT, TU, T1e, T1y, T1z;
+			 Tz = VFNMS(LDK(KP497379774), Ty, VMUL(LDK(KP968583161), Tv));
+			 TT = VFNMS(LDK(KP1_688655851), TS, VMUL(LDK(KP535826794), TP));
+			 TU = VADD(Tz, TT);
+			 T1e = VFNMS(LDK(KP963507348), T1d, VMUL(LDK(KP876306680), T1a));
+			 T1y = VFNMS(LDK(KP1_369094211), T1x, VMUL(LDK(KP728968627), T1u));
+			 T1z = VADD(T1e, T1y);
+			 T1A = VADD(TU, T1z);
+			 T1V = VMUL(LDK(KP559016994), VSUB(TU, T1z));
+			 T1N = VSUB(TT, Tz);
+			 T1O = VSUB(T1e, T1y);
+		    }
+		    {
+			 V T1B, T1M, T25, T2c;
+			 T1B = VADD(Tf, T1A);
+			 T1M = VBYI(VADD(T1E, T1L));
+			 ST(&(x[WS(rs, 1)]), VSUB(T1B, T1M), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 24)]), VADD(T1B, T1M), ms, &(x[0]));
+			 T25 = VADD(Tf, VADD(T21, T24));
+			 T2c = VBYI(VADD(T1E, VSUB(T28, T2b)));
+			 ST(&(x[WS(rs, 21)]), VSUB(T25, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(T25, T2c), ms, &(x[0]));
+		    }
+		    T2d = VBYI(VADD(T1E, VFMA(LDK(KP309016994), T28, VFMA(LDK(KP587785252), VSUB(T23, T22), VFNMS(LDK(KP951056516), VADD(T1Z, T20), VMUL(LDK(KP809016994), T2b))))));
+		    T2e = VFMA(LDK(KP309016994), T21, VFMA(LDK(KP951056516), VSUB(T26, T27), VFMA(LDK(KP587785252), VSUB(T2a, T29), VFNMS(LDK(KP809016994), T24, Tf))));
+		    ST(&(x[WS(rs, 9)]), VADD(T2d, T2e), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T2e, T2d), ms, &(x[0]));
+		    {
+			 V T1R, T1X, T1W, T1Y, T1P, T1T;
+			 T1P = VFMS(LDK(KP250000000), T1L, T1E);
+			 T1R = VBYI(VADD(VFMA(LDK(KP587785252), T1N, VMUL(LDK(KP951056516), T1O)), VSUB(T1P, T1Q)));
+			 T1X = VBYI(VADD(VFNMS(LDK(KP587785252), T1O, VMUL(LDK(KP951056516), T1N)), VADD(T1P, T1Q)));
+			 T1T = VFNMS(LDK(KP250000000), T1A, Tf);
+			 T1W = VFMA(LDK(KP587785252), T1S, VFNMS(LDK(KP951056516), T1U, VSUB(T1T, T1V)));
+			 T1Y = VFMA(LDK(KP951056516), T1S, VADD(T1V, VFMA(LDK(KP587785252), T1U, T1T)));
+			 ST(&(x[WS(rs, 11)]), VADD(T1R, T1W), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1Y, T1X), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T1W, T1R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(T1X, T1Y), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T2u, T2w, T2h, T2k, T2l, T2A, T2B, T2C, T2o, T2r, T2s, T2x, T2y, T2z, T2M;
+		    V T2X, T2N, T2W, T2R, T31, T2U, T30, T2E, T2F;
+		    T2u = VFNMS(LDK(KP587785252), T1C, T2t);
+		    T2w = VSUB(Te, Tb);
+		    T2h = VFNMS(LDK(KP125333233), T2g, VMUL(LDK(KP1_984229402), T2f));
+		    T2k = VFMA(LDK(KP1_457937254), T2i, VMUL(LDK(KP684547105), T2j));
+		    T2l = VSUB(T2h, T2k);
+		    T2A = VFNMS(LDK(KP1_996053456), T2p, VMUL(LDK(KP062790519), T2q));
+		    T2B = VFMA(LDK(KP1_541026485), T2m, VMUL(LDK(KP637423989), T2n));
+		    T2C = VSUB(T2A, T2B);
+		    T2o = VFNMS(LDK(KP770513242), T2n, VMUL(LDK(KP1_274847979), T2m));
+		    T2r = VFMA(LDK(KP125581039), T2p, VMUL(LDK(KP998026728), T2q));
+		    T2s = VSUB(T2o, T2r);
+		    T2x = VFNMS(LDK(KP1_369094211), T2i, VMUL(LDK(KP728968627), T2j));
+		    T2y = VFMA(LDK(KP250666467), T2f, VMUL(LDK(KP992114701), T2g));
+		    T2z = VSUB(T2x, T2y);
+		    {
+			 V T2G, T2H, T2I, T2J, T2K, T2L;
+			 T2G = VFNMS(LDK(KP481753674), T2j, VMUL(LDK(KP1_752613360), T2i));
+			 T2H = VFMA(LDK(KP851558583), T2f, VMUL(LDK(KP904827052), T2g));
+			 T2I = VSUB(T2G, T2H);
+			 T2J = VFNMS(LDK(KP844327925), T2q, VMUL(LDK(KP1_071653589), T2p));
+			 T2K = VFNMS(LDK(KP998026728), T2n, VMUL(LDK(KP125581039), T2m));
+			 T2L = VADD(T2J, T2K);
+			 T2M = VMUL(LDK(KP559016994), VSUB(T2I, T2L));
+			 T2X = VSUB(T2J, T2K);
+			 T2N = VADD(T2I, T2L);
+			 T2W = VADD(T2G, T2H);
+		    }
+		    {
+			 V T2P, T2Q, T2Y, T2S, T2T, T2Z;
+			 T2P = VFNMS(LDK(KP425779291), T2g, VMUL(LDK(KP1_809654104), T2f));
+			 T2Q = VFMA(LDK(KP963507348), T2i, VMUL(LDK(KP876306680), T2j));
+			 T2Y = VADD(T2Q, T2P);
+			 T2S = VFMA(LDK(KP1_688655851), T2p, VMUL(LDK(KP535826794), T2q));
+			 T2T = VFMA(LDK(KP1_996053456), T2m, VMUL(LDK(KP062790519), T2n));
+			 T2Z = VADD(T2S, T2T);
+			 T2R = VSUB(T2P, T2Q);
+			 T31 = VADD(T2Y, T2Z);
+			 T2U = VSUB(T2S, T2T);
+			 T30 = VMUL(LDK(KP559016994), VSUB(T2Y, T2Z));
+		    }
+		    {
+			 V T36, T37, T2v, T2D;
+			 T36 = VBYI(VADD(T2u, T2N));
+			 T37 = VADD(T2w, T31);
+			 ST(&(x[WS(rs, 2)]), VADD(T36, T37), ms, &(x[0]));
+			 ST(&(x[WS(rs, 23)]), VSUB(T37, T36), ms, &(x[WS(rs, 1)]));
+			 T2v = VBYI(VSUB(VADD(T2l, T2s), T2u));
+			 T2D = VADD(T2w, VADD(T2z, T2C));
+			 ST(&(x[WS(rs, 3)]), VADD(T2v, T2D), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T2D, T2v), ms, &(x[0]));
+		    }
+		    T2E = VFMA(LDK(KP309016994), T2z, VFNMS(LDK(KP809016994), T2C, VFNMS(LDK(KP587785252), VADD(T2r, T2o), VFNMS(LDK(KP951056516), VADD(T2k, T2h), T2w))));
+		    T2F = VBYI(VSUB(VFNMS(LDK(KP587785252), VADD(T2A, T2B), VFNMS(LDK(KP809016994), T2s, VFNMS(LDK(KP951056516), VADD(T2x, T2y), VMUL(LDK(KP309016994), T2l)))), T2u));
+		    ST(&(x[WS(rs, 17)]), VSUB(T2E, T2F), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 8)]), VADD(T2E, T2F), ms, &(x[0]));
+		    {
+			 V T2V, T34, T33, T35, T2O, T32;
+			 T2O = VFNMS(LDK(KP250000000), T2N, T2u);
+			 T2V = VBYI(VADD(T2M, VADD(T2O, VFNMS(LDK(KP587785252), T2U, VMUL(LDK(KP951056516), T2R)))));
+			 T34 = VBYI(VADD(T2O, VSUB(VFMA(LDK(KP587785252), T2R, VMUL(LDK(KP951056516), T2U)), T2M)));
+			 T32 = VFNMS(LDK(KP250000000), T31, T2w);
+			 T33 = VFMA(LDK(KP951056516), T2W, VFMA(LDK(KP587785252), T2X, VADD(T30, T32)));
+			 T35 = VFMA(LDK(KP587785252), T2W, VSUB(VFNMS(LDK(KP951056516), T2X, T32), T30));
+			 ST(&(x[WS(rs, 7)]), VADD(T2V, T33), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VSUB(T35, T34), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 18)]), VSUB(T33, T2V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T34, T35), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t1fv_25"), twinstr, &GENUS, {170, 110, 78, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1fv_3 -include t1f.h */
+
+/*
+ * This function contains 8 FP additions, 8 FP multiplications,
+ * (or, 5 additions, 5 multiplications, 3 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T2, T4;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, T8, T6, T7;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T5 = BYTWJ(&(W[TWVL * 2]), T4);
+		    T8 = VMUL(LDK(KP866025403), VSUB(T5, T3));
+		    T6 = VADD(T3, T5);
+		    T7 = VFNMS(LDK(KP500000000), T6, T1);
+		    ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T8, T7), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VFNMSI(T8, T7), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1fv_3"), twinstr, &GENUS, {5, 5, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 3 -name t1fv_3 -include t1f.h */
+
+/*
+ * This function contains 8 FP additions, 6 FP multiplications,
+ * (or, 7 additions, 5 multiplications, 1 fused multiply/add),
+ * 12 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
+	       V T1, T3, T5, T6, T2, T4, T7, T8;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = BYTWJ(&(W[TWVL * 2]), T4);
+	       T6 = VADD(T3, T5);
+	       ST(&(x[0]), VADD(T1, T6), ms, &(x[0]));
+	       T7 = VFNMS(LDK(KP500000000), T6, T1);
+	       T8 = VBYI(VMUL(LDK(KP866025403), VSUB(T5, T3)));
+	       ST(&(x[WS(rs, 2)]), VSUB(T7, T8), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VADD(T7, T8), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 3, XSIMD_STRING("t1fv_3"), twinstr, &GENUS, {7, 5, 1, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_3) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:06 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1fv_32 -include t1f.h */
+
+/*
+ * This function contains 217 FP additions, 160 FP multiplications,
+ * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
+ * 112 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T26, T25, T1Z, T22, T1W, T2a, T2k, T2g;
+	       {
+		    V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2L, T1D, TC, T33, T2O, T1C;
+		    V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
+		    V T10, T2u;
+		    {
+			 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
+			 {
+			      V T1, T1x, T2, T1v;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V T5, Tc, T7, Ta, T2m, T2n;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   {
+					V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
+					Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T1y = BYTWJ(&(W[TWVL * 46]), T1x);
+					T3 = BYTWJ(&(W[TWVL * 30]), T2);
+					T1w = BYTWJ(&(W[TWVL * 14]), T1v);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					Td = BYTWJ(&(W[TWVL * 22]), Tc);
+					T8 = BYTWJ(&(W[TWVL * 38]), T7);
+					Tb = BYTWJ(&(W[TWVL * 54]), Ta);
+					Tt = BYTWJ(&(W[TWVL * 58]), Ts);
+					Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T4 = VSUB(T1, T3);
+					T2m = VADD(T1, T3);
+					T1z = VSUB(T1w, T1y);
+					T2n = VADD(T1w, T1y);
+					T9 = VSUB(T6, T8);
+					T2p = VADD(T6, T8);
+					Te = VSUB(Tb, Td);
+					T2q = VADD(Tb, Td);
+					TA = BYTWJ(&(W[TWVL * 10]), Tz);
+				   }
+				   Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   T2o = VADD(T2m, T2n);
+				   T32 = VSUB(T2m, T2n);
+				   Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      }
+			 }
+			 {
+			      V Tv, To, Ty, Ti, Tj, Tm, Th;
+			      Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T2r = VADD(T2p, T2q);
+			      T3f = VSUB(T2q, T2p);
+			      Tf = VADD(T9, Te);
+			      T1A = VSUB(Te, T9);
+			      Tv = BYTWJ(&(W[TWVL * 26]), Tu);
+			      To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      Ty = BYTWJ(&(W[TWVL * 42]), Tx);
+			      Ti = BYTWJ(&(W[TWVL * 2]), Th);
+			      Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      {
+				   V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
+				   {
+					V T15, T17, T1o, T1m;
+					{
+					     V Tw, T2J, Tp, T2K, TB, Tk, Tn, T1n, T14, T16;
+					     T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					     T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     Tw = VSUB(Tt, Tv);
+					     T2J = VADD(Tt, Tv);
+					     Tp = BYTWJ(&(W[TWVL * 50]), To);
+					     T2K = VADD(TA, Ty);
+					     TB = VSUB(Ty, TA);
+					     Tk = BYTWJ(&(W[TWVL * 34]), Tj);
+					     Tn = BYTWJ(&(W[TWVL * 18]), Tm);
+					     T15 = BYTWJ(&(W[TWVL * 60]), T14);
+					     T17 = BYTWJ(&(W[TWVL * 28]), T16);
+					     T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     {
+						  V T2M, Tl, T2N, Tq, T1l;
+						  T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						  T34 = VSUB(T2J, T2K);
+						  T2L = VADD(T2J, T2K);
+						  T1D = VFMA(LDK(KP414213562), Tw, TB);
+						  TC = VFNMS(LDK(KP414213562), TB, Tw);
+						  T2M = VADD(Ti, Tk);
+						  Tl = VSUB(Ti, Tk);
+						  T2N = VADD(Tn, Tp);
+						  Tq = VSUB(Tn, Tp);
+						  T1o = BYTWJ(&(W[TWVL * 12]), T1n);
+						  T1m = BYTWJ(&(W[TWVL * 44]), T1l);
+						  {
+						       V T1e, T1g, T19, T1b;
+						       T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						       T33 = VSUB(T2M, T2N);
+						       T2O = VADD(T2M, T2N);
+						       T1C = VFMA(LDK(KP414213562), Tl, Tq);
+						       Tr = VFNMS(LDK(KP414213562), Tq, Tl);
+						       T1f = BYTWJ(&(W[TWVL * 52]), T1e);
+						       T1h = BYTWJ(&(W[TWVL * 20]), T1g);
+						       T1a = BYTWJ(&(W[TWVL * 4]), T19);
+						       T1c = BYTWJ(&(W[TWVL * 36]), T1b);
+						  }
+					     }
+					}
+					T18 = VSUB(T15, T17);
+					T2A = VADD(T15, T17);
+					T2B = VADD(T1o, T1m);
+					T1p = VSUB(T1m, T1o);
+				   }
+				   {
+					V TG, TI, TZ, TX;
+					{
+					     V T1i, T2E, T1d, T2D, TH, TY, TF;
+					     TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1i = VSUB(T1f, T1h);
+					     T2E = VADD(T1f, T1h);
+					     T1d = VSUB(T1a, T1c);
+					     T2D = VADD(T1a, T1c);
+					     TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					     T2C = VADD(T2A, T2B);
+					     T3a = VSUB(T2A, T2B);
+					     TG = BYTWJ(&(W[0]), TF);
+					     {
+						  V TW, T1j, T1q, TP, TR, TK;
+						  TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  T2F = VADD(T2D, T2E);
+						  T3b = VSUB(T2E, T2D);
+						  T1j = VADD(T1d, T1i);
+						  T1q = VSUB(T1i, T1d);
+						  TI = BYTWJ(&(W[TWVL * 32]), TH);
+						  TZ = BYTWJ(&(W[TWVL * 48]), TY);
+						  TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						  TX = BYTWJ(&(W[TWVL * 16]), TW);
+						  TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T1r = VFMA(LDK(KP707106781), T1q, T1p);
+						  T21 = VFNMS(LDK(KP707106781), T1q, T1p);
+						  T1k = VFMA(LDK(KP707106781), T1j, T18);
+						  T20 = VFNMS(LDK(KP707106781), T1j, T18);
+						  TQ = BYTWJ(&(W[TWVL * 56]), TP);
+						  TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+						  TS = BYTWJ(&(W[TWVL * 24]), TR);
+						  TL = BYTWJ(&(W[TWVL * 8]), TK);
+					     }
+					}
+					T2t = VADD(TG, TI);
+					TJ = VSUB(TG, TI);
+					T10 = VSUB(TX, TZ);
+					T2u = VADD(TX, TZ);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
+			 T2s = VSUB(T2o, T2r);
+			 T2U = VADD(T2o, T2r);
+			 TN = BYTWJ(&(W[TWVL * 40]), TM);
+			 TT = VSUB(TQ, TS);
+			 T2x = VADD(TQ, TS);
+			 T2P = VSUB(T2L, T2O);
+			 T2V = VADD(T2O, T2L);
+			 T2Y = VADD(T2C, T2F);
+			 T2G = VSUB(T2C, T2F);
+			 T37 = VSUB(T2t, T2u);
+			 T2v = VADD(T2t, T2u);
+			 T2w = VADD(TL, TN);
+			 TO = VSUB(TL, TN);
+			 T2W = VADD(T2U, T2V);
+			 T30 = VSUB(T2U, T2V);
+			 {
+			      V T3i, T3o, T36, T3r, T3h, T3j, T12, T1Y, TV, T1X, T3s, T3d, T2Q, T2H, T31;
+			      V T2Z;
+			      {
+				   V T35, T3g, T38, T2y, T11, TU;
+				   T35 = VADD(T33, T34);
+				   T3g = VSUB(T34, T33);
+				   T38 = VSUB(T2w, T2x);
+				   T2y = VADD(T2w, T2x);
+				   T11 = VSUB(TO, TT);
+				   TU = VADD(TO, TT);
+				   {
+					V T3c, T39, T2X, T2z;
+					T3c = VFNMS(LDK(KP414213562), T3b, T3a);
+					T3i = VFMA(LDK(KP414213562), T3a, T3b);
+					T3o = VFNMS(LDK(KP707106781), T35, T32);
+					T36 = VFMA(LDK(KP707106781), T35, T32);
+					T3r = VFNMS(LDK(KP707106781), T3g, T3f);
+					T3h = VFMA(LDK(KP707106781), T3g, T3f);
+					T39 = VFNMS(LDK(KP414213562), T38, T37);
+					T3j = VFMA(LDK(KP414213562), T37, T38);
+					T2X = VADD(T2v, T2y);
+					T2z = VSUB(T2v, T2y);
+					T12 = VFMA(LDK(KP707106781), T11, T10);
+					T1Y = VFNMS(LDK(KP707106781), T11, T10);
+					TV = VFMA(LDK(KP707106781), TU, TJ);
+					T1X = VFNMS(LDK(KP707106781), TU, TJ);
+					T3s = VSUB(T3c, T39);
+					T3d = VADD(T39, T3c);
+					T2Q = VSUB(T2G, T2z);
+					T2H = VADD(T2z, T2G);
+					T31 = VSUB(T2Y, T2X);
+					T2Z = VADD(T2X, T2Y);
+				   }
+			      }
+			      {
+				   V Tg, T1U, TD, T1G, T13, T1s, T1H, T1B, T1V, T1E, T3k, T3p, T2e, T2f;
+				   Tg = VFMA(LDK(KP707106781), Tf, T4);
+				   T1U = VFNMS(LDK(KP707106781), Tf, T4);
+				   T3k = VSUB(T3i, T3j);
+				   T3p = VADD(T3j, T3i);
+				   {
+					V T3v, T3t, T3e, T3m;
+					T3v = VFNMS(LDK(KP923879532), T3s, T3r);
+					T3t = VFMA(LDK(KP923879532), T3s, T3r);
+					T3e = VFNMS(LDK(KP923879532), T3d, T36);
+					T3m = VFMA(LDK(KP923879532), T3d, T36);
+					{
+					     V T2R, T2T, T2I, T2S;
+					     T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
+					     T2T = VFMA(LDK(KP707106781), T2Q, T2P);
+					     T2I = VFNMS(LDK(KP707106781), T2H, T2s);
+					     T2S = VFMA(LDK(KP707106781), T2H, T2s);
+					     ST(&(x[WS(rs, 24)]), VFNMSI(T31, T30), ms, &(x[0]));
+					     ST(&(x[WS(rs, 8)]), VFMAI(T31, T30), ms, &(x[0]));
+					     ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
+					     ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
+					     {
+						  V T3u, T3q, T3l, T3n;
+						  T3u = VFMA(LDK(KP923879532), T3p, T3o);
+						  T3q = VFNMS(LDK(KP923879532), T3p, T3o);
+						  T3l = VFNMS(LDK(KP923879532), T3k, T3h);
+						  T3n = VFMA(LDK(KP923879532), T3k, T3h);
+						  ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
+						  ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
+						  ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
+						  ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
+						  ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
+						  ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
+						  ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
+						  ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
+						  ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
+						  ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
+						  ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
+						  ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
+						  T26 = VSUB(TC, Tr);
+						  TD = VADD(Tr, TC);
+					     }
+					}
+				   }
+				   T1G = VFMA(LDK(KP198912367), TV, T12);
+				   T13 = VFNMS(LDK(KP198912367), T12, TV);
+				   T1s = VFNMS(LDK(KP198912367), T1r, T1k);
+				   T1H = VFMA(LDK(KP198912367), T1k, T1r);
+				   T1B = VFNMS(LDK(KP707106781), T1A, T1z);
+				   T25 = VFMA(LDK(KP707106781), T1A, T1z);
+				   T1V = VADD(T1C, T1D);
+				   T1E = VSUB(T1C, T1D);
+				   {
+					V T1S, T1O, T1K, T1u, T1R, T1T, T1L, T1J;
+					{
+					     V TE, T1M, T1I, T1N, T1t, T1Q, T1F, T1P, T28, T29;
+					     TE = VFMA(LDK(KP923879532), TD, Tg);
+					     T1M = VFNMS(LDK(KP923879532), TD, Tg);
+					     T1I = VSUB(T1G, T1H);
+					     T1N = VADD(T1G, T1H);
+					     T1t = VADD(T13, T1s);
+					     T1Q = VSUB(T1s, T13);
+					     T1F = VFMA(LDK(KP923879532), T1E, T1B);
+					     T1P = VFNMS(LDK(KP923879532), T1E, T1B);
+					     T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
+					     T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
+					     T1S = VFMA(LDK(KP980785280), T1N, T1M);
+					     T1O = VFNMS(LDK(KP980785280), T1N, T1M);
+					     T22 = VFMA(LDK(KP668178637), T21, T20);
+					     T29 = VFNMS(LDK(KP668178637), T20, T21);
+					     T1K = VFMA(LDK(KP980785280), T1t, TE);
+					     T1u = VFNMS(LDK(KP980785280), T1t, TE);
+					     T1R = VFNMS(LDK(KP980785280), T1Q, T1P);
+					     T1T = VFMA(LDK(KP980785280), T1Q, T1P);
+					     T1L = VFMA(LDK(KP980785280), T1I, T1F);
+					     T1J = VFNMS(LDK(KP980785280), T1I, T1F);
+					     T2e = VFNMS(LDK(KP923879532), T1V, T1U);
+					     T1W = VFMA(LDK(KP923879532), T1V, T1U);
+					     T2a = VSUB(T28, T29);
+					     T2f = VADD(T28, T29);
+					}
+					ST(&(x[WS(rs, 23)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 9)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 25)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 31)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 15)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 17)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+				   }
+				   T2k = VFNMS(LDK(KP831469612), T2f, T2e);
+				   T2g = VFMA(LDK(KP831469612), T2f, T2e);
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T2i, T23, T2h, T27;
+		    T2i = VSUB(T22, T1Z);
+		    T23 = VADD(T1Z, T22);
+		    T2h = VFNMS(LDK(KP923879532), T26, T25);
+		    T27 = VFMA(LDK(KP923879532), T26, T25);
+		    {
+			 V T2c, T24, T2j, T2l, T2d, T2b;
+			 T2c = VFMA(LDK(KP831469612), T23, T1W);
+			 T24 = VFNMS(LDK(KP831469612), T23, T1W);
+			 T2j = VFMA(LDK(KP831469612), T2i, T2h);
+			 T2l = VFNMS(LDK(KP831469612), T2i, T2h);
+			 T2d = VFMA(LDK(KP831469612), T2a, T27);
+			 T2b = VFNMS(LDK(KP831469612), T2a, T27);
+			 ST(&(x[WS(rs, 21)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1fv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1fv_32 -include t1f.h */
+
+/*
+ * This function contains 217 FP additions, 104 FP multiplications,
+ * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
+ * 59 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T4, T1A, T2o, T32, Tf, T1v, T2r, T3f, TC, T1C, T2L, T34, Tr, T1D, T2O;
+	       V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
+	       V T2v, T37;
+	       {
+		    V T1, T1z, T3, T1x, T1y, T2, T1w, T2m, T2n;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T1y = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+		    T1z = BYTWJ(&(W[TWVL * 46]), T1y);
+		    T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 30]), T2);
+		    T1w = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T1x = BYTWJ(&(W[TWVL * 14]), T1w);
+		    T4 = VSUB(T1, T3);
+		    T1A = VSUB(T1x, T1z);
+		    T2m = VADD(T1, T3);
+		    T2n = VADD(T1x, T1z);
+		    T2o = VADD(T2m, T2n);
+		    T32 = VSUB(T2m, T2n);
+	       }
+	       {
+		    V T6, Td, T8, Tb;
+		    {
+			 V T5, Tc, T7, Ta;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 6]), T5);
+			 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 22]), Tc);
+			 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 38]), T7);
+			 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
+		    }
+		    {
+			 V T9, Te, T2p, T2q;
+			 T9 = VSUB(T6, T8);
+			 Te = VSUB(Tb, Td);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 T1v = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 T2p = VADD(T6, T8);
+			 T2q = VADD(Tb, Td);
+			 T2r = VADD(T2p, T2q);
+			 T3f = VSUB(T2q, T2p);
+		    }
+	       }
+	       {
+		    V Tt, TA, Tv, Ty;
+		    {
+			 V Ts, Tz, Tu, Tx;
+			 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
+			 Tz = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TA = BYTWJ(&(W[TWVL * 42]), Tz);
+			 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
+			 Tx = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ty = BYTWJ(&(W[TWVL * 10]), Tx);
+		    }
+		    {
+			 V Tw, TB, T2J, T2K;
+			 Tw = VSUB(Tt, Tv);
+			 TB = VSUB(Ty, TA);
+			 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
+			 T1C = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T2J = VADD(Tt, Tv);
+			 T2K = VADD(Ty, TA);
+			 T2L = VADD(T2J, T2K);
+			 T34 = VSUB(T2J, T2K);
+		    }
+	       }
+	       {
+		    V Ti, Tp, Tk, Tn;
+		    {
+			 V Th, To, Tj, Tm;
+			 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 2]), Th);
+			 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 50]), To);
+			 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
+			 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
+		    }
+		    {
+			 V Tl, Tq, T2M, T2N;
+			 Tl = VSUB(Ti, Tk);
+			 Tq = VSUB(Tn, Tp);
+			 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T1D = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T2M = VADD(Ti, Tk);
+			 T2N = VADD(Tn, Tp);
+			 T2O = VADD(T2M, T2N);
+			 T33 = VSUB(T2M, T2N);
+		    }
+	       }
+	       {
+		    V T15, T17, T1p, T1n, T1f, T1h, T1i, T1a, T1c, T1d;
+		    {
+			 V T14, T16, T1o, T1m;
+			 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T15 = BYTWJ(&(W[TWVL * 60]), T14);
+			 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T17 = BYTWJ(&(W[TWVL * 28]), T16);
+			 T1o = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1p = BYTWJ(&(W[TWVL * 44]), T1o);
+			 T1m = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T1n = BYTWJ(&(W[TWVL * 12]), T1m);
+			 {
+			      V T1e, T1g, T19, T1b;
+			      T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			      T1f = BYTWJ(&(W[TWVL * 52]), T1e);
+			      T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      T1h = BYTWJ(&(W[TWVL * 20]), T1g);
+			      T1i = VSUB(T1f, T1h);
+			      T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1a = BYTWJ(&(W[TWVL * 4]), T19);
+			      T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      T1c = BYTWJ(&(W[TWVL * 36]), T1b);
+			      T1d = VSUB(T1a, T1c);
+			 }
+		    }
+		    {
+			 V T18, T1j, T2D, T2E;
+			 T18 = VSUB(T15, T17);
+			 T1j = VMUL(LDK(KP707106781), VADD(T1d, T1i));
+			 T1k = VADD(T18, T1j);
+			 T20 = VSUB(T18, T1j);
+			 T2D = VADD(T1a, T1c);
+			 T2E = VADD(T1f, T1h);
+			 T2F = VADD(T2D, T2E);
+			 T3b = VSUB(T2E, T2D);
+		    }
+		    {
+			 V T1l, T1q, T2A, T2B;
+			 T1l = VMUL(LDK(KP707106781), VSUB(T1i, T1d));
+			 T1q = VSUB(T1n, T1p);
+			 T1r = VSUB(T1l, T1q);
+			 T21 = VADD(T1q, T1l);
+			 T2A = VADD(T15, T17);
+			 T2B = VADD(T1n, T1p);
+			 T2C = VADD(T2A, T2B);
+			 T3a = VSUB(T2A, T2B);
+		    }
+	       }
+	       {
+		    V TG, TI, T10, TY, TQ, TS, TT, TL, TN, TO;
+		    {
+			 V TF, TH, TZ, TX;
+			 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TG = BYTWJ(&(W[0]), TF);
+			 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TI = BYTWJ(&(W[TWVL * 32]), TH);
+			 TZ = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T10 = BYTWJ(&(W[TWVL * 48]), TZ);
+			 TX = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TY = BYTWJ(&(W[TWVL * 16]), TX);
+			 {
+			      V TP, TR, TK, TM;
+			      TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			      TQ = BYTWJ(&(W[TWVL * 56]), TP);
+			      TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      TS = BYTWJ(&(W[TWVL * 24]), TR);
+			      TT = VSUB(TQ, TS);
+			      TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      TL = BYTWJ(&(W[TWVL * 8]), TK);
+			      TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      TN = BYTWJ(&(W[TWVL * 40]), TM);
+			      TO = VSUB(TL, TN);
+			 }
+		    }
+		    {
+			 V TJ, TU, T2w, T2x;
+			 TJ = VSUB(TG, TI);
+			 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
+			 TV = VADD(TJ, TU);
+			 T1X = VSUB(TJ, TU);
+			 T2w = VADD(TL, TN);
+			 T2x = VADD(TQ, TS);
+			 T2y = VADD(T2w, T2x);
+			 T38 = VSUB(T2x, T2w);
+		    }
+		    {
+			 V TW, T11, T2t, T2u;
+			 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
+			 T11 = VSUB(TY, T10);
+			 T12 = VSUB(TW, T11);
+			 T1Y = VADD(T11, TW);
+			 T2t = VADD(TG, TI);
+			 T2u = VADD(TY, T10);
+			 T2v = VADD(T2t, T2u);
+			 T37 = VSUB(T2t, T2u);
+		    }
+	       }
+	       {
+		    V T2W, T30, T2Z, T31;
+		    {
+			 V T2U, T2V, T2X, T2Y;
+			 T2U = VADD(T2o, T2r);
+			 T2V = VADD(T2O, T2L);
+			 T2W = VADD(T2U, T2V);
+			 T30 = VSUB(T2U, T2V);
+			 T2X = VADD(T2v, T2y);
+			 T2Y = VADD(T2C, T2F);
+			 T2Z = VADD(T2X, T2Y);
+			 T31 = VBYI(VSUB(T2Y, T2X));
+		    }
+		    ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VADD(T30, T31), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 24)]), VSUB(T30, T31), ms, &(x[0]));
+	       }
+	       {
+		    V T2s, T2P, T2H, T2Q, T2z, T2G;
+		    T2s = VSUB(T2o, T2r);
+		    T2P = VSUB(T2L, T2O);
+		    T2z = VSUB(T2v, T2y);
+		    T2G = VSUB(T2C, T2F);
+		    T2H = VMUL(LDK(KP707106781), VADD(T2z, T2G));
+		    T2Q = VMUL(LDK(KP707106781), VSUB(T2G, T2z));
+		    {
+			 V T2I, T2R, T2S, T2T;
+			 T2I = VADD(T2s, T2H);
+			 T2R = VBYI(VADD(T2P, T2Q));
+			 ST(&(x[WS(rs, 28)]), VSUB(T2I, T2R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T2I, T2R), ms, &(x[0]));
+			 T2S = VSUB(T2s, T2H);
+			 T2T = VBYI(VSUB(T2Q, T2P));
+			 ST(&(x[WS(rs, 20)]), VSUB(T2S, T2T), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2S, T2T), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T36, T3r, T3h, T3p, T3d, T3o, T3k, T3s, T35, T3g;
+		    T35 = VMUL(LDK(KP707106781), VADD(T33, T34));
+		    T36 = VADD(T32, T35);
+		    T3r = VSUB(T32, T35);
+		    T3g = VMUL(LDK(KP707106781), VSUB(T34, T33));
+		    T3h = VADD(T3f, T3g);
+		    T3p = VSUB(T3g, T3f);
+		    {
+			 V T39, T3c, T3i, T3j;
+			 T39 = VFMA(LDK(KP923879532), T37, VMUL(LDK(KP382683432), T38));
+			 T3c = VFNMS(LDK(KP382683432), T3b, VMUL(LDK(KP923879532), T3a));
+			 T3d = VADD(T39, T3c);
+			 T3o = VSUB(T3c, T39);
+			 T3i = VFNMS(LDK(KP382683432), T37, VMUL(LDK(KP923879532), T38));
+			 T3j = VFMA(LDK(KP382683432), T3a, VMUL(LDK(KP923879532), T3b));
+			 T3k = VADD(T3i, T3j);
+			 T3s = VSUB(T3j, T3i);
+		    }
+		    {
+			 V T3e, T3l, T3u, T3v;
+			 T3e = VADD(T36, T3d);
+			 T3l = VBYI(VADD(T3h, T3k));
+			 ST(&(x[WS(rs, 30)]), VSUB(T3e, T3l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T3e, T3l), ms, &(x[0]));
+			 T3u = VBYI(VADD(T3p, T3o));
+			 T3v = VADD(T3r, T3s);
+			 ST(&(x[WS(rs, 6)]), VADD(T3u, T3v), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VSUB(T3v, T3u), ms, &(x[0]));
+		    }
+		    {
+			 V T3m, T3n, T3q, T3t;
+			 T3m = VSUB(T36, T3d);
+			 T3n = VBYI(VSUB(T3k, T3h));
+			 ST(&(x[WS(rs, 18)]), VSUB(T3m, T3n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T3m, T3n), ms, &(x[0]));
+			 T3q = VBYI(VSUB(T3o, T3p));
+			 T3t = VSUB(T3r, T3s);
+			 ST(&(x[WS(rs, 10)]), VADD(T3q, T3t), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T3t, T3q), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
+		    {
+			 V Tg, TD, T1G, T1H;
+			 Tg = VADD(T4, Tf);
+			 TD = VADD(Tr, TC);
+			 TE = VADD(Tg, TD);
+			 T1P = VSUB(Tg, TD);
+			 T1G = VFNMS(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
+			 T1H = VFMA(LDK(KP195090322), T1k, VMUL(LDK(KP980785280), T1r));
+			 T1I = VADD(T1G, T1H);
+			 T1Q = VSUB(T1H, T1G);
+		    }
+		    {
+			 V T13, T1s, T1B, T1E;
+			 T13 = VFMA(LDK(KP980785280), TV, VMUL(LDK(KP195090322), T12));
+			 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
+			 T1t = VADD(T13, T1s);
+			 T1M = VSUB(T1s, T13);
+			 T1B = VSUB(T1v, T1A);
+			 T1E = VSUB(T1C, T1D);
+			 T1F = VADD(T1B, T1E);
+			 T1N = VSUB(T1E, T1B);
+		    }
+		    {
+			 V T1u, T1J, T1S, T1T;
+			 T1u = VADD(TE, T1t);
+			 T1J = VBYI(VADD(T1F, T1I));
+			 ST(&(x[WS(rs, 31)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 T1S = VBYI(VADD(T1N, T1M));
+			 T1T = VADD(T1P, T1Q);
+			 ST(&(x[WS(rs, 7)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1K, T1L, T1O, T1R;
+			 T1K = VSUB(TE, T1t);
+			 T1L = VBYI(VSUB(T1I, T1F));
+			 ST(&(x[WS(rs, 17)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 T1O = VBYI(VSUB(T1M, T1N));
+			 T1R = VSUB(T1P, T1Q);
+			 ST(&(x[WS(rs, 9)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 23)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
+		    {
+			 V T1U, T1V, T28, T29;
+			 T1U = VSUB(T4, Tf);
+			 T1V = VADD(T1D, T1C);
+			 T1W = VADD(T1U, T1V);
+			 T2h = VSUB(T1U, T1V);
+			 T28 = VFNMS(LDK(KP555570233), T1X, VMUL(LDK(KP831469612), T1Y));
+			 T29 = VFMA(LDK(KP555570233), T20, VMUL(LDK(KP831469612), T21));
+			 T2a = VADD(T28, T29);
+			 T2i = VSUB(T29, T28);
+		    }
+		    {
+			 V T1Z, T22, T25, T26;
+			 T1Z = VFMA(LDK(KP831469612), T1X, VMUL(LDK(KP555570233), T1Y));
+			 T22 = VFNMS(LDK(KP555570233), T21, VMUL(LDK(KP831469612), T20));
+			 T23 = VADD(T1Z, T22);
+			 T2e = VSUB(T22, T1Z);
+			 T25 = VADD(T1A, T1v);
+			 T26 = VSUB(TC, Tr);
+			 T27 = VADD(T25, T26);
+			 T2f = VSUB(T26, T25);
+		    }
+		    {
+			 V T24, T2b, T2k, T2l;
+			 T24 = VADD(T1W, T23);
+			 T2b = VBYI(VADD(T27, T2a));
+			 ST(&(x[WS(rs, 29)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 T2k = VBYI(VADD(T2f, T2e));
+			 T2l = VADD(T2h, T2i);
+			 ST(&(x[WS(rs, 5)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T2c, T2d, T2g, T2j;
+			 T2c = VSUB(T1W, T23);
+			 T2d = VBYI(VSUB(T2a, T27));
+			 ST(&(x[WS(rs, 19)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 T2g = VBYI(VSUB(T2e, T2f));
+			 T2j = VSUB(T2h, T2i);
+			 ST(&(x[WS(rs, 11)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 21)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1fv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1fv_4 -include t1f.h */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1fv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1fv_4 -include t1f.h */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 1)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1fv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1fv_5 -include t1f.h */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    Ta = BYTWJ(&(W[TWVL * 4]), T9);
+		    T5 = BYTWJ(&(W[TWVL * 6]), T4);
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFMAI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFNMSI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFNMSI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1fv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t1fv_5 -include t1f.h */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tc, Tg, Th, T5, Ta, Td;
+	       Tc = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTWJ(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 2]), T6);
+		    }
+		    Tg = VSUB(T2, T4);
+		    Th = VSUB(T7, T9);
+		    T5 = VADD(T2, T4);
+		    Ta = VADD(T7, T9);
+		    Td = VADD(T5, Ta);
+	       }
+	       ST(&(x[0]), VADD(Tc, Td), ms, &(x[0]));
+	       {
+		    V Ti, Tj, Tf, Tk, Tb, Te;
+		    Ti = VBYI(VFMA(LDK(KP951056516), Tg, VMUL(LDK(KP587785252), Th)));
+		    Tj = VBYI(VFNMS(LDK(KP587785252), Tg, VMUL(LDK(KP951056516), Th)));
+		    Tb = VMUL(LDK(KP559016994), VSUB(T5, Ta));
+		    Te = VFNMS(LDK(KP250000000), Td, Tc);
+		    Tf = VADD(Tb, Te);
+		    Tk = VSUB(Te, Tb);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tf, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VADD(Ti, Tf), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t1fv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1fv_6 -include t1f.h */
+
+/*
+ * This function contains 23 FP additions, 18 FP multiplications,
+ * (or, 17 additions, 12 multiplications, 6 fused multiply/add),
+ * 27 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T1, T2, Ta, Tc, T5, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Tb, Td, T6, T8;
+		    T3 = BYTWJ(&(W[TWVL * 4]), T2);
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Td = BYTWJ(&(W[0]), Tc);
+		    T6 = BYTWJ(&(W[TWVL * 2]), T5);
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    {
+			 V Ti, T4, Tk, Te, Tj, T9;
+			 Ti = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tk = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tj = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 {
+			      V Tl, Tn, Tf, Th, Tm, Tg;
+			      Tl = VADD(Tj, Tk);
+			      Tn = VMUL(LDK(KP866025403), VSUB(Tk, Tj));
+			      Tf = VADD(T9, Te);
+			      Th = VMUL(LDK(KP866025403), VSUB(Te, T9));
+			      ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+			      Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+			      ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+			      Tg = VFNMS(LDK(KP500000000), Tf, T4);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Tn, Tm), ms, &(x[0]));
+			      ST(&(x[WS(rs, 5)]), VFNMSI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Th, Tg), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1fv_6"), twinstr, &GENUS, {17, 12, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 6 -name t1fv_6 -include t1f.h */
+
+/*
+ * This function contains 23 FP additions, 14 FP multiplications,
+ * (or, 21 additions, 12 multiplications, 2 fused multiply/add),
+ * 19 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 10)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(6, rs)) {
+	       V T4, Ti, Te, Tk, T9, Tj, T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[TWVL * 4]), T2);
+	       T4 = VSUB(T1, T3);
+	       Ti = VADD(T1, T3);
+	       {
+		    V Tb, Td, Ta, Tc;
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[0]), Tc);
+		    Te = VSUB(Tb, Td);
+		    Tk = VADD(Tb, Td);
+	       }
+	       {
+		    V T6, T8, T5, T7;
+		    T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T6 = BYTWJ(&(W[TWVL * 2]), T5);
+		    T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    T9 = VSUB(T6, T8);
+		    Tj = VADD(T6, T8);
+	       }
+	       {
+		    V Th, Tf, Tg, Tn, Tl, Tm;
+		    Th = VBYI(VMUL(LDK(KP866025403), VSUB(Te, T9)));
+		    Tf = VADD(T9, Te);
+		    Tg = VFNMS(LDK(KP500000000), Tf, T4);
+		    ST(&(x[WS(rs, 3)]), VADD(T4, Tf), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(Tg, Th), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VSUB(Tg, Th), ms, &(x[WS(rs, 1)]));
+		    Tn = VBYI(VMUL(LDK(KP866025403), VSUB(Tk, Tj)));
+		    Tl = VADD(Tj, Tk);
+		    Tm = VFNMS(LDK(KP500000000), Tl, Ti);
+		    ST(&(x[0]), VADD(Ti, Tl), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Tm, Tn), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 6, XSIMD_STRING("t1fv_6"), twinstr, &GENUS, {21, 12, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_6) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1877 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:10 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1fv_64 -include t1f.h */
+
+/*
+ * This function contains 519 FP additions, 384 FP multiplications,
+ * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
+ * 187 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
+	       V T6E;
+	       {
+		    V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
+		    V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3a;
+		    V T4j, T4C, T7e, T5l, T7d, T5o, T3b, TV, T4B, T4m, T3X, T3Y, T6o, T7b, T5f;
+		    V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
+		    V T1S, T2q, T2u, T2w;
+		    {
+			 V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
+			 {
+			      V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
+			      {
+				   V T1, T2, T7, T5, T32, T34, T2X, T2Z;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+				   T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T32 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   T34 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+				   T2X = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+				   T2Z = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+				   {
+					V T1m, T54, T6j, T36, T55, T31, T56, T1n, T1q, T1s, T4, T9;
+					{
+					     V T3, T8, T6, T33, T35, T2Y, T30, T1l;
+					     T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T3 = BYTWJ(&(W[TWVL * 62]), T2);
+					     T8 = BYTWJ(&(W[TWVL * 94]), T7);
+					     T6 = BYTWJ(&(W[TWVL * 30]), T5);
+					     T33 = BYTWJ(&(W[TWVL * 14]), T32);
+					     T35 = BYTWJ(&(W[TWVL * 78]), T34);
+					     T2Y = BYTWJ(&(W[TWVL * 110]), T2X);
+					     T30 = BYTWJ(&(W[TWVL * 46]), T2Z);
+					     T1m = BYTWJ(&(W[0]), T1l);
+					     T54 = VSUB(T1, T3);
+					     T4 = VADD(T1, T3);
+					     T6j = VSUB(T6, T8);
+					     T9 = VADD(T6, T8);
+					     T36 = VADD(T33, T35);
+					     T55 = VSUB(T33, T35);
+					     T31 = VADD(T2Y, T30);
+					     T56 = VSUB(T2Y, T30);
+					     T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+					}
+					T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+					Ta = VSUB(T4, T9);
+					T3U = VADD(T4, T9);
+					{
+					     V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
+					     T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     T3V = VADD(T36, T31);
+					     T37 = VSUB(T31, T36);
+					     T57 = VADD(T55, T56);
+					     T6k = VSUB(T56, T55);
+					     T1o = BYTWJ(&(W[TWVL * 64]), T1n);
+					     T1r = BYTWJ(&(W[TWVL * 32]), T1q);
+					     T1t = BYTWJ(&(W[TWVL * 96]), T1s);
+					     T1V = BYTWJ(&(W[TWVL * 16]), T1U);
+					     T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+					     T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+					     T7a = VFNMS(LDK(KP707106781), T57, T54);
+					     T58 = VFMA(LDK(KP707106781), T57, T54);
+					     T7B = VFMA(LDK(KP707106781), T6k, T6j);
+					     T6l = VFNMS(LDK(KP707106781), T6k, T6j);
+					     T1p = VADD(T1m, T1o);
+					     T5B = VSUB(T1m, T1o);
+					     T5O = VSUB(T1r, T1t);
+					     T1u = VADD(T1r, T1t);
+					     T1X = BYTWJ(&(W[TWVL * 80]), T1W);
+					     T20 = BYTWJ(&(W[TWVL * 112]), T1Z);
+					     T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			      {
+				   V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
+				   {
+					V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
+					T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+					T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					T1v = VSUB(T1p, T1u);
+					T41 = VADD(T1p, T1u);
+					T1Y = VADD(T1V, T1X);
+					T5C = VSUB(T1V, T1X);
+					T22 = BYTWJ(&(W[TWVL * 48]), T21);
+					T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T29 = BYTWJ(&(W[TWVL * 124]), T28);
+					T2b = BYTWJ(&(W[TWVL * 60]), T2a);
+					T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+					T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+					T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T23, T5D, T2e, T2g, T2I, T2K, T2M;
+					     T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T23 = VADD(T20, T22);
+					     T5D = VSUB(T20, T22);
+					     T2e = BYTWJ(&(W[TWVL * 28]), T2d);
+					     T2c = VADD(T29, T2b);
+					     T5W = VSUB(T29, T2b);
+					     T2g = BYTWJ(&(W[TWVL * 92]), T2f);
+					     T2I = BYTWJ(&(W[TWVL * 108]), T2H);
+					     T2K = BYTWJ(&(W[TWVL * 44]), T2J);
+					     T2N = BYTWJ(&(W[TWVL * 12]), T2M);
+					     {
+						  V T5E, T5P, T42, T2O;
+						  T5E = VADD(T5C, T5D);
+						  T5P = VSUB(T5C, T5D);
+						  T24 = VSUB(T1Y, T23);
+						  T42 = VADD(T1Y, T23);
+						  T69 = VSUB(T2g, T2e);
+						  T2h = VADD(T2e, T2g);
+						  T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+						  T2L = VADD(T2I, T2K);
+						  T5Y = VSUB(T2I, T2K);
+						  T5Q = VFMA(LDK(KP707106781), T5P, T5O);
+						  T7o = VFNMS(LDK(KP707106781), T5P, T5O);
+						  T5F = VFMA(LDK(KP707106781), T5E, T5B);
+						  T7l = VFNMS(LDK(KP707106781), T5E, T5B);
+						  T43 = VADD(T41, T42);
+						  T4F = VSUB(T41, T42);
+						  T2P = BYTWJ(&(W[TWVL * 76]), T2O);
+					     }
+					}
+				   }
+				   T2i = VSUB(T2c, T2h);
+				   T48 = VADD(T2c, T2h);
+				   {
+					V TW, TY, T11, T2Q, T5X, T13;
+					TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+					TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T2Q = VADD(T2N, T2P);
+					T5X = VSUB(T2N, T2P);
+					T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+					{
+					     V T12, T5Z, T6a, T49, T14, T18, T1a;
+					     {
+						  V T17, T19, TX, TZ;
+						  T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+						  T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  TX = BYTWJ(&(W[TWVL * 122]), TW);
+						  TZ = BYTWJ(&(W[TWVL * 58]), TY);
+						  T12 = BYTWJ(&(W[TWVL * 26]), T11);
+						  T5Z = VADD(T5X, T5Y);
+						  T6a = VSUB(T5Y, T5X);
+						  T2R = VSUB(T2L, T2Q);
+						  T49 = VADD(T2Q, T2L);
+						  T14 = BYTWJ(&(W[TWVL * 90]), T13);
+						  T18 = BYTWJ(&(W[TWVL * 106]), T17);
+						  T5q = VSUB(TX, TZ);
+						  T10 = VADD(TX, TZ);
+						  T1a = BYTWJ(&(W[TWVL * 42]), T19);
+					     }
+					     T6b = VFMA(LDK(KP707106781), T6a, T69);
+					     T7v = VFNMS(LDK(KP707106781), T6a, T69);
+					     T60 = VFMA(LDK(KP707106781), T5Z, T5W);
+					     T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
+					     T4a = VADD(T48, T49);
+					     T4I = VSUB(T48, T49);
+					     T5v = VSUB(T14, T12);
+					     T15 = VADD(T12, T14);
+					     T1b = VADD(T18, T1a);
+					     T5s = VSUB(T18, T1a);
+					}
+					T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 {
+			      V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
+			      {
+				   V T5h, TQ, T5m, T5i, TO, TS, TJ, T4k, TD, TI;
+				   {
+					V T4h, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
+					Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					T4h = VADD(T10, T15);
+					T16 = VSUB(T10, T15);
+					TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+					T1d = BYTWJ(&(W[TWVL * 10]), T1c);
+					T1f = BYTWJ(&(W[TWVL * 74]), T1e);
+					TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+					TA = BYTWJ(&(W[TWVL * 2]), Tz);
+					TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+					TC = BYTWJ(&(W[TWVL * 66]), TB);
+					{
+					     V T1g, T5r, TF, TH, TL, TN, TP;
+					     TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+					     T1g = VADD(T1d, T1f);
+					     T5r = VSUB(T1d, T1f);
+					     TF = BYTWJ(&(W[TWVL * 34]), TE);
+					     TH = BYTWJ(&(W[TWVL * 98]), TG);
+					     TL = BYTWJ(&(W[TWVL * 18]), TK);
+					     TN = BYTWJ(&(W[TWVL * 82]), TM);
+					     T5h = VSUB(TA, TC);
+					     TD = VADD(TA, TC);
+					     TQ = BYTWJ(&(W[TWVL * 114]), TP);
+					     {
+						  V T5w, T5t, T4i, T1h, TR;
+						  T5w = VSUB(T5s, T5r);
+						  T5t = VADD(T5r, T5s);
+						  T4i = VADD(T1g, T1b);
+						  T1h = VSUB(T1b, T1g);
+						  T5m = VSUB(TF, TH);
+						  TI = VADD(TF, TH);
+						  T5i = VSUB(TL, TN);
+						  TO = VADD(TL, TN);
+						  TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+						  T5u = VFMA(LDK(KP707106781), T5t, T5q);
+						  T7h = VFNMS(LDK(KP707106781), T5t, T5q);
+						  T5x = VFMA(LDK(KP707106781), T5w, T5v);
+						  T7g = VFNMS(LDK(KP707106781), T5w, T5v);
+						  T1i = VFNMS(LDK(KP414213562), T1h, T16);
+						  T3a = VFMA(LDK(KP414213562), T16, T1h);
+						  T4j = VADD(T4h, T4i);
+						  T4C = VSUB(T4h, T4i);
+						  TS = BYTWJ(&(W[TWVL * 50]), TR);
+					     }
+					}
+				   }
+				   TJ = VSUB(TD, TI);
+				   T4k = VADD(TD, TI);
+				   {
+					V Tb, Td, Tr, T5j, TT, Tt, Tg;
+					Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+					Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					T5j = VSUB(TQ, TS);
+					TT = VADD(TQ, TS);
+					Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+					Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					{
+					     V Ti, Tc, Te, Ts;
+					     Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+					     Tc = BYTWJ(&(W[TWVL * 6]), Tb);
+					     Te = BYTWJ(&(W[TWVL * 70]), Td);
+					     Ts = BYTWJ(&(W[TWVL * 22]), Tr);
+					     {
+						  V T5k, T5n, TU, T4l, Tu;
+						  T5k = VADD(T5i, T5j);
+						  T5n = VSUB(T5i, T5j);
+						  TU = VSUB(TO, TT);
+						  T4l = VADD(TO, TT);
+						  Tu = BYTWJ(&(W[TWVL * 86]), Tt);
+						  Th = BYTWJ(&(W[TWVL * 38]), Tg);
+						  T59 = VSUB(Tc, Te);
+						  Tf = VADD(Tc, Te);
+						  T7e = VFNMS(LDK(KP707106781), T5k, T5h);
+						  T5l = VFMA(LDK(KP707106781), T5k, T5h);
+						  T7d = VFNMS(LDK(KP707106781), T5n, T5m);
+						  T5o = VFMA(LDK(KP707106781), T5n, T5m);
+						  T3b = VFMA(LDK(KP414213562), TJ, TU);
+						  TV = VFNMS(LDK(KP414213562), TU, TJ);
+						  T4B = VSUB(T4k, T4l);
+						  T4m = VADD(T4k, T4l);
+						  Tv = VADD(Ts, Tu);
+						  T5d = VSUB(Tu, Ts);
+						  Tj = BYTWJ(&(W[TWVL * 102]), Ti);
+					     }
+					}
+					Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+					To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
+				   {
+					V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
+					T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+					T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+					T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V Tk, T5a, Tn, Tp;
+					     Tk = VADD(Th, Tj);
+					     T5a = VSUB(Th, Tj);
+					     Tn = BYTWJ(&(W[TWVL * 118]), Tm);
+					     Tp = BYTWJ(&(W[TWVL * 54]), To);
+					     {
+						  V T1x, T1z, T1N, T1P;
+						  T1x = BYTWJ(&(W[TWVL * 8]), T1w);
+						  T1z = BYTWJ(&(W[TWVL * 72]), T1y);
+						  T1N = BYTWJ(&(W[TWVL * 24]), T1M);
+						  T1P = BYTWJ(&(W[TWVL * 88]), T1O);
+						  T5b = VFNMS(LDK(KP414213562), T5a, T59);
+						  T6m = VFMA(LDK(KP414213562), T59, T5a);
+						  T3X = VADD(Tf, Tk);
+						  Tl = VSUB(Tf, Tk);
+						  Tq = VADD(Tn, Tp);
+						  T5c = VSUB(Tn, Tp);
+						  T1A = VADD(T1x, T1z);
+						  T5G = VSUB(T1x, T1z);
+						  T1Q = VADD(T1N, T1P);
+						  T5K = VSUB(T1N, T1P);
+						  T1C = BYTWJ(&(W[TWVL * 40]), T1B);
+					     }
+					}
+					T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+					T5e = VFNMS(LDK(KP414213562), T5d, T5c);
+					T6n = VFMA(LDK(KP414213562), T5c, T5d);
+					T3Y = VADD(Tq, Tv);
+					Tw = VSUB(Tq, Tv);
+					T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+					T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
+					T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1E = BYTWJ(&(W[TWVL * 104]), T1D);
+					T6o = VSUB(T6m, T6n);
+					T7b = VADD(T6m, T6n);
+					T5f = VADD(T5b, T5e);
+					T7C = VSUB(T5e, T5b);
+					Tx = VADD(Tl, Tw);
+					T38 = VSUB(Tw, Tl);
+					T1I = BYTWJ(&(W[TWVL * 120]), T1H);
+					T1K = BYTWJ(&(W[TWVL * 56]), T1J);
+					T1F = VADD(T1C, T1E);
+					T5H = VSUB(T1C, T1E);
+					T2k = BYTWJ(&(W[TWVL * 4]), T2j);
+					T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+					T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
+					     T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     T5J = VSUB(T1I, T1K);
+					     T1L = VADD(T1I, T1K);
+					     T5I = VFNMS(LDK(KP414213562), T5H, T5G);
+					     T5R = VFMA(LDK(KP414213562), T5G, T5H);
+					     T44 = VADD(T1A, T1F);
+					     T1G = VSUB(T1A, T1F);
+					     T2m = BYTWJ(&(W[TWVL * 68]), T2l);
+					     T2A = BYTWJ(&(W[TWVL * 20]), T2z);
+					     T2C = BYTWJ(&(W[TWVL * 84]), T2B);
+					     T5S = VFNMS(LDK(KP414213562), T5J, T5K);
+					     T5L = VFMA(LDK(KP414213562), T5K, T5J);
+					     T1R = VSUB(T1L, T1Q);
+					     T45 = VADD(T1L, T1Q);
+					     T2p = BYTWJ(&(W[TWVL * 36]), T2o);
+					     T61 = VSUB(T2k, T2m);
+					     T2n = VADD(T2k, T2m);
+					     T65 = VSUB(T2C, T2A);
+					     T2D = VADD(T2A, T2C);
+					     T7p = VSUB(T5I, T5L);
+					     T5M = VADD(T5I, T5L);
+					     T7m = VSUB(T5R, T5S);
+					     T5T = VADD(T5R, T5S);
+					     T4G = VSUB(T44, T45);
+					     T46 = VADD(T44, T45);
+					     T25 = VSUB(T1G, T1R);
+					     T1S = VADD(T1G, T1R);
+					     T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+					}
+					T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+					T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
+			 V T26, T2G, T3y, T3z, T2T;
+			 {
+			      V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
+			      T4A = VSUB(T3U, T3V);
+			      T3W = VADD(T3U, T3V);
+			      T3Z = VADD(T3X, T3Y);
+			      T4N = VSUB(T3Y, T3X);
+			      T47 = VSUB(T43, T46);
+			      T4v = VADD(T43, T46);
+			      T2r = BYTWJ(&(W[TWVL * 100]), T2q);
+			      T2v = BYTWJ(&(W[TWVL * 116]), T2u);
+			      T2x = BYTWJ(&(W[TWVL * 52]), T2w);
+			      T4s = VADD(T3W, T3Z);
+			      T40 = VSUB(T3W, T3Z);
+			      {
+				   V T4O, T4n, T4R, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
+				   {
+					V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
+					{
+					     V T4D, T62, T2s, T64, T2y, T4t;
+					     T4O = VSUB(T4C, T4B);
+					     T4D = VADD(T4B, T4C);
+					     T62 = VSUB(T2r, T2p);
+					     T2s = VADD(T2p, T2r);
+					     T64 = VSUB(T2v, T2x);
+					     T2y = VADD(T2v, T2x);
+					     T4t = VADD(T4m, T4j);
+					     T4n = VSUB(T4j, T4m);
+					     T4R = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4W = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T6c = VFNMS(LDK(KP414213562), T61, T62);
+					     T63 = VFMA(LDK(KP414213562), T62, T61);
+					     T2t = VSUB(T2n, T2s);
+					     T4b = VADD(T2n, T2s);
+					     T6d = VFMA(LDK(KP414213562), T64, T65);
+					     T66 = VFNMS(LDK(KP414213562), T65, T64);
+					     T2E = VSUB(T2y, T2D);
+					     T4c = VADD(T2y, T2D);
+					     T4u = VADD(T4s, T4t);
+					     T4y = VSUB(T4s, T4t);
+					}
+					T67 = VADD(T63, T66);
+					T7w = VSUB(T66, T63);
+					T6e = VADD(T6c, T6d);
+					T7t = VSUB(T6d, T6c);
+					T4d = VADD(T4b, T4c);
+					T4J = VSUB(T4c, T4b);
+					T2F = VADD(T2t, T2E);
+					T2S = VSUB(T2E, T2t);
+				   }
+				   {
+					V Ty, T1j, T4Q, T4K;
+					Ty = VFMA(LDK(KP707106781), Tx, Ta);
+					T3s = VFNMS(LDK(KP707106781), Tx, Ta);
+					T3E = VSUB(T1i, TV);
+					T1j = VADD(TV, T1i);
+					T39 = VFMA(LDK(KP707106781), T38, T37);
+					T3D = VFNMS(LDK(KP707106781), T38, T37);
+					T4Q = VFMA(LDK(KP414213562), T4I, T4J);
+					T4K = VFNMS(LDK(KP414213562), T4J, T4I);
+					{
+					     V T4w, T4e, T4P, T4Z;
+					     T4w = VADD(T4a, T4d);
+					     T4e = VSUB(T4a, T4d);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T1k = VFMA(LDK(KP923879532), T1j, Ty);
+					     T3k = VFNMS(LDK(KP923879532), T1j, Ty);
+					     {
+						  V T4L, T50, T4S, T4X;
+						  T4L = VADD(T4H, T4K);
+						  T50 = VSUB(T4K, T4H);
+						  T4S = VSUB(T4Q, T4R);
+						  T4X = VADD(T4R, T4Q);
+						  {
+						       V T4f, T4o, T4x, T4z;
+						       T4f = VADD(T47, T4e);
+						       T4o = VSUB(T4e, T47);
+						       T4x = VADD(T4v, T4w);
+						       T4z = VSUB(T4w, T4v);
+						       {
+							    V T53, T51, T4M, T4U;
+							    T53 = VFNMS(LDK(KP923879532), T50, T4Z);
+							    T51 = VFMA(LDK(KP923879532), T50, T4Z);
+							    T4M = VFNMS(LDK(KP923879532), T4L, T4E);
+							    T4U = VFMA(LDK(KP923879532), T4L, T4E);
+							    {
+								 V T52, T4Y, T4T, T4V;
+								 T52 = VFMA(LDK(KP923879532), T4X, T4W);
+								 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
+								 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
+								 T4V = VFMA(LDK(KP923879532), T4S, T4P);
+								 {
+								      V T4p, T4r, T4g, T4q;
+								      T4p = VFNMS(LDK(KP707106781), T4o, T4n);
+								      T4r = VFMA(LDK(KP707106781), T4o, T4n);
+								      T4g = VFNMS(LDK(KP707106781), T4f, T40);
+								      T4q = VFMA(LDK(KP707106781), T4f, T40);
+								      ST(&(x[WS(rs, 16)]), VFMAI(T4z, T4y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 48)]), VFNMSI(T4z, T4y), ms, &(x[0]));
+								      ST(&(x[0]), VADD(T4u, T4x), ms, &(x[0]));
+								      ST(&(x[WS(rs, 32)]), VSUB(T4u, T4x), ms, &(x[0]));
+								      ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
+								      ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
+								      T3t = VADD(T3b, T3a);
+								      T3c = VSUB(T3a, T3b);
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T1T = VFMA(LDK(KP707106781), T1S, T1v);
+					T3v = VFNMS(LDK(KP707106781), T1S, T1v);
+					T3w = VFNMS(LDK(KP707106781), T25, T24);
+					T26 = VFMA(LDK(KP707106781), T25, T24);
+					T2G = VFMA(LDK(KP707106781), T2F, T2i);
+					T3y = VFNMS(LDK(KP707106781), T2F, T2i);
+					T3z = VFNMS(LDK(KP707106781), T2S, T2R);
+					T2T = VFMA(LDK(KP707106781), T2S, T2R);
+				   }
+			      }
+			 }
+			 {
+			      V T3u, T3M, T3F, T3P, T3x, T3H, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
+			      {
+				   V T3d, T3n, T27, T3f, T2U, T3e;
+				   T3d = VFMA(LDK(KP923879532), T3c, T39);
+				   T3n = VFNMS(LDK(KP923879532), T3c, T39);
+				   T27 = VFNMS(LDK(KP198912367), T26, T1T);
+				   T3f = VFMA(LDK(KP198912367), T1T, T26);
+				   T2U = VFNMS(LDK(KP198912367), T2T, T2G);
+				   T3e = VFMA(LDK(KP198912367), T2G, T2T);
+				   T3u = VFMA(LDK(KP923879532), T3t, T3s);
+				   T3M = VFNMS(LDK(KP923879532), T3t, T3s);
+				   {
+					V T3g, T3l, T2V, T3o;
+					T3g = VSUB(T3e, T3f);
+					T3l = VADD(T3f, T3e);
+					T2V = VADD(T27, T2U);
+					T3o = VSUB(T2U, T27);
+					T3F = VFNMS(LDK(KP923879532), T3E, T3D);
+					T3P = VFMA(LDK(KP923879532), T3E, T3D);
+					T3x = VFMA(LDK(KP668178637), T3w, T3v);
+					T3H = VFNMS(LDK(KP668178637), T3v, T3w);
+					T3q = VFMA(LDK(KP980785280), T3l, T3k);
+					T3m = VFNMS(LDK(KP980785280), T3l, T3k);
+					T3h = VFNMS(LDK(KP980785280), T3g, T3d);
+					T3j = VFMA(LDK(KP980785280), T3g, T3d);
+					T3r = VFNMS(LDK(KP980785280), T3o, T3n);
+					T3p = VFMA(LDK(KP980785280), T3o, T3n);
+					T2W = VFNMS(LDK(KP980785280), T2V, T1k);
+					T3i = VFMA(LDK(KP980785280), T2V, T1k);
+				   }
+			      }
+			      {
+				   V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
+				   V T7x;
+				   {
+					V T7c, T7W, T7D, T87, T7f, T7F, T3A, T3G, T7E, T7i;
+					T7c = VFNMS(LDK(KP923879532), T7b, T7a);
+					T7W = VFMA(LDK(KP923879532), T7b, T7a);
+					T7D = VFNMS(LDK(KP923879532), T7C, T7B);
+					T87 = VFMA(LDK(KP923879532), T7C, T7B);
+					T7f = VFNMS(LDK(KP668178637), T7e, T7d);
+					T7F = VFMA(LDK(KP668178637), T7d, T7e);
+					ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
+					T3A = VFMA(LDK(KP668178637), T3z, T3y);
+					T3G = VFNMS(LDK(KP668178637), T3y, T3z);
+					T7E = VFMA(LDK(KP668178637), T7g, T7h);
+					T7i = VFNMS(LDK(KP668178637), T7h, T7g);
+					T7n = VFNMS(LDK(KP923879532), T7m, T7l);
+					T7Z = VFMA(LDK(KP923879532), T7m, T7l);
+					{
+					     V T3I, T3N, T3B, T3Q;
+					     T3I = VSUB(T3G, T3H);
+					     T3N = VADD(T3H, T3G);
+					     T3B = VADD(T3x, T3A);
+					     T3Q = VSUB(T3A, T3x);
+					     {
+						  V T7j, T88, T7G, T7X;
+						  T7j = VADD(T7f, T7i);
+						  T88 = VSUB(T7f, T7i);
+						  T7G = VSUB(T7E, T7F);
+						  T7X = VADD(T7F, T7E);
+						  {
+						       V T3S, T3O, T3J, T3L;
+						       T3S = VFNMS(LDK(KP831469612), T3N, T3M);
+						       T3O = VFMA(LDK(KP831469612), T3N, T3M);
+						       T3J = VFNMS(LDK(KP831469612), T3I, T3F);
+						       T3L = VFMA(LDK(KP831469612), T3I, T3F);
+						       {
+							    V T3T, T3R, T3C, T3K;
+							    T3T = VFMA(LDK(KP831469612), T3Q, T3P);
+							    T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
+							    T3C = VFNMS(LDK(KP831469612), T3B, T3u);
+							    T3K = VFMA(LDK(KP831469612), T3B, T3u);
+							    T8j = VFNMS(LDK(KP831469612), T88, T87);
+							    T89 = VFMA(LDK(KP831469612), T88, T87);
+							    T7k = VFNMS(LDK(KP831469612), T7j, T7c);
+							    T7O = VFMA(LDK(KP831469612), T7j, T7c);
+							    T8g = VFNMS(LDK(KP831469612), T7X, T7W);
+							    T7Y = VFMA(LDK(KP831469612), T7X, T7W);
+							    T7H = VFNMS(LDK(KP831469612), T7G, T7D);
+							    T7R = VFMA(LDK(KP831469612), T7G, T7D);
+							    ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
+							    ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
+							    T80 = VFNMS(LDK(KP923879532), T7p, T7o);
+							    T7q = VFMA(LDK(KP923879532), T7p, T7o);
+						       }
+						  }
+					     }
+					}
+					T7u = VFNMS(LDK(KP923879532), T7t, T7s);
+					T82 = VFMA(LDK(KP923879532), T7t, T7s);
+					T83 = VFNMS(LDK(KP923879532), T7w, T7v);
+					T7x = VFMA(LDK(KP923879532), T7w, T7v);
+				   }
+				   {
+					V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
+					T5g = VFMA(LDK(KP923879532), T5f, T58);
+					T6I = VFNMS(LDK(KP923879532), T5f, T58);
+					{
+					     V T7r, T7I, T7y, T7J;
+					     T7r = VFNMS(LDK(KP534511135), T7q, T7n);
+					     T7I = VFMA(LDK(KP534511135), T7n, T7q);
+					     T7y = VFNMS(LDK(KP534511135), T7x, T7u);
+					     T7J = VFMA(LDK(KP534511135), T7u, T7x);
+					     {
+						  V T81, T8a, T84, T8b;
+						  T81 = VFMA(LDK(KP303346683), T80, T7Z);
+						  T8a = VFNMS(LDK(KP303346683), T7Z, T80);
+						  T84 = VFMA(LDK(KP303346683), T83, T82);
+						  T8b = VFNMS(LDK(KP303346683), T82, T83);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6l);
+						  T6T = VFNMS(LDK(KP923879532), T6o, T6l);
+						  T5p = VFNMS(LDK(KP198912367), T5o, T5l);
+						  T6q = VFMA(LDK(KP198912367), T5l, T5o);
+						  {
+						       V T7K, T7P, T7z, T7S;
+						       T7K = VSUB(T7I, T7J);
+						       T7P = VADD(T7I, T7J);
+						       T7z = VADD(T7r, T7y);
+						       T7S = VSUB(T7y, T7r);
+						       {
+							    V T8c, T8h, T85, T8k;
+							    T8c = VSUB(T8a, T8b);
+							    T8h = VADD(T8a, T8b);
+							    T85 = VADD(T81, T84);
+							    T8k = VSUB(T84, T81);
+							    {
+								 V T7Q, T7U, T7L, T7N;
+								 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
+								 T7U = VFMA(LDK(KP881921264), T7P, T7O);
+								 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
+								 T7N = VFMA(LDK(KP881921264), T7K, T7H);
+								 {
+								      V T7T, T7V, T7A, T7M;
+								      T7T = VFNMS(LDK(KP881921264), T7S, T7R);
+								      T7V = VFMA(LDK(KP881921264), T7S, T7R);
+								      T7A = VFNMS(LDK(KP881921264), T7z, T7k);
+								      T7M = VFMA(LDK(KP881921264), T7z, T7k);
+								      {
+									   V T8i, T8m, T8d, T8f;
+									   T8i = VFMA(LDK(KP956940335), T8h, T8g);
+									   T8m = VFNMS(LDK(KP956940335), T8h, T8g);
+									   T8d = VFNMS(LDK(KP956940335), T8c, T89);
+									   T8f = VFMA(LDK(KP956940335), T8c, T89);
+									   {
+										V T8l, T8n, T86, T8e;
+										T8l = VFMA(LDK(KP956940335), T8k, T8j);
+										T8n = VFNMS(LDK(KP956940335), T8k, T8j);
+										T86 = VFNMS(LDK(KP956940335), T85, T7Y);
+										T8e = VFMA(LDK(KP956940335), T85, T7Y);
+										ST(&(x[WS(rs, 53)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 11)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 43)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 21)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 59)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 5)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 27)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 37)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 51)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 13)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 45)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 19)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 3)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 61)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 35)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 29)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										T6r = VFMA(LDK(KP198912367), T5u, T5x);
+										T5y = VFNMS(LDK(KP198912367), T5x, T5u);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V T5N, T5U, T68, T5z, T6U, T6f;
+					     T5N = VFMA(LDK(KP923879532), T5M, T5F);
+					     T6L = VFNMS(LDK(KP923879532), T5M, T5F);
+					     T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
+					     T5U = VFMA(LDK(KP923879532), T5T, T5Q);
+					     T68 = VFMA(LDK(KP923879532), T67, T60);
+					     T6O = VFNMS(LDK(KP923879532), T67, T60);
+					     T5z = VADD(T5p, T5y);
+					     T6U = VSUB(T5y, T5p);
+					     T6P = VFNMS(LDK(KP923879532), T6e, T6b);
+					     T6f = VFMA(LDK(KP923879532), T6e, T6b);
+					     {
+						  V T5V, T6u, T6g, T6v, T6s, T6J;
+						  T6s = VSUB(T6q, T6r);
+						  T6J = VADD(T6q, T6r);
+						  T5V = VFNMS(LDK(KP098491403), T5U, T5N);
+						  T6u = VFMA(LDK(KP098491403), T5N, T5U);
+						  T75 = VFNMS(LDK(KP980785280), T6U, T6T);
+						  T6V = VFMA(LDK(KP980785280), T6U, T6T);
+						  T5A = VFMA(LDK(KP980785280), T5z, T5g);
+						  T6A = VFNMS(LDK(KP980785280), T5z, T5g);
+						  T6g = VFNMS(LDK(KP098491403), T6f, T68);
+						  T6v = VFMA(LDK(KP098491403), T68, T6f);
+						  T72 = VFNMS(LDK(KP980785280), T6J, T6I);
+						  T6K = VFMA(LDK(KP980785280), T6J, T6I);
+						  T6t = VFMA(LDK(KP980785280), T6s, T6p);
+						  T6D = VFNMS(LDK(KP980785280), T6s, T6p);
+						  T6w = VSUB(T6u, T6v);
+						  T6B = VADD(T6u, T6v);
+						  T6h = VADD(T5V, T6g);
+						  T6E = VSUB(T6g, T5V);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
+		    T6W = VFNMS(LDK(KP820678790), T6L, T6M);
+		    T6N = VFMA(LDK(KP820678790), T6M, T6L);
+		    T6G = VFMA(LDK(KP995184726), T6B, T6A);
+		    T6C = VFNMS(LDK(KP995184726), T6B, T6A);
+		    T6z = VFMA(LDK(KP995184726), T6w, T6t);
+		    T6x = VFNMS(LDK(KP995184726), T6w, T6t);
+		    T6H = VFMA(LDK(KP995184726), T6E, T6D);
+		    T6F = VFNMS(LDK(KP995184726), T6E, T6D);
+		    T6y = VFMA(LDK(KP995184726), T6h, T5A);
+		    T6i = VFNMS(LDK(KP995184726), T6h, T5A);
+		    T6X = VFNMS(LDK(KP820678790), T6O, T6P);
+		    T6Q = VFMA(LDK(KP820678790), T6P, T6O);
+		    {
+			 V T73, T6Y, T76, T6R;
+			 ST(&(x[WS(rs, 49)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 63)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 33)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 T73 = VADD(T6W, T6X);
+			 T6Y = VSUB(T6W, T6X);
+			 T76 = VSUB(T6Q, T6N);
+			 T6R = VADD(T6N, T6Q);
+			 {
+			      V T78, T74, T71, T6Z, T79, T77, T70, T6S;
+			      T78 = VFNMS(LDK(KP773010453), T73, T72);
+			      T74 = VFMA(LDK(KP773010453), T73, T72);
+			      T71 = VFMA(LDK(KP773010453), T6Y, T6V);
+			      T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
+			      T79 = VFNMS(LDK(KP773010453), T76, T75);
+			      T77 = VFMA(LDK(KP773010453), T76, T75);
+			      T70 = VFMA(LDK(KP773010453), T6R, T6K);
+			      T6S = VFNMS(LDK(KP773010453), T6R, T6K);
+			      ST(&(x[WS(rs, 55)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 41)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 23)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 57)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 39)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 25)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t1fv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_64) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1fv_64 -include t1f.h */
+
+/*
+ * This function contains 519 FP additions, 250 FP multiplications,
+ * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
+ * 107 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tg, T4a, T6r, T7f, T3o, T4B, T5q, T7e, T5R, T62, T28, T4o, T2g, T4l, T7n;
+	       V T7Z, T68, T6j, T2C, T4s, T3a, T4v, T7u, T82, T7E, T7F, T7V, T5F, T6u, T1k;
+	       V T4e, T1r, T4d, T7B, T7C, T7W, T5M, T6v, TV, T4g, T12, T4h, T7h, T7i, TD;
+	       V T4C, T3h, T4b, T5x, T6s, T1R, T4m, T7q, T80, T2j, T4p, T5Y, T63, T2Z, T4w;
+	       V T7x, T83, T33, T4t, T6f, T6k;
+	       {
+		    V T1, T3, T3m, T3k, Tb, Td, Te, T6, T8, T9, T2, T3l, T3j;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 62]), T2);
+		    T3l = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+		    T3m = BYTWJ(&(W[TWVL * 94]), T3l);
+		    T3j = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3k = BYTWJ(&(W[TWVL * 30]), T3j);
+		    {
+			 V Ta, Tc, T5, T7;
+			 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 110]), Ta);
+			 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 46]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 14]), T5);
+			 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 78]), T7);
+			 T9 = VSUB(T6, T8);
+		    }
+		    {
+			 V T4, Tf, T6p, T6q;
+			 T4 = VSUB(T1, T3);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VADD(T4, Tf);
+			 T4a = VSUB(T4, Tf);
+			 T6p = VADD(Tb, Td);
+			 T6q = VADD(T6, T8);
+			 T6r = VSUB(T6p, T6q);
+			 T7f = VADD(T6q, T6p);
+		    }
+		    {
+			 V T3i, T3n, T5o, T5p;
+			 T3i = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 T3n = VSUB(T3k, T3m);
+			 T3o = VSUB(T3i, T3n);
+			 T4B = VADD(T3n, T3i);
+			 T5o = VADD(T1, T3);
+			 T5p = VADD(T3k, T3m);
+			 T5q = VSUB(T5o, T5p);
+			 T7e = VADD(T5o, T5p);
+		    }
+	       }
+	       {
+		    V T24, T26, T5Q, T2b, T2d, T5P, T1W, T60, T21, T61, T22, T27;
+		    {
+			 V T23, T25, T2a, T2c;
+			 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T24 = BYTWJ(&(W[TWVL * 32]), T23);
+			 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+			 T26 = BYTWJ(&(W[TWVL * 96]), T25);
+			 T5Q = VADD(T24, T26);
+			 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2b = BYTWJ(&(W[0]), T2a);
+			 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+			 T2d = BYTWJ(&(W[TWVL * 64]), T2c);
+			 T5P = VADD(T2b, T2d);
+		    }
+		    {
+			 V T1T, T1V, T1S, T1U;
+			 T1S = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+			 T1T = BYTWJ(&(W[TWVL * 112]), T1S);
+			 T1U = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T1V = BYTWJ(&(W[TWVL * 48]), T1U);
+			 T1W = VSUB(T1T, T1V);
+			 T60 = VADD(T1T, T1V);
+		    }
+		    {
+			 V T1Y, T20, T1X, T1Z;
+			 T1X = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T1Y = BYTWJ(&(W[TWVL * 16]), T1X);
+			 T1Z = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+			 T20 = BYTWJ(&(W[TWVL * 80]), T1Z);
+			 T21 = VSUB(T1Y, T20);
+			 T61 = VADD(T1Y, T20);
+		    }
+		    T5R = VSUB(T5P, T5Q);
+		    T62 = VSUB(T60, T61);
+		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
+		    T27 = VSUB(T24, T26);
+		    T28 = VSUB(T22, T27);
+		    T4o = VADD(T27, T22);
+		    {
+			 V T2e, T2f, T7l, T7m;
+			 T2e = VSUB(T2b, T2d);
+			 T2f = VMUL(LDK(KP707106781), VADD(T21, T1W));
+			 T2g = VADD(T2e, T2f);
+			 T4l = VSUB(T2e, T2f);
+			 T7l = VADD(T5P, T5Q);
+			 T7m = VADD(T61, T60);
+			 T7n = VADD(T7l, T7m);
+			 T7Z = VSUB(T7l, T7m);
+		    }
+	       }
+	       {
+		    V T2n, T2p, T66, T36, T38, T67, T2v, T6i, T2A, T6h, T2q, T2B;
+		    {
+			 V T2m, T2o, T35, T37;
+			 T2m = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+			 T2n = BYTWJ(&(W[TWVL * 124]), T2m);
+			 T2o = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T2p = BYTWJ(&(W[TWVL * 60]), T2o);
+			 T66 = VADD(T2n, T2p);
+			 T35 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T36 = BYTWJ(&(W[TWVL * 28]), T35);
+			 T37 = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+			 T38 = BYTWJ(&(W[TWVL * 92]), T37);
+			 T67 = VADD(T36, T38);
+		    }
+		    {
+			 V T2s, T2u, T2r, T2t;
+			 T2r = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T2s = BYTWJ(&(W[TWVL * 12]), T2r);
+			 T2t = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+			 T2u = BYTWJ(&(W[TWVL * 76]), T2t);
+			 T2v = VSUB(T2s, T2u);
+			 T6i = VADD(T2s, T2u);
+		    }
+		    {
+			 V T2x, T2z, T2w, T2y;
+			 T2w = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+			 T2x = BYTWJ(&(W[TWVL * 108]), T2w);
+			 T2y = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T2z = BYTWJ(&(W[TWVL * 44]), T2y);
+			 T2A = VSUB(T2x, T2z);
+			 T6h = VADD(T2x, T2z);
+		    }
+		    T68 = VSUB(T66, T67);
+		    T6j = VSUB(T6h, T6i);
+		    T2q = VSUB(T2n, T2p);
+		    T2B = VMUL(LDK(KP707106781), VADD(T2v, T2A));
+		    T2C = VADD(T2q, T2B);
+		    T4s = VSUB(T2q, T2B);
+		    {
+			 V T34, T39, T7s, T7t;
+			 T34 = VMUL(LDK(KP707106781), VSUB(T2A, T2v));
+			 T39 = VSUB(T36, T38);
+			 T3a = VSUB(T34, T39);
+			 T4v = VADD(T39, T34);
+			 T7s = VADD(T66, T67);
+			 T7t = VADD(T6i, T6h);
+			 T7u = VADD(T7s, T7t);
+			 T82 = VSUB(T7s, T7t);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T5A, T1m, T1o, T5z, T18, T5C, T1d, T5D, T5B, T5E;
+		    {
+			 V T1f, T1h, T1l, T1n;
+			 T1f = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1g = BYTWJ(&(W[TWVL * 34]), T1f);
+			 T1h = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+			 T1i = BYTWJ(&(W[TWVL * 98]), T1h);
+			 T5A = VADD(T1g, T1i);
+			 T1l = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T1m = BYTWJ(&(W[TWVL * 2]), T1l);
+			 T1n = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+			 T1o = BYTWJ(&(W[TWVL * 66]), T1n);
+			 T5z = VADD(T1m, T1o);
+		    }
+		    {
+			 V T15, T17, T14, T16;
+			 T14 = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+			 T15 = BYTWJ(&(W[TWVL * 114]), T14);
+			 T16 = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 T17 = BYTWJ(&(W[TWVL * 50]), T16);
+			 T18 = VSUB(T15, T17);
+			 T5C = VADD(T15, T17);
+		    }
+		    {
+			 V T1a, T1c, T19, T1b;
+			 T19 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T1a = BYTWJ(&(W[TWVL * 18]), T19);
+			 T1b = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+			 T1c = BYTWJ(&(W[TWVL * 82]), T1b);
+			 T1d = VSUB(T1a, T1c);
+			 T5D = VADD(T1a, T1c);
+		    }
+		    T7E = VADD(T5z, T5A);
+		    T7F = VADD(T5D, T5C);
+		    T7V = VSUB(T7E, T7F);
+		    T5B = VSUB(T5z, T5A);
+		    T5E = VSUB(T5C, T5D);
+		    T5F = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5E));
+		    T6u = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
+		    {
+			 V T1e, T1j, T1p, T1q;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T4e = VADD(T1j, T1e);
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T1d, T18));
+			 T1r = VADD(T1p, T1q);
+			 T4d = VSUB(T1p, T1q);
+		    }
+	       }
+	       {
+		    V TG, TI, T5G, TY, T10, T5H, TO, T5K, TT, T5J, T5I, T5L;
+		    {
+			 V TF, TH, TX, TZ;
+			 TF = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+			 TG = BYTWJ(&(W[TWVL * 122]), TF);
+			 TH = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 TI = BYTWJ(&(W[TWVL * 58]), TH);
+			 T5G = VADD(TG, TI);
+			 TX = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TY = BYTWJ(&(W[TWVL * 26]), TX);
+			 TZ = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+			 T10 = BYTWJ(&(W[TWVL * 90]), TZ);
+			 T5H = VADD(TY, T10);
+		    }
+		    {
+			 V TL, TN, TK, TM;
+			 TK = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 TL = BYTWJ(&(W[TWVL * 10]), TK);
+			 TM = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+			 TN = BYTWJ(&(W[TWVL * 74]), TM);
+			 TO = VSUB(TL, TN);
+			 T5K = VADD(TL, TN);
+		    }
+		    {
+			 V TQ, TS, TP, TR;
+			 TP = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+			 TQ = BYTWJ(&(W[TWVL * 106]), TP);
+			 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TS = BYTWJ(&(W[TWVL * 42]), TR);
+			 TT = VSUB(TQ, TS);
+			 T5J = VADD(TQ, TS);
+		    }
+		    T7B = VADD(T5G, T5H);
+		    T7C = VADD(T5K, T5J);
+		    T7W = VSUB(T7B, T7C);
+		    T5I = VSUB(T5G, T5H);
+		    T5L = VSUB(T5J, T5K);
+		    T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
+		    T6v = VFMA(LDK(KP382683432), T5I, VMUL(LDK(KP923879532), T5L));
+		    {
+			 V TJ, TU, TW, T11;
+			 TJ = VSUB(TG, TI);
+			 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
+			 TV = VADD(TJ, TU);
+			 T4g = VSUB(TJ, TU);
+			 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
+			 T11 = VSUB(TY, T10);
+			 T12 = VSUB(TW, T11);
+			 T4h = VADD(T11, TW);
+		    }
+	       }
+	       {
+		    V Tl, T5r, TB, T5v, Tq, T5s, Tw, T5u, Tr, TC;
+		    {
+			 V Ti, Tk, Th, Tj;
+			 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 6]), Th);
+			 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+			 Tk = BYTWJ(&(W[TWVL * 70]), Tj);
+			 Tl = VSUB(Ti, Tk);
+			 T5r = VADD(Ti, Tk);
+		    }
+		    {
+			 V Ty, TA, Tx, Tz;
+			 Tx = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Ty = BYTWJ(&(W[TWVL * 22]), Tx);
+			 Tz = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+			 TA = BYTWJ(&(W[TWVL * 86]), Tz);
+			 TB = VSUB(Ty, TA);
+			 T5v = VADD(Ty, TA);
+		    }
+		    {
+			 V Tn, Tp, Tm, To;
+			 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 Tn = BYTWJ(&(W[TWVL * 38]), Tm);
+			 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 102]), To);
+			 Tq = VSUB(Tn, Tp);
+			 T5s = VADD(Tn, Tp);
+		    }
+		    {
+			 V Tt, Tv, Ts, Tu;
+			 Ts = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+			 Tt = BYTWJ(&(W[TWVL * 118]), Ts);
+			 Tu = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tv = BYTWJ(&(W[TWVL * 54]), Tu);
+			 Tw = VSUB(Tt, Tv);
+			 T5u = VADD(Tt, Tv);
+		    }
+		    T7h = VADD(T5r, T5s);
+		    T7i = VADD(T5u, T5v);
+		    Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+		    TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
+		    TD = VADD(Tr, TC);
+		    T4C = VSUB(TC, Tr);
+		    {
+			 V T3f, T3g, T5t, T5w;
+			 T3f = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T3g = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T3h = VSUB(T3f, T3g);
+			 T4b = VADD(T3g, T3f);
+			 T5t = VSUB(T5r, T5s);
+			 T5w = VSUB(T5u, T5v);
+			 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
+			 T6s = VMUL(LDK(KP707106781), VSUB(T5w, T5t));
+		    }
+	       }
+	       {
+		    V T1z, T5V, T1P, T5T, T1E, T5W, T1K, T5S;
+		    {
+			 V T1w, T1y, T1v, T1x;
+			 T1v = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+			 T1w = BYTWJ(&(W[TWVL * 120]), T1v);
+			 T1x = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			 T1y = BYTWJ(&(W[TWVL * 56]), T1x);
+			 T1z = VSUB(T1w, T1y);
+			 T5V = VADD(T1w, T1y);
+		    }
+		    {
+			 V T1M, T1O, T1L, T1N;
+			 T1L = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 T1M = BYTWJ(&(W[TWVL * 40]), T1L);
+			 T1N = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+			 T1O = BYTWJ(&(W[TWVL * 104]), T1N);
+			 T1P = VSUB(T1M, T1O);
+			 T5T = VADD(T1M, T1O);
+		    }
+		    {
+			 V T1B, T1D, T1A, T1C;
+			 T1A = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1B = BYTWJ(&(W[TWVL * 24]), T1A);
+			 T1C = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+			 T1D = BYTWJ(&(W[TWVL * 88]), T1C);
+			 T1E = VSUB(T1B, T1D);
+			 T5W = VADD(T1B, T1D);
+		    }
+		    {
+			 V T1H, T1J, T1G, T1I;
+			 T1G = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1H = BYTWJ(&(W[TWVL * 8]), T1G);
+			 T1I = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+			 T1J = BYTWJ(&(W[TWVL * 72]), T1I);
+			 T1K = VSUB(T1H, T1J);
+			 T5S = VADD(T1H, T1J);
+		    }
+		    {
+			 V T1F, T1Q, T7o, T7p;
+			 T1F = VFNMS(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1z));
+			 T1Q = VFMA(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
+			 T1R = VSUB(T1F, T1Q);
+			 T4m = VADD(T1Q, T1F);
+			 T7o = VADD(T5S, T5T);
+			 T7p = VADD(T5V, T5W);
+			 T7q = VADD(T7o, T7p);
+			 T80 = VSUB(T7p, T7o);
+		    }
+		    {
+			 V T2h, T2i, T5U, T5X;
+			 T2h = VFNMS(LDK(KP382683432), T1P, VMUL(LDK(KP923879532), T1K));
+			 T2i = VFMA(LDK(KP923879532), T1z, VMUL(LDK(KP382683432), T1E));
+			 T2j = VADD(T2h, T2i);
+			 T4p = VSUB(T2i, T2h);
+			 T5U = VSUB(T5S, T5T);
+			 T5X = VSUB(T5V, T5W);
+			 T5Y = VMUL(LDK(KP707106781), VADD(T5U, T5X));
+			 T63 = VMUL(LDK(KP707106781), VSUB(T5X, T5U));
+		    }
+	       }
+	       {
+		    V T2H, T69, T2X, T6d, T2M, T6a, T2S, T6c;
+		    {
+			 V T2E, T2G, T2D, T2F;
+			 T2D = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T2E = BYTWJ(&(W[TWVL * 4]), T2D);
+			 T2F = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+			 T2G = BYTWJ(&(W[TWVL * 68]), T2F);
+			 T2H = VSUB(T2E, T2G);
+			 T69 = VADD(T2E, T2G);
+		    }
+		    {
+			 V T2U, T2W, T2T, T2V;
+			 T2T = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T2U = BYTWJ(&(W[TWVL * 20]), T2T);
+			 T2V = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+			 T2W = BYTWJ(&(W[TWVL * 84]), T2V);
+			 T2X = VSUB(T2U, T2W);
+			 T6d = VADD(T2U, T2W);
+		    }
+		    {
+			 V T2J, T2L, T2I, T2K;
+			 T2I = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T2J = BYTWJ(&(W[TWVL * 36]), T2I);
+			 T2K = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+			 T2L = BYTWJ(&(W[TWVL * 100]), T2K);
+			 T2M = VSUB(T2J, T2L);
+			 T6a = VADD(T2J, T2L);
+		    }
+		    {
+			 V T2P, T2R, T2O, T2Q;
+			 T2O = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+			 T2P = BYTWJ(&(W[TWVL * 116]), T2O);
+			 T2Q = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			 T2R = BYTWJ(&(W[TWVL * 52]), T2Q);
+			 T2S = VSUB(T2P, T2R);
+			 T6c = VADD(T2P, T2R);
+		    }
+		    {
+			 V T2N, T2Y, T7v, T7w;
+			 T2N = VFNMS(LDK(KP382683432), T2M, VMUL(LDK(KP923879532), T2H));
+			 T2Y = VFMA(LDK(KP923879532), T2S, VMUL(LDK(KP382683432), T2X));
+			 T2Z = VADD(T2N, T2Y);
+			 T4w = VSUB(T2Y, T2N);
+			 T7v = VADD(T69, T6a);
+			 T7w = VADD(T6c, T6d);
+			 T7x = VADD(T7v, T7w);
+			 T83 = VSUB(T7w, T7v);
+		    }
+		    {
+			 V T31, T32, T6b, T6e;
+			 T31 = VFNMS(LDK(KP923879532), T2X, VMUL(LDK(KP382683432), T2S));
+			 T32 = VFMA(LDK(KP382683432), T2H, VMUL(LDK(KP923879532), T2M));
+			 T33 = VSUB(T31, T32);
+			 T4t = VADD(T32, T31);
+			 T6b = VSUB(T69, T6a);
+			 T6e = VSUB(T6c, T6d);
+			 T6f = VMUL(LDK(KP707106781), VADD(T6b, T6e));
+			 T6k = VMUL(LDK(KP707106781), VSUB(T6e, T6b));
+		    }
+	       }
+	       {
+		    V T7k, T7M, T7R, T7T, T7z, T7I, T7H, T7N, T7O, T7S;
+		    {
+			 V T7g, T7j, T7P, T7Q;
+			 T7g = VADD(T7e, T7f);
+			 T7j = VADD(T7h, T7i);
+			 T7k = VSUB(T7g, T7j);
+			 T7M = VADD(T7g, T7j);
+			 T7P = VADD(T7n, T7q);
+			 T7Q = VADD(T7u, T7x);
+			 T7R = VADD(T7P, T7Q);
+			 T7T = VBYI(VSUB(T7Q, T7P));
+		    }
+		    {
+			 V T7r, T7y, T7D, T7G;
+			 T7r = VSUB(T7n, T7q);
+			 T7y = VSUB(T7u, T7x);
+			 T7z = VMUL(LDK(KP707106781), VADD(T7r, T7y));
+			 T7I = VMUL(LDK(KP707106781), VSUB(T7y, T7r));
+			 T7D = VADD(T7B, T7C);
+			 T7G = VADD(T7E, T7F);
+			 T7H = VSUB(T7D, T7G);
+			 T7N = VADD(T7G, T7D);
+		    }
+		    T7O = VADD(T7M, T7N);
+		    ST(&(x[WS(rs, 32)]), VSUB(T7O, T7R), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T7O, T7R), ms, &(x[0]));
+		    T7S = VSUB(T7M, T7N);
+		    ST(&(x[WS(rs, 48)]), VSUB(T7S, T7T), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T7S, T7T), ms, &(x[0]));
+		    {
+			 V T7A, T7J, T7K, T7L;
+			 T7A = VADD(T7k, T7z);
+			 T7J = VBYI(VADD(T7H, T7I));
+			 ST(&(x[WS(rs, 56)]), VSUB(T7A, T7J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VADD(T7A, T7J), ms, &(x[0]));
+			 T7K = VSUB(T7k, T7z);
+			 T7L = VBYI(VSUB(T7I, T7H));
+			 ST(&(x[WS(rs, 40)]), VSUB(T7K, T7L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 24)]), VADD(T7K, T7L), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T7Y, T8j, T8c, T8k, T85, T8g, T89, T8h;
+		    {
+			 V T7U, T7X, T8a, T8b;
+			 T7U = VSUB(T7e, T7f);
+			 T7X = VMUL(LDK(KP707106781), VADD(T7V, T7W));
+			 T7Y = VADD(T7U, T7X);
+			 T8j = VSUB(T7U, T7X);
+			 T8a = VFNMS(LDK(KP382683432), T7Z, VMUL(LDK(KP923879532), T80));
+			 T8b = VFMA(LDK(KP382683432), T82, VMUL(LDK(KP923879532), T83));
+			 T8c = VADD(T8a, T8b);
+			 T8k = VSUB(T8b, T8a);
+		    }
+		    {
+			 V T81, T84, T87, T88;
+			 T81 = VFMA(LDK(KP923879532), T7Z, VMUL(LDK(KP382683432), T80));
+			 T84 = VFNMS(LDK(KP382683432), T83, VMUL(LDK(KP923879532), T82));
+			 T85 = VADD(T81, T84);
+			 T8g = VSUB(T84, T81);
+			 T87 = VSUB(T7i, T7h);
+			 T88 = VMUL(LDK(KP707106781), VSUB(T7W, T7V));
+			 T89 = VADD(T87, T88);
+			 T8h = VSUB(T88, T87);
+		    }
+		    {
+			 V T86, T8d, T8m, T8n;
+			 T86 = VADD(T7Y, T85);
+			 T8d = VBYI(VADD(T89, T8c));
+			 ST(&(x[WS(rs, 60)]), VSUB(T86, T8d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T86, T8d), ms, &(x[0]));
+			 T8m = VBYI(VADD(T8h, T8g));
+			 T8n = VADD(T8j, T8k);
+			 ST(&(x[WS(rs, 12)]), VADD(T8m, T8n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 52)]), VSUB(T8n, T8m), ms, &(x[0]));
+		    }
+		    {
+			 V T8e, T8f, T8i, T8l;
+			 T8e = VSUB(T7Y, T85);
+			 T8f = VBYI(VSUB(T8c, T89));
+			 ST(&(x[WS(rs, 36)]), VSUB(T8e, T8f), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VADD(T8e, T8f), ms, &(x[0]));
+			 T8i = VBYI(VSUB(T8g, T8h));
+			 T8l = VSUB(T8j, T8k);
+			 ST(&(x[WS(rs, 20)]), VADD(T8i, T8l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 44)]), VSUB(T8l, T8i), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
+		    {
+			 V T5y, T5N, T6t, T6w;
+			 T5y = VADD(T5q, T5x);
+			 T5N = VADD(T5F, T5M);
+			 T5O = VADD(T5y, T5N);
+			 T6H = VSUB(T5y, T5N);
+			 T6t = VADD(T6r, T6s);
+			 T6w = VADD(T6u, T6v);
+			 T6x = VADD(T6t, T6w);
+			 T6F = VSUB(T6w, T6t);
+			 {
+			      V T65, T6y, T6m, T6z;
+			      {
+				   V T5Z, T64, T6g, T6l;
+				   T5Z = VADD(T5R, T5Y);
+				   T64 = VADD(T62, T63);
+				   T65 = VFMA(LDK(KP980785280), T5Z, VMUL(LDK(KP195090322), T64));
+				   T6y = VFNMS(LDK(KP195090322), T5Z, VMUL(LDK(KP980785280), T64));
+				   T6g = VADD(T68, T6f);
+				   T6l = VADD(T6j, T6k);
+				   T6m = VFNMS(LDK(KP195090322), T6l, VMUL(LDK(KP980785280), T6g));
+				   T6z = VFMA(LDK(KP195090322), T6g, VMUL(LDK(KP980785280), T6l));
+			      }
+			      T6n = VADD(T65, T6m);
+			      T6I = VSUB(T6z, T6y);
+			      T6A = VADD(T6y, T6z);
+			      T6E = VSUB(T6m, T65);
+			 }
+		    }
+		    {
+			 V T6o, T6B, T6K, T6L;
+			 T6o = VADD(T5O, T6n);
+			 T6B = VBYI(VADD(T6x, T6A));
+			 ST(&(x[WS(rs, 62)]), VSUB(T6o, T6B), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T6o, T6B), ms, &(x[0]));
+			 T6K = VBYI(VADD(T6F, T6E));
+			 T6L = VADD(T6H, T6I);
+			 ST(&(x[WS(rs, 14)]), VADD(T6K, T6L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 50)]), VSUB(T6L, T6K), ms, &(x[0]));
+		    }
+		    {
+			 V T6C, T6D, T6G, T6J;
+			 T6C = VSUB(T5O, T6n);
+			 T6D = VBYI(VSUB(T6A, T6x));
+			 ST(&(x[WS(rs, 34)]), VSUB(T6C, T6D), ms, &(x[0]));
+			 ST(&(x[WS(rs, 30)]), VADD(T6C, T6D), ms, &(x[0]));
+			 T6G = VBYI(VSUB(T6E, T6F));
+			 T6J = VSUB(T6H, T6I);
+			 ST(&(x[WS(rs, 18)]), VADD(T6G, T6J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 46)]), VSUB(T6J, T6G), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
+		    {
+			 V T6M, T6N, T6X, T6Y;
+			 T6M = VSUB(T5q, T5x);
+			 T6N = VSUB(T6v, T6u);
+			 T6O = VADD(T6M, T6N);
+			 T79 = VSUB(T6M, T6N);
+			 T6X = VSUB(T6s, T6r);
+			 T6Y = VSUB(T5M, T5F);
+			 T6Z = VADD(T6X, T6Y);
+			 T77 = VSUB(T6Y, T6X);
+			 {
+			      V T6R, T70, T6U, T71;
+			      {
+				   V T6P, T6Q, T6S, T6T;
+				   T6P = VSUB(T5R, T5Y);
+				   T6Q = VSUB(T63, T62);
+				   T6R = VFMA(LDK(KP831469612), T6P, VMUL(LDK(KP555570233), T6Q));
+				   T70 = VFNMS(LDK(KP555570233), T6P, VMUL(LDK(KP831469612), T6Q));
+				   T6S = VSUB(T68, T6f);
+				   T6T = VSUB(T6k, T6j);
+				   T6U = VFNMS(LDK(KP555570233), T6T, VMUL(LDK(KP831469612), T6S));
+				   T71 = VFMA(LDK(KP555570233), T6S, VMUL(LDK(KP831469612), T6T));
+			      }
+			      T6V = VADD(T6R, T6U);
+			      T7a = VSUB(T71, T70);
+			      T72 = VADD(T70, T71);
+			      T76 = VSUB(T6U, T6R);
+			 }
+		    }
+		    {
+			 V T6W, T73, T7c, T7d;
+			 T6W = VADD(T6O, T6V);
+			 T73 = VBYI(VADD(T6Z, T72));
+			 ST(&(x[WS(rs, 58)]), VSUB(T6W, T73), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(T6W, T73), ms, &(x[0]));
+			 T7c = VBYI(VADD(T77, T76));
+			 T7d = VADD(T79, T7a);
+			 ST(&(x[WS(rs, 10)]), VADD(T7c, T7d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 54)]), VSUB(T7d, T7c), ms, &(x[0]));
+		    }
+		    {
+			 V T74, T75, T78, T7b;
+			 T74 = VSUB(T6O, T6V);
+			 T75 = VBYI(VSUB(T72, T6Z));
+			 ST(&(x[WS(rs, 38)]), VSUB(T74, T75), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VADD(T74, T75), ms, &(x[0]));
+			 T78 = VBYI(VSUB(T76, T77));
+			 T7b = VSUB(T79, T7a);
+			 ST(&(x[WS(rs, 22)]), VADD(T78, T7b), ms, &(x[0]));
+			 ST(&(x[WS(rs, 42)]), VSUB(T7b, T78), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T4k, T5h, T4R, T59, T4H, T5j, T4P, T4Y, T4z, T4S, T4K, T4O, T55, T5k, T5c;
+		    V T5g;
+		    {
+			 V T4c, T57, T4j, T58, T4f, T4i;
+			 T4c = VADD(T4a, T4b);
+			 T57 = VSUB(T4C, T4B);
+			 T4f = VFMA(LDK(KP831469612), T4d, VMUL(LDK(KP555570233), T4e));
+			 T4i = VFNMS(LDK(KP555570233), T4h, VMUL(LDK(KP831469612), T4g));
+			 T4j = VADD(T4f, T4i);
+			 T58 = VSUB(T4i, T4f);
+			 T4k = VADD(T4c, T4j);
+			 T5h = VSUB(T58, T57);
+			 T4R = VSUB(T4c, T4j);
+			 T59 = VADD(T57, T58);
+		    }
+		    {
+			 V T4D, T4W, T4G, T4X, T4E, T4F;
+			 T4D = VADD(T4B, T4C);
+			 T4W = VSUB(T4a, T4b);
+			 T4E = VFNMS(LDK(KP555570233), T4d, VMUL(LDK(KP831469612), T4e));
+			 T4F = VFMA(LDK(KP555570233), T4g, VMUL(LDK(KP831469612), T4h));
+			 T4G = VADD(T4E, T4F);
+			 T4X = VSUB(T4F, T4E);
+			 T4H = VADD(T4D, T4G);
+			 T5j = VSUB(T4W, T4X);
+			 T4P = VSUB(T4G, T4D);
+			 T4Y = VADD(T4W, T4X);
+		    }
+		    {
+			 V T4r, T4I, T4y, T4J;
+			 {
+			      V T4n, T4q, T4u, T4x;
+			      T4n = VADD(T4l, T4m);
+			      T4q = VADD(T4o, T4p);
+			      T4r = VFMA(LDK(KP956940335), T4n, VMUL(LDK(KP290284677), T4q));
+			      T4I = VFNMS(LDK(KP290284677), T4n, VMUL(LDK(KP956940335), T4q));
+			      T4u = VADD(T4s, T4t);
+			      T4x = VADD(T4v, T4w);
+			      T4y = VFNMS(LDK(KP290284677), T4x, VMUL(LDK(KP956940335), T4u));
+			      T4J = VFMA(LDK(KP290284677), T4u, VMUL(LDK(KP956940335), T4x));
+			 }
+			 T4z = VADD(T4r, T4y);
+			 T4S = VSUB(T4J, T4I);
+			 T4K = VADD(T4I, T4J);
+			 T4O = VSUB(T4y, T4r);
+		    }
+		    {
+			 V T51, T5a, T54, T5b;
+			 {
+			      V T4Z, T50, T52, T53;
+			      T4Z = VSUB(T4l, T4m);
+			      T50 = VSUB(T4p, T4o);
+			      T51 = VFMA(LDK(KP881921264), T4Z, VMUL(LDK(KP471396736), T50));
+			      T5a = VFNMS(LDK(KP471396736), T4Z, VMUL(LDK(KP881921264), T50));
+			      T52 = VSUB(T4s, T4t);
+			      T53 = VSUB(T4w, T4v);
+			      T54 = VFNMS(LDK(KP471396736), T53, VMUL(LDK(KP881921264), T52));
+			      T5b = VFMA(LDK(KP471396736), T52, VMUL(LDK(KP881921264), T53));
+			 }
+			 T55 = VADD(T51, T54);
+			 T5k = VSUB(T5b, T5a);
+			 T5c = VADD(T5a, T5b);
+			 T5g = VSUB(T54, T51);
+		    }
+		    {
+			 V T4A, T4L, T5i, T5l;
+			 T4A = VADD(T4k, T4z);
+			 T4L = VBYI(VADD(T4H, T4K));
+			 ST(&(x[WS(rs, 61)]), VSUB(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 T5i = VBYI(VSUB(T5g, T5h));
+			 T5l = VSUB(T5j, T5k);
+			 ST(&(x[WS(rs, 21)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 43)]), VSUB(T5l, T5i), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5m, T5n, T4M, T4N;
+			 T5m = VBYI(VADD(T5h, T5g));
+			 T5n = VADD(T5j, T5k);
+			 ST(&(x[WS(rs, 11)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 53)]), VSUB(T5n, T5m), ms, &(x[WS(rs, 1)]));
+			 T4M = VSUB(T4k, T4z);
+			 T4N = VBYI(VSUB(T4K, T4H));
+			 ST(&(x[WS(rs, 35)]), VSUB(T4M, T4N), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T4Q, T4T, T56, T5d;
+			 T4Q = VBYI(VSUB(T4O, T4P));
+			 T4T = VSUB(T4R, T4S);
+			 ST(&(x[WS(rs, 19)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 45)]), VSUB(T4T, T4Q), ms, &(x[WS(rs, 1)]));
+			 T56 = VADD(T4Y, T55);
+			 T5d = VBYI(VADD(T59, T5c));
+			 ST(&(x[WS(rs, 59)]), VSUB(T56, T5d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5e, T5f, T4U, T4V;
+			 T5e = VSUB(T4Y, T55);
+			 T5f = VBYI(VSUB(T5c, T59));
+			 ST(&(x[WS(rs, 37)]), VSUB(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 T4U = VBYI(VADD(T4P, T4O));
+			 T4V = VADD(T4R, T4S);
+			 ST(&(x[WS(rs, 13)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 51)]), VSUB(T4V, T4U), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
+		    V T42;
+		    {
+			 V TE, T3T, T1t, T3U, T13, T1s;
+			 TE = VSUB(Tg, TD);
+			 T3T = VADD(T3o, T3h);
+			 T13 = VFMA(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
+			 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T3U = VADD(T1s, T13);
+			 T1u = VADD(TE, T1t);
+			 T43 = VSUB(T3U, T3T);
+			 T3D = VSUB(TE, T1t);
+			 T3V = VADD(T3T, T3U);
+		    }
+		    {
+			 V T3p, T3I, T3s, T3J, T3q, T3r;
+			 T3p = VSUB(T3h, T3o);
+			 T3I = VADD(Tg, TD);
+			 T3q = VFNMS(LDK(KP195090322), T12, VMUL(LDK(KP980785280), TV));
+			 T3r = VFMA(LDK(KP980785280), T1r, VMUL(LDK(KP195090322), T1k));
+			 T3s = VSUB(T3q, T3r);
+			 T3J = VADD(T3r, T3q);
+			 T3t = VADD(T3p, T3s);
+			 T45 = VSUB(T3I, T3J);
+			 T3B = VSUB(T3s, T3p);
+			 T3K = VADD(T3I, T3J);
+		    }
+		    {
+			 V T2l, T3u, T3c, T3v;
+			 {
+			      V T29, T2k, T30, T3b;
+			      T29 = VSUB(T1R, T28);
+			      T2k = VSUB(T2g, T2j);
+			      T2l = VFMA(LDK(KP634393284), T29, VMUL(LDK(KP773010453), T2k));
+			      T3u = VFNMS(LDK(KP634393284), T2k, VMUL(LDK(KP773010453), T29));
+			      T30 = VSUB(T2C, T2Z);
+			      T3b = VSUB(T33, T3a);
+			      T3c = VFNMS(LDK(KP634393284), T3b, VMUL(LDK(KP773010453), T30));
+			      T3v = VFMA(LDK(KP773010453), T3b, VMUL(LDK(KP634393284), T30));
+			 }
+			 T3d = VADD(T2l, T3c);
+			 T3E = VSUB(T3v, T3u);
+			 T3w = VADD(T3u, T3v);
+			 T3A = VSUB(T3c, T2l);
+		    }
+		    {
+			 V T3N, T3W, T3Q, T3X;
+			 {
+			      V T3L, T3M, T3O, T3P;
+			      T3L = VADD(T28, T1R);
+			      T3M = VADD(T2g, T2j);
+			      T3N = VFMA(LDK(KP098017140), T3L, VMUL(LDK(KP995184726), T3M));
+			      T3W = VFNMS(LDK(KP098017140), T3M, VMUL(LDK(KP995184726), T3L));
+			      T3O = VADD(T2C, T2Z);
+			      T3P = VADD(T3a, T33);
+			      T3Q = VFNMS(LDK(KP098017140), T3P, VMUL(LDK(KP995184726), T3O));
+			      T3X = VFMA(LDK(KP995184726), T3P, VMUL(LDK(KP098017140), T3O));
+			 }
+			 T3R = VADD(T3N, T3Q);
+			 T46 = VSUB(T3X, T3W);
+			 T3Y = VADD(T3W, T3X);
+			 T42 = VSUB(T3Q, T3N);
+		    }
+		    {
+			 V T3e, T3x, T44, T47;
+			 T3e = VADD(T1u, T3d);
+			 T3x = VBYI(VADD(T3t, T3w));
+			 ST(&(x[WS(rs, 57)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 T44 = VBYI(VSUB(T42, T43));
+			 T47 = VSUB(T45, T46);
+			 ST(&(x[WS(rs, 17)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T48, T49, T3y, T3z;
+			 T48 = VBYI(VADD(T43, T42));
+			 T49 = VADD(T45, T46);
+			 ST(&(x[WS(rs, 15)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 49)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
+			 T3y = VSUB(T1u, T3d);
+			 T3z = VBYI(VSUB(T3w, T3t));
+			 ST(&(x[WS(rs, 39)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T3C, T3F, T3S, T3Z;
+			 T3C = VBYI(VSUB(T3A, T3B));
+			 T3F = VSUB(T3D, T3E);
+			 ST(&(x[WS(rs, 23)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 41)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
+			 T3S = VADD(T3K, T3R);
+			 T3Z = VBYI(VADD(T3V, T3Y));
+			 ST(&(x[WS(rs, 63)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T40, T41, T3G, T3H;
+			 T40 = VSUB(T3K, T3R);
+			 T41 = VBYI(VSUB(T3Y, T3V));
+			 ST(&(x[WS(rs, 33)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
+			 T3G = VBYI(VADD(T3B, T3A));
+			 T3H = VADD(T3D, T3E);
+			 ST(&(x[WS(rs, 9)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 55)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t1fv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_64) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:01 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fv_7 -include t1f.h */
+
+/*
+ * This function contains 36 FP additions, 36 FP multiplications,
+ * (or, 15 additions, 15 multiplications, 21 fused multiply/add),
+ * 42 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, T2, T4, Te, Tc, T9, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, T5, Tf, Td, Ta, T8;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    Tf = BYTWJ(&(W[TWVL * 6]), Te);
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Ta = BYTWJ(&(W[TWVL * 8]), T9);
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tk, Tg, Tl, Tb, Tm;
+			 T6 = VADD(T3, T5);
+			 Tk = VSUB(T5, T3);
+			 Tg = VADD(Td, Tf);
+			 Tl = VSUB(Tf, Td);
+			 Tb = VADD(T8, Ta);
+			 Tm = VSUB(Ta, T8);
+			 {
+			      V Th, Ts, Tp, Tu, Tn, Tx, Ti, Tt;
+			      Th = VFNMS(LDK(KP356895867), T6, Tg);
+			      Ts = VFMA(LDK(KP554958132), Tl, Tk);
+			      ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+			      Tp = VFNMS(LDK(KP356895867), Tb, T6);
+			      Tu = VFNMS(LDK(KP356895867), Tg, Tb);
+			      Tn = VFMA(LDK(KP554958132), Tm, Tl);
+			      Tx = VFNMS(LDK(KP554958132), Tk, Tm);
+			      Ti = VFNMS(LDK(KP692021471), Th, Tb);
+			      Tt = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Ts, Tm));
+			      {
+				   V Tq, Tv, To, Ty, Tj, Tr, Tw;
+				   Tq = VFNMS(LDK(KP692021471), Tp, Tg);
+				   Tv = VFNMS(LDK(KP692021471), Tu, T6);
+				   To = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tn, Tk));
+				   Ty = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tx, Tl));
+				   Tj = VFNMS(LDK(KP900968867), Ti, T1);
+				   Tr = VFNMS(LDK(KP900968867), Tq, T1);
+				   Tw = VFNMS(LDK(KP900968867), Tv, T1);
+				   ST(&(x[WS(rs, 2)]), VFMAI(To, Tj), ms, &(x[0]));
+				   ST(&(x[WS(rs, 5)]), VFNMSI(To, Tj), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tt, Tr), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tt, Tr), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Ty, Tw), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(Ty, Tw), ms, &(x[0]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1fv_7"), twinstr, &GENUS, {15, 15, 21, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fv_7 -include t1f.h */
+
+/*
+ * This function contains 36 FP additions, 30 FP multiplications,
+ * (or, 24 additions, 18 multiplications, 12 fused multiply/add),
+ * 21 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
+	       V T1, Tg, Tj, T6, Ti, Tb, Tk, Tp, To;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V Td, Tf, Tc, Te;
+		    Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tf = BYTWJ(&(W[TWVL * 6]), Te);
+		    Tg = VADD(Td, Tf);
+		    Tj = VSUB(Tf, Td);
+	       }
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTWJ(&(W[0]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    Ti = VSUB(T5, T3);
+	       }
+	       {
+		    V T8, Ta, T7, T9;
+		    T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Ta = BYTWJ(&(W[TWVL * 8]), T9);
+		    Tb = VADD(T8, Ta);
+		    Tk = VSUB(Ta, T8);
+	       }
+	       ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
+	       Tp = VBYI(VFMA(LDK(KP433883739), Ti, VFNMS(LDK(KP781831482), Tk, VMUL(LDK(KP974927912), Tj))));
+	       To = VFMA(LDK(KP623489801), Tb, VFNMS(LDK(KP222520933), Tg, VFNMS(LDK(KP900968867), T6, T1)));
+	       ST(&(x[WS(rs, 4)]), VSUB(To, Tp), ms, &(x[0]));
+	       ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tl, Th, Tn, Tm;
+		    Tl = VBYI(VFNMS(LDK(KP781831482), Tj, VFNMS(LDK(KP433883739), Tk, VMUL(LDK(KP974927912), Ti))));
+		    Th = VFMA(LDK(KP623489801), Tg, VFNMS(LDK(KP900968867), Tb, VFNMS(LDK(KP222520933), T6, T1)));
+		    ST(&(x[WS(rs, 5)]), VSUB(Th, Tl), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(Th, Tl), ms, &(x[0]));
+		    Tn = VBYI(VFMA(LDK(KP781831482), Ti, VFMA(LDK(KP974927912), Tk, VMUL(LDK(KP433883739), Tj))));
+		    Tm = VFMA(LDK(KP623489801), T6, VFNMS(LDK(KP900968867), Tg, VFNMS(LDK(KP222520933), Tb, T1)));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tm, Tn), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VADD(Tm, Tn), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 7, XSIMD_STRING("t1fv_7"), twinstr, &GENUS, {24, 18, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_7) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fv_8 -include t1f.h */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTWJ(&(W[TWVL * 6]), T2);
+		    Ti = BYTWJ(&(W[TWVL * 2]), Th);
+		    Tk = BYTWJ(&(W[TWVL * 10]), Tj);
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VSUB(Tq, Tr);
+			 Ts = VADD(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VSUB(Tu, Tt);
+			      Tv = VADD(Tt, Tu);
+			      Tm = VSUB(Te, T9);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFNMS(LDK(KP707106781), Tf, T4);
+				   Tg = VFMA(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 5)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1fv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fv_8 -include t1f.h */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T4, Tq, Tm, Tr, T9, Tt, Te, Tu, T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 6]), T2);
+	       T4 = VSUB(T1, T3);
+	       Tq = VADD(T1, T3);
+	       {
+		    V Tj, Tl, Ti, Tk;
+		    Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tj = BYTWJ(&(W[TWVL * 2]), Ti);
+		    Tk = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tl = BYTWJ(&(W[TWVL * 10]), Tk);
+		    Tm = VSUB(Tj, Tl);
+		    Tr = VADD(Tj, Tl);
+	       }
+	       {
+		    V T6, T8, T5, T7;
+		    T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    T9 = VSUB(T6, T8);
+		    Tt = VADD(T6, T8);
+	       }
+	       {
+		    V Tb, Td, Ta, Tc;
+		    Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Te = VSUB(Tb, Td);
+		    Tu = VADD(Tb, Td);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VADD(Tq, Tr);
+		    Tv = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VSUB(Tq, Tr);
+		    Tx = VBYI(VSUB(Tu, Tt));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Tg, To, Tn, Tp, Tf, Th;
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VADD(T4, Tf);
+			 To = VSUB(T4, Tf);
+			 Th = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 Tn = VBYI(VSUB(Th, Tm));
+			 Tp = VBYI(VADD(Tm, Th));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1fv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:02 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fv_9 -include t1f.h */
+
+/*
+ * This function contains 54 FP additions, 54 FP multiplications,
+ * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
+ * 67 stack variables, 19 constants, and 18 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
+     DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T3, T5, T9, Th, Tb, Td, Tj, Tl, TD, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T4, T8, Tg;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    {
+			 V Ta, Tc, Ti, Tk;
+			 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T3 = BYTWJ(&(W[TWVL * 4]), T2);
+			 T5 = BYTWJ(&(W[TWVL * 10]), T4);
+			 T9 = BYTWJ(&(W[0]), T8);
+			 Th = BYTWJ(&(W[TWVL * 2]), Tg);
+			 Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+			 Td = BYTWJ(&(W[TWVL * 12]), Tc);
+			 Tj = BYTWJ(&(W[TWVL * 8]), Ti);
+			 Tl = BYTWJ(&(W[TWVL * 14]), Tk);
+		    }
+	       }
+	       TD = VSUB(T5, T3);
+	       T6 = VADD(T3, T5);
+	       {
+		    V Tt, Te, Tu, Tm, Tr, T7;
+		    Tt = VSUB(Tb, Td);
+		    Te = VADD(Tb, Td);
+		    Tu = VSUB(Tl, Tj);
+		    Tm = VADD(Tj, Tl);
+		    Tr = VFNMS(LDK(KP500000000), T6, T1);
+		    T7 = VADD(T1, T6);
+		    {
+			 V Tv, Tf, Ts, Tn;
+			 Tv = VFNMS(LDK(KP500000000), Te, T9);
+			 Tf = VADD(T9, Te);
+			 Ts = VFNMS(LDK(KP500000000), Tm, Th);
+			 Tn = VADD(Th, Tm);
+			 {
+			      V TG, TK, Tw, TJ, TF, TA, To, Tq;
+			      TG = VFNMS(LDK(KP726681596), Tt, Tv);
+			      TK = VFMA(LDK(KP968908795), Tv, Tt);
+			      Tw = VFNMS(LDK(KP586256827), Tv, Tu);
+			      TJ = VFNMS(LDK(KP152703644), Tu, Ts);
+			      TF = VFMA(LDK(KP203604859), Ts, Tu);
+			      TA = VFNMS(LDK(KP439692620), Tt, Ts);
+			      To = VADD(Tf, Tn);
+			      Tq = VMUL(LDK(KP866025403), VSUB(Tn, Tf));
+			      {
+				   V TQ, TH, TL, TN, TB, Tp, Ty, TI, Tx;
+				   Tx = VFNMS(LDK(KP347296355), Tw, Tt);
+				   TQ = VFNMS(LDK(KP898197570), TG, TF);
+				   TH = VFMA(LDK(KP898197570), TG, TF);
+				   TL = VFMA(LDK(KP673648177), TK, TJ);
+				   TN = VFNMS(LDK(KP673648177), TK, TJ);
+				   TB = VFNMS(LDK(KP420276625), TA, Tu);
+				   ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
+				   Tp = VFNMS(LDK(KP500000000), To, T7);
+				   Ty = VFNMS(LDK(KP907603734), Tx, Ts);
+				   TI = VFMA(LDK(KP852868531), TH, Tr);
+				   {
+					V TO, TR, TM, TC, Tz, TP, TS, TE;
+					TO = VFNMS(LDK(KP500000000), TH, TN);
+					TR = VFMA(LDK(KP666666666), TL, TQ);
+					TM = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), TD, TL));
+					TC = VFNMS(LDK(KP826351822), TB, Tv);
+					ST(&(x[WS(rs, 6)]), VFNMSI(Tq, Tp), ms, &(x[0]));
+					ST(&(x[WS(rs, 3)]), VFMAI(Tq, Tp), ms, &(x[WS(rs, 1)]));
+					Tz = VFNMS(LDK(KP939692620), Ty, Tr);
+					TP = VFMA(LDK(KP852868531), TO, Tr);
+					TS = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TR, TD));
+					ST(&(x[WS(rs, 8)]), VFMAI(TM, TI), ms, &(x[0]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(TM, TI), ms, &(x[WS(rs, 1)]));
+					TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), TD, TC));
+					ST(&(x[WS(rs, 4)]), VFMAI(TS, TP), ms, &(x[0]));
+					ST(&(x[WS(rs, 5)]), VFNMSI(TS, TP), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(TE, Tz), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 2)]), VFNMSI(TE, Tz), ms, &(x[0]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1fv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fv_9 -include t1f.h */
+
+/*
+ * This function contains 54 FP additions, 42 FP multiplications,
+ * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
+ * 38 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "t1f.h"
+
+static void t1fv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
+	       V T1, T6, TA, Tt, Tf, Ts, Tw, Tn, Tv;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T3 = BYTWJ(&(W[TWVL * 4]), T2);
+		    T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    T5 = BYTWJ(&(W[TWVL * 10]), T4);
+		    T6 = VADD(T3, T5);
+		    TA = VMUL(LDK(KP866025403), VSUB(T5, T3));
+	       }
+	       {
+		    V T9, Td, Tb, T8, Tc, Ta, Te;
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTWJ(&(W[0]), T8);
+		    Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 12]), Tc);
+		    Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tb = BYTWJ(&(W[TWVL * 6]), Ta);
+		    Tt = VSUB(Td, Tb);
+		    Te = VADD(Tb, Td);
+		    Tf = VADD(T9, Te);
+		    Ts = VFNMS(LDK(KP500000000), Te, T9);
+	       }
+	       {
+		    V Th, Tl, Tj, Tg, Tk, Ti, Tm;
+		    Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Th = BYTWJ(&(W[TWVL * 2]), Tg);
+		    Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tl = BYTWJ(&(W[TWVL * 14]), Tk);
+		    Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tj = BYTWJ(&(W[TWVL * 8]), Ti);
+		    Tw = VSUB(Tl, Tj);
+		    Tm = VADD(Tj, Tl);
+		    Tn = VADD(Th, Tm);
+		    Tv = VFNMS(LDK(KP500000000), Tm, Th);
+	       }
+	       {
+		    V Tq, T7, To, Tp;
+		    Tq = VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Tf)));
+		    T7 = VADD(T1, T6);
+		    To = VADD(Tf, Tn);
+		    Tp = VFNMS(LDK(KP500000000), To, T7);
+		    ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VADD(Tp, Tq), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tp, Tq), ms, &(x[0]));
+	       }
+	       {
+		    V TI, TB, TC, TD, Tu, Tx, Ty, Tr, TH;
+		    TI = VBYI(VSUB(VFNMS(LDK(KP342020143), Tv, VFNMS(LDK(KP150383733), Tt, VFNMS(LDK(KP984807753), Ts, VMUL(LDK(KP813797681), Tw)))), TA));
+		    TB = VFNMS(LDK(KP642787609), Ts, VMUL(LDK(KP663413948), Tt));
+		    TC = VFNMS(LDK(KP984807753), Tv, VMUL(LDK(KP150383733), Tw));
+		    TD = VADD(TB, TC);
+		    Tu = VFMA(LDK(KP766044443), Ts, VMUL(LDK(KP556670399), Tt));
+		    Tx = VFMA(LDK(KP173648177), Tv, VMUL(LDK(KP852868531), Tw));
+		    Ty = VADD(Tu, Tx);
+		    Tr = VFNMS(LDK(KP500000000), T6, T1);
+		    TH = VFMA(LDK(KP173648177), Ts, VFNMS(LDK(KP296198132), Tw, VFNMS(LDK(KP939692620), Tv, VFNMS(LDK(KP852868531), Tt, Tr))));
+		    ST(&(x[WS(rs, 7)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 2)]), VADD(TH, TI), ms, &(x[0]));
+		    {
+			 V Tz, TE, TF, TG;
+			 Tz = VADD(Tr, Ty);
+			 TE = VBYI(VADD(TA, TD));
+			 ST(&(x[WS(rs, 8)]), VSUB(Tz, TE), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(TE, Tz), ms, &(x[WS(rs, 1)]));
+			 TF = VFMA(LDK(KP866025403), VSUB(TB, TC), VFNMS(LDK(KP500000000), Ty, Tr));
+			 TG = VBYI(VADD(TA, VFNMS(LDK(KP500000000), TD, VMUL(LDK(KP866025403), VSUB(Tx, Tu)))));
+			 ST(&(x[WS(rs, 5)]), VSUB(TF, TG), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(TF, TG), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 9, XSIMD_STRING("t1fv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1fv_9) (planner *p) {
+     X(kdft_dit_register) (p, t1fv_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:24 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include ts.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 113 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2S, T2O, T2B, T2j, T2A, T24, T3J, T3L, T2Q, T2I, T2R, T2L, T2C, T2y, T3D;
+	       V T3F;
+	       {
+		    V T3o, T3z, T1I, T8, T35, T2o, T1s, T2r, T36, T2w, T1F, T2p, T1N, T3k, Tl;
+		    V T3A, T2V, T1T, Tz, T1U, T30, T29, T11, T2c, TH, TK, TJ, T31, T2h, T1e;
+		    V T2a, T1Z, TI, T1Y, TF;
+		    {
+			 V Ta, Td, Tg, Tj, T2t, T1y, Tf, T1J, Tb, Tc, T2v, T1E, Ti;
+			 {
+			      V T1, T3n, T3, T6, T5, T1h, T1k, T1n, T1q, T1m, T3l, T4, T1j, T1p, T2k;
+			      V T1i, T2, T1g;
+			      T1 = LD(&(ri[0]), ms, &(ri[0]));
+			      T3n = LD(&(ii[0]), ms, &(ii[0]));
+			      T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			      T6 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			      T2 = LDW(&(W[TWVL * 14]));
+			      T5 = LDW(&(W[TWVL * 15]));
+			      T1h = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+			      T1k = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+			      T1g = LDW(&(W[TWVL * 28]));
+			      T1n = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			      T1q = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			      T1m = LDW(&(W[TWVL * 12]));
+			      T3l = VMUL(T2, T6);
+			      T4 = VMUL(T2, T3);
+			      T1j = LDW(&(W[TWVL * 29]));
+			      T1p = LDW(&(W[TWVL * 13]));
+			      T2k = VMUL(T1g, T1k);
+			      T1i = VMUL(T1g, T1h);
+			      {
+				   V T1u, T1x, T1A, T2s, T1v, T1D, T1z, T1w, T1C, T2u, T1B, T9;
+				   {
+					V T2l, T1l, T1t, T2n, T1r;
+					{
+					     V T2m, T1o, T3m, T7;
+					     T1u = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+					     T2m = VMUL(T1m, T1q);
+					     T1o = VMUL(T1m, T1n);
+					     T3m = VFNMS(T5, T3, T3l);
+					     T7 = VFMA(T5, T6, T4);
+					     T1x = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+					     T2l = VFNMS(T1j, T1h, T2k);
+					     T1l = VFMA(T1j, T1k, T1i);
+					     T1t = LDW(&(W[TWVL * 4]));
+					     T2n = VFNMS(T1p, T1n, T2m);
+					     T1r = VFMA(T1p, T1q, T1o);
+					     T3o = VADD(T3m, T3n);
+					     T3z = VSUB(T3n, T3m);
+					     T1I = VSUB(T1, T7);
+					     T8 = VADD(T1, T7);
+					}
+					T1A = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+					T2s = VMUL(T1t, T1x);
+					T1v = VMUL(T1t, T1u);
+					T35 = VADD(T2l, T2n);
+					T2o = VSUB(T2l, T2n);
+					T1s = VADD(T1l, T1r);
+					T2r = VSUB(T1l, T1r);
+					T1D = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+					T1z = LDW(&(W[TWVL * 20]));
+				   }
+				   T1w = LDW(&(W[TWVL * 5]));
+				   T1C = LDW(&(W[TWVL * 21]));
+				   Ta = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+				   Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+				   T9 = LDW(&(W[TWVL * 6]));
+				   Tg = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+				   Tj = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+				   T2u = VMUL(T1z, T1D);
+				   T1B = VMUL(T1z, T1A);
+				   T2t = VFNMS(T1w, T1u, T2s);
+				   T1y = VFMA(T1w, T1x, T1v);
+				   Tf = LDW(&(W[TWVL * 22]));
+				   T1J = VMUL(T9, Td);
+				   Tb = VMUL(T9, Ta);
+				   Tc = LDW(&(W[TWVL * 7]));
+				   T2v = VFNMS(T1C, T1A, T2u);
+				   T1E = VFMA(T1C, T1D, T1B);
+				   Ti = LDW(&(W[TWVL * 23]));
+			      }
+			 }
+			 {
+			      V TW, TZ, TY, T27, TX, T26, TU;
+			      {
+				   V To, Tr, Tu, Tx, Tq, Tw, T1P, Tp, T1R, Tv;
+				   {
+					V T1K, Te, T1M, Tk, Tn, Tt, T1L, Th;
+					To = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+					T1L = VMUL(Tf, Tj);
+					Th = VMUL(Tf, Tg);
+					Tr = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+					T1K = VFNMS(Tc, Ta, T1J);
+					Te = VFMA(Tc, Td, Tb);
+					T36 = VADD(T2t, T2v);
+					T2w = VSUB(T2t, T2v);
+					T1F = VADD(T1y, T1E);
+					T2p = VSUB(T1y, T1E);
+					T1M = VFNMS(Ti, Tg, T1L);
+					Tk = VFMA(Ti, Tj, Th);
+					Tn = LDW(&(W[TWVL * 2]));
+					Tu = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+					Tx = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+					Tt = LDW(&(W[TWVL * 18]));
+					Tq = LDW(&(W[TWVL * 3]));
+					Tw = LDW(&(W[TWVL * 19]));
+					T1N = VSUB(T1K, T1M);
+					T3k = VADD(T1K, T1M);
+					Tl = VADD(Te, Tk);
+					T3A = VSUB(Te, Tk);
+					T1P = VMUL(Tn, Tr);
+					Tp = VMUL(Tn, To);
+					T1R = VMUL(Tt, Tx);
+					Tv = VMUL(Tt, Tu);
+				   }
+				   {
+					V TQ, TT, T1Q, Ts, T1S, Ty, TV, T25, TR, TP, TS;
+					TQ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+					TT = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+					TP = LDW(&(W[0]));
+					TW = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+					T1Q = VFNMS(Tq, To, T1P);
+					Ts = VFMA(Tq, Tr, Tp);
+					T1S = VFNMS(Tw, Tu, T1R);
+					Ty = VFMA(Tw, Tx, Tv);
+					TZ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+					TV = LDW(&(W[TWVL * 16]));
+					T25 = VMUL(TP, TT);
+					TR = VMUL(TP, TQ);
+					TS = LDW(&(W[TWVL * 1]));
+					TY = LDW(&(W[TWVL * 17]));
+					T2V = VADD(T1Q, T1S);
+					T1T = VSUB(T1Q, T1S);
+					Tz = VADD(Ts, Ty);
+					T1U = VSUB(Ts, Ty);
+					T27 = VMUL(TV, TZ);
+					TX = VMUL(TV, TW);
+					T26 = VFNMS(TS, TQ, T25);
+					TU = VFMA(TS, TT, TR);
+				   }
+			      }
+			      {
+				   V T19, T1c, T1b, T2f, T1a, T2e, T17;
+				   {
+					V T13, T16, T12, T28, T10, T18, T15, T2d, T14;
+					T13 = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+					T16 = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+					T12 = LDW(&(W[TWVL * 8]));
+					T19 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+					T28 = VFNMS(TY, TW, T27);
+					T10 = VFMA(TY, TZ, TX);
+					T1c = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+					T18 = LDW(&(W[TWVL * 24]));
+					T15 = LDW(&(W[TWVL * 9]));
+					T1b = LDW(&(W[TWVL * 25]));
+					T2d = VMUL(T12, T16);
+					T14 = VMUL(T12, T13);
+					T30 = VADD(T26, T28);
+					T29 = VSUB(T26, T28);
+					T11 = VADD(TU, T10);
+					T2c = VSUB(TU, T10);
+					T2f = VMUL(T18, T1c);
+					T1a = VMUL(T18, T19);
+					T2e = VFNMS(T15, T13, T2d);
+					T17 = VFMA(T15, T16, T14);
+				   }
+				   {
+					V TB, TE, TA, T2g, T1d, TG, TD, T1X, TC;
+					TB = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+					TE = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+					TA = LDW(&(W[TWVL * 26]));
+					TH = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+					T2g = VFNMS(T1b, T19, T2f);
+					T1d = VFMA(T1b, T1c, T1a);
+					TK = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+					TG = LDW(&(W[TWVL * 10]));
+					TD = LDW(&(W[TWVL * 27]));
+					TJ = LDW(&(W[TWVL * 11]));
+					T1X = VMUL(TA, TE);
+					TC = VMUL(TA, TB);
+					T31 = VADD(T2e, T2g);
+					T2h = VSUB(T2e, T2g);
+					T1e = VADD(T17, T1d);
+					T2a = VSUB(T17, T1d);
+					T1Z = VMUL(TG, TK);
+					TI = VMUL(TG, TH);
+					T1Y = VFNMS(TD, TB, T1X);
+					TF = VFMA(TD, TE, TC);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2U, Tm, T3p, T3u, T34, T1G, T1f, T2Z, T20, TL, T32, T3f, T3g, T37;
+			 T2U = VSUB(T8, Tl);
+			 Tm = VADD(T8, Tl);
+			 T3p = VADD(T3k, T3o);
+			 T3u = VSUB(T3o, T3k);
+			 T34 = VSUB(T1s, T1F);
+			 T1G = VADD(T1s, T1F);
+			 T1f = VADD(T11, T1e);
+			 T2Z = VSUB(T11, T1e);
+			 T20 = VFNMS(TJ, TH, T1Z);
+			 TL = VFMA(TJ, TK, TI);
+			 T32 = VSUB(T30, T31);
+			 T3f = VADD(T30, T31);
+			 T3g = VADD(T35, T36);
+			 T37 = VSUB(T35, T36);
+			 {
+			      V T3r, T1H, T21, T1W, T3i, T3h, T3j, T2X, TN, T3t, T2W, TM;
+			      T3r = VSUB(T1G, T1f);
+			      T1H = VADD(T1f, T1G);
+			      T21 = VSUB(T1Y, T20);
+			      T2W = VADD(T1Y, T20);
+			      T1W = VSUB(TF, TL);
+			      TM = VADD(TF, TL);
+			      T3i = VADD(T3f, T3g);
+			      T3h = VSUB(T3f, T3g);
+			      T3j = VADD(T2V, T2W);
+			      T2X = VSUB(T2V, T2W);
+			      TN = VADD(Tz, TM);
+			      T3t = VSUB(TM, Tz);
+			      {
+				   V T2E, T1O, T3B, T3H, T2x, T2q, T2K, T2J, T3C, T23, T3I, T2H;
+				   {
+					V T2F, T1V, T22, T2G;
+					T2E = VADD(T1I, T1N);
+					T1O = VSUB(T1I, T1N);
+					{
+					     V T3b, T33, T3c, T38;
+					     T3b = VSUB(T32, T2Z);
+					     T33 = VADD(T2Z, T32);
+					     T3c = VADD(T34, T37);
+					     T38 = VSUB(T34, T37);
+					     {
+						  V T3a, T2Y, T3s, T3q;
+						  T3a = VSUB(T2U, T2X);
+						  T2Y = VADD(T2U, T2X);
+						  T3s = VSUB(T3p, T3j);
+						  T3q = VADD(T3j, T3p);
+						  {
+						       V T3x, T3v, T3e, TO;
+						       T3x = VSUB(T3u, T3t);
+						       T3v = VADD(T3t, T3u);
+						       T3e = VSUB(Tm, TN);
+						       TO = VADD(Tm, TN);
+						       {
+							    V T3d, T3w, T3y, T39;
+							    T3d = VSUB(T3b, T3c);
+							    T3w = VADD(T3b, T3c);
+							    T3y = VSUB(T38, T33);
+							    T39 = VADD(T33, T38);
+							    ST(&(ii[WS(rs, 4)]), VADD(T3r, T3s), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 12)]), VSUB(T3s, T3r), ms, &(ii[0]));
+							    ST(&(ii[0]), VADD(T3i, T3q), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 8)]), VSUB(T3q, T3i), ms, &(ii[0]));
+							    ST(&(ri[WS(rs, 4)]), VADD(T3e, T3h), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 12)]), VSUB(T3e, T3h), ms, &(ri[0]));
+							    ST(&(ri[0]), VADD(TO, T1H), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 8)]), VSUB(TO, T1H), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
+							    ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
+							    ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
+							    T3B = VSUB(T3z, T3A);
+							    T3H = VADD(T3A, T3z);
+						       }
+						  }
+					     }
+					}
+					T2F = VADD(T1U, T1T);
+					T1V = VSUB(T1T, T1U);
+					T22 = VADD(T1W, T21);
+					T2G = VSUB(T1W, T21);
+					{
+					     V T2M, T2N, T2b, T2i;
+					     T2x = VSUB(T2r, T2w);
+					     T2M = VADD(T2r, T2w);
+					     T2N = VSUB(T2o, T2p);
+					     T2q = VADD(T2o, T2p);
+					     T2K = VSUB(T29, T2a);
+					     T2b = VADD(T29, T2a);
+					     T2i = VSUB(T2c, T2h);
+					     T2J = VADD(T2c, T2h);
+					     T3C = VADD(T1V, T22);
+					     T23 = VSUB(T1V, T22);
+					     T2S = VFMA(LDK(KP414213562), T2M, T2N);
+					     T2O = VFNMS(LDK(KP414213562), T2N, T2M);
+					     T3I = VSUB(T2G, T2F);
+					     T2H = VADD(T2F, T2G);
+					     T2B = VFNMS(LDK(KP414213562), T2b, T2i);
+					     T2j = VFMA(LDK(KP414213562), T2i, T2b);
+					}
+				   }
+				   T2A = VFNMS(LDK(KP707106781), T23, T1O);
+				   T24 = VFMA(LDK(KP707106781), T23, T1O);
+				   T3J = VFMA(LDK(KP707106781), T3I, T3H);
+				   T3L = VFNMS(LDK(KP707106781), T3I, T3H);
+				   T2Q = VFNMS(LDK(KP707106781), T2H, T2E);
+				   T2I = VFMA(LDK(KP707106781), T2H, T2E);
+				   T2R = VFNMS(LDK(KP414213562), T2J, T2K);
+				   T2L = VFMA(LDK(KP414213562), T2K, T2J);
+				   T2C = VFMA(LDK(KP414213562), T2q, T2x);
+				   T2y = VFNMS(LDK(KP414213562), T2x, T2q);
+				   T3D = VFMA(LDK(KP707106781), T3C, T3B);
+				   T3F = VFNMS(LDK(KP707106781), T3C, T3B);
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T3E, T2T, T2P, T3G;
+		    T3E = VADD(T2R, T2S);
+		    T2T = VSUB(T2R, T2S);
+		    T2P = VADD(T2L, T2O);
+		    T3G = VSUB(T2O, T2L);
+		    {
+			 V T3K, T2D, T2z, T3M;
+			 T3K = VSUB(T2C, T2B);
+			 T2D = VADD(T2B, T2C);
+			 T2z = VSUB(T2j, T2y);
+			 T3M = VADD(T2j, T2y);
+			 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include ts.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 52 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
+	       V T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
+	       V T2y, T2z, T1O, T2g, T1T, T2h;
+	       {
+		    V T1, T2T, T6, T2S;
+		    T1 = LD(&(ri[0]), ms, &(ri[0]));
+		    T2T = LD(&(ii[0]), ms, &(ii[0]));
+		    {
+			 V T3, T5, T2, T4;
+			 T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			 T5 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			 T2 = LDW(&(W[TWVL * 14]));
+			 T4 = LDW(&(W[TWVL * 15]));
+			 T6 = VFMA(T2, T3, VMUL(T4, T5));
+			 T2S = VFNMS(T4, T3, VMUL(T2, T5));
+		    }
+		    T7 = VADD(T1, T6);
+		    T37 = VSUB(T2T, T2S);
+		    T1t = VSUB(T1, T6);
+		    T2U = VADD(T2S, T2T);
+	       }
+	       {
+		    V Tc, T1u, Th, T1v;
+		    {
+			 V T9, Tb, T8, Ta;
+			 T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			 Tb = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			 T8 = LDW(&(W[TWVL * 6]));
+			 Ta = LDW(&(W[TWVL * 7]));
+			 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
+			 T1u = VFNMS(Ta, T9, VMUL(T8, Tb));
+		    }
+		    {
+			 V Te, Tg, Td, Tf;
+			 Te = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+			 Tg = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+			 Td = LDW(&(W[TWVL * 22]));
+			 Tf = LDW(&(W[TWVL * 23]));
+			 Th = VFMA(Td, Te, VMUL(Tf, Tg));
+			 T1v = VFNMS(Tf, Te, VMUL(Td, Tg));
+		    }
+		    Ti = VADD(Tc, Th);
+		    T38 = VSUB(Tc, Th);
+		    T1w = VSUB(T1u, T1v);
+		    T2R = VADD(T1u, T1v);
+	       }
+	       {
+		    V To, T1y, Tt, T1z, T1A, T1B;
+		    {
+			 V Tl, Tn, Tk, Tm;
+			 Tl = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			 Tn = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			 Tk = LDW(&(W[TWVL * 2]));
+			 Tm = LDW(&(W[TWVL * 3]));
+			 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
+			 T1y = VFNMS(Tm, Tl, VMUL(Tk, Tn));
+		    }
+		    {
+			 V Tq, Ts, Tp, Tr;
+			 Tq = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+			 Ts = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+			 Tp = LDW(&(W[TWVL * 18]));
+			 Tr = LDW(&(W[TWVL * 19]));
+			 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
+			 T1z = VFNMS(Tr, Tq, VMUL(Tp, Ts));
+		    }
+		    Tu = VADD(To, Tt);
+		    T2s = VADD(T1y, T1z);
+		    T1A = VSUB(T1y, T1z);
+		    T1B = VSUB(To, Tt);
+		    T1C = VSUB(T1A, T1B);
+		    T2c = VADD(T1B, T1A);
+	       }
+	       {
+		    V Tz, T1E, TE, T1F, T1D, T1G;
+		    {
+			 V Tw, Ty, Tv, Tx;
+			 Tw = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+			 Ty = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+			 Tv = LDW(&(W[TWVL * 26]));
+			 Tx = LDW(&(W[TWVL * 27]));
+			 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
+			 T1E = VFNMS(Tx, Tw, VMUL(Tv, Ty));
+		    }
+		    {
+			 V TB, TD, TA, TC;
+			 TB = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			 TD = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			 TA = LDW(&(W[TWVL * 10]));
+			 TC = LDW(&(W[TWVL * 11]));
+			 TE = VFMA(TA, TB, VMUL(TC, TD));
+			 T1F = VFNMS(TC, TB, VMUL(TA, TD));
+		    }
+		    TF = VADD(Tz, TE);
+		    T2t = VADD(T1E, T1F);
+		    T1D = VSUB(Tz, TE);
+		    T1G = VSUB(T1E, T1F);
+		    T1H = VADD(T1D, T1G);
+		    T2d = VSUB(T1D, T1G);
+	       }
+	       {
+		    V T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
+		    {
+			 V T16, T18, T15, T17;
+			 T16 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+			 T18 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+			 T15 = LDW(&(W[TWVL * 28]));
+			 T17 = LDW(&(W[TWVL * 29]));
+			 T19 = VFMA(T15, T16, VMUL(T17, T18));
+			 T20 = VFNMS(T17, T16, VMUL(T15, T18));
+		    }
+		    {
+			 V T1m, T1o, T1l, T1n;
+			 T1m = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+			 T1o = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+			 T1l = LDW(&(W[TWVL * 20]));
+			 T1n = LDW(&(W[TWVL * 21]));
+			 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
+			 T1X = VFNMS(T1n, T1m, VMUL(T1l, T1o));
+		    }
+		    {
+			 V T1b, T1d, T1a, T1c;
+			 T1b = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			 T1d = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			 T1a = LDW(&(W[TWVL * 12]));
+			 T1c = LDW(&(W[TWVL * 13]));
+			 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
+			 T21 = VFNMS(T1c, T1b, VMUL(T1a, T1d));
+		    }
+		    {
+			 V T1h, T1j, T1g, T1i;
+			 T1h = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			 T1j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			 T1g = LDW(&(W[TWVL * 4]));
+			 T1i = LDW(&(W[TWVL * 5]));
+			 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
+			 T1W = VFNMS(T1i, T1h, VMUL(T1g, T1j));
+		    }
+		    T1f = VADD(T19, T1e);
+		    T1q = VADD(T1k, T1p);
+		    T2B = VSUB(T1f, T1q);
+		    T2C = VADD(T20, T21);
+		    T2D = VADD(T1W, T1X);
+		    T2E = VSUB(T2C, T2D);
+		    {
+			 V T1V, T1Y, T22, T23;
+			 T1V = VSUB(T19, T1e);
+			 T1Y = VSUB(T1W, T1X);
+			 T1Z = VSUB(T1V, T1Y);
+			 T2j = VADD(T1V, T1Y);
+			 T22 = VSUB(T20, T21);
+			 T23 = VSUB(T1k, T1p);
+			 T24 = VADD(T22, T23);
+			 T2k = VSUB(T22, T23);
+		    }
+	       }
+	       {
+		    V TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
+		    {
+			 V TJ, TL, TI, TK;
+			 TJ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			 TL = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			 TI = LDW(&(W[0]));
+			 TK = LDW(&(W[TWVL * 1]));
+			 TM = VFMA(TI, TJ, VMUL(TK, TL));
+			 T1K = VFNMS(TK, TJ, VMUL(TI, TL));
+		    }
+		    {
+			 V TZ, T11, TY, T10;
+			 TZ = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+			 T11 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+			 TY = LDW(&(W[TWVL * 24]));
+			 T10 = LDW(&(W[TWVL * 25]));
+			 T12 = VFMA(TY, TZ, VMUL(T10, T11));
+			 T1R = VFNMS(T10, TZ, VMUL(TY, T11));
+		    }
+		    {
+			 V TO, TQ, TN, TP;
+			 TO = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+			 TQ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+			 TN = LDW(&(W[TWVL * 16]));
+			 TP = LDW(&(W[TWVL * 17]));
+			 TR = VFMA(TN, TO, VMUL(TP, TQ));
+			 T1L = VFNMS(TP, TO, VMUL(TN, TQ));
+		    }
+		    {
+			 V TU, TW, TT, TV;
+			 TU = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			 TW = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			 TT = LDW(&(W[TWVL * 8]));
+			 TV = LDW(&(W[TWVL * 9]));
+			 TX = VFMA(TT, TU, VMUL(TV, TW));
+			 T1Q = VFNMS(TV, TU, VMUL(TT, TW));
+		    }
+		    TS = VADD(TM, TR);
+		    T13 = VADD(TX, T12);
+		    T2w = VSUB(TS, T13);
+		    T2x = VADD(T1K, T1L);
+		    T2y = VADD(T1Q, T1R);
+		    T2z = VSUB(T2x, T2y);
+		    {
+			 V T1M, T1N, T1P, T1S;
+			 T1M = VSUB(T1K, T1L);
+			 T1N = VSUB(TX, T12);
+			 T1O = VADD(T1M, T1N);
+			 T2g = VSUB(T1M, T1N);
+			 T1P = VSUB(TM, TR);
+			 T1S = VSUB(T1Q, T1R);
+			 T1T = VSUB(T1P, T1S);
+			 T2h = VADD(T1P, T1S);
+		    }
+	       }
+	       {
+		    V T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
+		    {
+			 V T1x, T1I, T3e, T3f;
+			 T1x = VSUB(T1t, T1w);
+			 T1I = VMUL(LDK(KP707106781), VSUB(T1C, T1H));
+			 T1J = VADD(T1x, T1I);
+			 T27 = VSUB(T1x, T1I);
+			 T3e = VMUL(LDK(KP707106781), VSUB(T2d, T2c));
+			 T3f = VADD(T38, T37);
+			 T3g = VADD(T3e, T3f);
+			 T3i = VSUB(T3f, T3e);
+		    }
+		    {
+			 V T1U, T25, T28, T29;
+			 T1U = VFMA(LDK(KP923879532), T1O, VMUL(LDK(KP382683432), T1T));
+			 T25 = VFNMS(LDK(KP923879532), T24, VMUL(LDK(KP382683432), T1Z));
+			 T26 = VADD(T1U, T25);
+			 T3h = VSUB(T25, T1U);
+			 T28 = VFNMS(LDK(KP923879532), T1T, VMUL(LDK(KP382683432), T1O));
+			 T29 = VFMA(LDK(KP382683432), T24, VMUL(LDK(KP923879532), T1Z));
+			 T2a = VSUB(T28, T29);
+			 T3d = VADD(T28, T29);
+		    }
+		    ST(&(ri[WS(rs, 11)]), VSUB(T1J, T26), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 11)]), VSUB(T3g, T3d), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 3)]), VADD(T1J, T26), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 3)]), VADD(T3d, T3g), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 15)]), VSUB(T27, T2a), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 15)]), VSUB(T3i, T3h), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 7)]), VADD(T27, T2a), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 7)]), VADD(T3h, T3i), ms, &(ii[WS(rs, 1)]));
+	       }
+	       {
+		    V T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
+		    {
+			 V T2r, T2u, T30, T31;
+			 T2r = VSUB(T7, Ti);
+			 T2u = VSUB(T2s, T2t);
+			 T2v = VADD(T2r, T2u);
+			 T2H = VSUB(T2r, T2u);
+			 T30 = VSUB(TF, Tu);
+			 T31 = VSUB(T2U, T2R);
+			 T32 = VADD(T30, T31);
+			 T34 = VSUB(T31, T30);
+		    }
+		    {
+			 V T2A, T2F, T2I, T2J;
+			 T2A = VADD(T2w, T2z);
+			 T2F = VSUB(T2B, T2E);
+			 T2G = VMUL(LDK(KP707106781), VADD(T2A, T2F));
+			 T33 = VMUL(LDK(KP707106781), VSUB(T2F, T2A));
+			 T2I = VSUB(T2z, T2w);
+			 T2J = VADD(T2B, T2E);
+			 T2K = VMUL(LDK(KP707106781), VSUB(T2I, T2J));
+			 T2Z = VMUL(LDK(KP707106781), VADD(T2I, T2J));
+		    }
+		    ST(&(ri[WS(rs, 10)]), VSUB(T2v, T2G), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 10)]), VSUB(T32, T2Z), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 2)]), VADD(T2v, T2G), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 2)]), VADD(T2Z, T32), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 14)]), VSUB(T2H, T2K), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 14)]), VSUB(T34, T33), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 6)]), VADD(T2H, T2K), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 6)]), VADD(T33, T34), ms, &(ii[0]));
+	       }
+	       {
+		    V T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
+		    {
+			 V T2b, T2e, T36, T39;
+			 T2b = VADD(T1t, T1w);
+			 T2e = VMUL(LDK(KP707106781), VADD(T2c, T2d));
+			 T2f = VADD(T2b, T2e);
+			 T2n = VSUB(T2b, T2e);
+			 T36 = VMUL(LDK(KP707106781), VADD(T1C, T1H));
+			 T39 = VSUB(T37, T38);
+			 T3a = VADD(T36, T39);
+			 T3c = VSUB(T39, T36);
+		    }
+		    {
+			 V T2i, T2l, T2o, T2p;
+			 T2i = VFMA(LDK(KP382683432), T2g, VMUL(LDK(KP923879532), T2h));
+			 T2l = VFNMS(LDK(KP382683432), T2k, VMUL(LDK(KP923879532), T2j));
+			 T2m = VADD(T2i, T2l);
+			 T3b = VSUB(T2l, T2i);
+			 T2o = VFNMS(LDK(KP382683432), T2h, VMUL(LDK(KP923879532), T2g));
+			 T2p = VFMA(LDK(KP923879532), T2k, VMUL(LDK(KP382683432), T2j));
+			 T2q = VSUB(T2o, T2p);
+			 T35 = VADD(T2o, T2p);
+		    }
+		    ST(&(ri[WS(rs, 9)]), VSUB(T2f, T2m), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 9)]), VSUB(T3a, T35), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 1)]), VADD(T2f, T2m), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 1)]), VADD(T35, T3a), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 13)]), VSUB(T2n, T2q), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 13)]), VSUB(T3c, T3b), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 5)]), VADD(T2n, T2q), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 5)]), VADD(T3b, T3c), ms, &(ii[WS(rs, 1)]));
+	       }
+	       {
+		    V TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
+		    {
+			 V Tj, TG, T2Q, T2V;
+			 Tj = VADD(T7, Ti);
+			 TG = VADD(Tu, TF);
+			 TH = VADD(Tj, TG);
+			 T2L = VSUB(Tj, TG);
+			 T2Q = VADD(T2s, T2t);
+			 T2V = VADD(T2R, T2U);
+			 T2W = VADD(T2Q, T2V);
+			 T2Y = VSUB(T2V, T2Q);
+		    }
+		    {
+			 V T14, T1r, T2M, T2N;
+			 T14 = VADD(TS, T13);
+			 T1r = VADD(T1f, T1q);
+			 T1s = VADD(T14, T1r);
+			 T2X = VSUB(T1r, T14);
+			 T2M = VADD(T2x, T2y);
+			 T2N = VADD(T2C, T2D);
+			 T2O = VSUB(T2M, T2N);
+			 T2P = VADD(T2M, T2N);
+		    }
+		    ST(&(ri[WS(rs, 8)]), VSUB(TH, T1s), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 8)]), VSUB(T2W, T2P), ms, &(ii[0]));
+		    ST(&(ri[0]), VADD(TH, T1s), ms, &(ri[0]));
+		    ST(&(ii[0]), VADD(T2P, T2W), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 12)]), VSUB(T2L, T2O), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 12)]), VSUB(T2Y, T2X), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 4)]), VADD(T2L, T2O), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 4)]), VADD(T2X, T2Y), ms, &(ii[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_16) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1sv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1sv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1sv_2 -include ts.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 2), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, Ta, T3, T6, T2, T5;
+	       T1 = LD(&(ri[0]), ms, &(ri[0]));
+	       Ta = LD(&(ii[0]), ms, &(ii[0]));
+	       T3 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+	       T6 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+	       T2 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 1]));
+	       {
+		    V T8, T4, T9, T7;
+		    T8 = VMUL(T2, T6);
+		    T4 = VMUL(T2, T3);
+		    T9 = VFNMS(T5, T3, T8);
+		    T7 = VFMA(T5, T6, T4);
+		    ST(&(ii[0]), VADD(T9, Ta), ms, &(ii[0]));
+		    ST(&(ii[WS(rs, 1)]), VSUB(Ta, T9), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ri[0]), VADD(T1, T7), ms, &(ri[0]));
+		    ST(&(ri[WS(rs, 1)]), VSUB(T1, T7), ms, &(ri[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1sv_2"), twinstr, &GENUS, {4, 2, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t1sv_2 -include ts.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 2); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 2), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T6, T7;
+	       T1 = LD(&(ri[0]), ms, &(ri[0]));
+	       T8 = LD(&(ii[0]), ms, &(ii[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T3 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+		    T5 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+		    T2 = LDW(&(W[0]));
+		    T4 = LDW(&(W[TWVL * 1]));
+		    T6 = VFMA(T2, T3, VMUL(T4, T5));
+		    T7 = VFNMS(T4, T3, VMUL(T2, T5));
+	       }
+	       ST(&(ri[WS(rs, 1)]), VSUB(T1, T6), ms, &(ri[WS(rs, 1)]));
+	       ST(&(ii[WS(rs, 1)]), VSUB(T8, T7), ms, &(ii[WS(rs, 1)]));
+	       ST(&(ri[0]), VADD(T1, T6), ms, &(ri[0]));
+	       ST(&(ii[0]), VADD(T7, T8), ms, &(ii[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t1sv_2"), twinstr, &GENUS, {4, 2, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_2) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1784 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:25 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 158 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T8Z, T90;
+	       {
+		    V T87, T8x, T3w, T8, T3B, T83, Tl, T8y, T6F, Tz, T3J, T5T, T6G, TM, T3Q;
+		    V T5U, T46, T5Y, T7D, T6L, T5X, T3Z, T6M, T1f, T4l, T61, T7E, T6R, T60, T4e;
+		    V T6O, T1G, T5r, T6c, T78, T7N, T54, T6f, T32, T7b, T4S, T65, T6X, T7I, T4v;
+		    V T68, T29, T70, T4x, T2f, T5b, T5s, T7O, T7e, T5t, T5i, T79, T3t, T2h, T2k;
+		    V T2j, T2o, T2r, T4H, T2y, T2n, T2q, T4y, T2i;
+		    {
+			 V T3U, TU, TW, TZ, TY, T13, T16, T12, T15, T3V, TX, T44, T1d;
+			 {
+			      V T1, T86, T3, T6, T5, Ta, Td, Tg, Tj, Tf, T84, T4, Tc, Ti, T3x;
+			      V Tb, T2, T9;
+			      T1 = LD(&(ri[0]), ms, &(ri[0]));
+			      T86 = LD(&(ii[0]), ms, &(ii[0]));
+			      T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
+			      T6 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
+			      T2 = LDW(&(W[TWVL * 30]));
+			      T5 = LDW(&(W[TWVL * 31]));
+			      Ta = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			      Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			      T9 = LDW(&(W[TWVL * 14]));
+			      Tg = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
+			      Tj = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
+			      Tf = LDW(&(W[TWVL * 46]));
+			      T84 = VMUL(T2, T6);
+			      T4 = VMUL(T2, T3);
+			      Tc = LDW(&(W[TWVL * 15]));
+			      Ti = LDW(&(W[TWVL * 47]));
+			      T3x = VMUL(T9, Td);
+			      Tb = VMUL(T9, Ta);
+			      {
+				   V Tu, Tx, T3F, Ts, Tt, Tw;
+				   {
+					V To, Tr, Tq, T3E, Tp;
+					{
+					     V T3y, Te, Tn, T3A, Tk;
+					     {
+						  V T3z, Th, T85, T7;
+						  To = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+						  T3z = VMUL(Tf, Tj);
+						  Th = VMUL(Tf, Tg);
+						  T85 = VFNMS(T5, T3, T84);
+						  T7 = VFMA(T5, T6, T4);
+						  Tr = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+						  T3y = VFNMS(Tc, Ta, T3x);
+						  Te = VFMA(Tc, Td, Tb);
+						  Tn = LDW(&(W[TWVL * 6]));
+						  T3A = VFNMS(Ti, Tg, T3z);
+						  Tk = VFMA(Ti, Tj, Th);
+						  T87 = VADD(T85, T86);
+						  T8x = VSUB(T86, T85);
+						  T3w = VSUB(T1, T7);
+						  T8 = VADD(T1, T7);
+					     }
+					     Tq = LDW(&(W[TWVL * 7]));
+					     T3E = VMUL(Tn, Tr);
+					     Tp = VMUL(Tn, To);
+					     T3B = VSUB(T3y, T3A);
+					     T83 = VADD(T3y, T3A);
+					     Tl = VADD(Te, Tk);
+					     T8y = VSUB(Te, Tk);
+					}
+					Tu = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
+					Tx = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
+					T3F = VFNMS(Tq, To, T3E);
+					Ts = VFMA(Tq, Tr, Tp);
+					Tt = LDW(&(W[TWVL * 38]));
+					Tw = LDW(&(W[TWVL * 39]));
+				   }
+				   {
+					V TB, TE, TD, TH, TK, T3G, Tv, TG, TJ, T3L, TC, TA;
+					TB = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
+					TE = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
+					TA = LDW(&(W[TWVL * 54]));
+					TD = LDW(&(W[TWVL * 55]));
+					TH = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+					TK = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+					T3G = VMUL(Tt, Tx);
+					Tv = VMUL(Tt, Tu);
+					TG = LDW(&(W[TWVL * 22]));
+					TJ = LDW(&(W[TWVL * 23]));
+					T3L = VMUL(TA, TE);
+					TC = VMUL(TA, TB);
+					{
+					     V T19, T1c, T3P, T3K, T18, T1b, TV, T43, T1a;
+					     {
+						  V TQ, TT, T3M, TF, TS, T3I, T3D, T3O, TL, T3T, TR;
+						  {
+						       V T3H, Ty, T3N, TI, TP;
+						       TQ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+						       TT = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+						       T3H = VFNMS(Tw, Tu, T3G);
+						       Ty = VFMA(Tw, Tx, Tv);
+						       T3N = VMUL(TG, TK);
+						       TI = VMUL(TG, TH);
+						       T3M = VFNMS(TD, TB, T3L);
+						       TF = VFMA(TD, TE, TC);
+						       TP = LDW(&(W[TWVL * 2]));
+						       TS = LDW(&(W[TWVL * 3]));
+						       T6F = VADD(T3F, T3H);
+						       T3I = VSUB(T3F, T3H);
+						       Tz = VADD(Ts, Ty);
+						       T3D = VSUB(Ts, Ty);
+						       T3O = VFNMS(TJ, TH, T3N);
+						       TL = VFMA(TJ, TK, TI);
+						       T3T = VMUL(TP, TT);
+						       TR = VMUL(TP, TQ);
+						  }
+						  T19 = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
+						  T1c = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
+						  T3J = VADD(T3D, T3I);
+						  T5T = VSUB(T3I, T3D);
+						  T6G = VADD(T3M, T3O);
+						  T3P = VSUB(T3M, T3O);
+						  TM = VADD(TF, TL);
+						  T3K = VSUB(TF, TL);
+						  T3U = VFNMS(TS, TQ, T3T);
+						  TU = VFMA(TS, TT, TR);
+						  T18 = LDW(&(W[TWVL * 50]));
+						  T1b = LDW(&(W[TWVL * 51]));
+					     }
+					     TW = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
+					     TZ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
+					     T3Q = VSUB(T3K, T3P);
+					     T5U = VADD(T3K, T3P);
+					     TV = LDW(&(W[TWVL * 34]));
+					     TY = LDW(&(W[TWVL * 35]));
+					     T43 = VMUL(T18, T1c);
+					     T1a = VMUL(T18, T19);
+					     T13 = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+					     T16 = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+					     T12 = LDW(&(W[TWVL * 18]));
+					     T15 = LDW(&(W[TWVL * 19]));
+					     T3V = VMUL(TV, TZ);
+					     TX = VMUL(TV, TW);
+					     T44 = VFNMS(T1b, T19, T43);
+					     T1d = VFMA(T1b, T1c, T1a);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T4Z, T2H, T2J, T2M, T2L, T2Q, T2T, T2P, T2S, T5p, T30, T50, T2K;
+			      {
+				   V T49, T1l, T1n, T1q, T1p, T1u, T1x, T4j, T1E, T1t, T1w, T4a, T1o;
+				   {
+					V T1A, T1D, T1C, T4i, T1B, T1m;
+					{
+					     V T1h, T1k, T41, T14, T3W, T10, T1g, T1j;
+					     T1h = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
+					     T1k = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
+					     T41 = VMUL(T12, T16);
+					     T14 = VMUL(T12, T13);
+					     T3W = VFNMS(TY, TW, T3V);
+					     T10 = VFMA(TY, TZ, TX);
+					     T1g = LDW(&(W[TWVL * 58]));
+					     T1j = LDW(&(W[TWVL * 59]));
+					     {
+						  V T6J, T3X, T11, T40, T48, T1i, T6K, T45, T1e, T3Y, T1z, T42, T17;
+						  T1A = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
+						  T1D = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
+						  T42 = VFNMS(T15, T13, T41);
+						  T17 = VFMA(T15, T16, T14);
+						  T6J = VADD(T3U, T3W);
+						  T3X = VSUB(T3U, T3W);
+						  T11 = VADD(TU, T10);
+						  T40 = VSUB(TU, T10);
+						  T48 = VMUL(T1g, T1k);
+						  T1i = VMUL(T1g, T1h);
+						  T6K = VADD(T42, T44);
+						  T45 = VSUB(T42, T44);
+						  T1e = VADD(T17, T1d);
+						  T3Y = VSUB(T17, T1d);
+						  T1z = LDW(&(W[TWVL * 42]));
+						  T1C = LDW(&(W[TWVL * 43]));
+						  T49 = VFNMS(T1j, T1h, T48);
+						  T1l = VFMA(T1j, T1k, T1i);
+						  T46 = VADD(T40, T45);
+						  T5Y = VSUB(T40, T45);
+						  T7D = VADD(T6J, T6K);
+						  T6L = VSUB(T6J, T6K);
+						  T5X = VADD(T3X, T3Y);
+						  T3Z = VSUB(T3X, T3Y);
+						  T6M = VSUB(T11, T1e);
+						  T1f = VADD(T11, T1e);
+						  T4i = VMUL(T1z, T1D);
+						  T1B = VMUL(T1z, T1A);
+					     }
+					}
+					T1n = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+					T1q = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+					T1m = LDW(&(W[TWVL * 26]));
+					T1p = LDW(&(W[TWVL * 27]));
+					T1u = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+					T1x = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+					T4j = VFNMS(T1C, T1A, T4i);
+					T1E = VFMA(T1C, T1D, T1B);
+					T1t = LDW(&(W[TWVL * 10]));
+					T1w = LDW(&(W[TWVL * 11]));
+					T4a = VMUL(T1m, T1q);
+					T1o = VMUL(T1m, T1n);
+				   }
+				   {
+					V T2W, T2Z, T6P, T4c, T1s, T4f, T6Q, T4k, T1F, T4d, T2V, T2Y, T5o, T2X, T2I;
+					{
+					     V T2D, T2G, T2C, T2F, T4g, T1v, T4b, T1r;
+					     T2D = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
+					     T2G = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
+					     T2C = LDW(&(W[TWVL * 60]));
+					     T2F = LDW(&(W[TWVL * 61]));
+					     T4g = VMUL(T1t, T1x);
+					     T1v = VMUL(T1t, T1u);
+					     T4b = VFNMS(T1p, T1n, T4a);
+					     T1r = VFMA(T1p, T1q, T1o);
+					     T2W = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
+					     T2Z = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
+					     {
+						  V T4Y, T2E, T4h, T1y;
+						  T4Y = VMUL(T2C, T2G);
+						  T2E = VMUL(T2C, T2D);
+						  T4h = VFNMS(T1w, T1u, T4g);
+						  T1y = VFMA(T1w, T1x, T1v);
+						  T6P = VADD(T49, T4b);
+						  T4c = VSUB(T49, T4b);
+						  T1s = VADD(T1l, T1r);
+						  T4f = VSUB(T1l, T1r);
+						  T4Z = VFNMS(T2F, T2D, T4Y);
+						  T2H = VFMA(T2F, T2G, T2E);
+						  T6Q = VADD(T4h, T4j);
+						  T4k = VSUB(T4h, T4j);
+						  T1F = VADD(T1y, T1E);
+						  T4d = VSUB(T1y, T1E);
+						  T2V = LDW(&(W[TWVL * 44]));
+					     }
+					     T2Y = LDW(&(W[TWVL * 45]));
+					}
+					T2J = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+					T2M = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+					T4l = VADD(T4f, T4k);
+					T61 = VSUB(T4f, T4k);
+					T7E = VADD(T6P, T6Q);
+					T6R = VSUB(T6P, T6Q);
+					T60 = VADD(T4c, T4d);
+					T4e = VSUB(T4c, T4d);
+					T6O = VSUB(T1s, T1F);
+					T1G = VADD(T1s, T1F);
+					T5o = VMUL(T2V, T2Z);
+					T2X = VMUL(T2V, T2W);
+					T2I = LDW(&(W[TWVL * 28]));
+					T2L = LDW(&(W[TWVL * 29]));
+					T2Q = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+					T2T = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+					T2P = LDW(&(W[TWVL * 12]));
+					T2S = LDW(&(W[TWVL * 13]));
+					T5p = VFNMS(T2Y, T2W, T5o);
+					T30 = VFMA(T2Y, T2Z, T2X);
+					T50 = VMUL(T2I, T2M);
+					T2K = VMUL(T2I, T2J);
+				   }
+			      }
+			      {
+				   V T4q, T1O, T1Q, T1T, T1S, T1X, T20, T4Q, T27, T1W, T1Z, T4r, T1R;
+				   {
+					V T23, T26, T25, T4P, T24, T1P;
+					{
+					     V T1K, T1N, T5m, T2R, T1J, T1M, T51, T2N;
+					     T1K = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+					     T1N = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+					     T5m = VMUL(T2P, T2T);
+					     T2R = VMUL(T2P, T2Q);
+					     T1J = LDW(&(W[0]));
+					     T1M = LDW(&(W[TWVL * 1]));
+					     T51 = VFNMS(T2L, T2J, T50);
+					     T2N = VFMA(T2L, T2M, T2K);
+					     {
+						  V T76, T52, T2O, T5l, T77, T5q, T31, T53, T22;
+						  T23 = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
+						  T26 = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
+						  {
+						       V T5n, T2U, T4p, T1L;
+						       T5n = VFNMS(T2S, T2Q, T5m);
+						       T2U = VFMA(T2S, T2T, T2R);
+						       T4p = VMUL(T1J, T1N);
+						       T1L = VMUL(T1J, T1K);
+						       T76 = VADD(T4Z, T51);
+						       T52 = VSUB(T4Z, T51);
+						       T2O = VADD(T2H, T2N);
+						       T5l = VSUB(T2H, T2N);
+						       T77 = VADD(T5n, T5p);
+						       T5q = VSUB(T5n, T5p);
+						       T31 = VADD(T2U, T30);
+						       T53 = VSUB(T2U, T30);
+						       T4q = VFNMS(T1M, T1K, T4p);
+						       T1O = VFMA(T1M, T1N, T1L);
+						       T22 = LDW(&(W[TWVL * 48]));
+						  }
+						  T25 = LDW(&(W[TWVL * 49]));
+						  T5r = VADD(T5l, T5q);
+						  T6c = VSUB(T5l, T5q);
+						  T78 = VSUB(T76, T77);
+						  T7N = VADD(T76, T77);
+						  T54 = VSUB(T52, T53);
+						  T6f = VADD(T52, T53);
+						  T32 = VADD(T2O, T31);
+						  T7b = VSUB(T2O, T31);
+						  T4P = VMUL(T22, T26);
+						  T24 = VMUL(T22, T23);
+					     }
+					}
+					T1Q = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
+					T1T = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
+					T1P = LDW(&(W[TWVL * 32]));
+					T1S = LDW(&(W[TWVL * 33]));
+					T1X = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+					T20 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+					T4Q = VFNMS(T25, T23, T4P);
+					T27 = VFMA(T25, T26, T24);
+					T1W = LDW(&(W[TWVL * 16]));
+					T1Z = LDW(&(W[TWVL * 17]));
+					T4r = VMUL(T1P, T1T);
+					T1R = VMUL(T1P, T1Q);
+				   }
+				   {
+					V T56, T38, T3a, T3d, T3c, T3h, T3k, T3g, T3j, T5g, T3r, T57, T3b;
+					{
+					     V T3n, T3q, T6V, T4t, T1V, T4M, T6W, T4R, T28, T4u, T3m, T3p, T5f, T3o, T39;
+					     {
+						  V T34, T37, T33, T36, T4N, T1Y, T4s, T1U;
+						  T34 = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+						  T37 = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+						  T33 = LDW(&(W[TWVL * 4]));
+						  T36 = LDW(&(W[TWVL * 5]));
+						  T4N = VMUL(T1W, T20);
+						  T1Y = VMUL(T1W, T1X);
+						  T4s = VFNMS(T1S, T1Q, T4r);
+						  T1U = VFMA(T1S, T1T, T1R);
+						  T3n = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+						  T3q = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+						  {
+						       V T55, T35, T4O, T21;
+						       T55 = VMUL(T33, T37);
+						       T35 = VMUL(T33, T34);
+						       T4O = VFNMS(T1Z, T1X, T4N);
+						       T21 = VFMA(T1Z, T20, T1Y);
+						       T6V = VADD(T4q, T4s);
+						       T4t = VSUB(T4q, T4s);
+						       T1V = VADD(T1O, T1U);
+						       T4M = VSUB(T1O, T1U);
+						       T56 = VFNMS(T36, T34, T55);
+						       T38 = VFMA(T36, T37, T35);
+						       T6W = VADD(T4O, T4Q);
+						       T4R = VSUB(T4O, T4Q);
+						       T28 = VADD(T21, T27);
+						       T4u = VSUB(T21, T27);
+						       T3m = LDW(&(W[TWVL * 20]));
+						  }
+						  T3p = LDW(&(W[TWVL * 21]));
+					     }
+					     T3a = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
+					     T3d = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
+					     T4S = VADD(T4M, T4R);
+					     T65 = VSUB(T4M, T4R);
+					     T6X = VSUB(T6V, T6W);
+					     T7I = VADD(T6V, T6W);
+					     T4v = VSUB(T4t, T4u);
+					     T68 = VADD(T4t, T4u);
+					     T29 = VADD(T1V, T28);
+					     T70 = VSUB(T1V, T28);
+					     T5f = VMUL(T3m, T3q);
+					     T3o = VMUL(T3m, T3n);
+					     T39 = LDW(&(W[TWVL * 36]));
+					     T3c = LDW(&(W[TWVL * 37]));
+					     T3h = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
+					     T3k = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
+					     T3g = LDW(&(W[TWVL * 52]));
+					     T3j = LDW(&(W[TWVL * 53]));
+					     T5g = VFNMS(T3p, T3n, T5f);
+					     T3r = VFMA(T3p, T3q, T3o);
+					     T57 = VMUL(T39, T3d);
+					     T3b = VMUL(T39, T3a);
+					}
+					{
+					     V T2u, T2x, T2w, T4G, T2v, T2g;
+					     {
+						  V T2b, T2e, T5d, T3i, T2a, T2d, T58, T3e, T2t;
+						  T2b = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+						  T2e = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+						  T5d = VMUL(T3g, T3k);
+						  T3i = VMUL(T3g, T3h);
+						  T2a = LDW(&(W[TWVL * 8]));
+						  T2d = LDW(&(W[TWVL * 9]));
+						  T58 = VFNMS(T3c, T3a, T57);
+						  T3e = VFMA(T3c, T3d, T3b);
+						  T2u = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+						  T2x = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+						  {
+						       V T5e, T3l, T4w, T2c;
+						       T5e = VFNMS(T3j, T3h, T5d);
+						       T3l = VFMA(T3j, T3k, T3i);
+						       T4w = VMUL(T2a, T2e);
+						       T2c = VMUL(T2a, T2b);
+						       {
+							    V T7c, T59, T3f, T5a;
+							    T7c = VADD(T56, T58);
+							    T59 = VSUB(T56, T58);
+							    T3f = VADD(T38, T3e);
+							    T5a = VSUB(T38, T3e);
+							    {
+								 V T7d, T5h, T3s, T5c;
+								 T7d = VADD(T5e, T5g);
+								 T5h = VSUB(T5e, T5g);
+								 T3s = VADD(T3l, T3r);
+								 T5c = VSUB(T3l, T3r);
+								 T4x = VFNMS(T2d, T2b, T4w);
+								 T2f = VFMA(T2d, T2e, T2c);
+								 T5b = VSUB(T59, T5a);
+								 T5s = VADD(T5a, T59);
+								 T2t = LDW(&(W[TWVL * 24]));
+								 T7O = VADD(T7c, T7d);
+								 T7e = VSUB(T7c, T7d);
+								 T5t = VSUB(T5c, T5h);
+								 T5i = VADD(T5c, T5h);
+								 T79 = VSUB(T3s, T3f);
+								 T3t = VADD(T3f, T3s);
+							    }
+						       }
+						  }
+						  T2w = LDW(&(W[TWVL * 25]));
+						  T4G = VMUL(T2t, T2x);
+						  T2v = VMUL(T2t, T2u);
+					     }
+					     T2h = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
+					     T2k = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
+					     T2g = LDW(&(W[TWVL * 40]));
+					     T2j = LDW(&(W[TWVL * 41]));
+					     T2o = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
+					     T2r = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
+					     T4H = VFNMS(T2w, T2u, T4G);
+					     T2y = VFMA(T2w, T2x, T2v);
+					     T2n = LDW(&(W[TWVL * 56]));
+					     T2q = LDW(&(W[TWVL * 57]));
+					     T4y = VMUL(T2g, T2k);
+					     T2i = VMUL(T2g, T2h);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T4C, T4T, T4U, T4J, T7A, T7w, T7j, T75, T7i, T6U, T8p, T8n, T8v, T8t, T7q;
+			 V T7y, T7t, T7z, T7g, T7k;
+			 {
+			      V T6E, T8j, T6H, T8k, T73, T6Y, T7S, T8i, T8h, T7V;
+			      {
+				   V T7P, T7Y, T7C, TO, T89, T8e, T3u, T7M, T8d, T1H, T7K, T7X, T2B, T7H;
+				   {
+					V T71, T2m, T72, T4I, T2z, T4D, Tm, TN, T2A, T7J;
+					T6E = VSUB(T8, Tl);
+					Tm = VADD(T8, Tl);
+					TN = VADD(Tz, TM);
+					T8j = VSUB(TM, Tz);
+					T7P = VSUB(T7N, T7O);
+					T7Y = VADD(T7N, T7O);
+					{
+					     V T82, T4E, T2p, T4z, T2l, T88;
+					     T82 = VADD(T6F, T6G);
+					     T6H = VSUB(T6F, T6G);
+					     T4E = VMUL(T2n, T2r);
+					     T2p = VMUL(T2n, T2o);
+					     T4z = VFNMS(T2j, T2h, T4y);
+					     T2l = VFMA(T2j, T2k, T2i);
+					     T8k = VSUB(T87, T83);
+					     T88 = VADD(T83, T87);
+					     T7C = VSUB(Tm, TN);
+					     TO = VADD(Tm, TN);
+					     {
+						  V T4F, T2s, T4A, T4B;
+						  T4F = VFNMS(T2q, T2o, T4E);
+						  T2s = VFMA(T2q, T2r, T2p);
+						  T71 = VADD(T4x, T4z);
+						  T4A = VSUB(T4x, T4z);
+						  T2m = VADD(T2f, T2l);
+						  T4B = VSUB(T2f, T2l);
+						  T89 = VADD(T82, T88);
+						  T8e = VSUB(T88, T82);
+						  T72 = VADD(T4F, T4H);
+						  T4I = VSUB(T4F, T4H);
+						  T2z = VADD(T2s, T2y);
+						  T4D = VSUB(T2s, T2y);
+						  T4C = VSUB(T4A, T4B);
+						  T4T = VADD(T4B, T4A);
+					     }
+					}
+					T3u = VADD(T32, T3t);
+					T7M = VSUB(T32, T3t);
+					T7J = VADD(T71, T72);
+					T73 = VSUB(T71, T72);
+					T4U = VSUB(T4D, T4I);
+					T4J = VADD(T4D, T4I);
+					T6Y = VSUB(T2z, T2m);
+					T2A = VADD(T2m, T2z);
+					T8d = VSUB(T1G, T1f);
+					T1H = VADD(T1f, T1G);
+					T7K = VSUB(T7I, T7J);
+					T7X = VADD(T7I, T7J);
+					T2B = VADD(T29, T2A);
+					T7H = VSUB(T29, T2A);
+				   }
+				   {
+					V T1I, T80, T7Q, T7U, T7F, T7L, T7T, T3v, T8b, T8c, T8a, T7W, T81, T7Z;
+					T7W = VSUB(TO, T1H);
+					T1I = VADD(TO, T1H);
+					T7Z = VSUB(T7X, T7Y);
+					T80 = VADD(T7X, T7Y);
+					T7Q = VSUB(T7M, T7P);
+					T7U = VADD(T7M, T7P);
+					T7F = VSUB(T7D, T7E);
+					T81 = VADD(T7D, T7E);
+					T7L = VADD(T7H, T7K);
+					T7T = VSUB(T7K, T7H);
+					T3v = VADD(T2B, T3u);
+					T8b = VSUB(T3u, T2B);
+					ST(&(ri[WS(rs, 24)]), VSUB(T7W, T7Z), ms, &(ri[0]));
+					ST(&(ri[WS(rs, 8)]), VADD(T7W, T7Z), ms, &(ri[0]));
+					T8c = VSUB(T89, T81);
+					T8a = VADD(T81, T89);
+					{
+					     V T8f, T8g, T7G, T7R;
+					     T7S = VSUB(T7C, T7F);
+					     T7G = VADD(T7C, T7F);
+					     T7R = VADD(T7L, T7Q);
+					     T8i = VSUB(T7Q, T7L);
+					     T8h = VSUB(T8e, T8d);
+					     T8f = VADD(T8d, T8e);
+					     ST(&(ri[0]), VADD(T1I, T3v), ms, &(ri[0]));
+					     ST(&(ri[WS(rs, 16)]), VSUB(T1I, T3v), ms, &(ri[0]));
+					     T8g = VADD(T7T, T7U);
+					     T7V = VSUB(T7T, T7U);
+					     ST(&(ii[WS(rs, 16)]), VSUB(T8a, T80), ms, &(ii[0]));
+					     ST(&(ii[0]), VADD(T80, T8a), ms, &(ii[0]));
+					     ST(&(ii[WS(rs, 24)]), VSUB(T8c, T8b), ms, &(ii[0]));
+					     ST(&(ii[WS(rs, 8)]), VADD(T8b, T8c), ms, &(ii[0]));
+					     ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
+					     ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
+					     ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
+					     ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
+					}
+				   }
+			      }
+			      {
+				   V T7f, T7a, T7m, T6I, T7s, T7r, T8r, T8l, T8m, T6T, T8s, T7p;
+				   {
+					V T7n, T6N, T6S, T7o, T7u, T7v, T6Z, T74;
+					T7f = VSUB(T7b, T7e);
+					T7u = VADD(T7b, T7e);
+					T7v = VADD(T78, T79);
+					T7a = VSUB(T78, T79);
+					ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
+					ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
+					ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
+					ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
+					T7m = VADD(T6E, T6H);
+					T6I = VSUB(T6E, T6H);
+					T7A = VFMA(LDK(KP414213562), T7u, T7v);
+					T7w = VFNMS(LDK(KP414213562), T7v, T7u);
+					T7n = VADD(T6M, T6L);
+					T6N = VSUB(T6L, T6M);
+					T6S = VADD(T6O, T6R);
+					T7o = VSUB(T6O, T6R);
+					T7s = VADD(T6X, T6Y);
+					T6Z = VSUB(T6X, T6Y);
+					T74 = VSUB(T70, T73);
+					T7r = VADD(T70, T73);
+					T8r = VSUB(T8k, T8j);
+					T8l = VADD(T8j, T8k);
+					T8m = VADD(T6N, T6S);
+					T6T = VSUB(T6N, T6S);
+					T7j = VFNMS(LDK(KP414213562), T6Z, T74);
+					T75 = VFMA(LDK(KP414213562), T74, T6Z);
+					T8s = VSUB(T7o, T7n);
+					T7p = VADD(T7n, T7o);
+				   }
+				   T7i = VFNMS(LDK(KP707106781), T6T, T6I);
+				   T6U = VFMA(LDK(KP707106781), T6T, T6I);
+				   T8p = VFNMS(LDK(KP707106781), T8m, T8l);
+				   T8n = VFMA(LDK(KP707106781), T8m, T8l);
+				   T8v = VFNMS(LDK(KP707106781), T8s, T8r);
+				   T8t = VFMA(LDK(KP707106781), T8s, T8r);
+				   T7q = VFMA(LDK(KP707106781), T7p, T7m);
+				   T7y = VFNMS(LDK(KP707106781), T7p, T7m);
+				   T7t = VFMA(LDK(KP414213562), T7s, T7r);
+				   T7z = VFNMS(LDK(KP414213562), T7r, T7s);
+				   T7g = VFNMS(LDK(KP414213562), T7f, T7a);
+				   T7k = VFMA(LDK(KP414213562), T7a, T7f);
+			      }
+			 }
+			 {
+			      V T5S, T8O, T8N, T5V, T6d, T6g, T66, T4L, T5I, T69, T5y, T4o, T8J, T8L, T5M;
+			      V T5Q, T5A, T5w, T5H, T4W, T5O, T5G, T8D, T8F;
+			      {
+				   V T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T5L, T5k, T5K, T5v, T4V;
+				   {
+					V T5D, T47, T4m, T5E, T8z, T8A, T3C, T3R, T5j, T5u, T4K;
+					T5S = VSUB(T3w, T3B);
+					T3C = VADD(T3w, T3B);
+					T3R = VADD(T3J, T3Q);
+					T8O = VSUB(T3Q, T3J);
+					{
+					     V T8o, T7B, T7x, T8q;
+					     T8o = VADD(T7z, T7A);
+					     T7B = VSUB(T7z, T7A);
+					     T7x = VADD(T7t, T7w);
+					     T8q = VSUB(T7w, T7t);
+					     {
+						  V T8u, T7l, T7h, T8w;
+						  T8u = VSUB(T7k, T7j);
+						  T7l = VADD(T7j, T7k);
+						  T7h = VSUB(T75, T7g);
+						  T8w = VADD(T75, T7g);
+						  ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
+						  ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
+						  ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
+						  ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
+						  ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
+						  T5C = VFMA(LDK(KP707106781), T3R, T3C);
+						  T3S = VFNMS(LDK(KP707106781), T3R, T3C);
+					     }
+					}
+					T5D = VFMA(LDK(KP414213562), T3Z, T46);
+					T47 = VFNMS(LDK(KP414213562), T46, T3Z);
+					T4m = VFMA(LDK(KP414213562), T4l, T4e);
+					T5E = VFNMS(LDK(KP414213562), T4e, T4l);
+					T8N = VADD(T8y, T8x);
+					T8z = VSUB(T8x, T8y);
+					T8A = VADD(T5T, T5U);
+					T5V = VSUB(T5T, T5U);
+					T6d = VSUB(T5i, T5b);
+					T5j = VADD(T5b, T5i);
+					T5u = VADD(T5s, T5t);
+					T6g = VSUB(T5s, T5t);
+					T66 = VSUB(T4J, T4C);
+					T4K = VADD(T4C, T4J);
+					T8C = VADD(T47, T4m);
+					T4n = VSUB(T47, T4m);
+					T8H = VFNMS(LDK(KP707106781), T8A, T8z);
+					T8B = VFMA(LDK(KP707106781), T8A, T8z);
+					T8I = VSUB(T5E, T5D);
+					T5F = VADD(T5D, T5E);
+					T5L = VFMA(LDK(KP707106781), T5j, T54);
+					T5k = VFNMS(LDK(KP707106781), T5j, T54);
+					T5K = VFMA(LDK(KP707106781), T5u, T5r);
+					T5v = VFNMS(LDK(KP707106781), T5u, T5r);
+					T4L = VFNMS(LDK(KP707106781), T4K, T4v);
+					T5I = VFMA(LDK(KP707106781), T4K, T4v);
+					T4V = VADD(T4T, T4U);
+					T69 = VSUB(T4T, T4U);
+				   }
+				   T5y = VFNMS(LDK(KP923879532), T4n, T3S);
+				   T4o = VFMA(LDK(KP923879532), T4n, T3S);
+				   T8J = VFMA(LDK(KP923879532), T8I, T8H);
+				   T8L = VFNMS(LDK(KP923879532), T8I, T8H);
+				   T5M = VFNMS(LDK(KP198912367), T5L, T5K);
+				   T5Q = VFMA(LDK(KP198912367), T5K, T5L);
+				   T5A = VFMA(LDK(KP668178637), T5k, T5v);
+				   T5w = VFNMS(LDK(KP668178637), T5v, T5k);
+				   T5H = VFMA(LDK(KP707106781), T4V, T4S);
+				   T4W = VFNMS(LDK(KP707106781), T4V, T4S);
+				   T5O = VFNMS(LDK(KP923879532), T5F, T5C);
+				   T5G = VFMA(LDK(KP923879532), T5F, T5C);
+				   T8D = VFMA(LDK(KP923879532), T8C, T8B);
+				   T8F = VFNMS(LDK(KP923879532), T8C, T8B);
+			      }
+			      {
+				   V T6p, T6q, T6o, T5W, T8W, T63;
+				   {
+					V T5J, T5P, T5z, T4X, T5Z, T62;
+					T5J = VFMA(LDK(KP198912367), T5I, T5H);
+					T5P = VFNMS(LDK(KP198912367), T5H, T5I);
+					T5z = VFNMS(LDK(KP668178637), T4L, T4W);
+					T4X = VFMA(LDK(KP668178637), T4W, T4L);
+					T6p = VFNMS(LDK(KP414213562), T5X, T5Y);
+					T5Z = VFMA(LDK(KP414213562), T5Y, T5X);
+					T62 = VFNMS(LDK(KP414213562), T61, T60);
+					T6q = VFMA(LDK(KP414213562), T60, T61);
+					{
+					     V T8G, T5N, T5R, T8E;
+					     T8G = VSUB(T5M, T5J);
+					     T5N = VADD(T5J, T5M);
+					     T5R = VSUB(T5P, T5Q);
+					     T8E = VADD(T5P, T5Q);
+					     {
+						  V T5B, T8K, T8M, T5x;
+						  T5B = VADD(T5z, T5A);
+						  T8K = VSUB(T5A, T5z);
+						  T8M = VADD(T4X, T5w);
+						  T5x = VSUB(T4X, T5w);
+						  T6o = VFNMS(LDK(KP707106781), T5V, T5S);
+						  T5W = VFMA(LDK(KP707106781), T5V, T5S);
+						  T8W = VADD(T5Z, T62);
+						  T63 = VSUB(T5Z, T62);
+						  ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
+						  ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
+					     }
+					}
+				   }
+				   {
+					V T6k, T64, T8V, T6r, T8R, T8T, T6y, T6C, T6m, T6i, T6v, T6B, T6l, T6b, T6A;
+					V T6s, T8X;
+					{
+					     V T6x, T6e, T6w, T6h, T6u, T67, T6t, T6a, T8P, T8Q;
+					     T6k = VFNMS(LDK(KP923879532), T63, T5W);
+					     T64 = VFMA(LDK(KP923879532), T63, T5W);
+					     T8V = VFNMS(LDK(KP707106781), T8O, T8N);
+					     T8P = VFMA(LDK(KP707106781), T8O, T8N);
+					     T8Q = VSUB(T6q, T6p);
+					     T6r = VADD(T6p, T6q);
+					     T6x = VFMA(LDK(KP707106781), T6d, T6c);
+					     T6e = VFNMS(LDK(KP707106781), T6d, T6c);
+					     T6w = VFMA(LDK(KP707106781), T6g, T6f);
+					     T6h = VFNMS(LDK(KP707106781), T6g, T6f);
+					     T6u = VFMA(LDK(KP707106781), T66, T65);
+					     T67 = VFNMS(LDK(KP707106781), T66, T65);
+					     T6t = VFMA(LDK(KP707106781), T69, T68);
+					     T6a = VFNMS(LDK(KP707106781), T69, T68);
+					     T8R = VFMA(LDK(KP923879532), T8Q, T8P);
+					     T8T = VFNMS(LDK(KP923879532), T8Q, T8P);
+					     T6y = VFNMS(LDK(KP198912367), T6x, T6w);
+					     T6C = VFMA(LDK(KP198912367), T6w, T6x);
+					     T6m = VFMA(LDK(KP668178637), T6e, T6h);
+					     T6i = VFNMS(LDK(KP668178637), T6h, T6e);
+					     T6v = VFMA(LDK(KP198912367), T6u, T6t);
+					     T6B = VFNMS(LDK(KP198912367), T6t, T6u);
+					     T6l = VFNMS(LDK(KP668178637), T67, T6a);
+					     T6b = VFMA(LDK(KP668178637), T6a, T67);
+					}
+					T6A = VFMA(LDK(KP923879532), T6r, T6o);
+					T6s = VFNMS(LDK(KP923879532), T6r, T6o);
+					T8X = VFNMS(LDK(KP923879532), T8W, T8V);
+					T8Z = VFMA(LDK(KP923879532), T8W, T8V);
+					{
+					     V T6z, T6D, T8Y, T6n, T8S, T8U, T6j;
+					     T6z = VSUB(T6v, T6y);
+					     T90 = VADD(T6v, T6y);
+					     T6D = VADD(T6B, T6C);
+					     T8Y = VSUB(T6C, T6B);
+					     T6n = VSUB(T6l, T6m);
+					     T8S = VADD(T6l, T6m);
+					     T8U = VSUB(T6i, T6b);
+					     T6j = VADD(T6b, T6i);
+					     ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
+	       ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 96 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
+	       V T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
+	       V T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
+	       V T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
+	       V T4m, T5h, T4v, T5e;
+	       {
+		    V T1, T76, T6, T75, Tc, T32, Th, T33;
+		    T1 = LD(&(ri[0]), ms, &(ri[0]));
+		    T76 = LD(&(ii[0]), ms, &(ii[0]));
+		    {
+			 V T3, T5, T2, T4;
+			 T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
+			 T5 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
+			 T2 = LDW(&(W[TWVL * 30]));
+			 T4 = LDW(&(W[TWVL * 31]));
+			 T6 = VFMA(T2, T3, VMUL(T4, T5));
+			 T75 = VFNMS(T4, T3, VMUL(T2, T5));
+		    }
+		    {
+			 V T9, Tb, T8, Ta;
+			 T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			 Tb = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			 T8 = LDW(&(W[TWVL * 14]));
+			 Ta = LDW(&(W[TWVL * 15]));
+			 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
+			 T32 = VFNMS(Ta, T9, VMUL(T8, Tb));
+		    }
+		    {
+			 V Te, Tg, Td, Tf;
+			 Te = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
+			 Tg = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
+			 Td = LDW(&(W[TWVL * 46]));
+			 Tf = LDW(&(W[TWVL * 47]));
+			 Th = VFMA(Td, Te, VMUL(Tf, Tg));
+			 T33 = VFNMS(Tf, Te, VMUL(Td, Tg));
+		    }
+		    {
+			 V T7, Ti, T7A, T7B;
+			 T7 = VADD(T1, T6);
+			 Ti = VADD(Tc, Th);
+			 Tj = VADD(T7, Ti);
+			 T5F = VSUB(T7, Ti);
+			 T7A = VSUB(T76, T75);
+			 T7B = VSUB(Tc, Th);
+			 T7C = VSUB(T7A, T7B);
+			 T7Q = VADD(T7B, T7A);
+		    }
+		    {
+			 V T31, T34, T74, T77;
+			 T31 = VSUB(T1, T6);
+			 T34 = VSUB(T32, T33);
+			 T35 = VSUB(T31, T34);
+			 T4T = VADD(T31, T34);
+			 T74 = VADD(T32, T33);
+			 T77 = VADD(T75, T76);
+			 T78 = VADD(T74, T77);
+			 T7m = VSUB(T77, T74);
+		    }
+	       }
+	       {
+		    V T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
+		    {
+			 V T1v, T1x, T1u, T1w;
+			 T1v = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			 T1x = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			 T1u = LDW(&(W[0]));
+			 T1w = LDW(&(W[TWVL * 1]));
+			 T1y = VFMA(T1u, T1v, VMUL(T1w, T1x));
+			 T3G = VFNMS(T1w, T1v, VMUL(T1u, T1x));
+		    }
+		    {
+			 V T1L, T1N, T1K, T1M;
+			 T1L = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
+			 T1N = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
+			 T1K = LDW(&(W[TWVL * 48]));
+			 T1M = LDW(&(W[TWVL * 49]));
+			 T1O = VFMA(T1K, T1L, VMUL(T1M, T1N));
+			 T3Z = VFNMS(T1M, T1L, VMUL(T1K, T1N));
+		    }
+		    {
+			 V T1A, T1C, T1z, T1B;
+			 T1A = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
+			 T1C = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
+			 T1z = LDW(&(W[TWVL * 32]));
+			 T1B = LDW(&(W[TWVL * 33]));
+			 T1D = VFMA(T1z, T1A, VMUL(T1B, T1C));
+			 T3H = VFNMS(T1B, T1A, VMUL(T1z, T1C));
+		    }
+		    {
+			 V T1G, T1I, T1F, T1H;
+			 T1G = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+			 T1I = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+			 T1F = LDW(&(W[TWVL * 16]));
+			 T1H = LDW(&(W[TWVL * 17]));
+			 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
+			 T3Y = VFNMS(T1H, T1G, VMUL(T1F, T1I));
+		    }
+		    {
+			 V T1E, T1P, T5W, T5X;
+			 T1E = VADD(T1y, T1D);
+			 T1P = VADD(T1J, T1O);
+			 T1Q = VADD(T1E, T1P);
+			 T61 = VSUB(T1E, T1P);
+			 T5W = VADD(T3G, T3H);
+			 T5X = VADD(T3Y, T3Z);
+			 T5Y = VSUB(T5W, T5X);
+			 T6J = VADD(T5W, T5X);
+		    }
+		    {
+			 V T3I, T3J, T3X, T40;
+			 T3I = VSUB(T3G, T3H);
+			 T3J = VSUB(T1J, T1O);
+			 T3K = VADD(T3I, T3J);
+			 T59 = VSUB(T3I, T3J);
+			 T3X = VSUB(T1y, T1D);
+			 T40 = VSUB(T3Y, T3Z);
+			 T41 = VSUB(T3X, T40);
+			 T56 = VADD(T3X, T40);
+		    }
+	       }
+	       {
+		    V T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
+		    {
+			 V T2g, T2i, T2f, T2h;
+			 T2g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
+			 T2i = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
+			 T2f = LDW(&(W[TWVL * 60]));
+			 T2h = LDW(&(W[TWVL * 61]));
+			 T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
+			 T4o = VFNMS(T2h, T2g, VMUL(T2f, T2i));
+		    }
+		    {
+			 V T2w, T2y, T2v, T2x;
+			 T2w = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
+			 T2y = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
+			 T2v = LDW(&(W[TWVL * 44]));
+			 T2x = LDW(&(W[TWVL * 45]));
+			 T2z = VFMA(T2v, T2w, VMUL(T2x, T2y));
+			 T49 = VFNMS(T2x, T2w, VMUL(T2v, T2y));
+		    }
+		    {
+			 V T2l, T2n, T2k, T2m;
+			 T2l = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+			 T2n = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+			 T2k = LDW(&(W[TWVL * 28]));
+			 T2m = LDW(&(W[TWVL * 29]));
+			 T2o = VFMA(T2k, T2l, VMUL(T2m, T2n));
+			 T4p = VFNMS(T2m, T2l, VMUL(T2k, T2n));
+		    }
+		    {
+			 V T2r, T2t, T2q, T2s;
+			 T2r = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			 T2t = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			 T2q = LDW(&(W[TWVL * 12]));
+			 T2s = LDW(&(W[TWVL * 13]));
+			 T2u = VFMA(T2q, T2r, VMUL(T2s, T2t));
+			 T48 = VFNMS(T2s, T2r, VMUL(T2q, T2t));
+		    }
+		    {
+			 V T2p, T2A, T6c, T6d;
+			 T2p = VADD(T2j, T2o);
+			 T2A = VADD(T2u, T2z);
+			 T2B = VADD(T2p, T2A);
+			 T67 = VSUB(T2p, T2A);
+			 T6c = VADD(T4o, T4p);
+			 T6d = VADD(T48, T49);
+			 T6e = VSUB(T6c, T6d);
+			 T6O = VADD(T6c, T6d);
+		    }
+		    {
+			 V T47, T4a, T4q, T4r;
+			 T47 = VSUB(T2j, T2o);
+			 T4a = VSUB(T48, T49);
+			 T4b = VSUB(T47, T4a);
+			 T5d = VADD(T47, T4a);
+			 T4q = VSUB(T4o, T4p);
+			 T4r = VSUB(T2u, T2z);
+			 T4s = VADD(T4q, T4r);
+			 T5g = VSUB(T4q, T4r);
+		    }
+	       }
+	       {
+		    V To, T36, TE, T3d, Tt, T37, Tz, T3c;
+		    {
+			 V Tl, Tn, Tk, Tm;
+			 Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			 Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			 Tk = LDW(&(W[TWVL * 6]));
+			 Tm = LDW(&(W[TWVL * 7]));
+			 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
+			 T36 = VFNMS(Tm, Tl, VMUL(Tk, Tn));
+		    }
+		    {
+			 V TB, TD, TA, TC;
+			 TB = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+			 TD = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+			 TA = LDW(&(W[TWVL * 22]));
+			 TC = LDW(&(W[TWVL * 23]));
+			 TE = VFMA(TA, TB, VMUL(TC, TD));
+			 T3d = VFNMS(TC, TB, VMUL(TA, TD));
+		    }
+		    {
+			 V Tq, Ts, Tp, Tr;
+			 Tq = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
+			 Ts = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
+			 Tp = LDW(&(W[TWVL * 38]));
+			 Tr = LDW(&(W[TWVL * 39]));
+			 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
+			 T37 = VFNMS(Tr, Tq, VMUL(Tp, Ts));
+		    }
+		    {
+			 V Tw, Ty, Tv, Tx;
+			 Tw = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
+			 Ty = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
+			 Tv = LDW(&(W[TWVL * 54]));
+			 Tx = LDW(&(W[TWVL * 55]));
+			 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
+			 T3c = VFNMS(Tx, Tw, VMUL(Tv, Ty));
+		    }
+		    {
+			 V Tu, TF, T5G, T5H;
+			 Tu = VADD(To, Tt);
+			 TF = VADD(Tz, TE);
+			 TG = VADD(Tu, TF);
+			 T7l = VSUB(TF, Tu);
+			 T5G = VADD(T36, T37);
+			 T5H = VADD(T3c, T3d);
+			 T5I = VSUB(T5G, T5H);
+			 T73 = VADD(T5G, T5H);
+		    }
+		    {
+			 V T38, T39, T3b, T3e;
+			 T38 = VSUB(T36, T37);
+			 T39 = VSUB(To, Tt);
+			 T3a = VSUB(T38, T39);
+			 T4U = VADD(T39, T38);
+			 T3b = VSUB(Tz, TE);
+			 T3e = VSUB(T3c, T3d);
+			 T3f = VADD(T3b, T3e);
+			 T4V = VSUB(T3b, T3e);
+		    }
+	       }
+	       {
+		    V TM, T3i, T12, T3p, TR, T3j, TX, T3o;
+		    {
+			 V TJ, TL, TI, TK;
+			 TJ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			 TL = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			 TI = LDW(&(W[TWVL * 2]));
+			 TK = LDW(&(W[TWVL * 3]));
+			 TM = VFMA(TI, TJ, VMUL(TK, TL));
+			 T3i = VFNMS(TK, TJ, VMUL(TI, TL));
+		    }
+		    {
+			 V TZ, T11, TY, T10;
+			 TZ = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
+			 T11 = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
+			 TY = LDW(&(W[TWVL * 50]));
+			 T10 = LDW(&(W[TWVL * 51]));
+			 T12 = VFMA(TY, TZ, VMUL(T10, T11));
+			 T3p = VFNMS(T10, TZ, VMUL(TY, T11));
+		    }
+		    {
+			 V TO, TQ, TN, TP;
+			 TO = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
+			 TQ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
+			 TN = LDW(&(W[TWVL * 34]));
+			 TP = LDW(&(W[TWVL * 35]));
+			 TR = VFMA(TN, TO, VMUL(TP, TQ));
+			 T3j = VFNMS(TP, TO, VMUL(TN, TQ));
+		    }
+		    {
+			 V TU, TW, TT, TV;
+			 TU = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+			 TW = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+			 TT = LDW(&(W[TWVL * 18]));
+			 TV = LDW(&(W[TWVL * 19]));
+			 TX = VFMA(TT, TU, VMUL(TV, TW));
+			 T3o = VFNMS(TV, TU, VMUL(TT, TW));
+		    }
+		    {
+			 V TS, T13, T5K, T5L;
+			 TS = VADD(TM, TR);
+			 T13 = VADD(TX, T12);
+			 T14 = VADD(TS, T13);
+			 T5N = VSUB(TS, T13);
+			 T5K = VADD(T3i, T3j);
+			 T5L = VADD(T3o, T3p);
+			 T5M = VSUB(T5K, T5L);
+			 T6E = VADD(T5K, T5L);
+		    }
+		    {
+			 V T3k, T3l, T3n, T3q;
+			 T3k = VSUB(T3i, T3j);
+			 T3l = VSUB(TX, T12);
+			 T3m = VADD(T3k, T3l);
+			 T4Y = VSUB(T3k, T3l);
+			 T3n = VSUB(TM, TR);
+			 T3q = VSUB(T3o, T3p);
+			 T3r = VSUB(T3n, T3q);
+			 T4Z = VADD(T3n, T3q);
+		    }
+	       }
+	       {
+		    V T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
+		    {
+			 V T16, T18, T15, T17;
+			 T16 = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
+			 T18 = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
+			 T15 = LDW(&(W[TWVL * 58]));
+			 T17 = LDW(&(W[TWVL * 59]));
+			 T19 = VFMA(T15, T16, VMUL(T17, T18));
+			 T3t = VFNMS(T17, T16, VMUL(T15, T18));
+		    }
+		    {
+			 V T1m, T1o, T1l, T1n;
+			 T1m = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
+			 T1o = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
+			 T1l = LDW(&(W[TWVL * 42]));
+			 T1n = LDW(&(W[TWVL * 43]));
+			 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
+			 T3A = VFNMS(T1n, T1m, VMUL(T1l, T1o));
+		    }
+		    {
+			 V T1b, T1d, T1a, T1c;
+			 T1b = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+			 T1d = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+			 T1a = LDW(&(W[TWVL * 26]));
+			 T1c = LDW(&(W[TWVL * 27]));
+			 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
+			 T3u = VFNMS(T1c, T1b, VMUL(T1a, T1d));
+		    }
+		    {
+			 V T1h, T1j, T1g, T1i;
+			 T1h = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			 T1j = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			 T1g = LDW(&(W[TWVL * 10]));
+			 T1i = LDW(&(W[TWVL * 11]));
+			 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
+			 T3z = VFNMS(T1i, T1h, VMUL(T1g, T1j));
+		    }
+		    {
+			 V T1f, T1q, T5Q, T5R;
+			 T1f = VADD(T19, T1e);
+			 T1q = VADD(T1k, T1p);
+			 T1r = VADD(T1f, T1q);
+			 T5P = VSUB(T1f, T1q);
+			 T5Q = VADD(T3t, T3u);
+			 T5R = VADD(T3z, T3A);
+			 T5S = VSUB(T5Q, T5R);
+			 T6F = VADD(T5Q, T5R);
+		    }
+		    {
+			 V T3v, T3w, T3y, T3B;
+			 T3v = VSUB(T3t, T3u);
+			 T3w = VSUB(T1k, T1p);
+			 T3x = VADD(T3v, T3w);
+			 T51 = VSUB(T3v, T3w);
+			 T3y = VSUB(T19, T1e);
+			 T3B = VSUB(T3z, T3A);
+			 T3C = VSUB(T3y, T3B);
+			 T52 = VADD(T3y, T3B);
+		    }
+	       }
+	       {
+		    V T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
+		    {
+			 V T1S, T1U, T1R, T1T;
+			 T1S = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			 T1U = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			 T1R = LDW(&(W[TWVL * 8]));
+			 T1T = LDW(&(W[TWVL * 9]));
+			 T1V = VFMA(T1R, T1S, VMUL(T1T, T1U));
+			 T3R = VFNMS(T1T, T1S, VMUL(T1R, T1U));
+		    }
+		    {
+			 V T1X, T1Z, T1W, T1Y;
+			 T1X = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
+			 T1Z = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
+			 T1W = LDW(&(W[TWVL * 40]));
+			 T1Y = LDW(&(W[TWVL * 41]));
+			 T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
+			 T3S = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
+		    }
+		    T3Q = VSUB(T1V, T20);
+		    T3T = VSUB(T3R, T3S);
+		    {
+			 V T23, T25, T22, T24;
+			 T23 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
+			 T25 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
+			 T22 = LDW(&(W[TWVL * 56]));
+			 T24 = LDW(&(W[TWVL * 57]));
+			 T26 = VFMA(T22, T23, VMUL(T24, T25));
+			 T3M = VFNMS(T24, T23, VMUL(T22, T25));
+		    }
+		    {
+			 V T28, T2a, T27, T29;
+			 T28 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+			 T2a = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+			 T27 = LDW(&(W[TWVL * 24]));
+			 T29 = LDW(&(W[TWVL * 25]));
+			 T2b = VFMA(T27, T28, VMUL(T29, T2a));
+			 T3N = VFNMS(T29, T28, VMUL(T27, T2a));
+		    }
+		    T3L = VSUB(T26, T2b);
+		    T3O = VSUB(T3M, T3N);
+		    {
+			 V T21, T2c, T62, T63;
+			 T21 = VADD(T1V, T20);
+			 T2c = VADD(T26, T2b);
+			 T2d = VADD(T21, T2c);
+			 T5Z = VSUB(T2c, T21);
+			 T62 = VADD(T3R, T3S);
+			 T63 = VADD(T3M, T3N);
+			 T64 = VSUB(T62, T63);
+			 T6K = VADD(T62, T63);
+		    }
+		    {
+			 V T3P, T3U, T42, T43;
+			 T3P = VSUB(T3L, T3O);
+			 T3U = VADD(T3Q, T3T);
+			 T3V = VMUL(LDK(KP707106781), VSUB(T3P, T3U));
+			 T57 = VMUL(LDK(KP707106781), VADD(T3U, T3P));
+			 T42 = VSUB(T3T, T3Q);
+			 T43 = VADD(T3L, T3O);
+			 T44 = VMUL(LDK(KP707106781), VSUB(T42, T43));
+			 T5a = VMUL(LDK(KP707106781), VADD(T42, T43));
+		    }
+	       }
+	       {
+		    V T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
+		    {
+			 V T2D, T2F, T2C, T2E;
+			 T2D = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			 T2F = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			 T2C = LDW(&(W[TWVL * 4]));
+			 T2E = LDW(&(W[TWVL * 5]));
+			 T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
+			 T4c = VFNMS(T2E, T2D, VMUL(T2C, T2F));
+		    }
+		    {
+			 V T2I, T2K, T2H, T2J;
+			 T2I = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
+			 T2K = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
+			 T2H = LDW(&(W[TWVL * 36]));
+			 T2J = LDW(&(W[TWVL * 37]));
+			 T2L = VFMA(T2H, T2I, VMUL(T2J, T2K));
+			 T4d = VFNMS(T2J, T2I, VMUL(T2H, T2K));
+		    }
+		    T4e = VSUB(T4c, T4d);
+		    T4f = VSUB(T2G, T2L);
+		    {
+			 V T2O, T2Q, T2N, T2P;
+			 T2O = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
+			 T2Q = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
+			 T2N = LDW(&(W[TWVL * 52]));
+			 T2P = LDW(&(W[TWVL * 53]));
+			 T2R = VFMA(T2N, T2O, VMUL(T2P, T2Q));
+			 T4i = VFNMS(T2P, T2O, VMUL(T2N, T2Q));
+		    }
+		    {
+			 V T2T, T2V, T2S, T2U;
+			 T2T = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+			 T2V = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+			 T2S = LDW(&(W[TWVL * 20]));
+			 T2U = LDW(&(W[TWVL * 21]));
+			 T2W = VFMA(T2S, T2T, VMUL(T2U, T2V));
+			 T4j = VFNMS(T2U, T2T, VMUL(T2S, T2V));
+		    }
+		    T4h = VSUB(T2R, T2W);
+		    T4k = VSUB(T4i, T4j);
+		    {
+			 V T2M, T2X, T68, T69;
+			 T2M = VADD(T2G, T2L);
+			 T2X = VADD(T2R, T2W);
+			 T2Y = VADD(T2M, T2X);
+			 T6f = VSUB(T2X, T2M);
+			 T68 = VADD(T4c, T4d);
+			 T69 = VADD(T4i, T4j);
+			 T6a = VSUB(T68, T69);
+			 T6P = VADD(T68, T69);
+		    }
+		    {
+			 V T4g, T4l, T4t, T4u;
+			 T4g = VSUB(T4e, T4f);
+			 T4l = VADD(T4h, T4k);
+			 T4m = VMUL(LDK(KP707106781), VSUB(T4g, T4l));
+			 T5h = VMUL(LDK(KP707106781), VADD(T4g, T4l));
+			 T4t = VSUB(T4h, T4k);
+			 T4u = VADD(T4f, T4e);
+			 T4v = VMUL(LDK(KP707106781), VSUB(T4t, T4u));
+			 T5e = VMUL(LDK(KP707106781), VADD(T4u, T4t));
+		    }
+	       }
+	       {
+		    V T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
+		    {
+			 V TH, T1s, T72, T79;
+			 TH = VADD(Tj, TG);
+			 T1s = VADD(T14, T1r);
+			 T1t = VADD(TH, T1s);
+			 T6X = VSUB(TH, T1s);
+			 T72 = VADD(T6E, T6F);
+			 T79 = VADD(T73, T78);
+			 T7a = VADD(T72, T79);
+			 T7c = VSUB(T79, T72);
+		    }
+		    {
+			 V T2e, T2Z, T6Y, T6Z;
+			 T2e = VADD(T1Q, T2d);
+			 T2Z = VADD(T2B, T2Y);
+			 T30 = VADD(T2e, T2Z);
+			 T7b = VSUB(T2Z, T2e);
+			 T6Y = VADD(T6J, T6K);
+			 T6Z = VADD(T6O, T6P);
+			 T70 = VSUB(T6Y, T6Z);
+			 T71 = VADD(T6Y, T6Z);
+		    }
+		    ST(&(ri[WS(rs, 16)]), VSUB(T1t, T30), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 16)]), VSUB(T7a, T71), ms, &(ii[0]));
+		    ST(&(ri[0]), VADD(T1t, T30), ms, &(ri[0]));
+		    ST(&(ii[0]), VADD(T71, T7a), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 24)]), VSUB(T6X, T70), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 24)]), VSUB(T7c, T7b), ms, &(ii[0]));
+		    ST(&(ri[WS(rs, 8)]), VADD(T6X, T70), ms, &(ri[0]));
+		    ST(&(ii[WS(rs, 8)]), VADD(T7b, T7c), ms, &(ii[0]));
+	       }
+	       {
+		    V T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
+		    {
+			 V T6D, T6G, T7e, T7f;
+			 T6D = VSUB(Tj, TG);
+			 T6G = VSUB(T6E, T6F);
+			 T6H = VADD(T6D, T6G);
+			 T6T = VSUB(T6D, T6G);
+			 T7e = VSUB(T1r, T14);
+			 T7f = VSUB(T78, T73);
+			 T7g = VADD(T7e, T7f);
+			 T7i = VSUB(T7f, T7e);
+		    }
+		    {
+			 V T6I, T6L, T6N, T6Q;
+			 T6I = VSUB(T1Q, T2d);
+			 T6L = VSUB(T6J, T6K);
+			 T6M = VADD(T6I, T6L);
+			 T6U = VSUB(T6L, T6I);
+			 T6N = VSUB(T2B, T2Y);
+			 T6Q = VSUB(T6O, T6P);
+			 T6R = VSUB(T6N, T6Q);
+			 T6V = VADD(T6N, T6Q);
+		    }
+		    {
+			 V T6S, T7d, T6W, T7h;
+			 T6S = VMUL(LDK(KP707106781), VADD(T6M, T6R));
+			 ST(&(ri[WS(rs, 20)]), VSUB(T6H, T6S), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 4)]), VADD(T6H, T6S), ms, &(ri[0]));
+			 T7d = VMUL(LDK(KP707106781), VADD(T6U, T6V));
+			 ST(&(ii[WS(rs, 4)]), VADD(T7d, T7g), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 20)]), VSUB(T7g, T7d), ms, &(ii[0]));
+			 T6W = VMUL(LDK(KP707106781), VSUB(T6U, T6V));
+			 ST(&(ri[WS(rs, 28)]), VSUB(T6T, T6W), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 12)]), VADD(T6T, T6W), ms, &(ri[0]));
+			 T7h = VMUL(LDK(KP707106781), VSUB(T6R, T6M));
+			 ST(&(ii[WS(rs, 12)]), VADD(T7h, T7i), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 28)]), VSUB(T7i, T7h), ms, &(ii[0]));
+		    }
+	       }
+	       {
+		    V T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
+		    V T6l;
+		    {
+			 V T5O, T5T, T60, T65;
+			 T5J = VSUB(T5F, T5I);
+			 T7n = VADD(T7l, T7m);
+			 T7t = VSUB(T7m, T7l);
+			 T6n = VADD(T5F, T5I);
+			 T5O = VSUB(T5M, T5N);
+			 T5T = VADD(T5P, T5S);
+			 T5U = VMUL(LDK(KP707106781), VSUB(T5O, T5T));
+			 T7k = VMUL(LDK(KP707106781), VADD(T5O, T5T));
+			 {
+			      V T6v, T6w, T6o, T6p;
+			      T6v = VADD(T67, T6a);
+			      T6w = VADD(T6e, T6f);
+			      T6x = VFNMS(LDK(KP382683432), T6w, VMUL(LDK(KP923879532), T6v));
+			      T6B = VFMA(LDK(KP923879532), T6w, VMUL(LDK(KP382683432), T6v));
+			      T6o = VADD(T5N, T5M);
+			      T6p = VSUB(T5P, T5S);
+			      T6q = VMUL(LDK(KP707106781), VADD(T6o, T6p));
+			      T7s = VMUL(LDK(KP707106781), VSUB(T6p, T6o));
+			 }
+			 T60 = VSUB(T5Y, T5Z);
+			 T65 = VSUB(T61, T64);
+			 T66 = VFMA(LDK(KP923879532), T60, VMUL(LDK(KP382683432), T65));
+			 T6k = VFNMS(LDK(KP923879532), T65, VMUL(LDK(KP382683432), T60));
+			 {
+			      V T6s, T6t, T6b, T6g;
+			      T6s = VADD(T5Y, T5Z);
+			      T6t = VADD(T61, T64);
+			      T6u = VFMA(LDK(KP382683432), T6s, VMUL(LDK(KP923879532), T6t));
+			      T6A = VFNMS(LDK(KP382683432), T6t, VMUL(LDK(KP923879532), T6s));
+			      T6b = VSUB(T67, T6a);
+			      T6g = VSUB(T6e, T6f);
+			      T6h = VFNMS(LDK(KP923879532), T6g, VMUL(LDK(KP382683432), T6b));
+			      T6l = VFMA(LDK(KP382683432), T6g, VMUL(LDK(KP923879532), T6b));
+			 }
+		    }
+		    {
+			 V T5V, T6i, T7r, T7u;
+			 T5V = VADD(T5J, T5U);
+			 T6i = VADD(T66, T6h);
+			 ST(&(ri[WS(rs, 22)]), VSUB(T5V, T6i), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 6)]), VADD(T5V, T6i), ms, &(ri[0]));
+			 T7r = VADD(T6k, T6l);
+			 T7u = VADD(T7s, T7t);
+			 ST(&(ii[WS(rs, 6)]), VADD(T7r, T7u), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 22)]), VSUB(T7u, T7r), ms, &(ii[0]));
+		    }
+		    {
+			 V T6j, T6m, T7v, T7w;
+			 T6j = VSUB(T5J, T5U);
+			 T6m = VSUB(T6k, T6l);
+			 ST(&(ri[WS(rs, 30)]), VSUB(T6j, T6m), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 14)]), VADD(T6j, T6m), ms, &(ri[0]));
+			 T7v = VSUB(T6h, T66);
+			 T7w = VSUB(T7t, T7s);
+			 ST(&(ii[WS(rs, 14)]), VADD(T7v, T7w), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 30)]), VSUB(T7w, T7v), ms, &(ii[0]));
+		    }
+		    {
+			 V T6r, T6y, T7j, T7o;
+			 T6r = VADD(T6n, T6q);
+			 T6y = VADD(T6u, T6x);
+			 ST(&(ri[WS(rs, 18)]), VSUB(T6r, T6y), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 2)]), VADD(T6r, T6y), ms, &(ri[0]));
+			 T7j = VADD(T6A, T6B);
+			 T7o = VADD(T7k, T7n);
+			 ST(&(ii[WS(rs, 2)]), VADD(T7j, T7o), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 18)]), VSUB(T7o, T7j), ms, &(ii[0]));
+		    }
+		    {
+			 V T6z, T6C, T7p, T7q;
+			 T6z = VSUB(T6n, T6q);
+			 T6C = VSUB(T6A, T6B);
+			 ST(&(ri[WS(rs, 26)]), VSUB(T6z, T6C), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 10)]), VADD(T6z, T6C), ms, &(ri[0]));
+			 T7p = VSUB(T6x, T6u);
+			 T7q = VSUB(T7n, T7k);
+			 ST(&(ii[WS(rs, 10)]), VADD(T7p, T7q), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 26)]), VSUB(T7q, T7p), ms, &(ii[0]));
+		    }
+	       }
+	       {
+		    V T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
+		    V T4B, T3g, T7P;
+		    T3g = VMUL(LDK(KP707106781), VSUB(T3a, T3f));
+		    T3h = VSUB(T35, T3g);
+		    T4D = VADD(T35, T3g);
+		    T7P = VMUL(LDK(KP707106781), VSUB(T4V, T4U));
+		    T7R = VADD(T7P, T7Q);
+		    T7X = VSUB(T7Q, T7P);
+		    {
+			 V T3s, T3D, T4L, T4M;
+			 T3s = VFNMS(LDK(KP923879532), T3r, VMUL(LDK(KP382683432), T3m));
+			 T3D = VFMA(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3C));
+			 T3E = VSUB(T3s, T3D);
+			 T7O = VADD(T3s, T3D);
+			 T4L = VADD(T4b, T4m);
+			 T4M = VADD(T4s, T4v);
+			 T4N = VFNMS(LDK(KP555570233), T4M, VMUL(LDK(KP831469612), T4L));
+			 T4R = VFMA(LDK(KP831469612), T4M, VMUL(LDK(KP555570233), T4L));
+		    }
+		    {
+			 V T3W, T45, T4E, T4F;
+			 T3W = VSUB(T3K, T3V);
+			 T45 = VSUB(T41, T44);
+			 T46 = VFMA(LDK(KP980785280), T3W, VMUL(LDK(KP195090322), T45));
+			 T4A = VFNMS(LDK(KP980785280), T45, VMUL(LDK(KP195090322), T3W));
+			 T4E = VFMA(LDK(KP923879532), T3m, VMUL(LDK(KP382683432), T3r));
+			 T4F = VFNMS(LDK(KP923879532), T3x, VMUL(LDK(KP382683432), T3C));
+			 T4G = VADD(T4E, T4F);
+			 T7W = VSUB(T4F, T4E);
+		    }
+		    {
+			 V T4I, T4J, T4n, T4w;
+			 T4I = VADD(T3K, T3V);
+			 T4J = VADD(T41, T44);
+			 T4K = VFMA(LDK(KP555570233), T4I, VMUL(LDK(KP831469612), T4J));
+			 T4Q = VFNMS(LDK(KP555570233), T4J, VMUL(LDK(KP831469612), T4I));
+			 T4n = VSUB(T4b, T4m);
+			 T4w = VSUB(T4s, T4v);
+			 T4x = VFNMS(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4n));
+			 T4B = VFMA(LDK(KP195090322), T4w, VMUL(LDK(KP980785280), T4n));
+		    }
+		    {
+			 V T3F, T4y, T7V, T7Y;
+			 T3F = VADD(T3h, T3E);
+			 T4y = VADD(T46, T4x);
+			 ST(&(ri[WS(rs, 23)]), VSUB(T3F, T4y), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 7)]), VADD(T3F, T4y), ms, &(ri[WS(rs, 1)]));
+			 T7V = VADD(T4A, T4B);
+			 T7Y = VADD(T7W, T7X);
+			 ST(&(ii[WS(rs, 7)]), VADD(T7V, T7Y), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 23)]), VSUB(T7Y, T7V), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T4z, T4C, T7Z, T80;
+			 T4z = VSUB(T3h, T3E);
+			 T4C = VSUB(T4A, T4B);
+			 ST(&(ri[WS(rs, 31)]), VSUB(T4z, T4C), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 15)]), VADD(T4z, T4C), ms, &(ri[WS(rs, 1)]));
+			 T7Z = VSUB(T4x, T46);
+			 T80 = VSUB(T7X, T7W);
+			 ST(&(ii[WS(rs, 15)]), VADD(T7Z, T80), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 31)]), VSUB(T80, T7Z), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T4H, T4O, T7N, T7S;
+			 T4H = VADD(T4D, T4G);
+			 T4O = VADD(T4K, T4N);
+			 ST(&(ri[WS(rs, 19)]), VSUB(T4H, T4O), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VADD(T4H, T4O), ms, &(ri[WS(rs, 1)]));
+			 T7N = VADD(T4Q, T4R);
+			 T7S = VADD(T7O, T7R);
+			 ST(&(ii[WS(rs, 3)]), VADD(T7N, T7S), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 19)]), VSUB(T7S, T7N), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T4P, T4S, T7T, T7U;
+			 T4P = VSUB(T4D, T4G);
+			 T4S = VSUB(T4Q, T4R);
+			 ST(&(ri[WS(rs, 27)]), VSUB(T4P, T4S), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 11)]), VADD(T4P, T4S), ms, &(ri[WS(rs, 1)]));
+			 T7T = VSUB(T4N, T4K);
+			 T7U = VSUB(T7R, T7O);
+			 ST(&(ii[WS(rs, 11)]), VADD(T7T, T7U), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 27)]), VSUB(T7U, T7T), ms, &(ii[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
+		    V T5n, T4W, T7z;
+		    T4W = VMUL(LDK(KP707106781), VADD(T4U, T4V));
+		    T4X = VSUB(T4T, T4W);
+		    T5p = VADD(T4T, T4W);
+		    T7z = VMUL(LDK(KP707106781), VADD(T3a, T3f));
+		    T7D = VADD(T7z, T7C);
+		    T7J = VSUB(T7C, T7z);
+		    {
+			 V T50, T53, T5x, T5y;
+			 T50 = VFNMS(LDK(KP382683432), T4Z, VMUL(LDK(KP923879532), T4Y));
+			 T53 = VFMA(LDK(KP923879532), T51, VMUL(LDK(KP382683432), T52));
+			 T54 = VSUB(T50, T53);
+			 T7y = VADD(T50, T53);
+			 T5x = VADD(T5d, T5e);
+			 T5y = VADD(T5g, T5h);
+			 T5z = VFNMS(LDK(KP195090322), T5y, VMUL(LDK(KP980785280), T5x));
+			 T5D = VFMA(LDK(KP195090322), T5x, VMUL(LDK(KP980785280), T5y));
+		    }
+		    {
+			 V T58, T5b, T5q, T5r;
+			 T58 = VSUB(T56, T57);
+			 T5b = VSUB(T59, T5a);
+			 T5c = VFMA(LDK(KP555570233), T58, VMUL(LDK(KP831469612), T5b));
+			 T5m = VFNMS(LDK(KP831469612), T58, VMUL(LDK(KP555570233), T5b));
+			 T5q = VFMA(LDK(KP382683432), T4Y, VMUL(LDK(KP923879532), T4Z));
+			 T5r = VFNMS(LDK(KP382683432), T51, VMUL(LDK(KP923879532), T52));
+			 T5s = VADD(T5q, T5r);
+			 T7I = VSUB(T5r, T5q);
+		    }
+		    {
+			 V T5u, T5v, T5f, T5i;
+			 T5u = VADD(T56, T57);
+			 T5v = VADD(T59, T5a);
+			 T5w = VFMA(LDK(KP980785280), T5u, VMUL(LDK(KP195090322), T5v));
+			 T5C = VFNMS(LDK(KP195090322), T5u, VMUL(LDK(KP980785280), T5v));
+			 T5f = VSUB(T5d, T5e);
+			 T5i = VSUB(T5g, T5h);
+			 T5j = VFNMS(LDK(KP831469612), T5i, VMUL(LDK(KP555570233), T5f));
+			 T5n = VFMA(LDK(KP831469612), T5f, VMUL(LDK(KP555570233), T5i));
+		    }
+		    {
+			 V T55, T5k, T7H, T7K;
+			 T55 = VADD(T4X, T54);
+			 T5k = VADD(T5c, T5j);
+			 ST(&(ri[WS(rs, 21)]), VSUB(T55, T5k), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 5)]), VADD(T55, T5k), ms, &(ri[WS(rs, 1)]));
+			 T7H = VADD(T5m, T5n);
+			 T7K = VADD(T7I, T7J);
+			 ST(&(ii[WS(rs, 5)]), VADD(T7H, T7K), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 21)]), VSUB(T7K, T7H), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T5l, T5o, T7L, T7M;
+			 T5l = VSUB(T4X, T54);
+			 T5o = VSUB(T5m, T5n);
+			 ST(&(ri[WS(rs, 29)]), VSUB(T5l, T5o), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 13)]), VADD(T5l, T5o), ms, &(ri[WS(rs, 1)]));
+			 T7L = VSUB(T5j, T5c);
+			 T7M = VSUB(T7J, T7I);
+			 ST(&(ii[WS(rs, 13)]), VADD(T7L, T7M), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 29)]), VSUB(T7M, T7L), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T5t, T5A, T7x, T7E;
+			 T5t = VADD(T5p, T5s);
+			 T5A = VADD(T5w, T5z);
+			 ST(&(ri[WS(rs, 17)]), VSUB(T5t, T5A), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VADD(T5t, T5A), ms, &(ri[WS(rs, 1)]));
+			 T7x = VADD(T5C, T5D);
+			 T7E = VADD(T7y, T7D);
+			 ST(&(ii[WS(rs, 1)]), VADD(T7x, T7E), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 17)]), VSUB(T7E, T7x), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T5B, T5E, T7F, T7G;
+			 T5B = VSUB(T5p, T5s);
+			 T5E = VSUB(T5C, T5D);
+			 ST(&(ri[WS(rs, 25)]), VSUB(T5B, T5E), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 9)]), VADD(T5B, T5E), ms, &(ri[WS(rs, 1)]));
+			 T7F = VSUB(T5z, T5w);
+			 T7G = VSUB(T7D, T7y);
+			 ST(&(ii[WS(rs, 9)]), VADD(T7F, T7G), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 25)]), VSUB(T7G, T7F), ms, &(ii[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_32) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:24 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include ts.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 35 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, Tv, T3, T6, T5, Ta, Td, Tc, Tg, Tj, Tt, T4, Tf, Ti, Tn;
+	       V Tb, T2, T9;
+	       T1 = LD(&(ri[0]), ms, &(ri[0]));
+	       Tv = LD(&(ii[0]), ms, &(ii[0]));
+	       T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+	       T6 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+	       T2 = LDW(&(W[TWVL * 2]));
+	       T5 = LDW(&(W[TWVL * 3]));
+	       Ta = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+	       Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+	       T9 = LDW(&(W[0]));
+	       Tc = LDW(&(W[TWVL * 1]));
+	       Tg = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+	       Tj = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+	       Tt = VMUL(T2, T6);
+	       T4 = VMUL(T2, T3);
+	       Tf = LDW(&(W[TWVL * 4]));
+	       Ti = LDW(&(W[TWVL * 5]));
+	       Tn = VMUL(T9, Td);
+	       Tb = VMUL(T9, Ta);
+	       {
+		    V Tu, T7, Tp, Th, To, Te;
+		    Tu = VFNMS(T5, T3, Tt);
+		    T7 = VFMA(T5, T6, T4);
+		    Tp = VMUL(Tf, Tj);
+		    Th = VMUL(Tf, Tg);
+		    To = VFNMS(Tc, Ta, Tn);
+		    Te = VFMA(Tc, Td, Tb);
+		    {
+			 V Tw, Tx, T8, Tm, Tq, Tk;
+			 Tw = VADD(Tu, Tv);
+			 Tx = VSUB(Tv, Tu);
+			 T8 = VADD(T1, T7);
+			 Tm = VSUB(T1, T7);
+			 Tq = VFNMS(Ti, Tg, Tp);
+			 Tk = VFMA(Ti, Tj, Th);
+			 {
+			      V Ts, Tr, Tl, Ty;
+			      Ts = VADD(To, Tq);
+			      Tr = VSUB(To, Tq);
+			      Tl = VADD(Te, Tk);
+			      Ty = VSUB(Te, Tk);
+			      ST(&(ri[WS(rs, 1)]), VADD(Tm, Tr), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 3)]), VSUB(Tm, Tr), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 2)]), VSUB(Tw, Ts), ms, &(ii[0]));
+			      ST(&(ii[0]), VADD(Ts, Tw), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 3)]), VADD(Ty, Tx), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 1)]), VSUB(Tx, Ty), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[0]), VADD(T8, Tl), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 2)]), VSUB(T8, Tl), ms, &(ri[0]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include ts.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, Tp, T6, To, Tc, Tk, Th, Tl;
+	       T1 = LD(&(ri[0]), ms, &(ri[0]));
+	       Tp = LD(&(ii[0]), ms, &(ii[0]));
+	       {
+		    V T3, T5, T2, T4;
+		    T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+		    T5 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+		    T2 = LDW(&(W[TWVL * 2]));
+		    T4 = LDW(&(W[TWVL * 3]));
+		    T6 = VFMA(T2, T3, VMUL(T4, T5));
+		    To = VFNMS(T4, T3, VMUL(T2, T5));
+	       }
+	       {
+		    V T9, Tb, T8, Ta;
+		    T9 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+		    Tb = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+		    T8 = LDW(&(W[0]));
+		    Ta = LDW(&(W[TWVL * 1]));
+		    Tc = VFMA(T8, T9, VMUL(Ta, Tb));
+		    Tk = VFNMS(Ta, T9, VMUL(T8, Tb));
+	       }
+	       {
+		    V Te, Tg, Td, Tf;
+		    Te = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+		    Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+		    Td = LDW(&(W[TWVL * 4]));
+		    Tf = LDW(&(W[TWVL * 5]));
+		    Th = VFMA(Td, Te, VMUL(Tf, Tg));
+		    Tl = VFNMS(Tf, Te, VMUL(Td, Tg));
+	       }
+	       {
+		    V T7, Ti, Tn, Tq;
+		    T7 = VADD(T1, T6);
+		    Ti = VADD(Tc, Th);
+		    ST(&(ri[WS(rs, 2)]), VSUB(T7, Ti), ms, &(ri[0]));
+		    ST(&(ri[0]), VADD(T7, Ti), ms, &(ri[0]));
+		    Tn = VADD(Tk, Tl);
+		    Tq = VADD(To, Tp);
+		    ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
+		    ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
+	       }
+	       {
+		    V Tj, Tm, Tr, Ts;
+		    Tj = VSUB(T1, T6);
+		    Tm = VSUB(Tk, Tl);
+		    ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
+		    ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
+		    Tr = VSUB(Tp, To);
+		    Ts = VSUB(Tc, Th);
+		    ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
+		    ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_4) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t1sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t1sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:24 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include ts.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 59 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, Tu, T14, TF, Tx, T16;
+	       V TL, Tt, TW, Tp, Tq, Tw;
+	       {
+		    V T3, T6, T2, T5;
+		    T1 = LD(&(ri[0]), ms, &(ri[0]));
+		    T1m = LD(&(ii[0]), ms, &(ii[0]));
+		    T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+		    T6 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+		    T2 = LDW(&(W[TWVL * 6]));
+		    T5 = LDW(&(W[TWVL * 7]));
+		    {
+			 V Tg, Tj, Ti, Ta, Td, T1k, T4, T9, Tc, TR, Th, Tf;
+			 Tg = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			 Tj = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			 Tf = LDW(&(W[TWVL * 10]));
+			 Ti = LDW(&(W[TWVL * 11]));
+			 Ta = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			 Td = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			 T1k = VMUL(T2, T6);
+			 T4 = VMUL(T2, T3);
+			 T9 = LDW(&(W[TWVL * 2]));
+			 Tc = LDW(&(W[TWVL * 3]));
+			 TR = VMUL(Tf, Tj);
+			 Th = VMUL(Tf, Tg);
+			 {
+			      V TB, TE, TH, TK, TG, TD, TJ, T13, TC, TA, TP, Tb, T15, TI, Tn;
+			      TB = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			      TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			      T1l = VFNMS(T5, T3, T1k);
+			      T7 = VFMA(T5, T6, T4);
+			      TP = VMUL(T9, Td);
+			      Tb = VMUL(T9, Ta);
+			      TS = VFNMS(Ti, Tg, TR);
+			      Tk = VFMA(Ti, Tj, Th);
+			      TA = LDW(&(W[TWVL * 12]));
+			      TH = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			      TK = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			      TG = LDW(&(W[TWVL * 4]));
+			      TQ = VFNMS(Tc, Ta, TP);
+			      Te = VFMA(Tc, Td, Tb);
+			      TD = LDW(&(W[TWVL * 13]));
+			      TJ = LDW(&(W[TWVL * 5]));
+			      T13 = VMUL(TA, TE);
+			      TC = VMUL(TA, TB);
+			      To = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			      T15 = VMUL(TG, TK);
+			      TI = VMUL(TG, TH);
+			      Tr = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			      Tn = LDW(&(W[0]));
+			      Tu = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			      T14 = VFNMS(TD, TB, T13);
+			      TF = VFMA(TD, TE, TC);
+			      Tx = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			      T16 = VFNMS(TJ, TH, T15);
+			      TL = VFMA(TJ, TK, TI);
+			      Tt = LDW(&(W[TWVL * 8]));
+			      TW = VMUL(Tn, Tr);
+			      Tp = VMUL(Tn, To);
+			      Tq = LDW(&(W[TWVL * 1]));
+			      Tw = LDW(&(W[TWVL * 9]));
+			 }
+		    }
+	       }
+	       {
+		    V T8, T1g, TM, T1j, TX, Ts, T1n, T1r, T1s, Tl, T1c, T18, TZ, Ty, T1a;
+		    V TU;
+		    {
+			 V TO, T17, T12, TY, Tv, TT;
+			 T8 = VADD(T1, T7);
+			 TO = VSUB(T1, T7);
+			 T17 = VSUB(T14, T16);
+			 T1g = VADD(T14, T16);
+			 TM = VADD(TF, TL);
+			 T12 = VSUB(TF, TL);
+			 TY = VMUL(Tt, Tx);
+			 Tv = VMUL(Tt, Tu);
+			 TT = VSUB(TQ, TS);
+			 T1j = VADD(TQ, TS);
+			 TX = VFNMS(Tq, To, TW);
+			 Ts = VFMA(Tq, Tr, Tp);
+			 T1n = VADD(T1l, T1m);
+			 T1r = VSUB(T1m, T1l);
+			 T1s = VSUB(Te, Tk);
+			 Tl = VADD(Te, Tk);
+			 T1c = VADD(T12, T17);
+			 T18 = VSUB(T12, T17);
+			 TZ = VFNMS(Tw, Tu, TY);
+			 Ty = VFMA(Tw, Tx, Tv);
+			 T1a = VSUB(TO, TT);
+			 TU = VADD(TO, TT);
+		    }
+		    {
+			 V T1v, T1t, Tm, T1e, T1o, T1q, TN, T1p, T1d, T1u, T19, T1w, T1i, T1h;
+			 {
+			      V T10, T1f, Tz, TV, T11, T1b;
+			      T1v = VADD(T1s, T1r);
+			      T1t = VSUB(T1r, T1s);
+			      T10 = VSUB(TX, TZ);
+			      T1f = VADD(TX, TZ);
+			      Tz = VADD(Ts, Ty);
+			      TV = VSUB(Ts, Ty);
+			      T11 = VADD(TV, T10);
+			      T1b = VSUB(T10, TV);
+			      Tm = VADD(T8, Tl);
+			      T1e = VSUB(T8, Tl);
+			      T1o = VADD(T1j, T1n);
+			      T1q = VSUB(T1n, T1j);
+			      TN = VADD(Tz, TM);
+			      T1p = VSUB(TM, Tz);
+			      T1d = VSUB(T1b, T1c);
+			      T1u = VADD(T1b, T1c);
+			      T19 = VADD(T11, T18);
+			      T1w = VSUB(T18, T11);
+			      T1i = VADD(T1f, T1g);
+			      T1h = VSUB(T1f, T1g);
+			 }
+			 ST(&(ii[WS(rs, 6)]), VSUB(T1q, T1p), ms, &(ii[0]));
+			 ST(&(ri[0]), VADD(Tm, TN), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 4)]), VSUB(Tm, TN), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 6)]), VSUB(T1e, T1h), ms, &(ri[0]));
+			 ST(&(ii[0]), VADD(T1i, T1o), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 4)]), VSUB(T1o, T1i), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 2)]), VADD(T1e, T1h), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 2)]), VADD(T1p, T1q), ms, &(ii[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {44, 14, 22, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include ts.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 28 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "ts.h"
+
+static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
+	       V TP;
+	       {
+		    V T1, T18, T6, T17;
+		    T1 = LD(&(ri[0]), ms, &(ri[0]));
+		    T18 = LD(&(ii[0]), ms, &(ii[0]));
+		    {
+			 V T3, T5, T2, T4;
+			 T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			 T5 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			 T2 = LDW(&(W[TWVL * 6]));
+			 T4 = LDW(&(W[TWVL * 7]));
+			 T6 = VFMA(T2, T3, VMUL(T4, T5));
+			 T17 = VFNMS(T4, T3, VMUL(T2, T5));
+		    }
+		    T7 = VADD(T1, T6);
+		    T1e = VSUB(T18, T17);
+		    TH = VSUB(T1, T6);
+		    T19 = VADD(T17, T18);
+	       }
+	       {
+		    V Tz, TS, TE, TT;
+		    {
+			 V Tw, Ty, Tv, Tx;
+			 Tw = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			 Ty = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			 Tv = LDW(&(W[TWVL * 12]));
+			 Tx = LDW(&(W[TWVL * 13]));
+			 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
+			 TS = VFNMS(Tx, Tw, VMUL(Tv, Ty));
+		    }
+		    {
+			 V TB, TD, TA, TC;
+			 TB = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			 TD = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			 TA = LDW(&(W[TWVL * 4]));
+			 TC = LDW(&(W[TWVL * 5]));
+			 TE = VFMA(TA, TB, VMUL(TC, TD));
+			 TT = VFNMS(TC, TB, VMUL(TA, TD));
+		    }
+		    TF = VADD(Tz, TE);
+		    T13 = VADD(TS, TT);
+		    TR = VSUB(Tz, TE);
+		    TU = VSUB(TS, TT);
+	       }
+	       {
+		    V Tc, TI, Th, TJ;
+		    {
+			 V T9, Tb, T8, Ta;
+			 T9 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			 Tb = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			 T8 = LDW(&(W[TWVL * 2]));
+			 Ta = LDW(&(W[TWVL * 3]));
+			 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
+			 TI = VFNMS(Ta, T9, VMUL(T8, Tb));
+		    }
+		    {
+			 V Te, Tg, Td, Tf;
+			 Te = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			 Tg = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			 Td = LDW(&(W[TWVL * 10]));
+			 Tf = LDW(&(W[TWVL * 11]));
+			 Th = VFMA(Td, Te, VMUL(Tf, Tg));
+			 TJ = VFNMS(Tf, Te, VMUL(Td, Tg));
+		    }
+		    Ti = VADD(Tc, Th);
+		    T1f = VSUB(Tc, Th);
+		    TK = VSUB(TI, TJ);
+		    T16 = VADD(TI, TJ);
+	       }
+	       {
+		    V To, TN, Tt, TO;
+		    {
+			 V Tl, Tn, Tk, Tm;
+			 Tl = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			 Tn = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			 Tk = LDW(&(W[0]));
+			 Tm = LDW(&(W[TWVL * 1]));
+			 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
+			 TN = VFNMS(Tm, Tl, VMUL(Tk, Tn));
+		    }
+		    {
+			 V Tq, Ts, Tp, Tr;
+			 Tq = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			 Ts = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			 Tp = LDW(&(W[TWVL * 8]));
+			 Tr = LDW(&(W[TWVL * 9]));
+			 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
+			 TO = VFNMS(Tr, Tq, VMUL(Tp, Ts));
+		    }
+		    Tu = VADD(To, Tt);
+		    T12 = VADD(TN, TO);
+		    TM = VSUB(To, Tt);
+		    TP = VSUB(TN, TO);
+	       }
+	       {
+		    V Tj, TG, T1b, T1c;
+		    Tj = VADD(T7, Ti);
+		    TG = VADD(Tu, TF);
+		    ST(&(ri[WS(rs, 4)]), VSUB(Tj, TG), ms, &(ri[0]));
+		    ST(&(ri[0]), VADD(Tj, TG), ms, &(ri[0]));
+		    {
+			 V T15, T1a, T11, T14;
+			 T15 = VADD(T12, T13);
+			 T1a = VADD(T16, T19);
+			 ST(&(ii[0]), VADD(T15, T1a), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 4)]), VSUB(T1a, T15), ms, &(ii[0]));
+			 T11 = VSUB(T7, Ti);
+			 T14 = VSUB(T12, T13);
+			 ST(&(ri[WS(rs, 6)]), VSUB(T11, T14), ms, &(ri[0]));
+			 ST(&(ri[WS(rs, 2)]), VADD(T11, T14), ms, &(ri[0]));
+		    }
+		    T1b = VSUB(TF, Tu);
+		    T1c = VSUB(T19, T16);
+		    ST(&(ii[WS(rs, 2)]), VADD(T1b, T1c), ms, &(ii[0]));
+		    ST(&(ii[WS(rs, 6)]), VSUB(T1c, T1b), ms, &(ii[0]));
+		    {
+			 V TX, T1g, T10, T1d, TY, TZ;
+			 TX = VSUB(TH, TK);
+			 T1g = VSUB(T1e, T1f);
+			 TY = VSUB(TP, TM);
+			 TZ = VADD(TR, TU);
+			 T10 = VMUL(LDK(KP707106781), VSUB(TY, TZ));
+			 T1d = VMUL(LDK(KP707106781), VADD(TY, TZ));
+			 ST(&(ri[WS(rs, 7)]), VSUB(TX, T10), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 5)]), VSUB(T1g, T1d), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VADD(TX, T10), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 1)]), VADD(T1d, T1g), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V TL, T1i, TW, T1h, TQ, TV;
+			 TL = VADD(TH, TK);
+			 T1i = VADD(T1f, T1e);
+			 TQ = VADD(TM, TP);
+			 TV = VSUB(TR, TU);
+			 TW = VMUL(LDK(KP707106781), VADD(TQ, TV));
+			 T1h = VMUL(LDK(KP707106781), VSUB(TV, TQ));
+			 ST(&(ri[WS(rs, 5)]), VSUB(TL, TW), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 7)]), VSUB(T1i, T1h), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VADD(TL, TW), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VADD(T1h, T1i), ms, &(ii[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {52, 18, 14, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t1sv_8) (planner *p) {
+     X(kdft_dit_register) (p, t1sv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:14 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t2bv_10 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTW(&(W[TWVL * 8]), T2);
+			      Th = BYTW(&(W[TWVL * 6]), Tg);
+			      To = BYTW(&(W[0]), Tn);
+			      Tj = BYTW(&(W[TWVL * 16]), Ti);
+			      Tm = BYTW(&(W[TWVL * 10]), Tl);
+			      T6 = BYTW(&(W[TWVL * 2]), T5);
+			      Td = BYTW(&(W[TWVL * 4]), Tc);
+			      T8 = BYTW(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTW(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t2bv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t2bv_10 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tu, TH, Tg, Tl, Tp, TD, TE, TJ, T5, Ta, To, TA, TB, TI, Tr;
+	       V Tt, Ts;
+	       Tr = LD(&(x[0]), ms, &(x[0]));
+	       Ts = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tt = BYTW(&(W[TWVL * 8]), Ts);
+	       Tu = VSUB(Tr, Tt);
+	       TH = VADD(Tr, Tt);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTW(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tp = VADD(Tg, Tl);
+		    TD = VADD(Td, Tf);
+		    TE = VADD(Ti, Tk);
+		    TJ = VADD(TD, TE);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTW(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    To = VADD(T5, Ta);
+		    TA = VADD(T2, T4);
+		    TB = VADD(T7, T9);
+		    TI = VADD(TA, TB);
+	       }
+	       {
+		    V Tq, Tv, Tw, Tn, Tz, Tb, Tm, Ty, Tx;
+		    Tq = VMUL(LDK(KP559016994), VSUB(To, Tp));
+		    Tv = VADD(To, Tp);
+		    Tw = VFNMS(LDK(KP250000000), Tv, Tu);
+		    Tb = VSUB(T5, Ta);
+		    Tm = VSUB(Tg, Tl);
+		    Tn = VBYI(VFMA(LDK(KP951056516), Tb, VMUL(LDK(KP587785252), Tm)));
+		    Tz = VBYI(VFNMS(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tb)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tu, Tv), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tw, Tq);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tx = VADD(Tq, Tw);
+		    ST(&(x[WS(rs, 1)]), VADD(Tn, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(Tx, Tn), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TP, TC, TF, TO, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    TP = VBYI(VFMA(LDK(KP951056516), TC, VMUL(LDK(KP587785252), TF)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TO = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VSUB(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VADD(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t2bv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 87 FP additions, 64 FP multiplications,
+ * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
+ * 61 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
+	       V Tv;
+	       {
+		    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
+		    V Tp;
+		    {
+			 V T1, T2, T5, T7;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tz, TG, TB, TE;
+			      Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      TG = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TE = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      {
+				   V Ti, TX, TY, Td, Tg, Tm, Tt, To;
+				   {
+					V T3, T6, T8, TA, TH, TC, TF, Tb;
+					Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3 = BYTW(&(W[TWVL * 14]), T2);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					T8 = BYTW(&(W[TWVL * 22]), T7);
+					TA = BYTW(&(W[TWVL * 2]), Tz);
+					TH = BYTW(&(W[TWVL * 10]), TG);
+					TC = BYTW(&(W[TWVL * 18]), TB);
+					TF = BYTW(&(W[TWVL * 26]), TE);
+					Tc = BYTW(&(W[0]), Tb);
+					TW = VSUB(T1, T3);
+					T4 = VADD(T1, T3);
+					T19 = VSUB(T6, T8);
+					T9 = VADD(T6, T8);
+					Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					TD = VADD(TA, TC);
+					TX = VSUB(TA, TC);
+					TI = VADD(TF, TH);
+					TY = VSUB(TF, TH);
+				   }
+				   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+				   Tj = BYTW(&(W[TWVL * 24]), Ti);
+				   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TZ = VADD(TX, TY);
+				   T1a = VSUB(TX, TY);
+				   Te = BYTW(&(W[TWVL * 16]), Td);
+				   Th = BYTW(&(W[TWVL * 8]), Tg);
+				   Tn = BYTW(&(W[TWVL * 28]), Tm);
+				   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   Tu = BYTW(&(W[TWVL * 20]), Tt);
+				   Tp = BYTW(&(W[TWVL * 12]), To);
+			      }
+			 }
+		    }
+		    {
+			 V Tf, T11, Tk, T12, Ts;
+			 TO = VADD(T4, T9);
+			 Ta = VSUB(T4, T9);
+			 TJ = VSUB(TD, TI);
+			 TP = VADD(TD, TI);
+			 Tf = VADD(Tc, Te);
+			 T11 = VSUB(Tc, Te);
+			 Tk = VADD(Th, Tj);
+			 T12 = VSUB(Th, Tj);
+			 Ts = BYTW(&(W[TWVL * 4]), Tr);
+			 T14 = VSUB(Tn, Tp);
+			 Tq = VADD(Tn, Tp);
+			 T1i = VFNMS(LDK(KP707106781), TZ, TW);
+			 T10 = VFMA(LDK(KP707106781), TZ, TW);
+			 T1b = VFMA(LDK(KP707106781), T1a, T19);
+			 T1l = VFNMS(LDK(KP707106781), T1a, T19);
+			 T13 = VFNMS(LDK(KP414213562), T12, T11);
+			 T1c = VFMA(LDK(KP414213562), T11, T12);
+			 TR = VADD(Tf, Tk);
+			 Tl = VSUB(Tf, Tk);
+			 T15 = VSUB(Tu, Ts);
+			 Tv = VADD(Ts, Tu);
+		    }
+	       }
+	       {
+		    V T1d, T16, TS, Tw, TU, TQ;
+		    T1d = VFMA(LDK(KP414213562), T14, T15);
+		    T16 = VFNMS(LDK(KP414213562), T15, T14);
+		    TS = VADD(Tq, Tv);
+		    Tw = VSUB(Tq, Tv);
+		    TU = VADD(TO, TP);
+		    TQ = VSUB(TO, TP);
+		    {
+			 V T1e, T1j, T17, T1m;
+			 T1e = VSUB(T1c, T1d);
+			 T1j = VADD(T1c, T1d);
+			 T17 = VADD(T13, T16);
+			 T1m = VSUB(T13, T16);
+			 {
+			      V TV, TT, TK, Tx;
+			      TV = VADD(TR, TS);
+			      TT = VSUB(TR, TS);
+			      TK = VSUB(Tl, Tw);
+			      Tx = VADD(Tl, Tw);
+			      {
+				   V T1h, T1f, T1o, T1k;
+				   T1h = VFMA(LDK(KP923879532), T1e, T1b);
+				   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
+				   T1o = VFMA(LDK(KP923879532), T1j, T1i);
+				   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
+				   {
+					V T1g, T18, T1p, T1n;
+					T1g = VFMA(LDK(KP923879532), T17, T10);
+					T18 = VFNMS(LDK(KP923879532), T17, T10);
+					T1p = VFNMS(LDK(KP923879532), T1m, T1l);
+					T1n = VFMA(LDK(KP923879532), T1m, T1l);
+					ST(&(x[WS(rs, 8)]), VSUB(TU, TV), ms, &(x[0]));
+					ST(&(x[0]), VADD(TU, TV), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TT, TQ), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(TT, TQ), ms, &(x[0]));
+					{
+					     V TN, TL, TM, Ty;
+					     TN = VFMA(LDK(KP707106781), TK, TJ);
+					     TL = VFNMS(LDK(KP707106781), TK, TJ);
+					     TM = VFMA(LDK(KP707106781), Tx, Ta);
+					     Ty = VFNMS(LDK(KP707106781), Tx, Ta);
+					     ST(&(x[WS(rs, 15)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 1)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 87 FP additions, 42 FP multiplications,
+ * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
+ * 36 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TJ, T1b, TD, T1c, T17, T18, Ty, TK, T10, T11, T12, Tb, TM, T13, T14;
+	       V T15, Tm, TN, TG, TI, TH;
+	       TG = LD(&(x[0]), ms, &(x[0]));
+	       TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+	       TI = BYTW(&(W[TWVL * 14]), TH);
+	       TJ = VSUB(TG, TI);
+	       T1b = VADD(TG, TI);
+	       {
+		    V TA, TC, Tz, TB;
+		    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TA = BYTW(&(W[TWVL * 6]), Tz);
+		    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+		    TC = BYTW(&(W[TWVL * 22]), TB);
+		    TD = VSUB(TA, TC);
+		    T1c = VADD(TA, TC);
+	       }
+	       {
+		    V Tp, Tw, Tr, Tu, Ts, Tx;
+		    {
+			 V To, Tv, Tq, Tt;
+			 To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 2]), To);
+			 Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tw = BYTW(&(W[TWVL * 10]), Tv);
+			 Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tr = BYTW(&(W[TWVL * 18]), Tq);
+			 Tt = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tu = BYTW(&(W[TWVL * 26]), Tt);
+		    }
+		    T17 = VADD(Tp, Tr);
+		    T18 = VADD(Tu, Tw);
+		    Ts = VSUB(Tp, Tr);
+		    Tx = VSUB(Tu, Tw);
+		    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
+		    TK = VMUL(LDK(KP707106781), VADD(Ts, Tx));
+	       }
+	       {
+		    V T2, T9, T4, T7, T5, Ta;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 24]), T8);
+			 T3 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTW(&(W[TWVL * 16]), T3);
+			 T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTW(&(W[TWVL * 8]), T6);
+		    }
+		    T10 = VADD(T2, T4);
+		    T11 = VADD(T7, T9);
+		    T12 = VSUB(T10, T11);
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), T5));
+		    TM = VFMA(LDK(KP382683432), T5, VMUL(LDK(KP923879532), Ta));
+	       }
+	       {
+		    V Td, Tk, Tf, Ti, Tg, Tl;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTW(&(W[TWVL * 28]), Tc);
+			 Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTW(&(W[TWVL * 20]), Tj);
+			 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTW(&(W[TWVL * 12]), Te);
+			 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Ti = BYTW(&(W[TWVL * 4]), Th);
+		    }
+		    T13 = VADD(Td, Tf);
+		    T14 = VADD(Ti, Tk);
+		    T15 = VSUB(T13, T14);
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VFMA(LDK(KP923879532), Tg, VMUL(LDK(KP382683432), Tl));
+		    TN = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
+	       }
+	       {
+		    V T1a, T1g, T1f, T1h;
+		    {
+			 V T16, T19, T1d, T1e;
+			 T16 = VMUL(LDK(KP707106781), VSUB(T12, T15));
+			 T19 = VSUB(T17, T18);
+			 T1a = VBYI(VSUB(T16, T19));
+			 T1g = VBYI(VADD(T19, T16));
+			 T1d = VSUB(T1b, T1c);
+			 T1e = VMUL(LDK(KP707106781), VADD(T12, T15));
+			 T1f = VSUB(T1d, T1e);
+			 T1h = VADD(T1d, T1e);
+		    }
+		    ST(&(x[WS(rs, 6)]), VADD(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VSUB(T1h, T1g), ms, &(x[0]));
+		    ST(&(x[WS(rs, 10)]), VSUB(T1f, T1a), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(T1g, T1h), ms, &(x[0]));
+	       }
+	       {
+		    V T1k, T1o, T1n, T1p;
+		    {
+			 V T1i, T1j, T1l, T1m;
+			 T1i = VADD(T1b, T1c);
+			 T1j = VADD(T17, T18);
+			 T1k = VSUB(T1i, T1j);
+			 T1o = VADD(T1i, T1j);
+			 T1l = VADD(T10, T11);
+			 T1m = VADD(T13, T14);
+			 T1n = VBYI(VSUB(T1l, T1m));
+			 T1p = VADD(T1l, T1m);
+		    }
+		    ST(&(x[WS(rs, 12)]), VSUB(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T1o, T1p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(T1o, T1p), ms, &(x[0]));
+	       }
+	       {
+		    V TF, TQ, TP, TR;
+		    {
+			 V Tn, TE, TL, TO;
+			 Tn = VSUB(Tb, Tm);
+			 TE = VSUB(Ty, TD);
+			 TF = VBYI(VSUB(Tn, TE));
+			 TQ = VBYI(VADD(TE, Tn));
+			 TL = VSUB(TJ, TK);
+			 TO = VSUB(TM, TN);
+			 TP = VSUB(TL, TO);
+			 TR = VADD(TL, TO);
+		    }
+		    ST(&(x[WS(rs, 5)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TU, TY, TX, TZ;
+		    {
+			 V TS, TT, TV, TW;
+			 TS = VADD(TJ, TK);
+			 TT = VADD(Tb, Tm);
+			 TU = VADD(TS, TT);
+			 TY = VSUB(TS, TT);
+			 TV = VADD(TD, Ty);
+			 TW = VADD(TM, TN);
+			 TX = VBYI(VADD(TV, TW));
+			 TZ = VBYI(VSUB(TW, TV));
+		    }
+		    ST(&(x[WS(rs, 15)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t2bv_2 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t2bv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_2) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t2bv_2 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTW(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t2bv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_2) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:14 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 123 FP additions, 88 FP multiplications,
+ * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
+ * 68 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, TX, T1m, T1K, T1y, Tk, Tf, T14, TQ, TZ, T1O, T1w, T1L, T1p, T1M;
+	       V T1s, TF, TY, T1x, Tp;
+	       {
+		    V T1, TV, T2, TT;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T9, T1n, TK, T1v, TP, Te, T1q, T1u, TB, TD, Tm, T1o, Tz, Tn, T1r;
+			 V TE, To;
+			 {
+			      V TM, TO, Ta, Tc;
+			      {
+				   V T5, T7, TG, TI, T1k, T1l;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   {
+					V TW, T3, TU, T6, T8, TH, TJ, TL, TN;
+					TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					TW = BYTW(&(W[TWVL * 28]), TV);
+					T3 = BYTW(&(W[TWVL * 18]), T2);
+					TU = BYTW(&(W[TWVL * 8]), TT);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					T8 = BYTW(&(W[TWVL * 26]), T7);
+					TH = BYTW(&(W[TWVL * 24]), TG);
+					TJ = BYTW(&(W[TWVL * 4]), TI);
+					TM = BYTW(&(W[TWVL * 32]), TL);
+					TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T4 = VSUB(T1, T3);
+					T1k = VADD(T1, T3);
+					TX = VSUB(TU, TW);
+					T1l = VADD(TU, TW);
+					T9 = VSUB(T6, T8);
+					T1n = VADD(T6, T8);
+					TK = VSUB(TH, TJ);
+					T1v = VADD(TH, TJ);
+					TO = BYTW(&(W[TWVL * 12]), TN);
+				   }
+				   Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1m = VSUB(T1k, T1l);
+				   T1K = VADD(T1k, T1l);
+				   Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      }
+			      {
+				   V Tb, Tx, Td, Th, Tj, Tw, Tg, Ti, Tv;
+				   Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+				   Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   TP = VSUB(TM, TO);
+				   T1y = VADD(TM, TO);
+				   Tb = BYTW(&(W[TWVL * 30]), Ta);
+				   Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   Td = BYTW(&(W[TWVL * 10]), Tc);
+				   Th = BYTW(&(W[TWVL * 14]), Tg);
+				   Tj = BYTW(&(W[TWVL * 34]), Ti);
+				   Tw = BYTW(&(W[TWVL * 16]), Tv);
+				   {
+					V TA, TC, Ty, Tl;
+					TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					Ty = BYTW(&(W[TWVL * 36]), Tx);
+					Te = VSUB(Tb, Td);
+					T1q = VADD(Tb, Td);
+					Tk = VSUB(Th, Tj);
+					T1u = VADD(Th, Tj);
+					TB = BYTW(&(W[0]), TA);
+					TD = BYTW(&(W[TWVL * 20]), TC);
+					Tm = BYTW(&(W[TWVL * 22]), Tl);
+					T1o = VADD(Tw, Ty);
+					Tz = VSUB(Tw, Ty);
+					Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 Tf = VADD(T9, Te);
+			 T14 = VSUB(T9, Te);
+			 TQ = VSUB(TK, TP);
+			 TZ = VADD(TK, TP);
+			 T1r = VADD(TB, TD);
+			 TE = VSUB(TB, TD);
+			 T1O = VADD(T1u, T1v);
+			 T1w = VSUB(T1u, T1v);
+			 To = BYTW(&(W[TWVL * 2]), Tn);
+			 T1L = VADD(T1n, T1o);
+			 T1p = VSUB(T1n, T1o);
+			 T1M = VADD(T1q, T1r);
+			 T1s = VSUB(T1q, T1r);
+			 TF = VSUB(Tz, TE);
+			 TY = VADD(Tz, TE);
+			 T1x = VADD(Tm, To);
+			 Tp = VSUB(Tm, To);
+		    }
+	       }
+	       {
+		    V T1V, T1N, T12, T1b, TR, T1G, T1t, T1z, T1P, Tq, T15, T11, T1j, T10;
+		    T1V = VSUB(T1L, T1M);
+		    T1N = VADD(T1L, T1M);
+		    T12 = VSUB(TY, TZ);
+		    T10 = VADD(TY, TZ);
+		    T1b = VFNMS(LDK(KP618033988), TF, TQ);
+		    TR = VFMA(LDK(KP618033988), TQ, TF);
+		    T1G = VSUB(T1p, T1s);
+		    T1t = VADD(T1p, T1s);
+		    T1z = VSUB(T1x, T1y);
+		    T1P = VADD(T1x, T1y);
+		    Tq = VADD(Tk, Tp);
+		    T15 = VSUB(Tk, Tp);
+		    T11 = VFNMS(LDK(KP250000000), T10, TX);
+		    T1j = VADD(TX, T10);
+		    {
+			 V T1J, T1H, T1D, T1Z, T1X, T1T, T1f, T1h, T19, T17, T1C, T1S, T1a, Tu, T1F;
+			 V T1A;
+			 T1F = VSUB(T1w, T1z);
+			 T1A = VADD(T1w, T1z);
+			 {
+			      V T1W, T1Q, Tt, Tr;
+			      T1W = VSUB(T1O, T1P);
+			      T1Q = VADD(T1O, T1P);
+			      Tt = VSUB(Tf, Tq);
+			      Tr = VADD(Tf, Tq);
+			      {
+				   V T1e, T16, T1d, T13;
+				   T1e = VFNMS(LDK(KP618033988), T14, T15);
+				   T16 = VFMA(LDK(KP618033988), T15, T14);
+				   T1d = VFNMS(LDK(KP559016994), T12, T11);
+				   T13 = VFMA(LDK(KP559016994), T12, T11);
+				   T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
+				   T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
+				   {
+					V T1B, T1R, Ts, T1i;
+					T1B = VADD(T1t, T1A);
+					T1D = VSUB(T1t, T1A);
+					T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
+					T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
+					T1R = VADD(T1N, T1Q);
+					T1T = VSUB(T1N, T1Q);
+					Ts = VFNMS(LDK(KP250000000), Tr, T4);
+					T1i = VADD(T4, Tr);
+					T1f = VFNMS(LDK(KP951056516), T1e, T1d);
+					T1h = VFMA(LDK(KP951056516), T1e, T1d);
+					T19 = VFNMS(LDK(KP951056516), T16, T13);
+					T17 = VFMA(LDK(KP951056516), T16, T13);
+					ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
+					T1C = VFNMS(LDK(KP250000000), T1B, T1m);
+					ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
+					T1S = VFNMS(LDK(KP250000000), T1R, T1K);
+					T1a = VFNMS(LDK(KP559016994), Tt, Ts);
+					Tu = VFMA(LDK(KP559016994), Tt, Ts);
+					ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+			 {
+			      V T1E, T1I, T1U, T1Y;
+			      T1E = VFNMS(LDK(KP559016994), T1D, T1C);
+			      T1I = VFMA(LDK(KP559016994), T1D, T1C);
+			      T1U = VFMA(LDK(KP559016994), T1T, T1S);
+			      T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
+			      {
+				   V T1c, T1g, T18, TS;
+				   T1c = VFMA(LDK(KP951056516), T1b, T1a);
+				   T1g = VFNMS(LDK(KP951056516), T1b, T1a);
+				   T18 = VFMA(LDK(KP951056516), TR, Tu);
+				   TS = VFNMS(LDK(KP951056516), TR, Tu);
+				   ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 123 FP additions, 62 FP multiplications,
+ * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
+ * 54 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
+	       V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
+	       {
+		    V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    TZ = BYTW(&(W[TWVL * 28]), TY);
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 18]), T2);
+		    TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    TX = BYTW(&(W[TWVL * 8]), TW);
+		    T4 = VSUB(T1, T3);
+		    T10 = VSUB(TX, TZ);
+		    T1z = VADD(T1, T3);
+		    T1A = VADD(TX, TZ);
+		    T1B = VSUB(T1z, T1A);
+		    T1R = VADD(T1z, T1A);
+	       }
+	       {
+		    V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
+		    V T1u;
+		    {
+			 V T6, T8, T5, T7;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 6]), T5);
+			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 26]), T7);
+			 T9 = VSUB(T6, T8);
+			 T1k = VADD(T6, T8);
+		    }
+		    {
+			 V TH, TJ, TG, TI;
+			 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 TH = BYTW(&(W[TWVL * 24]), TG);
+			 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 TJ = BYTW(&(W[TWVL * 4]), TI);
+			 TK = VSUB(TH, TJ);
+			 T1s = VADD(TH, TJ);
+		    }
+		    {
+			 V TM, TO, TL, TN;
+			 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TM = BYTW(&(W[TWVL * 32]), TL);
+			 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TO = BYTW(&(W[TWVL * 12]), TN);
+			 TP = VSUB(TM, TO);
+			 T1v = VADD(TM, TO);
+		    }
+		    {
+			 V Tb, Td, Ta, Tc;
+			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 30]), Ta);
+			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 10]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T1n = VADD(Tb, Td);
+		    }
+		    {
+			 V Th, Tj, Tg, Ti;
+			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Th = BYTW(&(W[TWVL * 14]), Tg);
+			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tj = BYTW(&(W[TWVL * 34]), Ti);
+			 Tk = VSUB(Th, Tj);
+			 T1r = VADD(Th, Tj);
+		    }
+		    {
+			 V Tw, Ty, Tv, Tx;
+			 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tw = BYTW(&(W[TWVL * 16]), Tv);
+			 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 Ty = BYTW(&(W[TWVL * 36]), Tx);
+			 Tz = VSUB(Tw, Ty);
+			 T1l = VADD(Tw, Ty);
+		    }
+		    {
+			 V TB, TD, TA, TC;
+			 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTW(&(W[0]), TA);
+			 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 TD = BYTW(&(W[TWVL * 20]), TC);
+			 TE = VSUB(TB, TD);
+			 T1o = VADD(TB, TD);
+		    }
+		    {
+			 V Tm, To, Tl, Tn;
+			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tm = BYTW(&(W[TWVL * 22]), Tl);
+			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 To = BYTW(&(W[TWVL * 2]), Tn);
+			 Tp = VSUB(Tm, To);
+			 T1u = VADD(Tm, To);
+		    }
+		    TF = VSUB(Tz, TE);
+		    T14 = VSUB(T9, Te);
+		    T15 = VSUB(Tk, Tp);
+		    TQ = VSUB(TK, TP);
+		    Tf = VADD(T9, Te);
+		    Tq = VADD(Tk, Tp);
+		    Tr = VADD(Tf, Tq);
+		    T1N = VADD(T1r, T1s);
+		    T1O = VADD(T1u, T1v);
+		    T1P = VADD(T1N, T1O);
+		    T1t = VSUB(T1r, T1s);
+		    T1w = VSUB(T1u, T1v);
+		    T1D = VADD(T1t, T1w);
+		    TT = VADD(Tz, TE);
+		    TU = VADD(TK, TP);
+		    T11 = VADD(TT, TU);
+		    T1K = VADD(T1k, T1l);
+		    T1L = VADD(T1n, T1o);
+		    T1M = VADD(T1K, T1L);
+		    T1m = VSUB(T1k, T1l);
+		    T1p = VSUB(T1n, T1o);
+		    T1C = VADD(T1m, T1p);
+	       }
+	       T1i = VADD(T4, Tr);
+	       T1j = VBYI(VADD(T10, T11));
+	       ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
+		    T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
+		    T1S = VADD(T1M, T1P);
+		    T1T = VFNMS(LDK(KP250000000), T1S, T1R);
+		    T1V = VSUB(T1K, T1L);
+		    T1W = VSUB(T1N, T1O);
+		    T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
+		    T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
+		    ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
+		    T1Y = VSUB(T1T, T1Q);
+		    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
+		    T1U = VADD(T1Q, T1T);
+		    ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
+	       }
+	       {
+		    V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
+		    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
+		    T1E = VADD(T1C, T1D);
+		    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
+		    T1q = VSUB(T1m, T1p);
+		    T1x = VSUB(T1t, T1w);
+		    T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
+		    T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
+		    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
+		    T1J = VADD(T1G, T1F);
+		    ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
+		    T1H = VSUB(T1F, T1G);
+		    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
+		    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
+	       }
+	       {
+		    V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
+		    TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
+		    T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
+		    T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
+		    T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
+		    {
+			 V TV, T12, Ts, Tt;
+			 TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
+			 T12 = VFNMS(LDK(KP250000000), T11, T10);
+			 T13 = VSUB(TV, T12);
+			 T1e = VADD(TV, T12);
+			 Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			 Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
+			 Tu = VSUB(Ts, Tt);
+			 T1a = VADD(Tt, Ts);
+		    }
+		    {
+			 V TS, T17, T1g, T1h;
+			 TS = VSUB(Tu, TR);
+			 T17 = VBYI(VSUB(T13, T16));
+			 ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
+			 T1g = VADD(T1a, T1b);
+			 T1h = VBYI(VSUB(T1e, T1d));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T18, T19, T1c, T1f;
+			 T18 = VADD(Tu, TR);
+			 T19 = VBYI(VADD(T16, T13));
+			 ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
+			 T1c = VSUB(T1a, T1b);
+			 T1f = VBYI(VADD(T1d, T1e));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,934 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:15 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 248 FP additions, 241 FP multiplications,
+ * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
+ * 208 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T25, T1B, T2y, T1K, T2s, T23, T1S, T26, T20, T1X;
+	       {
+		    V T1O, T2X, Te, T3L, Td, T3Q, T3j, T3b, T2R, T2M, T2f, T27, T1y, T1H, T3M;
+		    V TW, TR, TK, T2B, T3n, T3e, T2U, T2F, T2i, T2a, Tz, T1C, T3N, TQ, T11;
+		    V T1b, T1c, T16;
+		    {
+			 V T1, T1g, T1i, T1p, T1k, T1m, Tb, T1N, T6, T1M;
+			 {
+			      V T7, T9, T2, T4, T1f, T1h, T1o;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      {
+				   V T8, Ta, T3, T5, T1j;
+				   T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+				   T8 = BYTW(&(W[TWVL * 18]), T7);
+				   Ta = BYTW(&(W[TWVL * 28]), T9);
+				   T3 = BYTW(&(W[TWVL * 8]), T2);
+				   T5 = BYTW(&(W[TWVL * 38]), T4);
+				   T1g = BYTW(&(W[TWVL * 4]), T1f);
+				   T1i = BYTW(&(W[TWVL * 14]), T1h);
+				   T1p = BYTW(&(W[TWVL * 34]), T1o);
+				   T1k = BYTW(&(W[TWVL * 44]), T1j);
+				   T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   Tb = VADD(T8, Ta);
+				   T1N = VSUB(T8, Ta);
+				   T6 = VADD(T3, T5);
+				   T1M = VSUB(T3, T5);
+			      }
+			 }
+			 {
+			      V T1v, T1l, Th, Tj, T1w, T1q, Tq, Tk, Tn, Tg;
+			      Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V Tc, Ti, T1n, Tp;
+				   Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1v = VSUB(T1i, T1k);
+				   T1l = VADD(T1i, T1k);
+				   T1n = BYTW(&(W[TWVL * 24]), T1m);
+				   Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1O = VFMA(LDK(KP618033988), T1N, T1M);
+				   T2X = VFNMS(LDK(KP618033988), T1M, T1N);
+				   Te = VSUB(T6, Tb);
+				   Tc = VADD(T6, Tb);
+				   Th = BYTW(&(W[0]), Tg);
+				   Tj = BYTW(&(W[TWVL * 10]), Ti);
+				   T1w = VSUB(T1n, T1p);
+				   T1q = VADD(T1n, T1p);
+				   Tq = BYTW(&(W[TWVL * 30]), Tp);
+				   Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T3L = VADD(T1, Tc);
+				   Td = VFNMS(LDK(KP250000000), Tc, T1);
+				   Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      {
+				   V T1x, T2K, TM, TB, Tw, Tm, Tx, Tr, TI, T2L, T1u, TD, TF, TL;
+				   TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   {
+					V T1t, Tl, To, TH, T1s, T1r, TA, TC;
+					TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					T1r = VADD(T1l, T1q);
+					T1t = VSUB(T1q, T1l);
+					T1x = VFMA(LDK(KP618033988), T1w, T1v);
+					T2K = VFNMS(LDK(KP618033988), T1v, T1w);
+					Tl = BYTW(&(W[TWVL * 40]), Tk);
+					To = BYTW(&(W[TWVL * 20]), Tn);
+					TM = BYTW(&(W[TWVL * 6]), TL);
+					TB = BYTW(&(W[TWVL * 46]), TA);
+					TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T1s = VFNMS(LDK(KP250000000), T1r, T1g);
+					T3Q = VADD(T1g, T1r);
+					TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					Tw = VSUB(Tj, Tl);
+					Tm = VADD(Tj, Tl);
+					Tx = VSUB(Tq, To);
+					Tr = VADD(To, Tq);
+					TI = BYTW(&(W[TWVL * 26]), TH);
+					T2L = VFMA(LDK(KP559016994), T1t, T1s);
+					T1u = VFNMS(LDK(KP559016994), T1t, T1s);
+					TD = BYTW(&(W[TWVL * 16]), TC);
+					TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tu, Ty, T2E, TE, TN, TG, Tt, TV, Ts;
+					TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					Ts = VADD(Tm, Tr);
+					Tu = VSUB(Tm, Tr);
+					Ty = VFNMS(LDK(KP618033988), Tx, Tw);
+					T2E = VFMA(LDK(KP618033988), Tw, Tx);
+					T3j = VFNMS(LDK(KP059835404), T2K, T2L);
+					T3b = VFMA(LDK(KP066152395), T2L, T2K);
+					T2R = VFNMS(LDK(KP786782374), T2K, T2L);
+					T2M = VFMA(LDK(KP869845200), T2L, T2K);
+					T2f = VFMA(LDK(KP132830569), T1u, T1x);
+					T27 = VFNMS(LDK(KP120146378), T1x, T1u);
+					T1y = VFNMS(LDK(KP893101515), T1x, T1u);
+					T1H = VFMA(LDK(KP987388751), T1u, T1x);
+					TE = VSUB(TB, TD);
+					TN = VADD(TD, TB);
+					TG = BYTW(&(W[TWVL * 36]), TF);
+					Tt = VFNMS(LDK(KP250000000), Ts, Th);
+					T3M = VADD(Th, Ts);
+					TW = BYTW(&(W[TWVL * 2]), TV);
+					{
+					     V TJ, TO, Tv, T2D, TY, T15, T10, T13, TP;
+					     {
+						  V TX, T14, TZ, T12;
+						  TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+						  TJ = VSUB(TG, TI);
+						  TO = VADD(TI, TG);
+						  Tv = VFMA(LDK(KP559016994), Tu, Tt);
+						  T2D = VFNMS(LDK(KP559016994), Tu, Tt);
+						  TY = BYTW(&(W[TWVL * 12]), TX);
+						  T15 = BYTW(&(W[TWVL * 32]), T14);
+						  T10 = BYTW(&(W[TWVL * 42]), TZ);
+						  T13 = BYTW(&(W[TWVL * 22]), T12);
+					     }
+					     TP = VADD(TN, TO);
+					     TR = VSUB(TN, TO);
+					     TK = VFMA(LDK(KP618033988), TJ, TE);
+					     T2B = VFNMS(LDK(KP618033988), TE, TJ);
+					     T3n = VFMA(LDK(KP578046249), T2D, T2E);
+					     T3e = VFNMS(LDK(KP522847744), T2E, T2D);
+					     T2U = VFNMS(LDK(KP987388751), T2D, T2E);
+					     T2F = VFMA(LDK(KP893101515), T2E, T2D);
+					     T2i = VFNMS(LDK(KP603558818), Ty, Tv);
+					     T2a = VFMA(LDK(KP667278218), Tv, Ty);
+					     Tz = VFNMS(LDK(KP244189809), Ty, Tv);
+					     T1C = VFMA(LDK(KP269969613), Tv, Ty);
+					     T3N = VADD(TM, TP);
+					     TQ = VFMS(LDK(KP250000000), TP, TM);
+					     T11 = VADD(TY, T10);
+					     T1b = VSUB(TY, T10);
+					     T1c = VSUB(T15, T13);
+					     T16 = VADD(T13, T15);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2z, Tf, T3W, T3O, T1d, T2H, T3m, T2j, T2b, TT, T1D, T2G, T35, T2V, T2Z;
+			 V T3A, T3g, T2I, T1a, T3R, T3X;
+			 T2z = VFNMS(LDK(KP559016994), Te, Td);
+			 Tf = VFMA(LDK(KP559016994), Te, Td);
+			 {
+			      V TS, T2A, T17, T19;
+			      TS = VFNMS(LDK(KP559016994), TR, TQ);
+			      T2A = VFMA(LDK(KP559016994), TR, TQ);
+			      T3W = VSUB(T3M, T3N);
+			      T3O = VADD(T3M, T3N);
+			      T1d = VFNMS(LDK(KP618033988), T1c, T1b);
+			      T2H = VFMA(LDK(KP618033988), T1b, T1c);
+			      T17 = VADD(T11, T16);
+			      T19 = VSUB(T16, T11);
+			      {
+				   V T3f, T2T, T2C, T18, T3P;
+				   T3m = VFMA(LDK(KP447533225), T2B, T2A);
+				   T3f = VFNMS(LDK(KP494780565), T2A, T2B);
+				   T2T = VFNMS(LDK(KP132830569), T2A, T2B);
+				   T2C = VFMA(LDK(KP120146378), T2B, T2A);
+				   T2j = VFNMS(LDK(KP786782374), TK, TS);
+				   T2b = VFMA(LDK(KP869845200), TS, TK);
+				   TT = VFNMS(LDK(KP667278218), TS, TK);
+				   T1D = VFMA(LDK(KP603558818), TK, TS);
+				   T18 = VFNMS(LDK(KP250000000), T17, TW);
+				   T3P = VADD(TW, T17);
+				   T2G = VFMA(LDK(KP734762448), T2F, T2C);
+				   T35 = VFNMS(LDK(KP734762448), T2F, T2C);
+				   T2V = VFNMS(LDK(KP734762448), T2U, T2T);
+				   T2Z = VFMA(LDK(KP734762448), T2U, T2T);
+				   T3A = VFMA(LDK(KP982009705), T3f, T3e);
+				   T3g = VFNMS(LDK(KP982009705), T3f, T3e);
+				   T2I = VFMA(LDK(KP559016994), T19, T18);
+				   T1a = VFNMS(LDK(KP559016994), T19, T18);
+				   T3R = VADD(T3P, T3Q);
+				   T3X = VSUB(T3P, T3Q);
+			      }
+			 }
+			 {
+			      V T2n, T2t, T1V, T22, T2l, T2d, T1Q, T1I, T2w, T1A, T1F, T2q;
+			      {
+				   V T2k, T1G, T28, T2g, T3K, T3E, T3a, T34, T3x, T3H, T2c, TU, T1T, T1U, T1z;
+				   V T3o, T3t;
+				   T2n = VFNMS(LDK(KP912575812), T2j, T2i);
+				   T2k = VFMA(LDK(KP912575812), T2j, T2i);
+				   T3o = VFNMS(LDK(KP921078979), T3n, T3m);
+				   T3t = VFMA(LDK(KP921078979), T3n, T3m);
+				   {
+					V T3c, T2Q, T2J, T3k, T1e;
+					T3c = VFNMS(LDK(KP667278218), T2I, T2H);
+					T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
+					T2J = VFMA(LDK(KP066152395), T2I, T2H);
+					T3k = VFMA(LDK(KP603558818), T2H, T2I);
+					T1G = VFMA(LDK(KP578046249), T1a, T1d);
+					T1e = VFNMS(LDK(KP522847744), T1d, T1a);
+					T28 = VFNMS(LDK(KP494780565), T1a, T1d);
+					T2g = VFMA(LDK(KP447533225), T1d, T1a);
+					{
+					     V T3U, T3S, T40, T3Y;
+					     T3U = VSUB(T3O, T3R);
+					     T3S = VADD(T3O, T3R);
+					     T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
+					     T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
+					     {
+						  V T3s, T3l, T2N, T36;
+						  T3s = VFNMS(LDK(KP845997307), T3k, T3j);
+						  T3l = VFMA(LDK(KP845997307), T3k, T3j);
+						  T2N = VFNMS(LDK(KP772036680), T2M, T2J);
+						  T36 = VFMA(LDK(KP772036680), T2M, T2J);
+						  {
+						       V T30, T2S, T3d, T3z, T3T;
+						       T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
+						       T2S = VFMA(LDK(KP772036680), T2R, T2Q);
+						       T3d = VFNMS(LDK(KP845997307), T3c, T3b);
+						       T3z = VFMA(LDK(KP845997307), T3c, T3b);
+						       ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
+						       T3T = VFNMS(LDK(KP250000000), T3S, T3L);
+						       {
+							    V T3C, T3p, T2O, T37;
+							    T3C = VFMA(LDK(KP906616052), T3o, T3l);
+							    T3p = VFNMS(LDK(KP906616052), T3o, T3l);
+							    T2O = VFMA(LDK(KP956723877), T2N, T2G);
+							    T37 = VFMA(LDK(KP522616830), T2V, T36);
+							    {
+								 V T31, T2W, T3u, T3h;
+								 T31 = VFNMS(LDK(KP522616830), T2G, T30);
+								 T2W = VFMA(LDK(KP945422727), T2V, T2S);
+								 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
+								 T3h = VFMA(LDK(KP923225144), T3g, T3d);
+								 {
+								      V T3I, T3B, T3V, T3Z;
+								      T3I = VFNMS(LDK(KP669429328), T3z, T3A);
+								      T3B = VFMA(LDK(KP570584518), T3A, T3z);
+								      T3V = VFMA(LDK(KP559016994), T3U, T3T);
+								      T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
+								      {
+									   V T3y, T3q, T2P, T38;
+									   T3y = VFMA(LDK(KP262346850), T3p, T2X);
+									   T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
+									   T2P = VFMA(LDK(KP992114701), T2O, T2z);
+									   T38 = VFNMS(LDK(KP690983005), T37, T2S);
+									   {
+										V T32, T2Y, T3v, T3F;
+										T32 = VFMA(LDK(KP763932022), T31, T2N);
+										T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
+										T3v = VFNMS(LDK(KP997675361), T3u, T3t);
+										T3F = VFNMS(LDK(KP904508497), T3u, T3s);
+										{
+										     V T3i, T3r, T3J, T3D;
+										     T3i = VFMA(LDK(KP949179823), T3h, T2z);
+										     T3r = VFNMS(LDK(KP237294955), T3h, T2z);
+										     T3J = VFNMS(LDK(KP669429328), T3C, T3I);
+										     T3D = VFMA(LDK(KP618033988), T3C, T3B);
+										     ST(&(x[WS(rs, 20)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
+										     ST(&(x[WS(rs, 5)]), VFMAI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 15)]), VFMAI(T40, T3Z), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 10)]), VFNMSI(T40, T3Z), ms, &(x[0]));
+										     {
+											  V T39, T33, T3w, T3G;
+											  T39 = VFMA(LDK(KP855719849), T38, T35);
+											  T33 = VFNMS(LDK(KP855719849), T32, T2Z);
+											  ST(&(x[WS(rs, 3)]), VFMAI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
+											  ST(&(x[WS(rs, 22)]), VFNMSI(T2Y, T2P), ms, &(x[0]));
+											  T3w = VFMA(LDK(KP560319534), T3v, T3s);
+											  T3G = VFNMS(LDK(KP681693190), T3F, T3t);
+											  ST(&(x[WS(rs, 2)]), VFMAI(T3q, T3i), ms, &(x[0]));
+											  ST(&(x[WS(rs, 23)]), VFNMSI(T3q, T3i), ms, &(x[WS(rs, 1)]));
+											  T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
+											  T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
+											  T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
+											  T34 = VFMA(LDK(KP897376177), T33, T2z);
+											  T3x = VFNMS(LDK(KP949179823), T3w, T3r);
+											  T3H = VFNMS(LDK(KP860541664), T3G, T3r);
+											  T2t = VFNMS(LDK(KP912575812), T2b, T2a);
+											  T2c = VFMA(LDK(KP912575812), T2b, T2a);
+											  TU = VFMA(LDK(KP829049696), TT, Tz);
+											  T1T = VFNMS(LDK(KP829049696), TT, Tz);
+											  T1U = VFNMS(LDK(KP831864738), T1y, T1e);
+											  T1z = VFMA(LDK(KP831864738), T1y, T1e);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					V T2o, T2h, T29, T2u, T2v, T2p;
+					T2o = VFNMS(LDK(KP958953096), T2g, T2f);
+					T2h = VFMA(LDK(KP958953096), T2g, T2f);
+					ST(&(x[WS(rs, 17)]), VFNMSI(T3a, T34), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 8)]), VFMAI(T3a, T34), ms, &(x[0]));
+					ST(&(x[WS(rs, 13)]), VFMAI(T3E, T3x), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(T3E, T3x), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(T3K, T3H), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3K, T3H), ms, &(x[0]));
+					T1V = VFMA(LDK(KP559154169), T1U, T1T);
+					T22 = VFNMS(LDK(KP683113946), T1T, T1U);
+					T29 = VFNMS(LDK(KP867381224), T28, T27);
+					T2u = VFMA(LDK(KP867381224), T28, T27);
+					T2l = VFMA(LDK(KP894834959), T2k, T2h);
+					T2v = VFMA(LDK(KP447417479), T2k, T2u);
+					T2d = VFNMS(LDK(KP809385824), T2c, T29);
+					T2p = VFMA(LDK(KP447417479), T2c, T2o);
+					T1Q = VFMA(LDK(KP831864738), T1H, T1G);
+					T1I = VFNMS(LDK(KP831864738), T1H, T1G);
+					T2w = VFNMS(LDK(KP763932022), T2v, T2h);
+					T1A = VFMA(LDK(KP904730450), T1z, TU);
+					T1F = VFNMS(LDK(KP904730450), T1z, TU);
+					T2q = VFMA(LDK(KP690983005), T2p, T29);
+				   }
+			      }
+			      {
+				   V T2e, T1E, T1P, T2m;
+				   T2e = VFNMS(LDK(KP992114701), T2d, Tf);
+				   T1E = VFMA(LDK(KP916574801), T1D, T1C);
+				   T1P = VFNMS(LDK(KP916574801), T1D, T1C);
+				   T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
+				   {
+					V T1J, T2r, T1R, T1W, T1Z, T2x;
+					T2x = VFNMS(LDK(KP999544308), T2w, T2t);
+					T1J = VFNMS(LDK(KP904730450), T1I, T1F);
+					T25 = VFMA(LDK(KP968583161), T1A, Tf);
+					T1B = VFNMS(LDK(KP242145790), T1A, Tf);
+					T2r = VFNMS(LDK(KP999544308), T2q, T2n);
+					T1R = VFMA(LDK(KP904730450), T1Q, T1P);
+					T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
+					T1Z = VADD(T1E, T1F);
+					ST(&(x[WS(rs, 21)]), VFMAI(T2m, T2e), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFNMSI(T2m, T2e), ms, &(x[0]));
+					T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
+					T1K = VFNMS(LDK(KP618033988), T1J, T1E);
+					T2s = VFNMS(LDK(KP803003575), T2r, Tf);
+					T23 = VFMA(LDK(KP617882369), T1W, T22);
+					T1S = VFNMS(LDK(KP242145790), T1R, T1O);
+					T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
+					T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
+					T1X = VFMA(LDK(KP559016994), T1W, T1V);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1L, T24, T21, T1Y;
+		    T1L = VFNMS(LDK(KP876091699), T1K, T1B);
+		    ST(&(x[WS(rs, 16)]), VFMAI(T2y, T2s), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VFNMSI(T2y, T2s), ms, &(x[WS(rs, 1)]));
+		    T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
+		    ST(&(x[WS(rs, 24)]), VFNMSI(T26, T25), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T26, T25), ms, &(x[WS(rs, 1)]));
+		    T21 = VFMA(LDK(KP792626838), T20, T1B);
+		    T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
+		    ST(&(x[WS(rs, 11)]), VFMAI(T24, T21), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFNMSI(T24, T21), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFMAI(T1Y, T1L), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 248 FP additions, 188 FP multiplications,
+ * (or, 171 additions, 111 multiplications, 77 fused multiply/add),
+ * 100 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T1A, T1z, T1R, T1S, T1B, T1C, T1Q, T2L, T1l, T2v, T1i, T3e, T2u, Tb, T2i;
+	       V Tj, T3b, T2h, Tv, T2k, TD, T3a, T2l, T11, T2s, TY, T3d, T2r;
+	       {
+		    V T1v, T1x, T1y, T1q, T1s, T1t, T1P;
+		    T1A = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T1u, T1w, T1p, T1r;
+			 T1u = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T1v = BYTW(&(W[TWVL * 18]), T1u);
+			 T1w = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T1x = BYTW(&(W[TWVL * 28]), T1w);
+			 T1y = VADD(T1v, T1x);
+			 T1p = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1q = BYTW(&(W[TWVL * 8]), T1p);
+			 T1r = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T1s = BYTW(&(W[TWVL * 38]), T1r);
+			 T1t = VADD(T1q, T1s);
+		    }
+		    T1z = VMUL(LDK(KP559016994), VSUB(T1t, T1y));
+		    T1R = VSUB(T1v, T1x);
+		    T1S = VMUL(LDK(KP587785252), T1R);
+		    T1B = VADD(T1t, T1y);
+		    T1C = VFNMS(LDK(KP250000000), T1B, T1A);
+		    T1P = VSUB(T1q, T1s);
+		    T1Q = VMUL(LDK(KP951056516), T1P);
+		    T2L = VMUL(LDK(KP587785252), T1P);
+	       }
+	       {
+		    V T1f, T19, T1b, T1c, T14, T16, T17, T1e;
+		    T1e = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T1f = BYTW(&(W[TWVL * 4]), T1e);
+		    {
+			 V T18, T1a, T13, T15;
+			 T18 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T19 = BYTW(&(W[TWVL * 24]), T18);
+			 T1a = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1b = BYTW(&(W[TWVL * 34]), T1a);
+			 T1c = VADD(T19, T1b);
+			 T13 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T14 = BYTW(&(W[TWVL * 14]), T13);
+			 T15 = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T16 = BYTW(&(W[TWVL * 44]), T15);
+			 T17 = VADD(T14, T16);
+		    }
+		    {
+			 V T1j, T1k, T1d, T1g, T1h;
+			 T1j = VSUB(T14, T16);
+			 T1k = VSUB(T19, T1b);
+			 T1l = VFMA(LDK(KP475528258), T1j, VMUL(LDK(KP293892626), T1k));
+			 T2v = VFNMS(LDK(KP475528258), T1k, VMUL(LDK(KP293892626), T1j));
+			 T1d = VMUL(LDK(KP559016994), VSUB(T17, T1c));
+			 T1g = VADD(T17, T1c);
+			 T1h = VFNMS(LDK(KP250000000), T1g, T1f);
+			 T1i = VADD(T1d, T1h);
+			 T3e = VADD(T1f, T1g);
+			 T2u = VSUB(T1h, T1d);
+		    }
+	       }
+	       {
+		    V Tg, T7, T9, Td, T2, T4, Tc, Tf;
+		    Tf = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tg = BYTW(&(W[TWVL * 6]), Tf);
+		    {
+			 V T6, T8, T1, T3;
+			 T6 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 26]), T6);
+			 T8 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 36]), T8);
+			 Td = VADD(T7, T9);
+			 T1 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[TWVL * 16]), T1);
+			 T3 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 T4 = BYTW(&(W[TWVL * 46]), T3);
+			 Tc = VADD(T2, T4);
+		    }
+		    {
+			 V T5, Ta, Te, Th, Ti;
+			 T5 = VSUB(T2, T4);
+			 Ta = VSUB(T7, T9);
+			 Tb = VFMA(LDK(KP475528258), T5, VMUL(LDK(KP293892626), Ta));
+			 T2i = VFNMS(LDK(KP475528258), Ta, VMUL(LDK(KP293892626), T5));
+			 Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
+			 Th = VADD(Tc, Td);
+			 Ti = VFNMS(LDK(KP250000000), Th, Tg);
+			 Tj = VADD(Te, Ti);
+			 T3b = VADD(Tg, Th);
+			 T2h = VSUB(Ti, Te);
+		    }
+	       }
+	       {
+		    V TA, Tr, Tt, Tx, Tm, To, Tw, Tz;
+		    Tz = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    TA = BYTW(&(W[0]), Tz);
+		    {
+			 V Tq, Ts, Tl, Tn;
+			 Tq = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tr = BYTW(&(W[TWVL * 20]), Tq);
+			 Ts = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 30]), Ts);
+			 Tx = VADD(Tr, Tt);
+			 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tm = BYTW(&(W[TWVL * 10]), Tl);
+			 Tn = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 To = BYTW(&(W[TWVL * 40]), Tn);
+			 Tw = VADD(Tm, To);
+		    }
+		    {
+			 V Tp, Tu, Ty, TB, TC;
+			 Tp = VSUB(Tm, To);
+			 Tu = VSUB(Tr, Tt);
+			 Tv = VFMA(LDK(KP475528258), Tp, VMUL(LDK(KP293892626), Tu));
+			 T2k = VFNMS(LDK(KP475528258), Tu, VMUL(LDK(KP293892626), Tp));
+			 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
+			 TB = VADD(Tw, Tx);
+			 TC = VFNMS(LDK(KP250000000), TB, TA);
+			 TD = VADD(Ty, TC);
+			 T3a = VADD(TA, TB);
+			 T2l = VSUB(TC, Ty);
+		    }
+	       }
+	       {
+		    V TV, TP, TR, TS, TK, TM, TN, TU;
+		    TU = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    TV = BYTW(&(W[TWVL * 2]), TU);
+		    {
+			 V TO, TQ, TJ, TL;
+			 TO = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 TP = BYTW(&(W[TWVL * 22]), TO);
+			 TQ = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTW(&(W[TWVL * 32]), TQ);
+			 TS = VADD(TP, TR);
+			 TJ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TK = BYTW(&(W[TWVL * 12]), TJ);
+			 TL = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TM = BYTW(&(W[TWVL * 42]), TL);
+			 TN = VADD(TK, TM);
+		    }
+		    {
+			 V TZ, T10, TT, TW, TX;
+			 TZ = VSUB(TK, TM);
+			 T10 = VSUB(TP, TR);
+			 T11 = VFMA(LDK(KP475528258), TZ, VMUL(LDK(KP293892626), T10));
+			 T2s = VFNMS(LDK(KP475528258), T10, VMUL(LDK(KP293892626), TZ));
+			 TT = VMUL(LDK(KP559016994), VSUB(TN, TS));
+			 TW = VADD(TN, TS);
+			 TX = VFNMS(LDK(KP250000000), TW, TV);
+			 TY = VADD(TT, TX);
+			 T3d = VADD(TV, TW);
+			 T2r = VSUB(TX, TT);
+		    }
+	       }
+	       {
+		    V T3g, T3o, T3k, T3l, T3j, T3m, T3p, T3n;
+		    {
+			 V T3c, T3f, T3h, T3i;
+			 T3c = VSUB(T3a, T3b);
+			 T3f = VSUB(T3d, T3e);
+			 T3g = VBYI(VFMA(LDK(KP951056516), T3c, VMUL(LDK(KP587785252), T3f)));
+			 T3o = VBYI(VFNMS(LDK(KP951056516), T3f, VMUL(LDK(KP587785252), T3c)));
+			 T3k = VADD(T1A, T1B);
+			 T3h = VADD(T3a, T3b);
+			 T3i = VADD(T3d, T3e);
+			 T3l = VADD(T3h, T3i);
+			 T3j = VMUL(LDK(KP559016994), VSUB(T3h, T3i));
+			 T3m = VFNMS(LDK(KP250000000), T3l, T3k);
+		    }
+		    ST(&(x[0]), VADD(T3k, T3l), ms, &(x[0]));
+		    T3p = VSUB(T3m, T3j);
+		    ST(&(x[WS(rs, 10)]), VADD(T3o, T3p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 15)]), VSUB(T3p, T3o), ms, &(x[WS(rs, 1)]));
+		    T3n = VADD(T3j, T3m);
+		    ST(&(x[WS(rs, 5)]), VADD(T3g, T3n), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 20)]), VSUB(T3n, T3g), ms, &(x[0]));
+	       }
+	       {
+		    V T2z, T2M, T2U, T2V, T2W, T34, T35, T36, T2X, T2Y, T2Z, T31, T32, T33, T2n;
+		    V T2N, T2E, T2K, T2y, T2H, T2A, T2G, T38, T39;
+		    T2z = VSUB(T1C, T1z);
+		    T2M = VFNMS(LDK(KP951056516), T1R, T2L);
+		    T2U = VFMA(LDK(KP1_369094211), T2k, VMUL(LDK(KP728968627), T2l));
+		    T2V = VFNMS(LDK(KP992114701), T2h, VMUL(LDK(KP250666467), T2i));
+		    T2W = VADD(T2U, T2V);
+		    T34 = VFNMS(LDK(KP125581039), T2s, VMUL(LDK(KP998026728), T2r));
+		    T35 = VFMA(LDK(KP1_274847979), T2v, VMUL(LDK(KP770513242), T2u));
+		    T36 = VADD(T34, T35);
+		    T2X = VFMA(LDK(KP1_996053456), T2s, VMUL(LDK(KP062790519), T2r));
+		    T2Y = VFNMS(LDK(KP637423989), T2u, VMUL(LDK(KP1_541026485), T2v));
+		    T2Z = VADD(T2X, T2Y);
+		    T31 = VFNMS(LDK(KP1_457937254), T2k, VMUL(LDK(KP684547105), T2l));
+		    T32 = VFMA(LDK(KP1_984229402), T2i, VMUL(LDK(KP125333233), T2h));
+		    T33 = VADD(T31, T32);
+		    {
+			 V T2j, T2m, T2I, T2C, T2D, T2J;
+			 T2j = VFNMS(LDK(KP851558583), T2i, VMUL(LDK(KP904827052), T2h));
+			 T2m = VFMA(LDK(KP1_752613360), T2k, VMUL(LDK(KP481753674), T2l));
+			 T2I = VADD(T2m, T2j);
+			 T2C = VFMA(LDK(KP1_071653589), T2s, VMUL(LDK(KP844327925), T2r));
+			 T2D = VFMA(LDK(KP125581039), T2v, VMUL(LDK(KP998026728), T2u));
+			 T2J = VADD(T2C, T2D);
+			 T2n = VSUB(T2j, T2m);
+			 T2N = VADD(T2I, T2J);
+			 T2E = VSUB(T2C, T2D);
+			 T2K = VMUL(LDK(KP559016994), VSUB(T2I, T2J));
+		    }
+		    {
+			 V T2o, T2p, T2q, T2t, T2w, T2x;
+			 T2o = VFNMS(LDK(KP963507348), T2k, VMUL(LDK(KP876306680), T2l));
+			 T2p = VFMA(LDK(KP1_809654104), T2i, VMUL(LDK(KP425779291), T2h));
+			 T2q = VSUB(T2o, T2p);
+			 T2t = VFNMS(LDK(KP1_688655851), T2s, VMUL(LDK(KP535826794), T2r));
+			 T2w = VFNMS(LDK(KP1_996053456), T2v, VMUL(LDK(KP062790519), T2u));
+			 T2x = VADD(T2t, T2w);
+			 T2y = VMUL(LDK(KP559016994), VSUB(T2q, T2x));
+			 T2H = VSUB(T2t, T2w);
+			 T2A = VADD(T2q, T2x);
+			 T2G = VADD(T2o, T2p);
+		    }
+		    {
+			 V T2S, T2T, T30, T37;
+			 T2S = VADD(T2z, T2A);
+			 T2T = VBYI(VADD(T2M, T2N));
+			 ST(&(x[WS(rs, 23)]), VSUB(T2S, T2T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 2)]), VADD(T2S, T2T), ms, &(x[0]));
+			 T30 = VADD(T2z, VADD(T2W, T2Z));
+			 T37 = VBYI(VSUB(VADD(T33, T36), T2M));
+			 ST(&(x[WS(rs, 22)]), VSUB(T30, T37), ms, &(x[0]));
+			 ST(&(x[WS(rs, 3)]), VADD(T30, T37), ms, &(x[WS(rs, 1)]));
+		    }
+		    T38 = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T2U, T2V), VFMA(LDK(KP309016994), T33, VFNMS(LDK(KP809016994), T36, VMUL(LDK(KP587785252), VSUB(T2X, T2Y))))), T2M));
+		    T39 = VFMA(LDK(KP309016994), T2W, VFMA(LDK(KP951056516), VSUB(T32, T31), VFMA(LDK(KP587785252), VSUB(T35, T34), VFNMS(LDK(KP809016994), T2Z, T2z))));
+		    ST(&(x[WS(rs, 8)]), VADD(T38, T39), ms, &(x[0]));
+		    ST(&(x[WS(rs, 17)]), VSUB(T39, T38), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T2F, T2Q, T2P, T2R, T2B, T2O;
+			 T2B = VFNMS(LDK(KP250000000), T2A, T2z);
+			 T2F = VFMA(LDK(KP951056516), T2n, VADD(T2y, VFNMS(LDK(KP587785252), T2E, T2B)));
+			 T2Q = VFMA(LDK(KP587785252), T2n, VFMA(LDK(KP951056516), T2E, VSUB(T2B, T2y)));
+			 T2O = VFNMS(LDK(KP250000000), T2N, T2M);
+			 T2P = VBYI(VADD(VFMA(LDK(KP951056516), T2G, VMUL(LDK(KP587785252), T2H)), VADD(T2K, T2O)));
+			 T2R = VBYI(VADD(VFNMS(LDK(KP951056516), T2H, VMUL(LDK(KP587785252), T2G)), VSUB(T2O, T2K)));
+			 ST(&(x[WS(rs, 18)]), VSUB(T2F, T2P), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2Q, T2R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 7)]), VADD(T2F, T2P), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VSUB(T2Q, T2R), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1D, T1T, T21, T22, T23, T2b, T2c, T2d, T24, T25, T26, T28, T29, T2a, TF;
+		    V T1U, T1I, T1O, T1o, T1L, T1E, T1K, T2f, T2g;
+		    T1D = VADD(T1z, T1C);
+		    T1T = VADD(T1Q, T1S);
+		    T21 = VFMA(LDK(KP1_688655851), Tv, VMUL(LDK(KP535826794), TD));
+		    T22 = VFMA(LDK(KP1_541026485), Tb, VMUL(LDK(KP637423989), Tj));
+		    T23 = VSUB(T21, T22);
+		    T2b = VFMA(LDK(KP851558583), T11, VMUL(LDK(KP904827052), TY));
+		    T2c = VFMA(LDK(KP1_984229402), T1l, VMUL(LDK(KP125333233), T1i));
+		    T2d = VADD(T2b, T2c);
+		    T24 = VFNMS(LDK(KP425779291), TY, VMUL(LDK(KP1_809654104), T11));
+		    T25 = VFNMS(LDK(KP992114701), T1i, VMUL(LDK(KP250666467), T1l));
+		    T26 = VADD(T24, T25);
+		    T28 = VFNMS(LDK(KP1_071653589), Tv, VMUL(LDK(KP844327925), TD));
+		    T29 = VFNMS(LDK(KP770513242), Tj, VMUL(LDK(KP1_274847979), Tb));
+		    T2a = VADD(T28, T29);
+		    {
+			 V Tk, TE, T1M, T1G, T1H, T1N;
+			 Tk = VFMA(LDK(KP1_071653589), Tb, VMUL(LDK(KP844327925), Tj));
+			 TE = VFMA(LDK(KP1_937166322), Tv, VMUL(LDK(KP248689887), TD));
+			 T1M = VADD(TE, Tk);
+			 T1G = VFMA(LDK(KP1_752613360), T11, VMUL(LDK(KP481753674), TY));
+			 T1H = VFMA(LDK(KP1_457937254), T1l, VMUL(LDK(KP684547105), T1i));
+			 T1N = VADD(T1G, T1H);
+			 TF = VSUB(Tk, TE);
+			 T1U = VADD(T1M, T1N);
+			 T1I = VSUB(T1G, T1H);
+			 T1O = VMUL(LDK(KP559016994), VSUB(T1M, T1N));
+		    }
+		    {
+			 V TG, TH, TI, T12, T1m, T1n;
+			 TG = VFNMS(LDK(KP497379774), Tv, VMUL(LDK(KP968583161), TD));
+			 TH = VFNMS(LDK(KP1_688655851), Tb, VMUL(LDK(KP535826794), Tj));
+			 TI = VADD(TG, TH);
+			 T12 = VFNMS(LDK(KP963507348), T11, VMUL(LDK(KP876306680), TY));
+			 T1m = VFNMS(LDK(KP1_369094211), T1l, VMUL(LDK(KP728968627), T1i));
+			 T1n = VADD(T12, T1m);
+			 T1o = VMUL(LDK(KP559016994), VSUB(TI, T1n));
+			 T1L = VSUB(T12, T1m);
+			 T1E = VADD(TI, T1n);
+			 T1K = VSUB(TG, TH);
+		    }
+		    {
+			 V T1Z, T20, T27, T2e;
+			 T1Z = VADD(T1D, T1E);
+			 T20 = VBYI(VADD(T1T, T1U));
+			 ST(&(x[WS(rs, 24)]), VSUB(T1Z, T20), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1Z, T20), ms, &(x[WS(rs, 1)]));
+			 T27 = VADD(T1D, VADD(T23, T26));
+			 T2e = VBYI(VSUB(VADD(T2a, T2d), T1T));
+			 ST(&(x[WS(rs, 21)]), VSUB(T27, T2e), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(T27, T2e), ms, &(x[0]));
+		    }
+		    T2f = VBYI(VSUB(VFMA(LDK(KP309016994), T2a, VFMA(LDK(KP951056516), VADD(T21, T22), VFNMS(LDK(KP809016994), T2d, VMUL(LDK(KP587785252), VSUB(T24, T25))))), T1T));
+		    T2g = VFMA(LDK(KP951056516), VSUB(T29, T28), VFMA(LDK(KP309016994), T23, VFMA(LDK(KP587785252), VSUB(T2c, T2b), VFNMS(LDK(KP809016994), T26, T1D))));
+		    ST(&(x[WS(rs, 9)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T2g, T2f), ms, &(x[0]));
+		    {
+			 V T1J, T1X, T1W, T1Y, T1F, T1V;
+			 T1F = VFNMS(LDK(KP250000000), T1E, T1D);
+			 T1J = VFMA(LDK(KP951056516), TF, VADD(T1o, VFNMS(LDK(KP587785252), T1I, T1F)));
+			 T1X = VFMA(LDK(KP587785252), TF, VFMA(LDK(KP951056516), T1I, VSUB(T1F, T1o)));
+			 T1V = VFNMS(LDK(KP250000000), T1U, T1T);
+			 T1W = VBYI(VADD(VFMA(LDK(KP951056516), T1K, VMUL(LDK(KP587785252), T1L)), VADD(T1O, T1V)));
+			 T1Y = VBYI(VADD(VFNMS(LDK(KP951056516), T1L, VMUL(LDK(KP587785252), T1K)), VSUB(T1V, T1O)));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1J, T1W), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T1X, T1Y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 6)]), VADD(T1J, T1W), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T1X, T1Y), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {171, 111, 77, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,865 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:10 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 217 FP additions, 160 FP multiplications,
+ * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
+ * 104 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T26, T25, T2a, T2i, T24, T2c, T2g, T2k, T2h, T27;
+	       {
+		    V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2O, T1D, TC, T33, T2L, T1C;
+		    V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
+		    V T10, T2u;
+		    {
+			 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
+			 {
+			      V T1, T1x, T2, T1v;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V T5, Tc, T7, Ta, T2m, T2n;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   {
+					V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
+					Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T1y = BYTW(&(W[TWVL * 46]), T1x);
+					T3 = BYTW(&(W[TWVL * 30]), T2);
+					T1w = BYTW(&(W[TWVL * 14]), T1v);
+					T6 = BYTW(&(W[TWVL * 6]), T5);
+					Td = BYTW(&(W[TWVL * 22]), Tc);
+					T8 = BYTW(&(W[TWVL * 38]), T7);
+					Tb = BYTW(&(W[TWVL * 54]), Ta);
+					Tt = BYTW(&(W[TWVL * 58]), Ts);
+					Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T4 = VSUB(T1, T3);
+					T2m = VADD(T1, T3);
+					T1z = VSUB(T1w, T1y);
+					T2n = VADD(T1w, T1y);
+					T9 = VSUB(T6, T8);
+					T2p = VADD(T6, T8);
+					Te = VSUB(Tb, Td);
+					T2q = VADD(Tb, Td);
+					TA = BYTW(&(W[TWVL * 10]), Tz);
+				   }
+				   Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   T2o = VADD(T2m, T2n);
+				   T32 = VSUB(T2m, T2n);
+				   Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      }
+			 }
+			 {
+			      V Tv, To, Ty, Ti, Tj, Tm, Th;
+			      Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T2r = VADD(T2p, T2q);
+			      T3f = VSUB(T2p, T2q);
+			      Tf = VADD(T9, Te);
+			      T1A = VSUB(T9, Te);
+			      Tv = BYTW(&(W[TWVL * 26]), Tu);
+			      To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      Ty = BYTW(&(W[TWVL * 42]), Tx);
+			      Ti = BYTW(&(W[TWVL * 2]), Th);
+			      Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      {
+				   V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
+				   {
+					V T15, T17, T1o, T1m;
+					{
+					     V Tw, T2M, Tp, T2N, TB, Tk, Tn, T1n, T14, T16;
+					     T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					     T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     Tw = VSUB(Tt, Tv);
+					     T2M = VADD(Tt, Tv);
+					     Tp = BYTW(&(W[TWVL * 50]), To);
+					     T2N = VADD(TA, Ty);
+					     TB = VSUB(Ty, TA);
+					     Tk = BYTW(&(W[TWVL * 34]), Tj);
+					     Tn = BYTW(&(W[TWVL * 18]), Tm);
+					     T15 = BYTW(&(W[TWVL * 60]), T14);
+					     T17 = BYTW(&(W[TWVL * 28]), T16);
+					     T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     {
+						  V T2J, Tl, T2K, Tq, T1l;
+						  T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						  T34 = VSUB(T2M, T2N);
+						  T2O = VADD(T2M, T2N);
+						  T1D = VFMA(LDK(KP414213562), Tw, TB);
+						  TC = VFNMS(LDK(KP414213562), TB, Tw);
+						  T2J = VADD(Ti, Tk);
+						  Tl = VSUB(Ti, Tk);
+						  T2K = VADD(Tn, Tp);
+						  Tq = VSUB(Tn, Tp);
+						  T1o = BYTW(&(W[TWVL * 12]), T1n);
+						  T1m = BYTW(&(W[TWVL * 44]), T1l);
+						  {
+						       V T1e, T1g, T19, T1b;
+						       T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						       T33 = VSUB(T2J, T2K);
+						       T2L = VADD(T2J, T2K);
+						       T1C = VFMA(LDK(KP414213562), Tl, Tq);
+						       Tr = VFNMS(LDK(KP414213562), Tq, Tl);
+						       T1f = BYTW(&(W[TWVL * 52]), T1e);
+						       T1h = BYTW(&(W[TWVL * 20]), T1g);
+						       T1a = BYTW(&(W[TWVL * 4]), T19);
+						       T1c = BYTW(&(W[TWVL * 36]), T1b);
+						  }
+					     }
+					}
+					T18 = VSUB(T15, T17);
+					T2A = VADD(T15, T17);
+					T2B = VADD(T1o, T1m);
+					T1p = VSUB(T1m, T1o);
+				   }
+				   {
+					V TG, TI, TZ, TX;
+					{
+					     V T1i, T2E, T1d, T2D, TH, TY, TF;
+					     TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1i = VSUB(T1f, T1h);
+					     T2E = VADD(T1f, T1h);
+					     T1d = VSUB(T1a, T1c);
+					     T2D = VADD(T1a, T1c);
+					     TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					     T2C = VADD(T2A, T2B);
+					     T3a = VSUB(T2A, T2B);
+					     TG = BYTW(&(W[0]), TF);
+					     {
+						  V TW, T1j, T1q, TP, TR, TK;
+						  TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  T2F = VADD(T2D, T2E);
+						  T3b = VSUB(T2E, T2D);
+						  T1j = VADD(T1d, T1i);
+						  T1q = VSUB(T1i, T1d);
+						  TI = BYTW(&(W[TWVL * 32]), TH);
+						  TZ = BYTW(&(W[TWVL * 48]), TY);
+						  TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						  TX = BYTW(&(W[TWVL * 16]), TW);
+						  TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T1r = VFMA(LDK(KP707106781), T1q, T1p);
+						  T21 = VFNMS(LDK(KP707106781), T1q, T1p);
+						  T1k = VFMA(LDK(KP707106781), T1j, T18);
+						  T20 = VFNMS(LDK(KP707106781), T1j, T18);
+						  TQ = BYTW(&(W[TWVL * 56]), TP);
+						  TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+						  TS = BYTW(&(W[TWVL * 24]), TR);
+						  TL = BYTW(&(W[TWVL * 8]), TK);
+					     }
+					}
+					T2t = VADD(TG, TI);
+					TJ = VSUB(TG, TI);
+					T10 = VSUB(TX, TZ);
+					T2u = VADD(TX, TZ);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
+			 T2s = VSUB(T2o, T2r);
+			 T2U = VADD(T2o, T2r);
+			 TN = BYTW(&(W[TWVL * 40]), TM);
+			 TT = VSUB(TQ, TS);
+			 T2x = VADD(TQ, TS);
+			 T2P = VSUB(T2L, T2O);
+			 T2V = VADD(T2L, T2O);
+			 T2Y = VADD(T2C, T2F);
+			 T2G = VSUB(T2C, T2F);
+			 T37 = VSUB(T2t, T2u);
+			 T2v = VADD(T2t, T2u);
+			 T2w = VADD(TL, TN);
+			 TO = VSUB(TL, TN);
+			 T2W = VSUB(T2U, T2V);
+			 T30 = VADD(T2U, T2V);
+			 {
+			      V T1Y, T12, T1X, TV, T3n, T3t, T3m, T3q;
+			      {
+				   V T3o, T36, T3r, T3h, T3k, T3p, T3d, T3s, T2H, T2Q, T2Z, T31;
+				   {
+					V T35, T3g, T38, T2y, T11, TU, T3c, T3j;
+					T35 = VADD(T33, T34);
+					T3g = VSUB(T33, T34);
+					T38 = VSUB(T2w, T2x);
+					T2y = VADD(T2w, T2x);
+					T11 = VSUB(TO, TT);
+					TU = VADD(TO, TT);
+					T3c = VFNMS(LDK(KP414213562), T3b, T3a);
+					T3j = VFMA(LDK(KP414213562), T3a, T3b);
+					T3o = VFNMS(LDK(KP707106781), T35, T32);
+					T36 = VFMA(LDK(KP707106781), T35, T32);
+					T3r = VFNMS(LDK(KP707106781), T3g, T3f);
+					T3h = VFMA(LDK(KP707106781), T3g, T3f);
+					{
+					     V T3i, T39, T2z, T2X;
+					     T3i = VFMA(LDK(KP414213562), T37, T38);
+					     T39 = VFNMS(LDK(KP414213562), T38, T37);
+					     T2z = VSUB(T2v, T2y);
+					     T2X = VADD(T2v, T2y);
+					     T1Y = VFNMS(LDK(KP707106781), T11, T10);
+					     T12 = VFMA(LDK(KP707106781), T11, T10);
+					     T1X = VFNMS(LDK(KP707106781), TU, TJ);
+					     TV = VFMA(LDK(KP707106781), TU, TJ);
+					     T3k = VSUB(T3i, T3j);
+					     T3p = VADD(T3i, T3j);
+					     T3d = VADD(T39, T3c);
+					     T3s = VSUB(T39, T3c);
+					     T2H = VADD(T2z, T2G);
+					     T2Q = VSUB(T2z, T2G);
+					     T2Z = VSUB(T2X, T2Y);
+					     T31 = VADD(T2X, T2Y);
+					}
+				   }
+				   {
+					V T3v, T3u, T3l, T3e;
+					T3l = VFNMS(LDK(KP923879532), T3k, T3h);
+					T3n = VFMA(LDK(KP923879532), T3k, T3h);
+					T3t = VFMA(LDK(KP923879532), T3s, T3r);
+					T3v = VFNMS(LDK(KP923879532), T3s, T3r);
+					T3e = VFNMS(LDK(KP923879532), T3d, T36);
+					T3m = VFMA(LDK(KP923879532), T3d, T36);
+					{
+					     V T2R, T2T, T2I, T2S;
+					     T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
+					     T2T = VFMA(LDK(KP707106781), T2Q, T2P);
+					     T2I = VFNMS(LDK(KP707106781), T2H, T2s);
+					     T2S = VFMA(LDK(KP707106781), T2H, T2s);
+					     ST(&(x[WS(rs, 16)]), VSUB(T30, T31), ms, &(x[0]));
+					     ST(&(x[0]), VADD(T30, T31), ms, &(x[0]));
+					     ST(&(x[WS(rs, 8)]), VFMAI(T2Z, T2W), ms, &(x[0]));
+					     ST(&(x[WS(rs, 24)]), VFNMSI(T2Z, T2W), ms, &(x[0]));
+					     T3q = VFNMS(LDK(KP923879532), T3p, T3o);
+					     T3u = VFMA(LDK(KP923879532), T3p, T3o);
+					     ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
+					     ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
+					     ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
+					     ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
+					     ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
+					}
+					ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T1U, T13, T1s, TE, T1M, T1I, T1N, T1B, T1V, T1E;
+				   {
+					V Tg, TD, T1G, T1H;
+					Tg = VFMA(LDK(KP707106781), Tf, T4);
+					T1U = VFNMS(LDK(KP707106781), Tf, T4);
+					T26 = VSUB(Tr, TC);
+					TD = VADD(Tr, TC);
+					T1G = VFMA(LDK(KP198912367), TV, T12);
+					T13 = VFNMS(LDK(KP198912367), T12, TV);
+					T1s = VFNMS(LDK(KP198912367), T1r, T1k);
+					T1H = VFMA(LDK(KP198912367), T1k, T1r);
+					ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
+					TE = VFMA(LDK(KP923879532), TD, Tg);
+					T1M = VFNMS(LDK(KP923879532), TD, Tg);
+					T1I = VSUB(T1G, T1H);
+					T1N = VADD(T1G, T1H);
+					T1B = VFMA(LDK(KP707106781), T1A, T1z);
+					T25 = VFNMS(LDK(KP707106781), T1A, T1z);
+					T1V = VADD(T1C, T1D);
+					T1E = VSUB(T1C, T1D);
+				   }
+				   {
+					V T1W, T2e, T2f, T23;
+					{
+					     V T28, T1Z, T1S, T1O, T1t, T1Q, T1F, T1P, T22, T29;
+					     T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
+					     T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
+					     T1S = VFMA(LDK(KP980785280), T1N, T1M);
+					     T1O = VFNMS(LDK(KP980785280), T1N, T1M);
+					     T1t = VADD(T13, T1s);
+					     T1Q = VSUB(T13, T1s);
+					     T1F = VFMA(LDK(KP923879532), T1E, T1B);
+					     T1P = VFNMS(LDK(KP923879532), T1E, T1B);
+					     T1W = VFMA(LDK(KP923879532), T1V, T1U);
+					     T2e = VFNMS(LDK(KP923879532), T1V, T1U);
+					     T22 = VFMA(LDK(KP668178637), T21, T20);
+					     T29 = VFNMS(LDK(KP668178637), T20, T21);
+					     {
+						  V T1K, T1u, T1R, T1T, T1L, T1J;
+						  T1K = VFMA(LDK(KP980785280), T1t, TE);
+						  T1u = VFNMS(LDK(KP980785280), T1t, TE);
+						  T1R = VFMA(LDK(KP980785280), T1Q, T1P);
+						  T1T = VFNMS(LDK(KP980785280), T1Q, T1P);
+						  T1L = VFMA(LDK(KP980785280), T1I, T1F);
+						  T1J = VFNMS(LDK(KP980785280), T1I, T1F);
+						  T2f = VADD(T28, T29);
+						  T2a = VSUB(T28, T29);
+						  T23 = VADD(T1Z, T22);
+						  T2i = VSUB(T1Z, T22);
+						  ST(&(x[WS(rs, 23)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 9)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 25)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 7)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 1)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 31)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 17)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 15)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+					     }
+					}
+					T24 = VFNMS(LDK(KP831469612), T23, T1W);
+					T2c = VFMA(LDK(KP831469612), T23, T1W);
+					T2g = VFMA(LDK(KP831469612), T2f, T2e);
+					T2k = VFNMS(LDK(KP831469612), T2f, T2e);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T2h = VFMA(LDK(KP923879532), T26, T25);
+	       T27 = VFNMS(LDK(KP923879532), T26, T25);
+	       {
+		    V T2j, T2l, T2d, T2b;
+		    T2j = VFNMS(LDK(KP831469612), T2i, T2h);
+		    T2l = VFMA(LDK(KP831469612), T2i, T2h);
+		    T2d = VFMA(LDK(KP831469612), T2a, T27);
+		    T2b = VFNMS(LDK(KP831469612), T2a, T27);
+		    ST(&(x[WS(rs, 21)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 27)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 29)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 217 FP additions, 104 FP multiplications,
+ * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
+ * 59 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D;
+	       V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y;
+	       V T2o, T36;
+	       {
+		    V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T1B = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+		    T1C = BYTW(&(W[TWVL * 46]), T1B);
+		    T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 30]), T2);
+		    T1z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T1A = BYTW(&(W[TWVL * 14]), T1z);
+		    T4 = VSUB(T1, T3);
+		    T1D = VSUB(T1A, T1C);
+		    T2N = VADD(T1, T3);
+		    T2O = VADD(T1A, T1C);
+		    T2P = VSUB(T2N, T2O);
+		    T3h = VADD(T2N, T2O);
+	       }
+	       {
+		    V T6, Td, T8, Tb;
+		    {
+			 V T5, Tc, T7, Ta;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 6]), T5);
+			 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 22]), Tc);
+			 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 38]), T7);
+			 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 54]), Ta);
+		    }
+		    {
+			 V T9, Te, T2I, T2J;
+			 T9 = VSUB(T6, T8);
+			 Te = VSUB(Tb, Td);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 T1y = VMUL(LDK(KP707106781), VSUB(T9, Te));
+			 T2I = VADD(T6, T8);
+			 T2J = VADD(Tb, Td);
+			 T2K = VSUB(T2I, T2J);
+			 T3i = VADD(T2I, T2J);
+		    }
+	       }
+	       {
+		    V Tt, TA, Tv, Ty;
+		    {
+			 V Ts, Tz, Tu, Tx;
+			 Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 10]), Ts);
+			 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TA = BYTW(&(W[TWVL * 26]), Tz);
+			 Tu = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 Tv = BYTW(&(W[TWVL * 42]), Tu);
+			 Tx = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 Ty = BYTW(&(W[TWVL * 58]), Tx);
+		    }
+		    {
+			 V Tw, TB, T2E, T2F;
+			 Tw = VSUB(Tt, Tv);
+			 TB = VSUB(Ty, TA);
+			 TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
+			 T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T2E = VADD(Ty, TA);
+			 T2F = VADD(Tt, Tv);
+			 T2G = VSUB(T2E, T2F);
+			 T3e = VADD(T2E, T2F);
+		    }
+	       }
+	       {
+		    V Ti, Tp, Tk, Tn;
+		    {
+			 V Th, To, Tj, Tm;
+			 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 2]), Th);
+			 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 50]), To);
+			 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tk = BYTW(&(W[TWVL * 34]), Tj);
+			 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tn = BYTW(&(W[TWVL * 18]), Tm);
+		    }
+		    {
+			 V Tl, Tq, T2B, T2C;
+			 Tl = VSUB(Ti, Tk);
+			 Tq = VSUB(Tn, Tp);
+			 Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T2B = VADD(Ti, Tk);
+			 T2C = VADD(Tn, Tp);
+			 T2D = VSUB(T2B, T2C);
+			 T3d = VADD(T2B, T2C);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18;
+		    {
+			 V T1f, T1h, T1n, T1l;
+			 T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T1g = BYTW(&(W[TWVL * 12]), T1f);
+			 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1i = BYTW(&(W[TWVL * 44]), T1h);
+			 T1n = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T1o = BYTW(&(W[TWVL * 28]), T1n);
+			 T1l = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T1m = BYTW(&(W[TWVL * 60]), T1l);
+			 {
+			      V T19, T1b, T14, T16;
+			      T19 = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			      T1a = BYTW(&(W[TWVL * 52]), T19);
+			      T1b = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      T1c = BYTW(&(W[TWVL * 20]), T1b);
+			      T1d = VSUB(T1a, T1c);
+			      T14 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T15 = BYTW(&(W[TWVL * 4]), T14);
+			      T16 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      T17 = BYTW(&(W[TWVL * 36]), T16);
+			      T18 = VSUB(T15, T17);
+			 }
+		    }
+		    {
+			 V T1e, T1j, T2w, T2x;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T20 = VADD(T1j, T1e);
+			 T2w = VADD(T15, T17);
+			 T2x = VADD(T1a, T1c);
+			 T2y = VSUB(T2w, T2x);
+			 T3a = VADD(T2w, T2x);
+		    }
+		    {
+			 V T1p, T1q, T2t, T2u;
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
+			 T1r = VSUB(T1p, T1q);
+			 T21 = VADD(T1p, T1q);
+			 T2t = VADD(T1m, T1o);
+			 T2u = VADD(T1g, T1i);
+			 T2v = VSUB(T2t, T2u);
+			 T39 = VADD(T2t, T2u);
+		    }
+	       }
+	       {
+		    V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ;
+		    {
+			 V TQ, TS, TY, TW;
+			 TQ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTW(&(W[TWVL * 16]), TQ);
+			 TS = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 TT = BYTW(&(W[TWVL * 48]), TS);
+			 TY = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TZ = BYTW(&(W[TWVL * 32]), TY);
+			 TW = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TX = BYTW(&(W[0]), TW);
+			 {
+			      V TK, TM, TF, TH;
+			      TK = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			      TL = BYTW(&(W[TWVL * 56]), TK);
+			      TM = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      TN = BYTW(&(W[TWVL * 24]), TM);
+			      TO = VSUB(TL, TN);
+			      TF = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      TG = BYTW(&(W[TWVL * 8]), TF);
+			      TH = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      TI = BYTW(&(W[TWVL * 40]), TH);
+			      TJ = VSUB(TG, TI);
+			 }
+		    }
+		    {
+			 V TP, TU, T2p, T2q;
+			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
+			 TU = VSUB(TR, TT);
+			 TV = VSUB(TP, TU);
+			 T1X = VADD(TU, TP);
+			 T2p = VADD(TG, TI);
+			 T2q = VADD(TL, TN);
+			 T2r = VSUB(T2p, T2q);
+			 T37 = VADD(T2p, T2q);
+		    }
+		    {
+			 V T10, T11, T2m, T2n;
+			 T10 = VSUB(TX, TZ);
+			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
+			 T12 = VSUB(T10, T11);
+			 T1Y = VADD(T10, T11);
+			 T2m = VADD(TX, TZ);
+			 T2n = VADD(TR, TT);
+			 T2o = VSUB(T2m, T2n);
+			 T36 = VADD(T2m, T2n);
+		    }
+	       }
+	       {
+		    V T3q, T3u, T3t, T3v;
+		    {
+			 V T3o, T3p, T3r, T3s;
+			 T3o = VADD(T3h, T3i);
+			 T3p = VADD(T3d, T3e);
+			 T3q = VSUB(T3o, T3p);
+			 T3u = VADD(T3o, T3p);
+			 T3r = VADD(T36, T37);
+			 T3s = VADD(T39, T3a);
+			 T3t = VBYI(VSUB(T3r, T3s));
+			 T3v = VADD(T3r, T3s);
+		    }
+		    ST(&(x[WS(rs, 24)]), VSUB(T3q, T3t), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T3u, T3v), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VADD(T3q, T3t), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T3u, T3v), ms, &(x[0]));
+	       }
+	       {
+		    V T3f, T3j, T3c, T3k, T38, T3b;
+		    T3f = VSUB(T3d, T3e);
+		    T3j = VSUB(T3h, T3i);
+		    T38 = VSUB(T36, T37);
+		    T3b = VSUB(T39, T3a);
+		    T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
+		    T3k = VMUL(LDK(KP707106781), VADD(T38, T3b));
+		    {
+			 V T3g, T3l, T3m, T3n;
+			 T3g = VBYI(VSUB(T3c, T3f));
+			 T3l = VSUB(T3j, T3k);
+			 ST(&(x[WS(rs, 12)]), VADD(T3g, T3l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 20)]), VSUB(T3l, T3g), ms, &(x[0]));
+			 T3m = VBYI(VADD(T3f, T3c));
+			 T3n = VADD(T3j, T3k);
+			 ST(&(x[WS(rs, 4)]), VADD(T3m, T3n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VSUB(T3n, T3m), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q;
+		    T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
+		    T2L = VSUB(T2H, T2K);
+		    T31 = VADD(T2K, T2H);
+		    T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G));
+		    T2R = VSUB(T2P, T2Q);
+		    T2Y = VADD(T2P, T2Q);
+		    {
+			 V T2s, T2z, T2S, T2T;
+			 T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o));
+			 T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y));
+			 T2A = VSUB(T2s, T2z);
+			 T2Z = VADD(T2s, T2z);
+			 T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r));
+			 T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y));
+			 T2U = VSUB(T2S, T2T);
+			 T32 = VADD(T2S, T2T);
+		    }
+		    {
+			 V T2M, T2V, T34, T35;
+			 T2M = VBYI(VSUB(T2A, T2L));
+			 T2V = VSUB(T2R, T2U);
+			 ST(&(x[WS(rs, 10)]), VADD(T2M, T2V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T2V, T2M), ms, &(x[0]));
+			 T34 = VSUB(T2Y, T2Z);
+			 T35 = VBYI(VSUB(T32, T31));
+			 ST(&(x[WS(rs, 18)]), VSUB(T34, T35), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T34, T35), ms, &(x[0]));
+		    }
+		    {
+			 V T2W, T2X, T30, T33;
+			 T2W = VBYI(VADD(T2L, T2A));
+			 T2X = VADD(T2R, T2U);
+			 ST(&(x[WS(rs, 6)]), VADD(T2W, T2X), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VSUB(T2X, T2W), ms, &(x[0]));
+			 T30 = VADD(T2Y, T2Z);
+			 T33 = VBYI(VADD(T31, T32));
+			 ST(&(x[WS(rs, 30)]), VSUB(T30, T33), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T30, T33), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
+		    {
+			 V Tg, TD, T1G, T1H;
+			 Tg = VSUB(T4, Tf);
+			 TD = VSUB(Tr, TC);
+			 TE = VSUB(Tg, TD);
+			 T1P = VADD(Tg, TD);
+			 T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
+			 T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
+			 T1I = VSUB(T1G, T1H);
+			 T1Q = VADD(T1G, T1H);
+		    }
+		    {
+			 V T13, T1s, T1x, T1E;
+			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
+			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T1M = VADD(T13, T1s);
+			 T1x = VSUB(T1v, T1w);
+			 T1E = VSUB(T1y, T1D);
+			 T1F = VSUB(T1x, T1E);
+			 T1N = VADD(T1E, T1x);
+		    }
+		    {
+			 V T1u, T1J, T1S, T1T;
+			 T1u = VADD(TE, T1t);
+			 T1J = VBYI(VADD(T1F, T1I));
+			 ST(&(x[WS(rs, 27)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 T1S = VBYI(VADD(T1N, T1M));
+			 T1T = VADD(T1P, T1Q);
+			 ST(&(x[WS(rs, 3)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1K, T1L, T1O, T1R;
+			 T1K = VSUB(TE, T1t);
+			 T1L = VBYI(VSUB(T1I, T1F));
+			 ST(&(x[WS(rs, 21)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 T1O = VBYI(VSUB(T1M, T1N));
+			 T1R = VSUB(T1P, T1Q);
+			 ST(&(x[WS(rs, 13)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
+		    {
+			 V T1U, T1V, T28, T29;
+			 T1U = VADD(T4, Tf);
+			 T1V = VADD(T1v, T1w);
+			 T1W = VSUB(T1U, T1V);
+			 T2h = VADD(T1U, T1V);
+			 T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y));
+			 T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21));
+			 T2a = VSUB(T28, T29);
+			 T2i = VADD(T28, T29);
+		    }
+		    {
+			 V T1Z, T22, T25, T26;
+			 T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y));
+			 T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20));
+			 T23 = VSUB(T1Z, T22);
+			 T2e = VADD(T1Z, T22);
+			 T25 = VADD(Tr, TC);
+			 T26 = VADD(T1D, T1y);
+			 T27 = VSUB(T25, T26);
+			 T2f = VADD(T26, T25);
+		    }
+		    {
+			 V T24, T2b, T2k, T2l;
+			 T24 = VADD(T1W, T23);
+			 T2b = VBYI(VADD(T27, T2a));
+			 ST(&(x[WS(rs, 25)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 T2k = VBYI(VADD(T2f, T2e));
+			 T2l = VADD(T2h, T2i);
+			 ST(&(x[WS(rs, 1)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T2c, T2d, T2g, T2j;
+			 T2c = VSUB(T1W, T23);
+			 T2d = VBYI(VSUB(T2a, T27));
+			 ST(&(x[WS(rs, 23)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 T2g = VBYI(VSUB(T2e, T2f));
+			 T2j = VSUB(T2h, T2i);
+			 ST(&(x[WS(rs, 15)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t2bv_4 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2bv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t2bv_4 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTW(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTW(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTW(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 3)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2bv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:14 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t2bv_5 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTW(&(W[0]), T2);
+		    Ta = BYTW(&(W[TWVL * 4]), T9);
+		    T5 = BYTW(&(W[TWVL * 6]), T4);
+		    T8 = BYTW(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFMAI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFNMSI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t2bv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t2bv_5 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tf, T5, Ta, Tc, Td, Tg;
+	       Tf = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTW(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTW(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTW(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTW(&(W[TWVL * 2]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tc = VADD(T2, T4);
+		    Td = VADD(T7, T9);
+		    Tg = VADD(Tc, Td);
+	       }
+	       ST(&(x[0]), VADD(Tf, Tg), ms, &(x[0]));
+	       {
+		    V Tb, Tj, Ti, Tk, Te, Th;
+		    Tb = VBYI(VFMA(LDK(KP951056516), T5, VMUL(LDK(KP587785252), Ta)));
+		    Tj = VBYI(VFNMS(LDK(KP951056516), Ta, VMUL(LDK(KP587785252), T5)));
+		    Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
+		    Th = VFNMS(LDK(KP250000000), Tg, Tf);
+		    Ti = VADD(Te, Th);
+		    Tk = VSUB(Th, Te);
+		    ST(&(x[WS(rs, 1)]), VADD(Tb, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VSUB(Ti, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t2bv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1877 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:13 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 519 FP additions, 384 FP multiplications,
+ * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
+ * 187 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
+	       V T6E;
+	       {
+		    V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
+		    V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3b;
+		    V T4m, T4C, T7e, T5l, T7d, T5o, T3a, TV, T4B, T4j, T3X, T3Y, T6o, T7b, T5f;
+		    V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
+		    V T1S, T2q, T2u, T2w;
+		    {
+			 V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
+			 {
+			      V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
+			      {
+				   V T1, T2, T7, T5, T32, T34, T2X, T2Z;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+				   T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T32 = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+				   T34 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+				   T2X = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   T2Z = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+				   {
+					V T1m, T54, T6j, T36, T56, T31, T55, T1n, T1q, T1s, T4, T9;
+					{
+					     V T3, T8, T6, T33, T35, T2Y, T30, T1l;
+					     T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T3 = BYTW(&(W[TWVL * 62]), T2);
+					     T8 = BYTW(&(W[TWVL * 94]), T7);
+					     T6 = BYTW(&(W[TWVL * 30]), T5);
+					     T33 = BYTW(&(W[TWVL * 110]), T32);
+					     T35 = BYTW(&(W[TWVL * 46]), T34);
+					     T2Y = BYTW(&(W[TWVL * 14]), T2X);
+					     T30 = BYTW(&(W[TWVL * 78]), T2Z);
+					     T1m = BYTW(&(W[0]), T1l);
+					     T54 = VSUB(T1, T3);
+					     T4 = VADD(T1, T3);
+					     T6j = VSUB(T6, T8);
+					     T9 = VADD(T6, T8);
+					     T36 = VADD(T33, T35);
+					     T56 = VSUB(T33, T35);
+					     T31 = VADD(T2Y, T30);
+					     T55 = VSUB(T2Y, T30);
+					     T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+					}
+					T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+					Ta = VSUB(T4, T9);
+					T3U = VADD(T4, T9);
+					{
+					     V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
+					     T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     T3V = VADD(T31, T36);
+					     T37 = VSUB(T31, T36);
+					     T57 = VADD(T55, T56);
+					     T6k = VSUB(T55, T56);
+					     T1o = BYTW(&(W[TWVL * 64]), T1n);
+					     T1r = BYTW(&(W[TWVL * 32]), T1q);
+					     T1t = BYTW(&(W[TWVL * 96]), T1s);
+					     T1V = BYTW(&(W[TWVL * 16]), T1U);
+					     T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+					     T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+					     T7a = VFNMS(LDK(KP707106781), T57, T54);
+					     T58 = VFMA(LDK(KP707106781), T57, T54);
+					     T7B = VFNMS(LDK(KP707106781), T6k, T6j);
+					     T6l = VFMA(LDK(KP707106781), T6k, T6j);
+					     T1p = VADD(T1m, T1o);
+					     T5B = VSUB(T1m, T1o);
+					     T5O = VSUB(T1r, T1t);
+					     T1u = VADD(T1r, T1t);
+					     T1X = BYTW(&(W[TWVL * 80]), T1W);
+					     T20 = BYTW(&(W[TWVL * 112]), T1Z);
+					     T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			      {
+				   V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
+				   {
+					V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
+					T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+					T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					T1v = VSUB(T1p, T1u);
+					T41 = VADD(T1p, T1u);
+					T1Y = VADD(T1V, T1X);
+					T5C = VSUB(T1V, T1X);
+					T22 = BYTW(&(W[TWVL * 48]), T21);
+					T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T29 = BYTW(&(W[TWVL * 124]), T28);
+					T2b = BYTW(&(W[TWVL * 60]), T2a);
+					T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+					T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+					T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T23, T5D, T2e, T2g, T2I, T2K, T2M;
+					     T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T23 = VADD(T20, T22);
+					     T5D = VSUB(T20, T22);
+					     T2e = BYTW(&(W[TWVL * 28]), T2d);
+					     T2c = VADD(T29, T2b);
+					     T5W = VSUB(T29, T2b);
+					     T2g = BYTW(&(W[TWVL * 92]), T2f);
+					     T2I = BYTW(&(W[TWVL * 108]), T2H);
+					     T2K = BYTW(&(W[TWVL * 44]), T2J);
+					     T2N = BYTW(&(W[TWVL * 12]), T2M);
+					     {
+						  V T5E, T5P, T42, T2O;
+						  T5E = VADD(T5C, T5D);
+						  T5P = VSUB(T5C, T5D);
+						  T24 = VSUB(T1Y, T23);
+						  T42 = VADD(T1Y, T23);
+						  T69 = VSUB(T2g, T2e);
+						  T2h = VADD(T2e, T2g);
+						  T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+						  T2L = VADD(T2I, T2K);
+						  T5Y = VSUB(T2I, T2K);
+						  T5Q = VFMA(LDK(KP707106781), T5P, T5O);
+						  T7o = VFNMS(LDK(KP707106781), T5P, T5O);
+						  T5F = VFMA(LDK(KP707106781), T5E, T5B);
+						  T7l = VFNMS(LDK(KP707106781), T5E, T5B);
+						  T43 = VADD(T41, T42);
+						  T4F = VSUB(T41, T42);
+						  T2P = BYTW(&(W[TWVL * 76]), T2O);
+					     }
+					}
+				   }
+				   T2i = VSUB(T2c, T2h);
+				   T48 = VADD(T2c, T2h);
+				   {
+					V TW, TY, T11, T2Q, T5X, T13;
+					TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+					TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T2Q = VADD(T2N, T2P);
+					T5X = VSUB(T2N, T2P);
+					T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+					{
+					     V T12, T5Z, T6a, T49, T14, T18, T1a;
+					     {
+						  V T17, T19, TX, TZ;
+						  T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+						  T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  TX = BYTW(&(W[TWVL * 122]), TW);
+						  TZ = BYTW(&(W[TWVL * 58]), TY);
+						  T12 = BYTW(&(W[TWVL * 26]), T11);
+						  T5Z = VADD(T5X, T5Y);
+						  T6a = VSUB(T5Y, T5X);
+						  T2R = VSUB(T2L, T2Q);
+						  T49 = VADD(T2Q, T2L);
+						  T14 = BYTW(&(W[TWVL * 90]), T13);
+						  T18 = BYTW(&(W[TWVL * 106]), T17);
+						  T5q = VSUB(TX, TZ);
+						  T10 = VADD(TX, TZ);
+						  T1a = BYTW(&(W[TWVL * 42]), T19);
+					     }
+					     T6b = VFMA(LDK(KP707106781), T6a, T69);
+					     T7v = VFNMS(LDK(KP707106781), T6a, T69);
+					     T60 = VFMA(LDK(KP707106781), T5Z, T5W);
+					     T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
+					     T4a = VADD(T48, T49);
+					     T4I = VSUB(T48, T49);
+					     T5v = VSUB(T14, T12);
+					     T15 = VADD(T12, T14);
+					     T1b = VADD(T18, T1a);
+					     T5s = VSUB(T18, T1a);
+					}
+					T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 {
+			      V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
+			      {
+				   V T5h, TQ, T5m, T5i, TO, TS, TJ, T4h, TD, TI;
+				   {
+					V T4k, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
+					Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					T4k = VADD(T10, T15);
+					T16 = VSUB(T10, T15);
+					TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+					T1d = BYTW(&(W[TWVL * 10]), T1c);
+					T1f = BYTW(&(W[TWVL * 74]), T1e);
+					TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+					TA = BYTW(&(W[TWVL * 2]), Tz);
+					TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+					TC = BYTW(&(W[TWVL * 66]), TB);
+					{
+					     V T1g, T5r, TF, TH, TL, TN, TP;
+					     TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+					     T1g = VADD(T1d, T1f);
+					     T5r = VSUB(T1d, T1f);
+					     TF = BYTW(&(W[TWVL * 34]), TE);
+					     TH = BYTW(&(W[TWVL * 98]), TG);
+					     TL = BYTW(&(W[TWVL * 18]), TK);
+					     TN = BYTW(&(W[TWVL * 82]), TM);
+					     T5h = VSUB(TA, TC);
+					     TD = VADD(TA, TC);
+					     TQ = BYTW(&(W[TWVL * 114]), TP);
+					     {
+						  V T5w, T5t, T4l, T1h, TR;
+						  T5w = VSUB(T5s, T5r);
+						  T5t = VADD(T5r, T5s);
+						  T4l = VADD(T1g, T1b);
+						  T1h = VSUB(T1b, T1g);
+						  T5m = VSUB(TF, TH);
+						  TI = VADD(TF, TH);
+						  T5i = VSUB(TL, TN);
+						  TO = VADD(TL, TN);
+						  TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+						  T5u = VFMA(LDK(KP707106781), T5t, T5q);
+						  T7h = VFNMS(LDK(KP707106781), T5t, T5q);
+						  T5x = VFMA(LDK(KP707106781), T5w, T5v);
+						  T7g = VFNMS(LDK(KP707106781), T5w, T5v);
+						  T1i = VFNMS(LDK(KP414213562), T1h, T16);
+						  T3b = VFMA(LDK(KP414213562), T16, T1h);
+						  T4m = VADD(T4k, T4l);
+						  T4C = VSUB(T4k, T4l);
+						  TS = BYTW(&(W[TWVL * 50]), TR);
+					     }
+					}
+				   }
+				   TJ = VSUB(TD, TI);
+				   T4h = VADD(TD, TI);
+				   {
+					V Tb, Td, Tr, T5j, TT, Tt, Tg;
+					Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+					Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					T5j = VSUB(TQ, TS);
+					TT = VADD(TQ, TS);
+					Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+					Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					{
+					     V Ti, Tc, Te, Ts;
+					     Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+					     Tc = BYTW(&(W[TWVL * 6]), Tb);
+					     Te = BYTW(&(W[TWVL * 70]), Td);
+					     Ts = BYTW(&(W[TWVL * 22]), Tr);
+					     {
+						  V T5k, T5n, TU, T4i, Tu;
+						  T5k = VADD(T5i, T5j);
+						  T5n = VSUB(T5i, T5j);
+						  TU = VSUB(TO, TT);
+						  T4i = VADD(TO, TT);
+						  Tu = BYTW(&(W[TWVL * 86]), Tt);
+						  Th = BYTW(&(W[TWVL * 38]), Tg);
+						  T59 = VSUB(Tc, Te);
+						  Tf = VADD(Tc, Te);
+						  T7e = VFNMS(LDK(KP707106781), T5k, T5h);
+						  T5l = VFMA(LDK(KP707106781), T5k, T5h);
+						  T7d = VFNMS(LDK(KP707106781), T5n, T5m);
+						  T5o = VFMA(LDK(KP707106781), T5n, T5m);
+						  T3a = VFMA(LDK(KP414213562), TJ, TU);
+						  TV = VFNMS(LDK(KP414213562), TU, TJ);
+						  T4B = VSUB(T4h, T4i);
+						  T4j = VADD(T4h, T4i);
+						  Tv = VADD(Ts, Tu);
+						  T5d = VSUB(Tu, Ts);
+						  Tj = BYTW(&(W[TWVL * 102]), Ti);
+					     }
+					}
+					Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+					To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
+				   {
+					V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
+					T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+					T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+					T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V Tk, T5a, Tn, Tp;
+					     Tk = VADD(Th, Tj);
+					     T5a = VSUB(Th, Tj);
+					     Tn = BYTW(&(W[TWVL * 118]), Tm);
+					     Tp = BYTW(&(W[TWVL * 54]), To);
+					     {
+						  V T1x, T1z, T1N, T1P;
+						  T1x = BYTW(&(W[TWVL * 8]), T1w);
+						  T1z = BYTW(&(W[TWVL * 72]), T1y);
+						  T1N = BYTW(&(W[TWVL * 24]), T1M);
+						  T1P = BYTW(&(W[TWVL * 88]), T1O);
+						  T5b = VFNMS(LDK(KP414213562), T5a, T59);
+						  T6m = VFMA(LDK(KP414213562), T59, T5a);
+						  T3X = VADD(Tf, Tk);
+						  Tl = VSUB(Tf, Tk);
+						  Tq = VADD(Tn, Tp);
+						  T5c = VSUB(Tn, Tp);
+						  T1A = VADD(T1x, T1z);
+						  T5G = VSUB(T1x, T1z);
+						  T1Q = VADD(T1N, T1P);
+						  T5K = VSUB(T1N, T1P);
+						  T1C = BYTW(&(W[TWVL * 40]), T1B);
+					     }
+					}
+					T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+					T5e = VFNMS(LDK(KP414213562), T5d, T5c);
+					T6n = VFMA(LDK(KP414213562), T5c, T5d);
+					T3Y = VADD(Tq, Tv);
+					Tw = VSUB(Tq, Tv);
+					T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+					T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
+					T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1E = BYTW(&(W[TWVL * 104]), T1D);
+					T6o = VSUB(T6m, T6n);
+					T7b = VADD(T6m, T6n);
+					T5f = VADD(T5b, T5e);
+					T7C = VSUB(T5b, T5e);
+					Tx = VADD(Tl, Tw);
+					T38 = VSUB(Tl, Tw);
+					T1I = BYTW(&(W[TWVL * 120]), T1H);
+					T1K = BYTW(&(W[TWVL * 56]), T1J);
+					T1F = VADD(T1C, T1E);
+					T5H = VSUB(T1C, T1E);
+					T2k = BYTW(&(W[TWVL * 4]), T2j);
+					T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+					T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
+					     T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     T5J = VSUB(T1I, T1K);
+					     T1L = VADD(T1I, T1K);
+					     T5I = VFNMS(LDK(KP414213562), T5H, T5G);
+					     T5R = VFMA(LDK(KP414213562), T5G, T5H);
+					     T44 = VADD(T1A, T1F);
+					     T1G = VSUB(T1A, T1F);
+					     T2m = BYTW(&(W[TWVL * 68]), T2l);
+					     T2A = BYTW(&(W[TWVL * 20]), T2z);
+					     T2C = BYTW(&(W[TWVL * 84]), T2B);
+					     T5S = VFNMS(LDK(KP414213562), T5J, T5K);
+					     T5L = VFMA(LDK(KP414213562), T5K, T5J);
+					     T1R = VSUB(T1L, T1Q);
+					     T45 = VADD(T1L, T1Q);
+					     T2p = BYTW(&(W[TWVL * 36]), T2o);
+					     T61 = VSUB(T2k, T2m);
+					     T2n = VADD(T2k, T2m);
+					     T65 = VSUB(T2C, T2A);
+					     T2D = VADD(T2A, T2C);
+					     T7p = VSUB(T5I, T5L);
+					     T5M = VADD(T5I, T5L);
+					     T7m = VSUB(T5R, T5S);
+					     T5T = VADD(T5R, T5S);
+					     T4G = VSUB(T44, T45);
+					     T46 = VADD(T44, T45);
+					     T25 = VSUB(T1G, T1R);
+					     T1S = VADD(T1G, T1R);
+					     T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+					}
+					T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+					T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
+			 V T26, T2G, T3y, T3z, T2T;
+			 {
+			      V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
+			      T4A = VSUB(T3U, T3V);
+			      T3W = VADD(T3U, T3V);
+			      T3Z = VADD(T3X, T3Y);
+			      T4N = VSUB(T3X, T3Y);
+			      T47 = VSUB(T43, T46);
+			      T4v = VADD(T43, T46);
+			      T2r = BYTW(&(W[TWVL * 100]), T2q);
+			      T2v = BYTW(&(W[TWVL * 116]), T2u);
+			      T2x = BYTW(&(W[TWVL * 52]), T2w);
+			      T4s = VADD(T3W, T3Z);
+			      T40 = VSUB(T3W, T3Z);
+			      {
+				   V T4O, T4n, T4Q, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
+				   {
+					V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
+					{
+					     V T4D, T62, T2s, T64, T2y, T4t;
+					     T4O = VSUB(T4B, T4C);
+					     T4D = VADD(T4B, T4C);
+					     T62 = VSUB(T2r, T2p);
+					     T2s = VADD(T2p, T2r);
+					     T64 = VSUB(T2v, T2x);
+					     T2y = VADD(T2v, T2x);
+					     T4t = VADD(T4j, T4m);
+					     T4n = VSUB(T4j, T4m);
+					     T4Q = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4W = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T6c = VFNMS(LDK(KP414213562), T61, T62);
+					     T63 = VFMA(LDK(KP414213562), T62, T61);
+					     T2t = VSUB(T2n, T2s);
+					     T4b = VADD(T2n, T2s);
+					     T6d = VFMA(LDK(KP414213562), T64, T65);
+					     T66 = VFNMS(LDK(KP414213562), T65, T64);
+					     T2E = VSUB(T2y, T2D);
+					     T4c = VADD(T2y, T2D);
+					     T4u = VSUB(T4s, T4t);
+					     T4y = VADD(T4s, T4t);
+					}
+					T67 = VADD(T63, T66);
+					T7w = VSUB(T66, T63);
+					T6e = VADD(T6c, T6d);
+					T7t = VSUB(T6d, T6c);
+					T4d = VADD(T4b, T4c);
+					T4J = VSUB(T4c, T4b);
+					T2F = VADD(T2t, T2E);
+					T2S = VSUB(T2E, T2t);
+				   }
+				   {
+					V Ty, T1j, T4R, T4K;
+					Ty = VFMA(LDK(KP707106781), Tx, Ta);
+					T3s = VFNMS(LDK(KP707106781), Tx, Ta);
+					T3E = VSUB(TV, T1i);
+					T1j = VADD(TV, T1i);
+					T39 = VFMA(LDK(KP707106781), T38, T37);
+					T3D = VFNMS(LDK(KP707106781), T38, T37);
+					T4R = VFMA(LDK(KP414213562), T4I, T4J);
+					T4K = VFNMS(LDK(KP414213562), T4J, T4I);
+					{
+					     V T4w, T4e, T4P, T4Z;
+					     T4w = VADD(T4a, T4d);
+					     T4e = VSUB(T4a, T4d);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T1k = VFMA(LDK(KP923879532), T1j, Ty);
+					     T3k = VFNMS(LDK(KP923879532), T1j, Ty);
+					     {
+						  V T4L, T50, T4S, T4X;
+						  T4L = VADD(T4H, T4K);
+						  T50 = VSUB(T4H, T4K);
+						  T4S = VSUB(T4Q, T4R);
+						  T4X = VADD(T4Q, T4R);
+						  {
+						       V T4f, T4o, T4x, T4z;
+						       T4f = VADD(T47, T4e);
+						       T4o = VSUB(T47, T4e);
+						       T4x = VSUB(T4v, T4w);
+						       T4z = VADD(T4v, T4w);
+						       {
+							    V T53, T51, T4M, T4U;
+							    T53 = VFNMS(LDK(KP923879532), T50, T4Z);
+							    T51 = VFMA(LDK(KP923879532), T50, T4Z);
+							    T4M = VFNMS(LDK(KP923879532), T4L, T4E);
+							    T4U = VFMA(LDK(KP923879532), T4L, T4E);
+							    {
+								 V T52, T4Y, T4T, T4V;
+								 T52 = VFMA(LDK(KP923879532), T4X, T4W);
+								 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
+								 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
+								 T4V = VFMA(LDK(KP923879532), T4S, T4P);
+								 {
+								      V T4p, T4r, T4g, T4q;
+								      T4p = VFNMS(LDK(KP707106781), T4o, T4n);
+								      T4r = VFMA(LDK(KP707106781), T4o, T4n);
+								      T4g = VFNMS(LDK(KP707106781), T4f, T40);
+								      T4q = VFMA(LDK(KP707106781), T4f, T40);
+								      ST(&(x[0]), VADD(T4y, T4z), ms, &(x[0]));
+								      ST(&(x[WS(rs, 32)]), VSUB(T4y, T4z), ms, &(x[0]));
+								      ST(&(x[WS(rs, 16)]), VFMAI(T4x, T4u), ms, &(x[0]));
+								      ST(&(x[WS(rs, 48)]), VFNMSI(T4x, T4u), ms, &(x[0]));
+								      ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
+								      ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
+								      T3t = VADD(T3a, T3b);
+								      T3c = VSUB(T3a, T3b);
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T1T = VFMA(LDK(KP707106781), T1S, T1v);
+					T3v = VFNMS(LDK(KP707106781), T1S, T1v);
+					T3w = VFNMS(LDK(KP707106781), T25, T24);
+					T26 = VFMA(LDK(KP707106781), T25, T24);
+					T2G = VFMA(LDK(KP707106781), T2F, T2i);
+					T3y = VFNMS(LDK(KP707106781), T2F, T2i);
+					T3z = VFNMS(LDK(KP707106781), T2S, T2R);
+					T2T = VFMA(LDK(KP707106781), T2S, T2R);
+				   }
+			      }
+			 }
+			 {
+			      V T3u, T3M, T3F, T3P, T3x, T3G, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
+			      {
+				   V T3d, T3n, T27, T3e, T2U, T3f;
+				   T3d = VFMA(LDK(KP923879532), T3c, T39);
+				   T3n = VFNMS(LDK(KP923879532), T3c, T39);
+				   T27 = VFNMS(LDK(KP198912367), T26, T1T);
+				   T3e = VFMA(LDK(KP198912367), T1T, T26);
+				   T2U = VFNMS(LDK(KP198912367), T2T, T2G);
+				   T3f = VFMA(LDK(KP198912367), T2G, T2T);
+				   T3u = VFMA(LDK(KP923879532), T3t, T3s);
+				   T3M = VFNMS(LDK(KP923879532), T3t, T3s);
+				   {
+					V T3g, T3l, T2V, T3o;
+					T3g = VSUB(T3e, T3f);
+					T3l = VADD(T3e, T3f);
+					T2V = VADD(T27, T2U);
+					T3o = VSUB(T27, T2U);
+					T3F = VFNMS(LDK(KP923879532), T3E, T3D);
+					T3P = VFMA(LDK(KP923879532), T3E, T3D);
+					T3x = VFMA(LDK(KP668178637), T3w, T3v);
+					T3G = VFNMS(LDK(KP668178637), T3v, T3w);
+					T3q = VFMA(LDK(KP980785280), T3l, T3k);
+					T3m = VFNMS(LDK(KP980785280), T3l, T3k);
+					T3h = VFNMS(LDK(KP980785280), T3g, T3d);
+					T3j = VFMA(LDK(KP980785280), T3g, T3d);
+					T3r = VFNMS(LDK(KP980785280), T3o, T3n);
+					T3p = VFMA(LDK(KP980785280), T3o, T3n);
+					T2W = VFNMS(LDK(KP980785280), T2V, T1k);
+					T3i = VFMA(LDK(KP980785280), T2V, T1k);
+				   }
+			      }
+			      {
+				   V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
+				   V T7x;
+				   {
+					V T7c, T7W, T7D, T87, T7f, T7E, T3A, T3H, T7F, T7i;
+					T7c = VFNMS(LDK(KP923879532), T7b, T7a);
+					T7W = VFMA(LDK(KP923879532), T7b, T7a);
+					T7D = VFMA(LDK(KP923879532), T7C, T7B);
+					T87 = VFNMS(LDK(KP923879532), T7C, T7B);
+					T7f = VFNMS(LDK(KP668178637), T7e, T7d);
+					T7E = VFMA(LDK(KP668178637), T7d, T7e);
+					ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
+					T3A = VFMA(LDK(KP668178637), T3z, T3y);
+					T3H = VFNMS(LDK(KP668178637), T3y, T3z);
+					T7F = VFMA(LDK(KP668178637), T7g, T7h);
+					T7i = VFNMS(LDK(KP668178637), T7h, T7g);
+					T7n = VFNMS(LDK(KP923879532), T7m, T7l);
+					T7Z = VFMA(LDK(KP923879532), T7m, T7l);
+					{
+					     V T3I, T3N, T3B, T3Q;
+					     T3I = VSUB(T3G, T3H);
+					     T3N = VADD(T3G, T3H);
+					     T3B = VADD(T3x, T3A);
+					     T3Q = VSUB(T3x, T3A);
+					     {
+						  V T7j, T88, T7G, T7X;
+						  T7j = VADD(T7f, T7i);
+						  T88 = VSUB(T7f, T7i);
+						  T7G = VSUB(T7E, T7F);
+						  T7X = VADD(T7E, T7F);
+						  {
+						       V T3S, T3O, T3J, T3L;
+						       T3S = VFNMS(LDK(KP831469612), T3N, T3M);
+						       T3O = VFMA(LDK(KP831469612), T3N, T3M);
+						       T3J = VFNMS(LDK(KP831469612), T3I, T3F);
+						       T3L = VFMA(LDK(KP831469612), T3I, T3F);
+						       {
+							    V T3T, T3R, T3C, T3K;
+							    T3T = VFMA(LDK(KP831469612), T3Q, T3P);
+							    T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
+							    T3C = VFNMS(LDK(KP831469612), T3B, T3u);
+							    T3K = VFMA(LDK(KP831469612), T3B, T3u);
+							    T8j = VFNMS(LDK(KP831469612), T88, T87);
+							    T89 = VFMA(LDK(KP831469612), T88, T87);
+							    T7k = VFNMS(LDK(KP831469612), T7j, T7c);
+							    T7O = VFMA(LDK(KP831469612), T7j, T7c);
+							    T8g = VFNMS(LDK(KP831469612), T7X, T7W);
+							    T7Y = VFMA(LDK(KP831469612), T7X, T7W);
+							    T7H = VFMA(LDK(KP831469612), T7G, T7D);
+							    T7R = VFNMS(LDK(KP831469612), T7G, T7D);
+							    ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
+							    ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
+							    T80 = VFNMS(LDK(KP923879532), T7p, T7o);
+							    T7q = VFMA(LDK(KP923879532), T7p, T7o);
+						       }
+						  }
+					     }
+					}
+					T7u = VFNMS(LDK(KP923879532), T7t, T7s);
+					T82 = VFMA(LDK(KP923879532), T7t, T7s);
+					T83 = VFNMS(LDK(KP923879532), T7w, T7v);
+					T7x = VFMA(LDK(KP923879532), T7w, T7v);
+				   }
+				   {
+					V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
+					T5g = VFMA(LDK(KP923879532), T5f, T58);
+					T6I = VFNMS(LDK(KP923879532), T5f, T58);
+					{
+					     V T7r, T7I, T7y, T7J;
+					     T7r = VFNMS(LDK(KP534511135), T7q, T7n);
+					     T7I = VFMA(LDK(KP534511135), T7n, T7q);
+					     T7y = VFNMS(LDK(KP534511135), T7x, T7u);
+					     T7J = VFMA(LDK(KP534511135), T7u, T7x);
+					     {
+						  V T81, T8a, T84, T8b;
+						  T81 = VFMA(LDK(KP303346683), T80, T7Z);
+						  T8a = VFNMS(LDK(KP303346683), T7Z, T80);
+						  T84 = VFMA(LDK(KP303346683), T83, T82);
+						  T8b = VFNMS(LDK(KP303346683), T82, T83);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6l);
+						  T6T = VFNMS(LDK(KP923879532), T6o, T6l);
+						  T5p = VFNMS(LDK(KP198912367), T5o, T5l);
+						  T6q = VFMA(LDK(KP198912367), T5l, T5o);
+						  {
+						       V T7K, T7P, T7z, T7S;
+						       T7K = VSUB(T7I, T7J);
+						       T7P = VADD(T7I, T7J);
+						       T7z = VADD(T7r, T7y);
+						       T7S = VSUB(T7r, T7y);
+						       {
+							    V T8c, T8h, T85, T8k;
+							    T8c = VSUB(T8a, T8b);
+							    T8h = VADD(T8a, T8b);
+							    T85 = VADD(T81, T84);
+							    T8k = VSUB(T81, T84);
+							    {
+								 V T7Q, T7U, T7L, T7N;
+								 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
+								 T7U = VFMA(LDK(KP881921264), T7P, T7O);
+								 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
+								 T7N = VFMA(LDK(KP881921264), T7K, T7H);
+								 {
+								      V T7T, T7V, T7A, T7M;
+								      T7T = VFMA(LDK(KP881921264), T7S, T7R);
+								      T7V = VFNMS(LDK(KP881921264), T7S, T7R);
+								      T7A = VFNMS(LDK(KP881921264), T7z, T7k);
+								      T7M = VFMA(LDK(KP881921264), T7z, T7k);
+								      {
+									   V T8i, T8m, T8d, T8f;
+									   T8i = VFMA(LDK(KP956940335), T8h, T8g);
+									   T8m = VFNMS(LDK(KP956940335), T8h, T8g);
+									   T8d = VFNMS(LDK(KP956940335), T8c, T89);
+									   T8f = VFMA(LDK(KP956940335), T8c, T89);
+									   {
+										V T8l, T8n, T86, T8e;
+										T8l = VFNMS(LDK(KP956940335), T8k, T8j);
+										T8n = VFMA(LDK(KP956940335), T8k, T8j);
+										T86 = VFNMS(LDK(KP956940335), T85, T7Y);
+										T8e = VFMA(LDK(KP956940335), T85, T7Y);
+										ST(&(x[WS(rs, 53)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 11)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 43)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 21)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 5)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 59)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 37)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 27)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 51)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 13)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 45)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 19)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 61)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 3)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 29)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 35)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										T6r = VFMA(LDK(KP198912367), T5u, T5x);
+										T5y = VFNMS(LDK(KP198912367), T5x, T5u);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V T5N, T5U, T68, T5z, T6U, T6f;
+					     T5N = VFMA(LDK(KP923879532), T5M, T5F);
+					     T6L = VFNMS(LDK(KP923879532), T5M, T5F);
+					     T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
+					     T5U = VFMA(LDK(KP923879532), T5T, T5Q);
+					     T68 = VFMA(LDK(KP923879532), T67, T60);
+					     T6O = VFNMS(LDK(KP923879532), T67, T60);
+					     T5z = VADD(T5p, T5y);
+					     T6U = VSUB(T5p, T5y);
+					     T6P = VFNMS(LDK(KP923879532), T6e, T6b);
+					     T6f = VFMA(LDK(KP923879532), T6e, T6b);
+					     {
+						  V T5V, T6u, T6g, T6v, T6s, T6J;
+						  T6s = VSUB(T6q, T6r);
+						  T6J = VADD(T6q, T6r);
+						  T5V = VFNMS(LDK(KP098491403), T5U, T5N);
+						  T6u = VFMA(LDK(KP098491403), T5N, T5U);
+						  T75 = VFMA(LDK(KP980785280), T6U, T6T);
+						  T6V = VFNMS(LDK(KP980785280), T6U, T6T);
+						  T5A = VFMA(LDK(KP980785280), T5z, T5g);
+						  T6A = VFNMS(LDK(KP980785280), T5z, T5g);
+						  T6g = VFNMS(LDK(KP098491403), T6f, T68);
+						  T6v = VFMA(LDK(KP098491403), T68, T6f);
+						  T72 = VFNMS(LDK(KP980785280), T6J, T6I);
+						  T6K = VFMA(LDK(KP980785280), T6J, T6I);
+						  T6t = VFMA(LDK(KP980785280), T6s, T6p);
+						  T6D = VFNMS(LDK(KP980785280), T6s, T6p);
+						  T6w = VSUB(T6u, T6v);
+						  T6B = VADD(T6u, T6v);
+						  T6h = VADD(T5V, T6g);
+						  T6E = VSUB(T5V, T6g);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
+		    T6W = VFNMS(LDK(KP820678790), T6L, T6M);
+		    T6N = VFMA(LDK(KP820678790), T6M, T6L);
+		    T6G = VFMA(LDK(KP995184726), T6B, T6A);
+		    T6C = VFNMS(LDK(KP995184726), T6B, T6A);
+		    T6z = VFMA(LDK(KP995184726), T6w, T6t);
+		    T6x = VFNMS(LDK(KP995184726), T6w, T6t);
+		    T6H = VFNMS(LDK(KP995184726), T6E, T6D);
+		    T6F = VFMA(LDK(KP995184726), T6E, T6D);
+		    T6y = VFMA(LDK(KP995184726), T6h, T5A);
+		    T6i = VFNMS(LDK(KP995184726), T6h, T5A);
+		    T6X = VFNMS(LDK(KP820678790), T6O, T6P);
+		    T6Q = VFMA(LDK(KP820678790), T6P, T6O);
+		    {
+			 V T73, T6Y, T76, T6R;
+			 ST(&(x[WS(rs, 49)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 63)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 33)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 T73 = VADD(T6W, T6X);
+			 T6Y = VSUB(T6W, T6X);
+			 T76 = VSUB(T6N, T6Q);
+			 T6R = VADD(T6N, T6Q);
+			 {
+			      V T78, T74, T71, T6Z, T79, T77, T70, T6S;
+			      T78 = VFNMS(LDK(KP773010453), T73, T72);
+			      T74 = VFMA(LDK(KP773010453), T73, T72);
+			      T71 = VFMA(LDK(KP773010453), T6Y, T6V);
+			      T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
+			      T79 = VFMA(LDK(KP773010453), T76, T75);
+			      T77 = VFNMS(LDK(KP773010453), T76, T75);
+			      T70 = VFMA(LDK(KP773010453), T6R, T6K);
+			      T6S = VFNMS(LDK(KP773010453), T6R, T6K);
+			      ST(&(x[WS(rs, 55)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 41)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 23)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 57)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 25)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 39)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t2bv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_64) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 519 FP additions, 250 FP multiplications,
+ * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
+ * 107 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tg, T4B, T6v, T7G, T3r, T4w, T5q, T7F, T5Y, T62, T28, T4d, T2g, T4a, T7g;
+	       V T7Y, T6f, T6j, T2Z, T4k, T37, T4h, T7n, T81, T7w, T7x, T7y, T5M, T6q, T1k;
+	       V T4s, T1r, T4t, T7t, T7u, T7v, T5F, T6p, TV, T4p, T12, T4q, T7A, T7B, TD;
+	       V T4x, T3k, T4C, T5x, T6s, T1R, T4b, T7j, T7Z, T2j, T4e, T5V, T63, T2I, T4i;
+	       V T7q, T82, T3a, T4l, T6c, T6k;
+	       {
+		    V T1, T3, T3p, T3n, Tb, Td, Te, T6, T8, T9, T2, T3o, T3m;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+		    T3 = BYTW(&(W[TWVL * 62]), T2);
+		    T3o = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+		    T3p = BYTW(&(W[TWVL * 94]), T3o);
+		    T3m = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3n = BYTW(&(W[TWVL * 30]), T3m);
+		    {
+			 V Ta, Tc, T5, T7;
+			 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+			 Tb = BYTW(&(W[TWVL * 110]), Ta);
+			 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 Td = BYTW(&(W[TWVL * 46]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T6 = BYTW(&(W[TWVL * 14]), T5);
+			 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+			 T8 = BYTW(&(W[TWVL * 78]), T7);
+			 T9 = VSUB(T6, T8);
+		    }
+		    {
+			 V T4, Tf, T6t, T6u;
+			 T4 = VSUB(T1, T3);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VSUB(T4, Tf);
+			 T4B = VADD(T4, Tf);
+			 T6t = VADD(T6, T8);
+			 T6u = VADD(Tb, Td);
+			 T6v = VSUB(T6t, T6u);
+			 T7G = VADD(T6t, T6u);
+		    }
+		    {
+			 V T3l, T3q, T5o, T5p;
+			 T3l = VMUL(LDK(KP707106781), VSUB(T9, Te));
+			 T3q = VSUB(T3n, T3p);
+			 T3r = VSUB(T3l, T3q);
+			 T4w = VADD(T3q, T3l);
+			 T5o = VADD(T1, T3);
+			 T5p = VADD(T3n, T3p);
+			 T5q = VSUB(T5o, T5p);
+			 T7F = VADD(T5o, T5p);
+		    }
+	       }
+	       {
+		    V T24, T26, T61, T2b, T2d, T60, T1W, T5W, T21, T5X, T22, T27;
+		    {
+			 V T23, T25, T2a, T2c;
+			 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T24 = BYTW(&(W[TWVL * 32]), T23);
+			 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+			 T26 = BYTW(&(W[TWVL * 96]), T25);
+			 T61 = VADD(T24, T26);
+			 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2b = BYTW(&(W[0]), T2a);
+			 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+			 T2d = BYTW(&(W[TWVL * 64]), T2c);
+			 T60 = VADD(T2b, T2d);
+		    }
+		    {
+			 V T1T, T1V, T1S, T1U;
+			 T1S = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T1T = BYTW(&(W[TWVL * 16]), T1S);
+			 T1U = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+			 T1V = BYTW(&(W[TWVL * 80]), T1U);
+			 T1W = VSUB(T1T, T1V);
+			 T5W = VADD(T1T, T1V);
+		    }
+		    {
+			 V T1Y, T20, T1X, T1Z;
+			 T1X = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+			 T1Y = BYTW(&(W[TWVL * 112]), T1X);
+			 T1Z = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T20 = BYTW(&(W[TWVL * 48]), T1Z);
+			 T21 = VSUB(T1Y, T20);
+			 T5X = VADD(T1Y, T20);
+		    }
+		    T5Y = VSUB(T5W, T5X);
+		    T62 = VSUB(T60, T61);
+		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
+		    T27 = VSUB(T24, T26);
+		    T28 = VSUB(T22, T27);
+		    T4d = VADD(T27, T22);
+		    {
+			 V T2e, T2f, T7e, T7f;
+			 T2e = VSUB(T2b, T2d);
+			 T2f = VMUL(LDK(KP707106781), VADD(T1W, T21));
+			 T2g = VSUB(T2e, T2f);
+			 T4a = VADD(T2e, T2f);
+			 T7e = VADD(T60, T61);
+			 T7f = VADD(T5W, T5X);
+			 T7g = VSUB(T7e, T7f);
+			 T7Y = VADD(T7e, T7f);
+		    }
+	       }
+	       {
+		    V T2V, T2X, T6i, T32, T34, T6h, T2N, T6d, T2S, T6e, T2T, T2Y;
+		    {
+			 V T2U, T2W, T31, T33;
+			 T2U = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T2V = BYTW(&(W[TWVL * 28]), T2U);
+			 T2W = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+			 T2X = BYTW(&(W[TWVL * 92]), T2W);
+			 T6i = VADD(T2V, T2X);
+			 T31 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+			 T32 = BYTW(&(W[TWVL * 124]), T31);
+			 T33 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T34 = BYTW(&(W[TWVL * 60]), T33);
+			 T6h = VADD(T32, T34);
+		    }
+		    {
+			 V T2K, T2M, T2J, T2L;
+			 T2J = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T2K = BYTW(&(W[TWVL * 12]), T2J);
+			 T2L = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+			 T2M = BYTW(&(W[TWVL * 76]), T2L);
+			 T2N = VSUB(T2K, T2M);
+			 T6d = VADD(T2K, T2M);
+		    }
+		    {
+			 V T2P, T2R, T2O, T2Q;
+			 T2O = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+			 T2P = BYTW(&(W[TWVL * 108]), T2O);
+			 T2Q = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T2R = BYTW(&(W[TWVL * 44]), T2Q);
+			 T2S = VSUB(T2P, T2R);
+			 T6e = VADD(T2P, T2R);
+		    }
+		    T6f = VSUB(T6d, T6e);
+		    T6j = VSUB(T6h, T6i);
+		    T2T = VMUL(LDK(KP707106781), VSUB(T2N, T2S));
+		    T2Y = VSUB(T2V, T2X);
+		    T2Z = VSUB(T2T, T2Y);
+		    T4k = VADD(T2Y, T2T);
+		    {
+			 V T35, T36, T7l, T7m;
+			 T35 = VSUB(T32, T34);
+			 T36 = VMUL(LDK(KP707106781), VADD(T2N, T2S));
+			 T37 = VSUB(T35, T36);
+			 T4h = VADD(T35, T36);
+			 T7l = VADD(T6h, T6i);
+			 T7m = VADD(T6d, T6e);
+			 T7n = VSUB(T7l, T7m);
+			 T81 = VADD(T7l, T7m);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T5K, T1m, T1o, T5J, T18, T5G, T1d, T5H, T5I, T5L;
+		    {
+			 V T1f, T1h, T1l, T1n;
+			 T1f = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T1g = BYTW(&(W[TWVL * 26]), T1f);
+			 T1h = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+			 T1i = BYTW(&(W[TWVL * 90]), T1h);
+			 T5K = VADD(T1g, T1i);
+			 T1l = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+			 T1m = BYTW(&(W[TWVL * 122]), T1l);
+			 T1n = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 T1o = BYTW(&(W[TWVL * 58]), T1n);
+			 T5J = VADD(T1m, T1o);
+		    }
+		    {
+			 V T15, T17, T14, T16;
+			 T14 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 T15 = BYTW(&(W[TWVL * 10]), T14);
+			 T16 = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+			 T17 = BYTW(&(W[TWVL * 74]), T16);
+			 T18 = VSUB(T15, T17);
+			 T5G = VADD(T15, T17);
+		    }
+		    {
+			 V T1a, T1c, T19, T1b;
+			 T19 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+			 T1a = BYTW(&(W[TWVL * 106]), T19);
+			 T1b = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 T1c = BYTW(&(W[TWVL * 42]), T1b);
+			 T1d = VSUB(T1a, T1c);
+			 T5H = VADD(T1a, T1c);
+		    }
+		    T7w = VADD(T5J, T5K);
+		    T7x = VADD(T5G, T5H);
+		    T7y = VSUB(T7w, T7x);
+		    T5I = VSUB(T5G, T5H);
+		    T5L = VSUB(T5J, T5K);
+		    T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
+		    T6q = VFMA(LDK(KP923879532), T5L, VMUL(LDK(KP382683432), T5I));
+		    {
+			 V T1e, T1j, T1p, T1q;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T4s = VADD(T1j, T1e);
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
+			 T1r = VSUB(T1p, T1q);
+			 T4t = VADD(T1p, T1q);
+		    }
+	       }
+	       {
+		    V TR, TT, T5A, TX, TZ, T5z, TJ, T5C, TO, T5D, T5B, T5E;
+		    {
+			 V TQ, TS, TW, TY;
+			 TQ = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 TR = BYTW(&(W[TWVL * 34]), TQ);
+			 TS = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+			 TT = BYTW(&(W[TWVL * 98]), TS);
+			 T5A = VADD(TR, TT);
+			 TW = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 TX = BYTW(&(W[TWVL * 2]), TW);
+			 TY = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+			 TZ = BYTW(&(W[TWVL * 66]), TY);
+			 T5z = VADD(TX, TZ);
+		    }
+		    {
+			 V TG, TI, TF, TH;
+			 TF = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 TG = BYTW(&(W[TWVL * 18]), TF);
+			 TH = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+			 TI = BYTW(&(W[TWVL * 82]), TH);
+			 TJ = VSUB(TG, TI);
+			 T5C = VADD(TG, TI);
+		    }
+		    {
+			 V TL, TN, TK, TM;
+			 TK = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+			 TL = BYTW(&(W[TWVL * 114]), TK);
+			 TM = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 TN = BYTW(&(W[TWVL * 50]), TM);
+			 TO = VSUB(TL, TN);
+			 T5D = VADD(TL, TN);
+		    }
+		    T7t = VADD(T5z, T5A);
+		    T7u = VADD(T5C, T5D);
+		    T7v = VSUB(T7t, T7u);
+		    T5B = VSUB(T5z, T5A);
+		    T5E = VSUB(T5C, T5D);
+		    T5F = VFMA(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
+		    T6p = VFNMS(LDK(KP382683432), T5E, VMUL(LDK(KP923879532), T5B));
+		    {
+			 V TP, TU, T10, T11;
+			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
+			 TU = VSUB(TR, TT);
+			 TV = VSUB(TP, TU);
+			 T4p = VADD(TU, TP);
+			 T10 = VSUB(TX, TZ);
+			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
+			 T12 = VSUB(T10, T11);
+			 T4q = VADD(T10, T11);
+		    }
+	       }
+	       {
+		    V Tl, T5r, TB, T5u, Tq, T5s, Tw, T5v, Tr, TC;
+		    {
+			 V Ti, Tk, Th, Tj;
+			 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = BYTW(&(W[TWVL * 6]), Th);
+			 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+			 Tk = BYTW(&(W[TWVL * 70]), Tj);
+			 Tl = VSUB(Ti, Tk);
+			 T5r = VADD(Ti, Tk);
+		    }
+		    {
+			 V Ty, TA, Tx, Tz;
+			 Tx = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+			 Ty = BYTW(&(W[TWVL * 118]), Tx);
+			 Tz = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 TA = BYTW(&(W[TWVL * 54]), Tz);
+			 TB = VSUB(Ty, TA);
+			 T5u = VADD(Ty, TA);
+		    }
+		    {
+			 V Tn, Tp, Tm, To;
+			 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 Tn = BYTW(&(W[TWVL * 38]), Tm);
+			 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+			 Tp = BYTW(&(W[TWVL * 102]), To);
+			 Tq = VSUB(Tn, Tp);
+			 T5s = VADD(Tn, Tp);
+		    }
+		    {
+			 V Tt, Tv, Ts, Tu;
+			 Ts = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tt = BYTW(&(W[TWVL * 22]), Ts);
+			 Tu = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+			 Tv = BYTW(&(W[TWVL * 86]), Tu);
+			 Tw = VSUB(Tt, Tv);
+			 T5v = VADD(Tt, Tv);
+		    }
+		    T7A = VADD(T5r, T5s);
+		    T7B = VADD(T5u, T5v);
+		    Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+		    TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
+		    TD = VSUB(Tr, TC);
+		    T4x = VADD(Tr, TC);
+		    {
+			 V T3i, T3j, T5t, T5w;
+			 T3i = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T3j = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T3k = VSUB(T3i, T3j);
+			 T4C = VADD(T3i, T3j);
+			 T5t = VSUB(T5r, T5s);
+			 T5w = VSUB(T5u, T5v);
+			 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
+			 T6s = VMUL(LDK(KP707106781), VSUB(T5t, T5w));
+		    }
+	       }
+	       {
+		    V T1z, T5P, T1P, T5T, T1E, T5Q, T1K, T5S;
+		    {
+			 V T1w, T1y, T1v, T1x;
+			 T1v = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1w = BYTW(&(W[TWVL * 8]), T1v);
+			 T1x = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+			 T1y = BYTW(&(W[TWVL * 72]), T1x);
+			 T1z = VSUB(T1w, T1y);
+			 T5P = VADD(T1w, T1y);
+		    }
+		    {
+			 V T1M, T1O, T1L, T1N;
+			 T1L = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1M = BYTW(&(W[TWVL * 24]), T1L);
+			 T1N = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+			 T1O = BYTW(&(W[TWVL * 88]), T1N);
+			 T1P = VSUB(T1M, T1O);
+			 T5T = VADD(T1M, T1O);
+		    }
+		    {
+			 V T1B, T1D, T1A, T1C;
+			 T1A = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 T1B = BYTW(&(W[TWVL * 40]), T1A);
+			 T1C = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+			 T1D = BYTW(&(W[TWVL * 104]), T1C);
+			 T1E = VSUB(T1B, T1D);
+			 T5Q = VADD(T1B, T1D);
+		    }
+		    {
+			 V T1H, T1J, T1G, T1I;
+			 T1G = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+			 T1H = BYTW(&(W[TWVL * 120]), T1G);
+			 T1I = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			 T1J = BYTW(&(W[TWVL * 56]), T1I);
+			 T1K = VSUB(T1H, T1J);
+			 T5S = VADD(T1H, T1J);
+		    }
+		    {
+			 V T1F, T1Q, T7h, T7i;
+			 T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1z));
+			 T1Q = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1P));
+			 T1R = VSUB(T1F, T1Q);
+			 T4b = VADD(T1F, T1Q);
+			 T7h = VADD(T5P, T5Q);
+			 T7i = VADD(T5S, T5T);
+			 T7j = VSUB(T7h, T7i);
+			 T7Z = VADD(T7h, T7i);
+		    }
+		    {
+			 V T2h, T2i, T5R, T5U;
+			 T2h = VFMA(LDK(KP382683432), T1z, VMUL(LDK(KP923879532), T1E));
+			 T2i = VFNMS(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
+			 T2j = VSUB(T2h, T2i);
+			 T4e = VADD(T2h, T2i);
+			 T5R = VSUB(T5P, T5Q);
+			 T5U = VSUB(T5S, T5T);
+			 T5V = VMUL(LDK(KP707106781), VSUB(T5R, T5U));
+			 T63 = VMUL(LDK(KP707106781), VADD(T5R, T5U));
+		    }
+	       }
+	       {
+		    V T2q, T66, T2G, T6a, T2v, T67, T2B, T69;
+		    {
+			 V T2n, T2p, T2m, T2o;
+			 T2m = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T2n = BYTW(&(W[TWVL * 4]), T2m);
+			 T2o = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+			 T2p = BYTW(&(W[TWVL * 68]), T2o);
+			 T2q = VSUB(T2n, T2p);
+			 T66 = VADD(T2n, T2p);
+		    }
+		    {
+			 V T2D, T2F, T2C, T2E;
+			 T2C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T2D = BYTW(&(W[TWVL * 20]), T2C);
+			 T2E = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+			 T2F = BYTW(&(W[TWVL * 84]), T2E);
+			 T2G = VSUB(T2D, T2F);
+			 T6a = VADD(T2D, T2F);
+		    }
+		    {
+			 V T2s, T2u, T2r, T2t;
+			 T2r = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T2s = BYTW(&(W[TWVL * 36]), T2r);
+			 T2t = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+			 T2u = BYTW(&(W[TWVL * 100]), T2t);
+			 T2v = VSUB(T2s, T2u);
+			 T67 = VADD(T2s, T2u);
+		    }
+		    {
+			 V T2y, T2A, T2x, T2z;
+			 T2x = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+			 T2y = BYTW(&(W[TWVL * 116]), T2x);
+			 T2z = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			 T2A = BYTW(&(W[TWVL * 52]), T2z);
+			 T2B = VSUB(T2y, T2A);
+			 T69 = VADD(T2y, T2A);
+		    }
+		    {
+			 V T2w, T2H, T7o, T7p;
+			 T2w = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2q));
+			 T2H = VFMA(LDK(KP923879532), T2B, VMUL(LDK(KP382683432), T2G));
+			 T2I = VSUB(T2w, T2H);
+			 T4i = VADD(T2w, T2H);
+			 T7o = VADD(T66, T67);
+			 T7p = VADD(T69, T6a);
+			 T7q = VSUB(T7o, T7p);
+			 T82 = VADD(T7o, T7p);
+		    }
+		    {
+			 V T38, T39, T68, T6b;
+			 T38 = VFMA(LDK(KP382683432), T2q, VMUL(LDK(KP923879532), T2v));
+			 T39 = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2G));
+			 T3a = VSUB(T38, T39);
+			 T4l = VADD(T38, T39);
+			 T68 = VSUB(T66, T67);
+			 T6b = VSUB(T69, T6a);
+			 T6c = VMUL(LDK(KP707106781), VSUB(T68, T6b));
+			 T6k = VMUL(LDK(KP707106781), VADD(T68, T6b));
+		    }
+	       }
+	       {
+		    V T7s, T7R, T7M, T7U, T7D, T7T, T7J, T7Q;
+		    {
+			 V T7k, T7r, T7K, T7L;
+			 T7k = VFNMS(LDK(KP382683432), T7j, VMUL(LDK(KP923879532), T7g));
+			 T7r = VFMA(LDK(KP923879532), T7n, VMUL(LDK(KP382683432), T7q));
+			 T7s = VSUB(T7k, T7r);
+			 T7R = VADD(T7k, T7r);
+			 T7K = VFMA(LDK(KP382683432), T7g, VMUL(LDK(KP923879532), T7j));
+			 T7L = VFNMS(LDK(KP382683432), T7n, VMUL(LDK(KP923879532), T7q));
+			 T7M = VSUB(T7K, T7L);
+			 T7U = VADD(T7K, T7L);
+		    }
+		    {
+			 V T7z, T7C, T7H, T7I;
+			 T7z = VMUL(LDK(KP707106781), VSUB(T7v, T7y));
+			 T7C = VSUB(T7A, T7B);
+			 T7D = VSUB(T7z, T7C);
+			 T7T = VADD(T7C, T7z);
+			 T7H = VSUB(T7F, T7G);
+			 T7I = VMUL(LDK(KP707106781), VADD(T7v, T7y));
+			 T7J = VSUB(T7H, T7I);
+			 T7Q = VADD(T7H, T7I);
+		    }
+		    {
+			 V T7E, T7N, T7W, T7X;
+			 T7E = VBYI(VSUB(T7s, T7D));
+			 T7N = VSUB(T7J, T7M);
+			 ST(&(x[WS(rs, 20)]), VADD(T7E, T7N), ms, &(x[0]));
+			 ST(&(x[WS(rs, 44)]), VSUB(T7N, T7E), ms, &(x[0]));
+			 T7W = VSUB(T7Q, T7R);
+			 T7X = VBYI(VSUB(T7U, T7T));
+			 ST(&(x[WS(rs, 36)]), VSUB(T7W, T7X), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VADD(T7W, T7X), ms, &(x[0]));
+		    }
+		    {
+			 V T7O, T7P, T7S, T7V;
+			 T7O = VBYI(VADD(T7D, T7s));
+			 T7P = VADD(T7J, T7M);
+			 ST(&(x[WS(rs, 12)]), VADD(T7O, T7P), ms, &(x[0]));
+			 ST(&(x[WS(rs, 52)]), VSUB(T7P, T7O), ms, &(x[0]));
+			 T7S = VADD(T7Q, T7R);
+			 T7V = VBYI(VADD(T7T, T7U));
+			 ST(&(x[WS(rs, 60)]), VSUB(T7S, T7V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T7S, T7V), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T84, T8c, T8l, T8n, T87, T8h, T8b, T8g, T8i, T8m;
+		    {
+			 V T80, T83, T8j, T8k;
+			 T80 = VSUB(T7Y, T7Z);
+			 T83 = VSUB(T81, T82);
+			 T84 = VMUL(LDK(KP707106781), VSUB(T80, T83));
+			 T8c = VMUL(LDK(KP707106781), VADD(T80, T83));
+			 T8j = VADD(T7Y, T7Z);
+			 T8k = VADD(T81, T82);
+			 T8l = VBYI(VSUB(T8j, T8k));
+			 T8n = VADD(T8j, T8k);
+		    }
+		    {
+			 V T85, T86, T89, T8a;
+			 T85 = VADD(T7t, T7u);
+			 T86 = VADD(T7w, T7x);
+			 T87 = VSUB(T85, T86);
+			 T8h = VADD(T85, T86);
+			 T89 = VADD(T7F, T7G);
+			 T8a = VADD(T7A, T7B);
+			 T8b = VSUB(T89, T8a);
+			 T8g = VADD(T89, T8a);
+		    }
+		    T8i = VSUB(T8g, T8h);
+		    ST(&(x[WS(rs, 48)]), VSUB(T8i, T8l), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T8i, T8l), ms, &(x[0]));
+		    T8m = VADD(T8g, T8h);
+		    ST(&(x[WS(rs, 32)]), VSUB(T8m, T8n), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T8m, T8n), ms, &(x[0]));
+		    {
+			 V T88, T8d, T8e, T8f;
+			 T88 = VBYI(VSUB(T84, T87));
+			 T8d = VSUB(T8b, T8c);
+			 ST(&(x[WS(rs, 24)]), VADD(T88, T8d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 40)]), VSUB(T8d, T88), ms, &(x[0]));
+			 T8e = VBYI(VADD(T87, T84));
+			 T8f = VADD(T8b, T8c);
+			 ST(&(x[WS(rs, 8)]), VADD(T8e, T8f), ms, &(x[0]));
+			 ST(&(x[WS(rs, 56)]), VSUB(T8f, T8e), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
+		    {
+			 V T5y, T5N, T6r, T6w;
+			 T5y = VSUB(T5q, T5x);
+			 T5N = VSUB(T5F, T5M);
+			 T5O = VSUB(T5y, T5N);
+			 T6H = VADD(T5y, T5N);
+			 T6r = VSUB(T6p, T6q);
+			 T6w = VSUB(T6s, T6v);
+			 T6x = VSUB(T6r, T6w);
+			 T6F = VADD(T6w, T6r);
+			 {
+			      V T65, T6y, T6m, T6z;
+			      {
+				   V T5Z, T64, T6g, T6l;
+				   T5Z = VSUB(T5V, T5Y);
+				   T64 = VSUB(T62, T63);
+				   T65 = VFMA(LDK(KP831469612), T5Z, VMUL(LDK(KP555570233), T64));
+				   T6y = VFNMS(LDK(KP555570233), T5Z, VMUL(LDK(KP831469612), T64));
+				   T6g = VSUB(T6c, T6f);
+				   T6l = VSUB(T6j, T6k);
+				   T6m = VFNMS(LDK(KP555570233), T6l, VMUL(LDK(KP831469612), T6g));
+				   T6z = VFMA(LDK(KP555570233), T6g, VMUL(LDK(KP831469612), T6l));
+			      }
+			      T6n = VSUB(T65, T6m);
+			      T6I = VADD(T6y, T6z);
+			      T6A = VSUB(T6y, T6z);
+			      T6E = VADD(T65, T6m);
+			 }
+		    }
+		    {
+			 V T6o, T6B, T6K, T6L;
+			 T6o = VADD(T5O, T6n);
+			 T6B = VBYI(VADD(T6x, T6A));
+			 ST(&(x[WS(rs, 54)]), VSUB(T6o, T6B), ms, &(x[0]));
+			 ST(&(x[WS(rs, 10)]), VADD(T6o, T6B), ms, &(x[0]));
+			 T6K = VBYI(VADD(T6F, T6E));
+			 T6L = VADD(T6H, T6I);
+			 ST(&(x[WS(rs, 6)]), VADD(T6K, T6L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 58)]), VSUB(T6L, T6K), ms, &(x[0]));
+		    }
+		    {
+			 V T6C, T6D, T6G, T6J;
+			 T6C = VSUB(T5O, T6n);
+			 T6D = VBYI(VSUB(T6A, T6x));
+			 ST(&(x[WS(rs, 42)]), VSUB(T6C, T6D), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VADD(T6C, T6D), ms, &(x[0]));
+			 T6G = VBYI(VSUB(T6E, T6F));
+			 T6J = VSUB(T6H, T6I);
+			 ST(&(x[WS(rs, 26)]), VADD(T6G, T6J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 38)]), VSUB(T6J, T6G), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
+		    {
+			 V T6M, T6N, T6X, T6Y;
+			 T6M = VADD(T5q, T5x);
+			 T6N = VADD(T6p, T6q);
+			 T6O = VSUB(T6M, T6N);
+			 T79 = VADD(T6M, T6N);
+			 T6X = VADD(T5F, T5M);
+			 T6Y = VADD(T6v, T6s);
+			 T6Z = VSUB(T6X, T6Y);
+			 T77 = VADD(T6Y, T6X);
+			 {
+			      V T6R, T70, T6U, T71;
+			      {
+				   V T6P, T6Q, T6S, T6T;
+				   T6P = VADD(T5Y, T5V);
+				   T6Q = VADD(T62, T63);
+				   T6R = VFMA(LDK(KP980785280), T6P, VMUL(LDK(KP195090322), T6Q));
+				   T70 = VFNMS(LDK(KP195090322), T6P, VMUL(LDK(KP980785280), T6Q));
+				   T6S = VADD(T6f, T6c);
+				   T6T = VADD(T6j, T6k);
+				   T6U = VFNMS(LDK(KP195090322), T6T, VMUL(LDK(KP980785280), T6S));
+				   T71 = VFMA(LDK(KP195090322), T6S, VMUL(LDK(KP980785280), T6T));
+			      }
+			      T6V = VSUB(T6R, T6U);
+			      T7a = VADD(T70, T71);
+			      T72 = VSUB(T70, T71);
+			      T76 = VADD(T6R, T6U);
+			 }
+		    }
+		    {
+			 V T6W, T73, T7c, T7d;
+			 T6W = VADD(T6O, T6V);
+			 T73 = VBYI(VADD(T6Z, T72));
+			 ST(&(x[WS(rs, 50)]), VSUB(T6W, T73), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T6W, T73), ms, &(x[0]));
+			 T7c = VBYI(VADD(T77, T76));
+			 T7d = VADD(T79, T7a);
+			 ST(&(x[WS(rs, 2)]), VADD(T7c, T7d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 62)]), VSUB(T7d, T7c), ms, &(x[0]));
+		    }
+		    {
+			 V T74, T75, T78, T7b;
+			 T74 = VSUB(T6O, T6V);
+			 T75 = VBYI(VSUB(T72, T6Z));
+			 ST(&(x[WS(rs, 46)]), VSUB(T74, T75), ms, &(x[0]));
+			 ST(&(x[WS(rs, 18)]), VADD(T74, T75), ms, &(x[0]));
+			 T78 = VBYI(VSUB(T76, T77));
+			 T7b = VSUB(T79, T7a);
+			 ST(&(x[WS(rs, 30)]), VADD(T78, T7b), ms, &(x[0]));
+			 ST(&(x[WS(rs, 34)]), VSUB(T7b, T78), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T4z, T5g, T4R, T59, T4H, T5j, T4O, T55, T4o, T4S, T4K, T4P, T52, T5k, T5c;
+		    V T5h;
+		    {
+			 V T4y, T57, T4v, T58, T4r, T4u;
+			 T4y = VADD(T4w, T4x);
+			 T57 = VSUB(T4B, T4C);
+			 T4r = VFMA(LDK(KP980785280), T4p, VMUL(LDK(KP195090322), T4q));
+			 T4u = VFNMS(LDK(KP195090322), T4t, VMUL(LDK(KP980785280), T4s));
+			 T4v = VADD(T4r, T4u);
+			 T58 = VSUB(T4r, T4u);
+			 T4z = VSUB(T4v, T4y);
+			 T5g = VADD(T57, T58);
+			 T4R = VADD(T4y, T4v);
+			 T59 = VSUB(T57, T58);
+		    }
+		    {
+			 V T4D, T54, T4G, T53, T4E, T4F;
+			 T4D = VADD(T4B, T4C);
+			 T54 = VSUB(T4x, T4w);
+			 T4E = VFNMS(LDK(KP195090322), T4p, VMUL(LDK(KP980785280), T4q));
+			 T4F = VFMA(LDK(KP195090322), T4s, VMUL(LDK(KP980785280), T4t));
+			 T4G = VADD(T4E, T4F);
+			 T53 = VSUB(T4E, T4F);
+			 T4H = VSUB(T4D, T4G);
+			 T5j = VADD(T54, T53);
+			 T4O = VADD(T4D, T4G);
+			 T55 = VSUB(T53, T54);
+		    }
+		    {
+			 V T4g, T4I, T4n, T4J;
+			 {
+			      V T4c, T4f, T4j, T4m;
+			      T4c = VADD(T4a, T4b);
+			      T4f = VADD(T4d, T4e);
+			      T4g = VFNMS(LDK(KP098017140), T4f, VMUL(LDK(KP995184726), T4c));
+			      T4I = VFMA(LDK(KP098017140), T4c, VMUL(LDK(KP995184726), T4f));
+			      T4j = VADD(T4h, T4i);
+			      T4m = VADD(T4k, T4l);
+			      T4n = VFMA(LDK(KP995184726), T4j, VMUL(LDK(KP098017140), T4m));
+			      T4J = VFNMS(LDK(KP098017140), T4j, VMUL(LDK(KP995184726), T4m));
+			 }
+			 T4o = VSUB(T4g, T4n);
+			 T4S = VADD(T4I, T4J);
+			 T4K = VSUB(T4I, T4J);
+			 T4P = VADD(T4g, T4n);
+		    }
+		    {
+			 V T4Y, T5a, T51, T5b;
+			 {
+			      V T4W, T4X, T4Z, T50;
+			      T4W = VSUB(T4a, T4b);
+			      T4X = VSUB(T4e, T4d);
+			      T4Y = VFNMS(LDK(KP634393284), T4X, VMUL(LDK(KP773010453), T4W));
+			      T5a = VFMA(LDK(KP634393284), T4W, VMUL(LDK(KP773010453), T4X));
+			      T4Z = VSUB(T4h, T4i);
+			      T50 = VSUB(T4l, T4k);
+			      T51 = VFMA(LDK(KP773010453), T4Z, VMUL(LDK(KP634393284), T50));
+			      T5b = VFNMS(LDK(KP634393284), T4Z, VMUL(LDK(KP773010453), T50));
+			 }
+			 T52 = VSUB(T4Y, T51);
+			 T5k = VADD(T5a, T5b);
+			 T5c = VSUB(T5a, T5b);
+			 T5h = VADD(T4Y, T51);
+		    }
+		    {
+			 V T4A, T4L, T5i, T5l;
+			 T4A = VBYI(VSUB(T4o, T4z));
+			 T4L = VSUB(T4H, T4K);
+			 ST(&(x[WS(rs, 17)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VSUB(T4L, T4A), ms, &(x[WS(rs, 1)]));
+			 T5i = VADD(T5g, T5h);
+			 T5l = VBYI(VADD(T5j, T5k));
+			 ST(&(x[WS(rs, 57)]), VSUB(T5i, T5l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5m, T5n, T4M, T4N;
+			 T5m = VSUB(T5g, T5h);
+			 T5n = VBYI(VSUB(T5k, T5j));
+			 ST(&(x[WS(rs, 39)]), VSUB(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 T4M = VBYI(VADD(T4z, T4o));
+			 T4N = VADD(T4H, T4K);
+			 ST(&(x[WS(rs, 15)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 49)]), VSUB(T4N, T4M), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T4Q, T4T, T56, T5d;
+			 T4Q = VADD(T4O, T4P);
+			 T4T = VBYI(VADD(T4R, T4S));
+			 ST(&(x[WS(rs, 63)]), VSUB(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 T56 = VBYI(VSUB(T52, T55));
+			 T5d = VSUB(T59, T5c);
+			 ST(&(x[WS(rs, 23)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 41)]), VSUB(T5d, T56), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5e, T5f, T4U, T4V;
+			 T5e = VBYI(VADD(T55, T52));
+			 T5f = VADD(T59, T5c);
+			 ST(&(x[WS(rs, 9)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 55)]), VSUB(T5f, T5e), ms, &(x[WS(rs, 1)]));
+			 T4U = VSUB(T4O, T4P);
+			 T4V = VBYI(VSUB(T4S, T4R));
+			 ST(&(x[WS(rs, 33)]), VSUB(T4U, T4V), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
+		    V T42;
+		    {
+			 V TE, T3U, T1t, T3T, T13, T1s;
+			 TE = VSUB(Tg, TD);
+			 T3U = VADD(T3r, T3k);
+			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
+			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T3T = VADD(T13, T1s);
+			 T1u = VSUB(TE, T1t);
+			 T43 = VADD(T3U, T3T);
+			 T3D = VADD(TE, T1t);
+			 T3V = VSUB(T3T, T3U);
+		    }
+		    {
+			 V T3s, T3I, T3h, T3J, T3f, T3g;
+			 T3s = VSUB(T3k, T3r);
+			 T3I = VADD(Tg, TD);
+			 T3f = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
+			 T3g = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
+			 T3h = VSUB(T3f, T3g);
+			 T3J = VADD(T3f, T3g);
+			 T3t = VSUB(T3h, T3s);
+			 T45 = VADD(T3I, T3J);
+			 T3B = VADD(T3s, T3h);
+			 T3K = VSUB(T3I, T3J);
+		    }
+		    {
+			 V T2l, T3u, T3c, T3v;
+			 {
+			      V T29, T2k, T30, T3b;
+			      T29 = VSUB(T1R, T28);
+			      T2k = VSUB(T2g, T2j);
+			      T2l = VFMA(LDK(KP881921264), T29, VMUL(LDK(KP471396736), T2k));
+			      T3u = VFNMS(LDK(KP471396736), T29, VMUL(LDK(KP881921264), T2k));
+			      T30 = VSUB(T2I, T2Z);
+			      T3b = VSUB(T37, T3a);
+			      T3c = VFNMS(LDK(KP471396736), T3b, VMUL(LDK(KP881921264), T30));
+			      T3v = VFMA(LDK(KP471396736), T30, VMUL(LDK(KP881921264), T3b));
+			 }
+			 T3d = VSUB(T2l, T3c);
+			 T3E = VADD(T3u, T3v);
+			 T3w = VSUB(T3u, T3v);
+			 T3A = VADD(T2l, T3c);
+		    }
+		    {
+			 V T3N, T3W, T3Q, T3X;
+			 {
+			      V T3L, T3M, T3O, T3P;
+			      T3L = VADD(T28, T1R);
+			      T3M = VADD(T2g, T2j);
+			      T3N = VFMA(LDK(KP956940335), T3L, VMUL(LDK(KP290284677), T3M));
+			      T3W = VFNMS(LDK(KP290284677), T3L, VMUL(LDK(KP956940335), T3M));
+			      T3O = VADD(T2Z, T2I);
+			      T3P = VADD(T37, T3a);
+			      T3Q = VFNMS(LDK(KP290284677), T3P, VMUL(LDK(KP956940335), T3O));
+			      T3X = VFMA(LDK(KP290284677), T3O, VMUL(LDK(KP956940335), T3P));
+			 }
+			 T3R = VSUB(T3N, T3Q);
+			 T46 = VADD(T3W, T3X);
+			 T3Y = VSUB(T3W, T3X);
+			 T42 = VADD(T3N, T3Q);
+		    }
+		    {
+			 V T3e, T3x, T44, T47;
+			 T3e = VADD(T1u, T3d);
+			 T3x = VBYI(VADD(T3t, T3w));
+			 ST(&(x[WS(rs, 53)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 T44 = VBYI(VSUB(T42, T43));
+			 T47 = VSUB(T45, T46);
+			 ST(&(x[WS(rs, 29)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 35)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T48, T49, T3y, T3z;
+			 T48 = VBYI(VADD(T43, T42));
+			 T49 = VADD(T45, T46);
+			 ST(&(x[WS(rs, 3)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 61)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
+			 T3y = VSUB(T1u, T3d);
+			 T3z = VBYI(VSUB(T3w, T3t));
+			 ST(&(x[WS(rs, 43)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 21)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T3C, T3F, T3S, T3Z;
+			 T3C = VBYI(VSUB(T3A, T3B));
+			 T3F = VSUB(T3D, T3E);
+			 ST(&(x[WS(rs, 27)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 37)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
+			 T3S = VADD(T3K, T3R);
+			 T3Z = VBYI(VADD(T3V, T3Y));
+			 ST(&(x[WS(rs, 51)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T40, T41, T3G, T3H;
+			 T40 = VSUB(T3K, T3R);
+			 T41 = VBYI(VSUB(T3Y, T3V));
+			 ST(&(x[WS(rs, 45)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
+			 T3G = VBYI(VADD(T3B, T3A));
+			 T3H = VADD(T3D, T3E);
+			 ST(&(x[WS(rs, 5)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 59)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t2bv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_64) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:09 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2bv_8 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTW(&(W[TWVL * 6]), T2);
+		    Ti = BYTW(&(W[TWVL * 2]), Th);
+		    Tk = BYTW(&(W[TWVL * 10]), Tj);
+		    T6 = BYTW(&(W[0]), T5);
+		    T8 = BYTW(&(W[TWVL * 8]), T7);
+		    Tb = BYTW(&(W[TWVL * 12]), Ta);
+		    Td = BYTW(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VADD(Tq, Tr);
+			 Ts = VSUB(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VADD(Tt, Tu);
+			      Tv = VSUB(Tt, Tu);
+			      Tm = VSUB(T9, Te);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tv, Ts), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tv, Ts), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFMA(LDK(KP707106781), Tf, T4);
+				   Tg = VFNMS(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 1)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2bv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2bv_8 -include t2b.h -sign 1 */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t2b.h"
+
+static void t2bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V Tl, Tq, Tg, Tr, T5, Tt, Ta, Tu, Ti, Tk, Tj;
+	       Ti = LD(&(x[0]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tk = BYTW(&(W[TWVL * 6]), Tj);
+	       Tl = VSUB(Ti, Tk);
+	       Tq = VADD(Ti, Tk);
+	       {
+		    V Td, Tf, Tc, Te;
+		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Td = BYTW(&(W[TWVL * 2]), Tc);
+		    Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tf = BYTW(&(W[TWVL * 10]), Te);
+		    Tg = VSUB(Td, Tf);
+		    Tr = VADD(Td, Tf);
+	       }
+	       {
+		    V T2, T4, T1, T3;
+		    T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T2 = BYTW(&(W[0]), T1);
+		    T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T4 = BYTW(&(W[TWVL * 8]), T3);
+		    T5 = VSUB(T2, T4);
+		    Tt = VADD(T2, T4);
+	       }
+	       {
+		    V T7, T9, T6, T8;
+		    T6 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    T7 = BYTW(&(W[TWVL * 12]), T6);
+		    T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T9 = BYTW(&(W[TWVL * 4]), T8);
+		    Ta = VSUB(T7, T9);
+		    Tu = VADD(T7, T9);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VSUB(Tq, Tr);
+		    Tv = VBYI(VSUB(Tt, Tu));
+		    ST(&(x[WS(rs, 6)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VADD(Tq, Tr);
+		    Tx = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Th, To, Tn, Tp, Tb, Tm;
+			 Tb = VMUL(LDK(KP707106781), VSUB(T5, Ta));
+			 Th = VBYI(VSUB(Tb, Tg));
+			 To = VBYI(VADD(Tg, Tb));
+			 Tm = VMUL(LDK(KP707106781), VADD(T5, Ta));
+			 Tn = VSUB(Tl, Tm);
+			 Tp = VADD(Tl, Tm);
+			 ST(&(x[WS(rs, 3)]), VADD(Th, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(Tn, Th), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2bv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2bv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:41 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t2fv_10 -include t2f.h */
+
+/*
+ * This function contains 51 FP additions, 40 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V Tg, Tn, Ti, Tl;
+		    Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    {
+			 V T6, T8, T5, Tc;
+			 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T3, Th, To, Tj, Tm, T7;
+			      T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T3 = BYTWJ(&(W[TWVL * 8]), T2);
+			      Th = BYTWJ(&(W[TWVL * 6]), Tg);
+			      To = BYTWJ(&(W[0]), Tn);
+			      Tj = BYTWJ(&(W[TWVL * 16]), Ti);
+			      Tm = BYTWJ(&(W[TWVL * 10]), Tl);
+			      T6 = BYTWJ(&(W[TWVL * 2]), T5);
+			      Td = BYTWJ(&(W[TWVL * 4]), Tc);
+			      T8 = BYTWJ(&(W[TWVL * 12]), T7);
+			      TA = VADD(T1, T3);
+			      T4 = VSUB(T1, T3);
+			      Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Tk = VSUB(Th, Tj);
+			      TE = VADD(Th, Tj);
+			      Tp = VSUB(Tm, To);
+			      TF = VADD(Tm, To);
+			 }
+			 TB = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+		    }
+	       }
+	       Tb = BYTWJ(&(W[TWVL * 14]), Ta);
+	       {
+		    V TL, TG, Tw, Tq, TC, Te;
+		    TL = VSUB(TE, TF);
+		    TG = VADD(TE, TF);
+		    Tw = VSUB(Tk, Tp);
+		    Tq = VADD(Tk, Tp);
+		    TC = VADD(Tb, Td);
+		    Te = VSUB(Tb, Td);
+		    {
+			 V TM, TD, Tv, Tf;
+			 TM = VSUB(TB, TC);
+			 TD = VADD(TB, TC);
+			 Tv = VSUB(T9, Te);
+			 Tf = VADD(T9, Te);
+			 {
+			      V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
+			      TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
+			      TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
+			      TH = VADD(TD, TG);
+			      TJ = VSUB(TD, TG);
+			      Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
+			      Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
+			      Tr = VADD(Tf, Tq);
+			      Tt = VSUB(Tf, Tq);
+			      ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
+			      TI = VFNMS(LDK(KP250000000), TH, TA);
+			      ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
+			      Ts = VFNMS(LDK(KP250000000), Tr, T4);
+			      {
+				   V TK, TO, Tu, Ty;
+				   TK = VFNMS(LDK(KP559016994), TJ, TI);
+				   TO = VFMA(LDK(KP559016994), TJ, TI);
+				   Tu = VFMA(LDK(KP559016994), Tt, Ts);
+				   Ty = VFNMS(LDK(KP559016994), Tt, Ts);
+				   ST(&(x[WS(rs, 8)]), VFNMSI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
+				   ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t2fv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t2fv_10 -include t2f.h */
+
+/*
+ * This function contains 51 FP additions, 30 FP multiplications,
+ * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V Tr, TH, Tg, Tl, Tm, TA, TB, TJ, T5, Ta, Tb, TD, TE, TI, To;
+	       V Tq, Tp;
+	       To = LD(&(x[0]), ms, &(x[0]));
+	       Tp = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Tq = BYTWJ(&(W[TWVL * 8]), Tp);
+	       Tr = VSUB(To, Tq);
+	       TH = VADD(To, Tq);
+	       {
+		    V Td, Tk, Tf, Ti;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 6]), Tc);
+			 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTWJ(&(W[0]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTWJ(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 10]), Th);
+		    }
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VADD(Tg, Tl);
+		    TA = VADD(Td, Tf);
+		    TB = VADD(Ti, Tk);
+		    TJ = VADD(TA, TB);
+	       }
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T2 = BYTWJ(&(W[TWVL * 2]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTWJ(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 14]), T6);
+		    }
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VADD(T5, Ta);
+		    TD = VADD(T2, T4);
+		    TE = VADD(T7, T9);
+		    TI = VADD(TD, TE);
+	       }
+	       {
+		    V Tn, Ts, Tt, Tx, Tz, Tv, Tw, Ty, Tu;
+		    Tn = VMUL(LDK(KP559016994), VSUB(Tb, Tm));
+		    Ts = VADD(Tb, Tm);
+		    Tt = VFNMS(LDK(KP250000000), Ts, Tr);
+		    Tv = VSUB(T5, Ta);
+		    Tw = VSUB(Tg, Tl);
+		    Tx = VBYI(VFMA(LDK(KP951056516), Tv, VMUL(LDK(KP587785252), Tw)));
+		    Tz = VBYI(VFNMS(LDK(KP587785252), Tv, VMUL(LDK(KP951056516), Tw)));
+		    ST(&(x[WS(rs, 5)]), VADD(Tr, Ts), ms, &(x[WS(rs, 1)]));
+		    Ty = VSUB(Tt, Tn);
+		    ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
+		    Tu = VADD(Tn, Tt);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tu, Tx), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VADD(Tx, Tu), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TM, TK, TL, TG, TO, TC, TF, TP, TN;
+		    TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
+		    TK = VADD(TI, TJ);
+		    TL = VFNMS(LDK(KP250000000), TK, TH);
+		    TC = VSUB(TA, TB);
+		    TF = VSUB(TD, TE);
+		    TG = VBYI(VFNMS(LDK(KP587785252), TF, VMUL(LDK(KP951056516), TC)));
+		    TO = VBYI(VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
+		    ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
+		    TP = VADD(TM, TL);
+		    ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VSUB(TP, TO), ms, &(x[0]));
+		    TN = VSUB(TL, TM);
+		    ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t2fv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:35 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2fv_16 -include t2f.h */
+
+/*
+ * This function contains 87 FP additions, 64 FP multiplications,
+ * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
+ * 61 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
+	       V Tv;
+	       {
+		    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
+		    V Tp;
+		    {
+			 V T1, T2, T5, T7;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 {
+			      V Tz, TG, TB, TE;
+			      Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TG = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TB = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TE = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      {
+				   V Ti, TY, TX, Td, Tg, Tm, Tt, To;
+				   {
+					V T3, T6, T8, TA, TH, TC, TF, Tb;
+					Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					T3 = BYTWJ(&(W[TWVL * 14]), T2);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					T8 = BYTWJ(&(W[TWVL * 22]), T7);
+					TA = BYTWJ(&(W[TWVL * 26]), Tz);
+					TH = BYTWJ(&(W[TWVL * 18]), TG);
+					TC = BYTWJ(&(W[TWVL * 10]), TB);
+					TF = BYTWJ(&(W[TWVL * 2]), TE);
+					Tc = BYTWJ(&(W[0]), Tb);
+					TW = VSUB(T1, T3);
+					T4 = VADD(T1, T3);
+					T19 = VSUB(T6, T8);
+					T9 = VADD(T6, T8);
+					Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					TD = VADD(TA, TC);
+					TY = VSUB(TA, TC);
+					TI = VADD(TF, TH);
+					TX = VSUB(TF, TH);
+				   }
+				   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+				   Tj = BYTWJ(&(W[TWVL * 24]), Ti);
+				   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+				   TZ = VADD(TX, TY);
+				   T1a = VSUB(TY, TX);
+				   Te = BYTWJ(&(W[TWVL * 16]), Td);
+				   Th = BYTWJ(&(W[TWVL * 8]), Tg);
+				   Tn = BYTWJ(&(W[TWVL * 28]), Tm);
+				   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   Tu = BYTWJ(&(W[TWVL * 20]), Tt);
+				   Tp = BYTWJ(&(W[TWVL * 12]), To);
+			      }
+			 }
+		    }
+		    {
+			 V Tf, T11, Tk, T12, Ts;
+			 TO = VADD(T4, T9);
+			 Ta = VSUB(T4, T9);
+			 TJ = VSUB(TD, TI);
+			 TP = VADD(TI, TD);
+			 Tf = VADD(Tc, Te);
+			 T11 = VSUB(Tc, Te);
+			 Tk = VADD(Th, Tj);
+			 T12 = VSUB(Th, Tj);
+			 Ts = BYTWJ(&(W[TWVL * 4]), Tr);
+			 T14 = VSUB(Tn, Tp);
+			 Tq = VADD(Tn, Tp);
+			 T1i = VFNMS(LDK(KP707106781), TZ, TW);
+			 T10 = VFMA(LDK(KP707106781), TZ, TW);
+			 T1b = VFNMS(LDK(KP707106781), T1a, T19);
+			 T1l = VFMA(LDK(KP707106781), T1a, T19);
+			 T13 = VFNMS(LDK(KP414213562), T12, T11);
+			 T1c = VFMA(LDK(KP414213562), T11, T12);
+			 TR = VADD(Tf, Tk);
+			 Tl = VSUB(Tf, Tk);
+			 T15 = VSUB(Tu, Ts);
+			 Tv = VADD(Ts, Tu);
+		    }
+	       }
+	       {
+		    V T1d, T16, TS, Tw, TU, TQ;
+		    T1d = VFMA(LDK(KP414213562), T14, T15);
+		    T16 = VFNMS(LDK(KP414213562), T15, T14);
+		    TS = VADD(Tq, Tv);
+		    Tw = VSUB(Tq, Tv);
+		    TU = VSUB(TO, TP);
+		    TQ = VADD(TO, TP);
+		    {
+			 V T1e, T1j, T17, T1m;
+			 T1e = VSUB(T1c, T1d);
+			 T1j = VADD(T1c, T1d);
+			 T17 = VADD(T13, T16);
+			 T1m = VSUB(T16, T13);
+			 {
+			      V TV, TT, TK, Tx;
+			      TV = VSUB(TS, TR);
+			      TT = VADD(TR, TS);
+			      TK = VSUB(Tw, Tl);
+			      Tx = VADD(Tl, Tw);
+			      {
+				   V T1h, T1f, T1o, T1k;
+				   T1h = VFMA(LDK(KP923879532), T1e, T1b);
+				   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
+				   T1o = VFMA(LDK(KP923879532), T1j, T1i);
+				   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
+				   {
+					V T1g, T18, T1p, T1n;
+					T1g = VFMA(LDK(KP923879532), T17, T10);
+					T18 = VFNMS(LDK(KP923879532), T17, T10);
+					T1p = VFMA(LDK(KP923879532), T1m, T1l);
+					T1n = VFNMS(LDK(KP923879532), T1m, T1l);
+					ST(&(x[WS(rs, 12)]), VFNMSI(TV, TU), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TV, TU), ms, &(x[0]));
+					ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 8)]), VSUB(TQ, TT), ms, &(x[0]));
+					{
+					     V TN, TL, TM, Ty;
+					     TN = VFMA(LDK(KP707106781), TK, TJ);
+					     TL = VFNMS(LDK(KP707106781), TK, TJ);
+					     TM = VFMA(LDK(KP707106781), Tx, Ta);
+					     Ty = VFNMS(LDK(KP707106781), Tx, Ta);
+					     ST(&(x[WS(rs, 1)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 15)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2fv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2fv_16 -include t2f.h */
+
+/*
+ * This function contains 87 FP additions, 42 FP multiplications,
+ * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
+ * 36 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V TJ, T10, TD, T11, T1b, T1c, Ty, TK, T16, T17, T18, Tb, TN, T13, T14;
+	       V T15, Tm, TM, TG, TI, TH;
+	       TG = LD(&(x[0]), ms, &(x[0]));
+	       TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+	       TI = BYTWJ(&(W[TWVL * 14]), TH);
+	       TJ = VSUB(TG, TI);
+	       T10 = VADD(TG, TI);
+	       {
+		    V TA, TC, Tz, TB;
+		    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TA = BYTWJ(&(W[TWVL * 6]), Tz);
+		    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+		    TC = BYTWJ(&(W[TWVL * 22]), TB);
+		    TD = VSUB(TA, TC);
+		    T11 = VADD(TA, TC);
+	       }
+	       {
+		    V Tp, Tw, Tr, Tu, Ts, Tx;
+		    {
+			 V To, Tv, Tq, Tt;
+			 To = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 26]), To);
+			 Tv = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tw = BYTWJ(&(W[TWVL * 18]), Tv);
+			 Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tr = BYTWJ(&(W[TWVL * 10]), Tq);
+			 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tu = BYTWJ(&(W[TWVL * 2]), Tt);
+		    }
+		    T1b = VADD(Tp, Tr);
+		    T1c = VADD(Tu, Tw);
+		    Ts = VSUB(Tp, Tr);
+		    Tx = VSUB(Tu, Tw);
+		    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
+		    TK = VMUL(LDK(KP707106781), VADD(Tx, Ts));
+	       }
+	       {
+		    V T2, T9, T4, T7, T5, Ta;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[TWVL * 28]), T1);
+			 T8 = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 20]), T8);
+			 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T4 = BYTWJ(&(W[TWVL * 12]), T3);
+			 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T7 = BYTWJ(&(W[TWVL * 4]), T6);
+		    }
+		    T16 = VADD(T2, T4);
+		    T17 = VADD(T7, T9);
+		    T18 = VSUB(T16, T17);
+		    T5 = VSUB(T2, T4);
+		    Ta = VSUB(T7, T9);
+		    Tb = VFNMS(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), T5));
+		    TN = VFMA(LDK(KP923879532), T5, VMUL(LDK(KP382683432), Ta));
+	       }
+	       {
+		    V Td, Tk, Tf, Ti, Tg, Tl;
+		    {
+			 V Tc, Tj, Te, Th;
+			 Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 Td = BYTWJ(&(W[0]), Tc);
+			 Tj = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 Tk = BYTWJ(&(W[TWVL * 24]), Tj);
+			 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tf = BYTWJ(&(W[TWVL * 16]), Te);
+			 Th = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Ti = BYTWJ(&(W[TWVL * 8]), Th);
+		    }
+		    T13 = VADD(Td, Tf);
+		    T14 = VADD(Ti, Tk);
+		    T15 = VSUB(T13, T14);
+		    Tg = VSUB(Td, Tf);
+		    Tl = VSUB(Ti, Tk);
+		    Tm = VFMA(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
+		    TM = VFNMS(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tg));
+	       }
+	       {
+		    V T1a, T1g, T1f, T1h;
+		    {
+			 V T12, T19, T1d, T1e;
+			 T12 = VSUB(T10, T11);
+			 T19 = VMUL(LDK(KP707106781), VADD(T15, T18));
+			 T1a = VADD(T12, T19);
+			 T1g = VSUB(T12, T19);
+			 T1d = VSUB(T1b, T1c);
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T15));
+			 T1f = VBYI(VADD(T1d, T1e));
+			 T1h = VBYI(VSUB(T1e, T1d));
+		    }
+		    ST(&(x[WS(rs, 14)]), VSUB(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 6)]), VADD(T1g, T1h), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(T1a, T1f), ms, &(x[0]));
+		    ST(&(x[WS(rs, 10)]), VSUB(T1g, T1h), ms, &(x[0]));
+	       }
+	       {
+		    V T1k, T1o, T1n, T1p;
+		    {
+			 V T1i, T1j, T1l, T1m;
+			 T1i = VADD(T10, T11);
+			 T1j = VADD(T1c, T1b);
+			 T1k = VADD(T1i, T1j);
+			 T1o = VSUB(T1i, T1j);
+			 T1l = VADD(T13, T14);
+			 T1m = VADD(T16, T17);
+			 T1n = VADD(T1l, T1m);
+			 T1p = VBYI(VSUB(T1m, T1l));
+		    }
+		    ST(&(x[WS(rs, 8)]), VSUB(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 4)]), VADD(T1o, T1p), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T1k, T1n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VSUB(T1o, T1p), ms, &(x[0]));
+	       }
+	       {
+		    V TF, TQ, TP, TR;
+		    {
+			 V Tn, TE, TL, TO;
+			 Tn = VSUB(Tb, Tm);
+			 TE = VSUB(Ty, TD);
+			 TF = VBYI(VSUB(Tn, TE));
+			 TQ = VBYI(VADD(TE, Tn));
+			 TL = VADD(TJ, TK);
+			 TO = VADD(TM, TN);
+			 TP = VSUB(TL, TO);
+			 TR = VADD(TL, TO);
+		    }
+		    ST(&(x[WS(rs, 7)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 15)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
+	       }
+	       {
+		    V TU, TY, TX, TZ;
+		    {
+			 V TS, TT, TV, TW;
+			 TS = VSUB(TJ, TK);
+			 TT = VADD(Tm, Tb);
+			 TU = VADD(TS, TT);
+			 TY = VSUB(TS, TT);
+			 TV = VADD(TD, Ty);
+			 TW = VSUB(TN, TM);
+			 TX = VBYI(VADD(TV, TW));
+			 TZ = VBYI(VSUB(TW, TV));
+		    }
+		    ST(&(x[WS(rs, 13)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2fv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:35 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t2fv_2 -include t2f.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T2, T3;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t2fv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_2) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 2 -name t2fv_2 -include t2f.h */
+
+/*
+ * This function contains 3 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 0 fused multiply/add),
+ * 5 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(2, rs)) {
+	       V T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T3 = BYTWJ(&(W[0]), T2);
+	       ST(&(x[WS(rs, 1)]), VSUB(T1, T3), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[0]), VADD(T1, T3), ms, &(x[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 2, XSIMD_STRING("t2fv_2"), twinstr, &GENUS, {3, 2, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_2) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:41 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2fv_20 -include t2f.h */
+
+/*
+ * This function contains 123 FP additions, 88 FP multiplications,
+ * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
+ * 68 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, Tx, T1m, T1K, T1y, Tk, Tf, T16, T10, TT, T1O, T1w, T1L, T1p, T1M;
+	       V T1s, TZ, TI, T1x, Tp;
+	       {
+		    V T1, Tv, T2, Tt;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T9, T1n, TN, T1v, TS, Te, T1q, T1u, TE, TG, Tm, T1o, TC, Tn, T1r;
+			 V TH, To;
+			 {
+			      V TP, TR, Ta, Tc;
+			      {
+				   V T5, T7, TJ, TL, T1k, T1l;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   {
+					V Tw, T3, Tu, T6, T8, TK, TM, TO, TQ;
+					TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					Tw = BYTWJ(&(W[TWVL * 28]), Tv);
+					T3 = BYTWJ(&(W[TWVL * 18]), T2);
+					Tu = BYTWJ(&(W[TWVL * 8]), Tt);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					T8 = BYTWJ(&(W[TWVL * 26]), T7);
+					TK = BYTWJ(&(W[TWVL * 24]), TJ);
+					TM = BYTWJ(&(W[TWVL * 4]), TL);
+					TP = BYTWJ(&(W[TWVL * 32]), TO);
+					TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					T4 = VSUB(T1, T3);
+					T1k = VADD(T1, T3);
+					Tx = VSUB(Tu, Tw);
+					T1l = VADD(Tu, Tw);
+					T9 = VSUB(T6, T8);
+					T1n = VADD(T6, T8);
+					TN = VSUB(TK, TM);
+					T1v = VADD(TK, TM);
+					TR = BYTWJ(&(W[TWVL * 12]), TQ);
+				   }
+				   Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1m = VSUB(T1k, T1l);
+				   T1K = VADD(T1k, T1l);
+				   Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      }
+			      {
+				   V Tb, TA, Td, Th, Tj, Tz, Tg, Ti, Ty;
+				   Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+				   Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+				   TS = VSUB(TP, TR);
+				   T1y = VADD(TP, TR);
+				   Tb = BYTWJ(&(W[TWVL * 30]), Ta);
+				   TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   Td = BYTWJ(&(W[TWVL * 10]), Tc);
+				   Th = BYTWJ(&(W[TWVL * 14]), Tg);
+				   Tj = BYTWJ(&(W[TWVL * 34]), Ti);
+				   Tz = BYTWJ(&(W[TWVL * 16]), Ty);
+				   {
+					V TD, TF, TB, Tl;
+					TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					TB = BYTWJ(&(W[TWVL * 36]), TA);
+					Te = VSUB(Tb, Td);
+					T1q = VADD(Tb, Td);
+					Tk = VSUB(Th, Tj);
+					T1u = VADD(Th, Tj);
+					TE = BYTWJ(&(W[0]), TD);
+					TG = BYTWJ(&(W[TWVL * 20]), TF);
+					Tm = BYTWJ(&(W[TWVL * 22]), Tl);
+					T1o = VADD(Tz, TB);
+					TC = VSUB(Tz, TB);
+					Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 Tf = VADD(T9, Te);
+			 T16 = VSUB(T9, Te);
+			 T10 = VSUB(TS, TN);
+			 TT = VADD(TN, TS);
+			 T1r = VADD(TE, TG);
+			 TH = VSUB(TE, TG);
+			 T1O = VADD(T1u, T1v);
+			 T1w = VSUB(T1u, T1v);
+			 To = BYTWJ(&(W[TWVL * 2]), Tn);
+			 T1L = VADD(T1n, T1o);
+			 T1p = VSUB(T1n, T1o);
+			 T1M = VADD(T1q, T1r);
+			 T1s = VSUB(T1q, T1r);
+			 TZ = VSUB(TH, TC);
+			 TI = VADD(TC, TH);
+			 T1x = VADD(Tm, To);
+			 Tp = VSUB(Tm, To);
+		    }
+	       }
+	       {
+		    V T1V, T1N, T14, T1d, T11, T1G, T1t, T1z, T1P, Tq, T17, T13, TV, TU;
+		    T1V = VSUB(T1L, T1M);
+		    T1N = VADD(T1L, T1M);
+		    T14 = VSUB(TT, TI);
+		    TU = VADD(TI, TT);
+		    T1d = VFNMS(LDK(KP618033988), TZ, T10);
+		    T11 = VFMA(LDK(KP618033988), T10, TZ);
+		    T1G = VSUB(T1p, T1s);
+		    T1t = VADD(T1p, T1s);
+		    T1z = VSUB(T1x, T1y);
+		    T1P = VADD(T1x, T1y);
+		    Tq = VADD(Tk, Tp);
+		    T17 = VSUB(Tk, Tp);
+		    T13 = VFNMS(LDK(KP250000000), TU, Tx);
+		    TV = VADD(Tx, TU);
+		    {
+			 V T1J, T1H, T1D, T1Z, T1X, T1T, T1h, T1j, T1b, T19, T1C, T1S, T1c, TY, T1F;
+			 V T1A;
+			 T1F = VSUB(T1w, T1z);
+			 T1A = VADD(T1w, T1z);
+			 {
+			      V T1W, T1Q, TX, Tr;
+			      T1W = VSUB(T1O, T1P);
+			      T1Q = VADD(T1O, T1P);
+			      TX = VSUB(Tf, Tq);
+			      Tr = VADD(Tf, Tq);
+			      {
+				   V T1g, T18, T1f, T15;
+				   T1g = VFNMS(LDK(KP618033988), T16, T17);
+				   T18 = VFMA(LDK(KP618033988), T17, T16);
+				   T1f = VFMA(LDK(KP559016994), T14, T13);
+				   T15 = VFNMS(LDK(KP559016994), T14, T13);
+				   T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
+				   T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
+				   {
+					V T1B, T1R, TW, Ts;
+					T1B = VADD(T1t, T1A);
+					T1D = VSUB(T1t, T1A);
+					T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
+					T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
+					T1R = VADD(T1N, T1Q);
+					T1T = VSUB(T1N, T1Q);
+					TW = VFNMS(LDK(KP250000000), Tr, T4);
+					Ts = VADD(T4, Tr);
+					T1h = VFNMS(LDK(KP951056516), T1g, T1f);
+					T1j = VFMA(LDK(KP951056516), T1g, T1f);
+					T1b = VFNMS(LDK(KP951056516), T18, T15);
+					T19 = VFMA(LDK(KP951056516), T18, T15);
+					ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
+					T1C = VFNMS(LDK(KP250000000), T1B, T1m);
+					ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
+					T1S = VFNMS(LDK(KP250000000), T1R, T1K);
+					T1c = VFNMS(LDK(KP559016994), TX, TW);
+					TY = VFMA(LDK(KP559016994), TX, TW);
+					ST(&(x[WS(rs, 15)]), VFMAI(TV, Ts), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 5)]), VFNMSI(TV, Ts), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+			 {
+			      V T1E, T1I, T1U, T1Y;
+			      T1E = VFNMS(LDK(KP559016994), T1D, T1C);
+			      T1I = VFMA(LDK(KP559016994), T1D, T1C);
+			      T1U = VFMA(LDK(KP559016994), T1T, T1S);
+			      T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
+			      {
+				   V T1e, T1i, T1a, T12;
+				   T1e = VFNMS(LDK(KP951056516), T1d, T1c);
+				   T1i = VFMA(LDK(KP951056516), T1d, T1c);
+				   T1a = VFNMS(LDK(KP951056516), T11, TY);
+				   T12 = VFMA(LDK(KP951056516), T11, TY);
+				   ST(&(x[WS(rs, 18)]), VFNMSI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 2)]), VFMAI(T1H, T1E), ms, &(x[0]));
+				   ST(&(x[WS(rs, 14)]), VFMAI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(T1J, T1I), ms, &(x[0]));
+				   ST(&(x[WS(rs, 16)]), VFNMSI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VFMAI(T1X, T1U), ms, &(x[0]));
+				   ST(&(x[WS(rs, 12)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 8)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(T1h, T1e), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 17)]), VFNMSI(T1h, T1e), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 13)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 11)]), VFMAI(T1b, T1a), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 9)]), VFNMSI(T1b, T1a), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 19)]), VFMAI(T19, T12), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(T19, T12), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t2fv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2fv_20 -include t2f.h */
+
+/*
+ * This function contains 123 FP additions, 62 FP multiplications,
+ * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
+ * 54 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T4, Tx, T1B, T1U, TZ, T16, T17, T10, Tf, Tq, Tr, T1N, T1O, T1S, T1t;
+	       V T1w, T1C, TI, TT, TU, T1K, T1L, T1R, T1m, T1p, T1D, Ts, TV;
+	       {
+		    V T1, Tw, T3, Tu, Tv, T2, Tt, T1z, T1A;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+		    Tw = BYTWJ(&(W[TWVL * 28]), Tv);
+		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 18]), T2);
+		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tu = BYTWJ(&(W[TWVL * 8]), Tt);
+		    T4 = VSUB(T1, T3);
+		    Tx = VSUB(Tu, Tw);
+		    T1z = VADD(T1, T3);
+		    T1A = VADD(Tu, Tw);
+		    T1B = VSUB(T1z, T1A);
+		    T1U = VADD(T1z, T1A);
+	       }
+	       {
+		    V T9, T1r, TN, T1l, TS, T1o, Te, T1u, Tk, T1k, TC, T1s, TH, T1v, Tp;
+		    V T1n;
+		    {
+			 V T6, T8, T5, T7;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 6]), T5);
+			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 26]), T7);
+			 T9 = VSUB(T6, T8);
+			 T1r = VADD(T6, T8);
+		    }
+		    {
+			 V TK, TM, TJ, TL;
+			 TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 TK = BYTWJ(&(W[TWVL * 24]), TJ);
+			 TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 TM = BYTWJ(&(W[TWVL * 4]), TL);
+			 TN = VSUB(TK, TM);
+			 T1l = VADD(TK, TM);
+		    }
+		    {
+			 V TP, TR, TO, TQ;
+			 TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TP = BYTWJ(&(W[TWVL * 32]), TO);
+			 TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TR = BYTWJ(&(W[TWVL * 12]), TQ);
+			 TS = VSUB(TP, TR);
+			 T1o = VADD(TP, TR);
+		    }
+		    {
+			 V Tb, Td, Ta, Tc;
+			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 30]), Ta);
+			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 10]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T1u = VADD(Tb, Td);
+		    }
+		    {
+			 V Th, Tj, Tg, Ti;
+			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 Th = BYTWJ(&(W[TWVL * 14]), Tg);
+			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tj = BYTWJ(&(W[TWVL * 34]), Ti);
+			 Tk = VSUB(Th, Tj);
+			 T1k = VADD(Th, Tj);
+		    }
+		    {
+			 V Tz, TB, Ty, TA;
+			 Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 Tz = BYTWJ(&(W[TWVL * 16]), Ty);
+			 TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTWJ(&(W[TWVL * 36]), TA);
+			 TC = VSUB(Tz, TB);
+			 T1s = VADD(Tz, TB);
+		    }
+		    {
+			 V TE, TG, TD, TF;
+			 TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TE = BYTWJ(&(W[0]), TD);
+			 TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 TG = BYTWJ(&(W[TWVL * 20]), TF);
+			 TH = VSUB(TE, TG);
+			 T1v = VADD(TE, TG);
+		    }
+		    {
+			 V Tm, To, Tl, Tn;
+			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Tm = BYTWJ(&(W[TWVL * 22]), Tl);
+			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 To = BYTWJ(&(W[TWVL * 2]), Tn);
+			 Tp = VSUB(Tm, To);
+			 T1n = VADD(Tm, To);
+		    }
+		    TZ = VSUB(TH, TC);
+		    T16 = VSUB(T9, Te);
+		    T17 = VSUB(Tk, Tp);
+		    T10 = VSUB(TS, TN);
+		    Tf = VADD(T9, Te);
+		    Tq = VADD(Tk, Tp);
+		    Tr = VADD(Tf, Tq);
+		    T1N = VADD(T1k, T1l);
+		    T1O = VADD(T1n, T1o);
+		    T1S = VADD(T1N, T1O);
+		    T1t = VSUB(T1r, T1s);
+		    T1w = VSUB(T1u, T1v);
+		    T1C = VADD(T1t, T1w);
+		    TI = VADD(TC, TH);
+		    TT = VADD(TN, TS);
+		    TU = VADD(TI, TT);
+		    T1K = VADD(T1r, T1s);
+		    T1L = VADD(T1u, T1v);
+		    T1R = VADD(T1K, T1L);
+		    T1m = VSUB(T1k, T1l);
+		    T1p = VSUB(T1n, T1o);
+		    T1D = VADD(T1m, T1p);
+	       }
+	       Ts = VADD(T4, Tr);
+	       TV = VBYI(VADD(Tx, TU));
+	       ST(&(x[WS(rs, 5)]), VSUB(Ts, TV), ms, &(x[WS(rs, 1)]));
+	       ST(&(x[WS(rs, 15)]), VADD(Ts, TV), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T1T, T1V, T1W, T1Q, T1Z, T1M, T1P, T1Y, T1X;
+		    T1T = VMUL(LDK(KP559016994), VSUB(T1R, T1S));
+		    T1V = VADD(T1R, T1S);
+		    T1W = VFNMS(LDK(KP250000000), T1V, T1U);
+		    T1M = VSUB(T1K, T1L);
+		    T1P = VSUB(T1N, T1O);
+		    T1Q = VBYI(VFMA(LDK(KP951056516), T1M, VMUL(LDK(KP587785252), T1P)));
+		    T1Z = VBYI(VFNMS(LDK(KP587785252), T1M, VMUL(LDK(KP951056516), T1P)));
+		    ST(&(x[0]), VADD(T1U, T1V), ms, &(x[0]));
+		    T1Y = VSUB(T1W, T1T);
+		    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
+		    T1X = VADD(T1T, T1W);
+		    ST(&(x[WS(rs, 4)]), VADD(T1Q, T1X), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T1X, T1Q), ms, &(x[0]));
+	       }
+	       {
+		    V T1G, T1E, T1F, T1y, T1J, T1q, T1x, T1I, T1H;
+		    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
+		    T1E = VADD(T1C, T1D);
+		    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
+		    T1q = VSUB(T1m, T1p);
+		    T1x = VSUB(T1t, T1w);
+		    T1y = VBYI(VFNMS(LDK(KP587785252), T1x, VMUL(LDK(KP951056516), T1q)));
+		    T1J = VBYI(VFMA(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
+		    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
+		    T1I = VADD(T1G, T1F);
+		    ST(&(x[WS(rs, 6)]), VSUB(T1I, T1J), ms, &(x[0]));
+		    ST(&(x[WS(rs, 14)]), VADD(T1J, T1I), ms, &(x[0]));
+		    T1H = VSUB(T1F, T1G);
+		    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
+		    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
+	       }
+	       {
+		    V T11, T18, T1g, T1d, T15, T1f, TY, T1c;
+		    T11 = VFMA(LDK(KP951056516), TZ, VMUL(LDK(KP587785252), T10));
+		    T18 = VFMA(LDK(KP951056516), T16, VMUL(LDK(KP587785252), T17));
+		    T1g = VFNMS(LDK(KP587785252), T16, VMUL(LDK(KP951056516), T17));
+		    T1d = VFNMS(LDK(KP587785252), TZ, VMUL(LDK(KP951056516), T10));
+		    {
+			 V T13, T14, TW, TX;
+			 T13 = VFMS(LDK(KP250000000), TU, Tx);
+			 T14 = VMUL(LDK(KP559016994), VSUB(TT, TI));
+			 T15 = VADD(T13, T14);
+			 T1f = VSUB(T14, T13);
+			 TW = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
+			 TX = VFNMS(LDK(KP250000000), Tr, T4);
+			 TY = VADD(TW, TX);
+			 T1c = VSUB(TX, TW);
+		    }
+		    {
+			 V T12, T19, T1i, T1j;
+			 T12 = VADD(TY, T11);
+			 T19 = VBYI(VSUB(T15, T18));
+			 ST(&(x[WS(rs, 19)]), VSUB(T12, T19), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T12, T19), ms, &(x[WS(rs, 1)]));
+			 T1i = VADD(T1c, T1d);
+			 T1j = VBYI(VADD(T1g, T1f));
+			 ST(&(x[WS(rs, 13)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1a, T1b, T1e, T1h;
+			 T1a = VSUB(TY, T11);
+			 T1b = VBYI(VADD(T18, T15));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1a, T1b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(T1a, T1b), ms, &(x[WS(rs, 1)]));
+			 T1e = VSUB(T1c, T1d);
+			 T1h = VBYI(VSUB(T1f, T1g));
+			 ST(&(x[WS(rs, 17)]), VSUB(T1e, T1h), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T1e, T1h), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t2fv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,932 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:42 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2fv_25 -include t2f.h */
+
+/*
+ * This function contains 248 FP additions, 241 FP multiplications,
+ * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
+ * 208 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T25, T1B, T2y, T1K, T2s, T23, T1S, T26, T20, T1X;
+	       {
+		    V T1O, T2X, Te, T3L, Td, T3Q, T3j, T3b, T2R, T2M, T2f, T27, T1y, T1H, T3M;
+		    V TW, TR, TK, T2B, T3n, T3e, T2U, T2F, T2i, T2a, Tz, T1C, T3N, TQ, T11;
+		    V T1b, T1c, T16;
+		    {
+			 V T1, T1g, T1i, T1p, T1k, T1m, Tb, T1N, T6, T1M;
+			 {
+			      V T7, T9, T2, T4, T1f, T1h, T1o;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      {
+				   V T8, Ta, T3, T5, T1j;
+				   T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+				   T8 = BYTWJ(&(W[TWVL * 18]), T7);
+				   Ta = BYTWJ(&(W[TWVL * 28]), T9);
+				   T3 = BYTWJ(&(W[TWVL * 8]), T2);
+				   T5 = BYTWJ(&(W[TWVL * 38]), T4);
+				   T1g = BYTWJ(&(W[TWVL * 4]), T1f);
+				   T1i = BYTWJ(&(W[TWVL * 14]), T1h);
+				   T1p = BYTWJ(&(W[TWVL * 34]), T1o);
+				   T1k = BYTWJ(&(W[TWVL * 44]), T1j);
+				   T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   Tb = VADD(T8, Ta);
+				   T1N = VSUB(T8, Ta);
+				   T6 = VADD(T3, T5);
+				   T1M = VSUB(T3, T5);
+			      }
+			 }
+			 {
+			      V T1v, T1l, Th, Tj, T1w, T1q, Tq, Tk, Tn, Tg;
+			      Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V Tc, Ti, T1n, Tp;
+				   Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   T1v = VSUB(T1i, T1k);
+				   T1l = VADD(T1i, T1k);
+				   T1n = BYTWJ(&(W[TWVL * 24]), T1m);
+				   Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T1O = VFMA(LDK(KP618033988), T1N, T1M);
+				   T2X = VFNMS(LDK(KP618033988), T1M, T1N);
+				   Te = VSUB(T6, Tb);
+				   Tc = VADD(T6, Tb);
+				   Th = BYTWJ(&(W[0]), Tg);
+				   Tj = BYTWJ(&(W[TWVL * 10]), Ti);
+				   T1w = VSUB(T1n, T1p);
+				   T1q = VADD(T1n, T1p);
+				   Tq = BYTWJ(&(W[TWVL * 30]), Tp);
+				   Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T3L = VADD(T1, Tc);
+				   Td = VFNMS(LDK(KP250000000), Tc, T1);
+				   Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      }
+			      {
+				   V T1x, T2K, TM, TB, Tw, Tm, Tx, Tr, TI, T2L, T1u, TD, TF, TL;
+				   TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   {
+					V T1t, Tl, To, TH, T1s, T1r, TA, TC;
+					TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					T1r = VADD(T1l, T1q);
+					T1t = VSUB(T1q, T1l);
+					T1x = VFMA(LDK(KP618033988), T1w, T1v);
+					T2K = VFNMS(LDK(KP618033988), T1v, T1w);
+					Tl = BYTWJ(&(W[TWVL * 40]), Tk);
+					To = BYTWJ(&(W[TWVL * 20]), Tn);
+					TM = BYTWJ(&(W[TWVL * 6]), TL);
+					TB = BYTWJ(&(W[TWVL * 46]), TA);
+					TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T1s = VFNMS(LDK(KP250000000), T1r, T1g);
+					T3Q = VADD(T1g, T1r);
+					TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					Tw = VSUB(Tj, Tl);
+					Tm = VADD(Tj, Tl);
+					Tx = VSUB(Tq, To);
+					Tr = VADD(To, Tq);
+					TI = BYTWJ(&(W[TWVL * 26]), TH);
+					T2L = VFMA(LDK(KP559016994), T1t, T1s);
+					T1u = VFNMS(LDK(KP559016994), T1t, T1s);
+					TD = BYTWJ(&(W[TWVL * 16]), TC);
+					TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tu, Ty, T2E, TE, TN, TG, Tt, TV, Ts;
+					TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					Ts = VADD(Tm, Tr);
+					Tu = VSUB(Tm, Tr);
+					Ty = VFNMS(LDK(KP618033988), Tx, Tw);
+					T2E = VFMA(LDK(KP618033988), Tw, Tx);
+					T3j = VFNMS(LDK(KP059835404), T2K, T2L);
+					T3b = VFMA(LDK(KP066152395), T2L, T2K);
+					T2R = VFNMS(LDK(KP786782374), T2K, T2L);
+					T2M = VFMA(LDK(KP869845200), T2L, T2K);
+					T2f = VFMA(LDK(KP132830569), T1u, T1x);
+					T27 = VFNMS(LDK(KP120146378), T1x, T1u);
+					T1y = VFNMS(LDK(KP893101515), T1x, T1u);
+					T1H = VFMA(LDK(KP987388751), T1u, T1x);
+					TE = VSUB(TB, TD);
+					TN = VADD(TD, TB);
+					TG = BYTWJ(&(W[TWVL * 36]), TF);
+					Tt = VFNMS(LDK(KP250000000), Ts, Th);
+					T3M = VADD(Th, Ts);
+					TW = BYTWJ(&(W[TWVL * 2]), TV);
+					{
+					     V TJ, TO, Tv, T2D, TY, T15, T10, T13, TP;
+					     {
+						  V TX, T14, TZ, T12;
+						  TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+						  TJ = VSUB(TG, TI);
+						  TO = VADD(TI, TG);
+						  Tv = VFMA(LDK(KP559016994), Tu, Tt);
+						  T2D = VFNMS(LDK(KP559016994), Tu, Tt);
+						  TY = BYTWJ(&(W[TWVL * 12]), TX);
+						  T15 = BYTWJ(&(W[TWVL * 32]), T14);
+						  T10 = BYTWJ(&(W[TWVL * 42]), TZ);
+						  T13 = BYTWJ(&(W[TWVL * 22]), T12);
+					     }
+					     TP = VADD(TN, TO);
+					     TR = VSUB(TN, TO);
+					     TK = VFMA(LDK(KP618033988), TJ, TE);
+					     T2B = VFNMS(LDK(KP618033988), TE, TJ);
+					     T3n = VFMA(LDK(KP578046249), T2D, T2E);
+					     T3e = VFNMS(LDK(KP522847744), T2E, T2D);
+					     T2U = VFNMS(LDK(KP987388751), T2D, T2E);
+					     T2F = VFMA(LDK(KP893101515), T2E, T2D);
+					     T2i = VFNMS(LDK(KP603558818), Ty, Tv);
+					     T2a = VFMA(LDK(KP667278218), Tv, Ty);
+					     Tz = VFNMS(LDK(KP244189809), Ty, Tv);
+					     T1C = VFMA(LDK(KP269969613), Tv, Ty);
+					     T3N = VADD(TM, TP);
+					     TQ = VFMS(LDK(KP250000000), TP, TM);
+					     T11 = VADD(TY, T10);
+					     T1b = VSUB(TY, T10);
+					     T1c = VSUB(T15, T13);
+					     T16 = VADD(T13, T15);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2z, Tf, T3W, T3O, T1d, T2H, T3m, T2j, T2b, TT, T1D, T2G, T35, T2V, T2Z;
+			 V T3A, T3g, T2I, T1a, T3R, T3X;
+			 T2z = VFNMS(LDK(KP559016994), Te, Td);
+			 Tf = VFMA(LDK(KP559016994), Te, Td);
+			 {
+			      V TS, T2A, T17, T19;
+			      TS = VFNMS(LDK(KP559016994), TR, TQ);
+			      T2A = VFMA(LDK(KP559016994), TR, TQ);
+			      T3W = VSUB(T3M, T3N);
+			      T3O = VADD(T3M, T3N);
+			      T1d = VFNMS(LDK(KP618033988), T1c, T1b);
+			      T2H = VFMA(LDK(KP618033988), T1b, T1c);
+			      T17 = VADD(T11, T16);
+			      T19 = VSUB(T16, T11);
+			      {
+				   V T3f, T2T, T2C, T18, T3P;
+				   T3m = VFMA(LDK(KP447533225), T2B, T2A);
+				   T3f = VFNMS(LDK(KP494780565), T2A, T2B);
+				   T2T = VFNMS(LDK(KP132830569), T2A, T2B);
+				   T2C = VFMA(LDK(KP120146378), T2B, T2A);
+				   T2j = VFNMS(LDK(KP786782374), TK, TS);
+				   T2b = VFMA(LDK(KP869845200), TS, TK);
+				   TT = VFNMS(LDK(KP667278218), TS, TK);
+				   T1D = VFMA(LDK(KP603558818), TK, TS);
+				   T18 = VFNMS(LDK(KP250000000), T17, TW);
+				   T3P = VADD(TW, T17);
+				   T2G = VFMA(LDK(KP734762448), T2F, T2C);
+				   T35 = VFNMS(LDK(KP734762448), T2F, T2C);
+				   T2V = VFNMS(LDK(KP734762448), T2U, T2T);
+				   T2Z = VFMA(LDK(KP734762448), T2U, T2T);
+				   T3A = VFMA(LDK(KP982009705), T3f, T3e);
+				   T3g = VFNMS(LDK(KP982009705), T3f, T3e);
+				   T2I = VFMA(LDK(KP559016994), T19, T18);
+				   T1a = VFNMS(LDK(KP559016994), T19, T18);
+				   T3R = VADD(T3P, T3Q);
+				   T3X = VSUB(T3P, T3Q);
+			      }
+			 }
+			 {
+			      V T2n, T2t, T1V, T22, T2l, T2d, T1Q, T1I, T2w, T1A, T1F, T2q;
+			      {
+				   V T2k, T1G, T28, T2g, T3K, T3E, T3a, T34, T3x, T3H, T2c, TU, T1T, T1U, T1z;
+				   V T3o, T3t;
+				   T2n = VFNMS(LDK(KP912575812), T2j, T2i);
+				   T2k = VFMA(LDK(KP912575812), T2j, T2i);
+				   T3o = VFNMS(LDK(KP921078979), T3n, T3m);
+				   T3t = VFMA(LDK(KP921078979), T3n, T3m);
+				   {
+					V T3c, T2Q, T2J, T3k, T1e;
+					T3c = VFNMS(LDK(KP667278218), T2I, T2H);
+					T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
+					T2J = VFMA(LDK(KP066152395), T2I, T2H);
+					T3k = VFMA(LDK(KP603558818), T2H, T2I);
+					T1G = VFMA(LDK(KP578046249), T1a, T1d);
+					T1e = VFNMS(LDK(KP522847744), T1d, T1a);
+					T28 = VFNMS(LDK(KP494780565), T1a, T1d);
+					T2g = VFMA(LDK(KP447533225), T1d, T1a);
+					{
+					     V T3U, T3S, T40, T3Y;
+					     T3U = VSUB(T3O, T3R);
+					     T3S = VADD(T3O, T3R);
+					     T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
+					     T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
+					     {
+						  V T3s, T3l, T2N, T36;
+						  T3s = VFNMS(LDK(KP845997307), T3k, T3j);
+						  T3l = VFMA(LDK(KP845997307), T3k, T3j);
+						  T2N = VFNMS(LDK(KP772036680), T2M, T2J);
+						  T36 = VFMA(LDK(KP772036680), T2M, T2J);
+						  {
+						       V T30, T2S, T3d, T3z, T3T;
+						       T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
+						       T2S = VFMA(LDK(KP772036680), T2R, T2Q);
+						       T3d = VFNMS(LDK(KP845997307), T3c, T3b);
+						       T3z = VFMA(LDK(KP845997307), T3c, T3b);
+						       ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
+						       T3T = VFNMS(LDK(KP250000000), T3S, T3L);
+						       {
+							    V T3C, T3p, T2O, T37;
+							    T3C = VFMA(LDK(KP906616052), T3o, T3l);
+							    T3p = VFNMS(LDK(KP906616052), T3o, T3l);
+							    T2O = VFMA(LDK(KP956723877), T2N, T2G);
+							    T37 = VFMA(LDK(KP522616830), T2V, T36);
+							    {
+								 V T31, T2W, T3u, T3h;
+								 T31 = VFNMS(LDK(KP522616830), T2G, T30);
+								 T2W = VFMA(LDK(KP945422727), T2V, T2S);
+								 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
+								 T3h = VFMA(LDK(KP923225144), T3g, T3d);
+								 {
+								      V T3I, T3B, T3V, T3Z;
+								      T3I = VFNMS(LDK(KP669429328), T3z, T3A);
+								      T3B = VFMA(LDK(KP570584518), T3A, T3z);
+								      T3V = VFMA(LDK(KP559016994), T3U, T3T);
+								      T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
+								      {
+									   V T3y, T3q, T2P, T38;
+									   T3y = VFMA(LDK(KP262346850), T3p, T2X);
+									   T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
+									   T2P = VFMA(LDK(KP992114701), T2O, T2z);
+									   T38 = VFNMS(LDK(KP690983005), T37, T2S);
+									   {
+										V T32, T2Y, T3v, T3F;
+										T32 = VFMA(LDK(KP763932022), T31, T2N);
+										T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
+										T3v = VFNMS(LDK(KP997675361), T3u, T3t);
+										T3F = VFNMS(LDK(KP904508497), T3u, T3s);
+										{
+										     V T3i, T3r, T3J, T3D;
+										     T3i = VFMA(LDK(KP949179823), T3h, T2z);
+										     T3r = VFNMS(LDK(KP237294955), T3h, T2z);
+										     T3J = VFNMS(LDK(KP669429328), T3C, T3I);
+										     T3D = VFMA(LDK(KP618033988), T3C, T3B);
+										     ST(&(x[WS(rs, 20)]), VFMAI(T3Y, T3V), ms, &(x[0]));
+										     ST(&(x[WS(rs, 5)]), VFNMSI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 15)]), VFNMSI(T40, T3Z), ms, &(x[WS(rs, 1)]));
+										     ST(&(x[WS(rs, 10)]), VFMAI(T40, T3Z), ms, &(x[0]));
+										     {
+											  V T39, T33, T3w, T3G;
+											  T39 = VFMA(LDK(KP855719849), T38, T35);
+											  T33 = VFNMS(LDK(KP855719849), T32, T2Z);
+											  ST(&(x[WS(rs, 22)]), VFMAI(T2Y, T2P), ms, &(x[0]));
+											  ST(&(x[WS(rs, 3)]), VFNMSI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
+											  T3w = VFMA(LDK(KP560319534), T3v, T3s);
+											  T3G = VFNMS(LDK(KP681693190), T3F, T3t);
+											  ST(&(x[WS(rs, 23)]), VFMAI(T3q, T3i), ms, &(x[WS(rs, 1)]));
+											  ST(&(x[WS(rs, 2)]), VFNMSI(T3q, T3i), ms, &(x[0]));
+											  T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
+											  T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
+											  T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
+											  T34 = VFMA(LDK(KP897376177), T33, T2z);
+											  T3x = VFNMS(LDK(KP949179823), T3w, T3r);
+											  T3H = VFNMS(LDK(KP860541664), T3G, T3r);
+											  T2t = VFNMS(LDK(KP912575812), T2b, T2a);
+											  T2c = VFMA(LDK(KP912575812), T2b, T2a);
+											  TU = VFMA(LDK(KP829049696), TT, Tz);
+											  T1T = VFNMS(LDK(KP829049696), TT, Tz);
+											  T1U = VFNMS(LDK(KP831864738), T1y, T1e);
+											  T1z = VFMA(LDK(KP831864738), T1y, T1e);
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					V T2o, T2h, T29, T2u, T2v, T2p;
+					T2o = VFNMS(LDK(KP958953096), T2g, T2f);
+					T2h = VFMA(LDK(KP958953096), T2g, T2f);
+					ST(&(x[WS(rs, 17)]), VFMAI(T3a, T34), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 8)]), VFNMSI(T3a, T34), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFMAI(T3E, T3x), ms, &(x[0]));
+					ST(&(x[WS(rs, 13)]), VFNMSI(T3E, T3x), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 18)]), VFNMSI(T3K, T3H), ms, &(x[0]));
+					ST(&(x[WS(rs, 7)]), VFMAI(T3K, T3H), ms, &(x[WS(rs, 1)]));
+					T1V = VFMA(LDK(KP559154169), T1U, T1T);
+					T22 = VFNMS(LDK(KP683113946), T1T, T1U);
+					T29 = VFNMS(LDK(KP867381224), T28, T27);
+					T2u = VFMA(LDK(KP867381224), T28, T27);
+					T2l = VFMA(LDK(KP894834959), T2k, T2h);
+					T2v = VFMA(LDK(KP447417479), T2k, T2u);
+					T2d = VFNMS(LDK(KP809385824), T2c, T29);
+					T2p = VFMA(LDK(KP447417479), T2c, T2o);
+					T1Q = VFMA(LDK(KP831864738), T1H, T1G);
+					T1I = VFNMS(LDK(KP831864738), T1H, T1G);
+					T2w = VFNMS(LDK(KP763932022), T2v, T2h);
+					T1A = VFMA(LDK(KP904730450), T1z, TU);
+					T1F = VFNMS(LDK(KP904730450), T1z, TU);
+					T2q = VFMA(LDK(KP690983005), T2p, T29);
+				   }
+			      }
+			      {
+				   V T2e, T1E, T1P, T2m;
+				   T2e = VFNMS(LDK(KP992114701), T2d, Tf);
+				   T1E = VFMA(LDK(KP916574801), T1D, T1C);
+				   T1P = VFNMS(LDK(KP916574801), T1D, T1C);
+				   T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
+				   {
+					V T1J, T2r, T1R, T1W, T1Z, T2x;
+					T2x = VFNMS(LDK(KP999544308), T2w, T2t);
+					T1J = VFNMS(LDK(KP904730450), T1I, T1F);
+					T25 = VFMA(LDK(KP968583161), T1A, Tf);
+					T1B = VFNMS(LDK(KP242145790), T1A, Tf);
+					T2r = VFNMS(LDK(KP999544308), T2q, T2n);
+					T1R = VFMA(LDK(KP904730450), T1Q, T1P);
+					T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
+					T1Z = VADD(T1E, T1F);
+					ST(&(x[WS(rs, 21)]), VFNMSI(T2m, T2e), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 4)]), VFMAI(T2m, T2e), ms, &(x[0]));
+					T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
+					T1K = VFNMS(LDK(KP618033988), T1J, T1E);
+					T2s = VFNMS(LDK(KP803003575), T2r, Tf);
+					T23 = VFMA(LDK(KP617882369), T1W, T22);
+					T1S = VFNMS(LDK(KP242145790), T1R, T1O);
+					T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
+					T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
+					T1X = VFMA(LDK(KP559016994), T1W, T1V);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1L, T24, T21, T1Y;
+		    T1L = VFNMS(LDK(KP876091699), T1K, T1B);
+		    ST(&(x[WS(rs, 9)]), VFMAI(T2y, T2s), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VFNMSI(T2y, T2s), ms, &(x[0]));
+		    T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
+		    ST(&(x[WS(rs, 24)]), VFMAI(T26, T25), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T26, T25), ms, &(x[WS(rs, 1)]));
+		    T21 = VFMA(LDK(KP792626838), T20, T1B);
+		    T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T24, T21), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFMAI(T24, T21), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFMAI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFNMSI(T1Y, T1L), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t2fv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2fv_25 -include t2f.h */
+
+/*
+ * This function contains 248 FP additions, 188 FP multiplications,
+ * (or, 170 additions, 110 multiplications, 78 fused multiply/add),
+ * 99 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V Tc, Tb, Td, Te, T1C, T2t, T1E, T1x, T2m, T1u, T3c, T2n, Ty, T2i, Tv;
+	       V T38, T2j, TS, T2f, TP, T39, T2g, T1d, T2p, T1a, T3b, T2q;
+	       {
+		    V T7, T9, Ta, T2, T4, T5, T1D;
+		    Tc = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T6, T8, T1, T3;
+			 T6 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 18]), T6);
+			 T8 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 28]), T8);
+			 Ta = VADD(T7, T9);
+			 T1 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[TWVL * 8]), T1);
+			 T3 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T4 = BYTWJ(&(W[TWVL * 38]), T3);
+			 T5 = VADD(T2, T4);
+		    }
+		    Tb = VMUL(LDK(KP559016994), VSUB(T5, Ta));
+		    Td = VADD(T5, Ta);
+		    Te = VFNMS(LDK(KP250000000), Td, Tc);
+		    T1C = VSUB(T2, T4);
+		    T1D = VSUB(T7, T9);
+		    T2t = VMUL(LDK(KP951056516), T1D);
+		    T1E = VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D));
+	       }
+	       {
+		    V T1r, T1l, T1n, T1o, T1g, T1i, T1j, T1q;
+		    T1q = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    T1r = BYTWJ(&(W[TWVL * 4]), T1q);
+		    {
+			 V T1k, T1m, T1f, T1h;
+			 T1k = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1l = BYTWJ(&(W[TWVL * 24]), T1k);
+			 T1m = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1n = BYTWJ(&(W[TWVL * 34]), T1m);
+			 T1o = VADD(T1l, T1n);
+			 T1f = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T1g = BYTWJ(&(W[TWVL * 14]), T1f);
+			 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1i = BYTWJ(&(W[TWVL * 44]), T1h);
+			 T1j = VADD(T1g, T1i);
+		    }
+		    {
+			 V T1v, T1w, T1p, T1s, T1t;
+			 T1v = VSUB(T1g, T1i);
+			 T1w = VSUB(T1l, T1n);
+			 T1x = VFMA(LDK(KP475528258), T1v, VMUL(LDK(KP293892626), T1w));
+			 T2m = VFNMS(LDK(KP293892626), T1v, VMUL(LDK(KP475528258), T1w));
+			 T1p = VMUL(LDK(KP559016994), VSUB(T1j, T1o));
+			 T1s = VADD(T1j, T1o);
+			 T1t = VFNMS(LDK(KP250000000), T1s, T1r);
+			 T1u = VADD(T1p, T1t);
+			 T3c = VADD(T1r, T1s);
+			 T2n = VSUB(T1t, T1p);
+		    }
+	       }
+	       {
+		    V Ts, Tm, To, Tp, Th, Tj, Tk, Tr;
+		    Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Ts = BYTWJ(&(W[0]), Tr);
+		    {
+			 V Tl, Tn, Tg, Ti;
+			 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 Tm = BYTWJ(&(W[TWVL * 20]), Tl);
+			 Tn = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 To = BYTWJ(&(W[TWVL * 30]), Tn);
+			 Tp = VADD(Tm, To);
+			 Tg = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Th = BYTWJ(&(W[TWVL * 10]), Tg);
+			 Ti = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 Tj = BYTWJ(&(W[TWVL * 40]), Ti);
+			 Tk = VADD(Th, Tj);
+		    }
+		    {
+			 V Tw, Tx, Tq, Tt, Tu;
+			 Tw = VSUB(Th, Tj);
+			 Tx = VSUB(Tm, To);
+			 Ty = VFMA(LDK(KP475528258), Tw, VMUL(LDK(KP293892626), Tx));
+			 T2i = VFNMS(LDK(KP293892626), Tw, VMUL(LDK(KP475528258), Tx));
+			 Tq = VMUL(LDK(KP559016994), VSUB(Tk, Tp));
+			 Tt = VADD(Tk, Tp);
+			 Tu = VFNMS(LDK(KP250000000), Tt, Ts);
+			 Tv = VADD(Tq, Tu);
+			 T38 = VADD(Ts, Tt);
+			 T2j = VSUB(Tu, Tq);
+		    }
+	       }
+	       {
+		    V TM, TG, TI, TJ, TB, TD, TE, TL;
+		    TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    TM = BYTWJ(&(W[TWVL * 6]), TL);
+		    {
+			 V TF, TH, TA, TC;
+			 TF = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TG = BYTWJ(&(W[TWVL * 26]), TF);
+			 TH = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 TI = BYTWJ(&(W[TWVL * 36]), TH);
+			 TJ = VADD(TG, TI);
+			 TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TB = BYTWJ(&(W[TWVL * 16]), TA);
+			 TC = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 TD = BYTWJ(&(W[TWVL * 46]), TC);
+			 TE = VADD(TB, TD);
+		    }
+		    {
+			 V TQ, TR, TK, TN, TO;
+			 TQ = VSUB(TB, TD);
+			 TR = VSUB(TG, TI);
+			 TS = VFMA(LDK(KP475528258), TQ, VMUL(LDK(KP293892626), TR));
+			 T2f = VFNMS(LDK(KP293892626), TQ, VMUL(LDK(KP475528258), TR));
+			 TK = VMUL(LDK(KP559016994), VSUB(TE, TJ));
+			 TN = VADD(TE, TJ);
+			 TO = VFNMS(LDK(KP250000000), TN, TM);
+			 TP = VADD(TK, TO);
+			 T39 = VADD(TM, TN);
+			 T2g = VSUB(TO, TK);
+		    }
+	       }
+	       {
+		    V T17, T11, T13, T14, TW, TY, TZ, T16;
+		    T16 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T17 = BYTWJ(&(W[TWVL * 2]), T16);
+		    {
+			 V T10, T12, TV, TX;
+			 T10 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 T11 = BYTWJ(&(W[TWVL * 22]), T10);
+			 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T13 = BYTWJ(&(W[TWVL * 32]), T12);
+			 T14 = VADD(T11, T13);
+			 TV = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 TW = BYTWJ(&(W[TWVL * 12]), TV);
+			 TX = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TY = BYTWJ(&(W[TWVL * 42]), TX);
+			 TZ = VADD(TW, TY);
+		    }
+		    {
+			 V T1b, T1c, T15, T18, T19;
+			 T1b = VSUB(TW, TY);
+			 T1c = VSUB(T11, T13);
+			 T1d = VFMA(LDK(KP475528258), T1b, VMUL(LDK(KP293892626), T1c));
+			 T2p = VFNMS(LDK(KP293892626), T1b, VMUL(LDK(KP475528258), T1c));
+			 T15 = VMUL(LDK(KP559016994), VSUB(TZ, T14));
+			 T18 = VADD(TZ, T14);
+			 T19 = VFNMS(LDK(KP250000000), T18, T17);
+			 T1a = VADD(T15, T19);
+			 T3b = VADD(T17, T18);
+			 T2q = VSUB(T19, T15);
+		    }
+	       }
+	       {
+		    V T3l, T3m, T3f, T3g, T3e, T3h, T3n, T3i;
+		    {
+			 V T3j, T3k, T3a, T3d;
+			 T3j = VSUB(T38, T39);
+			 T3k = VSUB(T3b, T3c);
+			 T3l = VBYI(VFMA(LDK(KP951056516), T3j, VMUL(LDK(KP587785252), T3k)));
+			 T3m = VBYI(VFNMS(LDK(KP587785252), T3j, VMUL(LDK(KP951056516), T3k)));
+			 T3f = VADD(Tc, Td);
+			 T3a = VADD(T38, T39);
+			 T3d = VADD(T3b, T3c);
+			 T3g = VADD(T3a, T3d);
+			 T3e = VMUL(LDK(KP559016994), VSUB(T3a, T3d));
+			 T3h = VFNMS(LDK(KP250000000), T3g, T3f);
+		    }
+		    ST(&(x[0]), VADD(T3f, T3g), ms, &(x[0]));
+		    T3n = VSUB(T3h, T3e);
+		    ST(&(x[WS(rs, 10)]), VADD(T3m, T3n), ms, &(x[0]));
+		    ST(&(x[WS(rs, 15)]), VSUB(T3n, T3m), ms, &(x[WS(rs, 1)]));
+		    T3i = VADD(T3e, T3h);
+		    ST(&(x[WS(rs, 5)]), VSUB(T3i, T3l), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 20)]), VADD(T3l, T3i), ms, &(x[0]));
+	       }
+	       {
+		    V Tf, T1Z, T20, T21, T29, T2a, T2b, T26, T27, T28, T22, T23, T24, T1L, T1U;
+		    V T1Q, T1S, T1A, T1V, T1N, T1O, T2d, T2e;
+		    Tf = VADD(Tb, Te);
+		    T1Z = VFMA(LDK(KP1_688655851), Ty, VMUL(LDK(KP535826794), Tv));
+		    T20 = VFMA(LDK(KP1_541026485), TS, VMUL(LDK(KP637423989), TP));
+		    T21 = VSUB(T1Z, T20);
+		    T29 = VFMA(LDK(KP851558583), T1d, VMUL(LDK(KP904827052), T1a));
+		    T2a = VFMA(LDK(KP1_984229402), T1x, VMUL(LDK(KP125333233), T1u));
+		    T2b = VADD(T29, T2a);
+		    T26 = VFNMS(LDK(KP844327925), Tv, VMUL(LDK(KP1_071653589), Ty));
+		    T27 = VFNMS(LDK(KP1_274847979), TS, VMUL(LDK(KP770513242), TP));
+		    T28 = VADD(T26, T27);
+		    T22 = VFNMS(LDK(KP425779291), T1a, VMUL(LDK(KP1_809654104), T1d));
+		    T23 = VFNMS(LDK(KP992114701), T1u, VMUL(LDK(KP250666467), T1x));
+		    T24 = VADD(T22, T23);
+		    {
+			 V T1F, T1G, T1H, T1I, T1J, T1K;
+			 T1F = VFMA(LDK(KP1_937166322), Ty, VMUL(LDK(KP248689887), Tv));
+			 T1G = VFMA(LDK(KP1_071653589), TS, VMUL(LDK(KP844327925), TP));
+			 T1H = VADD(T1F, T1G);
+			 T1I = VFMA(LDK(KP1_752613360), T1d, VMUL(LDK(KP481753674), T1a));
+			 T1J = VFMA(LDK(KP1_457937254), T1x, VMUL(LDK(KP684547105), T1u));
+			 T1K = VADD(T1I, T1J);
+			 T1L = VADD(T1H, T1K);
+			 T1U = VSUB(T1J, T1I);
+			 T1Q = VMUL(LDK(KP559016994), VSUB(T1K, T1H));
+			 T1S = VSUB(T1G, T1F);
+		    }
+		    {
+			 V Tz, TT, TU, T1e, T1y, T1z;
+			 Tz = VFNMS(LDK(KP497379774), Ty, VMUL(LDK(KP968583161), Tv));
+			 TT = VFNMS(LDK(KP1_688655851), TS, VMUL(LDK(KP535826794), TP));
+			 TU = VADD(Tz, TT);
+			 T1e = VFNMS(LDK(KP963507348), T1d, VMUL(LDK(KP876306680), T1a));
+			 T1y = VFNMS(LDK(KP1_369094211), T1x, VMUL(LDK(KP728968627), T1u));
+			 T1z = VADD(T1e, T1y);
+			 T1A = VADD(TU, T1z);
+			 T1V = VMUL(LDK(KP559016994), VSUB(TU, T1z));
+			 T1N = VSUB(TT, Tz);
+			 T1O = VSUB(T1e, T1y);
+		    }
+		    {
+			 V T1B, T1M, T25, T2c;
+			 T1B = VADD(Tf, T1A);
+			 T1M = VBYI(VADD(T1E, T1L));
+			 ST(&(x[WS(rs, 1)]), VSUB(T1B, T1M), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 24)]), VADD(T1B, T1M), ms, &(x[0]));
+			 T25 = VADD(Tf, VADD(T21, T24));
+			 T2c = VBYI(VADD(T1E, VSUB(T28, T2b)));
+			 ST(&(x[WS(rs, 21)]), VSUB(T25, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(T25, T2c), ms, &(x[0]));
+		    }
+		    T2d = VBYI(VADD(T1E, VFMA(LDK(KP309016994), T28, VFMA(LDK(KP587785252), VSUB(T23, T22), VFNMS(LDK(KP951056516), VADD(T1Z, T20), VMUL(LDK(KP809016994), T2b))))));
+		    T2e = VFMA(LDK(KP309016994), T21, VFMA(LDK(KP951056516), VSUB(T26, T27), VFMA(LDK(KP587785252), VSUB(T2a, T29), VFNMS(LDK(KP809016994), T24, Tf))));
+		    ST(&(x[WS(rs, 9)]), VADD(T2d, T2e), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VSUB(T2e, T2d), ms, &(x[0]));
+		    {
+			 V T1R, T1X, T1W, T1Y, T1P, T1T;
+			 T1P = VFMS(LDK(KP250000000), T1L, T1E);
+			 T1R = VBYI(VADD(VFMA(LDK(KP587785252), T1N, VMUL(LDK(KP951056516), T1O)), VSUB(T1P, T1Q)));
+			 T1X = VBYI(VADD(VFNMS(LDK(KP587785252), T1O, VMUL(LDK(KP951056516), T1N)), VADD(T1P, T1Q)));
+			 T1T = VFNMS(LDK(KP250000000), T1A, Tf);
+			 T1W = VFMA(LDK(KP587785252), T1S, VFNMS(LDK(KP951056516), T1U, VSUB(T1T, T1V)));
+			 T1Y = VFMA(LDK(KP951056516), T1S, VADD(T1V, VFMA(LDK(KP587785252), T1U, T1T)));
+			 ST(&(x[WS(rs, 11)]), VADD(T1R, T1W), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VSUB(T1Y, T1X), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T1W, T1R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(T1X, T1Y), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T2u, T2w, T2h, T2k, T2l, T2A, T2B, T2C, T2o, T2r, T2s, T2x, T2y, T2z, T2M;
+		    V T2X, T2N, T2W, T2R, T31, T2U, T30, T2E, T2F;
+		    T2u = VFNMS(LDK(KP587785252), T1C, T2t);
+		    T2w = VSUB(Te, Tb);
+		    T2h = VFNMS(LDK(KP125333233), T2g, VMUL(LDK(KP1_984229402), T2f));
+		    T2k = VFMA(LDK(KP1_457937254), T2i, VMUL(LDK(KP684547105), T2j));
+		    T2l = VSUB(T2h, T2k);
+		    T2A = VFNMS(LDK(KP1_996053456), T2p, VMUL(LDK(KP062790519), T2q));
+		    T2B = VFMA(LDK(KP1_541026485), T2m, VMUL(LDK(KP637423989), T2n));
+		    T2C = VSUB(T2A, T2B);
+		    T2o = VFNMS(LDK(KP770513242), T2n, VMUL(LDK(KP1_274847979), T2m));
+		    T2r = VFMA(LDK(KP125581039), T2p, VMUL(LDK(KP998026728), T2q));
+		    T2s = VSUB(T2o, T2r);
+		    T2x = VFNMS(LDK(KP1_369094211), T2i, VMUL(LDK(KP728968627), T2j));
+		    T2y = VFMA(LDK(KP250666467), T2f, VMUL(LDK(KP992114701), T2g));
+		    T2z = VSUB(T2x, T2y);
+		    {
+			 V T2G, T2H, T2I, T2J, T2K, T2L;
+			 T2G = VFNMS(LDK(KP481753674), T2j, VMUL(LDK(KP1_752613360), T2i));
+			 T2H = VFMA(LDK(KP851558583), T2f, VMUL(LDK(KP904827052), T2g));
+			 T2I = VSUB(T2G, T2H);
+			 T2J = VFNMS(LDK(KP844327925), T2q, VMUL(LDK(KP1_071653589), T2p));
+			 T2K = VFNMS(LDK(KP998026728), T2n, VMUL(LDK(KP125581039), T2m));
+			 T2L = VADD(T2J, T2K);
+			 T2M = VMUL(LDK(KP559016994), VSUB(T2I, T2L));
+			 T2X = VSUB(T2J, T2K);
+			 T2N = VADD(T2I, T2L);
+			 T2W = VADD(T2G, T2H);
+		    }
+		    {
+			 V T2P, T2Q, T2Y, T2S, T2T, T2Z;
+			 T2P = VFNMS(LDK(KP425779291), T2g, VMUL(LDK(KP1_809654104), T2f));
+			 T2Q = VFMA(LDK(KP963507348), T2i, VMUL(LDK(KP876306680), T2j));
+			 T2Y = VADD(T2Q, T2P);
+			 T2S = VFMA(LDK(KP1_688655851), T2p, VMUL(LDK(KP535826794), T2q));
+			 T2T = VFMA(LDK(KP1_996053456), T2m, VMUL(LDK(KP062790519), T2n));
+			 T2Z = VADD(T2S, T2T);
+			 T2R = VSUB(T2P, T2Q);
+			 T31 = VADD(T2Y, T2Z);
+			 T2U = VSUB(T2S, T2T);
+			 T30 = VMUL(LDK(KP559016994), VSUB(T2Y, T2Z));
+		    }
+		    {
+			 V T36, T37, T2v, T2D;
+			 T36 = VBYI(VADD(T2u, T2N));
+			 T37 = VADD(T2w, T31);
+			 ST(&(x[WS(rs, 2)]), VADD(T36, T37), ms, &(x[0]));
+			 ST(&(x[WS(rs, 23)]), VSUB(T37, T36), ms, &(x[WS(rs, 1)]));
+			 T2v = VBYI(VSUB(VADD(T2l, T2s), T2u));
+			 T2D = VADD(T2w, VADD(T2z, T2C));
+			 ST(&(x[WS(rs, 3)]), VADD(T2v, T2D), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T2D, T2v), ms, &(x[0]));
+		    }
+		    T2E = VFMA(LDK(KP309016994), T2z, VFNMS(LDK(KP809016994), T2C, VFNMS(LDK(KP587785252), VADD(T2r, T2o), VFNMS(LDK(KP951056516), VADD(T2k, T2h), T2w))));
+		    T2F = VBYI(VSUB(VFNMS(LDK(KP587785252), VADD(T2A, T2B), VFNMS(LDK(KP809016994), T2s, VFNMS(LDK(KP951056516), VADD(T2x, T2y), VMUL(LDK(KP309016994), T2l)))), T2u));
+		    ST(&(x[WS(rs, 17)]), VSUB(T2E, T2F), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 8)]), VADD(T2E, T2F), ms, &(x[0]));
+		    {
+			 V T2V, T34, T33, T35, T2O, T32;
+			 T2O = VFNMS(LDK(KP250000000), T2N, T2u);
+			 T2V = VBYI(VADD(T2M, VADD(T2O, VFNMS(LDK(KP587785252), T2U, VMUL(LDK(KP951056516), T2R)))));
+			 T34 = VBYI(VADD(T2O, VSUB(VFMA(LDK(KP587785252), T2R, VMUL(LDK(KP951056516), T2U)), T2M)));
+			 T32 = VFNMS(LDK(KP250000000), T31, T2w);
+			 T33 = VFMA(LDK(KP951056516), T2W, VFMA(LDK(KP587785252), T2X, VADD(T30, T32)));
+			 T35 = VFMA(LDK(KP587785252), T2W, VSUB(VFNMS(LDK(KP951056516), T2X, T32), T30));
+			 ST(&(x[WS(rs, 7)]), VADD(T2V, T33), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VSUB(T35, T34), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 18)]), VSUB(T33, T2V), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T34, T35), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t2fv_25"), twinstr, &GENUS, {170, 110, 78, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:36 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include t2f.h */
+
+/*
+ * This function contains 217 FP additions, 160 FP multiplications,
+ * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
+ * 112 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T26, T25, T1Z, T22, T1W, T2a, T2k, T2g;
+	       {
+		    V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2L, T1D, TC, T33, T2O, T1C;
+		    V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
+		    V T10, T2u;
+		    {
+			 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
+			 {
+			      V T1, T1x, T2, T1v;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V T5, Tc, T7, Ta, T2m, T2n;
+				   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+				   Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   {
+					V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
+					Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T1y = BYTWJ(&(W[TWVL * 46]), T1x);
+					T3 = BYTWJ(&(W[TWVL * 30]), T2);
+					T1w = BYTWJ(&(W[TWVL * 14]), T1v);
+					T6 = BYTWJ(&(W[TWVL * 6]), T5);
+					Td = BYTWJ(&(W[TWVL * 22]), Tc);
+					T8 = BYTWJ(&(W[TWVL * 38]), T7);
+					Tb = BYTWJ(&(W[TWVL * 54]), Ta);
+					Tt = BYTWJ(&(W[TWVL * 58]), Ts);
+					Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T4 = VSUB(T1, T3);
+					T2m = VADD(T1, T3);
+					T1z = VSUB(T1w, T1y);
+					T2n = VADD(T1w, T1y);
+					T9 = VSUB(T6, T8);
+					T2p = VADD(T6, T8);
+					Te = VSUB(Tb, Td);
+					T2q = VADD(Tb, Td);
+					TA = BYTWJ(&(W[TWVL * 10]), Tz);
+				   }
+				   Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   T2o = VADD(T2m, T2n);
+				   T32 = VSUB(T2m, T2n);
+				   Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      }
+			 }
+			 {
+			      V Tv, To, Ty, Ti, Tj, Tm, Th;
+			      Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T2r = VADD(T2p, T2q);
+			      T3f = VSUB(T2q, T2p);
+			      Tf = VADD(T9, Te);
+			      T1A = VSUB(Te, T9);
+			      Tv = BYTWJ(&(W[TWVL * 26]), Tu);
+			      To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      Ty = BYTWJ(&(W[TWVL * 42]), Tx);
+			      Ti = BYTWJ(&(W[TWVL * 2]), Th);
+			      Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      {
+				   V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
+				   {
+					V T15, T17, T1o, T1m;
+					{
+					     V Tw, T2J, Tp, T2K, TB, Tk, Tn, T1n, T14, T16;
+					     T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					     T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     Tw = VSUB(Tt, Tv);
+					     T2J = VADD(Tt, Tv);
+					     Tp = BYTWJ(&(W[TWVL * 50]), To);
+					     T2K = VADD(TA, Ty);
+					     TB = VSUB(Ty, TA);
+					     Tk = BYTWJ(&(W[TWVL * 34]), Tj);
+					     Tn = BYTWJ(&(W[TWVL * 18]), Tm);
+					     T15 = BYTWJ(&(W[TWVL * 60]), T14);
+					     T17 = BYTWJ(&(W[TWVL * 28]), T16);
+					     T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     {
+						  V T2M, Tl, T2N, Tq, T1l;
+						  T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						  T34 = VSUB(T2J, T2K);
+						  T2L = VADD(T2J, T2K);
+						  T1D = VFMA(LDK(KP414213562), Tw, TB);
+						  TC = VFNMS(LDK(KP414213562), TB, Tw);
+						  T2M = VADD(Ti, Tk);
+						  Tl = VSUB(Ti, Tk);
+						  T2N = VADD(Tn, Tp);
+						  Tq = VSUB(Tn, Tp);
+						  T1o = BYTWJ(&(W[TWVL * 12]), T1n);
+						  T1m = BYTWJ(&(W[TWVL * 44]), T1l);
+						  {
+						       V T1e, T1g, T19, T1b;
+						       T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						       T33 = VSUB(T2M, T2N);
+						       T2O = VADD(T2M, T2N);
+						       T1C = VFMA(LDK(KP414213562), Tl, Tq);
+						       Tr = VFNMS(LDK(KP414213562), Tq, Tl);
+						       T1f = BYTWJ(&(W[TWVL * 52]), T1e);
+						       T1h = BYTWJ(&(W[TWVL * 20]), T1g);
+						       T1a = BYTWJ(&(W[TWVL * 4]), T19);
+						       T1c = BYTWJ(&(W[TWVL * 36]), T1b);
+						  }
+					     }
+					}
+					T18 = VSUB(T15, T17);
+					T2A = VADD(T15, T17);
+					T2B = VADD(T1o, T1m);
+					T1p = VSUB(T1m, T1o);
+				   }
+				   {
+					V TG, TI, TZ, TX;
+					{
+					     V T1i, T2E, T1d, T2D, TH, TY, TF;
+					     TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1i = VSUB(T1f, T1h);
+					     T2E = VADD(T1f, T1h);
+					     T1d = VSUB(T1a, T1c);
+					     T2D = VADD(T1a, T1c);
+					     TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					     T2C = VADD(T2A, T2B);
+					     T3a = VSUB(T2A, T2B);
+					     TG = BYTWJ(&(W[0]), TF);
+					     {
+						  V TW, T1j, T1q, TP, TR, TK;
+						  TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  T2F = VADD(T2D, T2E);
+						  T3b = VSUB(T2E, T2D);
+						  T1j = VADD(T1d, T1i);
+						  T1q = VSUB(T1i, T1d);
+						  TI = BYTWJ(&(W[TWVL * 32]), TH);
+						  TZ = BYTWJ(&(W[TWVL * 48]), TY);
+						  TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						  TX = BYTWJ(&(W[TWVL * 16]), TW);
+						  TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T1r = VFMA(LDK(KP707106781), T1q, T1p);
+						  T21 = VFNMS(LDK(KP707106781), T1q, T1p);
+						  T1k = VFMA(LDK(KP707106781), T1j, T18);
+						  T20 = VFNMS(LDK(KP707106781), T1j, T18);
+						  TQ = BYTWJ(&(W[TWVL * 56]), TP);
+						  TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+						  TS = BYTWJ(&(W[TWVL * 24]), TR);
+						  TL = BYTWJ(&(W[TWVL * 8]), TK);
+					     }
+					}
+					T2t = VADD(TG, TI);
+					TJ = VSUB(TG, TI);
+					T10 = VSUB(TX, TZ);
+					T2u = VADD(TX, TZ);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
+			 T2s = VSUB(T2o, T2r);
+			 T2U = VADD(T2o, T2r);
+			 TN = BYTWJ(&(W[TWVL * 40]), TM);
+			 TT = VSUB(TQ, TS);
+			 T2x = VADD(TQ, TS);
+			 T2P = VSUB(T2L, T2O);
+			 T2V = VADD(T2O, T2L);
+			 T2Y = VADD(T2C, T2F);
+			 T2G = VSUB(T2C, T2F);
+			 T37 = VSUB(T2t, T2u);
+			 T2v = VADD(T2t, T2u);
+			 T2w = VADD(TL, TN);
+			 TO = VSUB(TL, TN);
+			 T2W = VADD(T2U, T2V);
+			 T30 = VSUB(T2U, T2V);
+			 {
+			      V T3i, T3o, T36, T3r, T3h, T3j, T12, T1Y, TV, T1X, T3s, T3d, T2Q, T2H, T31;
+			      V T2Z;
+			      {
+				   V T35, T3g, T38, T2y, T11, TU;
+				   T35 = VADD(T33, T34);
+				   T3g = VSUB(T34, T33);
+				   T38 = VSUB(T2w, T2x);
+				   T2y = VADD(T2w, T2x);
+				   T11 = VSUB(TO, TT);
+				   TU = VADD(TO, TT);
+				   {
+					V T3c, T39, T2X, T2z;
+					T3c = VFNMS(LDK(KP414213562), T3b, T3a);
+					T3i = VFMA(LDK(KP414213562), T3a, T3b);
+					T3o = VFNMS(LDK(KP707106781), T35, T32);
+					T36 = VFMA(LDK(KP707106781), T35, T32);
+					T3r = VFNMS(LDK(KP707106781), T3g, T3f);
+					T3h = VFMA(LDK(KP707106781), T3g, T3f);
+					T39 = VFNMS(LDK(KP414213562), T38, T37);
+					T3j = VFMA(LDK(KP414213562), T37, T38);
+					T2X = VADD(T2v, T2y);
+					T2z = VSUB(T2v, T2y);
+					T12 = VFMA(LDK(KP707106781), T11, T10);
+					T1Y = VFNMS(LDK(KP707106781), T11, T10);
+					TV = VFMA(LDK(KP707106781), TU, TJ);
+					T1X = VFNMS(LDK(KP707106781), TU, TJ);
+					T3s = VSUB(T3c, T39);
+					T3d = VADD(T39, T3c);
+					T2Q = VSUB(T2G, T2z);
+					T2H = VADD(T2z, T2G);
+					T31 = VSUB(T2Y, T2X);
+					T2Z = VADD(T2X, T2Y);
+				   }
+			      }
+			      {
+				   V Tg, T1U, TD, T1G, T13, T1s, T1H, T1B, T1V, T1E, T3k, T3p, T2e, T2f;
+				   Tg = VFMA(LDK(KP707106781), Tf, T4);
+				   T1U = VFNMS(LDK(KP707106781), Tf, T4);
+				   T3k = VSUB(T3i, T3j);
+				   T3p = VADD(T3j, T3i);
+				   {
+					V T3v, T3t, T3e, T3m;
+					T3v = VFNMS(LDK(KP923879532), T3s, T3r);
+					T3t = VFMA(LDK(KP923879532), T3s, T3r);
+					T3e = VFNMS(LDK(KP923879532), T3d, T36);
+					T3m = VFMA(LDK(KP923879532), T3d, T36);
+					{
+					     V T2R, T2T, T2I, T2S;
+					     T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
+					     T2T = VFMA(LDK(KP707106781), T2Q, T2P);
+					     T2I = VFNMS(LDK(KP707106781), T2H, T2s);
+					     T2S = VFMA(LDK(KP707106781), T2H, T2s);
+					     ST(&(x[WS(rs, 24)]), VFNMSI(T31, T30), ms, &(x[0]));
+					     ST(&(x[WS(rs, 8)]), VFMAI(T31, T30), ms, &(x[0]));
+					     ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
+					     ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
+					     {
+						  V T3u, T3q, T3l, T3n;
+						  T3u = VFMA(LDK(KP923879532), T3p, T3o);
+						  T3q = VFNMS(LDK(KP923879532), T3p, T3o);
+						  T3l = VFNMS(LDK(KP923879532), T3k, T3h);
+						  T3n = VFMA(LDK(KP923879532), T3k, T3h);
+						  ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
+						  ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
+						  ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
+						  ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
+						  ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
+						  ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
+						  ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
+						  ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
+						  ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
+						  ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
+						  ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
+						  ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
+						  T26 = VSUB(TC, Tr);
+						  TD = VADD(Tr, TC);
+					     }
+					}
+				   }
+				   T1G = VFMA(LDK(KP198912367), TV, T12);
+				   T13 = VFNMS(LDK(KP198912367), T12, TV);
+				   T1s = VFNMS(LDK(KP198912367), T1r, T1k);
+				   T1H = VFMA(LDK(KP198912367), T1k, T1r);
+				   T1B = VFNMS(LDK(KP707106781), T1A, T1z);
+				   T25 = VFMA(LDK(KP707106781), T1A, T1z);
+				   T1V = VADD(T1C, T1D);
+				   T1E = VSUB(T1C, T1D);
+				   {
+					V T1S, T1O, T1K, T1u, T1R, T1T, T1L, T1J;
+					{
+					     V TE, T1M, T1I, T1N, T1t, T1Q, T1F, T1P, T28, T29;
+					     TE = VFMA(LDK(KP923879532), TD, Tg);
+					     T1M = VFNMS(LDK(KP923879532), TD, Tg);
+					     T1I = VSUB(T1G, T1H);
+					     T1N = VADD(T1G, T1H);
+					     T1t = VADD(T13, T1s);
+					     T1Q = VSUB(T1s, T13);
+					     T1F = VFMA(LDK(KP923879532), T1E, T1B);
+					     T1P = VFNMS(LDK(KP923879532), T1E, T1B);
+					     T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
+					     T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
+					     T1S = VFMA(LDK(KP980785280), T1N, T1M);
+					     T1O = VFNMS(LDK(KP980785280), T1N, T1M);
+					     T22 = VFMA(LDK(KP668178637), T21, T20);
+					     T29 = VFNMS(LDK(KP668178637), T20, T21);
+					     T1K = VFMA(LDK(KP980785280), T1t, TE);
+					     T1u = VFNMS(LDK(KP980785280), T1t, TE);
+					     T1R = VFNMS(LDK(KP980785280), T1Q, T1P);
+					     T1T = VFMA(LDK(KP980785280), T1Q, T1P);
+					     T1L = VFMA(LDK(KP980785280), T1I, T1F);
+					     T1J = VFNMS(LDK(KP980785280), T1I, T1F);
+					     T2e = VFNMS(LDK(KP923879532), T1V, T1U);
+					     T1W = VFMA(LDK(KP923879532), T1V, T1U);
+					     T2a = VSUB(T28, T29);
+					     T2f = VADD(T28, T29);
+					}
+					ST(&(x[WS(rs, 23)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 9)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 25)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 31)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 15)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 17)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
+				   }
+				   T2k = VFNMS(LDK(KP831469612), T2f, T2e);
+				   T2g = VFMA(LDK(KP831469612), T2f, T2e);
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T2i, T23, T2h, T27;
+		    T2i = VSUB(T22, T1Z);
+		    T23 = VADD(T1Z, T22);
+		    T2h = VFNMS(LDK(KP923879532), T26, T25);
+		    T27 = VFMA(LDK(KP923879532), T26, T25);
+		    {
+			 V T2c, T24, T2j, T2l, T2d, T2b;
+			 T2c = VFMA(LDK(KP831469612), T23, T1W);
+			 T24 = VFNMS(LDK(KP831469612), T23, T1W);
+			 T2j = VFMA(LDK(KP831469612), T2i, T2h);
+			 T2l = VFNMS(LDK(KP831469612), T2i, T2h);
+			 T2d = VFMA(LDK(KP831469612), T2a, T27);
+			 T2b = VFNMS(LDK(KP831469612), T2a, T27);
+			 ST(&(x[WS(rs, 21)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include t2f.h */
+
+/*
+ * This function contains 217 FP additions, 104 FP multiplications,
+ * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
+ * 59 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T4, T1A, T2o, T32, Tf, T1v, T2r, T3f, TC, T1C, T2L, T34, Tr, T1D, T2O;
+	       V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
+	       V T2v, T37;
+	       {
+		    V T1, T1z, T3, T1x, T1y, T2, T1w, T2m, T2n;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T1y = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+		    T1z = BYTWJ(&(W[TWVL * 46]), T1y);
+		    T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 30]), T2);
+		    T1w = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    T1x = BYTWJ(&(W[TWVL * 14]), T1w);
+		    T4 = VSUB(T1, T3);
+		    T1A = VSUB(T1x, T1z);
+		    T2m = VADD(T1, T3);
+		    T2n = VADD(T1x, T1z);
+		    T2o = VADD(T2m, T2n);
+		    T32 = VSUB(T2m, T2n);
+	       }
+	       {
+		    V T6, Td, T8, Tb;
+		    {
+			 V T5, Tc, T7, Ta;
+			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 6]), T5);
+			 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 22]), Tc);
+			 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 38]), T7);
+			 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
+		    }
+		    {
+			 V T9, Te, T2p, T2q;
+			 T9 = VSUB(T6, T8);
+			 Te = VSUB(Tb, Td);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 T1v = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 T2p = VADD(T6, T8);
+			 T2q = VADD(Tb, Td);
+			 T2r = VADD(T2p, T2q);
+			 T3f = VSUB(T2q, T2p);
+		    }
+	       }
+	       {
+		    V Tt, TA, Tv, Ty;
+		    {
+			 V Ts, Tz, Tu, Tx;
+			 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
+			 Tz = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TA = BYTWJ(&(W[TWVL * 42]), Tz);
+			 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
+			 Tx = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ty = BYTWJ(&(W[TWVL * 10]), Tx);
+		    }
+		    {
+			 V Tw, TB, T2J, T2K;
+			 Tw = VSUB(Tt, Tv);
+			 TB = VSUB(Ty, TA);
+			 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
+			 T1C = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T2J = VADD(Tt, Tv);
+			 T2K = VADD(Ty, TA);
+			 T2L = VADD(T2J, T2K);
+			 T34 = VSUB(T2J, T2K);
+		    }
+	       }
+	       {
+		    V Ti, Tp, Tk, Tn;
+		    {
+			 V Th, To, Tj, Tm;
+			 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 2]), Th);
+			 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 50]), To);
+			 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
+			 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
+		    }
+		    {
+			 V Tl, Tq, T2M, T2N;
+			 Tl = VSUB(Ti, Tk);
+			 Tq = VSUB(Tn, Tp);
+			 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+			 T1D = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T2M = VADD(Ti, Tk);
+			 T2N = VADD(Tn, Tp);
+			 T2O = VADD(T2M, T2N);
+			 T33 = VSUB(T2M, T2N);
+		    }
+	       }
+	       {
+		    V T15, T17, T1p, T1n, T1f, T1h, T1i, T1a, T1c, T1d;
+		    {
+			 V T14, T16, T1o, T1m;
+			 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T15 = BYTWJ(&(W[TWVL * 60]), T14);
+			 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T17 = BYTWJ(&(W[TWVL * 28]), T16);
+			 T1o = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T1p = BYTWJ(&(W[TWVL * 44]), T1o);
+			 T1m = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T1n = BYTWJ(&(W[TWVL * 12]), T1m);
+			 {
+			      V T1e, T1g, T19, T1b;
+			      T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			      T1f = BYTWJ(&(W[TWVL * 52]), T1e);
+			      T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      T1h = BYTWJ(&(W[TWVL * 20]), T1g);
+			      T1i = VSUB(T1f, T1h);
+			      T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T1a = BYTWJ(&(W[TWVL * 4]), T19);
+			      T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      T1c = BYTWJ(&(W[TWVL * 36]), T1b);
+			      T1d = VSUB(T1a, T1c);
+			 }
+		    }
+		    {
+			 V T18, T1j, T2D, T2E;
+			 T18 = VSUB(T15, T17);
+			 T1j = VMUL(LDK(KP707106781), VADD(T1d, T1i));
+			 T1k = VADD(T18, T1j);
+			 T20 = VSUB(T18, T1j);
+			 T2D = VADD(T1a, T1c);
+			 T2E = VADD(T1f, T1h);
+			 T2F = VADD(T2D, T2E);
+			 T3b = VSUB(T2E, T2D);
+		    }
+		    {
+			 V T1l, T1q, T2A, T2B;
+			 T1l = VMUL(LDK(KP707106781), VSUB(T1i, T1d));
+			 T1q = VSUB(T1n, T1p);
+			 T1r = VSUB(T1l, T1q);
+			 T21 = VADD(T1q, T1l);
+			 T2A = VADD(T15, T17);
+			 T2B = VADD(T1n, T1p);
+			 T2C = VADD(T2A, T2B);
+			 T3a = VSUB(T2A, T2B);
+		    }
+	       }
+	       {
+		    V TG, TI, T10, TY, TQ, TS, TT, TL, TN, TO;
+		    {
+			 V TF, TH, TZ, TX;
+			 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TG = BYTWJ(&(W[0]), TF);
+			 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 TI = BYTWJ(&(W[TWVL * 32]), TH);
+			 TZ = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T10 = BYTWJ(&(W[TWVL * 48]), TZ);
+			 TX = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 TY = BYTWJ(&(W[TWVL * 16]), TX);
+			 {
+			      V TP, TR, TK, TM;
+			      TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			      TQ = BYTWJ(&(W[TWVL * 56]), TP);
+			      TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      TS = BYTWJ(&(W[TWVL * 24]), TR);
+			      TT = VSUB(TQ, TS);
+			      TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      TL = BYTWJ(&(W[TWVL * 8]), TK);
+			      TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      TN = BYTWJ(&(W[TWVL * 40]), TM);
+			      TO = VSUB(TL, TN);
+			 }
+		    }
+		    {
+			 V TJ, TU, T2w, T2x;
+			 TJ = VSUB(TG, TI);
+			 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
+			 TV = VADD(TJ, TU);
+			 T1X = VSUB(TJ, TU);
+			 T2w = VADD(TL, TN);
+			 T2x = VADD(TQ, TS);
+			 T2y = VADD(T2w, T2x);
+			 T38 = VSUB(T2x, T2w);
+		    }
+		    {
+			 V TW, T11, T2t, T2u;
+			 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
+			 T11 = VSUB(TY, T10);
+			 T12 = VSUB(TW, T11);
+			 T1Y = VADD(T11, TW);
+			 T2t = VADD(TG, TI);
+			 T2u = VADD(TY, T10);
+			 T2v = VADD(T2t, T2u);
+			 T37 = VSUB(T2t, T2u);
+		    }
+	       }
+	       {
+		    V T2W, T30, T2Z, T31;
+		    {
+			 V T2U, T2V, T2X, T2Y;
+			 T2U = VADD(T2o, T2r);
+			 T2V = VADD(T2O, T2L);
+			 T2W = VADD(T2U, T2V);
+			 T30 = VSUB(T2U, T2V);
+			 T2X = VADD(T2v, T2y);
+			 T2Y = VADD(T2C, T2F);
+			 T2Z = VADD(T2X, T2Y);
+			 T31 = VBYI(VSUB(T2Y, T2X));
+		    }
+		    ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 8)]), VADD(T30, T31), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
+		    ST(&(x[WS(rs, 24)]), VSUB(T30, T31), ms, &(x[0]));
+	       }
+	       {
+		    V T2s, T2P, T2H, T2Q, T2z, T2G;
+		    T2s = VSUB(T2o, T2r);
+		    T2P = VSUB(T2L, T2O);
+		    T2z = VSUB(T2v, T2y);
+		    T2G = VSUB(T2C, T2F);
+		    T2H = VMUL(LDK(KP707106781), VADD(T2z, T2G));
+		    T2Q = VMUL(LDK(KP707106781), VSUB(T2G, T2z));
+		    {
+			 V T2I, T2R, T2S, T2T;
+			 T2I = VADD(T2s, T2H);
+			 T2R = VBYI(VADD(T2P, T2Q));
+			 ST(&(x[WS(rs, 28)]), VSUB(T2I, T2R), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T2I, T2R), ms, &(x[0]));
+			 T2S = VSUB(T2s, T2H);
+			 T2T = VBYI(VSUB(T2Q, T2P));
+			 ST(&(x[WS(rs, 20)]), VSUB(T2S, T2T), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2S, T2T), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T36, T3r, T3h, T3p, T3d, T3o, T3k, T3s, T35, T3g;
+		    T35 = VMUL(LDK(KP707106781), VADD(T33, T34));
+		    T36 = VADD(T32, T35);
+		    T3r = VSUB(T32, T35);
+		    T3g = VMUL(LDK(KP707106781), VSUB(T34, T33));
+		    T3h = VADD(T3f, T3g);
+		    T3p = VSUB(T3g, T3f);
+		    {
+			 V T39, T3c, T3i, T3j;
+			 T39 = VFMA(LDK(KP923879532), T37, VMUL(LDK(KP382683432), T38));
+			 T3c = VFNMS(LDK(KP382683432), T3b, VMUL(LDK(KP923879532), T3a));
+			 T3d = VADD(T39, T3c);
+			 T3o = VSUB(T3c, T39);
+			 T3i = VFNMS(LDK(KP382683432), T37, VMUL(LDK(KP923879532), T38));
+			 T3j = VFMA(LDK(KP382683432), T3a, VMUL(LDK(KP923879532), T3b));
+			 T3k = VADD(T3i, T3j);
+			 T3s = VSUB(T3j, T3i);
+		    }
+		    {
+			 V T3e, T3l, T3u, T3v;
+			 T3e = VADD(T36, T3d);
+			 T3l = VBYI(VADD(T3h, T3k));
+			 ST(&(x[WS(rs, 30)]), VSUB(T3e, T3l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T3e, T3l), ms, &(x[0]));
+			 T3u = VBYI(VADD(T3p, T3o));
+			 T3v = VADD(T3r, T3s);
+			 ST(&(x[WS(rs, 6)]), VADD(T3u, T3v), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VSUB(T3v, T3u), ms, &(x[0]));
+		    }
+		    {
+			 V T3m, T3n, T3q, T3t;
+			 T3m = VSUB(T36, T3d);
+			 T3n = VBYI(VSUB(T3k, T3h));
+			 ST(&(x[WS(rs, 18)]), VSUB(T3m, T3n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T3m, T3n), ms, &(x[0]));
+			 T3q = VBYI(VSUB(T3o, T3p));
+			 T3t = VSUB(T3r, T3s);
+			 ST(&(x[WS(rs, 10)]), VADD(T3q, T3t), ms, &(x[0]));
+			 ST(&(x[WS(rs, 22)]), VSUB(T3t, T3q), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
+		    {
+			 V Tg, TD, T1G, T1H;
+			 Tg = VADD(T4, Tf);
+			 TD = VADD(Tr, TC);
+			 TE = VADD(Tg, TD);
+			 T1P = VSUB(Tg, TD);
+			 T1G = VFNMS(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
+			 T1H = VFMA(LDK(KP195090322), T1k, VMUL(LDK(KP980785280), T1r));
+			 T1I = VADD(T1G, T1H);
+			 T1Q = VSUB(T1H, T1G);
+		    }
+		    {
+			 V T13, T1s, T1B, T1E;
+			 T13 = VFMA(LDK(KP980785280), TV, VMUL(LDK(KP195090322), T12));
+			 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
+			 T1t = VADD(T13, T1s);
+			 T1M = VSUB(T1s, T13);
+			 T1B = VSUB(T1v, T1A);
+			 T1E = VSUB(T1C, T1D);
+			 T1F = VADD(T1B, T1E);
+			 T1N = VSUB(T1E, T1B);
+		    }
+		    {
+			 V T1u, T1J, T1S, T1T;
+			 T1u = VADD(TE, T1t);
+			 T1J = VBYI(VADD(T1F, T1I));
+			 ST(&(x[WS(rs, 31)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
+			 T1S = VBYI(VADD(T1N, T1M));
+			 T1T = VADD(T1P, T1Q);
+			 ST(&(x[WS(rs, 7)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T1K, T1L, T1O, T1R;
+			 T1K = VSUB(TE, T1t);
+			 T1L = VBYI(VSUB(T1I, T1F));
+			 ST(&(x[WS(rs, 17)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
+			 T1O = VBYI(VSUB(T1M, T1N));
+			 T1R = VSUB(T1P, T1Q);
+			 ST(&(x[WS(rs, 9)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 23)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
+		    {
+			 V T1U, T1V, T28, T29;
+			 T1U = VSUB(T4, Tf);
+			 T1V = VADD(T1D, T1C);
+			 T1W = VADD(T1U, T1V);
+			 T2h = VSUB(T1U, T1V);
+			 T28 = VFNMS(LDK(KP555570233), T1X, VMUL(LDK(KP831469612), T1Y));
+			 T29 = VFMA(LDK(KP555570233), T20, VMUL(LDK(KP831469612), T21));
+			 T2a = VADD(T28, T29);
+			 T2i = VSUB(T29, T28);
+		    }
+		    {
+			 V T1Z, T22, T25, T26;
+			 T1Z = VFMA(LDK(KP831469612), T1X, VMUL(LDK(KP555570233), T1Y));
+			 T22 = VFNMS(LDK(KP555570233), T21, VMUL(LDK(KP831469612), T20));
+			 T23 = VADD(T1Z, T22);
+			 T2e = VSUB(T22, T1Z);
+			 T25 = VADD(T1A, T1v);
+			 T26 = VSUB(TC, Tr);
+			 T27 = VADD(T25, T26);
+			 T2f = VSUB(T26, T25);
+		    }
+		    {
+			 V T24, T2b, T2k, T2l;
+			 T24 = VADD(T1W, T23);
+			 T2b = VBYI(VADD(T27, T2a));
+			 ST(&(x[WS(rs, 29)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
+			 T2k = VBYI(VADD(T2f, T2e));
+			 T2l = VADD(T2h, T2i);
+			 ST(&(x[WS(rs, 5)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T2c, T2d, T2g, T2j;
+			 T2c = VSUB(T1W, T23);
+			 T2d = VBYI(VSUB(T2a, T27));
+			 ST(&(x[WS(rs, 19)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
+			 T2g = VBYI(VSUB(T2e, T2f));
+			 T2j = VSUB(T2h, T2i);
+			 ST(&(x[WS(rs, 11)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 21)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:35 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t2fv_4 -include t2f.h */
+
+/*
+ * This function contains 11 FP additions, 8 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T7, T2, T5, T8, T3, T6;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V Ta, T4, Tb, T9;
+		    Ta = VADD(T1, T3);
+		    T4 = VSUB(T1, T3);
+		    Tb = VADD(T6, T8);
+		    T9 = VSUB(T6, T8);
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[WS(rs, 3)]), VFMAI(T9, T4), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T9, T4), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2fv_4"), twinstr, &GENUS, {9, 6, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t2fv_4 -include t2f.h */
+
+/*
+ * This function contains 11 FP additions, 6 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 0 fused multiply/add),
+ * 13 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T1, T8, T3, T6, T7, T2, T5;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T8 = BYTWJ(&(W[TWVL * 4]), T7);
+	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 2]), T2);
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T6 = BYTWJ(&(W[0]), T5);
+	       {
+		    V T4, T9, Ta, Tb;
+		    T4 = VSUB(T1, T3);
+		    T9 = VBYI(VSUB(T6, T8));
+		    ST(&(x[WS(rs, 1)]), VSUB(T4, T9), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));
+		    Ta = VADD(T1, T3);
+		    Tb = VADD(T6, T8);
+		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ta, Tb), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2fv_4"), twinstr, &GENUS, {11, 6, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:41 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t2fv_5 -include t2f.h */
+
+/*
+ * This function contains 20 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 10 multiplications, 9 fused multiply/add),
+ * 26 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T2, T9, T4, T7;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T9 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V T3, Ta, T5, T8;
+		    T3 = BYTWJ(&(W[0]), T2);
+		    Ta = BYTWJ(&(W[TWVL * 4]), T9);
+		    T5 = BYTWJ(&(W[TWVL * 6]), T4);
+		    T8 = BYTWJ(&(W[TWVL * 2]), T7);
+		    {
+			 V T6, Tg, Tb, Th;
+			 T6 = VADD(T3, T5);
+			 Tg = VSUB(T3, T5);
+			 Tb = VADD(T8, Ta);
+			 Th = VSUB(T8, Ta);
+			 {
+			      V Te, Tc, Tk, Ti, Td, Tj, Tf;
+			      Te = VSUB(T6, Tb);
+			      Tc = VADD(T6, Tb);
+			      Tk = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tg, Th));
+			      Ti = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Th, Tg));
+			      Td = VFNMS(LDK(KP250000000), Tc, T1);
+			      ST(&(x[0]), VADD(T1, Tc), ms, &(x[0]));
+			      Tj = VFNMS(LDK(KP559016994), Te, Td);
+			      Tf = VFMA(LDK(KP559016994), Te, Td);
+			      ST(&(x[WS(rs, 2)]), VFMAI(Tk, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFNMSI(Tk, Tj), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Ti, Tf), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFNMSI(Ti, Tf), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t2fv_5"), twinstr, &GENUS, {11, 10, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -name t2fv_5 -include t2f.h */
+
+/*
+ * This function contains 20 FP additions, 14 FP multiplications,
+ * (or, 17 additions, 11 multiplications, 3 fused multiply/add),
+ * 20 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V Tc, Tg, Th, T5, Ta, Td;
+	       Tc = LD(&(x[0]), ms, &(x[0]));
+	       {
+		    V T2, T9, T4, T7;
+		    {
+			 V T1, T8, T3, T6;
+			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2 = BYTWJ(&(W[0]), T1);
+			 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T9 = BYTWJ(&(W[TWVL * 4]), T8);
+			 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T4 = BYTWJ(&(W[TWVL * 6]), T3);
+			 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T7 = BYTWJ(&(W[TWVL * 2]), T6);
+		    }
+		    Tg = VSUB(T2, T4);
+		    Th = VSUB(T7, T9);
+		    T5 = VADD(T2, T4);
+		    Ta = VADD(T7, T9);
+		    Td = VADD(T5, Ta);
+	       }
+	       ST(&(x[0]), VADD(Tc, Td), ms, &(x[0]));
+	       {
+		    V Ti, Tj, Tf, Tk, Tb, Te;
+		    Ti = VBYI(VFMA(LDK(KP951056516), Tg, VMUL(LDK(KP587785252), Th)));
+		    Tj = VBYI(VFNMS(LDK(KP587785252), Tg, VMUL(LDK(KP951056516), Th)));
+		    Tb = VMUL(LDK(KP559016994), VSUB(T5, Ta));
+		    Te = VFNMS(LDK(KP250000000), Td, Tc);
+		    Tf = VADD(Tb, Te);
+		    Tk = VSUB(Te, Tb);
+		    ST(&(x[WS(rs, 1)]), VSUB(Tf, Ti), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VSUB(Tk, Tj), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 4)]), VADD(Ti, Tf), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tj, Tk), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t2fv_5"), twinstr, &GENUS, {17, 11, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1877 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:40 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2fv_64 -include t2f.h */
+
+/*
+ * This function contains 519 FP additions, 384 FP multiplications,
+ * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
+ * 187 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
+	       V T6E;
+	       {
+		    V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
+		    V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3a;
+		    V T4j, T4C, T7e, T5l, T7d, T5o, T3b, TV, T4B, T4m, T3X, T3Y, T6o, T7b, T5f;
+		    V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
+		    V T1S, T2q, T2u, T2w;
+		    {
+			 V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
+			 {
+			      V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
+			      {
+				   V T1, T2, T7, T5, T32, T34, T2X, T2Z;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+				   T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+				   T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   T32 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+				   T34 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+				   T2X = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+				   T2Z = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+				   {
+					V T1m, T54, T6j, T36, T55, T31, T56, T1n, T1q, T1s, T4, T9;
+					{
+					     V T3, T8, T6, T33, T35, T2Y, T30, T1l;
+					     T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T3 = BYTWJ(&(W[TWVL * 62]), T2);
+					     T8 = BYTWJ(&(W[TWVL * 94]), T7);
+					     T6 = BYTWJ(&(W[TWVL * 30]), T5);
+					     T33 = BYTWJ(&(W[TWVL * 14]), T32);
+					     T35 = BYTWJ(&(W[TWVL * 78]), T34);
+					     T2Y = BYTWJ(&(W[TWVL * 110]), T2X);
+					     T30 = BYTWJ(&(W[TWVL * 46]), T2Z);
+					     T1m = BYTWJ(&(W[0]), T1l);
+					     T54 = VSUB(T1, T3);
+					     T4 = VADD(T1, T3);
+					     T6j = VSUB(T6, T8);
+					     T9 = VADD(T6, T8);
+					     T36 = VADD(T33, T35);
+					     T55 = VSUB(T33, T35);
+					     T31 = VADD(T2Y, T30);
+					     T56 = VSUB(T2Y, T30);
+					     T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+					}
+					T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+					Ta = VSUB(T4, T9);
+					T3U = VADD(T4, T9);
+					{
+					     V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
+					     T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     T3V = VADD(T36, T31);
+					     T37 = VSUB(T31, T36);
+					     T57 = VADD(T55, T56);
+					     T6k = VSUB(T56, T55);
+					     T1o = BYTWJ(&(W[TWVL * 64]), T1n);
+					     T1r = BYTWJ(&(W[TWVL * 32]), T1q);
+					     T1t = BYTWJ(&(W[TWVL * 96]), T1s);
+					     T1V = BYTWJ(&(W[TWVL * 16]), T1U);
+					     T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+					     T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+					     T7a = VFNMS(LDK(KP707106781), T57, T54);
+					     T58 = VFMA(LDK(KP707106781), T57, T54);
+					     T7B = VFMA(LDK(KP707106781), T6k, T6j);
+					     T6l = VFNMS(LDK(KP707106781), T6k, T6j);
+					     T1p = VADD(T1m, T1o);
+					     T5B = VSUB(T1m, T1o);
+					     T5O = VSUB(T1r, T1t);
+					     T1u = VADD(T1r, T1t);
+					     T1X = BYTWJ(&(W[TWVL * 80]), T1W);
+					     T20 = BYTWJ(&(W[TWVL * 112]), T1Z);
+					     T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			      {
+				   V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
+				   {
+					V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
+					T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+					T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+					T1v = VSUB(T1p, T1u);
+					T41 = VADD(T1p, T1u);
+					T1Y = VADD(T1V, T1X);
+					T5C = VSUB(T1V, T1X);
+					T22 = BYTWJ(&(W[TWVL * 48]), T21);
+					T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T29 = BYTWJ(&(W[TWVL * 124]), T28);
+					T2b = BYTWJ(&(W[TWVL * 60]), T2a);
+					T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+					T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+					T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T23, T5D, T2e, T2g, T2I, T2K, T2M;
+					     T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T23 = VADD(T20, T22);
+					     T5D = VSUB(T20, T22);
+					     T2e = BYTWJ(&(W[TWVL * 28]), T2d);
+					     T2c = VADD(T29, T2b);
+					     T5W = VSUB(T29, T2b);
+					     T2g = BYTWJ(&(W[TWVL * 92]), T2f);
+					     T2I = BYTWJ(&(W[TWVL * 108]), T2H);
+					     T2K = BYTWJ(&(W[TWVL * 44]), T2J);
+					     T2N = BYTWJ(&(W[TWVL * 12]), T2M);
+					     {
+						  V T5E, T5P, T42, T2O;
+						  T5E = VADD(T5C, T5D);
+						  T5P = VSUB(T5C, T5D);
+						  T24 = VSUB(T1Y, T23);
+						  T42 = VADD(T1Y, T23);
+						  T69 = VSUB(T2g, T2e);
+						  T2h = VADD(T2e, T2g);
+						  T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+						  T2L = VADD(T2I, T2K);
+						  T5Y = VSUB(T2I, T2K);
+						  T5Q = VFMA(LDK(KP707106781), T5P, T5O);
+						  T7o = VFNMS(LDK(KP707106781), T5P, T5O);
+						  T5F = VFMA(LDK(KP707106781), T5E, T5B);
+						  T7l = VFNMS(LDK(KP707106781), T5E, T5B);
+						  T43 = VADD(T41, T42);
+						  T4F = VSUB(T41, T42);
+						  T2P = BYTWJ(&(W[TWVL * 76]), T2O);
+					     }
+					}
+				   }
+				   T2i = VSUB(T2c, T2h);
+				   T48 = VADD(T2c, T2h);
+				   {
+					V TW, TY, T11, T2Q, T5X, T13;
+					TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+					TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+					T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					T2Q = VADD(T2N, T2P);
+					T5X = VSUB(T2N, T2P);
+					T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+					{
+					     V T12, T5Z, T6a, T49, T14, T18, T1a;
+					     {
+						  V T17, T19, TX, TZ;
+						  T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+						  T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+						  TX = BYTWJ(&(W[TWVL * 122]), TW);
+						  TZ = BYTWJ(&(W[TWVL * 58]), TY);
+						  T12 = BYTWJ(&(W[TWVL * 26]), T11);
+						  T5Z = VADD(T5X, T5Y);
+						  T6a = VSUB(T5Y, T5X);
+						  T2R = VSUB(T2L, T2Q);
+						  T49 = VADD(T2Q, T2L);
+						  T14 = BYTWJ(&(W[TWVL * 90]), T13);
+						  T18 = BYTWJ(&(W[TWVL * 106]), T17);
+						  T5q = VSUB(TX, TZ);
+						  T10 = VADD(TX, TZ);
+						  T1a = BYTWJ(&(W[TWVL * 42]), T19);
+					     }
+					     T6b = VFMA(LDK(KP707106781), T6a, T69);
+					     T7v = VFNMS(LDK(KP707106781), T6a, T69);
+					     T60 = VFMA(LDK(KP707106781), T5Z, T5W);
+					     T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
+					     T4a = VADD(T48, T49);
+					     T4I = VSUB(T48, T49);
+					     T5v = VSUB(T14, T12);
+					     T15 = VADD(T12, T14);
+					     T1b = VADD(T18, T1a);
+					     T5s = VSUB(T18, T1a);
+					}
+					T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+				   }
+			      }
+			 }
+			 {
+			      V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
+			      {
+				   V T5h, TQ, T5m, T5i, TO, TS, TJ, T4k, TD, TI;
+				   {
+					V T4h, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
+					Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					T4h = VADD(T10, T15);
+					T16 = VSUB(T10, T15);
+					TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+					T1d = BYTWJ(&(W[TWVL * 10]), T1c);
+					T1f = BYTWJ(&(W[TWVL * 74]), T1e);
+					TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+					TA = BYTWJ(&(W[TWVL * 2]), Tz);
+					TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+					TC = BYTWJ(&(W[TWVL * 66]), TB);
+					{
+					     V T1g, T5r, TF, TH, TL, TN, TP;
+					     TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+					     T1g = VADD(T1d, T1f);
+					     T5r = VSUB(T1d, T1f);
+					     TF = BYTWJ(&(W[TWVL * 34]), TE);
+					     TH = BYTWJ(&(W[TWVL * 98]), TG);
+					     TL = BYTWJ(&(W[TWVL * 18]), TK);
+					     TN = BYTWJ(&(W[TWVL * 82]), TM);
+					     T5h = VSUB(TA, TC);
+					     TD = VADD(TA, TC);
+					     TQ = BYTWJ(&(W[TWVL * 114]), TP);
+					     {
+						  V T5w, T5t, T4i, T1h, TR;
+						  T5w = VSUB(T5s, T5r);
+						  T5t = VADD(T5r, T5s);
+						  T4i = VADD(T1g, T1b);
+						  T1h = VSUB(T1b, T1g);
+						  T5m = VSUB(TF, TH);
+						  TI = VADD(TF, TH);
+						  T5i = VSUB(TL, TN);
+						  TO = VADD(TL, TN);
+						  TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+						  T5u = VFMA(LDK(KP707106781), T5t, T5q);
+						  T7h = VFNMS(LDK(KP707106781), T5t, T5q);
+						  T5x = VFMA(LDK(KP707106781), T5w, T5v);
+						  T7g = VFNMS(LDK(KP707106781), T5w, T5v);
+						  T1i = VFNMS(LDK(KP414213562), T1h, T16);
+						  T3a = VFMA(LDK(KP414213562), T16, T1h);
+						  T4j = VADD(T4h, T4i);
+						  T4C = VSUB(T4h, T4i);
+						  TS = BYTWJ(&(W[TWVL * 50]), TR);
+					     }
+					}
+				   }
+				   TJ = VSUB(TD, TI);
+				   T4k = VADD(TD, TI);
+				   {
+					V Tb, Td, Tr, T5j, TT, Tt, Tg;
+					Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+					Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					T5j = VSUB(TQ, TS);
+					TT = VADD(TQ, TS);
+					Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+					Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					{
+					     V Ti, Tc, Te, Ts;
+					     Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+					     Tc = BYTWJ(&(W[TWVL * 6]), Tb);
+					     Te = BYTWJ(&(W[TWVL * 70]), Td);
+					     Ts = BYTWJ(&(W[TWVL * 22]), Tr);
+					     {
+						  V T5k, T5n, TU, T4l, Tu;
+						  T5k = VADD(T5i, T5j);
+						  T5n = VSUB(T5i, T5j);
+						  TU = VSUB(TO, TT);
+						  T4l = VADD(TO, TT);
+						  Tu = BYTWJ(&(W[TWVL * 86]), Tt);
+						  Th = BYTWJ(&(W[TWVL * 38]), Tg);
+						  T59 = VSUB(Tc, Te);
+						  Tf = VADD(Tc, Te);
+						  T7e = VFNMS(LDK(KP707106781), T5k, T5h);
+						  T5l = VFMA(LDK(KP707106781), T5k, T5h);
+						  T7d = VFNMS(LDK(KP707106781), T5n, T5m);
+						  T5o = VFMA(LDK(KP707106781), T5n, T5m);
+						  T3b = VFMA(LDK(KP414213562), TJ, TU);
+						  TV = VFNMS(LDK(KP414213562), TU, TJ);
+						  T4B = VSUB(T4k, T4l);
+						  T4m = VADD(T4k, T4l);
+						  Tv = VADD(Ts, Tu);
+						  T5d = VSUB(Tu, Ts);
+						  Tj = BYTWJ(&(W[TWVL * 102]), Ti);
+					     }
+					}
+					Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+					To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   }
+			      }
+			      {
+				   V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
+				   {
+					V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
+					T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+					T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+					T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V Tk, T5a, Tn, Tp;
+					     Tk = VADD(Th, Tj);
+					     T5a = VSUB(Th, Tj);
+					     Tn = BYTWJ(&(W[TWVL * 118]), Tm);
+					     Tp = BYTWJ(&(W[TWVL * 54]), To);
+					     {
+						  V T1x, T1z, T1N, T1P;
+						  T1x = BYTWJ(&(W[TWVL * 8]), T1w);
+						  T1z = BYTWJ(&(W[TWVL * 72]), T1y);
+						  T1N = BYTWJ(&(W[TWVL * 24]), T1M);
+						  T1P = BYTWJ(&(W[TWVL * 88]), T1O);
+						  T5b = VFNMS(LDK(KP414213562), T5a, T59);
+						  T6m = VFMA(LDK(KP414213562), T59, T5a);
+						  T3X = VADD(Tf, Tk);
+						  Tl = VSUB(Tf, Tk);
+						  Tq = VADD(Tn, Tp);
+						  T5c = VSUB(Tn, Tp);
+						  T1A = VADD(T1x, T1z);
+						  T5G = VSUB(T1x, T1z);
+						  T1Q = VADD(T1N, T1P);
+						  T5K = VSUB(T1N, T1P);
+						  T1C = BYTWJ(&(W[TWVL * 40]), T1B);
+					     }
+					}
+					T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+					T5e = VFNMS(LDK(KP414213562), T5d, T5c);
+					T6n = VFMA(LDK(KP414213562), T5c, T5d);
+					T3Y = VADD(Tq, Tv);
+					Tw = VSUB(Tq, Tv);
+					T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+					T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
+					T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					T1E = BYTWJ(&(W[TWVL * 104]), T1D);
+					T6o = VSUB(T6m, T6n);
+					T7b = VADD(T6m, T6n);
+					T5f = VADD(T5b, T5e);
+					T7C = VSUB(T5e, T5b);
+					Tx = VADD(Tl, Tw);
+					T38 = VSUB(Tw, Tl);
+					T1I = BYTWJ(&(W[TWVL * 120]), T1H);
+					T1K = BYTWJ(&(W[TWVL * 56]), T1J);
+					T1F = VADD(T1C, T1E);
+					T5H = VSUB(T1C, T1E);
+					T2k = BYTWJ(&(W[TWVL * 4]), T2j);
+					T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+					T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
+					     T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     T5J = VSUB(T1I, T1K);
+					     T1L = VADD(T1I, T1K);
+					     T5I = VFNMS(LDK(KP414213562), T5H, T5G);
+					     T5R = VFMA(LDK(KP414213562), T5G, T5H);
+					     T44 = VADD(T1A, T1F);
+					     T1G = VSUB(T1A, T1F);
+					     T2m = BYTWJ(&(W[TWVL * 68]), T2l);
+					     T2A = BYTWJ(&(W[TWVL * 20]), T2z);
+					     T2C = BYTWJ(&(W[TWVL * 84]), T2B);
+					     T5S = VFNMS(LDK(KP414213562), T5J, T5K);
+					     T5L = VFMA(LDK(KP414213562), T5K, T5J);
+					     T1R = VSUB(T1L, T1Q);
+					     T45 = VADD(T1L, T1Q);
+					     T2p = BYTWJ(&(W[TWVL * 36]), T2o);
+					     T61 = VSUB(T2k, T2m);
+					     T2n = VADD(T2k, T2m);
+					     T65 = VSUB(T2C, T2A);
+					     T2D = VADD(T2A, T2C);
+					     T7p = VSUB(T5I, T5L);
+					     T5M = VADD(T5I, T5L);
+					     T7m = VSUB(T5R, T5S);
+					     T5T = VADD(T5R, T5S);
+					     T4G = VSUB(T44, T45);
+					     T46 = VADD(T44, T45);
+					     T25 = VSUB(T1G, T1R);
+					     T1S = VADD(T1G, T1R);
+					     T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+					}
+					T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+					T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+		    {
+			 V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
+			 V T26, T2G, T3y, T3z, T2T;
+			 {
+			      V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
+			      T4A = VSUB(T3U, T3V);
+			      T3W = VADD(T3U, T3V);
+			      T3Z = VADD(T3X, T3Y);
+			      T4N = VSUB(T3Y, T3X);
+			      T47 = VSUB(T43, T46);
+			      T4v = VADD(T43, T46);
+			      T2r = BYTWJ(&(W[TWVL * 100]), T2q);
+			      T2v = BYTWJ(&(W[TWVL * 116]), T2u);
+			      T2x = BYTWJ(&(W[TWVL * 52]), T2w);
+			      T4s = VADD(T3W, T3Z);
+			      T40 = VSUB(T3W, T3Z);
+			      {
+				   V T4O, T4n, T4R, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
+				   {
+					V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
+					{
+					     V T4D, T62, T2s, T64, T2y, T4t;
+					     T4O = VSUB(T4C, T4B);
+					     T4D = VADD(T4B, T4C);
+					     T62 = VSUB(T2r, T2p);
+					     T2s = VADD(T2p, T2r);
+					     T64 = VSUB(T2v, T2x);
+					     T2y = VADD(T2v, T2x);
+					     T4t = VADD(T4m, T4j);
+					     T4n = VSUB(T4j, T4m);
+					     T4R = VFMA(LDK(KP414213562), T4F, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4F);
+					     T4E = VFMA(LDK(KP707106781), T4D, T4A);
+					     T4W = VFNMS(LDK(KP707106781), T4D, T4A);
+					     T6c = VFNMS(LDK(KP414213562), T61, T62);
+					     T63 = VFMA(LDK(KP414213562), T62, T61);
+					     T2t = VSUB(T2n, T2s);
+					     T4b = VADD(T2n, T2s);
+					     T6d = VFMA(LDK(KP414213562), T64, T65);
+					     T66 = VFNMS(LDK(KP414213562), T65, T64);
+					     T2E = VSUB(T2y, T2D);
+					     T4c = VADD(T2y, T2D);
+					     T4u = VADD(T4s, T4t);
+					     T4y = VSUB(T4s, T4t);
+					}
+					T67 = VADD(T63, T66);
+					T7w = VSUB(T66, T63);
+					T6e = VADD(T6c, T6d);
+					T7t = VSUB(T6d, T6c);
+					T4d = VADD(T4b, T4c);
+					T4J = VSUB(T4c, T4b);
+					T2F = VADD(T2t, T2E);
+					T2S = VSUB(T2E, T2t);
+				   }
+				   {
+					V Ty, T1j, T4Q, T4K;
+					Ty = VFMA(LDK(KP707106781), Tx, Ta);
+					T3s = VFNMS(LDK(KP707106781), Tx, Ta);
+					T3E = VSUB(T1i, TV);
+					T1j = VADD(TV, T1i);
+					T39 = VFMA(LDK(KP707106781), T38, T37);
+					T3D = VFNMS(LDK(KP707106781), T38, T37);
+					T4Q = VFMA(LDK(KP414213562), T4I, T4J);
+					T4K = VFNMS(LDK(KP414213562), T4J, T4I);
+					{
+					     V T4w, T4e, T4P, T4Z;
+					     T4w = VADD(T4a, T4d);
+					     T4e = VSUB(T4a, T4d);
+					     T4P = VFMA(LDK(KP707106781), T4O, T4N);
+					     T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
+					     T1k = VFMA(LDK(KP923879532), T1j, Ty);
+					     T3k = VFNMS(LDK(KP923879532), T1j, Ty);
+					     {
+						  V T4L, T50, T4S, T4X;
+						  T4L = VADD(T4H, T4K);
+						  T50 = VSUB(T4K, T4H);
+						  T4S = VSUB(T4Q, T4R);
+						  T4X = VADD(T4R, T4Q);
+						  {
+						       V T4f, T4o, T4x, T4z;
+						       T4f = VADD(T47, T4e);
+						       T4o = VSUB(T4e, T47);
+						       T4x = VADD(T4v, T4w);
+						       T4z = VSUB(T4w, T4v);
+						       {
+							    V T53, T51, T4M, T4U;
+							    T53 = VFNMS(LDK(KP923879532), T50, T4Z);
+							    T51 = VFMA(LDK(KP923879532), T50, T4Z);
+							    T4M = VFNMS(LDK(KP923879532), T4L, T4E);
+							    T4U = VFMA(LDK(KP923879532), T4L, T4E);
+							    {
+								 V T52, T4Y, T4T, T4V;
+								 T52 = VFMA(LDK(KP923879532), T4X, T4W);
+								 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
+								 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
+								 T4V = VFMA(LDK(KP923879532), T4S, T4P);
+								 {
+								      V T4p, T4r, T4g, T4q;
+								      T4p = VFNMS(LDK(KP707106781), T4o, T4n);
+								      T4r = VFMA(LDK(KP707106781), T4o, T4n);
+								      T4g = VFNMS(LDK(KP707106781), T4f, T40);
+								      T4q = VFMA(LDK(KP707106781), T4f, T40);
+								      ST(&(x[WS(rs, 16)]), VFMAI(T4z, T4y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 48)]), VFNMSI(T4z, T4y), ms, &(x[0]));
+								      ST(&(x[0]), VADD(T4u, T4x), ms, &(x[0]));
+								      ST(&(x[WS(rs, 32)]), VSUB(T4u, T4x), ms, &(x[0]));
+								      ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
+								      ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
+								      ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
+								      ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
+								      ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
+								      ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
+								      ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
+								      T3t = VADD(T3b, T3a);
+								      T3c = VSUB(T3a, T3b);
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T1T = VFMA(LDK(KP707106781), T1S, T1v);
+					T3v = VFNMS(LDK(KP707106781), T1S, T1v);
+					T3w = VFNMS(LDK(KP707106781), T25, T24);
+					T26 = VFMA(LDK(KP707106781), T25, T24);
+					T2G = VFMA(LDK(KP707106781), T2F, T2i);
+					T3y = VFNMS(LDK(KP707106781), T2F, T2i);
+					T3z = VFNMS(LDK(KP707106781), T2S, T2R);
+					T2T = VFMA(LDK(KP707106781), T2S, T2R);
+				   }
+			      }
+			 }
+			 {
+			      V T3u, T3M, T3F, T3P, T3x, T3H, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
+			      {
+				   V T3d, T3n, T27, T3f, T2U, T3e;
+				   T3d = VFMA(LDK(KP923879532), T3c, T39);
+				   T3n = VFNMS(LDK(KP923879532), T3c, T39);
+				   T27 = VFNMS(LDK(KP198912367), T26, T1T);
+				   T3f = VFMA(LDK(KP198912367), T1T, T26);
+				   T2U = VFNMS(LDK(KP198912367), T2T, T2G);
+				   T3e = VFMA(LDK(KP198912367), T2G, T2T);
+				   T3u = VFMA(LDK(KP923879532), T3t, T3s);
+				   T3M = VFNMS(LDK(KP923879532), T3t, T3s);
+				   {
+					V T3g, T3l, T2V, T3o;
+					T3g = VSUB(T3e, T3f);
+					T3l = VADD(T3f, T3e);
+					T2V = VADD(T27, T2U);
+					T3o = VSUB(T2U, T27);
+					T3F = VFNMS(LDK(KP923879532), T3E, T3D);
+					T3P = VFMA(LDK(KP923879532), T3E, T3D);
+					T3x = VFMA(LDK(KP668178637), T3w, T3v);
+					T3H = VFNMS(LDK(KP668178637), T3v, T3w);
+					T3q = VFMA(LDK(KP980785280), T3l, T3k);
+					T3m = VFNMS(LDK(KP980785280), T3l, T3k);
+					T3h = VFNMS(LDK(KP980785280), T3g, T3d);
+					T3j = VFMA(LDK(KP980785280), T3g, T3d);
+					T3r = VFNMS(LDK(KP980785280), T3o, T3n);
+					T3p = VFMA(LDK(KP980785280), T3o, T3n);
+					T2W = VFNMS(LDK(KP980785280), T2V, T1k);
+					T3i = VFMA(LDK(KP980785280), T2V, T1k);
+				   }
+			      }
+			      {
+				   V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
+				   V T7x;
+				   {
+					V T7c, T7W, T7D, T87, T7f, T7F, T3A, T3G, T7E, T7i;
+					T7c = VFNMS(LDK(KP923879532), T7b, T7a);
+					T7W = VFMA(LDK(KP923879532), T7b, T7a);
+					T7D = VFNMS(LDK(KP923879532), T7C, T7B);
+					T87 = VFMA(LDK(KP923879532), T7C, T7B);
+					T7f = VFNMS(LDK(KP668178637), T7e, T7d);
+					T7F = VFMA(LDK(KP668178637), T7d, T7e);
+					ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
+					ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
+					ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
+					ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
+					T3A = VFMA(LDK(KP668178637), T3z, T3y);
+					T3G = VFNMS(LDK(KP668178637), T3y, T3z);
+					T7E = VFMA(LDK(KP668178637), T7g, T7h);
+					T7i = VFNMS(LDK(KP668178637), T7h, T7g);
+					T7n = VFNMS(LDK(KP923879532), T7m, T7l);
+					T7Z = VFMA(LDK(KP923879532), T7m, T7l);
+					{
+					     V T3I, T3N, T3B, T3Q;
+					     T3I = VSUB(T3G, T3H);
+					     T3N = VADD(T3H, T3G);
+					     T3B = VADD(T3x, T3A);
+					     T3Q = VSUB(T3A, T3x);
+					     {
+						  V T7j, T88, T7G, T7X;
+						  T7j = VADD(T7f, T7i);
+						  T88 = VSUB(T7f, T7i);
+						  T7G = VSUB(T7E, T7F);
+						  T7X = VADD(T7F, T7E);
+						  {
+						       V T3S, T3O, T3J, T3L;
+						       T3S = VFNMS(LDK(KP831469612), T3N, T3M);
+						       T3O = VFMA(LDK(KP831469612), T3N, T3M);
+						       T3J = VFNMS(LDK(KP831469612), T3I, T3F);
+						       T3L = VFMA(LDK(KP831469612), T3I, T3F);
+						       {
+							    V T3T, T3R, T3C, T3K;
+							    T3T = VFMA(LDK(KP831469612), T3Q, T3P);
+							    T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
+							    T3C = VFNMS(LDK(KP831469612), T3B, T3u);
+							    T3K = VFMA(LDK(KP831469612), T3B, T3u);
+							    T8j = VFNMS(LDK(KP831469612), T88, T87);
+							    T89 = VFMA(LDK(KP831469612), T88, T87);
+							    T7k = VFNMS(LDK(KP831469612), T7j, T7c);
+							    T7O = VFMA(LDK(KP831469612), T7j, T7c);
+							    T8g = VFNMS(LDK(KP831469612), T7X, T7W);
+							    T7Y = VFMA(LDK(KP831469612), T7X, T7W);
+							    T7H = VFNMS(LDK(KP831469612), T7G, T7D);
+							    T7R = VFMA(LDK(KP831469612), T7G, T7D);
+							    ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
+							    ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
+							    ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
+							    ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
+							    ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
+							    T80 = VFNMS(LDK(KP923879532), T7p, T7o);
+							    T7q = VFMA(LDK(KP923879532), T7p, T7o);
+						       }
+						  }
+					     }
+					}
+					T7u = VFNMS(LDK(KP923879532), T7t, T7s);
+					T82 = VFMA(LDK(KP923879532), T7t, T7s);
+					T83 = VFNMS(LDK(KP923879532), T7w, T7v);
+					T7x = VFMA(LDK(KP923879532), T7w, T7v);
+				   }
+				   {
+					V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
+					T5g = VFMA(LDK(KP923879532), T5f, T58);
+					T6I = VFNMS(LDK(KP923879532), T5f, T58);
+					{
+					     V T7r, T7I, T7y, T7J;
+					     T7r = VFNMS(LDK(KP534511135), T7q, T7n);
+					     T7I = VFMA(LDK(KP534511135), T7n, T7q);
+					     T7y = VFNMS(LDK(KP534511135), T7x, T7u);
+					     T7J = VFMA(LDK(KP534511135), T7u, T7x);
+					     {
+						  V T81, T8a, T84, T8b;
+						  T81 = VFMA(LDK(KP303346683), T80, T7Z);
+						  T8a = VFNMS(LDK(KP303346683), T7Z, T80);
+						  T84 = VFMA(LDK(KP303346683), T83, T82);
+						  T8b = VFNMS(LDK(KP303346683), T82, T83);
+						  T6p = VFMA(LDK(KP923879532), T6o, T6l);
+						  T6T = VFNMS(LDK(KP923879532), T6o, T6l);
+						  T5p = VFNMS(LDK(KP198912367), T5o, T5l);
+						  T6q = VFMA(LDK(KP198912367), T5l, T5o);
+						  {
+						       V T7K, T7P, T7z, T7S;
+						       T7K = VSUB(T7I, T7J);
+						       T7P = VADD(T7I, T7J);
+						       T7z = VADD(T7r, T7y);
+						       T7S = VSUB(T7y, T7r);
+						       {
+							    V T8c, T8h, T85, T8k;
+							    T8c = VSUB(T8a, T8b);
+							    T8h = VADD(T8a, T8b);
+							    T85 = VADD(T81, T84);
+							    T8k = VSUB(T84, T81);
+							    {
+								 V T7Q, T7U, T7L, T7N;
+								 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
+								 T7U = VFMA(LDK(KP881921264), T7P, T7O);
+								 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
+								 T7N = VFMA(LDK(KP881921264), T7K, T7H);
+								 {
+								      V T7T, T7V, T7A, T7M;
+								      T7T = VFNMS(LDK(KP881921264), T7S, T7R);
+								      T7V = VFMA(LDK(KP881921264), T7S, T7R);
+								      T7A = VFNMS(LDK(KP881921264), T7z, T7k);
+								      T7M = VFMA(LDK(KP881921264), T7z, T7k);
+								      {
+									   V T8i, T8m, T8d, T8f;
+									   T8i = VFMA(LDK(KP956940335), T8h, T8g);
+									   T8m = VFNMS(LDK(KP956940335), T8h, T8g);
+									   T8d = VFNMS(LDK(KP956940335), T8c, T89);
+									   T8f = VFMA(LDK(KP956940335), T8c, T89);
+									   {
+										V T8l, T8n, T86, T8e;
+										T8l = VFMA(LDK(KP956940335), T8k, T8j);
+										T8n = VFNMS(LDK(KP956940335), T8k, T8j);
+										T86 = VFNMS(LDK(KP956940335), T85, T7Y);
+										T8e = VFMA(LDK(KP956940335), T85, T7Y);
+										ST(&(x[WS(rs, 53)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 11)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 43)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 21)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 59)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 5)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 27)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 37)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 51)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 13)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 45)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 19)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 3)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 61)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 35)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										ST(&(x[WS(rs, 29)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
+										T6r = VFMA(LDK(KP198912367), T5u, T5x);
+										T5y = VFNMS(LDK(KP198912367), T5x, T5u);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     V T5N, T5U, T68, T5z, T6U, T6f;
+					     T5N = VFMA(LDK(KP923879532), T5M, T5F);
+					     T6L = VFNMS(LDK(KP923879532), T5M, T5F);
+					     T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
+					     T5U = VFMA(LDK(KP923879532), T5T, T5Q);
+					     T68 = VFMA(LDK(KP923879532), T67, T60);
+					     T6O = VFNMS(LDK(KP923879532), T67, T60);
+					     T5z = VADD(T5p, T5y);
+					     T6U = VSUB(T5y, T5p);
+					     T6P = VFNMS(LDK(KP923879532), T6e, T6b);
+					     T6f = VFMA(LDK(KP923879532), T6e, T6b);
+					     {
+						  V T5V, T6u, T6g, T6v, T6s, T6J;
+						  T6s = VSUB(T6q, T6r);
+						  T6J = VADD(T6q, T6r);
+						  T5V = VFNMS(LDK(KP098491403), T5U, T5N);
+						  T6u = VFMA(LDK(KP098491403), T5N, T5U);
+						  T75 = VFNMS(LDK(KP980785280), T6U, T6T);
+						  T6V = VFMA(LDK(KP980785280), T6U, T6T);
+						  T5A = VFMA(LDK(KP980785280), T5z, T5g);
+						  T6A = VFNMS(LDK(KP980785280), T5z, T5g);
+						  T6g = VFNMS(LDK(KP098491403), T6f, T68);
+						  T6v = VFMA(LDK(KP098491403), T68, T6f);
+						  T72 = VFNMS(LDK(KP980785280), T6J, T6I);
+						  T6K = VFMA(LDK(KP980785280), T6J, T6I);
+						  T6t = VFMA(LDK(KP980785280), T6s, T6p);
+						  T6D = VFNMS(LDK(KP980785280), T6s, T6p);
+						  T6w = VSUB(T6u, T6v);
+						  T6B = VADD(T6u, T6v);
+						  T6h = VADD(T5V, T6g);
+						  T6E = VSUB(T6g, T5V);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
+		    T6W = VFNMS(LDK(KP820678790), T6L, T6M);
+		    T6N = VFMA(LDK(KP820678790), T6M, T6L);
+		    T6G = VFMA(LDK(KP995184726), T6B, T6A);
+		    T6C = VFNMS(LDK(KP995184726), T6B, T6A);
+		    T6z = VFMA(LDK(KP995184726), T6w, T6t);
+		    T6x = VFNMS(LDK(KP995184726), T6w, T6t);
+		    T6H = VFMA(LDK(KP995184726), T6E, T6D);
+		    T6F = VFNMS(LDK(KP995184726), T6E, T6D);
+		    T6y = VFMA(LDK(KP995184726), T6h, T5A);
+		    T6i = VFNMS(LDK(KP995184726), T6h, T5A);
+		    T6X = VFNMS(LDK(KP820678790), T6O, T6P);
+		    T6Q = VFMA(LDK(KP820678790), T6P, T6O);
+		    {
+			 V T73, T6Y, T76, T6R;
+			 ST(&(x[WS(rs, 49)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 17)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 63)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 33)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
+			 T73 = VADD(T6W, T6X);
+			 T6Y = VSUB(T6W, T6X);
+			 T76 = VSUB(T6Q, T6N);
+			 T6R = VADD(T6N, T6Q);
+			 {
+			      V T78, T74, T71, T6Z, T79, T77, T70, T6S;
+			      T78 = VFNMS(LDK(KP773010453), T73, T72);
+			      T74 = VFMA(LDK(KP773010453), T73, T72);
+			      T71 = VFMA(LDK(KP773010453), T6Y, T6V);
+			      T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
+			      T79 = VFNMS(LDK(KP773010453), T76, T75);
+			      T77 = VFMA(LDK(KP773010453), T76, T75);
+			      T70 = VFMA(LDK(KP773010453), T6R, T6K);
+			      T6S = VFNMS(LDK(KP773010453), T6R, T6K);
+			      ST(&(x[WS(rs, 55)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 41)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 23)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 57)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 39)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 25)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t2fv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_64) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2fv_64 -include t2f.h */
+
+/*
+ * This function contains 519 FP additions, 250 FP multiplications,
+ * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
+ * 107 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tg, T4a, T6r, T7f, T3o, T4B, T5q, T7e, T5R, T62, T28, T4o, T2g, T4l, T7n;
+	       V T7Z, T68, T6j, T2C, T4s, T3a, T4v, T7u, T82, T7E, T7F, T7V, T5F, T6u, T1k;
+	       V T4e, T1r, T4d, T7B, T7C, T7W, T5M, T6v, TV, T4g, T12, T4h, T7h, T7i, TD;
+	       V T4C, T3h, T4b, T5x, T6s, T1R, T4m, T7q, T80, T2j, T4p, T5Y, T63, T2Z, T4w;
+	       V T7x, T83, T33, T4t, T6f, T6k;
+	       {
+		    V T1, T3, T3m, T3k, Tb, Td, Te, T6, T8, T9, T2, T3l, T3j;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
+		    T3 = BYTWJ(&(W[TWVL * 62]), T2);
+		    T3l = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
+		    T3m = BYTWJ(&(W[TWVL * 94]), T3l);
+		    T3j = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+		    T3k = BYTWJ(&(W[TWVL * 30]), T3j);
+		    {
+			 V Ta, Tc, T5, T7;
+			 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
+			 Tb = BYTWJ(&(W[TWVL * 110]), Ta);
+			 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 Td = BYTWJ(&(W[TWVL * 46]), Tc);
+			 Te = VSUB(Tb, Td);
+			 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T6 = BYTWJ(&(W[TWVL * 14]), T5);
+			 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
+			 T8 = BYTWJ(&(W[TWVL * 78]), T7);
+			 T9 = VSUB(T6, T8);
+		    }
+		    {
+			 V T4, Tf, T6p, T6q;
+			 T4 = VSUB(T1, T3);
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VADD(T4, Tf);
+			 T4a = VSUB(T4, Tf);
+			 T6p = VADD(Tb, Td);
+			 T6q = VADD(T6, T8);
+			 T6r = VSUB(T6p, T6q);
+			 T7f = VADD(T6q, T6p);
+		    }
+		    {
+			 V T3i, T3n, T5o, T5p;
+			 T3i = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 T3n = VSUB(T3k, T3m);
+			 T3o = VSUB(T3i, T3n);
+			 T4B = VADD(T3n, T3i);
+			 T5o = VADD(T1, T3);
+			 T5p = VADD(T3k, T3m);
+			 T5q = VSUB(T5o, T5p);
+			 T7e = VADD(T5o, T5p);
+		    }
+	       }
+	       {
+		    V T24, T26, T5Q, T2b, T2d, T5P, T1W, T60, T21, T61, T22, T27;
+		    {
+			 V T23, T25, T2a, T2c;
+			 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			 T24 = BYTWJ(&(W[TWVL * 32]), T23);
+			 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
+			 T26 = BYTWJ(&(W[TWVL * 96]), T25);
+			 T5Q = VADD(T24, T26);
+			 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T2b = BYTWJ(&(W[0]), T2a);
+			 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
+			 T2d = BYTWJ(&(W[TWVL * 64]), T2c);
+			 T5P = VADD(T2b, T2d);
+		    }
+		    {
+			 V T1T, T1V, T1S, T1U;
+			 T1S = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
+			 T1T = BYTWJ(&(W[TWVL * 112]), T1S);
+			 T1U = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			 T1V = BYTWJ(&(W[TWVL * 48]), T1U);
+			 T1W = VSUB(T1T, T1V);
+			 T60 = VADD(T1T, T1V);
+		    }
+		    {
+			 V T1Y, T20, T1X, T1Z;
+			 T1X = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			 T1Y = BYTWJ(&(W[TWVL * 16]), T1X);
+			 T1Z = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
+			 T20 = BYTWJ(&(W[TWVL * 80]), T1Z);
+			 T21 = VSUB(T1Y, T20);
+			 T61 = VADD(T1Y, T20);
+		    }
+		    T5R = VSUB(T5P, T5Q);
+		    T62 = VSUB(T60, T61);
+		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
+		    T27 = VSUB(T24, T26);
+		    T28 = VSUB(T22, T27);
+		    T4o = VADD(T27, T22);
+		    {
+			 V T2e, T2f, T7l, T7m;
+			 T2e = VSUB(T2b, T2d);
+			 T2f = VMUL(LDK(KP707106781), VADD(T21, T1W));
+			 T2g = VADD(T2e, T2f);
+			 T4l = VSUB(T2e, T2f);
+			 T7l = VADD(T5P, T5Q);
+			 T7m = VADD(T61, T60);
+			 T7n = VADD(T7l, T7m);
+			 T7Z = VSUB(T7l, T7m);
+		    }
+	       }
+	       {
+		    V T2n, T2p, T66, T36, T38, T67, T2v, T6i, T2A, T6h, T2q, T2B;
+		    {
+			 V T2m, T2o, T35, T37;
+			 T2m = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
+			 T2n = BYTWJ(&(W[TWVL * 124]), T2m);
+			 T2o = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			 T2p = BYTWJ(&(W[TWVL * 60]), T2o);
+			 T66 = VADD(T2n, T2p);
+			 T35 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T36 = BYTWJ(&(W[TWVL * 28]), T35);
+			 T37 = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
+			 T38 = BYTWJ(&(W[TWVL * 92]), T37);
+			 T67 = VADD(T36, T38);
+		    }
+		    {
+			 V T2s, T2u, T2r, T2t;
+			 T2r = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 T2s = BYTWJ(&(W[TWVL * 12]), T2r);
+			 T2t = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
+			 T2u = BYTWJ(&(W[TWVL * 76]), T2t);
+			 T2v = VSUB(T2s, T2u);
+			 T6i = VADD(T2s, T2u);
+		    }
+		    {
+			 V T2x, T2z, T2w, T2y;
+			 T2w = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
+			 T2x = BYTWJ(&(W[TWVL * 108]), T2w);
+			 T2y = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			 T2z = BYTWJ(&(W[TWVL * 44]), T2y);
+			 T2A = VSUB(T2x, T2z);
+			 T6h = VADD(T2x, T2z);
+		    }
+		    T68 = VSUB(T66, T67);
+		    T6j = VSUB(T6h, T6i);
+		    T2q = VSUB(T2n, T2p);
+		    T2B = VMUL(LDK(KP707106781), VADD(T2v, T2A));
+		    T2C = VADD(T2q, T2B);
+		    T4s = VSUB(T2q, T2B);
+		    {
+			 V T34, T39, T7s, T7t;
+			 T34 = VMUL(LDK(KP707106781), VSUB(T2A, T2v));
+			 T39 = VSUB(T36, T38);
+			 T3a = VSUB(T34, T39);
+			 T4v = VADD(T39, T34);
+			 T7s = VADD(T66, T67);
+			 T7t = VADD(T6i, T6h);
+			 T7u = VADD(T7s, T7t);
+			 T82 = VSUB(T7s, T7t);
+		    }
+	       }
+	       {
+		    V T1g, T1i, T5A, T1m, T1o, T5z, T18, T5C, T1d, T5D, T5B, T5E;
+		    {
+			 V T1f, T1h, T1l, T1n;
+			 T1f = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			 T1g = BYTWJ(&(W[TWVL * 34]), T1f);
+			 T1h = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
+			 T1i = BYTWJ(&(W[TWVL * 98]), T1h);
+			 T5A = VADD(T1g, T1i);
+			 T1l = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T1m = BYTWJ(&(W[TWVL * 2]), T1l);
+			 T1n = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
+			 T1o = BYTWJ(&(W[TWVL * 66]), T1n);
+			 T5z = VADD(T1m, T1o);
+		    }
+		    {
+			 V T15, T17, T14, T16;
+			 T14 = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
+			 T15 = BYTWJ(&(W[TWVL * 114]), T14);
+			 T16 = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			 T17 = BYTWJ(&(W[TWVL * 50]), T16);
+			 T18 = VSUB(T15, T17);
+			 T5C = VADD(T15, T17);
+		    }
+		    {
+			 V T1a, T1c, T19, T1b;
+			 T19 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T1a = BYTWJ(&(W[TWVL * 18]), T19);
+			 T1b = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
+			 T1c = BYTWJ(&(W[TWVL * 82]), T1b);
+			 T1d = VSUB(T1a, T1c);
+			 T5D = VADD(T1a, T1c);
+		    }
+		    T7E = VADD(T5z, T5A);
+		    T7F = VADD(T5D, T5C);
+		    T7V = VSUB(T7E, T7F);
+		    T5B = VSUB(T5z, T5A);
+		    T5E = VSUB(T5C, T5D);
+		    T5F = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5E));
+		    T6u = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
+		    {
+			 V T1e, T1j, T1p, T1q;
+			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
+			 T1j = VSUB(T1g, T1i);
+			 T1k = VSUB(T1e, T1j);
+			 T4e = VADD(T1j, T1e);
+			 T1p = VSUB(T1m, T1o);
+			 T1q = VMUL(LDK(KP707106781), VADD(T1d, T18));
+			 T1r = VADD(T1p, T1q);
+			 T4d = VSUB(T1p, T1q);
+		    }
+	       }
+	       {
+		    V TG, TI, T5G, TY, T10, T5H, TO, T5K, TT, T5J, T5I, T5L;
+		    {
+			 V TF, TH, TX, TZ;
+			 TF = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
+			 TG = BYTWJ(&(W[TWVL * 122]), TF);
+			 TH = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			 TI = BYTWJ(&(W[TWVL * 58]), TH);
+			 T5G = VADD(TG, TI);
+			 TX = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			 TY = BYTWJ(&(W[TWVL * 26]), TX);
+			 TZ = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
+			 T10 = BYTWJ(&(W[TWVL * 90]), TZ);
+			 T5H = VADD(TY, T10);
+		    }
+		    {
+			 V TL, TN, TK, TM;
+			 TK = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 TL = BYTWJ(&(W[TWVL * 10]), TK);
+			 TM = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
+			 TN = BYTWJ(&(W[TWVL * 74]), TM);
+			 TO = VSUB(TL, TN);
+			 T5K = VADD(TL, TN);
+		    }
+		    {
+			 V TQ, TS, TP, TR;
+			 TP = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
+			 TQ = BYTWJ(&(W[TWVL * 106]), TP);
+			 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			 TS = BYTWJ(&(W[TWVL * 42]), TR);
+			 TT = VSUB(TQ, TS);
+			 T5J = VADD(TQ, TS);
+		    }
+		    T7B = VADD(T5G, T5H);
+		    T7C = VADD(T5K, T5J);
+		    T7W = VSUB(T7B, T7C);
+		    T5I = VSUB(T5G, T5H);
+		    T5L = VSUB(T5J, T5K);
+		    T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
+		    T6v = VFMA(LDK(KP382683432), T5I, VMUL(LDK(KP923879532), T5L));
+		    {
+			 V TJ, TU, TW, T11;
+			 TJ = VSUB(TG, TI);
+			 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
+			 TV = VADD(TJ, TU);
+			 T4g = VSUB(TJ, TU);
+			 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
+			 T11 = VSUB(TY, T10);
+			 T12 = VSUB(TW, T11);
+			 T4h = VADD(T11, TW);
+		    }
+	       }
+	       {
+		    V Tl, T5r, TB, T5v, Tq, T5s, Tw, T5u, Tr, TC;
+		    {
+			 V Ti, Tk, Th, Tj;
+			 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Ti = BYTWJ(&(W[TWVL * 6]), Th);
+			 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
+			 Tk = BYTWJ(&(W[TWVL * 70]), Tj);
+			 Tl = VSUB(Ti, Tk);
+			 T5r = VADD(Ti, Tk);
+		    }
+		    {
+			 V Ty, TA, Tx, Tz;
+			 Tx = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 Ty = BYTWJ(&(W[TWVL * 22]), Tx);
+			 Tz = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
+			 TA = BYTWJ(&(W[TWVL * 86]), Tz);
+			 TB = VSUB(Ty, TA);
+			 T5v = VADD(Ty, TA);
+		    }
+		    {
+			 V Tn, Tp, Tm, To;
+			 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			 Tn = BYTWJ(&(W[TWVL * 38]), Tm);
+			 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
+			 Tp = BYTWJ(&(W[TWVL * 102]), To);
+			 Tq = VSUB(Tn, Tp);
+			 T5s = VADD(Tn, Tp);
+		    }
+		    {
+			 V Tt, Tv, Ts, Tu;
+			 Ts = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
+			 Tt = BYTWJ(&(W[TWVL * 118]), Ts);
+			 Tu = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			 Tv = BYTWJ(&(W[TWVL * 54]), Tu);
+			 Tw = VSUB(Tt, Tv);
+			 T5u = VADD(Tt, Tv);
+		    }
+		    T7h = VADD(T5r, T5s);
+		    T7i = VADD(T5u, T5v);
+		    Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
+		    TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
+		    TD = VADD(Tr, TC);
+		    T4C = VSUB(TC, Tr);
+		    {
+			 V T3f, T3g, T5t, T5w;
+			 T3f = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
+			 T3g = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
+			 T3h = VSUB(T3f, T3g);
+			 T4b = VADD(T3g, T3f);
+			 T5t = VSUB(T5r, T5s);
+			 T5w = VSUB(T5u, T5v);
+			 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
+			 T6s = VMUL(LDK(KP707106781), VSUB(T5w, T5t));
+		    }
+	       }
+	       {
+		    V T1z, T5V, T1P, T5T, T1E, T5W, T1K, T5S;
+		    {
+			 V T1w, T1y, T1v, T1x;
+			 T1v = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
+			 T1w = BYTWJ(&(W[TWVL * 120]), T1v);
+			 T1x = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+			 T1y = BYTWJ(&(W[TWVL * 56]), T1x);
+			 T1z = VSUB(T1w, T1y);
+			 T5V = VADD(T1w, T1y);
+		    }
+		    {
+			 V T1M, T1O, T1L, T1N;
+			 T1L = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			 T1M = BYTWJ(&(W[TWVL * 40]), T1L);
+			 T1N = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
+			 T1O = BYTWJ(&(W[TWVL * 104]), T1N);
+			 T1P = VSUB(T1M, T1O);
+			 T5T = VADD(T1M, T1O);
+		    }
+		    {
+			 V T1B, T1D, T1A, T1C;
+			 T1A = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			 T1B = BYTWJ(&(W[TWVL * 24]), T1A);
+			 T1C = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
+			 T1D = BYTWJ(&(W[TWVL * 88]), T1C);
+			 T1E = VSUB(T1B, T1D);
+			 T5W = VADD(T1B, T1D);
+		    }
+		    {
+			 V T1H, T1J, T1G, T1I;
+			 T1G = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1H = BYTWJ(&(W[TWVL * 8]), T1G);
+			 T1I = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
+			 T1J = BYTWJ(&(W[TWVL * 72]), T1I);
+			 T1K = VSUB(T1H, T1J);
+			 T5S = VADD(T1H, T1J);
+		    }
+		    {
+			 V T1F, T1Q, T7o, T7p;
+			 T1F = VFNMS(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1z));
+			 T1Q = VFMA(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
+			 T1R = VSUB(T1F, T1Q);
+			 T4m = VADD(T1Q, T1F);
+			 T7o = VADD(T5S, T5T);
+			 T7p = VADD(T5V, T5W);
+			 T7q = VADD(T7o, T7p);
+			 T80 = VSUB(T7p, T7o);
+		    }
+		    {
+			 V T2h, T2i, T5U, T5X;
+			 T2h = VFNMS(LDK(KP382683432), T1P, VMUL(LDK(KP923879532), T1K));
+			 T2i = VFMA(LDK(KP923879532), T1z, VMUL(LDK(KP382683432), T1E));
+			 T2j = VADD(T2h, T2i);
+			 T4p = VSUB(T2i, T2h);
+			 T5U = VSUB(T5S, T5T);
+			 T5X = VSUB(T5V, T5W);
+			 T5Y = VMUL(LDK(KP707106781), VADD(T5U, T5X));
+			 T63 = VMUL(LDK(KP707106781), VSUB(T5X, T5U));
+		    }
+	       }
+	       {
+		    V T2H, T69, T2X, T6d, T2M, T6a, T2S, T6c;
+		    {
+			 V T2E, T2G, T2D, T2F;
+			 T2D = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T2E = BYTWJ(&(W[TWVL * 4]), T2D);
+			 T2F = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
+			 T2G = BYTWJ(&(W[TWVL * 68]), T2F);
+			 T2H = VSUB(T2E, T2G);
+			 T69 = VADD(T2E, T2G);
+		    }
+		    {
+			 V T2U, T2W, T2T, T2V;
+			 T2T = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			 T2U = BYTWJ(&(W[TWVL * 20]), T2T);
+			 T2V = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
+			 T2W = BYTWJ(&(W[TWVL * 84]), T2V);
+			 T2X = VSUB(T2U, T2W);
+			 T6d = VADD(T2U, T2W);
+		    }
+		    {
+			 V T2J, T2L, T2I, T2K;
+			 T2I = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			 T2J = BYTWJ(&(W[TWVL * 36]), T2I);
+			 T2K = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
+			 T2L = BYTWJ(&(W[TWVL * 100]), T2K);
+			 T2M = VSUB(T2J, T2L);
+			 T6a = VADD(T2J, T2L);
+		    }
+		    {
+			 V T2P, T2R, T2O, T2Q;
+			 T2O = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
+			 T2P = BYTWJ(&(W[TWVL * 116]), T2O);
+			 T2Q = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+			 T2R = BYTWJ(&(W[TWVL * 52]), T2Q);
+			 T2S = VSUB(T2P, T2R);
+			 T6c = VADD(T2P, T2R);
+		    }
+		    {
+			 V T2N, T2Y, T7v, T7w;
+			 T2N = VFNMS(LDK(KP382683432), T2M, VMUL(LDK(KP923879532), T2H));
+			 T2Y = VFMA(LDK(KP923879532), T2S, VMUL(LDK(KP382683432), T2X));
+			 T2Z = VADD(T2N, T2Y);
+			 T4w = VSUB(T2Y, T2N);
+			 T7v = VADD(T69, T6a);
+			 T7w = VADD(T6c, T6d);
+			 T7x = VADD(T7v, T7w);
+			 T83 = VSUB(T7w, T7v);
+		    }
+		    {
+			 V T31, T32, T6b, T6e;
+			 T31 = VFNMS(LDK(KP923879532), T2X, VMUL(LDK(KP382683432), T2S));
+			 T32 = VFMA(LDK(KP382683432), T2H, VMUL(LDK(KP923879532), T2M));
+			 T33 = VSUB(T31, T32);
+			 T4t = VADD(T32, T31);
+			 T6b = VSUB(T69, T6a);
+			 T6e = VSUB(T6c, T6d);
+			 T6f = VMUL(LDK(KP707106781), VADD(T6b, T6e));
+			 T6k = VMUL(LDK(KP707106781), VSUB(T6e, T6b));
+		    }
+	       }
+	       {
+		    V T7k, T7M, T7R, T7T, T7z, T7I, T7H, T7N, T7O, T7S;
+		    {
+			 V T7g, T7j, T7P, T7Q;
+			 T7g = VADD(T7e, T7f);
+			 T7j = VADD(T7h, T7i);
+			 T7k = VSUB(T7g, T7j);
+			 T7M = VADD(T7g, T7j);
+			 T7P = VADD(T7n, T7q);
+			 T7Q = VADD(T7u, T7x);
+			 T7R = VADD(T7P, T7Q);
+			 T7T = VBYI(VSUB(T7Q, T7P));
+		    }
+		    {
+			 V T7r, T7y, T7D, T7G;
+			 T7r = VSUB(T7n, T7q);
+			 T7y = VSUB(T7u, T7x);
+			 T7z = VMUL(LDK(KP707106781), VADD(T7r, T7y));
+			 T7I = VMUL(LDK(KP707106781), VSUB(T7y, T7r));
+			 T7D = VADD(T7B, T7C);
+			 T7G = VADD(T7E, T7F);
+			 T7H = VSUB(T7D, T7G);
+			 T7N = VADD(T7G, T7D);
+		    }
+		    T7O = VADD(T7M, T7N);
+		    ST(&(x[WS(rs, 32)]), VSUB(T7O, T7R), ms, &(x[0]));
+		    ST(&(x[0]), VADD(T7O, T7R), ms, &(x[0]));
+		    T7S = VSUB(T7M, T7N);
+		    ST(&(x[WS(rs, 48)]), VSUB(T7S, T7T), ms, &(x[0]));
+		    ST(&(x[WS(rs, 16)]), VADD(T7S, T7T), ms, &(x[0]));
+		    {
+			 V T7A, T7J, T7K, T7L;
+			 T7A = VADD(T7k, T7z);
+			 T7J = VBYI(VADD(T7H, T7I));
+			 ST(&(x[WS(rs, 56)]), VSUB(T7A, T7J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VADD(T7A, T7J), ms, &(x[0]));
+			 T7K = VSUB(T7k, T7z);
+			 T7L = VBYI(VSUB(T7I, T7H));
+			 ST(&(x[WS(rs, 40)]), VSUB(T7K, T7L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 24)]), VADD(T7K, T7L), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T7Y, T8j, T8c, T8k, T85, T8g, T89, T8h;
+		    {
+			 V T7U, T7X, T8a, T8b;
+			 T7U = VSUB(T7e, T7f);
+			 T7X = VMUL(LDK(KP707106781), VADD(T7V, T7W));
+			 T7Y = VADD(T7U, T7X);
+			 T8j = VSUB(T7U, T7X);
+			 T8a = VFNMS(LDK(KP382683432), T7Z, VMUL(LDK(KP923879532), T80));
+			 T8b = VFMA(LDK(KP382683432), T82, VMUL(LDK(KP923879532), T83));
+			 T8c = VADD(T8a, T8b);
+			 T8k = VSUB(T8b, T8a);
+		    }
+		    {
+			 V T81, T84, T87, T88;
+			 T81 = VFMA(LDK(KP923879532), T7Z, VMUL(LDK(KP382683432), T80));
+			 T84 = VFNMS(LDK(KP382683432), T83, VMUL(LDK(KP923879532), T82));
+			 T85 = VADD(T81, T84);
+			 T8g = VSUB(T84, T81);
+			 T87 = VSUB(T7i, T7h);
+			 T88 = VMUL(LDK(KP707106781), VSUB(T7W, T7V));
+			 T89 = VADD(T87, T88);
+			 T8h = VSUB(T88, T87);
+		    }
+		    {
+			 V T86, T8d, T8m, T8n;
+			 T86 = VADD(T7Y, T85);
+			 T8d = VBYI(VADD(T89, T8c));
+			 ST(&(x[WS(rs, 60)]), VSUB(T86, T8d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T86, T8d), ms, &(x[0]));
+			 T8m = VBYI(VADD(T8h, T8g));
+			 T8n = VADD(T8j, T8k);
+			 ST(&(x[WS(rs, 12)]), VADD(T8m, T8n), ms, &(x[0]));
+			 ST(&(x[WS(rs, 52)]), VSUB(T8n, T8m), ms, &(x[0]));
+		    }
+		    {
+			 V T8e, T8f, T8i, T8l;
+			 T8e = VSUB(T7Y, T85);
+			 T8f = VBYI(VSUB(T8c, T89));
+			 ST(&(x[WS(rs, 36)]), VSUB(T8e, T8f), ms, &(x[0]));
+			 ST(&(x[WS(rs, 28)]), VADD(T8e, T8f), ms, &(x[0]));
+			 T8i = VBYI(VSUB(T8g, T8h));
+			 T8l = VSUB(T8j, T8k);
+			 ST(&(x[WS(rs, 20)]), VADD(T8i, T8l), ms, &(x[0]));
+			 ST(&(x[WS(rs, 44)]), VSUB(T8l, T8i), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
+		    {
+			 V T5y, T5N, T6t, T6w;
+			 T5y = VADD(T5q, T5x);
+			 T5N = VADD(T5F, T5M);
+			 T5O = VADD(T5y, T5N);
+			 T6H = VSUB(T5y, T5N);
+			 T6t = VADD(T6r, T6s);
+			 T6w = VADD(T6u, T6v);
+			 T6x = VADD(T6t, T6w);
+			 T6F = VSUB(T6w, T6t);
+			 {
+			      V T65, T6y, T6m, T6z;
+			      {
+				   V T5Z, T64, T6g, T6l;
+				   T5Z = VADD(T5R, T5Y);
+				   T64 = VADD(T62, T63);
+				   T65 = VFMA(LDK(KP980785280), T5Z, VMUL(LDK(KP195090322), T64));
+				   T6y = VFNMS(LDK(KP195090322), T5Z, VMUL(LDK(KP980785280), T64));
+				   T6g = VADD(T68, T6f);
+				   T6l = VADD(T6j, T6k);
+				   T6m = VFNMS(LDK(KP195090322), T6l, VMUL(LDK(KP980785280), T6g));
+				   T6z = VFMA(LDK(KP195090322), T6g, VMUL(LDK(KP980785280), T6l));
+			      }
+			      T6n = VADD(T65, T6m);
+			      T6I = VSUB(T6z, T6y);
+			      T6A = VADD(T6y, T6z);
+			      T6E = VSUB(T6m, T65);
+			 }
+		    }
+		    {
+			 V T6o, T6B, T6K, T6L;
+			 T6o = VADD(T5O, T6n);
+			 T6B = VBYI(VADD(T6x, T6A));
+			 ST(&(x[WS(rs, 62)]), VSUB(T6o, T6B), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T6o, T6B), ms, &(x[0]));
+			 T6K = VBYI(VADD(T6F, T6E));
+			 T6L = VADD(T6H, T6I);
+			 ST(&(x[WS(rs, 14)]), VADD(T6K, T6L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 50)]), VSUB(T6L, T6K), ms, &(x[0]));
+		    }
+		    {
+			 V T6C, T6D, T6G, T6J;
+			 T6C = VSUB(T5O, T6n);
+			 T6D = VBYI(VSUB(T6A, T6x));
+			 ST(&(x[WS(rs, 34)]), VSUB(T6C, T6D), ms, &(x[0]));
+			 ST(&(x[WS(rs, 30)]), VADD(T6C, T6D), ms, &(x[0]));
+			 T6G = VBYI(VSUB(T6E, T6F));
+			 T6J = VSUB(T6H, T6I);
+			 ST(&(x[WS(rs, 18)]), VADD(T6G, T6J), ms, &(x[0]));
+			 ST(&(x[WS(rs, 46)]), VSUB(T6J, T6G), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
+		    {
+			 V T6M, T6N, T6X, T6Y;
+			 T6M = VSUB(T5q, T5x);
+			 T6N = VSUB(T6v, T6u);
+			 T6O = VADD(T6M, T6N);
+			 T79 = VSUB(T6M, T6N);
+			 T6X = VSUB(T6s, T6r);
+			 T6Y = VSUB(T5M, T5F);
+			 T6Z = VADD(T6X, T6Y);
+			 T77 = VSUB(T6Y, T6X);
+			 {
+			      V T6R, T70, T6U, T71;
+			      {
+				   V T6P, T6Q, T6S, T6T;
+				   T6P = VSUB(T5R, T5Y);
+				   T6Q = VSUB(T63, T62);
+				   T6R = VFMA(LDK(KP831469612), T6P, VMUL(LDK(KP555570233), T6Q));
+				   T70 = VFNMS(LDK(KP555570233), T6P, VMUL(LDK(KP831469612), T6Q));
+				   T6S = VSUB(T68, T6f);
+				   T6T = VSUB(T6k, T6j);
+				   T6U = VFNMS(LDK(KP555570233), T6T, VMUL(LDK(KP831469612), T6S));
+				   T71 = VFMA(LDK(KP555570233), T6S, VMUL(LDK(KP831469612), T6T));
+			      }
+			      T6V = VADD(T6R, T6U);
+			      T7a = VSUB(T71, T70);
+			      T72 = VADD(T70, T71);
+			      T76 = VSUB(T6U, T6R);
+			 }
+		    }
+		    {
+			 V T6W, T73, T7c, T7d;
+			 T6W = VADD(T6O, T6V);
+			 T73 = VBYI(VADD(T6Z, T72));
+			 ST(&(x[WS(rs, 58)]), VSUB(T6W, T73), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(T6W, T73), ms, &(x[0]));
+			 T7c = VBYI(VADD(T77, T76));
+			 T7d = VADD(T79, T7a);
+			 ST(&(x[WS(rs, 10)]), VADD(T7c, T7d), ms, &(x[0]));
+			 ST(&(x[WS(rs, 54)]), VSUB(T7d, T7c), ms, &(x[0]));
+		    }
+		    {
+			 V T74, T75, T78, T7b;
+			 T74 = VSUB(T6O, T6V);
+			 T75 = VBYI(VSUB(T72, T6Z));
+			 ST(&(x[WS(rs, 38)]), VSUB(T74, T75), ms, &(x[0]));
+			 ST(&(x[WS(rs, 26)]), VADD(T74, T75), ms, &(x[0]));
+			 T78 = VBYI(VSUB(T76, T77));
+			 T7b = VSUB(T79, T7a);
+			 ST(&(x[WS(rs, 22)]), VADD(T78, T7b), ms, &(x[0]));
+			 ST(&(x[WS(rs, 42)]), VSUB(T7b, T78), ms, &(x[0]));
+		    }
+	       }
+	       {
+		    V T4k, T5h, T4R, T59, T4H, T5j, T4P, T4Y, T4z, T4S, T4K, T4O, T55, T5k, T5c;
+		    V T5g;
+		    {
+			 V T4c, T57, T4j, T58, T4f, T4i;
+			 T4c = VADD(T4a, T4b);
+			 T57 = VSUB(T4C, T4B);
+			 T4f = VFMA(LDK(KP831469612), T4d, VMUL(LDK(KP555570233), T4e));
+			 T4i = VFNMS(LDK(KP555570233), T4h, VMUL(LDK(KP831469612), T4g));
+			 T4j = VADD(T4f, T4i);
+			 T58 = VSUB(T4i, T4f);
+			 T4k = VADD(T4c, T4j);
+			 T5h = VSUB(T58, T57);
+			 T4R = VSUB(T4c, T4j);
+			 T59 = VADD(T57, T58);
+		    }
+		    {
+			 V T4D, T4W, T4G, T4X, T4E, T4F;
+			 T4D = VADD(T4B, T4C);
+			 T4W = VSUB(T4a, T4b);
+			 T4E = VFNMS(LDK(KP555570233), T4d, VMUL(LDK(KP831469612), T4e));
+			 T4F = VFMA(LDK(KP555570233), T4g, VMUL(LDK(KP831469612), T4h));
+			 T4G = VADD(T4E, T4F);
+			 T4X = VSUB(T4F, T4E);
+			 T4H = VADD(T4D, T4G);
+			 T5j = VSUB(T4W, T4X);
+			 T4P = VSUB(T4G, T4D);
+			 T4Y = VADD(T4W, T4X);
+		    }
+		    {
+			 V T4r, T4I, T4y, T4J;
+			 {
+			      V T4n, T4q, T4u, T4x;
+			      T4n = VADD(T4l, T4m);
+			      T4q = VADD(T4o, T4p);
+			      T4r = VFMA(LDK(KP956940335), T4n, VMUL(LDK(KP290284677), T4q));
+			      T4I = VFNMS(LDK(KP290284677), T4n, VMUL(LDK(KP956940335), T4q));
+			      T4u = VADD(T4s, T4t);
+			      T4x = VADD(T4v, T4w);
+			      T4y = VFNMS(LDK(KP290284677), T4x, VMUL(LDK(KP956940335), T4u));
+			      T4J = VFMA(LDK(KP290284677), T4u, VMUL(LDK(KP956940335), T4x));
+			 }
+			 T4z = VADD(T4r, T4y);
+			 T4S = VSUB(T4J, T4I);
+			 T4K = VADD(T4I, T4J);
+			 T4O = VSUB(T4y, T4r);
+		    }
+		    {
+			 V T51, T5a, T54, T5b;
+			 {
+			      V T4Z, T50, T52, T53;
+			      T4Z = VSUB(T4l, T4m);
+			      T50 = VSUB(T4p, T4o);
+			      T51 = VFMA(LDK(KP881921264), T4Z, VMUL(LDK(KP471396736), T50));
+			      T5a = VFNMS(LDK(KP471396736), T4Z, VMUL(LDK(KP881921264), T50));
+			      T52 = VSUB(T4s, T4t);
+			      T53 = VSUB(T4w, T4v);
+			      T54 = VFNMS(LDK(KP471396736), T53, VMUL(LDK(KP881921264), T52));
+			      T5b = VFMA(LDK(KP471396736), T52, VMUL(LDK(KP881921264), T53));
+			 }
+			 T55 = VADD(T51, T54);
+			 T5k = VSUB(T5b, T5a);
+			 T5c = VADD(T5a, T5b);
+			 T5g = VSUB(T54, T51);
+		    }
+		    {
+			 V T4A, T4L, T5i, T5l;
+			 T4A = VADD(T4k, T4z);
+			 T4L = VBYI(VADD(T4H, T4K));
+			 ST(&(x[WS(rs, 61)]), VSUB(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
+			 T5i = VBYI(VSUB(T5g, T5h));
+			 T5l = VSUB(T5j, T5k);
+			 ST(&(x[WS(rs, 21)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 43)]), VSUB(T5l, T5i), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5m, T5n, T4M, T4N;
+			 T5m = VBYI(VADD(T5h, T5g));
+			 T5n = VADD(T5j, T5k);
+			 ST(&(x[WS(rs, 11)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 53)]), VSUB(T5n, T5m), ms, &(x[WS(rs, 1)]));
+			 T4M = VSUB(T4k, T4z);
+			 T4N = VBYI(VSUB(T4K, T4H));
+			 ST(&(x[WS(rs, 35)]), VSUB(T4M, T4N), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T4Q, T4T, T56, T5d;
+			 T4Q = VBYI(VSUB(T4O, T4P));
+			 T4T = VSUB(T4R, T4S);
+			 ST(&(x[WS(rs, 19)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 45)]), VSUB(T4T, T4Q), ms, &(x[WS(rs, 1)]));
+			 T56 = VADD(T4Y, T55);
+			 T5d = VBYI(VADD(T59, T5c));
+			 ST(&(x[WS(rs, 59)]), VSUB(T56, T5d), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T5e, T5f, T4U, T4V;
+			 T5e = VSUB(T4Y, T55);
+			 T5f = VBYI(VSUB(T5c, T59));
+			 ST(&(x[WS(rs, 37)]), VSUB(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
+			 T4U = VBYI(VADD(T4P, T4O));
+			 T4V = VADD(T4R, T4S);
+			 ST(&(x[WS(rs, 13)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 51)]), VSUB(T4V, T4U), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	       {
+		    V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
+		    V T42;
+		    {
+			 V TE, T3T, T1t, T3U, T13, T1s;
+			 TE = VSUB(Tg, TD);
+			 T3T = VADD(T3o, T3h);
+			 T13 = VFMA(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
+			 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
+			 T1t = VSUB(T13, T1s);
+			 T3U = VADD(T1s, T13);
+			 T1u = VADD(TE, T1t);
+			 T43 = VSUB(T3U, T3T);
+			 T3D = VSUB(TE, T1t);
+			 T3V = VADD(T3T, T3U);
+		    }
+		    {
+			 V T3p, T3I, T3s, T3J, T3q, T3r;
+			 T3p = VSUB(T3h, T3o);
+			 T3I = VADD(Tg, TD);
+			 T3q = VFNMS(LDK(KP195090322), T12, VMUL(LDK(KP980785280), TV));
+			 T3r = VFMA(LDK(KP980785280), T1r, VMUL(LDK(KP195090322), T1k));
+			 T3s = VSUB(T3q, T3r);
+			 T3J = VADD(T3r, T3q);
+			 T3t = VADD(T3p, T3s);
+			 T45 = VSUB(T3I, T3J);
+			 T3B = VSUB(T3s, T3p);
+			 T3K = VADD(T3I, T3J);
+		    }
+		    {
+			 V T2l, T3u, T3c, T3v;
+			 {
+			      V T29, T2k, T30, T3b;
+			      T29 = VSUB(T1R, T28);
+			      T2k = VSUB(T2g, T2j);
+			      T2l = VFMA(LDK(KP634393284), T29, VMUL(LDK(KP773010453), T2k));
+			      T3u = VFNMS(LDK(KP634393284), T2k, VMUL(LDK(KP773010453), T29));
+			      T30 = VSUB(T2C, T2Z);
+			      T3b = VSUB(T33, T3a);
+			      T3c = VFNMS(LDK(KP634393284), T3b, VMUL(LDK(KP773010453), T30));
+			      T3v = VFMA(LDK(KP773010453), T3b, VMUL(LDK(KP634393284), T30));
+			 }
+			 T3d = VADD(T2l, T3c);
+			 T3E = VSUB(T3v, T3u);
+			 T3w = VADD(T3u, T3v);
+			 T3A = VSUB(T3c, T2l);
+		    }
+		    {
+			 V T3N, T3W, T3Q, T3X;
+			 {
+			      V T3L, T3M, T3O, T3P;
+			      T3L = VADD(T28, T1R);
+			      T3M = VADD(T2g, T2j);
+			      T3N = VFMA(LDK(KP098017140), T3L, VMUL(LDK(KP995184726), T3M));
+			      T3W = VFNMS(LDK(KP098017140), T3M, VMUL(LDK(KP995184726), T3L));
+			      T3O = VADD(T2C, T2Z);
+			      T3P = VADD(T3a, T33);
+			      T3Q = VFNMS(LDK(KP098017140), T3P, VMUL(LDK(KP995184726), T3O));
+			      T3X = VFMA(LDK(KP995184726), T3P, VMUL(LDK(KP098017140), T3O));
+			 }
+			 T3R = VADD(T3N, T3Q);
+			 T46 = VSUB(T3X, T3W);
+			 T3Y = VADD(T3W, T3X);
+			 T42 = VSUB(T3Q, T3N);
+		    }
+		    {
+			 V T3e, T3x, T44, T47;
+			 T3e = VADD(T1u, T3d);
+			 T3x = VBYI(VADD(T3t, T3w));
+			 ST(&(x[WS(rs, 57)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
+			 T44 = VBYI(VSUB(T42, T43));
+			 T47 = VSUB(T45, T46);
+			 ST(&(x[WS(rs, 17)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 47)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T48, T49, T3y, T3z;
+			 T48 = VBYI(VADD(T43, T42));
+			 T49 = VADD(T45, T46);
+			 ST(&(x[WS(rs, 15)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 49)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
+			 T3y = VSUB(T1u, T3d);
+			 T3z = VBYI(VSUB(T3w, T3t));
+			 ST(&(x[WS(rs, 39)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 25)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T3C, T3F, T3S, T3Z;
+			 T3C = VBYI(VSUB(T3A, T3B));
+			 T3F = VSUB(T3D, T3E);
+			 ST(&(x[WS(rs, 23)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 41)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
+			 T3S = VADD(T3K, T3R);
+			 T3Z = VBYI(VADD(T3V, T3Y));
+			 ST(&(x[WS(rs, 63)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T40, T41, T3G, T3H;
+			 T40 = VSUB(T3K, T3R);
+			 T41 = VBYI(VSUB(T3Y, T3V));
+			 ST(&(x[WS(rs, 33)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 31)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
+			 T3G = VBYI(VADD(T3B, T3A));
+			 T3H = VADD(T3D, T3E);
+			 ST(&(x[WS(rs, 9)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 55)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     VTW(0, 8),
+     VTW(0, 9),
+     VTW(0, 10),
+     VTW(0, 11),
+     VTW(0, 12),
+     VTW(0, 13),
+     VTW(0, 14),
+     VTW(0, 15),
+     VTW(0, 16),
+     VTW(0, 17),
+     VTW(0, 18),
+     VTW(0, 19),
+     VTW(0, 20),
+     VTW(0, 21),
+     VTW(0, 22),
+     VTW(0, 23),
+     VTW(0, 24),
+     VTW(0, 25),
+     VTW(0, 26),
+     VTW(0, 27),
+     VTW(0, 28),
+     VTW(0, 29),
+     VTW(0, 30),
+     VTW(0, 31),
+     VTW(0, 32),
+     VTW(0, 33),
+     VTW(0, 34),
+     VTW(0, 35),
+     VTW(0, 36),
+     VTW(0, 37),
+     VTW(0, 38),
+     VTW(0, 39),
+     VTW(0, 40),
+     VTW(0, 41),
+     VTW(0, 42),
+     VTW(0, 43),
+     VTW(0, 44),
+     VTW(0, 45),
+     VTW(0, 46),
+     VTW(0, 47),
+     VTW(0, 48),
+     VTW(0, 49),
+     VTW(0, 50),
+     VTW(0, 51),
+     VTW(0, 52),
+     VTW(0, 53),
+     VTW(0, 54),
+     VTW(0, 55),
+     VTW(0, 56),
+     VTW(0, 57),
+     VTW(0, 58),
+     VTW(0, 59),
+     VTW(0, 60),
+     VTW(0, 61),
+     VTW(0, 62),
+     VTW(0, 63),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 64, XSIMD_STRING("t2fv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_64) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:35 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2fv_8 -include t2f.h */
+
+/*
+ * This function contains 33 FP additions, 24 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
+ * 36 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, Th, Tj, T5, T7, Ta, Tc;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T3, Ti, Tk, T6, T8, Tb, Td;
+		    T3 = BYTWJ(&(W[TWVL * 6]), T2);
+		    Ti = BYTWJ(&(W[TWVL * 2]), Th);
+		    Tk = BYTWJ(&(W[TWVL * 10]), Tj);
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    {
+			 V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts;
+			 Tq = VADD(T1, T3);
+			 T4 = VSUB(T1, T3);
+			 Tr = VADD(Ti, Tk);
+			 Tl = VSUB(Ti, Tk);
+			 Tt = VADD(T6, T8);
+			 T9 = VSUB(T6, T8);
+			 Tu = VADD(Tb, Td);
+			 Te = VSUB(Tb, Td);
+			 Tw = VSUB(Tq, Tr);
+			 Ts = VADD(Tq, Tr);
+			 {
+			      V Tx, Tv, Tm, Tf;
+			      Tx = VSUB(Tu, Tt);
+			      Tv = VADD(Tt, Tu);
+			      Tm = VSUB(Te, T9);
+			      Tf = VADD(T9, Te);
+			      {
+				   V Tp, Tn, To, Tg;
+				   ST(&(x[WS(rs, 2)]), VFMAI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[WS(rs, 6)]), VFNMSI(Tx, Tw), ms, &(x[0]));
+				   ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+				   ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+				   Tp = VFMA(LDK(KP707106781), Tm, Tl);
+				   Tn = VFNMS(LDK(KP707106781), Tm, Tl);
+				   To = VFNMS(LDK(KP707106781), Tf, T4);
+				   Tg = VFMA(LDK(KP707106781), Tf, T4);
+				   ST(&(x[WS(rs, 5)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 3)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 7)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 1)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2fv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2fv_8 -include t2f.h */
+
+/*
+ * This function contains 33 FP additions, 16 FP multiplications,
+ * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t2f.h"
+
+static void t2fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T4, Tq, Tm, Tr, T9, Tt, Te, Tu, T1, T3, T2;
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       T3 = BYTWJ(&(W[TWVL * 6]), T2);
+	       T4 = VSUB(T1, T3);
+	       Tq = VADD(T1, T3);
+	       {
+		    V Tj, Tl, Ti, Tk;
+		    Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tj = BYTWJ(&(W[TWVL * 2]), Ti);
+		    Tk = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Tl = BYTWJ(&(W[TWVL * 10]), Tk);
+		    Tm = VSUB(Tj, Tl);
+		    Tr = VADD(Tj, Tl);
+	       }
+	       {
+		    V T6, T8, T5, T7;
+		    T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T6 = BYTWJ(&(W[0]), T5);
+		    T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    T8 = BYTWJ(&(W[TWVL * 8]), T7);
+		    T9 = VSUB(T6, T8);
+		    Tt = VADD(T6, T8);
+	       }
+	       {
+		    V Tb, Td, Ta, Tc;
+		    Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Tb = BYTWJ(&(W[TWVL * 12]), Ta);
+		    Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = BYTWJ(&(W[TWVL * 4]), Tc);
+		    Te = VSUB(Tb, Td);
+		    Tu = VADD(Tb, Td);
+	       }
+	       {
+		    V Ts, Tv, Tw, Tx;
+		    Ts = VADD(Tq, Tr);
+		    Tv = VADD(Tt, Tu);
+		    ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
+		    ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
+		    Tw = VSUB(Tq, Tr);
+		    Tx = VBYI(VSUB(Tu, Tt));
+		    ST(&(x[WS(rs, 6)]), VSUB(Tw, Tx), ms, &(x[0]));
+		    ST(&(x[WS(rs, 2)]), VADD(Tw, Tx), ms, &(x[0]));
+		    {
+			 V Tg, To, Tn, Tp, Tf, Th;
+			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
+			 Tg = VADD(T4, Tf);
+			 To = VSUB(T4, Tf);
+			 Th = VMUL(LDK(KP707106781), VSUB(Te, T9));
+			 Tn = VBYI(VSUB(Th, Tm));
+			 Tp = VBYI(VADD(Tm, Th));
+			 ST(&(x[WS(rs, 7)]), VSUB(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(Tg, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VSUB(To, Tp), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 2),
+     VTW(0, 3),
+     VTW(0, 4),
+     VTW(0, 5),
+     VTW(0, 6),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2fv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2fv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:26 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include ts.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 120 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T34, T30, T2N, T2v, T2M, T2g, T3V, T3X, T32, T2U, T33, T2X, T2O, T2K, T3P;
+	       V T3R;
+	       {
+		    V T2, Tf, TM, TO, T3, T6, T5, Th;
+		    T2 = LDW(&(W[0]));
+		    Tf = LDW(&(W[TWVL * 2]));
+		    TM = LDW(&(W[TWVL * 6]));
+		    TO = LDW(&(W[TWVL * 7]));
+		    T3 = LDW(&(W[TWVL * 4]));
+		    T6 = LDW(&(W[TWVL * 5]));
+		    T5 = LDW(&(W[TWVL * 1]));
+		    Th = LDW(&(W[TWVL * 3]));
+		    {
+			 V TW, TZ, Te, T1U, T3A, T3L, T2D, T1G, T3h, T2A, T2B, T1R, T3i, T2I, Tx;
+			 V T3M, T1Z, T3w, TL, T26, T25, T37, T1l, T2q, T1d, T2o, T2l, T3c, T1r, T2s;
+			 V TX, T10, TV, T2a;
+			 {
+			      V Tz, TP, TT, Tq, TF, Tu, TI, Tm, TC, T1j, T1p, T1m, T1f, T1O, T1M;
+			      V T1K, T2F, Tj, Tn, T1Q, T2G, Tk, T1V, Tr, Tv;
+			      {
+				   V T1, Ti, Tb, T3z, T8, Tc, T1u, T1D, T1L, T1z, T9, T3x, T1v, T1w, T1A;
+				   V T1E;
+				   {
+					V T7, T1i, T1e, T1C, T1y;
+					T1 = LD(&(ri[0]), ms, &(ri[0]));
+					{
+					     V Tg, TN, TS, Tp;
+					     Tg = VMUL(T2, Tf);
+					     TN = VMUL(T2, TM);
+					     TS = VMUL(T2, TO);
+					     Tp = VMUL(Tf, T3);
+					     {
+						  V T4, Tt, Ta, Tl;
+						  T4 = VMUL(T2, T3);
+						  Tt = VMUL(Tf, T6);
+						  Ta = VMUL(T2, T6);
+						  Tl = VMUL(T2, Th);
+						  Ti = VFNMS(T5, Th, Tg);
+						  Tz = VFMA(T5, Th, Tg);
+						  TP = VFMA(T5, TO, TN);
+						  TT = VFNMS(T5, TM, TS);
+						  TW = VFMA(Th, T6, Tp);
+						  Tq = VFNMS(Th, T6, Tp);
+						  TF = VFNMS(T5, T6, T4);
+						  T7 = VFMA(T5, T6, T4);
+						  Tu = VFMA(Th, T3, Tt);
+						  TZ = VFNMS(Th, T3, Tt);
+						  TI = VFMA(T5, T3, Ta);
+						  Tb = VFNMS(T5, T3, Ta);
+						  Tm = VFMA(T5, Tf, Tl);
+						  TC = VFNMS(T5, Tf, Tl);
+						  T1i = VMUL(Ti, T6);
+						  T1e = VMUL(Ti, T3);
+						  T1C = VMUL(Tz, T6);
+						  T1y = VMUL(Tz, T3);
+						  T3z = LD(&(ii[0]), ms, &(ii[0]));
+					     }
+					}
+					T8 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+					Tc = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+					T1u = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+					T1j = VFNMS(Tm, T3, T1i);
+					T1p = VFMA(Tm, T3, T1i);
+					T1m = VFNMS(Tm, T6, T1e);
+					T1f = VFMA(Tm, T6, T1e);
+					T1D = VFNMS(TC, T3, T1C);
+					T1O = VFMA(TC, T3, T1C);
+					T1L = VFNMS(TC, T6, T1y);
+					T1z = VFMA(TC, T6, T1y);
+					T9 = VMUL(T7, T8);
+					T3x = VMUL(T7, Tc);
+					T1v = VMUL(TM, T1u);
+					T1w = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+					T1A = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+					T1E = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+				   }
+				   {
+					V T1x, T2x, T1F, T2z, T1N, T1P;
+					{
+					     V T1H, T1J, T1I, T2E;
+					     {
+						  V Td, T3y, T2w, T1B, T2y;
+						  T1H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+						  T1J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+						  Td = VFMA(Tb, Tc, T9);
+						  T3y = VFNMS(Tb, T8, T3x);
+						  T1M = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+						  T1x = VFMA(TO, T1w, T1v);
+						  T2w = VMUL(TM, T1w);
+						  T1B = VMUL(T1z, T1A);
+						  T2y = VMUL(T1z, T1E);
+						  T1I = VMUL(Tf, T1H);
+						  T2E = VMUL(Tf, T1J);
+						  Te = VADD(T1, Td);
+						  T1U = VSUB(T1, Td);
+						  T3A = VADD(T3y, T3z);
+						  T3L = VSUB(T3z, T3y);
+						  T2x = VFNMS(TO, T1u, T2w);
+						  T1F = VFMA(T1D, T1E, T1B);
+						  T2z = VFNMS(T1D, T1A, T2y);
+						  T1N = VMUL(T1L, T1M);
+						  T1P = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+					     }
+					     T1K = VFMA(Th, T1J, T1I);
+					     T2F = VFNMS(Th, T1H, T2E);
+					}
+					Tj = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+					Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+					T2D = VSUB(T1x, T1F);
+					T1G = VADD(T1x, T1F);
+					T3h = VADD(T2x, T2z);
+					T2A = VSUB(T2x, T2z);
+					T1Q = VFMA(T1O, T1P, T1N);
+					T2G = VMUL(T1L, T1P);
+					Tk = VMUL(Ti, Tj);
+					T1V = VMUL(Ti, Tn);
+					Tr = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+					Tv = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+				   }
+			      }
+			      {
+				   V TE, T22, T15, T17, TK, T16, T2h, T24, T19, T1b;
+				   {
+					V To, T1W, TG, TJ, Tw, T1Y, TH, T23;
+					{
+					     V TA, TD, TB, T21, T2H, Ts, T1X;
+					     TA = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+					     TD = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+					     T2B = VSUB(T1K, T1Q);
+					     T1R = VADD(T1K, T1Q);
+					     T2H = VFNMS(T1O, T1M, T2G);
+					     To = VFMA(Tm, Tn, Tk);
+					     T1W = VFNMS(Tm, Tj, T1V);
+					     Ts = VMUL(Tq, Tr);
+					     T1X = VMUL(Tq, Tv);
+					     TB = VMUL(Tz, TA);
+					     T21 = VMUL(Tz, TD);
+					     TG = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+					     T3i = VADD(T2F, T2H);
+					     T2I = VSUB(T2F, T2H);
+					     TJ = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+					     Tw = VFMA(Tu, Tv, Ts);
+					     T1Y = VFNMS(Tu, Tr, T1X);
+					     TE = VFMA(TC, TD, TB);
+					     T22 = VFNMS(TC, TA, T21);
+					     TH = VMUL(TF, TG);
+					}
+					T15 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+					T17 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+					T23 = VMUL(TF, TJ);
+					Tx = VADD(To, Tw);
+					T3M = VSUB(To, Tw);
+					T1Z = VSUB(T1W, T1Y);
+					T3w = VADD(T1W, T1Y);
+					TK = VFMA(TI, TJ, TH);
+					T16 = VMUL(T2, T15);
+					T2h = VMUL(T2, T17);
+					T24 = VFNMS(TI, TG, T23);
+					T19 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+					T1b = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+				   }
+				   {
+					V T1g, T1k, T18, T2i, T1a, T2j, T1h, T2p, T1n, T1q;
+					T1g = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+					T1k = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+					TL = VADD(TE, TK);
+					T26 = VSUB(TE, TK);
+					T18 = VFMA(T5, T17, T16);
+					T2i = VFNMS(T5, T15, T2h);
+					T25 = VSUB(T22, T24);
+					T37 = VADD(T22, T24);
+					T1a = VMUL(T3, T19);
+					T2j = VMUL(T3, T1b);
+					T1h = VMUL(T1f, T1g);
+					T2p = VMUL(T1f, T1k);
+					T1n = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+					T1q = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+					{
+					     V TQ, TU, TR, T29;
+					     {
+						  V T1c, T2k, T1o, T2r;
+						  TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+						  TU = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+						  T1c = VFMA(T6, T1b, T1a);
+						  T2k = VFNMS(T6, T19, T2j);
+						  T1l = VFMA(T1j, T1k, T1h);
+						  T2q = VFNMS(T1j, T1g, T2p);
+						  T1o = VMUL(T1m, T1n);
+						  T2r = VMUL(T1m, T1q);
+						  TR = VMUL(TP, TQ);
+						  T29 = VMUL(TP, TU);
+						  T1d = VADD(T18, T1c);
+						  T2o = VSUB(T18, T1c);
+						  T2l = VSUB(T2i, T2k);
+						  T3c = VADD(T2i, T2k);
+						  T1r = VFMA(T1p, T1q, T1o);
+						  T2s = VFNMS(T1p, T1n, T2r);
+						  TX = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+						  T10 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+					     }
+					     TV = VFMA(TT, TU, TR);
+					     T2a = VFNMS(TT, TQ, T29);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T36, Ty, T3B, T3G, T1s, T2m, T2t, T3d, TY, T2b, T3g, T1S, T3s, T3j;
+			      T36 = VSUB(Te, Tx);
+			      Ty = VADD(Te, Tx);
+			      T3B = VADD(T3w, T3A);
+			      T3G = VSUB(T3A, T3w);
+			      T1s = VADD(T1l, T1r);
+			      T2m = VSUB(T1l, T1r);
+			      T2t = VSUB(T2q, T2s);
+			      T3d = VADD(T2q, T2s);
+			      TY = VMUL(TW, TX);
+			      T2b = VMUL(TW, T10);
+			      T3g = VSUB(T1G, T1R);
+			      T1S = VADD(T1G, T1R);
+			      T3s = VADD(T3h, T3i);
+			      T3j = VSUB(T3h, T3i);
+			      {
+				   V T3D, T1T, T3u, T3t, T28, T12, T38, T2d, T3n, T3f;
+				   {
+					V T1t, T3b, T3e, T3r, T11, T2c;
+					T1t = VADD(T1d, T1s);
+					T3b = VSUB(T1d, T1s);
+					T3e = VSUB(T3c, T3d);
+					T3r = VADD(T3c, T3d);
+					T11 = VFMA(TZ, T10, TY);
+					T2c = VFNMS(TZ, TX, T2b);
+					T3D = VSUB(T1S, T1t);
+					T1T = VADD(T1t, T1S);
+					T3u = VADD(T3r, T3s);
+					T3t = VSUB(T3r, T3s);
+					T28 = VSUB(TV, T11);
+					T12 = VADD(TV, T11);
+					T38 = VADD(T2a, T2c);
+					T2d = VSUB(T2a, T2c);
+					T3n = VSUB(T3e, T3b);
+					T3f = VADD(T3b, T3e);
+				   }
+				   {
+					V T2Q, T20, T3N, T3T, T2J, T2C, T2W, T2V, T3O, T2f, T3U, T2T;
+					{
+					     V T2R, T27, T2e, T2S, T13, T3F;
+					     T2Q = VADD(T1U, T1Z);
+					     T20 = VSUB(T1U, T1Z);
+					     T3N = VSUB(T3L, T3M);
+					     T3T = VADD(T3M, T3L);
+					     T13 = VADD(TL, T12);
+					     T3F = VSUB(T12, TL);
+					     {
+						  V T3v, T39, T3o, T3k;
+						  T3v = VADD(T37, T38);
+						  T39 = VSUB(T37, T38);
+						  T3o = VADD(T3g, T3j);
+						  T3k = VSUB(T3g, T3j);
+						  {
+						       V T3H, T3J, T14, T3q;
+						       T3H = VADD(T3F, T3G);
+						       T3J = VSUB(T3G, T3F);
+						       T14 = VADD(Ty, T13);
+						       T3q = VSUB(Ty, T13);
+						       {
+							    V T3a, T3m, T3C, T3E;
+							    T3a = VADD(T36, T39);
+							    T3m = VSUB(T36, T39);
+							    T3C = VADD(T3v, T3B);
+							    T3E = VSUB(T3B, T3v);
+							    {
+								 V T3I, T3p, T3l, T3K;
+								 T3I = VADD(T3n, T3o);
+								 T3p = VSUB(T3n, T3o);
+								 T3l = VADD(T3f, T3k);
+								 T3K = VSUB(T3k, T3f);
+								 ST(&(ri[WS(rs, 4)]), VADD(T3q, T3t), ms, &(ri[0]));
+								 ST(&(ri[WS(rs, 12)]), VSUB(T3q, T3t), ms, &(ri[0]));
+								 ST(&(ri[0]), VADD(T14, T1T), ms, &(ri[0]));
+								 ST(&(ri[WS(rs, 8)]), VSUB(T14, T1T), ms, &(ri[0]));
+								 ST(&(ii[WS(rs, 4)]), VADD(T3D, T3E), ms, &(ii[0]));
+								 ST(&(ii[WS(rs, 12)]), VSUB(T3E, T3D), ms, &(ii[0]));
+								 ST(&(ii[0]), VADD(T3u, T3C), ms, &(ii[0]));
+								 ST(&(ii[WS(rs, 8)]), VSUB(T3C, T3u), ms, &(ii[0]));
+								 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
+								 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
+								 ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
+								 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
+								 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
+								 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
+								 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
+								 ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
+								 T2R = VADD(T26, T25);
+								 T27 = VSUB(T25, T26);
+								 T2e = VADD(T28, T2d);
+								 T2S = VSUB(T28, T2d);
+							    }
+						       }
+						  }
+					     }
+					     {
+						  V T2Y, T2Z, T2n, T2u;
+						  T2J = VSUB(T2D, T2I);
+						  T2Y = VADD(T2D, T2I);
+						  T2Z = VSUB(T2A, T2B);
+						  T2C = VADD(T2A, T2B);
+						  T2W = VSUB(T2l, T2m);
+						  T2n = VADD(T2l, T2m);
+						  T2u = VSUB(T2o, T2t);
+						  T2V = VADD(T2o, T2t);
+						  T3O = VADD(T27, T2e);
+						  T2f = VSUB(T27, T2e);
+						  T34 = VFMA(LDK(KP414213562), T2Y, T2Z);
+						  T30 = VFNMS(LDK(KP414213562), T2Z, T2Y);
+						  T3U = VSUB(T2S, T2R);
+						  T2T = VADD(T2R, T2S);
+						  T2N = VFNMS(LDK(KP414213562), T2n, T2u);
+						  T2v = VFMA(LDK(KP414213562), T2u, T2n);
+					     }
+					}
+					T2M = VFNMS(LDK(KP707106781), T2f, T20);
+					T2g = VFMA(LDK(KP707106781), T2f, T20);
+					T3V = VFMA(LDK(KP707106781), T3U, T3T);
+					T3X = VFNMS(LDK(KP707106781), T3U, T3T);
+					T32 = VFNMS(LDK(KP707106781), T2T, T2Q);
+					T2U = VFMA(LDK(KP707106781), T2T, T2Q);
+					T33 = VFNMS(LDK(KP414213562), T2V, T2W);
+					T2X = VFMA(LDK(KP414213562), T2W, T2V);
+					T2O = VFMA(LDK(KP414213562), T2C, T2J);
+					T2K = VFNMS(LDK(KP414213562), T2J, T2C);
+					T3P = VFMA(LDK(KP707106781), T3O, T3N);
+					T3R = VFNMS(LDK(KP707106781), T3O, T3N);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T3Q, T35, T31, T3S;
+		    T3Q = VADD(T33, T34);
+		    T35 = VSUB(T33, T34);
+		    T31 = VADD(T2X, T30);
+		    T3S = VSUB(T30, T2X);
+		    {
+			 V T3W, T2P, T2L, T3Y;
+			 T3W = VSUB(T2O, T2N);
+			 T2P = VADD(T2N, T2O);
+			 T2L = VSUB(T2v, T2K);
+			 T3Y = VADD(T2v, T2K);
+			 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include ts.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 82 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
+	       V Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
+	       {
+		    V T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
+		    {
+			 V Th, Tn, Tj, Tm;
+			 T2 = LDW(&(W[0]));
+			 T5 = LDW(&(W[TWVL * 1]));
+			 Tg = LDW(&(W[TWVL * 2]));
+			 Ti = LDW(&(W[TWVL * 3]));
+			 Th = VMUL(T2, Tg);
+			 Tn = VMUL(T5, Tg);
+			 Tj = VMUL(T5, Ti);
+			 Tm = VMUL(T2, Ti);
+			 Tk = VSUB(Th, Tj);
+			 To = VADD(Tm, Tn);
+			 TE = VSUB(Tm, Tn);
+			 TC = VADD(Th, Tj);
+			 T6 = LDW(&(W[TWVL * 5]));
+			 T7 = VMUL(T5, T6);
+			 Tv = VMUL(Tg, T6);
+			 Ta = VMUL(T2, T6);
+			 Ts = VMUL(Ti, T6);
+			 T3 = LDW(&(W[TWVL * 4]));
+			 T4 = VMUL(T2, T3);
+			 Tw = VMUL(Ti, T3);
+			 Tb = VMUL(T5, T3);
+			 Tr = VMUL(Tg, T3);
+		    }
+		    T8 = VADD(T4, T7);
+		    TW = VSUB(Tv, Tw);
+		    TJ = VADD(Ta, Tb);
+		    Tt = VSUB(Tr, Ts);
+		    TU = VADD(Tr, Ts);
+		    Tc = VSUB(Ta, Tb);
+		    Tx = VADD(Tv, Tw);
+		    TH = VSUB(T4, T7);
+		    TN = LDW(&(W[TWVL * 6]));
+		    TO = LDW(&(W[TWVL * 7]));
+		    TP = VFMA(T2, TN, VMUL(T5, TO));
+		    TR = VFNMS(T5, TN, VMUL(T2, TO));
+		    {
+			 V T1d, T1e, T19, T1a;
+			 T1d = VMUL(Tk, T6);
+			 T1e = VMUL(To, T3);
+			 T1f = VSUB(T1d, T1e);
+			 T1k = VADD(T1d, T1e);
+			 T19 = VMUL(Tk, T3);
+			 T1a = VMUL(To, T6);
+			 T1b = VADD(T19, T1a);
+			 T1i = VSUB(T19, T1a);
+		    }
+		    {
+			 V T1w, T1x, T1s, T1t;
+			 T1w = VMUL(TC, T6);
+			 T1x = VMUL(TE, T3);
+			 T1y = VSUB(T1w, T1x);
+			 T1H = VADD(T1w, T1x);
+			 T1s = VMUL(TC, T3);
+			 T1t = VMUL(TE, T6);
+			 T1u = VADD(T1s, T1t);
+			 T1F = VSUB(T1s, T1t);
+		    }
+	       }
+	       {
+		    V Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
+		    V T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
+		    V T2S, T2T, T28, T2A, T2d, T2B;
+		    {
+			 V T1, T3d, Te, T3c, T9, Td;
+			 T1 = LD(&(ri[0]), ms, &(ri[0]));
+			 T3d = LD(&(ii[0]), ms, &(ii[0]));
+			 T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			 Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			 Te = VFMA(T8, T9, VMUL(Tc, Td));
+			 T3c = VFNMS(Tc, T9, VMUL(T8, Td));
+			 Tf = VADD(T1, Te);
+			 T3r = VSUB(T3d, T3c);
+			 T1N = VSUB(T1, Te);
+			 T3e = VADD(T3c, T3d);
+		    }
+		    {
+			 V Tq, T1O, Tz, T1P;
+			 {
+			      V Tl, Tp, Tu, Ty;
+			      Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			      Tp = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			      Tq = VFMA(Tk, Tl, VMUL(To, Tp));
+			      T1O = VFNMS(To, Tl, VMUL(Tk, Tp));
+			      Tu = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+			      Ty = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+			      Tz = VFMA(Tt, Tu, VMUL(Tx, Ty));
+			      T1P = VFNMS(Tx, Tu, VMUL(Tt, Ty));
+			 }
+			 TA = VADD(Tq, Tz);
+			 T3s = VSUB(Tq, Tz);
+			 T1Q = VSUB(T1O, T1P);
+			 T3b = VADD(T1O, T1P);
+		    }
+		    {
+			 V TG, T1S, TL, T1T, T1U, T1V;
+			 {
+			      V TD, TF, TI, TK;
+			      TD = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			      TF = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			      TG = VFMA(TC, TD, VMUL(TE, TF));
+			      T1S = VFNMS(TE, TD, VMUL(TC, TF));
+			      TI = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+			      TK = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+			      TL = VFMA(TH, TI, VMUL(TJ, TK));
+			      T1T = VFNMS(TJ, TI, VMUL(TH, TK));
+			 }
+			 TM = VADD(TG, TL);
+			 T2M = VADD(T1S, T1T);
+			 T1U = VSUB(T1S, T1T);
+			 T1V = VSUB(TG, TL);
+			 T1W = VSUB(T1U, T1V);
+			 T2w = VADD(T1V, T1U);
+		    }
+		    {
+			 V TT, T1Y, TY, T1Z, T1X, T20;
+			 {
+			      V TQ, TS, TV, TX;
+			      TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+			      TS = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+			      TT = VFMA(TP, TQ, VMUL(TR, TS));
+			      T1Y = VFNMS(TR, TQ, VMUL(TP, TS));
+			      TV = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			      TX = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			      TY = VFMA(TU, TV, VMUL(TW, TX));
+			      T1Z = VFNMS(TW, TV, VMUL(TU, TX));
+			 }
+			 TZ = VADD(TT, TY);
+			 T2N = VADD(T1Y, T1Z);
+			 T1X = VSUB(TT, TY);
+			 T20 = VSUB(T1Y, T1Z);
+			 T21 = VADD(T1X, T20);
+			 T2x = VSUB(T1X, T20);
+		    }
+		    {
+			 V T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
+			 {
+			      V T1p, T1q, T1G, T1I;
+			      T1p = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+			      T1q = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+			      T1r = VFMA(TN, T1p, VMUL(TO, T1q));
+			      T2k = VFNMS(TO, T1p, VMUL(TN, T1q));
+			      T1G = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+			      T1I = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+			      T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
+			      T2h = VFNMS(T1H, T1G, VMUL(T1F, T1I));
+			 }
+			 {
+			      V T1v, T1z, T1C, T1D;
+			      T1v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			      T1z = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			      T1A = VFMA(T1u, T1v, VMUL(T1y, T1z));
+			      T2l = VFNMS(T1y, T1v, VMUL(T1u, T1z));
+			      T1C = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			      T1D = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			      T1E = VFMA(Tg, T1C, VMUL(Ti, T1D));
+			      T2g = VFNMS(Ti, T1C, VMUL(Tg, T1D));
+			 }
+			 T1B = VADD(T1r, T1A);
+			 T1K = VADD(T1E, T1J);
+			 T2V = VSUB(T1B, T1K);
+			 T2W = VADD(T2k, T2l);
+			 T2X = VADD(T2g, T2h);
+			 T2Y = VSUB(T2W, T2X);
+			 {
+			      V T2f, T2i, T2m, T2n;
+			      T2f = VSUB(T1r, T1A);
+			      T2i = VSUB(T2g, T2h);
+			      T2j = VSUB(T2f, T2i);
+			      T2D = VADD(T2f, T2i);
+			      T2m = VSUB(T2k, T2l);
+			      T2n = VSUB(T1E, T1J);
+			      T2o = VADD(T2m, T2n);
+			      T2E = VSUB(T2m, T2n);
+			 }
+		    }
+		    {
+			 V T14, T24, T1m, T2b, T17, T25, T1h, T2a;
+			 {
+			      V T12, T13, T1j, T1l;
+			      T12 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			      T13 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			      T14 = VFMA(T2, T12, VMUL(T5, T13));
+			      T24 = VFNMS(T5, T12, VMUL(T2, T13));
+			      T1j = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+			      T1l = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+			      T1m = VFMA(T1i, T1j, VMUL(T1k, T1l));
+			      T2b = VFNMS(T1k, T1j, VMUL(T1i, T1l));
+			 }
+			 {
+			      V T15, T16, T1c, T1g;
+			      T15 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+			      T16 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+			      T17 = VFMA(T3, T15, VMUL(T6, T16));
+			      T25 = VFNMS(T6, T15, VMUL(T3, T16));
+			      T1c = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			      T1g = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			      T1h = VFMA(T1b, T1c, VMUL(T1f, T1g));
+			      T2a = VFNMS(T1f, T1c, VMUL(T1b, T1g));
+			 }
+			 T18 = VADD(T14, T17);
+			 T1n = VADD(T1h, T1m);
+			 T2Q = VSUB(T18, T1n);
+			 T2R = VADD(T24, T25);
+			 T2S = VADD(T2a, T2b);
+			 T2T = VSUB(T2R, T2S);
+			 {
+			      V T26, T27, T29, T2c;
+			      T26 = VSUB(T24, T25);
+			      T27 = VSUB(T1h, T1m);
+			      T28 = VADD(T26, T27);
+			      T2A = VSUB(T26, T27);
+			      T29 = VSUB(T14, T17);
+			      T2c = VSUB(T2a, T2b);
+			      T2d = VSUB(T29, T2c);
+			      T2B = VADD(T29, T2c);
+			 }
+		    }
+		    {
+			 V T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
+			 {
+			      V T1R, T22, T3y, T3z;
+			      T1R = VSUB(T1N, T1Q);
+			      T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
+			      T23 = VADD(T1R, T22);
+			      T2r = VSUB(T1R, T22);
+			      T3y = VMUL(LDK(KP707106781), VSUB(T2x, T2w));
+			      T3z = VADD(T3s, T3r);
+			      T3A = VADD(T3y, T3z);
+			      T3C = VSUB(T3z, T3y);
+			 }
+			 {
+			      V T2e, T2p, T2s, T2t;
+			      T2e = VFMA(LDK(KP923879532), T28, VMUL(LDK(KP382683432), T2d));
+			      T2p = VFNMS(LDK(KP923879532), T2o, VMUL(LDK(KP382683432), T2j));
+			      T2q = VADD(T2e, T2p);
+			      T3B = VSUB(T2p, T2e);
+			      T2s = VFNMS(LDK(KP923879532), T2d, VMUL(LDK(KP382683432), T28));
+			      T2t = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2j));
+			      T2u = VSUB(T2s, T2t);
+			      T3x = VADD(T2s, T2t);
+			 }
+			 ST(&(ri[WS(rs, 11)]), VSUB(T23, T2q), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 11)]), VSUB(T3A, T3x), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 3)]), VADD(T23, T2q), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VADD(T3x, T3A), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 15)]), VSUB(T2r, T2u), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 15)]), VSUB(T3C, T3B), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 7)]), VADD(T2r, T2u), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 7)]), VADD(T3B, T3C), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
+			 {
+			      V T2L, T2O, T3k, T3l;
+			      T2L = VSUB(Tf, TA);
+			      T2O = VSUB(T2M, T2N);
+			      T2P = VADD(T2L, T2O);
+			      T31 = VSUB(T2L, T2O);
+			      T3k = VSUB(TZ, TM);
+			      T3l = VSUB(T3e, T3b);
+			      T3m = VADD(T3k, T3l);
+			      T3o = VSUB(T3l, T3k);
+			 }
+			 {
+			      V T2U, T2Z, T32, T33;
+			      T2U = VADD(T2Q, T2T);
+			      T2Z = VSUB(T2V, T2Y);
+			      T30 = VMUL(LDK(KP707106781), VADD(T2U, T2Z));
+			      T3n = VMUL(LDK(KP707106781), VSUB(T2Z, T2U));
+			      T32 = VSUB(T2T, T2Q);
+			      T33 = VADD(T2V, T2Y);
+			      T34 = VMUL(LDK(KP707106781), VSUB(T32, T33));
+			      T3j = VMUL(LDK(KP707106781), VADD(T32, T33));
+			 }
+			 ST(&(ri[WS(rs, 10)]), VSUB(T2P, T30), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 10)]), VSUB(T3m, T3j), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 2)]), VADD(T2P, T30), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 2)]), VADD(T3j, T3m), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 14)]), VSUB(T31, T34), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 14)]), VSUB(T3o, T3n), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 6)]), VADD(T31, T34), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 6)]), VADD(T3n, T3o), ms, &(ii[0]));
+		    }
+		    {
+			 V T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
+			 {
+			      V T2v, T2y, T3q, T3t;
+			      T2v = VADD(T1N, T1Q);
+			      T2y = VMUL(LDK(KP707106781), VADD(T2w, T2x));
+			      T2z = VADD(T2v, T2y);
+			      T2H = VSUB(T2v, T2y);
+			      T3q = VMUL(LDK(KP707106781), VADD(T1W, T21));
+			      T3t = VSUB(T3r, T3s);
+			      T3u = VADD(T3q, T3t);
+			      T3w = VSUB(T3t, T3q);
+			 }
+			 {
+			      V T2C, T2F, T2I, T2J;
+			      T2C = VFMA(LDK(KP382683432), T2A, VMUL(LDK(KP923879532), T2B));
+			      T2F = VFNMS(LDK(KP382683432), T2E, VMUL(LDK(KP923879532), T2D));
+			      T2G = VADD(T2C, T2F);
+			      T3v = VSUB(T2F, T2C);
+			      T2I = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2A));
+			      T2J = VFMA(LDK(KP923879532), T2E, VMUL(LDK(KP382683432), T2D));
+			      T2K = VSUB(T2I, T2J);
+			      T3p = VADD(T2I, T2J);
+			 }
+			 ST(&(ri[WS(rs, 9)]), VSUB(T2z, T2G), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 9)]), VSUB(T3u, T3p), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VADD(T2z, T2G), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 1)]), VADD(T3p, T3u), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 13)]), VSUB(T2H, T2K), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 13)]), VSUB(T3w, T3v), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 5)]), VADD(T2H, T2K), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 5)]), VADD(T3v, T3w), ms, &(ii[WS(rs, 1)]));
+		    }
+		    {
+			 V T11, T35, T3g, T3i, T1M, T3h, T38, T39;
+			 {
+			      V TB, T10, T3a, T3f;
+			      TB = VADD(Tf, TA);
+			      T10 = VADD(TM, TZ);
+			      T11 = VADD(TB, T10);
+			      T35 = VSUB(TB, T10);
+			      T3a = VADD(T2M, T2N);
+			      T3f = VADD(T3b, T3e);
+			      T3g = VADD(T3a, T3f);
+			      T3i = VSUB(T3f, T3a);
+			 }
+			 {
+			      V T1o, T1L, T36, T37;
+			      T1o = VADD(T18, T1n);
+			      T1L = VADD(T1B, T1K);
+			      T1M = VADD(T1o, T1L);
+			      T3h = VSUB(T1L, T1o);
+			      T36 = VADD(T2R, T2S);
+			      T37 = VADD(T2W, T2X);
+			      T38 = VSUB(T36, T37);
+			      T39 = VADD(T36, T37);
+			 }
+			 ST(&(ri[WS(rs, 8)]), VSUB(T11, T1M), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 8)]), VSUB(T3g, T39), ms, &(ii[0]));
+			 ST(&(ri[0]), VADD(T11, T1M), ms, &(ri[0]));
+			 ST(&(ii[0]), VADD(T39, T3g), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 12)]), VSUB(T35, T38), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 12)]), VSUB(T3i, T3h), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 4)]), VADD(T35, T38), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 4)]), VADD(T3h, T3i), ms, &(ii[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_16) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1800 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:28 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 204 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T6H, T74, T6U, T6E, T9r, T9t, T78, T7c, T6W, T6S, T73, T6K, T7a, T72, T9x;
+	       V T9z;
+	       {
+		    V T2, T8, T3, T6, Te, Ti, T5, Tc;
+		    T2 = LDW(&(W[0]));
+		    T8 = LDW(&(W[TWVL * 4]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T6 = LDW(&(W[TWVL * 3]));
+		    Te = LDW(&(W[TWVL * 6]));
+		    Ti = LDW(&(W[TWVL * 7]));
+		    T5 = LDW(&(W[TWVL * 1]));
+		    Tc = LDW(&(W[TWVL * 5]));
+		    {
+			 V T2X, T2T, T34, T31, Tq, T46, T97, T8H, TH, T98, T4b, T8D, TZ, T7f, T1g;
+			 V T7g, T4j, T6t, T4q, T6u, T6x, T4z, T7m, T1J, T4G, T6y, T8d, T7l, T4O, T6A;
+			 V T2k, T7o, T6B, T4V, T7r, T8e, T5E, T6P, T3G, T7L, T6M, T61, T8n, T7I, T55;
+			 V T6I, T2N, T7A, T5s, T6F, T7x, T8i, T2R, T2U, T57, T3a, T5h, T62, T5L, T7J;
+			 V T43, T63, T5S, T8o, T7O, T2V, T2Y, T32, T35;
+			 {
+			      V T1w, T23, T1K, T1F, T1s, T1N, T26, T1z, T2w, T2s, T3Q, T3M, T3r, T3n, T2b;
+			      V T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J, T2F, Td, TP, T1Z, T1V;
+			      V T2g, T2c, T1m, T4u, T1D, T1G, T1p, T1t, T1E, T4D, T1x, T1A, T1q, T4v;
+			      {
+				   V T1, Ts, T19, TJ, T7, TM, Tb, T11, T1C, T1o, TA, T15, TE, T1d, Tw;
+				   V T8G, Tk, Tn, Tj, TW, TS, To, Tt, Tx, TB, TF, Tl;
+				   {
+					V T1Y, T1S, T2f, T2a;
+					T1 = LD(&(ri[0]), ms, &(ri[0]));
+					{
+					     V Tr, T18, T4, Ta;
+					     Tr = VMUL(T2, T8);
+					     T18 = VMUL(T3, T8);
+					     T4 = VMUL(T2, T3);
+					     Ta = VMUL(T2, T6);
+					     {
+						  V T10, T1n, Tz, T14;
+						  T10 = VMUL(T2, Te);
+						  T1n = VMUL(T8, Te);
+						  Tz = VMUL(T3, Te);
+						  T14 = VMUL(T2, Ti);
+						  {
+						       V T1r, TD, T1c, Tv;
+						       T1r = VMUL(T8, Ti);
+						       TD = VMUL(T3, Ti);
+						       T1c = VMUL(T3, Tc);
+						       Tv = VMUL(T2, Tc);
+						       T1w = VFNMS(T5, Tc, Tr);
+						       Ts = VFMA(T5, Tc, Tr);
+						       T19 = VFNMS(T6, Tc, T18);
+						       T23 = VFMA(T6, Tc, T18);
+						       TJ = VFNMS(T5, T6, T4);
+						       T7 = VFMA(T5, T6, T4);
+						       TM = VFMA(T5, T3, Ta);
+						       Tb = VFNMS(T5, T3, Ta);
+						       T11 = VFNMS(T5, Ti, T10);
+						       T1C = VFMA(T5, Ti, T10);
+						       T1o = VFMA(Tc, Ti, T1n);
+						       TA = VFMA(T6, Ti, Tz);
+						       T1K = VFNMS(T6, Ti, Tz);
+						       T1F = VFNMS(T5, Te, T14);
+						       T15 = VFMA(T5, Te, T14);
+						       T1s = VFNMS(Tc, Te, T1r);
+						       T1N = VFMA(T6, Te, TD);
+						       TE = VFNMS(T6, Te, TD);
+						       T26 = VFNMS(T6, T8, T1c);
+						       T1d = VFMA(T6, T8, T1c);
+						       T1z = VFMA(T5, T8, Tv);
+						       Tw = VFNMS(T5, T8, Tv);
+						       {
+							    V T2v, T2r, T3P, T3L;
+							    T2v = VMUL(T1w, Ti);
+							    T2r = VMUL(T1w, Te);
+							    T3P = VMUL(Ts, Ti);
+							    T3L = VMUL(Ts, Te);
+							    {
+								 V T3q, T3m, T2W, T2S;
+								 T3q = VMUL(T19, Ti);
+								 T3m = VMUL(T19, Te);
+								 T2W = VMUL(T23, Ti);
+								 T2S = VMUL(T23, Te);
+								 {
+								      V T1T, T3i, T3e, T1Q;
+								      T1T = VMUL(TJ, Tc);
+								      T3i = VMUL(TJ, Ti);
+								      T3e = VMUL(TJ, Te);
+								      T1Q = VMUL(TJ, T8);
+								      {
+									   V Tg, T2I, T2E, T9;
+									   Tg = VMUL(T7, Tc);
+									   T2I = VMUL(T7, Ti);
+									   T2E = VMUL(T7, Te);
+									   T9 = VMUL(T7, T8);
+									   T2w = VFNMS(T1z, Te, T2v);
+									   T2s = VFMA(T1z, Ti, T2r);
+									   T3Q = VFNMS(Tw, Te, T3P);
+									   T3M = VFMA(Tw, Ti, T3L);
+									   T3r = VFNMS(T1d, Te, T3q);
+									   T3n = VFMA(T1d, Ti, T3m);
+									   T2X = VFNMS(T26, Te, T2W);
+									   T2T = VFMA(T26, Ti, T2S);
+									   T2b = VFNMS(TM, T8, T1T);
+									   T1U = VFMA(TM, T8, T1T);
+									   T3C = VFNMS(TM, Te, T3i);
+									   T3j = VFMA(TM, Te, T3i);
+									   T3z = VFMA(TM, Ti, T3e);
+									   T3f = VFNMS(TM, Ti, T3e);
+									   T1R = VFNMS(TM, Tc, T1Q);
+									   T29 = VFMA(TM, Tc, T1Q);
+									   TR = VFNMS(Tb, T8, Tg);
+									   Th = VFMA(Tb, T8, Tg);
+									   T34 = VFMA(Tb, Te, T2I);
+									   T2J = VFNMS(Tb, Te, T2I);
+									   T31 = VFNMS(Tb, Ti, T2E);
+									   T2F = VFMA(Tb, Ti, T2E);
+									   Td = VFNMS(Tb, Tc, T9);
+									   TP = VFMA(Tb, Tc, T9);
+									   T1Y = VMUL(T1R, Ti);
+									   T1S = VMUL(T1R, Te);
+									   T2f = VMUL(T29, Ti);
+									   T2a = VMUL(T29, Te);
+									   T8G = LD(&(ii[0]), ms, &(ii[0]));
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					Tk = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
+					{
+					     V Tm, Tf, TV, TQ;
+					     Tm = VMUL(Td, Ti);
+					     Tf = VMUL(Td, Te);
+					     TV = VMUL(TP, Ti);
+					     TQ = VMUL(TP, Te);
+					     T1Z = VFNMS(T1U, Te, T1Y);
+					     T1V = VFMA(T1U, Ti, T1S);
+					     T2g = VFNMS(T2b, Te, T2f);
+					     T2c = VFMA(T2b, Ti, T2a);
+					     Tn = VFNMS(Th, Te, Tm);
+					     Tj = VFMA(Th, Ti, Tf);
+					     TW = VFNMS(TR, Te, TV);
+					     TS = VFMA(TR, Ti, TQ);
+					}
+					To = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
+				   }
+				   Tt = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+				   Tx = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+				   TB = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
+				   TF = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
+				   Tl = VMUL(Tj, Tk);
+				   {
+					V TO, T4f, TT, TX;
+					{
+					     V Ty, T48, TG, T4a;
+					     {
+						  V TK, TN, T8E, Tu, T47, TC, T49, Tp, TL, T4e, T8F;
+						  TK = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+						  TN = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+						  T8E = VMUL(Tj, To);
+						  Tu = VMUL(Ts, Tt);
+						  T47 = VMUL(Ts, Tx);
+						  TC = VMUL(TA, TB);
+						  T49 = VMUL(TA, TF);
+						  Tp = VFMA(Tn, To, Tl);
+						  TL = VMUL(TJ, TK);
+						  T4e = VMUL(TJ, TN);
+						  T8F = VFNMS(Tn, Tk, T8E);
+						  Ty = VFMA(Tw, Tx, Tu);
+						  T48 = VFNMS(Tw, Tt, T47);
+						  TG = VFMA(TE, TF, TC);
+						  T4a = VFNMS(TE, TB, T49);
+						  Tq = VADD(T1, Tp);
+						  T46 = VSUB(T1, Tp);
+						  TO = VFMA(TM, TN, TL);
+						  T97 = VSUB(T8G, T8F);
+						  T8H = VADD(T8F, T8G);
+						  T4f = VFNMS(TM, TK, T4e);
+					     }
+					     TH = VADD(Ty, TG);
+					     T98 = VSUB(Ty, TG);
+					     T4b = VSUB(T48, T4a);
+					     T8D = VADD(T48, T4a);
+					     TT = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
+					     TX = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
+					}
+					{
+					     V T12, T16, T1a, T1e, T4k, T4p;
+					     T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
+					     T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
+					     T1a = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+					     T1e = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+					     {
+						  V TY, T4h, T17, T4m, T1f, T4o, T4d, T4i;
+						  {
+						       V T1j, T1l, TU, T4g, T13, T4l, T1b, T4n, T1k, T4t;
+						       T1j = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+						       T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+						       TU = VMUL(TS, TT);
+						       T4g = VMUL(TS, TX);
+						       T13 = VMUL(T11, T12);
+						       T4l = VMUL(T11, T16);
+						       T1b = VMUL(T19, T1a);
+						       T4n = VMUL(T19, T1e);
+						       T1k = VMUL(T7, T1j);
+						       T4t = VMUL(T7, T1l);
+						       TY = VFMA(TW, TX, TU);
+						       T4h = VFNMS(TW, TT, T4g);
+						       T17 = VFMA(T15, T16, T13);
+						       T4m = VFNMS(T15, T12, T4l);
+						       T1f = VFMA(T1d, T1e, T1b);
+						       T4o = VFNMS(T1d, T1a, T4n);
+						       T1m = VFMA(Tb, T1l, T1k);
+						       T4u = VFNMS(Tb, T1j, T4t);
+						  }
+						  TZ = VADD(TO, TY);
+						  T4d = VSUB(TO, TY);
+						  T7f = VADD(T4f, T4h);
+						  T4i = VSUB(T4f, T4h);
+						  T1g = VADD(T17, T1f);
+						  T4k = VSUB(T17, T1f);
+						  T7g = VADD(T4m, T4o);
+						  T4p = VSUB(T4m, T4o);
+						  T1D = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
+						  T1G = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
+						  T4j = VADD(T4d, T4i);
+						  T6t = VSUB(T4i, T4d);
+					     }
+					     T1p = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
+					     T1t = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
+					     T4q = VSUB(T4k, T4p);
+					     T6u = VADD(T4k, T4p);
+					     T1E = VMUL(T1C, T1D);
+					     T4D = VMUL(T1C, T1G);
+					     T1x = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+					     T1A = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+					     T1q = VMUL(T1o, T1p);
+					     T4v = VMUL(T1o, T1t);
+					}
+				   }
+			      }
+			      {
+				   V T3l, T5z, T3E, T5Z, T3v, T3x, T3w, T3t, T5B, T5W;
+				   {
+					V T1P, T4J, T1W, T20, T2i, T4T, T1X, T4K, T24, T27;
+					{
+					     V T2d, T2h, T1v, T4A, T7j, T4x, T2e, T4y, T1I, T4F, T7k, T4S;
+					     {
+						  V T1L, T1O, T1H, T4E, T1y, T4B, T1u, T4w, T1M, T4I, T1B, T4C;
+						  T1L = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
+						  T1O = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
+						  T1H = VFMA(T1F, T1G, T1E);
+						  T4E = VFNMS(T1F, T1D, T4D);
+						  T1y = VMUL(T1w, T1x);
+						  T4B = VMUL(T1w, T1A);
+						  T1u = VFMA(T1s, T1t, T1q);
+						  T4w = VFNMS(T1s, T1p, T4v);
+						  T1M = VMUL(T1K, T1L);
+						  T4I = VMUL(T1K, T1O);
+						  T2d = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
+						  T2h = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
+						  T1B = VFMA(T1z, T1A, T1y);
+						  T4C = VFNMS(T1z, T1x, T4B);
+						  T1v = VADD(T1m, T1u);
+						  T4A = VSUB(T1m, T1u);
+						  T7j = VADD(T4u, T4w);
+						  T4x = VSUB(T4u, T4w);
+						  T1P = VFMA(T1N, T1O, T1M);
+						  T4J = VFNMS(T1N, T1L, T4I);
+						  T2e = VMUL(T2c, T2d);
+						  T4y = VSUB(T1B, T1H);
+						  T1I = VADD(T1B, T1H);
+						  T4F = VSUB(T4C, T4E);
+						  T7k = VADD(T4C, T4E);
+						  T4S = VMUL(T2c, T2h);
+					     }
+					     T1W = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+					     T20 = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+					     T2i = VFMA(T2g, T2h, T2e);
+					     T6x = VADD(T4x, T4y);
+					     T4z = VSUB(T4x, T4y);
+					     T7m = VSUB(T1v, T1I);
+					     T1J = VADD(T1v, T1I);
+					     T4G = VADD(T4A, T4F);
+					     T6y = VSUB(T4A, T4F);
+					     T8d = VADD(T7j, T7k);
+					     T7l = VSUB(T7j, T7k);
+					     T4T = VFNMS(T2g, T2d, T4S);
+					     T1X = VMUL(T1V, T1W);
+					     T4K = VMUL(T1V, T20);
+					     T24 = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+					     T27 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+					}
+					{
+					     V T22, T4P, T7p, T4M, T28, T4R, T3g, T3k;
+					     T3g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
+					     T3k = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
+					     {
+						  V T3A, T3D, T21, T4L, T25, T4Q, T3h, T5y, T3B, T5Y;
+						  T3A = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
+						  T3D = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
+						  T21 = VFMA(T1Z, T20, T1X);
+						  T4L = VFNMS(T1Z, T1W, T4K);
+						  T25 = VMUL(T23, T24);
+						  T4Q = VMUL(T23, T27);
+						  T3h = VMUL(T3f, T3g);
+						  T5y = VMUL(T3f, T3k);
+						  T3B = VMUL(T3z, T3A);
+						  T5Y = VMUL(T3z, T3D);
+						  T22 = VADD(T1P, T21);
+						  T4P = VSUB(T1P, T21);
+						  T7p = VADD(T4J, T4L);
+						  T4M = VSUB(T4J, T4L);
+						  T28 = VFMA(T26, T27, T25);
+						  T4R = VFNMS(T26, T24, T4Q);
+						  T3l = VFMA(T3j, T3k, T3h);
+						  T5z = VFNMS(T3j, T3g, T5y);
+						  T3E = VFMA(T3C, T3D, T3B);
+						  T5Z = VFNMS(T3C, T3A, T5Y);
+					     }
+					     {
+						  V T3o, T3s, T2j, T4N, T7q, T4U, T3p, T5A;
+						  T3o = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+						  T3s = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+						  T2j = VADD(T28, T2i);
+						  T4N = VSUB(T28, T2i);
+						  T7q = VADD(T4R, T4T);
+						  T4U = VSUB(T4R, T4T);
+						  T3v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+						  T3x = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+						  T3p = VMUL(T3n, T3o);
+						  T5A = VMUL(T3n, T3s);
+						  T4O = VSUB(T4M, T4N);
+						  T6A = VADD(T4M, T4N);
+						  T2k = VADD(T22, T2j);
+						  T7o = VSUB(T22, T2j);
+						  T6B = VSUB(T4P, T4U);
+						  T4V = VADD(T4P, T4U);
+						  T7r = VSUB(T7p, T7q);
+						  T8e = VADD(T7p, T7q);
+						  T3w = VMUL(TP, T3v);
+						  T3t = VFMA(T3r, T3s, T3p);
+						  T5B = VFNMS(T3r, T3o, T5A);
+						  T5W = VMUL(TP, T3x);
+					     }
+					}
+				   }
+				   {
+					V T2t, T2q, T50, T2L, T5q, T2u, T2x, T2A, T2C;
+					{
+					     V T2n, T2p, T2G, T2K, T5V, T3u, T5C, T7G, T5X, T2o, T4Z, T2H, T5D, T3F, T5p;
+					     V T3y, T60, T7H;
+					     T2n = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+					     T2p = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+					     T2G = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
+					     T2K = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
+					     T3y = VFMA(TR, T3x, T3w);
+					     T5V = VSUB(T3l, T3t);
+					     T3u = VADD(T3l, T3t);
+					     T5C = VSUB(T5z, T5B);
+					     T7G = VADD(T5z, T5B);
+					     T5X = VFNMS(TR, T3v, T5W);
+					     T2o = VMUL(T2, T2n);
+					     T4Z = VMUL(T2, T2p);
+					     T2H = VMUL(T2F, T2G);
+					     T5D = VSUB(T3y, T3E);
+					     T3F = VADD(T3y, T3E);
+					     T5p = VMUL(T2F, T2K);
+					     T2t = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
+					     T60 = VSUB(T5X, T5Z);
+					     T7H = VADD(T5X, T5Z);
+					     T2q = VFMA(T5, T2p, T2o);
+					     T50 = VFNMS(T5, T2n, T4Z);
+					     T2L = VFMA(T2J, T2K, T2H);
+					     T5E = VSUB(T5C, T5D);
+					     T6P = VADD(T5C, T5D);
+					     T3G = VADD(T3u, T3F);
+					     T7L = VSUB(T3u, T3F);
+					     T5q = VFNMS(T2J, T2G, T5p);
+					     T6M = VSUB(T5V, T60);
+					     T61 = VADD(T5V, T60);
+					     T8n = VADD(T7G, T7H);
+					     T7I = VSUB(T7G, T7H);
+					     T2u = VMUL(T2s, T2t);
+					     T2x = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
+					     T2A = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+					     T2C = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+					}
+					{
+					     V T3N, T2z, T5m, T3K, T5G, T41, T5Q, T3O, T7v, T53, T2M, T54, T7w, T5r, T3R;
+					     V T3U, T3W;
+					     {
+						  V T3H, T3J, T3Y, T40, T52, T2D, T5o;
+						  T3H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+						  T3J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+						  T3Y = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+						  T40 = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+						  T3N = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
+						  {
+						       V T2y, T51, T2B, T5n;
+						       T2y = VFMA(T2w, T2x, T2u);
+						       T51 = VMUL(T2s, T2x);
+						       T2B = VMUL(T8, T2A);
+						       T5n = VMUL(T8, T2C);
+						       {
+							    V T3I, T5F, T3Z, T5P;
+							    T3I = VMUL(T3, T3H);
+							    T5F = VMUL(T3, T3J);
+							    T3Z = VMUL(Td, T3Y);
+							    T5P = VMUL(Td, T40);
+							    T2z = VADD(T2q, T2y);
+							    T5m = VSUB(T2q, T2y);
+							    T52 = VFNMS(T2w, T2t, T51);
+							    T2D = VFMA(Tc, T2C, T2B);
+							    T5o = VFNMS(Tc, T2A, T5n);
+							    T3K = VFMA(T6, T3J, T3I);
+							    T5G = VFNMS(T6, T3H, T5F);
+							    T41 = VFMA(Th, T40, T3Z);
+							    T5Q = VFNMS(Th, T3Y, T5P);
+							    T3O = VMUL(T3M, T3N);
+						       }
+						  }
+						  T7v = VADD(T50, T52);
+						  T53 = VSUB(T50, T52);
+						  T2M = VADD(T2D, T2L);
+						  T54 = VSUB(T2D, T2L);
+						  T7w = VADD(T5o, T5q);
+						  T5r = VSUB(T5o, T5q);
+						  T3R = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
+						  T3U = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
+						  T3W = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
+					     }
+					     {
+						  V T2O, T37, T39, T3T, T5K, T5I, T3X, T5O, T56, T38, T5g, T7M, T5J;
+						  {
+						       V T3S, T5H, T3V, T5N, T2P, T2Q;
+						       T2O = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+						       T55 = VSUB(T53, T54);
+						       T6I = VADD(T53, T54);
+						       T2N = VADD(T2z, T2M);
+						       T7A = VSUB(T2z, T2M);
+						       T5s = VADD(T5m, T5r);
+						       T6F = VSUB(T5m, T5r);
+						       T7x = VSUB(T7v, T7w);
+						       T8i = VADD(T7v, T7w);
+						       T3S = VFMA(T3Q, T3R, T3O);
+						       T5H = VMUL(T3M, T3R);
+						       T3V = VMUL(Te, T3U);
+						       T5N = VMUL(Te, T3W);
+						       T2P = VMUL(T29, T2O);
+						       T2Q = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+						       T37 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+						       T39 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+						       T3T = VADD(T3K, T3S);
+						       T5K = VSUB(T3K, T3S);
+						       T5I = VFNMS(T3Q, T3N, T5H);
+						       T3X = VFMA(Ti, T3W, T3V);
+						       T5O = VFNMS(Ti, T3U, T5N);
+						       T2R = VFMA(T2b, T2Q, T2P);
+						       T56 = VMUL(T29, T2Q);
+						       T38 = VMUL(T1R, T37);
+						       T5g = VMUL(T1R, T39);
+						  }
+						  T2U = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
+						  T7M = VADD(T5G, T5I);
+						  T5J = VSUB(T5G, T5I);
+						  {
+						       V T42, T5M, T7N, T5R;
+						       T42 = VADD(T3X, T41);
+						       T5M = VSUB(T3X, T41);
+						       T7N = VADD(T5O, T5Q);
+						       T5R = VSUB(T5O, T5Q);
+						       T57 = VFNMS(T2b, T2O, T56);
+						       T3a = VFMA(T1U, T39, T38);
+						       T5h = VFNMS(T1U, T37, T5g);
+						       T62 = VADD(T5K, T5J);
+						       T5L = VSUB(T5J, T5K);
+						       T7J = VSUB(T42, T3T);
+						       T43 = VADD(T3T, T42);
+						       T63 = VSUB(T5M, T5R);
+						       T5S = VADD(T5M, T5R);
+						       T8o = VADD(T7M, T7N);
+						       T7O = VSUB(T7M, T7N);
+						       T2V = VMUL(T2T, T2U);
+						  }
+						  T2Y = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
+						  T32 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
+						  T35 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T5t, T5c, T5u, T5j, T8Z, T90;
+			      {
+				   V T7e, T8T, T8y, T7h, T8U, T8c, T8J, T44, T8u, T8q, T7y, T7D, T8w, T2m, T3d;
+				   V T8h, T8R, T8P, T8k, T8x, T8B, T8f;
+				   {
+					V T1i, T8O, T8N, T2l, T3c, T8j;
+					{
+					     V T8p, T5b, T30, T59, T36, T5f, TI, T1h, T8m, T5a, T7B;
+					     TI = VADD(Tq, TH);
+					     T7e = VSUB(Tq, TH);
+					     T8T = VSUB(T1g, TZ);
+					     T1h = VADD(TZ, T1g);
+					     T8y = VADD(T8n, T8o);
+					     T8p = VSUB(T8n, T8o);
+					     {
+						  V T8C, T8I, T2Z, T58, T33, T5e;
+						  T7h = VSUB(T7f, T7g);
+						  T8C = VADD(T7f, T7g);
+						  T8I = VADD(T8D, T8H);
+						  T8U = VSUB(T8H, T8D);
+						  T2Z = VFMA(T2X, T2Y, T2V);
+						  T58 = VMUL(T2T, T2Y);
+						  T33 = VMUL(T31, T32);
+						  T5e = VMUL(T31, T35);
+						  T1i = VADD(TI, T1h);
+						  T8c = VSUB(TI, T1h);
+						  T8O = VSUB(T8I, T8C);
+						  T8J = VADD(T8C, T8I);
+						  T5b = VSUB(T2R, T2Z);
+						  T30 = VADD(T2R, T2Z);
+						  T59 = VFNMS(T2X, T2U, T58);
+						  T36 = VFMA(T34, T35, T33);
+						  T5f = VFNMS(T34, T32, T5e);
+					     }
+					     T44 = VADD(T3G, T43);
+					     T8m = VSUB(T3G, T43);
+					     T5a = VSUB(T57, T59);
+					     T7B = VADD(T57, T59);
+					     {
+						  V T5d, T3b, T5i, T7C;
+						  T5d = VSUB(T36, T3a);
+						  T3b = VADD(T36, T3a);
+						  T5i = VSUB(T5f, T5h);
+						  T7C = VADD(T5f, T5h);
+						  T8N = VSUB(T2k, T1J);
+						  T2l = VADD(T1J, T2k);
+						  T8u = VADD(T8m, T8p);
+						  T8q = VSUB(T8m, T8p);
+						  T5t = VADD(T5b, T5a);
+						  T5c = VSUB(T5a, T5b);
+						  T7y = VSUB(T3b, T30);
+						  T3c = VADD(T30, T3b);
+						  T5u = VSUB(T5d, T5i);
+						  T5j = VADD(T5d, T5i);
+						  T8j = VADD(T7B, T7C);
+						  T7D = VSUB(T7B, T7C);
+					     }
+					}
+					T8w = VSUB(T1i, T2l);
+					T2m = VADD(T1i, T2l);
+					T3d = VADD(T2N, T3c);
+					T8h = VSUB(T2N, T3c);
+					T8R = VSUB(T8O, T8N);
+					T8P = VADD(T8N, T8O);
+					T8k = VSUB(T8i, T8j);
+					T8x = VADD(T8i, T8j);
+					T8B = VADD(T8d, T8e);
+					T8f = VSUB(T8d, T8e);
+				   }
+				   {
+					V T7P, T7K, T7X, T7Y, T82, T7z, T7W, T7i, T8a, T86, T91, T8V, T8W, T7t, T7E;
+					V T81;
+					{
+					     V T84, T85, T7n, T7s, T8L, T45;
+					     T8L = VSUB(T44, T3d);
+					     T45 = VADD(T3d, T44);
+					     {
+						  V T8t, T8l, T8A, T8z;
+						  T8t = VSUB(T8k, T8h);
+						  T8l = VADD(T8h, T8k);
+						  T8A = VADD(T8x, T8y);
+						  T8z = VSUB(T8x, T8y);
+						  {
+						       V T8M, T8K, T8s, T8g;
+						       T8M = VSUB(T8J, T8B);
+						       T8K = VADD(T8B, T8J);
+						       T8s = VSUB(T8c, T8f);
+						       T8g = VADD(T8c, T8f);
+						       ST(&(ri[0]), VADD(T2m, T45), ms, &(ri[0]));
+						       ST(&(ri[WS(rs, 16)]), VSUB(T2m, T45), ms, &(ri[0]));
+						       {
+							    V T8v, T8Q, T8S, T8r;
+							    T8v = VSUB(T8t, T8u);
+							    T8Q = VADD(T8t, T8u);
+							    T8S = VSUB(T8q, T8l);
+							    T8r = VADD(T8l, T8q);
+							    ST(&(ri[WS(rs, 8)]), VADD(T8w, T8z), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 24)]), VSUB(T8w, T8z), ms, &(ri[0]));
+							    ST(&(ii[WS(rs, 24)]), VSUB(T8M, T8L), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 8)]), VADD(T8L, T8M), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 16)]), VSUB(T8K, T8A), ms, &(ii[0]));
+							    ST(&(ii[0]), VADD(T8A, T8K), ms, &(ii[0]));
+							    ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
+							    ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
+							    ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
+							    ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
+							    ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
+						       }
+						  }
+					     }
+					     T7P = VSUB(T7L, T7O);
+					     T84 = VADD(T7L, T7O);
+					     T85 = VADD(T7I, T7J);
+					     T7K = VSUB(T7I, T7J);
+					     T7X = VADD(T7m, T7l);
+					     T7n = VSUB(T7l, T7m);
+					     T7s = VADD(T7o, T7r);
+					     T7Y = VSUB(T7o, T7r);
+					     T82 = VADD(T7x, T7y);
+					     T7z = VSUB(T7x, T7y);
+					     T7W = VADD(T7e, T7h);
+					     T7i = VSUB(T7e, T7h);
+					     T8a = VFMA(LDK(KP414213562), T84, T85);
+					     T86 = VFNMS(LDK(KP414213562), T85, T84);
+					     T91 = VSUB(T8U, T8T);
+					     T8V = VADD(T8T, T8U);
+					     T8W = VADD(T7n, T7s);
+					     T7t = VSUB(T7n, T7s);
+					     T7E = VSUB(T7A, T7D);
+					     T81 = VADD(T7A, T7D);
+					}
+					{
+					     V T7S, T7u, T7T, T7F, T92, T7Z, T89, T83, T7U, T7Q;
+					     T7S = VFNMS(LDK(KP707106781), T7t, T7i);
+					     T7u = VFMA(LDK(KP707106781), T7t, T7i);
+					     T7T = VFNMS(LDK(KP414213562), T7z, T7E);
+					     T7F = VFMA(LDK(KP414213562), T7E, T7z);
+					     T92 = VSUB(T7Y, T7X);
+					     T7Z = VADD(T7X, T7Y);
+					     T89 = VFNMS(LDK(KP414213562), T81, T82);
+					     T83 = VFMA(LDK(KP414213562), T82, T81);
+					     T7U = VFMA(LDK(KP414213562), T7K, T7P);
+					     T7Q = VFNMS(LDK(KP414213562), T7P, T7K);
+					     {
+						  V T8X, T95, T93, T80, T88, T87, T7V, T94, T96, T7R, T8Y, T8b;
+						  T8Z = VFNMS(LDK(KP707106781), T8W, T8V);
+						  T8X = VFMA(LDK(KP707106781), T8W, T8V);
+						  T95 = VFNMS(LDK(KP707106781), T92, T91);
+						  T93 = VFMA(LDK(KP707106781), T92, T91);
+						  T80 = VFMA(LDK(KP707106781), T7Z, T7W);
+						  T88 = VFNMS(LDK(KP707106781), T7Z, T7W);
+						  T90 = VSUB(T86, T83);
+						  T87 = VADD(T83, T86);
+						  T7V = VADD(T7T, T7U);
+						  T94 = VSUB(T7U, T7T);
+						  T96 = VADD(T7F, T7Q);
+						  T7R = VSUB(T7F, T7Q);
+						  T8Y = VADD(T89, T8a);
+						  T8b = VSUB(T89, T8a);
+						  ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T87, T80), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T87, T80), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
+						  ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T94, T93), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T94, T93), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T96, T95), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T96, T95), ms, &(ii[0]));
+						  ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
+						  ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
+						  ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
+						  ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
+						  ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
+					     }
+					}
+				   }
+			      }
+			      {
+				   V T6s, T9o, T9n, T6v, T6N, T6Q, T6G, T6J, T68, T4Y, T9f, T9d, T9l, T9j, T6g;
+				   V T6o, T6q, T6m, T66, T6a, T6p, T6j, T5x, T69;
+				   {
+					V T6d, T6e, T6c, T4s, T9c, T4X, T9h, T9b, T5T, T64, T5k, T5v, T9i, T6f;
+					{
+					     V T4c, T4r, T4H, T4W, T99, T9a;
+					     T6s = VSUB(T46, T4b);
+					     T4c = VADD(T46, T4b);
+					     T4r = VADD(T4j, T4q);
+					     T9o = VSUB(T4q, T4j);
+					     T6d = VFMA(LDK(KP414213562), T4z, T4G);
+					     T4H = VFNMS(LDK(KP414213562), T4G, T4z);
+					     T4W = VFMA(LDK(KP414213562), T4V, T4O);
+					     T6e = VFNMS(LDK(KP414213562), T4O, T4V);
+					     T9n = VADD(T98, T97);
+					     T99 = VSUB(T97, T98);
+					     T9a = VADD(T6t, T6u);
+					     T6v = VSUB(T6t, T6u);
+					     ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
+					     ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
+					     T6c = VFMA(LDK(KP707106781), T4r, T4c);
+					     T4s = VFNMS(LDK(KP707106781), T4r, T4c);
+					     T9c = VADD(T4H, T4W);
+					     T4X = VSUB(T4H, T4W);
+					     T9h = VFNMS(LDK(KP707106781), T9a, T99);
+					     T9b = VFMA(LDK(KP707106781), T9a, T99);
+					     T6N = VSUB(T5S, T5L);
+					     T5T = VADD(T5L, T5S);
+					     T64 = VADD(T62, T63);
+					     T6Q = VSUB(T62, T63);
+					     T6G = VSUB(T5j, T5c);
+					     T5k = VADD(T5c, T5j);
+					     T5v = VADD(T5t, T5u);
+					     T6J = VSUB(T5t, T5u);
+					}
+					T68 = VFNMS(LDK(KP923879532), T4X, T4s);
+					T4Y = VFMA(LDK(KP923879532), T4X, T4s);
+					T9f = VFNMS(LDK(KP923879532), T9c, T9b);
+					T9d = VFMA(LDK(KP923879532), T9c, T9b);
+					T9i = VSUB(T6e, T6d);
+					T6f = VADD(T6d, T6e);
+					{
+					     V T6l, T5U, T6k, T65;
+					     T6l = VFMA(LDK(KP707106781), T5T, T5E);
+					     T5U = VFNMS(LDK(KP707106781), T5T, T5E);
+					     T6k = VFMA(LDK(KP707106781), T64, T61);
+					     T65 = VFNMS(LDK(KP707106781), T64, T61);
+					     {
+						  V T6i, T5l, T6h, T5w;
+						  T6i = VFMA(LDK(KP707106781), T5k, T55);
+						  T5l = VFNMS(LDK(KP707106781), T5k, T55);
+						  T6h = VFMA(LDK(KP707106781), T5v, T5s);
+						  T5w = VFNMS(LDK(KP707106781), T5v, T5s);
+						  T9l = VFNMS(LDK(KP923879532), T9i, T9h);
+						  T9j = VFMA(LDK(KP923879532), T9i, T9h);
+						  T6g = VFMA(LDK(KP923879532), T6f, T6c);
+						  T6o = VFNMS(LDK(KP923879532), T6f, T6c);
+						  T6q = VFMA(LDK(KP198912367), T6k, T6l);
+						  T6m = VFNMS(LDK(KP198912367), T6l, T6k);
+						  T66 = VFNMS(LDK(KP668178637), T65, T5U);
+						  T6a = VFMA(LDK(KP668178637), T5U, T65);
+						  T6p = VFNMS(LDK(KP198912367), T6h, T6i);
+						  T6j = VFMA(LDK(KP198912367), T6i, T6h);
+						  T5x = VFMA(LDK(KP668178637), T5w, T5l);
+						  T69 = VFNMS(LDK(KP668178637), T5l, T5w);
+					     }
+					}
+				   }
+				   {
+					V T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T77, T6O, T76, T6R;
+					{
+					     V T6Z, T6z, T6C, T70;
+					     {
+						  V T6n, T9g, T9e, T6r;
+						  T6n = VADD(T6j, T6m);
+						  T9g = VSUB(T6m, T6j);
+						  T9e = VADD(T6p, T6q);
+						  T6r = VSUB(T6p, T6q);
+						  {
+						       V T9k, T6b, T67, T9m;
+						       T9k = VSUB(T6a, T69);
+						       T6b = VADD(T69, T6a);
+						       T67 = VSUB(T5x, T66);
+						       T9m = VADD(T5x, T66);
+						       ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
+						       ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
+						       T6Y = VFNMS(LDK(KP707106781), T6v, T6s);
+						       T6w = VFMA(LDK(KP707106781), T6v, T6s);
+						  }
+					     }
+					     T6Z = VFNMS(LDK(KP414213562), T6x, T6y);
+					     T6z = VFMA(LDK(KP414213562), T6y, T6x);
+					     T6C = VFNMS(LDK(KP414213562), T6B, T6A);
+					     T70 = VFMA(LDK(KP414213562), T6A, T6B);
+					     T9w = VADD(T6z, T6C);
+					     T6D = VSUB(T6z, T6C);
+					     T9v = VFNMS(LDK(KP707106781), T9o, T9n);
+					     T9p = VFMA(LDK(KP707106781), T9o, T9n);
+					     T9q = VSUB(T70, T6Z);
+					     T71 = VADD(T6Z, T70);
+					     T77 = VFMA(LDK(KP707106781), T6N, T6M);
+					     T6O = VFNMS(LDK(KP707106781), T6N, T6M);
+					     T76 = VFMA(LDK(KP707106781), T6Q, T6P);
+					     T6R = VFNMS(LDK(KP707106781), T6Q, T6P);
+					     T6H = VFNMS(LDK(KP707106781), T6G, T6F);
+					     T74 = VFMA(LDK(KP707106781), T6G, T6F);
+					}
+					T6U = VFNMS(LDK(KP923879532), T6D, T6w);
+					T6E = VFMA(LDK(KP923879532), T6D, T6w);
+					T9r = VFMA(LDK(KP923879532), T9q, T9p);
+					T9t = VFNMS(LDK(KP923879532), T9q, T9p);
+					T78 = VFNMS(LDK(KP198912367), T77, T76);
+					T7c = VFMA(LDK(KP198912367), T76, T77);
+					T6W = VFMA(LDK(KP668178637), T6O, T6R);
+					T6S = VFNMS(LDK(KP668178637), T6R, T6O);
+					T73 = VFMA(LDK(KP707106781), T6J, T6I);
+					T6K = VFNMS(LDK(KP707106781), T6J, T6I);
+					T7a = VFMA(LDK(KP923879532), T71, T6Y);
+					T72 = VFNMS(LDK(KP923879532), T71, T6Y);
+					T9x = VFNMS(LDK(KP923879532), T9w, T9v);
+					T9z = VFMA(LDK(KP923879532), T9w, T9v);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T7b, T75, T6L, T6V;
+		    T7b = VFNMS(LDK(KP198912367), T73, T74);
+		    T75 = VFMA(LDK(KP198912367), T74, T73);
+		    T6L = VFMA(LDK(KP668178637), T6K, T6H);
+		    T6V = VFNMS(LDK(KP668178637), T6H, T6K);
+		    {
+			 V T79, T9A, T9y, T7d;
+			 T79 = VSUB(T75, T78);
+			 T9A = VADD(T75, T78);
+			 T9y = VSUB(T7c, T7b);
+			 T7d = VADD(T7b, T7c);
+			 {
+			      V T9s, T6X, T6T, T9u;
+			      T9s = VADD(T6V, T6W);
+			      T6X = VSUB(T6V, T6W);
+			      T6T = VADD(T6L, T6S);
+			      T9u = VSUB(T6S, T6L);
+			      ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 158 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
+	       V T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
+	       V Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
+	       V T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
+	       V T1S, T23;
+	       {
+		    V Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
+		    V T10;
+		    {
+			 V T4, Tc, T7, Tb;
+			 T2 = LDW(&(W[0]));
+			 T5 = LDW(&(W[TWVL * 1]));
+			 T3 = LDW(&(W[TWVL * 2]));
+			 T6 = LDW(&(W[TWVL * 3]));
+			 T4 = VMUL(T2, T3);
+			 Tc = VMUL(T5, T3);
+			 T7 = VMUL(T5, T6);
+			 Tb = VMUL(T2, T6);
+			 T8 = VADD(T4, T7);
+			 TM = VSUB(T4, T7);
+			 TO = VADD(Tb, Tc);
+			 Td = VSUB(Tb, Tc);
+			 T9 = LDW(&(W[TWVL * 4]));
+			 Ts = VMUL(T2, T9);
+			 T1d = VMUL(T6, T9);
+			 Tx = VMUL(T5, T9);
+			 T18 = VMUL(T3, T9);
+			 Te = LDW(&(W[TWVL * 5]));
+			 Tt = VMUL(T5, Te);
+			 T1c = VMUL(T3, Te);
+			 Tw = VMUL(T2, Te);
+			 T19 = VMUL(T6, Te);
+			 Th = LDW(&(W[TWVL * 6]));
+			 TB = VMUL(T3, Th);
+			 T14 = VMUL(T5, Th);
+			 TG = VMUL(T6, Th);
+			 TZ = VMUL(T2, Th);
+			 Tl = LDW(&(W[TWVL * 7]));
+			 TC = VMUL(T6, Tl);
+			 T13 = VMUL(T2, Tl);
+			 TF = VMUL(T3, Tl);
+			 T10 = VMUL(T5, Tl);
+		    }
+		    TD = VADD(TB, TC);
+		    TH = VSUB(TF, TG);
+		    T1y = VADD(TZ, T10);
+		    T1H = VADD(TF, TG);
+		    T15 = VADD(T13, T14);
+		    T1A = VSUB(T13, T14);
+		    T11 = VSUB(TZ, T10);
+		    T1F = VSUB(TB, TC);
+		    T1n = VFMA(T9, Th, VMUL(Te, Tl));
+		    T1p = VFNMS(Te, Th, VMUL(T9, Tl));
+		    {
+			 V T2o, T2p, T2s, T2t;
+			 T2o = VMUL(T8, Th);
+			 T2p = VMUL(Td, Tl);
+			 T2q = VADD(T2o, T2p);
+			 T2I = VSUB(T2o, T2p);
+			 T2s = VMUL(T8, Tl);
+			 T2t = VMUL(Td, Th);
+			 T2u = VSUB(T2s, T2t);
+			 T2K = VADD(T2s, T2t);
+		    }
+		    {
+			 V T2T, T2U, T2X, T2Y;
+			 T2T = VMUL(TM, Th);
+			 T2U = VMUL(TO, Tl);
+			 T2V = VSUB(T2T, T2U);
+			 T3b = VADD(T2T, T2U);
+			 T2X = VMUL(TM, Tl);
+			 T2Y = VMUL(TO, Th);
+			 T2Z = VADD(T2X, T2Y);
+			 T3d = VSUB(T2X, T2Y);
+			 Tu = VADD(Ts, Tt);
+			 Ty = VSUB(Tw, Tx);
+			 T3l = VFMA(Tu, Th, VMUL(Ty, Tl));
+			 T3n = VFNMS(Ty, Th, VMUL(Tu, Tl));
+		    }
+		    T1t = VSUB(Ts, Tt);
+		    T1v = VADD(Tw, Tx);
+		    T2f = VFMA(T1t, Th, VMUL(T1v, Tl));
+		    T2h = VFNMS(T1v, Th, VMUL(T1t, Tl));
+		    T1a = VSUB(T18, T19);
+		    T1e = VADD(T1c, T1d);
+		    T32 = VFMA(T1a, Th, VMUL(T1e, Tl));
+		    T34 = VFNMS(T1e, Th, VMUL(T1a, Tl));
+		    T1W = VADD(T18, T19);
+		    T1Y = VSUB(T1c, T1d);
+		    T2C = VFMA(T1W, Th, VMUL(T1Y, Tl));
+		    T2E = VFNMS(T1Y, Th, VMUL(T1W, Tl));
+		    {
+			 V Ta, Tf, Ti, Tj;
+			 Ta = VMUL(T8, T9);
+			 Tf = VMUL(Td, Te);
+			 Tg = VSUB(Ta, Tf);
+			 TR = VADD(Ta, Tf);
+			 Ti = VMUL(T8, Te);
+			 Tj = VMUL(Td, T9);
+			 Tk = VADD(Ti, Tj);
+			 TS = VSUB(Ti, Tj);
+		    }
+		    Tm = VFMA(Tg, Th, VMUL(Tk, Tl));
+		    TV = VFNMS(TS, Th, VMUL(TR, Tl));
+		    To = VFNMS(Tk, Th, VMUL(Tg, Tl));
+		    TT = VFMA(TR, Th, VMUL(TS, Tl));
+		    {
+			 V T1K, T1L, T1N, T1O;
+			 T1K = VMUL(TM, T9);
+			 T1L = VMUL(TO, Te);
+			 T1M = VSUB(T1K, T1L);
+			 T21 = VADD(T1K, T1L);
+			 T1N = VMUL(TM, Te);
+			 T1O = VMUL(TO, T9);
+			 T1P = VADD(T1N, T1O);
+			 T22 = VSUB(T1N, T1O);
+		    }
+		    T1Q = VFMA(T1M, Th, VMUL(T1P, Tl));
+		    T25 = VFNMS(T22, Th, VMUL(T21, Tl));
+		    T1S = VFNMS(T1P, Th, VMUL(T1M, Tl));
+		    T23 = VFMA(T21, Th, VMUL(T22, Tl));
+	       }
+	       {
+		    V TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
+		    V T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
+		    V T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
+		    V T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
+		    V T4W, T5R, T55, T5O;
+		    {
+			 V T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
+			 T1 = LD(&(ri[0]), ms, &(ri[0]));
+			 T7G = LD(&(ii[0]), ms, &(ii[0]));
+			 Tn = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
+			 Tp = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
+			 Tq = VFMA(Tm, Tn, VMUL(To, Tp));
+			 T7F = VFNMS(To, Tn, VMUL(Tm, Tp));
+			 {
+			      V Tv, Tz, TE, TI;
+			      Tv = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
+			      Tz = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
+			      TA = VFMA(Tu, Tv, VMUL(Ty, Tz));
+			      T3C = VFNMS(Ty, Tv, VMUL(Tu, Tz));
+			      TE = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
+			      TI = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
+			      TJ = VFMA(TD, TE, VMUL(TH, TI));
+			      T3D = VFNMS(TH, TE, VMUL(TD, TI));
+			 }
+			 {
+			      V Tr, TK, T8a, T8b;
+			      Tr = VADD(T1, Tq);
+			      TK = VADD(TA, TJ);
+			      TL = VADD(Tr, TK);
+			      T6f = VSUB(Tr, TK);
+			      T8a = VSUB(T7G, T7F);
+			      T8b = VSUB(TA, TJ);
+			      T8c = VSUB(T8a, T8b);
+			      T8q = VADD(T8b, T8a);
+			 }
+			 {
+			      V T3B, T3E, T7E, T7H;
+			      T3B = VSUB(T1, Tq);
+			      T3E = VSUB(T3C, T3D);
+			      T3F = VSUB(T3B, T3E);
+			      T5t = VADD(T3B, T3E);
+			      T7E = VADD(T3C, T3D);
+			      T7H = VADD(T7F, T7G);
+			      T7I = VADD(T7E, T7H);
+			      T7W = VSUB(T7H, T7E);
+			 }
+		    }
+		    {
+			 V T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
+			 {
+			      V T2c, T2d, T2r, T2v;
+			      T2c = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			      T2d = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			      T2e = VFMA(T2, T2c, VMUL(T5, T2d));
+			      T4g = VFNMS(T5, T2c, VMUL(T2, T2d));
+			      T2r = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
+			      T2v = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
+			      T2w = VFMA(T2q, T2r, VMUL(T2u, T2v));
+			      T4z = VFNMS(T2u, T2r, VMUL(T2q, T2v));
+			 }
+			 {
+			      V T2g, T2i, T2l, T2m;
+			      T2g = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
+			      T2i = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
+			      T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
+			      T4h = VFNMS(T2h, T2g, VMUL(T2f, T2i));
+			      T2l = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
+			      T2m = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
+			      T2n = VFMA(T9, T2l, VMUL(Te, T2m));
+			      T4y = VFNMS(Te, T2l, VMUL(T9, T2m));
+			 }
+			 {
+			      V T2k, T2x, T6w, T6x;
+			      T2k = VADD(T2e, T2j);
+			      T2x = VADD(T2n, T2w);
+			      T2y = VADD(T2k, T2x);
+			      T6B = VSUB(T2k, T2x);
+			      T6w = VADD(T4g, T4h);
+			      T6x = VADD(T4y, T4z);
+			      T6y = VSUB(T6w, T6x);
+			      T7j = VADD(T6w, T6x);
+			 }
+			 {
+			      V T4i, T4j, T4x, T4A;
+			      T4i = VSUB(T4g, T4h);
+			      T4j = VSUB(T2n, T2w);
+			      T4k = VADD(T4i, T4j);
+			      T5J = VSUB(T4i, T4j);
+			      T4x = VSUB(T2e, T2j);
+			      T4A = VSUB(T4y, T4z);
+			      T4B = VSUB(T4x, T4A);
+			      T5G = VADD(T4x, T4A);
+			 }
+		    }
+		    {
+			 V T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
+			 {
+			      V T2W, T30, T3c, T3e;
+			      T2W = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
+			      T30 = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
+			      T31 = VFMA(T2V, T2W, VMUL(T2Z, T30));
+			      T4Y = VFNMS(T2Z, T2W, VMUL(T2V, T30));
+			      T3c = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
+			      T3e = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
+			      T3f = VFMA(T3b, T3c, VMUL(T3d, T3e));
+			      T4J = VFNMS(T3d, T3c, VMUL(T3b, T3e));
+			 }
+			 {
+			      V T33, T35, T38, T39;
+			      T33 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
+			      T35 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
+			      T36 = VFMA(T32, T33, VMUL(T34, T35));
+			      T4Z = VFNMS(T34, T33, VMUL(T32, T35));
+			      T38 = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			      T39 = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			      T3a = VFMA(TR, T38, VMUL(TS, T39));
+			      T4I = VFNMS(TS, T38, VMUL(TR, T39));
+			 }
+			 {
+			      V T37, T3g, T6M, T6N;
+			      T37 = VADD(T31, T36);
+			      T3g = VADD(T3a, T3f);
+			      T3h = VADD(T37, T3g);
+			      T6H = VSUB(T37, T3g);
+			      T6M = VADD(T4Y, T4Z);
+			      T6N = VADD(T4I, T4J);
+			      T6O = VSUB(T6M, T6N);
+			      T7o = VADD(T6M, T6N);
+			 }
+			 {
+			      V T4H, T4K, T50, T51;
+			      T4H = VSUB(T31, T36);
+			      T4K = VSUB(T4I, T4J);
+			      T4L = VSUB(T4H, T4K);
+			      T5N = VADD(T4H, T4K);
+			      T50 = VSUB(T4Y, T4Z);
+			      T51 = VSUB(T3a, T3f);
+			      T52 = VADD(T50, T51);
+			      T5Q = VSUB(T50, T51);
+			 }
+		    }
+		    {
+			 V TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
+			 {
+			      V TN, TP, T1b, T1f;
+			      TN = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			      TP = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			      TQ = VFMA(TM, TN, VMUL(TO, TP));
+			      T3G = VFNMS(TO, TN, VMUL(TM, TP));
+			      T1b = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
+			      T1f = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
+			      T1g = VFMA(T1a, T1b, VMUL(T1e, T1f));
+			      T3N = VFNMS(T1e, T1b, VMUL(T1a, T1f));
+			 }
+			 {
+			      V TU, TW, T12, T16;
+			      TU = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
+			      TW = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
+			      TX = VFMA(TT, TU, VMUL(TV, TW));
+			      T3H = VFNMS(TV, TU, VMUL(TT, TW));
+			      T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
+			      T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
+			      T17 = VFMA(T11, T12, VMUL(T15, T16));
+			      T3M = VFNMS(T15, T12, VMUL(T11, T16));
+			 }
+			 {
+			      V TY, T1h, T6g, T6h;
+			      TY = VADD(TQ, TX);
+			      T1h = VADD(T17, T1g);
+			      T1i = VADD(TY, T1h);
+			      T7V = VSUB(T1h, TY);
+			      T6g = VADD(T3G, T3H);
+			      T6h = VADD(T3M, T3N);
+			      T6i = VSUB(T6g, T6h);
+			      T7D = VADD(T6g, T6h);
+			 }
+			 {
+			      V T3I, T3J, T3L, T3O;
+			      T3I = VSUB(T3G, T3H);
+			      T3J = VSUB(TQ, TX);
+			      T3K = VSUB(T3I, T3J);
+			      T5u = VADD(T3J, T3I);
+			      T3L = VSUB(T17, T1g);
+			      T3O = VSUB(T3M, T3N);
+			      T3P = VADD(T3L, T3O);
+			      T5v = VSUB(T3L, T3O);
+			 }
+		    }
+		    {
+			 V T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
+			 {
+			      V T1k, T1l, T1z, T1B;
+			      T1k = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			      T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			      T1m = VFMA(T8, T1k, VMUL(Td, T1l));
+			      T3S = VFNMS(Td, T1k, VMUL(T8, T1l));
+			      T1z = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
+			      T1B = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
+			      T1C = VFMA(T1y, T1z, VMUL(T1A, T1B));
+			      T3Z = VFNMS(T1A, T1z, VMUL(T1y, T1B));
+			 }
+			 {
+			      V T1o, T1q, T1u, T1w;
+			      T1o = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
+			      T1q = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
+			      T1r = VFMA(T1n, T1o, VMUL(T1p, T1q));
+			      T3T = VFNMS(T1p, T1o, VMUL(T1n, T1q));
+			      T1u = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
+			      T1w = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
+			      T1x = VFMA(T1t, T1u, VMUL(T1v, T1w));
+			      T3Y = VFNMS(T1v, T1u, VMUL(T1t, T1w));
+			 }
+			 {
+			      V T1s, T1D, T6k, T6l;
+			      T1s = VADD(T1m, T1r);
+			      T1D = VADD(T1x, T1C);
+			      T1E = VADD(T1s, T1D);
+			      T6n = VSUB(T1s, T1D);
+			      T6k = VADD(T3S, T3T);
+			      T6l = VADD(T3Y, T3Z);
+			      T6m = VSUB(T6k, T6l);
+			      T7e = VADD(T6k, T6l);
+			 }
+			 {
+			      V T3U, T3V, T3X, T40;
+			      T3U = VSUB(T3S, T3T);
+			      T3V = VSUB(T1x, T1C);
+			      T3W = VADD(T3U, T3V);
+			      T5y = VSUB(T3U, T3V);
+			      T3X = VSUB(T1m, T1r);
+			      T40 = VSUB(T3Y, T3Z);
+			      T41 = VSUB(T3X, T40);
+			      T5z = VADD(T3X, T40);
+			 }
+		    }
+		    {
+			 V T1J, T43, T27, T4a, T1U, T44, T20, T49;
+			 {
+			      V T1G, T1I, T24, T26;
+			      T1G = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
+			      T1I = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
+			      T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
+			      T43 = VFNMS(T1H, T1G, VMUL(T1F, T1I));
+			      T24 = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
+			      T26 = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
+			      T27 = VFMA(T23, T24, VMUL(T25, T26));
+			      T4a = VFNMS(T25, T24, VMUL(T23, T26));
+			 }
+			 {
+			      V T1R, T1T, T1X, T1Z;
+			      T1R = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
+			      T1T = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
+			      T1U = VFMA(T1Q, T1R, VMUL(T1S, T1T));
+			      T44 = VFNMS(T1S, T1R, VMUL(T1Q, T1T));
+			      T1X = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			      T1Z = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			      T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
+			      T49 = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
+			 }
+			 {
+			      V T1V, T28, T6q, T6r;
+			      T1V = VADD(T1J, T1U);
+			      T28 = VADD(T20, T27);
+			      T29 = VADD(T1V, T28);
+			      T6p = VSUB(T1V, T28);
+			      T6q = VADD(T43, T44);
+			      T6r = VADD(T49, T4a);
+			      T6s = VSUB(T6q, T6r);
+			      T7f = VADD(T6q, T6r);
+			 }
+			 {
+			      V T45, T46, T48, T4b;
+			      T45 = VSUB(T43, T44);
+			      T46 = VSUB(T20, T27);
+			      T47 = VADD(T45, T46);
+			      T5B = VSUB(T45, T46);
+			      T48 = VSUB(T1J, T1U);
+			      T4b = VSUB(T49, T4a);
+			      T4c = VSUB(T48, T4b);
+			      T5C = VADD(T48, T4b);
+			 }
+		    }
+		    {
+			 V T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
+			 {
+			      V T2z, T2A, T2D, T2F;
+			      T2z = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			      T2A = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			      T2B = VFMA(T21, T2z, VMUL(T22, T2A));
+			      T4r = VFNMS(T22, T2z, VMUL(T21, T2A));
+			      T2D = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
+			      T2F = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
+			      T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
+			      T4s = VFNMS(T2E, T2D, VMUL(T2C, T2F));
+			 }
+			 T4q = VSUB(T2B, T2G);
+			 T4t = VSUB(T4r, T4s);
+			 {
+			      V T2J, T2L, T2N, T2O;
+			      T2J = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
+			      T2L = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
+			      T2M = VFMA(T2I, T2J, VMUL(T2K, T2L));
+			      T4m = VFNMS(T2K, T2J, VMUL(T2I, T2L));
+			      T2N = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
+			      T2O = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
+			      T2P = VFMA(T1M, T2N, VMUL(T1P, T2O));
+			      T4n = VFNMS(T1P, T2N, VMUL(T1M, T2O));
+			 }
+			 T4l = VSUB(T2M, T2P);
+			 T4o = VSUB(T4m, T4n);
+			 {
+			      V T2H, T2Q, T6C, T6D;
+			      T2H = VADD(T2B, T2G);
+			      T2Q = VADD(T2M, T2P);
+			      T2R = VADD(T2H, T2Q);
+			      T6z = VSUB(T2Q, T2H);
+			      T6C = VADD(T4r, T4s);
+			      T6D = VADD(T4m, T4n);
+			      T6E = VSUB(T6C, T6D);
+			      T7k = VADD(T6C, T6D);
+			 }
+			 {
+			      V T4p, T4u, T4C, T4D;
+			      T4p = VSUB(T4l, T4o);
+			      T4u = VADD(T4q, T4t);
+			      T4v = VMUL(LDK(KP707106781), VSUB(T4p, T4u));
+			      T5H = VMUL(LDK(KP707106781), VADD(T4u, T4p));
+			      T4C = VSUB(T4t, T4q);
+			      T4D = VADD(T4l, T4o);
+			      T4E = VMUL(LDK(KP707106781), VSUB(T4C, T4D));
+			      T5K = VMUL(LDK(KP707106781), VADD(T4C, T4D));
+			 }
+		    }
+		    {
+			 V T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
+			 {
+			      V T3i, T3j, T3m, T3o;
+			      T3i = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			      T3j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			      T3k = VFMA(T3, T3i, VMUL(T6, T3j));
+			      T4M = VFNMS(T6, T3i, VMUL(T3, T3j));
+			      T3m = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
+			      T3o = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
+			      T3p = VFMA(T3l, T3m, VMUL(T3n, T3o));
+			      T4N = VFNMS(T3n, T3m, VMUL(T3l, T3o));
+			 }
+			 T4O = VSUB(T4M, T4N);
+			 T4P = VSUB(T3k, T3p);
+			 {
+			      V T3r, T3s, T3u, T3v;
+			      T3r = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
+			      T3s = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
+			      T3t = VFMA(Th, T3r, VMUL(Tl, T3s));
+			      T4S = VFNMS(Tl, T3r, VMUL(Th, T3s));
+			      T3u = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
+			      T3v = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
+			      T3w = VFMA(Tg, T3u, VMUL(Tk, T3v));
+			      T4T = VFNMS(Tk, T3u, VMUL(Tg, T3v));
+			 }
+			 T4R = VSUB(T3t, T3w);
+			 T4U = VSUB(T4S, T4T);
+			 {
+			      V T3q, T3x, T6I, T6J;
+			      T3q = VADD(T3k, T3p);
+			      T3x = VADD(T3t, T3w);
+			      T3y = VADD(T3q, T3x);
+			      T6P = VSUB(T3x, T3q);
+			      T6I = VADD(T4M, T4N);
+			      T6J = VADD(T4S, T4T);
+			      T6K = VSUB(T6I, T6J);
+			      T7p = VADD(T6I, T6J);
+			 }
+			 {
+			      V T4Q, T4V, T53, T54;
+			      T4Q = VSUB(T4O, T4P);
+			      T4V = VADD(T4R, T4U);
+			      T4W = VMUL(LDK(KP707106781), VSUB(T4Q, T4V));
+			      T5R = VMUL(LDK(KP707106781), VADD(T4Q, T4V));
+			      T53 = VSUB(T4R, T4U);
+			      T54 = VADD(T4P, T4O);
+			      T55 = VMUL(LDK(KP707106781), VSUB(T53, T54));
+			      T5O = VMUL(LDK(KP707106781), VADD(T54, T53));
+			 }
+		    }
+		    {
+			 V T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
+			 {
+			      V T1j, T2a, T7C, T7J;
+			      T1j = VADD(TL, T1i);
+			      T2a = VADD(T1E, T29);
+			      T2b = VADD(T1j, T2a);
+			      T7x = VSUB(T1j, T2a);
+			      T7C = VADD(T7e, T7f);
+			      T7J = VADD(T7D, T7I);
+			      T7K = VADD(T7C, T7J);
+			      T7M = VSUB(T7J, T7C);
+			 }
+			 {
+			      V T2S, T3z, T7y, T7z;
+			      T2S = VADD(T2y, T2R);
+			      T3z = VADD(T3h, T3y);
+			      T3A = VADD(T2S, T3z);
+			      T7L = VSUB(T3z, T2S);
+			      T7y = VADD(T7j, T7k);
+			      T7z = VADD(T7o, T7p);
+			      T7A = VSUB(T7y, T7z);
+			      T7B = VADD(T7y, T7z);
+			 }
+			 ST(&(ri[WS(rs, 16)]), VSUB(T2b, T3A), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 16)]), VSUB(T7K, T7B), ms, &(ii[0]));
+			 ST(&(ri[0]), VADD(T2b, T3A), ms, &(ri[0]));
+			 ST(&(ii[0]), VADD(T7B, T7K), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 24)]), VSUB(T7x, T7A), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 24)]), VSUB(T7M, T7L), ms, &(ii[0]));
+			 ST(&(ri[WS(rs, 8)]), VADD(T7x, T7A), ms, &(ri[0]));
+			 ST(&(ii[WS(rs, 8)]), VADD(T7L, T7M), ms, &(ii[0]));
+		    }
+		    {
+			 V T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
+			 {
+			      V T7d, T7g, T7O, T7P;
+			      T7d = VSUB(TL, T1i);
+			      T7g = VSUB(T7e, T7f);
+			      T7h = VADD(T7d, T7g);
+			      T7t = VSUB(T7d, T7g);
+			      T7O = VSUB(T29, T1E);
+			      T7P = VSUB(T7I, T7D);
+			      T7Q = VADD(T7O, T7P);
+			      T7S = VSUB(T7P, T7O);
+			 }
+			 {
+			      V T7i, T7l, T7n, T7q;
+			      T7i = VSUB(T2y, T2R);
+			      T7l = VSUB(T7j, T7k);
+			      T7m = VADD(T7i, T7l);
+			      T7u = VSUB(T7l, T7i);
+			      T7n = VSUB(T3h, T3y);
+			      T7q = VSUB(T7o, T7p);
+			      T7r = VSUB(T7n, T7q);
+			      T7v = VADD(T7n, T7q);
+			 }
+			 {
+			      V T7s, T7N, T7w, T7R;
+			      T7s = VMUL(LDK(KP707106781), VADD(T7m, T7r));
+			      ST(&(ri[WS(rs, 20)]), VSUB(T7h, T7s), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 4)]), VADD(T7h, T7s), ms, &(ri[0]));
+			      T7N = VMUL(LDK(KP707106781), VADD(T7u, T7v));
+			      ST(&(ii[WS(rs, 4)]), VADD(T7N, T7Q), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 20)]), VSUB(T7Q, T7N), ms, &(ii[0]));
+			      T7w = VMUL(LDK(KP707106781), VSUB(T7u, T7v));
+			      ST(&(ri[WS(rs, 28)]), VSUB(T7t, T7w), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 12)]), VADD(T7t, T7w), ms, &(ri[0]));
+			      T7R = VMUL(LDK(KP707106781), VSUB(T7r, T7m));
+			      ST(&(ii[WS(rs, 12)]), VADD(T7R, T7S), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 28)]), VSUB(T7S, T7R), ms, &(ii[0]));
+			 }
+		    }
+		    {
+			 V T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
+			 V T6V;
+			 {
+			      V T6o, T6t, T6A, T6F;
+			      T6j = VSUB(T6f, T6i);
+			      T7X = VADD(T7V, T7W);
+			      T83 = VSUB(T7W, T7V);
+			      T6X = VADD(T6f, T6i);
+			      T6o = VSUB(T6m, T6n);
+			      T6t = VADD(T6p, T6s);
+			      T6u = VMUL(LDK(KP707106781), VSUB(T6o, T6t));
+			      T7U = VMUL(LDK(KP707106781), VADD(T6o, T6t));
+			      {
+				   V T75, T76, T6Y, T6Z;
+				   T75 = VADD(T6H, T6K);
+				   T76 = VADD(T6O, T6P);
+				   T77 = VFNMS(LDK(KP382683432), T76, VMUL(LDK(KP923879532), T75));
+				   T7b = VFMA(LDK(KP923879532), T76, VMUL(LDK(KP382683432), T75));
+				   T6Y = VADD(T6n, T6m);
+				   T6Z = VSUB(T6p, T6s);
+				   T70 = VMUL(LDK(KP707106781), VADD(T6Y, T6Z));
+				   T82 = VMUL(LDK(KP707106781), VSUB(T6Z, T6Y));
+			      }
+			      T6A = VSUB(T6y, T6z);
+			      T6F = VSUB(T6B, T6E);
+			      T6G = VFMA(LDK(KP923879532), T6A, VMUL(LDK(KP382683432), T6F));
+			      T6U = VFNMS(LDK(KP923879532), T6F, VMUL(LDK(KP382683432), T6A));
+			      {
+				   V T72, T73, T6L, T6Q;
+				   T72 = VADD(T6y, T6z);
+				   T73 = VADD(T6B, T6E);
+				   T74 = VFMA(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T73));
+				   T7a = VFNMS(LDK(KP382683432), T73, VMUL(LDK(KP923879532), T72));
+				   T6L = VSUB(T6H, T6K);
+				   T6Q = VSUB(T6O, T6P);
+				   T6R = VFNMS(LDK(KP923879532), T6Q, VMUL(LDK(KP382683432), T6L));
+				   T6V = VFMA(LDK(KP382683432), T6Q, VMUL(LDK(KP923879532), T6L));
+			      }
+			 }
+			 {
+			      V T6v, T6S, T81, T84;
+			      T6v = VADD(T6j, T6u);
+			      T6S = VADD(T6G, T6R);
+			      ST(&(ri[WS(rs, 22)]), VSUB(T6v, T6S), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 6)]), VADD(T6v, T6S), ms, &(ri[0]));
+			      T81 = VADD(T6U, T6V);
+			      T84 = VADD(T82, T83);
+			      ST(&(ii[WS(rs, 6)]), VADD(T81, T84), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 22)]), VSUB(T84, T81), ms, &(ii[0]));
+			 }
+			 {
+			      V T6T, T6W, T85, T86;
+			      T6T = VSUB(T6j, T6u);
+			      T6W = VSUB(T6U, T6V);
+			      ST(&(ri[WS(rs, 30)]), VSUB(T6T, T6W), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 14)]), VADD(T6T, T6W), ms, &(ri[0]));
+			      T85 = VSUB(T6R, T6G);
+			      T86 = VSUB(T83, T82);
+			      ST(&(ii[WS(rs, 14)]), VADD(T85, T86), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 30)]), VSUB(T86, T85), ms, &(ii[0]));
+			 }
+			 {
+			      V T71, T78, T7T, T7Y;
+			      T71 = VADD(T6X, T70);
+			      T78 = VADD(T74, T77);
+			      ST(&(ri[WS(rs, 18)]), VSUB(T71, T78), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 2)]), VADD(T71, T78), ms, &(ri[0]));
+			      T7T = VADD(T7a, T7b);
+			      T7Y = VADD(T7U, T7X);
+			      ST(&(ii[WS(rs, 2)]), VADD(T7T, T7Y), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 18)]), VSUB(T7Y, T7T), ms, &(ii[0]));
+			 }
+			 {
+			      V T79, T7c, T7Z, T80;
+			      T79 = VSUB(T6X, T70);
+			      T7c = VSUB(T7a, T7b);
+			      ST(&(ri[WS(rs, 26)]), VSUB(T79, T7c), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 10)]), VADD(T79, T7c), ms, &(ri[0]));
+			      T7Z = VSUB(T77, T74);
+			      T80 = VSUB(T7X, T7U);
+			      ST(&(ii[WS(rs, 10)]), VADD(T7Z, T80), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 26)]), VSUB(T80, T7Z), ms, &(ii[0]));
+			 }
+		    }
+		    {
+			 V T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
+			 V T5b, T3Q, T8p;
+			 T3Q = VMUL(LDK(KP707106781), VSUB(T3K, T3P));
+			 T3R = VSUB(T3F, T3Q);
+			 T5d = VADD(T3F, T3Q);
+			 T8p = VMUL(LDK(KP707106781), VSUB(T5v, T5u));
+			 T8r = VADD(T8p, T8q);
+			 T8x = VSUB(T8q, T8p);
+			 {
+			      V T42, T4d, T5l, T5m;
+			      T42 = VFNMS(LDK(KP923879532), T41, VMUL(LDK(KP382683432), T3W));
+			      T4d = VFMA(LDK(KP382683432), T47, VMUL(LDK(KP923879532), T4c));
+			      T4e = VSUB(T42, T4d);
+			      T8o = VADD(T42, T4d);
+			      T5l = VADD(T4L, T4W);
+			      T5m = VADD(T52, T55);
+			      T5n = VFNMS(LDK(KP555570233), T5m, VMUL(LDK(KP831469612), T5l));
+			      T5r = VFMA(LDK(KP831469612), T5m, VMUL(LDK(KP555570233), T5l));
+			 }
+			 {
+			      V T4w, T4F, T5e, T5f;
+			      T4w = VSUB(T4k, T4v);
+			      T4F = VSUB(T4B, T4E);
+			      T4G = VFMA(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4F));
+			      T5a = VFNMS(LDK(KP980785280), T4F, VMUL(LDK(KP195090322), T4w));
+			      T5e = VFMA(LDK(KP923879532), T3W, VMUL(LDK(KP382683432), T41));
+			      T5f = VFNMS(LDK(KP923879532), T47, VMUL(LDK(KP382683432), T4c));
+			      T5g = VADD(T5e, T5f);
+			      T8w = VSUB(T5f, T5e);
+			 }
+			 {
+			      V T5i, T5j, T4X, T56;
+			      T5i = VADD(T4k, T4v);
+			      T5j = VADD(T4B, T4E);
+			      T5k = VFMA(LDK(KP555570233), T5i, VMUL(LDK(KP831469612), T5j));
+			      T5q = VFNMS(LDK(KP555570233), T5j, VMUL(LDK(KP831469612), T5i));
+			      T4X = VSUB(T4L, T4W);
+			      T56 = VSUB(T52, T55);
+			      T57 = VFNMS(LDK(KP980785280), T56, VMUL(LDK(KP195090322), T4X));
+			      T5b = VFMA(LDK(KP195090322), T56, VMUL(LDK(KP980785280), T4X));
+			 }
+			 {
+			      V T4f, T58, T8v, T8y;
+			      T4f = VADD(T3R, T4e);
+			      T58 = VADD(T4G, T57);
+			      ST(&(ri[WS(rs, 23)]), VSUB(T4f, T58), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 7)]), VADD(T4f, T58), ms, &(ri[WS(rs, 1)]));
+			      T8v = VADD(T5a, T5b);
+			      T8y = VADD(T8w, T8x);
+			      ST(&(ii[WS(rs, 7)]), VADD(T8v, T8y), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 23)]), VSUB(T8y, T8v), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T59, T5c, T8z, T8A;
+			      T59 = VSUB(T3R, T4e);
+			      T5c = VSUB(T5a, T5b);
+			      ST(&(ri[WS(rs, 31)]), VSUB(T59, T5c), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 15)]), VADD(T59, T5c), ms, &(ri[WS(rs, 1)]));
+			      T8z = VSUB(T57, T4G);
+			      T8A = VSUB(T8x, T8w);
+			      ST(&(ii[WS(rs, 15)]), VADD(T8z, T8A), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 31)]), VSUB(T8A, T8z), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T5h, T5o, T8n, T8s;
+			      T5h = VADD(T5d, T5g);
+			      T5o = VADD(T5k, T5n);
+			      ST(&(ri[WS(rs, 19)]), VSUB(T5h, T5o), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 3)]), VADD(T5h, T5o), ms, &(ri[WS(rs, 1)]));
+			      T8n = VADD(T5q, T5r);
+			      T8s = VADD(T8o, T8r);
+			      ST(&(ii[WS(rs, 3)]), VADD(T8n, T8s), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 19)]), VSUB(T8s, T8n), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T5p, T5s, T8t, T8u;
+			      T5p = VSUB(T5d, T5g);
+			      T5s = VSUB(T5q, T5r);
+			      ST(&(ri[WS(rs, 27)]), VSUB(T5p, T5s), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 11)]), VADD(T5p, T5s), ms, &(ri[WS(rs, 1)]));
+			      T8t = VSUB(T5n, T5k);
+			      T8u = VSUB(T8r, T8o);
+			      ST(&(ii[WS(rs, 11)]), VADD(T8t, T8u), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 27)]), VSUB(T8u, T8t), ms, &(ii[WS(rs, 1)]));
+			 }
+		    }
+		    {
+			 V T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
+			 V T5X, T5w, T89;
+			 T5w = VMUL(LDK(KP707106781), VADD(T5u, T5v));
+			 T5x = VSUB(T5t, T5w);
+			 T5Z = VADD(T5t, T5w);
+			 T89 = VMUL(LDK(KP707106781), VADD(T3K, T3P));
+			 T8d = VADD(T89, T8c);
+			 T8j = VSUB(T8c, T89);
+			 {
+			      V T5A, T5D, T67, T68;
+			      T5A = VFNMS(LDK(KP382683432), T5z, VMUL(LDK(KP923879532), T5y));
+			      T5D = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5C));
+			      T5E = VSUB(T5A, T5D);
+			      T88 = VADD(T5A, T5D);
+			      T67 = VADD(T5N, T5O);
+			      T68 = VADD(T5Q, T5R);
+			      T69 = VFNMS(LDK(KP195090322), T68, VMUL(LDK(KP980785280), T67));
+			      T6d = VFMA(LDK(KP195090322), T67, VMUL(LDK(KP980785280), T68));
+			 }
+			 {
+			      V T5I, T5L, T60, T61;
+			      T5I = VSUB(T5G, T5H);
+			      T5L = VSUB(T5J, T5K);
+			      T5M = VFMA(LDK(KP555570233), T5I, VMUL(LDK(KP831469612), T5L));
+			      T5W = VFNMS(LDK(KP831469612), T5I, VMUL(LDK(KP555570233), T5L));
+			      T60 = VFMA(LDK(KP382683432), T5y, VMUL(LDK(KP923879532), T5z));
+			      T61 = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5C));
+			      T62 = VADD(T60, T61);
+			      T8i = VSUB(T61, T60);
+			 }
+			 {
+			      V T64, T65, T5P, T5S;
+			      T64 = VADD(T5G, T5H);
+			      T65 = VADD(T5J, T5K);
+			      T66 = VFMA(LDK(KP980785280), T64, VMUL(LDK(KP195090322), T65));
+			      T6c = VFNMS(LDK(KP195090322), T64, VMUL(LDK(KP980785280), T65));
+			      T5P = VSUB(T5N, T5O);
+			      T5S = VSUB(T5Q, T5R);
+			      T5T = VFNMS(LDK(KP831469612), T5S, VMUL(LDK(KP555570233), T5P));
+			      T5X = VFMA(LDK(KP831469612), T5P, VMUL(LDK(KP555570233), T5S));
+			 }
+			 {
+			      V T5F, T5U, T8h, T8k;
+			      T5F = VADD(T5x, T5E);
+			      T5U = VADD(T5M, T5T);
+			      ST(&(ri[WS(rs, 21)]), VSUB(T5F, T5U), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 5)]), VADD(T5F, T5U), ms, &(ri[WS(rs, 1)]));
+			      T8h = VADD(T5W, T5X);
+			      T8k = VADD(T8i, T8j);
+			      ST(&(ii[WS(rs, 5)]), VADD(T8h, T8k), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 21)]), VSUB(T8k, T8h), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T5V, T5Y, T8l, T8m;
+			      T5V = VSUB(T5x, T5E);
+			      T5Y = VSUB(T5W, T5X);
+			      ST(&(ri[WS(rs, 29)]), VSUB(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 13)]), VADD(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
+			      T8l = VSUB(T5T, T5M);
+			      T8m = VSUB(T8j, T8i);
+			      ST(&(ii[WS(rs, 13)]), VADD(T8l, T8m), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 29)]), VSUB(T8m, T8l), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T63, T6a, T87, T8e;
+			      T63 = VADD(T5Z, T62);
+			      T6a = VADD(T66, T69);
+			      ST(&(ri[WS(rs, 17)]), VSUB(T63, T6a), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 1)]), VADD(T63, T6a), ms, &(ri[WS(rs, 1)]));
+			      T87 = VADD(T6c, T6d);
+			      T8e = VADD(T88, T8d);
+			      ST(&(ii[WS(rs, 1)]), VADD(T87, T8e), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 17)]), VSUB(T8e, T87), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V T6b, T6e, T8f, T8g;
+			      T6b = VSUB(T5Z, T62);
+			      T6e = VSUB(T6c, T6d);
+			      ST(&(ri[WS(rs, 25)]), VSUB(T6b, T6e), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 9)]), VADD(T6b, T6e), ms, &(ri[WS(rs, 1)]));
+			      T8f = VSUB(T69, T66);
+			      T8g = VSUB(T8d, T88);
+			      ST(&(ii[WS(rs, 9)]), VADD(T8f, T8g), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 25)]), VSUB(T8g, T8f), ms, &(ii[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {376, 168, 112, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_32) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:26 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include ts.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 37 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T6, T3, T5, T1, Tx, T8, Tc, Tf, Ta, T4, Th, Tj, Tl;
+	       T2 = LDW(&(W[0]));
+	       T6 = LDW(&(W[TWVL * 3]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T5 = LDW(&(W[TWVL * 1]));
+	       T1 = LD(&(ri[0]), ms, &(ri[0]));
+	       Tx = LD(&(ii[0]), ms, &(ii[0]));
+	       T8 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+	       Tc = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+	       Tf = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+	       Ta = VMUL(T2, T6);
+	       T4 = VMUL(T2, T3);
+	       Th = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+	       Tj = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+	       Tl = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+	       {
+		    V Tg, Tb, T7, Tp, Tk, Tr, Ti;
+		    Tg = VMUL(T2, Tf);
+		    Tb = VFNMS(T5, T3, Ta);
+		    T7 = VFMA(T5, T6, T4);
+		    Tp = VMUL(T2, Th);
+		    Tk = VMUL(T3, Tj);
+		    Tr = VMUL(T3, Tl);
+		    Ti = VFMA(T5, Th, Tg);
+		    {
+			 V Tv, T9, Tq, Tm, Ts, Tw, Td;
+			 Tv = VMUL(T7, Tc);
+			 T9 = VMUL(T7, T8);
+			 Tq = VFNMS(T5, Tf, Tp);
+			 Tm = VFMA(T6, Tl, Tk);
+			 Ts = VFNMS(T6, Tj, Tr);
+			 Tw = VFNMS(Tb, T8, Tv);
+			 Td = VFMA(Tb, Tc, T9);
+			 {
+			      V Tn, TA, Tu, Tt;
+			      Tn = VADD(Ti, Tm);
+			      TA = VSUB(Ti, Tm);
+			      Tu = VADD(Tq, Ts);
+			      Tt = VSUB(Tq, Ts);
+			      {
+				   V Ty, Tz, Te, To;
+				   Ty = VADD(Tw, Tx);
+				   Tz = VSUB(Tx, Tw);
+				   Te = VADD(T1, Td);
+				   To = VSUB(T1, Td);
+				   ST(&(ii[WS(rs, 3)]), VADD(TA, Tz), ms, &(ii[WS(rs, 1)]));
+				   ST(&(ii[WS(rs, 1)]), VSUB(Tz, TA), ms, &(ii[WS(rs, 1)]));
+				   ST(&(ii[WS(rs, 2)]), VSUB(Ty, Tu), ms, &(ii[0]));
+				   ST(&(ii[0]), VADD(Tu, Ty), ms, &(ii[0]));
+				   ST(&(ri[WS(rs, 1)]), VADD(To, Tt), ms, &(ri[WS(rs, 1)]));
+				   ST(&(ri[WS(rs, 3)]), VSUB(To, Tt), ms, &(ri[WS(rs, 1)]));
+				   ST(&(ri[0]), VADD(Te, Tn), ms, &(ri[0]));
+				   ST(&(ri[WS(rs, 2)]), VSUB(Te, Tn), ms, &(ri[0]));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include ts.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T4, T3, T5, T6, T8;
+	       T2 = LDW(&(W[0]));
+	       T4 = LDW(&(W[TWVL * 1]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T5 = LDW(&(W[TWVL * 3]));
+	       T6 = VFMA(T2, T3, VMUL(T4, T5));
+	       T8 = VFNMS(T4, T3, VMUL(T2, T5));
+	       {
+		    V T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
+		    T1 = LD(&(ri[0]), ms, &(ri[0]));
+		    Tp = LD(&(ii[0]), ms, &(ii[0]));
+		    T7 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+		    T9 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+		    Ta = VFMA(T6, T7, VMUL(T8, T9));
+		    To = VFNMS(T8, T7, VMUL(T6, T9));
+		    {
+			 V Tc, Td, Tf, Tg;
+			 Tc = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			 Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			 Te = VFMA(T2, Tc, VMUL(T4, Td));
+			 Tk = VFNMS(T4, Tc, VMUL(T2, Td));
+			 Tf = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			 Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			 Th = VFMA(T3, Tf, VMUL(T5, Tg));
+			 Tl = VFNMS(T5, Tf, VMUL(T3, Tg));
+		    }
+		    {
+			 V Tb, Ti, Tn, Tq;
+			 Tb = VADD(T1, Ta);
+			 Ti = VADD(Te, Th);
+			 ST(&(ri[WS(rs, 2)]), VSUB(Tb, Ti), ms, &(ri[0]));
+			 ST(&(ri[0]), VADD(Tb, Ti), ms, &(ri[0]));
+			 Tn = VADD(Tk, Tl);
+			 Tq = VADD(To, Tp);
+			 ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
+		    }
+		    {
+			 V Tj, Tm, Tr, Ts;
+			 Tj = VSUB(T1, Ta);
+			 Tm = VSUB(Tk, Tl);
+			 ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
+			 ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
+			 Tr = VSUB(Tp, To);
+			 Ts = VSUB(Te, Th);
+			 ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
+			 ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_4) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:26 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include ts.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 64 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T1m, T1l, T1k, T1u, T1n, T1o;
+	       {
+		    V T2, T3, Tl, Tn, T5, T6;
+		    T2 = LDW(&(W[0]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    Tl = LDW(&(W[TWVL * 4]));
+		    Tn = LDW(&(W[TWVL * 5]));
+		    T5 = LDW(&(W[TWVL * 1]));
+		    T6 = LDW(&(W[TWVL * 3]));
+		    {
+			 V T1, T1s, TK, T1r, Td, Tk, TG, TC, TY, Tu, TW, TL, TM, TO, TQ;
+			 V Tx, Tz, TD, TH;
+			 {
+			      V T8, T4, Tm, Tr, Tc, Ta;
+			      T1 = LD(&(ri[0]), ms, &(ri[0]));
+			      T1s = LD(&(ii[0]), ms, &(ii[0]));
+			      T8 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			      T4 = VMUL(T2, T3);
+			      Tm = VMUL(T2, Tl);
+			      Tr = VMUL(T2, Tn);
+			      Tc = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			      Ta = VMUL(T2, T6);
+			      {
+				   V Tp, Tt, Tg, T7, Tf, To, Ts, Ti, Tb, Tj;
+				   Tp = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+				   Tt = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+				   Tg = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+				   T7 = VFNMS(T5, T6, T4);
+				   Tf = VFMA(T5, T6, T4);
+				   To = VFMA(T5, Tn, Tm);
+				   Ts = VFNMS(T5, Tl, Tr);
+				   Ti = VFNMS(T5, T3, Ta);
+				   Tb = VFMA(T5, T3, Ta);
+				   Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+				   TK = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+				   {
+					V T1q, T9, Th, TF;
+					T1q = VMUL(T7, Tc);
+					T9 = VMUL(T7, T8);
+					Th = VMUL(Tf, Tg);
+					TF = VMUL(Tf, Tn);
+					{
+					     V TB, TX, Tq, TV;
+					     TB = VMUL(Tf, Tl);
+					     TX = VMUL(To, Tt);
+					     Tq = VMUL(To, Tp);
+					     TV = VMUL(Tf, Tj);
+					     T1r = VFNMS(Tb, T8, T1q);
+					     Td = VFMA(Tb, Tc, T9);
+					     Tk = VFMA(Ti, Tj, Th);
+					     TG = VFNMS(Ti, Tl, TF);
+					     TC = VFMA(Ti, Tn, TB);
+					     TY = VFNMS(Ts, Tp, TX);
+					     Tu = VFMA(Ts, Tt, Tq);
+					     TW = VFNMS(Ti, Tg, TV);
+					     TL = VMUL(Tl, TK);
+					}
+				   }
+				   TM = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+				   TO = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+				   TQ = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+				   Tx = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+				   Tz = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+				   TD = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+				   TH = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			      }
+			 }
+			 {
+			      V Te, T1p, T1g, T10, TS, T18, T1d, T1t, T1x, T1y, Tv, TJ, T11, T16;
+			      {
+				   V TN, T1a, TR, T1c, TA, T13, TI, T15;
+				   {
+					V TU, T19, TP, T1b, Ty, T12, TE, T14, TZ;
+					TU = VSUB(T1, Td);
+					Te = VADD(T1, Td);
+					TN = VFMA(Tn, TM, TL);
+					T19 = VMUL(Tl, TM);
+					TP = VMUL(T3, TO);
+					T1b = VMUL(T3, TQ);
+					Ty = VMUL(T2, Tx);
+					T12 = VMUL(T2, Tz);
+					TE = VMUL(TC, TD);
+					T14 = VMUL(TC, TH);
+					T1p = VADD(TW, TY);
+					TZ = VSUB(TW, TY);
+					T1a = VFNMS(Tn, TK, T19);
+					TR = VFMA(T6, TQ, TP);
+					T1c = VFNMS(T6, TO, T1b);
+					TA = VFMA(T5, Tz, Ty);
+					T13 = VFNMS(T5, Tx, T12);
+					TI = VFMA(TG, TH, TE);
+					T15 = VFNMS(TG, TD, T14);
+					T1g = VSUB(TU, TZ);
+					T10 = VADD(TU, TZ);
+				   }
+				   TS = VADD(TN, TR);
+				   T18 = VSUB(TN, TR);
+				   T1d = VSUB(T1a, T1c);
+				   T1m = VADD(T1a, T1c);
+				   T1t = VADD(T1r, T1s);
+				   T1x = VSUB(T1s, T1r);
+				   T1y = VSUB(Tk, Tu);
+				   Tv = VADD(Tk, Tu);
+				   TJ = VADD(TA, TI);
+				   T11 = VSUB(TA, TI);
+				   T16 = VSUB(T13, T15);
+				   T1l = VADD(T13, T15);
+			      }
+			      {
+				   V Tw, T1w, T1v, TT;
+				   {
+					V T1i, T1e, T1B, T1z, T1h, T17;
+					T1i = VADD(T18, T1d);
+					T1e = VSUB(T18, T1d);
+					T1B = VADD(T1y, T1x);
+					T1z = VSUB(T1x, T1y);
+					T1h = VSUB(T16, T11);
+					T17 = VADD(T11, T16);
+					T1k = VSUB(Te, Tv);
+					Tw = VADD(Te, Tv);
+					{
+					     V T1A, T1j, T1C, T1f;
+					     T1A = VADD(T1h, T1i);
+					     T1j = VSUB(T1h, T1i);
+					     T1C = VSUB(T1e, T17);
+					     T1f = VADD(T17, T1e);
+					     T1w = VSUB(T1t, T1p);
+					     T1u = VADD(T1p, T1t);
+					     T1v = VSUB(TS, TJ);
+					     TT = VADD(TJ, TS);
+					     ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
+					     ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
+					}
+				   }
+				   ST(&(ri[WS(rs, 4)]), VSUB(Tw, TT), ms, &(ri[0]));
+				   ST(&(ri[0]), VADD(Tw, TT), ms, &(ri[0]));
+				   ST(&(ii[WS(rs, 6)]), VSUB(T1w, T1v), ms, &(ii[0]));
+				   ST(&(ii[WS(rs, 2)]), VADD(T1v, T1w), ms, &(ii[0]));
+			      }
+			 }
+		    }
+	       }
+	       T1n = VSUB(T1l, T1m);
+	       T1o = VADD(T1l, T1m);
+	       ST(&(ii[0]), VADD(T1o, T1u), ms, &(ii[0]));
+	       ST(&(ii[WS(rs, 4)]), VSUB(T1u, T1o), ms, &(ii[0]));
+	       ST(&(ri[WS(rs, 2)]), VADD(T1k, T1n), ms, &(ri[0]));
+	       ST(&(ri[WS(rs, 6)]), VSUB(T1k, T1n), ms, &(ri[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include ts.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 42 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "ts.h"
+
+static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
+	       {
+		    V T4, Tb, T7, Ta;
+		    T2 = LDW(&(W[0]));
+		    T5 = LDW(&(W[TWVL * 1]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T6 = LDW(&(W[TWVL * 3]));
+		    T4 = VMUL(T2, T3);
+		    Tb = VMUL(T5, T3);
+		    T7 = VMUL(T5, T6);
+		    Ta = VMUL(T2, T6);
+		    T8 = VSUB(T4, T7);
+		    Tc = VADD(Ta, Tb);
+		    Tg = VADD(T4, T7);
+		    Ti = VSUB(Ta, Tb);
+		    Tl = LDW(&(W[TWVL * 4]));
+		    Tm = LDW(&(W[TWVL * 5]));
+		    Tn = VFMA(T2, Tl, VMUL(T5, Tm));
+		    Tz = VFNMS(Ti, Tl, VMUL(Tg, Tm));
+		    Tp = VFNMS(T5, Tl, VMUL(T2, Tm));
+		    Tx = VFMA(Tg, Tl, VMUL(Ti, Tm));
+	       }
+	       {
+		    V Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
+		    V TT;
+		    {
+			 V T1, T1c, Te, T1b, T9, Td;
+			 T1 = LD(&(ri[0]), ms, &(ri[0]));
+			 T1c = LD(&(ii[0]), ms, &(ii[0]));
+			 T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
+			 Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
+			 Te = VFMA(T8, T9, VMUL(Tc, Td));
+			 T1b = VFNMS(Tc, T9, VMUL(T8, Td));
+			 Tf = VADD(T1, Te);
+			 T1i = VSUB(T1c, T1b);
+			 TL = VSUB(T1, Te);
+			 T1d = VADD(T1b, T1c);
+		    }
+		    {
+			 V TF, TW, TI, TX;
+			 {
+			      V TD, TE, TG, TH;
+			      TD = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
+			      TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
+			      TF = VFMA(Tl, TD, VMUL(Tm, TE));
+			      TW = VFNMS(Tm, TD, VMUL(Tl, TE));
+			      TG = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
+			      TH = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
+			      TI = VFMA(T3, TG, VMUL(T6, TH));
+			      TX = VFNMS(T6, TG, VMUL(T3, TH));
+			 }
+			 TJ = VADD(TF, TI);
+			 T17 = VADD(TW, TX);
+			 TV = VSUB(TF, TI);
+			 TY = VSUB(TW, TX);
+		    }
+		    {
+			 V Tk, TM, Tr, TN;
+			 {
+			      V Th, Tj, To, Tq;
+			      Th = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
+			      Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
+			      Tk = VFMA(Tg, Th, VMUL(Ti, Tj));
+			      TM = VFNMS(Ti, Th, VMUL(Tg, Tj));
+			      To = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
+			      Tq = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
+			      Tr = VFMA(Tn, To, VMUL(Tp, Tq));
+			      TN = VFNMS(Tp, To, VMUL(Tn, Tq));
+			 }
+			 Ts = VADD(Tk, Tr);
+			 T1j = VSUB(Tk, Tr);
+			 TO = VSUB(TM, TN);
+			 T1a = VADD(TM, TN);
+		    }
+		    {
+			 V Tw, TR, TB, TS;
+			 {
+			      V Tu, Tv, Ty, TA;
+			      Tu = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
+			      Tv = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
+			      Tw = VFMA(T2, Tu, VMUL(T5, Tv));
+			      TR = VFNMS(T5, Tu, VMUL(T2, Tv));
+			      Ty = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
+			      TA = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
+			      TB = VFMA(Tx, Ty, VMUL(Tz, TA));
+			      TS = VFNMS(Tz, Ty, VMUL(Tx, TA));
+			 }
+			 TC = VADD(Tw, TB);
+			 T16 = VADD(TR, TS);
+			 TQ = VSUB(Tw, TB);
+			 TT = VSUB(TR, TS);
+		    }
+		    {
+			 V Tt, TK, T1f, T1g;
+			 Tt = VADD(Tf, Ts);
+			 TK = VADD(TC, TJ);
+			 ST(&(ri[WS(rs, 4)]), VSUB(Tt, TK), ms, &(ri[0]));
+			 ST(&(ri[0]), VADD(Tt, TK), ms, &(ri[0]));
+			 {
+			      V T19, T1e, T15, T18;
+			      T19 = VADD(T16, T17);
+			      T1e = VADD(T1a, T1d);
+			      ST(&(ii[0]), VADD(T19, T1e), ms, &(ii[0]));
+			      ST(&(ii[WS(rs, 4)]), VSUB(T1e, T19), ms, &(ii[0]));
+			      T15 = VSUB(Tf, Ts);
+			      T18 = VSUB(T16, T17);
+			      ST(&(ri[WS(rs, 6)]), VSUB(T15, T18), ms, &(ri[0]));
+			      ST(&(ri[WS(rs, 2)]), VADD(T15, T18), ms, &(ri[0]));
+			 }
+			 T1f = VSUB(TJ, TC);
+			 T1g = VSUB(T1d, T1a);
+			 ST(&(ii[WS(rs, 2)]), VADD(T1f, T1g), ms, &(ii[0]));
+			 ST(&(ii[WS(rs, 6)]), VSUB(T1g, T1f), ms, &(ii[0]));
+			 {
+			      V T11, T1k, T14, T1h, T12, T13;
+			      T11 = VSUB(TL, TO);
+			      T1k = VSUB(T1i, T1j);
+			      T12 = VSUB(TT, TQ);
+			      T13 = VADD(TV, TY);
+			      T14 = VMUL(LDK(KP707106781), VSUB(T12, T13));
+			      T1h = VMUL(LDK(KP707106781), VADD(T12, T13));
+			      ST(&(ri[WS(rs, 7)]), VSUB(T11, T14), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 5)]), VSUB(T1k, T1h), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 3)]), VADD(T11, T14), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 1)]), VADD(T1h, T1k), ms, &(ii[WS(rs, 1)]));
+			 }
+			 {
+			      V TP, T1m, T10, T1l, TU, TZ;
+			      TP = VADD(TL, TO);
+			      T1m = VADD(T1j, T1i);
+			      TU = VADD(TQ, TT);
+			      TZ = VSUB(TV, TY);
+			      T10 = VMUL(LDK(KP707106781), VADD(TU, TZ));
+			      T1l = VMUL(LDK(KP707106781), VSUB(TZ, TU));
+			      ST(&(ri[WS(rs, 5)]), VSUB(TP, T10), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 7)]), VSUB(T1m, T1l), ms, &(ii[WS(rs, 1)]));
+			      ST(&(ri[WS(rs, 1)]), VADD(TP, T10), ms, &(ri[WS(rs, 1)]));
+			      ST(&(ii[WS(rs, 3)]), VADD(T1l, T1m), ms, &(ii[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, (2 * VL), 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t2sv_8) (planner *p) {
+     X(kdft_dit_register) (p, t2sv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 57 FP additions, 52 FP multiplications,
+ * (or, 39 additions, 34 multiplications, 18 fused multiply/add),
+ * 57 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V T1, T7, Th, Tx, Tr, Td, Tp, T6, Tv, Tc, Te, Ti, Tl, T2, T3;
+	       V T5;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T5 = LDW(&(W[TWVL * 4]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V To, Tw, Tq, Tu, Ta, T4, Tt, Tk, Tb;
+		    To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tw = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Tq = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tu = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Ta = VZMULJ(T2, T3);
+		    T4 = VZMUL(T2, T3);
+		    Th = VZMULJ(T2, T5);
+		    Tt = VZMULJ(T3, T5);
+		    Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tx = VZMUL(T2, Tw);
+		    Tr = VZMUL(T5, Tq);
+		    Tk = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = VZMULJ(Ta, T5);
+		    Tp = VZMUL(T4, To);
+		    T6 = VZMULJ(T4, T5);
+		    Tv = VZMUL(Tt, Tu);
+		    Tc = VZMUL(Ta, Tb);
+		    Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tl = VZMUL(T3, Tk);
+	       }
+	       {
+		    V TN, Ts, T8, Ty, TO, Tf, Tj;
+		    TN = VADD(Tp, Tr);
+		    Ts = VSUB(Tp, Tr);
+		    T8 = VZMUL(T6, T7);
+		    Ty = VSUB(Tv, Tx);
+		    TO = VADD(Tv, Tx);
+		    Tf = VZMUL(Td, Te);
+		    Tj = VZMUL(Th, Ti);
+		    {
+			 V T9, TJ, TP, TU, Tz, TF, Tg, TK, Tm, TL;
+			 T9 = VSUB(T1, T8);
+			 TJ = VADD(T1, T8);
+			 TP = VADD(TN, TO);
+			 TU = VSUB(TN, TO);
+			 Tz = VADD(Ts, Ty);
+			 TF = VSUB(Ts, Ty);
+			 Tg = VSUB(Tc, Tf);
+			 TK = VADD(Tc, Tf);
+			 Tm = VSUB(Tj, Tl);
+			 TL = VADD(Tj, Tl);
+			 {
+			      V TM, TV, Tn, TE;
+			      TM = VADD(TK, TL);
+			      TV = VSUB(TK, TL);
+			      Tn = VADD(Tg, Tm);
+			      TE = VSUB(Tg, Tm);
+			      {
+				   V TW, TY, TS, TQ, TG, TI, TC, TA, TR, TB;
+				   TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TV, TU));
+				   TY = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TU, TV));
+				   TS = VSUB(TM, TP);
+				   TQ = VADD(TM, TP);
+				   TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TF, TE));
+				   TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TF));
+				   TC = VSUB(Tn, Tz);
+				   TA = VADD(Tn, Tz);
+				   ST(&(x[0]), VADD(TJ, TQ), ms, &(x[0]));
+				   TR = VFNMS(LDK(KP250000000), TQ, TJ);
+				   ST(&(x[WS(rs, 5)]), VADD(T9, TA), ms, &(x[WS(rs, 1)]));
+				   TB = VFNMS(LDK(KP250000000), TA, T9);
+				   {
+					V TX, TT, TH, TD;
+					TX = VFMA(LDK(KP559016994), TS, TR);
+					TT = VFNMS(LDK(KP559016994), TS, TR);
+					TH = VFNMS(LDK(KP559016994), TC, TB);
+					TD = VFMA(LDK(KP559016994), TC, TB);
+					ST(&(x[WS(rs, 8)]), VFMAI(TW, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFNMSI(TW, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFMAI(TY, TX), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFNMSI(TY, TX), ms, &(x[0]));
+					ST(&(x[WS(rs, 9)]), VFNMSI(TG, TD), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 1)]), VFMAI(TG, TD), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(TI, TH), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 3)]), VFMAI(TI, TH), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {39, 34, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 57 FP additions, 42 FP multiplications,
+ * (or, 51 additions, 36 multiplications, 6 fused multiply/add),
+ * 41 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V T1, T2, T3, Ti, T6, T7, TA, Tb, To;
+	       T1 = LDW(&(W[0]));
+	       T2 = LDW(&(W[TWVL * 2]));
+	       T3 = VZMULJ(T1, T2);
+	       Ti = VZMUL(T1, T2);
+	       T6 = LDW(&(W[TWVL * 4]));
+	       T7 = VZMULJ(T3, T6);
+	       TA = VZMULJ(Ti, T6);
+	       Tb = VZMULJ(T1, T6);
+	       To = VZMULJ(T2, T6);
+	       {
+		    V TD, TQ, Tn, Tt, Tx, TM, TN, TS, Ta, Tg, Tw, TJ, TK, TR, Tz;
+		    V TC, TB;
+		    Tz = LD(&(x[0]), ms, &(x[0]));
+		    TB = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    TC = VZMUL(TA, TB);
+		    TD = VSUB(Tz, TC);
+		    TQ = VADD(Tz, TC);
+		    {
+			 V Tk, Ts, Tm, Tq;
+			 {
+			      V Tj, Tr, Tl, Tp;
+			      Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Tk = VZMUL(Ti, Tj);
+			      Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      Ts = VZMUL(T1, Tr);
+			      Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      Tm = VZMUL(T6, Tl);
+			      Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tq = VZMUL(To, Tp);
+			 }
+			 Tn = VSUB(Tk, Tm);
+			 Tt = VSUB(Tq, Ts);
+			 Tx = VADD(Tn, Tt);
+			 TM = VADD(Tk, Tm);
+			 TN = VADD(Tq, Ts);
+			 TS = VADD(TM, TN);
+		    }
+		    {
+			 V T5, Tf, T9, Td;
+			 {
+			      V T4, Te, T8, Tc;
+			      T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T5 = VZMUL(T3, T4);
+			      Te = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Tf = VZMUL(T2, Te);
+			      T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T9 = VZMUL(T7, T8);
+			      Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Td = VZMUL(Tb, Tc);
+			 }
+			 Ta = VSUB(T5, T9);
+			 Tg = VSUB(Td, Tf);
+			 Tw = VADD(Ta, Tg);
+			 TJ = VADD(T5, T9);
+			 TK = VADD(Td, Tf);
+			 TR = VADD(TJ, TK);
+		    }
+		    {
+			 V Ty, TE, TF, Tv, TI, Th, Tu, TH, TG;
+			 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
+			 TE = VADD(Tw, Tx);
+			 TF = VFNMS(LDK(KP250000000), TE, TD);
+			 Th = VSUB(Ta, Tg);
+			 Tu = VSUB(Tn, Tt);
+			 Tv = VBYI(VFMA(LDK(KP951056516), Th, VMUL(LDK(KP587785252), Tu)));
+			 TI = VBYI(VFNMS(LDK(KP951056516), Tu, VMUL(LDK(KP587785252), Th)));
+			 ST(&(x[WS(rs, 5)]), VADD(TD, TE), ms, &(x[WS(rs, 1)]));
+			 TH = VSUB(TF, Ty);
+			 ST(&(x[WS(rs, 3)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(TI, TH), ms, &(x[WS(rs, 1)]));
+			 TG = VADD(Ty, TF);
+			 ST(&(x[WS(rs, 1)]), VADD(Tv, TG), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VSUB(TG, Tv), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V TV, TT, TU, TP, TY, TL, TO, TX, TW;
+			 TV = VMUL(LDK(KP559016994), VSUB(TR, TS));
+			 TT = VADD(TR, TS);
+			 TU = VFNMS(LDK(KP250000000), TT, TQ);
+			 TL = VSUB(TJ, TK);
+			 TO = VSUB(TM, TN);
+			 TP = VBYI(VFNMS(LDK(KP951056516), TO, VMUL(LDK(KP587785252), TL)));
+			 TY = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TO)));
+			 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
+			 TX = VADD(TV, TU);
+			 ST(&(x[WS(rs, 4)]), VSUB(TX, TY), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(TY, TX), ms, &(x[0]));
+			 TW = VSUB(TU, TV);
+			 ST(&(x[WS(rs, 2)]), VADD(TP, TW), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VSUB(TW, TP), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {51, 36, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_10) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:18 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 98 FP additions, 86 FP multiplications,
+ * (or, 64 additions, 52 multiplications, 34 fused multiply/add),
+ * 70 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T13, Tg, TY, T14, T1A, T1q, T1f, T1x, T1r, T1i, Tt, T16, TB, T1j, T1k;
+	       V TH;
+	       {
+		    V T2, T8, Tu, T3;
+		    T2 = LDW(&(W[0]));
+		    T8 = LDW(&(W[TWVL * 2]));
+		    Tu = LDW(&(W[TWVL * 6]));
+		    T3 = LDW(&(W[TWVL * 4]));
+		    {
+			 V Ty, T1o, Tf, T1b, T7, Tr, TQ, TX, T1g, Tl, To, Tw, TG, Tz, T1p;
+			 V T1e, TC;
+			 {
+			      V T1, T5, Ta, Td;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Td = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      {
+				   V TR, TN, TM, TE, Tb, Tp, Tm, Te, T6, TW, TO, TS;
+				   {
+					V TL, Tx, T9, TU, Tc, T4, TV;
+					TL = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					Tx = VZMULJ(T2, T8);
+					T9 = VZMUL(T2, T8);
+					TR = VZMULJ(T2, Tu);
+					TU = VZMULJ(T8, T3);
+					Tc = VZMUL(T8, T3);
+					T4 = VZMULJ(T2, T3);
+					TN = VZMUL(T2, T3);
+					TV = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+					TM = VZMUL(Tx, TL);
+					Ty = VZMULJ(Tx, T3);
+					TE = VZMUL(Tx, T3);
+					Tb = VZMUL(T9, Ta);
+					Tp = VZMUL(T9, T3);
+					Tm = VZMULJ(T9, T3);
+					Te = VZMUL(Tc, Td);
+					T6 = VZMUL(T4, T5);
+					TW = VZMUL(TU, TV);
+				   }
+				   TO = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+				   TS = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   {
+					V TP, TT, Ti, Tk, Tn, Th, Tq, Tj;
+					Th = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					Tq = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					Tj = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					T1o = VSUB(Tb, Te);
+					Tf = VADD(Tb, Te);
+					T1b = VSUB(T1, T6);
+					T7 = VADD(T1, T6);
+					TP = VZMUL(TN, TO);
+					TT = VZMUL(TR, TS);
+					Ti = VZMUL(T2, Th);
+					Tr = VZMUL(Tp, Tq);
+					Tk = VZMUL(T3, Tj);
+					Tn = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T1c, T1d, Tv, TF;
+					     Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					     T1c = VSUB(TM, TP);
+					     TQ = VADD(TM, TP);
+					     T1d = VSUB(TT, TW);
+					     TX = VADD(TT, TW);
+					     T1g = VSUB(Ti, Tk);
+					     Tl = VADD(Ti, Tk);
+					     To = VZMUL(Tm, Tn);
+					     Tw = VZMUL(Tu, Tv);
+					     TG = VZMUL(TE, TF);
+					     Tz = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T1p = VSUB(T1c, T1d);
+					     T1e = VADD(T1c, T1d);
+					     TC = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T1h, Ts, TA, TD;
+			      T13 = VADD(T7, Tf);
+			      Tg = VSUB(T7, Tf);
+			      T1h = VSUB(To, Tr);
+			      Ts = VADD(To, Tr);
+			      TY = VSUB(TQ, TX);
+			      T14 = VADD(TQ, TX);
+			      TA = VZMUL(Ty, Tz);
+			      T1A = VFNMS(LDK(KP707106781), T1p, T1o);
+			      T1q = VFMA(LDK(KP707106781), T1p, T1o);
+			      T1f = VFMA(LDK(KP707106781), T1e, T1b);
+			      T1x = VFNMS(LDK(KP707106781), T1e, T1b);
+			      TD = VZMUL(T8, TC);
+			      T1r = VFMA(LDK(KP414213562), T1g, T1h);
+			      T1i = VFNMS(LDK(KP414213562), T1h, T1g);
+			      Tt = VSUB(Tl, Ts);
+			      T16 = VADD(Tl, Ts);
+			      TB = VADD(Tw, TA);
+			      T1j = VSUB(Tw, TA);
+			      T1k = VSUB(TG, TD);
+			      TH = VADD(TD, TG);
+			 }
+		    }
+	       }
+	       {
+		    V T15, T19, T1l, T1s, TI, T17;
+		    T15 = VSUB(T13, T14);
+		    T19 = VADD(T13, T14);
+		    T1l = VFNMS(LDK(KP414213562), T1k, T1j);
+		    T1s = VFMA(LDK(KP414213562), T1j, T1k);
+		    TI = VSUB(TB, TH);
+		    T17 = VADD(TB, TH);
+		    {
+			 V T1y, T1t, T1B, T1m;
+			 T1y = VADD(T1r, T1s);
+			 T1t = VSUB(T1r, T1s);
+			 T1B = VSUB(T1i, T1l);
+			 T1m = VADD(T1i, T1l);
+			 {
+			      V T18, T1a, TJ, TZ;
+			      T18 = VSUB(T16, T17);
+			      T1a = VADD(T16, T17);
+			      TJ = VADD(Tt, TI);
+			      TZ = VSUB(Tt, TI);
+			      {
+				   V T1u, T1w, T1z, T1D;
+				   T1u = VFNMS(LDK(KP923879532), T1t, T1q);
+				   T1w = VFMA(LDK(KP923879532), T1t, T1q);
+				   T1z = VFNMS(LDK(KP923879532), T1y, T1x);
+				   T1D = VFMA(LDK(KP923879532), T1y, T1x);
+				   {
+					V T1n, T1v, T1C, T1E;
+					T1n = VFNMS(LDK(KP923879532), T1m, T1f);
+					T1v = VFMA(LDK(KP923879532), T1m, T1f);
+					T1C = VFMA(LDK(KP923879532), T1B, T1A);
+					T1E = VFNMS(LDK(KP923879532), T1B, T1A);
+					ST(&(x[WS(rs, 8)]), VSUB(T19, T1a), ms, &(x[0]));
+					ST(&(x[0]), VADD(T19, T1a), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(T18, T15), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(T18, T15), ms, &(x[0]));
+					{
+					     V T10, T12, TK, T11;
+					     T10 = VFNMS(LDK(KP707106781), TZ, TY);
+					     T12 = VFMA(LDK(KP707106781), TZ, TY);
+					     TK = VFNMS(LDK(KP707106781), TJ, Tg);
+					     T11 = VFMA(LDK(KP707106781), TJ, Tg);
+					     ST(&(x[WS(rs, 15)]), VFNMSI(T1w, T1v), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 1)]), VFMAI(T1w, T1v), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFMAI(T1u, T1n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFNMSI(T1u, T1n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFNMSI(T1E, T1D), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFMAI(T1E, T1D), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFNMSI(T1C, T1z), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1z), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(T12, T11), ms, &(x[0]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(T12, T11), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(T10, TK), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(T10, TK), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {64, 52, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 98 FP additions, 64 FP multiplications,
+ * (or, 94 additions, 60 multiplications, 4 fused multiply/add),
+ * 51 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T1, T8, T9, Tl, Ti, TE, T4, Ta, TO, TV, Td, Tm, TA, TH, Ts;
+	       T1 = LDW(&(W[0]));
+	       T8 = LDW(&(W[TWVL * 2]));
+	       T9 = VZMUL(T1, T8);
+	       Tl = VZMULJ(T1, T8);
+	       Ti = LDW(&(W[TWVL * 6]));
+	       TE = VZMULJ(T1, Ti);
+	       T4 = LDW(&(W[TWVL * 4]));
+	       Ta = VZMULJ(T9, T4);
+	       TO = VZMUL(T8, T4);
+	       TV = VZMULJ(T1, T4);
+	       Td = VZMUL(T9, T4);
+	       Tm = VZMULJ(Tl, T4);
+	       TA = VZMUL(T1, T4);
+	       TH = VZMULJ(T8, T4);
+	       Ts = VZMUL(Tl, T4);
+	       {
+		    V TY, T1q, TR, T1r, T1m, T1n, TL, TZ, T1f, T1g, T1h, Th, T11, T1i, T1j;
+		    V T1k, Tw, T12, TU, TX, TW;
+		    TU = LD(&(x[0]), ms, &(x[0]));
+		    TW = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    TX = VZMUL(TV, TW);
+		    TY = VSUB(TU, TX);
+		    T1q = VADD(TU, TX);
+		    {
+			 V TN, TQ, TM, TP;
+			 TM = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 TN = VZMUL(T9, TM);
+			 TP = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 TQ = VZMUL(TO, TP);
+			 TR = VSUB(TN, TQ);
+			 T1r = VADD(TN, TQ);
+		    }
+		    {
+			 V Tz, TJ, TC, TG, TD, TK;
+			 {
+			      V Ty, TI, TB, TF;
+			      Ty = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Tz = VZMUL(Tl, Ty);
+			      TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TJ = VZMUL(TH, TI);
+			      TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TC = VZMUL(TA, TB);
+			      TF = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TG = VZMUL(TE, TF);
+			 }
+			 T1m = VADD(Tz, TC);
+			 T1n = VADD(TG, TJ);
+			 TD = VSUB(Tz, TC);
+			 TK = VSUB(TG, TJ);
+			 TL = VMUL(LDK(KP707106781), VSUB(TD, TK));
+			 TZ = VMUL(LDK(KP707106781), VADD(TD, TK));
+		    }
+		    {
+			 V T3, Tf, T6, Tc, T7, Tg;
+			 {
+			      V T2, Te, T5, Tb;
+			      T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T3 = VZMUL(T1, T2);
+			      Te = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      Tf = VZMUL(Td, Te);
+			      T5 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      T6 = VZMUL(T4, T5);
+			      Tb = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      Tc = VZMUL(Ta, Tb);
+			 }
+			 T1f = VADD(T3, T6);
+			 T1g = VADD(Tc, Tf);
+			 T1h = VSUB(T1f, T1g);
+			 T7 = VSUB(T3, T6);
+			 Tg = VSUB(Tc, Tf);
+			 Th = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), T7));
+			 T11 = VFMA(LDK(KP382683432), T7, VMUL(LDK(KP923879532), Tg));
+		    }
+		    {
+			 V Tk, Tu, To, Tr, Tp, Tv;
+			 {
+			      V Tj, Tt, Tn, Tq;
+			      Tj = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      Tk = VZMUL(Ti, Tj);
+			      Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      Tu = VZMUL(Ts, Tt);
+			      Tn = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      To = VZMUL(Tm, Tn);
+			      Tq = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Tr = VZMUL(T8, Tq);
+			 }
+			 T1i = VADD(Tk, To);
+			 T1j = VADD(Tr, Tu);
+			 T1k = VSUB(T1i, T1j);
+			 Tp = VSUB(Tk, To);
+			 Tv = VSUB(Tr, Tu);
+			 Tw = VFMA(LDK(KP923879532), Tp, VMUL(LDK(KP382683432), Tv));
+			 T12 = VFNMS(LDK(KP382683432), Tp, VMUL(LDK(KP923879532), Tv));
+		    }
+		    {
+			 V T1p, T1v, T1u, T1w;
+			 {
+			      V T1l, T1o, T1s, T1t;
+			      T1l = VMUL(LDK(KP707106781), VSUB(T1h, T1k));
+			      T1o = VSUB(T1m, T1n);
+			      T1p = VBYI(VSUB(T1l, T1o));
+			      T1v = VBYI(VADD(T1o, T1l));
+			      T1s = VSUB(T1q, T1r);
+			      T1t = VMUL(LDK(KP707106781), VADD(T1h, T1k));
+			      T1u = VSUB(T1s, T1t);
+			      T1w = VADD(T1s, T1t);
+			 }
+			 ST(&(x[WS(rs, 6)]), VADD(T1p, T1u), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T1w, T1v), ms, &(x[0]));
+			 ST(&(x[WS(rs, 10)]), VSUB(T1u, T1p), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T1v, T1w), ms, &(x[0]));
+		    }
+		    {
+			 V T1z, T1D, T1C, T1E;
+			 {
+			      V T1x, T1y, T1A, T1B;
+			      T1x = VADD(T1q, T1r);
+			      T1y = VADD(T1m, T1n);
+			      T1z = VSUB(T1x, T1y);
+			      T1D = VADD(T1x, T1y);
+			      T1A = VADD(T1f, T1g);
+			      T1B = VADD(T1i, T1j);
+			      T1C = VBYI(VSUB(T1A, T1B));
+			      T1E = VADD(T1A, T1B);
+			 }
+			 ST(&(x[WS(rs, 12)]), VSUB(T1z, T1C), ms, &(x[0]));
+			 ST(&(x[0]), VADD(T1D, T1E), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T1z, T1C), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VSUB(T1D, T1E), ms, &(x[0]));
+		    }
+		    {
+			 V TT, T15, T14, T16;
+			 {
+			      V Tx, TS, T10, T13;
+			      Tx = VSUB(Th, Tw);
+			      TS = VSUB(TL, TR);
+			      TT = VBYI(VSUB(Tx, TS));
+			      T15 = VBYI(VADD(TS, Tx));
+			      T10 = VSUB(TY, TZ);
+			      T13 = VSUB(T11, T12);
+			      T14 = VSUB(T10, T13);
+			      T16 = VADD(T10, T13);
+			 }
+			 ST(&(x[WS(rs, 5)]), VADD(TT, T14), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VSUB(T16, T15), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VSUB(T14, TT), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T15, T16), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T19, T1d, T1c, T1e;
+			 {
+			      V T17, T18, T1a, T1b;
+			      T17 = VADD(TY, TZ);
+			      T18 = VADD(Th, Tw);
+			      T19 = VADD(T17, T18);
+			      T1d = VSUB(T17, T18);
+			      T1a = VADD(TR, TL);
+			      T1b = VADD(T11, T12);
+			      T1c = VBYI(VADD(T1a, T1b));
+			      T1e = VBYI(VSUB(T1b, T1a));
+			 }
+			 ST(&(x[WS(rs, 15)]), VSUB(T19, T1c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(T1d, T1e), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T19, T1c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {94, 60, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_16) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 138 FP additions, 118 FP multiplications,
+ * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
+ * 90 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T19, T1u, T1p, T1x, T1m, T1w, T1t, TI;
+	       {
+		    V T2, T8, T3, Td;
+		    T2 = LDW(&(W[0]));
+		    T8 = LDW(&(W[TWVL * 2]));
+		    T3 = LDW(&(W[TWVL * 4]));
+		    Td = LDW(&(W[TWVL * 6]));
+		    {
+			 V T7, T1g, T1F, T23, T1n, Tp, T18, T27, T1P, T1I, TU, T1L, T28, T1S, T1o;
+			 V TE, T1l, T1j, T26, T2e;
+			 {
+			      V T1, T1e, T5, T1b;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T1e = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T1b = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V TA, Tx, TQ, T1O, T10, Th, T1G, T1R, T17, T1J, To, Ts, TR, Tv, TK;
+				   V TM, TP, Ty, TB;
+				   {
+					V Tq, Tt, T13, T16, Tk, Tn;
+					{
+					     V Tl, Ti, T11, T14, TV, Tc, T6, Tb, Tf, TW, TY, T1f;
+					     {
+						  V T1d, Ta, T9, T4;
+						  Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+						  TA = VZMULJ(T2, T8);
+						  T9 = VZMUL(T2, T8);
+						  Tx = VZMUL(T8, T3);
+						  Tl = VZMULJ(T8, T3);
+						  T4 = VZMUL(T2, T3);
+						  Tq = VZMULJ(T2, T3);
+						  Tt = VZMULJ(T2, Td);
+						  Ti = VZMULJ(T8, Td);
+						  T11 = VZMULJ(TA, Td);
+						  T14 = VZMULJ(TA, T3);
+						  TQ = VZMUL(TA, T3);
+						  T1d = VZMULJ(T9, Td);
+						  TV = VZMUL(T9, T3);
+						  Tc = VZMULJ(T9, T3);
+						  T6 = VZMUL(T4, T5);
+						  Tb = VZMUL(T9, Ta);
+						  Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+						  TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						  T1f = VZMUL(T1d, T1e);
+					     }
+					     {
+						  V T1D, TX, TZ, T15, T1E, Tg, T12, T1c, Te, Tj, Tm;
+						  T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  T1c = VZMUL(Tc, T1b);
+						  Te = VZMULJ(Tc, Td);
+						  T7 = VSUB(T1, T6);
+						  T1D = VADD(T1, T6);
+						  TX = VZMUL(TV, TW);
+						  TZ = VZMUL(T8, TY);
+						  T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T13 = VZMUL(T11, T12);
+						  T1g = VSUB(T1c, T1f);
+						  T1E = VADD(T1c, T1f);
+						  Tg = VZMUL(Te, Tf);
+						  Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+						  Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+						  T1O = VADD(TX, TZ);
+						  T10 = VSUB(TX, TZ);
+						  T16 = VZMUL(T14, T15);
+						  T1F = VSUB(T1D, T1E);
+						  T23 = VADD(T1D, T1E);
+						  Th = VSUB(Tb, Tg);
+						  T1G = VADD(Tb, Tg);
+						  Tk = VZMUL(Ti, Tj);
+						  Tn = VZMUL(Tl, Tm);
+					     }
+					}
+					{
+					     V Tr, Tu, TJ, TL, TO;
+					     Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+					     T1R = VADD(T13, T16);
+					     T17 = VSUB(T13, T16);
+					     Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					     TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1J = VADD(Tk, Tn);
+					     To = VSUB(Tk, Tn);
+					     Ts = VZMUL(Tq, Tr);
+					     TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					     Tv = VZMUL(Tt, Tu);
+					     TK = VZMUL(T3, TJ);
+					     TM = VZMUL(Td, TL);
+					     TP = VZMUL(T2, TO);
+					     Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					     TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					}
+				   }
+				   {
+					V T1N, Tw, T1H, TN, Tz, TC, T1i, TT, T1K, TS;
+					T1n = VSUB(Th, To);
+					Tp = VADD(Th, To);
+					TS = VZMUL(TQ, TR);
+					T1N = VADD(Ts, Tv);
+					Tw = VSUB(Ts, Tv);
+					T1H = VADD(TK, TM);
+					TN = VSUB(TK, TM);
+					Tz = VZMUL(Tx, Ty);
+					TC = VZMUL(TA, TB);
+					T18 = VSUB(T10, T17);
+					T1i = VADD(T10, T17);
+					TT = VSUB(TP, TS);
+					T1K = VADD(TP, TS);
+					T27 = VADD(T1N, T1O);
+					T1P = VSUB(T1N, T1O);
+					{
+					     V TD, T1Q, T24, T1h, T25;
+					     TD = VSUB(Tz, TC);
+					     T1Q = VADD(Tz, TC);
+					     T1I = VSUB(T1G, T1H);
+					     T24 = VADD(T1G, T1H);
+					     T1h = VADD(TN, TT);
+					     TU = VSUB(TN, TT);
+					     T25 = VADD(T1J, T1K);
+					     T1L = VSUB(T1J, T1K);
+					     T28 = VADD(T1Q, T1R);
+					     T1S = VSUB(T1Q, T1R);
+					     T1o = VSUB(Tw, TD);
+					     TE = VADD(Tw, TD);
+					     T1l = VSUB(T1h, T1i);
+					     T1j = VADD(T1h, T1i);
+					     T26 = VADD(T24, T25);
+					     T2e = VSUB(T24, T25);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T1M, T1Z, T1Y, T1T, T29, T2f, TH, TF, T1k, T1C;
+			      T1M = VADD(T1I, T1L);
+			      T1Z = VSUB(T1I, T1L);
+			      T1Y = VSUB(T1P, T1S);
+			      T1T = VADD(T1P, T1S);
+			      T29 = VADD(T27, T28);
+			      T2f = VSUB(T27, T28);
+			      TH = VSUB(Tp, TE);
+			      TF = VADD(Tp, TE);
+			      T1k = VFNMS(LDK(KP250000000), T1j, T1g);
+			      T1C = VADD(T1g, T1j);
+			      {
+				   V T1W, T2c, TG, T2i, T2g, T22, T20, T1V, T2b, T1U, T2a, T1B;
+				   T19 = VFMA(LDK(KP618033988), T18, TU);
+				   T1u = VFNMS(LDK(KP618033988), TU, T18);
+				   T1W = VSUB(T1M, T1T);
+				   T1U = VADD(T1M, T1T);
+				   T2c = VSUB(T26, T29);
+				   T2a = VADD(T26, T29);
+				   TG = VFNMS(LDK(KP250000000), TF, T7);
+				   T1B = VADD(T7, TF);
+				   T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
+				   T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
+				   T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
+				   T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
+				   ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
+				   T1V = VFNMS(LDK(KP250000000), T1U, T1F);
+				   ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
+				   T2b = VFNMS(LDK(KP250000000), T2a, T23);
+				   ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 15)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
+				   T1p = VFMA(LDK(KP618033988), T1o, T1n);
+				   T1x = VFNMS(LDK(KP618033988), T1n, T1o);
+				   {
+					V T21, T1X, T2h, T2d;
+					T21 = VFMA(LDK(KP559016994), T1W, T1V);
+					T1X = VFNMS(LDK(KP559016994), T1W, T1V);
+					T2h = VFNMS(LDK(KP559016994), T2c, T2b);
+					T2d = VFMA(LDK(KP559016994), T2c, T2b);
+					ST(&(x[WS(rs, 18)]), VFMAI(T20, T1X), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFNMSI(T20, T1X), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFNMSI(T22, T21), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFMAI(T22, T21), ms, &(x[0]));
+					ST(&(x[WS(rs, 16)]), VFMAI(T2g, T2d), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFNMSI(T2g, T2d), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFNMSI(T2i, T2h), ms, &(x[0]));
+					ST(&(x[WS(rs, 8)]), VFMAI(T2i, T2h), ms, &(x[0]));
+					T1m = VFMA(LDK(KP559016994), T1l, T1k);
+					T1w = VFNMS(LDK(KP559016994), T1l, T1k);
+					T1t = VFNMS(LDK(KP559016994), TH, TG);
+					TI = VFMA(LDK(KP559016994), TH, TG);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1A, T1y, T1q, T1s, T1a, T1r, T1z, T1v;
+		    T1A = VFMA(LDK(KP951056516), T1x, T1w);
+		    T1y = VFNMS(LDK(KP951056516), T1x, T1w);
+		    T1q = VFMA(LDK(KP951056516), T1p, T1m);
+		    T1s = VFNMS(LDK(KP951056516), T1p, T1m);
+		    T1a = VFNMS(LDK(KP951056516), T19, TI);
+		    T1r = VFMA(LDK(KP951056516), T19, TI);
+		    T1z = VFNMS(LDK(KP951056516), T1u, T1t);
+		    T1v = VFMA(LDK(KP951056516), T1u, T1t);
+		    ST(&(x[WS(rs, 9)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T1s, T1r), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T1q, T1a), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T1q, T1a), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 17)]), VFMAI(T1y, T1v), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T1y, T1v), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VFMAI(T1A, T1z), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VFNMSI(T1A, T1z), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 138 FP additions, 92 FP multiplications,
+ * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
+ * 73 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T2, T8, T9, TA, T3, Tc, T4, TV, T14, Tl, Tq, Tx, TQ, Td, Te;
+	       V T1g, Ti, Tt, T11;
+	       T2 = LDW(&(W[0]));
+	       T8 = LDW(&(W[TWVL * 2]));
+	       T9 = VZMUL(T2, T8);
+	       TA = VZMULJ(T2, T8);
+	       T3 = LDW(&(W[TWVL * 4]));
+	       Tc = VZMULJ(T9, T3);
+	       T4 = VZMUL(T2, T3);
+	       TV = VZMUL(T9, T3);
+	       T14 = VZMULJ(TA, T3);
+	       Tl = VZMULJ(T8, T3);
+	       Tq = VZMULJ(T2, T3);
+	       Tx = VZMUL(T8, T3);
+	       TQ = VZMUL(TA, T3);
+	       Td = LDW(&(W[TWVL * 6]));
+	       Te = VZMULJ(Tc, Td);
+	       T1g = VZMULJ(T9, Td);
+	       Ti = VZMULJ(T8, Td);
+	       Tt = VZMULJ(T2, Td);
+	       T11 = VZMULJ(TA, Td);
+	       {
+		    V T7, T1j, T1U, T2a, TU, T1n, T1o, T18, Tp, TE, TF, T26, T27, T28, T1M;
+		    V T1P, T1W, T1b, T1c, T1k, T23, T24, T25, T1F, T1I, T1V, T1B, T1C;
+		    {
+			 V T1, T1i, T6, T1f, T1h, T5, T1e, T1S, T1T;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T1h = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 T1i = VZMUL(T1g, T1h);
+			 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T6 = VZMUL(T4, T5);
+			 T1e = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T1f = VZMUL(Tc, T1e);
+			 T7 = VSUB(T1, T6);
+			 T1j = VSUB(T1f, T1i);
+			 T1S = VADD(T1, T6);
+			 T1T = VADD(T1f, T1i);
+			 T1U = VSUB(T1S, T1T);
+			 T2a = VADD(T1S, T1T);
+		    }
+		    {
+			 V Th, T1D, T10, T1L, T17, T1O, To, T1G, Tw, T1K, TN, T1E, TT, T1H, TD;
+			 V T1N;
+			 {
+			      V Tb, Tg, Ta, Tf;
+			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Tb = VZMUL(T9, Ta);
+			      Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      Tg = VZMUL(Te, Tf);
+			      Th = VSUB(Tb, Tg);
+			      T1D = VADD(Tb, Tg);
+			 }
+			 {
+			      V TX, TZ, TW, TY;
+			      TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      TX = VZMUL(TV, TW);
+			      TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      TZ = VZMUL(T8, TY);
+			      T10 = VSUB(TX, TZ);
+			      T1L = VADD(TX, TZ);
+			 }
+			 {
+			      V T13, T16, T12, T15;
+			      T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T13 = VZMUL(T11, T12);
+			      T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T16 = VZMUL(T14, T15);
+			      T17 = VSUB(T13, T16);
+			      T1O = VADD(T13, T16);
+			 }
+			 {
+			      V Tk, Tn, Tj, Tm;
+			      Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      Tk = VZMUL(Ti, Tj);
+			      Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tn = VZMUL(Tl, Tm);
+			      To = VSUB(Tk, Tn);
+			      T1G = VADD(Tk, Tn);
+			 }
+			 {
+			      V Ts, Tv, Tr, Tu;
+			      Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Ts = VZMUL(Tq, Tr);
+			      Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tv = VZMUL(Tt, Tu);
+			      Tw = VSUB(Ts, Tv);
+			      T1K = VADD(Ts, Tv);
+			 }
+			 {
+			      V TK, TM, TJ, TL;
+			      TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      TK = VZMUL(T3, TJ);
+			      TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      TM = VZMUL(Td, TL);
+			      TN = VSUB(TK, TM);
+			      T1E = VADD(TK, TM);
+			 }
+			 {
+			      V TP, TS, TO, TR;
+			      TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      TP = VZMUL(T2, TO);
+			      TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      TS = VZMUL(TQ, TR);
+			      TT = VSUB(TP, TS);
+			      T1H = VADD(TP, TS);
+			 }
+			 {
+			      V Tz, TC, Ty, TB;
+			      Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      Tz = VZMUL(Tx, Ty);
+			      TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      TC = VZMUL(TA, TB);
+			      TD = VSUB(Tz, TC);
+			      T1N = VADD(Tz, TC);
+			 }
+			 TU = VSUB(TN, TT);
+			 T1n = VSUB(Th, To);
+			 T1o = VSUB(Tw, TD);
+			 T18 = VSUB(T10, T17);
+			 Tp = VADD(Th, To);
+			 TE = VADD(Tw, TD);
+			 TF = VADD(Tp, TE);
+			 T26 = VADD(T1K, T1L);
+			 T27 = VADD(T1N, T1O);
+			 T28 = VADD(T26, T27);
+			 T1M = VSUB(T1K, T1L);
+			 T1P = VSUB(T1N, T1O);
+			 T1W = VADD(T1M, T1P);
+			 T1b = VADD(TN, TT);
+			 T1c = VADD(T10, T17);
+			 T1k = VADD(T1b, T1c);
+			 T23 = VADD(T1D, T1E);
+			 T24 = VADD(T1G, T1H);
+			 T25 = VADD(T23, T24);
+			 T1F = VSUB(T1D, T1E);
+			 T1I = VSUB(T1G, T1H);
+			 T1V = VADD(T1F, T1I);
+		    }
+		    T1B = VADD(T7, TF);
+		    T1C = VBYI(VADD(T1j, T1k));
+		    ST(&(x[WS(rs, 15)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T29, T2b, T2c, T2g, T2i, T2e, T2f, T2h, T2d;
+			 T29 = VMUL(LDK(KP559016994), VSUB(T25, T28));
+			 T2b = VADD(T25, T28);
+			 T2c = VFNMS(LDK(KP250000000), T2b, T2a);
+			 T2e = VSUB(T23, T24);
+			 T2f = VSUB(T26, T27);
+			 T2g = VBYI(VFMA(LDK(KP951056516), T2e, VMUL(LDK(KP587785252), T2f)));
+			 T2i = VBYI(VFNMS(LDK(KP951056516), T2f, VMUL(LDK(KP587785252), T2e)));
+			 ST(&(x[0]), VADD(T2a, T2b), ms, &(x[0]));
+			 T2h = VSUB(T2c, T29);
+			 ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
+			 T2d = VADD(T29, T2c);
+			 ST(&(x[WS(rs, 4)]), VSUB(T2d, T2g), ms, &(x[0]));
+			 ST(&(x[WS(rs, 16)]), VADD(T2g, T2d), ms, &(x[0]));
+		    }
+		    {
+			 V T1Z, T1X, T1Y, T1R, T21, T1J, T1Q, T22, T20;
+			 T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
+			 T1X = VADD(T1V, T1W);
+			 T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
+			 T1J = VSUB(T1F, T1I);
+			 T1Q = VSUB(T1M, T1P);
+			 T1R = VBYI(VFNMS(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
+			 T21 = VBYI(VFMA(LDK(KP951056516), T1J, VMUL(LDK(KP587785252), T1Q)));
+			 ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
+			 T22 = VADD(T1Z, T1Y);
+			 ST(&(x[WS(rs, 6)]), VADD(T21, T22), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VSUB(T22, T21), ms, &(x[0]));
+			 T20 = VSUB(T1Y, T1Z);
+			 ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
+			 ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
+		    }
+		    {
+			 V T19, T1p, T1w, T1u, T1m, T1x, TI, T1t;
+			 T19 = VFNMS(LDK(KP951056516), T18, VMUL(LDK(KP587785252), TU));
+			 T1p = VFNMS(LDK(KP951056516), T1o, VMUL(LDK(KP587785252), T1n));
+			 T1w = VFMA(LDK(KP951056516), T1n, VMUL(LDK(KP587785252), T1o));
+			 T1u = VFMA(LDK(KP951056516), TU, VMUL(LDK(KP587785252), T18));
+			 {
+			      V T1d, T1l, TG, TH;
+			      T1d = VMUL(LDK(KP559016994), VSUB(T1b, T1c));
+			      T1l = VFNMS(LDK(KP250000000), T1k, T1j);
+			      T1m = VSUB(T1d, T1l);
+			      T1x = VADD(T1d, T1l);
+			      TG = VFNMS(LDK(KP250000000), TF, T7);
+			      TH = VMUL(LDK(KP559016994), VSUB(Tp, TE));
+			      TI = VSUB(TG, TH);
+			      T1t = VADD(TH, TG);
+			 }
+			 {
+			      V T1a, T1q, T1z, T1A;
+			      T1a = VSUB(TI, T19);
+			      T1q = VBYI(VSUB(T1m, T1p));
+			      ST(&(x[WS(rs, 17)]), VSUB(T1a, T1q), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 3)]), VADD(T1a, T1q), ms, &(x[WS(rs, 1)]));
+			      T1z = VADD(T1t, T1u);
+			      T1A = VBYI(VSUB(T1x, T1w));
+			      ST(&(x[WS(rs, 11)]), VSUB(T1z, T1A), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VADD(T1z, T1A), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T1r, T1s, T1v, T1y;
+			      T1r = VADD(TI, T19);
+			      T1s = VBYI(VADD(T1p, T1m));
+			      ST(&(x[WS(rs, 13)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VADD(T1r, T1s), ms, &(x[WS(rs, 1)]));
+			      T1v = VSUB(T1t, T1u);
+			      T1y = VBYI(VADD(T1w, T1x));
+			      ST(&(x[WS(rs, 19)]), VSUB(T1v, T1y), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VADD(T1v, T1y), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_20) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:23 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 25 -name t3bv_25 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 268 FP additions, 281 FP multiplications,
+ * (or, 87 additions, 100 multiplications, 181 fused multiply/add),
+ * 223 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T2t, T1Z, T2W, T28, T2Q, T2r, T2g, T2u, T2o, T2l;
+	       {
+		    V T2, T5, T3, T9;
+		    T2 = LDW(&(W[0]));
+		    T5 = LDW(&(W[TWVL * 4]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T9 = LDW(&(W[TWVL * 6]));
+		    {
+			 V T2c, T3l, Tn, T49, Tm, T4e, TN, T32, T1d, T3a, T3f, T3z, T3H, T25, T1W;
+			 V T2v, T2D, T4a, T1g, T18, T2Z, T11, T31, TK, T1q, T1j, T1n, T4b, T17;
+			 {
+			      V T1, T1l, Tr, T4, Ty, T1E, Tu, TX, TD, T1h, Tz, T1e, T1I, T1o, TU;
+			      V Tk, T2b, T1B, T1D, T1N, T1F, Td, T2a, T1J;
+			      {
+				   V T7, Tb, TC, Tg, T1L, Ta, T6, Tj, T1A;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   {
+					V Tf, Ti, Te, Th;
+					Tf = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					Ti = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					Tb = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					Te = VZMUL(T2, T5);
+					TC = VZMULJ(T2, T5);
+					T1l = VZMUL(T3, T5);
+					Tr = VZMULJ(T3, T5);
+					T4 = VZMUL(T2, T3);
+					Ty = VZMULJ(T2, T3);
+					T1E = VZMULJ(T2, T9);
+					Th = VZMULJ(T5, T9);
+					Tu = VZMULJ(T3, T9);
+					Tg = VZMUL(Te, Tf);
+					TX = VZMULJ(Te, T9);
+					TD = VZMULJ(TC, T9);
+					T1h = VZMULJ(Ty, T9);
+					Tz = VZMUL(Ty, T5);
+					T1e = VZMULJ(Ty, T5);
+					T1L = VZMULJ(Tr, T9);
+					Ta = VZMULJ(T4, T9);
+					T1I = VZMUL(T4, T5);
+					T6 = VZMULJ(T4, T5);
+					Tj = VZMUL(Th, Ti);
+				   }
+				   T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   T1o = VZMULJ(T1e, T9);
+				   {
+					V Tc, T8, T1C, T1M;
+					T1C = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+					T1M = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					Tc = VZMUL(Ta, Tb);
+					T8 = VZMUL(T6, T7);
+					TU = VZMULJ(T6, T9);
+					Tk = VADD(Tg, Tj);
+					T2b = VSUB(Tg, Tj);
+					T1B = VZMUL(T3, T1A);
+					T1D = VZMUL(TC, T1C);
+					T1N = VZMUL(T1L, T1M);
+					T1F = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					Td = VADD(T8, Tc);
+					T2a = VSUB(T8, Tc);
+					T1J = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			      {
+				   V Tq, Tt, TF, T1T, T1H, Tw, T1U, T1O, TA, Tp, Ts, TE;
+				   Tp = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+				   Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   TE = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   {
+					V T1K, Tv, T1G, Tl;
+					Tv = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					T1G = VZMUL(T1E, T1F);
+					T2c = VFMA(LDK(KP618033988), T2b, T2a);
+					T3l = VFNMS(LDK(KP618033988), T2a, T2b);
+					Tn = VSUB(Td, Tk);
+					Tl = VADD(Td, Tk);
+					T1K = VZMUL(T1I, T1J);
+					Tq = VZMUL(T2, Tp);
+					Tt = VZMUL(Tr, Ts);
+					TF = VZMUL(TD, TE);
+					T1T = VSUB(T1D, T1G);
+					T1H = VADD(T1D, T1G);
+					T49 = VADD(T1, Tl);
+					Tm = VFNMS(LDK(KP250000000), Tl, T1);
+					Tw = VZMUL(Tu, Tv);
+					T1U = VSUB(T1K, T1N);
+					T1O = VADD(T1K, T1N);
+					TA = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tx, TL, T1R, T38, T1V, T13, TQ, TZ, TS, T1Q, TV, TG, TM, T12, T1c;
+					V T16;
+					T12 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					{
+					     V TP, TY, T1P, TB, TR;
+					     TP = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					     TY = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					     TR = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     Tx = VADD(Tt, Tw);
+					     TL = VSUB(Tt, Tw);
+					     T1R = VSUB(T1O, T1H);
+					     T1P = VADD(T1H, T1O);
+					     T38 = VFNMS(LDK(KP618033988), T1T, T1U);
+					     T1V = VFMA(LDK(KP618033988), T1U, T1T);
+					     TB = VZMUL(Tz, TA);
+					     T13 = VZMUL(T4, T12);
+					     TQ = VZMUL(T9, TP);
+					     TZ = VZMUL(TX, TY);
+					     TS = VZMUL(T5, TR);
+					     T4e = VADD(T1B, T1P);
+					     T1Q = VFNMS(LDK(KP250000000), T1P, T1B);
+					     TV = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     TG = VADD(TB, TF);
+					     TM = VSUB(TF, TB);
+					}
+					T1c = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					{
+					     V T14, TT, TJ, T15, T10, TI, T1p, T1f, T1i, T1m;
+					     T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T14 = VADD(TS, TQ);
+					     TT = VSUB(TQ, TS);
+					     {
+						  V T39, T1S, TW, TH;
+						  T39 = VFMA(LDK(KP559016994), T1R, T1Q);
+						  T1S = VFNMS(LDK(KP559016994), T1R, T1Q);
+						  TW = VZMUL(TU, TV);
+						  TH = VADD(Tx, TG);
+						  TJ = VSUB(Tx, TG);
+						  TN = VFNMS(LDK(KP618033988), TM, TL);
+						  T32 = VFMA(LDK(KP618033988), TL, TM);
+						  T1d = VZMUL(Ty, T1c);
+						  T3a = VFMA(LDK(KP869845200), T39, T38);
+						  T3f = VFNMS(LDK(KP786782374), T38, T39);
+						  T3z = VFMA(LDK(KP066152395), T39, T38);
+						  T3H = VFNMS(LDK(KP059835404), T38, T39);
+						  T25 = VFMA(LDK(KP987388751), T1S, T1V);
+						  T1W = VFNMS(LDK(KP893101515), T1V, T1S);
+						  T2v = VFNMS(LDK(KP120146378), T1V, T1S);
+						  T2D = VFMA(LDK(KP132830569), T1S, T1V);
+						  T15 = VADD(TZ, TW);
+						  T10 = VSUB(TW, TZ);
+						  TI = VFNMS(LDK(KP250000000), TH, Tq);
+						  T4a = VADD(Tq, TH);
+						  T1g = VZMUL(T1e, T1f);
+					     }
+					     T1p = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     T1i = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+					     T1m = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					     T18 = VSUB(T14, T15);
+					     T16 = VADD(T14, T15);
+					     T2Z = VFNMS(LDK(KP618033988), TT, T10);
+					     T11 = VFMA(LDK(KP618033988), T10, TT);
+					     T31 = VFNMS(LDK(KP559016994), TJ, TI);
+					     TK = VFMA(LDK(KP559016994), TJ, TI);
+					     T1q = VZMUL(T1o, T1p);
+					     T1j = VZMUL(T1h, T1i);
+					     T1n = VZMUL(T1l, T1m);
+					}
+					T4b = VADD(T13, T16);
+					T17 = VFMS(LDK(KP250000000), T16, T13);
+				   }
+			      }
+			 }
+			 {
+			      V T33, T3i, T3C, T3L, T20, TO, T2y, T2G, T1k, T1w, T1r, T1x, T2Y, T19, T4k;
+			      V T4c;
+			      T33 = VFMA(LDK(KP893101515), T32, T31);
+			      T3i = VFNMS(LDK(KP987388751), T31, T32);
+			      T3C = VFNMS(LDK(KP522847744), T32, T31);
+			      T3L = VFMA(LDK(KP578046249), T31, T32);
+			      T20 = VFMA(LDK(KP269969613), TK, TN);
+			      TO = VFNMS(LDK(KP244189809), TN, TK);
+			      T2y = VFMA(LDK(KP667278218), TK, TN);
+			      T2G = VFNMS(LDK(KP603558818), TN, TK);
+			      T1k = VADD(T1g, T1j);
+			      T1w = VSUB(T1g, T1j);
+			      T1r = VADD(T1n, T1q);
+			      T1x = VSUB(T1q, T1n);
+			      T2Y = VFMA(LDK(KP559016994), T18, T17);
+			      T19 = VFNMS(LDK(KP559016994), T18, T17);
+			      T4k = VSUB(T4a, T4b);
+			      T4c = VADD(T4a, T4b);
+			      {
+				   V T2X, To, T35, T1y, T2H, T2z, T1a, T21, T3t, T34, T3n, T3j, T3E, T3Y, T3M;
+				   V T3R, T1v, T36, T4l, T4f, T1u, T1s;
+				   T2X = VFNMS(LDK(KP559016994), Tn, Tm);
+				   To = VFMA(LDK(KP559016994), Tn, Tm);
+				   T1u = VSUB(T1r, T1k);
+				   T1s = VADD(T1k, T1r);
+				   T35 = VFMA(LDK(KP618033988), T1w, T1x);
+				   T1y = VFNMS(LDK(KP618033988), T1x, T1w);
+				   {
+					V T3K, T30, T3h, T3D, T4d, T1t;
+					T3K = VFMA(LDK(KP447533225), T2Z, T2Y);
+					T30 = VFMA(LDK(KP120146378), T2Z, T2Y);
+					T3h = VFNMS(LDK(KP132830569), T2Y, T2Z);
+					T3D = VFNMS(LDK(KP494780565), T2Y, T2Z);
+					T2H = VFNMS(LDK(KP786782374), T11, T19);
+					T2z = VFMA(LDK(KP869845200), T19, T11);
+					T1a = VFNMS(LDK(KP667278218), T19, T11);
+					T21 = VFMA(LDK(KP603558818), T11, T19);
+					T4d = VADD(T1d, T1s);
+					T1t = VFNMS(LDK(KP250000000), T1s, T1d);
+					T3t = VFNMS(LDK(KP734762448), T33, T30);
+					T34 = VFMA(LDK(KP734762448), T33, T30);
+					T3n = VFMA(LDK(KP734762448), T3i, T3h);
+					T3j = VFNMS(LDK(KP734762448), T3i, T3h);
+					T3E = VFNMS(LDK(KP982009705), T3D, T3C);
+					T3Y = VFMA(LDK(KP982009705), T3D, T3C);
+					T3M = VFNMS(LDK(KP921078979), T3L, T3K);
+					T3R = VFMA(LDK(KP921078979), T3L, T3K);
+					T1v = VFNMS(LDK(KP559016994), T1u, T1t);
+					T36 = VFMA(LDK(KP559016994), T1u, T1t);
+					T4l = VSUB(T4d, T4e);
+					T4f = VADD(T4d, T4e);
+				   }
+				   {
+					V T2L, T2R, T2j, T2q, T2J, T2B, T2e, T26, T2U, T1Y, T23, T2O;
+					{
+					     V T2I, T24, T2w, T2E, T48, T42, T3y, T3s, T3V, T45, T2A, T1b, T2h, T2i, T1X;
+					     T2L = VFNMS(LDK(KP912575812), T2H, T2G);
+					     T2I = VFMA(LDK(KP912575812), T2H, T2G);
+					     {
+						  V T3A, T3e, T37, T3I, T1z;
+						  T3A = VFNMS(LDK(KP667278218), T36, T35);
+						  T3e = VFNMS(LDK(KP059835404), T35, T36);
+						  T37 = VFMA(LDK(KP066152395), T36, T35);
+						  T3I = VFMA(LDK(KP603558818), T35, T36);
+						  T24 = VFMA(LDK(KP578046249), T1v, T1y);
+						  T1z = VFNMS(LDK(KP522847744), T1y, T1v);
+						  T2w = VFNMS(LDK(KP494780565), T1v, T1y);
+						  T2E = VFMA(LDK(KP447533225), T1y, T1v);
+						  {
+						       V T4i, T4g, T4o, T4m;
+						       T4i = VSUB(T4c, T4f);
+						       T4g = VADD(T4c, T4f);
+						       T4o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T4k, T4l));
+						       T4m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T4l, T4k));
+						       {
+							    V T3Q, T3J, T3b, T3u;
+							    T3Q = VFNMS(LDK(KP845997307), T3I, T3H);
+							    T3J = VFMA(LDK(KP845997307), T3I, T3H);
+							    T3b = VFNMS(LDK(KP772036680), T3a, T37);
+							    T3u = VFMA(LDK(KP772036680), T3a, T37);
+							    {
+								 V T3o, T3g, T3B, T3X, T4h;
+								 T3o = VFNMS(LDK(KP772036680), T3f, T3e);
+								 T3g = VFMA(LDK(KP772036680), T3f, T3e);
+								 T3B = VFNMS(LDK(KP845997307), T3A, T3z);
+								 T3X = VFMA(LDK(KP845997307), T3A, T3z);
+								 ST(&(x[0]), VADD(T4g, T49), ms, &(x[0]));
+								 T4h = VFNMS(LDK(KP250000000), T4g, T49);
+								 {
+								      V T40, T3N, T3c, T3v;
+								      T40 = VFMA(LDK(KP906616052), T3M, T3J);
+								      T3N = VFNMS(LDK(KP906616052), T3M, T3J);
+								      T3c = VFMA(LDK(KP956723877), T3b, T34);
+								      T3v = VFMA(LDK(KP522616830), T3j, T3u);
+								      {
+									   V T3p, T3k, T3S, T3F;
+									   T3p = VFNMS(LDK(KP522616830), T34, T3o);
+									   T3k = VFMA(LDK(KP945422727), T3j, T3g);
+									   T3S = VFNMS(LDK(KP923225144), T3E, T3B);
+									   T3F = VFMA(LDK(KP923225144), T3E, T3B);
+									   {
+										V T46, T3Z, T4j, T4n;
+										T46 = VFNMS(LDK(KP669429328), T3X, T3Y);
+										T3Z = VFMA(LDK(KP570584518), T3Y, T3X);
+										T4j = VFMA(LDK(KP559016994), T4i, T4h);
+										T4n = VFNMS(LDK(KP559016994), T4i, T4h);
+										{
+										     V T3W, T3O, T3d, T3w;
+										     T3W = VFMA(LDK(KP262346850), T3N, T3l);
+										     T3O = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T3l, T3N));
+										     T3d = VFMA(LDK(KP992114701), T3c, T2X);
+										     T3w = VFNMS(LDK(KP690983005), T3v, T3g);
+										     {
+											  V T3q, T3m, T3T, T43;
+											  T3q = VFMA(LDK(KP763932022), T3p, T3b);
+											  T3m = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T3l, T3k));
+											  T3T = VFNMS(LDK(KP997675361), T3S, T3R);
+											  T43 = VFNMS(LDK(KP904508497), T3S, T3Q);
+											  {
+											       V T3G, T3P, T47, T41;
+											       T3G = VFMA(LDK(KP949179823), T3F, T2X);
+											       T3P = VFNMS(LDK(KP237294955), T3F, T2X);
+											       T47 = VFNMS(LDK(KP669429328), T40, T46);
+											       T41 = VFMA(LDK(KP618033988), T40, T3Z);
+											       ST(&(x[WS(rs, 20)]), VFNMSI(T4m, T4j), ms, &(x[0]));
+											       ST(&(x[WS(rs, 5)]), VFMAI(T4m, T4j), ms, &(x[WS(rs, 1)]));
+											       ST(&(x[WS(rs, 15)]), VFMAI(T4o, T4n), ms, &(x[WS(rs, 1)]));
+											       ST(&(x[WS(rs, 10)]), VFNMSI(T4o, T4n), ms, &(x[0]));
+											       {
+												    V T3x, T3r, T3U, T44;
+												    T3x = VFMA(LDK(KP855719849), T3w, T3t);
+												    T3r = VFNMS(LDK(KP855719849), T3q, T3n);
+												    ST(&(x[WS(rs, 3)]), VFMAI(T3m, T3d), ms, &(x[WS(rs, 1)]));
+												    ST(&(x[WS(rs, 22)]), VFNMSI(T3m, T3d), ms, &(x[0]));
+												    T3U = VFMA(LDK(KP560319534), T3T, T3Q);
+												    T44 = VFNMS(LDK(KP681693190), T43, T3R);
+												    ST(&(x[WS(rs, 2)]), VFMAI(T3O, T3G), ms, &(x[0]));
+												    ST(&(x[WS(rs, 23)]), VFNMSI(T3O, T3G), ms, &(x[WS(rs, 1)]));
+												    T48 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T47, T3W));
+												    T42 = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T41, T3W));
+												    T3y = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T3x, T3l));
+												    T3s = VFMA(LDK(KP897376177), T3r, T2X);
+												    T3V = VFNMS(LDK(KP949179823), T3U, T3P);
+												    T45 = VFNMS(LDK(KP860541664), T44, T3P);
+												    T2R = VFNMS(LDK(KP912575812), T2z, T2y);
+												    T2A = VFMA(LDK(KP912575812), T2z, T2y);
+												    T1b = VFMA(LDK(KP829049696), T1a, TO);
+												    T2h = VFNMS(LDK(KP829049696), T1a, TO);
+												    T2i = VFNMS(LDK(KP831864738), T1W, T1z);
+												    T1X = VFMA(LDK(KP831864738), T1W, T1z);
+											       }
+											  }
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     {
+						  V T2M, T2F, T2x, T2S, T2T, T2N;
+						  T2M = VFNMS(LDK(KP958953096), T2E, T2D);
+						  T2F = VFMA(LDK(KP958953096), T2E, T2D);
+						  ST(&(x[WS(rs, 17)]), VFNMSI(T3y, T3s), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 8)]), VFMAI(T3y, T3s), ms, &(x[0]));
+						  ST(&(x[WS(rs, 13)]), VFMAI(T42, T3V), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 12)]), VFNMSI(T42, T3V), ms, &(x[0]));
+						  ST(&(x[WS(rs, 7)]), VFNMSI(T48, T45), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 18)]), VFMAI(T48, T45), ms, &(x[0]));
+						  T2j = VFMA(LDK(KP559154169), T2i, T2h);
+						  T2q = VFNMS(LDK(KP683113946), T2h, T2i);
+						  T2x = VFNMS(LDK(KP867381224), T2w, T2v);
+						  T2S = VFMA(LDK(KP867381224), T2w, T2v);
+						  T2J = VFMA(LDK(KP894834959), T2I, T2F);
+						  T2T = VFMA(LDK(KP447417479), T2I, T2S);
+						  T2B = VFNMS(LDK(KP809385824), T2A, T2x);
+						  T2N = VFMA(LDK(KP447417479), T2A, T2M);
+						  T2e = VFMA(LDK(KP831864738), T25, T24);
+						  T26 = VFNMS(LDK(KP831864738), T25, T24);
+						  T2U = VFNMS(LDK(KP763932022), T2T, T2F);
+						  T1Y = VFMA(LDK(KP904730450), T1X, T1b);
+						  T23 = VFNMS(LDK(KP904730450), T1X, T1b);
+						  T2O = VFMA(LDK(KP690983005), T2N, T2x);
+					     }
+					}
+					{
+					     V T2C, T22, T2d, T2K;
+					     T2C = VFNMS(LDK(KP992114701), T2B, To);
+					     T22 = VFMA(LDK(KP916574801), T21, T20);
+					     T2d = VFNMS(LDK(KP916574801), T21, T20);
+					     T2K = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2J, T2c));
+					     {
+						  V T27, T2P, T2f, T2k, T2n, T2V;
+						  T2V = VFNMS(LDK(KP999544308), T2U, T2R);
+						  T27 = VFNMS(LDK(KP904730450), T26, T23);
+						  T2t = VFMA(LDK(KP968583161), T1Y, To);
+						  T1Z = VFNMS(LDK(KP242145790), T1Y, To);
+						  T2P = VFNMS(LDK(KP999544308), T2O, T2L);
+						  T2f = VFMA(LDK(KP904730450), T2e, T2d);
+						  T2k = VFNMS(LDK(KP904730450), T2e, T2d);
+						  T2n = VADD(T22, T23);
+						  ST(&(x[WS(rs, 21)]), VFMAI(T2K, T2C), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 4)]), VFNMSI(T2K, T2C), ms, &(x[0]));
+						  T2W = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2V, T2c));
+						  T28 = VFNMS(LDK(KP618033988), T27, T22);
+						  T2Q = VFNMS(LDK(KP803003575), T2P, To);
+						  T2r = VFMA(LDK(KP617882369), T2k, T2q);
+						  T2g = VFNMS(LDK(KP242145790), T2f, T2c);
+						  T2u = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T2f, T2c));
+						  T2o = VFNMS(LDK(KP683113946), T2n, T26);
+						  T2l = VFMA(LDK(KP559016994), T2k, T2j);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T29, T2s, T2p, T2m;
+		    T29 = VFNMS(LDK(KP876091699), T28, T1Z);
+		    ST(&(x[WS(rs, 16)]), VFMAI(T2W, T2Q), ms, &(x[0]));
+		    ST(&(x[WS(rs, 9)]), VFNMSI(T2W, T2Q), ms, &(x[WS(rs, 1)]));
+		    T2s = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T2r, T2g));
+		    ST(&(x[WS(rs, 24)]), VFNMSI(T2u, T2t), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFMAI(T2u, T2t), ms, &(x[WS(rs, 1)]));
+		    T2p = VFMA(LDK(KP792626838), T2o, T1Z);
+		    T2m = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T2l, T2g));
+		    ST(&(x[WS(rs, 11)]), VFMAI(T2s, T2p), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFNMSI(T2s, T2p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T2m, T29), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFMAI(T2m, T29), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t3bv_25"), twinstr, &GENUS, {87, 100, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 25 -name t3bv_25 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 268 FP additions, 228 FP multiplications,
+ * (or, 191 additions, 151 multiplications, 77 fused multiply/add),
+ * 124 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T1, Td, T8, T9, TF, Te, Tu, TB, TC, T1s, T15, Tf, TY, T4, Ta;
+	       V Tx, T1T, Tg, T1N, T1v, T18, TG, T1o, T11;
+	       T1 = LDW(&(W[TWVL * 4]));
+	       Td = LDW(&(W[TWVL * 2]));
+	       T8 = LDW(&(W[0]));
+	       T9 = VZMUL(T8, T1);
+	       TF = VZMULJ(T8, T1);
+	       Te = VZMUL(T8, Td);
+	       Tu = VZMULJ(Td, T1);
+	       TB = VZMULJ(T8, Td);
+	       TC = VZMUL(TB, T1);
+	       T1s = VZMUL(Te, T1);
+	       T15 = VZMUL(Td, T1);
+	       Tf = VZMULJ(Te, T1);
+	       TY = VZMULJ(TB, T1);
+	       T4 = LDW(&(W[TWVL * 6]));
+	       Ta = VZMULJ(T9, T4);
+	       Tx = VZMULJ(Td, T4);
+	       T1T = VZMULJ(T1, T4);
+	       Tg = VZMULJ(Tf, T4);
+	       T1N = VZMULJ(Te, T4);
+	       T1v = VZMULJ(Tu, T4);
+	       T18 = VZMULJ(TY, T4);
+	       TG = VZMULJ(TF, T4);
+	       T1o = VZMULJ(T8, T4);
+	       T11 = VZMULJ(TB, T4);
+	       {
+		    V T1Y, T1X, T2f, T2g, T1Z, T20, T2e, T39, T1H, T2T, T1E, T3C, T2S, Tk, T2G;
+		    V Ts, T3z, T2F, TK, T2I, TS, T3y, T2J, T1k, T2Q, T1h, T3B, T2P;
+		    {
+			 V T1S, T1V, T1W, T1M, T1P, T1Q, T2d;
+			 T1Y = LD(&(x[0]), ms, &(x[0]));
+			 {
+			      V T1R, T1U, T1L, T1O;
+			      T1R = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      T1S = VZMUL(T9, T1R);
+			      T1U = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T1V = VZMUL(T1T, T1U);
+			      T1W = VADD(T1S, T1V);
+			      T1L = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T1M = VZMUL(Tf, T1L);
+			      T1O = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      T1P = VZMUL(T1N, T1O);
+			      T1Q = VADD(T1M, T1P);
+			 }
+			 T1X = VMUL(LDK(KP559016994), VSUB(T1Q, T1W));
+			 T2f = VSUB(T1S, T1V);
+			 T2g = VMUL(LDK(KP587785252), T2f);
+			 T1Z = VADD(T1Q, T1W);
+			 T20 = VFNMS(LDK(KP250000000), T1Z, T1Y);
+			 T2d = VSUB(T1M, T1P);
+			 T2e = VMUL(LDK(KP951056516), T2d);
+			 T39 = VMUL(LDK(KP587785252), T2d);
+		    }
+		    {
+			 V T1B, T1u, T1x, T1y, T1n, T1q, T1r, T1A;
+			 T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T1B = VZMUL(Td, T1A);
+			 {
+			      V T1t, T1w, T1m, T1p;
+			      T1t = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      T1u = VZMUL(T1s, T1t);
+			      T1w = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      T1x = VZMUL(T1v, T1w);
+			      T1y = VADD(T1u, T1x);
+			      T1m = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1n = VZMUL(TF, T1m);
+			      T1p = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			      T1q = VZMUL(T1o, T1p);
+			      T1r = VADD(T1n, T1q);
+			 }
+			 {
+			      V T1F, T1G, T1z, T1C, T1D;
+			      T1F = VSUB(T1n, T1q);
+			      T1G = VSUB(T1u, T1x);
+			      T1H = VFMA(LDK(KP475528258), T1F, VMUL(LDK(KP293892626), T1G));
+			      T2T = VFNMS(LDK(KP475528258), T1G, VMUL(LDK(KP293892626), T1F));
+			      T1z = VMUL(LDK(KP559016994), VSUB(T1r, T1y));
+			      T1C = VADD(T1r, T1y);
+			      T1D = VFNMS(LDK(KP250000000), T1C, T1B);
+			      T1E = VADD(T1z, T1D);
+			      T3C = VADD(T1B, T1C);
+			      T2S = VSUB(T1D, T1z);
+			 }
+		    }
+		    {
+			 V Tp, Tc, Ti, Tm, T3, T6, Tl, To;
+			 To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 Tp = VZMUL(Te, To);
+			 {
+			      V Tb, Th, T2, T5;
+			      Tb = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      Tc = VZMUL(Ta, Tb);
+			      Th = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      Ti = VZMUL(Tg, Th);
+			      Tm = VADD(Tc, Ti);
+			      T2 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      T3 = VZMUL(T1, T2);
+			      T5 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T6 = VZMUL(T4, T5);
+			      Tl = VADD(T3, T6);
+			 }
+			 {
+			      V T7, Tj, Tn, Tq, Tr;
+			      T7 = VSUB(T3, T6);
+			      Tj = VSUB(Tc, Ti);
+			      Tk = VFMA(LDK(KP475528258), T7, VMUL(LDK(KP293892626), Tj));
+			      T2G = VFNMS(LDK(KP475528258), Tj, VMUL(LDK(KP293892626), T7));
+			      Tn = VMUL(LDK(KP559016994), VSUB(Tl, Tm));
+			      Tq = VADD(Tl, Tm);
+			      Tr = VFNMS(LDK(KP250000000), Tq, Tp);
+			      Ts = VADD(Tn, Tr);
+			      T3z = VADD(Tp, Tq);
+			      T2F = VSUB(Tr, Tn);
+			 }
+		    }
+		    {
+			 V TP, TE, TI, TM, Tw, Tz, TL, TO;
+			 TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TP = VZMUL(T8, TO);
+			 {
+			      V TD, TH, Tv, Ty;
+			      TD = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      TE = VZMUL(TC, TD);
+			      TH = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      TI = VZMUL(TG, TH);
+			      TM = VADD(TE, TI);
+			      Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tw = VZMUL(Tu, Tv);
+			      Ty = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      Tz = VZMUL(Tx, Ty);
+			      TL = VADD(Tw, Tz);
+			 }
+			 {
+			      V TA, TJ, TN, TQ, TR;
+			      TA = VSUB(Tw, Tz);
+			      TJ = VSUB(TE, TI);
+			      TK = VFMA(LDK(KP475528258), TA, VMUL(LDK(KP293892626), TJ));
+			      T2I = VFNMS(LDK(KP475528258), TJ, VMUL(LDK(KP293892626), TA));
+			      TN = VMUL(LDK(KP559016994), VSUB(TL, TM));
+			      TQ = VADD(TL, TM);
+			      TR = VFNMS(LDK(KP250000000), TQ, TP);
+			      TS = VADD(TN, TR);
+			      T3y = VADD(TP, TQ);
+			      T2J = VSUB(TR, TN);
+			 }
+		    }
+		    {
+			 V T1e, T17, T1a, T1b, T10, T13, T14, T1d;
+			 T1d = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T1e = VZMUL(TB, T1d);
+			 {
+			      V T16, T19, TZ, T12;
+			      T16 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      T17 = VZMUL(T15, T16);
+			      T19 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T1a = VZMUL(T18, T19);
+			      T1b = VADD(T17, T1a);
+			      TZ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T10 = VZMUL(TY, TZ);
+			      T12 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      T13 = VZMUL(T11, T12);
+			      T14 = VADD(T10, T13);
+			 }
+			 {
+			      V T1i, T1j, T1c, T1f, T1g;
+			      T1i = VSUB(T10, T13);
+			      T1j = VSUB(T17, T1a);
+			      T1k = VFMA(LDK(KP475528258), T1i, VMUL(LDK(KP293892626), T1j));
+			      T2Q = VFNMS(LDK(KP475528258), T1j, VMUL(LDK(KP293892626), T1i));
+			      T1c = VMUL(LDK(KP559016994), VSUB(T14, T1b));
+			      T1f = VADD(T14, T1b);
+			      T1g = VFNMS(LDK(KP250000000), T1f, T1e);
+			      T1h = VADD(T1c, T1g);
+			      T3B = VADD(T1e, T1f);
+			      T2P = VSUB(T1g, T1c);
+			 }
+		    }
+		    {
+			 V T3E, T3M, T3I, T3J, T3H, T3K, T3N, T3L;
+			 {
+			      V T3A, T3D, T3F, T3G;
+			      T3A = VSUB(T3y, T3z);
+			      T3D = VSUB(T3B, T3C);
+			      T3E = VBYI(VFMA(LDK(KP951056516), T3A, VMUL(LDK(KP587785252), T3D)));
+			      T3M = VBYI(VFNMS(LDK(KP951056516), T3D, VMUL(LDK(KP587785252), T3A)));
+			      T3I = VADD(T1Y, T1Z);
+			      T3F = VADD(T3y, T3z);
+			      T3G = VADD(T3B, T3C);
+			      T3J = VADD(T3F, T3G);
+			      T3H = VMUL(LDK(KP559016994), VSUB(T3F, T3G));
+			      T3K = VFNMS(LDK(KP250000000), T3J, T3I);
+			 }
+			 ST(&(x[0]), VADD(T3I, T3J), ms, &(x[0]));
+			 T3N = VSUB(T3K, T3H);
+			 ST(&(x[WS(rs, 10)]), VADD(T3M, T3N), ms, &(x[0]));
+			 ST(&(x[WS(rs, 15)]), VSUB(T3N, T3M), ms, &(x[WS(rs, 1)]));
+			 T3L = VADD(T3H, T3K);
+			 ST(&(x[WS(rs, 5)]), VADD(T3E, T3L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 20)]), VSUB(T3L, T3E), ms, &(x[0]));
+		    }
+		    {
+			 V T2X, T3a, T3i, T3j, T3k, T3s, T3t, T3u, T3l, T3m, T3n, T3p, T3q, T3r, T2L;
+			 V T3b, T32, T38, T2W, T35, T2Y, T34, T3w, T3x;
+			 T2X = VSUB(T20, T1X);
+			 T3a = VFNMS(LDK(KP951056516), T2f, T39);
+			 T3i = VFMA(LDK(KP1_369094211), T2I, VMUL(LDK(KP728968627), T2J));
+			 T3j = VFNMS(LDK(KP992114701), T2F, VMUL(LDK(KP250666467), T2G));
+			 T3k = VADD(T3i, T3j);
+			 T3s = VFNMS(LDK(KP125581039), T2Q, VMUL(LDK(KP998026728), T2P));
+			 T3t = VFMA(LDK(KP1_274847979), T2T, VMUL(LDK(KP770513242), T2S));
+			 T3u = VADD(T3s, T3t);
+			 T3l = VFMA(LDK(KP1_996053456), T2Q, VMUL(LDK(KP062790519), T2P));
+			 T3m = VFNMS(LDK(KP637423989), T2S, VMUL(LDK(KP1_541026485), T2T));
+			 T3n = VADD(T3l, T3m);
+			 T3p = VFNMS(LDK(KP1_457937254), T2I, VMUL(LDK(KP684547105), T2J));
+			 T3q = VFMA(LDK(KP1_984229402), T2G, VMUL(LDK(KP125333233), T2F));
+			 T3r = VADD(T3p, T3q);
+			 {
+			      V T2H, T2K, T36, T30, T31, T37;
+			      T2H = VFNMS(LDK(KP851558583), T2G, VMUL(LDK(KP904827052), T2F));
+			      T2K = VFMA(LDK(KP1_752613360), T2I, VMUL(LDK(KP481753674), T2J));
+			      T36 = VADD(T2K, T2H);
+			      T30 = VFMA(LDK(KP1_071653589), T2Q, VMUL(LDK(KP844327925), T2P));
+			      T31 = VFMA(LDK(KP125581039), T2T, VMUL(LDK(KP998026728), T2S));
+			      T37 = VADD(T30, T31);
+			      T2L = VSUB(T2H, T2K);
+			      T3b = VADD(T36, T37);
+			      T32 = VSUB(T30, T31);
+			      T38 = VMUL(LDK(KP559016994), VSUB(T36, T37));
+			 }
+			 {
+			      V T2M, T2N, T2O, T2R, T2U, T2V;
+			      T2M = VFNMS(LDK(KP963507348), T2I, VMUL(LDK(KP876306680), T2J));
+			      T2N = VFMA(LDK(KP1_809654104), T2G, VMUL(LDK(KP425779291), T2F));
+			      T2O = VSUB(T2M, T2N);
+			      T2R = VFNMS(LDK(KP1_688655851), T2Q, VMUL(LDK(KP535826794), T2P));
+			      T2U = VFNMS(LDK(KP1_996053456), T2T, VMUL(LDK(KP062790519), T2S));
+			      T2V = VADD(T2R, T2U);
+			      T2W = VMUL(LDK(KP559016994), VSUB(T2O, T2V));
+			      T35 = VSUB(T2R, T2U);
+			      T2Y = VADD(T2O, T2V);
+			      T34 = VADD(T2M, T2N);
+			 }
+			 {
+			      V T3g, T3h, T3o, T3v;
+			      T3g = VADD(T2X, T2Y);
+			      T3h = VBYI(VADD(T3a, T3b));
+			      ST(&(x[WS(rs, 23)]), VSUB(T3g, T3h), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 2)]), VADD(T3g, T3h), ms, &(x[0]));
+			      T3o = VADD(T2X, VADD(T3k, T3n));
+			      T3v = VBYI(VSUB(VADD(T3r, T3u), T3a));
+			      ST(&(x[WS(rs, 22)]), VSUB(T3o, T3v), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VADD(T3o, T3v), ms, &(x[WS(rs, 1)]));
+			 }
+			 T3w = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T3i, T3j), VFMA(LDK(KP309016994), T3r, VFNMS(LDK(KP809016994), T3u, VMUL(LDK(KP587785252), VSUB(T3l, T3m))))), T3a));
+			 T3x = VFMA(LDK(KP309016994), T3k, VFMA(LDK(KP951056516), VSUB(T3q, T3p), VFMA(LDK(KP587785252), VSUB(T3t, T3s), VFNMS(LDK(KP809016994), T3n, T2X))));
+			 ST(&(x[WS(rs, 8)]), VADD(T3w, T3x), ms, &(x[0]));
+			 ST(&(x[WS(rs, 17)]), VSUB(T3x, T3w), ms, &(x[WS(rs, 1)]));
+			 {
+			      V T33, T3e, T3d, T3f, T2Z, T3c;
+			      T2Z = VFNMS(LDK(KP250000000), T2Y, T2X);
+			      T33 = VFMA(LDK(KP951056516), T2L, VADD(T2W, VFNMS(LDK(KP587785252), T32, T2Z)));
+			      T3e = VFMA(LDK(KP587785252), T2L, VFMA(LDK(KP951056516), T32, VSUB(T2Z, T2W)));
+			      T3c = VFNMS(LDK(KP250000000), T3b, T3a);
+			      T3d = VBYI(VADD(VFMA(LDK(KP951056516), T34, VMUL(LDK(KP587785252), T35)), VADD(T38, T3c)));
+			      T3f = VBYI(VADD(VFNMS(LDK(KP951056516), T35, VMUL(LDK(KP587785252), T34)), VSUB(T3c, T38)));
+			      ST(&(x[WS(rs, 18)]), VSUB(T33, T3d), ms, &(x[0]));
+			      ST(&(x[WS(rs, 12)]), VADD(T3e, T3f), ms, &(x[0]));
+			      ST(&(x[WS(rs, 7)]), VADD(T33, T3d), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 13)]), VSUB(T3e, T3f), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+		    {
+			 V T21, T2h, T2p, T2q, T2r, T2z, T2A, T2B, T2s, T2t, T2u, T2w, T2x, T2y, TU;
+			 V T2i, T26, T2c, T1K, T29, T22, T28, T2D, T2E;
+			 T21 = VADD(T1X, T20);
+			 T2h = VADD(T2e, T2g);
+			 T2p = VFMA(LDK(KP1_688655851), TK, VMUL(LDK(KP535826794), TS));
+			 T2q = VFMA(LDK(KP1_541026485), Tk, VMUL(LDK(KP637423989), Ts));
+			 T2r = VSUB(T2p, T2q);
+			 T2z = VFMA(LDK(KP851558583), T1k, VMUL(LDK(KP904827052), T1h));
+			 T2A = VFMA(LDK(KP1_984229402), T1H, VMUL(LDK(KP125333233), T1E));
+			 T2B = VADD(T2z, T2A);
+			 T2s = VFNMS(LDK(KP425779291), T1h, VMUL(LDK(KP1_809654104), T1k));
+			 T2t = VFNMS(LDK(KP992114701), T1E, VMUL(LDK(KP250666467), T1H));
+			 T2u = VADD(T2s, T2t);
+			 T2w = VFNMS(LDK(KP1_071653589), TK, VMUL(LDK(KP844327925), TS));
+			 T2x = VFNMS(LDK(KP770513242), Ts, VMUL(LDK(KP1_274847979), Tk));
+			 T2y = VADD(T2w, T2x);
+			 {
+			      V Tt, TT, T2a, T24, T25, T2b;
+			      Tt = VFMA(LDK(KP1_071653589), Tk, VMUL(LDK(KP844327925), Ts));
+			      TT = VFMA(LDK(KP1_937166322), TK, VMUL(LDK(KP248689887), TS));
+			      T2a = VADD(TT, Tt);
+			      T24 = VFMA(LDK(KP1_752613360), T1k, VMUL(LDK(KP481753674), T1h));
+			      T25 = VFMA(LDK(KP1_457937254), T1H, VMUL(LDK(KP684547105), T1E));
+			      T2b = VADD(T24, T25);
+			      TU = VSUB(Tt, TT);
+			      T2i = VADD(T2a, T2b);
+			      T26 = VSUB(T24, T25);
+			      T2c = VMUL(LDK(KP559016994), VSUB(T2a, T2b));
+			 }
+			 {
+			      V TV, TW, TX, T1l, T1I, T1J;
+			      TV = VFNMS(LDK(KP497379774), TK, VMUL(LDK(KP968583161), TS));
+			      TW = VFNMS(LDK(KP1_688655851), Tk, VMUL(LDK(KP535826794), Ts));
+			      TX = VADD(TV, TW);
+			      T1l = VFNMS(LDK(KP963507348), T1k, VMUL(LDK(KP876306680), T1h));
+			      T1I = VFNMS(LDK(KP1_369094211), T1H, VMUL(LDK(KP728968627), T1E));
+			      T1J = VADD(T1l, T1I);
+			      T1K = VMUL(LDK(KP559016994), VSUB(TX, T1J));
+			      T29 = VSUB(T1l, T1I);
+			      T22 = VADD(TX, T1J);
+			      T28 = VSUB(TV, TW);
+			 }
+			 {
+			      V T2n, T2o, T2v, T2C;
+			      T2n = VADD(T21, T22);
+			      T2o = VBYI(VADD(T2h, T2i));
+			      ST(&(x[WS(rs, 24)]), VSUB(T2n, T2o), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
+			      T2v = VADD(T21, VADD(T2r, T2u));
+			      T2C = VBYI(VSUB(VADD(T2y, T2B), T2h));
+			      ST(&(x[WS(rs, 21)]), VSUB(T2v, T2C), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VADD(T2v, T2C), ms, &(x[0]));
+			 }
+			 T2D = VBYI(VSUB(VFMA(LDK(KP309016994), T2y, VFMA(LDK(KP951056516), VADD(T2p, T2q), VFNMS(LDK(KP809016994), T2B, VMUL(LDK(KP587785252), VSUB(T2s, T2t))))), T2h));
+			 T2E = VFMA(LDK(KP951056516), VSUB(T2x, T2w), VFMA(LDK(KP309016994), T2r, VFMA(LDK(KP587785252), VSUB(T2A, T2z), VFNMS(LDK(KP809016994), T2u, T21))));
+			 ST(&(x[WS(rs, 9)]), VADD(T2D, T2E), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 16)]), VSUB(T2E, T2D), ms, &(x[0]));
+			 {
+			      V T27, T2l, T2k, T2m, T23, T2j;
+			      T23 = VFNMS(LDK(KP250000000), T22, T21);
+			      T27 = VFMA(LDK(KP951056516), TU, VADD(T1K, VFNMS(LDK(KP587785252), T26, T23)));
+			      T2l = VFMA(LDK(KP587785252), TU, VFMA(LDK(KP951056516), T26, VSUB(T23, T1K)));
+			      T2j = VFNMS(LDK(KP250000000), T2i, T2h);
+			      T2k = VBYI(VADD(VFMA(LDK(KP951056516), T28, VMUL(LDK(KP587785252), T29)), VADD(T2c, T2j)));
+			      T2m = VBYI(VADD(VFNMS(LDK(KP951056516), T29, VMUL(LDK(KP587785252), T28)), VSUB(T2j, T2c)));
+			      ST(&(x[WS(rs, 19)]), VSUB(T27, T2k), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 11)]), VADD(T2l, T2m), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 6)]), VADD(T27, T2k), ms, &(x[0]));
+			      ST(&(x[WS(rs, 14)]), VSUB(T2l, T2m), ms, &(x[0]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t3bv_25"), twinstr, &GENUS, {191, 151, 77, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_25) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:19 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 244 FP additions, 214 FP multiplications,
+ * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
+ * 120 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2B, T2A, T2F, T2N, T2H, T2z, T2P, T2L, T2C, T2M;
+	       {
+		    V T2, T5, T3, T7;
+		    T2 = LDW(&(W[0]));
+		    T5 = LDW(&(W[TWVL * 4]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T7 = LDW(&(W[TWVL * 6]));
+		    {
+			 V T24, Tb, T3x, T2T, T3K, T2W, T25, Tr, T3z, T3j, T28, TX, T3y, T3g, T27;
+			 V TG, T37, T3F, T3G, T3a, T2Y, T15, T1p, T2Z, T2w, T1V, T2v, T1N, T32, T1h;
+			 V T17, T1a;
+			 {
+			      V T1, Tz, TT, T4, TC, Tv, T12, T1D, T1w, T18, T1t, T1O, TK, TP, T1c;
+			      V T1m, Tf, T6, Te, TL, TQ, T2S, Tp, TU, Ti, Ta, TM, TR, Tm, TJ;
+			      V T22, T9, T1Z;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V Tn, TH, Tk, To, Th, Tg, T8, Tl, T20, T23, TI;
+				   {
+					V Td, T1C, Tc, T21;
+					Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Tz = VZMUL(T2, T5);
+					T1C = VZMULJ(T2, T5);
+					Tn = VZMUL(T3, T5);
+					TT = VZMULJ(T3, T5);
+					Tc = VZMUL(T2, T3);
+					T4 = VZMULJ(T2, T3);
+					TH = VZMUL(T3, T7);
+					T21 = VZMULJ(T3, T7);
+					Tk = VZMUL(T2, T7);
+					TC = VZMULJ(T2, T7);
+					Tv = VZMULJ(T5, T7);
+					T12 = VZMULJ(Tz, T7);
+					T20 = VZMUL(T1C, T1Z);
+					T1D = VZMULJ(T1C, T7);
+					T1w = VZMULJ(Tn, T7);
+					T18 = VZMULJ(TT, T7);
+					T1t = VZMUL(Tc, T7);
+					T1O = VZMULJ(Tc, T7);
+					TK = VZMUL(Tc, T5);
+					TP = VZMULJ(Tc, T5);
+					T1c = VZMUL(T4, T7);
+					T1m = VZMULJ(T4, T7);
+					Tf = VZMULJ(T4, T5);
+					T6 = VZMUL(T4, T5);
+					T23 = VZMUL(T21, T22);
+					Te = VZMUL(Tc, Td);
+				   }
+				   TL = VZMULJ(TK, T7);
+				   TQ = VZMULJ(TP, T7);
+				   To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Tg = VZMULJ(Tf, T7);
+				   T8 = VZMULJ(T6, T7);
+				   T2S = VADD(T20, T23);
+				   T24 = VSUB(T20, T23);
+				   Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+				   Tp = VZMUL(Tn, To);
+				   TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   Ti = VZMUL(Tg, Th);
+				   Ta = VZMUL(T8, T9);
+				   TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+				   Tm = VZMUL(Tk, Tl);
+				   TJ = VZMUL(TH, TI);
+			      }
+			      {
+				   V Tu, TE, Tw, TA;
+				   {
+					V T3h, TO, T3i, TW;
+					{
+					     V TV, T2U, Tj, T2R, TN, TS, T2V, Tq, Tt, TD;
+					     Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					     TV = VZMUL(TT, TU);
+					     T2U = VADD(Te, Ti);
+					     Tj = VSUB(Te, Ti);
+					     T2R = VADD(T1, Ta);
+					     Tb = VSUB(T1, Ta);
+					     TN = VZMUL(TL, TM);
+					     TS = VZMUL(TQ, TR);
+					     T2V = VADD(Tm, Tp);
+					     Tq = VSUB(Tm, Tp);
+					     Tu = VZMUL(T4, Tt);
+					     TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+					     T3x = VSUB(T2R, T2S);
+					     T2T = VADD(T2R, T2S);
+					     T3h = VADD(TJ, TN);
+					     TO = VSUB(TJ, TN);
+					     T3i = VADD(TV, TS);
+					     TW = VSUB(TS, TV);
+					     T3K = VSUB(T2U, T2V);
+					     T2W = VADD(T2U, T2V);
+					     T25 = VSUB(Tj, Tq);
+					     Tr = VADD(Tj, Tq);
+					     TE = VZMUL(TC, TD);
+					}
+					Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					T3z = VSUB(T3h, T3i);
+					T3j = VADD(T3h, T3i);
+					T28 = VFMA(LDK(KP414213562), TO, TW);
+					TX = VFNMS(LDK(KP414213562), TW, TO);
+					TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+				   }
+				   {
+					V T35, T1z, T1T, T36, T39, T1L, T1B, T1F;
+					{
+					     V T1v, T1y, Ty, T3e, T1S, T1Q, T1I, T3f, TF, T1K, T1A, T1E;
+					     {
+						  V T1u, T1x, Tx, T1R;
+						  T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+						  T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+						  Tx = VZMUL(Tv, Tw);
+						  T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  {
+						       V T1P, T1H, T1J, TB;
+						       T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						       T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       TB = VZMUL(Tz, TA);
+						       T1v = VZMUL(T1t, T1u);
+						       T1y = VZMUL(T1w, T1x);
+						       Ty = VSUB(Tu, Tx);
+						       T3e = VADD(Tu, Tx);
+						       T1S = VZMUL(Tf, T1R);
+						       T1Q = VZMUL(T1O, T1P);
+						       T1I = VZMUL(T7, T1H);
+						       T3f = VADD(TB, TE);
+						       TF = VSUB(TB, TE);
+						       T1K = VZMUL(T6, T1J);
+						       T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						  }
+					     }
+					     T35 = VADD(T1v, T1y);
+					     T1z = VSUB(T1v, T1y);
+					     T1T = VSUB(T1Q, T1S);
+					     T36 = VADD(T1S, T1Q);
+					     T3y = VSUB(T3e, T3f);
+					     T3g = VADD(T3e, T3f);
+					     T27 = VFMA(LDK(KP414213562), Ty, TF);
+					     TG = VFNMS(LDK(KP414213562), TF, Ty);
+					     T39 = VADD(T1I, T1K);
+					     T1L = VSUB(T1I, T1K);
+					     T1B = VZMUL(T3, T1A);
+					     T1F = VZMUL(T1D, T1E);
+					}
+					{
+					     V T11, T14, T1o, T1l, T1e, T1U, T1M, T1g, T16, T19;
+					     {
+						  V T10, T13, T1n, T1k;
+						  T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+						  T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+						  T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  {
+						       V T1d, T1f, T1G, T38;
+						       T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						       T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						       T1G = VSUB(T1B, T1F);
+						       T38 = VADD(T1B, T1F);
+						       T37 = VADD(T35, T36);
+						       T3F = VSUB(T35, T36);
+						       T11 = VZMUL(T2, T10);
+						       T14 = VZMUL(T12, T13);
+						       T1o = VZMUL(T1m, T1n);
+						       T1l = VZMUL(T5, T1k);
+						       T1e = VZMUL(T1c, T1d);
+						       T3G = VSUB(T39, T38);
+						       T3a = VADD(T38, T39);
+						       T1U = VSUB(T1L, T1G);
+						       T1M = VADD(T1G, T1L);
+						       T1g = VZMUL(TK, T1f);
+						  }
+						  T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					     }
+					     T2Y = VADD(T11, T14);
+					     T15 = VSUB(T11, T14);
+					     T1p = VSUB(T1l, T1o);
+					     T2Z = VADD(T1l, T1o);
+					     T2w = VFNMS(LDK(KP707106781), T1U, T1T);
+					     T1V = VFMA(LDK(KP707106781), T1U, T1T);
+					     T2v = VFNMS(LDK(KP707106781), T1M, T1z);
+					     T1N = VFMA(LDK(KP707106781), T1M, T1z);
+					     T32 = VADD(T1e, T1g);
+					     T1h = VSUB(T1e, T1g);
+					     T17 = VZMUL(TP, T16);
+					     T1a = VZMUL(T18, T19);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T2X, T3k, T3b, T3t, T1b, T31, T30, T3C, T3r, T3v, T3p, T3q;
+			      T2X = VSUB(T2T, T2W);
+			      T3p = VADD(T2T, T2W);
+			      T3q = VADD(T3g, T3j);
+			      T3k = VSUB(T3g, T3j);
+			      T3b = VSUB(T37, T3a);
+			      T3t = VADD(T37, T3a);
+			      T1b = VSUB(T17, T1a);
+			      T31 = VADD(T17, T1a);
+			      T30 = VADD(T2Y, T2Z);
+			      T3C = VSUB(T2Y, T2Z);
+			      T3r = VSUB(T3p, T3q);
+			      T3v = VADD(T3p, T3q);
+			      {
+				   V T1r, T2t, T1j, T2s, T3S, T3Y, T3R, T3V;
+				   {
+					V T3B, T3T, T3M, T3W, T3U, T3P, T3X, T3I, T3l, T3c, T3w, T3u;
+					{
+					     V T3L, T3A, T33, T3D, T1i, T1q, T3O, T3H;
+					     T3L = VSUB(T3y, T3z);
+					     T3A = VADD(T3y, T3z);
+					     T33 = VADD(T31, T32);
+					     T3D = VSUB(T31, T32);
+					     T1i = VADD(T1b, T1h);
+					     T1q = VSUB(T1b, T1h);
+					     T3O = VFMA(LDK(KP414213562), T3F, T3G);
+					     T3H = VFNMS(LDK(KP414213562), T3G, T3F);
+					     T3B = VFMA(LDK(KP707106781), T3A, T3x);
+					     T3T = VFNMS(LDK(KP707106781), T3A, T3x);
+					     T3M = VFMA(LDK(KP707106781), T3L, T3K);
+					     T3W = VFNMS(LDK(KP707106781), T3L, T3K);
+					     {
+						  V T3E, T3N, T3s, T34;
+						  T3E = VFNMS(LDK(KP414213562), T3D, T3C);
+						  T3N = VFMA(LDK(KP414213562), T3C, T3D);
+						  T3s = VADD(T30, T33);
+						  T34 = VSUB(T30, T33);
+						  T1r = VFMA(LDK(KP707106781), T1q, T1p);
+						  T2t = VFNMS(LDK(KP707106781), T1q, T1p);
+						  T1j = VFMA(LDK(KP707106781), T1i, T15);
+						  T2s = VFNMS(LDK(KP707106781), T1i, T15);
+						  T3U = VADD(T3N, T3O);
+						  T3P = VSUB(T3N, T3O);
+						  T3X = VSUB(T3E, T3H);
+						  T3I = VADD(T3E, T3H);
+						  T3l = VSUB(T34, T3b);
+						  T3c = VADD(T34, T3b);
+						  T3w = VADD(T3s, T3t);
+						  T3u = VSUB(T3s, T3t);
+					     }
+					}
+					{
+					     V T40, T3Z, T3Q, T3J;
+					     T3S = VFMA(LDK(KP923879532), T3P, T3M);
+					     T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
+					     T40 = VFNMS(LDK(KP923879532), T3X, T3W);
+					     T3Y = VFMA(LDK(KP923879532), T3X, T3W);
+					     T3R = VFMA(LDK(KP923879532), T3I, T3B);
+					     T3J = VFNMS(LDK(KP923879532), T3I, T3B);
+					     {
+						  V T3o, T3m, T3n, T3d;
+						  T3o = VFMA(LDK(KP707106781), T3l, T3k);
+						  T3m = VFNMS(LDK(KP707106781), T3l, T3k);
+						  T3n = VFMA(LDK(KP707106781), T3c, T2X);
+						  T3d = VFNMS(LDK(KP707106781), T3c, T2X);
+						  ST(&(x[WS(rs, 16)]), VSUB(T3v, T3w), ms, &(x[0]));
+						  ST(&(x[0]), VADD(T3v, T3w), ms, &(x[0]));
+						  ST(&(x[WS(rs, 8)]), VFMAI(T3u, T3r), ms, &(x[0]));
+						  ST(&(x[WS(rs, 24)]), VFNMSI(T3u, T3r), ms, &(x[0]));
+						  T3Z = VFMA(LDK(KP923879532), T3U, T3T);
+						  T3V = VFNMS(LDK(KP923879532), T3U, T3T);
+						  ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
+						  ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
+						  ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
+						  ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
+						  ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
+						  ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
+					     }
+					     ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
+					}
+				   }
+				   {
+					V T2p, T1s, T1W, T2h, TZ, T2i, T2d, T26, T29, T2q;
+					{
+					     V Ts, TY, T2b, T2c;
+					     T2p = VFNMS(LDK(KP707106781), Tr, Tb);
+					     Ts = VFMA(LDK(KP707106781), Tr, Tb);
+					     TY = VADD(TG, TX);
+					     T2B = VSUB(TG, TX);
+					     T1s = VFNMS(LDK(KP198912367), T1r, T1j);
+					     T2b = VFMA(LDK(KP198912367), T1j, T1r);
+					     T2c = VFMA(LDK(KP198912367), T1N, T1V);
+					     T1W = VFNMS(LDK(KP198912367), T1V, T1N);
+					     ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
+					     ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
+					     ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
+					     T2h = VFNMS(LDK(KP923879532), TY, Ts);
+					     TZ = VFMA(LDK(KP923879532), TY, Ts);
+					     T2i = VADD(T2b, T2c);
+					     T2d = VSUB(T2b, T2c);
+					     T2A = VFNMS(LDK(KP707106781), T25, T24);
+					     T26 = VFMA(LDK(KP707106781), T25, T24);
+					     T29 = VSUB(T27, T28);
+					     T2q = VADD(T27, T28);
+					}
+					{
+					     V T2J, T2r, T2K, T2y;
+					     {
+						  V T2u, T2D, T2j, T2n, T2l, T1X, T2k, T2a, T2E, T2x;
+						  T2u = VFMA(LDK(KP668178637), T2t, T2s);
+						  T2D = VFNMS(LDK(KP668178637), T2s, T2t);
+						  T2j = VFNMS(LDK(KP980785280), T2i, T2h);
+						  T2n = VFMA(LDK(KP980785280), T2i, T2h);
+						  T2l = VSUB(T1s, T1W);
+						  T1X = VADD(T1s, T1W);
+						  T2k = VFNMS(LDK(KP923879532), T29, T26);
+						  T2a = VFMA(LDK(KP923879532), T29, T26);
+						  T2J = VFNMS(LDK(KP923879532), T2q, T2p);
+						  T2r = VFMA(LDK(KP923879532), T2q, T2p);
+						  T2E = VFNMS(LDK(KP668178637), T2v, T2w);
+						  T2x = VFMA(LDK(KP668178637), T2w, T2v);
+						  {
+						       V T1Y, T2f, T2o, T2m, T2e, T2g;
+						       T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
+						       T2f = VFMA(LDK(KP980785280), T1X, TZ);
+						       T2o = VFNMS(LDK(KP980785280), T2l, T2k);
+						       T2m = VFMA(LDK(KP980785280), T2l, T2k);
+						       T2e = VFNMS(LDK(KP980785280), T2d, T2a);
+						       T2g = VFMA(LDK(KP980785280), T2d, T2a);
+						       T2F = VSUB(T2D, T2E);
+						       T2K = VADD(T2D, T2E);
+						       T2N = VSUB(T2u, T2x);
+						       T2y = VADD(T2u, T2x);
+						       ST(&(x[WS(rs, 23)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 9)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 25)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 7)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 1)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 31)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 17)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
+						       ST(&(x[WS(rs, 15)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
+						  }
+					     }
+					     T2H = VFMA(LDK(KP831469612), T2y, T2r);
+					     T2z = VFNMS(LDK(KP831469612), T2y, T2r);
+					     T2P = VFNMS(LDK(KP831469612), T2K, T2J);
+					     T2L = VFMA(LDK(KP831469612), T2K, T2J);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T2C = VFNMS(LDK(KP923879532), T2B, T2A);
+	       T2M = VFMA(LDK(KP923879532), T2B, T2A);
+	       {
+		    V T2Q, T2O, T2G, T2I;
+		    T2Q = VFMA(LDK(KP831469612), T2N, T2M);
+		    T2O = VFNMS(LDK(KP831469612), T2N, T2M);
+		    T2G = VFNMS(LDK(KP831469612), T2F, T2C);
+		    T2I = VFMA(LDK(KP831469612), T2F, T2C);
+		    ST(&(x[WS(rs, 21)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 27)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 5)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 29)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 19)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 244 FP additions, 158 FP multiplications,
+ * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
+ * 90 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2, T5, T3, T4, Tc, T1v, TH, Tz, Tn, T6, TS, Tf, TK, T7, T8;
+	       V Tv, T1I, T25, Tg, Tk, T1N, T1Q, TC, T16, T12, T1w, TL, TP, TT, T1m;
+	       V T1f;
+	       T2 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 4]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T4 = VZMULJ(T2, T3);
+	       Tc = VZMUL(T2, T3);
+	       T1v = VZMULJ(T2, T5);
+	       TH = VZMULJ(T3, T5);
+	       Tz = VZMUL(T2, T5);
+	       Tn = VZMUL(T3, T5);
+	       T6 = VZMUL(T4, T5);
+	       TS = VZMUL(Tc, T5);
+	       Tf = VZMULJ(T4, T5);
+	       TK = VZMULJ(Tc, T5);
+	       T7 = LDW(&(W[TWVL * 6]));
+	       T8 = VZMULJ(T6, T7);
+	       Tv = VZMULJ(T5, T7);
+	       T1I = VZMULJ(Tc, T7);
+	       T25 = VZMULJ(T3, T7);
+	       Tg = VZMULJ(Tf, T7);
+	       Tk = VZMUL(T2, T7);
+	       T1N = VZMUL(Tc, T7);
+	       T1Q = VZMULJ(Tn, T7);
+	       TC = VZMULJ(T2, T7);
+	       T16 = VZMUL(T4, T7);
+	       T12 = VZMULJ(TH, T7);
+	       T1w = VZMULJ(T1v, T7);
+	       TL = VZMULJ(TK, T7);
+	       TP = VZMUL(T3, T7);
+	       TT = VZMULJ(TS, T7);
+	       T1m = VZMULJ(Tz, T7);
+	       T1f = VZMULJ(T4, T7);
+	       {
+		    V Tb, T28, T3k, T3M, Tr, T22, T3f, T3N, TX, T20, T3b, T3J, TG, T1Z, T38;
+		    V T3I, T1M, T2v, T33, T3F, T1V, T2w, T30, T3E, T1j, T2s, T2W, T3C, T1r, T2t;
+		    V T2T, T3B;
+		    {
+			 V T1, T27, Ta, T24, T26, T9, T23, T3i, T3j;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T26 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 T27 = VZMUL(T25, T26);
+			 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Ta = VZMUL(T8, T9);
+			 T23 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T24 = VZMUL(T1v, T23);
+			 Tb = VSUB(T1, Ta);
+			 T28 = VSUB(T24, T27);
+			 T3i = VADD(T1, Ta);
+			 T3j = VADD(T24, T27);
+			 T3k = VSUB(T3i, T3j);
+			 T3M = VADD(T3i, T3j);
+		    }
+		    {
+			 V Te, Tp, Ti, Tm;
+			 {
+			      V Td, To, Th, Tl;
+			      Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Te = VZMUL(Tc, Td);
+			      To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      Tp = VZMUL(Tn, To);
+			      Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      Ti = VZMUL(Tg, Th);
+			      Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			      Tm = VZMUL(Tk, Tl);
+			 }
+			 {
+			      V Tj, Tq, T3d, T3e;
+			      Tj = VSUB(Te, Ti);
+			      Tq = VSUB(Tm, Tp);
+			      Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
+			      T22 = VMUL(LDK(KP707106781), VSUB(Tj, Tq));
+			      T3d = VADD(Te, Ti);
+			      T3e = VADD(Tm, Tp);
+			      T3f = VSUB(T3d, T3e);
+			      T3N = VADD(T3d, T3e);
+			 }
+		    }
+		    {
+			 V TJ, TV, TN, TR;
+			 {
+			      V TI, TU, TM, TQ;
+			      TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TJ = VZMUL(TH, TI);
+			      TU = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TV = VZMUL(TT, TU);
+			      TM = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      TN = VZMUL(TL, TM);
+			      TQ = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			      TR = VZMUL(TP, TQ);
+			 }
+			 {
+			      V TO, TW, T39, T3a;
+			      TO = VSUB(TJ, TN);
+			      TW = VSUB(TR, TV);
+			      TX = VFNMS(LDK(KP382683432), TW, VMUL(LDK(KP923879532), TO));
+			      T20 = VFMA(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
+			      T39 = VADD(TR, TV);
+			      T3a = VADD(TJ, TN);
+			      T3b = VSUB(T39, T3a);
+			      T3J = VADD(T39, T3a);
+			 }
+		    }
+		    {
+			 V Tu, TE, Tx, TB;
+			 {
+			      V Tt, TD, Tw, TA;
+			      Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Tu = VZMUL(T4, Tt);
+			      TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      TE = VZMUL(TC, TD);
+			      Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tx = VZMUL(Tv, Tw);
+			      TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TB = VZMUL(Tz, TA);
+			 }
+			 {
+			      V Ty, TF, T36, T37;
+			      Ty = VSUB(Tu, Tx);
+			      TF = VSUB(TB, TE);
+			      TG = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
+			      T1Z = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
+			      T36 = VADD(Tu, Tx);
+			      T37 = VADD(TB, TE);
+			      T38 = VSUB(T36, T37);
+			      T3I = VADD(T36, T37);
+			 }
+		    }
+		    {
+			 V T1H, T1K, T1S, T1P, T1B, T1D, T1E, T1u, T1y, T1z;
+			 {
+			      V T1G, T1J, T1R, T1O;
+			      T1G = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T1H = VZMUL(Tf, T1G);
+			      T1J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			      T1K = VZMUL(T1I, T1J);
+			      T1R = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T1S = VZMUL(T1Q, T1R);
+			      T1O = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			      T1P = VZMUL(T1N, T1O);
+			      {
+				   V T1A, T1C, T1t, T1x;
+				   T1A = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   T1B = VZMUL(T7, T1A);
+				   T1C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   T1D = VZMUL(T6, T1C);
+				   T1E = VSUB(T1B, T1D);
+				   T1t = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   T1u = VZMUL(T3, T1t);
+				   T1x = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   T1y = VZMUL(T1w, T1x);
+				   T1z = VSUB(T1u, T1y);
+			      }
+			 }
+			 {
+			      V T1F, T1L, T31, T32;
+			      T1F = VMUL(LDK(KP707106781), VSUB(T1z, T1E));
+			      T1L = VSUB(T1H, T1K);
+			      T1M = VSUB(T1F, T1L);
+			      T2v = VADD(T1L, T1F);
+			      T31 = VADD(T1u, T1y);
+			      T32 = VADD(T1B, T1D);
+			      T33 = VSUB(T31, T32);
+			      T3F = VADD(T31, T32);
+			 }
+			 {
+			      V T1T, T1U, T2Y, T2Z;
+			      T1T = VSUB(T1P, T1S);
+			      T1U = VMUL(LDK(KP707106781), VADD(T1z, T1E));
+			      T1V = VSUB(T1T, T1U);
+			      T2w = VADD(T1T, T1U);
+			      T2Y = VADD(T1P, T1S);
+			      T2Z = VADD(T1H, T1K);
+			      T30 = VSUB(T2Y, T2Z);
+			      T3E = VADD(T2Y, T2Z);
+			 }
+		    }
+		    {
+			 V T1e, T1h, T1o, T1l, T18, T1a, T1b, T11, T14, T15;
+			 {
+			      V T1d, T1g, T1n, T1k;
+			      T1d = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      T1e = VZMUL(T5, T1d);
+			      T1g = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			      T1h = VZMUL(T1f, T1g);
+			      T1n = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T1o = VZMUL(T1m, T1n);
+			      T1k = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T1l = VZMUL(T2, T1k);
+			      {
+				   V T17, T19, T10, T13;
+				   T17 = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   T18 = VZMUL(T16, T17);
+				   T19 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   T1a = VZMUL(TS, T19);
+				   T1b = VSUB(T18, T1a);
+				   T10 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   T11 = VZMUL(TK, T10);
+				   T13 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T14 = VZMUL(T12, T13);
+				   T15 = VSUB(T11, T14);
+			      }
+			 }
+			 {
+			      V T1c, T1i, T2U, T2V;
+			      T1c = VMUL(LDK(KP707106781), VSUB(T15, T1b));
+			      T1i = VSUB(T1e, T1h);
+			      T1j = VSUB(T1c, T1i);
+			      T2s = VADD(T1i, T1c);
+			      T2U = VADD(T11, T14);
+			      T2V = VADD(T18, T1a);
+			      T2W = VSUB(T2U, T2V);
+			      T3C = VADD(T2U, T2V);
+			 }
+			 {
+			      V T1p, T1q, T2R, T2S;
+			      T1p = VSUB(T1l, T1o);
+			      T1q = VMUL(LDK(KP707106781), VADD(T15, T1b));
+			      T1r = VSUB(T1p, T1q);
+			      T2t = VADD(T1p, T1q);
+			      T2R = VADD(T1l, T1o);
+			      T2S = VADD(T1e, T1h);
+			      T2T = VSUB(T2R, T2S);
+			      T3B = VADD(T2R, T2S);
+			 }
+		    }
+		    {
+			 V T3V, T3Z, T3Y, T40;
+			 {
+			      V T3T, T3U, T3W, T3X;
+			      T3T = VADD(T3M, T3N);
+			      T3U = VADD(T3I, T3J);
+			      T3V = VSUB(T3T, T3U);
+			      T3Z = VADD(T3T, T3U);
+			      T3W = VADD(T3B, T3C);
+			      T3X = VADD(T3E, T3F);
+			      T3Y = VBYI(VSUB(T3W, T3X));
+			      T40 = VADD(T3W, T3X);
+			 }
+			 ST(&(x[WS(rs, 24)]), VSUB(T3V, T3Y), ms, &(x[0]));
+			 ST(&(x[0]), VADD(T3Z, T40), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VADD(T3V, T3Y), ms, &(x[0]));
+			 ST(&(x[WS(rs, 16)]), VSUB(T3Z, T40), ms, &(x[0]));
+		    }
+		    {
+			 V T3K, T3O, T3H, T3P, T3D, T3G;
+			 T3K = VSUB(T3I, T3J);
+			 T3O = VSUB(T3M, T3N);
+			 T3D = VSUB(T3B, T3C);
+			 T3G = VSUB(T3E, T3F);
+			 T3H = VMUL(LDK(KP707106781), VSUB(T3D, T3G));
+			 T3P = VMUL(LDK(KP707106781), VADD(T3D, T3G));
+			 {
+			      V T3L, T3Q, T3R, T3S;
+			      T3L = VBYI(VSUB(T3H, T3K));
+			      T3Q = VSUB(T3O, T3P);
+			      ST(&(x[WS(rs, 12)]), VADD(T3L, T3Q), ms, &(x[0]));
+			      ST(&(x[WS(rs, 20)]), VSUB(T3Q, T3L), ms, &(x[0]));
+			      T3R = VBYI(VADD(T3K, T3H));
+			      T3S = VADD(T3O, T3P);
+			      ST(&(x[WS(rs, 4)]), VADD(T3R, T3S), ms, &(x[0]));
+			      ST(&(x[WS(rs, 28)]), VSUB(T3S, T3R), ms, &(x[0]));
+			 }
+		    }
+		    {
+			 V T3g, T3w, T3m, T3t, T35, T3u, T3p, T3x, T3c, T3l;
+			 T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
+			 T3g = VSUB(T3c, T3f);
+			 T3w = VADD(T3f, T3c);
+			 T3l = VMUL(LDK(KP707106781), VADD(T38, T3b));
+			 T3m = VSUB(T3k, T3l);
+			 T3t = VADD(T3k, T3l);
+			 {
+			      V T2X, T34, T3n, T3o;
+			      T2X = VFNMS(LDK(KP382683432), T2W, VMUL(LDK(KP923879532), T2T));
+			      T34 = VFMA(LDK(KP923879532), T30, VMUL(LDK(KP382683432), T33));
+			      T35 = VSUB(T2X, T34);
+			      T3u = VADD(T2X, T34);
+			      T3n = VFMA(LDK(KP382683432), T2T, VMUL(LDK(KP923879532), T2W));
+			      T3o = VFNMS(LDK(KP382683432), T30, VMUL(LDK(KP923879532), T33));
+			      T3p = VSUB(T3n, T3o);
+			      T3x = VADD(T3n, T3o);
+			 }
+			 {
+			      V T3h, T3q, T3z, T3A;
+			      T3h = VBYI(VSUB(T35, T3g));
+			      T3q = VSUB(T3m, T3p);
+			      ST(&(x[WS(rs, 10)]), VADD(T3h, T3q), ms, &(x[0]));
+			      ST(&(x[WS(rs, 22)]), VSUB(T3q, T3h), ms, &(x[0]));
+			      T3z = VSUB(T3t, T3u);
+			      T3A = VBYI(VSUB(T3x, T3w));
+			      ST(&(x[WS(rs, 18)]), VSUB(T3z, T3A), ms, &(x[0]));
+			      ST(&(x[WS(rs, 14)]), VADD(T3z, T3A), ms, &(x[0]));
+			 }
+			 {
+			      V T3r, T3s, T3v, T3y;
+			      T3r = VBYI(VADD(T3g, T35));
+			      T3s = VADD(T3m, T3p);
+			      ST(&(x[WS(rs, 6)]), VADD(T3r, T3s), ms, &(x[0]));
+			      ST(&(x[WS(rs, 26)]), VSUB(T3s, T3r), ms, &(x[0]));
+			      T3v = VADD(T3t, T3u);
+			      T3y = VBYI(VADD(T3w, T3x));
+			      ST(&(x[WS(rs, 30)]), VSUB(T3v, T3y), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VADD(T3v, T3y), ms, &(x[0]));
+			 }
+		    }
+		    {
+			 V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
+			 {
+			      V Ts, TY, T2b, T2c;
+			      Ts = VSUB(Tb, Tr);
+			      TY = VSUB(TG, TX);
+			      TZ = VSUB(Ts, TY);
+			      T2k = VADD(Ts, TY);
+			      T2b = VFNMS(LDK(KP555570233), T1j, VMUL(LDK(KP831469612), T1r));
+			      T2c = VFMA(LDK(KP555570233), T1M, VMUL(LDK(KP831469612), T1V));
+			      T2d = VSUB(T2b, T2c);
+			      T2l = VADD(T2b, T2c);
+			 }
+			 {
+			      V T1s, T1W, T21, T29;
+			      T1s = VFMA(LDK(KP831469612), T1j, VMUL(LDK(KP555570233), T1r));
+			      T1W = VFNMS(LDK(KP555570233), T1V, VMUL(LDK(KP831469612), T1M));
+			      T1X = VSUB(T1s, T1W);
+			      T2h = VADD(T1s, T1W);
+			      T21 = VSUB(T1Z, T20);
+			      T29 = VSUB(T22, T28);
+			      T2a = VSUB(T21, T29);
+			      T2i = VADD(T29, T21);
+			 }
+			 {
+			      V T1Y, T2e, T2n, T2o;
+			      T1Y = VADD(TZ, T1X);
+			      T2e = VBYI(VADD(T2a, T2d));
+			      ST(&(x[WS(rs, 27)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 5)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
+			      T2n = VBYI(VADD(T2i, T2h));
+			      T2o = VADD(T2k, T2l);
+			      ST(&(x[WS(rs, 3)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 29)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T2f, T2g, T2j, T2m;
+			      T2f = VSUB(TZ, T1X);
+			      T2g = VBYI(VSUB(T2d, T2a));
+			      ST(&(x[WS(rs, 21)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 11)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
+			      T2j = VBYI(VSUB(T2h, T2i));
+			      T2m = VSUB(T2k, T2l);
+			      ST(&(x[WS(rs, 13)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 19)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+		    {
+			 V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
+			 {
+			      V T2p, T2q, T2D, T2E;
+			      T2p = VADD(Tb, Tr);
+			      T2q = VADD(T1Z, T20);
+			      T2r = VSUB(T2p, T2q);
+			      T2M = VADD(T2p, T2q);
+			      T2D = VFNMS(LDK(KP195090322), T2s, VMUL(LDK(KP980785280), T2t));
+			      T2E = VFMA(LDK(KP195090322), T2v, VMUL(LDK(KP980785280), T2w));
+			      T2F = VSUB(T2D, T2E);
+			      T2N = VADD(T2D, T2E);
+			 }
+			 {
+			      V T2u, T2x, T2A, T2B;
+			      T2u = VFMA(LDK(KP980785280), T2s, VMUL(LDK(KP195090322), T2t));
+			      T2x = VFNMS(LDK(KP195090322), T2w, VMUL(LDK(KP980785280), T2v));
+			      T2y = VSUB(T2u, T2x);
+			      T2J = VADD(T2u, T2x);
+			      T2A = VADD(TG, TX);
+			      T2B = VADD(T28, T22);
+			      T2C = VSUB(T2A, T2B);
+			      T2K = VADD(T2B, T2A);
+			 }
+			 {
+			      V T2z, T2G, T2P, T2Q;
+			      T2z = VADD(T2r, T2y);
+			      T2G = VBYI(VADD(T2C, T2F));
+			      ST(&(x[WS(rs, 25)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
+			      T2P = VBYI(VADD(T2K, T2J));
+			      T2Q = VADD(T2M, T2N);
+			      ST(&(x[WS(rs, 1)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 31)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T2H, T2I, T2L, T2O;
+			      T2H = VSUB(T2r, T2y);
+			      T2I = VBYI(VSUB(T2F, T2C));
+			      ST(&(x[WS(rs, 23)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
+			      T2L = VBYI(VSUB(T2J, T2K));
+			      T2O = VSUB(T2M, T2N);
+			      ST(&(x[WS(rs, 15)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 17)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_32) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:18 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 4 -name t3bv_4 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 12 FP additions, 10 FP multiplications,
+ * (or, 10 additions, 8 multiplications, 2 fused multiply/add),
+ * 16 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T2, T3, T1, Ta, T5, T8;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       Ta = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T4, Tb, T9, T6;
+		    T4 = VZMULJ(T2, T3);
+		    Tb = VZMUL(T3, Ta);
+		    T9 = VZMUL(T2, T8);
+		    T6 = VZMUL(T4, T5);
+		    {
+			 V Tc, Te, T7, Td;
+			 Tc = VSUB(T9, Tb);
+			 Te = VADD(T9, Tb);
+			 T7 = VSUB(T1, T6);
+			 Td = VADD(T1, T6);
+			 ST(&(x[0]), VADD(Td, Te), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VSUB(Td, Te), ms, &(x[0]));
+			 ST(&(x[WS(rs, 1)]), VFMAI(Tc, T7), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VFNMSI(Tc, T7), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t3bv_4"), twinstr, &GENUS, {10, 8, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 4 -name t3bv_4 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 12 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 8 multiplications, 0 fused multiply/add),
+ * 16 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T2, T3, T4;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T4 = VZMULJ(T2, T3);
+	       {
+		    V T1, Tb, T6, T9, Ta, T5, T8;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Ta = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Tb = VZMUL(T3, Ta);
+		    T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T6 = VZMUL(T4, T5);
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = VZMUL(T2, T8);
+		    {
+			 V T7, Tc, Td, Te;
+			 T7 = VSUB(T1, T6);
+			 Tc = VBYI(VSUB(T9, Tb));
+			 ST(&(x[WS(rs, 3)]), VSUB(T7, Tc), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T7, Tc), ms, &(x[WS(rs, 1)]));
+			 Td = VADD(T1, T6);
+			 Te = VADD(T9, Tb);
+			 ST(&(x[WS(rs, 2)]), VSUB(Td, Te), ms, &(x[0]));
+			 ST(&(x[0]), VADD(Td, Te), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t3bv_4"), twinstr, &GENUS, {12, 8, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_4) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:22 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 5 -name t3bv_5 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 22 FP additions, 23 FP multiplications,
+ * (or, 13 additions, 14 multiplications, 9 fused multiply/add),
+ * 30 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T2, T5, T1, T3, Td, T7, Tb;
+	       T2 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 2]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T3 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       Td = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V Ta, T6, T4, Te, Tc, T8;
+		    Ta = VZMULJ(T2, T5);
+		    T6 = VZMUL(T2, T5);
+		    T4 = VZMUL(T2, T3);
+		    Te = VZMUL(T5, Td);
+		    Tc = VZMUL(Ta, Tb);
+		    T8 = VZMUL(T6, T7);
+		    {
+			 V Tf, Tl, T9, Tk;
+			 Tf = VADD(Tc, Te);
+			 Tl = VSUB(Tc, Te);
+			 T9 = VADD(T4, T8);
+			 Tk = VSUB(T4, T8);
+			 {
+			      V Ti, Tg, To, Tm, Th, Tn, Tj;
+			      Ti = VSUB(T9, Tf);
+			      Tg = VADD(T9, Tf);
+			      To = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tk, Tl));
+			      Tm = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tl, Tk));
+			      Th = VFNMS(LDK(KP250000000), Tg, T1);
+			      ST(&(x[0]), VADD(T1, Tg), ms, &(x[0]));
+			      Tn = VFNMS(LDK(KP559016994), Ti, Th);
+			      Tj = VFMA(LDK(KP559016994), Ti, Th);
+			      ST(&(x[WS(rs, 2)]), VFNMSI(To, Tn), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFMAI(To, Tn), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFNMSI(Tm, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFMAI(Tm, Tj), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t3bv_5"), twinstr, &GENUS, {13, 14, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 5 -name t3bv_5 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 22 FP additions, 18 FP multiplications,
+ * (or, 19 additions, 15 multiplications, 3 fused multiply/add),
+ * 24 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T4, T5, T9;
+	       T1 = LDW(&(W[0]));
+	       T4 = LDW(&(W[TWVL * 2]));
+	       T5 = VZMUL(T1, T4);
+	       T9 = VZMULJ(T1, T4);
+	       {
+		    V Tj, T8, Te, Tg, Th, Tk;
+		    Tj = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T3, Td, T7, Tb;
+			 {
+			      V T2, Tc, T6, Ta;
+			      T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T3 = VZMUL(T1, T2);
+			      Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Td = VZMUL(T4, Tc);
+			      T6 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      T7 = VZMUL(T5, T6);
+			      Ta = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Tb = VZMUL(T9, Ta);
+			 }
+			 T8 = VSUB(T3, T7);
+			 Te = VSUB(Tb, Td);
+			 Tg = VADD(T3, T7);
+			 Th = VADD(Tb, Td);
+			 Tk = VADD(Tg, Th);
+		    }
+		    ST(&(x[0]), VADD(Tj, Tk), ms, &(x[0]));
+		    {
+			 V Tf, Tn, Tm, To, Ti, Tl;
+			 Tf = VBYI(VFMA(LDK(KP951056516), T8, VMUL(LDK(KP587785252), Te)));
+			 Tn = VBYI(VFNMS(LDK(KP951056516), Te, VMUL(LDK(KP587785252), T8)));
+			 Ti = VMUL(LDK(KP559016994), VSUB(Tg, Th));
+			 Tl = VFNMS(LDK(KP250000000), Tk, Tj);
+			 Tm = VADD(Ti, Tl);
+			 To = VSUB(Tl, Ti);
+			 ST(&(x[WS(rs, 1)]), VADD(Tf, Tm), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VSUB(To, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VSUB(Tm, Tf), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(Tn, To), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t3bv_5"), twinstr, &GENUS, {19, 15, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_5) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:18 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 8 -name t3bv_8 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 37 FP additions, 32 FP multiplications,
+ * (or, 27 additions, 22 multiplications, 10 fused multiply/add),
+ * 43 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T3, Tb, T1, T5, Tn, Tq, T8, Td, T4, Ta, Tp, Tg, Ti, T9;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       Tb = LDW(&(W[TWVL * 4]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       Td = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T4 = VZMUL(T2, T3);
+	       Ta = VZMULJ(T2, T3);
+	       Tp = VZMULJ(T2, Tb);
+	       Tg = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Ti = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = VZMUL(T2, T8);
+	       {
+		    V T6, To, Tc, Tr, Th, Tj;
+		    T6 = VZMUL(T4, T5);
+		    To = VZMUL(Ta, Tn);
+		    Tc = VZMULJ(Ta, Tb);
+		    Tr = VZMUL(Tp, Tq);
+		    Th = VZMUL(Tb, Tg);
+		    Tj = VZMUL(T3, Ti);
+		    {
+			 V Tx, T7, Te, Ts, Ty, Tk, TB;
+			 Tx = VADD(T1, T6);
+			 T7 = VSUB(T1, T6);
+			 Te = VZMUL(Tc, Td);
+			 Ts = VSUB(To, Tr);
+			 Ty = VADD(To, Tr);
+			 Tk = VSUB(Th, Tj);
+			 TB = VADD(Th, Tj);
+			 {
+			      V Tf, TA, Tz, TD;
+			      Tf = VSUB(T9, Te);
+			      TA = VADD(T9, Te);
+			      Tz = VSUB(Tx, Ty);
+			      TD = VADD(Tx, Ty);
+			      {
+				   V TC, TE, Tl, Tt;
+				   TC = VSUB(TA, TB);
+				   TE = VADD(TA, TB);
+				   Tl = VADD(Tf, Tk);
+				   Tt = VSUB(Tf, Tk);
+				   {
+					V Tu, Tw, Tm, Tv;
+					ST(&(x[0]), VADD(TD, TE), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VSUB(TD, TE), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(TC, Tz), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(TC, Tz), ms, &(x[0]));
+					Tu = VFNMS(LDK(KP707106781), Tt, Ts);
+					Tw = VFMA(LDK(KP707106781), Tt, Ts);
+					Tm = VFNMS(LDK(KP707106781), Tl, T7);
+					Tv = VFMA(LDK(KP707106781), Tl, T7);
+					ST(&(x[WS(rs, 1)]), VFMAI(Tw, Tv), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFNMSI(Tw, Tv), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 5)]), VFMAI(Tu, Tm), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 3)]), VFNMSI(Tu, Tm), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t3bv_8"), twinstr, &GENUS, {27, 22, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 8 -name t3bv_8 -include t3b.h -sign 1 */
+
+/*
+ * This function contains 37 FP additions, 24 FP multiplications,
+ * (or, 37 additions, 24 multiplications, 0 fused multiply/add),
+ * 31 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t3b.h"
+
+static void t3bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ii;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T4, T5, Tp, T6, T7, Tj;
+	       T1 = LDW(&(W[0]));
+	       T4 = LDW(&(W[TWVL * 2]));
+	       T5 = VZMULJ(T1, T4);
+	       Tp = VZMUL(T1, T4);
+	       T6 = LDW(&(W[TWVL * 4]));
+	       T7 = VZMULJ(T5, T6);
+	       Tj = VZMULJ(T1, T6);
+	       {
+		    V Ts, Tx, Tm, Ty, Ta, TA, Tf, TB, To, Tr, Tq;
+		    To = LD(&(x[0]), ms, &(x[0]));
+		    Tq = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tr = VZMUL(Tp, Tq);
+		    Ts = VSUB(To, Tr);
+		    Tx = VADD(To, Tr);
+		    {
+			 V Ti, Tl, Th, Tk;
+			 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Ti = VZMUL(T5, Th);
+			 Tk = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Tl = VZMUL(Tj, Tk);
+			 Tm = VSUB(Ti, Tl);
+			 Ty = VADD(Ti, Tl);
+		    }
+		    {
+			 V T3, T9, T2, T8;
+			 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T3 = VZMUL(T1, T2);
+			 T8 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 T9 = VZMUL(T7, T8);
+			 Ta = VSUB(T3, T9);
+			 TA = VADD(T3, T9);
+		    }
+		    {
+			 V Tc, Te, Tb, Td;
+			 Tb = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Tc = VZMUL(T6, Tb);
+			 Td = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Te = VZMUL(T4, Td);
+			 Tf = VSUB(Tc, Te);
+			 TB = VADD(Tc, Te);
+		    }
+		    {
+			 V Tz, TC, TD, TE;
+			 Tz = VSUB(Tx, Ty);
+			 TC = VBYI(VSUB(TA, TB));
+			 ST(&(x[WS(rs, 6)]), VSUB(Tz, TC), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(Tz, TC), ms, &(x[0]));
+			 TD = VADD(Tx, Ty);
+			 TE = VADD(TA, TB);
+			 ST(&(x[WS(rs, 4)]), VSUB(TD, TE), ms, &(x[0]));
+			 ST(&(x[0]), VADD(TD, TE), ms, &(x[0]));
+			 {
+			      V Tn, Tv, Tu, Tw, Tg, Tt;
+			      Tg = VMUL(LDK(KP707106781), VSUB(Ta, Tf));
+			      Tn = VBYI(VSUB(Tg, Tm));
+			      Tv = VBYI(VADD(Tm, Tg));
+			      Tt = VMUL(LDK(KP707106781), VADD(Ta, Tf));
+			      Tu = VSUB(Ts, Tt);
+			      Tw = VADD(Ts, Tt);
+			      ST(&(x[WS(rs, 3)]), VADD(Tn, Tu), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VSUB(Tw, Tv), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 5)]), VSUB(Tu, Tn), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VADD(Tv, Tw), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t3bv_8"), twinstr, &GENUS, {37, 24, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3bv_8) (planner *p) {
+     X(kdft_dit_register) (p, t3bv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:55 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3fv_10 -include t3f.h */
+
+/*
+ * This function contains 57 FP additions, 52 FP multiplications,
+ * (or, 39 additions, 34 multiplications, 18 fused multiply/add),
+ * 57 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V T1, T7, Th, Tx, Tr, Td, Tp, T6, Tv, Tc, Te, Ti, Tl, T2, T3;
+	       V T5;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T5 = LDW(&(W[TWVL * 4]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V To, Tw, Tq, Tu, Ta, T4, Tt, Tk, Tb;
+		    To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    Tw = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    Tq = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+		    Tu = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+		    Ta = VZMULJ(T2, T3);
+		    T4 = VZMUL(T2, T3);
+		    Th = VZMULJ(T2, T5);
+		    Tt = VZMULJ(T3, T5);
+		    Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    Tx = VZMULJ(T2, Tw);
+		    Tr = VZMULJ(T5, Tq);
+		    Tk = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Td = VZMULJ(Ta, T5);
+		    Tp = VZMULJ(T4, To);
+		    T6 = VZMULJ(T4, T5);
+		    Tv = VZMULJ(Tt, Tu);
+		    Tc = VZMULJ(Ta, Tb);
+		    Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+		    Ti = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    Tl = VZMULJ(T3, Tk);
+	       }
+	       {
+		    V TN, Ts, T8, Ty, TO, Tf, Tj;
+		    TN = VADD(Tp, Tr);
+		    Ts = VSUB(Tp, Tr);
+		    T8 = VZMULJ(T6, T7);
+		    Ty = VSUB(Tv, Tx);
+		    TO = VADD(Tv, Tx);
+		    Tf = VZMULJ(Td, Te);
+		    Tj = VZMULJ(Th, Ti);
+		    {
+			 V T9, TJ, TP, TU, Tz, TF, Tg, TK, Tm, TL;
+			 T9 = VSUB(T1, T8);
+			 TJ = VADD(T1, T8);
+			 TP = VADD(TN, TO);
+			 TU = VSUB(TN, TO);
+			 Tz = VADD(Ts, Ty);
+			 TF = VSUB(Ts, Ty);
+			 Tg = VSUB(Tc, Tf);
+			 TK = VADD(Tc, Tf);
+			 Tm = VSUB(Tj, Tl);
+			 TL = VADD(Tj, Tl);
+			 {
+			      V TM, TV, Tn, TE;
+			      TM = VADD(TK, TL);
+			      TV = VSUB(TK, TL);
+			      Tn = VADD(Tg, Tm);
+			      TE = VSUB(Tg, Tm);
+			      {
+				   V TW, TY, TS, TQ, TG, TI, TC, TA, TR, TB;
+				   TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TV, TU));
+				   TY = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TU, TV));
+				   TS = VSUB(TM, TP);
+				   TQ = VADD(TM, TP);
+				   TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TF, TE));
+				   TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TF));
+				   TC = VSUB(Tn, Tz);
+				   TA = VADD(Tn, Tz);
+				   ST(&(x[0]), VADD(TJ, TQ), ms, &(x[0]));
+				   TR = VFNMS(LDK(KP250000000), TQ, TJ);
+				   ST(&(x[WS(rs, 5)]), VADD(T9, TA), ms, &(x[WS(rs, 1)]));
+				   TB = VFNMS(LDK(KP250000000), TA, T9);
+				   {
+					V TX, TT, TH, TD;
+					TX = VFMA(LDK(KP559016994), TS, TR);
+					TT = VFNMS(LDK(KP559016994), TS, TR);
+					TH = VFNMS(LDK(KP559016994), TC, TB);
+					TD = VFMA(LDK(KP559016994), TC, TB);
+					ST(&(x[WS(rs, 8)]), VFNMSI(TW, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(TW, TT), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(TY, TX), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(TY, TX), ms, &(x[0]));
+					ST(&(x[WS(rs, 9)]), VFMAI(TG, TD), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(TG, TD), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(TI, TH), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 3)]), VFNMSI(TI, TH), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t3fv_10"), twinstr, &GENUS, {39, 34, 18, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3fv_10 -include t3f.h */
+
+/*
+ * This function contains 57 FP additions, 42 FP multiplications,
+ * (or, 51 additions, 36 multiplications, 6 fused multiply/add),
+ * 41 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
+	       V T1, T2, T3, Ti, T6, T7, Tx, Tb, To;
+	       T1 = LDW(&(W[0]));
+	       T2 = LDW(&(W[TWVL * 2]));
+	       T3 = VZMULJ(T1, T2);
+	       Ti = VZMUL(T1, T2);
+	       T6 = LDW(&(W[TWVL * 4]));
+	       T7 = VZMULJ(T3, T6);
+	       Tx = VZMULJ(Ti, T6);
+	       Tb = VZMULJ(T1, T6);
+	       To = VZMULJ(T2, T6);
+	       {
+		    V TA, TQ, Tn, Tt, Tu, TJ, TK, TS, Ta, Tg, Th, TM, TN, TR, Tw;
+		    V Tz, Ty;
+		    Tw = LD(&(x[0]), ms, &(x[0]));
+		    Ty = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+		    Tz = VZMULJ(Tx, Ty);
+		    TA = VSUB(Tw, Tz);
+		    TQ = VADD(Tw, Tz);
+		    {
+			 V Tk, Ts, Tm, Tq;
+			 {
+			      V Tj, Tr, Tl, Tp;
+			      Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Tk = VZMULJ(Ti, Tj);
+			      Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      Ts = VZMULJ(T1, Tr);
+			      Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      Tm = VZMULJ(T6, Tl);
+			      Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tq = VZMULJ(To, Tp);
+			 }
+			 Tn = VSUB(Tk, Tm);
+			 Tt = VSUB(Tq, Ts);
+			 Tu = VADD(Tn, Tt);
+			 TJ = VADD(Tk, Tm);
+			 TK = VADD(Tq, Ts);
+			 TS = VADD(TJ, TK);
+		    }
+		    {
+			 V T5, Tf, T9, Td;
+			 {
+			      V T4, Te, T8, Tc;
+			      T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      T5 = VZMULJ(T3, T4);
+			      Te = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Tf = VZMULJ(T2, Te);
+			      T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T9 = VZMULJ(T7, T8);
+			      Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Td = VZMULJ(Tb, Tc);
+			 }
+			 Ta = VSUB(T5, T9);
+			 Tg = VSUB(Td, Tf);
+			 Th = VADD(Ta, Tg);
+			 TM = VADD(T5, T9);
+			 TN = VADD(Td, Tf);
+			 TR = VADD(TM, TN);
+		    }
+		    {
+			 V Tv, TB, TC, TG, TI, TE, TF, TH, TD;
+			 Tv = VMUL(LDK(KP559016994), VSUB(Th, Tu));
+			 TB = VADD(Th, Tu);
+			 TC = VFNMS(LDK(KP250000000), TB, TA);
+			 TE = VSUB(Ta, Tg);
+			 TF = VSUB(Tn, Tt);
+			 TG = VBYI(VFMA(LDK(KP951056516), TE, VMUL(LDK(KP587785252), TF)));
+			 TI = VBYI(VFNMS(LDK(KP587785252), TE, VMUL(LDK(KP951056516), TF)));
+			 ST(&(x[WS(rs, 5)]), VADD(TA, TB), ms, &(x[WS(rs, 1)]));
+			 TH = VSUB(TC, Tv);
+			 ST(&(x[WS(rs, 3)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 7)]), VADD(TI, TH), ms, &(x[WS(rs, 1)]));
+			 TD = VADD(Tv, TC);
+			 ST(&(x[WS(rs, 1)]), VSUB(TD, TG), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VADD(TG, TD), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V TV, TT, TU, TP, TX, TL, TO, TY, TW;
+			 TV = VMUL(LDK(KP559016994), VSUB(TR, TS));
+			 TT = VADD(TR, TS);
+			 TU = VFNMS(LDK(KP250000000), TT, TQ);
+			 TL = VSUB(TJ, TK);
+			 TO = VSUB(TM, TN);
+			 TP = VBYI(VFNMS(LDK(KP587785252), TO, VMUL(LDK(KP951056516), TL)));
+			 TX = VBYI(VFMA(LDK(KP951056516), TO, VMUL(LDK(KP587785252), TL)));
+			 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
+			 TY = VADD(TV, TU);
+			 ST(&(x[WS(rs, 4)]), VADD(TX, TY), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VSUB(TY, TX), ms, &(x[0]));
+			 TW = VSUB(TU, TV);
+			 ST(&(x[WS(rs, 2)]), VADD(TP, TW), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VSUB(TW, TP), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 10, XSIMD_STRING("t3fv_10"), twinstr, &GENUS, {51, 36, 6, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_10) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3fv_16 -include t3f.h */
+
+/*
+ * This function contains 98 FP additions, 86 FP multiplications,
+ * (or, 64 additions, 52 multiplications, 34 fused multiply/add),
+ * 70 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T13, Tg, TY, T14, T1A, T1q, T1f, T1x, T1r, T1i, Tt, T16, TB, T1j, T1k;
+	       V TH;
+	       {
+		    V T2, T8, Tu, T3;
+		    T2 = LDW(&(W[0]));
+		    T8 = LDW(&(W[TWVL * 2]));
+		    Tu = LDW(&(W[TWVL * 6]));
+		    T3 = LDW(&(W[TWVL * 4]));
+		    {
+			 V Ty, T1o, Tf, T1b, T7, Tr, TR, TX, T1g, Tl, To, Tw, TG, Tz, T1p;
+			 V T1e, TC;
+			 {
+			      V T1, T5, Ta, Td;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Td = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      {
+				   V Tx, TO, TE, Tb, Tm, Tp, TN, Te, T6, TW, TP, TS;
+				   {
+					V TM, T9, TL, Tc, TU, T4, TV;
+					TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					Tx = VZMULJ(T2, T8);
+					T9 = VZMUL(T2, T8);
+					TL = VZMULJ(T2, Tu);
+					TO = VZMULJ(T8, T3);
+					Tc = VZMUL(T8, T3);
+					TU = VZMUL(T2, T3);
+					T4 = VZMULJ(T2, T3);
+					TV = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					TE = VZMUL(Tx, T3);
+					Ty = VZMULJ(Tx, T3);
+					Tb = VZMULJ(T9, Ta);
+					Tm = VZMULJ(T9, T3);
+					Tp = VZMUL(T9, T3);
+					TN = VZMULJ(TL, TM);
+					Te = VZMULJ(Tc, Td);
+					T6 = VZMULJ(T4, T5);
+					TW = VZMULJ(TU, TV);
+				   }
+				   TP = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   TS = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+				   {
+					V TQ, TT, Ti, Tk, Tn, Th, Tq, Tj;
+					Th = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					Tq = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+					Tj = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					T1o = VSUB(Tb, Te);
+					Tf = VADD(Tb, Te);
+					T1b = VSUB(T1, T6);
+					T7 = VADD(T1, T6);
+					TQ = VZMULJ(TO, TP);
+					TT = VZMULJ(Tx, TS);
+					Ti = VZMULJ(T2, Th);
+					Tr = VZMULJ(Tp, Tq);
+					Tk = VZMULJ(T3, Tj);
+					Tn = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					{
+					     V T1d, T1c, Tv, TF;
+					     Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					     TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					     T1d = VSUB(TN, TQ);
+					     TR = VADD(TN, TQ);
+					     T1c = VSUB(TT, TW);
+					     TX = VADD(TT, TW);
+					     T1g = VSUB(Ti, Tk);
+					     Tl = VADD(Ti, Tk);
+					     To = VZMULJ(Tm, Tn);
+					     Tw = VZMULJ(Tu, Tv);
+					     TG = VZMULJ(TE, TF);
+					     Tz = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T1p = VSUB(T1d, T1c);
+					     T1e = VADD(T1c, T1d);
+					     TC = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T1h, Ts, TA, TD;
+			      T13 = VADD(T7, Tf);
+			      Tg = VSUB(T7, Tf);
+			      T1h = VSUB(To, Tr);
+			      Ts = VADD(To, Tr);
+			      TY = VSUB(TR, TX);
+			      T14 = VADD(TX, TR);
+			      TA = VZMULJ(Ty, Tz);
+			      T1A = VFMA(LDK(KP707106781), T1p, T1o);
+			      T1q = VFNMS(LDK(KP707106781), T1p, T1o);
+			      T1f = VFMA(LDK(KP707106781), T1e, T1b);
+			      T1x = VFNMS(LDK(KP707106781), T1e, T1b);
+			      TD = VZMULJ(T8, TC);
+			      T1r = VFMA(LDK(KP414213562), T1g, T1h);
+			      T1i = VFNMS(LDK(KP414213562), T1h, T1g);
+			      Tt = VSUB(Tl, Ts);
+			      T16 = VADD(Tl, Ts);
+			      TB = VADD(Tw, TA);
+			      T1j = VSUB(Tw, TA);
+			      T1k = VSUB(TG, TD);
+			      TH = VADD(TD, TG);
+			 }
+		    }
+	       }
+	       {
+		    V T15, T19, T1l, T1s, TI, T17;
+		    T15 = VADD(T13, T14);
+		    T19 = VSUB(T13, T14);
+		    T1l = VFNMS(LDK(KP414213562), T1k, T1j);
+		    T1s = VFMA(LDK(KP414213562), T1j, T1k);
+		    TI = VSUB(TB, TH);
+		    T17 = VADD(TB, TH);
+		    {
+			 V T1y, T1t, T1B, T1m;
+			 T1y = VADD(T1r, T1s);
+			 T1t = VSUB(T1r, T1s);
+			 T1B = VSUB(T1l, T1i);
+			 T1m = VADD(T1i, T1l);
+			 {
+			      V T18, T1a, TJ, TZ;
+			      T18 = VADD(T16, T17);
+			      T1a = VSUB(T17, T16);
+			      TJ = VADD(Tt, TI);
+			      TZ = VSUB(TI, Tt);
+			      {
+				   V T1u, T1w, T1z, T1D;
+				   T1u = VFNMS(LDK(KP923879532), T1t, T1q);
+				   T1w = VFMA(LDK(KP923879532), T1t, T1q);
+				   T1z = VFNMS(LDK(KP923879532), T1y, T1x);
+				   T1D = VFMA(LDK(KP923879532), T1y, T1x);
+				   {
+					V T1n, T1v, T1C, T1E;
+					T1n = VFNMS(LDK(KP923879532), T1m, T1f);
+					T1v = VFMA(LDK(KP923879532), T1m, T1f);
+					T1C = VFNMS(LDK(KP923879532), T1B, T1A);
+					T1E = VFMA(LDK(KP923879532), T1B, T1A);
+					ST(&(x[WS(rs, 12)]), VFNMSI(T1a, T19), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(T1a, T19), ms, &(x[0]));
+					ST(&(x[0]), VADD(T15, T18), ms, &(x[0]));
+					ST(&(x[WS(rs, 8)]), VSUB(T15, T18), ms, &(x[0]));
+					{
+					     V T10, T12, TK, T11;
+					     T10 = VFNMS(LDK(KP707106781), TZ, TY);
+					     T12 = VFMA(LDK(KP707106781), TZ, TY);
+					     TK = VFNMS(LDK(KP707106781), TJ, Tg);
+					     T11 = VFMA(LDK(KP707106781), TJ, Tg);
+					     ST(&(x[WS(rs, 1)]), VFNMSI(T1w, T1v), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 15)]), VFMAI(T1w, T1v), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFMAI(T1u, T1n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 3)]), VFMAI(T1E, T1D), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 13)]), VFNMSI(T1E, T1D), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 11)]), VFMAI(T1C, T1z), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 5)]), VFNMSI(T1C, T1z), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 14)]), VFNMSI(T12, T11), ms, &(x[0]));
+					     ST(&(x[WS(rs, 2)]), VFMAI(T12, T11), ms, &(x[0]));
+					     ST(&(x[WS(rs, 10)]), VFMAI(T10, TK), ms, &(x[0]));
+					     ST(&(x[WS(rs, 6)]), VFNMSI(T10, TK), ms, &(x[0]));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t3fv_16"), twinstr, &GENUS, {64, 52, 34, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3fv_16 -include t3f.h */
+
+/*
+ * This function contains 98 FP additions, 64 FP multiplications,
+ * (or, 94 additions, 60 multiplications, 4 fused multiply/add),
+ * 51 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T4, T5, T6, To, T1, Ty, T7, T8, TO, TV, Te, Tp, TB, TH, Ts;
+	       T4 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 2]));
+	       T6 = VZMULJ(T4, T5);
+	       To = VZMUL(T4, T5);
+	       T1 = LDW(&(W[TWVL * 6]));
+	       Ty = VZMULJ(T4, T1);
+	       T7 = LDW(&(W[TWVL * 4]));
+	       T8 = VZMULJ(T6, T7);
+	       TO = VZMUL(T5, T7);
+	       TV = VZMULJ(T4, T7);
+	       Te = VZMUL(T6, T7);
+	       Tp = VZMULJ(To, T7);
+	       TB = VZMULJ(T5, T7);
+	       TH = VZMUL(T4, T7);
+	       Ts = VZMUL(To, T7);
+	       {
+		    V TY, T1f, TR, T1g, T1q, T1r, TL, TZ, T1l, T1m, T1n, Ti, T12, T1i, T1j;
+		    V T1k, Tw, T11, TU, TX, TW;
+		    TU = LD(&(x[0]), ms, &(x[0]));
+		    TW = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+		    TX = VZMULJ(TV, TW);
+		    TY = VSUB(TU, TX);
+		    T1f = VADD(TU, TX);
+		    {
+			 V TN, TQ, TM, TP;
+			 TM = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 TN = VZMULJ(To, TM);
+			 TP = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			 TQ = VZMULJ(TO, TP);
+			 TR = VSUB(TN, TQ);
+			 T1g = VADD(TN, TQ);
+		    }
+		    {
+			 V TA, TJ, TD, TG, TE, TK;
+			 {
+			      V Tz, TI, TC, TF;
+			      Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TA = VZMULJ(Ty, Tz);
+			      TI = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TJ = VZMULJ(TH, TI);
+			      TC = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TD = VZMULJ(TB, TC);
+			      TF = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      TG = VZMULJ(T6, TF);
+			 }
+			 T1q = VADD(TA, TD);
+			 T1r = VADD(TG, TJ);
+			 TE = VSUB(TA, TD);
+			 TK = VSUB(TG, TJ);
+			 TL = VMUL(LDK(KP707106781), VSUB(TE, TK));
+			 TZ = VMUL(LDK(KP707106781), VADD(TK, TE));
+		    }
+		    {
+			 V T3, Tg, Ta, Td, Tb, Th;
+			 {
+			      V T2, Tf, T9, Tc;
+			      T2 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T3 = VZMULJ(T1, T2);
+			      Tf = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      Tg = VZMULJ(Te, Tf);
+			      T9 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      Ta = VZMULJ(T8, T9);
+			      Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Td = VZMULJ(T5, Tc);
+			 }
+			 T1l = VADD(T3, Ta);
+			 T1m = VADD(Td, Tg);
+			 T1n = VSUB(T1l, T1m);
+			 Tb = VSUB(T3, Ta);
+			 Th = VSUB(Td, Tg);
+			 Ti = VFNMS(LDK(KP923879532), Th, VMUL(LDK(KP382683432), Tb));
+			 T12 = VFMA(LDK(KP923879532), Tb, VMUL(LDK(KP382683432), Th));
+		    }
+		    {
+			 V Tk, Tu, Tm, Tr, Tn, Tv;
+			 {
+			      V Tj, Tt, Tl, Tq;
+			      Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      Tk = VZMULJ(T4, Tj);
+			      Tt = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      Tu = VZMULJ(Ts, Tt);
+			      Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      Tm = VZMULJ(T7, Tl);
+			      Tq = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      Tr = VZMULJ(Tp, Tq);
+			 }
+			 T1i = VADD(Tk, Tm);
+			 T1j = VADD(Tr, Tu);
+			 T1k = VSUB(T1i, T1j);
+			 Tn = VSUB(Tk, Tm);
+			 Tv = VSUB(Tr, Tu);
+			 Tw = VFMA(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tv));
+			 T11 = VFNMS(LDK(KP382683432), Tv, VMUL(LDK(KP923879532), Tn));
+		    }
+		    {
+			 V T1p, T1v, T1u, T1w;
+			 {
+			      V T1h, T1o, T1s, T1t;
+			      T1h = VSUB(T1f, T1g);
+			      T1o = VMUL(LDK(KP707106781), VADD(T1k, T1n));
+			      T1p = VADD(T1h, T1o);
+			      T1v = VSUB(T1h, T1o);
+			      T1s = VSUB(T1q, T1r);
+			      T1t = VMUL(LDK(KP707106781), VSUB(T1n, T1k));
+			      T1u = VBYI(VADD(T1s, T1t));
+			      T1w = VBYI(VSUB(T1t, T1s));
+			 }
+			 ST(&(x[WS(rs, 14)]), VSUB(T1p, T1u), ms, &(x[0]));
+			 ST(&(x[WS(rs, 6)]), VADD(T1v, T1w), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(T1p, T1u), ms, &(x[0]));
+			 ST(&(x[WS(rs, 10)]), VSUB(T1v, T1w), ms, &(x[0]));
+		    }
+		    {
+			 V T1z, T1D, T1C, T1E;
+			 {
+			      V T1x, T1y, T1A, T1B;
+			      T1x = VADD(T1f, T1g);
+			      T1y = VADD(T1r, T1q);
+			      T1z = VADD(T1x, T1y);
+			      T1D = VSUB(T1x, T1y);
+			      T1A = VADD(T1i, T1j);
+			      T1B = VADD(T1l, T1m);
+			      T1C = VADD(T1A, T1B);
+			      T1E = VBYI(VSUB(T1B, T1A));
+			 }
+			 ST(&(x[WS(rs, 8)]), VSUB(T1z, T1C), ms, &(x[0]));
+			 ST(&(x[WS(rs, 4)]), VADD(T1D, T1E), ms, &(x[0]));
+			 ST(&(x[0]), VADD(T1z, T1C), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VSUB(T1D, T1E), ms, &(x[0]));
+		    }
+		    {
+			 V TT, T15, T14, T16;
+			 {
+			      V Tx, TS, T10, T13;
+			      Tx = VSUB(Ti, Tw);
+			      TS = VSUB(TL, TR);
+			      TT = VBYI(VSUB(Tx, TS));
+			      T15 = VBYI(VADD(TS, Tx));
+			      T10 = VADD(TY, TZ);
+			      T13 = VADD(T11, T12);
+			      T14 = VSUB(T10, T13);
+			      T16 = VADD(T10, T13);
+			 }
+			 ST(&(x[WS(rs, 7)]), VADD(TT, T14), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 15)]), VSUB(T16, T15), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 9)]), VSUB(T14, TT), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VADD(T15, T16), ms, &(x[WS(rs, 1)]));
+		    }
+		    {
+			 V T19, T1d, T1c, T1e;
+			 {
+			      V T17, T18, T1a, T1b;
+			      T17 = VSUB(TY, TZ);
+			      T18 = VADD(Tw, Ti);
+			      T19 = VADD(T17, T18);
+			      T1d = VSUB(T17, T18);
+			      T1a = VADD(TR, TL);
+			      T1b = VSUB(T12, T11);
+			      T1c = VBYI(VADD(T1a, T1b));
+			      T1e = VBYI(VSUB(T1b, T1a));
+			 }
+			 ST(&(x[WS(rs, 13)]), VSUB(T19, T1c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VADD(T1d, T1e), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T19, T1c), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 16, XSIMD_STRING("t3fv_16"), twinstr, &GENUS, {94, 60, 4, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_16) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:55 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include t3f.h */
+
+/*
+ * This function contains 138 FP additions, 118 FP multiplications,
+ * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
+ * 90 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T1k, T1w, T1r, T1z, T1o, T1y, T1v, T1h;
+	       {
+		    V T2, T8, T3, Td;
+		    T2 = LDW(&(W[0]));
+		    T8 = LDW(&(W[TWVL * 2]));
+		    T3 = LDW(&(W[TWVL * 4]));
+		    Td = LDW(&(W[TWVL * 6]));
+		    {
+			 V T7, TM, T1F, T23, T1p, Tp, T1j, T27, T1P, T1I, T1i, T1L, T28, T1S, T1q;
+			 V TE, T1n, T1d, T26, T2e;
+			 {
+			      V T1, TK, T5, TH;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      {
+				   V TA, Tx, TU, T1O, T14, Th, T1G, T1R, T1b, T1J, To, Ts, TV, Tv, TO;
+				   V TQ, TT, Ty, TB;
+				   {
+					V Tq, Tt, T17, T1a, Tk, Tn;
+					{
+					     V Tl, Ti, T15, T18, TZ, Tc, T6, Tb, Tf, T10, T12, TL;
+					     {
+						  V TJ, Ta, T9, T4;
+						  Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+						  TA = VZMULJ(T2, T8);
+						  T9 = VZMUL(T2, T8);
+						  Tx = VZMUL(T8, T3);
+						  Tl = VZMULJ(T8, T3);
+						  T4 = VZMUL(T2, T3);
+						  Tq = VZMULJ(T2, T3);
+						  Tt = VZMULJ(T2, Td);
+						  Ti = VZMULJ(T8, Td);
+						  T15 = VZMULJ(TA, Td);
+						  T18 = VZMULJ(TA, T3);
+						  TU = VZMUL(TA, T3);
+						  TJ = VZMULJ(T9, Td);
+						  TZ = VZMUL(T9, T3);
+						  Tc = VZMULJ(T9, T3);
+						  T6 = VZMULJ(T4, T5);
+						  Tb = VZMULJ(T9, Ta);
+						  Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+						  T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						  T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						  TL = VZMULJ(TJ, TK);
+					     }
+					     {
+						  V T1D, T11, T13, T19, T1E, Tg, T16, TI, Te, Tj, Tm;
+						  T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  TI = VZMULJ(Tc, TH);
+						  Te = VZMULJ(Tc, Td);
+						  T7 = VSUB(T1, T6);
+						  T1D = VADD(T1, T6);
+						  T11 = VZMULJ(TZ, T10);
+						  T13 = VZMULJ(T8, T12);
+						  T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  T17 = VZMULJ(T15, T16);
+						  TM = VSUB(TI, TL);
+						  T1E = VADD(TI, TL);
+						  Tg = VZMULJ(Te, Tf);
+						  Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+						  Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+						  T1O = VADD(T11, T13);
+						  T14 = VSUB(T11, T13);
+						  T1a = VZMULJ(T18, T19);
+						  T1F = VSUB(T1D, T1E);
+						  T23 = VADD(T1D, T1E);
+						  Th = VSUB(Tb, Tg);
+						  T1G = VADD(Tb, Tg);
+						  Tk = VZMULJ(Ti, Tj);
+						  Tn = VZMULJ(Tl, Tm);
+					     }
+					}
+					{
+					     V Tr, Tu, TN, TP, TS;
+					     Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+					     T1R = VADD(T17, T1a);
+					     T1b = VSUB(T17, T1a);
+					     Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					     TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+					     T1J = VADD(Tk, Tn);
+					     To = VSUB(Tk, Tn);
+					     Ts = VZMULJ(Tq, Tr);
+					     TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+					     Tv = VZMULJ(Tt, Tu);
+					     TO = VZMULJ(T3, TN);
+					     TQ = VZMULJ(Td, TP);
+					     TT = VZMULJ(T2, TS);
+					     Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					     TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					}
+				   }
+				   {
+					V T1N, Tw, T1H, TR, Tz, TC, T1c, TX, T1K, TW;
+					T1p = VSUB(Th, To);
+					Tp = VADD(Th, To);
+					TW = VZMULJ(TU, TV);
+					T1N = VADD(Ts, Tv);
+					Tw = VSUB(Ts, Tv);
+					T1H = VADD(TO, TQ);
+					TR = VSUB(TO, TQ);
+					Tz = VZMULJ(Tx, Ty);
+					TC = VZMULJ(TA, TB);
+					T1j = VSUB(T1b, T14);
+					T1c = VADD(T14, T1b);
+					TX = VSUB(TT, TW);
+					T1K = VADD(TT, TW);
+					T27 = VADD(T1N, T1O);
+					T1P = VSUB(T1N, T1O);
+					{
+					     V TD, T1Q, T24, TY, T25;
+					     TD = VSUB(Tz, TC);
+					     T1Q = VADD(Tz, TC);
+					     T1I = VSUB(T1G, T1H);
+					     T24 = VADD(T1G, T1H);
+					     TY = VADD(TR, TX);
+					     T1i = VSUB(TX, TR);
+					     T25 = VADD(T1J, T1K);
+					     T1L = VSUB(T1J, T1K);
+					     T28 = VADD(T1Q, T1R);
+					     T1S = VSUB(T1Q, T1R);
+					     T1q = VSUB(Tw, TD);
+					     TE = VADD(Tw, TD);
+					     T1n = VSUB(T1c, TY);
+					     T1d = VADD(TY, T1c);
+					     T26 = VADD(T24, T25);
+					     T2e = VSUB(T24, T25);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T1M, T1Z, T1Y, T1T, T29, T2f, T1g, TF, T1m, T1e;
+			      T1M = VADD(T1I, T1L);
+			      T1Z = VSUB(T1I, T1L);
+			      T1Y = VSUB(T1P, T1S);
+			      T1T = VADD(T1P, T1S);
+			      T29 = VADD(T27, T28);
+			      T2f = VSUB(T27, T28);
+			      T1g = VSUB(Tp, TE);
+			      TF = VADD(Tp, TE);
+			      T1m = VFNMS(LDK(KP250000000), T1d, TM);
+			      T1e = VADD(TM, T1d);
+			      {
+				   V T1W, T2c, T1f, T2i, T2g, T22, T20, T1V, T2b, T1U, T2a, TG;
+				   T1k = VFMA(LDK(KP618033988), T1j, T1i);
+				   T1w = VFNMS(LDK(KP618033988), T1i, T1j);
+				   T1W = VSUB(T1M, T1T);
+				   T1U = VADD(T1M, T1T);
+				   T2c = VSUB(T26, T29);
+				   T2a = VADD(T26, T29);
+				   T1f = VFNMS(LDK(KP250000000), TF, T7);
+				   TG = VADD(T7, TF);
+				   T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
+				   T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
+				   T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
+				   T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
+				   ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
+				   T1V = VFNMS(LDK(KP250000000), T1U, T1F);
+				   ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
+				   T2b = VFNMS(LDK(KP250000000), T2a, T23);
+				   ST(&(x[WS(rs, 15)]), VFMAI(T1e, TG), ms, &(x[WS(rs, 1)]));
+				   ST(&(x[WS(rs, 5)]), VFNMSI(T1e, TG), ms, &(x[WS(rs, 1)]));
+				   T1r = VFMA(LDK(KP618033988), T1q, T1p);
+				   T1z = VFNMS(LDK(KP618033988), T1p, T1q);
+				   {
+					V T21, T1X, T2h, T2d;
+					T21 = VFMA(LDK(KP559016994), T1W, T1V);
+					T1X = VFNMS(LDK(KP559016994), T1W, T1V);
+					T2h = VFNMS(LDK(KP559016994), T2c, T2b);
+					T2d = VFMA(LDK(KP559016994), T2c, T2b);
+					ST(&(x[WS(rs, 18)]), VFNMSI(T20, T1X), ms, &(x[0]));
+					ST(&(x[WS(rs, 2)]), VFMAI(T20, T1X), ms, &(x[0]));
+					ST(&(x[WS(rs, 14)]), VFMAI(T22, T21), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(T22, T21), ms, &(x[0]));
+					ST(&(x[WS(rs, 16)]), VFNMSI(T2g, T2d), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VFMAI(T2g, T2d), ms, &(x[0]));
+					ST(&(x[WS(rs, 12)]), VFMAI(T2i, T2h), ms, &(x[0]));
+					ST(&(x[WS(rs, 8)]), VFNMSI(T2i, T2h), ms, &(x[0]));
+					T1o = VFNMS(LDK(KP559016994), T1n, T1m);
+					T1y = VFMA(LDK(KP559016994), T1n, T1m);
+					T1v = VFNMS(LDK(KP559016994), T1g, T1f);
+					T1h = VFMA(LDK(KP559016994), T1g, T1f);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T1C, T1A, T1s, T1u, T1l, T1t, T1B, T1x;
+		    T1C = VFMA(LDK(KP951056516), T1z, T1y);
+		    T1A = VFNMS(LDK(KP951056516), T1z, T1y);
+		    T1s = VFMA(LDK(KP951056516), T1r, T1o);
+		    T1u = VFNMS(LDK(KP951056516), T1r, T1o);
+		    T1l = VFMA(LDK(KP951056516), T1k, T1h);
+		    T1t = VFNMS(LDK(KP951056516), T1k, T1h);
+		    T1B = VFMA(LDK(KP951056516), T1w, T1v);
+		    T1x = VFNMS(LDK(KP951056516), T1w, T1v);
+		    ST(&(x[WS(rs, 11)]), VFMAI(T1u, T1t), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 19)]), VFMAI(T1s, T1l), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T1s, T1l), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 3)]), VFMAI(T1A, T1x), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 17)]), VFNMSI(T1A, T1x), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 7)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 13)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include t3f.h */
+
+/*
+ * This function contains 138 FP additions, 92 FP multiplications,
+ * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
+ * 73 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
+	       V T2, T8, T9, TA, T3, Tc, T4, TZ, T18, Tl, Tq, Tx, TU, Td, Te;
+	       V T15, Ti, Tt, TJ;
+	       T2 = LDW(&(W[0]));
+	       T8 = LDW(&(W[TWVL * 2]));
+	       T9 = VZMUL(T2, T8);
+	       TA = VZMULJ(T2, T8);
+	       T3 = LDW(&(W[TWVL * 4]));
+	       Tc = VZMULJ(T9, T3);
+	       T4 = VZMUL(T2, T3);
+	       TZ = VZMUL(T9, T3);
+	       T18 = VZMULJ(TA, T3);
+	       Tl = VZMULJ(T8, T3);
+	       Tq = VZMULJ(T2, T3);
+	       Tx = VZMUL(T8, T3);
+	       TU = VZMUL(TA, T3);
+	       Td = LDW(&(W[TWVL * 6]));
+	       Te = VZMULJ(Tc, Td);
+	       T15 = VZMULJ(TA, Td);
+	       Ti = VZMULJ(T8, Td);
+	       Tt = VZMULJ(T2, Td);
+	       TJ = VZMULJ(T9, Td);
+	       {
+		    V T7, TM, T1U, T2d, T1i, T1p, T1q, T1j, Tp, TE, TF, T26, T27, T2b, T1M;
+		    V T1P, T1V, TY, T1c, T1d, T23, T24, T2a, T1F, T1I, T1W, TG, T1e;
+		    {
+			 V T1, TL, T6, TI, TK, T5, TH, T1S, T1T;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			 TL = VZMULJ(TJ, TK);
+			 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			 T6 = VZMULJ(T4, T5);
+			 TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 TI = VZMULJ(Tc, TH);
+			 T7 = VSUB(T1, T6);
+			 TM = VSUB(TI, TL);
+			 T1S = VADD(T1, T6);
+			 T1T = VADD(TI, TL);
+			 T1U = VSUB(T1S, T1T);
+			 T2d = VADD(T1S, T1T);
+		    }
+		    {
+			 V Th, T1K, T14, T1E, T1b, T1H, To, T1N, Tw, T1D, TR, T1L, TX, T1O, TD;
+			 V T1G;
+			 {
+			      V Tb, Tg, Ta, Tf;
+			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Tb = VZMULJ(T9, Ta);
+			      Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      Tg = VZMULJ(Te, Tf);
+			      Th = VSUB(Tb, Tg);
+			      T1K = VADD(Tb, Tg);
+			 }
+			 {
+			      V T11, T13, T10, T12;
+			      T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      T11 = VZMULJ(TZ, T10);
+			      T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      T13 = VZMULJ(T8, T12);
+			      T14 = VSUB(T11, T13);
+			      T1E = VADD(T11, T13);
+			 }
+			 {
+			      V T17, T1a, T16, T19;
+			      T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T17 = VZMULJ(T15, T16);
+			      T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T1a = VZMULJ(T18, T19);
+			      T1b = VSUB(T17, T1a);
+			      T1H = VADD(T17, T1a);
+			 }
+			 {
+			      V Tk, Tn, Tj, Tm;
+			      Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      Tk = VZMULJ(Ti, Tj);
+			      Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tn = VZMULJ(Tl, Tm);
+			      To = VSUB(Tk, Tn);
+			      T1N = VADD(Tk, Tn);
+			 }
+			 {
+			      V Ts, Tv, Tr, Tu;
+			      Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      Ts = VZMULJ(Tq, Tr);
+			      Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tv = VZMULJ(Tt, Tu);
+			      Tw = VSUB(Ts, Tv);
+			      T1D = VADD(Ts, Tv);
+			 }
+			 {
+			      V TO, TQ, TN, TP;
+			      TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      TO = VZMULJ(T3, TN);
+			      TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      TQ = VZMULJ(Td, TP);
+			      TR = VSUB(TO, TQ);
+			      T1L = VADD(TO, TQ);
+			 }
+			 {
+			      V TT, TW, TS, TV;
+			      TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      TT = VZMULJ(T2, TS);
+			      TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      TW = VZMULJ(TU, TV);
+			      TX = VSUB(TT, TW);
+			      T1O = VADD(TT, TW);
+			 }
+			 {
+			      V Tz, TC, Ty, TB;
+			      Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      Tz = VZMULJ(Tx, Ty);
+			      TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      TC = VZMULJ(TA, TB);
+			      TD = VSUB(Tz, TC);
+			      T1G = VADD(Tz, TC);
+			 }
+			 T1i = VSUB(TX, TR);
+			 T1p = VSUB(Th, To);
+			 T1q = VSUB(Tw, TD);
+			 T1j = VSUB(T1b, T14);
+			 Tp = VADD(Th, To);
+			 TE = VADD(Tw, TD);
+			 TF = VADD(Tp, TE);
+			 T26 = VADD(T1D, T1E);
+			 T27 = VADD(T1G, T1H);
+			 T2b = VADD(T26, T27);
+			 T1M = VSUB(T1K, T1L);
+			 T1P = VSUB(T1N, T1O);
+			 T1V = VADD(T1M, T1P);
+			 TY = VADD(TR, TX);
+			 T1c = VADD(T14, T1b);
+			 T1d = VADD(TY, T1c);
+			 T23 = VADD(T1K, T1L);
+			 T24 = VADD(T1N, T1O);
+			 T2a = VADD(T23, T24);
+			 T1F = VSUB(T1D, T1E);
+			 T1I = VSUB(T1G, T1H);
+			 T1W = VADD(T1F, T1I);
+		    }
+		    TG = VADD(T7, TF);
+		    T1e = VBYI(VADD(TM, T1d));
+		    ST(&(x[WS(rs, 5)]), VSUB(TG, T1e), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 15)]), VADD(TG, T1e), ms, &(x[WS(rs, 1)]));
+		    {
+			 V T2c, T2e, T2f, T29, T2i, T25, T28, T2h, T2g;
+			 T2c = VMUL(LDK(KP559016994), VSUB(T2a, T2b));
+			 T2e = VADD(T2a, T2b);
+			 T2f = VFNMS(LDK(KP250000000), T2e, T2d);
+			 T25 = VSUB(T23, T24);
+			 T28 = VSUB(T26, T27);
+			 T29 = VBYI(VFMA(LDK(KP951056516), T25, VMUL(LDK(KP587785252), T28)));
+			 T2i = VBYI(VFNMS(LDK(KP587785252), T25, VMUL(LDK(KP951056516), T28)));
+			 ST(&(x[0]), VADD(T2d, T2e), ms, &(x[0]));
+			 T2h = VSUB(T2f, T2c);
+			 ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
+			 ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
+			 T2g = VADD(T2c, T2f);
+			 ST(&(x[WS(rs, 4)]), VADD(T29, T2g), ms, &(x[0]));
+			 ST(&(x[WS(rs, 16)]), VSUB(T2g, T29), ms, &(x[0]));
+		    }
+		    {
+			 V T1Z, T1X, T1Y, T1R, T22, T1J, T1Q, T21, T20;
+			 T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
+			 T1X = VADD(T1V, T1W);
+			 T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
+			 T1J = VSUB(T1F, T1I);
+			 T1Q = VSUB(T1M, T1P);
+			 T1R = VBYI(VFNMS(LDK(KP587785252), T1Q, VMUL(LDK(KP951056516), T1J)));
+			 T22 = VBYI(VFMA(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
+			 ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
+			 T21 = VADD(T1Z, T1Y);
+			 ST(&(x[WS(rs, 6)]), VSUB(T21, T22), ms, &(x[0]));
+			 ST(&(x[WS(rs, 14)]), VADD(T22, T21), ms, &(x[0]));
+			 T20 = VSUB(T1Y, T1Z);
+			 ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
+			 ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
+		    }
+		    {
+			 V T1k, T1r, T1z, T1w, T1o, T1y, T1h, T1v;
+			 T1k = VFMA(LDK(KP951056516), T1i, VMUL(LDK(KP587785252), T1j));
+			 T1r = VFMA(LDK(KP951056516), T1p, VMUL(LDK(KP587785252), T1q));
+			 T1z = VFNMS(LDK(KP587785252), T1p, VMUL(LDK(KP951056516), T1q));
+			 T1w = VFNMS(LDK(KP587785252), T1i, VMUL(LDK(KP951056516), T1j));
+			 {
+			      V T1m, T1n, T1f, T1g;
+			      T1m = VFMS(LDK(KP250000000), T1d, TM);
+			      T1n = VMUL(LDK(KP559016994), VSUB(T1c, TY));
+			      T1o = VADD(T1m, T1n);
+			      T1y = VSUB(T1n, T1m);
+			      T1f = VMUL(LDK(KP559016994), VSUB(Tp, TE));
+			      T1g = VFNMS(LDK(KP250000000), TF, T7);
+			      T1h = VADD(T1f, T1g);
+			      T1v = VSUB(T1g, T1f);
+			 }
+			 {
+			      V T1l, T1s, T1B, T1C;
+			      T1l = VADD(T1h, T1k);
+			      T1s = VBYI(VSUB(T1o, T1r));
+			      ST(&(x[WS(rs, 19)]), VSUB(T1l, T1s), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VADD(T1l, T1s), ms, &(x[WS(rs, 1)]));
+			      T1B = VADD(T1v, T1w);
+			      T1C = VBYI(VADD(T1z, T1y));
+			      ST(&(x[WS(rs, 13)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 7)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T1t, T1u, T1x, T1A;
+			      T1t = VSUB(T1h, T1k);
+			      T1u = VBYI(VADD(T1r, T1o));
+			      ST(&(x[WS(rs, 11)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 9)]), VADD(T1t, T1u), ms, &(x[WS(rs, 1)]));
+			      T1x = VSUB(T1v, T1w);
+			      T1A = VBYI(VSUB(T1y, T1z));
+			      ST(&(x[WS(rs, 17)]), VSUB(T1x, T1A), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 3)]), VADD(T1x, T1A), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_20) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:56 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 25 -name t3fv_25 -include t3f.h */
+
+/*
+ * This function contains 268 FP additions, 281 FP multiplications,
+ * (or, 87 additions, 100 multiplications, 181 fused multiply/add),
+ * 223 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
+     DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T2t, T1Z, T2W, T28, T2Q, T2r, T2g, T2u, T2o, T2l;
+	       {
+		    V T2, T5, T3, T9;
+		    T2 = LDW(&(W[0]));
+		    T5 = LDW(&(W[TWVL * 4]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T9 = LDW(&(W[TWVL * 6]));
+		    {
+			 V T2c, T3l, Tn, T49, Tm, T4e, TN, T32, T1d, T3a, T3f, T3z, T3H, T25, T1W;
+			 V T2v, T2D, T4a, T1g, T18, T2Z, T11, T31, TK, T1q, T1j, T1n, T4b, T17;
+			 {
+			      V T1, T1l, Tr, T4, Ty, T1E, Tu, TX, TD, T1h, Tz, T1e, T1I, T1o, TU;
+			      V Tk, T2b, T1B, T1D, T1N, T1F, Td, T2a, T1J;
+			      {
+				   V T7, Tb, TC, Tg, T1L, Ta, T6, Tj, T1A;
+				   T1 = LD(&(x[0]), ms, &(x[0]));
+				   {
+					V Tf, Ti, Te, Th;
+					Tf = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+					Ti = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+					T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+					Tb = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+					Te = VZMUL(T2, T5);
+					TC = VZMULJ(T2, T5);
+					T1l = VZMUL(T3, T5);
+					Tr = VZMULJ(T3, T5);
+					T4 = VZMUL(T2, T3);
+					Ty = VZMULJ(T2, T3);
+					T1E = VZMULJ(T2, T9);
+					Th = VZMULJ(T5, T9);
+					Tu = VZMULJ(T3, T9);
+					Tg = VZMULJ(Te, Tf);
+					TX = VZMULJ(Te, T9);
+					TD = VZMULJ(TC, T9);
+					T1h = VZMULJ(Ty, T9);
+					Tz = VZMUL(Ty, T5);
+					T1e = VZMULJ(Ty, T5);
+					T1L = VZMULJ(Tr, T9);
+					Ta = VZMULJ(T4, T9);
+					T1I = VZMUL(T4, T5);
+					T6 = VZMULJ(T4, T5);
+					Tj = VZMULJ(Th, Ti);
+				   }
+				   T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   T1o = VZMULJ(T1e, T9);
+				   {
+					V Tc, T8, T1C, T1M;
+					T1C = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+					T1M = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					Tc = VZMULJ(Ta, Tb);
+					T8 = VZMULJ(T6, T7);
+					TU = VZMULJ(T6, T9);
+					Tk = VADD(Tg, Tj);
+					T2b = VSUB(Tg, Tj);
+					T1B = VZMULJ(T3, T1A);
+					T1D = VZMULJ(TC, T1C);
+					T1N = VZMULJ(T1L, T1M);
+					T1F = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+					Td = VADD(T8, Tc);
+					T2a = VSUB(T8, Tc);
+					T1J = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			      {
+				   V Tq, Tt, TF, T1T, T1H, Tw, T1U, T1O, TA, Tp, Ts, TE;
+				   Tp = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+				   Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   TE = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+				   {
+					V T1K, Tv, T1G, Tl;
+					Tv = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					T1G = VZMULJ(T1E, T1F);
+					T2c = VFMA(LDK(KP618033988), T2b, T2a);
+					T3l = VFNMS(LDK(KP618033988), T2a, T2b);
+					Tn = VSUB(Td, Tk);
+					Tl = VADD(Td, Tk);
+					T1K = VZMULJ(T1I, T1J);
+					Tq = VZMULJ(T2, Tp);
+					Tt = VZMULJ(Tr, Ts);
+					TF = VZMULJ(TD, TE);
+					T1T = VSUB(T1D, T1G);
+					T1H = VADD(T1D, T1G);
+					T49 = VADD(T1, Tl);
+					Tm = VFNMS(LDK(KP250000000), Tl, T1);
+					Tw = VZMULJ(Tu, Tv);
+					T1U = VSUB(T1K, T1N);
+					T1O = VADD(T1K, T1N);
+					TA = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   }
+				   {
+					V Tx, TL, T1R, T38, T1V, T13, TQ, TZ, TS, T1Q, TV, TG, TM, T12, T1c;
+					V T16;
+					T12 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					{
+					     V TP, TY, T1P, TB, TR;
+					     TP = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+					     TY = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+					     TR = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+					     Tx = VADD(Tt, Tw);
+					     TL = VSUB(Tt, Tw);
+					     T1R = VSUB(T1O, T1H);
+					     T1P = VADD(T1H, T1O);
+					     T38 = VFNMS(LDK(KP618033988), T1T, T1U);
+					     T1V = VFMA(LDK(KP618033988), T1U, T1T);
+					     TB = VZMULJ(Tz, TA);
+					     T13 = VZMULJ(T4, T12);
+					     TQ = VZMULJ(T9, TP);
+					     TZ = VZMULJ(TX, TY);
+					     TS = VZMULJ(T5, TR);
+					     T4e = VADD(T1B, T1P);
+					     T1Q = VFNMS(LDK(KP250000000), T1P, T1B);
+					     TV = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+					     TG = VADD(TB, TF);
+					     TM = VSUB(TF, TB);
+					}
+					T1c = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					{
+					     V T14, TT, TJ, T15, T10, TI, T1p, T1f, T1i, T1m;
+					     T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+					     T14 = VADD(TS, TQ);
+					     TT = VSUB(TQ, TS);
+					     {
+						  V T39, T1S, TW, TH;
+						  T39 = VFMA(LDK(KP559016994), T1R, T1Q);
+						  T1S = VFNMS(LDK(KP559016994), T1R, T1Q);
+						  TW = VZMULJ(TU, TV);
+						  TH = VADD(Tx, TG);
+						  TJ = VSUB(Tx, TG);
+						  TN = VFNMS(LDK(KP618033988), TM, TL);
+						  T32 = VFMA(LDK(KP618033988), TL, TM);
+						  T1d = VZMULJ(Ty, T1c);
+						  T3a = VFMA(LDK(KP869845200), T39, T38);
+						  T3f = VFNMS(LDK(KP786782374), T38, T39);
+						  T3z = VFMA(LDK(KP066152395), T39, T38);
+						  T3H = VFNMS(LDK(KP059835404), T38, T39);
+						  T25 = VFMA(LDK(KP987388751), T1S, T1V);
+						  T1W = VFNMS(LDK(KP893101515), T1V, T1S);
+						  T2v = VFNMS(LDK(KP120146378), T1V, T1S);
+						  T2D = VFMA(LDK(KP132830569), T1S, T1V);
+						  T15 = VADD(TZ, TW);
+						  T10 = VSUB(TW, TZ);
+						  TI = VFNMS(LDK(KP250000000), TH, Tq);
+						  T4a = VADD(Tq, TH);
+						  T1g = VZMULJ(T1e, T1f);
+					     }
+					     T1p = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+					     T1i = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+					     T1m = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+					     T18 = VSUB(T14, T15);
+					     T16 = VADD(T14, T15);
+					     T2Z = VFNMS(LDK(KP618033988), TT, T10);
+					     T11 = VFMA(LDK(KP618033988), T10, TT);
+					     T31 = VFNMS(LDK(KP559016994), TJ, TI);
+					     TK = VFMA(LDK(KP559016994), TJ, TI);
+					     T1q = VZMULJ(T1o, T1p);
+					     T1j = VZMULJ(T1h, T1i);
+					     T1n = VZMULJ(T1l, T1m);
+					}
+					T4b = VADD(T13, T16);
+					T17 = VFMS(LDK(KP250000000), T16, T13);
+				   }
+			      }
+			 }
+			 {
+			      V T33, T3i, T3C, T3L, T20, TO, T2y, T2G, T1k, T1w, T1r, T1x, T2Y, T19, T4k;
+			      V T4c;
+			      T33 = VFMA(LDK(KP893101515), T32, T31);
+			      T3i = VFNMS(LDK(KP987388751), T31, T32);
+			      T3C = VFNMS(LDK(KP522847744), T32, T31);
+			      T3L = VFMA(LDK(KP578046249), T31, T32);
+			      T20 = VFMA(LDK(KP269969613), TK, TN);
+			      TO = VFNMS(LDK(KP244189809), TN, TK);
+			      T2y = VFMA(LDK(KP667278218), TK, TN);
+			      T2G = VFNMS(LDK(KP603558818), TN, TK);
+			      T1k = VADD(T1g, T1j);
+			      T1w = VSUB(T1g, T1j);
+			      T1r = VADD(T1n, T1q);
+			      T1x = VSUB(T1q, T1n);
+			      T2Y = VFMA(LDK(KP559016994), T18, T17);
+			      T19 = VFNMS(LDK(KP559016994), T18, T17);
+			      T4k = VSUB(T4a, T4b);
+			      T4c = VADD(T4a, T4b);
+			      {
+				   V T2X, To, T35, T1y, T2H, T2z, T1a, T21, T3t, T34, T3n, T3j, T3E, T3Y, T3M;
+				   V T3R, T1v, T36, T4l, T4f, T1u, T1s;
+				   T2X = VFNMS(LDK(KP559016994), Tn, Tm);
+				   To = VFMA(LDK(KP559016994), Tn, Tm);
+				   T1u = VSUB(T1r, T1k);
+				   T1s = VADD(T1k, T1r);
+				   T35 = VFMA(LDK(KP618033988), T1w, T1x);
+				   T1y = VFNMS(LDK(KP618033988), T1x, T1w);
+				   {
+					V T3K, T30, T3h, T3D, T4d, T1t;
+					T3K = VFMA(LDK(KP447533225), T2Z, T2Y);
+					T30 = VFMA(LDK(KP120146378), T2Z, T2Y);
+					T3h = VFNMS(LDK(KP132830569), T2Y, T2Z);
+					T3D = VFNMS(LDK(KP494780565), T2Y, T2Z);
+					T2H = VFNMS(LDK(KP786782374), T11, T19);
+					T2z = VFMA(LDK(KP869845200), T19, T11);
+					T1a = VFNMS(LDK(KP667278218), T19, T11);
+					T21 = VFMA(LDK(KP603558818), T11, T19);
+					T4d = VADD(T1d, T1s);
+					T1t = VFNMS(LDK(KP250000000), T1s, T1d);
+					T3t = VFNMS(LDK(KP734762448), T33, T30);
+					T34 = VFMA(LDK(KP734762448), T33, T30);
+					T3n = VFMA(LDK(KP734762448), T3i, T3h);
+					T3j = VFNMS(LDK(KP734762448), T3i, T3h);
+					T3E = VFNMS(LDK(KP982009705), T3D, T3C);
+					T3Y = VFMA(LDK(KP982009705), T3D, T3C);
+					T3M = VFNMS(LDK(KP921078979), T3L, T3K);
+					T3R = VFMA(LDK(KP921078979), T3L, T3K);
+					T1v = VFNMS(LDK(KP559016994), T1u, T1t);
+					T36 = VFMA(LDK(KP559016994), T1u, T1t);
+					T4l = VSUB(T4d, T4e);
+					T4f = VADD(T4d, T4e);
+				   }
+				   {
+					V T2L, T2R, T2j, T2q, T2J, T2B, T2e, T26, T2U, T1Y, T23, T2O;
+					{
+					     V T2I, T24, T2w, T2E, T48, T42, T3y, T3s, T3V, T45, T2A, T1b, T2h, T2i, T1X;
+					     T2L = VFNMS(LDK(KP912575812), T2H, T2G);
+					     T2I = VFMA(LDK(KP912575812), T2H, T2G);
+					     {
+						  V T3A, T3e, T37, T3I, T1z;
+						  T3A = VFNMS(LDK(KP667278218), T36, T35);
+						  T3e = VFNMS(LDK(KP059835404), T35, T36);
+						  T37 = VFMA(LDK(KP066152395), T36, T35);
+						  T3I = VFMA(LDK(KP603558818), T35, T36);
+						  T24 = VFMA(LDK(KP578046249), T1v, T1y);
+						  T1z = VFNMS(LDK(KP522847744), T1y, T1v);
+						  T2w = VFNMS(LDK(KP494780565), T1v, T1y);
+						  T2E = VFMA(LDK(KP447533225), T1y, T1v);
+						  {
+						       V T4i, T4g, T4o, T4m;
+						       T4i = VSUB(T4c, T4f);
+						       T4g = VADD(T4c, T4f);
+						       T4o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T4k, T4l));
+						       T4m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T4l, T4k));
+						       {
+							    V T3Q, T3J, T3b, T3u;
+							    T3Q = VFNMS(LDK(KP845997307), T3I, T3H);
+							    T3J = VFMA(LDK(KP845997307), T3I, T3H);
+							    T3b = VFNMS(LDK(KP772036680), T3a, T37);
+							    T3u = VFMA(LDK(KP772036680), T3a, T37);
+							    {
+								 V T3o, T3g, T3B, T3X, T4h;
+								 T3o = VFNMS(LDK(KP772036680), T3f, T3e);
+								 T3g = VFMA(LDK(KP772036680), T3f, T3e);
+								 T3B = VFNMS(LDK(KP845997307), T3A, T3z);
+								 T3X = VFMA(LDK(KP845997307), T3A, T3z);
+								 ST(&(x[0]), VADD(T4g, T49), ms, &(x[0]));
+								 T4h = VFNMS(LDK(KP250000000), T4g, T49);
+								 {
+								      V T40, T3N, T3c, T3v;
+								      T40 = VFMA(LDK(KP906616052), T3M, T3J);
+								      T3N = VFNMS(LDK(KP906616052), T3M, T3J);
+								      T3c = VFMA(LDK(KP956723877), T3b, T34);
+								      T3v = VFMA(LDK(KP522616830), T3j, T3u);
+								      {
+									   V T3p, T3k, T3S, T3F;
+									   T3p = VFNMS(LDK(KP522616830), T34, T3o);
+									   T3k = VFMA(LDK(KP945422727), T3j, T3g);
+									   T3S = VFNMS(LDK(KP923225144), T3E, T3B);
+									   T3F = VFMA(LDK(KP923225144), T3E, T3B);
+									   {
+										V T46, T3Z, T4j, T4n;
+										T46 = VFNMS(LDK(KP669429328), T3X, T3Y);
+										T3Z = VFMA(LDK(KP570584518), T3Y, T3X);
+										T4j = VFMA(LDK(KP559016994), T4i, T4h);
+										T4n = VFNMS(LDK(KP559016994), T4i, T4h);
+										{
+										     V T3W, T3O, T3d, T3w;
+										     T3W = VFMA(LDK(KP262346850), T3N, T3l);
+										     T3O = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T3l, T3N));
+										     T3d = VFMA(LDK(KP992114701), T3c, T2X);
+										     T3w = VFNMS(LDK(KP690983005), T3v, T3g);
+										     {
+											  V T3q, T3m, T3T, T43;
+											  T3q = VFMA(LDK(KP763932022), T3p, T3b);
+											  T3m = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T3l, T3k));
+											  T3T = VFNMS(LDK(KP997675361), T3S, T3R);
+											  T43 = VFNMS(LDK(KP904508497), T3S, T3Q);
+											  {
+											       V T3G, T3P, T47, T41;
+											       T3G = VFMA(LDK(KP949179823), T3F, T2X);
+											       T3P = VFNMS(LDK(KP237294955), T3F, T2X);
+											       T47 = VFNMS(LDK(KP669429328), T40, T46);
+											       T41 = VFMA(LDK(KP618033988), T40, T3Z);
+											       ST(&(x[WS(rs, 20)]), VFMAI(T4m, T4j), ms, &(x[0]));
+											       ST(&(x[WS(rs, 5)]), VFNMSI(T4m, T4j), ms, &(x[WS(rs, 1)]));
+											       ST(&(x[WS(rs, 15)]), VFNMSI(T4o, T4n), ms, &(x[WS(rs, 1)]));
+											       ST(&(x[WS(rs, 10)]), VFMAI(T4o, T4n), ms, &(x[0]));
+											       {
+												    V T3x, T3r, T3U, T44;
+												    T3x = VFMA(LDK(KP855719849), T3w, T3t);
+												    T3r = VFNMS(LDK(KP855719849), T3q, T3n);
+												    ST(&(x[WS(rs, 22)]), VFMAI(T3m, T3d), ms, &(x[0]));
+												    ST(&(x[WS(rs, 3)]), VFNMSI(T3m, T3d), ms, &(x[WS(rs, 1)]));
+												    T3U = VFMA(LDK(KP560319534), T3T, T3Q);
+												    T44 = VFNMS(LDK(KP681693190), T43, T3R);
+												    ST(&(x[WS(rs, 23)]), VFMAI(T3O, T3G), ms, &(x[WS(rs, 1)]));
+												    ST(&(x[WS(rs, 2)]), VFNMSI(T3O, T3G), ms, &(x[0]));
+												    T48 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T47, T3W));
+												    T42 = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T41, T3W));
+												    T3y = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T3x, T3l));
+												    T3s = VFMA(LDK(KP897376177), T3r, T2X);
+												    T3V = VFNMS(LDK(KP949179823), T3U, T3P);
+												    T45 = VFNMS(LDK(KP860541664), T44, T3P);
+												    T2R = VFNMS(LDK(KP912575812), T2z, T2y);
+												    T2A = VFMA(LDK(KP912575812), T2z, T2y);
+												    T1b = VFMA(LDK(KP829049696), T1a, TO);
+												    T2h = VFNMS(LDK(KP829049696), T1a, TO);
+												    T2i = VFNMS(LDK(KP831864738), T1W, T1z);
+												    T1X = VFMA(LDK(KP831864738), T1W, T1z);
+											       }
+											  }
+										     }
+										}
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     {
+						  V T2M, T2F, T2x, T2S, T2T, T2N;
+						  T2M = VFNMS(LDK(KP958953096), T2E, T2D);
+						  T2F = VFMA(LDK(KP958953096), T2E, T2D);
+						  ST(&(x[WS(rs, 17)]), VFMAI(T3y, T3s), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 8)]), VFNMSI(T3y, T3s), ms, &(x[0]));
+						  ST(&(x[WS(rs, 12)]), VFMAI(T42, T3V), ms, &(x[0]));
+						  ST(&(x[WS(rs, 13)]), VFNMSI(T42, T3V), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 18)]), VFNMSI(T48, T45), ms, &(x[0]));
+						  ST(&(x[WS(rs, 7)]), VFMAI(T48, T45), ms, &(x[WS(rs, 1)]));
+						  T2j = VFMA(LDK(KP559154169), T2i, T2h);
+						  T2q = VFNMS(LDK(KP683113946), T2h, T2i);
+						  T2x = VFNMS(LDK(KP867381224), T2w, T2v);
+						  T2S = VFMA(LDK(KP867381224), T2w, T2v);
+						  T2J = VFMA(LDK(KP894834959), T2I, T2F);
+						  T2T = VFMA(LDK(KP447417479), T2I, T2S);
+						  T2B = VFNMS(LDK(KP809385824), T2A, T2x);
+						  T2N = VFMA(LDK(KP447417479), T2A, T2M);
+						  T2e = VFMA(LDK(KP831864738), T25, T24);
+						  T26 = VFNMS(LDK(KP831864738), T25, T24);
+						  T2U = VFNMS(LDK(KP763932022), T2T, T2F);
+						  T1Y = VFMA(LDK(KP904730450), T1X, T1b);
+						  T23 = VFNMS(LDK(KP904730450), T1X, T1b);
+						  T2O = VFMA(LDK(KP690983005), T2N, T2x);
+					     }
+					}
+					{
+					     V T2C, T22, T2d, T2K;
+					     T2C = VFNMS(LDK(KP992114701), T2B, To);
+					     T22 = VFMA(LDK(KP916574801), T21, T20);
+					     T2d = VFNMS(LDK(KP916574801), T21, T20);
+					     T2K = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2J, T2c));
+					     {
+						  V T27, T2P, T2f, T2k, T2n, T2V;
+						  T2V = VFNMS(LDK(KP999544308), T2U, T2R);
+						  T27 = VFNMS(LDK(KP904730450), T26, T23);
+						  T2t = VFMA(LDK(KP968583161), T1Y, To);
+						  T1Z = VFNMS(LDK(KP242145790), T1Y, To);
+						  T2P = VFNMS(LDK(KP999544308), T2O, T2L);
+						  T2f = VFMA(LDK(KP904730450), T2e, T2d);
+						  T2k = VFNMS(LDK(KP904730450), T2e, T2d);
+						  T2n = VADD(T22, T23);
+						  ST(&(x[WS(rs, 21)]), VFNMSI(T2K, T2C), ms, &(x[WS(rs, 1)]));
+						  ST(&(x[WS(rs, 4)]), VFMAI(T2K, T2C), ms, &(x[0]));
+						  T2W = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2V, T2c));
+						  T28 = VFNMS(LDK(KP618033988), T27, T22);
+						  T2Q = VFNMS(LDK(KP803003575), T2P, To);
+						  T2r = VFMA(LDK(KP617882369), T2k, T2q);
+						  T2g = VFNMS(LDK(KP242145790), T2f, T2c);
+						  T2u = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T2f, T2c));
+						  T2o = VFNMS(LDK(KP683113946), T2n, T26);
+						  T2l = VFMA(LDK(KP559016994), T2k, T2j);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T29, T2s, T2p, T2m;
+		    T29 = VFNMS(LDK(KP876091699), T28, T1Z);
+		    ST(&(x[WS(rs, 9)]), VFMAI(T2W, T2Q), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 16)]), VFNMSI(T2W, T2Q), ms, &(x[0]));
+		    T2s = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T2r, T2g));
+		    ST(&(x[WS(rs, 24)]), VFMAI(T2u, T2t), ms, &(x[0]));
+		    ST(&(x[WS(rs, 1)]), VFNMSI(T2u, T2t), ms, &(x[WS(rs, 1)]));
+		    T2p = VFMA(LDK(KP792626838), T2o, T1Z);
+		    T2m = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T2l, T2g));
+		    ST(&(x[WS(rs, 11)]), VFNMSI(T2s, T2p), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 14)]), VFMAI(T2s, T2p), ms, &(x[0]));
+		    ST(&(x[WS(rs, 19)]), VFMAI(T2m, T29), ms, &(x[WS(rs, 1)]));
+		    ST(&(x[WS(rs, 6)]), VFNMSI(T2m, T29), ms, &(x[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t3fv_25"), twinstr, &GENUS, {87, 100, 181, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 25 -name t3fv_25 -include t3f.h */
+
+/*
+ * This function contains 268 FP additions, 228 FP multiplications,
+ * (or, 190 additions, 150 multiplications, 78 fused multiply/add),
+ * 123 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(25, rs)) {
+	       V T1, T4, T2, T3, TA, Td, Tp, Tw, Tx, T1G, T1j, T5, T1c, T8, T9;
+	       V Ts, T1J, Tg, T1C, T1m, TX, TB, T1f, TU;
+	       T1 = LDW(&(W[0]));
+	       T4 = LDW(&(W[TWVL * 4]));
+	       T2 = LDW(&(W[TWVL * 2]));
+	       T3 = VZMUL(T1, T2);
+	       TA = VZMULJ(T1, T4);
+	       Td = VZMUL(T1, T4);
+	       Tp = VZMULJ(T2, T4);
+	       Tw = VZMULJ(T1, T2);
+	       Tx = VZMUL(Tw, T4);
+	       T1G = VZMUL(T3, T4);
+	       T1j = VZMUL(T2, T4);
+	       T5 = VZMULJ(T3, T4);
+	       T1c = VZMULJ(Tw, T4);
+	       T8 = LDW(&(W[TWVL * 6]));
+	       T9 = VZMULJ(T3, T8);
+	       Ts = VZMULJ(T2, T8);
+	       T1J = VZMULJ(Tp, T8);
+	       Tg = VZMULJ(T4, T8);
+	       T1C = VZMULJ(T1, T8);
+	       T1m = VZMULJ(T1c, T8);
+	       TX = VZMULJ(T5, T8);
+	       TB = VZMULJ(TA, T8);
+	       T1f = VZMULJ(Tw, T8);
+	       TU = VZMULJ(Td, T8);
+	       {
+		    V Tl, Tk, Tm, Tn, T20, T2R, T22, T1V, T2K, T1S, T3A, T2L, TN, T2G, TK;
+		    V T3w, T2H, T19, T2D, T16, T3x, T2E, T1y, T2N, T1v, T3z, T2O;
+		    {
+			 V Tf, Ti, Tj, T7, Tb, Tc, T21;
+			 Tl = LD(&(x[0]), ms, &(x[0]));
+			 {
+			      V Te, Th, T6, Ta;
+			      Te = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      Tf = VZMULJ(Td, Te);
+			      Th = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      Ti = VZMULJ(Tg, Th);
+			      Tj = VADD(Tf, Ti);
+			      T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			      T7 = VZMULJ(T5, T6);
+			      Ta = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      Tb = VZMULJ(T9, Ta);
+			      Tc = VADD(T7, Tb);
+			 }
+			 Tk = VMUL(LDK(KP559016994), VSUB(Tc, Tj));
+			 Tm = VADD(Tc, Tj);
+			 Tn = VFNMS(LDK(KP250000000), Tm, Tl);
+			 T20 = VSUB(T7, Tb);
+			 T21 = VSUB(Tf, Ti);
+			 T2R = VMUL(LDK(KP951056516), T21);
+			 T22 = VFMA(LDK(KP951056516), T20, VMUL(LDK(KP587785252), T21));
+		    }
+		    {
+			 V T1P, T1I, T1L, T1M, T1B, T1E, T1F, T1O;
+			 T1O = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 T1P = VZMULJ(T2, T1O);
+			 {
+			      V T1H, T1K, T1A, T1D;
+			      T1H = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+			      T1I = VZMULJ(T1G, T1H);
+			      T1K = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      T1L = VZMULJ(T1J, T1K);
+			      T1M = VADD(T1I, T1L);
+			      T1A = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      T1B = VZMULJ(TA, T1A);
+			      T1D = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			      T1E = VZMULJ(T1C, T1D);
+			      T1F = VADD(T1B, T1E);
+			 }
+			 {
+			      V T1T, T1U, T1N, T1Q, T1R;
+			      T1T = VSUB(T1B, T1E);
+			      T1U = VSUB(T1I, T1L);
+			      T1V = VFMA(LDK(KP475528258), T1T, VMUL(LDK(KP293892626), T1U));
+			      T2K = VFNMS(LDK(KP293892626), T1T, VMUL(LDK(KP475528258), T1U));
+			      T1N = VMUL(LDK(KP559016994), VSUB(T1F, T1M));
+			      T1Q = VADD(T1F, T1M);
+			      T1R = VFNMS(LDK(KP250000000), T1Q, T1P);
+			      T1S = VADD(T1N, T1R);
+			      T3A = VADD(T1P, T1Q);
+			      T2L = VSUB(T1R, T1N);
+			 }
+		    }
+		    {
+			 V TH, Tz, TD, TE, Tr, Tu, Tv, TG;
+			 TG = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 TH = VZMULJ(T1, TG);
+			 {
+			      V Ty, TC, Tq, Tt;
+			      Ty = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+			      Tz = VZMULJ(Tx, Ty);
+			      TC = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      TD = VZMULJ(TB, TC);
+			      TE = VADD(Tz, TD);
+			      Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      Tr = VZMULJ(Tp, Tq);
+			      Tt = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+			      Tu = VZMULJ(Ts, Tt);
+			      Tv = VADD(Tr, Tu);
+			 }
+			 {
+			      V TL, TM, TF, TI, TJ;
+			      TL = VSUB(Tr, Tu);
+			      TM = VSUB(Tz, TD);
+			      TN = VFMA(LDK(KP475528258), TL, VMUL(LDK(KP293892626), TM));
+			      T2G = VFNMS(LDK(KP293892626), TL, VMUL(LDK(KP475528258), TM));
+			      TF = VMUL(LDK(KP559016994), VSUB(Tv, TE));
+			      TI = VADD(Tv, TE);
+			      TJ = VFNMS(LDK(KP250000000), TI, TH);
+			      TK = VADD(TF, TJ);
+			      T3w = VADD(TH, TI);
+			      T2H = VSUB(TJ, TF);
+			 }
+		    }
+		    {
+			 V T13, TW, TZ, T10, TQ, TS, TT, T12;
+			 T12 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			 T13 = VZMULJ(T3, T12);
+			 {
+			      V TV, TY, TP, TR;
+			      TV = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TW = VZMULJ(TU, TV);
+			      TY = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+			      TZ = VZMULJ(TX, TY);
+			      T10 = VADD(TW, TZ);
+			      TP = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      TQ = VZMULJ(T4, TP);
+			      TR = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      TS = VZMULJ(T8, TR);
+			      TT = VADD(TQ, TS);
+			 }
+			 {
+			      V T17, T18, T11, T14, T15;
+			      T17 = VSUB(TQ, TS);
+			      T18 = VSUB(TW, TZ);
+			      T19 = VFMA(LDK(KP475528258), T17, VMUL(LDK(KP293892626), T18));
+			      T2D = VFNMS(LDK(KP293892626), T17, VMUL(LDK(KP475528258), T18));
+			      T11 = VMUL(LDK(KP559016994), VSUB(TT, T10));
+			      T14 = VADD(TT, T10);
+			      T15 = VFNMS(LDK(KP250000000), T14, T13);
+			      T16 = VADD(T11, T15);
+			      T3x = VADD(T13, T14);
+			      T2E = VSUB(T15, T11);
+			 }
+		    }
+		    {
+			 V T1s, T1l, T1o, T1p, T1e, T1h, T1i, T1r;
+			 T1r = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 T1s = VZMULJ(Tw, T1r);
+			 {
+			      V T1k, T1n, T1d, T1g;
+			      T1k = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      T1l = VZMULJ(T1j, T1k);
+			      T1n = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T1o = VZMULJ(T1m, T1n);
+			      T1p = VADD(T1l, T1o);
+			      T1d = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T1e = VZMULJ(T1c, T1d);
+			      T1g = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      T1h = VZMULJ(T1f, T1g);
+			      T1i = VADD(T1e, T1h);
+			 }
+			 {
+			      V T1w, T1x, T1q, T1t, T1u;
+			      T1w = VSUB(T1e, T1h);
+			      T1x = VSUB(T1l, T1o);
+			      T1y = VFMA(LDK(KP475528258), T1w, VMUL(LDK(KP293892626), T1x));
+			      T2N = VFNMS(LDK(KP293892626), T1w, VMUL(LDK(KP475528258), T1x));
+			      T1q = VMUL(LDK(KP559016994), VSUB(T1i, T1p));
+			      T1t = VADD(T1i, T1p);
+			      T1u = VFNMS(LDK(KP250000000), T1t, T1s);
+			      T1v = VADD(T1q, T1u);
+			      T3z = VADD(T1s, T1t);
+			      T2O = VSUB(T1u, T1q);
+			 }
+		    }
+		    {
+			 V T3J, T3K, T3D, T3E, T3C, T3F, T3L, T3G;
+			 {
+			      V T3H, T3I, T3y, T3B;
+			      T3H = VSUB(T3w, T3x);
+			      T3I = VSUB(T3z, T3A);
+			      T3J = VBYI(VFMA(LDK(KP951056516), T3H, VMUL(LDK(KP587785252), T3I)));
+			      T3K = VBYI(VFNMS(LDK(KP587785252), T3H, VMUL(LDK(KP951056516), T3I)));
+			      T3D = VADD(Tl, Tm);
+			      T3y = VADD(T3w, T3x);
+			      T3B = VADD(T3z, T3A);
+			      T3E = VADD(T3y, T3B);
+			      T3C = VMUL(LDK(KP559016994), VSUB(T3y, T3B));
+			      T3F = VFNMS(LDK(KP250000000), T3E, T3D);
+			 }
+			 ST(&(x[0]), VADD(T3D, T3E), ms, &(x[0]));
+			 T3L = VSUB(T3F, T3C);
+			 ST(&(x[WS(rs, 10)]), VADD(T3K, T3L), ms, &(x[0]));
+			 ST(&(x[WS(rs, 15)]), VSUB(T3L, T3K), ms, &(x[WS(rs, 1)]));
+			 T3G = VADD(T3C, T3F);
+			 ST(&(x[WS(rs, 5)]), VSUB(T3G, T3J), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 20)]), VADD(T3J, T3G), ms, &(x[0]));
+		    }
+		    {
+			 V To, T2n, T2o, T2p, T2x, T2y, T2z, T2u, T2v, T2w, T2q, T2r, T2s, T29, T2i;
+			 V T2e, T2g, T1Y, T2j, T2b, T2c, T2B, T2C;
+			 To = VADD(Tk, Tn);
+			 T2n = VFMA(LDK(KP1_688655851), TN, VMUL(LDK(KP535826794), TK));
+			 T2o = VFMA(LDK(KP1_541026485), T19, VMUL(LDK(KP637423989), T16));
+			 T2p = VSUB(T2n, T2o);
+			 T2x = VFMA(LDK(KP851558583), T1y, VMUL(LDK(KP904827052), T1v));
+			 T2y = VFMA(LDK(KP1_984229402), T1V, VMUL(LDK(KP125333233), T1S));
+			 T2z = VADD(T2x, T2y);
+			 T2u = VFNMS(LDK(KP844327925), TK, VMUL(LDK(KP1_071653589), TN));
+			 T2v = VFNMS(LDK(KP1_274847979), T19, VMUL(LDK(KP770513242), T16));
+			 T2w = VADD(T2u, T2v);
+			 T2q = VFNMS(LDK(KP425779291), T1v, VMUL(LDK(KP1_809654104), T1y));
+			 T2r = VFNMS(LDK(KP992114701), T1S, VMUL(LDK(KP250666467), T1V));
+			 T2s = VADD(T2q, T2r);
+			 {
+			      V T23, T24, T25, T26, T27, T28;
+			      T23 = VFMA(LDK(KP1_937166322), TN, VMUL(LDK(KP248689887), TK));
+			      T24 = VFMA(LDK(KP1_071653589), T19, VMUL(LDK(KP844327925), T16));
+			      T25 = VADD(T23, T24);
+			      T26 = VFMA(LDK(KP1_752613360), T1y, VMUL(LDK(KP481753674), T1v));
+			      T27 = VFMA(LDK(KP1_457937254), T1V, VMUL(LDK(KP684547105), T1S));
+			      T28 = VADD(T26, T27);
+			      T29 = VADD(T25, T28);
+			      T2i = VSUB(T27, T26);
+			      T2e = VMUL(LDK(KP559016994), VSUB(T28, T25));
+			      T2g = VSUB(T24, T23);
+			 }
+			 {
+			      V TO, T1a, T1b, T1z, T1W, T1X;
+			      TO = VFNMS(LDK(KP497379774), TN, VMUL(LDK(KP968583161), TK));
+			      T1a = VFNMS(LDK(KP1_688655851), T19, VMUL(LDK(KP535826794), T16));
+			      T1b = VADD(TO, T1a);
+			      T1z = VFNMS(LDK(KP963507348), T1y, VMUL(LDK(KP876306680), T1v));
+			      T1W = VFNMS(LDK(KP1_369094211), T1V, VMUL(LDK(KP728968627), T1S));
+			      T1X = VADD(T1z, T1W);
+			      T1Y = VADD(T1b, T1X);
+			      T2j = VMUL(LDK(KP559016994), VSUB(T1b, T1X));
+			      T2b = VSUB(T1a, TO);
+			      T2c = VSUB(T1z, T1W);
+			 }
+			 {
+			      V T1Z, T2a, T2t, T2A;
+			      T1Z = VADD(To, T1Y);
+			      T2a = VBYI(VADD(T22, T29));
+			      ST(&(x[WS(rs, 1)]), VSUB(T1Z, T2a), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 24)]), VADD(T1Z, T2a), ms, &(x[0]));
+			      T2t = VADD(To, VADD(T2p, T2s));
+			      T2A = VBYI(VADD(T22, VSUB(T2w, T2z)));
+			      ST(&(x[WS(rs, 21)]), VSUB(T2t, T2A), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VADD(T2t, T2A), ms, &(x[0]));
+			 }
+			 T2B = VBYI(VADD(T22, VFMA(LDK(KP309016994), T2w, VFMA(LDK(KP587785252), VSUB(T2r, T2q), VFNMS(LDK(KP951056516), VADD(T2n, T2o), VMUL(LDK(KP809016994), T2z))))));
+			 T2C = VFMA(LDK(KP309016994), T2p, VFMA(LDK(KP951056516), VSUB(T2u, T2v), VFMA(LDK(KP587785252), VSUB(T2y, T2x), VFNMS(LDK(KP809016994), T2s, To))));
+			 ST(&(x[WS(rs, 9)]), VADD(T2B, T2C), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 16)]), VSUB(T2C, T2B), ms, &(x[0]));
+			 {
+			      V T2f, T2l, T2k, T2m, T2d, T2h;
+			      T2d = VFMS(LDK(KP250000000), T29, T22);
+			      T2f = VBYI(VADD(VFMA(LDK(KP587785252), T2b, VMUL(LDK(KP951056516), T2c)), VSUB(T2d, T2e)));
+			      T2l = VBYI(VADD(VFNMS(LDK(KP587785252), T2c, VMUL(LDK(KP951056516), T2b)), VADD(T2d, T2e)));
+			      T2h = VFNMS(LDK(KP250000000), T1Y, To);
+			      T2k = VFMA(LDK(KP587785252), T2g, VFNMS(LDK(KP951056516), T2i, VSUB(T2h, T2j)));
+			      T2m = VFMA(LDK(KP951056516), T2g, VADD(T2j, VFMA(LDK(KP587785252), T2i, T2h)));
+			      ST(&(x[WS(rs, 11)]), VADD(T2f, T2k), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 19)]), VSUB(T2m, T2l), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 14)]), VSUB(T2k, T2f), ms, &(x[0]));
+			      ST(&(x[WS(rs, 6)]), VADD(T2l, T2m), ms, &(x[0]));
+			 }
+		    }
+		    {
+			 V T2S, T2U, T2F, T2I, T2J, T2Y, T2Z, T30, T2M, T2P, T2Q, T2V, T2W, T2X, T3a;
+			 V T3l, T3b, T3k, T3f, T3p, T3i, T3o, T32, T33;
+			 T2S = VFNMS(LDK(KP587785252), T20, T2R);
+			 T2U = VSUB(Tn, Tk);
+			 T2F = VFNMS(LDK(KP125333233), T2E, VMUL(LDK(KP1_984229402), T2D));
+			 T2I = VFMA(LDK(KP1_457937254), T2G, VMUL(LDK(KP684547105), T2H));
+			 T2J = VSUB(T2F, T2I);
+			 T2Y = VFNMS(LDK(KP1_996053456), T2N, VMUL(LDK(KP062790519), T2O));
+			 T2Z = VFMA(LDK(KP1_541026485), T2K, VMUL(LDK(KP637423989), T2L));
+			 T30 = VSUB(T2Y, T2Z);
+			 T2M = VFNMS(LDK(KP770513242), T2L, VMUL(LDK(KP1_274847979), T2K));
+			 T2P = VFMA(LDK(KP125581039), T2N, VMUL(LDK(KP998026728), T2O));
+			 T2Q = VSUB(T2M, T2P);
+			 T2V = VFNMS(LDK(KP1_369094211), T2G, VMUL(LDK(KP728968627), T2H));
+			 T2W = VFMA(LDK(KP250666467), T2D, VMUL(LDK(KP992114701), T2E));
+			 T2X = VSUB(T2V, T2W);
+			 {
+			      V T34, T35, T36, T37, T38, T39;
+			      T34 = VFNMS(LDK(KP481753674), T2H, VMUL(LDK(KP1_752613360), T2G));
+			      T35 = VFMA(LDK(KP851558583), T2D, VMUL(LDK(KP904827052), T2E));
+			      T36 = VSUB(T34, T35);
+			      T37 = VFNMS(LDK(KP844327925), T2O, VMUL(LDK(KP1_071653589), T2N));
+			      T38 = VFNMS(LDK(KP998026728), T2L, VMUL(LDK(KP125581039), T2K));
+			      T39 = VADD(T37, T38);
+			      T3a = VMUL(LDK(KP559016994), VSUB(T36, T39));
+			      T3l = VSUB(T37, T38);
+			      T3b = VADD(T36, T39);
+			      T3k = VADD(T34, T35);
+			 }
+			 {
+			      V T3d, T3e, T3m, T3g, T3h, T3n;
+			      T3d = VFNMS(LDK(KP425779291), T2E, VMUL(LDK(KP1_809654104), T2D));
+			      T3e = VFMA(LDK(KP963507348), T2G, VMUL(LDK(KP876306680), T2H));
+			      T3m = VADD(T3e, T3d);
+			      T3g = VFMA(LDK(KP1_688655851), T2N, VMUL(LDK(KP535826794), T2O));
+			      T3h = VFMA(LDK(KP1_996053456), T2K, VMUL(LDK(KP062790519), T2L));
+			      T3n = VADD(T3g, T3h);
+			      T3f = VSUB(T3d, T3e);
+			      T3p = VADD(T3m, T3n);
+			      T3i = VSUB(T3g, T3h);
+			      T3o = VMUL(LDK(KP559016994), VSUB(T3m, T3n));
+			 }
+			 {
+			      V T3u, T3v, T2T, T31;
+			      T3u = VBYI(VADD(T2S, T3b));
+			      T3v = VADD(T2U, T3p);
+			      ST(&(x[WS(rs, 2)]), VADD(T3u, T3v), ms, &(x[0]));
+			      ST(&(x[WS(rs, 23)]), VSUB(T3v, T3u), ms, &(x[WS(rs, 1)]));
+			      T2T = VBYI(VSUB(VADD(T2J, T2Q), T2S));
+			      T31 = VADD(T2U, VADD(T2X, T30));
+			      ST(&(x[WS(rs, 3)]), VADD(T2T, T31), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 22)]), VSUB(T31, T2T), ms, &(x[0]));
+			 }
+			 T32 = VFMA(LDK(KP309016994), T2X, VFNMS(LDK(KP809016994), T30, VFNMS(LDK(KP587785252), VADD(T2P, T2M), VFNMS(LDK(KP951056516), VADD(T2I, T2F), T2U))));
+			 T33 = VBYI(VSUB(VFNMS(LDK(KP587785252), VADD(T2Y, T2Z), VFNMS(LDK(KP809016994), T2Q, VFNMS(LDK(KP951056516), VADD(T2V, T2W), VMUL(LDK(KP309016994), T2J)))), T2S));
+			 ST(&(x[WS(rs, 17)]), VSUB(T32, T33), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 8)]), VADD(T32, T33), ms, &(x[0]));
+			 {
+			      V T3j, T3s, T3r, T3t, T3c, T3q;
+			      T3c = VFNMS(LDK(KP250000000), T3b, T2S);
+			      T3j = VBYI(VADD(T3a, VADD(T3c, VFNMS(LDK(KP587785252), T3i, VMUL(LDK(KP951056516), T3f)))));
+			      T3s = VBYI(VADD(T3c, VSUB(VFMA(LDK(KP587785252), T3f, VMUL(LDK(KP951056516), T3i)), T3a)));
+			      T3q = VFNMS(LDK(KP250000000), T3p, T2U);
+			      T3r = VFMA(LDK(KP951056516), T3k, VFMA(LDK(KP587785252), T3l, VADD(T3o, T3q)));
+			      T3t = VFMA(LDK(KP587785252), T3k, VSUB(VFNMS(LDK(KP951056516), T3l, T3q), T3o));
+			      ST(&(x[WS(rs, 7)]), VADD(T3j, T3r), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 13)]), VSUB(T3t, T3s), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 18)]), VSUB(T3r, T3j), ms, &(x[0]));
+			      ST(&(x[WS(rs, 12)]), VADD(T3s, T3t), ms, &(x[0]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 24),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 25, XSIMD_STRING("t3fv_25"), twinstr, &GENUS, {190, 150, 78, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_25) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:50 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include t3f.h */
+
+/*
+ * This function contains 244 FP additions, 214 FP multiplications,
+ * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
+ * 118 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2B, T2A, T2u, T2x, T2r, T2F, T2L, T2P;
+	       {
+		    V T2, T5, T3, T7;
+		    T2 = LDW(&(W[0]));
+		    T5 = LDW(&(W[TWVL * 4]));
+		    T3 = LDW(&(W[TWVL * 2]));
+		    T7 = LDW(&(W[TWVL * 6]));
+		    {
+			 V T24, Tb, T3x, T2T, T3K, T2W, T25, Tr, T3z, T3g, T28, TX, T3y, T3j, T27;
+			 V TG, T37, T3F, T3G, T3a, T2Y, T15, T1p, T2Z, T2w, T1V, T2v, T1N, T32, T1h;
+			 V T17, T1a;
+			 {
+			      V T1, Tz, TT, T4, TC, Tv, T12, T1D, T1w, T18, T1t, T1O, TK, TP, T1c;
+			      V T1m, Tf, T6, Te, TL, TQ, T2S, Tp, TU, Ti, Ta, TM, TR, Tm, TJ;
+			      V T22, T9, T1Z;
+			      T1 = LD(&(x[0]), ms, &(x[0]));
+			      T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			      T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			      T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			      {
+				   V Tn, TH, Tk, To, Th, Tg, T8, Tl, T20, T23, TI;
+				   {
+					V Td, T1C, Tc, T21;
+					Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+					Tz = VZMUL(T2, T5);
+					T1C = VZMULJ(T2, T5);
+					Tn = VZMUL(T3, T5);
+					TT = VZMULJ(T3, T5);
+					Tc = VZMUL(T2, T3);
+					T4 = VZMULJ(T2, T3);
+					TH = VZMUL(T3, T7);
+					T21 = VZMULJ(T3, T7);
+					Tk = VZMUL(T2, T7);
+					TC = VZMULJ(T2, T7);
+					Tv = VZMULJ(T5, T7);
+					T12 = VZMULJ(Tz, T7);
+					T20 = VZMULJ(T1C, T1Z);
+					T1D = VZMULJ(T1C, T7);
+					T1w = VZMULJ(Tn, T7);
+					T18 = VZMULJ(TT, T7);
+					T1t = VZMUL(Tc, T7);
+					T1O = VZMULJ(Tc, T7);
+					TK = VZMUL(Tc, T5);
+					TP = VZMULJ(Tc, T5);
+					T1c = VZMUL(T4, T7);
+					T1m = VZMULJ(T4, T7);
+					Tf = VZMULJ(T4, T5);
+					T6 = VZMUL(T4, T5);
+					T23 = VZMULJ(T21, T22);
+					Te = VZMULJ(Tc, Td);
+				   }
+				   TL = VZMULJ(TK, T7);
+				   TQ = VZMULJ(TP, T7);
+				   To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+				   Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+				   Tg = VZMULJ(Tf, T7);
+				   T8 = VZMULJ(T6, T7);
+				   T2S = VADD(T20, T23);
+				   T24 = VSUB(T20, T23);
+				   Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+				   TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+				   Tp = VZMULJ(Tn, To);
+				   TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+				   Ti = VZMULJ(Tg, Th);
+				   Ta = VZMULJ(T8, T9);
+				   TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+				   TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+				   Tm = VZMULJ(Tk, Tl);
+				   TJ = VZMULJ(TH, TI);
+			      }
+			      {
+				   V Tu, TE, Tw, TA;
+				   {
+					V T3e, TO, T3f, TW;
+					{
+					     V TV, T2U, Tj, T2R, TN, TS, T2V, Tq, Tt, TD;
+					     Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+					     TV = VZMULJ(TT, TU);
+					     T2U = VADD(Te, Ti);
+					     Tj = VSUB(Te, Ti);
+					     T2R = VADD(T1, Ta);
+					     Tb = VSUB(T1, Ta);
+					     TN = VZMULJ(TL, TM);
+					     TS = VZMULJ(TQ, TR);
+					     T2V = VADD(Tm, Tp);
+					     Tq = VSUB(Tm, Tp);
+					     Tu = VZMULJ(T4, Tt);
+					     TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+					     T3x = VSUB(T2R, T2S);
+					     T2T = VADD(T2R, T2S);
+					     T3e = VADD(TJ, TN);
+					     TO = VSUB(TJ, TN);
+					     T3f = VADD(TV, TS);
+					     TW = VSUB(TS, TV);
+					     T3K = VSUB(T2V, T2U);
+					     T2W = VADD(T2U, T2V);
+					     T25 = VSUB(Tq, Tj);
+					     Tr = VADD(Tj, Tq);
+					     TE = VZMULJ(TC, TD);
+					}
+					Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+					T3z = VSUB(T3e, T3f);
+					T3g = VADD(T3e, T3f);
+					T28 = VFMA(LDK(KP414213562), TO, TW);
+					TX = VFNMS(LDK(KP414213562), TW, TO);
+					TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+				   }
+				   {
+					V T35, T1z, T1T, T36, T39, T1L, T1B, T1F;
+					{
+					     V T1v, T1y, Ty, T3h, T1S, T1Q, T1I, T3i, TF, T1K, T1A, T1E;
+					     {
+						  V T1u, T1x, Tx, T1R;
+						  T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+						  T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+						  Tx = VZMULJ(Tv, Tw);
+						  T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+						  {
+						       V T1P, T1H, T1J, TB;
+						       T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+						       T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+						       T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+						       TB = VZMULJ(Tz, TA);
+						       T1v = VZMULJ(T1t, T1u);
+						       T1y = VZMULJ(T1w, T1x);
+						       Ty = VSUB(Tu, Tx);
+						       T3h = VADD(Tu, Tx);
+						       T1S = VZMULJ(Tf, T1R);
+						       T1Q = VZMULJ(T1O, T1P);
+						       T1I = VZMULJ(T7, T1H);
+						       T3i = VADD(TB, TE);
+						       TF = VSUB(TB, TE);
+						       T1K = VZMULJ(T6, T1J);
+						       T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+						       T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+						  }
+					     }
+					     T35 = VADD(T1v, T1y);
+					     T1z = VSUB(T1v, T1y);
+					     T1T = VSUB(T1Q, T1S);
+					     T36 = VADD(T1S, T1Q);
+					     T3y = VSUB(T3h, T3i);
+					     T3j = VADD(T3h, T3i);
+					     T27 = VFMA(LDK(KP414213562), Ty, TF);
+					     TG = VFNMS(LDK(KP414213562), TF, Ty);
+					     T39 = VADD(T1I, T1K);
+					     T1L = VSUB(T1I, T1K);
+					     T1B = VZMULJ(T3, T1A);
+					     T1F = VZMULJ(T1D, T1E);
+					}
+					{
+					     V T11, T14, T1o, T1l, T1e, T1U, T1M, T1g, T16, T19;
+					     {
+						  V T10, T13, T1n, T1k;
+						  T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+						  T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+						  T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+						  T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+						  {
+						       V T1d, T1f, T1G, T38;
+						       T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+						       T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+						       T1G = VSUB(T1B, T1F);
+						       T38 = VADD(T1B, T1F);
+						       T37 = VADD(T35, T36);
+						       T3F = VSUB(T35, T36);
+						       T11 = VZMULJ(T2, T10);
+						       T14 = VZMULJ(T12, T13);
+						       T1o = VZMULJ(T1m, T1n);
+						       T1l = VZMULJ(T5, T1k);
+						       T1e = VZMULJ(T1c, T1d);
+						       T3G = VSUB(T39, T38);
+						       T3a = VADD(T38, T39);
+						       T1U = VSUB(T1L, T1G);
+						       T1M = VADD(T1G, T1L);
+						       T1g = VZMULJ(TK, T1f);
+						  }
+						  T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+						  T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+					     }
+					     T2Y = VADD(T11, T14);
+					     T15 = VSUB(T11, T14);
+					     T1p = VSUB(T1l, T1o);
+					     T2Z = VADD(T1l, T1o);
+					     T2w = VFNMS(LDK(KP707106781), T1U, T1T);
+					     T1V = VFMA(LDK(KP707106781), T1U, T1T);
+					     T2v = VFNMS(LDK(KP707106781), T1M, T1z);
+					     T1N = VFMA(LDK(KP707106781), T1M, T1z);
+					     T32 = VADD(T1e, T1g);
+					     T1h = VSUB(T1e, T1g);
+					     T17 = VZMULJ(TP, T16);
+					     T1a = VZMULJ(T18, T19);
+					}
+				   }
+			      }
+			 }
+			 {
+			      V T2X, T3k, T3b, T3t, T1b, T31, T30, T3C, T3r, T3v, T3p, T3q;
+			      T2X = VSUB(T2T, T2W);
+			      T3p = VADD(T2T, T2W);
+			      T3q = VADD(T3j, T3g);
+			      T3k = VSUB(T3g, T3j);
+			      T3b = VSUB(T37, T3a);
+			      T3t = VADD(T37, T3a);
+			      T1b = VSUB(T17, T1a);
+			      T31 = VADD(T17, T1a);
+			      T30 = VADD(T2Y, T2Z);
+			      T3C = VSUB(T2Y, T2Z);
+			      T3r = VADD(T3p, T3q);
+			      T3v = VSUB(T3p, T3q);
+			      {
+				   V T3N, T3B, T3T, T3M, T3W, T3O, T2t, T1r, T2s, T1j, T3I, T3X, T3c, T3l, T3u;
+				   V T3w;
+				   {
+					V T3L, T3A, T33, T3D, T1i, T1q;
+					T3L = VSUB(T3z, T3y);
+					T3A = VADD(T3y, T3z);
+					T33 = VADD(T31, T32);
+					T3D = VSUB(T31, T32);
+					T1i = VADD(T1b, T1h);
+					T1q = VSUB(T1b, T1h);
+					{
+					     V T3H, T3E, T34, T3s;
+					     T3N = VFMA(LDK(KP414213562), T3F, T3G);
+					     T3H = VFNMS(LDK(KP414213562), T3G, T3F);
+					     T3B = VFMA(LDK(KP707106781), T3A, T3x);
+					     T3T = VFNMS(LDK(KP707106781), T3A, T3x);
+					     T3M = VFMA(LDK(KP707106781), T3L, T3K);
+					     T3W = VFNMS(LDK(KP707106781), T3L, T3K);
+					     T3O = VFMA(LDK(KP414213562), T3C, T3D);
+					     T3E = VFNMS(LDK(KP414213562), T3D, T3C);
+					     T34 = VSUB(T30, T33);
+					     T3s = VADD(T30, T33);
+					     T2t = VFNMS(LDK(KP707106781), T1q, T1p);
+					     T1r = VFMA(LDK(KP707106781), T1q, T1p);
+					     T2s = VFNMS(LDK(KP707106781), T1i, T15);
+					     T1j = VFMA(LDK(KP707106781), T1i, T15);
+					     T3I = VADD(T3E, T3H);
+					     T3X = VSUB(T3H, T3E);
+					     T3c = VADD(T34, T3b);
+					     T3l = VSUB(T3b, T34);
+					     T3u = VADD(T3s, T3t);
+					     T3w = VSUB(T3t, T3s);
+					}
+				   }
+				   {
+					V T2p, Ts, TY, T1s, T2b, T2c, T1W, T26, T29, T2q, T3U, T3P, T2J, T2K;
+					T2p = VFNMS(LDK(KP707106781), Tr, Tb);
+					Ts = VFMA(LDK(KP707106781), Tr, Tb);
+					T3U = VADD(T3O, T3N);
+					T3P = VSUB(T3N, T3O);
+					{
+					     V T3Y, T40, T3R, T3J;
+					     T3Y = VFMA(LDK(KP923879532), T3X, T3W);
+					     T40 = VFNMS(LDK(KP923879532), T3X, T3W);
+					     T3R = VFMA(LDK(KP923879532), T3I, T3B);
+					     T3J = VFNMS(LDK(KP923879532), T3I, T3B);
+					     {
+						  V T3o, T3m, T3n, T3d;
+						  T3o = VFMA(LDK(KP707106781), T3l, T3k);
+						  T3m = VFNMS(LDK(KP707106781), T3l, T3k);
+						  T3n = VFMA(LDK(KP707106781), T3c, T2X);
+						  T3d = VFNMS(LDK(KP707106781), T3c, T2X);
+						  ST(&(x[WS(rs, 24)]), VFNMSI(T3w, T3v), ms, &(x[0]));
+						  ST(&(x[WS(rs, 8)]), VFMAI(T3w, T3v), ms, &(x[0]));
+						  ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
+						  ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
+						  {
+						       V T3V, T3Z, T3S, T3Q;
+						       T3V = VFNMS(LDK(KP923879532), T3U, T3T);
+						       T3Z = VFMA(LDK(KP923879532), T3U, T3T);
+						       T3S = VFMA(LDK(KP923879532), T3P, T3M);
+						       T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
+						       ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
+						       ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
+						       ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
+						       ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
+						       ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
+						       ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
+						       ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
+						       ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
+						       ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
+						       ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
+						       ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
+						       ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
+						       TY = VADD(TG, TX);
+						       T2B = VSUB(TX, TG);
+						  }
+					     }
+					}
+					T1s = VFNMS(LDK(KP198912367), T1r, T1j);
+					T2b = VFMA(LDK(KP198912367), T1j, T1r);
+					T2c = VFMA(LDK(KP198912367), T1N, T1V);
+					T1W = VFNMS(LDK(KP198912367), T1V, T1N);
+					T2A = VFMA(LDK(KP707106781), T25, T24);
+					T26 = VFNMS(LDK(KP707106781), T25, T24);
+					T29 = VSUB(T27, T28);
+					T2q = VADD(T27, T28);
+					{
+					     V T2j, T2n, T1Y, T2f, T2o, T2m, T2e, T2g;
+					     {
+						  V T2h, TZ, T2i, T2d, T2l, T1X, T2k, T2a, T2D, T2E;
+						  T2h = VFNMS(LDK(KP923879532), TY, Ts);
+						  TZ = VFMA(LDK(KP923879532), TY, Ts);
+						  T2i = VADD(T2b, T2c);
+						  T2d = VSUB(T2b, T2c);
+						  T2l = VSUB(T1W, T1s);
+						  T1X = VADD(T1s, T1W);
+						  T2k = VFNMS(LDK(KP923879532), T29, T26);
+						  T2a = VFMA(LDK(KP923879532), T29, T26);
+						  T2u = VFMA(LDK(KP668178637), T2t, T2s);
+						  T2D = VFNMS(LDK(KP668178637), T2s, T2t);
+						  T2j = VFNMS(LDK(KP980785280), T2i, T2h);
+						  T2n = VFMA(LDK(KP980785280), T2i, T2h);
+						  T2E = VFNMS(LDK(KP668178637), T2v, T2w);
+						  T2x = VFMA(LDK(KP668178637), T2w, T2v);
+						  T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
+						  T2f = VFMA(LDK(KP980785280), T1X, TZ);
+						  T2o = VFMA(LDK(KP980785280), T2l, T2k);
+						  T2m = VFNMS(LDK(KP980785280), T2l, T2k);
+						  T2e = VFNMS(LDK(KP980785280), T2d, T2a);
+						  T2g = VFMA(LDK(KP980785280), T2d, T2a);
+						  T2r = VFMA(LDK(KP923879532), T2q, T2p);
+						  T2J = VFNMS(LDK(KP923879532), T2q, T2p);
+						  T2K = VADD(T2D, T2E);
+						  T2F = VSUB(T2D, T2E);
+					     }
+					     ST(&(x[WS(rs, 23)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 9)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 25)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 7)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 31)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 1)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 15)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
+					     ST(&(x[WS(rs, 17)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
+					}
+					T2L = VFMA(LDK(KP831469612), T2K, T2J);
+					T2P = VFNMS(LDK(KP831469612), T2K, T2J);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    V T2y, T2N, T2C, T2M;
+		    T2y = VADD(T2u, T2x);
+		    T2N = VSUB(T2x, T2u);
+		    T2C = VFMA(LDK(KP923879532), T2B, T2A);
+		    T2M = VFNMS(LDK(KP923879532), T2B, T2A);
+		    {
+			 V T2z, T2H, T2Q, T2O, T2G, T2I;
+			 T2z = VFNMS(LDK(KP831469612), T2y, T2r);
+			 T2H = VFMA(LDK(KP831469612), T2y, T2r);
+			 T2Q = VFNMS(LDK(KP831469612), T2N, T2M);
+			 T2O = VFMA(LDK(KP831469612), T2N, T2M);
+			 T2G = VFNMS(LDK(KP831469612), T2F, T2C);
+			 T2I = VFMA(LDK(KP831469612), T2F, T2C);
+			 ST(&(x[WS(rs, 21)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 11)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 27)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 5)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 29)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 19)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 13)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include t3f.h */
+
+/*
+ * This function contains 244 FP additions, 158 FP multiplications,
+ * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
+ * 90 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T2, T5, T3, T4, Tc, T1C, TP, Tz, Tn, T6, TS, Tf, TK, T7, T8;
+	       V Tv, T1w, T22, Tg, Tk, T1D, T1R, TC, T18, T12, T1t, TH, TL, TT, T1n;
+	       V T1c;
+	       T2 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 4]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T4 = VZMULJ(T2, T3);
+	       Tc = VZMUL(T2, T3);
+	       T1C = VZMULJ(T2, T5);
+	       TP = VZMULJ(T3, T5);
+	       Tz = VZMUL(T2, T5);
+	       Tn = VZMUL(T3, T5);
+	       T6 = VZMUL(T4, T5);
+	       TS = VZMULJ(Tc, T5);
+	       Tf = VZMULJ(T4, T5);
+	       TK = VZMUL(Tc, T5);
+	       T7 = LDW(&(W[TWVL * 6]));
+	       T8 = VZMULJ(T6, T7);
+	       Tv = VZMULJ(T5, T7);
+	       T1w = VZMULJ(Tn, T7);
+	       T22 = VZMULJ(T3, T7);
+	       Tg = VZMULJ(Tf, T7);
+	       Tk = VZMUL(T2, T7);
+	       T1D = VZMULJ(T1C, T7);
+	       T1R = VZMULJ(Tc, T7);
+	       TC = VZMULJ(T2, T7);
+	       T18 = VZMULJ(TP, T7);
+	       T12 = VZMULJ(Tz, T7);
+	       T1t = VZMUL(Tc, T7);
+	       TH = VZMUL(T3, T7);
+	       TL = VZMULJ(TK, T7);
+	       TT = VZMULJ(TS, T7);
+	       T1n = VZMULJ(T4, T7);
+	       T1c = VZMUL(T4, T7);
+	       {
+		    V Tb, T25, T2T, T3x, Tr, T1Z, T2W, T3K, TX, T27, T3g, T3z, TG, T28, T3j;
+		    V T3y, T1N, T2v, T3a, T3G, T1V, T2w, T37, T3F, T1j, T2s, T33, T3D, T1r, T2t;
+		    V T30, T3C;
+		    {
+			 V T1, T24, Ta, T21, T23, T9, T20, T2R, T2S;
+			 T1 = LD(&(x[0]), ms, &(x[0]));
+			 T23 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
+			 T24 = VZMULJ(T22, T23);
+			 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
+			 Ta = VZMULJ(T8, T9);
+			 T20 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
+			 T21 = VZMULJ(T1C, T20);
+			 Tb = VSUB(T1, Ta);
+			 T25 = VSUB(T21, T24);
+			 T2R = VADD(T1, Ta);
+			 T2S = VADD(T21, T24);
+			 T2T = VADD(T2R, T2S);
+			 T3x = VSUB(T2R, T2S);
+		    }
+		    {
+			 V Te, Tp, Ti, Tm;
+			 {
+			      V Td, To, Th, Tl;
+			      Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      Te = VZMULJ(Tc, Td);
+			      To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
+			      Tp = VZMULJ(Tn, To);
+			      Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
+			      Ti = VZMULJ(Tg, Th);
+			      Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
+			      Tm = VZMULJ(Tk, Tl);
+			 }
+			 {
+			      V Tj, Tq, T2U, T2V;
+			      Tj = VSUB(Te, Ti);
+			      Tq = VSUB(Tm, Tp);
+			      Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
+			      T1Z = VMUL(LDK(KP707106781), VSUB(Tq, Tj));
+			      T2U = VADD(Te, Ti);
+			      T2V = VADD(Tm, Tp);
+			      T2W = VADD(T2U, T2V);
+			      T3K = VSUB(T2V, T2U);
+			 }
+		    }
+		    {
+			 V TJ, TV, TN, TR;
+			 {
+			      V TI, TU, TM, TQ;
+			      TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
+			      TJ = VZMULJ(TH, TI);
+			      TU = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
+			      TV = VZMULJ(TT, TU);
+			      TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
+			      TN = VZMULJ(TL, TM);
+			      TQ = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			      TR = VZMULJ(TP, TQ);
+			 }
+			 {
+			      V TO, TW, T3e, T3f;
+			      TO = VSUB(TJ, TN);
+			      TW = VSUB(TR, TV);
+			      TX = VFMA(LDK(KP923879532), TO, VMUL(LDK(KP382683432), TW));
+			      T27 = VFNMS(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
+			      T3e = VADD(TJ, TN);
+			      T3f = VADD(TR, TV);
+			      T3g = VADD(T3e, T3f);
+			      T3z = VSUB(T3e, T3f);
+			 }
+		    }
+		    {
+			 V Tu, TE, Tx, TB;
+			 {
+			      V Tt, TD, Tw, TA;
+			      Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Tu = VZMULJ(T4, Tt);
+			      TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
+			      TE = VZMULJ(TC, TD);
+			      Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
+			      Tx = VZMULJ(Tv, Tw);
+			      TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
+			      TB = VZMULJ(Tz, TA);
+			 }
+			 {
+			      V Ty, TF, T3h, T3i;
+			      Ty = VSUB(Tu, Tx);
+			      TF = VSUB(TB, TE);
+			      TG = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
+			      T28 = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
+			      T3h = VADD(Tu, Tx);
+			      T3i = VADD(TB, TE);
+			      T3j = VADD(T3h, T3i);
+			      T3y = VSUB(T3h, T3i);
+			 }
+		    }
+		    {
+			 V T1v, T1y, T1T, T1Q, T1I, T1K, T1L, T1B, T1F, T1G;
+			 {
+			      V T1u, T1x, T1S, T1P;
+			      T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
+			      T1v = VZMULJ(T1t, T1u);
+			      T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
+			      T1y = VZMULJ(T1w, T1x);
+			      T1S = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
+			      T1T = VZMULJ(T1R, T1S);
+			      T1P = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			      T1Q = VZMULJ(Tf, T1P);
+			      {
+				   V T1H, T1J, T1A, T1E;
+				   T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
+				   T1I = VZMULJ(T7, T1H);
+				   T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
+				   T1K = VZMULJ(T6, T1J);
+				   T1L = VSUB(T1I, T1K);
+				   T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+				   T1B = VZMULJ(T3, T1A);
+				   T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
+				   T1F = VZMULJ(T1D, T1E);
+				   T1G = VSUB(T1B, T1F);
+			      }
+			 }
+			 {
+			      V T1z, T1M, T38, T39;
+			      T1z = VSUB(T1v, T1y);
+			      T1M = VMUL(LDK(KP707106781), VADD(T1G, T1L));
+			      T1N = VADD(T1z, T1M);
+			      T2v = VSUB(T1z, T1M);
+			      T38 = VADD(T1B, T1F);
+			      T39 = VADD(T1I, T1K);
+			      T3a = VADD(T38, T39);
+			      T3G = VSUB(T39, T38);
+			 }
+			 {
+			      V T1O, T1U, T35, T36;
+			      T1O = VMUL(LDK(KP707106781), VSUB(T1L, T1G));
+			      T1U = VSUB(T1Q, T1T);
+			      T1V = VSUB(T1O, T1U);
+			      T2w = VADD(T1U, T1O);
+			      T35 = VADD(T1v, T1y);
+			      T36 = VADD(T1Q, T1T);
+			      T37 = VADD(T35, T36);
+			      T3F = VSUB(T35, T36);
+			 }
+		    }
+		    {
+			 V T11, T14, T1p, T1m, T1e, T1g, T1h, T17, T1a, T1b;
+			 {
+			      V T10, T13, T1o, T1l;
+			      T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T11 = VZMULJ(T2, T10);
+			      T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
+			      T14 = VZMULJ(T12, T13);
+			      T1o = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
+			      T1p = VZMULJ(T1n, T1o);
+			      T1l = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
+			      T1m = VZMULJ(T5, T1l);
+			      {
+				   V T1d, T1f, T16, T19;
+				   T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
+				   T1e = VZMULJ(T1c, T1d);
+				   T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
+				   T1g = VZMULJ(TK, T1f);
+				   T1h = VSUB(T1e, T1g);
+				   T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+				   T17 = VZMULJ(TS, T16);
+				   T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
+				   T1a = VZMULJ(T18, T19);
+				   T1b = VSUB(T17, T1a);
+			      }
+			 }
+			 {
+			      V T15, T1i, T31, T32;
+			      T15 = VSUB(T11, T14);
+			      T1i = VMUL(LDK(KP707106781), VADD(T1b, T1h));
+			      T1j = VADD(T15, T1i);
+			      T2s = VSUB(T15, T1i);
+			      T31 = VADD(T17, T1a);
+			      T32 = VADD(T1e, T1g);
+			      T33 = VADD(T31, T32);
+			      T3D = VSUB(T32, T31);
+			 }
+			 {
+			      V T1k, T1q, T2Y, T2Z;
+			      T1k = VMUL(LDK(KP707106781), VSUB(T1h, T1b));
+			      T1q = VSUB(T1m, T1p);
+			      T1r = VSUB(T1k, T1q);
+			      T2t = VADD(T1q, T1k);
+			      T2Y = VADD(T11, T14);
+			      T2Z = VADD(T1m, T1p);
+			      T30 = VADD(T2Y, T2Z);
+			      T3C = VSUB(T2Y, T2Z);
+			 }
+		    }
+		    {
+			 V T3r, T3v, T3u, T3w;
+			 {
+			      V T3p, T3q, T3s, T3t;
+			      T3p = VADD(T2T, T2W);
+			      T3q = VADD(T3j, T3g);
+			      T3r = VADD(T3p, T3q);
+			      T3v = VSUB(T3p, T3q);
+			      T3s = VADD(T30, T33);
+			      T3t = VADD(T37, T3a);
+			      T3u = VADD(T3s, T3t);
+			      T3w = VBYI(VSUB(T3t, T3s));
+			 }
+			 ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
+			 ST(&(x[WS(rs, 8)]), VADD(T3v, T3w), ms, &(x[0]));
+			 ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
+			 ST(&(x[WS(rs, 24)]), VSUB(T3v, T3w), ms, &(x[0]));
+		    }
+		    {
+			 V T2X, T3k, T3c, T3l, T34, T3b;
+			 T2X = VSUB(T2T, T2W);
+			 T3k = VSUB(T3g, T3j);
+			 T34 = VSUB(T30, T33);
+			 T3b = VSUB(T37, T3a);
+			 T3c = VMUL(LDK(KP707106781), VADD(T34, T3b));
+			 T3l = VMUL(LDK(KP707106781), VSUB(T3b, T34));
+			 {
+			      V T3d, T3m, T3n, T3o;
+			      T3d = VADD(T2X, T3c);
+			      T3m = VBYI(VADD(T3k, T3l));
+			      ST(&(x[WS(rs, 28)]), VSUB(T3d, T3m), ms, &(x[0]));
+			      ST(&(x[WS(rs, 4)]), VADD(T3d, T3m), ms, &(x[0]));
+			      T3n = VSUB(T2X, T3c);
+			      T3o = VBYI(VSUB(T3l, T3k));
+			      ST(&(x[WS(rs, 20)]), VSUB(T3n, T3o), ms, &(x[0]));
+			      ST(&(x[WS(rs, 12)]), VADD(T3n, T3o), ms, &(x[0]));
+			 }
+		    }
+		    {
+			 V T3B, T3W, T3M, T3U, T3I, T3T, T3P, T3X, T3A, T3L;
+			 T3A = VMUL(LDK(KP707106781), VADD(T3y, T3z));
+			 T3B = VADD(T3x, T3A);
+			 T3W = VSUB(T3x, T3A);
+			 T3L = VMUL(LDK(KP707106781), VSUB(T3z, T3y));
+			 T3M = VADD(T3K, T3L);
+			 T3U = VSUB(T3L, T3K);
+			 {
+			      V T3E, T3H, T3N, T3O;
+			      T3E = VFMA(LDK(KP923879532), T3C, VMUL(LDK(KP382683432), T3D));
+			      T3H = VFNMS(LDK(KP382683432), T3G, VMUL(LDK(KP923879532), T3F));
+			      T3I = VADD(T3E, T3H);
+			      T3T = VSUB(T3H, T3E);
+			      T3N = VFNMS(LDK(KP382683432), T3C, VMUL(LDK(KP923879532), T3D));
+			      T3O = VFMA(LDK(KP382683432), T3F, VMUL(LDK(KP923879532), T3G));
+			      T3P = VADD(T3N, T3O);
+			      T3X = VSUB(T3O, T3N);
+			 }
+			 {
+			      V T3J, T3Q, T3Z, T40;
+			      T3J = VADD(T3B, T3I);
+			      T3Q = VBYI(VADD(T3M, T3P));
+			      ST(&(x[WS(rs, 30)]), VSUB(T3J, T3Q), ms, &(x[0]));
+			      ST(&(x[WS(rs, 2)]), VADD(T3J, T3Q), ms, &(x[0]));
+			      T3Z = VBYI(VADD(T3U, T3T));
+			      T40 = VADD(T3W, T3X);
+			      ST(&(x[WS(rs, 6)]), VADD(T3Z, T40), ms, &(x[0]));
+			      ST(&(x[WS(rs, 26)]), VSUB(T40, T3Z), ms, &(x[0]));
+			 }
+			 {
+			      V T3R, T3S, T3V, T3Y;
+			      T3R = VSUB(T3B, T3I);
+			      T3S = VBYI(VSUB(T3P, T3M));
+			      ST(&(x[WS(rs, 18)]), VSUB(T3R, T3S), ms, &(x[0]));
+			      ST(&(x[WS(rs, 14)]), VADD(T3R, T3S), ms, &(x[0]));
+			      T3V = VBYI(VSUB(T3T, T3U));
+			      T3Y = VSUB(T3W, T3X);
+			      ST(&(x[WS(rs, 10)]), VADD(T3V, T3Y), ms, &(x[0]));
+			      ST(&(x[WS(rs, 22)]), VSUB(T3Y, T3V), ms, &(x[0]));
+			 }
+		    }
+		    {
+			 V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
+			 {
+			      V Ts, TY, T2b, T2c;
+			      Ts = VADD(Tb, Tr);
+			      TY = VADD(TG, TX);
+			      TZ = VADD(Ts, TY);
+			      T2k = VSUB(Ts, TY);
+			      T2b = VFNMS(LDK(KP195090322), T1j, VMUL(LDK(KP980785280), T1r));
+			      T2c = VFMA(LDK(KP195090322), T1N, VMUL(LDK(KP980785280), T1V));
+			      T2d = VADD(T2b, T2c);
+			      T2l = VSUB(T2c, T2b);
+			 }
+			 {
+			      V T1s, T1W, T26, T29;
+			      T1s = VFMA(LDK(KP980785280), T1j, VMUL(LDK(KP195090322), T1r));
+			      T1W = VFNMS(LDK(KP195090322), T1V, VMUL(LDK(KP980785280), T1N));
+			      T1X = VADD(T1s, T1W);
+			      T2h = VSUB(T1W, T1s);
+			      T26 = VSUB(T1Z, T25);
+			      T29 = VSUB(T27, T28);
+			      T2a = VADD(T26, T29);
+			      T2i = VSUB(T29, T26);
+			 }
+			 {
+			      V T1Y, T2e, T2n, T2o;
+			      T1Y = VADD(TZ, T1X);
+			      T2e = VBYI(VADD(T2a, T2d));
+			      ST(&(x[WS(rs, 31)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
+			      T2n = VBYI(VADD(T2i, T2h));
+			      T2o = VADD(T2k, T2l);
+			      ST(&(x[WS(rs, 7)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 25)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T2f, T2g, T2j, T2m;
+			      T2f = VSUB(TZ, T1X);
+			      T2g = VBYI(VSUB(T2d, T2a));
+			      ST(&(x[WS(rs, 17)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 15)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
+			      T2j = VBYI(VSUB(T2h, T2i));
+			      T2m = VSUB(T2k, T2l);
+			      ST(&(x[WS(rs, 9)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 23)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+		    {
+			 V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
+			 {
+			      V T2p, T2q, T2D, T2E;
+			      T2p = VSUB(Tb, Tr);
+			      T2q = VADD(T28, T27);
+			      T2r = VADD(T2p, T2q);
+			      T2M = VSUB(T2p, T2q);
+			      T2D = VFNMS(LDK(KP555570233), T2s, VMUL(LDK(KP831469612), T2t));
+			      T2E = VFMA(LDK(KP555570233), T2v, VMUL(LDK(KP831469612), T2w));
+			      T2F = VADD(T2D, T2E);
+			      T2N = VSUB(T2E, T2D);
+			 }
+			 {
+			      V T2u, T2x, T2A, T2B;
+			      T2u = VFMA(LDK(KP831469612), T2s, VMUL(LDK(KP555570233), T2t));
+			      T2x = VFNMS(LDK(KP555570233), T2w, VMUL(LDK(KP831469612), T2v));
+			      T2y = VADD(T2u, T2x);
+			      T2J = VSUB(T2x, T2u);
+			      T2A = VADD(T25, T1Z);
+			      T2B = VSUB(TX, TG);
+			      T2C = VADD(T2A, T2B);
+			      T2K = VSUB(T2B, T2A);
+			 }
+			 {
+			      V T2z, T2G, T2P, T2Q;
+			      T2z = VADD(T2r, T2y);
+			      T2G = VBYI(VADD(T2C, T2F));
+			      ST(&(x[WS(rs, 29)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 3)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
+			      T2P = VBYI(VADD(T2K, T2J));
+			      T2Q = VADD(T2M, T2N);
+			      ST(&(x[WS(rs, 5)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 27)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
+			 }
+			 {
+			      V T2H, T2I, T2L, T2O;
+			      T2H = VSUB(T2r, T2y);
+			      T2I = VBYI(VSUB(T2F, T2C));
+			      ST(&(x[WS(rs, 19)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 13)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
+			      T2L = VBYI(VSUB(T2J, T2K));
+			      T2O = VSUB(T2M, T2N);
+			      ST(&(x[WS(rs, 11)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 21)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 9),
+     VTW(0, 27),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_32) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 4 -name t3fv_4 -include t3f.h */
+
+/*
+ * This function contains 12 FP additions, 10 FP multiplications,
+ * (or, 10 additions, 8 multiplications, 2 fused multiply/add),
+ * 16 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T2, T3, T1, Ta, T5, T8;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       Ta = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       {
+		    V T4, Tb, T9, T6;
+		    T4 = VZMULJ(T2, T3);
+		    Tb = VZMULJ(T3, Ta);
+		    T9 = VZMULJ(T2, T8);
+		    T6 = VZMULJ(T4, T5);
+		    {
+			 V Tc, Te, T7, Td;
+			 Tc = VSUB(T9, Tb);
+			 Te = VADD(T9, Tb);
+			 T7 = VSUB(T1, T6);
+			 Td = VADD(T1, T6);
+			 ST(&(x[0]), VADD(Td, Te), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VSUB(Td, Te), ms, &(x[0]));
+			 ST(&(x[WS(rs, 3)]), VFMAI(Tc, T7), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 1)]), VFNMSI(Tc, T7), ms, &(x[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t3fv_4"), twinstr, &GENUS, {10, 8, 2, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 4 -name t3fv_4 -include t3f.h */
+
+/*
+ * This function contains 12 FP additions, 8 FP multiplications,
+ * (or, 12 additions, 8 multiplications, 0 fused multiply/add),
+ * 16 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(4, rs)) {
+	       V T2, T3, T4;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       T4 = VZMULJ(T2, T3);
+	       {
+		    V T1, Tb, T6, T9, Ta, T5, T8;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    Ta = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+		    Tb = VZMULJ(T3, Ta);
+		    T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+		    T6 = VZMULJ(T4, T5);
+		    T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+		    T9 = VZMULJ(T2, T8);
+		    {
+			 V T7, Tc, Td, Te;
+			 T7 = VSUB(T1, T6);
+			 Tc = VBYI(VSUB(T9, Tb));
+			 ST(&(x[WS(rs, 1)]), VSUB(T7, Tc), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VADD(T7, Tc), ms, &(x[WS(rs, 1)]));
+			 Td = VADD(T1, T6);
+			 Te = VADD(T9, Tb);
+			 ST(&(x[WS(rs, 2)]), VSUB(Td, Te), ms, &(x[0]));
+			 ST(&(x[0]), VADD(Td, Te), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 4, XSIMD_STRING("t3fv_4"), twinstr, &GENUS, {12, 8, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_4) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:54 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 5 -name t3fv_5 -include t3f.h */
+
+/*
+ * This function contains 22 FP additions, 23 FP multiplications,
+ * (or, 13 additions, 14 multiplications, 9 fused multiply/add),
+ * 30 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T2, T5, T1, T3, Td, T7, Tb;
+	       T2 = LDW(&(W[0]));
+	       T5 = LDW(&(W[TWVL * 2]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T3 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       Td = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T7 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       {
+		    V Ta, T6, T4, Te, Tc, T8;
+		    Ta = VZMULJ(T2, T5);
+		    T6 = VZMUL(T2, T5);
+		    T4 = VZMULJ(T2, T3);
+		    Te = VZMULJ(T5, Td);
+		    Tc = VZMULJ(Ta, Tb);
+		    T8 = VZMULJ(T6, T7);
+		    {
+			 V Tf, Tl, T9, Tk;
+			 Tf = VADD(Tc, Te);
+			 Tl = VSUB(Tc, Te);
+			 T9 = VADD(T4, T8);
+			 Tk = VSUB(T4, T8);
+			 {
+			      V Ti, Tg, To, Tm, Th, Tn, Tj;
+			      Ti = VSUB(T9, Tf);
+			      Tg = VADD(T9, Tf);
+			      To = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tk, Tl));
+			      Tm = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tl, Tk));
+			      Th = VFNMS(LDK(KP250000000), Tg, T1);
+			      ST(&(x[0]), VADD(T1, Tg), ms, &(x[0]));
+			      Tn = VFNMS(LDK(KP559016994), Ti, Th);
+			      Tj = VFMA(LDK(KP559016994), Ti, Th);
+			      ST(&(x[WS(rs, 2)]), VFMAI(To, Tn), ms, &(x[0]));
+			      ST(&(x[WS(rs, 3)]), VFNMSI(To, Tn), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 4)]), VFMAI(Tm, Tj), ms, &(x[0]));
+			      ST(&(x[WS(rs, 1)]), VFNMSI(Tm, Tj), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t3fv_5"), twinstr, &GENUS, {13, 14, 9, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 5 -name t3fv_5 -include t3f.h */
+
+/*
+ * This function contains 22 FP additions, 18 FP multiplications,
+ * (or, 19 additions, 15 multiplications, 3 fused multiply/add),
+ * 24 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(5, rs)) {
+	       V T1, T4, T5, T9;
+	       T1 = LDW(&(W[0]));
+	       T4 = LDW(&(W[TWVL * 2]));
+	       T5 = VZMUL(T1, T4);
+	       T9 = VZMULJ(T1, T4);
+	       {
+		    V Tg, Tk, Tl, T8, Te, Th;
+		    Tg = LD(&(x[0]), ms, &(x[0]));
+		    {
+			 V T3, Td, T7, Tb;
+			 {
+			      V T2, Tc, T6, Ta;
+			      T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			      T3 = VZMULJ(T1, T2);
+			      Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			      Td = VZMULJ(T4, Tc);
+			      T6 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+			      T7 = VZMULJ(T5, T6);
+			      Ta = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			      Tb = VZMULJ(T9, Ta);
+			 }
+			 Tk = VSUB(T3, T7);
+			 Tl = VSUB(Tb, Td);
+			 T8 = VADD(T3, T7);
+			 Te = VADD(Tb, Td);
+			 Th = VADD(T8, Te);
+		    }
+		    ST(&(x[0]), VADD(Tg, Th), ms, &(x[0]));
+		    {
+			 V Tm, Tn, Tj, To, Tf, Ti;
+			 Tm = VBYI(VFMA(LDK(KP951056516), Tk, VMUL(LDK(KP587785252), Tl)));
+			 Tn = VBYI(VFNMS(LDK(KP587785252), Tk, VMUL(LDK(KP951056516), Tl)));
+			 Tf = VMUL(LDK(KP559016994), VSUB(T8, Te));
+			 Ti = VFNMS(LDK(KP250000000), Th, Tg);
+			 Tj = VADD(Tf, Ti);
+			 To = VSUB(Ti, Tf);
+			 ST(&(x[WS(rs, 1)]), VSUB(Tj, Tm), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 3)]), VSUB(To, Tn), ms, &(x[WS(rs, 1)]));
+			 ST(&(x[WS(rs, 4)]), VADD(Tm, Tj), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(Tn, To), ms, &(x[0]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 5, XSIMD_STRING("t3fv_5"), twinstr, &GENUS, {19, 15, 3, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_5) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/common/t3fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/common/t3fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:38:49 EST 2012 */
+
+#include "codelet-dft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 8 -name t3fv_8 -include t3f.h */
+
+/*
+ * This function contains 37 FP additions, 32 FP multiplications,
+ * (or, 27 additions, 22 multiplications, 10 fused multiply/add),
+ * 43 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T3, Tb, T1, T5, Tn, Tq, T8, Td, T4, Ta, Tp, Tg, Ti, T9;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       Tb = LDW(&(W[TWVL * 4]));
+	       T1 = LD(&(x[0]), ms, &(x[0]));
+	       T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+	       Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+	       Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+	       T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+	       Td = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+	       T4 = VZMUL(T2, T3);
+	       Ta = VZMULJ(T2, T3);
+	       Tp = VZMULJ(T2, Tb);
+	       Tg = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+	       Ti = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+	       T9 = VZMULJ(T2, T8);
+	       {
+		    V T6, To, Tc, Tr, Th, Tj;
+		    T6 = VZMULJ(T4, T5);
+		    To = VZMULJ(Ta, Tn);
+		    Tc = VZMULJ(Ta, Tb);
+		    Tr = VZMULJ(Tp, Tq);
+		    Th = VZMULJ(Tb, Tg);
+		    Tj = VZMULJ(T3, Ti);
+		    {
+			 V Tx, T7, Te, Ts, Ty, Tk, TB;
+			 Tx = VADD(T1, T6);
+			 T7 = VSUB(T1, T6);
+			 Te = VZMULJ(Tc, Td);
+			 Ts = VSUB(To, Tr);
+			 Ty = VADD(To, Tr);
+			 Tk = VSUB(Th, Tj);
+			 TB = VADD(Th, Tj);
+			 {
+			      V Tf, TA, Tz, TD;
+			      Tf = VSUB(T9, Te);
+			      TA = VADD(T9, Te);
+			      Tz = VADD(Tx, Ty);
+			      TD = VSUB(Tx, Ty);
+			      {
+				   V TC, TE, Tl, Tt;
+				   TC = VADD(TA, TB);
+				   TE = VSUB(TB, TA);
+				   Tl = VADD(Tf, Tk);
+				   Tt = VSUB(Tk, Tf);
+				   {
+					V Tu, Tw, Tm, Tv;
+					ST(&(x[WS(rs, 2)]), VFMAI(TE, TD), ms, &(x[0]));
+					ST(&(x[WS(rs, 6)]), VFNMSI(TE, TD), ms, &(x[0]));
+					ST(&(x[0]), VADD(Tz, TC), ms, &(x[0]));
+					ST(&(x[WS(rs, 4)]), VSUB(Tz, TC), ms, &(x[0]));
+					Tu = VFNMS(LDK(KP707106781), Tt, Ts);
+					Tw = VFMA(LDK(KP707106781), Tt, Ts);
+					Tm = VFMA(LDK(KP707106781), Tl, T7);
+					Tv = VFNMS(LDK(KP707106781), Tl, T7);
+					ST(&(x[WS(rs, 5)]), VFNMSI(Tw, Tv), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 3)]), VFMAI(Tw, Tv), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 7)]), VFMAI(Tu, Tm), ms, &(x[WS(rs, 1)]));
+					ST(&(x[WS(rs, 1)]), VFNMSI(Tu, Tm), ms, &(x[WS(rs, 1)]));
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t3fv_8"), twinstr, &GENUS, {27, 22, 10, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 8 -name t3fv_8 -include t3f.h */
+
+/*
+ * This function contains 37 FP additions, 24 FP multiplications,
+ * (or, 37 additions, 24 multiplications, 0 fused multiply/add),
+ * 31 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "t3f.h"
+
+static void t3fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  R *x;
+	  x = ri;
+	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T3, Ta, T4, Tb, Tc, Tq;
+	       T2 = LDW(&(W[0]));
+	       T3 = LDW(&(W[TWVL * 2]));
+	       Ta = VZMULJ(T2, T3);
+	       T4 = VZMUL(T2, T3);
+	       Tb = LDW(&(W[TWVL * 4]));
+	       Tc = VZMULJ(Ta, Tb);
+	       Tq = VZMULJ(T2, Tb);
+	       {
+		    V T7, Tx, Tt, Ty, Tf, TA, Tk, TB, T1, T6, T5;
+		    T1 = LD(&(x[0]), ms, &(x[0]));
+		    T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
+		    T6 = VZMULJ(T4, T5);
+		    T7 = VSUB(T1, T6);
+		    Tx = VADD(T1, T6);
+		    {
+			 V Tp, Ts, To, Tr;
+			 To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
+			 Tp = VZMULJ(Ta, To);
+			 Tr = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
+			 Ts = VZMULJ(Tq, Tr);
+			 Tt = VSUB(Tp, Ts);
+			 Ty = VADD(Tp, Ts);
+		    }
+		    {
+			 V T9, Te, T8, Td;
+			 T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
+			 T9 = VZMULJ(T2, T8);
+			 Td = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
+			 Te = VZMULJ(Tc, Td);
+			 Tf = VSUB(T9, Te);
+			 TA = VADD(T9, Te);
+		    }
+		    {
+			 V Th, Tj, Tg, Ti;
+			 Tg = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
+			 Th = VZMULJ(Tb, Tg);
+			 Ti = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
+			 Tj = VZMULJ(T3, Ti);
+			 Tk = VSUB(Th, Tj);
+			 TB = VADD(Th, Tj);
+		    }
+		    {
+			 V Tz, TC, TD, TE;
+			 Tz = VADD(Tx, Ty);
+			 TC = VADD(TA, TB);
+			 ST(&(x[WS(rs, 4)]), VSUB(Tz, TC), ms, &(x[0]));
+			 ST(&(x[0]), VADD(Tz, TC), ms, &(x[0]));
+			 TD = VSUB(Tx, Ty);
+			 TE = VBYI(VSUB(TB, TA));
+			 ST(&(x[WS(rs, 6)]), VSUB(TD, TE), ms, &(x[0]));
+			 ST(&(x[WS(rs, 2)]), VADD(TD, TE), ms, &(x[0]));
+			 {
+			      V Tm, Tv, Tu, Tw, Tl, Tn;
+			      Tl = VMUL(LDK(KP707106781), VADD(Tf, Tk));
+			      Tm = VADD(T7, Tl);
+			      Tv = VSUB(T7, Tl);
+			      Tn = VMUL(LDK(KP707106781), VSUB(Tk, Tf));
+			      Tu = VBYI(VSUB(Tn, Tt));
+			      Tw = VBYI(VADD(Tt, Tn));
+			      ST(&(x[WS(rs, 7)]), VSUB(Tm, Tu), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 3)]), VADD(Tv, Tw), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 1)]), VADD(Tm, Tu), ms, &(x[WS(rs, 1)]));
+			      ST(&(x[WS(rs, 5)]), VSUB(Tv, Tw), ms, &(x[WS(rs, 1)]));
+			 }
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(0, 1),
+     VTW(0, 3),
+     VTW(0, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const ct_desc desc = { 8, XSIMD_STRING("t3fv_8"), twinstr, &GENUS, {37, 24, 0, 0}, 0, 0, 0 };
+
+void XSIMD(codelet_t3fv_8) (planner *p) {
+     X(kdft_dit_register) (p, t3fv_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/n1b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/n1b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define GENUS XSIMD(dft_n1bsimd_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/n1f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/n1f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define GENUS XSIMD(dft_n1fsimd_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/n2b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/n2b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+
+#define GENUS XSIMD(dft_n2bsimd_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/n2f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/n2f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+
+#define GENUS XSIMD(dft_n2fsimd_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/n2s.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/n2s.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+
+#define GENUS XSIMD(dft_n2ssimd_genus)
+extern const kdft_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(NEON_CFLAGS)
+SIMD_HEADER=simd-neon.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_NEON
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_neon_codelets.la
+libdft_neon_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,893 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/dft/simd/codlist.mk \
+	$(top_srcdir)/dft/simd/simd.mk
+subdir = dft/simd/neon
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_neon_codelets_la_LIBADD =
+am__libdft_neon_codelets_la_SOURCES_DIST = n1fv_2.c n1fv_3.c n1fv_4.c \
+	n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c n1fv_9.c n1fv_10.c \
+	n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c n1fv_16.c \
+	n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c n1bv_2.c \
+	n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c n1bv_9.c \
+	n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c \
+	n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c \
+	n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c \
+	n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c n2bv_2.c \
+	n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c n2bv_14.c \
+	n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c n2sv_4.c n2sv_8.c \
+	n2sv_16.c n2sv_32.c n2sv_64.c t1fuv_2.c t1fuv_3.c t1fuv_4.c \
+	t1fuv_5.c t1fuv_6.c t1fuv_7.c t1fuv_8.c t1fuv_9.c t1fuv_10.c \
+	t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c \
+	t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c \
+	t1fv_64.c t1fv_20.c t1fv_25.c t2fv_2.c t2fv_4.c t2fv_8.c \
+	t2fv_16.c t2fv_32.c t2fv_64.c t2fv_5.c t2fv_10.c t2fv_20.c \
+	t2fv_25.c t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c \
+	t3fv_10.c t3fv_20.c t3fv_25.c t1buv_2.c t1buv_3.c t1buv_4.c \
+	t1buv_5.c t1buv_6.c t1buv_7.c t1buv_8.c t1buv_9.c t1buv_10.c \
+	t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c \
+	t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c \
+	t1bv_64.c t1bv_20.c t1bv_25.c t2bv_2.c t2bv_4.c t2bv_8.c \
+	t2bv_16.c t2bv_32.c t2bv_64.c t2bv_5.c t2bv_10.c t2bv_20.c \
+	t2bv_25.c t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c \
+	t3bv_10.c t3bv_20.c t3bv_25.c t1sv_2.c t1sv_4.c t1sv_8.c \
+	t1sv_16.c t1sv_32.c t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c \
+	q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c q1bv_2.c q1bv_4.c q1bv_5.c \
+	q1bv_8.c genus.c codlist.c
+am__objects_1 = n1fv_2.lo n1fv_3.lo n1fv_4.lo n1fv_5.lo n1fv_6.lo \
+	n1fv_7.lo n1fv_8.lo n1fv_9.lo n1fv_10.lo n1fv_11.lo n1fv_12.lo \
+	n1fv_13.lo n1fv_14.lo n1fv_15.lo n1fv_16.lo n1fv_32.lo \
+	n1fv_64.lo n1fv_128.lo n1fv_20.lo n1fv_25.lo
+am__objects_2 = n1bv_2.lo n1bv_3.lo n1bv_4.lo n1bv_5.lo n1bv_6.lo \
+	n1bv_7.lo n1bv_8.lo n1bv_9.lo n1bv_10.lo n1bv_11.lo n1bv_12.lo \
+	n1bv_13.lo n1bv_14.lo n1bv_15.lo n1bv_16.lo n1bv_32.lo \
+	n1bv_64.lo n1bv_128.lo n1bv_20.lo n1bv_25.lo
+am__objects_3 = n2fv_2.lo n2fv_4.lo n2fv_6.lo n2fv_8.lo n2fv_10.lo \
+	n2fv_12.lo n2fv_14.lo n2fv_16.lo n2fv_32.lo n2fv_64.lo \
+	n2fv_20.lo
+am__objects_4 = n2bv_2.lo n2bv_4.lo n2bv_6.lo n2bv_8.lo n2bv_10.lo \
+	n2bv_12.lo n2bv_14.lo n2bv_16.lo n2bv_32.lo n2bv_64.lo \
+	n2bv_20.lo
+am__objects_5 = n2sv_4.lo n2sv_8.lo n2sv_16.lo n2sv_32.lo n2sv_64.lo
+am__objects_6 = t1fuv_2.lo t1fuv_3.lo t1fuv_4.lo t1fuv_5.lo t1fuv_6.lo \
+	t1fuv_7.lo t1fuv_8.lo t1fuv_9.lo t1fuv_10.lo
+am__objects_7 = t1fv_2.lo t1fv_3.lo t1fv_4.lo t1fv_5.lo t1fv_6.lo \
+	t1fv_7.lo t1fv_8.lo t1fv_9.lo t1fv_10.lo t1fv_12.lo t1fv_15.lo \
+	t1fv_16.lo t1fv_32.lo t1fv_64.lo t1fv_20.lo t1fv_25.lo
+am__objects_8 = t2fv_2.lo t2fv_4.lo t2fv_8.lo t2fv_16.lo t2fv_32.lo \
+	t2fv_64.lo t2fv_5.lo t2fv_10.lo t2fv_20.lo t2fv_25.lo
+am__objects_9 = t3fv_4.lo t3fv_8.lo t3fv_16.lo t3fv_32.lo t3fv_5.lo \
+	t3fv_10.lo t3fv_20.lo t3fv_25.lo
+am__objects_10 = t1buv_2.lo t1buv_3.lo t1buv_4.lo t1buv_5.lo \
+	t1buv_6.lo t1buv_7.lo t1buv_8.lo t1buv_9.lo t1buv_10.lo
+am__objects_11 = t1bv_2.lo t1bv_3.lo t1bv_4.lo t1bv_5.lo t1bv_6.lo \
+	t1bv_7.lo t1bv_8.lo t1bv_9.lo t1bv_10.lo t1bv_12.lo t1bv_15.lo \
+	t1bv_16.lo t1bv_32.lo t1bv_64.lo t1bv_20.lo t1bv_25.lo
+am__objects_12 = t2bv_2.lo t2bv_4.lo t2bv_8.lo t2bv_16.lo t2bv_32.lo \
+	t2bv_64.lo t2bv_5.lo t2bv_10.lo t2bv_20.lo t2bv_25.lo
+am__objects_13 = t3bv_4.lo t3bv_8.lo t3bv_16.lo t3bv_32.lo t3bv_5.lo \
+	t3bv_10.lo t3bv_20.lo t3bv_25.lo
+am__objects_14 = t1sv_2.lo t1sv_4.lo t1sv_8.lo t1sv_16.lo t1sv_32.lo
+am__objects_15 = t2sv_4.lo t2sv_8.lo t2sv_16.lo t2sv_32.lo
+am__objects_16 = q1fv_2.lo q1fv_4.lo q1fv_5.lo q1fv_8.lo
+am__objects_17 = q1bv_2.lo q1bv_4.lo q1bv_5.lo q1bv_8.lo
+am__objects_18 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8) $(am__objects_9) \
+	$(am__objects_10) $(am__objects_11) $(am__objects_12) \
+	$(am__objects_13) $(am__objects_14) $(am__objects_15) \
+	$(am__objects_16) $(am__objects_17)
+am__objects_19 = $(am__objects_18) genus.lo codlist.lo
+@HAVE_NEON_TRUE@am__objects_20 = $(am__objects_19)
+@HAVE_NEON_TRUE@am_libdft_neon_codelets_la_OBJECTS =  \
+@HAVE_NEON_TRUE@	$(am__objects_20)
+libdft_neon_codelets_la_OBJECTS =  \
+	$(am_libdft_neon_codelets_la_OBJECTS)
+@HAVE_NEON_TRUE@am_libdft_neon_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_neon_codelets_la_SOURCES)
+DIST_SOURCES = $(am__libdft_neon_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(NEON_CFLAGS)
+SIMD_HEADER = simd-neon.h
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
+
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_NEON_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_NEON_TRUE@noinst_LTLIBRARIES = libdft_neon_codelets.la
+@HAVE_NEON_TRUE@libdft_neon_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/neon/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/neon/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_neon_codelets.la: $(libdft_neon_codelets_la_OBJECTS) $(libdft_neon_codelets_la_DEPENDENCIES) $(EXTRA_libdft_neon_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_libdft_neon_codelets_la_rpath) $(libdft_neon_codelets_la_OBJECTS) $(libdft_neon_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2sv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2sv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2sv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/n2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/n2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/n2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/q1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/q1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/q1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1buv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1buv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1buv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fuv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fuv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fuv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1sv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1sv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1sv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t1sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t1sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t1sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/neon/t3fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/neon/t3fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/t3fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/q1b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/q1b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_q1bsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/q1f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/q1f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_q1fsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/simd.mk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/simd.mk	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,12 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SSE2_CFLAGS)
+SIMD_HEADER=simd-sse2.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SSE2
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sse2_codelets.la
+libdft_sse2_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,893 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of DFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/dft/simd/codlist.mk \
+	$(top_srcdir)/dft/simd/simd.mk
+subdir = dft/simd/sse2
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libdft_sse2_codelets_la_LIBADD =
+am__libdft_sse2_codelets_la_SOURCES_DIST = n1fv_2.c n1fv_3.c n1fv_4.c \
+	n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c n1fv_9.c n1fv_10.c \
+	n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c n1fv_16.c \
+	n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c n1bv_2.c \
+	n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c n1bv_9.c \
+	n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c \
+	n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c \
+	n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c \
+	n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c n2bv_2.c \
+	n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c n2bv_14.c \
+	n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c n2sv_4.c n2sv_8.c \
+	n2sv_16.c n2sv_32.c n2sv_64.c t1fuv_2.c t1fuv_3.c t1fuv_4.c \
+	t1fuv_5.c t1fuv_6.c t1fuv_7.c t1fuv_8.c t1fuv_9.c t1fuv_10.c \
+	t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c \
+	t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c \
+	t1fv_64.c t1fv_20.c t1fv_25.c t2fv_2.c t2fv_4.c t2fv_8.c \
+	t2fv_16.c t2fv_32.c t2fv_64.c t2fv_5.c t2fv_10.c t2fv_20.c \
+	t2fv_25.c t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c \
+	t3fv_10.c t3fv_20.c t3fv_25.c t1buv_2.c t1buv_3.c t1buv_4.c \
+	t1buv_5.c t1buv_6.c t1buv_7.c t1buv_8.c t1buv_9.c t1buv_10.c \
+	t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c \
+	t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c \
+	t1bv_64.c t1bv_20.c t1bv_25.c t2bv_2.c t2bv_4.c t2bv_8.c \
+	t2bv_16.c t2bv_32.c t2bv_64.c t2bv_5.c t2bv_10.c t2bv_20.c \
+	t2bv_25.c t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c \
+	t3bv_10.c t3bv_20.c t3bv_25.c t1sv_2.c t1sv_4.c t1sv_8.c \
+	t1sv_16.c t1sv_32.c t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c \
+	q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c q1bv_2.c q1bv_4.c q1bv_5.c \
+	q1bv_8.c genus.c codlist.c
+am__objects_1 = n1fv_2.lo n1fv_3.lo n1fv_4.lo n1fv_5.lo n1fv_6.lo \
+	n1fv_7.lo n1fv_8.lo n1fv_9.lo n1fv_10.lo n1fv_11.lo n1fv_12.lo \
+	n1fv_13.lo n1fv_14.lo n1fv_15.lo n1fv_16.lo n1fv_32.lo \
+	n1fv_64.lo n1fv_128.lo n1fv_20.lo n1fv_25.lo
+am__objects_2 = n1bv_2.lo n1bv_3.lo n1bv_4.lo n1bv_5.lo n1bv_6.lo \
+	n1bv_7.lo n1bv_8.lo n1bv_9.lo n1bv_10.lo n1bv_11.lo n1bv_12.lo \
+	n1bv_13.lo n1bv_14.lo n1bv_15.lo n1bv_16.lo n1bv_32.lo \
+	n1bv_64.lo n1bv_128.lo n1bv_20.lo n1bv_25.lo
+am__objects_3 = n2fv_2.lo n2fv_4.lo n2fv_6.lo n2fv_8.lo n2fv_10.lo \
+	n2fv_12.lo n2fv_14.lo n2fv_16.lo n2fv_32.lo n2fv_64.lo \
+	n2fv_20.lo
+am__objects_4 = n2bv_2.lo n2bv_4.lo n2bv_6.lo n2bv_8.lo n2bv_10.lo \
+	n2bv_12.lo n2bv_14.lo n2bv_16.lo n2bv_32.lo n2bv_64.lo \
+	n2bv_20.lo
+am__objects_5 = n2sv_4.lo n2sv_8.lo n2sv_16.lo n2sv_32.lo n2sv_64.lo
+am__objects_6 = t1fuv_2.lo t1fuv_3.lo t1fuv_4.lo t1fuv_5.lo t1fuv_6.lo \
+	t1fuv_7.lo t1fuv_8.lo t1fuv_9.lo t1fuv_10.lo
+am__objects_7 = t1fv_2.lo t1fv_3.lo t1fv_4.lo t1fv_5.lo t1fv_6.lo \
+	t1fv_7.lo t1fv_8.lo t1fv_9.lo t1fv_10.lo t1fv_12.lo t1fv_15.lo \
+	t1fv_16.lo t1fv_32.lo t1fv_64.lo t1fv_20.lo t1fv_25.lo
+am__objects_8 = t2fv_2.lo t2fv_4.lo t2fv_8.lo t2fv_16.lo t2fv_32.lo \
+	t2fv_64.lo t2fv_5.lo t2fv_10.lo t2fv_20.lo t2fv_25.lo
+am__objects_9 = t3fv_4.lo t3fv_8.lo t3fv_16.lo t3fv_32.lo t3fv_5.lo \
+	t3fv_10.lo t3fv_20.lo t3fv_25.lo
+am__objects_10 = t1buv_2.lo t1buv_3.lo t1buv_4.lo t1buv_5.lo \
+	t1buv_6.lo t1buv_7.lo t1buv_8.lo t1buv_9.lo t1buv_10.lo
+am__objects_11 = t1bv_2.lo t1bv_3.lo t1bv_4.lo t1bv_5.lo t1bv_6.lo \
+	t1bv_7.lo t1bv_8.lo t1bv_9.lo t1bv_10.lo t1bv_12.lo t1bv_15.lo \
+	t1bv_16.lo t1bv_32.lo t1bv_64.lo t1bv_20.lo t1bv_25.lo
+am__objects_12 = t2bv_2.lo t2bv_4.lo t2bv_8.lo t2bv_16.lo t2bv_32.lo \
+	t2bv_64.lo t2bv_5.lo t2bv_10.lo t2bv_20.lo t2bv_25.lo
+am__objects_13 = t3bv_4.lo t3bv_8.lo t3bv_16.lo t3bv_32.lo t3bv_5.lo \
+	t3bv_10.lo t3bv_20.lo t3bv_25.lo
+am__objects_14 = t1sv_2.lo t1sv_4.lo t1sv_8.lo t1sv_16.lo t1sv_32.lo
+am__objects_15 = t2sv_4.lo t2sv_8.lo t2sv_16.lo t2sv_32.lo
+am__objects_16 = q1fv_2.lo q1fv_4.lo q1fv_5.lo q1fv_8.lo
+am__objects_17 = q1bv_2.lo q1bv_4.lo q1bv_5.lo q1bv_8.lo
+am__objects_18 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8) $(am__objects_9) \
+	$(am__objects_10) $(am__objects_11) $(am__objects_12) \
+	$(am__objects_13) $(am__objects_14) $(am__objects_15) \
+	$(am__objects_16) $(am__objects_17)
+am__objects_19 = $(am__objects_18) genus.lo codlist.lo
+@HAVE_SSE2_TRUE@am__objects_20 = $(am__objects_19)
+@HAVE_SSE2_TRUE@am_libdft_sse2_codelets_la_OBJECTS =  \
+@HAVE_SSE2_TRUE@	$(am__objects_20)
+libdft_sse2_codelets_la_OBJECTS =  \
+	$(am_libdft_sse2_codelets_la_OBJECTS)
+@HAVE_SSE2_TRUE@am_libdft_sse2_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libdft_sse2_codelets_la_SOURCES)
+DIST_SOURCES = $(am__libdft_sse2_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(SSE2_CFLAGS)
+SIMD_HEADER = simd-sse2.h
+
+###########################################################################
+# n1fv_<n> is a hard-coded FFTW_FORWARD FFT of size <n>, using SIMD
+N1F = n1fv_2.c n1fv_3.c n1fv_4.c n1fv_5.c n1fv_6.c n1fv_7.c n1fv_8.c	\
+n1fv_9.c n1fv_10.c n1fv_11.c n1fv_12.c n1fv_13.c n1fv_14.c n1fv_15.c	\
+n1fv_16.c n1fv_32.c n1fv_64.c n1fv_128.c n1fv_20.c n1fv_25.c
+
+
+# as above, with restricted input vector stride
+N2F = n2fv_2.c n2fv_4.c n2fv_6.c n2fv_8.c n2fv_10.c n2fv_12.c	\
+n2fv_14.c n2fv_16.c n2fv_32.c n2fv_64.c n2fv_20.c
+
+
+# as above, but FFTW_BACKWARD
+N1B = n1bv_2.c n1bv_3.c n1bv_4.c n1bv_5.c n1bv_6.c n1bv_7.c n1bv_8.c	\
+n1bv_9.c n1bv_10.c n1bv_11.c n1bv_12.c n1bv_13.c n1bv_14.c n1bv_15.c	\
+n1bv_16.c n1bv_32.c n1bv_64.c n1bv_128.c n1bv_20.c n1bv_25.c
+
+N2B = n2bv_2.c n2bv_4.c n2bv_6.c n2bv_8.c n2bv_10.c n2bv_12.c	\
+n2bv_14.c n2bv_16.c n2bv_32.c n2bv_64.c n2bv_20.c
+
+
+# split-complex codelets 
+N2S = n2sv_4.c n2sv_8.c n2sv_16.c n2sv_32.c n2sv_64.c
+
+###########################################################################
+# t1fv_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
+# for an FFTW_FORWARD transform, using SIMD
+T1F = t1fv_2.c t1fv_3.c t1fv_4.c t1fv_5.c t1fv_6.c t1fv_7.c t1fv_8.c	\
+t1fv_9.c t1fv_10.c t1fv_12.c t1fv_15.c t1fv_16.c t1fv_32.c t1fv_64.c	\
+t1fv_20.c t1fv_25.c
+
+
+# same as t1fv_*, but with different twiddle storage scheme
+T2F = t2fv_2.c t2fv_4.c t2fv_8.c t2fv_16.c t2fv_32.c t2fv_64.c	\
+t2fv_5.c t2fv_10.c t2fv_20.c t2fv_25.c
+
+T3F = t3fv_4.c t3fv_8.c t3fv_16.c t3fv_32.c t3fv_5.c t3fv_10.c	\
+t3fv_20.c t3fv_25.c
+
+T1FU = t1fuv_2.c t1fuv_3.c t1fuv_4.c t1fuv_5.c t1fuv_6.c t1fuv_7.c	\
+t1fuv_8.c t1fuv_9.c t1fuv_10.c
+
+
+# as above, but FFTW_BACKWARD
+T1B = t1bv_2.c t1bv_3.c t1bv_4.c t1bv_5.c t1bv_6.c t1bv_7.c t1bv_8.c	\
+t1bv_9.c t1bv_10.c t1bv_12.c t1bv_15.c t1bv_16.c t1bv_32.c t1bv_64.c	\
+t1bv_20.c t1bv_25.c
+
+
+# same as t1bv_*, but with different twiddle storage scheme
+T2B = t2bv_2.c t2bv_4.c t2bv_8.c t2bv_16.c t2bv_32.c t2bv_64.c	\
+t2bv_5.c t2bv_10.c t2bv_20.c t2bv_25.c
+
+T3B = t3bv_4.c t3bv_8.c t3bv_16.c t3bv_32.c t3bv_5.c t3bv_10.c	\
+t3bv_20.c t3bv_25.c
+
+T1BU = t1buv_2.c t1buv_3.c t1buv_4.c t1buv_5.c t1buv_6.c t1buv_7.c	\
+t1buv_8.c t1buv_9.c t1buv_10.c
+
+
+# split-complex codelets
+T1S = t1sv_2.c t1sv_4.c t1sv_8.c t1sv_16.c t1sv_32.c
+T2S = t2sv_4.c t2sv_8.c t2sv_16.c t2sv_32.c
+
+###########################################################################
+# q1fv_<r> is <r> twiddle FFTW_FORWARD FFTs of size <r> (DIF step),
+# where the output is transposed, using SIMD.  This is used for
+# in-place transposes in sizes that are divisible by <r>^2.  These
+# codelets have size ~ <r>^2, so you should probably not use <r>
+# bigger than 8 or so.
+Q1F = q1fv_2.c q1fv_4.c q1fv_5.c q1fv_8.c
+
+# as above, but FFTW_BACKWARD
+Q1B = q1bv_2.c q1bv_4.c q1bv_5.c q1bv_8.c
+
+###########################################################################
+SIMD_CODELETS = $(N1F) $(N1B) $(N2F) $(N2B) $(N2S) $(T1FU) $(T1F)	\
+$(T2F) $(T3F) $(T1BU) $(T1B) $(T2B) $(T3B) $(T1S) $(T2S) $(Q1F) $(Q1B)
+
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/dft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_SSE2_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_SSE2_TRUE@noinst_LTLIBRARIES = libdft_sse2_codelets.la
+@HAVE_SSE2_TRUE@libdft_sse2_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/sse2/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu dft/simd/sse2/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/dft/simd/codlist.mk $(top_srcdir)/dft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libdft_sse2_codelets.la: $(libdft_sse2_codelets_la_OBJECTS) $(libdft_sse2_codelets_la_DEPENDENCIES) $(EXTRA_libdft_sse2_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_libdft_sse2_codelets_la_rpath) $(libdft_sse2_codelets_la_OBJECTS) $(libdft_sse2_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1buv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1bv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fuv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1fv_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2fv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2sv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3bv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t3fv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_11.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_128.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_13.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_14.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2sv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2sv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2sv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/n2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/n2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/n2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/q1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/q1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/q1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1buv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1buv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1buv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1bv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1bv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1bv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fuv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fuv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fuv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_15.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_3.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_7.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1fv_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1fv_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1fv_9.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1sv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1sv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1sv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t1sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t1sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t1sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_64.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2sv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2sv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2sv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2sv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2sv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2sv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2sv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2sv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2sv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t2sv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t2sv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t2sv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3bv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3bv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3bv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_25.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_5.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/sse2/t3fv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/sse2/t3fv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/t3fv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t1b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t1b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_t1bsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t1bu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t1bu.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_t1busimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t1f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t1f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_t1fsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t1fu.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t1fu.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW1
+#define TWVL TWVL1
+#define BYTW BYTW1
+#define BYTWJ BYTWJ1
+
+#define GENUS XSIMD(dft_t1fusimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t2b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t2b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW2
+#define TWVL TWVL2
+#define BYTW BYTW2
+#define BYTWJ BYTWJ2
+
+#define GENUS XSIMD(dft_t2bsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t2f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t2f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW2
+#define TWVL TWVL2
+#define BYTW BYTW2
+#define BYTWJ BYTWJ2
+
+#define GENUS XSIMD(dft_t2fsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t3b.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t3b.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW3
+#define TWVL TWVL3
+#define LDW(x) LDA(x, 0, 0) /* load twiddle factor */
+
+/* same as t1b otherwise */
+#define GENUS XSIMD(dft_t1bsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/t3f.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/t3f.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTW3
+#define TWVL TWVL3
+#define LDW(x) LDA(x, 0, 0) /* load twiddle factor */
+
+/* same as t1f otherwise */
+#define GENUS XSIMD(dft_t1fsimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/simd/ts.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/simd/ts.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#undef LD
+#define LD LDA
+#undef ST
+#define ST STA
+
+#define VTW VTWS
+#define TWVL TWVLS
+#define LDW(x) LDA(x, 0, 0) /* load twiddle factor */
+
+#define GENUS XSIMD(dft_tssimd_genus)
+extern const ct_genus GENUS;
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+/* use the apply() operation for DFT problems */
+void X(dft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_dft *ego = (const plan_dft *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+     ego->apply(ego_, 
+		UNTAINT(p->ri), UNTAINT(p->ii), 
+		UNTAINT(p->ro), UNTAINT(p->io));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/vrank-geq1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/vrank-geq1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+/* Plans for handling vector transform loops.  These are *just* the
+   loops, and rely on child plans for the actual DFTs.
+ 
+   They form a wrapper around solvers that don't have apply functions
+   for non-null vectors.
+ 
+   vrank-geq1 plans also recursively handle the case of multi-dimensional
+   vectors, obviating the need for most solvers to deal with this.  We
+   can also play games here, such as reordering the vector loops.
+ 
+   Each vrank-geq1 plan reduces the vector rank by 1, picking out a
+   dimension determined by the vecloop_dim field of the solver. */
+
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_dft super;
+
+     plan *cld;
+     INT vl;
+     INT ivs, ovs;
+     const S *solver;
+} P;
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     dftapply cldapply = ((plan_dft *) ego->cld)->apply;
+
+     for (i = 0; i < vl; ++i) {
+          cldapply(ego->cld,
+                   ri + i * ivs, ii + i * ivs, ro + i * ovs, io + i * ovs);
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(dft-vrank>=1-x%D/%d%(%p%))",
+ 	      ego->vl, s->vecloop_dim, ego->cld);
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+		       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+
+     return (1
+	     && FINITE_RNK(p->vecsz->rnk)
+	     && p->vecsz->rnk > 0
+
+	     /* do not bother looping over rank-0 problems,
+		since they are handled via rdft */
+	     && p->sz->rnk > 0
+
+	     && pickdim(ego, p->vecsz, p->ri != p->ro, dp)
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_, 
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+     const problem_dft *p;
+
+     if (!applicable0(ego_, p_, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     p = (const problem_dft *) p_;
+
+     if (NO_UGLYP(plnr)) {
+	  /* Heuristic: if the transform is multi-dimensional, and the
+	     vector stride is less than the transform size, then we
+	     probably want to use a rank>=2 plan first in order to combine
+	     this vector with the transform-dimension vectors. */
+	  {
+	       iodim *d = p->vecsz->dims + *dp;
+	       if (1
+		   && p->sz->rnk > 1 
+		   && X(imin)(X(iabs)(d->is), X(iabs)(d->os)) 
+		   < X(tensor_max_index)(p->sz)
+		    )
+		    return 0;
+	  }
+
+	  if (NO_NONTHREADEDP(plnr)) return 0; /* prefer threaded version */
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p;
+     P *pln;
+     plan *cld;
+     int vdim;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_dft *) p_;
+
+     d = p->vecsz->dims + vdim;
+
+     A(d->n > 1);
+     cld = X(mkplan_d)(plnr,
+		       X(mkproblem_dft_d)(
+			    X(tensor_copy)(p->sz),
+			    X(tensor_copy_except)(p->vecsz, vdim),
+			    TAINT(p->ri, d->is), TAINT(p->ii, d->is),
+			    TAINT(p->ro, d->os), TAINT(p->io, d->os)));
+     if (!cld) return (plan *) 0;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     pln->cld = cld;
+     pln->vl = d->n;
+     pln->ivs = d->is;
+     pln->ovs = d->os;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.ops.other = 3.14159; /* magic to prefer codelet loops */
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     if (p->sz->rnk != 1 || (p->sz->dims[0].n > 64))
+	  pln->super.super.pcost = pln->vl * cld->pcost;
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(dft_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/dft/zero.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/dft/zero.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+
+/* fill a complex array with zeros. */
+static void recur(const iodim *dims, int rnk, R *ri, R *ii)
+{
+     if (rnk == RNK_MINFTY)
+          return;
+     else if (rnk == 0)
+          ri[0] = ii[0] = K(0.0);
+     else if (rnk > 0) {
+          INT i, n = dims[0].n;
+          INT is = dims[0].is;
+
+	  if (rnk == 1) {
+	       /* this case is redundant but faster */
+	       for (i = 0; i < n; ++i)
+		    ri[i * is] = ii[i * is] = K(0.0);
+	  } else {
+	       for (i = 0; i < n; ++i)
+		    recur(dims + 1, rnk - 1, ri + i * is, ii + i * is);
+	  }
+     }
+}
+
+
+void X(dft_zerotens)(tensor *sz, R *ri, R *ii)
+{
+     recur(sz->dims, sz->rnk, ri, ii);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,19 @@
+BFNNCONV_SRC = bfnnconv.pl m-ascii.pl m-html.pl m-info.pl m-lout.pl m-post.pl
+
+FAQ = fftw-faq.ascii fftw-faq.html
+EXTRA_DIST = fftw-faq.bfnn $(FAQ) $(BFNNCONV_SRC) html.refs
+
+html.refs2: html.refs
+	cp -f ${srcdir}/html.refs html.refs2
+
+$(FAQ): $(BFNNCONV_SRC) fftw-faq.bfnn html.refs2
+	@echo converting...
+	perl -I${srcdir} ${srcdir}/bfnnconv.pl ${srcdir}/fftw-faq.bfnn
+	@echo converting again...
+	perl -I${srcdir} ${srcdir}/bfnnconv.pl ${srcdir}/fftw-faq.bfnn
+
+faq: $(FAQ)
+
+clean-local:
+	rm -f *~ core a.out *.lout *.ps *.info *.ascii *.xrefdb *.post
+	rm -rf *.html html.refs2
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,435 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = doc/FAQ
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+BFNNCONV_SRC = bfnnconv.pl m-ascii.pl m-html.pl m-info.pl m-lout.pl m-post.pl
+FAQ = fftw-faq.ascii fftw-faq.html
+EXTRA_DIST = fftw-faq.bfnn $(FAQ) $(BFNNCONV_SRC) html.refs
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu doc/FAQ/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu doc/FAQ/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-local mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	clean-local distclean distclean-generic distclean-libtool \
+	distdir dvi dvi-am html html-am info info-am install \
+	install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	uninstall uninstall-am
+
+
+html.refs2: html.refs
+	cp -f ${srcdir}/html.refs html.refs2
+
+$(FAQ): $(BFNNCONV_SRC) fftw-faq.bfnn html.refs2
+	@echo converting...
+	perl -I${srcdir} ${srcdir}/bfnnconv.pl ${srcdir}/fftw-faq.bfnn
+	@echo converting again...
+	perl -I${srcdir} ${srcdir}/bfnnconv.pl ${srcdir}/fftw-faq.bfnn
+
+faq: $(FAQ)
+
+clean-local:
+	rm -f *~ core a.out *.lout *.ps *.info *.ascii *.xrefdb *.post
+	rm -rf *.html html.refs2
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/bfnnconv.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/bfnnconv.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,298 @@
+#!/usr/bin/perl --
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+@outputs=('ascii','info','html');
+
+while ($ARGV[0] =~ m/^\-/) {
+    $_= shift(@ARGV);
+    if (m/^-only/) {
+        @outputs= (shift(@ARGV));
+    } else {
+        warn "unknown option `$_' ignored";
+    }
+}
+
+$prefix= $ARGV[0];
+$prefix= 'stdin' unless length($prefix);
+$prefix =~ s/\.bfnn$//;
+
+if (open(O,"$prefix.xrefdb")) {
+    @xrefdb= <O>;
+    close(O);
+} else {
+    warn "no $prefix.xrefdb ($!)";
+}
+
+$section= -1;
+for $thisxr (@xrefdb) {
+    $_= $thisxr;
+    chop;
+    if (m/^Q (\w+) ((\d+)\.(\d+)) (.*)$/) {
+        $qrefn{$1}= $2;
+        $qreft{$1}= $5;
+        $qn2ref{$3,$4}= $1;
+        $maxsection= $3;
+        $maxquestion[$3]= $4;
+    } elsif (m/^S (\d+) /) {
+        $maxsection= $1;
+        $sn2title{$1}=$';
+    }
+}
+
+open(U,">$prefix.xrefdb-new");
+
+for $x (@outputs) { require("m-$x.pl"); }
+
+&call('init');
+
+while (<>) {
+    chop;
+    next if m/^\\comment\b/;
+    if (!m/\S/) {
+        &call('endpara');
+        next;
+    }
+    if (s/^\\section +//) {
+        $line= $_;
+        $section++; $question=0;
+        print U "S $section $line\n";
+        $|=1; print "S$section",' 'x10,"\r"; $|=0;
+        &call('endpara');
+        &call('startmajorheading',"$section",
+              "Section $section",
+              $section<$maxsection ? "Section ".($section+1) : '',
+              $section>1 ? 'Section '.($section-1) : 'Top');
+        &text($line);
+        &call('endmajorheading');
+        if ($section) {
+            &call('endpara');
+            &call('startindex');
+            for $thisxr (@xrefdb) {
+                $_= $thisxr;
+                chop;
+                if (m/^Q (\w+) (\d+)\.(\d+) (.*)$/) {
+                    $ref= $1; $num1= $2; $num2= $3; $text= $4;
+                    next unless $num1 == $section;
+                    &call('startindexitem',$ref,"Q$num1.$num2","Question $num1.$num2");
+                    &text($text);
+                    &call('endindexitem');
+                }
+            }
+            &call('endindex');
+        }
+    } elsif (s/^\\question \d{2}[a-z]{3}((:\w+)?) +//) {
+        $line= $_;
+        $question++;
+        $qrefstring= $1;
+        $qrefstring= "q_${section}_$question" unless $qrefstring =~ s/^://;
+        print U "Q $qrefstring $section.$question $line\n";
+        $|=1; print "Q$section.$question",' 'x10,"\r"; $|=0;
+        &call('endpara');
+        &call('startminorheading',$qrefstring,
+              "Question $section.$question",
+              $question < $maxquestion[$section] ? "Question $section.".($question+1) :
+              $section < $maxsection ? "Question ".($section+1).".1" : '',
+              $question > 1 ? "Question $section.".($question-1) :
+              $section > 1 ? "Question ".($section-1).'.'.($maxquestion[$section-1]) :
+              'Top',
+              "Section $section");
+        &text("Question $section.$question.  $line");
+        &call('endminorheading');
+    } elsif (s/^\\only +//) {
+        @saveoutputs= @outputs;
+        @outputs=();
+        for $x (split(/\s+/,$_)) {
+            push(@outputs,$x) if grep($x eq $_, @saveoutputs);
+        }
+    } elsif (s/^\\endonly$//) {
+        @outputs= @saveoutputs;
+    } elsif (s/^\\copyto +//) {
+        $fh= $';
+        while(<>) {
+            last if m/^\\endcopy$/;
+            while (s/^([^\`]*)\`//) {
+                print $fh $1;
+                m/([^\\])\`/ || warn "`$_'";
+                $_= $';
+                $cmd= $`.$1;
+                $it= `$cmd`; chop $it;
+                print $fh $it;
+            }
+            print $fh $_;
+        }
+    } elsif (m/\\index$/) {
+        &call('startindex');
+        for $thisxr (@xrefdb) {
+            $_= $thisxr;
+            chop;
+            if (m/^Q (\w+) (\d+\.\d+) (.*)$/) {
+                $ref= $1; $num= $2; $text= $3;
+                &call('startindexitem',$ref,"Q$num","Question $num");
+                &text($text);
+                &call('endindexitem');
+            } elsif (m/^S (\d+) (.*)$/) {
+                $num= $1; $text= $2;
+                next unless $num;
+                &call('startindexmainitem',"s_$num",
+                      "Section $num.","Section $num");
+                &text($text);
+                &call('endindexitem');
+            } else {
+                warn $_;
+            }
+        }
+        &call('endindex');
+    } elsif (m/^\\call-(\w+) +(\w+)\s*(.*)$/) {
+        $fn= $1.'_'.$2;
+        eval { &$fn($3); };
+        warn $@ if length($@);
+    } elsif (m/^\\call +(\w+)\s*(.*)$/) {
+        eval { &call($1,$2); };
+        warn $@ if length($@);
+    } elsif (s/^\\set +(\w+)\s*//) {
+        $svalue= $'; $svari= $1;
+        eval("\$user_$svari=\$svalue"); $@ && warn "setting $svalue failed: $@\n";
+    } elsif (m/^\\verbatim$/) {
+        &call('startverbatim');
+        while (<>) {
+            chop;
+            last if m/^\\endverbatim$/;
+            &call('verbatim',$_);
+        }
+        &call('endverbatim');
+    } else {
+        s/\.$/\. /;
+        &text($_." ");
+    }
+}
+
+print ' 'x25,"\r";
+&call('finish');
+rename("$prefix.xrefdb-new","$prefix.xrefdb") || warn "rename xrefdb: $!";
+exit 0;
+
+
+sub text {
+    local($in,$rhs,$word,$refn,$reft,$fn,$style);
+    $in= "$holdover$_[0]";
+    $holdover= '';
+    while ($in =~ m/\\/) {
+#print STDERR ">$`##$'\n";
+        $rhs=$';
+        &call('text',$`);
+        $_= $rhs;
+        if (m/^\w+ $/) {
+            $holdover= "\\$&";
+            $in= '';
+        } elsif (s/^fn\s+([^\s\\]*\w)//) {
+            $in= $_;
+            $word= $1;
+            &call('courier');
+            &call('text',$word);
+            &call('endcourier');
+        } elsif (s/^tab\s+(\d+)\s+//) {
+            $in= $_; &call('tab',$1);
+        } elsif (s/^nl\s+//) {
+            $in= $_; &call('newline');
+        } elsif (s/^qref\s+(\w+)//) {
+            $refn= $qrefn{$1};
+            $reft= $qreft{$1};
+            if (!length($refn)) {
+                warn "unknown question `$1'";
+            }
+            $in= "$`\\pageref:$1:$refn:$reft\\endpageref.$_";
+        } elsif (s/^pageref:(\w+):([^:\n]+)://) {
+            $in= $_;
+            &call('pageref',$1,$2);
+        } elsif (s/^endpageref\.//) {
+            $in= $_; &call('endpageref');
+        } elsif (s/^(\w+)\{//) {
+            $in= $_; $fn= $1;
+            eval { &call("$fn"); };
+            if (length($@)) { warn $@; $fn= 'x'; }
+            push(@styles,$fn);
+        } elsif (s/^\}//) {
+            $in= $_;
+            $fn= pop(@styles);
+            if ($fn ne 'x') { &call("end$fn"); }
+        } elsif (s/^\\//) {
+            $in= $_;
+            &call('text',"\\");
+        } elsif (s,^(\w+)\s+([-A-Za-z0-9.\@:/]*\w),,) {
+#print STDERR "**$&**$_\n";
+            $in= $_;
+            $style=$1; $word= $2;
+            &call($style);
+            &call('text',$word);
+            &call("end$style");
+        } else {
+            warn "unknown control `\\$_'";
+            $in= $_;
+        }
+    }
+    &call('text',$in);
+}
+
+
+sub call {
+    local ($fnbase, @callargs) = @_;
+    local ($coutput);
+    for $coutput (@outputs) {
+        if ($fnbase eq 'text' && eval("\@${coutput}_cmds")) {
+#print STDERR "special handling text (@callargs) for $coutput\n";
+            $evstrg= "\$${coutput}_args[\$#${coutput}_args].=\"\@callargs\"";
+            eval($evstrg);
+            length($@) && warn "call adding for $coutput (($evstrg)): $@";
+        } else {
+            $fntc= $coutput.'_'.$fnbase; 
+            &$fntc(@callargs);
+        }
+    }
+}
+
+
+sub recurse {
+    local (@outputs) = $coutput;
+    local ($holdover);
+    &text($_[0]);
+}
+
+
+sub arg {
+#print STDERR "arg($_[0]) from $coutput\n";
+    $cmd= $_[0];
+    eval("push(\@${coutput}_cmds,\$cmd); push(\@${coutput}_args,'')");
+    length($@) && warn "arg setting up for $coutput: $@";
+}
+
+sub endarg {
+#print STDERR "endarg($_[0]) from $coutput\n";
+    $evstrg= "\$${coutput}_cmd= \$cmd= pop(\@${coutput}_cmds); ".
+             "\$${coutput}_arg= \$arg= pop(\@${coutput}_args); ";
+    eval($evstrg);
+    length($@) && warn "endarg extracting for $coutput (($evstrg)): $@";
+#print STDERR ">call $coutput $cmd $arg< (($evstrg))\n";
+    $evstrg= "&${coutput}_do_${cmd}(\$arg)";
+    eval($evstrg);
+    length($@) && warn "endarg running ${coutput}_do_${cmd} (($evstrg)): $@";
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/fftw-faq.bfnn
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/fftw-faq.bfnn	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,707 @@
+\comment This is the source for the FFTW FAQ list, in
+\comment the Bizarre Format With No Name.  It is turned into Lout
+\comment input, HTML, plain ASCII and an Info document by a Perl script.
+\comment
+\comment The format and scripts come from the Linux FAQ, by
+\comment Ian Jackson.
+\set brieftitle FFTW FAQ
+\set author     <A href="http://www.fftw.org">Matteo Frigo and Steven G. Johnson</A> / <A href="mailto:fftw@fftw.org">fftw@fftw.org</A>
+\set authormail fftw@fftw.org
+\set title      FFTW Frequently Asked Questions with Answers
+\set copyholder Matteo Frigo and Massachusetts Institute of Technology
+\call-html startup html.refs2
+\copyto ASCII
+            FFTW FREQUENTLY ASKED QUESTIONS WITH ANSWERS
+                            `date '+%d %h %Y'`
+			     Matteo Frigo
+			   Steven G. Johnson
+ 			    <fftw@fftw.org>
+
+\endcopy
+\copyto INFO
+START-INFO-DIR-ENTRY
+* FFTW FAQ: (fftw-faq). FFTW Frequently Asked Questions with Answers.
+END-INFO-DIR-ENTRY
+
+
+File: $prefix.info, Node: Top, Next: Question 1.1, Up: (dir)
+
+            FFTW FREQUENTLY ASKED QUESTIONS WITH ANSWERS
+                            `date '+%d %h %Y'`
+			     Matteo Frigo
+			   Steven G. Johnson
+			    <fftw@fftw.org>
+
+\endcopy
+
+This is the list of Frequently Asked Questions about FFTW, a
+collection of fast C routines for computing the Discrete Fourier
+Transform in one or more dimensions.
+
+\section  Index
+
+\index
+
+\comment ######################################################################
+
+\section  Introduction and General Information
+
+\question 26aug:whatisfftw  What is FFTW?
+
+FFTW is a free collection of fast C routines for computing the
+Discrete Fourier Transform in one or more dimensions.  It includes
+complex, real, symmetric, and parallel transforms, and can handle
+arbitrary array sizes efficiently.  FFTW is typically faster than
+other publically-available FFT implementations, and is even
+competitive with vendor-tuned libraries.  (See our web page for
+extensive benchmarks.)  To achieve this performance, FFTW uses novel
+code-generation and runtime self-optimization techniques (along with
+many other tricks).
+
+\question 26aug:whereisfftw  How do I obtain FFTW?
+
+FFTW can be found at \docref{the FFTW web page\}.  You can also
+retrieve it from \ftpon ftp.fftw.org in \ftpin /pub/fftw.
+
+\question 26aug:isfftwfree  Is FFTW free software?
+
+Starting with version 1.3, FFTW is Free Software in the technical
+sense defined by the Free Software Foundation (see \docref{Categories
+of Free and Non-Free Software\}), and is distributed under the terms
+of the GNU General Public License.  Previous versions of FFTW were
+distributed without fee for noncommercial use, but were not
+technically ``free.''
+
+Non-free licenses for FFTW are also available that permit different
+terms of use than the GPL.
+
+\question 10apr:nonfree  What is this about non-free licenses?
+
+The non-free licenses are for companies that wish to use FFTW in their
+products but are unwilling to release their software under the GPL
+(which would require them to release source code and allow free
+redistribution).  Such users can purchase an unlimited-use license
+from MIT.  Contact us for more details.
+
+We could instead have released FFTW under the LGPL, or even disallowed
+non-Free usage.  Suffice it to say, however, that MIT owns the
+copyright to FFTW and they only let us GPL it because we convinced
+them that it would neither affect their licensing revenue nor irritate
+existing licensees.
+
+\question 24oct:west In the West? I thought MIT was in the East?
+
+Not to an Italian.  You could say that we're a Spaghetti Western
+(with apologies to Sergio Leone).
+
+\comment ######################################################################
+
+\section  Installing FFTW
+
+\question 26aug:systems  Which systems does FFTW run on?
+
+FFTW is written in ANSI C, and should work on any system with a decent
+C compiler.  (See also \qref runOnWindows, \qref compilerCrashes.)
+FFTW can also take advantage of certain hardware-specific features,
+such as cycle counters and SIMD instructions, but this is optional.
+
+\question 26aug:runOnWindows  Does FFTW run on Windows?
+
+Yes, many people have reported successfully using FFTW on Windows with
+various compilers.  FFTW was not developed on Windows, but the source
+code is essentially straight ANSI C.  See also the \docref{FFTW
+Windows installation notes\}, \qref compilerCrashes, and \qref
+vbetalia.
+
+\question 26aug:compilerCrashes  My compiler has trouble with FFTW.
+
+Complain fiercely to the vendor of the compiler. 
+
+We have successfully used \courier{gcc\} 3.2.x on x86 and PPC, a
+recent Compaq C compiler for Alpha, version 6 of IBM's \courier{xlc\}
+compiler for AIX, Intel's \courier{icc\} versions 5-7, and Sun
+WorkShop \courier{cc\} version 6.  
+
+FFTW is likely to push compilers to their limits, however, and several
+compiler bugs have been exposed by FFTW.  A partial list follows.
+
+\courier{gcc\} 2.95.x for Solaris/SPARC produces incorrect code for
+the test program (workaround: recompile the \courier{libbench2\}
+directory with \courier{-O2\}).
+
+NetBSD/macppc 1.6 comes with a \courier{gcc\} version that also
+miscompiles the test program. (Please report a workaround if you know
+one.)
+
+\courier{gcc\} 3.2.3 for ARM reportedly crashes during compilation.
+This bug is reportedly fixed in later versions of \courier{gcc\}.
+
+Versions 8.0 and 8.1 of Intel's \courier{icc\} falsely claim to be
+\courier{gcc\}, so you should specify \courier{CC="icc -no-gcc"\};
+this is automatic in FFTW 3.1.  \courier{icc-8.0.066\} reportely
+produces incorrect code for FFTW 2.1.5, but is fixed in version 8.1.
+\courier{icc-7.1\} compiler build 20030402Z appears to produce
+incorrect dependencies, causing the compilation to fail.
+\courier{icc-7.1\} build 20030307Z appears to work fine.  (Use
+\courier{icc -V\} to check which build you have.)  As of 2003/04/18,
+build 20030402Z appears not to be available any longer on Intel's
+website, whereas the older build 20030307Z is available.
+
+\courier{ranlib\} of GNU \courier{binutils\} 2.9.1 on Irix has been
+observed to corrupt the FFTW libraries, causing a link failure when
+FFTW is compiled.  Since \courier{ranlib\} is completely superfluous
+on Irix, we suggest deleting it from your system and replacing it with
+a symbolic link to \courier{/bin/echo\}.
+
+If support for SIMD instructions is enabled in FFTW, further compiler
+problems may appear:
+
+\courier{gcc\} 3.4.[0123] for x86 produces incorrect SSE2 code for
+FFTW when \courier{-O2\} (the best choice for FFTW) is used, causing
+FFTW to crash (\courier{make check\} crashes).  This bug is fixed in
+\courier{gcc\} 3.4.4.  On x86_64 (amd64/em64t), \courier{gcc\} 3.4.4
+reportedly still has a similar problem, but this is fixed as of
+\courier{gcc\} 3.4.6.
+
+\courier{gcc-3.2\} for x86 produces incorrect SIMD code if
+\courier{-O3\} is used.  The same compiler produces incorrect SIMD
+code if no optimization is used, too.  When using \courier{gcc-3.2\},
+it is a good idea not to change the default \courier{CFLAGS\} selected
+by the \courier{configure\} script.
+
+Some 3.0.x and 3.1.x versions of \courier{gcc\} on \courier{x86\} may
+crash.  \courier{gcc\} so-called 2.96 shipping with RedHat 7.3 crashes
+when compiling SIMD code.  In both cases, please upgrade to
+\courier{gcc-3.2\} or later.
+
+Intel's \courier{icc\} 6.0 misaligns SSE constants, but FFTW has a
+workaround. \courier{icc\} 8.x fails to compile FFTW 3.0.x because it
+falsely claims to be \courier{gcc\}; we believe this to be a bug in
+\courier{icc\}, but FFTW 3.1 has a workaround.
+
+Visual C++ 2003 reportedly produces incorrect code for SSE/SSE2 when
+compiling FFTW.  This bug was reportedly fixed in VC++ 2005;
+alternatively, you could switch to the Intel compiler. VC++ 6.0 also
+reportedly produces incorrect code for the file
+\courier{reodft11e-r2hc-odd.c\} unless optimizations are disabled for
+that file.
+
+\courier{gcc\} 2.95 on MacOS X miscompiles AltiVec code (fixed in
+later versions).  \courier{gcc\} 3.2.x miscompiles AltiVec
+permutations, but FFTW has a workaround.  \courier{gcc\} 4.0.1 on
+MacOS for Intel crashes when compiling FFTW; a workaround is to
+compile one file without optimization: \courier{cd kernel; make
+CFLAGS=" " trig.lo\}.
+
+\courier{gcc\} 4.1.1 reportedly crashes when compiling FFTW for MIPS;
+the workaround is to compile the file it crashes on
+(\courier{t2_64.c\}) with a lower optimization level.
+
+\courier{gcc\} versions 4.1.2 to 4.2.0 for x86 reportedly miscompile
+FFTW 3.1's test program, causing \courier{make check\} to crash
+(\courier{gcc\} bug #26528).  The bug was reportedly fixed in
+\courier{gcc\} version 4.2.1 and later.  A workaround is to compile
+\courier{libbench2/verify-lib.c\} without optimization.
+
+\question 26aug:solarisSucks FFTW does not compile on Solaris, complaining about \courier{const\}.
+
+We know that at least on Solaris 2.5.x with Sun's compilers 4.2 you
+might get error messages from \courier{make\} such as
+
+\courier{"./fftw.h", line 88: warning: const is a keyword in ANSI C\}
+
+This is the case when the \courier{configure\} script reports that
+\courier{const\} does not work:
+
+\courier{checking for working const... (cached) no\}
+
+You should be aware that Solaris comes with two compilers, namely,
+\courier{/opt/SUNWspro/SC4.2/bin/cc\} and \courier{/usr/ucb/cc\}.  The
+latter compiler is non-ANSI.  Indeed, it is a perverse shell script
+that calls the real compiler in non-ANSI mode.  In order
+to compile FFTW, change your path so that the right \courier{cc\}
+is used.
+
+To know whether your compiler is the right one,  type
+\courier{cc -V\}.  If the compiler prints ``\courier{ucbcc\}'',
+as in 
+
+\courier{ucbcc: WorkShop Compilers 4.2 30 Oct 1996 C 4.2\}
+
+then the compiler is wrong.  The right message is something like
+
+\courier{cc: WorkShop Compilers 4.2 30 Oct 1996 C 4.2\}
+
+\question 19mar:3dnow  What's the difference between \courier{--enable-3dnow\} and \courier{--enable-k7\}?
+
+\courier{--enable-k7\} enables 3DNow! instructions on K7 processors
+(AMD Athlon and its variants).  K7 support is provided by assembly
+routines generated by a special purpose compiler.
+As of fftw-3.2, --enable-k7 is no longer supported.
+
+\courier{--enable-3dnow\} enables generic 3DNow! support using
+\courier{gcc\} builtin functions.  This works on earlier AMD
+processors, but it is not as fast as our special assembly routines.
+As of fftw-3.1, --enable-3dnow is no longer supported.
+
+\question 18apr:fma What's the difference between the fma and the non-fma versions?
+
+The fma version tries to exploit the fused multiply-add instructions
+implemented in many processors such as PowerPC, ia-64, and MIPS.  The
+two FFTW packages are otherwise identical.  In FFTW 3.1, the fma and
+non-fma versions were merged together into a single package, and the
+\courier{configure\} script attempts to automatically guess which
+version to use.  
+
+The FFTW 3.1 \courier{configure\} script enables fma by default on
+PowerPC, Itanium, and PA-RISC, and disables it otherwise.  You can
+force one or the other by using the \courier{--enable-fma\} or
+\courier{--disable-fma\} flag for \courier{configure\}.
+
+Definitely use fma if you have a PowerPC-based system with
+\courier{gcc\} (or IBM \courier{xlc\}).  This includes all GNU/Linux
+systems for PowerPC and the older PowerPC-based MacOS systems.  Also
+use it on PA-RISC and Itanium with the HP/UX compiler.
+
+Definitely do not use the fma version if you have an ia-32 processor
+(Intel, AMD, MacOS on Intel, etcetera).
+
+For other architectures/compilers, the situation is not so clear.  For
+example, ia-64 has the fma instruction, but \courier{gcc-3.2\} appears
+not to exploit it correctly.  Other compilers may do the right thing,
+but we have not tried them.  Please send us your feedback so that we
+can update this FAQ entry.
+
+\question 26aug:languages  Which language is FFTW written in?
+
+FFTW is written in ANSI C.  Most of the code, however, was
+automatically generated by a program called \courier{genfft\}, written
+in the Objective Caml dialect of ML.  You do not need to know ML or to
+have an Objective Caml compiler in order to use FFTW.
+
+\courier{genfft\} is provided with the FFTW sources, which means that
+you can play with the code generator if you want.  In this case, you
+need a working Objective Caml system.  Objective Caml is available
+from \docref{the Caml web page\}.
+
+\question 26aug:fortran  Can I call FFTW from Fortran?
+
+Yes, FFTW (versions 1.3 and higher) contains a Fortran-callable
+interface, documented in the FFTW manual.
+
+By default, FFTW configures its Fortran interface to work with the
+first compiler it finds, e.g. \courier{g77\}.  To configure for a
+different, incompatible Fortran compiler \courier{foobar\}, use
+\courier{./configure F77=foobar\} when installing FFTW.  (In the case
+of \courier{g77\}, however, FFTW 3.x also includes an extra set of
+Fortran-callable routines with one less underscore at the end of
+identifiers, which should cover most other Fortran compilers on Linux
+at least.)
+
+\question 26aug:cplusplus  Can I call FFTW from C++?
+
+Most definitely.  FFTW should compile and/or link under any C++
+compiler.  Moreover, it is likely that the C++ \courier{<complex>\}
+template class is bit-compatible with FFTW's complex-number format
+(see the FFTW manual for more details).
+
+\question 26aug:whynotfortran  Why isn't FFTW written in Fortran/C++?
+
+Because we don't like those languages, and neither approaches the
+portability of C.
+
+\question 29mar:singleprec How do I compile FFTW to run in single precision?
+
+On a Unix system: \courier{configure --enable-float\}.  On a non-Unix
+system: edit \courier{config.h\} to \courier{#define\} the symbol
+\courier{FFTW_SINGLE\} (for FFTW 3.x).  In both cases, you must then
+recompile FFTW.  In FFTW 3, all FFTW identifiers will then begin with
+\courier{fftwf_\} instead of \courier{fftw_\}.
+
+\question 28mar:64bitk7 --enable-k7 does not work on x86-64
+
+Support for --enable-k7 was discontinued in fftw-3.2.
+
+The fftw-3.1 release supports --enable-k7.  This option only works on
+32-bit x86 machines that implement 3DNow!, including the AMD Athlon
+and the AMD Opteron in 32-bit mode.  --enable-k7 does not work on AMD
+Opteron in 64-bit mode.  Use --enable-sse for x86-64 machines.
+
+FFTW supports 3DNow! by means of assembly code generated by a
+special-purpose compiler.  It is hard to produce assembly code that
+works in both 32-bit and 64-bit mode.
+
+\comment ######################################################################
+
+\section  Using FFTW
+
+\question 15mar:fftw2to3 Why not support the FFTW 2 interface in FFTW 3?
+
+FFTW 3 has semantics incompatible with earlier versions: its plans can
+only be used for a given stride, multiplicity, and other
+characteristics of the input and output arrays; these stronger
+semantics are necessary for performance reasons.  Thus, it is
+impossible to efficiently emulate the older interface (whose plans can
+be used for any transform of the same size).  We believe that it
+should be possible to upgrade most programs without any difficulty,
+however.
+
+\question 20mar:planperarray Why do FFTW 3 plans encapsulate the input/output arrays and not just the algorithm?
+
+There are several reasons:
+
+\call startlist
+\call item
+It was important for performance reasons that the plan be specific to
+array characteristics like the stride (and alignment, for SIMD), and
+requiring that the user maintain these invariants is error prone.
+\call item
+In most high-performance applications, as far as we can tell, you are
+usually transforming the same array over and over, so FFTW's semantics
+should not be a burden.
+\call item
+If you need to transform another array of the same size, creating a
+new plan once the first exists is a cheap operation.
+\call item
+If you need to transform many arrays of the same size at once, you
+should really use the \courier{plan_many\} routines in FFTW's "advanced"
+interface.
+\call item
+If the abovementioned array characteristics are the same, you are
+willing to pay close attention to the documentation, and you really
+need to, we provide a "new-array execution" interface to apply a plan
+to a new array.
+\call endlist
+
+\question 25may:slow FFTW seems really slow.
+
+You are probably recreating the plan before every transform, rather
+than creating it once and reusing it for all transforms of the same
+size.  FFTW is designed to be used in the following way:
+
+\call startlist
+\call item
+First, you create a plan.  This will take several seconds.
+\call item
+Then, you reuse the plan many times to perform FFTs.  These are fast.
+\call endlist
+
+If you don't need to compute many transforms and the time for the
+planner is significant, you have two options.  First, you can use the
+\courier{FFTW_ESTIMATE\} option in the planner, which uses heuristics
+instead of runtime measurements and produces a good plan in a short
+time.  Second, you can use the wisdom feature to precompute the plan;
+see \qref savePlans
+
+\question 22oct:slows FFTW slows down after repeated calls.
+
+Probably, NaNs or similar are creeping into your data, and the
+slowdown is due to the resulting floating-point exceptions.  For
+example, be aware that repeatedly FFTing the same array is a diverging
+process (because FFTW computes the unnormalized transform).
+
+\question 22oct:segfault An FFTW routine is crashing when I call it.
+
+Did the FFTW test programs pass (\courier{make check\}, or \courier{cd
+tests; make bigcheck\} if you want to be paranoid)?  If so, you almost
+certainly have a bug in your own code.  For example, you could be
+passing invalid arguments (such as wrongly-sized arrays) to FFTW, or
+you could simply have memory corruption elsewhere in your program that
+causes random crashes later on.  Please don't complain to us unless
+you can come up with a minimal self-contained program (preferably
+under 30 lines) that illustrates the problem.
+
+\question 22oct:fortran64 My Fortran program crashes when calling FFTW.
+
+As described in the manual, on 64-bit machines you must store the
+plans in variables large enough to hold a pointer, for example
+\courier{integer*8\}.  We recommend using \courier{integer*8\} on
+32-bit machines as well, to simplify porting.
+
+\question 24mar:conventions FFTW gives results different from my old FFT.
+
+People follow many different conventions for the DFT, and you should
+be sure to know the ones that we use (described in the FFTW manual).
+In particular, you should be aware that the
+\courier{FFTW_FORWARD\}/\courier{FFTW_BACKWARD\} directions correspond
+to signs of -1/+1 in the exponent of the DFT definition.
+(\italic{Numerical Recipes\} uses the opposite convention.)  
+
+You should also know that we compute an unnormalized transform.  In
+contrast, Matlab is an example of program that computes a normalized
+transform.  See \qref whyscaled.
+
+Finally, note that floating-point arithmetic is not exact, so
+different FFT algorithms will give slightly different results (on the
+order of the numerical accuracy; typically a fractional difference of
+1e-15 or so in double precision).
+
+\question 31aug:nondeterministic FFTW gives different results between runs
+
+If you use \courier{FFTW_MEASURE\} or \courier{FFTW_PATIENT\} mode,
+then the algorithm FFTW employs is not deterministic: it depends on
+runtime performance measurements.  This will cause the results to vary
+slightly from run to run.  However, the differences should be slight,
+on the order of the floating-point precision, and therefore should
+have no practical impact on most applications.
+
+If you use saved plans (wisdom) or \courier{FFTW_ESTIMATE\} mode,
+however, then the algorithm is deterministic and the results should be
+identical between runs.
+
+\question 26aug:savePlans Can I save FFTW's plans?
+
+Yes. Starting with version 1.2, FFTW provides the \courier{wisdom\}
+mechanism for saving plans; see the FFTW manual.
+
+\question 14sep:whyscaled Why does your inverse transform return a scaled result?
+
+Computing the forward transform followed by the backward transform (or
+vice versa) yields the original array scaled by the size of the array.
+(For multi-dimensional transforms, the size of the array is the
+product of the dimensions.)  We could, instead, have chosen a
+normalization that would have returned the unscaled array. Or, to
+accomodate the many conventions in this matter, the transform routines
+could have accepted a "scale factor" parameter. We did not do this,
+however, for two reasons. First, we didn't want to sacrifice
+performance in the common case where the scale factor is 1. Second, in
+real applications the FFT is followed or preceded by some computation
+on the data, into which the scale factor can typically be absorbed at
+little or no cost.
+
+\question 02dec:centerorigin How can I make FFTW put the origin (zero frequency) at the center of its output?
+
+For human viewing of a spectrum, it is often convenient to put the
+origin in frequency space at the center of the output array, rather
+than in the zero-th element (the default in FFTW).  If all of the
+dimensions of your array are even, you can accomplish this by simply
+multiplying each element of the input array by (-1)^(i + j + ...),
+where i, j, etcetera are the indices of the element.  (This trick is a
+general property of the DFT, and is not specific to FFTW.)
+
+\question 08may:imageaudio How do I FFT an image/audio file in \italic{foobar\} format?
+
+FFTW performs an FFT on an array of floating-point values.  You can
+certainly use it to compute the transform of an image or audio stream,
+but you are responsible for figuring out your data format and
+converting it to the form FFTW requires.
+
+\question 09apr:linkfails My program does not link (on Unix).
+
+The libraries must be listed in the correct order (\courier{-lfftw3
+-lm\} for FFTW 3.x) and \italic{after\} your program sources/objects.
+(The general rule is that if \italic{A\} uses \italic{B\}, then
+\italic{A\} must be listed before \italic{B\} in the link command.).
+
+\question 15mar:linkheader I included your header, but linking still fails.
+
+You're a C++ programmer, aren't you?  You have to compile the FFTW
+library and link it into your program, not just \courier{#include
+<fftw3.h>\}.  (Yes, this is really a FAQ.)
+
+\question 22oct:nostack My program crashes, complaining about stack space.
+
+You cannot declare large arrays with automatic storage (e.g. via
+\courier{fftw_complex array[N]\}); you should use
+\courier{fftw_malloc\} (or equivalent) to allocate the arrays you want
+to transform if they are larger than a few hundred elements.
+
+\question 13may:leaks FFTW seems to have a memory leak.
+
+After you create a plan, FFTW caches the information required to
+quickly recreate the plan.  (See \qref savePlans) It also maintains a
+small amount of other persistent memory.  You can deallocate all of
+FFTW's internally allocated memory, if you wish, by calling
+\courier{fftw_cleanup()\}, as documented in the manual.
+
+\question 16may:allzero The output of FFTW's transform is all zeros.
+
+You should initialize your input array \italic{after\} creating the
+plan, unless you use \courier{FFTW_ESTIMATE\}: planning with
+\courier{FFTW_MEASURE\} or \courier{FFTW_PATIENT\} overwrites the
+input/output arrays, as described in the manual.
+
+\question 05sep:vbetalia How do I call FFTW from the Microsoft language du jour?
+
+Please \italic{do not\} ask us Windows-specific questions.  We do not
+use Windows.  We know nothing about Visual Basic, Visual C++, or .NET.
+Please find the appropriate Usenet discussion group and ask your
+question there.  See also \qref runOnWindows.
+
+\question 15oct:pruned Can I compute only a subset of the DFT outputs?
+
+In general, no, an FFT intrinsically computes all outputs from all
+inputs.  In principle, there is something called a \italic{pruned
+FFT\} that can do what you want, but to compute K outputs out of N the
+complexity is in general O(N log K) instead of O(N log N), thus saving
+only a small additive factor in the log.  (The same argument holds if
+you instead have only K nonzero inputs.)
+
+There are some specific cases in which you can get the O(N log K)
+performance benefits easily, however, by combining a few ordinary
+FFTs.  In particular, the case where you want the first K outputs,
+where K divides N, can be handled by performing N/K transforms of size
+K and then summing the outputs multiplied by appropriate phase
+factors.  For more details, see \docref{pruned FFTs with FFTW\}.
+
+There are also some algorithms that compute pruned transforms
+\italic{approximately\}, but they are beyond the scope of this FAQ.
+
+\question 21jan:transpose  Can I use FFTW's routines for in-place and out-of-place matrix transposition?
+
+You can use the FFTW guru interface to create a rank-0 transform of
+vector rank 2 where the vector strides are transposed.  (A rank-0
+transform is equivalent to a 1D transform of size 1, which.  just
+copies the input into the output.)  Specifying the same location for
+the input and output makes the transpose in-place.
+
+For double-valued data stored in row-major format, plan creation looks like
+this:
+
+\verbatim
+fftw_plan plan_transpose(int rows, int cols, double *in, double *out)
+{
+    const unsigned flags = FFTW_ESTIMATE; /* other flags are possible */
+    fftw_iodim howmany_dims[2];
+
+    howmany_dims[0].n  = rows;
+    howmany_dims[0].is = cols;
+    howmany_dims[0].os = 1;
+
+    howmany_dims[1].n  = cols;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = rows;
+
+    return fftw_plan_guru_r2r(/*rank=*/ 0, /*dims=*/ NULL,
+                              /*howmany_rank=*/ 2, howmany_dims,
+                              in, out, /*kind=*/ NULL, flags);
+}
+\endverbatim
+
+(This entry was written by Rhys Ulerich.)
+
+\comment ######################################################################
+
+\section  Internals of FFTW
+
+\question 26aug:howworks  How does FFTW work?
+
+The innovation (if it can be so called) in FFTW consists in having a
+variety of composable \italic{solvers\}, representing different FFT
+algorithms and implementation strategies, whose combination into a
+particular \italic{plan\} for a given size can be determined at
+runtime according to the characteristics of your machine/compiler.
+This peculiar software architecture allows FFTW to adapt itself to
+almost any machine.
+
+For more details (albeit somewhat outdated), see the paper "FFTW: An
+Adaptive Software Architecture for the FFT", by M. Frigo and
+S. G. Johnson, \italic{Proc. ICASSP\} 3, 1381 (1998), also
+available at \docref{the FFTW web page\}.
+
+\question 26aug:whyfast Why is FFTW so fast?
+
+This is a complex question, and there is no simple answer.  In fact,
+the authors do not fully know the answer, either.  In addition to many
+small performance hacks throughout FFTW, there are three general
+reasons for FFTW's speed.
+
+\call startlist
+\call item
+	FFTW uses a variety of FFT algorithms and implementation styles
+that can be arbitrarily composed to adapt itself to
+a machine.  See \qref howworks.
+\call item
+	FFTW uses a code generator to produce highly-optimized
+routines for computing small transforms.
+\call item
+	FFTW uses explicit divide-and-conquer to take advantage
+of the memory hierarchy.
+\call endlist
+
+For more details (albeit somewhat outdated), see the paper "FFTW: An
+Adaptive Software Architecture for the FFT", by M. Frigo and
+S. G. Johnson, \italic{Proc. ICASSP\} 3, 1381 (1998),
+available along with other references at \docref{the FFTW web page\}.
+
+\comment ######################################################################
+
+\section  Known bugs
+
+\question 27aug:rfftwndbug  FFTW 1.1 crashes in rfftwnd on Linux.
+
+This bug was fixed in FFTW 1.2.  There was a bug in \courier{rfftwnd\}
+causing an incorrect amount of memory to be allocated.  The bug showed
+up in Linux with libc-5.3.12 (and nowhere else that we know of).
+
+\question 15oct:fftwmpibug The MPI transforms in FFTW 1.2 give incorrect results/leak memory.
+
+These bugs were corrected in FFTW 1.2.1.  The MPI transforms (really,
+just the transpose routines) in FFTW 1.2 had bugs that could cause
+errors in some situations.
+
+\question 05nov:testsingbug The test programs in FFTW 1.2.1 fail when I change FFTW to use single precision.
+
+This bug was fixed in FFTW 1.3.  (Older versions of FFTW did
+work in single precision, but the test programs didn't--the error
+tolerances in the tests were set for double precision.)
+
+\question 24mar:teststoobig The test program in FFTW 1.2.1 fails for n > 46340.
+
+This bug was fixed in FFTW 1.3.  FFTW 1.2.1 produced the right answer,
+but the test program was wrong.  For large n, n*n in the naive
+transform that we used for comparison overflows 32 bit integer
+precision, breaking the test.
+
+\question 24aug:linuxthreads The threaded code fails on Linux Redhat 5.0
+
+We had problems with glibc-2.0.5.  The code should work with
+glibc-2.0.7.
+
+\question 26sep:bigrfftwnd FFTW 2.0's rfftwnd fails for rank > 1 transforms with a final dimension >= 65536.
+
+This bug was fixed in FFTW 2.0.1.  (There was a 32-bit integer overflow due
+to a poorly-parenthesized expression.)
+
+\question 26mar:primebug FFTW 2.0's complex transforms give the wrong results with prime factors 17 to 97.
+
+There was a bug in the complex transforms that could cause incorrect
+results under (hopefully rare) circumstances for lengths with
+intermediate-size prime factors (17-97).  This bug was fixed in FFTW
+2.1.1.
+
+\question 05apr:mpichbug FFTW 2.1.1's MPI test programs crash with MPICH.
+
+This bug was fixed in FFTW 2.1.2.  The 2.1/2.1.1 MPI test programs crashed
+when using the MPICH implementation of MPI with the \courier{ch_p4\}
+device (TCP/IP); the transforms themselves worked fine.
+
+\question 25may:aixthreadbug FFTW 2.1.2's multi-threaded transforms don't work on AIX.
+
+This bug was fixed in FFTW 2.1.3.  The multi-threaded transforms in
+previous versions didn't work with AIX's \courier{pthreads\}
+implementation, which idiosyncratically creates threads in detached
+(non-joinable) mode by default.
+
+\question 27sep:bigprimebug FFTW 2.1.2's complex transforms give incorrect results for large prime sizes.
+
+This bug was fixed in FFTW 2.1.3.  FFTW's complex-transform algorithm
+for prime sizes (in versions 2.0 to 2.1.2) had an integer overflow
+problem that caused incorrect results for many primes greater than
+32768 (on 32-bit machines).  (Sizes without large prime factors are
+not affected.)
+
+\question 25may:solaristhreadbug FFTW 2.1.3's multi-threaded transforms don't give any speedup on Solaris.
+
+This bug was fixed in FFTW 2.1.4.  (By default, Solaris creates
+threads that do not parallelize over multiple processors, so one has
+to request the proper behavior specifically.)
+
+\question 03may:aixflags FFTW 2.1.3 crashes on AIX.
+
+The FFTW 2.1.3 \courier{configure\} script picked incorrect compiler
+flags for the \courier{xlc\} compiler on newer IBM processors.  This
+is fixed in FFTW 2.1.4.
+
+\comment Here it ends!
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/html.refs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/html.refs	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,7 @@
+\ References for the FFTW FAQ
+\
+the FFTW web page		\ http://www.fftw.org
+FFTW Windows installation notes \ http://www.fftw.org/install/windows.html
+Categories of Free and Non-Free Software \ http://www.gnu.org/philosophy/categories.html
+the Caml web page		\ http://caml.inria.fr
+pruned FFTs with FFTW           \ http://www.fftw.org/pruned.html
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/m-ascii.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/m-ascii.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,189 @@
+## ASCII output
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+sub ascii_init {
+    open(ASCII,">$prefix.ascii");
+}
+
+sub ascii_startmajorheading {
+    print ASCII '='x79,"\n\n";
+    $ascii_status= 'h';
+    &ascii_text($_[0] ? "Section $_[0].  " : '');
+}
+
+sub ascii_startminorheading {
+    print ASCII '-'x79,"\n\n";
+    $ascii_status= 'h';
+}
+
+sub ascii_italic { &ascii_text('*'); }
+sub ascii_enditalic { $ascii_para .= '*'; }
+
+sub ascii_email { &ascii_text('<'); } sub ascii_endemail { &ascii_text('>'); }
+
+sub ascii_ftpon { } sub ascii_endftpon { }
+sub ascii_ftpin { } sub ascii_endftpin { }
+sub ascii_docref { } sub ascii_enddocref { }
+sub ascii_courier { } sub ascii_endcourier { }
+sub ascii_newsgroup { }  sub ascii_endnewsgroup { }
+sub ascii_ftpsilent { $ascii_ignore++; }
+sub ascii_endftpsilent { $ascii_ignore--; }
+
+sub ascii_text {
+    return if $ascii_ignore;
+    if ($ascii_status eq '') {
+        $ascii_status= 'p';
+    }
+    $ascii_para .= $_[0];
+}
+
+sub ascii_tab {
+    local ($n) = $_[0]-length($ascii_para);
+    $ascii_para .= ' 'x$n if $n>0;
+}
+
+sub ascii_newline {
+    return unless $ascii_status eq 'p';
+    &ascii_writepara;
+}
+
+sub ascii_writepara {
+    local ($thisline, $thisword, $rest);
+    for (;;) {
+        last unless $ascii_para =~ m/\S/;
+        $thisline= $ascii_indentstring;
+        for (;;) {
+            last unless $ascii_para =~ m/^(\s*\S+)/;
+            unless (length($1) + length($thisline) < 75 ||
+                    length($thisline) == length($ascii_indentstring)) {
+                last;
+            }
+            $thisline .= $1;
+            $ascii_para= $';
+        }
+        $ascii_para =~ s/^\s*//;
+        print ASCII $thisline,"\n";
+        $ascii_indentstring= $ascii_nextindent;
+        last unless length($ascii_para);
+    }
+    $ascii_status= '';  $ascii_para= '';
+}    
+
+sub ascii_endpara {
+    return unless $ascii_status eq 'p';
+    &ascii_writepara;
+    print ASCII "\n";
+}
+
+sub ascii_endheading {
+    $ascii_para =~ s/\s*$//;
+    print ASCII "$ascii_para\n\n";
+    $ascii_status= '';
+    $ascii_para= '';
+}
+
+sub ascii_endmajorheading { &ascii_endheading(@_); }
+sub ascii_endminorheading { &ascii_endheading(@_); }
+
+sub ascii_startverbatim {
+    $ascii_vstatus= $ascii_status;
+    &ascii_writepara;
+}
+
+sub ascii_verbatim {
+    print ASCII $_[0],"\n";
+}
+
+sub ascii_endverbatim {
+    $ascii_status= $ascii_vstatus;
+}
+
+sub ascii_finish {
+    close(ASCII);
+}
+
+sub ascii_startindex { $ascii_status= ''; }
+sub ascii_endindex { $ascii_status= 'p'; }
+
+sub ascii_endindexitem {
+    printf ASCII " %-11s %-.66s\n",$ascii_left,$ascii_para;
+    $ascii_status= 'p';
+    $ascii_para= '';
+}
+
+sub ascii_startindexitem {
+    $ascii_left= $_[1];
+}
+
+sub ascii_startindexmainitem {
+    $ascii_left= $_[1];
+    print ASCII "\n" if $ascii_status eq 'p';
+}
+
+sub ascii_startindent {
+    $ascii_istatus= $ascii_status;
+    &ascii_writepara;
+    $ascii_indentstring= "   $ascii_indentstring";
+    $ascii_nextindent= "   $ascii_nextindent";
+}
+
+sub ascii_endindent {
+    $ascii_indentstring =~ s/^   //;
+    $ascii_nextindent =~ s/^   //;
+    $ascii_status= $ascii_istatus;
+}
+
+sub ascii_startpackedlist { $ascii_plc=0; }
+sub ascii_endpackedlist { &ascii_newline if !$ascii_plc; }
+sub ascii_packeditem {
+    &ascii_newline if !$ascii_plc;
+    &ascii_tab($ascii_plc*40+5);
+    $ascii_plc= !$ascii_plc;
+}
+
+sub ascii_startlist {
+    &ascii_endpara;
+    $ascii_indentstring= "  $ascii_indentstring";
+    $ascii_nextindent= "  $ascii_nextindent";
+}
+
+sub ascii_endlist {
+    &ascii_endpara;
+    $ascii_indentstring =~ s/^  //;
+    $ascii_nextindent =~ s/^  //;
+}
+
+sub ascii_item {
+    &ascii_newline;
+    $ascii_indentstring =~ s/  $/* /;
+}
+
+sub ascii_pageref {
+    &ascii_text("Q$_[1] \`");
+}
+
+sub ascii_endpageref {
+    &ascii_text("'");
+}
+
+1;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/m-html.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/m-html.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,337 @@
+## HTML output
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+%saniarray= ('<','lt', '>','gt', '&','amp', '"','quot');
+
+sub html_init {
+    $html_prefix = './'.$prefix;
+    $html_prefix =~ s:^\.//:/:;
+    system('rm','-r',"$html_prefix.html");
+    system('mkdir',"$html_prefix.html");
+    open(HTML,">$html_prefix.html/index.html");
+    print HTML "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2//EN\">\n";
+    print HTML "<html>\n";
+    $html_needpara= -1;
+    $html_end='';
+    chop($html_date=`date '+%d %B %Y'`);
+    chop($html_year=`date '+%Y'`);
+}
+
+sub html_startup {
+    print HTML <<END;
+<head><title>
+$user_title
+</title>
+<link rev="made" href="mailto:$user_authormail">
+<link rel="Contents" href="index.html">
+<link rel="Start" href="index.html">
+<META name="description"
+      content="Frequently asked questions and answers (FAQ) for FFTW.">
+<link rel="Bookmark" title="FFTW FAQ" href="index.html">
+<LINK rel="Bookmark" title="FFTW Home Page"
+      href="http://www.fftw.org">
+<LINK rel="Bookmark" title="FFTW Manual"
+      href="http://www.fftw.org/doc/">
+</head><body text="#000000" bgcolor="#FFFFFF"><h1>
+$user_title
+</h1>
+END
+    &html_readrefs($_[0]);
+    if (length($user_copyrightref)) {
+        local ($refn) = $qrefn{$user_copyrightref};
+        if (!length($refn)) {
+            warn "unknown question (copyright) `$user_copyrightref'";
+        }
+        $refn =~ m/(\d+)\.(\d+)/;
+        local ($s,$n) = ($1,$2);
+        $html_copyrighthref= ($s == $html_sectionn)?'':"section$s.html";
+        $html_copyrighthref.= "#$qn2ref{$s,$n}";
+    }
+}
+
+sub html_close {
+    print HTML $html_end,"<address>\n$user_author\n";
+    print HTML "- $html_date\n</address><br>\n";
+    print HTML "Extracted from $user_title,\n";
+    print HTML "<A href=\"$html_copyrighthref\">" if length($html_copyrighthref);
+    print HTML "Copyright &copy; $html_year $user_copyholder.";
+    print HTML "</A>" if length($html_copyrighthref);
+    print HTML "\n</body></html>\n";
+    close(HTML);
+}
+
+sub html_startmajorheading {
+    local ($ref, $this,$next,$back) = @_;
+    local ($nextt,$backt);
+    $this =~ s/^Section /section/;  $html_sectionn= $ref;
+    $next =~ s/^Section /section/ && ($nextt= $sn2title{$'});
+    $back =~ s/^Section /section/ ? ($backt= $sn2title{$'}) : ($back='');
+    if ($html_sectionn) {
+        &html_close;
+        open(HTML,">$html_prefix.html/$this.html");
+	print HTML "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2//EN\">\n";
+        print HTML "<html>\n";
+        $html_end= "<hr>\n";
+        $html_end.= "Next: <a href=\"$next.html\" rel=precedes>$nextt</a>.<br>\n"
+            if $next;
+        $html_end.= "Back: <a href=\"$back.html\" rev=precedes>$backt</a>.<br>\n"
+            if $back;
+        $html_end.= "<a href=\"index.html\" rev=subdocument>";
+        $html_end.= "Return to contents</a>.<p>\n";
+        print HTML <<END;
+<head><title>
+$user_brieftitle - Section $html_sectionn
+</title>
+<link rev="made" href="mailto:$user_authormail">
+<link rel="Contents" href="index.html">
+<link rel="Start" href="index.html">
+END
+        print HTML "<link rel=\"Next\" href=\"$next.html\">" if $next;
+	print HTML "<link rel=\"Previous\" href=\"$back.html\">" if $back;
+        print HTML <<END;
+<link rel="Bookmark" title="FFTW FAQ" href="index.html">
+</head><body text="#000000" bgcolor="#FFFFFF"><h1>
+$user_brieftitle - Section $html_sectionn <br>
+END
+        $html_needpara= -1;
+    }
+    else {
+	print HTML "\n<h1>\n";
+	$html_needpara=-1;
+    }
+}
+
+sub html_endmajorheading {
+    print HTML "\n</h1>\n\n";
+    $html_needpara=-1;
+}
+
+sub html_startminorheading {
+    local ($ref, $this) = @_;
+    $html_needpara=0;
+    $this =~ m/^Question (\d+)\.(\d+)/;
+    local ($s,$n) = ($1,$2);
+    print HTML "\n<h2><A name=\"$qn2ref{$s,$n}\">\n";
+}
+
+sub html_endminorheading {
+    print HTML "\n</A></h2>\n\n";
+    $html_needpara=-1;
+}
+
+sub html_newsgroup { &arg('newsgroup'); }
+sub html_endnewsgroup { &endarg('newsgroup'); }
+sub html_do_newsgroup {
+    print HTML "<A href=\"news:$_[0]\"><code>$_[0]</code></A>";
+}
+
+sub html_email { &arg('email'); }
+sub html_endemail { &endarg('email'); }
+sub html_do_email {
+    print HTML "<A href=\"mailto:$_[0]\"><code>$_[0]</code></A>";
+}
+
+sub html_courier    { print HTML "<code>" ; }
+sub html_endcourier { print HTML "</code>"; }
+sub html_italic     { print HTML "<i>"   ; }
+sub html_enditalic  { print HTML "</i>"  ; }
+
+sub html_docref { &arg('docref'); }
+sub html_enddocref { &endarg('docref'); }
+sub html_do_docref {
+    if (!defined($html_refval{$_[0]})) {
+        warn "undefined HTML reference $_[0]";
+        $html_refval{$n}='UNDEFINED';
+    }
+    print HTML "<A href=\"$html_refval{$_[0]}\">";
+    &recurse($_[0]);
+    print HTML "</A>";
+}
+
+sub html_readrefs {
+    local ($p);
+    open(HTMLREFS,"<$_[0]") || (warn("failed to open HTML refs $_[0]: $!"),return);
+    while(<HTMLREFS>) {
+        next if m/^\\\s/;
+        s/\s*\n$//;
+        if (s/^\\prefix\s*//) {
+            $p= $'; next;
+        } elsif (s/^\s*(\S.*\S)\s*\\\s*//) {
+            $_=$1; $v=$';
+            s/\\\\/\\/g;
+            $html_refval{$_}= $p.$v;
+        } else {
+            warn("ununderstood line in HTML refs >$_<");
+        }
+    }
+    close(HTMLREFS);
+}
+    
+sub html_ftpsilent { &arg('ftpsilent'); }
+sub html_endftpsilent { &endarg('ftpsilent'); }
+sub html_do_ftpsilent {
+    if ($_[0] =~ m/:/) {
+        $html_ftpsite= $`;
+        $html_ftpdir= $'.'/';
+    } else {
+        $html_ftpsite= $_[0];
+        $html_ftpdir= '';
+    }
+}
+
+sub html_ftpon { &arg('ftpon'); }
+sub html_endftpon { &endarg('ftpon'); }
+sub html_do_ftpon {
+#print STDERR "ftpon($_[0])\n";
+    $html_ftpsite= $_[0]; $html_ftpdir= '';
+    print HTML "<code>";
+    &recurse($_[0]);
+    print HTML "</code>";
+}
+
+sub html_ftpin { &arg('ftpin'); }
+sub html_endftpin { &endarg('ftpin'); }
+sub html_do_ftpin {
+#print STDERR "ftpin($_[0])\n";
+    print HTML "<A href=\"ftp://$html_ftpsite$html_ftpdir$_[0]\"><code>";
+    &recurse($_[0]);
+    print HTML "</code></A>";
+}
+
+sub html_text {
+    print HTML "\n<p>\n" if $html_needpara > 0;
+    $html_needpara=0;
+    $html_stuff= &html_sanitise($_[0]);
+    while ($html_stuff =~ s/^(.{40,70}) //) {
+        print HTML "$1\n";
+    }
+    print HTML $html_stuff;
+}
+
+sub html_tab {
+    $htmltabignore++ || warn "html tab ignored";
+}
+
+sub html_newline       { print HTML "<br>\n"    ;                       }
+sub html_startverbatim { print HTML "<pre>\n"   ;                       }
+sub html_verbatim      { print HTML &html_sanitise($_[0]),"\n";         }
+sub html_endverbatim   { print HTML "</pre>\n"  ;  $html_needpara= -1;  }
+
+sub html_endpara {
+    $html_needpara || $html_needpara++;
+}
+
+sub html_finish {
+    &html_close;
+}
+
+sub html_startindex {
+    print HTML "<ul>\n";
+}
+
+sub html_endindex {
+    print HTML "</ul><hr>\n";
+}
+
+sub html_startindexitem {
+    local ($ref,$qval) = @_;
+    $qval =~ m/Q(\d+)\.(\d+)/;
+    local ($s,$n) = ($1,$2);
+    print HTML "<li><a href=\"";
+    print HTML ($s == $html_sectionn)?'':"section$s.html";
+    print HTML "#$qn2ref{$s,$n}\" rel=subdocument>Q$s.$n. ";
+    $html_indexunhead='';
+}
+
+sub html_startindexmainitem {
+    local ($ref,$s) = @_;
+    $s =~ m/\d+/; $s= $&;
+    print HTML "<br><br>" if ($s > 1);
+    print HTML "<li><b><font size=\"+2\"><a href=\"section$s.html\" rel=subdocument>Section $s.  ";
+    $html_indexunhead='</font></b>';
+}
+
+sub html_endindexitem {
+    print HTML "</a>$html_indexunhead\n";
+}
+
+sub html_startlist {
+    print HTML "\n";
+    $html_itemend="<ul>";
+}
+
+sub html_endlist {
+    print HTML "$html_itemend\n</ul>\n";
+    $html_needpara=-1
+}
+
+sub html_item {
+    print HTML "$html_itemend\n<li>";
+    $html_itemend="";
+    $html_needpara=-1;
+}
+
+sub html_startpackedlist {
+    print HTML "\n";
+    $html_itemend="<dir>";
+}
+
+sub html_endpackedlist {
+    print HTML "$html_itemend\n</dir>\n";
+    $html_needpara=-1;
+}
+
+sub html_packeditem {
+    print HTML "$html_itemend\n<li>";
+    $html_itemend="";
+    $html_needpara=-1;
+}
+
+sub html_startindent   { print HTML "<blockquote>\n"; }
+sub html_endindent     { print HTML "</blockquote>\n"; }
+
+sub html_pageref {
+    local ($ref,$sq) = @_;
+    $sq =~ m/(\d+)\.(\d+)/;
+    local ($s,$n) = ($1,$2);
+    print HTML "<A href=\"";
+    print HTML ($s == $html_sectionn)?'':"section$s.html";
+    print HTML "#$qn2ref{$s,$n}\">Q$sq \`";
+}
+
+sub html_endpageref {
+    print HTML "'</A>";
+}
+
+sub html_sanitise {
+    local ($in) = @_;
+    local ($out);
+    while ($in =~ m/[<>&"]/) {
+        $out.= $`. '&'. $saniarray{$&}. ';';
+        $in=$';
+    }
+    $out.= $in;
+    $out;
+}
+
+1;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/m-info.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/m-info.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,226 @@
+## Info output
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+sub info_init {
+    open(INFO,">$prefix.info");
+    print INFO <<END;
+Info file: $prefix.info,    -*-Text-*-
+produced by bfnnconv.pl from the Bizarre Format With No Name.
+
+END
+}
+
+sub info_heading {
+    # refstring  Node  Next  Previous Up
+    print INFO "\nFile: $prefix.info, Node: $_[1]";
+    print INFO ", Next: $_[2]" if length($_[2]);
+    print INFO ", Previous: $_[3]" if length($_[3]);
+    print INFO ", Up: $_[4]" if length($_[4]);
+    print INFO "\n\n";
+    $info_status= '';
+}
+
+sub info_startmajorheading {
+    return if $_[0] eq '0';
+    &info_heading('s_'.$_[0],@_[1..$#_],'Top');
+}
+
+sub info_startminorheading {
+    &info_heading(@_);
+}
+
+sub info_italic { &info_text('*'); }
+sub info_enditalic { $info_para .= '*'; }
+
+sub info_email { &info_text('<'); } sub info_endemail { &info_text('>'); }
+
+sub info_ftpon { } sub info_endftpon { }
+sub info_ftpin { } sub info_endftpin { }
+sub info_docref { } sub info_enddocref { }
+sub info_courier { } sub info_endcourier { }
+sub info_newsgroup { }  sub info_endnewsgroup { }
+sub info_ftpsilent { $info_ignore++; }
+sub info_endftpsilent { $info_ignore--; }
+
+sub info_text {
+    return if $info_ignore;
+    if ($info_status eq '') {
+        $info_status= 'p';
+    }
+    $info_para .= $_[0];
+}
+
+sub info_tab {
+    local ($n) = $_[0]-length($info_para);
+    $info_para .= ' 'x$n if $n>0;
+}
+
+sub info_newline {
+    return unless $info_status eq 'p';
+    print INFO &info_writepara;
+}
+
+sub info_writepara {
+    local ($thisline, $thisword, $rest, $output);
+    for (;;) {
+        last unless $info_para =~ m/\S/;
+        $thisline= $info_indentstring;
+        for (;;) {
+            last unless $info_para =~ m/^(\s*\S+)/;
+            unless (length($1) + length($thisline) < 75 ||
+                    length($thisline) == length($info_indentstring)) {
+                last;
+            }
+            $thisline .= $1;
+            $info_para= $';
+        }
+        $info_para =~ s/^\s*//;
+        $output.= $thisline."\n";
+        $info_indentstring= $info_nextindent;
+        last unless length($info_para);
+    }
+    $info_status= '';  $info_para= '';
+    return $output;
+}    
+
+sub info_endpara {
+    return unless $info_status eq 'p';
+    print INFO &info_writepara;
+    print INFO "\n";
+}
+
+sub info_endheading {
+    $info_para =~ s/\s*$//;
+    print INFO "$info_para\n\n";
+    $info_status= '';
+    $info_para= '';
+}
+
+sub info_endmajorheading { &info_endheading(@_); }
+sub info_endminorheading { &info_endheading(@_); }
+
+sub info_startverbatim {
+    print INFO &info_writepara;
+}
+
+sub info_verbatim {
+    print INFO $_[0],"\n";
+}
+
+sub info_endverbatim {
+    $info_status= $info_vstatus;
+}
+
+sub info_finish {
+    close(INFO);
+}
+
+sub info_startindex {
+    &info_endpara;
+    $info_moredetail= '';
+    $info_status= '';
+}
+
+sub info_endindex {
+    print INFO "$info_moredetail\n" if length($info_moredetail);
+}
+
+sub info_endindexitem {
+    $info_indentstring= sprintf("* %-17s ",$info_label.'::');
+    $info_nextindent= ' 'x20;
+    local ($txt);
+    $txt= &info_writepara;
+    if ($info_main) {
+        print INFO $label.$txt;
+        $txt =~ s/^.{20}//;
+        $info_moredetail.= $txt;
+    } else {
+        $info_moredetail.= $label.$txt;
+    }
+    $info_indentstring= $info_nextindent= '';
+    $info_status='p';
+}
+
+sub info_startindexitem {
+    print INFO "* Menu:\n" if $info_status eq '';
+    $info_status= '';
+    $info_label= $_[2];
+    $info_main= 0;
+}
+
+sub info_startindexmainitem {
+    print INFO "* Menu:\n" if $info_status eq '';
+    $info_label= $_[2];
+    $info_main= 1;
+    $info_moredetail .= "\n$_[2], ";
+    $info_status= '';
+}
+
+sub info_startindent {
+    $info_istatus= $info_status;
+    print INFO &info_writepara;
+    $info_indentstring= "   $info_indentstring";
+    $info_nextindent= "   $info_nextindent";
+}
+
+sub info_endindent {
+    $info_indentstring =~ s/^   //;
+    $info_nextindent =~ s/^   //;
+    $info_status= $info_istatus;
+}
+
+sub info_startpackedlist { $info_plc=0; }
+sub info_endpackedlist { &info_newline if !$info_plc; }
+sub info_packeditem {
+    &info_newline if !$info_plc;
+    &info_tab($info_plc*40+5);
+    $info_plc= !$info_plc;
+}
+
+sub info_startlist {
+    $info_istatus= $info_status;
+    print INFO &info_writepara;
+    $info_indentstring= "  $info_indentstring";
+    $info_nextindent= "  $info_nextindent";
+}
+
+sub info_endlist {
+    $info_indentstring =~ s/^  //;
+    $info_nextindent =~ s/^  //;
+    $info_status= $info_lstatus;
+}
+
+sub info_item {
+    &info_newline;
+    $info_indentstring =~ s/  $/* /;
+}
+
+sub info_pageref {
+    &info_text("*Note Question $_[1]:: \`");
+}
+
+sub info_endpageref {
+    &info_text("'");
+}
+
+1;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/m-lout.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/m-lout.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,242 @@
+## Lout output
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+sub lout_init {
+    open(LOUT,">$prefix.lout");
+    chop($dprint= `date '+%d %B %Y'`);
+    $dprint =~ s/^0//;
+}
+
+sub lout_startup {
+    local ($lbs) = &lout_sanitise($user_brieftitle);
+    print LOUT <<END;
+\@SysInclude{ fontdefs }
+\@SysInclude{ langdefs }
+\@SysInclude{ dl }
+\@SysInclude{ docf }
+\@Use { \@DocumentLayout
+  \@OddTop { \@Null }
+  \@EvenTop { \@Null }
+  \@StartOddTop { \@Null }
+  \@StartEvenTop { \@Null }
+  \@OddFoot { { $lbs } \@Centre{ - \@PageNum - } \@Right{ $dprint } }
+  \@EvenFoot { { $lbs } \@Centre{ - \@PageNum - } \@Right{ $dprint } }
+  \@StartOddFoot { { $lbs } \@Centre{ - \@PageNum - } \@Right{ $dprint } }
+  \@StartEvenFoot { { $lbs } \@Centre{ - \@PageNum - } \@Right{ $dprint } }
+  \@ParaGap { 1.70vx }
+  \@InitialBreak { 1.0fx ragged hyphen }
+}
+\@Use { \@OrdinaryLayout }
+END
+    $lout_textstatus= 'p';
+}
+
+sub lout_pageref {
+    print LOUT "Q$_[1] (page {\@PageOf{$_[0]}}) ";
+    &lout_text("\`");
+}
+
+sub lout_endpageref {
+    &lout_text("'");
+}
+
+sub lout_finish {
+    print LOUT "\@End \@Text\n";
+    close(L);
+}
+
+sub lout_startmajorheading {
+    $lout_styles .= 'h';
+    print LOUT <<END
+\@CNP
+{
+  newpath   0  ysize 0.3 ft sub  moveto
+            xsize  0  rlineto
+            0  0.2 ft  rlineto
+            xsize neg  0  rlineto
+  closepath fill
+} \@Graphic { //1.6f \@HAdjust \@Heading{
+END
+    ;
+    $endh= "}\n{\@PageMark s_$_[0]}\n/1.0fo\n";
+    &lout_text($_[0] ? "Section $_[0].  " : '');
+}
+
+sub lout_startminorheading {
+    $lout_styles .= 'h';
+    print LOUT "//0.2f \@CNP {\@PageMark $_[0]} \@Heading{\n";
+    $endh= '';
+}
+
+sub lout_endheading {
+    $lout_styles =~ s/.$//; print LOUT "}\n$endh";
+    $lout_status= 'p';
+}
+
+sub lout_endmajorheading { &lout_endheading(@_); }
+sub lout_endminorheading { &lout_endheading(@_); }
+
+sub lout_courier {
+    $lout_styles .= 'f';
+    print LOUT "{{0.7 1.0} \@Scale {Courier Bold} \@Font {";
+}
+
+sub lout_endcourier {
+    $lout_styles =~ s/.$//; print LOUT "}}";
+}
+
+sub lout_italic { $lout_styles .= 'f'; print LOUT "{Slope \@Font {"; }
+sub lout_enditalic { $lout_styles =~ s/.$//; print LOUT "}}"; }
+
+sub lout_startindent { $lout_styles .= 'i'; print LOUT "\@IndentedDisplay {\n"; }
+
+sub lout_endindent {
+    &lout_endpara;
+    $lout_styles =~ s/.$//; print LOUT "}\n\@LP\n";
+}
+
+sub lout_startpackedlist { $lout_plc=-1; }
+sub lout_endpackedlist { &lout_newline if !$lout_plc; }
+sub lout_packeditem {
+    &lout_newline if !$lout_plc;
+    &lout_tab(($lout_plc>0)*40+5);
+    $lout_plc= !$lout_plc;
+}
+
+sub lout_startlist {
+    &lout_endpara;
+    print LOUT "\@RawIndentedList style {\@Bullet} indent {0.5i} gap {1.1vx}\n";
+    $lout_styles .= 'l';
+    $lout_status= '';
+}
+
+sub lout_endlist {
+    &lout_endpara;
+    print LOUT "\@EndList\n\n";
+    $lout_styles =~ s/.$//;
+}
+
+sub lout_item {
+    &lout_endpara;
+    print LOUT "\@ListItem{";
+    $lout_styles.= 'I';
+}
+
+sub lout_startindex {
+    print LOUT "//0.0fe\n";
+}
+
+sub lout_endindex {
+    $lout_status='p';
+}
+
+sub lout_startindexmainitem {
+    $lout_marker= $_[0];
+    $lout_status= '';
+    print LOUT "//0.3vx Bold \@Font \@HAdjust { \@HContract { { $_[1] } |3cx {";
+    $lout_iiendheight= '1.00';
+    $lout_styles .= 'X';
+}
+
+sub lout_startindexitem {
+    $lout_marker= $_[0];
+    print LOUT "\@HAdjust { \@HContract { { $_[1] } |3cx {";
+    $lout_iiendheight= '0.95';
+    $lout_styles .= 'X';
+}
+
+sub lout_endindexitem {
+    print LOUT "} } |0c \@PageOf { $lout_marker } } //${lout_iiendheight}vx\n";
+    $lout_styles =~ s/.$//;
+}
+
+sub lout_email { &lout_courier; &lout_text('<'); }
+sub lout_endemail { &lout_text('>'); &lout_endcourier; }
+
+sub lout_ftpon { &lout_courier; }  sub lout_endftpon { &lout_endcourier; }
+sub lout_ftpin { &lout_courier; }  sub lout_endftpin { &lout_endcourier; }
+sub lout_docref { }  sub lout_enddocref { }
+sub lout_ftpsilent { $lout_ignore++; }
+sub lout_endftpsilent { $lout_ignore--; }
+
+sub lout_newsgroup { &lout_courier; }
+sub lout_endnewsgroup { &lout_endcourier; }
+
+sub lout_text {
+    return if $lout_ignore;
+    $lout_status= 'p';
+    $_= &lout_sanitise($_[0]);
+    s/ $/\n/ unless $lout_styles =~ m/[fhX]/;
+    print LOUT $_;
+}
+
+sub lout_tab {
+    local ($size) = $_[0]*0.5;
+    print LOUT " |${size}ft ";
+}
+
+sub lout_newline {
+    print LOUT " //1.0vx\n";
+}
+
+sub lout_sanitise {
+    local ($in) = @_;
+    local ($out);
+    $in= ' '.$in.' ';
+    $out='';
+    while ($in =~ m/(\s)(\S*[\@\/|\\\"\^\&\{\}\#]\S*)(\s)/) {
+        $out .= $`.$1;
+        $in = $3.$';
+        $_= $2;
+        s/[\\\"]/\\$&/g;
+        $out .= '"'.$_.'"';
+    }
+    $out .= $in;
+    $out =~ s/^ //;  $out =~ s/ $//;
+    $out;
+}
+
+sub lout_endpara {
+    return if $lout_status eq '';
+    if ($lout_styles eq '') {
+        print LOUT "\@LP\n\n";
+    } elsif ($lout_styles =~ s/I$//) {
+        print LOUT "}\n";
+    }
+    $lout_status= '';
+}
+
+sub lout_startverbatim {
+    print LOUT "//0.4f\n\@RawIndentedDisplay lines \@Break".
+               " { {0.7 1.0} \@Scale {Courier Bold} \@Font {\n";
+}
+
+sub lout_verbatim {
+    $_= $_[0];
+    s/^\s*//;
+    print LOUT &lout_sanitise($_),"\n";
+}
+
+sub lout_endverbatim { print LOUT "}\n}\n//0.4f\n"; }
+
+1;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/FAQ/m-post.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/FAQ/m-post.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,189 @@
+## POST output
+# Copyright (C) 1993-1995 Ian Jackson.
+
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# It is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# (Note: I do not consider works produced using these BFNN processing
+# tools to be derivative works of the tools, so they are NOT covered
+# by the GPL.  However, I would appreciate it if you credited me if
+# appropriate in any documents you format using BFNN.)
+
+sub post_init {
+    open(POST,">$prefix.post");
+}
+
+sub post_startmajorheading {
+    print POST '='x79,"\n\n";
+    $post_status= 'h';
+    &post_text($_[0] ? "Section $_[0].  " : '');
+}
+
+sub post_startminorheading {
+    print POST '-'x77,"\n\n";
+    $post_status= 'h';
+}
+
+sub post_italic { &post_text('*'); }
+sub post_enditalic { $post_para .= '*'; }
+
+sub post_email { &post_text('<'); } sub post_endemail { &post_text('>'); }
+
+sub post_ftpon { } sub post_endftpon { }
+sub post_ftpin { } sub post_endftpin { }
+sub post_docref { } sub post_enddocref { }
+sub post_courier { } sub post_endcourier { }
+sub post_newsgroup { }  sub post_endnewsgroup { }
+sub post_ftpsilent { $post_ignore++; }
+sub post_endftpsilent { $post_ignore--; }
+
+sub post_text {
+    return if $post_ignore;
+    if ($post_status eq '') {
+        $post_status= 'p';
+    }
+    $post_para .= $_[0];
+}
+
+sub post_tab {
+    local ($n) = $_[0]-length($post_para);
+    $post_para .= ' 'x$n if $n>0;
+}
+
+sub post_newline {
+    return unless $post_status eq 'p';
+    &post_writepara;
+}
+
+sub post_writepara {
+    local ($thisline, $thisword, $rest);
+    for (;;) {
+        last unless $post_para =~ m/\S/;
+        $thisline= $post_indentstring;
+        for (;;) {
+            last unless $post_para =~ m/^(\s*\S+)/;
+            unless (length($1) + length($thisline) < 75 ||
+                    length($thisline) == length($post_indentstring)) {
+                last;
+            }
+            $thisline .= $1;
+            $post_para= $';
+        }
+        $post_para =~ s/^\s*//;
+        print POST $thisline,"\n";
+        $post_indentstring= $post_nextindent;
+        last unless length($post_para);
+    }
+    $post_status= '';  $post_para= '';
+}    
+
+sub post_endpara {
+    return unless $post_status eq 'p';
+    &post_writepara;
+    print POST "\n";
+}
+
+sub post_endheading {
+    $post_para =~ s/\s*$//;
+    print POST "$post_para\n\n";
+    $post_status= '';
+    $post_para= '';
+}
+
+sub post_endmajorheading { &post_endheading(@_); }
+sub post_endminorheading { &post_endheading(@_); }
+
+sub post_startverbatim {
+    $post_vstatus= $post_status;
+    &post_writepara;
+}
+
+sub post_verbatim {
+    print POST $_[0],"\n";
+}
+
+sub post_endverbatim {
+    $post_status= $post_vstatus;
+}
+
+sub post_finish {
+    close(POST);
+}
+
+sub post_startindex { $post_status= ''; }
+sub post_endindex { $post_status= 'p'; }
+
+sub post_endindexitem {
+    printf POST " %-11s %-.66s\n",$post_left,$post_para;
+    $post_status= 'p';
+    $post_para= '';
+}
+
+sub post_startindexitem {
+    $post_left= $_[1];
+}
+
+sub post_startindexmainitem {
+    $post_left= $_[1];
+    print POST "\n" if $post_status eq 'p';
+}
+
+sub post_startindent {
+    $post_istatus= $post_status;
+    &post_writepara;
+    $post_indentstring= "   $post_indentstring";
+    $post_nextindent= "   $post_nextindent";
+}
+
+sub post_endindent {
+    $post_indentstring =~ s/^   //;
+    $post_nextindent =~ s/^   //;
+    $post_status= $post_istatus;
+}
+
+sub post_startpackedlist { $post_plc=0; }
+sub post_endpackedlist { &post_newline if !$post_plc; }
+sub post_packeditem {
+    &post_newline if !$post_plc;
+    &post_tab($post_plc*40+5);
+    $post_plc= !$post_plc;
+}
+
+sub post_startlist {
+    &post_endpara;
+    $post_indentstring= "  $post_indentstring";
+    $post_nextindent= "  $post_nextindent";
+}
+
+sub post_endlist {
+    &post_endpara;
+    $post_indentstring =~ s/^  //;
+    $post_nextindent =~ s/^  //;
+}
+
+sub post_item {
+    &post_newline;
+    $post_indentstring =~ s/  $/* /;
+}
+
+sub post_pageref {
+    &post_text("Q$_[1] \`");
+}
+
+sub post_endpageref {
+    &post_text("'");
+}
+
+1;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,37 @@
+SUBDIRS = FAQ
+
+info_TEXINFOS = fftw3.texi
+fftw3_TEXINFOS = acknowledgements.texi cindex.texi fftw3.texi findex.texi install.texi intro.texi legacy-fortran.texi license.texi modern-fortran.texi mpi.texi other.texi reference.texi threads.texi tutorial.texi upgrading.texi version.texi rfftwnd.pdf rfftwnd.eps
+
+DVIPS = dvips -Pwww
+
+EQN_IMAGES = equation-dft.png equation-dht.png equation-idft.png	\
+equation-redft00.png equation-redft01.png equation-redft10.png		\
+equation-redft11.png equation-rodft00.png equation-rodft01.png		\
+equation-rodft10.png equation-rodft11.png
+
+EXTRA_DIST = f77_wisdom.f fftw3.pdf html rfftwnd.fig rfftwnd.eps	\
+rfftwnd.pdf rfftwnd-for-html.png $(EQN_IMAGES)
+
+html: $(fftw3_TEXINFOS) $(EQN_IMAGES) rfftwnd-for-html.png
+	$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
+		--html --number-sections -o html fftw3.texi
+	for i in $(EQN_IMAGES); do cp -f ${srcdir}/$$i html; done
+	cp -f rfftwnd-for-html.png html
+
+maintainer-clean-local:
+	rm -rf html
+
+if MAINTAINER_MODE
+# generate the figure for the manual and distribute the binaries, so that
+# people don't need to have fig2dev installed.
+rfftwnd.eps: rfftwnd.fig
+	fig2dev -L eps -m .7 ${srcdir}/rfftwnd.fig rfftwnd.eps
+
+rfftwnd-for-html.png: rfftwnd.fig
+	fig2dev -L png -m 1 ${srcdir}/rfftwnd.fig rfftwnd-for-html.png
+
+rfftwnd.pdf: rfftwnd.fig
+	fig2dev -L pdf -m .7 ${srcdir}/rfftwnd.fig rfftwnd.pdf
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,956 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = doc
+DIST_COMMON = $(fftw3_TEXINFOS) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in $(srcdir)/stamp-vti \
+	$(srcdir)/version.texi mdate-sh texinfo.tex
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+INFO_DEPS = $(srcdir)/fftw3.info
+am__TEXINFO_TEX_DIR = $(srcdir)
+DVIS = fftw3.dvi
+PDFS = fftw3.pdf
+PSS = fftw3.ps
+HTMLS = fftw3.html
+TEXINFOS = fftw3.texi
+TEXI2DVI = texi2dvi
+TEXI2PDF = $(TEXI2DVI) --pdf --batch
+MAKEINFOHTML = $(MAKEINFO) --html
+AM_MAKEINFOHTMLFLAGS = $(AM_MAKEINFOFLAGS)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__installdirs = "$(DESTDIR)$(infodir)"
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+SUBDIRS = FAQ
+info_TEXINFOS = fftw3.texi
+fftw3_TEXINFOS = acknowledgements.texi cindex.texi fftw3.texi findex.texi install.texi intro.texi legacy-fortran.texi license.texi modern-fortran.texi mpi.texi other.texi reference.texi threads.texi tutorial.texi upgrading.texi version.texi rfftwnd.pdf rfftwnd.eps
+DVIPS = dvips -Pwww
+EQN_IMAGES = equation-dft.png equation-dht.png equation-idft.png	\
+equation-redft00.png equation-redft01.png equation-redft10.png		\
+equation-redft11.png equation-rodft00.png equation-rodft01.png		\
+equation-rodft10.png equation-rodft11.png
+
+EXTRA_DIST = f77_wisdom.f fftw3.pdf html rfftwnd.fig rfftwnd.eps	\
+rfftwnd.pdf rfftwnd-for-html.png $(EQN_IMAGES)
+
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .dvi .html .info .pdf .ps .texi
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu doc/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu doc/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+.texi.info:
+	restore=: && backupdir="$(am__leading_dot)am$$$$" && \
+	am__cwd=`pwd` && $(am__cd) $(srcdir) && \
+	rm -rf $$backupdir && mkdir $$backupdir && \
+	if ($(MAKEINFO) --version) >/dev/null 2>&1; then \
+	  for f in $@ $@-[0-9] $@-[0-9][0-9] $(@:.info=).i[0-9] $(@:.info=).i[0-9][0-9]; do \
+	    if test -f $$f; then mv $$f $$backupdir; restore=mv; else :; fi; \
+	  done; \
+	else :; fi && \
+	cd "$$am__cwd"; \
+	if $(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
+	 -o $@ $<; \
+	then \
+	  rc=0; \
+	  $(am__cd) $(srcdir); \
+	else \
+	  rc=$$?; \
+	  $(am__cd) $(srcdir) && \
+	  $$restore $$backupdir/* `echo "./$@" | sed 's|[^/]*$$||'`; \
+	fi; \
+	rm -rf $$backupdir; exit $$rc
+
+.texi.dvi:
+	TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \
+	MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \
+	$(TEXI2DVI) $<
+
+.texi.pdf:
+	TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \
+	MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \
+	$(TEXI2PDF) $<
+
+.texi.html:
+	rm -rf $(@:.html=.htp)
+	if $(MAKEINFOHTML) $(AM_MAKEINFOHTMLFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
+	 -o $(@:.html=.htp) $<; \
+	then \
+	  rm -rf $@; \
+	  if test ! -d $(@:.html=.htp) && test -d $(@:.html=); then \
+	    mv $(@:.html=) $@; else mv $(@:.html=.htp) $@; fi; \
+	else \
+	  if test ! -d $(@:.html=.htp) && test -d $(@:.html=); then \
+	    rm -rf $(@:.html=); else rm -Rf $(@:.html=.htp) $@; fi; \
+	  exit 1; \
+	fi
+$(srcdir)/fftw3.info: fftw3.texi $(srcdir)/version.texi $(fftw3_TEXINFOS)
+fftw3.dvi: fftw3.texi $(srcdir)/version.texi $(fftw3_TEXINFOS)
+fftw3.pdf: fftw3.texi $(srcdir)/version.texi $(fftw3_TEXINFOS)
+fftw3.html: fftw3.texi $(srcdir)/version.texi $(fftw3_TEXINFOS)
+$(srcdir)/version.texi: @MAINTAINER_MODE_TRUE@ $(srcdir)/stamp-vti
+$(srcdir)/stamp-vti: fftw3.texi $(top_srcdir)/configure
+	@(dir=.; test -f ./fftw3.texi || dir=$(srcdir); \
+	set `$(SHELL) $(srcdir)/mdate-sh $$dir/fftw3.texi`; \
+	echo "@set UPDATED $$1 $$2 $$3"; \
+	echo "@set UPDATED-MONTH $$2 $$3"; \
+	echo "@set EDITION $(VERSION)"; \
+	echo "@set VERSION $(VERSION)") > vti.tmp
+	@cmp -s vti.tmp $(srcdir)/version.texi \
+	  || (echo "Updating $(srcdir)/version.texi"; \
+	      cp vti.tmp $(srcdir)/version.texi)
+	-@rm -f vti.tmp
+	@cp $(srcdir)/version.texi $@
+
+mostlyclean-vti:
+	-rm -f vti.tmp
+
+maintainer-clean-vti:
+@MAINTAINER_MODE_TRUE@	-rm -f $(srcdir)/stamp-vti $(srcdir)/version.texi
+.dvi.ps:
+	TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \
+	$(DVIPS) -o $@ $<
+
+uninstall-dvi-am:
+	@$(NORMAL_UNINSTALL)
+	@list='$(DVIS)'; test -n "$(dvidir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(dvidir)/$$f'"; \
+	  rm -f "$(DESTDIR)$(dvidir)/$$f"; \
+	done
+
+uninstall-html-am:
+	@$(NORMAL_UNINSTALL)
+	@list='$(HTMLS)'; test -n "$(htmldir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " rm -rf '$(DESTDIR)$(htmldir)/$$f'"; \
+	  rm -rf "$(DESTDIR)$(htmldir)/$$f"; \
+	done
+
+uninstall-info-am:
+	@$(PRE_UNINSTALL)
+	@if test -d '$(DESTDIR)$(infodir)' && $(am__can_run_installinfo); then \
+	  list='$(INFO_DEPS)'; \
+	  for file in $$list; do \
+	    relfile=`echo "$$file" | sed 's|^.*/||'`; \
+	    echo " install-info --info-dir='$(DESTDIR)$(infodir)' --remove '$(DESTDIR)$(infodir)/$$relfile'"; \
+	    if install-info --info-dir="$(DESTDIR)$(infodir)" --remove "$(DESTDIR)$(infodir)/$$relfile"; \
+	    then :; else test ! -f "$(DESTDIR)$(infodir)/$$relfile" || exit 1; fi; \
+	  done; \
+	else :; fi
+	@$(NORMAL_UNINSTALL)
+	@list='$(INFO_DEPS)'; \
+	for file in $$list; do \
+	  relfile=`echo "$$file" | sed 's|^.*/||'`; \
+	  relfile_i=`echo "$$relfile" | sed 's|\.info$$||;s|$$|.i|'`; \
+	  (if test -d "$(DESTDIR)$(infodir)" && cd "$(DESTDIR)$(infodir)"; then \
+	     echo " cd '$(DESTDIR)$(infodir)' && rm -f $$relfile $$relfile-[0-9] $$relfile-[0-9][0-9] $$relfile_i[0-9] $$relfile_i[0-9][0-9]"; \
+	     rm -f $$relfile $$relfile-[0-9] $$relfile-[0-9][0-9] $$relfile_i[0-9] $$relfile_i[0-9][0-9]; \
+	   else :; fi); \
+	done
+
+uninstall-pdf-am:
+	@$(NORMAL_UNINSTALL)
+	@list='$(PDFS)'; test -n "$(pdfdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(pdfdir)/$$f'"; \
+	  rm -f "$(DESTDIR)$(pdfdir)/$$f"; \
+	done
+
+uninstall-ps-am:
+	@$(NORMAL_UNINSTALL)
+	@list='$(PSS)'; test -n "$(psdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(psdir)/$$f'"; \
+	  rm -f "$(DESTDIR)$(psdir)/$$f"; \
+	done
+
+dist-info: $(INFO_DEPS)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
+	list='$(INFO_DEPS)'; \
+	for base in $$list; do \
+	  case $$base in \
+	    $(srcdir)/*) base=`echo "$$base" | sed "s|^$$srcdirstrip/||"`;; \
+	  esac; \
+	  if test -f $$base; then d=.; else d=$(srcdir); fi; \
+	  base_i=`echo "$$base" | sed 's|\.info$$||;s|$$|.i|'`; \
+	  for file in $$d/$$base $$d/$$base-[0-9] $$d/$$base-[0-9][0-9] $$d/$$base_i[0-9] $$d/$$base_i[0-9][0-9]; do \
+	    if test -f $$file; then \
+	      relfile=`expr "$$file" : "$$d/\(.*\)"`; \
+	      test -f "$(distdir)/$$relfile" || \
+		cp -p $$file "$(distdir)/$$relfile"; \
+	    else :; fi; \
+	  done; \
+	done
+
+mostlyclean-aminfo:
+	-rm -rf fftw3.aux fftw3.cp fftw3.cps fftw3.ct fftw3.fc fftw3.ff fftw3.fn \
+	  fftw3.fns fftw3.fp fftw3.ky fftw3.kys fftw3.log fftw3.pg \
+	  fftw3.tmp fftw3.toc fftw3.tp fftw3.vr
+
+clean-aminfo:
+	-test -z "fftw3.dvi fftw3.pdf fftw3.ps fftw3.html" \
+	|| rm -rf fftw3.dvi fftw3.pdf fftw3.ps fftw3.html
+
+maintainer-clean-aminfo:
+	@list='$(INFO_DEPS)'; for i in $$list; do \
+	  i_i=`echo "$$i" | sed 's|\.info$$||;s|$$|.i|'`; \
+	  echo " rm -f $$i $$i-[0-9] $$i-[0-9][0-9] $$i_i[0-9] $$i_i[0-9][0-9]"; \
+	  rm -f $$i $$i-[0-9] $$i-[0-9][0-9] $$i_i[0-9] $$i_i[0-9][0-9]; \
+	done
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+	$(MAKE) $(AM_MAKEFLAGS) \
+	  top_distdir="$(top_distdir)" distdir="$(distdir)" \
+	  dist-info
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(INFO_DEPS)
+installdirs: installdirs-recursive
+installdirs-am:
+	for dir in "$(DESTDIR)$(infodir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-aminfo clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am: $(DVIS)
+
+html-am: $(HTMLS)
+
+info: info-recursive
+
+info-am: $(INFO_DEPS)
+
+install-data-am: install-info-am
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am: $(DVIS)
+	@$(NORMAL_INSTALL)
+	@list='$(DVIS)'; test -n "$(dvidir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(dvidir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(dvidir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(dvidir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(dvidir)" || exit $$?; \
+	done
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am: $(HTMLS)
+	@$(NORMAL_INSTALL)
+	@list='$(HTMLS)'; list2=; test -n "$(htmldir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(htmldir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(htmldir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p" || test -d "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  $(am__strip_dir) \
+	  d2=$$d$$p; \
+	  if test -d "$$d2"; then \
+	    echo " $(MKDIR_P) '$(DESTDIR)$(htmldir)/$$f'"; \
+	    $(MKDIR_P) "$(DESTDIR)$(htmldir)/$$f" || exit 1; \
+	    echo " $(INSTALL_DATA) '$$d2'/* '$(DESTDIR)$(htmldir)/$$f'"; \
+	    $(INSTALL_DATA) "$$d2"/* "$(DESTDIR)$(htmldir)/$$f" || exit $$?; \
+	  else \
+	    list2="$$list2 $$d2"; \
+	  fi; \
+	done; \
+	test -z "$$list2" || { echo "$$list2" | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(htmldir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(htmldir)" || exit $$?; \
+	done; }
+install-info: install-info-recursive
+
+install-info-am: $(INFO_DEPS)
+	@$(NORMAL_INSTALL)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
+	list='$(INFO_DEPS)'; test -n "$(infodir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(infodir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(infodir)" || exit 1; \
+	fi; \
+	for file in $$list; do \
+	  case $$file in \
+	    $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
+	  esac; \
+	  if test -f $$file; then d=.; else d=$(srcdir); fi; \
+	  file_i=`echo "$$file" | sed 's|\.info$$||;s|$$|.i|'`; \
+	  for ifile in $$d/$$file $$d/$$file-[0-9] $$d/$$file-[0-9][0-9] \
+	               $$d/$$file_i[0-9] $$d/$$file_i[0-9][0-9] ; do \
+	    if test -f $$ifile; then \
+	      echo "$$ifile"; \
+	    else : ; fi; \
+	  done; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(infodir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(infodir)" || exit $$?; done
+	@$(POST_INSTALL)
+	@if $(am__can_run_installinfo); then \
+	  list='$(INFO_DEPS)'; test -n "$(infodir)" || list=; \
+	  for file in $$list; do \
+	    relfile=`echo "$$file" | sed 's|^.*/||'`; \
+	    echo " install-info --info-dir='$(DESTDIR)$(infodir)' '$(DESTDIR)$(infodir)/$$relfile'";\
+	    install-info --info-dir="$(DESTDIR)$(infodir)" "$(DESTDIR)$(infodir)/$$relfile" || :;\
+	  done; \
+	else : ; fi
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am: $(PDFS)
+	@$(NORMAL_INSTALL)
+	@list='$(PDFS)'; test -n "$(pdfdir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(pdfdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(pdfdir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pdfdir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(pdfdir)" || exit $$?; done
+install-ps: install-ps-recursive
+
+install-ps-am: $(PSS)
+	@$(NORMAL_INSTALL)
+	@list='$(PSS)'; test -n "$(psdir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(psdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(psdir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(psdir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(psdir)" || exit $$?; done
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-aminfo \
+	maintainer-clean-generic maintainer-clean-local \
+	maintainer-clean-vti
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-aminfo mostlyclean-generic \
+	mostlyclean-libtool mostlyclean-vti
+
+pdf: pdf-recursive
+
+pdf-am: $(PDFS)
+
+ps: ps-recursive
+
+ps-am: $(PSS)
+
+uninstall-am: uninstall-dvi-am uninstall-html-am uninstall-info-am \
+	uninstall-pdf-am uninstall-ps-am
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-aminfo clean-generic \
+	clean-libtool ctags ctags-recursive dist-info distclean \
+	distclean-generic distclean-libtool distclean-tags distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs installdirs-am \
+	maintainer-clean maintainer-clean-aminfo \
+	maintainer-clean-generic maintainer-clean-local \
+	maintainer-clean-vti mostlyclean mostlyclean-aminfo \
+	mostlyclean-generic mostlyclean-libtool mostlyclean-vti pdf \
+	pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \
+	uninstall-dvi-am uninstall-html-am uninstall-info-am \
+	uninstall-pdf-am uninstall-ps-am
+
+
+html: $(fftw3_TEXINFOS) $(EQN_IMAGES) rfftwnd-for-html.png
+	$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
+		--html --number-sections -o html fftw3.texi
+	for i in $(EQN_IMAGES); do cp -f ${srcdir}/$$i html; done
+	cp -f rfftwnd-for-html.png html
+
+maintainer-clean-local:
+	rm -rf html
+
+# generate the figure for the manual and distribute the binaries, so that
+# people don't need to have fig2dev installed.
+@MAINTAINER_MODE_TRUE@rfftwnd.eps: rfftwnd.fig
+@MAINTAINER_MODE_TRUE@	fig2dev -L eps -m .7 ${srcdir}/rfftwnd.fig rfftwnd.eps
+
+@MAINTAINER_MODE_TRUE@rfftwnd-for-html.png: rfftwnd.fig
+@MAINTAINER_MODE_TRUE@	fig2dev -L png -m 1 ${srcdir}/rfftwnd.fig rfftwnd-for-html.png
+
+@MAINTAINER_MODE_TRUE@rfftwnd.pdf: rfftwnd.fig
+@MAINTAINER_MODE_TRUE@	fig2dev -L pdf -m .7 ${srcdir}/rfftwnd.fig rfftwnd.pdf
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/acknowledgements.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/acknowledgements.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,89 @@
+@node Acknowledgments, License and Copyright, Installation and Customization, Top
+@chapter Acknowledgments
+
+Matteo Frigo was supported in part by the Special Research Program SFB
+F011 ``AURORA'' of the Austrian Science Fund FWF and by MIT Lincoln
+Laboratory.  For previous versions of FFTW, he was supported in part by the
+Defense Advanced Research Projects Agency (DARPA), under Grants
+N00014-94-1-0985 and F30602-97-1-0270, and by a Digital Equipment
+Corporation Fellowship.  
+
+Steven G. Johnson was supported in part by a Dept.@ of Defense NDSEG
+Fellowship, an MIT Karl Taylor Compton Fellowship, and by the Materials
+Research Science and Engineering Center program of the National Science
+Foundation under award DMR-9400334.
+
+Code for the Cell Broadband Engine was graciously donated to the FFTW
+project by the IBM Austin Research Lab and included in fftw-3.2.  (This
+code was removed in fftw-3.3.)
+
+Code for the MIPS paired-single SIMD support was graciously donated to
+the FFTW project by CodeSourcery, Inc.
+
+We are grateful to Sun Microsystems Inc.@ for its donation of a
+cluster of 9 8-processor Ultra HPC 5000 SMPs (24 Gflops peak). These
+machines served as the primary platform for the development of early
+versions of FFTW.
+
+We thank Intel Corporation for donating a four-processor Pentium Pro
+machine.  We thank the GNU/Linux community for giving us a decent OS to
+run on that machine.
+
+We are thankful to the AMD corporation for donating an AMD Athlon XP 1700+
+computer to the FFTW project.
+
+We thank the Compaq/HP testdrive program and VA Software Corporation
+(SourceForge.net) for providing remote access to machines that were used
+to test FFTW.
+
+The @code{genfft} suite of code generators was written using Objective
+Caml, a dialect of ML.  Objective Caml is a small and elegant language
+developed by Xavier Leroy.  The implementation is available from
+@uref{http://caml.inria.fr/, @code{http://caml.inria.fr/}}.  In previous
+releases of FFTW, @code{genfft} was written in Caml Light, by the same
+authors.  An even earlier implementation of @code{genfft} was written in
+Scheme, but Caml is definitely better for this kind of application.
+@cindex Caml
+@cindex LISP
+
+
+FFTW uses many tools from the GNU project, including @code{automake},
+@code{texinfo}, and @code{libtool}.
+
+Prof.@ Charles E.@ Leiserson of MIT provided continuous support and
+encouragement.  This program would not exist without him.  Charles also
+proposed the name ``codelets'' for the basic FFT blocks.
+@cindex codelet
+
+
+Prof.@ John D.@ Joannopoulos of MIT demonstrated continuing tolerance of
+Steven's ``extra-curricular'' computer-science activities, as well as
+remarkable creativity in working them into his grant proposals.
+Steven's physics degree would not exist without him.
+
+Franz Franchetti wrote SIMD extensions to FFTW 2, which eventually
+led to the SIMD support in FFTW 3.
+
+Stefan Kral wrote most of the K7 code generator distributed with FFTW
+3.0.x and 3.1.x.
+
+Andrew Sterian contributed the Windows timing code in FFTW 2.  
+
+Didier Miras reported a bug in the test procedure used in FFTW 1.2.  We
+now use a completely different test algorithm by Funda Ergun that does
+not require a separate FFT program to compare against.
+
+Wolfgang Reimer contributed the Pentium cycle counter and a few fixes
+that help portability.
+
+Ming-Chang Liu uncovered a well-hidden bug in the complex transforms of
+FFTW 2.0 and supplied a patch to correct it.
+
+The FFTW FAQ was written in @code{bfnn} (Bizarre Format With No Name)
+and formatted using the tools developed by Ian Jackson for the Linux
+FAQ.
+
+@emph{We are especially thankful to all of our users for their
+continuing support, feedback, and interest during our development of
+FFTW.}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/cindex.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/cindex.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+@node Concept Index, Library Index, License and Copyright, Top
+@chapter Concept Index
+@printindex cp
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-dft.png
Binary file src/fftw-3.3.3/doc/equation-dft.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-dht.png
Binary file src/fftw-3.3.3/doc/equation-dht.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-idft.png
Binary file src/fftw-3.3.3/doc/equation-idft.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-redft00.png
Binary file src/fftw-3.3.3/doc/equation-redft00.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-redft01.png
Binary file src/fftw-3.3.3/doc/equation-redft01.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-redft10.png
Binary file src/fftw-3.3.3/doc/equation-redft10.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-redft11.png
Binary file src/fftw-3.3.3/doc/equation-redft11.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-rodft00.png
Binary file src/fftw-3.3.3/doc/equation-rodft00.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-rodft01.png
Binary file src/fftw-3.3.3/doc/equation-rodft01.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-rodft10.png
Binary file src/fftw-3.3.3/doc/equation-rodft10.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/equation-rodft11.png
Binary file src/fftw-3.3.3/doc/equation-rodft11.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/f77_wisdom.f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/f77_wisdom.f	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+c     Copyright (c) 2003, 2007-11 Matteo Frigo
+c     Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+c     
+c     This program is free software; you can redistribute it and/or modify
+c     it under the terms of the GNU General Public License as published by
+c     the Free Software Foundation; either version 2 of the License, or
+c     (at your option) any later version.
+c     
+c     This program is distributed in the hope that it will be useful,
+c     but WITHOUT ANY WARRANTY; without even the implied warranty of
+c     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c     GNU General Public License for more details.
+c     
+c     You should have received a copy of the GNU General Public License
+c     along with this program; if not, write to the Free Software
+c     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+c
+cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
+c     
+c     This is an example implementation of Fortran wisdom export/import
+c     to/from a Fortran unit (file), exploiting the generic
+c     dfftw_export_wisdom/dfftw_import_wisdom functions.
+c     
+c     We cannot compile this file into the FFTW library itself, lest all
+c     FFTW-calling programs be required to link to the Fortran I/O
+c     libraries.
+c     
+cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
+
+c     Strictly speaking, the '$' format specifier, which allows us to
+c     write a character without a trailing newline, is not standard F77.
+c     However, it seems to be a nearly universal extension.
+      subroutine write_char(c, iunit)
+      character c
+      integer iunit
+      write(iunit,321) c
+ 321  format(a,$)
+      end      
+
+      subroutine export_wisdom_to_file(iunit)
+      integer iunit
+      external write_char
+      call dfftw_export_wisdom(write_char, iunit)
+      end
+
+c     Fortran 77 does not have any portable way to read an arbitrary
+c     file one character at a time.  The best alternative seems to be to
+c     read a whole line into a buffer, since for fftw-exported wisdom we
+c     can bound the line length.  (If the file contains longer lines,
+c     then the lines will be truncated and the wisdom import should
+c     simply fail.)  Ugh.
+      subroutine read_char(ic, iunit)
+      integer ic
+      integer iunit
+      character*256 buf
+      save buf
+      integer ibuf
+      data ibuf/257/
+      save ibuf
+      if (ibuf .lt. 257) then
+         ic = ichar(buf(ibuf:ibuf))
+         ibuf = ibuf + 1
+         return
+      endif
+      read(iunit,123,end=666) buf
+      ic = ichar(buf(1:1))
+      ibuf = 2
+      return
+ 666  ic = -1
+      ibuf = 257
+ 123  format(a256)
+      end
+      
+      subroutine import_wisdom_from_file(isuccess, iunit)
+      integer isuccess
+      integer iunit
+      external read_char
+      call dfftw_import_wisdom(isuccess, read_char, iunit)
+      end
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/fftw3.info
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/fftw3.info	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,165 @@
+This is fftw3.info, produced by makeinfo version 4.13 from fftw3.texi.
+
+This manual is for FFTW (version 3.3.3, 25 November 2012).
+
+   Copyright (C) 2003 Matteo Frigo.
+
+   Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+
+INFO-DIR-SECTION Texinfo documentation system
+START-INFO-DIR-ENTRY
+* fftw3: (fftw3).	FFTW User's Manual.
+END-INFO-DIR-ENTRY
+
+
+Indirect:
+fftw3.info-1: 1076
+fftw3.info-2: 297310
+
+Tag Table:
+(Indirect)
+Node: Top1076
+Node: Introduction1749
+Node: Tutorial8085
+Ref: Tutorial-Footnote-19329
+Node: Complex One-Dimensional DFTs9423
+Node: Complex Multi-Dimensional DFTs15179
+Ref: Complex Multi-Dimensional DFTs-Footnote-118611
+Node: One-Dimensional DFTs of Real Data18746
+Node: Multi-Dimensional DFTs of Real Data23191
+Node: More DFTs of Real Data27121
+Node: The Halfcomplex-format DFT30623
+Node: Real even/odd DFTs (cosine/sine transforms)33232
+Ref: Real even/odd DFTs (cosine/sine transforms)-Footnote-138842
+Ref: Real even/odd DFTs (cosine/sine transforms)-Footnote-239031
+Node: The Discrete Hartley Transform39964
+Ref: The Discrete Hartley Transform-Footnote-142149
+Node: Other Important Topics42398
+Node: SIMD alignment and fftw_malloc42691
+Node: Multi-dimensional Array Format44951
+Node: Row-major Format45572
+Node: Column-major Format47265
+Node: Fixed-size Arrays in C48349
+Node: Dynamic Arrays in C49785
+Node: Dynamic Arrays in C-The Wrong Way51423
+Node: Words of Wisdom-Saving Plans53171
+Node: Caveats in Using Wisdom55846
+Node: FFTW Reference57934
+Node: Data Types and Files58422
+Node: Complex numbers58854
+Node: Precision60595
+Node: Memory Allocation62157
+Node: Using Plans63728
+Node: Basic Interface67608
+Ref: Basic Interface-Footnote-168352
+Node: Complex DFTs68416
+Node: Planner Flags72383
+Node: Real-data DFTs77740
+Node: Real-data DFT Array Format82736
+Node: Real-to-Real Transforms84991
+Node: Real-to-Real Transform Kinds88961
+Node: Advanced Interface91429
+Node: Advanced Complex DFTs92169
+Node: Advanced Real-data DFTs96428
+Node: Advanced Real-to-real Transforms98755
+Node: Guru Interface99861
+Node: Interleaved and split arrays100784
+Node: Guru vector and transform sizes101827
+Node: Guru Complex DFTs104392
+Node: Guru Real-data DFTs107228
+Node: Guru Real-to-real Transforms110151
+Node: 64-bit Guru Interface111470
+Node: New-array Execute Functions113793
+Node: Wisdom117791
+Node: Wisdom Export118150
+Node: Wisdom Import120124
+Node: Forgetting Wisdom122146
+Node: Wisdom Utilities122518
+Node: What FFTW Really Computes123885
+Node: The 1d Discrete Fourier Transform (DFT)124710
+Node: The 1d Real-data DFT126069
+Node: 1d Real-even DFTs (DCTs)127723
+Node: 1d Real-odd DFTs (DSTs)130932
+Node: 1d Discrete Hartley Transforms (DHTs)133874
+Node: Multi-dimensional Transforms134550
+Node: Multi-threaded FFTW137153
+Node: Installation and Supported Hardware/Software138622
+Node: Usage of Multi-threaded FFTW140447
+Node: How Many Threads to Use?143755
+Node: Thread safety144779
+Node: Distributed-memory FFTW with MPI146947
+Node: FFTW MPI Installation149526
+Node: Linking and Initializing MPI FFTW151318
+Node: 2d MPI example152548
+Node: MPI Data Distribution156784
+Node: Basic and advanced distribution interfaces159662
+Node: Load balancing164097
+Node: Transposed distributions165783
+Node: One-dimensional distributions169555
+Node: Multi-dimensional MPI DFTs of Real Data172124
+Node: Other Multi-dimensional Real-data MPI Transforms176772
+Node: FFTW MPI Transposes178945
+Node: Basic distributed-transpose interface179785
+Node: Advanced distributed-transpose interface181969
+Node: An improved replacement for MPI_Alltoall183257
+Node: FFTW MPI Wisdom185233
+Ref: FFTW MPI Wisdom-Footnote-1187976
+Node: Avoiding MPI Deadlocks188889
+Node: FFTW MPI Performance Tips189918
+Node: Combining MPI and Threads191387
+Node: FFTW MPI Reference194858
+Node: MPI Files and Data Types195437
+Node: MPI Initialization196433
+Node: Using MPI Plans197532
+Node: MPI Data Distribution Functions199358
+Node: MPI Plan Creation204814
+Node: MPI Wisdom Communication215491
+Node: FFTW MPI Fortran Interface216417
+Ref: FFTW MPI Fortran Interface-Footnote-1222446
+Node: Calling FFTW from Modern Fortran222853
+Node: Overview of Fortran interface224204
+Node: Extended and quadruple precision in Fortran227656
+Node: Reversing array dimensions229037
+Node: FFTW Fortran type reference232572
+Node: Plan execution in Fortran237059
+Node: Allocating aligned memory in Fortran239955
+Node: Accessing the wisdom API from Fortran243319
+Node: Wisdom File Export/Import from Fortran244096
+Node: Wisdom String Export/Import from Fortran245758
+Node: Wisdom Generic Export/Import from Fortran247746
+Node: Defining an FFTW module249976
+Node: Calling FFTW from Legacy Fortran251045
+Node: Fortran-interface routines252602
+Ref: Fortran-interface routines-Footnote-1256260
+Ref: Fortran-interface routines-Footnote-2256463
+Node: FFTW Constants in Fortran256596
+Node: FFTW Execution in Fortran257751
+Node: Fortran Examples260507
+Node: Wisdom of Fortran?263926
+Node: Upgrading from FFTW version 2265606
+Ref: Upgrading from FFTW version 2-Footnote-1275229
+Node: Installation and Customization275412
+Node: Installation on Unix277056
+Node: Installation on non-Unix systems285719
+Node: Cycle Counters287934
+Node: Generating your own code289686
+Node: Acknowledgments291721
+Node: License and Copyright295441
+Node: Concept Index297310
+Node: Library Index333952
+
+End Tag Table
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/fftw3.info-1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/fftw3.info-1	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,6280 @@
+This is fftw3.info, produced by makeinfo version 4.13 from fftw3.texi.
+
+This manual is for FFTW (version 3.3.3, 25 November 2012).
+
+   Copyright (C) 2003 Matteo Frigo.
+
+   Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+
+INFO-DIR-SECTION Texinfo documentation system
+START-INFO-DIR-ENTRY
+* fftw3: (fftw3).	FFTW User's Manual.
+END-INFO-DIR-ENTRY
+
+
+File: fftw3.info,  Node: Top,  Next: Introduction,  Prev: (dir),  Up: (dir)
+
+FFTW User Manual
+****************
+
+Welcome to FFTW, the Fastest Fourier Transform in the West.  FFTW is a
+collection of fast C routines to compute the discrete Fourier transform.
+This manual documents FFTW version 3.3.3.
+
+* Menu:
+
+* Introduction::
+* Tutorial::
+* Other Important Topics::
+* FFTW Reference::
+* Multi-threaded FFTW::
+* Distributed-memory FFTW with MPI::
+* Calling FFTW from Modern Fortran::
+* Calling FFTW from Legacy Fortran::
+* Upgrading from FFTW version 2::
+* Installation and Customization::
+* Acknowledgments::
+* License and Copyright::
+* Concept Index::
+* Library Index::
+
+
+File: fftw3.info,  Node: Introduction,  Next: Tutorial,  Prev: Top,  Up: Top
+
+1 Introduction
+**************
+
+This manual documents version 3.3.3 of FFTW, the _Fastest Fourier
+Transform in the West_.  FFTW is a comprehensive collection of fast C
+routines for computing the discrete Fourier transform (DFT) and various
+special cases thereof.  
+   * FFTW computes the DFT of complex data, real data, even-   or
+     odd-symmetric real data (these symmetric transforms are usually
+     known as the discrete cosine or sine transform, respectively), and
+     the   discrete Hartley transform (DHT) of real data.
+
+   * The input data can have arbitrary length.         FFTW employs O(n
+     log n)  algorithms for all lengths, including        prime numbers.
+
+   * FFTW supports arbitrary multi-dimensional data.
+
+   * FFTW supports the SSE, SSE2, AVX, Altivec, and MIPS PS instruction
+           sets.
+
+   * FFTW includes parallel (multi-threaded) transforms        for
+     shared-memory systems.
+
+   * Starting with version 3.3, FFTW includes distributed-memory
+     parallel        transforms using MPI.
+
+   We assume herein that you are familiar with the properties and uses
+of the DFT that are relevant to your application.  Otherwise, see e.g.
+`The Fast Fourier Transform and Its Applications' by E. O. Brigham
+(Prentice-Hall, Englewood Cliffs, NJ, 1988).  Our web page
+(http://www.fftw.org) also has links to FFT-related information online.  
+
+   In order to use FFTW effectively, you need to learn one basic concept
+of FFTW's internal structure: FFTW does not use a fixed algorithm for
+computing the transform, but instead it adapts the DFT algorithm to
+details of the underlying hardware in order to maximize performance.
+Hence, the computation of the transform is split into two phases.
+First, FFTW's "planner" "learns" the fastest way to compute the
+transform on your machine.  The planner produces a data structure
+called a "plan" that contains this information.  Subsequently, the plan
+is "executed" to transform the array of input data as dictated by the
+plan.  The plan can be reused as many times as needed.  In typical
+high-performance applications, many transforms of the same size are
+computed and, consequently, a relatively expensive initialization of
+this sort is acceptable.  On the other hand, if you need a single
+transform of a given size, the one-time cost of the planner becomes
+significant.  For this case, FFTW provides fast planners based on
+heuristics or on previously computed plans.
+
+   FFTW supports transforms of data with arbitrary length, rank,
+multiplicity, and a general memory layout.  In simple cases, however,
+this generality may be unnecessary and confusing.  Consequently, we
+organized the interface to FFTW into three levels of increasing
+generality.
+   * The "basic interface" computes a single       transform of
+     contiguous data.
+
+   * The "advanced interface" computes transforms       of multiple or
+     strided arrays.
+
+   * The "guru interface" supports the most general data       layouts,
+     multiplicities, and strides.
+   We expect that most users will be best served by the basic interface,
+whereas the guru interface requires careful attention to the
+documentation to avoid problems.  
+
+   Besides the automatic performance adaptation performed by the
+planner, it is also possible for advanced users to customize FFTW
+manually.  For example, if code space is a concern, we provide a tool
+that links only the subset of FFTW needed by your application.
+Conversely, you may need to extend FFTW because the standard
+distribution is not sufficient for your needs.  For example, the
+standard FFTW distribution works most efficiently for arrays whose size
+can be factored into small primes (2, 3, 5, and 7), and otherwise it
+uses a slower general-purpose routine.  If you need efficient
+transforms of other sizes, you can use FFTW's code generator, which
+produces fast C programs ("codelets") for any particular array size you
+may care about.  For example, if you need transforms of size 513 = 19 x
+3^3, you can customize FFTW to support the factor 19 efficiently.
+
+   For more information regarding FFTW, see the paper, "The Design and
+Implementation of FFTW3," by M. Frigo and S. G. Johnson, which was an
+invited paper in `Proc. IEEE' 93 (2), p. 216 (2005).  The code
+generator is described in the paper "A fast Fourier transform compiler", by
+M. Frigo, in the `Proceedings of the 1999 ACM SIGPLAN Conference on
+Programming Language Design and Implementation (PLDI), Atlanta,
+Georgia, May 1999'.  These papers, along with the latest version of
+FFTW, the FAQ, benchmarks, and other links, are available at the FFTW
+home page (http://www.fftw.org).
+
+   The current version of FFTW incorporates many good ideas from the
+past thirty years of FFT literature.  In one way or another, FFTW uses
+the Cooley-Tukey algorithm, the prime factor algorithm, Rader's
+algorithm for prime sizes, and a split-radix algorithm (with a
+"conjugate-pair" variation pointed out to us by Dan Bernstein).  FFTW's
+code generator also produces new algorithms that we do not completely
+understand.  The reader is referred to the cited papers for the
+appropriate references.
+
+   The rest of this manual is organized as follows.  We first discuss
+the sequential (single-processor) implementation.  We start by
+describing the basic interface/features of FFTW in *note Tutorial::.
+Next, *note Other Important Topics:: discusses data alignment (*note
+SIMD alignment and fftw_malloc::), the storage scheme of
+multi-dimensional arrays (*note Multi-dimensional Array Format::), and
+FFTW's mechanism for storing plans on disk (*note Words of
+Wisdom-Saving Plans::).  Next, *note FFTW Reference:: provides
+comprehensive documentation of all FFTW's features.  Parallel
+transforms are discussed in their own chapters: *note Multi-threaded
+FFTW:: and *note Distributed-memory FFTW with MPI::.  Fortran
+programmers can also use FFTW, as described in *note Calling FFTW from
+Legacy Fortran:: and *note Calling FFTW from Modern Fortran::.  *note
+Installation and Customization:: explains how to install FFTW in your
+computer system and how to adapt FFTW to your needs.  License and
+copyright information is given in *note License and Copyright::.
+Finally, we thank all the people who helped us in *note
+Acknowledgments::.
+
+
+File: fftw3.info,  Node: Tutorial,  Next: Other Important Topics,  Prev: Introduction,  Up: Top
+
+2 Tutorial
+**********
+
+* Menu:
+
+* Complex One-Dimensional DFTs::
+* Complex Multi-Dimensional DFTs::
+* One-Dimensional DFTs of Real Data::
+* Multi-Dimensional DFTs of Real Data::
+* More DFTs of Real Data::
+
+   This chapter describes the basic usage of FFTW, i.e., how to compute the
+Fourier transform of a single array.  This chapter tells the truth, but
+not the _whole_ truth. Specifically, FFTW implements additional
+routines and flags that are not documented here, although in many cases
+we try to indicate where added capabilities exist.  For more complete
+information, see *note FFTW Reference::.  (Note that you need to
+compile and install FFTW before you can use it in a program.  For the
+details of the installation, see *note Installation and
+Customization::.)
+
+   We recommend that you read this tutorial in order.(1)  At the least,
+read the first section (*note Complex One-Dimensional DFTs::) before
+reading any of the others, even if your main interest lies in one of
+the other transform types.
+
+   Users of FFTW version 2 and earlier may also want to read *note
+Upgrading from FFTW version 2::.
+
+   ---------- Footnotes ----------
+
+   (1) You can read the tutorial in bit-reversed order after computing
+your first transform.
+
+
+File: fftw3.info,  Node: Complex One-Dimensional DFTs,  Next: Complex Multi-Dimensional DFTs,  Prev: Tutorial,  Up: Tutorial
+
+2.1 Complex One-Dimensional DFTs
+================================
+
+     Plan: To bother about the best method of accomplishing an
+     accidental result.  [Ambrose Bierce, `The Enlarged Devil's
+     Dictionary'.]  
+
+   The basic usage of FFTW to compute a one-dimensional DFT of size `N'
+is simple, and it typically looks something like this code:
+
+     #include <fftw3.h>
+     ...
+     {
+         fftw_complex *in, *out;
+         fftw_plan p;
+         ...
+         in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+         out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+         p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+         ...
+         fftw_execute(p); /* repeat as needed */
+         ...
+         fftw_destroy_plan(p);
+         fftw_free(in); fftw_free(out);
+     }
+
+   You must link this code with the `fftw3' library.  On Unix systems,
+link with `-lfftw3 -lm'.
+
+   The example code first allocates the input and output arrays.  You
+can allocate them in any way that you like, but we recommend using
+`fftw_malloc', which behaves like `malloc' except that it properly
+aligns the array when SIMD instructions (such as SSE and Altivec) are
+available (*note SIMD alignment and fftw_malloc::). [Alternatively, we
+provide a convenient wrapper function `fftw_alloc_complex(N)' which has
+the same effect.]  
+
+   The data is an array of type `fftw_complex', which is by default a
+`double[2]' composed of the real (`in[i][0]') and imaginary
+(`in[i][1]') parts of a complex number.  
+
+   The next step is to create a "plan", which is an object that
+contains all the data that FFTW needs to compute the FFT.  This
+function creates the plan:
+
+     fftw_plan fftw_plan_dft_1d(int n, fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+   
+   The first argument, `n', is the size of the transform you are trying
+to compute.  The size `n' can be any positive integer, but sizes that
+are products of small factors are transformed most efficiently
+(although prime sizes still use an O(n log n)  algorithm).
+
+   The next two arguments are pointers to the input and output arrays of
+the transform.  These pointers can be equal, indicating an "in-place"
+transform.  
+
+   The fourth argument, `sign', can be either `FFTW_FORWARD' (`-1') or
+`FFTW_BACKWARD' (`+1'), and indicates the direction of the transform
+you are interested in; technically, it is the sign of the exponent in
+the transform.
+
+   The `flags' argument is usually either `FFTW_MEASURE' or `FFTW_ESTIMATE'.
+`FFTW_MEASURE' instructs FFTW to run and measure the execution time of
+several FFTs in order to find the best way to compute the transform of
+size `n'.  This process takes some time (usually a few seconds),
+depending on your machine and on the size of the transform.
+`FFTW_ESTIMATE', on the contrary, does not run any computation and just
+builds a reasonable plan that is probably sub-optimal.  In short, if
+your program performs many transforms of the same size and
+initialization time is not important, use `FFTW_MEASURE'; otherwise use
+the estimate.
+
+   _You must create the plan before initializing the input_, because
+`FFTW_MEASURE' overwrites the `in'/`out' arrays.  (Technically,
+`FFTW_ESTIMATE' does not touch your arrays, but you should always
+create plans first just to be sure.)
+
+   Once the plan has been created, you can use it as many times as you
+like for transforms on the specified `in'/`out' arrays, computing the
+actual transforms via `fftw_execute(plan)':
+     void fftw_execute(const fftw_plan plan);
+   
+   The DFT results are stored in-order in the array `out', with the
+zero-frequency (DC) component in `out[0]'.  If `in != out', the
+transform is "out-of-place" and the input array `in' is not modified.
+Otherwise, the input array is overwritten with the transform.
+
+   If you want to transform a _different_ array of the same size, you
+can create a new plan with `fftw_plan_dft_1d' and FFTW automatically
+reuses the information from the previous plan, if possible.
+Alternatively, with the "guru" interface you can apply a given plan to
+a different array, if you are careful.  *Note FFTW Reference::.
+
+   When you are done with the plan, you deallocate it by calling
+`fftw_destroy_plan(plan)':
+     void fftw_destroy_plan(fftw_plan plan);
+   If you allocate an array with `fftw_malloc()' you must deallocate it
+with `fftw_free()'.  Do not use `free()' or, heaven forbid, `delete'.  
+
+   FFTW computes an _unnormalized_ DFT.  Thus, computing a forward
+followed by a backward transform (or vice versa) results in the original
+array scaled by `n'.  For the definition of the DFT, see *note What
+FFTW Really Computes::.  
+
+   If you have a C compiler, such as `gcc', that supports the C99
+standard, and you `#include <complex.h>' _before_ `<fftw3.h>', then
+`fftw_complex' is the native double-precision complex type and you can
+manipulate it with ordinary arithmetic.  Otherwise, FFTW defines its
+own complex type, which is bit-compatible with the C99 complex type.
+*Note Complex numbers::.  (The C++ `<complex>' template class may also
+be usable via a typecast.)  
+
+   To use single or long-double precision versions of FFTW, replace the
+`fftw_' prefix by `fftwf_' or `fftwl_' and link with `-lfftw3f' or
+`-lfftw3l', but use the _same_ `<fftw3.h>' header file.  
+
+   Many more flags exist besides `FFTW_MEASURE' and `FFTW_ESTIMATE'.
+For example, use `FFTW_PATIENT' if you're willing to wait even longer
+for a possibly even faster plan (*note FFTW Reference::).  You can also
+save plans for future use, as described by *note Words of Wisdom-Saving
+Plans::.
+
+
+File: fftw3.info,  Node: Complex Multi-Dimensional DFTs,  Next: One-Dimensional DFTs of Real Data,  Prev: Complex One-Dimensional DFTs,  Up: Tutorial
+
+2.2 Complex Multi-Dimensional DFTs
+==================================
+
+Multi-dimensional transforms work much the same way as one-dimensional
+transforms: you allocate arrays of `fftw_complex' (preferably using
+`fftw_malloc'), create an `fftw_plan', execute it as many times as you
+want with `fftw_execute(plan)', and clean up with
+`fftw_destroy_plan(plan)' (and `fftw_free').
+
+   FFTW provides two routines for creating plans for 2d and 3d
+transforms, and one routine for creating plans of arbitrary
+dimensionality.  The 2d and 3d routines have the following signature:
+     fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+   
+   These routines create plans for `n0' by `n1' two-dimensional (2d)
+transforms and `n0' by `n1' by `n2' 3d transforms, respectively.  All
+of these transforms operate on contiguous arrays in the C-standard
+"row-major" order, so that the last dimension has the fastest-varying
+index in the array.  This layout is described further in *note
+Multi-dimensional Array Format::.
+
+   FFTW can also compute transforms of higher dimensionality.  In order
+to avoid confusion between the various meanings of the the word
+"dimension", we use the term _rank_ to denote the number of independent
+indices in an array.(1)  For example, we say that a 2d transform has
+rank 2, a 3d transform has rank 3, and so on.  You can plan transforms
+of arbitrary rank by means of the following function:
+
+     fftw_plan fftw_plan_dft(int rank, const int *n,
+                             fftw_complex *in, fftw_complex *out,
+                             int sign, unsigned flags);
+   
+   Here, `n' is a pointer to an array `n[rank]' denoting an `n[0]' by
+`n[1]' by ... by `n[rank-1]' transform.  Thus, for example, the call
+     fftw_plan_dft_2d(n0, n1, in, out, sign, flags);
+   is equivalent to the following code fragment:
+     int n[2];
+     n[0] = n0;
+     n[1] = n1;
+     fftw_plan_dft(2, n, in, out, sign, flags);
+   `fftw_plan_dft' is not restricted to 2d and 3d transforms, however,
+but it can plan transforms of arbitrary rank.
+
+   You may have noticed that all the planner routines described so far
+have overlapping functionality.  For example, you can plan a 1d or 2d
+transform by using `fftw_plan_dft' with a `rank' of `1' or `2', or even
+by calling `fftw_plan_dft_3d' with `n0' and/or `n1' equal to `1' (with
+no loss in efficiency).  This pattern continues, and FFTW's planning
+routines in general form a "partial order," sequences of interfaces
+with strictly increasing generality but correspondingly greater
+complexity.
+
+   `fftw_plan_dft' is the most general complex-DFT routine that we
+describe in this tutorial, but there are also the advanced and guru
+interfaces, which allow one to efficiently combine multiple/strided
+transforms into a single FFTW plan, transform a subset of a larger
+multi-dimensional array, and/or to handle more general complex-number
+formats.  For more information, see *note FFTW Reference::.
+
+   ---------- Footnotes ----------
+
+   (1) The term "rank" is commonly used in the APL, FORTRAN, and Common
+Lisp traditions, although it is not so common in the C world.
+
+
+File: fftw3.info,  Node: One-Dimensional DFTs of Real Data,  Next: Multi-Dimensional DFTs of Real Data,  Prev: Complex Multi-Dimensional DFTs,  Up: Tutorial
+
+2.3 One-Dimensional DFTs of Real Data
+=====================================
+
+In many practical applications, the input data `in[i]' are purely real
+numbers, in which case the DFT output satisfies the "Hermitian" redundancy:
+`out[i]' is the conjugate of `out[n-i]'.  It is possible to take
+advantage of these circumstances in order to achieve roughly a factor
+of two improvement in both speed and memory usage.
+
+   In exchange for these speed and space advantages, the user sacrifices
+some of the simplicity of FFTW's complex transforms. First of all, the
+input and output arrays are of _different sizes and types_: the input
+is `n' real numbers, while the output is `n/2+1' complex numbers (the
+non-redundant outputs); this also requires slight "padding" of the
+input array for in-place transforms.  Second, the inverse transform
+(complex to real) has the side-effect of _overwriting its input array_,
+by default.  Neither of these inconveniences should pose a serious
+problem for users, but it is important to be aware of them.
+
+   The routines to perform real-data transforms are almost the same as
+those for complex transforms: you allocate arrays of `double' and/or
+`fftw_complex' (preferably using `fftw_malloc' or
+`fftw_alloc_complex'), create an `fftw_plan', execute it as many times
+as you want with `fftw_execute(plan)', and clean up with
+`fftw_destroy_plan(plan)' (and `fftw_free').  The only differences are
+that the input (or output) is of type `double' and there are new
+routines to create the plan.  In one dimension:
+
+     fftw_plan fftw_plan_dft_r2c_1d(int n, double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_1d(int n, fftw_complex *in, double *out,
+                                    unsigned flags);
+   
+   for the real input to complex-Hermitian output ("r2c") and
+complex-Hermitian input to real output ("c2r") transforms.  Unlike the
+complex DFT planner, there is no `sign' argument.  Instead, r2c DFTs
+are always `FFTW_FORWARD' and c2r DFTs are always `FFTW_BACKWARD'.  (For
+single/long-double precision `fftwf' and `fftwl', `double' should be
+replaced by `float' and `long double', respectively.)  
+
+   Here, `n' is the "logical" size of the DFT, not necessarily the
+physical size of the array.  In particular, the real (`double') array
+has `n' elements, while the complex (`fftw_complex') array has `n/2+1'
+elements (where the division is rounded down).  For an in-place
+transform, `in' and `out' are aliased to the same array, which must be
+big enough to hold both; so, the real array would actually have
+`2*(n/2+1)' elements, where the elements beyond the first `n' are
+unused padding.  (Note that this is very different from the concept of
+"zero-padding" a transform to a larger length, which changes the
+logical size of the DFT by actually adding new input data.)  The kth
+element of the complex array is exactly the same as the kth element of
+the corresponding complex DFT.  All positive `n' are supported;
+products of small factors are most efficient, but an O(n log n)
+algorithm is used even for prime sizes.
+
+   As noted above, the c2r transform destroys its input array even for
+out-of-place transforms.  This can be prevented, if necessary, by
+including `FFTW_PRESERVE_INPUT' in the `flags', with unfortunately some
+sacrifice in performance.  This flag is also not currently supported
+for multi-dimensional real DFTs (next section).
+
+   Readers familiar with DFTs of real data will recall that the 0th (the
+"DC") and `n/2'-th (the "Nyquist" frequency, when `n' is even) elements
+of the complex output are purely real.  Some implementations therefore
+store the Nyquist element where the DC imaginary part would go, in
+order to make the input and output arrays the same size.  Such packing,
+however, does not generalize well to multi-dimensional transforms, and
+the space savings are miniscule in any case; FFTW does not support it.
+
+   An alternative interface for one-dimensional r2c and c2r DFTs can be
+found in the `r2r' interface (*note The Halfcomplex-format DFT::), with
+"halfcomplex"-format output that _is_ the same size (and type) as the
+input array.  That interface, although it is not very useful for
+multi-dimensional transforms, may sometimes yield better performance.
+
+
+File: fftw3.info,  Node: Multi-Dimensional DFTs of Real Data,  Next: More DFTs of Real Data,  Prev: One-Dimensional DFTs of Real Data,  Up: Tutorial
+
+2.4 Multi-Dimensional DFTs of Real Data
+=======================================
+
+Multi-dimensional DFTs of real data use the following planner routines:
+
+     fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                                 double *in, fftw_complex *out,
+                                 unsigned flags);
+   
+   as well as the corresponding `c2r' routines with the input/output
+types swapped.  These routines work similarly to their complex
+analogues, except for the fact that here the complex output array is cut
+roughly in half and the real array requires padding for in-place
+transforms (as in 1d, above).
+
+   As before, `n' is the logical size of the array, and the
+consequences of this on the the format of the complex arrays deserve
+careful attention.  Suppose that the real data has dimensions n[0] x
+n[1] x n[2] x ... x n[d-1]  (in row-major order).  Then, after an r2c
+transform, the output is an n[0] x n[1] x n[2] x ... x (n[d-1]/2 + 1)
+array of `fftw_complex' values in row-major order, corresponding to
+slightly over half of the output of the corresponding complex DFT.
+(The division is rounded down.)  The ordering of the data is otherwise
+exactly the same as in the complex-DFT case.
+
+   For out-of-place transforms, this is the end of the story: the real
+data is stored as a row-major array of size n[0] x n[1] x n[2] x ... x
+n[d-1]  and the complex data is stored as a row-major array of size
+n[0] x n[1] x n[2] x ... x (n[d-1]/2 + 1) .
+
+   For in-place transforms, however, extra padding of the real-data
+array is necessary because the complex array is larger than the real
+array, and the two arrays share the same memory locations.  Thus, for
+in-place transforms, the final dimension of the real-data array must be
+padded with extra values to accommodate the size of the complex
+data--two values if the last dimension is even and one if it is odd.  That
+is, the last dimension of the real data must physically contain 2 *
+(n[d-1]/2+1) `double' values (exactly enough to hold the complex data).
+This physical array size does not, however, change the _logical_ array
+size--only n[d-1] values are actually stored in the last dimension, and
+n[d-1] is the last dimension passed to the plan-creation routine.
+
+   For example, consider the transform of a two-dimensional real array
+of size `n0' by `n1'.  The output of the r2c transform is a
+two-dimensional complex array of size `n0' by `n1/2+1', where the `y'
+dimension has been cut nearly in half because of redundancies in the
+output.  Because `fftw_complex' is twice the size of `double', the
+output array is slightly bigger than the input array.  Thus, if we want
+to compute the transform in place, we must _pad_ the input array so
+that it is of size `n0' by `2*(n1/2+1)'.  If `n1' is even, then there
+are two padding elements at the end of each row (which need not be
+initialized, as they are only used for output).
+
+   These transforms are unnormalized, so an r2c followed by a c2r
+transform (or vice versa) will result in the original data scaled by
+the number of real data elements--that is, the product of the (logical)
+dimensions of the real data.  
+
+   (Because the last dimension is treated specially, if it is equal to
+`1' the transform is _not_ equivalent to a lower-dimensional r2c/c2r
+transform.  In that case, the last complex dimension also has size `1'
+(`=1/2+1'), and no advantage is gained over the complex transforms.)
+
+
+File: fftw3.info,  Node: More DFTs of Real Data,  Prev: Multi-Dimensional DFTs of Real Data,  Up: Tutorial
+
+2.5 More DFTs of Real Data
+==========================
+
+* Menu:
+
+* The Halfcomplex-format DFT::
+* Real even/odd DFTs (cosine/sine transforms)::
+* The Discrete Hartley Transform::
+
+   FFTW supports several other transform types via a unified "r2r"
+(real-to-real) interface, so called because it takes a real (`double')
+array and outputs a real array of the same size.  These r2r transforms
+currently fall into three categories: DFTs of real input and
+complex-Hermitian output in halfcomplex format, DFTs of real input with
+even/odd symmetry (a.k.a. discrete cosine/sine transforms, DCTs/DSTs),
+and discrete Hartley transforms (DHTs), all described in more detail by
+the following sections.
+
+   The r2r transforms follow the by now familiar interface of creating
+an `fftw_plan', executing it with `fftw_execute(plan)', and destroying
+it with `fftw_destroy_plan(plan)'.  Furthermore, all r2r transforms
+share the same planner interface:
+
+     fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                                fftw_r2r_kind kind, unsigned flags);
+     fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                                fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                                double *in, double *out,
+                                fftw_r2r_kind kind0,
+                                fftw_r2r_kind kind1,
+                                fftw_r2r_kind kind2,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                             const fftw_r2r_kind *kind, unsigned flags);
+   
+   Just as for the complex DFT, these plan 1d/2d/3d/multi-dimensional
+transforms for contiguous arrays in row-major order, transforming (real)
+input to output of the same size, where `n' specifies the _physical_
+dimensions of the arrays.  All positive `n' are supported (with the
+exception of `n=1' for the `FFTW_REDFT00' kind, noted in the real-even
+subsection below); products of small factors are most efficient
+(factorizing `n-1' and `n+1' for `FFTW_REDFT00' and `FFTW_RODFT00'
+kinds, described below), but an O(n log n)  algorithm is used even for
+prime sizes.
+
+   Each dimension has a "kind" parameter, of type `fftw_r2r_kind',
+specifying the kind of r2r transform to be used for that dimension.  (In
+the case of `fftw_plan_r2r', this is an array `kind[rank]' where
+`kind[i]' is the transform kind for the dimension `n[i]'.)  The kind
+can be one of a set of predefined constants, defined in the following
+subsections.
+
+   In other words, FFTW computes the separable product of the specified
+r2r transforms over each dimension, which can be used e.g. for partial
+differential equations with mixed boundary conditions.  (For some r2r
+kinds, notably the halfcomplex DFT and the DHT, such a separable
+product is somewhat problematic in more than one dimension, however, as
+is described below.)
+
+   In the current version of FFTW, all r2r transforms except for the
+halfcomplex type are computed via pre- or post-processing of
+halfcomplex transforms, and they are therefore not as fast as they
+could be.  Since most other general DCT/DST codes employ a similar
+algorithm, however, FFTW's implementation should provide at least
+competitive performance.
+
+
+File: fftw3.info,  Node: The Halfcomplex-format DFT,  Next: Real even/odd DFTs (cosine/sine transforms),  Prev: More DFTs of Real Data,  Up: More DFTs of Real Data
+
+2.5.1 The Halfcomplex-format DFT
+--------------------------------
+
+An r2r kind of `FFTW_R2HC' ("r2hc") corresponds to an r2c DFT (*note
+One-Dimensional DFTs of Real Data::) but with "halfcomplex" format
+output, and may sometimes be faster and/or more convenient than the
+latter.  The inverse "hc2r" transform is of kind `FFTW_HC2R'.  This
+consists of the non-redundant half of the complex output for a 1d
+real-input DFT of size `n', stored as a sequence of `n' real numbers
+(`double') in the format:
+
+   r0, r1, r2, r(n/2), i((n+1)/2-1), ..., i2, i1
+
+   Here, rk is the real part of the kth output, and ik is the imaginary
+part.  (Division by 2 is rounded down.) For a halfcomplex array
+`hc[n]', the kth component thus has its real part in `hc[k]' and its
+imaginary part in `hc[n-k]', with the exception of `k' `==' `0' or
+`n/2' (the latter only if `n' is even)--in these two cases, the
+imaginary part is zero due to symmetries of the real-input DFT, and is
+not stored.  Thus, the r2hc transform of `n' real values is a
+halfcomplex array of length `n', and vice versa for hc2r.  
+
+   Aside from the differing format, the output of
+`FFTW_R2HC'/`FFTW_HC2R' is otherwise exactly the same as for the
+corresponding 1d r2c/c2r transform (i.e. `FFTW_FORWARD'/`FFTW_BACKWARD'
+transforms, respectively).  Recall that these transforms are
+unnormalized, so r2hc followed by hc2r will result in the original data
+multiplied by `n'.  Furthermore, like the c2r transform, an
+out-of-place hc2r transform will _destroy its input_ array.
+
+   Although these halfcomplex transforms can be used with the
+multi-dimensional r2r interface, the interpretation of such a separable
+product of transforms along each dimension is problematic.  For example,
+consider a two-dimensional `n0' by `n1', r2hc by r2hc transform planned
+by `fftw_plan_r2r_2d(n0, n1, in, out, FFTW_R2HC, FFTW_R2HC,
+FFTW_MEASURE)'.  Conceptually, FFTW first transforms the rows (of size
+`n1') to produce halfcomplex rows, and then transforms the columns (of
+size `n0').  Half of these column transforms, however, are of imaginary
+parts, and should therefore be multiplied by i and combined with the
+r2hc transforms of the real columns to produce the 2d DFT amplitudes;
+FFTW's r2r transform does _not_ perform this combination for you.
+Thus, if a multi-dimensional real-input/output DFT is required, we
+recommend using the ordinary r2c/c2r interface (*note Multi-Dimensional
+DFTs of Real Data::).
+
+
+File: fftw3.info,  Node: Real even/odd DFTs (cosine/sine transforms),  Next: The Discrete Hartley Transform,  Prev: The Halfcomplex-format DFT,  Up: More DFTs of Real Data
+
+2.5.2 Real even/odd DFTs (cosine/sine transforms)
+-------------------------------------------------
+
+The Fourier transform of a real-even function f(-x) = f(x) is
+real-even, and i times the Fourier transform of a real-odd function
+f(-x) = -f(x) is real-odd.  Similar results hold for a discrete Fourier
+transform, and thus for these symmetries the need for complex
+inputs/outputs is entirely eliminated.  Moreover, one gains a factor of
+two in speed/space from the fact that the data are real, and an
+additional factor of two from the even/odd symmetry: only the
+non-redundant (first) half of the array need be stored.  The result is
+the real-even DFT ("REDFT") and the real-odd DFT ("RODFT"), also known
+as the discrete cosine and sine transforms ("DCT" and "DST"),
+respectively.  
+
+   (In this section, we describe the 1d transforms; multi-dimensional
+transforms are just a separable product of these transforms operating
+along each dimension.)
+
+   Because of the discrete sampling, one has an additional choice: is
+the data even/odd around a sampling point, or around the point halfway
+between two samples?  The latter corresponds to _shifting_ the samples
+by _half_ an interval, and gives rise to several transform variants
+denoted by REDFTab and RODFTab: a and b are 0 or 1, and indicate
+whether the input (a) and/or output (b) are shifted by half a sample (1
+means it is shifted).  These are also known as types I-IV of the DCT
+and DST, and all four types are supported by FFTW's r2r interface.(1)
+
+   The r2r kinds for the various REDFT and RODFT types supported by
+FFTW, along with the boundary conditions at both ends of the _input_
+array (`n' real numbers `in[j=0..n-1]'), are:
+
+   * `FFTW_REDFT00' (DCT-I): even around j=0 and even around j=n-1.  
+
+   * `FFTW_REDFT10' (DCT-II, "the" DCT): even around j=-0.5 and even
+     around j=n-0.5.  
+
+   * `FFTW_REDFT01' (DCT-III, "the" IDCT): even around j=0 and odd
+     around j=n.  
+
+   * `FFTW_REDFT11' (DCT-IV): even around j=-0.5 and odd around j=n-0.5.  
+
+   * `FFTW_RODFT00' (DST-I): odd around j=-1 and odd around j=n.  
+
+   * `FFTW_RODFT10' (DST-II): odd around j=-0.5 and odd around j=n-0.5.  
+
+   * `FFTW_RODFT01' (DST-III): odd around j=-1 and even around j=n-1.  
+
+   * `FFTW_RODFT11' (DST-IV): odd around j=-0.5 and even around j=n-0.5.  
+
+
+   Note that these symmetries apply to the "logical" array being
+transformed; *there are no constraints on your physical input data*.
+So, for example, if you specify a size-5 REDFT00 (DCT-I) of the data
+abcde, it corresponds to the DFT of the logical even array abcdedcb of
+size 8.  A size-4 REDFT10 (DCT-II) of the data abcd corresponds to the
+size-8 logical DFT of the even array abcddcba, shifted by half a sample.
+
+   All of these transforms are invertible.  The inverse of R*DFT00 is
+R*DFT00; of R*DFT10 is R*DFT01 and vice versa (these are often called
+simply "the" DCT and IDCT, respectively); and of R*DFT11 is R*DFT11.
+However, the transforms computed by FFTW are unnormalized, exactly like
+the corresponding real and complex DFTs, so computing a transform
+followed by its inverse yields the original array scaled by N, where N
+is the _logical_ DFT size.  For REDFT00, N=2(n-1); for RODFT00,
+N=2(n+1); otherwise, N=2n.  
+
+   Note that the boundary conditions of the transform output array are
+given by the input boundary conditions of the inverse transform.  Thus,
+the above transforms are all inequivalent in terms of input/output
+boundary conditions, even neglecting the 0.5 shift difference.
+
+   FFTW is most efficient when N is a product of small factors; note
+that this _differs_ from the factorization of the physical size `n' for
+REDFT00 and RODFT00!  There is another oddity: `n=1' REDFT00 transforms
+correspond to N=0, and so are _not defined_ (the planner will return
+`NULL').  Otherwise, any positive `n' is supported.
+
+   For the precise mathematical definitions of these transforms as used
+by FFTW, see *note What FFTW Really Computes::.  (For people accustomed
+to the DCT/DST, FFTW's definitions have a coefficient of 2 in front of
+the cos/sin functions so that they correspond precisely to an even/odd
+DFT of size N.  Some authors also include additional multiplicative
+factors of sqrt(2) for selected inputs and outputs; this makes the
+transform orthogonal, but sacrifices the direct equivalence to a
+symmetric DFT.)
+
+Which type do you need?
+.......................
+
+Since the required flavor of even/odd DFT depends upon your problem,
+you are the best judge of this choice, but we can make a few comments
+on relative efficiency to help you in your selection.  In particular,
+R*DFT01 and R*DFT10 tend to be slightly faster than R*DFT11 (especially
+for odd sizes), while the R*DFT00 transforms are sometimes
+significantly slower (especially for even sizes).(2)
+
+   Thus, if only the boundary conditions on the transform inputs are
+specified, we generally recommend R*DFT10 over R*DFT00 and R*DFT01 over
+R*DFT11 (unless the half-sample shift or the self-inverse property is
+significant for your problem).
+
+   If performance is important to you and you are using only small sizes
+(say n<200), e.g. for multi-dimensional transforms, then you might
+consider generating hard-coded transforms of those sizes and types that
+you are interested in (*note Generating your own code::).
+
+   We are interested in hearing what types of symmetric transforms you
+find most useful.
+
+   ---------- Footnotes ----------
+
+   (1) There are also type V-VIII transforms, which correspond to a
+logical DFT of _odd_ size N, independent of whether the physical size
+`n' is odd, but we do not support these variants.
+
+   (2) R*DFT00 is sometimes slower in FFTW because we discovered that
+the standard algorithm for computing this by a pre/post-processed real
+DFT--the algorithm used in FFTPACK, Numerical Recipes, and other
+sources for decades now--has serious numerical problems: it already
+loses several decimal places of accuracy for 16k sizes.  There seem to
+be only two alternatives in the literature that do not suffer
+similarly: a recursive decomposition into smaller DCTs, which would
+require a large set of codelets for efficiency and generality, or
+sacrificing a factor of 2 in speed to use a real DFT of twice the size.
+We currently employ the latter technique for general n, as well as a
+limited form of the former method: a split-radix decomposition when n
+is odd (N a multiple of 4).  For N containing many factors of 2, the
+split-radix method seems to recover most of the speed of the standard
+algorithm without the accuracy tradeoff.
+
+
+File: fftw3.info,  Node: The Discrete Hartley Transform,  Prev: Real even/odd DFTs (cosine/sine transforms),  Up: More DFTs of Real Data
+
+2.5.3 The Discrete Hartley Transform
+------------------------------------
+
+If you are planning to use the DHT because you've heard that it is
+"faster" than the DFT (FFT), *stop here*.  The DHT is not faster than
+the DFT.  That story is an old but enduring misconception that was
+debunked in 1987.
+
+   The discrete Hartley transform (DHT) is an invertible linear
+transform closely related to the DFT.  In the DFT, one multiplies each
+input by cos - i * sin (a complex exponential), whereas in the DHT each
+input is multiplied by simply cos + sin.  Thus, the DHT transforms `n'
+real numbers to `n' real numbers, and has the convenient property of
+being its own inverse.  In FFTW, a DHT (of any positive `n') can be
+specified by an r2r kind of `FFTW_DHT'.  
+
+   Like the DFT, in FFTW the DHT is unnormalized, so computing a DHT of
+size `n' followed by another DHT of the same size will result in the
+original array multiplied by `n'.  
+
+   The DHT was originally proposed as a more efficient alternative to
+the DFT for real data, but it was subsequently shown that a specialized
+DFT (such as FFTW's r2hc or r2c transforms) could be just as fast.  In
+FFTW, the DHT is actually computed by post-processing an r2hc
+transform, so there is ordinarily no reason to prefer it from a
+performance perspective.(1) However, we have heard rumors that the DHT
+might be the most appropriate transform in its own right for certain
+applications, and we would be very interested to hear from anyone who
+finds it useful.
+
+   If `FFTW_DHT' is specified for multiple dimensions of a
+multi-dimensional transform, FFTW computes the separable product of 1d
+DHTs along each dimension.  Unfortunately, this is not quite the same
+thing as a true multi-dimensional DHT; you can compute the latter, if
+necessary, with at most `rank-1' post-processing passes [see e.g. H.
+Hao and R. N. Bracewell, Proc. IEEE 75, 264-266 (1987)].
+
+   For the precise mathematical definition of the DHT as used by FFTW,
+see *note What FFTW Really Computes::.
+
+   ---------- Footnotes ----------
+
+   (1) We provide the DHT mainly as a byproduct of some internal
+algorithms. FFTW computes a real input/output DFT of _prime_ size by
+re-expressing it as a DHT plus post/pre-processing and then using
+Rader's prime-DFT algorithm adapted to the DHT.
+
+
+File: fftw3.info,  Node: Other Important Topics,  Next: FFTW Reference,  Prev: Tutorial,  Up: Top
+
+3 Other Important Topics
+************************
+
+* Menu:
+
+* SIMD alignment and fftw_malloc::
+* Multi-dimensional Array Format::
+* Words of Wisdom-Saving Plans::
+* Caveats in Using Wisdom::
+
+
+File: fftw3.info,  Node: SIMD alignment and fftw_malloc,  Next: Multi-dimensional Array Format,  Prev: Other Important Topics,  Up: Other Important Topics
+
+3.1 SIMD alignment and fftw_malloc
+==================================
+
+SIMD, which stands for "Single Instruction Multiple Data," is a set of
+special operations supported by some processors to perform a single
+operation on several numbers (usually 2 or 4) simultaneously.  SIMD
+floating-point instructions are available on several popular CPUs:
+SSE/SSE2/AVX on recent x86/x86-64 processors, AltiVec (single precision)
+on some PowerPCs (Apple G4 and higher), NEON on some ARM models, and
+MIPS Paired Single (currently only in FFTW 3.2.x).  FFTW can be
+compiled to support the SIMD instructions on any of these systems.  
+
+   A program linking to an FFTW library compiled with SIMD support can
+obtain a nonnegligible speedup for most complex and r2c/c2r transforms.
+In order to obtain this speedup, however, the arrays of complex (or
+real) data passed to FFTW must be specially aligned in memory
+(typically 16-byte aligned), and often this alignment is more stringent
+than that provided by the usual `malloc' (etc.)  allocation routines.
+
+   In order to guarantee proper alignment for SIMD, therefore, in case
+your program is ever linked against a SIMD-using FFTW, we recommend
+allocating your transform data with `fftw_malloc' and de-allocating it
+with `fftw_free'.  These have exactly the same interface and behavior as
+`malloc'/`free', except that for a SIMD FFTW they ensure that the
+returned pointer has the necessary alignment (by calling `memalign' or
+its equivalent on your OS).
+
+   You are not _required_ to use `fftw_malloc'.  You can allocate your
+data in any way that you like, from `malloc' to `new' (in C++) to a
+fixed-size array declaration.  If the array happens not to be properly
+aligned, FFTW will not use the SIMD extensions.  
+
+   Since `fftw_malloc' only ever needs to be used for real and complex
+arrays, we provide two convenient wrapper routines `fftw_alloc_real(N)'
+and `fftw_alloc_complex(N)' that are equivalent to
+`(double*)fftw_malloc(sizeof(double) * N)' and
+`(fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N)', respectively
+(or their equivalents in other precisions).
+
+
+File: fftw3.info,  Node: Multi-dimensional Array Format,  Next: Words of Wisdom-Saving Plans,  Prev: SIMD alignment and fftw_malloc,  Up: Other Important Topics
+
+3.2 Multi-dimensional Array Format
+==================================
+
+This section describes the format in which multi-dimensional arrays are
+stored in FFTW.  We felt that a detailed discussion of this topic was
+necessary.  Since several different formats are common, this topic is
+often a source of confusion.
+
+* Menu:
+
+* Row-major Format::
+* Column-major Format::
+* Fixed-size Arrays in C::
+* Dynamic Arrays in C::
+* Dynamic Arrays in C-The Wrong Way::
+
+
+File: fftw3.info,  Node: Row-major Format,  Next: Column-major Format,  Prev: Multi-dimensional Array Format,  Up: Multi-dimensional Array Format
+
+3.2.1 Row-major Format
+----------------------
+
+The multi-dimensional arrays passed to `fftw_plan_dft' etcetera are
+expected to be stored as a single contiguous block in "row-major" order
+(sometimes called "C order").  Basically, this means that as you step
+through adjacent memory locations, the first dimension's index varies
+most slowly and the last dimension's index varies most quickly.
+
+   To be more explicit, let us consider an array of rank d whose
+dimensions are n[0] x n[1] x n[2] x ... x n[d-1] . Now, we specify a
+location in the array by a sequence of d (zero-based) indices, one for
+each dimension: (i[0], i[1], ..., i[d-1]).  If the array is stored in
+row-major order, then this element is located at the position i[d-1] +
+n[d-1] * (i[d-2] + n[d-2] * (... + n[1] * i[0])).
+
+   Note that, for the ordinary complex DFT, each element of the array
+must be of type `fftw_complex'; i.e. a (real, imaginary) pair of
+(double-precision) numbers.
+
+   In the advanced FFTW interface, the physical dimensions n from which
+the indices are computed can be different from (larger than) the
+logical dimensions of the transform to be computed, in order to
+transform a subset of a larger array.  Note also that, in the advanced
+interface, the expression above is multiplied by a "stride" to get the
+actual array index--this is useful in situations where each element of
+the multi-dimensional array is actually a data structure (or another
+array), and you just want to transform a single field. In the basic
+interface, however, the stride is 1.  
+
+
+File: fftw3.info,  Node: Column-major Format,  Next: Fixed-size Arrays in C,  Prev: Row-major Format,  Up: Multi-dimensional Array Format
+
+3.2.2 Column-major Format
+-------------------------
+
+Readers from the Fortran world are used to arrays stored in
+"column-major" order (sometimes called "Fortran order").  This is
+essentially the exact opposite of row-major order in that, here, the
+_first_ dimension's index varies most quickly.
+
+   If you have an array stored in column-major order and wish to
+transform it using FFTW, it is quite easy to do.  When creating the
+plan, simply pass the dimensions of the array to the planner in
+_reverse order_.  For example, if your array is a rank three `N x M x
+L' matrix in column-major order, you should pass the dimensions of the
+array as if it were an `L x M x N' matrix (which it is, from the
+perspective of FFTW).  This is done for you _automatically_ by the FFTW
+legacy-Fortran interface (*note Calling FFTW from Legacy Fortran::),
+but you must do it manually with the modern Fortran interface (*note
+Reversing array dimensions::).  
+
+
+File: fftw3.info,  Node: Fixed-size Arrays in C,  Next: Dynamic Arrays in C,  Prev: Column-major Format,  Up: Multi-dimensional Array Format
+
+3.2.3 Fixed-size Arrays in C
+----------------------------
+
+A multi-dimensional array whose size is declared at compile time in C
+is _already_ in row-major order.  You don't have to do anything special
+to transform it.  For example:
+
+     {
+          fftw_complex data[N0][N1][N2];
+          fftw_plan plan;
+          ...
+          plan = fftw_plan_dft_3d(N0, N1, N2, &data[0][0][0], &data[0][0][0],
+                                  FFTW_FORWARD, FFTW_ESTIMATE);
+          ...
+     }
+
+   This will plan a 3d in-place transform of size `N0 x N1 x N2'.
+Notice how we took the address of the zero-th element to pass to the
+planner (we could also have used a typecast).
+
+   However, we tend to _discourage_ users from declaring their arrays
+in this way, for two reasons.  First, this allocates the array on the
+stack ("automatic" storage), which has a very limited size on most
+operating systems (declaring an array with more than a few thousand
+elements will often cause a crash).  (You can get around this
+limitation on many systems by declaring the array as `static' and/or
+global, but that has its own drawbacks.)  Second, it may not optimally
+align the array for use with a SIMD FFTW (*note SIMD alignment and
+fftw_malloc::).  Instead, we recommend using `fftw_malloc', as
+described below.
+
+
+File: fftw3.info,  Node: Dynamic Arrays in C,  Next: Dynamic Arrays in C-The Wrong Way,  Prev: Fixed-size Arrays in C,  Up: Multi-dimensional Array Format
+
+3.2.4 Dynamic Arrays in C
+-------------------------
+
+We recommend allocating most arrays dynamically, with `fftw_malloc'.
+This isn't too hard to do, although it is not as straightforward for
+multi-dimensional arrays as it is for one-dimensional arrays.
+
+   Creating the array is simple: using a dynamic-allocation routine like
+`fftw_malloc', allocate an array big enough to store N `fftw_complex'
+values (for a complex DFT), where N is the product of the sizes of the
+array dimensions (i.e. the total number of complex values in the
+array).  For example, here is code to allocate a 5 x 12 x 27  rank-3
+array: 
+
+     fftw_complex *an_array;
+     an_array = (fftw_complex*) fftw_malloc(5*12*27 * sizeof(fftw_complex));
+
+   Accessing the array elements, however, is more tricky--you can't
+simply use multiple applications of the `[]' operator like you could
+for fixed-size arrays.  Instead, you have to explicitly compute the
+offset into the array using the formula given earlier for row-major
+arrays.  For example, to reference the (i,j,k)-th element of the array
+allocated above, you would use the expression `an_array[k + 27 * (j +
+12 * i)]'.
+
+   This pain can be alleviated somewhat by defining appropriate macros,
+or, in C++, creating a class and overloading the `()' operator.  The
+recent C99 standard provides a way to reinterpret the dynamic array as
+a "variable-length" multi-dimensional array amenable to `[]', but this
+feature is not yet widely supported by compilers.  
+
+
+File: fftw3.info,  Node: Dynamic Arrays in C-The Wrong Way,  Prev: Dynamic Arrays in C,  Up: Multi-dimensional Array Format
+
+3.2.5 Dynamic Arrays in C--The Wrong Way
+----------------------------------------
+
+A different method for allocating multi-dimensional arrays in C is
+often suggested that is incompatible with FFTW: _using it will cause
+FFTW to die a painful death_.  We discuss the technique here, however,
+because it is so commonly known and used.  This method is to create
+arrays of pointers of arrays of pointers of ...etcetera.  For example,
+the analogue in this method to the example above is:
+
+     int i,j;
+     fftw_complex ***a_bad_array;  /* another way to make a 5x12x27 array */
+
+     a_bad_array = (fftw_complex ***) malloc(5 * sizeof(fftw_complex **));
+     for (i = 0; i < 5; ++i) {
+          a_bad_array[i] =
+             (fftw_complex **) malloc(12 * sizeof(fftw_complex *));
+          for (j = 0; j < 12; ++j)
+               a_bad_array[i][j] =
+                     (fftw_complex *) malloc(27 * sizeof(fftw_complex));
+     }
+
+   As you can see, this sort of array is inconvenient to allocate (and
+deallocate).  On the other hand, it has the advantage that the
+(i,j,k)-th element can be referenced simply by `a_bad_array[i][j][k]'.
+
+   If you like this technique and want to maximize convenience in
+accessing the array, but still want to pass the array to FFTW, you can
+use a hybrid method.  Allocate the array as one contiguous block, but
+also declare an array of arrays of pointers that point to appropriate
+places in the block.  That sort of trick is beyond the scope of this
+documentation; for more information on multi-dimensional arrays in C,
+see the `comp.lang.c' FAQ (http://c-faq.com/aryptr/dynmuldimary.html).
+
+
+File: fftw3.info,  Node: Words of Wisdom-Saving Plans,  Next: Caveats in Using Wisdom,  Prev: Multi-dimensional Array Format,  Up: Other Important Topics
+
+3.3 Words of Wisdom--Saving Plans
+=================================
+
+FFTW implements a method for saving plans to disk and restoring them.
+In fact, what FFTW does is more general than just saving and loading
+plans.  The mechanism is called "wisdom".  Here, we describe this
+feature at a high level. *Note FFTW Reference::, for a less casual but
+more complete discussion of how to use wisdom in FFTW.
+
+   Plans created with the `FFTW_MEASURE', `FFTW_PATIENT', or
+`FFTW_EXHAUSTIVE' options produce near-optimal FFT performance, but may
+require a long time to compute because FFTW must measure the runtime of
+many possible plans and select the best one.  This setup is designed
+for the situations where so many transforms of the same size must be
+computed that the start-up time is irrelevant.  For short
+initialization times, but slower transforms, we have provided
+`FFTW_ESTIMATE'.  The `wisdom' mechanism is a way to get the best of
+both worlds: you compute a good plan once, save it to disk, and later
+reload it as many times as necessary.  The wisdom mechanism can
+actually save and reload many plans at once, not just one.  
+
+   Whenever you create a plan, the FFTW planner accumulates wisdom,
+which is information sufficient to reconstruct the plan.  After
+planning, you can save this information to disk by means of the
+function:
+     int fftw_export_wisdom_to_filename(const char *filename);
+   (This function returns non-zero on success.)
+
+   The next time you run the program, you can restore the wisdom with
+`fftw_import_wisdom_from_filename' (which also returns non-zero on
+success), and then recreate the plan using the same flags as before.
+     int fftw_import_wisdom_from_filename(const char *filename);
+   
+   Wisdom is automatically used for any size to which it is applicable,
+as long as the planner flags are not more "patient" than those with
+which the wisdom was created.  For example, wisdom created with
+`FFTW_MEASURE' can be used if you later plan with `FFTW_ESTIMATE' or
+`FFTW_MEASURE', but not with `FFTW_PATIENT'.
+
+   The `wisdom' is cumulative, and is stored in a global, private data
+structure managed internally by FFTW.  The storage space required is
+minimal, proportional to the logarithm of the sizes the wisdom was
+generated from.  If memory usage is a concern, however, the wisdom can
+be forgotten and its associated memory freed by calling:
+     void fftw_forget_wisdom(void);
+   
+   Wisdom can be exported to a file, a string, or any other medium.
+For details, see *note Wisdom::.
+
+
+File: fftw3.info,  Node: Caveats in Using Wisdom,  Prev: Words of Wisdom-Saving Plans,  Up: Other Important Topics
+
+3.4 Caveats in Using Wisdom
+===========================
+
+     For in much wisdom is much grief, and he that increaseth knowledge
+     increaseth sorrow.  [Ecclesiastes 1:18] 
+
+   There are pitfalls to using wisdom, in that it can negate FFTW's
+ability to adapt to changing hardware and other conditions. For
+example, it would be perfectly possible to export wisdom from a program
+running on one processor and import it into a program running on
+another processor.  Doing so, however, would mean that the second
+program would use plans optimized for the first processor, instead of
+the one it is running on.
+
+   It should be safe to reuse wisdom as long as the hardware and program
+binaries remain unchanged. (Actually, the optimal plan may change even
+between runs of the same binary on identical hardware, due to
+differences in the virtual memory environment, etcetera.  Users
+seriously interested in performance should worry about this problem,
+too.)  It is likely that, if the same wisdom is used for two different
+program binaries, even running on the same machine, the plans may be
+sub-optimal because of differing code alignments.  It is therefore wise
+to recreate wisdom every time an application is recompiled.  The more
+the underlying hardware and software changes between the creation of
+wisdom and its use, the greater grows the risk of sub-optimal plans.
+
+   Nevertheless, if the choice is between using `FFTW_ESTIMATE' or
+using possibly-suboptimal wisdom (created on the same machine, but for a
+different binary), the wisdom is likely to be better.  For this reason,
+we provide a function to import wisdom from a standard system-wide
+location (`/etc/fftw/wisdom' on Unix): 
+
+     int fftw_import_system_wisdom(void);
+   
+   FFTW also provides a standalone program, `fftw-wisdom' (described by
+its own `man' page on Unix) with which users can create wisdom, e.g.
+for a canonical set of sizes to store in the system wisdom file.  *Note
+Wisdom Utilities::.  
+
+
+File: fftw3.info,  Node: FFTW Reference,  Next: Multi-threaded FFTW,  Prev: Other Important Topics,  Up: Top
+
+4 FFTW Reference
+****************
+
+This chapter provides a complete reference for all sequential (i.e.,
+one-processor) FFTW functions.  Parallel transforms are described in
+later chapters.
+
+* Menu:
+
+* Data Types and Files::
+* Using Plans::
+* Basic Interface::
+* Advanced Interface::
+* Guru Interface::
+* New-array Execute Functions::
+* Wisdom::
+* What FFTW Really Computes::
+
+
+File: fftw3.info,  Node: Data Types and Files,  Next: Using Plans,  Prev: FFTW Reference,  Up: FFTW Reference
+
+4.1 Data Types and Files
+========================
+
+All programs using FFTW should include its header file:
+
+     #include <fftw3.h>
+
+   You must also link to the FFTW library.  On Unix, this means adding
+`-lfftw3 -lm' at the _end_ of the link command.
+
+* Menu:
+
+* Complex numbers::
+* Precision::
+* Memory Allocation::
+
+
+File: fftw3.info,  Node: Complex numbers,  Next: Precision,  Prev: Data Types and Files,  Up: Data Types and Files
+
+4.1.1 Complex numbers
+---------------------
+
+The default FFTW interface uses `double' precision for all
+floating-point numbers, and defines a `fftw_complex' type to hold
+complex numbers as:
+
+     typedef double fftw_complex[2];
+   
+   Here, the `[0]' element holds the real part and the `[1]' element
+holds the imaginary part.
+
+   Alternatively, if you have a C compiler (such as `gcc') that
+supports the C99 revision of the ANSI C standard, you can use C's new
+native complex type (which is binary-compatible with the typedef above).
+In particular, if you `#include <complex.h>' _before_ `<fftw3.h>', then
+`fftw_complex' is defined to be the native complex type and you can
+manipulate it with ordinary arithmetic (e.g. `x = y * (3+4*I)', where
+`x' and `y' are `fftw_complex' and `I' is the standard symbol for the
+imaginary unit); 
+
+   C++ has its own `complex<T>' template class, defined in the standard
+`<complex>' header file.  Reportedly, the C++ standards committee has
+recently agreed to mandate that the storage format used for this type
+be binary-compatible with the C99 type, i.e. an array `T[2]' with
+consecutive real `[0]' and imaginary `[1]' parts.  (See report
+`http://www.open-std.org/jtc1/sc22/WG21/docs/papers/2002/n1388.pdf
+WG21/N1388'.)  Although not part of the official standard as of this
+writing, the proposal stated that: "This solution has been tested with
+all current major implementations of the standard library and shown to
+be working."  To the extent that this is true, if you have a variable
+`complex<double> *x', you can pass it directly to FFTW via
+`reinterpret_cast<fftw_complex*>(x)'.  
+
+
+File: fftw3.info,  Node: Precision,  Next: Memory Allocation,  Prev: Complex numbers,  Up: Data Types and Files
+
+4.1.2 Precision
+---------------
+
+You can install single and long-double precision versions of FFTW,
+which replace `double' with `float' and `long double', respectively
+(*note Installation and Customization::).  To use these interfaces, you:
+
+   * Link to the single/long-double libraries; on Unix, `-lfftw3f' or
+     `-lfftw3l' instead of (or in addition to) `-lfftw3'.  (You can
+     link to the different-precision libraries simultaneously.)
+
+   * Include the _same_ `<fftw3.h>' header file.
+
+   * Replace all lowercase instances of `fftw_' with `fftwf_' or
+     `fftwl_' for single or long-double precision, respectively.
+     (`fftw_complex' becomes `fftwf_complex', `fftw_execute' becomes
+     `fftwf_execute', etcetera.)
+
+   * Uppercase names, i.e. names beginning with `FFTW_', remain the
+     same.
+
+   * Replace `double' with `float' or `long double' for subroutine
+     parameters.
+
+
+   Depending upon your compiler and/or hardware, `long double' may not
+be any more precise than `double' (or may not be supported at all,
+although it is standard in C99).  
+
+   We also support using the nonstandard `__float128'
+quadruple-precision type provided by recent versions of `gcc' on 32-
+and 64-bit x86 hardware (*note Installation and Customization::).  To
+use this type, link with `-lfftw3q -lquadmath -lm' (the `libquadmath'
+library provided by `gcc' is needed for quadruple-precision
+trigonometric functions) and use `fftwq_' identifiers.
+
+
+File: fftw3.info,  Node: Memory Allocation,  Prev: Precision,  Up: Data Types and Files
+
+4.1.3 Memory Allocation
+-----------------------
+
+     void *fftw_malloc(size_t n);
+     void fftw_free(void *p);
+
+   These are functions that behave identically to `malloc' and `free',
+except that they guarantee that the returned pointer obeys any special
+alignment restrictions imposed by any algorithm in FFTW (e.g. for SIMD
+acceleration).  *Note SIMD alignment and fftw_malloc::.  
+
+   Data allocated by `fftw_malloc' _must_ be deallocated by `fftw_free'
+and not by the ordinary `free'.
+
+   These routines simply call through to your operating system's
+`malloc' or, if necessary, its aligned equivalent (e.g. `memalign'), so
+you normally need not worry about any significant time or space
+overhead.  You are _not required_ to use them to allocate your data,
+but we strongly recommend it.
+
+   Note: in C++, just as with ordinary `malloc', you must typecast the
+output of `fftw_malloc' to whatever pointer type you are allocating.  
+
+   We also provide the following two convenience functions to allocate
+real and complex arrays with `n' elements, which are equivalent to
+`(double *) fftw_malloc(sizeof(double) * n)' and `(fftw_complex *)
+fftw_malloc(sizeof(fftw_complex) * n)', respectively:
+
+     double *fftw_alloc_real(size_t n);
+     fftw_complex *fftw_alloc_complex(size_t n);
+   
+   The equivalent functions in other precisions allocate arrays of `n'
+elements in that precision.  e.g. `fftwf_alloc_real(n)' is equivalent
+to `(float *) fftwf_malloc(sizeof(float) * n)'.  
+
+
+File: fftw3.info,  Node: Using Plans,  Next: Basic Interface,  Prev: Data Types and Files,  Up: FFTW Reference
+
+4.2 Using Plans
+===============
+
+Plans for all transform types in FFTW are stored as type `fftw_plan'
+(an opaque pointer type), and are created by one of the various
+planning routines described in the following sections.  An `fftw_plan'
+contains all information necessary to compute the transform, including
+the pointers to the input and output arrays.
+
+     void fftw_execute(const fftw_plan plan);
+   
+   This executes the `plan', to compute the corresponding transform on
+the arrays for which it was planned (which must still exist).  The plan
+is not modified, and `fftw_execute' can be called as many times as
+desired.
+
+   To apply a given plan to a different array, you can use the
+new-array execute interface.  *Note New-array Execute Functions::.
+
+   `fftw_execute' (and equivalents) is the only function in FFTW
+guaranteed to be thread-safe; see *note Thread safety::.
+
+   This function:
+     void fftw_destroy_plan(fftw_plan plan);
+   deallocates the `plan' and all its associated data.
+
+   FFTW's planner saves some other persistent data, such as the
+accumulated wisdom and a list of algorithms available in the current
+configuration.  If you want to deallocate all of that and reset FFTW to
+the pristine state it was in when you started your program, you can
+call:
+
+     void fftw_cleanup(void);
+   
+   After calling `fftw_cleanup', all existing plans become undefined,
+and you should not attempt to execute them nor to destroy them.  You can
+however create and execute/destroy new plans, in which case FFTW starts
+accumulating wisdom information again.
+
+   `fftw_cleanup' does not deallocate your plans, however.  To prevent
+memory leaks, you must still call `fftw_destroy_plan' before executing
+`fftw_cleanup'.
+
+   Occasionally, it may useful to know FFTW's internal "cost" metric
+that it uses to compare plans to one another; this cost is proportional
+to an execution time of the plan, in undocumented units, if the plan
+was created with the `FFTW_MEASURE' or other timing-based options, or
+alternatively is a heuristic cost function for `FFTW_ESTIMATE' plans.
+(The cost values of measured and estimated plans are not comparable,
+being in different units.  Also, costs from different FFTW versions or
+the same version compiled differently may not be in the same units.
+Plans created from wisdom have a cost of 0 since no timing measurement
+is performed for them.  Finally, certain problems for which only one
+top-level algorithm was possible may have required no measurements of
+the cost of the whole plan, in which case `fftw_cost' will also return
+0.)  The cost metric for a given plan is returned by:
+
+     double fftw_cost(const fftw_plan plan);
+   
+   The following two routines are provided purely for academic purposes
+(that is, for entertainment).
+
+     void fftw_flops(const fftw_plan plan,
+                     double *add, double *mul, double *fma);
+   
+   Given a `plan', set `add', `mul', and `fma' to an exact count of the
+number of floating-point additions, multiplications, and fused
+multiply-add operations involved in the plan's execution.  The total
+number of floating-point operations (flops) is `add + mul + 2*fma', or
+`add + mul + fma' if the hardware supports fused multiply-add
+instructions (although the number of FMA operations is only approximate
+because of compiler voodoo).  (The number of operations should be an
+integer, but we use `double' to avoid overflowing `int' for large
+transforms; the arguments are of type `double' even for single and
+long-double precision versions of FFTW.)
+
+     void fftw_fprint_plan(const fftw_plan plan, FILE *output_file);
+     void fftw_print_plan(const fftw_plan plan);
+   
+   This outputs a "nerd-readable" representation of the `plan' to the
+given file or to `stdout', respectively.
+
+
+File: fftw3.info,  Node: Basic Interface,  Next: Advanced Interface,  Prev: Using Plans,  Up: FFTW Reference
+
+4.3 Basic Interface
+===================
+
+Recall that the FFTW API is divided into three parts(1): the "basic
+interface" computes a single transform of contiguous data, the "advanced
+interface" computes transforms of multiple or strided arrays, and the
+"guru interface" supports the most general data layouts,
+multiplicities, and strides.  This section describes the the basic
+interface, which we expect to satisfy the needs of most users.
+
+* Menu:
+
+* Complex DFTs::
+* Planner Flags::
+* Real-data DFTs::
+* Real-data DFT Array Format::
+* Real-to-Real Transforms::
+* Real-to-Real Transform Kinds::
+
+   ---------- Footnotes ----------
+
+   (1) Gallia est omnis divisa in partes tres (Julius Caesar).
+
+
+File: fftw3.info,  Node: Complex DFTs,  Next: Planner Flags,  Prev: Basic Interface,  Up: Basic Interface
+
+4.3.1 Complex DFTs
+------------------
+
+     fftw_plan fftw_plan_dft_1d(int n0,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft(int rank, const int *n,
+                             fftw_complex *in, fftw_complex *out,
+                             int sign, unsigned flags);
+
+   Plan a complex input/output discrete Fourier transform (DFT) in zero
+or more dimensions, returning an `fftw_plan' (*note Using Plans::).
+
+   Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   The planner returns `NULL' if the plan cannot be created.  In the
+standard FFTW distribution, the basic interface is guaranteed to return
+a non-`NULL' plan.  A plan may be `NULL', however, if you are using a
+customized FFTW configuration supporting a restricted set of transforms.
+
+Arguments
+.........
+
+   * `rank' is the rank of the transform (it should be the size of the
+     array `*n'), and can be any non-negative integer.  (*Note Complex
+     Multi-Dimensional DFTs::, for the definition of "rank".)  The
+     `_1d', `_2d', and `_3d' planners correspond to a `rank' of `1',
+     `2', and `3', respectively.  The rank may be zero, which is
+     equivalent to a rank-1 transform of size 1, i.e. a copy of one
+     number from input to output.
+
+   * `n0', `n1', `n2', or `n[0..rank-1]' (as appropriate for each
+     routine) specify the size of the transform dimensions.  They can
+     be any positive integer.
+
+        - Multi-dimensional arrays are stored in row-major order with
+          dimensions: `n0' x `n1'; or `n0' x `n1' x `n2'; or `n[0]' x
+          `n[1]' x ... x `n[rank-1]'.  *Note Multi-dimensional Array
+          Format::.
+
+        - FFTW is best at handling sizes of the form 2^a 3^b 5^c 7^d
+          11^e 13^f, where e+f is either 0 or 1, and the other exponents
+          are arbitrary.  Other sizes are computed by means of a slow,
+          general-purpose algorithm (which nevertheless retains O(n log
+          n)  performance even for prime sizes).  It is possible to
+          customize FFTW for different array sizes; see *note
+          Installation and Customization::.  Transforms whose sizes are
+          powers of 2 are especially fast.
+
+   * `in' and `out' point to the input and output arrays of the
+     transform, which may be the same (yielding an in-place transform).  These
+     arrays are overwritten during planning, unless `FFTW_ESTIMATE' is
+     used in the flags.  (The arrays need not be initialized, but they
+     must be allocated.)
+
+     If `in == out', the transform is "in-place" and the input array is
+     overwritten. If `in != out', the two arrays must not overlap (but
+     FFTW does not check for this condition).
+
+   * `sign' is the sign of the exponent in the formula that defines the
+     Fourier transform.  It can be -1 (= `FFTW_FORWARD') or +1 (=
+     `FFTW_BACKWARD').
+
+   * `flags' is a bitwise OR (`|') of zero or more planner flags, as
+     defined in *note Planner Flags::.
+
+
+   FFTW computes an unnormalized transform: computing a forward
+followed by a backward transform (or vice versa) will result in the
+original data multiplied by the size of the transform (the product of
+the dimensions).  For more information, see *note What FFTW Really
+Computes::.
+
+
+File: fftw3.info,  Node: Planner Flags,  Next: Real-data DFTs,  Prev: Complex DFTs,  Up: Basic Interface
+
+4.3.2 Planner Flags
+-------------------
+
+All of the planner routines in FFTW accept an integer `flags' argument,
+which is a bitwise OR (`|') of zero or more of the flag constants
+defined below.  These flags control the rigor (and time) of the
+planning process, and can also impose (or lift) restrictions on the
+type of transform algorithm that is employed.
+
+   _Important:_ the planner overwrites the input array during planning
+unless a saved plan (*note Wisdom::) is available for that problem, so
+you should initialize your input data after creating the plan.  The
+only exceptions to this are the `FFTW_ESTIMATE' and `FFTW_WISDOM_ONLY'
+flags, as mentioned below.
+
+   In all  cases, if  wisdom is  available for the  given problem  that
+was created  with equal-or-greater  planning rigor,  then the  more
+rigorous wisdom is used.  For example, in `FFTW_ESTIMATE' mode any
+available wisdom is used, whereas  in `FFTW_PATIENT' mode only wisdom
+created in patient or exhaustive mode can be used.  *Note Words of
+Wisdom-Saving Plans::.
+
+Planning-rigor flags
+....................
+
+   * `FFTW_ESTIMATE' specifies that, instead of actual measurements of
+     different algorithms, a simple heuristic is used to pick a
+     (probably sub-optimal) plan quickly.  With this flag, the
+     input/output arrays are not overwritten during planning.
+
+   * `FFTW_MEASURE' tells FFTW to find an optimized plan by actually
+     _computing_ several FFTs and measuring their execution time.
+     Depending on your machine, this can take some time (often a few
+     seconds).  `FFTW_MEASURE' is the default planning option.
+
+   * `FFTW_PATIENT' is like `FFTW_MEASURE', but considers a wider range
+     of algorithms and often produces a "more optimal" plan (especially
+     for large transforms), but at the expense of several times longer
+     planning time (especially for large transforms).
+
+   * `FFTW_EXHAUSTIVE' is like `FFTW_PATIENT', but considers an even
+     wider range of algorithms, including many that we think are
+     unlikely to be fast, to produce the most optimal plan but with a
+     substantially increased planning time.
+
+   * `FFTW_WISDOM_ONLY' is a special planning mode in which the plan is
+     only created if wisdom is available for the given problem, and
+     otherwise a `NULL' plan is returned.  This can be combined with
+     other flags, e.g. `FFTW_WISDOM_ONLY | FFTW_PATIENT' creates a plan
+     only if wisdom is available that was created in `FFTW_PATIENT' or
+     `FFTW_EXHAUSTIVE' mode.  The `FFTW_WISDOM_ONLY' flag is intended
+     for users who need to detect whether wisdom is available; for
+     example, if wisdom is not available one may wish to allocate new
+     arrays for planning so that user data is not overwritten.
+
+
+Algorithm-restriction flags
+...........................
+
+   * `FFTW_DESTROY_INPUT' specifies that an out-of-place transform is
+     allowed to _overwrite its input_ array with arbitrary data; this
+     can sometimes allow more efficient algorithms to be employed.  
+
+   * `FFTW_PRESERVE_INPUT' specifies that an out-of-place transform must
+     _not change its input_ array.  This is ordinarily the _default_,
+     except for c2r and hc2r (i.e. complex-to-real) transforms for
+     which `FFTW_DESTROY_INPUT' is the default.  In the latter cases,
+     passing `FFTW_PRESERVE_INPUT' will attempt to use algorithms that
+     do not destroy the input, at the expense of worse performance; for
+     multi-dimensional c2r transforms, however, no input-preserving
+     algorithms are implemented and the planner will return `NULL' if
+     one is requested.  
+
+   * `FFTW_UNALIGNED' specifies that the algorithm may not impose any
+     unusual alignment requirements on the input/output arrays (i.e. no
+     SIMD may be used).  This flag is normally _not necessary_, since
+     the planner automatically detects misaligned arrays.  The only use
+     for this flag is if you want to use the new-array execute
+     interface to execute a given plan on a different array that may
+     not be aligned like the original.  (Using `fftw_malloc' makes this
+     flag unnecessary even then.)
+
+
+Limiting planning time
+......................
+
+     extern void fftw_set_timelimit(double seconds);
+
+   This function instructs FFTW to spend at most `seconds' seconds
+(approximately) in the planner.  If `seconds == FFTW_NO_TIMELIMIT' (the
+default value, which is negative), then planning time is unbounded.
+Otherwise, FFTW plans with a progressively wider range of algorithms
+until the the given time limit is reached or the given range of
+algorithms is explored, returning the best available plan.  
+
+   For example, specifying `FFTW_PATIENT' first plans in
+`FFTW_ESTIMATE' mode, then in `FFTW_MEASURE' mode, then finally (time
+permitting) in `FFTW_PATIENT'.  If `FFTW_EXHAUSTIVE' is specified
+instead, the planner will further progress to `FFTW_EXHAUSTIVE' mode.
+
+   Note that the `seconds' argument specifies only a rough limit; in
+practice, the planner may use somewhat more time if the time limit is
+reached when the planner is in the middle of an operation that cannot
+be interrupted.  At the very least, the planner will complete planning
+in `FFTW_ESTIMATE' mode (which is thus equivalent to a time limit of 0).
+
+
+File: fftw3.info,  Node: Real-data DFTs,  Next: Real-data DFT Array Format,  Prev: Planner Flags,  Up: Basic Interface
+
+4.3.3 Real-data DFTs
+--------------------
+
+     fftw_plan fftw_plan_dft_r2c_1d(int n0,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                                 double *in, fftw_complex *out,
+                                 unsigned flags);
+
+   Plan a real-input/complex-output discrete Fourier transform (DFT) in
+zero or more dimensions, returning an `fftw_plan' (*note Using Plans::).
+
+   Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   The planner returns `NULL' if the plan cannot be created.  A
+non-`NULL' plan is always returned by the basic interface unless you
+are using a customized FFTW configuration supporting a restricted set
+of transforms, or if you use the `FFTW_PRESERVE_INPUT' flag with a
+multi-dimensional out-of-place c2r transform (see below).
+
+Arguments
+.........
+
+   * `rank' is the rank of the transform (it should be the size of the
+     array `*n'), and can be any non-negative integer.  (*Note Complex
+     Multi-Dimensional DFTs::, for the definition of "rank".)  The
+     `_1d', `_2d', and `_3d' planners correspond to a `rank' of `1',
+     `2', and `3', respectively.  The rank may be zero, which is
+     equivalent to a rank-1 transform of size 1, i.e. a copy of one
+     real number (with zero imaginary part) from input to output.
+
+   * `n0', `n1', `n2', or `n[0..rank-1]', (as appropriate for each
+     routine) specify the size of the transform dimensions.  They can
+     be any positive integer.  This is different in general from the
+     _physical_ array dimensions, which are described in *note
+     Real-data DFT Array Format::.
+
+        - FFTW is best at handling sizes of the form 2^a 3^b 5^c 7^d
+          11^e 13^f, where e+f is either 0 or 1, and the other exponents
+          are arbitrary.  Other sizes are computed by means of a slow,
+          general-purpose algorithm (which nevertheless retains O(n log
+          n)  performance even for prime sizes).  (It is possible to
+          customize FFTW for different array sizes; see *note
+          Installation and Customization::.)  Transforms whose sizes
+          are powers of 2 are especially fast, and it is generally
+          beneficial for the _last_ dimension of an r2c/c2r transform
+          to be _even_.
+
+   * `in' and `out' point to the input and output arrays of the
+     transform, which may be the same (yielding an in-place transform).  These
+     arrays are overwritten during planning, unless `FFTW_ESTIMATE' is
+     used in the flags.  (The arrays need not be initialized, but they
+     must be allocated.)  For an in-place transform, it is important to
+     remember that the real array will require padding, described in
+     *note Real-data DFT Array Format::.  
+
+   * `flags' is a bitwise OR (`|') of zero or more planner flags, as
+     defined in *note Planner Flags::.
+
+
+   The inverse transforms, taking complex input (storing the
+non-redundant half of a logically Hermitian array) to real output, are
+given by:
+
+     fftw_plan fftw_plan_dft_c2r_1d(int n0,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_2d(int n0, int n1,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_3d(int n0, int n1, int n2,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r(int rank, const int *n,
+                                 fftw_complex *in, double *out,
+                                 unsigned flags);
+   
+   The arguments are the same as for the r2c transforms, except that the
+input and output data formats are reversed.
+
+   FFTW computes an unnormalized transform: computing an r2c followed
+by a c2r transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the logical
+dimensions).  An r2c transform produces the same output as a
+`FFTW_FORWARD' complex DFT of the same input, and a c2r transform is
+correspondingly equivalent to `FFTW_BACKWARD'.  For more information,
+see *note What FFTW Really Computes::.
+
+
+File: fftw3.info,  Node: Real-data DFT Array Format,  Next: Real-to-Real Transforms,  Prev: Real-data DFTs,  Up: Basic Interface
+
+4.3.4 Real-data DFT Array Format
+--------------------------------
+
+The output of a DFT of real data (r2c) contains symmetries that, in
+principle, make half of the outputs redundant (*note What FFTW Really
+Computes::).  (Similarly for the input of an inverse c2r transform.)  In
+practice, it is not possible to entirely realize these savings in an
+efficient and understandable format that generalizes to
+multi-dimensional transforms.  Instead, the output of the r2c
+transforms is _slightly_ over half of the output of the corresponding
+complex transform.  We do not "pack" the data in any way, but store it
+as an ordinary array of `fftw_complex' values.  In fact, this data is
+simply a subsection of what would be the array in the corresponding
+complex transform.
+
+   Specifically, for a real transform of d (= `rank') dimensions n[0] x
+n[1] x n[2] x ... x n[d-1] , the complex data is an n[0] x n[1] x n[2]
+x ... x (n[d-1]/2 + 1)  array of `fftw_complex' values in row-major
+order (with the division rounded down).  That is, we only store the
+_lower_ half (non-negative frequencies), plus one element, of the last
+dimension of the data from the ordinary complex transform.  (We could
+have instead taken half of any other dimension, but implementation
+turns out to be simpler if the last, contiguous, dimension is used.)
+
+   For an out-of-place transform, the real data is simply an array with
+physical dimensions n[0] x n[1] x n[2] x ... x n[d-1]  in row-major
+order.
+
+   For an in-place transform, some complications arise since the
+complex data is slightly larger than the real data.  In this case, the
+final dimension of the real data must be _padded_ with extra values to
+accommodate the size of the complex data--two extra if the last
+dimension is even and one if it is odd.  That is, the last dimension of
+the real data must physically contain 2 * (n[d-1]/2+1) `double' values
+(exactly enough to hold the complex data).  This physical array size
+does not, however, change the _logical_ array size--only n[d-1] values
+are actually stored in the last dimension, and n[d-1] is the last
+dimension passed to the planner.
+
+
+File: fftw3.info,  Node: Real-to-Real Transforms,  Next: Real-to-Real Transform Kinds,  Prev: Real-data DFT Array Format,  Up: Basic Interface
+
+4.3.5 Real-to-Real Transforms
+-----------------------------
+
+     fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                                fftw_r2r_kind kind, unsigned flags);
+     fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                                fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                                double *in, double *out,
+                                fftw_r2r_kind kind0,
+                                fftw_r2r_kind kind1,
+                                fftw_r2r_kind kind2,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                             const fftw_r2r_kind *kind, unsigned flags);
+
+   Plan a real input/output (r2r) transform of various kinds in zero or
+more dimensions, returning an `fftw_plan' (*note Using Plans::).
+
+   Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   The planner returns `NULL' if the plan cannot be created.  A
+non-`NULL' plan is always returned by the basic interface unless you
+are using a customized FFTW configuration supporting a restricted set
+of transforms, or for size-1 `FFTW_REDFT00' kinds (which are not
+defined).  
+
+Arguments
+.........
+
+   * `rank' is the dimensionality of the transform (it should be the
+     size of the arrays `*n' and `*kind'), and can be any non-negative
+     integer.  The `_1d', `_2d', and `_3d' planners correspond to a
+     `rank' of `1', `2', and `3', respectively.  A `rank' of zero is
+     equivalent to a copy of one number from input to output.
+
+   * `n', or `n0'/`n1'/`n2', or `n[rank]', respectively, gives the
+     (physical) size of the transform dimensions.  They can be any
+     positive integer.
+
+        - Multi-dimensional arrays are stored in row-major order with
+          dimensions: `n0' x `n1'; or `n0' x `n1' x `n2'; or `n[0]' x
+          `n[1]' x ... x `n[rank-1]'.  *Note Multi-dimensional Array
+          Format::.
+
+        - FFTW is generally best at handling sizes of the form 2^a 3^b
+          5^c 7^d 11^e 13^f, where e+f is either 0 or 1, and the other
+          exponents are arbitrary.  Other sizes are computed by means
+          of a slow, general-purpose algorithm (which nevertheless
+          retains O(n log n)  performance even for prime sizes).  (It
+          is possible to customize FFTW for different array sizes; see
+          *note Installation and Customization::.)  Transforms whose
+          sizes are powers of 2 are especially fast.
+
+        - For a `REDFT00' or `RODFT00' transform kind in a dimension of
+          size n, it is n-1 or n+1, respectively, that should be
+          factorizable in the above form.
+
+   * `in' and `out' point to the input and output arrays of the
+     transform, which may be the same (yielding an in-place transform).  These
+     arrays are overwritten during planning, unless `FFTW_ESTIMATE' is
+     used in the flags.  (The arrays need not be initialized, but they
+     must be allocated.)
+
+   * `kind', or `kind0'/`kind1'/`kind2', or `kind[rank]', is the kind
+     of r2r transform used for the corresponding dimension.  The valid
+     kind constants are described in *note Real-to-Real Transform
+     Kinds::.  In a multi-dimensional transform, what is computed is
+     the separable product formed by taking each transform kind along
+     the corresponding dimension, one dimension after another.
+
+   * `flags' is a bitwise OR (`|') of zero or more planner flags, as
+     defined in *note Planner Flags::.
+
+
+
+File: fftw3.info,  Node: Real-to-Real Transform Kinds,  Prev: Real-to-Real Transforms,  Up: Basic Interface
+
+4.3.6 Real-to-Real Transform Kinds
+----------------------------------
+
+FFTW currently supports 11 different r2r transform kinds, specified by
+one of the constants below.  For the precise definitions of these
+transforms, see *note What FFTW Really Computes::.  For a more
+colloquial introduction to these transform kinds, see *note More DFTs
+of Real Data::.
+
+   For dimension of size `n', there is a corresponding "logical"
+dimension `N' that determines the normalization (and the optimal
+factorization); the formula for `N' is given for each kind below.
+Also, with each transform kind is listed its corrsponding inverse
+transform.  FFTW computes unnormalized transforms: a transform followed
+by its inverse will result in the original data multiplied by `N' (or
+the product of the `N''s for each dimension, in multi-dimensions).  
+
+   * `FFTW_R2HC' computes a real-input DFT with output in "halfcomplex"
+     format, i.e. real and imaginary parts for a transform of size `n'
+     stored as: r0, r1, r2, r(n/2), i((n+1)/2-1), ..., i2, i1 (Logical
+     `N=n', inverse is `FFTW_HC2R'.)
+
+   * `FFTW_HC2R' computes the reverse of `FFTW_R2HC', above.  (Logical
+     `N=n', inverse is `FFTW_R2HC'.)
+
+   * `FFTW_DHT' computes a discrete Hartley transform.  (Logical `N=n',
+     inverse is `FFTW_DHT'.)  
+
+   * `FFTW_REDFT00' computes an REDFT00 transform, i.e. a DCT-I.
+     (Logical `N=2*(n-1)', inverse is `FFTW_REDFT00'.)  
+
+   * `FFTW_REDFT10' computes an REDFT10 transform, i.e. a DCT-II
+     (sometimes called "the" DCT).  (Logical `N=2*n', inverse is
+     `FFTW_REDFT01'.)
+
+   * `FFTW_REDFT01' computes an REDFT01 transform, i.e. a DCT-III
+     (sometimes called "the" IDCT, being the inverse of DCT-II).
+     (Logical `N=2*n', inverse is `FFTW_REDFT=10'.)  
+
+   * `FFTW_REDFT11' computes an REDFT11 transform, i.e. a DCT-IV.
+     (Logical `N=2*n', inverse is `FFTW_REDFT11'.)
+
+   * `FFTW_RODFT00' computes an RODFT00 transform, i.e. a DST-I.
+     (Logical `N=2*(n+1)', inverse is `FFTW_RODFT00'.)  
+
+   * `FFTW_RODFT10' computes an RODFT10 transform, i.e. a DST-II.
+     (Logical `N=2*n', inverse is `FFTW_RODFT01'.)
+
+   * `FFTW_RODFT01' computes an RODFT01 transform, i.e. a DST-III.
+     (Logical `N=2*n', inverse is `FFTW_RODFT=10'.)
+
+   * `FFTW_RODFT11' computes an RODFT11 transform, i.e. a DST-IV.
+     (Logical `N=2*n', inverse is `FFTW_RODFT11'.)
+
+
+
+File: fftw3.info,  Node: Advanced Interface,  Next: Guru Interface,  Prev: Basic Interface,  Up: FFTW Reference
+
+4.4 Advanced Interface
+======================
+
+FFTW's "advanced" interface supplements the basic interface with four
+new planner routines, providing a new level of flexibility: you can plan
+a transform of multiple arrays simultaneously, operate on non-contiguous
+(strided) data, and transform a subset of a larger multi-dimensional
+array.  Other than these additional features, the planner operates in
+the same fashion as in the basic interface, and the resulting
+`fftw_plan' is used in the same way (*note Using Plans::).
+
+* Menu:
+
+* Advanced Complex DFTs::
+* Advanced Real-data DFTs::
+* Advanced Real-to-real Transforms::
+
+
+File: fftw3.info,  Node: Advanced Complex DFTs,  Next: Advanced Real-data DFTs,  Prev: Advanced Interface,  Up: Advanced Interface
+
+4.4.1 Advanced Complex DFTs
+---------------------------
+
+     fftw_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                  fftw_complex *in, const int *inembed,
+                                  int istride, int idist,
+                                  fftw_complex *out, const int *onembed,
+                                  int ostride, int odist,
+                                  int sign, unsigned flags);
+
+   This routine plans multiple multidimensional complex DFTs, and it
+extends the `fftw_plan_dft' routine (*note Complex DFTs::) to compute
+`howmany' transforms, each having rank `rank' and size `n'.  In
+addition, the transform data need not be contiguous, but it may be laid
+out in memory with an arbitrary stride.  To account for these
+possibilities, `fftw_plan_many_dft' adds the new parameters `howmany',
+{`i',`o'}`nembed', {`i',`o'}`stride', and {`i',`o'}`dist'.  The FFTW
+basic interface (*note Complex DFTs::) provides routines specialized
+for ranks 1, 2, and 3, but the advanced interface handles only the
+general-rank case.
+
+   `howmany' is the number of transforms to compute.  The resulting
+plan computes `howmany' transforms, where the input of the `k'-th
+transform is at location `in+k*idist' (in C pointer arithmetic), and
+its output is at location `out+k*odist'.  Plans obtained in this way
+can often be faster than calling FFTW multiple times for the individual
+transforms.  The basic `fftw_plan_dft' interface corresponds to
+`howmany=1' (in which case the `dist' parameters are ignored).  
+
+   Each of the `howmany' transforms has rank `rank' and size `n', as in
+the basic interface.  In addition, the advanced interface allows the
+input and output arrays of each transform to be row-major subarrays of
+larger rank-`rank' arrays, described by `inembed' and `onembed'
+parameters, respectively.  {`i',`o'}`nembed' must be arrays of length
+`rank', and `n' should be elementwise less than or equal to
+{`i',`o'}`nembed'.  Passing `NULL' for an `nembed' parameter is
+equivalent to passing `n' (i.e. same physical and logical dimensions,
+as in the basic interface.)
+
+   The `stride' parameters indicate that the `j'-th element of the
+input or output arrays is located at `j*istride' or `j*ostride',
+respectively.  (For a multi-dimensional array, `j' is the ordinary
+row-major index.)  When combined with the `k'-th transform in a
+`howmany' loop, from above, this means that the (`j',`k')-th element is
+at `j*stride+k*dist'.  (The basic `fftw_plan_dft' interface corresponds
+to a stride of 1.)  
+
+   For in-place transforms, the input and output `stride' and `dist'
+parameters should be the same; otherwise, the planner may return `NULL'.
+
+   Arrays `n', `inembed', and `onembed' are not used after this
+function returns.  You can safely free or reuse them.
+
+   *Examples*: One transform of one 5 by 6 array contiguous in memory:
+        int rank = 2;
+        int n[] = {5, 6};
+        int howmany = 1;
+        int idist = odist = 0; /* unused because howmany = 1 */
+        int istride = ostride = 1; /* array is contiguous in memory */
+        int *inembed = n, *onembed = n;
+
+   Transform of three 5 by 6 arrays, each contiguous in memory, stored
+in memory one after another:
+        int rank = 2;
+        int n[] = {5, 6};
+        int howmany = 3;
+        int idist = odist = n[0]*n[1]; /* = 30, the distance in memory
+                                          between the first element
+                                          of the first array and the
+                                          first element of the second array */
+        int istride = ostride = 1; /* array is contiguous in memory */
+        int *inembed = n, *onembed = n;
+
+   Transform each column of a 2d array with 10 rows and 3 columns:
+        int rank = 1; /* not 2: we are computing 1d transforms */
+        int n[] = {10}; /* 1d transforms of length 10 */
+        int howmany = 3;
+        int idist = odist = 1;
+        int istride = ostride = 3; /* distance between two elements in
+                                      the same column */
+        int *inembed = n, *onembed = n;
+
+
+File: fftw3.info,  Node: Advanced Real-data DFTs,  Next: Advanced Real-to-real Transforms,  Prev: Advanced Complex DFTs,  Up: Advanced Interface
+
+4.4.2 Advanced Real-data DFTs
+-----------------------------
+
+     fftw_plan fftw_plan_many_dft_r2c(int rank, const int *n, int howmany,
+                                      double *in, const int *inembed,
+                                      int istride, int idist,
+                                      fftw_complex *out, const int *onembed,
+                                      int ostride, int odist,
+                                      unsigned flags);
+     fftw_plan fftw_plan_many_dft_c2r(int rank, const int *n, int howmany,
+                                      fftw_complex *in, const int *inembed,
+                                      int istride, int idist,
+                                      double *out, const int *onembed,
+                                      int ostride, int odist,
+                                      unsigned flags);
+
+   Like `fftw_plan_many_dft', these two functions add `howmany',
+`nembed', `stride', and `dist' parameters to the `fftw_plan_dft_r2c'
+and `fftw_plan_dft_c2r' functions, but otherwise behave the same as the
+basic interface.
+
+   The interpretation of `howmany', `stride', and `dist' are the same
+as for `fftw_plan_many_dft', above.  Note that the `stride' and `dist'
+for the real array are in units of `double', and for the complex array
+are in units of `fftw_complex'.
+
+   If an `nembed' parameter is `NULL', it is interpreted as what it
+would be in the basic interface, as described in *note Real-data DFT
+Array Format::.  That is, for the complex array the size is assumed to
+be the same as `n', but with the last dimension cut roughly in half.
+For the real array, the size is assumed to be `n' if the transform is
+out-of-place, or `n' with the last dimension "padded" if the transform
+is in-place.
+
+   If an `nembed' parameter is non-`NULL', it is interpreted as the
+physical size of the corresponding array, in row-major order, just as
+for `fftw_plan_many_dft'.  In this case, each dimension of `nembed'
+should be `>=' what it would be in the basic interface (e.g. the halved
+or padded `n').
+
+   Arrays `n', `inembed', and `onembed' are not used after this
+function returns.  You can safely free or reuse them.
+
+
+File: fftw3.info,  Node: Advanced Real-to-real Transforms,  Prev: Advanced Real-data DFTs,  Up: Advanced Interface
+
+4.4.3 Advanced Real-to-real Transforms
+--------------------------------------
+
+     fftw_plan fftw_plan_many_r2r(int rank, const int *n, int howmany,
+                                  double *in, const int *inembed,
+                                  int istride, int idist,
+                                  double *out, const int *onembed,
+                                  int ostride, int odist,
+                                  const fftw_r2r_kind *kind, unsigned flags);
+
+   Like `fftw_plan_many_dft', this functions adds `howmany', `nembed',
+`stride', and `dist' parameters to the `fftw_plan_r2r' function, but
+otherwise behave the same as the basic interface.  The interpretation
+of those additional parameters are the same as for
+`fftw_plan_many_dft'.  (Of course, the `stride' and `dist' parameters
+are now in units of `double', not `fftw_complex'.)
+
+   Arrays `n', `inembed', `onembed', and `kind' are not used after this
+function returns.  You can safely free or reuse them.
+
+
+File: fftw3.info,  Node: Guru Interface,  Next: New-array Execute Functions,  Prev: Advanced Interface,  Up: FFTW Reference
+
+4.5 Guru Interface
+==================
+
+The "guru" interface to FFTW is intended to expose as much as possible
+of the flexibility in the underlying FFTW architecture.  It allows one
+to compute multi-dimensional "vectors" (loops) of multi-dimensional
+transforms, where each vector/transform dimension has an independent
+size and stride.  One can also use more general complex-number formats,
+e.g. separate real and imaginary arrays.
+
+   For those users who require the flexibility of the guru interface,
+it is important that they pay special attention to the documentation
+lest they shoot themselves in the foot.
+
+* Menu:
+
+* Interleaved and split arrays::
+* Guru vector and transform sizes::
+* Guru Complex DFTs::
+* Guru Real-data DFTs::
+* Guru Real-to-real Transforms::
+* 64-bit Guru Interface::
+
+
+File: fftw3.info,  Node: Interleaved and split arrays,  Next: Guru vector and transform sizes,  Prev: Guru Interface,  Up: Guru Interface
+
+4.5.1 Interleaved and split arrays
+----------------------------------
+
+The guru interface supports two representations of complex numbers,
+which we call the interleaved and the split format.
+
+   The "interleaved" format is the same one used by the basic and
+advanced interfaces, and it is documented in *note Complex numbers::.
+In the interleaved format, you provide pointers to the real part of a
+complex number, and the imaginary part understood to be stored in the
+next memory location.  
+
+   The "split" format allows separate pointers to the real and
+imaginary parts of a complex array.  
+
+   Technically, the interleaved format is redundant, because you can
+always express an interleaved array in terms of a split array with
+appropriate pointers and strides.  On the other hand, the interleaved
+format is simpler to use, and it is common in practice.  Hence, FFTW
+supports it as a special case.
+
+
+File: fftw3.info,  Node: Guru vector and transform sizes,  Next: Guru Complex DFTs,  Prev: Interleaved and split arrays,  Up: Guru Interface
+
+4.5.2 Guru vector and transform sizes
+-------------------------------------
+
+The guru interface introduces one basic new data structure,
+`fftw_iodim', that is used to specify sizes and strides for
+multi-dimensional transforms and vectors:
+
+     typedef struct {
+          int n;
+          int is;
+          int os;
+     } fftw_iodim;
+   
+   Here, `n' is the size of the dimension, and `is' and `os' are the
+strides of that dimension for the input and output arrays.  (The stride
+is the separation of consecutive elements along this dimension.)
+
+   The meaning of the stride parameter depends on the type of the array
+that the stride refers to.  _If the array is interleaved complex,
+strides are expressed in units of complex numbers (`fftw_complex').  If
+the array is split complex or real, strides are expressed in units of
+real numbers (`double')._  This convention is consistent with the usual
+pointer arithmetic in the C language.  An interleaved array is denoted
+by a pointer `p' to `fftw_complex', so that `p+1' points to the next
+complex number.  Split arrays are denoted by pointers to `double', in
+which case pointer arithmetic operates in units of `sizeof(double)'.  
+
+   The guru planner interfaces all take a (`rank', `dims[rank]') pair
+describing the transform size, and a (`howmany_rank',
+`howmany_dims[howmany_rank]') pair describing the "vector" size (a
+multi-dimensional loop of transforms to perform), where `dims' and
+`howmany_dims' are arrays of `fftw_iodim'.
+
+   For example, the `howmany' parameter in the advanced complex-DFT
+interface corresponds to `howmany_rank' = 1, `howmany_dims[0].n' =
+`howmany', `howmany_dims[0].is' = `idist', and `howmany_dims[0].os' =
+`odist'.  (To compute a single transform, you can just use
+`howmany_rank' = 0.)
+
+   A row-major multidimensional array with dimensions `n[rank]' (*note
+Row-major Format::) corresponds to `dims[i].n' = `n[i]' and the
+recurrence `dims[i].is' = `n[i+1] * dims[i+1].is' (similarly for `os').
+The stride of the last (`i=rank-1') dimension is the overall stride of
+the array.  e.g. to be equivalent to the advanced complex-DFT
+interface, you would have `dims[rank-1].is' = `istride' and
+`dims[rank-1].os' = `ostride'.  
+
+   In general, we only guarantee FFTW to return a non-`NULL' plan if
+the vector and transform dimensions correspond to a set of distinct
+indices, and for in-place transforms the input/output strides should be
+the same.
+
+
+File: fftw3.info,  Node: Guru Complex DFTs,  Next: Guru Real-data DFTs,  Prev: Guru vector and transform sizes,  Up: Guru Interface
+
+4.5.3 Guru Complex DFTs
+-----------------------
+
+     fftw_plan fftw_plan_guru_dft(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          fftw_complex *in, fftw_complex *out,
+          int sign, unsigned flags);
+
+     fftw_plan fftw_plan_guru_split_dft(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *ri, double *ii, double *ro, double *io,
+          unsigned flags);
+
+   These two functions plan a complex-data, multi-dimensional DFT for
+the interleaved and split format, respectively.  Transform dimensions
+are given by (`rank', `dims') over a multi-dimensional vector (loop) of
+dimensions (`howmany_rank', `howmany_dims').  `dims' and `howmany_dims'
+should point to `fftw_iodim' arrays of length `rank' and
+`howmany_rank', respectively.
+
+   `flags' is a bitwise OR (`|') of zero or more planner flags, as
+defined in *note Planner Flags::.
+
+   In the `fftw_plan_guru_dft' function, the pointers `in' and `out'
+point to the interleaved input and output arrays, respectively.  The
+sign can be either -1 (= `FFTW_FORWARD') or +1 (= `FFTW_BACKWARD').  If
+the pointers are equal, the transform is in-place.
+
+   In the `fftw_plan_guru_split_dft' function, `ri' and `ii' point to
+the real and imaginary input arrays, and `ro' and `io' point to the
+real and imaginary output arrays.  The input and output pointers may be
+the same, indicating an in-place transform.  For example, for
+`fftw_complex' pointers `in' and `out', the corresponding parameters
+are:
+
+     ri = (double *) in;
+     ii = (double *) in + 1;
+     ro = (double *) out;
+     io = (double *) out + 1;
+
+   Because `fftw_plan_guru_split_dft' accepts split arrays, strides are
+expressed in units of `double'.  For a contiguous `fftw_complex' array,
+the overall stride of the transform should be 2, the distance between
+consecutive real parts or between consecutive imaginary parts; see
+*note Guru vector and transform sizes::.  Note that the dimension
+strides are applied equally to the real and imaginary parts; real and
+imaginary arrays with different strides are not supported.
+
+   There is no `sign' parameter in `fftw_plan_guru_split_dft'.  This
+function always plans for an `FFTW_FORWARD' transform.  To plan for an
+`FFTW_BACKWARD' transform, you can exploit the identity that the
+backwards DFT is equal to the forwards DFT with the real and imaginary
+parts swapped.  For example, in the case of the `fftw_complex' arrays
+above, the `FFTW_BACKWARD' transform is computed by the parameters:
+
+     ri = (double *) in + 1;
+     ii = (double *) in;
+     ro = (double *) out + 1;
+     io = (double *) out;
+
+
+File: fftw3.info,  Node: Guru Real-data DFTs,  Next: Guru Real-to-real Transforms,  Prev: Guru Complex DFTs,  Up: Guru Interface
+
+4.5.4 Guru Real-data DFTs
+-------------------------
+
+     fftw_plan fftw_plan_guru_dft_r2c(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *in, fftw_complex *out,
+          unsigned flags);
+
+     fftw_plan fftw_plan_guru_split_dft_r2c(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *in, double *ro, double *io,
+          unsigned flags);
+
+     fftw_plan fftw_plan_guru_dft_c2r(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          fftw_complex *in, double *out,
+          unsigned flags);
+
+     fftw_plan fftw_plan_guru_split_dft_c2r(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *ri, double *ii, double *out,
+          unsigned flags);
+
+   Plan a real-input (r2c) or real-output (c2r), multi-dimensional DFT
+with transform dimensions given by (`rank', `dims') over a
+multi-dimensional vector (loop) of dimensions (`howmany_rank',
+`howmany_dims').  `dims' and `howmany_dims' should point to
+`fftw_iodim' arrays of length `rank' and `howmany_rank', respectively.
+As for the basic and advanced interfaces, an r2c transform is
+`FFTW_FORWARD' and a c2r transform is `FFTW_BACKWARD'.
+
+   The _last_ dimension of `dims' is interpreted specially: that
+dimension of the real array has size `dims[rank-1].n', but that
+dimension of the complex array has size `dims[rank-1].n/2+1' (division
+rounded down).  The strides, on the other hand, are taken to be exactly
+as specified.  It is up to the user to specify the strides
+appropriately for the peculiar dimensions of the data, and we do not
+guarantee that the planner will succeed (return non-`NULL') for any
+dimensions other than those described in *note Real-data DFT Array
+Format:: and generalized in *note Advanced Real-data DFTs::.  (That is,
+for an in-place transform, each individual dimension should be able to
+operate in place.)  
+
+   `in' and `out' point to the input and output arrays for r2c and c2r
+transforms, respectively.  For split arrays, `ri' and `ii' point to the
+real and imaginary input arrays for a c2r transform, and `ro' and `io'
+point to the real and imaginary output arrays for an r2c transform.
+`in' and `ro' or `ri' and `out' may be the same, indicating an in-place
+transform.   (In-place transforms where `in' and `io' or `ii' and `out'
+are the same are not currently supported.)
+
+   `flags' is a bitwise OR (`|') of zero or more planner flags, as
+defined in *note Planner Flags::.
+
+   In-place transforms of rank greater than 1 are currently only
+supported for interleaved arrays.  For split arrays, the planner will
+return `NULL'.  
+
+
+File: fftw3.info,  Node: Guru Real-to-real Transforms,  Next: 64-bit Guru Interface,  Prev: Guru Real-data DFTs,  Up: Guru Interface
+
+4.5.5 Guru Real-to-real Transforms
+----------------------------------
+
+     fftw_plan fftw_plan_guru_r2r(int rank, const fftw_iodim *dims,
+                                  int howmany_rank,
+                                  const fftw_iodim *howmany_dims,
+                                  double *in, double *out,
+                                  const fftw_r2r_kind *kind,
+                                  unsigned flags);
+
+   Plan a real-to-real (r2r) multi-dimensional `FFTW_FORWARD' transform
+with transform dimensions given by (`rank', `dims') over a
+multi-dimensional vector (loop) of dimensions (`howmany_rank',
+`howmany_dims').  `dims' and `howmany_dims' should point to
+`fftw_iodim' arrays of length `rank' and `howmany_rank', respectively.
+
+   The transform kind of each dimension is given by the `kind'
+parameter, which should point to an array of length `rank'.  Valid
+`fftw_r2r_kind' constants are given in *note Real-to-Real Transform
+Kinds::.
+
+   `in' and `out' point to the real input and output arrays; they may
+be the same, indicating an in-place transform.
+
+   `flags' is a bitwise OR (`|') of zero or more planner flags, as
+defined in *note Planner Flags::.
+
+
+File: fftw3.info,  Node: 64-bit Guru Interface,  Prev: Guru Real-to-real Transforms,  Up: Guru Interface
+
+4.5.6 64-bit Guru Interface
+---------------------------
+
+When compiled in 64-bit mode on a 64-bit architecture (where addresses
+are 64 bits wide), FFTW uses 64-bit quantities internally for all
+transform sizes, strides, and so on--you don't have to do anything
+special to exploit this.  However, in the ordinary FFTW interfaces, you
+specify the transform size by an `int' quantity, which is normally only
+32 bits wide.  This means that, even though FFTW is using 64-bit sizes
+internally, you cannot specify a single transform dimension larger than
+2^31-1 numbers.
+
+   We expect that few users will require transforms larger than this,
+but, for those who do, we provide a 64-bit version of the guru
+interface in which all sizes are specified as integers of type
+`ptrdiff_t' instead of `int'.  (`ptrdiff_t' is a signed integer type
+defined by the C standard to be wide enough to represent address
+differences, and thus must be at least 64 bits wide on a 64-bit
+machine.)  We stress that there is _no performance advantage_ to using
+this interface--the same internal FFTW code is employed regardless--and
+it is only necessary if you want to specify very large transform sizes.  
+
+   In particular, the 64-bit guru interface is a set of planner routines
+that are exactly the same as the guru planner routines, except that
+they are named with `guru64' instead of `guru' and they take arguments
+of type `fftw_iodim64' instead of `fftw_iodim'.  For example, instead
+of `fftw_plan_guru_dft', we have `fftw_plan_guru64_dft'.
+
+     fftw_plan fftw_plan_guru64_dft(
+          int rank, const fftw_iodim64 *dims,
+          int howmany_rank, const fftw_iodim64 *howmany_dims,
+          fftw_complex *in, fftw_complex *out,
+          int sign, unsigned flags);
+   
+   The `fftw_iodim64' type is similar to `fftw_iodim', with the same
+interpretation, except that it uses type `ptrdiff_t' instead of type
+`int'.
+
+     typedef struct {
+          ptrdiff_t n;
+          ptrdiff_t is;
+          ptrdiff_t os;
+     } fftw_iodim64;
+   
+   Every other `fftw_plan_guru' function also has a `fftw_plan_guru64'
+equivalent, but we do not repeat their documentation here since they
+are identical to the 32-bit versions except as noted above.
+
+
+File: fftw3.info,  Node: New-array Execute Functions,  Next: Wisdom,  Prev: Guru Interface,  Up: FFTW Reference
+
+4.6 New-array Execute Functions
+===============================
+
+Normally, one executes a plan for the arrays with which the plan was
+created, by calling `fftw_execute(plan)' as described in *note Using
+Plans::.  However, it is possible for sophisticated users to apply a
+given plan to a _different_ array using the "new-array execute"
+functions detailed below, provided that the following conditions are
+met:
+
+   * The array size, strides, etcetera are the same (since those are
+     set by the plan).
+
+   * The input and output arrays are the same (in-place) or different
+     (out-of-place) if the plan was originally created to be in-place or
+     out-of-place, respectively.
+
+   * For split arrays, the separations between the real and imaginary
+     parts, `ii-ri' and `io-ro', are the same as they were for the
+     input and output arrays when the plan was created.  (This
+     condition is automatically satisfied for interleaved arrays.)
+
+   * The "alignment" of the new input/output arrays is the same as that
+     of the input/output arrays when the plan was created, unless the
+     plan was created with the `FFTW_UNALIGNED' flag.  Here, the
+     alignment is a platform-dependent quantity (for example, it is the
+     address modulo 16 if SSE SIMD instructions are used, but the
+     address modulo 4 for non-SIMD single-precision FFTW on the same
+     machine).  In general, only arrays allocated with `fftw_malloc'
+     are guaranteed to be equally aligned (*note SIMD alignment and
+     fftw_malloc::).
+
+
+   The alignment issue is especially critical, because if you don't use
+`fftw_malloc' then you may have little control over the alignment of
+arrays in memory.  For example, neither the C++ `new' function nor the
+Fortran `allocate' statement provide strong enough guarantees about
+data alignment.  If you don't use `fftw_malloc', therefore, you
+probably have to use `FFTW_UNALIGNED' (which disables most SIMD
+support).  If possible, it is probably better for you to simply create
+multiple plans (creating a new plan is quick once one exists for a
+given size), or better yet re-use the same array for your transforms.
+
+   If you are tempted to use the new-array execute interface because you
+want to transform a known bunch of arrays of the same size, you should
+probably go use the advanced interface instead (*note Advanced
+Interface::)).
+
+   The new-array execute functions are:
+
+     void fftw_execute_dft(
+          const fftw_plan p,
+          fftw_complex *in, fftw_complex *out);
+
+     void fftw_execute_split_dft(
+          const fftw_plan p,
+          double *ri, double *ii, double *ro, double *io);
+
+     void fftw_execute_dft_r2c(
+          const fftw_plan p,
+          double *in, fftw_complex *out);
+
+     void fftw_execute_split_dft_r2c(
+          const fftw_plan p,
+          double *in, double *ro, double *io);
+
+     void fftw_execute_dft_c2r(
+          const fftw_plan p,
+          fftw_complex *in, double *out);
+
+     void fftw_execute_split_dft_c2r(
+          const fftw_plan p,
+          double *ri, double *ii, double *out);
+
+     void fftw_execute_r2r(
+          const fftw_plan p,
+          double *in, double *out);
+   
+   These execute the `plan' to compute the corresponding transform on
+the input/output arrays specified by the subsequent arguments.  The
+input/output array arguments have the same meanings as the ones passed
+to the guru planner routines in the preceding sections.  The `plan' is
+not modified, and these routines can be called as many times as
+desired, or intermixed with calls to the ordinary `fftw_execute'.
+
+   The `plan' _must_ have been created for the transform type
+corresponding to the execute function, e.g. it must be a complex-DFT
+plan for `fftw_execute_dft'.  Any of the planner routines for that
+transform type, from the basic to the guru interface, could have been
+used to create the plan, however.
+
+
+File: fftw3.info,  Node: Wisdom,  Next: What FFTW Really Computes,  Prev: New-array Execute Functions,  Up: FFTW Reference
+
+4.7 Wisdom
+==========
+
+This section documents the FFTW mechanism for saving and restoring
+plans from disk.  This mechanism is called "wisdom".
+
+* Menu:
+
+* Wisdom Export::
+* Wisdom Import::
+* Forgetting Wisdom::
+* Wisdom Utilities::
+
+
+File: fftw3.info,  Node: Wisdom Export,  Next: Wisdom Import,  Prev: Wisdom,  Up: Wisdom
+
+4.7.1 Wisdom Export
+-------------------
+
+     int fftw_export_wisdom_to_filename(const char *filename);
+     void fftw_export_wisdom_to_file(FILE *output_file);
+     char *fftw_export_wisdom_to_string(void);
+     void fftw_export_wisdom(void (*write_char)(char c, void *), void *data);
+
+   These functions allow you to export all currently accumulated wisdom
+in a form from which it can be later imported and restored, even during
+a separate run of the program. (*Note Words of Wisdom-Saving Plans::.)
+The current store of wisdom is not affected by calling any of these
+routines.
+
+   `fftw_export_wisdom' exports the wisdom to any output medium, as
+specified by the callback function `write_char'. `write_char' is a
+`putc'-like function that writes the character `c' to some output; its
+second parameter is the `data' pointer passed to `fftw_export_wisdom'.
+For convenience, the following three "wrapper" routines are provided:
+
+   `fftw_export_wisdom_to_filename' writes wisdom to a file named
+`filename' (which is created or overwritten), returning `1' on success
+and `0' on failure.  A lower-level function, which requires you to open
+and close the file yourself (e.g. if you want to write wisdom to a
+portion of a larger file) is `fftw_export_wisdom_to_file'.  This writes
+the wisdom to the current position in `output_file', which should be
+open with write permission; upon exit, the file remains open and is
+positioned at the end of the wisdom data.
+
+   `fftw_export_wisdom_to_string' returns a pointer to a
+`NULL'-terminated string holding the wisdom data. This string is
+dynamically allocated, and it is the responsibility of the caller to
+deallocate it with `free' when it is no longer needed.
+
+   All of these routines export the wisdom in the same format, which we
+will not document here except to say that it is LISP-like ASCII text
+that is insensitive to white space.
+
+
+File: fftw3.info,  Node: Wisdom Import,  Next: Forgetting Wisdom,  Prev: Wisdom Export,  Up: Wisdom
+
+4.7.2 Wisdom Import
+-------------------
+
+     int fftw_import_system_wisdom(void);
+     int fftw_import_wisdom_from_filename(const char *filename);
+     int fftw_import_wisdom_from_string(const char *input_string);
+     int fftw_import_wisdom(int (*read_char)(void *), void *data);
+
+   These functions import wisdom into a program from data stored by the
+`fftw_export_wisdom' functions above. (*Note Words of Wisdom-Saving
+Plans::.)  The imported wisdom replaces any wisdom already accumulated
+by the running program.
+
+   `fftw_import_wisdom' imports wisdom from any input medium, as
+specified by the callback function `read_char'. `read_char' is a
+`getc'-like function that returns the next character in the input; its
+parameter is the `data' pointer passed to `fftw_import_wisdom'. If the
+end of the input data is reached (which should never happen for valid
+data), `read_char' should return `EOF' (as defined in `<stdio.h>').
+For convenience, the following three "wrapper" routines are provided:
+
+   `fftw_import_wisdom_from_filename' reads wisdom from a file named
+`filename'.  A lower-level function, which requires you to open and
+close the file yourself (e.g. if you want to read wisdom from a portion
+of a larger file) is `fftw_import_wisdom_from_file'. This reads wisdom
+from the current position in `input_file' (which should be open with
+read permission); upon exit, the file remains open, but the position of
+the read pointer is unspecified.
+
+   `fftw_import_wisdom_from_string' reads wisdom from the
+`NULL'-terminated string `input_string'.
+
+   `fftw_import_system_wisdom' reads wisdom from an
+implementation-defined standard file (`/etc/fftw/wisdom' on Unix and
+GNU systems).  
+
+   The return value of these import routines is `1' if the wisdom was
+read successfully and `0' otherwise. Note that, in all of these
+functions, any data in the input stream past the end of the wisdom data
+is simply ignored.
+
+
+File: fftw3.info,  Node: Forgetting Wisdom,  Next: Wisdom Utilities,  Prev: Wisdom Import,  Up: Wisdom
+
+4.7.3 Forgetting Wisdom
+-----------------------
+
+     void fftw_forget_wisdom(void);
+
+   Calling `fftw_forget_wisdom' causes all accumulated `wisdom' to be
+discarded and its associated memory to be freed. (New `wisdom' can
+still be gathered subsequently, however.)
+
+
+File: fftw3.info,  Node: Wisdom Utilities,  Prev: Forgetting Wisdom,  Up: Wisdom
+
+4.7.4 Wisdom Utilities
+----------------------
+
+FFTW includes two standalone utility programs that deal with wisdom.  We
+merely summarize them here, since they come with their own `man' pages
+for Unix and GNU systems (with HTML versions on our web site).
+
+   The first program is `fftw-wisdom' (or `fftwf-wisdom' in single
+precision, etcetera), which can be used to create a wisdom file
+containing plans for any of the transform sizes and types supported by
+FFTW.  It is preferable to create wisdom directly from your executable
+(*note Caveats in Using Wisdom::), but this program is useful for
+creating global wisdom files for `fftw_import_system_wisdom'.  
+
+   The second program is `fftw-wisdom-to-conf', which takes a wisdom
+file as input and produces a "configuration routine" as output.  The
+latter is a C subroutine that you can compile and link into your
+program, replacing a routine of the same name in the FFTW library, that
+determines which parts of FFTW are callable by your program.
+`fftw-wisdom-to-conf' produces a configuration routine that links to
+only those parts of FFTW needed by the saved plans in the wisdom,
+greatly reducing the size of statically linked executables (which should
+only attempt to create plans corresponding to those in the wisdom,
+however).  
+
+
+File: fftw3.info,  Node: What FFTW Really Computes,  Prev: Wisdom,  Up: FFTW Reference
+
+4.8 What FFTW Really Computes
+=============================
+
+In this section, we provide precise mathematical definitions for the
+transforms that FFTW computes.  These transform definitions are fairly
+standard, but some authors follow slightly different conventions for the
+normalization of the transform (the constant factor in front) and the
+sign of the complex exponent.  We begin by presenting the
+one-dimensional (1d) transform definitions, and then give the
+straightforward extension to multi-dimensional transforms.
+
+* Menu:
+
+* The 1d Discrete Fourier Transform (DFT)::
+* The 1d Real-data DFT::
+* 1d Real-even DFTs (DCTs)::
+* 1d Real-odd DFTs (DSTs)::
+* 1d Discrete Hartley Transforms (DHTs)::
+* Multi-dimensional Transforms::
+
+
+File: fftw3.info,  Node: The 1d Discrete Fourier Transform (DFT),  Next: The 1d Real-data DFT,  Prev: What FFTW Really Computes,  Up: What FFTW Really Computes
+
+4.8.1 The 1d Discrete Fourier Transform (DFT)
+---------------------------------------------
+
+The forward (`FFTW_FORWARD') discrete Fourier transform (DFT) of a 1d
+complex array X of size n computes an array Y, where:  Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(-2 pi j k sqrt(-1)/n) .
+   The backward (`FFTW_BACKWARD') DFT computes:  Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(2 pi j k sqrt(-1)/n) .
+   FFTW computes an unnormalized transform, in that there is no
+coefficient in front of the summation in the DFT.  In other words,
+applying the forward and then the backward transform will multiply the
+input by n.
+
+   From above, an `FFTW_FORWARD' transform corresponds to a sign of -1
+in the exponent of the DFT.  Note also that we use the standard
+"in-order" output ordering--the k-th output corresponds to the
+frequency k/n (or k/T, where T is your total sampling period).  For
+those who like to think in terms of positive and negative frequencies,
+this means that the positive frequencies are stored in the first half
+of the output and the negative frequencies are stored in backwards
+order in the second half of the output.  (The frequency -k/n is the
+same as the frequency (n-k)/n.)
+
+
+File: fftw3.info,  Node: The 1d Real-data DFT,  Next: 1d Real-even DFTs (DCTs),  Prev: The 1d Discrete Fourier Transform (DFT),  Up: What FFTW Really Computes
+
+4.8.2 The 1d Real-data DFT
+--------------------------
+
+The real-input (r2c) DFT in FFTW computes the _forward_ transform Y of
+the size `n' real array X, exactly as defined above, i.e.   Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(-2 pi j k sqrt(-1)/n) .
+   This output array Y can easily be shown to possess the "Hermitian"
+symmetry Y[k] = Y[n-k]*, where we take Y to be periodic so that Y[n] =
+Y[0].
+
+   As a result of this symmetry, half of the output Y is redundant
+(being the complex conjugate of the other half), and so the 1d r2c
+transforms only output elements 0...n/2 of Y (n/2+1 complex numbers),
+where the division by 2 is rounded down.
+
+   Moreover, the Hermitian symmetry implies that Y[0] and, if n is
+even, the Y[n/2] element, are purely real.  So, for the `R2HC' r2r
+transform, these elements are not stored in the halfcomplex output
+format.  
+
+   The c2r and `H2RC' r2r transforms compute the backward DFT of the
+_complex_ array X with Hermitian symmetry, stored in the r2c/`R2HC'
+output formats, respectively, where the backward transform is defined
+exactly as for the complex case:  Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(2 pi j k sqrt(-1)/n) .
+   The outputs `Y' of this transform can easily be seen to be purely
+real, and are stored as an array of real numbers.
+
+   Like FFTW's complex DFT, these transforms are unnormalized.  In other
+words, applying the real-to-complex (forward) and then the
+complex-to-real (backward) transform will multiply the input by n.
+
+
+File: fftw3.info,  Node: 1d Real-even DFTs (DCTs),  Next: 1d Real-odd DFTs (DSTs),  Prev: The 1d Real-data DFT,  Up: What FFTW Really Computes
+
+4.8.3 1d Real-even DFTs (DCTs)
+------------------------------
+
+The Real-even symmetry DFTs in FFTW are exactly equivalent to the
+unnormalized forward (and backward) DFTs as defined above, where the
+input array X of length N is purely real and is also "even" symmetry.
+In this case, the output array is likewise real and even symmetry.  
+
+   For the case of `REDFT00', this even symmetry means that X[j] =
+X[N-j], where we take X to be periodic so that X[N] = X[0].  Because of
+this redundancy, only the first n real numbers are actually stored,
+where N = 2(n-1).
+
+   The proper definition of even symmetry for `REDFT10', `REDFT01', and
+`REDFT11' transforms is somewhat more intricate because of the shifts
+by 1/2 of the input and/or output, although the corresponding boundary
+conditions are given in *note Real even/odd DFTs (cosine/sine
+transforms)::.  Because of the even symmetry, however, the sine terms
+in the DFT all cancel and the remaining cosine terms are written
+explicitly below.  This formulation often leads people to call such a
+transform a "discrete cosine transform" (DCT), although it is really
+just a special case of the DFT.  
+
+   In each of the definitions below, we transform a real array X of
+length n to a real array Y of length n:
+
+REDFT00 (DCT-I)
+...............
+
+An `REDFT00' transform (type-I DCT) in FFTW is defined by: Y[k] = X[0]
++ (-1)^k X[n-1] + 2 (sum for j = 1 to n-2 of X[j] cos(pi jk /(n-1))).
+Note that this transform is not defined for n=1.  For n=2, the
+summation term above is dropped as you might expect.
+
+REDFT10 (DCT-II)
+................
+
+An `REDFT10' transform (type-II DCT, sometimes called "the" DCT) in
+FFTW is defined by: Y[k] = 2 (sum for j = 0 to n-1 of X[j] cos(pi
+(j+1/2) k / n)).
+
+REDFT01 (DCT-III)
+.................
+
+An `REDFT01' transform (type-III DCT) in FFTW is defined by: Y[k] =
+X[0] + 2 (sum for j = 1 to n-1 of X[j] cos(pi j (k+1/2) / n)).  In the
+case of n=1, this reduces to Y[0] = X[0].  Up to a scale factor (see
+below), this is the inverse of `REDFT10' ("the" DCT), and so the
+`REDFT01' (DCT-III) is sometimes called the "IDCT".  
+
+REDFT11 (DCT-IV)
+................
+
+An `REDFT11' transform (type-IV DCT) in FFTW is defined by: Y[k] = 2
+(sum for j = 0 to n-1 of X[j] cos(pi (j+1/2) (k+1/2) / n)).
+
+Inverses and Normalization
+..........................
+
+These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of 2 in front of the summations).
+The unnormalized inverse of `REDFT00' is `REDFT00', of `REDFT10' is
+`REDFT01' and vice versa, and of `REDFT11' is `REDFT11'.  Each
+unnormalized inverse results in the original array multiplied by N,
+where N is the _logical_ DFT size.  For `REDFT00', N=2(n-1) (note that
+n=1 is not defined); otherwise, N=2n.  
+
+   In defining the discrete cosine transform, some authors also include
+additional factors of sqrt(2) (or its inverse) multiplying selected
+inputs and/or outputs.  This is a mostly cosmetic change that makes the
+transform orthogonal, but sacrifices the direct equivalence to a
+symmetric DFT.
+
+
+File: fftw3.info,  Node: 1d Real-odd DFTs (DSTs),  Next: 1d Discrete Hartley Transforms (DHTs),  Prev: 1d Real-even DFTs (DCTs),  Up: What FFTW Really Computes
+
+4.8.4 1d Real-odd DFTs (DSTs)
+-----------------------------
+
+The Real-odd symmetry DFTs in FFTW are exactly equivalent to the
+unnormalized forward (and backward) DFTs as defined above, where the
+input array X of length N is purely real and is also "odd" symmetry.  In
+this case, the output is odd symmetry and purely imaginary.  
+
+   For the case of `RODFT00', this odd symmetry means that X[j] =
+-X[N-j], where we take X to be periodic so that X[N] = X[0].  Because
+of this redundancy, only the first n real numbers starting at j=1 are
+actually stored (the j=0 element is zero), where N = 2(n+1).
+
+   The proper definition of odd symmetry for `RODFT10', `RODFT01', and
+`RODFT11' transforms is somewhat more intricate because of the shifts
+by 1/2 of the input and/or output, although the corresponding boundary
+conditions are given in *note Real even/odd DFTs (cosine/sine
+transforms)::.  Because of the odd symmetry, however, the cosine terms
+in the DFT all cancel and the remaining sine terms are written
+explicitly below.  This formulation often leads people to call such a
+transform a "discrete sine transform" (DST), although it is really just
+a special case of the DFT.  
+
+   In each of the definitions below, we transform a real array X of
+length n to a real array Y of length n:
+
+RODFT00 (DST-I)
+...............
+
+An `RODFT00' transform (type-I DST) in FFTW is defined by: Y[k] = 2
+(sum for j = 0 to n-1 of X[j] sin(pi (j+1)(k+1) / (n+1))).
+
+RODFT10 (DST-II)
+................
+
+An `RODFT10' transform (type-II DST) in FFTW is defined by: Y[k] = 2
+(sum for j = 0 to n-1 of X[j] sin(pi (j+1/2) (k+1) / n)).
+
+RODFT01 (DST-III)
+.................
+
+An `RODFT01' transform (type-III DST) in FFTW is defined by: Y[k] =
+(-1)^k X[n-1] + 2 (sum for j = 0 to n-2 of X[j] sin(pi (j+1) (k+1/2) /
+n)).  In the case of n=1, this reduces to Y[0] = X[0].
+
+RODFT11 (DST-IV)
+................
+
+An `RODFT11' transform (type-IV DST) in FFTW is defined by: Y[k] = 2
+(sum for j = 0 to n-1 of X[j] sin(pi (j+1/2) (k+1/2) / n)).
+
+Inverses and Normalization
+..........................
+
+These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of 2 in front of the summations).
+The unnormalized inverse of `RODFT00' is `RODFT00', of `RODFT10' is
+`RODFT01' and vice versa, and of `RODFT11' is `RODFT11'.  Each
+unnormalized inverse results in the original array multiplied by N,
+where N is the _logical_ DFT size.  For `RODFT00', N=2(n+1); otherwise,
+N=2n.  
+
+   In defining the discrete sine transform, some authors also include
+additional factors of sqrt(2) (or its inverse) multiplying selected
+inputs and/or outputs.  This is a mostly cosmetic change that makes the
+transform orthogonal, but sacrifices the direct equivalence to an
+antisymmetric DFT.
+
+
+File: fftw3.info,  Node: 1d Discrete Hartley Transforms (DHTs),  Next: Multi-dimensional Transforms,  Prev: 1d Real-odd DFTs (DSTs),  Up: What FFTW Really Computes
+
+4.8.5 1d Discrete Hartley Transforms (DHTs)
+-------------------------------------------
+
+The discrete Hartley transform (DHT) of a 1d real array X of size n
+computes a real array Y of the same size, where: Y[k] = sum for j = 0 to (n - 1) of X[j] * [cos(2 pi j k / n) + sin(2 pi j k / n)].
+   FFTW computes an unnormalized transform, in that there is no
+coefficient in front of the summation in the DHT.  In other words,
+applying the transform twice (the DHT is its own inverse) will multiply
+the input by n.
+
+
+File: fftw3.info,  Node: Multi-dimensional Transforms,  Prev: 1d Discrete Hartley Transforms (DHTs),  Up: What FFTW Really Computes
+
+4.8.6 Multi-dimensional Transforms
+----------------------------------
+
+The multi-dimensional transforms of FFTW, in general, compute simply the
+separable product of the given 1d transform along each dimension of the
+array.  Since each of these transforms is unnormalized, computing the
+forward followed by the backward/inverse multi-dimensional transform
+will result in the original array scaled by the product of the
+normalization factors for each dimension (e.g. the product of the
+dimension sizes, for a multi-dimensional DFT).
+
+   The definition of FFTW's multi-dimensional DFT of real data (r2c)
+deserves special attention.  In this case, we logically compute the full
+multi-dimensional DFT of the input data; since the input data are purely
+real, the output data have the Hermitian symmetry and therefore only one
+non-redundant half need be stored.  More specifically, for an n[0] x
+n[1] x n[2] x ... x n[d-1]  multi-dimensional real-input DFT, the full
+(logical) complex output array Y[k[0], k[1], ..., k[d-1]] has the
+symmetry: Y[k[0], k[1], ..., k[d-1]] = Y[n[0] - k[0], n[1] - k[1], ...,
+n[d-1] - k[d-1]]* (where each dimension is periodic).  Because of this
+symmetry, we only store the k[d-1] = 0...n[d-1]/2 elements of the
+_last_ dimension (division by 2 is rounded down).  (We could instead
+have cut any other dimension in half, but the last dimension proved
+computationally convenient.)  This results in the peculiar array format
+described in more detail by *note Real-data DFT Array Format::.
+
+   The multi-dimensional c2r transform is simply the unnormalized
+inverse of the r2c transform.  i.e. it is the same as FFTW's complex
+backward multi-dimensional DFT, operating on a Hermitian input array in
+the peculiar format mentioned above and outputting a real array (since
+the DFT output is purely real).
+
+   We should remind the user that the separable product of 1d transforms
+along each dimension, as computed by FFTW, is not always the same thing
+as the usual multi-dimensional transform.  A multi-dimensional `R2HC'
+(or `HC2R') transform is not identical to the multi-dimensional DFT,
+requiring some post-processing to combine the requisite real and
+imaginary parts, as was described in *note The Halfcomplex-format
+DFT::.  Likewise, FFTW's multidimensional `FFTW_DHT' r2r transform is
+not the same thing as the logical multi-dimensional discrete Hartley
+transform defined in the literature, as discussed in *note The Discrete
+Hartley Transform::.
+
+
+File: fftw3.info,  Node: Multi-threaded FFTW,  Next: Distributed-memory FFTW with MPI,  Prev: FFTW Reference,  Up: Top
+
+5 Multi-threaded FFTW
+*********************
+
+In this chapter we document the parallel FFTW routines for
+shared-memory parallel hardware.  These routines, which support
+parallel one- and multi-dimensional transforms of both real and complex
+data, are the easiest way to take advantage of multiple processors with
+FFTW.  They work just like the corresponding uniprocessor transform
+routines, except that you have an extra initialization routine to call,
+and there is a routine to set the number of threads to employ.  Any
+program that uses the uniprocessor FFTW can therefore be trivially
+modified to use the multi-threaded FFTW.
+
+   A shared-memory machine is one in which all CPUs can directly access
+the same main memory, and such machines are now common due to the
+ubiquity of multi-core CPUs.  FFTW's multi-threading support allows you
+to utilize these additional CPUs transparently from a single program.
+However, this does not necessarily translate into performance
+gains--when multiple threads/CPUs are employed, there is an overhead
+required for synchronization that may outweigh the computatational
+parallelism.  Therefore, you can only benefit from threads if your
+problem is sufficiently large.  
+
+* Menu:
+
+* Installation and Supported Hardware/Software::
+* Usage of Multi-threaded FFTW::
+* How Many Threads to Use?::
+* Thread safety::
+
+
+File: fftw3.info,  Node: Installation and Supported Hardware/Software,  Next: Usage of Multi-threaded FFTW,  Prev: Multi-threaded FFTW,  Up: Multi-threaded FFTW
+
+5.1 Installation and Supported Hardware/Software
+================================================
+
+All of the FFTW threads code is located in the `threads' subdirectory
+of the FFTW package.  On Unix systems, the FFTW threads libraries and
+header files can be automatically configured, compiled, and installed
+along with the uniprocessor FFTW libraries simply by including
+`--enable-threads' in the flags to the `configure' script (*note
+Installation on Unix::), or `--enable-openmp' to use OpenMP
+(http://www.openmp.org) threads.  
+
+   The threads routines require your operating system to have some sort
+of shared-memory threads support.  Specifically, the FFTW threads
+package works with POSIX threads (available on most Unix variants, from
+GNU/Linux to MacOS X) and Win32 threads.  OpenMP threads, which are
+supported in many common compilers (e.g. gcc) are also supported, and
+may give better performance on some systems.  (OpenMP threads are also
+useful if you are employing OpenMP in your own code, in order to
+minimize conflicts between threading models.)  If you have a
+shared-memory machine that uses a different threads API, it should be a
+simple matter of programming to include support for it; see the file
+`threads/threads.c' for more detail.
+
+   You can compile FFTW with _both_ `--enable-threads' and
+`--enable-openmp' at the same time, since they install libraries with
+different names (`fftw3_threads' and `fftw3_omp', as described below).
+However, your programs may only link to _one_ of these two libraries at
+a time.
+
+   Ideally, of course, you should also have multiple processors in
+order to get any benefit from the threaded transforms.
+
+
+File: fftw3.info,  Node: Usage of Multi-threaded FFTW,  Next: How Many Threads to Use?,  Prev: Installation and Supported Hardware/Software,  Up: Multi-threaded FFTW
+
+5.2 Usage of Multi-threaded FFTW
+================================
+
+Here, it is assumed that the reader is already familiar with the usage
+of the uniprocessor FFTW routines, described elsewhere in this manual.
+We only describe what one has to change in order to use the
+multi-threaded routines.
+
+   First, programs using the parallel complex transforms should be
+linked with `-lfftw3_threads -lfftw3 -lm' on Unix, or `-lfftw3_omp
+-lfftw3 -lm' if you compiled with OpenMP. You will also need to link
+with whatever library is responsible for threads on your system (e.g.
+`-lpthread' on GNU/Linux) or include whatever compiler flag enables
+OpenMP (e.g. `-fopenmp' with gcc).  
+
+   Second, before calling _any_ FFTW routines, you should call the
+function:
+
+     int fftw_init_threads(void);
+   
+   This function, which need only be called once, performs any one-time
+initialization required to use threads on your system.  It returns zero
+if there was some error (which should not happen under normal
+circumstances) and a non-zero value otherwise.
+
+   Third, before creating a plan that you want to parallelize, you
+should call:
+
+     void fftw_plan_with_nthreads(int nthreads);
+   
+   The `nthreads' argument indicates the number of threads you want
+FFTW to use (or actually, the maximum number).  All plans subsequently
+created with any planner routine will use that many threads.  You can
+call `fftw_plan_with_nthreads', create some plans, call
+`fftw_plan_with_nthreads' again with a different argument, and create
+some more plans for a new number of threads.  Plans already created
+before a call to `fftw_plan_with_nthreads' are unaffected.  If you pass
+an `nthreads' argument of `1' (the default), threads are disabled for
+subsequent plans.
+
+   With OpenMP, to configure FFTW to use all of the currently running
+OpenMP threads (set by `omp_set_num_threads(nthreads)' or by the
+`OMP_NUM_THREADS' environment variable), you can do:
+`fftw_plan_with_nthreads(omp_get_max_threads())'. (The `omp_' OpenMP
+functions are declared via `#include <omp.h>'.)
+
+   Given a plan, you then execute it as usual with
+`fftw_execute(plan)', and the execution will use the number of threads
+specified when the plan was created.  When done, you destroy it as
+usual with `fftw_destroy_plan'.  As described in *note Thread safety::,
+plan _execution_ is thread-safe, but plan creation and destruction are
+_not_: you should create/destroy plans only from a single thread, but
+can safely execute multiple plans in parallel.
+
+   There is one additional routine: if you want to get rid of all memory
+and other resources allocated internally by FFTW, you can call:
+
+     void fftw_cleanup_threads(void);
+   
+   which is much like the `fftw_cleanup()' function except that it also
+gets rid of threads-related data.  You must _not_ execute any
+previously created plans after calling this function.
+
+   We should also mention one other restriction: if you save wisdom
+from a program using the multi-threaded FFTW, that wisdom _cannot be
+used_ by a program using only the single-threaded FFTW (i.e. not calling
+`fftw_init_threads').  *Note Words of Wisdom-Saving Plans::.
+
+
+File: fftw3.info,  Node: How Many Threads to Use?,  Next: Thread safety,  Prev: Usage of Multi-threaded FFTW,  Up: Multi-threaded FFTW
+
+5.3 How Many Threads to Use?
+============================
+
+There is a fair amount of overhead involved in synchronizing threads,
+so the optimal number of threads to use depends upon the size of the
+transform as well as on the number of processors you have.
+
+   As a general rule, you don't want to use more threads than you have
+processors.  (Using more threads will work, but there will be extra
+overhead with no benefit.)  In fact, if the problem size is too small,
+you may want to use fewer threads than you have processors.
+
+   You will have to experiment with your system to see what level of
+parallelization is best for your problem size.  Typically, the problem
+will have to involve at least a few thousand data points before threads
+become beneficial.  If you plan with `FFTW_PATIENT', it will
+automatically disable threads for sizes that don't benefit from
+parallelization.  
+
+
+File: fftw3.info,  Node: Thread safety,  Prev: How Many Threads to Use?,  Up: Multi-threaded FFTW
+
+5.4 Thread safety
+=================
+
+Users writing multi-threaded programs (including OpenMP) must concern
+themselves with the "thread safety" of the libraries they use--that is,
+whether it is safe to call routines in parallel from multiple threads.
+FFTW can be used in such an environment, but some care must be taken
+because the planner routines share data (e.g. wisdom and trigonometric
+tables) between calls and plans.
+
+   The upshot is that the only thread-safe (re-entrant) routine in FFTW
+is `fftw_execute' (and the new-array variants thereof).  All other
+routines (e.g. the planner) should only be called from one thread at a
+time.  So, for example, you can wrap a semaphore lock around any calls
+to the planner; even more simply, you can just create all of your plans
+from one thread.  We do not think this should be an important
+restriction (FFTW is designed for the situation where the only
+performance-sensitive code is the actual execution of the transform),
+and the benefits of shared data between plans are great.
+
+   Note also that, since the plan is not modified by `fftw_execute', it
+is safe to execute the _same plan_ in parallel by multiple threads.
+However, since a given plan operates by default on a fixed array, you
+need to use one of the new-array execute functions (*note New-array
+Execute Functions::) so that different threads compute the transform of
+different data.
+
+   (Users should note that these comments only apply to programs using
+shared-memory threads or OpenMP.  Parallelism using MPI or forked
+processes involves a separate address-space and global variables for
+each process, and is not susceptible to problems of this sort.)
+
+   If you are configured FFTW with the `--enable-debug' or
+`--enable-debug-malloc' flags (*note Installation on Unix::), then
+`fftw_execute' is not thread-safe.  These flags are not documented
+because they are intended only for developing and debugging FFTW, but
+if you must use `--enable-debug' then you should also specifically pass
+`--disable-debug-malloc' for `fftw_execute' to be thread-safe.
+
+
+File: fftw3.info,  Node: Distributed-memory FFTW with MPI,  Next: Calling FFTW from Modern Fortran,  Prev: Multi-threaded FFTW,  Up: Top
+
+6 Distributed-memory FFTW with MPI
+**********************************
+
+In this chapter we document the parallel FFTW routines for parallel
+systems supporting the MPI message-passing interface.  Unlike the
+shared-memory threads described in the previous chapter, MPI allows you
+to use _distributed-memory_ parallelism, where each CPU has its own
+separate memory, and which can scale up to clusters of many thousands
+of processors.  This capability comes at a price, however: each process
+only stores a _portion_ of the data to be transformed, which means that
+the data structures and programming-interface are quite different from
+the serial or threads versions of FFTW.  
+
+   Distributed-memory parallelism is especially useful when you are
+transforming arrays so large that they do not fit into the memory of a
+single processor.  The storage per-process required by FFTW's MPI
+routines is proportional to the total array size divided by the number
+of processes.  Conversely, distributed-memory parallelism can easily
+pose an unacceptably high communications overhead for small problems;
+the threshold problem size for which parallelism becomes advantageous
+will depend on the precise problem you are interested in, your
+hardware, and your MPI implementation.
+
+   A note on terminology: in MPI, you divide the data among a set of
+"processes" which each run in their own memory address space.
+Generally, each process runs on a different physical processor, but
+this is not required.  A set of processes in MPI is described by an
+opaque data structure called a "communicator," the most common of which
+is the predefined communicator `MPI_COMM_WORLD' which refers to _all_
+processes.  For more information on these and other concepts common to
+all MPI programs, we refer the reader to the documentation at the MPI
+home page (http://www.mcs.anl.gov/research/projects/mpi/).  
+
+   We assume in this chapter that the reader is familiar with the usage
+of the serial (uniprocessor) FFTW, and focus only on the concepts new
+to the MPI interface.
+
+* Menu:
+
+* FFTW MPI Installation::
+* Linking and Initializing MPI FFTW::
+* 2d MPI example::
+* MPI Data Distribution::
+* Multi-dimensional MPI DFTs of Real Data::
+* Other Multi-dimensional Real-data MPI Transforms::
+* FFTW MPI Transposes::
+* FFTW MPI Wisdom::
+* Avoiding MPI Deadlocks::
+* FFTW MPI Performance Tips::
+* Combining MPI and Threads::
+* FFTW MPI Reference::
+* FFTW MPI Fortran Interface::
+
+
+File: fftw3.info,  Node: FFTW MPI Installation,  Next: Linking and Initializing MPI FFTW,  Prev: Distributed-memory FFTW with MPI,  Up: Distributed-memory FFTW with MPI
+
+6.1 FFTW MPI Installation
+=========================
+
+All of the FFTW MPI code is located in the `mpi' subdirectory of the
+FFTW package.  On Unix systems, the FFTW MPI libraries and header files
+are automatically configured, compiled, and installed along with the
+uniprocessor FFTW libraries simply by including `--enable-mpi' in the
+flags to the `configure' script (*note Installation on Unix::).  
+
+   Any implementation of the MPI standard, version 1 or later, should
+work with FFTW.  The `configure' script will attempt to automatically
+detect how to compile and link code using your MPI implementation.  In
+some cases, especially if you have multiple different MPI
+implementations installed or have an unusual MPI software package, you
+may need to provide this information explicitly.
+
+   Most commonly, one compiles MPI code by invoking a special compiler
+command, typically `mpicc' for C code.  The `configure' script knows
+the most common names for this command, but you can specify the MPI
+compilation command explicitly by setting the `MPICC' variable, as in
+`./configure MPICC=mpicc ...'.  
+
+   If, instead of a special compiler command, you need to link a certain
+library, you can specify the link command via the `MPILIBS' variable,
+as in `./configure MPILIBS=-lmpi ...'.  Note that if your MPI library
+is installed in a non-standard location (one the compiler does not know
+about by default), you may also have to specify the location of the
+library and header files via `LDFLAGS' and `CPPFLAGS' variables,
+respectively, as in `./configure LDFLAGS=-L/path/to/mpi/libs
+CPPFLAGS=-I/path/to/mpi/include ...'.
+
+
+File: fftw3.info,  Node: Linking and Initializing MPI FFTW,  Next: 2d MPI example,  Prev: FFTW MPI Installation,  Up: Distributed-memory FFTW with MPI
+
+6.2 Linking and Initializing MPI FFTW
+=====================================
+
+Programs using the MPI FFTW routines should be linked with `-lfftw3_mpi
+-lfftw3 -lm' on Unix in double precision, `-lfftw3f_mpi -lfftw3f -lm'
+in single precision, and so on (*note Precision::). You will also need
+to link with whatever library is responsible for MPI on your system; in
+most MPI implementations, there is a special compiler alias named
+`mpicc' to compile and link MPI code.  
+
+   Before calling any FFTW routines except possibly `fftw_init_threads'
+(*note Combining MPI and Threads::), but after calling `MPI_Init', you
+should call the function:
+
+     void fftw_mpi_init(void);
+   
+   If, at the end of your program, you want to get rid of all memory and
+other resources allocated internally by FFTW, for both the serial and
+MPI routines, you can call:
+
+     void fftw_mpi_cleanup(void);
+   
+   which is much like the `fftw_cleanup()' function except that it also
+gets rid of FFTW's MPI-related data.  You must _not_ execute any
+previously created plans after calling this function.
+
+
+File: fftw3.info,  Node: 2d MPI example,  Next: MPI Data Distribution,  Prev: Linking and Initializing MPI FFTW,  Up: Distributed-memory FFTW with MPI
+
+6.3 2d MPI example
+==================
+
+Before we document the FFTW MPI interface in detail, we begin with a
+simple example outlining how one would perform a two-dimensional `N0'
+by `N1' complex DFT.
+
+     #include <fftw3-mpi.h>
+
+     int main(int argc, char **argv)
+     {
+         const ptrdiff_t N0 = ..., N1 = ...;
+         fftw_plan plan;
+         fftw_complex *data;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+
+         MPI_Init(&argc, &argv);
+         fftw_mpi_init();
+
+         /* get local data size and allocate */
+         alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
+                                              &local_n0, &local_0_start);
+         data = fftw_alloc_complex(alloc_local);
+
+         /* create plan for in-place forward DFT */
+         plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
+                                     FFTW_FORWARD, FFTW_ESTIMATE);
+
+         /* initialize data to some function my_function(x,y) */
+         for (i = 0; i < local_n0; ++i) for (j = 0; j < N1; ++j)
+            data[i*N1 + j] = my_function(local_0_start + i, j);
+
+         /* compute transforms, in-place, as many times as desired */
+         fftw_execute(plan);
+
+         fftw_destroy_plan(plan);
+
+         MPI_Finalize();
+     }
+
+   As can be seen above, the MPI interface follows the same basic style
+of allocate/plan/execute/destroy as the serial FFTW routines.  All of
+the MPI-specific routines are prefixed with `fftw_mpi_' instead of
+`fftw_'.  There are a few important differences, however:
+
+   First, we must call `fftw_mpi_init()' after calling `MPI_Init'
+(required in all MPI programs) and before calling any other `fftw_mpi_'
+routine.  
+
+   Second, when we create the plan with `fftw_mpi_plan_dft_2d',
+analogous to `fftw_plan_dft_2d', we pass an additional argument: the
+communicator, indicating which processes will participate in the
+transform (here `MPI_COMM_WORLD', indicating all processes).  Whenever
+you create, execute, or destroy a plan for an MPI transform, you must
+call the corresponding FFTW routine on _all_ processes in the
+communicator for that transform.  (That is, these are _collective_
+calls.)  Note that the plan for the MPI transform uses the standard
+`fftw_execute' and `fftw_destroy' routines (on the other hand, there
+are MPI-specific new-array execute functions documented below).  
+
+   Third, all of the FFTW MPI routines take `ptrdiff_t' arguments
+instead of `int' as for the serial FFTW.  `ptrdiff_t' is a standard C
+integer type which is (at least) 32 bits wide on a 32-bit machine and
+64 bits wide on a 64-bit machine.  This is to make it easy to specify
+very large parallel transforms on a 64-bit machine.  (You can specify
+64-bit transform sizes in the serial FFTW, too, but only by using the
+`guru64' planner interface.  *Note 64-bit Guru Interface::.)  
+
+   Fourth, and most importantly, you don't allocate the entire
+two-dimensional array on each process.  Instead, you call
+`fftw_mpi_local_size_2d' to find out what _portion_ of the array
+resides on each processor, and how much space to allocate.  Here, the
+portion of the array on each process is a `local_n0' by `N1' slice of
+the total array, starting at index `local_0_start'.  The total number
+of `fftw_complex' numbers to allocate is given by the `alloc_local'
+return value, which _may_ be greater than `local_n0 * N1' (in case some
+intermediate calculations require additional storage).  The data
+distribution in FFTW's MPI interface is described in more detail by the
+next section.  
+
+   Given the portion of the array that resides on the local process, it
+is straightforward to initialize the data (here to a function
+`myfunction') and otherwise manipulate it.  Of course, at the end of
+the program you may want to output the data somehow, but synchronizing
+this output is up to you and is beyond the scope of this manual.  (One
+good way to output a large multi-dimensional distributed array in MPI
+to a portable binary file is to use the free HDF5 library; see the HDF
+home page (http://www.hdfgroup.org/).)  
+
+
+File: fftw3.info,  Node: MPI Data Distribution,  Next: Multi-dimensional MPI DFTs of Real Data,  Prev: 2d MPI example,  Up: Distributed-memory FFTW with MPI
+
+6.4 MPI Data Distribution
+=========================
+
+The most important concept to understand in using FFTW's MPI interface
+is the data distribution.  With a serial or multithreaded FFT, all of
+the inputs and outputs are stored as a single contiguous chunk of
+memory.  With a distributed-memory FFT, the inputs and outputs are
+broken into disjoint blocks, one per process.
+
+   In particular, FFTW uses a _1d block distribution_ of the data,
+distributed along the _first dimension_.  For example, if you want to
+perform a 100 x 200  complex DFT, distributed over 4 processes, each
+process will get a 25 x 200  slice of the data.  That is, process 0
+will get rows 0 through 24, process 1 will get rows 25 through 49,
+process 2 will get rows 50 through 74, and process 3 will get rows 75
+through 99.  If you take the same array but distribute it over 3
+processes, then it is not evenly divisible so the different processes
+will have unequal chunks.  FFTW's default choice in this case is to
+assign 34 rows to processes 0 and 1, and 32 rows to process 2.  
+
+   FFTW provides several `fftw_mpi_local_size' routines that you can
+call to find out what portion of an array is stored on the current
+process.  In most cases, you should use the default block sizes picked
+by FFTW, but it is also possible to specify your own block size.  For
+example, with a 100 x 200  array on three processes, you can tell FFTW
+to use a block size of 40, which would assign 40 rows to processes 0
+and 1, and 20 rows to process 2.  FFTW's default is to divide the data
+equally among the processes if possible, and as best it can otherwise.
+The rows are always assigned in "rank order," i.e. process 0 gets the
+first block of rows, then process 1, and so on.  (You can change this
+by using `MPI_Comm_split' to create a new communicator with re-ordered
+processes.)  However, you should always call the `fftw_mpi_local_size'
+routines, if possible, rather than trying to predict FFTW's
+distribution choices.
+
+   In particular, it is critical that you allocate the storage size that
+is returned by `fftw_mpi_local_size', which is _not_ necessarily the
+size of the local slice of the array.  The reason is that intermediate
+steps of FFTW's algorithms involve transposing the array and
+redistributing the data, so at these intermediate steps FFTW may
+require more local storage space (albeit always proportional to the
+total size divided by the number of processes).  The
+`fftw_mpi_local_size' functions know how much storage is required for
+these intermediate steps and tell you the correct amount to allocate.
+
+* Menu:
+
+* Basic and advanced distribution interfaces::
+* Load balancing::
+* Transposed distributions::
+* One-dimensional distributions::
+
+
+File: fftw3.info,  Node: Basic and advanced distribution interfaces,  Next: Load balancing,  Prev: MPI Data Distribution,  Up: MPI Data Distribution
+
+6.4.1 Basic and advanced distribution interfaces
+------------------------------------------------
+
+As with the planner interface, the `fftw_mpi_local_size' distribution
+interface is broken into basic and advanced (`_many') interfaces, where
+the latter allows you to specify the block size manually and also to
+request block sizes when computing multiple transforms simultaneously.
+These functions are documented more exhaustively by the FFTW MPI
+Reference, but we summarize the basic ideas here using a couple of
+two-dimensional examples.
+
+   For the 100 x 200  complex-DFT example, above, we would find the
+distribution by calling the following function in the basic interface:
+
+     ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+   
+   Given the total size of the data to be transformed (here, `n0 = 100'
+and `n1 = 200') and an MPI communicator (`comm'), this function
+provides three numbers.
+
+   First, it describes the shape of the local data: the current process
+should store a `local_n0' by `n1' slice of the overall dataset, in
+row-major order (`n1' dimension contiguous), starting at index
+`local_0_start'.  That is, if the total dataset is viewed as a `n0' by
+`n1' matrix, the current process should store the rows `local_0_start'
+to `local_0_start+local_n0-1'.  Obviously, if you are running with only
+a single MPI process, that process will store the entire array:
+`local_0_start' will be zero and `local_n0' will be `n0'.  *Note
+Row-major Format::.  
+
+   Second, the return value is the total number of data elements (e.g.,
+complex numbers for a complex DFT) that should be allocated for the
+input and output arrays on the current process (ideally with
+`fftw_malloc' or an `fftw_alloc' function, to ensure optimal
+alignment).  It might seem that this should always be equal to
+`local_n0 * n1', but this is _not_ the case.  FFTW's distributed FFT
+algorithms require data redistributions at intermediate stages of the
+transform, and in some circumstances this may require slightly larger
+local storage.  This is discussed in more detail below, under *note
+Load balancing::.  
+
+   The advanced-interface `local_size' function for multidimensional
+transforms returns the same three things (`local_n0', `local_0_start',
+and the total number of elements to allocate), but takes more inputs:
+
+     ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n,
+                                        ptrdiff_t howmany,
+                                        ptrdiff_t block0,
+                                        MPI_Comm comm,
+                                        ptrdiff_t *local_n0,
+                                        ptrdiff_t *local_0_start);
+   
+   The two-dimensional case above corresponds to `rnk = 2' and an array
+`n' of length 2 with `n[0] = n0' and `n[1] = n1'.  This routine is for
+any `rnk > 1'; one-dimensional transforms have their own interface
+because they work slightly differently, as discussed below.
+
+   First, the advanced interface allows you to perform multiple
+transforms at once, of interleaved data, as specified by the `howmany'
+parameter.  (`hoamany' is 1 for a single transform.)
+
+   Second, here you can specify your desired block size in the `n0'
+dimension, `block0'.  To use FFTW's default block size, pass
+`FFTW_MPI_DEFAULT_BLOCK' (0) for `block0'.  Otherwise, on `P'
+processes, FFTW will return `local_n0' equal to `block0' on the first
+`P / block0' processes (rounded down), return `local_n0' equal to `n0 -
+block0 * (P / block0)' on the next process, and `local_n0' equal to
+zero on any remaining processes.  In general, we recommend using the
+default block size (which corresponds to `n0 / P', rounded up).  
+
+   For example, suppose you have `P = 4' processes and `n0 = 21'.  The
+default will be a block size of `6', which will give `local_n0 = 6' on
+the first three processes and `local_n0 = 3' on the last process.
+Instead, however, you could specify `block0 = 5' if you wanted, which
+would give `local_n0 = 5' on processes 0 to 2, `local_n0 = 6' on
+process 3.  (This choice, while it may look superficially more
+"balanced," has the same critical path as FFTW's default but requires
+more communications.)
+
+
+File: fftw3.info,  Node: Load balancing,  Next: Transposed distributions,  Prev: Basic and advanced distribution interfaces,  Up: MPI Data Distribution
+
+6.4.2 Load balancing
+--------------------
+
+Ideally, when you parallelize a transform over some P processes, each
+process should end up with work that takes equal time.  Otherwise, all
+of the processes end up waiting on whichever process is slowest.  This
+goal is known as "load balancing."  In this section, we describe the
+circumstances under which FFTW is able to load-balance well, and in
+particular how you should choose your transform size in order to load
+balance.
+
+   Load balancing is especially difficult when you are parallelizing
+over heterogeneous machines; for example, if one of your processors is a
+old 486 and another is a Pentium IV, obviously you should give the
+Pentium more work to do than the 486 since the latter is much slower.
+FFTW does not deal with this problem, however--it assumes that your
+processes run on hardware of comparable speed, and that the goal is
+therefore to divide the problem as equally as possible.
+
+   For a multi-dimensional complex DFT, FFTW can divide the problem
+equally among the processes if: (i) the _first_ dimension `n0' is
+divisible by P; and (ii), the _product_ of the subsequent dimensions is
+divisible by P.  (For the advanced interface, where you can specify
+multiple simultaneous transforms via some "vector" length `howmany', a
+factor of `howmany' is included in the product of the subsequent
+dimensions.)
+
+   For a one-dimensional complex DFT, the length `N' of the data should
+be divisible by P _squared_ to be able to divide the problem equally
+among the processes.
+
+
+File: fftw3.info,  Node: Transposed distributions,  Next: One-dimensional distributions,  Prev: Load balancing,  Up: MPI Data Distribution
+
+6.4.3 Transposed distributions
+------------------------------
+
+Internally, FFTW's MPI transform algorithms work by first computing
+transforms of the data local to each process, then by globally
+_transposing_ the data in some fashion to redistribute the data among
+the processes, transforming the new data local to each process, and
+transposing back.  For example, a two-dimensional `n0' by `n1' array,
+distributed across the `n0' dimension, is transformd by: (i)
+transforming the `n1' dimension, which are local to each process; (ii)
+transposing to an `n1' by `n0' array, distributed across the `n1'
+dimension; (iii) transforming the `n0' dimension, which is now local to
+each process; (iv) transposing back.  
+
+   However, in many applications it is acceptable to compute a
+multidimensional DFT whose results are produced in transposed order
+(e.g., `n1' by `n0' in two dimensions).  This provides a significant
+performance advantage, because it means that the final transposition
+step can be omitted.  FFTW supports this optimization, which you
+specify by passing the flag `FFTW_MPI_TRANSPOSED_OUT' to the planner
+routines.  To compute the inverse transform of transposed output, you
+specify `FFTW_MPI_TRANSPOSED_IN' to tell it that the input is
+transposed.  In this section, we explain how to interpret the output
+format of such a transform.  
+
+   Suppose you have are transforming multi-dimensional data with (at
+least two) dimensions n[0] x n[1] x n[2] x ... x n[d-1] .  As always,
+it is distributed along the first dimension n[0] .  Now, if we compute
+its DFT with the `FFTW_MPI_TRANSPOSED_OUT' flag, the resulting output
+data are stored with the first _two_ dimensions transposed: n[1] x n[0]
+x n[2] x ... x n[d-1] , distributed along the n[1]  dimension.
+Conversely, if we take the n[1] x n[0] x n[2] x ... x n[d-1]  data and
+transform it with the `FFTW_MPI_TRANSPOSED_IN' flag, then the format
+goes back to the original n[0] x n[1] x n[2] x ... x n[d-1]  array.
+
+   There are two ways to find the portion of the transposed array that
+resides on the current process.  First, you can simply call the
+appropriate `local_size' function, passing n[1] x n[0] x n[2] x ... x
+n[d-1]  (the transposed dimensions).  This would mean calling the
+`local_size' function twice, once for the transposed and once for the
+non-transposed dimensions.  Alternatively, you can call one of the
+`local_size_transposed' functions, which returns both the
+non-transposed and transposed data distribution from a single call.
+For example, for a 3d transform with transposed output (or input), you
+might call:
+
+     ptrdiff_t fftw_mpi_local_size_3d_transposed(
+                     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,
+                     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+   
+   Here, `local_n0' and `local_0_start' give the size and starting
+index of the `n0' dimension for the _non_-transposed data, as in the
+previous sections.  For _transposed_ data (e.g. the output for
+`FFTW_MPI_TRANSPOSED_OUT'), `local_n1' and `local_1_start' give the
+size and starting index of the `n1' dimension, which is the first
+dimension of the transposed data (`n1' by `n0' by `n2').
+
+   (Note that `FFTW_MPI_TRANSPOSED_IN' is completely equivalent to
+performing `FFTW_MPI_TRANSPOSED_OUT' and passing the first two
+dimensions to the planner in reverse order, or vice versa.  If you pass
+_both_ the `FFTW_MPI_TRANSPOSED_IN' and `FFTW_MPI_TRANSPOSED_OUT'
+flags, it is equivalent to swapping the first two dimensions passed to
+the planner and passing _neither_ flag.)
+
+
+File: fftw3.info,  Node: One-dimensional distributions,  Prev: Transposed distributions,  Up: MPI Data Distribution
+
+6.4.4 One-dimensional distributions
+-----------------------------------
+
+For one-dimensional distributed DFTs using FFTW, matters are slightly
+more complicated because the data distribution is more closely tied to
+how the algorithm works.  In particular, you can no longer pass an
+arbitrary block size and must accept FFTW's default; also, the block
+sizes may be different for input and output.  Also, the data
+distribution depends on the flags and transform direction, in order for
+forward and backward transforms to work correctly.
+
+     ptrdiff_t fftw_mpi_local_size_1d(ptrdiff_t n0, MPI_Comm comm,
+                     int sign, unsigned flags,
+                     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                     ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+   
+   This function computes the data distribution for a 1d transform of
+size `n0' with the given transform `sign' and `flags'.  Both input and
+output data use block distributions.  The input on the current process
+will consist of `local_ni' numbers starting at index `local_i_start';
+e.g. if only a single process is used, then `local_ni' will be `n0' and
+`local_i_start' will be `0'.  Similarly for the output, with `local_no'
+numbers starting at index `local_o_start'.  The return value of
+`fftw_mpi_local_size_1d' will be the total number of elements to
+allocate on the current process (which might be slightly larger than
+the local size due to intermediate steps in the algorithm).
+
+   As mentioned above (*note Load balancing::), the data will be divided
+equally among the processes if `n0' is divisible by the _square_ of the
+number of processes.  In this case, `local_ni' will equal `local_no'.
+Otherwise, they may be different.
+
+   For some applications, such as convolutions, the order of the output
+data is irrelevant.  In this case, performance can be improved by
+specifying that the output data be stored in an FFTW-defined
+"scrambled" format.  (In particular, this is the analogue of transposed
+output in the multidimensional case: scrambled output saves a
+communications step.)  If you pass `FFTW_MPI_SCRAMBLED_OUT' in the
+flags, then the output is stored in this (undocumented) scrambled
+order.  Conversely, to perform the inverse transform of data in
+scrambled order, pass the `FFTW_MPI_SCRAMBLED_IN' flag.  
+
+   In MPI FFTW, only composite sizes `n0' can be parallelized; we have
+not yet implemented a parallel algorithm for large prime sizes.
+
+
+File: fftw3.info,  Node: Multi-dimensional MPI DFTs of Real Data,  Next: Other Multi-dimensional Real-data MPI Transforms,  Prev: MPI Data Distribution,  Up: Distributed-memory FFTW with MPI
+
+6.5 Multi-dimensional MPI DFTs of Real Data
+===========================================
+
+FFTW's MPI interface also supports multi-dimensional DFTs of real data,
+similar to the serial r2c and c2r interfaces.  (Parallel
+one-dimensional real-data DFTs are not currently supported; you must
+use a complex transform and set the imaginary parts of the inputs to
+zero.)
+
+   The key points to understand for r2c and c2r MPI transforms (compared
+to the MPI complex DFTs or the serial r2c/c2r transforms), are:
+
+   * Just as for serial transforms, r2c/c2r DFTs transform n[0] x n[1]
+     x n[2] x ... x n[d-1]  real data to/from n[0] x n[1] x n[2] x ...
+     x (n[d-1]/2 + 1)  complex data: the last dimension of the complex
+     data is cut in half (rounded down), plus one.  As for the serial
+     transforms, the sizes you pass to the `plan_dft_r2c' and
+     `plan_dft_c2r' are the n[0] x n[1] x n[2] x ... x n[d-1]
+     dimensions of the real data.
+
+   * Although the real data is _conceptually_ n[0] x n[1] x n[2] x ...
+     x n[d-1] , it is _physically_ stored as an n[0] x n[1] x n[2] x
+     ... x [2 (n[d-1]/2 + 1)]  array, where the last dimension has been
+     _padded_ to make it the same size as the complex output.  This is
+     much like the in-place serial r2c/c2r interface (*note
+     Multi-Dimensional DFTs of Real Data::), except that in MPI the
+     padding is required even for out-of-place data.  The extra padding
+     numbers are ignored by FFTW (they are _not_ like zero-padding the
+     transform to a larger size); they are only used to determine the
+     data layout.
+
+   * The data distribution in MPI for _both_ the real and complex data
+     is determined by the shape of the _complex_ data.  That is, you
+     call the appropriate `local size' function for the n[0] x n[1] x
+     n[2] x ... x (n[d-1]/2 + 1)
+
+     complex data, and then use the _same_ distribution for the real
+     data except that the last complex dimension is replaced by a
+     (padded) real dimension of twice the length.
+
+
+   For example suppose we are performing an out-of-place r2c transform
+of L x M x N  real data [padded to L x M x 2(N/2+1) ], resulting in L x
+M x N/2+1  complex data.  Similar to the example in *note 2d MPI
+example::, we might do something like:
+
+     #include <fftw3-mpi.h>
+
+     int main(int argc, char **argv)
+     {
+         const ptrdiff_t L = ..., M = ..., N = ...;
+         fftw_plan plan;
+         double *rin;
+         fftw_complex *cout;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j, k;
+
+         MPI_Init(&argc, &argv);
+         fftw_mpi_init();
+
+         /* get local data size and allocate */
+         alloc_local = fftw_mpi_local_size_3d(L, M, N/2+1, MPI_COMM_WORLD,
+                                              &local_n0, &local_0_start);
+         rin = fftw_alloc_real(2 * alloc_local);
+         cout = fftw_alloc_complex(alloc_local);
+
+         /* create plan for out-of-place r2c DFT */
+         plan = fftw_mpi_plan_dft_r2c_3d(L, M, N, rin, cout, MPI_COMM_WORLD,
+                                         FFTW_MEASURE);
+
+         /* initialize rin to some function my_func(x,y,z) */
+         for (i = 0; i < local_n0; ++i)
+            for (j = 0; j < M; ++j)
+              for (k = 0; k < N; ++k)
+            rin[(i*M + j) * (2*(N/2+1)) + k] = my_func(local_0_start+i, j, k);
+
+         /* compute transforms as many times as desired */
+         fftw_execute(plan);
+
+         fftw_destroy_plan(plan);
+
+         MPI_Finalize();
+     }
+
+   Note that we allocated `rin' using `fftw_alloc_real' with an
+argument of `2 * alloc_local': since `alloc_local' is the number of
+_complex_ values to allocate, the number of _real_ values is twice as
+many.  The `rin' array is then local_n0 x M x 2(N/2+1)  in row-major
+order, so its `(i,j,k)' element is at the index `(i*M + j) *
+(2*(N/2+1)) + k' (*note Multi-dimensional Array Format::).
+
+   As for the complex transforms, improved performance can be obtained
+by specifying that the output is the transpose of the input or vice
+versa (*note Transposed distributions::).  In our L x M x N  r2c
+example, including `FFTW_TRANSPOSED_OUT' in the flags means that the
+input would be a padded L x M x 2(N/2+1)  real array distributed over
+the `L' dimension, while the output would be a M x L x N/2+1  complex
+array distributed over the `M' dimension.  To perform the inverse c2r
+transform with the same data distributions, you would use the
+`FFTW_TRANSPOSED_IN' flag.
+
+
+File: fftw3.info,  Node: Other Multi-dimensional Real-data MPI Transforms,  Next: FFTW MPI Transposes,  Prev: Multi-dimensional MPI DFTs of Real Data,  Up: Distributed-memory FFTW with MPI
+
+6.6 Other multi-dimensional Real-Data MPI Transforms
+====================================================
+
+FFTW's MPI interface also supports multi-dimensional `r2r' transforms
+of all kinds supported by the serial interface (e.g. discrete cosine
+and sine transforms, discrete Hartley transforms, etc.).  Only
+multi-dimensional `r2r' transforms, not one-dimensional transforms, are
+currently parallelized.
+
+   These are used much like the multidimensional complex DFTs discussed
+above, except that the data is real rather than complex, and one needs
+to pass an r2r transform kind (`fftw_r2r_kind') for each dimension as
+in the serial FFTW (*note More DFTs of Real Data::).
+
+   For example, one might perform a two-dimensional L x M  that is an
+REDFT10 (DCT-II) in the first dimension and an RODFT10 (DST-II) in the
+second dimension with code like:
+
+         const ptrdiff_t L = ..., M = ...;
+         fftw_plan plan;
+         double *data;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+
+         /* get local data size and allocate */
+         alloc_local = fftw_mpi_local_size_2d(L, M, MPI_COMM_WORLD,
+                                              &local_n0, &local_0_start);
+         data = fftw_alloc_real(alloc_local);
+
+         /* create plan for in-place REDFT10 x RODFT10 */
+         plan = fftw_mpi_plan_r2r_2d(L, M, data, data, MPI_COMM_WORLD,
+                                     FFTW_REDFT10, FFTW_RODFT10, FFTW_MEASURE);
+
+         /* initialize data to some function my_function(x,y) */
+         for (i = 0; i < local_n0; ++i) for (j = 0; j < M; ++j)
+            data[i*M + j] = my_function(local_0_start + i, j);
+
+         /* compute transforms, in-place, as many times as desired */
+         fftw_execute(plan);
+
+         fftw_destroy_plan(plan);
+
+   Notice that we use the same `local_size' functions as we did for
+complex data, only now we interpret the sizes in terms of real rather
+than complex values, and correspondingly use `fftw_alloc_real'.
+
+
+File: fftw3.info,  Node: FFTW MPI Transposes,  Next: FFTW MPI Wisdom,  Prev: Other Multi-dimensional Real-data MPI Transforms,  Up: Distributed-memory FFTW with MPI
+
+6.7 FFTW MPI Transposes
+=======================
+
+The FFTW's MPI Fourier transforms rely on one or more _global
+transposition_ step for their communications.  For example, the
+multidimensional transforms work by transforming along some dimensions,
+then transposing to make the first dimension local and transforming
+that, then transposing back.  Because global transposition of a
+block-distributed matrix has many other potential uses besides FFTs,
+FFTW's transpose routines can be called directly, as documented in this
+section.
+
+* Menu:
+
+* Basic distributed-transpose interface::
+* Advanced distributed-transpose interface::
+* An improved replacement for MPI_Alltoall::
+
+
+File: fftw3.info,  Node: Basic distributed-transpose interface,  Next: Advanced distributed-transpose interface,  Prev: FFTW MPI Transposes,  Up: FFTW MPI Transposes
+
+6.7.1 Basic distributed-transpose interface
+-------------------------------------------
+
+In particular, suppose that we have an `n0' by `n1' array in row-major
+order, block-distributed across the `n0' dimension.  To transpose this
+into an `n1' by `n0' array block-distributed across the `n1' dimension,
+we would create a plan by calling the following function:
+
+     fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                       double *in, double *out,
+                                       MPI_Comm comm, unsigned flags);
+   
+   The input and output arrays (`in' and `out') can be the same.  The
+transpose is actually executed by calling `fftw_execute' on the plan,
+as usual.  
+
+   The `flags' are the usual FFTW planner flags, but support two
+additional flags: `FFTW_MPI_TRANSPOSED_OUT' and/or
+`FFTW_MPI_TRANSPOSED_IN'.  What these flags indicate, for transpose
+plans, is that the output and/or input, respectively, are _locally_
+transposed.  That is, on each process input data is normally stored as
+a `local_n0' by `n1' array in row-major order, but for an
+`FFTW_MPI_TRANSPOSED_IN' plan the input data is stored as `n1' by
+`local_n0' in row-major order.  Similarly, `FFTW_MPI_TRANSPOSED_OUT'
+means that the output is `n0' by `local_n1' instead of `local_n1' by
+`n0'.  
+
+   To determine the local size of the array on each process before and
+after the transpose, as well as the amount of storage that must be
+allocated, one should call `fftw_mpi_local_size_2d_transposed', just as
+for a 2d DFT as described in the previous section: 
+
+     ptrdiff_t fftw_mpi_local_size_2d_transposed
+                     (ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+   
+   Again, the return value is the local storage to allocate, which in
+this case is the number of _real_ (`double') values rather than complex
+numbers as in the previous examples.
+
+
+File: fftw3.info,  Node: Advanced distributed-transpose interface,  Next: An improved replacement for MPI_Alltoall,  Prev: Basic distributed-transpose interface,  Up: FFTW MPI Transposes
+
+6.7.2 Advanced distributed-transpose interface
+----------------------------------------------
+
+The above routines are for a transpose of a matrix of numbers (of type
+`double'), using FFTW's default block sizes.  More generally, one can
+perform transposes of _tuples_ of numbers, with user-specified block
+sizes for the input and output:
+
+     fftw_plan fftw_mpi_plan_many_transpose
+                     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                      ptrdiff_t block0, ptrdiff_t block1,
+                      double *in, double *out, MPI_Comm comm, unsigned flags);
+   
+   In this case, one is transposing an `n0' by `n1' matrix of
+`howmany'-tuples (e.g. `howmany = 2' for complex numbers).  The input
+is distributed along the `n0' dimension with block size `block0', and
+the `n1' by `n0' output is distributed along the `n1' dimension with
+block size `block1'.  If `FFTW_MPI_DEFAULT_BLOCK' (0) is passed for a
+block size then FFTW uses its default block size.  To get the local
+size of the data on each process, you should then call
+`fftw_mpi_local_size_many_transposed'.  
+
+
+File: fftw3.info,  Node: An improved replacement for MPI_Alltoall,  Prev: Advanced distributed-transpose interface,  Up: FFTW MPI Transposes
+
+6.7.3 An improved replacement for MPI_Alltoall
+----------------------------------------------
+
+We close this section by noting that FFTW's MPI transpose routines can
+be thought of as a generalization for the `MPI_Alltoall' function
+(albeit only for floating-point types), and in some circumstances can
+function as an improved replacement.  
+
+   `MPI_Alltoall' is defined by the MPI standard as:
+
+     int MPI_Alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, int recvcnt, MPI_Datatype recvtype,
+                      MPI_Comm comm);
+
+   In particular, for `double*' arrays `in' and `out', consider the
+call:
+
+     MPI_Alltoall(in, howmany, MPI_DOUBLE, out, howmany MPI_DOUBLE, comm);
+
+   This is completely equivalent to:
+
+     MPI_Comm_size(comm, &P);
+     plan = fftw_mpi_plan_many_transpose(P, P, howmany, 1, 1, in, out, comm, FFTW_ESTIMATE);
+     fftw_execute(plan);
+     fftw_destroy_plan(plan);
+
+   That is, computing a P x P  transpose on `P' processes, with a block
+size of 1, is just a standard all-to-all communication.
+
+   However, using the FFTW routine instead of `MPI_Alltoall' may have
+certain advantages.  First of all, FFTW's routine can operate in-place
+(`in == out') whereas `MPI_Alltoall' can only operate out-of-place.  
+
+   Second, even for out-of-place plans, FFTW's routine may be faster,
+especially if you need to perform the all-to-all communication many
+times and can afford to use `FFTW_MEASURE' or `FFTW_PATIENT'.  It
+should certainly be no slower, not including the time to create the
+plan, since one of the possible algorithms that FFTW uses for an
+out-of-place transpose _is_ simply to call `MPI_Alltoall'.  However,
+FFTW also considers several other possible algorithms that, depending
+on your MPI implementation and your hardware, may be faster.  
+
+
+File: fftw3.info,  Node: FFTW MPI Wisdom,  Next: Avoiding MPI Deadlocks,  Prev: FFTW MPI Transposes,  Up: Distributed-memory FFTW with MPI
+
+6.8 FFTW MPI Wisdom
+===================
+
+FFTW's "wisdom" facility (*note Words of Wisdom-Saving Plans::) can be
+used to save MPI plans as well as to save uniprocessor plans.  However,
+for MPI there are several unavoidable complications.
+
+   First, the MPI standard does not guarantee that every process can
+perform file I/O (at least, not using C stdio routines)--in general, we
+may only assume that process 0 is capable of I/O.(1) So, if we want to
+export the wisdom from a single process to a file, we must first export
+the wisdom to a string, then send it to process 0, then write it to a
+file.
+
+   Second, in principle we may want to have separate wisdom for every
+process, since in general the processes may run on different hardware
+even for a single MPI program.  However, in practice FFTW's MPI code is
+designed for the case of homogeneous hardware (*note Load balancing::),
+and in this case it is convenient to use the same wisdom for every
+process.  Thus, we need a mechanism to synchronize the wisdom.
+
+   To address both of these problems, FFTW provides the following two
+functions:
+
+     void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+     void fftw_mpi_gather_wisdom(MPI_Comm comm);
+   
+   Given a communicator `comm', `fftw_mpi_broadcast_wisdom' will
+broadcast the wisdom from process 0 to all other processes.
+Conversely, `fftw_mpi_gather_wisdom' will collect wisdom from all
+processes onto process 0.  (If the plans created for the same problem
+by different processes are not the same, `fftw_mpi_gather_wisdom' will
+arbitrarily choose one of the plans.)  Both of these functions may
+result in suboptimal plans for different processes if the processes are
+running on non-identical hardware.  Both of these functions are
+_collective_ calls, which means that they must be executed by all
+processes in the communicator.  
+
+   So, for example, a typical code snippet to import wisdom from a file
+and use it on all processes would be:
+
+     {
+         int rank;
+
+         fftw_mpi_init();
+         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+         if (rank == 0) fftw_import_wisdom_from_filename("mywisdom");
+         fftw_mpi_broadcast_wisdom(MPI_COMM_WORLD);
+     }
+
+   (Note that we must call `fftw_mpi_init' before importing any wisdom
+that might contain MPI plans.)  Similarly, a typical code snippet to
+export wisdom from all processes to a file is: 
+
+     {
+         int rank;
+
+         fftw_mpi_gather_wisdom(MPI_COMM_WORLD);
+         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+         if (rank == 0) fftw_export_wisdom_to_filename("mywisdom");
+     }
+
+   ---------- Footnotes ----------
+
+   (1) In fact, even this assumption is not technically guaranteed by
+the standard, although it seems to be universal in actual MPI
+implementations and is widely assumed by MPI-using software.
+Technically, you need to query the `MPI_IO' attribute of
+`MPI_COMM_WORLD' with `MPI_Attr_get'.  If this attribute is
+`MPI_PROC_NULL', no I/O is possible.  If it is `MPI_ANY_SOURCE', any
+process can perform I/O.  Otherwise, it is the rank of a process that
+can perform I/O ... but since it is not guaranteed to yield the _same_
+rank on all processes, you have to do an `MPI_Allreduce' of some kind
+if you want all processes to agree about which is going to do I/O.  And
+even then, the standard only guarantees that this process can perform
+output, but not input. See e.g. `Parallel Programming with MPI' by P.
+S. Pacheco, section 8.1.3.  Needless to say, in our experience
+virtually no MPI programmers worry about this.
+
+
+File: fftw3.info,  Node: Avoiding MPI Deadlocks,  Next: FFTW MPI Performance Tips,  Prev: FFTW MPI Wisdom,  Up: Distributed-memory FFTW with MPI
+
+6.9 Avoiding MPI Deadlocks
+==========================
+
+An MPI program can _deadlock_ if one process is waiting for a message
+from another process that never gets sent.  To avoid deadlocks when
+using FFTW's MPI routines, it is important to know which functions are
+_collective_: that is, which functions must _always_ be called in the
+_same order_ from _every_ process in a given communicator.  (For
+example, `MPI_Barrier' is the canonical example of a collective
+function in the MPI standard.)  
+
+   The functions in FFTW that are _always_ collective are: every
+function beginning with `fftw_mpi_plan', as well as
+`fftw_mpi_broadcast_wisdom' and `fftw_mpi_gather_wisdom'.  Also, the
+following functions from the ordinary FFTW interface are collective
+when they are applied to a plan created by an `fftw_mpi_plan' function:
+`fftw_execute', `fftw_destroy_plan', and `fftw_flops'.  
+
+
+File: fftw3.info,  Node: FFTW MPI Performance Tips,  Next: Combining MPI and Threads,  Prev: Avoiding MPI Deadlocks,  Up: Distributed-memory FFTW with MPI
+
+6.10 FFTW MPI Performance Tips
+==============================
+
+In this section, we collect a few tips on getting the best performance
+out of FFTW's MPI transforms.
+
+   First, because of the 1d block distribution, FFTW's parallelization
+is currently limited by the size of the first dimension.
+(Multidimensional block distributions may be supported by a future
+version.) More generally, you should ideally arrange the dimensions so
+that FFTW can divide them equally among the processes. *Note Load
+balancing::.  
+
+   Second, if it is not too inconvenient, you should consider working
+with transposed output for multidimensional plans, as this saves a
+considerable amount of communications.  *Note Transposed
+distributions::.  
+
+   Third, the fastest choices are generally either an in-place transform
+or an out-of-place transform with the `FFTW_DESTROY_INPUT' flag (which
+allows the input array to be used as scratch space).  In-place is
+especially beneficial if the amount of data per process is large.  
+
+   Fourth, if you have multiple arrays to transform at once, rather than
+calling FFTW's MPI transforms several times it usually seems to be
+faster to interleave the data and use the advanced interface.  (This
+groups the communications together instead of requiring separate
+messages for each transform.)
+
+
+File: fftw3.info,  Node: Combining MPI and Threads,  Next: FFTW MPI Reference,  Prev: FFTW MPI Performance Tips,  Up: Distributed-memory FFTW with MPI
+
+6.11 Combining MPI and Threads
+==============================
+
+In certain cases, it may be advantageous to combine MPI
+(distributed-memory) and threads (shared-memory) parallelization.  FFTW
+supports this, with certain caveats.  For example, if you have a
+cluster of 4-processor shared-memory nodes, you may want to use threads
+within the nodes and MPI between the nodes, instead of MPI for all
+parallelization.
+
+   In particular, it is possible to seamlessly combine the MPI FFTW
+routines with the multi-threaded FFTW routines (*note Multi-threaded
+FFTW::). However, some care must be taken in the initialization code,
+which should look something like this:
+
+     int threads_ok;
+
+     int main(int argc, char **argv)
+     {
+         int provided;
+         MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+         threads_ok = provided >= MPI_THREAD_FUNNELED;
+
+         if (threads_ok) threads_ok = fftw_init_threads();
+         fftw_mpi_init();
+
+         ...
+         if (threads_ok) fftw_plan_with_nthreads(...);
+         ...
+
+         MPI_Finalize();
+     }
+   
+   First, note that instead of calling `MPI_Init', you should call
+`MPI_Init_threads', which is the initialization routine defined by the
+MPI-2 standard to indicate to MPI that your program will be
+multithreaded.  We pass `MPI_THREAD_FUNNELED', which indicates that we
+will only call MPI routines from the main thread.  (FFTW will launch
+additional threads internally, but the extra threads will not call MPI
+code.)  (You may also pass `MPI_THREAD_SERIALIZED' or
+`MPI_THREAD_MULTIPLE', which requests additional multithreading support
+from the MPI implementation, but this is not required by FFTW.)  The
+`provided' parameter returns what level of threads support is actually
+supported by your MPI implementation; this _must_ be at least
+`MPI_THREAD_FUNNELED' if you want to call the FFTW threads routines, so
+we define a global variable `threads_ok' to record this.  You should
+only call `fftw_init_threads' or `fftw_plan_with_nthreads' if
+`threads_ok' is true.  For more information on thread safety in MPI,
+see the MPI and Threads
+(http://www.mpi-forum.org/docs/mpi-20-html/node162.htm) section of the
+MPI-2 standard.  
+
+   Second, we must call `fftw_init_threads' _before_ `fftw_mpi_init'.
+This is critical for technical reasons having to do with how FFTW
+initializes its list of algorithms.
+
+   Then, if you call `fftw_plan_with_nthreads(N)', _every_ MPI process
+will launch (up to) `N' threads to parallelize its transforms.
+
+   For example, in the hypothetical cluster of 4-processor nodes, you
+might wish to launch only a single MPI process per node, and then call
+`fftw_plan_with_nthreads(4)' on each process to use all processors in
+the nodes.
+
+   This may or may not be faster than simply using as many MPI processes
+as you have processors, however.  On the one hand, using threads within
+a node eliminates the need for explicit message passing within the
+node.  On the other hand, FFTW's transpose routines are not
+multi-threaded, and this means that the communications that do take
+place will not benefit from parallelization within the node.  Moreover,
+many MPI implementations already have optimizations to exploit shared
+memory when it is available, so adding the multithreaded FFTW on top of
+this may be superfluous.  
+
+
+File: fftw3.info,  Node: FFTW MPI Reference,  Next: FFTW MPI Fortran Interface,  Prev: Combining MPI and Threads,  Up: Distributed-memory FFTW with MPI
+
+6.12 FFTW MPI Reference
+=======================
+
+This chapter provides a complete reference to all FFTW MPI functions,
+datatypes, and constants.  See also *note FFTW Reference:: for
+information on functions and types in common with the serial interface.
+
+* Menu:
+
+* MPI Files and Data Types::
+* MPI Initialization::
+* Using MPI Plans::
+* MPI Data Distribution Functions::
+* MPI Plan Creation::
+* MPI Wisdom Communication::
+
+
+File: fftw3.info,  Node: MPI Files and Data Types,  Next: MPI Initialization,  Prev: FFTW MPI Reference,  Up: FFTW MPI Reference
+
+6.12.1 MPI Files and Data Types
+-------------------------------
+
+All programs using FFTW's MPI support should include its header file:
+
+     #include <fftw3-mpi.h>
+
+   Note that this header file includes the serial-FFTW `fftw3.h' header
+file, and also the `mpi.h' header file for MPI, so you need not include
+those files separately.
+
+   You must also link to _both_ the FFTW MPI library and to the serial
+FFTW library.  On Unix, this means adding `-lfftw3_mpi -lfftw3 -lm' at
+the end of the link command.
+
+   Different precisions are handled as in the serial interface: *Note
+Precision::.  That is, `fftw_' functions become `fftwf_' (in single
+precision) etcetera, and the libraries become `-lfftw3f_mpi -lfftw3f
+-lm' etcetera on Unix.  Long-double precision is supported in MPI, but
+quad precision (`fftwq_') is not due to the lack of MPI support for
+this type.
+
+
+File: fftw3.info,  Node: MPI Initialization,  Next: Using MPI Plans,  Prev: MPI Files and Data Types,  Up: FFTW MPI Reference
+
+6.12.2 MPI Initialization
+-------------------------
+
+Before calling any other FFTW MPI (`fftw_mpi_') function, and before
+importing any wisdom for MPI problems, you must call:
+
+     void fftw_mpi_init(void);
+
+   If FFTW threads support is used, however, `fftw_mpi_init' should be
+called _after_ `fftw_init_threads' (*note Combining MPI and Threads::).
+Calling `fftw_mpi_init' additional times (before `fftw_mpi_cleanup')
+has no effect.
+
+   If you want to deallocate all persistent data and reset FFTW to the
+pristine state it was in when you started your program, you can call:
+
+     void fftw_mpi_cleanup(void);
+
+   (This calls `fftw_cleanup', so you need not call the serial cleanup
+routine too, although it is safe to do so.)  After calling
+`fftw_mpi_cleanup', all existing plans become undefined, and you should
+not attempt to execute or destroy them.  You must call `fftw_mpi_init'
+again after `fftw_mpi_cleanup' if you want to resume using the MPI FFTW
+routines.
+
+
+File: fftw3.info,  Node: Using MPI Plans,  Next: MPI Data Distribution Functions,  Prev: MPI Initialization,  Up: FFTW MPI Reference
+
+6.12.3 Using MPI Plans
+----------------------
+
+Once an MPI plan is created, you can execute and destroy it using
+`fftw_execute', `fftw_destroy_plan', and the other functions in the
+serial interface that operate on generic plans (*note Using Plans::).
+
+   The `fftw_execute' and `fftw_destroy_plan' functions, applied to MPI
+plans, are _collective_ calls: they must be called for all processes in
+the communicator that was used to create the plan.
+
+   You must _not_ use the serial new-array plan-execution functions
+`fftw_execute_dft' and so on (*note New-array Execute Functions::) with
+MPI plans.  Such functions are specialized to the problem type, and
+there are specific new-array execute functions for MPI plans:
+
+     void fftw_mpi_execute_dft(fftw_plan p, fftw_complex *in, fftw_complex *out);
+     void fftw_mpi_execute_dft_r2c(fftw_plan p, double *in, fftw_complex *out);
+     void fftw_mpi_execute_dft_c2r(fftw_plan p, fftw_complex *in, double *out);
+     void fftw_mpi_execute_r2r(fftw_plan p, double *in, double *out);
+
+   These functions have the same restrictions as those of the serial
+new-array execute functions.  They are _always_ safe to apply to the
+_same_ `in' and `out' arrays that were used to create the plan.  They
+can only be applied to new arrarys if those arrays have the same types,
+dimensions, in-placeness, and alignment as the original arrays, where
+the best way to ensure the same alignment is to use FFTW's
+`fftw_malloc' and related allocation functions for all arrays (*note
+Memory Allocation::).  Note that distributed transposes (*note FFTW MPI
+Transposes::) use `fftw_mpi_execute_r2r', since they count as rank-zero
+r2r plans from FFTW's perspective.
+
+
+File: fftw3.info,  Node: MPI Data Distribution Functions,  Next: MPI Plan Creation,  Prev: Using MPI Plans,  Up: FFTW MPI Reference
+
+6.12.4 MPI Data Distribution Functions
+--------------------------------------
+
+As described above (*note MPI Data Distribution::), in order to
+allocate your arrays, _before_ creating a plan, you must first call one
+of the following routines to determine the required allocation size and
+the portion of the array locally stored on a given process.  The
+`MPI_Comm' communicator passed here must be equivalent to the
+communicator used below for plan creation.
+
+   The basic interface for multidimensional transforms consists of the
+functions:
+
+     ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                      MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                                   ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+
+     ptrdiff_t fftw_mpi_local_size_2d_transposed(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                 ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+     ptrdiff_t fftw_mpi_local_size_3d_transposed(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                                 MPI_Comm comm,
+                                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                 ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+     ptrdiff_t fftw_mpi_local_size_transposed(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                                              ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                              ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+
+   These functions return the number of elements to allocate (complex
+numbers for DFT/r2c/c2r plans, real numbers for r2r plans), whereas the
+`local_n0' and `local_0_start' return the portion (`local_0_start' to
+`local_0_start + local_n0 - 1') of the first dimension of an n[0] x
+n[1] x n[2] x ... x n[d-1]  array that is stored on the local process.
+*Note Basic and advanced distribution interfaces::.  For
+`FFTW_MPI_TRANSPOSED_OUT' plans, the `_transposed' variants are useful
+in order to also return the local portion of the first dimension in the
+n[1] x n[0] x n[2] x ... x n[d-1]  transposed output.  *Note Transposed
+distributions::.  The advanced interface for multidimensional
+transforms is:
+
+     ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                        ptrdiff_t block0, MPI_Comm comm,
+                                        ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size_many_transposed(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                                   ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm,
+                                                   ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                   ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+
+   These differ from the basic interface in only two ways.  First, they
+allow you to specify block sizes `block0' and `block1' (the latter for
+the transposed output); you can pass `FFTW_MPI_DEFAULT_BLOCK' to use
+FFTW's default block size as in the basic interface.  Second, you can
+pass a `howmany' parameter, corresponding to the advanced planning
+interface below: this is for transforms of contiguous `howmany'-tuples
+of numbers (`howmany = 1' in the basic interface).
+
+   The corresponding basic and advanced routines for one-dimensional
+transforms (currently only complex DFTs) are:
+
+     ptrdiff_t fftw_mpi_local_size_1d(
+                  ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags,
+                  ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                  ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+     ptrdiff_t fftw_mpi_local_size_many_1d(
+                  ptrdiff_t n0, ptrdiff_t howmany,
+                  MPI_Comm comm, int sign, unsigned flags,
+                  ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                  ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+
+   As above, the return value is the number of elements to allocate
+(complex numbers, for complex DFTs).  The `local_ni' and
+`local_i_start' arguments return the portion (`local_i_start' to
+`local_i_start + local_ni - 1') of the 1d array that is stored on this
+process for the transform _input_, and `local_no' and `local_o_start'
+are the corresponding quantities for the input.  The `sign'
+(`FFTW_FORWARD' or `FFTW_BACKWARD') and `flags' must match the
+arguments passed when creating a plan.  Although the inputs and outputs
+have different data distributions in general, it is guaranteed that the
+_output_ data distribution of an `FFTW_FORWARD' plan will match the
+_input_ data distribution of an `FFTW_BACKWARD' plan and vice versa;
+similarly for the `FFTW_MPI_SCRAMBLED_OUT' and `FFTW_MPI_SCRAMBLED_IN'
+flags.  *Note One-dimensional distributions::.
+
+
+File: fftw3.info,  Node: MPI Plan Creation,  Next: MPI Wisdom Communication,  Prev: MPI Data Distribution Functions,  Up: FFTW MPI Reference
+
+6.12.5 MPI Plan Creation
+------------------------
+
+Complex-data MPI DFTs
+.....................
+
+Plans for complex-data DFTs (*note 2d MPI example::) are created by:
+
+     fftw_plan fftw_mpi_plan_dft_1d(ptrdiff_t n0, fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                    fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                    fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft(int rnk, const ptrdiff_t *n,
+                                 fftw_complex *in, fftw_complex *out,
+                                 MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_dft(int rnk, const ptrdiff_t *n,
+                                      ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock,
+                                      fftw_complex *in, fftw_complex *out,
+                                      MPI_Comm comm, int sign, unsigned flags);
+
+   These are similar to their serial counterparts (*note Complex DFTs::)
+in specifying the dimensions, sign, and flags of the transform.  The
+`comm' argument gives an MPI communicator that specifies the set of
+processes to participate in the transform; plan creation is a
+collective function that must be called for all processes in the
+communicator.  The `in' and `out' pointers refer only to a portion of
+the overall transform data (*note MPI Data Distribution::) as specified
+by the `local_size' functions in the previous section.  Unless `flags'
+contains `FFTW_ESTIMATE', these arrays are overwritten during plan
+creation as for the serial interface.  For multi-dimensional
+transforms, any dimensions `> 1' are supported; for one-dimensional
+transforms, only composite (non-prime) `n0' are currently supported
+(unlike the serial FFTW).  Requesting an unsupported transform size
+will yield a `NULL' plan.  (As in the serial interface, highly
+composite sizes generally yield the best performance.)
+
+   The advanced-interface `fftw_mpi_plan_many_dft' additionally allows
+you to specify the block sizes for the first dimension (`block') of the
+n[0] x n[1] x n[2] x ... x n[d-1]  input data and the first dimension
+(`tblock') of the n[1] x n[0] x n[2] x ... x n[d-1]  transposed data
+(at intermediate steps of the transform, and for the output if
+`FFTW_TRANSPOSED_OUT' is specified in `flags').  These must be the same
+block sizes as were passed to the corresponding `local_size' function;
+you can pass `FFTW_MPI_DEFAULT_BLOCK' to use FFTW's default block size
+as in the basic interface.  Also, the `howmany' parameter specifies
+that the transform is of contiguous `howmany'-tuples rather than
+individual complex numbers; this corresponds to the same parameter in
+the serial advanced interface (*note Advanced Complex DFTs::) with
+`stride = howmany' and `dist = 1'.
+
+MPI flags
+.........
+
+The `flags' can be any of those for the serial FFTW (*note Planner
+Flags::), and in addition may include one or more of the following
+MPI-specific flags, which improve performance at the cost of changing
+the output or input data formats.
+
+   * `FFTW_MPI_SCRAMBLED_OUT', `FFTW_MPI_SCRAMBLED_IN': valid for 1d
+     transforms only, these flags indicate that the output/input of the
+     transform are in an undocumented "scrambled" order.  A forward
+     `FFTW_MPI_SCRAMBLED_OUT' transform can be inverted by a backward
+     `FFTW_MPI_SCRAMBLED_IN' (times the usual 1/N normalization).
+     *Note One-dimensional distributions::.
+
+   * `FFTW_MPI_TRANSPOSED_OUT', `FFTW_MPI_TRANSPOSED_IN': valid for
+     multidimensional (`rnk > 1') transforms only, these flags specify
+     that the output or input of an n[0] x n[1] x n[2] x ... x n[d-1]
+     transform is transposed to n[1] x n[0] x n[2] x ... x n[d-1] .
+     *Note Transposed distributions::.
+
+
+Real-data MPI DFTs
+..................
+
+Plans for real-input/output (r2c/c2r) DFTs (*note Multi-dimensional MPI
+DFTs of Real Data::) are created by:
+
+     fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c(int rnk, const ptrdiff_t *n,
+                                     double *in, fftw_complex *out,
+                                     MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r(int rnk, const ptrdiff_t *n,
+                                     fftw_complex *in, double *out,
+                                     MPI_Comm comm, unsigned flags);
+
+   Similar to the serial interface (*note Real-data DFTs::), these
+transform logically n[0] x n[1] x n[2] x ... x n[d-1]  real data
+to/from n[0] x n[1] x n[2] x ... x (n[d-1]/2 + 1)  complex data,
+representing the non-redundant half of the conjugate-symmetry output of
+a real-input DFT (*note Multi-dimensional Transforms::).  However, the
+real array must be stored within a padded n[0] x n[1] x n[2] x ... x [2
+(n[d-1]/2 + 1)]
+
+   array (much like the in-place serial r2c transforms, but here for
+out-of-place transforms as well). Currently, only multi-dimensional
+(`rnk > 1') r2c/c2r transforms are supported (requesting a plan for
+`rnk = 1' will yield `NULL').  As explained above (*note
+Multi-dimensional MPI DFTs of Real Data::), the data distribution of
+both the real and complex arrays is given by the `local_size' function
+called for the dimensions of the _complex_ array.  Similar to the other
+planning functions, the input and output arrays are overwritten when
+the plan is created except in `FFTW_ESTIMATE' mode.
+
+   As for the complex DFTs above, there is an advance interface that
+allows you to manually specify block sizes and to transform contiguous
+`howmany'-tuples of real/complex numbers:
+
+     fftw_plan fftw_mpi_plan_many_dft_r2c
+                   (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                    ptrdiff_t iblock, ptrdiff_t oblock,
+                    double *in, fftw_complex *out,
+                    MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_dft_c2r
+                   (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                    ptrdiff_t iblock, ptrdiff_t oblock,
+                    fftw_complex *in, double *out,
+                    MPI_Comm comm, unsigned flags);
+
+MPI r2r transforms
+..................
+
+There are corresponding plan-creation routines for r2r transforms
+(*note More DFTs of Real Data::), currently supporting multidimensional
+(`rnk > 1') transforms only (`rnk = 1' will yield a `NULL' plan):
+
+     fftw_plan fftw_mpi_plan_r2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                    double *in, double *out,
+                                    MPI_Comm comm,
+                                    fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                    unsigned flags);
+     fftw_plan fftw_mpi_plan_r2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                    double *in, double *out,
+                                    MPI_Comm comm,
+                                    fftw_r2r_kind kind0, fftw_r2r_kind kind1, fftw_r2r_kind kind2,
+                                    unsigned flags);
+     fftw_plan fftw_mpi_plan_r2r(int rnk, const ptrdiff_t *n,
+                                 double *in, double *out,
+                                 MPI_Comm comm, const fftw_r2r_kind *kind,
+                                 unsigned flags);
+     fftw_plan fftw_mpi_plan_many_r2r(int rnk, const ptrdiff_t *n,
+                                      ptrdiff_t iblock, ptrdiff_t oblock,
+                                      double *in, double *out,
+                                      MPI_Comm comm, const fftw_r2r_kind *kind,
+                                      unsigned flags);
+
+   The parameters are much the same as for the complex DFTs above,
+except that the arrays are of real numbers (and hence the outputs of the
+`local_size' data-distribution functions should be interpreted as
+counts of real rather than complex numbers).  Also, the `kind'
+parameters specify the r2r kinds along each dimension as for the serial
+interface (*note Real-to-Real Transform Kinds::).  *Note Other
+Multi-dimensional Real-data MPI Transforms::.
+
+MPI transposition
+.................
+
+FFTW also provides routines to plan a transpose of a distributed `n0'
+by `n1' array of real numbers, or an array of `howmany'-tuples of real
+numbers with specified block sizes (*note FFTW MPI Transposes::):
+
+     fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                       double *in, double *out,
+                                       MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_transpose
+                     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                      ptrdiff_t block0, ptrdiff_t block1,
+                      double *in, double *out, MPI_Comm comm, unsigned flags);
+
+   These plans are used with the `fftw_mpi_execute_r2r' new-array
+execute function (*note Using MPI Plans::), since they count as (rank
+zero) r2r plans from FFTW's perspective.
+
+
+File: fftw3.info,  Node: MPI Wisdom Communication,  Prev: MPI Plan Creation,  Up: FFTW MPI Reference
+
+6.12.6 MPI Wisdom Communication
+-------------------------------
+
+To facilitate synchronizing wisdom among the different MPI processes,
+we provide two functions:
+
+     void fftw_mpi_gather_wisdom(MPI_Comm comm);
+     void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+
+   The `fftw_mpi_gather_wisdom' function gathers all wisdom in the
+given communicator `comm' to the process of rank 0 in the communicator:
+that process obtains the union of all wisdom on all the processes.  As
+a side effect, some other processes will gain additional wisdom from
+other processes, but only process 0 will gain the complete union.
+
+   The `fftw_mpi_broadcast_wisdom' does the reverse: it exports wisdom
+from process 0 in `comm' to all other processes in the communicator,
+replacing any wisdom they currently have.
+
+   *Note FFTW MPI Wisdom::.
+
+
+File: fftw3.info,  Node: FFTW MPI Fortran Interface,  Prev: FFTW MPI Reference,  Up: Distributed-memory FFTW with MPI
+
+6.13 FFTW MPI Fortran Interface
+===============================
+
+The FFTW MPI interface is callable from modern Fortran compilers
+supporting the Fortran 2003 `iso_c_binding' standard for calling C
+functions.  As described in *note Calling FFTW from Modern Fortran::,
+this means that you can directly call FFTW's C interface from Fortran
+with only minor changes in syntax.  There are, however, a few things
+specific to the MPI interface to keep in mind:
+
+   * Instead of including `fftw3.f03' as in *note Overview of Fortran
+     interface::, you should `include 'fftw3-mpi.f03'' (after `use,
+     intrinsic :: iso_c_binding' as before).  The `fftw3-mpi.f03' file
+     includes `fftw3.f03', so you should _not_ `include' them both
+     yourself.  (You will also want to include the MPI header file,
+     usually via `include 'mpif.h'' or similar, although though this is
+     not needed by `fftw3-mpi.f03' per se.)  (To use the `fftwl_' `long
+     double' extended-precision routines in supporting compilers, you
+     should include `fftw3f-mpi.f03' in _addition_ to `fftw3-mpi.f03'.
+     *Note Extended and quadruple precision in Fortran::.)
+
+   * Because of the different storage conventions between C and Fortran,
+     you reverse the order of your array dimensions when passing them to
+     FFTW (*note Reversing array dimensions::).  This is merely a
+     difference in notation and incurs no performance overhead.
+     However, it means that, whereas in C the _first_ dimension is
+     distributed, in Fortran the _last_ dimension of your array is
+     distributed.
+
+   * In Fortran, communicators are stored as `integer' types; there is
+     no `MPI_Comm' type, nor is there any way to access a C `MPI_Comm'.
+     Fortunately, this is taken care of for you by the FFTW Fortran
+     interface: whenever the C interface expects an `MPI_Comm' type,
+     you should pass the Fortran communicator as an `integer'.(1)
+
+   * Because you need to call the `local_size' function to find out how
+     much space to allocate, and this may be _larger_ than the local
+     portion of the array (*note MPI Data Distribution::), you should
+     _always_ allocate your arrays dynamically using FFTW's allocation
+     routines as described in *note Allocating aligned memory in
+     Fortran::.  (Coincidentally, this also provides the best
+     performance by guaranteeding proper data alignment.)
+
+   * Because all sizes in the MPI FFTW interface are declared as
+     `ptrdiff_t' in C, you should use `integer(C_INTPTR_T)' in Fortran
+     (*note FFTW Fortran type reference::).
+
+   * In Fortran, because of the language semantics, we generally
+     recommend using the new-array execute functions for all plans,
+     even in the common case where you are executing the plan on the
+     same arrays for which the plan was created (*note Plan execution
+     in Fortran::).  However, note that in the MPI interface these
+     functions are changed: `fftw_execute_dft' becomes
+     `fftw_mpi_execute_dft', etcetera. *Note Using MPI Plans::.
+
+
+   For example, here is a Fortran code snippet to perform a distributed
+L x M  complex DFT in-place.  (This assumes you have already
+initialized MPI with `MPI_init' and have also performed `call
+fftw_mpi_init'.)
+
+       use, intrinsic :: iso_c_binding
+       include 'fftw3-mpi.f03'
+       integer(C_INTPTR_T), parameter :: L = ...
+       integer(C_INTPTR_T), parameter :: M = ...
+       type(C_PTR) :: plan, cdata
+       complex(C_DOUBLE_COMPLEX), pointer :: data(:,:)
+       integer(C_INTPTR_T) :: i, j, alloc_local, local_M, local_j_offset
+
+     !   get local data size and allocate (note dimension reversal)
+       alloc_local = fftw_mpi_local_size_2d(M, L, MPI_COMM_WORLD, &
+                                            local_M, local_j_offset)
+       cdata = fftw_alloc_complex(alloc_local)
+       call c_f_pointer(cdata, data, [L,local_M])
+
+     !   create MPI plan for in-place forward DFT (note dimension reversal)
+       plan = fftw_mpi_plan_dft_2d(M, L, data, data, MPI_COMM_WORLD, &
+                                   FFTW_FORWARD, FFTW_MEASURE)
+
+     ! initialize data to some function my_function(i,j)
+       do j = 1, local_M
+         do i = 1, L
+           data(i, j) = my_function(i, j + local_j_offset)
+         end do
+       end do
+
+     ! compute transform (as many times as desired)
+       call fftw_mpi_execute_dft(plan, data, data)
+
+       call fftw_destroy_plan(plan)
+       call fftw_free(cdata)
+
+   Note that when we called `fftw_mpi_local_size_2d' and
+`fftw_mpi_plan_dft_2d' with the dimensions in reversed order, since a L
+x M  Fortran array is viewed by FFTW in C as a M x L  array.  This
+means that the array was distributed over the `M' dimension, the local
+portion of which is a L x local_M  array in Fortran.  (You must _not_
+use an `allocate' statement to allocate an L x local_M  array, however;
+you must allocate `alloc_local' complex numbers, which may be greater
+than `L * local_M', in order to reserve space for intermediate steps of
+the transform.)  Finally, we mention that because C's array indices are
+zero-based, the `local_j_offset' argument can conveniently be
+interpreted as an offset in the 1-based `j' index (rather than as a
+starting index as in C).
+
+   If instead you had used the `ior(FFTW_MEASURE,
+FFTW_MPI_TRANSPOSED_OUT)' flag, the output of the transform would be a
+transposed M x local_L  array, associated with the _same_ `cdata'
+allocation (since the transform is in-place), and which you could
+declare with:
+
+       complex(C_DOUBLE_COMPLEX), pointer :: tdata(:,:)
+       ...
+       call c_f_pointer(cdata, tdata, [M,local_L])
+
+   where `local_L' would have been obtained by changing the
+`fftw_mpi_local_size_2d' call to:
+
+       alloc_local = fftw_mpi_local_size_2d_transposed(M, L, MPI_COMM_WORLD, &
+                                local_M, local_j_offset, local_L, local_i_offset)
+
+   ---------- Footnotes ----------
+
+   (1) Technically, this is because you aren't actually calling the C
+functions directly. You are calling wrapper functions that translate
+the communicator with `MPI_Comm_f2c' before calling the ordinary C
+interface.  This is all done transparently, however, since the
+`fftw3-mpi.f03' interface file renames the wrappers so that they are
+called in Fortran with the same names as the C interface functions.
+
+
+File: fftw3.info,  Node: Calling FFTW from Modern Fortran,  Next: Calling FFTW from Legacy Fortran,  Prev: Distributed-memory FFTW with MPI,  Up: Top
+
+7 Calling FFTW from Modern Fortran
+**********************************
+
+Fortran 2003 standardized ways for Fortran code to call C libraries,
+and this allows us to support a direct translation of the FFTW C API
+into Fortran.  Compared to the legacy Fortran 77 interface (*note
+Calling FFTW from Legacy Fortran::), this direct interface offers many
+advantages, especially compile-time type-checking and aligned memory
+allocation.  As of this writing, support for these C interoperability
+features seems widespread, having been implemented in nearly all major
+Fortran compilers (e.g. GNU, Intel, IBM, Oracle/Solaris, Portland
+Group, NAG).  
+
+   This chapter documents that interface.  For the most part, since this
+interface allows Fortran to call the C interface directly, the usage is
+identical to C translated to Fortran syntax.  However, there are a few
+subtle points such as memory allocation, wisdom, and data types that
+deserve closer attention.
+
+* Menu:
+
+* Overview of Fortran interface::
+* Reversing array dimensions::
+* FFTW Fortran type reference::
+* Plan execution in Fortran::
+* Allocating aligned memory in Fortran::
+* Accessing the wisdom API from Fortran::
+* Defining an FFTW module::
+
+
+File: fftw3.info,  Node: Overview of Fortran interface,  Next: Reversing array dimensions,  Prev: Calling FFTW from Modern Fortran,  Up: Calling FFTW from Modern Fortran
+
+7.1 Overview of Fortran interface
+=================================
+
+FFTW provides a file `fftw3.f03' that defines Fortran 2003 interfaces
+for all of its C routines, except for the MPI routines described
+elsewhere, which can be found in the same directory as `fftw3.h' (the C
+header file).  In any Fortran subroutine where you want to use FFTW
+functions, you should begin with:
+
+       use, intrinsic :: iso_c_binding
+       include 'fftw3.f03'
+
+   This includes the interface definitions and the standard
+`iso_c_binding' module (which defines the equivalents of C types).  You
+can also put the FFTW functions into a module if you prefer (*note
+Defining an FFTW module::).
+
+   At this point, you can now call anything in the FFTW C interface
+directly, almost exactly as in C other than minor changes in syntax.
+For example:
+
+       type(C_PTR) :: plan
+       complex(C_DOUBLE_COMPLEX), dimension(1024,1000) :: in, out
+       plan = fftw_plan_dft_2d(1000,1024, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft(plan, in, out)
+       ...
+       call fftw_destroy_plan(plan)
+
+   A few important things to keep in mind are:
+
+   * FFTW plans are `type(C_PTR)'.  Other C types are mapped in the
+     obvious way via the `iso_c_binding' standard: `int' turns into
+     `integer(C_INT)', `fftw_complex' turns into
+     `complex(C_DOUBLE_COMPLEX)', `double' turns into `real(C_DOUBLE)',
+     and so on. *Note FFTW Fortran type reference::.
+
+   * Functions in C become functions in Fortran if they have a return
+     value, and subroutines in Fortran otherwise.
+
+   * The ordering of the Fortran array dimensions must be _reversed_
+     when they are passed to the FFTW plan creation, thanks to
+     differences in array indexing conventions (*note Multi-dimensional
+     Array Format::).  This is _unlike_ the legacy Fortran interface
+     (*note Fortran-interface routines::), which reversed the dimensions
+     for you.  *Note Reversing array dimensions::.
+
+   * Using ordinary Fortran array declarations like this works, but may
+     yield suboptimal performance because the data may not be not
+     aligned to exploit SIMD instructions on modern proessors (*note
+     SIMD alignment and fftw_malloc::). Better performance will often
+     be obtained by allocating with `fftw_alloc'. *Note Allocating
+     aligned memory in Fortran::.
+
+   * Similar to the legacy Fortran interface (*note FFTW Execution in
+     Fortran::), we currently recommend _not_ using `fftw_execute' but
+     rather using the more specialized functions like
+     `fftw_execute_dft' (*note New-array Execute Functions::).
+     However, you should execute the plan on the `same arrays' as the
+     ones for which you created the plan, unless you are especially
+     careful.  *Note Plan execution in Fortran::.  To prevent you from
+     using `fftw_execute' by mistake, the `fftw3.f03' file does not
+     provide an `fftw_execute' interface declaration.
+
+   * Multiple planner flags are combined with `ior' (equivalent to `|'
+     in C).  e.g. `FFTW_MEASURE | FFTW_DESTROY_INPUT' becomes
+     `ior(FFTW_MEASURE, FFTW_DESTROY_INPUT)'.  (You can also use `+' as
+     long as you don't try to include a given flag more than once.)
+
+
+* Menu:
+
+* Extended and quadruple precision in Fortran::
+
+
+File: fftw3.info,  Node: Extended and quadruple precision in Fortran,  Prev: Overview of Fortran interface,  Up: Overview of Fortran interface
+
+7.1.1 Extended and quadruple precision in Fortran
+-------------------------------------------------
+
+If FFTW is compiled in `long double' (extended) precision (*note
+Installation and Customization::), you may be able to call the
+resulting `fftwl_' routines (*note Precision::) from Fortran if your
+compiler supports the `C_LONG_DOUBLE_COMPLEX' type code.
+
+   Because some Fortran compilers do not support
+`C_LONG_DOUBLE_COMPLEX', the `fftwl_' declarations are segregated into
+a separate interface file `fftw3l.f03', which you should include _in
+addition_ to `fftw3.f03' (which declares precision-independent `FFTW_'
+constants):
+
+       use, intrinsic :: iso_c_binding
+       include 'fftw3.f03'
+       include 'fftw3l.f03'
+
+   We also support using the nonstandard `__float128'
+quadruple-precision type provided by recent versions of `gcc' on 32-
+and 64-bit x86 hardware (*note Installation and Customization::), using
+the corresponding `real(16)' and `complex(16)' types supported by
+`gfortran'.  The quadruple-precision `fftwq_' functions (*note
+Precision::) are declared in a `fftw3q.f03' interface file, which
+should be included in addition to `fftw3l.f03', as above.  You should
+also link with `-lfftw3q -lquadmath -lm' as in C.
+
+
+File: fftw3.info,  Node: Reversing array dimensions,  Next: FFTW Fortran type reference,  Prev: Overview of Fortran interface,  Up: Calling FFTW from Modern Fortran
+
+7.2 Reversing array dimensions
+==============================
+
+A minor annoyance in calling FFTW from Fortran is that FFTW's array
+dimensions are defined in the C convention (row-major order), while
+Fortran's array dimensions are the opposite convention (column-major
+order). *Note Multi-dimensional Array Format::.  This is just a
+bookkeeping difference, with no effect on performance.  The only
+consequence of this is that, whenever you create an FFTW plan for a
+multi-dimensional transform, you must always _reverse the ordering of
+the dimensions_.
+
+   For example, consider the three-dimensional (L x M x N ) arrays:
+
+       complex(C_DOUBLE_COMPLEX), dimension(L,M,N) :: in, out
+
+   To plan a DFT for these arrays using `fftw_plan_dft_3d', you could
+do:
+
+       plan = fftw_plan_dft_3d(N,M,L, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+
+   That is, from FFTW's perspective this is a N x M x L  array.  _No
+data transposition need occur_, as this is _only notation_.  Similarly,
+to use the more generic routine `fftw_plan_dft' with the same arrays,
+you could do:
+
+       integer(C_INT), dimension(3) :: n = [N,M,L]
+       plan = fftw_plan_dft_3d(3, n, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+
+   Note, by the way, that this is different from the legacy Fortran
+interface (*note Fortran-interface routines::), which automatically
+reverses the order of the array dimension for you.  Here, you are
+calling the C interface directly, so there is no "translation" layer.
+
+   An important thing to keep in mind is the implication of this for
+multidimensional real-to-complex transforms (*note Multi-Dimensional
+DFTs of Real Data::).  In C, a multidimensional real-to-complex DFT
+chops the last dimension roughly in half (N x M x L  real input goes to
+N x M x L/2+1  complex output).  In Fortran, because the array
+dimension notation is reversed, the _first_ dimension of the complex
+data is chopped roughly in half.  For example consider the `r2c'
+transform of L x M x N  real input in Fortran:
+
+       type(C_PTR) :: plan
+       real(C_DOUBLE), dimension(L,M,N) :: in
+       complex(C_DOUBLE_COMPLEX), dimension(L/2+1,M,N) :: out
+       plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft_r2c(plan, in, out)
+
+   Alternatively, for an in-place r2c transform, as described in the C
+documentation we must _pad_ the _first_ dimension of the real input
+with an extra two entries (which are ignored by FFTW) so as to leave
+enough space for the complex output. The input is _allocated_ as a
+2[L/2+1] x M x N  array, even though only L x M x N  of it is actually
+used.  In this example, we will allocate the array as a pointer type,
+using `fftw_alloc' to ensure aligned memory for maximum performance
+(*note Allocating aligned memory in Fortran::); this also makes it easy
+to reference the same memory as both a real array and a complex array.
+
+       real(C_DOUBLE), pointer :: in(:,:,:)
+       complex(C_DOUBLE_COMPLEX), pointer :: out(:,:,:)
+       type(C_PTR) :: plan, data
+       data = fftw_alloc_complex(int((L/2+1) * M * N, C_SIZE_T))
+       call c_f_pointer(data, in, [2*(L/2+1),M,N])
+       call c_f_pointer(data, out, [L/2+1,M,N])
+       plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft_r2c(plan, in, out)
+       ...
+       call fftw_destroy_plan(plan)
+       call fftw_free(data)
+
+
+File: fftw3.info,  Node: FFTW Fortran type reference,  Next: Plan execution in Fortran,  Prev: Reversing array dimensions,  Up: Calling FFTW from Modern Fortran
+
+7.3 FFTW Fortran type reference
+===============================
+
+The following are the most important type correspondences between the C
+interface and Fortran:
+
+   * Plans (`fftw_plan' and variants) are `type(C_PTR)' (i.e. an opaque
+     pointer).
+
+   * The C floating-point types `double', `float', and `long double'
+     correspond to `real(C_DOUBLE)', `real(C_FLOAT)', and
+     `real(C_LONG_DOUBLE)', respectively.  The C complex types
+     `fftw_complex', `fftwf_complex', and `fftwl_complex' correspond in
+     Fortran to `complex(C_DOUBLE_COMPLEX)',
+     `complex(C_FLOAT_COMPLEX)', and `complex(C_LONG_DOUBLE_COMPLEX)',
+     respectively.  Just as in C (*note Precision::), the FFTW
+     subroutines and types are prefixed with `fftw_', `fftwf_', and
+     `fftwl_' for the different precisions, and link to different
+     libraries (`-lfftw3', `-lfftw3f', and `-lfftw3l' on Unix), but use
+     the _same_ include file `fftw3.f03' and the _same_ constants (all
+     of which begin with `FFTW_').  The exception is `long double'
+     precision, for which you should _also_ include `fftw3l.f03' (*note
+     Extended and quadruple precision in Fortran::).
+
+   * The C integer types `int' and `unsigned' (used for planner flags)
+     become `integer(C_INT)'.  The C integer type `ptrdiff_t' (e.g. in
+     the *note 64-bit Guru Interface::) becomes `integer(C_INTPTR_T)',
+     and `size_t' (in `fftw_malloc' etc.) becomes `integer(C_SIZE_T)'.
+
+   * The `fftw_r2r_kind' type (*note Real-to-Real Transform Kinds::)
+     becomes `integer(C_FFTW_R2R_KIND)'.  The various constant values
+     of the C enumerated type (`FFTW_R2HC' etc.) become simply integer
+     constants of the same names in Fortran.
+
+   * Numeric array pointer arguments (e.g. `double *') become
+     `dimension(*), intent(out)' arrays of the same type, or
+     `dimension(*), intent(in)' if they are pointers to constant data
+     (e.g. `const int *').  There are a few exceptions where numeric
+     pointers refer to scalar outputs (e.g. for `fftw_flops'), in which
+     case they are `intent(out)' scalar arguments in Fortran too.  For
+     the new-array execute functions (*note New-array Execute
+     Functions::), the input arrays are declared `dimension(*),
+     intent(inout)', since they can be modified in the case of in-place
+     or `FFTW_DESTROY_INPUT' transforms.
+
+   * Pointer _return_ values (e.g `double *') become `type(C_PTR)'.
+     (If they are pointers to arrays, as for `fftw_alloc_real', you can
+     convert them back to Fortran array pointers with the standard
+     intrinsic function `c_f_pointer'.)
+
+   * The `fftw_iodim' type in the guru interface (*note Guru vector and
+     transform sizes::) becomes `type(fftw_iodim)' in Fortran, a
+     derived data type (the Fortran analogue of C's `struct') with
+     three `integer(C_INT)' components: `n', `is', and `os', with the
+     same meanings as in C.  The `fftw_iodim64' type in the 64-bit guru
+     interface (*note 64-bit Guru Interface::) is the same, except that
+     its components are of type `integer(C_INTPTR_T)'.
+
+   * Using the wisdom import/export functions from Fortran is a bit
+     tricky, and is discussed in *note Accessing the wisdom API from
+     Fortran::.  In brief, the `FILE *' arguments map to `type(C_PTR)',
+     `const char *' to `character(C_CHAR), dimension(*), intent(in)'
+     (null-terminated!), and the generic read-char/write-char functions
+     map to `type(C_FUNPTR)'.
+
+
+   You may be wondering if you need to search-and-replace
+`real(kind(0.0d0))' (or whatever your favorite Fortran spelling of
+"double precision" is) with `real(C_DOUBLE)' everywhere in your
+program, and similarly for `complex' and `integer' types.  The answer
+is no; you can still use your existing types.  As long as these types
+match their C counterparts, things should work without a hitch.  The
+worst that can happen, e.g. in the (unlikely) event of a system where
+`real(kind(0.0d0))' is different from `real(C_DOUBLE)', is that the
+compiler will give you a type-mismatch error.  That is, if you don't
+use the `iso_c_binding' kinds you need to accept at least the
+theoretical possibility of having to change your code in response to
+compiler errors on some future machine, but you don't need to worry
+about silently compiling incorrect code that yields runtime errors.
+
+
+File: fftw3.info,  Node: Plan execution in Fortran,  Next: Allocating aligned memory in Fortran,  Prev: FFTW Fortran type reference,  Up: Calling FFTW from Modern Fortran
+
+7.4 Plan execution in Fortran
+=============================
+
+In C, in order to use a plan, one normally calls `fftw_execute', which
+executes the plan to perform the transform on the input/output arrays
+passed when the plan was created (*note Using Plans::).  The
+corresponding subroutine call in modern Fortran is:
+      call fftw_execute(plan)
+   
+   However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+`fftw_execute', the semantics of Fortran (unlike C) allow the compiler
+to assume that the input/output arrays are not changed by
+`fftw_execute'.  As a consequence, certain compilers end up
+repositioning the call to `fftw_execute', assuming incorrectly that it
+does nothing to the arrays.
+
+   There are various workarounds to this, but the safest and simplest
+thing is to not use `fftw_execute' in Fortran.  Instead, use the
+functions described in *note New-array Execute Functions::, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays `in' and
+`out', you would do:
+      call fftw_execute_dft(plan, in, out)
+   
+   There are a few things to be careful of, however:
+
+   * You must use the correct type of execute function, matching the way
+     the plan was created.  Complex DFT plans should use
+     `fftw_execute_dft', Real-input (r2c) DFT plans should use use
+     `fftw_execute_dft_r2c', and real-output (c2r) DFT plans should use
+     `fftw_execute_dft_c2r'.  The various r2r plans should use
+     `fftw_execute_r2r'.  Fortunately, if you use the wrong one you
+     will get a compile-time type-mismatch error (unlike legacy
+     Fortran).
+
+   * You should normally pass the same input/output arrays that were
+     used when creating the plan.  This is always safe.
+
+   * _If_ you pass _different_ input/output arrays compared to those
+     used when creating the plan, you must abide by all the
+     restrictions of the new-array execute functions (*note New-array
+     Execute Functions::).  The most tricky of these is the requirement
+     that the new arrays have the same alignment as the original
+     arrays; the best (and possibly only) way to guarantee this is to
+     use the `fftw_alloc' functions to allocate your arrays (*note
+     Allocating aligned memory in Fortran::). Alternatively, you can
+     use the `FFTW_UNALIGNED' flag when creating the plan, in which
+     case the plan does not depend on the alignment, but this may
+     sacrifice substantial performance on architectures (like x86) with
+     SIMD instructions (*note SIMD alignment and fftw_malloc::).  
+
+
+
+File: fftw3.info,  Node: Allocating aligned memory in Fortran,  Next: Accessing the wisdom API from Fortran,  Prev: Plan execution in Fortran,  Up: Calling FFTW from Modern Fortran
+
+7.5 Allocating aligned memory in Fortran
+========================================
+
+In order to obtain maximum performance in FFTW, you should store your
+data in arrays that have been specially aligned in memory (*note SIMD
+alignment and fftw_malloc::).  Enforcing alignment also permits you to
+safely use the new-array execute functions (*note New-array Execute
+Functions::) to apply a given plan to more than one pair of in/out
+arrays.  Unfortunately, standard Fortran arrays do _not_ provide any
+alignment guarantees.  The _only_ way to allocate aligned memory in
+standard Fortran is to allocate it with an external C function, like
+the `fftw_alloc_real' and `fftw_alloc_complex' functions.  Fortunately,
+Fortran 2003 provides a simple way to associate such allocated memory
+with a standard Fortran array pointer that you can then use normally.
+
+   We therefore recommend allocating all your input/output arrays using
+the following technique:
+
+  1. Declare a `pointer', `arr', to your array of the desired type and
+     dimensions.  For example, `real(C_DOUBLE), pointer :: a(:,:)' for
+     a 2d real array, or `complex(C_DOUBLE_COMPLEX), pointer ::
+     a(:,:,:)' for a 3d complex array.
+
+  2. The number of elements to allocate must be an `integer(C_SIZE_T)'.
+     You can either declare a variable of this type, e.g.
+     `integer(C_SIZE_T) :: sz', to store the number of elements to
+     allocate, or you can use the `int(..., C_SIZE_T)' intrinsic
+     function. e.g. set `sz = L * M * N' or use `int(L * M * N,
+     C_SIZE_T)' for an L x M x N  array.
+
+  3. Declare a `type(C_PTR) :: p' to hold the return value from FFTW's
+     allocation routine.  Set `p = fftw_alloc_real(sz)' for a real
+     array, or `p = fftw_alloc_complex(sz)' for a complex array.
+
+  4. Associate your pointer `arr' with the allocated memory `p' using
+     the standard `c_f_pointer' subroutine: `call c_f_pointer(p, arr,
+     [...dimensions...])', where `[...dimensions...])' are an array of
+     the dimensions of the array (in the usual Fortran order). e.g.
+     `call c_f_pointer(p, arr, [L,M,N])' for an L x M x N  array.
+     (Alternatively, you can omit the dimensions argument if you
+     specified the shape explicitly when declaring `arr'.)  You can now
+     use `arr' as a usual multidimensional array.
+
+  5. When you are done using the array, deallocate the memory by `call
+     fftw_free(p)' on `p'.
+
+
+   For example, here is how we would allocate an L x M  2d real array:
+
+       real(C_DOUBLE), pointer :: arr(:,:)
+       type(C_PTR) :: p
+       p = fftw_alloc_real(int(L * M, C_SIZE_T))
+       call c_f_pointer(p, arr, [L,M])
+       _...use arr and arr(i,j) as usual..._
+       call fftw_free(p)
+
+   and here is an L x M x N  3d complex array:
+
+       complex(C_DOUBLE_COMPLEX), pointer :: arr(:,:,:)
+       type(C_PTR) :: p
+       p = fftw_alloc_complex(int(L * M * N, C_SIZE_T))
+       call c_f_pointer(p, arr, [L,M,N])
+       _...use arr and arr(i,j,k) as usual..._
+       call fftw_free(p)
+
+   See *note Reversing array dimensions:: for an example allocating a
+single array and associating both real and complex array pointers with
+it, for in-place real-to-complex transforms.
+
+
+File: fftw3.info,  Node: Accessing the wisdom API from Fortran,  Next: Defining an FFTW module,  Prev: Allocating aligned memory in Fortran,  Up: Calling FFTW from Modern Fortran
+
+7.6 Accessing the wisdom API from Fortran
+=========================================
+
+As explained in *note Words of Wisdom-Saving Plans::, FFTW provides a
+"wisdom" API for saving plans to disk so that they can be recreated
+quickly.  The C API for exporting (*note Wisdom Export::) and importing
+(*note Wisdom Import::) wisdom is somewhat tricky to use from Fortran,
+however, because of differences in file I/O and string types between C
+and Fortran.
+
+* Menu:
+
+* Wisdom File Export/Import from Fortran::
+* Wisdom String Export/Import from Fortran::
+* Wisdom Generic Export/Import from Fortran::
+
+
+File: fftw3.info,  Node: Wisdom File Export/Import from Fortran,  Next: Wisdom String Export/Import from Fortran,  Prev: Accessing the wisdom API from Fortran,  Up: Accessing the wisdom API from Fortran
+
+7.6.1 Wisdom File Export/Import from Fortran
+--------------------------------------------
+
+The easiest way to export and import wisdom is to do so using
+`fftw_export_wisdom_to_filename' and `fftw_wisdom_from_filename'.  The
+only trick is that these require you to pass a C string, which is an
+array of type `CHARACTER(C_CHAR)' that is terminated by `C_NULL_CHAR'.
+You can call them like this:
+
+       integer(C_INT) :: ret
+       ret = fftw_export_wisdom_to_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+       if (ret .eq. 0) stop 'error exporting wisdom to file'
+       ret = fftw_import_wisdom_from_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+       if (ret .eq. 0) stop 'error importing wisdom from file'
+
+   Note that prepending `C_CHAR_' is needed to specify that the literal
+string is of kind `C_CHAR', and we null-terminate the string by
+appending `// C_NULL_CHAR'.  These functions return an `integer(C_INT)'
+(`ret') which is `0' if an error occurred during export/import and
+nonzero otherwise.
+
+   It is also possible to use the lower-level routines
+`fftw_export_wisdom_to_file' and `fftw_import_wisdom_from_file', which
+accept parameters of the C type `FILE*', expressed in Fortran as
+`type(C_PTR)'.  However, you are then responsible for creating the
+`FILE*' yourself.  You can do this by using `iso_c_binding' to define
+Fortran intefaces for the C library functions `fopen' and `fclose',
+which is a bit strange in Fortran but workable.
+
+
+File: fftw3.info,  Node: Wisdom String Export/Import from Fortran,  Next: Wisdom Generic Export/Import from Fortran,  Prev: Wisdom File Export/Import from Fortran,  Up: Accessing the wisdom API from Fortran
+
+7.6.2 Wisdom String Export/Import from Fortran
+----------------------------------------------
+
+Dealing with FFTW's C string export/import is a bit more painful.  In
+particular, the `fftw_export_wisdom_to_string' function requires you to
+deal with a dynamically allocated C string.  To get its length, you
+must define an interface to the C `strlen' function, and to deallocate
+it you must define an interface to C `free':
+
+       use, intrinsic :: iso_c_binding
+       interface
+         integer(C_INT) function strlen(s) bind(C, name='strlen')
+           import
+           type(C_PTR), value :: s
+         end function strlen
+         subroutine free(p) bind(C, name='free')
+           import
+           type(C_PTR), value :: p
+         end subroutine free
+       end interface
+
+   Given these definitions, you can then export wisdom to a Fortran
+character array:
+
+       character(C_CHAR), pointer :: s(:)
+       integer(C_SIZE_T) :: slen
+       type(C_PTR) :: p
+       p = fftw_export_wisdom_to_string()
+       if (.not. c_associated(p)) stop 'error exporting wisdom'
+       slen = strlen(p)
+       call c_f_pointer(p, s, [slen+1])
+       ...
+       call free(p)
+   
+   Note that `slen' is the length of the C string, but the length of
+the array is `slen+1' because it includes the terminating null
+character.  (You can omit the `+1' if you don't want Fortran to know
+about the null character.) The standard `c_associated' function checks
+whether `p' is a null pointer, which is returned by
+`fftw_export_wisdom_to_string' if there was an error.
+
+   To import wisdom from a string, use `fftw_import_wisdom_from_string'
+as usual; note that the argument of this function must be a
+`character(C_CHAR)' that is terminated by the `C_NULL_CHAR' character,
+like the `s' array above.
+
+
+File: fftw3.info,  Node: Wisdom Generic Export/Import from Fortran,  Prev: Wisdom String Export/Import from Fortran,  Up: Accessing the wisdom API from Fortran
+
+7.6.3 Wisdom Generic Export/Import from Fortran
+-----------------------------------------------
+
+The most generic wisdom export/import functions allow you to provide an
+arbitrary callback function to read/write one character at a time in
+any way you want.  However, your callback function must be written in a
+special way, using the `bind(C)' attribute to be passed to a C
+interface.
+
+   In particular, to call the generic wisdom export function
+`fftw_export_wisdom', you would write a callback subroutine of the form:
+
+       subroutine my_write_char(c, p) bind(C)
+         use, intrinsic :: iso_c_binding
+         character(C_CHAR), value :: c
+         type(C_PTR), value :: p
+         _...write c..._
+       end subroutine my_write_char
+
+   Given such a subroutine (along with the corresponding interface
+definition), you could then export wisdom using:
+
+       call fftw_export_wisdom(c_funloc(my_write_char), p)
+
+   The standard `c_funloc' intrinsic converts a Fortran `bind(C)'
+subroutine into a C function pointer.  The parameter `p' is a
+`type(C_PTR)' to any arbitrary data that you want to pass to
+`my_write_char' (or `C_NULL_PTR' if none).  (Note that you can get a C
+pointer to Fortran data using the intrinsic `c_loc', and convert it
+back to a Fortran pointer in `my_write_char' using `c_f_pointer'.)
+
+   Similarly, to use the generic `fftw_import_wisdom', you would define
+a callback function of the form:
+
+       integer(C_INT) function my_read_char(p) bind(C)
+         use, intrinsic :: iso_c_binding
+         type(C_PTR), value :: p
+         character :: c
+         _...read a character c..._
+         my_read_char = ichar(c, C_INT)
+       end function my_read_char
+
+       ....
+
+       integer(C_INT) :: ret
+       ret = fftw_import_wisdom(c_funloc(my_read_char), p)
+       if (ret .eq. 0) stop 'error importing wisdom'
+
+   Your function can return `-1' if the end of the input is reached.
+Again, `p' is an arbitrary `type(C_PTR' that is passed through to your
+function.  `fftw_import_wisdom' returns `0' if an error occurred and
+nonzero otherwise.
+
+
+File: fftw3.info,  Node: Defining an FFTW module,  Prev: Accessing the wisdom API from Fortran,  Up: Calling FFTW from Modern Fortran
+
+7.7 Defining an FFTW module
+===========================
+
+Rather than using the `include' statement to include the `fftw3.f03'
+interface file in any subroutine where you want to use FFTW, you might
+prefer to define an FFTW Fortran module.  FFTW does not install itself
+as a module, primarily because `fftw3.f03' can be shared between
+different Fortran compilers while modules (in general) cannot.
+However, it is trivial to define your own FFTW module if you want.
+Just create a file containing:
+
+       module FFTW3
+         use, intrinsic :: iso_c_binding
+         include 'fftw3.f03'
+       end module
+
+   Compile this file into a module as usual for your compiler (e.g. with
+`gfortran -c' you will get a file `fftw3.mod').  Now, instead of
+`include 'fftw3.f03'', whenever you want to use FFTW routines you can
+just do:
+
+       use FFTW3
+
+   as usual for Fortran modules.  (You still need to link to the FFTW
+library, of course.)
+
+
+File: fftw3.info,  Node: Calling FFTW from Legacy Fortran,  Next: Upgrading from FFTW version 2,  Prev: Calling FFTW from Modern Fortran,  Up: Top
+
+8 Calling FFTW from Legacy Fortran
+**********************************
+
+This chapter describes the interface to FFTW callable by Fortran code
+in older compilers not supporting the Fortran 2003 C interoperability
+features (*note Calling FFTW from Modern Fortran::).  This interface
+has the major disadvantage that it is not type-checked, so if you
+mistake the argument types or ordering then your program will not have
+any compiler errors, and will likely crash at runtime.  So, greater
+care is needed.  Also, technically interfacing older Fortran versions
+to C is nonstandard, but in practice we have found that the techniques
+used in this chapter have worked with all known Fortran compilers for
+many years.
+
+   The legacy Fortran interface differs from the C interface only in the
+prefix (`dfftw_' instead of `fftw_' in double precision) and a few
+other minor details.  This Fortran interface is included in the FFTW
+libraries by default, unless a Fortran compiler isn't found on your
+system or `--disable-fortran' is included in the `configure' flags.  We
+assume here that the reader is already familiar with the usage of FFTW
+in C, as described elsewhere in this manual.
+
+   The MPI parallel interface to FFTW is _not_ currently available to
+legacy Fortran.
+
+* Menu:
+
+* Fortran-interface routines::
+* FFTW Constants in Fortran::
+* FFTW Execution in Fortran::
+* Fortran Examples::
+* Wisdom of Fortran?::
+
+
+File: fftw3.info,  Node: Fortran-interface routines,  Next: FFTW Constants in Fortran,  Prev: Calling FFTW from Legacy Fortran,  Up: Calling FFTW from Legacy Fortran
+
+8.1 Fortran-interface routines
+==============================
+
+Nearly all of the FFTW functions have Fortran-callable equivalents.
+The name of the legacy Fortran routine is the same as that of the
+corresponding C routine, but with the `fftw_' prefix replaced by
+`dfftw_'.(1)  The single and long-double precision versions use
+`sfftw_' and `lfftw_', respectively, instead of `fftwf_' and `fftwl_';
+quadruple precision (`real*16') is available on some systems as
+`fftwq_' (*note Precision::).  (Note that `long double' on x86 hardware
+is usually at most 80-bit extended precision, _not_ quadruple
+precision.)
+
+   For the most part, all of the arguments to the functions are the
+same, with the following exceptions:
+
+   * `plan' variables (what would be of type `fftw_plan' in C), must be
+     declared as a type that is at least as big as a pointer (address)
+     on your machine.  We recommend using `integer*8' everywhere, since
+     this should always be big enough.  
+
+   * Any function that returns a value (e.g. `fftw_plan_dft') is
+     converted into a _subroutine_.  The return value is converted into
+     an additional _first_ parameter of this subroutine.(2)
+
+   * The Fortran routines expect multi-dimensional arrays to be in
+     _column-major_ order, which is the ordinary format of Fortran
+     arrays (*note Multi-dimensional Array Format::).  They do this
+     transparently and costlessly simply by reversing the order of the
+     dimensions passed to FFTW, but this has one important consequence
+     for multi-dimensional real-complex transforms, discussed below.
+
+   * Wisdom import and export is somewhat more tricky because one cannot
+     easily pass files or strings between C and Fortran; see *note
+     Wisdom of Fortran?::.
+
+   * Legacy Fortran cannot use the `fftw_malloc' dynamic-allocation
+     routine.  If you want to exploit the SIMD FFTW (*note SIMD
+     alignment and fftw_malloc::), you'll need to figure out some other
+     way to ensure that your arrays are at least 16-byte aligned.
+
+   * Since Fortran 77 does not have data structures, the `fftw_iodim'
+     structure from the guru interface (*note Guru vector and transform
+     sizes::) must be split into separate arguments.  In particular, any
+     `fftw_iodim' array arguments in the C guru interface become three
+     integer array arguments (`n', `is', and `os') in the Fortran guru
+     interface, all of whose lengths should be equal to the
+     corresponding `rank' argument.
+
+   * The guru planner interface in Fortran does _not_ do any automatic
+     translation between column-major and row-major; you are responsible
+     for setting the strides etcetera to correspond to your Fortran
+     arrays.  However, as a slight bug that we are preserving for
+     backwards compatibility, the `plan_guru_r2r' in Fortran _does_
+     reverse the order of its `kind' array parameter, so the `kind'
+     array of that routine should be in the reverse of the order of the
+     iodim arrays (see above).
+
+
+   In general, you should take care to use Fortran data types that
+correspond to (i.e. are the same size as) the C types used by FFTW.  In
+practice, this correspondence is usually straightforward (i.e.
+`integer' corresponds to `int', `real' corresponds to `float',
+etcetera).  The native Fortran double/single-precision complex type
+should be compatible with `fftw_complex'/`fftwf_complex'.  Such simple
+correspondences are assumed in the examples below.  
+
+   ---------- Footnotes ----------
+
+   (1) Technically, Fortran 77 identifiers are not allowed to have more
+than 6 characters, nor may they contain underscores.  Any compiler that
+enforces this limitation doesn't deserve to link to FFTW.
+
+   (2) The reason for this is that some Fortran implementations seem to
+have trouble with C function return values, and vice versa.
+
+
+File: fftw3.info,  Node: FFTW Constants in Fortran,  Next: FFTW Execution in Fortran,  Prev: Fortran-interface routines,  Up: Calling FFTW from Legacy Fortran
+
+8.2 FFTW Constants in Fortran
+=============================
+
+When creating plans in FFTW, a number of constants are used to specify
+options, such as `FFTW_MEASURE' or `FFTW_ESTIMATE'.  The same constants
+must be used with the wrapper routines, but of course the C header
+files where the constants are defined can't be incorporated directly
+into Fortran code.
+
+   Instead, we have placed Fortran equivalents of the FFTW constant
+definitions in the file `fftw3.f', which can be found in the same
+directory as `fftw3.h'.  If your Fortran compiler supports a
+preprocessor of some sort, you should be able to `include' or
+`#include' this file; otherwise, you can paste it directly into your
+code.
+
+   In C, you combine different flags (like `FFTW_PRESERVE_INPUT' and
+`FFTW_MEASURE') using the ``|'' operator; in Fortran you should just
+use ``+''.  (Take care not to add in the same flag more than once,
+though.  Alternatively, you can use the `ior' intrinsic function
+standardized in Fortran 95.)
+
+
+File: fftw3.info,  Node: FFTW Execution in Fortran,  Next: Fortran Examples,  Prev: FFTW Constants in Fortran,  Up: Calling FFTW from Legacy Fortran
+
+8.3 FFTW Execution in Fortran
+=============================
+
+In C, in order to use a plan, one normally calls `fftw_execute', which
+executes the plan to perform the transform on the input/output arrays
+passed when the plan was created (*note Using Plans::).  The
+corresponding subroutine call in legacy Fortran is:
+             call dfftw_execute(plan)
+   
+   However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+`dfftw_execute', the semantics of Fortran (unlike C) allow the compiler
+to assume that the input/output arrays are not changed by
+`dfftw_execute'.  As a consequence, certain compilers end up optimizing
+out or repositioning the call to `dfftw_execute', assuming incorrectly
+that it does nothing.
+
+   There are various workarounds to this, but the safest and simplest
+thing is to not use `dfftw_execute' in Fortran.  Instead, use the
+functions described in *note New-array Execute Functions::, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays `in' and
+`out', you would do:
+             call dfftw_execute_dft(plan, in, out)
+   
+   There are a few things to be careful of, however:
+
+   * You must use the correct type of execute function, matching the way
+     the plan was created.  Complex DFT plans should use
+     `dfftw_execute_dft', Real-input (r2c) DFT plans should use use
+     `dfftw_execute_dft_r2c', and real-output (c2r) DFT plans should
+     use `dfftw_execute_dft_c2r'.  The various r2r plans should use
+     `dfftw_execute_r2r'.
+
+   * You should normally pass the same input/output arrays that were
+     used when creating the plan.  This is always safe.
+
+   * _If_ you pass _different_ input/output arrays compared to those
+     used when creating the plan, you must abide by all the
+     restrictions of the new-array execute functions (*note New-array
+     Execute Functions::).  The most difficult of these, in Fortran, is
+     the requirement that the new arrays have the same alignment as the
+     original arrays, because there seems to be no way in legacy
+     Fortran to obtain guaranteed-aligned arrays (analogous to
+     `fftw_malloc' in C).  You can, of course, use the `FFTW_UNALIGNED'
+     flag when creating the plan, in which case the plan does not
+     depend on the alignment, but this may sacrifice substantial
+     performance on architectures (like x86) with SIMD instructions
+     (*note SIMD alignment and fftw_malloc::).  
+
+
+
+File: fftw3.info,  Node: Fortran Examples,  Next: Wisdom of Fortran?,  Prev: FFTW Execution in Fortran,  Up: Calling FFTW from Legacy Fortran
+
+8.4 Fortran Examples
+====================
+
+In C, you might have something like the following to transform a
+one-dimensional complex array:
+
+             fftw_complex in[N], out[N];
+             fftw_plan plan;
+
+             plan = fftw_plan_dft_1d(N,in,out,FFTW_FORWARD,FFTW_ESTIMATE);
+             fftw_execute(plan);
+             fftw_destroy_plan(plan);
+
+   In Fortran, you would use the following to accomplish the same thing:
+
+             double complex in, out
+             dimension in(N), out(N)
+             integer*8 plan
+
+             call dfftw_plan_dft_1d(plan,N,in,out,FFTW_FORWARD,FFTW_ESTIMATE)
+             call dfftw_execute_dft(plan, in, out)
+             call dfftw_destroy_plan(plan)
+   
+   Notice how all routines are called as Fortran subroutines, and the
+plan is returned via the first argument to `dfftw_plan_dft_1d'.  Notice
+also that we changed `fftw_execute' to `dfftw_execute_dft' (*note FFTW
+Execution in Fortran::).  To do the same thing, but using 8 threads in
+parallel (*note Multi-threaded FFTW::), you would simply prefix these
+calls with:
+
+             integer iret
+             call dfftw_init_threads(iret)
+             call dfftw_plan_with_nthreads(8)
+   
+   (You might want to check the value of `iret': if it is zero, it
+indicates an unlikely error during thread initialization.)
+
+   To transform a three-dimensional array in-place with C, you might do:
+
+             fftw_complex arr[L][M][N];
+             fftw_plan plan;
+
+             plan = fftw_plan_dft_3d(L,M,N, arr,arr,
+                                     FFTW_FORWARD, FFTW_ESTIMATE);
+             fftw_execute(plan);
+             fftw_destroy_plan(plan);
+
+   In Fortran, you would use this instead:
+
+             double complex arr
+             dimension arr(L,M,N)
+             integer*8 plan
+
+             call dfftw_plan_dft_3d(plan, L,M,N, arr,arr,
+            &                       FFTW_FORWARD, FFTW_ESTIMATE)
+             call dfftw_execute_dft(plan, arr, arr)
+             call dfftw_destroy_plan(plan)
+   
+   Note that we pass the array dimensions in the "natural" order in
+both C and Fortran.
+
+   To transform a one-dimensional real array in Fortran, you might do:
+
+             double precision in
+             dimension in(N)
+             double complex out
+             dimension out(N/2 + 1)
+             integer*8 plan
+
+             call dfftw_plan_dft_r2c_1d(plan,N,in,out,FFTW_ESTIMATE)
+             call dfftw_execute_dft_r2c(plan, in, out)
+             call dfftw_destroy_plan(plan)
+   
+   To transform a two-dimensional real array, out of place, you might
+use the following:
+
+             double precision in
+             dimension in(M,N)
+             double complex out
+             dimension out(M/2 + 1, N)
+             integer*8 plan
+
+             call dfftw_plan_dft_r2c_2d(plan,M,N,in,out,FFTW_ESTIMATE)
+             call dfftw_execute_dft_r2c(plan, in, out)
+             call dfftw_destroy_plan(plan)
+   
+   *Important:* Notice that it is the _first_ dimension of the complex
+output array that is cut in half in Fortran, rather than the last
+dimension as in C.  This is a consequence of the interface routines
+reversing the order of the array dimensions passed to FFTW so that the
+Fortran program can use its ordinary column-major order.  
+
+
+File: fftw3.info,  Node: Wisdom of Fortran?,  Prev: Fortran Examples,  Up: Calling FFTW from Legacy Fortran
+
+8.5 Wisdom of Fortran?
+======================
+
+In this section, we discuss how one can import/export FFTW wisdom
+(saved plans) to/from a Fortran program; we assume that the reader is
+already familiar with wisdom, as described in *note Words of
+Wisdom-Saving Plans::.
+
+   The basic problem is that is difficult to (portably) pass files and
+strings between Fortran and C, so we cannot provide a direct Fortran
+equivalent to the `fftw_export_wisdom_to_file', etcetera, functions.
+Fortran interfaces _are_ provided for the functions that do not take
+file/string arguments, however: `dfftw_import_system_wisdom',
+`dfftw_import_wisdom', `dfftw_export_wisdom', and `dfftw_forget_wisdom'.  
+
+   So, for example, to import the system-wide wisdom, you would do:
+
+             integer isuccess
+             call dfftw_import_system_wisdom(isuccess)
+
+   As usual, the C return value is turned into a first parameter;
+`isuccess' is non-zero on success and zero on failure (e.g. if there is
+no system wisdom installed).
+
+   If you want to import/export wisdom from/to an arbitrary file or
+elsewhere, you can employ the generic `dfftw_import_wisdom' and
+`dfftw_export_wisdom' functions, for which you must supply a subroutine
+to read/write one character at a time.  The FFTW package contains an
+example file `doc/f77_wisdom.f' demonstrating how to implement
+`import_wisdom_from_file' and `export_wisdom_to_file' subroutines in
+this way.  (These routines cannot be compiled into the FFTW library
+itself, lest all FFTW-using programs be required to link with the
+Fortran I/O library.)
+
+
+File: fftw3.info,  Node: Upgrading from FFTW version 2,  Next: Installation and Customization,  Prev: Calling FFTW from Legacy Fortran,  Up: Top
+
+9 Upgrading from FFTW version 2
+*******************************
+
+In this chapter, we outline the process for updating codes designed for
+the older FFTW 2 interface to work with FFTW 3.  The interface for FFTW
+3 is not backwards-compatible with the interface for FFTW 2 and earlier
+versions; codes written to use those versions will fail to link with
+FFTW 3.  Nor is it possible to write "compatibility wrappers" to bridge
+the gap (at least not efficiently), because FFTW 3 has different
+semantics from previous versions.  However, upgrading should be a
+straightforward process because the data formats are identical and the
+overall style of planning/execution is essentially the same.
+
+   Unlike FFTW 2, there are no separate header files for real and
+complex transforms (or even for different precisions) in FFTW 3; all
+interfaces are defined in the `<fftw3.h>' header file.
+
+Numeric Types
+=============
+
+The main difference in data types is that `fftw_complex' in FFTW 2 was
+defined as a `struct' with macros `c_re' and `c_im' for accessing the
+real/imaginary parts.  (This is binary-compatible with FFTW 3 on any
+machine except perhaps for some older Crays in single precision.)  The
+equivalent macros for FFTW 3 are:
+
+     #define c_re(c) ((c)[0])
+     #define c_im(c) ((c)[1])
+
+   This does not work if you are using the C99 complex type, however,
+unless you insert a `double*' typecast into the above macros (*note
+Complex numbers::).
+
+   Also, FFTW 2 had an `fftw_real' typedef that was an alias for
+`double' (in double precision).  In FFTW 3 you should just use `double'
+(or whatever precision you are employing).
+
+Plans
+=====
+
+The major difference between FFTW 2 and FFTW 3 is in the
+planning/execution division of labor.  In FFTW 2, plans were found for a
+given transform size and type, and then could be applied to _any_
+arrays and for _any_ multiplicity/stride parameters.  In FFTW 3, you
+specify the particular arrays, stride parameters, etcetera when
+creating the plan, and the plan is then executed for _those_ arrays
+(unless the guru interface is used) and _those_ parameters _only_.
+(FFTW 2 had "specific planner" routines that planned for a particular
+array and stride, but the plan could still be used for other arrays and
+strides.)  That is, much of the information that was formerly specified
+at execution time is now specified at planning time.
+
+   Like FFTW 2's specific planner routines, the FFTW 3 planner
+overwrites the input/output arrays unless you use `FFTW_ESTIMATE'.
+
+   FFTW 2 had separate data types `fftw_plan', `fftwnd_plan',
+`rfftw_plan', and `rfftwnd_plan' for complex and real one- and
+multi-dimensional transforms, and each type had its own `destroy'
+function.  In FFTW 3, all plans are of type `fftw_plan' and all are
+destroyed by `fftw_destroy_plan(plan)'.
+
+   Where you formerly used `fftw_create_plan' and `fftw_one' to plan
+and compute a single 1d transform, you would now use `fftw_plan_dft_1d'
+to plan the transform.  If you used the generic `fftw' function to
+execute the transform with multiplicity (`howmany') and stride
+parameters, you would now use the advanced interface
+`fftw_plan_many_dft' to specify those parameters.  The plans are now
+executed with `fftw_execute(plan)', which takes all of its parameters
+(including the input/output arrays) from the plan.
+
+   In-place transforms no longer interpret their output argument as
+scratch space, nor is there an `FFTW_IN_PLACE' flag.  You simply pass
+the same pointer for both the input and output arguments.  (Previously,
+the output `ostride' and `odist' parameters were ignored for in-place
+transforms; now, if they are specified via the advanced interface, they
+are significant even in the in-place case, although they should
+normally equal the corresponding input parameters.)
+
+   The `FFTW_ESTIMATE' and `FFTW_MEASURE' flags have the same meaning
+as before, although the planning time will differ.  You may also
+consider using `FFTW_PATIENT', which is like `FFTW_MEASURE' except that
+it takes more time in order to consider a wider variety of algorithms.
+
+   For multi-dimensional complex DFTs, instead of `fftwnd_create_plan'
+(or `fftw2d_create_plan' or `fftw3d_create_plan'), followed by
+`fftwnd_one', you would use `fftw_plan_dft' (or `fftw_plan_dft_2d' or
+`fftw_plan_dft_3d').  followed by `fftw_execute'.  If you used `fftwnd'
+to to specify strides etcetera, you would instead specify these via
+`fftw_plan_many_dft'.
+
+   The analogues to `rfftw_create_plan' and `rfftw_one' with
+`FFTW_REAL_TO_COMPLEX' or `FFTW_COMPLEX_TO_REAL' directions are
+`fftw_plan_r2r_1d' with kind `FFTW_R2HC' or `FFTW_HC2R', followed by
+`fftw_execute'.  The stride etcetera arguments of `rfftw' are now in
+`fftw_plan_many_r2r'.
+
+   Instead of `rfftwnd_create_plan' (or `rfftw2d_create_plan' or
+`rfftw3d_create_plan') followed by `rfftwnd_one_real_to_complex' or
+`rfftwnd_one_complex_to_real', you now use `fftw_plan_dft_r2c' (or
+`fftw_plan_dft_r2c_2d' or `fftw_plan_dft_r2c_3d') or
+`fftw_plan_dft_c2r' (or `fftw_plan_dft_c2r_2d' or
+`fftw_plan_dft_c2r_3d'), respectively, followed by `fftw_execute'.  As
+usual, the strides etcetera of `rfftwnd_real_to_complex' or
+`rfftwnd_complex_to_real' are no specified in the advanced planner
+routines, `fftw_plan_many_dft_r2c' or `fftw_plan_many_dft_c2r'.
+
+Wisdom
+======
+
+In FFTW 2, you had to supply the `FFTW_USE_WISDOM' flag in order to use
+wisdom; in FFTW 3, wisdom is always used.  (You could simulate the FFTW
+2 wisdom-less behavior by calling `fftw_forget_wisdom' after every
+planner call.)
+
+   The FFTW 3 wisdom import/export routines are almost the same as
+before (although the storage format is entirely different).  There is
+one significant difference, however.  In FFTW 2, the import routines
+would never read past the end of the wisdom, so you could store extra
+data beyond the wisdom in the same file, for example.  In FFTW 3, the
+file-import routine may read up to a few hundred bytes past the end of
+the wisdom, so you cannot store other data just beyond it.(1)
+
+   Wisdom has been enhanced by additional humility in FFTW 3: whereas
+FFTW 2 would re-use wisdom for a given transform size regardless of the
+stride etc., in FFTW 3 wisdom is only used with the strides etc. for
+which it was created.  Unfortunately, this means FFTW 3 has to create
+new plans from scratch more often than FFTW 2 (in FFTW 2, planning e.g.
+one transform of size 1024 also created wisdom for all smaller powers
+of 2, but this no longer occurs).
+
+   FFTW 3 also has the new routine `fftw_import_system_wisdom' to
+import wisdom from a standard system-wide location.
+
+Memory allocation
+=================
+
+In FFTW 3, we recommend allocating your arrays with `fftw_malloc' and
+deallocating them with `fftw_free'; this is not required, but allows
+optimal performance when SIMD acceleration is used.  (Those two
+functions actually existed in FFTW 2, and worked the same way, but were
+not documented.)
+
+   In FFTW 2, there were `fftw_malloc_hook' and `fftw_free_hook'
+functions that allowed the user to replace FFTW's memory-allocation
+routines (e.g. to implement different error-handling, since by default
+FFTW prints an error message and calls `exit' to abort the program if
+`malloc' returns `NULL').  These hooks are not supported in FFTW 3;
+those few users who require this functionality can just directly modify
+the memory-allocation routines in FFTW (they are defined in
+`kernel/alloc.c').
+
+Fortran interface
+=================
+
+In FFTW 2, the subroutine names were obtained by replacing `fftw_' with
+`fftw_f77'; in FFTW 3, you replace `fftw_' with `dfftw_' (or `sfftw_'
+or `lfftw_', depending upon the precision).
+
+   In FFTW 3, we have begun recommending that you always declare the
+type used to store plans as `integer*8'.  (Too many people didn't notice
+our instruction to switch from `integer' to `integer*8' for 64-bit
+machines.)
+
+   In FFTW 3, we provide a `fftw3.f' "header file" to include in your
+code (and which is officially installed on Unix systems).  (In FFTW 2,
+we supplied a `fftw_f77.i' file, but it was not installed.)
+
+   Otherwise, the C-Fortran interface relationship is much the same as
+it was before (e.g. return values become initial parameters, and
+multi-dimensional arrays are in column-major order).  Unlike FFTW 2, we
+do provide some support for wisdom import/export in Fortran (*note
+Wisdom of Fortran?::).
+
+Threads
+=======
+
+Like FFTW 2, only the execution routines are thread-safe.  All planner
+routines, etcetera, should be called by only a single thread at a time
+(*note Thread safety::).  _Unlike_ FFTW 2, there is no special
+`FFTW_THREADSAFE' flag for the planner to allow a given plan to be
+usable by multiple threads in parallel; this is now the case by default.
+
+   The multi-threaded version of FFTW 2 required you to pass the number
+of threads each time you execute the transform.  The number of threads
+is now stored in the plan, and is specified before the planner is
+called by `fftw_plan_with_nthreads'.  The threads initialization
+routine used to be called `fftw_threads_init' and would return zero on
+success; the new routine is called `fftw_init_threads' and returns zero
+on failure.  *Note Multi-threaded FFTW::.
+
+   There is no separate threads header file in FFTW 3; all the function
+prototypes are in `<fftw3.h>'.  However, you still have to link to a
+separate library (`-lfftw3_threads -lfftw3 -lm' on Unix), as well as to
+the threading library (e.g. POSIX threads on Unix).
+
+   ---------- Footnotes ----------
+
+   (1) We do our own buffering because GNU libc I/O routines are
+horribly slow for single-character I/O, apparently for thread-safety
+reasons (whether you are using threads or not).
+
+
+File: fftw3.info,  Node: Installation and Customization,  Next: Acknowledgments,  Prev: Upgrading from FFTW version 2,  Up: Top
+
+10 Installation and Customization
+*********************************
+
+This chapter describes the installation and customization of FFTW, the
+latest version of which may be downloaded from the FFTW home page
+(http://www.fftw.org).
+
+   In principle, FFTW should work on any system with an ANSI C compiler
+(`gcc' is fine).  However, planner time is drastically reduced if FFTW
+can exploit a hardware cycle counter; FFTW comes with cycle-counter
+support for all modern general-purpose CPUs, but you may need to add a
+couple of lines of code if your compiler is not yet supported (*note
+Cycle Counters::).  (On Unix, there will be a warning at the end of the
+`configure' output if no cycle counter is found.)  
+
+   Installation of FFTW is simplest if you have a Unix or a GNU system,
+such as GNU/Linux, and we describe this case in the first section below,
+including the use of special configuration options to e.g. install
+different precisions or exploit optimizations for particular
+architectures (e.g. SIMD).  Compilation on non-Unix systems is a more
+manual process, but we outline the procedure in the second section.  It
+is also likely that pre-compiled binaries will be available for popular
+systems.
+
+   Finally, we describe how you can customize FFTW for particular needs
+by generating _codelets_ for fast transforms of sizes not supported
+efficiently by the standard FFTW distribution.  
+
+* Menu:
+
+* Installation on Unix::
+* Installation on non-Unix systems::
+* Cycle Counters::
+* Generating your own code::
+
+
+File: fftw3.info,  Node: Installation on Unix,  Next: Installation on non-Unix systems,  Prev: Installation and Customization,  Up: Installation and Customization
+
+10.1 Installation on Unix
+=========================
+
+FFTW comes with a `configure' program in the GNU style.  Installation
+can be as simple as: 
+
+     ./configure
+     make
+     make install
+
+   This will build the uniprocessor complex and real transform libraries
+along with the test programs.  (We recommend that you use GNU `make' if
+it is available; on some systems it is called `gmake'.)  The "`make
+install'" command installs the fftw and rfftw libraries in standard
+places, and typically requires root privileges (unless you specify a
+different install directory with the `--prefix' flag to `configure').
+You can also type "`make check'" to put the FFTW test programs through
+their paces.  If you have problems during configuration or compilation,
+you may want to run "`make distclean'" before trying again; this
+ensures that you don't have any stale files left over from previous
+compilation attempts.
+
+   The `configure' script chooses the `gcc' compiler by default, if it
+is available; you can select some other compiler with:
+     ./configure CC="<the name of your C compiler>"
+
+   The `configure' script knows good `CFLAGS' (C compiler flags) for a
+few systems.  If your system is not known, the `configure' script will
+print out a warning.  In this case, you should re-configure FFTW with
+the command
+     ./configure CFLAGS="<write your CFLAGS here>"
+   and then compile as usual.  If you do find an optimal set of
+`CFLAGS' for your system, please let us know what they are (along with
+the output of `config.guess') so that we can include them in future
+releases.
+
+   `configure' supports all the standard flags defined by the GNU
+Coding Standards; see the `INSTALL' file in FFTW or the GNU web page
+(http://www.gnu.org/prep/standards/html_node/index.html).  Note
+especially `--help' to list all flags and `--enable-shared' to create
+shared, rather than static, libraries.  `configure' also accepts a few
+FFTW-specific flags, particularly:
+
+   * `--enable-float': Produces a single-precision version of FFTW
+     (`float') instead of the default double-precision (`double').
+     *Note Precision::.
+
+   * `--enable-long-double': Produces a long-double precision version of
+     FFTW (`long double') instead of the default double-precision
+     (`double').  The `configure' script will halt with an error
+     message if `long double' is the same size as `double' on your
+     machine/compiler.  *Note Precision::.
+
+   * `--enable-quad-precision': Produces a quadruple-precision version
+     of FFTW using the nonstandard `__float128' type provided by `gcc'
+     4.6 or later on x86, x86-64, and Itanium architectures, instead of
+     the default double-precision (`double').  The `configure' script
+     will halt with an error message if the compiler is not `gcc'
+     version 4.6 or later or if `gcc''s `libquadmath' library is not
+     installed.  *Note Precision::.
+
+   * `--enable-threads': Enables compilation and installation of the
+     FFTW threads library (*note Multi-threaded FFTW::), which provides
+     a simple interface to parallel transforms for SMP systems.  By
+     default, the threads routines are not compiled.
+
+   * `--enable-openmp': Like `--enable-threads', but using OpenMP
+     compiler directives in order to induce parallelism rather than
+     spawning its own threads directly, and installing an `fftw3_omp'
+     library rather than an `fftw3_threads' library (*note
+     Multi-threaded FFTW::).  You can use both `--enable-openmp' and
+     `--enable-threads' since they compile/install libraries with
+     different names.  By default, the OpenMP routines are not compiled.
+
+   * `--with-combined-threads': By default, if `--enable-threads' is
+     used, the threads support is compiled into a separate library that
+     must be linked in addition to the main FFTW library.  This is so
+     that users of the serial library do not need to link the system
+     threads libraries.  If `--with-combined-threads' is specified,
+     however, then no separate threads library is created, and threads
+     are included in the main FFTW library.  This is mainly useful
+     under Windows, where no system threads library is required and
+     inter-library dependencies are problematic.
+
+   * `--enable-mpi': Enables compilation and installation of the FFTW
+     MPI library (*note Distributed-memory FFTW with MPI::), which
+     provides parallel transforms for distributed-memory systems with
+     MPI.  (By default, the MPI routines are not compiled.)  *Note FFTW
+     MPI Installation::.
+
+   * `--disable-fortran': Disables inclusion of legacy-Fortran wrapper
+     routines (*note Calling FFTW from Legacy Fortran::) in the standard
+     FFTW libraries.  These wrapper routines increase the library size
+     by only a negligible amount, so they are included by default as
+     long as the `configure' script finds a Fortran compiler on your
+     system.  (To specify a particular Fortran compiler foo, pass
+     `F77='foo to `configure'.)
+
+   * `--with-g77-wrappers': By default, when Fortran wrappers are
+     included, the wrappers employ the linking conventions of the
+     Fortran compiler detected by the `configure' script.  If this
+     compiler is GNU `g77', however, then _two_ versions of the
+     wrappers are included: one with `g77''s idiosyncratic convention
+     of appending two underscores to identifiers, and one with the more
+     common convention of appending only a single underscore.  This
+     way, the same FFTW library will work with both `g77' and other
+     Fortran compilers, such as GNU `gfortran'.  However, the converse
+     is not true: if you configure with a different compiler, then the
+     `g77'-compatible wrappers are not included.  By specifying
+     `--with-g77-wrappers', the `g77'-compatible wrappers are included
+     in addition to wrappers for whatever Fortran compiler `configure'
+     finds.  
+
+   * `--with-slow-timer': Disables the use of hardware cycle counters,
+     and falls back on `gettimeofday' or `clock'.  This greatly worsens
+     performance, and should generally not be used (unless you don't
+     have a cycle counter but still really want an optimized plan
+     regardless of the time).  *Note Cycle Counters::.
+
+   * `--enable-sse', `--enable-sse2', `--enable-avx',
+     `--enable-altivec', `--enable-neon': Enable the compilation of
+     SIMD code for SSE (Pentium III+), SSE2 (Pentium IV+), AVX (Sandy
+     Bridge, Interlagos), AltiVec (PowerPC G4+), NEON (some ARM
+     processors).  SSE, AltiVec, and NEON only work with
+     `--enable-float' (above).  SSE2 works in both single and double
+     precision (and is simply SSE in single precision).  The resulting
+     code will _still work_ on earlier CPUs lacking the SIMD extensions
+     (SIMD is automatically disabled, although the FFTW library is
+     still larger).
+        - These options require a compiler supporting SIMD extensions,
+          and compiler support is always a bit flaky: see the FFTW FAQ
+          for a list of compiler versions that have problems compiling
+          FFTW.
+
+        - With AltiVec and `gcc', you may have to use the
+          `-mabi=altivec' option when compiling any code that links to
+          FFTW, in order to properly align the stack; otherwise, FFTW
+          could crash when it tries to use an AltiVec feature.  (This
+          is not necessary on MacOS X.)
+
+        - With SSE/SSE2 and `gcc', you should use a version of gcc that
+          properly aligns the stack when compiling any code that links
+          to FFTW.  By default, `gcc' 2.95 and later versions align the
+          stack as needed, but you should not compile FFTW with the
+          `-Os' option or the `-mpreferred-stack-boundary' option with
+          an argument less than 4.
+
+        - Because of the large variety of ARM processors and ABIs, FFTW
+          does not attempt to guess the correct `gcc' flags for
+          generating NEON code.  In general, you will have to provide
+          them on the command line.  This command line is known to have
+          worked at least once:
+               ./configure --with-slow-timer --host=arm-linux-gnueabi \
+                 --enable-single --enable-neon \
+                 "CC=arm-linux-gnueabi-gcc -march=armv7-a -mfloat-abi=softfp"
+
+
+   To force `configure' to use a particular C compiler foo (instead of
+the default, usually `gcc'), pass `CC='foo to the `configure' script;
+you may also need to set the flags via the variable `CFLAGS' as
+described above.  
+
+
+File: fftw3.info,  Node: Installation on non-Unix systems,  Next: Cycle Counters,  Prev: Installation on Unix,  Up: Installation and Customization
+
+10.2 Installation on non-Unix systems
+=====================================
+
+It should be relatively straightforward to compile FFTW even on non-Unix
+systems lacking the niceties of a `configure' script.  Basically, you
+need to edit the `config.h' header (copy it from `config.h.in') to
+`#define' the various options and compiler characteristics, and then
+compile all the `.c' files in the relevant directories.
+
+   The `config.h' header contains about 100 options to set, each one
+initially an `#undef', each documented with a comment, and most of them
+fairly obvious.  For most of the options, you should simply `#define'
+them to `1' if they are applicable, although a few options require a
+particular value (e.g. `SIZEOF_LONG_LONG' should be defined to the size
+of the `long long' type, in bytes, or zero if it is not supported).  We
+will likely post some sample `config.h' files for various operating
+systems and compilers for you to use (at least as a starting point).
+Please let us know if you have to hand-create a configuration file
+(and/or a pre-compiled binary) that you want to share.
+
+   To create the FFTW library, you will then need to compile all of the
+`.c' files in the `kernel', `dft', `dft/scalar', `dft/scalar/codelets',
+`rdft', `rdft/scalar', `rdft/scalar/r2cf', `rdft/scalar/r2cb',
+`rdft/scalar/r2r', `reodft', and `api' directories.  If you are
+compiling with SIMD support (e.g. you defined `HAVE_SSE2' in
+`config.h'), then you also need to compile the `.c' files in the
+`simd-support', `{dft,rdft}/simd', `{dft,rdft}/simd/*' directories.
+
+   Once these files are all compiled, link them into a library, or a
+shared library, or directly into your program.
+
+   To compile the FFTW test program, additionally compile the code in
+the `libbench2/' directory, and link it into a library.  Then compile
+the code in the `tests/' directory and link it to the `libbench2' and
+FFTW libraries.  To compile the `fftw-wisdom' (command-line) tool
+(*note Wisdom Utilities::), compile `tools/fftw-wisdom.c' and link it
+to the `libbench2' and FFTW libraries
+
+
+File: fftw3.info,  Node: Cycle Counters,  Next: Generating your own code,  Prev: Installation on non-Unix systems,  Up: Installation and Customization
+
+10.3 Cycle Counters
+===================
+
+FFTW's planner actually executes and times different possible FFT
+algorithms in order to pick the fastest plan for a given n.  In order
+to do this in as short a time as possible, however, the timer must have
+a very high resolution, and to accomplish this we employ the hardware
+"cycle counters" that are available on most CPUs.  Currently, FFTW
+supports the cycle counters on x86, PowerPC/POWER, Alpha, UltraSPARC
+(SPARC v9), IA64, PA-RISC, and MIPS processors.
+
+   Access to the cycle counters, unfortunately, is a compiler and/or
+operating-system dependent task, often requiring inline assembly
+language, and it may be that your compiler is not supported.  If you are
+_not_ supported, FFTW will by default fall back on its estimator
+(effectively using `FFTW_ESTIMATE' for all plans).  
+
+   You can add support by editing the file `kernel/cycle.h'; normally,
+this will involve adapting one of the examples already present in order
+to use the inline-assembler syntax for your C compiler, and will only
+require a couple of lines of code.  Anyone adding support for a new
+system to `cycle.h' is encouraged to email us at <fftw@fftw.org>.
+
+   If a cycle counter is not available on your system (e.g. some
+embedded processor), and you don't want to use estimated plans, as a
+last resort you can use the `--with-slow-timer' option to `configure'
+(on Unix) or `#define WITH_SLOW_TIMER' in `config.h' (elsewhere).  This
+will use the much lower-resolution `gettimeofday' function, or even
+`clock' if the former is unavailable, and planning will be extremely
+slow.
+
+
+File: fftw3.info,  Node: Generating your own code,  Prev: Cycle Counters,  Up: Installation and Customization
+
+10.4 Generating your own code
+=============================
+
+The directory `genfft' contains the programs that were used to generate
+FFTW's "codelets," which are hard-coded transforms of small sizes.  We
+do not expect casual users to employ the generator, which is a rather
+sophisticated program that generates directed acyclic graphs of FFT
+algorithms and performs algebraic simplifications on them.  It was
+written in Objective Caml, a dialect of ML, which is available at
+`http://caml.inria.fr/ocaml/index.en.html'.  
+
+   If you have Objective Caml installed (along with recent versions of
+GNU `autoconf', `automake', and `libtool'), then you can change the set
+of codelets that are generated or play with the generation options.
+The set of generated codelets is specified by the
+`{dft,rdft}/{codelets,simd}/*/Makefile.am' files.  For example, you can
+add efficient REDFT codelets of small sizes by modifying
+`rdft/codelets/r2r/Makefile.am'.  After you modify any `Makefile.am'
+files, you can type `sh bootstrap.sh' in the top-level directory
+followed by `make' to re-generate the files.
+
+   We do not provide more details about the code-generation process,
+since we do not expect that most users will need to generate their own
+code.  However, feel free to contact us at <fftw@fftw.org> if you are
+interested in the subject.
+
+   You might find it interesting to learn Caml and/or some modern
+programming techniques that we used in the generator (including monadic
+programming), especially if you heard the rumor that Java and
+object-oriented programming are the latest advancement in the field.
+The internal operation of the codelet generator is described in the
+paper, "A Fast Fourier Transform Compiler," by M. Frigo, which is
+available from the FFTW home page (http://www.fftw.org) and also
+appeared in the `Proceedings of the 1999 ACM SIGPLAN Conference on
+Programming Language Design and Implementation (PLDI)'.
+
+
+File: fftw3.info,  Node: Acknowledgments,  Next: License and Copyright,  Prev: Installation and Customization,  Up: Top
+
+11 Acknowledgments
+******************
+
+Matteo Frigo was supported in part by the Special Research Program SFB
+F011 "AURORA" of the Austrian Science Fund FWF and by MIT Lincoln
+Laboratory.  For previous versions of FFTW, he was supported in part by
+the Defense Advanced Research Projects Agency (DARPA), under Grants
+N00014-94-1-0985 and F30602-97-1-0270, and by a Digital Equipment
+Corporation Fellowship.
+
+   Steven G. Johnson was supported in part by a Dept. of Defense NDSEG
+Fellowship, an MIT Karl Taylor Compton Fellowship, and by the Materials
+Research Science and Engineering Center program of the National Science
+Foundation under award DMR-9400334.
+
+   Code for the Cell Broadband Engine was graciously donated to the FFTW
+project by the IBM Austin Research Lab and included in fftw-3.2.  (This
+code was removed in fftw-3.3.)
+
+   Code for the MIPS paired-single SIMD support was graciously donated
+to the FFTW project by CodeSourcery, Inc.
+
+   We are grateful to Sun Microsystems Inc. for its donation of a
+cluster of 9 8-processor Ultra HPC 5000 SMPs (24 Gflops peak). These
+machines served as the primary platform for the development of early
+versions of FFTW.
+
+   We thank Intel Corporation for donating a four-processor Pentium Pro
+machine.  We thank the GNU/Linux community for giving us a decent OS to
+run on that machine.
+
+   We are thankful to the AMD corporation for donating an AMD Athlon XP
+1700+ computer to the FFTW project.
+
+   We thank the Compaq/HP testdrive program and VA Software Corporation
+(SourceForge.net) for providing remote access to machines that were used
+to test FFTW.
+
+   The `genfft' suite of code generators was written using Objective
+Caml, a dialect of ML.  Objective Caml is a small and elegant language
+developed by Xavier Leroy.  The implementation is available from
+`http://caml.inria.fr/' (http://caml.inria.fr/).  In previous releases
+of FFTW, `genfft' was written in Caml Light, by the same authors.  An
+even earlier implementation of `genfft' was written in Scheme, but Caml
+is definitely better for this kind of application.  
+
+   FFTW uses many tools from the GNU project, including `automake',
+`texinfo', and `libtool'.
+
+   Prof. Charles E. Leiserson of MIT provided continuous support and
+encouragement.  This program would not exist without him.  Charles also
+proposed the name "codelets" for the basic FFT blocks.  
+
+   Prof. John D. Joannopoulos of MIT demonstrated continuing tolerance
+of Steven's "extra-curricular" computer-science activities, as well as
+remarkable creativity in working them into his grant proposals.
+Steven's physics degree would not exist without him.
+
+   Franz Franchetti wrote SIMD extensions to FFTW 2, which eventually
+led to the SIMD support in FFTW 3.
+
+   Stefan Kral wrote most of the K7 code generator distributed with FFTW
+3.0.x and 3.1.x.
+
+   Andrew Sterian contributed the Windows timing code in FFTW 2.
+
+   Didier Miras reported a bug in the test procedure used in FFTW 1.2.
+We now use a completely different test algorithm by Funda Ergun that
+does not require a separate FFT program to compare against.
+
+   Wolfgang Reimer contributed the Pentium cycle counter and a few fixes
+that help portability.
+
+   Ming-Chang Liu uncovered a well-hidden bug in the complex transforms
+of FFTW 2.0 and supplied a patch to correct it.
+
+   The FFTW FAQ was written in `bfnn' (Bizarre Format With No Name) and
+formatted using the tools developed by Ian Jackson for the Linux FAQ.
+
+   _We are especially thankful to all of our users for their continuing
+support, feedback, and interest during our development of FFTW._
+
+
+File: fftw3.info,  Node: License and Copyright,  Next: Concept Index,  Prev: Acknowledgments,  Up: Top
+
+12 License and Copyright
+************************
+
+FFTW is Copyright (C) 2003, 2007-11 Matteo Frigo, Copyright (C) 2003,
+2007-11 Massachusetts Institute of Technology.
+
+   FFTW is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA You
+can also find the GPL on the GNU web site
+(http://www.gnu.org/licenses/gpl-2.0.html).
+
+   In addition, we kindly ask you to acknowledge FFTW and its authors in
+any program or publication in which you use FFTW.  (You are not
+_required_ to do so; it is up to your common sense to decide whether
+you want to comply with this request or not.)  For general
+publications, we suggest referencing: Matteo Frigo and Steven G.
+Johnson, "The design and implementation of FFTW3," Proc. IEEE 93 (2),
+216-231 (2005).
+
+   Non-free versions of FFTW are available under terms different from
+those of the General Public License. (e.g. they do not require you to
+accompany any object code using FFTW with the corresponding source
+code.)  For these alternative terms you must purchase a license from
+MIT's Technology Licensing Office.  Users interested in such a license
+should contact us (<fftw@fftw.org>) for more information.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/fftw3.info-2
Binary file src/fftw-3.3.3/doc/fftw3.info-2 has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/fftw3.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/fftw3.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,210 @@
+\input texinfo    @c -*-texinfo-*-
+@c Update by C-x C-e on: (texinfo-multiple-files-update "fftw3.texi" nil t)
+@setfilename fftw3.info
+@include version.texi
+@settitle FFTW @value{VERSION}
+@setchapternewpage odd
+@c define constant index (ct)
+@defcodeindex ct
+@syncodeindex ct fn
+@syncodeindex vr fn
+@syncodeindex pg fn
+@syncodeindex tp fn
+@c define foreign function index (ff)
+@defcodeindex ff
+@syncodeindex ff cp
+@c define foreign constant index (fc)
+@defcodeindex fc
+@syncodeindex fc cp
+@c define foreign program index (fp)
+@defcodeindex fp
+@syncodeindex fp cp
+@comment %**end of header
+
+@iftex
+@paragraphindent 0
+@parskip=@medskipamount
+@end iftex
+
+@macro Onlogn
+@ifinfo
+O(n log n)
+@end ifinfo
+@html
+<i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>)
+@end html
+@tex
+$O(n \\log n)$
+@end tex
+@end macro
+
+@macro ndims
+@ifinfo
+n[0] x n[1] x n[2] x ... x n[d-1]
+@end ifinfo
+@html
+n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub>
+@end html
+@tex
+$n_0 \\times n_1 \\times n_2 \\times \\cdots \\times n_{d-1}$
+@end tex
+@end macro
+
+@macro ndimshalf
+@ifinfo
+n[0] x n[1] x n[2] x ... x (n[d-1]/2 + 1)
+@end ifinfo
+@html
+n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1)
+@end html
+@tex
+$n_0 \\times n_1 \\times n_2 \\times \\cdots \\times (n_{d-1}/2 + 1)$
+@end tex
+@end macro
+
+@macro ndimspad
+@ifinfo
+n[0] x n[1] x n[2] x ... x [2 (n[d-1]/2 + 1)]
+@end ifinfo
+@html
+n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;[2&nbsp;(n<sub>d-1</sub>/2 + 1)]
+@end html
+@tex
+$n_0 \\times n_1 \\times n_2 \\times \\cdots \\times [2(n_{d-1}/2 + 1)]$
+@end tex
+@end macro
+
+@macro twodims{d1, d2}
+@ifinfo
+\d1\ x \d2\
+@end ifinfo
+@html
+\d1\&nbsp;&times;&nbsp;\d2\
+@end html
+@tex
+$\d1\ \\times \d2\$
+@end tex
+@end macro
+
+@macro threedims{d1, d2, d3}
+@ifinfo
+\d1\ x \d2\ x \d3\
+@end ifinfo
+@html
+\d1\&nbsp;&times;&nbsp;\d2\&nbsp;&times;&nbsp;\d3\
+@end html
+@tex
+$\d1\ \\times \d2\ \\times \d3\$
+@end tex
+@end macro
+
+@macro dimk{k}
+@ifinfo
+n[\k\]
+@end ifinfo
+@html
+n<sub>\k\</sub>
+@end html
+@tex
+$n_\k\$
+@end tex
+@end macro
+
+
+@macro ndimstrans
+@ifinfo
+n[1] x n[0] x n[2] x ... x n[d-1]
+@end ifinfo
+@html
+n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub>
+@end html
+@tex
+$n_1 \\times n_0 \\times n_2 \\times \\cdots \\times n_{d-1}$
+@end tex
+@end macro
+
+@copying
+This manual is for FFTW
+(version @value{VERSION}, @value{UPDATED}).
+
+Copyright @copyright{} 2003 Matteo Frigo.
+
+Copyright @copyright{} 2003 Massachusetts Institute of Technology.
+
+@quotation
+Permission is granted to make and distribute verbatim copies of this
+manual provided the copyright notice and this permission notice are
+preserved on all copies.
+
+Permission is granted to copy and distribute modified versions of this
+manual under the conditions for verbatim copying, provided that the
+entire resulting derived work is distributed under the terms of a
+permission notice identical to this one.
+
+Permission is granted to copy and distribute translations of this manual
+into another language, under the above conditions for modified versions,
+except that this permission notice may be stated in a translation
+approved by the Free Software Foundation.
+@end quotation
+@end copying
+
+@dircategory Texinfo documentation system
+@direntry
+* fftw3: (fftw3).	FFTW User's Manual.
+@end direntry
+
+@titlepage
+@title FFTW
+@subtitle for version @value{VERSION}, @value{UPDATED}
+@author{Matteo Frigo}
+@author{Steven G. Johnson}
+@page
+@vskip 0pt plus 1filll
+@insertcopying
+@end titlepage
+
+@contents
+
+@ifnottex
+@node Top, Introduction, (dir), (dir)
+@top FFTW User Manual
+Welcome to FFTW, the Fastest Fourier Transform in the West.  FFTW is a
+collection of fast C routines to compute the discrete Fourier transform.
+This manual documents FFTW version @value{VERSION}.
+@end ifnottex
+
+@menu
+* Introduction::                
+* Tutorial::                    
+* Other Important Topics::      
+* FFTW Reference::              
+* Multi-threaded FFTW::         
+* Distributed-memory FFTW with MPI::  
+* Calling FFTW from Modern Fortran::  
+* Calling FFTW from Legacy Fortran::  
+* Upgrading from FFTW version 2::  
+* Installation and Customization::  
+* Acknowledgments::             
+* License and Copyright::       
+* Concept Index::               
+* Library Index::               
+@end menu
+
+@c ************************************************************
+@include intro.texi
+@include tutorial.texi
+@include other.texi
+@include reference.texi
+@include threads.texi
+@include mpi.texi
+@include modern-fortran.texi
+@include legacy-fortran.texi
+@include upgrading.texi
+@include install.texi
+@include acknowledgements.texi
+@include license.texi
+@include cindex.texi
+@include findex.texi
+@c ************************************************************
+
+@bye
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/findex.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/findex.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+@node Library Index, , Concept Index, Top
+@chapter Library Index
+@printindex fn
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,71 @@
+<html lang="en">
+<head>
+<title>1d Discrete Hartley Transforms (DHTs) - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029" title="1d Real-odd DFTs (DSTs)">
+<link rel="next" href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms" title="Multi-dimensional Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="1d-Discrete-Hartley-Transforms-(DHTs)"></a>
+<a name="g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms">Multi-dimensional Transforms</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.5 1d Discrete Hartley Transforms (DHTs)</h4>
+
+<p><a name="index-discrete-Hartley-transform-322"></a><a name="index-DHT-323"></a>The discrete Hartley transform (DHT) of a 1d real array X of size
+n computes a real array Y of the same size, where:
+<center><img src="equation-dht.png" align="top">.</center>
+
+   <p><a name="index-normalization-324"></a>FFTW computes an unnormalized transform, in that there is no coefficient
+in front of the summation in the DHT.  In other words, applying the
+transform twice (the DHT is its own inverse) will multiply the input by
+n.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/1d-Real_002deven-DFTs-_0028DCTs_0029.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/1d-Real_002deven-DFTs-_0028DCTs_0029.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,127 @@
+<html lang="en">
+<head>
+<title>1d Real-even DFTs (DCTs) - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT" title="The 1d Real-data DFT">
+<link rel="next" href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029" title="1d Real-odd DFTs (DSTs)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="1d-Real-even-DFTs-(DCTs)"></a>
+<a name="g_t1d-Real_002deven-DFTs-_0028DCTs_0029"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.3 1d Real-even DFTs (DCTs)</h4>
+
+<p>The Real-even symmetry DFTs in FFTW are exactly equivalent to the unnormalized
+forward (and backward) DFTs as defined above, where the input array
+X of length N is purely real and is also <dfn>even</dfn> symmetry.  In
+this case, the output array is likewise real and even symmetry. 
+<a name="index-real_002deven-DFT-301"></a><a name="index-REDFT-302"></a>
+
+   <p><a name="index-REDFT00-303"></a>For the case of <code>REDFT00</code>, this even symmetry means that
+<i>X<sub>j</sub> = X<sub>N-j</sub></i>,where we take X to be periodic so that
+<i>X<sub>N</sub> = X</i><sub>0</sub>. Because of this redundancy, only the first n real numbers are
+actually stored, where N = 2(n-1).
+
+   <p>The proper definition of even symmetry for <code>REDFT10</code>,
+<code>REDFT01</code>, and <code>REDFT11</code> transforms is somewhat more intricate
+because of the shifts by 1/2 of the input and/or output, although
+the corresponding boundary conditions are given in <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a>.  Because of the even symmetry, however,
+the sine terms in the DFT all cancel and the remaining cosine terms are
+written explicitly below.  This formulation often leads people to call
+such a transform a <dfn>discrete cosine transform</dfn> (DCT), although it is
+really just a special case of the DFT. 
+<a name="index-discrete-cosine-transform-304"></a><a name="index-DCT-305"></a>
+
+   <p>In each of the definitions below, we transform a real array X of
+length n to a real array Y of length n:
+
+<h5 class="subsubheading">REDFT00 (DCT-I)</h5>
+
+<p><a name="index-REDFT00-306"></a>An <code>REDFT00</code> transform (type-I DCT) in FFTW is defined by:
+<center><img src="equation-redft00.png" align="top">.</center>Note that this transform is not defined for n=1.  For n=2,
+the summation term above is dropped as you might expect.
+
+<h5 class="subsubheading">REDFT10 (DCT-II)</h5>
+
+<p><a name="index-REDFT10-307"></a>An <code>REDFT10</code> transform (type-II DCT, sometimes called &ldquo;the&rdquo; DCT) in FFTW is defined by:
+<center><img src="equation-redft10.png" align="top">.</center>
+
+<h5 class="subsubheading">REDFT01 (DCT-III)</h5>
+
+<p><a name="index-REDFT01-308"></a>An <code>REDFT01</code> transform (type-III DCT) in FFTW is defined by:
+<center><img src="equation-redft01.png" align="top">.</center>In the case of n=1, this reduces to
+<i>Y</i><sub>0</sub> = <i>X</i><sub>0</sub>. Up to a scale factor (see below), this is the inverse of <code>REDFT10</code> (&ldquo;the&rdquo; DCT), and so the <code>REDFT01</code> (DCT-III) is sometimes called the &ldquo;IDCT&rdquo;. 
+<a name="index-IDCT-309"></a>
+
+<h5 class="subsubheading">REDFT11 (DCT-IV)</h5>
+
+<p><a name="index-REDFT11-310"></a>An <code>REDFT11</code> transform (type-IV DCT) in FFTW is defined by:
+<center><img src="equation-redft11.png" align="top">.</center>
+
+<h5 class="subsubheading">Inverses and Normalization</h5>
+
+<p>These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of 2 in front of the
+summations).  The unnormalized inverse of <code>REDFT00</code> is
+<code>REDFT00</code>, of <code>REDFT10</code> is <code>REDFT01</code> and vice versa, and
+of <code>REDFT11</code> is <code>REDFT11</code>.  Each unnormalized inverse results
+in the original array multiplied by N, where N is the
+<em>logical</em> DFT size.  For <code>REDFT00</code>, N=2(n-1) (note that
+n=1 is not defined); otherwise, N=2n. 
+<a name="index-normalization-311"></a>
+
+   <p>In defining the discrete cosine transform, some authors also include
+additional factors of
+&radic;2(or its inverse) multiplying selected inputs and/or outputs.  This is a
+mostly cosmetic change that makes the transform orthogonal, but
+sacrifices the direct equivalence to a symmetric DFT.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/1d-Real_002dodd-DFTs-_0028DSTs_0029.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/1d-Real_002dodd-DFTs-_0028DSTs_0029.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,126 @@
+<html lang="en">
+<head>
+<title>1d Real-odd DFTs (DSTs) - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029" title="1d Real-even DFTs (DCTs)">
+<link rel="next" href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029" title="1d Discrete Hartley Transforms (DHTs)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="1d-Real-odd-DFTs-(DSTs)"></a>
+<a name="g_t1d-Real_002dodd-DFTs-_0028DSTs_0029"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.4 1d Real-odd DFTs (DSTs)</h4>
+
+<p>The Real-odd symmetry DFTs in FFTW are exactly equivalent to the unnormalized
+forward (and backward) DFTs as defined above, where the input array
+X of length N is purely real and is also <dfn>odd</dfn> symmetry.  In
+this case, the output is odd symmetry and purely imaginary. 
+<a name="index-real_002dodd-DFT-312"></a><a name="index-RODFT-313"></a>
+
+   <p><a name="index-RODFT00-314"></a>For the case of <code>RODFT00</code>, this odd symmetry means that
+<i>X<sub>j</sub> = -X<sub>N-j</sub></i>,where we take X to be periodic so that
+<i>X<sub>N</sub> = X</i><sub>0</sub>. Because of this redundancy, only the first n real numbers
+starting at j=1 are actually stored (the j=0 element is
+zero), where N = 2(n+1).
+
+   <p>The proper definition of odd symmetry for <code>RODFT10</code>,
+<code>RODFT01</code>, and <code>RODFT11</code> transforms is somewhat more intricate
+because of the shifts by 1/2 of the input and/or output, although
+the corresponding boundary conditions are given in <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a>.  Because of the odd symmetry, however,
+the cosine terms in the DFT all cancel and the remaining sine terms are
+written explicitly below.  This formulation often leads people to call
+such a transform a <dfn>discrete sine transform</dfn> (DST), although it is
+really just a special case of the DFT. 
+<a name="index-discrete-sine-transform-315"></a><a name="index-DST-316"></a>
+
+   <p>In each of the definitions below, we transform a real array X of
+length n to a real array Y of length n:
+
+<h5 class="subsubheading">RODFT00 (DST-I)</h5>
+
+<p><a name="index-RODFT00-317"></a>An <code>RODFT00</code> transform (type-I DST) in FFTW is defined by:
+<center><img src="equation-rodft00.png" align="top">.</center>
+
+<h5 class="subsubheading">RODFT10 (DST-II)</h5>
+
+<p><a name="index-RODFT10-318"></a>An <code>RODFT10</code> transform (type-II DST) in FFTW is defined by:
+<center><img src="equation-rodft10.png" align="top">.</center>
+
+<h5 class="subsubheading">RODFT01 (DST-III)</h5>
+
+<p><a name="index-RODFT01-319"></a>An <code>RODFT01</code> transform (type-III DST) in FFTW is defined by:
+<center><img src="equation-rodft01.png" align="top">.</center>In the case of n=1, this reduces to
+<i>Y</i><sub>0</sub> = <i>X</i><sub>0</sub>.
+
+<h5 class="subsubheading">RODFT11 (DST-IV)</h5>
+
+<p><a name="index-RODFT11-320"></a>An <code>RODFT11</code> transform (type-IV DST) in FFTW is defined by:
+<center><img src="equation-rodft11.png" align="top">.</center>
+
+<h5 class="subsubheading">Inverses and Normalization</h5>
+
+<p>These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of 2 in front of the
+summations).  The unnormalized inverse of <code>RODFT00</code> is
+<code>RODFT00</code>, of <code>RODFT10</code> is <code>RODFT01</code> and vice versa, and
+of <code>RODFT11</code> is <code>RODFT11</code>.  Each unnormalized inverse results
+in the original array multiplied by N, where N is the
+<em>logical</em> DFT size.  For <code>RODFT00</code>, N=2(n+1);
+otherwise, N=2n. 
+<a name="index-normalization-321"></a>
+
+   <p>In defining the discrete sine transform, some authors also include
+additional factors of
+&radic;2(or its inverse) multiplying selected inputs and/or outputs.  This is a
+mostly cosmetic change that makes the transform orthogonal, but
+sacrifices the direct equivalence to an antisymmetric DFT.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/2d-MPI-example.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/2d-MPI-example.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,154 @@
+<html lang="en">
+<head>
+<title>2d MPI example - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW" title="Linking and Initializing MPI FFTW">
+<link rel="next" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="g_t2d-MPI-example"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.3 2d MPI example</h3>
+
+<p>Before we document the FFTW MPI interface in detail, we begin with a
+simple example outlining how one would perform a two-dimensional
+<code>N0</code> by <code>N1</code> complex DFT.
+
+<pre class="example">     #include &lt;fftw3-mpi.h&gt;
+     
+     int main(int argc, char **argv)
+     {
+         const ptrdiff_t N0 = ..., N1 = ...;
+         fftw_plan plan;
+         fftw_complex *data;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+     
+         MPI_Init(&amp;argc, &amp;argv);
+         fftw_mpi_init();
+     
+         /* <span class="roman">get local data size and allocate</span> */
+         alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
+                                              &amp;local_n0, &amp;local_0_start);
+         data = fftw_alloc_complex(alloc_local);
+     
+         /* <span class="roman">create plan for in-place forward DFT</span> */
+         plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
+                                     FFTW_FORWARD, FFTW_ESTIMATE);
+     
+         /* <span class="roman">initialize data to some function</span> my_function(x,y) */
+         for (i = 0; i &lt; local_n0; ++i) for (j = 0; j &lt; N1; ++j)
+            data[i*N1 + j] = my_function(local_0_start + i, j);
+     
+         /* <span class="roman">compute transforms, in-place, as many times as desired</span> */
+         fftw_execute(plan);
+     
+         fftw_destroy_plan(plan);
+     
+         MPI_Finalize();
+     }
+</pre>
+   <p>As can be seen above, the MPI interface follows the same basic style
+of allocate/plan/execute/destroy as the serial FFTW routines.  All of
+the MPI-specific routines are prefixed with &lsquo;<samp><span class="samp">fftw_mpi_</span></samp>&rsquo; instead
+of &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo;.  There are a few important differences, however:
+
+   <p>First, we must call <code>fftw_mpi_init()</code> after calling
+<code>MPI_Init</code> (required in all MPI programs) and before calling any
+other &lsquo;<samp><span class="samp">fftw_mpi_</span></samp>&rsquo; routine. 
+<a name="index-MPI_005fInit-357"></a><a name="index-fftw_005fmpi_005finit-358"></a>
+
+   <p>Second, when we create the plan with <code>fftw_mpi_plan_dft_2d</code>,
+analogous to <code>fftw_plan_dft_2d</code>, we pass an additional argument:
+the communicator, indicating which processes will participate in the
+transform (here <code>MPI_COMM_WORLD</code>, indicating all processes). 
+Whenever you create, execute, or destroy a plan for an MPI transform,
+you must call the corresponding FFTW routine on <em>all</em> processes
+in the communicator for that transform.  (That is, these are
+<em>collective</em> calls.)  Note that the plan for the MPI transform
+uses the standard <code>fftw_execute</code> and <code>fftw_destroy</code> routines
+(on the other hand, there are MPI-specific new-array execute functions
+documented below). 
+<a name="index-collective-function-359"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005f2d-360"></a><a name="index-MPI_005fCOMM_005fWORLD-361"></a>
+
+   <p>Third, all of the FFTW MPI routines take <code>ptrdiff_t</code> arguments
+instead of <code>int</code> as for the serial FFTW.  <code>ptrdiff_t</code> is a
+standard C integer type which is (at least) 32 bits wide on a 32-bit
+machine and 64 bits wide on a 64-bit machine.  This is to make it easy
+to specify very large parallel transforms on a 64-bit machine.  (You
+can specify 64-bit transform sizes in the serial FFTW, too, but only
+by using the &lsquo;<samp><span class="samp">guru64</span></samp>&rsquo; planner interface.  See <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a>.) 
+<a name="index-ptrdiff_005ft-362"></a><a name="index-g_t64_002dbit-architecture-363"></a>
+
+   <p>Fourth, and most importantly, you don't allocate the entire
+two-dimensional array on each process.  Instead, you call
+<code>fftw_mpi_local_size_2d</code> to find out what <em>portion</em> of the
+array resides on each processor, and how much space to allocate. 
+Here, the portion of the array on each process is a <code>local_n0</code> by
+<code>N1</code> slice of the total array, starting at index
+<code>local_0_start</code>.  The total number of <code>fftw_complex</code> numbers
+to allocate is given by the <code>alloc_local</code> return value, which
+<em>may</em> be greater than <code>local_n0 * N1</code> (in case some
+intermediate calculations require additional storage).  The data
+distribution in FFTW's MPI interface is described in more detail by
+the next section. 
+<a name="index-fftw_005fmpi_005flocal_005fsize_005f2d-364"></a><a name="index-data-distribution-365"></a>
+
+   <p>Given the portion of the array that resides on the local process, it
+is straightforward to initialize the data (here to a function
+<code>myfunction</code>) and otherwise manipulate it.  Of course, at the end
+of the program you may want to output the data somehow, but
+synchronizing this output is up to you and is beyond the scope of this
+manual.  (One good way to output a large multi-dimensional distributed
+array in MPI to a portable binary file is to use the free HDF5
+library; see the <a href="http://www.hdfgroup.org/">HDF home page</a>.) 
+<a name="index-HDF5-366"></a><a name="index-MPI-I_002fO-367"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/64_002dbit-Guru-Interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/64_002dbit-Guru-Interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,113 @@
+<html lang="en">
+<head>
+<title>64-bit Guru Interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms" title="Guru Real-to-real Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="64-bit-Guru-Interface"></a>
+<a name="g_t64_002dbit-Guru-Interface"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">Guru Real-to-real Transforms</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.6 64-bit Guru Interface</h4>
+
+<p><a name="index-g_t64_002dbit-architecture-260"></a>
+When compiled in 64-bit mode on a 64-bit architecture (where addresses
+are 64 bits wide), FFTW uses 64-bit quantities internally for all
+transform sizes, strides, and so on&mdash;you don't have to do anything
+special to exploit this.  However, in the ordinary FFTW interfaces,
+you specify the transform size by an <code>int</code> quantity, which is
+normally only 32 bits wide.  This means that, even though FFTW is
+using 64-bit sizes internally, you cannot specify a single transform
+dimension larger than
+2<sup><small>31</small></sup>&minus;1numbers.
+
+   <p>We expect that few users will require transforms larger than this, but,
+for those who do, we provide a 64-bit version of the guru interface in
+which all sizes are specified as integers of type <code>ptrdiff_t</code>
+instead of <code>int</code>.  (<code>ptrdiff_t</code> is a signed integer type
+defined by the C standard to be wide enough to represent address
+differences, and thus must be at least 64 bits wide on a 64-bit
+machine.)  We stress that there is <em>no performance advantage</em> to
+using this interface&mdash;the same internal FFTW code is employed
+regardless&mdash;and it is only necessary if you want to specify very
+large transform sizes. 
+<a name="index-ptrdiff_005ft-261"></a>
+
+   <p>In particular, the 64-bit guru interface is a set of planner routines
+that are exactly the same as the guru planner routines, except that
+they are named with &lsquo;<samp><span class="samp">guru64</span></samp>&rsquo; instead of &lsquo;<samp><span class="samp">guru</span></samp>&rsquo; and they take
+arguments of type <code>fftw_iodim64</code> instead of <code>fftw_iodim</code>. 
+For example, instead of <code>fftw_plan_guru_dft</code>, we have
+<code>fftw_plan_guru64_dft</code>.
+
+<pre class="example">     fftw_plan fftw_plan_guru64_dft(
+          int rank, const fftw_iodim64 *dims,
+          int howmany_rank, const fftw_iodim64 *howmany_dims,
+          fftw_complex *in, fftw_complex *out,
+          int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fguru64_005fdft-262"></a>
+The <code>fftw_iodim64</code> type is similar to <code>fftw_iodim</code>, with the
+same interpretation, except that it uses type <code>ptrdiff_t</code> instead
+of type <code>int</code>.
+
+<pre class="example">     typedef struct {
+          ptrdiff_t n;
+          ptrdiff_t is;
+          ptrdiff_t os;
+     } fftw_iodim64;
+</pre>
+   <p><a name="index-fftw_005fiodim64-263"></a>
+Every other &lsquo;<samp><span class="samp">fftw_plan_guru</span></samp>&rsquo; function also has a
+&lsquo;<samp><span class="samp">fftw_plan_guru64</span></samp>&rsquo; equivalent, but we do not repeat their
+documentation here since they are identical to the 32-bit versions
+except as noted above.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Accessing-the-wisdom-API-from-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Accessing-the-wisdom-API-from-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+<html lang="en">
+<head>
+<title>Accessing the wisdom API from Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran" title="Allocating aligned memory in Fortran">
+<link rel="next" href="Defining-an-FFTW-module.html#Defining-an-FFTW-module" title="Defining an FFTW module">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Accessing-the-wisdom-API-from-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Defining-an-FFTW-module.html#Defining-an-FFTW-module">Defining an FFTW module</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.6 Accessing the wisdom API from Fortran</h3>
+
+<p><a name="index-wisdom-564"></a><a name="index-saving-plans-to-disk-565"></a>
+As explained in <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>, FFTW provides a
+&ldquo;wisdom&rdquo; API for saving plans to disk so that they can be recreated
+quickly.  The C API for exporting (see <a href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a>) and
+importing (see <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a>) wisdom is somewhat tricky to use
+from Fortran, however, because of differences in file I/O and string
+types between C and Fortran.
+
+<ul class="menu">
+<li><a accesskey="1" href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran">Wisdom File Export/Import from Fortran</a>
+<li><a accesskey="2" href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a>
+<li><a accesskey="3" href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Acknowledgments.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Acknowledgments.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,142 @@
+<html lang="en">
+<head>
+<title>Acknowledgments - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="next" href="License-and-Copyright.html#License-and-Copyright" title="License and Copyright">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Acknowledgments"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="License-and-Copyright.html#License-and-Copyright">License and Copyright</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">11 Acknowledgments</h2>
+
+<p>Matteo Frigo was supported in part by the Special Research Program SFB
+F011 &ldquo;AURORA&rdquo; of the Austrian Science Fund FWF and by MIT Lincoln
+Laboratory.  For previous versions of FFTW, he was supported in part by the
+Defense Advanced Research Projects Agency (DARPA), under Grants
+N00014-94-1-0985 and F30602-97-1-0270, and by a Digital Equipment
+Corporation Fellowship.
+
+   <p>Steven G. Johnson was supported in part by a Dept. of Defense NDSEG
+Fellowship, an MIT Karl Taylor Compton Fellowship, and by the Materials
+Research Science and Engineering Center program of the National Science
+Foundation under award DMR-9400334.
+
+   <p>Code for the Cell Broadband Engine was graciously donated to the FFTW
+project by the IBM Austin Research Lab and included in fftw-3.2.  (This
+code was removed in fftw-3.3.)
+
+   <p>Code for the MIPS paired-single SIMD support was graciously donated to
+the FFTW project by CodeSourcery, Inc.
+
+   <p>We are grateful to Sun Microsystems Inc. for its donation of a
+cluster of 9 8-processor Ultra HPC 5000 SMPs (24 Gflops peak). These
+machines served as the primary platform for the development of early
+versions of FFTW.
+
+   <p>We thank Intel Corporation for donating a four-processor Pentium Pro
+machine.  We thank the GNU/Linux community for giving us a decent OS to
+run on that machine.
+
+   <p>We are thankful to the AMD corporation for donating an AMD Athlon XP 1700+
+computer to the FFTW project.
+
+   <p>We thank the Compaq/HP testdrive program and VA Software Corporation
+(SourceForge.net) for providing remote access to machines that were used
+to test FFTW.
+
+   <p>The <code>genfft</code> suite of code generators was written using Objective
+Caml, a dialect of ML.  Objective Caml is a small and elegant language
+developed by Xavier Leroy.  The implementation is available from
+<a href="http://caml.inria.fr/"><code>http://caml.inria.fr/</code></a>.  In previous
+releases of FFTW, <code>genfft</code> was written in Caml Light, by the same
+authors.  An even earlier implementation of <code>genfft</code> was written in
+Scheme, but Caml is definitely better for this kind of application. 
+<a name="index-Caml-627"></a><a name="index-LISP-628"></a>
+
+   <p>FFTW uses many tools from the GNU project, including <code>automake</code>,
+<code>texinfo</code>, and <code>libtool</code>.
+
+   <p>Prof. Charles E. Leiserson of MIT provided continuous support and
+encouragement.  This program would not exist without him.  Charles also
+proposed the name &ldquo;codelets&rdquo; for the basic FFT blocks. 
+<a name="index-codelet-629"></a>
+
+   <p>Prof. John D. Joannopoulos of MIT demonstrated continuing tolerance of
+Steven's &ldquo;extra-curricular&rdquo; computer-science activities, as well as
+remarkable creativity in working them into his grant proposals. 
+Steven's physics degree would not exist without him.
+
+   <p>Franz Franchetti wrote SIMD extensions to FFTW 2, which eventually
+led to the SIMD support in FFTW 3.
+
+   <p>Stefan Kral wrote most of the K7 code generator distributed with FFTW
+3.0.x and 3.1.x.
+
+   <p>Andrew Sterian contributed the Windows timing code in FFTW 2.
+
+   <p>Didier Miras reported a bug in the test procedure used in FFTW 1.2.  We
+now use a completely different test algorithm by Funda Ergun that does
+not require a separate FFT program to compare against.
+
+   <p>Wolfgang Reimer contributed the Pentium cycle counter and a few fixes
+that help portability.
+
+   <p>Ming-Chang Liu uncovered a well-hidden bug in the complex transforms of
+FFTW 2.0 and supplied a patch to correct it.
+
+   <p>The FFTW FAQ was written in <code>bfnn</code> (Bizarre Format With No Name)
+and formatted using the tools developed by Ian Jackson for the Linux
+FAQ.
+
+   <p><em>We are especially thankful to all of our users for their
+continuing support, feedback, and interest during our development of
+FFTW.</em>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Advanced-Complex-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Advanced-Complex-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,149 @@
+<html lang="en">
+<head>
+<title>Advanced Complex DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link rel="prev" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link rel="next" href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs" title="Advanced Real-data DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Advanced-Complex-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.4.1 Advanced Complex DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                  fftw_complex *in, const int *inembed,
+                                  int istride, int idist,
+                                  fftw_complex *out, const int *onembed,
+                                  int ostride, int odist,
+                                  int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fmany_005fdft-232"></a>
+This routine plans multiple multidimensional complex DFTs, and it
+extends the <code>fftw_plan_dft</code> routine (see <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a>) to
+compute <code>howmany</code> transforms, each having rank <code>rank</code> and size
+<code>n</code>.  In addition, the transform data need not be contiguous, but
+it may be laid out in memory with an arbitrary stride.  To account for
+these possibilities, <code>fftw_plan_many_dft</code> adds the new parameters
+<code>howmany</code>, {<code>i</code>,<code>o</code>}<code>nembed</code>,
+{<code>i</code>,<code>o</code>}<code>stride</code>, and
+{<code>i</code>,<code>o</code>}<code>dist</code>.  The FFTW basic interface
+(see <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a>) provides routines specialized for ranks 1, 2,
+and&nbsp;3, but the advanced interface handles only the general-rank
+case.
+
+   <p><code>howmany</code> is the number of transforms to compute.  The resulting
+plan computes <code>howmany</code> transforms, where the input of the
+<code>k</code>-th transform is at location <code>in+k*idist</code> (in C pointer
+arithmetic), and its output is at location <code>out+k*odist</code>.  Plans
+obtained in this way can often be faster than calling FFTW multiple
+times for the individual transforms.  The basic <code>fftw_plan_dft</code>
+interface corresponds to <code>howmany=1</code> (in which case the <code>dist</code>
+parameters are ignored). 
+<a name="index-howmany-parameter-233"></a><a name="index-dist-234"></a>
+
+   <p>Each of the <code>howmany</code> transforms has rank <code>rank</code> and size
+<code>n</code>, as in the basic interface.  In addition, the advanced
+interface allows the input and output arrays of each transform to be
+row-major subarrays of larger rank-<code>rank</code> arrays, described by
+<code>inembed</code> and <code>onembed</code> parameters, respectively. 
+{<code>i</code>,<code>o</code>}<code>nembed</code> must be arrays of length <code>rank</code>,
+and <code>n</code> should be elementwise less than or equal to
+{<code>i</code>,<code>o</code>}<code>nembed</code>.  Passing <code>NULL</code> for an
+<code>nembed</code> parameter is equivalent to passing <code>n</code> (i.e. same
+physical and logical dimensions, as in the basic interface.)
+
+   <p>The <code>stride</code> parameters indicate that the <code>j</code>-th element of
+the input or output arrays is located at <code>j*istride</code> or
+<code>j*ostride</code>, respectively.  (For a multi-dimensional array,
+<code>j</code> is the ordinary row-major index.)  When combined with the
+<code>k</code>-th transform in a <code>howmany</code> loop, from above, this means
+that the (<code>j</code>,<code>k</code>)-th element is at <code>j*stride+k*dist</code>. 
+(The basic <code>fftw_plan_dft</code> interface corresponds to a stride of 1.) 
+<a name="index-stride-235"></a>
+
+   <p>For in-place transforms, the input and output <code>stride</code> and
+<code>dist</code> parameters should be the same; otherwise, the planner may
+return <code>NULL</code>.
+
+   <p>Arrays <code>n</code>, <code>inembed</code>, and <code>onembed</code> are not used after
+this function returns.  You can safely free or reuse them.
+
+   <p><strong>Examples</strong>:
+One transform of one 5 by 6 array contiguous in memory:
+<pre class="example">        int rank = 2;
+        int n[] = {5, 6};
+        int howmany = 1;
+        int idist = odist = 0; /* unused because howmany = 1 */
+        int istride = ostride = 1; /* array is contiguous in memory */
+        int *inembed = n, *onembed = n;
+</pre>
+   <p>Transform of three 5 by 6 arrays, each contiguous in memory,
+stored in memory one after another:
+<pre class="example">        int rank = 2;
+        int n[] = {5, 6};
+        int howmany = 3;
+        int idist = odist = n[0]*n[1]; /* = 30, the distance in memory
+                                          between the first element
+                                          of the first array and the
+                                          first element of the second array */
+        int istride = ostride = 1; /* array is contiguous in memory */
+        int *inembed = n, *onembed = n;
+</pre>
+   <p>Transform each column of a 2d array with 10 rows and 3 columns:
+<pre class="example">        int rank = 1; /* not 2: we are computing 1d transforms */
+        int n[] = {10}; /* 1d transforms of length 10 */
+        int howmany = 3;
+        int idist = odist = 1;
+        int istride = ostride = 3; /* distance between two elements in
+                                      the same column */
+        int *inembed = n, *onembed = n;
+</pre>
+   <!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Advanced-Interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Advanced-Interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,76 @@
+<html lang="en">
+<head>
+<title>Advanced Interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="next" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Advanced-Interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.4 Advanced Interface</h3>
+
+<p><a name="index-advanced-interface-231"></a>
+FFTW's &ldquo;advanced&rdquo; interface supplements the basic interface with four
+new planner routines, providing a new level of flexibility: you can plan
+a transform of multiple arrays simultaneously, operate on non-contiguous
+(strided) data, and transform a subset of a larger multi-dimensional
+array.  Other than these additional features, the planner operates in
+the same fashion as in the basic interface, and the resulting
+<code>fftw_plan</code> is used in the same way (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).
+
+<ul class="menu">
+<li><a accesskey="1" href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a>
+<li><a accesskey="2" href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a>
+<li><a accesskey="3" href="Advanced-Real_002dto_002dreal-Transforms.html#Advanced-Real_002dto_002dreal-Transforms">Advanced Real-to-real Transforms</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Advanced-Real_002ddata-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Advanced-Real_002ddata-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,103 @@
+<html lang="en">
+<head>
+<title>Advanced Real-data DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link rel="prev" href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs" title="Advanced Complex DFTs">
+<link rel="next" href="Advanced-Real_002dto_002dreal-Transforms.html#Advanced-Real_002dto_002dreal-Transforms" title="Advanced Real-to-real Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Advanced-Real-data-DFTs"></a>
+<a name="Advanced-Real_002ddata-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Advanced-Real_002dto_002dreal-Transforms.html#Advanced-Real_002dto_002dreal-Transforms">Advanced Real-to-real Transforms</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.4.2 Advanced Real-data DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_many_dft_r2c(int rank, const int *n, int howmany,
+                                      double *in, const int *inembed,
+                                      int istride, int idist,
+                                      fftw_complex *out, const int *onembed,
+                                      int ostride, int odist,
+                                      unsigned flags);
+     fftw_plan fftw_plan_many_dft_c2r(int rank, const int *n, int howmany,
+                                      fftw_complex *in, const int *inembed,
+                                      int istride, int idist,
+                                      double *out, const int *onembed,
+                                      int ostride, int odist,
+                                      unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fmany_005fdft_005fr2c-236"></a><a name="index-fftw_005fplan_005fmany_005fdft_005fc2r-237"></a>
+Like <code>fftw_plan_many_dft</code>, these two functions add <code>howmany</code>,
+<code>nembed</code>, <code>stride</code>, and <code>dist</code> parameters to the
+<code>fftw_plan_dft_r2c</code> and <code>fftw_plan_dft_c2r</code> functions, but
+otherwise behave the same as the basic interface.
+
+   <p>The interpretation of <code>howmany</code>, <code>stride</code>, and <code>dist</code> are
+the same as for <code>fftw_plan_many_dft</code>, above.  Note that the
+<code>stride</code> and <code>dist</code> for the real array are in units of
+<code>double</code>, and for the complex array are in units of
+<code>fftw_complex</code>.
+
+   <p>If an <code>nembed</code> parameter is <code>NULL</code>, it is interpreted as what
+it would be in the basic interface, as described in <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>.  That is, for the complex array the size is assumed to be
+the same as <code>n</code>, but with the last dimension cut roughly in half. 
+For the real array, the size is assumed to be <code>n</code> if the transform
+is out-of-place, or <code>n</code> with the last dimension &ldquo;padded&rdquo; if the
+transform is in-place.
+
+   <p>If an <code>nembed</code> parameter is non-<code>NULL</code>, it is interpreted as
+the physical size of the corresponding array, in row-major order, just
+as for <code>fftw_plan_many_dft</code>.  In this case, each dimension of
+<code>nembed</code> should be <code>&gt;=</code> what it would be in the basic
+interface (e.g. the halved or padded <code>n</code>).
+
+   <p>Arrays <code>n</code>, <code>inembed</code>, and <code>onembed</code> are not used after
+this function returns.  You can safely free or reuse them.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Advanced-Real_002dto_002dreal-Transforms.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Advanced-Real_002dto_002dreal-Transforms.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+<html lang="en">
+<head>
+<title>Advanced Real-to-real Transforms - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link rel="prev" href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs" title="Advanced Real-data DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Advanced-Real-to-real-Transforms"></a>
+<a name="Advanced-Real_002dto_002dreal-Transforms"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.4.3 Advanced Real-to-real Transforms</h4>
+
+<pre class="example">     fftw_plan fftw_plan_many_r2r(int rank, const int *n, int howmany,
+                                  double *in, const int *inembed,
+                                  int istride, int idist,
+                                  double *out, const int *onembed,
+                                  int ostride, int odist,
+                                  const fftw_r2r_kind *kind, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fmany_005fr2r-238"></a>
+Like <code>fftw_plan_many_dft</code>, this functions adds <code>howmany</code>,
+<code>nembed</code>, <code>stride</code>, and <code>dist</code> parameters to the
+<code>fftw_plan_r2r</code> function, but otherwise behave the same as the
+basic interface.  The interpretation of those additional parameters are
+the same as for <code>fftw_plan_many_dft</code>.  (Of course, the
+<code>stride</code> and <code>dist</code> parameters are now in units of
+<code>double</code>, not <code>fftw_complex</code>.)
+
+   <p>Arrays <code>n</code>, <code>inembed</code>, <code>onembed</code>, and <code>kind</code> are not
+used after this function returns.  You can safely free or reuse them.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Advanced-distributed_002dtranspose-interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Advanced-distributed_002dtranspose-interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,82 @@
+<html lang="en">
+<head>
+<title>Advanced distributed-transpose interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link rel="prev" href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface" title="Basic distributed-transpose interface">
+<link rel="next" href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall" title="An improved replacement for MPI_Alltoall">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Advanced-distributed-transpose-interface"></a>
+<a name="Advanced-distributed_002dtranspose-interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.7.2 Advanced distributed-transpose interface</h4>
+
+<p>The above routines are for a transpose of a matrix of numbers (of type
+<code>double</code>), using FFTW's default block sizes.  More generally, one
+can perform transposes of <em>tuples</em> of numbers, with
+user-specified block sizes for the input and output:
+
+<pre class="example">     fftw_plan fftw_mpi_plan_many_transpose
+                     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                      ptrdiff_t block0, ptrdiff_t block1,
+                      double *in, double *out, MPI_Comm comm, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fmpi_005fplan_005fmany_005ftranspose-403"></a>
+In this case, one is transposing an <code>n0</code> by <code>n1</code> matrix of
+<code>howmany</code>-tuples (e.g. <code>howmany = 2</code> for complex numbers). 
+The input is distributed along the <code>n0</code> dimension with block size
+<code>block0</code>, and the <code>n1</code> by <code>n0</code> output is distributed
+along the <code>n1</code> dimension with block size <code>block1</code>.  If
+<code>FFTW_MPI_DEFAULT_BLOCK</code> (0) is passed for a block size then FFTW
+uses its default block size.  To get the local size of the data on
+each process, you should then call <code>fftw_mpi_local_size_many_transposed</code>. 
+<a name="index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-404"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005fmany_005ftransposed-405"></a>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Allocating-aligned-memory-in-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Allocating-aligned-memory-in-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,129 @@
+<html lang="en">
+<head>
+<title>Allocating aligned memory in Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran" title="Plan execution in Fortran">
+<link rel="next" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Allocating-aligned-memory-in-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.5 Allocating aligned memory in Fortran</h3>
+
+<p><a name="index-alignment-560"></a><a name="index-fftw_005falloc_005freal-561"></a><a name="index-fftw_005falloc_005fcomplex-562"></a>In order to obtain maximum performance in FFTW, you should store your
+data in arrays that have been specially aligned in memory (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>).  Enforcing alignment also permits you to
+safely use the new-array execute functions (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>) to apply a given plan to more than one pair of in/out
+arrays.  Unfortunately, standard Fortran arrays do <em>not</em> provide
+any alignment guarantees.  The <em>only</em> way to allocate aligned
+memory in standard Fortran is to allocate it with an external C
+function, like the <code>fftw_alloc_real</code> and
+<code>fftw_alloc_complex</code> functions.  Fortunately, Fortran 2003 provides
+a simple way to associate such allocated memory with a standard Fortran
+array pointer that you can then use normally.
+
+   <p>We therefore recommend allocating all your input/output arrays using
+the following technique:
+
+     <ol type=1 start=1>
+
+     <li>Declare a <code>pointer</code>, <code>arr</code>, to your array of the desired type
+and dimensions.  For example, <code>real(C_DOUBLE), pointer :: a(:,:)</code>
+for a 2d real array, or <code>complex(C_DOUBLE_COMPLEX), pointer ::
+a(:,:,:)</code> for a 3d complex array.
+
+     <li>The number of elements to allocate must be an
+<code>integer(C_SIZE_T)</code>.  You can either declare a variable of this
+type, e.g. <code>integer(C_SIZE_T) :: sz</code>, to store the number of
+elements to allocate, or you can use the <code>int(..., C_SIZE_T)</code>
+intrinsic function. e.g. set <code>sz = L * M * N</code> or use
+<code>int(L * M * N, C_SIZE_T)</code> for an L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N array.
+
+     <li>Declare a <code>type(C_PTR) :: p</code> to hold the return value from
+FFTW's allocation routine.  Set <code>p = fftw_alloc_real(sz)</code> for a real array, or <code>p = fftw_alloc_complex(sz)</code> for a complex array.
+
+     <li><a name="index-c_005ff_005fpointer-563"></a>Associate your pointer <code>arr</code> with the allocated memory <code>p</code>
+using the standard <code>c_f_pointer</code> subroutine: <code>call
+c_f_pointer(p, arr, [...dimensions...])</code>, where
+<code>[...dimensions...])</code> are an array of the dimensions of the array
+(in the usual Fortran order). e.g. <code>call c_f_pointer(p, arr,
+[L,M,N])</code> for an L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N array.  (Alternatively, you can
+omit the dimensions argument if you specified the shape explicitly
+when declaring <code>arr</code>.)  You can now use <code>arr</code> as a usual
+multidimensional array.
+
+     <li>When you are done using the array, deallocate the memory by <code>call
+fftw_free(p)</code> on <code>p</code>.
+
+        </ol>
+
+   <p>For example, here is how we would allocate an L&nbsp;&times;&nbsp;M 2d real array:
+
+<pre class="example">       real(C_DOUBLE), pointer :: arr(:,:)
+       type(C_PTR) :: p
+       p = fftw_alloc_real(int(L * M, C_SIZE_T))
+       call c_f_pointer(p, arr, [L,M])
+       <em>...use arr and arr(i,j) as usual...</em>
+       call fftw_free(p)
+</pre>
+   <p>and here is an L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N 3d complex array:
+
+<pre class="example">       complex(C_DOUBLE_COMPLEX), pointer :: arr(:,:,:)
+       type(C_PTR) :: p
+       p = fftw_alloc_complex(int(L * M * N, C_SIZE_T))
+       call c_f_pointer(p, arr, [L,M,N])
+       <em>...use arr and arr(i,j,k) as usual...</em>
+       call fftw_free(p)
+</pre>
+   <p>See <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a> for an example allocating a
+single array and associating both real and complex array pointers with
+it, for in-place real-to-complex transforms.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/An-improved-replacement-for-MPI_005fAlltoall.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/An-improved-replacement-for-MPI_005fAlltoall.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+<html lang="en">
+<head>
+<title>An improved replacement for MPI_Alltoall - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link rel="prev" href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface" title="Advanced distributed-transpose interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="An-improved-replacement-for-MPI_Alltoall"></a>
+<a name="An-improved-replacement-for-MPI_005fAlltoall"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.7.3 An improved replacement for MPI_Alltoall</h4>
+
+<p>We close this section by noting that FFTW's MPI transpose routines can
+be thought of as a generalization for the <code>MPI_Alltoall</code> function
+(albeit only for floating-point types), and in some circumstances can
+function as an improved replacement. 
+<a name="index-MPI_005fAlltoall-406"></a>
+
+   <p><code>MPI_Alltoall</code> is defined by the MPI standard as:
+
+<pre class="example">     int MPI_Alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, int recvcnt, MPI_Datatype recvtype,
+                      MPI_Comm comm);
+</pre>
+   <p>In particular, for <code>double*</code> arrays <code>in</code> and <code>out</code>,
+consider the call:
+
+<pre class="example">     MPI_Alltoall(in, howmany, MPI_DOUBLE, out, howmany MPI_DOUBLE, comm);
+</pre>
+   <p>This is completely equivalent to:
+
+<pre class="example">     MPI_Comm_size(comm, &amp;P);
+     plan = fftw_mpi_plan_many_transpose(P, P, howmany, 1, 1, in, out, comm, FFTW_ESTIMATE);
+     fftw_execute(plan);
+     fftw_destroy_plan(plan);
+</pre>
+   <p>That is, computing a P&nbsp;&times;&nbsp;P transpose on <code>P</code> processes,
+with a block size of 1, is just a standard all-to-all communication.
+
+   <p>However, using the FFTW routine instead of <code>MPI_Alltoall</code> may
+have certain advantages.  First of all, FFTW's routine can operate
+in-place (<code>in == out</code>) whereas <code>MPI_Alltoall</code> can only
+operate out-of-place. 
+<a name="index-in_002dplace-407"></a>
+
+   <p>Second, even for out-of-place plans, FFTW's routine may be faster,
+especially if you need to perform the all-to-all communication many
+times and can afford to use <code>FFTW_MEASURE</code> or
+<code>FFTW_PATIENT</code>.  It should certainly be no slower, not including
+the time to create the plan, since one of the possible algorithms that
+FFTW uses for an out-of-place transpose <em>is</em> simply to call
+<code>MPI_Alltoall</code>.  However, FFTW also considers several other
+possible algorithms that, depending on your MPI implementation and
+your hardware, may be faster. 
+<a name="index-FFTW_005fMEASURE-408"></a><a name="index-FFTW_005fPATIENT-409"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Avoiding-MPI-Deadlocks.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Avoiding-MPI-Deadlocks.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,80 @@
+<html lang="en">
+<head>
+<title>Avoiding MPI Deadlocks - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom" title="FFTW MPI Wisdom">
+<link rel="next" href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips" title="FFTW MPI Performance Tips">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Avoiding-MPI-Deadlocks"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.9 Avoiding MPI Deadlocks</h3>
+
+<p><a name="index-deadlock-417"></a>
+An MPI program can <em>deadlock</em> if one process is waiting for a
+message from another process that never gets sent.  To avoid deadlocks
+when using FFTW's MPI routines, it is important to know which
+functions are <em>collective</em>: that is, which functions must
+<em>always</em> be called in the <em>same order</em> from <em>every</em>
+process in a given communicator.  (For example, <code>MPI_Barrier</code> is
+the canonical example of a collective function in the MPI standard.) 
+<a name="index-collective-function-418"></a><a name="index-MPI_005fBarrier-419"></a>
+
+   <p>The functions in FFTW that are <em>always</em> collective are: every
+function beginning with &lsquo;<samp><span class="samp">fftw_mpi_plan</span></samp>&rsquo;, as well as
+<code>fftw_mpi_broadcast_wisdom</code> and <code>fftw_mpi_gather_wisdom</code>. 
+Also, the following functions from the ordinary FFTW interface are
+collective when they are applied to a plan created by an
+&lsquo;<samp><span class="samp">fftw_mpi_plan</span></samp>&rsquo; function: <code>fftw_execute</code>,
+<code>fftw_destroy_plan</code>, and <code>fftw_flops</code>. 
+<a name="index-fftw_005fexecute-420"></a><a name="index-fftw_005fdestroy_005fplan-421"></a><a name="index-fftw_005fflops-422"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Basic-Interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Basic-Interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,85 @@
+<html lang="en">
+<head>
+<title>Basic Interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Using-Plans.html#Using-Plans" title="Using Plans">
+<link rel="next" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Basic-Interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Using-Plans.html#Using-Plans">Using Plans</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.3 Basic Interface</h3>
+
+<p><a name="index-basic-interface-160"></a>
+Recall that the FFTW API is divided into three parts<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>: the <dfn>basic interface</dfn>
+computes a single transform of contiguous data, the <dfn>advanced
+interface</dfn> computes transforms of multiple or strided arrays, and the
+<dfn>guru interface</dfn> supports the most general data layouts,
+multiplicities, and strides.  This section describes the the basic
+interface, which we expect to satisfy the needs of most users.
+
+<ul class="menu">
+<li><a accesskey="1" href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a>
+<li><a accesskey="2" href="Planner-Flags.html#Planner-Flags">Planner Flags</a>
+<li><a accesskey="3" href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a>
+<li><a accesskey="4" href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>
+<li><a accesskey="5" href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a>
+<li><a accesskey="6" href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>
+</ul>
+
+<!-- =========> -->
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> <i>Gallia est
+omnis divisa in partes tres</i> (Julius Caesar).</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Basic-and-advanced-distribution-interfaces.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Basic-and-advanced-distribution-interfaces.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,148 @@
+<html lang="en">
+<head>
+<title>Basic and advanced distribution interfaces - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="prev" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="next" href="Load-balancing.html#Load-balancing" title="Load balancing">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Basic-and-advanced-distribution-interfaces"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Load-balancing.html#Load-balancing">Load balancing</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.4.1 Basic and advanced distribution interfaces</h4>
+
+<p>As with the planner interface, the &lsquo;<samp><span class="samp">fftw_mpi_local_size</span></samp>&rsquo;
+distribution interface is broken into basic and advanced
+(&lsquo;<samp><span class="samp">_many</span></samp>&rsquo;) interfaces, where the latter allows you to specify the
+block size manually and also to request block sizes when computing
+multiple transforms simultaneously.  These functions are documented
+more exhaustively by the FFTW MPI Reference, but we summarize the
+basic ideas here using a couple of two-dimensional examples.
+
+   <p>For the 100&nbsp;&times;&nbsp;200 complex-DFT example, above, we would find
+the distribution by calling the following function in the basic
+interface:
+
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+</pre>
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f2d-370"></a>
+Given the total size of the data to be transformed (here, <code>n0 =
+100</code> and <code>n1 = 200</code>) and an MPI communicator (<code>comm</code>), this
+function provides three numbers.
+
+   <p>First, it describes the shape of the local data: the current process
+should store a <code>local_n0</code> by <code>n1</code> slice of the overall
+dataset, in row-major order (<code>n1</code> dimension contiguous), starting
+at index <code>local_0_start</code>.  That is, if the total dataset is
+viewed as a <code>n0</code> by <code>n1</code> matrix, the current process should
+store the rows <code>local_0_start</code> to
+<code>local_0_start+local_n0-1</code>.  Obviously, if you are running with
+only a single MPI process, that process will store the entire array:
+<code>local_0_start</code> will be zero and <code>local_n0</code> will be
+<code>n0</code>.  See <a href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a>. 
+<a name="index-row_002dmajor-371"></a>
+
+   <p>Second, the return value is the total number of data elements (e.g.,
+complex numbers for a complex DFT) that should be allocated for the
+input and output arrays on the current process (ideally with
+<code>fftw_malloc</code> or an &lsquo;<samp><span class="samp">fftw_alloc</span></samp>&rsquo; function, to ensure optimal
+alignment).  It might seem that this should always be equal to
+<code>local_n0 * n1</code>, but this is <em>not</em> the case.  FFTW's
+distributed FFT algorithms require data redistributions at
+intermediate stages of the transform, and in some circumstances this
+may require slightly larger local storage.  This is discussed in more
+detail below, under <a href="Load-balancing.html#Load-balancing">Load balancing</a>. 
+<a name="index-fftw_005fmalloc-372"></a><a name="index-fftw_005falloc_005fcomplex-373"></a>
+
+   <p><a name="index-advanced-interface-374"></a>The advanced-interface &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; function for multidimensional
+transforms returns the same three things (<code>local_n0</code>,
+<code>local_0_start</code>, and the total number of elements to allocate),
+but takes more inputs:
+
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n,
+                                        ptrdiff_t howmany,
+                                        ptrdiff_t block0,
+                                        MPI_Comm comm,
+                                        ptrdiff_t *local_n0,
+                                        ptrdiff_t *local_0_start);
+</pre>
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005fmany-375"></a>
+The two-dimensional case above corresponds to <code>rnk = 2</code> and an
+array <code>n</code> of length 2 with <code>n[0] = n0</code> and <code>n[1] = n1</code>. 
+This routine is for any <code>rnk &gt; 1</code>; one-dimensional transforms
+have their own interface because they work slightly differently, as
+discussed below.
+
+   <p>First, the advanced interface allows you to perform multiple
+transforms at once, of interleaved data, as specified by the
+<code>howmany</code> parameter.  (<code>hoamany</code> is 1 for a single
+transform.)
+
+   <p>Second, here you can specify your desired block size in the <code>n0</code>
+dimension, <code>block0</code>.  To use FFTW's default block size, pass
+<code>FFTW_MPI_DEFAULT_BLOCK</code> (0) for <code>block0</code>.  Otherwise, on
+<code>P</code> processes, FFTW will return <code>local_n0</code> equal to
+<code>block0</code> on the first <code>P / block0</code> processes (rounded down),
+return <code>local_n0</code> equal to <code>n0 - block0 * (P / block0)</code> on
+the next process, and <code>local_n0</code> equal to zero on any remaining
+processes.  In general, we recommend using the default block size
+(which corresponds to <code>n0 / P</code>, rounded up). 
+<a name="index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-376"></a><a name="index-block-distribution-377"></a>
+
+   <p>For example, suppose you have <code>P = 4</code> processes and <code>n0 =
+21</code>.  The default will be a block size of <code>6</code>, which will give
+<code>local_n0 = 6</code> on the first three processes and <code>local_n0 =
+3</code> on the last process.  Instead, however, you could specify
+<code>block0 = 5</code> if you wanted, which would give <code>local_n0 = 5</code>
+on processes 0 to 2, <code>local_n0 = 6</code> on process 3.  (This choice,
+while it may look superficially more &ldquo;balanced,&rdquo; has the same
+critical path as FFTW's default but requires more communications.)
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Basic-distributed_002dtranspose-interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Basic-distributed_002dtranspose-interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+<html lang="en">
+<head>
+<title>Basic distributed-transpose interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link rel="prev" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link rel="next" href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface" title="Advanced distributed-transpose interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Basic-distributed-transpose-interface"></a>
+<a name="Basic-distributed_002dtranspose-interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.7.1 Basic distributed-transpose interface</h4>
+
+<p>In particular, suppose that we have an <code>n0</code> by <code>n1</code> array in
+row-major order, block-distributed across the <code>n0</code> dimension.  To
+transpose this into an <code>n1</code> by <code>n0</code> array block-distributed
+across the <code>n1</code> dimension, we would create a plan by calling the
+following function:
+
+<pre class="example">     fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                       double *in, double *out,
+                                       MPI_Comm comm, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fmpi_005fplan_005ftranspose-397"></a>
+The input and output arrays (<code>in</code> and <code>out</code>) can be the
+same.  The transpose is actually executed by calling
+<code>fftw_execute</code> on the plan, as usual. 
+<a name="index-fftw_005fexecute-398"></a>
+
+   <p>The <code>flags</code> are the usual FFTW planner flags, but support
+two additional flags: <code>FFTW_MPI_TRANSPOSED_OUT</code> and/or
+<code>FFTW_MPI_TRANSPOSED_IN</code>.  What these flags indicate, for
+transpose plans, is that the output and/or input, respectively, are
+<em>locally</em> transposed.  That is, on each process input data is
+normally stored as a <code>local_n0</code> by <code>n1</code> array in row-major
+order, but for an <code>FFTW_MPI_TRANSPOSED_IN</code> plan the input data is
+stored as <code>n1</code> by <code>local_n0</code> in row-major order.  Similarly,
+<code>FFTW_MPI_TRANSPOSED_OUT</code> means that the output is <code>n0</code> by
+<code>local_n1</code> instead of <code>local_n1</code> by <code>n0</code>. 
+<a name="index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-399"></a><a name="index-FFTW_005fMPI_005fTRANSPOSED_005fIN-400"></a>
+
+   <p>To determine the local size of the array on each process before and
+after the transpose, as well as the amount of storage that must be
+allocated, one should call <code>fftw_mpi_local_size_2d_transposed</code>,
+just as for a 2d DFT as described in the previous section:
+<a name="index-data-distribution-401"></a>
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_2d_transposed
+                     (ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+</pre>
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f2d_005ftransposed-402"></a>
+Again, the return value is the local storage to allocate, which in
+this case is the number of <em>real</em> (<code>double</code>) values rather
+than complex numbers as in the previous examples.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Calling-FFTW-from-Legacy-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Calling-FFTW-from-Legacy-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+<html lang="en">
+<head>
+<title>Calling FFTW from Legacy Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="next" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2" title="Upgrading from FFTW version 2">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Calling-FFTW-from-Legacy-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2">Upgrading from FFTW version 2</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">8 Calling FFTW from Legacy Fortran</h2>
+
+<p><a name="index-Fortran-interface-577"></a>
+This chapter describes the interface to FFTW callable by Fortran code
+in older compilers not supporting the Fortran 2003 C interoperability
+features (see <a href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>).  This interface
+has the major disadvantage that it is not type-checked, so if you
+mistake the argument types or ordering then your program will not have
+any compiler errors, and will likely crash at runtime.  So, greater
+care is needed.  Also, technically interfacing older Fortran versions
+to C is nonstandard, but in practice we have found that the techniques
+used in this chapter have worked with all known Fortran compilers for
+many years.
+
+   <p>The legacy Fortran interface differs from the C interface only in the
+prefix (&lsquo;<samp><span class="samp">dfftw_</span></samp>&rsquo; instead of &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo; in double precision) and
+a few other minor details.  This Fortran interface is included in the
+FFTW libraries by default, unless a Fortran compiler isn't found on
+your system or <code>--disable-fortran</code> is included in the
+<code>configure</code> flags.  We assume here that the reader is already
+familiar with the usage of FFTW in C, as described elsewhere in this
+manual.
+
+   <p>The MPI parallel interface to FFTW is <em>not</em> currently available
+to legacy Fortran.
+
+<ul class="menu">
+<li><a accesskey="1" href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a>
+<li><a accesskey="2" href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran">FFTW Constants in Fortran</a>
+<li><a accesskey="3" href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a>
+<li><a accesskey="4" href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a>
+<li><a accesskey="5" href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Calling-FFTW-from-Modern-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Calling-FFTW-from-Modern-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,87 @@
+<html lang="en">
+<head>
+<title>Calling FFTW from Modern Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="next" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Calling-FFTW-from-Modern-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">7 Calling FFTW from Modern Fortran</h2>
+
+<p><a name="index-Fortran-interface-500"></a>
+Fortran 2003 standardized ways for Fortran code to call C libraries,
+and this allows us to support a direct translation of the FFTW C API
+into Fortran.  Compared to the legacy Fortran 77 interface
+(see <a href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>), this direct interface
+offers many advantages, especially compile-time type-checking and
+aligned memory allocation.  As of this writing, support for these C
+interoperability features seems widespread, having been implemented in
+nearly all major Fortran compilers (e.g. GNU, Intel, IBM,
+Oracle/Solaris, Portland Group, NAG). 
+<a name="index-portability-501"></a>
+This chapter documents that interface.  For the most part, since this
+interface allows Fortran to call the C interface directly, the usage
+is identical to C translated to Fortran syntax.  However, there are a
+few subtle points such as memory allocation, wisdom, and data types
+that deserve closer attention.
+
+<ul class="menu">
+<li><a accesskey="1" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a>
+<li><a accesskey="2" href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>
+<li><a accesskey="3" href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a>
+<li><a accesskey="4" href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a>
+<li><a accesskey="5" href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>
+<li><a accesskey="6" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>
+<li><a accesskey="7" href="Defining-an-FFTW-module.html#Defining-an-FFTW-module">Defining an FFTW module</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Caveats-in-Using-Wisdom.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Caveats-in-Using-Wisdom.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,101 @@
+<html lang="en">
+<head>
+<title>Caveats in Using Wisdom - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="prev" href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans" title="Words of Wisdom-Saving Plans">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Caveats-in-Using-Wisdom"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>
+<hr>
+</div>
+
+<h3 class="section">3.4 Caveats in Using Wisdom</h3>
+
+<p><a name="index-wisdom_002c-problems-with-133"></a>
+<blockquote>
+<i>For in much wisdom is much grief, and he that increaseth knowledge
+increaseth sorrow. 
+</i>[Ecclesiastes 1:18]
+<a name="index-Ecclesiastes-134"></a></blockquote>
+
+   <p><a name="index-portability-135"></a>There are pitfalls to using wisdom, in that it can negate FFTW's
+ability to adapt to changing hardware and other conditions. For
+example, it would be perfectly possible to export wisdom from a
+program running on one processor and import it into a program running
+on another processor.  Doing so, however, would mean that the second
+program would use plans optimized for the first processor, instead of
+the one it is running on.
+
+   <p>It should be safe to reuse wisdom as long as the hardware and program
+binaries remain unchanged. (Actually, the optimal plan may change even
+between runs of the same binary on identical hardware, due to
+differences in the virtual memory environment, etcetera.  Users
+seriously interested in performance should worry about this problem,
+too.)  It is likely that, if the same wisdom is used for two
+different program binaries, even running on the same machine, the
+plans may be sub-optimal because of differing code alignments.  It is
+therefore wise to recreate wisdom every time an application is
+recompiled.  The more the underlying hardware and software changes
+between the creation of wisdom and its use, the greater grows
+the risk of sub-optimal plans.
+
+   <p>Nevertheless, if the choice is between using <code>FFTW_ESTIMATE</code> or
+using possibly-suboptimal wisdom (created on the same machine, but for a
+different binary), the wisdom is likely to be better.  For this reason,
+we provide a function to import wisdom from a standard system-wide
+location (<code>/etc/fftw/wisdom</code> on Unix):
+<a name="index-wisdom_002c-system_002dwide-136"></a>
+<pre class="example">     int fftw_import_system_wisdom(void);
+</pre>
+   <p><a name="index-fftw_005fimport_005fsystem_005fwisdom-137"></a>
+FFTW also provides a standalone program, <code>fftw-wisdom</code> (described
+by its own <code>man</code> page on Unix) with which users can create wisdom,
+e.g. for a canonical set of sizes to store in the system wisdom file. 
+See <a href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a>. 
+<a name="index-fftw_002dwisdom-utility-138"></a>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Column_002dmajor-Format.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Column_002dmajor-Format.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,80 @@
+<html lang="en">
+<head>
+<title>Column-major Format - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="prev" href="Row_002dmajor-Format.html#Row_002dmajor-Format" title="Row-major Format">
+<link rel="next" href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C" title="Fixed-size Arrays in C">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Column-major-Format"></a>
+<a name="Column_002dmajor-Format"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C">Fixed-size Arrays in C</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<hr>
+</div>
+
+<h4 class="subsection">3.2.2 Column-major Format</h4>
+
+<p><a name="index-column_002dmajor-118"></a>
+Readers from the Fortran world are used to arrays stored in
+<dfn>column-major</dfn> order (sometimes called &ldquo;Fortran order&rdquo;).  This is
+essentially the exact opposite of row-major order in that, here, the
+<em>first</em> dimension's index varies most quickly.
+
+   <p>If you have an array stored in column-major order and wish to
+transform it using FFTW, it is quite easy to do.  When creating the
+plan, simply pass the dimensions of the array to the planner in
+<em>reverse order</em>.  For example, if your array is a rank three
+<code>N x M x L</code> matrix in column-major order, you should pass the
+dimensions of the array as if it were an <code>L x M x N</code> matrix
+(which it is, from the perspective of FFTW).  This is done for you
+<em>automatically</em> by the FFTW legacy-Fortran interface
+(see <a href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>), but you must do it
+manually with the modern Fortran interface (see <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>). 
+<a name="index-Fortran-interface-119"></a>
+<!-- =========> -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Combining-MPI-and-Threads.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Combining-MPI-and-Threads.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,135 @@
+<html lang="en">
+<head>
+<title>Combining MPI and Threads - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips" title="FFTW MPI Performance Tips">
+<link rel="next" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Combining-MPI-and-Threads"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.11 Combining MPI and Threads</h3>
+
+<p><a name="index-threads-427"></a>
+In certain cases, it may be advantageous to combine MPI
+(distributed-memory) and threads (shared-memory) parallelization. 
+FFTW supports this, with certain caveats.  For example, if you have a
+cluster of 4-processor shared-memory nodes, you may want to use
+threads within the nodes and MPI between the nodes, instead of MPI for
+all parallelization.
+
+   <p>In particular, it is possible to seamlessly combine the MPI FFTW
+routines with the multi-threaded FFTW routines (see <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>). However, some care must be taken in the initialization code,
+which should look something like this:
+
+<pre class="example">     int threads_ok;
+     
+     int main(int argc, char **argv)
+     {
+         int provided;
+         MPI_Init_thread(&amp;argc, &amp;argv, MPI_THREAD_FUNNELED, &amp;provided);
+         threads_ok = provided &gt;= MPI_THREAD_FUNNELED;
+     
+         if (threads_ok) threads_ok = fftw_init_threads();
+         fftw_mpi_init();
+     
+         ...
+         if (threads_ok) fftw_plan_with_nthreads(...);
+         ...
+     
+         MPI_Finalize();
+     }
+</pre>
+   <p><a name="index-fftw_005fmpi_005finit-428"></a><a name="index-fftw_005finit_005fthreads-429"></a><a name="index-fftw_005fplan_005fwith_005fnthreads-430"></a>
+First, note that instead of calling <code>MPI_Init</code>, you should call
+<code>MPI_Init_threads</code>, which is the initialization routine defined
+by the MPI-2 standard to indicate to MPI that your program will be
+multithreaded.  We pass <code>MPI_THREAD_FUNNELED</code>, which indicates
+that we will only call MPI routines from the main thread.  (FFTW will
+launch additional threads internally, but the extra threads will not
+call MPI code.)  (You may also pass <code>MPI_THREAD_SERIALIZED</code> or
+<code>MPI_THREAD_MULTIPLE</code>, which requests additional multithreading
+support from the MPI implementation, but this is not required by
+FFTW.)  The <code>provided</code> parameter returns what level of threads
+support is actually supported by your MPI implementation; this
+<em>must</em> be at least <code>MPI_THREAD_FUNNELED</code> if you want to call
+the FFTW threads routines, so we define a global variable
+<code>threads_ok</code> to record this.  You should only call
+<code>fftw_init_threads</code> or <code>fftw_plan_with_nthreads</code> if
+<code>threads_ok</code> is true.  For more information on thread safety in
+MPI, see the
+<a href="http://www.mpi-forum.org/docs/mpi-20-html/node162.htm">MPI and Threads</a> section of the MPI-2 standard. 
+<a name="index-thread-safety-431"></a>
+
+   <p>Second, we must call <code>fftw_init_threads</code> <em>before</em>
+<code>fftw_mpi_init</code>.  This is critical for technical reasons having
+to do with how FFTW initializes its list of algorithms.
+
+   <p>Then, if you call <code>fftw_plan_with_nthreads(N)</code>, <em>every</em> MPI
+process will launch (up to) <code>N</code> threads to parallelize its transforms.
+
+   <p>For example, in the hypothetical cluster of 4-processor nodes, you
+might wish to launch only a single MPI process per node, and then call
+<code>fftw_plan_with_nthreads(4)</code> on each process to use all
+processors in the nodes.
+
+   <p>This may or may not be faster than simply using as many MPI processes
+as you have processors, however.  On the one hand, using threads
+within a node eliminates the need for explicit message passing within
+the node.  On the other hand, FFTW's transpose routines are not
+multi-threaded, and this means that the communications that do take
+place will not benefit from parallelization within the node. 
+Moreover, many MPI implementations already have optimizations to
+exploit shared memory when it is available, so adding the
+multithreaded FFTW on top of this may be superfluous. 
+<a name="index-transpose-432"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Complex-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Complex-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,141 @@
+<html lang="en">
+<head>
+<title>Complex DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="next" href="Planner-Flags.html#Planner-Flags" title="Planner Flags">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Complex-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Planner-Flags.html#Planner-Flags">Planner Flags</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.1 Complex DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_dft_1d(int n0,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft(int rank, const int *n,
+                             fftw_complex *in, fftw_complex *out,
+                             int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005f1d-161"></a><a name="index-fftw_005fplan_005fdft_005f2d-162"></a><a name="index-fftw_005fplan_005fdft_005f3d-163"></a><a name="index-fftw_005fplan_005fdft-164"></a>
+Plan a complex input/output discrete Fourier transform (DFT) in zero or
+more dimensions, returning an <code>fftw_plan</code> (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).
+
+   <p>Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   <p>The planner returns <code>NULL</code> if the plan cannot be created.  In the
+standard FFTW distribution, the basic interface is guaranteed to return
+a non-<code>NULL</code> plan.  A plan may be <code>NULL</code>, however, if you are
+using a customized FFTW configuration supporting a restricted set of
+transforms.
+
+<h5 class="subsubheading">Arguments</h5>
+
+     <ul>
+<li><code>rank</code> is the rank of the transform (it should be the size of the
+array <code>*n</code>), and can be any non-negative integer.  (See <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a>, for the definition of &ldquo;rank&rdquo;.)  The
+&lsquo;<samp><span class="samp">_1d</span></samp>&rsquo;, &lsquo;<samp><span class="samp">_2d</span></samp>&rsquo;, and &lsquo;<samp><span class="samp">_3d</span></samp>&rsquo; planners correspond to a
+<code>rank</code> of <code>1</code>, <code>2</code>, and <code>3</code>, respectively.  The rank
+may be zero, which is equivalent to a rank-1 transform of size 1, i.e. a
+copy of one number from input to output.
+
+     <li><code>n0</code>, <code>n1</code>, <code>n2</code>, or <code>n[0..rank-1]</code> (as appropriate
+for each routine) specify the size of the transform dimensions.  They
+can be any positive integer.
+
+          <ul>
+<li><a name="index-row_002dmajor-165"></a>Multi-dimensional arrays are stored in row-major order with dimensions:
+<code>n0</code> x <code>n1</code>; or <code>n0</code> x <code>n1</code> x <code>n2</code>; or
+<code>n[0]</code> x <code>n[1]</code> x ... x <code>n[rank-1]</code>. 
+See <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>. 
+<li>FFTW is best at handling sizes of the form
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,where e+f is either 0 or 1, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) performance even for prime sizes).  It is possible to customize FFTW
+for different array sizes; see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>. 
+Transforms whose sizes are powers of 2 are especially fast. 
+</ul>
+
+     <li><code>in</code> and <code>out</code> point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform). 
+<a name="index-in_002dplace-166"></a>These arrays are overwritten during planning, unless
+<code>FFTW_ESTIMATE</code> is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)
+
+     <p>If <code>in == out</code>, the transform is <dfn>in-place</dfn> and the input
+array is overwritten. If <code>in != out</code>, the two arrays must
+not overlap (but FFTW does not check for this condition).
+
+     <li><a name="index-FFTW_005fFORWARD-167"></a><a name="index-FFTW_005fBACKWARD-168"></a><code>sign</code> is the sign of the exponent in the formula that defines the
+Fourier transform.  It can be -1 (= <code>FFTW_FORWARD</code>) or
++1 (= <code>FFTW_BACKWARD</code>).
+
+     <li><a name="index-flags-169"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+</ul>
+
+   <p>FFTW computes an unnormalized transform: computing a forward followed by
+a backward transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the dimensions). 
+<a name="index-normalization-170"></a>For more information, see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Complex-Multi_002dDimensional-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Complex-Multi_002dDimensional-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,138 @@
+<html lang="en">
+<head>
+<title>Complex Multi-Dimensional DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="prev" href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs" title="Complex One-Dimensional DFTs">
+<link rel="next" href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data" title="One-Dimensional DFTs of Real Data">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Complex-Multi-Dimensional-DFTs"></a>
+<a name="Complex-Multi_002dDimensional-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Tutorial.html#Tutorial">Tutorial</a>
+<hr>
+</div>
+
+<h3 class="section">2.2 Complex Multi-Dimensional DFTs</h3>
+
+<p>Multi-dimensional transforms work much the same way as one-dimensional
+transforms: you allocate arrays of <code>fftw_complex</code> (preferably
+using <code>fftw_malloc</code>), create an <code>fftw_plan</code>, execute it as
+many times as you want with <code>fftw_execute(plan)</code>, and clean up
+with <code>fftw_destroy_plan(plan)</code> (and <code>fftw_free</code>).
+
+   <p>FFTW provides two routines for creating plans for 2d and 3d transforms,
+and one routine for creating plans of arbitrary dimensionality. 
+The 2d and 3d routines have the following signature:
+<pre class="example">     fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+     fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                                fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005f2d-39"></a><a name="index-fftw_005fplan_005fdft_005f3d-40"></a>
+These routines create plans for <code>n0</code> by <code>n1</code> two-dimensional
+(2d) transforms and <code>n0</code> by <code>n1</code> by <code>n2</code> 3d transforms,
+respectively.  All of these transforms operate on contiguous arrays in
+the C-standard <dfn>row-major</dfn> order, so that the last dimension has the
+fastest-varying index in the array.  This layout is described further in
+<a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>.
+
+   <p>FFTW can also compute transforms of higher dimensionality.  In order to
+avoid confusion between the various meanings of the the word
+&ldquo;dimension&rdquo;, we use the term <em>rank</em>
+<a name="index-rank-41"></a>to denote the number of independent indices in an array.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>  For
+example, we say that a 2d transform has rank&nbsp;2, a 3d transform has
+rank&nbsp;3, and so on.  You can plan transforms of arbitrary rank by
+means of the following function:
+
+<pre class="example">     fftw_plan fftw_plan_dft(int rank, const int *n,
+                             fftw_complex *in, fftw_complex *out,
+                             int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft-42"></a>
+Here, <code>n</code> is a pointer to an array <code>n[rank]</code> denoting an
+<code>n[0]</code> by <code>n[1]</code> by <small class="dots">...</small> by <code>n[rank-1]</code> transform. 
+Thus, for example, the call
+<pre class="example">     fftw_plan_dft_2d(n0, n1, in, out, sign, flags);
+</pre>
+   <p>is equivalent to the following code fragment:
+<pre class="example">     int n[2];
+     n[0] = n0;
+     n[1] = n1;
+     fftw_plan_dft(2, n, in, out, sign, flags);
+</pre>
+   <p><code>fftw_plan_dft</code> is not restricted to 2d and 3d transforms,
+however, but it can plan transforms of arbitrary rank.
+
+   <p>You may have noticed that all the planner routines described so far
+have overlapping functionality.  For example, you can plan a 1d or 2d
+transform by using <code>fftw_plan_dft</code> with a <code>rank</code> of <code>1</code>
+or <code>2</code>, or even by calling <code>fftw_plan_dft_3d</code> with <code>n0</code>
+and/or <code>n1</code> equal to <code>1</code> (with no loss in efficiency).  This
+pattern continues, and FFTW's planning routines in general form a
+&ldquo;partial order,&rdquo; sequences of
+<a name="index-partial-order-43"></a>interfaces with strictly increasing generality but correspondingly
+greater complexity.
+
+   <p><code>fftw_plan_dft</code> is the most general complex-DFT routine that we
+describe in this tutorial, but there are also the advanced and guru interfaces,
+<a name="index-advanced-interface-44"></a><a name="index-guru-interface-45"></a>which allow one to efficiently combine multiple/strided transforms
+into a single FFTW plan, transform a subset of a larger
+multi-dimensional array, and/or to handle more general complex-number
+formats.  For more information, see <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>.
+
+<!--  -->
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> The
+term &ldquo;rank&rdquo; is commonly used in the APL, FORTRAN, and Common Lisp
+traditions, although it is not so common in the C&nbsp;world.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Complex-One_002dDimensional-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Complex-One_002dDimensional-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,192 @@
+<html lang="en">
+<head>
+<title>Complex One-Dimensional DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="prev" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="next" href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs" title="Complex Multi-Dimensional DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Complex-One-Dimensional-DFTs"></a>
+<a name="Complex-One_002dDimensional-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Tutorial.html#Tutorial">Tutorial</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Tutorial.html#Tutorial">Tutorial</a>
+<hr>
+</div>
+
+<h3 class="section">2.1 Complex One-Dimensional DFTs</h3>
+
+<blockquote>
+Plan: To bother about the best method of accomplishing an accidental result. 
+[Ambrose Bierce, <cite>The Enlarged Devil's Dictionary</cite>.] 
+<a name="index-Devil-15"></a></blockquote>
+
+   <p>The basic usage of FFTW to compute a one-dimensional DFT of size
+<code>N</code> is simple, and it typically looks something like this code:
+
+<pre class="example">     #include &lt;fftw3.h&gt;
+     ...
+     {
+         fftw_complex *in, *out;
+         fftw_plan p;
+         ...
+         in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+         out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+         p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+         ...
+         fftw_execute(p); /* <span class="roman">repeat as needed</span> */
+         ...
+         fftw_destroy_plan(p);
+         fftw_free(in); fftw_free(out);
+     }
+</pre>
+   <p>You must link this code with the <code>fftw3</code> library.  On Unix systems,
+link with <code>-lfftw3 -lm</code>.
+
+   <p>The example code first allocates the input and output arrays.  You can
+allocate them in any way that you like, but we recommend using
+<code>fftw_malloc</code>, which behaves like
+<a name="index-fftw_005fmalloc-16"></a><code>malloc</code> except that it properly aligns the array when SIMD
+instructions (such as SSE and Altivec) are available (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>). [Alternatively, we provide a convenient wrapper function <code>fftw_alloc_complex(N)</code> which has the same effect.] 
+<a name="index-fftw_005falloc_005fcomplex-17"></a><a name="index-SIMD-18"></a>
+
+   <p>The data is an array of type <code>fftw_complex</code>, which is by default a
+<code>double[2]</code> composed of the real (<code>in[i][0]</code>) and imaginary
+(<code>in[i][1]</code>) parts of a complex number. 
+<a name="index-fftw_005fcomplex-19"></a>
+The next step is to create a <dfn>plan</dfn>, which is an object
+<a name="index-plan-20"></a>that contains all the data that FFTW needs to compute the FFT. 
+This function creates the plan:
+
+<pre class="example">     fftw_plan fftw_plan_dft_1d(int n, fftw_complex *in, fftw_complex *out,
+                                int sign, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005f1d-21"></a><a name="index-fftw_005fplan-22"></a>
+The first argument, <code>n</code>, is the size of the transform you are
+trying to compute.  The size <code>n</code> can be any positive integer, but
+sizes that are products of small factors are transformed most
+efficiently (although prime sizes still use an <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) algorithm).
+
+   <p>The next two arguments are pointers to the input and output arrays of
+the transform.  These pointers can be equal, indicating an
+<dfn>in-place</dfn> transform. 
+<a name="index-in_002dplace-23"></a>
+
+   <p>The fourth argument, <code>sign</code>, can be either <code>FFTW_FORWARD</code>
+(<code>-1</code>) or <code>FFTW_BACKWARD</code> (<code>+1</code>),
+<a name="index-FFTW_005fFORWARD-24"></a><a name="index-FFTW_005fBACKWARD-25"></a>and indicates the direction of the transform you are interested in;
+technically, it is the sign of the exponent in the transform.
+
+   <p>The <code>flags</code> argument is usually either <code>FFTW_MEASURE</code> or
+<a name="index-flags-26"></a><code>FFTW_ESTIMATE</code>.  <code>FFTW_MEASURE</code> instructs FFTW to run
+<a name="index-FFTW_005fMEASURE-27"></a>and measure the execution time of several FFTs in order to find the
+best way to compute the transform of size <code>n</code>.  This process takes
+some time (usually a few seconds), depending on your machine and on
+the size of the transform.  <code>FFTW_ESTIMATE</code>, on the contrary,
+does not run any computation and just builds a
+<a name="index-FFTW_005fESTIMATE-28"></a>reasonable plan that is probably sub-optimal.  In short, if your
+program performs many transforms of the same size and initialization
+time is not important, use <code>FFTW_MEASURE</code>; otherwise use the
+estimate.
+
+   <p><em>You must create the plan before initializing the input</em>, because
+<code>FFTW_MEASURE</code> overwrites the <code>in</code>/<code>out</code> arrays. 
+(Technically, <code>FFTW_ESTIMATE</code> does not touch your arrays, but you
+should always create plans first just to be sure.)
+
+   <p>Once the plan has been created, you can use it as many times as you
+like for transforms on the specified <code>in</code>/<code>out</code> arrays,
+computing the actual transforms via <code>fftw_execute(plan)</code>:
+<pre class="example">     void fftw_execute(const fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005fexecute-29"></a>
+The DFT results are stored in-order in the array <code>out</code>, with the
+zero-frequency (DC) component in <code>out[0]</code>. 
+<a name="index-frequency-30"></a>If <code>in != out</code>, the transform is <dfn>out-of-place</dfn> and the input
+array <code>in</code> is not modified.  Otherwise, the input array is
+overwritten with the transform.
+
+   <p><a name="index-execute-31"></a>If you want to transform a <em>different</em> array of the same size, you
+can create a new plan with <code>fftw_plan_dft_1d</code> and FFTW
+automatically reuses the information from the previous plan, if
+possible.  Alternatively, with the &ldquo;guru&rdquo; interface you can apply a
+given plan to a different array, if you are careful. 
+See <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>.
+
+   <p>When you are done with the plan, you deallocate it by calling
+<code>fftw_destroy_plan(plan)</code>:
+<pre class="example">     void fftw_destroy_plan(fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005fdestroy_005fplan-32"></a>If you allocate an array with <code>fftw_malloc()</code> you must deallocate
+it with <code>fftw_free()</code>.  Do not use <code>free()</code> or, heaven
+forbid, <code>delete</code>. 
+<a name="index-fftw_005ffree-33"></a>
+FFTW computes an <em>unnormalized</em> DFT.  Thus, computing a forward
+followed by a backward transform (or vice versa) results in the original
+array scaled by <code>n</code>.  For the definition of the DFT, see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>. 
+<a name="index-DFT-34"></a><a name="index-normalization-35"></a>
+
+   <p>If you have a C compiler, such as <code>gcc</code>, that supports the
+C99 standard, and you <code>#include &lt;complex.h&gt;</code> <em>before</em>
+<code>&lt;fftw3.h&gt;</code>, then <code>fftw_complex</code> is the native
+double-precision complex type and you can manipulate it with ordinary
+arithmetic.  Otherwise, FFTW defines its own complex type, which is
+bit-compatible with the C99 complex type. See <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a>. 
+(The C++ <code>&lt;complex&gt;</code> template class may also be usable via a
+typecast.) 
+<a name="index-C_002b_002b-36"></a>
+To use single or long-double precision versions of FFTW, replace the
+<code>fftw_</code> prefix by <code>fftwf_</code> or <code>fftwl_</code> and link with
+<code>-lfftw3f</code> or <code>-lfftw3l</code>, but use the <em>same</em>
+<code>&lt;fftw3.h&gt;</code> header file. 
+<a name="index-precision-37"></a>
+
+   <p>Many more flags exist besides <code>FFTW_MEASURE</code> and
+<code>FFTW_ESTIMATE</code>.  For example, use <code>FFTW_PATIENT</code> if you're
+willing to wait even longer for a possibly even faster plan (see <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>). 
+<a name="index-FFTW_005fPATIENT-38"></a>You can also save plans for future use, as described by <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Complex-numbers.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Complex-numbers.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,96 @@
+<html lang="en">
+<head>
+<title>Complex numbers - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Data-Types-and-Files.html#Data-Types-and-Files" title="Data Types and Files">
+<link rel="prev" href="Data-Types-and-Files.html#Data-Types-and-Files" title="Data Types and Files">
+<link rel="next" href="Precision.html#Precision" title="Precision">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Complex-numbers"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Precision.html#Precision">Precision</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.1.1 Complex numbers</h4>
+
+<p>The default FFTW interface uses <code>double</code> precision for all
+floating-point numbers, and defines a <code>fftw_complex</code> type to hold
+complex numbers as:
+
+<pre class="example">     typedef double fftw_complex[2];
+</pre>
+   <p><a name="index-fftw_005fcomplex-139"></a>
+Here, the <code>[0]</code> element holds the real part and the <code>[1]</code>
+element holds the imaginary part.
+
+   <p>Alternatively, if you have a C compiler (such as <code>gcc</code>) that
+supports the C99 revision of the ANSI C standard, you can use C's new
+native complex type (which is binary-compatible with the typedef above). 
+In particular, if you <code>#include &lt;complex.h&gt;</code> <em>before</em>
+<code>&lt;fftw3.h&gt;</code>, then <code>fftw_complex</code> is defined to be the native
+complex type and you can manipulate it with ordinary arithmetic
+(e.g. <code>x = y * (3+4*I)</code>, where <code>x</code> and <code>y</code> are
+<code>fftw_complex</code> and <code>I</code> is the standard symbol for the
+imaginary unit);
+<a name="index-C99-140"></a>
+
+   <p>C++ has its own <code>complex&lt;T&gt;</code> template class, defined in the
+standard <code>&lt;complex&gt;</code> header file.  Reportedly, the C++ standards
+committee has recently agreed to mandate that the storage format used
+for this type be binary-compatible with the C99 type, i.e. an array
+<code>T[2]</code> with consecutive real <code>[0]</code> and imaginary <code>[1]</code>
+parts.  (See report
+<a href="http://www.open-std.org/jtc1/sc22/WG21/docs/papers/2002/n1388.pdf WG21/N1388">http://www.open-std.org/jtc1/sc22/WG21/docs/papers/2002/n1388.pdf WG21/N1388</a>.)  Although not part of the official standard as of this
+writing, the proposal stated that: &ldquo;This solution has been tested with
+all current major implementations of the standard library and shown to
+be working.&rdquo;  To the extent that this is true, if you have a variable
+<code>complex&lt;double&gt; *x</code>, you can pass it directly to FFTW via
+<code>reinterpret_cast&lt;fftw_complex*&gt;(x)</code>. 
+<a name="index-C_002b_002b-141"></a><a name="index-portability-142"></a>
+<!-- =========> -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Concept-Index.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Concept-Index.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,357 @@
+<html lang="en">
+<head>
+<title>Concept Index - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="License-and-Copyright.html#License-and-Copyright" title="License and Copyright">
+<link rel="next" href="Library-Index.html#Library-Index" title="Library Index">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Concept-Index"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Library-Index.html#Library-Index">Library Index</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="License-and-Copyright.html#License-and-Copyright">License and Copyright</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">13 Concept Index</h2>
+
+<ul class="index-cp" compact>
+<li><a href="FFTW-Fortran-type-reference.html#index-g_t64_002dbit-architecture-551">64-bit architecture</a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="2d-MPI-example.html#index-g_t64_002dbit-architecture-363">64-bit architecture</a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="64_002dbit-Guru-Interface.html#index-g_t64_002dbit-architecture-260">64-bit architecture</a>: <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a></li>
+<li><a href="MPI-Plan-Creation.html#index-advanced-interface-468">advanced interface</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-advanced-interface-454">advanced interface</a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-advanced-interface-374">advanced interface</a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="Advanced-Interface.html#index-advanced-interface-231">advanced interface</a>: <a href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a></li>
+<li><a href="Row_002dmajor-Format.html#index-advanced-interface-116">advanced interface</a>: <a href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-advanced-interface-44">advanced interface</a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Introduction.html#index-advanced-interface-8">advanced interface</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Introduction.html#index-algorithm-13">algorithm</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Allocating-aligned-memory-in-Fortran.html#index-alignment-560">alignment</a>: <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-alignment-511">alignment</a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Using-MPI-Plans.html#index-alignment-445">alignment</a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-alignment-268">alignment</a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Planner-Flags.html#index-alignment-182">alignment</a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Memory-Allocation.html#index-alignment-147">alignment</a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-AltiVec-106">AltiVec</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-AVX-105">AVX</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Basic-Interface.html#index-basic-interface-160">basic interface</a>: <a href="Basic-Interface.html#Basic-Interface">Basic Interface</a></li>
+<li><a href="Tutorial.html#index-basic-interface-14">basic interface</a>: <a href="Tutorial.html#Tutorial">Tutorial</a></li>
+<li><a href="Introduction.html#index-basic-interface-7">basic interface</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="FFTW-MPI-Performance-Tips.html#index-block-distribution-423">block distribution</a>: <a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-block-distribution-377">block distribution</a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="MPI-Data-Distribution.html#index-block-distribution-369">block distribution</a>: <a href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a></li>
+<li><a href="Fixed_002dsize-Arrays-in-C.html#index-C-multi_002ddimensional-arrays-120">C multi-dimensional arrays</a>: <a href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C">Fixed-size Arrays in C</a></li>
+<li><a href="Memory-Allocation.html#index-C_002b_002b-148">C++</a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="Complex-numbers.html#index-C_002b_002b-141">C++</a>: <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a></li>
+<li><a href="Dynamic-Arrays-in-C.html#index-C_002b_002b-123">C++</a>: <a href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-C_002b_002b-112">C++</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-C_002b_002b-36">C++</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-c2r-197">c2r</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Planner-Flags.html#index-c2r-179">c2r</a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-c2r-51">c2r</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Precision.html#index-C99-144">C99</a>: <a href="Precision.html#Precision">Precision</a></li>
+<li><a href="Complex-numbers.html#index-C99-140">C99</a>: <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a></li>
+<li><a href="Dynamic-Arrays-in-C.html#index-C99-122">C99</a>: <a href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a></li>
+<li><a href="Acknowledgments.html#index-Caml-627">Caml</a>: <a href="Acknowledgments.html#Acknowledgments">Acknowledgments</a></li>
+<li><a href="Generating-your-own-code.html#index-Caml-624">Caml</a>: <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a></li>
+<li><a href="Generating-your-own-code.html#index-code-generator-622">code generator</a>: <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a></li>
+<li><a href="Introduction.html#index-code-generator-10">code generator</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Acknowledgments.html#index-codelet-629">codelet</a>: <a href="Acknowledgments.html#Acknowledgments">Acknowledgments</a></li>
+<li><a href="Generating-your-own-code.html#index-codelet-623">codelet</a>: <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a></li>
+<li><a href="Installation-and-Customization.html#index-codelet-607">codelet</a>: <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a></li>
+<li><a href="Introduction.html#index-codelet-11">codelet</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="MPI-Plan-Creation.html#index-collective-function-467">collective function</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Using-MPI-Plans.html#index-collective-function-438">collective function</a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-collective-function-418">collective function</a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-collective-function-415">collective function</a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="2d-MPI-example.html#index-collective-function-359">collective function</a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="Fortran-Examples.html#index-column_002dmajor-596">column-major</a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran_002dinterface-routines.html#index-column_002dmajor-579">column-major</a>: <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a></li>
+<li><a href="Reversing-array-dimensions.html#index-column_002dmajor-518">column-major</a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Column_002dmajor-Format.html#index-column_002dmajor-118">column-major</a>: <a href="Column_002dmajor-Format.html#Column_002dmajor-Format">Column-major Format</a></li>
+<li><a href="Cycle-Counters.html#index-compiler-620">compiler</a>: <a href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a></li>
+<li><a href="Installation-on-Unix.html#index-compiler-617">compiler</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="Installation-and-Customization.html#index-compiler-605">compiler</a>: <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a></li>
+<li><a href="Introduction.html#index-compiler-12">compiler</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Installation-on-Unix.html#index-compiler-flags-609">compiler flags</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="Wisdom-Utilities.html#index-configuration-routines-291">configuration routines</a>: <a href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a></li>
+<li><a href="Installation-on-Unix.html#index-configure-608"><code>configure</code></a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="FFTW-MPI-Installation.html#index-configure-349"><code>configure</code></a>: <a href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">FFTW MPI Installation</a></li>
+<li><a href="Installation-and-Supported-Hardware_002fSoftware.html#index-configure-329"><code>configure</code></a>: <a href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">Installation and Supported Hardware/Software</a></li>
+<li><a href="Cycle-Counters.html#index-cycle-counter-619">cycle counter</a>: <a href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a></li>
+<li><a href="Installation-and-Customization.html#index-cycle-counter-604">cycle counter</a>: <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-data-distribution-447">data distribution</a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-data-distribution-401">data distribution</a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-data-distribution-387">data distribution</a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="MPI-Data-Distribution.html#index-data-distribution-368">data distribution</a>: <a href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a></li>
+<li><a href="2d-MPI-example.html#index-data-distribution-365">data distribution</a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="Distributed_002dmemory-FFTW-with-MPI.html#index-data-distribution-346">data distribution</a>: <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-DCT-305">DCT</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-DCT-220">DCT</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-DCT-84">DCT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-deadlock-417">deadlock</a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-Devil-15">Devil</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#index-DFT-293">DFT</a>: <a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-DFT-34">DFT</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Introduction.html#index-DFT-2">DFT</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#index-DHT-323">DHT</a>: <a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a></li>
+<li><a href="The-Discrete-Hartley-Transform.html#index-DHT-100">DHT</a>: <a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-discrete-cosine-transform-304">discrete cosine transform</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-discrete-cosine-transform-219">discrete cosine transform</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-discrete-cosine-transform-83">discrete cosine transform</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#index-discrete-Fourier-transform-292">discrete Fourier transform</a>: <a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a></li>
+<li><a href="Introduction.html#index-discrete-Fourier-transform-1">discrete Fourier transform</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#index-discrete-Hartley-transform-322">discrete Hartley transform</a>: <a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-discrete-Hartley-transform-217">discrete Hartley transform</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="The-Discrete-Hartley-Transform.html#index-discrete-Hartley-transform-99">discrete Hartley transform</a>: <a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-discrete-sine-transform-315">discrete sine transform</a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-discrete-sine-transform-226">discrete sine transform</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-discrete-sine-transform-85">discrete sine transform</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Guru-vector-and-transform-sizes.html#index-dist-246">dist</a>: <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a></li>
+<li><a href="Advanced-Complex-DFTs.html#index-dist-234">dist</a>: <a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-DST-316">DST</a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-DST-227">DST</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-DST-86">DST</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-Ecclesiastes-134">Ecclesiastes</a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-execute-264">execute</a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-execute-31">execute</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Introduction.html#index-execute-6">execute</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Introduction.html#index-FFTW-3">FFTW</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Wisdom-Utilities.html#index-fftw_002dwisdom-utility-289">fftw-wisdom utility</a>: <a href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-fftw_002dwisdom-utility-138">fftw-wisdom utility</a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+<li><a href="Wisdom-Utilities.html#index-fftw_002dwisdom_002dto_002dconf-utility-290">fftw-wisdom-to-conf utility</a>: <a href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a></li>
+<li><a href="FFTW-Constants-in-Fortran.html#index-flags-583">flags</a>: <a href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran">FFTW Constants in Fortran</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-flags-514">flags</a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Guru-Real_002dto_002dreal-Transforms.html#index-flags-259">flags</a>: <a href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">Guru Real-to-real Transforms</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-flags-256">flags</a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Guru-Complex-DFTs.html#index-flags-250">flags</a>: <a href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-flags-211">flags</a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-flags-192">flags</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Complex-DFTs.html#index-flags-169">flags</a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-flags-56">flags</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-flags-26">flags</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Calling-FFTW-from-Legacy-Fortran.html#index-Fortran-interface-577">Fortran interface</a>: <a href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a></li>
+<li><a href="Calling-FFTW-from-Modern-Fortran.html#index-Fortran-interface-500">Fortran interface</a>: <a href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-Fortran-interface-494">Fortran interface</a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="Column_002dmajor-Format.html#index-Fortran-interface-119">Fortran interface</a>: <a href="Column_002dmajor-Format.html#Column_002dmajor-Format">Column-major Format</a></li>
+<li><a href="Installation-on-Unix.html#index-Fortran_002dcallable-wrappers-615">Fortran-callable wrappers</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#index-frequency-295">frequency</a>: <a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-frequency-30">frequency</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Installation-on-Unix.html#index-g77-616"><code>g77</code></a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="Fortran_002dinterface-routines.html#index-guru-interface-581">guru interface</a>: <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-guru-interface-548">guru interface</a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Guru-Interface.html#index-guru-interface-239">guru interface</a>: <a href="Guru-Interface.html#Guru-Interface">Guru Interface</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-guru-interface-45">guru interface</a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Introduction.html#index-guru-interface-9">guru interface</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="The-1d-Real_002ddata-DFT.html#index-halfcomplex-format-299">halfcomplex format</a>: <a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-halfcomplex-format-75">halfcomplex format</a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-halfcomplex-format-58">halfcomplex format</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Planner-Flags.html#index-hc2r-180">hc2r</a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-hc2r-77">hc2r</a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="2d-MPI-example.html#index-HDF5-366">HDF5</a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="The-1d-Real_002ddata-DFT.html#index-Hermitian-296">Hermitian</a>: <a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-Hermitian-46">Hermitian</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Guru-vector-and-transform-sizes.html#index-howmany-loop-245">howmany loop</a>: <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a></li>
+<li><a href="Advanced-Complex-DFTs.html#index-howmany-parameter-233">howmany parameter</a>: <a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-IDCT-309">IDCT</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-IDCT-223">IDCT</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-IDCT-90">IDCT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-in_002dplace-544">in-place</a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Reversing-array-dimensions.html#index-in_002dplace-523">in-place</a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="An-improved-replacement-for-MPI_005fAlltoall.html#index-in_002dplace-407">in-place</a>: <a href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-in_002dplace-255">in-place</a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-in_002dplace-210">in-place</a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="Real_002ddata-DFT-Array-Format.html#index-in_002dplace-201">in-place</a>: <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-in_002dplace-190">in-place</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Complex-DFTs.html#index-in_002dplace-166">in-place</a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-in_002dplace-55">in-place</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-in_002dplace-23">in-place</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Installation-and-Customization.html#index-installation-603">installation</a>: <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a></li>
+<li><a href="Interleaved-and-split-arrays.html#index-interleaved-format-241">interleaved format</a>: <a href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays">Interleaved and split arrays</a></li>
+<li><a href="Extended-and-quadruple-precision-in-Fortran.html#index-iso_005fc_005fbinding-516">iso_c_binding</a>: <a href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">Extended and quadruple precision in Fortran</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-iso_005fc_005fbinding-502">iso_c_binding</a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-iso_005fc_005fbinding-495">iso_c_binding</a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-kind-_0028r2r_0029-212">kind (r2r)</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-kind-_0028r2r_0029-70">kind (r2r)</a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-linking-on-Unix-352">linking on Unix</a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-linking-on-Unix-333">linking on Unix</a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="Acknowledgments.html#index-LISP-628">LISP</a>: <a href="Acknowledgments.html#Acknowledgments">Acknowledgments</a></li>
+<li><a href="FFTW-MPI-Performance-Tips.html#index-load-balancing-424">load balancing</a>: <a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a></li>
+<li><a href="Load-balancing.html#index-load-balancing-378">load balancing</a>: <a href="Load-balancing.html#Load-balancing">Load balancing</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-MIPS-PS-107">MIPS PS</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Generating-your-own-code.html#index-monadic-programming-626">monadic programming</a>: <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a></li>
+<li><a href="Installation-on-Unix.html#index-MPI-614">MPI</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="Distributed_002dmemory-FFTW-with-MPI.html#index-MPI-344">MPI</a>: <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-MPI-communicator-496">MPI communicator</a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="MPI-Plan-Creation.html#index-MPI-communicator-466">MPI communicator</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Using-MPI-Plans.html#index-MPI-communicator-439">MPI communicator</a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="Distributed_002dmemory-FFTW-with-MPI.html#index-MPI-communicator-347">MPI communicator</a>: <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-MPI-I_002fO-412">MPI I/O</a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="2d-MPI-example.html#index-MPI-I_002fO-367">MPI I/O</a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-mpicc-351"><code>mpicc</code></a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="FFTW-MPI-Installation.html#index-mpicc-350"><code>mpicc</code></a>: <a href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">FFTW MPI Installation</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-new_002darray-execution-499">new-array execution</a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="MPI-Plan-Creation.html#index-new_002darray-execution-490">new-array execution</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Using-MPI-Plans.html#index-new_002darray-execution-440">new-array execution</a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-new_002darray-execution-265">new-array execution</a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#index-normalization-324">normalization</a>: <a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-normalization-321">normalization</a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-normalization-311">normalization</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="The-1d-Real_002ddata-DFT.html#index-normalization-300">normalization</a>: <a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a></li>
+<li><a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#index-normalization-294">normalization</a>: <a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-normalization-213">normalization</a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-normalization-198">normalization</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Complex-DFTs.html#index-normalization-170">normalization</a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="The-Discrete-Hartley-Transform.html#index-normalization-101">normalization</a>: <a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-normalization-96">normalization</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-normalization-78">normalization</a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-normalization-64">normalization</a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-normalization-35">normalization</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="How-Many-Threads-to-Use_003f.html#index-number-of-threads-339">number of threads</a>: <a href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">How Many Threads to Use?</a></li>
+<li><a href="Thread-safety.html#index-OpenMP-342">OpenMP</a>: <a href="Thread-safety.html#Thread-safety">Thread safety</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-OpenMP-332">OpenMP</a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="Installation-and-Supported-Hardware_002fSoftware.html#index-OpenMP-331">OpenMP</a>: <a href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">Installation and Supported Hardware/Software</a></li>
+<li><a href="Real_002ddata-DFT-Array-Format.html#index-out_002dof_002dplace-200">out-of-place</a>: <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a></li>
+<li><a href="Planner-Flags.html#index-out_002dof_002dplace-177">out-of-place</a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Reversing-array-dimensions.html#index-padding-524">padding</a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-padding-386">padding</a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="Real_002ddata-DFT-Array-Format.html#index-padding-202">padding</a>: <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-padding-191">padding</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-padding-63">padding</a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-padding-47">padding</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Distributed_002dmemory-FFTW-with-MPI.html#index-parallel-transform-345">parallel transform</a>: <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a></li>
+<li><a href="Multi_002dthreaded-FFTW.html#index-parallel-transform-326">parallel transform</a>: <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-partial-order-43">partial order</a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-plan-20">plan</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Introduction.html#index-plan-5">plan</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Introduction.html#index-planner-4">planner</a>: <a href="Introduction.html#Introduction">Introduction</a></li>
+<li><a href="Installation-and-Customization.html#index-portability-606">portability</a>: <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a></li>
+<li><a href="Wisdom-of-Fortran_003f.html#index-portability-598">portability</a>: <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a></li>
+<li><a href="Fortran_002dinterface-routines.html#index-portability-578">portability</a>: <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-portability-553">portability</a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Calling-FFTW-from-Modern-Fortran.html#index-portability-501">portability</a>: <a href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a></li>
+<li><a href="Installation-and-Supported-Hardware_002fSoftware.html#index-portability-330">portability</a>: <a href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">Installation and Supported Hardware/Software</a></li>
+<li><a href="Complex-numbers.html#index-portability-142">portability</a>: <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-portability-135">portability</a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-portability-109">portability</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Installation-on-Unix.html#index-precision-610">precision</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-precision-529">precision</a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Extended-and-quadruple-precision-in-Fortran.html#index-precision-515">precision</a>: <a href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">Extended and quadruple precision in Fortran</a></li>
+<li><a href="MPI-Files-and-Data-Types.html#index-precision-433">precision</a>: <a href="MPI-Files-and-Data-Types.html#MPI-Files-and-Data-Types">MPI Files and Data Types</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-precision-353">precision</a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="Memory-Allocation.html#index-precision-151">precision</a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="Precision.html#index-precision-143">precision</a>: <a href="Precision.html#Precision">Precision</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-precision-108">precision</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-precision-54">precision</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-precision-37">precision</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="MPI-Plan-Creation.html#index-r2c-475">r2c</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Multi_002ddimensional-Transforms.html#index-r2c-325">r2c</a>: <a href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms">Multi-dimensional Transforms</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-r2c-189">r2c</a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-r2c-73">r2c</a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-r2c-50">r2c</a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Fortran-Examples.html#index-r2c_002fc2r-multi_002ddimensional-array-format-597">r2c/c2r multi-dimensional array format</a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Reversing-array-dimensions.html#index-r2c_002fc2r-multi_002ddimensional-array-format-520">r2c/c2r multi-dimensional array format</a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Real_002ddata-DFT-Array-Format.html#index-r2c_002fc2r-multi_002ddimensional-array-format-199">r2c/c2r multi-dimensional array format</a>: <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-r2c_002fc2r-multi_002ddimensional-array-format-62">r2c/c2r multi-dimensional array format</a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-r2hc-74">r2hc</a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="MPI-Plan-Creation.html#index-r2r-486">r2r</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#index-r2r-393">r2r</a>: <a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a></li>
+<li><a href="The-1d-Real_002ddata-DFT.html#index-r2r-297">r2r</a>: <a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-r2r-203">r2r</a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-r2r-65">r2r</a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-rank-41">rank</a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-real_002deven-DFT-301">real-even DFT</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-real_002deven-DFT-79">real-even DFT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-real_002dodd-DFT-312">real-odd DFT</a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-real_002dodd-DFT-81">real-odd DFT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Generating-your-own-code.html#index-REDFT-625">REDFT</a>: <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-REDFT-302">REDFT</a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-REDFT-80">REDFT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-RODFT-313">RODFT</a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-RODFT-82">RODFT</a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Reversing-array-dimensions.html#index-row_002dmajor-517">row-major</a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-row_002dmajor-389">row-major</a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-row_002dmajor-371">row-major</a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="Guru-vector-and-transform-sizes.html#index-row_002dmajor-247">row-major</a>: <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-row_002dmajor-209">row-major</a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="Complex-DFTs.html#index-row_002dmajor-165">row-major</a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="Row_002dmajor-Format.html#index-row_002dmajor-115">row-major</a>: <a href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a></li>
+<li><a href="Accessing-the-wisdom-API-from-Fortran.html#index-saving-plans-to-disk-565">saving plans to disk</a>: <a href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-saving-plans-to-disk-411">saving plans to disk</a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="Wisdom.html#index-saving-plans-to-disk-277">saving plans to disk</a>: <a href="Wisdom.html#Wisdom">Wisdom</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-saving-plans-to-disk-125">saving plans to disk</a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Multi_002dthreaded-FFTW.html#index-shared_002dmemory-327">shared-memory</a>: <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-SIMD-512">SIMD</a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-SIMD-102">SIMD</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-SIMD-18">SIMD</a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Interleaved-and-split-arrays.html#index-split-format-242">split format</a>: <a href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays">Interleaved and split arrays</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-SSE-103">SSE</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-SSE2-104">SSE2</a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="MPI-Plan-Creation.html#index-stride-470">stride</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Guru-vector-and-transform-sizes.html#index-stride-244">stride</a>: <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a></li>
+<li><a href="Advanced-Complex-DFTs.html#index-stride-235">stride</a>: <a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a></li>
+<li><a href="Row_002dmajor-Format.html#index-stride-117">stride</a>: <a href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-thread-safety-431">thread safety</a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="Thread-safety.html#index-thread-safety-343">thread safety</a>: <a href="Thread-safety.html#Thread-safety">Thread safety</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-thread-safety-337">thread safety</a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="Installation-on-Unix.html#index-threads-613">threads</a>: <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-threads-427">threads</a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="Thread-safety.html#index-threads-341">threads</a>: <a href="Thread-safety.html#Thread-safety">Thread safety</a></li>
+<li><a href="Multi_002dthreaded-FFTW.html#index-threads-328">threads</a>: <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a></li>
+<li><a href="MPI-Plan-Creation.html#index-transpose-487">transpose</a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-transpose-432">transpose</a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="FFTW-MPI-Performance-Tips.html#index-transpose-425">transpose</a>: <a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a></li>
+<li><a href="FFTW-MPI-Transposes.html#index-transpose-396">transpose</a>: <a href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-transpose-390">transpose</a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="Transposed-distributions.html#index-transpose-379">transpose</a>: <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a></li>
+<li><a href="Guru-Interface.html#index-vector-240">vector</a>: <a href="Guru-Interface.html#Guru-Interface">Guru Interface</a></li>
+<li><a href="Accessing-the-wisdom-API-from-Fortran.html#index-wisdom-564">wisdom</a>: <a href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-wisdom-410">wisdom</a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="Wisdom.html#index-wisdom-276">wisdom</a>: <a href="Wisdom.html#Wisdom">Wisdom</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-wisdom-124">wisdom</a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-wisdom_002c-problems-with-133">wisdom, problems with</a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+<li><a href="Wisdom-Import.html#index-wisdom_002c-system_002dwide-287">wisdom, system-wide</a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-wisdom_002c-system_002dwide-136">wisdom, system-wide</a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+   </ul></body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Cycle-Counters.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Cycle-Counters.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+<html lang="en">
+<head>
+<title>Cycle Counters - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="prev" href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems" title="Installation on non-Unix systems">
+<link rel="next" href="Generating-your-own-code.html#Generating-your-own-code" title="Generating your own code">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Cycle-Counters"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems">Installation on non-Unix systems</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>
+<hr>
+</div>
+
+<h3 class="section">10.3 Cycle Counters</h3>
+
+<p><a name="index-cycle-counter-619"></a>
+FFTW's planner actually executes and times different possible FFT
+algorithms in order to pick the fastest plan for a given n.  In
+order to do this in as short a time as possible, however, the timer must
+have a very high resolution, and to accomplish this we employ the
+hardware <dfn>cycle counters</dfn> that are available on most CPUs. 
+Currently, FFTW supports the cycle counters on x86, PowerPC/POWER, Alpha,
+UltraSPARC (SPARC v9), IA64, PA-RISC, and MIPS processors.
+
+   <p><a name="index-compiler-620"></a>Access to the cycle counters, unfortunately, is a compiler and/or
+operating-system dependent task, often requiring inline assembly
+language, and it may be that your compiler is not supported.  If you are
+<em>not</em> supported, FFTW will by default fall back on its estimator
+(effectively using <code>FFTW_ESTIMATE</code> for all plans). 
+<a name="index-FFTW_005fESTIMATE-621"></a>
+You can add support by editing the file <code>kernel/cycle.h</code>; normally,
+this will involve adapting one of the examples already present in order
+to use the inline-assembler syntax for your C compiler, and will only
+require a couple of lines of code.  Anyone adding support for a new
+system to <code>cycle.h</code> is encouraged to email us at <a href="mailto:fftw@fftw.org">fftw@fftw.org</a>.
+
+   <p>If a cycle counter is not available on your system (e.g. some embedded
+processor), and you don't want to use estimated plans, as a last resort
+you can use the <code>--with-slow-timer</code> option to <code>configure</code> (on
+Unix) or <code>#define WITH_SLOW_TIMER</code> in <code>config.h</code> (elsewhere). 
+This will use the much lower-resolution <code>gettimeofday</code> function, or even
+<code>clock</code> if the former is unavailable, and planning will be
+extremely slow.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Data-Types-and-Files.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Data-Types-and-Files.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,74 @@
+<html lang="en">
+<head>
+<title>Data Types and Files - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="next" href="Using-Plans.html#Using-Plans" title="Using Plans">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Data-Types-and-Files"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Using-Plans.html#Using-Plans">Using Plans</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.1 Data Types and Files</h3>
+
+<p>All programs using FFTW should include its header file:
+
+<pre class="example">     #include &lt;fftw3.h&gt;
+</pre>
+   <p>You must also link to the FFTW library.  On Unix, this
+means adding <code>-lfftw3 -lm</code> at the <em>end</em> of the link command.
+
+<ul class="menu">
+<li><a accesskey="1" href="Complex-numbers.html#Complex-numbers">Complex numbers</a>
+<li><a accesskey="2" href="Precision.html#Precision">Precision</a>
+<li><a accesskey="3" href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Defining-an-FFTW-module.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Defining-an-FFTW-module.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,81 @@
+<html lang="en">
+<head>
+<title>Defining an FFTW module - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Defining-an-FFTW-module"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.7 Defining an FFTW module</h3>
+
+<p>Rather than using the <code>include</code> statement to include the
+<code>fftw3.f03</code> interface file in any subroutine where you want to
+use FFTW, you might prefer to define an FFTW Fortran module.  FFTW
+does not install itself as a module, primarily because
+<code>fftw3.f03</code> can be shared between different Fortran compilers while
+modules (in general) cannot.  However, it is trivial to define your
+own FFTW module if you want.  Just create a file containing:
+
+<pre class="example">       module FFTW3
+         use, intrinsic :: iso_c_binding
+         include 'fftw3.f03'
+       end module
+</pre>
+   <p>Compile this file into a module as usual for your compiler (e.g. with
+<code>gfortran -c</code> you will get a file <code>fftw3.mod</code>).  Now,
+instead of <code>include 'fftw3.f03'</code>, whenever you want to use FFTW
+routines you can just do:
+
+<pre class="example">       use FFTW3
+</pre>
+   <p>as usual for Fortran modules.  (You still need to link to the FFTW
+library, of course.)
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Distributed_002dmemory-FFTW-with-MPI.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Distributed_002dmemory-FFTW-with-MPI.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,115 @@
+<html lang="en">
+<head>
+<title>Distributed-memory FFTW with MPI - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="next" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Distributed-memory-FFTW-with-MPI"></a>
+<a name="Distributed_002dmemory-FFTW-with-MPI"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">6 Distributed-memory FFTW with MPI</h2>
+
+<p><a name="index-MPI-344"></a>
+<a name="index-parallel-transform-345"></a>In this chapter we document the parallel FFTW routines for parallel
+systems supporting the MPI message-passing interface.  Unlike the
+shared-memory threads described in the previous chapter, MPI allows
+you to use <em>distributed-memory</em> parallelism, where each CPU has
+its own separate memory, and which can scale up to clusters of many
+thousands of processors.  This capability comes at a price, however:
+each process only stores a <em>portion</em> of the data to be
+transformed, which means that the data structures and
+programming-interface are quite different from the serial or threads
+versions of FFTW. 
+<a name="index-data-distribution-346"></a>
+
+   <p>Distributed-memory parallelism is especially useful when you are
+transforming arrays so large that they do not fit into the memory of a
+single processor.  The storage per-process required by FFTW's MPI
+routines is proportional to the total array size divided by the number
+of processes.  Conversely, distributed-memory parallelism can easily
+pose an unacceptably high communications overhead for small problems;
+the threshold problem size for which parallelism becomes advantageous
+will depend on the precise problem you are interested in, your
+hardware, and your MPI implementation.
+
+   <p>A note on terminology: in MPI, you divide the data among a set of
+&ldquo;processes&rdquo; which each run in their own memory address space. 
+Generally, each process runs on a different physical processor, but
+this is not required.  A set of processes in MPI is described by an
+opaque data structure called a &ldquo;communicator,&rdquo; the most common of
+which is the predefined communicator <code>MPI_COMM_WORLD</code> which
+refers to <em>all</em> processes.  For more information on these and
+other concepts common to all MPI programs, we refer the reader to the
+documentation at <a href="http://www.mcs.anl.gov/research/projects/mpi/">the MPI home page</a>. 
+<a name="index-MPI-communicator-347"></a><a name="index-MPI_005fCOMM_005fWORLD-348"></a>
+
+   <p>We assume in this chapter that the reader is familiar with the usage
+of the serial (uniprocessor) FFTW, and focus only on the concepts new
+to the MPI interface.
+
+<ul class="menu">
+<li><a accesskey="1" href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">FFTW MPI Installation</a>
+<li><a accesskey="2" href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a>
+<li><a accesskey="3" href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a>
+<li><a accesskey="4" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>
+<li><a accesskey="5" href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a>
+<li><a accesskey="6" href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a>
+<li><a accesskey="7" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>
+<li><a accesskey="8" href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a>
+<li><a accesskey="9" href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a>
+<li><a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a>
+<li><a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a>
+<li><a href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<li><a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Dynamic-Arrays-in-C.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Dynamic-Arrays-in-C.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,91 @@
+<html lang="en">
+<head>
+<title>Dynamic Arrays in C - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="prev" href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C" title="Fixed-size Arrays in C">
+<link rel="next" href="Dynamic-Arrays-in-C_002dThe-Wrong-Way.html#Dynamic-Arrays-in-C_002dThe-Wrong-Way" title="Dynamic Arrays in C-The Wrong Way">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Dynamic-Arrays-in-C"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Dynamic-Arrays-in-C_002dThe-Wrong-Way.html#Dynamic-Arrays-in-C_002dThe-Wrong-Way">Dynamic Arrays in C-The Wrong Way</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C">Fixed-size Arrays in C</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<hr>
+</div>
+
+<h4 class="subsection">3.2.4 Dynamic Arrays in C</h4>
+
+<p>We recommend allocating most arrays dynamically, with
+<code>fftw_malloc</code>.  This isn't too hard to do, although it is not as
+straightforward for multi-dimensional arrays as it is for
+one-dimensional arrays.
+
+   <p>Creating the array is simple: using a dynamic-allocation routine like
+<code>fftw_malloc</code>, allocate an array big enough to store N
+<code>fftw_complex</code> values (for a complex DFT), where N is the product
+of the sizes of the array dimensions (i.e. the total number of complex
+values in the array).  For example, here is code to allocate a
+5&nbsp;&times;&nbsp;12&nbsp;&times;&nbsp;27 rank-3 array:
+<a name="index-fftw_005fmalloc-121"></a>
+<pre class="example">     fftw_complex *an_array;
+     an_array = (fftw_complex*) fftw_malloc(5*12*27 * sizeof(fftw_complex));
+</pre>
+   <p>Accessing the array elements, however, is more tricky&mdash;you can't
+simply use multiple applications of the &lsquo;<samp><span class="samp">[]</span></samp>&rsquo; operator like you
+could for fixed-size arrays.  Instead, you have to explicitly compute
+the offset into the array using the formula given earlier for
+row-major arrays.  For example, to reference the (i,j,k)-th
+element of the array allocated above, you would use the expression
+<code>an_array[k + 27 * (j + 12 * i)]</code>.
+
+   <p>This pain can be alleviated somewhat by defining appropriate macros,
+or, in C++, creating a class and overloading the &lsquo;<samp><span class="samp">()</span></samp>&rsquo; operator. 
+The recent C99 standard provides a way to reinterpret the dynamic
+array as a &ldquo;variable-length&rdquo; multi-dimensional array amenable to
+&lsquo;<samp><span class="samp">[]</span></samp>&rsquo;, but this feature is not yet widely supported by compilers. 
+<a name="index-C99-122"></a><a name="index-C_002b_002b-123"></a>
+<!-- =========> -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Dynamic-Arrays-in-C_002dThe-Wrong-Way.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Dynamic-Arrays-in-C_002dThe-Wrong-Way.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,93 @@
+<html lang="en">
+<head>
+<title>Dynamic Arrays in C-The Wrong Way - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="prev" href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C" title="Dynamic Arrays in C">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Dynamic-Arrays-in-C-The-Wrong-Way"></a>
+<a name="Dynamic-Arrays-in-C_002dThe-Wrong-Way"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<hr>
+</div>
+
+<h4 class="subsection">3.2.5 Dynamic Arrays in C&mdash;The Wrong Way</h4>
+
+<p>A different method for allocating multi-dimensional arrays in C is
+often suggested that is incompatible with FFTW: <em>using it will
+cause FFTW to die a painful death</em>.  We discuss the technique here,
+however, because it is so commonly known and used.  This method is to
+create arrays of pointers of arrays of pointers of <small class="dots">...</small>etcetera. 
+For example, the analogue in this method to the example above is:
+
+<pre class="example">     int i,j;
+     fftw_complex ***a_bad_array;  /* <span class="roman">another way to make a 5x12x27 array</span> */
+     
+     a_bad_array = (fftw_complex ***) malloc(5 * sizeof(fftw_complex **));
+     for (i = 0; i &lt; 5; ++i) {
+          a_bad_array[i] =
+             (fftw_complex **) malloc(12 * sizeof(fftw_complex *));
+          for (j = 0; j &lt; 12; ++j)
+               a_bad_array[i][j] =
+                     (fftw_complex *) malloc(27 * sizeof(fftw_complex));
+     }
+</pre>
+   <p>As you can see, this sort of array is inconvenient to allocate (and
+deallocate).  On the other hand, it has the advantage that the
+(i,j,k)-th element can be referenced simply by
+<code>a_bad_array[i][j][k]</code>.
+
+   <p>If you like this technique and want to maximize convenience in accessing
+the array, but still want to pass the array to FFTW, you can use a
+hybrid method.  Allocate the array as one contiguous block, but also
+declare an array of arrays of pointers that point to appropriate places
+in the block.  That sort of trick is beyond the scope of this
+documentation; for more information on multi-dimensional arrays in C,
+see the <code>comp.lang.c</code>
+<a href="http://c-faq.com/aryptr/dynmuldimary.html">FAQ</a>.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Extended-and-quadruple-precision-in-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Extended-and-quadruple-precision-in-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,86 @@
+<html lang="en">
+<head>
+<title>Extended and quadruple precision in Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface" title="Overview of Fortran interface">
+<link rel="prev" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface" title="Overview of Fortran interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Extended-and-quadruple-precision-in-Fortran"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">7.1.1 Extended and quadruple precision in Fortran</h4>
+
+<p><a name="index-precision-515"></a>
+If FFTW is compiled in <code>long double</code> (extended) precision
+(see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>), you may be able to call the
+resulting <code>fftwl_</code> routines (see <a href="Precision.html#Precision">Precision</a>) from Fortran if
+your compiler supports the <code>C_LONG_DOUBLE_COMPLEX</code> type code.
+
+   <p>Because some Fortran compilers do not support
+<code>C_LONG_DOUBLE_COMPLEX</code>, the <code>fftwl_</code> declarations are
+segregated into a separate interface file <code>fftw3l.f03</code>, which you
+should include <em>in addition</em> to <code>fftw3.f03</code> (which declares
+precision-independent &lsquo;<samp><span class="samp">FFTW_</span></samp>&rsquo; constants):
+
+   <p><a name="index-iso_005fc_005fbinding-516"></a>
+<pre class="example">       use, intrinsic :: iso_c_binding
+       include 'fftw3.f03'
+       include 'fftw3l.f03'
+</pre>
+   <p>We also support using the nonstandard <code>__float128</code>
+quadruple-precision type provided by recent versions of <code>gcc</code> on
+32- and 64-bit x86 hardware (see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>),
+using the corresponding <code>real(16)</code> and <code>complex(16)</code> types
+supported by <code>gfortran</code>.  The quadruple-precision &lsquo;<samp><span class="samp">fftwq_</span></samp>&rsquo;
+functions (see <a href="Precision.html#Precision">Precision</a>) are declared in a <code>fftw3q.f03</code>
+interface file, which should be included in addition to
+<code>fftw3l.f03</code>, as above.  You should also link with
+<code>-lfftw3q -lquadmath -lm</code> as in C.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-Constants-in-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-Constants-in-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,80 @@
+<html lang="en">
+<head>
+<title>FFTW Constants in Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="prev" href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines" title="Fortran-interface routines">
+<link rel="next" href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran" title="FFTW Execution in Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-Constants-in-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">8.2 FFTW Constants in Fortran</h3>
+
+<p>When creating plans in FFTW, a number of constants are used to specify
+options, such as <code>FFTW_MEASURE</code> or <code>FFTW_ESTIMATE</code>.  The
+same constants must be used with the wrapper routines, but of course the
+C header files where the constants are defined can't be incorporated
+directly into Fortran code.
+
+   <p>Instead, we have placed Fortran equivalents of the FFTW constant
+definitions in the file <code>fftw3.f</code>, which can be found in the same
+directory as <code>fftw3.h</code>.  If your Fortran compiler supports a
+preprocessor of some sort, you should be able to <code>include</code> or
+<code>#include</code> this file; otherwise, you can paste it directly into
+your code.
+
+   <p><a name="index-flags-583"></a>In C, you combine different flags (like <code>FFTW_PRESERVE_INPUT</code> and
+<code>FFTW_MEASURE</code>) using the &lsquo;<samp><code>|</code></samp>&rsquo; operator; in Fortran
+you should just use &lsquo;<samp><code>+</code></samp>&rsquo;.  (Take care not to add in the
+same flag more than once, though.  Alternatively, you can use the
+<code>ior</code> intrinsic function standardized in Fortran 95.)
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-Execution-in-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-Execution-in-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+<html lang="en">
+<head>
+<title>FFTW Execution in Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="prev" href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran" title="FFTW Constants in Fortran">
+<link rel="next" href="Fortran-Examples.html#Fortran-Examples" title="Fortran Examples">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-Execution-in-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran">FFTW Constants in Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">8.3 FFTW Execution in Fortran</h3>
+
+<p>In C, in order to use a plan, one normally calls <code>fftw_execute</code>,
+which executes the plan to perform the transform on the input/output
+arrays passed when the plan was created (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).  The
+corresponding subroutine call in legacy Fortran is:
+<pre class="example">             call dfftw_execute(plan)
+</pre>
+   <p><a name="index-dfftw_005fexecute-584"></a>
+However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+<code>dfftw_execute</code>, the semantics of Fortran (unlike C) allow the
+compiler to assume that the input/output arrays are not changed by
+<code>dfftw_execute</code>.  As a consequence, certain compilers end up
+optimizing out or repositioning the call to <code>dfftw_execute</code>,
+assuming incorrectly that it does nothing.
+
+   <p>There are various workarounds to this, but the safest and simplest
+thing is to not use <code>dfftw_execute</code> in Fortran.  Instead, use the
+functions described in <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays
+<code>in</code> and <code>out</code>, you would do:
+<pre class="example">             call dfftw_execute_dft(plan, in, out)
+</pre>
+   <p><a name="index-dfftw_005fexecute_005fdft-585"></a>
+There are a few things to be careful of, however:
+
+     <ul>
+<li>You must use the correct type of execute function, matching the way
+the plan was created.  Complex DFT plans should use
+<code>dfftw_execute_dft</code>, Real-input (r2c) DFT plans should use use
+<code>dfftw_execute_dft_r2c</code>, and real-output (c2r) DFT plans should
+use <code>dfftw_execute_dft_c2r</code>.  The various r2r plans should use
+<code>dfftw_execute_r2r</code>.
+
+     <li>You should normally pass the same input/output arrays that were used when
+creating the plan.  This is always safe.
+
+     <li><em>If</em> you pass <em>different</em> input/output arrays compared to
+those used when creating the plan, you must abide by all the
+restrictions of the new-array execute functions (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>).  The most difficult of these, in Fortran, is the
+requirement that the new arrays have the same alignment as the
+original arrays, because there seems to be no way in legacy Fortran to obtain
+guaranteed-aligned arrays (analogous to <code>fftw_malloc</code> in C).  You
+can, of course, use the <code>FFTW_UNALIGNED</code> flag when creating the
+plan, in which case the plan does not depend on the alignment, but
+this may sacrifice substantial performance on architectures (like x86)
+with SIMD instructions (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>). 
+<a name="index-FFTW_005fUNALIGNED-586"></a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-Fortran-type-reference.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-Fortran-type-reference.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,129 @@
+<html lang="en">
+<head>
+<title>FFTW Fortran type reference - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Reversing-array-dimensions.html#Reversing-array-dimensions" title="Reversing array dimensions">
+<link rel="next" href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran" title="Plan execution in Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-Fortran-type-reference"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.3 FFTW Fortran type reference</h3>
+
+<p>The following are the most important type correspondences between the
+C interface and Fortran:
+
+     <ul>
+<li><a name="index-fftw_005fplan-527"></a>Plans (<code>fftw_plan</code> and variants) are <code>type(C_PTR)</code> (i.e. an
+opaque pointer).
+
+     <li><a name="index-fftw_005fcomplex-528"></a><a name="index-precision-529"></a><a name="index-C_005fDOUBLE-530"></a><a name="index-C_005fFLOAT-531"></a><a name="index-C_005fLONG_005fDOUBLE-532"></a><a name="index-C_005fDOUBLE_005fCOMPLEX-533"></a><a name="index-C_005fFLOAT_005fCOMPLEX-534"></a><a name="index-C_005fLONG_005fDOUBLE_005fCOMPLEX-535"></a>The C floating-point types <code>double</code>, <code>float</code>, and <code>long
+double</code> correspond to <code>real(C_DOUBLE)</code>, <code>real(C_FLOAT)</code>, and
+<code>real(C_LONG_DOUBLE)</code>, respectively.  The C complex types
+<code>fftw_complex</code>, <code>fftwf_complex</code>, and <code>fftwl_complex</code>
+correspond in Fortran to <code>complex(C_DOUBLE_COMPLEX)</code>,
+<code>complex(C_FLOAT_COMPLEX)</code>, and
+<code>complex(C_LONG_DOUBLE_COMPLEX)</code>, respectively. 
+Just as in C
+(see <a href="Precision.html#Precision">Precision</a>), the FFTW subroutines and types are prefixed with
+&lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo;, <code>fftwf_</code>, and <code>fftwl_</code> for the different precisions, and link to different libraries (<code>-lfftw3</code>, <code>-lfftw3f</code>, and <code>-lfftw3l</code> on Unix), but use the <em>same</em> include file <code>fftw3.f03</code> and the <em>same</em> constants (all of which begin with &lsquo;<samp><span class="samp">FFTW_</span></samp>&rsquo;).  The exception is <code>long double</code> precision, for which you should <em>also</em> include <code>fftw3l.f03</code> (see <a href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">Extended and quadruple precision in Fortran</a>).
+
+     <li><a name="index-ptrdiff_005ft-536"></a><a name="index-C_005fINT-537"></a><a name="index-C_005fINTPTR_005fT-538"></a><a name="index-C_005fSIZE_005fT-539"></a><a name="index-fftw_005fmalloc-540"></a>The C integer types <code>int</code> and <code>unsigned</code> (used for planner
+flags) become <code>integer(C_INT)</code>.  The C integer type <code>ptrdiff_t</code> (e.g. in the <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a>) becomes <code>integer(C_INTPTR_T)</code>, and <code>size_t</code> (in <code>fftw_malloc</code> etc.) becomes <code>integer(C_SIZE_T)</code>.
+
+     <li><a name="index-fftw_005fr2r_005fkind-541"></a><a name="index-C_005fFFTW_005fR2R_005fKIND-542"></a>The <code>fftw_r2r_kind</code> type (see <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>)
+becomes <code>integer(C_FFTW_R2R_KIND)</code>.  The various constant values
+of the C enumerated type (<code>FFTW_R2HC</code> etc.) become simply integer
+constants of the same names in Fortran.
+
+     <li><a name="index-FFTW_005fDESTROY_005fINPUT-543"></a><a name="index-in_002dplace-544"></a><a name="index-fftw_005fflops-545"></a>Numeric array pointer arguments (e.g. <code>double *</code>)
+become <code>dimension(*), intent(out)</code> arrays of the same type, or
+<code>dimension(*), intent(in)</code> if they are pointers to constant data
+(e.g. <code>const int *</code>).  There are a few exceptions where numeric
+pointers refer to scalar outputs (e.g. for <code>fftw_flops</code>), in which
+case they are <code>intent(out)</code> scalar arguments in Fortran too. 
+For the new-array execute functions (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>),
+the input arrays are declared <code>dimension(*), intent(inout)</code>, since
+they can be modified in the case of in-place or <code>FFTW_DESTROY_INPUT</code>
+transforms.
+
+     <li><a name="index-fftw_005falloc_005freal-546"></a><a name="index-c_005ff_005fpointer-547"></a>Pointer <em>return</em> values (e.g <code>double *</code>) become
+<code>type(C_PTR)</code>.  (If they are pointers to arrays, as for
+<code>fftw_alloc_real</code>, you can convert them back to Fortran array
+pointers with the standard intrinsic function <code>c_f_pointer</code>.)
+
+     <li><a name="index-guru-interface-548"></a><a name="index-fftw_005fiodim-549"></a><a name="index-fftw_005fiodim64-550"></a><a name="index-g_t64_002dbit-architecture-551"></a>The <code>fftw_iodim</code> type in the guru interface (see <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>) becomes <code>type(fftw_iodim)</code> in Fortran, a
+derived data type (the Fortran analogue of C's <code>struct</code>) with
+three <code>integer(C_INT)</code> components: <code>n</code>, <code>is</code>, and
+<code>os</code>, with the same meanings as in C.  The <code>fftw_iodim64</code> type in the 64-bit guru interface (see <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a>) is the same, except that its components are of type <code>integer(C_INTPTR_T)</code>.
+
+     <li><a name="index-C_005fFUNPTR-552"></a>Using the wisdom import/export functions from Fortran is a bit tricky,
+and is discussed in <a href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>.  In
+brief, the <code>FILE *</code> arguments map to <code>type(C_PTR)</code>, <code>const char *</code> to <code>character(C_CHAR), dimension(*), intent(in)</code> (null-terminated!), and the generic read-char/write-char functions map to <code>type(C_FUNPTR)</code>.
+
+   </ul>
+
+   <p><a name="index-portability-553"></a>You may be wondering if you need to search-and-replace
+<code>real(kind(0.0d0))</code> (or whatever your favorite Fortran spelling
+of &ldquo;double precision&rdquo; is) with <code>real(C_DOUBLE)</code> everywhere in
+your program, and similarly for <code>complex</code> and <code>integer</code>
+types.  The answer is no; you can still use your existing types.  As
+long as these types match their C counterparts, things should work
+without a hitch.  The worst that can happen, e.g. in the (unlikely)
+event of a system where <code>real(kind(0.0d0))</code> is different from
+<code>real(C_DOUBLE)</code>, is that the compiler will give you a
+type-mismatch error.  That is, if you don't use the
+<code>iso_c_binding</code> kinds you need to accept at least the theoretical
+possibility of having to change your code in response to compiler
+errors on some future machine, but you don't need to worry about
+silently compiling incorrect code that yields runtime errors.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Fortran-Interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Fortran-Interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,189 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Fortran Interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Fortran-Interface"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.13 FFTW MPI Fortran Interface</h3>
+
+<p><a name="index-Fortran-interface-494"></a>
+<a name="index-iso_005fc_005fbinding-495"></a>The FFTW MPI interface is callable from modern Fortran compilers
+supporting the Fortran 2003 <code>iso_c_binding</code> standard for calling
+C functions.  As described in <a href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>,
+this means that you can directly call FFTW's C interface from Fortran
+with only minor changes in syntax.  There are, however, a few things
+specific to the MPI interface to keep in mind:
+
+     <ul>
+<li>Instead of including <code>fftw3.f03</code> as in <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a>, you should <code>include 'fftw3-mpi.f03'</code> (after
+<code>use, intrinsic :: iso_c_binding</code> as before).  The
+<code>fftw3-mpi.f03</code> file includes <code>fftw3.f03</code>, so you should
+<em>not</em> <code>include</code> them both yourself.  (You will also want to
+include the MPI header file, usually via <code>include 'mpif.h'</code> or
+similar, although though this is not needed by <code>fftw3-mpi.f03</code>
+<i>per se</i>.)  (To use the &lsquo;<samp><span class="samp">fftwl_</span></samp>&rsquo; <code>long double</code> extended-precision routines in supporting compilers, you should include <code>fftw3f-mpi.f03</code> in <em>addition</em> to <code>fftw3-mpi.f03</code>. See <a href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">Extended and quadruple precision in Fortran</a>.)
+
+     <li>Because of the different storage conventions between C and Fortran,
+you reverse the order of your array dimensions when passing them to
+FFTW (see <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>).  This is merely a
+difference in notation and incurs no performance overhead.  However,
+it means that, whereas in C the <em>first</em> dimension is distributed,
+in Fortran the <em>last</em> dimension of your array is distributed.
+
+     <li><a name="index-MPI-communicator-496"></a>In Fortran, communicators are stored as <code>integer</code> types; there is
+no <code>MPI_Comm</code> type, nor is there any way to access a C
+<code>MPI_Comm</code>.  Fortunately, this is taken care of for you by the
+FFTW Fortran interface: whenever the C interface expects an
+<code>MPI_Comm</code> type, you should pass the Fortran communicator as an
+<code>integer</code>.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>
+
+     <li>Because you need to call the &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; function to find out
+how much space to allocate, and this may be <em>larger</em> than the
+local portion of the array (see <a href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>), you should
+<em>always</em> allocate your arrays dynamically using FFTW's allocation
+routines as described in <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>. 
+(Coincidentally, this also provides the best performance by
+guaranteeding proper data alignment.)
+
+     <li>Because all sizes in the MPI FFTW interface are declared as
+<code>ptrdiff_t</code> in C, you should use <code>integer(C_INTPTR_T)</code> in
+Fortran (see <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a>).
+
+     <li><a name="index-fftw_005fexecute_005fdft-497"></a><a name="index-fftw_005fmpi_005fexecute_005fdft-498"></a><a name="index-new_002darray-execution-499"></a>In Fortran, because of the language semantics, we generally recommend
+using the new-array execute functions for all plans, even in the
+common case where you are executing the plan on the same arrays for
+which the plan was created (see <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a>). 
+However, note that in the MPI interface these functions are changed:
+<code>fftw_execute_dft</code> becomes <code>fftw_mpi_execute_dft</code>,
+etcetera. See <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a>.
+
+   </ul>
+
+   <p>For example, here is a Fortran code snippet to perform a distributed
+L&nbsp;&times;&nbsp;M complex DFT in-place.  (This assumes you have already
+initialized MPI with <code>MPI_init</code> and have also performed
+<code>call fftw_mpi_init</code>.)
+
+<pre class="example">       use, intrinsic :: iso_c_binding
+       include 'fftw3-mpi.f03'
+       integer(C_INTPTR_T), parameter :: L = ...
+       integer(C_INTPTR_T), parameter :: M = ...
+       type(C_PTR) :: plan, cdata
+       complex(C_DOUBLE_COMPLEX), pointer :: data(:,:)
+       integer(C_INTPTR_T) :: i, j, alloc_local, local_M, local_j_offset
+     
+     !   <span class="roman">get local data size and allocate (note dimension reversal)</span>
+       alloc_local = fftw_mpi_local_size_2d(M, L, MPI_COMM_WORLD, &amp;
+                                            local_M, local_j_offset)
+       cdata = fftw_alloc_complex(alloc_local)
+       call c_f_pointer(cdata, data, [L,local_M])
+     
+     !   <span class="roman">create MPI plan for in-place forward DFT (note dimension reversal)</span>
+       plan = fftw_mpi_plan_dft_2d(M, L, data, data, MPI_COMM_WORLD, &amp;
+                                   FFTW_FORWARD, FFTW_MEASURE)
+     
+     ! <span class="roman">initialize data to some function</span> my_function(i,j)
+       do j = 1, local_M
+         do i = 1, L
+           data(i, j) = my_function(i, j + local_j_offset)
+         end do
+       end do
+     
+     ! <span class="roman">compute transform (as many times as desired)</span>
+       call fftw_mpi_execute_dft(plan, data, data)
+     
+       call fftw_destroy_plan(plan)
+       call fftw_free(cdata)
+</pre>
+   <p>Note that when we called <code>fftw_mpi_local_size_2d</code> and
+<code>fftw_mpi_plan_dft_2d</code> with the dimensions in reversed order,
+since a L&nbsp;&times;&nbsp;M Fortran array is viewed by FFTW in C as a
+M&nbsp;&times;&nbsp;L array.  This means that the array was distributed over
+the <code>M</code> dimension, the local portion of which is a
+L&nbsp;&times;&nbsp;local_M array in Fortran.  (You must <em>not</em> use an
+<code>allocate</code> statement to allocate an L&nbsp;&times;&nbsp;local_M array,
+however; you must allocate <code>alloc_local</code> complex numbers, which
+may be greater than <code>L * local_M</code>, in order to reserve space for
+intermediate steps of the transform.)  Finally, we mention that
+because C's array indices are zero-based, the <code>local_j_offset</code>
+argument can conveniently be interpreted as an offset in the 1-based
+<code>j</code> index (rather than as a starting index as in C).
+
+   <p>If instead you had used the <code>ior(FFTW_MEASURE,
+FFTW_MPI_TRANSPOSED_OUT)</code> flag, the output of the transform would be a
+transposed M&nbsp;&times;&nbsp;local_L array, associated with the <em>same</em>
+<code>cdata</code> allocation (since the transform is in-place), and which
+you could declare with:
+
+<pre class="example">       complex(C_DOUBLE_COMPLEX), pointer :: tdata(:,:)
+       ...
+       call c_f_pointer(cdata, tdata, [M,local_L])
+</pre>
+   <p>where <code>local_L</code> would have been obtained by changing the
+<code>fftw_mpi_local_size_2d</code> call to:
+
+<pre class="example">       alloc_local = fftw_mpi_local_size_2d_transposed(M, L, MPI_COMM_WORLD, &amp;
+                                local_M, local_j_offset, local_L, local_i_offset)
+</pre>
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> Technically, this is because you aren't
+actually calling the C functions directly. You are calling wrapper
+functions that translate the communicator with <code>MPI_Comm_f2c</code>
+before calling the ordinary C interface.  This is all done
+transparently, however, since the <code>fftw3-mpi.f03</code> interface file
+renames the wrappers so that they are called in Fortran with the same
+names as the C interface functions.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Installation.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Installation.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Installation - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="next" href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW" title="Linking and Initializing MPI FFTW">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Installation"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.1 FFTW MPI Installation</h3>
+
+<p>All of the FFTW MPI code is located in the <code>mpi</code> subdirectory of
+the FFTW package.  On Unix systems, the FFTW MPI libraries and header
+files are automatically configured, compiled, and installed along with
+the uniprocessor FFTW libraries simply by including
+<code>--enable-mpi</code> in the flags to the <code>configure</code> script
+(see <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a>). 
+<a name="index-configure-349"></a>
+
+   <p>Any implementation of the MPI standard, version 1 or later, should
+work with FFTW.  The <code>configure</code> script will attempt to
+automatically detect how to compile and link code using your MPI
+implementation.  In some cases, especially if you have multiple
+different MPI implementations installed or have an unusual MPI
+software package, you may need to provide this information explicitly.
+
+   <p>Most commonly, one compiles MPI code by invoking a special compiler
+command, typically <code>mpicc</code> for C code.  The <code>configure</code>
+script knows the most common names for this command, but you can
+specify the MPI compilation command explicitly by setting the
+<code>MPICC</code> variable, as in &lsquo;<samp><span class="samp">./configure MPICC=mpicc ...</span></samp>&rsquo;. 
+<a name="index-mpicc-350"></a>
+
+   <p>If, instead of a special compiler command, you need to link a certain
+library, you can specify the link command via the <code>MPILIBS</code>
+variable, as in &lsquo;<samp><span class="samp">./configure MPILIBS=-lmpi ...</span></samp>&rsquo;.  Note that if
+your MPI library is installed in a non-standard location (one the
+compiler does not know about by default), you may also have to specify
+the location of the library and header files via <code>LDFLAGS</code> and
+<code>CPPFLAGS</code> variables, respectively, as in &lsquo;<samp><span class="samp">./configure
+LDFLAGS=-L/path/to/mpi/libs CPPFLAGS=-I/path/to/mpi/include ...</span></samp>&rsquo;.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Performance-Tips.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Performance-Tips.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Performance Tips - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks" title="Avoiding MPI Deadlocks">
+<link rel="next" href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads" title="Combining MPI and Threads">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Performance-Tips"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.10 FFTW MPI Performance Tips</h3>
+
+<p>In this section, we collect a few tips on getting the best performance
+out of FFTW's MPI transforms.
+
+   <p>First, because of the 1d block distribution, FFTW's parallelization is
+currently limited by the size of the first dimension. 
+(Multidimensional block distributions may be supported by a future
+version.) More generally, you should ideally arrange the dimensions so
+that FFTW can divide them equally among the processes. See <a href="Load-balancing.html#Load-balancing">Load balancing</a>. 
+<a name="index-block-distribution-423"></a><a name="index-load-balancing-424"></a>
+
+   <p>Second, if it is not too inconvenient, you should consider working
+with transposed output for multidimensional plans, as this saves a
+considerable amount of communications.  See <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>. 
+<a name="index-transpose-425"></a>
+
+   <p>Third, the fastest choices are generally either an in-place transform
+or an out-of-place transform with the <code>FFTW_DESTROY_INPUT</code> flag
+(which allows the input array to be used as scratch space).  In-place
+is especially beneficial if the amount of data per process is large. 
+<a name="index-FFTW_005fDESTROY_005fINPUT-426"></a>
+
+   <p>Fourth, if you have multiple arrays to transform at once, rather than
+calling FFTW's MPI transforms several times it usually seems to be
+faster to interleave the data and use the advanced interface.  (This
+groups the communications together instead of requiring separate
+messages for each transform.)
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Reference.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Reference.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,73 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Reference - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads" title="Combining MPI and Threads">
+<link rel="next" href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface" title="FFTW MPI Fortran Interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Reference"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.12 FFTW MPI Reference</h3>
+
+<p>This chapter provides a complete reference to all FFTW MPI functions,
+datatypes, and constants.  See also <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a> for information
+on functions and types in common with the serial interface.
+
+<ul class="menu">
+<li><a accesskey="1" href="MPI-Files-and-Data-Types.html#MPI-Files-and-Data-Types">MPI Files and Data Types</a>
+<li><a accesskey="2" href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a>
+<li><a accesskey="3" href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a>
+<li><a accesskey="4" href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a>
+<li><a accesskey="5" href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a>
+<li><a accesskey="6" href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication">MPI Wisdom Communication</a>
+</ul>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Transposes.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Transposes.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,76 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Transposes - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms" title="Other Multi-dimensional Real-data MPI Transforms">
+<link rel="next" href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom" title="FFTW MPI Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Transposes"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.7 FFTW MPI Transposes</h3>
+
+<p><a name="index-transpose-396"></a>
+The FFTW's MPI Fourier transforms rely on one or more <em>global
+transposition</em> step for their communications.  For example, the
+multidimensional transforms work by transforming along some
+dimensions, then transposing to make the first dimension local and
+transforming that, then transposing back.  Because global
+transposition of a block-distributed matrix has many other potential
+uses besides FFTs, FFTW's transpose routines can be called directly,
+as documented in this section.
+
+<ul class="menu">
+<li><a accesskey="1" href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a>
+<li><a accesskey="2" href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a>
+<li><a accesskey="3" href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a>
+</ul>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-MPI-Wisdom.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-MPI-Wisdom.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,142 @@
+<html lang="en">
+<head>
+<title>FFTW MPI Wisdom - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link rel="next" href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks" title="Avoiding MPI Deadlocks">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-MPI-Wisdom"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.8 FFTW MPI Wisdom</h3>
+
+<p><a name="index-wisdom-410"></a><a name="index-saving-plans-to-disk-411"></a>
+FFTW's &ldquo;wisdom&rdquo; facility (see <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>) can
+be used to save MPI plans as well as to save uniprocessor plans. 
+However, for MPI there are several unavoidable complications.
+
+   <p><a name="index-MPI-I_002fO-412"></a>First, the MPI standard does not guarantee that every process can
+perform file I/O (at least, not using C stdio routines)&mdash;in general,
+we may only assume that process 0 is capable of I/O.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a> So, if we
+want to export the wisdom from a single process to a file, we must
+first export the wisdom to a string, then send it to process 0, then
+write it to a file.
+
+   <p>Second, in principle we may want to have separate wisdom for every
+process, since in general the processes may run on different hardware
+even for a single MPI program.  However, in practice FFTW's MPI code
+is designed for the case of homogeneous hardware (see <a href="Load-balancing.html#Load-balancing">Load balancing</a>), and in this case it is convenient to use the same wisdom
+for every process.  Thus, we need a mechanism to synchronize the wisdom.
+
+   <p>To address both of these problems, FFTW provides the following two
+functions:
+
+<pre class="example">     void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+     void fftw_mpi_gather_wisdom(MPI_Comm comm);
+</pre>
+   <p><a name="index-fftw_005fmpi_005fgather_005fwisdom-413"></a><a name="index-fftw_005fmpi_005fbroadcast_005fwisdom-414"></a>
+Given a communicator <code>comm</code>, <code>fftw_mpi_broadcast_wisdom</code>
+will broadcast the wisdom from process 0 to all other processes. 
+Conversely, <code>fftw_mpi_gather_wisdom</code> will collect wisdom from all
+processes onto process 0.  (If the plans created for the same problem
+by different processes are not the same, <code>fftw_mpi_gather_wisdom</code>
+will arbitrarily choose one of the plans.)  Both of these functions
+may result in suboptimal plans for different processes if the
+processes are running on non-identical hardware.  Both of these
+functions are <em>collective</em> calls, which means that they must be
+executed by all processes in the communicator. 
+<a name="index-collective-function-415"></a>
+
+   <p>So, for example, a typical code snippet to import wisdom from a file
+and use it on all processes would be:
+
+<pre class="example">     {
+         int rank;
+     
+         fftw_mpi_init();
+         MPI_Comm_rank(MPI_COMM_WORLD, &amp;rank);
+         if (rank == 0) fftw_import_wisdom_from_filename("mywisdom");
+         fftw_mpi_broadcast_wisdom(MPI_COMM_WORLD);
+     }
+</pre>
+   <p>(Note that we must call <code>fftw_mpi_init</code> before importing any
+wisdom that might contain MPI plans.)  Similarly, a typical code
+snippet to export wisdom from all processes to a file is:
+<a name="index-fftw_005fmpi_005finit-416"></a>
+<pre class="example">     {
+         int rank;
+     
+         fftw_mpi_gather_wisdom(MPI_COMM_WORLD);
+         MPI_Comm_rank(MPI_COMM_WORLD, &amp;rank);
+         if (rank == 0) fftw_export_wisdom_to_filename("mywisdom");
+     }
+</pre>
+   <!--  -->
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> In fact,
+even this assumption is not technically guaranteed by the standard,
+although it seems to be universal in actual MPI implementations and is
+widely assumed by MPI-using software.  Technically, you need to query
+the <code>MPI_IO</code> attribute of <code>MPI_COMM_WORLD</code> with
+<code>MPI_Attr_get</code>.  If this attribute is <code>MPI_PROC_NULL</code>, no
+I/O is possible.  If it is <code>MPI_ANY_SOURCE</code>, any process can
+perform I/O.  Otherwise, it is the rank of a process that can perform
+I/O ... but since it is not guaranteed to yield the <em>same</em> rank
+on all processes, you have to do an <code>MPI_Allreduce</code> of some kind
+if you want all processes to agree about which is going to do I/O. 
+And even then, the standard only guarantees that this process can
+perform output, but not input. See e.g. <cite>Parallel Programming
+with MPI</cite> by P. S. Pacheco, section 8.1.3.  Needless to say, in our
+experience virtually no MPI programmers worry about this.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/FFTW-Reference.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/FFTW-Reference.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+<html lang="en">
+<head>
+<title>FFTW Reference - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="next" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="FFTW-Reference"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">4 FFTW Reference</h2>
+
+<p>This chapter provides a complete reference for all sequential (i.e.,
+one-processor) FFTW functions.  Parallel transforms are described in
+later chapters.
+
+<ul class="menu">
+<li><a accesskey="1" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>
+<li><a accesskey="2" href="Using-Plans.html#Using-Plans">Using Plans</a>
+<li><a accesskey="3" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<li><a accesskey="4" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>
+<li><a accesskey="5" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<li><a accesskey="6" href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>
+<li><a accesskey="7" href="Wisdom.html#Wisdom">Wisdom</a>
+<li><a accesskey="8" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Fixed_002dsize-Arrays-in-C.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Fixed_002dsize-Arrays-in-C.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,91 @@
+<html lang="en">
+<head>
+<title>Fixed-size Arrays in C - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="prev" href="Column_002dmajor-Format.html#Column_002dmajor-Format" title="Column-major Format">
+<link rel="next" href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C" title="Dynamic Arrays in C">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Fixed-size-Arrays-in-C"></a>
+<a name="Fixed_002dsize-Arrays-in-C"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Column_002dmajor-Format.html#Column_002dmajor-Format">Column-major Format</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<hr>
+</div>
+
+<h4 class="subsection">3.2.3 Fixed-size Arrays in C</h4>
+
+<p><a name="index-C-multi_002ddimensional-arrays-120"></a>
+A multi-dimensional array whose size is declared at compile time in C
+is <em>already</em> in row-major order.  You don't have to do anything
+special to transform it.  For example:
+
+<pre class="example">     {
+          fftw_complex data[N0][N1][N2];
+          fftw_plan plan;
+          ...
+          plan = fftw_plan_dft_3d(N0, N1, N2, &amp;data[0][0][0], &amp;data[0][0][0],
+                                  FFTW_FORWARD, FFTW_ESTIMATE);
+          ...
+     }
+</pre>
+   <p>This will plan a 3d in-place transform of size <code>N0 x N1 x N2</code>. 
+Notice how we took the address of the zero-th element to pass to the
+planner (we could also have used a typecast).
+
+   <p>However, we tend to <em>discourage</em> users from declaring their
+arrays in this way, for two reasons.  First, this allocates the array
+on the stack (&ldquo;automatic&rdquo; storage), which has a very limited size on
+most operating systems (declaring an array with more than a few
+thousand elements will often cause a crash).  (You can get around this
+limitation on many systems by declaring the array as
+<code>static</code> and/or global, but that has its own drawbacks.) 
+Second, it may not optimally align the array for use with a SIMD
+FFTW (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>).  Instead, we recommend
+using <code>fftw_malloc</code>, as described below.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Forgetting-Wisdom.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Forgetting-Wisdom.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,68 @@
+<html lang="en">
+<head>
+<title>Forgetting Wisdom - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Wisdom.html#Wisdom" title="Wisdom">
+<link rel="prev" href="Wisdom-Import.html#Wisdom-Import" title="Wisdom Import">
+<link rel="next" href="Wisdom-Utilities.html#Wisdom-Utilities" title="Wisdom Utilities">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Forgetting-Wisdom"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Wisdom.html#Wisdom">Wisdom</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.7.3 Forgetting Wisdom</h4>
+
+<pre class="example">     void fftw_forget_wisdom(void);
+</pre>
+   <p><a name="index-fftw_005fforget_005fwisdom-288"></a>
+Calling <code>fftw_forget_wisdom</code> causes all accumulated <code>wisdom</code>
+to be discarded and its associated memory to be freed. (New
+<code>wisdom</code> can still be gathered subsequently, however.)
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Fortran-Examples.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Fortran-Examples.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,155 @@
+<html lang="en">
+<head>
+<title>Fortran Examples - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="prev" href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran" title="FFTW Execution in Fortran">
+<link rel="next" href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f" title="Wisdom of Fortran?">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Fortran-Examples"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">8.4 Fortran Examples</h3>
+
+<p>In C, you might have something like the following to transform a
+one-dimensional complex array:
+
+<pre class="example">             fftw_complex in[N], out[N];
+             fftw_plan plan;
+     
+             plan = fftw_plan_dft_1d(N,in,out,FFTW_FORWARD,FFTW_ESTIMATE);
+             fftw_execute(plan);
+             fftw_destroy_plan(plan);
+</pre>
+   <p>In Fortran, you would use the following to accomplish the same thing:
+
+<pre class="example">             double complex in, out
+             dimension in(N), out(N)
+             integer*8 plan
+     
+             call dfftw_plan_dft_1d(plan,N,in,out,FFTW_FORWARD,FFTW_ESTIMATE)
+             call dfftw_execute_dft(plan, in, out)
+             call dfftw_destroy_plan(plan)
+</pre>
+   <p><a name="index-dfftw_005fplan_005fdft_005f1d-587"></a><a name="index-dfftw_005fexecute_005fdft-588"></a><a name="index-dfftw_005fdestroy_005fplan-589"></a>
+Notice how all routines are called as Fortran subroutines, and the
+plan is returned via the first argument to <code>dfftw_plan_dft_1d</code>. 
+Notice also that we changed <code>fftw_execute</code> to
+<code>dfftw_execute_dft</code> (see <a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a>).  To do
+the same thing, but using 8 threads in parallel (see <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>), you would simply prefix these calls with:
+
+<pre class="example">             integer iret
+             call dfftw_init_threads(iret)
+             call dfftw_plan_with_nthreads(8)
+</pre>
+   <p><a name="index-dfftw_005finit_005fthreads-590"></a><a name="index-dfftw_005fplan_005fwith_005fnthreads-591"></a>
+(You might want to check the value of <code>iret</code>: if it is zero, it
+indicates an unlikely error during thread initialization.)
+
+   <p>To transform a three-dimensional array in-place with C, you might do:
+
+<pre class="example">             fftw_complex arr[L][M][N];
+             fftw_plan plan;
+     
+             plan = fftw_plan_dft_3d(L,M,N, arr,arr,
+                                     FFTW_FORWARD, FFTW_ESTIMATE);
+             fftw_execute(plan);
+             fftw_destroy_plan(plan);
+</pre>
+   <p>In Fortran, you would use this instead:
+
+<pre class="example">             double complex arr
+             dimension arr(L,M,N)
+             integer*8 plan
+     
+             call dfftw_plan_dft_3d(plan, L,M,N, arr,arr,
+            &amp;                       FFTW_FORWARD, FFTW_ESTIMATE)
+             call dfftw_execute_dft(plan, arr, arr)
+             call dfftw_destroy_plan(plan)
+</pre>
+   <p><a name="index-dfftw_005fplan_005fdft_005f3d-592"></a>
+Note that we pass the array dimensions in the &ldquo;natural&rdquo; order in both C
+and Fortran.
+
+   <p>To transform a one-dimensional real array in Fortran, you might do:
+
+<pre class="example">             double precision in
+             dimension in(N)
+             double complex out
+             dimension out(N/2 + 1)
+             integer*8 plan
+     
+             call dfftw_plan_dft_r2c_1d(plan,N,in,out,FFTW_ESTIMATE)
+             call dfftw_execute_dft_r2c(plan, in, out)
+             call dfftw_destroy_plan(plan)
+</pre>
+   <p><a name="index-dfftw_005fplan_005fdft_005fr2c_005f1d-593"></a><a name="index-dfftw_005fexecute_005fdft_005fr2c-594"></a>
+To transform a two-dimensional real array, out of place, you might use
+the following:
+
+<pre class="example">             double precision in
+             dimension in(M,N)
+             double complex out
+             dimension out(M/2 + 1, N)
+             integer*8 plan
+     
+             call dfftw_plan_dft_r2c_2d(plan,M,N,in,out,FFTW_ESTIMATE)
+             call dfftw_execute_dft_r2c(plan, in, out)
+             call dfftw_destroy_plan(plan)
+</pre>
+   <p><a name="index-dfftw_005fplan_005fdft_005fr2c_005f2d-595"></a>
+<strong>Important:</strong> Notice that it is the <em>first</em> dimension of the
+complex output array that is cut in half in Fortran, rather than the
+last dimension as in C.  This is a consequence of the interface routines
+reversing the order of the array dimensions passed to FFTW so that the
+Fortran program can use its ordinary column-major order. 
+<a name="index-column_002dmajor-596"></a><a name="index-r2c_002fc2r-multi_002ddimensional-array-format-597"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Fortran_002dinterface-routines.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Fortran_002dinterface-routines.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,141 @@
+<html lang="en">
+<head>
+<title>Fortran-interface routines - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="prev" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="next" href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran" title="FFTW Constants in Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Fortran-interface-routines"></a>
+<a name="Fortran_002dinterface-routines"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran">FFTW Constants in Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">8.1 Fortran-interface routines</h3>
+
+<p>Nearly all of the FFTW functions have Fortran-callable equivalents. 
+The name of the legacy Fortran routine is the same as that of the
+corresponding C routine, but with the &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo; prefix replaced by
+&lsquo;<samp><span class="samp">dfftw_</span></samp>&rsquo;.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>  The single and long-double precision
+versions use &lsquo;<samp><span class="samp">sfftw_</span></samp>&rsquo; and &lsquo;<samp><span class="samp">lfftw_</span></samp>&rsquo;, respectively, instead of
+&lsquo;<samp><span class="samp">fftwf_</span></samp>&rsquo; and &lsquo;<samp><span class="samp">fftwl_</span></samp>&rsquo;; quadruple precision (<code>real*16</code>)
+is available on some systems as &lsquo;<samp><span class="samp">fftwq_</span></samp>&rsquo; (see <a href="Precision.html#Precision">Precision</a>). 
+(Note that <code>long double</code> on x86 hardware is usually at most
+80-bit extended precision, <em>not</em> quadruple precision.)
+
+   <p>For the most part, all of the arguments to the functions are the same,
+with the following exceptions:
+
+     <ul>
+<li><code>plan</code> variables (what would be of type <code>fftw_plan</code> in C),
+must be declared as a type that is at least as big as a pointer
+(address) on your machine.  We recommend using <code>integer*8</code> everywhere,
+since this should always be big enough. 
+<a name="index-portability-578"></a>
+<li>Any function that returns a value (e.g. <code>fftw_plan_dft</code>) is
+converted into a <em>subroutine</em>.  The return value is converted into
+an additional <em>first</em> parameter of this subroutine.<a rel="footnote" href="#fn-2" name="fnd-2"><sup>2</sup></a>
+
+     <li><a name="index-column_002dmajor-579"></a>The Fortran routines expect multi-dimensional arrays to be in
+<em>column-major</em> order, which is the ordinary format of Fortran
+arrays (see <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>).  They do this
+transparently and costlessly simply by reversing the order of the
+dimensions passed to FFTW, but this has one important consequence for
+multi-dimensional real-complex transforms, discussed below.
+
+     <li>Wisdom import and export is somewhat more tricky because one cannot
+easily pass files or strings between C and Fortran; see <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a>.
+
+     <li>Legacy Fortran cannot use the <code>fftw_malloc</code> dynamic-allocation routine. 
+If you want to exploit the SIMD FFTW (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>), you'll
+need to figure out some other way to ensure that your arrays are at
+least 16-byte aligned.
+
+     <li><a name="index-fftw_005fiodim-580"></a><a name="index-guru-interface-581"></a>Since Fortran 77 does not have data structures, the <code>fftw_iodim</code>
+structure from the guru interface (see <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>) must be split into separate arguments.  In particular, any
+<code>fftw_iodim</code> array arguments in the C guru interface become three
+integer array arguments (<code>n</code>, <code>is</code>, and <code>os</code>) in the
+Fortran guru interface, all of whose lengths should be equal to the
+corresponding <code>rank</code> argument.
+
+     <li>The guru planner interface in Fortran does <em>not</em> do any automatic
+translation between column-major and row-major; you are responsible
+for setting the strides etcetera to correspond to your Fortran arrays. 
+However, as a slight bug that we are preserving for backwards
+compatibility, the &lsquo;<samp><span class="samp">plan_guru_r2r</span></samp>&rsquo; in Fortran <em>does</em> reverse the
+order of its <code>kind</code> array parameter, so the <code>kind</code> array
+of that routine should be in the reverse of the order of the iodim
+arrays (see above).
+
+   </ul>
+
+   <p>In general, you should take care to use Fortran data types that
+correspond to (i.e. are the same size as) the C types used by FFTW. 
+In practice, this correspondence is usually straightforward
+(i.e. <code>integer</code> corresponds to <code>int</code>, <code>real</code>
+corresponds to <code>float</code>, etcetera).  The native Fortran
+double/single-precision complex type should be compatible with
+<code>fftw_complex</code>/<code>fftwf_complex</code>.  Such simple correspondences
+are assumed in the examples below. 
+<a name="index-portability-582"></a>
+<!--  -->
+
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> Technically, Fortran 77 identifiers are not
+allowed to have more than 6 characters, nor may they contain
+underscores.  Any compiler that enforces this limitation doesn't
+deserve to link to FFTW.</p>
+
+   <p class="footnote"><small>[<a name="fn-2" href="#fnd-2">2</a>]</small> The
+reason for this is that some Fortran implementations seem to have
+trouble with C function return values, and vice versa.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Generating-your-own-code.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Generating-your-own-code.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,95 @@
+<html lang="en">
+<head>
+<title>Generating your own code - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="prev" href="Cycle-Counters.html#Cycle-Counters" title="Cycle Counters">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Generating-your-own-code"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>
+<hr>
+</div>
+
+<h3 class="section">10.4 Generating your own code</h3>
+
+<p><a name="index-code-generator-622"></a>
+The directory <code>genfft</code> contains the programs that were used to
+generate FFTW's &ldquo;codelets,&rdquo; which are hard-coded transforms of small
+sizes. 
+<a name="index-codelet-623"></a>We do not expect casual users to employ the generator, which is a rather
+sophisticated program that generates directed acyclic graphs of FFT
+algorithms and performs algebraic simplifications on them.  It was
+written in Objective Caml, a dialect of ML, which is available at
+<a href="http://caml.inria.fr/ocaml/index.en.html">http://caml.inria.fr/ocaml/index.en.html</a>. 
+<a name="index-Caml-624"></a>
+
+   <p>If you have Objective Caml installed (along with recent versions of
+GNU <code>autoconf</code>, <code>automake</code>, and <code>libtool</code>), then you
+can change the set of codelets that are generated or play with the
+generation options.  The set of generated codelets is specified by the
+<code>{dft,rdft}/{codelets,simd}/*/Makefile.am</code> files.  For example, you can add
+efficient REDFT codelets of small sizes by modifying
+<code>rdft/codelets/r2r/Makefile.am</code>. 
+<a name="index-REDFT-625"></a>After you modify any <code>Makefile.am</code> files, you can type <code>sh
+bootstrap.sh</code> in the top-level directory followed by <code>make</code> to
+re-generate the files.
+
+   <p>We do not provide more details about the code-generation process, since
+we do not expect that most users will need to generate their own code. 
+However, feel free to contact us at <a href="mailto:fftw@fftw.org">fftw@fftw.org</a> if
+you are interested in the subject.
+
+   <p><a name="index-monadic-programming-626"></a>You might find it interesting to learn Caml and/or some modern
+programming techniques that we used in the generator (including monadic
+programming), especially if you heard the rumor that Java and
+object-oriented programming are the latest advancement in the field. 
+The internal operation of the codelet generator is described in the
+paper, &ldquo;A Fast Fourier Transform Compiler,&rdquo; by M. Frigo, which is
+available from the <a href="http://www.fftw.org">FFTW home page</a> and also
+appeared in the <cite>Proceedings of the 1999 ACM SIGPLAN Conference on
+Programming Language Design and Implementation (PLDI)</cite>.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Guru-Complex-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Guru-Complex-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,124 @@
+<html lang="en">
+<head>
+<title>Guru Complex DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes" title="Guru vector and transform sizes">
+<link rel="next" href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs" title="Guru Real-data DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Guru-Complex-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.3 Guru Complex DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_guru_dft(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          fftw_complex *in, fftw_complex *out,
+          int sign, unsigned flags);
+     
+     fftw_plan fftw_plan_guru_split_dft(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *ri, double *ii, double *ro, double *io,
+          unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fguru_005fdft-248"></a><a name="index-fftw_005fplan_005fguru_005fsplit_005fdft-249"></a>
+These two functions plan a complex-data, multi-dimensional DFT
+for the interleaved and split format, respectively. 
+Transform dimensions are given by (<code>rank</code>, <code>dims</code>) over a
+multi-dimensional vector (loop) of dimensions (<code>howmany_rank</code>,
+<code>howmany_dims</code>).  <code>dims</code> and <code>howmany_dims</code> should point
+to <code>fftw_iodim</code> arrays of length <code>rank</code> and
+<code>howmany_rank</code>, respectively.
+
+   <p><a name="index-flags-250"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+   <p>In the <code>fftw_plan_guru_dft</code> function, the pointers <code>in</code> and
+<code>out</code> point to the interleaved input and output arrays,
+respectively.  The sign can be either -1 (=
+<code>FFTW_FORWARD</code>) or +1 (= <code>FFTW_BACKWARD</code>).  If the
+pointers are equal, the transform is in-place.
+
+   <p>In the <code>fftw_plan_guru_split_dft</code> function,
+<code>ri</code> and <code>ii</code> point to the real and imaginary input arrays,
+and <code>ro</code> and <code>io</code> point to the real and imaginary output
+arrays.  The input and output pointers may be the same, indicating an
+in-place transform.  For example, for <code>fftw_complex</code> pointers
+<code>in</code> and <code>out</code>, the corresponding parameters are:
+
+<pre class="example">     ri = (double *) in;
+     ii = (double *) in + 1;
+     ro = (double *) out;
+     io = (double *) out + 1;
+</pre>
+   <p>Because <code>fftw_plan_guru_split_dft</code> accepts split arrays, strides
+are expressed in units of <code>double</code>.  For a contiguous
+<code>fftw_complex</code> array, the overall stride of the transform should
+be 2, the distance between consecutive real parts or between
+consecutive imaginary parts; see <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>.  Note that the dimension strides are applied equally to the
+real and imaginary parts; real and imaginary arrays with different
+strides are not supported.
+
+   <p>There is no <code>sign</code> parameter in <code>fftw_plan_guru_split_dft</code>. 
+This function always plans for an <code>FFTW_FORWARD</code> transform.  To
+plan for an <code>FFTW_BACKWARD</code> transform, you can exploit the
+identity that the backwards DFT is equal to the forwards DFT with the
+real and imaginary parts swapped.  For example, in the case of the
+<code>fftw_complex</code> arrays above, the <code>FFTW_BACKWARD</code> transform
+is computed by the parameters:
+
+<pre class="example">     ri = (double *) in + 1;
+     ii = (double *) in;
+     ro = (double *) out + 1;
+     io = (double *) out;
+</pre>
+   <!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Guru-Interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Guru-Interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,83 @@
+<html lang="en">
+<head>
+<title>Guru Interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Advanced-Interface.html#Advanced-Interface" title="Advanced Interface">
+<link rel="next" href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions" title="New-array Execute Functions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Guru-Interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.5 Guru Interface</h3>
+
+<p><a name="index-guru-interface-239"></a>
+The &ldquo;guru&rdquo; interface to FFTW is intended to expose as much as possible
+of the flexibility in the underlying FFTW architecture.  It allows one
+to compute multi-dimensional &ldquo;vectors&rdquo; (loops) of multi-dimensional
+transforms, where each vector/transform dimension has an independent
+size and stride. 
+<a name="index-vector-240"></a>One can also use more general complex-number formats, e.g. separate real
+and imaginary arrays.
+
+   <p>For those users who require the flexibility of the guru interface, it is
+important that they pay special attention to the documentation lest they
+shoot themselves in the foot.
+
+<ul class="menu">
+<li><a accesskey="1" href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays">Interleaved and split arrays</a>
+<li><a accesskey="2" href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>
+<li><a accesskey="3" href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a>
+<li><a accesskey="4" href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a>
+<li><a accesskey="5" href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">Guru Real-to-real Transforms</a>
+<li><a accesskey="6" href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Guru-Real_002ddata-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Guru-Real_002ddata-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,125 @@
+<html lang="en">
+<head>
+<title>Guru Real-data DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Guru-Complex-DFTs.html#Guru-Complex-DFTs" title="Guru Complex DFTs">
+<link rel="next" href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms" title="Guru Real-to-real Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Guru-Real-data-DFTs"></a>
+<a name="Guru-Real_002ddata-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">Guru Real-to-real Transforms</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.4 Guru Real-data DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_guru_dft_r2c(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *in, fftw_complex *out,
+          unsigned flags);
+     
+     fftw_plan fftw_plan_guru_split_dft_r2c(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *in, double *ro, double *io,
+          unsigned flags);
+     
+     fftw_plan fftw_plan_guru_dft_c2r(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          fftw_complex *in, double *out,
+          unsigned flags);
+     
+     fftw_plan fftw_plan_guru_split_dft_c2r(
+          int rank, const fftw_iodim *dims,
+          int howmany_rank, const fftw_iodim *howmany_dims,
+          double *ri, double *ii, double *out,
+          unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fguru_005fdft_005fr2c-251"></a><a name="index-fftw_005fplan_005fguru_005fsplit_005fdft_005fr2c-252"></a><a name="index-fftw_005fplan_005fguru_005fdft_005fc2r-253"></a><a name="index-fftw_005fplan_005fguru_005fsplit_005fdft_005fc2r-254"></a>
+Plan a real-input (r2c) or real-output (c2r), multi-dimensional DFT with
+transform dimensions given by (<code>rank</code>, <code>dims</code>) over a
+multi-dimensional vector (loop) of dimensions (<code>howmany_rank</code>,
+<code>howmany_dims</code>).  <code>dims</code> and <code>howmany_dims</code> should point
+to <code>fftw_iodim</code> arrays of length <code>rank</code> and
+<code>howmany_rank</code>, respectively.  As for the basic and advanced
+interfaces, an r2c transform is <code>FFTW_FORWARD</code> and a c2r transform
+is <code>FFTW_BACKWARD</code>.
+
+   <p>The <em>last</em> dimension of <code>dims</code> is interpreted specially:
+that dimension of the real array has size <code>dims[rank-1].n</code>, but
+that dimension of the complex array has size <code>dims[rank-1].n/2+1</code>
+(division rounded down).  The strides, on the other hand, are taken to
+be exactly as specified.  It is up to the user to specify the strides
+appropriately for the peculiar dimensions of the data, and we do not
+guarantee that the planner will succeed (return non-<code>NULL</code>) for
+any dimensions other than those described in <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a> and generalized in <a href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a>.  (That is,
+for an in-place transform, each individual dimension should be able to
+operate in place.) 
+<a name="index-in_002dplace-255"></a>
+
+   <p><code>in</code> and <code>out</code> point to the input and output arrays for r2c
+and c2r transforms, respectively.  For split arrays, <code>ri</code> and
+<code>ii</code> point to the real and imaginary input arrays for a c2r
+transform, and <code>ro</code> and <code>io</code> point to the real and imaginary
+output arrays for an r2c transform.  <code>in</code> and <code>ro</code> or
+<code>ri</code> and <code>out</code> may be the same, indicating an in-place
+transform.   (In-place transforms where <code>in</code> and <code>io</code> or
+<code>ii</code> and <code>out</code> are the same are not currently supported.)
+
+   <p><a name="index-flags-256"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+   <p>In-place transforms of rank greater than 1 are currently only
+supported for interleaved arrays.  For split arrays, the planner will
+return <code>NULL</code>. 
+<a name="index-in_002dplace-257"></a>
+<!-- =========> -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Guru-Real_002dto_002dreal-Transforms.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Guru-Real_002dto_002dreal-Transforms.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,87 @@
+<html lang="en">
+<head>
+<title>Guru Real-to-real Transforms - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs" title="Guru Real-data DFTs">
+<link rel="next" href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface" title="64-bit Guru Interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Guru-Real-to-real-Transforms"></a>
+<a name="Guru-Real_002dto_002dreal-Transforms"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.5 Guru Real-to-real Transforms</h4>
+
+<pre class="example">     fftw_plan fftw_plan_guru_r2r(int rank, const fftw_iodim *dims,
+                                  int howmany_rank,
+                                  const fftw_iodim *howmany_dims,
+                                  double *in, double *out,
+                                  const fftw_r2r_kind *kind,
+                                  unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fguru_005fr2r-258"></a>
+Plan a real-to-real (r2r) multi-dimensional <code>FFTW_FORWARD</code>
+transform with transform dimensions given by (<code>rank</code>, <code>dims</code>)
+over a multi-dimensional vector (loop) of dimensions
+(<code>howmany_rank</code>, <code>howmany_dims</code>).  <code>dims</code> and
+<code>howmany_dims</code> should point to <code>fftw_iodim</code> arrays of length
+<code>rank</code> and <code>howmany_rank</code>, respectively.
+
+   <p>The transform kind of each dimension is given by the <code>kind</code>
+parameter, which should point to an array of length <code>rank</code>.  Valid
+<code>fftw_r2r_kind</code> constants are given in <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>.
+
+   <p><code>in</code> and <code>out</code> point to the real input and output arrays; they
+may be the same, indicating an in-place transform.
+
+   <p><a name="index-flags-259"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Guru-vector-and-transform-sizes.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Guru-vector-and-transform-sizes.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,116 @@
+<html lang="en">
+<head>
+<title>Guru vector and transform sizes - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays" title="Interleaved and split arrays">
+<link rel="next" href="Guru-Complex-DFTs.html#Guru-Complex-DFTs" title="Guru Complex DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Guru-vector-and-transform-sizes"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays">Interleaved and split arrays</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.2 Guru vector and transform sizes</h4>
+
+<p>The guru interface introduces one basic new data structure,
+<code>fftw_iodim</code>, that is used to specify sizes and strides for
+multi-dimensional transforms and vectors:
+
+<pre class="example">     typedef struct {
+          int n;
+          int is;
+          int os;
+     } fftw_iodim;
+</pre>
+   <p><a name="index-fftw_005fiodim-243"></a>
+Here, <code>n</code> is the size of the dimension, and <code>is</code> and <code>os</code>
+are the strides of that dimension for the input and output arrays.  (The
+stride is the separation of consecutive elements along this dimension.)
+
+   <p>The meaning of the stride parameter depends on the type of the array
+that the stride refers to.  <em>If the array is interleaved complex,
+strides are expressed in units of complex numbers
+(</em><code>fftw_complex</code><em>).  If the array is split complex or real, strides
+are expressed in units of real numbers (</em><code>double</code><em>).</em>  This
+convention is consistent with the usual pointer arithmetic in the C
+language.  An interleaved array is denoted by a pointer <code>p</code> to
+<code>fftw_complex</code>, so that <code>p+1</code> points to the next complex
+number.  Split arrays are denoted by pointers to <code>double</code>, in
+which case pointer arithmetic operates in units of
+<code>sizeof(double)</code>. 
+<a name="index-stride-244"></a>
+
+   <p>The guru planner interfaces all take a (<code>rank</code>, <code>dims[rank]</code>)
+pair describing the transform size, and a (<code>howmany_rank</code>,
+<code>howmany_dims[howmany_rank]</code>) pair describing the &ldquo;vector&rdquo; size (a
+multi-dimensional loop of transforms to perform), where <code>dims</code> and
+<code>howmany_dims</code> are arrays of <code>fftw_iodim</code>.
+
+   <p>For example, the <code>howmany</code> parameter in the advanced complex-DFT
+interface corresponds to <code>howmany_rank</code> = 1,
+<code>howmany_dims[0].n</code> = <code>howmany</code>, <code>howmany_dims[0].is</code> =
+<code>idist</code>, and <code>howmany_dims[0].os</code> = <code>odist</code>. 
+<a name="index-howmany-loop-245"></a><a name="index-dist-246"></a>(To compute a single transform, you can just use <code>howmany_rank</code> = 0.)
+
+   <p>A row-major multidimensional array with dimensions <code>n[rank]</code>
+(see <a href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a>) corresponds to <code>dims[i].n</code> =
+<code>n[i]</code> and the recurrence <code>dims[i].is</code> = <code>n[i+1] *
+dims[i+1].is</code> (similarly for <code>os</code>).  The stride of the last
+(<code>i=rank-1</code>) dimension is the overall stride of the array. 
+e.g. to be equivalent to the advanced complex-DFT interface, you would
+have <code>dims[rank-1].is</code> = <code>istride</code> and
+<code>dims[rank-1].os</code> = <code>ostride</code>. 
+<a name="index-row_002dmajor-247"></a>
+
+   <p>In general, we only guarantee FFTW to return a non-<code>NULL</code> plan if
+the vector and transform dimensions correspond to a set of distinct
+indices, and for in-place transforms the input/output strides should
+be the same.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/How-Many-Threads-to-Use_003f.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/How-Many-Threads-to-Use_003f.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+<html lang="en">
+<head>
+<title>How Many Threads to Use? - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="prev" href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW" title="Usage of Multi-threaded FFTW">
+<link rel="next" href="Thread-safety.html#Thread-safety" title="Thread safety">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="How-Many-Threads-to-Use%3f"></a>
+<a name="How-Many-Threads-to-Use_003f"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Thread-safety.html#Thread-safety">Thread safety</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>
+<hr>
+</div>
+
+<h3 class="section">5.3 How Many Threads to Use?</h3>
+
+<p><a name="index-number-of-threads-339"></a>There is a fair amount of overhead involved in synchronizing threads,
+so the optimal number of threads to use depends upon the size of the
+transform as well as on the number of processors you have.
+
+   <p>As a general rule, you don't want to use more threads than you have
+processors.  (Using more threads will work, but there will be extra
+overhead with no benefit.)  In fact, if the problem size is too small,
+you may want to use fewer threads than you have processors.
+
+   <p>You will have to experiment with your system to see what level of
+parallelization is best for your problem size.  Typically, the problem
+will have to involve at least a few thousand data points before threads
+become beneficial.  If you plan with <code>FFTW_PATIENT</code>, it will
+automatically disable threads for sizes that don't benefit from
+parallelization. 
+<a name="index-FFTW_005fPATIENT-340"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Installation-and-Customization.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Installation-and-Customization.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,95 @@
+<html lang="en">
+<head>
+<title>Installation and Customization - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2" title="Upgrading from FFTW version 2">
+<link rel="next" href="Acknowledgments.html#Acknowledgments" title="Acknowledgments">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Installation-and-Customization"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Acknowledgments.html#Acknowledgments">Acknowledgments</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2">Upgrading from FFTW version 2</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">10 Installation and Customization</h2>
+
+<p><a name="index-installation-603"></a>
+This chapter describes the installation and customization of FFTW, the
+latest version of which may be downloaded from
+<a href="http://www.fftw.org">the FFTW home page</a>.
+
+   <p>In principle, FFTW should work on any system with an ANSI C compiler
+(<code>gcc</code> is fine).  However, planner time is drastically reduced if
+FFTW can exploit a hardware cycle counter; FFTW comes with cycle-counter
+support for all modern general-purpose CPUs, but you may need to add a
+couple of lines of code if your compiler is not yet supported
+(see <a href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a>).  (On Unix, there will be a warning at the end
+of the <code>configure</code> output if no cycle counter is found.) 
+<a name="index-cycle-counter-604"></a><a name="index-compiler-605"></a><a name="index-portability-606"></a>
+
+   <p>Installation of FFTW is simplest if you have a Unix or a GNU system,
+such as GNU/Linux, and we describe this case in the first section below,
+including the use of special configuration options to e.g. install
+different precisions or exploit optimizations for particular
+architectures (e.g. SIMD).  Compilation on non-Unix systems is a more
+manual process, but we outline the procedure in the second section.  It
+is also likely that pre-compiled binaries will be available for popular
+systems.
+
+   <p>Finally, we describe how you can customize FFTW for particular needs by
+generating <em>codelets</em> for fast transforms of sizes not supported
+efficiently by the standard FFTW distribution. 
+<a name="index-codelet-607"></a>
+
+<ul class="menu">
+<li><a accesskey="1" href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a>
+<li><a accesskey="2" href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems">Installation on non-Unix systems</a>
+<li><a accesskey="3" href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a>
+<li><a accesskey="4" href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Installation-and-Supported-Hardware_002fSoftware.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Installation-and-Supported-Hardware_002fSoftware.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+<html lang="en">
+<head>
+<title>Installation and Supported Hardware/Software - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="prev" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="next" href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW" title="Usage of Multi-threaded FFTW">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Installation-and-Supported-Hardware%2fSoftware"></a>
+<a name="Installation-and-Supported-Hardware_002fSoftware"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>
+<hr>
+</div>
+
+<h3 class="section">5.1 Installation and Supported Hardware/Software</h3>
+
+<p>All of the FFTW threads code is located in the <code>threads</code>
+subdirectory of the FFTW package.  On Unix systems, the FFTW threads
+libraries and header files can be automatically configured, compiled,
+and installed along with the uniprocessor FFTW libraries simply by
+including <code>--enable-threads</code> in the flags to the <code>configure</code>
+script (see <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a>), or <code>--enable-openmp</code> to use
+<a href="http://www.openmp.org">OpenMP</a> threads. 
+<a name="index-configure-329"></a>
+
+   <p><a name="index-portability-330"></a><a name="index-OpenMP-331"></a>The threads routines require your operating system to have some sort
+of shared-memory threads support.  Specifically, the FFTW threads
+package works with POSIX threads (available on most Unix variants,
+from GNU/Linux to MacOS X) and Win32 threads.  OpenMP threads, which
+are supported in many common compilers (e.g. gcc) are also supported,
+and may give better performance on some systems.  (OpenMP threads are
+also useful if you are employing OpenMP in your own code, in order to
+minimize conflicts between threading models.)  If you have a
+shared-memory machine that uses a different threads API, it should be
+a simple matter of programming to include support for it; see the file
+<code>threads/threads.c</code> for more detail.
+
+   <p>You can compile FFTW with <em>both</em> <code>--enable-threads</code> and
+<code>--enable-openmp</code> at the same time, since they install libraries
+with different names (&lsquo;<samp><span class="samp">fftw3_threads</span></samp>&rsquo; and &lsquo;<samp><span class="samp">fftw3_omp</span></samp>&rsquo;, as
+described below).  However, your programs may only link to <em>one</em>
+of these two libraries at a time.
+
+   <p>Ideally, of course, you should also have multiple processors in order to
+get any benefit from the threaded transforms.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Installation-on-Unix.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Installation-on-Unix.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,218 @@
+<html lang="en">
+<head>
+<title>Installation on Unix - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="prev" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="next" href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems" title="Installation on non-Unix systems">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Installation-on-Unix"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems">Installation on non-Unix systems</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>
+<hr>
+</div>
+
+<h3 class="section">10.1 Installation on Unix</h3>
+
+<p>FFTW comes with a <code>configure</code> program in the GNU style. 
+Installation can be as simple as:
+<a name="index-configure-608"></a>
+<pre class="example">     ./configure
+     make
+     make install
+</pre>
+   <p>This will build the uniprocessor complex and real transform libraries
+along with the test programs.  (We recommend that you use GNU
+<code>make</code> if it is available; on some systems it is called
+<code>gmake</code>.)  The &ldquo;<code>make install</code>&rdquo; command installs the fftw
+and rfftw libraries in standard places, and typically requires root
+privileges (unless you specify a different install directory with the
+<code>--prefix</code> flag to <code>configure</code>).  You can also type
+&ldquo;<code>make check</code>&rdquo; to put the FFTW test programs through their paces. 
+If you have problems during configuration or compilation, you may want
+to run &ldquo;<code>make distclean</code>&rdquo; before trying again; this ensures that
+you don't have any stale files left over from previous compilation
+attempts.
+
+   <p>The <code>configure</code> script chooses the <code>gcc</code> compiler by default,
+if it is available; you can select some other compiler with:
+<pre class="example">     ./configure CC="<i>&lt;the name of your C compiler&gt;</i>"
+</pre>
+   <p>The <code>configure</code> script knows good <code>CFLAGS</code> (C compiler flags)
+<a name="index-compiler-flags-609"></a>for a few systems.  If your system is not known, the <code>configure</code>
+script will print out a warning.  In this case, you should re-configure
+FFTW with the command
+<pre class="example">     ./configure CFLAGS="<i>&lt;write your CFLAGS here&gt;</i>"
+</pre>
+   <p>and then compile as usual.  If you do find an optimal set of
+<code>CFLAGS</code> for your system, please let us know what they are (along
+with the output of <code>config.guess</code>) so that we can include them in
+future releases.
+
+   <p><code>configure</code> supports all the standard flags defined by the GNU
+Coding Standards; see the <code>INSTALL</code> file in FFTW or
+<a href="http://www.gnu.org/prep/standards/html_node/index.html">the GNU web page</a>. 
+Note especially <code>--help</code> to list all flags and
+<code>--enable-shared</code> to create shared, rather than static, libraries. 
+<code>configure</code> also accepts a few FFTW-specific flags, particularly:
+
+     <ul>
+<li><a name="index-precision-610"></a><code>--enable-float</code>: Produces a single-precision version of FFTW
+(<code>float</code>) instead of the default double-precision (<code>double</code>). 
+See <a href="Precision.html#Precision">Precision</a>.
+
+     <li><a name="index-precision-611"></a><code>--enable-long-double</code>: Produces a long-double precision version of
+FFTW (<code>long double</code>) instead of the default double-precision
+(<code>double</code>).  The <code>configure</code> script will halt with an error
+message if <code>long double</code> is the same size as <code>double</code> on your
+machine/compiler.  See <a href="Precision.html#Precision">Precision</a>.
+
+     <li><a name="index-precision-612"></a><code>--enable-quad-precision</code>: Produces a quadruple-precision version
+of FFTW using the nonstandard <code>__float128</code> type provided by
+<code>gcc</code> 4.6 or later on x86, x86-64, and Itanium architectures,
+instead of the default double-precision (<code>double</code>).  The
+<code>configure</code> script will halt with an error message if the
+compiler is not <code>gcc</code> version 4.6 or later or if <code>gcc</code>'s
+<code>libquadmath</code> library is not installed.  See <a href="Precision.html#Precision">Precision</a>.
+
+     <li><a name="index-threads-613"></a><code>--enable-threads</code>: Enables compilation and installation of the
+FFTW threads library (see <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>), which provides a
+simple interface to parallel transforms for SMP systems.  By default,
+the threads routines are not compiled.
+
+     <li><code>--enable-openmp</code>: Like <code>--enable-threads</code>, but using OpenMP
+compiler directives in order to induce parallelism rather than
+spawning its own threads directly, and installing an &lsquo;<samp><span class="samp">fftw3_omp</span></samp>&rsquo; library
+rather than an &lsquo;<samp><span class="samp">fftw3_threads</span></samp>&rsquo; library (see <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>).  You can use both <code>--enable-openmp</code> and <code>--enable-threads</code>
+since they compile/install libraries with different names.  By default,
+the OpenMP routines are not compiled.
+
+     <li><code>--with-combined-threads</code>: By default, if <code>--enable-threads</code>
+is used, the threads support is compiled into a separate library that
+must be linked in addition to the main FFTW library.  This is so that
+users of the serial library do not need to link the system threads
+libraries.  If <code>--with-combined-threads</code> is specified, however,
+then no separate threads library is created, and threads are included
+in the main FFTW library.  This is mainly useful under Windows, where
+no system threads library is required and inter-library dependencies
+are problematic.
+
+     <li><a name="index-MPI-614"></a><code>--enable-mpi</code>: Enables compilation and installation of the FFTW
+MPI library (see <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>), which provides
+parallel transforms for distributed-memory systems with MPI.  (By
+default, the MPI routines are not compiled.)  See <a href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">FFTW MPI Installation</a>.
+
+     <li><a name="index-Fortran_002dcallable-wrappers-615"></a><code>--disable-fortran</code>: Disables inclusion of legacy-Fortran
+wrapper routines (see <a href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>) in the standard
+FFTW libraries.  These wrapper routines increase the library size by
+only a negligible amount, so they are included by default as long as
+the <code>configure</code> script finds a Fortran compiler on your system. 
+(To specify a particular Fortran compiler <i>foo</i>, pass
+<code>F77=</code><i>foo</i> to <code>configure</code>.)
+
+     <li><code>--with-g77-wrappers</code>: By default, when Fortran wrappers are
+included, the wrappers employ the linking conventions of the Fortran
+compiler detected by the <code>configure</code> script.  If this compiler is
+GNU <code>g77</code>, however, then <em>two</em> versions of the wrappers are
+included: one with <code>g77</code>'s idiosyncratic convention of appending
+two underscores to identifiers, and one with the more common
+convention of appending only a single underscore.  This way, the same
+FFTW library will work with both <code>g77</code> and other Fortran
+compilers, such as GNU <code>gfortran</code>.  However, the converse is not
+true: if you configure with a different compiler, then the
+<code>g77</code>-compatible wrappers are not included.  By specifying
+<code>--with-g77-wrappers</code>, the <code>g77</code>-compatible wrappers are
+included in addition to wrappers for whatever Fortran compiler
+<code>configure</code> finds. 
+<a name="index-g77-616"></a>
+<li><code>--with-slow-timer</code>: Disables the use of hardware cycle counters,
+and falls back on <code>gettimeofday</code> or <code>clock</code>.  This greatly
+worsens performance, and should generally not be used (unless you don't
+have a cycle counter but still really want an optimized plan regardless
+of the time).  See <a href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a>.
+
+     <li><code>--enable-sse</code>, <code>--enable-sse2</code>, <code>--enable-avx</code>,
+<code>--enable-altivec</code>, <code>--enable-neon</code>: Enable the compilation of
+SIMD code for SSE (Pentium III+), SSE2 (Pentium IV+), AVX (Sandy Bridge,
+Interlagos), AltiVec (PowerPC G4+), NEON (some ARM processors).  SSE,
+AltiVec, and NEON only work with <code>--enable-float</code> (above).  SSE2
+works in both single and double precision (and is simply SSE in single
+precision).  The resulting code will <em>still work</em> on earlier CPUs
+lacking the SIMD extensions (SIMD is automatically disabled, although
+the FFTW library is still larger).
+          <ul>
+<li>These options require a compiler supporting SIMD extensions, and
+compiler support is always a bit flaky: see the FFTW FAQ for a list of
+compiler versions that have problems compiling FFTW. 
+<li>With AltiVec and <code>gcc</code>, you may have to use the
+<code>-mabi=altivec</code> option when compiling any code that links to FFTW,
+in order to properly align the stack; otherwise, FFTW could crash when
+it tries to use an AltiVec feature.  (This is not necessary on MacOS X.) 
+<li>With SSE/SSE2 and <code>gcc</code>, you should use a version of gcc that
+properly aligns the stack when compiling any code that links to FFTW. 
+By default, <code>gcc</code> 2.95 and later versions align the stack as
+needed, but you should not compile FFTW with the <code>-Os</code> option or the
+<code>-mpreferred-stack-boundary</code> option with an argument less than 4. 
+<li>Because of the large variety of ARM processors and ABIs, FFTW
+does not attempt to guess the correct <code>gcc</code> flags for generating
+NEON code.  In general, you will have to provide them on the command line. 
+This command line is known to have worked at least once:
+          <pre class="example">               ./configure --with-slow-timer --host=arm-linux-gnueabi \
+                 --enable-single --enable-neon \
+                 "CC=arm-linux-gnueabi-gcc -march=armv7-a -mfloat-abi=softfp"
+</pre>
+          </ul>
+
+   </ul>
+
+   <p><a name="index-compiler-617"></a>To force <code>configure</code> to use a particular C compiler <i>foo</i>
+(instead of the default, usually <code>gcc</code>), pass <code>CC=</code><i>foo</i> to the
+<code>configure</code> script; you may also need to set the flags via the variable
+<code>CFLAGS</code> as described above. 
+<a name="index-compiler-flags-618"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Installation-on-non_002dUnix-systems.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Installation-on-non_002dUnix-systems.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,102 @@
+<html lang="en">
+<head>
+<title>Installation on non-Unix systems - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link rel="prev" href="Installation-on-Unix.html#Installation-on-Unix" title="Installation on Unix">
+<link rel="next" href="Cycle-Counters.html#Cycle-Counters" title="Cycle Counters">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Installation-on-non-Unix-systems"></a>
+<a name="Installation-on-non_002dUnix-systems"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>
+<hr>
+</div>
+
+<h3 class="section">10.2 Installation on non-Unix systems</h3>
+
+<p>It should be relatively straightforward to compile FFTW even on non-Unix
+systems lacking the niceties of a <code>configure</code> script.  Basically,
+you need to edit the <code>config.h</code> header (copy it from
+<code>config.h.in</code>) to <code>#define</code> the various options and compiler
+characteristics, and then compile all the &lsquo;<samp><span class="samp">.c</span></samp>&rsquo; files in the
+relevant directories.
+
+   <p>The <code>config.h</code> header contains about 100 options to set, each one
+initially an <code>#undef</code>, each documented with a comment, and most of
+them fairly obvious.  For most of the options, you should simply
+<code>#define</code> them to <code>1</code> if they are applicable, although a few
+options require a particular value (e.g. <code>SIZEOF_LONG_LONG</code> should
+be defined to the size of the <code>long long</code> type, in bytes, or zero
+if it is not supported).  We will likely post some sample
+<code>config.h</code> files for various operating systems and compilers for
+you to use (at least as a starting point).  Please let us know if you
+have to hand-create a configuration file (and/or a pre-compiled binary)
+that you want to share.
+
+   <p>To create the FFTW library, you will then need to compile all of the
+&lsquo;<samp><span class="samp">.c</span></samp>&rsquo; files in the <code>kernel</code>, <code>dft</code>, <code>dft/scalar</code>,
+<code>dft/scalar/codelets</code>, <code>rdft</code>, <code>rdft/scalar</code>,
+<code>rdft/scalar/r2cf</code>, <code>rdft/scalar/r2cb</code>,
+<code>rdft/scalar/r2r</code>, <code>reodft</code>, and <code>api</code> directories. 
+If you are compiling with SIMD support (e.g. you defined
+<code>HAVE_SSE2</code> in <code>config.h</code>), then you also need to compile
+the <code>.c</code> files in the <code>simd-support</code>,
+<code>{dft,rdft}/simd</code>, <code>{dft,rdft}/simd/*</code> directories.
+
+   <p>Once these files are all compiled, link them into a library, or a shared
+library, or directly into your program.
+
+   <p>To compile the FFTW test program, additionally compile the code in the
+<code>libbench2/</code> directory, and link it into a library.  Then compile
+the code in the <code>tests/</code> directory and link it to the
+<code>libbench2</code> and FFTW libraries.  To compile the <code>fftw-wisdom</code>
+(command-line) tool (see <a href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a>), compile
+<code>tools/fftw-wisdom.c</code> and link it to the <code>libbench2</code> and FFTW
+libraries
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Interleaved-and-split-arrays.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Interleaved-and-split-arrays.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,81 @@
+<html lang="en">
+<head>
+<title>Interleaved and split arrays - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="prev" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="next" href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes" title="Guru vector and transform sizes">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Interleaved-and-split-arrays"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.5.1 Interleaved and split arrays</h4>
+
+<p>The guru interface supports two representations of complex numbers,
+which we call the interleaved and the split format.
+
+   <p>The <dfn>interleaved</dfn> format is the same one used by the basic and
+advanced interfaces, and it is documented in <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a>. 
+In the interleaved format, you provide pointers to the real part of a
+complex number, and the imaginary part understood to be stored in the
+next memory location. 
+<a name="index-interleaved-format-241"></a>
+
+   <p>The <dfn>split</dfn> format allows separate pointers to the real and
+imaginary parts of a complex array. 
+<a name="index-split-format-242"></a>
+
+   <p>Technically, the interleaved format is redundant, because you can
+always express an interleaved array in terms of a split array with
+appropriate pointers and strides.  On the other hand, the interleaved
+format is simpler to use, and it is common in practice.  Hence, FFTW
+supports it as a special case.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Introduction.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Introduction.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+<html lang="en">
+<head>
+<title>Introduction - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="index.html#Top" title="Top">
+<link rel="next" href="Tutorial.html#Tutorial" title="Tutorial">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Introduction"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Tutorial.html#Tutorial">Tutorial</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="index.html#Top">Top</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">1 Introduction</h2>
+
+<p>This manual documents version 3.3.3 of FFTW, the
+<em>Fastest Fourier Transform in the West</em>.  FFTW is a comprehensive
+collection of fast C routines for computing the discrete Fourier
+transform (DFT) and various special cases thereof. 
+<a name="index-discrete-Fourier-transform-1"></a><a name="index-DFT-2"></a>
+     <ul>
+<li>FFTW computes the DFT of complex data, real data, even-
+  or odd-symmetric real data (these symmetric transforms are usually
+  known as the discrete cosine or sine transform, respectively), and the
+  discrete Hartley transform (DHT) of real data.
+
+     <li>The input data can have arbitrary length. 
+       FFTW employs <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) algorithms for all lengths, including
+       prime numbers.
+
+     <li>FFTW supports arbitrary multi-dimensional data.
+
+     <li>FFTW supports the SSE, SSE2, AVX, Altivec, and MIPS PS instruction
+       sets.
+
+     <li>FFTW includes parallel (multi-threaded) transforms
+       for shared-memory systems. 
+<li>Starting with version 3.3, FFTW includes distributed-memory parallel
+       transforms using MPI. 
+</ul>
+
+   <p>We assume herein that you are familiar with the properties and uses of
+the DFT that are relevant to your application.  Otherwise, see
+e.g. <cite>The Fast Fourier Transform and Its Applications</cite> by E. O. Brigham
+(Prentice-Hall, Englewood Cliffs, NJ, 1988). 
+<a href="http://www.fftw.org">Our web page</a> also has links to FFT-related
+information online. 
+<a name="index-FFTW-3"></a>
+<!-- TODO: revise.  We don't need to brag any longer -->
+<!-- FFTW is usually faster (and sometimes much faster) than all other -->
+<!-- freely-available Fourier transform programs found on the Net.  It is -->
+<!-- competitive with (and often faster than) the FFT codes in Sun's -->
+<!-- Performance Library, IBM's ESSL library, HP's CXML library, and -->
+<!-- Intel's MKL library, which are targeted at specific machines. -->
+<!-- Moreover, FFTW's performance is @emph{portable}.  Indeed, FFTW is -->
+<!-- unique in that it automatically adapts itself to your machine, your -->
+<!-- cache, the size of your memory, your number of registers, and all the -->
+<!-- other factors that normally make it impossible to optimize a program -->
+<!-- for more than one machine.  An extensive comparison of FFTW's -->
+<!-- performance with that of other Fourier transform codes has been made, -->
+<!-- and the results are available on the Web at -->
+<!-- @uref{http://fftw.org/benchfft, the benchFFT home page}. -->
+<!-- @cindex benchmark -->
+<!-- @fpindex benchfft -->
+
+   <p>In order to use FFTW effectively, you need to learn one basic concept
+of FFTW's internal structure: FFTW does not use a fixed algorithm for
+computing the transform, but instead it adapts the DFT algorithm to
+details of the underlying hardware in order to maximize performance. 
+Hence, the computation of the transform is split into two phases. 
+First, FFTW's <dfn>planner</dfn> &ldquo;learns&rdquo; the fastest way to compute the
+transform on your machine.  The planner
+<a name="index-planner-4"></a>produces a data structure called a <dfn>plan</dfn> that contains this
+<a name="index-plan-5"></a>information.  Subsequently, the plan is <dfn>executed</dfn>
+<a name="index-execute-6"></a>to transform the array of input data as dictated by the plan.  The
+plan can be reused as many times as needed.  In typical
+high-performance applications, many transforms of the same size are
+computed and, consequently, a relatively expensive initialization of
+this sort is acceptable.  On the other hand, if you need a single
+transform of a given size, the one-time cost of the planner becomes
+significant.  For this case, FFTW provides fast planners based on
+heuristics or on previously computed plans.
+
+   <p>FFTW supports transforms of data with arbitrary length, rank,
+multiplicity, and a general memory layout.  In simple cases, however,
+this generality may be unnecessary and confusing.  Consequently, we
+organized the interface to FFTW into three levels of increasing
+generality.
+     <ul>
+<li>The <dfn>basic interface</dfn> computes a single
+      transform of contiguous data. 
+<li>The <dfn>advanced interface</dfn> computes transforms
+      of multiple or strided arrays. 
+<li>The <dfn>guru interface</dfn> supports the most general data
+      layouts, multiplicities, and strides. 
+</ul>
+   We expect that most users will be best served by the basic interface,
+whereas the guru interface requires careful attention to the
+documentation to avoid problems. 
+<a name="index-basic-interface-7"></a><a name="index-advanced-interface-8"></a><a name="index-guru-interface-9"></a>
+
+   <p>Besides the automatic performance adaptation performed by the planner,
+it is also possible for advanced users to customize FFTW manually.  For
+example, if code space is a concern, we provide a tool that links only
+the subset of FFTW needed by your application.  Conversely, you may need
+to extend FFTW because the standard distribution is not sufficient for
+your needs.  For example, the standard FFTW distribution works most
+efficiently for arrays whose size can be factored into small primes
+(2, 3, 5, and 7), and otherwise it uses a
+slower general-purpose routine.  If you need efficient transforms of
+other sizes, you can use FFTW's code generator, which produces fast C
+programs (&ldquo;codelets&rdquo;) for any particular array size you may care
+about. 
+<a name="index-code-generator-10"></a><a name="index-codelet-11"></a>For example, if you need transforms of size
+513&nbsp;=&nbsp;19*3<sup>3</sup>,you can customize FFTW to support the factor 19 efficiently.
+
+   <p>For more information regarding FFTW, see the paper, &ldquo;The Design and
+Implementation of FFTW3,&rdquo; by M. Frigo and S. G. Johnson, which was an
+invited paper in <cite>Proc. IEEE</cite> <b>93</b> (2), p. 216 (2005).  The
+code generator is described in the paper &ldquo;A fast Fourier transform
+compiler&rdquo;,
+<a name="index-compiler-12"></a>by M. Frigo, in the <cite>Proceedings of the 1999 ACM SIGPLAN Conference
+on Programming Language Design and Implementation (PLDI), Atlanta,
+Georgia, May 1999</cite>.  These papers, along with the latest version of
+FFTW, the FAQ, benchmarks, and other links, are available at
+<a href="http://www.fftw.org">the FFTW home page</a>.
+
+   <p>The current version of FFTW incorporates many good ideas from the past
+thirty years of FFT literature.  In one way or another, FFTW uses the
+Cooley-Tukey algorithm, the prime factor algorithm, Rader's algorithm
+for prime sizes, and a split-radix algorithm (with a
+&ldquo;conjugate-pair&rdquo; variation pointed out to us by Dan Bernstein). 
+FFTW's code generator also produces new algorithms that we do not
+completely understand. 
+<a name="index-algorithm-13"></a>The reader is referred to the cited papers for the appropriate
+references.
+
+   <p>The rest of this manual is organized as follows.  We first discuss the
+sequential (single-processor) implementation.  We start by describing
+the basic interface/features of FFTW in <a href="Tutorial.html#Tutorial">Tutorial</a>. 
+Next, <a href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a> discusses data alignment
+(see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>),
+the storage scheme of multi-dimensional arrays
+(see <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>), and FFTW's mechanism for
+storing plans on disk (see <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>).  Next,
+<a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a> provides comprehensive documentation of all
+FFTW's features.  Parallel transforms are discussed in their own
+chapters: <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a> and <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>.  Fortran programmers can also use FFTW, as described in
+<a href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a> and <a href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>.  <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a> explains how to
+install FFTW in your computer system and how to adapt FFTW to your
+needs.  License and copyright information is given in <a href="License-and-Copyright.html#License-and-Copyright">License and Copyright</a>.  Finally, we thank all the people who helped us in
+<a href="Acknowledgments.html#Acknowledgments">Acknowledgments</a>.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Library-Index.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Library-Index.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,382 @@
+<html lang="en">
+<head>
+<title>Library Index - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Concept-Index.html#Concept-Index" title="Concept Index">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Library-Index"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Concept-Index.html#Concept-Index">Concept Index</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">14 Library Index</h2>
+
+
+
+<ul class="index-fn" compact>
+<li><a href="Wisdom-String-Export_002fImport-from-Fortran.html#index-c_005fassociated-569"><code>c_associated</code></a>: <a href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fDOUBLE-530"><code>C_DOUBLE</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-C_005fDOUBLE-509"><code>C_DOUBLE</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fDOUBLE_005fCOMPLEX-533"><code>C_DOUBLE_COMPLEX</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-C_005fDOUBLE_005fCOMPLEX-510"><code>C_DOUBLE_COMPLEX</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#index-c_005ff_005fpointer-575"><code>c_f_pointer</code></a>: <a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a></li>
+<li><a href="Wisdom-String-Export_002fImport-from-Fortran.html#index-c_005ff_005fpointer-570"><code>c_f_pointer</code></a>: <a href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a></li>
+<li><a href="Allocating-aligned-memory-in-Fortran.html#index-c_005ff_005fpointer-563"><code>c_f_pointer</code></a>: <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-c_005ff_005fpointer-547"><code>c_f_pointer</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Reversing-array-dimensions.html#index-c_005ff_005fpointer-526"><code>c_f_pointer</code></a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fFFTW_005fR2R_005fKIND-542"><code>C_FFTW_R2R_KIND</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fFLOAT-531"><code>C_FLOAT</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fFLOAT_005fCOMPLEX-534"><code>C_FLOAT_COMPLEX</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#index-c_005ffunloc-573"><code>c_funloc</code></a>: <a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fFUNPTR-552"><code>C_FUNPTR</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fINT-537"><code>C_INT</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-C_005fINT-508"><code>C_INT</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fINTPTR_005fT-538"><code>C_INTPTR_T</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#index-c_005floc-574"><code>c_loc</code></a>: <a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fLONG_005fDOUBLE-532"><code>C_LONG_DOUBLE</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fLONG_005fDOUBLE_005fCOMPLEX-535"><code>C_LONG_DOUBLE_COMPLEX</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-C_005fPTR-507"><code>C_PTR</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-C_005fSIZE_005fT-539"><code>C_SIZE_T</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fdestroy_005fplan-589"><code>dfftw_destroy_plan</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="FFTW-Execution-in-Fortran.html#index-dfftw_005fexecute-584"><code>dfftw_execute</code></a>: <a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fexecute_005fdft-588"><code>dfftw_execute_dft</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="FFTW-Execution-in-Fortran.html#index-dfftw_005fexecute_005fdft-585"><code>dfftw_execute_dft</code></a>: <a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fexecute_005fdft_005fr2c-594"><code>dfftw_execute_dft_r2c</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Wisdom-of-Fortran_003f.html#index-dfftw_005fexport_005fwisdom-601"><code>dfftw_export_wisdom</code></a>: <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a></li>
+<li><a href="Wisdom-of-Fortran_003f.html#index-dfftw_005fforget_005fwisdom-602"><code>dfftw_forget_wisdom</code></a>: <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a></li>
+<li><a href="Wisdom-of-Fortran_003f.html#index-dfftw_005fimport_005fsystem_005fwisdom-599"><code>dfftw_import_system_wisdom</code></a>: <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a></li>
+<li><a href="Wisdom-of-Fortran_003f.html#index-dfftw_005fimport_005fwisdom-600"><code>dfftw_import_wisdom</code></a>: <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005finit_005fthreads-590"><code>dfftw_init_threads</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fplan_005fdft_005f1d-587"><code>dfftw_plan_dft_1d</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fplan_005fdft_005f3d-592"><code>dfftw_plan_dft_3d</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fplan_005fdft_005fr2c_005f1d-593"><code>dfftw_plan_dft_r2c_1d</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fplan_005fdft_005fr2c_005f2d-595"><code>dfftw_plan_dft_r2c_2d</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Fortran-Examples.html#index-dfftw_005fplan_005fwith_005fnthreads-591"><code>dfftw_plan_with_nthreads</code></a>: <a href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a></li>
+<li><a href="Allocating-aligned-memory-in-Fortran.html#index-fftw_005falloc_005fcomplex-562"><code>fftw_alloc_complex</code></a>: <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a></li>
+<li><a href="Reversing-array-dimensions.html#index-fftw_005falloc_005fcomplex-525"><code>fftw_alloc_complex</code></a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-fftw_005falloc_005fcomplex-373"><code>fftw_alloc_complex</code></a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="Memory-Allocation.html#index-fftw_005falloc_005fcomplex-150"><code>fftw_alloc_complex</code></a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-fftw_005falloc_005fcomplex-114"><code>fftw_alloc_complex</code></a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005falloc_005fcomplex-17"><code>fftw_alloc_complex</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Allocating-aligned-memory-in-Fortran.html#index-fftw_005falloc_005freal-561"><code>fftw_alloc_real</code></a>: <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005falloc_005freal-546"><code>fftw_alloc_real</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#index-fftw_005falloc_005freal-395"><code>fftw_alloc_real</code></a>: <a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-fftw_005falloc_005freal-388"><code>fftw_alloc_real</code></a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="Memory-Allocation.html#index-fftw_005falloc_005freal-149"><code>fftw_alloc_real</code></a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-fftw_005falloc_005freal-113"><code>fftw_alloc_real</code></a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-DFTs.html#index-FFTW_005fBACKWARD-168"><code>FFTW_BACKWARD</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-FFTW_005fBACKWARD-53"><code>FFTW_BACKWARD</code></a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-FFTW_005fBACKWARD-25"><code>FFTW_BACKWARD</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="MPI-Initialization.html#index-fftw_005fcleanup-437"><code>fftw_cleanup</code></a>: <a href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fcleanup-155"><code>fftw_cleanup</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-fftw_005fcleanup_005fthreads-338"><code>fftw_cleanup_threads</code></a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fcomplex-528"><code>fftw_complex</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-fftw_005fcomplex-506"><code>fftw_complex</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Complex-numbers.html#index-fftw_005fcomplex-139"><code>fftw_complex</code></a>: <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fcomplex-19"><code>fftw_complex</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fcost-156"><code>fftw_cost</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-FFTW_005fDESTROY_005fINPUT-543"><code>FFTW_DESTROY_INPUT</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="FFTW-MPI-Performance-Tips.html#index-FFTW_005fDESTROY_005fINPUT-426"><code>FFTW_DESTROY_INPUT</code></a>: <a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">FFTW MPI Performance Tips</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fDESTROY_005fINPUT-176"><code>FFTW_DESTROY_INPUT</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-fftw_005fdestroy_005fplan-505"><code>fftw_destroy_plan</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-fftw_005fdestroy_005fplan-421"><code>fftw_destroy_plan</code></a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fdestroy_005fplan-154"><code>fftw_destroy_plan</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fdestroy_005fplan-32"><code>fftw_destroy_plan</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fDHT-216"><code>FFTW_DHT</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="The-Discrete-Hartley-Transform.html#index-FFTW_005fDHT-98"><code>FFTW_DHT</code></a>: <a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a></li>
+<li><a href="Cycle-Counters.html#index-FFTW_005fESTIMATE-621"><code>FFTW_ESTIMATE</code></a>: <a href="Cycle-Counters.html#Cycle-Counters">Cycle Counters</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fESTIMATE-171"><code>FFTW_ESTIMATE</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-FFTW_005fESTIMATE-129"><code>FFTW_ESTIMATE</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-FFTW_005fESTIMATE-28"><code>FFTW_ESTIMATE</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-fftw_005fexecute-554"><code>fftw_execute</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-fftw_005fexecute-513"><code>fftw_execute</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-fftw_005fexecute-420"><code>fftw_execute</code></a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-fftw_005fexecute-398"><code>fftw_execute</code></a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute-266"><code>fftw_execute</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fexecute-153"><code>fftw_execute</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fexecute-29"><code>fftw_execute</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-fftw_005fexecute_005fdft-555"><code>fftw_execute_dft</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-fftw_005fexecute_005fdft-504"><code>fftw_execute_dft</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-fftw_005fexecute_005fdft-497"><code>fftw_execute_dft</code></a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fdft-269"><code>fftw_execute_dft</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-fftw_005fexecute_005fdft_005fc2r-557"><code>fftw_execute_dft_c2r</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fdft_005fc2r-273"><code>fftw_execute_dft_c2r</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-fftw_005fexecute_005fdft_005fr2c-556"><code>fftw_execute_dft_r2c</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="Reversing-array-dimensions.html#index-fftw_005fexecute_005fdft_005fr2c-522"><code>fftw_execute_dft_r2c</code></a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fdft_005fr2c-271"><code>fftw_execute_dft_r2c</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-fftw_005fexecute_005fr2r-558"><code>fftw_execute_r2r</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fr2r-275"><code>fftw_execute_r2r</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fsplit_005fdft-270"><code>fftw_execute_split_dft</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fsplit_005fdft_005fc2r-274"><code>fftw_execute_split_dft_c2r</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-fftw_005fexecute_005fsplit_005fdft_005fr2c-272"><code>fftw_execute_split_dft_r2c</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fEXHAUSTIVE-174"><code>FFTW_EXHAUSTIVE</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-FFTW_005fEXHAUSTIVE-128"><code>FFTW_EXHAUSTIVE</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#index-fftw_005fexport_005fwisdom-572"><code>fftw_export_wisdom</code></a>: <a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Export.html#index-fftw_005fexport_005fwisdom-278"><code>fftw_export_wisdom</code></a>: <a href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a></li>
+<li><a href="Wisdom-Export.html#index-fftw_005fexport_005fwisdom_005fto_005ffile-280"><code>fftw_export_wisdom_to_file</code></a>: <a href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a></li>
+<li><a href="Wisdom-File-Export_002fImport-from-Fortran.html#index-fftw_005fexport_005fwisdom_005fto_005ffilename-567"><code>fftw_export_wisdom_to_filename</code></a>: <a href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran">Wisdom File Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Export.html#index-fftw_005fexport_005fwisdom_005fto_005ffilename-279"><code>fftw_export_wisdom_to_filename</code></a>: <a href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-fftw_005fexport_005fwisdom_005fto_005ffilename-130"><code>fftw_export_wisdom_to_filename</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Wisdom-String-Export_002fImport-from-Fortran.html#index-fftw_005fexport_005fwisdom_005fto_005fstring-568"><code>fftw_export_wisdom_to_string</code></a>: <a href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Export.html#index-fftw_005fexport_005fwisdom_005fto_005fstring-281"><code>fftw_export_wisdom_to_string</code></a>: <a href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fflops-545"><code>fftw_flops</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-fftw_005fflops-422"><code>fftw_flops</code></a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fflops-157"><code>fftw_flops</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Forgetting-Wisdom.html#index-fftw_005fforget_005fwisdom-288"><code>fftw_forget_wisdom</code></a>: <a href="Forgetting-Wisdom.html#Forgetting-Wisdom">Forgetting Wisdom</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-fftw_005fforget_005fwisdom-132"><code>fftw_forget_wisdom</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Complex-DFTs.html#index-FFTW_005fFORWARD-167"><code>FFTW_FORWARD</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-FFTW_005fFORWARD-52"><code>FFTW_FORWARD</code></a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-FFTW_005fFORWARD-24"><code>FFTW_FORWARD</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Using-Plans.html#index-fftw_005ffprint_005fplan-158"><code>fftw_fprint_plan</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Memory-Allocation.html#index-fftw_005ffree-146"><code>fftw_free</code></a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-fftw_005ffree-111"><code>fftw_free</code></a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005ffree-33"><code>fftw_free</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fHC2R-215"><code>FFTW_HC2R</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-FFTW_005fHC2R-76"><code>FFTW_HC2R</code></a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="Wisdom-File-Export_002fImport-from-Fortran.html#index-fftw_005fimport-wisdom_005ffrom_005ffilename-566"><code>fftw_import wisdom_from_filename</code></a>: <a href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran">Wisdom File Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Import.html#index-fftw_005fimport_005fsystem_005fwisdom-283"><code>fftw_import_system_wisdom</code></a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="Caveats-in-Using-Wisdom.html#index-fftw_005fimport_005fsystem_005fwisdom-137"><code>fftw_import_system_wisdom</code></a>: <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a></li>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#index-fftw_005fimport_005fwisdom-576"><code>fftw_import_wisdom</code></a>: <a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Import.html#index-fftw_005fimport_005fwisdom-282"><code>fftw_import_wisdom</code></a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="Wisdom-Import.html#index-fftw_005fimport_005fwisdom_005ffrom_005ffile-285"><code>fftw_import_wisdom_from_file</code></a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="Wisdom-Import.html#index-fftw_005fimport_005fwisdom_005ffrom_005ffilename-284"><code>fftw_import_wisdom_from_filename</code></a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-fftw_005fimport_005fwisdom_005ffrom_005ffilename-131"><code>fftw_import_wisdom_from_filename</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Wisdom-String-Export_002fImport-from-Fortran.html#index-fftw_005fimport_005fwisdom_005ffrom_005fstring-571"><code>fftw_import_wisdom_from_string</code></a>: <a href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a></li>
+<li><a href="Wisdom-Import.html#index-fftw_005fimport_005fwisdom_005ffrom_005fstring-286"><code>fftw_import_wisdom_from_string</code></a>: <a href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a></li>
+<li><a href="MPI-Initialization.html#index-fftw_005finit_005fthreads-435"><code>fftw_init_threads</code></a>: <a href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-fftw_005finit_005fthreads-429"><code>fftw_init_threads</code></a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-fftw_005finit_005fthreads-354"><code>fftw_init_threads</code></a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-fftw_005finit_005fthreads-334"><code>fftw_init_threads</code></a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="Fortran_002dinterface-routines.html#index-fftw_005fiodim-580"><code>fftw_iodim</code></a>: <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fiodim-549"><code>fftw_iodim</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Guru-vector-and-transform-sizes.html#index-fftw_005fiodim-243"><code>fftw_iodim</code></a>: <a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">Guru vector and transform sizes</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fiodim64-550"><code>fftw_iodim64</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="64_002dbit-Guru-Interface.html#index-fftw_005fiodim64-263"><code>fftw_iodim64</code></a>: <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fmalloc-540"><code>fftw_malloc</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Using-MPI-Plans.html#index-fftw_005fmalloc-446"><code>fftw_malloc</code></a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-fftw_005fmalloc-372"><code>fftw_malloc</code></a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="Memory-Allocation.html#index-fftw_005fmalloc-145"><code>fftw_malloc</code></a>: <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a></li>
+<li><a href="Dynamic-Arrays-in-C.html#index-fftw_005fmalloc-121"><code>fftw_malloc</code></a>: <a href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a></li>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#index-fftw_005fmalloc-110"><code>fftw_malloc</code></a>: <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fmalloc-16"><code>fftw_malloc</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="An-improved-replacement-for-MPI_005fAlltoall.html#index-FFTW_005fMEASURE-408"><code>FFTW_MEASURE</code></a>: <a href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fMEASURE-172"><code>FFTW_MEASURE</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-FFTW_005fMEASURE-126"><code>FFTW_MEASURE</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-FFTW_005fMEASURE-27"><code>FFTW_MEASURE</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="MPI-Wisdom-Communication.html#index-fftw_005fmpi_005fbroadcast_005fwisdom-493"><code>fftw_mpi_broadcast_wisdom</code></a>: <a href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication">MPI Wisdom Communication</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-fftw_005fmpi_005fbroadcast_005fwisdom-414"><code>fftw_mpi_broadcast_wisdom</code></a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="MPI-Initialization.html#index-fftw_005fmpi_005fcleanup-436"><code>fftw_mpi_cleanup</code></a>: <a href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-fftw_005fmpi_005fcleanup-356"><code>fftw_mpi_cleanup</code></a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="MPI-Plan-Creation.html#index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-469"><code>FFTW_MPI_DEFAULT_BLOCK</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Advanced-distributed_002dtranspose-interface.html#index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-404"><code>FFTW_MPI_DEFAULT_BLOCK</code></a>: <a href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-376"><code>FFTW_MPI_DEFAULT_BLOCK</code></a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="FFTW-MPI-Fortran-Interface.html#index-fftw_005fmpi_005fexecute_005fdft-498"><code>fftw_mpi_execute_dft</code></a>: <a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">FFTW MPI Fortran Interface</a></li>
+<li><a href="Using-MPI-Plans.html#index-fftw_005fmpi_005fexecute_005fdft-441"><code>fftw_mpi_execute_dft</code></a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="Using-MPI-Plans.html#index-fftw_005fmpi_005fexecute_005fdft_005fc2r-443"><code>fftw_mpi_execute_dft_c2r</code></a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="Using-MPI-Plans.html#index-fftw_005fmpi_005fexecute_005fdft_005fr2c-442"><code>fftw_mpi_execute_dft_r2c</code></a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fexecute_005fr2r-491"><code>fftw_mpi_execute_r2r</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Using-MPI-Plans.html#index-fftw_005fmpi_005fexecute_005fr2r-444"><code>fftw_mpi_execute_r2r</code></a>: <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a></li>
+<li><a href="MPI-Wisdom-Communication.html#index-fftw_005fmpi_005fgather_005fwisdom-492"><code>fftw_mpi_gather_wisdom</code></a>: <a href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication">MPI Wisdom Communication</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-fftw_005fmpi_005fgather_005fwisdom-413"><code>fftw_mpi_gather_wisdom</code></a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="MPI-Initialization.html#index-fftw_005fmpi_005finit-434"><code>fftw_mpi_init</code></a>: <a href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-fftw_005fmpi_005finit-428"><code>fftw_mpi_init</code></a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="FFTW-MPI-Wisdom.html#index-fftw_005fmpi_005finit-416"><code>fftw_mpi_init</code></a>: <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a></li>
+<li><a href="2d-MPI-example.html#index-fftw_005fmpi_005finit-358"><code>fftw_mpi_init</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#index-fftw_005fmpi_005finit-355"><code>fftw_mpi_init</code></a>: <a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">Linking and Initializing MPI FFTW</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize-450"><code>fftw_mpi_local_size</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005f1d-457"><code>fftw_mpi_local_size_1d</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="One_002ddimensional-distributions.html#index-fftw_005fmpi_005flocal_005fsize_005f1d-383"><code>fftw_mpi_local_size_1d</code></a>: <a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005f2d-448"><code>fftw_mpi_local_size_2d</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-fftw_005fmpi_005flocal_005fsize_005f2d-370"><code>fftw_mpi_local_size_2d</code></a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="2d-MPI-example.html#index-fftw_005fmpi_005flocal_005fsize_005f2d-364"><code>fftw_mpi_local_size_2d</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005f2d_005ftransposed-451"><code>fftw_mpi_local_size_2d_transposed</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-fftw_005fmpi_005flocal_005fsize_005f2d_005ftransposed-402"><code>fftw_mpi_local_size_2d_transposed</code></a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005f3d-449"><code>fftw_mpi_local_size_3d</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005f3d_005ftransposed-452"><code>fftw_mpi_local_size_3d_transposed</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Transposed-distributions.html#index-fftw_005fmpi_005flocal_005fsize_005f3d_005ftransposed-382"><code>fftw_mpi_local_size_3d_transposed</code></a>: <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005fmany-455"><code>fftw_mpi_local_size_many</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#index-fftw_005fmpi_005flocal_005fsize_005fmany-375"><code>fftw_mpi_local_size_many</code></a>: <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005fmany_005f1d-458"><code>fftw_mpi_local_size_many_1d</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005fmany_005ftransposed-456"><code>fftw_mpi_local_size_many_transposed</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="Advanced-distributed_002dtranspose-interface.html#index-fftw_005fmpi_005flocal_005fsize_005fmany_005ftransposed-405"><code>fftw_mpi_local_size_many_transposed</code></a>: <a href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-fftw_005fmpi_005flocal_005fsize_005ftransposed-453"><code>fftw_mpi_local_size_transposed</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft-464"><code>fftw_mpi_plan_dft</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005f1d-461"><code>fftw_mpi_plan_dft_1d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005f2d-462"><code>fftw_mpi_plan_dft_2d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="2d-MPI-example.html#index-fftw_005fmpi_005fplan_005fdft_005f2d-360"><code>fftw_mpi_plan_dft_2d</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005f3d-463"><code>fftw_mpi_plan_dft_3d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fc2r-483"><code>fftw_mpi_plan_dft_c2r</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fc2r_005f2d-480"><code>fftw_mpi_plan_dft_c2r_2d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fc2r_005f3d-482"><code>fftw_mpi_plan_dft_c2r_3d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fr2c-479"><code>fftw_mpi_plan_dft_r2c</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fr2c_005f2d-476"><code>fftw_mpi_plan_dft_r2c_2d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fdft_005fr2c_005f3d-478"><code>fftw_mpi_plan_dft_r2c_3d</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fmany_005fdft-465"><code>fftw_mpi_plan_many_dft</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fmany_005fdft_005fc2r-485"><code>fftw_mpi_plan_many_dft_c2r</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fmany_005fdft_005fr2c-484"><code>fftw_mpi_plan_many_dft_r2c</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005fmany_005ftranspose-489"><code>fftw_mpi_plan_many_transpose</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Advanced-distributed_002dtranspose-interface.html#index-fftw_005fmpi_005fplan_005fmany_005ftranspose-403"><code>fftw_mpi_plan_many_transpose</code></a>: <a href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">Advanced distributed-transpose interface</a></li>
+<li><a href="MPI-Plan-Creation.html#index-fftw_005fmpi_005fplan_005ftranspose-488"><code>fftw_mpi_plan_transpose</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-fftw_005fmpi_005fplan_005ftranspose-397"><code>fftw_mpi_plan_transpose</code></a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="MPI-Plan-Creation.html#index-FFTW_005fMPI_005fSCRAMBLED_005fIN-472"><code>FFTW_MPI_SCRAMBLED_IN</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-FFTW_005fMPI_005fSCRAMBLED_005fIN-460"><code>FFTW_MPI_SCRAMBLED_IN</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="One_002ddimensional-distributions.html#index-FFTW_005fMPI_005fSCRAMBLED_005fIN-385"><code>FFTW_MPI_SCRAMBLED_IN</code></a>: <a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a></li>
+<li><a href="MPI-Plan-Creation.html#index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-471"><code>FFTW_MPI_SCRAMBLED_OUT</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="MPI-Data-Distribution-Functions.html#index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-459"><code>FFTW_MPI_SCRAMBLED_OUT</code></a>: <a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a></li>
+<li><a href="One_002ddimensional-distributions.html#index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-384"><code>FFTW_MPI_SCRAMBLED_OUT</code></a>: <a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a></li>
+<li><a href="MPI-Plan-Creation.html#index-FFTW_005fMPI_005fTRANSPOSED_005fIN-474"><code>FFTW_MPI_TRANSPOSED_IN</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-FFTW_005fMPI_005fTRANSPOSED_005fIN-400"><code>FFTW_MPI_TRANSPOSED_IN</code></a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="Transposed-distributions.html#index-FFTW_005fMPI_005fTRANSPOSED_005fIN-381"><code>FFTW_MPI_TRANSPOSED_IN</code></a>: <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a></li>
+<li><a href="MPI-Plan-Creation.html#index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-473"><code>FFTW_MPI_TRANSPOSED_OUT</code></a>: <a href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a></li>
+<li><a href="Basic-distributed_002dtranspose-interface.html#index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-399"><code>FFTW_MPI_TRANSPOSED_OUT</code></a>: <a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">Basic distributed-transpose interface</a></li>
+<li><a href="Transposed-distributions.html#index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-380"><code>FFTW_MPI_TRANSPOSED_OUT</code></a>: <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fNO_005fTIMELIMIT-184"><code>FFTW_NO_TIMELIMIT</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="An-improved-replacement-for-MPI_005fAlltoall.html#index-FFTW_005fPATIENT-409"><code>FFTW_PATIENT</code></a>: <a href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a></li>
+<li><a href="How-Many-Threads-to-Use_003f.html#index-FFTW_005fPATIENT-340"><code>FFTW_PATIENT</code></a>: <a href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">How Many Threads to Use?</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fPATIENT-173"><code>FFTW_PATIENT</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#index-FFTW_005fPATIENT-127"><code>FFTW_PATIENT</code></a>: <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-FFTW_005fPATIENT-38"><code>FFTW_PATIENT</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fplan-527"><code>fftw_plan</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fplan-152"><code>fftw_plan</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fplan-22"><code>fftw_plan</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Complex-DFTs.html#index-fftw_005fplan_005fdft-164"><code>fftw_plan_dft</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-fftw_005fplan_005fdft-42"><code>fftw_plan_dft</code></a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Complex-DFTs.html#index-fftw_005fplan_005fdft_005f1d-161"><code>fftw_plan_dft_1d</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="Complex-One_002dDimensional-DFTs.html#index-fftw_005fplan_005fdft_005f1d-21"><code>fftw_plan_dft_1d</code></a>: <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a></li>
+<li><a href="Overview-of-Fortran-interface.html#index-fftw_005fplan_005fdft_005f2d-503"><code>fftw_plan_dft_2d</code></a>: <a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a></li>
+<li><a href="Complex-DFTs.html#index-fftw_005fplan_005fdft_005f2d-162"><code>fftw_plan_dft_2d</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-fftw_005fplan_005fdft_005f2d-39"><code>fftw_plan_dft_2d</code></a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Reversing-array-dimensions.html#index-fftw_005fplan_005fdft_005f3d-519"><code>fftw_plan_dft_3d</code></a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Complex-DFTs.html#index-fftw_005fplan_005fdft_005f3d-163"><code>fftw_plan_dft_3d</code></a>: <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a></li>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#index-fftw_005fplan_005fdft_005f3d-40"><code>fftw_plan_dft_3d</code></a>: <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fc2r-196"><code>fftw_plan_dft_c2r</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fc2r_005f1d-193"><code>fftw_plan_dft_c2r_1d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-fftw_005fplan_005fdft_005fc2r_005f1d-49"><code>fftw_plan_dft_c2r_1d</code></a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fc2r_005f2d-194"><code>fftw_plan_dft_c2r_2d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fc2r_005f3d-195"><code>fftw_plan_dft_c2r_3d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fr2c-188"><code>fftw_plan_dft_r2c</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-fftw_005fplan_005fdft_005fr2c-61"><code>fftw_plan_dft_r2c</code></a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fr2c_005f1d-185"><code>fftw_plan_dft_r2c_1d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-fftw_005fplan_005fdft_005fr2c_005f1d-48"><code>fftw_plan_dft_r2c_1d</code></a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fr2c_005f2d-186"><code>fftw_plan_dft_r2c_2d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-fftw_005fplan_005fdft_005fr2c_005f2d-59"><code>fftw_plan_dft_r2c_2d</code></a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="Reversing-array-dimensions.html#index-fftw_005fplan_005fdft_005fr2c_005f3d-521"><code>fftw_plan_dft_r2c_3d</code></a>: <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a></li>
+<li><a href="Real_002ddata-DFTs.html#index-fftw_005fplan_005fdft_005fr2c_005f3d-187"><code>fftw_plan_dft_r2c_3d</code></a>: <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a></li>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#index-fftw_005fplan_005fdft_005fr2c_005f3d-60"><code>fftw_plan_dft_r2c_3d</code></a>: <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a></li>
+<li><a href="64_002dbit-Guru-Interface.html#index-fftw_005fplan_005fguru64_005fdft-262"><code>fftw_plan_guru64_dft</code></a>: <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a></li>
+<li><a href="Guru-Complex-DFTs.html#index-fftw_005fplan_005fguru_005fdft-248"><code>fftw_plan_guru_dft</code></a>: <a href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-fftw_005fplan_005fguru_005fdft_005fc2r-253"><code>fftw_plan_guru_dft_c2r</code></a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-fftw_005fplan_005fguru_005fdft_005fr2c-251"><code>fftw_plan_guru_dft_r2c</code></a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Guru-Real_002dto_002dreal-Transforms.html#index-fftw_005fplan_005fguru_005fr2r-258"><code>fftw_plan_guru_r2r</code></a>: <a href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">Guru Real-to-real Transforms</a></li>
+<li><a href="Guru-Complex-DFTs.html#index-fftw_005fplan_005fguru_005fsplit_005fdft-249"><code>fftw_plan_guru_split_dft</code></a>: <a href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">Guru Complex DFTs</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-fftw_005fplan_005fguru_005fsplit_005fdft_005fc2r-254"><code>fftw_plan_guru_split_dft_c2r</code></a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Guru-Real_002ddata-DFTs.html#index-fftw_005fplan_005fguru_005fsplit_005fdft_005fr2c-252"><code>fftw_plan_guru_split_dft_r2c</code></a>: <a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">Guru Real-data DFTs</a></li>
+<li><a href="Advanced-Complex-DFTs.html#index-fftw_005fplan_005fmany_005fdft-232"><code>fftw_plan_many_dft</code></a>: <a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a></li>
+<li><a href="Advanced-Real_002ddata-DFTs.html#index-fftw_005fplan_005fmany_005fdft_005fc2r-237"><code>fftw_plan_many_dft_c2r</code></a>: <a href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a></li>
+<li><a href="Advanced-Real_002ddata-DFTs.html#index-fftw_005fplan_005fmany_005fdft_005fr2c-236"><code>fftw_plan_many_dft_r2c</code></a>: <a href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">Advanced Real-data DFTs</a></li>
+<li><a href="Advanced-Real_002dto_002dreal-Transforms.html#index-fftw_005fplan_005fmany_005fr2r-238"><code>fftw_plan_many_r2r</code></a>: <a href="Advanced-Real_002dto_002dreal-Transforms.html#Advanced-Real_002dto_002dreal-Transforms">Advanced Real-to-real Transforms</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-fftw_005fplan_005fr2r-207"><code>fftw_plan_r2r</code></a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-fftw_005fplan_005fr2r-69"><code>fftw_plan_r2r</code></a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-fftw_005fplan_005fr2r_005f1d-204"><code>fftw_plan_r2r_1d</code></a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-fftw_005fplan_005fr2r_005f1d-66"><code>fftw_plan_r2r_1d</code></a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-fftw_005fplan_005fr2r_005f2d-205"><code>fftw_plan_r2r_2d</code></a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-fftw_005fplan_005fr2r_005f2d-67"><code>fftw_plan_r2r_2d</code></a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-fftw_005fplan_005fr2r_005f3d-206"><code>fftw_plan_r2r_3d</code></a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-fftw_005fplan_005fr2r_005f3d-68"><code>fftw_plan_r2r_3d</code></a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Combining-MPI-and-Threads.html#index-fftw_005fplan_005fwith_005fnthreads-430"><code>fftw_plan_with_nthreads</code></a>: <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a></li>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#index-fftw_005fplan_005fwith_005fnthreads-335"><code>fftw_plan_with_nthreads</code></a>: <a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fPRESERVE_005fINPUT-178"><code>FFTW_PRESERVE_INPUT</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#index-FFTW_005fPRESERVE_005fINPUT-57"><code>FFTW_PRESERVE_INPUT</code></a>: <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a></li>
+<li><a href="Using-Plans.html#index-fftw_005fprint_005fplan-159"><code>fftw_print_plan</code></a>: <a href="Using-Plans.html#Using-Plans">Using Plans</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fR2HC-214"><code>FFTW_R2HC</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#index-FFTW_005fR2HC-72"><code>FFTW_R2HC</code></a>: <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-fftw_005fr2r_005fkind-541"><code>fftw_r2r_kind</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#index-fftw_005fr2r_005fkind-394"><code>fftw_r2r_kind</code></a>: <a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a></li>
+<li><a href="More-DFTs-of-Real-Data.html#index-fftw_005fr2r_005fkind-71"><code>fftw_r2r_kind</code></a>: <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fREDFT00-218"><code>FFTW_REDFT00</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real_002dto_002dReal-Transforms.html#index-FFTW_005fREDFT00-208"><code>FFTW_REDFT00</code></a>: <a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fREDFT00-87"><code>FFTW_REDFT00</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fREDFT01-222"><code>FFTW_REDFT01</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fREDFT01-89"><code>FFTW_REDFT01</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fREDFT10-221"><code>FFTW_REDFT10</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fREDFT10-88"><code>FFTW_REDFT10</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fREDFT11-224"><code>FFTW_REDFT11</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fREDFT11-91"><code>FFTW_REDFT11</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fRODFT00-225"><code>FFTW_RODFT00</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fRODFT00-92"><code>FFTW_RODFT00</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fRODFT01-229"><code>FFTW_RODFT01</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fRODFT01-94"><code>FFTW_RODFT01</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fRODFT10-228"><code>FFTW_RODFT10</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fRODFT10-93"><code>FFTW_RODFT10</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#index-FFTW_005fRODFT11-230"><code>FFTW_RODFT11</code></a>: <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a></li>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#index-FFTW_005fRODFT11-95"><code>FFTW_RODFT11</code></a>: <a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a></li>
+<li><a href="Planner-Flags.html#index-fftw_005fset_005ftimelimit-183"><code>fftw_set_timelimit</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-FFTW_005fTRANSPOSED_005fIN-392"><code>FFTW_TRANSPOSED_IN</code></a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#index-FFTW_005fTRANSPOSED_005fOUT-391"><code>FFTW_TRANSPOSED_OUT</code></a>: <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a></li>
+<li><a href="FFTW-Execution-in-Fortran.html#index-FFTW_005fUNALIGNED-586"><code>FFTW_UNALIGNED</code></a>: <a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a></li>
+<li><a href="Plan-execution-in-Fortran.html#index-FFTW_005fUNALIGNED-559"><code>FFTW_UNALIGNED</code></a>: <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a></li>
+<li><a href="New_002darray-Execute-Functions.html#index-FFTW_005fUNALIGNED-267"><code>FFTW_UNALIGNED</code></a>: <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fUNALIGNED-181"><code>FFTW_UNALIGNED</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="Planner-Flags.html#index-FFTW_005fWISDOM_005fONLY-175"><code>FFTW_WISDOM_ONLY</code></a>: <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a></li>
+<li><a href="An-improved-replacement-for-MPI_005fAlltoall.html#index-MPI_005fAlltoall-406"><code>MPI_Alltoall</code></a>: <a href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">An improved replacement for MPI_Alltoall</a></li>
+<li><a href="Avoiding-MPI-Deadlocks.html#index-MPI_005fBarrier-419"><code>MPI_Barrier</code></a>: <a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">Avoiding MPI Deadlocks</a></li>
+<li><a href="2d-MPI-example.html#index-MPI_005fCOMM_005fWORLD-361"><code>MPI_COMM_WORLD</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="Distributed_002dmemory-FFTW-with-MPI.html#index-MPI_005fCOMM_005fWORLD-348"><code>MPI_COMM_WORLD</code></a>: <a href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a></li>
+<li><a href="2d-MPI-example.html#index-MPI_005fInit-357"><code>MPI_Init</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="FFTW-Fortran-type-reference.html#index-ptrdiff_005ft-536"><code>ptrdiff_t</code></a>: <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a></li>
+<li><a href="2d-MPI-example.html#index-ptrdiff_005ft-362"><code>ptrdiff_t</code></a>: <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a></li>
+<li><a href="64_002dbit-Guru-Interface.html#index-ptrdiff_005ft-261"><code>ptrdiff_t</code></a>: <a href="64_002dbit-Guru-Interface.html#g_t64_002dbit-Guru-Interface">64-bit Guru Interface</a></li>
+<li><a href="The-1d-Real_002ddata-DFT.html#index-R2HC-298"><code>R2HC</code></a>: <a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-REDFT00-303"><code>REDFT00</code></a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-REDFT01-308"><code>REDFT01</code></a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-REDFT10-307"><code>REDFT10</code></a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#index-REDFT11-310"><code>REDFT11</code></a>: <a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-RODFT00-314"><code>RODFT00</code></a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-RODFT01-319"><code>RODFT01</code></a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-RODFT10-318"><code>RODFT10</code></a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#index-RODFT11-320"><code>RODFT11</code></a>: <a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a></li>
+   </ul><!-- ************************************************************ -->
+</body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/License-and-Copyright.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/License-and-Copyright.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+<html lang="en">
+<head>
+<title>License and Copyright - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Acknowledgments.html#Acknowledgments" title="Acknowledgments">
+<link rel="next" href="Concept-Index.html#Concept-Index" title="Concept Index">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="License-and-Copyright"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Concept-Index.html#Concept-Index">Concept Index</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Acknowledgments.html#Acknowledgments">Acknowledgments</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">12 License and Copyright</h2>
+
+<p>FFTW is Copyright &copy; 2003, 2007-11 Matteo Frigo, Copyright
+&copy; 2003, 2007-11 Massachusetts Institute of Technology.
+
+   <p>FFTW is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+   <p>This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+   <p>You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA You can also
+find the <a href="http://www.gnu.org/licenses/gpl-2.0.html">GPL on the GNU web site</a>.
+
+   <p>In addition, we kindly ask you to acknowledge FFTW and its authors in
+any program or publication in which you use FFTW.  (You are not
+<em>required</em> to do so; it is up to your common sense to decide
+whether you want to comply with this request or not.)  For general
+publications, we suggest referencing: Matteo Frigo and Steven
+G. Johnson, &ldquo;The design and implementation of FFTW3,&rdquo;
+<i>Proc. IEEE</i> <b>93</b> (2), 216&ndash;231 (2005).
+
+   <p>Non-free versions of FFTW are available under terms different from those
+of the General Public License. (e.g. they do not require you to
+accompany any object code using FFTW with the corresponding source
+code.)  For these alternative terms you must purchase a license from MIT's
+Technology Licensing Office.  Users interested in such a license should
+contact us (<a href="mailto:fftw@fftw.org">fftw@fftw.org</a>) for more information.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Linking-and-Initializing-MPI-FFTW.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Linking-and-Initializing-MPI-FFTW.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+<html lang="en">
+<head>
+<title>Linking and Initializing MPI FFTW - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="FFTW-MPI-Installation.html#FFTW-MPI-Installation" title="FFTW MPI Installation">
+<link rel="next" href="2d-MPI-example.html#g_t2d-MPI-example" title="2d MPI example">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Linking-and-Initializing-MPI-FFTW"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">FFTW MPI Installation</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.2 Linking and Initializing MPI FFTW</h3>
+
+<p>Programs using the MPI FFTW routines should be linked with
+<code>-lfftw3_mpi -lfftw3 -lm</code> on Unix in double precision,
+<code>-lfftw3f_mpi -lfftw3f -lm</code> in single precision, and so on
+(see <a href="Precision.html#Precision">Precision</a>). You will also need to link with whatever library
+is responsible for MPI on your system; in most MPI implementations,
+there is a special compiler alias named <code>mpicc</code> to compile and
+link MPI code. 
+<a name="index-mpicc-351"></a><a name="index-linking-on-Unix-352"></a><a name="index-precision-353"></a>
+
+   <p><a name="index-fftw_005finit_005fthreads-354"></a>Before calling any FFTW routines except possibly
+<code>fftw_init_threads</code> (see <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a>), but after calling
+<code>MPI_Init</code>, you should call the function:
+
+<pre class="example">     void fftw_mpi_init(void);
+</pre>
+   <p><a name="index-fftw_005fmpi_005finit-355"></a>
+If, at the end of your program, you want to get rid of all memory and
+other resources allocated internally by FFTW, for both the serial and
+MPI routines, you can call:
+
+<pre class="example">     void fftw_mpi_cleanup(void);
+</pre>
+   <p><a name="index-fftw_005fmpi_005fcleanup-356"></a>
+which is much like the <code>fftw_cleanup()</code> function except that it
+also gets rid of FFTW's MPI-related data.  You must <em>not</em> execute
+any previously created plans after calling this function.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Load-balancing.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Load-balancing.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,89 @@
+<html lang="en">
+<head>
+<title>Load balancing - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="prev" href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces" title="Basic and advanced distribution interfaces">
+<link rel="next" href="Transposed-distributions.html#Transposed-distributions" title="Transposed distributions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Load-balancing"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.4.2 Load balancing</h4>
+
+<p><a name="index-load-balancing-378"></a>
+Ideally, when you parallelize a transform over some P
+processes, each process should end up with work that takes equal time. 
+Otherwise, all of the processes end up waiting on whichever process is
+slowest.  This goal is known as &ldquo;load balancing.&rdquo;  In this section,
+we describe the circumstances under which FFTW is able to load-balance
+well, and in particular how you should choose your transform size in
+order to load balance.
+
+   <p>Load balancing is especially difficult when you are parallelizing over
+heterogeneous machines; for example, if one of your processors is a
+old 486 and another is a Pentium IV, obviously you should give the
+Pentium more work to do than the 486 since the latter is much slower. 
+FFTW does not deal with this problem, however&mdash;it assumes that your
+processes run on hardware of comparable speed, and that the goal is
+therefore to divide the problem as equally as possible.
+
+   <p>For a multi-dimensional complex DFT, FFTW can divide the problem
+equally among the processes if: (i) the <em>first</em> dimension
+<code>n0</code> is divisible by P; and (ii), the <em>product</em> of
+the subsequent dimensions is divisible by P.  (For the advanced
+interface, where you can specify multiple simultaneous transforms via
+some &ldquo;vector&rdquo; length <code>howmany</code>, a factor of <code>howmany</code> is
+included in the product of the subsequent dimensions.)
+
+   <p>For a one-dimensional complex DFT, the length <code>N</code> of the data
+should be divisible by P <em>squared</em> to be able to divide
+the problem equally among the processes.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Data-Distribution-Functions.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Data-Distribution-Functions.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,149 @@
+<html lang="en">
+<head>
+<title>MPI Data Distribution Functions - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="Using-MPI-Plans.html#Using-MPI-Plans" title="Using MPI Plans">
+<link rel="next" href="MPI-Plan-Creation.html#MPI-Plan-Creation" title="MPI Plan Creation">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Data-Distribution-Functions"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.4 MPI Data Distribution Functions</h4>
+
+<p><a name="index-data-distribution-447"></a>As described above (see <a href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>), in order to
+allocate your arrays, <em>before</em> creating a plan, you must first
+call one of the following routines to determine the required
+allocation size and the portion of the array locally stored on a given
+process.  The <code>MPI_Comm</code> communicator passed here must be
+equivalent to the communicator used below for plan creation.
+
+   <p>The basic interface for multidimensional transforms consists of the
+functions:
+
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f2d-448"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005f3d-449"></a><a name="index-fftw_005fmpi_005flocal_005fsize-450"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005f2d_005ftransposed-451"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005f3d_005ftransposed-452"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005ftransposed-453"></a>
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                      MPI_Comm comm,
+                                      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                                   ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     
+     ptrdiff_t fftw_mpi_local_size_2d_transposed(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                 ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+     ptrdiff_t fftw_mpi_local_size_3d_transposed(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                                 MPI_Comm comm,
+                                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                 ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+     ptrdiff_t fftw_mpi_local_size_transposed(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                                              ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                              ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+</pre>
+   <p>These functions return the number of elements to allocate (complex
+numbers for DFT/r2c/c2r plans, real numbers for r2r plans), whereas
+the <code>local_n0</code> and <code>local_0_start</code> return the portion
+(<code>local_0_start</code> to <code>local_0_start + local_n0 - 1</code>) of the
+first dimension of an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> array that is stored on the local
+process.  See <a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a>.  For
+<code>FFTW_MPI_TRANSPOSED_OUT</code> plans, the &lsquo;<samp><span class="samp">_transposed</span></samp>&rsquo; variants
+are useful in order to also return the local portion of the first
+dimension in the n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub> transposed output.  See <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>.  The advanced interface for multidimensional
+transforms is:
+
+   <p><a name="index-advanced-interface-454"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005fmany-455"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005fmany_005ftransposed-456"></a>
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                        ptrdiff_t block0, MPI_Comm comm,
+                                        ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+     ptrdiff_t fftw_mpi_local_size_many_transposed(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                                   ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm,
+                                                   ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                                   ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+</pre>
+   <p>These differ from the basic interface in only two ways.  First, they
+allow you to specify block sizes <code>block0</code> and <code>block1</code> (the
+latter for the transposed output); you can pass
+<code>FFTW_MPI_DEFAULT_BLOCK</code> to use FFTW's default block size as in
+the basic interface.  Second, you can pass a <code>howmany</code> parameter,
+corresponding to the advanced planning interface below: this is for
+transforms of contiguous <code>howmany</code>-tuples of numbers
+(<code>howmany = 1</code> in the basic interface).
+
+   <p>The corresponding basic and advanced routines for one-dimensional
+transforms (currently only complex DFTs) are:
+
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f1d-457"></a><a name="index-fftw_005fmpi_005flocal_005fsize_005fmany_005f1d-458"></a>
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_1d(
+                  ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags,
+                  ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                  ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+     ptrdiff_t fftw_mpi_local_size_many_1d(
+                  ptrdiff_t n0, ptrdiff_t howmany,
+                  MPI_Comm comm, int sign, unsigned flags,
+                  ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                  ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+</pre>
+   <p><a name="index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-459"></a><a name="index-FFTW_005fMPI_005fSCRAMBLED_005fIN-460"></a>As above, the return value is the number of elements to allocate
+(complex numbers, for complex DFTs).  The <code>local_ni</code> and
+<code>local_i_start</code> arguments return the portion
+(<code>local_i_start</code> to <code>local_i_start + local_ni - 1</code>) of the
+1d array that is stored on this process for the transform
+<em>input</em>, and <code>local_no</code> and <code>local_o_start</code> are the
+corresponding quantities for the input.  The <code>sign</code>
+(<code>FFTW_FORWARD</code> or <code>FFTW_BACKWARD</code>) and <code>flags</code> must
+match the arguments passed when creating a plan.  Although the inputs
+and outputs have different data distributions in general, it is
+guaranteed that the <em>output</em> data distribution of an
+<code>FFTW_FORWARD</code> plan will match the <em>input</em> data distribution
+of an <code>FFTW_BACKWARD</code> plan and vice versa; similarly for the
+<code>FFTW_MPI_SCRAMBLED_OUT</code> and <code>FFTW_MPI_SCRAMBLED_IN</code> flags. 
+See <a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a>.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Data-Distribution.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Data-Distribution.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,113 @@
+<html lang="en">
+<head>
+<title>MPI Data Distribution - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="2d-MPI-example.html#g_t2d-MPI-example" title="2d MPI example">
+<link rel="next" href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data" title="Multi-dimensional MPI DFTs of Real Data">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Data-Distribution"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.4 MPI Data Distribution</h3>
+
+<p><a name="index-data-distribution-368"></a>
+The most important concept to understand in using FFTW's MPI interface
+is the data distribution.  With a serial or multithreaded FFT, all of
+the inputs and outputs are stored as a single contiguous chunk of
+memory.  With a distributed-memory FFT, the inputs and outputs are
+broken into disjoint blocks, one per process.
+
+   <p>In particular, FFTW uses a <em>1d block distribution</em> of the data,
+distributed along the <em>first dimension</em>.  For example, if you
+want to perform a 100&nbsp;&times;&nbsp;200 complex DFT, distributed over 4
+processes, each process will get a 25&nbsp;&times;&nbsp;200 slice of the data. 
+That is, process 0 will get rows 0 through 24, process 1 will get rows
+25 through 49, process 2 will get rows 50 through 74, and process 3
+will get rows 75 through 99.  If you take the same array but
+distribute it over 3 processes, then it is not evenly divisible so the
+different processes will have unequal chunks.  FFTW's default choice
+in this case is to assign 34 rows to processes 0 and 1, and 32 rows to
+process 2. 
+<a name="index-block-distribution-369"></a>
+
+   <p>FFTW provides several &lsquo;<samp><span class="samp">fftw_mpi_local_size</span></samp>&rsquo; routines that you can
+call to find out what portion of an array is stored on the current
+process.  In most cases, you should use the default block sizes picked
+by FFTW, but it is also possible to specify your own block size.  For
+example, with a 100&nbsp;&times;&nbsp;200 array on three processes, you can
+tell FFTW to use a block size of 40, which would assign 40 rows to
+processes 0 and 1, and 20 rows to process 2.  FFTW's default is to
+divide the data equally among the processes if possible, and as best
+it can otherwise.  The rows are always assigned in &ldquo;rank order,&rdquo;
+i.e. process 0 gets the first block of rows, then process 1, and so
+on.  (You can change this by using <code>MPI_Comm_split</code> to create a
+new communicator with re-ordered processes.)  However, you should
+always call the &lsquo;<samp><span class="samp">fftw_mpi_local_size</span></samp>&rsquo; routines, if possible,
+rather than trying to predict FFTW's distribution choices.
+
+   <p>In particular, it is critical that you allocate the storage size that
+is returned by &lsquo;<samp><span class="samp">fftw_mpi_local_size</span></samp>&rsquo;, which is <em>not</em>
+necessarily the size of the local slice of the array.  The reason is
+that intermediate steps of FFTW's algorithms involve transposing the
+array and redistributing the data, so at these intermediate steps FFTW
+may require more local storage space (albeit always proportional to
+the total size divided by the number of processes).  The
+&lsquo;<samp><span class="samp">fftw_mpi_local_size</span></samp>&rsquo; functions know how much storage is required
+for these intermediate steps and tell you the correct amount to
+allocate.
+
+<ul class="menu">
+<li><a accesskey="1" href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">Basic and advanced distribution interfaces</a>
+<li><a accesskey="2" href="Load-balancing.html#Load-balancing">Load balancing</a>
+<li><a accesskey="3" href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>
+<li><a accesskey="4" href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a>
+</ul>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Files-and-Data-Types.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Files-and-Data-Types.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+<html lang="en">
+<head>
+<title>MPI Files and Data Types - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="next" href="MPI-Initialization.html#MPI-Initialization" title="MPI Initialization">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Files-and-Data-Types"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.1 MPI Files and Data Types</h4>
+
+<p>All programs using FFTW's MPI support should include its header file:
+
+<pre class="example">     #include &lt;fftw3-mpi.h&gt;
+</pre>
+   <p>Note that this header file includes the serial-FFTW <code>fftw3.h</code>
+header file, and also the <code>mpi.h</code> header file for MPI, so you
+need not include those files separately.
+
+   <p>You must also link to <em>both</em> the FFTW MPI library and to the
+serial FFTW library.  On Unix, this means adding <code>-lfftw3_mpi
+-lfftw3 -lm</code> at the end of the link command.
+
+   <p><a name="index-precision-433"></a>Different precisions are handled as in the serial interface:
+See <a href="Precision.html#Precision">Precision</a>.  That is, &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo; functions become
+<code>fftwf_</code> (in single precision) etcetera, and the libraries become
+<code>-lfftw3f_mpi -lfftw3f -lm</code> etcetera on Unix.  Long-double
+precision is supported in MPI, but quad precision (&lsquo;<samp><span class="samp">fftwq_</span></samp>&rsquo;) is
+not due to the lack of MPI support for this type.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Initialization.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Initialization.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,83 @@
+<html lang="en">
+<head>
+<title>MPI Initialization - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="MPI-Files-and-Data-Types.html#MPI-Files-and-Data-Types" title="MPI Files and Data Types">
+<link rel="next" href="Using-MPI-Plans.html#Using-MPI-Plans" title="Using MPI Plans">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Initialization"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Files-and-Data-Types.html#MPI-Files-and-Data-Types">MPI Files and Data Types</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.2 MPI Initialization</h4>
+
+<p>Before calling any other FFTW MPI (&lsquo;<samp><span class="samp">fftw_mpi_</span></samp>&rsquo;) function, and
+before importing any wisdom for MPI problems, you must call:
+
+   <p><a name="index-fftw_005fmpi_005finit-434"></a>
+<pre class="example">     void fftw_mpi_init(void);
+</pre>
+   <p><a name="index-fftw_005finit_005fthreads-435"></a>If FFTW threads support is used, however, <code>fftw_mpi_init</code> should
+be called <em>after</em> <code>fftw_init_threads</code> (see <a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">Combining MPI and Threads</a>).  Calling <code>fftw_mpi_init</code> additional times (before
+<code>fftw_mpi_cleanup</code>) has no effect.
+
+   <p>If you want to deallocate all persistent data and reset FFTW to the
+pristine state it was in when you started your program, you can call:
+
+   <p><a name="index-fftw_005fmpi_005fcleanup-436"></a>
+<pre class="example">     void fftw_mpi_cleanup(void);
+</pre>
+   <p><a name="index-fftw_005fcleanup-437"></a>(This calls <code>fftw_cleanup</code>, so you need not call the serial
+cleanup routine too, although it is safe to do so.)  After calling
+<code>fftw_mpi_cleanup</code>, all existing plans become undefined, and you
+should not attempt to execute or destroy them.  You must call
+<code>fftw_mpi_init</code> again after <code>fftw_mpi_cleanup</code> if you want
+to resume using the MPI FFTW routines.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Plan-Creation.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Plan-Creation.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,253 @@
+<html lang="en">
+<head>
+<title>MPI Plan Creation - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions" title="MPI Data Distribution Functions">
+<link rel="next" href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication" title="MPI Wisdom Communication">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Plan-Creation"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication">MPI Wisdom Communication</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.5 MPI Plan Creation</h4>
+
+<h5 class="subsubheading">Complex-data MPI DFTs</h5>
+
+<p>Plans for complex-data DFTs (see <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a>) are created by:
+
+   <p><a name="index-fftw_005fmpi_005fplan_005fdft_005f1d-461"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005f2d-462"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005f3d-463"></a><a name="index-fftw_005fmpi_005fplan_005fdft-464"></a><a name="index-fftw_005fmpi_005fplan_005fmany_005fdft-465"></a>
+<pre class="example">     fftw_plan fftw_mpi_plan_dft_1d(ptrdiff_t n0, fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                    fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                    fftw_complex *in, fftw_complex *out,
+                                    MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft(int rnk, const ptrdiff_t *n,
+                                 fftw_complex *in, fftw_complex *out,
+                                 MPI_Comm comm, int sign, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_dft(int rnk, const ptrdiff_t *n,
+                                      ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock,
+                                      fftw_complex *in, fftw_complex *out,
+                                      MPI_Comm comm, int sign, unsigned flags);
+</pre>
+   <p><a name="index-MPI-communicator-466"></a><a name="index-collective-function-467"></a>These are similar to their serial counterparts (see <a href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a>)
+in specifying the dimensions, sign, and flags of the transform.  The
+<code>comm</code> argument gives an MPI communicator that specifies the set
+of processes to participate in the transform; plan creation is a
+collective function that must be called for all processes in the
+communicator.  The <code>in</code> and <code>out</code> pointers refer only to a
+portion of the overall transform data (see <a href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>)
+as specified by the &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; functions in the previous
+section.  Unless <code>flags</code> contains <code>FFTW_ESTIMATE</code>, these
+arrays are overwritten during plan creation as for the serial
+interface.  For multi-dimensional transforms, any dimensions <code>&gt;
+1</code> are supported; for one-dimensional transforms, only composite
+(non-prime) <code>n0</code> are currently supported (unlike the serial
+FFTW).  Requesting an unsupported transform size will yield a
+<code>NULL</code> plan.  (As in the serial interface, highly composite sizes
+generally yield the best performance.)
+
+   <p><a name="index-advanced-interface-468"></a><a name="index-FFTW_005fMPI_005fDEFAULT_005fBLOCK-469"></a><a name="index-stride-470"></a>The advanced-interface <code>fftw_mpi_plan_many_dft</code> additionally
+allows you to specify the block sizes for the first dimension
+(<code>block</code>) of the n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> input data and the first dimension
+(<code>tblock</code>) of the n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub> transposed data (at intermediate
+steps of the transform, and for the output if
+<code>FFTW_TRANSPOSED_OUT</code> is specified in <code>flags</code>).  These must
+be the same block sizes as were passed to the corresponding
+&lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; function; you can pass <code>FFTW_MPI_DEFAULT_BLOCK</code>
+to use FFTW's default block size as in the basic interface.  Also, the
+<code>howmany</code> parameter specifies that the transform is of contiguous
+<code>howmany</code>-tuples rather than individual complex numbers; this
+corresponds to the same parameter in the serial advanced interface
+(see <a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">Advanced Complex DFTs</a>) with <code>stride = howmany</code> and
+<code>dist = 1</code>.
+
+<h5 class="subsubheading">MPI flags</h5>
+
+<p>The <code>flags</code> can be any of those for the serial FFTW
+(see <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>), and in addition may include one or more of
+the following MPI-specific flags, which improve performance at the
+cost of changing the output or input data formats.
+
+     <ul>
+<li><a name="index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-471"></a><a name="index-FFTW_005fMPI_005fSCRAMBLED_005fIN-472"></a><code>FFTW_MPI_SCRAMBLED_OUT</code>, <code>FFTW_MPI_SCRAMBLED_IN</code>: valid for
+1d transforms only, these flags indicate that the output/input of the
+transform are in an undocumented &ldquo;scrambled&rdquo; order.  A forward
+<code>FFTW_MPI_SCRAMBLED_OUT</code> transform can be inverted by a backward
+<code>FFTW_MPI_SCRAMBLED_IN</code> (times the usual 1/<i>N</i> normalization). 
+See <a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a>.
+
+     <li><a name="index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-473"></a><a name="index-FFTW_005fMPI_005fTRANSPOSED_005fIN-474"></a><code>FFTW_MPI_TRANSPOSED_OUT</code>, <code>FFTW_MPI_TRANSPOSED_IN</code>: valid
+for multidimensional (<code>rnk &gt; 1</code>) transforms only, these flags
+specify that the output or input of an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> transform is
+transposed to n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub>.  See <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>.
+
+   </ul>
+
+<h5 class="subsubheading">Real-data MPI DFTs</h5>
+
+<p><a name="index-r2c-475"></a>Plans for real-input/output (r2c/c2r) DFTs (see <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a>) are created by:
+
+   <p><a name="index-fftw_005fmpi_005fplan_005fdft_005fr2c_005f2d-476"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fr2c_005f2d-477"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fr2c_005f3d-478"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fr2c-479"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fc2r_005f2d-480"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fc2r_005f2d-481"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fc2r_005f3d-482"></a><a name="index-fftw_005fmpi_005fplan_005fdft_005fc2r-483"></a>
+<pre class="example">     fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                        double *in, fftw_complex *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_r2c(int rnk, const ptrdiff_t *n,
+                                     double *in, fftw_complex *out,
+                                     MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                        fftw_complex *in, double *out,
+                                        MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_dft_c2r(int rnk, const ptrdiff_t *n,
+                                     fftw_complex *in, double *out,
+                                     MPI_Comm comm, unsigned flags);
+</pre>
+   <p>Similar to the serial interface (see <a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a>), these
+transform logically n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> real data to/from n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1) complex
+data, representing the non-redundant half of the conjugate-symmetry
+output of a real-input DFT (see <a href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms">Multi-dimensional Transforms</a>). 
+However, the real array must be stored within a padded n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;[2&nbsp;(n<sub>d-1</sub>/2 + 1)]
+
+   <p>array (much like the in-place serial r2c transforms, but here for
+out-of-place transforms as well). Currently, only multi-dimensional
+(<code>rnk &gt; 1</code>) r2c/c2r transforms are supported (requesting a plan
+for <code>rnk = 1</code> will yield <code>NULL</code>).  As explained above
+(see <a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a>), the data
+distribution of both the real and complex arrays is given by the
+&lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; function called for the dimensions of the
+<em>complex</em> array.  Similar to the other planning functions, the
+input and output arrays are overwritten when the plan is created
+except in <code>FFTW_ESTIMATE</code> mode.
+
+   <p>As for the complex DFTs above, there is an advance interface that
+allows you to manually specify block sizes and to transform contiguous
+<code>howmany</code>-tuples of real/complex numbers:
+
+   <p><a name="index-fftw_005fmpi_005fplan_005fmany_005fdft_005fr2c-484"></a><a name="index-fftw_005fmpi_005fplan_005fmany_005fdft_005fc2r-485"></a>
+<pre class="example">     fftw_plan fftw_mpi_plan_many_dft_r2c
+                   (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                    ptrdiff_t iblock, ptrdiff_t oblock,
+                    double *in, fftw_complex *out,
+                    MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_dft_c2r
+                   (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                    ptrdiff_t iblock, ptrdiff_t oblock,
+                    fftw_complex *in, double *out,
+                    MPI_Comm comm, unsigned flags);
+</pre>
+   <h5 class="subsubheading">MPI r2r transforms</h5>
+
+<p><a name="index-r2r-486"></a>There are corresponding plan-creation routines for r2r
+transforms (see <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>), currently supporting
+multidimensional (<code>rnk &gt; 1</code>) transforms only (<code>rnk = 1</code> will
+yield a <code>NULL</code> plan):
+
+<pre class="example">     fftw_plan fftw_mpi_plan_r2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                                    double *in, double *out,
+                                    MPI_Comm comm,
+                                    fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                    unsigned flags);
+     fftw_plan fftw_mpi_plan_r2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                    double *in, double *out,
+                                    MPI_Comm comm,
+                                    fftw_r2r_kind kind0, fftw_r2r_kind kind1, fftw_r2r_kind kind2,
+                                    unsigned flags);
+     fftw_plan fftw_mpi_plan_r2r(int rnk, const ptrdiff_t *n,
+                                 double *in, double *out,
+                                 MPI_Comm comm, const fftw_r2r_kind *kind,
+                                 unsigned flags);
+     fftw_plan fftw_mpi_plan_many_r2r(int rnk, const ptrdiff_t *n,
+                                      ptrdiff_t iblock, ptrdiff_t oblock,
+                                      double *in, double *out,
+                                      MPI_Comm comm, const fftw_r2r_kind *kind,
+                                      unsigned flags);
+</pre>
+   <p>The parameters are much the same as for the complex DFTs above, except
+that the arrays are of real numbers (and hence the outputs of the
+&lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; data-distribution functions should be interpreted as
+counts of real rather than complex numbers).  Also, the <code>kind</code>
+parameters specify the r2r kinds along each dimension as for the
+serial interface (see <a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>).  See <a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a>.
+
+<h5 class="subsubheading">MPI transposition</h5>
+
+<p><a name="index-transpose-487"></a>
+FFTW also provides routines to plan a transpose of a distributed
+<code>n0</code> by <code>n1</code> array of real numbers, or an array of
+<code>howmany</code>-tuples of real numbers with specified block sizes
+(see <a href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>):
+
+   <p><a name="index-fftw_005fmpi_005fplan_005ftranspose-488"></a><a name="index-fftw_005fmpi_005fplan_005fmany_005ftranspose-489"></a>
+<pre class="example">     fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                       double *in, double *out,
+                                       MPI_Comm comm, unsigned flags);
+     fftw_plan fftw_mpi_plan_many_transpose
+                     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                      ptrdiff_t block0, ptrdiff_t block1,
+                      double *in, double *out, MPI_Comm comm, unsigned flags);
+</pre>
+   <p><a name="index-new_002darray-execution-490"></a><a name="index-fftw_005fmpi_005fexecute_005fr2r-491"></a>These plans are used with the <code>fftw_mpi_execute_r2r</code> new-array
+execute function (see <a href="Using-MPI-Plans.html#Using-MPI-Plans">Using MPI Plans</a>), since they count as (rank
+zero) r2r plans from FFTW's perspective.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/MPI-Wisdom-Communication.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/MPI-Wisdom-Communication.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,79 @@
+<html lang="en">
+<head>
+<title>MPI Wisdom Communication - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="MPI-Plan-Creation.html#MPI-Plan-Creation" title="MPI Plan Creation">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="MPI-Wisdom-Communication"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Plan-Creation.html#MPI-Plan-Creation">MPI Plan Creation</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.6 MPI Wisdom Communication</h4>
+
+<p>To facilitate synchronizing wisdom among the different MPI processes,
+we provide two functions:
+
+   <p><a name="index-fftw_005fmpi_005fgather_005fwisdom-492"></a><a name="index-fftw_005fmpi_005fbroadcast_005fwisdom-493"></a>
+<pre class="example">     void fftw_mpi_gather_wisdom(MPI_Comm comm);
+     void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+</pre>
+   <p>The <code>fftw_mpi_gather_wisdom</code> function gathers all wisdom in the
+given communicator <code>comm</code> to the process of rank 0 in the
+communicator: that process obtains the union of all wisdom on all the
+processes.  As a side effect, some other processes will gain
+additional wisdom from other processes, but only process 0 will gain
+the complete union.
+
+   <p>The <code>fftw_mpi_broadcast_wisdom</code> does the reverse: it exports
+wisdom from process 0 in <code>comm</code> to all other processes in the
+communicator, replacing any wisdom they currently have.
+
+   <p>See <a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">FFTW MPI Wisdom</a>.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Memory-Allocation.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Memory-Allocation.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+<html lang="en">
+<head>
+<title>Memory Allocation - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Data-Types-and-Files.html#Data-Types-and-Files" title="Data Types and Files">
+<link rel="prev" href="Precision.html#Precision" title="Precision">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Memory-Allocation"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Precision.html#Precision">Precision</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.1.3 Memory Allocation</h4>
+
+<pre class="example">     void *fftw_malloc(size_t n);
+     void fftw_free(void *p);
+</pre>
+   <p><a name="index-fftw_005fmalloc-145"></a><a name="index-fftw_005ffree-146"></a>
+These are functions that behave identically to <code>malloc</code> and
+<code>free</code>, except that they guarantee that the returned pointer obeys
+any special alignment restrictions imposed by any algorithm in FFTW
+(e.g. for SIMD acceleration).  See <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>. 
+<a name="index-alignment-147"></a>
+
+   <p>Data allocated by <code>fftw_malloc</code> <em>must</em> be deallocated by
+<code>fftw_free</code> and not by the ordinary <code>free</code>.
+
+   <p>These routines simply call through to your operating system's
+<code>malloc</code> or, if necessary, its aligned equivalent
+(e.g. <code>memalign</code>), so you normally need not worry about any
+significant time or space overhead.  You are <em>not required</em> to use
+them to allocate your data, but we strongly recommend it.
+
+   <p>Note: in C++, just as with ordinary <code>malloc</code>, you must typecast
+the output of <code>fftw_malloc</code> to whatever pointer type you are
+allocating. 
+<a name="index-C_002b_002b-148"></a>
+
+   <p>We also provide the following two convenience functions to allocate
+real and complex arrays with <code>n</code> elements, which are equivalent
+to <code>(double *) fftw_malloc(sizeof(double) * n)</code> and
+<code>(fftw_complex *) fftw_malloc(sizeof(fftw_complex) * n)</code>,
+respectively:
+
+<pre class="example">     double *fftw_alloc_real(size_t n);
+     fftw_complex *fftw_alloc_complex(size_t n);
+</pre>
+   <p><a name="index-fftw_005falloc_005freal-149"></a><a name="index-fftw_005falloc_005fcomplex-150"></a>
+The equivalent functions in other precisions allocate arrays of <code>n</code>
+elements in that precision.  e.g. <code>fftwf_alloc_real(n)</code> is
+equivalent to <code>(float *) fftwf_malloc(sizeof(float) * n)</code>. 
+<a name="index-precision-151"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/More-DFTs-of-Real-Data.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/More-DFTs-of-Real-Data.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,127 @@
+<html lang="en">
+<head>
+<title>More DFTs of Real Data - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="prev" href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data" title="Multi-Dimensional DFTs of Real Data">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="More-DFTs-of-Real-Data"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Tutorial.html#Tutorial">Tutorial</a>
+<hr>
+</div>
+
+<h3 class="section">2.5 More DFTs of Real Data</h3>
+
+<ul class="menu">
+<li><a accesskey="1" href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a>
+<li><a accesskey="2" href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a>
+<li><a accesskey="3" href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a>
+</ul>
+
+<p>FFTW supports several other transform types via a unified <dfn>r2r</dfn>
+(real-to-real) interface,
+<a name="index-r2r-65"></a>so called because it takes a real (<code>double</code>) array and outputs a
+real array of the same size.  These r2r transforms currently fall into
+three categories: DFTs of real input and complex-Hermitian output in
+halfcomplex format, DFTs of real input with even/odd symmetry
+(a.k.a. discrete cosine/sine transforms, DCTs/DSTs), and discrete
+Hartley transforms (DHTs), all described in more detail by the
+following sections.
+
+   <p>The r2r transforms follow the by now familiar interface of creating an
+<code>fftw_plan</code>, executing it with <code>fftw_execute(plan)</code>, and
+destroying it with <code>fftw_destroy_plan(plan)</code>.  Furthermore, all
+r2r transforms share the same planner interface:
+
+<pre class="example">     fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                                fftw_r2r_kind kind, unsigned flags);
+     fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                                fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                                double *in, double *out,
+                                fftw_r2r_kind kind0,
+                                fftw_r2r_kind kind1,
+                                fftw_r2r_kind kind2,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                             const fftw_r2r_kind *kind, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fr2r_005f1d-66"></a><a name="index-fftw_005fplan_005fr2r_005f2d-67"></a><a name="index-fftw_005fplan_005fr2r_005f3d-68"></a><a name="index-fftw_005fplan_005fr2r-69"></a>
+Just as for the complex DFT, these plan 1d/2d/3d/multi-dimensional
+transforms for contiguous arrays in row-major order, transforming (real)
+input to output of the same size, where <code>n</code> specifies the
+<em>physical</em> dimensions of the arrays.  All positive <code>n</code> are
+supported (with the exception of <code>n=1</code> for the <code>FFTW_REDFT00</code>
+kind, noted in the real-even subsection below); products of small
+factors are most efficient (factorizing <code>n-1</code> and <code>n+1</code> for
+<code>FFTW_REDFT00</code> and <code>FFTW_RODFT00</code> kinds, described below), but
+an <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) algorithm is used even for prime sizes.
+
+   <p>Each dimension has a <dfn>kind</dfn> parameter, of type
+<code>fftw_r2r_kind</code>, specifying the kind of r2r transform to be used
+for that dimension. 
+<a name="index-kind-_0028r2r_0029-70"></a><a name="index-fftw_005fr2r_005fkind-71"></a>(In the case of <code>fftw_plan_r2r</code>, this is an array <code>kind[rank]</code>
+where <code>kind[i]</code> is the transform kind for the dimension
+<code>n[i]</code>.)  The kind can be one of a set of predefined constants,
+defined in the following subsections.
+
+   <p>In other words, FFTW computes the separable product of the specified
+r2r transforms over each dimension, which can be used e.g. for partial
+differential equations with mixed boundary conditions.  (For some r2r
+kinds, notably the halfcomplex DFT and the DHT, such a separable
+product is somewhat problematic in more than one dimension, however,
+as is described below.)
+
+   <p>In the current version of FFTW, all r2r transforms except for the
+halfcomplex type are computed via pre- or post-processing of
+halfcomplex transforms, and they are therefore not as fast as they
+could be.  Since most other general DCT/DST codes employ a similar
+algorithm, however, FFTW's implementation should provide at least
+competitive performance.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Multi_002dDimensional-DFTs-of-Real-Data.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Multi_002dDimensional-DFTs-of-Real-Data.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,137 @@
+<html lang="en">
+<head>
+<title>Multi-Dimensional DFTs of Real Data - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="prev" href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data" title="One-Dimensional DFTs of Real Data">
+<link rel="next" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data" title="More DFTs of Real Data">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Multi-Dimensional-DFTs-of-Real-Data"></a>
+<a name="Multi_002dDimensional-DFTs-of-Real-Data"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Tutorial.html#Tutorial">Tutorial</a>
+<hr>
+</div>
+
+<h3 class="section">2.4 Multi-Dimensional DFTs of Real Data</h3>
+
+<p>Multi-dimensional DFTs of real data use the following planner routines:
+
+<pre class="example">     fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                                 double *in, fftw_complex *out,
+                                 unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005fr2c_005f2d-59"></a><a name="index-fftw_005fplan_005fdft_005fr2c_005f3d-60"></a><a name="index-fftw_005fplan_005fdft_005fr2c-61"></a>
+as well as the corresponding <code>c2r</code> routines with the input/output
+types swapped.  These routines work similarly to their complex
+analogues, except for the fact that here the complex output array is cut
+roughly in half and the real array requires padding for in-place
+transforms (as in 1d, above).
+
+   <p>As before, <code>n</code> is the logical size of the array, and the
+consequences of this on the the format of the complex arrays deserve
+careful attention. 
+<a name="index-r2c_002fc2r-multi_002ddimensional-array-format-62"></a>Suppose that the real data has dimensions n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> (in row-major order). 
+Then, after an r2c transform, the output is an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1) array of
+<code>fftw_complex</code> values in row-major order, corresponding to slightly
+over half of the output of the corresponding complex DFT.  (The division
+is rounded down.)  The ordering of the data is otherwise exactly the
+same as in the complex-DFT case.
+
+   <p>For out-of-place transforms, this is the end of the story: the real
+data is stored as a row-major array of size n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> and the complex
+data is stored as a row-major array of size n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1).
+
+   <p>For in-place transforms, however, extra padding of the real-data array
+is necessary because the complex array is larger than the real array,
+and the two arrays share the same memory locations.  Thus, for
+in-place transforms, the final dimension of the real-data array must
+be padded with extra values to accommodate the size of the complex
+data&mdash;two values if the last dimension is even and one if it is odd. 
+<a name="index-padding-63"></a>That is, the last dimension of the real data must physically contain
+2 * (n<sub>d-1</sub>/2+1)<code>double</code> values (exactly enough to hold the complex data). 
+This physical array size does not, however, change the <em>logical</em>
+array size&mdash;only
+n<sub>d-1</sub>values are actually stored in the last dimension, and
+n<sub>d-1</sub>is the last dimension passed to the plan-creation routine.
+
+   <p>For example, consider the transform of a two-dimensional real array of
+size <code>n0</code> by <code>n1</code>.  The output of the r2c transform is a
+two-dimensional complex array of size <code>n0</code> by <code>n1/2+1</code>, where
+the <code>y</code> dimension has been cut nearly in half because of
+redundancies in the output.  Because <code>fftw_complex</code> is twice the
+size of <code>double</code>, the output array is slightly bigger than the
+input array.  Thus, if we want to compute the transform in place, we
+must <em>pad</em> the input array so that it is of size <code>n0</code> by
+<code>2*(n1/2+1)</code>.  If <code>n1</code> is even, then there are two padding
+elements at the end of each row (which need not be initialized, as they
+are only used for output).
+
+   <p>The following illustration depicts the input and output arrays just
+described, for both the out-of-place and in-place transforms (with the
+arrows indicating consecutive memory locations):
+<img src="rfftwnd-for-html.png" alt="rfftwnd-for-html.png">
+
+   <p>These transforms are unnormalized, so an r2c followed by a c2r
+transform (or vice versa) will result in the original data scaled by
+the number of real data elements&mdash;that is, the product of the
+(logical) dimensions of the real data. 
+<a name="index-normalization-64"></a>
+
+   <p>(Because the last dimension is treated specially, if it is equal to
+<code>1</code> the transform is <em>not</em> equivalent to a lower-dimensional
+r2c/c2r transform.  In that case, the last complex dimension also has
+size <code>1</code> (<code>=1/2+1</code>), and no advantage is gained over the
+complex transforms.)
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Multi_002ddimensional-Array-Format.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Multi_002ddimensional-Array-Format.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+<html lang="en">
+<head>
+<title>Multi-dimensional Array Format - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="prev" href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc" title="SIMD alignment and fftw_malloc">
+<link rel="next" href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans" title="Words of Wisdom-Saving Plans">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Multi-dimensional-Array-Format"></a>
+<a name="Multi_002ddimensional-Array-Format"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>
+<hr>
+</div>
+
+<h3 class="section">3.2 Multi-dimensional Array Format</h3>
+
+<p>This section describes the format in which multi-dimensional arrays
+are stored in FFTW.  We felt that a detailed discussion of this topic
+was necessary.  Since several different formats are common, this topic
+is often a source of confusion.
+
+<ul class="menu">
+<li><a accesskey="1" href="Row_002dmajor-Format.html#Row_002dmajor-Format">Row-major Format</a>
+<li><a accesskey="2" href="Column_002dmajor-Format.html#Column_002dmajor-Format">Column-major Format</a>
+<li><a accesskey="3" href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C">Fixed-size Arrays in C</a>
+<li><a accesskey="4" href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">Dynamic Arrays in C</a>
+<li><a accesskey="5" href="Dynamic-Arrays-in-C_002dThe-Wrong-Way.html#Dynamic-Arrays-in-C_002dThe-Wrong-Way">Dynamic Arrays in C-The Wrong Way</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Multi_002ddimensional-MPI-DFTs-of-Real-Data.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Multi_002ddimensional-MPI-DFTs-of-Real-Data.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,158 @@
+<html lang="en">
+<head>
+<title>Multi-dimensional MPI DFTs of Real Data - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="next" href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms" title="Other Multi-dimensional Real-data MPI Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Multi-dimensional-MPI-DFTs-of-Real-Data"></a>
+<a name="Multi_002ddimensional-MPI-DFTs-of-Real-Data"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">Other Multi-dimensional Real-data MPI Transforms</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.5 Multi-dimensional MPI DFTs of Real Data</h3>
+
+<p>FFTW's MPI interface also supports multi-dimensional DFTs of real
+data, similar to the serial r2c and c2r interfaces.  (Parallel
+one-dimensional real-data DFTs are not currently supported; you must
+use a complex transform and set the imaginary parts of the inputs to
+zero.)
+
+   <p>The key points to understand for r2c and c2r MPI transforms (compared
+to the MPI complex DFTs or the serial r2c/c2r transforms), are:
+
+     <ul>
+<li>Just as for serial transforms, r2c/c2r DFTs transform n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> real
+data to/from n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1) complex data: the last dimension of the
+complex data is cut in half (rounded down), plus one.  As for the
+serial transforms, the sizes you pass to the &lsquo;<samp><span class="samp">plan_dft_r2c</span></samp>&rsquo; and
+&lsquo;<samp><span class="samp">plan_dft_c2r</span></samp>&rsquo; are the n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> dimensions of the real data.
+
+     <li><a name="index-padding-386"></a>Although the real data is <em>conceptually</em> n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub>, it is
+<em>physically</em> stored as an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;[2&nbsp;(n<sub>d-1</sub>/2 + 1)] array, where the last
+dimension has been <em>padded</em> to make it the same size as the
+complex output.  This is much like the in-place serial r2c/c2r
+interface (see <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>), except that
+in MPI the padding is required even for out-of-place data.  The extra
+padding numbers are ignored by FFTW (they are <em>not</em> like
+zero-padding the transform to a larger size); they are only used to
+determine the data layout.
+
+     <li><a name="index-data-distribution-387"></a>The data distribution in MPI for <em>both</em> the real and complex data
+is determined by the shape of the <em>complex</em> data.  That is, you
+call the appropriate &lsquo;<samp><span class="samp">local size</span></samp>&rsquo; function for the n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1)
+
+     <p>complex data, and then use the <em>same</em> distribution for the real
+data except that the last complex dimension is replaced by a (padded)
+real dimension of twice the length.
+
+   </ul>
+
+   <p>For example suppose we are performing an out-of-place r2c transform of
+L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N real data [padded to L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;2(N/2+1)],
+resulting in L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N/2+1 complex data.  Similar to the
+example in <a href="2d-MPI-example.html#g_t2d-MPI-example">2d MPI example</a>, we might do something like:
+
+<pre class="example">     #include &lt;fftw3-mpi.h&gt;
+     
+     int main(int argc, char **argv)
+     {
+         const ptrdiff_t L = ..., M = ..., N = ...;
+         fftw_plan plan;
+         double *rin;
+         fftw_complex *cout;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j, k;
+     
+         MPI_Init(&amp;argc, &amp;argv);
+         fftw_mpi_init();
+     
+         /* <span class="roman">get local data size and allocate</span> */
+         alloc_local = fftw_mpi_local_size_3d(L, M, N/2+1, MPI_COMM_WORLD,
+                                              &amp;local_n0, &amp;local_0_start);
+         rin = fftw_alloc_real(2 * alloc_local);
+         cout = fftw_alloc_complex(alloc_local);
+     
+         /* <span class="roman">create plan for out-of-place r2c DFT</span> */
+         plan = fftw_mpi_plan_dft_r2c_3d(L, M, N, rin, cout, MPI_COMM_WORLD,
+                                         FFTW_MEASURE);
+     
+         /* <span class="roman">initialize rin to some function</span> my_func(x,y,z) */
+         for (i = 0; i &lt; local_n0; ++i)
+            for (j = 0; j &lt; M; ++j)
+              for (k = 0; k &lt; N; ++k)
+            rin[(i*M + j) * (2*(N/2+1)) + k] = my_func(local_0_start+i, j, k);
+     
+         /* <span class="roman">compute transforms as many times as desired</span> */
+         fftw_execute(plan);
+     
+         fftw_destroy_plan(plan);
+     
+         MPI_Finalize();
+     }
+</pre>
+   <p><a name="index-fftw_005falloc_005freal-388"></a><a name="index-row_002dmajor-389"></a>Note that we allocated <code>rin</code> using <code>fftw_alloc_real</code> with an
+argument of <code>2 * alloc_local</code>: since <code>alloc_local</code> is the
+number of <em>complex</em> values to allocate, the number of <em>real</em>
+values is twice as many.  The <code>rin</code> array is then
+local_n0&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;2(N/2+1) in row-major order, so its
+<code>(i,j,k)</code> element is at the index <code>(i*M + j) * (2*(N/2+1)) +
+k</code> (see <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>).
+
+   <p><a name="index-transpose-390"></a><a name="index-FFTW_005fTRANSPOSED_005fOUT-391"></a><a name="index-FFTW_005fTRANSPOSED_005fIN-392"></a>As for the complex transforms, improved performance can be obtained by
+specifying that the output is the transpose of the input or vice versa
+(see <a href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>).  In our L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N r2c
+example, including <code>FFTW_TRANSPOSED_OUT</code> in the flags means that
+the input would be a padded L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;2(N/2+1) real array
+distributed over the <code>L</code> dimension, while the output would be a
+M&nbsp;&times;&nbsp;L&nbsp;&times;&nbsp;N/2+1 complex array distributed over the <code>M</code>
+dimension.  To perform the inverse c2r transform with the same data
+distributions, you would use the <code>FFTW_TRANSPOSED_IN</code> flag.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Multi_002ddimensional-Transforms.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Multi_002ddimensional-Transforms.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,100 @@
+<html lang="en">
+<head>
+<title>Multi-dimensional Transforms - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029" title="1d Discrete Hartley Transforms (DHTs)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Multi-dimensional-Transforms"></a>
+<a name="Multi_002ddimensional-Transforms"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.6 Multi-dimensional Transforms</h4>
+
+<p>The multi-dimensional transforms of FFTW, in general, compute simply the
+separable product of the given 1d transform along each dimension of the
+array.  Since each of these transforms is unnormalized, computing the
+forward followed by the backward/inverse multi-dimensional transform
+will result in the original array scaled by the product of the
+normalization factors for each dimension (e.g. the product of the
+dimension sizes, for a multi-dimensional DFT).
+
+   <p><a name="index-r2c-325"></a>The definition of FFTW's multi-dimensional DFT of real data (r2c)
+deserves special attention.  In this case, we logically compute the full
+multi-dimensional DFT of the input data; since the input data are purely
+real, the output data have the Hermitian symmetry and therefore only one
+non-redundant half need be stored.  More specifically, for an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> multi-dimensional real-input DFT, the full (logical) complex output array
+<i>Y</i>[<i>k</i><sub>0</sub>, <i>k</i><sub>1</sub>, ...,
+<i>k</i><sub><i>d-1</i></sub>]has the symmetry:
+<i>Y</i>[<i>k</i><sub>0</sub>, <i>k</i><sub>1</sub>, ...,
+<i>k</i><sub><i>d-1</i></sub>] = <i>Y</i>[<i>n</i><sub>0</sub> -
+<i>k</i><sub>0</sub>, <i>n</i><sub>1</sub> - <i>k</i><sub>1</sub>, ...,
+<i>n</i><sub><i>d-1</i></sub> - <i>k</i><sub><i>d-1</i></sub>]<sup>*</sup>(where each dimension is periodic).  Because of this symmetry, we only
+store the
+<i>k</i><sub><i>d-1</i></sub> = 0...<i>n</i><sub><i>d-1</i></sub>/2+1elements of the <em>last</em> dimension (division by 2 is rounded
+down).  (We could instead have cut any other dimension in half, but the
+last dimension proved computationally convenient.)  This results in the
+peculiar array format described in more detail by <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>.
+
+   <p>The multi-dimensional c2r transform is simply the unnormalized inverse
+of the r2c transform.  i.e. it is the same as FFTW's complex backward
+multi-dimensional DFT, operating on a Hermitian input array in the
+peculiar format mentioned above and outputting a real array (since the
+DFT output is purely real).
+
+   <p>We should remind the user that the separable product of 1d transforms
+along each dimension, as computed by FFTW, is not always the same thing
+as the usual multi-dimensional transform.  A multi-dimensional
+<code>R2HC</code> (or <code>HC2R</code>) transform is not identical to the
+multi-dimensional DFT, requiring some post-processing to combine the
+requisite real and imaginary parts, as was described in <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a>.  Likewise, FFTW's multidimensional
+<code>FFTW_DHT</code> r2r transform is not the same thing as the logical
+multi-dimensional discrete Hartley transform defined in the literature,
+as discussed in <a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a>.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Multi_002dthreaded-FFTW.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Multi_002dthreaded-FFTW.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+<html lang="en">
+<head>
+<title>Multi-threaded FFTW - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="next" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Multi-threaded-FFTW"></a>
+<a name="Multi_002dthreaded-FFTW"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">5 Multi-threaded FFTW</h2>
+
+<p><a name="index-parallel-transform-326"></a>In this chapter we document the parallel FFTW routines for
+shared-memory parallel hardware.  These routines, which support
+parallel one- and multi-dimensional transforms of both real and
+complex data, are the easiest way to take advantage of multiple
+processors with FFTW.  They work just like the corresponding
+uniprocessor transform routines, except that you have an extra
+initialization routine to call, and there is a routine to set the
+number of threads to employ.  Any program that uses the uniprocessor
+FFTW can therefore be trivially modified to use the multi-threaded
+FFTW.
+
+   <p>A shared-memory machine is one in which all CPUs can directly access
+the same main memory, and such machines are now common due to the
+ubiquity of multi-core CPUs.  FFTW's multi-threading support allows
+you to utilize these additional CPUs transparently from a single
+program.  However, this does not necessarily translate into
+performance gains&mdash;when multiple threads/CPUs are employed, there is
+an overhead required for synchronization that may outweigh the
+computatational parallelism.  Therefore, you can only benefit from
+threads if your problem is sufficiently large. 
+<a name="index-shared_002dmemory-327"></a><a name="index-threads-328"></a>
+
+<ul class="menu">
+<li><a accesskey="1" href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">Installation and Supported Hardware/Software</a>
+<li><a accesskey="2" href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">Usage of Multi-threaded FFTW</a>
+<li><a accesskey="3" href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">How Many Threads to Use?</a>
+<li><a accesskey="4" href="Thread-safety.html#Thread-safety">Thread safety</a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/New_002darray-Execute-Functions.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/New_002darray-Execute-Functions.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,152 @@
+<html lang="en">
+<head>
+<title>New-array Execute Functions - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Guru-Interface.html#Guru-Interface" title="Guru Interface">
+<link rel="next" href="Wisdom.html#Wisdom" title="Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="New-array-Execute-Functions"></a>
+<a name="New_002darray-Execute-Functions"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom.html#Wisdom">Wisdom</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Guru-Interface.html#Guru-Interface">Guru Interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.6 New-array Execute Functions</h3>
+
+<p><a name="index-execute-264"></a><a name="index-new_002darray-execution-265"></a>
+Normally, one executes a plan for the arrays with which the plan was
+created, by calling <code>fftw_execute(plan)</code> as described in <a href="Using-Plans.html#Using-Plans">Using Plans</a>. 
+<a name="index-fftw_005fexecute-266"></a>However, it is possible for sophisticated users to apply a given plan
+to a <em>different</em> array using the &ldquo;new-array execute&rdquo; functions
+detailed below, provided that the following conditions are met:
+
+     <ul>
+<li>The array size, strides, etcetera are the same (since those are set by
+the plan).
+
+     <li>The input and output arrays are the same (in-place) or different
+(out-of-place) if the plan was originally created to be in-place or
+out-of-place, respectively.
+
+     <li>For split arrays, the separations between the real and imaginary
+parts, <code>ii-ri</code> and <code>io-ro</code>, are the same as they were for
+the input and output arrays when the plan was created.  (This
+condition is automatically satisfied for interleaved arrays.)
+
+     <li>The <dfn>alignment</dfn> of the new input/output arrays is the same as that
+of the input/output arrays when the plan was created, unless the plan
+was created with the <code>FFTW_UNALIGNED</code> flag. 
+<a name="index-FFTW_005fUNALIGNED-267"></a>Here, the alignment is a platform-dependent quantity (for example, it is
+the address modulo 16 if SSE SIMD instructions are used, but the address
+modulo 4 for non-SIMD single-precision FFTW on the same machine).  In
+general, only arrays allocated with <code>fftw_malloc</code> are guaranteed to
+be equally aligned (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>).
+
+   </ul>
+
+   <p><a name="index-alignment-268"></a>The alignment issue is especially critical, because if you don't use
+<code>fftw_malloc</code> then you may have little control over the alignment
+of arrays in memory.  For example, neither the C++ <code>new</code> function
+nor the Fortran <code>allocate</code> statement provide strong enough
+guarantees about data alignment.  If you don't use <code>fftw_malloc</code>,
+therefore, you probably have to use <code>FFTW_UNALIGNED</code> (which
+disables most SIMD support).  If possible, it is probably better for
+you to simply create multiple plans (creating a new plan is quick once
+one exists for a given size), or better yet re-use the same array for
+your transforms.
+
+   <p>If you are tempted to use the new-array execute interface because you
+want to transform a known bunch of arrays of the same size, you should
+probably go use the advanced interface instead (see <a href="Advanced-Interface.html#Advanced-Interface">Advanced Interface</a>)).
+
+   <p>The new-array execute functions are:
+
+<pre class="example">     void fftw_execute_dft(
+          const fftw_plan p,
+          fftw_complex *in, fftw_complex *out);
+     
+     void fftw_execute_split_dft(
+          const fftw_plan p,
+          double *ri, double *ii, double *ro, double *io);
+     
+     void fftw_execute_dft_r2c(
+          const fftw_plan p,
+          double *in, fftw_complex *out);
+     
+     void fftw_execute_split_dft_r2c(
+          const fftw_plan p,
+          double *in, double *ro, double *io);
+     
+     void fftw_execute_dft_c2r(
+          const fftw_plan p,
+          fftw_complex *in, double *out);
+     
+     void fftw_execute_split_dft_c2r(
+          const fftw_plan p,
+          double *ri, double *ii, double *out);
+     
+     void fftw_execute_r2r(
+          const fftw_plan p,
+          double *in, double *out);
+</pre>
+   <p><a name="index-fftw_005fexecute_005fdft-269"></a><a name="index-fftw_005fexecute_005fsplit_005fdft-270"></a><a name="index-fftw_005fexecute_005fdft_005fr2c-271"></a><a name="index-fftw_005fexecute_005fsplit_005fdft_005fr2c-272"></a><a name="index-fftw_005fexecute_005fdft_005fc2r-273"></a><a name="index-fftw_005fexecute_005fsplit_005fdft_005fc2r-274"></a><a name="index-fftw_005fexecute_005fr2r-275"></a>
+These execute the <code>plan</code> to compute the corresponding transform on
+the input/output arrays specified by the subsequent arguments.  The
+input/output array arguments have the same meanings as the ones passed
+to the guru planner routines in the preceding sections.  The <code>plan</code>
+is not modified, and these routines can be called as many times as
+desired, or intermixed with calls to the ordinary <code>fftw_execute</code>.
+
+   <p>The <code>plan</code> <em>must</em> have been created for the transform type
+corresponding to the execute function, e.g. it must be a complex-DFT
+plan for <code>fftw_execute_dft</code>.  Any of the planner routines for that
+transform type, from the basic to the guru interface, could have been
+used to create the plan, however.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/One_002dDimensional-DFTs-of-Real-Data.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/One_002dDimensional-DFTs-of-Real-Data.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,142 @@
+<html lang="en">
+<head>
+<title>One-Dimensional DFTs of Real Data - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="prev" href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs" title="Complex Multi-Dimensional DFTs">
+<link rel="next" href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data" title="Multi-Dimensional DFTs of Real Data">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="One-Dimensional-DFTs-of-Real-Data"></a>
+<a name="One_002dDimensional-DFTs-of-Real-Data"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Tutorial.html#Tutorial">Tutorial</a>
+<hr>
+</div>
+
+<h3 class="section">2.3 One-Dimensional DFTs of Real Data</h3>
+
+<p>In many practical applications, the input data <code>in[i]</code> are purely
+real numbers, in which case the DFT output satisfies the &ldquo;Hermitian&rdquo;
+<a name="index-Hermitian-46"></a>redundancy: <code>out[i]</code> is the conjugate of <code>out[n-i]</code>.  It is
+possible to take advantage of these circumstances in order to achieve
+roughly a factor of two improvement in both speed and memory usage.
+
+   <p>In exchange for these speed and space advantages, the user sacrifices
+some of the simplicity of FFTW's complex transforms. First of all, the
+input and output arrays are of <em>different sizes and types</em>: the
+input is <code>n</code> real numbers, while the output is <code>n/2+1</code>
+complex numbers (the non-redundant outputs); this also requires slight
+&ldquo;padding&rdquo; of the input array for
+<a name="index-padding-47"></a>in-place transforms.  Second, the inverse transform (complex to real)
+has the side-effect of <em>overwriting its input array</em>, by default. 
+Neither of these inconveniences should pose a serious problem for
+users, but it is important to be aware of them.
+
+   <p>The routines to perform real-data transforms are almost the same as
+those for complex transforms: you allocate arrays of <code>double</code>
+and/or <code>fftw_complex</code> (preferably using <code>fftw_malloc</code> or
+<code>fftw_alloc_complex</code>), create an <code>fftw_plan</code>, execute it as
+many times as you want with <code>fftw_execute(plan)</code>, and clean up
+with <code>fftw_destroy_plan(plan)</code> (and <code>fftw_free</code>).  The only
+differences are that the input (or output) is of type <code>double</code>
+and there are new routines to create the plan.  In one dimension:
+
+<pre class="example">     fftw_plan fftw_plan_dft_r2c_1d(int n, double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_1d(int n, fftw_complex *in, double *out,
+                                    unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005fr2c_005f1d-48"></a><a name="index-fftw_005fplan_005fdft_005fc2r_005f1d-49"></a>
+for the real input to complex-Hermitian output (<dfn>r2c</dfn>) and
+complex-Hermitian input to real output (<dfn>c2r</dfn>) transforms. 
+<a name="index-r2c-50"></a><a name="index-c2r-51"></a>Unlike the complex DFT planner, there is no <code>sign</code> argument. 
+Instead, r2c DFTs are always <code>FFTW_FORWARD</code> and c2r DFTs are
+always <code>FFTW_BACKWARD</code>. 
+<a name="index-FFTW_005fFORWARD-52"></a><a name="index-FFTW_005fBACKWARD-53"></a>(For single/long-double precision
+<code>fftwf</code> and <code>fftwl</code>, <code>double</code> should be replaced by
+<code>float</code> and <code>long double</code>, respectively.) 
+<a name="index-precision-54"></a>
+
+   <p>Here, <code>n</code> is the &ldquo;logical&rdquo; size of the DFT, not necessarily the
+physical size of the array.  In particular, the real (<code>double</code>)
+array has <code>n</code> elements, while the complex (<code>fftw_complex</code>)
+array has <code>n/2+1</code> elements (where the division is rounded down). 
+For an in-place transform,
+<a name="index-in_002dplace-55"></a><code>in</code> and <code>out</code> are aliased to the same array, which must be
+big enough to hold both; so, the real array would actually have
+<code>2*(n/2+1)</code> elements, where the elements beyond the first
+<code>n</code> are unused padding.  (Note that this is very different from
+the concept of &ldquo;zero-padding&rdquo; a transform to a larger length, which
+changes the logical size of the DFT by actually adding new input
+data.)  The kth element of the complex array is exactly the
+same as the kth element of the corresponding complex DFT.  All
+positive <code>n</code> are supported; products of small factors are most
+efficient, but an <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) algorithm is used even for prime sizes.
+
+   <p>As noted above, the c2r transform destroys its input array even for
+out-of-place transforms.  This can be prevented, if necessary, by
+including <code>FFTW_PRESERVE_INPUT</code> in the <code>flags</code>, with
+unfortunately some sacrifice in performance. 
+<a name="index-flags-56"></a><a name="index-FFTW_005fPRESERVE_005fINPUT-57"></a>This flag is also not currently supported for multi-dimensional real
+DFTs (next section).
+
+   <p>Readers familiar with DFTs of real data will recall that the 0th (the
+&ldquo;DC&rdquo;) and <code>n/2</code>-th (the &ldquo;Nyquist&rdquo; frequency, when <code>n</code> is
+even) elements of the complex output are purely real.  Some
+implementations therefore store the Nyquist element where the DC
+imaginary part would go, in order to make the input and output arrays
+the same size.  Such packing, however, does not generalize well to
+multi-dimensional transforms, and the space savings are miniscule in
+any case; FFTW does not support it.
+
+   <p>An alternative interface for one-dimensional r2c and c2r DFTs can be
+found in the &lsquo;<samp><span class="samp">r2r</span></samp>&rsquo; interface (see <a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a>), with &ldquo;halfcomplex&rdquo;-format output that <em>is</em> the same size
+(and type) as the input array. 
+<a name="index-halfcomplex-format-58"></a>That interface, although it is not very useful for multi-dimensional
+transforms, may sometimes yield better performance.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/One_002ddimensional-distributions.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/One_002ddimensional-distributions.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,106 @@
+<html lang="en">
+<head>
+<title>One-dimensional distributions - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="prev" href="Transposed-distributions.html#Transposed-distributions" title="Transposed distributions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="One-dimensional-distributions"></a>
+<a name="One_002ddimensional-distributions"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Transposed-distributions.html#Transposed-distributions">Transposed distributions</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.4.4 One-dimensional distributions</h4>
+
+<p>For one-dimensional distributed DFTs using FFTW, matters are slightly
+more complicated because the data distribution is more closely tied to
+how the algorithm works.  In particular, you can no longer pass an
+arbitrary block size and must accept FFTW's default; also, the block
+sizes may be different for input and output.  Also, the data
+distribution depends on the flags and transform direction, in order
+for forward and backward transforms to work correctly.
+
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_1d(ptrdiff_t n0, MPI_Comm comm,
+                     int sign, unsigned flags,
+                     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                     ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+</pre>
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f1d-383"></a>
+This function computes the data distribution for a 1d transform of
+size <code>n0</code> with the given transform <code>sign</code> and <code>flags</code>. 
+Both input and output data use block distributions.  The input on the
+current process will consist of <code>local_ni</code> numbers starting at
+index <code>local_i_start</code>; e.g. if only a single process is used,
+then <code>local_ni</code> will be <code>n0</code> and <code>local_i_start</code> will
+be <code>0</code>.  Similarly for the output, with <code>local_no</code> numbers
+starting at index <code>local_o_start</code>.  The return value of
+<code>fftw_mpi_local_size_1d</code> will be the total number of elements to
+allocate on the current process (which might be slightly larger than
+the local size due to intermediate steps in the algorithm).
+
+   <p>As mentioned above (see <a href="Load-balancing.html#Load-balancing">Load balancing</a>), the data will be divided
+equally among the processes if <code>n0</code> is divisible by the
+<em>square</em> of the number of processes.  In this case,
+<code>local_ni</code> will equal <code>local_no</code>.  Otherwise, they may be
+different.
+
+   <p>For some applications, such as convolutions, the order of the output
+data is irrelevant.  In this case, performance can be improved by
+specifying that the output data be stored in an FFTW-defined
+&ldquo;scrambled&rdquo; format.  (In particular, this is the analogue of
+transposed output in the multidimensional case: scrambled output saves
+a communications step.)  If you pass <code>FFTW_MPI_SCRAMBLED_OUT</code> in
+the flags, then the output is stored in this (undocumented) scrambled
+order.  Conversely, to perform the inverse transform of data in
+scrambled order, pass the <code>FFTW_MPI_SCRAMBLED_IN</code> flag. 
+<a name="index-FFTW_005fMPI_005fSCRAMBLED_005fOUT-384"></a><a name="index-FFTW_005fMPI_005fSCRAMBLED_005fIN-385"></a>
+
+   <p>In MPI FFTW, only composite sizes <code>n0</code> can be parallelized; we
+have not yet implemented a parallel algorithm for large prime sizes.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Other-Important-Topics.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Other-Important-Topics.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,67 @@
+<html lang="en">
+<head>
+<title>Other Important Topics - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Tutorial.html#Tutorial" title="Tutorial">
+<link rel="next" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Other-Important-Topics"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Tutorial.html#Tutorial">Tutorial</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">3 Other Important Topics</h2>
+
+<ul class="menu">
+<li><a accesskey="1" href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>
+<li><a accesskey="2" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<li><a accesskey="3" href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>
+<li><a accesskey="4" href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a>
+</ul>
+
+<!--  -->
+</body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+<html lang="en">
+<head>
+<title>Other Multi-dimensional Real-data MPI Transforms - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI" title="Distributed-memory FFTW with MPI">
+<link rel="prev" href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data" title="Multi-dimensional MPI DFTs of Real Data">
+<link rel="next" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes" title="FFTW MPI Transposes">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Other-Multi-dimensional-Real-data-MPI-Transforms"></a>
+<a name="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">Multi-dimensional MPI DFTs of Real Data</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<hr>
+</div>
+
+<h3 class="section">6.6 Other multi-dimensional Real-Data MPI Transforms</h3>
+
+<p><a name="index-r2r-393"></a>FFTW's MPI interface also supports multi-dimensional &lsquo;<samp><span class="samp">r2r</span></samp>&rsquo;
+transforms of all kinds supported by the serial interface
+(e.g. discrete cosine and sine transforms, discrete Hartley
+transforms, etc.).  Only multi-dimensional &lsquo;<samp><span class="samp">r2r</span></samp>&rsquo; transforms, not
+one-dimensional transforms, are currently parallelized.
+
+   <p><a name="index-fftw_005fr2r_005fkind-394"></a>These are used much like the multidimensional complex DFTs discussed
+above, except that the data is real rather than complex, and one needs
+to pass an r2r transform kind (<code>fftw_r2r_kind</code>) for each
+dimension as in the serial FFTW (see <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>).
+
+   <p>For example, one might perform a two-dimensional L&nbsp;&times;&nbsp;M that is
+an REDFT10 (DCT-II) in the first dimension and an RODFT10 (DST-II) in
+the second dimension with code like:
+
+<pre class="example">         const ptrdiff_t L = ..., M = ...;
+         fftw_plan plan;
+         double *data;
+         ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+     
+         /* <span class="roman">get local data size and allocate</span> */
+         alloc_local = fftw_mpi_local_size_2d(L, M, MPI_COMM_WORLD,
+                                              &amp;local_n0, &amp;local_0_start);
+         data = fftw_alloc_real(alloc_local);
+     
+         /* <span class="roman">create plan for in-place REDFT10 x RODFT10</span> */
+         plan = fftw_mpi_plan_r2r_2d(L, M, data, data, MPI_COMM_WORLD,
+                                     FFTW_REDFT10, FFTW_RODFT10, FFTW_MEASURE);
+     
+         /* <span class="roman">initialize data to some function</span> my_function(x,y) */
+         for (i = 0; i &lt; local_n0; ++i) for (j = 0; j &lt; M; ++j)
+            data[i*M + j] = my_function(local_0_start + i, j);
+     
+         /* <span class="roman">compute transforms, in-place, as many times as desired</span> */
+         fftw_execute(plan);
+     
+         fftw_destroy_plan(plan);
+</pre>
+   <p><a name="index-fftw_005falloc_005freal-395"></a>Notice that we use the same &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; functions as we did for
+complex data, only now we interpret the sizes in terms of real rather
+than complex values, and correspondingly use <code>fftw_alloc_real</code>.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Overview-of-Fortran-interface.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Overview-of-Fortran-interface.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,128 @@
+<html lang="en">
+<head>
+<title>Overview of Fortran interface - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="next" href="Reversing-array-dimensions.html#Reversing-array-dimensions" title="Reversing array dimensions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Overview-of-Fortran-interface"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.1 Overview of Fortran interface</h3>
+
+<p>FFTW provides a file <code>fftw3.f03</code> that defines Fortran 2003
+interfaces for all of its C routines, except for the MPI routines
+described elsewhere, which can be found in the same directory as
+<code>fftw3.h</code> (the C header file).  In any Fortran subroutine where
+you want to use FFTW functions, you should begin with:
+
+   <p><a name="index-iso_005fc_005fbinding-502"></a>
+<pre class="example">       use, intrinsic :: iso_c_binding
+       include 'fftw3.f03'
+</pre>
+   <p>This includes the interface definitions and the standard
+<code>iso_c_binding</code> module (which defines the equivalents of C
+types).  You can also put the FFTW functions into a module if you
+prefer (see <a href="Defining-an-FFTW-module.html#Defining-an-FFTW-module">Defining an FFTW module</a>).
+
+   <p>At this point, you can now call anything in the FFTW C interface
+directly, almost exactly as in C other than minor changes in syntax. 
+For example:
+
+   <p><a name="index-fftw_005fplan_005fdft_005f2d-503"></a><a name="index-fftw_005fexecute_005fdft-504"></a><a name="index-fftw_005fdestroy_005fplan-505"></a>
+<pre class="example">       type(C_PTR) :: plan
+       complex(C_DOUBLE_COMPLEX), dimension(1024,1000) :: in, out
+       plan = fftw_plan_dft_2d(1000,1024, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft(plan, in, out)
+       ...
+       call fftw_destroy_plan(plan)
+</pre>
+   <p>A few important things to keep in mind are:
+
+     <ul>
+<li><a name="index-fftw_005fcomplex-506"></a><a name="index-C_005fPTR-507"></a><a name="index-C_005fINT-508"></a><a name="index-C_005fDOUBLE-509"></a><a name="index-C_005fDOUBLE_005fCOMPLEX-510"></a>FFTW plans are <code>type(C_PTR)</code>.  Other C types are mapped in the
+obvious way via the <code>iso_c_binding</code> standard: <code>int</code> turns
+into <code>integer(C_INT)</code>, <code>fftw_complex</code> turns into
+<code>complex(C_DOUBLE_COMPLEX)</code>, <code>double</code> turns into
+<code>real(C_DOUBLE)</code>, and so on. See <a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a>.
+
+     <li>Functions in C become functions in Fortran if they have a return value,
+and subroutines in Fortran otherwise.
+
+     <li>The ordering of the Fortran array dimensions must be <em>reversed</em>
+when they are passed to the FFTW plan creation, thanks to differences
+in array indexing conventions (see <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>).  This is <em>unlike</em> the legacy Fortran interface
+(see <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a>), which reversed the dimensions
+for you.  See <a href="Reversing-array-dimensions.html#Reversing-array-dimensions">Reversing array dimensions</a>.
+
+     <li><a name="index-alignment-511"></a><a name="index-SIMD-512"></a>Using ordinary Fortran array declarations like this works, but may
+yield suboptimal performance because the data may not be not aligned
+to exploit SIMD instructions on modern proessors (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>). Better performance will often be obtained
+by allocating with &lsquo;<samp><span class="samp">fftw_alloc</span></samp>&rsquo;. See <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>.
+
+     <li><a name="index-fftw_005fexecute-513"></a>Similar to the legacy Fortran interface (see <a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">FFTW Execution in Fortran</a>), we currently recommend <em>not</em> using <code>fftw_execute</code>
+but rather using the more specialized functions like
+<code>fftw_execute_dft</code> (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>). 
+However, you should execute the plan on the <code>same arrays</code> as the
+ones for which you created the plan, unless you are especially
+careful.  See <a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">Plan execution in Fortran</a>.  To prevent
+you from using <code>fftw_execute</code> by mistake, the <code>fftw3.f03</code>
+file does not provide an <code>fftw_execute</code> interface declaration.
+
+     <li><a name="index-flags-514"></a>Multiple planner flags are combined with <code>ior</code> (equivalent to &lsquo;<samp><span class="samp">|</span></samp>&rsquo; in C).  e.g. <code>FFTW_MEASURE | FFTW_DESTROY_INPUT</code> becomes <code>ior(FFTW_MEASURE, FFTW_DESTROY_INPUT)</code>.  (You can also use &lsquo;<samp><span class="samp">+</span></samp>&rsquo; as long as you don't try to include a given flag more than once.)
+
+   </ul>
+
+<ul class="menu">
+<li><a accesskey="1" href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">Extended and quadruple precision in Fortran</a>
+</ul>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Plan-execution-in-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Plan-execution-in-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,113 @@
+<html lang="en">
+<head>
+<title>Plan execution in Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference" title="FFTW Fortran type reference">
+<link rel="next" href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran" title="Allocating aligned memory in Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Plan-execution-in-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.4 Plan execution in Fortran</h3>
+
+<p>In C, in order to use a plan, one normally calls <code>fftw_execute</code>,
+which executes the plan to perform the transform on the input/output
+arrays passed when the plan was created (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).  The
+corresponding subroutine call in modern Fortran is:
+<pre class="example">      call fftw_execute(plan)
+</pre>
+   <p><a name="index-fftw_005fexecute-554"></a>
+However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+<code>fftw_execute</code>, the semantics of Fortran (unlike C) allow the
+compiler to assume that the input/output arrays are not changed by
+<code>fftw_execute</code>.  As a consequence, certain compilers end up
+repositioning the call to <code>fftw_execute</code>, assuming incorrectly
+that it does nothing to the arrays.
+
+   <p>There are various workarounds to this, but the safest and simplest
+thing is to not use <code>fftw_execute</code> in Fortran.  Instead, use the
+functions described in <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays
+<code>in</code> and <code>out</code>, you would do:
+<pre class="example">      call fftw_execute_dft(plan, in, out)
+</pre>
+   <p><a name="index-fftw_005fexecute_005fdft-555"></a>
+There are a few things to be careful of, however:
+
+     <ul>
+<li><a name="index-fftw_005fexecute_005fdft_005fr2c-556"></a><a name="index-fftw_005fexecute_005fdft_005fc2r-557"></a><a name="index-fftw_005fexecute_005fr2r-558"></a>You must use the correct type of execute function, matching the way
+the plan was created.  Complex DFT plans should use
+<code>fftw_execute_dft</code>, Real-input (r2c) DFT plans should use use
+<code>fftw_execute_dft_r2c</code>, and real-output (c2r) DFT plans should
+use <code>fftw_execute_dft_c2r</code>.  The various r2r plans should use
+<code>fftw_execute_r2r</code>.  Fortunately, if you use the wrong one you
+will get a compile-time type-mismatch error (unlike legacy Fortran).
+
+     <li>You should normally pass the same input/output arrays that were used when
+creating the plan.  This is always safe.
+
+     <li><em>If</em> you pass <em>different</em> input/output arrays compared to
+those used when creating the plan, you must abide by all the
+restrictions of the new-array execute functions (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>).  The most tricky of these is the
+requirement that the new arrays have the same alignment as the
+original arrays; the best (and possibly only) way to guarantee this
+is to use the &lsquo;<samp><span class="samp">fftw_alloc</span></samp>&rsquo; functions to allocate your arrays (see <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>). Alternatively, you can
+use the <code>FFTW_UNALIGNED</code> flag when creating the
+plan, in which case the plan does not depend on the alignment, but
+this may sacrifice substantial performance on architectures (like x86)
+with SIMD instructions (see <a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">SIMD alignment and fftw_malloc</a>). 
+<a name="index-FFTW_005fUNALIGNED-559"></a>
+</ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Planner-Flags.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Planner-Flags.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,170 @@
+<html lang="en">
+<head>
+<title>Planner Flags - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Complex-DFTs.html#Complex-DFTs" title="Complex DFTs">
+<link rel="next" href="Real_002ddata-DFTs.html#Real_002ddata-DFTs" title="Real-data DFTs">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Planner-Flags"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Complex-DFTs.html#Complex-DFTs">Complex DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.2 Planner Flags</h4>
+
+<p>All of the planner routines in FFTW accept an integer <code>flags</code>
+argument, which is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more of the flag
+constants defined below.  These flags control the rigor (and time) of
+the planning process, and can also impose (or lift) restrictions on the
+type of transform algorithm that is employed.
+
+   <p><em>Important:</em> the planner overwrites the input array during
+planning unless a saved plan (see <a href="Wisdom.html#Wisdom">Wisdom</a>) is available for that
+problem, so you should initialize your input data after creating the
+plan.  The only exceptions to this are the <code>FFTW_ESTIMATE</code> and
+<code>FFTW_WISDOM_ONLY</code> flags, as mentioned below.
+
+   <p>In all  cases, if  wisdom is  available for the  given problem  that was
+created  with equal-or-greater  planning rigor,  then the  more rigorous
+wisdom is used.  For example, in <code>FFTW_ESTIMATE</code> mode any available
+wisdom is used, whereas  in <code>FFTW_PATIENT</code> mode only wisdom created
+in patient or exhaustive mode can be used.  See <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.
+
+<h5 class="subsubheading">Planning-rigor flags</h5>
+
+     <ul>
+<li><a name="index-FFTW_005fESTIMATE-171"></a><code>FFTW_ESTIMATE</code> specifies that, instead of actual measurements of
+different algorithms, a simple heuristic is used to pick a (probably
+sub-optimal) plan quickly.  With this flag, the input/output arrays are
+not overwritten during planning.
+
+     <li><a name="index-FFTW_005fMEASURE-172"></a><code>FFTW_MEASURE</code> tells FFTW to find an optimized plan by actually
+<em>computing</em> several FFTs and measuring their execution time. 
+Depending on your machine, this can take some time (often a few
+seconds).  <code>FFTW_MEASURE</code> is the default planning option.
+
+     <li><a name="index-FFTW_005fPATIENT-173"></a><code>FFTW_PATIENT</code> is like <code>FFTW_MEASURE</code>, but considers a wider
+range of algorithms and often produces a &ldquo;more optimal&rdquo; plan
+(especially for large transforms), but at the expense of several times
+longer planning time (especially for large transforms).
+
+     <li><a name="index-FFTW_005fEXHAUSTIVE-174"></a><code>FFTW_EXHAUSTIVE</code> is like <code>FFTW_PATIENT</code>, but considers an
+even wider range of algorithms, including many that we think are
+unlikely to be fast, to produce the most optimal plan but with a
+substantially increased planning time.
+
+     <li><a name="index-FFTW_005fWISDOM_005fONLY-175"></a><code>FFTW_WISDOM_ONLY</code> is a special planning mode in which the plan
+is only created if wisdom is available for the given problem, and
+otherwise a <code>NULL</code> plan is returned.  This can be combined with
+other flags, e.g. &lsquo;<samp><span class="samp">FFTW_WISDOM_ONLY | FFTW_PATIENT</span></samp>&rsquo; creates a
+plan only if wisdom is available that was created in
+<code>FFTW_PATIENT</code> or <code>FFTW_EXHAUSTIVE</code> mode.  The
+<code>FFTW_WISDOM_ONLY</code> flag is intended for users who need to detect
+whether wisdom is available; for example, if wisdom is not available
+one may wish to allocate new arrays for planning so that user data is
+not overwritten.
+
+</ul>
+
+<h5 class="subsubheading">Algorithm-restriction flags</h5>
+
+     <ul>
+<li><a name="index-FFTW_005fDESTROY_005fINPUT-176"></a><code>FFTW_DESTROY_INPUT</code> specifies that an out-of-place transform is
+allowed to <em>overwrite its input</em> array with arbitrary data; this
+can sometimes allow more efficient algorithms to be employed. 
+<a name="index-out_002dof_002dplace-177"></a>
+<li><a name="index-FFTW_005fPRESERVE_005fINPUT-178"></a><code>FFTW_PRESERVE_INPUT</code> specifies that an out-of-place transform must
+<em>not change its input</em> array.  This is ordinarily the
+<em>default</em>, except for c2r and hc2r (i.e. complex-to-real)
+transforms for which <code>FFTW_DESTROY_INPUT</code> is the default.  In the
+latter cases, passing <code>FFTW_PRESERVE_INPUT</code> will attempt to use
+algorithms that do not destroy the input, at the expense of worse
+performance; for multi-dimensional c2r transforms, however, no
+input-preserving algorithms are implemented and the planner will return
+<code>NULL</code> if one is requested. 
+<a name="index-c2r-179"></a><a name="index-hc2r-180"></a>
+<li><a name="index-FFTW_005fUNALIGNED-181"></a><a name="index-alignment-182"></a><code>FFTW_UNALIGNED</code> specifies that the algorithm may not impose any
+unusual alignment requirements on the input/output arrays (i.e. no
+SIMD may be used).  This flag is normally <em>not necessary</em>, since
+the planner automatically detects misaligned arrays.  The only use for
+this flag is if you want to use the new-array execute interface to
+execute a given plan on a different array that may not be aligned like
+the original.  (Using <code>fftw_malloc</code> makes this flag unnecessary
+even then.)
+
+</ul>
+
+<h5 class="subsubheading">Limiting planning time</h5>
+
+<pre class="example">     extern void fftw_set_timelimit(double seconds);
+</pre>
+   <p><a name="index-fftw_005fset_005ftimelimit-183"></a>
+This function instructs FFTW to spend at most <code>seconds</code> seconds
+(approximately) in the planner.  If <code>seconds ==
+FFTW_NO_TIMELIMIT</code> (the default value, which is negative), then
+planning time is unbounded.  Otherwise, FFTW plans with a
+progressively wider range of algorithms until the the given time limit
+is reached or the given range of algorithms is explored, returning the
+best available plan. 
+<a name="index-FFTW_005fNO_005fTIMELIMIT-184"></a>
+
+   <p>For example, specifying <code>FFTW_PATIENT</code> first plans in
+<code>FFTW_ESTIMATE</code> mode, then in <code>FFTW_MEASURE</code> mode, then
+finally (time permitting) in <code>FFTW_PATIENT</code>.  If
+<code>FFTW_EXHAUSTIVE</code> is specified instead, the planner will further
+progress to <code>FFTW_EXHAUSTIVE</code> mode.
+
+   <p>Note that the <code>seconds</code> argument specifies only a rough limit; in
+practice, the planner may use somewhat more time if the time limit is
+reached when the planner is in the middle of an operation that cannot
+be interrupted.  At the very least, the planner will complete planning
+in <code>FFTW_ESTIMATE</code> mode (which is thus equivalent to a time limit
+of 0).
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Precision.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Precision.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,100 @@
+<html lang="en">
+<head>
+<title>Precision - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Data-Types-and-Files.html#Data-Types-and-Files" title="Data Types and Files">
+<link rel="prev" href="Complex-numbers.html#Complex-numbers" title="Complex numbers">
+<link rel="next" href="Memory-Allocation.html#Memory-Allocation" title="Memory Allocation">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Precision"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Complex-numbers.html#Complex-numbers">Complex numbers</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.1.2 Precision</h4>
+
+<p><a name="index-precision-143"></a>
+You can install single and long-double precision versions of FFTW,
+which replace <code>double</code> with <code>float</code> and <code>long double</code>,
+respectively (see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>).  To use these
+interfaces, you:
+
+     <ul>
+<li>Link to the single/long-double libraries; on Unix, <code>-lfftw3f</code> or
+<code>-lfftw3l</code> instead of (or in addition to) <code>-lfftw3</code>.  (You
+can link to the different-precision libraries simultaneously.)
+
+     <li>Include the <em>same</em> <code>&lt;fftw3.h&gt;</code> header file.
+
+     <li>Replace all lowercase instances of &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo; with &lsquo;<samp><span class="samp">fftwf_</span></samp>&rsquo; or
+&lsquo;<samp><span class="samp">fftwl_</span></samp>&rsquo; for single or long-double precision, respectively. 
+(<code>fftw_complex</code> becomes <code>fftwf_complex</code>, <code>fftw_execute</code>
+becomes <code>fftwf_execute</code>, etcetera.)
+
+     <li>Uppercase names, i.e. names beginning with &lsquo;<samp><span class="samp">FFTW_</span></samp>&rsquo;, remain the
+same.
+
+     <li>Replace <code>double</code> with <code>float</code> or <code>long double</code> for
+subroutine parameters.
+
+   </ul>
+
+   <p>Depending upon your compiler and/or hardware, <code>long double</code> may not
+be any more precise than <code>double</code> (or may not be supported at all,
+although it is standard in C99). 
+<a name="index-C99-144"></a>
+
+   <p>We also support using the nonstandard <code>__float128</code>
+quadruple-precision type provided by recent versions of <code>gcc</code> on
+32- and 64-bit x86 hardware (see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>). 
+To use this type, link with <code>-lfftw3q -lquadmath -lm</code> (the
+<code>libquadmath</code> library provided by <code>gcc</code> is needed for
+quadruple-precision trigonometric functions) and use &lsquo;<samp><span class="samp">fftwq_</span></samp>&rsquo;
+identifiers.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,202 @@
+<html lang="en">
+<head>
+<title>Real even/odd DFTs (cosine/sine transforms) - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data" title="More DFTs of Real Data">
+<link rel="prev" href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT" title="The Halfcomplex-format DFT">
+<link rel="next" href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform" title="The Discrete Hartley Transform">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Real-even%2fodd-DFTs-(cosine%2fsine-transforms)"></a>
+<a name="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">The Discrete Hartley Transform</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">The Halfcomplex-format DFT</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>
+<hr>
+</div>
+
+<h4 class="subsection">2.5.2 Real even/odd DFTs (cosine/sine transforms)</h4>
+
+<p>The Fourier transform of a real-even function f(-x) = f(x) is
+real-even, and i times the Fourier transform of a real-odd
+function f(-x) = -f(x) is real-odd.  Similar results hold for a
+discrete Fourier transform, and thus for these symmetries the need for
+complex inputs/outputs is entirely eliminated.  Moreover, one gains a
+factor of two in speed/space from the fact that the data are real, and
+an additional factor of two from the even/odd symmetry: only the
+non-redundant (first) half of the array need be stored.  The result is
+the real-even DFT (<dfn>REDFT</dfn>) and the real-odd DFT (<dfn>RODFT</dfn>), also
+known as the discrete cosine and sine transforms (<dfn>DCT</dfn> and
+<dfn>DST</dfn>), respectively. 
+<a name="index-real_002deven-DFT-79"></a><a name="index-REDFT-80"></a><a name="index-real_002dodd-DFT-81"></a><a name="index-RODFT-82"></a><a name="index-discrete-cosine-transform-83"></a><a name="index-DCT-84"></a><a name="index-discrete-sine-transform-85"></a><a name="index-DST-86"></a>
+
+   <p>(In this section, we describe the 1d transforms; multi-dimensional
+transforms are just a separable product of these transforms operating
+along each dimension.)
+
+   <p>Because of the discrete sampling, one has an additional choice: is the
+data even/odd around a sampling point, or around the point halfway
+between two samples?  The latter corresponds to <em>shifting</em> the
+samples by <em>half</em> an interval, and gives rise to several transform
+variants denoted by REDFTab and RODFTab: a and
+b are 0 or 1, and indicate whether the input
+(a) and/or output (b) are shifted by half a sample
+(1 means it is shifted).  These are also known as types I-IV of
+the DCT and DST, and all four types are supported by FFTW's r2r
+interface.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>
+
+   <p>The r2r kinds for the various REDFT and RODFT types supported by FFTW,
+along with the boundary conditions at both ends of the <em>input</em>
+array (<code>n</code> real numbers <code>in[j=0..n-1]</code>), are:
+
+     <ul>
+<li><code>FFTW_REDFT00</code> (DCT-I): even around j=0 and even around j=n-1. 
+<a name="index-FFTW_005fREDFT00-87"></a>
+<li><code>FFTW_REDFT10</code> (DCT-II, &ldquo;the&rdquo; DCT): even around j=-0.5 and even around j=n-0.5. 
+<a name="index-FFTW_005fREDFT10-88"></a>
+<li><code>FFTW_REDFT01</code> (DCT-III, &ldquo;the&rdquo; IDCT): even around j=0 and odd around j=n. 
+<a name="index-FFTW_005fREDFT01-89"></a><a name="index-IDCT-90"></a>
+<li><code>FFTW_REDFT11</code> (DCT-IV): even around j=-0.5 and odd around j=n-0.5. 
+<a name="index-FFTW_005fREDFT11-91"></a>
+<li><code>FFTW_RODFT00</code> (DST-I): odd around j=-1 and odd around j=n. 
+<a name="index-FFTW_005fRODFT00-92"></a>
+<li><code>FFTW_RODFT10</code> (DST-II): odd around j=-0.5 and odd around j=n-0.5. 
+<a name="index-FFTW_005fRODFT10-93"></a>
+<li><code>FFTW_RODFT01</code> (DST-III): odd around j=-1 and even around j=n-1. 
+<a name="index-FFTW_005fRODFT01-94"></a>
+<li><code>FFTW_RODFT11</code> (DST-IV): odd around j=-0.5 and even around j=n-0.5. 
+<a name="index-FFTW_005fRODFT11-95"></a>
+</ul>
+
+   <p>Note that these symmetries apply to the &ldquo;logical&rdquo; array being
+transformed; <strong>there are no constraints on your physical input
+data</strong>.  So, for example, if you specify a size-5 REDFT00 (DCT-I) of the
+data abcde, it corresponds to the DFT of the logical even array
+abcdedcb of size 8.  A size-4 REDFT10 (DCT-II) of the data
+abcd corresponds to the size-8 logical DFT of the even array
+abcddcba, shifted by half a sample.
+
+   <p>All of these transforms are invertible.  The inverse of R*DFT00 is
+R*DFT00; of R*DFT10 is R*DFT01 and vice versa (these are often called
+simply &ldquo;the&rdquo; DCT and IDCT, respectively); and of R*DFT11 is R*DFT11. 
+However, the transforms computed by FFTW are unnormalized, exactly
+like the corresponding real and complex DFTs, so computing a transform
+followed by its inverse yields the original array scaled by N,
+where N is the <em>logical</em> DFT size.  For REDFT00,
+N=2(n-1); for RODFT00, N=2(n+1); otherwise, N=2n. 
+<a name="index-normalization-96"></a><a name="index-IDCT-97"></a>
+
+   <p>Note that the boundary conditions of the transform output array are
+given by the input boundary conditions of the inverse transform. 
+Thus, the above transforms are all inequivalent in terms of
+input/output boundary conditions, even neglecting the 0.5 shift
+difference.
+
+   <p>FFTW is most efficient when N is a product of small factors; note
+that this <em>differs</em> from the factorization of the physical size
+<code>n</code> for REDFT00 and RODFT00!  There is another oddity: <code>n=1</code>
+REDFT00 transforms correspond to N=0, and so are <em>not
+defined</em> (the planner will return <code>NULL</code>).  Otherwise, any positive
+<code>n</code> is supported.
+
+   <p>For the precise mathematical definitions of these transforms as used by
+FFTW, see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>.  (For people accustomed to
+the DCT/DST, FFTW's definitions have a coefficient of 2 in front
+of the cos/sin functions so that they correspond precisely to an
+even/odd DFT of size N.  Some authors also include additional
+multiplicative factors of
+&radic;2for selected inputs and outputs; this makes
+the transform orthogonal, but sacrifices the direct equivalence to a
+symmetric DFT.)
+
+<h5 class="subsubheading">Which type do you need?</h5>
+
+<p>Since the required flavor of even/odd DFT depends upon your problem,
+you are the best judge of this choice, but we can make a few comments
+on relative efficiency to help you in your selection.  In particular,
+R*DFT01 and R*DFT10 tend to be slightly faster than R*DFT11
+(especially for odd sizes), while the R*DFT00 transforms are sometimes
+significantly slower (especially for even sizes).<a rel="footnote" href="#fn-2" name="fnd-2"><sup>2</sup></a>
+
+   <p>Thus, if only the boundary conditions on the transform inputs are
+specified, we generally recommend R*DFT10 over R*DFT00 and R*DFT01 over
+R*DFT11 (unless the half-sample shift or the self-inverse property is
+significant for your problem).
+
+   <p>If performance is important to you and you are using only small sizes
+(say n&lt;200), e.g. for multi-dimensional transforms, then you
+might consider generating hard-coded transforms of those sizes and types
+that you are interested in (see <a href="Generating-your-own-code.html#Generating-your-own-code">Generating your own code</a>).
+
+   <p>We are interested in hearing what types of symmetric transforms you find
+most useful.
+
+<!-- =========> -->
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> There are also type V-VIII transforms, which
+correspond to a logical DFT of <em>odd</em> size N, independent of
+whether the physical size <code>n</code> is odd, but we do not support these
+variants.</p>
+
+   <p class="footnote"><small>[<a name="fn-2" href="#fnd-2">2</a>]</small> R*DFT00 is
+sometimes slower in FFTW because we discovered that the standard
+algorithm for computing this by a pre/post-processed real DFT&mdash;the
+algorithm used in FFTPACK, Numerical Recipes, and other sources for
+decades now&mdash;has serious numerical problems: it already loses several
+decimal places of accuracy for 16k sizes.  There seem to be only two
+alternatives in the literature that do not suffer similarly: a
+recursive decomposition into smaller DCTs, which would require a large
+set of codelets for efficiency and generality, or sacrificing a factor of
+2
+in speed to use a real DFT of twice the size.  We currently
+employ the latter technique for general n, as well as a limited
+form of the former method: a split-radix decomposition when n
+is odd (N a multiple of 4).  For N containing many
+factors of 2, the split-radix method seems to recover most of the
+speed of the standard algorithm without the accuracy tradeoff.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Real_002ddata-DFT-Array-Format.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Real_002ddata-DFT-Array-Format.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+<html lang="en">
+<head>
+<title>Real-data DFT Array Format - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Real_002ddata-DFTs.html#Real_002ddata-DFTs" title="Real-data DFTs">
+<link rel="next" href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms" title="Real-to-Real Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Real-data-DFT-Array-Format"></a>
+<a name="Real_002ddata-DFT-Array-Format"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">Real-data DFTs</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.4 Real-data DFT Array Format</h4>
+
+<p><a name="index-r2c_002fc2r-multi_002ddimensional-array-format-199"></a>
+The output of a DFT of real data (r2c) contains symmetries that, in
+principle, make half of the outputs redundant (see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>).  (Similarly for the input of an inverse c2r transform.)  In
+practice, it is not possible to entirely realize these savings in an
+efficient and understandable format that generalizes to
+multi-dimensional transforms.  Instead, the output of the r2c
+transforms is <em>slightly</em> over half of the output of the
+corresponding complex transform.  We do not &ldquo;pack&rdquo; the data in any
+way, but store it as an ordinary array of <code>fftw_complex</code> values. 
+In fact, this data is simply a subsection of what would be the array in
+the corresponding complex transform.
+
+   <p>Specifically, for a real transform of d (= <code>rank</code>)
+dimensions n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub>, the complex data is an n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;(n<sub>d-1</sub>/2 + 1) array of
+<code>fftw_complex</code> values in row-major order (with the division rounded
+down).  That is, we only store the <em>lower</em> half (non-negative
+frequencies), plus one element, of the last dimension of the data from
+the ordinary complex transform.  (We could have instead taken half of
+any other dimension, but implementation turns out to be simpler if the
+last, contiguous, dimension is used.)
+
+   <p><a name="index-out_002dof_002dplace-200"></a>For an out-of-place transform, the real data is simply an array with
+physical dimensions n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> in row-major order.
+
+   <p><a name="index-in_002dplace-201"></a><a name="index-padding-202"></a>For an in-place transform, some complications arise since the complex data
+is slightly larger than the real data.  In this case, the final
+dimension of the real data must be <em>padded</em> with extra values to
+accommodate the size of the complex data&mdash;two extra if the last
+dimension is even and one if it is odd.  That is, the last dimension of
+the real data must physically contain
+2 * (n<sub>d-1</sub>/2+1)<code>double</code> values (exactly enough to hold the complex data).  This
+physical array size does not, however, change the <em>logical</em> array
+size&mdash;only
+n<sub>d-1</sub>values are actually stored in the last dimension, and
+n<sub>d-1</sub>is the last dimension passed to the planner.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Real_002ddata-DFTs.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Real_002ddata-DFTs.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,158 @@
+<html lang="en">
+<head>
+<title>Real-data DFTs - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Planner-Flags.html#Planner-Flags" title="Planner Flags">
+<link rel="next" href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format" title="Real-data DFT Array Format">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Real-data-DFTs"></a>
+<a name="Real_002ddata-DFTs"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Planner-Flags.html#Planner-Flags">Planner Flags</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.3 Real-data DFTs</h4>
+
+<pre class="example">     fftw_plan fftw_plan_dft_r2c_1d(int n0,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                                    double *in, fftw_complex *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                                 double *in, fftw_complex *out,
+                                 unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005fr2c_005f1d-185"></a><a name="index-fftw_005fplan_005fdft_005fr2c_005f2d-186"></a><a name="index-fftw_005fplan_005fdft_005fr2c_005f3d-187"></a><a name="index-fftw_005fplan_005fdft_005fr2c-188"></a><a name="index-r2c-189"></a>
+Plan a real-input/complex-output discrete Fourier transform (DFT) in
+zero or more dimensions, returning an <code>fftw_plan</code> (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).
+
+   <p>Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   <p>The planner returns <code>NULL</code> if the plan cannot be created.  A
+non-<code>NULL</code> plan is always returned by the basic interface unless
+you are using a customized FFTW configuration supporting a restricted
+set of transforms, or if you use the <code>FFTW_PRESERVE_INPUT</code> flag
+with a multi-dimensional out-of-place c2r transform (see below).
+
+<h5 class="subsubheading">Arguments</h5>
+
+     <ul>
+<li><code>rank</code> is the rank of the transform (it should be the size of the
+array <code>*n</code>), and can be any non-negative integer.  (See <a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a>, for the definition of &ldquo;rank&rdquo;.)  The
+&lsquo;<samp><span class="samp">_1d</span></samp>&rsquo;, &lsquo;<samp><span class="samp">_2d</span></samp>&rsquo;, and &lsquo;<samp><span class="samp">_3d</span></samp>&rsquo; planners correspond to a
+<code>rank</code> of <code>1</code>, <code>2</code>, and <code>3</code>, respectively.  The rank
+may be zero, which is equivalent to a rank-1 transform of size 1, i.e. a
+copy of one real number (with zero imaginary part) from input to output.
+
+     <li><code>n0</code>, <code>n1</code>, <code>n2</code>, or <code>n[0..rank-1]</code>, (as appropriate
+for each routine) specify the size of the transform dimensions.  They
+can be any positive integer.  This is different in general from the
+<em>physical</em> array dimensions, which are described in <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>.
+
+          <ul>
+<li>FFTW is best at handling sizes of the form
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,where e+f is either 0 or 1, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) performance even for prime sizes).  (It is possible to customize FFTW
+for different array sizes; see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>.) 
+Transforms whose sizes are powers of 2 are especially fast, and
+it is generally beneficial for the <em>last</em> dimension of an r2c/c2r
+transform to be <em>even</em>. 
+</ul>
+
+     <li><code>in</code> and <code>out</code> point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform). 
+<a name="index-in_002dplace-190"></a>These arrays are overwritten during planning, unless
+<code>FFTW_ESTIMATE</code> is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)  For an in-place transform, it
+is important to remember that the real array will require padding,
+described in <a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>. 
+<a name="index-padding-191"></a>
+<li><a name="index-flags-192"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+</ul>
+
+   <p>The inverse transforms, taking complex input (storing the non-redundant
+half of a logically Hermitian array) to real output, are given by:
+
+<pre class="example">     fftw_plan fftw_plan_dft_c2r_1d(int n0,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_2d(int n0, int n1,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r_3d(int n0, int n1, int n2,
+                                    fftw_complex *in, double *out,
+                                    unsigned flags);
+     fftw_plan fftw_plan_dft_c2r(int rank, const int *n,
+                                 fftw_complex *in, double *out,
+                                 unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fdft_005fc2r_005f1d-193"></a><a name="index-fftw_005fplan_005fdft_005fc2r_005f2d-194"></a><a name="index-fftw_005fplan_005fdft_005fc2r_005f3d-195"></a><a name="index-fftw_005fplan_005fdft_005fc2r-196"></a><a name="index-c2r-197"></a>
+The arguments are the same as for the r2c transforms, except that the
+input and output data formats are reversed.
+
+   <p>FFTW computes an unnormalized transform: computing an r2c followed by a
+c2r transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the logical
+dimensions). 
+<a name="index-normalization-198"></a>An r2c transform produces the same output as a <code>FFTW_FORWARD</code>
+complex DFT of the same input, and a c2r transform is correspondingly
+equivalent to <code>FFTW_BACKWARD</code>.  For more information, see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Real_002dto_002dReal-Transform-Kinds.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Real_002dto_002dReal-Transform-Kinds.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,115 @@
+<html lang="en">
+<head>
+<title>Real-to-Real Transform Kinds - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms" title="Real-to-Real Transforms">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Real-to-Real-Transform-Kinds"></a>
+<a name="Real_002dto_002dReal-Transform-Kinds"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">Real-to-Real Transforms</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.6 Real-to-Real Transform Kinds</h4>
+
+<p><a name="index-kind-_0028r2r_0029-212"></a>
+FFTW currently supports 11 different r2r transform kinds, specified by
+one of the constants below.  For the precise definitions of these
+transforms, see <a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>.  For a more colloquial
+introduction to these transform kinds, see <a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>.
+
+   <p>For dimension of size <code>n</code>, there is a corresponding &ldquo;logical&rdquo;
+dimension <code>N</code> that determines the normalization (and the optimal
+factorization); the formula for <code>N</code> is given for each kind below. 
+Also, with each transform kind is listed its corrsponding inverse
+transform.  FFTW computes unnormalized transforms: a transform followed
+by its inverse will result in the original data multiplied by <code>N</code>
+(or the product of the <code>N</code>'s for each dimension, in
+multi-dimensions). 
+<a name="index-normalization-213"></a>
+     <ul>
+<li><a name="index-FFTW_005fR2HC-214"></a><code>FFTW_R2HC</code> computes a real-input DFT with output in
+&ldquo;halfcomplex&rdquo; format, i.e. real and imaginary parts for a transform of
+size <code>n</code> stored as:
+<p align=center>
+r<sub>0</sub>, r<sub>1</sub>, r<sub>2</sub>, ..., r<sub>n/2</sub>, i<sub>(n+1)/2-1</sub>, ..., i<sub>2</sub>, i<sub>1</sub>
+</p>(Logical <code>N=n</code>, inverse is <code>FFTW_HC2R</code>.)
+
+     <li><a name="index-FFTW_005fHC2R-215"></a><code>FFTW_HC2R</code> computes the reverse of <code>FFTW_R2HC</code>, above. 
+(Logical <code>N=n</code>, inverse is <code>FFTW_R2HC</code>.)
+
+     <li><a name="index-FFTW_005fDHT-216"></a><code>FFTW_DHT</code> computes a discrete Hartley transform. 
+(Logical <code>N=n</code>, inverse is <code>FFTW_DHT</code>.) 
+<a name="index-discrete-Hartley-transform-217"></a>
+<li><a name="index-FFTW_005fREDFT00-218"></a><code>FFTW_REDFT00</code> computes an REDFT00 transform, i.e. a DCT-I. 
+(Logical <code>N=2*(n-1)</code>, inverse is <code>FFTW_REDFT00</code>.) 
+<a name="index-discrete-cosine-transform-219"></a><a name="index-DCT-220"></a>
+<li><a name="index-FFTW_005fREDFT10-221"></a><code>FFTW_REDFT10</code> computes an REDFT10 transform, i.e. a DCT-II (sometimes called &ldquo;the&rdquo; DCT). 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_REDFT01</code>.)
+
+     <li><a name="index-FFTW_005fREDFT01-222"></a><code>FFTW_REDFT01</code> computes an REDFT01 transform, i.e. a DCT-III (sometimes called &ldquo;the&rdquo; IDCT, being the inverse of DCT-II). 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_REDFT=10</code>.) 
+<a name="index-IDCT-223"></a>
+<li><a name="index-FFTW_005fREDFT11-224"></a><code>FFTW_REDFT11</code> computes an REDFT11 transform, i.e. a DCT-IV. 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_REDFT11</code>.)
+
+     <li><a name="index-FFTW_005fRODFT00-225"></a><code>FFTW_RODFT00</code> computes an RODFT00 transform, i.e. a DST-I. 
+(Logical <code>N=2*(n+1)</code>, inverse is <code>FFTW_RODFT00</code>.) 
+<a name="index-discrete-sine-transform-226"></a><a name="index-DST-227"></a>
+<li><a name="index-FFTW_005fRODFT10-228"></a><code>FFTW_RODFT10</code> computes an RODFT10 transform, i.e. a DST-II. 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_RODFT01</code>.)
+
+     <li><a name="index-FFTW_005fRODFT01-229"></a><code>FFTW_RODFT01</code> computes an RODFT01 transform, i.e. a DST-III. 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_RODFT=10</code>.)
+
+     <li><a name="index-FFTW_005fRODFT11-230"></a><code>FFTW_RODFT11</code> computes an RODFT11 transform, i.e. a DST-IV. 
+(Logical <code>N=2*n</code>, inverse is <code>FFTW_RODFT11</code>.)
+
+   </ul>
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Real_002dto_002dReal-Transforms.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Real_002dto_002dReal-Transforms.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,143 @@
+<html lang="en">
+<head>
+<title>Real-to-Real Transforms - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link rel="prev" href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format" title="Real-data DFT Array Format">
+<link rel="next" href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds" title="Real-to-Real Transform Kinds">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Real-to-Real-Transforms"></a>
+<a name="Real_002dto_002dReal-Transforms"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">Real-data DFT Array Format</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.3.5 Real-to-Real Transforms</h4>
+
+<p><a name="index-r2r-203"></a>
+<pre class="example">     fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                                fftw_r2r_kind kind, unsigned flags);
+     fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                                fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                                double *in, double *out,
+                                fftw_r2r_kind kind0,
+                                fftw_r2r_kind kind1,
+                                fftw_r2r_kind kind2,
+                                unsigned flags);
+     fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                             const fftw_r2r_kind *kind, unsigned flags);
+</pre>
+   <p><a name="index-fftw_005fplan_005fr2r_005f1d-204"></a><a name="index-fftw_005fplan_005fr2r_005f2d-205"></a><a name="index-fftw_005fplan_005fr2r_005f3d-206"></a><a name="index-fftw_005fplan_005fr2r-207"></a>
+Plan a real input/output (r2r) transform of various kinds in zero or
+more dimensions, returning an <code>fftw_plan</code> (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).
+
+   <p>Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+   <p>The planner returns <code>NULL</code> if the plan cannot be created.  A
+non-<code>NULL</code> plan is always returned by the basic interface unless
+you are using a customized FFTW configuration supporting a restricted
+set of transforms, or for size-1 <code>FFTW_REDFT00</code> kinds (which are
+not defined). 
+<a name="index-FFTW_005fREDFT00-208"></a>
+
+<h5 class="subsubheading">Arguments</h5>
+
+     <ul>
+<li><code>rank</code> is the dimensionality of the transform (it should be the
+size of the arrays <code>*n</code> and <code>*kind</code>), and can be any
+non-negative integer.  The &lsquo;<samp><span class="samp">_1d</span></samp>&rsquo;, &lsquo;<samp><span class="samp">_2d</span></samp>&rsquo;, and &lsquo;<samp><span class="samp">_3d</span></samp>&rsquo;
+planners correspond to a <code>rank</code> of <code>1</code>, <code>2</code>, and
+<code>3</code>, respectively.  A <code>rank</code> of zero is equivalent to a copy
+of one number from input to output.
+
+     <li><code>n</code>, or <code>n0</code>/<code>n1</code>/<code>n2</code>, or <code>n[rank]</code>,
+respectively, gives the (physical) size of the transform dimensions. 
+They can be any positive integer.
+
+          <ul>
+<li><a name="index-row_002dmajor-209"></a>Multi-dimensional arrays are stored in row-major order with dimensions:
+<code>n0</code> x <code>n1</code>; or <code>n0</code> x <code>n1</code> x <code>n2</code>; or
+<code>n[0]</code> x <code>n[1]</code> x ... x <code>n[rank-1]</code>. 
+See <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>. 
+<li>FFTW is generally best at handling sizes of the form
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,where e+f is either 0 or 1, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains <i>O</i>(<i>n</i>&nbsp;log&nbsp;<i>n</i>) performance even for prime sizes).  (It is possible to customize FFTW
+for different array sizes; see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>.) 
+Transforms whose sizes are powers of 2 are especially fast. 
+<li>For a <code>REDFT00</code> or <code>RODFT00</code> transform kind in a dimension of
+size n, it is n-1 or n+1, respectively, that
+should be factorizable in the above form. 
+</ul>
+
+     <li><code>in</code> and <code>out</code> point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform). 
+<a name="index-in_002dplace-210"></a>These arrays are overwritten during planning, unless
+<code>FFTW_ESTIMATE</code> is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)
+
+     <li><code>kind</code>, or <code>kind0</code>/<code>kind1</code>/<code>kind2</code>, or
+<code>kind[rank]</code>, is the kind of r2r transform used for the
+corresponding dimension.  The valid kind constants are described in
+<a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">Real-to-Real Transform Kinds</a>.  In a multi-dimensional transform,
+what is computed is the separable product formed by taking each
+transform kind along the corresponding dimension, one dimension after
+another.
+
+     <li><a name="index-flags-211"></a><code>flags</code> is a bitwise OR (&lsquo;<samp><span class="samp">|</span></samp>&rsquo;) of zero or more planner flags,
+as defined in <a href="Planner-Flags.html#Planner-Flags">Planner Flags</a>.
+
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Reversing-array-dimensions.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Reversing-array-dimensions.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,132 @@
+<html lang="en">
+<head>
+<title>Reversing array dimensions - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran" title="Calling FFTW from Modern Fortran">
+<link rel="prev" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface" title="Overview of Fortran interface">
+<link rel="next" href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference" title="FFTW Fortran type reference">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Reversing-array-dimensions"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">FFTW Fortran type reference</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">Overview of Fortran interface</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">7.2 Reversing array dimensions</h3>
+
+<p><a name="index-row_002dmajor-517"></a><a name="index-column_002dmajor-518"></a>A minor annoyance in calling FFTW from Fortran is that FFTW's array
+dimensions are defined in the C convention (row-major order), while
+Fortran's array dimensions are the opposite convention (column-major
+order). See <a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>.  This is just a
+bookkeeping difference, with no effect on performance.  The only
+consequence of this is that, whenever you create an FFTW plan for a
+multi-dimensional transform, you must always <em>reverse the
+ordering of the dimensions</em>.
+
+   <p>For example, consider the three-dimensional (L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N) arrays:
+
+<pre class="example">       complex(C_DOUBLE_COMPLEX), dimension(L,M,N) :: in, out
+</pre>
+   <p>To plan a DFT for these arrays using <code>fftw_plan_dft_3d</code>, you could do:
+
+   <p><a name="index-fftw_005fplan_005fdft_005f3d-519"></a>
+<pre class="example">       plan = fftw_plan_dft_3d(N,M,L, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+</pre>
+   <p>That is, from FFTW's perspective this is a N&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;L array. 
+<em>No data transposition need occur</em>, as this is <em>only
+notation</em>.  Similarly, to use the more generic routine
+<code>fftw_plan_dft</code> with the same arrays, you could do:
+
+<pre class="example">       integer(C_INT), dimension(3) :: n = [N,M,L]
+       plan = fftw_plan_dft_3d(3, n, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+</pre>
+   <p>Note, by the way, that this is different from the legacy Fortran
+interface (see <a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">Fortran-interface routines</a>), which automatically
+reverses the order of the array dimension for you.  Here, you are
+calling the C interface directly, so there is no &ldquo;translation&rdquo; layer.
+
+   <p><a name="index-r2c_002fc2r-multi_002ddimensional-array-format-520"></a>An important thing to keep in mind is the implication of this for
+multidimensional real-to-complex transforms (see <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>).  In C, a multidimensional real-to-complex DFT
+chops the last dimension roughly in half (N&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;L real input
+goes to N&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;L/2+1 complex output).  In Fortran, because
+the array dimension notation is reversed, the <em>first</em> dimension of
+the complex data is chopped roughly in half.  For example consider the
+&lsquo;<samp><span class="samp">r2c</span></samp>&rsquo; transform of L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N real input in Fortran:
+
+   <p><a name="index-fftw_005fplan_005fdft_005fr2c_005f3d-521"></a><a name="index-fftw_005fexecute_005fdft_005fr2c-522"></a>
+<pre class="example">       type(C_PTR) :: plan
+       real(C_DOUBLE), dimension(L,M,N) :: in
+       complex(C_DOUBLE_COMPLEX), dimension(L/2+1,M,N) :: out
+       plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft_r2c(plan, in, out)
+</pre>
+   <p><a name="index-in_002dplace-523"></a><a name="index-padding-524"></a>Alternatively, for an in-place r2c transform, as described in the C
+documentation we must <em>pad</em> the <em>first</em> dimension of the
+real input with an extra two entries (which are ignored by FFTW) so as
+to leave enough space for the complex output. The input is
+<em>allocated</em> as a 2[L/2+1]&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N array, even though only
+L&nbsp;&times;&nbsp;M&nbsp;&times;&nbsp;N of it is actually used.  In this example, we will
+allocate the array as a pointer type, using &lsquo;<samp><span class="samp">fftw_alloc</span></samp>&rsquo; to
+ensure aligned memory for maximum performance (see <a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">Allocating aligned memory in Fortran</a>); this also makes it easy to reference the
+same memory as both a real array and a complex array.
+
+   <p><a name="index-fftw_005falloc_005fcomplex-525"></a><a name="index-c_005ff_005fpointer-526"></a>
+<pre class="example">       real(C_DOUBLE), pointer :: in(:,:,:)
+       complex(C_DOUBLE_COMPLEX), pointer :: out(:,:,:)
+       type(C_PTR) :: plan, data
+       data = fftw_alloc_complex(int((L/2+1) * M * N, C_SIZE_T))
+       call c_f_pointer(data, in, [2*(L/2+1),M,N])
+       call c_f_pointer(data, out, [L/2+1,M,N])
+       plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+       ...
+       call fftw_execute_dft_r2c(plan, in, out)
+       ...
+       call fftw_destroy_plan(plan)
+       call fftw_free(data)
+</pre>
+   <!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Row_002dmajor-Format.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Row_002dmajor-Format.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,93 @@
+<html lang="en">
+<head>
+<title>Row-major Format - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="prev" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="next" href="Column_002dmajor-Format.html#Column_002dmajor-Format" title="Column-major Format">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Row-major-Format"></a>
+<a name="Row_002dmajor-Format"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Column_002dmajor-Format.html#Column_002dmajor-Format">Column-major Format</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>
+<hr>
+</div>
+
+<h4 class="subsection">3.2.1 Row-major Format</h4>
+
+<p><a name="index-row_002dmajor-115"></a>
+The multi-dimensional arrays passed to <code>fftw_plan_dft</code> etcetera
+are expected to be stored as a single contiguous block in
+<dfn>row-major</dfn> order (sometimes called &ldquo;C order&rdquo;).  Basically, this
+means that as you step through adjacent memory locations, the first
+dimension's index varies most slowly and the last dimension's index
+varies most quickly.
+
+   <p>To be more explicit, let us consider an array of rank d whose
+dimensions are n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub>. Now, we specify a location in the array by a
+sequence of d (zero-based) indices, one for each dimension:
+(i<sub>0</sub>, i<sub>1</sub>, i<sub>2</sub>,..., i<sub>d-1</sub>). If the array is stored in row-major
+order, then this element is located at the position
+i<sub>d-1</sub> + n<sub>d-1</sub> * (i<sub>d-2</sub> + n<sub>d-2</sub> * (... + n<sub>1</sub> * i<sub>0</sub>)).
+
+   <p>Note that, for the ordinary complex DFT, each element of the array
+must be of type <code>fftw_complex</code>; i.e. a (real, imaginary) pair of
+(double-precision) numbers.
+
+   <p>In the advanced FFTW interface, the physical dimensions n from
+which the indices are computed can be different from (larger than)
+the logical dimensions of the transform to be computed, in order to
+transform a subset of a larger array. 
+<a name="index-advanced-interface-116"></a>Note also that, in the advanced interface, the expression above is
+multiplied by a <dfn>stride</dfn> to get the actual array index&mdash;this is
+useful in situations where each element of the multi-dimensional array
+is actually a data structure (or another array), and you just want to
+transform a single field. In the basic interface, however, the stride
+is 1. 
+<a name="index-stride-117"></a>
+<!-- =========> -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/SIMD-alignment-and-fftw_005fmalloc.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/SIMD-alignment-and-fftw_005fmalloc.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,102 @@
+<html lang="en">
+<head>
+<title>SIMD alignment and fftw_malloc - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="prev" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="next" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="SIMD-alignment-and-fftw_malloc"></a>
+<a name="SIMD-alignment-and-fftw_005fmalloc"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>
+<hr>
+</div>
+
+<h3 class="section">3.1 SIMD alignment and fftw_malloc</h3>
+
+<p>SIMD, which stands for &ldquo;Single Instruction Multiple Data,&rdquo; is a set of
+special operations supported by some processors to perform a single
+operation on several numbers (usually 2 or 4) simultaneously.  SIMD
+floating-point instructions are available on several popular CPUs:
+SSE/SSE2/AVX on recent x86/x86-64 processors, AltiVec (single precision)
+on some PowerPCs (Apple G4 and higher), NEON on some ARM models, and MIPS Paired Single
+(currently only in FFTW 3.2.x).  FFTW can be compiled to support the
+SIMD instructions on any of these systems. 
+<a name="index-SIMD-102"></a><a name="index-SSE-103"></a><a name="index-SSE2-104"></a><a name="index-AVX-105"></a><a name="index-AltiVec-106"></a><a name="index-MIPS-PS-107"></a><a name="index-precision-108"></a>
+
+   <p>A program linking to an FFTW library compiled with SIMD support can
+obtain a nonnegligible speedup for most complex and r2c/c2r
+transforms.  In order to obtain this speedup, however, the arrays of
+complex (or real) data passed to FFTW must be specially aligned in
+memory (typically 16-byte aligned), and often this alignment is more
+stringent than that provided by the usual <code>malloc</code> (etc.) 
+allocation routines.
+
+   <p><a name="index-portability-109"></a>In order to guarantee proper alignment for SIMD, therefore, in case
+your program is ever linked against a SIMD-using FFTW, we recommend
+allocating your transform data with <code>fftw_malloc</code> and
+de-allocating it with <code>fftw_free</code>. 
+<a name="index-fftw_005fmalloc-110"></a><a name="index-fftw_005ffree-111"></a>These have exactly the same interface and behavior as
+<code>malloc</code>/<code>free</code>, except that for a SIMD FFTW they ensure
+that the returned pointer has the necessary alignment (by calling
+<code>memalign</code> or its equivalent on your OS).
+
+   <p>You are not <em>required</em> to use <code>fftw_malloc</code>.  You can
+allocate your data in any way that you like, from <code>malloc</code> to
+<code>new</code> (in C++) to a fixed-size array declaration.  If the array
+happens not to be properly aligned, FFTW will not use the SIMD
+extensions. 
+<a name="index-C_002b_002b-112"></a>
+<a name="index-fftw_005falloc_005freal-113"></a><a name="index-fftw_005falloc_005fcomplex-114"></a>Since <code>fftw_malloc</code> only ever needs to be used for real and
+complex arrays, we provide two convenient wrapper routines
+<code>fftw_alloc_real(N)</code> and <code>fftw_alloc_complex(N)</code> that are
+equivalent to <code>(double*)fftw_malloc(sizeof(double) * N)</code> and
+<code>(fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N)</code>,
+respectively (or their equivalents in other precisions).
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,84 @@
+<html lang="en">
+<head>
+<title>The 1d Discrete Fourier Transform (DFT) - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="next" href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT" title="The 1d Real-data DFT">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="The-1d-Discrete-Fourier-Transform-(DFT)"></a>
+<a name="The-1d-Discrete-Fourier-Transform-_0028DFT_0029"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.1 The 1d Discrete Fourier Transform (DFT)</h4>
+
+<p><a name="index-discrete-Fourier-transform-292"></a><a name="index-DFT-293"></a>The forward (<code>FFTW_FORWARD</code>) discrete Fourier transform (DFT) of a
+1d complex array X of size n computes an array Y,
+where:
+<center><img src="equation-dft.png" align="top">.</center>The backward (<code>FFTW_BACKWARD</code>) DFT computes:
+<center><img src="equation-idft.png" align="top">.</center>
+
+   <p><a name="index-normalization-294"></a>FFTW computes an unnormalized transform, in that there is no coefficient
+in front of the summation in the DFT.  In other words, applying the
+forward and then the backward transform will multiply the input by
+n.
+
+   <p><a name="index-frequency-295"></a>From above, an <code>FFTW_FORWARD</code> transform corresponds to a sign of
+-1 in the exponent of the DFT.  Note also that we use the
+standard &ldquo;in-order&rdquo; output ordering&mdash;the k-th output
+corresponds to the frequency k/n (or k/T, where T
+is your total sampling period).  For those who like to think in terms of
+positive and negative frequencies, this means that the positive
+frequencies are stored in the first half of the output and the negative
+frequencies are stored in backwards order in the second half of the
+output.  (The frequency -k/n is the same as the frequency
+(n-k)/n.)
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/The-1d-Real_002ddata-DFT.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/The-1d-Real_002ddata-DFT.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,94 @@
+<html lang="en">
+<head>
+<title>The 1d Real-data DFT - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link rel="prev" href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029" title="The 1d Discrete Fourier Transform (DFT)">
+<link rel="next" href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029" title="1d Real-even DFTs (DCTs)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="The-1d-Real-data-DFT"></a>
+<a name="The-1d-Real_002ddata-DFT"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.8.2 The 1d Real-data DFT</h4>
+
+<p>The real-input (r2c) DFT in FFTW computes the <em>forward</em> transform
+Y of the size <code>n</code> real array X, exactly as defined
+above, i.e. 
+<center><img src="equation-dft.png" align="top">.</center>This output array Y can easily be shown to possess the
+&ldquo;Hermitian&rdquo; symmetry
+<a name="index-Hermitian-296"></a><i>Y<sub>k</sub> = Y<sub>n-k</sub></i><sup>*</sup>,where we take Y to be periodic so that
+<i>Y<sub>n</sub> = Y</i><sub>0</sub>.
+
+   <p>As a result of this symmetry, half of the output Y is redundant
+(being the complex conjugate of the other half), and so the 1d r2c
+transforms only output elements 0<small class="dots">...</small>n/2 of Y
+(n/2+1 complex numbers), where the division by 2 is
+rounded down.
+
+   <p>Moreover, the Hermitian symmetry implies that
+<i>Y</i><sub>0</sub>and, if n is even, the
+<i>Y</i><sub><i>n</i>/2</sub>element, are purely real.  So, for the <code>R2HC</code> r2r transform, these
+elements are not stored in the halfcomplex output format. 
+<a name="index-r2r-297"></a><a name="index-R2HC-298"></a><a name="index-halfcomplex-format-299"></a>
+
+   <p>The c2r and <code>H2RC</code> r2r transforms compute the backward DFT of the
+<em>complex</em> array X with Hermitian symmetry, stored in the
+r2c/<code>R2HC</code> output formats, respectively, where the backward
+transform is defined exactly as for the complex case:
+<center><img src="equation-idft.png" align="top">.</center>The outputs <code>Y</code> of this transform can easily be seen to be purely
+real, and are stored as an array of real numbers.
+
+   <p><a name="index-normalization-300"></a>Like FFTW's complex DFT, these transforms are unnormalized.  In other
+words, applying the real-to-complex (forward) and then the
+complex-to-real (backward) transform will multiply the input by
+n.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/The-Discrete-Hartley-Transform.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/The-Discrete-Hartley-Transform.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+<html lang="en">
+<head>
+<title>The Discrete Hartley Transform - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data" title="More DFTs of Real Data">
+<link rel="prev" href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029" title="Real even/odd DFTs (cosine/sine transforms)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="The-Discrete-Hartley-Transform"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>
+<hr>
+</div>
+
+<h4 class="subsection">2.5.3 The Discrete Hartley Transform</h4>
+
+<p>If you are planning to use the DHT because you've heard that it is
+&ldquo;faster&rdquo; than the DFT (FFT), <strong>stop here</strong>.  The DHT is not
+faster than the DFT.  That story is an old but enduring misconception
+that was debunked in 1987.
+
+   <p>The discrete Hartley transform (DHT) is an invertible linear transform
+closely related to the DFT.  In the DFT, one multiplies each input by
+cos - i * sin (a complex exponential), whereas in the DHT each
+input is multiplied by simply cos + sin.  Thus, the DHT
+transforms <code>n</code> real numbers to <code>n</code> real numbers, and has the
+convenient property of being its own inverse.  In FFTW, a DHT (of any
+positive <code>n</code>) can be specified by an r2r kind of <code>FFTW_DHT</code>. 
+<a name="index-FFTW_005fDHT-98"></a><a name="index-discrete-Hartley-transform-99"></a><a name="index-DHT-100"></a>
+Like the DFT, in FFTW the DHT is unnormalized, so computing a DHT of
+size <code>n</code> followed by another DHT of the same size will result in
+the original array multiplied by <code>n</code>. 
+<a name="index-normalization-101"></a>
+The DHT was originally proposed as a more efficient alternative to the
+DFT for real data, but it was subsequently shown that a specialized DFT
+(such as FFTW's r2hc or r2c transforms) could be just as fast.  In FFTW,
+the DHT is actually computed by post-processing an r2hc transform, so
+there is ordinarily no reason to prefer it from a performance
+perspective.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>
+However, we have heard rumors that the DHT might be the most appropriate
+transform in its own right for certain applications, and we would be
+very interested to hear from anyone who finds it useful.
+
+   <p>If <code>FFTW_DHT</code> is specified for multiple dimensions of a
+multi-dimensional transform, FFTW computes the separable product of 1d
+DHTs along each dimension.  Unfortunately, this is not quite the same
+thing as a true multi-dimensional DHT; you can compute the latter, if
+necessary, with at most <code>rank-1</code> post-processing passes
+[see e.g. H. Hao and R. N. Bracewell, <i>Proc. IEEE</i> <b>75</b>, 264&ndash;266 (1987)].
+
+   <p>For the precise mathematical definition of the DHT as used by FFTW, see
+<a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>.
+
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> We provide the DHT mainly as a byproduct of some
+internal algorithms. FFTW computes a real input/output DFT of
+<em>prime</em> size by re-expressing it as a DHT plus post/pre-processing
+and then using Rader's prime-DFT algorithm adapted to the DHT.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/The-Halfcomplex_002dformat-DFT.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/The-Halfcomplex_002dformat-DFT.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,111 @@
+<html lang="en">
+<head>
+<title>The Halfcomplex-format DFT - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data" title="More DFTs of Real Data">
+<link rel="prev" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data" title="More DFTs of Real Data">
+<link rel="next" href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029" title="Real even/odd DFTs (cosine/sine transforms)">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="The-Halfcomplex-format-DFT"></a>
+<a name="The-Halfcomplex_002dformat-DFT"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">Real even/odd DFTs (cosine/sine transforms)</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>
+<hr>
+</div>
+
+<h4 class="subsection">2.5.1 The Halfcomplex-format DFT</h4>
+
+<p>An r2r kind of <code>FFTW_R2HC</code> (<dfn>r2hc</dfn>) corresponds to an r2c DFT
+<a name="index-FFTW_005fR2HC-72"></a><a name="index-r2c-73"></a><a name="index-r2hc-74"></a>(see <a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a>) but with &ldquo;halfcomplex&rdquo;
+format output, and may sometimes be faster and/or more convenient than
+the latter. 
+<a name="index-halfcomplex-format-75"></a>The inverse <dfn>hc2r</dfn> transform is of kind <code>FFTW_HC2R</code>. 
+<a name="index-FFTW_005fHC2R-76"></a><a name="index-hc2r-77"></a>This consists of the non-redundant half of the complex output for a 1d
+real-input DFT of size <code>n</code>, stored as a sequence of <code>n</code> real
+numbers (<code>double</code>) in the format:
+
+   <p><p align=center>
+r<sub>0</sub>, r<sub>1</sub>, r<sub>2</sub>, ..., r<sub>n/2</sub>, i<sub>(n+1)/2-1</sub>, ..., i<sub>2</sub>, i<sub>1</sub>
+</p>
+
+   <p>Here,
+r<sub>k</sub>is the real part of the kth output, and
+i<sub>k</sub>is the imaginary part.  (Division by 2 is rounded down.) For a
+halfcomplex array <code>hc[n]</code>, the kth component thus has its
+real part in <code>hc[k]</code> and its imaginary part in <code>hc[n-k]</code>, with
+the exception of <code>k</code> <code>==</code> <code>0</code> or <code>n/2</code> (the latter
+only if <code>n</code> is even)&mdash;in these two cases, the imaginary part is
+zero due to symmetries of the real-input DFT, and is not stored. 
+Thus, the r2hc transform of <code>n</code> real values is a halfcomplex array of
+length <code>n</code>, and vice versa for hc2r. 
+<a name="index-normalization-78"></a>
+
+   <p>Aside from the differing format, the output of
+<code>FFTW_R2HC</code>/<code>FFTW_HC2R</code> is otherwise exactly the same as for
+the corresponding 1d r2c/c2r transform
+(i.e. <code>FFTW_FORWARD</code>/<code>FFTW_BACKWARD</code> transforms, respectively). 
+Recall that these transforms are unnormalized, so r2hc followed by hc2r
+will result in the original data multiplied by <code>n</code>.  Furthermore,
+like the c2r transform, an out-of-place hc2r transform will
+<em>destroy its input</em> array.
+
+   <p>Although these halfcomplex transforms can be used with the
+multi-dimensional r2r interface, the interpretation of such a separable
+product of transforms along each dimension is problematic.  For example,
+consider a two-dimensional <code>n0</code> by <code>n1</code>, r2hc by r2hc
+transform planned by <code>fftw_plan_r2r_2d(n0, n1, in, out, FFTW_R2HC,
+FFTW_R2HC, FFTW_MEASURE)</code>.  Conceptually, FFTW first transforms the rows
+(of size <code>n1</code>) to produce halfcomplex rows, and then transforms the
+columns (of size <code>n0</code>).  Half of these column transforms, however,
+are of imaginary parts, and should therefore be multiplied by i
+and combined with the r2hc transforms of the real columns to produce the
+2d DFT amplitudes; FFTW's r2r transform does <em>not</em> perform this
+combination for you.  Thus, if a multi-dimensional real-input/output DFT
+is required, we recommend using the ordinary r2c/c2r
+interface (see <a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>).
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Thread-safety.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Thread-safety.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,93 @@
+<html lang="en">
+<head>
+<title>Thread safety - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="prev" href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f" title="How Many Threads to Use?">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Thread-safety"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">How Many Threads to Use?</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>
+<hr>
+</div>
+
+<h3 class="section">5.4 Thread safety</h3>
+
+<p><a name="index-threads-341"></a><a name="index-OpenMP-342"></a><a name="index-thread-safety-343"></a>Users writing multi-threaded programs (including OpenMP) must concern
+themselves with the <dfn>thread safety</dfn> of the libraries they
+use&mdash;that is, whether it is safe to call routines in parallel from
+multiple threads.  FFTW can be used in such an environment, but some
+care must be taken because the planner routines share data
+(e.g. wisdom and trigonometric tables) between calls and plans.
+
+   <p>The upshot is that the only thread-safe (re-entrant) routine in FFTW is
+<code>fftw_execute</code> (and the new-array variants thereof).  All other routines
+(e.g. the planner) should only be called from one thread at a time.  So,
+for example, you can wrap a semaphore lock around any calls to the
+planner; even more simply, you can just create all of your plans from
+one thread.  We do not think this should be an important restriction
+(FFTW is designed for the situation where the only performance-sensitive
+code is the actual execution of the transform), and the benefits of
+shared data between plans are great.
+
+   <p>Note also that, since the plan is not modified by <code>fftw_execute</code>,
+it is safe to execute the <em>same plan</em> in parallel by multiple
+threads.  However, since a given plan operates by default on a fixed
+array, you need to use one of the new-array execute functions (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>) so that different threads compute the transform of different data.
+
+   <p>(Users should note that these comments only apply to programs using
+shared-memory threads or OpenMP.  Parallelism using MPI or forked processes
+involves a separate address-space and global variables for each process,
+and is not susceptible to problems of this sort.)
+
+   <p>If you are configured FFTW with the <code>--enable-debug</code> or
+<code>--enable-debug-malloc</code> flags (see <a href="Installation-on-Unix.html#Installation-on-Unix">Installation on Unix</a>),
+then <code>fftw_execute</code> is not thread-safe.  These flags are not
+documented because they are intended only for developing
+and debugging FFTW, but if you must use <code>--enable-debug</code> then you
+should also specifically pass <code>--disable-debug-malloc</code> for
+<code>fftw_execute</code> to be thread-safe.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Transposed-distributions.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Transposed-distributions.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,129 @@
+<html lang="en">
+<head>
+<title>Transposed distributions - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="MPI-Data-Distribution.html#MPI-Data-Distribution" title="MPI Data Distribution">
+<link rel="prev" href="Load-balancing.html#Load-balancing" title="Load balancing">
+<link rel="next" href="One_002ddimensional-distributions.html#One_002ddimensional-distributions" title="One-dimensional distributions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Transposed-distributions"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">One-dimensional distributions</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Load-balancing.html#Load-balancing">Load balancing</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="MPI-Data-Distribution.html#MPI-Data-Distribution">MPI Data Distribution</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.4.3 Transposed distributions</h4>
+
+<p>Internally, FFTW's MPI transform algorithms work by first computing
+transforms of the data local to each process, then by globally
+<em>transposing</em> the data in some fashion to redistribute the data
+among the processes, transforming the new data local to each process,
+and transposing back.  For example, a two-dimensional <code>n0</code> by
+<code>n1</code> array, distributed across the <code>n0</code> dimension, is
+transformd by: (i) transforming the <code>n1</code> dimension, which are
+local to each process; (ii) transposing to an <code>n1</code> by <code>n0</code>
+array, distributed across the <code>n1</code> dimension; (iii) transforming
+the <code>n0</code> dimension, which is now local to each process; (iv)
+transposing back. 
+<a name="index-transpose-379"></a>
+
+   <p>However, in many applications it is acceptable to compute a
+multidimensional DFT whose results are produced in transposed order
+(e.g., <code>n1</code> by <code>n0</code> in two dimensions).  This provides a
+significant performance advantage, because it means that the final
+transposition step can be omitted.  FFTW supports this optimization,
+which you specify by passing the flag <code>FFTW_MPI_TRANSPOSED_OUT</code>
+to the planner routines.  To compute the inverse transform of
+transposed output, you specify <code>FFTW_MPI_TRANSPOSED_IN</code> to tell
+it that the input is transposed.  In this section, we explain how to
+interpret the output format of such a transform. 
+<a name="index-FFTW_005fMPI_005fTRANSPOSED_005fOUT-380"></a><a name="index-FFTW_005fMPI_005fTRANSPOSED_005fIN-381"></a>
+
+   <p>Suppose you have are transforming multi-dimensional data with (at
+least two) dimensions n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub>.  As always, it is distributed along
+the first dimension n<sub>0</sub>.  Now, if we compute its DFT with the
+<code>FFTW_MPI_TRANSPOSED_OUT</code> flag, the resulting output data are stored
+with the first <em>two</em> dimensions transposed: n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub>,
+distributed along the n<sub>1</sub> dimension.  Conversely, if we take the
+n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub> data and transform it with the
+<code>FFTW_MPI_TRANSPOSED_IN</code> flag, then the format goes back to the
+original n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&nbsp;&hellip;&nbsp;&times;&nbsp;n<sub>d-1</sub> array.
+
+   <p>There are two ways to find the portion of the transposed array that
+resides on the current process.  First, you can simply call the
+appropriate &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo; function, passing n<sub>1</sub>&nbsp;&times;&nbsp;n<sub>0</sub>&nbsp;&times;&nbsp;n<sub>2</sub>&nbsp;&times;&hellip;&times;&nbsp;n<sub>d-1</sub> (the
+transposed dimensions).  This would mean calling the &lsquo;<samp><span class="samp">local_size</span></samp>&rsquo;
+function twice, once for the transposed and once for the
+non-transposed dimensions.  Alternatively, you can call one of the
+&lsquo;<samp><span class="samp">local_size_transposed</span></samp>&rsquo; functions, which returns both the
+non-transposed and transposed data distribution from a single call. 
+For example, for a 3d transform with transposed output (or input), you
+might call:
+
+<pre class="example">     ptrdiff_t fftw_mpi_local_size_3d_transposed(
+                     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,
+                     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+</pre>
+   <p><a name="index-fftw_005fmpi_005flocal_005fsize_005f3d_005ftransposed-382"></a>
+Here, <code>local_n0</code> and <code>local_0_start</code> give the size and
+starting index of the <code>n0</code> dimension for the
+<em>non</em>-transposed data, as in the previous sections.  For
+<em>transposed</em> data (e.g. the output for
+<code>FFTW_MPI_TRANSPOSED_OUT</code>), <code>local_n1</code> and
+<code>local_1_start</code> give the size and starting index of the <code>n1</code>
+dimension, which is the first dimension of the transposed data
+(<code>n1</code> by <code>n0</code> by <code>n2</code>).
+
+   <p>(Note that <code>FFTW_MPI_TRANSPOSED_IN</code> is completely equivalent to
+performing <code>FFTW_MPI_TRANSPOSED_OUT</code> and passing the first two
+dimensions to the planner in reverse order, or vice versa.  If you
+pass <em>both</em> the <code>FFTW_MPI_TRANSPOSED_IN</code> and
+<code>FFTW_MPI_TRANSPOSED_OUT</code> flags, it is equivalent to swapping the
+first two dimensions passed to the planner and passing <em>neither</em>
+flag.)
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Tutorial.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Tutorial.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+<html lang="en">
+<head>
+<title>Tutorial - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Introduction.html#Introduction" title="Introduction">
+<link rel="next" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Tutorial"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Introduction.html#Introduction">Introduction</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">2 Tutorial</h2>
+
+<ul class="menu">
+<li><a accesskey="1" href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a>
+<li><a accesskey="2" href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">Complex Multi-Dimensional DFTs</a>
+<li><a accesskey="3" href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">One-Dimensional DFTs of Real Data</a>
+<li><a accesskey="4" href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">Multi-Dimensional DFTs of Real Data</a>
+<li><a accesskey="5" href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">More DFTs of Real Data</a>
+</ul>
+
+<p>This chapter describes the basic usage of FFTW, i.e., how to compute
+<a name="index-basic-interface-14"></a>the Fourier transform of a single array.  This chapter tells the
+truth, but not the <em>whole</em> truth. Specifically, FFTW implements
+additional routines and flags that are not documented here, although
+in many cases we try to indicate where added capabilities exist.  For
+more complete information, see <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>.  (Note that you
+need to compile and install FFTW before you can use it in a program. 
+For the details of the installation, see <a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>.)
+
+   <p>We recommend that you read this tutorial in order.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>  At the least, read the first section (see <a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">Complex One-Dimensional DFTs</a>) before reading any of the others, even if your
+main interest lies in one of the other transform types.
+
+   <p>Users of FFTW version 2 and earlier may also want to read <a href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2">Upgrading from FFTW version 2</a>.
+
+<!--  -->
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> You can
+read the tutorial in bit-reversed order after computing your first
+transform.</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Upgrading-from-FFTW-version-2.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Upgrading-from-FFTW-version-2.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,258 @@
+<html lang="en">
+<head>
+<title>Upgrading from FFTW version 2 - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="prev" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="next" href="Installation-and-Customization.html#Installation-and-Customization" title="Installation and Customization">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Upgrading-from-FFTW-version-2"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="index.html#Top">Top</a>
+<hr>
+</div>
+
+<h2 class="chapter">9 Upgrading from FFTW version 2</h2>
+
+<p>In this chapter, we outline the process for updating codes designed for
+the older FFTW 2 interface to work with FFTW 3.  The interface for FFTW
+3 is not backwards-compatible with the interface for FFTW 2 and earlier
+versions; codes written to use those versions will fail to link with
+FFTW 3.  Nor is it possible to write &ldquo;compatibility wrappers&rdquo; to
+bridge the gap (at least not efficiently), because FFTW 3 has different
+semantics from previous versions.  However, upgrading should be a
+straightforward process because the data formats are identical and the
+overall style of planning/execution is essentially the same.
+
+   <p>Unlike FFTW 2, there are no separate header files for real and complex
+transforms (or even for different precisions) in FFTW 3; all interfaces
+are defined in the <code>&lt;fftw3.h&gt;</code> header file.
+
+<h3 class="heading">Numeric Types</h3>
+
+<p>The main difference in data types is that <code>fftw_complex</code> in FFTW 2
+was defined as a <code>struct</code> with macros <code>c_re</code> and <code>c_im</code>
+for accessing the real/imaginary parts.  (This is binary-compatible with
+FFTW 3 on any machine except perhaps for some older Crays in single
+precision.)  The equivalent macros for FFTW 3 are:
+
+<pre class="example">     #define c_re(c) ((c)[0])
+     #define c_im(c) ((c)[1])
+</pre>
+   <p>This does not work if you are using the C99 complex type, however,
+unless you insert a <code>double*</code> typecast into the above macros
+(see <a href="Complex-numbers.html#Complex-numbers">Complex numbers</a>).
+
+   <p>Also, FFTW 2 had an <code>fftw_real</code> typedef that was an alias for
+<code>double</code> (in double precision).  In FFTW 3 you should just use
+<code>double</code> (or whatever precision you are employing).
+
+<h3 class="heading">Plans</h3>
+
+<p>The major difference between FFTW 2 and FFTW 3 is in the
+planning/execution division of labor.  In FFTW 2, plans were found for a
+given transform size and type, and then could be applied to <em>any</em>
+arrays and for <em>any</em> multiplicity/stride parameters.  In FFTW 3,
+you specify the particular arrays, stride parameters, etcetera when
+creating the plan, and the plan is then executed for <em>those</em> arrays
+(unless the guru interface is used) and <em>those</em> parameters
+<em>only</em>.  (FFTW 2 had &ldquo;specific planner&rdquo; routines that planned for
+a particular array and stride, but the plan could still be used for
+other arrays and strides.)  That is, much of the information that was
+formerly specified at execution time is now specified at planning time.
+
+   <p>Like FFTW 2's specific planner routines, the FFTW 3 planner overwrites
+the input/output arrays unless you use <code>FFTW_ESTIMATE</code>.
+
+   <p>FFTW 2 had separate data types <code>fftw_plan</code>, <code>fftwnd_plan</code>,
+<code>rfftw_plan</code>, and <code>rfftwnd_plan</code> for complex and real one- and
+multi-dimensional transforms, and each type had its own &lsquo;<samp><span class="samp">destroy</span></samp>&rsquo;
+function.  In FFTW 3, all plans are of type <code>fftw_plan</code> and all are
+destroyed by <code>fftw_destroy_plan(plan)</code>.
+
+   <p>Where you formerly used <code>fftw_create_plan</code> and <code>fftw_one</code> to
+plan and compute a single 1d transform, you would now use
+<code>fftw_plan_dft_1d</code> to plan the transform.  If you used the generic
+<code>fftw</code> function to execute the transform with multiplicity
+(<code>howmany</code>) and stride parameters, you would now use the advanced
+interface <code>fftw_plan_many_dft</code> to specify those parameters.  The
+plans are now executed with <code>fftw_execute(plan)</code>, which takes all
+of its parameters (including the input/output arrays) from the plan.
+
+   <p>In-place transforms no longer interpret their output argument as scratch
+space, nor is there an <code>FFTW_IN_PLACE</code> flag.  You simply pass the
+same pointer for both the input and output arguments.  (Previously, the
+output <code>ostride</code> and <code>odist</code> parameters were ignored for
+in-place transforms; now, if they are specified via the advanced
+interface, they are significant even in the in-place case, although they
+should normally equal the corresponding input parameters.)
+
+   <p>The <code>FFTW_ESTIMATE</code> and <code>FFTW_MEASURE</code> flags have the same
+meaning as before, although the planning time will differ.  You may also
+consider using <code>FFTW_PATIENT</code>, which is like <code>FFTW_MEASURE</code>
+except that it takes more time in order to consider a wider variety of
+algorithms.
+
+   <p>For multi-dimensional complex DFTs, instead of <code>fftwnd_create_plan</code>
+(or <code>fftw2d_create_plan</code> or <code>fftw3d_create_plan</code>), followed by
+<code>fftwnd_one</code>, you would use <code>fftw_plan_dft</code> (or
+<code>fftw_plan_dft_2d</code> or <code>fftw_plan_dft_3d</code>).  followed by
+<code>fftw_execute</code>.  If you used <code>fftwnd</code> to to specify strides
+etcetera, you would instead specify these via <code>fftw_plan_many_dft</code>.
+
+   <p>The analogues to <code>rfftw_create_plan</code> and <code>rfftw_one</code> with
+<code>FFTW_REAL_TO_COMPLEX</code> or <code>FFTW_COMPLEX_TO_REAL</code> directions
+are <code>fftw_plan_r2r_1d</code> with kind <code>FFTW_R2HC</code> or
+<code>FFTW_HC2R</code>, followed by <code>fftw_execute</code>.  The stride etcetera
+arguments of <code>rfftw</code> are now in <code>fftw_plan_many_r2r</code>.
+
+   <p>Instead of <code>rfftwnd_create_plan</code> (or <code>rfftw2d_create_plan</code> or
+<code>rfftw3d_create_plan</code>) followed by
+<code>rfftwnd_one_real_to_complex</code> or
+<code>rfftwnd_one_complex_to_real</code>, you now use <code>fftw_plan_dft_r2c</code>
+(or <code>fftw_plan_dft_r2c_2d</code> or <code>fftw_plan_dft_r2c_3d</code>) or
+<code>fftw_plan_dft_c2r</code> (or <code>fftw_plan_dft_c2r_2d</code> or
+<code>fftw_plan_dft_c2r_3d</code>), respectively, followed by
+<code>fftw_execute</code>.  As usual, the strides etcetera of
+<code>rfftwnd_real_to_complex</code> or <code>rfftwnd_complex_to_real</code> are no
+specified in the advanced planner routines,
+<code>fftw_plan_many_dft_r2c</code> or <code>fftw_plan_many_dft_c2r</code>.
+
+<h3 class="heading">Wisdom</h3>
+
+<p>In FFTW 2, you had to supply the <code>FFTW_USE_WISDOM</code> flag in order to
+use wisdom; in FFTW 3, wisdom is always used.  (You could simulate the
+FFTW 2 wisdom-less behavior by calling <code>fftw_forget_wisdom</code> after
+every planner call.)
+
+   <p>The FFTW 3 wisdom import/export routines are almost the same as before
+(although the storage format is entirely different).  There is one
+significant difference, however.  In FFTW 2, the import routines would
+never read past the end of the wisdom, so you could store extra data
+beyond the wisdom in the same file, for example.  In FFTW 3, the
+file-import routine may read up to a few hundred bytes past the end of
+the wisdom, so you cannot store other data just beyond it.<a rel="footnote" href="#fn-1" name="fnd-1"><sup>1</sup></a>
+
+   <p>Wisdom has been enhanced by additional humility in FFTW 3: whereas FFTW
+2 would re-use wisdom for a given transform size regardless of the
+stride etc., in FFTW 3 wisdom is only used with the strides etc. for
+which it was created.  Unfortunately, this means FFTW 3 has to create
+new plans from scratch more often than FFTW 2 (in FFTW 2, planning
+e.g. one transform of size 1024 also created wisdom for all smaller
+powers of 2, but this no longer occurs).
+
+   <p>FFTW 3 also has the new routine <code>fftw_import_system_wisdom</code> to
+import wisdom from a standard system-wide location.
+
+<h3 class="heading">Memory allocation</h3>
+
+<p>In FFTW 3, we recommend allocating your arrays with <code>fftw_malloc</code>
+and deallocating them with <code>fftw_free</code>; this is not required, but
+allows optimal performance when SIMD acceleration is used.  (Those two
+functions actually existed in FFTW 2, and worked the same way, but were
+not documented.)
+
+   <p>In FFTW 2, there were <code>fftw_malloc_hook</code> and <code>fftw_free_hook</code>
+functions that allowed the user to replace FFTW's memory-allocation
+routines (e.g. to implement different error-handling, since by default
+FFTW prints an error message and calls <code>exit</code> to abort the program
+if <code>malloc</code> returns <code>NULL</code>).  These hooks are not supported in
+FFTW 3; those few users who require this functionality can just
+directly modify the memory-allocation routines in FFTW (they are defined
+in <code>kernel/alloc.c</code>).
+
+<h3 class="heading">Fortran interface</h3>
+
+<p>In FFTW 2, the subroutine names were obtained by replacing &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo;
+with &lsquo;<samp><span class="samp">fftw_f77</span></samp>&rsquo;; in FFTW 3, you replace &lsquo;<samp><span class="samp">fftw_</span></samp>&rsquo; with
+&lsquo;<samp><span class="samp">dfftw_</span></samp>&rsquo; (or &lsquo;<samp><span class="samp">sfftw_</span></samp>&rsquo; or &lsquo;<samp><span class="samp">lfftw_</span></samp>&rsquo;, depending upon the
+precision).
+
+   <p>In FFTW 3, we have begun recommending that you always declare the type
+used to store plans as <code>integer*8</code>.  (Too many people didn't notice
+our instruction to switch from <code>integer</code> to <code>integer*8</code> for
+64-bit machines.)
+
+   <p>In FFTW 3, we provide a <code>fftw3.f</code> &ldquo;header file&rdquo; to include in
+your code (and which is officially installed on Unix systems).  (In FFTW
+2, we supplied a <code>fftw_f77.i</code> file, but it was not installed.)
+
+   <p>Otherwise, the C-Fortran interface relationship is much the same as it
+was before (e.g. return values become initial parameters, and
+multi-dimensional arrays are in column-major order).  Unlike FFTW 2, we
+do provide some support for wisdom import/export in Fortran
+(see <a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">Wisdom of Fortran?</a>).
+
+<h3 class="heading">Threads</h3>
+
+<p>Like FFTW 2, only the execution routines are thread-safe.  All planner
+routines, etcetera, should be called by only a single thread at a time
+(see <a href="Thread-safety.html#Thread-safety">Thread safety</a>).  <em>Unlike</em> FFTW 2, there is no special
+<code>FFTW_THREADSAFE</code> flag for the planner to allow a given plan to be
+usable by multiple threads in parallel; this is now the case by default.
+
+   <p>The multi-threaded version of FFTW 2 required you to pass the number of
+threads each time you execute the transform.  The number of threads is
+now stored in the plan, and is specified before the planner is called by
+<code>fftw_plan_with_nthreads</code>.  The threads initialization routine used
+to be called <code>fftw_threads_init</code> and would return zero on success;
+the new routine is called <code>fftw_init_threads</code> and returns zero on
+failure.  See <a href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>.
+
+   <p>There is no separate threads header file in FFTW 3; all the function
+prototypes are in <code>&lt;fftw3.h&gt;</code>.  However, you still have to link to
+a separate library (<code>-lfftw3_threads -lfftw3 -lm</code> on Unix), as well as
+to the threading library (e.g. POSIX threads on Unix).
+
+   <div class="footnote">
+<hr>
+<h4>Footnotes</h4><p class="footnote"><small>[<a name="fn-1" href="#fnd-1">1</a>]</small> We
+do our own buffering because GNU libc I/O routines are horribly slow for
+single-character I/O, apparently for thread-safety reasons (whether you
+are using threads or not).</p>
+
+   <hr></div>
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Usage-of-Multi_002dthreaded-FFTW.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Usage-of-Multi_002dthreaded-FFTW.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,132 @@
+<html lang="en">
+<head>
+<title>Usage of Multi-threaded FFTW - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW" title="Multi-threaded FFTW">
+<link rel="prev" href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware" title="Installation and Supported Hardware/Software">
+<link rel="next" href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f" title="How Many Threads to Use?">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Usage-of-Multi-threaded-FFTW"></a>
+<a name="Usage-of-Multi_002dthreaded-FFTW"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">How Many Threads to Use?</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">Installation and Supported Hardware/Software</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>
+<hr>
+</div>
+
+<h3 class="section">5.2 Usage of Multi-threaded FFTW</h3>
+
+<p>Here, it is assumed that the reader is already familiar with the usage
+of the uniprocessor FFTW routines, described elsewhere in this manual. 
+We only describe what one has to change in order to use the
+multi-threaded routines.
+
+   <p><a name="index-OpenMP-332"></a>First, programs using the parallel complex transforms should be linked
+with <code>-lfftw3_threads -lfftw3 -lm</code> on Unix, or <code>-lfftw3_omp
+-lfftw3 -lm</code> if you compiled with OpenMP. You will also need to link
+with whatever library is responsible for threads on your system
+(e.g. <code>-lpthread</code> on GNU/Linux) or include whatever compiler flag
+enables OpenMP (e.g. <code>-fopenmp</code> with gcc). 
+<a name="index-linking-on-Unix-333"></a>
+
+   <p>Second, before calling <em>any</em> FFTW routines, you should call the
+function:
+
+<pre class="example">     int fftw_init_threads(void);
+</pre>
+   <p><a name="index-fftw_005finit_005fthreads-334"></a>
+This function, which need only be called once, performs any one-time
+initialization required to use threads on your system.  It returns zero
+if there was some error (which should not happen under normal
+circumstances) and a non-zero value otherwise.
+
+   <p>Third, before creating a plan that you want to parallelize, you should
+call:
+
+<pre class="example">     void fftw_plan_with_nthreads(int nthreads);
+</pre>
+   <p><a name="index-fftw_005fplan_005fwith_005fnthreads-335"></a>
+The <code>nthreads</code> argument indicates the number of threads you want
+FFTW to use (or actually, the maximum number).  All plans subsequently
+created with any planner routine will use that many threads.  You can
+call <code>fftw_plan_with_nthreads</code>, create some plans, call
+<code>fftw_plan_with_nthreads</code> again with a different argument, and
+create some more plans for a new number of threads.  Plans already created
+before a call to <code>fftw_plan_with_nthreads</code> are unaffected.  If you
+pass an <code>nthreads</code> argument of <code>1</code> (the default), threads are
+disabled for subsequent plans.
+
+   <p><a name="index-OpenMP-336"></a>With OpenMP, to configure FFTW to use all of the currently running
+OpenMP threads (set by <code>omp_set_num_threads(nthreads)</code> or by the
+<code>OMP_NUM_THREADS</code> environment variable), you can do:
+<code>fftw_plan_with_nthreads(omp_get_max_threads())</code>. (The &lsquo;<samp><span class="samp">omp_</span></samp>&rsquo;
+OpenMP functions are declared via <code>#include &lt;omp.h&gt;</code>.)
+
+   <p><a name="index-thread-safety-337"></a>Given a plan, you then execute it as usual with
+<code>fftw_execute(plan)</code>, and the execution will use the number of
+threads specified when the plan was created.  When done, you destroy
+it as usual with <code>fftw_destroy_plan</code>.  As described in
+<a href="Thread-safety.html#Thread-safety">Thread safety</a>, plan <em>execution</em> is thread-safe, but plan
+creation and destruction are <em>not</em>: you should create/destroy
+plans only from a single thread, but can safely execute multiple plans
+in parallel.
+
+   <p>There is one additional routine: if you want to get rid of all memory
+and other resources allocated internally by FFTW, you can call:
+
+<pre class="example">     void fftw_cleanup_threads(void);
+</pre>
+   <p><a name="index-fftw_005fcleanup_005fthreads-338"></a>
+which is much like the <code>fftw_cleanup()</code> function except that it
+also gets rid of threads-related data.  You must <em>not</em> execute any
+previously created plans after calling this function.
+
+   <p>We should also mention one other restriction: if you save wisdom from a
+program using the multi-threaded FFTW, that wisdom <em>cannot be used</em>
+by a program using only the single-threaded FFTW (i.e. not calling
+<code>fftw_init_threads</code>).  See <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Using-MPI-Plans.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Using-MPI-Plans.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+<html lang="en">
+<head>
+<title>Using MPI Plans - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference" title="FFTW MPI Reference">
+<link rel="prev" href="MPI-Initialization.html#MPI-Initialization" title="MPI Initialization">
+<link rel="next" href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions" title="MPI Data Distribution Functions">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Using-MPI-Plans"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">MPI Data Distribution Functions</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="MPI-Initialization.html#MPI-Initialization">MPI Initialization</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">FFTW MPI Reference</a>
+<hr>
+</div>
+
+<h4 class="subsection">6.12.3 Using MPI Plans</h4>
+
+<p>Once an MPI plan is created, you can execute and destroy it using
+<code>fftw_execute</code>, <code>fftw_destroy_plan</code>, and the other functions
+in the serial interface that operate on generic plans (see <a href="Using-Plans.html#Using-Plans">Using Plans</a>).
+
+   <p><a name="index-collective-function-438"></a><a name="index-MPI-communicator-439"></a>The <code>fftw_execute</code> and <code>fftw_destroy_plan</code> functions, applied to
+MPI plans, are <em>collective</em> calls: they must be called for all processes
+in the communicator that was used to create the plan.
+
+   <p><a name="index-new_002darray-execution-440"></a>You must <em>not</em> use the serial new-array plan-execution functions
+<code>fftw_execute_dft</code> and so on (see <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>) with MPI plans.  Such functions are specialized to the
+problem type, and there are specific new-array execute functions for MPI plans:
+
+   <p><a name="index-fftw_005fmpi_005fexecute_005fdft-441"></a><a name="index-fftw_005fmpi_005fexecute_005fdft_005fr2c-442"></a><a name="index-fftw_005fmpi_005fexecute_005fdft_005fc2r-443"></a><a name="index-fftw_005fmpi_005fexecute_005fr2r-444"></a>
+<pre class="example">     void fftw_mpi_execute_dft(fftw_plan p, fftw_complex *in, fftw_complex *out);
+     void fftw_mpi_execute_dft_r2c(fftw_plan p, double *in, fftw_complex *out);
+     void fftw_mpi_execute_dft_c2r(fftw_plan p, fftw_complex *in, double *out);
+     void fftw_mpi_execute_r2r(fftw_plan p, double *in, double *out);
+</pre>
+   <p><a name="index-alignment-445"></a><a name="index-fftw_005fmalloc-446"></a>These functions have the same restrictions as those of the serial
+new-array execute functions.  They are <em>always</em> safe to apply to
+the <em>same</em> <code>in</code> and <code>out</code> arrays that were used to
+create the plan.  They can only be applied to new arrarys if those
+arrays have the same types, dimensions, in-placeness, and alignment as
+the original arrays, where the best way to ensure the same alignment
+is to use FFTW's <code>fftw_malloc</code> and related allocation functions
+for all arrays (see <a href="Memory-Allocation.html#Memory-Allocation">Memory Allocation</a>).  Note that distributed
+transposes (see <a href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">FFTW MPI Transposes</a>) use
+<code>fftw_mpi_execute_r2r</code>, since they count as rank-zero r2r plans
+from FFTW's perspective.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Using-Plans.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Using-Plans.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,147 @@
+<html lang="en">
+<head>
+<title>Using Plans - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Data-Types-and-Files.html#Data-Types-and-Files" title="Data Types and Files">
+<link rel="next" href="Basic-Interface.html#Basic-Interface" title="Basic Interface">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Using-Plans"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Basic-Interface.html#Basic-Interface">Basic Interface</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Data-Types-and-Files.html#Data-Types-and-Files">Data Types and Files</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.2 Using Plans</h3>
+
+<p>Plans for all transform types in FFTW are stored as type
+<code>fftw_plan</code> (an opaque pointer type), and are created by one of the
+various planning routines described in the following sections. 
+<a name="index-fftw_005fplan-152"></a>An <code>fftw_plan</code> contains all information necessary to compute the
+transform, including the pointers to the input and output arrays.
+
+<pre class="example">     void fftw_execute(const fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005fexecute-153"></a>
+This executes the <code>plan</code>, to compute the corresponding transform on
+the arrays for which it was planned (which must still exist).  The plan
+is not modified, and <code>fftw_execute</code> can be called as many times as
+desired.
+
+   <p>To apply a given plan to a different array, you can use the new-array execute
+interface.  See <a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>.
+
+   <p><code>fftw_execute</code> (and equivalents) is the only function in FFTW
+guaranteed to be thread-safe; see <a href="Thread-safety.html#Thread-safety">Thread safety</a>.
+
+   <p>This function:
+<pre class="example">     void fftw_destroy_plan(fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005fdestroy_005fplan-154"></a>deallocates the <code>plan</code> and all its associated data.
+
+   <p>FFTW's planner saves some other persistent data, such as the
+accumulated wisdom and a list of algorithms available in the current
+configuration.  If you want to deallocate all of that and reset FFTW
+to the pristine state it was in when you started your program, you can
+call:
+
+<pre class="example">     void fftw_cleanup(void);
+</pre>
+   <p><a name="index-fftw_005fcleanup-155"></a>
+After calling <code>fftw_cleanup</code>, all existing plans become undefined,
+and you should not attempt to execute them nor to destroy them.  You can
+however create and execute/destroy new plans, in which case FFTW starts
+accumulating wisdom information again.
+
+   <p><code>fftw_cleanup</code> does not deallocate your plans, however.  To prevent
+memory leaks, you must still call <code>fftw_destroy_plan</code> before
+executing <code>fftw_cleanup</code>.
+
+   <p>Occasionally, it may useful to know FFTW's internal &ldquo;cost&rdquo; metric
+that it uses to compare plans to one another; this cost is
+proportional to an execution time of the plan, in undocumented units,
+if the plan was created with the <code>FFTW_MEASURE</code> or other
+timing-based options, or alternatively is a heuristic cost function
+for <code>FFTW_ESTIMATE</code> plans.  (The cost values of measured and
+estimated plans are not comparable, being in different units.  Also,
+costs from different FFTW versions or the same version compiled
+differently may not be in the same units.  Plans created from wisdom
+have a cost of 0 since no timing measurement is performed for them. 
+Finally, certain problems for which only one top-level algorithm was
+possible may have required no measurements of the cost of the whole
+plan, in which case <code>fftw_cost</code> will also return 0.)  The cost
+metric for a given plan is returned by:
+
+<pre class="example">     double fftw_cost(const fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005fcost-156"></a>
+The following two routines are provided purely for academic purposes
+(that is, for entertainment).
+
+<pre class="example">     void fftw_flops(const fftw_plan plan,
+                     double *add, double *mul, double *fma);
+</pre>
+   <p><a name="index-fftw_005fflops-157"></a>
+Given a <code>plan</code>, set <code>add</code>, <code>mul</code>, and <code>fma</code> to an
+exact count of the number of floating-point additions, multiplications,
+and fused multiply-add operations involved in the plan's execution.  The
+total number of floating-point operations (flops) is <code>add + mul +
+2*fma</code>, or <code>add + mul + fma</code> if the hardware supports fused
+multiply-add instructions (although the number of FMA operations is only
+approximate because of compiler voodoo).  (The number of operations
+should be an integer, but we use <code>double</code> to avoid overflowing
+<code>int</code> for large transforms; the arguments are of type <code>double</code>
+even for single and long-double precision versions of FFTW.)
+
+<pre class="example">     void fftw_fprint_plan(const fftw_plan plan, FILE *output_file);
+     void fftw_print_plan(const fftw_plan plan);
+</pre>
+   <p><a name="index-fftw_005ffprint_005fplan-158"></a><a name="index-fftw_005fprint_005fplan-159"></a>
+This outputs a &ldquo;nerd-readable&rdquo; representation of the <code>plan</code> to
+the given file or to <code>stdout</code>, respectively.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/What-FFTW-Really-Computes.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/What-FFTW-Really-Computes.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,76 @@
+<html lang="en">
+<head>
+<title>What FFTW Really Computes - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="Wisdom.html#Wisdom" title="Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="What-FFTW-Really-Computes"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom.html#Wisdom">Wisdom</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.8 What FFTW Really Computes</h3>
+
+<p>In this section, we provide precise mathematical definitions for the
+transforms that FFTW computes.  These transform definitions are fairly
+standard, but some authors follow slightly different conventions for the
+normalization of the transform (the constant factor in front) and the
+sign of the complex exponent.  We begin by presenting the
+one-dimensional (1d) transform definitions, and then give the
+straightforward extension to multi-dimensional transforms.
+
+<ul class="menu">
+<li><a accesskey="1" href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">The 1d Discrete Fourier Transform (DFT)</a>
+<li><a accesskey="2" href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">The 1d Real-data DFT</a>
+<li><a accesskey="3" href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#g_t1d-Real_002deven-DFTs-_0028DCTs_0029">1d Real-even DFTs (DCTs)</a>
+<li><a accesskey="4" href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#g_t1d-Real_002dodd-DFTs-_0028DSTs_0029">1d Real-odd DFTs (DSTs)</a>
+<li><a accesskey="5" href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#g_t1d-Discrete-Hartley-Transforms-_0028DHTs_0029">1d Discrete Hartley Transforms (DHTs)</a>
+<li><a accesskey="6" href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms">Multi-dimensional Transforms</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-Export.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-Export.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+<html lang="en">
+<head>
+<title>Wisdom Export - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Wisdom.html#Wisdom" title="Wisdom">
+<link rel="prev" href="Wisdom.html#Wisdom" title="Wisdom">
+<link rel="next" href="Wisdom-Import.html#Wisdom-Import" title="Wisdom Import">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-Export"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom.html#Wisdom">Wisdom</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Wisdom.html#Wisdom">Wisdom</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.7.1 Wisdom Export</h4>
+
+<pre class="example">     int fftw_export_wisdom_to_filename(const char *filename);
+     void fftw_export_wisdom_to_file(FILE *output_file);
+     char *fftw_export_wisdom_to_string(void);
+     void fftw_export_wisdom(void (*write_char)(char c, void *), void *data);
+</pre>
+   <p><a name="index-fftw_005fexport_005fwisdom-278"></a><a name="index-fftw_005fexport_005fwisdom_005fto_005ffilename-279"></a><a name="index-fftw_005fexport_005fwisdom_005fto_005ffile-280"></a><a name="index-fftw_005fexport_005fwisdom_005fto_005fstring-281"></a>
+These functions allow you to export all currently accumulated wisdom
+in a form from which it can be later imported and restored, even
+during a separate run of the program. (See <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.)  The current store of wisdom is not affected by calling any
+of these routines.
+
+   <p><code>fftw_export_wisdom</code> exports the wisdom to any output
+medium, as specified by the callback function
+<code>write_char</code>. <code>write_char</code> is a <code>putc</code>-like function that
+writes the character <code>c</code> to some output; its second parameter is
+the <code>data</code> pointer passed to <code>fftw_export_wisdom</code>.  For
+convenience, the following three &ldquo;wrapper&rdquo; routines are provided:
+
+   <p><code>fftw_export_wisdom_to_filename</code> writes wisdom to a file named
+<code>filename</code> (which is created or overwritten), returning <code>1</code>
+on success and <code>0</code> on failure.  A lower-level function, which
+requires you to open and close the file yourself (e.g. if you want to
+write wisdom to a portion of a larger file) is
+<code>fftw_export_wisdom_to_file</code>.  This writes the wisdom to the
+current position in <code>output_file</code>, which should be open with
+write permission; upon exit, the file remains open and is positioned
+at the end of the wisdom data.
+
+   <p><code>fftw_export_wisdom_to_string</code> returns a pointer to a
+<code>NULL</code>-terminated string holding the wisdom data. This string is
+dynamically allocated, and it is the responsibility of the caller to
+deallocate it with <code>free</code> when it is no longer needed.
+
+   <p>All of these routines export the wisdom in the same format, which we
+will not document here except to say that it is LISP-like ASCII text
+that is insensitive to white space.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-File-Export_002fImport-from-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-File-Export_002fImport-from-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+<html lang="en">
+<head>
+<title>Wisdom File Export/Import from Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link rel="prev" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link rel="next" href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran" title="Wisdom String Export/Import from Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-File-Export%2fImport-from-Fortran"></a>
+<a name="Wisdom-File-Export_002fImport-from-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>
+<hr>
+</div>
+
+<h4 class="subsection">7.6.1 Wisdom File Export/Import from Fortran</h4>
+
+<p><a name="index-fftw_005fimport-wisdom_005ffrom_005ffilename-566"></a><a name="index-fftw_005fexport_005fwisdom_005fto_005ffilename-567"></a>The easiest way to export and import wisdom is to do so using
+<code>fftw_export_wisdom_to_filename</code> and
+<code>fftw_wisdom_from_filename</code>.  The only trick is that these
+require you to pass a C string, which is an array of type
+<code>CHARACTER(C_CHAR)</code> that is terminated by <code>C_NULL_CHAR</code>. 
+You can call them like this:
+
+<pre class="example">       integer(C_INT) :: ret
+       ret = fftw_export_wisdom_to_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+       if (ret .eq. 0) stop 'error exporting wisdom to file'
+       ret = fftw_import_wisdom_from_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+       if (ret .eq. 0) stop 'error importing wisdom from file'
+</pre>
+   <p>Note that prepending &lsquo;<samp><span class="samp">C_CHAR_</span></samp>&rsquo; is needed to specify that the
+literal string is of kind <code>C_CHAR</code>, and we null-terminate the
+string by appending &lsquo;<samp><span class="samp">// C_NULL_CHAR</span></samp>&rsquo;.  These functions return an
+<code>integer(C_INT)</code> (<code>ret</code>) which is <code>0</code> if an error
+occurred during export/import and nonzero otherwise.
+
+   <p>It is also possible to use the lower-level routines
+<code>fftw_export_wisdom_to_file</code> and
+<code>fftw_import_wisdom_from_file</code>, which accept parameters of the C
+type <code>FILE*</code>, expressed in Fortran as <code>type(C_PTR)</code>. 
+However, you are then responsible for creating the <code>FILE*</code>
+yourself.  You can do this by using <code>iso_c_binding</code> to define
+Fortran intefaces for the C library functions <code>fopen</code> and
+<code>fclose</code>, which is a bit strange in Fortran but workable.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-Generic-Export_002fImport-from-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-Generic-Export_002fImport-from-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+<html lang="en">
+<head>
+<title>Wisdom Generic Export/Import from Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link rel="prev" href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran" title="Wisdom String Export/Import from Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-Generic-Export%2fImport-from-Fortran"></a>
+<a name="Wisdom-Generic-Export_002fImport-from-Fortran"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">Wisdom String Export/Import from Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>
+<hr>
+</div>
+
+<h4 class="subsection">7.6.3 Wisdom Generic Export/Import from Fortran</h4>
+
+<p>The most generic wisdom export/import functions allow you to provide
+an arbitrary callback function to read/write one character at a time
+in any way you want.  However, your callback function must be written
+in a special way, using the <code>bind(C)</code> attribute to be passed to a
+C interface.
+
+   <p><a name="index-fftw_005fexport_005fwisdom-572"></a>In particular, to call the generic wisdom export function
+<code>fftw_export_wisdom</code>, you would write a callback subroutine of the form:
+
+<pre class="example">       subroutine my_write_char(c, p) bind(C)
+         use, intrinsic :: iso_c_binding
+         character(C_CHAR), value :: c
+         type(C_PTR), value :: p
+         <em>...write c...</em>
+       end subroutine my_write_char
+</pre>
+   <p>Given such a subroutine (along with the corresponding interface definition), you could then export wisdom using:
+
+   <p><a name="index-c_005ffunloc-573"></a>
+<pre class="example">       call fftw_export_wisdom(c_funloc(my_write_char), p)
+</pre>
+   <p><a name="index-c_005floc-574"></a><a name="index-c_005ff_005fpointer-575"></a>The standard <code>c_funloc</code> intrinsic converts a Fortran
+<code>bind(C)</code> subroutine into a C function pointer.  The parameter
+<code>p</code> is a <code>type(C_PTR)</code> to any arbitrary data that you want
+to pass to <code>my_write_char</code> (or <code>C_NULL_PTR</code> if none).  (Note
+that you can get a C pointer to Fortran data using the intrinsic
+<code>c_loc</code>, and convert it back to a Fortran pointer in
+<code>my_write_char</code> using <code>c_f_pointer</code>.)
+
+   <p>Similarly, to use the generic <code>fftw_import_wisdom</code>, you would
+define a callback function of the form:
+
+   <p><a name="index-fftw_005fimport_005fwisdom-576"></a>
+<pre class="example">       integer(C_INT) function my_read_char(p) bind(C)
+         use, intrinsic :: iso_c_binding
+         type(C_PTR), value :: p
+         character :: c
+         <em>...read a character c...</em>
+         my_read_char = ichar(c, C_INT)
+       end function my_read_char
+     
+       ....
+     
+       integer(C_INT) :: ret
+       ret = fftw_import_wisdom(c_funloc(my_read_char), p)
+       if (ret .eq. 0) stop 'error importing wisdom'
+</pre>
+   <p>Your function can return <code>-1</code> if the end of the input is reached. 
+Again, <code>p</code> is an arbitrary <code>type(C_PTR</code> that is passed
+through to your function.  <code>fftw_import_wisdom</code> returns <code>0</code>
+if an error occurred and nonzero otherwise.
+
+<!--  -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-Import.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-Import.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,101 @@
+<html lang="en">
+<head>
+<title>Wisdom Import - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Wisdom.html#Wisdom" title="Wisdom">
+<link rel="prev" href="Wisdom-Export.html#Wisdom-Export" title="Wisdom Export">
+<link rel="next" href="Forgetting-Wisdom.html#Forgetting-Wisdom" title="Forgetting Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-Import"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Forgetting-Wisdom.html#Forgetting-Wisdom">Forgetting Wisdom</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Wisdom.html#Wisdom">Wisdom</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.7.2 Wisdom Import</h4>
+
+<pre class="example">     int fftw_import_system_wisdom(void);
+     int fftw_import_wisdom_from_filename(const char *filename);
+     int fftw_import_wisdom_from_string(const char *input_string);
+     int fftw_import_wisdom(int (*read_char)(void *), void *data);
+</pre>
+   <p><a name="index-fftw_005fimport_005fwisdom-282"></a><a name="index-fftw_005fimport_005fsystem_005fwisdom-283"></a><a name="index-fftw_005fimport_005fwisdom_005ffrom_005ffilename-284"></a><a name="index-fftw_005fimport_005fwisdom_005ffrom_005ffile-285"></a><a name="index-fftw_005fimport_005fwisdom_005ffrom_005fstring-286"></a>
+These functions import wisdom into a program from data stored by the
+<code>fftw_export_wisdom</code> functions above. (See <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.)  The imported wisdom replaces any wisdom
+already accumulated by the running program.
+
+   <p><code>fftw_import_wisdom</code> imports wisdom from any input medium, as
+specified by the callback function <code>read_char</code>. <code>read_char</code> is
+a <code>getc</code>-like function that returns the next character in the
+input; its parameter is the <code>data</code> pointer passed to
+<code>fftw_import_wisdom</code>. If the end of the input data is reached
+(which should never happen for valid data), <code>read_char</code> should
+return <code>EOF</code> (as defined in <code>&lt;stdio.h&gt;</code>).  For convenience,
+the following three &ldquo;wrapper&rdquo; routines are provided:
+
+   <p><code>fftw_import_wisdom_from_filename</code> reads wisdom from a file named
+<code>filename</code>.  A lower-level function, which requires you to open
+and close the file yourself (e.g. if you want to read wisdom from a
+portion of a larger file) is <code>fftw_import_wisdom_from_file</code>. This
+reads wisdom from the current position in <code>input_file</code> (which
+should be open with read permission); upon exit, the file remains
+open, but the position of the read pointer is unspecified.
+
+   <p><code>fftw_import_wisdom_from_string</code> reads wisdom from the
+<code>NULL</code>-terminated string <code>input_string</code>.
+
+   <p><code>fftw_import_system_wisdom</code> reads wisdom from an
+implementation-defined standard file (<code>/etc/fftw/wisdom</code> on Unix
+and GNU systems). 
+<a name="index-wisdom_002c-system_002dwide-287"></a>
+
+   <p>The return value of these import routines is <code>1</code> if the wisdom was
+read successfully and <code>0</code> otherwise. Note that, in all of these
+functions, any data in the input stream past the end of the wisdom data
+is simply ignored.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-String-Export_002fImport-from-Fortran.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-String-Export_002fImport-from-Fortran.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,106 @@
+<html lang="en">
+<head>
+<title>Wisdom String Export/Import from Fortran - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran" title="Accessing the wisdom API from Fortran">
+<link rel="prev" href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran" title="Wisdom File Export/Import from Fortran">
+<link rel="next" href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran" title="Wisdom Generic Export/Import from Fortran">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-String-Export%2fImport-from-Fortran"></a>
+<a name="Wisdom-String-Export_002fImport-from-Fortran"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">Wisdom Generic Export/Import from Fortran</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran">Wisdom File Export/Import from Fortran</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">Accessing the wisdom API from Fortran</a>
+<hr>
+</div>
+
+<h4 class="subsection">7.6.2 Wisdom String Export/Import from Fortran</h4>
+
+<p><a name="index-fftw_005fexport_005fwisdom_005fto_005fstring-568"></a>Dealing with FFTW's C string export/import is a bit more painful.  In
+particular, the <code>fftw_export_wisdom_to_string</code> function requires
+you to deal with a dynamically allocated C string.  To get its length,
+you must define an interface to the C <code>strlen</code> function, and to
+deallocate it you must define an interface to C <code>free</code>:
+
+<pre class="example">       use, intrinsic :: iso_c_binding
+       interface
+         integer(C_INT) function strlen(s) bind(C, name='strlen')
+           import
+           type(C_PTR), value :: s
+         end function strlen
+         subroutine free(p) bind(C, name='free')
+           import
+           type(C_PTR), value :: p
+         end subroutine free
+       end interface
+</pre>
+   <p>Given these definitions, you can then export wisdom to a Fortran
+character array:
+
+<pre class="example">       character(C_CHAR), pointer :: s(:)
+       integer(C_SIZE_T) :: slen
+       type(C_PTR) :: p
+       p = fftw_export_wisdom_to_string()
+       if (.not. c_associated(p)) stop 'error exporting wisdom'
+       slen = strlen(p)
+       call c_f_pointer(p, s, [slen+1])
+       ...
+       call free(p)
+</pre>
+   <p><a name="index-c_005fassociated-569"></a><a name="index-c_005ff_005fpointer-570"></a>
+Note that <code>slen</code> is the length of the C string, but the length of
+the array is <code>slen+1</code> because it includes the terminating null
+character.  (You can omit the &lsquo;<samp><span class="samp">+1</span></samp>&rsquo; if you don't want Fortran to
+know about the null character.) The standard <code>c_associated</code> function
+checks whether <code>p</code> is a null pointer, which is returned by
+<code>fftw_export_wisdom_to_string</code> if there was an error.
+
+   <p><a name="index-fftw_005fimport_005fwisdom_005ffrom_005fstring-571"></a>To import wisdom from a string, use
+<code>fftw_import_wisdom_from_string</code> as usual; note that the argument
+of this function must be a <code>character(C_CHAR)</code> that is terminated
+by the <code>C_NULL_CHAR</code> character, like the <code>s</code> array above.
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-Utilities.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-Utilities.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,83 @@
+<html lang="en">
+<head>
+<title>Wisdom Utilities - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Wisdom.html#Wisdom" title="Wisdom">
+<link rel="prev" href="Forgetting-Wisdom.html#Forgetting-Wisdom" title="Forgetting Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-Utilities"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Forgetting-Wisdom.html#Forgetting-Wisdom">Forgetting Wisdom</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Wisdom.html#Wisdom">Wisdom</a>
+<hr>
+</div>
+
+<h4 class="subsection">4.7.4 Wisdom Utilities</h4>
+
+<p>FFTW includes two standalone utility programs that deal with wisdom.  We
+merely summarize them here, since they come with their own <code>man</code>
+pages for Unix and GNU systems (with HTML versions on our web site).
+
+   <p>The first program is <code>fftw-wisdom</code> (or <code>fftwf-wisdom</code> in
+single precision, etcetera), which can be used to create a wisdom file
+containing plans for any of the transform sizes and types supported by
+FFTW.  It is preferable to create wisdom directly from your executable
+(see <a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a>), but this program is useful for
+creating global wisdom files for <code>fftw_import_system_wisdom</code>. 
+<a name="index-fftw_002dwisdom-utility-289"></a>
+
+   <p>The second program is <code>fftw-wisdom-to-conf</code>, which takes a wisdom
+file as input and produces a <dfn>configuration routine</dfn> as output.  The
+latter is a C subroutine that you can compile and link into your
+program, replacing a routine of the same name in the FFTW library, that
+determines which parts of FFTW are callable by your program. 
+<code>fftw-wisdom-to-conf</code> produces a configuration routine that links
+to only those parts of FFTW needed by the saved plans in the wisdom,
+greatly reducing the size of statically linked executables (which should
+only attempt to create plans corresponding to those in the wisdom,
+however). 
+<a name="index-fftw_002dwisdom_002dto_002dconf-utility-290"></a><a name="index-configuration-routines-291"></a>
+<!--  -->
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom-of-Fortran_003f.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom-of-Fortran_003f.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,91 @@
+<html lang="en">
+<head>
+<title>Wisdom of Fortran? - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran" title="Calling FFTW from Legacy Fortran">
+<link rel="prev" href="Fortran-Examples.html#Fortran-Examples" title="Fortran Examples">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom-of-Fortran%3f"></a>
+<a name="Wisdom-of-Fortran_003f"></a>
+<p>
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Fortran-Examples.html#Fortran-Examples">Fortran Examples</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<hr>
+</div>
+
+<h3 class="section">8.5 Wisdom of Fortran?</h3>
+
+<p>In this section, we discuss how one can import/export FFTW wisdom
+(saved plans) to/from a Fortran program; we assume that the reader is
+already familiar with wisdom, as described in <a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">Words of Wisdom-Saving Plans</a>.
+
+   <p><a name="index-portability-598"></a>The basic problem is that is difficult to (portably) pass files and
+strings between Fortran and C, so we cannot provide a direct Fortran
+equivalent to the <code>fftw_export_wisdom_to_file</code>, etcetera,
+functions.  Fortran interfaces <em>are</em> provided for the functions
+that do not take file/string arguments, however:
+<code>dfftw_import_system_wisdom</code>, <code>dfftw_import_wisdom</code>,
+<code>dfftw_export_wisdom</code>, and <code>dfftw_forget_wisdom</code>. 
+<a name="index-dfftw_005fimport_005fsystem_005fwisdom-599"></a><a name="index-dfftw_005fimport_005fwisdom-600"></a><a name="index-dfftw_005fexport_005fwisdom-601"></a><a name="index-dfftw_005fforget_005fwisdom-602"></a>
+
+   <p>So, for example, to import the system-wide wisdom, you would do:
+
+<pre class="example">             integer isuccess
+             call dfftw_import_system_wisdom(isuccess)
+</pre>
+   <p>As usual, the C return value is turned into a first parameter;
+<code>isuccess</code> is non-zero on success and zero on failure (e.g. if
+there is no system wisdom installed).
+
+   <p>If you want to import/export wisdom from/to an arbitrary file or
+elsewhere, you can employ the generic <code>dfftw_import_wisdom</code> and
+<code>dfftw_export_wisdom</code> functions, for which you must supply a
+subroutine to read/write one character at a time.  The FFTW package
+contains an example file <code>doc/f77_wisdom.f</code> demonstrating how to
+implement <code>import_wisdom_from_file</code> and
+<code>export_wisdom_to_file</code> subroutines in this way.  (These routines
+cannot be compiled into the FFTW library itself, lest all FFTW-using
+programs be required to link with the Fortran I/O library.)
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Wisdom.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Wisdom.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,72 @@
+<html lang="en">
+<head>
+<title>Wisdom - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="FFTW-Reference.html#FFTW-Reference" title="FFTW Reference">
+<link rel="prev" href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions" title="New-array Execute Functions">
+<link rel="next" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes" title="What FFTW Really Computes">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Wisdom"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">What FFTW Really Computes</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">New-array Execute Functions</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<hr>
+</div>
+
+<h3 class="section">4.7 Wisdom</h3>
+
+<p><a name="index-wisdom-276"></a><a name="index-saving-plans-to-disk-277"></a>
+This section documents the FFTW mechanism for saving and restoring
+plans from disk.  This mechanism is called <dfn>wisdom</dfn>.
+
+<ul class="menu">
+<li><a accesskey="1" href="Wisdom-Export.html#Wisdom-Export">Wisdom Export</a>
+<li><a accesskey="2" href="Wisdom-Import.html#Wisdom-Import">Wisdom Import</a>
+<li><a accesskey="3" href="Forgetting-Wisdom.html#Forgetting-Wisdom">Forgetting Wisdom</a>
+<li><a accesskey="4" href="Wisdom-Utilities.html#Wisdom-Utilities">Wisdom Utilities</a>
+</ul>
+
+<!-- =========> -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/Words-of-Wisdom_002dSaving-Plans.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/Words-of-Wisdom_002dSaving-Plans.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,113 @@
+<html lang="en">
+<head>
+<title>Words of Wisdom-Saving Plans - FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="index.html#Top">
+<link rel="up" href="Other-Important-Topics.html#Other-Important-Topics" title="Other Important Topics">
+<link rel="prev" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format" title="Multi-dimensional Array Format">
+<link rel="next" href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom" title="Caveats in Using Wisdom">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<div class="node">
+<a name="Words-of-Wisdom-Saving-Plans"></a>
+<a name="Words-of-Wisdom_002dSaving-Plans"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">Caveats in Using Wisdom</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">Multi-dimensional Array Format</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>
+<hr>
+</div>
+
+<h3 class="section">3.3 Words of Wisdom&mdash;Saving Plans</h3>
+
+<p><a name="index-wisdom-124"></a><a name="index-saving-plans-to-disk-125"></a>
+FFTW implements a method for saving plans to disk and restoring them. 
+In fact, what FFTW does is more general than just saving and loading
+plans.  The mechanism is called <dfn>wisdom</dfn>.  Here, we describe
+this feature at a high level. See <a href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>, for a less casual
+but more complete discussion of how to use wisdom in FFTW.
+
+   <p>Plans created with the <code>FFTW_MEASURE</code>, <code>FFTW_PATIENT</code>, or
+<code>FFTW_EXHAUSTIVE</code> options produce near-optimal FFT performance,
+but may require a long time to compute because FFTW must measure the
+runtime of many possible plans and select the best one.  This setup is
+designed for the situations where so many transforms of the same size
+must be computed that the start-up time is irrelevant.  For short
+initialization times, but slower transforms, we have provided
+<code>FFTW_ESTIMATE</code>.  The <code>wisdom</code> mechanism is a way to get the
+best of both worlds: you compute a good plan once, save it to
+disk, and later reload it as many times as necessary.  The wisdom
+mechanism can actually save and reload many plans at once, not just
+one. 
+<a name="index-FFTW_005fMEASURE-126"></a><a name="index-FFTW_005fPATIENT-127"></a><a name="index-FFTW_005fEXHAUSTIVE-128"></a><a name="index-FFTW_005fESTIMATE-129"></a>
+
+   <p>Whenever you create a plan, the FFTW planner accumulates wisdom, which
+is information sufficient to reconstruct the plan.  After planning,
+you can save this information to disk by means of the function:
+<pre class="example">     int fftw_export_wisdom_to_filename(const char *filename);
+</pre>
+   <p><a name="index-fftw_005fexport_005fwisdom_005fto_005ffilename-130"></a>(This function returns non-zero on success.)
+
+   <p>The next time you run the program, you can restore the wisdom with
+<code>fftw_import_wisdom_from_filename</code> (which also returns non-zero on success),
+and then recreate the plan using the same flags as before.
+<pre class="example">     int fftw_import_wisdom_from_filename(const char *filename);
+</pre>
+   <p><a name="index-fftw_005fimport_005fwisdom_005ffrom_005ffilename-131"></a>
+Wisdom is automatically used for any size to which it is applicable, as
+long as the planner flags are not more &ldquo;patient&rdquo; than those with which
+the wisdom was created.  For example, wisdom created with
+<code>FFTW_MEASURE</code> can be used if you later plan with
+<code>FFTW_ESTIMATE</code> or <code>FFTW_MEASURE</code>, but not with
+<code>FFTW_PATIENT</code>.
+
+   <p>The <code>wisdom</code> is cumulative, and is stored in a global, private
+data structure managed internally by FFTW.  The storage space required
+is minimal, proportional to the logarithm of the sizes the wisdom was
+generated from.  If memory usage is a concern, however, the wisdom can
+be forgotten and its associated memory freed by calling:
+<pre class="example">     void fftw_forget_wisdom(void);
+</pre>
+   <p><a name="index-fftw_005fforget_005fwisdom-132"></a>
+Wisdom can be exported to a file, a string, or any other medium. 
+For details, see <a href="Wisdom.html#Wisdom">Wisdom</a>.
+
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-dft.png
Binary file src/fftw-3.3.3/doc/html/equation-dft.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-dht.png
Binary file src/fftw-3.3.3/doc/html/equation-dht.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-idft.png
Binary file src/fftw-3.3.3/doc/html/equation-idft.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-redft00.png
Binary file src/fftw-3.3.3/doc/html/equation-redft00.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-redft01.png
Binary file src/fftw-3.3.3/doc/html/equation-redft01.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-redft10.png
Binary file src/fftw-3.3.3/doc/html/equation-redft10.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-redft11.png
Binary file src/fftw-3.3.3/doc/html/equation-redft11.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-rodft00.png
Binary file src/fftw-3.3.3/doc/html/equation-rodft00.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-rodft01.png
Binary file src/fftw-3.3.3/doc/html/equation-rodft01.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-rodft10.png
Binary file src/fftw-3.3.3/doc/html/equation-rodft10.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/equation-rodft11.png
Binary file src/fftw-3.3.3/doc/html/equation-rodft11.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/index.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/html/index.html	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,248 @@
+<html lang="en">
+<head>
+<title>FFTW 3.3.3</title>
+<meta http-equiv="Content-Type" content="text/html">
+<meta name="description" content="FFTW 3.3.3">
+<meta name="generator" content="makeinfo 4.13">
+<link title="Top" rel="start" href="#Top">
+<link href="http://www.gnu.org/software/texinfo/" rel="generator-home" title="Texinfo Homepage">
+<!--
+This manual is for FFTW
+(version 3.3.3, 25 November 2012).
+
+Copyright (C) 2003 Matteo Frigo.
+
+Copyright (C) 2003 Massachusetts Institute of Technology.
+
+     Permission is granted to make and distribute verbatim copies of
+     this manual provided the copyright notice and this permission
+     notice are preserved on all copies.
+
+     Permission is granted to copy and distribute modified versions of
+     this manual under the conditions for verbatim copying, provided
+     that the entire resulting derived work is distributed under the
+     terms of a permission notice identical to this one.
+
+     Permission is granted to copy and distribute translations of this
+     manual into another language, under the above conditions for
+     modified versions, except that this permission notice may be
+     stated in a translation approved by the Free Software Foundation.
+   -->
+<meta http-equiv="Content-Style-Type" content="text/css">
+<style type="text/css"><!--
+  pre.display { font-family:inherit }
+  pre.format  { font-family:inherit }
+  pre.smalldisplay { font-family:inherit; font-size:smaller }
+  pre.smallformat  { font-family:inherit; font-size:smaller }
+  pre.smallexample { font-size:smaller }
+  pre.smalllisp    { font-size:smaller }
+  span.sc    { font-variant:small-caps }
+  span.roman { font-family:serif; font-weight:normal; } 
+  span.sansserif { font-family:sans-serif; font-weight:normal; } 
+--></style>
+</head>
+<body>
+<h1 class="settitle">FFTW 3.3.3</h1>
+<div class="contents">
+<h2>Table of Contents</h2>
+<ul>
+<li><a name="toc_Top" href="index.html#Top">FFTW User Manual</a>
+<li><a name="toc_Introduction" href="Introduction.html#Introduction">1 Introduction</a>
+<li><a name="toc_Tutorial" href="Tutorial.html#Tutorial">2 Tutorial</a>
+<ul>
+<li><a href="Complex-One_002dDimensional-DFTs.html#Complex-One_002dDimensional-DFTs">2.1 Complex One-Dimensional DFTs</a>
+<li><a href="Complex-Multi_002dDimensional-DFTs.html#Complex-Multi_002dDimensional-DFTs">2.2 Complex Multi-Dimensional DFTs</a>
+<li><a href="One_002dDimensional-DFTs-of-Real-Data.html#One_002dDimensional-DFTs-of-Real-Data">2.3 One-Dimensional DFTs of Real Data</a>
+<li><a href="Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data">2.4 Multi-Dimensional DFTs of Real Data</a>
+<li><a href="More-DFTs-of-Real-Data.html#More-DFTs-of-Real-Data">2.5 More DFTs of Real Data</a>
+<ul>
+<li><a href="The-Halfcomplex_002dformat-DFT.html#The-Halfcomplex_002dformat-DFT">2.5.1 The Halfcomplex-format DFT</a>
+<li><a href="Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029.html#Real-even_002fodd-DFTs-_0028cosine_002fsine-transforms_0029">2.5.2 Real even/odd DFTs (cosine/sine transforms)</a>
+<li><a href="The-Discrete-Hartley-Transform.html#The-Discrete-Hartley-Transform">2.5.3 The Discrete Hartley Transform</a>
+</li></ul>
+</li></ul>
+<li><a name="toc_Other-Important-Topics" href="Other-Important-Topics.html#Other-Important-Topics">3 Other Important Topics</a>
+<ul>
+<li><a href="SIMD-alignment-and-fftw_005fmalloc.html#SIMD-alignment-and-fftw_005fmalloc">3.1 SIMD alignment and fftw_malloc</a>
+<li><a href="Multi_002ddimensional-Array-Format.html#Multi_002ddimensional-Array-Format">3.2 Multi-dimensional Array Format</a>
+<ul>
+<li><a href="Row_002dmajor-Format.html#Row_002dmajor-Format">3.2.1 Row-major Format</a>
+<li><a href="Column_002dmajor-Format.html#Column_002dmajor-Format">3.2.2 Column-major Format</a>
+<li><a href="Fixed_002dsize-Arrays-in-C.html#Fixed_002dsize-Arrays-in-C">3.2.3 Fixed-size Arrays in C</a>
+<li><a href="Dynamic-Arrays-in-C.html#Dynamic-Arrays-in-C">3.2.4 Dynamic Arrays in C</a>
+<li><a href="Dynamic-Arrays-in-C_002dThe-Wrong-Way.html#Dynamic-Arrays-in-C_002dThe-Wrong-Way">3.2.5 Dynamic Arrays in C&mdash;The Wrong Way</a>
+</li></ul>
+<li><a href="Words-of-Wisdom_002dSaving-Plans.html#Words-of-Wisdom_002dSaving-Plans">3.3 Words of Wisdom&mdash;Saving Plans</a>
+<li><a href="Caveats-in-Using-Wisdom.html#Caveats-in-Using-Wisdom">3.4 Caveats in Using Wisdom</a>
+</li></ul>
+<li><a name="toc_FFTW-Reference" href="FFTW-Reference.html#FFTW-Reference">4 FFTW Reference</a>
+<ul>
+<li><a href="Data-Types-and-Files.html#Data-Types-and-Files">4.1 Data Types and Files</a>
+<ul>
+<li><a href="Complex-numbers.html#Complex-numbers">4.1.1 Complex numbers</a>
+<li><a href="Precision.html#Precision">4.1.2 Precision</a>
+<li><a href="Memory-Allocation.html#Memory-Allocation">4.1.3 Memory Allocation</a>
+</li></ul>
+<li><a href="Using-Plans.html#Using-Plans">4.2 Using Plans</a>
+<li><a href="Basic-Interface.html#Basic-Interface">4.3 Basic Interface</a>
+<ul>
+<li><a href="Complex-DFTs.html#Complex-DFTs">4.3.1 Complex DFTs</a>
+<li><a href="Planner-Flags.html#Planner-Flags">4.3.2 Planner Flags</a>
+<li><a href="Real_002ddata-DFTs.html#Real_002ddata-DFTs">4.3.3 Real-data DFTs</a>
+<li><a href="Real_002ddata-DFT-Array-Format.html#Real_002ddata-DFT-Array-Format">4.3.4 Real-data DFT Array Format</a>
+<li><a href="Real_002dto_002dReal-Transforms.html#Real_002dto_002dReal-Transforms">4.3.5 Real-to-Real Transforms</a>
+<li><a href="Real_002dto_002dReal-Transform-Kinds.html#Real_002dto_002dReal-Transform-Kinds">4.3.6 Real-to-Real Transform Kinds</a>
+</li></ul>
+<li><a href="Advanced-Interface.html#Advanced-Interface">4.4 Advanced Interface</a>
+<ul>
+<li><a href="Advanced-Complex-DFTs.html#Advanced-Complex-DFTs">4.4.1 Advanced Complex DFTs</a>
+<li><a href="Advanced-Real_002ddata-DFTs.html#Advanced-Real_002ddata-DFTs">4.4.2 Advanced Real-data DFTs</a>
+<li><a href="Advanced-Real_002dto_002dreal-Transforms.html#Advanced-Real_002dto_002dreal-Transforms">4.4.3 Advanced Real-to-real Transforms</a>
+</li></ul>
+<li><a href="Guru-Interface.html#Guru-Interface">4.5 Guru Interface</a>
+<ul>
+<li><a href="Interleaved-and-split-arrays.html#Interleaved-and-split-arrays">4.5.1 Interleaved and split arrays</a>
+<li><a href="Guru-vector-and-transform-sizes.html#Guru-vector-and-transform-sizes">4.5.2 Guru vector and transform sizes</a>
+<li><a href="Guru-Complex-DFTs.html#Guru-Complex-DFTs">4.5.3 Guru Complex DFTs</a>
+<li><a href="Guru-Real_002ddata-DFTs.html#Guru-Real_002ddata-DFTs">4.5.4 Guru Real-data DFTs</a>
+<li><a href="Guru-Real_002dto_002dreal-Transforms.html#Guru-Real_002dto_002dreal-Transforms">4.5.5 Guru Real-to-real Transforms</a>
+<li><a href="64_002dbit-Guru-Interface.html#64_002dbit-Guru-Interface">4.5.6 64-bit Guru Interface</a>
+</li></ul>
+<li><a href="New_002darray-Execute-Functions.html#New_002darray-Execute-Functions">4.6 New-array Execute Functions</a>
+<li><a href="Wisdom.html#Wisdom">4.7 Wisdom</a>
+<ul>
+<li><a href="Wisdom-Export.html#Wisdom-Export">4.7.1 Wisdom Export</a>
+<li><a href="Wisdom-Import.html#Wisdom-Import">4.7.2 Wisdom Import</a>
+<li><a href="Forgetting-Wisdom.html#Forgetting-Wisdom">4.7.3 Forgetting Wisdom</a>
+<li><a href="Wisdom-Utilities.html#Wisdom-Utilities">4.7.4 Wisdom Utilities</a>
+</li></ul>
+<li><a href="What-FFTW-Really-Computes.html#What-FFTW-Really-Computes">4.8 What FFTW Really Computes</a>
+<ul>
+<li><a href="The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html#The-1d-Discrete-Fourier-Transform-_0028DFT_0029">4.8.1 The 1d Discrete Fourier Transform (DFT)</a>
+<li><a href="The-1d-Real_002ddata-DFT.html#The-1d-Real_002ddata-DFT">4.8.2 The 1d Real-data DFT</a>
+<li><a href="1d-Real_002deven-DFTs-_0028DCTs_0029.html#1d-Real_002deven-DFTs-_0028DCTs_0029">4.8.3 1d Real-even DFTs (DCTs)</a>
+<li><a href="1d-Real_002dodd-DFTs-_0028DSTs_0029.html#1d-Real_002dodd-DFTs-_0028DSTs_0029">4.8.4 1d Real-odd DFTs (DSTs)</a>
+<li><a href="1d-Discrete-Hartley-Transforms-_0028DHTs_0029.html#1d-Discrete-Hartley-Transforms-_0028DHTs_0029">4.8.5 1d Discrete Hartley Transforms (DHTs)</a>
+<li><a href="Multi_002ddimensional-Transforms.html#Multi_002ddimensional-Transforms">4.8.6 Multi-dimensional Transforms</a>
+</li></ul>
+</li></ul>
+<li><a name="toc_Multi_002dthreaded-FFTW" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">5 Multi-threaded FFTW</a>
+<ul>
+<li><a href="Installation-and-Supported-Hardware_002fSoftware.html#Installation-and-Supported-Hardware_002fSoftware">5.1 Installation and Supported Hardware/Software</a>
+<li><a href="Usage-of-Multi_002dthreaded-FFTW.html#Usage-of-Multi_002dthreaded-FFTW">5.2 Usage of Multi-threaded FFTW</a>
+<li><a href="How-Many-Threads-to-Use_003f.html#How-Many-Threads-to-Use_003f">5.3 How Many Threads to Use?</a>
+<li><a href="Thread-safety.html#Thread-safety">5.4 Thread safety</a>
+</li></ul>
+<li><a name="toc_Distributed_002dmemory-FFTW-with-MPI" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">6 Distributed-memory FFTW with MPI</a>
+<ul>
+<li><a href="FFTW-MPI-Installation.html#FFTW-MPI-Installation">6.1 FFTW MPI Installation</a>
+<li><a href="Linking-and-Initializing-MPI-FFTW.html#Linking-and-Initializing-MPI-FFTW">6.2 Linking and Initializing MPI FFTW</a>
+<li><a href="2d-MPI-example.html#2d-MPI-example">6.3 2d MPI example</a>
+<li><a href="MPI-Data-Distribution.html#MPI-Data-Distribution">6.4 MPI Data Distribution</a>
+<ul>
+<li><a href="Basic-and-advanced-distribution-interfaces.html#Basic-and-advanced-distribution-interfaces">6.4.1 Basic and advanced distribution interfaces</a>
+<li><a href="Load-balancing.html#Load-balancing">6.4.2 Load balancing</a>
+<li><a href="Transposed-distributions.html#Transposed-distributions">6.4.3 Transposed distributions</a>
+<li><a href="One_002ddimensional-distributions.html#One_002ddimensional-distributions">6.4.4 One-dimensional distributions</a>
+</li></ul>
+<li><a href="Multi_002ddimensional-MPI-DFTs-of-Real-Data.html#Multi_002ddimensional-MPI-DFTs-of-Real-Data">6.5 Multi-dimensional MPI DFTs of Real Data</a>
+<li><a href="Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms.html#Other-Multi_002ddimensional-Real_002ddata-MPI-Transforms">6.6 Other multi-dimensional Real-Data MPI Transforms</a>
+<li><a href="FFTW-MPI-Transposes.html#FFTW-MPI-Transposes">6.7 FFTW MPI Transposes</a>
+<ul>
+<li><a href="Basic-distributed_002dtranspose-interface.html#Basic-distributed_002dtranspose-interface">6.7.1 Basic distributed-transpose interface</a>
+<li><a href="Advanced-distributed_002dtranspose-interface.html#Advanced-distributed_002dtranspose-interface">6.7.2 Advanced distributed-transpose interface</a>
+<li><a href="An-improved-replacement-for-MPI_005fAlltoall.html#An-improved-replacement-for-MPI_005fAlltoall">6.7.3 An improved replacement for MPI_Alltoall</a>
+</li></ul>
+<li><a href="FFTW-MPI-Wisdom.html#FFTW-MPI-Wisdom">6.8 FFTW MPI Wisdom</a>
+<li><a href="Avoiding-MPI-Deadlocks.html#Avoiding-MPI-Deadlocks">6.9 Avoiding MPI Deadlocks</a>
+<li><a href="FFTW-MPI-Performance-Tips.html#FFTW-MPI-Performance-Tips">6.10 FFTW MPI Performance Tips</a>
+<li><a href="Combining-MPI-and-Threads.html#Combining-MPI-and-Threads">6.11 Combining MPI and Threads</a>
+<li><a href="FFTW-MPI-Reference.html#FFTW-MPI-Reference">6.12 FFTW MPI Reference</a>
+<ul>
+<li><a href="MPI-Files-and-Data-Types.html#MPI-Files-and-Data-Types">6.12.1 MPI Files and Data Types</a>
+<li><a href="MPI-Initialization.html#MPI-Initialization">6.12.2 MPI Initialization</a>
+<li><a href="Using-MPI-Plans.html#Using-MPI-Plans">6.12.3 Using MPI Plans</a>
+<li><a href="MPI-Data-Distribution-Functions.html#MPI-Data-Distribution-Functions">6.12.4 MPI Data Distribution Functions</a>
+<li><a href="MPI-Plan-Creation.html#MPI-Plan-Creation">6.12.5 MPI Plan Creation</a>
+<li><a href="MPI-Wisdom-Communication.html#MPI-Wisdom-Communication">6.12.6 MPI Wisdom Communication</a>
+</li></ul>
+<li><a href="FFTW-MPI-Fortran-Interface.html#FFTW-MPI-Fortran-Interface">6.13 FFTW MPI Fortran Interface</a>
+</li></ul>
+<li><a name="toc_Calling-FFTW-from-Modern-Fortran" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">7 Calling FFTW from Modern Fortran</a>
+<ul>
+<li><a href="Overview-of-Fortran-interface.html#Overview-of-Fortran-interface">7.1 Overview of Fortran interface</a>
+<ul>
+<li><a href="Extended-and-quadruple-precision-in-Fortran.html#Extended-and-quadruple-precision-in-Fortran">7.1.1 Extended and quadruple precision in Fortran</a>
+</li></ul>
+<li><a href="Reversing-array-dimensions.html#Reversing-array-dimensions">7.2 Reversing array dimensions</a>
+<li><a href="FFTW-Fortran-type-reference.html#FFTW-Fortran-type-reference">7.3 FFTW Fortran type reference</a>
+<li><a href="Plan-execution-in-Fortran.html#Plan-execution-in-Fortran">7.4 Plan execution in Fortran</a>
+<li><a href="Allocating-aligned-memory-in-Fortran.html#Allocating-aligned-memory-in-Fortran">7.5 Allocating aligned memory in Fortran</a>
+<li><a href="Accessing-the-wisdom-API-from-Fortran.html#Accessing-the-wisdom-API-from-Fortran">7.6 Accessing the wisdom API from Fortran</a>
+<ul>
+<li><a href="Wisdom-File-Export_002fImport-from-Fortran.html#Wisdom-File-Export_002fImport-from-Fortran">7.6.1 Wisdom File Export/Import from Fortran</a>
+<li><a href="Wisdom-String-Export_002fImport-from-Fortran.html#Wisdom-String-Export_002fImport-from-Fortran">7.6.2 Wisdom String Export/Import from Fortran</a>
+<li><a href="Wisdom-Generic-Export_002fImport-from-Fortran.html#Wisdom-Generic-Export_002fImport-from-Fortran">7.6.3 Wisdom Generic Export/Import from Fortran</a>
+</li></ul>
+<li><a href="Defining-an-FFTW-module.html#Defining-an-FFTW-module">7.7 Defining an FFTW module</a>
+</li></ul>
+<li><a name="toc_Calling-FFTW-from-Legacy-Fortran" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">8 Calling FFTW from Legacy Fortran</a>
+<ul>
+<li><a href="Fortran_002dinterface-routines.html#Fortran_002dinterface-routines">8.1 Fortran-interface routines</a>
+<li><a href="FFTW-Constants-in-Fortran.html#FFTW-Constants-in-Fortran">8.2 FFTW Constants in Fortran</a>
+<li><a href="FFTW-Execution-in-Fortran.html#FFTW-Execution-in-Fortran">8.3 FFTW Execution in Fortran</a>
+<li><a href="Fortran-Examples.html#Fortran-Examples">8.4 Fortran Examples</a>
+<li><a href="Wisdom-of-Fortran_003f.html#Wisdom-of-Fortran_003f">8.5 Wisdom of Fortran?</a>
+</li></ul>
+<li><a name="toc_Upgrading-from-FFTW-version-2" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2">9 Upgrading from FFTW version 2</a>
+<li><a name="toc_Installation-and-Customization" href="Installation-and-Customization.html#Installation-and-Customization">10 Installation and Customization</a>
+<ul>
+<li><a href="Installation-on-Unix.html#Installation-on-Unix">10.1 Installation on Unix</a>
+<li><a href="Installation-on-non_002dUnix-systems.html#Installation-on-non_002dUnix-systems">10.2 Installation on non-Unix systems</a>
+<li><a href="Cycle-Counters.html#Cycle-Counters">10.3 Cycle Counters</a>
+<li><a href="Generating-your-own-code.html#Generating-your-own-code">10.4 Generating your own code</a>
+</li></ul>
+<li><a name="toc_Acknowledgments" href="Acknowledgments.html#Acknowledgments">11 Acknowledgments</a>
+<li><a name="toc_License-and-Copyright" href="License-and-Copyright.html#License-and-Copyright">12 License and Copyright</a>
+<li><a name="toc_Concept-Index" href="Concept-Index.html#Concept-Index">13 Concept Index</a>
+<li><a name="toc_Library-Index" href="Library-Index.html#Library-Index">14 Library Index</a>
+</li></ul>
+</div>
+
+
+
+<div class="node">
+<a name="Top"></a>
+<p>
+Next:&nbsp;<a rel="next" accesskey="n" href="Introduction.html#Introduction">Introduction</a>,
+Previous:&nbsp;<a rel="previous" accesskey="p" href="../index.html#dir">(dir)</a>,
+Up:&nbsp;<a rel="up" accesskey="u" href="../index.html#dir">(dir)</a>
+<hr>
+</div>
+
+<h2 class="unnumbered">FFTW User Manual</h2>
+
+<p>Welcome to FFTW, the Fastest Fourier Transform in the West.  FFTW is a
+collection of fast C routines to compute the discrete Fourier transform. 
+This manual documents FFTW version 3.3.3.
+
+<ul class="menu">
+<li><a accesskey="1" href="Introduction.html#Introduction">Introduction</a>
+<li><a accesskey="2" href="Tutorial.html#Tutorial">Tutorial</a>
+<li><a accesskey="3" href="Other-Important-Topics.html#Other-Important-Topics">Other Important Topics</a>
+<li><a accesskey="4" href="FFTW-Reference.html#FFTW-Reference">FFTW Reference</a>
+<li><a accesskey="5" href="Multi_002dthreaded-FFTW.html#Multi_002dthreaded-FFTW">Multi-threaded FFTW</a>
+<li><a accesskey="6" href="Distributed_002dmemory-FFTW-with-MPI.html#Distributed_002dmemory-FFTW-with-MPI">Distributed-memory FFTW with MPI</a>
+<li><a accesskey="7" href="Calling-FFTW-from-Modern-Fortran.html#Calling-FFTW-from-Modern-Fortran">Calling FFTW from Modern Fortran</a>
+<li><a accesskey="8" href="Calling-FFTW-from-Legacy-Fortran.html#Calling-FFTW-from-Legacy-Fortran">Calling FFTW from Legacy Fortran</a>
+<li><a accesskey="9" href="Upgrading-from-FFTW-version-2.html#Upgrading-from-FFTW-version-2">Upgrading from FFTW version 2</a>
+<li><a href="Installation-and-Customization.html#Installation-and-Customization">Installation and Customization</a>
+<li><a href="Acknowledgments.html#Acknowledgments">Acknowledgments</a>
+<li><a href="License-and-Copyright.html#License-and-Copyright">License and Copyright</a>
+<li><a href="Concept-Index.html#Concept-Index">Concept Index</a>
+<li><a href="Library-Index.html#Library-Index">Library Index</a>
+</ul>
+
+<!-- ************************************************************ -->
+   </body></html>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/html/rfftwnd-for-html.png
Binary file src/fftw-3.3.3/doc/html/rfftwnd-for-html.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/install.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/install.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,361 @@
+@node Installation and Customization, Acknowledgments, Upgrading from FFTW version 2, Top
+@chapter Installation and Customization
+@cindex installation
+
+This chapter describes the installation and customization of FFTW, the
+latest version of which may be downloaded from
+@uref{http://www.fftw.org, the FFTW home page}.
+
+In principle, FFTW should work on any system with an ANSI C compiler
+(@code{gcc} is fine).  However, planner time is drastically reduced if
+FFTW can exploit a hardware cycle counter; FFTW comes with cycle-counter
+support for all modern general-purpose CPUs, but you may need to add a
+couple of lines of code if your compiler is not yet supported
+(@pxref{Cycle Counters}).  (On Unix, there will be a warning at the end
+of the @code{configure} output if no cycle counter is found.)
+@cindex cycle counter
+@cindex compiler
+@cindex portability
+
+
+Installation of FFTW is simplest if you have a Unix or a GNU system,
+such as GNU/Linux, and we describe this case in the first section below,
+including the use of special configuration options to e.g. install
+different precisions or exploit optimizations for particular
+architectures (e.g. SIMD).  Compilation on non-Unix systems is a more
+manual process, but we outline the procedure in the second section.  It
+is also likely that pre-compiled binaries will be available for popular
+systems.
+
+Finally, we describe how you can customize FFTW for particular needs by
+generating @emph{codelets} for fast transforms of sizes not supported
+efficiently by the standard FFTW distribution.
+@cindex codelet
+
+@menu
+* Installation on Unix::        
+* Installation on non-Unix systems::  
+* Cycle Counters::              
+* Generating your own code::    
+@end menu
+
+@c ------------------------------------------------------------
+
+@node Installation on Unix, Installation on non-Unix systems, Installation and Customization, Installation and Customization
+@section Installation on Unix
+
+FFTW comes with a @code{configure} program in the GNU style.
+Installation can be as simple as:
+@fpindex configure
+
+@example
+./configure
+make
+make install
+@end example
+
+This will build the uniprocessor complex and real transform libraries
+along with the test programs.  (We recommend that you use GNU
+@code{make} if it is available; on some systems it is called
+@code{gmake}.)  The ``@code{make install}'' command installs the fftw
+and rfftw libraries in standard places, and typically requires root
+privileges (unless you specify a different install directory with the
+@code{--prefix} flag to @code{configure}).  You can also type
+``@code{make check}'' to put the FFTW test programs through their paces.
+If you have problems during configuration or compilation, you may want
+to run ``@code{make distclean}'' before trying again; this ensures that
+you don't have any stale files left over from previous compilation
+attempts.
+
+The @code{configure} script chooses the @code{gcc} compiler by default,
+if it is available; you can select some other compiler with:
+@example
+./configure CC="@r{@i{<the name of your C compiler>}}"
+@end example
+
+The @code{configure} script knows good @code{CFLAGS} (C compiler flags)
+@cindex compiler flags
+for a few systems.  If your system is not known, the @code{configure}
+script will print out a warning.  In this case, you should re-configure
+FFTW with the command
+@example
+./configure CFLAGS="@r{@i{<write your CFLAGS here>}}"
+@end example
+and then compile as usual.  If you do find an optimal set of
+@code{CFLAGS} for your system, please let us know what they are (along
+with the output of @code{config.guess}) so that we can include them in
+future releases.
+
+@code{configure} supports all the standard flags defined by the GNU
+Coding Standards; see the @code{INSTALL} file in FFTW or
+@uref{http://www.gnu.org/prep/standards/html_node/index.html, the GNU web page}.
+Note especially @code{--help} to list all flags and
+@code{--enable-shared} to create shared, rather than static, libraries.
+@code{configure} also accepts a few FFTW-specific flags, particularly:
+
+@itemize @bullet
+
+@item
+@cindex precision
+@code{--enable-float}: Produces a single-precision version of FFTW
+(@code{float}) instead of the default double-precision (@code{double}).
+@xref{Precision}.
+
+@item
+@cindex precision
+@code{--enable-long-double}: Produces a long-double precision version of
+FFTW (@code{long double}) instead of the default double-precision
+(@code{double}).  The @code{configure} script will halt with an error
+message if @code{long double} is the same size as @code{double} on your
+machine/compiler.  @xref{Precision}.
+
+@item
+@cindex precision
+@code{--enable-quad-precision}: Produces a quadruple-precision version
+of FFTW using the nonstandard @code{__float128} type provided by
+@code{gcc} 4.6 or later on x86, x86-64, and Itanium architectures,
+instead of the default double-precision (@code{double}).  The
+@code{configure} script will halt with an error message if the
+compiler is not @code{gcc} version 4.6 or later or if @code{gcc}'s
+@code{libquadmath} library is not installed.  @xref{Precision}.
+
+@item
+@cindex threads
+@code{--enable-threads}: Enables compilation and installation of the
+FFTW threads library (@pxref{Multi-threaded FFTW}), which provides a
+simple interface to parallel transforms for SMP systems.  By default,
+the threads routines are not compiled.
+
+@item
+@code{--enable-openmp}: Like @code{--enable-threads}, but using OpenMP
+compiler directives in order to induce parallelism rather than
+spawning its own threads directly, and installing an @samp{fftw3_omp} library
+rather than an @samp{fftw3_threads} library (@pxref{Multi-threaded           
+FFTW}).  You can use both @code{--enable-openmp} and @code{--enable-threads}
+since they compile/install libraries with different names.  By default,
+the OpenMP routines are not compiled.
+
+@item
+@code{--with-combined-threads}: By default, if @code{--enable-threads}
+is used, the threads support is compiled into a separate library that
+must be linked in addition to the main FFTW library.  This is so that
+users of the serial library do not need to link the system threads
+libraries.  If @code{--with-combined-threads} is specified, however,
+then no separate threads library is created, and threads are included
+in the main FFTW library.  This is mainly useful under Windows, where
+no system threads library is required and inter-library dependencies
+are problematic.
+
+@item
+@cindex MPI
+@code{--enable-mpi}: Enables compilation and installation of the FFTW
+MPI library (@pxref{Distributed-memory FFTW with MPI}), which provides
+parallel transforms for distributed-memory systems with MPI.  (By
+default, the MPI routines are not compiled.)  @xref{FFTW MPI
+Installation}.
+
+@item
+@cindex Fortran-callable wrappers
+@code{--disable-fortran}: Disables inclusion of legacy-Fortran
+wrapper routines (@pxref{Calling FFTW from Legacy Fortran}) in the standard
+FFTW libraries.  These wrapper routines increase the library size by
+only a negligible amount, so they are included by default as long as
+the @code{configure} script finds a Fortran compiler on your system.
+(To specify a particular Fortran compiler @i{foo}, pass
+@code{F77=}@i{foo} to @code{configure}.)
+
+@item
+@code{--with-g77-wrappers}: By default, when Fortran wrappers are
+included, the wrappers employ the linking conventions of the Fortran
+compiler detected by the @code{configure} script.  If this compiler is
+GNU @code{g77}, however, then @emph{two} versions of the wrappers are
+included: one with @code{g77}'s idiosyncratic convention of appending
+two underscores to identifiers, and one with the more common
+convention of appending only a single underscore.  This way, the same
+FFTW library will work with both @code{g77} and other Fortran
+compilers, such as GNU @code{gfortran}.  However, the converse is not
+true: if you configure with a different compiler, then the
+@code{g77}-compatible wrappers are not included.  By specifying
+@code{--with-g77-wrappers}, the @code{g77}-compatible wrappers are
+included in addition to wrappers for whatever Fortran compiler
+@code{configure} finds.
+@fpindex g77
+
+@item
+@code{--with-slow-timer}: Disables the use of hardware cycle counters,
+and falls back on @code{gettimeofday} or @code{clock}.  This greatly
+worsens performance, and should generally not be used (unless you don't
+have a cycle counter but still really want an optimized plan regardless
+of the time).  @xref{Cycle Counters}.
+
+@item
+@code{--enable-sse}, @code{--enable-sse2}, @code{--enable-avx},
+@code{--enable-altivec}, @code{--enable-neon}: Enable the compilation of
+SIMD code for SSE (Pentium III+), SSE2 (Pentium IV+), AVX (Sandy Bridge,
+Interlagos), AltiVec (PowerPC G4+), NEON (some ARM processors).  SSE,
+AltiVec, and NEON only work with @code{--enable-float} (above).  SSE2
+works in both single and double precision (and is simply SSE in single
+precision).  The resulting code will @emph{still work} on earlier CPUs
+lacking the SIMD extensions (SIMD is automatically disabled, although
+the FFTW library is still larger).
+@itemize @minus
+@item
+These options require a compiler supporting SIMD extensions, and
+compiler support is always a bit flaky: see the FFTW FAQ for a list of
+compiler versions that have problems compiling FFTW.
+@item
+With AltiVec and @code{gcc}, you may have to use the
+@code{-mabi=altivec} option when compiling any code that links to FFTW,
+in order to properly align the stack; otherwise, FFTW could crash when
+it tries to use an AltiVec feature.  (This is not necessary on MacOS X.)
+@item
+With SSE/SSE2 and @code{gcc}, you should use a version of gcc that
+properly aligns the stack when compiling any code that links to FFTW.
+By default, @code{gcc} 2.95 and later versions align the stack as
+needed, but you should not compile FFTW with the @code{-Os} option or the
+@code{-mpreferred-stack-boundary} option with an argument less than 4.
+@item
+Because of the large variety of ARM processors and ABIs, FFTW
+does not attempt to guess the correct @code{gcc} flags for generating
+NEON code.  In general, you will have to provide them on the command line.
+This command line is known to have worked at least once:
+@example
+./configure --with-slow-timer --host=arm-linux-gnueabi \
+  --enable-single --enable-neon \
+  "CC=arm-linux-gnueabi-gcc -march=armv7-a -mfloat-abi=softfp"
+@end example
+@end itemize
+
+@end itemize
+
+@cindex compiler
+To force @code{configure} to use a particular C compiler @i{foo}
+(instead of the default, usually @code{gcc}), pass @code{CC=}@i{foo} to the 
+@code{configure} script; you may also need to set the flags via the variable
+@code{CFLAGS} as described above.
+@cindex compiler flags
+
+@c ------------------------------------------------------------
+@node Installation on non-Unix systems, Cycle Counters, Installation on Unix, Installation and Customization
+@section Installation on non-Unix systems
+
+It should be relatively straightforward to compile FFTW even on non-Unix
+systems lacking the niceties of a @code{configure} script.  Basically,
+you need to edit the @code{config.h} header (copy it from
+@code{config.h.in}) to @code{#define} the various options and compiler
+characteristics, and then compile all the @samp{.c} files in the
+relevant directories.  
+
+The @code{config.h} header contains about 100 options to set, each one
+initially an @code{#undef}, each documented with a comment, and most of
+them fairly obvious.  For most of the options, you should simply
+@code{#define} them to @code{1} if they are applicable, although a few
+options require a particular value (e.g. @code{SIZEOF_LONG_LONG} should
+be defined to the size of the @code{long long} type, in bytes, or zero
+if it is not supported).  We will likely post some sample
+@code{config.h} files for various operating systems and compilers for
+you to use (at least as a starting point).  Please let us know if you
+have to hand-create a configuration file (and/or a pre-compiled binary)
+that you want to share.
+
+To create the FFTW library, you will then need to compile all of the
+@samp{.c} files in the @code{kernel}, @code{dft}, @code{dft/scalar},
+@code{dft/scalar/codelets}, @code{rdft}, @code{rdft/scalar},
+@code{rdft/scalar/r2cf}, @code{rdft/scalar/r2cb},
+@code{rdft/scalar/r2r}, @code{reodft}, and @code{api} directories.
+If you are compiling with SIMD support (e.g. you defined
+@code{HAVE_SSE2} in @code{config.h}), then you also need to compile
+the @code{.c} files in the @code{simd-support},
+@code{@{dft,rdft@}/simd}, @code{@{dft,rdft@}/simd/*} directories.
+
+Once these files are all compiled, link them into a library, or a shared
+library, or directly into your program.
+
+To compile the FFTW test program, additionally compile the code in the
+@code{libbench2/} directory, and link it into a library.  Then compile
+the code in the @code{tests/} directory and link it to the
+@code{libbench2} and FFTW libraries.  To compile the @code{fftw-wisdom}
+(command-line) tool (@pxref{Wisdom Utilities}), compile
+@code{tools/fftw-wisdom.c} and link it to the @code{libbench2} and FFTW
+libraries
+
+@c ------------------------------------------------------------
+@node Cycle Counters, Generating your own code, Installation on non-Unix systems, Installation and Customization
+@section Cycle Counters
+@cindex cycle counter
+
+FFTW's planner actually executes and times different possible FFT
+algorithms in order to pick the fastest plan for a given @math{n}.  In
+order to do this in as short a time as possible, however, the timer must
+have a very high resolution, and to accomplish this we employ the
+hardware @dfn{cycle counters} that are available on most CPUs.
+Currently, FFTW supports the cycle counters on x86, PowerPC/POWER, Alpha,
+UltraSPARC (SPARC v9), IA64, PA-RISC, and MIPS processors.
+
+@cindex compiler
+Access to the cycle counters, unfortunately, is a compiler and/or
+operating-system dependent task, often requiring inline assembly
+language, and it may be that your compiler is not supported.  If you are
+@emph{not} supported, FFTW will by default fall back on its estimator
+(effectively using @code{FFTW_ESTIMATE} for all plans).
+@ctindex FFTW_ESTIMATE
+
+You can add support by editing the file @code{kernel/cycle.h}; normally,
+this will involve adapting one of the examples already present in order
+to use the inline-assembler syntax for your C compiler, and will only
+require a couple of lines of code.  Anyone adding support for a new
+system to @code{cycle.h} is encouraged to email us at @email{fftw@@fftw.org}.
+
+If a cycle counter is not available on your system (e.g. some embedded
+processor), and you don't want to use estimated plans, as a last resort
+you can use the @code{--with-slow-timer} option to @code{configure} (on
+Unix) or @code{#define WITH_SLOW_TIMER} in @code{config.h} (elsewhere).
+This will use the much lower-resolution @code{gettimeofday} function, or even
+@code{clock} if the former is unavailable, and planning will be
+extremely slow.
+
+@c ------------------------------------------------------------
+@node Generating your own code,  , Cycle Counters, Installation and Customization
+@section Generating your own code
+@cindex code generator
+
+The directory @code{genfft} contains the programs that were used to
+generate FFTW's ``codelets,'' which are hard-coded transforms of small
+sizes.
+@cindex codelet
+We do not expect casual users to employ the generator, which is a rather
+sophisticated program that generates directed acyclic graphs of FFT
+algorithms and performs algebraic simplifications on them.  It was
+written in Objective Caml, a dialect of ML, which is available at
+@uref{http://caml.inria.fr/ocaml/index.en.html}.
+@cindex Caml
+
+
+If you have Objective Caml installed (along with recent versions of
+GNU @code{autoconf}, @code{automake}, and @code{libtool}), then you
+can change the set of codelets that are generated or play with the
+generation options.  The set of generated codelets is specified by the
+@code{@{dft,rdft@}/@{codelets,simd@}/*/Makefile.am} files.  For example, you can add
+efficient REDFT codelets of small sizes by modifying
+@code{rdft/codelets/r2r/Makefile.am}.
+@cindex REDFT
+After you modify any @code{Makefile.am} files, you can type @code{sh
+bootstrap.sh} in the top-level directory followed by @code{make} to
+re-generate the files.
+
+We do not provide more details about the code-generation process, since
+we do not expect that most users will need to generate their own code.
+However, feel free to contact us at @email{fftw@@fftw.org} if
+you are interested in the subject.
+
+@cindex monadic programming
+You might find it interesting to learn Caml and/or some modern
+programming techniques that we used in the generator (including monadic
+programming), especially if you heard the rumor that Java and
+object-oriented programming are the latest advancement in the field.
+The internal operation of the codelet generator is described in the
+paper, ``A Fast Fourier Transform Compiler,'' by M. Frigo, which is
+available from the @uref{http://www.fftw.org,FFTW home page} and also
+appeared in the @cite{Proceedings of the 1999 ACM SIGPLAN Conference on
+Programming Language Design and Implementation (PLDI)}.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/intro.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/intro.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,165 @@
+@node    Introduction, Tutorial, Top, Top
+@chapter Introduction
+This manual documents version @value{VERSION} of FFTW, the
+@emph{Fastest Fourier Transform in the West}.  FFTW is a comprehensive
+collection of fast C routines for computing the discrete Fourier
+transform (DFT) and various special cases thereof.
+@cindex discrete Fourier transform
+@cindex DFT
+@itemize @bullet
+@item FFTW computes the DFT of complex data, real data, even-
+  or odd-symmetric real data (these symmetric transforms are usually
+  known as the discrete cosine or sine transform, respectively), and the
+  discrete Hartley transform (DHT) of real data.
+
+@item  The input data can have arbitrary length.  
+       FFTW employs @Onlogn{} algorithms for all lengths, including
+       prime numbers.
+
+@item  FFTW supports arbitrary multi-dimensional data.
+
+@item  FFTW supports the SSE, SSE2, AVX, Altivec, and MIPS PS instruction
+       sets.
+
+@item  FFTW includes parallel (multi-threaded) transforms
+       for shared-memory systems.
+@item  Starting with version 3.3, FFTW includes distributed-memory parallel
+       transforms using MPI.
+@end itemize
+
+We assume herein that you are familiar with the properties and uses of
+the DFT that are relevant to your application.  Otherwise, see
+e.g. @cite{The Fast Fourier Transform and Its Applications} by E. O. Brigham
+(Prentice-Hall, Englewood Cliffs, NJ, 1988).
+@uref{http://www.fftw.org, Our web page} also has links to FFT-related
+information online.
+@cindex FFTW
+
+@c TODO: revise.  We don't need to brag any longer
+@c
+@c FFTW is usually faster (and sometimes much faster) than all other
+@c freely-available Fourier transform programs found on the Net.  It is
+@c competitive with (and often faster than) the FFT codes in Sun's
+@c Performance Library, IBM's ESSL library, HP's CXML library, and
+@c Intel's MKL library, which are targeted at specific machines.
+@c Moreover, FFTW's performance is @emph{portable}.  Indeed, FFTW is
+@c unique in that it automatically adapts itself to your machine, your
+@c cache, the size of your memory, your number of registers, and all the
+@c other factors that normally make it impossible to optimize a program
+@c for more than one machine.  An extensive comparison of FFTW's
+@c performance with that of other Fourier transform codes has been made,
+@c and the results are available on the Web at
+@c @uref{http://fftw.org/benchfft, the benchFFT home page}.
+@c @cindex benchmark
+@c @fpindex benchfft
+
+In order to use FFTW effectively, you need to learn one basic concept
+of FFTW's internal structure: FFTW does not use a fixed algorithm for
+computing the transform, but instead it adapts the DFT algorithm to
+details of the underlying hardware in order to maximize performance.
+Hence, the computation of the transform is split into two phases.
+First, FFTW's @dfn{planner} ``learns'' the fastest way to compute the
+transform on your machine.  The planner
+@cindex planner
+produces a data structure called a @dfn{plan} that contains this
+@cindex plan
+information.  Subsequently, the plan is @dfn{executed}
+@cindex execute
+to transform the array of input data as dictated by the plan.  The
+plan can be reused as many times as needed.  In typical
+high-performance applications, many transforms of the same size are
+computed and, consequently, a relatively expensive initialization of
+this sort is acceptable.  On the other hand, if you need a single
+transform of a given size, the one-time cost of the planner becomes
+significant.  For this case, FFTW provides fast planners based on
+heuristics or on previously computed plans.
+
+FFTW supports transforms of data with arbitrary length, rank,
+multiplicity, and a general memory layout.  In simple cases, however,
+this generality may be unnecessary and confusing.  Consequently, we
+organized the interface to FFTW into three levels of increasing
+generality.
+@itemize @bullet
+@item The @dfn{basic interface} computes a single 
+      transform of contiguous data.
+@item The @dfn{advanced interface} computes transforms 
+      of multiple or strided arrays.
+@item The @dfn{guru interface} supports the most general data 
+      layouts, multiplicities, and strides.
+@end itemize
+We expect that most users will be best served by the basic interface,
+whereas the guru interface requires careful attention to the
+documentation to avoid problems.
+@cindex basic interface
+@cindex advanced interface
+@cindex guru interface 
+
+
+Besides the automatic performance adaptation performed by the planner,
+it is also possible for advanced users to customize FFTW manually.  For
+example, if code space is a concern, we provide a tool that links only
+the subset of FFTW needed by your application.  Conversely, you may need
+to extend FFTW because the standard distribution is not sufficient for
+your needs.  For example, the standard FFTW distribution works most
+efficiently for arrays whose size can be factored into small primes
+(@math{2}, @math{3}, @math{5}, and @math{7}), and otherwise it uses a
+slower general-purpose routine.  If you need efficient transforms of
+other sizes, you can use FFTW's code generator, which produces fast C
+programs (``codelets'') for any particular array size you may care
+about.
+@cindex code generator
+@cindex codelet
+For example, if you need transforms of size
+@ifinfo
+@math{513 = 19 x 3^3},
+@end ifinfo
+@tex
+$513 = 19 \cdot 3^3$,
+@end tex
+@html
+513&nbsp;=&nbsp;19*3<sup>3</sup>,
+@end html
+you can customize FFTW to support the factor @math{19} efficiently.
+
+For more information regarding FFTW, see the paper, ``The Design and
+Implementation of FFTW3,'' by M. Frigo and S. G. Johnson, which was an
+invited paper in @cite{Proc. IEEE} @b{93} (2), p. 216 (2005).  The
+code generator is described in the paper ``A fast Fourier transform
+compiler'',
+@cindex compiler
+by M. Frigo, in the @cite{Proceedings of the 1999 ACM SIGPLAN Conference
+on Programming Language Design and Implementation (PLDI), Atlanta,
+Georgia, May 1999}.  These papers, along with the latest version of
+FFTW, the FAQ, benchmarks, and other links, are available at
+@uref{http://www.fftw.org, the FFTW home page}.  
+
+The current version of FFTW incorporates many good ideas from the past
+thirty years of FFT literature.  In one way or another, FFTW uses the
+Cooley-Tukey algorithm, the prime factor algorithm, Rader's algorithm
+for prime sizes, and a split-radix algorithm (with a
+``conjugate-pair'' variation pointed out to us by Dan Bernstein).
+FFTW's code generator also produces new algorithms that we do not
+completely understand.
+@cindex algorithm
+The reader is referred to the cited papers for the appropriate
+references.
+
+The rest of this manual is organized as follows.  We first discuss the
+sequential (single-processor) implementation.  We start by describing
+the basic interface/features of FFTW in @ref{Tutorial}.  
+Next, @ref{Other Important Topics} discusses data alignment
+(@pxref{SIMD alignment and fftw_malloc}),
+the storage scheme of multi-dimensional arrays
+(@pxref{Multi-dimensional Array Format}), and FFTW's mechanism for
+storing plans on disk (@pxref{Words of Wisdom-Saving Plans}).  Next,
+@ref{FFTW Reference} provides comprehensive documentation of all
+FFTW's features.  Parallel transforms are discussed in their own
+chapters: @ref{Multi-threaded FFTW} and @ref{Distributed-memory FFTW
+with MPI}.  Fortran programmers can also use FFTW, as described in
+@ref{Calling FFTW from Legacy Fortran} and @ref{Calling FFTW from
+Modern Fortran}.  @ref{Installation and Customization} explains how to
+install FFTW in your computer system and how to adapt FFTW to your
+needs.  License and copyright information is given in @ref{License and
+Copyright}.  Finally, we thank all the people who helped us in
+@ref{Acknowledgments}.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/legacy-fortran.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/legacy-fortran.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,374 @@
+@node Calling FFTW from Legacy Fortran, Upgrading from FFTW version 2, Calling FFTW from Modern Fortran, Top
+@chapter Calling FFTW from Legacy Fortran
+@cindex Fortran interface
+
+This chapter describes the interface to FFTW callable by Fortran code
+in older compilers not supporting the Fortran 2003 C interoperability
+features (@pxref{Calling FFTW from Modern Fortran}).  This interface
+has the major disadvantage that it is not type-checked, so if you
+mistake the argument types or ordering then your program will not have
+any compiler errors, and will likely crash at runtime.  So, greater
+care is needed.  Also, technically interfacing older Fortran versions
+to C is nonstandard, but in practice we have found that the techniques
+used in this chapter have worked with all known Fortran compilers for
+many years.
+
+The legacy Fortran interface differs from the C interface only in the
+prefix (@samp{dfftw_} instead of @samp{fftw_} in double precision) and
+a few other minor details.  This Fortran interface is included in the
+FFTW libraries by default, unless a Fortran compiler isn't found on
+your system or @code{--disable-fortran} is included in the
+@code{configure} flags.  We assume here that the reader is already
+familiar with the usage of FFTW in C, as described elsewhere in this
+manual.
+
+The MPI parallel interface to FFTW is @emph{not} currently available
+to legacy Fortran.
+
+@menu
+* Fortran-interface routines::  
+* FFTW Constants in Fortran::   
+* FFTW Execution in Fortran::   
+* Fortran Examples::            
+* Wisdom of Fortran?::          
+@end menu
+
+@c -------------------------------------------------------
+@node Fortran-interface routines, FFTW Constants in Fortran, Calling FFTW from Legacy Fortran, Calling FFTW from Legacy Fortran
+@section Fortran-interface routines
+
+Nearly all of the FFTW functions have Fortran-callable equivalents.
+The name of the legacy Fortran routine is the same as that of the
+corresponding C routine, but with the @samp{fftw_} prefix replaced by
+@samp{dfftw_}.@footnote{Technically, Fortran 77 identifiers are not
+allowed to have more than 6 characters, nor may they contain
+underscores.  Any compiler that enforces this limitation doesn't
+deserve to link to FFTW.}  The single and long-double precision
+versions use @samp{sfftw_} and @samp{lfftw_}, respectively, instead of
+@samp{fftwf_} and @samp{fftwl_}; quadruple precision (@code{real*16})
+is available on some systems as @samp{fftwq_} (@pxref{Precision}).
+(Note that @code{long double} on x86 hardware is usually at most
+80-bit extended precision, @emph{not} quadruple precision.)
+
+For the most part, all of the arguments to the functions are the same,
+with the following exceptions:
+
+@itemize @bullet
+
+@item
+@code{plan} variables (what would be of type @code{fftw_plan} in C),
+must be declared as a type that is at least as big as a pointer
+(address) on your machine.  We recommend using @code{integer*8} everywhere,
+since this should always be big enough.
+@cindex portability
+
+@item
+Any function that returns a value (e.g. @code{fftw_plan_dft}) is
+converted into a @emph{subroutine}.  The return value is converted into
+an additional @emph{first} parameter of this subroutine.@footnote{The
+reason for this is that some Fortran implementations seem to have
+trouble with C function return values, and vice versa.}
+
+@item
+@cindex column-major
+The Fortran routines expect multi-dimensional arrays to be in
+@emph{column-major} order, which is the ordinary format of Fortran
+arrays (@pxref{Multi-dimensional Array Format}).  They do this
+transparently and costlessly simply by reversing the order of the
+dimensions passed to FFTW, but this has one important consequence for
+multi-dimensional real-complex transforms, discussed below.
+
+@item
+Wisdom import and export is somewhat more tricky because one cannot
+easily pass files or strings between C and Fortran; see @ref{Wisdom of
+Fortran?}.
+
+@item
+Legacy Fortran cannot use the @code{fftw_malloc} dynamic-allocation routine.
+If you want to exploit the SIMD FFTW (@pxref{SIMD alignment and fftw_malloc}), you'll
+need to figure out some other way to ensure that your arrays are at
+least 16-byte aligned.
+
+@item
+@tindex fftw_iodim
+@cindex guru interface
+Since Fortran 77 does not have data structures, the @code{fftw_iodim}
+structure from the guru interface (@pxref{Guru vector and transform
+sizes}) must be split into separate arguments.  In particular, any
+@code{fftw_iodim} array arguments in the C guru interface become three
+integer array arguments (@code{n}, @code{is}, and @code{os}) in the
+Fortran guru interface, all of whose lengths should be equal to the
+corresponding @code{rank} argument.
+
+@item
+The guru planner interface in Fortran does @emph{not} do any automatic
+translation between column-major and row-major; you are responsible
+for setting the strides etcetera to correspond to your Fortran arrays.
+However, as a slight bug that we are preserving for backwards
+compatibility, the @samp{plan_guru_r2r} in Fortran @emph{does} reverse the
+order of its @code{kind} array parameter, so the @code{kind} array
+of that routine should be in the reverse of the order of the iodim
+arrays (see above).
+
+@end itemize
+
+In general, you should take care to use Fortran data types that
+correspond to (i.e. are the same size as) the C types used by FFTW.
+In practice, this correspondence is usually straightforward
+(i.e. @code{integer} corresponds to @code{int}, @code{real}
+corresponds to @code{float}, etcetera).  The native Fortran
+double/single-precision complex type should be compatible with
+@code{fftw_complex}/@code{fftwf_complex}.  Such simple correspondences
+are assumed in the examples below.
+@cindex portability
+
+@c -------------------------------------------------------
+@node  FFTW Constants in Fortran, FFTW Execution in Fortran, Fortran-interface routines, Calling FFTW from Legacy Fortran
+@section FFTW Constants in Fortran
+
+When creating plans in FFTW, a number of constants are used to specify
+options, such as @code{FFTW_MEASURE} or @code{FFTW_ESTIMATE}.  The
+same constants must be used with the wrapper routines, but of course the
+C header files where the constants are defined can't be incorporated
+directly into Fortran code.
+
+Instead, we have placed Fortran equivalents of the FFTW constant
+definitions in the file @code{fftw3.f}, which can be found in the same
+directory as @code{fftw3.h}.  If your Fortran compiler supports a
+preprocessor of some sort, you should be able to @code{include} or
+@code{#include} this file; otherwise, you can paste it directly into
+your code.
+
+@cindex flags
+In C, you combine different flags (like @code{FFTW_PRESERVE_INPUT} and
+@code{FFTW_MEASURE}) using the @samp{@code{|}} operator; in Fortran
+you should just use @samp{@code{+}}.  (Take care not to add in the
+same flag more than once, though.  Alternatively, you can use the
+@code{ior} intrinsic function standardized in Fortran 95.)
+
+@c -------------------------------------------------------
+@node  FFTW Execution in Fortran, Fortran Examples, FFTW Constants in Fortran, Calling FFTW from Legacy Fortran
+@section FFTW Execution in Fortran
+
+In C, in order to use a plan, one normally calls @code{fftw_execute},
+which executes the plan to perform the transform on the input/output
+arrays passed when the plan was created (@pxref{Using Plans}).  The
+corresponding subroutine call in legacy Fortran is:
+@example
+        call dfftw_execute(plan)
+@end example
+@findex dfftw_execute
+
+However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+@code{dfftw_execute}, the semantics of Fortran (unlike C) allow the
+compiler to assume that the input/output arrays are not changed by
+@code{dfftw_execute}.  As a consequence, certain compilers end up
+optimizing out or repositioning the call to @code{dfftw_execute},
+assuming incorrectly that it does nothing.
+
+There are various workarounds to this, but the safest and simplest
+thing is to not use @code{dfftw_execute} in Fortran.  Instead, use the
+functions described in @ref{New-array Execute Functions}, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays
+@code{in} and @code{out}, you would do:
+@example
+        call dfftw_execute_dft(plan, in, out)
+@end example
+@findex dfftw_execute_dft
+
+There are a few things to be careful of, however:
+
+@itemize @bullet
+
+@item
+You must use the correct type of execute function, matching the way
+the plan was created.  Complex DFT plans should use
+@code{dfftw_execute_dft}, Real-input (r2c) DFT plans should use use
+@code{dfftw_execute_dft_r2c}, and real-output (c2r) DFT plans should
+use @code{dfftw_execute_dft_c2r}.  The various r2r plans should use
+@code{dfftw_execute_r2r}.
+
+@item
+You should normally pass the same input/output arrays that were used when
+creating the plan.  This is always safe.
+
+@item
+@emph{If} you pass @emph{different} input/output arrays compared to
+those used when creating the plan, you must abide by all the
+restrictions of the new-array execute functions (@pxref{New-array
+Execute Functions}).  The most difficult of these, in Fortran, is the
+requirement that the new arrays have the same alignment as the
+original arrays, because there seems to be no way in legacy Fortran to obtain
+guaranteed-aligned arrays (analogous to @code{fftw_malloc} in C).  You
+can, of course, use the @code{FFTW_UNALIGNED} flag when creating the
+plan, in which case the plan does not depend on the alignment, but
+this may sacrifice substantial performance on architectures (like x86)
+with SIMD instructions (@pxref{SIMD alignment and fftw_malloc}).
+@ctindex FFTW_UNALIGNED
+
+@end itemize
+
+@c -------------------------------------------------------
+@node Fortran Examples, Wisdom of Fortran?, FFTW Execution in Fortran, Calling FFTW from Legacy Fortran
+@section Fortran Examples
+
+In C, you might have something like the following to transform a
+one-dimensional complex array:
+
+@example
+        fftw_complex in[N], out[N];
+        fftw_plan plan;
+
+        plan = fftw_plan_dft_1d(N,in,out,FFTW_FORWARD,FFTW_ESTIMATE);
+        fftw_execute(plan);
+        fftw_destroy_plan(plan);
+@end example
+
+In Fortran, you would use the following to accomplish the same thing:
+
+@example
+        double complex in, out
+        dimension in(N), out(N)
+        integer*8 plan
+
+        call dfftw_plan_dft_1d(plan,N,in,out,FFTW_FORWARD,FFTW_ESTIMATE)
+        call dfftw_execute_dft(plan, in, out)
+        call dfftw_destroy_plan(plan)
+@end example
+@findex dfftw_plan_dft_1d
+@findex dfftw_execute_dft
+@findex dfftw_destroy_plan
+
+Notice how all routines are called as Fortran subroutines, and the
+plan is returned via the first argument to @code{dfftw_plan_dft_1d}.
+Notice also that we changed @code{fftw_execute} to
+@code{dfftw_execute_dft} (@pxref{FFTW Execution in Fortran}).  To do
+the same thing, but using 8 threads in parallel (@pxref{Multi-threaded
+FFTW}), you would simply prefix these calls with:
+
+@example
+        integer iret
+        call dfftw_init_threads(iret)
+        call dfftw_plan_with_nthreads(8)
+@end example
+@findex dfftw_init_threads
+@findex dfftw_plan_with_nthreads
+
+(You might want to check the value of @code{iret}: if it is zero, it
+indicates an unlikely error during thread initialization.)
+
+To transform a three-dimensional array in-place with C, you might do:
+
+@example
+        fftw_complex arr[L][M][N];
+        fftw_plan plan;
+
+        plan = fftw_plan_dft_3d(L,M,N, arr,arr,
+                                FFTW_FORWARD, FFTW_ESTIMATE);
+        fftw_execute(plan);
+        fftw_destroy_plan(plan);
+@end example
+
+In Fortran, you would use this instead:
+
+@example
+        double complex arr
+        dimension arr(L,M,N)
+        integer*8 plan
+
+        call dfftw_plan_dft_3d(plan, L,M,N, arr,arr,
+       &                       FFTW_FORWARD, FFTW_ESTIMATE)
+        call dfftw_execute_dft(plan, arr, arr)
+        call dfftw_destroy_plan(plan)
+@end example
+@findex dfftw_plan_dft_3d
+
+Note that we pass the array dimensions in the ``natural'' order in both C
+and Fortran.
+
+To transform a one-dimensional real array in Fortran, you might do:
+
+@example
+        double precision in
+        dimension in(N)
+        double complex out
+        dimension out(N/2 + 1)
+        integer*8 plan
+
+        call dfftw_plan_dft_r2c_1d(plan,N,in,out,FFTW_ESTIMATE)
+        call dfftw_execute_dft_r2c(plan, in, out)
+        call dfftw_destroy_plan(plan)
+@end example
+@findex dfftw_plan_dft_r2c_1d
+@findex dfftw_execute_dft_r2c
+
+To transform a two-dimensional real array, out of place, you might use
+the following:
+
+@example
+        double precision in
+        dimension in(M,N)
+        double complex out
+        dimension out(M/2 + 1, N)
+        integer*8 plan
+
+        call dfftw_plan_dft_r2c_2d(plan,M,N,in,out,FFTW_ESTIMATE)
+        call dfftw_execute_dft_r2c(plan, in, out)
+        call dfftw_destroy_plan(plan)
+@end example
+@findex dfftw_plan_dft_r2c_2d
+
+@strong{Important:} Notice that it is the @emph{first} dimension of the
+complex output array that is cut in half in Fortran, rather than the
+last dimension as in C.  This is a consequence of the interface routines
+reversing the order of the array dimensions passed to FFTW so that the
+Fortran program can use its ordinary column-major order.
+@cindex column-major
+@cindex r2c/c2r multi-dimensional array format
+
+@c -------------------------------------------------------
+@node Wisdom of Fortran?,  , Fortran Examples, Calling FFTW from Legacy Fortran
+@section Wisdom of Fortran?
+
+In this section, we discuss how one can import/export FFTW wisdom
+(saved plans) to/from a Fortran program; we assume that the reader is
+already familiar with wisdom, as described in @ref{Words of
+Wisdom-Saving Plans}.
+
+@cindex portability
+The basic problem is that is difficult to (portably) pass files and
+strings between Fortran and C, so we cannot provide a direct Fortran
+equivalent to the @code{fftw_export_wisdom_to_file}, etcetera,
+functions.  Fortran interfaces @emph{are} provided for the functions
+that do not take file/string arguments, however:
+@code{dfftw_import_system_wisdom}, @code{dfftw_import_wisdom},
+@code{dfftw_export_wisdom}, and @code{dfftw_forget_wisdom}.
+@findex dfftw_import_system_wisdom
+@findex dfftw_import_wisdom
+@findex dfftw_export_wisdom
+@findex dfftw_forget_wisdom
+
+
+So, for example, to import the system-wide wisdom, you would do:
+
+@example
+        integer isuccess
+        call dfftw_import_system_wisdom(isuccess)
+@end example
+
+As usual, the C return value is turned into a first parameter;
+@code{isuccess} is non-zero on success and zero on failure (e.g. if
+there is no system wisdom installed).
+
+If you want to import/export wisdom from/to an arbitrary file or
+elsewhere, you can employ the generic @code{dfftw_import_wisdom} and
+@code{dfftw_export_wisdom} functions, for which you must supply a
+subroutine to read/write one character at a time.  The FFTW package
+contains an example file @code{doc/f77_wisdom.f} demonstrating how to
+implement @code{import_wisdom_from_file} and
+@code{export_wisdom_to_file} subroutines in this way.  (These routines
+cannot be compiled into the FFTW library itself, lest all FFTW-using
+programs be required to link with the Fortran I/O library.)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/license.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/license.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,38 @@
+@node License and Copyright, Concept Index, Acknowledgments, Top
+@chapter License and Copyright
+
+FFTW is Copyright @copyright{} 2003, 2007-11 Matteo Frigo, Copyright
+@copyright{} 2003, 2007-11 Massachusetts Institute of Technology.
+
+FFTW is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA You can also
+find the @uref{http://www.gnu.org/licenses/gpl-2.0.html, GPL on the GNU
+web site}.
+
+In addition, we kindly ask you to acknowledge FFTW and its authors in
+any program or publication in which you use FFTW.  (You are not
+@emph{required} to do so; it is up to your common sense to decide
+whether you want to comply with this request or not.)  For general
+publications, we suggest referencing: Matteo Frigo and Steven
+G. Johnson, ``The design and implementation of FFTW3,''
+@i{Proc. IEEE} @b{93} (2), 216--231 (2005).
+
+Non-free versions of FFTW are available under terms different from those
+of the General Public License. (e.g. they do not require you to
+accompany any object code using FFTW with the corresponding source
+code.)  For these alternative terms you must purchase a license from MIT's
+Technology Licensing Office.  Users interested in such a license should
+contact us (@email{fftw@@fftw.org}) for more information.
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/mdate-sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/mdate-sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,225 @@
+#!/bin/sh
+# Get modification time of a file or directory and pretty-print it.
+
+scriptversion=2010-08-21.06; # UTC
+
+# Copyright (C) 1995, 1996, 1997, 2003, 2004, 2005, 2007, 2009, 2010
+# Free Software Foundation, Inc.
+# written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, June 1995
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+fi
+
+case $1 in
+  '')
+     echo "$0: No file.  Try \`$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: mdate-sh [--help] [--version] FILE
+
+Pretty-print the modification day of FILE, in the format:
+1 January 1970
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "mdate-sh $scriptversion"
+    exit $?
+    ;;
+esac
+
+error ()
+{
+  echo "$0: $1" >&2
+  exit 1
+}
+
+
+# Prevent date giving response in another language.
+LANG=C
+export LANG
+LC_ALL=C
+export LC_ALL
+LC_TIME=C
+export LC_TIME
+
+# GNU ls changes its time format in response to the TIME_STYLE
+# variable.  Since we cannot assume `unset' works, revert this
+# variable to its documented default.
+if test "${TIME_STYLE+set}" = set; then
+  TIME_STYLE=posix-long-iso
+  export TIME_STYLE
+fi
+
+save_arg1=$1
+
+# Find out how to get the extended ls output of a file or directory.
+if ls -L /dev/null 1>/dev/null 2>&1; then
+  ls_command='ls -L -l -d'
+else
+  ls_command='ls -l -d'
+fi
+# Avoid user/group names that might have spaces, when possible.
+if ls -n /dev/null 1>/dev/null 2>&1; then
+  ls_command="$ls_command -n"
+fi
+
+# A `ls -l' line looks as follows on OS/2.
+#  drwxrwx---        0 Aug 11  2001 foo
+# This differs from Unix, which adds ownership information.
+#  drwxrwx---   2 root  root      4096 Aug 11  2001 foo
+#
+# To find the date, we split the line on spaces and iterate on words
+# until we find a month.  This cannot work with files whose owner is a
+# user named `Jan', or `Feb', etc.  However, it's unlikely that `/'
+# will be owned by a user whose name is a month.  So we first look at
+# the extended ls output of the root directory to decide how many
+# words should be skipped to get the date.
+
+# On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below.
+set x`$ls_command /`
+
+# Find which argument is the month.
+month=
+command=
+until test $month
+do
+  test $# -gt 0 || error "failed parsing \`$ls_command /' output"
+  shift
+  # Add another shift to the command.
+  command="$command shift;"
+  case $1 in
+    Jan) month=January; nummonth=1;;
+    Feb) month=February; nummonth=2;;
+    Mar) month=March; nummonth=3;;
+    Apr) month=April; nummonth=4;;
+    May) month=May; nummonth=5;;
+    Jun) month=June; nummonth=6;;
+    Jul) month=July; nummonth=7;;
+    Aug) month=August; nummonth=8;;
+    Sep) month=September; nummonth=9;;
+    Oct) month=October; nummonth=10;;
+    Nov) month=November; nummonth=11;;
+    Dec) month=December; nummonth=12;;
+  esac
+done
+
+test -n "$month" || error "failed parsing \`$ls_command /' output"
+
+# Get the extended ls output of the file or directory.
+set dummy x`eval "$ls_command \"\\\$save_arg1\""`
+
+# Remove all preceding arguments
+eval $command
+
+# Because of the dummy argument above, month is in $2.
+#
+# On a POSIX system, we should have
+#
+# $# = 5
+# $1 = file size
+# $2 = month
+# $3 = day
+# $4 = year or time
+# $5 = filename
+#
+# On Darwin 7.7.0 and 7.6.0, we have
+#
+# $# = 4
+# $1 = day
+# $2 = month
+# $3 = year or time
+# $4 = filename
+
+# Get the month.
+case $2 in
+  Jan) month=January; nummonth=1;;
+  Feb) month=February; nummonth=2;;
+  Mar) month=March; nummonth=3;;
+  Apr) month=April; nummonth=4;;
+  May) month=May; nummonth=5;;
+  Jun) month=June; nummonth=6;;
+  Jul) month=July; nummonth=7;;
+  Aug) month=August; nummonth=8;;
+  Sep) month=September; nummonth=9;;
+  Oct) month=October; nummonth=10;;
+  Nov) month=November; nummonth=11;;
+  Dec) month=December; nummonth=12;;
+esac
+
+case $3 in
+  ???*) day=$1;;
+  *) day=$3; shift;;
+esac
+
+# Here we have to deal with the problem that the ls output gives either
+# the time of day or the year.
+case $3 in
+  *:*) set `date`; eval year=\$$#
+       case $2 in
+	 Jan) nummonthtod=1;;
+	 Feb) nummonthtod=2;;
+	 Mar) nummonthtod=3;;
+	 Apr) nummonthtod=4;;
+	 May) nummonthtod=5;;
+	 Jun) nummonthtod=6;;
+	 Jul) nummonthtod=7;;
+	 Aug) nummonthtod=8;;
+	 Sep) nummonthtod=9;;
+	 Oct) nummonthtod=10;;
+	 Nov) nummonthtod=11;;
+	 Dec) nummonthtod=12;;
+       esac
+       # For the first six month of the year the time notation can also
+       # be used for files modified in the last year.
+       if (expr $nummonth \> $nummonthtod) > /dev/null;
+       then
+	 year=`expr $year - 1`
+       fi;;
+  *) year=$3;;
+esac
+
+# The result.
+echo $day $month $year
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/modern-fortran.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/modern-fortran.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,725 @@
+@node Calling FFTW from Modern Fortran, Calling FFTW from Legacy Fortran, Distributed-memory FFTW with MPI, Top
+@chapter Calling FFTW from Modern Fortran
+@cindex Fortran interface
+
+Fortran 2003 standardized ways for Fortran code to call C libraries,
+and this allows us to support a direct translation of the FFTW C API
+into Fortran.  Compared to the legacy Fortran 77 interface
+(@pxref{Calling FFTW from Legacy Fortran}), this direct interface
+offers many advantages, especially compile-time type-checking and
+aligned memory allocation.  As of this writing, support for these C
+interoperability features seems widespread, having been implemented in
+nearly all major Fortran compilers (e.g. GNU, Intel, IBM,
+Oracle/Solaris, Portland Group, NAG).
+@cindex portability
+
+This chapter documents that interface.  For the most part, since this
+interface allows Fortran to call the C interface directly, the usage
+is identical to C translated to Fortran syntax.  However, there are a
+few subtle points such as memory allocation, wisdom, and data types
+that deserve closer attention.
+
+@menu
+* Overview of Fortran interface::  
+* Reversing array dimensions::  
+* FFTW Fortran type reference::  
+* Plan execution in Fortran::   
+* Allocating aligned memory in Fortran::  
+* Accessing the wisdom API from Fortran::  
+* Defining an FFTW module::     
+@end menu
+
+@c -------------------------------------------------------
+@node Overview of Fortran interface, Reversing array dimensions, Calling FFTW from Modern Fortran, Calling FFTW from Modern Fortran
+@section Overview of Fortran interface
+
+FFTW provides a file @code{fftw3.f03} that defines Fortran 2003
+interfaces for all of its C routines, except for the MPI routines
+described elsewhere, which can be found in the same directory as
+@code{fftw3.h} (the C header file).  In any Fortran subroutine where
+you want to use FFTW functions, you should begin with:
+
+@cindex iso_c_binding
+@example
+  use, intrinsic :: iso_c_binding 
+  include 'fftw3.f03'
+@end example
+
+This includes the interface definitions and the standard
+@code{iso_c_binding} module (which defines the equivalents of C
+types).  You can also put the FFTW functions into a module if you
+prefer (@pxref{Defining an FFTW module}).
+
+At this point, you can now call anything in the FFTW C interface
+directly, almost exactly as in C other than minor changes in syntax.
+For example:
+
+@findex fftw_plan_dft_2d
+@findex fftw_execute_dft
+@findex fftw_destroy_plan
+@example
+  type(C_PTR) :: plan
+  complex(C_DOUBLE_COMPLEX), dimension(1024,1000) :: in, out
+  plan = fftw_plan_dft_2d(1000,1024, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+  ...
+  call fftw_execute_dft(plan, in, out)
+  ...
+  call fftw_destroy_plan(plan)
+@end example
+
+A few important things to keep in mind are:
+
+@itemize @bullet
+
+@item
+@tindex fftw_complex
+@ctindex C_PTR
+@ctindex C_INT
+@ctindex C_DOUBLE
+@ctindex C_DOUBLE_COMPLEX
+FFTW plans are @code{type(C_PTR)}.  Other C types are mapped in the
+obvious way via the @code{iso_c_binding} standard: @code{int} turns
+into @code{integer(C_INT)}, @code{fftw_complex} turns into
+@code{complex(C_DOUBLE_COMPLEX)}, @code{double} turns into
+@code{real(C_DOUBLE)}, and so on. @xref{FFTW Fortran type reference}.
+
+@item
+Functions in C become functions in Fortran if they have a return value,
+and subroutines in Fortran otherwise.
+
+@item
+The ordering of the Fortran array dimensions must be @emph{reversed}
+when they are passed to the FFTW plan creation, thanks to differences
+in array indexing conventions (@pxref{Multi-dimensional Array
+Format}).  This is @emph{unlike} the legacy Fortran interface
+(@pxref{Fortran-interface routines}), which reversed the dimensions
+for you.  @xref{Reversing array dimensions}.
+
+@item
+@cindex alignment
+@cindex SIMD
+Using ordinary Fortran array declarations like this works, but may
+yield suboptimal performance because the data may not be not aligned
+to exploit SIMD instructions on modern proessors (@pxref{SIMD
+alignment and fftw_malloc}). Better performance will often be obtained
+by allocating with @samp{fftw_alloc}. @xref{Allocating aligned memory
+in Fortran}.
+
+@item
+@findex fftw_execute
+Similar to the legacy Fortran interface (@pxref{FFTW Execution in
+Fortran}), we currently recommend @emph{not} using @code{fftw_execute}
+but rather using the more specialized functions like
+@code{fftw_execute_dft} (@pxref{New-array Execute Functions}).  
+However, you should execute the plan on the @code{same arrays} as the
+ones for which you created the plan, unless you are especially
+careful.  @xref{Plan execution in Fortran}.  To prevent
+you from using @code{fftw_execute} by mistake, the @code{fftw3.f03}
+file does not provide an @code{fftw_execute} interface declaration.
+
+@item
+@cindex flags
+Multiple planner flags are combined with @code{ior} (equivalent to @samp{|} in C).  e.g. @code{FFTW_MEASURE | FFTW_DESTROY_INPUT} becomes @code{ior(FFTW_MEASURE, FFTW_DESTROY_INPUT)}.  (You can also use @samp{+} as long as you don't try to include a given flag more than once.)
+
+@end itemize
+
+@menu
+* Extended and quadruple precision in Fortran::  
+@end menu
+
+@node Extended and quadruple precision in Fortran,  , Overview of Fortran interface, Overview of Fortran interface
+@subsection Extended and quadruple precision in Fortran
+@cindex precision
+
+If FFTW is compiled in @code{long double} (extended) precision
+(@pxref{Installation and Customization}), you may be able to call the
+resulting @code{fftwl_} routines (@pxref{Precision}) from Fortran if
+your compiler supports the @code{C_LONG_DOUBLE_COMPLEX} type code.
+
+Because some Fortran compilers do not support
+@code{C_LONG_DOUBLE_COMPLEX}, the @code{fftwl_} declarations are
+segregated into a separate interface file @code{fftw3l.f03}, which you
+should include @emph{in addition} to @code{fftw3.f03} (which declares
+precision-independent @samp{FFTW_} constants):
+
+@cindex iso_c_binding
+@example
+  use, intrinsic :: iso_c_binding 
+  include 'fftw3.f03'
+  include 'fftw3l.f03'
+@end example
+
+We also support using the nonstandard @code{__float128}
+quadruple-precision type provided by recent versions of @code{gcc} on
+32- and 64-bit x86 hardware (@pxref{Installation and Customization}),
+using the corresponding @code{real(16)} and @code{complex(16)} types
+supported by @code{gfortran}.  The quadruple-precision @samp{fftwq_}
+functions (@pxref{Precision}) are declared in a @code{fftw3q.f03}
+interface file, which should be included in addition to
+@code{fftw3l.f03}, as above.  You should also link with
+@code{-lfftw3q -lquadmath -lm} as in C.
+
+@c -------------------------------------------------------
+@node Reversing array dimensions, FFTW Fortran type reference, Overview of Fortran interface, Calling FFTW from Modern Fortran
+@section Reversing array dimensions
+
+@cindex row-major
+@cindex column-major
+A minor annoyance in calling FFTW from Fortran is that FFTW's array
+dimensions are defined in the C convention (row-major order), while
+Fortran's array dimensions are the opposite convention (column-major
+order). @xref{Multi-dimensional Array Format}.  This is just a
+bookkeeping difference, with no effect on performance.  The only
+consequence of this is that, whenever you create an FFTW plan for a
+multi-dimensional transform, you must always @emph{reverse the
+ordering of the dimensions}.
+
+For example, consider the three-dimensional (@threedims{L,M,N}) arrays:
+
+@example
+  complex(C_DOUBLE_COMPLEX), dimension(L,M,N) :: in, out
+@end example
+
+To plan a DFT for these arrays using @code{fftw_plan_dft_3d}, you could do:
+
+@findex fftw_plan_dft_3d
+@example
+  plan = fftw_plan_dft_3d(N,M,L, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+@end example
+
+That is, from FFTW's perspective this is a @threedims{N,M,L} array.
+@emph{No data transposition need occur}, as this is @emph{only
+notation}.  Similarly, to use the more generic routine
+@code{fftw_plan_dft} with the same arrays, you could do:
+
+@example
+  integer(C_INT), dimension(3) :: n = [N,M,L]
+  plan = fftw_plan_dft_3d(3, n, in,out, FFTW_FORWARD,FFTW_ESTIMATE)
+@end example
+
+Note, by the way, that this is different from the legacy Fortran
+interface (@pxref{Fortran-interface routines}), which automatically
+reverses the order of the array dimension for you.  Here, you are
+calling the C interface directly, so there is no ``translation'' layer.
+
+@cindex r2c/c2r multi-dimensional array format
+An important thing to keep in mind is the implication of this for
+multidimensional real-to-complex transforms (@pxref{Multi-Dimensional
+DFTs of Real Data}).  In C, a multidimensional real-to-complex DFT
+chops the last dimension roughly in half (@threedims{N,M,L} real input
+goes to @threedims{N,M,L/2+1} complex output).  In Fortran, because
+the array dimension notation is reversed, the @emph{first} dimension of
+the complex data is chopped roughly in half.  For example consider the
+@samp{r2c} transform of @threedims{L,M,N} real input in Fortran:
+
+@findex fftw_plan_dft_r2c_3d
+@findex fftw_execute_dft_r2c
+@example
+  type(C_PTR) :: plan
+  real(C_DOUBLE), dimension(L,M,N) :: in
+  complex(C_DOUBLE_COMPLEX), dimension(L/2+1,M,N) :: out
+  plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+  ...
+  call fftw_execute_dft_r2c(plan, in, out)
+@end example
+
+@cindex in-place
+@cindex padding
+Alternatively, for an in-place r2c transform, as described in the C
+documentation we must @emph{pad} the @emph{first} dimension of the
+real input with an extra two entries (which are ignored by FFTW) so as
+to leave enough space for the complex output. The input is
+@emph{allocated} as a @threedims{2[L/2+1],M,N} array, even though only
+@threedims{L,M,N} of it is actually used.  In this example, we will
+allocate the array as a pointer type, using @samp{fftw_alloc} to
+ensure aligned memory for maximum performance (@pxref{Allocating
+aligned memory in Fortran}); this also makes it easy to reference the
+same memory as both a real array and a complex array.
+
+@findex fftw_alloc_complex
+@findex c_f_pointer
+@example
+  real(C_DOUBLE), pointer :: in(:,:,:)
+  complex(C_DOUBLE_COMPLEX), pointer :: out(:,:,:)
+  type(C_PTR) :: plan, data
+  data = fftw_alloc_complex(int((L/2+1) * M * N, C_SIZE_T))
+  call c_f_pointer(data, in, [2*(L/2+1),M,N])
+  call c_f_pointer(data, out, [L/2+1,M,N])
+  plan = fftw_plan_dft_r2c_3d(N,M,L, in,out, FFTW_ESTIMATE)
+  ...
+  call fftw_execute_dft_r2c(plan, in, out)
+  ...
+  call fftw_destroy_plan(plan)
+  call fftw_free(data)
+@end example
+
+@c -------------------------------------------------------
+@node FFTW Fortran type reference, Plan execution in Fortran, Reversing array dimensions, Calling FFTW from Modern Fortran
+@section FFTW Fortran type reference
+
+The following are the most important type correspondences between the
+C interface and Fortran:
+
+@itemize @bullet
+
+@item
+@tindex fftw_plan
+Plans (@code{fftw_plan} and variants) are @code{type(C_PTR)} (i.e. an
+opaque pointer).
+
+@item
+@tindex fftw_complex
+@cindex precision
+@ctindex C_DOUBLE
+@ctindex C_FLOAT
+@ctindex C_LONG_DOUBLE
+@ctindex C_DOUBLE_COMPLEX
+@ctindex C_FLOAT_COMPLEX
+@ctindex C_LONG_DOUBLE_COMPLEX
+The C floating-point types @code{double}, @code{float}, and @code{long
+double} correspond to @code{real(C_DOUBLE)}, @code{real(C_FLOAT)}, and
+@code{real(C_LONG_DOUBLE)}, respectively.  The C complex types
+@code{fftw_complex}, @code{fftwf_complex}, and @code{fftwl_complex}
+correspond in Fortran to @code{complex(C_DOUBLE_COMPLEX)},
+@code{complex(C_FLOAT_COMPLEX)}, and
+@code{complex(C_LONG_DOUBLE_COMPLEX)}, respectively.  
+Just as in C
+(@pxref{Precision}), the FFTW subroutines and types are prefixed with
+@samp{fftw_}, @code{fftwf_}, and @code{fftwl_} for the different precisions, and link to different libraries (@code{-lfftw3}, @code{-lfftw3f}, and @code{-lfftw3l} on Unix), but use the @emph{same} include file @code{fftw3.f03} and the @emph{same} constants (all of which begin with @samp{FFTW_}).  The exception is @code{long double} precision, for which you should @emph{also} include @code{fftw3l.f03} (@pxref{Extended and quadruple precision in Fortran}).
+
+@item
+@tindex ptrdiff_t
+@ctindex C_INT
+@ctindex C_INTPTR_T
+@ctindex C_SIZE_T
+@findex fftw_malloc
+The C integer types @code{int} and @code{unsigned} (used for planner
+flags) become @code{integer(C_INT)}.  The C integer type @code{ptrdiff_t} (e.g. in the @ref{64-bit Guru Interface}) becomes @code{integer(C_INTPTR_T)}, and @code{size_t} (in @code{fftw_malloc} etc.) becomes @code{integer(C_SIZE_T)}.
+
+@item
+@tindex fftw_r2r_kind
+@ctindex C_FFTW_R2R_KIND
+The @code{fftw_r2r_kind} type (@pxref{Real-to-Real Transform Kinds})
+becomes @code{integer(C_FFTW_R2R_KIND)}.  The various constant values
+of the C enumerated type (@code{FFTW_R2HC} etc.) become simply integer
+constants of the same names in Fortran.
+
+@item
+@ctindex FFTW_DESTROY_INPUT
+@cindex in-place
+@findex fftw_flops
+Numeric array pointer arguments (e.g. @code{double *})
+become @code{dimension(*), intent(out)} arrays of the same type, or
+@code{dimension(*), intent(in)} if they are pointers to constant data
+(e.g. @code{const int *}).  There are a few exceptions where numeric
+pointers refer to scalar outputs (e.g. for @code{fftw_flops}), in which
+case they are @code{intent(out)} scalar arguments in Fortran too.
+For the new-array execute functions (@pxref{New-array Execute Functions}),
+the input arrays are declared @code{dimension(*), intent(inout)}, since
+they can be modified in the case of in-place or @code{FFTW_DESTROY_INPUT}
+transforms.
+
+@item
+@findex fftw_alloc_real
+@findex c_f_pointer
+Pointer @emph{return} values (e.g @code{double *}) become
+@code{type(C_PTR)}.  (If they are pointers to arrays, as for
+@code{fftw_alloc_real}, you can convert them back to Fortran array
+pointers with the standard intrinsic function @code{c_f_pointer}.)
+
+@item
+@cindex guru interface
+@tindex fftw_iodim
+@tindex fftw_iodim64
+@cindex 64-bit architecture
+The @code{fftw_iodim} type in the guru interface (@pxref{Guru vector
+and transform sizes}) becomes @code{type(fftw_iodim)} in Fortran, a
+derived data type (the Fortran analogue of C's @code{struct}) with
+three @code{integer(C_INT)} components: @code{n}, @code{is}, and
+@code{os}, with the same meanings as in C.  The @code{fftw_iodim64} type in the 64-bit guru interface (@pxref{64-bit Guru Interface}) is the same, except that its components are of type @code{integer(C_INTPTR_T)}.
+
+@item
+@ctindex C_FUNPTR
+Using the wisdom import/export functions from Fortran is a bit tricky,
+and is discussed in @ref{Accessing the wisdom API from Fortran}.  In
+brief, the @code{FILE *} arguments map to @code{type(C_PTR)}, @code{const char *} to @code{character(C_CHAR), dimension(*), intent(in)} (null-terminated!), and the generic read-char/write-char functions map to @code{type(C_FUNPTR)}.
+
+@end itemize
+
+@cindex portability
+You may be wondering if you need to search-and-replace
+@code{real(kind(0.0d0))} (or whatever your favorite Fortran spelling
+of ``double precision'' is) with @code{real(C_DOUBLE)} everywhere in
+your program, and similarly for @code{complex} and @code{integer}
+types.  The answer is no; you can still use your existing types.  As
+long as these types match their C counterparts, things should work
+without a hitch.  The worst that can happen, e.g. in the (unlikely)
+event of a system where @code{real(kind(0.0d0))} is different from
+@code{real(C_DOUBLE)}, is that the compiler will give you a
+type-mismatch error.  That is, if you don't use the
+@code{iso_c_binding} kinds you need to accept at least the theoretical
+possibility of having to change your code in response to compiler
+errors on some future machine, but you don't need to worry about
+silently compiling incorrect code that yields runtime errors.
+
+@c -------------------------------------------------------
+@node Plan execution in Fortran, Allocating aligned memory in Fortran, FFTW Fortran type reference, Calling FFTW from Modern Fortran
+@section Plan execution in Fortran
+
+In C, in order to use a plan, one normally calls @code{fftw_execute},
+which executes the plan to perform the transform on the input/output
+arrays passed when the plan was created (@pxref{Using Plans}).  The
+corresponding subroutine call in modern Fortran is:
+@example
+ call fftw_execute(plan)
+@end example
+@findex fftw_execute
+
+However, we have had reports that this causes problems with some
+recent optimizing Fortran compilers.  The problem is, because the
+input/output arrays are not passed as explicit arguments to
+@code{fftw_execute}, the semantics of Fortran (unlike C) allow the
+compiler to assume that the input/output arrays are not changed by
+@code{fftw_execute}.  As a consequence, certain compilers end up
+repositioning the call to @code{fftw_execute}, assuming incorrectly
+that it does nothing to the arrays.
+
+There are various workarounds to this, but the safest and simplest
+thing is to not use @code{fftw_execute} in Fortran.  Instead, use the
+functions described in @ref{New-array Execute Functions}, which take
+the input/output arrays as explicit arguments.  For example, if the
+plan is for a complex-data DFT and was created for the arrays
+@code{in} and @code{out}, you would do:
+@example
+ call fftw_execute_dft(plan, in, out)
+@end example
+@findex fftw_execute_dft
+
+There are a few things to be careful of, however:
+
+@itemize @bullet
+
+@item
+@findex fftw_execute_dft_r2c
+@findex fftw_execute_dft_c2r
+@findex fftw_execute_r2r
+You must use the correct type of execute function, matching the way
+the plan was created.  Complex DFT plans should use
+@code{fftw_execute_dft}, Real-input (r2c) DFT plans should use use
+@code{fftw_execute_dft_r2c}, and real-output (c2r) DFT plans should
+use @code{fftw_execute_dft_c2r}.  The various r2r plans should use
+@code{fftw_execute_r2r}.  Fortunately, if you use the wrong one you
+will get a compile-time type-mismatch error (unlike legacy Fortran).
+
+@item
+You should normally pass the same input/output arrays that were used when
+creating the plan.  This is always safe.
+
+@item
+@emph{If} you pass @emph{different} input/output arrays compared to
+those used when creating the plan, you must abide by all the
+restrictions of the new-array execute functions (@pxref{New-array
+Execute Functions}).  The most tricky of these is the
+requirement that the new arrays have the same alignment as the
+original arrays; the best (and possibly only) way to guarantee this
+is to use the @samp{fftw_alloc} functions to allocate your arrays (@pxref{Allocating aligned memory in Fortran}). Alternatively, you can
+use the @code{FFTW_UNALIGNED} flag when creating the
+plan, in which case the plan does not depend on the alignment, but
+this may sacrifice substantial performance on architectures (like x86)
+with SIMD instructions (@pxref{SIMD alignment and fftw_malloc}).
+@ctindex FFTW_UNALIGNED
+
+@end itemize
+
+@c -------------------------------------------------------
+@node Allocating aligned memory in Fortran, Accessing the wisdom API from Fortran, Plan execution in Fortran, Calling FFTW from Modern Fortran
+@section Allocating aligned memory in Fortran
+
+@cindex alignment
+@findex fftw_alloc_real
+@findex fftw_alloc_complex
+In order to obtain maximum performance in FFTW, you should store your
+data in arrays that have been specially aligned in memory (@pxref{SIMD
+alignment and fftw_malloc}).  Enforcing alignment also permits you to
+safely use the new-array execute functions (@pxref{New-array Execute
+Functions}) to apply a given plan to more than one pair of in/out
+arrays.  Unfortunately, standard Fortran arrays do @emph{not} provide
+any alignment guarantees.  The @emph{only} way to allocate aligned
+memory in standard Fortran is to allocate it with an external C
+function, like the @code{fftw_alloc_real} and
+@code{fftw_alloc_complex} functions.  Fortunately, Fortran 2003 provides
+a simple way to associate such allocated memory with a standard Fortran
+array pointer that you can then use normally.
+
+We therefore recommend allocating all your input/output arrays using
+the following technique:
+
+@enumerate
+
+@item
+Declare a @code{pointer}, @code{arr}, to your array of the desired type
+and dimensions.  For example, @code{real(C_DOUBLE), pointer :: a(:,:)}
+for a 2d real array, or @code{complex(C_DOUBLE_COMPLEX), pointer ::
+a(:,:,:)} for a 3d complex array.
+
+@item
+The number of elements to allocate must be an
+@code{integer(C_SIZE_T)}.  You can either declare a variable of this
+type, e.g. @code{integer(C_SIZE_T) :: sz}, to store the number of
+elements to allocate, or you can use the @code{int(..., C_SIZE_T)}
+intrinsic function. e.g. set @code{sz = L * M * N} or use
+@code{int(L * M * N, C_SIZE_T)} for an @threedims{L,M,N} array.
+
+@item
+Declare a @code{type(C_PTR) :: p} to hold the return value from
+FFTW's allocation routine.  Set @code{p = fftw_alloc_real(sz)} for a real array, or @code{p = fftw_alloc_complex(sz)} for a complex array.
+
+@item
+@findex c_f_pointer
+Associate your pointer @code{arr} with the allocated memory @code{p}
+using the standard @code{c_f_pointer} subroutine: @code{call
+c_f_pointer(p, arr, [...dimensions...])}, where
+@code{[...dimensions...])} are an array of the dimensions of the array
+(in the usual Fortran order). e.g. @code{call c_f_pointer(p, arr,
+[L,M,N])} for an @threedims{L,M,N} array.  (Alternatively, you can
+omit the dimensions argument if you specified the shape explicitly
+when declaring @code{arr}.)  You can now use @code{arr} as a usual
+multidimensional array.
+
+@item
+When you are done using the array, deallocate the memory by @code{call
+fftw_free(p)} on @code{p}.
+
+@end enumerate
+
+For example, here is how we would allocate an @twodims{L,M} 2d real array:
+
+@example
+  real(C_DOUBLE), pointer :: arr(:,:)
+  type(C_PTR) :: p
+  p = fftw_alloc_real(int(L * M, C_SIZE_T))
+  call c_f_pointer(p, arr, [L,M])
+  @emph{...use arr and arr(i,j) as usual...}
+  call fftw_free(p)
+@end example
+
+and here is an @threedims{L,M,N} 3d complex array:
+
+@example
+  complex(C_DOUBLE_COMPLEX), pointer :: arr(:,:,:)
+  type(C_PTR) :: p
+  p = fftw_alloc_complex(int(L * M * N, C_SIZE_T))
+  call c_f_pointer(p, arr, [L,M,N])
+  @emph{...use arr and arr(i,j,k) as usual...}
+  call fftw_free(p)
+@end example
+
+See @ref{Reversing array dimensions} for an example allocating a
+single array and associating both real and complex array pointers with
+it, for in-place real-to-complex transforms.
+
+@c -------------------------------------------------------
+@node Accessing the wisdom API from Fortran, Defining an FFTW module, Allocating aligned memory in Fortran, Calling FFTW from Modern Fortran
+@section Accessing the wisdom API from Fortran
+@cindex wisdom
+@cindex saving plans to disk
+
+As explained in @ref{Words of Wisdom-Saving Plans}, FFTW provides a
+``wisdom'' API for saving plans to disk so that they can be recreated
+quickly.  The C API for exporting (@pxref{Wisdom Export}) and
+importing (@pxref{Wisdom Import}) wisdom is somewhat tricky to use
+from Fortran, however, because of differences in file I/O and string
+types between C and Fortran.
+
+@menu
+* Wisdom File Export/Import from Fortran::  
+* Wisdom String Export/Import from Fortran::  
+* Wisdom Generic Export/Import from Fortran::  
+@end menu
+
+@c =========>
+@node Wisdom File Export/Import from Fortran, Wisdom String Export/Import from Fortran, Accessing the wisdom API from Fortran, Accessing the wisdom API from Fortran
+@subsection Wisdom File Export/Import from Fortran
+
+@findex fftw_import wisdom_from_filename
+@findex fftw_export_wisdom_to_filename
+The easiest way to export and import wisdom is to do so using
+@code{fftw_export_wisdom_to_filename} and
+@code{fftw_wisdom_from_filename}.  The only trick is that these
+require you to pass a C string, which is an array of type
+@code{CHARACTER(C_CHAR)} that is terminated by @code{C_NULL_CHAR}.
+You can call them like this:
+
+@example
+  integer(C_INT) :: ret
+  ret = fftw_export_wisdom_to_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+  if (ret .eq. 0) stop 'error exporting wisdom to file'
+  ret = fftw_import_wisdom_from_filename(C_CHAR_'my_wisdom.dat' // C_NULL_CHAR)
+  if (ret .eq. 0) stop 'error importing wisdom from file'
+@end example
+
+Note that prepending @samp{C_CHAR_} is needed to specify that the
+literal string is of kind @code{C_CHAR}, and we null-terminate the
+string by appending @samp{// C_NULL_CHAR}.  These functions return an
+@code{integer(C_INT)} (@code{ret}) which is @code{0} if an error
+occurred during export/import and nonzero otherwise.
+
+It is also possible to use the lower-level routines
+@code{fftw_export_wisdom_to_file} and
+@code{fftw_import_wisdom_from_file}, which accept parameters of the C
+type @code{FILE*}, expressed in Fortran as @code{type(C_PTR)}.
+However, you are then responsible for creating the @code{FILE*}
+yourself.  You can do this by using @code{iso_c_binding} to define
+Fortran intefaces for the C library functions @code{fopen} and
+@code{fclose}, which is a bit strange in Fortran but workable.
+
+@c =========>
+@node Wisdom String Export/Import from Fortran, Wisdom Generic Export/Import from Fortran, Wisdom File Export/Import from Fortran, Accessing the wisdom API from Fortran
+@subsection Wisdom String Export/Import from Fortran
+
+@findex fftw_export_wisdom_to_string
+Dealing with FFTW's C string export/import is a bit more painful.  In
+particular, the @code{fftw_export_wisdom_to_string} function requires
+you to deal with a dynamically allocated C string.  To get its length,
+you must define an interface to the C @code{strlen} function, and to
+deallocate it you must define an interface to C @code{free}:
+
+@example
+  use, intrinsic :: iso_c_binding
+  interface
+    integer(C_INT) function strlen(s) bind(C, name='strlen')
+      import
+      type(C_PTR), value :: s
+    end function strlen
+    subroutine free(p) bind(C, name='free')
+      import
+      type(C_PTR), value :: p
+    end subroutine free
+  end interface
+@end example
+
+Given these definitions, you can then export wisdom to a Fortran
+character array:
+
+@example
+  character(C_CHAR), pointer :: s(:)
+  integer(C_SIZE_T) :: slen
+  type(C_PTR) :: p
+  p = fftw_export_wisdom_to_string()
+  if (.not. c_associated(p)) stop 'error exporting wisdom'
+  slen = strlen(p)
+  call c_f_pointer(p, s, [slen+1])
+  ...
+  call free(p)
+@end example
+@findex c_associated
+@findex c_f_pointer
+
+Note that @code{slen} is the length of the C string, but the length of
+the array is @code{slen+1} because it includes the terminating null
+character.  (You can omit the @samp{+1} if you don't want Fortran to
+know about the null character.) The standard @code{c_associated} function
+checks whether @code{p} is a null pointer, which is returned by
+@code{fftw_export_wisdom_to_string} if there was an error.
+
+@findex fftw_import_wisdom_from_string
+To import wisdom from a string, use
+@code{fftw_import_wisdom_from_string} as usual; note that the argument
+of this function must be a @code{character(C_CHAR)} that is terminated
+by the @code{C_NULL_CHAR} character, like the @code{s} array above.
+
+@c =========>
+@node Wisdom Generic Export/Import from Fortran,  , Wisdom String Export/Import from Fortran, Accessing the wisdom API from Fortran
+@subsection Wisdom Generic Export/Import from Fortran
+
+The most generic wisdom export/import functions allow you to provide
+an arbitrary callback function to read/write one character at a time
+in any way you want.  However, your callback function must be written
+in a special way, using the @code{bind(C)} attribute to be passed to a
+C interface.
+
+@findex fftw_export_wisdom
+In particular, to call the generic wisdom export function
+@code{fftw_export_wisdom}, you would write a callback subroutine of the form:
+
+@example
+  subroutine my_write_char(c, p) bind(C)
+    use, intrinsic :: iso_c_binding
+    character(C_CHAR), value :: c
+    type(C_PTR), value :: p
+    @emph{...write c...}
+  end subroutine my_write_char
+@end example
+
+Given such a subroutine (along with the corresponding interface definition), you could then export wisdom using:
+
+@findex c_funloc
+@example
+  call fftw_export_wisdom(c_funloc(my_write_char), p)
+@end example
+
+@findex c_loc
+@findex c_f_pointer
+The standard @code{c_funloc} intrinsic converts a Fortran
+@code{bind(C)} subroutine into a C function pointer.  The parameter
+@code{p} is a @code{type(C_PTR)} to any arbitrary data that you want
+to pass to @code{my_write_char} (or @code{C_NULL_PTR} if none).  (Note
+that you can get a C pointer to Fortran data using the intrinsic
+@code{c_loc}, and convert it back to a Fortran pointer in
+@code{my_write_char} using @code{c_f_pointer}.)
+
+Similarly, to use the generic @code{fftw_import_wisdom}, you would
+define a callback function of the form:
+
+@findex fftw_import_wisdom
+@example
+  integer(C_INT) function my_read_char(p) bind(C)
+    use, intrinsic :: iso_c_binding
+    type(C_PTR), value :: p
+    character :: c
+    @emph{...read a character c...}
+    my_read_char = ichar(c, C_INT)
+  end function my_read_char
+
+  ....
+
+  integer(C_INT) :: ret
+  ret = fftw_import_wisdom(c_funloc(my_read_char), p)
+  if (ret .eq. 0) stop 'error importing wisdom'
+@end example
+
+Your function can return @code{-1} if the end of the input is reached.
+Again, @code{p} is an arbitrary @code{type(C_PTR} that is passed
+through to your function.  @code{fftw_import_wisdom} returns @code{0}
+if an error occurred and nonzero otherwise.
+
+@c -------------------------------------------------------
+@node Defining an FFTW module,  , Accessing the wisdom API from Fortran, Calling FFTW from Modern Fortran
+@section Defining an FFTW module
+
+Rather than using the @code{include} statement to include the
+@code{fftw3.f03} interface file in any subroutine where you want to
+use FFTW, you might prefer to define an FFTW Fortran module.  FFTW
+does not install itself as a module, primarily because
+@code{fftw3.f03} can be shared between different Fortran compilers while
+modules (in general) cannot.  However, it is trivial to define your
+own FFTW module if you want.  Just create a file containing:
+
+@example
+  module FFTW3
+    use, intrinsic :: iso_c_binding
+    include 'fftw3.f03'
+  end module
+@end example
+
+Compile this file into a module as usual for your compiler (e.g. with
+@code{gfortran -c} you will get a file @code{fftw3.mod}).  Now,
+instead of @code{include 'fftw3.f03'}, whenever you want to use FFTW
+routines you can just do:
+
+@example
+  use FFTW3
+@end example
+
+as usual for Fortran modules.  (You still need to link to the FFTW
+library, of course.)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/mpi.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/mpi.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1768 @@
+@node Distributed-memory FFTW with MPI, Calling FFTW from Modern Fortran, Multi-threaded FFTW, Top
+@chapter Distributed-memory FFTW with MPI
+@cindex MPI
+
+@cindex parallel transform
+In this chapter we document the parallel FFTW routines for parallel
+systems supporting the MPI message-passing interface.  Unlike the
+shared-memory threads described in the previous chapter, MPI allows
+you to use @emph{distributed-memory} parallelism, where each CPU has
+its own separate memory, and which can scale up to clusters of many
+thousands of processors.  This capability comes at a price, however:
+each process only stores a @emph{portion} of the data to be
+transformed, which means that the data structures and
+programming-interface are quite different from the serial or threads
+versions of FFTW.
+@cindex data distribution
+
+
+Distributed-memory parallelism is especially useful when you are
+transforming arrays so large that they do not fit into the memory of a
+single processor.  The storage per-process required by FFTW's MPI
+routines is proportional to the total array size divided by the number
+of processes.  Conversely, distributed-memory parallelism can easily
+pose an unacceptably high communications overhead for small problems;
+the threshold problem size for which parallelism becomes advantageous
+will depend on the precise problem you are interested in, your
+hardware, and your MPI implementation.
+
+A note on terminology: in MPI, you divide the data among a set of
+``processes'' which each run in their own memory address space.
+Generally, each process runs on a different physical processor, but
+this is not required.  A set of processes in MPI is described by an
+opaque data structure called a ``communicator,'' the most common of
+which is the predefined communicator @code{MPI_COMM_WORLD} which
+refers to @emph{all} processes.  For more information on these and
+other concepts common to all MPI programs, we refer the reader to the
+documentation at @uref{http://www.mcs.anl.gov/research/projects/mpi/, the MPI home
+page}.
+@cindex MPI communicator
+@ctindex MPI_COMM_WORLD
+
+
+We assume in this chapter that the reader is familiar with the usage
+of the serial (uniprocessor) FFTW, and focus only on the concepts new
+to the MPI interface.
+
+@menu
+* FFTW MPI Installation::       
+* Linking and Initializing MPI FFTW::  
+* 2d MPI example::              
+* MPI Data Distribution::       
+* Multi-dimensional MPI DFTs of Real Data::  
+* Other Multi-dimensional Real-data MPI Transforms::  
+* FFTW MPI Transposes::         
+* FFTW MPI Wisdom::             
+* Avoiding MPI Deadlocks::      
+* FFTW MPI Performance Tips::   
+* Combining MPI and Threads::   
+* FFTW MPI Reference::          
+* FFTW MPI Fortran Interface::  
+@end menu
+
+@c ------------------------------------------------------------
+@node FFTW MPI Installation, Linking and Initializing MPI FFTW, Distributed-memory FFTW with MPI, Distributed-memory FFTW with MPI
+@section FFTW MPI Installation
+
+All of the FFTW MPI code is located in the @code{mpi} subdirectory of
+the FFTW package.  On Unix systems, the FFTW MPI libraries and header
+files are automatically configured, compiled, and installed along with
+the uniprocessor FFTW libraries simply by including
+@code{--enable-mpi} in the flags to the @code{configure} script
+(@pxref{Installation on Unix}).
+@fpindex configure
+
+
+Any implementation of the MPI standard, version 1 or later, should
+work with FFTW.  The @code{configure} script will attempt to
+automatically detect how to compile and link code using your MPI
+implementation.  In some cases, especially if you have multiple
+different MPI implementations installed or have an unusual MPI
+software package, you may need to provide this information explicitly.
+
+Most commonly, one compiles MPI code by invoking a special compiler
+command, typically @code{mpicc} for C code.  The @code{configure}
+script knows the most common names for this command, but you can
+specify the MPI compilation command explicitly by setting the
+@code{MPICC} variable, as in @samp{./configure MPICC=mpicc ...}.
+@fpindex mpicc
+
+
+If, instead of a special compiler command, you need to link a certain
+library, you can specify the link command via the @code{MPILIBS}
+variable, as in @samp{./configure MPILIBS=-lmpi ...}.  Note that if
+your MPI library is installed in a non-standard location (one the
+compiler does not know about by default), you may also have to specify
+the location of the library and header files via @code{LDFLAGS} and
+@code{CPPFLAGS} variables, respectively, as in @samp{./configure
+LDFLAGS=-L/path/to/mpi/libs CPPFLAGS=-I/path/to/mpi/include ...}.
+
+@c ------------------------------------------------------------
+@node Linking and Initializing MPI FFTW, 2d MPI example, FFTW MPI Installation, Distributed-memory FFTW with MPI
+@section Linking and Initializing MPI FFTW
+
+Programs using the MPI FFTW routines should be linked with
+@code{-lfftw3_mpi -lfftw3 -lm} on Unix in double precision,
+@code{-lfftw3f_mpi -lfftw3f -lm} in single precision, and so on
+(@pxref{Precision}). You will also need to link with whatever library
+is responsible for MPI on your system; in most MPI implementations,
+there is a special compiler alias named @code{mpicc} to compile and
+link MPI code.
+@fpindex mpicc
+@cindex linking on Unix
+@cindex precision
+
+
+@findex fftw_init_threads
+Before calling any FFTW routines except possibly
+@code{fftw_init_threads} (@pxref{Combining MPI and Threads}), but after calling
+@code{MPI_Init}, you should call the function:
+
+@example
+void fftw_mpi_init(void);
+@end example
+@findex fftw_mpi_init
+
+If, at the end of your program, you want to get rid of all memory and
+other resources allocated internally by FFTW, for both the serial and
+MPI routines, you can call:
+
+@example
+void fftw_mpi_cleanup(void);
+@end example
+@findex fftw_mpi_cleanup
+
+which is much like the @code{fftw_cleanup()} function except that it
+also gets rid of FFTW's MPI-related data.  You must @emph{not} execute
+any previously created plans after calling this function.
+
+@c ------------------------------------------------------------
+@node 2d MPI example, MPI Data Distribution, Linking and Initializing MPI FFTW, Distributed-memory FFTW with MPI
+@section 2d MPI example
+
+Before we document the FFTW MPI interface in detail, we begin with a
+simple example outlining how one would perform a two-dimensional
+@code{N0} by @code{N1} complex DFT. 
+
+@example
+#include <fftw3-mpi.h>
+
+int main(int argc, char **argv)
+@{
+    const ptrdiff_t N0 = ..., N1 = ...;
+    fftw_plan plan;
+    fftw_complex *data;
+    ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+
+    MPI_Init(&argc, &argv);
+    fftw_mpi_init();
+
+    /* @r{get local data size and allocate} */
+    alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
+                                         &local_n0, &local_0_start);
+    data = fftw_alloc_complex(alloc_local);
+
+    /* @r{create plan for in-place forward DFT} */
+    plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
+                                FFTW_FORWARD, FFTW_ESTIMATE);    
+
+    /* @r{initialize data to some function} my_function(x,y) */
+    for (i = 0; i < local_n0; ++i) for (j = 0; j < N1; ++j)
+       data[i*N1 + j] = my_function(local_0_start + i, j);
+
+    /* @r{compute transforms, in-place, as many times as desired} */
+    fftw_execute(plan);
+
+    fftw_destroy_plan(plan);
+
+    MPI_Finalize();
+@}
+@end example
+
+As can be seen above, the MPI interface follows the same basic style
+of allocate/plan/execute/destroy as the serial FFTW routines.  All of
+the MPI-specific routines are prefixed with @samp{fftw_mpi_} instead
+of @samp{fftw_}.  There are a few important differences, however:
+
+First, we must call @code{fftw_mpi_init()} after calling
+@code{MPI_Init} (required in all MPI programs) and before calling any
+other @samp{fftw_mpi_} routine.
+@findex MPI_Init
+@findex fftw_mpi_init
+
+
+Second, when we create the plan with @code{fftw_mpi_plan_dft_2d},
+analogous to @code{fftw_plan_dft_2d}, we pass an additional argument:
+the communicator, indicating which processes will participate in the
+transform (here @code{MPI_COMM_WORLD}, indicating all processes).
+Whenever you create, execute, or destroy a plan for an MPI transform,
+you must call the corresponding FFTW routine on @emph{all} processes
+in the communicator for that transform.  (That is, these are
+@emph{collective} calls.)  Note that the plan for the MPI transform
+uses the standard @code{fftw_execute} and @code{fftw_destroy} routines
+(on the other hand, there are MPI-specific new-array execute functions
+documented below).
+@cindex collective function
+@findex fftw_mpi_plan_dft_2d
+@ctindex MPI_COMM_WORLD
+
+
+Third, all of the FFTW MPI routines take @code{ptrdiff_t} arguments
+instead of @code{int} as for the serial FFTW.  @code{ptrdiff_t} is a
+standard C integer type which is (at least) 32 bits wide on a 32-bit
+machine and 64 bits wide on a 64-bit machine.  This is to make it easy
+to specify very large parallel transforms on a 64-bit machine.  (You
+can specify 64-bit transform sizes in the serial FFTW, too, but only
+by using the @samp{guru64} planner interface.  @xref{64-bit Guru
+Interface}.)
+@tindex ptrdiff_t
+@cindex 64-bit architecture
+
+
+Fourth, and most importantly, you don't allocate the entire
+two-dimensional array on each process.  Instead, you call
+@code{fftw_mpi_local_size_2d} to find out what @emph{portion} of the
+array resides on each processor, and how much space to allocate.
+Here, the portion of the array on each process is a @code{local_n0} by
+@code{N1} slice of the total array, starting at index
+@code{local_0_start}.  The total number of @code{fftw_complex} numbers
+to allocate is given by the @code{alloc_local} return value, which
+@emph{may} be greater than @code{local_n0 * N1} (in case some
+intermediate calculations require additional storage).  The data
+distribution in FFTW's MPI interface is described in more detail by
+the next section.
+@findex fftw_mpi_local_size_2d
+@cindex data distribution
+
+
+Given the portion of the array that resides on the local process, it
+is straightforward to initialize the data (here to a function
+@code{myfunction}) and otherwise manipulate it.  Of course, at the end
+of the program you may want to output the data somehow, but
+synchronizing this output is up to you and is beyond the scope of this
+manual.  (One good way to output a large multi-dimensional distributed
+array in MPI to a portable binary file is to use the free HDF5
+library; see the @uref{http://www.hdfgroup.org/, HDF home page}.)
+@cindex HDF5
+@cindex MPI I/O
+
+@c ------------------------------------------------------------
+@node MPI Data Distribution, Multi-dimensional MPI DFTs of Real Data, 2d MPI example, Distributed-memory FFTW with MPI
+@section MPI Data Distribution
+@cindex data distribution
+
+The most important concept to understand in using FFTW's MPI interface
+is the data distribution.  With a serial or multithreaded FFT, all of
+the inputs and outputs are stored as a single contiguous chunk of
+memory.  With a distributed-memory FFT, the inputs and outputs are
+broken into disjoint blocks, one per process.
+
+In particular, FFTW uses a @emph{1d block distribution} of the data,
+distributed along the @emph{first dimension}.  For example, if you
+want to perform a @twodims{100,200} complex DFT, distributed over 4
+processes, each process will get a @twodims{25,200} slice of the data.
+That is, process 0 will get rows 0 through 24, process 1 will get rows
+25 through 49, process 2 will get rows 50 through 74, and process 3
+will get rows 75 through 99.  If you take the same array but
+distribute it over 3 processes, then it is not evenly divisible so the
+different processes will have unequal chunks.  FFTW's default choice
+in this case is to assign 34 rows to processes 0 and 1, and 32 rows to
+process 2.
+@cindex block distribution
+
+
+FFTW provides several @samp{fftw_mpi_local_size} routines that you can
+call to find out what portion of an array is stored on the current
+process.  In most cases, you should use the default block sizes picked
+by FFTW, but it is also possible to specify your own block size.  For
+example, with a @twodims{100,200} array on three processes, you can
+tell FFTW to use a block size of 40, which would assign 40 rows to
+processes 0 and 1, and 20 rows to process 2.  FFTW's default is to
+divide the data equally among the processes if possible, and as best
+it can otherwise.  The rows are always assigned in ``rank order,''
+i.e. process 0 gets the first block of rows, then process 1, and so
+on.  (You can change this by using @code{MPI_Comm_split} to create a
+new communicator with re-ordered processes.)  However, you should
+always call the @samp{fftw_mpi_local_size} routines, if possible,
+rather than trying to predict FFTW's distribution choices.
+
+In particular, it is critical that you allocate the storage size that
+is returned by @samp{fftw_mpi_local_size}, which is @emph{not}
+necessarily the size of the local slice of the array.  The reason is
+that intermediate steps of FFTW's algorithms involve transposing the
+array and redistributing the data, so at these intermediate steps FFTW
+may require more local storage space (albeit always proportional to
+the total size divided by the number of processes).  The
+@samp{fftw_mpi_local_size} functions know how much storage is required
+for these intermediate steps and tell you the correct amount to
+allocate.
+
+@menu
+* Basic and advanced distribution interfaces::  
+* Load balancing::              
+* Transposed distributions::    
+* One-dimensional distributions::  
+@end menu
+
+@node Basic and advanced distribution interfaces, Load balancing, MPI Data Distribution, MPI Data Distribution
+@subsection Basic and advanced distribution interfaces
+
+As with the planner interface, the @samp{fftw_mpi_local_size}
+distribution interface is broken into basic and advanced
+(@samp{_many}) interfaces, where the latter allows you to specify the
+block size manually and also to request block sizes when computing
+multiple transforms simultaneously.  These functions are documented
+more exhaustively by the FFTW MPI Reference, but we summarize the
+basic ideas here using a couple of two-dimensional examples.
+
+For the @twodims{100,200} complex-DFT example, above, we would find
+the distribution by calling the following function in the basic
+interface:
+
+@example
+ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+@end example
+@findex fftw_mpi_local_size_2d
+
+Given the total size of the data to be transformed (here, @code{n0 =
+100} and @code{n1 = 200}) and an MPI communicator (@code{comm}), this
+function provides three numbers.
+
+First, it describes the shape of the local data: the current process
+should store a @code{local_n0} by @code{n1} slice of the overall
+dataset, in row-major order (@code{n1} dimension contiguous), starting
+at index @code{local_0_start}.  That is, if the total dataset is
+viewed as a @code{n0} by @code{n1} matrix, the current process should
+store the rows @code{local_0_start} to
+@code{local_0_start+local_n0-1}.  Obviously, if you are running with
+only a single MPI process, that process will store the entire array:
+@code{local_0_start} will be zero and @code{local_n0} will be
+@code{n0}.  @xref{Row-major Format}.
+@cindex row-major
+
+
+Second, the return value is the total number of data elements (e.g.,
+complex numbers for a complex DFT) that should be allocated for the
+input and output arrays on the current process (ideally with
+@code{fftw_malloc} or an @samp{fftw_alloc} function, to ensure optimal
+alignment).  It might seem that this should always be equal to
+@code{local_n0 * n1}, but this is @emph{not} the case.  FFTW's
+distributed FFT algorithms require data redistributions at
+intermediate stages of the transform, and in some circumstances this
+may require slightly larger local storage.  This is discussed in more
+detail below, under @ref{Load balancing}.
+@findex fftw_malloc
+@findex fftw_alloc_complex
+
+
+@cindex advanced interface
+The advanced-interface @samp{local_size} function for multidimensional
+transforms returns the same three things (@code{local_n0},
+@code{local_0_start}, and the total number of elements to allocate),
+but takes more inputs:
+
+@example
+ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n,
+                                   ptrdiff_t howmany,
+                                   ptrdiff_t block0,
+                                   MPI_Comm comm,
+                                   ptrdiff_t *local_n0,
+                                   ptrdiff_t *local_0_start);
+@end example
+@findex fftw_mpi_local_size_many
+
+The two-dimensional case above corresponds to @code{rnk = 2} and an
+array @code{n} of length 2 with @code{n[0] = n0} and @code{n[1] = n1}.
+This routine is for any @code{rnk > 1}; one-dimensional transforms
+have their own interface because they work slightly differently, as
+discussed below.
+
+First, the advanced interface allows you to perform multiple
+transforms at once, of interleaved data, as specified by the
+@code{howmany} parameter.  (@code{hoamany} is 1 for a single
+transform.)
+
+Second, here you can specify your desired block size in the @code{n0}
+dimension, @code{block0}.  To use FFTW's default block size, pass
+@code{FFTW_MPI_DEFAULT_BLOCK} (0) for @code{block0}.  Otherwise, on
+@code{P} processes, FFTW will return @code{local_n0} equal to
+@code{block0} on the first @code{P / block0} processes (rounded down),
+return @code{local_n0} equal to @code{n0 - block0 * (P / block0)} on
+the next process, and @code{local_n0} equal to zero on any remaining
+processes.  In general, we recommend using the default block size
+(which corresponds to @code{n0 / P}, rounded up).
+@ctindex FFTW_MPI_DEFAULT_BLOCK
+@cindex block distribution
+
+
+For example, suppose you have @code{P = 4} processes and @code{n0 =
+21}.  The default will be a block size of @code{6}, which will give
+@code{local_n0 = 6} on the first three processes and @code{local_n0 =
+3} on the last process.  Instead, however, you could specify
+@code{block0 = 5} if you wanted, which would give @code{local_n0 = 5}
+on processes 0 to 2, @code{local_n0 = 6} on process 3.  (This choice,
+while it may look superficially more ``balanced,'' has the same
+critical path as FFTW's default but requires more communications.)
+
+@node Load balancing, Transposed distributions, Basic and advanced distribution interfaces, MPI Data Distribution
+@subsection Load balancing
+@cindex load balancing
+
+Ideally, when you parallelize a transform over some @math{P}
+processes, each process should end up with work that takes equal time.
+Otherwise, all of the processes end up waiting on whichever process is
+slowest.  This goal is known as ``load balancing.''  In this section,
+we describe the circumstances under which FFTW is able to load-balance
+well, and in particular how you should choose your transform size in
+order to load balance.
+
+Load balancing is especially difficult when you are parallelizing over
+heterogeneous machines; for example, if one of your processors is a
+old 486 and another is a Pentium IV, obviously you should give the
+Pentium more work to do than the 486 since the latter is much slower.
+FFTW does not deal with this problem, however---it assumes that your
+processes run on hardware of comparable speed, and that the goal is
+therefore to divide the problem as equally as possible.
+
+For a multi-dimensional complex DFT, FFTW can divide the problem
+equally among the processes if: (i) the @emph{first} dimension
+@code{n0} is divisible by @math{P}; and (ii), the @emph{product} of
+the subsequent dimensions is divisible by @math{P}.  (For the advanced
+interface, where you can specify multiple simultaneous transforms via
+some ``vector'' length @code{howmany}, a factor of @code{howmany} is
+included in the product of the subsequent dimensions.)
+
+For a one-dimensional complex DFT, the length @code{N} of the data
+should be divisible by @math{P} @emph{squared} to be able to divide
+the problem equally among the processes.
+
+@node Transposed distributions, One-dimensional distributions, Load balancing, MPI Data Distribution
+@subsection Transposed distributions
+
+Internally, FFTW's MPI transform algorithms work by first computing
+transforms of the data local to each process, then by globally
+@emph{transposing} the data in some fashion to redistribute the data
+among the processes, transforming the new data local to each process,
+and transposing back.  For example, a two-dimensional @code{n0} by
+@code{n1} array, distributed across the @code{n0} dimension, is
+transformd by: (i) transforming the @code{n1} dimension, which are
+local to each process; (ii) transposing to an @code{n1} by @code{n0}
+array, distributed across the @code{n1} dimension; (iii) transforming
+the @code{n0} dimension, which is now local to each process; (iv)
+transposing back.
+@cindex transpose
+
+
+However, in many applications it is acceptable to compute a
+multidimensional DFT whose results are produced in transposed order
+(e.g., @code{n1} by @code{n0} in two dimensions).  This provides a
+significant performance advantage, because it means that the final
+transposition step can be omitted.  FFTW supports this optimization,
+which you specify by passing the flag @code{FFTW_MPI_TRANSPOSED_OUT}
+to the planner routines.  To compute the inverse transform of
+transposed output, you specify @code{FFTW_MPI_TRANSPOSED_IN} to tell
+it that the input is transposed.  In this section, we explain how to
+interpret the output format of such a transform.
+@ctindex FFTW_MPI_TRANSPOSED_OUT
+@ctindex FFTW_MPI_TRANSPOSED_IN
+
+
+Suppose you have are transforming multi-dimensional data with (at
+least two) dimensions @ndims{}.  As always, it is distributed along
+the first dimension @dimk{0}.  Now, if we compute its DFT with the
+@code{FFTW_MPI_TRANSPOSED_OUT} flag, the resulting output data are stored
+with the first @emph{two} dimensions transposed: @ndimstrans{},
+distributed along the @dimk{1} dimension.  Conversely, if we take the
+@ndimstrans{} data and transform it with the
+@code{FFTW_MPI_TRANSPOSED_IN} flag, then the format goes back to the
+original @ndims{} array.
+
+There are two ways to find the portion of the transposed array that
+resides on the current process.  First, you can simply call the
+appropriate @samp{local_size} function, passing @ndimstrans{} (the
+transposed dimensions).  This would mean calling the @samp{local_size}
+function twice, once for the transposed and once for the
+non-transposed dimensions.  Alternatively, you can call one of the
+@samp{local_size_transposed} functions, which returns both the
+non-transposed and transposed data distribution from a single call.
+For example, for a 3d transform with transposed output (or input), you
+might call:
+
+@example
+ptrdiff_t fftw_mpi_local_size_3d_transposed(
+                ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,
+                ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+@end example
+@findex fftw_mpi_local_size_3d_transposed
+
+Here, @code{local_n0} and @code{local_0_start} give the size and
+starting index of the @code{n0} dimension for the
+@emph{non}-transposed data, as in the previous sections.  For
+@emph{transposed} data (e.g. the output for
+@code{FFTW_MPI_TRANSPOSED_OUT}), @code{local_n1} and
+@code{local_1_start} give the size and starting index of the @code{n1}
+dimension, which is the first dimension of the transposed data
+(@code{n1} by @code{n0} by @code{n2}).
+
+(Note that @code{FFTW_MPI_TRANSPOSED_IN} is completely equivalent to
+performing @code{FFTW_MPI_TRANSPOSED_OUT} and passing the first two
+dimensions to the planner in reverse order, or vice versa.  If you
+pass @emph{both} the @code{FFTW_MPI_TRANSPOSED_IN} and
+@code{FFTW_MPI_TRANSPOSED_OUT} flags, it is equivalent to swapping the
+first two dimensions passed to the planner and passing @emph{neither}
+flag.)
+
+@node One-dimensional distributions,  , Transposed distributions, MPI Data Distribution
+@subsection One-dimensional distributions
+
+For one-dimensional distributed DFTs using FFTW, matters are slightly
+more complicated because the data distribution is more closely tied to
+how the algorithm works.  In particular, you can no longer pass an
+arbitrary block size and must accept FFTW's default; also, the block
+sizes may be different for input and output.  Also, the data
+distribution depends on the flags and transform direction, in order
+for forward and backward transforms to work correctly.
+
+@example
+ptrdiff_t fftw_mpi_local_size_1d(ptrdiff_t n0, MPI_Comm comm,
+                int sign, unsigned flags,
+                ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+                ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+@end example
+@findex fftw_mpi_local_size_1d
+
+This function computes the data distribution for a 1d transform of
+size @code{n0} with the given transform @code{sign} and @code{flags}.
+Both input and output data use block distributions.  The input on the
+current process will consist of @code{local_ni} numbers starting at
+index @code{local_i_start}; e.g. if only a single process is used,
+then @code{local_ni} will be @code{n0} and @code{local_i_start} will
+be @code{0}.  Similarly for the output, with @code{local_no} numbers
+starting at index @code{local_o_start}.  The return value of
+@code{fftw_mpi_local_size_1d} will be the total number of elements to
+allocate on the current process (which might be slightly larger than
+the local size due to intermediate steps in the algorithm).
+
+As mentioned above (@pxref{Load balancing}), the data will be divided
+equally among the processes if @code{n0} is divisible by the
+@emph{square} of the number of processes.  In this case,
+@code{local_ni} will equal @code{local_no}.  Otherwise, they may be
+different.
+
+For some applications, such as convolutions, the order of the output
+data is irrelevant.  In this case, performance can be improved by
+specifying that the output data be stored in an FFTW-defined
+``scrambled'' format.  (In particular, this is the analogue of
+transposed output in the multidimensional case: scrambled output saves
+a communications step.)  If you pass @code{FFTW_MPI_SCRAMBLED_OUT} in
+the flags, then the output is stored in this (undocumented) scrambled
+order.  Conversely, to perform the inverse transform of data in
+scrambled order, pass the @code{FFTW_MPI_SCRAMBLED_IN} flag.
+@ctindex FFTW_MPI_SCRAMBLED_OUT
+@ctindex FFTW_MPI_SCRAMBLED_IN
+
+
+In MPI FFTW, only composite sizes @code{n0} can be parallelized; we
+have not yet implemented a parallel algorithm for large prime sizes.
+
+@c ------------------------------------------------------------
+@node Multi-dimensional MPI DFTs of Real Data, Other Multi-dimensional Real-data MPI Transforms, MPI Data Distribution, Distributed-memory FFTW with MPI
+@section Multi-dimensional MPI DFTs of Real Data
+
+FFTW's MPI interface also supports multi-dimensional DFTs of real
+data, similar to the serial r2c and c2r interfaces.  (Parallel
+one-dimensional real-data DFTs are not currently supported; you must
+use a complex transform and set the imaginary parts of the inputs to
+zero.)
+
+The key points to understand for r2c and c2r MPI transforms (compared
+to the MPI complex DFTs or the serial r2c/c2r transforms), are:
+
+@itemize @bullet
+
+@item
+Just as for serial transforms, r2c/c2r DFTs transform @ndims{} real
+data to/from @ndimshalf{} complex data: the last dimension of the
+complex data is cut in half (rounded down), plus one.  As for the
+serial transforms, the sizes you pass to the @samp{plan_dft_r2c} and
+@samp{plan_dft_c2r} are the @ndims{} dimensions of the real data.
+
+@item
+@cindex padding
+Although the real data is @emph{conceptually} @ndims{}, it is
+@emph{physically} stored as an @ndimspad{} array, where the last
+dimension has been @emph{padded} to make it the same size as the
+complex output.  This is much like the in-place serial r2c/c2r
+interface (@pxref{Multi-Dimensional DFTs of Real Data}), except that
+in MPI the padding is required even for out-of-place data.  The extra
+padding numbers are ignored by FFTW (they are @emph{not} like
+zero-padding the transform to a larger size); they are only used to
+determine the data layout.
+
+@item
+@cindex data distribution
+The data distribution in MPI for @emph{both} the real and complex data
+is determined by the shape of the @emph{complex} data.  That is, you
+call the appropriate @samp{local size} function for the @ndimshalf{}
+complex data, and then use the @emph{same} distribution for the real
+data except that the last complex dimension is replaced by a (padded)
+real dimension of twice the length.
+
+@end itemize
+
+For example suppose we are performing an out-of-place r2c transform of
+@threedims{L,M,N} real data [padded to @threedims{L,M,2(N/2+1)}],
+resulting in @threedims{L,M,N/2+1} complex data.  Similar to the
+example in @ref{2d MPI example}, we might do something like:
+
+@example
+#include <fftw3-mpi.h>
+
+int main(int argc, char **argv)
+@{
+    const ptrdiff_t L = ..., M = ..., N = ...;
+    fftw_plan plan;
+    double *rin;
+    fftw_complex *cout;
+    ptrdiff_t alloc_local, local_n0, local_0_start, i, j, k;
+
+    MPI_Init(&argc, &argv);
+    fftw_mpi_init();
+
+    /* @r{get local data size and allocate} */
+    alloc_local = fftw_mpi_local_size_3d(L, M, N/2+1, MPI_COMM_WORLD,
+                                         &local_n0, &local_0_start);
+    rin = fftw_alloc_real(2 * alloc_local);
+    cout = fftw_alloc_complex(alloc_local);
+
+    /* @r{create plan for out-of-place r2c DFT} */
+    plan = fftw_mpi_plan_dft_r2c_3d(L, M, N, rin, cout, MPI_COMM_WORLD,
+                                    FFTW_MEASURE);
+
+    /* @r{initialize rin to some function} my_func(x,y,z) */
+    for (i = 0; i < local_n0; ++i)
+       for (j = 0; j < M; ++j)
+         for (k = 0; k < N; ++k)
+       rin[(i*M + j) * (2*(N/2+1)) + k] = my_func(local_0_start+i, j, k);
+
+    /* @r{compute transforms as many times as desired} */
+    fftw_execute(plan);
+
+    fftw_destroy_plan(plan);
+
+    MPI_Finalize();
+@}
+@end example
+
+@findex fftw_alloc_real
+@cindex row-major
+Note that we allocated @code{rin} using @code{fftw_alloc_real} with an
+argument of @code{2 * alloc_local}: since @code{alloc_local} is the
+number of @emph{complex} values to allocate, the number of @emph{real}
+values is twice as many.  The @code{rin} array is then
+@threedims{local_n0,M,2(N/2+1)} in row-major order, so its
+@code{(i,j,k)} element is at the index @code{(i*M + j) * (2*(N/2+1)) +
+k} (@pxref{Multi-dimensional Array Format }).
+
+@cindex transpose
+@ctindex FFTW_TRANSPOSED_OUT
+@ctindex FFTW_TRANSPOSED_IN
+As for the complex transforms, improved performance can be obtained by
+specifying that the output is the transpose of the input or vice versa
+(@pxref{Transposed distributions}).  In our @threedims{L,M,N} r2c
+example, including @code{FFTW_TRANSPOSED_OUT} in the flags means that
+the input would be a padded @threedims{L,M,2(N/2+1)} real array
+distributed over the @code{L} dimension, while the output would be a
+@threedims{M,L,N/2+1} complex array distributed over the @code{M}
+dimension.  To perform the inverse c2r transform with the same data
+distributions, you would use the @code{FFTW_TRANSPOSED_IN} flag.
+
+@c ------------------------------------------------------------
+@node Other Multi-dimensional Real-data MPI Transforms, FFTW MPI Transposes, Multi-dimensional MPI DFTs of Real Data, Distributed-memory FFTW with MPI
+@section Other multi-dimensional Real-Data MPI Transforms
+
+@cindex r2r
+FFTW's MPI interface also supports multi-dimensional @samp{r2r}
+transforms of all kinds supported by the serial interface
+(e.g. discrete cosine and sine transforms, discrete Hartley
+transforms, etc.).  Only multi-dimensional @samp{r2r} transforms, not
+one-dimensional transforms, are currently parallelized.
+
+@tindex fftw_r2r_kind
+These are used much like the multidimensional complex DFTs discussed
+above, except that the data is real rather than complex, and one needs
+to pass an r2r transform kind (@code{fftw_r2r_kind}) for each
+dimension as in the serial FFTW (@pxref{More DFTs of Real Data}).
+
+For example, one might perform a two-dimensional @twodims{L,M} that is
+an REDFT10 (DCT-II) in the first dimension and an RODFT10 (DST-II) in
+the second dimension with code like:
+
+@example
+    const ptrdiff_t L = ..., M = ...;
+    fftw_plan plan;
+    double *data;
+    ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
+
+    /* @r{get local data size and allocate} */
+    alloc_local = fftw_mpi_local_size_2d(L, M, MPI_COMM_WORLD,
+                                         &local_n0, &local_0_start);
+    data = fftw_alloc_real(alloc_local);
+
+    /* @r{create plan for in-place REDFT10 x RODFT10} */
+    plan = fftw_mpi_plan_r2r_2d(L, M, data, data, MPI_COMM_WORLD,
+                                FFTW_REDFT10, FFTW_RODFT10, FFTW_MEASURE);
+
+    /* @r{initialize data to some function} my_function(x,y) */
+    for (i = 0; i < local_n0; ++i) for (j = 0; j < M; ++j)
+       data[i*M + j] = my_function(local_0_start + i, j);
+
+    /* @r{compute transforms, in-place, as many times as desired} */
+    fftw_execute(plan);
+
+    fftw_destroy_plan(plan);
+@end example
+
+@findex fftw_alloc_real
+Notice that we use the same @samp{local_size} functions as we did for
+complex data, only now we interpret the sizes in terms of real rather
+than complex values, and correspondingly use @code{fftw_alloc_real}.
+
+@c ------------------------------------------------------------
+@node FFTW MPI Transposes, FFTW MPI Wisdom, Other Multi-dimensional Real-data MPI Transforms, Distributed-memory FFTW with MPI
+@section FFTW MPI Transposes
+@cindex transpose
+
+The FFTW's MPI Fourier transforms rely on one or more @emph{global
+transposition} step for their communications.  For example, the
+multidimensional transforms work by transforming along some
+dimensions, then transposing to make the first dimension local and
+transforming that, then transposing back.  Because global
+transposition of a block-distributed matrix has many other potential
+uses besides FFTs, FFTW's transpose routines can be called directly,
+as documented in this section. 
+
+@menu
+* Basic distributed-transpose interface::  
+* Advanced distributed-transpose interface::  
+* An improved replacement for MPI_Alltoall::  
+@end menu
+
+@node Basic distributed-transpose interface, Advanced distributed-transpose interface, FFTW MPI Transposes, FFTW MPI Transposes
+@subsection Basic distributed-transpose interface
+
+In particular, suppose that we have an @code{n0} by @code{n1} array in
+row-major order, block-distributed across the @code{n0} dimension.  To
+transpose this into an @code{n1} by @code{n0} array block-distributed
+across the @code{n1} dimension, we would create a plan by calling the
+following function:
+
+@example
+fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                  double *in, double *out,
+                                  MPI_Comm comm, unsigned flags);
+@end example
+@findex fftw_mpi_plan_transpose
+
+The input and output arrays (@code{in} and @code{out}) can be the
+same.  The transpose is actually executed by calling
+@code{fftw_execute} on the plan, as usual.
+@findex fftw_execute
+
+
+The @code{flags} are the usual FFTW planner flags, but support
+two additional flags: @code{FFTW_MPI_TRANSPOSED_OUT} and/or
+@code{FFTW_MPI_TRANSPOSED_IN}.  What these flags indicate, for
+transpose plans, is that the output and/or input, respectively, are
+@emph{locally} transposed.  That is, on each process input data is
+normally stored as a @code{local_n0} by @code{n1} array in row-major
+order, but for an @code{FFTW_MPI_TRANSPOSED_IN} plan the input data is
+stored as @code{n1} by @code{local_n0} in row-major order.  Similarly,
+@code{FFTW_MPI_TRANSPOSED_OUT} means that the output is @code{n0} by
+@code{local_n1} instead of @code{local_n1} by @code{n0}.
+@ctindex FFTW_MPI_TRANSPOSED_OUT
+@ctindex FFTW_MPI_TRANSPOSED_IN
+
+
+To determine the local size of the array on each process before and
+after the transpose, as well as the amount of storage that must be
+allocated, one should call @code{fftw_mpi_local_size_2d_transposed},
+just as for a 2d DFT as described in the previous section:
+@cindex data distribution
+
+@example
+ptrdiff_t fftw_mpi_local_size_2d_transposed
+                (ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                 ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+@end example
+@findex fftw_mpi_local_size_2d_transposed
+
+Again, the return value is the local storage to allocate, which in
+this case is the number of @emph{real} (@code{double}) values rather
+than complex numbers as in the previous examples.
+
+@node Advanced distributed-transpose interface, An improved replacement for MPI_Alltoall, Basic distributed-transpose interface, FFTW MPI Transposes
+@subsection Advanced distributed-transpose interface
+
+The above routines are for a transpose of a matrix of numbers (of type
+@code{double}), using FFTW's default block sizes.  More generally, one
+can perform transposes of @emph{tuples} of numbers, with
+user-specified block sizes for the input and output:
+
+@example
+fftw_plan fftw_mpi_plan_many_transpose
+                (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                 ptrdiff_t block0, ptrdiff_t block1,
+                 double *in, double *out, MPI_Comm comm, unsigned flags);
+@end example
+@findex fftw_mpi_plan_many_transpose
+
+In this case, one is transposing an @code{n0} by @code{n1} matrix of
+@code{howmany}-tuples (e.g. @code{howmany = 2} for complex numbers).
+The input is distributed along the @code{n0} dimension with block size
+@code{block0}, and the @code{n1} by @code{n0} output is distributed
+along the @code{n1} dimension with block size @code{block1}.  If
+@code{FFTW_MPI_DEFAULT_BLOCK} (0) is passed for a block size then FFTW
+uses its default block size.  To get the local size of the data on
+each process, you should then call @code{fftw_mpi_local_size_many_transposed}.
+@ctindex FFTW_MPI_DEFAULT_BLOCK
+@findex fftw_mpi_local_size_many_transposed
+
+@node An improved replacement for MPI_Alltoall,  , Advanced distributed-transpose interface, FFTW MPI Transposes
+@subsection An improved replacement for MPI_Alltoall
+
+We close this section by noting that FFTW's MPI transpose routines can
+be thought of as a generalization for the @code{MPI_Alltoall} function
+(albeit only for floating-point types), and in some circumstances can
+function as an improved replacement.
+@findex MPI_Alltoall
+
+
+@code{MPI_Alltoall} is defined by the MPI standard as:
+
+@example
+int MPI_Alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype, 
+                 void *recvbuf, int recvcnt, MPI_Datatype recvtype, 
+                 MPI_Comm comm);
+@end example
+
+In particular, for @code{double*} arrays @code{in} and @code{out},
+consider the call:
+
+@example
+MPI_Alltoall(in, howmany, MPI_DOUBLE, out, howmany MPI_DOUBLE, comm);
+@end example
+
+This is completely equivalent to:
+
+@example
+MPI_Comm_size(comm, &P);
+plan = fftw_mpi_plan_many_transpose(P, P, howmany, 1, 1, in, out, comm, FFTW_ESTIMATE);
+fftw_execute(plan);
+fftw_destroy_plan(plan);
+@end example
+
+That is, computing a @twodims{P,P} transpose on @code{P} processes,
+with a block size of 1, is just a standard all-to-all communication.
+
+However, using the FFTW routine instead of @code{MPI_Alltoall} may
+have certain advantages.  First of all, FFTW's routine can operate
+in-place (@code{in == out}) whereas @code{MPI_Alltoall} can only
+operate out-of-place.
+@cindex in-place
+
+
+Second, even for out-of-place plans, FFTW's routine may be faster,
+especially if you need to perform the all-to-all communication many
+times and can afford to use @code{FFTW_MEASURE} or
+@code{FFTW_PATIENT}.  It should certainly be no slower, not including
+the time to create the plan, since one of the possible algorithms that
+FFTW uses for an out-of-place transpose @emph{is} simply to call
+@code{MPI_Alltoall}.  However, FFTW also considers several other
+possible algorithms that, depending on your MPI implementation and
+your hardware, may be faster.
+@ctindex FFTW_MEASURE
+@ctindex FFTW_PATIENT
+
+@c ------------------------------------------------------------
+@node FFTW MPI Wisdom, Avoiding MPI Deadlocks, FFTW MPI Transposes, Distributed-memory FFTW with MPI
+@section FFTW MPI Wisdom
+@cindex wisdom
+@cindex saving plans to disk
+
+FFTW's ``wisdom'' facility (@pxref{Words of Wisdom-Saving Plans}) can
+be used to save MPI plans as well as to save uniprocessor plans.
+However, for MPI there are several unavoidable complications.
+
+@cindex MPI I/O
+First, the MPI standard does not guarantee that every process can
+perform file I/O (at least, not using C stdio routines)---in general,
+we may only assume that process 0 is capable of I/O.@footnote{In fact,
+even this assumption is not technically guaranteed by the standard,
+although it seems to be universal in actual MPI implementations and is
+widely assumed by MPI-using software.  Technically, you need to query
+the @code{MPI_IO} attribute of @code{MPI_COMM_WORLD} with
+@code{MPI_Attr_get}.  If this attribute is @code{MPI_PROC_NULL}, no
+I/O is possible.  If it is @code{MPI_ANY_SOURCE}, any process can
+perform I/O.  Otherwise, it is the rank of a process that can perform
+I/O ... but since it is not guaranteed to yield the @emph{same} rank
+on all processes, you have to do an @code{MPI_Allreduce} of some kind
+if you want all processes to agree about which is going to do I/O.
+And even then, the standard only guarantees that this process can
+perform output, but not input. See e.g. @cite{Parallel Programming
+with MPI} by P. S. Pacheco, section 8.1.3.  Needless to say, in our
+experience virtually no MPI programmers worry about this.} So, if we
+want to export the wisdom from a single process to a file, we must
+first export the wisdom to a string, then send it to process 0, then
+write it to a file.
+
+Second, in principle we may want to have separate wisdom for every
+process, since in general the processes may run on different hardware
+even for a single MPI program.  However, in practice FFTW's MPI code
+is designed for the case of homogeneous hardware (@pxref{Load
+balancing}), and in this case it is convenient to use the same wisdom
+for every process.  Thus, we need a mechanism to synchronize the wisdom.
+
+To address both of these problems, FFTW provides the following two
+functions:
+
+@example
+void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+void fftw_mpi_gather_wisdom(MPI_Comm comm);
+@end example
+@findex fftw_mpi_gather_wisdom
+@findex fftw_mpi_broadcast_wisdom
+
+Given a communicator @code{comm}, @code{fftw_mpi_broadcast_wisdom}
+will broadcast the wisdom from process 0 to all other processes.
+Conversely, @code{fftw_mpi_gather_wisdom} will collect wisdom from all
+processes onto process 0.  (If the plans created for the same problem
+by different processes are not the same, @code{fftw_mpi_gather_wisdom}
+will arbitrarily choose one of the plans.)  Both of these functions
+may result in suboptimal plans for different processes if the
+processes are running on non-identical hardware.  Both of these
+functions are @emph{collective} calls, which means that they must be
+executed by all processes in the communicator.
+@cindex collective function
+
+
+So, for example, a typical code snippet to import wisdom from a file
+and use it on all processes would be:
+
+@example
+@{
+    int rank;
+
+    fftw_mpi_init();
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (rank == 0) fftw_import_wisdom_from_filename("mywisdom");
+    fftw_mpi_broadcast_wisdom(MPI_COMM_WORLD);
+@}
+@end example
+
+(Note that we must call @code{fftw_mpi_init} before importing any
+wisdom that might contain MPI plans.)  Similarly, a typical code
+snippet to export wisdom from all processes to a file is:
+@findex fftw_mpi_init
+
+@example
+@{
+    int rank;
+
+    fftw_mpi_gather_wisdom(MPI_COMM_WORLD);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (rank == 0) fftw_export_wisdom_to_filename("mywisdom");
+@}
+@end example
+
+@c ------------------------------------------------------------
+@node Avoiding MPI Deadlocks, FFTW MPI Performance Tips, FFTW MPI Wisdom, Distributed-memory FFTW with MPI
+@section Avoiding MPI Deadlocks
+@cindex deadlock
+
+An MPI program can @emph{deadlock} if one process is waiting for a
+message from another process that never gets sent.  To avoid deadlocks
+when using FFTW's MPI routines, it is important to know which
+functions are @emph{collective}: that is, which functions must
+@emph{always} be called in the @emph{same order} from @emph{every}
+process in a given communicator.  (For example, @code{MPI_Barrier} is
+the canonical example of a collective function in the MPI standard.)
+@cindex collective function
+@findex MPI_Barrier
+
+
+The functions in FFTW that are @emph{always} collective are: every
+function beginning with @samp{fftw_mpi_plan}, as well as
+@code{fftw_mpi_broadcast_wisdom} and @code{fftw_mpi_gather_wisdom}.
+Also, the following functions from the ordinary FFTW interface are
+collective when they are applied to a plan created by an
+@samp{fftw_mpi_plan} function: @code{fftw_execute},
+@code{fftw_destroy_plan}, and @code{fftw_flops}.
+@findex fftw_execute
+@findex fftw_destroy_plan
+@findex fftw_flops
+
+@c ------------------------------------------------------------
+@node FFTW MPI Performance Tips, Combining MPI and Threads, Avoiding MPI Deadlocks, Distributed-memory FFTW with MPI
+@section FFTW MPI Performance Tips
+
+In this section, we collect a few tips on getting the best performance
+out of FFTW's MPI transforms.
+
+First, because of the 1d block distribution, FFTW's parallelization is
+currently limited by the size of the first dimension.
+(Multidimensional block distributions may be supported by a future
+version.) More generally, you should ideally arrange the dimensions so
+that FFTW can divide them equally among the processes. @xref{Load
+balancing}.
+@cindex block distribution
+@cindex load balancing
+
+
+Second, if it is not too inconvenient, you should consider working
+with transposed output for multidimensional plans, as this saves a
+considerable amount of communications.  @xref{Transposed distributions}.
+@cindex transpose
+
+
+Third, the fastest choices are generally either an in-place transform
+or an out-of-place transform with the @code{FFTW_DESTROY_INPUT} flag
+(which allows the input array to be used as scratch space).  In-place
+is especially beneficial if the amount of data per process is large.
+@ctindex FFTW_DESTROY_INPUT
+
+
+Fourth, if you have multiple arrays to transform at once, rather than
+calling FFTW's MPI transforms several times it usually seems to be
+faster to interleave the data and use the advanced interface.  (This
+groups the communications together instead of requiring separate
+messages for each transform.)
+
+@c ------------------------------------------------------------
+@node Combining MPI and Threads, FFTW MPI Reference, FFTW MPI Performance Tips, Distributed-memory FFTW with MPI
+@section Combining MPI and Threads
+@cindex threads
+
+In certain cases, it may be advantageous to combine MPI
+(distributed-memory) and threads (shared-memory) parallelization.
+FFTW supports this, with certain caveats.  For example, if you have a
+cluster of 4-processor shared-memory nodes, you may want to use
+threads within the nodes and MPI between the nodes, instead of MPI for
+all parallelization.
+
+In particular, it is possible to seamlessly combine the MPI FFTW
+routines with the multi-threaded FFTW routines (@pxref{Multi-threaded
+FFTW}). However, some care must be taken in the initialization code,
+which should look something like this:
+
+@example
+int threads_ok;
+
+int main(int argc, char **argv)
+@{
+    int provided;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+    threads_ok = provided >= MPI_THREAD_FUNNELED;
+
+    if (threads_ok) threads_ok = fftw_init_threads();
+    fftw_mpi_init();
+
+    ...
+    if (threads_ok) fftw_plan_with_nthreads(...);
+    ...
+    
+    MPI_Finalize();
+@}
+@end example
+@findex fftw_mpi_init
+@findex fftw_init_threads
+@findex fftw_plan_with_nthreads
+
+First, note that instead of calling @code{MPI_Init}, you should call
+@code{MPI_Init_threads}, which is the initialization routine defined
+by the MPI-2 standard to indicate to MPI that your program will be
+multithreaded.  We pass @code{MPI_THREAD_FUNNELED}, which indicates
+that we will only call MPI routines from the main thread.  (FFTW will
+launch additional threads internally, but the extra threads will not
+call MPI code.)  (You may also pass @code{MPI_THREAD_SERIALIZED} or
+@code{MPI_THREAD_MULTIPLE}, which requests additional multithreading
+support from the MPI implementation, but this is not required by
+FFTW.)  The @code{provided} parameter returns what level of threads
+support is actually supported by your MPI implementation; this
+@emph{must} be at least @code{MPI_THREAD_FUNNELED} if you want to call
+the FFTW threads routines, so we define a global variable
+@code{threads_ok} to record this.  You should only call
+@code{fftw_init_threads} or @code{fftw_plan_with_nthreads} if
+@code{threads_ok} is true.  For more information on thread safety in
+MPI, see the
+@uref{http://www.mpi-forum.org/docs/mpi-20-html/node162.htm, MPI and
+Threads} section of the MPI-2 standard.
+@cindex thread safety
+
+
+Second, we must call @code{fftw_init_threads} @emph{before}
+@code{fftw_mpi_init}.  This is critical for technical reasons having
+to do with how FFTW initializes its list of algorithms.
+
+Then, if you call @code{fftw_plan_with_nthreads(N)}, @emph{every} MPI
+process will launch (up to) @code{N} threads to parallelize its transforms.
+
+For example, in the hypothetical cluster of 4-processor nodes, you
+might wish to launch only a single MPI process per node, and then call
+@code{fftw_plan_with_nthreads(4)} on each process to use all
+processors in the nodes.
+
+This may or may not be faster than simply using as many MPI processes
+as you have processors, however.  On the one hand, using threads
+within a node eliminates the need for explicit message passing within
+the node.  On the other hand, FFTW's transpose routines are not
+multi-threaded, and this means that the communications that do take
+place will not benefit from parallelization within the node.
+Moreover, many MPI implementations already have optimizations to
+exploit shared memory when it is available, so adding the
+multithreaded FFTW on top of this may be superfluous.
+@cindex transpose
+
+@c ------------------------------------------------------------
+@node FFTW MPI Reference, FFTW MPI Fortran Interface, Combining MPI and Threads, Distributed-memory FFTW with MPI
+@section FFTW MPI Reference
+
+This chapter provides a complete reference to all FFTW MPI functions,
+datatypes, and constants.  See also @ref{FFTW Reference} for information
+on functions and types in common with the serial interface.
+
+@menu
+* MPI Files and Data Types::    
+* MPI Initialization::          
+* Using MPI Plans::             
+* MPI Data Distribution Functions::  
+* MPI Plan Creation::           
+* MPI Wisdom Communication::    
+@end menu
+
+@node MPI Files and Data Types, MPI Initialization, FFTW MPI Reference, FFTW MPI Reference
+@subsection MPI Files and Data Types
+
+All programs using FFTW's MPI support should include its header file:
+
+@example
+#include <fftw3-mpi.h>
+@end example
+
+Note that this header file includes the serial-FFTW @code{fftw3.h}
+header file, and also the @code{mpi.h} header file for MPI, so you
+need not include those files separately.
+
+You must also link to @emph{both} the FFTW MPI library and to the
+serial FFTW library.  On Unix, this means adding @code{-lfftw3_mpi
+-lfftw3 -lm} at the end of the link command.
+
+@cindex precision
+Different precisions are handled as in the serial interface:
+@xref{Precision}.  That is, @samp{fftw_} functions become
+@code{fftwf_} (in single precision) etcetera, and the libraries become
+@code{-lfftw3f_mpi -lfftw3f -lm} etcetera on Unix.  Long-double
+precision is supported in MPI, but quad precision (@samp{fftwq_}) is
+not due to the lack of MPI support for this type.
+
+@node MPI Initialization, Using MPI Plans, MPI Files and Data Types, FFTW MPI Reference
+@subsection MPI Initialization
+
+Before calling any other FFTW MPI (@samp{fftw_mpi_}) function, and
+before importing any wisdom for MPI problems, you must call:
+
+@findex fftw_mpi_init
+@example
+void fftw_mpi_init(void);
+@end example
+
+@findex fftw_init_threads
+If FFTW threads support is used, however, @code{fftw_mpi_init} should
+be called @emph{after} @code{fftw_init_threads} (@pxref{Combining MPI
+and Threads}).  Calling @code{fftw_mpi_init} additional times (before
+@code{fftw_mpi_cleanup}) has no effect.
+
+
+If you want to deallocate all persistent data and reset FFTW to the
+pristine state it was in when you started your program, you can call:
+
+@findex fftw_mpi_cleanup
+@example
+void fftw_mpi_cleanup(void);
+@end example
+
+@findex fftw_cleanup
+(This calls @code{fftw_cleanup}, so you need not call the serial
+cleanup routine too, although it is safe to do so.)  After calling
+@code{fftw_mpi_cleanup}, all existing plans become undefined, and you
+should not attempt to execute or destroy them.  You must call
+@code{fftw_mpi_init} again after @code{fftw_mpi_cleanup} if you want
+to resume using the MPI FFTW routines.
+
+@node Using MPI Plans, MPI Data Distribution Functions, MPI Initialization, FFTW MPI Reference
+@subsection Using MPI Plans
+
+Once an MPI plan is created, you can execute and destroy it using
+@code{fftw_execute}, @code{fftw_destroy_plan}, and the other functions
+in the serial interface that operate on generic plans (@pxref{Using
+Plans}).  
+
+@cindex collective function
+@cindex MPI communicator
+The @code{fftw_execute} and @code{fftw_destroy_plan} functions, applied to
+MPI plans, are @emph{collective} calls: they must be called for all processes
+in the communicator that was used to create the plan.
+
+@cindex new-array execution
+You must @emph{not} use the serial new-array plan-execution functions
+@code{fftw_execute_dft} and so on (@pxref{New-array Execute
+Functions}) with MPI plans.  Such functions are specialized to the
+problem type, and there are specific new-array execute functions for MPI plans:
+
+@findex fftw_mpi_execute_dft
+@findex fftw_mpi_execute_dft_r2c
+@findex fftw_mpi_execute_dft_c2r
+@findex fftw_mpi_execute_r2r
+@example
+void fftw_mpi_execute_dft(fftw_plan p, fftw_complex *in, fftw_complex *out);
+void fftw_mpi_execute_dft_r2c(fftw_plan p, double *in, fftw_complex *out);
+void fftw_mpi_execute_dft_c2r(fftw_plan p, fftw_complex *in, double *out);
+void fftw_mpi_execute_r2r(fftw_plan p, double *in, double *out);
+@end example
+
+@cindex alignment
+@findex fftw_malloc
+These functions have the same restrictions as those of the serial
+new-array execute functions.  They are @emph{always} safe to apply to
+the @emph{same} @code{in} and @code{out} arrays that were used to
+create the plan.  They can only be applied to new arrarys if those
+arrays have the same types, dimensions, in-placeness, and alignment as
+the original arrays, where the best way to ensure the same alignment
+is to use FFTW's @code{fftw_malloc} and related allocation functions
+for all arrays (@pxref{Memory Allocation}).  Note that distributed
+transposes (@pxref{FFTW MPI Transposes}) use
+@code{fftw_mpi_execute_r2r}, since they count as rank-zero r2r plans
+from FFTW's perspective.
+
+@node MPI Data Distribution Functions, MPI Plan Creation, Using MPI Plans, FFTW MPI Reference
+@subsection MPI Data Distribution Functions
+
+@cindex data distribution
+As described above (@pxref{MPI Data Distribution}), in order to
+allocate your arrays, @emph{before} creating a plan, you must first
+call one of the following routines to determine the required
+allocation size and the portion of the array locally stored on a given
+process.  The @code{MPI_Comm} communicator passed here must be
+equivalent to the communicator used below for plan creation.
+
+The basic interface for multidimensional transforms consists of the
+functions:
+
+@findex fftw_mpi_local_size_2d
+@findex fftw_mpi_local_size_3d
+@findex fftw_mpi_local_size
+@findex fftw_mpi_local_size_2d_transposed
+@findex fftw_mpi_local_size_3d_transposed
+@findex fftw_mpi_local_size_transposed
+@example
+ptrdiff_t fftw_mpi_local_size_2d(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+ptrdiff_t fftw_mpi_local_size_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                 MPI_Comm comm,
+                                 ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+ptrdiff_t fftw_mpi_local_size(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                              ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+
+ptrdiff_t fftw_mpi_local_size_2d_transposed(ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,
+                                            ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                            ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+ptrdiff_t fftw_mpi_local_size_3d_transposed(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                            MPI_Comm comm,
+                                            ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                            ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+ptrdiff_t fftw_mpi_local_size_transposed(int rnk, const ptrdiff_t *n, MPI_Comm comm,
+                                         ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                         ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+@end example
+
+These functions return the number of elements to allocate (complex
+numbers for DFT/r2c/c2r plans, real numbers for r2r plans), whereas
+the @code{local_n0} and @code{local_0_start} return the portion
+(@code{local_0_start} to @code{local_0_start + local_n0 - 1}) of the
+first dimension of an @ndims{} array that is stored on the local
+process.  @xref{Basic and advanced distribution interfaces}.  For
+@code{FFTW_MPI_TRANSPOSED_OUT} plans, the @samp{_transposed} variants
+are useful in order to also return the local portion of the first
+dimension in the @ndimstrans{} transposed output.  @xref{Transposed
+distributions}.  The advanced interface for multidimensional
+transforms is:
+
+@cindex advanced interface
+@findex fftw_mpi_local_size_many
+@findex fftw_mpi_local_size_many_transposed
+@example
+ptrdiff_t fftw_mpi_local_size_many(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                   ptrdiff_t block0, MPI_Comm comm,
+                                   ptrdiff_t *local_n0, ptrdiff_t *local_0_start);
+ptrdiff_t fftw_mpi_local_size_many_transposed(int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+                                              ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm,
+                                              ptrdiff_t *local_n0, ptrdiff_t *local_0_start,
+                                              ptrdiff_t *local_n1, ptrdiff_t *local_1_start);
+@end example
+
+These differ from the basic interface in only two ways.  First, they
+allow you to specify block sizes @code{block0} and @code{block1} (the
+latter for the transposed output); you can pass
+@code{FFTW_MPI_DEFAULT_BLOCK} to use FFTW's default block size as in
+the basic interface.  Second, you can pass a @code{howmany} parameter,
+corresponding to the advanced planning interface below: this is for
+transforms of contiguous @code{howmany}-tuples of numbers
+(@code{howmany = 1} in the basic interface).
+
+The corresponding basic and advanced routines for one-dimensional
+transforms (currently only complex DFTs) are:
+
+@findex fftw_mpi_local_size_1d
+@findex fftw_mpi_local_size_many_1d
+@example
+ptrdiff_t fftw_mpi_local_size_1d(
+             ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags,
+             ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+             ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+ptrdiff_t fftw_mpi_local_size_many_1d(
+             ptrdiff_t n0, ptrdiff_t howmany,
+             MPI_Comm comm, int sign, unsigned flags,
+             ptrdiff_t *local_ni, ptrdiff_t *local_i_start,
+             ptrdiff_t *local_no, ptrdiff_t *local_o_start);
+@end example
+
+@ctindex FFTW_MPI_SCRAMBLED_OUT
+@ctindex FFTW_MPI_SCRAMBLED_IN
+As above, the return value is the number of elements to allocate
+(complex numbers, for complex DFTs).  The @code{local_ni} and
+@code{local_i_start} arguments return the portion
+(@code{local_i_start} to @code{local_i_start + local_ni - 1}) of the
+1d array that is stored on this process for the transform
+@emph{input}, and @code{local_no} and @code{local_o_start} are the
+corresponding quantities for the input.  The @code{sign}
+(@code{FFTW_FORWARD} or @code{FFTW_BACKWARD}) and @code{flags} must
+match the arguments passed when creating a plan.  Although the inputs
+and outputs have different data distributions in general, it is
+guaranteed that the @emph{output} data distribution of an
+@code{FFTW_FORWARD} plan will match the @emph{input} data distribution
+of an @code{FFTW_BACKWARD} plan and vice versa; similarly for the
+@code{FFTW_MPI_SCRAMBLED_OUT} and @code{FFTW_MPI_SCRAMBLED_IN} flags.
+@xref{One-dimensional distributions}.
+
+@node MPI Plan Creation, MPI Wisdom Communication, MPI Data Distribution Functions, FFTW MPI Reference
+@subsection MPI Plan Creation
+
+@subsubheading Complex-data MPI DFTs
+
+Plans for complex-data DFTs (@pxref{2d MPI example}) are created by:
+
+@findex fftw_mpi_plan_dft_1d
+@findex fftw_mpi_plan_dft_2d
+@findex fftw_mpi_plan_dft_3d
+@findex fftw_mpi_plan_dft
+@findex fftw_mpi_plan_many_dft
+@example
+fftw_plan fftw_mpi_plan_dft_1d(ptrdiff_t n0, fftw_complex *in, fftw_complex *out,
+                               MPI_Comm comm, int sign, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_2d(ptrdiff_t n0, ptrdiff_t n1,
+                               fftw_complex *in, fftw_complex *out,
+                               MPI_Comm comm, int sign, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                               fftw_complex *in, fftw_complex *out,
+                               MPI_Comm comm, int sign, unsigned flags);
+fftw_plan fftw_mpi_plan_dft(int rnk, const ptrdiff_t *n, 
+                            fftw_complex *in, fftw_complex *out,
+                            MPI_Comm comm, int sign, unsigned flags);
+fftw_plan fftw_mpi_plan_many_dft(int rnk, const ptrdiff_t *n,
+                                 ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock,
+                                 fftw_complex *in, fftw_complex *out,
+                                 MPI_Comm comm, int sign, unsigned flags);
+@end example
+
+@cindex MPI communicator
+@cindex collective function
+These are similar to their serial counterparts (@pxref{Complex DFTs})
+in specifying the dimensions, sign, and flags of the transform.  The
+@code{comm} argument gives an MPI communicator that specifies the set
+of processes to participate in the transform; plan creation is a
+collective function that must be called for all processes in the
+communicator.  The @code{in} and @code{out} pointers refer only to a
+portion of the overall transform data (@pxref{MPI Data Distribution})
+as specified by the @samp{local_size} functions in the previous
+section.  Unless @code{flags} contains @code{FFTW_ESTIMATE}, these
+arrays are overwritten during plan creation as for the serial
+interface.  For multi-dimensional transforms, any dimensions @code{>
+1} are supported; for one-dimensional transforms, only composite
+(non-prime) @code{n0} are currently supported (unlike the serial
+FFTW).  Requesting an unsupported transform size will yield a
+@code{NULL} plan.  (As in the serial interface, highly composite sizes
+generally yield the best performance.)
+
+@cindex advanced interface
+@ctindex FFTW_MPI_DEFAULT_BLOCK
+@cindex stride
+The advanced-interface @code{fftw_mpi_plan_many_dft} additionally
+allows you to specify the block sizes for the first dimension
+(@code{block}) of the @ndims{} input data and the first dimension
+(@code{tblock}) of the @ndimstrans{} transposed data (at intermediate
+steps of the transform, and for the output if
+@code{FFTW_TRANSPOSED_OUT} is specified in @code{flags}).  These must
+be the same block sizes as were passed to the corresponding
+@samp{local_size} function; you can pass @code{FFTW_MPI_DEFAULT_BLOCK}
+to use FFTW's default block size as in the basic interface.  Also, the
+@code{howmany} parameter specifies that the transform is of contiguous
+@code{howmany}-tuples rather than individual complex numbers; this
+corresponds to the same parameter in the serial advanced interface
+(@pxref{Advanced Complex DFTs}) with @code{stride = howmany} and
+@code{dist = 1}.
+
+@subsubheading MPI flags
+
+The @code{flags} can be any of those for the serial FFTW
+(@pxref{Planner Flags}), and in addition may include one or more of
+the following MPI-specific flags, which improve performance at the
+cost of changing the output or input data formats.
+
+@itemize @bullet
+
+@item
+@ctindex FFTW_MPI_SCRAMBLED_OUT
+@ctindex FFTW_MPI_SCRAMBLED_IN
+@code{FFTW_MPI_SCRAMBLED_OUT}, @code{FFTW_MPI_SCRAMBLED_IN}: valid for
+1d transforms only, these flags indicate that the output/input of the
+transform are in an undocumented ``scrambled'' order.  A forward
+@code{FFTW_MPI_SCRAMBLED_OUT} transform can be inverted by a backward
+@code{FFTW_MPI_SCRAMBLED_IN} (times the usual 1/@i{N} normalization).
+@xref{One-dimensional distributions}.
+
+@item
+@ctindex FFTW_MPI_TRANSPOSED_OUT
+@ctindex FFTW_MPI_TRANSPOSED_IN
+@code{FFTW_MPI_TRANSPOSED_OUT}, @code{FFTW_MPI_TRANSPOSED_IN}: valid
+for multidimensional (@code{rnk > 1}) transforms only, these flags
+specify that the output or input of an @ndims{} transform is
+transposed to @ndimstrans{}.  @xref{Transposed distributions}.
+
+@end itemize
+
+@subsubheading Real-data MPI DFTs
+
+@cindex r2c
+Plans for real-input/output (r2c/c2r) DFTs (@pxref{Multi-dimensional
+MPI DFTs of Real Data}) are created by:
+
+@findex fftw_mpi_plan_dft_r2c_2d
+@findex fftw_mpi_plan_dft_r2c_2d
+@findex fftw_mpi_plan_dft_r2c_3d
+@findex fftw_mpi_plan_dft_r2c
+@findex fftw_mpi_plan_dft_c2r_2d
+@findex fftw_mpi_plan_dft_c2r_2d
+@findex fftw_mpi_plan_dft_c2r_3d
+@findex fftw_mpi_plan_dft_c2r
+@example
+fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1, 
+                                   double *in, fftw_complex *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_r2c_2d(ptrdiff_t n0, ptrdiff_t n1, 
+                                   double *in, fftw_complex *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_r2c_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                   double *in, fftw_complex *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_r2c(int rnk, const ptrdiff_t *n,
+                                double *in, fftw_complex *out,
+                                MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1, 
+                                   fftw_complex *in, double *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_c2r_2d(ptrdiff_t n0, ptrdiff_t n1, 
+                                   fftw_complex *in, double *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_c2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                                   fftw_complex *in, double *out,
+                                   MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_dft_c2r(int rnk, const ptrdiff_t *n,
+                                fftw_complex *in, double *out,
+                                MPI_Comm comm, unsigned flags);
+@end example
+
+Similar to the serial interface (@pxref{Real-data DFTs}), these
+transform logically @ndims{} real data to/from @ndimshalf{} complex
+data, representing the non-redundant half of the conjugate-symmetry
+output of a real-input DFT (@pxref{Multi-dimensional Transforms}).
+However, the real array must be stored within a padded @ndimspad{}
+array (much like the in-place serial r2c transforms, but here for
+out-of-place transforms as well). Currently, only multi-dimensional
+(@code{rnk > 1}) r2c/c2r transforms are supported (requesting a plan
+for @code{rnk = 1} will yield @code{NULL}).  As explained above
+(@pxref{Multi-dimensional MPI DFTs of Real Data}), the data
+distribution of both the real and complex arrays is given by the
+@samp{local_size} function called for the dimensions of the
+@emph{complex} array.  Similar to the other planning functions, the
+input and output arrays are overwritten when the plan is created
+except in @code{FFTW_ESTIMATE} mode.
+
+As for the complex DFTs above, there is an advance interface that
+allows you to manually specify block sizes and to transform contiguous
+@code{howmany}-tuples of real/complex numbers:
+
+@findex fftw_mpi_plan_many_dft_r2c
+@findex fftw_mpi_plan_many_dft_c2r
+@example
+fftw_plan fftw_mpi_plan_many_dft_r2c
+              (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+               ptrdiff_t iblock, ptrdiff_t oblock,
+               double *in, fftw_complex *out,
+               MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_many_dft_c2r
+              (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,
+               ptrdiff_t iblock, ptrdiff_t oblock,
+               fftw_complex *in, double *out,
+               MPI_Comm comm, unsigned flags);               
+@end example
+
+@subsubheading MPI r2r transforms
+
+@cindex r2r
+There are corresponding plan-creation routines for r2r
+transforms (@pxref{More DFTs of Real Data}), currently supporting
+multidimensional (@code{rnk > 1}) transforms only (@code{rnk = 1} will
+yield a @code{NULL} plan):
+
+@example
+fftw_plan fftw_mpi_plan_r2r_2d(ptrdiff_t n0, ptrdiff_t n1,
+                               double *in, double *out,
+                               MPI_Comm comm,
+                               fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                               unsigned flags);
+fftw_plan fftw_mpi_plan_r2r_3d(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,
+                               double *in, double *out,
+                               MPI_Comm comm,
+                               fftw_r2r_kind kind0, fftw_r2r_kind kind1, fftw_r2r_kind kind2,
+                               unsigned flags);
+fftw_plan fftw_mpi_plan_r2r(int rnk, const ptrdiff_t *n,
+                            double *in, double *out,
+                            MPI_Comm comm, const fftw_r2r_kind *kind, 
+                            unsigned flags);
+fftw_plan fftw_mpi_plan_many_r2r(int rnk, const ptrdiff_t *n,
+                                 ptrdiff_t iblock, ptrdiff_t oblock,
+                                 double *in, double *out,
+                                 MPI_Comm comm, const fftw_r2r_kind *kind, 
+                                 unsigned flags);
+@end example
+
+The parameters are much the same as for the complex DFTs above, except
+that the arrays are of real numbers (and hence the outputs of the
+@samp{local_size} data-distribution functions should be interpreted as
+counts of real rather than complex numbers).  Also, the @code{kind}
+parameters specify the r2r kinds along each dimension as for the
+serial interface (@pxref{Real-to-Real Transform Kinds}).  @xref{Other
+Multi-dimensional Real-data MPI Transforms}.
+
+@subsubheading MPI transposition
+@cindex transpose
+
+FFTW also provides routines to plan a transpose of a distributed
+@code{n0} by @code{n1} array of real numbers, or an array of
+@code{howmany}-tuples of real numbers with specified block sizes
+(@pxref{FFTW MPI Transposes}):
+
+@findex fftw_mpi_plan_transpose
+@findex fftw_mpi_plan_many_transpose
+@example
+fftw_plan fftw_mpi_plan_transpose(ptrdiff_t n0, ptrdiff_t n1,
+                                  double *in, double *out,
+                                  MPI_Comm comm, unsigned flags);
+fftw_plan fftw_mpi_plan_many_transpose
+                (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany,
+                 ptrdiff_t block0, ptrdiff_t block1,
+                 double *in, double *out, MPI_Comm comm, unsigned flags);
+@end example
+
+@cindex new-array execution
+@findex fftw_mpi_execute_r2r
+These plans are used with the @code{fftw_mpi_execute_r2r} new-array
+execute function (@pxref{Using MPI Plans }), since they count as (rank
+zero) r2r plans from FFTW's perspective.
+
+@node MPI Wisdom Communication,  , MPI Plan Creation, FFTW MPI Reference
+@subsection MPI Wisdom Communication
+
+To facilitate synchronizing wisdom among the different MPI processes,
+we provide two functions:
+
+@findex fftw_mpi_gather_wisdom
+@findex fftw_mpi_broadcast_wisdom
+@example
+void fftw_mpi_gather_wisdom(MPI_Comm comm);
+void fftw_mpi_broadcast_wisdom(MPI_Comm comm);
+@end example
+
+The @code{fftw_mpi_gather_wisdom} function gathers all wisdom in the
+given communicator @code{comm} to the process of rank 0 in the
+communicator: that process obtains the union of all wisdom on all the
+processes.  As a side effect, some other processes will gain
+additional wisdom from other processes, but only process 0 will gain
+the complete union.
+
+The @code{fftw_mpi_broadcast_wisdom} does the reverse: it exports
+wisdom from process 0 in @code{comm} to all other processes in the
+communicator, replacing any wisdom they currently have.
+
+@xref{FFTW MPI Wisdom}.
+
+@c ------------------------------------------------------------
+@node FFTW MPI Fortran Interface,  , FFTW MPI Reference, Distributed-memory FFTW with MPI
+@section FFTW MPI Fortran Interface
+@cindex Fortran interface
+
+@cindex iso_c_binding
+The FFTW MPI interface is callable from modern Fortran compilers
+supporting the Fortran 2003 @code{iso_c_binding} standard for calling
+C functions.  As described in @ref{Calling FFTW from Modern Fortran},
+this means that you can directly call FFTW's C interface from Fortran
+with only minor changes in syntax.  There are, however, a few things
+specific to the MPI interface to keep in mind:
+
+@itemize @bullet
+
+@item
+Instead of including @code{fftw3.f03} as in @ref{Overview of Fortran
+interface }, you should @code{include 'fftw3-mpi.f03'} (after
+@code{use, intrinsic :: iso_c_binding} as before).  The
+@code{fftw3-mpi.f03} file includes @code{fftw3.f03}, so you should
+@emph{not} @code{include} them both yourself.  (You will also want to
+include the MPI header file, usually via @code{include 'mpif.h'} or
+similar, although though this is not needed by @code{fftw3-mpi.f03}
+@i{per se}.)  (To use the @samp{fftwl_} @code{long double} extended-precision routines in supporting compilers, you should include @code{fftw3f-mpi.f03} in @emph{addition} to @code{fftw3-mpi.f03}. @xref{Extended and quadruple precision in Fortran}.)
+
+@item
+Because of the different storage conventions between C and Fortran,
+you reverse the order of your array dimensions when passing them to
+FFTW (@pxref{Reversing array dimensions}).  This is merely a
+difference in notation and incurs no performance overhead.  However,
+it means that, whereas in C the @emph{first} dimension is distributed,
+in Fortran the @emph{last} dimension of your array is distributed.
+
+@item
+@cindex MPI communicator
+In Fortran, communicators are stored as @code{integer} types; there is
+no @code{MPI_Comm} type, nor is there any way to access a C
+@code{MPI_Comm}.  Fortunately, this is taken care of for you by the
+FFTW Fortran interface: whenever the C interface expects an
+@code{MPI_Comm} type, you should pass the Fortran communicator as an
+@code{integer}.@footnote{Technically, this is because you aren't
+actually calling the C functions directly. You are calling wrapper
+functions that translate the communicator with @code{MPI_Comm_f2c}
+before calling the ordinary C interface.  This is all done
+transparently, however, since the @code{fftw3-mpi.f03} interface file
+renames the wrappers so that they are called in Fortran with the same
+names as the C interface functions.}
+
+@item
+Because you need to call the @samp{local_size} function to find out
+how much space to allocate, and this may be @emph{larger} than the
+local portion of the array (@pxref{MPI Data Distribution}), you should
+@emph{always} allocate your arrays dynamically using FFTW's allocation
+routines as described in @ref{Allocating aligned memory in Fortran}.
+(Coincidentally, this also provides the best performance by
+guaranteeding proper data alignment.)
+
+@item
+Because all sizes in the MPI FFTW interface are declared as
+@code{ptrdiff_t} in C, you should use @code{integer(C_INTPTR_T)} in
+Fortran (@pxref{FFTW Fortran type reference}).
+
+@item
+@findex fftw_execute_dft
+@findex fftw_mpi_execute_dft
+@cindex new-array execution
+In Fortran, because of the language semantics, we generally recommend
+using the new-array execute functions for all plans, even in the
+common case where you are executing the plan on the same arrays for
+which the plan was created (@pxref{Plan execution in Fortran}).
+However, note that in the MPI interface these functions are changed:
+@code{fftw_execute_dft} becomes @code{fftw_mpi_execute_dft},
+etcetera. @xref{Using MPI Plans}.
+
+@end itemize
+
+For example, here is a Fortran code snippet to perform a distributed
+@twodims{L,M} complex DFT in-place.  (This assumes you have already
+initialized MPI with @code{MPI_init} and have also performed
+@code{call fftw_mpi_init}.)
+
+@example
+  use, intrinsic :: iso_c_binding
+  include 'fftw3-mpi.f03'
+  integer(C_INTPTR_T), parameter :: L = ...
+  integer(C_INTPTR_T), parameter :: M = ...
+  type(C_PTR) :: plan, cdata
+  complex(C_DOUBLE_COMPLEX), pointer :: data(:,:)
+  integer(C_INTPTR_T) :: i, j, alloc_local, local_M, local_j_offset
+
+!   @r{get local data size and allocate (note dimension reversal)}
+  alloc_local = fftw_mpi_local_size_2d(M, L, MPI_COMM_WORLD, &
+                                       local_M, local_j_offset)
+  cdata = fftw_alloc_complex(alloc_local)
+  call c_f_pointer(cdata, data, [L,local_M])
+
+!   @r{create MPI plan for in-place forward DFT (note dimension reversal)}
+  plan = fftw_mpi_plan_dft_2d(M, L, data, data, MPI_COMM_WORLD, &
+                              FFTW_FORWARD, FFTW_MEASURE)
+
+! @r{initialize data to some function} my_function(i,j)
+  do j = 1, local_M
+    do i = 1, L
+      data(i, j) = my_function(i, j + local_j_offset)
+    end do
+  end do
+
+! @r{compute transform (as many times as desired)}
+  call fftw_mpi_execute_dft(plan, data, data)
+
+  call fftw_destroy_plan(plan)
+  call fftw_free(cdata)
+@end example
+
+Note that when we called @code{fftw_mpi_local_size_2d} and
+@code{fftw_mpi_plan_dft_2d} with the dimensions in reversed order,
+since a @twodims{L,M} Fortran array is viewed by FFTW in C as a
+@twodims{M, L} array.  This means that the array was distributed over
+the @code{M} dimension, the local portion of which is a
+@twodims{L,local_M} array in Fortran.  (You must @emph{not} use an
+@code{allocate} statement to allocate an @twodims{L,local_M} array,
+however; you must allocate @code{alloc_local} complex numbers, which
+may be greater than @code{L * local_M}, in order to reserve space for
+intermediate steps of the transform.)  Finally, we mention that
+because C's array indices are zero-based, the @code{local_j_offset}
+argument can conveniently be interpreted as an offset in the 1-based
+@code{j} index (rather than as a starting index as in C).
+
+If instead you had used the @code{ior(FFTW_MEASURE,
+FFTW_MPI_TRANSPOSED_OUT)} flag, the output of the transform would be a
+transposed @twodims{M,local_L} array, associated with the @emph{same}
+@code{cdata} allocation (since the transform is in-place), and which
+you could declare with:
+
+@example
+  complex(C_DOUBLE_COMPLEX), pointer :: tdata(:,:)
+  ...
+  call c_f_pointer(cdata, tdata, [M,local_L])
+@end example
+
+where @code{local_L} would have been obtained by changing the
+@code{fftw_mpi_local_size_2d} call to:
+
+@example
+  alloc_local = fftw_mpi_local_size_2d_transposed(M, L, MPI_COMM_WORLD, &
+                           local_M, local_j_offset, local_L, local_i_offset)
+@end example
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/other.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/other.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,398 @@
+@node Other Important Topics, FFTW Reference, Tutorial, Top
+@chapter Other Important Topics
+@menu
+* SIMD alignment and fftw_malloc::  
+* Multi-dimensional Array Format::  
+* Words of Wisdom-Saving Plans::  
+* Caveats in Using Wisdom::     
+@end menu
+
+@c ------------------------------------------------------------
+@node SIMD alignment and fftw_malloc, Multi-dimensional Array Format, Other Important Topics, Other Important Topics
+@section SIMD alignment and fftw_malloc
+
+SIMD, which stands for ``Single Instruction Multiple Data,'' is a set of
+special operations supported by some processors to perform a single
+operation on several numbers (usually 2 or 4) simultaneously.  SIMD
+floating-point instructions are available on several popular CPUs:
+SSE/SSE2/AVX on recent x86/x86-64 processors, AltiVec (single precision)
+on some PowerPCs (Apple G4 and higher), NEON on some ARM models, and MIPS Paired Single
+(currently only in FFTW 3.2.x).  FFTW can be compiled to support the
+SIMD instructions on any of these systems.
+@cindex SIMD
+@cindex SSE
+@cindex SSE2
+@cindex AVX
+@cindex AltiVec
+@cindex MIPS PS
+@cindex precision
+
+
+A program linking to an FFTW library compiled with SIMD support can
+obtain a nonnegligible speedup for most complex and r2c/c2r
+transforms.  In order to obtain this speedup, however, the arrays of
+complex (or real) data passed to FFTW must be specially aligned in
+memory (typically 16-byte aligned), and often this alignment is more
+stringent than that provided by the usual @code{malloc} (etc.)
+allocation routines.
+
+@cindex portability
+In order to guarantee proper alignment for SIMD, therefore, in case
+your program is ever linked against a SIMD-using FFTW, we recommend
+allocating your transform data with @code{fftw_malloc} and
+de-allocating it with @code{fftw_free}.
+@findex fftw_malloc
+@findex fftw_free
+These have exactly the same interface and behavior as
+@code{malloc}/@code{free}, except that for a SIMD FFTW they ensure
+that the returned pointer has the necessary alignment (by calling
+@code{memalign} or its equivalent on your OS).
+
+You are not @emph{required} to use @code{fftw_malloc}.  You can
+allocate your data in any way that you like, from @code{malloc} to
+@code{new} (in C++) to a fixed-size array declaration.  If the array
+happens not to be properly aligned, FFTW will not use the SIMD
+extensions.
+@cindex C++
+
+@findex fftw_alloc_real
+@findex fftw_alloc_complex
+Since @code{fftw_malloc} only ever needs to be used for real and
+complex arrays, we provide two convenient wrapper routines
+@code{fftw_alloc_real(N)} and @code{fftw_alloc_complex(N)} that are
+equivalent to @code{(double*)fftw_malloc(sizeof(double) * N)} and
+@code{(fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N)},
+respectively (or their equivalents in other precisions).
+
+@c ------------------------------------------------------------
+@node Multi-dimensional Array Format, Words of Wisdom-Saving Plans, SIMD alignment and fftw_malloc, Other Important Topics
+@section Multi-dimensional Array Format
+
+This section describes the format in which multi-dimensional arrays
+are stored in FFTW.  We felt that a detailed discussion of this topic
+was necessary.  Since several different formats are common, this topic
+is often a source of confusion.
+
+@menu
+* Row-major Format::            
+* Column-major Format::         
+* Fixed-size Arrays in C::      
+* Dynamic Arrays in C::         
+* Dynamic Arrays in C-The Wrong Way::  
+@end menu
+
+@c =========>
+@node Row-major Format, Column-major Format, Multi-dimensional Array Format, Multi-dimensional Array Format
+@subsection Row-major Format
+@cindex row-major
+
+The multi-dimensional arrays passed to @code{fftw_plan_dft} etcetera
+are expected to be stored as a single contiguous block in
+@dfn{row-major} order (sometimes called ``C order'').  Basically, this
+means that as you step through adjacent memory locations, the first
+dimension's index varies most slowly and the last dimension's index
+varies most quickly.
+
+To be more explicit, let us consider an array of rank @math{d} whose
+dimensions are @ndims{}. Now, we specify a location in the array by a
+sequence of @math{d} (zero-based) indices, one for each dimension:
+@tex
+$(i_0, i_1, i_2, \ldots, i_{d-1})$.
+@end tex
+@ifinfo
+(i[0], i[1], ..., i[d-1]).
+@end ifinfo
+@html
+(i<sub>0</sub>, i<sub>1</sub>, i<sub>2</sub>,..., i<sub>d-1</sub>).
+@end html
+If the array is stored in row-major
+order, then this element is located at the position
+@tex
+$i_{d-1} + n_{d-1} (i_{d-2} + n_{d-2} (\ldots + n_1 i_0))$.
+@end tex
+@ifinfo
+i[d-1] + n[d-1] * (i[d-2] + n[d-2] * (... + n[1] * i[0])).
+@end ifinfo
+@html
+i<sub>d-1</sub> + n<sub>d-1</sub> * (i<sub>d-2</sub> + n<sub>d-2</sub> * (... + n<sub>1</sub> * i<sub>0</sub>)).
+@end html
+
+Note that, for the ordinary complex DFT, each element of the array
+must be of type @code{fftw_complex}; i.e. a (real, imaginary) pair of
+(double-precision) numbers. 
+
+In the advanced FFTW interface, the physical dimensions @math{n} from
+which the indices are computed can be different from (larger than)
+the logical dimensions of the transform to be computed, in order to
+transform a subset of a larger array.
+@cindex advanced interface
+Note also that, in the advanced interface, the expression above is
+multiplied by a @dfn{stride} to get the actual array index---this is
+useful in situations where each element of the multi-dimensional array
+is actually a data structure (or another array), and you just want to
+transform a single field. In the basic interface, however, the stride
+is 1.
+@cindex stride
+
+@c =========>
+@node Column-major Format, Fixed-size Arrays in C, Row-major Format, Multi-dimensional Array Format
+@subsection Column-major Format
+@cindex column-major
+
+Readers from the Fortran world are used to arrays stored in
+@dfn{column-major} order (sometimes called ``Fortran order'').  This is
+essentially the exact opposite of row-major order in that, here, the
+@emph{first} dimension's index varies most quickly.
+
+If you have an array stored in column-major order and wish to
+transform it using FFTW, it is quite easy to do.  When creating the
+plan, simply pass the dimensions of the array to the planner in
+@emph{reverse order}.  For example, if your array is a rank three
+@code{N x M x L} matrix in column-major order, you should pass the
+dimensions of the array as if it were an @code{L x M x N} matrix
+(which it is, from the perspective of FFTW).  This is done for you
+@emph{automatically} by the FFTW legacy-Fortran interface
+(@pxref{Calling FFTW from Legacy Fortran}), but you must do it
+manually with the modern Fortran interface (@pxref{Reversing array
+dimensions}).
+@cindex Fortran interface
+
+@c =========>
+@node Fixed-size Arrays in C, Dynamic Arrays in C, Column-major Format, Multi-dimensional Array Format
+@subsection Fixed-size Arrays in C
+@cindex C multi-dimensional arrays
+
+A multi-dimensional array whose size is declared at compile time in C
+is @emph{already} in row-major order.  You don't have to do anything
+special to transform it.  For example:
+
+@example
+@{
+     fftw_complex data[N0][N1][N2];
+     fftw_plan plan;
+     ...
+     plan = fftw_plan_dft_3d(N0, N1, N2, &data[0][0][0], &data[0][0][0],
+                             FFTW_FORWARD, FFTW_ESTIMATE);
+     ...
+@}
+@end example
+
+This will plan a 3d in-place transform of size @code{N0 x N1 x N2}.
+Notice how we took the address of the zero-th element to pass to the
+planner (we could also have used a typecast).
+
+However, we tend to @emph{discourage} users from declaring their
+arrays in this way, for two reasons.  First, this allocates the array
+on the stack (``automatic'' storage), which has a very limited size on
+most operating systems (declaring an array with more than a few
+thousand elements will often cause a crash).  (You can get around this
+limitation on many systems by declaring the array as
+@code{static} and/or global, but that has its own drawbacks.)
+Second, it may not optimally align the array for use with a SIMD
+FFTW (@pxref{SIMD alignment and fftw_malloc}).  Instead, we recommend
+using @code{fftw_malloc}, as described below.
+
+@c =========>
+@node Dynamic Arrays in C, Dynamic Arrays in C-The Wrong Way, Fixed-size Arrays in C, Multi-dimensional Array Format
+@subsection Dynamic Arrays in C
+
+We recommend allocating most arrays dynamically, with
+@code{fftw_malloc}.  This isn't too hard to do, although it is not as
+straightforward for multi-dimensional arrays as it is for
+one-dimensional arrays.
+
+Creating the array is simple: using a dynamic-allocation routine like
+@code{fftw_malloc}, allocate an array big enough to store N
+@code{fftw_complex} values (for a complex DFT), where N is the product
+of the sizes of the array dimensions (i.e. the total number of complex
+values in the array).  For example, here is code to allocate a
+@threedims{5,12,27} rank-3 array:
+@findex fftw_malloc
+
+@example
+fftw_complex *an_array;
+an_array = (fftw_complex*) fftw_malloc(5*12*27 * sizeof(fftw_complex));
+@end example
+
+Accessing the array elements, however, is more tricky---you can't
+simply use multiple applications of the @samp{[]} operator like you
+could for fixed-size arrays.  Instead, you have to explicitly compute
+the offset into the array using the formula given earlier for
+row-major arrays.  For example, to reference the @math{(i,j,k)}-th
+element of the array allocated above, you would use the expression
+@code{an_array[k + 27 * (j + 12 * i)]}.
+
+This pain can be alleviated somewhat by defining appropriate macros,
+or, in C++, creating a class and overloading the @samp{()} operator.
+The recent C99 standard provides a way to reinterpret the dynamic
+array as a ``variable-length'' multi-dimensional array amenable to
+@samp{[]}, but this feature is not yet widely supported by compilers.
+@cindex C99
+@cindex C++
+
+@c =========>
+@node Dynamic Arrays in C-The Wrong Way,  , Dynamic Arrays in C, Multi-dimensional Array Format
+@subsection Dynamic Arrays in C---The Wrong Way
+
+A different method for allocating multi-dimensional arrays in C is
+often suggested that is incompatible with FFTW: @emph{using it will
+cause FFTW to die a painful death}.  We discuss the technique here,
+however, because it is so commonly known and used.  This method is to
+create arrays of pointers of arrays of pointers of @dots{}etcetera.
+For example, the analogue in this method to the example above is:
+
+@example
+int i,j;
+fftw_complex ***a_bad_array;  /* @r{another way to make a 5x12x27 array} */
+
+a_bad_array = (fftw_complex ***) malloc(5 * sizeof(fftw_complex **));
+for (i = 0; i < 5; ++i) @{
+     a_bad_array[i] = 
+        (fftw_complex **) malloc(12 * sizeof(fftw_complex *));
+     for (j = 0; j < 12; ++j)
+          a_bad_array[i][j] =
+                (fftw_complex *) malloc(27 * sizeof(fftw_complex));
+@}
+@end example
+
+As you can see, this sort of array is inconvenient to allocate (and
+deallocate).  On the other hand, it has the advantage that the
+@math{(i,j,k)}-th element can be referenced simply by
+@code{a_bad_array[i][j][k]}.
+
+If you like this technique and want to maximize convenience in accessing
+the array, but still want to pass the array to FFTW, you can use a
+hybrid method.  Allocate the array as one contiguous block, but also
+declare an array of arrays of pointers that point to appropriate places
+in the block.  That sort of trick is beyond the scope of this
+documentation; for more information on multi-dimensional arrays in C,
+see the @code{comp.lang.c}
+@uref{http://c-faq.com/aryptr/dynmuldimary.html, FAQ}.
+
+@c ------------------------------------------------------------
+@node Words of Wisdom-Saving Plans, Caveats in Using Wisdom, Multi-dimensional Array Format, Other Important Topics
+@section Words of Wisdom---Saving Plans
+@cindex wisdom
+@cindex saving plans to disk
+
+FFTW implements a method for saving plans to disk and restoring them.
+In fact, what FFTW does is more general than just saving and loading
+plans.  The mechanism is called @dfn{wisdom}.  Here, we describe
+this feature at a high level. @xref{FFTW Reference}, for a less casual
+but more complete discussion of how to use wisdom in FFTW.
+
+Plans created with the @code{FFTW_MEASURE}, @code{FFTW_PATIENT}, or
+@code{FFTW_EXHAUSTIVE} options produce near-optimal FFT performance,
+but may require a long time to compute because FFTW must measure the
+runtime of many possible plans and select the best one.  This setup is
+designed for the situations where so many transforms of the same size
+must be computed that the start-up time is irrelevant.  For short
+initialization times, but slower transforms, we have provided
+@code{FFTW_ESTIMATE}.  The @code{wisdom} mechanism is a way to get the
+best of both worlds: you compute a good plan once, save it to
+disk, and later reload it as many times as necessary.  The wisdom
+mechanism can actually save and reload many plans at once, not just
+one.
+@ctindex FFTW_MEASURE
+@ctindex FFTW_PATIENT
+@ctindex FFTW_EXHAUSTIVE
+@ctindex FFTW_ESTIMATE
+
+
+Whenever you create a plan, the FFTW planner accumulates wisdom, which
+is information sufficient to reconstruct the plan.  After planning,
+you can save this information to disk by means of the function:
+@example
+int fftw_export_wisdom_to_filename(const char *filename);
+@end example
+@findex fftw_export_wisdom_to_filename
+(This function returns non-zero on success.)
+
+The next time you run the program, you can restore the wisdom with
+@code{fftw_import_wisdom_from_filename} (which also returns non-zero on success),
+and then recreate the plan using the same flags as before.
+@example
+int fftw_import_wisdom_from_filename(const char *filename);
+@end example
+@findex fftw_import_wisdom_from_filename
+
+Wisdom is automatically used for any size to which it is applicable, as
+long as the planner flags are not more ``patient'' than those with which
+the wisdom was created.  For example, wisdom created with
+@code{FFTW_MEASURE} can be used if you later plan with
+@code{FFTW_ESTIMATE} or @code{FFTW_MEASURE}, but not with
+@code{FFTW_PATIENT}.
+
+The @code{wisdom} is cumulative, and is stored in a global, private
+data structure managed internally by FFTW.  The storage space required
+is minimal, proportional to the logarithm of the sizes the wisdom was
+generated from.  If memory usage is a concern, however, the wisdom can
+be forgotten and its associated memory freed by calling:
+@example
+void fftw_forget_wisdom(void);
+@end example
+@findex fftw_forget_wisdom
+
+Wisdom can be exported to a file, a string, or any other medium.
+For details, see @ref{Wisdom}.
+
+@node Caveats in Using Wisdom,  , Words of Wisdom-Saving Plans, Other Important Topics
+@section Caveats in Using Wisdom
+@cindex wisdom, problems with
+
+@quotation
+@html
+<i>
+@end html
+For in much wisdom is much grief, and he that increaseth knowledge
+increaseth sorrow.
+@html
+</i>
+@end html
+[Ecclesiastes 1:18]
+@cindex Ecclesiastes
+@end quotation
+@iftex
+@medskip
+@end iftex
+
+@cindex portability
+There are pitfalls to using wisdom, in that it can negate FFTW's
+ability to adapt to changing hardware and other conditions. For
+example, it would be perfectly possible to export wisdom from a
+program running on one processor and import it into a program running
+on another processor.  Doing so, however, would mean that the second
+program would use plans optimized for the first processor, instead of
+the one it is running on.
+
+It should be safe to reuse wisdom as long as the hardware and program
+binaries remain unchanged. (Actually, the optimal plan may change even
+between runs of the same binary on identical hardware, due to
+differences in the virtual memory environment, etcetera.  Users
+seriously interested in performance should worry about this problem,
+too.)  It is likely that, if the same wisdom is used for two
+different program binaries, even running on the same machine, the
+plans may be sub-optimal because of differing code alignments.  It is
+therefore wise to recreate wisdom every time an application is
+recompiled.  The more the underlying hardware and software changes
+between the creation of wisdom and its use, the greater grows
+the risk of sub-optimal plans.
+
+Nevertheless, if the choice is between using @code{FFTW_ESTIMATE} or
+using possibly-suboptimal wisdom (created on the same machine, but for a
+different binary), the wisdom is likely to be better.  For this reason,
+we provide a function to import wisdom from a standard system-wide
+location (@code{/etc/fftw/wisdom} on Unix):
+@cindex wisdom, system-wide
+
+@example
+int fftw_import_system_wisdom(void);
+@end example
+@findex fftw_import_system_wisdom
+
+FFTW also provides a standalone program, @code{fftw-wisdom} (described
+by its own @code{man} page on Unix) with which users can create wisdom,
+e.g. for a canonical set of sizes to store in the system wisdom file.
+@xref{Wisdom Utilities}.
+@cindex fftw-wisdom utility
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/reference.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/reference.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2435 @@
+@node FFTW Reference, Multi-threaded FFTW, Other Important Topics, Top
+@chapter FFTW Reference
+
+This chapter provides a complete reference for all sequential (i.e.,
+one-processor) FFTW functions.  Parallel transforms are described in
+later chapters.
+
+@menu
+* Data Types and Files::        
+* Using Plans::                 
+* Basic Interface::             
+* Advanced Interface::          
+* Guru Interface::              
+* New-array Execute Functions::  
+* Wisdom::                      
+* What FFTW Really Computes::   
+@end menu
+
+@c ------------------------------------------------------------
+@node Data Types and Files, Using Plans, FFTW Reference, FFTW Reference
+@section Data Types and Files
+
+All programs using FFTW should include its header file:
+
+@example
+#include <fftw3.h>
+@end example
+
+You must also link to the FFTW library.  On Unix, this
+means adding @code{-lfftw3 -lm} at the @emph{end} of the link command.
+
+@menu
+* Complex numbers::             
+* Precision::                   
+* Memory Allocation::           
+@end menu
+
+@c =========>
+@node Complex numbers, Precision, Data Types and Files, Data Types and Files
+@subsection Complex numbers
+
+The default FFTW interface uses @code{double} precision for all
+floating-point numbers, and defines a @code{fftw_complex} type to hold
+complex numbers as:
+
+@example
+typedef double fftw_complex[2];
+@end example
+@tindex fftw_complex
+
+Here, the @code{[0]} element holds the real part and the @code{[1]}
+element holds the imaginary part.
+
+Alternatively, if you have a C compiler (such as @code{gcc}) that
+supports the C99 revision of the ANSI C standard, you can use C's new
+native complex type (which is binary-compatible with the typedef above).
+In particular, if you @code{#include <complex.h>} @emph{before}
+@code{<fftw3.h>}, then @code{fftw_complex} is defined to be the native
+complex type and you can manipulate it with ordinary arithmetic
+(e.g. @code{x = y * (3+4*I)}, where @code{x} and @code{y} are
+@code{fftw_complex} and @code{I} is the standard symbol for the
+imaginary unit);
+@cindex C99
+
+
+C++ has its own @code{complex<T>} template class, defined in the
+standard @code{<complex>} header file.  Reportedly, the C++ standards
+committee has recently agreed to mandate that the storage format used
+for this type be binary-compatible with the C99 type, i.e. an array
+@code{T[2]} with consecutive real @code{[0]} and imaginary @code{[1]}
+parts.  (See report
+@uref{http://www.open-std.org/jtc1/sc22/WG21/docs/papers/2002/n1388.pdf
+WG21/N1388}.)  Although not part of the official standard as of this
+writing, the proposal stated that: ``This solution has been tested with
+all current major implementations of the standard library and shown to
+be working.''  To the extent that this is true, if you have a variable
+@code{complex<double> *x}, you can pass it directly to FFTW via
+@code{reinterpret_cast<fftw_complex*>(x)}.
+@cindex C++
+@cindex portability
+
+@c =========>
+@node Precision, Memory Allocation, Complex numbers, Data Types and Files
+@subsection Precision
+@cindex precision
+
+You can install single and long-double precision versions of FFTW,
+which replace @code{double} with @code{float} and @code{long double},
+respectively (@pxref{Installation and Customization}).  To use these
+interfaces, you:
+
+@itemize @bullet
+
+@item
+Link to the single/long-double libraries; on Unix, @code{-lfftw3f} or
+@code{-lfftw3l} instead of (or in addition to) @code{-lfftw3}.  (You
+can link to the different-precision libraries simultaneously.)
+
+@item
+Include the @emph{same} @code{<fftw3.h>} header file.
+
+@item
+Replace all lowercase instances of @samp{fftw_} with @samp{fftwf_} or
+@samp{fftwl_} for single or long-double precision, respectively.
+(@code{fftw_complex} becomes @code{fftwf_complex}, @code{fftw_execute}
+becomes @code{fftwf_execute}, etcetera.)
+
+@item
+Uppercase names, i.e. names beginning with @samp{FFTW_}, remain the
+same.
+
+@item
+Replace @code{double} with @code{float} or @code{long double} for
+subroutine parameters.
+
+@end itemize
+
+Depending upon your compiler and/or hardware, @code{long double} may not
+be any more precise than @code{double} (or may not be supported at all,
+although it is standard in C99).
+@cindex C99
+
+
+We also support using the nonstandard @code{__float128}
+quadruple-precision type provided by recent versions of @code{gcc} on
+32- and 64-bit x86 hardware (@pxref{Installation and Customization}).
+To use this type, link with @code{-lfftw3q -lquadmath -lm} (the
+@code{libquadmath} library provided by @code{gcc} is needed for
+quadruple-precision trigonometric functions) and use @samp{fftwq_}
+identifiers.
+
+@c =========>
+@node Memory Allocation,  , Precision, Data Types and Files
+@subsection Memory Allocation
+
+@example
+void *fftw_malloc(size_t n);
+void fftw_free(void *p);
+@end example
+@findex fftw_malloc
+@findex fftw_free
+
+These are functions that behave identically to @code{malloc} and
+@code{free}, except that they guarantee that the returned pointer obeys
+any special alignment restrictions imposed by any algorithm in FFTW
+(e.g. for SIMD acceleration).  @xref{SIMD alignment and fftw_malloc}.
+@cindex alignment
+
+
+Data allocated by @code{fftw_malloc} @emph{must} be deallocated by
+@code{fftw_free} and not by the ordinary @code{free}.
+
+These routines simply call through to your operating system's
+@code{malloc} or, if necessary, its aligned equivalent
+(e.g. @code{memalign}), so you normally need not worry about any
+significant time or space overhead.  You are @emph{not required} to use
+them to allocate your data, but we strongly recommend it.
+
+Note: in C++, just as with ordinary @code{malloc}, you must typecast
+the output of @code{fftw_malloc} to whatever pointer type you are
+allocating.
+@cindex C++
+
+
+We also provide the following two convenience functions to allocate
+real and complex arrays with @code{n} elements, which are equivalent
+to @code{(double *) fftw_malloc(sizeof(double) * n)} and
+@code{(fftw_complex *) fftw_malloc(sizeof(fftw_complex) * n)},
+respectively:
+
+@example
+double *fftw_alloc_real(size_t n);
+fftw_complex *fftw_alloc_complex(size_t n);
+@end example
+@findex fftw_alloc_real
+@findex fftw_alloc_complex
+
+The equivalent functions in other precisions allocate arrays of @code{n}
+elements in that precision.  e.g. @code{fftwf_alloc_real(n)} is
+equivalent to @code{(float *) fftwf_malloc(sizeof(float) * n)}.
+@cindex precision
+
+@c ------------------------------------------------------------
+@node Using Plans, Basic Interface, Data Types and Files, FFTW Reference
+@section Using Plans
+
+Plans for all transform types in FFTW are stored as type
+@code{fftw_plan} (an opaque pointer type), and are created by one of the
+various planning routines described in the following sections.
+@tindex fftw_plan
+An @code{fftw_plan} contains all information necessary to compute the
+transform, including the pointers to the input and output arrays.
+
+@example
+void fftw_execute(const fftw_plan plan);
+@end example
+@findex fftw_execute
+
+This executes the @code{plan}, to compute the corresponding transform on
+the arrays for which it was planned (which must still exist).  The plan
+is not modified, and @code{fftw_execute} can be called as many times as
+desired.
+
+To apply a given plan to a different array, you can use the new-array execute
+interface.  @xref{New-array Execute Functions}.
+
+@code{fftw_execute} (and equivalents) is the only function in FFTW
+guaranteed to be thread-safe; see @ref{Thread safety}.
+
+This function:
+@example
+void fftw_destroy_plan(fftw_plan plan);
+@end example
+@findex fftw_destroy_plan
+deallocates the @code{plan} and all its associated data.
+
+FFTW's planner saves some other persistent data, such as the
+accumulated wisdom and a list of algorithms available in the current
+configuration.  If you want to deallocate all of that and reset FFTW
+to the pristine state it was in when you started your program, you can
+call:
+
+@example
+void fftw_cleanup(void);
+@end example
+@findex fftw_cleanup
+
+After calling @code{fftw_cleanup}, all existing plans become undefined,
+and you should not attempt to execute them nor to destroy them.  You can
+however create and execute/destroy new plans, in which case FFTW starts
+accumulating wisdom information again.
+
+@code{fftw_cleanup} does not deallocate your plans, however.  To prevent
+memory leaks, you must still call @code{fftw_destroy_plan} before
+executing @code{fftw_cleanup}.
+
+Occasionally, it may useful to know FFTW's internal ``cost'' metric
+that it uses to compare plans to one another; this cost is
+proportional to an execution time of the plan, in undocumented units,
+if the plan was created with the @code{FFTW_MEASURE} or other
+timing-based options, or alternatively is a heuristic cost function
+for @code{FFTW_ESTIMATE} plans.  (The cost values of measured and
+estimated plans are not comparable, being in different units.  Also,
+costs from different FFTW versions or the same version compiled
+differently may not be in the same units.  Plans created from wisdom
+have a cost of 0 since no timing measurement is performed for them.
+Finally, certain problems for which only one top-level algorithm was
+possible may have required no measurements of the cost of the whole
+plan, in which case @code{fftw_cost} will also return 0.)  The cost
+metric for a given plan is returned by:
+
+@example
+double fftw_cost(const fftw_plan plan);
+@end example
+@findex fftw_cost
+
+The following two routines are provided purely for academic purposes
+(that is, for entertainment).
+
+@example
+void fftw_flops(const fftw_plan plan, 
+                double *add, double *mul, double *fma);
+@end example
+@findex fftw_flops
+
+Given a @code{plan}, set @code{add}, @code{mul}, and @code{fma} to an
+exact count of the number of floating-point additions, multiplications,
+and fused multiply-add operations involved in the plan's execution.  The
+total number of floating-point operations (flops) is @code{add + mul +
+2*fma}, or @code{add + mul + fma} if the hardware supports fused
+multiply-add instructions (although the number of FMA operations is only
+approximate because of compiler voodoo).  (The number of operations
+should be an integer, but we use @code{double} to avoid overflowing
+@code{int} for large transforms; the arguments are of type @code{double}
+even for single and long-double precision versions of FFTW.)
+
+@example
+void fftw_fprint_plan(const fftw_plan plan, FILE *output_file);
+void fftw_print_plan(const fftw_plan plan);
+@end example
+@findex fftw_fprint_plan
+@findex fftw_print_plan
+
+This outputs a ``nerd-readable'' representation of the @code{plan} to
+the given file or to @code{stdout}, respectively.
+
+@c ------------------------------------------------------------
+@node Basic Interface, Advanced Interface, Using Plans, FFTW Reference
+@section Basic Interface
+@cindex basic interface
+
+Recall that the FFTW API is divided into three parts@footnote{@i{Gallia est
+omnis divisa in partes tres} (Julius Caesar).}: the @dfn{basic interface}
+computes a single transform of contiguous data, the @dfn{advanced
+interface} computes transforms of multiple or strided arrays, and the
+@dfn{guru interface} supports the most general data layouts,
+multiplicities, and strides.  This section describes the the basic
+interface, which we expect to satisfy the needs of most users.
+
+@menu
+* Complex DFTs::                
+* Planner Flags::               
+* Real-data DFTs::              
+* Real-data DFT Array Format::  
+* Real-to-Real Transforms::     
+* Real-to-Real Transform Kinds::  
+@end menu
+
+@c =========>
+@node Complex DFTs, Planner Flags, Basic Interface, Basic Interface
+@subsection Complex DFTs
+
+@example
+fftw_plan fftw_plan_dft_1d(int n0,
+                           fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                           fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                           fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+fftw_plan fftw_plan_dft(int rank, const int *n,
+                        fftw_complex *in, fftw_complex *out,
+                        int sign, unsigned flags);
+@end example
+@findex fftw_plan_dft_1d
+@findex fftw_plan_dft_2d
+@findex fftw_plan_dft_3d
+@findex fftw_plan_dft
+
+Plan a complex input/output discrete Fourier transform (DFT) in zero or
+more dimensions, returning an @code{fftw_plan} (@pxref{Using Plans}).
+
+Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+The planner returns @code{NULL} if the plan cannot be created.  In the
+standard FFTW distribution, the basic interface is guaranteed to return
+a non-@code{NULL} plan.  A plan may be @code{NULL}, however, if you are
+using a customized FFTW configuration supporting a restricted set of
+transforms.
+
+@subsubheading Arguments
+@itemize @bullet
+
+@item
+@code{rank} is the rank of the transform (it should be the size of the
+array @code{*n}), and can be any non-negative integer.  (@xref{Complex
+Multi-Dimensional DFTs}, for the definition of ``rank''.)  The
+@samp{_1d}, @samp{_2d}, and @samp{_3d} planners correspond to a
+@code{rank} of @code{1}, @code{2}, and @code{3}, respectively.  The rank
+may be zero, which is equivalent to a rank-1 transform of size 1, i.e. a
+copy of one number from input to output.
+
+@item
+@code{n0}, @code{n1}, @code{n2}, or @code{n[0..rank-1]} (as appropriate
+for each routine) specify the size of the transform dimensions.  They
+can be any positive integer.
+ 
+@itemize @minus
+@item
+@cindex row-major
+Multi-dimensional arrays are stored in row-major order with dimensions:
+@code{n0} x @code{n1}; or @code{n0} x @code{n1} x @code{n2}; or
+@code{n[0]} x @code{n[1]} x ... x @code{n[rank-1]}.
+@xref{Multi-dimensional Array Format}.
+@item
+FFTW is best at handling sizes of the form
+@ifinfo
+@math{2^a 3^b 5^c 7^d 11^e 13^f},
+@end ifinfo
+@tex
+$2^a 3^b 5^c 7^d 11^e 13^f$,
+@end tex
+@html
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,
+@end html
+where @math{e+f} is either @math{0} or @math{1}, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains @Onlogn{} performance even for prime sizes).  It is possible to customize FFTW
+for different array sizes; see @ref{Installation and Customization}.
+Transforms whose sizes are powers of @math{2} are especially fast.
+@end itemize
+
+@item
+@code{in} and @code{out} point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform).
+@cindex in-place
+These arrays are overwritten during planning, unless
+@code{FFTW_ESTIMATE} is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)
+
+If @code{in == out}, the transform is @dfn{in-place} and the input
+array is overwritten. If @code{in != out}, the two arrays must
+not overlap (but FFTW does not check for this condition).
+
+@item
+@ctindex FFTW_FORWARD
+@ctindex FFTW_BACKWARD
+@code{sign} is the sign of the exponent in the formula that defines the
+Fourier transform.  It can be @math{-1} (= @code{FFTW_FORWARD}) or
+@math{+1} (= @code{FFTW_BACKWARD}).
+
+@item
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+@end itemize
+
+FFTW computes an unnormalized transform: computing a forward followed by
+a backward transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the dimensions).
+@cindex normalization
+For more information, see @ref{What FFTW Really Computes}.
+
+@c =========>
+@node Planner Flags, Real-data DFTs, Complex DFTs, Basic Interface
+@subsection Planner Flags
+
+All of the planner routines in FFTW accept an integer @code{flags}
+argument, which is a bitwise OR (@samp{|}) of zero or more of the flag
+constants defined below.  These flags control the rigor (and time) of
+the planning process, and can also impose (or lift) restrictions on the
+type of transform algorithm that is employed.
+
+@emph{Important:} the planner overwrites the input array during
+planning unless a saved plan (@pxref{Wisdom}) is available for that
+problem, so you should initialize your input data after creating the
+plan.  The only exceptions to this are the @code{FFTW_ESTIMATE} and
+@code{FFTW_WISDOM_ONLY} flags, as mentioned below.
+
+In all  cases, if  wisdom is  available for the  given problem  that was
+created  with equal-or-greater  planning rigor,  then the  more rigorous
+wisdom is used.  For example, in @code{FFTW_ESTIMATE} mode any available
+wisdom is used, whereas  in @code{FFTW_PATIENT} mode only wisdom created
+in patient or exhaustive mode can be used.  @xref{Words of Wisdom-Saving
+Plans}.
+
+@subsubheading Planning-rigor flags
+@itemize @bullet
+
+@item
+@ctindex FFTW_ESTIMATE
+@code{FFTW_ESTIMATE} specifies that, instead of actual measurements of
+different algorithms, a simple heuristic is used to pick a (probably
+sub-optimal) plan quickly.  With this flag, the input/output arrays are
+not overwritten during planning.
+
+@item
+@ctindex FFTW_MEASURE
+@code{FFTW_MEASURE} tells FFTW to find an optimized plan by actually
+@emph{computing} several FFTs and measuring their execution time.
+Depending on your machine, this can take some time (often a few
+seconds).  @code{FFTW_MEASURE} is the default planning option.
+
+@item
+@ctindex FFTW_PATIENT
+@code{FFTW_PATIENT} is like @code{FFTW_MEASURE}, but considers a wider
+range of algorithms and often produces a ``more optimal'' plan
+(especially for large transforms), but at the expense of several times
+longer planning time (especially for large transforms).
+
+@item
+@ctindex FFTW_EXHAUSTIVE
+@code{FFTW_EXHAUSTIVE} is like @code{FFTW_PATIENT}, but considers an
+even wider range of algorithms, including many that we think are
+unlikely to be fast, to produce the most optimal plan but with a
+substantially increased planning time.
+
+@item
+@ctindex FFTW_WISDOM_ONLY
+@code{FFTW_WISDOM_ONLY} is a special planning mode in which the plan
+is only created if wisdom is available for the given problem, and
+otherwise a @code{NULL} plan is returned.  This can be combined with
+other flags, e.g. @samp{FFTW_WISDOM_ONLY | FFTW_PATIENT} creates a
+plan only if wisdom is available that was created in
+@code{FFTW_PATIENT} or @code{FFTW_EXHAUSTIVE} mode.  The
+@code{FFTW_WISDOM_ONLY} flag is intended for users who need to detect
+whether wisdom is available; for example, if wisdom is not available
+one may wish to allocate new arrays for planning so that user data is
+not overwritten.
+
+@end itemize
+
+@subsubheading Algorithm-restriction flags
+@itemize @bullet
+
+@item
+@ctindex FFTW_DESTROY_INPUT
+@code{FFTW_DESTROY_INPUT} specifies that an out-of-place transform is
+allowed to @emph{overwrite its input} array with arbitrary data; this
+can sometimes allow more efficient algorithms to be employed.
+@cindex out-of-place
+
+@item
+@ctindex FFTW_PRESERVE_INPUT
+@code{FFTW_PRESERVE_INPUT} specifies that an out-of-place transform must
+@emph{not change its input} array.  This is ordinarily the
+@emph{default}, except for c2r and hc2r (i.e. complex-to-real)
+transforms for which @code{FFTW_DESTROY_INPUT} is the default.  In the
+latter cases, passing @code{FFTW_PRESERVE_INPUT} will attempt to use
+algorithms that do not destroy the input, at the expense of worse
+performance; for multi-dimensional c2r transforms, however, no
+input-preserving algorithms are implemented and the planner will return
+@code{NULL} if one is requested.
+@cindex c2r
+@cindex hc2r
+
+@item
+@ctindex FFTW_UNALIGNED
+@cindex alignment
+@code{FFTW_UNALIGNED} specifies that the algorithm may not impose any
+unusual alignment requirements on the input/output arrays (i.e. no
+SIMD may be used).  This flag is normally @emph{not necessary}, since
+the planner automatically detects misaligned arrays.  The only use for
+this flag is if you want to use the new-array execute interface to
+execute a given plan on a different array that may not be aligned like
+the original.  (Using @code{fftw_malloc} makes this flag unnecessary
+even then.)
+
+@end itemize
+
+@subsubheading Limiting planning time
+
+@example
+extern void fftw_set_timelimit(double seconds);
+@end example
+@findex fftw_set_timelimit
+
+This function instructs FFTW to spend at most @code{seconds} seconds
+(approximately) in the planner.  If @code{seconds ==
+FFTW_NO_TIMELIMIT} (the default value, which is negative), then
+planning time is unbounded.  Otherwise, FFTW plans with a
+progressively wider range of algorithms until the the given time limit
+is reached or the given range of algorithms is explored, returning the
+best available plan.
+@ctindex FFTW_NO_TIMELIMIT
+
+
+For example, specifying @code{FFTW_PATIENT} first plans in
+@code{FFTW_ESTIMATE} mode, then in @code{FFTW_MEASURE} mode, then
+finally (time permitting) in @code{FFTW_PATIENT}.  If
+@code{FFTW_EXHAUSTIVE} is specified instead, the planner will further
+progress to @code{FFTW_EXHAUSTIVE} mode.
+
+Note that the @code{seconds} argument specifies only a rough limit; in
+practice, the planner may use somewhat more time if the time limit is
+reached when the planner is in the middle of an operation that cannot
+be interrupted.  At the very least, the planner will complete planning
+in @code{FFTW_ESTIMATE} mode (which is thus equivalent to a time limit
+of 0).
+
+
+@c =========>
+@node Real-data DFTs, Real-data DFT Array Format, Planner Flags, Basic Interface
+@subsection Real-data DFTs
+
+@example
+fftw_plan fftw_plan_dft_r2c_1d(int n0,
+                               double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                               double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                               double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                            double *in, fftw_complex *out,
+                            unsigned flags);
+@end example
+@findex fftw_plan_dft_r2c_1d
+@findex fftw_plan_dft_r2c_2d
+@findex fftw_plan_dft_r2c_3d
+@findex fftw_plan_dft_r2c
+@cindex r2c
+
+Plan a real-input/complex-output discrete Fourier transform (DFT) in
+zero or more dimensions, returning an @code{fftw_plan} (@pxref{Using
+Plans}).
+
+Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+The planner returns @code{NULL} if the plan cannot be created.  A
+non-@code{NULL} plan is always returned by the basic interface unless
+you are using a customized FFTW configuration supporting a restricted
+set of transforms, or if you use the @code{FFTW_PRESERVE_INPUT} flag
+with a multi-dimensional out-of-place c2r transform (see below).
+
+@subsubheading Arguments
+@itemize @bullet
+
+@item
+@code{rank} is the rank of the transform (it should be the size of the
+array @code{*n}), and can be any non-negative integer.  (@xref{Complex
+Multi-Dimensional DFTs}, for the definition of ``rank''.)  The
+@samp{_1d}, @samp{_2d}, and @samp{_3d} planners correspond to a
+@code{rank} of @code{1}, @code{2}, and @code{3}, respectively.  The rank
+may be zero, which is equivalent to a rank-1 transform of size 1, i.e. a
+copy of one real number (with zero imaginary part) from input to output.
+
+@item
+@code{n0}, @code{n1}, @code{n2}, or @code{n[0..rank-1]}, (as appropriate
+for each routine) specify the size of the transform dimensions.  They
+can be any positive integer.  This is different in general from the
+@emph{physical} array dimensions, which are described in @ref{Real-data
+DFT Array Format}.
+ 
+@itemize @minus
+@item
+FFTW is best at handling sizes of the form
+@ifinfo
+@math{2^a 3^b 5^c 7^d 11^e 13^f},
+@end ifinfo
+@tex
+$2^a 3^b 5^c 7^d 11^e 13^f$,
+@end tex
+@html
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,
+@end html
+where @math{e+f} is either @math{0} or @math{1}, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains @Onlogn{} performance even for prime sizes).  (It is possible to customize FFTW
+for different array sizes; see @ref{Installation and Customization}.)
+Transforms whose sizes are powers of @math{2} are especially fast, and
+it is generally beneficial for the @emph{last} dimension of an r2c/c2r
+transform to be @emph{even}.
+@end itemize
+
+@item
+@code{in} and @code{out} point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform).
+@cindex in-place
+These arrays are overwritten during planning, unless
+@code{FFTW_ESTIMATE} is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)  For an in-place transform, it
+is important to remember that the real array will require padding,
+described in @ref{Real-data DFT Array Format}.
+@cindex padding
+
+@item
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+@end itemize
+
+The inverse transforms, taking complex input (storing the non-redundant
+half of a logically Hermitian array) to real output, are given by:
+
+@example
+fftw_plan fftw_plan_dft_c2r_1d(int n0,
+                               fftw_complex *in, double *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_c2r_2d(int n0, int n1,
+                               fftw_complex *in, double *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_c2r_3d(int n0, int n1, int n2,
+                               fftw_complex *in, double *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_c2r(int rank, const int *n,
+                            fftw_complex *in, double *out,
+                            unsigned flags);
+@end example
+@findex fftw_plan_dft_c2r_1d
+@findex fftw_plan_dft_c2r_2d
+@findex fftw_plan_dft_c2r_3d
+@findex fftw_plan_dft_c2r
+@cindex c2r
+
+The arguments are the same as for the r2c transforms, except that the
+input and output data formats are reversed.
+
+FFTW computes an unnormalized transform: computing an r2c followed by a
+c2r transform (or vice versa) will result in the original data
+multiplied by the size of the transform (the product of the logical
+dimensions).
+@cindex normalization
+An r2c transform produces the same output as a @code{FFTW_FORWARD}
+complex DFT of the same input, and a c2r transform is correspondingly
+equivalent to @code{FFTW_BACKWARD}.  For more information, see @ref{What
+FFTW Really Computes}.
+
+@c =========>
+@node Real-data DFT Array Format, Real-to-Real Transforms, Real-data DFTs, Basic Interface
+@subsection Real-data DFT Array Format
+@cindex r2c/c2r multi-dimensional array format
+
+The output of a DFT of real data (r2c) contains symmetries that, in
+principle, make half of the outputs redundant (@pxref{What FFTW Really
+Computes}).  (Similarly for the input of an inverse c2r transform.)  In
+practice, it is not possible to entirely realize these savings in an
+efficient and understandable format that generalizes to
+multi-dimensional transforms.  Instead, the output of the r2c
+transforms is @emph{slightly} over half of the output of the
+corresponding complex transform.  We do not ``pack'' the data in any
+way, but store it as an ordinary array of @code{fftw_complex} values.
+In fact, this data is simply a subsection of what would be the array in
+the corresponding complex transform.
+
+Specifically, for a real transform of @math{d} (= @code{rank})
+dimensions @ndims{}, the complex data is an @ndimshalf array of
+@code{fftw_complex} values in row-major order (with the division rounded
+down).  That is, we only store the @emph{lower} half (non-negative
+frequencies), plus one element, of the last dimension of the data from
+the ordinary complex transform.  (We could have instead taken half of
+any other dimension, but implementation turns out to be simpler if the
+last, contiguous, dimension is used.)
+
+@cindex out-of-place
+For an out-of-place transform, the real data is simply an array with
+physical dimensions @ndims in row-major order.
+
+@cindex in-place
+@cindex padding
+For an in-place transform, some complications arise since the complex data
+is slightly larger than the real data.  In this case, the final
+dimension of the real data must be @emph{padded} with extra values to
+accommodate the size of the complex data---two extra if the last
+dimension is even and one if it is odd.  That is, the last dimension of
+the real data must physically contain
+@tex
+$2 (n_{d-1}/2+1)$
+@end tex
+@ifinfo
+2 * (n[d-1]/2+1)
+@end ifinfo
+@html
+2 * (n<sub>d-1</sub>/2+1)
+@end html
+@code{double} values (exactly enough to hold the complex data).  This
+physical array size does not, however, change the @emph{logical} array
+size---only
+@tex
+$n_{d-1}$
+@end tex
+@ifinfo
+n[d-1]
+@end ifinfo
+@html
+n<sub>d-1</sub>
+@end html
+values are actually stored in the last dimension, and
+@tex
+$n_{d-1}$
+@end tex
+@ifinfo
+n[d-1]
+@end ifinfo
+@html
+n<sub>d-1</sub>
+@end html
+is the last dimension passed to the planner.
+
+@c =========>
+@node Real-to-Real Transforms, Real-to-Real Transform Kinds, Real-data DFT Array Format, Basic Interface
+@subsection Real-to-Real Transforms
+@cindex r2r
+
+@example
+fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                           fftw_r2r_kind kind, unsigned flags);
+fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                           fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                           unsigned flags);
+fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                           double *in, double *out,
+                           fftw_r2r_kind kind0,
+                           fftw_r2r_kind kind1,
+                           fftw_r2r_kind kind2,
+                           unsigned flags);
+fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                        const fftw_r2r_kind *kind, unsigned flags);
+@end example
+@findex fftw_plan_r2r_1d
+@findex fftw_plan_r2r_2d
+@findex fftw_plan_r2r_3d
+@findex fftw_plan_r2r
+
+Plan a real input/output (r2r) transform of various kinds in zero or
+more dimensions, returning an @code{fftw_plan} (@pxref{Using Plans}).
+
+Once you have created a plan for a certain transform type and
+parameters, then creating another plan of the same type and parameters,
+but for different arrays, is fast and shares constant data with the
+first plan (if it still exists).
+
+The planner returns @code{NULL} if the plan cannot be created.  A
+non-@code{NULL} plan is always returned by the basic interface unless
+you are using a customized FFTW configuration supporting a restricted
+set of transforms, or for size-1 @code{FFTW_REDFT00} kinds (which are
+not defined).
+@ctindex FFTW_REDFT00
+
+@subsubheading Arguments
+@itemize @bullet
+
+@item
+@code{rank} is the dimensionality of the transform (it should be the
+size of the arrays @code{*n} and @code{*kind}), and can be any
+non-negative integer.  The @samp{_1d}, @samp{_2d}, and @samp{_3d}
+planners correspond to a @code{rank} of @code{1}, @code{2}, and
+@code{3}, respectively.  A @code{rank} of zero is equivalent to a copy
+of one number from input to output.
+
+@item
+@code{n}, or @code{n0}/@code{n1}/@code{n2}, or @code{n[rank]},
+respectively, gives the (physical) size of the transform dimensions.
+They can be any positive integer.
+ 
+@itemize @minus
+@item
+@cindex row-major
+Multi-dimensional arrays are stored in row-major order with dimensions:
+@code{n0} x @code{n1}; or @code{n0} x @code{n1} x @code{n2}; or
+@code{n[0]} x @code{n[1]} x ... x @code{n[rank-1]}.
+@xref{Multi-dimensional Array Format}.
+@item
+FFTW is generally best at handling sizes of the form
+@ifinfo
+@math{2^a 3^b 5^c 7^d 11^e 13^f},
+@end ifinfo
+@tex
+$2^a 3^b 5^c 7^d 11^e 13^f$,
+@end tex
+@html
+2<sup>a</sup> 3<sup>b</sup> 5<sup>c</sup> 7<sup>d</sup>
+        11<sup>e</sup> 13<sup>f</sup>,
+@end html
+where @math{e+f} is either @math{0} or @math{1}, and the other exponents
+are arbitrary.  Other sizes are computed by means of a slow,
+general-purpose algorithm (which nevertheless retains @Onlogn{} performance even for prime sizes).  (It is possible to customize FFTW
+for different array sizes; see @ref{Installation and Customization}.)
+Transforms whose sizes are powers of @math{2} are especially fast.
+@item
+For a @code{REDFT00} or @code{RODFT00} transform kind in a dimension of
+size @math{n}, it is @math{n-1} or @math{n+1}, respectively, that
+should be factorizable in the above form.
+@end itemize
+
+@item
+@code{in} and @code{out} point to the input and output arrays of the
+transform, which may be the same (yielding an in-place transform).
+@cindex in-place
+These arrays are overwritten during planning, unless
+@code{FFTW_ESTIMATE} is used in the flags.  (The arrays need not be
+initialized, but they must be allocated.)
+
+@item
+@code{kind}, or @code{kind0}/@code{kind1}/@code{kind2}, or
+@code{kind[rank]}, is the kind of r2r transform used for the
+corresponding dimension.  The valid kind constants are described in
+@ref{Real-to-Real Transform Kinds}.  In a multi-dimensional transform,
+what is computed is the separable product formed by taking each
+transform kind along the corresponding dimension, one dimension after
+another.
+
+@item
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+@end itemize
+
+@c =========>
+@node Real-to-Real Transform Kinds,  , Real-to-Real Transforms, Basic Interface
+@subsection Real-to-Real Transform Kinds
+@cindex kind (r2r)
+
+FFTW currently supports 11 different r2r transform kinds, specified by
+one of the constants below.  For the precise definitions of these
+transforms, see @ref{What FFTW Really Computes}.  For a more colloquial
+introduction to these transform kinds, see @ref{More DFTs of Real Data}.
+
+For dimension of size @code{n}, there is a corresponding ``logical''
+dimension @code{N} that determines the normalization (and the optimal
+factorization); the formula for @code{N} is given for each kind below.
+Also, with each transform kind is listed its corrsponding inverse
+transform.  FFTW computes unnormalized transforms: a transform followed
+by its inverse will result in the original data multiplied by @code{N}
+(or the product of the @code{N}'s for each dimension, in
+multi-dimensions).
+@cindex normalization
+
+@itemize @bullet
+
+@item
+@ctindex FFTW_R2HC
+@code{FFTW_R2HC} computes a real-input DFT with output in
+``halfcomplex'' format, i.e. real and imaginary parts for a transform of
+size @code{n} stored as:
+@tex
+$$
+r_0, r_1, r_2, \ldots, r_{n/2}, i_{(n+1)/2-1}, \ldots, i_2, i_1
+$$
+@end tex
+@ifinfo
+r0, r1, r2, r(n/2), i((n+1)/2-1), ..., i2, i1
+@end ifinfo
+@html
+<p align=center>
+r<sub>0</sub>, r<sub>1</sub>, r<sub>2</sub>, ..., r<sub>n/2</sub>, i<sub>(n+1)/2-1</sub>, ..., i<sub>2</sub>, i<sub>1</sub>
+</p>
+@end html
+(Logical @code{N=n}, inverse is @code{FFTW_HC2R}.)
+
+@item
+@ctindex FFTW_HC2R
+@code{FFTW_HC2R} computes the reverse of @code{FFTW_R2HC}, above.
+(Logical @code{N=n}, inverse is @code{FFTW_R2HC}.)
+
+@item
+@ctindex FFTW_DHT
+@code{FFTW_DHT} computes a discrete Hartley transform.
+(Logical @code{N=n}, inverse is @code{FFTW_DHT}.)
+@cindex discrete Hartley transform
+
+@item
+@ctindex FFTW_REDFT00
+@code{FFTW_REDFT00} computes an REDFT00 transform, i.e. a DCT-I.
+(Logical @code{N=2*(n-1)}, inverse is @code{FFTW_REDFT00}.)
+@cindex discrete cosine transform
+@cindex DCT
+
+@item
+@ctindex FFTW_REDFT10
+@code{FFTW_REDFT10} computes an REDFT10 transform, i.e. a DCT-II (sometimes called ``the'' DCT).
+(Logical @code{N=2*n}, inverse is @code{FFTW_REDFT01}.)
+
+@item
+@ctindex FFTW_REDFT01
+@code{FFTW_REDFT01} computes an REDFT01 transform, i.e. a DCT-III (sometimes called ``the'' IDCT, being the inverse of DCT-II).
+(Logical @code{N=2*n}, inverse is @code{FFTW_REDFT=10}.)
+@cindex IDCT
+
+@item
+@ctindex FFTW_REDFT11
+@code{FFTW_REDFT11} computes an REDFT11 transform, i.e. a DCT-IV.
+(Logical @code{N=2*n}, inverse is @code{FFTW_REDFT11}.)
+
+@item
+@ctindex FFTW_RODFT00
+@code{FFTW_RODFT00} computes an RODFT00 transform, i.e. a DST-I.
+(Logical @code{N=2*(n+1)}, inverse is @code{FFTW_RODFT00}.)
+@cindex discrete sine transform
+@cindex DST
+
+@item
+@ctindex FFTW_RODFT10
+@code{FFTW_RODFT10} computes an RODFT10 transform, i.e. a DST-II.
+(Logical @code{N=2*n}, inverse is @code{FFTW_RODFT01}.)
+
+@item
+@ctindex FFTW_RODFT01
+@code{FFTW_RODFT01} computes an RODFT01 transform, i.e. a DST-III.
+(Logical @code{N=2*n}, inverse is @code{FFTW_RODFT=10}.)
+
+@item
+@ctindex FFTW_RODFT11
+@code{FFTW_RODFT11} computes an RODFT11 transform, i.e. a DST-IV.
+(Logical @code{N=2*n}, inverse is @code{FFTW_RODFT11}.)
+
+@end itemize
+
+@c ------------------------------------------------------------
+@node Advanced Interface, Guru Interface, Basic Interface, FFTW Reference
+@section Advanced Interface
+@cindex advanced interface
+
+FFTW's ``advanced'' interface supplements the basic interface with four
+new planner routines, providing a new level of flexibility: you can plan
+a transform of multiple arrays simultaneously, operate on non-contiguous
+(strided) data, and transform a subset of a larger multi-dimensional
+array.  Other than these additional features, the planner operates in
+the same fashion as in the basic interface, and the resulting
+@code{fftw_plan} is used in the same way (@pxref{Using Plans}).
+
+@menu
+* Advanced Complex DFTs::       
+* Advanced Real-data DFTs::     
+* Advanced Real-to-real Transforms::  
+@end menu
+
+@c =========>
+@node Advanced Complex DFTs, Advanced Real-data DFTs, Advanced Interface, Advanced Interface
+@subsection Advanced Complex DFTs
+
+@example
+fftw_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                             fftw_complex *in, const int *inembed,
+                             int istride, int idist,
+                             fftw_complex *out, const int *onembed,
+                             int ostride, int odist,
+                             int sign, unsigned flags);
+@end example
+@findex fftw_plan_many_dft
+
+This routine plans multiple multidimensional complex DFTs, and it
+extends the @code{fftw_plan_dft} routine (@pxref{Complex DFTs}) to
+compute @code{howmany} transforms, each having rank @code{rank} and size
+@code{n}.  In addition, the transform data need not be contiguous, but
+it may be laid out in memory with an arbitrary stride.  To account for
+these possibilities, @code{fftw_plan_many_dft} adds the new parameters
+@code{howmany}, @{@code{i},@code{o}@}@code{nembed},
+@{@code{i},@code{o}@}@code{stride}, and
+@{@code{i},@code{o}@}@code{dist}.  The FFTW basic interface
+(@pxref{Complex DFTs}) provides routines specialized for ranks 1, 2,
+and@tie{}3, but the advanced interface handles only the general-rank
+case.
+
+@code{howmany} is the number of transforms to compute.  The resulting
+plan computes @code{howmany} transforms, where the input of the
+@code{k}-th transform is at location @code{in+k*idist} (in C pointer
+arithmetic), and its output is at location @code{out+k*odist}.  Plans
+obtained in this way can often be faster than calling FFTW multiple
+times for the individual transforms.  The basic @code{fftw_plan_dft}
+interface corresponds to @code{howmany=1} (in which case the @code{dist}
+parameters are ignored).
+@cindex howmany parameter
+@cindex dist
+
+
+Each of the @code{howmany} transforms has rank @code{rank} and size
+@code{n}, as in the basic interface.  In addition, the advanced
+interface allows the input and output arrays of each transform to be
+row-major subarrays of larger rank-@code{rank} arrays, described by
+@code{inembed} and @code{onembed} parameters, respectively.
+@{@code{i},@code{o}@}@code{nembed} must be arrays of length @code{rank},
+and @code{n} should be elementwise less than or equal to
+@{@code{i},@code{o}@}@code{nembed}.  Passing @code{NULL} for an
+@code{nembed} parameter is equivalent to passing @code{n} (i.e. same
+physical and logical dimensions, as in the basic interface.)
+
+The @code{stride} parameters indicate that the @code{j}-th element of
+the input or output arrays is located at @code{j*istride} or
+@code{j*ostride}, respectively.  (For a multi-dimensional array,
+@code{j} is the ordinary row-major index.)  When combined with the
+@code{k}-th transform in a @code{howmany} loop, from above, this means
+that the (@code{j},@code{k})-th element is at @code{j*stride+k*dist}.
+(The basic @code{fftw_plan_dft} interface corresponds to a stride of 1.)
+@cindex stride
+
+
+For in-place transforms, the input and output @code{stride} and
+@code{dist} parameters should be the same; otherwise, the planner may
+return @code{NULL}.
+
+Arrays @code{n}, @code{inembed}, and @code{onembed} are not used after
+this function returns.  You can safely free or reuse them.
+
+@strong{Examples}:
+One transform of one 5 by 6 array contiguous in memory:
+@example
+   int rank = 2;
+   int n[] = @{5, 6@};
+   int howmany = 1;
+   int idist = odist = 0; /* unused because howmany = 1 */
+   int istride = ostride = 1; /* array is contiguous in memory */
+   int *inembed = n, *onembed = n;
+@end example
+
+Transform of three 5 by 6 arrays, each contiguous in memory,
+stored in memory one after another:
+@example
+   int rank = 2;
+   int n[] = @{5, 6@};
+   int howmany = 3;
+   int idist = odist = n[0]*n[1]; /* = 30, the distance in memory
+                                     between the first element
+                                     of the first array and the
+                                     first element of the second array */
+   int istride = ostride = 1; /* array is contiguous in memory */
+   int *inembed = n, *onembed = n;
+@end example
+
+Transform each column of a 2d array with 10 rows and 3 columns:
+@example
+   int rank = 1; /* not 2: we are computing 1d transforms */
+   int n[] = @{10@}; /* 1d transforms of length 10 */
+   int howmany = 3;
+   int idist = odist = 1;
+   int istride = ostride = 3; /* distance between two elements in 
+                                 the same column */
+   int *inembed = n, *onembed = n;
+@end example
+
+@c =========>
+@node Advanced Real-data DFTs, Advanced Real-to-real Transforms, Advanced Complex DFTs, Advanced Interface
+@subsection Advanced Real-data DFTs
+
+@example
+fftw_plan fftw_plan_many_dft_r2c(int rank, const int *n, int howmany,
+                                 double *in, const int *inembed,
+                                 int istride, int idist,
+                                 fftw_complex *out, const int *onembed,
+                                 int ostride, int odist,
+                                 unsigned flags);
+fftw_plan fftw_plan_many_dft_c2r(int rank, const int *n, int howmany,
+                                 fftw_complex *in, const int *inembed,
+                                 int istride, int idist,
+                                 double *out, const int *onembed,
+                                 int ostride, int odist,
+                                 unsigned flags);
+@end example
+@findex fftw_plan_many_dft_r2c
+@findex fftw_plan_many_dft_c2r
+
+Like @code{fftw_plan_many_dft}, these two functions add @code{howmany},
+@code{nembed}, @code{stride}, and @code{dist} parameters to the
+@code{fftw_plan_dft_r2c} and @code{fftw_plan_dft_c2r} functions, but
+otherwise behave the same as the basic interface.
+
+The interpretation of @code{howmany}, @code{stride}, and @code{dist} are
+the same as for @code{fftw_plan_many_dft}, above.  Note that the
+@code{stride} and @code{dist} for the real array are in units of
+@code{double}, and for the complex array are in units of
+@code{fftw_complex}.
+
+If an @code{nembed} parameter is @code{NULL}, it is interpreted as what
+it would be in the basic interface, as described in @ref{Real-data DFT
+Array Format}.  That is, for the complex array the size is assumed to be
+the same as @code{n}, but with the last dimension cut roughly in half.
+For the real array, the size is assumed to be @code{n} if the transform
+is out-of-place, or @code{n} with the last dimension ``padded'' if the
+transform is in-place.
+
+If an @code{nembed} parameter is non-@code{NULL}, it is interpreted as
+the physical size of the corresponding array, in row-major order, just
+as for @code{fftw_plan_many_dft}.  In this case, each dimension of
+@code{nembed} should be @code{>=} what it would be in the basic
+interface (e.g. the halved or padded @code{n}).
+
+Arrays @code{n}, @code{inembed}, and @code{onembed} are not used after
+this function returns.  You can safely free or reuse them.
+
+@c =========>
+@node Advanced Real-to-real Transforms,  , Advanced Real-data DFTs, Advanced Interface
+@subsection Advanced Real-to-real Transforms
+
+@example
+fftw_plan fftw_plan_many_r2r(int rank, const int *n, int howmany,
+                             double *in, const int *inembed,
+                             int istride, int idist,
+                             double *out, const int *onembed,
+                             int ostride, int odist,
+                             const fftw_r2r_kind *kind, unsigned flags);
+@end example
+@findex fftw_plan_many_r2r
+
+Like @code{fftw_plan_many_dft}, this functions adds @code{howmany},
+@code{nembed}, @code{stride}, and @code{dist} parameters to the
+@code{fftw_plan_r2r} function, but otherwise behave the same as the
+basic interface.  The interpretation of those additional parameters are
+the same as for @code{fftw_plan_many_dft}.  (Of course, the
+@code{stride} and @code{dist} parameters are now in units of
+@code{double}, not @code{fftw_complex}.)
+
+Arrays @code{n}, @code{inembed}, @code{onembed}, and @code{kind} are not
+used after this function returns.  You can safely free or reuse them.
+
+@c ------------------------------------------------------------
+@node Guru Interface, New-array Execute Functions, Advanced Interface, FFTW Reference
+@section Guru Interface
+@cindex guru interface
+
+The ``guru'' interface to FFTW is intended to expose as much as possible
+of the flexibility in the underlying FFTW architecture.  It allows one
+to compute multi-dimensional ``vectors'' (loops) of multi-dimensional
+transforms, where each vector/transform dimension has an independent
+size and stride.
+@cindex vector
+One can also use more general complex-number formats, e.g. separate real
+and imaginary arrays.
+
+For those users who require the flexibility of the guru interface, it is
+important that they pay special attention to the documentation lest they
+shoot themselves in the foot.
+
+@menu
+* Interleaved and split arrays::  
+* Guru vector and transform sizes::  
+* Guru Complex DFTs::           
+* Guru Real-data DFTs::         
+* Guru Real-to-real Transforms::  
+* 64-bit Guru Interface::       
+@end menu
+
+@c =========>
+@node  Interleaved and split arrays, Guru vector and transform sizes, Guru Interface, Guru Interface
+@subsection Interleaved and split arrays
+
+The guru interface supports two representations of complex numbers,
+which we call the interleaved and the split format.
+
+The @dfn{interleaved} format is the same one used by the basic and
+advanced interfaces, and it is documented in @ref{Complex numbers}.
+In the interleaved format, you provide pointers to the real part of a
+complex number, and the imaginary part understood to be stored in the
+next memory location.
+@cindex interleaved format
+
+
+The @dfn{split} format allows separate pointers to the real and
+imaginary parts of a complex array.
+@cindex split format
+
+
+Technically, the interleaved format is redundant, because you can
+always express an interleaved array in terms of a split array with
+appropriate pointers and strides.  On the other hand, the interleaved
+format is simpler to use, and it is common in practice.  Hence, FFTW
+supports it as a special case.
+
+@c =========>
+@node Guru vector and transform sizes, Guru Complex DFTs, Interleaved and split arrays, Guru Interface
+@subsection Guru vector and transform sizes
+
+The guru interface introduces one basic new data structure,
+@code{fftw_iodim}, that is used to specify sizes and strides for
+multi-dimensional transforms and vectors:
+
+@example
+typedef struct @{
+     int n;
+     int is;
+     int os;
+@} fftw_iodim;
+@end example
+@tindex fftw_iodim
+
+Here, @code{n} is the size of the dimension, and @code{is} and @code{os}
+are the strides of that dimension for the input and output arrays.  (The
+stride is the separation of consecutive elements along this dimension.)
+
+The meaning of the stride parameter depends on the type of the array
+that the stride refers to.  @emph{If the array is interleaved complex,
+strides are expressed in units of complex numbers
+(@code{fftw_complex}).  If the array is split complex or real, strides
+are expressed in units of real numbers (@code{double}).}  This
+convention is consistent with the usual pointer arithmetic in the C
+language.  An interleaved array is denoted by a pointer @code{p} to
+@code{fftw_complex}, so that @code{p+1} points to the next complex
+number.  Split arrays are denoted by pointers to @code{double}, in
+which case pointer arithmetic operates in units of
+@code{sizeof(double)}.
+@cindex stride
+
+
+The guru planner interfaces all take a (@code{rank}, @code{dims[rank]})
+pair describing the transform size, and a (@code{howmany_rank},
+@code{howmany_dims[howmany_rank]}) pair describing the ``vector'' size (a
+multi-dimensional loop of transforms to perform), where @code{dims} and
+@code{howmany_dims} are arrays of @code{fftw_iodim}.
+
+For example, the @code{howmany} parameter in the advanced complex-DFT
+interface corresponds to @code{howmany_rank} = 1,
+@code{howmany_dims[0].n} = @code{howmany}, @code{howmany_dims[0].is} =
+@code{idist}, and @code{howmany_dims[0].os} = @code{odist}.
+@cindex howmany loop
+@cindex dist
+(To compute a single transform, you can just use @code{howmany_rank} = 0.)
+
+
+A row-major multidimensional array with dimensions @code{n[rank]}
+(@pxref{Row-major Format}) corresponds to @code{dims[i].n} =
+@code{n[i]} and the recurrence @code{dims[i].is} = @code{n[i+1] *
+dims[i+1].is} (similarly for @code{os}).  The stride of the last
+(@code{i=rank-1}) dimension is the overall stride of the array.
+e.g. to be equivalent to the advanced complex-DFT interface, you would
+have @code{dims[rank-1].is} = @code{istride} and
+@code{dims[rank-1].os} = @code{ostride}.
+@cindex row-major
+
+
+In general, we only guarantee FFTW to return a non-@code{NULL} plan if
+the vector and transform dimensions correspond to a set of distinct
+indices, and for in-place transforms the input/output strides should
+be the same.
+
+@c =========>
+@node Guru Complex DFTs, Guru Real-data DFTs, Guru vector and transform sizes, Guru Interface
+@subsection Guru Complex DFTs
+
+@example
+fftw_plan fftw_plan_guru_dft(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     fftw_complex *in, fftw_complex *out,
+     int sign, unsigned flags);
+
+fftw_plan fftw_plan_guru_split_dft(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     double *ri, double *ii, double *ro, double *io,
+     unsigned flags);
+@end example
+@findex fftw_plan_guru_dft
+@findex fftw_plan_guru_split_dft
+
+These two functions plan a complex-data, multi-dimensional DFT
+for the interleaved and split format, respectively.
+Transform dimensions are given by (@code{rank}, @code{dims}) over a
+multi-dimensional vector (loop) of dimensions (@code{howmany_rank},
+@code{howmany_dims}).  @code{dims} and @code{howmany_dims} should point
+to @code{fftw_iodim} arrays of length @code{rank} and
+@code{howmany_rank}, respectively.
+
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+In the @code{fftw_plan_guru_dft} function, the pointers @code{in} and
+@code{out} point to the interleaved input and output arrays,
+respectively.  The sign can be either @math{-1} (=
+@code{FFTW_FORWARD}) or @math{+1} (= @code{FFTW_BACKWARD}).  If the
+pointers are equal, the transform is in-place.
+
+In the @code{fftw_plan_guru_split_dft} function,
+@code{ri} and @code{ii} point to the real and imaginary input arrays,
+and @code{ro} and @code{io} point to the real and imaginary output
+arrays.  The input and output pointers may be the same, indicating an
+in-place transform.  For example, for @code{fftw_complex} pointers
+@code{in} and @code{out}, the corresponding parameters are:
+
+@example
+ri = (double *) in;
+ii = (double *) in + 1;
+ro = (double *) out;
+io = (double *) out + 1;
+@end example
+
+Because @code{fftw_plan_guru_split_dft} accepts split arrays, strides
+are expressed in units of @code{double}.  For a contiguous
+@code{fftw_complex} array, the overall stride of the transform should
+be 2, the distance between consecutive real parts or between
+consecutive imaginary parts; see @ref{Guru vector and transform
+sizes}.  Note that the dimension strides are applied equally to the
+real and imaginary parts; real and imaginary arrays with different
+strides are not supported.
+
+There is no @code{sign} parameter in @code{fftw_plan_guru_split_dft}.
+This function always plans for an @code{FFTW_FORWARD} transform.  To
+plan for an @code{FFTW_BACKWARD} transform, you can exploit the
+identity that the backwards DFT is equal to the forwards DFT with the
+real and imaginary parts swapped.  For example, in the case of the
+@code{fftw_complex} arrays above, the @code{FFTW_BACKWARD} transform
+is computed by the parameters:
+
+@example
+ri = (double *) in + 1;
+ii = (double *) in;
+ro = (double *) out + 1;
+io = (double *) out;
+@end example
+
+@c =========>
+@node Guru Real-data DFTs, Guru Real-to-real Transforms, Guru Complex DFTs, Guru Interface
+@subsection Guru Real-data DFTs
+
+@example
+fftw_plan fftw_plan_guru_dft_r2c(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     double *in, fftw_complex *out,
+     unsigned flags);
+
+fftw_plan fftw_plan_guru_split_dft_r2c(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     double *in, double *ro, double *io,
+     unsigned flags);
+
+fftw_plan fftw_plan_guru_dft_c2r(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     fftw_complex *in, double *out,
+     unsigned flags);
+
+fftw_plan fftw_plan_guru_split_dft_c2r(
+     int rank, const fftw_iodim *dims,
+     int howmany_rank, const fftw_iodim *howmany_dims,
+     double *ri, double *ii, double *out,
+     unsigned flags);
+@end example
+@findex fftw_plan_guru_dft_r2c
+@findex fftw_plan_guru_split_dft_r2c
+@findex fftw_plan_guru_dft_c2r
+@findex fftw_plan_guru_split_dft_c2r
+
+Plan a real-input (r2c) or real-output (c2r), multi-dimensional DFT with
+transform dimensions given by (@code{rank}, @code{dims}) over a
+multi-dimensional vector (loop) of dimensions (@code{howmany_rank},
+@code{howmany_dims}).  @code{dims} and @code{howmany_dims} should point
+to @code{fftw_iodim} arrays of length @code{rank} and
+@code{howmany_rank}, respectively.  As for the basic and advanced
+interfaces, an r2c transform is @code{FFTW_FORWARD} and a c2r transform
+is @code{FFTW_BACKWARD}.
+
+The @emph{last} dimension of @code{dims} is interpreted specially:
+that dimension of the real array has size @code{dims[rank-1].n}, but
+that dimension of the complex array has size @code{dims[rank-1].n/2+1}
+(division rounded down).  The strides, on the other hand, are taken to
+be exactly as specified.  It is up to the user to specify the strides
+appropriately for the peculiar dimensions of the data, and we do not
+guarantee that the planner will succeed (return non-@code{NULL}) for
+any dimensions other than those described in @ref{Real-data DFT Array
+Format} and generalized in @ref{Advanced Real-data DFTs}.  (That is,
+for an in-place transform, each individual dimension should be able to
+operate in place.)
+@cindex in-place
+
+
+@code{in} and @code{out} point to the input and output arrays for r2c
+and c2r transforms, respectively.  For split arrays, @code{ri} and
+@code{ii} point to the real and imaginary input arrays for a c2r
+transform, and @code{ro} and @code{io} point to the real and imaginary
+output arrays for an r2c transform.  @code{in} and @code{ro} or
+@code{ri} and @code{out} may be the same, indicating an in-place
+transform.   (In-place transforms where @code{in} and @code{io} or
+@code{ii} and @code{out} are the same are not currently supported.)
+
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+In-place transforms of rank greater than 1 are currently only
+supported for interleaved arrays.  For split arrays, the planner will
+return @code{NULL}.
+@cindex in-place
+
+@c =========>
+@node Guru Real-to-real Transforms, 64-bit Guru Interface, Guru Real-data DFTs, Guru Interface
+@subsection Guru Real-to-real Transforms
+
+@example
+fftw_plan fftw_plan_guru_r2r(int rank, const fftw_iodim *dims,
+                             int howmany_rank,
+                             const fftw_iodim *howmany_dims,
+                             double *in, double *out,
+                             const fftw_r2r_kind *kind,
+                             unsigned flags);
+@end example
+@findex fftw_plan_guru_r2r
+
+Plan a real-to-real (r2r) multi-dimensional @code{FFTW_FORWARD}
+transform with transform dimensions given by (@code{rank}, @code{dims})
+over a multi-dimensional vector (loop) of dimensions
+(@code{howmany_rank}, @code{howmany_dims}).  @code{dims} and
+@code{howmany_dims} should point to @code{fftw_iodim} arrays of length
+@code{rank} and @code{howmany_rank}, respectively.
+
+The transform kind of each dimension is given by the @code{kind}
+parameter, which should point to an array of length @code{rank}.  Valid
+@code{fftw_r2r_kind} constants are given in @ref{Real-to-Real Transform
+Kinds}.
+
+@code{in} and @code{out} point to the real input and output arrays; they
+may be the same, indicating an in-place transform.
+
+@cindex flags
+@code{flags} is a bitwise OR (@samp{|}) of zero or more planner flags,
+as defined in @ref{Planner Flags}.
+
+@c =========>
+@node 64-bit Guru Interface,  , Guru Real-to-real Transforms, Guru Interface
+@subsection 64-bit Guru Interface
+@cindex 64-bit architecture
+
+When compiled in 64-bit mode on a 64-bit architecture (where addresses
+are 64 bits wide), FFTW uses 64-bit quantities internally for all
+transform sizes, strides, and so on---you don't have to do anything
+special to exploit this.  However, in the ordinary FFTW interfaces,
+you specify the transform size by an @code{int} quantity, which is
+normally only 32 bits wide.  This means that, even though FFTW is
+using 64-bit sizes internally, you cannot specify a single transform
+dimension larger than
+@ifinfo
+2^31-1
+@end ifinfo
+@html
+2<sup><small>31</small></sup>&minus;1
+@end html
+@tex
+$2^31-1$
+@end tex
+numbers.
+
+We expect that few users will require transforms larger than this, but,
+for those who do, we provide a 64-bit version of the guru interface in
+which all sizes are specified as integers of type @code{ptrdiff_t}
+instead of @code{int}.  (@code{ptrdiff_t} is a signed integer type
+defined by the C standard to be wide enough to represent address
+differences, and thus must be at least 64 bits wide on a 64-bit
+machine.)  We stress that there is @emph{no performance advantage} to
+using this interface---the same internal FFTW code is employed
+regardless---and it is only necessary if you want to specify very
+large transform sizes.
+@tindex ptrdiff_t
+
+
+In particular, the 64-bit guru interface is a set of planner routines
+that are exactly the same as the guru planner routines, except that
+they are named with @samp{guru64} instead of @samp{guru} and they take
+arguments of type @code{fftw_iodim64} instead of @code{fftw_iodim}.
+For example, instead of @code{fftw_plan_guru_dft}, we have
+@code{fftw_plan_guru64_dft}.
+
+@example
+fftw_plan fftw_plan_guru64_dft(
+     int rank, const fftw_iodim64 *dims,
+     int howmany_rank, const fftw_iodim64 *howmany_dims,
+     fftw_complex *in, fftw_complex *out,
+     int sign, unsigned flags);
+@end example
+@findex fftw_plan_guru64_dft
+
+The @code{fftw_iodim64} type is similar to @code{fftw_iodim}, with the
+same interpretation, except that it uses type @code{ptrdiff_t} instead
+of type @code{int}.
+
+@example
+typedef struct @{
+     ptrdiff_t n;
+     ptrdiff_t is;
+     ptrdiff_t os;
+@} fftw_iodim64;
+@end example
+@tindex fftw_iodim64
+
+Every other @samp{fftw_plan_guru} function also has a
+@samp{fftw_plan_guru64} equivalent, but we do not repeat their
+documentation here since they are identical to the 32-bit versions
+except as noted above.
+
+@c -----------------------------------------------------------
+@node New-array Execute Functions, Wisdom, Guru Interface, FFTW Reference
+@section New-array Execute Functions
+@cindex execute
+@cindex new-array execution
+
+Normally, one executes a plan for the arrays with which the plan was
+created, by calling @code{fftw_execute(plan)} as described in @ref{Using
+Plans}.
+@findex fftw_execute
+However, it is possible for sophisticated users to apply a given plan
+to a @emph{different} array using the ``new-array execute'' functions
+detailed below, provided that the following conditions are met:
+
+@itemize @bullet
+
+@item
+The array size, strides, etcetera are the same (since those are set by
+the plan).
+
+@item
+The input and output arrays are the same (in-place) or different
+(out-of-place) if the plan was originally created to be in-place or
+out-of-place, respectively.
+
+@item
+For split arrays, the separations between the real and imaginary
+parts, @code{ii-ri} and @code{io-ro}, are the same as they were for
+the input and output arrays when the plan was created.  (This
+condition is automatically satisfied for interleaved arrays.)
+
+@item
+The @dfn{alignment} of the new input/output arrays is the same as that
+of the input/output arrays when the plan was created, unless the plan
+was created with the @code{FFTW_UNALIGNED} flag.
+@ctindex FFTW_UNALIGNED
+Here, the alignment is a platform-dependent quantity (for example, it is
+the address modulo 16 if SSE SIMD instructions are used, but the address
+modulo 4 for non-SIMD single-precision FFTW on the same machine).  In
+general, only arrays allocated with @code{fftw_malloc} are guaranteed to
+be equally aligned (@pxref{SIMD alignment and fftw_malloc}).
+
+@end itemize
+
+@cindex alignment
+The alignment issue is especially critical, because if you don't use
+@code{fftw_malloc} then you may have little control over the alignment
+of arrays in memory.  For example, neither the C++ @code{new} function
+nor the Fortran @code{allocate} statement provide strong enough
+guarantees about data alignment.  If you don't use @code{fftw_malloc},
+therefore, you probably have to use @code{FFTW_UNALIGNED} (which
+disables most SIMD support).  If possible, it is probably better for
+you to simply create multiple plans (creating a new plan is quick once
+one exists for a given size), or better yet re-use the same array for
+your transforms.
+
+If you are tempted to use the new-array execute interface because you
+want to transform a known bunch of arrays of the same size, you should
+probably go use the advanced interface instead (@pxref{Advanced
+Interface})).
+
+The new-array execute functions are:
+
+@example
+void fftw_execute_dft(
+     const fftw_plan p, 
+     fftw_complex *in, fftw_complex *out);
+
+void fftw_execute_split_dft(
+     const fftw_plan p, 
+     double *ri, double *ii, double *ro, double *io);
+
+void fftw_execute_dft_r2c(
+     const fftw_plan p,
+     double *in, fftw_complex *out);
+
+void fftw_execute_split_dft_r2c(
+     const fftw_plan p,
+     double *in, double *ro, double *io);
+
+void fftw_execute_dft_c2r(
+     const fftw_plan p,
+     fftw_complex *in, double *out);
+
+void fftw_execute_split_dft_c2r(
+     const fftw_plan p,
+     double *ri, double *ii, double *out);
+
+void fftw_execute_r2r(
+     const fftw_plan p, 
+     double *in, double *out);
+@end example
+@findex fftw_execute_dft
+@findex fftw_execute_split_dft
+@findex fftw_execute_dft_r2c
+@findex fftw_execute_split_dft_r2c
+@findex fftw_execute_dft_c2r
+@findex fftw_execute_split_dft_c2r
+@findex fftw_execute_r2r
+
+These execute the @code{plan} to compute the corresponding transform on
+the input/output arrays specified by the subsequent arguments.  The
+input/output array arguments have the same meanings as the ones passed
+to the guru planner routines in the preceding sections.  The @code{plan}
+is not modified, and these routines can be called as many times as
+desired, or intermixed with calls to the ordinary @code{fftw_execute}.
+
+The @code{plan} @emph{must} have been created for the transform type
+corresponding to the execute function, e.g. it must be a complex-DFT
+plan for @code{fftw_execute_dft}.  Any of the planner routines for that
+transform type, from the basic to the guru interface, could have been
+used to create the plan, however.
+
+@c ------------------------------------------------------------
+@node Wisdom, What FFTW Really Computes, New-array Execute Functions, FFTW Reference
+@section Wisdom
+@cindex wisdom
+@cindex saving plans to disk
+
+This section documents the FFTW mechanism for saving and restoring
+plans from disk.  This mechanism is called @dfn{wisdom}.
+
+@menu
+* Wisdom Export::               
+* Wisdom Import::               
+* Forgetting Wisdom::           
+* Wisdom Utilities::            
+@end menu
+
+@c =========>
+@node Wisdom Export, Wisdom Import, Wisdom, Wisdom
+@subsection Wisdom Export
+
+@example
+int fftw_export_wisdom_to_filename(const char *filename);
+void fftw_export_wisdom_to_file(FILE *output_file);
+char *fftw_export_wisdom_to_string(void);
+void fftw_export_wisdom(void (*write_char)(char c, void *), void *data);
+@end example
+@findex fftw_export_wisdom
+@findex fftw_export_wisdom_to_filename
+@findex fftw_export_wisdom_to_file
+@findex fftw_export_wisdom_to_string
+
+These functions allow you to export all currently accumulated wisdom
+in a form from which it can be later imported and restored, even
+during a separate run of the program. (@xref{Words of Wisdom-Saving
+Plans}.)  The current store of wisdom is not affected by calling any
+of these routines.
+
+@code{fftw_export_wisdom} exports the wisdom to any output
+medium, as specified by the callback function
+@code{write_char}. @code{write_char} is a @code{putc}-like function that
+writes the character @code{c} to some output; its second parameter is
+the @code{data} pointer passed to @code{fftw_export_wisdom}.  For
+convenience, the following three ``wrapper'' routines are provided:
+
+@code{fftw_export_wisdom_to_filename} writes wisdom to a file named
+@code{filename} (which is created or overwritten), returning @code{1}
+on success and @code{0} on failure.  A lower-level function, which
+requires you to open and close the file yourself (e.g. if you want to
+write wisdom to a portion of a larger file) is
+@code{fftw_export_wisdom_to_file}.  This writes the wisdom to the
+current position in @code{output_file}, which should be open with
+write permission; upon exit, the file remains open and is positioned
+at the end of the wisdom data.
+
+@code{fftw_export_wisdom_to_string} returns a pointer to a
+@code{NULL}-terminated string holding the wisdom data. This string is
+dynamically allocated, and it is the responsibility of the caller to
+deallocate it with @code{free} when it is no longer needed.
+
+All of these routines export the wisdom in the same format, which we
+will not document here except to say that it is LISP-like ASCII text
+that is insensitive to white space.
+
+@c =========>
+@node Wisdom Import, Forgetting Wisdom, Wisdom Export, Wisdom
+@subsection Wisdom Import
+
+@example
+int fftw_import_system_wisdom(void);
+int fftw_import_wisdom_from_filename(const char *filename);
+int fftw_import_wisdom_from_string(const char *input_string);
+int fftw_import_wisdom(int (*read_char)(void *), void *data);
+@end example
+@findex fftw_import_wisdom
+@findex fftw_import_system_wisdom
+@findex fftw_import_wisdom_from_filename
+@findex fftw_import_wisdom_from_file
+@findex fftw_import_wisdom_from_string
+
+These functions import wisdom into a program from data stored by the
+@code{fftw_export_wisdom} functions above. (@xref{Words of
+Wisdom-Saving Plans}.)  The imported wisdom replaces any wisdom
+already accumulated by the running program.
+
+@code{fftw_import_wisdom} imports wisdom from any input medium, as
+specified by the callback function @code{read_char}. @code{read_char} is
+a @code{getc}-like function that returns the next character in the
+input; its parameter is the @code{data} pointer passed to
+@code{fftw_import_wisdom}. If the end of the input data is reached
+(which should never happen for valid data), @code{read_char} should
+return @code{EOF} (as defined in @code{<stdio.h>}).  For convenience,
+the following three ``wrapper'' routines are provided:
+
+@code{fftw_import_wisdom_from_filename} reads wisdom from a file named
+@code{filename}.  A lower-level function, which requires you to open
+and close the file yourself (e.g. if you want to read wisdom from a
+portion of a larger file) is @code{fftw_import_wisdom_from_file}. This
+reads wisdom from the current position in @code{input_file} (which
+should be open with read permission); upon exit, the file remains
+open, but the position of the read pointer is unspecified.
+
+@code{fftw_import_wisdom_from_string} reads wisdom from the
+@code{NULL}-terminated string @code{input_string}.
+
+@code{fftw_import_system_wisdom} reads wisdom from an
+implementation-defined standard file (@code{/etc/fftw/wisdom} on Unix
+and GNU systems).
+@cindex wisdom, system-wide
+
+
+The return value of these import routines is @code{1} if the wisdom was
+read successfully and @code{0} otherwise. Note that, in all of these
+functions, any data in the input stream past the end of the wisdom data
+is simply ignored.
+
+@c =========>
+@node Forgetting Wisdom, Wisdom Utilities, Wisdom Import, Wisdom
+@subsection Forgetting Wisdom
+
+@example
+void fftw_forget_wisdom(void);
+@end example
+@findex fftw_forget_wisdom
+
+Calling @code{fftw_forget_wisdom} causes all accumulated @code{wisdom}
+to be discarded and its associated memory to be freed. (New
+@code{wisdom} can still be gathered subsequently, however.)
+
+@c =========>
+@node Wisdom Utilities,  , Forgetting Wisdom, Wisdom
+@subsection Wisdom Utilities
+
+FFTW includes two standalone utility programs that deal with wisdom.  We
+merely summarize them here, since they come with their own @code{man}
+pages for Unix and GNU systems (with HTML versions on our web site).
+
+The first program is @code{fftw-wisdom} (or @code{fftwf-wisdom} in
+single precision, etcetera), which can be used to create a wisdom file
+containing plans for any of the transform sizes and types supported by
+FFTW.  It is preferable to create wisdom directly from your executable
+(@pxref{Caveats in Using Wisdom}), but this program is useful for
+creating global wisdom files for @code{fftw_import_system_wisdom}.
+@cindex fftw-wisdom utility
+
+
+The second program is @code{fftw-wisdom-to-conf}, which takes a wisdom
+file as input and produces a @dfn{configuration routine} as output.  The
+latter is a C subroutine that you can compile and link into your
+program, replacing a routine of the same name in the FFTW library, that
+determines which parts of FFTW are callable by your program.
+@code{fftw-wisdom-to-conf} produces a configuration routine that links
+to only those parts of FFTW needed by the saved plans in the wisdom,
+greatly reducing the size of statically linked executables (which should
+only attempt to create plans corresponding to those in the wisdom,
+however).
+@cindex fftw-wisdom-to-conf utility
+@cindex configuration routines
+
+@c ------------------------------------------------------------
+@node What FFTW Really Computes,  , Wisdom, FFTW Reference
+@section What FFTW Really Computes
+
+In this section, we provide precise mathematical definitions for the
+transforms that FFTW computes.  These transform definitions are fairly
+standard, but some authors follow slightly different conventions for the
+normalization of the transform (the constant factor in front) and the
+sign of the complex exponent.  We begin by presenting the
+one-dimensional (1d) transform definitions, and then give the
+straightforward extension to multi-dimensional transforms.
+
+@menu
+* The 1d Discrete Fourier Transform (DFT)::  
+* The 1d Real-data DFT::        
+* 1d Real-even DFTs (DCTs)::    
+* 1d Real-odd DFTs (DSTs)::     
+* 1d Discrete Hartley Transforms (DHTs)::  
+* Multi-dimensional Transforms::  
+@end menu
+
+@c =========>
+@node The 1d Discrete Fourier Transform (DFT), The 1d Real-data DFT, What FFTW Really Computes, What FFTW Really Computes
+@subsection The 1d Discrete Fourier Transform (DFT)
+
+@cindex discrete Fourier transform
+@cindex DFT
+The forward (@code{FFTW_FORWARD}) discrete Fourier transform (DFT) of a
+1d complex array @math{X} of size @math{n} computes an array @math{Y},
+where:
+@tex
+$$
+Y_k = \sum_{j = 0}^{n - 1} X_j e^{-2\pi j k \sqrt{-1}/n} \ .
+$$
+@end tex
+@ifinfo
+@center Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(-2 pi j k sqrt(-1)/n) .
+@end ifinfo
+@html
+<center><img src="equation-dft.png" align="top">.</center>
+@end html
+The backward (@code{FFTW_BACKWARD}) DFT computes:
+@tex
+$$
+Y_k = \sum_{j = 0}^{n - 1} X_j e^{2\pi j k \sqrt{-1}/n} \ .
+$$
+@end tex
+@ifinfo
+@center Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(2 pi j k sqrt(-1)/n) .
+@end ifinfo
+@html
+<center><img src="equation-idft.png" align="top">.</center>
+@end html
+
+@cindex normalization
+FFTW computes an unnormalized transform, in that there is no coefficient
+in front of the summation in the DFT.  In other words, applying the
+forward and then the backward transform will multiply the input by
+@math{n}.
+
+@cindex frequency
+From above, an @code{FFTW_FORWARD} transform corresponds to a sign of
+@math{-1} in the exponent of the DFT.  Note also that we use the
+standard ``in-order'' output ordering---the @math{k}-th output
+corresponds to the frequency @math{k/n} (or @math{k/T}, where @math{T}
+is your total sampling period).  For those who like to think in terms of
+positive and negative frequencies, this means that the positive
+frequencies are stored in the first half of the output and the negative
+frequencies are stored in backwards order in the second half of the
+output.  (The frequency @math{-k/n} is the same as the frequency
+@math{(n-k)/n}.)
+
+@c =========>
+@node The 1d Real-data DFT, 1d Real-even DFTs (DCTs), The 1d Discrete Fourier Transform (DFT), What FFTW Really Computes
+@subsection The 1d Real-data DFT
+
+The real-input (r2c) DFT in FFTW computes the @emph{forward} transform
+@math{Y} of the size @code{n} real array @math{X}, exactly as defined
+above, i.e.
+@tex
+$$
+Y_k = \sum_{j = 0}^{n - 1} X_j e^{-2\pi j k \sqrt{-1}/n} \ .
+$$
+@end tex
+@ifinfo
+@center Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(-2 pi j k sqrt(-1)/n) .
+@end ifinfo
+@html
+<center><img src="equation-dft.png" align="top">.</center>
+@end html
+This output array @math{Y} can easily be shown to possess the
+``Hermitian'' symmetry
+@cindex Hermitian
+@tex
+$Y_k = Y_{n-k}^*$,
+@end tex
+@ifinfo
+Y[k] = Y[n-k]*,
+@end ifinfo
+@html
+<i>Y<sub>k</sub> = Y<sub>n-k</sub></i><sup>*</sup>,
+@end html
+where we take @math{Y} to be periodic so that
+@tex
+$Y_n = Y_0$.
+@end tex
+@ifinfo
+Y[n] = Y[0].
+@end ifinfo
+@html
+<i>Y<sub>n</sub> = Y</i><sub>0</sub>.
+@end html
+
+As a result of this symmetry, half of the output @math{Y} is redundant
+(being the complex conjugate of the other half), and so the 1d r2c
+transforms only output elements @math{0}@dots{}@math{n/2} of @math{Y}
+(@math{n/2+1} complex numbers), where the division by @math{2} is
+rounded down.
+
+Moreover, the Hermitian symmetry implies that
+@tex
+$Y_0$
+@end tex
+@ifinfo
+Y[0]
+@end ifinfo
+@html
+<i>Y</i><sub>0</sub>
+@end html
+and, if @math{n} is even, the
+@tex
+$Y_{n/2}$
+@end tex
+@ifinfo
+Y[n/2]
+@end ifinfo
+@html
+<i>Y</i><sub><i>n</i>/2</sub>
+@end html
+element, are purely real.  So, for the @code{R2HC} r2r transform, these
+elements are not stored in the halfcomplex output format.
+@cindex r2r
+@ctindex R2HC
+@cindex halfcomplex format
+
+
+The c2r and @code{H2RC} r2r transforms compute the backward DFT of the
+@emph{complex} array @math{X} with Hermitian symmetry, stored in the
+r2c/@code{R2HC} output formats, respectively, where the backward
+transform is defined exactly as for the complex case:
+@tex
+$$
+Y_k = \sum_{j = 0}^{n - 1} X_j e^{2\pi j k \sqrt{-1}/n} \ .
+$$
+@end tex
+@ifinfo
+@center Y[k] = sum for j = 0 to (n - 1) of X[j] * exp(2 pi j k sqrt(-1)/n) .
+@end ifinfo
+@html
+<center><img src="equation-idft.png" align="top">.</center>
+@end html
+The outputs @code{Y} of this transform can easily be seen to be purely
+real, and are stored as an array of real numbers.
+
+@cindex normalization
+Like FFTW's complex DFT, these transforms are unnormalized.  In other
+words, applying the real-to-complex (forward) and then the
+complex-to-real (backward) transform will multiply the input by
+@math{n}.
+
+@c =========>
+@node 1d Real-even DFTs (DCTs), 1d Real-odd DFTs (DSTs), The 1d Real-data DFT, What FFTW Really Computes
+@subsection 1d Real-even DFTs (DCTs)
+
+The Real-even symmetry DFTs in FFTW are exactly equivalent to the unnormalized
+forward (and backward) DFTs as defined above, where the input array
+@math{X} of length @math{N} is purely real and is also @dfn{even} symmetry.  In
+this case, the output array is likewise real and even symmetry.
+@cindex real-even DFT
+@cindex REDFT
+
+
+@ctindex REDFT00
+For the case of @code{REDFT00}, this even symmetry means that
+@tex
+$X_j = X_{N-j}$,
+@end tex
+@ifinfo
+X[j] = X[N-j],
+@end ifinfo
+@html
+<i>X<sub>j</sub> = X<sub>N-j</sub></i>,
+@end html
+where we take @math{X} to be periodic so that
+@tex
+$X_N = X_0$.
+@end tex
+@ifinfo
+X[N] = X[0].
+@end ifinfo
+@html
+<i>X<sub>N</sub> = X</i><sub>0</sub>.
+@end html
+Because of this redundancy, only the first @math{n} real numbers are
+actually stored, where @math{N = 2(n-1)}.
+
+The proper definition of even symmetry for @code{REDFT10},
+@code{REDFT01}, and @code{REDFT11} transforms is somewhat more intricate
+because of the shifts by @math{1/2} of the input and/or output, although
+the corresponding boundary conditions are given in @ref{Real even/odd
+DFTs (cosine/sine transforms)}.  Because of the even symmetry, however,
+the sine terms in the DFT all cancel and the remaining cosine terms are
+written explicitly below.  This formulation often leads people to call
+such a transform a @dfn{discrete cosine transform} (DCT), although it is
+really just a special case of the DFT.
+@cindex discrete cosine transform
+@cindex DCT
+
+
+In each of the definitions below, we transform a real array @math{X} of
+length @math{n} to a real array @math{Y} of length @math{n}:
+
+@subsubheading REDFT00 (DCT-I)
+@ctindex REDFT00
+An @code{REDFT00} transform (type-I DCT) in FFTW is defined by:
+@tex
+$$
+Y_k = X_0 + (-1)^k X_{n-1}
+       + 2 \sum_{j=1}^{n-2} X_j \cos [ \pi j k / (n-1)].
+$$
+@end tex
+@ifinfo
+Y[k] = X[0] + (-1)^k X[n-1] + 2 (sum for j = 1 to n-2 of X[j] cos(pi jk /(n-1))).
+@end ifinfo
+@html
+<center><img src="equation-redft00.png" align="top">.</center>
+@end html
+Note that this transform is not defined for @math{n=1}.  For @math{n=2},
+the summation term above is dropped as you might expect.
+
+@subsubheading REDFT10 (DCT-II)
+@ctindex REDFT10
+An @code{REDFT10} transform (type-II DCT, sometimes called ``the'' DCT) in FFTW is defined by:
+@tex
+$$
+Y_k = 2 \sum_{j=0}^{n-1} X_j \cos [\pi (j+1/2) k / n].
+$$
+@end tex
+@ifinfo
+Y[k] = 2 (sum for j = 0 to n-1 of X[j] cos(pi (j+1/2) k / n)).
+@end ifinfo
+@html
+<center><img src="equation-redft10.png" align="top">.</center>
+@end html
+
+@subsubheading REDFT01 (DCT-III)
+@ctindex REDFT01
+An @code{REDFT01} transform (type-III DCT) in FFTW is defined by:
+@tex
+$$
+Y_k = X_0 + 2 \sum_{j=1}^{n-1} X_j \cos [\pi j (k+1/2) / n].
+$$
+@end tex
+@ifinfo
+Y[k] = X[0] + 2 (sum for j = 1 to n-1 of X[j] cos(pi j (k+1/2) / n)).
+@end ifinfo
+@html
+<center><img src="equation-redft01.png" align="top">.</center>
+@end html
+In the case of @math{n=1}, this reduces to
+@tex
+$Y_0 = X_0$.
+@end tex
+@ifinfo
+Y[0] = X[0].
+@end ifinfo
+@html
+<i>Y</i><sub>0</sub> = <i>X</i><sub>0</sub>.
+@end html
+Up to a scale factor (see below), this is the inverse of @code{REDFT10} (``the'' DCT), and so the @code{REDFT01} (DCT-III) is sometimes called the ``IDCT''.
+@cindex IDCT
+
+@subsubheading REDFT11 (DCT-IV)
+@ctindex REDFT11
+An @code{REDFT11} transform (type-IV DCT) in FFTW is defined by:
+@tex
+$$
+Y_k = 2 \sum_{j=0}^{n-1} X_j \cos [\pi (j+1/2) (k+1/2) / n].
+$$
+@end tex
+@ifinfo
+Y[k] = 2 (sum for j = 0 to n-1 of X[j] cos(pi (j+1/2) (k+1/2) / n)).
+@end ifinfo
+@html
+<center><img src="equation-redft11.png" align="top">.</center>
+@end html
+
+@subsubheading Inverses and Normalization
+
+These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of @math{2} in front of the
+summations).  The unnormalized inverse of @code{REDFT00} is
+@code{REDFT00}, of @code{REDFT10} is @code{REDFT01} and vice versa, and
+of @code{REDFT11} is @code{REDFT11}.  Each unnormalized inverse results
+in the original array multiplied by @math{N}, where @math{N} is the
+@emph{logical} DFT size.  For @code{REDFT00}, @math{N=2(n-1)} (note that
+@math{n=1} is not defined); otherwise, @math{N=2n}.
+@cindex normalization
+
+
+In defining the discrete cosine transform, some authors also include
+additional factors of
+@ifinfo
+sqrt(2)
+@end ifinfo
+@html
+&radic;2
+@end html
+@tex
+$\sqrt{2}$
+@end tex
+(or its inverse) multiplying selected inputs and/or outputs.  This is a
+mostly cosmetic change that makes the transform orthogonal, but
+sacrifices the direct equivalence to a symmetric DFT.
+
+@c =========>
+@node 1d Real-odd DFTs (DSTs), 1d Discrete Hartley Transforms (DHTs), 1d Real-even DFTs (DCTs), What FFTW Really Computes
+@subsection 1d Real-odd DFTs (DSTs)
+
+The Real-odd symmetry DFTs in FFTW are exactly equivalent to the unnormalized
+forward (and backward) DFTs as defined above, where the input array
+@math{X} of length @math{N} is purely real and is also @dfn{odd} symmetry.  In
+this case, the output is odd symmetry and purely imaginary.
+@cindex real-odd DFT
+@cindex RODFT
+
+
+@ctindex RODFT00
+For the case of @code{RODFT00}, this odd symmetry means that
+@tex
+$X_j = -X_{N-j}$,
+@end tex
+@ifinfo
+X[j] = -X[N-j],
+@end ifinfo
+@html
+<i>X<sub>j</sub> = -X<sub>N-j</sub></i>,
+@end html
+where we take @math{X} to be periodic so that
+@tex
+$X_N = X_0$.
+@end tex
+@ifinfo
+X[N] = X[0].
+@end ifinfo
+@html
+<i>X<sub>N</sub> = X</i><sub>0</sub>.
+@end html
+Because of this redundancy, only the first @math{n} real numbers
+starting at @math{j=1} are actually stored (the @math{j=0} element is
+zero), where @math{N = 2(n+1)}.
+
+The proper definition of odd symmetry for @code{RODFT10},
+@code{RODFT01}, and @code{RODFT11} transforms is somewhat more intricate
+because of the shifts by @math{1/2} of the input and/or output, although
+the corresponding boundary conditions are given in @ref{Real even/odd
+DFTs (cosine/sine transforms)}.  Because of the odd symmetry, however,
+the cosine terms in the DFT all cancel and the remaining sine terms are
+written explicitly below.  This formulation often leads people to call
+such a transform a @dfn{discrete sine transform} (DST), although it is
+really just a special case of the DFT.
+@cindex discrete sine transform
+@cindex DST
+
+
+In each of the definitions below, we transform a real array @math{X} of
+length @math{n} to a real array @math{Y} of length @math{n}:
+
+@subsubheading RODFT00 (DST-I)
+@ctindex RODFT00
+An @code{RODFT00} transform (type-I DST) in FFTW is defined by:
+@tex
+$$
+Y_k = 2 \sum_{j=0}^{n-1} X_j \sin [ \pi (j+1) (k+1) / (n+1)].
+$$
+@end tex
+@ifinfo
+Y[k] = 2 (sum for j = 0 to n-1 of X[j] sin(pi (j+1)(k+1) / (n+1))).
+@end ifinfo
+@html
+<center><img src="equation-rodft00.png" align="top">.</center>
+@end html
+
+@subsubheading RODFT10 (DST-II)
+@ctindex RODFT10
+An @code{RODFT10} transform (type-II DST) in FFTW is defined by:
+@tex
+$$
+Y_k = 2 \sum_{j=0}^{n-1} X_j \sin [\pi (j+1/2) (k+1) / n].
+$$
+@end tex
+@ifinfo
+Y[k] = 2 (sum for j = 0 to n-1 of X[j] sin(pi (j+1/2) (k+1) / n)).
+@end ifinfo
+@html
+<center><img src="equation-rodft10.png" align="top">.</center>
+@end html
+
+@subsubheading RODFT01 (DST-III)
+@ctindex RODFT01
+An @code{RODFT01} transform (type-III DST) in FFTW is defined by:
+@tex
+$$
+Y_k = (-1)^k X_{n-1} + 2 \sum_{j=0}^{n-2} X_j \sin [\pi (j+1) (k+1/2) / n].
+$$
+@end tex
+@ifinfo
+Y[k] = (-1)^k X[n-1] + 2 (sum for j = 0 to n-2 of X[j] sin(pi (j+1) (k+1/2) / n)).
+@end ifinfo
+@html
+<center><img src="equation-rodft01.png" align="top">.</center>
+@end html
+In the case of @math{n=1}, this reduces to
+@tex
+$Y_0 = X_0$.
+@end tex
+@ifinfo
+Y[0] = X[0].
+@end ifinfo
+@html
+<i>Y</i><sub>0</sub> = <i>X</i><sub>0</sub>.
+@end html
+
+@subsubheading RODFT11 (DST-IV)
+@ctindex RODFT11
+An @code{RODFT11} transform (type-IV DST) in FFTW is defined by:
+@tex
+$$
+Y_k = 2 \sum_{j=0}^{n-1} X_j \sin [\pi (j+1/2) (k+1/2) / n].
+$$
+@end tex
+@ifinfo
+Y[k] = 2 (sum for j = 0 to n-1 of X[j] sin(pi (j+1/2) (k+1/2) / n)).
+@end ifinfo
+@html
+<center><img src="equation-rodft11.png" align="top">.</center>
+@end html
+
+@subsubheading Inverses and Normalization
+
+These definitions correspond directly to the unnormalized DFTs used
+elsewhere in FFTW (hence the factors of @math{2} in front of the
+summations).  The unnormalized inverse of @code{RODFT00} is
+@code{RODFT00}, of @code{RODFT10} is @code{RODFT01} and vice versa, and
+of @code{RODFT11} is @code{RODFT11}.  Each unnormalized inverse results
+in the original array multiplied by @math{N}, where @math{N} is the
+@emph{logical} DFT size.  For @code{RODFT00}, @math{N=2(n+1)};
+otherwise, @math{N=2n}.
+@cindex normalization
+
+
+In defining the discrete sine transform, some authors also include
+additional factors of
+@ifinfo
+sqrt(2)
+@end ifinfo
+@html
+&radic;2
+@end html
+@tex
+$\sqrt{2}$
+@end tex
+(or its inverse) multiplying selected inputs and/or outputs.  This is a
+mostly cosmetic change that makes the transform orthogonal, but
+sacrifices the direct equivalence to an antisymmetric DFT.
+
+@c =========>
+@node 1d Discrete Hartley Transforms (DHTs), Multi-dimensional Transforms, 1d Real-odd DFTs (DSTs), What FFTW Really Computes
+@subsection 1d Discrete Hartley Transforms (DHTs)
+
+@cindex discrete Hartley transform
+@cindex DHT
+The discrete Hartley transform (DHT) of a 1d real array @math{X} of size
+@math{n} computes a real array @math{Y} of the same size, where:
+@tex
+$$
+Y_k = \sum_{j = 0}^{n - 1} X_j [ \cos(2\pi j k / n) + \sin(2\pi j k / n)].
+$$
+@end tex
+@ifinfo
+@center Y[k] = sum for j = 0 to (n - 1) of X[j] * [cos(2 pi j k / n) + sin(2 pi j k / n)].
+@end ifinfo
+@html
+<center><img src="equation-dht.png" align="top">.</center>
+@end html
+
+@cindex normalization
+FFTW computes an unnormalized transform, in that there is no coefficient
+in front of the summation in the DHT.  In other words, applying the
+transform twice (the DHT is its own inverse) will multiply the input by
+@math{n}.
+
+@c =========>
+@node Multi-dimensional Transforms,  , 1d Discrete Hartley Transforms (DHTs), What FFTW Really Computes
+@subsection Multi-dimensional Transforms
+
+The multi-dimensional transforms of FFTW, in general, compute simply the
+separable product of the given 1d transform along each dimension of the
+array.  Since each of these transforms is unnormalized, computing the
+forward followed by the backward/inverse multi-dimensional transform
+will result in the original array scaled by the product of the
+normalization factors for each dimension (e.g. the product of the
+dimension sizes, for a multi-dimensional DFT).
+
+@tex
+As an explicit example, consider the following exact mathematical
+definition of our multi-dimensional DFT.  Let $X$ be a $d$-dimensional
+complex array whose elements are $X[j_1, j_2, \ldots, j_d]$, where $0
+\leq j_s < n_s$ for all~$s \in \{ 1, 2, \ldots, d \}$.  Let also
+$\omega_s = e^{2\pi \sqrt{-1}/n_s}$, for all ~$s \in \{ 1, 2, \ldots, d
+\}$.
+
+The forward transform computes a complex array~$Y$, whose
+structure is the same as that of~$X$, defined by
+
+$$
+Y[k_1, k_2, \ldots, k_d] =
+    \sum_{j_1 = 0}^{n_1 - 1}
+        \sum_{j_2 = 0}^{n_2 - 1}
+           \cdots
+              \sum_{j_d = 0}^{n_d - 1}
+                  X[j_1, j_2, \ldots, j_d] 
+                      \omega_1^{-j_1 k_1}
+                      \omega_2^{-j_2 k_2}
+                      \cdots
+                      \omega_d^{-j_d k_d} \ .
+$$
+
+The backward transform computes
+$$
+Y[k_1, k_2, \ldots, k_d] =
+    \sum_{j_1 = 0}^{n_1 - 1}
+        \sum_{j_2 = 0}^{n_2 - 1}
+           \cdots
+              \sum_{j_d = 0}^{n_d - 1}
+                  X[j_1, j_2, \ldots, j_d] 
+                      \omega_1^{j_1 k_1}
+                      \omega_2^{j_2 k_2}
+                      \cdots
+                      \omega_d^{j_d k_d} \ .
+$$
+
+Computing the forward transform followed by the backward transform
+will multiply the array by $\prod_{s=1}^{d} n_d$.
+@end tex
+
+@cindex r2c
+The definition of FFTW's multi-dimensional DFT of real data (r2c)
+deserves special attention.  In this case, we logically compute the full
+multi-dimensional DFT of the input data; since the input data are purely
+real, the output data have the Hermitian symmetry and therefore only one
+non-redundant half need be stored.  More specifically, for an @ndims multi-dimensional real-input DFT, the full (logical) complex output array
+@tex
+$Y[k_0, k_1, \ldots, k_{d-1}]$
+@end tex
+@html
+<i>Y</i>[<i>k</i><sub>0</sub>, <i>k</i><sub>1</sub>, ...,
+<i>k</i><sub><i>d-1</i></sub>]
+@end html
+@ifinfo
+Y[k[0], k[1], ..., k[d-1]]
+@end ifinfo
+has the symmetry:
+@tex
+$$
+Y[k_0, k_1, \ldots, k_{d-1}] = Y[n_0 - k_0, n_1 - k_1, \ldots, n_{d-1} - k_{d-1}]^*
+$$
+@end tex
+@html
+<i>Y</i>[<i>k</i><sub>0</sub>, <i>k</i><sub>1</sub>, ...,
+<i>k</i><sub><i>d-1</i></sub>] = <i>Y</i>[<i>n</i><sub>0</sub> -
+<i>k</i><sub>0</sub>, <i>n</i><sub>1</sub> - <i>k</i><sub>1</sub>, ...,
+<i>n</i><sub><i>d-1</i></sub> - <i>k</i><sub><i>d-1</i></sub>]<sup>*</sup>
+@end html
+@ifinfo
+Y[k[0], k[1], ..., k[d-1]] = Y[n[0] - k[0], n[1] - k[1], ..., n[d-1] - k[d-1]]*
+@end ifinfo
+(where each dimension is periodic).  Because of this symmetry, we only
+store the
+@tex
+$k_{d-1} = 0 \cdots n_{d-1}/2$
+@end tex
+@html
+<i>k</i><sub><i>d-1</i></sub> = 0...<i>n</i><sub><i>d-1</i></sub>/2+1
+@end html
+@ifinfo
+k[d-1] = 0...n[d-1]/2
+@end ifinfo
+elements of the @emph{last} dimension (division by @math{2} is rounded
+down).  (We could instead have cut any other dimension in half, but the
+last dimension proved computationally convenient.)  This results in the
+peculiar array format described in more detail by @ref{Real-data DFT
+Array Format}.
+
+The multi-dimensional c2r transform is simply the unnormalized inverse
+of the r2c transform.  i.e. it is the same as FFTW's complex backward
+multi-dimensional DFT, operating on a Hermitian input array in the
+peculiar format mentioned above and outputting a real array (since the
+DFT output is purely real).
+
+We should remind the user that the separable product of 1d transforms
+along each dimension, as computed by FFTW, is not always the same thing
+as the usual multi-dimensional transform.  A multi-dimensional
+@code{R2HC} (or @code{HC2R}) transform is not identical to the
+multi-dimensional DFT, requiring some post-processing to combine the
+requisite real and imaginary parts, as was described in @ref{The
+Halfcomplex-format DFT}.  Likewise, FFTW's multidimensional
+@code{FFTW_DHT} r2r transform is not the same thing as the logical
+multi-dimensional discrete Hartley transform defined in the literature,
+as discussed in @ref{The Discrete Hartley Transform}.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/rfftwnd-for-html.png
Binary file src/fftw-3.3.3/doc/rfftwnd-for-html.png has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/rfftwnd.eps
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/rfftwnd.eps	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2258 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Title: ./rfftwnd.fig
+%%Creator: fig2dev Version 3.2 Patchlevel 5d
+%%CreationDate: Sun Nov 25 07:42:44 2012
+%%BoundingBox: 0 0 270 405
+%Magnification: 0.7000
+%%EndComments
+%%BeginProlog
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+/col32 {0.475 0.490 0.475 srgb} bind def
+/col33 {0.937 0.922 0.937 srgb} bind def
+/col34 {0.906 0.188 0.125 srgb} bind def
+/col35 {0.969 0.557 0.525 srgb} bind def
+/col36 {0.412 0.588 0.780 srgb} bind def
+/col37 {0.525 0.667 0.843 srgb} bind def
+/col38 {0.875 0.859 0.000 srgb} bind def
+
+end
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+  4 -2 roll dup 1 exch sub 3 -1 roll mul add
+  4 -2 roll dup 1 exch sub 3 -1 roll mul add
+  4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+  bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+  4 -2 roll mul srgb} bind def
+/reencdict 12 dict def /ReEncode { reencdict begin
+/newcodesandnames exch def /newfontname exch def /basefontname exch def
+/basefontdict basefontname findfont def /newfont basefontdict maxlength dict def
+basefontdict { exch dup /FID ne { dup /Encoding eq
+{ exch dup length array copy newfont 3 1 roll put }
+{ exch newfont 3 1 roll put } ifelse } { pop pop } ifelse } forall
+newfont /FontName newfontname put newcodesandnames aload pop
+128 1 255 { newfont /Encoding get exch /.notdef put } for
+newcodesandnames length 2 idiv { newfont /Encoding get 3 1 roll put } repeat
+newfontname newfont definefont pop end } def
+/isovec [
+8#055 /minus 8#200 /grave 8#201 /acute 8#202 /circumflex 8#203 /tilde
+8#204 /macron 8#205 /breve 8#206 /dotaccent 8#207 /dieresis
+8#210 /ring 8#211 /cedilla 8#212 /hungarumlaut 8#213 /ogonek 8#214 /caron
+8#220 /dotlessi 8#230 /oe 8#231 /OE
+8#240 /space 8#241 /exclamdown 8#242 /cent 8#243 /sterling
+8#244 /currency 8#245 /yen 8#246 /brokenbar 8#247 /section 8#250 /dieresis
+8#251 /copyright 8#252 /ordfeminine 8#253 /guillemotleft 8#254 /logicalnot
+8#255 /hyphen 8#256 /registered 8#257 /macron 8#260 /degree 8#261 /plusminus
+8#262 /twosuperior 8#263 /threesuperior 8#264 /acute 8#265 /mu 8#266 /paragraph
+8#267 /periodcentered 8#270 /cedilla 8#271 /onesuperior 8#272 /ordmasculine
+8#273 /guillemotright 8#274 /onequarter 8#275 /onehalf
+8#276 /threequarters 8#277 /questiondown 8#300 /Agrave 8#301 /Aacute
+8#302 /Acircumflex 8#303 /Atilde 8#304 /Adieresis 8#305 /Aring
+8#306 /AE 8#307 /Ccedilla 8#310 /Egrave 8#311 /Eacute
+8#312 /Ecircumflex 8#313 /Edieresis 8#314 /Igrave 8#315 /Iacute
+8#316 /Icircumflex 8#317 /Idieresis 8#320 /Eth 8#321 /Ntilde 8#322 /Ograve
+8#323 /Oacute 8#324 /Ocircumflex 8#325 /Otilde 8#326 /Odieresis 8#327 /multiply
+8#330 /Oslash 8#331 /Ugrave 8#332 /Uacute 8#333 /Ucircumflex
+8#334 /Udieresis 8#335 /Yacute 8#336 /Thorn 8#337 /germandbls 8#340 /agrave
+8#341 /aacute 8#342 /acircumflex 8#343 /atilde 8#344 /adieresis 8#345 /aring
+8#346 /ae 8#347 /ccedilla 8#350 /egrave 8#351 /eacute
+8#352 /ecircumflex 8#353 /edieresis 8#354 /igrave 8#355 /iacute
+8#356 /icircumflex 8#357 /idieresis 8#360 /eth 8#361 /ntilde 8#362 /ograve
+8#363 /oacute 8#364 /ocircumflex 8#365 /otilde 8#366 /odieresis 8#367 /divide
+8#370 /oslash 8#371 /ugrave 8#372 /uacute 8#373 /ucircumflex
+8#374 /udieresis 8#375 /yacute 8#376 /thorn 8#377 /ydieresis] def
+/Helvetica /Helvetica-iso isovec ReEncode
+/Helvetica-Bold /Helvetica-Bold-iso isovec ReEncode
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+/pageheader {
+save
+newpath 0 405 moveto 0 0 lineto 270 0 lineto 270 405 lineto closepath clip newpath
+-2.5 407.2 translate
+1 -1 scale
+$F2psBegin
+10 setmiterlimit
+0 slj 0 slc
+ 0.04200 0.04200 sc
+} bind def
+/pagefooter {
+$F2psEnd
+restore
+} bind def
+%%EndProlog
+pageheader
+%
+% Fig objects follow
+%
+% 
+% here starts figure with depth 998
+% Polyline
+0 slj
+0 slc
+0.000 slw
+n 1221 7280 m 6435 7280 l 6435 9676 l 1221 9676 l
+ 1221 7280 l  cp gs col7 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 7280 m 6435 7280 l 6435 9676 l 1221 9676 l
+ 1221 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 7280 m 1620 7280 l 1620 7656 l 1221 7656 l
+ 1221 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 7280 m 1620 7280 l 1620 7656 l 1221 7656 l
+ 1221 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 7280 m 2019 7280 l 2019 7656 l 1620 7656 l
+ 1620 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 7280 m 2019 7280 l 2019 7656 l 1620 7656 l
+ 1620 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 7280 m 2418 7280 l 2418 7656 l 2019 7656 l
+ 2019 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 7280 m 2418 7280 l 2418 7656 l 2019 7656 l
+ 2019 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 7280 m 2817 7280 l 2817 7656 l 2418 7656 l
+ 2418 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 7280 m 2817 7280 l 2817 7656 l 2418 7656 l
+ 2418 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 7280 m 4438 7280 l 4438 7656 l 4038 7656 l
+ 4038 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 7280 m 4438 7280 l 4438 7656 l 4038 7656 l
+ 4038 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 7280 m 4837 7280 l 4837 7656 l 4438 7656 l
+ 4438 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 7280 m 4837 7280 l 4837 7656 l 4438 7656 l
+ 4438 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 7280 m 5236 7280 l 5236 7656 l 4837 7656 l
+ 4837 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 7280 m 5236 7280 l 5236 7656 l 4837 7656 l
+ 4837 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 7280 m 5635 7280 l 5635 7656 l 5236 7656 l
+ 5236 7280 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 7280 m 5635 7280 l 5635 7656 l 5236 7656 l
+ 5236 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 7656 m 1620 7656 l 1620 8032 l 1221 8032 l
+ 1221 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 7656 m 1620 7656 l 1620 8032 l 1221 8032 l
+ 1221 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 7656 m 2019 7656 l 2019 8032 l 1620 8032 l
+ 1620 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 7656 m 2019 7656 l 2019 8032 l 1620 8032 l
+ 1620 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 7656 m 2418 7656 l 2418 8032 l 2019 8032 l
+ 2019 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 7656 m 2418 7656 l 2418 8032 l 2019 8032 l
+ 2019 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 7656 m 2817 7656 l 2817 8032 l 2418 8032 l
+ 2418 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 7656 m 2817 7656 l 2817 8032 l 2418 8032 l
+ 2418 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 7656 m 4438 7656 l 4438 8032 l 4038 8032 l
+ 4038 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 7656 m 4438 7656 l 4438 8032 l 4038 8032 l
+ 4038 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 7656 m 4837 7656 l 4837 8032 l 4438 8032 l
+ 4438 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 7656 m 4837 7656 l 4837 8032 l 4438 8032 l
+ 4438 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 7656 m 5236 7656 l 5236 8032 l 4837 8032 l
+ 4837 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 7656 m 5236 7656 l 5236 8032 l 4837 8032 l
+ 4837 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 7656 m 5635 7656 l 5635 8032 l 5236 8032 l
+ 5236 7656 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 7656 m 5635 7656 l 5635 8032 l 5236 8032 l
+ 5236 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 8924 m 1620 8924 l 1620 9300 l 1221 9300 l
+ 1221 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 8924 m 1620 8924 l 1620 9300 l 1221 9300 l
+ 1221 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 8924 m 2019 8924 l 2019 9300 l 1620 9300 l
+ 1620 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 8924 m 2019 8924 l 2019 9300 l 1620 9300 l
+ 1620 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 8924 m 2418 8924 l 2418 9300 l 2019 9300 l
+ 2019 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 8924 m 2418 8924 l 2418 9300 l 2019 9300 l
+ 2019 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 8924 m 2817 8924 l 2817 9300 l 2418 9300 l
+ 2418 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 8924 m 2817 8924 l 2817 9300 l 2418 9300 l
+ 2418 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 8924 m 4438 8924 l 4438 9300 l 4038 9300 l
+ 4038 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 8924 m 4438 8924 l 4438 9300 l 4038 9300 l
+ 4038 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 8924 m 4837 8924 l 4837 9300 l 4438 9300 l
+ 4438 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 8924 m 4837 8924 l 4837 9300 l 4438 9300 l
+ 4438 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 8924 m 5236 8924 l 5236 9300 l 4837 9300 l
+ 4837 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 8924 m 5236 8924 l 5236 9300 l 4837 9300 l
+ 4837 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 8924 m 5635 8924 l 5635 9300 l 5236 9300 l
+ 5236 8924 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 8924 m 5635 8924 l 5635 9300 l 5236 9300 l
+ 5236 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 9300 m 1620 9300 l 1620 9676 l 1221 9676 l
+ 1221 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 9300 m 1620 9300 l 1620 9676 l 1221 9676 l
+ 1221 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 9300 m 2019 9300 l 2019 9676 l 1620 9676 l
+ 1620 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 9300 m 2019 9300 l 2019 9676 l 1620 9676 l
+ 1620 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 9300 m 2418 9300 l 2418 9676 l 2019 9676 l
+ 2019 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 9300 m 2418 9300 l 2418 9676 l 2019 9676 l
+ 2019 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 9300 m 2817 9300 l 2817 9676 l 2418 9676 l
+ 2418 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 9300 m 2817 9300 l 2817 9676 l 2418 9676 l
+ 2418 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 9300 m 4438 9300 l 4438 9676 l 4038 9676 l
+ 4038 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 9300 m 4438 9300 l 4438 9676 l 4038 9676 l
+ 4038 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 9300 m 4837 9300 l 4837 9676 l 4438 9676 l
+ 4438 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 9300 m 4837 9300 l 4837 9676 l 4438 9676 l
+ 4438 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 9300 m 5236 9300 l 5236 9676 l 4837 9676 l
+ 4837 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 9300 m 5236 9300 l 5236 9676 l 4837 9676 l
+ 4837 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 9300 m 5635 9300 l 5635 9676 l 5236 9676 l
+ 5236 9300 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 9300 m 5635 9300 l 5635 9676 l 5236 9676 l
+ 5236 9300 l  cp gs col32 s gr 
+/Helvetica-iso ff 225.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 225.00 scf sf
+2064 7726 m
+gs 1 -1 sc (...) col0 sh gr
+% Polyline
+n 2819 7658 m
+ 2869 7658 l gs col32 s gr 
+% Polyline
+n 2952 7658 m
+ 3002 7658 l gs col32 s gr 
+% Polyline
+n 3085 7658 m
+ 3135 7658 l gs col32 s gr 
+% Polyline
+n 3219 7658 m
+ 3252 7658 l gs col32 s gr 
+% Polyline
+n 2819 8033 m
+ 2869 8033 l gs col32 s gr 
+% Polyline
+n 2952 8033 m
+ 3002 8033 l gs col32 s gr 
+% Polyline
+n 3085 8033 m
+ 3135 8033 l gs col32 s gr 
+% Polyline
+n 3219 8033 m
+ 3252 8033 l gs col32 s gr 
+% Polyline
+n 2819 8025 m
+ 2819 8075 l gs col32 s gr 
+% Polyline
+n 2819 8158 m
+ 2819 8208 l gs col32 s gr 
+% Polyline
+n 2819 8291 m
+ 2819 8341 l gs col32 s gr 
+% Polyline
+n 2419 8025 m
+ 2419 8075 l gs col32 s gr 
+% Polyline
+n 2419 8158 m
+ 2419 8208 l gs col32 s gr 
+% Polyline
+n 2419 8291 m
+ 2419 8341 l gs col32 s gr 
+% Polyline
+n 2019 8025 m
+ 2019 8075 l gs col32 s gr 
+% Polyline
+n 2019 8158 m
+ 2019 8208 l gs col32 s gr 
+% Polyline
+n 2019 8291 m
+ 2019 8341 l gs col32 s gr 
+% Polyline
+n 1619 8025 m
+ 1619 8075 l gs col32 s gr 
+% Polyline
+n 1619 8158 m
+ 1619 8208 l gs col32 s gr 
+% Polyline
+n 1619 8291 m
+ 1619 8341 l gs col32 s gr 
+% Polyline
+n 4036 7658 m
+ 3986 7658 l gs col32 s gr 
+% Polyline
+n 3902 7658 m
+ 3852 7658 l gs col32 s gr 
+% Polyline
+n 3769 7658 m
+ 3719 7658 l gs col32 s gr 
+% Polyline
+n 3636 7658 m
+ 3602 7658 l gs col32 s gr 
+% Polyline
+n 4036 8033 m
+ 3986 8033 l gs col32 s gr 
+% Polyline
+n 3902 8033 m
+ 3852 8033 l gs col32 s gr 
+% Polyline
+n 3769 8033 m
+ 3719 8033 l gs col32 s gr 
+% Polyline
+n 3636 8033 m
+ 3602 8033 l gs col32 s gr 
+% Polyline
+n 4035 8025 m
+ 4035 8075 l gs col32 s gr 
+% Polyline
+n 4035 8158 m
+ 4035 8208 l gs col32 s gr 
+% Polyline
+n 4035 8291 m
+ 4035 8341 l gs col32 s gr 
+% Polyline
+n 4435 8025 m
+ 4435 8075 l gs col32 s gr 
+% Polyline
+n 4435 8158 m
+ 4435 8208 l gs col32 s gr 
+% Polyline
+n 4435 8291 m
+ 4435 8341 l gs col32 s gr 
+% Polyline
+n 4835 8025 m
+ 4835 8075 l gs col32 s gr 
+% Polyline
+n 4835 8158 m
+ 4835 8208 l gs col32 s gr 
+% Polyline
+n 4835 8291 m
+ 4835 8341 l gs col32 s gr 
+% Polyline
+n 5235 8025 m
+ 5235 8075 l gs col32 s gr 
+% Polyline
+n 5235 8158 m
+ 5235 8208 l gs col32 s gr 
+% Polyline
+n 5235 8291 m
+ 5235 8341 l gs col32 s gr 
+% Polyline
+n 4036 9300 m
+ 3986 9300 l gs col32 s gr 
+% Polyline
+n 3902 9300 m
+ 3852 9300 l gs col32 s gr 
+% Polyline
+n 3769 9300 m
+ 3719 9300 l gs col32 s gr 
+% Polyline
+n 3636 9300 m
+ 3602 9300 l gs col32 s gr 
+% Polyline
+n 4036 8925 m
+ 3986 8925 l gs col32 s gr 
+% Polyline
+n 3902 8925 m
+ 3852 8925 l gs col32 s gr 
+% Polyline
+n 3769 8925 m
+ 3719 8925 l gs col32 s gr 
+% Polyline
+n 3636 8925 m
+ 3602 8925 l gs col32 s gr 
+% Polyline
+n 4035 8933 m
+ 4035 8883 l gs col32 s gr 
+% Polyline
+n 4035 8800 m
+ 4035 8750 l gs col32 s gr 
+% Polyline
+n 4035 8666 m
+ 4035 8616 l gs col32 s gr 
+% Polyline
+n 4435 8933 m
+ 4435 8883 l gs col32 s gr 
+% Polyline
+n 4435 8800 m
+ 4435 8750 l gs col32 s gr 
+% Polyline
+n 4435 8666 m
+ 4435 8616 l gs col32 s gr 
+% Polyline
+n 4835 8933 m
+ 4835 8883 l gs col32 s gr 
+% Polyline
+n 4835 8800 m
+ 4835 8750 l gs col32 s gr 
+% Polyline
+n 4835 8666 m
+ 4835 8616 l gs col32 s gr 
+% Polyline
+n 5235 8933 m
+ 5235 8883 l gs col32 s gr 
+% Polyline
+n 5235 8800 m
+ 5235 8750 l gs col32 s gr 
+% Polyline
+n 5235 8666 m
+ 5235 8616 l gs col32 s gr 
+% Polyline
+n 2819 9300 m
+ 2869 9300 l gs col32 s gr 
+% Polyline
+n 2952 9300 m
+ 3002 9300 l gs col32 s gr 
+% Polyline
+n 3085 9300 m
+ 3135 9300 l gs col32 s gr 
+% Polyline
+n 3219 9300 m
+ 3252 9300 l gs col32 s gr 
+% Polyline
+n 2819 8925 m
+ 2869 8925 l gs col32 s gr 
+% Polyline
+n 2952 8925 m
+ 3002 8925 l gs col32 s gr 
+% Polyline
+n 3085 8925 m
+ 3135 8925 l gs col32 s gr 
+% Polyline
+n 3219 8925 m
+ 3252 8925 l gs col32 s gr 
+% Polyline
+n 2819 8933 m
+ 2819 8883 l gs col32 s gr 
+% Polyline
+n 2819 8800 m
+ 2819 8750 l gs col32 s gr 
+% Polyline
+n 2819 8666 m
+ 2819 8616 l gs col32 s gr 
+% Polyline
+n 2419 8933 m
+ 2419 8883 l gs col32 s gr 
+% Polyline
+n 2419 8800 m
+ 2419 8750 l gs col32 s gr 
+% Polyline
+n 2419 8666 m
+ 2419 8616 l gs col32 s gr 
+% Polyline
+n 2019 8933 m
+ 2019 8883 l gs col32 s gr 
+% Polyline
+n 2019 8800 m
+ 2019 8750 l gs col32 s gr 
+% Polyline
+n 2019 8666 m
+ 2019 8616 l gs col32 s gr 
+% Polyline
+n 1619 8933 m
+ 1619 8883 l gs col32 s gr 
+% Polyline
+n 1619 8800 m
+ 1619 8750 l gs col32 s gr 
+% Polyline
+n 1619 8666 m
+ 1619 8616 l gs col32 s gr 
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+2338 7001 m
+gs 1 -1 sc (ny ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+2605 7001 m
+gs 1 -1 sc (+ 2-ny%2) col34 sh gr
+/Helvetica-iso ff 195.00 scf sf
+3500 7001 m
+gs 1 -1 sc ( = 2*\(ny/2+1\)) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+681 8451 m
+gs 1 -1 sc (nx) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1364 7179 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+6097 7179 m
+gs 1 -1 sc (ny+1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1064 7479 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+864 9479 m
+gs 1 -1 sc (nx-1) col0 sh gr
+% Polyline
+0.000 slw
+n 5636 7280 m 6035 7280 l 6035 7656 l 5636 7656 l
+ 5636 7280 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5636 7280 m 6035 7280 l 6035 7656 l 5636 7656 l
+ 5636 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5636 7656 m 6035 7656 l 6035 8032 l 5636 8032 l
+ 5636 7656 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5636 7656 m 6035 7656 l 6035 8032 l 5636 8032 l
+ 5636 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5636 8924 m 6035 8924 l 6035 9300 l 5636 9300 l
+ 5636 8924 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5636 8924 m 6035 8924 l 6035 9300 l 5636 9300 l
+ 5636 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5636 9300 m 6035 9300 l 6035 9676 l 5636 9676 l
+ 5636 9300 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5636 9300 m 6035 9300 l 6035 9676 l 5636 9676 l
+ 5636 9300 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 6036 7280 m 6435 7280 l 6435 7656 l 6036 7656 l
+ 6036 7280 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 6036 7280 m 6435 7280 l 6435 7656 l 6036 7656 l
+ 6036 7280 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 6036 7656 m 6435 7656 l 6435 8032 l 6036 8032 l
+ 6036 7656 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 6036 7656 m 6435 7656 l 6435 8032 l 6036 8032 l
+ 6036 7656 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 6036 8924 m 6435 8924 l 6435 9300 l 6036 9300 l
+ 6036 8924 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 6036 8924 m 6435 8924 l 6435 9300 l 6036 9300 l
+ 6036 8924 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 6036 9300 m 6435 9300 l 6435 9676 l 6036 9676 l
+ 6036 9300 l  cp gs col35 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 6036 9300 m 6435 9300 l 6435 9676 l 6036 9676 l
+ 6036 9300 l  cp gs col32 s gr 
+% Polyline
+n 5635 7283 m
+ 5635 9683 l gs col0 s gr 
+% Polyline
+n 1420 7515 m
+ 6312 7515 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 7518 m 6117 7462 l
+ 6117 7515 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 7518 m 6117 7462 l
+ 6117 7515 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 7512 m 6117 7568 l
+ 6117 7515 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 7512 m 6117 7568 l
+ 6117 7515 l gs col0 s gr 
+% Polyline
+n 1420 7891 m
+ 5863 7891 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 7894 m 5685 7838 l
+ 5685 7891 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 7894 m 5685 7838 l
+ 5685 7891 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 7888 m 5685 7944 l
+ 5685 7891 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 7888 m 5685 7944 l
+ 5685 7891 l gs col0 s gr 
+% Polyline
+n 1420 9112 m
+ 5863 9112 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 9115 m 5685 9059 l
+ 5685 9112 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 9115 m 5685 9059 l
+ 5685 9112 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 9109 m 5685 9165 l
+ 5685 9112 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 9109 m 5685 9165 l
+ 5685 9112 l gs col0 s gr 
+% Polyline
+n 1420 9488 m
+ 5863 9488 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 9491 m 5685 9435 l
+ 5685 9488 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 9491 m 5685 9435 l
+ 5685 9488 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5895 9485 m 5685 9541 l
+ 5685 9488 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5895 9485 m 5685 9541 l
+ 5685 9488 l gs col0 s gr 
+/Helvetica-iso ff 375.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 375.00 scf sf
+6250 9461 m
+gs 1 -1 sc  90.0 rot (\(padding\)) col0 sh gr
+/Helvetica-Bold-iso ff 240.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-Bold-iso ff 240.00 scf sf
+428 9283 m
+gs 1 -1 sc  90.0 rot (input, in-place) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 7429 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1681 7429 m
+gs 1 -1 sc (1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+2081 7429 m
+gs 1 -1 sc (2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+2481 7429 m
+gs 1 -1 sc (3) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4081 7429 m
+gs 1 -1 sc (ny-4) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4897 7429 m
+gs 1 -1 sc (ny-2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5297 7429 m
+gs 1 -1 sc (ny-1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4497 7429 m
+gs 1 -1 sc (ny-3) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 7795 m
+gs 1 -1 sc (ny+2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1664 7795 m
+gs 1 -1 sc (ny+3) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5681 7429 m
+gs 1 -1 sc (ny) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+6081 7429 m
+gs 1 -1 sc (ny+1) col0 sh gr
+% Polyline
+0.000 slw
+n 5226 5196 m 5623 5196 l 5623 5572 l 5226 5572 l
+ 5226 5196 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4827 5196 m 5226 5196 l 5226 5572 l 4827 5572 l
+ 4827 5196 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4835 5194 m 5631 5194 l 5631 5569 l 4835 5569 l
+ 4835 5194 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 4434 5196 m 4832 5196 l 4832 5572 l 4434 5572 l
+ 4434 5196 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4035 5196 m 4434 5196 l 4434 5572 l 4035 5572 l
+ 4035 5196 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4044 5194 m 4840 5194 l 4840 5569 l 4044 5569 l
+ 4044 5194 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 6026 5196 m 6440 5196 l 6440 5572 l 6026 5572 l
+ 6026 5196 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 5627 5196 m 6026 5196 l 6026 5572 l 5627 5572 l
+ 5627 5196 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5635 5194 m 6440 5194 l 6440 5569 l 5635 5569 l
+ 5635 5194 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 5226 5571 m 5623 5571 l 5623 5947 l 5226 5947 l
+ 5226 5571 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4827 5571 m 5226 5571 l 5226 5947 l 4827 5947 l
+ 4827 5571 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4835 5569 m 5631 5569 l 5631 5944 l 4835 5944 l
+ 4835 5569 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 4434 5571 m 4832 5571 l 4832 5947 l 4434 5947 l
+ 4434 5571 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4035 5571 m 4434 5571 l 4434 5947 l 4035 5947 l
+ 4035 5571 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4044 5569 m 4840 5569 l 4840 5944 l 4044 5944 l
+ 4044 5569 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 6026 5571 m 6440 5571 l 6440 5947 l 6026 5947 l
+ 6026 5571 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 5627 5571 m 6026 5571 l 6026 5947 l 5627 5947 l
+ 5627 5571 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5635 5569 m 6440 5569 l 6440 5944 l 5635 5944 l
+ 5635 5569 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 2409 5571 m 2807 5571 l 2807 5947 l 2409 5947 l
+ 2409 5571 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 2010 5571 m 2409 5571 l 2409 5947 l 2010 5947 l
+ 2010 5571 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 5561 m 2815 5561 l 2815 5936 l 2019 5936 l
+ 2019 5561 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 1618 5571 m 2015 5571 l 2015 5947 l 1618 5947 l
+ 1618 5571 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 1219 5571 m 1618 5571 l 1618 5947 l 1219 5947 l
+ 1219 5571 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1227 5561 m 2023 5561 l 2023 5939 l 1227 5939 l
+ 1227 5561 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 2409 5196 m 2807 5196 l 2807 5572 l 2409 5572 l
+ 2409 5196 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 2010 5196 m 2409 5196 l 2409 5572 l 2010 5572 l
+ 2010 5196 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 5186 m 2815 5186 l 2815 5561 l 2019 5561 l
+ 2019 5186 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 1618 5196 m 2015 5196 l 2015 5572 l 1618 5572 l
+ 1618 5196 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 1219 5196 m 1618 5196 l 1618 5572 l 1219 5572 l
+ 1219 5196 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1227 5186 m 2023 5186 l 2023 5561 l 1227 5561 l
+ 1227 5186 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 5226 3546 m 5623 3546 l 5623 3922 l 5226 3922 l
+ 5226 3546 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4827 3546 m 5226 3546 l 5226 3922 l 4827 3922 l
+ 4827 3546 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4835 3544 m 5631 3544 l 5631 3919 l 4835 3919 l
+ 4835 3544 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4434 3546 m 4832 3546 l 4832 3922 l 4434 3922 l
+ 4434 3546 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4035 3546 m 4434 3546 l 4434 3922 l 4035 3922 l
+ 4035 3546 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4044 3544 m 4840 3544 l 4840 3919 l 4044 3919 l
+ 4044 3544 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 5990 3546 m 6432 3546 l 6432 3955 l 5990 3955 l
+ 5990 3546 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 5627 3546 m 6026 3546 l 6026 3922 l 5627 3922 l
+ 5627 3546 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5635 3544 m 6440 3544 l 6440 3919 l 5635 3919 l
+ 5635 3544 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 5226 3921 m 5623 3921 l 5623 4297 l 5226 4297 l
+ 5226 3921 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4827 3921 m 5226 3921 l 5226 4297 l 4827 4297 l
+ 4827 3921 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4835 3919 m 5631 3919 l 5631 4294 l 4835 4294 l
+ 4835 3919 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 4434 3921 m 4832 3921 l 4832 4297 l 4434 4297 l
+ 4434 3921 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 4035 3921 m 4434 3921 l 4434 4297 l 4035 4297 l
+ 4035 3921 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4044 3919 m 4840 3919 l 4840 4294 l 4044 4294 l
+ 4044 3919 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 6026 3921 m 6432 3921 l 6432 4297 l 6026 4297 l
+ 6026 3921 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 5627 3921 m 6026 3921 l 6026 4297 l 5627 4297 l
+ 5627 3921 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5635 3919 m 6440 3919 l 6440 4294 l 5635 4294 l
+ 5635 3919 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 2409 3921 m 2807 3921 l 2807 4297 l 2409 4297 l
+ 2409 3921 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 2010 3921 m 2409 3921 l 2409 4297 l 2010 4297 l
+ 2010 3921 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 3919 m 2815 3919 l 2815 4294 l 2019 4294 l
+ 2019 3919 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 1618 3921 m 2015 3921 l 2015 4297 l 1618 4297 l
+ 1618 3921 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 1219 3921 m 1618 3921 l 1618 4297 l 1219 4297 l
+ 1219 3921 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1227 3919 m 2023 3919 l 2023 4294 l 1227 4294 l
+ 1227 3919 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 2409 3546 m 2815 3546 l 2815 3922 l 2409 3922 l
+ 2409 3546 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 2010 3546 m 2409 3546 l 2409 3922 l 2010 3922 l
+ 2010 3546 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 3544 m 2815 3544 l 2815 3919 l 2019 3919 l
+ 2019 3544 l  cp gs col38 s gr 
+% Polyline
+0.000 slw
+n 1618 3546 m 2015 3546 l 2015 3922 l 1618 3922 l
+ 1618 3546 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 1219 3546 m 1618 3546 l 1618 3922 l 1219 3922 l
+ 1219 3546 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1227 3544 m 2023 3544 l 2023 3919 l 1227 3919 l
+ 1227 3544 l  cp gs col38 s gr 
+% Polyline
+n 1221 3546 m 6440 3546 l 6440 5941 l 1221 5941 l
+ 1221 3546 l  cp gs col32 s gr 
+/Helvetica-iso ff 225.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 225.00 scf sf
+2064 3993 m
+gs 1 -1 sc (...) col0 sh gr
+% Polyline
+n 2819 3915 m
+ 2869 3915 l gs col32 s gr 
+% Polyline
+n 2952 3915 m
+ 3002 3915 l gs col32 s gr 
+% Polyline
+n 3085 3915 m
+ 3135 3915 l gs col32 s gr 
+% Polyline
+n 3219 3915 m
+ 3252 3915 l gs col32 s gr 
+% Polyline
+n 2819 4290 m
+ 2869 4290 l gs col32 s gr 
+% Polyline
+n 2952 4290 m
+ 3002 4290 l gs col32 s gr 
+% Polyline
+n 3085 4290 m
+ 3135 4290 l gs col32 s gr 
+% Polyline
+n 3219 4290 m
+ 3252 4290 l gs col32 s gr 
+% Polyline
+n 2819 4282 m
+ 2819 4332 l gs col32 s gr 
+% Polyline
+n 2819 4415 m
+ 2819 4465 l gs col32 s gr 
+% Polyline
+n 2819 4548 m
+ 2819 4598 l gs col32 s gr 
+% Polyline
+n 2019 4282 m
+ 2019 4332 l gs col32 s gr 
+% Polyline
+n 2019 4415 m
+ 2019 4465 l gs col32 s gr 
+% Polyline
+n 2019 4548 m
+ 2019 4598 l gs col32 s gr 
+% Polyline
+n 4036 3915 m
+ 3986 3915 l gs col32 s gr 
+% Polyline
+n 3902 3915 m
+ 3852 3915 l gs col32 s gr 
+% Polyline
+n 3769 3915 m
+ 3719 3915 l gs col32 s gr 
+% Polyline
+n 3636 3915 m
+ 3602 3915 l gs col32 s gr 
+% Polyline
+n 4036 4290 m
+ 3986 4290 l gs col32 s gr 
+% Polyline
+n 3902 4290 m
+ 3852 4290 l gs col32 s gr 
+% Polyline
+n 3769 4290 m
+ 3719 4290 l gs col32 s gr 
+% Polyline
+n 3636 4290 m
+ 3602 4290 l gs col32 s gr 
+% Polyline
+n 4035 4282 m
+ 4035 4332 l gs col32 s gr 
+% Polyline
+n 4035 4415 m
+ 4035 4465 l gs col32 s gr 
+% Polyline
+n 4035 4548 m
+ 4035 4598 l gs col32 s gr 
+% Polyline
+n 4835 4282 m
+ 4835 4332 l gs col32 s gr 
+% Polyline
+n 4835 4415 m
+ 4835 4465 l gs col32 s gr 
+% Polyline
+n 4835 4548 m
+ 4835 4598 l gs col32 s gr 
+% Polyline
+n 4036 5565 m
+ 3986 5565 l gs col32 s gr 
+% Polyline
+n 3902 5565 m
+ 3852 5565 l gs col32 s gr 
+% Polyline
+n 3769 5565 m
+ 3719 5565 l gs col32 s gr 
+% Polyline
+n 3636 5565 m
+ 3602 5565 l gs col32 s gr 
+% Polyline
+n 4036 5190 m
+ 3986 5190 l gs col32 s gr 
+% Polyline
+n 3902 5190 m
+ 3852 5190 l gs col32 s gr 
+% Polyline
+n 3769 5190 m
+ 3719 5190 l gs col32 s gr 
+% Polyline
+n 3636 5190 m
+ 3602 5190 l gs col32 s gr 
+% Polyline
+n 4035 5198 m
+ 4035 5148 l gs col32 s gr 
+% Polyline
+n 4035 5065 m
+ 4035 5015 l gs col32 s gr 
+% Polyline
+n 4035 4932 m
+ 4035 4882 l gs col32 s gr 
+% Polyline
+n 4835 5198 m
+ 4835 5148 l gs col32 s gr 
+% Polyline
+n 4835 5065 m
+ 4835 5015 l gs col32 s gr 
+% Polyline
+n 4835 4932 m
+ 4835 4882 l gs col32 s gr 
+% Polyline
+n 2819 5565 m
+ 2869 5565 l gs col32 s gr 
+% Polyline
+n 2952 5565 m
+ 3002 5565 l gs col32 s gr 
+% Polyline
+n 3085 5565 m
+ 3135 5565 l gs col32 s gr 
+% Polyline
+n 3219 5565 m
+ 3252 5565 l gs col32 s gr 
+% Polyline
+n 2819 5190 m
+ 2869 5190 l gs col32 s gr 
+% Polyline
+n 2952 5190 m
+ 3002 5190 l gs col32 s gr 
+% Polyline
+n 3085 5190 m
+ 3135 5190 l gs col32 s gr 
+% Polyline
+n 3219 5190 m
+ 3252 5190 l gs col32 s gr 
+% Polyline
+n 2819 5198 m
+ 2819 5148 l gs col32 s gr 
+% Polyline
+n 2819 5065 m
+ 2819 5015 l gs col32 s gr 
+% Polyline
+n 2819 4932 m
+ 2819 4882 l gs col32 s gr 
+% Polyline
+n 2019 5198 m
+ 2019 5148 l gs col32 s gr 
+% Polyline
+n 2019 5065 m
+ 2019 5015 l gs col32 s gr 
+% Polyline
+n 2019 4932 m
+ 2019 4882 l gs col32 s gr 
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+3181 3267 m
+gs 1 -1 sc (ny/2+1) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+681 4717 m
+gs 1 -1 sc (nx) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1564 3445 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5831 3445 m
+gs 1 -1 sc (ny/2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1064 3745 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+864 5745 m
+gs 1 -1 sc (nx-1) col0 sh gr
+% Polyline
+n 5635 4282 m
+ 5635 4332 l gs col32 s gr 
+% Polyline
+n 5635 4415 m
+ 5635 4465 l gs col32 s gr 
+% Polyline
+n 5635 4548 m
+ 5635 4598 l gs col32 s gr 
+% Polyline
+n 5635 5198 m
+ 5635 5148 l gs col32 s gr 
+% Polyline
+n 5635 5065 m
+ 5635 5015 l gs col32 s gr 
+% Polyline
+n 5635 4932 m
+ 5635 4882 l gs col32 s gr 
+% Polyline
+n 1420 3781 m
+ 6312 3781 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 3784 m 6117 3728 l
+ 6117 3781 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 3784 m 6117 3728 l
+ 6117 3781 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 3778 m 6117 3834 l
+ 6117 3781 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 3778 m 6117 3834 l
+ 6117 3781 l gs col0 s gr 
+% Polyline
+n 1420 4169 m
+ 6312 4169 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 4172 m 6117 4116 l
+ 6117 4169 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 4172 m 6117 4116 l
+ 6117 4169 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 4166 m 6117 4222 l
+ 6117 4169 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 4166 m 6117 4222 l
+ 6117 4169 l gs col0 s gr 
+% Polyline
+n 1420 5390 m
+ 6312 5390 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 5393 m 6117 5337 l
+ 6117 5390 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 5393 m 6117 5337 l
+ 6117 5390 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 5387 m 6117 5443 l
+ 6117 5390 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 5387 m 6117 5443 l
+ 6117 5390 l gs col0 s gr 
+% Polyline
+n 1420 5766 m
+ 6312 5766 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 5769 m 6117 5713 l
+ 6117 5766 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 5769 m 6117 5713 l
+ 6117 5766 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 6348 5763 m 6117 5819 l
+ 6117 5766 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 6348 5763 m 6117 5819 l
+ 6117 5766 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 1469 6215 m 1868 6215 l 1868 6591 l 1469 6591 l
+ 1469 6215 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1469 6215 m 1868 6215 l 1868 6591 l 1469 6591 l
+ 1469 6215 l  cp gs col32 s gr 
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+1981 6463 m
+gs 1 -1 sc (= double) col0 sh gr
+% Polyline
+0.000 slw
+n 4026 6217 m 4432 6217 l 4432 6593 l 4026 6593 l
+ 4026 6217 l  cp gs col36 1.00 shd ef gr 
+% Polyline
+n 3627 6217 m 4026 6217 l 4026 6593 l 3627 6593 l
+ 3627 6217 l  cp gs col37 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 3635 6215 m 4440 6215 l 4440 6590 l 3635 6590 l
+ 3635 6215 l  cp gs col38 s gr 
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+4547 6463 m
+gs 1 -1 sc (= fftw_complex) col0 sh gr
+/Helvetica-Bold-iso ff 240.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-Bold-iso ff 240.00 scf sf
+428 5128 m
+gs 1 -1 sc  90.0 rot (output) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 3679 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+2081 3679 m
+gs 1 -1 sc (1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4097 3679 m
+gs 1 -1 sc (ny/2-2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4914 3679 m
+gs 1 -1 sc (ny/2-1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 4062 m
+gs 1 -1 sc (ny/2+1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5697 3679 m
+gs 1 -1 sc (ny/2) col0 sh gr
+% Polyline
+0.000 slw
+n 1221 495 m 5635 495 l 5635 2890 l 1221 2890 l
+ 1221 495 l  cp gs col7 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 495 m 5635 495 l 5635 2890 l 1221 2890 l
+ 1221 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 495 m 1620 495 l 1620 871 l 1221 871 l
+ 1221 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 495 m 1620 495 l 1620 871 l 1221 871 l
+ 1221 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 495 m 2019 495 l 2019 871 l 1620 871 l
+ 1620 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 495 m 2019 495 l 2019 871 l 1620 871 l
+ 1620 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 495 m 2418 495 l 2418 871 l 2019 871 l
+ 2019 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 495 m 2418 495 l 2418 871 l 2019 871 l
+ 2019 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 495 m 2817 495 l 2817 871 l 2418 871 l
+ 2418 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 495 m 2817 495 l 2817 871 l 2418 871 l
+ 2418 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 495 m 4438 495 l 4438 871 l 4038 871 l
+ 4038 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 495 m 4438 495 l 4438 871 l 4038 871 l
+ 4038 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 495 m 4837 495 l 4837 871 l 4438 871 l
+ 4438 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 495 m 4837 495 l 4837 871 l 4438 871 l
+ 4438 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 495 m 5236 495 l 5236 871 l 4837 871 l
+ 4837 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 495 m 5236 495 l 5236 871 l 4837 871 l
+ 4837 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 495 m 5635 495 l 5635 871 l 5236 871 l
+ 5236 495 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 495 m 5635 495 l 5635 871 l 5236 871 l
+ 5236 495 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 871 m 1620 871 l 1620 1247 l 1221 1247 l
+ 1221 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 871 m 1620 871 l 1620 1247 l 1221 1247 l
+ 1221 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 871 m 2019 871 l 2019 1247 l 1620 1247 l
+ 1620 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 871 m 2019 871 l 2019 1247 l 1620 1247 l
+ 1620 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 871 m 2418 871 l 2418 1247 l 2019 1247 l
+ 2019 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 871 m 2418 871 l 2418 1247 l 2019 1247 l
+ 2019 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 871 m 2817 871 l 2817 1247 l 2418 1247 l
+ 2418 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 871 m 2817 871 l 2817 1247 l 2418 1247 l
+ 2418 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 871 m 4438 871 l 4438 1247 l 4038 1247 l
+ 4038 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 871 m 4438 871 l 4438 1247 l 4038 1247 l
+ 4038 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 871 m 4837 871 l 4837 1247 l 4438 1247 l
+ 4438 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 871 m 4837 871 l 4837 1247 l 4438 1247 l
+ 4438 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 871 m 5236 871 l 5236 1247 l 4837 1247 l
+ 4837 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 871 m 5236 871 l 5236 1247 l 4837 1247 l
+ 4837 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 871 m 5635 871 l 5635 1247 l 5236 1247 l
+ 5236 871 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 871 m 5635 871 l 5635 1247 l 5236 1247 l
+ 5236 871 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 2139 m 1620 2139 l 1620 2515 l 1221 2515 l
+ 1221 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 2139 m 1620 2139 l 1620 2515 l 1221 2515 l
+ 1221 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 2139 m 2019 2139 l 2019 2515 l 1620 2515 l
+ 1620 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 2139 m 2019 2139 l 2019 2515 l 1620 2515 l
+ 1620 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 2139 m 2418 2139 l 2418 2515 l 2019 2515 l
+ 2019 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 2139 m 2418 2139 l 2418 2515 l 2019 2515 l
+ 2019 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 2139 m 2817 2139 l 2817 2515 l 2418 2515 l
+ 2418 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 2139 m 2817 2139 l 2817 2515 l 2418 2515 l
+ 2418 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 2139 m 4438 2139 l 4438 2515 l 4038 2515 l
+ 4038 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 2139 m 4438 2139 l 4438 2515 l 4038 2515 l
+ 4038 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 2139 m 4837 2139 l 4837 2515 l 4438 2515 l
+ 4438 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 2139 m 4837 2139 l 4837 2515 l 4438 2515 l
+ 4438 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 2139 m 5236 2139 l 5236 2515 l 4837 2515 l
+ 4837 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 2139 m 5236 2139 l 5236 2515 l 4837 2515 l
+ 4837 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 2139 m 5635 2139 l 5635 2515 l 5236 2515 l
+ 5236 2139 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 2139 m 5635 2139 l 5635 2515 l 5236 2515 l
+ 5236 2139 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1221 2515 m 1620 2515 l 1620 2890 l 1221 2890 l
+ 1221 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1221 2515 m 1620 2515 l 1620 2890 l 1221 2890 l
+ 1221 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 1620 2515 m 2019 2515 l 2019 2890 l 1620 2890 l
+ 1620 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 1620 2515 m 2019 2515 l 2019 2890 l 1620 2890 l
+ 1620 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2019 2515 m 2418 2515 l 2418 2890 l 2019 2890 l
+ 2019 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2019 2515 m 2418 2515 l 2418 2890 l 2019 2890 l
+ 2019 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 2418 2515 m 2817 2515 l 2817 2890 l 2418 2890 l
+ 2418 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 2418 2515 m 2817 2515 l 2817 2890 l 2418 2890 l
+ 2418 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4038 2515 m 4438 2515 l 4438 2890 l 4038 2890 l
+ 4038 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4038 2515 m 4438 2515 l 4438 2890 l 4038 2890 l
+ 4038 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4438 2515 m 4837 2515 l 4837 2890 l 4438 2890 l
+ 4438 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4438 2515 m 4837 2515 l 4837 2890 l 4438 2890 l
+ 4438 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 4837 2515 m 5236 2515 l 5236 2890 l 4837 2890 l
+ 4837 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 4837 2515 m 5236 2515 l 5236 2890 l 4837 2890 l
+ 4837 2515 l  cp gs col32 s gr 
+% Polyline
+0.000 slw
+n 5236 2515 m 5635 2515 l 5635 2890 l 5236 2890 l
+ 5236 2515 l  cp gs col33 1.00 shd ef gr 
+% Polyline
+7.500 slw
+n 5236 2515 m 5635 2515 l 5635 2890 l 5236 2890 l
+ 5236 2515 l  cp gs col32 s gr 
+% Polyline
+n 1420 730 m
+ 5459 730 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 733 m 5298 677 l
+ 5298 730 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 733 m 5298 677 l
+ 5298 730 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 727 m 5298 783 l
+ 5298 730 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 727 m 5298 783 l
+ 5298 730 l gs col0 s gr 
+/Helvetica-iso ff 225.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 225.00 scf sf
+2064 943 m
+gs 1 -1 sc (...) col0 sh gr
+% Polyline
+n 2819 873 m
+ 2869 873 l gs col32 s gr 
+% Polyline
+n 2952 873 m
+ 3002 873 l gs col32 s gr 
+% Polyline
+n 3085 873 m
+ 3135 873 l gs col32 s gr 
+% Polyline
+n 3219 873 m
+ 3252 873 l gs col32 s gr 
+% Polyline
+n 2819 1248 m
+ 2869 1248 l gs col32 s gr 
+% Polyline
+n 2952 1248 m
+ 3002 1248 l gs col32 s gr 
+% Polyline
+n 3085 1248 m
+ 3135 1248 l gs col32 s gr 
+% Polyline
+n 3219 1248 m
+ 3252 1248 l gs col32 s gr 
+% Polyline
+n 2819 1240 m
+ 2819 1290 l gs col32 s gr 
+% Polyline
+n 2819 1373 m
+ 2819 1423 l gs col32 s gr 
+% Polyline
+n 2819 1506 m
+ 2819 1556 l gs col32 s gr 
+% Polyline
+n 2419 1240 m
+ 2419 1290 l gs col32 s gr 
+% Polyline
+n 2419 1373 m
+ 2419 1423 l gs col32 s gr 
+% Polyline
+n 2419 1506 m
+ 2419 1556 l gs col32 s gr 
+% Polyline
+n 2019 1240 m
+ 2019 1290 l gs col32 s gr 
+% Polyline
+n 2019 1373 m
+ 2019 1423 l gs col32 s gr 
+% Polyline
+n 2019 1506 m
+ 2019 1556 l gs col32 s gr 
+% Polyline
+n 1619 1240 m
+ 1619 1290 l gs col32 s gr 
+% Polyline
+n 1619 1373 m
+ 1619 1423 l gs col32 s gr 
+% Polyline
+n 1619 1506 m
+ 1619 1556 l gs col32 s gr 
+% Polyline
+n 4036 873 m
+ 3986 873 l gs col32 s gr 
+% Polyline
+n 3902 873 m
+ 3852 873 l gs col32 s gr 
+% Polyline
+n 3769 873 m
+ 3719 873 l gs col32 s gr 
+% Polyline
+n 3636 873 m
+ 3602 873 l gs col32 s gr 
+% Polyline
+n 4036 1248 m
+ 3986 1248 l gs col32 s gr 
+% Polyline
+n 3902 1248 m
+ 3852 1248 l gs col32 s gr 
+% Polyline
+n 3769 1248 m
+ 3719 1248 l gs col32 s gr 
+% Polyline
+n 3636 1248 m
+ 3602 1248 l gs col32 s gr 
+% Polyline
+n 4035 1240 m
+ 4035 1290 l gs col32 s gr 
+% Polyline
+n 4035 1373 m
+ 4035 1423 l gs col32 s gr 
+% Polyline
+n 4035 1506 m
+ 4035 1556 l gs col32 s gr 
+% Polyline
+n 4435 1240 m
+ 4435 1290 l gs col32 s gr 
+% Polyline
+n 4435 1373 m
+ 4435 1423 l gs col32 s gr 
+% Polyline
+n 4435 1506 m
+ 4435 1556 l gs col32 s gr 
+% Polyline
+n 4835 1240 m
+ 4835 1290 l gs col32 s gr 
+% Polyline
+n 4835 1373 m
+ 4835 1423 l gs col32 s gr 
+% Polyline
+n 4835 1506 m
+ 4835 1556 l gs col32 s gr 
+% Polyline
+n 5235 1240 m
+ 5235 1290 l gs col32 s gr 
+% Polyline
+n 5235 1373 m
+ 5235 1423 l gs col32 s gr 
+% Polyline
+n 5235 1506 m
+ 5235 1556 l gs col32 s gr 
+% Polyline
+n 4036 2515 m
+ 3986 2515 l gs col32 s gr 
+% Polyline
+n 3902 2515 m
+ 3852 2515 l gs col32 s gr 
+% Polyline
+n 3769 2515 m
+ 3719 2515 l gs col32 s gr 
+% Polyline
+n 3636 2515 m
+ 3602 2515 l gs col32 s gr 
+% Polyline
+n 4036 2140 m
+ 3986 2140 l gs col32 s gr 
+% Polyline
+n 3902 2140 m
+ 3852 2140 l gs col32 s gr 
+% Polyline
+n 3769 2140 m
+ 3719 2140 l gs col32 s gr 
+% Polyline
+n 3636 2140 m
+ 3602 2140 l gs col32 s gr 
+% Polyline
+n 4035 2148 m
+ 4035 2098 l gs col32 s gr 
+% Polyline
+n 4035 2015 m
+ 4035 1965 l gs col32 s gr 
+% Polyline
+n 4035 1881 m
+ 4035 1831 l gs col32 s gr 
+% Polyline
+n 4435 2148 m
+ 4435 2098 l gs col32 s gr 
+% Polyline
+n 4435 2015 m
+ 4435 1965 l gs col32 s gr 
+% Polyline
+n 4435 1881 m
+ 4435 1831 l gs col32 s gr 
+% Polyline
+n 4835 2148 m
+ 4835 2098 l gs col32 s gr 
+% Polyline
+n 4835 2015 m
+ 4835 1965 l gs col32 s gr 
+% Polyline
+n 4835 1881 m
+ 4835 1831 l gs col32 s gr 
+% Polyline
+n 5235 2148 m
+ 5235 2098 l gs col32 s gr 
+% Polyline
+n 5235 2015 m
+ 5235 1965 l gs col32 s gr 
+% Polyline
+n 5235 1881 m
+ 5235 1831 l gs col32 s gr 
+% Polyline
+n 2819 2515 m
+ 2869 2515 l gs col32 s gr 
+% Polyline
+n 2952 2515 m
+ 3002 2515 l gs col32 s gr 
+% Polyline
+n 3085 2515 m
+ 3135 2515 l gs col32 s gr 
+% Polyline
+n 3219 2515 m
+ 3252 2515 l gs col32 s gr 
+% Polyline
+n 2819 2140 m
+ 2869 2140 l gs col32 s gr 
+% Polyline
+n 2952 2140 m
+ 3002 2140 l gs col32 s gr 
+% Polyline
+n 3085 2140 m
+ 3135 2140 l gs col32 s gr 
+% Polyline
+n 3219 2140 m
+ 3252 2140 l gs col32 s gr 
+% Polyline
+n 2819 2148 m
+ 2819 2098 l gs col32 s gr 
+% Polyline
+n 2819 2015 m
+ 2819 1965 l gs col32 s gr 
+% Polyline
+n 2819 1881 m
+ 2819 1831 l gs col32 s gr 
+% Polyline
+n 2419 2148 m
+ 2419 2098 l gs col32 s gr 
+% Polyline
+n 2419 2015 m
+ 2419 1965 l gs col32 s gr 
+% Polyline
+n 2419 1881 m
+ 2419 1831 l gs col32 s gr 
+% Polyline
+n 2019 2148 m
+ 2019 2098 l gs col32 s gr 
+% Polyline
+n 2019 2015 m
+ 2019 1965 l gs col32 s gr 
+% Polyline
+n 2019 1881 m
+ 2019 1831 l gs col32 s gr 
+% Polyline
+n 1619 2148 m
+ 1619 2098 l gs col32 s gr 
+% Polyline
+n 1619 2015 m
+ 1619 1965 l gs col32 s gr 
+% Polyline
+n 1619 1881 m
+ 1619 1831 l gs col32 s gr 
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+3381 217 m
+gs 1 -1 sc (ny) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 195.00 scf sf
+681 1667 m
+gs 1 -1 sc (nx) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1364 395 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5281 395 m
+gs 1 -1 sc (ny-1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1064 695 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+864 2695 m
+gs 1 -1 sc (nx-1) col0 sh gr
+% Polyline
+n 1420 1106 m
+ 5459 1106 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 1109 m 5298 1053 l
+ 5298 1106 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 1109 m 5298 1053 l
+ 5298 1106 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 1103 m 5298 1159 l
+ 5298 1106 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 1103 m 5298 1159 l
+ 5298 1106 l gs col0 s gr 
+% Polyline
+n 1420 2327 m
+ 5459 2327 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 2330 m 5298 2274 l
+ 5298 2327 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 2330 m 5298 2274 l
+ 5298 2327 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 2324 m 5298 2380 l
+ 5298 2327 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 2324 m 5298 2380 l
+ 5298 2327 l gs col0 s gr 
+% Polyline
+n 1420 2703 m
+ 5459 2703 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 2706 m 5298 2650 l
+ 5298 2703 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 2706 m 5298 2650 l
+ 5298 2703 l gs col0 s gr 
+% Polyline
+0.000 slw
+n 5488 2700 m 5298 2755 l
+ 5298 2703 l gs 0.00 setgray ef gr 
+% Polyline
+7.500 slw
+n 5488 2700 m 5298 2755 l
+ 5298 2703 l gs col0 s gr 
+/Helvetica-Bold-iso ff 240.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-Bold-iso ff 240.00 scf sf
+428 2734 m
+gs 1 -1 sc  90.0 rot (input, out-of-place) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 629 m
+gs 1 -1 sc (0) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1681 629 m
+gs 1 -1 sc (1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+2081 629 m
+gs 1 -1 sc (2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+2481 629 m
+gs 1 -1 sc (3) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4081 629 m
+gs 1 -1 sc (ny-4) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4897 629 m
+gs 1 -1 sc (ny-2) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+5297 629 m
+gs 1 -1 sc (ny-1) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+4497 629 m
+gs 1 -1 sc (ny-3) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+74 89 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1264 1012 m
+gs 1 -1 sc (ny) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1425 4800 m
+gs 1 -1 sc ( ) col0 sh gr
+/Helvetica-iso ff 150.00 scf sf
+1664 1012 m
+gs 1 -1 sc (ny+1) col0 sh gr
+% Polyline
+n 273 3662 m
+ 273 3039 l gs col0 s gr 
+% Polyline
+n 382 3920 m
+ 156 3662 l gs col0 s gr 
+% Polyline
+n 273 3662 m
+ 148 3662 l gs col0 s gr 
+% Polyline
+n 487 3662 m
+ 487 3039 l gs col0 s gr 
+% Polyline
+n 378 3920 m
+ 604 3662 l gs col0 s gr 
+% Polyline
+n 487 3662 m
+ 612 3662 l gs col0 s gr 
+% Polyline
+n 273 6130 m
+ 273 6753 l gs col0 s gr 
+% Polyline
+n 382 5872 m
+ 156 6130 l gs col0 s gr 
+% Polyline
+n 273 6130 m
+ 148 6130 l gs col0 s gr 
+% Polyline
+n 487 6129 m
+ 487 6753 l gs col0 s gr 
+% Polyline
+n 378 5872 m
+ 604 6129 l gs col0 s gr 
+% Polyline
+n 487 6129 m
+ 612 6129 l gs col0 s gr 
+% here ends figure;
+pagefooter
+showpage
+%%Trailer
+%EOF
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/rfftwnd.fig
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/rfftwnd.fig	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1148 @@
+#FIG 3.2
+Portrait
+Flush left
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+0 32 #797d79
+0 33 #efebef
+0 34 #e73020
+0 35 #f78e86
+0 36 #6996c7
+0 37 #86aad7
+0 38 #dfdb00
+6 75 75 6450 9750
+2 1 0 0 7 7 998 0 20 4.000 0 0 0 0 0 5
+	 1221 7280 6435 7280 6435 9676 1221 9676 1221 7280
+2 1 0 1 32 32 997 0 -1 4.000 0 0 0 0 0 5
+	 1221 7280 6435 7280 6435 9676 1221 9676 1221 7280
+2 1 0 0 33 33 996 0 20 4.000 0 0 0 0 0 5
+	 1221 7280 1620 7280 1620 7656 1221 7656 1221 7280
+2 1 0 1 32 32 995 0 -1 4.000 0 0 0 0 0 5
+	 1221 7280 1620 7280 1620 7656 1221 7656 1221 7280
+2 1 0 0 33 33 994 0 20 4.000 0 0 0 0 0 5
+	 1620 7280 2019 7280 2019 7656 1620 7656 1620 7280
+2 1 0 1 32 32 993 0 -1 4.000 0 0 0 0 0 5
+	 1620 7280 2019 7280 2019 7656 1620 7656 1620 7280
+2 1 0 0 33 33 992 0 20 4.000 0 0 0 0 0 5
+	 2019 7280 2418 7280 2418 7656 2019 7656 2019 7280
+2 1 0 1 32 32 991 0 -1 4.000 0 0 0 0 0 5
+	 2019 7280 2418 7280 2418 7656 2019 7656 2019 7280
+2 1 0 0 33 33 990 0 20 4.000 0 0 0 0 0 5
+	 2418 7280 2817 7280 2817 7656 2418 7656 2418 7280
+2 1 0 1 32 32 989 0 -1 4.000 0 0 0 0 0 5
+	 2418 7280 2817 7280 2817 7656 2418 7656 2418 7280
+2 1 0 0 33 33 988 0 20 4.000 0 0 0 0 0 5
+	 4038 7280 4438 7280 4438 7656 4038 7656 4038 7280
+2 1 0 1 32 32 987 0 -1 4.000 0 0 0 0 0 5
+	 4038 7280 4438 7280 4438 7656 4038 7656 4038 7280
+2 1 0 0 33 33 986 0 20 4.000 0 0 0 0 0 5
+	 4438 7280 4837 7280 4837 7656 4438 7656 4438 7280
+2 1 0 1 32 32 985 0 -1 4.000 0 0 0 0 0 5
+	 4438 7280 4837 7280 4837 7656 4438 7656 4438 7280
+2 1 0 0 33 33 984 0 20 4.000 0 0 0 0 0 5
+	 4837 7280 5236 7280 5236 7656 4837 7656 4837 7280
+2 1 0 1 32 32 983 0 -1 4.000 0 0 0 0 0 5
+	 4837 7280 5236 7280 5236 7656 4837 7656 4837 7280
+2 1 0 0 33 33 982 0 20 4.000 0 0 0 0 0 5
+	 5236 7280 5635 7280 5635 7656 5236 7656 5236 7280
+2 1 0 1 32 32 981 0 -1 4.000 0 0 0 0 0 5
+	 5236 7280 5635 7280 5635 7656 5236 7656 5236 7280
+2 1 0 0 33 33 980 0 20 4.000 0 0 0 0 0 5
+	 1221 7656 1620 7656 1620 8032 1221 8032 1221 7656
+2 1 0 1 32 32 979 0 -1 4.000 0 0 0 0 0 5
+	 1221 7656 1620 7656 1620 8032 1221 8032 1221 7656
+2 1 0 0 33 33 978 0 20 4.000 0 0 0 0 0 5
+	 1620 7656 2019 7656 2019 8032 1620 8032 1620 7656
+2 1 0 1 32 32 977 0 -1 4.000 0 0 0 0 0 5
+	 1620 7656 2019 7656 2019 8032 1620 8032 1620 7656
+2 1 0 0 33 33 976 0 20 4.000 0 0 0 0 0 5
+	 2019 7656 2418 7656 2418 8032 2019 8032 2019 7656
+2 1 0 1 32 32 975 0 -1 4.000 0 0 0 0 0 5
+	 2019 7656 2418 7656 2418 8032 2019 8032 2019 7656
+2 1 0 0 33 33 974 0 20 4.000 0 0 0 0 0 5
+	 2418 7656 2817 7656 2817 8032 2418 8032 2418 7656
+2 1 0 1 32 32 973 0 -1 4.000 0 0 0 0 0 5
+	 2418 7656 2817 7656 2817 8032 2418 8032 2418 7656
+2 1 0 0 33 33 972 0 20 4.000 0 0 0 0 0 5
+	 4038 7656 4438 7656 4438 8032 4038 8032 4038 7656
+2 1 0 1 32 32 971 0 -1 4.000 0 0 0 0 0 5
+	 4038 7656 4438 7656 4438 8032 4038 8032 4038 7656
+2 1 0 0 33 33 970 0 20 4.000 0 0 0 0 0 5
+	 4438 7656 4837 7656 4837 8032 4438 8032 4438 7656
+2 1 0 1 32 32 969 0 -1 4.000 0 0 0 0 0 5
+	 4438 7656 4837 7656 4837 8032 4438 8032 4438 7656
+2 1 0 0 33 33 968 0 20 4.000 0 0 0 0 0 5
+	 4837 7656 5236 7656 5236 8032 4837 8032 4837 7656
+2 1 0 1 32 32 967 0 -1 4.000 0 0 0 0 0 5
+	 4837 7656 5236 7656 5236 8032 4837 8032 4837 7656
+2 1 0 0 33 33 966 0 20 4.000 0 0 0 0 0 5
+	 5236 7656 5635 7656 5635 8032 5236 8032 5236 7656
+2 1 0 1 32 32 965 0 -1 4.000 0 0 0 0 0 5
+	 5236 7656 5635 7656 5635 8032 5236 8032 5236 7656
+2 1 0 0 33 33 964 0 20 4.000 0 0 0 0 0 5
+	 1221 8924 1620 8924 1620 9300 1221 9300 1221 8924
+2 1 0 1 32 32 963 0 -1 4.000 0 0 0 0 0 5
+	 1221 8924 1620 8924 1620 9300 1221 9300 1221 8924
+2 1 0 0 33 33 962 0 20 4.000 0 0 0 0 0 5
+	 1620 8924 2019 8924 2019 9300 1620 9300 1620 8924
+2 1 0 1 32 32 961 0 -1 4.000 0 0 0 0 0 5
+	 1620 8924 2019 8924 2019 9300 1620 9300 1620 8924
+2 1 0 0 33 33 960 0 20 4.000 0 0 0 0 0 5
+	 2019 8924 2418 8924 2418 9300 2019 9300 2019 8924
+2 1 0 1 32 32 959 0 -1 4.000 0 0 0 0 0 5
+	 2019 8924 2418 8924 2418 9300 2019 9300 2019 8924
+2 1 0 0 33 33 958 0 20 4.000 0 0 0 0 0 5
+	 2418 8924 2817 8924 2817 9300 2418 9300 2418 8924
+2 1 0 1 32 32 957 0 -1 4.000 0 0 0 0 0 5
+	 2418 8924 2817 8924 2817 9300 2418 9300 2418 8924
+2 1 0 0 33 33 956 0 20 4.000 0 0 0 0 0 5
+	 4038 8924 4438 8924 4438 9300 4038 9300 4038 8924
+2 1 0 1 32 32 955 0 -1 4.000 0 0 0 0 0 5
+	 4038 8924 4438 8924 4438 9300 4038 9300 4038 8924
+2 1 0 0 33 33 954 0 20 4.000 0 0 0 0 0 5
+	 4438 8924 4837 8924 4837 9300 4438 9300 4438 8924
+2 1 0 1 32 32 953 0 -1 4.000 0 0 0 0 0 5
+	 4438 8924 4837 8924 4837 9300 4438 9300 4438 8924
+2 1 0 0 33 33 952 0 20 4.000 0 0 0 0 0 5
+	 4837 8924 5236 8924 5236 9300 4837 9300 4837 8924
+2 1 0 1 32 32 951 0 -1 4.000 0 0 0 0 0 5
+	 4837 8924 5236 8924 5236 9300 4837 9300 4837 8924
+2 1 0 0 33 33 950 0 20 4.000 0 0 0 0 0 5
+	 5236 8924 5635 8924 5635 9300 5236 9300 5236 8924
+2 1 0 1 32 32 949 0 -1 4.000 0 0 0 0 0 5
+	 5236 8924 5635 8924 5635 9300 5236 9300 5236 8924
+2 1 0 0 33 33 948 0 20 4.000 0 0 0 0 0 5
+	 1221 9300 1620 9300 1620 9676 1221 9676 1221 9300
+2 1 0 1 32 32 947 0 -1 4.000 0 0 0 0 0 5
+	 1221 9300 1620 9300 1620 9676 1221 9676 1221 9300
+2 1 0 0 33 33 946 0 20 4.000 0 0 0 0 0 5
+	 1620 9300 2019 9300 2019 9676 1620 9676 1620 9300
+2 1 0 1 32 32 945 0 -1 4.000 0 0 0 0 0 5
+	 1620 9300 2019 9300 2019 9676 1620 9676 1620 9300
+2 1 0 0 33 33 944 0 20 4.000 0 0 0 0 0 5
+	 2019 9300 2418 9300 2418 9676 2019 9676 2019 9300
+2 1 0 1 32 32 943 0 -1 4.000 0 0 0 0 0 5
+	 2019 9300 2418 9300 2418 9676 2019 9676 2019 9300
+2 1 0 0 33 33 942 0 20 4.000 0 0 0 0 0 5
+	 2418 9300 2817 9300 2817 9676 2418 9676 2418 9300
+2 1 0 1 32 32 941 0 -1 4.000 0 0 0 0 0 5
+	 2418 9300 2817 9300 2817 9676 2418 9676 2418 9300
+2 1 0 0 33 33 940 0 20 4.000 0 0 0 0 0 5
+	 4038 9300 4438 9300 4438 9676 4038 9676 4038 9300
+2 1 0 1 32 32 939 0 -1 4.000 0 0 0 0 0 5
+	 4038 9300 4438 9300 4438 9676 4038 9676 4038 9300
+2 1 0 0 33 33 938 0 20 4.000 0 0 0 0 0 5
+	 4438 9300 4837 9300 4837 9676 4438 9676 4438 9300
+2 1 0 1 32 32 937 0 -1 4.000 0 0 0 0 0 5
+	 4438 9300 4837 9300 4837 9676 4438 9676 4438 9300
+2 1 0 0 33 33 936 0 20 4.000 0 0 0 0 0 5
+	 4837 9300 5236 9300 5236 9676 4837 9676 4837 9300
+2 1 0 1 32 32 935 0 -1 4.000 0 0 0 0 0 5
+	 4837 9300 5236 9300 5236 9676 4837 9676 4837 9300
+2 1 0 0 33 33 934 0 20 4.000 0 0 0 0 0 5
+	 5236 9300 5635 9300 5635 9676 5236 9676 5236 9300
+2 1 0 1 32 32 933 0 -1 4.000 0 0 0 0 0 5
+	 5236 9300 5635 9300 5635 9676 5236 9676 5236 9300
+2 1 0 1 32 32 930 0 -1 4.000 0 0 0 0 0 2
+	 2819 7658 2869 7658
+2 1 0 1 32 32 929 0 -1 4.000 0 0 0 0 0 2
+	 2952 7658 3002 7658
+2 1 0 1 32 32 928 0 -1 4.000 0 0 0 0 0 2
+	 3085 7658 3135 7658
+2 1 0 1 32 32 927 0 -1 4.000 0 0 0 0 0 2
+	 3219 7658 3252 7658
+2 1 0 1 32 32 926 0 -1 4.000 0 0 0 0 0 2
+	 2819 8033 2869 8033
+2 1 0 1 32 32 925 0 -1 4.000 0 0 0 0 0 2
+	 2952 8033 3002 8033
+2 1 0 1 32 32 924 0 -1 4.000 0 0 0 0 0 2
+	 3085 8033 3135 8033
+2 1 0 1 32 32 923 0 -1 4.000 0 0 0 0 0 2
+	 3219 8033 3252 8033
+2 1 0 1 32 32 922 0 -1 4.000 0 0 0 0 0 2
+	 2819 8025 2819 8075
+2 1 0 1 32 32 921 0 -1 4.000 0 0 0 0 0 2
+	 2819 8158 2819 8208
+2 1 0 1 32 32 920 0 -1 4.000 0 0 0 0 0 2
+	 2819 8291 2819 8341
+2 1 0 1 32 32 919 0 -1 4.000 0 0 0 0 0 2
+	 2419 8025 2419 8075
+2 1 0 1 32 32 918 0 -1 4.000 0 0 0 0 0 2
+	 2419 8158 2419 8208
+2 1 0 1 32 32 917 0 -1 4.000 0 0 0 0 0 2
+	 2419 8291 2419 8341
+2 1 0 1 32 32 916 0 -1 4.000 0 0 0 0 0 2
+	 2019 8025 2019 8075
+2 1 0 1 32 32 915 0 -1 4.000 0 0 0 0 0 2
+	 2019 8158 2019 8208
+2 1 0 1 32 32 914 0 -1 4.000 0 0 0 0 0 2
+	 2019 8291 2019 8341
+2 1 0 1 32 32 913 0 -1 4.000 0 0 0 0 0 2
+	 1619 8025 1619 8075
+2 1 0 1 32 32 912 0 -1 4.000 0 0 0 0 0 2
+	 1619 8158 1619 8208
+2 1 0 1 32 32 911 0 -1 4.000 0 0 0 0 0 2
+	 1619 8291 1619 8341
+2 1 0 1 32 32 910 0 -1 4.000 0 0 0 0 0 2
+	 4036 7658 3986 7658
+2 1 0 1 32 32 909 0 -1 4.000 0 0 0 0 0 2
+	 3902 7658 3852 7658
+2 1 0 1 32 32 908 0 -1 4.000 0 0 0 0 0 2
+	 3769 7658 3719 7658
+2 1 0 1 32 32 907 0 -1 4.000 0 0 0 0 0 2
+	 3636 7658 3602 7658
+2 1 0 1 32 32 906 0 -1 4.000 0 0 0 0 0 2
+	 4036 8033 3986 8033
+2 1 0 1 32 32 905 0 -1 4.000 0 0 0 0 0 2
+	 3902 8033 3852 8033
+2 1 0 1 32 32 904 0 -1 4.000 0 0 0 0 0 2
+	 3769 8033 3719 8033
+2 1 0 1 32 32 903 0 -1 4.000 0 0 0 0 0 2
+	 3636 8033 3602 8033
+2 1 0 1 32 32 902 0 -1 4.000 0 0 0 0 0 2
+	 4035 8025 4035 8075
+2 1 0 1 32 32 901 0 -1 4.000 0 0 0 0 0 2
+	 4035 8158 4035 8208
+2 1 0 1 32 32 900 0 -1 4.000 0 0 0 0 0 2
+	 4035 8291 4035 8341
+2 1 0 1 32 32 899 0 -1 4.000 0 0 0 0 0 2
+	 4435 8025 4435 8075
+2 1 0 1 32 32 898 0 -1 4.000 0 0 0 0 0 2
+	 4435 8158 4435 8208
+2 1 0 1 32 32 897 0 -1 4.000 0 0 0 0 0 2
+	 4435 8291 4435 8341
+2 1 0 1 32 32 896 0 -1 4.000 0 0 0 0 0 2
+	 4835 8025 4835 8075
+2 1 0 1 32 32 895 0 -1 4.000 0 0 0 0 0 2
+	 4835 8158 4835 8208
+2 1 0 1 32 32 894 0 -1 4.000 0 0 0 0 0 2
+	 4835 8291 4835 8341
+2 1 0 1 32 32 893 0 -1 4.000 0 0 0 0 0 2
+	 5235 8025 5235 8075
+2 1 0 1 32 32 892 0 -1 4.000 0 0 0 0 0 2
+	 5235 8158 5235 8208
+2 1 0 1 32 32 891 0 -1 4.000 0 0 0 0 0 2
+	 5235 8291 5235 8341
+2 1 0 1 32 32 890 0 -1 4.000 0 0 0 0 0 2
+	 4036 9300 3986 9300
+2 1 0 1 32 32 889 0 -1 4.000 0 0 0 0 0 2
+	 3902 9300 3852 9300
+2 1 0 1 32 32 888 0 -1 4.000 0 0 0 0 0 2
+	 3769 9300 3719 9300
+2 1 0 1 32 32 887 0 -1 4.000 0 0 0 0 0 2
+	 3636 9300 3602 9300
+2 1 0 1 32 32 886 0 -1 4.000 0 0 0 0 0 2
+	 4036 8925 3986 8925
+2 1 0 1 32 32 885 0 -1 4.000 0 0 0 0 0 2
+	 3902 8925 3852 8925
+2 1 0 1 32 32 884 0 -1 4.000 0 0 0 0 0 2
+	 3769 8925 3719 8925
+2 1 0 1 32 32 883 0 -1 4.000 0 0 0 0 0 2
+	 3636 8925 3602 8925
+2 1 0 1 32 32 882 0 -1 4.000 0 0 0 0 0 2
+	 4035 8933 4035 8883
+2 1 0 1 32 32 881 0 -1 4.000 0 0 0 0 0 2
+	 4035 8800 4035 8750
+2 1 0 1 32 32 880 0 -1 4.000 0 0 0 0 0 2
+	 4035 8666 4035 8616
+2 1 0 1 32 32 879 0 -1 4.000 0 0 0 0 0 2
+	 4435 8933 4435 8883
+2 1 0 1 32 32 878 0 -1 4.000 0 0 0 0 0 2
+	 4435 8800 4435 8750
+2 1 0 1 32 32 877 0 -1 4.000 0 0 0 0 0 2
+	 4435 8666 4435 8616
+2 1 0 1 32 32 876 0 -1 4.000 0 0 0 0 0 2
+	 4835 8933 4835 8883
+2 1 0 1 32 32 875 0 -1 4.000 0 0 0 0 0 2
+	 4835 8800 4835 8750
+2 1 0 1 32 32 874 0 -1 4.000 0 0 0 0 0 2
+	 4835 8666 4835 8616
+2 1 0 1 32 32 873 0 -1 4.000 0 0 0 0 0 2
+	 5235 8933 5235 8883
+2 1 0 1 32 32 872 0 -1 4.000 0 0 0 0 0 2
+	 5235 8800 5235 8750
+2 1 0 1 32 32 871 0 -1 4.000 0 0 0 0 0 2
+	 5235 8666 5235 8616
+2 1 0 1 32 32 870 0 -1 4.000 0 0 0 0 0 2
+	 2819 9300 2869 9300
+2 1 0 1 32 32 869 0 -1 4.000 0 0 0 0 0 2
+	 2952 9300 3002 9300
+2 1 0 1 32 32 868 0 -1 4.000 0 0 0 0 0 2
+	 3085 9300 3135 9300
+2 1 0 1 32 32 867 0 -1 4.000 0 0 0 0 0 2
+	 3219 9300 3252 9300
+2 1 0 1 32 32 866 0 -1 4.000 0 0 0 0 0 2
+	 2819 8925 2869 8925
+2 1 0 1 32 32 865 0 -1 4.000 0 0 0 0 0 2
+	 2952 8925 3002 8925
+2 1 0 1 32 32 864 0 -1 4.000 0 0 0 0 0 2
+	 3085 8925 3135 8925
+2 1 0 1 32 32 863 0 -1 4.000 0 0 0 0 0 2
+	 3219 8925 3252 8925
+2 1 0 1 32 32 862 0 -1 4.000 0 0 0 0 0 2
+	 2819 8933 2819 8883
+2 1 0 1 32 32 861 0 -1 4.000 0 0 0 0 0 2
+	 2819 8800 2819 8750
+2 1 0 1 32 32 860 0 -1 4.000 0 0 0 0 0 2
+	 2819 8666 2819 8616
+2 1 0 1 32 32 859 0 -1 4.000 0 0 0 0 0 2
+	 2419 8933 2419 8883
+2 1 0 1 32 32 858 0 -1 4.000 0 0 0 0 0 2
+	 2419 8800 2419 8750
+2 1 0 1 32 32 857 0 -1 4.000 0 0 0 0 0 2
+	 2419 8666 2419 8616
+2 1 0 1 32 32 856 0 -1 4.000 0 0 0 0 0 2
+	 2019 8933 2019 8883
+2 1 0 1 32 32 855 0 -1 4.000 0 0 0 0 0 2
+	 2019 8800 2019 8750
+2 1 0 1 32 32 854 0 -1 4.000 0 0 0 0 0 2
+	 2019 8666 2019 8616
+2 1 0 1 32 32 853 0 -1 4.000 0 0 0 0 0 2
+	 1619 8933 1619 8883
+2 1 0 1 32 32 852 0 -1 4.000 0 0 0 0 0 2
+	 1619 8800 1619 8750
+2 1 0 1 32 32 851 0 -1 4.000 0 0 0 0 0 2
+	 1619 8666 1619 8616
+2 1 0 0 35 35 836 0 20 4.000 0 0 0 0 0 5
+	 5636 7280 6035 7280 6035 7656 5636 7656 5636 7280
+2 1 0 1 32 32 835 0 -1 4.000 0 0 0 0 0 5
+	 5636 7280 6035 7280 6035 7656 5636 7656 5636 7280
+2 1 0 0 35 35 834 0 20 4.000 0 0 0 0 0 5
+	 5636 7656 6035 7656 6035 8032 5636 8032 5636 7656
+2 1 0 1 32 32 833 0 -1 4.000 0 0 0 0 0 5
+	 5636 7656 6035 7656 6035 8032 5636 8032 5636 7656
+2 1 0 0 35 35 832 0 20 4.000 0 0 0 0 0 5
+	 5636 8924 6035 8924 6035 9300 5636 9300 5636 8924
+2 1 0 1 32 32 831 0 -1 4.000 0 0 0 0 0 5
+	 5636 8924 6035 8924 6035 9300 5636 9300 5636 8924
+2 1 0 0 35 35 830 0 20 4.000 0 0 0 0 0 5
+	 5636 9300 6035 9300 6035 9676 5636 9676 5636 9300
+2 1 0 1 32 32 829 0 -1 4.000 0 0 0 0 0 5
+	 5636 9300 6035 9300 6035 9676 5636 9676 5636 9300
+2 1 0 0 35 35 828 0 20 4.000 0 0 0 0 0 5
+	 6036 7280 6435 7280 6435 7656 6036 7656 6036 7280
+2 1 0 1 32 32 827 0 -1 4.000 0 0 0 0 0 5
+	 6036 7280 6435 7280 6435 7656 6036 7656 6036 7280
+2 1 0 0 35 35 826 0 20 4.000 0 0 0 0 0 5
+	 6036 7656 6435 7656 6435 8032 6036 8032 6036 7656
+2 1 0 1 32 32 825 0 -1 4.000 0 0 0 0 0 5
+	 6036 7656 6435 7656 6435 8032 6036 8032 6036 7656
+2 1 0 0 35 35 824 0 20 4.000 0 0 0 0 0 5
+	 6036 8924 6435 8924 6435 9300 6036 9300 6036 8924
+2 1 0 1 32 32 823 0 -1 4.000 0 0 0 0 0 5
+	 6036 8924 6435 8924 6435 9300 6036 9300 6036 8924
+2 1 0 0 35 35 822 0 20 4.000 0 0 0 0 0 5
+	 6036 9300 6435 9300 6435 9676 6036 9676 6036 9300
+2 1 0 1 32 32 821 0 -1 4.000 0 0 0 0 0 5
+	 6036 9300 6435 9300 6435 9676 6036 9676 6036 9300
+2 1 0 1 0 0 820 0 -1 4.000 0 0 0 0 0 2
+	 5635 7283 5635 9683
+2 1 0 1 0 0 819 0 -1 4.000 0 0 0 0 0 2
+	 1420 7515 6312 7515
+2 1 0 0 0 0 818 0 20 4.000 0 0 0 0 0 3
+	 6348 7518 6117 7462 6117 7515
+2 1 0 1 0 0 817 0 -1 4.000 0 0 0 0 0 3
+	 6348 7518 6117 7462 6117 7515
+2 1 0 0 0 0 816 0 20 4.000 0 0 0 0 0 3
+	 6348 7512 6117 7568 6117 7515
+2 1 0 1 0 0 815 0 -1 4.000 0 0 0 0 0 3
+	 6348 7512 6117 7568 6117 7515
+2 1 0 1 0 0 814 0 -1 4.000 0 0 0 0 0 2
+	 1420 7891 5863 7891
+2 1 0 0 0 0 813 0 20 4.000 0 0 0 0 0 3
+	 5895 7894 5685 7838 5685 7891
+2 1 0 1 0 0 812 0 -1 4.000 0 0 0 0 0 3
+	 5895 7894 5685 7838 5685 7891
+2 1 0 0 0 0 811 0 20 4.000 0 0 0 0 0 3
+	 5895 7888 5685 7944 5685 7891
+2 1 0 1 0 0 810 0 -1 4.000 0 0 0 0 0 3
+	 5895 7888 5685 7944 5685 7891
+2 1 0 1 0 0 809 0 -1 4.000 0 0 0 0 0 2
+	 1420 9112 5863 9112
+2 1 0 0 0 0 808 0 20 4.000 0 0 0 0 0 3
+	 5895 9115 5685 9059 5685 9112
+2 1 0 1 0 0 807 0 -1 4.000 0 0 0 0 0 3
+	 5895 9115 5685 9059 5685 9112
+2 1 0 0 0 0 806 0 20 4.000 0 0 0 0 0 3
+	 5895 9109 5685 9165 5685 9112
+2 1 0 1 0 0 805 0 -1 4.000 0 0 0 0 0 3
+	 5895 9109 5685 9165 5685 9112
+2 1 0 1 0 0 804 0 -1 4.000 0 0 0 0 0 2
+	 1420 9488 5863 9488
+2 1 0 0 0 0 803 0 20 4.000 0 0 0 0 0 3
+	 5895 9491 5685 9435 5685 9488
+2 1 0 1 0 0 802 0 -1 4.000 0 0 0 0 0 3
+	 5895 9491 5685 9435 5685 9488
+2 1 0 0 0 0 801 0 20 4.000 0 0 0 0 0 3
+	 5895 9485 5685 9541 5685 9488
+2 1 0 1 0 0 800 0 -1 4.000 0 0 0 0 0 3
+	 5895 9485 5685 9541 5685 9488
+2 1 0 0 36 36 771 0 20 4.000 0 0 0 0 0 5
+	 5226 5196 5623 5196 5623 5572 5226 5572 5226 5196
+2 1 0 0 37 37 770 0 20 4.000 0 0 0 0 0 5
+	 4827 5196 5226 5196 5226 5572 4827 5572 4827 5196
+2 1 0 1 38 38 769 0 -1 4.000 0 0 0 0 0 5
+	 4835 5194 5631 5194 5631 5569 4835 5569 4835 5194
+2 1 0 0 36 36 768 0 20 4.000 0 0 0 0 0 5
+	 4434 5196 4832 5196 4832 5572 4434 5572 4434 5196
+2 1 0 0 37 37 767 0 20 4.000 0 0 0 0 0 5
+	 4035 5196 4434 5196 4434 5572 4035 5572 4035 5196
+2 1 0 1 38 38 766 0 -1 4.000 0 0 0 0 0 5
+	 4044 5194 4840 5194 4840 5569 4044 5569 4044 5194
+2 1 0 0 36 36 765 0 20 4.000 0 0 0 0 0 5
+	 6026 5196 6440 5196 6440 5572 6026 5572 6026 5196
+2 1 0 0 37 37 764 0 20 4.000 0 0 0 0 0 5
+	 5627 5196 6026 5196 6026 5572 5627 5572 5627 5196
+2 1 0 1 38 38 763 0 -1 4.000 0 0 0 0 0 5
+	 5635 5194 6440 5194 6440 5569 5635 5569 5635 5194
+2 1 0 0 36 36 762 0 20 4.000 0 0 0 0 0 5
+	 5226 5571 5623 5571 5623 5947 5226 5947 5226 5571
+2 1 0 0 37 37 761 0 20 4.000 0 0 0 0 0 5
+	 4827 5571 5226 5571 5226 5947 4827 5947 4827 5571
+2 1 0 1 38 38 760 0 -1 4.000 0 0 0 0 0 5
+	 4835 5569 5631 5569 5631 5944 4835 5944 4835 5569
+2 1 0 0 36 36 759 0 20 4.000 0 0 0 0 0 5
+	 4434 5571 4832 5571 4832 5947 4434 5947 4434 5571
+2 1 0 0 37 37 758 0 20 4.000 0 0 0 0 0 5
+	 4035 5571 4434 5571 4434 5947 4035 5947 4035 5571
+2 1 0 1 38 38 757 0 -1 4.000 0 0 0 0 0 5
+	 4044 5569 4840 5569 4840 5944 4044 5944 4044 5569
+2 1 0 0 36 36 756 0 20 4.000 0 0 0 0 0 5
+	 6026 5571 6440 5571 6440 5947 6026 5947 6026 5571
+2 1 0 0 37 37 755 0 20 4.000 0 0 0 0 0 5
+	 5627 5571 6026 5571 6026 5947 5627 5947 5627 5571
+2 1 0 1 38 38 754 0 -1 4.000 0 0 0 0 0 5
+	 5635 5569 6440 5569 6440 5944 5635 5944 5635 5569
+2 1 0 0 36 36 753 0 20 4.000 0 0 0 0 0 5
+	 2409 5571 2807 5571 2807 5947 2409 5947 2409 5571
+2 1 0 0 37 37 752 0 20 4.000 0 0 0 0 0 5
+	 2010 5571 2409 5571 2409 5947 2010 5947 2010 5571
+2 1 0 1 38 38 751 0 -1 4.000 0 0 0 0 0 5
+	 2019 5561 2815 5561 2815 5936 2019 5936 2019 5561
+2 1 0 0 36 36 750 0 20 4.000 0 0 0 0 0 5
+	 1618 5571 2015 5571 2015 5947 1618 5947 1618 5571
+2 1 0 0 37 37 749 0 20 4.000 0 0 0 0 0 5
+	 1219 5571 1618 5571 1618 5947 1219 5947 1219 5571
+2 1 0 1 38 38 748 0 -1 4.000 0 0 0 0 0 5
+	 1227 5561 2023 5561 2023 5939 1227 5939 1227 5561
+2 1 0 0 36 36 747 0 20 4.000 0 0 0 0 0 5
+	 2409 5196 2807 5196 2807 5572 2409 5572 2409 5196
+2 1 0 0 37 37 746 0 20 4.000 0 0 0 0 0 5
+	 2010 5196 2409 5196 2409 5572 2010 5572 2010 5196
+2 1 0 1 38 38 745 0 -1 4.000 0 0 0 0 0 5
+	 2019 5186 2815 5186 2815 5561 2019 5561 2019 5186
+2 1 0 0 36 36 744 0 20 4.000 0 0 0 0 0 5
+	 1618 5196 2015 5196 2015 5572 1618 5572 1618 5196
+2 1 0 0 37 37 743 0 20 4.000 0 0 0 0 0 5
+	 1219 5196 1618 5196 1618 5572 1219 5572 1219 5196
+2 1 0 1 38 38 742 0 -1 4.000 0 0 0 0 0 5
+	 1227 5186 2023 5186 2023 5561 1227 5561 1227 5186
+2 1 0 0 36 36 741 0 20 4.000 0 0 0 0 0 5
+	 5226 3546 5623 3546 5623 3922 5226 3922 5226 3546
+2 1 0 0 37 37 740 0 20 4.000 0 0 0 0 0 5
+	 4827 3546 5226 3546 5226 3922 4827 3922 4827 3546
+2 1 0 1 32 32 739 0 -1 4.000 0 0 0 0 0 5
+	 4835 3544 5631 3544 5631 3919 4835 3919 4835 3544
+2 1 0 0 36 36 738 0 20 4.000 0 0 0 0 0 5
+	 4434 3546 4832 3546 4832 3922 4434 3922 4434 3546
+2 1 0 0 37 37 737 0 20 4.000 0 0 0 0 0 5
+	 4035 3546 4434 3546 4434 3922 4035 3922 4035 3546
+2 1 0 1 38 38 736 0 -1 4.000 0 0 0 0 0 5
+	 4044 3544 4840 3544 4840 3919 4044 3919 4044 3544
+2 1 0 0 36 36 735 0 20 4.000 0 0 0 0 0 5
+	 5990 3546 6432 3546 6432 3955 5990 3955 5990 3546
+2 1 0 0 37 37 734 0 20 4.000 0 0 0 0 0 5
+	 5627 3546 6026 3546 6026 3922 5627 3922 5627 3546
+2 1 0 1 38 38 733 0 -1 4.000 0 0 0 0 0 5
+	 5635 3544 6440 3544 6440 3919 5635 3919 5635 3544
+2 1 0 0 36 36 732 0 20 4.000 0 0 0 0 0 5
+	 5226 3921 5623 3921 5623 4297 5226 4297 5226 3921
+2 1 0 0 37 37 731 0 20 4.000 0 0 0 0 0 5
+	 4827 3921 5226 3921 5226 4297 4827 4297 4827 3921
+2 1 0 1 38 38 730 0 -1 4.000 0 0 0 0 0 5
+	 4835 3919 5631 3919 5631 4294 4835 4294 4835 3919
+2 1 0 0 36 36 729 0 20 4.000 0 0 0 0 0 5
+	 4434 3921 4832 3921 4832 4297 4434 4297 4434 3921
+2 1 0 0 37 37 728 0 20 4.000 0 0 0 0 0 5
+	 4035 3921 4434 3921 4434 4297 4035 4297 4035 3921
+2 1 0 1 38 38 727 0 -1 4.000 0 0 0 0 0 5
+	 4044 3919 4840 3919 4840 4294 4044 4294 4044 3919
+2 1 0 0 36 36 726 0 20 4.000 0 0 0 0 0 5
+	 6026 3921 6432 3921 6432 4297 6026 4297 6026 3921
+2 1 0 0 37 37 725 0 20 4.000 0 0 0 0 0 5
+	 5627 3921 6026 3921 6026 4297 5627 4297 5627 3921
+2 1 0 1 38 38 724 0 -1 4.000 0 0 0 0 0 5
+	 5635 3919 6440 3919 6440 4294 5635 4294 5635 3919
+2 1 0 0 36 36 723 0 20 4.000 0 0 0 0 0 5
+	 2409 3921 2807 3921 2807 4297 2409 4297 2409 3921
+2 1 0 0 37 37 722 0 20 4.000 0 0 0 0 0 5
+	 2010 3921 2409 3921 2409 4297 2010 4297 2010 3921
+2 1 0 1 38 38 721 0 -1 4.000 0 0 0 0 0 5
+	 2019 3919 2815 3919 2815 4294 2019 4294 2019 3919
+2 1 0 0 36 36 720 0 20 4.000 0 0 0 0 0 5
+	 1618 3921 2015 3921 2015 4297 1618 4297 1618 3921
+2 1 0 0 37 37 719 0 20 4.000 0 0 0 0 0 5
+	 1219 3921 1618 3921 1618 4297 1219 4297 1219 3921
+2 1 0 1 38 38 718 0 -1 4.000 0 0 0 0 0 5
+	 1227 3919 2023 3919 2023 4294 1227 4294 1227 3919
+2 1 0 0 36 36 717 0 20 4.000 0 0 0 0 0 5
+	 2409 3546 2815 3546 2815 3922 2409 3922 2409 3546
+2 1 0 0 37 37 716 0 20 4.000 0 0 0 0 0 5
+	 2010 3546 2409 3546 2409 3922 2010 3922 2010 3546
+2 1 0 1 38 38 715 0 -1 4.000 0 0 0 0 0 5
+	 2019 3544 2815 3544 2815 3919 2019 3919 2019 3544
+2 1 0 0 36 36 714 0 20 4.000 0 0 0 0 0 5
+	 1618 3546 2015 3546 2015 3922 1618 3922 1618 3546
+2 1 0 0 37 37 713 0 20 4.000 0 0 0 0 0 5
+	 1219 3546 1618 3546 1618 3922 1219 3922 1219 3546
+2 1 0 1 38 38 712 0 -1 4.000 0 0 0 0 0 5
+	 1227 3544 2023 3544 2023 3919 1227 3919 1227 3544
+2 1 0 1 32 32 711 0 -1 4.000 0 0 0 0 0 5
+	 1221 3546 6440 3546 6440 5941 1221 5941 1221 3546
+2 1 0 1 32 32 708 0 -1 4.000 0 0 0 0 0 2
+	 2819 3915 2869 3915
+2 1 0 1 32 32 707 0 -1 4.000 0 0 0 0 0 2
+	 2952 3915 3002 3915
+2 1 0 1 32 32 706 0 -1 4.000 0 0 0 0 0 2
+	 3085 3915 3135 3915
+2 1 0 1 32 32 705 0 -1 4.000 0 0 0 0 0 2
+	 3219 3915 3252 3915
+2 1 0 1 32 32 704 0 -1 4.000 0 0 0 0 0 2
+	 2819 4290 2869 4290
+2 1 0 1 32 32 703 0 -1 4.000 0 0 0 0 0 2
+	 2952 4290 3002 4290
+2 1 0 1 32 32 702 0 -1 4.000 0 0 0 0 0 2
+	 3085 4290 3135 4290
+2 1 0 1 32 32 701 0 -1 4.000 0 0 0 0 0 2
+	 3219 4290 3252 4290
+2 1 0 1 32 32 700 0 -1 4.000 0 0 0 0 0 2
+	 2819 4282 2819 4332
+2 1 0 1 32 32 699 0 -1 4.000 0 0 0 0 0 2
+	 2819 4415 2819 4465
+2 1 0 1 32 32 698 0 -1 4.000 0 0 0 0 0 2
+	 2819 4548 2819 4598
+2 1 0 1 32 32 697 0 -1 4.000 0 0 0 0 0 2
+	 2019 4282 2019 4332
+2 1 0 1 32 32 696 0 -1 4.000 0 0 0 0 0 2
+	 2019 4415 2019 4465
+2 1 0 1 32 32 695 0 -1 4.000 0 0 0 0 0 2
+	 2019 4548 2019 4598
+2 1 0 1 32 32 694 0 -1 4.000 0 0 0 0 0 2
+	 4036 3915 3986 3915
+2 1 0 1 32 32 693 0 -1 4.000 0 0 0 0 0 2
+	 3902 3915 3852 3915
+2 1 0 1 32 32 692 0 -1 4.000 0 0 0 0 0 2
+	 3769 3915 3719 3915
+2 1 0 1 32 32 691 0 -1 4.000 0 0 0 0 0 2
+	 3636 3915 3602 3915
+2 1 0 1 32 32 690 0 -1 4.000 0 0 0 0 0 2
+	 4036 4290 3986 4290
+2 1 0 1 32 32 689 0 -1 4.000 0 0 0 0 0 2
+	 3902 4290 3852 4290
+2 1 0 1 32 32 688 0 -1 4.000 0 0 0 0 0 2
+	 3769 4290 3719 4290
+2 1 0 1 32 32 687 0 -1 4.000 0 0 0 0 0 2
+	 3636 4290 3602 4290
+2 1 0 1 32 32 686 0 -1 4.000 0 0 0 0 0 2
+	 4035 4282 4035 4332
+2 1 0 1 32 32 685 0 -1 4.000 0 0 0 0 0 2
+	 4035 4415 4035 4465
+2 1 0 1 32 32 684 0 -1 4.000 0 0 0 0 0 2
+	 4035 4548 4035 4598
+2 1 0 1 32 32 683 0 -1 4.000 0 0 0 0 0 2
+	 4835 4282 4835 4332
+2 1 0 1 32 32 682 0 -1 4.000 0 0 0 0 0 2
+	 4835 4415 4835 4465
+2 1 0 1 32 32 681 0 -1 4.000 0 0 0 0 0 2
+	 4835 4548 4835 4598
+2 1 0 1 32 32 680 0 -1 4.000 0 0 0 0 0 2
+	 4036 5565 3986 5565
+2 1 0 1 32 32 679 0 -1 4.000 0 0 0 0 0 2
+	 3902 5565 3852 5565
+2 1 0 1 32 32 678 0 -1 4.000 0 0 0 0 0 2
+	 3769 5565 3719 5565
+2 1 0 1 32 32 677 0 -1 4.000 0 0 0 0 0 2
+	 3636 5565 3602 5565
+2 1 0 1 32 32 676 0 -1 4.000 0 0 0 0 0 2
+	 4036 5190 3986 5190
+2 1 0 1 32 32 675 0 -1 4.000 0 0 0 0 0 2
+	 3902 5190 3852 5190
+2 1 0 1 32 32 674 0 -1 4.000 0 0 0 0 0 2
+	 3769 5190 3719 5190
+2 1 0 1 32 32 673 0 -1 4.000 0 0 0 0 0 2
+	 3636 5190 3602 5190
+2 1 0 1 32 32 672 0 -1 4.000 0 0 0 0 0 2
+	 4035 5198 4035 5148
+2 1 0 1 32 32 671 0 -1 4.000 0 0 0 0 0 2
+	 4035 5065 4035 5015
+2 1 0 1 32 32 670 0 -1 4.000 0 0 0 0 0 2
+	 4035 4932 4035 4882
+2 1 0 1 32 32 669 0 -1 4.000 0 0 0 0 0 2
+	 4835 5198 4835 5148
+2 1 0 1 32 32 668 0 -1 4.000 0 0 0 0 0 2
+	 4835 5065 4835 5015
+2 1 0 1 32 32 667 0 -1 4.000 0 0 0 0 0 2
+	 4835 4932 4835 4882
+2 1 0 1 32 32 666 0 -1 4.000 0 0 0 0 0 2
+	 2819 5565 2869 5565
+2 1 0 1 32 32 665 0 -1 4.000 0 0 0 0 0 2
+	 2952 5565 3002 5565
+2 1 0 1 32 32 664 0 -1 4.000 0 0 0 0 0 2
+	 3085 5565 3135 5565
+2 1 0 1 32 32 663 0 -1 4.000 0 0 0 0 0 2
+	 3219 5565 3252 5565
+2 1 0 1 32 32 662 0 -1 4.000 0 0 0 0 0 2
+	 2819 5190 2869 5190
+2 1 0 1 32 32 661 0 -1 4.000 0 0 0 0 0 2
+	 2952 5190 3002 5190
+2 1 0 1 32 32 660 0 -1 4.000 0 0 0 0 0 2
+	 3085 5190 3135 5190
+2 1 0 1 32 32 659 0 -1 4.000 0 0 0 0 0 2
+	 3219 5190 3252 5190
+2 1 0 1 32 32 658 0 -1 4.000 0 0 0 0 0 2
+	 2819 5198 2819 5148
+2 1 0 1 32 32 657 0 -1 4.000 0 0 0 0 0 2
+	 2819 5065 2819 5015
+2 1 0 1 32 32 656 0 -1 4.000 0 0 0 0 0 2
+	 2819 4932 2819 4882
+2 1 0 1 32 32 655 0 -1 4.000 0 0 0 0 0 2
+	 2019 5198 2019 5148
+2 1 0 1 32 32 654 0 -1 4.000 0 0 0 0 0 2
+	 2019 5065 2019 5015
+2 1 0 1 32 32 653 0 -1 4.000 0 0 0 0 0 2
+	 2019 4932 2019 4882
+2 1 0 1 32 32 640 0 -1 4.000 0 0 0 0 0 2
+	 5635 4282 5635 4332
+2 1 0 1 32 32 639 0 -1 4.000 0 0 0 0 0 2
+	 5635 4415 5635 4465
+2 1 0 1 32 32 638 0 -1 4.000 0 0 0 0 0 2
+	 5635 4548 5635 4598
+2 1 0 1 32 32 637 0 -1 4.000 0 0 0 0 0 2
+	 5635 5198 5635 5148
+2 1 0 1 32 32 636 0 -1 4.000 0 0 0 0 0 2
+	 5635 5065 5635 5015
+2 1 0 1 32 32 635 0 -1 4.000 0 0 0 0 0 2
+	 5635 4932 5635 4882
+2 1 0 1 0 0 634 0 -1 4.000 0 0 0 0 0 2
+	 1420 3781 6312 3781
+2 1 0 0 0 0 633 0 20 4.000 0 0 0 0 0 3
+	 6348 3784 6117 3728 6117 3781
+2 1 0 1 0 0 632 0 -1 4.000 0 0 0 0 0 3
+	 6348 3784 6117 3728 6117 3781
+2 1 0 0 0 0 631 0 20 4.000 0 0 0 0 0 3
+	 6348 3778 6117 3834 6117 3781
+2 1 0 1 0 0 630 0 -1 4.000 0 0 0 0 0 3
+	 6348 3778 6117 3834 6117 3781
+2 1 0 1 0 0 629 0 -1 4.000 0 0 0 0 0 2
+	 1420 4169 6312 4169
+2 1 0 0 0 0 628 0 20 4.000 0 0 0 0 0 3
+	 6348 4172 6117 4116 6117 4169
+2 1 0 1 0 0 627 0 -1 4.000 0 0 0 0 0 3
+	 6348 4172 6117 4116 6117 4169
+2 1 0 0 0 0 626 0 20 4.000 0 0 0 0 0 3
+	 6348 4166 6117 4222 6117 4169
+2 1 0 1 0 0 625 0 -1 4.000 0 0 0 0 0 3
+	 6348 4166 6117 4222 6117 4169
+2 1 0 1 0 0 624 0 -1 4.000 0 0 0 0 0 2
+	 1420 5390 6312 5390
+2 1 0 0 0 0 623 0 20 4.000 0 0 0 0 0 3
+	 6348 5393 6117 5337 6117 5390
+2 1 0 1 0 0 622 0 -1 4.000 0 0 0 0 0 3
+	 6348 5393 6117 5337 6117 5390
+2 1 0 0 0 0 621 0 20 4.000 0 0 0 0 0 3
+	 6348 5387 6117 5443 6117 5390
+2 1 0 1 0 0 620 0 -1 4.000 0 0 0 0 0 3
+	 6348 5387 6117 5443 6117 5390
+2 1 0 1 0 0 619 0 -1 4.000 0 0 0 0 0 2
+	 1420 5766 6312 5766
+2 1 0 0 0 0 618 0 20 4.000 0 0 0 0 0 3
+	 6348 5769 6117 5713 6117 5766
+2 1 0 1 0 0 617 0 -1 4.000 0 0 0 0 0 3
+	 6348 5769 6117 5713 6117 5766
+2 1 0 0 0 0 616 0 20 4.000 0 0 0 0 0 3
+	 6348 5763 6117 5819 6117 5766
+2 1 0 1 0 0 615 0 -1 4.000 0 0 0 0 0 3
+	 6348 5763 6117 5819 6117 5766
+2 1 0 0 33 33 614 0 20 4.000 0 0 0 0 0 5
+	 1469 6215 1868 6215 1868 6591 1469 6591 1469 6215
+2 1 0 1 32 32 613 0 -1 4.000 0 0 0 0 0 5
+	 1469 6215 1868 6215 1868 6591 1469 6591 1469 6215
+2 1 0 0 36 36 610 0 20 4.000 0 0 0 0 0 5
+	 4026 6217 4432 6217 4432 6593 4026 6593 4026 6217
+2 1 0 0 37 37 609 0 20 4.000 0 0 0 0 0 5
+	 3627 6217 4026 6217 4026 6593 3627 6593 3627 6217
+2 1 0 1 38 38 608 0 -1 4.000 0 0 0 0 0 5
+	 3635 6215 4440 6215 4440 6590 3635 6590 3635 6215
+2 1 0 0 7 7 591 0 20 4.000 0 0 0 0 0 5
+	 1221 495 5635 495 5635 2890 1221 2890 1221 495
+2 1 0 1 32 32 590 0 -1 4.000 0 0 0 0 0 5
+	 1221 495 5635 495 5635 2890 1221 2890 1221 495
+2 1 0 0 33 33 589 0 20 4.000 0 0 0 0 0 5
+	 1221 495 1620 495 1620 871 1221 871 1221 495
+2 1 0 1 32 32 588 0 -1 4.000 0 0 0 0 0 5
+	 1221 495 1620 495 1620 871 1221 871 1221 495
+2 1 0 0 33 33 587 0 20 4.000 0 0 0 0 0 5
+	 1620 495 2019 495 2019 871 1620 871 1620 495
+2 1 0 1 32 32 586 0 -1 4.000 0 0 0 0 0 5
+	 1620 495 2019 495 2019 871 1620 871 1620 495
+2 1 0 0 33 33 585 0 20 4.000 0 0 0 0 0 5
+	 2019 495 2418 495 2418 871 2019 871 2019 495
+2 1 0 1 32 32 584 0 -1 4.000 0 0 0 0 0 5
+	 2019 495 2418 495 2418 871 2019 871 2019 495
+2 1 0 0 33 33 583 0 20 4.000 0 0 0 0 0 5
+	 2418 495 2817 495 2817 871 2418 871 2418 495
+2 1 0 1 32 32 582 0 -1 4.000 0 0 0 0 0 5
+	 2418 495 2817 495 2817 871 2418 871 2418 495
+2 1 0 0 33 33 581 0 20 4.000 0 0 0 0 0 5
+	 4038 495 4438 495 4438 871 4038 871 4038 495
+2 1 0 1 32 32 580 0 -1 4.000 0 0 0 0 0 5
+	 4038 495 4438 495 4438 871 4038 871 4038 495
+2 1 0 0 33 33 579 0 20 4.000 0 0 0 0 0 5
+	 4438 495 4837 495 4837 871 4438 871 4438 495
+2 1 0 1 32 32 578 0 -1 4.000 0 0 0 0 0 5
+	 4438 495 4837 495 4837 871 4438 871 4438 495
+2 1 0 0 33 33 577 0 20 4.000 0 0 0 0 0 5
+	 4837 495 5236 495 5236 871 4837 871 4837 495
+2 1 0 1 32 32 576 0 -1 4.000 0 0 0 0 0 5
+	 4837 495 5236 495 5236 871 4837 871 4837 495
+2 1 0 0 33 33 575 0 20 4.000 0 0 0 0 0 5
+	 5236 495 5635 495 5635 871 5236 871 5236 495
+2 1 0 1 32 32 574 0 -1 4.000 0 0 0 0 0 5
+	 5236 495 5635 495 5635 871 5236 871 5236 495
+2 1 0 0 33 33 573 0 20 4.000 0 0 0 0 0 5
+	 1221 871 1620 871 1620 1247 1221 1247 1221 871
+2 1 0 1 32 32 572 0 -1 4.000 0 0 0 0 0 5
+	 1221 871 1620 871 1620 1247 1221 1247 1221 871
+2 1 0 0 33 33 571 0 20 4.000 0 0 0 0 0 5
+	 1620 871 2019 871 2019 1247 1620 1247 1620 871
+2 1 0 1 32 32 570 0 -1 4.000 0 0 0 0 0 5
+	 1620 871 2019 871 2019 1247 1620 1247 1620 871
+2 1 0 0 33 33 569 0 20 4.000 0 0 0 0 0 5
+	 2019 871 2418 871 2418 1247 2019 1247 2019 871
+2 1 0 1 32 32 568 0 -1 4.000 0 0 0 0 0 5
+	 2019 871 2418 871 2418 1247 2019 1247 2019 871
+2 1 0 0 33 33 567 0 20 4.000 0 0 0 0 0 5
+	 2418 871 2817 871 2817 1247 2418 1247 2418 871
+2 1 0 1 32 32 566 0 -1 4.000 0 0 0 0 0 5
+	 2418 871 2817 871 2817 1247 2418 1247 2418 871
+2 1 0 0 33 33 565 0 20 4.000 0 0 0 0 0 5
+	 4038 871 4438 871 4438 1247 4038 1247 4038 871
+2 1 0 1 32 32 564 0 -1 4.000 0 0 0 0 0 5
+	 4038 871 4438 871 4438 1247 4038 1247 4038 871
+2 1 0 0 33 33 563 0 20 4.000 0 0 0 0 0 5
+	 4438 871 4837 871 4837 1247 4438 1247 4438 871
+2 1 0 1 32 32 562 0 -1 4.000 0 0 0 0 0 5
+	 4438 871 4837 871 4837 1247 4438 1247 4438 871
+2 1 0 0 33 33 561 0 20 4.000 0 0 0 0 0 5
+	 4837 871 5236 871 5236 1247 4837 1247 4837 871
+2 1 0 1 32 32 560 0 -1 4.000 0 0 0 0 0 5
+	 4837 871 5236 871 5236 1247 4837 1247 4837 871
+2 1 0 0 33 33 559 0 20 4.000 0 0 0 0 0 5
+	 5236 871 5635 871 5635 1247 5236 1247 5236 871
+2 1 0 1 32 32 558 0 -1 4.000 0 0 0 0 0 5
+	 5236 871 5635 871 5635 1247 5236 1247 5236 871
+2 1 0 0 33 33 557 0 20 4.000 0 0 0 0 0 5
+	 1221 2139 1620 2139 1620 2515 1221 2515 1221 2139
+2 1 0 1 32 32 556 0 -1 4.000 0 0 0 0 0 5
+	 1221 2139 1620 2139 1620 2515 1221 2515 1221 2139
+2 1 0 0 33 33 555 0 20 4.000 0 0 0 0 0 5
+	 1620 2139 2019 2139 2019 2515 1620 2515 1620 2139
+2 1 0 1 32 32 554 0 -1 4.000 0 0 0 0 0 5
+	 1620 2139 2019 2139 2019 2515 1620 2515 1620 2139
+2 1 0 0 33 33 553 0 20 4.000 0 0 0 0 0 5
+	 2019 2139 2418 2139 2418 2515 2019 2515 2019 2139
+2 1 0 1 32 32 552 0 -1 4.000 0 0 0 0 0 5
+	 2019 2139 2418 2139 2418 2515 2019 2515 2019 2139
+2 1 0 0 33 33 551 0 20 4.000 0 0 0 0 0 5
+	 2418 2139 2817 2139 2817 2515 2418 2515 2418 2139
+2 1 0 1 32 32 550 0 -1 4.000 0 0 0 0 0 5
+	 2418 2139 2817 2139 2817 2515 2418 2515 2418 2139
+2 1 0 0 33 33 549 0 20 4.000 0 0 0 0 0 5
+	 4038 2139 4438 2139 4438 2515 4038 2515 4038 2139
+2 1 0 1 32 32 548 0 -1 4.000 0 0 0 0 0 5
+	 4038 2139 4438 2139 4438 2515 4038 2515 4038 2139
+2 1 0 0 33 33 547 0 20 4.000 0 0 0 0 0 5
+	 4438 2139 4837 2139 4837 2515 4438 2515 4438 2139
+2 1 0 1 32 32 546 0 -1 4.000 0 0 0 0 0 5
+	 4438 2139 4837 2139 4837 2515 4438 2515 4438 2139
+2 1 0 0 33 33 545 0 20 4.000 0 0 0 0 0 5
+	 4837 2139 5236 2139 5236 2515 4837 2515 4837 2139
+2 1 0 1 32 32 544 0 -1 4.000 0 0 0 0 0 5
+	 4837 2139 5236 2139 5236 2515 4837 2515 4837 2139
+2 1 0 0 33 33 543 0 20 4.000 0 0 0 0 0 5
+	 5236 2139 5635 2139 5635 2515 5236 2515 5236 2139
+2 1 0 1 32 32 542 0 -1 4.000 0 0 0 0 0 5
+	 5236 2139 5635 2139 5635 2515 5236 2515 5236 2139
+2 1 0 0 33 33 541 0 20 4.000 0 0 0 0 0 5
+	 1221 2515 1620 2515 1620 2890 1221 2890 1221 2515
+2 1 0 1 32 32 540 0 -1 4.000 0 0 0 0 0 5
+	 1221 2515 1620 2515 1620 2890 1221 2890 1221 2515
+2 1 0 0 33 33 539 0 20 4.000 0 0 0 0 0 5
+	 1620 2515 2019 2515 2019 2890 1620 2890 1620 2515
+2 1 0 1 32 32 538 0 -1 4.000 0 0 0 0 0 5
+	 1620 2515 2019 2515 2019 2890 1620 2890 1620 2515
+2 1 0 0 33 33 537 0 20 4.000 0 0 0 0 0 5
+	 2019 2515 2418 2515 2418 2890 2019 2890 2019 2515
+2 1 0 1 32 32 536 0 -1 4.000 0 0 0 0 0 5
+	 2019 2515 2418 2515 2418 2890 2019 2890 2019 2515
+2 1 0 0 33 33 535 0 20 4.000 0 0 0 0 0 5
+	 2418 2515 2817 2515 2817 2890 2418 2890 2418 2515
+2 1 0 1 32 32 534 0 -1 4.000 0 0 0 0 0 5
+	 2418 2515 2817 2515 2817 2890 2418 2890 2418 2515
+2 1 0 0 33 33 533 0 20 4.000 0 0 0 0 0 5
+	 4038 2515 4438 2515 4438 2890 4038 2890 4038 2515
+2 1 0 1 32 32 532 0 -1 4.000 0 0 0 0 0 5
+	 4038 2515 4438 2515 4438 2890 4038 2890 4038 2515
+2 1 0 0 33 33 531 0 20 4.000 0 0 0 0 0 5
+	 4438 2515 4837 2515 4837 2890 4438 2890 4438 2515
+2 1 0 1 32 32 530 0 -1 4.000 0 0 0 0 0 5
+	 4438 2515 4837 2515 4837 2890 4438 2890 4438 2515
+2 1 0 0 33 33 529 0 20 4.000 0 0 0 0 0 5
+	 4837 2515 5236 2515 5236 2890 4837 2890 4837 2515
+2 1 0 1 32 32 528 0 -1 4.000 0 0 0 0 0 5
+	 4837 2515 5236 2515 5236 2890 4837 2890 4837 2515
+2 1 0 0 33 33 527 0 20 4.000 0 0 0 0 0 5
+	 5236 2515 5635 2515 5635 2890 5236 2890 5236 2515
+2 1 0 1 32 32 526 0 -1 4.000 0 0 0 0 0 5
+	 5236 2515 5635 2515 5635 2890 5236 2890 5236 2515
+2 1 0 1 0 0 525 0 -1 4.000 0 0 0 0 0 2
+	 1420 730 5459 730
+2 1 0 0 0 0 524 0 20 4.000 0 0 0 0 0 3
+	 5488 733 5298 677 5298 730
+2 1 0 1 0 0 523 0 -1 4.000 0 0 0 0 0 3
+	 5488 733 5298 677 5298 730
+2 1 0 0 0 0 522 0 20 4.000 0 0 0 0 0 3
+	 5488 727 5298 783 5298 730
+2 1 0 1 0 0 521 0 -1 4.000 0 0 0 0 0 3
+	 5488 727 5298 783 5298 730
+2 1 0 1 32 32 518 0 -1 4.000 0 0 0 0 0 2
+	 2819 873 2869 873
+2 1 0 1 32 32 517 0 -1 4.000 0 0 0 0 0 2
+	 2952 873 3002 873
+2 1 0 1 32 32 516 0 -1 4.000 0 0 0 0 0 2
+	 3085 873 3135 873
+2 1 0 1 32 32 515 0 -1 4.000 0 0 0 0 0 2
+	 3219 873 3252 873
+2 1 0 1 32 32 514 0 -1 4.000 0 0 0 0 0 2
+	 2819 1248 2869 1248
+2 1 0 1 32 32 513 0 -1 4.000 0 0 0 0 0 2
+	 2952 1248 3002 1248
+2 1 0 1 32 32 512 0 -1 4.000 0 0 0 0 0 2
+	 3085 1248 3135 1248
+2 1 0 1 32 32 511 0 -1 4.000 0 0 0 0 0 2
+	 3219 1248 3252 1248
+2 1 0 1 32 32 510 0 -1 4.000 0 0 0 0 0 2
+	 2819 1240 2819 1290
+2 1 0 1 32 32 509 0 -1 4.000 0 0 0 0 0 2
+	 2819 1373 2819 1423
+2 1 0 1 32 32 508 0 -1 4.000 0 0 0 0 0 2
+	 2819 1506 2819 1556
+2 1 0 1 32 32 507 0 -1 4.000 0 0 0 0 0 2
+	 2419 1240 2419 1290
+2 1 0 1 32 32 506 0 -1 4.000 0 0 0 0 0 2
+	 2419 1373 2419 1423
+2 1 0 1 32 32 505 0 -1 4.000 0 0 0 0 0 2
+	 2419 1506 2419 1556
+2 1 0 1 32 32 504 0 -1 4.000 0 0 0 0 0 2
+	 2019 1240 2019 1290
+2 1 0 1 32 32 503 0 -1 4.000 0 0 0 0 0 2
+	 2019 1373 2019 1423
+2 1 0 1 32 32 502 0 -1 4.000 0 0 0 0 0 2
+	 2019 1506 2019 1556
+2 1 0 1 32 32 501 0 -1 4.000 0 0 0 0 0 2
+	 1619 1240 1619 1290
+2 1 0 1 32 32 500 0 -1 4.000 0 0 0 0 0 2
+	 1619 1373 1619 1423
+2 1 0 1 32 32 499 0 -1 4.000 0 0 0 0 0 2
+	 1619 1506 1619 1556
+2 1 0 1 32 32 498 0 -1 4.000 0 0 0 0 0 2
+	 4036 873 3986 873
+2 1 0 1 32 32 497 0 -1 4.000 0 0 0 0 0 2
+	 3902 873 3852 873
+2 1 0 1 32 32 496 0 -1 4.000 0 0 0 0 0 2
+	 3769 873 3719 873
+2 1 0 1 32 32 495 0 -1 4.000 0 0 0 0 0 2
+	 3636 873 3602 873
+2 1 0 1 32 32 494 0 -1 4.000 0 0 0 0 0 2
+	 4036 1248 3986 1248
+2 1 0 1 32 32 493 0 -1 4.000 0 0 0 0 0 2
+	 3902 1248 3852 1248
+2 1 0 1 32 32 492 0 -1 4.000 0 0 0 0 0 2
+	 3769 1248 3719 1248
+2 1 0 1 32 32 491 0 -1 4.000 0 0 0 0 0 2
+	 3636 1248 3602 1248
+2 1 0 1 32 32 490 0 -1 4.000 0 0 0 0 0 2
+	 4035 1240 4035 1290
+2 1 0 1 32 32 489 0 -1 4.000 0 0 0 0 0 2
+	 4035 1373 4035 1423
+2 1 0 1 32 32 488 0 -1 4.000 0 0 0 0 0 2
+	 4035 1506 4035 1556
+2 1 0 1 32 32 487 0 -1 4.000 0 0 0 0 0 2
+	 4435 1240 4435 1290
+2 1 0 1 32 32 486 0 -1 4.000 0 0 0 0 0 2
+	 4435 1373 4435 1423
+2 1 0 1 32 32 485 0 -1 4.000 0 0 0 0 0 2
+	 4435 1506 4435 1556
+2 1 0 1 32 32 484 0 -1 4.000 0 0 0 0 0 2
+	 4835 1240 4835 1290
+2 1 0 1 32 32 483 0 -1 4.000 0 0 0 0 0 2
+	 4835 1373 4835 1423
+2 1 0 1 32 32 482 0 -1 4.000 0 0 0 0 0 2
+	 4835 1506 4835 1556
+2 1 0 1 32 32 481 0 -1 4.000 0 0 0 0 0 2
+	 5235 1240 5235 1290
+2 1 0 1 32 32 480 0 -1 4.000 0 0 0 0 0 2
+	 5235 1373 5235 1423
+2 1 0 1 32 32 479 0 -1 4.000 0 0 0 0 0 2
+	 5235 1506 5235 1556
+2 1 0 1 32 32 478 0 -1 4.000 0 0 0 0 0 2
+	 4036 2515 3986 2515
+2 1 0 1 32 32 477 0 -1 4.000 0 0 0 0 0 2
+	 3902 2515 3852 2515
+2 1 0 1 32 32 476 0 -1 4.000 0 0 0 0 0 2
+	 3769 2515 3719 2515
+2 1 0 1 32 32 475 0 -1 4.000 0 0 0 0 0 2
+	 3636 2515 3602 2515
+2 1 0 1 32 32 474 0 -1 4.000 0 0 0 0 0 2
+	 4036 2140 3986 2140
+2 1 0 1 32 32 473 0 -1 4.000 0 0 0 0 0 2
+	 3902 2140 3852 2140
+2 1 0 1 32 32 472 0 -1 4.000 0 0 0 0 0 2
+	 3769 2140 3719 2140
+2 1 0 1 32 32 471 0 -1 4.000 0 0 0 0 0 2
+	 3636 2140 3602 2140
+2 1 0 1 32 32 470 0 -1 4.000 0 0 0 0 0 2
+	 4035 2148 4035 2098
+2 1 0 1 32 32 469 0 -1 4.000 0 0 0 0 0 2
+	 4035 2015 4035 1965
+2 1 0 1 32 32 468 0 -1 4.000 0 0 0 0 0 2
+	 4035 1881 4035 1831
+2 1 0 1 32 32 467 0 -1 4.000 0 0 0 0 0 2
+	 4435 2148 4435 2098
+2 1 0 1 32 32 466 0 -1 4.000 0 0 0 0 0 2
+	 4435 2015 4435 1965
+2 1 0 1 32 32 465 0 -1 4.000 0 0 0 0 0 2
+	 4435 1881 4435 1831
+2 1 0 1 32 32 464 0 -1 4.000 0 0 0 0 0 2
+	 4835 2148 4835 2098
+2 1 0 1 32 32 463 0 -1 4.000 0 0 0 0 0 2
+	 4835 2015 4835 1965
+2 1 0 1 32 32 462 0 -1 4.000 0 0 0 0 0 2
+	 4835 1881 4835 1831
+2 1 0 1 32 32 461 0 -1 4.000 0 0 0 0 0 2
+	 5235 2148 5235 2098
+2 1 0 1 32 32 460 0 -1 4.000 0 0 0 0 0 2
+	 5235 2015 5235 1965
+2 1 0 1 32 32 459 0 -1 4.000 0 0 0 0 0 2
+	 5235 1881 5235 1831
+2 1 0 1 32 32 458 0 -1 4.000 0 0 0 0 0 2
+	 2819 2515 2869 2515
+2 1 0 1 32 32 457 0 -1 4.000 0 0 0 0 0 2
+	 2952 2515 3002 2515
+2 1 0 1 32 32 456 0 -1 4.000 0 0 0 0 0 2
+	 3085 2515 3135 2515
+2 1 0 1 32 32 455 0 -1 4.000 0 0 0 0 0 2
+	 3219 2515 3252 2515
+2 1 0 1 32 32 454 0 -1 4.000 0 0 0 0 0 2
+	 2819 2140 2869 2140
+2 1 0 1 32 32 453 0 -1 4.000 0 0 0 0 0 2
+	 2952 2140 3002 2140
+2 1 0 1 32 32 452 0 -1 4.000 0 0 0 0 0 2
+	 3085 2140 3135 2140
+2 1 0 1 32 32 451 0 -1 4.000 0 0 0 0 0 2
+	 3219 2140 3252 2140
+2 1 0 1 32 32 450 0 -1 4.000 0 0 0 0 0 2
+	 2819 2148 2819 2098
+2 1 0 1 32 32 449 0 -1 4.000 0 0 0 0 0 2
+	 2819 2015 2819 1965
+2 1 0 1 32 32 448 0 -1 4.000 0 0 0 0 0 2
+	 2819 1881 2819 1831
+2 1 0 1 32 32 447 0 -1 4.000 0 0 0 0 0 2
+	 2419 2148 2419 2098
+2 1 0 1 32 32 446 0 -1 4.000 0 0 0 0 0 2
+	 2419 2015 2419 1965
+2 1 0 1 32 32 445 0 -1 4.000 0 0 0 0 0 2
+	 2419 1881 2419 1831
+2 1 0 1 32 32 444 0 -1 4.000 0 0 0 0 0 2
+	 2019 2148 2019 2098
+2 1 0 1 32 32 443 0 -1 4.000 0 0 0 0 0 2
+	 2019 2015 2019 1965
+2 1 0 1 32 32 442 0 -1 4.000 0 0 0 0 0 2
+	 2019 1881 2019 1831
+2 1 0 1 32 32 441 0 -1 4.000 0 0 0 0 0 2
+	 1619 2148 1619 2098
+2 1 0 1 32 32 440 0 -1 4.000 0 0 0 0 0 2
+	 1619 2015 1619 1965
+2 1 0 1 32 32 439 0 -1 4.000 0 0 0 0 0 2
+	 1619 1881 1619 1831
+2 1 0 1 0 0 426 0 -1 4.000 0 0 0 0 0 2
+	 1420 1106 5459 1106
+2 1 0 0 0 0 425 0 20 4.000 0 0 0 0 0 3
+	 5488 1109 5298 1053 5298 1106
+2 1 0 1 0 0 424 0 -1 4.000 0 0 0 0 0 3
+	 5488 1109 5298 1053 5298 1106
+2 1 0 0 0 0 423 0 20 4.000 0 0 0 0 0 3
+	 5488 1103 5298 1159 5298 1106
+2 1 0 1 0 0 422 0 -1 4.000 0 0 0 0 0 3
+	 5488 1103 5298 1159 5298 1106
+2 1 0 1 0 0 421 0 -1 4.000 0 0 0 0 0 2
+	 1420 2327 5459 2327
+2 1 0 0 0 0 420 0 20 4.000 0 0 0 0 0 3
+	 5488 2330 5298 2274 5298 2327
+2 1 0 1 0 0 419 0 -1 4.000 0 0 0 0 0 3
+	 5488 2330 5298 2274 5298 2327
+2 1 0 0 0 0 418 0 20 4.000 0 0 0 0 0 3
+	 5488 2324 5298 2380 5298 2327
+2 1 0 1 0 0 417 0 -1 4.000 0 0 0 0 0 3
+	 5488 2324 5298 2380 5298 2327
+2 1 0 1 0 0 416 0 -1 4.000 0 0 0 0 0 2
+	 1420 2703 5459 2703
+2 1 0 0 0 0 415 0 20 4.000 0 0 0 0 0 3
+	 5488 2706 5298 2650 5298 2703
+2 1 0 1 0 0 414 0 -1 4.000 0 0 0 0 0 3
+	 5488 2706 5298 2650 5298 2703
+2 1 0 0 0 0 413 0 20 4.000 0 0 0 0 0 3
+	 5488 2700 5298 2755 5298 2703
+2 1 0 1 0 0 412 0 -1 4.000 0 0 0 0 0 3
+	 5488 2700 5298 2755 5298 2703
+2 1 0 1 0 0 389 0 -1 4.000 0 0 0 0 0 2
+	 273 3662 273 3039
+2 1 0 1 0 0 388 0 -1 4.000 0 0 0 0 0 2
+	 382 3920 156 3662
+2 1 0 1 0 0 387 0 -1 4.000 0 0 0 0 0 2
+	 273 3662 148 3662
+2 1 0 1 0 0 386 0 -1 4.000 0 0 0 0 0 2
+	 487 3662 487 3039
+2 1 0 1 0 0 385 0 -1 4.000 0 0 0 0 0 2
+	 378 3920 604 3662
+2 1 0 1 0 0 384 0 -1 4.000 0 0 0 0 0 2
+	 487 3662 612 3662
+2 1 0 1 0 0 383 0 -1 4.000 0 0 0 0 0 2
+	 273 6130 273 6753
+2 1 0 1 0 0 382 0 -1 4.000 0 0 0 0 0 2
+	 382 5872 156 6130
+2 1 0 1 0 0 381 0 -1 4.000 0 0 0 0 0 2
+	 273 6130 148 6130
+2 1 0 1 0 0 380 0 -1 4.000 0 0 0 0 0 2
+	 487 6129 487 6753
+2 1 0 1 0 0 379 0 -1 4.000 0 0 0 0 0 2
+	 378 5872 604 6129
+2 1 0 1 0 0 378 0 -1 4.000 0 0 0 0 0 2
+	 487 6129 612 6129
+4 0 0 931 -1 16 15 0.0000 4 30 135 2064 7726 ...\001
+4 0 0 849 -1 16 13 0.0000 4 150 270 2338 7001 ny \001
+4 0 34 848 -1 16 13 0.0000 4 180 870 2605 7001 + 2-ny%2\001
+4 0 0 847 -1 16 13 0.0000 4 180 1110 3500 7001  = 2*(ny/2+1)\001
+4 0 0 845 -1 16 13 0.0000 4 105 195 681 8451 nx\001
+4 0 0 843 -1 16 10 0.0000 4 120 90 1364 7179 0\001
+4 0 0 841 -1 16 10 0.0000 4 150 345 6097 7179 ny+1\001
+4 0 0 839 -1 16 10 0.0000 4 120 90 1064 7479 0\001
+4 0 0 837 -1 16 10 0.0000 4 120 375 864 9479 nx-1\001
+4 0 0 798 -1 16 25 1.5708 4 360 1575 6250 9461 (padding)\001
+4 0 0 796 -1 18 16 1.5708 4 240 1695 428 9283 input, in-place\001
+4 0 0 794 -1 16 10 0.0000 4 120 90 1264 7429 0\001
+4 0 0 792 -1 16 10 0.0000 4 120 90 1681 7429 1\001
+4 0 0 790 -1 16 10 0.0000 4 120 90 2081 7429 2\001
+4 0 0 788 -1 16 10 0.0000 4 120 90 2481 7429 3\001
+4 0 0 786 -1 16 10 0.0000 4 150 360 4081 7429 ny-4\001
+4 0 0 784 -1 16 10 0.0000 4 150 360 4897 7429 ny-2\001
+4 0 0 782 -1 16 10 0.0000 4 150 360 5297 7429 ny-1\001
+4 0 0 780 -1 16 10 0.0000 4 150 360 4497 7429 ny-3\001
+4 0 0 778 -1 16 10 0.0000 4 150 345 1264 7795 ny+2\001
+4 0 0 776 -1 16 10 0.0000 4 150 345 1664 7795 ny+3\001
+4 0 0 774 -1 16 10 0.0000 4 120 165 5681 7429 ny\001
+4 0 0 772 -1 16 10 0.0000 4 150 345 6081 7429 ny+1\001
+4 0 0 709 -1 16 15 0.0000 4 30 135 2064 3993 ...\001
+4 0 0 651 -1 16 13 0.0000 4 180 585 3181 3267 ny/2+1\001
+4 0 0 649 -1 16 13 0.0000 4 105 195 681 4717 nx\001
+4 0 0 647 -1 16 10 0.0000 4 120 90 1564 3445 0\001
+4 0 0 645 -1 16 10 0.0000 4 150 300 5831 3445 ny/2\001
+4 0 0 643 -1 16 10 0.0000 4 120 90 1064 3745 0\001
+4 0 0 641 -1 16 10 0.0000 4 120 375 864 5745 nx-1\001
+4 0 0 611 -1 16 13 0.0000 4 165 855 1981 6463 = double\001
+4 0 0 606 -1 16 13 0.0000 4 180 1230 4547 6463 = fftw_complex\001
+4 0 0 604 -1 18 16 1.5708 4 225 780 428 5128 output\001
+4 0 0 602 -1 16 10 0.0000 4 120 90 1264 3679 0\001
+4 0 0 600 -1 16 10 0.0000 4 120 90 2081 3679 1\001
+4 0 0 598 -1 16 10 0.0000 4 150 495 4097 3679 ny/2-2\001
+4 0 0 596 -1 16 10 0.0000 4 150 495 4914 3679 ny/2-1\001
+4 0 0 594 -1 16 10 0.0000 4 150 480 1264 4062 ny/2+1\001
+4 0 0 592 -1 16 10 0.0000 4 150 300 5697 3679 ny/2\001
+4 0 0 519 -1 16 15 0.0000 4 30 135 2064 943 ...\001
+4 0 0 437 -1 16 13 0.0000 4 150 210 3381 217 ny\001
+4 0 0 435 -1 16 13 0.0000 4 105 195 681 1667 nx\001
+4 0 0 433 -1 16 10 0.0000 4 120 90 1364 395 0\001
+4 0 0 431 -1 16 10 0.0000 4 150 360 5281 395 ny-1\001
+4 0 0 429 -1 16 10 0.0000 4 120 90 1064 695 0\001
+4 0 0 427 -1 16 10 0.0000 4 120 375 864 2695 nx-1\001
+4 0 0 410 -1 18 16 1.5708 4 240 2235 428 2734 input, out-of-place\001
+4 0 0 408 -1 16 10 0.0000 4 120 90 1264 629 0\001
+4 0 0 406 -1 16 10 0.0000 4 120 90 1681 629 1\001
+4 0 0 404 -1 16 10 0.0000 4 120 90 2081 629 2\001
+4 0 0 402 -1 16 10 0.0000 4 120 90 2481 629 3\001
+4 0 0 400 -1 16 10 0.0000 4 150 360 4081 629 ny-4\001
+4 0 0 398 -1 16 10 0.0000 4 150 360 4897 629 ny-2\001
+4 0 0 396 -1 16 10 0.0000 4 150 360 5297 629 ny-1\001
+4 0 0 394 -1 16 10 0.0000 4 150 360 4497 629 ny-3\001
+4 0 0 392 -1 16 10 0.0000 4 120 165 1264 1012 ny\001
+4 0 0 390 -1 16 10 0.0000 4 150 345 1664 1012 ny+1\001
+-6
+4 0 0 932 -1 16 15 0.0000 4 15 60 74 89  \001
+4 0 0 850 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 846 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 844 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 842 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 840 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 838 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 799 -1 16 25 0.0000 4 15 90 74 89  \001
+4 0 0 797 -1 18 16 0.0000 4 15 60 74 89  \001
+4 0 0 795 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 793 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 791 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 789 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 787 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 785 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 783 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 781 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 779 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 777 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 775 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 773 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 710 -1 16 15 0.0000 4 15 60 74 89  \001
+4 0 0 652 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 650 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 648 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 646 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 644 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 642 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 612 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 607 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 605 -1 18 16 0.0000 4 15 60 74 89  \001
+4 0 0 603 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 601 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 599 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 597 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 595 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 593 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 520 -1 16 15 0.0000 4 15 60 74 89  \001
+4 0 0 438 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 436 -1 16 13 0.0000 4 15 60 74 89  \001
+4 0 0 434 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 432 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 430 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 428 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 411 -1 18 16 0.0000 4 15 60 74 89  \001
+4 0 0 409 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 407 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 405 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 403 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 401 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 399 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 397 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 395 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 393 -1 16 10 0.0000 4 15 45 74 89  \001
+4 0 0 391 -1 16 10 0.0000 4 15 45 1425 4800  \001
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/rfftwnd.pdf
Binary file src/fftw-3.3.3/doc/rfftwnd.pdf has changed
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/stamp-vti
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/stamp-vti	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+@set UPDATED 25 November 2012
+@set UPDATED-MONTH November 2012
+@set EDITION 3.3.3
+@set VERSION 3.3.3
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/texinfo.tex
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/texinfo.tex	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,9977 @@
+% texinfo.tex -- TeX macros to handle Texinfo files.
+% 
+% Load plain if necessary, i.e., if running under initex.
+\expandafter\ifx\csname fmtname\endcsname\relax\input plain\fi
+%
+\def\texinfoversion{2012-03-11.15}
+%
+% Copyright 1985, 1986, 1988, 1990, 1991, 1992, 1993, 1994, 1995,
+% 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+% 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+%
+% This texinfo.tex file is free software: you can redistribute it and/or
+% modify it under the terms of the GNU General Public License as
+% published by the Free Software Foundation, either version 3 of the
+% License, or (at your option) any later version.
+%
+% This texinfo.tex file is distributed in the hope that it will be
+% useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+% of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+% General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%
+% As a special exception, when this file is read by TeX when processing
+% a Texinfo source document, you may use the result without
+% restriction.  (This has been our intent since Texinfo was invented.)
+%
+% Please try the latest version of texinfo.tex before submitting bug
+% reports; you can get the latest version from:
+%   http://www.gnu.org/software/texinfo/ (the Texinfo home page), or
+%   ftp://tug.org/tex/texinfo.tex
+%     (and all CTAN mirrors, see http://www.ctan.org).
+% The texinfo.tex in any given distribution could well be out
+% of date, so if that's what you're using, please check.
+%
+% Send bug reports to bug-texinfo@gnu.org.  Please include including a
+% complete document in each bug report with which we can reproduce the
+% problem.  Patches are, of course, greatly appreciated.
+%
+% To process a Texinfo manual with TeX, it's most reliable to use the
+% texi2dvi shell script that comes with the distribution.  For a simple
+% manual foo.texi, however, you can get away with this:
+%   tex foo.texi
+%   texindex foo.??
+%   tex foo.texi
+%   tex foo.texi
+%   dvips foo.dvi -o  # or whatever; this makes foo.ps.
+% The extra TeX runs get the cross-reference information correct.
+% Sometimes one run after texindex suffices, and sometimes you need more
+% than two; texi2dvi does it as many times as necessary.
+%
+% It is possible to adapt texinfo.tex for other languages, to some
+% extent.  You can get the existing language-specific files from the
+% full Texinfo distribution.
+%
+% The GNU Texinfo home page is http://www.gnu.org/software/texinfo.
+
+
+\message{Loading texinfo [version \texinfoversion]:}
+
+% If in a .fmt file, print the version number
+% and turn on active characters that we couldn't do earlier because
+% they might have appeared in the input file name.
+\everyjob{\message{[Texinfo version \texinfoversion]}%
+  \catcode`+=\active \catcode`\_=\active}
+
+\chardef\other=12
+
+% We never want plain's \outer definition of \+ in Texinfo.
+% For @tex, we can use \tabalign.
+\let\+ = \relax
+
+% Save some plain tex macros whose names we will redefine.
+\let\ptexb=\b
+\let\ptexbullet=\bullet
+\let\ptexc=\c
+\let\ptexcomma=\,
+\let\ptexdot=\.
+\let\ptexdots=\dots
+\let\ptexend=\end
+\let\ptexequiv=\equiv
+\let\ptexexclam=\!
+\let\ptexfootnote=\footnote
+\let\ptexgtr=>
+\let\ptexhat=^
+\let\ptexi=\i
+\let\ptexindent=\indent
+\let\ptexinsert=\insert
+\let\ptexlbrace=\{
+\let\ptexless=<
+\let\ptexnewwrite\newwrite
+\let\ptexnoindent=\noindent
+\let\ptexplus=+
+\let\ptexraggedright=\raggedright
+\let\ptexrbrace=\}
+\let\ptexslash=\/
+\let\ptexstar=\*
+\let\ptext=\t
+\let\ptextop=\top
+{\catcode`\'=\active \global\let\ptexquoteright'}% active in plain's math mode
+
+% If this character appears in an error message or help string, it
+% starts a new line in the output.
+\newlinechar = `^^J
+
+% Use TeX 3.0's \inputlineno to get the line number, for better error
+% messages, but if we're using an old version of TeX, don't do anything.
+%
+\ifx\inputlineno\thisisundefined
+  \let\linenumber = \empty % Pre-3.0.
+\else
+  \def\linenumber{l.\the\inputlineno:\space}
+\fi
+
+% Set up fixed words for English if not already set.
+\ifx\putwordAppendix\undefined  \gdef\putwordAppendix{Appendix}\fi
+\ifx\putwordChapter\undefined   \gdef\putwordChapter{Chapter}\fi
+\ifx\putworderror\undefined     \gdef\putworderror{error}\fi
+\ifx\putwordfile\undefined      \gdef\putwordfile{file}\fi
+\ifx\putwordin\undefined        \gdef\putwordin{in}\fi
+\ifx\putwordIndexIsEmpty\undefined       \gdef\putwordIndexIsEmpty{(Index is empty)}\fi
+\ifx\putwordIndexNonexistent\undefined   \gdef\putwordIndexNonexistent{(Index is nonexistent)}\fi
+\ifx\putwordInfo\undefined      \gdef\putwordInfo{Info}\fi
+\ifx\putwordInstanceVariableof\undefined \gdef\putwordInstanceVariableof{Instance Variable of}\fi
+\ifx\putwordMethodon\undefined  \gdef\putwordMethodon{Method on}\fi
+\ifx\putwordNoTitle\undefined   \gdef\putwordNoTitle{No Title}\fi
+\ifx\putwordof\undefined        \gdef\putwordof{of}\fi
+\ifx\putwordon\undefined        \gdef\putwordon{on}\fi
+\ifx\putwordpage\undefined      \gdef\putwordpage{page}\fi
+\ifx\putwordsection\undefined   \gdef\putwordsection{section}\fi
+\ifx\putwordSection\undefined   \gdef\putwordSection{Section}\fi
+\ifx\putwordsee\undefined       \gdef\putwordsee{see}\fi
+\ifx\putwordSee\undefined       \gdef\putwordSee{See}\fi
+\ifx\putwordShortTOC\undefined  \gdef\putwordShortTOC{Short Contents}\fi
+\ifx\putwordTOC\undefined       \gdef\putwordTOC{Table of Contents}\fi
+%
+\ifx\putwordMJan\undefined \gdef\putwordMJan{January}\fi
+\ifx\putwordMFeb\undefined \gdef\putwordMFeb{February}\fi
+\ifx\putwordMMar\undefined \gdef\putwordMMar{March}\fi
+\ifx\putwordMApr\undefined \gdef\putwordMApr{April}\fi
+\ifx\putwordMMay\undefined \gdef\putwordMMay{May}\fi
+\ifx\putwordMJun\undefined \gdef\putwordMJun{June}\fi
+\ifx\putwordMJul\undefined \gdef\putwordMJul{July}\fi
+\ifx\putwordMAug\undefined \gdef\putwordMAug{August}\fi
+\ifx\putwordMSep\undefined \gdef\putwordMSep{September}\fi
+\ifx\putwordMOct\undefined \gdef\putwordMOct{October}\fi
+\ifx\putwordMNov\undefined \gdef\putwordMNov{November}\fi
+\ifx\putwordMDec\undefined \gdef\putwordMDec{December}\fi
+%
+\ifx\putwordDefmac\undefined    \gdef\putwordDefmac{Macro}\fi
+\ifx\putwordDefspec\undefined   \gdef\putwordDefspec{Special Form}\fi
+\ifx\putwordDefvar\undefined    \gdef\putwordDefvar{Variable}\fi
+\ifx\putwordDefopt\undefined    \gdef\putwordDefopt{User Option}\fi
+\ifx\putwordDeffunc\undefined   \gdef\putwordDeffunc{Function}\fi
+
+% Since the category of space is not known, we have to be careful.
+\chardef\spacecat = 10
+\def\spaceisspace{\catcode`\ =\spacecat}
+
+% sometimes characters are active, so we need control sequences.
+\chardef\ampChar   = `\&
+\chardef\colonChar = `\:
+\chardef\commaChar = `\,
+\chardef\dashChar  = `\-
+\chardef\dotChar   = `\.
+\chardef\exclamChar= `\!
+\chardef\hashChar  = `\#
+\chardef\lquoteChar= `\`
+\chardef\questChar = `\?
+\chardef\rquoteChar= `\'
+\chardef\semiChar  = `\;
+\chardef\slashChar = `\/
+\chardef\underChar = `\_
+
+% Ignore a token.
+%
+\def\gobble#1{}
+
+% The following is used inside several \edef's.
+\def\makecsname#1{\expandafter\noexpand\csname#1\endcsname}
+
+% Hyphenation fixes.
+\hyphenation{
+  Flor-i-da Ghost-script Ghost-view Mac-OS Post-Script
+  ap-pen-dix bit-map bit-maps
+  data-base data-bases eshell fall-ing half-way long-est man-u-script
+  man-u-scripts mini-buf-fer mini-buf-fers over-view par-a-digm
+  par-a-digms rath-er rec-tan-gu-lar ro-bot-ics se-vere-ly set-up spa-ces
+  spell-ing spell-ings
+  stand-alone strong-est time-stamp time-stamps which-ever white-space
+  wide-spread wrap-around
+}
+
+% Margin to add to right of even pages, to left of odd pages.
+\newdimen\bindingoffset
+\newdimen\normaloffset
+\newdimen\pagewidth \newdimen\pageheight
+
+% For a final copy, take out the rectangles
+% that mark overfull boxes (in case you have decided
+% that the text looks ok even though it passes the margin).
+%
+\def\finalout{\overfullrule=0pt }
+
+% Sometimes it is convenient to have everything in the transcript file
+% and nothing on the terminal.  We don't just call \tracingall here,
+% since that produces some useless output on the terminal.  We also make
+% some effort to order the tracing commands to reduce output in the log
+% file; cf. trace.sty in LaTeX.
+%
+\def\gloggingall{\begingroup \globaldefs = 1 \loggingall \endgroup}%
+\def\loggingall{%
+  \tracingstats2
+  \tracingpages1
+  \tracinglostchars2  % 2 gives us more in etex
+  \tracingparagraphs1
+  \tracingoutput1
+  \tracingmacros2
+  \tracingrestores1
+  \showboxbreadth\maxdimen \showboxdepth\maxdimen
+  \ifx\eTeXversion\thisisundefined\else % etex gives us more logging
+    \tracingscantokens1
+    \tracingifs1
+    \tracinggroups1
+    \tracingnesting2
+    \tracingassigns1
+  \fi
+  \tracingcommands3  % 3 gives us more in etex
+  \errorcontextlines16
+}%
+
+% @errormsg{MSG}.  Do the index-like expansions on MSG, but if things
+% aren't perfect, it's not the end of the world, being an error message,
+% after all.
+% 
+\def\errormsg{\begingroup \indexnofonts \doerrormsg}
+\def\doerrormsg#1{\errmessage{#1}}
+
+% add check for \lastpenalty to plain's definitions.  If the last thing
+% we did was a \nobreak, we don't want to insert more space.
+%
+\def\smallbreak{\ifnum\lastpenalty<10000\par\ifdim\lastskip<\smallskipamount
+  \removelastskip\penalty-50\smallskip\fi\fi}
+\def\medbreak{\ifnum\lastpenalty<10000\par\ifdim\lastskip<\medskipamount
+  \removelastskip\penalty-100\medskip\fi\fi}
+\def\bigbreak{\ifnum\lastpenalty<10000\par\ifdim\lastskip<\bigskipamount
+  \removelastskip\penalty-200\bigskip\fi\fi}
+
+% Do @cropmarks to get crop marks.
+%
+\newif\ifcropmarks
+\let\cropmarks = \cropmarkstrue
+%
+% Dimensions to add cropmarks at corners.
+% Added by P. A. MacKay, 12 Nov. 1986
+%
+\newdimen\outerhsize \newdimen\outervsize % set by the paper size routines
+\newdimen\cornerlong  \cornerlong=1pc
+\newdimen\cornerthick \cornerthick=.3pt
+\newdimen\topandbottommargin \topandbottommargin=.75in
+
+% Output a mark which sets \thischapter, \thissection and \thiscolor.
+% We dump everything together because we only have one kind of mark.
+% This works because we only use \botmark / \topmark, not \firstmark.
+%
+% A mark contains a subexpression of the \ifcase ... \fi construct.
+% \get*marks macros below extract the needed part using \ifcase.
+%
+% Another complication is to let the user choose whether \thischapter
+% (\thissection) refers to the chapter (section) in effect at the top
+% of a page, or that at the bottom of a page.  The solution is
+% described on page 260 of The TeXbook.  It involves outputting two
+% marks for the sectioning macros, one before the section break, and
+% one after.  I won't pretend I can describe this better than DEK...
+\def\domark{%
+  \toks0=\expandafter{\lastchapterdefs}%
+  \toks2=\expandafter{\lastsectiondefs}%
+  \toks4=\expandafter{\prevchapterdefs}%
+  \toks6=\expandafter{\prevsectiondefs}%
+  \toks8=\expandafter{\lastcolordefs}%
+  \mark{%
+                   \the\toks0 \the\toks2
+      \noexpand\or \the\toks4 \the\toks6
+    \noexpand\else \the\toks8
+  }%
+}
+% \topmark doesn't work for the very first chapter (after the title
+% page or the contents), so we use \firstmark there -- this gets us
+% the mark with the chapter defs, unless the user sneaks in, e.g.,
+% @setcolor (or @url, or @link, etc.) between @contents and the very
+% first @chapter.
+\def\gettopheadingmarks{%
+  \ifcase0\topmark\fi
+  \ifx\thischapter\empty \ifcase0\firstmark\fi \fi
+}
+\def\getbottomheadingmarks{\ifcase1\botmark\fi}
+\def\getcolormarks{\ifcase2\topmark\fi}
+
+% Avoid "undefined control sequence" errors.
+\def\lastchapterdefs{}
+\def\lastsectiondefs{}
+\def\prevchapterdefs{}
+\def\prevsectiondefs{}
+\def\lastcolordefs{}
+
+% Main output routine.
+\chardef\PAGE = 255
+\output = {\onepageout{\pagecontents\PAGE}}
+
+\newbox\headlinebox
+\newbox\footlinebox
+
+% \onepageout takes a vbox as an argument.  Note that \pagecontents
+% does insertions, but you have to call it yourself.
+\def\onepageout#1{%
+  \ifcropmarks \hoffset=0pt \else \hoffset=\normaloffset \fi
+  %
+  \ifodd\pageno  \advance\hoffset by \bindingoffset
+  \else \advance\hoffset by -\bindingoffset\fi
+  %
+  % Do this outside of the \shipout so @code etc. will be expanded in
+  % the headline as they should be, not taken literally (outputting ''code).
+  \ifodd\pageno \getoddheadingmarks \else \getevenheadingmarks \fi
+  \setbox\headlinebox = \vbox{\let\hsize=\pagewidth \makeheadline}%
+  \ifodd\pageno \getoddfootingmarks \else \getevenfootingmarks \fi
+  \setbox\footlinebox = \vbox{\let\hsize=\pagewidth \makefootline}%
+  %
+  {%
+    % Have to do this stuff outside the \shipout because we want it to
+    % take effect in \write's, yet the group defined by the \vbox ends
+    % before the \shipout runs.
+    %
+    \indexdummies         % don't expand commands in the output.
+    \normalturnoffactive  % \ in index entries must not stay \, e.g., if
+               % the page break happens to be in the middle of an example.
+               % We don't want .vr (or whatever) entries like this:
+               % \entry{{\tt \indexbackslash }acronym}{32}{\code {\acronym}}
+               % "\acronym" won't work when it's read back in;
+               % it needs to be
+               % {\code {{\tt \backslashcurfont }acronym}
+    \shipout\vbox{%
+      % Do this early so pdf references go to the beginning of the page.
+      \ifpdfmakepagedest \pdfdest name{\the\pageno} xyz\fi
+      %
+      \ifcropmarks \vbox to \outervsize\bgroup
+        \hsize = \outerhsize
+        \vskip-\topandbottommargin
+        \vtop to0pt{%
+          \line{\ewtop\hfil\ewtop}%
+          \nointerlineskip
+          \line{%
+            \vbox{\moveleft\cornerthick\nstop}%
+            \hfill
+            \vbox{\moveright\cornerthick\nstop}%
+          }%
+          \vss}%
+        \vskip\topandbottommargin
+        \line\bgroup
+          \hfil % center the page within the outer (page) hsize.
+          \ifodd\pageno\hskip\bindingoffset\fi
+          \vbox\bgroup
+      \fi
+      %
+      \unvbox\headlinebox
+      \pagebody{#1}%
+      \ifdim\ht\footlinebox > 0pt
+        % Only leave this space if the footline is nonempty.
+        % (We lessened \vsize for it in \oddfootingyyy.)
+        % The \baselineskip=24pt in plain's \makefootline has no effect.
+        \vskip 24pt
+        \unvbox\footlinebox
+      \fi
+      %
+      \ifcropmarks
+          \egroup % end of \vbox\bgroup
+        \hfil\egroup % end of (centering) \line\bgroup
+        \vskip\topandbottommargin plus1fill minus1fill
+        \boxmaxdepth = \cornerthick
+        \vbox to0pt{\vss
+          \line{%
+            \vbox{\moveleft\cornerthick\nsbot}%
+            \hfill
+            \vbox{\moveright\cornerthick\nsbot}%
+          }%
+          \nointerlineskip
+          \line{\ewbot\hfil\ewbot}%
+        }%
+      \egroup % \vbox from first cropmarks clause
+      \fi
+    }% end of \shipout\vbox
+  }% end of group with \indexdummies
+  \advancepageno
+  \ifnum\outputpenalty>-20000 \else\dosupereject\fi
+}
+
+\newinsert\margin \dimen\margin=\maxdimen
+
+\def\pagebody#1{\vbox to\pageheight{\boxmaxdepth=\maxdepth #1}}
+{\catcode`\@ =11
+\gdef\pagecontents#1{\ifvoid\topins\else\unvbox\topins\fi
+% marginal hacks, juha@viisa.uucp (Juha Takala)
+\ifvoid\margin\else % marginal info is present
+  \rlap{\kern\hsize\vbox to\z@{\kern1pt\box\margin \vss}}\fi
+\dimen@=\dp#1\relax \unvbox#1\relax
+\ifvoid\footins\else\vskip\skip\footins\footnoterule \unvbox\footins\fi
+\ifr@ggedbottom \kern-\dimen@ \vfil \fi}
+}
+
+% Here are the rules for the cropmarks.  Note that they are
+% offset so that the space between them is truly \outerhsize or \outervsize
+% (P. A. MacKay, 12 November, 1986)
+%
+\def\ewtop{\vrule height\cornerthick depth0pt width\cornerlong}
+\def\nstop{\vbox
+  {\hrule height\cornerthick depth\cornerlong width\cornerthick}}
+\def\ewbot{\vrule height0pt depth\cornerthick width\cornerlong}
+\def\nsbot{\vbox
+  {\hrule height\cornerlong depth\cornerthick width\cornerthick}}
+
+% Parse an argument, then pass it to #1.  The argument is the rest of
+% the input line (except we remove a trailing comment).  #1 should be a
+% macro which expects an ordinary undelimited TeX argument.
+%
+\def\parsearg{\parseargusing{}}
+\def\parseargusing#1#2{%
+  \def\argtorun{#2}%
+  \begingroup
+    \obeylines
+    \spaceisspace
+    #1%
+    \parseargline\empty% Insert the \empty token, see \finishparsearg below.
+}
+
+{\obeylines %
+  \gdef\parseargline#1^^M{%
+    \endgroup % End of the group started in \parsearg.
+    \argremovecomment #1\comment\ArgTerm%
+  }%
+}
+
+% First remove any @comment, then any @c comment.
+\def\argremovecomment#1\comment#2\ArgTerm{\argremovec #1\c\ArgTerm}
+\def\argremovec#1\c#2\ArgTerm{\argcheckspaces#1\^^M\ArgTerm}
+
+% Each occurrence of `\^^M' or `<space>\^^M' is replaced by a single space.
+%
+% \argremovec might leave us with trailing space, e.g.,
+%    @end itemize  @c foo
+% This space token undergoes the same procedure and is eventually removed
+% by \finishparsearg.
+%
+\def\argcheckspaces#1\^^M{\argcheckspacesX#1\^^M \^^M}
+\def\argcheckspacesX#1 \^^M{\argcheckspacesY#1\^^M}
+\def\argcheckspacesY#1\^^M#2\^^M#3\ArgTerm{%
+  \def\temp{#3}%
+  \ifx\temp\empty
+    % Do not use \next, perhaps the caller of \parsearg uses it; reuse \temp:
+    \let\temp\finishparsearg
+  \else
+    \let\temp\argcheckspaces
+  \fi
+  % Put the space token in:
+  \temp#1 #3\ArgTerm
+}
+
+% If a _delimited_ argument is enclosed in braces, they get stripped; so
+% to get _exactly_ the rest of the line, we had to prevent such situation.
+% We prepended an \empty token at the very beginning and we expand it now,
+% just before passing the control to \argtorun.
+% (Similarly, we have to think about #3 of \argcheckspacesY above: it is
+% either the null string, or it ends with \^^M---thus there is no danger
+% that a pair of braces would be stripped.
+%
+% But first, we have to remove the trailing space token.
+%
+\def\finishparsearg#1 \ArgTerm{\expandafter\argtorun\expandafter{#1}}
+
+% \parseargdef\foo{...}
+%	is roughly equivalent to
+% \def\foo{\parsearg\Xfoo}
+% \def\Xfoo#1{...}
+%
+% Actually, I use \csname\string\foo\endcsname, ie. \\foo, as it is my
+% favourite TeX trick.  --kasal, 16nov03
+
+\def\parseargdef#1{%
+  \expandafter \doparseargdef \csname\string#1\endcsname #1%
+}
+\def\doparseargdef#1#2{%
+  \def#2{\parsearg#1}%
+  \def#1##1%
+}
+
+% Several utility definitions with active space:
+{
+  \obeyspaces
+  \gdef\obeyedspace{ }
+
+  % Make each space character in the input produce a normal interword
+  % space in the output.  Don't allow a line break at this space, as this
+  % is used only in environments like @example, where each line of input
+  % should produce a line of output anyway.
+  %
+  \gdef\sepspaces{\obeyspaces\let =\tie}
+
+  % If an index command is used in an @example environment, any spaces
+  % therein should become regular spaces in the raw index file, not the
+  % expansion of \tie (\leavevmode \penalty \@M \ ).
+  \gdef\unsepspaces{\let =\space}
+}
+
+
+\def\flushcr{\ifx\par\lisppar \def\next##1{}\else \let\next=\relax \fi \next}
+
+% Define the framework for environments in texinfo.tex.  It's used like this:
+%
+%   \envdef\foo{...}
+%   \def\Efoo{...}
+%
+% It's the responsibility of \envdef to insert \begingroup before the
+% actual body; @end closes the group after calling \Efoo.  \envdef also
+% defines \thisenv, so the current environment is known; @end checks
+% whether the environment name matches.  The \checkenv macro can also be
+% used to check whether the current environment is the one expected.
+%
+% Non-false conditionals (@iftex, @ifset) don't fit into this, so they
+% are not treated as environments; they don't open a group.  (The
+% implementation of @end takes care not to call \endgroup in this
+% special case.)
+
+
+% At run-time, environments start with this:
+\def\startenvironment#1{\begingroup\def\thisenv{#1}}
+% initialize
+\let\thisenv\empty
+
+% ... but they get defined via ``\envdef\foo{...}'':
+\long\def\envdef#1#2{\def#1{\startenvironment#1#2}}
+\def\envparseargdef#1#2{\parseargdef#1{\startenvironment#1#2}}
+
+% Check whether we're in the right environment:
+\def\checkenv#1{%
+  \def\temp{#1}%
+  \ifx\thisenv\temp
+  \else
+    \badenverr
+  \fi
+}
+
+% Environment mismatch, #1 expected:
+\def\badenverr{%
+  \errhelp = \EMsimple
+  \errmessage{This command can appear only \inenvironment\temp,
+    not \inenvironment\thisenv}%
+}
+\def\inenvironment#1{%
+  \ifx#1\empty
+    outside of any environment%
+  \else
+    in environment \expandafter\string#1%
+  \fi
+}
+
+% @end foo executes the definition of \Efoo.
+% But first, it executes a specialized version of \checkenv
+%
+\parseargdef\end{%
+  \if 1\csname iscond.#1\endcsname
+  \else
+    % The general wording of \badenverr may not be ideal.
+    \expandafter\checkenv\csname#1\endcsname
+    \csname E#1\endcsname
+    \endgroup
+  \fi
+}
+
+\newhelp\EMsimple{Press RETURN to continue.}
+
+
+% Be sure we're in horizontal mode when doing a tie, since we make space
+% equivalent to this in @example-like environments. Otherwise, a space
+% at the beginning of a line will start with \penalty -- and
+% since \penalty is valid in vertical mode, we'd end up putting the
+% penalty on the vertical list instead of in the new paragraph.
+{\catcode`@ = 11
+ % Avoid using \@M directly, because that causes trouble
+ % if the definition is written into an index file.
+ \global\let\tiepenalty = \@M
+ \gdef\tie{\leavevmode\penalty\tiepenalty\ }
+}
+
+% @: forces normal size whitespace following.
+\def\:{\spacefactor=1000 }
+
+% @* forces a line break.
+\def\*{\hfil\break\hbox{}\ignorespaces}
+
+% @/ allows a line break.
+\let\/=\allowbreak
+
+% @. is an end-of-sentence period.
+\def\.{.\spacefactor=\endofsentencespacefactor\space}
+
+% @! is an end-of-sentence bang.
+\def\!{!\spacefactor=\endofsentencespacefactor\space}
+
+% @? is an end-of-sentence query.
+\def\?{?\spacefactor=\endofsentencespacefactor\space}
+
+% @frenchspacing on|off  says whether to put extra space after punctuation.
+%
+\def\onword{on}
+\def\offword{off}
+%
+\parseargdef\frenchspacing{%
+  \def\temp{#1}%
+  \ifx\temp\onword \plainfrenchspacing
+  \else\ifx\temp\offword \plainnonfrenchspacing
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @frenchspacing option `\temp', must be on|off}%
+  \fi\fi
+}
+
+% @w prevents a word break.  Without the \leavevmode, @w at the
+% beginning of a paragraph, when TeX is still in vertical mode, would
+% produce a whole line of output instead of starting the paragraph.
+\def\w#1{\leavevmode\hbox{#1}}
+
+% @group ... @end group forces ... to be all on one page, by enclosing
+% it in a TeX vbox.  We use \vtop instead of \vbox to construct the box
+% to keep its height that of a normal line.  According to the rules for
+% \topskip (p.114 of the TeXbook), the glue inserted is
+% max (\topskip - \ht (first item), 0).  If that height is large,
+% therefore, no glue is inserted, and the space between the headline and
+% the text is small, which looks bad.
+%
+% Another complication is that the group might be very large.  This can
+% cause the glue on the previous page to be unduly stretched, because it
+% does not have much material.  In this case, it's better to add an
+% explicit \vfill so that the extra space is at the bottom.  The
+% threshold for doing this is if the group is more than \vfilllimit
+% percent of a page (\vfilllimit can be changed inside of @tex).
+%
+\newbox\groupbox
+\def\vfilllimit{0.7}
+%
+\envdef\group{%
+  \ifnum\catcode`\^^M=\active \else
+    \errhelp = \groupinvalidhelp
+    \errmessage{@group invalid in context where filling is enabled}%
+  \fi
+  \startsavinginserts
+  %
+  \setbox\groupbox = \vtop\bgroup
+    % Do @comment since we are called inside an environment such as
+    % @example, where each end-of-line in the input causes an
+    % end-of-line in the output.  We don't want the end-of-line after
+    % the `@group' to put extra space in the output.  Since @group
+    % should appear on a line by itself (according to the Texinfo
+    % manual), we don't worry about eating any user text.
+    \comment
+}
+%
+% The \vtop produces a box with normal height and large depth; thus, TeX puts
+% \baselineskip glue before it, and (when the next line of text is done)
+% \lineskip glue after it.  Thus, space below is not quite equal to space
+% above.  But it's pretty close.
+\def\Egroup{%
+    % To get correct interline space between the last line of the group
+    % and the first line afterwards, we have to propagate \prevdepth.
+    \endgraf % Not \par, as it may have been set to \lisppar.
+    \global\dimen1 = \prevdepth
+  \egroup           % End the \vtop.
+  % \dimen0 is the vertical size of the group's box.
+  \dimen0 = \ht\groupbox  \advance\dimen0 by \dp\groupbox
+  % \dimen2 is how much space is left on the page (more or less).
+  \dimen2 = \pageheight   \advance\dimen2 by -\pagetotal
+  % if the group doesn't fit on the current page, and it's a big big
+  % group, force a page break.
+  \ifdim \dimen0 > \dimen2
+    \ifdim \pagetotal < \vfilllimit\pageheight
+      \page
+    \fi
+  \fi
+  \box\groupbox
+  \prevdepth = \dimen1
+  \checkinserts
+}
+%
+% TeX puts in an \escapechar (i.e., `@') at the beginning of the help
+% message, so this ends up printing `@group can only ...'.
+%
+\newhelp\groupinvalidhelp{%
+group can only be used in environments such as @example,^^J%
+where each line of input produces a line of output.}
+
+% @need space-in-mils
+% forces a page break if there is not space-in-mils remaining.
+
+\newdimen\mil  \mil=0.001in
+
+\parseargdef\need{%
+  % Ensure vertical mode, so we don't make a big box in the middle of a
+  % paragraph.
+  \par
+  %
+  % If the @need value is less than one line space, it's useless.
+  \dimen0 = #1\mil
+  \dimen2 = \ht\strutbox
+  \advance\dimen2 by \dp\strutbox
+  \ifdim\dimen0 > \dimen2
+    %
+    % Do a \strut just to make the height of this box be normal, so the
+    % normal leading is inserted relative to the preceding line.
+    % And a page break here is fine.
+    \vtop to #1\mil{\strut\vfil}%
+    %
+    % TeX does not even consider page breaks if a penalty added to the
+    % main vertical list is 10000 or more.  But in order to see if the
+    % empty box we just added fits on the page, we must make it consider
+    % page breaks.  On the other hand, we don't want to actually break the
+    % page after the empty box.  So we use a penalty of 9999.
+    %
+    % There is an extremely small chance that TeX will actually break the
+    % page at this \penalty, if there are no other feasible breakpoints in
+    % sight.  (If the user is using lots of big @group commands, which
+    % almost-but-not-quite fill up a page, TeX will have a hard time doing
+    % good page breaking, for example.)  However, I could not construct an
+    % example where a page broke at this \penalty; if it happens in a real
+    % document, then we can reconsider our strategy.
+    \penalty9999
+    %
+    % Back up by the size of the box, whether we did a page break or not.
+    \kern -#1\mil
+    %
+    % Do not allow a page break right after this kern.
+    \nobreak
+  \fi
+}
+
+% @br   forces paragraph break (and is undocumented).
+
+\let\br = \par
+
+% @page forces the start of a new page.
+%
+\def\page{\par\vfill\supereject}
+
+% @exdent text....
+% outputs text on separate line in roman font, starting at standard page margin
+
+% This records the amount of indent in the innermost environment.
+% That's how much \exdent should take out.
+\newskip\exdentamount
+
+% This defn is used inside fill environments such as @defun.
+\parseargdef\exdent{\hfil\break\hbox{\kern -\exdentamount{\rm#1}}\hfil\break}
+
+% This defn is used inside nofill environments such as @example.
+\parseargdef\nofillexdent{{\advance \leftskip by -\exdentamount
+  \leftline{\hskip\leftskip{\rm#1}}}}
+
+% @inmargin{WHICH}{TEXT} puts TEXT in the WHICH margin next to the current
+% paragraph.  For more general purposes, use the \margin insertion
+% class.  WHICH is `l' or `r'.  Not documented, written for gawk manual.
+%
+\newskip\inmarginspacing \inmarginspacing=1cm
+\def\strutdepth{\dp\strutbox}
+%
+\def\doinmargin#1#2{\strut\vadjust{%
+  \nobreak
+  \kern-\strutdepth
+  \vtop to \strutdepth{%
+    \baselineskip=\strutdepth
+    \vss
+    % if you have multiple lines of stuff to put here, you'll need to
+    % make the vbox yourself of the appropriate size.
+    \ifx#1l%
+      \llap{\ignorespaces #2\hskip\inmarginspacing}%
+    \else
+      \rlap{\hskip\hsize \hskip\inmarginspacing \ignorespaces #2}%
+    \fi
+    \null
+  }%
+}}
+\def\inleftmargin{\doinmargin l}
+\def\inrightmargin{\doinmargin r}
+%
+% @inmargin{TEXT [, RIGHT-TEXT]}
+% (if RIGHT-TEXT is given, use TEXT for left page, RIGHT-TEXT for right;
+% else use TEXT for both).
+%
+\def\inmargin#1{\parseinmargin #1,,\finish}
+\def\parseinmargin#1,#2,#3\finish{% not perfect, but better than nothing.
+  \setbox0 = \hbox{\ignorespaces #2}%
+  \ifdim\wd0 > 0pt
+    \def\lefttext{#1}%  have both texts
+    \def\righttext{#2}%
+  \else
+    \def\lefttext{#1}%  have only one text
+    \def\righttext{#1}%
+  \fi
+  %
+  \ifodd\pageno
+    \def\temp{\inrightmargin\righttext}% odd page -> outside is right margin
+  \else
+    \def\temp{\inleftmargin\lefttext}%
+  \fi
+  \temp
+}
+
+% @| inserts a changebar to the left of the current line.  It should
+% surround any changed text.  This approach does *not* work if the
+% change spans more than two lines of output.  To handle that, we would
+% have adopt a much more difficult approach (putting marks into the main
+% vertical list for the beginning and end of each change).  This command
+% is not documented, not supported, and doesn't work.
+%
+\def\|{%
+  % \vadjust can only be used in horizontal mode.
+  \leavevmode
+  %
+  % Append this vertical mode material after the current line in the output.
+  \vadjust{%
+    % We want to insert a rule with the height and depth of the current
+    % leading; that is exactly what \strutbox is supposed to record.
+    \vskip-\baselineskip
+    %
+    % \vadjust-items are inserted at the left edge of the type.  So
+    % the \llap here moves out into the left-hand margin.
+    \llap{%
+      %
+      % For a thicker or thinner bar, change the `1pt'.
+      \vrule height\baselineskip width1pt
+      %
+      % This is the space between the bar and the text.
+      \hskip 12pt
+    }%
+  }%
+}
+
+% @include FILE -- \input text of FILE.
+%
+\def\include{\parseargusing\filenamecatcodes\includezzz}
+\def\includezzz#1{%
+  \pushthisfilestack
+  \def\thisfile{#1}%
+  {%
+    \makevalueexpandable  % we want to expand any @value in FILE.
+    \turnoffactive        % and allow special characters in the expansion
+    \indexnofonts         % Allow `@@' and other weird things in file names.
+    \wlog{texinfo.tex: doing @include of #1^^J}%
+    \edef\temp{\noexpand\input #1 }%
+    %
+    % This trickery is to read FILE outside of a group, in case it makes
+    % definitions, etc.
+    \expandafter
+  }\temp
+  \popthisfilestack
+}
+\def\filenamecatcodes{%
+  \catcode`\\=\other
+  \catcode`~=\other
+  \catcode`^=\other
+  \catcode`_=\other
+  \catcode`|=\other
+  \catcode`<=\other
+  \catcode`>=\other
+  \catcode`+=\other
+  \catcode`-=\other
+  \catcode`\`=\other
+  \catcode`\'=\other
+}
+
+\def\pushthisfilestack{%
+  \expandafter\pushthisfilestackX\popthisfilestack\StackTerm
+}
+\def\pushthisfilestackX{%
+  \expandafter\pushthisfilestackY\thisfile\StackTerm
+}
+\def\pushthisfilestackY #1\StackTerm #2\StackTerm {%
+  \gdef\popthisfilestack{\gdef\thisfile{#1}\gdef\popthisfilestack{#2}}%
+}
+
+\def\popthisfilestack{\errthisfilestackempty}
+\def\errthisfilestackempty{\errmessage{Internal error:
+  the stack of filenames is empty.}}
+%
+\def\thisfile{}
+
+% @center line
+% outputs that line, centered.
+%
+\parseargdef\center{%
+  \ifhmode
+    \let\centersub\centerH
+  \else
+    \let\centersub\centerV
+  \fi
+  \centersub{\hfil \ignorespaces#1\unskip \hfil}%
+  \let\centersub\relax % don't let the definition persist, just in case
+}
+\def\centerH#1{{%
+  \hfil\break
+  \advance\hsize by -\leftskip
+  \advance\hsize by -\rightskip
+  \line{#1}%
+  \break
+}}
+%
+\newcount\centerpenalty
+\def\centerV#1{%
+  % The idea here is the same as in \startdefun, \cartouche, etc.: if
+  % @center is the first thing after a section heading, we need to wipe
+  % out the negative parskip inserted by \sectionheading, but still
+  % prevent a page break here.
+  \centerpenalty = \lastpenalty
+  \ifnum\centerpenalty>10000 \vskip\parskip \fi
+  \ifnum\centerpenalty>9999 \penalty\centerpenalty \fi
+  \line{\kern\leftskip #1\kern\rightskip}%
+}
+
+% @sp n   outputs n lines of vertical space
+%
+\parseargdef\sp{\vskip #1\baselineskip}
+
+% @comment ...line which is ignored...
+% @c is the same as @comment
+% @ignore ... @end ignore  is another way to write a comment
+%
+\def\comment{\begingroup \catcode`\^^M=\other%
+\catcode`\@=\other \catcode`\{=\other \catcode`\}=\other%
+\commentxxx}
+{\catcode`\^^M=\other \gdef\commentxxx#1^^M{\endgroup}}
+%
+\let\c=\comment
+
+% @paragraphindent NCHARS
+% We'll use ems for NCHARS, close enough.
+% NCHARS can also be the word `asis' or `none'.
+% We cannot feasibly implement @paragraphindent asis, though.
+%
+\def\asisword{asis} % no translation, these are keywords
+\def\noneword{none}
+%
+\parseargdef\paragraphindent{%
+  \def\temp{#1}%
+  \ifx\temp\asisword
+  \else
+    \ifx\temp\noneword
+      \defaultparindent = 0pt
+    \else
+      \defaultparindent = #1em
+    \fi
+  \fi
+  \parindent = \defaultparindent
+}
+
+% @exampleindent NCHARS
+% We'll use ems for NCHARS like @paragraphindent.
+% It seems @exampleindent asis isn't necessary, but
+% I preserve it to make it similar to @paragraphindent.
+\parseargdef\exampleindent{%
+  \def\temp{#1}%
+  \ifx\temp\asisword
+  \else
+    \ifx\temp\noneword
+      \lispnarrowing = 0pt
+    \else
+      \lispnarrowing = #1em
+    \fi
+  \fi
+}
+
+% @firstparagraphindent WORD
+% If WORD is `none', then suppress indentation of the first paragraph
+% after a section heading.  If WORD is `insert', then do indent at such
+% paragraphs.
+%
+% The paragraph indentation is suppressed or not by calling
+% \suppressfirstparagraphindent, which the sectioning commands do.
+% We switch the definition of this back and forth according to WORD.
+% By default, we suppress indentation.
+%
+\def\suppressfirstparagraphindent{\dosuppressfirstparagraphindent}
+\def\insertword{insert}
+%
+\parseargdef\firstparagraphindent{%
+  \def\temp{#1}%
+  \ifx\temp\noneword
+    \let\suppressfirstparagraphindent = \dosuppressfirstparagraphindent
+  \else\ifx\temp\insertword
+    \let\suppressfirstparagraphindent = \relax
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @firstparagraphindent option `\temp'}%
+  \fi\fi
+}
+
+% Here is how we actually suppress indentation.  Redefine \everypar to
+% \kern backwards by \parindent, and then reset itself to empty.
+%
+% We also make \indent itself not actually do anything until the next
+% paragraph.
+%
+\gdef\dosuppressfirstparagraphindent{%
+  \gdef\indent{%
+    \restorefirstparagraphindent
+    \indent
+  }%
+  \gdef\noindent{%
+    \restorefirstparagraphindent
+    \noindent
+  }%
+  \global\everypar = {%
+    \kern -\parindent
+    \restorefirstparagraphindent
+  }%
+}
+
+\gdef\restorefirstparagraphindent{%
+  \global \let \indent = \ptexindent
+  \global \let \noindent = \ptexnoindent
+  \global \everypar = {}%
+}
+
+
+% @refill is a no-op.
+\let\refill=\relax
+
+% If working on a large document in chapters, it is convenient to
+% be able to disable indexing, cross-referencing, and contents, for test runs.
+% This is done with @novalidate (before @setfilename).
+%
+\newif\iflinks \linkstrue % by default we want the aux files.
+\let\novalidate = \linksfalse
+
+% @setfilename is done at the beginning of every texinfo file.
+% So open here the files we need to have open while reading the input.
+% This makes it possible to make a .fmt file for texinfo.
+\def\setfilename{%
+   \fixbackslash  % Turn off hack to swallow `\input texinfo'.
+   \iflinks
+     \tryauxfile
+     % Open the new aux file.  TeX will close it automatically at exit.
+     \immediate\openout\auxfile=\jobname.aux
+   \fi % \openindices needs to do some work in any case.
+   \openindices
+   \let\setfilename=\comment % Ignore extra @setfilename cmds.
+   %
+   % If texinfo.cnf is present on the system, read it.
+   % Useful for site-wide @afourpaper, etc.
+   \openin 1 texinfo.cnf
+   \ifeof 1 \else \input texinfo.cnf \fi
+   \closein 1
+   %
+   \comment % Ignore the actual filename.
+}
+
+% Called from \setfilename.
+%
+\def\openindices{%
+  \newindex{cp}%
+  \newcodeindex{fn}%
+  \newcodeindex{vr}%
+  \newcodeindex{tp}%
+  \newcodeindex{ky}%
+  \newcodeindex{pg}%
+}
+
+% @bye.
+\outer\def\bye{\pagealignmacro\tracingstats=1\ptexend}
+
+
+\message{pdf,}
+% adobe `portable' document format
+\newcount\tempnum
+\newcount\lnkcount
+\newtoks\filename
+\newcount\filenamelength
+\newcount\pgn
+\newtoks\toksA
+\newtoks\toksB
+\newtoks\toksC
+\newtoks\toksD
+\newbox\boxA
+\newcount\countA
+\newif\ifpdf
+\newif\ifpdfmakepagedest
+
+% when pdftex is run in dvi mode, \pdfoutput is defined (so \pdfoutput=1
+% can be set).  So we test for \relax and 0 as well as being undefined.
+\ifx\pdfoutput\thisisundefined
+\else
+  \ifx\pdfoutput\relax
+  \else
+    \ifcase\pdfoutput
+    \else
+      \pdftrue
+    \fi
+  \fi
+\fi
+
+% PDF uses PostScript string constants for the names of xref targets,
+% for display in the outlines, and in other places.  Thus, we have to
+% double any backslashes.  Otherwise, a name like "\node" will be
+% interpreted as a newline (\n), followed by o, d, e.  Not good.
+% 
+% See http://www.ntg.nl/pipermail/ntg-pdftex/2004-July/000654.html and
+% related messages.  The final outcome is that it is up to the TeX user
+% to double the backslashes and otherwise make the string valid, so
+% that's what we do.  pdftex 1.30.0 (ca.2005) introduced a primitive to
+% do this reliably, so we use it.
+
+% #1 is a control sequence in which to do the replacements,
+% which we \xdef.
+\def\txiescapepdf#1{%
+  \ifx\pdfescapestring\relax
+    % No primitive available; should we give a warning or log?
+    % Many times it won't matter.
+  \else
+    % The expandable \pdfescapestring primitive escapes parentheses,
+    % backslashes, and other special chars.
+    \xdef#1{\pdfescapestring{#1}}%
+  \fi
+}
+
+\newhelp\nopdfimagehelp{Texinfo supports .png, .jpg, .jpeg, and .pdf images
+with PDF output, and none of those formats could be found.  (.eps cannot
+be supported due to the design of the PDF format; use regular TeX (DVI
+output) for that.)}
+
+\ifpdf
+  %
+  % Color manipulation macros based on pdfcolor.tex,
+  % except using rgb instead of cmyk; the latter is said to render as a
+  % very dark gray on-screen and a very dark halftone in print, instead
+  % of actual black.
+  \def\rgbDarkRed{0.50 0.09 0.12}
+  \def\rgbBlack{0 0 0}
+  %
+  % k sets the color for filling (usual text, etc.);
+  % K sets the color for stroking (thin rules, e.g., normal _'s).
+  \def\pdfsetcolor#1{\pdfliteral{#1 rg  #1 RG}}
+  %
+  % Set color, and create a mark which defines \thiscolor accordingly,
+  % so that \makeheadline knows which color to restore.
+  \def\setcolor#1{%
+    \xdef\lastcolordefs{\gdef\noexpand\thiscolor{#1}}%
+    \domark
+    \pdfsetcolor{#1}%
+  }
+  %
+  \def\maincolor{\rgbBlack}
+  \pdfsetcolor{\maincolor}
+  \edef\thiscolor{\maincolor}
+  \def\lastcolordefs{}
+  %
+  \def\makefootline{%
+    \baselineskip24pt
+    \line{\pdfsetcolor{\maincolor}\the\footline}%
+  }
+  %
+  \def\makeheadline{%
+    \vbox to 0pt{%
+      \vskip-22.5pt
+      \line{%
+        \vbox to8.5pt{}%
+        % Extract \thiscolor definition from the marks.
+        \getcolormarks
+        % Typeset the headline with \maincolor, then restore the color.
+        \pdfsetcolor{\maincolor}\the\headline\pdfsetcolor{\thiscolor}%
+      }%
+      \vss
+    }%
+    \nointerlineskip
+  }
+  %
+  %
+  \pdfcatalog{/PageMode /UseOutlines}
+  %
+  % #1 is image name, #2 width (might be empty/whitespace), #3 height (ditto).
+  \def\dopdfimage#1#2#3{%
+    \def\pdfimagewidth{#2}\setbox0 = \hbox{\ignorespaces #2}%
+    \def\pdfimageheight{#3}\setbox2 = \hbox{\ignorespaces #3}%
+    %
+    % pdftex (and the PDF format) support .pdf, .png, .jpg (among
+    % others).  Let's try in that order, PDF first since if
+    % someone has a scalable image, presumably better to use that than a
+    % bitmap.
+    \let\pdfimgext=\empty
+    \begingroup
+      \openin 1 #1.pdf \ifeof 1
+        \openin 1 #1.PDF \ifeof 1
+          \openin 1 #1.png \ifeof 1
+            \openin 1 #1.jpg \ifeof 1
+              \openin 1 #1.jpeg \ifeof 1
+                \openin 1 #1.JPG \ifeof 1
+                  \errhelp = \nopdfimagehelp
+                  \errmessage{Could not find image file #1 for pdf}%
+                \else \gdef\pdfimgext{JPG}%
+                \fi
+              \else \gdef\pdfimgext{jpeg}%
+              \fi
+            \else \gdef\pdfimgext{jpg}%
+            \fi
+          \else \gdef\pdfimgext{png}%
+          \fi
+        \else \gdef\pdfimgext{PDF}%
+        \fi
+      \else \gdef\pdfimgext{pdf}%
+      \fi
+      \closein 1
+    \endgroup
+    %
+    % without \immediate, ancient pdftex seg faults when the same image is
+    % included twice.  (Version 3.14159-pre-1.0-unofficial-20010704.)
+    \ifnum\pdftexversion < 14
+      \immediate\pdfimage
+    \else
+      \immediate\pdfximage
+    \fi
+      \ifdim \wd0 >0pt width \pdfimagewidth \fi
+      \ifdim \wd2 >0pt height \pdfimageheight \fi
+      \ifnum\pdftexversion<13
+         #1.\pdfimgext
+       \else
+         {#1.\pdfimgext}%
+       \fi
+    \ifnum\pdftexversion < 14 \else
+      \pdfrefximage \pdflastximage
+    \fi}
+  %
+  \def\pdfmkdest#1{{%
+    % We have to set dummies so commands such as @code, and characters
+    % such as \, aren't expanded when present in a section title.
+    \indexnofonts
+    \turnoffactive
+    \makevalueexpandable
+    \def\pdfdestname{#1}%
+    \txiescapepdf\pdfdestname
+    \safewhatsit{\pdfdest name{\pdfdestname} xyz}%
+  }}
+  %
+  % used to mark target names; must be expandable.
+  \def\pdfmkpgn#1{#1}
+  %
+  % by default, use a color that is dark enough to print on paper as
+  % nearly black, but still distinguishable for online viewing.
+  \def\urlcolor{\rgbDarkRed}
+  \def\linkcolor{\rgbDarkRed}
+  \def\endlink{\setcolor{\maincolor}\pdfendlink}
+  %
+  % Adding outlines to PDF; macros for calculating structure of outlines
+  % come from Petr Olsak
+  \def\expnumber#1{\expandafter\ifx\csname#1\endcsname\relax 0%
+    \else \csname#1\endcsname \fi}
+  \def\advancenumber#1{\tempnum=\expnumber{#1}\relax
+    \advance\tempnum by 1
+    \expandafter\xdef\csname#1\endcsname{\the\tempnum}}
+  %
+  % #1 is the section text, which is what will be displayed in the
+  % outline by the pdf viewer.  #2 is the pdf expression for the number
+  % of subentries (or empty, for subsubsections).  #3 is the node text,
+  % which might be empty if this toc entry had no corresponding node.
+  % #4 is the page number
+  %
+  \def\dopdfoutline#1#2#3#4{%
+    % Generate a link to the node text if that exists; else, use the
+    % page number.  We could generate a destination for the section
+    % text in the case where a section has no node, but it doesn't
+    % seem worth the trouble, since most documents are normally structured.
+    \edef\pdfoutlinedest{#3}%
+    \ifx\pdfoutlinedest\empty
+      \def\pdfoutlinedest{#4}%
+    \else
+      \txiescapepdf\pdfoutlinedest
+    \fi
+    %
+    % Also escape PDF chars in the display string.
+    \edef\pdfoutlinetext{#1}%
+    \txiescapepdf\pdfoutlinetext
+    %
+    \pdfoutline goto name{\pdfmkpgn{\pdfoutlinedest}}#2{\pdfoutlinetext}%
+  }
+  %
+  \def\pdfmakeoutlines{%
+    \begingroup
+      % Read toc silently, to get counts of subentries for \pdfoutline.
+      \def\partentry##1##2##3##4{}% ignore parts in the outlines
+      \def\numchapentry##1##2##3##4{%
+	\def\thischapnum{##2}%
+	\def\thissecnum{0}%
+	\def\thissubsecnum{0}%
+      }%
+      \def\numsecentry##1##2##3##4{%
+	\advancenumber{chap\thischapnum}%
+	\def\thissecnum{##2}%
+	\def\thissubsecnum{0}%
+      }%
+      \def\numsubsecentry##1##2##3##4{%
+	\advancenumber{sec\thissecnum}%
+	\def\thissubsecnum{##2}%
+      }%
+      \def\numsubsubsecentry##1##2##3##4{%
+	\advancenumber{subsec\thissubsecnum}%
+      }%
+      \def\thischapnum{0}%
+      \def\thissecnum{0}%
+      \def\thissubsecnum{0}%
+      %
+      % use \def rather than \let here because we redefine \chapentry et
+      % al. a second time, below.
+      \def\appentry{\numchapentry}%
+      \def\appsecentry{\numsecentry}%
+      \def\appsubsecentry{\numsubsecentry}%
+      \def\appsubsubsecentry{\numsubsubsecentry}%
+      \def\unnchapentry{\numchapentry}%
+      \def\unnsecentry{\numsecentry}%
+      \def\unnsubsecentry{\numsubsecentry}%
+      \def\unnsubsubsecentry{\numsubsubsecentry}%
+      \readdatafile{toc}%
+      %
+      % Read toc second time, this time actually producing the outlines.
+      % The `-' means take the \expnumber as the absolute number of
+      % subentries, which we calculated on our first read of the .toc above.
+      %
+      % We use the node names as the destinations.
+      \def\numchapentry##1##2##3##4{%
+        \dopdfoutline{##1}{count-\expnumber{chap##2}}{##3}{##4}}%
+      \def\numsecentry##1##2##3##4{%
+        \dopdfoutline{##1}{count-\expnumber{sec##2}}{##3}{##4}}%
+      \def\numsubsecentry##1##2##3##4{%
+        \dopdfoutline{##1}{count-\expnumber{subsec##2}}{##3}{##4}}%
+      \def\numsubsubsecentry##1##2##3##4{% count is always zero
+        \dopdfoutline{##1}{}{##3}{##4}}%
+      %
+      % PDF outlines are displayed using system fonts, instead of
+      % document fonts.  Therefore we cannot use special characters,
+      % since the encoding is unknown.  For example, the eogonek from
+      % Latin 2 (0xea) gets translated to a | character.  Info from
+      % Staszek Wawrykiewicz, 19 Jan 2004 04:09:24 +0100.
+      %
+      % TODO this right, we have to translate 8-bit characters to
+      % their "best" equivalent, based on the @documentencoding.  Too
+      % much work for too little return.  Just use the ASCII equivalents
+      % we use for the index sort strings.
+      % 
+      \indexnofonts
+      \setupdatafile
+      % We can have normal brace characters in the PDF outlines, unlike
+      % Texinfo index files.  So set that up.
+      \def\{{\lbracecharliteral}%
+      \def\}{\rbracecharliteral}%
+      \catcode`\\=\active \otherbackslash
+      \input \tocreadfilename
+    \endgroup
+  }
+  {\catcode`[=1 \catcode`]=2
+   \catcode`{=\other \catcode`}=\other
+   \gdef\lbracecharliteral[{]%
+   \gdef\rbracecharliteral[}]%
+  ]
+  %
+  \def\skipspaces#1{\def\PP{#1}\def\D{|}%
+    \ifx\PP\D\let\nextsp\relax
+    \else\let\nextsp\skipspaces
+      \ifx\p\space\else\addtokens{\filename}{\PP}%
+        \advance\filenamelength by 1
+      \fi
+    \fi
+    \nextsp}
+  \def\getfilename#1{%
+    \filenamelength=0
+    % If we don't expand the argument now, \skipspaces will get
+    % snagged on things like "@value{foo}".
+    \edef\temp{#1}%
+    \expandafter\skipspaces\temp|\relax
+  }
+  \ifnum\pdftexversion < 14
+    \let \startlink \pdfannotlink
+  \else
+    \let \startlink \pdfstartlink
+  \fi
+  % make a live url in pdf output.
+  \def\pdfurl#1{%
+    \begingroup
+      % it seems we really need yet another set of dummies; have not
+      % tried to figure out what each command should do in the context
+      % of @url.  for now, just make @/ a no-op, that's the only one
+      % people have actually reported a problem with.
+      %
+      \normalturnoffactive
+      \def\@{@}%
+      \let\/=\empty
+      \makevalueexpandable
+      % do we want to go so far as to use \indexnofonts instead of just
+      % special-casing \var here?
+      \def\var##1{##1}%
+      %
+      \leavevmode\setcolor{\urlcolor}%
+      \startlink attr{/Border [0 0 0]}%
+        user{/Subtype /Link /A << /S /URI /URI (#1) >>}%
+    \endgroup}
+  \def\pdfgettoks#1.{\setbox\boxA=\hbox{\toksA={#1.}\toksB={}\maketoks}}
+  \def\addtokens#1#2{\edef\addtoks{\noexpand#1={\the#1#2}}\addtoks}
+  \def\adn#1{\addtokens{\toksC}{#1}\global\countA=1\let\next=\maketoks}
+  \def\poptoks#1#2|ENDTOKS|{\let\first=#1\toksD={#1}\toksA={#2}}
+  \def\maketoks{%
+    \expandafter\poptoks\the\toksA|ENDTOKS|\relax
+    \ifx\first0\adn0
+    \else\ifx\first1\adn1 \else\ifx\first2\adn2 \else\ifx\first3\adn3
+    \else\ifx\first4\adn4 \else\ifx\first5\adn5 \else\ifx\first6\adn6
+    \else\ifx\first7\adn7 \else\ifx\first8\adn8 \else\ifx\first9\adn9
+    \else
+      \ifnum0=\countA\else\makelink\fi
+      \ifx\first.\let\next=\done\else
+        \let\next=\maketoks
+        \addtokens{\toksB}{\the\toksD}
+        \ifx\first,\addtokens{\toksB}{\space}\fi
+      \fi
+    \fi\fi\fi\fi\fi\fi\fi\fi\fi\fi
+    \next}
+  \def\makelink{\addtokens{\toksB}%
+    {\noexpand\pdflink{\the\toksC}}\toksC={}\global\countA=0}
+  \def\pdflink#1{%
+    \startlink attr{/Border [0 0 0]} goto name{\pdfmkpgn{#1}}
+    \setcolor{\linkcolor}#1\endlink}
+  \def\done{\edef\st{\global\noexpand\toksA={\the\toksB}}\st}
+\else
+  % non-pdf mode
+  \let\pdfmkdest = \gobble
+  \let\pdfurl = \gobble
+  \let\endlink = \relax
+  \let\setcolor = \gobble
+  \let\pdfsetcolor = \gobble
+  \let\pdfmakeoutlines = \relax
+\fi  % \ifx\pdfoutput
+
+
+\message{fonts,}
+
+% Change the current font style to #1, remembering it in \curfontstyle.
+% For now, we do not accumulate font styles: @b{@i{foo}} prints foo in
+% italics, not bold italics.
+%
+\def\setfontstyle#1{%
+  \def\curfontstyle{#1}% not as a control sequence, because we are \edef'd.
+  \csname ten#1\endcsname  % change the current font
+}
+
+% Select #1 fonts with the current style.
+%
+\def\selectfonts#1{\csname #1fonts\endcsname \csname\curfontstyle\endcsname}
+
+\def\rm{\fam=0 \setfontstyle{rm}}
+\def\it{\fam=\itfam \setfontstyle{it}}
+\def\sl{\fam=\slfam \setfontstyle{sl}}
+\def\bf{\fam=\bffam \setfontstyle{bf}}\def\bfstylename{bf}
+\def\tt{\fam=\ttfam \setfontstyle{tt}}
+
+% Unfortunately, we have to override this for titles and the like, since
+% in those cases "rm" is bold.  Sigh.
+\def\rmisbold{\rm\def\curfontstyle{bf}}
+
+% Texinfo sort of supports the sans serif font style, which plain TeX does not.
+% So we set up a \sf.
+\newfam\sffam
+\def\sf{\fam=\sffam \setfontstyle{sf}}
+\let\li = \sf % Sometimes we call it \li, not \sf.
+
+% We don't need math for this font style.
+\def\ttsl{\setfontstyle{ttsl}}
+
+
+% Default leading.
+\newdimen\textleading  \textleading = 13.2pt
+
+% Set the baselineskip to #1, and the lineskip and strut size
+% correspondingly.  There is no deep meaning behind these magic numbers
+% used as factors; they just match (closely enough) what Knuth defined.
+%
+\def\lineskipfactor{.08333}
+\def\strutheightpercent{.70833}
+\def\strutdepthpercent {.29167}
+%
+% can get a sort of poor man's double spacing by redefining this.
+\def\baselinefactor{1}
+%
+\def\setleading#1{%
+  \dimen0 = #1\relax
+  \normalbaselineskip = \baselinefactor\dimen0
+  \normallineskip = \lineskipfactor\normalbaselineskip
+  \normalbaselines
+  \setbox\strutbox =\hbox{%
+    \vrule width0pt height\strutheightpercent\baselineskip
+                    depth \strutdepthpercent \baselineskip
+  }%
+}
+
+% PDF CMaps.  See also LaTeX's t1.cmap.
+%
+% do nothing with this by default.
+\expandafter\let\csname cmapOT1\endcsname\gobble
+\expandafter\let\csname cmapOT1IT\endcsname\gobble
+\expandafter\let\csname cmapOT1TT\endcsname\gobble
+
+% if we are producing pdf, and we have \pdffontattr, then define cmaps.
+% (\pdffontattr was introduced many years ago, but people still run
+% older pdftex's; it's easy to conditionalize, so we do.)
+\ifpdf \ifx\pdffontattr\thisisundefined \else
+  \begingroup
+    \catcode`\^^M=\active \def^^M{^^J}% Output line endings as the ^^J char.
+    \catcode`\%=12 \immediate\pdfobj stream {%!PS-Adobe-3.0 Resource-CMap
+%%DocumentNeededResources: ProcSet (CIDInit)
+%%IncludeResource: ProcSet (CIDInit)
+%%BeginResource: CMap (TeX-OT1-0)
+%%Title: (TeX-OT1-0 TeX OT1 0)
+%%Version: 1.000
+%%EndComments
+/CIDInit /ProcSet findresource begin
+12 dict begin
+begincmap
+/CIDSystemInfo
+<< /Registry (TeX)
+/Ordering (OT1)
+/Supplement 0
+>> def
+/CMapName /TeX-OT1-0 def
+/CMapType 2 def
+1 begincodespacerange
+<00> <7F>
+endcodespacerange
+8 beginbfrange
+<00> <01> <0393>
+<09> <0A> <03A8>
+<23> <26> <0023>
+<28> <3B> <0028>
+<3F> <5B> <003F>
+<5D> <5E> <005D>
+<61> <7A> <0061>
+<7B> <7C> <2013>
+endbfrange
+40 beginbfchar
+<02> <0398>
+<03> <039B>
+<04> <039E>
+<05> <03A0>
+<06> <03A3>
+<07> <03D2>
+<08> <03A6>
+<0B> <00660066>
+<0C> <00660069>
+<0D> <0066006C>
+<0E> <006600660069>
+<0F> <00660066006C>
+<10> <0131>
+<11> <0237>
+<12> <0060>
+<13> <00B4>
+<14> <02C7>
+<15> <02D8>
+<16> <00AF>
+<17> <02DA>
+<18> <00B8>
+<19> <00DF>
+<1A> <00E6>
+<1B> <0153>
+<1C> <00F8>
+<1D> <00C6>
+<1E> <0152>
+<1F> <00D8>
+<21> <0021>
+<22> <201D>
+<27> <2019>
+<3C> <00A1>
+<3D> <003D>
+<3E> <00BF>
+<5C> <201C>
+<5F> <02D9>
+<60> <2018>
+<7D> <02DD>
+<7E> <007E>
+<7F> <00A8>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+%%EndResource
+%%EOF
+    }\endgroup
+  \expandafter\edef\csname cmapOT1\endcsname#1{%
+    \pdffontattr#1{/ToUnicode \the\pdflastobj\space 0 R}%
+  }%
+%
+% \cmapOT1IT
+  \begingroup
+    \catcode`\^^M=\active \def^^M{^^J}% Output line endings as the ^^J char.
+    \catcode`\%=12 \immediate\pdfobj stream {%!PS-Adobe-3.0 Resource-CMap
+%%DocumentNeededResources: ProcSet (CIDInit)
+%%IncludeResource: ProcSet (CIDInit)
+%%BeginResource: CMap (TeX-OT1IT-0)
+%%Title: (TeX-OT1IT-0 TeX OT1IT 0)
+%%Version: 1.000
+%%EndComments
+/CIDInit /ProcSet findresource begin
+12 dict begin
+begincmap
+/CIDSystemInfo
+<< /Registry (TeX)
+/Ordering (OT1IT)
+/Supplement 0
+>> def
+/CMapName /TeX-OT1IT-0 def
+/CMapType 2 def
+1 begincodespacerange
+<00> <7F>
+endcodespacerange
+8 beginbfrange
+<00> <01> <0393>
+<09> <0A> <03A8>
+<25> <26> <0025>
+<28> <3B> <0028>
+<3F> <5B> <003F>
+<5D> <5E> <005D>
+<61> <7A> <0061>
+<7B> <7C> <2013>
+endbfrange
+42 beginbfchar
+<02> <0398>
+<03> <039B>
+<04> <039E>
+<05> <03A0>
+<06> <03A3>
+<07> <03D2>
+<08> <03A6>
+<0B> <00660066>
+<0C> <00660069>
+<0D> <0066006C>
+<0E> <006600660069>
+<0F> <00660066006C>
+<10> <0131>
+<11> <0237>
+<12> <0060>
+<13> <00B4>
+<14> <02C7>
+<15> <02D8>
+<16> <00AF>
+<17> <02DA>
+<18> <00B8>
+<19> <00DF>
+<1A> <00E6>
+<1B> <0153>
+<1C> <00F8>
+<1D> <00C6>
+<1E> <0152>
+<1F> <00D8>
+<21> <0021>
+<22> <201D>
+<23> <0023>
+<24> <00A3>
+<27> <2019>
+<3C> <00A1>
+<3D> <003D>
+<3E> <00BF>
+<5C> <201C>
+<5F> <02D9>
+<60> <2018>
+<7D> <02DD>
+<7E> <007E>
+<7F> <00A8>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+%%EndResource
+%%EOF
+    }\endgroup
+  \expandafter\edef\csname cmapOT1IT\endcsname#1{%
+    \pdffontattr#1{/ToUnicode \the\pdflastobj\space 0 R}%
+  }%
+%
+% \cmapOT1TT
+  \begingroup
+    \catcode`\^^M=\active \def^^M{^^J}% Output line endings as the ^^J char.
+    \catcode`\%=12 \immediate\pdfobj stream {%!PS-Adobe-3.0 Resource-CMap
+%%DocumentNeededResources: ProcSet (CIDInit)
+%%IncludeResource: ProcSet (CIDInit)
+%%BeginResource: CMap (TeX-OT1TT-0)
+%%Title: (TeX-OT1TT-0 TeX OT1TT 0)
+%%Version: 1.000
+%%EndComments
+/CIDInit /ProcSet findresource begin
+12 dict begin
+begincmap
+/CIDSystemInfo
+<< /Registry (TeX)
+/Ordering (OT1TT)
+/Supplement 0
+>> def
+/CMapName /TeX-OT1TT-0 def
+/CMapType 2 def
+1 begincodespacerange
+<00> <7F>
+endcodespacerange
+5 beginbfrange
+<00> <01> <0393>
+<09> <0A> <03A8>
+<21> <26> <0021>
+<28> <5F> <0028>
+<61> <7E> <0061>
+endbfrange
+32 beginbfchar
+<02> <0398>
+<03> <039B>
+<04> <039E>
+<05> <03A0>
+<06> <03A3>
+<07> <03D2>
+<08> <03A6>
+<0B> <2191>
+<0C> <2193>
+<0D> <0027>
+<0E> <00A1>
+<0F> <00BF>
+<10> <0131>
+<11> <0237>
+<12> <0060>
+<13> <00B4>
+<14> <02C7>
+<15> <02D8>
+<16> <00AF>
+<17> <02DA>
+<18> <00B8>
+<19> <00DF>
+<1A> <00E6>
+<1B> <0153>
+<1C> <00F8>
+<1D> <00C6>
+<1E> <0152>
+<1F> <00D8>
+<20> <2423>
+<27> <2019>
+<60> <2018>
+<7F> <00A8>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+%%EndResource
+%%EOF
+    }\endgroup
+  \expandafter\edef\csname cmapOT1TT\endcsname#1{%
+    \pdffontattr#1{/ToUnicode \the\pdflastobj\space 0 R}%
+  }%
+\fi\fi
+
+
+% Set the font macro #1 to the font named #2, adding on the
+% specified font prefix (normally `cm').
+% #3 is the font's design size, #4 is a scale factor, #5 is the CMap
+% encoding (currently only OT1, OT1IT and OT1TT are allowed, pass
+% empty to omit).
+\def\setfont#1#2#3#4#5{%
+  \font#1=\fontprefix#2#3 scaled #4
+  \csname cmap#5\endcsname#1%
+}
+% This is what gets called when #5 of \setfont is empty.
+\let\cmap\gobble
+% emacs-page end of cmaps
+
+% Use cm as the default font prefix.
+% To specify the font prefix, you must define \fontprefix
+% before you read in texinfo.tex.
+\ifx\fontprefix\thisisundefined
+\def\fontprefix{cm}
+\fi
+% Support font families that don't use the same naming scheme as CM.
+\def\rmshape{r}
+\def\rmbshape{bx}               %where the normal face is bold
+\def\bfshape{b}
+\def\bxshape{bx}
+\def\ttshape{tt}
+\def\ttbshape{tt}
+\def\ttslshape{sltt}
+\def\itshape{ti}
+\def\itbshape{bxti}
+\def\slshape{sl}
+\def\slbshape{bxsl}
+\def\sfshape{ss}
+\def\sfbshape{ss}
+\def\scshape{csc}
+\def\scbshape{csc}
+
+% Definitions for a main text size of 11pt.  This is the default in
+% Texinfo.
+%
+\def\definetextfontsizexi{%
+% Text fonts (11.2pt, magstep1).
+\def\textnominalsize{11pt}
+\edef\mainmagstep{\magstephalf}
+\setfont\textrm\rmshape{10}{\mainmagstep}{OT1}
+\setfont\texttt\ttshape{10}{\mainmagstep}{OT1TT}
+\setfont\textbf\bfshape{10}{\mainmagstep}{OT1}
+\setfont\textit\itshape{10}{\mainmagstep}{OT1IT}
+\setfont\textsl\slshape{10}{\mainmagstep}{OT1}
+\setfont\textsf\sfshape{10}{\mainmagstep}{OT1}
+\setfont\textsc\scshape{10}{\mainmagstep}{OT1}
+\setfont\textttsl\ttslshape{10}{\mainmagstep}{OT1TT}
+\font\texti=cmmi10 scaled \mainmagstep
+\font\textsy=cmsy10 scaled \mainmagstep
+\def\textecsize{1095}
+
+% A few fonts for @defun names and args.
+\setfont\defbf\bfshape{10}{\magstep1}{OT1}
+\setfont\deftt\ttshape{10}{\magstep1}{OT1TT}
+\setfont\defttsl\ttslshape{10}{\magstep1}{OT1TT}
+\def\df{\let\tentt=\deftt \let\tenbf = \defbf \let\tenttsl=\defttsl \bf}
+
+% Fonts for indices, footnotes, small examples (9pt).
+\def\smallnominalsize{9pt}
+\setfont\smallrm\rmshape{9}{1000}{OT1}
+\setfont\smalltt\ttshape{9}{1000}{OT1TT}
+\setfont\smallbf\bfshape{10}{900}{OT1}
+\setfont\smallit\itshape{9}{1000}{OT1IT}
+\setfont\smallsl\slshape{9}{1000}{OT1}
+\setfont\smallsf\sfshape{9}{1000}{OT1}
+\setfont\smallsc\scshape{10}{900}{OT1}
+\setfont\smallttsl\ttslshape{10}{900}{OT1TT}
+\font\smalli=cmmi9
+\font\smallsy=cmsy9
+\def\smallecsize{0900}
+
+% Fonts for small examples (8pt).
+\def\smallernominalsize{8pt}
+\setfont\smallerrm\rmshape{8}{1000}{OT1}
+\setfont\smallertt\ttshape{8}{1000}{OT1TT}
+\setfont\smallerbf\bfshape{10}{800}{OT1}
+\setfont\smallerit\itshape{8}{1000}{OT1IT}
+\setfont\smallersl\slshape{8}{1000}{OT1}
+\setfont\smallersf\sfshape{8}{1000}{OT1}
+\setfont\smallersc\scshape{10}{800}{OT1}
+\setfont\smallerttsl\ttslshape{10}{800}{OT1TT}
+\font\smalleri=cmmi8
+\font\smallersy=cmsy8
+\def\smallerecsize{0800}
+
+% Fonts for title page (20.4pt):
+\def\titlenominalsize{20pt}
+\setfont\titlerm\rmbshape{12}{\magstep3}{OT1}
+\setfont\titleit\itbshape{10}{\magstep4}{OT1IT}
+\setfont\titlesl\slbshape{10}{\magstep4}{OT1}
+\setfont\titlett\ttbshape{12}{\magstep3}{OT1TT}
+\setfont\titlettsl\ttslshape{10}{\magstep4}{OT1TT}
+\setfont\titlesf\sfbshape{17}{\magstep1}{OT1}
+\let\titlebf=\titlerm
+\setfont\titlesc\scbshape{10}{\magstep4}{OT1}
+\font\titlei=cmmi12 scaled \magstep3
+\font\titlesy=cmsy10 scaled \magstep4
+\def\titleecsize{2074}
+
+% Chapter (and unnumbered) fonts (17.28pt).
+\def\chapnominalsize{17pt}
+\setfont\chaprm\rmbshape{12}{\magstep2}{OT1}
+\setfont\chapit\itbshape{10}{\magstep3}{OT1IT}
+\setfont\chapsl\slbshape{10}{\magstep3}{OT1}
+\setfont\chaptt\ttbshape{12}{\magstep2}{OT1TT}
+\setfont\chapttsl\ttslshape{10}{\magstep3}{OT1TT}
+\setfont\chapsf\sfbshape{17}{1000}{OT1}
+\let\chapbf=\chaprm
+\setfont\chapsc\scbshape{10}{\magstep3}{OT1}
+\font\chapi=cmmi12 scaled \magstep2
+\font\chapsy=cmsy10 scaled \magstep3
+\def\chapecsize{1728}
+
+% Section fonts (14.4pt).
+\def\secnominalsize{14pt}
+\setfont\secrm\rmbshape{12}{\magstep1}{OT1}
+\setfont\secit\itbshape{10}{\magstep2}{OT1IT}
+\setfont\secsl\slbshape{10}{\magstep2}{OT1}
+\setfont\sectt\ttbshape{12}{\magstep1}{OT1TT}
+\setfont\secttsl\ttslshape{10}{\magstep2}{OT1TT}
+\setfont\secsf\sfbshape{12}{\magstep1}{OT1}
+\let\secbf\secrm
+\setfont\secsc\scbshape{10}{\magstep2}{OT1}
+\font\seci=cmmi12 scaled \magstep1
+\font\secsy=cmsy10 scaled \magstep2
+\def\sececsize{1440}
+
+% Subsection fonts (13.15pt).
+\def\ssecnominalsize{13pt}
+\setfont\ssecrm\rmbshape{12}{\magstephalf}{OT1}
+\setfont\ssecit\itbshape{10}{1315}{OT1IT}
+\setfont\ssecsl\slbshape{10}{1315}{OT1}
+\setfont\ssectt\ttbshape{12}{\magstephalf}{OT1TT}
+\setfont\ssecttsl\ttslshape{10}{1315}{OT1TT}
+\setfont\ssecsf\sfbshape{12}{\magstephalf}{OT1}
+\let\ssecbf\ssecrm
+\setfont\ssecsc\scbshape{10}{1315}{OT1}
+\font\sseci=cmmi12 scaled \magstephalf
+\font\ssecsy=cmsy10 scaled 1315
+\def\ssececsize{1200}
+
+% Reduced fonts for @acro in text (10pt).
+\def\reducednominalsize{10pt}
+\setfont\reducedrm\rmshape{10}{1000}{OT1}
+\setfont\reducedtt\ttshape{10}{1000}{OT1TT}
+\setfont\reducedbf\bfshape{10}{1000}{OT1}
+\setfont\reducedit\itshape{10}{1000}{OT1IT}
+\setfont\reducedsl\slshape{10}{1000}{OT1}
+\setfont\reducedsf\sfshape{10}{1000}{OT1}
+\setfont\reducedsc\scshape{10}{1000}{OT1}
+\setfont\reducedttsl\ttslshape{10}{1000}{OT1TT}
+\font\reducedi=cmmi10
+\font\reducedsy=cmsy10
+\def\reducedecsize{1000}
+
+\textleading = 13.2pt % line spacing for 11pt CM
+\textfonts            % reset the current fonts
+\rm
+} % end of 11pt text font size definitions
+
+
+% Definitions to make the main text be 10pt Computer Modern, with
+% section, chapter, etc., sizes following suit.  This is for the GNU
+% Press printing of the Emacs 22 manual.  Maybe other manuals in the
+% future.  Used with @smallbook, which sets the leading to 12pt.
+%
+\def\definetextfontsizex{%
+% Text fonts (10pt).
+\def\textnominalsize{10pt}
+\edef\mainmagstep{1000}
+\setfont\textrm\rmshape{10}{\mainmagstep}{OT1}
+\setfont\texttt\ttshape{10}{\mainmagstep}{OT1TT}
+\setfont\textbf\bfshape{10}{\mainmagstep}{OT1}
+\setfont\textit\itshape{10}{\mainmagstep}{OT1IT}
+\setfont\textsl\slshape{10}{\mainmagstep}{OT1}
+\setfont\textsf\sfshape{10}{\mainmagstep}{OT1}
+\setfont\textsc\scshape{10}{\mainmagstep}{OT1}
+\setfont\textttsl\ttslshape{10}{\mainmagstep}{OT1TT}
+\font\texti=cmmi10 scaled \mainmagstep
+\font\textsy=cmsy10 scaled \mainmagstep
+\def\textecsize{1000}
+
+% A few fonts for @defun names and args.
+\setfont\defbf\bfshape{10}{\magstephalf}{OT1}
+\setfont\deftt\ttshape{10}{\magstephalf}{OT1TT}
+\setfont\defttsl\ttslshape{10}{\magstephalf}{OT1TT}
+\def\df{\let\tentt=\deftt \let\tenbf = \defbf \let\tenttsl=\defttsl \bf}
+
+% Fonts for indices, footnotes, small examples (9pt).
+\def\smallnominalsize{9pt}
+\setfont\smallrm\rmshape{9}{1000}{OT1}
+\setfont\smalltt\ttshape{9}{1000}{OT1TT}
+\setfont\smallbf\bfshape{10}{900}{OT1}
+\setfont\smallit\itshape{9}{1000}{OT1IT}
+\setfont\smallsl\slshape{9}{1000}{OT1}
+\setfont\smallsf\sfshape{9}{1000}{OT1}
+\setfont\smallsc\scshape{10}{900}{OT1}
+\setfont\smallttsl\ttslshape{10}{900}{OT1TT}
+\font\smalli=cmmi9
+\font\smallsy=cmsy9
+\def\smallecsize{0900}
+
+% Fonts for small examples (8pt).
+\def\smallernominalsize{8pt}
+\setfont\smallerrm\rmshape{8}{1000}{OT1}
+\setfont\smallertt\ttshape{8}{1000}{OT1TT}
+\setfont\smallerbf\bfshape{10}{800}{OT1}
+\setfont\smallerit\itshape{8}{1000}{OT1IT}
+\setfont\smallersl\slshape{8}{1000}{OT1}
+\setfont\smallersf\sfshape{8}{1000}{OT1}
+\setfont\smallersc\scshape{10}{800}{OT1}
+\setfont\smallerttsl\ttslshape{10}{800}{OT1TT}
+\font\smalleri=cmmi8
+\font\smallersy=cmsy8
+\def\smallerecsize{0800}
+
+% Fonts for title page (20.4pt):
+\def\titlenominalsize{20pt}
+\setfont\titlerm\rmbshape{12}{\magstep3}{OT1}
+\setfont\titleit\itbshape{10}{\magstep4}{OT1IT}
+\setfont\titlesl\slbshape{10}{\magstep4}{OT1}
+\setfont\titlett\ttbshape{12}{\magstep3}{OT1TT}
+\setfont\titlettsl\ttslshape{10}{\magstep4}{OT1TT}
+\setfont\titlesf\sfbshape{17}{\magstep1}{OT1}
+\let\titlebf=\titlerm
+\setfont\titlesc\scbshape{10}{\magstep4}{OT1}
+\font\titlei=cmmi12 scaled \magstep3
+\font\titlesy=cmsy10 scaled \magstep4
+\def\titleecsize{2074}
+
+% Chapter fonts (14.4pt).
+\def\chapnominalsize{14pt}
+\setfont\chaprm\rmbshape{12}{\magstep1}{OT1}
+\setfont\chapit\itbshape{10}{\magstep2}{OT1IT}
+\setfont\chapsl\slbshape{10}{\magstep2}{OT1}
+\setfont\chaptt\ttbshape{12}{\magstep1}{OT1TT}
+\setfont\chapttsl\ttslshape{10}{\magstep2}{OT1TT}
+\setfont\chapsf\sfbshape{12}{\magstep1}{OT1}
+\let\chapbf\chaprm
+\setfont\chapsc\scbshape{10}{\magstep2}{OT1}
+\font\chapi=cmmi12 scaled \magstep1
+\font\chapsy=cmsy10 scaled \magstep2
+\def\chapecsize{1440}
+
+% Section fonts (12pt).
+\def\secnominalsize{12pt}
+\setfont\secrm\rmbshape{12}{1000}{OT1}
+\setfont\secit\itbshape{10}{\magstep1}{OT1IT}
+\setfont\secsl\slbshape{10}{\magstep1}{OT1}
+\setfont\sectt\ttbshape{12}{1000}{OT1TT}
+\setfont\secttsl\ttslshape{10}{\magstep1}{OT1TT}
+\setfont\secsf\sfbshape{12}{1000}{OT1}
+\let\secbf\secrm
+\setfont\secsc\scbshape{10}{\magstep1}{OT1}
+\font\seci=cmmi12
+\font\secsy=cmsy10 scaled \magstep1
+\def\sececsize{1200}
+
+% Subsection fonts (10pt).
+\def\ssecnominalsize{10pt}
+\setfont\ssecrm\rmbshape{10}{1000}{OT1}
+\setfont\ssecit\itbshape{10}{1000}{OT1IT}
+\setfont\ssecsl\slbshape{10}{1000}{OT1}
+\setfont\ssectt\ttbshape{10}{1000}{OT1TT}
+\setfont\ssecttsl\ttslshape{10}{1000}{OT1TT}
+\setfont\ssecsf\sfbshape{10}{1000}{OT1}
+\let\ssecbf\ssecrm
+\setfont\ssecsc\scbshape{10}{1000}{OT1}
+\font\sseci=cmmi10
+\font\ssecsy=cmsy10
+\def\ssececsize{1000}
+
+% Reduced fonts for @acro in text (9pt).
+\def\reducednominalsize{9pt}
+\setfont\reducedrm\rmshape{9}{1000}{OT1}
+\setfont\reducedtt\ttshape{9}{1000}{OT1TT}
+\setfont\reducedbf\bfshape{10}{900}{OT1}
+\setfont\reducedit\itshape{9}{1000}{OT1IT}
+\setfont\reducedsl\slshape{9}{1000}{OT1}
+\setfont\reducedsf\sfshape{9}{1000}{OT1}
+\setfont\reducedsc\scshape{10}{900}{OT1}
+\setfont\reducedttsl\ttslshape{10}{900}{OT1TT}
+\font\reducedi=cmmi9
+\font\reducedsy=cmsy9
+\def\reducedecsize{0900}
+
+\divide\parskip by 2  % reduce space between paragraphs
+\textleading = 12pt   % line spacing for 10pt CM
+\textfonts            % reset the current fonts
+\rm
+} % end of 10pt text font size definitions
+
+
+% We provide the user-level command
+%   @fonttextsize 10
+% (or 11) to redefine the text font size.  pt is assumed.
+%
+\def\xiword{11}
+\def\xword{10}
+\def\xwordpt{10pt}
+%
+\parseargdef\fonttextsize{%
+  \def\textsizearg{#1}%
+  %\wlog{doing @fonttextsize \textsizearg}%
+  %
+  % Set \globaldefs so that documents can use this inside @tex, since
+  % makeinfo 4.8 does not support it, but we need it nonetheless.
+  %
+ \begingroup \globaldefs=1
+  \ifx\textsizearg\xword \definetextfontsizex
+  \else \ifx\textsizearg\xiword \definetextfontsizexi
+  \else
+    \errhelp=\EMsimple
+    \errmessage{@fonttextsize only supports `10' or `11', not `\textsizearg'}
+  \fi\fi
+ \endgroup
+}
+
+
+% In order for the font changes to affect most math symbols and letters,
+% we have to define the \textfont of the standard families.  Since
+% texinfo doesn't allow for producing subscripts and superscripts except
+% in the main text, we don't bother to reset \scriptfont and
+% \scriptscriptfont (which would also require loading a lot more fonts).
+%
+\def\resetmathfonts{%
+  \textfont0=\tenrm \textfont1=\teni \textfont2=\tensy
+  \textfont\itfam=\tenit \textfont\slfam=\tensl \textfont\bffam=\tenbf
+  \textfont\ttfam=\tentt \textfont\sffam=\tensf
+}
+
+% The font-changing commands redefine the meanings of \tenSTYLE, instead
+% of just \STYLE.  We do this because \STYLE needs to also set the
+% current \fam for math mode.  Our \STYLE (e.g., \rm) commands hardwire
+% \tenSTYLE to set the current font.
+%
+% Each font-changing command also sets the names \lsize (one size lower)
+% and \lllsize (three sizes lower).  These relative commands are used in
+% the LaTeX logo and acronyms.
+%
+% This all needs generalizing, badly.
+%
+\def\textfonts{%
+  \let\tenrm=\textrm \let\tenit=\textit \let\tensl=\textsl
+  \let\tenbf=\textbf \let\tentt=\texttt \let\smallcaps=\textsc
+  \let\tensf=\textsf \let\teni=\texti \let\tensy=\textsy
+  \let\tenttsl=\textttsl
+  \def\curfontsize{text}%
+  \def\lsize{reduced}\def\lllsize{smaller}%
+  \resetmathfonts \setleading{\textleading}}
+\def\titlefonts{%
+  \let\tenrm=\titlerm \let\tenit=\titleit \let\tensl=\titlesl
+  \let\tenbf=\titlebf \let\tentt=\titlett \let\smallcaps=\titlesc
+  \let\tensf=\titlesf \let\teni=\titlei \let\tensy=\titlesy
+  \let\tenttsl=\titlettsl
+  \def\curfontsize{title}%
+  \def\lsize{chap}\def\lllsize{subsec}%
+  \resetmathfonts \setleading{27pt}}
+\def\titlefont#1{{\titlefonts\rmisbold #1}}
+\def\chapfonts{%
+  \let\tenrm=\chaprm \let\tenit=\chapit \let\tensl=\chapsl
+  \let\tenbf=\chapbf \let\tentt=\chaptt \let\smallcaps=\chapsc
+  \let\tensf=\chapsf \let\teni=\chapi \let\tensy=\chapsy
+  \let\tenttsl=\chapttsl
+  \def\curfontsize{chap}%
+  \def\lsize{sec}\def\lllsize{text}%
+  \resetmathfonts \setleading{19pt}}
+\def\secfonts{%
+  \let\tenrm=\secrm \let\tenit=\secit \let\tensl=\secsl
+  \let\tenbf=\secbf \let\tentt=\sectt \let\smallcaps=\secsc
+  \let\tensf=\secsf \let\teni=\seci \let\tensy=\secsy
+  \let\tenttsl=\secttsl
+  \def\curfontsize{sec}%
+  \def\lsize{subsec}\def\lllsize{reduced}%
+  \resetmathfonts \setleading{16pt}}
+\def\subsecfonts{%
+  \let\tenrm=\ssecrm \let\tenit=\ssecit \let\tensl=\ssecsl
+  \let\tenbf=\ssecbf \let\tentt=\ssectt \let\smallcaps=\ssecsc
+  \let\tensf=\ssecsf \let\teni=\sseci \let\tensy=\ssecsy
+  \let\tenttsl=\ssecttsl
+  \def\curfontsize{ssec}%
+  \def\lsize{text}\def\lllsize{small}%
+  \resetmathfonts \setleading{15pt}}
+\let\subsubsecfonts = \subsecfonts
+\def\reducedfonts{%
+  \let\tenrm=\reducedrm \let\tenit=\reducedit \let\tensl=\reducedsl
+  \let\tenbf=\reducedbf \let\tentt=\reducedtt \let\reducedcaps=\reducedsc
+  \let\tensf=\reducedsf \let\teni=\reducedi \let\tensy=\reducedsy
+  \let\tenttsl=\reducedttsl
+  \def\curfontsize{reduced}%
+  \def\lsize{small}\def\lllsize{smaller}%
+  \resetmathfonts \setleading{10.5pt}}
+\def\smallfonts{%
+  \let\tenrm=\smallrm \let\tenit=\smallit \let\tensl=\smallsl
+  \let\tenbf=\smallbf \let\tentt=\smalltt \let\smallcaps=\smallsc
+  \let\tensf=\smallsf \let\teni=\smalli \let\tensy=\smallsy
+  \let\tenttsl=\smallttsl
+  \def\curfontsize{small}%
+  \def\lsize{smaller}\def\lllsize{smaller}%
+  \resetmathfonts \setleading{10.5pt}}
+\def\smallerfonts{%
+  \let\tenrm=\smallerrm \let\tenit=\smallerit \let\tensl=\smallersl
+  \let\tenbf=\smallerbf \let\tentt=\smallertt \let\smallcaps=\smallersc
+  \let\tensf=\smallersf \let\teni=\smalleri \let\tensy=\smallersy
+  \let\tenttsl=\smallerttsl
+  \def\curfontsize{smaller}%
+  \def\lsize{smaller}\def\lllsize{smaller}%
+  \resetmathfonts \setleading{9.5pt}}
+
+% Fonts for short table of contents.
+\setfont\shortcontrm\rmshape{12}{1000}{OT1}
+\setfont\shortcontbf\bfshape{10}{\magstep1}{OT1}  % no cmb12
+\setfont\shortcontsl\slshape{12}{1000}{OT1}
+\setfont\shortconttt\ttshape{12}{1000}{OT1TT}
+
+% Define these just so they can be easily changed for other fonts.
+\def\angleleft{$\langle$}
+\def\angleright{$\rangle$}
+
+% Set the fonts to use with the @small... environments.
+\let\smallexamplefonts = \smallfonts
+
+% About \smallexamplefonts.  If we use \smallfonts (9pt), @smallexample
+% can fit this many characters:
+%   8.5x11=86   smallbook=72  a4=90  a5=69
+% If we use \scriptfonts (8pt), then we can fit this many characters:
+%   8.5x11=90+  smallbook=80  a4=90+  a5=77
+% For me, subjectively, the few extra characters that fit aren't worth
+% the additional smallness of 8pt.  So I'm making the default 9pt.
+%
+% By the way, for comparison, here's what fits with @example (10pt):
+%   8.5x11=71  smallbook=60  a4=75  a5=58
+% --karl, 24jan03.
+
+% Set up the default fonts, so we can use them for creating boxes.
+%
+\definetextfontsizexi
+
+
+\message{markup,}
+
+% Check if we are currently using a typewriter font.  Since all the
+% Computer Modern typewriter fonts have zero interword stretch (and
+% shrink), and it is reasonable to expect all typewriter fonts to have
+% this property, we can check that font parameter.
+%
+\def\ifmonospace{\ifdim\fontdimen3\font=0pt }
+
+% Markup style infrastructure.  \defmarkupstylesetup\INITMACRO will
+% define and register \INITMACRO to be called on markup style changes.
+% \INITMACRO can check \currentmarkupstyle for the innermost
+% style and the set of \ifmarkupSTYLE switches for all styles
+% currently in effect.
+\newif\ifmarkupvar
+\newif\ifmarkupsamp
+\newif\ifmarkupkey
+%\newif\ifmarkupfile % @file == @samp.
+%\newif\ifmarkupoption % @option == @samp.
+\newif\ifmarkupcode
+\newif\ifmarkupkbd
+%\newif\ifmarkupenv % @env == @code.
+%\newif\ifmarkupcommand % @command == @code.
+\newif\ifmarkuptex % @tex (and part of @math, for now).
+\newif\ifmarkupexample
+\newif\ifmarkupverb
+\newif\ifmarkupverbatim
+
+\let\currentmarkupstyle\empty
+
+\def\setupmarkupstyle#1{%
+  \csname markup#1true\endcsname
+  \def\currentmarkupstyle{#1}%
+  \markupstylesetup
+}
+
+\let\markupstylesetup\empty
+
+\def\defmarkupstylesetup#1{%
+  \expandafter\def\expandafter\markupstylesetup
+    \expandafter{\markupstylesetup #1}%
+  \def#1%
+}
+
+% Markup style setup for left and right quotes.
+\defmarkupstylesetup\markupsetuplq{%
+  \expandafter\let\expandafter \temp
+    \csname markupsetuplq\currentmarkupstyle\endcsname
+  \ifx\temp\relax \markupsetuplqdefault \else \temp \fi
+}
+
+\defmarkupstylesetup\markupsetuprq{%
+  \expandafter\let\expandafter \temp
+    \csname markupsetuprq\currentmarkupstyle\endcsname
+  \ifx\temp\relax \markupsetuprqdefault \else \temp \fi
+}
+
+{
+\catcode`\'=\active
+\catcode`\`=\active
+
+\gdef\markupsetuplqdefault{\let`\lq}
+\gdef\markupsetuprqdefault{\let'\rq}
+
+\gdef\markupsetcodequoteleft{\let`\codequoteleft}
+\gdef\markupsetcodequoteright{\let'\codequoteright}
+
+\gdef\markupsetnoligaturesquoteleft{\let`\noligaturesquoteleft}
+}
+
+\let\markupsetuplqcode \markupsetcodequoteleft
+\let\markupsetuprqcode \markupsetcodequoteright
+%
+\let\markupsetuplqexample \markupsetcodequoteleft
+\let\markupsetuprqexample \markupsetcodequoteright
+%
+\let\markupsetuplqsamp \markupsetcodequoteleft
+\let\markupsetuprqsamp \markupsetcodequoteright
+%
+\let\markupsetuplqverb \markupsetcodequoteleft
+\let\markupsetuprqverb \markupsetcodequoteright
+%
+\let\markupsetuplqverbatim \markupsetcodequoteleft
+\let\markupsetuprqverbatim \markupsetcodequoteright
+
+\let\markupsetuplqkbd \markupsetnoligaturesquoteleft
+
+% Allow an option to not use regular directed right quote/apostrophe
+% (char 0x27), but instead the undirected quote from cmtt (char 0x0d).
+% The undirected quote is ugly, so don't make it the default, but it
+% works for pasting with more pdf viewers (at least evince), the
+% lilypond developers report.  xpdf does work with the regular 0x27.
+%
+\def\codequoteright{%
+  \expandafter\ifx\csname SETtxicodequoteundirected\endcsname\relax
+    \expandafter\ifx\csname SETcodequoteundirected\endcsname\relax
+      '%
+    \else \char'15 \fi
+  \else \char'15 \fi
+}
+%
+% and a similar option for the left quote char vs. a grave accent.
+% Modern fonts display ASCII 0x60 as a grave accent, so some people like
+% the code environments to do likewise.
+%
+\def\codequoteleft{%
+  \expandafter\ifx\csname SETtxicodequotebacktick\endcsname\relax
+    \expandafter\ifx\csname SETcodequotebacktick\endcsname\relax
+      % [Knuth] pp. 380,381,391
+      % \relax disables Spanish ligatures ?` and !` of \tt font.
+      \relax`%
+    \else \char'22 \fi
+  \else \char'22 \fi
+}
+
+% Commands to set the quote options.
+% 
+\parseargdef\codequoteundirected{%
+  \def\temp{#1}%
+  \ifx\temp\onword
+    \expandafter\let\csname SETtxicodequoteundirected\endcsname
+      = t%
+  \else\ifx\temp\offword
+    \expandafter\let\csname SETtxicodequoteundirected\endcsname
+      = \relax
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @codequoteundirected value `\temp', must be on|off}%
+  \fi\fi
+}
+%
+\parseargdef\codequotebacktick{%
+  \def\temp{#1}%
+  \ifx\temp\onword
+    \expandafter\let\csname SETtxicodequotebacktick\endcsname
+      = t%
+  \else\ifx\temp\offword
+    \expandafter\let\csname SETtxicodequotebacktick\endcsname
+      = \relax
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @codequotebacktick value `\temp', must be on|off}%
+  \fi\fi
+}
+
+% [Knuth] pp. 380,381,391, disable Spanish ligatures ?` and !` of \tt font.
+\def\noligaturesquoteleft{\relax\lq}
+
+% Count depth in font-changes, for error checks
+\newcount\fontdepth \fontdepth=0
+
+% Font commands.
+
+% #1 is the font command (\sl or \it), #2 is the text to slant.
+% If we are in a monospaced environment, however, 1) always use \ttsl,
+% and 2) do not add an italic correction.
+\def\dosmartslant#1#2{%
+  \ifusingtt 
+    {{\ttsl #2}\let\next=\relax}%
+    {\def\next{{#1#2}\futurelet\next\smartitaliccorrection}}%
+  \next
+}
+\def\smartslanted{\dosmartslant\sl}
+\def\smartitalic{\dosmartslant\it}
+
+% Output an italic correction unless \next (presumed to be the following
+% character) is such as not to need one.
+\def\smartitaliccorrection{%
+  \ifx\next,%
+  \else\ifx\next-%
+  \else\ifx\next.%
+  \else\ptexslash
+  \fi\fi\fi
+  \aftersmartic
+}
+
+% like \smartslanted except unconditionally uses \ttsl, and no ic.
+% @var is set to this for defun arguments.
+\def\ttslanted#1{{\ttsl #1}}
+
+% @cite is like \smartslanted except unconditionally use \sl.  We never want
+% ttsl for book titles, do we?
+\def\cite#1{{\sl #1}\futurelet\next\smartitaliccorrection}
+
+\def\aftersmartic{}
+\def\var#1{%
+  \let\saveaftersmartic = \aftersmartic
+  \def\aftersmartic{\null\let\aftersmartic=\saveaftersmartic}%
+  \smartslanted{#1}%
+}
+
+\let\i=\smartitalic
+\let\slanted=\smartslanted
+\let\dfn=\smartslanted
+\let\emph=\smartitalic
+
+% Explicit font changes: @r, @sc, undocumented @ii.
+\def\r#1{{\rm #1}}              % roman font
+\def\sc#1{{\smallcaps#1}}       % smallcaps font
+\def\ii#1{{\it #1}}             % italic font
+
+% @b, explicit bold.  Also @strong.
+\def\b#1{{\bf #1}}
+\let\strong=\b
+
+% @sansserif, explicit sans.
+\def\sansserif#1{{\sf #1}}
+
+% We can't just use \exhyphenpenalty, because that only has effect at
+% the end of a paragraph.  Restore normal hyphenation at the end of the
+% group within which \nohyphenation is presumably called.
+%
+\def\nohyphenation{\hyphenchar\font = -1  \aftergroup\restorehyphenation}
+\def\restorehyphenation{\hyphenchar\font = `- }
+
+% Set sfcode to normal for the chars that usually have another value.
+% Can't use plain's \frenchspacing because it uses the `\x notation, and
+% sometimes \x has an active definition that messes things up.
+%
+\catcode`@=11
+  \def\plainfrenchspacing{%
+    \sfcode\dotChar  =\@m \sfcode\questChar=\@m \sfcode\exclamChar=\@m
+    \sfcode\colonChar=\@m \sfcode\semiChar =\@m \sfcode\commaChar =\@m
+    \def\endofsentencespacefactor{1000}% for @. and friends
+  }
+  \def\plainnonfrenchspacing{%
+    \sfcode`\.3000\sfcode`\?3000\sfcode`\!3000
+    \sfcode`\:2000\sfcode`\;1500\sfcode`\,1250
+    \def\endofsentencespacefactor{3000}% for @. and friends
+  }
+\catcode`@=\other
+\def\endofsentencespacefactor{3000}% default
+
+% @t, explicit typewriter.
+\def\t#1{%
+  {\tt \rawbackslash \plainfrenchspacing #1}%
+  \null
+}
+
+% @samp.
+\def\samp#1{{\setupmarkupstyle{samp}\lq\tclose{#1}\rq\null}}
+
+% definition of @key that produces a lozenge.  Doesn't adjust to text size.
+%\setfont\keyrm\rmshape{8}{1000}{OT1}
+%\font\keysy=cmsy9
+%\def\key#1{{\keyrm\textfont2=\keysy \leavevmode\hbox{%
+%  \raise0.4pt\hbox{\angleleft}\kern-.08em\vtop{%
+%    \vbox{\hrule\kern-0.4pt
+%     \hbox{\raise0.4pt\hbox{\vphantom{\angleleft}}#1}}%
+%    \kern-0.4pt\hrule}%
+%  \kern-.06em\raise0.4pt\hbox{\angleright}}}}
+
+% definition of @key with no lozenge.  If the current font is already
+% monospace, don't change it; that way, we respect @kbdinputstyle.  But
+% if it isn't monospace, then use \tt.
+%
+\def\key#1{{\setupmarkupstyle{key}%
+  \nohyphenation
+  \ifmonospace\else\tt\fi
+  #1}\null}
+
+% ctrl is no longer a Texinfo command.
+\def\ctrl #1{{\tt \rawbackslash \hat}#1}
+
+% @file, @option are the same as @samp.
+\let\file=\samp
+\let\option=\samp
+
+% @code is a modification of @t,
+% which makes spaces the same size as normal in the surrounding text.
+\def\tclose#1{%
+  {%
+    % Change normal interword space to be same as for the current font.
+    \spaceskip = \fontdimen2\font
+    %
+    % Switch to typewriter.
+    \tt
+    %
+    % But `\ ' produces the large typewriter interword space.
+    \def\ {{\spaceskip = 0pt{} }}%
+    %
+    % Turn off hyphenation.
+    \nohyphenation
+    %
+    \rawbackslash
+    \plainfrenchspacing
+    #1%
+  }%
+  \null % reset spacefactor to 1000
+}
+
+% We *must* turn on hyphenation at `-' and `_' in @code.
+% Otherwise, it is too hard to avoid overfull hboxes
+% in the Emacs manual, the Library manual, etc.
+
+% Unfortunately, TeX uses one parameter (\hyphenchar) to control
+% both hyphenation at - and hyphenation within words.
+% We must therefore turn them both off (\tclose does that)
+% and arrange explicitly to hyphenate at a dash.
+%  -- rms.
+{
+  \catcode`\-=\active \catcode`\_=\active
+  \catcode`\'=\active \catcode`\`=\active
+  \global\let'=\rq \global\let`=\lq  % default definitions
+  %
+  \global\def\code{\begingroup
+    \setupmarkupstyle{code}%
+    % The following should really be moved into \setupmarkupstyle handlers.
+    \catcode\dashChar=\active  \catcode\underChar=\active
+    \ifallowcodebreaks
+     \let-\codedash
+     \let_\codeunder
+    \else
+     \let-\realdash
+     \let_\realunder
+    \fi
+    \codex
+  }
+}
+
+\def\codex #1{\tclose{#1}\endgroup}
+
+\def\realdash{-}
+\def\codedash{-\discretionary{}{}{}}
+\def\codeunder{%
+  % this is all so @math{@code{var_name}+1} can work.  In math mode, _
+  % is "active" (mathcode"8000) and \normalunderscore (or \char95, etc.)
+  % will therefore expand the active definition of _, which is us
+  % (inside @code that is), therefore an endless loop.
+  \ifusingtt{\ifmmode
+               \mathchar"075F % class 0=ordinary, family 7=ttfam, pos 0x5F=_.
+             \else\normalunderscore \fi
+             \discretionary{}{}{}}%
+            {\_}%
+}
+
+% An additional complication: the above will allow breaks after, e.g.,
+% each of the four underscores in __typeof__.  This is undesirable in
+% some manuals, especially if they don't have long identifiers in
+% general.  @allowcodebreaks provides a way to control this.
+%
+\newif\ifallowcodebreaks  \allowcodebreakstrue
+
+\def\keywordtrue{true}
+\def\keywordfalse{false}
+
+\parseargdef\allowcodebreaks{%
+  \def\txiarg{#1}%
+  \ifx\txiarg\keywordtrue
+    \allowcodebreakstrue
+  \else\ifx\txiarg\keywordfalse
+    \allowcodebreaksfalse
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @allowcodebreaks option `\txiarg', must be true|false}%
+  \fi\fi
+}
+
+% @uref (abbreviation for `urlref') takes an optional (comma-separated)
+% second argument specifying the text to display and an optional third
+% arg as text to display instead of (rather than in addition to) the url
+% itself.  First (mandatory) arg is the url.
+% (This \urefnobreak definition isn't used now, leaving it for a while
+% for comparison.)
+\def\urefnobreak#1{\dourefnobreak #1,,,\finish}
+\def\dourefnobreak#1,#2,#3,#4\finish{\begingroup
+  \unsepspaces
+  \pdfurl{#1}%
+  \setbox0 = \hbox{\ignorespaces #3}%
+  \ifdim\wd0 > 0pt
+    \unhbox0 % third arg given, show only that
+  \else
+    \setbox0 = \hbox{\ignorespaces #2}%
+    \ifdim\wd0 > 0pt
+      \ifpdf
+        \unhbox0             % PDF: 2nd arg given, show only it
+      \else
+        \unhbox0\ (\code{#1})% DVI: 2nd arg given, show both it and url
+      \fi
+    \else
+      \code{#1}% only url given, so show it
+    \fi
+  \fi
+  \endlink
+\endgroup}
+
+% This \urefbreak definition is the active one.
+\def\urefbreak{\begingroup \urefcatcodes \dourefbreak}
+\let\uref=\urefbreak
+\def\dourefbreak#1{\urefbreakfinish #1,,,\finish}
+\def\urefbreakfinish#1,#2,#3,#4\finish{% doesn't work in @example
+  \unsepspaces
+  \pdfurl{#1}%
+  \setbox0 = \hbox{\ignorespaces #3}%
+  \ifdim\wd0 > 0pt
+    \unhbox0 % third arg given, show only that
+  \else
+    \setbox0 = \hbox{\ignorespaces #2}%
+    \ifdim\wd0 > 0pt
+      \ifpdf
+        \unhbox0             % PDF: 2nd arg given, show only it
+      \else
+        \unhbox0\ (\urefcode{#1})% DVI: 2nd arg given, show both it and url
+      \fi
+    \else
+      \urefcode{#1}% only url given, so show it
+    \fi
+  \fi
+  \endlink
+\endgroup}
+
+% Allow line breaks around only a few characters (only).
+\def\urefcatcodes{%
+  \catcode\ampChar=\active   \catcode\dotChar=\active
+  \catcode\hashChar=\active  \catcode\questChar=\active
+  \catcode\slashChar=\active
+}
+{
+  \urefcatcodes
+  %
+  \global\def\urefcode{\begingroup
+    \setupmarkupstyle{code}%
+    \urefcatcodes
+    \let&\urefcodeamp
+    \let.\urefcodedot
+    \let#\urefcodehash
+    \let?\urefcodequest
+    \let/\urefcodeslash
+    \codex
+  }
+  %
+  % By default, they are just regular characters.
+  \global\def&{\normalamp}
+  \global\def.{\normaldot}
+  \global\def#{\normalhash}
+  \global\def?{\normalquest}
+  \global\def/{\normalslash}
+}
+
+% we put a little stretch before and after the breakable chars, to help
+% line breaking of long url's.  The unequal skips make look better in
+% cmtt at least, especially for dots.
+\def\urefprestretch{\urefprebreak \hskip0pt plus.13em }
+\def\urefpoststretch{\urefpostbreak \hskip0pt plus.1em }
+%
+\def\urefcodeamp{\urefprestretch \&\urefpoststretch}
+\def\urefcodedot{\urefprestretch .\urefpoststretch}
+\def\urefcodehash{\urefprestretch \#\urefpoststretch}
+\def\urefcodequest{\urefprestretch ?\urefpoststretch}
+\def\urefcodeslash{\futurelet\next\urefcodeslashfinish}
+{
+  \catcode`\/=\active
+  \global\def\urefcodeslashfinish{%
+    \urefprestretch \slashChar
+    % Allow line break only after the final / in a sequence of
+    % slashes, to avoid line break between the slashes in http://.
+    \ifx\next/\else \urefpoststretch \fi
+  }
+}
+
+% One more complication: by default we'll break after the special
+% characters, but some people like to break before the special chars, so
+% allow that.  Also allow no breaking at all, for manual control.
+% 
+\parseargdef\urefbreakstyle{%
+  \def\txiarg{#1}%
+  \ifx\txiarg\wordnone
+    \def\urefprebreak{\nobreak}\def\urefpostbreak{\nobreak}
+  \else\ifx\txiarg\wordbefore
+    \def\urefprebreak{\allowbreak}\def\urefpostbreak{\nobreak}
+  \else\ifx\txiarg\wordafter
+    \def\urefprebreak{\nobreak}\def\urefpostbreak{\allowbreak}
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @urefbreakstyle setting `\txiarg'}%
+  \fi\fi\fi
+}
+\def\wordafter{after}
+\def\wordbefore{before}
+\def\wordnone{none}
+
+\urefbreakstyle after
+
+% @url synonym for @uref, since that's how everyone uses it.
+%
+\let\url=\uref
+
+% rms does not like angle brackets --karl, 17may97.
+% So now @email is just like @uref, unless we are pdf.
+%
+%\def\email#1{\angleleft{\tt #1}\angleright}
+\ifpdf
+  \def\email#1{\doemail#1,,\finish}
+  \def\doemail#1,#2,#3\finish{\begingroup
+    \unsepspaces
+    \pdfurl{mailto:#1}%
+    \setbox0 = \hbox{\ignorespaces #2}%
+    \ifdim\wd0>0pt\unhbox0\else\code{#1}\fi
+    \endlink
+  \endgroup}
+\else
+  \let\email=\uref
+\fi
+
+% @kbd is like @code, except that if the argument is just one @key command,
+% then @kbd has no effect.
+\def\kbd#1{{\setupmarkupstyle{kbd}\def\look{#1}\expandafter\kbdfoo\look??\par}}
+
+% @kbdinputstyle -- arg is `distinct' (@kbd uses slanted tty font always),
+%   `example' (@kbd uses ttsl only inside of @example and friends),
+%   or `code' (@kbd uses normal tty font always).
+\parseargdef\kbdinputstyle{%
+  \def\txiarg{#1}%
+  \ifx\txiarg\worddistinct
+    \gdef\kbdexamplefont{\ttsl}\gdef\kbdfont{\ttsl}%
+  \else\ifx\txiarg\wordexample
+    \gdef\kbdexamplefont{\ttsl}\gdef\kbdfont{\tt}%
+  \else\ifx\txiarg\wordcode
+    \gdef\kbdexamplefont{\tt}\gdef\kbdfont{\tt}%
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @kbdinputstyle setting `\txiarg'}%
+  \fi\fi\fi
+}
+\def\worddistinct{distinct}
+\def\wordexample{example}
+\def\wordcode{code}
+
+% Default is `distinct'.
+\kbdinputstyle distinct
+
+\def\xkey{\key}
+\def\kbdfoo#1#2#3\par{\def\one{#1}\def\three{#3}\def\threex{??}%
+\ifx\one\xkey\ifx\threex\three \key{#2}%
+\else{\tclose{\kbdfont\setupmarkupstyle{kbd}\look}}\fi
+\else{\tclose{\kbdfont\setupmarkupstyle{kbd}\look}}\fi}
+
+% For @indicateurl, @env, @command quotes seem unnecessary, so use \code.
+\let\indicateurl=\code
+\let\env=\code
+\let\command=\code
+
+% @clicksequence{File @click{} Open ...}
+\def\clicksequence#1{\begingroup #1\endgroup}
+
+% @clickstyle @arrow   (by default)
+\parseargdef\clickstyle{\def\click{#1}}
+\def\click{\arrow}
+
+% Typeset a dimension, e.g., `in' or `pt'.  The only reason for the
+% argument is to make the input look right: @dmn{pt} instead of @dmn{}pt.
+%
+\def\dmn#1{\thinspace #1}
+
+% @l was never documented to mean ``switch to the Lisp font'',
+% and it is not used as such in any manual I can find.  We need it for
+% Polish suppressed-l.  --karl, 22sep96.
+%\def\l#1{{\li #1}\null}
+
+% @acronym for "FBI", "NATO", and the like.
+% We print this one point size smaller, since it's intended for
+% all-uppercase.
+%
+\def\acronym#1{\doacronym #1,,\finish}
+\def\doacronym#1,#2,#3\finish{%
+  {\selectfonts\lsize #1}%
+  \def\temp{#2}%
+  \ifx\temp\empty \else
+    \space ({\unsepspaces \ignorespaces \temp \unskip})%
+  \fi
+  \null % reset \spacefactor=1000
+}
+
+% @abbr for "Comput. J." and the like.
+% No font change, but don't do end-of-sentence spacing.
+%
+\def\abbr#1{\doabbr #1,,\finish}
+\def\doabbr#1,#2,#3\finish{%
+  {\plainfrenchspacing #1}%
+  \def\temp{#2}%
+  \ifx\temp\empty \else
+    \space ({\unsepspaces \ignorespaces \temp \unskip})%
+  \fi
+  \null % reset \spacefactor=1000
+}
+
+% @asis just yields its argument.  Used with @table, for example.
+%
+\def\asis#1{#1}
+
+% @math outputs its argument in math mode.
+%
+% One complication: _ usually means subscripts, but it could also mean
+% an actual _ character, as in @math{@var{some_variable} + 1}.  So make
+% _ active, and distinguish by seeing if the current family is \slfam,
+% which is what @var uses.
+{
+  \catcode`\_ = \active
+  \gdef\mathunderscore{%
+    \catcode`\_=\active
+    \def_{\ifnum\fam=\slfam \_\else\sb\fi}%
+  }
+}
+% Another complication: we want \\ (and @\) to output a math (or tt) \.
+% FYI, plain.tex uses \\ as a temporary control sequence (for no
+% particular reason), but this is not advertised and we don't care.
+%
+% The \mathchar is class=0=ordinary, family=7=ttfam, position=5C=\.
+\def\mathbackslash{\ifnum\fam=\ttfam \mathchar"075C \else\backslash \fi}
+%
+\def\math{%
+  \tex
+  \mathunderscore
+  \let\\ = \mathbackslash
+  \mathactive
+  % make the texinfo accent commands work in math mode
+  \let\"=\ddot
+  \let\'=\acute
+  \let\==\bar
+  \let\^=\hat
+  \let\`=\grave
+  \let\u=\breve
+  \let\v=\check
+  \let\~=\tilde
+  \let\dotaccent=\dot
+  $\finishmath
+}
+\def\finishmath#1{#1$\endgroup}  % Close the group opened by \tex.
+
+% Some active characters (such as <) are spaced differently in math.
+% We have to reset their definitions in case the @math was an argument
+% to a command which sets the catcodes (such as @item or @section).
+%
+{
+  \catcode`^ = \active
+  \catcode`< = \active
+  \catcode`> = \active
+  \catcode`+ = \active
+  \catcode`' = \active
+  \gdef\mathactive{%
+    \let^ = \ptexhat
+    \let< = \ptexless
+    \let> = \ptexgtr
+    \let+ = \ptexplus
+    \let' = \ptexquoteright
+  }
+}
+
+% @inlinefmt{FMTNAME,PROCESSED-TEXT} and @inlineraw{FMTNAME,RAW-TEXT}.
+% Ignore unless FMTNAME == tex; then it is like @iftex and @tex,
+% except specified as a normal braced arg, so no newlines to worry about.
+% 
+\def\outfmtnametex{tex}
+%
+\long\def\inlinefmt#1{\doinlinefmt #1,\finish}
+\long\def\doinlinefmt#1,#2,\finish{%
+  \def\inlinefmtname{#1}%
+  \ifx\inlinefmtname\outfmtnametex \ignorespaces #2\fi
+}
+% For raw, must switch into @tex before parsing the argument, to avoid
+% setting catcodes prematurely.  Doing it this way means that, for
+% example, @inlineraw{html, foo{bar} gets a parse error instead of being
+% ignored.  But this isn't important because if people want a literal
+% *right* brace they would have to use a command anyway, so they may as
+% well use a command to get a left brace too.  We could re-use the
+% delimiter character idea from \verb, but it seems like overkill.
+% 
+\long\def\inlineraw{\tex \doinlineraw}
+\long\def\doinlineraw#1{\doinlinerawtwo #1,\finish}
+\def\doinlinerawtwo#1,#2,\finish{%
+  \def\inlinerawname{#1}%
+  \ifx\inlinerawname\outfmtnametex \ignorespaces #2\fi
+  \endgroup % close group opened by \tex.
+}
+
+
+\message{glyphs,}
+% and logos.
+
+% @@ prints an @, as does @atchar{}.
+\def\@{\char64 }
+\let\atchar=\@
+
+% @{ @} @lbracechar{} @rbracechar{} all generate brace characters.
+% Unless we're in typewriter, use \ecfont because the CM text fonts do
+% not have braces, and we don't want to switch into math.
+\def\mylbrace{{\ifmonospace\else\ecfont\fi \char123}}
+\def\myrbrace{{\ifmonospace\else\ecfont\fi \char125}}
+\let\{=\mylbrace \let\lbracechar=\{
+\let\}=\myrbrace \let\rbracechar=\}
+\begingroup
+  % Definitions to produce \{ and \} commands for indices,
+  % and @{ and @} for the aux/toc files.
+  \catcode`\{ = \other \catcode`\} = \other
+  \catcode`\[ = 1 \catcode`\] = 2
+  \catcode`\! = 0 \catcode`\\ = \other
+  !gdef!lbracecmd[\{]%
+  !gdef!rbracecmd[\}]%
+  !gdef!lbraceatcmd[@{]%
+  !gdef!rbraceatcmd[@}]%
+!endgroup
+
+% @comma{} to avoid , parsing problems.
+\let\comma = ,
+
+% Accents: @, @dotaccent @ringaccent @ubaraccent @udotaccent
+% Others are defined by plain TeX: @` @' @" @^ @~ @= @u @v @H.
+\let\, = \ptexc
+\let\dotaccent = \ptexdot
+\def\ringaccent#1{{\accent23 #1}}
+\let\tieaccent = \ptext
+\let\ubaraccent = \ptexb
+\let\udotaccent = \d
+
+% Other special characters: @questiondown @exclamdown @ordf @ordm
+% Plain TeX defines: @AA @AE @O @OE @L (plus lowercase versions) @ss.
+\def\questiondown{?`}
+\def\exclamdown{!`}
+\def\ordf{\leavevmode\raise1ex\hbox{\selectfonts\lllsize \underbar{a}}}
+\def\ordm{\leavevmode\raise1ex\hbox{\selectfonts\lllsize \underbar{o}}}
+
+% Dotless i and dotless j, used for accents.
+\def\imacro{i}
+\def\jmacro{j}
+\def\dotless#1{%
+  \def\temp{#1}%
+  \ifx\temp\imacro \ifmmode\imath \else\ptexi \fi
+  \else\ifx\temp\jmacro \ifmmode\jmath \else\j \fi
+  \else \errmessage{@dotless can be used only with i or j}%
+  \fi\fi
+}
+
+% The \TeX{} logo, as in plain, but resetting the spacing so that a
+% period following counts as ending a sentence.  (Idea found in latex.)
+%
+\edef\TeX{\TeX \spacefactor=1000 }
+
+% @LaTeX{} logo.  Not quite the same results as the definition in
+% latex.ltx, since we use a different font for the raised A; it's most
+% convenient for us to use an explicitly smaller font, rather than using
+% the \scriptstyle font (since we don't reset \scriptstyle and
+% \scriptscriptstyle).
+%
+\def\LaTeX{%
+  L\kern-.36em
+  {\setbox0=\hbox{T}%
+   \vbox to \ht0{\hbox{%
+     \ifx\textnominalsize\xwordpt
+       % for 10pt running text, \lllsize (8pt) is too small for the A in LaTeX.
+       % Revert to plain's \scriptsize, which is 7pt.
+       \count255=\the\fam $\fam\count255 \scriptstyle A$%
+     \else
+       % For 11pt, we can use our lllsize.
+       \selectfonts\lllsize A%
+     \fi
+     }%
+     \vss
+  }}%
+  \kern-.15em
+  \TeX
+}
+
+% Some math mode symbols.
+\def\bullet{$\ptexbullet$}
+\def\geq{\ifmmode \ge\else $\ge$\fi}
+\def\leq{\ifmmode \le\else $\le$\fi}
+\def\minus{\ifmmode -\else $-$\fi}
+
+% @dots{} outputs an ellipsis using the current font.
+% We do .5em per period so that it has the same spacing in the cm
+% typewriter fonts as three actual period characters; on the other hand,
+% in other typewriter fonts three periods are wider than 1.5em.  So do
+% whichever is larger.
+%
+\def\dots{%
+  \leavevmode
+  \setbox0=\hbox{...}% get width of three periods
+  \ifdim\wd0 > 1.5em
+    \dimen0 = \wd0
+  \else
+    \dimen0 = 1.5em
+  \fi
+  \hbox to \dimen0{%
+    \hskip 0pt plus.25fil
+    .\hskip 0pt plus1fil
+    .\hskip 0pt plus1fil
+    .\hskip 0pt plus.5fil
+  }%
+}
+
+% @enddots{} is an end-of-sentence ellipsis.
+%
+\def\enddots{%
+  \dots
+  \spacefactor=\endofsentencespacefactor
+}
+
+% @point{}, @result{}, @expansion{}, @print{}, @equiv{}.
+%
+% Since these characters are used in examples, they should be an even number of
+% \tt widths. Each \tt character is 1en, so two makes it 1em.
+%
+\def\point{$\star$}
+\def\arrow{\leavevmode\raise.05ex\hbox to 1em{\hfil$\rightarrow$\hfil}}
+\def\result{\leavevmode\raise.05ex\hbox to 1em{\hfil$\Rightarrow$\hfil}}
+\def\expansion{\leavevmode\hbox to 1em{\hfil$\mapsto$\hfil}}
+\def\print{\leavevmode\lower.1ex\hbox to 1em{\hfil$\dashv$\hfil}}
+\def\equiv{\leavevmode\hbox to 1em{\hfil$\ptexequiv$\hfil}}
+
+% The @error{} command.
+% Adapted from the TeXbook's \boxit.
+%
+\newbox\errorbox
+%
+{\tentt \global\dimen0 = 3em}% Width of the box.
+\dimen2 = .55pt % Thickness of rules
+% The text. (`r' is open on the right, `e' somewhat less so on the left.)
+\setbox0 = \hbox{\kern-.75pt \reducedsf \putworderror\kern-1.5pt}
+%
+\setbox\errorbox=\hbox to \dimen0{\hfil
+   \hsize = \dimen0 \advance\hsize by -5.8pt % Space to left+right.
+   \advance\hsize by -2\dimen2 % Rules.
+   \vbox{%
+      \hrule height\dimen2
+      \hbox{\vrule width\dimen2 \kern3pt          % Space to left of text.
+         \vtop{\kern2.4pt \box0 \kern2.4pt}% Space above/below.
+         \kern3pt\vrule width\dimen2}% Space to right.
+      \hrule height\dimen2}
+    \hfil}
+%
+\def\error{\leavevmode\lower.7ex\copy\errorbox}
+
+% @pounds{} is a sterling sign, which Knuth put in the CM italic font.
+%
+\def\pounds{{\it\$}}
+
+% @euro{} comes from a separate font, depending on the current style.
+% We use the free feym* fonts from the eurosym package by Henrik
+% Theiling, which support regular, slanted, bold and bold slanted (and
+% "outlined" (blackboard board, sort of) versions, which we don't need).
+% It is available from http://www.ctan.org/tex-archive/fonts/eurosym.
+%
+% Although only regular is the truly official Euro symbol, we ignore
+% that.  The Euro is designed to be slightly taller than the regular
+% font height.
+%
+% feymr - regular
+% feymo - slanted
+% feybr - bold
+% feybo - bold slanted
+%
+% There is no good (free) typewriter version, to my knowledge.
+% A feymr10 euro is ~7.3pt wide, while a normal cmtt10 char is ~5.25pt wide.
+% Hmm.
+%
+% Also doesn't work in math.  Do we need to do math with euro symbols?
+% Hope not.
+%
+%
+\def\euro{{\eurofont e}}
+\def\eurofont{%
+  % We set the font at each command, rather than predefining it in
+  % \textfonts and the other font-switching commands, so that
+  % installations which never need the symbol don't have to have the
+  % font installed.
+  %
+  % There is only one designed size (nominal 10pt), so we always scale
+  % that to the current nominal size.
+  %
+  % By the way, simply using "at 1em" works for cmr10 and the like, but
+  % does not work for cmbx10 and other extended/shrunken fonts.
+  %
+  \def\eurosize{\csname\curfontsize nominalsize\endcsname}%
+  %
+  \ifx\curfontstyle\bfstylename
+    % bold:
+    \font\thiseurofont = \ifusingit{feybo10}{feybr10} at \eurosize
+  \else
+    % regular:
+    \font\thiseurofont = \ifusingit{feymo10}{feymr10} at \eurosize
+  \fi
+  \thiseurofont
+}
+
+% Glyphs from the EC fonts.  We don't use \let for the aliases, because
+% sometimes we redefine the original macro, and the alias should reflect
+% the redefinition.
+%
+% Use LaTeX names for the Icelandic letters.
+\def\DH{{\ecfont \char"D0}} % Eth
+\def\dh{{\ecfont \char"F0}} % eth
+\def\TH{{\ecfont \char"DE}} % Thorn
+\def\th{{\ecfont \char"FE}} % thorn
+%
+\def\guillemetleft{{\ecfont \char"13}}
+\def\guillemotleft{\guillemetleft}
+\def\guillemetright{{\ecfont \char"14}}
+\def\guillemotright{\guillemetright}
+\def\guilsinglleft{{\ecfont \char"0E}}
+\def\guilsinglright{{\ecfont \char"0F}}
+\def\quotedblbase{{\ecfont \char"12}}
+\def\quotesinglbase{{\ecfont \char"0D}}
+%
+% This positioning is not perfect (see the ogonek LaTeX package), but
+% we have the precomposed glyphs for the most common cases.  We put the
+% tests to use those glyphs in the single \ogonek macro so we have fewer
+% dummy definitions to worry about for index entries, etc.
+%
+% ogonek is also used with other letters in Lithuanian (IOU), but using
+% the precomposed glyphs for those is not so easy since they aren't in
+% the same EC font.
+\def\ogonek#1{{%
+  \def\temp{#1}%
+  \ifx\temp\macrocharA\Aogonek
+  \else\ifx\temp\macrochara\aogonek
+  \else\ifx\temp\macrocharE\Eogonek
+  \else\ifx\temp\macrochare\eogonek
+  \else
+    \ecfont \setbox0=\hbox{#1}%
+    \ifdim\ht0=1ex\accent"0C #1%
+    \else\ooalign{\unhbox0\crcr\hidewidth\char"0C \hidewidth}%
+    \fi
+  \fi\fi\fi\fi
+  }%
+}
+\def\Aogonek{{\ecfont \char"81}}\def\macrocharA{A}
+\def\aogonek{{\ecfont \char"A1}}\def\macrochara{a}
+\def\Eogonek{{\ecfont \char"86}}\def\macrocharE{E}
+\def\eogonek{{\ecfont \char"A6}}\def\macrochare{e}
+%
+% Use the ec* fonts (cm-super in outline format) for non-CM glyphs.
+\def\ecfont{%
+  % We can't distinguish serif/sans and italic/slanted, but this
+  % is used for crude hacks anyway (like adding French and German
+  % quotes to documents typeset with CM, where we lose kerning), so
+  % hopefully nobody will notice/care.
+  \edef\ecsize{\csname\curfontsize ecsize\endcsname}%
+  \edef\nominalsize{\csname\curfontsize nominalsize\endcsname}%
+  \ifx\curfontstyle\bfstylename
+    % bold:
+    \font\thisecfont = ecb\ifusingit{i}{x}\ecsize \space at \nominalsize
+  \else
+    % regular:
+    \font\thisecfont = ec\ifusingit{ti}{rm}\ecsize \space at \nominalsize
+  \fi
+  \thisecfont
+}
+
+% @registeredsymbol - R in a circle.  The font for the R should really
+% be smaller yet, but lllsize is the best we can do for now.
+% Adapted from the plain.tex definition of \copyright.
+%
+\def\registeredsymbol{%
+  $^{{\ooalign{\hfil\raise.07ex\hbox{\selectfonts\lllsize R}%
+               \hfil\crcr\Orb}}%
+    }$%
+}
+
+% @textdegree - the normal degrees sign.
+%
+\def\textdegree{$^\circ$}
+
+% Laurent Siebenmann reports \Orb undefined with:
+%  Textures 1.7.7 (preloaded format=plain 93.10.14)  (68K)  16 APR 2004 02:38
+% so we'll define it if necessary.
+%
+\ifx\Orb\thisisundefined
+\def\Orb{\mathhexbox20D}
+\fi
+
+% Quotes.
+\chardef\quotedblleft="5C
+\chardef\quotedblright=`\"
+\chardef\quoteleft=`\`
+\chardef\quoteright=`\'
+
+
+\message{page headings,}
+
+\newskip\titlepagetopglue \titlepagetopglue = 1.5in
+\newskip\titlepagebottomglue \titlepagebottomglue = 2pc
+
+% First the title page.  Must do @settitle before @titlepage.
+\newif\ifseenauthor
+\newif\iffinishedtitlepage
+
+% Do an implicit @contents or @shortcontents after @end titlepage if the
+% user says @setcontentsaftertitlepage or @setshortcontentsaftertitlepage.
+%
+\newif\ifsetcontentsaftertitlepage
+ \let\setcontentsaftertitlepage = \setcontentsaftertitlepagetrue
+\newif\ifsetshortcontentsaftertitlepage
+ \let\setshortcontentsaftertitlepage = \setshortcontentsaftertitlepagetrue
+
+\parseargdef\shorttitlepage{%
+  \begingroup \hbox{}\vskip 1.5in \chaprm \centerline{#1}%
+  \endgroup\page\hbox{}\page}
+
+\envdef\titlepage{%
+  % Open one extra group, as we want to close it in the middle of \Etitlepage.
+  \begingroup
+    \parindent=0pt \textfonts
+    % Leave some space at the very top of the page.
+    \vglue\titlepagetopglue
+    % No rule at page bottom unless we print one at the top with @title.
+    \finishedtitlepagetrue
+    %
+    % Most title ``pages'' are actually two pages long, with space
+    % at the top of the second.  We don't want the ragged left on the second.
+    \let\oldpage = \page
+    \def\page{%
+      \iffinishedtitlepage\else
+	 \finishtitlepage
+      \fi
+      \let\page = \oldpage
+      \page
+      \null
+    }%
+}
+
+\def\Etitlepage{%
+    \iffinishedtitlepage\else
+	\finishtitlepage
+    \fi
+    % It is important to do the page break before ending the group,
+    % because the headline and footline are only empty inside the group.
+    % If we use the new definition of \page, we always get a blank page
+    % after the title page, which we certainly don't want.
+    \oldpage
+  \endgroup
+  %
+  % Need this before the \...aftertitlepage checks so that if they are
+  % in effect the toc pages will come out with page numbers.
+  \HEADINGSon
+  %
+  % If they want short, they certainly want long too.
+  \ifsetshortcontentsaftertitlepage
+    \shortcontents
+    \contents
+    \global\let\shortcontents = \relax
+    \global\let\contents = \relax
+  \fi
+  %
+  \ifsetcontentsaftertitlepage
+    \contents
+    \global\let\contents = \relax
+    \global\let\shortcontents = \relax
+  \fi
+}
+
+\def\finishtitlepage{%
+  \vskip4pt \hrule height 2pt width \hsize
+  \vskip\titlepagebottomglue
+  \finishedtitlepagetrue
+}
+
+% Macros to be used within @titlepage:
+
+\let\subtitlerm=\tenrm
+\def\subtitlefont{\subtitlerm \normalbaselineskip = 13pt \normalbaselines}
+
+\parseargdef\title{%
+  \checkenv\titlepage
+  \leftline{\titlefonts\rmisbold #1}
+  % print a rule at the page bottom also.
+  \finishedtitlepagefalse
+  \vskip4pt \hrule height 4pt width \hsize \vskip4pt
+}
+
+\parseargdef\subtitle{%
+  \checkenv\titlepage
+  {\subtitlefont \rightline{#1}}%
+}
+
+% @author should come last, but may come many times.
+% It can also be used inside @quotation.
+%
+\parseargdef\author{%
+  \def\temp{\quotation}%
+  \ifx\thisenv\temp
+    \def\quotationauthor{#1}% printed in \Equotation.
+  \else
+    \checkenv\titlepage
+    \ifseenauthor\else \vskip 0pt plus 1filll \seenauthortrue \fi
+    {\secfonts\rmisbold \leftline{#1}}%
+  \fi
+}
+
+
+% Set up page headings and footings.
+
+\let\thispage=\folio
+
+\newtoks\evenheadline    % headline on even pages
+\newtoks\oddheadline     % headline on odd pages
+\newtoks\evenfootline    % footline on even pages
+\newtoks\oddfootline     % footline on odd pages
+
+% Now make TeX use those variables
+\headline={{\textfonts\rm \ifodd\pageno \the\oddheadline
+                            \else \the\evenheadline \fi}}
+\footline={{\textfonts\rm \ifodd\pageno \the\oddfootline
+                            \else \the\evenfootline \fi}\HEADINGShook}
+\let\HEADINGShook=\relax
+
+% Commands to set those variables.
+% For example, this is what  @headings on  does
+% @evenheading @thistitle|@thispage|@thischapter
+% @oddheading @thischapter|@thispage|@thistitle
+% @evenfooting @thisfile||
+% @oddfooting ||@thisfile
+
+
+\def\evenheading{\parsearg\evenheadingxxx}
+\def\evenheadingxxx #1{\evenheadingyyy #1\|\|\|\|\finish}
+\def\evenheadingyyy #1\|#2\|#3\|#4\finish{%
+\global\evenheadline={\rlap{\centerline{#2}}\line{#1\hfil#3}}}
+
+\def\oddheading{\parsearg\oddheadingxxx}
+\def\oddheadingxxx #1{\oddheadingyyy #1\|\|\|\|\finish}
+\def\oddheadingyyy #1\|#2\|#3\|#4\finish{%
+\global\oddheadline={\rlap{\centerline{#2}}\line{#1\hfil#3}}}
+
+\parseargdef\everyheading{\oddheadingxxx{#1}\evenheadingxxx{#1}}%
+
+\def\evenfooting{\parsearg\evenfootingxxx}
+\def\evenfootingxxx #1{\evenfootingyyy #1\|\|\|\|\finish}
+\def\evenfootingyyy #1\|#2\|#3\|#4\finish{%
+\global\evenfootline={\rlap{\centerline{#2}}\line{#1\hfil#3}}}
+
+\def\oddfooting{\parsearg\oddfootingxxx}
+\def\oddfootingxxx #1{\oddfootingyyy #1\|\|\|\|\finish}
+\def\oddfootingyyy #1\|#2\|#3\|#4\finish{%
+  \global\oddfootline = {\rlap{\centerline{#2}}\line{#1\hfil#3}}%
+  %
+  % Leave some space for the footline.  Hopefully ok to assume
+  % @evenfooting will not be used by itself.
+  \global\advance\pageheight by -12pt
+  \global\advance\vsize by -12pt
+}
+
+\parseargdef\everyfooting{\oddfootingxxx{#1}\evenfootingxxx{#1}}
+
+% @evenheadingmarks top     \thischapter <- chapter at the top of a page
+% @evenheadingmarks bottom  \thischapter <- chapter at the bottom of a page
+%
+% The same set of arguments for:
+%
+% @oddheadingmarks
+% @evenfootingmarks
+% @oddfootingmarks
+% @everyheadingmarks
+% @everyfootingmarks
+
+\def\evenheadingmarks{\headingmarks{even}{heading}}
+\def\oddheadingmarks{\headingmarks{odd}{heading}}
+\def\evenfootingmarks{\headingmarks{even}{footing}}
+\def\oddfootingmarks{\headingmarks{odd}{footing}}
+\def\everyheadingmarks#1 {\headingmarks{even}{heading}{#1}
+                          \headingmarks{odd}{heading}{#1} }
+\def\everyfootingmarks#1 {\headingmarks{even}{footing}{#1}
+                          \headingmarks{odd}{footing}{#1} }
+% #1 = even/odd, #2 = heading/footing, #3 = top/bottom.
+\def\headingmarks#1#2#3 {%
+  \expandafter\let\expandafter\temp \csname get#3headingmarks\endcsname
+  \global\expandafter\let\csname get#1#2marks\endcsname \temp
+}
+
+\everyheadingmarks bottom
+\everyfootingmarks bottom
+
+% @headings double      turns headings on for double-sided printing.
+% @headings single      turns headings on for single-sided printing.
+% @headings off         turns them off.
+% @headings on          same as @headings double, retained for compatibility.
+% @headings after       turns on double-sided headings after this page.
+% @headings doubleafter turns on double-sided headings after this page.
+% @headings singleafter turns on single-sided headings after this page.
+% By default, they are off at the start of a document,
+% and turned `on' after @end titlepage.
+
+\def\headings #1 {\csname HEADINGS#1\endcsname}
+
+\def\headingsoff{% non-global headings elimination
+  \evenheadline={\hfil}\evenfootline={\hfil}%
+   \oddheadline={\hfil}\oddfootline={\hfil}%
+}
+
+\def\HEADINGSoff{{\globaldefs=1 \headingsoff}} % global setting
+\HEADINGSoff  % it's the default
+
+% When we turn headings on, set the page number to 1.
+% For double-sided printing, put current file name in lower left corner,
+% chapter name on inside top of right hand pages, document
+% title on inside top of left hand pages, and page numbers on outside top
+% edge of all pages.
+\def\HEADINGSdouble{%
+\global\pageno=1
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\folio\hfil\thistitle}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+\global\let\contentsalignmacro = \chapoddpage
+}
+\let\contentsalignmacro = \chappager
+
+% For single-sided printing, chapter title goes across top left of page,
+% page number on top right.
+\def\HEADINGSsingle{%
+\global\pageno=1
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\thischapter\hfil\folio}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+\global\let\contentsalignmacro = \chappager
+}
+\def\HEADINGSon{\HEADINGSdouble}
+
+\def\HEADINGSafter{\let\HEADINGShook=\HEADINGSdoublex}
+\let\HEADINGSdoubleafter=\HEADINGSafter
+\def\HEADINGSdoublex{%
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\folio\hfil\thistitle}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+\global\let\contentsalignmacro = \chapoddpage
+}
+
+\def\HEADINGSsingleafter{\let\HEADINGShook=\HEADINGSsinglex}
+\def\HEADINGSsinglex{%
+\global\evenfootline={\hfil}
+\global\oddfootline={\hfil}
+\global\evenheadline={\line{\thischapter\hfil\folio}}
+\global\oddheadline={\line{\thischapter\hfil\folio}}
+\global\let\contentsalignmacro = \chappager
+}
+
+% Subroutines used in generating headings
+% This produces Day Month Year style of output.
+% Only define if not already defined, in case a txi-??.tex file has set
+% up a different format (e.g., txi-cs.tex does this).
+\ifx\today\thisisundefined
+\def\today{%
+  \number\day\space
+  \ifcase\month
+  \or\putwordMJan\or\putwordMFeb\or\putwordMMar\or\putwordMApr
+  \or\putwordMMay\or\putwordMJun\or\putwordMJul\or\putwordMAug
+  \or\putwordMSep\or\putwordMOct\or\putwordMNov\or\putwordMDec
+  \fi
+  \space\number\year}
+\fi
+
+% @settitle line...  specifies the title of the document, for headings.
+% It generates no output of its own.
+\def\thistitle{\putwordNoTitle}
+\def\settitle{\parsearg{\gdef\thistitle}}
+
+
+\message{tables,}
+% Tables -- @table, @ftable, @vtable, @item(x).
+
+% default indentation of table text
+\newdimen\tableindent \tableindent=.8in
+% default indentation of @itemize and @enumerate text
+\newdimen\itemindent  \itemindent=.3in
+% margin between end of table item and start of table text.
+\newdimen\itemmargin  \itemmargin=.1in
+
+% used internally for \itemindent minus \itemmargin
+\newdimen\itemmax
+
+% Note @table, @ftable, and @vtable define @item, @itemx, etc., with
+% these defs.
+% They also define \itemindex
+% to index the item name in whatever manner is desired (perhaps none).
+
+\newif\ifitemxneedsnegativevskip
+
+\def\itemxpar{\par\ifitemxneedsnegativevskip\nobreak\vskip-\parskip\nobreak\fi}
+
+\def\internalBitem{\smallbreak \parsearg\itemzzz}
+\def\internalBitemx{\itemxpar \parsearg\itemzzz}
+
+\def\itemzzz #1{\begingroup %
+  \advance\hsize by -\rightskip
+  \advance\hsize by -\tableindent
+  \setbox0=\hbox{\itemindicate{#1}}%
+  \itemindex{#1}%
+  \nobreak % This prevents a break before @itemx.
+  %
+  % If the item text does not fit in the space we have, put it on a line
+  % by itself, and do not allow a page break either before or after that
+  % line.  We do not start a paragraph here because then if the next
+  % command is, e.g., @kindex, the whatsit would get put into the
+  % horizontal list on a line by itself, resulting in extra blank space.
+  \ifdim \wd0>\itemmax
+    %
+    % Make this a paragraph so we get the \parskip glue and wrapping,
+    % but leave it ragged-right.
+    \begingroup
+      \advance\leftskip by-\tableindent
+      \advance\hsize by\tableindent
+      \advance\rightskip by0pt plus1fil\relax
+      \leavevmode\unhbox0\par
+    \endgroup
+    %
+    % We're going to be starting a paragraph, but we don't want the
+    % \parskip glue -- logically it's part of the @item we just started.
+    \nobreak \vskip-\parskip
+    %
+    % Stop a page break at the \parskip glue coming up.  However, if
+    % what follows is an environment such as @example, there will be no
+    % \parskip glue; then the negative vskip we just inserted would
+    % cause the example and the item to crash together.  So we use this
+    % bizarre value of 10001 as a signal to \aboveenvbreak to insert
+    % \parskip glue after all.  Section titles are handled this way also.
+    %
+    \penalty 10001
+    \endgroup
+    \itemxneedsnegativevskipfalse
+  \else
+    % The item text fits into the space.  Start a paragraph, so that the
+    % following text (if any) will end up on the same line.
+    \noindent
+    % Do this with kerns and \unhbox so that if there is a footnote in
+    % the item text, it can migrate to the main vertical list and
+    % eventually be printed.
+    \nobreak\kern-\tableindent
+    \dimen0 = \itemmax  \advance\dimen0 by \itemmargin \advance\dimen0 by -\wd0
+    \unhbox0
+    \nobreak\kern\dimen0
+    \endgroup
+    \itemxneedsnegativevskiptrue
+  \fi
+}
+
+\def\item{\errmessage{@item while not in a list environment}}
+\def\itemx{\errmessage{@itemx while not in a list environment}}
+
+% @table, @ftable, @vtable.
+\envdef\table{%
+  \let\itemindex\gobble
+  \tablecheck{table}%
+}
+\envdef\ftable{%
+  \def\itemindex ##1{\doind {fn}{\code{##1}}}%
+  \tablecheck{ftable}%
+}
+\envdef\vtable{%
+  \def\itemindex ##1{\doind {vr}{\code{##1}}}%
+  \tablecheck{vtable}%
+}
+\def\tablecheck#1{%
+  \ifnum \the\catcode`\^^M=\active
+    \endgroup
+    \errmessage{This command won't work in this context; perhaps the problem is
+      that we are \inenvironment\thisenv}%
+    \def\next{\doignore{#1}}%
+  \else
+    \let\next\tablex
+  \fi
+  \next
+}
+\def\tablex#1{%
+  \def\itemindicate{#1}%
+  \parsearg\tabley
+}
+\def\tabley#1{%
+  {%
+    \makevalueexpandable
+    \edef\temp{\noexpand\tablez #1\space\space\space}%
+    \expandafter
+  }\temp \endtablez
+}
+\def\tablez #1 #2 #3 #4\endtablez{%
+  \aboveenvbreak
+  \ifnum 0#1>0 \advance \leftskip by #1\mil \fi
+  \ifnum 0#2>0 \tableindent=#2\mil \fi
+  \ifnum 0#3>0 \advance \rightskip by #3\mil \fi
+  \itemmax=\tableindent
+  \advance \itemmax by -\itemmargin
+  \advance \leftskip by \tableindent
+  \exdentamount=\tableindent
+  \parindent = 0pt
+  \parskip = \smallskipamount
+  \ifdim \parskip=0pt \parskip=2pt \fi
+  \let\item = \internalBitem
+  \let\itemx = \internalBitemx
+}
+\def\Etable{\endgraf\afterenvbreak}
+\let\Eftable\Etable
+\let\Evtable\Etable
+\let\Eitemize\Etable
+\let\Eenumerate\Etable
+
+% This is the counter used by @enumerate, which is really @itemize
+
+\newcount \itemno
+
+\envdef\itemize{\parsearg\doitemize}
+
+\def\doitemize#1{%
+  \aboveenvbreak
+  \itemmax=\itemindent
+  \advance\itemmax by -\itemmargin
+  \advance\leftskip by \itemindent
+  \exdentamount=\itemindent
+  \parindent=0pt
+  \parskip=\smallskipamount
+  \ifdim\parskip=0pt \parskip=2pt \fi
+  %
+  % Try typesetting the item mark that if the document erroneously says
+  % something like @itemize @samp (intending @table), there's an error
+  % right away at the @itemize.  It's not the best error message in the
+  % world, but it's better than leaving it to the @item.  This means if
+  % the user wants an empty mark, they have to say @w{} not just @w.
+  \def\itemcontents{#1}%
+  \setbox0 = \hbox{\itemcontents}%
+  %
+  % @itemize with no arg is equivalent to @itemize @bullet.
+  \ifx\itemcontents\empty\def\itemcontents{\bullet}\fi
+  %
+  \let\item=\itemizeitem
+}
+
+% Definition of @item while inside @itemize and @enumerate.
+%
+\def\itemizeitem{%
+  \advance\itemno by 1  % for enumerations
+  {\let\par=\endgraf \smallbreak}% reasonable place to break
+  {%
+   % If the document has an @itemize directly after a section title, a
+   % \nobreak will be last on the list, and \sectionheading will have
+   % done a \vskip-\parskip.  In that case, we don't want to zero
+   % parskip, or the item text will crash with the heading.  On the
+   % other hand, when there is normal text preceding the item (as there
+   % usually is), we do want to zero parskip, or there would be too much
+   % space.  In that case, we won't have a \nobreak before.  At least
+   % that's the theory.
+   \ifnum\lastpenalty<10000 \parskip=0in \fi
+   \noindent
+   \hbox to 0pt{\hss \itemcontents \kern\itemmargin}%
+   %
+   \vadjust{\penalty 1200}}% not good to break after first line of item.
+  \flushcr
+}
+
+% \splitoff TOKENS\endmark defines \first to be the first token in
+% TOKENS, and \rest to be the remainder.
+%
+\def\splitoff#1#2\endmark{\def\first{#1}\def\rest{#2}}%
+
+% Allow an optional argument of an uppercase letter, lowercase letter,
+% or number, to specify the first label in the enumerated list.  No
+% argument is the same as `1'.
+%
+\envparseargdef\enumerate{\enumeratey #1  \endenumeratey}
+\def\enumeratey #1 #2\endenumeratey{%
+  % If we were given no argument, pretend we were given `1'.
+  \def\thearg{#1}%
+  \ifx\thearg\empty \def\thearg{1}\fi
+  %
+  % Detect if the argument is a single token.  If so, it might be a
+  % letter.  Otherwise, the only valid thing it can be is a number.
+  % (We will always have one token, because of the test we just made.
+  % This is a good thing, since \splitoff doesn't work given nothing at
+  % all -- the first parameter is undelimited.)
+  \expandafter\splitoff\thearg\endmark
+  \ifx\rest\empty
+    % Only one token in the argument.  It could still be anything.
+    % A ``lowercase letter'' is one whose \lccode is nonzero.
+    % An ``uppercase letter'' is one whose \lccode is both nonzero, and
+    %   not equal to itself.
+    % Otherwise, we assume it's a number.
+    %
+    % We need the \relax at the end of the \ifnum lines to stop TeX from
+    % continuing to look for a <number>.
+    %
+    \ifnum\lccode\expandafter`\thearg=0\relax
+      \numericenumerate % a number (we hope)
+    \else
+      % It's a letter.
+      \ifnum\lccode\expandafter`\thearg=\expandafter`\thearg\relax
+        \lowercaseenumerate % lowercase letter
+      \else
+        \uppercaseenumerate % uppercase letter
+      \fi
+    \fi
+  \else
+    % Multiple tokens in the argument.  We hope it's a number.
+    \numericenumerate
+  \fi
+}
+
+% An @enumerate whose labels are integers.  The starting integer is
+% given in \thearg.
+%
+\def\numericenumerate{%
+  \itemno = \thearg
+  \startenumeration{\the\itemno}%
+}
+
+% The starting (lowercase) letter is in \thearg.
+\def\lowercaseenumerate{%
+  \itemno = \expandafter`\thearg
+  \startenumeration{%
+    % Be sure we're not beyond the end of the alphabet.
+    \ifnum\itemno=0
+      \errmessage{No more lowercase letters in @enumerate; get a bigger
+                  alphabet}%
+    \fi
+    \char\lccode\itemno
+  }%
+}
+
+% The starting (uppercase) letter is in \thearg.
+\def\uppercaseenumerate{%
+  \itemno = \expandafter`\thearg
+  \startenumeration{%
+    % Be sure we're not beyond the end of the alphabet.
+    \ifnum\itemno=0
+      \errmessage{No more uppercase letters in @enumerate; get a bigger
+                  alphabet}
+    \fi
+    \char\uccode\itemno
+  }%
+}
+
+% Call \doitemize, adding a period to the first argument and supplying the
+% common last two arguments.  Also subtract one from the initial value in
+% \itemno, since @item increments \itemno.
+%
+\def\startenumeration#1{%
+  \advance\itemno by -1
+  \doitemize{#1.}\flushcr
+}
+
+% @alphaenumerate and @capsenumerate are abbreviations for giving an arg
+% to @enumerate.
+%
+\def\alphaenumerate{\enumerate{a}}
+\def\capsenumerate{\enumerate{A}}
+\def\Ealphaenumerate{\Eenumerate}
+\def\Ecapsenumerate{\Eenumerate}
+
+
+% @multitable macros
+% Amy Hendrickson, 8/18/94, 3/6/96
+%
+% @multitable ... @end multitable will make as many columns as desired.
+% Contents of each column will wrap at width given in preamble.  Width
+% can be specified either with sample text given in a template line,
+% or in percent of \hsize, the current width of text on page.
+
+% Table can continue over pages but will only break between lines.
+
+% To make preamble:
+%
+% Either define widths of columns in terms of percent of \hsize:
+%   @multitable @columnfractions .25 .3 .45
+%   @item ...
+%
+%   Numbers following @columnfractions are the percent of the total
+%   current hsize to be used for each column. You may use as many
+%   columns as desired.
+
+
+% Or use a template:
+%   @multitable {Column 1 template} {Column 2 template} {Column 3 template}
+%   @item ...
+%   using the widest term desired in each column.
+
+% Each new table line starts with @item, each subsequent new column
+% starts with @tab. Empty columns may be produced by supplying @tab's
+% with nothing between them for as many times as empty columns are needed,
+% ie, @tab@tab@tab will produce two empty columns.
+
+% @item, @tab do not need to be on their own lines, but it will not hurt
+% if they are.
+
+% Sample multitable:
+
+%   @multitable {Column 1 template} {Column 2 template} {Column 3 template}
+%   @item first col stuff @tab second col stuff @tab third col
+%   @item
+%   first col stuff
+%   @tab
+%   second col stuff
+%   @tab
+%   third col
+%   @item first col stuff @tab second col stuff
+%   @tab Many paragraphs of text may be used in any column.
+%
+%         They will wrap at the width determined by the template.
+%   @item@tab@tab This will be in third column.
+%   @end multitable
+
+% Default dimensions may be reset by user.
+% @multitableparskip is vertical space between paragraphs in table.
+% @multitableparindent is paragraph indent in table.
+% @multitablecolmargin is horizontal space to be left between columns.
+% @multitablelinespace is space to leave between table items, baseline
+%                                                            to baseline.
+%   0pt means it depends on current normal line spacing.
+%
+\newskip\multitableparskip
+\newskip\multitableparindent
+\newdimen\multitablecolspace
+\newskip\multitablelinespace
+\multitableparskip=0pt
+\multitableparindent=6pt
+\multitablecolspace=12pt
+\multitablelinespace=0pt
+
+% Macros used to set up halign preamble:
+%
+\let\endsetuptable\relax
+\def\xendsetuptable{\endsetuptable}
+\let\columnfractions\relax
+\def\xcolumnfractions{\columnfractions}
+\newif\ifsetpercent
+
+% #1 is the @columnfraction, usually a decimal number like .5, but might
+% be just 1.  We just use it, whatever it is.
+%
+\def\pickupwholefraction#1 {%
+  \global\advance\colcount by 1
+  \expandafter\xdef\csname col\the\colcount\endcsname{#1\hsize}%
+  \setuptable
+}
+
+\newcount\colcount
+\def\setuptable#1{%
+  \def\firstarg{#1}%
+  \ifx\firstarg\xendsetuptable
+    \let\go = \relax
+  \else
+    \ifx\firstarg\xcolumnfractions
+      \global\setpercenttrue
+    \else
+      \ifsetpercent
+         \let\go\pickupwholefraction
+      \else
+         \global\advance\colcount by 1
+         \setbox0=\hbox{#1\unskip\space}% Add a normal word space as a
+                   % separator; typically that is always in the input, anyway.
+         \expandafter\xdef\csname col\the\colcount\endcsname{\the\wd0}%
+      \fi
+    \fi
+    \ifx\go\pickupwholefraction
+      % Put the argument back for the \pickupwholefraction call, so
+      % we'll always have a period there to be parsed.
+      \def\go{\pickupwholefraction#1}%
+    \else
+      \let\go = \setuptable
+    \fi%
+  \fi
+  \go
+}
+
+% multitable-only commands.
+%
+% @headitem starts a heading row, which we typeset in bold.
+% Assignments have to be global since we are inside the implicit group
+% of an alignment entry.  \everycr resets \everytab so we don't have to
+% undo it ourselves.
+\def\headitemfont{\b}% for people to use in the template row; not changeable
+\def\headitem{%
+  \checkenv\multitable
+  \crcr
+  \global\everytab={\bf}% can't use \headitemfont since the parsing differs
+  \the\everytab % for the first item
+}%
+%
+% A \tab used to include \hskip1sp.  But then the space in a template
+% line is not enough.  That is bad.  So let's go back to just `&' until
+% we again encounter the problem the 1sp was intended to solve.
+%					--karl, nathan@acm.org, 20apr99.
+\def\tab{\checkenv\multitable &\the\everytab}%
+
+% @multitable ... @end multitable definitions:
+%
+\newtoks\everytab  % insert after every tab.
+%
+\envdef\multitable{%
+  \vskip\parskip
+  \startsavinginserts
+  %
+  % @item within a multitable starts a normal row.
+  % We use \def instead of \let so that if one of the multitable entries
+  % contains an @itemize, we don't choke on the \item (seen as \crcr aka
+  % \endtemplate) expanding \doitemize.
+  \def\item{\crcr}%
+  %
+  \tolerance=9500
+  \hbadness=9500
+  \setmultitablespacing
+  \parskip=\multitableparskip
+  \parindent=\multitableparindent
+  \overfullrule=0pt
+  \global\colcount=0
+  %
+  \everycr = {%
+    \noalign{%
+      \global\everytab={}%
+      \global\colcount=0 % Reset the column counter.
+      % Check for saved footnotes, etc.
+      \checkinserts
+      % Keeps underfull box messages off when table breaks over pages.
+      %\filbreak
+	% Maybe so, but it also creates really weird page breaks when the
+	% table breaks over pages. Wouldn't \vfil be better?  Wait until the
+	% problem manifests itself, so it can be fixed for real --karl.
+    }%
+  }%
+  %
+  \parsearg\domultitable
+}
+\def\domultitable#1{%
+  % To parse everything between @multitable and @item:
+  \setuptable#1 \endsetuptable
+  %
+  % This preamble sets up a generic column definition, which will
+  % be used as many times as user calls for columns.
+  % \vtop will set a single line and will also let text wrap and
+  % continue for many paragraphs if desired.
+  \halign\bgroup &%
+    \global\advance\colcount by 1
+    \multistrut
+    \vtop{%
+      % Use the current \colcount to find the correct column width:
+      \hsize=\expandafter\csname col\the\colcount\endcsname
+      %
+      % In order to keep entries from bumping into each other
+      % we will add a \leftskip of \multitablecolspace to all columns after
+      % the first one.
+      %
+      % If a template has been used, we will add \multitablecolspace
+      % to the width of each template entry.
+      %
+      % If the user has set preamble in terms of percent of \hsize we will
+      % use that dimension as the width of the column, and the \leftskip
+      % will keep entries from bumping into each other.  Table will start at
+      % left margin and final column will justify at right margin.
+      %
+      % Make sure we don't inherit \rightskip from the outer environment.
+      \rightskip=0pt
+      \ifnum\colcount=1
+	% The first column will be indented with the surrounding text.
+	\advance\hsize by\leftskip
+      \else
+	\ifsetpercent \else
+	  % If user has not set preamble in terms of percent of \hsize
+	  % we will advance \hsize by \multitablecolspace.
+	  \advance\hsize by \multitablecolspace
+	\fi
+       % In either case we will make \leftskip=\multitablecolspace:
+      \leftskip=\multitablecolspace
+      \fi
+      % Ignoring space at the beginning and end avoids an occasional spurious
+      % blank line, when TeX decides to break the line at the space before the
+      % box from the multistrut, so the strut ends up on a line by itself.
+      % For example:
+      % @multitable @columnfractions .11 .89
+      % @item @code{#}
+      % @tab Legal holiday which is valid in major parts of the whole country.
+      % Is automatically provided with highlighting sequences respectively
+      % marking characters.
+      \noindent\ignorespaces##\unskip\multistrut
+    }\cr
+}
+\def\Emultitable{%
+  \crcr
+  \egroup % end the \halign
+  \global\setpercentfalse
+}
+
+\def\setmultitablespacing{%
+  \def\multistrut{\strut}% just use the standard line spacing
+  %
+  % Compute \multitablelinespace (if not defined by user) for use in
+  % \multitableparskip calculation.  We used define \multistrut based on
+  % this, but (ironically) that caused the spacing to be off.
+  % See bug-texinfo report from Werner Lemberg, 31 Oct 2004 12:52:20 +0100.
+\ifdim\multitablelinespace=0pt
+\setbox0=\vbox{X}\global\multitablelinespace=\the\baselineskip
+\global\advance\multitablelinespace by-\ht0
+\fi
+% Test to see if parskip is larger than space between lines of
+% table. If not, do nothing.
+%        If so, set to same dimension as multitablelinespace.
+\ifdim\multitableparskip>\multitablelinespace
+\global\multitableparskip=\multitablelinespace
+\global\advance\multitableparskip-7pt % to keep parskip somewhat smaller
+                                      % than skip between lines in the table.
+\fi%
+\ifdim\multitableparskip=0pt
+\global\multitableparskip=\multitablelinespace
+\global\advance\multitableparskip-7pt % to keep parskip somewhat smaller
+                                      % than skip between lines in the table.
+\fi}
+
+
+\message{conditionals,}
+
+% @iftex, @ifnotdocbook, @ifnothtml, @ifnotinfo, @ifnotplaintext,
+% @ifnotxml always succeed.  They currently do nothing; we don't
+% attempt to check whether the conditionals are properly nested.  But we
+% have to remember that they are conditionals, so that @end doesn't
+% attempt to close an environment group.
+%
+\def\makecond#1{%
+  \expandafter\let\csname #1\endcsname = \relax
+  \expandafter\let\csname iscond.#1\endcsname = 1
+}
+\makecond{iftex}
+\makecond{ifnotdocbook}
+\makecond{ifnothtml}
+\makecond{ifnotinfo}
+\makecond{ifnotplaintext}
+\makecond{ifnotxml}
+
+% Ignore @ignore, @ifhtml, @ifinfo, and the like.
+%
+\def\direntry{\doignore{direntry}}
+\def\documentdescription{\doignore{documentdescription}}
+\def\docbook{\doignore{docbook}}
+\def\html{\doignore{html}}
+\def\ifdocbook{\doignore{ifdocbook}}
+\def\ifhtml{\doignore{ifhtml}}
+\def\ifinfo{\doignore{ifinfo}}
+\def\ifnottex{\doignore{ifnottex}}
+\def\ifplaintext{\doignore{ifplaintext}}
+\def\ifxml{\doignore{ifxml}}
+\def\ignore{\doignore{ignore}}
+\def\menu{\doignore{menu}}
+\def\xml{\doignore{xml}}
+
+% Ignore text until a line `@end #1', keeping track of nested conditionals.
+%
+% A count to remember the depth of nesting.
+\newcount\doignorecount
+
+\def\doignore#1{\begingroup
+  % Scan in ``verbatim'' mode:
+  \obeylines
+  \catcode`\@ = \other
+  \catcode`\{ = \other
+  \catcode`\} = \other
+  %
+  % Make sure that spaces turn into tokens that match what \doignoretext wants.
+  \spaceisspace
+  %
+  % Count number of #1's that we've seen.
+  \doignorecount = 0
+  %
+  % Swallow text until we reach the matching `@end #1'.
+  \dodoignore{#1}%
+}
+
+{ \catcode`_=11 % We want to use \_STOP_ which cannot appear in texinfo source.
+  \obeylines %
+  %
+  \gdef\dodoignore#1{%
+    % #1 contains the command name as a string, e.g., `ifinfo'.
+    %
+    % Define a command to find the next `@end #1'.
+    \long\def\doignoretext##1^^M@end #1{%
+      \doignoretextyyy##1^^M@#1\_STOP_}%
+    %
+    % And this command to find another #1 command, at the beginning of a
+    % line.  (Otherwise, we would consider a line `@c @ifset', for
+    % example, to count as an @ifset for nesting.)
+    \long\def\doignoretextyyy##1^^M@#1##2\_STOP_{\doignoreyyy{##2}\_STOP_}%
+    %
+    % And now expand that command.
+    \doignoretext ^^M%
+  }%
+}
+
+\def\doignoreyyy#1{%
+  \def\temp{#1}%
+  \ifx\temp\empty			% Nothing found.
+    \let\next\doignoretextzzz
+  \else					% Found a nested condition, ...
+    \advance\doignorecount by 1
+    \let\next\doignoretextyyy		% ..., look for another.
+    % If we're here, #1 ends with ^^M\ifinfo (for example).
+  \fi
+  \next #1% the token \_STOP_ is present just after this macro.
+}
+
+% We have to swallow the remaining "\_STOP_".
+%
+\def\doignoretextzzz#1{%
+  \ifnum\doignorecount = 0	% We have just found the outermost @end.
+    \let\next\enddoignore
+  \else				% Still inside a nested condition.
+    \advance\doignorecount by -1
+    \let\next\doignoretext      % Look for the next @end.
+  \fi
+  \next
+}
+
+% Finish off ignored text.
+{ \obeylines%
+  % Ignore anything after the last `@end #1'; this matters in verbatim
+  % environments, where otherwise the newline after an ignored conditional
+  % would result in a blank line in the output.
+  \gdef\enddoignore#1^^M{\endgroup\ignorespaces}%
+}
+
+
+% @set VAR sets the variable VAR to an empty value.
+% @set VAR REST-OF-LINE sets VAR to the value REST-OF-LINE.
+%
+% Since we want to separate VAR from REST-OF-LINE (which might be
+% empty), we can't just use \parsearg; we have to insert a space of our
+% own to delimit the rest of the line, and then take it out again if we
+% didn't need it.
+% We rely on the fact that \parsearg sets \catcode`\ =10.
+%
+\parseargdef\set{\setyyy#1 \endsetyyy}
+\def\setyyy#1 #2\endsetyyy{%
+  {%
+    \makevalueexpandable
+    \def\temp{#2}%
+    \edef\next{\gdef\makecsname{SET#1}}%
+    \ifx\temp\empty
+      \next{}%
+    \else
+      \setzzz#2\endsetzzz
+    \fi
+  }%
+}
+% Remove the trailing space \setxxx inserted.
+\def\setzzz#1 \endsetzzz{\next{#1}}
+
+% @clear VAR clears (i.e., unsets) the variable VAR.
+%
+\parseargdef\clear{%
+  {%
+    \makevalueexpandable
+    \global\expandafter\let\csname SET#1\endcsname=\relax
+  }%
+}
+
+% @value{foo} gets the text saved in variable foo.
+\def\value{\begingroup\makevalueexpandable\valuexxx}
+\def\valuexxx#1{\expandablevalue{#1}\endgroup}
+{
+  \catcode`\- = \active \catcode`\_ = \active
+  %
+  \gdef\makevalueexpandable{%
+    \let\value = \expandablevalue
+    % We don't want these characters active, ...
+    \catcode`\-=\other \catcode`\_=\other
+    % ..., but we might end up with active ones in the argument if
+    % we're called from @code, as @code{@value{foo-bar_}}, though.
+    % So \let them to their normal equivalents.
+    \let-\realdash \let_\normalunderscore
+  }
+}
+
+% We have this subroutine so that we can handle at least some @value's
+% properly in indexes (we call \makevalueexpandable in \indexdummies).
+% The command has to be fully expandable (if the variable is set), since
+% the result winds up in the index file.  This means that if the
+% variable's value contains other Texinfo commands, it's almost certain
+% it will fail (although perhaps we could fix that with sufficient work
+% to do a one-level expansion on the result, instead of complete).
+%
+\def\expandablevalue#1{%
+  \expandafter\ifx\csname SET#1\endcsname\relax
+    {[No value for ``#1'']}%
+    \message{Variable `#1', used in @value, is not set.}%
+  \else
+    \csname SET#1\endcsname
+  \fi
+}
+
+% @ifset VAR ... @end ifset reads the `...' iff VAR has been defined
+% with @set.
+%
+% To get special treatment of `@end ifset,' call \makeond and the redefine.
+%
+\makecond{ifset}
+\def\ifset{\parsearg{\doifset{\let\next=\ifsetfail}}}
+\def\doifset#1#2{%
+  {%
+    \makevalueexpandable
+    \let\next=\empty
+    \expandafter\ifx\csname SET#2\endcsname\relax
+      #1% If not set, redefine \next.
+    \fi
+    \expandafter
+  }\next
+}
+\def\ifsetfail{\doignore{ifset}}
+
+% @ifclear VAR ... @end ifclear reads the `...' iff VAR has never been
+% defined with @set, or has been undefined with @clear.
+%
+% The `\else' inside the `\doifset' parameter is a trick to reuse the
+% above code: if the variable is not set, do nothing, if it is set,
+% then redefine \next to \ifclearfail.
+%
+\makecond{ifclear}
+\def\ifclear{\parsearg{\doifset{\else \let\next=\ifclearfail}}}
+\def\ifclearfail{\doignore{ifclear}}
+
+% @dircategory CATEGORY  -- specify a category of the dir file
+% which this file should belong to.  Ignore this in TeX.
+\let\dircategory=\comment
+
+% @defininfoenclose.
+\let\definfoenclose=\comment
+
+
+\message{indexing,}
+% Index generation facilities
+
+% Define \newwrite to be identical to plain tex's \newwrite
+% except not \outer, so it can be used within macros and \if's.
+\edef\newwrite{\makecsname{ptexnewwrite}}
+
+% \newindex {foo} defines an index named foo.
+% It automatically defines \fooindex such that
+% \fooindex ...rest of line... puts an entry in the index foo.
+% It also defines \fooindfile to be the number of the output channel for
+% the file that accumulates this index.  The file's extension is foo.
+% The name of an index should be no more than 2 characters long
+% for the sake of vms.
+%
+\def\newindex#1{%
+  \iflinks
+    \expandafter\newwrite \csname#1indfile\endcsname
+    \openout \csname#1indfile\endcsname \jobname.#1 % Open the file
+  \fi
+  \expandafter\xdef\csname#1index\endcsname{%     % Define @#1index
+    \noexpand\doindex{#1}}
+}
+
+% @defindex foo  ==  \newindex{foo}
+%
+\def\defindex{\parsearg\newindex}
+
+% Define @defcodeindex, like @defindex except put all entries in @code.
+%
+\def\defcodeindex{\parsearg\newcodeindex}
+%
+\def\newcodeindex#1{%
+  \iflinks
+    \expandafter\newwrite \csname#1indfile\endcsname
+    \openout \csname#1indfile\endcsname \jobname.#1
+  \fi
+  \expandafter\xdef\csname#1index\endcsname{%
+    \noexpand\docodeindex{#1}}%
+}
+
+
+% @synindex foo bar    makes index foo feed into index bar.
+% Do this instead of @defindex foo if you don't want it as a separate index.
+%
+% @syncodeindex foo bar   similar, but put all entries made for index foo
+% inside @code.
+%
+\def\synindex#1 #2 {\dosynindex\doindex{#1}{#2}}
+\def\syncodeindex#1 #2 {\dosynindex\docodeindex{#1}{#2}}
+
+% #1 is \doindex or \docodeindex, #2 the index getting redefined (foo),
+% #3 the target index (bar).
+\def\dosynindex#1#2#3{%
+  % Only do \closeout if we haven't already done it, else we'll end up
+  % closing the target index.
+  \expandafter \ifx\csname donesynindex#2\endcsname \relax
+    % The \closeout helps reduce unnecessary open files; the limit on the
+    % Acorn RISC OS is a mere 16 files.
+    \expandafter\closeout\csname#2indfile\endcsname
+    \expandafter\let\csname donesynindex#2\endcsname = 1
+  \fi
+  % redefine \fooindfile:
+  \expandafter\let\expandafter\temp\expandafter=\csname#3indfile\endcsname
+  \expandafter\let\csname#2indfile\endcsname=\temp
+  % redefine \fooindex:
+  \expandafter\xdef\csname#2index\endcsname{\noexpand#1{#3}}%
+}
+
+% Define \doindex, the driver for all \fooindex macros.
+% Argument #1 is generated by the calling \fooindex macro,
+%  and it is "foo", the name of the index.
+
+% \doindex just uses \parsearg; it calls \doind for the actual work.
+% This is because \doind is more useful to call from other macros.
+
+% There is also \dosubind {index}{topic}{subtopic}
+% which makes an entry in a two-level index such as the operation index.
+
+\def\doindex#1{\edef\indexname{#1}\parsearg\singleindexer}
+\def\singleindexer #1{\doind{\indexname}{#1}}
+
+% like the previous two, but they put @code around the argument.
+\def\docodeindex#1{\edef\indexname{#1}\parsearg\singlecodeindexer}
+\def\singlecodeindexer #1{\doind{\indexname}{\code{#1}}}
+
+% Take care of Texinfo commands that can appear in an index entry.
+% Since there are some commands we want to expand, and others we don't,
+% we have to laboriously prevent expansion for those that we don't.
+%
+\def\indexdummies{%
+  \escapechar = `\\     % use backslash in output files.
+  \def\@{@}% change to @@ when we switch to @ as escape char in index files.
+  \def\ {\realbackslash\space }%
+  %
+  % Need these unexpandable (because we define \tt as a dummy)
+  % definitions when @{ or @} appear in index entry text.  Also, more
+  % complicated, when \tex is in effect and \{ is a \delimiter again.
+  % We can't use \lbracecmd and \rbracecmd because texindex assumes
+  % braces and backslashes are used only as delimiters.  Perhaps we
+  % should define @lbrace and @rbrace commands a la @comma.
+  \def\{{{\tt\char123}}%
+  \def\}{{\tt\char125}}%
+  %
+  % I don't entirely understand this, but when an index entry is
+  % generated from a macro call, the \endinput which \scanmacro inserts
+  % causes processing to be prematurely terminated.  This is,
+  % apparently, because \indexsorttmp is fully expanded, and \endinput
+  % is an expandable command.  The redefinition below makes \endinput
+  % disappear altogether for that purpose -- although logging shows that
+  % processing continues to some further point.  On the other hand, it
+  % seems \endinput does not hurt in the printed index arg, since that
+  % is still getting written without apparent harm.
+  %
+  % Sample source (mac-idx3.tex, reported by Graham Percival to
+  % help-texinfo, 22may06):
+  % @macro funindex {WORD}
+  % @findex xyz
+  % @end macro
+  % ...
+  % @funindex commtest
+  %
+  % The above is not enough to reproduce the bug, but it gives the flavor.
+  %
+  % Sample whatsit resulting:
+  % .@write3{\entry{xyz}{@folio }{@code {xyz@endinput }}}
+  %
+  % So:
+  \let\endinput = \empty
+  %
+  % Do the redefinitions.
+  \commondummies
+}
+
+% For the aux and toc files, @ is the escape character.  So we want to
+% redefine everything using @ as the escape character (instead of
+% \realbackslash, still used for index files).  When everything uses @,
+% this will be simpler.
+%
+\def\atdummies{%
+  \def\@{@@}%
+  \def\ {@ }%
+  \let\{ = \lbraceatcmd
+  \let\} = \rbraceatcmd
+  %
+  % Do the redefinitions.
+  \commondummies
+  \otherbackslash
+}
+
+% Called from \indexdummies and \atdummies.
+%
+\def\commondummies{%
+  %
+  % \definedummyword defines \#1 as \string\#1\space, thus effectively
+  % preventing its expansion.  This is used only for control words,
+  % not control letters, because the \space would be incorrect for
+  % control characters, but is needed to separate the control word
+  % from whatever follows.
+  %
+  % For control letters, we have \definedummyletter, which omits the
+  % space.
+  %
+  % These can be used both for control words that take an argument and
+  % those that do not.  If it is followed by {arg} in the input, then
+  % that will dutifully get written to the index (or wherever).
+  %
+  \def\definedummyword  ##1{\def##1{\string##1\space}}%
+  \def\definedummyletter##1{\def##1{\string##1}}%
+  \let\definedummyaccent\definedummyletter
+  %
+  \commondummiesnofonts
+  %
+  \definedummyletter\_%
+  \definedummyletter\-%
+  %
+  % Non-English letters.
+  \definedummyword\AA
+  \definedummyword\AE
+  \definedummyword\DH
+  \definedummyword\L
+  \definedummyword\O
+  \definedummyword\OE
+  \definedummyword\TH
+  \definedummyword\aa
+  \definedummyword\ae
+  \definedummyword\dh
+  \definedummyword\exclamdown
+  \definedummyword\l
+  \definedummyword\o
+  \definedummyword\oe
+  \definedummyword\ordf
+  \definedummyword\ordm
+  \definedummyword\questiondown
+  \definedummyword\ss
+  \definedummyword\th
+  %
+  % Although these internal commands shouldn't show up, sometimes they do.
+  \definedummyword\bf
+  \definedummyword\gtr
+  \definedummyword\hat
+  \definedummyword\less
+  \definedummyword\sf
+  \definedummyword\sl
+  \definedummyword\tclose
+  \definedummyword\tt
+  %
+  \definedummyword\LaTeX
+  \definedummyword\TeX
+  %
+  % Assorted special characters.
+  \definedummyword\arrow
+  \definedummyword\bullet
+  \definedummyword\comma
+  \definedummyword\copyright
+  \definedummyword\registeredsymbol
+  \definedummyword\dots
+  \definedummyword\enddots
+  \definedummyword\entrybreak
+  \definedummyword\equiv
+  \definedummyword\error
+  \definedummyword\euro
+  \definedummyword\expansion
+  \definedummyword\geq
+  \definedummyword\guillemetleft
+  \definedummyword\guillemetright
+  \definedummyword\guilsinglleft
+  \definedummyword\guilsinglright
+  \definedummyword\leq
+  \definedummyword\minus
+  \definedummyword\ogonek
+  \definedummyword\pounds
+  \definedummyword\point
+  \definedummyword\print
+  \definedummyword\quotedblbase
+  \definedummyword\quotedblleft
+  \definedummyword\quotedblright
+  \definedummyword\quoteleft
+  \definedummyword\quoteright
+  \definedummyword\quotesinglbase
+  \definedummyword\result
+  \definedummyword\textdegree
+  %
+  % We want to disable all macros so that they are not expanded by \write.
+  \macrolist
+  %
+  \normalturnoffactive
+  %
+  % Handle some cases of @value -- where it does not contain any
+  % (non-fully-expandable) commands.
+  \makevalueexpandable
+}
+
+% \commondummiesnofonts: common to \commondummies and \indexnofonts.
+%
+\def\commondummiesnofonts{%
+  % Control letters and accents.
+  \definedummyletter\!%
+  \definedummyaccent\"%
+  \definedummyaccent\'%
+  \definedummyletter\*%
+  \definedummyaccent\,%
+  \definedummyletter\.%
+  \definedummyletter\/%
+  \definedummyletter\:%
+  \definedummyaccent\=%
+  \definedummyletter\?%
+  \definedummyaccent\^%
+  \definedummyaccent\`%
+  \definedummyaccent\~%
+  \definedummyword\u
+  \definedummyword\v
+  \definedummyword\H
+  \definedummyword\dotaccent
+  \definedummyword\ogonek
+  \definedummyword\ringaccent
+  \definedummyword\tieaccent
+  \definedummyword\ubaraccent
+  \definedummyword\udotaccent
+  \definedummyword\dotless
+  %
+  % Texinfo font commands.
+  \definedummyword\b
+  \definedummyword\i
+  \definedummyword\r
+  \definedummyword\sansserif
+  \definedummyword\sc
+  \definedummyword\slanted
+  \definedummyword\t
+  %
+  % Commands that take arguments.
+  \definedummyword\acronym
+  \definedummyword\anchor
+  \definedummyword\cite
+  \definedummyword\code
+  \definedummyword\command
+  \definedummyword\dfn
+  \definedummyword\dmn
+  \definedummyword\email
+  \definedummyword\emph
+  \definedummyword\env
+  \definedummyword\file
+  \definedummyword\indicateurl
+  \definedummyword\kbd
+  \definedummyword\key
+  \definedummyword\math
+  \definedummyword\option
+  \definedummyword\pxref
+  \definedummyword\ref
+  \definedummyword\samp
+  \definedummyword\strong
+  \definedummyword\tie
+  \definedummyword\uref
+  \definedummyword\url
+  \definedummyword\var
+  \definedummyword\verb
+  \definedummyword\w
+  \definedummyword\xref
+}
+
+% \indexnofonts is used when outputting the strings to sort the index
+% by, and when constructing control sequence names.  It eliminates all
+% control sequences and just writes whatever the best ASCII sort string
+% would be for a given command (usually its argument).
+%
+\def\indexnofonts{%
+  % Accent commands should become @asis.
+  \def\definedummyaccent##1{\let##1\asis}%
+  % We can just ignore other control letters.
+  \def\definedummyletter##1{\let##1\empty}%
+  % All control words become @asis by default; overrides below.
+  \let\definedummyword\definedummyaccent
+  %
+  \commondummiesnofonts
+  %
+  % Don't no-op \tt, since it isn't a user-level command
+  % and is used in the definitions of the active chars like <, >, |, etc.
+  % Likewise with the other plain tex font commands.
+  %\let\tt=\asis
+  %
+  \def\ { }%
+  \def\@{@}%
+  \def\_{\normalunderscore}%
+  \def\-{}% @- shouldn't affect sorting
+  %
+  % Unfortunately, texindex is not prepared to handle braces in the
+  % content at all.  So for index sorting, we map @{ and @} to strings
+  % starting with |, since that ASCII character is between ASCII { and }.
+  \def\{{|a}%
+  \def\}{|b}%
+  %
+  % Non-English letters.
+  \def\AA{AA}%
+  \def\AE{AE}%
+  \def\DH{DZZ}%
+  \def\L{L}%
+  \def\OE{OE}%
+  \def\O{O}%
+  \def\TH{ZZZ}%
+  \def\aa{aa}%
+  \def\ae{ae}%
+  \def\dh{dzz}%
+  \def\exclamdown{!}%
+  \def\l{l}%
+  \def\oe{oe}%
+  \def\ordf{a}%
+  \def\ordm{o}%
+  \def\o{o}%
+  \def\questiondown{?}%
+  \def\ss{ss}%
+  \def\th{zzz}%
+  %
+  \def\LaTeX{LaTeX}%
+  \def\TeX{TeX}%
+  %
+  % Assorted special characters.
+  % (The following {} will end up in the sort string, but that's ok.)
+  \def\arrow{->}%
+  \def\bullet{bullet}%
+  \def\comma{,}%
+  \def\copyright{copyright}%
+  \def\dots{...}%
+  \def\enddots{...}%
+  \def\equiv{==}%
+  \def\error{error}%
+  \def\euro{euro}%
+  \def\expansion{==>}%
+  \def\geq{>=}%
+  \def\guillemetleft{<<}%
+  \def\guillemetright{>>}%
+  \def\guilsinglleft{<}%
+  \def\guilsinglright{>}%
+  \def\leq{<=}%
+  \def\minus{-}%
+  \def\point{.}%
+  \def\pounds{pounds}%
+  \def\print{-|}%
+  \def\quotedblbase{"}%
+  \def\quotedblleft{"}%
+  \def\quotedblright{"}%
+  \def\quoteleft{`}%
+  \def\quoteright{'}%
+  \def\quotesinglbase{,}%
+  \def\registeredsymbol{R}%
+  \def\result{=>}%
+  \def\textdegree{o}%
+  %
+  \expandafter\ifx\csname SETtxiindexlquoteignore\endcsname\relax
+  \else \indexlquoteignore \fi
+  %
+  % We need to get rid of all macros, leaving only the arguments (if present).
+  % Of course this is not nearly correct, but it is the best we can do for now.
+  % makeinfo does not expand macros in the argument to @deffn, which ends up
+  % writing an index entry, and texindex isn't prepared for an index sort entry
+  % that starts with \.
+  %
+  % Since macro invocations are followed by braces, we can just redefine them
+  % to take a single TeX argument.  The case of a macro invocation that
+  % goes to end-of-line is not handled.
+  %
+  \macrolist
+}
+
+% Undocumented (for FSFS 2nd ed.): @set txiindexlquoteignore makes us
+% ignore left quotes in the sort term.
+{\catcode`\`=\active
+ \gdef\indexlquoteignore{\let`=\empty}}
+
+\let\indexbackslash=0  %overridden during \printindex.
+\let\SETmarginindex=\relax % put index entries in margin (undocumented)?
+
+% Most index entries go through here, but \dosubind is the general case.
+% #1 is the index name, #2 is the entry text.
+\def\doind#1#2{\dosubind{#1}{#2}{}}
+
+% Workhorse for all \fooindexes.
+% #1 is name of index, #2 is stuff to put there, #3 is subentry --
+% empty if called from \doind, as we usually are (the main exception
+% is with most defuns, which call us directly).
+%
+\def\dosubind#1#2#3{%
+  \iflinks
+  {%
+    % Store the main index entry text (including the third arg).
+    \toks0 = {#2}%
+    % If third arg is present, precede it with a space.
+    \def\thirdarg{#3}%
+    \ifx\thirdarg\empty \else
+      \toks0 = \expandafter{\the\toks0 \space #3}%
+    \fi
+    %
+    \edef\writeto{\csname#1indfile\endcsname}%
+    %
+    \safewhatsit\dosubindwrite
+  }%
+  \fi
+}
+
+% Write the entry in \toks0 to the index file:
+%
+\def\dosubindwrite{%
+  % Put the index entry in the margin if desired.
+  \ifx\SETmarginindex\relax\else
+    \insert\margin{\hbox{\vrule height8pt depth3pt width0pt \the\toks0}}%
+  \fi
+  %
+  % Remember, we are within a group.
+  \indexdummies % Must do this here, since \bf, etc expand at this stage
+  \def\backslashcurfont{\indexbackslash}% \indexbackslash isn't defined now
+      % so it will be output as is; and it will print as backslash.
+  %
+  % Process the index entry with all font commands turned off, to
+  % get the string to sort by.
+  {\indexnofonts
+   \edef\temp{\the\toks0}% need full expansion
+   \xdef\indexsorttmp{\temp}%
+  }%
+  %
+  % Set up the complete index entry, with both the sort key and
+  % the original text, including any font commands.  We write
+  % three arguments to \entry to the .?? file (four in the
+  % subentry case), texindex reduces to two when writing the .??s
+  % sorted result.
+  \edef\temp{%
+    \write\writeto{%
+      \string\entry{\indexsorttmp}{\noexpand\folio}{\the\toks0}}%
+  }%
+  \temp
+}
+
+% Take care of unwanted page breaks/skips around a whatsit:
+%
+% If a skip is the last thing on the list now, preserve it
+% by backing up by \lastskip, doing the \write, then inserting
+% the skip again.  Otherwise, the whatsit generated by the
+% \write or \pdfdest will make \lastskip zero.  The result is that
+% sequences like this:
+% @end defun
+% @tindex whatever
+% @defun ...
+% will have extra space inserted, because the \medbreak in the
+% start of the @defun won't see the skip inserted by the @end of
+% the previous defun.
+%
+% But don't do any of this if we're not in vertical mode.  We
+% don't want to do a \vskip and prematurely end a paragraph.
+%
+% Avoid page breaks due to these extra skips, too.
+%
+% But wait, there is a catch there:
+% We'll have to check whether \lastskip is zero skip.  \ifdim is not
+% sufficient for this purpose, as it ignores stretch and shrink parts
+% of the skip.  The only way seems to be to check the textual
+% representation of the skip.
+%
+% The following is almost like \def\zeroskipmacro{0.0pt} except that
+% the ``p'' and ``t'' characters have catcode \other, not 11 (letter).
+%
+\edef\zeroskipmacro{\expandafter\the\csname z@skip\endcsname}
+%
+\newskip\whatsitskip
+\newcount\whatsitpenalty
+%
+% ..., ready, GO:
+%
+\def\safewhatsit#1{\ifhmode
+  #1%
+ \else
+  % \lastskip and \lastpenalty cannot both be nonzero simultaneously.
+  \whatsitskip = \lastskip
+  \edef\lastskipmacro{\the\lastskip}%
+  \whatsitpenalty = \lastpenalty
+  %
+  % If \lastskip is nonzero, that means the last item was a
+  % skip.  And since a skip is discardable, that means this
+  % -\whatsitskip glue we're inserting is preceded by a
+  % non-discardable item, therefore it is not a potential
+  % breakpoint, therefore no \nobreak needed.
+  \ifx\lastskipmacro\zeroskipmacro
+  \else
+    \vskip-\whatsitskip
+  \fi
+  %
+  #1%
+  %
+  \ifx\lastskipmacro\zeroskipmacro
+    % If \lastskip was zero, perhaps the last item was a penalty, and
+    % perhaps it was >=10000, e.g., a \nobreak.  In that case, we want
+    % to re-insert the same penalty (values >10000 are used for various
+    % signals); since we just inserted a non-discardable item, any
+    % following glue (such as a \parskip) would be a breakpoint.  For example:
+    %   @deffn deffn-whatever
+    %   @vindex index-whatever
+    %   Description.
+    % would allow a break between the index-whatever whatsit
+    % and the "Description." paragraph.
+    \ifnum\whatsitpenalty>9999 \penalty\whatsitpenalty \fi
+  \else
+    % On the other hand, if we had a nonzero \lastskip,
+    % this make-up glue would be preceded by a non-discardable item
+    % (the whatsit from the \write), so we must insert a \nobreak.
+    \nobreak\vskip\whatsitskip
+  \fi
+\fi}
+
+% The index entry written in the file actually looks like
+%  \entry {sortstring}{page}{topic}
+% or
+%  \entry {sortstring}{page}{topic}{subtopic}
+% The texindex program reads in these files and writes files
+% containing these kinds of lines:
+%  \initial {c}
+%     before the first topic whose initial is c
+%  \entry {topic}{pagelist}
+%     for a topic that is used without subtopics
+%  \primary {topic}
+%     for the beginning of a topic that is used with subtopics
+%  \secondary {subtopic}{pagelist}
+%     for each subtopic.
+
+% Define the user-accessible indexing commands
+% @findex, @vindex, @kindex, @cindex.
+
+\def\findex {\fnindex}
+\def\kindex {\kyindex}
+\def\cindex {\cpindex}
+\def\vindex {\vrindex}
+\def\tindex {\tpindex}
+\def\pindex {\pgindex}
+
+\def\cindexsub {\begingroup\obeylines\cindexsub}
+{\obeylines %
+\gdef\cindexsub "#1" #2^^M{\endgroup %
+\dosubind{cp}{#2}{#1}}}
+
+% Define the macros used in formatting output of the sorted index material.
+
+% @printindex causes a particular index (the ??s file) to get printed.
+% It does not print any chapter heading (usually an @unnumbered).
+%
+\parseargdef\printindex{\begingroup
+  \dobreak \chapheadingskip{10000}%
+  %
+  \smallfonts \rm
+  \tolerance = 9500
+  \plainfrenchspacing
+  \everypar = {}% don't want the \kern\-parindent from indentation suppression.
+  %
+  % See if the index file exists and is nonempty.
+  % Change catcode of @ here so that if the index file contains
+  % \initial {@}
+  % as its first line, TeX doesn't complain about mismatched braces
+  % (because it thinks @} is a control sequence).
+  \catcode`\@ = 11
+  \openin 1 \jobname.#1s
+  \ifeof 1
+    % \enddoublecolumns gets confused if there is no text in the index,
+    % and it loses the chapter title and the aux file entries for the
+    % index.  The easiest way to prevent this problem is to make sure
+    % there is some text.
+    \putwordIndexNonexistent
+  \else
+    %
+    % If the index file exists but is empty, then \openin leaves \ifeof
+    % false.  We have to make TeX try to read something from the file, so
+    % it can discover if there is anything in it.
+    \read 1 to \temp
+    \ifeof 1
+      \putwordIndexIsEmpty
+    \else
+      % Index files are almost Texinfo source, but we use \ as the escape
+      % character.  It would be better to use @, but that's too big a change
+      % to make right now.
+      \def\indexbackslash{\backslashcurfont}%
+      \catcode`\\ = 0
+      \escapechar = `\\
+      \begindoublecolumns
+      \input \jobname.#1s
+      \enddoublecolumns
+    \fi
+  \fi
+  \closein 1
+\endgroup}
+
+% These macros are used by the sorted index file itself.
+% Change them to control the appearance of the index.
+
+\def\initial#1{{%
+  % Some minor font changes for the special characters.
+  \let\tentt=\sectt \let\tt=\sectt \let\sf=\sectt
+  %
+  % Remove any glue we may have, we'll be inserting our own.
+  \removelastskip
+  %
+  % We like breaks before the index initials, so insert a bonus.
+  \nobreak
+  \vskip 0pt plus 3\baselineskip
+  \penalty 0
+  \vskip 0pt plus -3\baselineskip
+  %
+  % Typeset the initial.  Making this add up to a whole number of
+  % baselineskips increases the chance of the dots lining up from column
+  % to column.  It still won't often be perfect, because of the stretch
+  % we need before each entry, but it's better.
+  %
+  % No shrink because it confuses \balancecolumns.
+  \vskip 1.67\baselineskip plus .5\baselineskip
+  \leftline{\secbf #1}%
+  % Do our best not to break after the initial.
+  \nobreak
+  \vskip .33\baselineskip plus .1\baselineskip
+}}
+
+% \entry typesets a paragraph consisting of the text (#1), dot leaders, and
+% then page number (#2) flushed to the right margin.  It is used for index
+% and table of contents entries.  The paragraph is indented by \leftskip.
+%
+% A straightforward implementation would start like this:
+%	\def\entry#1#2{...
+% But this freezes the catcodes in the argument, and can cause problems to
+% @code, which sets - active.  This problem was fixed by a kludge---
+% ``-'' was active throughout whole index, but this isn't really right.
+% The right solution is to prevent \entry from swallowing the whole text.
+%                                 --kasal, 21nov03
+\def\entry{%
+  \begingroup
+    %
+    % Start a new paragraph if necessary, so our assignments below can't
+    % affect previous text.
+    \par
+    %
+    % Do not fill out the last line with white space.
+    \parfillskip = 0in
+    %
+    % No extra space above this paragraph.
+    \parskip = 0in
+    %
+    % Do not prefer a separate line ending with a hyphen to fewer lines.
+    \finalhyphendemerits = 0
+    %
+    % \hangindent is only relevant when the entry text and page number
+    % don't both fit on one line.  In that case, bob suggests starting the
+    % dots pretty far over on the line.  Unfortunately, a large
+    % indentation looks wrong when the entry text itself is broken across
+    % lines.  So we use a small indentation and put up with long leaders.
+    %
+    % \hangafter is reset to 1 (which is the value we want) at the start
+    % of each paragraph, so we need not do anything with that.
+    \hangindent = 2em
+    %
+    % When the entry text needs to be broken, just fill out the first line
+    % with blank space.
+    \rightskip = 0pt plus1fil
+    %
+    % A bit of stretch before each entry for the benefit of balancing
+    % columns.
+    \vskip 0pt plus1pt
+    %
+    % When reading the text of entry, convert explicit line breaks
+    % from @* into spaces.  The user might give these in long section
+    % titles, for instance.
+    \def\*{\unskip\space\ignorespaces}%
+    \def\entrybreak{\hfil\break}%
+    %
+    % Swallow the left brace of the text (first parameter):
+    \afterassignment\doentry
+    \let\temp =
+}
+\def\entrybreak{\unskip\space\ignorespaces}%
+\def\doentry{%
+    \bgroup % Instead of the swallowed brace.
+      \noindent
+      \aftergroup\finishentry
+      % And now comes the text of the entry.
+}
+\def\finishentry#1{%
+    % #1 is the page number.
+    %
+    % The following is kludged to not output a line of dots in the index if
+    % there are no page numbers.  The next person who breaks this will be
+    % cursed by a Unix daemon.
+    \setbox\boxA = \hbox{#1}%
+    \ifdim\wd\boxA = 0pt
+      \ %
+    \else
+      %
+      % If we must, put the page number on a line of its own, and fill out
+      % this line with blank space.  (The \hfil is overwhelmed with the
+      % fill leaders glue in \indexdotfill if the page number does fit.)
+      \hfil\penalty50
+      \null\nobreak\indexdotfill % Have leaders before the page number.
+      %
+      % The `\ ' here is removed by the implicit \unskip that TeX does as
+      % part of (the primitive) \par.  Without it, a spurious underfull
+      % \hbox ensues.
+      \ifpdf
+	\pdfgettoks#1.%
+	\ \the\toksA
+      \else
+	\ #1%
+      \fi
+    \fi
+    \par
+  \endgroup
+}
+
+% Like plain.tex's \dotfill, except uses up at least 1 em.
+\def\indexdotfill{\cleaders
+  \hbox{$\mathsurround=0pt \mkern1.5mu.\mkern1.5mu$}\hskip 1em plus 1fill}
+
+\def\primary #1{\line{#1\hfil}}
+
+\newskip\secondaryindent \secondaryindent=0.5cm
+\def\secondary#1#2{{%
+  \parfillskip=0in
+  \parskip=0in
+  \hangindent=1in
+  \hangafter=1
+  \noindent\hskip\secondaryindent\hbox{#1}\indexdotfill
+  \ifpdf
+    \pdfgettoks#2.\ \the\toksA % The page number ends the paragraph.
+  \else
+    #2
+  \fi
+  \par
+}}
+
+% Define two-column mode, which we use to typeset indexes.
+% Adapted from the TeXbook, page 416, which is to say,
+% the manmac.tex format used to print the TeXbook itself.
+\catcode`\@=11
+
+\newbox\partialpage
+\newdimen\doublecolumnhsize
+
+\def\begindoublecolumns{\begingroup % ended by \enddoublecolumns
+  % Grab any single-column material above us.
+  \output = {%
+    %
+    % Here is a possibility not foreseen in manmac: if we accumulate a
+    % whole lot of material, we might end up calling this \output
+    % routine twice in a row (see the doublecol-lose test, which is
+    % essentially a couple of indexes with @setchapternewpage off).  In
+    % that case we just ship out what is in \partialpage with the normal
+    % output routine.  Generally, \partialpage will be empty when this
+    % runs and this will be a no-op.  See the indexspread.tex test case.
+    \ifvoid\partialpage \else
+      \onepageout{\pagecontents\partialpage}%
+    \fi
+    %
+    \global\setbox\partialpage = \vbox{%
+      % Unvbox the main output page.
+      \unvbox\PAGE
+      \kern-\topskip \kern\baselineskip
+    }%
+  }%
+  \eject % run that output routine to set \partialpage
+  %
+  % Use the double-column output routine for subsequent pages.
+  \output = {\doublecolumnout}%
+  %
+  % Change the page size parameters.  We could do this once outside this
+  % routine, in each of @smallbook, @afourpaper, and the default 8.5x11
+  % format, but then we repeat the same computation.  Repeating a couple
+  % of assignments once per index is clearly meaningless for the
+  % execution time, so we may as well do it in one place.
+  %
+  % First we halve the line length, less a little for the gutter between
+  % the columns.  We compute the gutter based on the line length, so it
+  % changes automatically with the paper format.  The magic constant
+  % below is chosen so that the gutter has the same value (well, +-<1pt)
+  % as it did when we hard-coded it.
+  %
+  % We put the result in a separate register, \doublecolumhsize, so we
+  % can restore it in \pagesofar, after \hsize itself has (potentially)
+  % been clobbered.
+  %
+  \doublecolumnhsize = \hsize
+    \advance\doublecolumnhsize by -.04154\hsize
+    \divide\doublecolumnhsize by 2
+  \hsize = \doublecolumnhsize
+  %
+  % Double the \vsize as well.  (We don't need a separate register here,
+  % since nobody clobbers \vsize.)
+  \vsize = 2\vsize
+}
+
+% The double-column output routine for all double-column pages except
+% the last.
+%
+\def\doublecolumnout{%
+  \splittopskip=\topskip \splitmaxdepth=\maxdepth
+  % Get the available space for the double columns -- the normal
+  % (undoubled) page height minus any material left over from the
+  % previous page.
+  \dimen@ = \vsize
+  \divide\dimen@ by 2
+  \advance\dimen@ by -\ht\partialpage
+  %
+  % box0 will be the left-hand column, box2 the right.
+  \setbox0=\vsplit255 to\dimen@ \setbox2=\vsplit255 to\dimen@
+  \onepageout\pagesofar
+  \unvbox255
+  \penalty\outputpenalty
+}
+%
+% Re-output the contents of the output page -- any previous material,
+% followed by the two boxes we just split, in box0 and box2.
+\def\pagesofar{%
+  \unvbox\partialpage
+  %
+  \hsize = \doublecolumnhsize
+  \wd0=\hsize \wd2=\hsize
+  \hbox to\pagewidth{\box0\hfil\box2}%
+}
+%
+% All done with double columns.
+\def\enddoublecolumns{%
+  % The following penalty ensures that the page builder is exercised
+  % _before_ we change the output routine.  This is necessary in the
+  % following situation:
+  %
+  % The last section of the index consists only of a single entry.
+  % Before this section, \pagetotal is less than \pagegoal, so no
+  % break occurs before the last section starts.  However, the last
+  % section, consisting of \initial and the single \entry, does not
+  % fit on the page and has to be broken off.  Without the following
+  % penalty the page builder will not be exercised until \eject
+  % below, and by that time we'll already have changed the output
+  % routine to the \balancecolumns version, so the next-to-last
+  % double-column page will be processed with \balancecolumns, which
+  % is wrong:  The two columns will go to the main vertical list, with
+  % the broken-off section in the recent contributions.  As soon as
+  % the output routine finishes, TeX starts reconsidering the page
+  % break.  The two columns and the broken-off section both fit on the
+  % page, because the two columns now take up only half of the page
+  % goal.  When TeX sees \eject from below which follows the final
+  % section, it invokes the new output routine that we've set after
+  % \balancecolumns below; \onepageout will try to fit the two columns
+  % and the final section into the vbox of \pageheight (see
+  % \pagebody), causing an overfull box.
+  %
+  % Note that glue won't work here, because glue does not exercise the
+  % page builder, unlike penalties (see The TeXbook, pp. 280-281).
+  \penalty0
+  %
+  \output = {%
+    % Split the last of the double-column material.  Leave it on the
+    % current page, no automatic page break.
+    \balancecolumns
+    %
+    % If we end up splitting too much material for the current page,
+    % though, there will be another page break right after this \output
+    % invocation ends.  Having called \balancecolumns once, we do not
+    % want to call it again.  Therefore, reset \output to its normal
+    % definition right away.  (We hope \balancecolumns will never be
+    % called on to balance too much material, but if it is, this makes
+    % the output somewhat more palatable.)
+    \global\output = {\onepageout{\pagecontents\PAGE}}%
+  }%
+  \eject
+  \endgroup % started in \begindoublecolumns
+  %
+  % \pagegoal was set to the doubled \vsize above, since we restarted
+  % the current page.  We're now back to normal single-column
+  % typesetting, so reset \pagegoal to the normal \vsize (after the
+  % \endgroup where \vsize got restored).
+  \pagegoal = \vsize
+}
+%
+% Called at the end of the double column material.
+\def\balancecolumns{%
+  \setbox0 = \vbox{\unvbox255}% like \box255 but more efficient, see p.120.
+  \dimen@ = \ht0
+  \advance\dimen@ by \topskip
+  \advance\dimen@ by-\baselineskip
+  \divide\dimen@ by 2 % target to split to
+  %debug\message{final 2-column material height=\the\ht0, target=\the\dimen@.}%
+  \splittopskip = \topskip
+  % Loop until we get a decent breakpoint.
+  {%
+    \vbadness = 10000
+    \loop
+      \global\setbox3 = \copy0
+      \global\setbox1 = \vsplit3 to \dimen@
+    \ifdim\ht3>\dimen@
+      \global\advance\dimen@ by 1pt
+    \repeat
+  }%
+  %debug\message{split to \the\dimen@, column heights: \the\ht1, \the\ht3.}%
+  \setbox0=\vbox to\dimen@{\unvbox1}%
+  \setbox2=\vbox to\dimen@{\unvbox3}%
+  %
+  \pagesofar
+}
+\catcode`\@ = \other
+
+
+\message{sectioning,}
+% Chapters, sections, etc.
+
+% Let's start with @part.
+\outer\parseargdef\part{\partzzz{#1}}
+\def\partzzz#1{%
+  \chapoddpage
+  \null
+  \vskip.3\vsize  % move it down on the page a bit
+  \begingroup
+    \noindent \titlefonts\rmisbold #1\par % the text
+    \let\lastnode=\empty      % no node to associate with
+    \writetocentry{part}{#1}{}% but put it in the toc
+    \headingsoff              % no headline or footline on the part page
+    \chapoddpage
+  \endgroup
+}
+
+% \unnumberedno is an oxymoron.  But we count the unnumbered
+% sections so that we can refer to them unambiguously in the pdf
+% outlines by their "section number".  We avoid collisions with chapter
+% numbers by starting them at 10000.  (If a document ever has 10000
+% chapters, we're in trouble anyway, I'm sure.)
+\newcount\unnumberedno \unnumberedno = 10000
+\newcount\chapno
+\newcount\secno        \secno=0
+\newcount\subsecno     \subsecno=0
+\newcount\subsubsecno  \subsubsecno=0
+
+% This counter is funny since it counts through charcodes of letters A, B, ...
+\newcount\appendixno  \appendixno = `\@
+%
+% \def\appendixletter{\char\the\appendixno}
+% We do the following ugly conditional instead of the above simple
+% construct for the sake of pdftex, which needs the actual
+% letter in the expansion, not just typeset.
+%
+\def\appendixletter{%
+  \ifnum\appendixno=`A A%
+  \else\ifnum\appendixno=`B B%
+  \else\ifnum\appendixno=`C C%
+  \else\ifnum\appendixno=`D D%
+  \else\ifnum\appendixno=`E E%
+  \else\ifnum\appendixno=`F F%
+  \else\ifnum\appendixno=`G G%
+  \else\ifnum\appendixno=`H H%
+  \else\ifnum\appendixno=`I I%
+  \else\ifnum\appendixno=`J J%
+  \else\ifnum\appendixno=`K K%
+  \else\ifnum\appendixno=`L L%
+  \else\ifnum\appendixno=`M M%
+  \else\ifnum\appendixno=`N N%
+  \else\ifnum\appendixno=`O O%
+  \else\ifnum\appendixno=`P P%
+  \else\ifnum\appendixno=`Q Q%
+  \else\ifnum\appendixno=`R R%
+  \else\ifnum\appendixno=`S S%
+  \else\ifnum\appendixno=`T T%
+  \else\ifnum\appendixno=`U U%
+  \else\ifnum\appendixno=`V V%
+  \else\ifnum\appendixno=`W W%
+  \else\ifnum\appendixno=`X X%
+  \else\ifnum\appendixno=`Y Y%
+  \else\ifnum\appendixno=`Z Z%
+  % The \the is necessary, despite appearances, because \appendixletter is
+  % expanded while writing the .toc file.  \char\appendixno is not
+  % expandable, thus it is written literally, thus all appendixes come out
+  % with the same letter (or @) in the toc without it.
+  \else\char\the\appendixno
+  \fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi
+  \fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi\fi}
+
+% Each @chapter defines these (using marks) as the number+name, number
+% and name of the chapter.  Page headings and footings can use
+% these.  @section does likewise.
+\def\thischapter{}
+\def\thischapternum{}
+\def\thischaptername{}
+\def\thissection{}
+\def\thissectionnum{}
+\def\thissectionname{}
+
+\newcount\absseclevel % used to calculate proper heading level
+\newcount\secbase\secbase=0 % @raisesections/@lowersections modify this count
+
+% @raisesections: treat @section as chapter, @subsection as section, etc.
+\def\raisesections{\global\advance\secbase by -1}
+\let\up=\raisesections % original BFox name
+
+% @lowersections: treat @chapter as section, @section as subsection, etc.
+\def\lowersections{\global\advance\secbase by 1}
+\let\down=\lowersections % original BFox name
+
+% we only have subsub.
+\chardef\maxseclevel = 3
+%
+% A numbered section within an unnumbered changes to unnumbered too.
+% To achieve this, remember the "biggest" unnum. sec. we are currently in:
+\chardef\unnlevel = \maxseclevel
+%
+% Trace whether the current chapter is an appendix or not:
+% \chapheadtype is "N" or "A", unnumbered chapters are ignored.
+\def\chapheadtype{N}
+
+% Choose a heading macro
+% #1 is heading type
+% #2 is heading level
+% #3 is text for heading
+\def\genhead#1#2#3{%
+  % Compute the abs. sec. level:
+  \absseclevel=#2
+  \advance\absseclevel by \secbase
+  % Make sure \absseclevel doesn't fall outside the range:
+  \ifnum \absseclevel < 0
+    \absseclevel = 0
+  \else
+    \ifnum \absseclevel > 3
+      \absseclevel = 3
+    \fi
+  \fi
+  % The heading type:
+  \def\headtype{#1}%
+  \if \headtype U%
+    \ifnum \absseclevel < \unnlevel
+      \chardef\unnlevel = \absseclevel
+    \fi
+  \else
+    % Check for appendix sections:
+    \ifnum \absseclevel = 0
+      \edef\chapheadtype{\headtype}%
+    \else
+      \if \headtype A\if \chapheadtype N%
+	\errmessage{@appendix... within a non-appendix chapter}%
+      \fi\fi
+    \fi
+    % Check for numbered within unnumbered:
+    \ifnum \absseclevel > \unnlevel
+      \def\headtype{U}%
+    \else
+      \chardef\unnlevel = 3
+    \fi
+  \fi
+  % Now print the heading:
+  \if \headtype U%
+    \ifcase\absseclevel
+	\unnumberedzzz{#3}%
+    \or \unnumberedseczzz{#3}%
+    \or \unnumberedsubseczzz{#3}%
+    \or \unnumberedsubsubseczzz{#3}%
+    \fi
+  \else
+    \if \headtype A%
+      \ifcase\absseclevel
+	  \appendixzzz{#3}%
+      \or \appendixsectionzzz{#3}%
+      \or \appendixsubseczzz{#3}%
+      \or \appendixsubsubseczzz{#3}%
+      \fi
+    \else
+      \ifcase\absseclevel
+	  \chapterzzz{#3}%
+      \or \seczzz{#3}%
+      \or \numberedsubseczzz{#3}%
+      \or \numberedsubsubseczzz{#3}%
+      \fi
+    \fi
+  \fi
+  \suppressfirstparagraphindent
+}
+
+% an interface:
+\def\numhead{\genhead N}
+\def\apphead{\genhead A}
+\def\unnmhead{\genhead U}
+
+% @chapter, @appendix, @unnumbered.  Increment top-level counter, reset
+% all lower-level sectioning counters to zero.
+%
+% Also set \chaplevelprefix, which we prepend to @float sequence numbers
+% (e.g., figures), q.v.  By default (before any chapter), that is empty.
+\let\chaplevelprefix = \empty
+%
+\outer\parseargdef\chapter{\numhead0{#1}} % normally numhead0 calls chapterzzz
+\def\chapterzzz#1{%
+  % section resetting is \global in case the chapter is in a group, such
+  % as an @include file.
+  \global\secno=0 \global\subsecno=0 \global\subsubsecno=0
+    \global\advance\chapno by 1
+  %
+  % Used for \float.
+  \gdef\chaplevelprefix{\the\chapno.}%
+  \resetallfloatnos
+  %
+  % \putwordChapter can contain complex things in translations.
+  \toks0=\expandafter{\putwordChapter}%
+  \message{\the\toks0 \space \the\chapno}%
+  %
+  % Write the actual heading.
+  \chapmacro{#1}{Ynumbered}{\the\chapno}%
+  %
+  % So @section and the like are numbered underneath this chapter.
+  \global\let\section = \numberedsec
+  \global\let\subsection = \numberedsubsec
+  \global\let\subsubsection = \numberedsubsubsec
+}
+
+\outer\parseargdef\appendix{\apphead0{#1}} % normally calls appendixzzz
+%
+\def\appendixzzz#1{%
+  \global\secno=0 \global\subsecno=0 \global\subsubsecno=0
+    \global\advance\appendixno by 1
+  \gdef\chaplevelprefix{\appendixletter.}%
+  \resetallfloatnos
+  %
+  % \putwordAppendix can contain complex things in translations.
+  \toks0=\expandafter{\putwordAppendix}%
+  \message{\the\toks0 \space \appendixletter}%
+  %
+  \chapmacro{#1}{Yappendix}{\appendixletter}%
+  %
+  \global\let\section = \appendixsec
+  \global\let\subsection = \appendixsubsec
+  \global\let\subsubsection = \appendixsubsubsec
+}
+
+% normally unnmhead0 calls unnumberedzzz:
+\outer\parseargdef\unnumbered{\unnmhead0{#1}}
+\def\unnumberedzzz#1{%
+  \global\secno=0 \global\subsecno=0 \global\subsubsecno=0
+    \global\advance\unnumberedno by 1
+  %
+  % Since an unnumbered has no number, no prefix for figures.
+  \global\let\chaplevelprefix = \empty
+  \resetallfloatnos
+  %
+  % This used to be simply \message{#1}, but TeX fully expands the
+  % argument to \message.  Therefore, if #1 contained @-commands, TeX
+  % expanded them.  For example, in `@unnumbered The @cite{Book}', TeX
+  % expanded @cite (which turns out to cause errors because \cite is meant
+  % to be executed, not expanded).
+  %
+  % Anyway, we don't want the fully-expanded definition of @cite to appear
+  % as a result of the \message, we just want `@cite' itself.  We use
+  % \the<toks register> to achieve this: TeX expands \the<toks> only once,
+  % simply yielding the contents of <toks register>.  (We also do this for
+  % the toc entries.)
+  \toks0 = {#1}%
+  \message{(\the\toks0)}%
+  %
+  \chapmacro{#1}{Ynothing}{\the\unnumberedno}%
+  %
+  \global\let\section = \unnumberedsec
+  \global\let\subsection = \unnumberedsubsec
+  \global\let\subsubsection = \unnumberedsubsubsec
+}
+
+% @centerchap is like @unnumbered, but the heading is centered.
+\outer\parseargdef\centerchap{%
+  % Well, we could do the following in a group, but that would break
+  % an assumption that \chapmacro is called at the outermost level.
+  % Thus we are safer this way:		--kasal, 24feb04
+  \let\centerparametersmaybe = \centerparameters
+  \unnmhead0{#1}%
+  \let\centerparametersmaybe = \relax
+}
+
+% @top is like @unnumbered.
+\let\top\unnumbered
+
+% Sections.
+% 
+\outer\parseargdef\numberedsec{\numhead1{#1}} % normally calls seczzz
+\def\seczzz#1{%
+  \global\subsecno=0 \global\subsubsecno=0  \global\advance\secno by 1
+  \sectionheading{#1}{sec}{Ynumbered}{\the\chapno.\the\secno}%
+}
+
+% normally calls appendixsectionzzz:
+\outer\parseargdef\appendixsection{\apphead1{#1}}
+\def\appendixsectionzzz#1{%
+  \global\subsecno=0 \global\subsubsecno=0  \global\advance\secno by 1
+  \sectionheading{#1}{sec}{Yappendix}{\appendixletter.\the\secno}%
+}
+\let\appendixsec\appendixsection
+
+% normally calls unnumberedseczzz:
+\outer\parseargdef\unnumberedsec{\unnmhead1{#1}}
+\def\unnumberedseczzz#1{%
+  \global\subsecno=0 \global\subsubsecno=0  \global\advance\secno by 1
+  \sectionheading{#1}{sec}{Ynothing}{\the\unnumberedno.\the\secno}%
+}
+
+% Subsections.
+% 
+% normally calls numberedsubseczzz:
+\outer\parseargdef\numberedsubsec{\numhead2{#1}}
+\def\numberedsubseczzz#1{%
+  \global\subsubsecno=0  \global\advance\subsecno by 1
+  \sectionheading{#1}{subsec}{Ynumbered}{\the\chapno.\the\secno.\the\subsecno}%
+}
+
+% normally calls appendixsubseczzz:
+\outer\parseargdef\appendixsubsec{\apphead2{#1}}
+\def\appendixsubseczzz#1{%
+  \global\subsubsecno=0  \global\advance\subsecno by 1
+  \sectionheading{#1}{subsec}{Yappendix}%
+                 {\appendixletter.\the\secno.\the\subsecno}%
+}
+
+% normally calls unnumberedsubseczzz:
+\outer\parseargdef\unnumberedsubsec{\unnmhead2{#1}}
+\def\unnumberedsubseczzz#1{%
+  \global\subsubsecno=0  \global\advance\subsecno by 1
+  \sectionheading{#1}{subsec}{Ynothing}%
+                 {\the\unnumberedno.\the\secno.\the\subsecno}%
+}
+
+% Subsubsections.
+% 
+% normally numberedsubsubseczzz:
+\outer\parseargdef\numberedsubsubsec{\numhead3{#1}}
+\def\numberedsubsubseczzz#1{%
+  \global\advance\subsubsecno by 1
+  \sectionheading{#1}{subsubsec}{Ynumbered}%
+                 {\the\chapno.\the\secno.\the\subsecno.\the\subsubsecno}%
+}
+
+% normally appendixsubsubseczzz:
+\outer\parseargdef\appendixsubsubsec{\apphead3{#1}}
+\def\appendixsubsubseczzz#1{%
+  \global\advance\subsubsecno by 1
+  \sectionheading{#1}{subsubsec}{Yappendix}%
+                 {\appendixletter.\the\secno.\the\subsecno.\the\subsubsecno}%
+}
+
+% normally unnumberedsubsubseczzz:
+\outer\parseargdef\unnumberedsubsubsec{\unnmhead3{#1}}
+\def\unnumberedsubsubseczzz#1{%
+  \global\advance\subsubsecno by 1
+  \sectionheading{#1}{subsubsec}{Ynothing}%
+                 {\the\unnumberedno.\the\secno.\the\subsecno.\the\subsubsecno}%
+}
+
+% These macros control what the section commands do, according
+% to what kind of chapter we are in (ordinary, appendix, or unnumbered).
+% Define them by default for a numbered chapter.
+\let\section = \numberedsec
+\let\subsection = \numberedsubsec
+\let\subsubsection = \numberedsubsubsec
+
+% Define @majorheading, @heading and @subheading
+
+% NOTE on use of \vbox for chapter headings, section headings, and such:
+%       1) We use \vbox rather than the earlier \line to permit
+%          overlong headings to fold.
+%       2) \hyphenpenalty is set to 10000 because hyphenation in a
+%          heading is obnoxious; this forbids it.
+%       3) Likewise, headings look best if no \parindent is used, and
+%          if justification is not attempted.  Hence \raggedright.
+
+\def\majorheading{%
+  {\advance\chapheadingskip by 10pt \chapbreak }%
+  \parsearg\chapheadingzzz
+}
+
+\def\chapheading{\chapbreak \parsearg\chapheadingzzz}
+\def\chapheadingzzz#1{%
+  {\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+                    \parindent=0pt\ptexraggedright
+                    \rmisbold #1\hfill}}%
+  \bigskip \par\penalty 200\relax
+  \suppressfirstparagraphindent
+}
+
+% @heading, @subheading, @subsubheading.
+\parseargdef\heading{\sectionheading{#1}{sec}{Yomitfromtoc}{}
+  \suppressfirstparagraphindent}
+\parseargdef\subheading{\sectionheading{#1}{subsec}{Yomitfromtoc}{}
+  \suppressfirstparagraphindent}
+\parseargdef\subsubheading{\sectionheading{#1}{subsubsec}{Yomitfromtoc}{}
+  \suppressfirstparagraphindent}
+
+% These macros generate a chapter, section, etc. heading only
+% (including whitespace, linebreaking, etc. around it),
+% given all the information in convenient, parsed form.
+
+% Args are the skip and penalty (usually negative)
+\def\dobreak#1#2{\par\ifdim\lastskip<#1\removelastskip\penalty#2\vskip#1\fi}
+
+% Parameter controlling skip before chapter headings (if needed)
+\newskip\chapheadingskip
+
+% Define plain chapter starts, and page on/off switching for it.
+\def\chapbreak{\dobreak \chapheadingskip {-4000}}
+\def\chappager{\par\vfill\supereject}
+% Because \domark is called before \chapoddpage, the filler page will
+% get the headings for the next chapter, which is wrong.  But we don't
+% care -- we just disable all headings on the filler page.
+\def\chapoddpage{%
+  \chappager
+  \ifodd\pageno \else
+    \begingroup
+      \headingsoff
+      \null
+      \chappager
+    \endgroup
+  \fi
+}
+
+\def\setchapternewpage #1 {\csname CHAPPAG#1\endcsname}
+
+\def\CHAPPAGoff{%
+\global\let\contentsalignmacro = \chappager
+\global\let\pchapsepmacro=\chapbreak
+\global\let\pagealignmacro=\chappager}
+
+\def\CHAPPAGon{%
+\global\let\contentsalignmacro = \chappager
+\global\let\pchapsepmacro=\chappager
+\global\let\pagealignmacro=\chappager
+\global\def\HEADINGSon{\HEADINGSsingle}}
+
+\def\CHAPPAGodd{%
+\global\let\contentsalignmacro = \chapoddpage
+\global\let\pchapsepmacro=\chapoddpage
+\global\let\pagealignmacro=\chapoddpage
+\global\def\HEADINGSon{\HEADINGSdouble}}
+
+\CHAPPAGon
+
+% Chapter opening.
+%
+% #1 is the text, #2 is the section type (Ynumbered, Ynothing,
+% Yappendix, Yomitfromtoc), #3 the chapter number.
+%
+% To test against our argument.
+\def\Ynothingkeyword{Ynothing}
+\def\Yomitfromtockeyword{Yomitfromtoc}
+\def\Yappendixkeyword{Yappendix}
+%
+\def\chapmacro#1#2#3{%
+  % Insert the first mark before the heading break (see notes for \domark).
+  \let\prevchapterdefs=\lastchapterdefs
+  \let\prevsectiondefs=\lastsectiondefs
+  \gdef\lastsectiondefs{\gdef\thissectionname{}\gdef\thissectionnum{}%
+                        \gdef\thissection{}}%
+  %
+  \def\temptype{#2}%
+  \ifx\temptype\Ynothingkeyword
+    \gdef\lastchapterdefs{\gdef\thischaptername{#1}\gdef\thischapternum{}%
+                          \gdef\thischapter{\thischaptername}}%
+  \else\ifx\temptype\Yomitfromtockeyword
+    \gdef\lastchapterdefs{\gdef\thischaptername{#1}\gdef\thischapternum{}%
+                          \gdef\thischapter{}}%
+  \else\ifx\temptype\Yappendixkeyword
+    \toks0={#1}%
+    \xdef\lastchapterdefs{%
+      \gdef\noexpand\thischaptername{\the\toks0}%
+      \gdef\noexpand\thischapternum{\appendixletter}%
+      % \noexpand\putwordAppendix avoids expanding indigestible
+      % commands in some of the translations.
+      \gdef\noexpand\thischapter{\noexpand\putwordAppendix{}
+                                 \noexpand\thischapternum:
+                                 \noexpand\thischaptername}%
+    }%
+  \else
+    \toks0={#1}%
+    \xdef\lastchapterdefs{%
+      \gdef\noexpand\thischaptername{\the\toks0}%
+      \gdef\noexpand\thischapternum{\the\chapno}%
+      % \noexpand\putwordChapter avoids expanding indigestible
+      % commands in some of the translations.
+      \gdef\noexpand\thischapter{\noexpand\putwordChapter{}
+                                 \noexpand\thischapternum:
+                                 \noexpand\thischaptername}%
+    }%
+  \fi\fi\fi
+  %
+  % Output the mark.  Pass it through \safewhatsit, to take care of
+  % the preceding space.
+  \safewhatsit\domark
+  %
+  % Insert the chapter heading break.
+  \pchapsepmacro
+  %
+  % Now the second mark, after the heading break.  No break points
+  % between here and the heading.
+  \let\prevchapterdefs=\lastchapterdefs
+  \let\prevsectiondefs=\lastsectiondefs
+  \domark
+  %
+  {%
+    \chapfonts \rmisbold
+    %
+    % Have to define \lastsection before calling \donoderef, because the
+    % xref code eventually uses it.  On the other hand, it has to be called
+    % after \pchapsepmacro, or the headline will change too soon.
+    \gdef\lastsection{#1}%
+    %
+    % Only insert the separating space if we have a chapter/appendix
+    % number, and don't print the unnumbered ``number''.
+    \ifx\temptype\Ynothingkeyword
+      \setbox0 = \hbox{}%
+      \def\toctype{unnchap}%
+    \else\ifx\temptype\Yomitfromtockeyword
+      \setbox0 = \hbox{}% contents like unnumbered, but no toc entry
+      \def\toctype{omit}%
+    \else\ifx\temptype\Yappendixkeyword
+      \setbox0 = \hbox{\putwordAppendix{} #3\enspace}%
+      \def\toctype{app}%
+    \else
+      \setbox0 = \hbox{#3\enspace}%
+      \def\toctype{numchap}%
+    \fi\fi\fi
+    %
+    % Write the toc entry for this chapter.  Must come before the
+    % \donoderef, because we include the current node name in the toc
+    % entry, and \donoderef resets it to empty.
+    \writetocentry{\toctype}{#1}{#3}%
+    %
+    % For pdftex, we have to write out the node definition (aka, make
+    % the pdfdest) after any page break, but before the actual text has
+    % been typeset.  If the destination for the pdf outline is after the
+    % text, then jumping from the outline may wind up with the text not
+    % being visible, for instance under high magnification.
+    \donoderef{#2}%
+    %
+    % Typeset the actual heading.
+    \nobreak % Avoid page breaks at the interline glue.
+    \vbox{\hyphenpenalty=10000 \tolerance=5000 \parindent=0pt \ptexraggedright
+          \hangindent=\wd0 \centerparametersmaybe
+          \unhbox0 #1\par}%
+  }%
+  \nobreak\bigskip % no page break after a chapter title
+  \nobreak
+}
+
+% @centerchap -- centered and unnumbered.
+\let\centerparametersmaybe = \relax
+\def\centerparameters{%
+  \advance\rightskip by 3\rightskip
+  \leftskip = \rightskip
+  \parfillskip = 0pt
+}
+
+
+% I don't think this chapter style is supported any more, so I'm not
+% updating it with the new noderef stuff.  We'll see.  --karl, 11aug03.
+%
+\def\setchapterstyle #1 {\csname CHAPF#1\endcsname}
+%
+\def\unnchfopen #1{%
+\chapoddpage {\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+                       \parindent=0pt\ptexraggedright
+                       \rmisbold #1\hfill}}\bigskip \par\nobreak
+}
+\def\chfopen #1#2{\chapoddpage {\chapfonts
+\vbox to 3in{\vfil \hbox to\hsize{\hfil #2} \hbox to\hsize{\hfil #1} \vfil}}%
+\par\penalty 5000 %
+}
+\def\centerchfopen #1{%
+\chapoddpage {\chapfonts \vbox{\hyphenpenalty=10000\tolerance=5000
+                       \parindent=0pt
+                       \hfill {\rmisbold #1}\hfill}}\bigskip \par\nobreak
+}
+\def\CHAPFopen{%
+  \global\let\chapmacro=\chfopen
+  \global\let\centerchapmacro=\centerchfopen}
+
+
+% Section titles.  These macros combine the section number parts and
+% call the generic \sectionheading to do the printing.
+%
+\newskip\secheadingskip
+\def\secheadingbreak{\dobreak \secheadingskip{-1000}}
+
+% Subsection titles.
+\newskip\subsecheadingskip
+\def\subsecheadingbreak{\dobreak \subsecheadingskip{-500}}
+
+% Subsubsection titles.
+\def\subsubsecheadingskip{\subsecheadingskip}
+\def\subsubsecheadingbreak{\subsecheadingbreak}
+
+
+% Print any size, any type, section title.
+%
+% #1 is the text, #2 is the section level (sec/subsec/subsubsec), #3 is
+% the section type for xrefs (Ynumbered, Ynothing, Yappendix), #4 is the
+% section number.
+%
+\def\seckeyword{sec}
+%
+\def\sectionheading#1#2#3#4{%
+  {%
+    \checkenv{}% should not be in an environment.
+    %
+    % Switch to the right set of fonts.
+    \csname #2fonts\endcsname \rmisbold
+    %
+    \def\sectionlevel{#2}%
+    \def\temptype{#3}%
+    %
+    % Insert first mark before the heading break (see notes for \domark).
+    \let\prevsectiondefs=\lastsectiondefs
+    \ifx\temptype\Ynothingkeyword
+      \ifx\sectionlevel\seckeyword
+        \gdef\lastsectiondefs{\gdef\thissectionname{#1}\gdef\thissectionnum{}%
+                              \gdef\thissection{\thissectionname}}%
+      \fi
+    \else\ifx\temptype\Yomitfromtockeyword
+      % Don't redefine \thissection.
+    \else\ifx\temptype\Yappendixkeyword
+      \ifx\sectionlevel\seckeyword
+        \toks0={#1}%
+        \xdef\lastsectiondefs{%
+          \gdef\noexpand\thissectionname{\the\toks0}%
+          \gdef\noexpand\thissectionnum{#4}%
+          % \noexpand\putwordSection avoids expanding indigestible
+          % commands in some of the translations.
+          \gdef\noexpand\thissection{\noexpand\putwordSection{}
+                                     \noexpand\thissectionnum:
+                                     \noexpand\thissectionname}%
+        }%
+      \fi
+    \else
+      \ifx\sectionlevel\seckeyword
+        \toks0={#1}%
+        \xdef\lastsectiondefs{%
+          \gdef\noexpand\thissectionname{\the\toks0}%
+          \gdef\noexpand\thissectionnum{#4}%
+          % \noexpand\putwordSection avoids expanding indigestible
+          % commands in some of the translations.
+          \gdef\noexpand\thissection{\noexpand\putwordSection{}
+                                     \noexpand\thissectionnum:
+                                     \noexpand\thissectionname}%
+        }%
+      \fi
+    \fi\fi\fi
+    %
+    % Go into vertical mode.  Usually we'll already be there, but we
+    % don't want the following whatsit to end up in a preceding paragraph
+    % if the document didn't happen to have a blank line.
+    \par
+    %
+    % Output the mark.  Pass it through \safewhatsit, to take care of
+    % the preceding space.
+    \safewhatsit\domark
+    %
+    % Insert space above the heading.
+    \csname #2headingbreak\endcsname
+    %
+    % Now the second mark, after the heading break.  No break points
+    % between here and the heading.
+    \let\prevsectiondefs=\lastsectiondefs
+    \domark
+    %
+    % Only insert the space after the number if we have a section number.
+    \ifx\temptype\Ynothingkeyword
+      \setbox0 = \hbox{}%
+      \def\toctype{unn}%
+      \gdef\lastsection{#1}%
+    \else\ifx\temptype\Yomitfromtockeyword
+      % for @headings -- no section number, don't include in toc,
+      % and don't redefine \lastsection.
+      \setbox0 = \hbox{}%
+      \def\toctype{omit}%
+      \let\sectionlevel=\empty
+    \else\ifx\temptype\Yappendixkeyword
+      \setbox0 = \hbox{#4\enspace}%
+      \def\toctype{app}%
+      \gdef\lastsection{#1}%
+    \else
+      \setbox0 = \hbox{#4\enspace}%
+      \def\toctype{num}%
+      \gdef\lastsection{#1}%
+    \fi\fi\fi
+    %
+    % Write the toc entry (before \donoderef).  See comments in \chapmacro.
+    \writetocentry{\toctype\sectionlevel}{#1}{#4}%
+    %
+    % Write the node reference (= pdf destination for pdftex).
+    % Again, see comments in \chapmacro.
+    \donoderef{#3}%
+    %
+    % Interline glue will be inserted when the vbox is completed.
+    % That glue will be a valid breakpoint for the page, since it'll be
+    % preceded by a whatsit (usually from the \donoderef, or from the
+    % \writetocentry if there was no node).  We don't want to allow that
+    % break, since then the whatsits could end up on page n while the
+    % section is on page n+1, thus toc/etc. are wrong.  Debian bug 276000.
+    \nobreak
+    %
+    % Output the actual section heading.
+    \vbox{\hyphenpenalty=10000 \tolerance=5000 \parindent=0pt \ptexraggedright
+          \hangindent=\wd0  % zero if no section number
+          \unhbox0 #1}%
+  }%
+  % Add extra space after the heading -- half of whatever came above it.
+  % Don't allow stretch, though.
+  \kern .5 \csname #2headingskip\endcsname
+  %
+  % Do not let the kern be a potential breakpoint, as it would be if it
+  % was followed by glue.
+  \nobreak
+  %
+  % We'll almost certainly start a paragraph next, so don't let that
+  % glue accumulate.  (Not a breakpoint because it's preceded by a
+  % discardable item.)  However, when a paragraph is not started next
+  % (\startdefun, \cartouche, \center, etc.), this needs to be wiped out
+  % or the negative glue will cause weirdly wrong output, typically
+  % obscuring the section heading with something else.
+  \vskip-\parskip
+  %
+  % This is so the last item on the main vertical list is a known
+  % \penalty > 10000, so \startdefun, etc., can recognize the situation
+  % and do the needful.
+  \penalty 10001
+}
+
+
+\message{toc,}
+% Table of contents.
+\newwrite\tocfile
+
+% Write an entry to the toc file, opening it if necessary.
+% Called from @chapter, etc.
+%
+% Example usage: \writetocentry{sec}{Section Name}{\the\chapno.\the\secno}
+% We append the current node name (if any) and page number as additional
+% arguments for the \{chap,sec,...}entry macros which will eventually
+% read this.  The node name is used in the pdf outlines as the
+% destination to jump to.
+%
+% We open the .toc file for writing here instead of at @setfilename (or
+% any other fixed time) so that @contents can be anywhere in the document.
+% But if #1 is `omit', then we don't do anything.  This is used for the
+% table of contents chapter openings themselves.
+%
+\newif\iftocfileopened
+\def\omitkeyword{omit}%
+%
+\def\writetocentry#1#2#3{%
+  \edef\writetoctype{#1}%
+  \ifx\writetoctype\omitkeyword \else
+    \iftocfileopened\else
+      \immediate\openout\tocfile = \jobname.toc
+      \global\tocfileopenedtrue
+    \fi
+    %
+    \iflinks
+      {\atdummies
+       \edef\temp{%
+         \write\tocfile{@#1entry{#2}{#3}{\lastnode}{\noexpand\folio}}}%
+       \temp
+      }%
+    \fi
+  \fi
+  %
+  % Tell \shipout to create a pdf destination on each page, if we're
+  % writing pdf.  These are used in the table of contents.  We can't
+  % just write one on every page because the title pages are numbered
+  % 1 and 2 (the page numbers aren't printed), and so are the first
+  % two pages of the document.  Thus, we'd have two destinations named
+  % `1', and two named `2'.
+  \ifpdf \global\pdfmakepagedesttrue \fi
+}
+
+
+% These characters do not print properly in the Computer Modern roman
+% fonts, so we must take special care.  This is more or less redundant
+% with the Texinfo input format setup at the end of this file.
+%
+\def\activecatcodes{%
+  \catcode`\"=\active
+  \catcode`\$=\active
+  \catcode`\<=\active
+  \catcode`\>=\active
+  \catcode`\\=\active
+  \catcode`\^=\active
+  \catcode`\_=\active
+  \catcode`\|=\active
+  \catcode`\~=\active
+}
+
+
+% Read the toc file, which is essentially Texinfo input.
+\def\readtocfile{%
+  \setupdatafile
+  \activecatcodes
+  \input \tocreadfilename
+}
+
+\newskip\contentsrightmargin \contentsrightmargin=1in
+\newcount\savepageno
+\newcount\lastnegativepageno \lastnegativepageno = -1
+
+% Prepare to read what we've written to \tocfile.
+%
+\def\startcontents#1{%
+  % If @setchapternewpage on, and @headings double, the contents should
+  % start on an odd page, unlike chapters.  Thus, we maintain
+  % \contentsalignmacro in parallel with \pagealignmacro.
+  % From: Torbjorn Granlund <tege@matematik.su.se>
+  \contentsalignmacro
+  \immediate\closeout\tocfile
+  %
+  % Don't need to put `Contents' or `Short Contents' in the headline.
+  % It is abundantly clear what they are.
+  \chapmacro{#1}{Yomitfromtoc}{}%
+  %
+  \savepageno = \pageno
+  \begingroup                  % Set up to handle contents files properly.
+    \raggedbottom              % Worry more about breakpoints than the bottom.
+    \advance\hsize by -\contentsrightmargin % Don't use the full line length.
+    %
+    % Roman numerals for page numbers.
+    \ifnum \pageno>0 \global\pageno = \lastnegativepageno \fi
+}
+
+% redefined for the two-volume lispref.  We always output on
+% \jobname.toc even if this is redefined.
+%
+\def\tocreadfilename{\jobname.toc}
+
+% Normal (long) toc.
+%
+\def\contents{%
+  \startcontents{\putwordTOC}%
+    \openin 1 \tocreadfilename\space
+    \ifeof 1 \else
+      \readtocfile
+    \fi
+    \vfill \eject
+    \contentsalignmacro % in case @setchapternewpage odd is in effect
+    \ifeof 1 \else
+      \pdfmakeoutlines
+    \fi
+    \closein 1
+  \endgroup
+  \lastnegativepageno = \pageno
+  \global\pageno = \savepageno
+}
+
+% And just the chapters.
+\def\summarycontents{%
+  \startcontents{\putwordShortTOC}%
+    %
+    \let\partentry = \shortpartentry
+    \let\numchapentry = \shortchapentry
+    \let\appentry = \shortchapentry
+    \let\unnchapentry = \shortunnchapentry
+    % We want a true roman here for the page numbers.
+    \secfonts
+    \let\rm=\shortcontrm \let\bf=\shortcontbf
+    \let\sl=\shortcontsl \let\tt=\shortconttt
+    \rm
+    \hyphenpenalty = 10000
+    \advance\baselineskip by 1pt % Open it up a little.
+    \def\numsecentry##1##2##3##4{}
+    \let\appsecentry = \numsecentry
+    \let\unnsecentry = \numsecentry
+    \let\numsubsecentry = \numsecentry
+    \let\appsubsecentry = \numsecentry
+    \let\unnsubsecentry = \numsecentry
+    \let\numsubsubsecentry = \numsecentry
+    \let\appsubsubsecentry = \numsecentry
+    \let\unnsubsubsecentry = \numsecentry
+    \openin 1 \tocreadfilename\space
+    \ifeof 1 \else
+      \readtocfile
+    \fi
+    \closein 1
+    \vfill \eject
+    \contentsalignmacro % in case @setchapternewpage odd is in effect
+  \endgroup
+  \lastnegativepageno = \pageno
+  \global\pageno = \savepageno
+}
+\let\shortcontents = \summarycontents
+
+% Typeset the label for a chapter or appendix for the short contents.
+% The arg is, e.g., `A' for an appendix, or `3' for a chapter.
+%
+\def\shortchaplabel#1{%
+  % This space should be enough, since a single number is .5em, and the
+  % widest letter (M) is 1em, at least in the Computer Modern fonts.
+  % But use \hss just in case.
+  % (This space doesn't include the extra space that gets added after
+  % the label; that gets put in by \shortchapentry above.)
+  %
+  % We'd like to right-justify chapter numbers, but that looks strange
+  % with appendix letters.  And right-justifying numbers and
+  % left-justifying letters looks strange when there is less than 10
+  % chapters.  Have to read the whole toc once to know how many chapters
+  % there are before deciding ...
+  \hbox to 1em{#1\hss}%
+}
+
+% These macros generate individual entries in the table of contents.
+% The first argument is the chapter or section name.
+% The last argument is the page number.
+% The arguments in between are the chapter number, section number, ...
+
+% Parts, in the main contents.  Replace the part number, which doesn't
+% exist, with an empty box.  Let's hope all the numbers have the same width.
+% Also ignore the page number, which is conventionally not printed.
+\def\numeralbox{\setbox0=\hbox{8}\hbox to \wd0{\hfil}}
+\def\partentry#1#2#3#4{\dochapentry{\numeralbox\labelspace#1}{}}
+%
+% Parts, in the short toc.
+\def\shortpartentry#1#2#3#4{%
+  \penalty-300
+  \vskip.5\baselineskip plus.15\baselineskip minus.1\baselineskip
+  \shortchapentry{{\bf #1}}{\numeralbox}{}{}%
+}
+
+% Chapters, in the main contents.
+\def\numchapentry#1#2#3#4{\dochapentry{#2\labelspace#1}{#4}}
+%
+% Chapters, in the short toc.
+% See comments in \dochapentry re vbox and related settings.
+\def\shortchapentry#1#2#3#4{%
+  \tocentry{\shortchaplabel{#2}\labelspace #1}{\doshortpageno\bgroup#4\egroup}%
+}
+
+% Appendices, in the main contents.
+% Need the word Appendix, and a fixed-size box.
+%
+\def\appendixbox#1{%
+  % We use M since it's probably the widest letter.
+  \setbox0 = \hbox{\putwordAppendix{} M}%
+  \hbox to \wd0{\putwordAppendix{} #1\hss}}
+%
+\def\appentry#1#2#3#4{\dochapentry{\appendixbox{#2}\labelspace#1}{#4}}
+
+% Unnumbered chapters.
+\def\unnchapentry#1#2#3#4{\dochapentry{#1}{#4}}
+\def\shortunnchapentry#1#2#3#4{\tocentry{#1}{\doshortpageno\bgroup#4\egroup}}
+
+% Sections.
+\def\numsecentry#1#2#3#4{\dosecentry{#2\labelspace#1}{#4}}
+\let\appsecentry=\numsecentry
+\def\unnsecentry#1#2#3#4{\dosecentry{#1}{#4}}
+
+% Subsections.
+\def\numsubsecentry#1#2#3#4{\dosubsecentry{#2\labelspace#1}{#4}}
+\let\appsubsecentry=\numsubsecentry
+\def\unnsubsecentry#1#2#3#4{\dosubsecentry{#1}{#4}}
+
+% And subsubsections.
+\def\numsubsubsecentry#1#2#3#4{\dosubsubsecentry{#2\labelspace#1}{#4}}
+\let\appsubsubsecentry=\numsubsubsecentry
+\def\unnsubsubsecentry#1#2#3#4{\dosubsubsecentry{#1}{#4}}
+
+% This parameter controls the indentation of the various levels.
+% Same as \defaultparindent.
+\newdimen\tocindent \tocindent = 15pt
+
+% Now for the actual typesetting. In all these, #1 is the text and #2 is the
+% page number.
+%
+% If the toc has to be broken over pages, we want it to be at chapters
+% if at all possible; hence the \penalty.
+\def\dochapentry#1#2{%
+   \penalty-300 \vskip1\baselineskip plus.33\baselineskip minus.25\baselineskip
+   \begingroup
+     \chapentryfonts
+     \tocentry{#1}{\dopageno\bgroup#2\egroup}%
+   \endgroup
+   \nobreak\vskip .25\baselineskip plus.1\baselineskip
+}
+
+\def\dosecentry#1#2{\begingroup
+  \secentryfonts \leftskip=\tocindent
+  \tocentry{#1}{\dopageno\bgroup#2\egroup}%
+\endgroup}
+
+\def\dosubsecentry#1#2{\begingroup
+  \subsecentryfonts \leftskip=2\tocindent
+  \tocentry{#1}{\dopageno\bgroup#2\egroup}%
+\endgroup}
+
+\def\dosubsubsecentry#1#2{\begingroup
+  \subsubsecentryfonts \leftskip=3\tocindent
+  \tocentry{#1}{\dopageno\bgroup#2\egroup}%
+\endgroup}
+
+% We use the same \entry macro as for the index entries.
+\let\tocentry = \entry
+
+% Space between chapter (or whatever) number and the title.
+\def\labelspace{\hskip1em \relax}
+
+\def\dopageno#1{{\rm #1}}
+\def\doshortpageno#1{{\rm #1}}
+
+\def\chapentryfonts{\secfonts \rm}
+\def\secentryfonts{\textfonts}
+\def\subsecentryfonts{\textfonts}
+\def\subsubsecentryfonts{\textfonts}
+
+
+\message{environments,}
+% @foo ... @end foo.
+
+% @tex ... @end tex    escapes into raw TeX temporarily.
+% One exception: @ is still an escape character, so that @end tex works.
+% But \@ or @@ will get a plain @ character.
+
+\envdef\tex{%
+  \setupmarkupstyle{tex}%
+  \catcode `\\=0 \catcode `\{=1 \catcode `\}=2
+  \catcode `\$=3 \catcode `\&=4 \catcode `\#=6
+  \catcode `\^=7 \catcode `\_=8 \catcode `\~=\active \let~=\tie
+  \catcode `\%=14
+  \catcode `\+=\other
+  \catcode `\"=\other
+  \catcode `\|=\other
+  \catcode `\<=\other
+  \catcode `\>=\other
+  \catcode`\`=\other
+  \catcode`\'=\other
+  \escapechar=`\\
+  %
+  % ' is active in math mode (mathcode"8000).  So reset it, and all our
+  % other math active characters (just in case), to plain's definitions.
+  \mathactive
+  %
+  \let\b=\ptexb
+  \let\bullet=\ptexbullet
+  \let\c=\ptexc
+  \let\,=\ptexcomma
+  \let\.=\ptexdot
+  \let\dots=\ptexdots
+  \let\equiv=\ptexequiv
+  \let\!=\ptexexclam
+  \let\i=\ptexi
+  \let\indent=\ptexindent
+  \let\noindent=\ptexnoindent
+  \let\{=\ptexlbrace
+  \let\+=\tabalign
+  \let\}=\ptexrbrace
+  \let\/=\ptexslash
+  \let\*=\ptexstar
+  \let\t=\ptext
+  \expandafter \let\csname top\endcsname=\ptextop  % outer
+  \let\frenchspacing=\plainfrenchspacing
+  %
+  \def\endldots{\mathinner{\ldots\ldots\ldots\ldots}}%
+  \def\enddots{\relax\ifmmode\endldots\else$\mathsurround=0pt \endldots\,$\fi}%
+  \def\@{@}%
+}
+% There is no need to define \Etex.
+
+% Define @lisp ... @end lisp.
+% @lisp environment forms a group so it can rebind things,
+% including the definition of @end lisp (which normally is erroneous).
+
+% Amount to narrow the margins by for @lisp.
+\newskip\lispnarrowing \lispnarrowing=0.4in
+
+% This is the definition that ^^M gets inside @lisp, @example, and other
+% such environments.  \null is better than a space, since it doesn't
+% have any width.
+\def\lisppar{\null\endgraf}
+
+% This space is always present above and below environments.
+\newskip\envskipamount \envskipamount = 0pt
+
+% Make spacing and below environment symmetrical.  We use \parskip here
+% to help in doing that, since in @example-like environments \parskip
+% is reset to zero; thus the \afterenvbreak inserts no space -- but the
+% start of the next paragraph will insert \parskip.
+%
+\def\aboveenvbreak{{%
+  % =10000 instead of <10000 because of a special case in \itemzzz and
+  % \sectionheading, q.v.
+  \ifnum \lastpenalty=10000 \else
+    \advance\envskipamount by \parskip
+    \endgraf
+    \ifdim\lastskip<\envskipamount
+      \removelastskip
+      % it's not a good place to break if the last penalty was \nobreak
+      % or better ...
+      \ifnum\lastpenalty<10000 \penalty-50 \fi
+      \vskip\envskipamount
+    \fi
+  \fi
+}}
+
+\let\afterenvbreak = \aboveenvbreak
+
+% \nonarrowing is a flag.  If "set", @lisp etc don't narrow margins; it will
+% also clear it, so that its embedded environments do the narrowing again.
+\let\nonarrowing=\relax
+
+% @cartouche ... @end cartouche: draw rectangle w/rounded corners around
+% environment contents.
+\font\circle=lcircle10
+\newdimen\circthick
+\newdimen\cartouter\newdimen\cartinner
+\newskip\normbskip\newskip\normpskip\newskip\normlskip
+\circthick=\fontdimen8\circle
+%
+\def\ctl{{\circle\char'013\hskip -6pt}}% 6pt from pl file: 1/2charwidth
+\def\ctr{{\hskip 6pt\circle\char'010}}
+\def\cbl{{\circle\char'012\hskip -6pt}}
+\def\cbr{{\hskip 6pt\circle\char'011}}
+\def\carttop{\hbox to \cartouter{\hskip\lskip
+        \ctl\leaders\hrule height\circthick\hfil\ctr
+        \hskip\rskip}}
+\def\cartbot{\hbox to \cartouter{\hskip\lskip
+        \cbl\leaders\hrule height\circthick\hfil\cbr
+        \hskip\rskip}}
+%
+\newskip\lskip\newskip\rskip
+
+\envdef\cartouche{%
+  \ifhmode\par\fi  % can't be in the midst of a paragraph.
+  \startsavinginserts
+  \lskip=\leftskip \rskip=\rightskip
+  \leftskip=0pt\rightskip=0pt % we want these *outside*.
+  \cartinner=\hsize \advance\cartinner by-\lskip
+  \advance\cartinner by-\rskip
+  \cartouter=\hsize
+  \advance\cartouter by 18.4pt	% allow for 3pt kerns on either
+				% side, and for 6pt waste from
+				% each corner char, and rule thickness
+  \normbskip=\baselineskip \normpskip=\parskip \normlskip=\lineskip
+  % Flag to tell @lisp, etc., not to narrow margin.
+  \let\nonarrowing = t%
+  %
+  % If this cartouche directly follows a sectioning command, we need the
+  % \parskip glue (backspaced over by default) or the cartouche can
+  % collide with the section heading.
+  \ifnum\lastpenalty>10000 \vskip\parskip \penalty\lastpenalty \fi
+  %
+  \vbox\bgroup
+      \baselineskip=0pt\parskip=0pt\lineskip=0pt
+      \carttop
+      \hbox\bgroup
+	  \hskip\lskip
+	  \vrule\kern3pt
+	  \vbox\bgroup
+	      \kern3pt
+	      \hsize=\cartinner
+	      \baselineskip=\normbskip
+	      \lineskip=\normlskip
+	      \parskip=\normpskip
+	      \vskip -\parskip
+	      \comment % For explanation, see the end of def\group.
+}
+\def\Ecartouche{%
+              \ifhmode\par\fi
+	      \kern3pt
+	  \egroup
+	  \kern3pt\vrule
+	  \hskip\rskip
+      \egroup
+      \cartbot
+  \egroup
+  \checkinserts
+}
+
+
+% This macro is called at the beginning of all the @example variants,
+% inside a group.
+\newdimen\nonfillparindent
+\def\nonfillstart{%
+  \aboveenvbreak
+  \hfuzz = 12pt % Don't be fussy
+  \sepspaces % Make spaces be word-separators rather than space tokens.
+  \let\par = \lisppar % don't ignore blank lines
+  \obeylines % each line of input is a line of output
+  \parskip = 0pt
+  % Turn off paragraph indentation but redefine \indent to emulate
+  % the normal \indent.
+  \nonfillparindent=\parindent
+  \parindent = 0pt
+  \let\indent\nonfillindent
+  %
+  \emergencystretch = 0pt % don't try to avoid overfull boxes
+  \ifx\nonarrowing\relax
+    \advance \leftskip by \lispnarrowing
+    \exdentamount=\lispnarrowing
+  \else
+    \let\nonarrowing = \relax
+  \fi
+  \let\exdent=\nofillexdent
+}
+
+\begingroup
+\obeyspaces
+% We want to swallow spaces (but not other tokens) after the fake
+% @indent in our nonfill-environments, where spaces are normally
+% active and set to @tie, resulting in them not being ignored after
+% @indent.
+\gdef\nonfillindent{\futurelet\temp\nonfillindentcheck}%
+\gdef\nonfillindentcheck{%
+\ifx\temp %
+\expandafter\nonfillindentgobble%
+\else%
+\leavevmode\nonfillindentbox%
+\fi%
+}%
+\endgroup
+\def\nonfillindentgobble#1{\nonfillindent}
+\def\nonfillindentbox{\hbox to \nonfillparindent{\hss}}
+
+% If you want all examples etc. small: @set dispenvsize small.
+% If you want even small examples the full size: @set dispenvsize nosmall.
+% This affects the following displayed environments:
+%    @example, @display, @format, @lisp
+%
+\def\smallword{small}
+\def\nosmallword{nosmall}
+\let\SETdispenvsize\relax
+\def\setnormaldispenv{%
+  \ifx\SETdispenvsize\smallword
+    % end paragraph for sake of leading, in case document has no blank
+    % line.  This is redundant with what happens in \aboveenvbreak, but
+    % we need to do it before changing the fonts, and it's inconvenient
+    % to change the fonts afterward.
+    \ifnum \lastpenalty=10000 \else \endgraf \fi
+    \smallexamplefonts \rm
+  \fi
+}
+\def\setsmalldispenv{%
+  \ifx\SETdispenvsize\nosmallword
+  \else
+    \ifnum \lastpenalty=10000 \else \endgraf \fi
+    \smallexamplefonts \rm
+  \fi
+}
+
+% We often define two environments, @foo and @smallfoo.
+% Let's do it in one command.  #1 is the env name, #2 the definition.
+\def\makedispenvdef#1#2{%
+  \expandafter\envdef\csname#1\endcsname {\setnormaldispenv #2}%
+  \expandafter\envdef\csname small#1\endcsname {\setsmalldispenv #2}%
+  \expandafter\let\csname E#1\endcsname \afterenvbreak
+  \expandafter\let\csname Esmall#1\endcsname \afterenvbreak
+}
+
+% Define two environment synonyms (#1 and #2) for an environment.
+\def\maketwodispenvdef#1#2#3{%
+  \makedispenvdef{#1}{#3}%
+  \makedispenvdef{#2}{#3}%
+}
+%
+% @lisp: indented, narrowed, typewriter font;
+% @example: same as @lisp.
+%
+% @smallexample and @smalllisp: use smaller fonts.
+% Originally contributed by Pavel@xerox.
+%
+\maketwodispenvdef{lisp}{example}{%
+  \nonfillstart
+  \tt\setupmarkupstyle{example}%
+  \let\kbdfont = \kbdexamplefont % Allow @kbd to do something special.
+  \gobble % eat return
+}
+% @display/@smalldisplay: same as @lisp except keep current font.
+%
+\makedispenvdef{display}{%
+  \nonfillstart
+  \gobble
+}
+
+% @format/@smallformat: same as @display except don't narrow margins.
+%
+\makedispenvdef{format}{%
+  \let\nonarrowing = t%
+  \nonfillstart
+  \gobble
+}
+
+% @flushleft: same as @format, but doesn't obey \SETdispenvsize.
+\envdef\flushleft{%
+  \let\nonarrowing = t%
+  \nonfillstart
+  \gobble
+}
+\let\Eflushleft = \afterenvbreak
+
+% @flushright.
+%
+\envdef\flushright{%
+  \let\nonarrowing = t%
+  \nonfillstart
+  \advance\leftskip by 0pt plus 1fill\relax
+  \gobble
+}
+\let\Eflushright = \afterenvbreak
+
+
+% @raggedright does more-or-less normal line breaking but no right
+% justification.  From plain.tex.
+\envdef\raggedright{%
+  \rightskip0pt plus2em \spaceskip.3333em \xspaceskip.5em\relax
+}
+\let\Eraggedright\par
+
+\envdef\raggedleft{%
+  \parindent=0pt \leftskip0pt plus2em
+  \spaceskip.3333em \xspaceskip.5em \parfillskip=0pt
+  \hbadness=10000 % Last line will usually be underfull, so turn off
+                  % badness reporting.
+}
+\let\Eraggedleft\par
+
+\envdef\raggedcenter{%
+  \parindent=0pt \rightskip0pt plus1em \leftskip0pt plus1em
+  \spaceskip.3333em \xspaceskip.5em \parfillskip=0pt
+  \hbadness=10000 % Last line will usually be underfull, so turn off
+                  % badness reporting.
+}
+\let\Eraggedcenter\par
+
+
+% @quotation does normal linebreaking (hence we can't use \nonfillstart)
+% and narrows the margins.  We keep \parskip nonzero in general, since
+% we're doing normal filling.  So, when using \aboveenvbreak and
+% \afterenvbreak, temporarily make \parskip 0.
+%
+\makedispenvdef{quotation}{\quotationstart}
+%
+\def\quotationstart{%
+  {\parskip=0pt \aboveenvbreak}% because \aboveenvbreak inserts \parskip
+  \parindent=0pt
+  %
+  % @cartouche defines \nonarrowing to inhibit narrowing at next level down.
+  \ifx\nonarrowing\relax
+    \advance\leftskip by \lispnarrowing
+    \advance\rightskip by \lispnarrowing
+    \exdentamount = \lispnarrowing
+  \else
+    \let\nonarrowing = \relax
+  \fi
+  \parsearg\quotationlabel
+}
+
+% We have retained a nonzero parskip for the environment, since we're
+% doing normal filling.
+%
+\def\Equotation{%
+  \par
+  \ifx\quotationauthor\thisisundefined\else
+    % indent a bit.
+    \leftline{\kern 2\leftskip \sl ---\quotationauthor}%
+  \fi
+  {\parskip=0pt \afterenvbreak}%
+}
+\def\Esmallquotation{\Equotation}
+
+% If we're given an argument, typeset it in bold with a colon after.
+\def\quotationlabel#1{%
+  \def\temp{#1}%
+  \ifx\temp\empty \else
+    {\bf #1: }%
+  \fi
+}
+
+
+% LaTeX-like @verbatim...@end verbatim and @verb{<char>...<char>}
+% If we want to allow any <char> as delimiter,
+% we need the curly braces so that makeinfo sees the @verb command, eg:
+% `@verbx...x' would look like the '@verbx' command.  --janneke@gnu.org
+%
+% [Knuth]: Donald Ervin Knuth, 1996.  The TeXbook.
+%
+% [Knuth] p.344; only we need to do the other characters Texinfo sets
+% active too.  Otherwise, they get lost as the first character on a
+% verbatim line.
+\def\dospecials{%
+  \do\ \do\\\do\{\do\}\do\$\do\&%
+  \do\#\do\^\do\^^K\do\_\do\^^A\do\%\do\~%
+  \do\<\do\>\do\|\do\@\do+\do\"%
+  % Don't do the quotes -- if we do, @set txicodequoteundirected and
+  % @set txicodequotebacktick will not have effect on @verb and
+  % @verbatim, and ?` and !` ligatures won't get disabled.
+  %\do\`\do\'%
+}
+%
+% [Knuth] p. 380
+\def\uncatcodespecials{%
+  \def\do##1{\catcode`##1=\other}\dospecials}
+%
+% Setup for the @verb command.
+%
+% Eight spaces for a tab
+\begingroup
+  \catcode`\^^I=\active
+  \gdef\tabeightspaces{\catcode`\^^I=\active\def^^I{\ \ \ \ \ \ \ \ }}
+\endgroup
+%
+\def\setupverb{%
+  \tt  % easiest (and conventionally used) font for verbatim
+  \def\par{\leavevmode\endgraf}%
+  \setupmarkupstyle{verb}%
+  \tabeightspaces
+  % Respect line breaks,
+  % print special symbols as themselves, and
+  % make each space count
+  % must do in this order:
+  \obeylines \uncatcodespecials \sepspaces
+}
+
+% Setup for the @verbatim environment
+%
+% Real tab expansion.
+\newdimen\tabw \setbox0=\hbox{\tt\space} \tabw=8\wd0 % tab amount
+%
+% We typeset each line of the verbatim in an \hbox, so we can handle
+% tabs.  The \global is in case the verbatim line starts with an accent,
+% or some other command that starts with a begin-group.  Otherwise, the
+% entire \verbbox would disappear at the corresponding end-group, before
+% it is typeset.  Meanwhile, we can't have nested verbatim commands
+% (can we?), so the \global won't be overwriting itself.
+\newbox\verbbox
+\def\starttabbox{\global\setbox\verbbox=\hbox\bgroup}
+%
+\begingroup
+  \catcode`\^^I=\active
+  \gdef\tabexpand{%
+    \catcode`\^^I=\active
+    \def^^I{\leavevmode\egroup
+      \dimen\verbbox=\wd\verbbox % the width so far, or since the previous tab
+      \divide\dimen\verbbox by\tabw
+      \multiply\dimen\verbbox by\tabw % compute previous multiple of \tabw
+      \advance\dimen\verbbox by\tabw  % advance to next multiple of \tabw
+      \wd\verbbox=\dimen\verbbox \box\verbbox \starttabbox
+    }%
+  }
+\endgroup
+
+% start the verbatim environment.
+\def\setupverbatim{%
+  \let\nonarrowing = t%
+  \nonfillstart
+  \tt % easiest (and conventionally used) font for verbatim
+  % The \leavevmode here is for blank lines.  Otherwise, we would
+  % never \starttabox and the \egroup would end verbatim mode.
+  \def\par{\leavevmode\egroup\box\verbbox\endgraf}%
+  \tabexpand
+  \setupmarkupstyle{verbatim}%
+  % Respect line breaks,
+  % print special symbols as themselves, and
+  % make each space count.
+  % Must do in this order:
+  \obeylines \uncatcodespecials \sepspaces
+  \everypar{\starttabbox}%
+}
+
+% Do the @verb magic: verbatim text is quoted by unique
+% delimiter characters.  Before first delimiter expect a
+% right brace, after last delimiter expect closing brace:
+%
+%    \def\doverb'{'<char>#1<char>'}'{#1}
+%
+% [Knuth] p. 382; only eat outer {}
+\begingroup
+  \catcode`[=1\catcode`]=2\catcode`\{=\other\catcode`\}=\other
+  \gdef\doverb{#1[\def\next##1#1}[##1\endgroup]\next]
+\endgroup
+%
+\def\verb{\begingroup\setupverb\doverb}
+%
+%
+% Do the @verbatim magic: define the macro \doverbatim so that
+% the (first) argument ends when '@end verbatim' is reached, ie:
+%
+%     \def\doverbatim#1@end verbatim{#1}
+%
+% For Texinfo it's a lot easier than for LaTeX,
+% because texinfo's \verbatim doesn't stop at '\end{verbatim}':
+% we need not redefine '\', '{' and '}'.
+%
+% Inspired by LaTeX's verbatim command set [latex.ltx]
+%
+\begingroup
+  \catcode`\ =\active
+  \obeylines %
+  % ignore everything up to the first ^^M, that's the newline at the end
+  % of the @verbatim input line itself.  Otherwise we get an extra blank
+  % line in the output.
+  \xdef\doverbatim#1^^M#2@end verbatim{#2\noexpand\end\gobble verbatim}%
+  % We really want {...\end verbatim} in the body of the macro, but
+  % without the active space; thus we have to use \xdef and \gobble.
+\endgroup
+%
+\envdef\verbatim{%
+    \setupverbatim\doverbatim
+}
+\let\Everbatim = \afterenvbreak
+
+
+% @verbatiminclude FILE - insert text of file in verbatim environment.
+%
+\def\verbatiminclude{\parseargusing\filenamecatcodes\doverbatiminclude}
+%
+\def\doverbatiminclude#1{%
+  {%
+    \makevalueexpandable
+    \setupverbatim
+    \indexnofonts       % Allow `@@' and other weird things in file names.
+    \wlog{texinfo.tex: doing @verbatiminclude of #1^^J}%
+    \input #1
+    \afterenvbreak
+  }%
+}
+
+% @copying ... @end copying.
+% Save the text away for @insertcopying later.
+%
+% We save the uninterpreted tokens, rather than creating a box.
+% Saving the text in a box would be much easier, but then all the
+% typesetting commands (@smallbook, font changes, etc.) have to be done
+% beforehand -- and a) we want @copying to be done first in the source
+% file; b) letting users define the frontmatter in as flexible order as
+% possible is very desirable.
+%
+\def\copying{\checkenv{}\begingroup\scanargctxt\docopying}
+\def\docopying#1@end copying{\endgroup\def\copyingtext{#1}}
+%
+\def\insertcopying{%
+  \begingroup
+    \parindent = 0pt  % paragraph indentation looks wrong on title page
+    \scanexp\copyingtext
+  \endgroup
+}
+
+
+\message{defuns,}
+% @defun etc.
+
+\newskip\defbodyindent \defbodyindent=.4in
+\newskip\defargsindent \defargsindent=50pt
+\newskip\deflastargmargin \deflastargmargin=18pt
+\newcount\defunpenalty
+
+% Start the processing of @deffn:
+\def\startdefun{%
+  \ifnum\lastpenalty<10000
+    \medbreak
+    \defunpenalty=10003 % Will keep this @deffn together with the
+                        % following @def command, see below.
+  \else
+    % If there are two @def commands in a row, we'll have a \nobreak,
+    % which is there to keep the function description together with its
+    % header.  But if there's nothing but headers, we need to allow a
+    % break somewhere.  Check specifically for penalty 10002, inserted
+    % by \printdefunline, instead of 10000, since the sectioning
+    % commands also insert a nobreak penalty, and we don't want to allow
+    % a break between a section heading and a defun.
+    %
+    % As a further refinement, we avoid "club" headers by signalling
+    % with penalty of 10003 after the very first @deffn in the
+    % sequence (see above), and penalty of 10002 after any following
+    % @def command.
+    \ifnum\lastpenalty=10002 \penalty2000 \else \defunpenalty=10002 \fi
+    %
+    % Similarly, after a section heading, do not allow a break.
+    % But do insert the glue.
+    \medskip  % preceded by discardable penalty, so not a breakpoint
+  \fi
+  %
+  \parindent=0in
+  \advance\leftskip by \defbodyindent
+  \exdentamount=\defbodyindent
+}
+
+\def\dodefunx#1{%
+  % First, check whether we are in the right environment:
+  \checkenv#1%
+  %
+  % As above, allow line break if we have multiple x headers in a row.
+  % It's not a great place, though.
+  \ifnum\lastpenalty=10002 \penalty3000 \else \defunpenalty=10002 \fi
+  %
+  % And now, it's time to reuse the body of the original defun:
+  \expandafter\gobbledefun#1%
+}
+\def\gobbledefun#1\startdefun{}
+
+% \printdefunline \deffnheader{text}
+%
+\def\printdefunline#1#2{%
+  \begingroup
+    % call \deffnheader:
+    #1#2 \endheader
+    % common ending:
+    \interlinepenalty = 10000
+    \advance\rightskip by 0pt plus 1fil\relax
+    \endgraf
+    \nobreak\vskip -\parskip
+    \penalty\defunpenalty  % signal to \startdefun and \dodefunx
+    % Some of the @defun-type tags do not enable magic parentheses,
+    % rendering the following check redundant.  But we don't optimize.
+    \checkparencounts
+  \endgroup
+}
+
+\def\Edefun{\endgraf\medbreak}
+
+% \makedefun{deffn} creates \deffn, \deffnx and \Edeffn;
+% the only thing remaining is to define \deffnheader.
+%
+\def\makedefun#1{%
+  \expandafter\let\csname E#1\endcsname = \Edefun
+  \edef\temp{\noexpand\domakedefun
+    \makecsname{#1}\makecsname{#1x}\makecsname{#1header}}%
+  \temp
+}
+
+% \domakedefun \deffn \deffnx \deffnheader
+%
+% Define \deffn and \deffnx, without parameters.
+% \deffnheader has to be defined explicitly.
+%
+\def\domakedefun#1#2#3{%
+  \envdef#1{%
+    \startdefun
+    \doingtypefnfalse    % distinguish typed functions from all else
+    \parseargusing\activeparens{\printdefunline#3}%
+  }%
+  \def#2{\dodefunx#1}%
+  \def#3%
+}
+
+\newif\ifdoingtypefn       % doing typed function?
+\newif\ifrettypeownline    % typeset return type on its own line?
+
+% @deftypefnnewline on|off says whether the return type of typed functions
+% are printed on their own line.  This affects @deftypefn, @deftypefun,
+% @deftypeop, and @deftypemethod.
+% 
+\parseargdef\deftypefnnewline{%
+  \def\temp{#1}%
+  \ifx\temp\onword
+    \expandafter\let\csname SETtxideftypefnnl\endcsname
+      = \empty
+  \else\ifx\temp\offword
+    \expandafter\let\csname SETtxideftypefnnl\endcsname
+      = \relax
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @txideftypefnnl value `\temp',
+                must be on|off}%
+  \fi\fi
+}
+
+% Untyped functions:
+
+% @deffn category name args
+\makedefun{deffn}{\deffngeneral{}}
+
+% @deffn category class name args
+\makedefun{defop}#1 {\defopon{#1\ \putwordon}}
+
+% \defopon {category on}class name args
+\def\defopon#1#2 {\deffngeneral{\putwordon\ \code{#2}}{#1\ \code{#2}} }
+
+% \deffngeneral {subind}category name args
+%
+\def\deffngeneral#1#2 #3 #4\endheader{%
+  % Remember that \dosubind{fn}{foo}{} is equivalent to \doind{fn}{foo}.
+  \dosubind{fn}{\code{#3}}{#1}%
+  \defname{#2}{}{#3}\magicamp\defunargs{#4\unskip}%
+}
+
+% Typed functions:
+
+% @deftypefn category type name args
+\makedefun{deftypefn}{\deftypefngeneral{}}
+
+% @deftypeop category class type name args
+\makedefun{deftypeop}#1 {\deftypeopon{#1\ \putwordon}}
+
+% \deftypeopon {category on}class type name args
+\def\deftypeopon#1#2 {\deftypefngeneral{\putwordon\ \code{#2}}{#1\ \code{#2}} }
+
+% \deftypefngeneral {subind}category type name args
+%
+\def\deftypefngeneral#1#2 #3 #4 #5\endheader{%
+  \dosubind{fn}{\code{#4}}{#1}%
+  \doingtypefntrue
+  \defname{#2}{#3}{#4}\defunargs{#5\unskip}%
+}
+
+% Typed variables:
+
+% @deftypevr category type var args
+\makedefun{deftypevr}{\deftypecvgeneral{}}
+
+% @deftypecv category class type var args
+\makedefun{deftypecv}#1 {\deftypecvof{#1\ \putwordof}}
+
+% \deftypecvof {category of}class type var args
+\def\deftypecvof#1#2 {\deftypecvgeneral{\putwordof\ \code{#2}}{#1\ \code{#2}} }
+
+% \deftypecvgeneral {subind}category type var args
+%
+\def\deftypecvgeneral#1#2 #3 #4 #5\endheader{%
+  \dosubind{vr}{\code{#4}}{#1}%
+  \defname{#2}{#3}{#4}\defunargs{#5\unskip}%
+}
+
+% Untyped variables:
+
+% @defvr category var args
+\makedefun{defvr}#1 {\deftypevrheader{#1} {} }
+
+% @defcv category class var args
+\makedefun{defcv}#1 {\defcvof{#1\ \putwordof}}
+
+% \defcvof {category of}class var args
+\def\defcvof#1#2 {\deftypecvof{#1}#2 {} }
+
+% Types:
+
+% @deftp category name args
+\makedefun{deftp}#1 #2 #3\endheader{%
+  \doind{tp}{\code{#2}}%
+  \defname{#1}{}{#2}\defunargs{#3\unskip}%
+}
+
+% Remaining @defun-like shortcuts:
+\makedefun{defun}{\deffnheader{\putwordDeffunc} }
+\makedefun{defmac}{\deffnheader{\putwordDefmac} }
+\makedefun{defspec}{\deffnheader{\putwordDefspec} }
+\makedefun{deftypefun}{\deftypefnheader{\putwordDeffunc} }
+\makedefun{defvar}{\defvrheader{\putwordDefvar} }
+\makedefun{defopt}{\defvrheader{\putwordDefopt} }
+\makedefun{deftypevar}{\deftypevrheader{\putwordDefvar} }
+\makedefun{defmethod}{\defopon\putwordMethodon}
+\makedefun{deftypemethod}{\deftypeopon\putwordMethodon}
+\makedefun{defivar}{\defcvof\putwordInstanceVariableof}
+\makedefun{deftypeivar}{\deftypecvof\putwordInstanceVariableof}
+
+% \defname, which formats the name of the @def (not the args).
+% #1 is the category, such as "Function".
+% #2 is the return type, if any.
+% #3 is the function name.
+%
+% We are followed by (but not passed) the arguments, if any.
+%
+\def\defname#1#2#3{%
+  \par
+  % Get the values of \leftskip and \rightskip as they were outside the @def...
+  \advance\leftskip by -\defbodyindent
+  %
+  % Determine if we are typesetting the return type of a typed function
+  % on a line by itself.
+  \rettypeownlinefalse
+  \ifdoingtypefn  % doing a typed function specifically?
+    % then check user option for putting return type on its own line:
+    \expandafter\ifx\csname SETtxideftypefnnl\endcsname\relax \else
+      \rettypeownlinetrue
+    \fi
+  \fi
+  %
+  % How we'll format the category name.  Putting it in brackets helps
+  % distinguish it from the body text that may end up on the next line
+  % just below it.
+  \def\temp{#1}%
+  \setbox0=\hbox{\kern\deflastargmargin \ifx\temp\empty\else [\rm\temp]\fi}
+  %
+  % Figure out line sizes for the paragraph shape.  We'll always have at
+  % least two.
+  \tempnum = 2
+  %
+  % The first line needs space for \box0; but if \rightskip is nonzero,
+  % we need only space for the part of \box0 which exceeds it:
+  \dimen0=\hsize  \advance\dimen0 by -\wd0  \advance\dimen0 by \rightskip
+  %
+  % If doing a return type on its own line, we'll have another line.
+  \ifrettypeownline
+    \advance\tempnum by 1
+    \def\maybeshapeline{0in \hsize}%
+  \else
+    \def\maybeshapeline{}%
+  \fi
+  %
+  % The continuations:
+  \dimen2=\hsize  \advance\dimen2 by -\defargsindent
+  %
+  % The final paragraph shape:
+  \parshape \tempnum  0in \dimen0  \maybeshapeline  \defargsindent \dimen2
+  %
+  % Put the category name at the right margin.
+  \noindent
+  \hbox to 0pt{%
+    \hfil\box0 \kern-\hsize
+    % \hsize has to be shortened this way:
+    \kern\leftskip
+    % Intentionally do not respect \rightskip, since we need the space.
+  }%
+  %
+  % Allow all lines to be underfull without complaint:
+  \tolerance=10000 \hbadness=10000
+  \exdentamount=\defbodyindent
+  {%
+    % defun fonts. We use typewriter by default (used to be bold) because:
+    % . we're printing identifiers, they should be in tt in principle.
+    % . in languages with many accents, such as Czech or French, it's
+    %   common to leave accents off identifiers.  The result looks ok in
+    %   tt, but exceedingly strange in rm.
+    % . we don't want -- and --- to be treated as ligatures.
+    % . this still does not fix the ?` and !` ligatures, but so far no
+    %   one has made identifiers using them :).
+    \df \tt
+    \def\temp{#2}% text of the return type
+    \ifx\temp\empty\else
+      \tclose{\temp}% typeset the return type
+      \ifrettypeownline
+        % put return type on its own line; prohibit line break following:
+        \hfil\vadjust{\nobreak}\break  
+      \else
+        \space  % type on same line, so just followed by a space
+      \fi
+    \fi           % no return type
+    #3% output function name
+  }%
+  {\rm\enskip}% hskip 0.5 em of \tenrm
+  %
+  \boldbrax
+  % arguments will be output next, if any.
+}
+
+% Print arguments in slanted roman (not ttsl), inconsistently with using
+% tt for the name.  This is because literal text is sometimes needed in
+% the argument list (groff manual), and ttsl and tt are not very
+% distinguishable.  Prevent hyphenation at `-' chars.
+%
+\def\defunargs#1{%
+  % use sl by default (not ttsl),
+  % tt for the names.
+  \df \sl \hyphenchar\font=0
+  %
+  % On the other hand, if an argument has two dashes (for instance), we
+  % want a way to get ttsl.  Let's try @var for that.
+  \def\var##1{{\setupmarkupstyle{var}\ttslanted{##1}}}%
+  #1%
+  \sl\hyphenchar\font=45
+}
+
+% We want ()&[] to print specially on the defun line.
+%
+\def\activeparens{%
+  \catcode`\(=\active \catcode`\)=\active
+  \catcode`\[=\active \catcode`\]=\active
+  \catcode`\&=\active
+}
+
+% Make control sequences which act like normal parenthesis chars.
+\let\lparen = ( \let\rparen = )
+
+% Be sure that we always have a definition for `(', etc.  For example,
+% if the fn name has parens in it, \boldbrax will not be in effect yet,
+% so TeX would otherwise complain about undefined control sequence.
+{
+  \activeparens
+  \global\let(=\lparen \global\let)=\rparen
+  \global\let[=\lbrack \global\let]=\rbrack
+  \global\let& = \&
+
+  \gdef\boldbrax{\let(=\opnr\let)=\clnr\let[=\lbrb\let]=\rbrb}
+  \gdef\magicamp{\let&=\amprm}
+}
+
+\newcount\parencount
+
+% If we encounter &foo, then turn on ()-hacking afterwards
+\newif\ifampseen
+\def\amprm#1 {\ampseentrue{\bf\ }}
+
+\def\parenfont{%
+  \ifampseen
+    % At the first level, print parens in roman,
+    % otherwise use the default font.
+    \ifnum \parencount=1 \rm \fi
+  \else
+    % The \sf parens (in \boldbrax) actually are a little bolder than
+    % the contained text.  This is especially needed for [ and ] .
+    \sf
+  \fi
+}
+\def\infirstlevel#1{%
+  \ifampseen
+    \ifnum\parencount=1
+      #1%
+    \fi
+  \fi
+}
+\def\bfafterword#1 {#1 \bf}
+
+\def\opnr{%
+  \global\advance\parencount by 1
+  {\parenfont(}%
+  \infirstlevel \bfafterword
+}
+\def\clnr{%
+  {\parenfont)}%
+  \infirstlevel \sl
+  \global\advance\parencount by -1
+}
+
+\newcount\brackcount
+\def\lbrb{%
+  \global\advance\brackcount by 1
+  {\bf[}%
+}
+\def\rbrb{%
+  {\bf]}%
+  \global\advance\brackcount by -1
+}
+
+\def\checkparencounts{%
+  \ifnum\parencount=0 \else \badparencount \fi
+  \ifnum\brackcount=0 \else \badbrackcount \fi
+}
+% these should not use \errmessage; the glibc manual, at least, actually
+% has such constructs (when documenting function pointers).
+\def\badparencount{%
+  \message{Warning: unbalanced parentheses in @def...}%
+  \global\parencount=0
+}
+\def\badbrackcount{%
+  \message{Warning: unbalanced square brackets in @def...}%
+  \global\brackcount=0
+}
+
+
+\message{macros,}
+% @macro.
+
+% To do this right we need a feature of e-TeX, \scantokens,
+% which we arrange to emulate with a temporary file in ordinary TeX.
+\ifx\eTeXversion\thisisundefined
+  \newwrite\macscribble
+  \def\scantokens#1{%
+    \toks0={#1}%
+    \immediate\openout\macscribble=\jobname.tmp
+    \immediate\write\macscribble{\the\toks0}%
+    \immediate\closeout\macscribble
+    \input \jobname.tmp
+  }
+\fi
+
+\def\scanmacro#1{\begingroup
+  \newlinechar`\^^M
+  \let\xeatspaces\eatspaces
+  %
+  % Undo catcode changes of \startcontents and \doprintindex
+  % When called from @insertcopying or (short)caption, we need active
+  % backslash to get it printed correctly.  Previously, we had
+  % \catcode`\\=\other instead.  We'll see whether a problem appears
+  % with macro expansion.				--kasal, 19aug04
+  \catcode`\@=0 \catcode`\\=\active \escapechar=`\@
+  %
+  % ... and for \example:
+  \spaceisspace
+  %
+  % The \empty here causes a following catcode 5 newline to be eaten as
+  % part of reading whitespace after a control sequence.  It does not
+  % eat a catcode 13 newline.  There's no good way to handle the two
+  % cases (untried: maybe e-TeX's \everyeof could help, though plain TeX
+  % would then have different behavior).  See the Macro Details node in
+  % the manual for the workaround we recommend for macros and
+  % line-oriented commands.
+  % 
+  \scantokens{#1\empty}%
+\endgroup}
+
+\def\scanexp#1{%
+  \edef\temp{\noexpand\scanmacro{#1}}%
+  \temp
+}
+
+\newcount\paramno   % Count of parameters
+\newtoks\macname    % Macro name
+\newif\ifrecursive  % Is it recursive?
+
+% List of all defined macros in the form
+%    \definedummyword\macro1\definedummyword\macro2...
+% Currently is also contains all @aliases; the list can be split
+% if there is a need.
+\def\macrolist{}
+
+% Add the macro to \macrolist
+\def\addtomacrolist#1{\expandafter \addtomacrolistxxx \csname#1\endcsname}
+\def\addtomacrolistxxx#1{%
+     \toks0 = \expandafter{\macrolist\definedummyword#1}%
+     \xdef\macrolist{\the\toks0}%
+}
+
+% Utility routines.
+% This does \let #1 = #2, with \csnames; that is,
+%   \let \csname#1\endcsname = \csname#2\endcsname
+% (except of course we have to play expansion games).
+%
+\def\cslet#1#2{%
+  \expandafter\let
+  \csname#1\expandafter\endcsname
+  \csname#2\endcsname
+}
+
+% Trim leading and trailing spaces off a string.
+% Concepts from aro-bend problem 15 (see CTAN).
+{\catcode`\@=11
+\gdef\eatspaces #1{\expandafter\trim@\expandafter{#1 }}
+\gdef\trim@ #1{\trim@@ @#1 @ #1 @ @@}
+\gdef\trim@@ #1@ #2@ #3@@{\trim@@@\empty #2 @}
+\def\unbrace#1{#1}
+\unbrace{\gdef\trim@@@ #1 } #2@{#1}
+}
+
+% Trim a single trailing ^^M off a string.
+{\catcode`\^^M=\other \catcode`\Q=3%
+\gdef\eatcr #1{\eatcra #1Q^^MQ}%
+\gdef\eatcra#1^^MQ{\eatcrb#1Q}%
+\gdef\eatcrb#1Q#2Q{#1}%
+}
+
+% Macro bodies are absorbed as an argument in a context where
+% all characters are catcode 10, 11 or 12, except \ which is active
+% (as in normal texinfo). It is necessary to change the definition of \
+% to recognize macro arguments; this is the job of \mbodybackslash.
+%
+% Non-ASCII encodings make 8-bit characters active, so un-activate
+% them to avoid their expansion.  Must do this non-globally, to
+% confine the change to the current group.
+%
+% It's necessary to have hard CRs when the macro is executed. This is
+% done by making ^^M (\endlinechar) catcode 12 when reading the macro
+% body, and then making it the \newlinechar in \scanmacro.
+%
+\def\scanctxt{% used as subroutine
+  \catcode`\"=\other
+  \catcode`\+=\other
+  \catcode`\<=\other
+  \catcode`\>=\other
+  \catcode`\@=\other
+  \catcode`\^=\other
+  \catcode`\_=\other
+  \catcode`\|=\other
+  \catcode`\~=\other
+  \ifx\declaredencoding\ascii \else \setnonasciicharscatcodenonglobal\other \fi
+}
+
+\def\scanargctxt{% used for copying and captions, not macros.
+  \scanctxt
+  \catcode`\\=\other
+  \catcode`\^^M=\other
+}
+
+\def\macrobodyctxt{% used for @macro definitions
+  \scanctxt
+  \catcode`\{=\other
+  \catcode`\}=\other
+  \catcode`\^^M=\other
+  \usembodybackslash
+}
+
+\def\macroargctxt{% used when scanning invocations
+  \scanctxt
+  \catcode`\\=0
+}
+% why catcode 0 for \ in the above?  To recognize \\ \{ \} as "escapes"
+% for the single characters \ { }.  Thus, we end up with the "commands"
+% that would be written @\ @{ @} in a Texinfo document.
+% 
+% We already have @{ and @}.  For @\, we define it here, and only for
+% this purpose, to produce a typewriter backslash (so, the @\ that we
+% define for @math can't be used with @macro calls):
+%
+\def\\{\normalbackslash}%
+% 
+% We would like to do this for \, too, since that is what makeinfo does.
+% But it is not possible, because Texinfo already has a command @, for a
+% cedilla accent.  Documents must use @comma{} instead.
+%
+% \anythingelse will almost certainly be an error of some kind.
+
+
+% \mbodybackslash is the definition of \ in @macro bodies.
+% It maps \foo\ => \csname macarg.foo\endcsname => #N
+% where N is the macro parameter number.
+% We define \csname macarg.\endcsname to be \realbackslash, so
+% \\ in macro replacement text gets you a backslash.
+%
+{\catcode`@=0 @catcode`@\=@active
+ @gdef@usembodybackslash{@let\=@mbodybackslash}
+ @gdef@mbodybackslash#1\{@csname macarg.#1@endcsname}
+}
+\expandafter\def\csname macarg.\endcsname{\realbackslash}
+
+\def\margbackslash#1{\char`\#1 }
+
+\def\macro{\recursivefalse\parsearg\macroxxx}
+\def\rmacro{\recursivetrue\parsearg\macroxxx}
+
+\def\macroxxx#1{%
+  \getargs{#1}% now \macname is the macname and \argl the arglist
+  \ifx\argl\empty       % no arguments
+     \paramno=0\relax
+  \else
+     \expandafter\parsemargdef \argl;%
+     \if\paramno>256\relax
+       \ifx\eTeXversion\thisisundefined
+         \errhelp = \EMsimple
+         \errmessage{You need eTeX to compile a file with macros with more than 256 arguments}
+       \fi
+     \fi
+  \fi
+  \if1\csname ismacro.\the\macname\endcsname
+     \message{Warning: redefining \the\macname}%
+  \else
+     \expandafter\ifx\csname \the\macname\endcsname \relax
+     \else \errmessage{Macro name \the\macname\space already defined}\fi
+     \global\cslet{macsave.\the\macname}{\the\macname}%
+     \global\expandafter\let\csname ismacro.\the\macname\endcsname=1%
+     \addtomacrolist{\the\macname}%
+  \fi
+  \begingroup \macrobodyctxt
+  \ifrecursive \expandafter\parsermacbody
+  \else \expandafter\parsemacbody
+  \fi}
+
+\parseargdef\unmacro{%
+  \if1\csname ismacro.#1\endcsname
+    \global\cslet{#1}{macsave.#1}%
+    \global\expandafter\let \csname ismacro.#1\endcsname=0%
+    % Remove the macro name from \macrolist:
+    \begingroup
+      \expandafter\let\csname#1\endcsname \relax
+      \let\definedummyword\unmacrodo
+      \xdef\macrolist{\macrolist}%
+    \endgroup
+  \else
+    \errmessage{Macro #1 not defined}%
+  \fi
+}
+
+% Called by \do from \dounmacro on each macro.  The idea is to omit any
+% macro definitions that have been changed to \relax.
+%
+\def\unmacrodo#1{%
+  \ifx #1\relax
+    % remove this
+  \else
+    \noexpand\definedummyword \noexpand#1%
+  \fi
+}
+
+% This makes use of the obscure feature that if the last token of a
+% <parameter list> is #, then the preceding argument is delimited by
+% an opening brace, and that opening brace is not consumed.
+\def\getargs#1{\getargsxxx#1{}}
+\def\getargsxxx#1#{\getmacname #1 \relax\getmacargs}
+\def\getmacname#1 #2\relax{\macname={#1}}
+\def\getmacargs#1{\def\argl{#1}}
+
+% For macro processing make @ a letter so that we can make Texinfo private macro names.
+\edef\texiatcatcode{\the\catcode`\@}
+\catcode `@=11\relax
+
+% Parse the optional {params} list.  Set up \paramno and \paramlist
+% so \defmacro knows what to do.  Define \macarg.BLAH for each BLAH
+% in the params list to some hook where the argument si to be expanded.  If
+% there are less than 10 arguments that hook is to be replaced by ##N where N
+% is the position in that list, that is to say the macro arguments are to be
+% defined `a la TeX in the macro body.  
+%
+% That gets used by \mbodybackslash (above).
+%
+% We need to get `macro parameter char #' into several definitions.
+% The technique used is stolen from LaTeX: let \hash be something
+% unexpandable, insert that wherever you need a #, and then redefine
+% it to # just before using the token list produced.
+%
+% The same technique is used to protect \eatspaces till just before
+% the macro is used.
+%
+% If there are 10 or more arguments, a different technique is used, where the
+% hook remains in the body, and when macro is to be expanded the body is
+% processed again to replace the arguments.
+%
+% In that case, the hook is \the\toks N-1, and we simply set \toks N-1 to the
+% argument N value and then \edef  the body (nothing else will expand because of
+% the catcode regime underwhich the body was input).
+%
+% If you compile with TeX (not eTeX), and you have macros with 10 or more
+% arguments, you need that no macro has more than 256 arguments, otherwise an
+% error is produced.
+\def\parsemargdef#1;{%
+  \paramno=0\def\paramlist{}%
+  \let\hash\relax
+  \let\xeatspaces\relax
+  \parsemargdefxxx#1,;,%
+  % In case that there are 10 or more arguments we parse again the arguments
+  % list to set new definitions for the \macarg.BLAH macros corresponding to
+  % each BLAH argument. It was anyhow needed to parse already once this list
+  % in order to count the arguments, and as macros with at most 9 arguments
+  % are by far more frequent than macro with 10 or more arguments, defining
+  % twice the \macarg.BLAH macros does not cost too much processing power.
+  \ifnum\paramno<10\relax\else
+    \paramno0\relax
+    \parsemmanyargdef@@#1,;,% 10 or more arguments
+  \fi
+}
+\def\parsemargdefxxx#1,{%
+  \if#1;\let\next=\relax
+  \else \let\next=\parsemargdefxxx
+    \advance\paramno by 1
+    \expandafter\edef\csname macarg.\eatspaces{#1}\endcsname
+        {\xeatspaces{\hash\the\paramno}}%
+    \edef\paramlist{\paramlist\hash\the\paramno,}%
+  \fi\next}
+
+\def\parsemmanyargdef@@#1,{%
+  \if#1;\let\next=\relax
+  \else 
+    \let\next=\parsemmanyargdef@@
+    \edef\tempb{\eatspaces{#1}}%
+    \expandafter\def\expandafter\tempa
+       \expandafter{\csname macarg.\tempb\endcsname}%
+    % Note that we need some extra \noexpand\noexpand, this is because we
+    % don't want \the  to be expanded in the \parsermacbody  as it uses an
+    % \xdef .
+    \expandafter\edef\tempa
+      {\noexpand\noexpand\noexpand\the\toks\the\paramno}%
+    \advance\paramno by 1\relax
+  \fi\next}
+
+% These two commands read recursive and nonrecursive macro bodies.
+% (They're different since rec and nonrec macros end differently.)
+%
+
+\catcode `\@\texiatcatcode
+\long\def\parsemacbody#1@end macro%
+{\xdef\temp{\eatcr{#1}}\endgroup\defmacro}%
+\long\def\parsermacbody#1@end rmacro%
+{\xdef\temp{\eatcr{#1}}\endgroup\defmacro}%
+\catcode `\@=11\relax
+
+\let\endargs@\relax
+\let\nil@\relax
+\def\nilm@{\nil@}%
+\long\def\nillm@{\nil@}%
+
+% This macro is expanded during the Texinfo macro expansion, not during its
+% definition.  It gets all the arguments values and assigns them to macros
+% macarg.ARGNAME
+%
+% #1 is the macro name
+% #2 is the list of argument names
+% #3 is the list of argument values
+\def\getargvals@#1#2#3{%
+  \def\macargdeflist@{}%
+  \def\saveparamlist@{#2}% Need to keep a copy for parameter expansion.
+  \def\paramlist{#2,\nil@}%
+  \def\macroname{#1}%
+  \begingroup
+  \macroargctxt
+  \def\argvaluelist{#3,\nil@}%
+  \def\@tempa{#3}%
+  \ifx\@tempa\empty
+    \setemptyargvalues@
+  \else
+    \getargvals@@
+  \fi
+}
+
+% 
+\def\getargvals@@{%
+  \ifx\paramlist\nilm@
+      % Some sanity check needed here that \argvaluelist is also empty.
+      \ifx\argvaluelist\nillm@
+      \else
+        \errhelp = \EMsimple
+        \errmessage{Too many arguments in macro `\macroname'!}%
+      \fi
+      \let\next\macargexpandinbody@
+  \else
+    \ifx\argvaluelist\nillm@
+       % No more arguments values passed to macro.  Set remaining named-arg
+       % macros to empty.
+       \let\next\setemptyargvalues@
+    \else
+      % pop current arg name into \@tempb
+      \def\@tempa##1{\pop@{\@tempb}{\paramlist}##1\endargs@}%
+      \expandafter\@tempa\expandafter{\paramlist}%
+       % pop current argument value into \@tempc
+      \def\@tempa##1{\longpop@{\@tempc}{\argvaluelist}##1\endargs@}%
+      \expandafter\@tempa\expandafter{\argvaluelist}%
+       % Here \@tempb is the current arg name and \@tempc is the current arg value.
+       % First place the new argument macro definition into \@tempd
+       \expandafter\macname\expandafter{\@tempc}%
+       \expandafter\let\csname macarg.\@tempb\endcsname\relax
+       \expandafter\def\expandafter\@tempe\expandafter{%
+         \csname macarg.\@tempb\endcsname}%
+       \edef\@tempd{\long\def\@tempe{\the\macname}}%
+       \push@\@tempd\macargdeflist@
+       \let\next\getargvals@@
+    \fi
+  \fi
+  \next
+}
+
+\def\push@#1#2{%
+  \expandafter\expandafter\expandafter\def
+  \expandafter\expandafter\expandafter#2%
+  \expandafter\expandafter\expandafter{%
+  \expandafter#1#2}%
+}
+
+% Replace arguments by their values in the macro body, and place the result
+% in macro \@tempa
+\def\macvalstoargs@{%
+  %  To do this we use the property that token registers that are \the'ed
+  % within an \edef  expand only once. So we are going to place all argument
+  % values into respective token registers.
+  %
+  % First we save the token context, and initialize argument numbering.
+  \begingroup
+    \paramno0\relax
+    % Then, for each argument number #N, we place the corresponding argument
+    % value into a new token list register \toks#N
+    \expandafter\putargsintokens@\saveparamlist@,;,%
+    % Then, we expand the body so that argument are replaced by their
+    % values. The trick for values not to be expanded themselves is that they
+    % are within tokens and that tokens expand only once in an \edef .
+    \edef\@tempc{\csname mac.\macroname .body\endcsname}%
+    % Now we restore the token stack pointer to free the token list registers
+    % which we have used, but we make sure that expanded body is saved after
+    % group.
+    \expandafter
+  \endgroup
+  \expandafter\def\expandafter\@tempa\expandafter{\@tempc}%
+  }
+
+\def\macargexpandinbody@{% 
+  %% Define the named-macro outside of this group and then close this group. 
+  \expandafter
+  \endgroup
+  \macargdeflist@
+  % First the replace in body the macro arguments by their values, the result
+  % is in \@tempa .
+  \macvalstoargs@
+  % Then we point at the \norecurse or \gobble (for recursive) macro value
+  % with \@tempb .
+  \expandafter\let\expandafter\@tempb\csname mac.\macroname .recurse\endcsname
+  % Depending on whether it is recursive or not, we need some tailing
+  % \egroup .
+  \ifx\@tempb\gobble
+     \let\@tempc\relax
+  \else
+     \let\@tempc\egroup
+  \fi
+  % And now we do the real job:
+  \edef\@tempd{\noexpand\@tempb{\macroname}\noexpand\scanmacro{\@tempa}\@tempc}%
+  \@tempd
+}
+
+\def\putargsintokens@#1,{%
+  \if#1;\let\next\relax
+  \else
+    \let\next\putargsintokens@
+    % First we allocate the new token list register, and give it a temporary
+    % alias \@tempb .
+    \toksdef\@tempb\the\paramno
+    % Then we place the argument value into that token list register.
+    \expandafter\let\expandafter\@tempa\csname macarg.#1\endcsname
+    \expandafter\@tempb\expandafter{\@tempa}%
+    \advance\paramno by 1\relax
+  \fi
+  \next
+}
+
+% Save the token stack pointer into macro #1
+\def\texisavetoksstackpoint#1{\edef#1{\the\@cclvi}}
+% Restore the token stack pointer from number in macro #1
+\def\texirestoretoksstackpoint#1{\expandafter\mathchardef\expandafter\@cclvi#1\relax}
+% newtoks that can be used non \outer .
+\def\texinonouternewtoks{\alloc@ 5\toks \toksdef \@cclvi}
+
+% Tailing missing arguments are set to empty
+\def\setemptyargvalues@{%
+  \ifx\paramlist\nilm@
+    \let\next\macargexpandinbody@
+  \else
+    \expandafter\setemptyargvaluesparser@\paramlist\endargs@
+    \let\next\setemptyargvalues@
+  \fi
+  \next
+}
+
+\def\setemptyargvaluesparser@#1,#2\endargs@{%
+  \expandafter\def\expandafter\@tempa\expandafter{%
+    \expandafter\def\csname macarg.#1\endcsname{}}%
+  \push@\@tempa\macargdeflist@
+  \def\paramlist{#2}%
+}
+
+% #1 is the element target macro
+% #2 is the list macro
+% #3,#4\endargs@ is the list value
+\def\pop@#1#2#3,#4\endargs@{%
+   \def#1{#3}%
+   \def#2{#4}%
+}
+\long\def\longpop@#1#2#3,#4\endargs@{%
+   \long\def#1{#3}%
+   \long\def#2{#4}%
+}
+
+% This defines a Texinfo @macro. There are eight cases: recursive and
+% nonrecursive macros of zero, one, up to nine, and many arguments.
+% Much magic with \expandafter here.
+% \xdef is used so that macro definitions will survive the file
+% they're defined in; @include reads the file inside a group.
+%
+\def\defmacro{%
+  \let\hash=##% convert placeholders to macro parameter chars
+  \ifrecursive
+    \ifcase\paramno
+    % 0
+      \expandafter\xdef\csname\the\macname\endcsname{%
+        \noexpand\scanmacro{\temp}}%
+    \or % 1
+      \expandafter\xdef\csname\the\macname\endcsname{%
+         \bgroup\noexpand\macroargctxt
+         \noexpand\braceorline
+         \expandafter\noexpand\csname\the\macname xxx\endcsname}%
+      \expandafter\xdef\csname\the\macname xxx\endcsname##1{%
+         \egroup\noexpand\scanmacro{\temp}}%
+    \else
+      \ifnum\paramno<10\relax % at most 9
+        \expandafter\xdef\csname\the\macname\endcsname{%
+           \bgroup\noexpand\macroargctxt
+           \noexpand\csname\the\macname xx\endcsname}%
+        \expandafter\xdef\csname\the\macname xx\endcsname##1{%
+            \expandafter\noexpand\csname\the\macname xxx\endcsname ##1,}%
+        \expandafter\expandafter
+        \expandafter\xdef
+        \expandafter\expandafter
+          \csname\the\macname xxx\endcsname
+            \paramlist{\egroup\noexpand\scanmacro{\temp}}%
+      \else % 10 or more
+        \expandafter\xdef\csname\the\macname\endcsname{%
+          \noexpand\getargvals@{\the\macname}{\argl}%
+        }%    
+        \global\expandafter\let\csname mac.\the\macname .body\endcsname\temp
+        \global\expandafter\let\csname mac.\the\macname .recurse\endcsname\gobble
+      \fi
+    \fi
+  \else
+    \ifcase\paramno
+    % 0
+      \expandafter\xdef\csname\the\macname\endcsname{%
+        \noexpand\norecurse{\the\macname}%
+        \noexpand\scanmacro{\temp}\egroup}%
+    \or % 1
+      \expandafter\xdef\csname\the\macname\endcsname{%
+         \bgroup\noexpand\macroargctxt
+         \noexpand\braceorline
+         \expandafter\noexpand\csname\the\macname xxx\endcsname}%
+      \expandafter\xdef\csname\the\macname xxx\endcsname##1{%
+        \egroup
+        \noexpand\norecurse{\the\macname}%
+        \noexpand\scanmacro{\temp}\egroup}%
+    \else % at most 9
+      \ifnum\paramno<10\relax
+        \expandafter\xdef\csname\the\macname\endcsname{%
+           \bgroup\noexpand\macroargctxt
+           \expandafter\noexpand\csname\the\macname xx\endcsname}%
+        \expandafter\xdef\csname\the\macname xx\endcsname##1{%
+            \expandafter\noexpand\csname\the\macname xxx\endcsname ##1,}%
+        \expandafter\expandafter
+        \expandafter\xdef
+        \expandafter\expandafter
+        \csname\the\macname xxx\endcsname
+        \paramlist{%
+            \egroup
+            \noexpand\norecurse{\the\macname}%
+            \noexpand\scanmacro{\temp}\egroup}%
+      \else % 10 or more:
+        \expandafter\xdef\csname\the\macname\endcsname{%
+          \noexpand\getargvals@{\the\macname}{\argl}%
+        }%
+        \global\expandafter\let\csname mac.\the\macname .body\endcsname\temp
+        \global\expandafter\let\csname mac.\the\macname .recurse\endcsname\norecurse
+      \fi
+    \fi
+  \fi}
+
+\catcode `\@\texiatcatcode\relax
+
+\def\norecurse#1{\bgroup\cslet{#1}{macsave.#1}}
+
+% \braceorline decides whether the next nonwhitespace character is a
+% {.  If so it reads up to the closing }, if not, it reads the whole
+% line.  Whatever was read is then fed to the next control sequence
+% as an argument (by \parsebrace or \parsearg).
+% 
+\def\braceorline#1{\let\macnamexxx=#1\futurelet\nchar\braceorlinexxx}
+\def\braceorlinexxx{%
+  \ifx\nchar\bgroup\else
+    \expandafter\parsearg
+  \fi \macnamexxx}
+
+
+% @alias.
+% We need some trickery to remove the optional spaces around the equal
+% sign.  Make them active and then expand them all to nothing.
+%
+\def\alias{\parseargusing\obeyspaces\aliasxxx}
+\def\aliasxxx #1{\aliasyyy#1\relax}
+\def\aliasyyy #1=#2\relax{%
+  {%
+    \expandafter\let\obeyedspace=\empty
+    \addtomacrolist{#1}%
+    \xdef\next{\global\let\makecsname{#1}=\makecsname{#2}}%
+  }%
+  \next
+}
+
+
+\message{cross references,}
+
+\newwrite\auxfile
+\newif\ifhavexrefs    % True if xref values are known.
+\newif\ifwarnedxrefs  % True if we warned once that they aren't known.
+
+% @inforef is relatively simple.
+\def\inforef #1{\inforefzzz #1,,,,**}
+\def\inforefzzz #1,#2,#3,#4**{%
+  \putwordSee{} \putwordInfo{} \putwordfile{} \file{\ignorespaces #3{}},
+  node \samp{\ignorespaces#1{}}}
+
+% @node's only job in TeX is to define \lastnode, which is used in
+% cross-references.  The @node line might or might not have commas, and
+% might or might not have spaces before the first comma, like:
+% @node foo , bar , ...
+% We don't want such trailing spaces in the node name.
+%
+\parseargdef\node{\checkenv{}\donode #1 ,\finishnodeparse}
+%
+% also remove a trailing comma, in case of something like this:
+% @node Help-Cross,  ,  , Cross-refs
+\def\donode#1 ,#2\finishnodeparse{\dodonode #1,\finishnodeparse}
+\def\dodonode#1,#2\finishnodeparse{\gdef\lastnode{#1}}
+
+\let\nwnode=\node
+\let\lastnode=\empty
+
+% Write a cross-reference definition for the current node.  #1 is the
+% type (Ynumbered, Yappendix, Ynothing).
+%
+\def\donoderef#1{%
+  \ifx\lastnode\empty\else
+    \setref{\lastnode}{#1}%
+    \global\let\lastnode=\empty
+  \fi
+}
+
+% @anchor{NAME} -- define xref target at arbitrary point.
+%
+\newcount\savesfregister
+%
+\def\savesf{\relax \ifhmode \savesfregister=\spacefactor \fi}
+\def\restoresf{\relax \ifhmode \spacefactor=\savesfregister \fi}
+\def\anchor#1{\savesf \setref{#1}{Ynothing}\restoresf \ignorespaces}
+
+% \setref{NAME}{SNT} defines a cross-reference point NAME (a node or an
+% anchor), which consists of three parts:
+% 1) NAME-title - the current sectioning name taken from \lastsection,
+%                 or the anchor name.
+% 2) NAME-snt   - section number and type, passed as the SNT arg, or
+%                 empty for anchors.
+% 3) NAME-pg    - the page number.
+%
+% This is called from \donoderef, \anchor, and \dofloat.  In the case of
+% floats, there is an additional part, which is not written here:
+% 4) NAME-lof   - the text as it should appear in a @listoffloats.
+%
+\def\setref#1#2{%
+  \pdfmkdest{#1}%
+  \iflinks
+    {%
+      \atdummies  % preserve commands, but don't expand them
+      \edef\writexrdef##1##2{%
+	\write\auxfile{@xrdef{#1-% #1 of \setref, expanded by the \edef
+	  ##1}{##2}}% these are parameters of \writexrdef
+      }%
+      \toks0 = \expandafter{\lastsection}%
+      \immediate \writexrdef{title}{\the\toks0 }%
+      \immediate \writexrdef{snt}{\csname #2\endcsname}% \Ynumbered etc.
+      \safewhatsit{\writexrdef{pg}{\folio}}% will be written later, at \shipout
+    }%
+  \fi
+}
+
+% @xrefautosectiontitle on|off says whether @section(ing) names are used
+% automatically in xrefs, if the third arg is not explicitly specified.
+% This was provided as a "secret" @set xref-automatic-section-title
+% variable, now it's official.
+% 
+\parseargdef\xrefautomaticsectiontitle{%
+  \def\temp{#1}%
+  \ifx\temp\onword
+    \expandafter\let\csname SETxref-automatic-section-title\endcsname
+      = \empty
+  \else\ifx\temp\offword
+    \expandafter\let\csname SETxref-automatic-section-title\endcsname
+      = \relax
+  \else
+    \errhelp = \EMsimple
+    \errmessage{Unknown @xrefautomaticsectiontitle value `\temp',
+                must be on|off}%
+  \fi\fi
+}
+
+
+% @xref, @pxref, and @ref generate cross-references.  For \xrefX, #1 is
+% the node name, #2 the name of the Info cross-reference, #3 the printed
+% node name, #4 the name of the Info file, #5 the name of the printed
+% manual.  All but the node name can be omitted.
+%
+\def\pxref#1{\putwordsee{} \xrefX[#1,,,,,,,]}
+\def\xref#1{\putwordSee{} \xrefX[#1,,,,,,,]}
+\def\ref#1{\xrefX[#1,,,,,,,]}
+%
+\newbox\topbox
+\newbox\printedrefnamebox
+\newbox\printedmanualbox
+%
+\def\xrefX[#1,#2,#3,#4,#5,#6]{\begingroup
+  \unsepspaces
+  %
+  \def\printedrefname{\ignorespaces #3}%
+  \setbox\printedrefnamebox = \hbox{\printedrefname\unskip}%
+  %
+  \def\printedmanual{\ignorespaces #5}%
+  \setbox\printedmanualbox  = \hbox{\printedmanual\unskip}%
+  %
+  % If the printed reference name (arg #3) was not explicitly given in
+  % the @xref, figure out what we want to use.
+  \ifdim \wd\printedrefnamebox = 0pt
+    % No printed node name was explicitly given.
+    \expandafter\ifx\csname SETxref-automatic-section-title\endcsname \relax
+      % Not auto section-title: use node name inside the square brackets.
+      \def\printedrefname{\ignorespaces #1}%
+    \else
+      % Auto section-title: use chapter/section title inside
+      % the square brackets if we have it.
+      \ifdim \wd\printedmanualbox > 0pt
+        % It is in another manual, so we don't have it; use node name.
+        \def\printedrefname{\ignorespaces #1}%
+      \else
+        \ifhavexrefs
+          % We (should) know the real title if we have the xref values.
+          \def\printedrefname{\refx{#1-title}{}}%
+        \else
+          % Otherwise just copy the Info node name.
+          \def\printedrefname{\ignorespaces #1}%
+        \fi%
+      \fi
+    \fi
+  \fi
+  %
+  % Make link in pdf output.
+  \ifpdf
+    {\indexnofonts
+     \turnoffactive
+     \makevalueexpandable
+     % This expands tokens, so do it after making catcode changes, so _
+     % etc. don't get their TeX definitions.
+     \getfilename{#4}%
+     %
+     \edef\pdfxrefdest{#1}%
+     \txiescapepdf\pdfxrefdest
+     %
+     \leavevmode
+     \startlink attr{/Border [0 0 0]}%
+     \ifnum\filenamelength>0
+       goto file{\the\filename.pdf} name{\pdfxrefdest}%
+     \else
+       goto name{\pdfmkpgn{\pdfxrefdest}}%
+     \fi
+    }%
+    \setcolor{\linkcolor}%
+  \fi
+  %
+  % Float references are printed completely differently: "Figure 1.2"
+  % instead of "[somenode], p.3".  We distinguish them by the
+  % LABEL-title being set to a magic string.
+  {%
+    % Have to otherify everything special to allow the \csname to
+    % include an _ in the xref name, etc.
+    \indexnofonts
+    \turnoffactive
+    \expandafter\global\expandafter\let\expandafter\Xthisreftitle
+      \csname XR#1-title\endcsname
+  }%
+  \iffloat\Xthisreftitle
+    % If the user specified the print name (third arg) to the ref,
+    % print it instead of our usual "Figure 1.2".
+    \ifdim\wd\printedrefnamebox = 0pt
+      \refx{#1-snt}{}%
+    \else
+      \printedrefname
+    \fi
+    %
+    % if the user also gave the printed manual name (fifth arg), append
+    % "in MANUALNAME".
+    \ifdim \wd\printedmanualbox > 0pt
+      \space \putwordin{} \cite{\printedmanual}%
+    \fi
+  \else
+    % node/anchor (non-float) references.
+    % 
+    % If we use \unhbox to print the node names, TeX does not insert
+    % empty discretionaries after hyphens, which means that it will not
+    % find a line break at a hyphen in a node names.  Since some manuals
+    % are best written with fairly long node names, containing hyphens,
+    % this is a loss.  Therefore, we give the text of the node name
+    % again, so it is as if TeX is seeing it for the first time.
+    % 
+    % Cross-manual reference.  Only include the "Section ``foo'' in" if
+    % the foo is neither missing or Top.  Thus, @xref{,,,foo,The Foo Manual}
+    % outputs simply "see The Foo Manual".
+    \ifdim \wd\printedmanualbox > 0pt
+      % What is the 7sp about?  The idea is that we also want to omit
+      % the Section part if we would be printing "Top", since they are
+      % clearly trying to refer to the whole manual.  But, this being
+      % TeX, we can't easily compare strings while ignoring the possible
+      % spaces before and after in the input.  By adding the arbitrary
+      % 7sp, we make it much less likely that a real node name would
+      % happen to have the same width as "Top" (e.g., in a monospaced font).
+      % I hope it will never happen in practice.
+      % 
+      % For the same basic reason, we retypeset the "Top" at every
+      % reference, since the current font is indeterminate.
+      % 
+      \setbox\topbox = \hbox{Top\kern7sp}%
+      \setbox2 = \hbox{\ignorespaces \printedrefname \unskip \kern7sp}%
+      \ifdim \wd2 > 7sp
+        \ifdim \wd2 = \wd\topbox \else
+          \putwordSection{} ``\printedrefname'' \putwordin{}\space
+        \fi
+      \fi
+      \cite{\printedmanual}%
+    \else
+      % Reference in this manual.
+      %
+      % _ (for example) has to be the character _ for the purposes of the
+      % control sequence corresponding to the node, but it has to expand
+      % into the usual \leavevmode...\vrule stuff for purposes of
+      % printing. So we \turnoffactive for the \refx-snt, back on for the
+      % printing, back off for the \refx-pg.
+      {\turnoffactive
+       % Only output a following space if the -snt ref is nonempty; for
+       % @unnumbered and @anchor, it won't be.
+       \setbox2 = \hbox{\ignorespaces \refx{#1-snt}{}}%
+       \ifdim \wd2 > 0pt \refx{#1-snt}\space\fi
+      }%
+      % output the `[mynode]' via the macro below so it can be overridden.
+      \xrefprintnodename\printedrefname
+      %
+      % But we always want a comma and a space:
+      ,\space
+      %
+      % output the `page 3'.
+      \turnoffactive \putwordpage\tie\refx{#1-pg}{}%
+    \fi
+  \fi
+  \endlink
+\endgroup}
+
+% This macro is called from \xrefX for the `[nodename]' part of xref
+% output.  It's a separate macro only so it can be changed more easily,
+% since square brackets don't work well in some documents.  Particularly
+% one that Bob is working on :).
+%
+\def\xrefprintnodename#1{[#1]}
+
+% Things referred to by \setref.
+%
+\def\Ynothing{}
+\def\Yomitfromtoc{}
+\def\Ynumbered{%
+  \ifnum\secno=0
+    \putwordChapter@tie \the\chapno
+  \else \ifnum\subsecno=0
+    \putwordSection@tie \the\chapno.\the\secno
+  \else \ifnum\subsubsecno=0
+    \putwordSection@tie \the\chapno.\the\secno.\the\subsecno
+  \else
+    \putwordSection@tie \the\chapno.\the\secno.\the\subsecno.\the\subsubsecno
+  \fi\fi\fi
+}
+\def\Yappendix{%
+  \ifnum\secno=0
+     \putwordAppendix@tie @char\the\appendixno{}%
+  \else \ifnum\subsecno=0
+     \putwordSection@tie @char\the\appendixno.\the\secno
+  \else \ifnum\subsubsecno=0
+    \putwordSection@tie @char\the\appendixno.\the\secno.\the\subsecno
+  \else
+    \putwordSection@tie
+      @char\the\appendixno.\the\secno.\the\subsecno.\the\subsubsecno
+  \fi\fi\fi
+}
+
+% Define \refx{NAME}{SUFFIX} to reference a cross-reference string named NAME.
+% If its value is nonempty, SUFFIX is output afterward.
+%
+\def\refx#1#2{%
+  {%
+    \indexnofonts
+    \otherbackslash
+    \expandafter\global\expandafter\let\expandafter\thisrefX
+      \csname XR#1\endcsname
+  }%
+  \ifx\thisrefX\relax
+    % If not defined, say something at least.
+    \angleleft un\-de\-fined\angleright
+    \iflinks
+      \ifhavexrefs
+        {\toks0 = {#1}% avoid expansion of possibly-complex value
+         \message{\linenumber Undefined cross reference `\the\toks0'.}}%
+      \else
+        \ifwarnedxrefs\else
+          \global\warnedxrefstrue
+          \message{Cross reference values unknown; you must run TeX again.}%
+        \fi
+      \fi
+    \fi
+  \else
+    % It's defined, so just use it.
+    \thisrefX
+  \fi
+  #2% Output the suffix in any case.
+}
+
+% This is the macro invoked by entries in the aux file.  Usually it's
+% just a \def (we prepend XR to the control sequence name to avoid
+% collisions).  But if this is a float type, we have more work to do.
+%
+\def\xrdef#1#2{%
+  {% The node name might contain 8-bit characters, which in our current
+   % implementation are changed to commands like @'e.  Don't let these
+   % mess up the control sequence name.
+    \indexnofonts
+    \turnoffactive
+    \xdef\safexrefname{#1}%
+  }%
+  %
+  \expandafter\gdef\csname XR\safexrefname\endcsname{#2}% remember this xref
+  %
+  % Was that xref control sequence that we just defined for a float?
+  \expandafter\iffloat\csname XR\safexrefname\endcsname
+    % it was a float, and we have the (safe) float type in \iffloattype.
+    \expandafter\let\expandafter\floatlist
+      \csname floatlist\iffloattype\endcsname
+    %
+    % Is this the first time we've seen this float type?
+    \expandafter\ifx\floatlist\relax
+      \toks0 = {\do}% yes, so just \do
+    \else
+      % had it before, so preserve previous elements in list.
+      \toks0 = \expandafter{\floatlist\do}%
+    \fi
+    %
+    % Remember this xref in the control sequence \floatlistFLOATTYPE,
+    % for later use in \listoffloats.
+    \expandafter\xdef\csname floatlist\iffloattype\endcsname{\the\toks0
+      {\safexrefname}}%
+  \fi
+}
+
+% Read the last existing aux file, if any.  No error if none exists.
+%
+\def\tryauxfile{%
+  \openin 1 \jobname.aux
+  \ifeof 1 \else
+    \readdatafile{aux}%
+    \global\havexrefstrue
+  \fi
+  \closein 1
+}
+
+\def\setupdatafile{%
+  \catcode`\^^@=\other
+  \catcode`\^^A=\other
+  \catcode`\^^B=\other
+  \catcode`\^^C=\other
+  \catcode`\^^D=\other
+  \catcode`\^^E=\other
+  \catcode`\^^F=\other
+  \catcode`\^^G=\other
+  \catcode`\^^H=\other
+  \catcode`\^^K=\other
+  \catcode`\^^L=\other
+  \catcode`\^^N=\other
+  \catcode`\^^P=\other
+  \catcode`\^^Q=\other
+  \catcode`\^^R=\other
+  \catcode`\^^S=\other
+  \catcode`\^^T=\other
+  \catcode`\^^U=\other
+  \catcode`\^^V=\other
+  \catcode`\^^W=\other
+  \catcode`\^^X=\other
+  \catcode`\^^Z=\other
+  \catcode`\^^[=\other
+  \catcode`\^^\=\other
+  \catcode`\^^]=\other
+  \catcode`\^^^=\other
+  \catcode`\^^_=\other
+  % It was suggested to set the catcode of ^ to 7, which would allow ^^e4 etc.
+  % in xref tags, i.e., node names.  But since ^^e4 notation isn't
+  % supported in the main text, it doesn't seem desirable.  Furthermore,
+  % that is not enough: for node names that actually contain a ^
+  % character, we would end up writing a line like this: 'xrdef {'hat
+  % b-title}{'hat b} and \xrdef does a \csname...\endcsname on the first
+  % argument, and \hat is not an expandable control sequence.  It could
+  % all be worked out, but why?  Either we support ^^ or we don't.
+  %
+  % The other change necessary for this was to define \auxhat:
+  % \def\auxhat{\def^{'hat }}% extra space so ok if followed by letter
+  % and then to call \auxhat in \setq.
+  %
+  \catcode`\^=\other
+  %
+  % Special characters.  Should be turned off anyway, but...
+  \catcode`\~=\other
+  \catcode`\[=\other
+  \catcode`\]=\other
+  \catcode`\"=\other
+  \catcode`\_=\other
+  \catcode`\|=\other
+  \catcode`\<=\other
+  \catcode`\>=\other
+  \catcode`\$=\other
+  \catcode`\#=\other
+  \catcode`\&=\other
+  \catcode`\%=\other
+  \catcode`+=\other % avoid \+ for paranoia even though we've turned it off
+  %
+  % This is to support \ in node names and titles, since the \
+  % characters end up in a \csname.  It's easier than
+  % leaving it active and making its active definition an actual \
+  % character.  What I don't understand is why it works in the *value*
+  % of the xrdef.  Seems like it should be a catcode12 \, and that
+  % should not typeset properly.  But it works, so I'm moving on for
+  % now.  --karl, 15jan04.
+  \catcode`\\=\other
+  %
+  % Make the characters 128-255 be printing characters.
+  {%
+    \count1=128
+    \def\loop{%
+      \catcode\count1=\other
+      \advance\count1 by 1
+      \ifnum \count1<256 \loop \fi
+    }%
+  }%
+  %
+  % @ is our escape character in .aux files, and we need braces.
+  \catcode`\{=1
+  \catcode`\}=2
+  \catcode`\@=0
+}
+
+\def\readdatafile#1{%
+\begingroup
+  \setupdatafile
+  \input\jobname.#1
+\endgroup}
+
+
+\message{insertions,}
+% including footnotes.
+
+\newcount \footnoteno
+
+% The trailing space in the following definition for supereject is
+% vital for proper filling; pages come out unaligned when you do a
+% pagealignmacro call if that space before the closing brace is
+% removed. (Generally, numeric constants should always be followed by a
+% space to prevent strange expansion errors.)
+\def\supereject{\par\penalty -20000\footnoteno =0 }
+
+% @footnotestyle is meaningful for Info output only.
+\let\footnotestyle=\comment
+
+{\catcode `\@=11
+%
+% Auto-number footnotes.  Otherwise like plain.
+\gdef\footnote{%
+  \let\indent=\ptexindent
+  \let\noindent=\ptexnoindent
+  \global\advance\footnoteno by \@ne
+  \edef\thisfootno{$^{\the\footnoteno}$}%
+  %
+  % In case the footnote comes at the end of a sentence, preserve the
+  % extra spacing after we do the footnote number.
+  \let\@sf\empty
+  \ifhmode\edef\@sf{\spacefactor\the\spacefactor}\ptexslash\fi
+  %
+  % Remove inadvertent blank space before typesetting the footnote number.
+  \unskip
+  \thisfootno\@sf
+  \dofootnote
+}%
+
+% Don't bother with the trickery in plain.tex to not require the
+% footnote text as a parameter.  Our footnotes don't need to be so general.
+%
+% Oh yes, they do; otherwise, @ifset (and anything else that uses
+% \parseargline) fails inside footnotes because the tokens are fixed when
+% the footnote is read.  --karl, 16nov96.
+%
+\gdef\dofootnote{%
+  \insert\footins\bgroup
+  % We want to typeset this text as a normal paragraph, even if the
+  % footnote reference occurs in (for example) a display environment.
+  % So reset some parameters.
+  \hsize=\pagewidth
+  \interlinepenalty\interfootnotelinepenalty
+  \splittopskip\ht\strutbox % top baseline for broken footnotes
+  \splitmaxdepth\dp\strutbox
+  \floatingpenalty\@MM
+  \leftskip\z@skip
+  \rightskip\z@skip
+  \spaceskip\z@skip
+  \xspaceskip\z@skip
+  \parindent\defaultparindent
+  %
+  \smallfonts \rm
+  %
+  % Because we use hanging indentation in footnotes, a @noindent appears
+  % to exdent this text, so make it be a no-op.  makeinfo does not use
+  % hanging indentation so @noindent can still be needed within footnote
+  % text after an @example or the like (not that this is good style).
+  \let\noindent = \relax
+  %
+  % Hang the footnote text off the number.  Use \everypar in case the
+  % footnote extends for more than one paragraph.
+  \everypar = {\hang}%
+  \textindent{\thisfootno}%
+  %
+  % Don't crash into the line above the footnote text.  Since this
+  % expands into a box, it must come within the paragraph, lest it
+  % provide a place where TeX can split the footnote.
+  \footstrut
+  %
+  % Invoke rest of plain TeX footnote routine.
+  \futurelet\next\fo@t
+}
+}%end \catcode `\@=11
+
+% In case a @footnote appears in a vbox, save the footnote text and create
+% the real \insert just after the vbox finished.  Otherwise, the insertion
+% would be lost.
+% Similarly, if a @footnote appears inside an alignment, save the footnote
+% text to a box and make the \insert when a row of the table is finished.
+% And the same can be done for other insert classes.  --kasal, 16nov03.
+
+% Replace the \insert primitive by a cheating macro.
+% Deeper inside, just make sure that the saved insertions are not spilled
+% out prematurely.
+%
+\def\startsavinginserts{%
+  \ifx \insert\ptexinsert
+    \let\insert\saveinsert
+  \else
+    \let\checkinserts\relax
+  \fi
+}
+
+% This \insert replacement works for both \insert\footins{foo} and
+% \insert\footins\bgroup foo\egroup, but it doesn't work for \insert27{foo}.
+%
+\def\saveinsert#1{%
+  \edef\next{\noexpand\savetobox \makeSAVEname#1}%
+  \afterassignment\next
+  % swallow the left brace
+  \let\temp =
+}
+\def\makeSAVEname#1{\makecsname{SAVE\expandafter\gobble\string#1}}
+\def\savetobox#1{\global\setbox#1 = \vbox\bgroup \unvbox#1}
+
+\def\checksaveins#1{\ifvoid#1\else \placesaveins#1\fi}
+
+\def\placesaveins#1{%
+  \ptexinsert \csname\expandafter\gobblesave\string#1\endcsname
+    {\box#1}%
+}
+
+% eat @SAVE -- beware, all of them have catcode \other:
+{
+  \def\dospecials{\do S\do A\do V\do E} \uncatcodespecials  %  ;-)
+  \gdef\gobblesave @SAVE{}
+}
+
+% initialization:
+\def\newsaveins #1{%
+  \edef\next{\noexpand\newsaveinsX \makeSAVEname#1}%
+  \next
+}
+\def\newsaveinsX #1{%
+  \csname newbox\endcsname #1%
+  \expandafter\def\expandafter\checkinserts\expandafter{\checkinserts
+    \checksaveins #1}%
+}
+
+% initialize:
+\let\checkinserts\empty
+\newsaveins\footins
+\newsaveins\margin
+
+
+% @image.  We use the macros from epsf.tex to support this.
+% If epsf.tex is not installed and @image is used, we complain.
+%
+% Check for and read epsf.tex up front.  If we read it only at @image
+% time, we might be inside a group, and then its definitions would get
+% undone and the next image would fail.
+\openin 1 = epsf.tex
+\ifeof 1 \else
+  % Do not bother showing banner with epsf.tex v2.7k (available in
+  % doc/epsf.tex and on ctan).
+  \def\epsfannounce{\toks0 = }%
+  \input epsf.tex
+\fi
+\closein 1
+%
+% We will only complain once about lack of epsf.tex.
+\newif\ifwarnednoepsf
+\newhelp\noepsfhelp{epsf.tex must be installed for images to
+  work.  It is also included in the Texinfo distribution, or you can get
+  it from ftp://tug.org/tex/epsf.tex.}
+%
+\def\image#1{%
+  \ifx\epsfbox\thisisundefined
+    \ifwarnednoepsf \else
+      \errhelp = \noepsfhelp
+      \errmessage{epsf.tex not found, images will be ignored}%
+      \global\warnednoepsftrue
+    \fi
+  \else
+    \imagexxx #1,,,,,\finish
+  \fi
+}
+%
+% Arguments to @image:
+% #1 is (mandatory) image filename; we tack on .eps extension.
+% #2 is (optional) width, #3 is (optional) height.
+% #4 is (ignored optional) html alt text.
+% #5 is (ignored optional) extension.
+% #6 is just the usual extra ignored arg for parsing stuff.
+\newif\ifimagevmode
+\def\imagexxx#1,#2,#3,#4,#5,#6\finish{\begingroup
+  \catcode`\^^M = 5     % in case we're inside an example
+  \normalturnoffactive  % allow _ et al. in names
+  % If the image is by itself, center it.
+  \ifvmode
+    \imagevmodetrue
+  \else \ifx\centersub\centerV
+    % for @center @image, we need a vbox so we can have our vertical space
+    \imagevmodetrue
+    \vbox\bgroup % vbox has better behavior than vtop herev
+  \fi\fi
+  %
+  \ifimagevmode
+    \nobreak\medskip
+    % Usually we'll have text after the image which will insert
+    % \parskip glue, so insert it here too to equalize the space
+    % above and below.
+    \nobreak\vskip\parskip
+    \nobreak
+  \fi
+  %
+  % Leave vertical mode so that indentation from an enclosing
+  %  environment such as @quotation is respected.
+  % However, if we're at the top level, we don't want the
+  %  normal paragraph indentation.
+  % On the other hand, if we are in the case of @center @image, we don't
+  %  want to start a paragraph, which will create a hsize-width box and
+  %  eradicate the centering.
+  \ifx\centersub\centerV\else \noindent \fi
+  %
+  % Output the image.
+  \ifpdf
+    \dopdfimage{#1}{#2}{#3}%
+  \else
+    % \epsfbox itself resets \epsf?size at each figure.
+    \setbox0 = \hbox{\ignorespaces #2}\ifdim\wd0 > 0pt \epsfxsize=#2\relax \fi
+    \setbox0 = \hbox{\ignorespaces #3}\ifdim\wd0 > 0pt \epsfysize=#3\relax \fi
+    \epsfbox{#1.eps}%
+  \fi
+  %
+  \ifimagevmode
+    \medskip  % space after a standalone image
+  \fi  
+  \ifx\centersub\centerV \egroup \fi
+\endgroup}
+
+
+% @float FLOATTYPE,LABEL,LOC ... @end float for displayed figures, tables,
+% etc.  We don't actually implement floating yet, we always include the
+% float "here".  But it seemed the best name for the future.
+%
+\envparseargdef\float{\eatcommaspace\eatcommaspace\dofloat#1, , ,\finish}
+
+% There may be a space before second and/or third parameter; delete it.
+\def\eatcommaspace#1, {#1,}
+
+% #1 is the optional FLOATTYPE, the text label for this float, typically
+% "Figure", "Table", "Example", etc.  Can't contain commas.  If omitted,
+% this float will not be numbered and cannot be referred to.
+%
+% #2 is the optional xref label.  Also must be present for the float to
+% be referable.
+%
+% #3 is the optional positioning argument; for now, it is ignored.  It
+% will somehow specify the positions allowed to float to (here, top, bottom).
+%
+% We keep a separate counter for each FLOATTYPE, which we reset at each
+% chapter-level command.
+\let\resetallfloatnos=\empty
+%
+\def\dofloat#1,#2,#3,#4\finish{%
+  \let\thiscaption=\empty
+  \let\thisshortcaption=\empty
+  %
+  % don't lose footnotes inside @float.
+  %
+  % BEWARE: when the floats start float, we have to issue warning whenever an
+  % insert appears inside a float which could possibly float. --kasal, 26may04
+  %
+  \startsavinginserts
+  %
+  % We can't be used inside a paragraph.
+  \par
+  %
+  \vtop\bgroup
+    \def\floattype{#1}%
+    \def\floatlabel{#2}%
+    \def\floatloc{#3}% we do nothing with this yet.
+    %
+    \ifx\floattype\empty
+      \let\safefloattype=\empty
+    \else
+      {%
+        % the floattype might have accents or other special characters,
+        % but we need to use it in a control sequence name.
+        \indexnofonts
+        \turnoffactive
+        \xdef\safefloattype{\floattype}%
+      }%
+    \fi
+    %
+    % If label is given but no type, we handle that as the empty type.
+    \ifx\floatlabel\empty \else
+      % We want each FLOATTYPE to be numbered separately (Figure 1,
+      % Table 1, Figure 2, ...).  (And if no label, no number.)
+      %
+      \expandafter\getfloatno\csname\safefloattype floatno\endcsname
+      \global\advance\floatno by 1
+      %
+      {%
+        % This magic value for \lastsection is output by \setref as the
+        % XREFLABEL-title value.  \xrefX uses it to distinguish float
+        % labels (which have a completely different output format) from
+        % node and anchor labels.  And \xrdef uses it to construct the
+        % lists of floats.
+        %
+        \edef\lastsection{\floatmagic=\safefloattype}%
+        \setref{\floatlabel}{Yfloat}%
+      }%
+    \fi
+    %
+    % start with \parskip glue, I guess.
+    \vskip\parskip
+    %
+    % Don't suppress indentation if a float happens to start a section.
+    \restorefirstparagraphindent
+}
+
+% we have these possibilities:
+% @float Foo,lbl & @caption{Cap}: Foo 1.1: Cap
+% @float Foo,lbl & no caption:    Foo 1.1
+% @float Foo & @caption{Cap}:     Foo: Cap
+% @float Foo & no caption:        Foo
+% @float ,lbl & Caption{Cap}:     1.1: Cap
+% @float ,lbl & no caption:       1.1
+% @float & @caption{Cap}:         Cap
+% @float & no caption:
+%
+\def\Efloat{%
+    \let\floatident = \empty
+    %
+    % In all cases, if we have a float type, it comes first.
+    \ifx\floattype\empty \else \def\floatident{\floattype}\fi
+    %
+    % If we have an xref label, the number comes next.
+    \ifx\floatlabel\empty \else
+      \ifx\floattype\empty \else % if also had float type, need tie first.
+        \appendtomacro\floatident{\tie}%
+      \fi
+      % the number.
+      \appendtomacro\floatident{\chaplevelprefix\the\floatno}%
+    \fi
+    %
+    % Start the printed caption with what we've constructed in
+    % \floatident, but keep it separate; we need \floatident again.
+    \let\captionline = \floatident
+    %
+    \ifx\thiscaption\empty \else
+      \ifx\floatident\empty \else
+	\appendtomacro\captionline{: }% had ident, so need a colon between
+      \fi
+      %
+      % caption text.
+      \appendtomacro\captionline{\scanexp\thiscaption}%
+    \fi
+    %
+    % If we have anything to print, print it, with space before.
+    % Eventually this needs to become an \insert.
+    \ifx\captionline\empty \else
+      \vskip.5\parskip
+      \captionline
+      %
+      % Space below caption.
+      \vskip\parskip
+    \fi
+    %
+    % If have an xref label, write the list of floats info.  Do this
+    % after the caption, to avoid chance of it being a breakpoint.
+    \ifx\floatlabel\empty \else
+      % Write the text that goes in the lof to the aux file as
+      % \floatlabel-lof.  Besides \floatident, we include the short
+      % caption if specified, else the full caption if specified, else nothing.
+      {%
+        \atdummies
+        %
+        % since we read the caption text in the macro world, where ^^M
+        % is turned into a normal character, we have to scan it back, so
+        % we don't write the literal three characters "^^M" into the aux file.
+	\scanexp{%
+	  \xdef\noexpand\gtemp{%
+	    \ifx\thisshortcaption\empty
+	      \thiscaption
+	    \else
+	      \thisshortcaption
+	    \fi
+	  }%
+	}%
+        \immediate\write\auxfile{@xrdef{\floatlabel-lof}{\floatident
+	  \ifx\gtemp\empty \else : \gtemp \fi}}%
+      }%
+    \fi
+  \egroup  % end of \vtop
+  %
+  % place the captured inserts
+  %
+  % BEWARE: when the floats start floating, we have to issue warning
+  % whenever an insert appears inside a float which could possibly
+  % float. --kasal, 26may04
+  %
+  \checkinserts
+}
+
+% Append the tokens #2 to the definition of macro #1, not expanding either.
+%
+\def\appendtomacro#1#2{%
+  \expandafter\def\expandafter#1\expandafter{#1#2}%
+}
+
+% @caption, @shortcaption
+%
+\def\caption{\docaption\thiscaption}
+\def\shortcaption{\docaption\thisshortcaption}
+\def\docaption{\checkenv\float \bgroup\scanargctxt\defcaption}
+\def\defcaption#1#2{\egroup \def#1{#2}}
+
+% The parameter is the control sequence identifying the counter we are
+% going to use.  Create it if it doesn't exist and assign it to \floatno.
+\def\getfloatno#1{%
+  \ifx#1\relax
+      % Haven't seen this figure type before.
+      \csname newcount\endcsname #1%
+      %
+      % Remember to reset this floatno at the next chap.
+      \expandafter\gdef\expandafter\resetallfloatnos
+        \expandafter{\resetallfloatnos #1=0 }%
+  \fi
+  \let\floatno#1%
+}
+
+% \setref calls this to get the XREFLABEL-snt value.  We want an @xref
+% to the FLOATLABEL to expand to "Figure 3.1".  We call \setref when we
+% first read the @float command.
+%
+\def\Yfloat{\floattype@tie \chaplevelprefix\the\floatno}%
+
+% Magic string used for the XREFLABEL-title value, so \xrefX can
+% distinguish floats from other xref types.
+\def\floatmagic{!!float!!}
+
+% #1 is the control sequence we are passed; we expand into a conditional
+% which is true if #1 represents a float ref.  That is, the magic
+% \lastsection value which we \setref above.
+%
+\def\iffloat#1{\expandafter\doiffloat#1==\finish}
+%
+% #1 is (maybe) the \floatmagic string.  If so, #2 will be the
+% (safe) float type for this float.  We set \iffloattype to #2.
+%
+\def\doiffloat#1=#2=#3\finish{%
+  \def\temp{#1}%
+  \def\iffloattype{#2}%
+  \ifx\temp\floatmagic
+}
+
+% @listoffloats FLOATTYPE - print a list of floats like a table of contents.
+%
+\parseargdef\listoffloats{%
+  \def\floattype{#1}% floattype
+  {%
+    % the floattype might have accents or other special characters,
+    % but we need to use it in a control sequence name.
+    \indexnofonts
+    \turnoffactive
+    \xdef\safefloattype{\floattype}%
+  }%
+  %
+  % \xrdef saves the floats as a \do-list in \floatlistSAFEFLOATTYPE.
+  \expandafter\ifx\csname floatlist\safefloattype\endcsname \relax
+    \ifhavexrefs
+      % if the user said @listoffloats foo but never @float foo.
+      \message{\linenumber No `\safefloattype' floats to list.}%
+    \fi
+  \else
+    \begingroup
+      \leftskip=\tocindent  % indent these entries like a toc
+      \let\do=\listoffloatsdo
+      \csname floatlist\safefloattype\endcsname
+    \endgroup
+  \fi
+}
+
+% This is called on each entry in a list of floats.  We're passed the
+% xref label, in the form LABEL-title, which is how we save it in the
+% aux file.  We strip off the -title and look up \XRLABEL-lof, which
+% has the text we're supposed to typeset here.
+%
+% Figures without xref labels will not be included in the list (since
+% they won't appear in the aux file).
+%
+\def\listoffloatsdo#1{\listoffloatsdoentry#1\finish}
+\def\listoffloatsdoentry#1-title\finish{{%
+  % Can't fully expand XR#1-lof because it can contain anything.  Just
+  % pass the control sequence.  On the other hand, XR#1-pg is just the
+  % page number, and we want to fully expand that so we can get a link
+  % in pdf output.
+  \toksA = \expandafter{\csname XR#1-lof\endcsname}%
+  %
+  % use the same \entry macro we use to generate the TOC and index.
+  \edef\writeentry{\noexpand\entry{\the\toksA}{\csname XR#1-pg\endcsname}}%
+  \writeentry
+}}
+
+
+\message{localization,}
+
+% For single-language documents, @documentlanguage is usually given very
+% early, just after @documentencoding.  Single argument is the language
+% (de) or locale (de_DE) abbreviation.
+%
+{
+  \catcode`\_ = \active
+  \globaldefs=1
+\parseargdef\documentlanguage{\begingroup
+  \let_=\normalunderscore  % normal _ character for filenames
+  \tex % read txi-??.tex file in plain TeX.
+    % Read the file by the name they passed if it exists.
+    \openin 1 txi-#1.tex
+    \ifeof 1
+      \documentlanguagetrywithoutunderscore{#1_\finish}%
+    \else
+      \globaldefs = 1  % everything in the txi-LL files needs to persist
+      \input txi-#1.tex
+    \fi
+    \closein 1
+  \endgroup % end raw TeX
+\endgroup}
+%
+% If they passed de_DE, and txi-de_DE.tex doesn't exist,
+% try txi-de.tex.
+%
+\gdef\documentlanguagetrywithoutunderscore#1_#2\finish{%
+  \openin 1 txi-#1.tex
+  \ifeof 1
+    \errhelp = \nolanghelp
+    \errmessage{Cannot read language file txi-#1.tex}%
+  \else
+    \globaldefs = 1  % everything in the txi-LL files needs to persist
+    \input txi-#1.tex
+  \fi
+  \closein 1
+}
+}% end of special _ catcode
+%
+\newhelp\nolanghelp{The given language definition file cannot be found or
+is empty.  Maybe you need to install it?  Putting it in the current
+directory should work if nowhere else does.}
+
+% This macro is called from txi-??.tex files; the first argument is the
+% \language name to set (without the "\lang@" prefix), the second and
+% third args are \{left,right}hyphenmin.
+%
+% The language names to pass are determined when the format is built.
+% See the etex.log file created at that time, e.g.,
+% /usr/local/texlive/2008/texmf-var/web2c/pdftex/etex.log.
+%
+% With TeX Live 2008, etex now includes hyphenation patterns for all
+% available languages.  This means we can support hyphenation in
+% Texinfo, at least to some extent.  (This still doesn't solve the
+% accented characters problem.)
+%
+\catcode`@=11
+\def\txisetlanguage#1#2#3{%
+  % do not set the language if the name is undefined in the current TeX.
+  \expandafter\ifx\csname lang@#1\endcsname \relax
+    \message{no patterns for #1}%
+  \else
+    \global\language = \csname lang@#1\endcsname
+  \fi
+  % but there is no harm in adjusting the hyphenmin values regardless.
+  \global\lefthyphenmin = #2\relax
+  \global\righthyphenmin = #3\relax
+}
+
+% Helpers for encodings.
+% Set the catcode of characters 128 through 255 to the specified number.
+%
+\def\setnonasciicharscatcode#1{%
+   \count255=128
+   \loop\ifnum\count255<256
+      \global\catcode\count255=#1\relax
+      \advance\count255 by 1
+   \repeat
+}
+
+\def\setnonasciicharscatcodenonglobal#1{%
+   \count255=128
+   \loop\ifnum\count255<256
+      \catcode\count255=#1\relax
+      \advance\count255 by 1
+   \repeat
+}
+
+% @documentencoding sets the definition of non-ASCII characters
+% according to the specified encoding.
+%
+\parseargdef\documentencoding{%
+  % Encoding being declared for the document.
+  \def\declaredencoding{\csname #1.enc\endcsname}%
+  %
+  % Supported encodings: names converted to tokens in order to be able
+  % to compare them with \ifx.
+  \def\ascii{\csname US-ASCII.enc\endcsname}%
+  \def\latnine{\csname ISO-8859-15.enc\endcsname}%
+  \def\latone{\csname ISO-8859-1.enc\endcsname}%
+  \def\lattwo{\csname ISO-8859-2.enc\endcsname}%
+  \def\utfeight{\csname UTF-8.enc\endcsname}%
+  %
+  \ifx \declaredencoding \ascii
+     \asciichardefs
+  %
+  \else \ifx \declaredencoding \lattwo
+     \setnonasciicharscatcode\active
+     \lattwochardefs
+  %
+  \else \ifx \declaredencoding \latone
+     \setnonasciicharscatcode\active
+     \latonechardefs
+  %
+  \else \ifx \declaredencoding \latnine
+     \setnonasciicharscatcode\active
+     \latninechardefs
+  %
+  \else \ifx \declaredencoding \utfeight
+     \setnonasciicharscatcode\active
+     \utfeightchardefs
+  %
+  \else
+    \message{Unknown document encoding #1, ignoring.}%
+  %
+  \fi % utfeight
+  \fi % latnine
+  \fi % latone
+  \fi % lattwo
+  \fi % ascii
+}
+
+% A message to be logged when using a character that isn't available
+% the default font encoding (OT1).
+%
+\def\missingcharmsg#1{\message{Character missing in OT1 encoding: #1.}}
+
+% Take account of \c (plain) vs. \, (Texinfo) difference.
+\def\cedilla#1{\ifx\c\ptexc\c{#1}\else\,{#1}\fi}
+
+% First, make active non-ASCII characters in order for them to be
+% correctly categorized when TeX reads the replacement text of
+% macros containing the character definitions.
+\setnonasciicharscatcode\active
+%
+% Latin1 (ISO-8859-1) character definitions.
+\def\latonechardefs{%
+  \gdef^^a0{\tie}
+  \gdef^^a1{\exclamdown}
+  \gdef^^a2{\missingcharmsg{CENT SIGN}}
+  \gdef^^a3{{\pounds}}
+  \gdef^^a4{\missingcharmsg{CURRENCY SIGN}}
+  \gdef^^a5{\missingcharmsg{YEN SIGN}}
+  \gdef^^a6{\missingcharmsg{BROKEN BAR}}
+  \gdef^^a7{\S}
+  \gdef^^a8{\"{}}
+  \gdef^^a9{\copyright}
+  \gdef^^aa{\ordf}
+  \gdef^^ab{\guillemetleft}
+  \gdef^^ac{$\lnot$}
+  \gdef^^ad{\-}
+  \gdef^^ae{\registeredsymbol}
+  \gdef^^af{\={}}
+  %
+  \gdef^^b0{\textdegree}
+  \gdef^^b1{$\pm$}
+  \gdef^^b2{$^2$}
+  \gdef^^b3{$^3$}
+  \gdef^^b4{\'{}}
+  \gdef^^b5{$\mu$}
+  \gdef^^b6{\P}
+  %
+  \gdef^^b7{$^.$}
+  \gdef^^b8{\cedilla\ }
+  \gdef^^b9{$^1$}
+  \gdef^^ba{\ordm}
+  %
+  \gdef^^bb{\guillemetright}
+  \gdef^^bc{$1\over4$}
+  \gdef^^bd{$1\over2$}
+  \gdef^^be{$3\over4$}
+  \gdef^^bf{\questiondown}
+  %
+  \gdef^^c0{\`A}
+  \gdef^^c1{\'A}
+  \gdef^^c2{\^A}
+  \gdef^^c3{\~A}
+  \gdef^^c4{\"A}
+  \gdef^^c5{\ringaccent A}
+  \gdef^^c6{\AE}
+  \gdef^^c7{\cedilla C}
+  \gdef^^c8{\`E}
+  \gdef^^c9{\'E}
+  \gdef^^ca{\^E}
+  \gdef^^cb{\"E}
+  \gdef^^cc{\`I}
+  \gdef^^cd{\'I}
+  \gdef^^ce{\^I}
+  \gdef^^cf{\"I}
+  %
+  \gdef^^d0{\DH}
+  \gdef^^d1{\~N}
+  \gdef^^d2{\`O}
+  \gdef^^d3{\'O}
+  \gdef^^d4{\^O}
+  \gdef^^d5{\~O}
+  \gdef^^d6{\"O}
+  \gdef^^d7{$\times$}
+  \gdef^^d8{\O}
+  \gdef^^d9{\`U}
+  \gdef^^da{\'U}
+  \gdef^^db{\^U}
+  \gdef^^dc{\"U}
+  \gdef^^dd{\'Y}
+  \gdef^^de{\TH}
+  \gdef^^df{\ss}
+  %
+  \gdef^^e0{\`a}
+  \gdef^^e1{\'a}
+  \gdef^^e2{\^a}
+  \gdef^^e3{\~a}
+  \gdef^^e4{\"a}
+  \gdef^^e5{\ringaccent a}
+  \gdef^^e6{\ae}
+  \gdef^^e7{\cedilla c}
+  \gdef^^e8{\`e}
+  \gdef^^e9{\'e}
+  \gdef^^ea{\^e}
+  \gdef^^eb{\"e}
+  \gdef^^ec{\`{\dotless i}}
+  \gdef^^ed{\'{\dotless i}}
+  \gdef^^ee{\^{\dotless i}}
+  \gdef^^ef{\"{\dotless i}}
+  %
+  \gdef^^f0{\dh}
+  \gdef^^f1{\~n}
+  \gdef^^f2{\`o}
+  \gdef^^f3{\'o}
+  \gdef^^f4{\^o}
+  \gdef^^f5{\~o}
+  \gdef^^f6{\"o}
+  \gdef^^f7{$\div$}
+  \gdef^^f8{\o}
+  \gdef^^f9{\`u}
+  \gdef^^fa{\'u}
+  \gdef^^fb{\^u}
+  \gdef^^fc{\"u}
+  \gdef^^fd{\'y}
+  \gdef^^fe{\th}
+  \gdef^^ff{\"y}
+}
+
+% Latin9 (ISO-8859-15) encoding character definitions.
+\def\latninechardefs{%
+  % Encoding is almost identical to Latin1.
+  \latonechardefs
+  %
+  \gdef^^a4{\euro}
+  \gdef^^a6{\v S}
+  \gdef^^a8{\v s}
+  \gdef^^b4{\v Z}
+  \gdef^^b8{\v z}
+  \gdef^^bc{\OE}
+  \gdef^^bd{\oe}
+  \gdef^^be{\"Y}
+}
+
+% Latin2 (ISO-8859-2) character definitions.
+\def\lattwochardefs{%
+  \gdef^^a0{\tie}
+  \gdef^^a1{\ogonek{A}}
+  \gdef^^a2{\u{}}
+  \gdef^^a3{\L}
+  \gdef^^a4{\missingcharmsg{CURRENCY SIGN}}
+  \gdef^^a5{\v L}
+  \gdef^^a6{\'S}
+  \gdef^^a7{\S}
+  \gdef^^a8{\"{}}
+  \gdef^^a9{\v S}
+  \gdef^^aa{\cedilla S}
+  \gdef^^ab{\v T}
+  \gdef^^ac{\'Z}
+  \gdef^^ad{\-}
+  \gdef^^ae{\v Z}
+  \gdef^^af{\dotaccent Z}
+  %
+  \gdef^^b0{\textdegree}
+  \gdef^^b1{\ogonek{a}}
+  \gdef^^b2{\ogonek{ }}
+  \gdef^^b3{\l}
+  \gdef^^b4{\'{}}
+  \gdef^^b5{\v l}
+  \gdef^^b6{\'s}
+  \gdef^^b7{\v{}}
+  \gdef^^b8{\cedilla\ }
+  \gdef^^b9{\v s}
+  \gdef^^ba{\cedilla s}
+  \gdef^^bb{\v t}
+  \gdef^^bc{\'z}
+  \gdef^^bd{\H{}}
+  \gdef^^be{\v z}
+  \gdef^^bf{\dotaccent z}
+  %
+  \gdef^^c0{\'R}
+  \gdef^^c1{\'A}
+  \gdef^^c2{\^A}
+  \gdef^^c3{\u A}
+  \gdef^^c4{\"A}
+  \gdef^^c5{\'L}
+  \gdef^^c6{\'C}
+  \gdef^^c7{\cedilla C}
+  \gdef^^c8{\v C}
+  \gdef^^c9{\'E}
+  \gdef^^ca{\ogonek{E}}
+  \gdef^^cb{\"E}
+  \gdef^^cc{\v E}
+  \gdef^^cd{\'I}
+  \gdef^^ce{\^I}
+  \gdef^^cf{\v D}
+  %
+  \gdef^^d0{\DH}
+  \gdef^^d1{\'N}
+  \gdef^^d2{\v N}
+  \gdef^^d3{\'O}
+  \gdef^^d4{\^O}
+  \gdef^^d5{\H O}
+  \gdef^^d6{\"O}
+  \gdef^^d7{$\times$}
+  \gdef^^d8{\v R}
+  \gdef^^d9{\ringaccent U}
+  \gdef^^da{\'U}
+  \gdef^^db{\H U}
+  \gdef^^dc{\"U}
+  \gdef^^dd{\'Y}
+  \gdef^^de{\cedilla T}
+  \gdef^^df{\ss}
+  %
+  \gdef^^e0{\'r}
+  \gdef^^e1{\'a}
+  \gdef^^e2{\^a}
+  \gdef^^e3{\u a}
+  \gdef^^e4{\"a}
+  \gdef^^e5{\'l}
+  \gdef^^e6{\'c}
+  \gdef^^e7{\cedilla c}
+  \gdef^^e8{\v c}
+  \gdef^^e9{\'e}
+  \gdef^^ea{\ogonek{e}}
+  \gdef^^eb{\"e}
+  \gdef^^ec{\v e}
+  \gdef^^ed{\'{\dotless{i}}}
+  \gdef^^ee{\^{\dotless{i}}}
+  \gdef^^ef{\v d}
+  %
+  \gdef^^f0{\dh}
+  \gdef^^f1{\'n}
+  \gdef^^f2{\v n}
+  \gdef^^f3{\'o}
+  \gdef^^f4{\^o}
+  \gdef^^f5{\H o}
+  \gdef^^f6{\"o}
+  \gdef^^f7{$\div$}
+  \gdef^^f8{\v r}
+  \gdef^^f9{\ringaccent u}
+  \gdef^^fa{\'u}
+  \gdef^^fb{\H u}
+  \gdef^^fc{\"u}
+  \gdef^^fd{\'y}
+  \gdef^^fe{\cedilla t}
+  \gdef^^ff{\dotaccent{}}
+}
+
+% UTF-8 character definitions.
+%
+% This code to support UTF-8 is based on LaTeX's utf8.def, with some
+% changes for Texinfo conventions.  It is included here under the GPL by
+% permission from Frank Mittelbach and the LaTeX team.
+%
+\newcount\countUTFx
+\newcount\countUTFy
+\newcount\countUTFz
+
+\gdef\UTFviiiTwoOctets#1#2{\expandafter
+   \UTFviiiDefined\csname u8:#1\string #2\endcsname}
+%
+\gdef\UTFviiiThreeOctets#1#2#3{\expandafter
+   \UTFviiiDefined\csname u8:#1\string #2\string #3\endcsname}
+%
+\gdef\UTFviiiFourOctets#1#2#3#4{\expandafter
+   \UTFviiiDefined\csname u8:#1\string #2\string #3\string #4\endcsname}
+
+\gdef\UTFviiiDefined#1{%
+  \ifx #1\relax
+    \message{\linenumber Unicode char \string #1 not defined for Texinfo}%
+  \else
+    \expandafter #1%
+  \fi
+}
+
+\begingroup
+  \catcode`\~13
+  \catcode`\"12
+
+  \def\UTFviiiLoop{%
+    \global\catcode\countUTFx\active
+    \uccode`\~\countUTFx
+    \uppercase\expandafter{\UTFviiiTmp}%
+    \advance\countUTFx by 1
+    \ifnum\countUTFx < \countUTFy
+      \expandafter\UTFviiiLoop
+    \fi}
+
+  \countUTFx = "C2
+  \countUTFy = "E0
+  \def\UTFviiiTmp{%
+    \xdef~{\noexpand\UTFviiiTwoOctets\string~}}
+  \UTFviiiLoop
+
+  \countUTFx = "E0
+  \countUTFy = "F0
+  \def\UTFviiiTmp{%
+    \xdef~{\noexpand\UTFviiiThreeOctets\string~}}
+  \UTFviiiLoop
+
+  \countUTFx = "F0
+  \countUTFy = "F4
+  \def\UTFviiiTmp{%
+    \xdef~{\noexpand\UTFviiiFourOctets\string~}}
+  \UTFviiiLoop
+\endgroup
+
+\begingroup
+  \catcode`\"=12
+  \catcode`\<=12
+  \catcode`\.=12
+  \catcode`\,=12
+  \catcode`\;=12
+  \catcode`\!=12
+  \catcode`\~=13
+
+  \gdef\DeclareUnicodeCharacter#1#2{%
+    \countUTFz = "#1\relax
+    %\wlog{\space\space defining Unicode char U+#1 (decimal \the\countUTFz)}%
+    \begingroup
+      \parseXMLCharref
+      \def\UTFviiiTwoOctets##1##2{%
+        \csname u8:##1\string ##2\endcsname}%
+      \def\UTFviiiThreeOctets##1##2##3{%
+        \csname u8:##1\string ##2\string ##3\endcsname}%
+      \def\UTFviiiFourOctets##1##2##3##4{%
+        \csname u8:##1\string ##2\string ##3\string ##4\endcsname}%
+      \expandafter\expandafter\expandafter\expandafter
+       \expandafter\expandafter\expandafter
+       \gdef\UTFviiiTmp{#2}%
+    \endgroup}
+
+  \gdef\parseXMLCharref{%
+    \ifnum\countUTFz < "A0\relax
+      \errhelp = \EMsimple
+      \errmessage{Cannot define Unicode char value < 00A0}%
+    \else\ifnum\countUTFz < "800\relax
+      \parseUTFviiiA,%
+      \parseUTFviiiB C\UTFviiiTwoOctets.,%
+    \else\ifnum\countUTFz < "10000\relax
+      \parseUTFviiiA;%
+      \parseUTFviiiA,%
+      \parseUTFviiiB E\UTFviiiThreeOctets.{,;}%
+    \else
+      \parseUTFviiiA;%
+      \parseUTFviiiA,%
+      \parseUTFviiiA!%
+      \parseUTFviiiB F\UTFviiiFourOctets.{!,;}%
+    \fi\fi\fi
+  }
+
+  \gdef\parseUTFviiiA#1{%
+    \countUTFx = \countUTFz
+    \divide\countUTFz by 64
+    \countUTFy = \countUTFz
+    \multiply\countUTFz by 64
+    \advance\countUTFx by -\countUTFz
+    \advance\countUTFx by 128
+    \uccode `#1\countUTFx
+    \countUTFz = \countUTFy}
+
+  \gdef\parseUTFviiiB#1#2#3#4{%
+    \advance\countUTFz by "#10\relax
+    \uccode `#3\countUTFz
+    \uppercase{\gdef\UTFviiiTmp{#2#3#4}}}
+\endgroup
+
+\def\utfeightchardefs{%
+  \DeclareUnicodeCharacter{00A0}{\tie}
+  \DeclareUnicodeCharacter{00A1}{\exclamdown}
+  \DeclareUnicodeCharacter{00A3}{\pounds}
+  \DeclareUnicodeCharacter{00A8}{\"{ }}
+  \DeclareUnicodeCharacter{00A9}{\copyright}
+  \DeclareUnicodeCharacter{00AA}{\ordf}
+  \DeclareUnicodeCharacter{00AB}{\guillemetleft}
+  \DeclareUnicodeCharacter{00AD}{\-}
+  \DeclareUnicodeCharacter{00AE}{\registeredsymbol}
+  \DeclareUnicodeCharacter{00AF}{\={ }}
+
+  \DeclareUnicodeCharacter{00B0}{\ringaccent{ }}
+  \DeclareUnicodeCharacter{00B4}{\'{ }}
+  \DeclareUnicodeCharacter{00B8}{\cedilla{ }}
+  \DeclareUnicodeCharacter{00BA}{\ordm}
+  \DeclareUnicodeCharacter{00BB}{\guillemetright}
+  \DeclareUnicodeCharacter{00BF}{\questiondown}
+
+  \DeclareUnicodeCharacter{00C0}{\`A}
+  \DeclareUnicodeCharacter{00C1}{\'A}
+  \DeclareUnicodeCharacter{00C2}{\^A}
+  \DeclareUnicodeCharacter{00C3}{\~A}
+  \DeclareUnicodeCharacter{00C4}{\"A}
+  \DeclareUnicodeCharacter{00C5}{\AA}
+  \DeclareUnicodeCharacter{00C6}{\AE}
+  \DeclareUnicodeCharacter{00C7}{\cedilla{C}}
+  \DeclareUnicodeCharacter{00C8}{\`E}
+  \DeclareUnicodeCharacter{00C9}{\'E}
+  \DeclareUnicodeCharacter{00CA}{\^E}
+  \DeclareUnicodeCharacter{00CB}{\"E}
+  \DeclareUnicodeCharacter{00CC}{\`I}
+  \DeclareUnicodeCharacter{00CD}{\'I}
+  \DeclareUnicodeCharacter{00CE}{\^I}
+  \DeclareUnicodeCharacter{00CF}{\"I}
+
+  \DeclareUnicodeCharacter{00D0}{\DH}
+  \DeclareUnicodeCharacter{00D1}{\~N}
+  \DeclareUnicodeCharacter{00D2}{\`O}
+  \DeclareUnicodeCharacter{00D3}{\'O}
+  \DeclareUnicodeCharacter{00D4}{\^O}
+  \DeclareUnicodeCharacter{00D5}{\~O}
+  \DeclareUnicodeCharacter{00D6}{\"O}
+  \DeclareUnicodeCharacter{00D8}{\O}
+  \DeclareUnicodeCharacter{00D9}{\`U}
+  \DeclareUnicodeCharacter{00DA}{\'U}
+  \DeclareUnicodeCharacter{00DB}{\^U}
+  \DeclareUnicodeCharacter{00DC}{\"U}
+  \DeclareUnicodeCharacter{00DD}{\'Y}
+  \DeclareUnicodeCharacter{00DE}{\TH}
+  \DeclareUnicodeCharacter{00DF}{\ss}
+
+  \DeclareUnicodeCharacter{00E0}{\`a}
+  \DeclareUnicodeCharacter{00E1}{\'a}
+  \DeclareUnicodeCharacter{00E2}{\^a}
+  \DeclareUnicodeCharacter{00E3}{\~a}
+  \DeclareUnicodeCharacter{00E4}{\"a}
+  \DeclareUnicodeCharacter{00E5}{\aa}
+  \DeclareUnicodeCharacter{00E6}{\ae}
+  \DeclareUnicodeCharacter{00E7}{\cedilla{c}}
+  \DeclareUnicodeCharacter{00E8}{\`e}
+  \DeclareUnicodeCharacter{00E9}{\'e}
+  \DeclareUnicodeCharacter{00EA}{\^e}
+  \DeclareUnicodeCharacter{00EB}{\"e}
+  \DeclareUnicodeCharacter{00EC}{\`{\dotless{i}}}
+  \DeclareUnicodeCharacter{00ED}{\'{\dotless{i}}}
+  \DeclareUnicodeCharacter{00EE}{\^{\dotless{i}}}
+  \DeclareUnicodeCharacter{00EF}{\"{\dotless{i}}}
+
+  \DeclareUnicodeCharacter{00F0}{\dh}
+  \DeclareUnicodeCharacter{00F1}{\~n}
+  \DeclareUnicodeCharacter{00F2}{\`o}
+  \DeclareUnicodeCharacter{00F3}{\'o}
+  \DeclareUnicodeCharacter{00F4}{\^o}
+  \DeclareUnicodeCharacter{00F5}{\~o}
+  \DeclareUnicodeCharacter{00F6}{\"o}
+  \DeclareUnicodeCharacter{00F8}{\o}
+  \DeclareUnicodeCharacter{00F9}{\`u}
+  \DeclareUnicodeCharacter{00FA}{\'u}
+  \DeclareUnicodeCharacter{00FB}{\^u}
+  \DeclareUnicodeCharacter{00FC}{\"u}
+  \DeclareUnicodeCharacter{00FD}{\'y}
+  \DeclareUnicodeCharacter{00FE}{\th}
+  \DeclareUnicodeCharacter{00FF}{\"y}
+
+  \DeclareUnicodeCharacter{0100}{\=A}
+  \DeclareUnicodeCharacter{0101}{\=a}
+  \DeclareUnicodeCharacter{0102}{\u{A}}
+  \DeclareUnicodeCharacter{0103}{\u{a}}
+  \DeclareUnicodeCharacter{0104}{\ogonek{A}}
+  \DeclareUnicodeCharacter{0105}{\ogonek{a}}
+  \DeclareUnicodeCharacter{0106}{\'C}
+  \DeclareUnicodeCharacter{0107}{\'c}
+  \DeclareUnicodeCharacter{0108}{\^C}
+  \DeclareUnicodeCharacter{0109}{\^c}
+  \DeclareUnicodeCharacter{0118}{\ogonek{E}}
+  \DeclareUnicodeCharacter{0119}{\ogonek{e}}
+  \DeclareUnicodeCharacter{010A}{\dotaccent{C}}
+  \DeclareUnicodeCharacter{010B}{\dotaccent{c}}
+  \DeclareUnicodeCharacter{010C}{\v{C}}
+  \DeclareUnicodeCharacter{010D}{\v{c}}
+  \DeclareUnicodeCharacter{010E}{\v{D}}
+
+  \DeclareUnicodeCharacter{0112}{\=E}
+  \DeclareUnicodeCharacter{0113}{\=e}
+  \DeclareUnicodeCharacter{0114}{\u{E}}
+  \DeclareUnicodeCharacter{0115}{\u{e}}
+  \DeclareUnicodeCharacter{0116}{\dotaccent{E}}
+  \DeclareUnicodeCharacter{0117}{\dotaccent{e}}
+  \DeclareUnicodeCharacter{011A}{\v{E}}
+  \DeclareUnicodeCharacter{011B}{\v{e}}
+  \DeclareUnicodeCharacter{011C}{\^G}
+  \DeclareUnicodeCharacter{011D}{\^g}
+  \DeclareUnicodeCharacter{011E}{\u{G}}
+  \DeclareUnicodeCharacter{011F}{\u{g}}
+
+  \DeclareUnicodeCharacter{0120}{\dotaccent{G}}
+  \DeclareUnicodeCharacter{0121}{\dotaccent{g}}
+  \DeclareUnicodeCharacter{0124}{\^H}
+  \DeclareUnicodeCharacter{0125}{\^h}
+  \DeclareUnicodeCharacter{0128}{\~I}
+  \DeclareUnicodeCharacter{0129}{\~{\dotless{i}}}
+  \DeclareUnicodeCharacter{012A}{\=I}
+  \DeclareUnicodeCharacter{012B}{\={\dotless{i}}}
+  \DeclareUnicodeCharacter{012C}{\u{I}}
+  \DeclareUnicodeCharacter{012D}{\u{\dotless{i}}}
+
+  \DeclareUnicodeCharacter{0130}{\dotaccent{I}}
+  \DeclareUnicodeCharacter{0131}{\dotless{i}}
+  \DeclareUnicodeCharacter{0132}{IJ}
+  \DeclareUnicodeCharacter{0133}{ij}
+  \DeclareUnicodeCharacter{0134}{\^J}
+  \DeclareUnicodeCharacter{0135}{\^{\dotless{j}}}
+  \DeclareUnicodeCharacter{0139}{\'L}
+  \DeclareUnicodeCharacter{013A}{\'l}
+
+  \DeclareUnicodeCharacter{0141}{\L}
+  \DeclareUnicodeCharacter{0142}{\l}
+  \DeclareUnicodeCharacter{0143}{\'N}
+  \DeclareUnicodeCharacter{0144}{\'n}
+  \DeclareUnicodeCharacter{0147}{\v{N}}
+  \DeclareUnicodeCharacter{0148}{\v{n}}
+  \DeclareUnicodeCharacter{014C}{\=O}
+  \DeclareUnicodeCharacter{014D}{\=o}
+  \DeclareUnicodeCharacter{014E}{\u{O}}
+  \DeclareUnicodeCharacter{014F}{\u{o}}
+
+  \DeclareUnicodeCharacter{0150}{\H{O}}
+  \DeclareUnicodeCharacter{0151}{\H{o}}
+  \DeclareUnicodeCharacter{0152}{\OE}
+  \DeclareUnicodeCharacter{0153}{\oe}
+  \DeclareUnicodeCharacter{0154}{\'R}
+  \DeclareUnicodeCharacter{0155}{\'r}
+  \DeclareUnicodeCharacter{0158}{\v{R}}
+  \DeclareUnicodeCharacter{0159}{\v{r}}
+  \DeclareUnicodeCharacter{015A}{\'S}
+  \DeclareUnicodeCharacter{015B}{\'s}
+  \DeclareUnicodeCharacter{015C}{\^S}
+  \DeclareUnicodeCharacter{015D}{\^s}
+  \DeclareUnicodeCharacter{015E}{\cedilla{S}}
+  \DeclareUnicodeCharacter{015F}{\cedilla{s}}
+
+  \DeclareUnicodeCharacter{0160}{\v{S}}
+  \DeclareUnicodeCharacter{0161}{\v{s}}
+  \DeclareUnicodeCharacter{0162}{\cedilla{t}}
+  \DeclareUnicodeCharacter{0163}{\cedilla{T}}
+  \DeclareUnicodeCharacter{0164}{\v{T}}
+
+  \DeclareUnicodeCharacter{0168}{\~U}
+  \DeclareUnicodeCharacter{0169}{\~u}
+  \DeclareUnicodeCharacter{016A}{\=U}
+  \DeclareUnicodeCharacter{016B}{\=u}
+  \DeclareUnicodeCharacter{016C}{\u{U}}
+  \DeclareUnicodeCharacter{016D}{\u{u}}
+  \DeclareUnicodeCharacter{016E}{\ringaccent{U}}
+  \DeclareUnicodeCharacter{016F}{\ringaccent{u}}
+
+  \DeclareUnicodeCharacter{0170}{\H{U}}
+  \DeclareUnicodeCharacter{0171}{\H{u}}
+  \DeclareUnicodeCharacter{0174}{\^W}
+  \DeclareUnicodeCharacter{0175}{\^w}
+  \DeclareUnicodeCharacter{0176}{\^Y}
+  \DeclareUnicodeCharacter{0177}{\^y}
+  \DeclareUnicodeCharacter{0178}{\"Y}
+  \DeclareUnicodeCharacter{0179}{\'Z}
+  \DeclareUnicodeCharacter{017A}{\'z}
+  \DeclareUnicodeCharacter{017B}{\dotaccent{Z}}
+  \DeclareUnicodeCharacter{017C}{\dotaccent{z}}
+  \DeclareUnicodeCharacter{017D}{\v{Z}}
+  \DeclareUnicodeCharacter{017E}{\v{z}}
+
+  \DeclareUnicodeCharacter{01C4}{D\v{Z}}
+  \DeclareUnicodeCharacter{01C5}{D\v{z}}
+  \DeclareUnicodeCharacter{01C6}{d\v{z}}
+  \DeclareUnicodeCharacter{01C7}{LJ}
+  \DeclareUnicodeCharacter{01C8}{Lj}
+  \DeclareUnicodeCharacter{01C9}{lj}
+  \DeclareUnicodeCharacter{01CA}{NJ}
+  \DeclareUnicodeCharacter{01CB}{Nj}
+  \DeclareUnicodeCharacter{01CC}{nj}
+  \DeclareUnicodeCharacter{01CD}{\v{A}}
+  \DeclareUnicodeCharacter{01CE}{\v{a}}
+  \DeclareUnicodeCharacter{01CF}{\v{I}}
+
+  \DeclareUnicodeCharacter{01D0}{\v{\dotless{i}}}
+  \DeclareUnicodeCharacter{01D1}{\v{O}}
+  \DeclareUnicodeCharacter{01D2}{\v{o}}
+  \DeclareUnicodeCharacter{01D3}{\v{U}}
+  \DeclareUnicodeCharacter{01D4}{\v{u}}
+
+  \DeclareUnicodeCharacter{01E2}{\={\AE}}
+  \DeclareUnicodeCharacter{01E3}{\={\ae}}
+  \DeclareUnicodeCharacter{01E6}{\v{G}}
+  \DeclareUnicodeCharacter{01E7}{\v{g}}
+  \DeclareUnicodeCharacter{01E8}{\v{K}}
+  \DeclareUnicodeCharacter{01E9}{\v{k}}
+
+  \DeclareUnicodeCharacter{01F0}{\v{\dotless{j}}}
+  \DeclareUnicodeCharacter{01F1}{DZ}
+  \DeclareUnicodeCharacter{01F2}{Dz}
+  \DeclareUnicodeCharacter{01F3}{dz}
+  \DeclareUnicodeCharacter{01F4}{\'G}
+  \DeclareUnicodeCharacter{01F5}{\'g}
+  \DeclareUnicodeCharacter{01F8}{\`N}
+  \DeclareUnicodeCharacter{01F9}{\`n}
+  \DeclareUnicodeCharacter{01FC}{\'{\AE}}
+  \DeclareUnicodeCharacter{01FD}{\'{\ae}}
+  \DeclareUnicodeCharacter{01FE}{\'{\O}}
+  \DeclareUnicodeCharacter{01FF}{\'{\o}}
+
+  \DeclareUnicodeCharacter{021E}{\v{H}}
+  \DeclareUnicodeCharacter{021F}{\v{h}}
+
+  \DeclareUnicodeCharacter{0226}{\dotaccent{A}}
+  \DeclareUnicodeCharacter{0227}{\dotaccent{a}}
+  \DeclareUnicodeCharacter{0228}{\cedilla{E}}
+  \DeclareUnicodeCharacter{0229}{\cedilla{e}}
+  \DeclareUnicodeCharacter{022E}{\dotaccent{O}}
+  \DeclareUnicodeCharacter{022F}{\dotaccent{o}}
+
+  \DeclareUnicodeCharacter{0232}{\=Y}
+  \DeclareUnicodeCharacter{0233}{\=y}
+  \DeclareUnicodeCharacter{0237}{\dotless{j}}
+
+  \DeclareUnicodeCharacter{02DB}{\ogonek{ }}
+
+  \DeclareUnicodeCharacter{1E02}{\dotaccent{B}}
+  \DeclareUnicodeCharacter{1E03}{\dotaccent{b}}
+  \DeclareUnicodeCharacter{1E04}{\udotaccent{B}}
+  \DeclareUnicodeCharacter{1E05}{\udotaccent{b}}
+  \DeclareUnicodeCharacter{1E06}{\ubaraccent{B}}
+  \DeclareUnicodeCharacter{1E07}{\ubaraccent{b}}
+  \DeclareUnicodeCharacter{1E0A}{\dotaccent{D}}
+  \DeclareUnicodeCharacter{1E0B}{\dotaccent{d}}
+  \DeclareUnicodeCharacter{1E0C}{\udotaccent{D}}
+  \DeclareUnicodeCharacter{1E0D}{\udotaccent{d}}
+  \DeclareUnicodeCharacter{1E0E}{\ubaraccent{D}}
+  \DeclareUnicodeCharacter{1E0F}{\ubaraccent{d}}
+
+  \DeclareUnicodeCharacter{1E1E}{\dotaccent{F}}
+  \DeclareUnicodeCharacter{1E1F}{\dotaccent{f}}
+
+  \DeclareUnicodeCharacter{1E20}{\=G}
+  \DeclareUnicodeCharacter{1E21}{\=g}
+  \DeclareUnicodeCharacter{1E22}{\dotaccent{H}}
+  \DeclareUnicodeCharacter{1E23}{\dotaccent{h}}
+  \DeclareUnicodeCharacter{1E24}{\udotaccent{H}}
+  \DeclareUnicodeCharacter{1E25}{\udotaccent{h}}
+  \DeclareUnicodeCharacter{1E26}{\"H}
+  \DeclareUnicodeCharacter{1E27}{\"h}
+
+  \DeclareUnicodeCharacter{1E30}{\'K}
+  \DeclareUnicodeCharacter{1E31}{\'k}
+  \DeclareUnicodeCharacter{1E32}{\udotaccent{K}}
+  \DeclareUnicodeCharacter{1E33}{\udotaccent{k}}
+  \DeclareUnicodeCharacter{1E34}{\ubaraccent{K}}
+  \DeclareUnicodeCharacter{1E35}{\ubaraccent{k}}
+  \DeclareUnicodeCharacter{1E36}{\udotaccent{L}}
+  \DeclareUnicodeCharacter{1E37}{\udotaccent{l}}
+  \DeclareUnicodeCharacter{1E3A}{\ubaraccent{L}}
+  \DeclareUnicodeCharacter{1E3B}{\ubaraccent{l}}
+  \DeclareUnicodeCharacter{1E3E}{\'M}
+  \DeclareUnicodeCharacter{1E3F}{\'m}
+
+  \DeclareUnicodeCharacter{1E40}{\dotaccent{M}}
+  \DeclareUnicodeCharacter{1E41}{\dotaccent{m}}
+  \DeclareUnicodeCharacter{1E42}{\udotaccent{M}}
+  \DeclareUnicodeCharacter{1E43}{\udotaccent{m}}
+  \DeclareUnicodeCharacter{1E44}{\dotaccent{N}}
+  \DeclareUnicodeCharacter{1E45}{\dotaccent{n}}
+  \DeclareUnicodeCharacter{1E46}{\udotaccent{N}}
+  \DeclareUnicodeCharacter{1E47}{\udotaccent{n}}
+  \DeclareUnicodeCharacter{1E48}{\ubaraccent{N}}
+  \DeclareUnicodeCharacter{1E49}{\ubaraccent{n}}
+
+  \DeclareUnicodeCharacter{1E54}{\'P}
+  \DeclareUnicodeCharacter{1E55}{\'p}
+  \DeclareUnicodeCharacter{1E56}{\dotaccent{P}}
+  \DeclareUnicodeCharacter{1E57}{\dotaccent{p}}
+  \DeclareUnicodeCharacter{1E58}{\dotaccent{R}}
+  \DeclareUnicodeCharacter{1E59}{\dotaccent{r}}
+  \DeclareUnicodeCharacter{1E5A}{\udotaccent{R}}
+  \DeclareUnicodeCharacter{1E5B}{\udotaccent{r}}
+  \DeclareUnicodeCharacter{1E5E}{\ubaraccent{R}}
+  \DeclareUnicodeCharacter{1E5F}{\ubaraccent{r}}
+
+  \DeclareUnicodeCharacter{1E60}{\dotaccent{S}}
+  \DeclareUnicodeCharacter{1E61}{\dotaccent{s}}
+  \DeclareUnicodeCharacter{1E62}{\udotaccent{S}}
+  \DeclareUnicodeCharacter{1E63}{\udotaccent{s}}
+  \DeclareUnicodeCharacter{1E6A}{\dotaccent{T}}
+  \DeclareUnicodeCharacter{1E6B}{\dotaccent{t}}
+  \DeclareUnicodeCharacter{1E6C}{\udotaccent{T}}
+  \DeclareUnicodeCharacter{1E6D}{\udotaccent{t}}
+  \DeclareUnicodeCharacter{1E6E}{\ubaraccent{T}}
+  \DeclareUnicodeCharacter{1E6F}{\ubaraccent{t}}
+
+  \DeclareUnicodeCharacter{1E7C}{\~V}
+  \DeclareUnicodeCharacter{1E7D}{\~v}
+  \DeclareUnicodeCharacter{1E7E}{\udotaccent{V}}
+  \DeclareUnicodeCharacter{1E7F}{\udotaccent{v}}
+
+  \DeclareUnicodeCharacter{1E80}{\`W}
+  \DeclareUnicodeCharacter{1E81}{\`w}
+  \DeclareUnicodeCharacter{1E82}{\'W}
+  \DeclareUnicodeCharacter{1E83}{\'w}
+  \DeclareUnicodeCharacter{1E84}{\"W}
+  \DeclareUnicodeCharacter{1E85}{\"w}
+  \DeclareUnicodeCharacter{1E86}{\dotaccent{W}}
+  \DeclareUnicodeCharacter{1E87}{\dotaccent{w}}
+  \DeclareUnicodeCharacter{1E88}{\udotaccent{W}}
+  \DeclareUnicodeCharacter{1E89}{\udotaccent{w}}
+  \DeclareUnicodeCharacter{1E8A}{\dotaccent{X}}
+  \DeclareUnicodeCharacter{1E8B}{\dotaccent{x}}
+  \DeclareUnicodeCharacter{1E8C}{\"X}
+  \DeclareUnicodeCharacter{1E8D}{\"x}
+  \DeclareUnicodeCharacter{1E8E}{\dotaccent{Y}}
+  \DeclareUnicodeCharacter{1E8F}{\dotaccent{y}}
+
+  \DeclareUnicodeCharacter{1E90}{\^Z}
+  \DeclareUnicodeCharacter{1E91}{\^z}
+  \DeclareUnicodeCharacter{1E92}{\udotaccent{Z}}
+  \DeclareUnicodeCharacter{1E93}{\udotaccent{z}}
+  \DeclareUnicodeCharacter{1E94}{\ubaraccent{Z}}
+  \DeclareUnicodeCharacter{1E95}{\ubaraccent{z}}
+  \DeclareUnicodeCharacter{1E96}{\ubaraccent{h}}
+  \DeclareUnicodeCharacter{1E97}{\"t}
+  \DeclareUnicodeCharacter{1E98}{\ringaccent{w}}
+  \DeclareUnicodeCharacter{1E99}{\ringaccent{y}}
+
+  \DeclareUnicodeCharacter{1EA0}{\udotaccent{A}}
+  \DeclareUnicodeCharacter{1EA1}{\udotaccent{a}}
+
+  \DeclareUnicodeCharacter{1EB8}{\udotaccent{E}}
+  \DeclareUnicodeCharacter{1EB9}{\udotaccent{e}}
+  \DeclareUnicodeCharacter{1EBC}{\~E}
+  \DeclareUnicodeCharacter{1EBD}{\~e}
+
+  \DeclareUnicodeCharacter{1ECA}{\udotaccent{I}}
+  \DeclareUnicodeCharacter{1ECB}{\udotaccent{i}}
+  \DeclareUnicodeCharacter{1ECC}{\udotaccent{O}}
+  \DeclareUnicodeCharacter{1ECD}{\udotaccent{o}}
+
+  \DeclareUnicodeCharacter{1EE4}{\udotaccent{U}}
+  \DeclareUnicodeCharacter{1EE5}{\udotaccent{u}}
+
+  \DeclareUnicodeCharacter{1EF2}{\`Y}
+  \DeclareUnicodeCharacter{1EF3}{\`y}
+  \DeclareUnicodeCharacter{1EF4}{\udotaccent{Y}}
+
+  \DeclareUnicodeCharacter{1EF8}{\~Y}
+  \DeclareUnicodeCharacter{1EF9}{\~y}
+
+  \DeclareUnicodeCharacter{2013}{--}
+  \DeclareUnicodeCharacter{2014}{---}
+  \DeclareUnicodeCharacter{2018}{\quoteleft}
+  \DeclareUnicodeCharacter{2019}{\quoteright}
+  \DeclareUnicodeCharacter{201A}{\quotesinglbase}
+  \DeclareUnicodeCharacter{201C}{\quotedblleft}
+  \DeclareUnicodeCharacter{201D}{\quotedblright}
+  \DeclareUnicodeCharacter{201E}{\quotedblbase}
+  \DeclareUnicodeCharacter{2022}{\bullet}
+  \DeclareUnicodeCharacter{2026}{\dots}
+  \DeclareUnicodeCharacter{2039}{\guilsinglleft}
+  \DeclareUnicodeCharacter{203A}{\guilsinglright}
+  \DeclareUnicodeCharacter{20AC}{\euro}
+
+  \DeclareUnicodeCharacter{2192}{\expansion}
+  \DeclareUnicodeCharacter{21D2}{\result}
+
+  \DeclareUnicodeCharacter{2212}{\minus}
+  \DeclareUnicodeCharacter{2217}{\point}
+  \DeclareUnicodeCharacter{2261}{\equiv}
+}% end of \utfeightchardefs
+
+
+% US-ASCII character definitions.
+\def\asciichardefs{% nothing need be done
+   \relax
+}
+
+% Make non-ASCII characters printable again for compatibility with
+% existing Texinfo documents that may use them, even without declaring a
+% document encoding.
+%
+\setnonasciicharscatcode \other
+
+
+\message{formatting,}
+
+\newdimen\defaultparindent \defaultparindent = 15pt
+
+\chapheadingskip = 15pt plus 4pt minus 2pt
+\secheadingskip = 12pt plus 3pt minus 2pt
+\subsecheadingskip = 9pt plus 2pt minus 2pt
+
+% Prevent underfull vbox error messages.
+\vbadness = 10000
+
+% Don't be very finicky about underfull hboxes, either.
+\hbadness = 6666
+
+% Following George Bush, get rid of widows and orphans.
+\widowpenalty=10000
+\clubpenalty=10000
+
+% Use TeX 3.0's \emergencystretch to help line breaking, but if we're
+% using an old version of TeX, don't do anything.  We want the amount of
+% stretch added to depend on the line length, hence the dependence on
+% \hsize.  We call this whenever the paper size is set.
+%
+\def\setemergencystretch{%
+  \ifx\emergencystretch\thisisundefined
+    % Allow us to assign to \emergencystretch anyway.
+    \def\emergencystretch{\dimen0}%
+  \else
+    \emergencystretch = .15\hsize
+  \fi
+}
+
+% Parameters in order: 1) textheight; 2) textwidth;
+% 3) voffset; 4) hoffset; 5) binding offset; 6) topskip;
+% 7) physical page height; 8) physical page width.
+%
+% We also call \setleading{\textleading}, so the caller should define
+% \textleading.  The caller should also set \parskip.
+%
+\def\internalpagesizes#1#2#3#4#5#6#7#8{%
+  \voffset = #3\relax
+  \topskip = #6\relax
+  \splittopskip = \topskip
+  %
+  \vsize = #1\relax
+  \advance\vsize by \topskip
+  \outervsize = \vsize
+  \advance\outervsize by 2\topandbottommargin
+  \pageheight = \vsize
+  %
+  \hsize = #2\relax
+  \outerhsize = \hsize
+  \advance\outerhsize by 0.5in
+  \pagewidth = \hsize
+  %
+  \normaloffset = #4\relax
+  \bindingoffset = #5\relax
+  %
+  \ifpdf
+    \pdfpageheight #7\relax
+    \pdfpagewidth #8\relax
+    % if we don't reset these, they will remain at "1 true in" of
+    % whatever layout pdftex was dumped with.
+    \pdfhorigin = 1 true in
+    \pdfvorigin = 1 true in
+  \fi
+  %
+  \setleading{\textleading}
+  %
+  \parindent = \defaultparindent
+  \setemergencystretch
+}
+
+% @letterpaper (the default).
+\def\letterpaper{{\globaldefs = 1
+  \parskip = 3pt plus 2pt minus 1pt
+  \textleading = 13.2pt
+  %
+  % If page is nothing but text, make it come out even.
+  \internalpagesizes{607.2pt}{6in}% that's 46 lines
+                    {\voffset}{.25in}%
+                    {\bindingoffset}{36pt}%
+                    {11in}{8.5in}%
+}}
+
+% Use @smallbook to reset parameters for 7x9.25 trim size.
+\def\smallbook{{\globaldefs = 1
+  \parskip = 2pt plus 1pt
+  \textleading = 12pt
+  %
+  \internalpagesizes{7.5in}{5in}%
+                    {-.2in}{0in}%
+                    {\bindingoffset}{16pt}%
+                    {9.25in}{7in}%
+  %
+  \lispnarrowing = 0.3in
+  \tolerance = 700
+  \hfuzz = 1pt
+  \contentsrightmargin = 0pt
+  \defbodyindent = .5cm
+}}
+
+% Use @smallerbook to reset parameters for 6x9 trim size.
+% (Just testing, parameters still in flux.)
+\def\smallerbook{{\globaldefs = 1
+  \parskip = 1.5pt plus 1pt
+  \textleading = 12pt
+  %
+  \internalpagesizes{7.4in}{4.8in}%
+                    {-.2in}{-.4in}%
+                    {0pt}{14pt}%
+                    {9in}{6in}%
+  %
+  \lispnarrowing = 0.25in
+  \tolerance = 700
+  \hfuzz = 1pt
+  \contentsrightmargin = 0pt
+  \defbodyindent = .4cm
+}}
+
+% Use @afourpaper to print on European A4 paper.
+\def\afourpaper{{\globaldefs = 1
+  \parskip = 3pt plus 2pt minus 1pt
+  \textleading = 13.2pt
+  %
+  % Double-side printing via postscript on Laserjet 4050
+  % prints double-sided nicely when \bindingoffset=10mm and \hoffset=-6mm.
+  % To change the settings for a different printer or situation, adjust
+  % \normaloffset until the front-side and back-side texts align.  Then
+  % do the same for \bindingoffset.  You can set these for testing in
+  % your texinfo source file like this:
+  % @tex
+  % \global\normaloffset = -6mm
+  % \global\bindingoffset = 10mm
+  % @end tex
+  \internalpagesizes{673.2pt}{160mm}% that's 51 lines
+                    {\voffset}{\hoffset}%
+                    {\bindingoffset}{44pt}%
+                    {297mm}{210mm}%
+  %
+  \tolerance = 700
+  \hfuzz = 1pt
+  \contentsrightmargin = 0pt
+  \defbodyindent = 5mm
+}}
+
+% Use @afivepaper to print on European A5 paper.
+% From romildo@urano.iceb.ufop.br, 2 July 2000.
+% He also recommends making @example and @lisp be small.
+\def\afivepaper{{\globaldefs = 1
+  \parskip = 2pt plus 1pt minus 0.1pt
+  \textleading = 12.5pt
+  %
+  \internalpagesizes{160mm}{120mm}%
+                    {\voffset}{\hoffset}%
+                    {\bindingoffset}{8pt}%
+                    {210mm}{148mm}%
+  %
+  \lispnarrowing = 0.2in
+  \tolerance = 800
+  \hfuzz = 1.2pt
+  \contentsrightmargin = 0pt
+  \defbodyindent = 2mm
+  \tableindent = 12mm
+}}
+
+% A specific text layout, 24x15cm overall, intended for A4 paper.
+\def\afourlatex{{\globaldefs = 1
+  \afourpaper
+  \internalpagesizes{237mm}{150mm}%
+                    {\voffset}{4.6mm}%
+                    {\bindingoffset}{7mm}%
+                    {297mm}{210mm}%
+  %
+  % Must explicitly reset to 0 because we call \afourpaper.
+  \globaldefs = 0
+}}
+
+% Use @afourwide to print on A4 paper in landscape format.
+\def\afourwide{{\globaldefs = 1
+  \afourpaper
+  \internalpagesizes{241mm}{165mm}%
+                    {\voffset}{-2.95mm}%
+                    {\bindingoffset}{7mm}%
+                    {297mm}{210mm}%
+  \globaldefs = 0
+}}
+
+% @pagesizes TEXTHEIGHT[,TEXTWIDTH]
+% Perhaps we should allow setting the margins, \topskip, \parskip,
+% and/or leading, also. Or perhaps we should compute them somehow.
+%
+\parseargdef\pagesizes{\pagesizesyyy #1,,\finish}
+\def\pagesizesyyy#1,#2,#3\finish{{%
+  \setbox0 = \hbox{\ignorespaces #2}\ifdim\wd0 > 0pt \hsize=#2\relax \fi
+  \globaldefs = 1
+  %
+  \parskip = 3pt plus 2pt minus 1pt
+  \setleading{\textleading}%
+  %
+  \dimen0 = #1\relax
+  \advance\dimen0 by \voffset
+  %
+  \dimen2 = \hsize
+  \advance\dimen2 by \normaloffset
+  %
+  \internalpagesizes{#1}{\hsize}%
+                    {\voffset}{\normaloffset}%
+                    {\bindingoffset}{44pt}%
+                    {\dimen0}{\dimen2}%
+}}
+
+% Set default to letter.
+%
+\letterpaper
+
+
+\message{and turning on texinfo input format.}
+
+\def^^L{\par} % remove \outer, so ^L can appear in an @comment
+
+% DEL is a comment character, in case @c does not suffice.
+\catcode`\^^? = 14
+
+% Define macros to output various characters with catcode for normal text.
+\catcode`\"=\other \def\normaldoublequote{"}
+\catcode`\$=\other \def\normaldollar{$}%$ font-lock fix
+\catcode`\+=\other \def\normalplus{+}
+\catcode`\<=\other \def\normalless{<}
+\catcode`\>=\other \def\normalgreater{>}
+\catcode`\^=\other \def\normalcaret{^}
+\catcode`\_=\other \def\normalunderscore{_}
+\catcode`\|=\other \def\normalverticalbar{|}
+\catcode`\~=\other \def\normaltilde{~}
+
+% This macro is used to make a character print one way in \tt
+% (where it can probably be output as-is), and another way in other fonts,
+% where something hairier probably needs to be done.
+%
+% #1 is what to print if we are indeed using \tt; #2 is what to print
+% otherwise.  Since all the Computer Modern typewriter fonts have zero
+% interword stretch (and shrink), and it is reasonable to expect all
+% typewriter fonts to have this, we can check that font parameter.
+%
+\def\ifusingtt#1#2{\ifdim \fontdimen3\font=0pt #1\else #2\fi}
+
+% Same as above, but check for italic font.  Actually this also catches
+% non-italic slanted fonts since it is impossible to distinguish them from
+% italic fonts.  But since this is only used by $ and it uses \sl anyway
+% this is not a problem.
+\def\ifusingit#1#2{\ifdim \fontdimen1\font>0pt #1\else #2\fi}
+
+% Turn off all special characters except @
+% (and those which the user can use as if they were ordinary).
+% Most of these we simply print from the \tt font, but for some, we can
+% use math or other variants that look better in normal text.
+
+\catcode`\"=\active
+\def\activedoublequote{{\tt\char34}}
+\let"=\activedoublequote
+\catcode`\~=\active
+\def~{{\tt\char126}}
+\chardef\hat=`\^
+\catcode`\^=\active
+\def^{{\tt \hat}}
+
+\catcode`\_=\active
+\def_{\ifusingtt\normalunderscore\_}
+\let\realunder=_
+% Subroutine for the previous macro.
+\def\_{\leavevmode \kern.07em \vbox{\hrule width.3em height.1ex}\kern .07em }
+
+\catcode`\|=\active
+\def|{{\tt\char124}}
+\chardef \less=`\<
+\catcode`\<=\active
+\def<{{\tt \less}}
+\chardef \gtr=`\>
+\catcode`\>=\active
+\def>{{\tt \gtr}}
+\catcode`\+=\active
+\def+{{\tt \char 43}}
+\catcode`\$=\active
+\def${\ifusingit{{\sl\$}}\normaldollar}%$ font-lock fix
+
+% If a .fmt file is being used, characters that might appear in a file
+% name cannot be active until we have parsed the command line.
+% So turn them off again, and have \everyjob (or @setfilename) turn them on.
+% \otherifyactive is called near the end of this file.
+\def\otherifyactive{\catcode`+=\other \catcode`\_=\other}
+
+% Used sometimes to turn off (effectively) the active characters even after
+% parsing them.
+\def\turnoffactive{%
+  \normalturnoffactive
+  \otherbackslash
+}
+
+\catcode`\@=0
+
+% \backslashcurfont outputs one backslash character in current font,
+% as in \char`\\.
+\global\chardef\backslashcurfont=`\\
+\global\let\rawbackslashxx=\backslashcurfont  % let existing .??s files work
+
+% \realbackslash is an actual character `\' with catcode other, and
+% \doublebackslash is two of them (for the pdf outlines).
+{\catcode`\\=\other @gdef@realbackslash{\} @gdef@doublebackslash{\\}}
+
+% In texinfo, backslash is an active character; it prints the backslash
+% in fixed width font.
+\catcode`\\=\active  % @ for escape char from now on.
+
+% The story here is that in math mode, the \char of \backslashcurfont
+% ends up printing the roman \ from the math symbol font (because \char
+% in math mode uses the \mathcode, and plain.tex sets
+% \mathcode`\\="026E).  It seems better for @backslashchar{} to always
+% print a typewriter backslash, hence we use an explicit \mathchar,
+% which is the decimal equivalent of "715c (class 7, e.g., use \fam;
+% ignored family value; char position "5C).  We can't use " for the
+% usual hex value because it has already been made active.
+@def@normalbackslash{{@tt @ifmmode @mathchar29020 @else @backslashcurfont @fi}}
+@let@backslashchar = @normalbackslash % @backslashchar{} is for user documents.
+
+% On startup, @fixbackslash assigns:
+%  @let \ = @normalbackslash
+% \rawbackslash defines an active \ to do \backslashcurfont.
+% \otherbackslash defines an active \ to be a literal `\' character with
+% catcode other.  We switch back and forth between these.
+@gdef@rawbackslash{@let\=@backslashcurfont}
+@gdef@otherbackslash{@let\=@realbackslash}
+
+% Same as @turnoffactive except outputs \ as {\tt\char`\\} instead of
+% the literal character `\'.
+%
+@def@normalturnoffactive{%
+  @let"=@normaldoublequote
+  @let$=@normaldollar %$ font-lock fix
+  @let+=@normalplus
+  @let<=@normalless
+  @let>=@normalgreater
+  @let\=@normalbackslash
+  @let^=@normalcaret
+  @let_=@normalunderscore
+  @let|=@normalverticalbar
+  @let~=@normaltilde
+  @markupsetuplqdefault
+  @markupsetuprqdefault
+  @unsepspaces
+}
+
+% Make _ and + \other characters, temporarily.
+% This is canceled by @fixbackslash.
+@otherifyactive
+
+% If a .fmt file is being used, we don't want the `\input texinfo' to show up.
+% That is what \eatinput is for; after that, the `\' should revert to printing
+% a backslash.
+%
+@gdef@eatinput input texinfo{@fixbackslash}
+@global@let\ = @eatinput
+
+% On the other hand, perhaps the file did not have a `\input texinfo'. Then
+% the first `\' in the file would cause an error. This macro tries to fix
+% that, assuming it is called before the first `\' could plausibly occur.
+% Also turn back on active characters that might appear in the input
+% file name, in case not using a pre-dumped format.
+%
+@gdef@fixbackslash{%
+  @ifx\@eatinput @let\ = @normalbackslash @fi
+  @catcode`+=@active
+  @catcode`@_=@active
+}
+
+% Say @foo, not \foo, in error messages.
+@escapechar = `@@
+
+% These (along with & and #) are made active for url-breaking, so need
+% active definitions as the normal characters.
+@def@normaldot{.}
+@def@normalquest{?}
+@def@normalslash{/}
+
+% These look ok in all fonts, so just make them not special.
+% @hashchar{} gets its own user-level command, because of #line.
+@catcode`@& = @other @def@normalamp{&}
+@catcode`@# = @other @def@normalhash{#}
+@catcode`@% = @other @def@normalpercent{%}
+
+@let @hashchar = @normalhash
+
+@c Finally, make ` and ' active, so that txicodequoteundirected and
+@c txicodequotebacktick work right in, e.g., @w{@code{`foo'}}.  If we
+@c don't make ` and ' active, @code will not get them as active chars.
+@c Do this last of all since we use ` in the previous @catcode assignments.
+@catcode`@'=@active
+@catcode`@`=@active
+@markupsetuplqdefault
+@markupsetuprqdefault
+
+@c Local variables:
+@c eval: (add-hook 'write-file-hooks 'time-stamp)
+@c page-delimiter: "^\\\\message"
+@c time-stamp-start: "def\\\\texinfoversion{"
+@c time-stamp-format: "%:y-%02m-%02d.%02H"
+@c time-stamp-end: "}"
+@c End:
+
+@c vim:sw=2:
+
+@ignore
+   arch-tag: e1b36e32-c96e-4135-a41a-0b2efa2ea115
+@end ignore
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/threads.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/threads.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,219 @@
+@node Multi-threaded FFTW, Distributed-memory FFTW with MPI, FFTW Reference, Top
+@chapter Multi-threaded FFTW
+
+@cindex parallel transform
+In this chapter we document the parallel FFTW routines for
+shared-memory parallel hardware.  These routines, which support
+parallel one- and multi-dimensional transforms of both real and
+complex data, are the easiest way to take advantage of multiple
+processors with FFTW.  They work just like the corresponding
+uniprocessor transform routines, except that you have an extra
+initialization routine to call, and there is a routine to set the
+number of threads to employ.  Any program that uses the uniprocessor
+FFTW can therefore be trivially modified to use the multi-threaded
+FFTW.
+
+A shared-memory machine is one in which all CPUs can directly access
+the same main memory, and such machines are now common due to the
+ubiquity of multi-core CPUs.  FFTW's multi-threading support allows
+you to utilize these additional CPUs transparently from a single
+program.  However, this does not necessarily translate into
+performance gains---when multiple threads/CPUs are employed, there is
+an overhead required for synchronization that may outweigh the
+computatational parallelism.  Therefore, you can only benefit from
+threads if your problem is sufficiently large.
+@cindex shared-memory
+@cindex threads
+
+@menu
+* Installation and Supported Hardware/Software::  
+* Usage of Multi-threaded FFTW::  
+* How Many Threads to Use?::    
+* Thread safety::               
+@end menu
+
+@c ------------------------------------------------------------
+@node Installation and Supported Hardware/Software, Usage of Multi-threaded FFTW, Multi-threaded FFTW, Multi-threaded FFTW
+@section Installation and Supported Hardware/Software
+
+All of the FFTW threads code is located in the @code{threads}
+subdirectory of the FFTW package.  On Unix systems, the FFTW threads
+libraries and header files can be automatically configured, compiled,
+and installed along with the uniprocessor FFTW libraries simply by
+including @code{--enable-threads} in the flags to the @code{configure}
+script (@pxref{Installation on Unix}), or @code{--enable-openmp} to use
+@uref{http://www.openmp.org,OpenMP} threads.
+@fpindex configure
+
+
+@cindex portability
+@cindex OpenMP
+The threads routines require your operating system to have some sort
+of shared-memory threads support.  Specifically, the FFTW threads
+package works with POSIX threads (available on most Unix variants,
+from GNU/Linux to MacOS X) and Win32 threads.  OpenMP threads, which
+are supported in many common compilers (e.g. gcc) are also supported,
+and may give better performance on some systems.  (OpenMP threads are
+also useful if you are employing OpenMP in your own code, in order to
+minimize conflicts between threading models.)  If you have a
+shared-memory machine that uses a different threads API, it should be
+a simple matter of programming to include support for it; see the file
+@code{threads/threads.c} for more detail.
+
+You can compile FFTW with @emph{both} @code{--enable-threads} and
+@code{--enable-openmp} at the same time, since they install libraries
+with different names (@samp{fftw3_threads} and @samp{fftw3_omp}, as
+described below).  However, your programs may only link to @emph{one}
+of these two libraries at a time.
+
+Ideally, of course, you should also have multiple processors in order to
+get any benefit from the threaded transforms.
+
+@c ------------------------------------------------------------
+@node Usage of Multi-threaded FFTW, How Many Threads to Use?, Installation and Supported Hardware/Software, Multi-threaded FFTW
+@section Usage of Multi-threaded FFTW
+
+Here, it is assumed that the reader is already familiar with the usage
+of the uniprocessor FFTW routines, described elsewhere in this manual.
+We only describe what one has to change in order to use the
+multi-threaded routines.
+
+@cindex OpenMP
+First, programs using the parallel complex transforms should be linked
+with @code{-lfftw3_threads -lfftw3 -lm} on Unix, or @code{-lfftw3_omp
+-lfftw3 -lm} if you compiled with OpenMP. You will also need to link
+with whatever library is responsible for threads on your system
+(e.g. @code{-lpthread} on GNU/Linux) or include whatever compiler flag
+enables OpenMP (e.g. @code{-fopenmp} with gcc).
+@cindex linking on Unix
+
+
+Second, before calling @emph{any} FFTW routines, you should call the
+function:
+
+@example
+int fftw_init_threads(void);
+@end example
+@findex fftw_init_threads
+
+This function, which need only be called once, performs any one-time
+initialization required to use threads on your system.  It returns zero
+if there was some error (which should not happen under normal
+circumstances) and a non-zero value otherwise.
+
+Third, before creating a plan that you want to parallelize, you should
+call:
+
+@example
+void fftw_plan_with_nthreads(int nthreads);
+@end example
+@findex fftw_plan_with_nthreads
+
+The @code{nthreads} argument indicates the number of threads you want
+FFTW to use (or actually, the maximum number).  All plans subsequently
+created with any planner routine will use that many threads.  You can
+call @code{fftw_plan_with_nthreads}, create some plans, call
+@code{fftw_plan_with_nthreads} again with a different argument, and
+create some more plans for a new number of threads.  Plans already created
+before a call to @code{fftw_plan_with_nthreads} are unaffected.  If you
+pass an @code{nthreads} argument of @code{1} (the default), threads are
+disabled for subsequent plans.
+
+@cindex OpenMP
+With OpenMP, to configure FFTW to use all of the currently running
+OpenMP threads (set by @code{omp_set_num_threads(nthreads)} or by the
+@code{OMP_NUM_THREADS} environment variable), you can do:
+@code{fftw_plan_with_nthreads(omp_get_max_threads())}. (The @samp{omp_}
+OpenMP functions are declared via @code{#include <omp.h>}.)
+
+@cindex thread safety
+Given a plan, you then execute it as usual with
+@code{fftw_execute(plan)}, and the execution will use the number of
+threads specified when the plan was created.  When done, you destroy
+it as usual with @code{fftw_destroy_plan}.  As described in
+@ref{Thread safety}, plan @emph{execution} is thread-safe, but plan
+creation and destruction are @emph{not}: you should create/destroy
+plans only from a single thread, but can safely execute multiple plans
+in parallel.
+
+There is one additional routine: if you want to get rid of all memory
+and other resources allocated internally by FFTW, you can call:
+
+@example
+void fftw_cleanup_threads(void);
+@end example
+@findex fftw_cleanup_threads
+
+which is much like the @code{fftw_cleanup()} function except that it
+also gets rid of threads-related data.  You must @emph{not} execute any
+previously created plans after calling this function.
+
+We should also mention one other restriction: if you save wisdom from a
+program using the multi-threaded FFTW, that wisdom @emph{cannot be used}
+by a program using only the single-threaded FFTW (i.e. not calling
+@code{fftw_init_threads}).  @xref{Words of Wisdom-Saving Plans}.
+
+@c ------------------------------------------------------------
+@node How Many Threads to Use?, Thread safety, Usage of Multi-threaded FFTW, Multi-threaded FFTW
+@section How Many Threads to Use?
+
+@cindex number of threads
+There is a fair amount of overhead involved in synchronizing threads,
+so the optimal number of threads to use depends upon the size of the
+transform as well as on the number of processors you have.
+
+As a general rule, you don't want to use more threads than you have
+processors.  (Using more threads will work, but there will be extra
+overhead with no benefit.)  In fact, if the problem size is too small,
+you may want to use fewer threads than you have processors.
+
+You will have to experiment with your system to see what level of
+parallelization is best for your problem size.  Typically, the problem
+will have to involve at least a few thousand data points before threads
+become beneficial.  If you plan with @code{FFTW_PATIENT}, it will
+automatically disable threads for sizes that don't benefit from
+parallelization.
+@ctindex FFTW_PATIENT
+
+@c ------------------------------------------------------------
+@node Thread safety,  , How Many Threads to Use?, Multi-threaded FFTW
+@section Thread safety
+
+@cindex threads
+@cindex OpenMP
+@cindex thread safety
+Users writing multi-threaded programs (including OpenMP) must concern
+themselves with the @dfn{thread safety} of the libraries they
+use---that is, whether it is safe to call routines in parallel from
+multiple threads.  FFTW can be used in such an environment, but some
+care must be taken because the planner routines share data
+(e.g. wisdom and trigonometric tables) between calls and plans.
+
+The upshot is that the only thread-safe (re-entrant) routine in FFTW is
+@code{fftw_execute} (and the new-array variants thereof).  All other routines
+(e.g. the planner) should only be called from one thread at a time.  So,
+for example, you can wrap a semaphore lock around any calls to the
+planner; even more simply, you can just create all of your plans from
+one thread.  We do not think this should be an important restriction
+(FFTW is designed for the situation where the only performance-sensitive
+code is the actual execution of the transform), and the benefits of
+shared data between plans are great.
+
+Note also that, since the plan is not modified by @code{fftw_execute},
+it is safe to execute the @emph{same plan} in parallel by multiple
+threads.  However, since a given plan operates by default on a fixed
+array, you need to use one of the new-array execute functions (@pxref{New-array Execute Functions}) so that different threads compute the transform of different data.
+
+(Users should note that these comments only apply to programs using
+shared-memory threads or OpenMP.  Parallelism using MPI or forked processes
+involves a separate address-space and global variables for each process,
+and is not susceptible to problems of this sort.)
+
+If you are configured FFTW with the @code{--enable-debug} or
+@code{--enable-debug-malloc} flags (@pxref{Installation on Unix}),
+then @code{fftw_execute} is not thread-safe.  These flags are not
+documented because they are intended only for developing
+and debugging FFTW, but if you must use @code{--enable-debug} then you
+should also specifically pass @code{--disable-debug-malloc} for
+@code{fftw_execute} to be thread-safe.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/tutorial.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/tutorial.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,905 @@
+@node  Tutorial, Other Important Topics, Introduction, Top
+@chapter Tutorial
+@menu
+* Complex One-Dimensional DFTs::  
+* Complex Multi-Dimensional DFTs::  
+* One-Dimensional DFTs of Real Data::  
+* Multi-Dimensional DFTs of Real Data::  
+* More DFTs of Real Data::      
+@end menu
+
+This chapter describes the basic usage of FFTW, i.e., how to compute
+@cindex basic interface
+the Fourier transform of a single array.  This chapter tells the
+truth, but not the @emph{whole} truth. Specifically, FFTW implements
+additional routines and flags that are not documented here, although
+in many cases we try to indicate where added capabilities exist.  For
+more complete information, see @ref{FFTW Reference}.  (Note that you
+need to compile and install FFTW before you can use it in a program.
+For the details of the installation, see @ref{Installation and
+Customization}.)
+
+We recommend that you read this tutorial in order.@footnote{You can
+read the tutorial in bit-reversed order after computing your first
+transform.}  At the least, read the first section (@pxref{Complex
+One-Dimensional DFTs}) before reading any of the others, even if your
+main interest lies in one of the other transform types.
+
+Users of FFTW version 2 and earlier may also want to read @ref{Upgrading
+from FFTW version 2}.
+
+@c ------------------------------------------------------------
+@node Complex One-Dimensional DFTs, Complex Multi-Dimensional DFTs, Tutorial, Tutorial
+@section Complex One-Dimensional DFTs
+
+@quotation
+Plan: To bother about the best method of accomplishing an accidental result.
+[Ambrose Bierce, @cite{The Enlarged Devil's Dictionary}.]
+@cindex Devil
+@end quotation
+
+@iftex
+@medskip
+@end iftex
+
+The basic usage of FFTW to compute a one-dimensional DFT of size
+@code{N} is simple, and it typically looks something like this code:
+
+@example
+#include <fftw3.h>
+...
+@{
+    fftw_complex *in, *out;
+    fftw_plan p;
+    ...
+    in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+    out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+    p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+    ...
+    fftw_execute(p); /* @r{repeat as needed} */
+    ...
+    fftw_destroy_plan(p);
+    fftw_free(in); fftw_free(out);
+@}
+@end example
+
+You must link this code with the @code{fftw3} library.  On Unix systems,
+link with @code{-lfftw3 -lm}.
+
+The example code first allocates the input and output arrays.  You can
+allocate them in any way that you like, but we recommend using
+@code{fftw_malloc}, which behaves like
+@findex fftw_malloc
+@code{malloc} except that it properly aligns the array when SIMD
+instructions (such as SSE and Altivec) are available (@pxref{SIMD
+alignment and fftw_malloc}). [Alternatively, we provide a convenient wrapper function @code{fftw_alloc_complex(N)} which has the same effect.]
+@findex fftw_alloc_complex
+@cindex SIMD
+
+
+The data is an array of type @code{fftw_complex}, which is by default a
+@code{double[2]} composed of the real (@code{in[i][0]}) and imaginary
+(@code{in[i][1]}) parts of a complex number.
+@tindex fftw_complex
+
+The next step is to create a @dfn{plan}, which is an object
+@cindex plan
+that contains all the data that FFTW needs to compute the FFT. 
+This function creates the plan:
+
+@example
+fftw_plan fftw_plan_dft_1d(int n, fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+@end example
+@findex fftw_plan_dft_1d
+@tindex fftw_plan
+
+The first argument, @code{n}, is the size of the transform you are
+trying to compute.  The size @code{n} can be any positive integer, but
+sizes that are products of small factors are transformed most
+efficiently (although prime sizes still use an @Onlogn{} algorithm).
+
+The next two arguments are pointers to the input and output arrays of
+the transform.  These pointers can be equal, indicating an
+@dfn{in-place} transform.
+@cindex in-place
+
+
+The fourth argument, @code{sign}, can be either @code{FFTW_FORWARD}
+(@code{-1}) or @code{FFTW_BACKWARD} (@code{+1}),
+@ctindex FFTW_FORWARD
+@ctindex FFTW_BACKWARD
+and indicates the direction of the transform you are interested in;
+technically, it is the sign of the exponent in the transform.  
+
+The @code{flags} argument is usually either @code{FFTW_MEASURE} or
+@cindex flags
+@code{FFTW_ESTIMATE}.  @code{FFTW_MEASURE} instructs FFTW to run
+@ctindex FFTW_MEASURE
+and measure the execution time of several FFTs in order to find the
+best way to compute the transform of size @code{n}.  This process takes
+some time (usually a few seconds), depending on your machine and on
+the size of the transform.  @code{FFTW_ESTIMATE}, on the contrary,
+does not run any computation and just builds a
+@ctindex FFTW_ESTIMATE
+reasonable plan that is probably sub-optimal.  In short, if your
+program performs many transforms of the same size and initialization
+time is not important, use @code{FFTW_MEASURE}; otherwise use the
+estimate.  
+
+@emph{You must create the plan before initializing the input}, because
+@code{FFTW_MEASURE} overwrites the @code{in}/@code{out} arrays.
+(Technically, @code{FFTW_ESTIMATE} does not touch your arrays, but you
+should always create plans first just to be sure.)
+
+Once the plan has been created, you can use it as many times as you
+like for transforms on the specified @code{in}/@code{out} arrays,
+computing the actual transforms via @code{fftw_execute(plan)}:
+@example
+void fftw_execute(const fftw_plan plan);
+@end example
+@findex fftw_execute
+
+The DFT results are stored in-order in the array @code{out}, with the
+zero-frequency (DC) component in @code{out[0]}.
+@cindex frequency
+If @code{in != out}, the transform is @dfn{out-of-place} and the input
+array @code{in} is not modified.  Otherwise, the input array is
+overwritten with the transform.
+
+@cindex execute
+If you want to transform a @emph{different} array of the same size, you
+can create a new plan with @code{fftw_plan_dft_1d} and FFTW
+automatically reuses the information from the previous plan, if
+possible.  Alternatively, with the ``guru'' interface you can apply a
+given plan to a different array, if you are careful.
+@xref{FFTW Reference}.
+
+When you are done with the plan, you deallocate it by calling
+@code{fftw_destroy_plan(plan)}:
+@example
+void fftw_destroy_plan(fftw_plan plan);
+@end example
+@findex fftw_destroy_plan
+If you allocate an array with @code{fftw_malloc()} you must deallocate
+it with @code{fftw_free()}.  Do not use @code{free()} or, heaven
+forbid, @code{delete}.
+@findex fftw_free
+
+FFTW computes an @emph{unnormalized} DFT.  Thus, computing a forward
+followed by a backward transform (or vice versa) results in the original
+array scaled by @code{n}.  For the definition of the DFT, see @ref{What
+FFTW Really Computes}.
+@cindex DFT
+@cindex normalization
+
+
+If you have a C compiler, such as @code{gcc}, that supports the
+C99 standard, and you @code{#include <complex.h>} @emph{before}
+@code{<fftw3.h>}, then @code{fftw_complex} is the native
+double-precision complex type and you can manipulate it with ordinary
+arithmetic.  Otherwise, FFTW defines its own complex type, which is
+bit-compatible with the C99 complex type. @xref{Complex numbers}.
+(The C++ @code{<complex>} template class may also be usable via a
+typecast.)
+@cindex C++
+
+To use single or long-double precision versions of FFTW, replace the
+@code{fftw_} prefix by @code{fftwf_} or @code{fftwl_} and link with
+@code{-lfftw3f} or @code{-lfftw3l}, but use the @emph{same}
+@code{<fftw3.h>} header file.
+@cindex precision
+
+
+Many more flags exist besides @code{FFTW_MEASURE} and
+@code{FFTW_ESTIMATE}.  For example, use @code{FFTW_PATIENT} if you're
+willing to wait even longer for a possibly even faster plan (@pxref{FFTW
+Reference}).
+@ctindex FFTW_PATIENT
+You can also save plans for future use, as described by @ref{Words of
+Wisdom-Saving Plans}.
+
+@c ------------------------------------------------------------
+@node Complex Multi-Dimensional DFTs, One-Dimensional DFTs of Real Data, Complex One-Dimensional DFTs, Tutorial
+@section Complex Multi-Dimensional DFTs
+
+Multi-dimensional transforms work much the same way as one-dimensional
+transforms: you allocate arrays of @code{fftw_complex} (preferably
+using @code{fftw_malloc}), create an @code{fftw_plan}, execute it as
+many times as you want with @code{fftw_execute(plan)}, and clean up
+with @code{fftw_destroy_plan(plan)} (and @code{fftw_free}).  
+
+FFTW provides two routines for creating plans for 2d and 3d transforms,
+and one routine for creating plans of arbitrary dimensionality.
+The 2d and 3d routines have the following signature:
+@example
+fftw_plan fftw_plan_dft_2d(int n0, int n1,
+                           fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+fftw_plan fftw_plan_dft_3d(int n0, int n1, int n2,
+                           fftw_complex *in, fftw_complex *out,
+                           int sign, unsigned flags);
+@end example
+@findex fftw_plan_dft_2d
+@findex fftw_plan_dft_3d
+
+These routines create plans for @code{n0} by @code{n1} two-dimensional
+(2d) transforms and @code{n0} by @code{n1} by @code{n2} 3d transforms,
+respectively.  All of these transforms operate on contiguous arrays in
+the C-standard @dfn{row-major} order, so that the last dimension has the
+fastest-varying index in the array.  This layout is described further in
+@ref{Multi-dimensional Array Format}.
+
+FFTW can also compute transforms of higher dimensionality.  In order to
+avoid confusion between the various meanings of the the word
+``dimension'', we use the term @emph{rank}
+@cindex rank
+to denote the number of independent indices in an array.@footnote{The
+term ``rank'' is commonly used in the APL, FORTRAN, and Common Lisp
+traditions, although it is not so common in the C@tie{}world.}  For
+example, we say that a 2d transform has rank@tie{}2, a 3d transform has
+rank@tie{}3, and so on.  You can plan transforms of arbitrary rank by
+means of the following function:
+
+@example
+fftw_plan fftw_plan_dft(int rank, const int *n,
+                        fftw_complex *in, fftw_complex *out,
+                        int sign, unsigned flags);
+@end example
+@findex fftw_plan_dft
+
+Here, @code{n} is a pointer to an array @code{n[rank]} denoting an
+@code{n[0]} by @code{n[1]} by @dots{} by @code{n[rank-1]} transform.
+Thus, for example, the call
+@example
+fftw_plan_dft_2d(n0, n1, in, out, sign, flags);
+@end example
+is equivalent to the following code fragment:
+@example
+int n[2];
+n[0] = n0;
+n[1] = n1;
+fftw_plan_dft(2, n, in, out, sign, flags);
+@end example
+@code{fftw_plan_dft} is not restricted to 2d and 3d transforms,
+however, but it can plan transforms of arbitrary rank.
+
+You may have noticed that all the planner routines described so far
+have overlapping functionality.  For example, you can plan a 1d or 2d
+transform by using @code{fftw_plan_dft} with a @code{rank} of @code{1}
+or @code{2}, or even by calling @code{fftw_plan_dft_3d} with @code{n0}
+and/or @code{n1} equal to @code{1} (with no loss in efficiency).  This
+pattern continues, and FFTW's planning routines in general form a
+``partial order,'' sequences of
+@cindex partial order
+interfaces with strictly increasing generality but correspondingly
+greater complexity.
+
+@code{fftw_plan_dft} is the most general complex-DFT routine that we
+describe in this tutorial, but there are also the advanced and guru interfaces,
+@cindex advanced interface
+@cindex guru interface 
+which allow one to efficiently combine multiple/strided transforms
+into a single FFTW plan, transform a subset of a larger
+multi-dimensional array, and/or to handle more general complex-number
+formats.  For more information, see @ref{FFTW Reference}.
+
+@c ------------------------------------------------------------
+@node One-Dimensional DFTs of Real Data, Multi-Dimensional DFTs of Real Data, Complex Multi-Dimensional DFTs, Tutorial
+@section One-Dimensional DFTs of Real Data
+
+In many practical applications, the input data @code{in[i]} are purely
+real numbers, in which case the DFT output satisfies the ``Hermitian''
+@cindex Hermitian
+redundancy: @code{out[i]} is the conjugate of @code{out[n-i]}.  It is
+possible to take advantage of these circumstances in order to achieve
+roughly a factor of two improvement in both speed and memory usage.
+
+In exchange for these speed and space advantages, the user sacrifices
+some of the simplicity of FFTW's complex transforms. First of all, the
+input and output arrays are of @emph{different sizes and types}: the
+input is @code{n} real numbers, while the output is @code{n/2+1}
+complex numbers (the non-redundant outputs); this also requires slight
+``padding'' of the input array for
+@cindex padding
+in-place transforms.  Second, the inverse transform (complex to real)
+has the side-effect of @emph{overwriting its input array}, by default.
+Neither of these inconveniences should pose a serious problem for
+users, but it is important to be aware of them.
+
+The routines to perform real-data transforms are almost the same as
+those for complex transforms: you allocate arrays of @code{double}
+and/or @code{fftw_complex} (preferably using @code{fftw_malloc} or
+@code{fftw_alloc_complex}), create an @code{fftw_plan}, execute it as
+many times as you want with @code{fftw_execute(plan)}, and clean up
+with @code{fftw_destroy_plan(plan)} (and @code{fftw_free}).  The only
+differences are that the input (or output) is of type @code{double}
+and there are new routines to create the plan.  In one dimension:
+
+@example
+fftw_plan fftw_plan_dft_r2c_1d(int n, double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_c2r_1d(int n, fftw_complex *in, double *out,
+                               unsigned flags);
+@end example
+@findex fftw_plan_dft_r2c_1d
+@findex fftw_plan_dft_c2r_1d
+
+for the real input to complex-Hermitian output (@dfn{r2c}) and
+complex-Hermitian input to real output (@dfn{c2r}) transforms.
+@cindex r2c
+@cindex c2r
+Unlike the complex DFT planner, there is no @code{sign} argument.
+Instead, r2c DFTs are always @code{FFTW_FORWARD} and c2r DFTs are
+always @code{FFTW_BACKWARD}.
+@ctindex FFTW_FORWARD
+@ctindex FFTW_BACKWARD
+(For single/long-double precision
+@code{fftwf} and @code{fftwl}, @code{double} should be replaced by
+@code{float} and @code{long double}, respectively.)
+@cindex precision
+
+
+Here, @code{n} is the ``logical'' size of the DFT, not necessarily the
+physical size of the array.  In particular, the real (@code{double})
+array has @code{n} elements, while the complex (@code{fftw_complex})
+array has @code{n/2+1} elements (where the division is rounded down).
+For an in-place transform,
+@cindex in-place
+@code{in} and @code{out} are aliased to the same array, which must be
+big enough to hold both; so, the real array would actually have
+@code{2*(n/2+1)} elements, where the elements beyond the first
+@code{n} are unused padding.  (Note that this is very different from
+the concept of ``zero-padding'' a transform to a larger length, which
+changes the logical size of the DFT by actually adding new input
+data.)  The @math{k}th element of the complex array is exactly the
+same as the @math{k}th element of the corresponding complex DFT.  All
+positive @code{n} are supported; products of small factors are most
+efficient, but an @Onlogn algorithm is used even for prime sizes.
+
+As noted above, the c2r transform destroys its input array even for
+out-of-place transforms.  This can be prevented, if necessary, by
+including @code{FFTW_PRESERVE_INPUT} in the @code{flags}, with
+unfortunately some sacrifice in performance.
+@cindex flags
+@ctindex FFTW_PRESERVE_INPUT
+This flag is also not currently supported for multi-dimensional real
+DFTs (next section).
+
+Readers familiar with DFTs of real data will recall that the 0th (the
+``DC'') and @code{n/2}-th (the ``Nyquist'' frequency, when @code{n} is
+even) elements of the complex output are purely real.  Some
+implementations therefore store the Nyquist element where the DC
+imaginary part would go, in order to make the input and output arrays
+the same size.  Such packing, however, does not generalize well to
+multi-dimensional transforms, and the space savings are miniscule in
+any case; FFTW does not support it.
+
+An alternative interface for one-dimensional r2c and c2r DFTs can be
+found in the @samp{r2r} interface (@pxref{The Halfcomplex-format
+DFT}), with ``halfcomplex''-format output that @emph{is} the same size
+(and type) as the input array.
+@cindex halfcomplex format
+That interface, although it is not very useful for multi-dimensional
+transforms, may sometimes yield better performance.
+
+@c ------------------------------------------------------------
+@node Multi-Dimensional DFTs of Real Data, More DFTs of Real Data, One-Dimensional DFTs of Real Data, Tutorial
+@section Multi-Dimensional DFTs of Real Data
+
+Multi-dimensional DFTs of real data use the following planner routines:
+
+@example
+fftw_plan fftw_plan_dft_r2c_2d(int n0, int n1,
+                               double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_r2c_3d(int n0, int n1, int n2,
+                               double *in, fftw_complex *out,
+                               unsigned flags);
+fftw_plan fftw_plan_dft_r2c(int rank, const int *n,
+                            double *in, fftw_complex *out,
+                            unsigned flags);
+@end example
+@findex fftw_plan_dft_r2c_2d
+@findex fftw_plan_dft_r2c_3d
+@findex fftw_plan_dft_r2c
+
+as well as the corresponding @code{c2r} routines with the input/output
+types swapped.  These routines work similarly to their complex
+analogues, except for the fact that here the complex output array is cut
+roughly in half and the real array requires padding for in-place
+transforms (as in 1d, above).
+
+As before, @code{n} is the logical size of the array, and the
+consequences of this on the the format of the complex arrays deserve
+careful attention.
+@cindex r2c/c2r multi-dimensional array format
+Suppose that the real data has dimensions @ndims (in row-major order).
+Then, after an r2c transform, the output is an @ndimshalf array of
+@code{fftw_complex} values in row-major order, corresponding to slightly
+over half of the output of the corresponding complex DFT.  (The division
+is rounded down.)  The ordering of the data is otherwise exactly the
+same as in the complex-DFT case.
+
+For out-of-place transforms, this is the end of the story: the real
+data is stored as a row-major array of size @ndims and the complex
+data is stored as a row-major array of size @ndimshalf{}.
+
+For in-place transforms, however, extra padding of the real-data array
+is necessary because the complex array is larger than the real array,
+and the two arrays share the same memory locations.  Thus, for
+in-place transforms, the final dimension of the real-data array must
+be padded with extra values to accommodate the size of the complex
+data---two values if the last dimension is even and one if it is odd.
+@cindex padding
+That is, the last dimension of the real data must physically contain
+@tex
+$2 (n_{d-1}/2+1)$
+@end tex
+@ifinfo
+2 * (n[d-1]/2+1)
+@end ifinfo
+@html
+2 * (n<sub>d-1</sub>/2+1)
+@end html
+@code{double} values (exactly enough to hold the complex data).
+This physical array size does not, however, change the @emph{logical}
+array size---only
+@tex
+$n_{d-1}$
+@end tex
+@ifinfo
+n[d-1]
+@end ifinfo
+@html
+n<sub>d-1</sub>
+@end html
+values are actually stored in the last dimension, and
+@tex
+$n_{d-1}$
+@end tex
+@ifinfo
+n[d-1]
+@end ifinfo
+@html
+n<sub>d-1</sub>
+@end html
+is the last dimension passed to the plan-creation routine.
+
+For example, consider the transform of a two-dimensional real array of
+size @code{n0} by @code{n1}.  The output of the r2c transform is a
+two-dimensional complex array of size @code{n0} by @code{n1/2+1}, where
+the @code{y} dimension has been cut nearly in half because of
+redundancies in the output.  Because @code{fftw_complex} is twice the
+size of @code{double}, the output array is slightly bigger than the
+input array.  Thus, if we want to compute the transform in place, we
+must @emph{pad} the input array so that it is of size @code{n0} by
+@code{2*(n1/2+1)}.  If @code{n1} is even, then there are two padding
+elements at the end of each row (which need not be initialized, as they
+are only used for output).
+
+@ifhtml
+The following illustration depicts the input and output arrays just
+described, for both the out-of-place and in-place transforms (with the
+arrows indicating consecutive memory locations):
+@image{rfftwnd-for-html}
+@end ifhtml
+@ifnotinfo
+@ifnothtml
+@float Figure,fig:rfftwnd
+@center @image{rfftwnd}
+@caption{Illustration of the data layout for a 2d @code{nx} by @code{ny}
+real-to-complex transform.}
+@end float
+@ref{fig:rfftwnd} depicts the input and output arrays just
+described, for both the out-of-place and in-place transforms (with the
+arrows indicating consecutive memory locations):
+@end ifnothtml
+@end ifnotinfo
+
+These transforms are unnormalized, so an r2c followed by a c2r
+transform (or vice versa) will result in the original data scaled by
+the number of real data elements---that is, the product of the
+(logical) dimensions of the real data.
+@cindex normalization
+
+
+(Because the last dimension is treated specially, if it is equal to
+@code{1} the transform is @emph{not} equivalent to a lower-dimensional
+r2c/c2r transform.  In that case, the last complex dimension also has
+size @code{1} (@code{=1/2+1}), and no advantage is gained over the
+complex transforms.)
+
+@c ------------------------------------------------------------
+@node More DFTs of Real Data,  , Multi-Dimensional DFTs of Real Data, Tutorial
+@section More DFTs of Real Data
+@menu
+* The Halfcomplex-format DFT::  
+* Real even/odd DFTs (cosine/sine transforms)::  
+* The Discrete Hartley Transform::  
+@end menu
+
+FFTW supports several other transform types via a unified @dfn{r2r}
+(real-to-real) interface,
+@cindex r2r
+so called because it takes a real (@code{double}) array and outputs a
+real array of the same size.  These r2r transforms currently fall into
+three categories: DFTs of real input and complex-Hermitian output in
+halfcomplex format, DFTs of real input with even/odd symmetry
+(a.k.a. discrete cosine/sine transforms, DCTs/DSTs), and discrete
+Hartley transforms (DHTs), all described in more detail by the
+following sections.
+
+The r2r transforms follow the by now familiar interface of creating an
+@code{fftw_plan}, executing it with @code{fftw_execute(plan)}, and
+destroying it with @code{fftw_destroy_plan(plan)}.  Furthermore, all
+r2r transforms share the same planner interface:
+
+@example
+fftw_plan fftw_plan_r2r_1d(int n, double *in, double *out,
+                           fftw_r2r_kind kind, unsigned flags);
+fftw_plan fftw_plan_r2r_2d(int n0, int n1, double *in, double *out,
+                           fftw_r2r_kind kind0, fftw_r2r_kind kind1,
+                           unsigned flags);
+fftw_plan fftw_plan_r2r_3d(int n0, int n1, int n2,
+                           double *in, double *out,
+                           fftw_r2r_kind kind0,
+                           fftw_r2r_kind kind1,
+                           fftw_r2r_kind kind2,
+                           unsigned flags);
+fftw_plan fftw_plan_r2r(int rank, const int *n, double *in, double *out,
+                        const fftw_r2r_kind *kind, unsigned flags);
+@end example
+@findex fftw_plan_r2r_1d
+@findex fftw_plan_r2r_2d
+@findex fftw_plan_r2r_3d
+@findex fftw_plan_r2r
+
+Just as for the complex DFT, these plan 1d/2d/3d/multi-dimensional
+transforms for contiguous arrays in row-major order, transforming (real)
+input to output of the same size, where @code{n} specifies the
+@emph{physical} dimensions of the arrays.  All positive @code{n} are
+supported (with the exception of @code{n=1} for the @code{FFTW_REDFT00}
+kind, noted in the real-even subsection below); products of small
+factors are most efficient (factorizing @code{n-1} and @code{n+1} for
+@code{FFTW_REDFT00} and @code{FFTW_RODFT00} kinds, described below), but
+an @Onlogn algorithm is used even for prime sizes.
+
+Each dimension has a @dfn{kind} parameter, of type
+@code{fftw_r2r_kind}, specifying the kind of r2r transform to be used
+for that dimension.
+@cindex kind (r2r)
+@tindex fftw_r2r_kind
+(In the case of @code{fftw_plan_r2r}, this is an array @code{kind[rank]}
+where @code{kind[i]} is the transform kind for the dimension
+@code{n[i]}.)  The kind can be one of a set of predefined constants,
+defined in the following subsections.
+
+In other words, FFTW computes the separable product of the specified
+r2r transforms over each dimension, which can be used e.g. for partial
+differential equations with mixed boundary conditions.  (For some r2r
+kinds, notably the halfcomplex DFT and the DHT, such a separable
+product is somewhat problematic in more than one dimension, however,
+as is described below.)
+
+In the current version of FFTW, all r2r transforms except for the
+halfcomplex type are computed via pre- or post-processing of
+halfcomplex transforms, and they are therefore not as fast as they
+could be.  Since most other general DCT/DST codes employ a similar
+algorithm, however, FFTW's implementation should provide at least
+competitive performance.
+
+@c =========>
+@node The Halfcomplex-format DFT, Real even/odd DFTs (cosine/sine transforms), More DFTs of Real Data, More DFTs of Real Data
+@subsection The Halfcomplex-format DFT
+
+An r2r kind of @code{FFTW_R2HC} (@dfn{r2hc}) corresponds to an r2c DFT
+@ctindex FFTW_R2HC
+@cindex r2c
+@cindex r2hc
+(@pxref{One-Dimensional DFTs of Real Data}) but with ``halfcomplex''
+format output, and may sometimes be faster and/or more convenient than
+the latter.
+@cindex halfcomplex format
+The inverse @dfn{hc2r} transform is of kind @code{FFTW_HC2R}.
+@ctindex FFTW_HC2R
+@cindex hc2r
+This consists of the non-redundant half of the complex output for a 1d
+real-input DFT of size @code{n}, stored as a sequence of @code{n} real
+numbers (@code{double}) in the format:
+
+@tex
+$$
+r_0, r_1, r_2, \ldots, r_{n/2}, i_{(n+1)/2-1}, \ldots, i_2, i_1
+$$
+@end tex
+@ifinfo
+r0, r1, r2, r(n/2), i((n+1)/2-1), ..., i2, i1
+@end ifinfo
+@html
+<p align=center>
+r<sub>0</sub>, r<sub>1</sub>, r<sub>2</sub>, ..., r<sub>n/2</sub>, i<sub>(n+1)/2-1</sub>, ..., i<sub>2</sub>, i<sub>1</sub>
+</p>
+@end html
+
+Here,
+@ifinfo
+rk
+@end ifinfo
+@tex
+$r_k$
+@end tex
+@html
+r<sub>k</sub>
+@end html
+is the real part of the @math{k}th output, and
+@ifinfo
+ik
+@end ifinfo
+@tex
+$i_k$
+@end tex
+@html
+i<sub>k</sub>
+@end html
+is the imaginary part.  (Division by 2 is rounded down.) For a
+halfcomplex array @code{hc[n]}, the @math{k}th component thus has its
+real part in @code{hc[k]} and its imaginary part in @code{hc[n-k]}, with
+the exception of @code{k} @code{==} @code{0} or @code{n/2} (the latter
+only if @code{n} is even)---in these two cases, the imaginary part is
+zero due to symmetries of the real-input DFT, and is not stored.
+Thus, the r2hc transform of @code{n} real values is a halfcomplex array of
+length @code{n}, and vice versa for hc2r.
+@cindex normalization
+
+
+Aside from the differing format, the output of
+@code{FFTW_R2HC}/@code{FFTW_HC2R} is otherwise exactly the same as for
+the corresponding 1d r2c/c2r transform
+(i.e. @code{FFTW_FORWARD}/@code{FFTW_BACKWARD} transforms, respectively).
+Recall that these transforms are unnormalized, so r2hc followed by hc2r
+will result in the original data multiplied by @code{n}.  Furthermore,
+like the c2r transform, an out-of-place hc2r transform will
+@emph{destroy its input} array.
+
+Although these halfcomplex transforms can be used with the
+multi-dimensional r2r interface, the interpretation of such a separable
+product of transforms along each dimension is problematic.  For example,
+consider a two-dimensional @code{n0} by @code{n1}, r2hc by r2hc
+transform planned by @code{fftw_plan_r2r_2d(n0, n1, in, out, FFTW_R2HC,
+FFTW_R2HC, FFTW_MEASURE)}.  Conceptually, FFTW first transforms the rows
+(of size @code{n1}) to produce halfcomplex rows, and then transforms the
+columns (of size @code{n0}).  Half of these column transforms, however,
+are of imaginary parts, and should therefore be multiplied by @math{i}
+and combined with the r2hc transforms of the real columns to produce the
+2d DFT amplitudes; FFTW's r2r transform does @emph{not} perform this
+combination for you.  Thus, if a multi-dimensional real-input/output DFT
+is required, we recommend using the ordinary r2c/c2r
+interface (@pxref{Multi-Dimensional DFTs of Real Data}).
+
+@c =========>
+@node Real even/odd DFTs (cosine/sine transforms), The Discrete Hartley Transform, The Halfcomplex-format DFT, More DFTs of Real Data
+@subsection Real even/odd DFTs (cosine/sine transforms)
+
+The Fourier transform of a real-even function @math{f(-x) = f(x)} is
+real-even, and @math{i} times the Fourier transform of a real-odd
+function @math{f(-x) = -f(x)} is real-odd.  Similar results hold for a
+discrete Fourier transform, and thus for these symmetries the need for
+complex inputs/outputs is entirely eliminated.  Moreover, one gains a
+factor of two in speed/space from the fact that the data are real, and
+an additional factor of two from the even/odd symmetry: only the
+non-redundant (first) half of the array need be stored.  The result is
+the real-even DFT (@dfn{REDFT}) and the real-odd DFT (@dfn{RODFT}), also
+known as the discrete cosine and sine transforms (@dfn{DCT} and
+@dfn{DST}), respectively.
+@cindex real-even DFT
+@cindex REDFT
+@cindex real-odd DFT
+@cindex RODFT
+@cindex discrete cosine transform
+@cindex DCT
+@cindex discrete sine transform
+@cindex DST
+
+
+(In this section, we describe the 1d transforms; multi-dimensional
+transforms are just a separable product of these transforms operating
+along each dimension.)
+
+Because of the discrete sampling, one has an additional choice: is the
+data even/odd around a sampling point, or around the point halfway
+between two samples?  The latter corresponds to @emph{shifting} the
+samples by @emph{half} an interval, and gives rise to several transform
+variants denoted by REDFT@math{ab} and RODFT@math{ab}: @math{a} and
+@math{b} are @math{0} or @math{1}, and indicate whether the input
+(@math{a}) and/or output (@math{b}) are shifted by half a sample
+(@math{1} means it is shifted).  These are also known as types I-IV of
+the DCT and DST, and all four types are supported by FFTW's r2r
+interface.@footnote{There are also type V-VIII transforms, which
+correspond to a logical DFT of @emph{odd} size @math{N}, independent of
+whether the physical size @code{n} is odd, but we do not support these
+variants.}
+
+The r2r kinds for the various REDFT and RODFT types supported by FFTW,
+along with the boundary conditions at both ends of the @emph{input}
+array (@code{n} real numbers @code{in[j=0..n-1]}), are:
+
+@itemize @bullet
+
+@item
+@code{FFTW_REDFT00} (DCT-I): even around @math{j=0} and even around @math{j=n-1}.
+@ctindex FFTW_REDFT00
+
+@item
+@code{FFTW_REDFT10} (DCT-II, ``the'' DCT): even around @math{j=-0.5} and even around @math{j=n-0.5}.
+@ctindex FFTW_REDFT10
+
+@item
+@code{FFTW_REDFT01} (DCT-III, ``the'' IDCT): even around @math{j=0} and odd around @math{j=n}.
+@ctindex FFTW_REDFT01
+@cindex IDCT
+
+@item
+@code{FFTW_REDFT11} (DCT-IV): even around @math{j=-0.5} and odd around @math{j=n-0.5}.
+@ctindex FFTW_REDFT11
+
+@item
+@code{FFTW_RODFT00} (DST-I): odd around @math{j=-1} and odd around @math{j=n}.
+@ctindex FFTW_RODFT00
+
+@item
+@code{FFTW_RODFT10} (DST-II): odd around @math{j=-0.5} and odd around @math{j=n-0.5}.
+@ctindex FFTW_RODFT10
+
+@item
+@code{FFTW_RODFT01} (DST-III): odd around @math{j=-1} and even around @math{j=n-1}.
+@ctindex FFTW_RODFT01
+
+@item
+@code{FFTW_RODFT11} (DST-IV): odd around @math{j=-0.5} and even around @math{j=n-0.5}.
+@ctindex FFTW_RODFT11
+
+@end itemize
+
+Note that these symmetries apply to the ``logical'' array being
+transformed; @strong{there are no constraints on your physical input
+data}.  So, for example, if you specify a size-5 REDFT00 (DCT-I) of the
+data @math{abcde}, it corresponds to the DFT of the logical even array
+@math{abcdedcb} of size 8.  A size-4 REDFT10 (DCT-II) of the data
+@math{abcd} corresponds to the size-8 logical DFT of the even array
+@math{abcddcba}, shifted by half a sample.
+
+All of these transforms are invertible.  The inverse of R*DFT00 is
+R*DFT00; of R*DFT10 is R*DFT01 and vice versa (these are often called
+simply ``the'' DCT and IDCT, respectively); and of R*DFT11 is R*DFT11.
+However, the transforms computed by FFTW are unnormalized, exactly
+like the corresponding real and complex DFTs, so computing a transform
+followed by its inverse yields the original array scaled by @math{N},
+where @math{N} is the @emph{logical} DFT size.  For REDFT00,
+@math{N=2(n-1)}; for RODFT00, @math{N=2(n+1)}; otherwise, @math{N=2n}.
+@cindex normalization
+@cindex IDCT
+
+
+Note that the boundary conditions of the transform output array are
+given by the input boundary conditions of the inverse transform.
+Thus, the above transforms are all inequivalent in terms of
+input/output boundary conditions, even neglecting the 0.5 shift
+difference.
+
+FFTW is most efficient when @math{N} is a product of small factors; note
+that this @emph{differs} from the factorization of the physical size
+@code{n} for REDFT00 and RODFT00!  There is another oddity: @code{n=1}
+REDFT00 transforms correspond to @math{N=0}, and so are @emph{not
+defined} (the planner will return @code{NULL}).  Otherwise, any positive
+@code{n} is supported.
+
+For the precise mathematical definitions of these transforms as used by
+FFTW, see @ref{What FFTW Really Computes}.  (For people accustomed to
+the DCT/DST, FFTW's definitions have a coefficient of @math{2} in front
+of the cos/sin functions so that they correspond precisely to an
+even/odd DFT of size @math{N}.  Some authors also include additional
+multiplicative factors of 
+@ifinfo
+sqrt(2)
+@end ifinfo
+@html
+&radic;2
+@end html
+@tex
+$\sqrt{2}$
+@end tex
+for selected inputs and outputs; this makes
+the transform orthogonal, but sacrifices the direct equivalence to a
+symmetric DFT.)
+
+@subsubheading Which type do you need?
+
+Since the required flavor of even/odd DFT depends upon your problem,
+you are the best judge of this choice, but we can make a few comments
+on relative efficiency to help you in your selection.  In particular,
+R*DFT01 and R*DFT10 tend to be slightly faster than R*DFT11
+(especially for odd sizes), while the R*DFT00 transforms are sometimes
+significantly slower (especially for even sizes).@footnote{R*DFT00 is
+sometimes slower in FFTW because we discovered that the standard
+algorithm for computing this by a pre/post-processed real DFT---the
+algorithm used in FFTPACK, Numerical Recipes, and other sources for
+decades now---has serious numerical problems: it already loses several
+decimal places of accuracy for 16k sizes.  There seem to be only two
+alternatives in the literature that do not suffer similarly: a
+recursive decomposition into smaller DCTs, which would require a large
+set of codelets for efficiency and generality, or sacrificing a factor of 
+@tex
+$\sim 2$
+@end tex
+@ifnottex
+2
+@end ifnottex
+in speed to use a real DFT of twice the size.  We currently
+employ the latter technique for general @math{n}, as well as a limited
+form of the former method: a split-radix decomposition when @math{n}
+is odd (@math{N} a multiple of 4).  For @math{N} containing many
+factors of 2, the split-radix method seems to recover most of the
+speed of the standard algorithm without the accuracy tradeoff.}
+
+Thus, if only the boundary conditions on the transform inputs are
+specified, we generally recommend R*DFT10 over R*DFT00 and R*DFT01 over
+R*DFT11 (unless the half-sample shift or the self-inverse property is
+significant for your problem).
+
+If performance is important to you and you are using only small sizes
+(say @math{n<200}), e.g. for multi-dimensional transforms, then you
+might consider generating hard-coded transforms of those sizes and types
+that you are interested in (@pxref{Generating your own code}).
+
+We are interested in hearing what types of symmetric transforms you find
+most useful.
+
+@c =========>
+@node The Discrete Hartley Transform,  , Real even/odd DFTs (cosine/sine transforms), More DFTs of Real Data
+@subsection The Discrete Hartley Transform
+
+If you are planning to use the DHT because you've heard that it is
+``faster'' than the DFT (FFT), @strong{stop here}.  The DHT is not
+faster than the DFT.  That story is an old but enduring misconception
+that was debunked in 1987.
+
+The discrete Hartley transform (DHT) is an invertible linear transform
+closely related to the DFT.  In the DFT, one multiplies each input by
+@math{cos - i * sin} (a complex exponential), whereas in the DHT each
+input is multiplied by simply @math{cos + sin}.  Thus, the DHT
+transforms @code{n} real numbers to @code{n} real numbers, and has the
+convenient property of being its own inverse.  In FFTW, a DHT (of any
+positive @code{n}) can be specified by an r2r kind of @code{FFTW_DHT}.
+@ctindex FFTW_DHT
+@cindex discrete Hartley transform
+@cindex DHT
+
+Like the DFT, in FFTW the DHT is unnormalized, so computing a DHT of
+size @code{n} followed by another DHT of the same size will result in
+the original array multiplied by @code{n}.
+@cindex normalization
+
+The DHT was originally proposed as a more efficient alternative to the
+DFT for real data, but it was subsequently shown that a specialized DFT
+(such as FFTW's r2hc or r2c transforms) could be just as fast.  In FFTW,
+the DHT is actually computed by post-processing an r2hc transform, so
+there is ordinarily no reason to prefer it from a performance
+perspective.@footnote{We provide the DHT mainly as a byproduct of some
+internal algorithms. FFTW computes a real input/output DFT of
+@emph{prime} size by re-expressing it as a DHT plus post/pre-processing
+and then using Rader's prime-DFT algorithm adapted to the DHT.}
+However, we have heard rumors that the DHT might be the most appropriate
+transform in its own right for certain applications, and we would be
+very interested to hear from anyone who finds it useful.
+
+If @code{FFTW_DHT} is specified for multiple dimensions of a
+multi-dimensional transform, FFTW computes the separable product of 1d
+DHTs along each dimension.  Unfortunately, this is not quite the same
+thing as a true multi-dimensional DHT; you can compute the latter, if
+necessary, with at most @code{rank-1} post-processing passes
+[see e.g. H. Hao and R. N. Bracewell, @i{Proc. IEEE} @b{75}, 264--266 (1987)].
+
+For the precise mathematical definition of the DHT as used by FFTW, see
+@ref{What FFTW Really Computes}.
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/upgrading.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/upgrading.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,198 @@
+@node Upgrading from FFTW version 2, Installation and Customization, Calling FFTW from Legacy Fortran, Top
+@chapter Upgrading from FFTW version 2
+
+In this chapter, we outline the process for updating codes designed for
+the older FFTW 2 interface to work with FFTW 3.  The interface for FFTW
+3 is not backwards-compatible with the interface for FFTW 2 and earlier
+versions; codes written to use those versions will fail to link with
+FFTW 3.  Nor is it possible to write ``compatibility wrappers'' to
+bridge the gap (at least not efficiently), because FFTW 3 has different
+semantics from previous versions.  However, upgrading should be a
+straightforward process because the data formats are identical and the
+overall style of planning/execution is essentially the same.
+
+Unlike FFTW 2, there are no separate header files for real and complex
+transforms (or even for different precisions) in FFTW 3; all interfaces
+are defined in the @code{<fftw3.h>} header file.
+
+@heading Numeric Types
+
+The main difference in data types is that @code{fftw_complex} in FFTW 2
+was defined as a @code{struct} with macros @code{c_re} and @code{c_im}
+for accessing the real/imaginary parts.  (This is binary-compatible with
+FFTW 3 on any machine except perhaps for some older Crays in single
+precision.)  The equivalent macros for FFTW 3 are:
+
+@example
+#define c_re(c) ((c)[0])
+#define c_im(c) ((c)[1])
+@end example
+
+This does not work if you are using the C99 complex type, however,
+unless you insert a @code{double*} typecast into the above macros
+(@pxref{Complex numbers}).
+
+Also, FFTW 2 had an @code{fftw_real} typedef that was an alias for
+@code{double} (in double precision).  In FFTW 3 you should just use
+@code{double} (or whatever precision you are employing).
+
+@heading Plans
+
+The major difference between FFTW 2 and FFTW 3 is in the
+planning/execution division of labor.  In FFTW 2, plans were found for a
+given transform size and type, and then could be applied to @emph{any}
+arrays and for @emph{any} multiplicity/stride parameters.  In FFTW 3,
+you specify the particular arrays, stride parameters, etcetera when
+creating the plan, and the plan is then executed for @emph{those} arrays
+(unless the guru interface is used) and @emph{those} parameters
+@emph{only}.  (FFTW 2 had ``specific planner'' routines that planned for
+a particular array and stride, but the plan could still be used for
+other arrays and strides.)  That is, much of the information that was
+formerly specified at execution time is now specified at planning time.
+
+Like FFTW 2's specific planner routines, the FFTW 3 planner overwrites
+the input/output arrays unless you use @code{FFTW_ESTIMATE}.
+
+FFTW 2 had separate data types @code{fftw_plan}, @code{fftwnd_plan},
+@code{rfftw_plan}, and @code{rfftwnd_plan} for complex and real one- and
+multi-dimensional transforms, and each type had its own @samp{destroy}
+function.  In FFTW 3, all plans are of type @code{fftw_plan} and all are
+destroyed by @code{fftw_destroy_plan(plan)}.
+
+Where you formerly used @code{fftw_create_plan} and @code{fftw_one} to
+plan and compute a single 1d transform, you would now use
+@code{fftw_plan_dft_1d} to plan the transform.  If you used the generic
+@code{fftw} function to execute the transform with multiplicity
+(@code{howmany}) and stride parameters, you would now use the advanced
+interface @code{fftw_plan_many_dft} to specify those parameters.  The
+plans are now executed with @code{fftw_execute(plan)}, which takes all
+of its parameters (including the input/output arrays) from the plan.
+
+In-place transforms no longer interpret their output argument as scratch
+space, nor is there an @code{FFTW_IN_PLACE} flag.  You simply pass the
+same pointer for both the input and output arguments.  (Previously, the
+output @code{ostride} and @code{odist} parameters were ignored for
+in-place transforms; now, if they are specified via the advanced
+interface, they are significant even in the in-place case, although they
+should normally equal the corresponding input parameters.)
+
+The @code{FFTW_ESTIMATE} and @code{FFTW_MEASURE} flags have the same
+meaning as before, although the planning time will differ.  You may also
+consider using @code{FFTW_PATIENT}, which is like @code{FFTW_MEASURE}
+except that it takes more time in order to consider a wider variety of
+algorithms.
+
+For multi-dimensional complex DFTs, instead of @code{fftwnd_create_plan}
+(or @code{fftw2d_create_plan} or @code{fftw3d_create_plan}), followed by
+@code{fftwnd_one}, you would use @code{fftw_plan_dft} (or
+@code{fftw_plan_dft_2d} or @code{fftw_plan_dft_3d}).  followed by
+@code{fftw_execute}.  If you used @code{fftwnd} to to specify strides
+etcetera, you would instead specify these via @code{fftw_plan_many_dft}.
+
+The analogues to @code{rfftw_create_plan} and @code{rfftw_one} with
+@code{FFTW_REAL_TO_COMPLEX} or @code{FFTW_COMPLEX_TO_REAL} directions
+are @code{fftw_plan_r2r_1d} with kind @code{FFTW_R2HC} or
+@code{FFTW_HC2R}, followed by @code{fftw_execute}.  The stride etcetera
+arguments of @code{rfftw} are now in @code{fftw_plan_many_r2r}.
+
+Instead of @code{rfftwnd_create_plan} (or @code{rfftw2d_create_plan} or
+@code{rfftw3d_create_plan}) followed by
+@code{rfftwnd_one_real_to_complex} or
+@code{rfftwnd_one_complex_to_real}, you now use @code{fftw_plan_dft_r2c}
+(or @code{fftw_plan_dft_r2c_2d} or @code{fftw_plan_dft_r2c_3d}) or
+@code{fftw_plan_dft_c2r} (or @code{fftw_plan_dft_c2r_2d} or
+@code{fftw_plan_dft_c2r_3d}), respectively, followed by
+@code{fftw_execute}.  As usual, the strides etcetera of
+@code{rfftwnd_real_to_complex} or @code{rfftwnd_complex_to_real} are no
+specified in the advanced planner routines,
+@code{fftw_plan_many_dft_r2c} or @code{fftw_plan_many_dft_c2r}.
+
+@heading Wisdom
+
+In FFTW 2, you had to supply the @code{FFTW_USE_WISDOM} flag in order to
+use wisdom; in FFTW 3, wisdom is always used.  (You could simulate the
+FFTW 2 wisdom-less behavior by calling @code{fftw_forget_wisdom} after
+every planner call.)
+
+The FFTW 3 wisdom import/export routines are almost the same as before
+(although the storage format is entirely different).  There is one
+significant difference, however.  In FFTW 2, the import routines would
+never read past the end of the wisdom, so you could store extra data
+beyond the wisdom in the same file, for example.  In FFTW 3, the
+file-import routine may read up to a few hundred bytes past the end of
+the wisdom, so you cannot store other data just beyond it.@footnote{We
+do our own buffering because GNU libc I/O routines are horribly slow for
+single-character I/O, apparently for thread-safety reasons (whether you
+are using threads or not).}
+
+Wisdom has been enhanced by additional humility in FFTW 3: whereas FFTW
+2 would re-use wisdom for a given transform size regardless of the
+stride etc., in FFTW 3 wisdom is only used with the strides etc. for
+which it was created.  Unfortunately, this means FFTW 3 has to create
+new plans from scratch more often than FFTW 2 (in FFTW 2, planning
+e.g. one transform of size 1024 also created wisdom for all smaller
+powers of 2, but this no longer occurs).
+
+FFTW 3 also has the new routine @code{fftw_import_system_wisdom} to
+import wisdom from a standard system-wide location.
+
+@heading Memory allocation
+
+In FFTW 3, we recommend allocating your arrays with @code{fftw_malloc}
+and deallocating them with @code{fftw_free}; this is not required, but
+allows optimal performance when SIMD acceleration is used.  (Those two
+functions actually existed in FFTW 2, and worked the same way, but were
+not documented.)
+
+In FFTW 2, there were @code{fftw_malloc_hook} and @code{fftw_free_hook}
+functions that allowed the user to replace FFTW's memory-allocation
+routines (e.g. to implement different error-handling, since by default
+FFTW prints an error message and calls @code{exit} to abort the program
+if @code{malloc} returns @code{NULL}).  These hooks are not supported in
+FFTW 3; those few users who require this functionality can just
+directly modify the memory-allocation routines in FFTW (they are defined
+in @code{kernel/alloc.c}).
+
+@heading Fortran interface
+
+In FFTW 2, the subroutine names were obtained by replacing @samp{fftw_}
+with @samp{fftw_f77}; in FFTW 3, you replace @samp{fftw_} with
+@samp{dfftw_} (or @samp{sfftw_} or @samp{lfftw_}, depending upon the
+precision).
+
+In FFTW 3, we have begun recommending that you always declare the type
+used to store plans as @code{integer*8}.  (Too many people didn't notice
+our instruction to switch from @code{integer} to @code{integer*8} for
+64-bit machines.)
+
+In FFTW 3, we provide a @code{fftw3.f} ``header file'' to include in
+your code (and which is officially installed on Unix systems).  (In FFTW
+2, we supplied a @code{fftw_f77.i} file, but it was not installed.)
+
+Otherwise, the C-Fortran interface relationship is much the same as it
+was before (e.g. return values become initial parameters, and
+multi-dimensional arrays are in column-major order).  Unlike FFTW 2, we
+do provide some support for wisdom import/export in Fortran
+(@pxref{Wisdom of Fortran?}).
+
+@heading Threads
+
+Like FFTW 2, only the execution routines are thread-safe.  All planner
+routines, etcetera, should be called by only a single thread at a time
+(@pxref{Thread safety}).  @emph{Unlike} FFTW 2, there is no special
+@code{FFTW_THREADSAFE} flag for the planner to allow a given plan to be
+usable by multiple threads in parallel; this is now the case by default.
+
+The multi-threaded version of FFTW 2 required you to pass the number of
+threads each time you execute the transform.  The number of threads is
+now stored in the plan, and is specified before the planner is called by
+@code{fftw_plan_with_nthreads}.  The threads initialization routine used
+to be called @code{fftw_threads_init} and would return zero on success;
+the new routine is called @code{fftw_init_threads} and returns zero on
+failure.  @xref{Multi-threaded FFTW}.
+
+There is no separate threads header file in FFTW 3; all the function
+prototypes are in @code{<fftw3.h>}.  However, you still have to link to
+a separate library (@code{-lfftw3_threads -lfftw3 -lm} on Unix), as well as
+to the threading library (e.g. POSIX threads on Unix).
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/doc/version.texi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/doc/version.texi	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+@set UPDATED 25 November 2012
+@set UPDATED-MONTH November 2012
+@set EDITION 3.3.3
+@set VERSION 3.3.3
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/fftw.pc.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/fftw.pc.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: FFTW
+Description: fast Fourier transform library
+Version: @VERSION@
+Libs: -L${libdir} -lfftw3@PREC_SUFFIX@ @LIBQUADMATH@ -lm
+Cflags: -I${includedir}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,25 @@
+# this makefile requires GNU make.
+
+EXTRA_DIST = algsimp.ml annotate.ml assoctable.ml c.ml complex.ml	\
+conv.ml dag.ml expr.ml fft.ml gen_hc2c.ml gen_hc2cdft.ml		\
+gen_hc2cdft_c.ml gen_hc2hc.ml gen_r2cb.ml gen_mdct.ml gen_notw.ml	\
+gen_notw_c.ml gen_r2cf.ml gen_r2r.ml gen_twiddle.ml gen_twiddle_c.ml	\
+gen_twidsq.ml gen_twidsq_c.ml genutil.ml littlesimp.ml magic.ml		\
+monads.ml number.ml oracle.ml schedule.ml simd.ml simdmagic.ml		\
+to_alist.ml trig.ml twiddle.ml unique.ml util.ml variable.ml		\
+algsimp.mli annotate.mli assoctable.mli c.mli complex.mli conv.mli	\
+dag.mli expr.mli fft.mli littlesimp.mli number.mli oracle.mli		\
+schedule.mli simd.mli to_alist.mli trig.mli twiddle.mli unique.mli	\
+util.mli variable.mli
+
+GENFFT_NATIVE=gen_notw.native gen_notw_c.native gen_twiddle.native	\
+gen_twiddle_c.native gen_twidsq.native gen_twidsq_c.native		\
+gen_r2r.native gen_r2cf.native gen_r2cb.native gen_hc2c.native		\
+gen_hc2cdft.native gen_hc2cdft_c.native gen_hc2hc.native		\
+gen_mdct.native
+
+all-local::
+	$(OCAMLBUILD) -classic-display -libs unix,nums $(GENFFT_NATIVE)
+
+maintainer-clean-local::
+	$(OCAMLBUILD) -classic-display -clean
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,444 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# this makefile requires GNU make.
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = genfft
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+EXTRA_DIST = algsimp.ml annotate.ml assoctable.ml c.ml complex.ml	\
+conv.ml dag.ml expr.ml fft.ml gen_hc2c.ml gen_hc2cdft.ml		\
+gen_hc2cdft_c.ml gen_hc2hc.ml gen_r2cb.ml gen_mdct.ml gen_notw.ml	\
+gen_notw_c.ml gen_r2cf.ml gen_r2r.ml gen_twiddle.ml gen_twiddle_c.ml	\
+gen_twidsq.ml gen_twidsq_c.ml genutil.ml littlesimp.ml magic.ml		\
+monads.ml number.ml oracle.ml schedule.ml simd.ml simdmagic.ml		\
+to_alist.ml trig.ml twiddle.ml unique.ml util.ml variable.ml		\
+algsimp.mli annotate.mli assoctable.mli c.mli complex.mli conv.mli	\
+dag.mli expr.mli fft.mli littlesimp.mli number.mli oracle.mli		\
+schedule.mli simd.mli to_alist.mli trig.mli twiddle.mli unique.mli	\
+util.mli variable.mli
+
+GENFFT_NATIVE = gen_notw.native gen_notw_c.native gen_twiddle.native	\
+gen_twiddle_c.native gen_twidsq.native gen_twidsq_c.native		\
+gen_r2r.native gen_r2cf.native gen_r2cb.native gen_hc2c.native		\
+gen_hc2cdft.native gen_hc2cdft_c.native gen_hc2hc.native		\
+gen_mdct.native
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu genfft/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu genfft/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile all-local
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am all-local check check-am clean clean-generic \
+	clean-libtool distclean distclean-generic distclean-libtool \
+	distdir dvi dvi-am html html-am info info-am install \
+	install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic \
+	maintainer-clean-local mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am uninstall uninstall-am
+
+
+all-local::
+	$(OCAMLBUILD) -classic-display -libs unix,nums $(GENFFT_NATIVE)
+
+maintainer-clean-local::
+	$(OCAMLBUILD) -classic-display -clean
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/algsimp.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/algsimp.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,580 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+
+open Util
+open Expr
+
+let node_insert x =  Assoctable.insert Expr.hash x
+let node_lookup x =  Assoctable.lookup Expr.hash (==) x
+
+(*************************************************************
+ * Algebraic simplifier/elimination of common subexpressions
+ *************************************************************)
+module AlgSimp : sig 
+  val algsimp : expr list -> expr list
+end = struct
+
+  open Monads.StateMonad
+  open Monads.MemoMonad
+  open Assoctable
+
+  let fetchSimp = 
+    fetchState >>= fun (s, _) -> returnM s
+  let storeSimp s =
+    fetchState >>= (fun (_, c) -> storeState (s, c))
+  let lookupSimpM key =
+    fetchSimp >>= fun table ->
+      returnM (node_lookup key table)
+  let insertSimpM key value =
+    fetchSimp >>= fun table ->
+      storeSimp (node_insert key value table)
+
+  let subset a b =
+    List.for_all (fun x -> List.exists (fun y -> x == y) b) a
+
+  let structurallyEqualCSE a b = 
+    match (a, b) with
+    | (Num a, Num b) -> Number.equal a b
+    | (NaN a, NaN b) -> a == b
+    | (Load a, Load b) -> Variable.same a b
+    | (Times (a, a'), Times (b, b')) ->
+ 	((a == b) && (a' == b')) or
+ 	((a == b') && (a' == b))
+    | (CTimes (a, a'), CTimes (b, b')) ->
+ 	((a == b) && (a' == b')) or
+ 	((a == b') && (a' == b))
+    | (CTimesJ (a, a'), CTimesJ (b, b')) -> ((a == b) && (a' == b'))
+    | (Plus a, Plus b) -> subset a b && subset b a
+    | (Uminus a, Uminus b) -> (a == b)
+    | _ -> false
+
+  let hashCSE x = 
+    if (!Magic.randomized_cse) then
+      Oracle.hash x
+    else
+      Expr.hash x
+
+  let equalCSE a b = 
+    if (!Magic.randomized_cse) then
+      (structurallyEqualCSE a b || Oracle.likely_equal a b)
+    else
+      structurallyEqualCSE a b
+
+  let fetchCSE = 
+    fetchState >>= fun (_, c) -> returnM c
+  let storeCSE c =
+    fetchState >>= (fun (s, _) -> storeState (s, c))
+  let lookupCSEM key =
+    fetchCSE >>= fun table ->
+      returnM (Assoctable.lookup hashCSE equalCSE key table)
+  let insertCSEM key value =
+    fetchCSE >>= fun table ->
+      storeCSE (Assoctable.insert hashCSE key value table)
+
+  (* memoize both x and Uminus x (unless x is already negated) *) 
+  let identityM x =
+    let memo x = memoizing lookupCSEM insertCSEM returnM x in
+    match x with
+	Uminus _ -> memo x 
+      |	_ -> memo x >>= fun x' -> memo (Uminus x') >> returnM x'
+
+  let makeNode = identityM
+
+  (* simplifiers for various kinds of nodes *)
+  let rec snumM = function
+      n when Number.is_zero n -> 
+	makeNode (Num (Number.zero))
+    | n when Number.negative n -> 
+	makeNode (Num (Number.negate n)) >>= suminusM
+    | n -> makeNode (Num n)
+
+  and suminusM = function
+      Uminus x -> makeNode x
+    | Num a when (Number.is_zero a) -> snumM Number.zero
+    | a -> makeNode (Uminus a)
+
+  and stimesM = function 
+    | (Uminus a, b) -> stimesM (a, b) >>= suminusM
+    | (a, Uminus b) -> stimesM (a, b) >>= suminusM
+    | (NaN I, CTimes (a, b)) -> stimesM (NaN I, b) >>= 
+	fun ib -> sctimesM (a, ib)
+    | (NaN I, CTimesJ (a, b)) -> stimesM (NaN I, b) >>= 
+	fun ib -> sctimesjM (a, ib)
+    | (Num a, Num b) -> snumM (Number.mul a b)
+    | (Num a, Times (Num b, c)) -> 
+	snumM (Number.mul a b) >>= fun x -> stimesM (x, c)
+    | (Num a, b) when Number.is_zero a -> snumM Number.zero
+    | (Num a, b) when Number.is_one a -> makeNode b
+    | (Num a, b) when Number.is_mone a -> suminusM b
+    | (a, b) when is_known_constant b && not (is_known_constant a) -> 
+	stimesM (b, a)
+    | (a, b) -> makeNode (Times (a, b))
+
+  and sctimesM = function 
+    | (Uminus a, b) -> sctimesM (a, b) >>= suminusM
+    | (a, Uminus b) -> sctimesM (a, b) >>= suminusM
+    | (a, b) -> makeNode (CTimes (a, b))
+
+  and sctimesjM = function 
+    | (Uminus a, b) -> sctimesjM (a, b) >>= suminusM
+    | (a, Uminus b) -> sctimesjM (a, b) >>= suminusM
+    | (a, b) -> makeNode (CTimesJ (a, b))
+
+  and reduce_sumM x = match x with
+    [] -> returnM []
+  | [Num a] -> 
+      if (Number.is_zero a) then 
+	returnM [] 
+      else returnM x
+  | [Uminus (Num a)] -> 
+      if (Number.is_zero a) then 
+	returnM [] 
+      else returnM x
+  | (Num a) :: (Num b) :: s -> 
+      snumM (Number.add a b) >>= fun x ->
+	reduce_sumM (x :: s)
+  | (Num a) :: (Uminus (Num b)) :: s -> 
+      snumM (Number.sub a b) >>= fun x ->
+	reduce_sumM (x :: s)
+  | (Uminus (Num a)) :: (Num b) :: s -> 
+      snumM (Number.sub b a) >>= fun x ->
+	reduce_sumM (x :: s)
+  | (Uminus (Num a)) :: (Uminus (Num b)) :: s -> 
+      snumM (Number.add a b) >>= 
+      suminusM >>= fun x ->
+	reduce_sumM (x :: s)
+  | ((Num _) as a) :: b :: s -> reduce_sumM (b :: a :: s)
+  | ((Uminus (Num _)) as a) :: b :: s -> reduce_sumM (b :: a :: s)
+  | a :: s -> 
+      reduce_sumM s >>= fun s' -> returnM (a :: s')
+
+  and collectible1 = function
+    | NaN _ -> false
+    | Uminus x -> collectible1 x
+    | _ -> true
+  and collectible (a, b) = collectible1 a
+
+  (* collect common factors: ax + bx -> (a+b)x *)
+  and collectM which x = 
+    let rec findCoeffM which = function
+      |	Times (a, b) when collectible (which (a, b)) -> returnM (which (a, b))
+      | Uminus x -> 
+	  findCoeffM which x >>= fun (coeff, b) ->
+	    suminusM coeff >>= fun mcoeff ->
+ 	      returnM (mcoeff, b)
+      | x -> snumM Number.one >>= fun one -> returnM (one, x)
+    and separateM xpr = function
+ 	[] -> returnM ([], [])
+      |	a :: b ->
+ 	  separateM xpr b >>= fun (w, wo) ->
+	    (* try first factor *)
+ 	    findCoeffM (fun (a, b) -> (a, b)) a >>= fun (c, x) ->
+ 	      if (xpr == x) && collectible (c, x) then returnM (c :: w, wo)
+ 	      else
+	      (* try second factor *)
+ 		findCoeffM (fun (a, b) -> (b, a)) a >>= fun (c, x) ->
+ 		  if (xpr == x) && collectible (c, x) then returnM (c :: w, wo)
+ 		  else returnM (w, a :: wo)
+    in match x with
+      [] -> returnM x
+    | [a] -> returnM x
+    | a :: b ->
+ 	findCoeffM which a >>= fun (_, xpr) ->
+ 	  separateM xpr x >>= fun (w, wo) ->
+ 	    collectM which wo >>= fun wo' ->
+ 	      splusM w >>= fun w' ->
+ 		stimesM (w', xpr) >>= fun t' ->
+ 		  returnM (t':: wo')
+
+  and mangleSumM x = returnM x
+      >>= reduce_sumM 
+      >>= collectM (fun (a, b) -> (a, b))
+      >>= collectM (fun (a, b) -> (b, a))
+      >>= reduce_sumM 
+      >>= deepCollectM !Magic.deep_collect_depth
+      >>= reduce_sumM
+
+  and reorder_uminus = function  (* push all Uminuses to the end *)
+      [] -> []
+    | ((Uminus _) as a' :: b) -> (reorder_uminus b) @ [a']
+    | (a :: b) -> a :: (reorder_uminus b)                      
+
+  and canonicalizeM = function 
+      [] -> snumM Number.zero
+    | [a] -> makeNode a                    (* one term *)
+    | a -> generateFusedMultAddM (reorder_uminus a)
+
+  and generateFusedMultAddM = 
+    let rec is_multiplication = function
+      | Times (Num a, b) -> true
+      | Uminus (Times (Num a, b)) -> true
+      | _ -> false
+    and separate = function
+	[] -> ([], [], Number.zero)
+      | (Times (Num a, b)) as this :: c -> 
+	  let (x, y, max) = separate c in
+	  let newmax = if (Number.greater a max) then a else max in
+	  (this :: x, y, newmax)
+      | (Uminus (Times (Num a, b))) as this :: c -> 
+	  let (x, y, max) = separate c in
+	  let newmax = if (Number.greater a max) then a else max in
+	  (this :: x, y, newmax)
+      | this :: c ->
+	  let (x, y, max) = separate c in
+	  (x, this :: y, max)
+    in fun l ->
+      if !Magic.enable_fma && count is_multiplication l >= 2 then
+	let (w, wo, max) = separate l in
+	snumM (Number.div Number.one max) >>= fun invmax' ->
+	  snumM max >>= fun max' ->
+	    mapM (fun x -> stimesM (invmax', x)) w >>= splusM >>= fun pw' ->
+	      stimesM (max', pw') >>= fun mw' ->
+		splusM (wo @ [mw'])
+      else 
+	makeNode (Plus l)
+
+
+  and negative = function
+      Uminus _ -> true
+    | _ -> false
+
+  (*
+   * simplify patterns of the form
+   *
+   *  ((c_1 * a + ...) + ...) +  (c_2 * a + ...)
+   *
+   * The pattern includes arbitrary coefficients and minus signs.
+   * A common case of this pattern is the butterfly
+   *   (a + b) + (a - b)
+   *   (a + b) - (a - b)
+   *)
+  (* this whole procedure needs much more thought *)
+  and deepCollectM maxdepth l =
+    let rec findTerms depth x = match x with
+      | Uminus x -> findTerms depth x
+      |	Times (Num _, b) -> (findTerms (depth - 1) b)
+      |	Plus l when depth > 0 ->
+	  x :: List.flatten (List.map (findTerms (depth - 1)) l)
+      |	x -> [x]
+    and duplicates = function
+	[] -> []
+      |	a :: b -> if List.memq a b then a :: duplicates b
+      else duplicates b
+
+    in let rec splitDuplicates depth d x =
+      if (List.memq x d) then 
+	snumM (Number.zero) >>= fun zero ->
+	  returnM (zero, x)
+      else match x with
+      |	Times (a, b) ->
+	  splitDuplicates (depth - 1) d a >>= fun (a', xa) ->
+	    splitDuplicates (depth - 1) d b >>= fun (b', xb) ->
+	      stimesM (a', b') >>= fun ab ->
+		stimesM (a, xb) >>= fun xb' ->
+		  stimesM (xa, b) >>= fun xa' ->
+		    stimesM (xa, xb) >>= fun xab ->
+		      splusM [xa'; xb'; xab] >>= fun x ->
+			returnM (ab, x)
+      | Uminus a -> 
+	  splitDuplicates depth d a >>= fun (x, y) ->
+	    suminusM x >>= fun ux -> 
+	      suminusM y >>= fun uy -> 
+		returnM (ux, uy)
+      |	Plus l when depth > 0 -> 
+	  mapM (splitDuplicates (depth - 1) d) l >>= fun ld ->
+	    let (l', d') = List.split ld in
+	    splusM l' >>= fun p ->
+	      splusM d' >>= fun d'' ->
+	      returnM (p, d'')
+      |	x -> 
+	  snumM (Number.zero) >>= fun zero' ->
+	    returnM (x, zero')
+
+    in let l' = List.flatten (List.map (findTerms maxdepth) l)
+    in match duplicates l' with
+    | [] -> returnM l
+    | d ->
+	mapM (splitDuplicates maxdepth d) l >>= fun ld ->
+	  let (l', d') = List.split ld in
+	  splusM l' >>= fun l'' ->
+	    let rec flattenPlusM = function
+	      | Plus l -> returnM l
+	      | Uminus x ->
+		  flattenPlusM x >>= mapM suminusM
+	      | x -> returnM [x]
+	    in
+	    mapM flattenPlusM d' >>= fun d'' ->
+	      splusM (List.flatten d'') >>= fun d''' ->
+		mangleSumM [l''; d''']
+
+  and splusM l =
+    let fma_heuristics x = 
+      if !Magic.enable_fma then 
+	match x with
+	| [Uminus (Times _); Times _] -> Some false
+	| [Times _; Uminus (Times _)] -> Some false
+	| [Uminus (_); Times _] -> Some true
+	| [Times _; Uminus (Plus _)] -> Some true
+	| [_; Uminus (Times _)] -> Some false
+	| [Uminus (Times _); _] -> Some false
+	| _ -> None
+      else
+	None
+    in
+    mangleSumM l >>=  fun l' ->
+      (* no terms are negative.  Don't do anything *)
+      if not (List.exists negative l') then
+	canonicalizeM l'
+      (* all terms are negative.  Negate them all and collect the minus sign *)
+      else if List.for_all negative l' then
+	mapM suminusM l' >>= splusM >>= suminusM
+      else match fma_heuristics l' with
+      |	Some true -> mapM suminusM l' >>= splusM >>= suminusM
+      |	Some false -> canonicalizeM l'
+      |	None ->
+         (* Ask the Oracle for the canonical form *)
+	  if (not !Magic.randomized_cse) &&
+	    Oracle.should_flip_sign (Plus l') then
+	    mapM suminusM l' >>= splusM >>= suminusM
+	  else
+	    canonicalizeM l'
+
+  (* monadic style algebraic simplifier for the dag *)
+  let rec algsimpM x =
+    memoizing lookupSimpM insertSimpM 
+      (function 
+ 	| Num a -> snumM a
+ 	| NaN _ as x -> makeNode x
+ 	| Plus a -> 
+ 	    mapM algsimpM a >>= splusM
+ 	| Times (a, b) -> 
+ 	    (algsimpM a >>= fun a' ->
+ 	      algsimpM b >>= fun b' ->
+ 		stimesM (a', b'))
+ 	| CTimes (a, b) -> 
+ 	    (algsimpM a >>= fun a' ->
+ 	      algsimpM b >>= fun b' ->
+		sctimesM (a', b'))
+ 	| CTimesJ (a, b) -> 
+ 	    (algsimpM a >>= fun a' ->
+ 	      algsimpM b >>= fun b' ->
+		sctimesjM (a', b'))
+ 	| Uminus a -> 
+ 	    algsimpM a >>= suminusM 
+ 	| Store (v, a) ->
+ 	    algsimpM a >>= fun a' ->
+ 	      makeNode (Store (v, a'))
+ 	| Load _ as x -> makeNode x)
+      x
+
+   let initialTable = (empty, empty)
+   let simp_roots = mapM algsimpM
+   let algsimp = runM initialTable simp_roots
+end
+
+(*************************************************************
+ * Network transposition algorithm
+ *************************************************************)
+module Transpose = struct
+  open Monads.StateMonad
+  open Monads.MemoMonad
+  open Littlesimp
+
+  let fetchDuals = fetchState
+  let storeDuals = storeState
+
+  let lookupDualsM key =
+    fetchDuals >>= fun table ->
+      returnM (node_lookup key table)
+
+  let insertDualsM key value =
+    fetchDuals >>= fun table ->
+      storeDuals (node_insert key value table)
+
+  let rec visit visited vtable parent_table = function
+      [] -> (visited, parent_table)
+    | node :: rest ->
+	match node_lookup node vtable with
+	| Some _ -> visit visited vtable parent_table rest
+	| None ->
+	    let children = match node with
+	    | Store (v, n) -> [n]
+	    | Plus l -> l
+	    | Times (a, b) -> [a; b]
+	    | CTimes (a, b) -> [a; b]
+	    | CTimesJ (a, b) -> [a; b]
+	    | Uminus x -> [x]
+	    | _ -> []
+	    in let rec loop t = function
+		[] -> t
+	      |	a :: rest ->
+		  (match node_lookup a t with
+		    None -> loop (node_insert a [node] t) rest
+		  | Some c -> loop (node_insert a (node :: c) t) rest)
+	    in 
+	    (visit 
+	       (node :: visited)
+	       (node_insert node () vtable)
+	       (loop parent_table children)
+	       (children @ rest))
+
+  let make_transposer parent_table =
+    let rec termM node candidate_parent = 
+      match candidate_parent with
+      |	Store (_, n) when n == node -> 
+	  dualM candidate_parent >>= fun x' -> returnM [x']
+      | Plus (l) when List.memq node l -> 
+	  dualM candidate_parent >>= fun x' -> returnM [x']
+      | Times (a, b) when b == node -> 
+	  dualM candidate_parent >>= fun x' -> 
+	    returnM [makeTimes (a, x')]
+      | CTimes (a, b) when b == node -> 
+	  dualM candidate_parent >>= fun x' -> 
+	    returnM [CTimes (a, x')]
+      | CTimesJ (a, b) when b == node -> 
+	  dualM candidate_parent >>= fun x' -> 
+	    returnM [CTimesJ (a, x')]
+      | Uminus n when n == node -> 
+	  dualM candidate_parent >>= fun x' -> 
+	    returnM [makeUminus x']
+      | _ -> returnM []
+    
+    and dualExpressionM this_node = 
+      mapM (termM this_node) 
+	(match node_lookup this_node parent_table with
+	| Some a -> a
+	| None -> failwith "bug in dualExpressionM"
+	) >>= fun l ->
+	returnM (makePlus (List.flatten l))
+
+    and dualM this_node =
+      memoizing lookupDualsM insertDualsM
+	(function
+	  | Load v as x -> 
+	      if (Variable.is_constant v) then
+		returnM (Load v)
+	      else
+		(dualExpressionM x >>= fun d ->
+		  returnM (Store (v, d)))
+	  | Store (v, x) -> returnM (Load v)
+	  | x -> dualExpressionM x)
+	this_node
+
+    in dualM
+
+  let is_store = function 
+    | Store _ -> true
+    | _ -> false
+
+  let transpose dag = 
+    let _ = Util.info "begin transpose" in
+    let (all_nodes, parent_table) = 
+      visit [] Assoctable.empty Assoctable.empty dag in
+    let transposerM = make_transposer parent_table in
+    let mapTransposerM = mapM transposerM in
+    let duals = runM Assoctable.empty mapTransposerM all_nodes in
+    let roots = List.filter is_store duals in
+    let _ = Util.info "end transpose" in
+    roots
+end
+
+
+(*************************************************************
+ * Various dag statistics
+ *************************************************************)
+module Stats : sig
+  type complexity
+  val complexity : Expr.expr list -> complexity
+  val same_complexity : complexity -> complexity -> bool
+  val leq_complexity : complexity -> complexity -> bool
+  val to_string : complexity -> string
+end = struct
+  type complexity = int * int * int * int * int * int
+  let rec visit visited vtable = function
+      [] -> visited
+    | node :: rest ->
+	match node_lookup node vtable with
+	  Some _ -> visit visited vtable rest
+	| None ->
+	    let children = match node with
+	      Store (v, n) -> [n]
+	    | Plus l -> l
+	    | Times (a, b) -> [a; b]
+	    | Uminus x -> [x]
+	    | _ -> []
+	    in visit (node :: visited)
+	      (node_insert node () vtable)
+	      (children @ rest)
+
+  let complexity dag = 
+    let rec loop (load, store, plus, times, uminus, num) = function 
+      	[] -> (load, store, plus, times, uminus, num)
+      | node :: rest ->
+	  loop
+	    (match node with
+	    | Load _ -> (load + 1, store, plus, times, uminus, num)
+	    | Store _ -> (load, store + 1, plus, times, uminus, num)
+	    | Plus x -> (load, store, plus + (List.length x - 1), times, uminus, num)
+	    | Times _ -> (load, store, plus, times + 1, uminus, num)
+	    | Uminus _ -> (load, store, plus, times, uminus + 1, num)
+	    | Num _ -> (load, store, plus, times, uminus, num + 1)
+	    | CTimes _ -> (load, store, plus, times, uminus, num)
+	    | CTimesJ _ -> (load, store, plus, times, uminus, num)
+	    | NaN _ -> (load, store, plus, times, uminus, num))
+	    rest
+    in let (l, s, p, t, u, n) = 
+      loop (0, 0, 0, 0, 0, 0) (visit [] Assoctable.empty dag)
+    in (l, s, p, t, u, n)
+
+  let weight (l, s, p, t, u, n) =
+    l + s + 10 * p + 20 * t + u + n
+
+  let same_complexity a b = weight a = weight b
+  let leq_complexity a b = weight a <= weight b
+
+  let to_string (l, s, p, t, u, n) =
+    Printf.sprintf "ld=%d st=%d add=%d mul=%d uminus=%d num=%d\n"
+		   l s p t u n
+		   
+end    
+
+(* simplify the dag *)
+let algsimp v = 
+  let rec simplification_loop v =
+    let () = Util.info "simplification step" in
+    let complexity = Stats.complexity v in
+    let () = Util.info ("complexity = " ^ (Stats.to_string complexity)) in
+    let v = (AlgSimp.algsimp @@ Transpose.transpose @@ 
+	     AlgSimp.algsimp @@ Transpose.transpose) v in
+    let complexity' = Stats.complexity v in
+    let () = Util.info ("complexity = " ^ (Stats.to_string complexity')) in
+    if (Stats.leq_complexity complexity' complexity) then
+      let () = Util.info "end algsimp" in
+      v
+    else
+      simplification_loop v
+
+  in
+  let () = Util.info "begin algsimp" in
+  let v = AlgSimp.algsimp v in
+  if !Magic.network_transposition then simplification_loop v else v
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/algsimp.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/algsimp.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val algsimp : Expr.expr list -> Expr.expr list
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/annotate.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/annotate.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,361 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* Here, we take a schedule (produced by schedule.ml) ordering a
+   sequence of instructions, and produce an annotated schedule.  The
+   annotated schedule has the same ordering as the original schedule,
+   but is additionally partitioned into nested blocks of temporary
+   variables.  The partitioning is computed via a heuristic algorithm.
+
+   The blocking allows the C code that we generate to consist of
+   nested blocks that help communicate variable lifetimes to the
+   compiler. *)
+
+open Schedule
+open Expr
+open Variable
+
+type annotated_schedule = 
+    Annotate of variable list * variable list * variable list * int * aschedule
+and aschedule = 
+    ADone
+  | AInstr of assignment
+  | ASeq of (annotated_schedule * annotated_schedule)
+
+let addelem a set = if not (List.memq a set) then a :: set else set
+let union l = 
+  let f x = addelem x   (* let is source of polymorphism *)
+  in List.fold_right f l
+
+(* set difference a - b *)
+let diff a b = List.filter (fun x -> not (List.memq x b)) a
+
+let rec minimize f = function
+    [] -> failwith "minimize"
+  | [n] -> n
+  | n :: rest ->
+      let x = minimize f rest in
+      if (f x) >= (f n) then n else x
+
+(* find all variables used inside a scheduling unit *)
+let rec find_block_vars = function
+    Done -> []
+  | (Instr (Assign (v, x))) -> v :: (find_vars x)
+  | Par a -> List.flatten (List.map find_block_vars a)
+  | Seq (a, b) -> (find_block_vars a) @ (find_block_vars b)
+
+let uniq l = 
+  List.fold_right (fun a b -> if List.memq a b then b else a :: b) l []
+
+let has_related x = List.exists (Variable.same_class x)
+
+let rec overlap a b = Util.count (fun y -> has_related y b) a
+
+(* reorder a list of schedules so as to maximize overlap of variables *)
+let reorder l =
+  let rec loop = function
+      [] -> []
+    | (a, va) :: b ->
+	let c = 
+	  List.map 
+	    (fun (a, x) -> ((a, x), (overlap va x, List.length x))) b in
+	let c' =
+	  Sort.list 
+	    (fun (_, (a, la)) (_, (b, lb)) -> 
+	      la < lb or a > b)
+	    c in
+	let b' = List.map (fun (a, _) -> a) c' in
+	a :: (loop b') in
+  let l' = List.map (fun x -> x, uniq (find_block_vars x)) l in
+  (* start with smallest block --- does this matter ? *)
+  match l' with
+    [] -> []
+  | _ ->  
+      let m = minimize (fun (_, x) -> (List.length x)) l' in
+      let l'' = Util.remove m l' in
+      loop (m :: l'')
+
+(* remove Par blocks *)
+let rec linearize = function
+  | Seq (a, Done) -> linearize a
+  | Seq (Done, a) -> linearize a
+  | Seq (a, b) -> Seq (linearize a, linearize b)
+
+  (* try to balance nested Par blocks *)
+  | Par [a] -> linearize a
+  | Par l -> 
+      let n2 = (List.length l) / 2 in
+      let rec loop n a b =
+	if n = 0 then
+	  (List.rev b, a)
+	else
+	  match a with
+	    [] -> failwith "loop"
+	  | x :: y -> loop (n - 1) y (x :: b)
+      in let (a, b) = loop n2 (reorder l) []
+      in linearize (Seq (Par a, Par b))
+
+  | x -> x 
+
+let subset a b =
+  List.for_all (fun x -> List.exists (fun y -> x == y) b) a
+
+let use_same_vars (Assign (av, ax)) (Assign (bv, bx)) =
+  is_temporary av &&
+  is_temporary bv &&
+  (let va = Expr.find_vars ax and vb = Expr.find_vars bx in
+   subset va vb && subset vb va)
+
+let store_to_same_class (Assign (av, ax)) (Assign (bv, bx)) =
+  is_locative av &&
+  is_locative bv &&
+  Variable.same_class av bv
+
+let loads_from_same_class (Assign (av, ax)) (Assign (bv, bx)) =
+  match (ax, bx) with
+    | (Load a), (Load b) when 
+	Variable.is_locative a && Variable.is_locative b 
+	-> Variable.same_class a b
+    | _ -> false
+
+(* extract instructions from schedule *)
+let rec sched_to_ilist = function
+  | Done -> []
+  | Instr a -> [a]
+  | Seq (a, b) -> (sched_to_ilist a) @ (sched_to_ilist b)
+  | _ -> failwith "sched_to_ilist" (* Par blocks removed by linearize *)
+
+let rec find_friends friendp insn friends foes = function
+  | [] -> (friends, foes)
+  | a :: b -> 
+      if (a == insn) || (friendp a insn) then
+	find_friends friendp insn (a :: friends) foes b
+      else
+	find_friends friendp insn friends (a :: foes) b
+
+(* schedule all instructions in the equivalence class determined
+   by friendp at the point where the last one
+   is executed *)
+let rec delay_friends friendp sched =
+  let rec recur insns = function
+    | Done -> (Done, insns)
+    | Instr a ->
+	let (friends, foes) = find_friends friendp a [] [] insns in
+	(Schedule.sequentially friends), foes
+    | Seq (a, b) ->
+	let (b', insnsb) = recur insns b in
+	let (a', insnsa) = recur insnsb a in
+	(Seq (a', b')), insnsa
+    | _ -> failwith "delay_friends"
+  in match recur (sched_to_ilist sched) sched with
+  | (s, []) -> s (* assert that all insns have been used *)
+  | _ -> failwith "delay_friends"
+
+(* schedule all instructions in the equivalence class determined
+   by friendp at the point where the first one
+   is executed *)
+let rec anticipate_friends friendp sched =
+  let rec recur insns = function
+    | Done -> (Done, insns)
+    | Instr a ->
+	let (friends, foes) = find_friends friendp a [] [] insns in
+	(Schedule.sequentially friends), foes
+    | Seq (a, b) ->
+	let (a', insnsa) = recur insns a in
+	let (b', insnsb) = recur insnsa b in
+	(Seq (a', b')), insnsb
+    | _ -> failwith "anticipate_friends"
+  in match recur (sched_to_ilist sched) sched with
+  | (s, []) -> s (* assert that all insns have been used *)
+  | _ -> failwith "anticipate_friends"
+
+let collect_buddy_stores buddy_list sched =
+  let rec recur sched delayed_stores = match sched with
+    | Done -> (sched, delayed_stores)
+    | Instr (Assign (v, x)) ->
+	begin
+	  try
+	    let buddies = List.find (List.memq v) buddy_list in 
+	    let tmp = Variable.make_temporary () in
+	    let i = Seq(Instr (Assign (tmp, x)),
+			Instr (Assign (v, Times (NaN MULTI_A, Load tmp))))
+	    and delayed_stores = (v, Load tmp) :: delayed_stores in
+	      try
+		(Seq (i,
+		      Instr (Assign 
+			       (List.hd buddies,
+				Times (NaN MULTI_B,
+				       Plus (List.map 
+					       (fun buddy ->
+						  List.assq buddy 
+						    delayed_stores)
+					       buddies))) )))
+		  , delayed_stores
+	      with Not_found -> (i, delayed_stores)
+	  with Not_found -> (sched, delayed_stores)
+	end
+    | Seq (a, b) ->
+	let (newa, delayed_stores) = recur a delayed_stores in
+	let (newb, delayed_stores) = recur b delayed_stores in
+	  (Seq (newa, newb), delayed_stores)
+    | _ -> failwith "collect_buddy_stores"
+  in let (sched, _) = recur sched [] in
+    sched
+
+let schedule_for_pipeline sched =
+  let update_readytimes t (Assign (v, _)) ready_times = 
+    (v, (t + !Magic.pipeline_latency)) :: ready_times
+  and readyp t ready_times (Assign (_, x)) =
+    List.for_all 
+      (fun var -> 
+	 try 
+	   (List.assq var ready_times) <= t
+	 with Not_found -> false)
+      (List.filter Variable.is_temporary (Expr.find_vars x))
+  in
+  let rec recur sched t ready_times delayed_instructions =
+    let (ready, not_ready) = 
+      List.partition (readyp t ready_times) delayed_instructions 
+    in match ready with
+      | a :: b -> 
+	  let (sched, t, ready_times, delayed_instructions) =
+	    recur sched (t+1) (update_readytimes t a ready_times)
+	      (b @ not_ready)
+	  in
+	    (Seq (Instr a, sched)), t, ready_times, delayed_instructions
+      | _ -> (match sched with
+		| Done -> (sched, t, ready_times, delayed_instructions)
+		| Instr a ->
+		    if (readyp t ready_times a) then
+		      (sched, (t+1), (update_readytimes t a ready_times),
+		       delayed_instructions)
+		    else
+		      (Done, t, ready_times, (a :: delayed_instructions))
+		| Seq (a, b) ->
+		    let (a, t, ready_times, delayed_instructions) =
+		      recur a t ready_times delayed_instructions 
+		    in
+		    let (b, t, ready_times, delayed_instructions) =
+		      recur b t ready_times delayed_instructions 
+		    in (Seq (a, b)), t, ready_times, delayed_instructions
+	        | _ -> failwith "schedule_for_pipeline")
+  in let rec recur_until_done sched t ready_times delayed_instructions =
+      let (sched, t, ready_times, delayed_instructions) = 
+	recur sched t ready_times delayed_instructions
+      in match delayed_instructions with
+	| [] -> sched
+	| _ -> 
+	    (Seq (sched,
+		  (recur_until_done Done (t+1) ready_times 
+		     delayed_instructions)))
+  in recur_until_done sched 0 [] []
+  
+let rec rewrite_declarations force_declarations 
+    (Annotate (_, _, declared, _, what)) =
+  let m = !Magic.number_of_variables in
+
+  let declare_it declared =
+    if (force_declarations or List.length declared >= m) then
+      ([], declared)
+    else
+      (declared, [])
+
+  in match what with
+    ADone -> Annotate ([], [], [], 0, what)
+  | AInstr i -> 
+      let (u, d) = declare_it declared
+      in Annotate ([], u, d, 0, what)
+  | ASeq (a, b) ->
+      let ma = rewrite_declarations false a
+      and mb = rewrite_declarations false b
+      in let Annotate (_, ua, _, _, _) = ma
+      and Annotate (_, ub, _, _, _) = mb
+      in let (u, d) = declare_it (declared @ ua @ ub)
+      in Annotate ([], u, d, 0, ASeq (ma, mb))
+
+let annotate list_of_buddy_stores schedule =
+  let rec analyze live_at_end = function
+      Done -> Annotate (live_at_end, [], [], 0, ADone)
+    | Instr i -> (match i with
+	Assign (v, x) -> 
+	  let vars = (find_vars x) in
+	  Annotate (Util.remove v (union live_at_end vars), [v], [],
+		    0, AInstr i))
+    | Seq (a, b) ->
+	let ab = analyze live_at_end b in
+	let Annotate (live_at_begin_b, defined_b, _, depth_a, _) = ab in
+	let aa = analyze live_at_begin_b a in
+	let Annotate (live_at_begin_a, defined_a, _, depth_b, _) = aa in
+	let defined = List.filter is_temporary (defined_a @ defined_b) in
+	let declarable = diff defined live_at_end in
+	let undeclarable = diff defined declarable 
+	and maxdepth = max depth_a depth_b in
+	Annotate (live_at_begin_a, undeclarable, declarable, 
+		  List.length declarable + maxdepth,
+		  ASeq (aa, ab))
+    | _ -> failwith "really_analyze"
+
+  in 
+  let () = Util.info "begin annotate" in
+  let x = linearize schedule in
+
+  let x =
+    if (!Magic.schedule_for_pipeline && !Magic.pipeline_latency > 0) then
+      schedule_for_pipeline x 
+    else
+      x
+  in
+
+  let x = 
+    if !Magic.reorder_insns then 
+      linearize(anticipate_friends use_same_vars x) 
+    else 
+      x
+  in
+
+  (* delay stores to the real and imaginary parts of the same number *)
+  let x = 
+    if !Magic.reorder_stores then 
+      linearize(delay_friends store_to_same_class x) 
+    else
+      x
+  in
+
+  (* move loads of the real and imaginary parts of the same number *)
+  let x = 
+    if !Magic.reorder_loads then 
+      linearize(anticipate_friends loads_from_same_class x) 
+    else 
+      x
+  in
+
+  let x = collect_buddy_stores list_of_buddy_stores x in
+  let x = analyze [] x in
+  let res = rewrite_declarations true x in
+  let () = Util.info "end annotate" in
+  res
+
+let rec dump print (Annotate (_, _, _, _, code)) =
+  dump_code print code
+and dump_code print = function
+  | ADone -> ()
+  | AInstr x -> print ((assignment_to_string x) ^ "\n")
+  | ASeq (a, b) -> dump print a; dump print b
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/annotate.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/annotate.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Variable
+open Expr
+
+type annotated_schedule = 
+    Annotate of variable list * variable list * variable list *
+	int * aschedule
+and aschedule = 
+    ADone
+  | AInstr of assignment
+  | ASeq of (annotated_schedule * annotated_schedule)
+
+val annotate :
+  variable list list -> Schedule.schedule -> annotated_schedule
+
+val dump : (string -> unit) -> annotated_schedule -> unit
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/assoctable.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/assoctable.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,65 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(*************************************************************
+ *    Functional associative table
+ *************************************************************)
+
+(* 
+ * this module implements a functional associative table.  
+ * The table is parametrized by an equality predicate and
+ * a hash function, with the restriction that (equal a b) ==>
+ * hash a == hash b.
+ * The table is purely functional and implemented using a binary
+ * search tree (not balanced for now)
+ *)
+
+type ('a, 'b) elem = 
+    Leaf 
+  | Node of int * ('a, 'b) elem * ('a, 'b) elem * ('a * 'b) list
+
+let empty = Leaf
+
+let lookup hash equal key table =
+  let h = hash key in
+  let rec look = function
+      Leaf -> None
+    | Node (hash_key, left, right, this_list) ->
+        if (hash_key < h) then look left
+        else if (hash_key > h) then look right
+        else let rec loop = function
+            [] -> None
+          | (a, b) :: rest -> if (equal key a) then Some b else loop rest
+        in loop this_list
+  in look table
+
+let insert hash key value table =
+  let h = hash key in
+  let rec ins = function
+      Leaf -> Node (h, Leaf, Leaf, [(key, value)])
+    | Node (hash_key, left, right, this_list) ->
+        if (hash_key < h) then 
+          Node (hash_key, ins left, right, this_list)
+        else if (hash_key > h) then 
+          Node (hash_key, left, ins right, this_list)
+        else 
+          Node (hash_key, left, right, (key, value) :: this_list)
+  in ins table
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/assoctable.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/assoctable.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type ('a, 'b) elem =
+  | Leaf
+  | Node of int * ('a, 'b) elem * ('a, 'b) elem * ('a * 'b) list
+val empty : ('a, 'b) elem
+val lookup :
+    ('a -> int) -> ('a -> 'b -> bool) -> 'a -> ('b, 'c) elem -> 'c option
+val insert :
+    ('a -> int) -> 'a -> 'c -> ('a, 'c) elem -> ('a, 'c) elem
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,461 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(*
+ * This module contains the definition of a C-like abstract
+ * syntax tree, and functions to convert ML values into C
+ * programs
+ *)
+
+open Expr
+open Annotate
+open List
+
+let realtype = "R"
+let realtypep = realtype ^ " *"
+let extended_realtype = "E"
+let constrealtype = "const " ^ realtype
+let constrealtypep = constrealtype ^ " *"
+
+let stridetype = "stride"
+
+(***********************************
+ * C program structure 
+ ***********************************)
+type c_decl = 
+  | Decl of string * string
+  | Tdecl of string                (* arbitrary text declaration *)
+
+and c_ast =
+  | Asch of annotated_schedule
+  | Simd_leavefun
+  | Return of c_ast
+  | For of c_ast * c_ast * c_ast * c_ast
+  | If of c_ast * c_ast
+  | Block of (c_decl list) * (c_ast list)
+  | Binop of string * c_ast * c_ast
+  | Expr_assign of c_ast * c_ast
+  | Stmt_assign of c_ast * c_ast
+  | Comma of c_ast * c_ast
+  | Integer of int
+  | CVar of string
+  | CCall of string * c_ast
+  | CPlus of c_ast list
+  | ITimes of c_ast * c_ast
+  | CUminus of c_ast
+and c_fcn = Fcn of string * string * (c_decl list) * c_ast
+
+
+let ctimes = function
+  | (Integer 1), a -> a
+  | a, (Integer 1) -> a
+  | a, b -> ITimes (a, b)
+
+(*
+ * C AST unparser 
+ *)
+let foldr_string_concat l = fold_right (^) l ""
+
+let rec unparse_expr_c =
+  let yes x = x and no x = "" in
+
+  let rec unparse_plus maybe = 
+    let maybep = maybe " + " in
+    function
+    | [] -> ""
+    | (Uminus (Times (a, b))) :: (Uminus c) :: d -> 
+	maybep ^ (op "FNMA" a b c) ^ (unparse_plus yes d)
+    | (Uminus c) :: (Uminus (Times (a, b))) :: d -> 
+	maybep ^ (op "FNMA" a b c) ^ (unparse_plus yes d)
+    | (Uminus (Times (a, b))) :: c :: d -> 
+	maybep ^ (op "FNMS" a b c) ^ (unparse_plus yes d)
+    | c :: (Uminus (Times (a, b))) :: d -> 
+	maybep ^ (op "FNMS" a b c) ^ (unparse_plus yes d)
+    | (Times (a, b)) :: (Uminus c) :: d -> 
+	maybep ^ (op "FMS" a b c) ^ (unparse_plus yes d)
+    | (Uminus c) :: (Times (a, b)) :: d -> 
+	maybep ^ (op "FMS" a b c) ^ (unparse_plus yes d)
+    | (Times (a, b)) :: c :: d -> 
+	maybep ^ (op "FMA" a b c) ^ (unparse_plus yes d)
+    | c :: (Times (a, b)) :: d -> 
+	maybep ^ (op "FMA" a b c) ^ (unparse_plus yes d)
+    | (Uminus a :: b) -> 
+	" - " ^ (parenthesize a) ^ (unparse_plus yes b)
+    | (a :: b) -> 
+	maybep ^ (parenthesize a) ^ (unparse_plus yes b)
+  and parenthesize x = match x with
+  | (Load _) -> unparse_expr_c x
+  | (Num _) -> unparse_expr_c x
+  | _ -> "(" ^ (unparse_expr_c x) ^ ")"
+  and op nam a b c =
+    nam ^ "(" ^ (unparse_expr_c a) ^ ", " ^ (unparse_expr_c b) ^ ", " ^
+    (unparse_expr_c c) ^ ")"
+      			      
+  in function
+    | Load v -> Variable.unparse v
+    | Num n -> Number.to_konst n
+    | Plus [] -> "0.0 /* bug */"
+    | Plus [a] -> " /* bug */ " ^ (unparse_expr_c a)
+    | Plus a -> (unparse_plus no a)
+    | Times (a, b) -> (parenthesize a) ^ " * " ^ (parenthesize b)
+    | Uminus (Plus [a; Uminus b]) -> unparse_plus no [b; Uminus a]
+    | Uminus a -> "- " ^ (parenthesize a)
+    | _ -> failwith "unparse_expr_c"
+
+and unparse_expr_generic = 
+  let rec u x = unparse_expr_generic x
+  and unary op a = Printf.sprintf "%s(%s)" op (u a)
+  and binary op a b = Printf.sprintf "%s(%s, %s)" op (u a) (u b)
+  and ternary op a b c = Printf.sprintf "%s(%s, %s, %s)" op (u a) (u b) (u c)
+  and quaternary op a b c d = 
+    Printf.sprintf "%s(%s, %s, %s, %s)" op (u a) (u b) (u c) (u d)
+  and unparse_plus = function
+    | [(Uminus (Times (a, b))); Times (c, d)] -> quaternary "FNMMS" a b c d
+    | [Times (c, d); (Uminus (Times (a, b)))] -> quaternary "FNMMS" a b c d
+    | [Times (c, d); (Times (a, b))] -> quaternary "FMMA" a b c d
+    | [(Uminus (Times (a, b))); c] -> ternary "FNMS" a b c
+    | [c; (Uminus (Times (a, b)))] -> ternary "FNMS" a b c
+    | [(Uminus c); (Times (a, b))] -> ternary "FMS" a b c
+    | [(Times (a, b)); (Uminus c)] -> ternary "FMS" a b c
+    | [c; (Times (a, b))] -> ternary "FMA" a b c
+    | [(Times (a, b)); c] -> ternary "FMA" a b c
+    | [a; Uminus b] -> binary "SUB" a b
+    | [a; b] -> binary "ADD" a b
+    | a :: b :: c -> binary "ADD" a (Plus (b :: c))
+    | _ -> failwith "unparse_plus"
+  in function
+    | Load v -> Variable.unparse v 
+    | Num n -> Number.to_konst n
+    | Plus a -> unparse_plus a
+    | Times (a, b) -> binary "MUL" a b
+    | Uminus a -> unary "NEG" a
+    | _ -> failwith "unparse_expr"
+
+and unparse_expr x = 
+  if !Magic.generic_arith then
+    unparse_expr_generic x
+  else
+    unparse_expr_c x
+
+and unparse_assignment (Assign (v, x)) =
+  (Variable.unparse v) ^ " = " ^ (unparse_expr x) ^ ";\n"
+
+and unparse_annotated force_bracket = 
+  let rec unparse_code = function
+      ADone -> ""
+    | AInstr i -> unparse_assignment i
+    | ASeq (a, b) -> 
+        (unparse_annotated false a) ^ (unparse_annotated false b)
+  and declare_variables l = 
+    let rec uvar = function
+	[] -> failwith "uvar"
+      |	[v] -> (Variable.unparse v) ^ ";\n"
+      | a :: b -> (Variable.unparse a) ^ ", " ^ (uvar b)
+    in let rec vvar l = 
+      let s = if !Magic.compact then 15 else 1 in
+      if (List.length l <= s) then
+	match l with
+	  [] -> ""
+	| _ -> extended_realtype ^ " " ^ (uvar l)
+      else
+	(vvar (Util.take s l)) ^ (vvar (Util.drop s l))
+    in vvar (List.filter Variable.is_temporary l)
+  in function
+      Annotate (_, _, decl, _, code) ->
+        if (not force_bracket) && (Util.null decl) then 
+          unparse_code code
+        else "{\n" ^
+          (declare_variables decl) ^
+          (unparse_code code) ^
+	  "}\n"
+
+and unparse_decl = function
+  | Decl (a, b) -> a ^ " " ^ b ^ ";\n"
+  | Tdecl x -> x
+
+and unparse_ast = 
+  let rec unparse_plus = function
+    | [] -> ""
+    | (CUminus a :: b) -> " - " ^ (parenthesize a) ^ (unparse_plus b)
+    | (a :: b) -> " + " ^ (parenthesize a) ^ (unparse_plus b)
+  and parenthesize x = match x with
+  | (CVar _) -> unparse_ast x
+  | (CCall _) -> unparse_ast x
+  | (Integer _) -> unparse_ast x
+  | _ -> "(" ^ (unparse_ast x) ^ ")"
+
+  in
+  function
+    | Asch a -> (unparse_annotated true a)
+    | Simd_leavefun -> "" (* used only in SIMD code *)
+    | Return x -> "return " ^ unparse_ast x ^ ";"
+    | For (a, b, c, d) ->
+	"for (" ^
+	unparse_ast a ^ "; " ^ unparse_ast b ^ "; " ^ unparse_ast c
+	^ ")" ^ unparse_ast d
+    | If (a, d) ->
+	"if (" ^
+	unparse_ast a 
+	^ ")" ^ unparse_ast d
+    | Block (d, s) ->
+	if (s == []) then ""
+	else 
+          "{\n"                                      ^ 
+          foldr_string_concat (map unparse_decl d)   ^ 
+          foldr_string_concat (map unparse_ast s)    ^
+          "}\n"      
+    | Binop (op, a, b) -> (unparse_ast a) ^ op ^ (unparse_ast b)
+    | Expr_assign (a, b) -> (unparse_ast a) ^ " = " ^ (unparse_ast b)
+    | Stmt_assign (a, b) -> (unparse_ast a) ^ " = " ^ (unparse_ast b) ^ ";\n"
+    | Comma (a, b) -> (unparse_ast a) ^ ", " ^ (unparse_ast b)
+    | Integer i -> string_of_int i
+    | CVar s -> s
+    | CCall (s, x) -> s ^ "(" ^ (unparse_ast x) ^ ")"
+    | CPlus [] -> "0 /* bug */"
+    | CPlus [a] -> " /* bug */ " ^ (unparse_ast a)
+    | CPlus (a::b) -> (parenthesize a) ^ (unparse_plus b)
+    | ITimes (a, b) -> (parenthesize a) ^ " * " ^ (parenthesize b)
+    | CUminus a -> "- " ^ (parenthesize a)
+
+and unparse_function = function
+    Fcn (typ, name, args, body) ->
+      let rec unparse_args = function
+          [Decl (a, b)] -> a ^ " " ^ b 
+	| (Decl (a, b)) :: s -> a ^ " " ^ b  ^ ", "
+            ^  unparse_args s
+	| [] -> ""
+	| _ -> failwith "unparse_function"
+      in 
+      (typ ^ " " ^ name ^ "(" ^ unparse_args args ^ ")\n" ^
+       unparse_ast body)
+
+
+(*************************************************************
+ * traverse a a function and return a list of all expressions,
+ * in the execution order
+ **************************************************************)
+let rec fcn_to_expr_list = fun (Fcn (_, _, _, body)) -> ast_to_expr_list body 
+and acode_to_expr_list = function
+    AInstr (Assign (_, x)) -> [x]
+  | ASeq (a, b) -> 
+      (asched_to_expr_list a) @ (asched_to_expr_list b)
+  | _ -> []
+and asched_to_expr_list (Annotate (_, _, _, _, code)) =
+  acode_to_expr_list code
+and ast_to_expr_list = function
+    Asch a -> asched_to_expr_list a
+  | Block (_, a) -> flatten (map ast_to_expr_list a)
+  | For (_, _, _, body) ->  ast_to_expr_list body
+  | If (_, body) ->  ast_to_expr_list body
+  | _ -> []
+
+(***********************
+ * Extracting Constants
+ ***********************)
+
+(* add a new key & value to a list of (key,value) pairs, where
+   the keys are floats and each key is unique up to almost_equal *)
+
+let extract_constants f =
+  let constlist = flatten (map expr_to_constants (ast_to_expr_list f))
+  in map
+       (fun n ->
+	  Tdecl 
+	    ("DK(" ^ (Number.to_konst n) ^ ", " ^ (Number.to_string n) ^ 
+	       ");\n"))
+       (unique_constants constlist)
+       
+(******************************
+   Extracting operation counts 
+ ******************************)
+
+let count_stack_vars =
+  let rec count_acode = function
+    | ASeq (a, b) -> max (count_asched a) (count_asched b)
+    | _ -> 0
+  and count_asched (Annotate (_, _, decl, _, code)) =
+    (length decl) + (count_acode code)
+  and count_ast = function
+    | Asch a -> count_asched a
+    | Block (d, a) -> (length d) + (Util.max_list (map count_ast a))
+    | For (_, _, _, body) -> count_ast body
+    | If (_, body) -> count_ast body
+    | _ -> 0
+  in function (Fcn (_, _, _, body)) -> count_ast body
+
+let count_memory_acc f =
+  let rec count_var v =
+    if (Variable.is_locative v)	then 1 else 0
+  and count_acode = function
+    | AInstr (Assign (v, _)) -> count_var v
+    | ASeq (a, b) -> (count_asched a) + (count_asched b)
+    | _ -> 0
+  and count_asched = function
+      Annotate (_, _, _, _, code) -> count_acode code
+  and count_ast = function
+    | Asch a -> count_asched a
+    | Block (_, a) -> (Util.sum_list (map count_ast a))
+    | Comma (a, b) -> (count_ast a) + (count_ast b)
+    | For (_, _, _, body) -> count_ast body
+    | If (_, body) -> count_ast body
+    | _ -> 0
+  and count_acc_expr_func acc = function
+    | Load v -> acc + (count_var v)
+    | Plus a -> fold_left count_acc_expr_func acc a
+    | Times (a, b) -> fold_left count_acc_expr_func acc [a; b]
+    | Uminus a -> count_acc_expr_func acc a
+    | _ -> acc
+  in let (Fcn (typ, name, args, body)) = f
+  in (count_ast body) + 
+    fold_left count_acc_expr_func 0 (fcn_to_expr_list f)
+
+let good_for_fma = To_alist.good_for_fma
+
+let build_fma = function
+  | [a; Times (b, c)] when good_for_fma (b, c) -> Some (a, b, c)
+  | [Times (b, c); a] when good_for_fma (b, c) -> Some (a, b, c)
+  | [a; Uminus (Times (b, c))] when good_for_fma (b, c) -> Some (a, b, c)
+  | [Uminus (Times (b, c)); a] when good_for_fma (b, c) -> Some (a, b, c)
+  | _ -> None
+
+let rec count_flops_expr_func (adds, mults, fmas) = function
+  | Plus [] -> (adds, mults, fmas)
+  | Plus ([_; _] as a) -> 
+      begin
+	match build_fma a with
+	  | None ->
+	      fold_left count_flops_expr_func 
+		(adds + (length a) - 1, mults, fmas) a
+	  | Some (a, b, c) ->
+	      fold_left count_flops_expr_func (adds, mults, fmas+1) [a; b; c]
+      end
+  | Plus (a :: b) -> 
+      count_flops_expr_func (adds, mults, fmas) (Plus [a; Plus b])
+  | Times (NaN MULTI_A,_)  -> (adds, mults, fmas)
+  | Times (NaN MULTI_B,_)  -> (adds, mults, fmas)
+  | Times (NaN I,b) -> count_flops_expr_func (adds, mults, fmas) b
+  | Times (NaN CONJ,b) -> count_flops_expr_func (adds, mults, fmas) b
+  | Times (a,b) -> fold_left count_flops_expr_func (adds, mults+1, fmas) [a; b]
+  | CTimes (a,b) -> 
+      fold_left count_flops_expr_func (adds+1, mults+2, fmas) [a; b]
+  | CTimesJ (a,b) -> 
+      fold_left count_flops_expr_func (adds+1, mults+2, fmas) [a; b]
+  | Uminus a -> count_flops_expr_func (adds, mults, fmas) a
+  | _ -> (adds, mults, fmas)
+
+let count_flops f = 
+    fold_left count_flops_expr_func (0, 0, 0) (fcn_to_expr_list f)
+
+let count_constants f = 
+    length (unique_constants (flatten (map expr_to_constants (fcn_to_expr_list f))))
+
+let arith_complexity f =
+  let (a, m, fmas) = count_flops f
+  and v = count_stack_vars f
+  and c = count_constants f
+  and mem = count_memory_acc f
+  in (a, m, fmas, v, c, mem)
+
+(* print the operation costs *)
+let print_cost f =
+  let Fcn (_, _, _, _) = f 
+  and (a, m, fmas, v, c, mem) = arith_complexity f
+  in
+  "/*\n"^
+  " * This function contains " ^
+  (string_of_int (a + fmas)) ^ " FP additions, "  ^
+  (string_of_int (m + fmas)) ^ " FP multiplications,\n" ^
+  " * (or, " ^
+  (string_of_int a) ^ " additions, "  ^
+  (string_of_int m) ^ " multiplications, " ^
+  (string_of_int fmas) ^ " fused multiply/add),\n" ^
+  " * " ^ (string_of_int v) ^ " stack variables, " ^
+  (string_of_int c) ^ " constants, and " ^
+  (string_of_int mem) ^ " memory accesses\n" ^
+  " */\n"
+
+(*****************************************
+ * functions that create C arrays 
+ *****************************************)
+type stride = 
+  | SVar of string
+  | SConst of string
+  | SInteger of int
+  | SNeg of stride
+
+type sstride =
+  | Simple of int
+  | Constant of (string * int)
+  | Composite of (string * int)
+  | Negative of sstride
+
+let rec simplify_stride stride i =
+    match (stride, i) with
+      (_, 0) -> Simple 0
+    | (SInteger n, i) -> Simple (n * i)
+    | (SConst s, i) -> Constant (s, i)
+    | (SVar s, i) -> Composite (s, i)
+    | (SNeg x, i) -> 
+	match (simplify_stride x i) with
+	| Negative y -> y
+	| y -> Negative y
+  
+let rec cstride_to_string = function
+  | Simple i -> string_of_int i
+  | Constant (s, i) -> 
+        if !Magic.lisp_syntax then
+	  "(* " ^ s ^ " " ^ (string_of_int i) ^ ")"
+	else
+	  s ^ " * " ^ (string_of_int i)
+  | Composite (s, i) -> 
+        if !Magic.lisp_syntax then
+	  "(* " ^ s ^ " " ^ (string_of_int i) ^ ")"
+	else
+	  "WS(" ^ s ^ ", " ^ (string_of_int i) ^ ")"
+  | Negative x -> "-" ^ cstride_to_string x
+
+let aref name index = 
+  if !Magic.lisp_syntax then
+    Printf.sprintf "(aref %s %s)"  name index
+  else
+    Printf.sprintf "%s[%s]"  name index
+
+let array_subscript name stride k = 
+  aref name (cstride_to_string (simplify_stride stride k))
+
+let varray_subscript name vstride stride v i = 
+  let vindex = simplify_stride vstride v
+  and iindex = simplify_stride stride i
+  in 
+  let index = 
+    match (vindex, iindex) with
+      (Simple vi, Simple ii) -> string_of_int (vi + ii)
+    | (Simple 0, x) -> cstride_to_string x
+    | (x, Simple 0) -> cstride_to_string x
+    | _ -> (cstride_to_string vindex) ^ " + " ^ (cstride_to_string iindex)
+  in aref name index
+
+let real_of s = "c_re(" ^ s ^ ")"
+let imag_of s = "c_im(" ^ s ^ ")"
+
+let flops_of f =
+  let (add, mul, fma) = count_flops f in
+  Printf.sprintf "{ %d, %d, %d, 0 }" add mul fma
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/c.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/c.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,74 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type stride = 
+  | SVar of string
+  | SConst of string
+  | SInteger of int
+  | SNeg of stride
+val array_subscript : string -> stride -> int -> string
+val varray_subscript : string -> stride -> stride -> int -> int -> string
+
+val real_of : string -> string
+val imag_of : string -> string
+
+val realtype : string
+val realtypep : string
+val constrealtype : string
+val constrealtypep : string
+val stridetype : string
+
+type c_decl = 
+  | Decl of string * string
+  | Tdecl of string                (* arbitrary text declaration *)
+
+and c_ast =
+  | Asch of Annotate.annotated_schedule
+  | Simd_leavefun
+  | Return of c_ast
+  | For of c_ast * c_ast * c_ast * c_ast
+  | If of c_ast * c_ast
+  | Block of (c_decl list) * (c_ast list)
+  | Binop of string * c_ast * c_ast
+  | Expr_assign of c_ast * c_ast
+  | Stmt_assign of c_ast * c_ast
+  | Comma of c_ast * c_ast
+  | Integer of int
+  | CVar of string
+  | CCall of string * c_ast
+  | CPlus of c_ast list
+  | ITimes of c_ast * c_ast
+  | CUminus of c_ast
+and c_fcn = | Fcn of string * string * c_decl list * c_ast
+
+val unparse_expr : Expr.expr -> string
+val unparse_assignment : Expr.assignment -> string
+val unparse_annotated : bool -> Annotate.annotated_schedule -> string
+val unparse_decl : c_decl -> string
+val unparse_ast : c_ast -> string
+val unparse_function : c_fcn -> string
+
+val flops_of : c_fcn -> string
+val print_cost : c_fcn -> string
+
+val ast_to_expr_list : c_ast -> Expr.expr list
+val extract_constants : c_ast -> c_decl list
+val ctimes : (c_ast * c_ast) -> c_ast
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/complex.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/complex.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,147 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* abstraction layer for complex operations *)
+open Littlesimp
+open Expr
+
+(* type of complex expressions *)
+type expr = CE of Expr.expr * Expr.expr
+
+let two = CE (makeNum Number.two, makeNum Number.zero)
+let one = CE (makeNum Number.one, makeNum Number.zero)
+let i = CE (makeNum Number.zero, makeNum Number.one)
+let zero = CE (makeNum Number.zero, makeNum Number.zero)
+let make (r, i) = CE (r, i)
+
+let uminus (CE (a, b)) =  CE (makeUminus a, makeUminus b)
+
+let inverse_int n = CE (makeNum (Number.div Number.one (Number.of_int n)),
+			makeNum Number.zero)
+
+let inverse_int_sqrt n = 
+  CE (makeNum (Number.div Number.one (Number.sqrt (Number.of_int n))),
+      makeNum Number.zero)
+let int_sqrt n = 
+  CE (makeNum (Number.sqrt (Number.of_int n)),
+      makeNum Number.zero)
+
+let nan x = CE (NaN x, makeNum Number.zero)
+
+let half = inverse_int 2
+
+let times3x3 (CE (a, b)) (CE (c, d)) = 
+  CE (makePlus [makeTimes (c, makePlus [a; makeUminus (b)]);
+	        makeTimes (b, makePlus [c; makeUminus (d)])],
+      makePlus [makeTimes (a, makePlus [c; d]);
+	        makeUminus(makeTimes (c, makePlus [a; makeUminus (b)]))])
+
+let times (CE (a, b)) (CE (c, d)) = 
+  if not !Magic.threemult then
+    CE (makePlus [makeTimes (a, c); makeUminus (makeTimes (b, d))],
+        makePlus [makeTimes (a, d); makeTimes (b, c)])
+  else if is_constant c && is_constant d then
+    times3x3 (CE (a, b)) (CE (c, d))
+  else (* hope a and b are constant expressions *)
+    times3x3 (CE (c, d)) (CE (a, b))
+
+let ctimes (CE (a, _)) (CE (c, _)) = 
+  CE (CTimes (a, c), makeNum Number.zero)
+
+let ctimesj (CE (a, _)) (CE (c, _)) = 
+  CE (CTimesJ (a, c), makeNum Number.zero)
+      
+(* complex exponential (of root of unity); returns exp(2*pi*i/n * m) *)
+let exp n i =
+  let (c, s) = Number.cexp n i
+  in CE (makeNum c, makeNum s)
+
+(* various trig functions evaluated at (2*pi*i/n * m) *)
+let sec n m =
+  let (c, s) = Number.cexp n m
+  in CE (makeNum (Number.div Number.one c), makeNum Number.zero)
+let csc n m =
+  let (c, s) = Number.cexp n m
+  in CE (makeNum (Number.div Number.one s), makeNum Number.zero)
+let tan n m =
+  let (c, s) = Number.cexp n m
+  in CE (makeNum (Number.div s c), makeNum Number.zero)
+let cot n m =
+  let (c, s) = Number.cexp n m
+  in CE (makeNum (Number.div c s), makeNum Number.zero)
+    
+(* complex sum *)
+let plus a =
+  let rec unzip_complex = function
+      [] -> ([], [])
+    | ((CE (a, b)) :: s) ->
+        let (r,i) = unzip_complex s
+	in
+	(a::r), (b::i) in
+  let (c, d) = unzip_complex a in
+  CE (makePlus c, makePlus d)
+
+(* extract real/imaginary *)
+let real (CE (a, b)) = CE (a, makeNum Number.zero)
+let imag (CE (a, b)) = CE (b, makeNum Number.zero)
+let iimag (CE (a, b)) = CE (makeNum Number.zero, b)
+let conj (CE (a, b)) = CE (a, makeUminus b)
+
+    
+(* abstraction of sum_{i=0}^{n-1} *)
+let sigma a b f = plus (List.map f (Util.interval a b))
+
+(* store and assignment operations *)
+let store_real v (CE (a, b)) = Expr.Store (v, a)
+let store_imag v (CE (a, b)) = Expr.Store (v, b)
+let store (vr, vi) x = (store_real vr x, store_imag vi x)
+
+let assign_real v (CE (a, b)) = Expr.Assign (v, a)
+let assign_imag v (CE (a, b)) = Expr.Assign (v, b)
+let assign (vr, vi) x = (assign_real vr x, assign_imag vi x)
+
+
+(************************
+   shortcuts 
+ ************************)
+let (@*) = times
+let (@+) a b = plus [a; b]
+let (@-) a b = plus [a; uminus b]
+
+(* type of complex signals *)
+type signal = int -> expr
+
+(* make a finite signal infinite *)
+let infinite n signal i = if ((0 <= i) && (i < n)) then signal i else zero
+
+let hermitian n a =
+  Util.array n (fun i ->
+    if (i = 0) then real (a 0)
+    else if (i < n - i)  then (a i)
+    else if (i > n - i)  then conj (a (n - i))
+    else real (a i))
+
+let antihermitian n a =
+  Util.array n (fun i ->
+    if (i = 0) then iimag (a 0)
+    else if (i < n - i)  then (a i)
+    else if (i > n - i)  then uminus (conj (a (n - i)))
+    else iimag (a i))
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/complex.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/complex.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,68 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type expr
+val make : (Expr.expr * Expr.expr) -> expr
+val two : expr
+val one : expr
+val i : expr
+val zero : expr
+val half : expr
+val inverse_int : int -> expr
+val inverse_int_sqrt : int -> expr
+val int_sqrt : int -> expr
+val times : expr -> expr -> expr
+val ctimes : expr -> expr -> expr
+val ctimesj : expr -> expr -> expr
+val uminus : expr -> expr
+val exp : int -> int -> expr
+val sec : int -> int -> expr
+val csc : int -> int -> expr
+val tan : int -> int -> expr
+val cot : int -> int -> expr
+val plus : expr list -> expr
+val real : expr -> expr
+val imag : expr -> expr
+val conj : expr -> expr
+val nan : Expr.transcendent -> expr
+val sigma : int -> int -> (int -> expr) -> expr
+
+val (@*) : expr -> expr -> expr
+val (@+) : expr -> expr -> expr
+val (@-) : expr -> expr -> expr
+
+(* a signal is a map from integers to expressions *)
+type signal = int -> expr
+val infinite : int -> signal -> signal
+
+val store_real : Variable.variable -> expr -> Expr.expr
+val store_imag : Variable.variable -> expr -> Expr.expr
+val store :
+  Variable.variable * Variable.variable -> expr -> Expr.expr * Expr.expr
+
+val assign_real : Variable.variable -> expr -> Expr.assignment
+val assign_imag : Variable.variable -> expr -> Expr.assignment
+val assign :
+  Variable.variable * Variable.variable ->
+  expr -> Expr.assignment * Expr.assignment
+
+val hermitian : int -> (int -> expr) -> int -> expr
+val antihermitian : int -> (int -> expr) -> int -> expr
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/conv.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/conv.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,130 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+*)
+
+open Complex
+open Util
+
+let polyphase m a ph i = a (m * i + ph)
+
+let rec divmod n i =
+  if (i < 0) then 
+    let (a, b) = divmod n (i + n)
+    in (a - 1, b)
+  else (i / n, i mod n)
+
+let unpolyphase m a i = let (x, y) = divmod m i in a y x
+
+let lift2 f a b i = f (a i) (b i)
+
+(* convolution of signals A and B *)
+let rec conv na a nb b =
+  let rec naive na a nb b i =
+    sigma 0 na (fun j -> (a j) @* (b (i - j)))
+
+  and recur na a nb b =
+    if (na <= 1 || nb <= 1) then
+      naive na a nb b
+    else
+      let p = polyphase 2 in
+      let ee = conv (na - na / 2) (p a 0) (nb - nb / 2) (p b 0)
+      and eo = conv (na - na / 2) (p a 0) (nb / 2) (p b 1)
+      and oe = conv (na / 2) (p a 1) (nb - nb / 2) (p b 0)
+      and oo = conv (na / 2) (p a 1) (nb / 2) (p b 1) in
+      unpolyphase 2 (function
+	  0 -> fun i -> (ee i) @+ (oo (i - 1))
+	| 1 -> fun i -> (eo i) @+ (oe i) 
+	| _ -> failwith "recur")
+
+
+  (* Karatsuba variant 1: (a+bx)(c+dx) = (ac+bdxx)+((a+b)(c+d)-ac-bd)x *)
+  and karatsuba1 na a nb b =
+      let p = polyphase 2 in
+      let ae = p a 0 and nae = na - na / 2
+      and ao = p a 1 and nao = na / 2
+      and be = p b 0 and nbe = nb - nb / 2
+      and bo = p b 1 and nbo = nb / 2 in
+      let ae = infinite nae ae and ao = infinite nao ao
+      and be = infinite nbe be and bo = infinite nbo bo in
+      let aeo = lift2 (@+) ae ao and naeo = nae
+      and beo = lift2 (@+) be bo and nbeo = nbe in
+      let ee = conv nae ae nbe be 
+      and oo = conv nao ao nbo bo
+      and eoeo = conv naeo aeo nbeo beo in
+
+      let q = function
+	  0 -> fun i -> (ee i)  @+ (oo (i - 1))
+	| 1 -> fun i -> (eoeo i) @- ((ee i) @+ (oo i))
+	| _ -> failwith "karatsuba1" in
+      unpolyphase 2 q
+
+  (* Karatsuba variant 2: 
+     (a+bx)(c+dx) = ((a+b)c-b(c-dxx))+x((a+b)c-a(c-d)) *)
+  and karatsuba2 na a nb b =
+      let p = polyphase 2 in
+      let ae = p a 0 and nae = na - na / 2
+      and ao = p a 1 and nao = na / 2
+      and be = p b 0 and nbe = nb - nb / 2
+      and bo = p b 1 and nbo = nb / 2 in
+      let ae = infinite nae ae and ao = infinite nao ao
+      and be = infinite nbe be and bo = infinite nbo bo in
+
+      let c1 = conv nae (lift2 (@+) ae ao) nbe be
+      and c2 = conv nao ao (nbo + 1) (fun i -> be i @- bo (i - 1))
+      and c3 = conv nae ae nbe (lift2 (@-) be bo) in
+
+      let q = function
+	  0 -> lift2 (@-) c1 c2
+	| 1 -> lift2 (@-) c1 c3
+	| _ -> failwith "karatsuba2" in
+      unpolyphase 2 q
+
+  and karatsuba na a nb b =
+    let m = na + nb - 1 in
+    if (m < !Magic.karatsuba_min) then
+      recur na a nb b
+    else
+      match !Magic.karatsuba_variant with
+	1 -> karatsuba1 na a nb b
+      |	2 -> karatsuba2 na a nb b
+      |	_ -> failwith "unknown karatsuba variant"
+
+  and via_circular na a nb b =
+    let m = na + nb - 1 in
+    if (m < !Magic.circular_min) then
+      karatsuba na a nb b
+    else
+      let rec find_min n = if n >= m then n else find_min (2 * n) in
+      circular (find_min 1) a b
+
+  in
+  let a = infinite na a and b = infinite nb b in
+  let res = array (na + nb - 1) (via_circular na a nb b) in
+  infinite (na + nb - 1) res
+    
+and circular n a b =
+  let via_dft n a b =
+    let fa = Fft.dft (-1) n a 
+    and fb = Fft.dft (-1) n b
+    and scale = inverse_int n in
+    let fab i = ((fa i) @* (fb i)) @* scale in
+    Fft.dft 1 n fab
+
+  in via_dft n a b
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/conv.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/conv.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val conv : int -> Complex.signal -> int -> Complex.signal -> Complex.signal
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/dag.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/dag.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,109 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+
+(* Here, we have functions to transform a sequence of assignments
+   (variable = expression) into a DAG (a directed, acyclic graph).
+   The nodes of the DAG are the assignments, and the edges indicate
+   dependencies.  (The DAG is analyzed in the scheduler to find an
+   efficient ordering of the assignments.)
+
+   This file also contains utilities to manipulate the DAG in various
+   ways. *)
+
+(********************************************
+ *  Dag structure
+ ********************************************)
+type color = RED | BLUE | BLACK | YELLOW
+
+type dagnode = 
+    { assigned: Variable.variable;
+      mutable expression: Expr.expr;
+      input_variables: Variable.variable list;
+      mutable successors: dagnode list;
+      mutable predecessors: dagnode list;
+      mutable label: int;
+      mutable color: color}
+
+type dag = Dag of (dagnode list)
+
+(* true if node uses v *)
+let node_uses v node = 
+  List.exists (Variable.same v) node.input_variables
+
+(* true if assignment of v clobbers any input of node *)
+let node_clobbers node v = 
+  List.exists (Variable.same_location v) node.input_variables
+
+(* true if nodeb depends on nodea *)
+let depends_on nodea nodeb =
+  node_uses nodea.assigned nodeb or
+  node_clobbers nodea nodeb.assigned
+
+(* transform an assignment list into a dag *)
+let makedag alist =
+  let dag = List.map
+      (fun assignment ->
+	let (v, x) = assignment in
+	{ assigned = v;
+	  expression = x;
+	  input_variables = Expr.find_vars x;
+	  successors = [];
+	  predecessors = [];
+	  label = 0;
+	  color = BLACK })
+      alist
+  in begin
+    for_list dag (fun i ->
+	for_list dag (fun j ->
+	  if depends_on i j then begin
+	    i.successors <- j :: i.successors;
+	    j.predecessors <- i :: j.predecessors;
+	  end));
+    Dag dag;
+  end
+
+let map f (Dag dag) = Dag (List.map f dag)
+let for_all (Dag dag) f = 
+  (* type system loophole *)
+  let make_unit _ = () in
+  make_unit (List.map f dag)
+let to_list (Dag dag) = dag
+
+let find_node f (Dag dag) = Util.find_elem f dag
+
+(* breadth-first search *)
+let rec bfs (Dag dag) node init_label =
+  let _ =  node.label <- init_label in
+  let rec loop = function
+      [] -> ()
+    | node :: rest ->
+	let neighbors = node.predecessors @ node.successors in
+	let m = min_list (List.map (fun node -> node.label) neighbors) in
+	if (node.label > m + 1) then begin
+	  node.label <- m + 1;
+	  loop (rest @ neighbors);
+	end else
+	  loop rest
+  in let neighbors = node.predecessors @ node.successors in
+  loop neighbors
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/dag.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/dag.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+
+type color = | RED | BLUE | BLACK | YELLOW
+
+type dagnode = 
+    { assigned: Variable.variable;
+      mutable expression: Expr.expr;
+      input_variables: Variable.variable list;
+      mutable successors: dagnode list;
+      mutable predecessors: dagnode list;
+      mutable label: int;
+      mutable color: color}
+
+type dag
+
+val makedag : (Variable.variable * Expr.expr) list -> dag
+
+val map : (dagnode -> dagnode) -> dag -> dag
+val for_all : dag -> (dagnode -> unit) -> unit
+val to_list : dag -> (dagnode list)
+val bfs : dag -> dagnode -> int -> unit
+val find_node : (dagnode -> bool) -> dag -> dagnode option
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/expr.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/expr.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,152 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* Here, we define the data type encapsulating a symbolic arithmetic
+   expression, and provide some routines for manipulating it. *)
+
+(* I will regret this hack : *)
+(* NEWS: I did *)
+type transcendent = I | MULTI_A | MULTI_B | CONJ
+
+type expr =
+  | Num of Number.number
+  | NaN of transcendent
+  | Plus of expr list
+  | Times of expr * expr
+  | CTimes of expr * expr
+  | CTimesJ of expr * expr  (* CTimesJ (a, b) = conj(a) * b *)
+  | Uminus of expr
+  | Load of Variable.variable
+  | Store of Variable.variable * expr
+
+type assignment = Assign of Variable.variable * expr
+
+(* various hash functions *)
+let hash_float x = 
+  let (mantissa, exponent) = frexp x
+  in truncate (float_of_int(exponent) *. 1234.567 +. mantissa *. 10000.0)
+
+let sum_list l = List.fold_right (+) l 0
+
+let transcendent_to_float = function
+  | I -> 2.718281828459045235360287471  (* any transcendent number will do *)
+  | MULTI_A -> 0.6931471805599453094172321214
+  | MULTI_B -> -0.3665129205816643270124391582
+  | CONJ -> 0.6019072301972345747375400015
+
+let rec hash = function
+  | Num x -> hash_float (Number.to_float x)
+  | NaN x -> hash_float (transcendent_to_float x)
+  | Load v -> 1 + 1237 * Variable.hash v
+  | Store (v, x) -> 2 * Variable.hash v - 2345 * hash x
+  | Plus l -> 5 + 23451 * sum_list (List.map Hashtbl.hash l)
+  | Times (a, b) -> 41 + 31415 * (Hashtbl.hash a +  Hashtbl.hash b)
+  | CTimes (a, b) -> 49 + 3245 * (Hashtbl.hash a +  Hashtbl.hash b)
+  | CTimesJ (a, b) -> 31 + 3471 * (Hashtbl.hash a +  Hashtbl.hash b)
+  | Uminus x -> 42 + 12345 * (hash x)
+
+(* find all variables *)
+let rec find_vars x =
+  match x with
+  | Load y -> [y]
+  | Plus l -> List.flatten (List.map find_vars l)
+  | Times (a, b) -> (find_vars a) @ (find_vars b)
+  | CTimes (a, b) -> (find_vars a) @ (find_vars b)
+  | CTimesJ (a, b) -> (find_vars a) @ (find_vars b)
+  | Uminus a -> find_vars a
+  | _ -> []
+
+
+(* TRUE if expression is a constant *)
+let is_constant = function
+  | Num _ -> true
+  | NaN _ -> true
+  | Load v -> Variable.is_constant v
+  | _ -> false
+
+let is_known_constant = function
+  | Num _ -> true
+  | NaN _ -> true
+  | _ -> false
+
+(* expr to string, used for debugging *)
+let rec foldr_string_concat l = 
+  match l with
+    [] -> ""
+  | [a] -> a
+  | a :: b -> a ^ " " ^ (foldr_string_concat b)
+
+let string_of_transcendent = function
+  | I -> "I"
+  | MULTI_A -> "MULTI_A"
+  | MULTI_B -> "MULTI_B"
+  | CONJ -> "CONJ"
+
+let rec to_string = function
+  | Load v -> Variable.unparse v
+  | Num n -> string_of_float (Number.to_float n)
+  | NaN n -> string_of_transcendent n
+  | Plus x -> "(+ " ^ (foldr_string_concat (List.map to_string x)) ^ ")"
+  | Times (a, b) -> "(* " ^ (to_string a) ^ " " ^ (to_string b) ^ ")"
+  | CTimes (a, b) -> "(c* " ^ (to_string a) ^ " " ^ (to_string b) ^ ")"
+  | CTimesJ (a, b) -> "(cj* " ^ (to_string a) ^ " " ^ (to_string b) ^ ")"
+  | Uminus a -> "(- " ^ (to_string a) ^ ")"
+  | Store (v, a) -> "(:= " ^ (Variable.unparse v) ^ " " ^
+      (to_string a) ^ ")"
+
+let rec to_string_a d x = 
+  if (d = 0) then "..." else match x with
+  | Load v -> Variable.unparse v
+  | Num n -> Number.to_konst n
+  | NaN n -> string_of_transcendent n
+  | Plus x -> "(+ " ^ (foldr_string_concat (List.map (to_string_a (d - 1)) x)) ^ ")"
+  | Times (a, b) -> "(* " ^ (to_string_a (d - 1) a) ^ " " ^ (to_string_a (d - 1) b) ^ ")"
+  | CTimes (a, b) -> "(c* " ^ (to_string_a (d - 1) a) ^ " " ^ (to_string_a (d - 1) b) ^ ")"
+  | CTimesJ (a, b) -> "(cj* " ^ (to_string_a (d - 1) a) ^ " " ^ (to_string_a (d - 1) b) ^ ")"
+  | Uminus a -> "(- " ^ (to_string_a (d-1) a) ^ ")"
+  | Store (v, a) -> "(:= " ^ (Variable.unparse v) ^ " " ^
+      (to_string_a (d-1) a) ^ ")"
+
+let to_string = to_string_a 10
+
+let assignment_to_string = function
+  | Assign (v, a) -> "(:= " ^ (Variable.unparse v) ^ " " ^ (to_string a) ^ ")"
+
+let dump print = List.iter (fun x -> print ((assignment_to_string x) ^ "\n"))
+
+(* find all constants in a given expression *)
+let rec expr_to_constants = function
+  | Num n -> [n]
+  | Plus a -> List.flatten (List.map expr_to_constants a)
+  | Times (a, b) -> (expr_to_constants a) @ (expr_to_constants b)
+  | CTimes (a, b) -> (expr_to_constants a) @ (expr_to_constants b)
+  | CTimesJ (a, b) -> (expr_to_constants a) @ (expr_to_constants b)
+  | Uminus a -> expr_to_constants a
+  | _ -> []
+
+
+let add_float_key_value list_so_far k = 
+  if List.exists (fun k2 -> Number.equal k k2) list_so_far then
+    list_so_far
+  else
+    k :: list_so_far
+
+let unique_constants = List.fold_left add_float_key_value [] 
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/expr.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/expr.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,51 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type transcendent = I | MULTI_A | MULTI_B | CONJ
+
+type expr =
+  | Num of Number.number
+  | NaN of transcendent
+  | Plus of expr list
+  | Times of expr * expr
+  | CTimes of expr * expr
+  | CTimesJ of expr * expr
+  | Uminus of expr
+  | Load of Variable.variable
+  | Store of Variable.variable * expr
+
+type assignment = Assign of Variable.variable * expr
+
+val hash_float : float -> int
+val hash : expr -> int
+val to_string : expr -> string
+val assignment_to_string : assignment -> string
+val transcendent_to_float : transcendent -> float
+val string_of_transcendent : transcendent -> string
+
+val find_vars : expr -> Variable.variable list
+val is_constant : expr -> bool
+val is_known_constant : expr -> bool
+
+val dump : (string -> unit) -> assignment list -> unit
+
+val expr_to_constants : expr -> Number.number list
+val unique_constants : Number.number list -> Number.number list
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/fft.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/fft.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,307 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+
+(* This is the part of the generator that actually computes the FFT
+   in symbolic form *)
+
+open Complex
+open Util
+
+(* choose a suitable factor of n *)
+let choose_factor n =
+  (* first choice: i such that gcd(i, n / i) = 1, i as big as possible *)
+  let choose1 n =
+    let rec loop i f =
+      if (i * i > n) then f
+      else if ((n mod i) == 0 && gcd i (n / i) == 1) then loop (i + 1) i
+      else loop (i + 1) f
+    in loop 1 1
+
+  (* second choice: the biggest factor i of n, where i < sqrt(n), if any *)
+  and choose2 n =
+    let rec loop i f =
+      if (i * i > n) then f
+      else if ((n mod i) == 0) then loop (i + 1) i
+      else loop (i + 1) f
+    in loop 1 1
+
+  in let i = choose1 n in
+  if (i > 1) then i
+  else choose2 n
+
+let is_power_of_two n = (n > 0) && ((n - 1) land n == 0)
+  
+let rec dft_prime sign n input = 
+  let sum filter i =
+    sigma 0 n (fun j ->
+      let coeff = filter (exp n (sign * i * j))
+      in coeff @* (input j)) in
+  let computation_even = array n (sum identity)
+  and computation_odd =
+    let sumr = array n (sum real)
+    and sumi = array n (sum ((times Complex.i) @@ imag)) in
+    array n (fun i ->
+      if (i = 0) then
+	(* expose some common subexpressions *)
+	input 0 @+ 
+	sigma 1 ((n + 1) / 2) (fun j -> input j @+ input (n - j))
+      else
+	let i' = min i (n - i) in
+	if (i < n - i) then 
+	  sumr i' @+ sumi i'
+	else
+	  sumr i' @- sumi i') in
+  if (n >= !Magic.rader_min) then
+    dft_rader sign n input
+  else if (n == 2) then
+    computation_even
+  else
+    computation_odd 
+
+
+and dft_rader sign p input =
+  let half = 
+    let one_half = inverse_int 2 in
+    times one_half
+
+  and make_product n a b =
+    let scale_factor = inverse_int n in
+    array n (fun i -> a i @* (scale_factor @* b i)) in
+
+  (* generates a convolution using ffts.  (all arguments are the
+     same as to gen_convolution, below) *)
+  let gen_convolution_by_fft n a b addtoall =
+    let fft_a = dft 1 n a
+    and fft_b = dft 1 n b in 
+
+    let fft_ab = make_product n fft_a fft_b
+    and dc_term i = if (i == 0) then addtoall else zero in
+
+    let fft_ab1 = array n (fun i -> fft_ab i @+ dc_term i)
+    and sum = fft_a 0 in
+    let conv = dft (-1) n fft_ab1 in
+    (sum, conv)
+
+  (* alternate routine for convolution.  Seems to work better for
+     small sizes.  I have no idea why. *)
+  and gen_convolution_by_fft_alt n a b addtoall =
+    let ap = array n (fun i -> half (a i @+ a ((n - i) mod n)))
+    and am = array n (fun i -> half (a i @- a ((n - i) mod n)))
+    and bp = array n (fun i -> half (b i @+ b ((n - i) mod n)))
+    and bm = array n (fun i -> half (b i @- b ((n - i) mod n)))
+    in
+
+    let fft_ap = dft 1 n ap
+    and fft_am = dft 1 n am
+    and fft_bp = dft 1 n bp
+    and fft_bm = dft 1 n bm in
+
+    let fft_abpp = make_product n fft_ap fft_bp
+    and fft_abpm = make_product n fft_ap fft_bm
+    and fft_abmp = make_product n fft_am fft_bp
+    and fft_abmm = make_product n fft_am fft_bm 
+    and sum = fft_ap 0 @+ fft_am 0
+    and dc_term i = if (i == 0) then addtoall else zero in
+
+    let fft_ab1 = array n (fun i -> (fft_abpp i @+ fft_abmm i) @+ dc_term i)
+    and fft_ab2 = array n (fun i -> fft_abpm i @+ fft_abmp i) in
+    let conv1 = dft (-1) n fft_ab1 
+    and conv2 = dft (-1) n fft_ab2 in
+    let conv = array n (fun i ->
+      conv1 i @+ conv2 i) in
+    (sum, conv) 
+
+    (* generator of assignment list assigning conv to the convolution of
+       a and b, all of which are of length n.  addtoall is added to
+       all of the elements of the result.  Returns (sum, convolution) pair
+       where sum is the sum of the elements of a. *)
+
+  in let gen_convolution = 
+    if (p <= !Magic.alternate_convolution) then 
+      gen_convolution_by_fft_alt
+    else
+      gen_convolution_by_fft
+
+  (* fft generator for prime n = p using Rader's algorithm for
+     turning the fft into a convolution, which then can be
+     performed in a variety of ways *)
+  in  
+    let g = find_generator p in
+    let ginv = pow_mod g (p - 2) p in
+    let input_perm = array p (fun i -> input (pow_mod g i p))
+    and omega_perm = array p (fun i -> exp p (sign * (pow_mod ginv i p)))
+    and output_perm = array p (fun i -> pow_mod ginv i p)
+    in let (sum, conv) = 
+      (gen_convolution (p - 1)  input_perm omega_perm (input 0))
+    in array p (fun i ->
+      if (i = 0) then
+	input 0 @+ sum
+      else
+	let i' = suchthat 0 (fun i' -> i = output_perm i')
+	in conv i')
+
+(* our modified version of the conjugate-pair split-radix algorithm,
+   which reduces the number of multiplications by rescaling the 
+   sub-transforms (power-of-two n's only) *)
+and newsplit sign n input =
+  let rec s n k = (* recursive scale factor *)
+    if n <= 4 then
+      one
+    else 
+      let k4 = (abs k) mod (n / 4) in
+      let k4' = if k4 <= (n / 8) then k4 else (n/4 - k4) in
+      (s (n / 4) k4') @* (real (exp n k4'))
+			  
+  and sinv n k = (* 1 / s(n,k) *)
+    if n <= 4 then
+      one
+    else 
+      let k4 = (abs k) mod (n / 4) in
+      let k4' = if k4 <= (n / 8) then k4 else (n/4 - k4) in
+      (sinv (n / 4) k4') @* (sec n k4')
+
+  in let sdiv2 n k = (s n k) @* (sinv (2*n) k) (* s(n,k) / s(2*n,k) *)
+  and sdiv4 n k = (* s(n,k) / s(4*n,k) *)
+    let k4 = (abs k) mod n in
+    sec (4*n) (if k4 <= (n / 2) then k4 else (n - k4))
+      
+  in let t n k = (exp n k) @* (sdiv4 (n/4) k)
+
+  and dft1 input = input
+  and dft2 input = array 2 (fun k -> (input 0) @+ ((input 1) @* exp 2 k))
+
+  in let rec newsplit0 sign n input =
+    if (n == 1) then dft1 input
+    else if (n == 2) then dft2 input
+    else let u = newsplit0 sign (n / 2) (fun i -> input (i*2))
+    and z = newsplitS sign (n / 4) (fun i -> input (i*4 + 1))
+    and z' = newsplitS sign (n / 4) (fun i -> input ((n + i*4 - 1) mod n)) 
+    and twid = array n (fun k -> s (n/4) k @* exp n (sign * k)) in
+    let w = array n (fun k -> twid k @* z (k mod (n / 4)))
+    and w' = array n (fun k -> conj (twid k) @* z' (k mod (n / 4))) in
+    let ww = array n (fun k -> w k @+ w' k) in
+    array n (fun k -> u (k mod (n / 2)) @+ ww k)
+      
+  and newsplitS sign n input =
+    if (n == 1) then dft1 input
+    else if (n == 2) then dft2 input
+    else let u = newsplitS2 sign (n / 2) (fun i -> input (i*2))
+    and z = newsplitS sign (n / 4) (fun i -> input (i*4 + 1))
+    and z' = newsplitS sign (n / 4) (fun i -> input ((n + i*4 - 1) mod n)) in
+    let w = array n (fun k -> t n (sign * k) @* z (k mod (n / 4)))
+    and w' = array n (fun k -> conj (t n (sign * k)) @* z' (k mod (n / 4))) in
+    let ww = array n (fun k -> w k @+ w' k) in
+    array n (fun k -> u (k mod (n / 2)) @+ ww k)
+      
+  and newsplitS2 sign n input =
+    if (n == 1) then dft1 input
+    else if (n == 2) then dft2 input
+    else let u = newsplitS4 sign (n / 2) (fun i -> input (i*2))
+    and z = newsplitS sign (n / 4) (fun i -> input (i*4 + 1))
+    and z' = newsplitS sign (n / 4) (fun i -> input ((n + i*4 - 1) mod n)) in
+    let w = array n (fun k -> t n (sign * k) @* z (k mod (n / 4)))
+    and w' = array n (fun k -> conj (t n (sign * k)) @* z' (k mod (n / 4))) in
+    let ww = array n (fun k -> (w k @+ w' k) @* (sdiv2 n k)) in
+    array n (fun k -> u (k mod (n / 2)) @+ ww k)
+      
+  and newsplitS4 sign n input =
+    if (n == 1) then dft1 input
+    else if (n == 2) then 
+      let f = dft2 input
+      in array 2 (fun k -> (f k) @* (sinv 8 k))
+    else let u = newsplitS2 sign (n / 2) (fun i -> input (i*2))
+    and z = newsplitS sign (n / 4) (fun i -> input (i*4 + 1))
+    and z' = newsplitS sign (n / 4) (fun i -> input ((n + i*4 - 1) mod n)) in
+    let w = array n (fun k -> t n (sign * k) @* z (k mod (n / 4)))
+    and w' = array n (fun k -> conj (t n (sign * k)) @* z' (k mod (n / 4))) in
+    let ww = array n (fun k -> w k @+ w' k) in
+    array n (fun k -> (u (k mod (n / 2)) @+ ww k) @* (sdiv4 n k))
+      
+  in newsplit0 sign n input
+ 
+and dft sign n input =
+  let rec cooley_tukey sign n1 n2 input =
+    let tmp1 = 
+      array n2 (fun i2 -> 
+	dft sign n1 (fun i1 -> input (i1 * n2 + i2))) in
+    let tmp2 =  
+      array n1 (fun i1 ->
+	array n2 (fun i2 ->
+	  exp n (sign * i1 * i2) @* tmp1 i2 i1)) in
+    let tmp3 = array n1 (fun i1 -> dft sign n2 (tmp2 i1)) in
+    (fun i -> tmp3 (i mod n1) (i / n1))
+
+  (*
+   * This is "exponent -1" split-radix by Dan Bernstein.
+   *)
+  and split_radix_dit sign n input =
+    let f0 = dft sign (n / 2) (fun i -> input (i * 2))
+    and f10 = dft sign (n / 4) (fun i -> input (i * 4 + 1))
+    and f11 = dft sign (n / 4) (fun i -> input ((n + i * 4 - 1) mod n)) in
+    let g10 = array n (fun k ->
+      exp n (sign * k) @* f10 (k mod (n / 4)))
+    and g11 = array n (fun k ->
+      exp n (- sign * k) @* f11 (k mod (n / 4))) in
+    let g1 = array n (fun k -> g10 k @+ g11 k) in
+    array n (fun k -> f0 (k mod (n / 2)) @+ g1 k)
+
+  and split_radix_dif sign n input =
+    let n2 = n / 2 and n4 = n / 4 in
+    let x0 = array n2 (fun i -> input i @+ input (i + n2))
+    and x10 = array n4 (fun i -> input i @- input (i + n2))
+    and x11 = array n4 (fun i ->
+	input (i + n4) @- input (i + n2 + n4)) in
+    let x1 k i = 
+      exp n (k * i * sign) @* (x10 i @+ exp 4 (k * sign) @* x11 i) in
+    let f0 = dft sign n2 x0 
+    and f1 = array 4 (fun k -> dft sign n4 (x1 k)) in
+    array n (fun k ->
+      if k mod 2 = 0 then f0 (k / 2)
+      else let k' = k mod 4 in f1 k' ((k - k') / 4))
+
+  and prime_factor sign n1 n2 input =
+    let tmp1 = array n2 (fun i2 ->
+      dft sign n1 (fun i1 -> input ((i1 * n2 + i2 * n1) mod n)))
+    in let tmp2 = array n1 (fun i1 ->
+      dft sign n2 (fun k2 -> tmp1 k2 i1))
+    in fun i -> tmp2 (i mod n1) (i mod n2)
+
+  in let algorithm sign n =
+    let r = choose_factor n in
+    if List.mem n !Magic.rader_list then
+      (* special cases *)
+      dft_rader sign n
+    else if (r == 1) then  (* n is prime *)
+      dft_prime sign n
+    else if (gcd r (n / r)) == 1 then
+      prime_factor sign r (n / r)
+    else if (n mod 4 = 0 && n > 4) then
+      if !Magic.newsplit && is_power_of_two n then
+	newsplit sign n
+      else if !Magic.dif_split_radix then
+	split_radix_dif sign n
+      else
+	split_radix_dit sign n
+    else 
+      cooley_tukey sign r (n / r)
+  in
+  array n (algorithm sign n input)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/fft.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/fft.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val dft : int -> int -> Complex.signal -> Complex.signal
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_hc2c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_hc2c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,186 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given R-stride";
+]
+
+let byi = Complex.times Complex.i
+let byui = Complex.times (Complex.uminus Complex.i)
+
+let sym n f i = if (i < n - i) then f i else Complex.conj (f i)
+
+let shuffle_eo fe fo i = if i mod 2 == 0 then fe (i/2) else fo ((i-1)/2)
+
+let generate n =
+  let rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms"
+
+  (* the array names are from the point of view of the complex array
+     (output in R2C, input in C2R) *)
+  and arp = "Rp" (* real, positive *)
+  and aip = "Ip" (* imag, positive *)
+  and arm = "Rm" (* real, negative *)
+  and aim = "Im" (* imag, negative *)
+
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 1 false in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+
+  (* assume a single location.  No point in doing alias analysis *)
+  let the_location = (Unique.make (), Unique.make ()) in
+  let locations _ = the_location in
+
+  let locr = (locative_array_c n 
+		(C.array_subscript arp vrs)
+		(C.array_subscript arm vrs)
+		locations "BUG")
+  and loci = (locative_array_c n 
+		(C.array_subscript aip vrs)
+		(C.array_subscript aim vrs)
+		locations "BUG")
+  and locp = (locative_array_c n 
+		(C.array_subscript arp vrs)
+		(C.array_subscript aip vrs)
+		locations "BUG")
+  and locm = (locative_array_c n 
+		(C.array_subscript arm vrs)
+		(C.array_subscript aim vrs)
+		locations "BUG")
+  in
+  let locri i = if i mod 2 == 0 then locr (i/2) else loci ((i-1)/2)
+  and locpm i = if i < n - i then locp i else locm (n-1-i)
+  in
+
+  let asch = 
+    match !ditdif with
+    | DIT -> 
+	let output = Fft.dft sign n (byw (load_array_c n locri)) in
+	let odag = store_array_c n locpm (sym n output) in
+	  standard_optimizer odag 
+
+    | DIF -> 
+	let output = byw (Fft.dft sign n (sym n (load_array_c n locpm))) in
+	let odag = store_array_c n locri output in
+	  standard_optimizer odag 
+  in
+
+  let vms = CVar "ms" 
+  and varp = CVar arp
+  and vaip = CVar aip
+  and varm = CVar arm
+  and vaim = CVar aim
+  and vm = CVar m and vmb = CVar mb and vme = CVar me 
+  in
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (CPlus [vmb; CUminus (Integer 1)],
+					 Integer nt)])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (varp, CPlus [varp; byvl vms]);
+	     Expr_assign (vaip, CPlus [vaip; byvl vms]);
+	     Expr_assign (varm, CPlus [varm; CUminus (byvl vms)]);
+	     Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       byvl (Integer nt)]);
+	     make_volatile_stride (4*n) (CVar rs)
+	   ],
+	  Asch asch)])
+  in
+
+  let tree = 
+    Fcn ("static void", name,
+	 [Decl (C.realtypep, arp);
+	  Decl (C.realtypep, aip);
+	  Decl (C.realtypep, arm);
+	  Decl (C.realtypep, aim);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const hc2c_desc desc = {%d, \"%s\", twinstr, &GENUS, %s};\n\n"
+      n name (flops_of tree)
+  and register = "X(khc2c_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc, HC2C_VIA_RDFT);\n}" register name)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin 
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_hc2cdft.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_hc2cdft.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,208 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given R-stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let byi = Complex.times Complex.i
+let byui = Complex.times (Complex.uminus Complex.i)
+
+let shuffle_eo fe fo i = if i mod 2 == 0 then fe (i/2) else fo ((i-1)/2)
+
+let generate n =
+  let rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms"
+
+  (* the array names are from the point of view of the complex array
+     (output in R2C, input in C2R) *)
+  and arp = "Rp" (* real, positive *)
+  and aip = "Ip" (* imag, positive *)
+  and arm = "Rm" (* real, negative *)
+  and aim = "Im" (* imag, negative *)
+
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 1 false in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+
+  (* assume a single location.  No point in doing alias analysis *)
+  let the_location = (Unique.make (), Unique.make ()) in
+  let locations _ = the_location in
+
+  let rlocp = (locative_array_c n 
+		 (C.array_subscript arp vrs)
+		 (C.array_subscript aip vrs)
+		 locations "BUG")
+  and rlocm = (locative_array_c n 
+		 (C.array_subscript arm vrs)
+		 (C.array_subscript aim vrs)
+		 locations "BUG")
+  and clocp = (locative_array_c n 
+		 (C.array_subscript arp vrs)
+		 (C.array_subscript aip vrs)
+		 locations "BUG")
+  and clocm = (locative_array_c n 
+		 (C.array_subscript arm vrs)
+		 (C.array_subscript aim vrs)
+		 locations "BUG")
+  in
+  let rloc i = if i mod 2 == 0 then rlocp (i/2) else rlocm ((i-1)/2)
+  and cloc i = if i < n - i then clocp i else clocm (n-1-i)
+  and sym n f i = if (i < n - i) then f i else Complex.conj (f i)
+  and sym1 f i = 
+    if i mod 2 == 0 then
+      Complex.plus [f i; Complex.conj (f (i+1))]
+    else
+      Complex.times (Complex.uminus Complex.i)
+	(Complex.plus [f (i-1); Complex.uminus (Complex.conj (f i))])
+  and sym1i f i = 
+    if i mod 2 == 0 then
+      Complex.plus [f i; Complex.times Complex.i (f (i+1))]
+    else
+      Complex.conj
+	(Complex.plus [f (i-1); 
+		       Complex.times (Complex.uminus Complex.i) (f i)])
+  in
+
+  let asch = 
+    match !ditdif with
+    | DIT -> 
+	let output = 
+	  (Complex.times Complex.half) @@
+	    (Fft.dft sign n (byw (sym1 (load_array_c n rloc)))) in
+	let odag = store_array_c n cloc (sym n output) in
+	  standard_optimizer odag 
+
+    | DIF -> 
+	let output = 
+	  byw (Fft.dft sign n (sym n (load_array_c n cloc)))
+	in
+	let odag = store_array_c n rloc (sym1i output) in
+	  standard_optimizer odag 
+  in
+
+  let vms = CVar "ms" 
+  and varp = CVar arp
+  and vaip = CVar aip
+  and varm = CVar arm
+  and vaim = CVar aim
+  and vm = CVar m and vmb = CVar mb and vme = CVar me 
+  in
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (CPlus [vmb; CUminus (Integer 1)],
+					 Integer nt)])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (varp, CPlus [varp; byvl vms]);
+	     Expr_assign (vaip, CPlus [vaip; byvl vms]);
+	     Expr_assign (varm, CPlus [varm; CUminus (byvl vms)]);
+	     Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       byvl (Integer nt)]);
+	     make_volatile_stride (4*n) (CVar rs)
+	   ],
+	  Asch asch)]
+    )
+  in
+
+  let tree = 
+    Fcn ("static void", name,
+	 [Decl (C.realtypep, arp);
+	  Decl (C.realtypep, aip);
+	  Decl (C.realtypep, arm);
+	  Decl (C.realtypep, aim);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const hc2c_desc desc = {%d, \"%s\", twinstr, &GENUS, %s};\n\n"
+      n name (flops_of tree)
+  and register = "X(khc2c_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc, HC2C_VIA_DFT);\n}" register name)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin 
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_hc2cdft_c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_hc2cdft_c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,221 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given R-stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let byi = Complex.times Complex.i
+let byui = Complex.times (Complex.uminus Complex.i)
+
+let shuffle_eo fe fo i = if i mod 2 == 0 then fe (i/2) else fo ((i-1)/2)
+
+let generate n =
+  let rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms"
+
+  (* the array names are from the point of view of the complex array
+     (output in R2C, input in C2R) *)
+  and arp = "Rp" (* real, positive *)
+  and aip = "Ip" (* imag, positive *)
+  and arm = "Rm" (* real, negative *)
+  and aim = "Im" (* imag, negative *)
+
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) 
+  and bytwvl x = choose_simd x (ctimes (CVar "TWVL", x)) 
+  and bytwvl_vl x = choose_simd x (ctimes (CVar "(TWVL/VL)", x)) in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 1 true in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+  let sms = stride_to_string "ms" !ums in
+  let msms = "-" ^ sms in
+
+  (* assume a single location.  No point in doing alias analysis *)
+  let the_location = (Unique.make (), Unique.make ()) in
+  let locations _ = the_location in
+
+  let rlocp = (locative_array_c n 
+		 (C.array_subscript arp vrs)
+		 (C.array_subscript aip vrs)
+		 locations sms)
+  and rlocm = (locative_array_c n 
+		 (C.array_subscript arm vrs)
+		 (C.array_subscript aim vrs)
+		 locations msms)
+  and clocp = (locative_array_c n 
+		 (C.array_subscript arp vrs)
+		 (C.array_subscript aip vrs)
+		 locations sms)
+  and clocm = (locative_array_c n 
+		 (C.array_subscript arm vrs)
+		 (C.array_subscript aim vrs)
+		 locations msms)
+  in
+  let rloc i = if i mod 2 == 0 then rlocp (i/2) else rlocm ((i-1)/2)
+  and cloc i = if i < n - i then clocp i else clocm (n-1-i)
+  and sym n f i =
+    if (i < n - i) then 
+      f i
+    else 
+      Complex.times (Complex.nan Expr.CONJ) (f i)
+  and sym1 f i = 
+    if i mod 2 == 0 then
+      Complex.plus [f i; 
+		    Complex.times (Complex.nan Expr.CONJ) (f (i+1))]
+    else
+      Complex.times (Complex.nan Expr.I)
+	(Complex.plus [Complex.uminus (f (i-1));
+		       Complex.times (Complex.nan Expr.CONJ) (f i)])
+  and sym1i f i = 
+    if i mod 2 == 0 then
+      Complex.plus [f i; 
+		    Complex.times (Complex.nan Expr.I) (f (i+1))]
+    else
+      Complex.times (Complex.nan Expr.CONJ)
+	(Complex.plus [f (i-1); 
+		       Complex.uminus
+			 (Complex.times (Complex.nan Expr.I) (f i))])
+  in
+
+  let asch = 
+    match !ditdif with
+    | DIT -> 
+	let output = 
+	  (Complex.times Complex.half) @@
+	    (Trig.dft_via_rdft sign n (byw (sym1 (load_array_r n rloc)))) in
+	let odag = store_array_r n cloc (sym n output) in
+	  standard_optimizer odag 
+
+    | DIF -> 
+	let output = 
+	  byw (Trig.dft_via_rdft sign n (sym n (load_array_r n cloc)))
+	in
+	let odag = store_array_r n rloc (sym1i output) in
+	  standard_optimizer odag 
+  in
+
+  let vms = CVar sms 
+  and varp = CVar arp
+  and vaip = CVar aip
+  and varm = CVar arm
+  and vaim = CVar aim
+  and vm = CVar m and vmb = CVar mb and vme = CVar me 
+  in
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (CPlus [vmb; CUminus (Integer 1)],
+					 bytwvl_vl (Integer nt))])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (varp, CPlus [varp; byvl vms]);
+	     Expr_assign (vaip, CPlus [vaip; byvl vms]);
+	     Expr_assign (varm, CPlus [varm; CUminus (byvl vms)]);
+	     Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       bytwvl (Integer nt)]);
+	     make_volatile_stride (4*n) (CVar rs)
+	   ],
+	  Asch asch)]
+    )
+  in
+
+  let tree = 
+    Fcn ("static void", name,
+	 [Decl (C.realtypep, arp);
+	  Decl (C.realtypep, aip);
+	  Decl (C.realtypep, arm);
+	  Decl (C.realtypep, aim);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const hc2c_desc desc = {%d, %s, twinstr, &GENUS, %s};\n\n"
+      n (stringify name) (flops_of tree)
+  and register = "X(khc2c_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc, HC2C_VIA_DFT);\n}" register name)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin 
+    Simdmagic.simd_mode := true;
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_hc2hc.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_hc2hc.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,170 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given R-stride";
+]
+
+let rioarray = "cr" 
+and iioarray = "ci" 
+
+let genone sign n transform load store vrs =
+  let locations = unique_array_c n in
+  let input = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript iioarray vrs)
+      locations "BUG" in
+  let output = transform sign n (load n input) in
+  let ioloc = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript iioarray vrs)
+      locations "BUG" in
+  let odag = store n ioloc output in
+  let annot = standard_optimizer odag 
+  in annot
+
+let byi = Complex.times Complex.i
+let byui = Complex.times (Complex.uminus Complex.i)
+
+let sym1 n f i = 
+  Complex.plus [Complex.real (f i); byi (Complex.imag (f (n - 1 - i)))]
+
+let sym2 n f i = if (i < n - i) then f i else byi (f i)
+let sym2i n f i = if (i < n - i) then f i else byui (f i)
+
+let generate n =
+  let rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms" in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 1 false in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+
+  let asch = 
+    match !ditdif with
+    | DIT -> 
+	genone sign n 
+	  (fun sign n input -> 
+	     ((sym1 n) @@ (sym2 n)) (Fft.dft sign n (byw input)))
+	  load_array_c store_array_c vrs
+    | DIF -> 
+	genone sign n 
+	  (fun sign n input -> 
+	     byw (Fft.dft sign n (((sym2i n) @@ (sym1 n)) input)))
+	  load_array_c store_array_c vrs
+  in
+
+  let vms = CVar "ms" 
+  and vrioarray = CVar rioarray
+  and viioarray = CVar iioarray
+  and vm = CVar m and vmb = CVar mb and vme = CVar me 
+  in
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (CPlus [vmb; CUminus (Integer 1)],
+					 Integer nt)])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (vrioarray, CPlus [vrioarray; byvl vms]);
+	     Expr_assign (viioarray, 
+			  CPlus [viioarray; CUminus (byvl vms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       byvl (Integer nt)]);
+	     make_volatile_stride (2*n) (CVar rs)
+	   ],
+	  Asch asch)])
+  in
+
+  let tree = 
+    Fcn ("static void", name,
+	 [Decl (C.realtypep, rioarray);
+	  Decl (C.realtypep, iioarray);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const hc2hc_desc desc = {%d, \"%s\", twinstr, &GENUS, %s};\n\n"
+      n name (flops_of tree)
+  and register = "X(khc2hc_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc);\n}" register name)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin 
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_mdct.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_mdct.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,257 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* generation of trigonometric transforms *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let uistride = ref Stride_variable
+let uostride = ref Stride_variable
+let uivstride = ref Stride_variable
+let uovstride = ref Stride_variable
+let normalization = ref 1
+
+type mode =
+  | MDCT
+  | MDCT_MP3
+  | MDCT_VORBIS
+  | MDCT_WINDOW
+  | MDCT_WINDOW_SYM
+  | IMDCT
+  | IMDCT_MP3
+  | IMDCT_VORBIS
+  | IMDCT_WINDOW
+  | IMDCT_WINDOW_SYM
+  | NONE
+
+let mode = ref NONE
+
+let speclist = [
+  "-with-istride",
+  Arg.String(fun x -> uistride := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-ostride",
+  Arg.String(fun x -> uostride := arg_to_stride x),
+  " specialize for given output stride";
+
+  "-with-ivstride",
+  Arg.String(fun x -> uivstride := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovstride",
+  Arg.String(fun x -> uovstride := arg_to_stride x),
+  " specialize for given output vector stride";
+
+  "-normalization",
+  Arg.String(fun x -> normalization := int_of_string x),
+  " normalization integer to divide by";
+
+  "-mdct",
+  Arg.Unit(fun () -> mode := MDCT),
+  " generate an MDCT codelet";
+
+  "-mdct-mp3",
+  Arg.Unit(fun () -> mode := MDCT_MP3),
+  " generate an MDCT codelet with MP3 windowing";
+
+  "-mdct-window",
+  Arg.Unit(fun () -> mode := MDCT_WINDOW),
+  " generate an MDCT codelet with window array";
+
+  "-mdct-window-sym",
+  Arg.Unit(fun () -> mode := MDCT_WINDOW_SYM),
+  " generate an MDCT codelet with symmetric window array";
+
+  "-imdct",
+  Arg.Unit(fun () -> mode := IMDCT),
+  " generate an IMDCT codelet";
+
+  "-imdct-mp3",
+  Arg.Unit(fun () -> mode := IMDCT_MP3),
+  " generate an IMDCT codelet with MP3 windowing";
+
+  "-imdct-window",
+  Arg.Unit(fun () -> mode := IMDCT_WINDOW),
+  " generate an IMDCT codelet with window array";
+
+  "-imdct-window-sym",
+  Arg.Unit(fun () -> mode := IMDCT_WINDOW_SYM),
+  " generate an IMDCT codelet with symmetric window array";
+]
+
+let unity_window n i = Complex.one
+
+(* MP3 window(k) = sin(pi/(2n) * (k + 1/2)) *)
+let mp3_window n k = 
+  Complex.imag (Complex.exp (8 * n) (2*k + 1))
+
+(* Vorbis window(k) = sin(pi/2 * (mp3_window(k))^2)
+    ... this is transcendental, though, so we can't do it with our
+        current Complex.exp function *)
+
+let window_array n w =
+    array n (fun i ->
+      let stride = C.SInteger 1
+      and klass = Unique.make () in
+      let refr = C.array_subscript w stride i in
+      let kr = Variable.make_constant klass refr in
+      load_r (kr, kr))
+
+let load_window w n i = w i
+let load_window_sym w n i = w (if (i < n) then i else (2*n - 1 - i))
+
+(* fixme: use same locations for input and output so that it works in-place? *)
+
+(* Note: only correct for even n! *)
+let load_array_mdct window n rarr iarr locations =
+  let twon = 2 * n in
+  let arr = load_array_c twon 
+      (locative_array_c twon rarr iarr locations "BUG") in
+  let arrw = fun i -> Complex.times (window n i) (arr i) in
+  array n
+    ((Complex.times Complex.half) @@
+     (fun i ->
+       if (i < n/2) then
+	 Complex.uminus (Complex.plus [arrw (i + n + n/2); 
+				       arrw (n + n/2 - 1 - i)])
+       else
+	 Complex.plus [arrw (i - n/2); 
+		       Complex.uminus (arrw (n + n/2 - 1 - i))]))
+
+let store_array_mdct window n rarr iarr locations arr =
+  store_array_r n (locative_array_c n rarr iarr locations "BUG") arr
+
+let load_array_imdct window n rarr iarr locations =
+  load_array_c n (locative_array_c n rarr iarr locations "BUG")
+
+let store_array_imdct window n rarr iarr locations arr =
+  let n2 = n/2 in
+  let threen2 = 3*n2 in
+  let arr2 = fun i ->
+    if (i < n2) then
+      arr (i + n2)
+    else if (i < threen2) then
+      Complex.uminus (arr (threen2 - 1 - i))
+    else
+      Complex.uminus (arr (i - threen2))
+  in
+  let arr2w = fun i -> Complex.times (window n i) (arr2 i) in
+  let twon = 2 * n in
+  store_array_r twon (locative_array_c twon rarr iarr locations "BUG") arr2w
+
+let window_param = function
+    MDCT_WINDOW -> true
+  | MDCT_WINDOW_SYM -> true
+  | IMDCT_WINDOW -> true
+  | IMDCT_WINDOW_SYM -> true
+  | _ -> false
+
+let generate n mode =
+  let iarray = "I"
+  and oarray = "O"
+  and istride = "istride"
+  and ostride = "ostride" 
+  and window = "W" 
+  and name = !Magic.codelet_name in
+
+  let vistride = either_stride (!uistride) (C.SVar istride)
+  and vostride = either_stride (!uostride) (C.SVar ostride)
+  in
+
+  let sivs = stride_to_string "ovs" !uovstride in
+  let sovs = stride_to_string "ivs" !uivstride in
+
+  let (transform, load_input, store_output) = match mode with
+  | MDCT -> Trig.dctIV, load_array_mdct unity_window,
+      store_array_mdct unity_window
+  | MDCT_MP3 -> Trig.dctIV, load_array_mdct mp3_window,
+      store_array_mdct unity_window
+  | MDCT_WINDOW -> Trig.dctIV, load_array_mdct
+	(load_window (window_array (2 * n) window)),
+      store_array_mdct unity_window
+  | MDCT_WINDOW_SYM -> Trig.dctIV, load_array_mdct
+	(load_window_sym (window_array n window)),
+      store_array_mdct unity_window
+  | IMDCT -> Trig.dctIV, load_array_imdct unity_window,
+      store_array_imdct unity_window
+  | IMDCT_MP3 -> Trig.dctIV, load_array_imdct unity_window,
+      store_array_imdct mp3_window
+  | IMDCT_WINDOW -> Trig.dctIV, load_array_imdct unity_window,
+      store_array_imdct (load_window (window_array (2 * n) window))
+  | IMDCT_WINDOW_SYM -> Trig.dctIV, load_array_imdct unity_window,
+      store_array_imdct (load_window_sym (window_array n window))
+  | _ -> failwith "must specify transform kind"
+  in
+    
+  let locations = unique_array_c (2*n) in
+  let input = 
+    load_input n
+      (C.array_subscript iarray vistride)
+      (C.array_subscript "BUG" vistride)
+      locations
+  in
+  let output = (Complex.times (Complex.inverse_int !normalization)) 
+    @@ (transform n input) in
+  let odag =
+    store_output n
+      (C.array_subscript oarray vostride)
+      (C.array_subscript "BUG" vostride)
+      locations 
+      output
+  in
+  let annot = standard_optimizer odag in
+
+  let tree =
+    Fcn ("void", name,
+	 ([Decl (C.constrealtypep, iarray);
+	   Decl (C.realtypep, oarray)]
+	  @ (if stride_fixed !uistride then [] 
+               else [Decl (C.stridetype, istride)])
+	  @ (if stride_fixed !uostride then [] 
+	       else [Decl (C.stridetype, ostride)])
+	  @ (choose_simd []
+	       (if stride_fixed !uivstride then [] else 
+	       [Decl ("int", sivs)]))
+	  @ (choose_simd []
+	       (if stride_fixed !uovstride then [] else 
+	       [Decl ("int", sovs)]))
+	  @ (if (not (window_param mode)) then [] 
+	       else [Decl (C.constrealtypep, window)])
+	 ),
+	 finalize_fcn (Asch annot))
+
+  in
+  (unparse tree) ^ "\n"
+
+
+let main () =
+  begin
+    parse speclist usage;
+    print_string (generate (check_size ()) !mode);
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_notw.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_notw.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,168 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let uistride = ref Stride_variable
+let uostride = ref Stride_variable
+let uivstride = ref Stride_variable
+let uovstride = ref Stride_variable
+
+let speclist = [
+  "-with-istride",
+  Arg.String(fun x -> uistride := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-ostride",
+  Arg.String(fun x -> uostride := arg_to_stride x),
+  " specialize for given output stride";
+
+  "-with-ivstride",
+  Arg.String(fun x -> uivstride := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovstride",
+  Arg.String(fun x -> uovstride := arg_to_stride x),
+  " specialize for given output vector stride"
+] 
+
+let nonstandard_optimizer list_of_buddy_stores dag =
+  let sched = standard_scheduler dag in
+  let annot = Annotate.annotate list_of_buddy_stores sched in
+  let _ = dump_asched annot in
+  annot
+
+let generate n =
+  let riarray = "ri"
+  and iiarray = "ii"
+  and roarray = "ro"
+  and ioarray = "io"
+  and istride = "is"
+  and ostride = "os" 
+  and i = "i" 
+  and v = "v"
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name
+  and byvl x = choose_simd x (ctimes (CVar "(2 * VL)", x))  in
+  let ename = expand_name name in
+
+  let vistride = either_stride (!uistride) (C.SVar istride)
+  and vostride = either_stride (!uostride) (C.SVar ostride)
+  in
+
+  let sovs = stride_to_string "ovs" !uovstride in
+  let sivs = stride_to_string "ivs" !uivstride in
+
+  let locations = unique_array_c n in
+  let input = 
+    locative_array_c n 
+      (C.array_subscript riarray vistride)
+      (C.array_subscript iiarray vistride)
+      locations sivs in
+  let output = Fft.dft sign n (load_array_c n input) in
+  let oloc = 
+    locative_array_c n 
+      (C.array_subscript roarray vostride)
+      (C.array_subscript ioarray vostride)
+      locations sovs in
+  let list_of_buddy_stores =
+    let k = !Simdmagic.store_multiple in
+    if (k > 1) then
+      if (n mod k == 0) then
+	List.append
+	  (List.map 
+	     (fun i -> List.map (fun j -> (fst (oloc (k * i + j)))) (iota k))
+	     (iota (n / k)))
+	  (List.map 
+	     (fun i -> List.map (fun j -> (snd (oloc (k * i + j)))) (iota k))
+	     (iota (n / k)))
+      else failwith "invalid n for -store-multiple"
+    else []
+  in
+
+  let odag = store_array_c n oloc output in
+  let annot = nonstandard_optimizer list_of_buddy_stores odag in
+
+  let body = Block (
+    [Decl ("INT", i)],
+    [For (Expr_assign (CVar i, CVar v),
+	  Binop (" > ", CVar i, Integer 0),
+	  list_to_comma 
+	    [Expr_assign (CVar i, CPlus [CVar i; CUminus (byvl (Integer 1))]);
+	     Expr_assign (CVar riarray, CPlus [CVar riarray; 
+					       byvl (CVar sivs)]);
+	     Expr_assign (CVar iiarray, CPlus [CVar iiarray; 
+					       byvl (CVar sivs)]);
+	     Expr_assign (CVar roarray, CPlus [CVar roarray; 
+					       byvl (CVar sovs)]);
+	     Expr_assign (CVar ioarray, CPlus [CVar ioarray; 
+					       byvl (CVar sovs)]);
+	     make_volatile_stride (4*n) (CVar istride);
+	     make_volatile_stride (4*n) (CVar ostride)
+	   ],
+	  Asch annot)
+   ])
+  in
+
+  let tree =
+    Fcn ((if !Magic.standalone then "void" else "static void"), ename,
+	 ([Decl (C.constrealtypep, riarray);
+	   Decl (C.constrealtypep, iiarray);
+	   Decl (C.realtypep, roarray);
+ 	   Decl (C.realtypep, ioarray);
+	   Decl (C.stridetype, istride);
+	   Decl (C.stridetype, ostride);
+	   Decl ("INT", v);
+	   Decl ("INT", "ivs");
+	   Decl ("INT", "ovs")]),
+	 finalize_fcn body)
+
+  in let desc = 
+    Printf.sprintf 
+      "static const kdft_desc desc = { %d, %s, %s, &GENUS, %s, %s, %s, %s };\n"
+      n (stringify name) (flops_of tree) 
+      (stride_to_solverparm !uistride) (stride_to_solverparm !uostride)
+      (choose_simd "0" (stride_to_solverparm !uivstride))
+      (choose_simd "0" (stride_to_solverparm !uovstride))
+
+  and init =
+    (declare_register_fcn name) ^
+    "{" ^
+    "  X(kdft_register)(p, " ^ ename ^ ", &desc);\n" ^
+    "}\n"
+
+  in ((unparse tree) ^ "\n" ^ 
+      (if !Magic.standalone then "" else desc ^ init))
+
+let main () =
+  begin
+    parse speclist usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_notw_c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_notw_c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,165 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let uistride = ref Stride_variable
+let uostride = ref Stride_variable
+let uivstride = ref Stride_variable
+let uovstride = ref Stride_variable
+
+let speclist = [
+  "-with-istride",
+  Arg.String(fun x -> uistride := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-ostride",
+  Arg.String(fun x -> uostride := arg_to_stride x),
+  " specialize for given output stride";
+
+  "-with-ivstride",
+  Arg.String(fun x -> uivstride := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovstride",
+  Arg.String(fun x -> uovstride := arg_to_stride x),
+  " specialize for given output vector stride"
+] 
+
+let nonstandard_optimizer list_of_buddy_stores dag =
+  let sched = standard_scheduler dag in
+  let annot = Annotate.annotate list_of_buddy_stores sched in
+  let _ = dump_asched annot in
+  annot
+
+let generate n =
+  let riarray = "xi"
+  and roarray = "xo"
+  and istride = "is"
+  and ostride = "os" 
+  and i = "i" 
+  and v = "v"
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name
+  and byvl x = choose_simd x (ctimes (CVar "VL", x))  in
+  let ename = expand_name name in
+
+  let vistride = either_stride (!uistride) (C.SVar istride)
+  and vostride = either_stride (!uostride) (C.SVar ostride)
+  in
+
+  let sivs = stride_to_string "ivs" !uivstride in
+  let sovs = stride_to_string "ovs" !uovstride in
+
+  let fft = Trig.dft_via_rdft in
+
+  let locations = unique_array_c n in
+  let input = 
+    locative_array_c n 
+      (C.array_subscript riarray vistride)
+      (C.array_subscript "BUG" vistride)
+      locations sivs in
+  let output = fft sign n (load_array_r n input) in
+  let oloc = 
+    locative_array_c n 
+      (C.array_subscript roarray vostride)
+      (C.array_subscript "BUG" vostride)
+      locations sovs in
+  let list_of_buddy_stores =
+    let k = !Simdmagic.store_multiple in
+    if (k > 1) then
+      if (n mod k == 0) then
+	List.map 
+	  (fun i -> List.map (fun j -> (fst (oloc (k * i + j)))) (iota k))
+	  (iota (n / k)) 
+      else failwith "invalid n for -store-multiple"
+    else []
+  in
+  let odag = store_array_r n oloc output in
+  let annot = nonstandard_optimizer list_of_buddy_stores odag in
+
+  let body = Block (
+    [Decl ("INT", i);
+     Decl (C.constrealtypep, riarray);
+     Decl (C.realtypep, roarray)],
+    [Stmt_assign (CVar riarray, CVar (if (sign < 0) then "ri" else "ii"));
+     Stmt_assign (CVar roarray, CVar (if (sign < 0) then "ro" else "io"));
+     For (Expr_assign (CVar i, CVar v),
+	  Binop (" > ", CVar i, Integer 0),
+	  list_to_comma 
+	    [Expr_assign (CVar i, CPlus [CVar i; CUminus (byvl (Integer 1))]);
+	     Expr_assign (CVar riarray, CPlus [CVar riarray; 
+					       byvl (CVar sivs)]);
+	     Expr_assign (CVar roarray, CPlus [CVar roarray; 
+					       byvl (CVar sovs)]);
+	     make_volatile_stride (2*n) (CVar istride);
+	     make_volatile_stride (2*n) (CVar ostride)
+	   ],
+	  Asch annot);
+   ])
+  in
+
+  let tree =
+    Fcn ((if !Magic.standalone then "void" else "static void"), ename,
+	 ([Decl (C.constrealtypep, "ri");
+	   Decl (C.constrealtypep, "ii");
+	   Decl (C.realtypep, "ro");
+ 	   Decl (C.realtypep, "io");
+	   Decl (C.stridetype, istride);
+	   Decl (C.stridetype, ostride);
+	   Decl ("INT", v);
+	   Decl ("INT", "ivs");
+	   Decl ("INT", "ovs")]),
+	 finalize_fcn body)
+      
+  in
+  let desc = 
+    Printf.sprintf 
+      "static const kdft_desc desc = { %d, %s, %s, &GENUS, %s, %s, %s, %s };\n"
+      n (stringify name) (flops_of tree) 
+      (stride_to_solverparm !uistride) (stride_to_solverparm !uostride)
+      (choose_simd "0" (stride_to_solverparm !uivstride))
+      (choose_simd "0" (stride_to_solverparm !uovstride))
+
+  and init =
+    (declare_register_fcn name) ^
+    "{" ^
+    "  X(kdft_register)(p, " ^ ename ^ ", &desc);\n" ^
+    "}\n"
+
+  in ((unparse tree) ^ "\n" ^ 
+	(if !Magic.standalone then "" else desc ^ init))
+
+let main () =
+  begin
+    Simdmagic.simd_mode := true;
+    parse speclist usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_r2cb.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_r2cb.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,167 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let urs = ref Stride_variable
+let ucsr = ref Stride_variable
+let ucsi = ref Stride_variable
+let uivs = ref Stride_variable
+let uovs = ref Stride_variable
+let dftIII_flag = ref false
+
+let speclist = [
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given real-array stride";
+
+  "-with-csr",
+  Arg.String(fun x -> ucsr := arg_to_stride x),
+  " specialize for given complex-array real stride";
+
+  "-with-csi",
+  Arg.String(fun x -> ucsi := arg_to_stride x),
+  " specialize for given complex-array imaginary stride";
+
+  "-with-ivs",
+  Arg.String(fun x -> uivs := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovs",
+  Arg.String(fun x -> uovs := arg_to_stride x),
+  " specialize for given output vector stride";
+
+  "-dft-III",
+  Arg.Unit(fun () -> dftIII_flag := true),
+  " produce shifted dftIII-style codelets"
+] 
+
+let hcdftIII sign n input =
+  let input' i =
+    if (i mod 2 == 0) then
+      Complex.zero
+    else
+      let i' = (i - 1) / 2 in
+      if (2 * i' < n - 1) then (input i')
+      else if (2 * i' == n - 1) then 
+	Complex.real (input i')
+      else 
+	Complex.conj (input (n - 1 - i')) 
+  in Fft.dft sign (2 * n) input'
+
+let generate n =
+  let ar0 = "R0" and ar1 = "R1" and acr = "Cr" and aci = "Ci"
+  and rs = "rs" and csr = "csr" and csi = "csi" 
+  and i = "i" and v = "v"
+  and transform = if !dftIII_flag then hcdftIII else Trig.hdft
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name in
+
+  let vrs = either_stride (!urs) (C.SVar rs)
+  and vcsr = either_stride (!ucsr) (C.SVar csr)
+  and vcsi = either_stride (!ucsi) (C.SVar csi)
+  in
+
+  let sovs = stride_to_string "ovs" !uovs in
+  let sivs = stride_to_string "ivs" !uivs in
+
+  let locations = unique_array_c n in
+  let input = 
+    locative_array_c n 
+      (C.array_subscript acr vcsr)
+      (C.array_subscript aci vcsi)
+      locations sivs in
+  let output = transform sign n (load_array_hc n input) in
+  let oloce = 
+    locative_array_c n 
+      (C.array_subscript ar0 vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sovs
+  and oloco = 
+    locative_array_c n 
+      (C.array_subscript ar1 vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sovs in
+  let oloc i = if i mod 2 == 0 then oloce (i/2) else oloco ((i-1)/2) in
+  let odag = store_array_r n oloc output in
+  let annot = standard_optimizer odag in
+
+  let body = Block (
+    [Decl ("INT", i)],
+    [For (Expr_assign (CVar i, CVar v),
+	  Binop (" > ", CVar i, Integer 0),
+	  list_to_comma 
+	    [Expr_assign (CVar i, CPlus [CVar i; CUminus (Integer 1)]);
+	     Expr_assign (CVar ar0, CPlus [CVar ar0; CVar sovs]);
+	     Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sovs]);
+	     Expr_assign (CVar acr, CPlus [CVar acr; CVar sivs]);
+	     Expr_assign (CVar aci, CPlus [CVar aci; CVar sivs]);
+	     make_volatile_stride (4*n) (CVar rs);
+	     make_volatile_stride (4*n) (CVar csr);
+	     make_volatile_stride (4*n) (CVar csi)
+	   ],
+	  Asch annot)
+   ])
+  in
+
+  let tree =
+    Fcn ((if !Magic.standalone then "void" else "static void"), name,
+	 ([Decl (C.realtypep, ar0);
+	   Decl (C.realtypep, ar1);
+	   Decl (C.realtypep, acr);
+	   Decl (C.realtypep, aci);
+	   Decl (C.stridetype, rs);
+	   Decl (C.stridetype, csr);
+	   Decl (C.stridetype, csi);
+	   Decl ("INT", v);
+	   Decl ("INT", "ivs");
+	   Decl ("INT", "ovs")]),
+	 finalize_fcn body)
+
+  in let desc = 
+    Printf.sprintf 
+      "static const kr2c_desc desc = { %d, \"%s\", %s, &GENUS };\n\n"
+      n name (flops_of tree) 
+
+  and init =
+    (declare_register_fcn name) ^
+    "{" ^
+    "  X(kr2c_register)(p, " ^ name ^ ", &desc);\n" ^
+    "}\n"
+
+  in
+  (unparse tree) ^ "\n" ^ (if !Magic.standalone then "" else desc ^ init)
+
+
+let main () =
+  begin
+    parse speclist usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_r2cf.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_r2cf.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,164 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let urs = ref Stride_variable
+let ucsr = ref Stride_variable
+let ucsi = ref Stride_variable
+let uivs = ref Stride_variable
+let uovs = ref Stride_variable
+let dftII_flag = ref false
+
+let speclist = [
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given real-array stride";
+
+  "-with-csr",
+  Arg.String(fun x -> ucsr := arg_to_stride x),
+  " specialize for given complex-array real stride";
+
+  "-with-csi",
+  Arg.String(fun x -> ucsi := arg_to_stride x),
+  " specialize for given complex-array imaginary stride";
+
+  "-with-ivs",
+  Arg.String(fun x -> uivs := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovs",
+  Arg.String(fun x -> uovs := arg_to_stride x),
+  " specialize for given output vector stride";
+
+  "-dft-II",
+  Arg.Unit(fun () -> dftII_flag := true),
+  " produce shifted dftII-style codelets"
+] 
+
+let rdftII sign n input =
+  let input' i = if i < n then input i else Complex.zero in
+  let f = Fft.dft sign (2 * n) input' in
+  let g i = f (2 * i + 1)
+  in fun i -> 
+    if (i < n - i) then g i
+    else if (2 * i + 1 == n) then Complex.real (g i)
+    else Complex.zero
+
+let generate n =
+  let ar0 = "R0" and ar1 = "R1" and acr = "Cr" and aci = "Ci"
+  and rs = "rs" and csr = "csr" and csi = "csi" 
+  and i = "i" and v = "v"
+  and transform = if !dftII_flag then rdftII else Trig.rdft
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name in
+
+  let vrs = either_stride (!urs) (C.SVar rs)
+  and vcsr = either_stride (!ucsr) (C.SVar csr)
+  and vcsi = either_stride (!ucsi) (C.SVar csi)
+  in
+
+  let sovs = stride_to_string "ovs" !uovs in
+  let sivs = stride_to_string "ivs" !uivs in
+
+  let locations = unique_array_c n in
+  let inpute = 
+    locative_array_c n 
+      (C.array_subscript ar0 vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sivs
+  and inputo =
+    locative_array_c n 
+      (C.array_subscript ar1 vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sivs
+  in
+  let input i = if i mod 2 == 0 then inpute (i/2) else inputo ((i-1)/2) in
+  let output = transform sign n (load_array_r n input) in
+  let oloc = 
+    locative_array_c n 
+      (C.array_subscript acr vcsr)
+      (C.array_subscript aci vcsi)
+      locations sovs in
+  let odag = store_array_hc n oloc output in
+  let annot = standard_optimizer odag in
+
+  let body = Block (
+    [Decl ("INT", i)],
+    [For (Expr_assign (CVar i, CVar v),
+	  Binop (" > ", CVar i, Integer 0),
+	  list_to_comma 
+	    [Expr_assign (CVar i, CPlus [CVar i; CUminus (Integer 1)]);
+	     Expr_assign (CVar ar0, CPlus [CVar ar0; CVar sivs]);
+	     Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sivs]);
+	     Expr_assign (CVar acr, CPlus [CVar acr; CVar sovs]);
+	     Expr_assign (CVar aci, CPlus [CVar aci; CVar sovs]);
+	     make_volatile_stride (4*n) (CVar rs);
+	     make_volatile_stride (4*n) (CVar csr);
+	     make_volatile_stride (4*n) (CVar csi)
+	   ],
+	  Asch annot)
+   ])
+  in
+
+  let tree =
+    Fcn ((if !Magic.standalone then "void" else "static void"), name,
+	 ([Decl (C.realtypep, ar0);
+	   Decl (C.realtypep, ar1);
+	   Decl (C.realtypep, acr);
+	   Decl (C.realtypep, aci);
+	   Decl (C.stridetype, rs);
+	   Decl (C.stridetype, csr);
+	   Decl (C.stridetype, csi);
+	   Decl ("INT", v);
+	   Decl ("INT", "ivs");
+	   Decl ("INT", "ovs")]),
+	 finalize_fcn body)
+
+  in let desc = 
+    Printf.sprintf 
+      "static const kr2c_desc desc = { %d, \"%s\", %s, &GENUS };\n\n"
+      n name (flops_of tree) 
+
+  and init =
+    (declare_register_fcn name) ^
+    "{" ^
+    "  X(kr2c_register)(p, " ^ name ^ ", &desc);\n" ^
+    "}\n"
+
+  in
+  (unparse tree) ^ "\n" ^ (if !Magic.standalone then "" else desc ^ init)
+
+
+let main () =
+  begin
+    parse speclist usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_r2r.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_r2r.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,257 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* generation of trigonometric transforms *)
+
+open Util
+open Genutil
+open C
+
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number>"
+
+let uistride = ref Stride_variable
+let uostride = ref Stride_variable
+let uivstride = ref Stride_variable
+let uovstride = ref Stride_variable
+
+type mode =
+  | RDFT
+  | HDFT
+  | DHT
+  | REDFT00
+  | REDFT10
+  | REDFT01
+  | REDFT11
+  | RODFT00
+  | RODFT10
+  | RODFT01
+  | RODFT11
+  | NONE
+
+let mode = ref NONE
+let normsqr = ref 1
+let unitary = ref false
+let noloop = ref false
+
+let speclist = [
+  "-with-istride",
+  Arg.String(fun x -> uistride := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-ostride",
+  Arg.String(fun x -> uostride := arg_to_stride x),
+  " specialize for given output stride";
+
+  "-with-ivstride",
+  Arg.String(fun x -> uivstride := arg_to_stride x),
+  " specialize for given input vector stride";
+
+  "-with-ovstride",
+  Arg.String(fun x -> uovstride := arg_to_stride x),
+  " specialize for given output vector stride";
+
+  "-rdft",
+  Arg.Unit(fun () -> mode := RDFT),
+  " generate a real DFT codelet";
+
+  "-hdft",
+  Arg.Unit(fun () -> mode := HDFT),
+  " generate a Hermitian DFT codelet";
+
+  "-dht",
+  Arg.Unit(fun () -> mode := DHT),
+  " generate a DHT codelet";
+
+  "-redft00",
+  Arg.Unit(fun () -> mode := REDFT00),
+  " generate a DCT-I codelet";
+
+  "-redft10",
+  Arg.Unit(fun () -> mode := REDFT10),
+  " generate a DCT-II codelet";
+
+  "-redft01",
+  Arg.Unit(fun () -> mode := REDFT01),
+  " generate a DCT-III codelet";
+
+  "-redft11",
+  Arg.Unit(fun () -> mode := REDFT11),
+  " generate a DCT-IV codelet";
+
+  "-rodft00",
+  Arg.Unit(fun () -> mode := RODFT00),
+  " generate a DST-I codelet";
+
+  "-rodft10",
+  Arg.Unit(fun () -> mode := RODFT10),
+  " generate a DST-II codelet";
+
+  "-rodft01",
+  Arg.Unit(fun () -> mode := RODFT01),
+  " generate a DST-III codelet";
+
+  "-rodft11",
+  Arg.Unit(fun () -> mode := RODFT11),
+  " generate a DST-IV codelet";
+
+  "-normalization",
+  Arg.String(fun x -> let ix = int_of_string x in normsqr := ix * ix),
+  " normalization integer to divide by";
+
+  "-normsqr",
+  Arg.String(fun x -> normsqr := int_of_string x),
+  " integer square of normalization to divide by";
+
+  "-unitary",
+  Arg.Unit(fun () -> unitary := true),
+  " unitary normalization (up overall scale factor)";
+
+  "-noloop",
+  Arg.Unit(fun () -> noloop := true),
+  " no vector loop";
+]
+
+let sqrt_half = Complex.inverse_int_sqrt 2
+let sqrt_two = Complex.int_sqrt 2
+
+let rescale sc s1 s2 input i = 
+  if ((i == s1 || i == s2) && !unitary) then
+    Complex.times (input i) sc
+  else
+    input i
+
+let generate n mode =
+  let iarray = "I"
+  and oarray = "O"
+  and istride = "is"
+  and ostride = "os" 
+  and i = "i" 
+  and v = "v" 
+  in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name in
+
+  let vistride = either_stride (!uistride) (C.SVar istride)
+  and vostride = either_stride (!uostride) (C.SVar ostride)
+  in
+
+  let sovs = stride_to_string "ovs" !uovstride in
+  let sivs = stride_to_string "ivs" !uivstride in
+
+  let (transform, load_input, store_output, si1,si2,so1,so2) = match mode with
+  | RDFT -> Trig.rdft sign, load_array_r, store_array_hc, -1,-1,-1,-1
+  | HDFT -> Trig.hdft sign, load_array_c, store_array_r, -1,-1,-1,-1 (* TODO *)
+  | DHT -> Trig.dht 1, load_array_r, store_array_r, -1,-1,-1,-1
+  | REDFT00 -> Trig.dctI, load_array_r, store_array_r, 0,n-1,0,n-1
+  | REDFT10 -> Trig.dctII, load_array_r, store_array_r, -1,-1,0,-1
+  | REDFT01 -> Trig.dctIII, load_array_r, store_array_r, 0,-1,-1,-1
+  | REDFT11 -> Trig.dctIV, load_array_r, store_array_r, -1,-1,-1,-1
+  | RODFT00 -> Trig.dstI, load_array_r, store_array_r, -1,-1,-1,-1
+  | RODFT10 -> Trig.dstII, load_array_r, store_array_r, -1,-1,n-1,-1
+  | RODFT01 -> Trig.dstIII, load_array_r, store_array_r, n-1,-1,-1,-1
+  | RODFT11 -> Trig.dstIV, load_array_r, store_array_r, -1,-1,-1,-1
+  | _ -> failwith "must specify transform kind"
+  in
+    
+  let locations = unique_array_c n in
+  let input = locative_array_c n 
+      (C.array_subscript iarray vistride)
+      (C.array_subscript "BUG" vistride)
+      locations sivs in
+  let output = rescale sqrt_half so1 so2
+      ((Complex.times (Complex.inverse_int_sqrt !normsqr))
+       @@ (transform n (rescale sqrt_two si1 si2 (load_array_c n input)))) in
+  let oloc = 
+    locative_array_c n 
+      (C.array_subscript oarray vostride)
+      (C.array_subscript "BUG" vostride)
+      locations sovs in
+  let odag = store_output n oloc output in
+  let annot = standard_optimizer odag in
+
+  let body = if !noloop then Block([], [Asch annot]) else Block (
+    [Decl ("INT", i)],
+    [For (Expr_assign (CVar i, CVar v),
+	  Binop (" > ", CVar i, Integer 0),
+	  list_to_comma 
+	    [Expr_assign (CVar i, CPlus [CVar i; CUminus (Integer 1)]);
+	     Expr_assign (CVar iarray, CPlus [CVar iarray; CVar sivs]);
+	     Expr_assign (CVar oarray, CPlus [CVar oarray; CVar sovs]);
+	     make_volatile_stride (2*n) (CVar istride);
+	     make_volatile_stride (2*n) (CVar ostride)
+	   ],
+	  Asch annot)
+   ])
+  in
+
+  let tree =
+    Fcn ((if !Magic.standalone then "void" else "static void"), name,
+	 ([Decl (C.constrealtypep, iarray);
+	   Decl (C.realtypep, oarray)]
+	  @ (if stride_fixed !uistride then [] 
+               else [Decl (C.stridetype, istride)])
+	  @ (if stride_fixed !uostride then [] 
+	       else [Decl (C.stridetype, ostride)])
+	  @ (if !noloop then [] else
+               [Decl ("INT", v)]
+	       @ (if stride_fixed !uivstride then [] 
+                    else [Decl ("INT", "ivs")])
+	       @ (if stride_fixed !uovstride then [] 
+                    else [Decl ("INT", "ovs")]))),
+	 finalize_fcn body)
+
+  in let desc = 
+    Printf.sprintf 
+      "static const kr2r_desc desc = { %d, \"%s\", %s, &GENUS, %s };\n\n"
+      n name (flops_of tree) 
+      (match mode with
+      | RDFT -> "RDFT00"
+      | HDFT -> "HDFT00"
+      | DHT  -> "DHT"
+      | REDFT00 -> "REDFT00"
+      | REDFT10 -> "REDFT10"
+      | REDFT01 -> "REDFT01"
+      | REDFT11 -> "REDFT11"
+      | RODFT00 -> "RODFT00"
+      | RODFT10 -> "RODFT10"
+      | RODFT01 -> "RODFT01"
+      | RODFT11 -> "RODFT11"
+      | _ -> failwith "must specify a transform kind")
+
+  and init =
+    (declare_register_fcn name) ^
+    "{" ^
+    "  X(kr2r_register)(p, " ^ name ^ ", &desc);\n" ^
+    "}\n"
+
+  in
+  (unparse tree) ^ "\n" ^ (if !Magic.standalone then "" else desc ^ init)
+
+
+let main () =
+  begin
+    parse speclist usage;
+    print_string (generate (check_size ()) !mode);
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_twiddle.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_twiddle.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,161 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given i/o stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let generate n =
+  let rioarray = "ri"
+  and iioarray = "ii"
+  and rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms" in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "(2 * VL)", x)) in
+  let ename = expand_name name in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 0 false in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+  let sms = stride_to_string "ms" !ums in
+
+  let locations = unique_array_c n in
+  let iloc = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript iioarray vrs)
+      locations sms
+  and oloc = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript iioarray vrs)
+      locations sms
+  in
+  let liloc = load_array_c n iloc in
+  let output =
+    match !ditdif with
+    | DIT -> array n (Fft.dft sign n (byw liloc))
+    | DIF -> array n (byw (Fft.dft sign n liloc))
+  in
+  let odag = store_array_c n oloc output in
+  let annot = standard_optimizer odag in
+
+  let vm = CVar m and vmb = CVar mb and vme = CVar me in
+
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (vmb, Integer nt)])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (CVar rioarray, CPlus [CVar rioarray; 
+						byvl (CVar sms)]);
+	     Expr_assign (CVar iioarray, CPlus [CVar iioarray; 
+						byvl (CVar sms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       byvl (Integer nt)]);
+	     make_volatile_stride (2*n) (CVar rs)
+	    ],
+	  Asch annot)])
+  in
+
+  let tree = 
+    Fcn (((if !Magic.standalone then "" else "static ") ^ "void"),
+	 ename,
+	 [Decl (C.realtypep, rioarray);
+	  Decl (C.realtypep, iioarray);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "(2 * VL)" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const ct_desc desc = {%d, %s, twinstr, &GENUS, %s, %s, %s, %s};\n\n"
+      n (stringify name) (flops_of tree) 
+      (stride_to_solverparm !urs) "0"
+      (stride_to_solverparm !ums) 
+  and register = 
+    match !ditdif with
+    | DIT -> "X(kdft_dit_register)"
+    | DIF -> "X(kdft_dif_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc);\n}" register ename)
+  in
+
+  (unparse tree) ^ "\n" ^
+    (if !Magic.standalone then "" else init)
+
+
+let main () =
+  begin 
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_twiddle_c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_twiddle_c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,165 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let urs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given i/o stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let generate n =
+  let rioarray = "x"
+  and rs = "rs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms" in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) 
+  and bytwvl x = choose_simd x (ctimes (CVar "TWVL", x))
+  and bytwvl_vl x = choose_simd x (ctimes (CVar "(TWVL/VL)", x)) in
+  let ename = expand_name name in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 0 true in
+  let nt = num_twiddles n in
+
+  let byw = bytwiddle n sign (twiddle_array nt twarray) in
+
+  let vrs = either_stride (!urs) (C.SVar rs) in
+  let sms = stride_to_string "ms" !ums in
+
+  let locations = unique_array_c n in
+  let iloc = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sms
+  and oloc = 
+    locative_array_c n 
+      (C.array_subscript rioarray vrs)
+      (C.array_subscript "BUG" vrs)
+      locations sms
+  in
+  let liloc = load_array_r n iloc in
+  let fft = Trig.dft_via_rdft  in
+  let output =
+    match !ditdif with
+    | DIT -> array n (fft sign n (byw liloc))
+    | DIF -> array n (byw (fft sign n liloc))
+  in
+  let odag = store_array_r n oloc output in
+  let annot = standard_optimizer odag in
+
+  let vm = CVar m and vmb = CVar mb and vme = CVar me in
+
+  let body = Block (
+    [Decl ("INT", m);
+     Decl (C.realtypep, rioarray)],
+    [Stmt_assign (CVar rioarray,
+		  CVar (if (sign < 0) then "ri" else "ii"));
+     For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (vmb, 
+					 bytwvl_vl (Integer nt))])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (CVar rioarray, CPlus [CVar rioarray; 
+						byvl (CVar sms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       bytwvl (Integer nt)]);
+	     make_volatile_stride n (CVar rs)
+	    ],
+	  Asch annot)])
+  in
+
+  let tree = 
+    Fcn (((if !Magic.standalone then "" else "static ") ^ "void"),
+	 ename,
+	 [Decl (C.realtypep, "ri");
+	  Decl (C.realtypep, "ii");
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+  and desc = 
+    Printf.sprintf
+      "static const ct_desc desc = {%d, %s, twinstr, &GENUS, %s, %s, %s, %s};\n\n"
+      n (stringify name) (flops_of tree) 
+      (stride_to_solverparm !urs) "0"
+      (stride_to_solverparm !ums) 
+  and register = 
+    match !ditdif with
+    | DIT -> "X(kdft_dit_register)"
+    | DIF -> "X(kdft_dif_register)"
+
+  in
+  let init =
+    "\n" ^
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc);\n}" register ename)
+  in
+
+  (unparse tree) ^ "\n" ^ (if !Magic.standalone then "" else init)
+
+
+let main () =
+  begin 
+    Simdmagic.simd_mode := true;
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_twidsq.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_twidsq.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let reload_twiddle = ref false
+
+let urs = ref Stride_variable
+let uvs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-reload-twiddle",
+  Arg.Unit(fun () -> reload_twiddle := true),
+  " do not collect common twiddle factors";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-vs",
+  Arg.String(fun x -> uvs := arg_to_stride x),
+  " specialize for given vector stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let generate n =
+  let rioarray = "rio"
+  and iioarray = "iio"
+  and rs = "rs" and vs = "vs"
+  and twarray = "W"
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms" in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 0 false in
+  let nt = num_twiddles n in
+
+  let svs = either_stride (!uvs) (C.SVar vs)
+  and srs = either_stride (!urs) (C.SVar rs) in
+
+  let byw =
+    if !reload_twiddle then
+      array n (fun v -> bytwiddle n sign (twiddle_array nt twarray))
+    else
+      let a = bytwiddle n sign (twiddle_array nt twarray)
+      in fun v -> a
+  in
+
+  let locations = unique_v_array_c n n in
+
+  let ioi = 
+    locative_v_array_c n n 
+      (C.varray_subscript rioarray svs srs) 
+      (C.varray_subscript iioarray svs srs) 
+      locations "BUG"
+  and ioo = 
+    locative_v_array_c n n 
+      (C.varray_subscript rioarray svs srs) 
+      (C.varray_subscript iioarray svs srs) 
+      locations "BUG"
+  in
+
+  let lioi = load_v_array_c n n ioi in
+  let output =
+    match !ditdif with
+    | DIT -> array n (fun v -> Fft.dft sign n (byw v (lioi v)))
+    | DIF -> array n (fun v -> byw v (Fft.dft sign n (lioi v)))
+  in
+
+  let odag = store_v_array_c n n ioo (transpose output) in
+  let annot = standard_optimizer odag in
+
+  let vm = CVar m and vmb = CVar mb and vme = CVar me in
+
+  let body = Block (
+    [Decl ("INT", m)],
+    [For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (vmb, Integer nt)])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; Integer 1]);
+	     Expr_assign (CVar rioarray, CPlus [CVar rioarray; CVar ms]);
+	     Expr_assign (CVar iioarray, CPlus [CVar iioarray; CVar ms]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; Integer nt]);
+	     make_volatile_stride (2*n) (CVar rs);
+	     make_volatile_stride (2*0) (CVar vs)
+	   ],
+	  Asch annot)]) in
+
+  let tree = 
+    Fcn (("static void"), name,
+	 [Decl (C.realtypep, rioarray);
+	  Decl (C.realtypep, iioarray);
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl (C.stridetype, vs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (Twiddle.twinstr_to_c_string (twdesc n))
+
+  and desc = 
+    Printf.sprintf
+      "static const ct_desc desc = {%d, \"%s\", twinstr, &GENUS, %s, %s, %s, %s};\n\n"
+      n name (flops_of tree) 
+      (stride_to_solverparm !urs) (stride_to_solverparm !uvs)
+      (stride_to_solverparm !ums) 
+
+  and register = 
+    match !ditdif with
+    | DIT -> "X(kdft_ditsq_register)"
+    | DIF -> "X(kdft_difsq_register)"
+  in
+  let init =
+    "\n" ^ 
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc);\n}" register name)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/gen_twidsq_c.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/gen_twidsq_c.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,187 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Util
+open Genutil
+open C
+
+type ditdif = DIT | DIF
+let ditdif = ref DIT
+
+let usage = "Usage: " ^ Sys.argv.(0) ^ " -n <number> [ -dit | -dif ]"
+
+let reload_twiddle = ref false
+
+let urs = ref Stride_variable
+let uvs = ref Stride_variable
+let ums = ref Stride_variable
+
+let speclist = [
+  "-dit",
+  Arg.Unit(fun () -> ditdif := DIT),
+  " generate a DIT codelet";
+
+  "-dif",
+  Arg.Unit(fun () -> ditdif := DIF),
+  " generate a DIF codelet";
+
+  "-reload-twiddle",
+  Arg.Unit(fun () -> reload_twiddle := true),
+  " do not collect common twiddle factors";
+
+  "-with-rs",
+  Arg.String(fun x -> urs := arg_to_stride x),
+  " specialize for given input stride";
+
+  "-with-vs",
+  Arg.String(fun x -> uvs := arg_to_stride x),
+  " specialize for given vector stride";
+
+  "-with-ms",
+  Arg.String(fun x -> ums := arg_to_stride x),
+  " specialize for given ms"
+]
+
+let generate n =
+  let rioarray = "x"
+  and rs = "rs" and vs = "vs"
+  and twarray = "W" 
+  and m = "m" and mb = "mb" and me = "me" and ms = "ms" in
+
+  let sign = !Genutil.sign 
+  and name = !Magic.codelet_name 
+  and byvl x = choose_simd x (ctimes (CVar "VL", x)) 
+  and bytwvl x = choose_simd x (ctimes (CVar "TWVL", x)) 
+  and bytwvl_vl x = choose_simd x (ctimes (CVar "(TWVL/VL)", x)) in
+  let ename = expand_name name in
+
+  let (bytwiddle, num_twiddles, twdesc) = Twiddle.twiddle_policy 0 true in
+  let nt = num_twiddles n in
+
+  let svs = either_stride (!uvs) (C.SVar vs)
+  and srs = either_stride (!urs) (C.SVar rs) in
+  let sms = stride_to_string "ms" !ums in
+
+  let byw =
+    if !reload_twiddle then
+      array n (fun v -> bytwiddle n sign (twiddle_array nt twarray))
+    else
+      let a = bytwiddle n sign (twiddle_array nt twarray)
+      in fun v -> a
+  in
+
+  let locations = unique_v_array_c n n in
+
+  let ioi = 
+    locative_v_array_c n n 
+      (C.varray_subscript rioarray svs srs) 
+      (C.varray_subscript "BUG" svs srs) 
+      locations sms
+  and ioo = 
+    locative_v_array_c n n 
+      (C.varray_subscript rioarray svs srs) 
+      (C.varray_subscript "BUG" svs srs) 
+      locations sms
+  in
+
+  let lioi = load_v_array_c n n ioi in
+  let fft = Trig.dft_via_rdft  in
+  let output =
+    match !ditdif with
+    | DIT -> array n (fun v -> fft sign n (byw v (lioi v)))
+    | DIF -> array n (fun v -> byw v (fft sign n (lioi v)))
+  in
+
+  let odag = store_v_array_c n n ioo (transpose output) in
+  let annot = standard_optimizer odag in
+
+  let vm = CVar m and vmb = CVar mb and vme = CVar me in
+
+  let body = Block (
+    [Decl ("INT", m);
+     Decl (C.realtypep, rioarray)],
+    [Stmt_assign (CVar rioarray,
+		  CVar (if (sign < 0) then "ri" else "ii"));
+     For (list_to_comma
+	    [Expr_assign (vm, vmb);
+	     Expr_assign (CVar twarray, 
+			  CPlus [CVar twarray; 
+				 ctimes (vmb, 
+					 bytwvl_vl (Integer nt))])],
+	  Binop (" < ", vm, vme),
+	  list_to_comma 
+	    [Expr_assign (vm, CPlus [vm; byvl (Integer 1)]);
+	     Expr_assign (CVar rioarray, CPlus [CVar rioarray; 
+						byvl (CVar sms)]);
+	     Expr_assign (CVar twarray, CPlus [CVar twarray; 
+					       bytwvl (Integer nt)]);
+	     make_volatile_stride (2*n) (CVar rs);
+	     make_volatile_stride (2*n) (CVar vs)
+	   ],
+	  Asch annot)]) in
+
+  let tree = 
+    Fcn (("static void"), ename,
+	 [Decl (C.realtypep, "ri");
+	  Decl (C.realtypep, "ii");
+	  Decl (C.constrealtypep, twarray);
+	  Decl (C.stridetype, rs);
+	  Decl (C.stridetype, vs);
+	  Decl ("INT", mb);
+	  Decl ("INT", me);
+	  Decl ("INT", ms)],
+         finalize_fcn body)
+  in
+  let twinstr = 
+    Printf.sprintf "static const tw_instr twinstr[] = %s;\n\n" 
+      (twinstr_to_string "VL" (twdesc n))
+
+  and desc = 
+    Printf.sprintf
+      "static const ct_desc desc = {%d, %s, twinstr, &GENUS, %s, %s, %s, %s};\n\n"
+      n (stringify name) (flops_of tree) 
+      (stride_to_solverparm !urs) 
+      (stride_to_solverparm !uvs)
+      (stride_to_solverparm !ums) 
+
+  and register = 
+    match !ditdif with
+    | DIT -> "X(kdft_ditsq_register)"
+    | DIF -> "X(kdft_difsq_register)"
+  in
+  let init =
+    "\n" ^ 
+    twinstr ^ 
+    desc ^
+    (declare_register_fcn name) ^
+    (Printf.sprintf "{\n%s(p, %s, &desc);\n}" register ename)
+  in
+
+  (unparse tree) ^ "\n" ^ init
+
+
+let main () =
+  begin
+    parse (speclist @ Twiddle.speclist) usage;
+    print_string (generate (check_size ()));
+  end
+
+let _ = main()
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/genutil.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/genutil.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,328 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* utilities common to all generators *)
+open Util
+
+let choose_simd a b = if !Simdmagic.simd_mode then b else a
+
+let unique_array n = array n (fun _ -> Unique.make ())
+let unique_array_c n = 
+  array n (fun _ -> 
+    (Unique.make (), Unique.make ()))
+
+let unique_v_array_c veclen n = 
+  array veclen (fun _ ->
+    unique_array_c n)
+
+let locative_array_c n rarr iarr loc vs = 
+  array n (fun i -> 
+    let klass = Unique.make () in
+    let (rloc, iloc) = loc i in
+    (Variable.make_locative rloc klass rarr i vs,
+     Variable.make_locative iloc klass iarr i vs))
+
+let locative_v_array_c veclen n rarr iarr loc vs = 
+  array veclen (fun v ->
+    array n (fun i -> 
+      let klass = Unique.make () in
+      let (rloc, iloc) = loc v i in
+      (Variable.make_locative rloc klass (rarr v) i vs,
+       Variable.make_locative iloc klass (iarr v) i vs)))
+
+let temporary_array n = 
+  array n (fun i -> Variable.make_temporary ())
+
+let temporary_array_c n = 
+  let tmpr = temporary_array n
+  and tmpi = temporary_array n
+  in 
+  array n (fun i -> (tmpr i, tmpi i))
+
+let temporary_v_array_c veclen n =
+  array veclen (fun v -> temporary_array_c n)
+
+let temporary_array_c n = 
+  let tmpr = temporary_array n
+  and tmpi = temporary_array n
+  in 
+  array n (fun i -> (tmpr i, tmpi i))
+
+let load_c (vr, vi) = Complex.make (Expr.Load vr, Expr.Load vi)
+let load_r (vr, vi) = Complex.make (Expr.Load vr, Expr.Num (Number.zero))
+
+let twiddle_array nt w =
+  array (nt/2) (fun i ->
+    let stride = choose_simd (C.SInteger 1) (C.SConst "TWVL") 
+    and klass = Unique.make () in
+    let (refr, refi) = (C.array_subscript w stride (2 * i),
+			C.array_subscript w stride (2 * i + 1))
+    in
+    let (kr, ki) = (Variable.make_constant klass refr,
+		    Variable.make_constant klass refi)  
+    in
+    load_c (kr, ki))
+
+
+let load_array_c n var = array n (fun i -> load_c (var i))
+let load_array_r n var = array n (fun i -> load_r (var i))
+let load_array_hc n var = 
+  array n (fun i -> 
+    if (i < n - i) then
+      load_c (var i)
+    else if (i > n - i) then
+      Complex.times Complex.i (load_c (var (n - i)))
+    else
+      load_r (var i))
+
+let load_v_array_c veclen n var =
+  array veclen (fun v -> load_array_c n (var v))
+
+let store_c (vr, vi) x = [Complex.store_real vr x; Complex.store_imag vi x]
+let store_r (vr, vi) x = Complex.store_real vr x
+let store_i (vr, vi) x = Complex.store_imag vi x
+
+let assign_array_c n dst src =
+  List.flatten
+    (rmap (iota n)
+       (fun i ->
+	 let (ar, ai) = Complex.assign (dst i) (src i)
+	 in [ar; ai]))
+let assign_v_array_c veclen n dst src =
+  List.flatten
+    (rmap (iota veclen)
+       (fun v ->
+	 assign_array_c n (dst v) (src v)))
+
+let vassign_v_array_c veclen n dst src =
+  List.flatten
+    (rmap (iota n) (fun i ->
+      List.flatten
+	(rmap (iota veclen)
+	   (fun v ->
+	     let (ar, ai) = Complex.assign (dst v i) (src v i)
+	     in [ar; ai]))))
+
+let store_array_r n dst src =
+  rmap (iota n)
+    (fun i -> store_r (dst i) (src i))
+
+let store_array_c n dst src =
+  List.flatten
+    (rmap (iota n)
+       (fun i -> store_c (dst i) (src i)))
+
+let store_array_hc n dst src =
+  List.flatten
+    (rmap (iota n)
+       (fun i -> 
+	 if (i < n - i) then
+	   store_c (dst i) (src i)
+	 else if (i > n - i) then
+	   []
+	 else 
+	   [store_r (dst i) (Complex.real (src i))]))
+	
+
+let store_v_array_c veclen n dst src =
+  List.flatten
+    (rmap (iota veclen)
+       (fun v ->
+	 store_array_c n (dst v) (src v)))
+
+
+let elementwise f n a = array n (fun i -> f (a i))
+let conj_array_c = elementwise Complex.conj
+let real_array_c = elementwise Complex.real
+let imag_array_c = elementwise Complex.imag
+
+let elementwise_v f veclen n a = 
+  array veclen (fun v ->
+    array n (fun i -> f (a v i)))
+let conj_v_array_c = elementwise_v Complex.conj
+let real_v_array_c = elementwise_v Complex.real
+let imag_v_array_c = elementwise_v Complex.imag
+
+
+let transpose f i j = f j i
+let symmetrize f i j = if i <= j then f i j else f j i
+
+(* utilities for command-line parsing *)
+let standard_arg_parse_fail _ = failwith "too many arguments"
+
+let dump_dag alist =
+  let fnam = !Magic.dag_dump_file in
+  if (String.length fnam > 0) then
+    let ochan = open_out fnam in
+    begin
+      To_alist.dump (output_string ochan) alist;
+      close_out ochan;
+    end
+
+let dump_alist alist =
+  let fnam = !Magic.alist_dump_file in
+  if (String.length fnam > 0) then
+    let ochan = open_out fnam in
+    begin
+      Expr.dump (output_string ochan) alist;
+      close_out ochan;
+    end
+
+let dump_asched asched =
+  let fnam = !Magic.asched_dump_file in
+  if (String.length fnam > 0) then
+    let ochan = open_out fnam in
+    begin
+      Annotate.dump (output_string ochan) asched;
+      close_out ochan;
+    end
+
+(* utilities for optimization *)
+let standard_scheduler dag =
+  let optim = Algsimp.algsimp dag in
+  let alist = To_alist.to_assignments optim in
+  let _ = dump_alist alist in
+  let _ = dump_dag alist in
+    if !Magic.precompute_twiddles then
+      Schedule.isolate_precomputations_and_schedule alist 
+    else
+      Schedule.schedule alist 
+
+let standard_optimizer dag =
+  let sched = standard_scheduler dag in
+  let annot = Annotate.annotate [] sched in
+  let _ = dump_asched annot in
+  annot
+
+let size = ref None
+let sign = ref (-1)
+
+let speclist = [
+  "-n", Arg.Int(fun i -> size := Some i), " generate a codelet of size <n>";
+  "-sign",
+  Arg.Int(fun i -> 
+    if (i > 0) then
+      sign := 1
+    else
+      sign := (-1)),
+  " sign of transform";
+]
+
+let check_size () =
+  match !size with
+  | Some i -> i
+  | None -> failwith "must specify -n"
+
+let expand_name name = if name = "" then "noname" else name
+
+let declare_register_fcn name =
+  if name = "" then
+    "void NAME(planner *p)\n"
+  else 
+    "void " ^ (choose_simd "X" "XSIMD") ^
+      "(codelet_" ^ name ^ ")(planner *p)\n"
+
+let stringify name = 
+  if name = "" then "STRINGIZE(NAME)" else 
+    choose_simd ("\"" ^ name ^ "\"")
+      ("XSIMD_STRING(\"" ^ name ^ "\")")
+
+let parse user_speclist usage =
+  Arg.parse
+    (user_speclist @ speclist @ Magic.speclist @ Simdmagic.speclist)
+    standard_arg_parse_fail
+    usage
+
+let rec list_to_c = function
+    [] -> ""
+  | [a] -> (string_of_int a)
+  | a :: b -> (string_of_int a) ^ ", " ^ (list_to_c b)
+
+let rec list_to_comma = function
+  | [a; b] -> C.Comma (a, b)
+  | a :: b -> C.Comma (a, list_to_comma b)
+  | _ -> failwith "list_to_comma"
+
+
+type stride = Stride_variable | Fixed_int of int | Fixed_string of string
+
+let either_stride a b =
+  match a with
+    Fixed_int x -> C.SInteger x
+  | Fixed_string x -> C.SConst x
+  | _ -> b
+
+let stride_fixed = function
+    Stride_variable -> false
+  | _ -> true
+
+let arg_to_stride s =
+  try
+    Fixed_int (int_of_string s)
+  with Failure "int_of_string" -> 
+    Fixed_string s
+
+let stride_to_solverparm = function
+    Stride_variable -> "0"
+  | Fixed_int x -> string_of_int x
+  | Fixed_string x -> x
+
+let stride_to_string s = function
+    Stride_variable -> s
+  | Fixed_int x -> string_of_int x
+  | Fixed_string x -> x
+
+(* output the command line *)
+let cmdline () =
+  List.fold_right (fun a b -> a ^ " " ^ b) (Array.to_list Sys.argv) ""
+
+let unparse tree =
+  "/* Generated by: " ^ (cmdline ()) ^ "*/\n\n" ^
+  (C.print_cost tree) ^
+  (if String.length !Magic.inklude > 0 
+  then
+    (Printf.sprintf "#include \"%s\"\n\n" !Magic.inklude)
+  else "") ^
+  (if !Simdmagic.simd_mode then
+    Simd.unparse_function tree
+  else
+    C.unparse_function tree)
+
+let finalize_fcn ast = 
+  let mergedecls = function
+      C.Block (d1, [C.Block (d2, s)]) -> C.Block (d1 @ d2, s)
+    | x -> x
+  and extract_constants =
+    if !Simdmagic.simd_mode then 
+      Simd.extract_constants 
+    else
+      C.extract_constants
+	
+  in mergedecls (C.Block (extract_constants ast, [ast; C.Simd_leavefun]))
+
+let twinstr_to_string vl x =
+  if !Simdmagic.simd_mode then 
+    Twiddle.twinstr_to_simd_string vl x
+  else
+    Twiddle.twinstr_to_c_string x
+
+let make_volatile_stride n x = 
+  C.CCall ("MAKE_VOLATILE_STRIDE", C.Comma((C.Integer n), x))
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/littlesimp.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/littlesimp.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,71 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* 
+ * The LittleSimplifier module implements a subset of the simplifications
+ * of the AlgSimp module.  These simplifications can be executed
+ * quickly here, while they would take a long time using the heavy
+ * machinery of AlgSimp.  
+ * 
+ * For example, 0 * x is simplified to 0 tout court by the LittleSimplifier.
+ * On the other hand, AlgSimp would first simplify x, generating lots
+ * of common subexpressions, storing them in a table etc, just to
+ * discard all the work later.  Similarly, the LittleSimplifier
+ * reduces the constant FFT in Rader's algorithm to a constant sequence.
+ *)
+
+open Expr
+
+let rec makeNum = function
+  | n -> Num n
+
+and makeUminus = function
+  | Uminus a -> a 
+  | Num a -> makeNum (Number.negate a)
+  | a -> Uminus a
+
+and makeTimes = function
+  | (Num a, Num b) -> makeNum (Number.mul a b)
+  | (Num a, Times (Num b, c)) -> makeTimes (makeNum (Number.mul a b), c)
+  | (Num a, b) when Number.is_zero a -> makeNum (Number.zero)
+  | (Num a, b) when Number.is_one a -> b
+  | (Num a, b) when Number.is_mone a -> makeUminus b
+  | (Num a, Uminus b) -> Times (makeUminus (Num a), b)
+  | (a, (Num b as b')) -> makeTimes (b', a)
+  | (a, b) -> Times (a, b)
+
+and makePlus l = 
+  let rec reduceSum x = match x with
+    [] -> []
+  | [Num a] -> if Number.is_zero a then [] else x
+  | (Num a) :: (Num b) :: c -> 
+      reduceSum ((makeNum (Number.add a b)) :: c)
+  | ((Num _) as a') :: b :: c -> b :: reduceSum (a' :: c)
+  | a :: s -> a :: reduceSum s
+
+  in match reduceSum l with
+    [] -> makeNum (Number.zero)
+  | [a] -> a 
+  | [a; b] when a == b -> makeTimes (Num Number.two, a)
+  | [Times (Num a, b); Times (Num c, d)] when b == d ->
+      makeTimes (makePlus [Num a; Num c], b)
+  | a -> Plus a
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/littlesimp.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/littlesimp.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,25 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val makeNum : Number.number -> Expr.expr
+val makeUminus : Expr.expr -> Expr.expr
+val makeTimes : Expr.expr * Expr.expr -> Expr.expr
+val makePlus : Expr.expr list -> Expr.expr
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/magic.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/magic.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,161 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* magic parameters *)
+let verbose = ref false
+let vneg = ref false
+let karatsuba_min = ref 15
+let karatsuba_variant = ref 2
+let circular_min = ref 64
+let rader_min = ref 13
+let rader_list = ref [5]
+let alternate_convolution = ref 17
+let threemult = ref false
+let inline_single = ref true
+let inline_loads = ref false
+let inline_loads_constants = ref false
+let inline_constants = ref true
+let trivial_stores = ref false
+let locations_are_special = ref false
+let strength_reduce_mul = ref false
+let number_of_variables = ref 4
+let codelet_name = ref "unnamed"
+let randomized_cse = ref true
+let dif_split_radix = ref false
+let enable_fma = ref false
+let deep_collect_depth = ref 1
+let schedule_type = ref 0
+let compact = ref false
+let dag_dump_file = ref ""
+let alist_dump_file = ref ""
+let asched_dump_file = ref ""
+let lisp_syntax = ref false
+let network_transposition = ref true
+let inklude = ref ""
+let generic_arith = ref false
+let reorder_insns = ref false
+let reorder_loads = ref false
+let reorder_stores = ref false
+let precompute_twiddles = ref false
+let newsplit = ref false
+let standalone = ref false
+let pipeline_latency = ref 0
+let schedule_for_pipeline = ref false
+let generate_bytw = ref true
+
+(* command-line parser for magic parameters *)
+let undocumented = " Undocumented voodoo parameter"
+
+let set_bool var = Arg.Unit (fun () -> var := true)
+let unset_bool var = Arg.Unit (fun () -> var := false)
+let set_int var = Arg.Int(fun i -> var := i)
+let set_string var = Arg.String(fun s -> var := s)
+
+let speclist = [
+  "-name", set_string codelet_name, " set codelet name";
+  "-standalone", set_bool standalone, " standalone codelet (no desc)";
+  "-include", set_string inklude, undocumented;
+
+  "-verbose", set_bool verbose, " Enable verbose logging messages to stderr";
+
+  "-rader-min", set_int rader_min,
+  "<n> : Use Rader's algorithm for prime sizes >= <n>";
+
+  "-threemult", set_bool threemult, 
+  " Use 3-multiply complex multiplications";
+
+  "-karatsuba-min", set_int karatsuba_min, undocumented;
+  "-karatsuba-variant", set_int karatsuba_variant, undocumented;
+  "-circular-min", set_int circular_min, undocumented;
+
+  "-compact", set_bool compact, 
+  " Mangle variable names to reduce size of source code";
+  "-no-compact", unset_bool compact, 
+  " Disable -compact";
+
+  "-dump-dag", set_string dag_dump_file, undocumented;
+  "-dump-alist", set_string alist_dump_file, undocumented;
+  "-dump-asched", set_string asched_dump_file, undocumented;
+  "-lisp-syntax", set_bool lisp_syntax, undocumented;
+
+  "-alternate-convolution", set_int alternate_convolution, undocumented;
+  "-deep-collect-depth", set_int deep_collect_depth, undocumented;
+  "-schedule-type", set_int schedule_type, undocumented;
+  "-pipeline-latency", set_int pipeline_latency, undocumented;
+  "-schedule-for-pipeline", set_bool schedule_for_pipeline, undocumented;
+
+  "-dif-split-radix", set_bool dif_split_radix, undocumented;
+  "-dit-split-radix", unset_bool dif_split_radix, undocumented;
+
+  "-generic-arith", set_bool generic_arith, undocumented;
+  "-no-generic-arith", unset_bool generic_arith, undocumented;
+
+  "-precompute-twiddles", set_bool precompute_twiddles, undocumented;
+  "-no-precompute-twiddles", unset_bool precompute_twiddles, undocumented;
+
+  "-inline-single", set_bool inline_single, undocumented;
+  "-no-inline-single", unset_bool inline_single, undocumented;
+
+  "-inline-loads", set_bool inline_loads, undocumented;
+  "-no-inline-loads", unset_bool inline_loads, undocumented;
+
+  "-inline-loads-constants", set_bool inline_loads_constants, undocumented;
+  "-no-inline-loads-constants",
+     unset_bool inline_loads_constants, undocumented;
+
+  "-inline-constants", set_bool inline_constants, undocumented;
+  "-no-inline-constants", unset_bool inline_constants, undocumented;
+
+  "-trivial-stores", set_bool trivial_stores, undocumented;
+  "-no-trivial-stores", unset_bool trivial_stores, undocumented;
+
+  "-locations-are-special", set_bool locations_are_special, undocumented;
+  "-no-locations-are-special", unset_bool locations_are_special, undocumented;
+
+  "-randomized-cse", set_bool randomized_cse, undocumented;
+  "-no-randomized-cse", unset_bool randomized_cse, undocumented;
+
+  "-network-transposition", set_bool network_transposition, undocumented;
+  "-no-network-transposition", unset_bool network_transposition, undocumented;
+
+  "-reorder-insns", set_bool reorder_insns, undocumented;
+  "-no-reorder-insns", unset_bool reorder_insns, undocumented;
+  "-reorder-loads", set_bool reorder_loads, undocumented;
+  "-no-reorder-loads", unset_bool reorder_loads, undocumented;
+  "-reorder-stores", set_bool reorder_stores, undocumented;
+  "-no-reorder-stores", unset_bool reorder_stores, undocumented;
+
+  "-newsplit", set_bool newsplit, undocumented;
+
+  "-vneg", set_bool vneg, undocumented;
+  "-fma", set_bool enable_fma, undocumented;
+  "-no-fma", unset_bool enable_fma, undocumented;
+
+  "-variables", set_int number_of_variables, undocumented;
+
+  "-strength-reduce-mul", set_bool strength_reduce_mul, undocumented;
+  "-no-strength-reduce-mul", unset_bool strength_reduce_mul, undocumented;
+
+  "-generate-bytw", set_bool generate_bytw, undocumented;
+  "-no-generate-bytw", unset_bool generate_bytw, undocumented;
+] 
+   
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/monads.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/monads.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(*************************************************************
+ *   Monads
+ *************************************************************)
+
+(*
+ * Phil Wadler has many well written papers about monads.  See
+ * http://cm.bell-labs.com/cm/cs/who/wadler/ 
+ *)
+(* vanilla state monad *)
+module StateMonad = struct
+  let returnM x = fun s -> (x, s)
+
+  let (>>=) = fun m k -> 
+    fun s ->
+      let (a', s') = m s
+      in let (a'', s'') = k a' s'
+      in (a'', s'')
+
+  let (>>) = fun m k ->
+    m >>= fun _ -> k
+
+  let rec mapM f = function
+      [] -> returnM []
+    | a :: b ->
+	f a >>= fun a' ->
+	  mapM f b >>= fun b' ->
+	    returnM (a' :: b')
+
+  let runM m x initial_state =
+    let (a, _) = m x initial_state
+    in a
+
+  let fetchState =
+    fun s -> s, s
+
+  let storeState newState =
+    fun _ -> (), newState
+end
+
+(* monad with built-in memoizing capabilities *)
+module MemoMonad =
+  struct
+    open StateMonad
+
+    let memoizing lookupM insertM f k =
+      lookupM k >>= fun vMaybe ->
+	match vMaybe with
+	  Some value -> returnM value
+	| None ->
+	    f k >>= fun value ->
+	      insertM k value >> returnM value
+
+    let runM initial_state m x  = StateMonad.runM m x initial_state
+end
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/number.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/number.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,164 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* The generator keeps track of numeric constants in symbolic
+   expressions using the abstract number type, defined in this file.
+
+   Our implementation of the number type uses arbitrary-precision
+   arithmetic from the built-in Num package in order to maintain an
+   accurate representation of constants.  This allows us to output
+   constants with many decimal places in the generated C code,
+   ensuring that we will take advantage of the full precision
+   available on current and future machines.
+
+   Note that we have to write our own routine to compute roots of
+   unity, since the Num package only supplies simple arithmetic.  The
+   arbitrary-precision operations in Num look like the normal
+   operations except that they have an appended slash (e.g. +/ -/ */
+   // etcetera). *)
+
+open Num
+
+type number = N of num
+
+let makeNum n = N n
+
+(* decimal digits of precision to maintain internally, and to print out: *)
+let precision = 50
+let print_precision = 45
+
+let inveps = (Int 10) **/ (Int precision)
+let epsilon = (Int 1) // inveps
+
+let pinveps = (Int 10) **/ (Int print_precision)
+let pepsilon = (Int 1) // pinveps
+
+let round x = epsilon */ (round_num (x */ inveps))
+
+let of_int n = N (Int n)
+let zero = of_int 0
+let one = of_int 1
+let two = of_int 2
+let mone = of_int (-1)
+
+(* comparison predicate for real numbers *)
+let equal (N x) (N y) = (* use both relative and absolute error *)
+  let absdiff = abs_num (x -/ y) in
+  absdiff <=/ pepsilon or
+  absdiff <=/ pepsilon */ (abs_num x +/ abs_num y)
+
+let is_zero = equal zero
+let is_one = equal one
+let is_mone = equal mone
+let is_two = equal two
+
+
+(* Note that, in the following computations, it is important to round
+   to precision epsilon after each operation.  Otherwise, since the
+   Num package uses exact rational arithmetic, the number of digits
+   quickly blows up. *)
+let mul (N a) (N b) = makeNum (round (a */ b)) 
+let div (N a) (N b) = makeNum (round (a // b))
+let add (N a) (N b) = makeNum (round (a +/ b)) 
+let sub (N a) (N b) = makeNum (round (a -/ b))
+
+let negative (N a) = (a </ (Int 0))
+let negate (N a) = makeNum (minus_num a)
+
+let greater a b = negative (sub b a)
+
+let epsilonsq = epsilon */ epsilon
+let epsilonsq2 =  (Int 100) */ epsilonsq
+
+let sqr a = a */ a
+let almost_equal (N a) (N b) = (sqr (a -/ b)) <=/ epsilonsq2
+
+(* find square root by Newton's method *)
+let sqrt a =
+  let rec sqrt_iter guess =
+    let newguess = div (add guess (div a guess)) two in
+    if (almost_equal newguess guess) then newguess
+    else sqrt_iter newguess
+  in sqrt_iter (div a two)
+
+let csub (xr, xi) (yr, yi) = (round (xr -/ yr), round (xi -/ yi))
+let cdiv (xr, xi) r = (round (xr // r), round (xi // r))
+let cmul (xr, xi) (yr, yi) = (round (xr */ yr -/ xi */ yi),
+                              round (xr */ yi +/ xi */ yr))
+let csqr (xr, xi) = (round (xr */ xr -/ xi */ xi), round ((Int 2) */ xr */ xi))
+let cabssq (xr, xi) = xr */ xr +/ xi */ xi
+let cconj (xr, xi) = (xr, minus_num xi)
+let cinv x = cdiv (cconj x) (cabssq x)
+
+let almost_equal_cnum (xr, xi) (yr, yi) =
+    (cabssq (xr -/ yr,xi -/ yi)) <=/ epsilonsq2
+
+(* Put a complex number to an integer power by repeated squaring: *)
+let rec ipow_cnum x n =
+    if (n == 0) then
+      (Int 1, Int 0)
+    else if (n < 0) then
+      cinv (ipow_cnum x (- n))
+    else if (n mod 2 == 0) then
+      ipow_cnum (csqr x) (n / 2)
+    else
+      cmul x (ipow_cnum x (n - 1))
+
+let twopi = 6.28318530717958647692528676655900576839433879875021164194989
+
+(* Find the nth (complex) primitive root of unity by Newton's method: *)
+let primitive_root_of_unity n =
+    let rec root_iter guess =
+        let newguess = csub guess (cdiv (csub guess
+                                         (ipow_cnum guess (1 - n)))
+                                   (Int n)) in
+            if (almost_equal_cnum guess newguess) then newguess
+            else root_iter newguess
+    in let float_to_num f = (Int (truncate (f *. 1.0e9))) // (Int 1000000000)
+    in root_iter (float_to_num (cos (twopi /. (float n))),
+		  float_to_num (sin (twopi /. (float n)))) 
+
+let cexp n i =
+    if ((i mod n) == 0) then
+      (one,zero)
+    else
+      let (n2,i2) = Util.lowest_terms n i
+      in let (c,s) = ipow_cnum (primitive_root_of_unity n2) i2
+      in (makeNum c, makeNum s)
+
+let to_konst (N n) =
+  let f = float_of_num n in
+  let f' = if f < 0.0 then f *. (-1.0) else f in
+  let f2 = if (f' >= 1.0) then (f' -. (float (truncate f'))) else f'
+  in let q = string_of_int (truncate(f2 *. 1.0E9))
+  in let r = "0000000000" ^ q
+  in let l = String.length r 
+  in let prefix = if (f < 0.0) then "KN" else "KP" in
+  if (f' >= 1.0) then
+    (prefix ^ (string_of_int (truncate f')) ^ "_" ^ 
+     (String.sub r (l - 9) 9))
+  else
+    (prefix ^ (String.sub r (l - 9) 9))
+
+let to_string (N n) = approx_num_fix print_precision n
+
+let to_float (N n) = float_of_num n
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/number.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/number.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,49 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type number
+
+val equal : number -> number -> bool
+val of_int : int -> number
+val zero : number
+val one : number
+val two : number
+val mone : number
+val is_zero : number -> bool
+val is_one : number -> bool
+val is_mone : number -> bool
+val is_two : number -> bool
+val mul : number -> number -> number
+val div : number -> number -> number
+val add : number -> number -> number
+val sub : number -> number -> number
+val negative : number -> bool
+val greater : number -> number -> bool
+val negate : number -> number
+val sqrt : number -> number
+
+(* cexp n i = (cos (2 * pi * i / n), sin (2 * pi * i / n)) *)
+val cexp : int -> int -> (number * number)
+
+val to_konst : number -> string
+val to_string : number -> string
+val to_float : number -> float
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/oracle.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/oracle.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(*
+ * the oracle decrees whether the sign of an expression should
+ * be changed.
+ *
+ * Say the expression (A - B) appears somewhere.  Elsewhere in the
+ * expression dag the expression (B - A) may appear.
+ * The oracle determines which of the two forms is canonical.
+ *
+ * Algorithm: evaluate the expression at a random input, and
+ * keep the expression with the positive sign.
+ *)
+
+let make_memoizer hash equal =
+  let table = ref Assoctable.empty 
+  in 
+  (fun f k ->
+    match Assoctable.lookup hash equal k !table with
+      Some value -> value
+    | None ->
+        let value = f k in
+        begin	
+          table := Assoctable.insert hash k value !table;
+          value
+        end)
+
+let almost_equal x y = 
+  let epsilon = 1.0E-8 in
+  (abs_float (x -. y) < epsilon) ||
+  (abs_float (x -. y) < epsilon *. (abs_float x +. abs_float y)) 
+
+let absid = make_memoizer
+    (fun x -> Expr.hash_float (abs_float x))
+    (fun a b -> almost_equal a b || almost_equal (-. a) b)
+    (fun x -> x)
+
+let make_random_oracle () = make_memoizer 
+    Variable.hash 
+    Variable.same
+    (fun _ -> (float (Random.bits())) /. 1073741824.0)
+
+let the_random_oracle = make_random_oracle ()
+
+let sum_list l = List.fold_right (+.) l 0.0
+
+let eval_aux random_oracle =
+  let memoizing = make_memoizer Expr.hash (==) in
+  let rec eval x = 
+    memoizing
+      (function
+	| Expr.Num x -> Number.to_float x
+	| Expr.NaN x -> Expr.transcendent_to_float x
+	| Expr.Load v -> random_oracle v
+	| Expr.Store (v, x) -> eval x
+	| Expr.Plus l -> sum_list (List.map eval l)
+	| Expr.Times (a, b) -> (eval a) *. (eval b)
+	| Expr.CTimes (a, b) -> 
+	    1.098612288668109691395245236 +. 
+	       1.609437912434100374600759333 *. (eval a) *. (eval b)
+	| Expr.CTimesJ (a, b) -> 
+	    0.9102392266268373936142401657 +. 
+	      0.6213349345596118107071993881 *. (eval a) *. (eval b)
+	| Expr.Uminus x -> -. (eval x))
+      x
+  in eval
+
+let eval = eval_aux the_random_oracle
+
+let should_flip_sign node = 
+  let v = eval node in
+  let v' = absid v in
+  not (almost_equal v v')
+
+(*
+ * determine with high probability if two expressions are equal.
+ *
+ * The test is randomized: if the two expressions have the
+ * same value for NTESTS random inputs, then they are proclaimed
+ * equal.  (Note that two distinct linear functions L1(x0, x1, ..., xn)
+ * and L2(x0, x1, ..., xn) have the same value with probability
+ * 0 for random x's, and thus this test is way more paranoid than
+ * necessary.)
+ *)
+let likely_equal a b =
+  let tolerance = 1.0e-8
+  and ntests = 20
+  in
+  let rec loop n =
+    if n = 0 then 
+      true
+    else
+      let r = make_random_oracle () in
+      let va = eval_aux r a
+      and vb = eval_aux r b
+      in
+      if (abs_float (va -. vb)) > 
+	   tolerance *. (abs_float va +. abs_float vb +. 0.0001)
+      then
+	false
+      else
+	loop (n - 1)
+  in
+  match (a, b) with
+
+    (* 
+     * Because of the way eval is constructed, we have
+     *     eval (Store (v, x)) == eval x
+     * However, we never consider the two expressions equal
+     *)
+  | (Expr.Store _, _) -> false
+  | (_, Expr.Store _) -> false
+
+    (*
+     * Expressions of the form ``Uminus (Store _)''
+     * are artifacts of algsimp
+     *)
+  | ((Expr.Uminus (Expr.Store _)), _) -> false
+  | (_, Expr.Uminus (Expr.Store _)) -> false
+
+  | _ -> loop ntests
+
+let hash x =
+  let f = eval x in
+  truncate (f *. 65536.0)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/oracle.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/oracle.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val should_flip_sign : Expr.expr -> bool
+val likely_equal : Expr.expr -> Expr.expr -> bool
+val hash : Expr.expr -> int
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/schedule.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/schedule.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,236 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* This file contains the instruction scheduler, which finds an
+   efficient ordering for a given list of instructions.
+
+   The scheduler analyzes the DAG (directed acyclic graph) formed by
+   the instruction dependencies, and recursively partitions it.  The
+   resulting schedule data structure expresses a "good" ordering
+   and structure for the computation.
+
+   The scheduler makes use of utilties in Dag and other packages to
+   manipulate the Dag and the instruction list. *)
+
+open Dag
+(*************************************************
+ *               Dag scheduler
+ *************************************************)
+let to_assignment node = (Expr.Assign (node.assigned, node.expression))
+let makedag l = Dag.makedag 
+    (List.map (function Expr.Assign (v, x) -> (v, x)) l)
+
+let return x = x
+let has_color c n = (n.color = c)
+let set_color c n = (n.color <- c)
+let has_either_color c1 c2 n = (n.color = c1 || n.color = c2)
+
+let infinity = 100000 
+
+let cc dag inputs =
+  begin
+    Dag.for_all dag (fun node -> 
+      node.label <- infinity);
+    
+    (match inputs with 
+      a :: _ -> bfs dag a 0
+    | _ -> failwith "connected");
+
+    return
+      ((List.map to_assignment (List.filter (fun n -> n.label < infinity)
+				  (Dag.to_list dag))),
+       (List.map to_assignment (List.filter (fun n -> n.label == infinity) 
+				  (Dag.to_list dag))))
+  end
+
+let rec connected_components alist =
+  let dag = makedag alist in
+  let inputs = 
+    List.filter (fun node -> Util.null node.predecessors) 
+      (Dag.to_list dag) in
+  match cc dag inputs with
+    (a, []) -> [a]
+  | (a, b) -> a :: connected_components b
+
+let single_load node =
+  match (node.input_variables, node.predecessors) with
+    ([x], []) -> 
+      Variable.is_constant x ||
+      (!Magic.locations_are_special && Variable.is_locative x)
+  | _ -> false
+
+let loads_locative node =
+  match (node.input_variables, node.predecessors) with
+    | ([x], []) -> Variable.is_locative x
+    | _ -> false
+
+let partition alist =
+  let dag = makedag alist in
+  let dag' = Dag.to_list dag in
+  let inputs = 
+    List.filter (fun node -> Util.null node.predecessors) dag'
+  and outputs = 
+    List.filter (fun node -> Util.null node.successors) dag'
+  and special_inputs =  List.filter single_load dag' in
+  begin
+    
+    let c = match !Magic.schedule_type with
+	| 1 -> RED; (* all nodes in the input partition *)
+	| -1 -> BLUE; (* all nodes in the output partition *)
+	| _ -> BLACK; (* node color determined by bisection algorithm *)
+    in Dag.for_all dag (fun node -> node.color <- c);
+
+    Util.for_list inputs (set_color RED);
+
+    (*
+       The special inputs are those input nodes that load a single
+       location or twiddle factor.  Special inputs can end up either
+       in the blue or in the red part.  These inputs are special
+       because they inherit a color from their neighbors: If a red
+       node needs a special input, the special input becomes red, but
+       if all successors of a special input are blue, the special
+       input becomes blue.  Outputs are always blue, whether they be
+       special or not.
+
+       Because of the processing of special inputs, however, the final
+       partition might end up being composed only of blue nodes (which
+       is incorrect).  In this case we manually reset all inputs
+       (whether special or not) to be red.
+    *)
+
+    Util.for_list special_inputs (set_color YELLOW);
+
+    Util.for_list outputs (set_color BLUE);
+
+    let rec loopi donep = 
+      match (List.filter
+	       (fun node -> (has_color BLACK node) &&
+		 List.for_all (has_either_color RED YELLOW) node.predecessors)
+	       dag') with
+	[] -> if (donep) then () else loopo true
+      |	i -> 
+	  begin
+	    Util.for_list i (fun node -> 
+	      begin
+      		set_color RED node;
+		Util.for_list node.predecessors (set_color RED);
+	      end);
+	    loopo false; 
+	  end
+
+    and loopo donep =
+      match (List.filter
+	       (fun node -> (has_either_color BLACK YELLOW node) &&
+		 List.for_all (has_color BLUE) node.successors)
+	       dag') with
+	[] -> if (donep) then () else loopi true
+      |	o ->
+	  begin
+	    Util.for_list o (set_color BLUE);
+	    loopi false; 
+	  end
+
+    in loopi false;
+
+    (* fix the partition if it is incorrect *)
+    if not (List.exists (has_color RED) dag') then 
+	Util.for_list inputs (set_color RED);
+    
+    return
+      ((List.map to_assignment (List.filter (has_color RED) dag')),
+       (List.map to_assignment (List.filter (has_color BLUE) dag')))
+  end
+
+type schedule = 
+    Done
+  | Instr of Expr.assignment
+  | Seq of (schedule * schedule)
+  | Par of schedule list
+
+
+
+(* produce a sequential schedule determined by the user *)
+let rec sequentially = function
+    [] -> Done
+  | a :: b -> Seq (Instr a, sequentially b)
+
+let schedule =
+  let rec schedule_alist = function
+    | [] -> Done
+    | [a] -> Instr a
+    | alist -> match connected_components alist with
+	| ([a]) -> schedule_connected a
+	| l -> Par (List.map schedule_alist l)
+
+  and schedule_connected alist = 
+    match partition alist with
+    | (a, b) -> Seq (schedule_alist a, schedule_alist b)
+
+  in fun x ->
+    let () = Util.info "begin schedule" in
+    let res = schedule_alist x in
+    let () = Util.info "end schedule" in
+    res
+
+
+(* partition a dag into two parts:
+
+   1) the set of loads from locatives and their successors,
+   2) all other nodes
+
+   This step separates the ``body'' of the dag, which computes the
+   actual fft, from the ``precomputations'' part, which computes e.g.
+   twiddle factors.
+*)
+let partition_precomputations alist =
+  let dag = makedag alist in
+  let dag' = Dag.to_list dag in
+  let loads =  List.filter loads_locative dag' in
+    begin
+      
+      Dag.for_all dag (set_color BLUE);
+      Util.for_list loads (set_color RED);
+
+      let rec loop () = 
+	match (List.filter
+		 (fun node -> (has_color RED node) &&
+		    List.exists (has_color BLUE) node.successors)
+		 dag') with
+	    [] -> ()
+	  |	i -> 
+		  begin
+		    Util.for_list i 
+		      (fun node -> 
+			 Util.for_list node.successors (set_color RED));
+		    loop ()
+		  end
+
+      in loop ();
+
+	return
+	  ((List.map to_assignment (List.filter (has_color BLUE) dag')),
+	   (List.map to_assignment (List.filter (has_color RED) dag')))
+    end
+
+let isolate_precomputations_and_schedule alist =
+  let (a, b) = partition_precomputations alist in
+    Seq (schedule a, schedule b)
+  
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/schedule.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/schedule.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type schedule =
+  | Done
+  | Instr of Expr.assignment
+  | Seq of (schedule * schedule)
+  | Par of schedule list
+
+val schedule : Expr.assignment list -> schedule
+val sequentially : Expr.assignment list -> schedule
+val isolate_precomputations_and_schedule : Expr.assignment list -> schedule
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/simd.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/simd.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,226 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+open Expr
+open List
+open Printf
+open Variable
+open Annotate
+open Simdmagic
+open C
+
+let realtype = "V"
+let realtypep = realtype ^ " *"
+let constrealtype = "const " ^ realtype
+let constrealtypep = constrealtype ^ " *"
+let alignment_mod = 2
+
+(*
+ * SIMD C AST unparser 
+ *)
+let foldr_string_concat l = fold_right (^) l ""
+
+let rec unparse_by_twiddle nam tw src = 
+  sprintf "%s(&(%s),%s)" nam (Variable.unparse tw) (unparse_expr src)
+
+and unparse_store dst = function
+  | Times (NaN MULTI_A, x) ->
+      sprintf "STM%d(&(%s),%s,%s,&(%s));\n" 
+	!Simdmagic.store_multiple
+	(Variable.unparse dst) (unparse_expr x)
+	(Variable.vstride_of_locative dst)
+	(Variable.unparse_for_alignment alignment_mod dst)
+  | Times (NaN MULTI_B, Plus stuff) ->
+      sprintf "STN%d(&(%s)%s,%s);\n" 
+	!Simdmagic.store_multiple
+	(Variable.unparse dst) 
+	(List.fold_right (fun x a -> "," ^ (unparse_expr x) ^ a) stuff "")
+	(Variable.vstride_of_locative dst)
+  | src_expr -> 
+      sprintf "ST(&(%s),%s,%s,&(%s));\n" 
+	(Variable.unparse dst) (unparse_expr src_expr) 
+	(Variable.vstride_of_locative dst)
+	(Variable.unparse_for_alignment alignment_mod dst)
+
+and unparse_expr =
+  let rec unparse_plus = function
+    | [a] -> unparse_expr a
+
+    | (Uminus (Times (NaN I, b))) :: c :: d -> op2 "VFNMSI" [b] (c :: d)
+    | c :: (Uminus (Times (NaN I, b))) :: d -> op2 "VFNMSI" [b] (c :: d)
+    | (Uminus (Times (NaN CONJ, b))) :: c :: d -> op2 "VFNMSCONJ" [b] (c :: d)
+    | c :: (Uminus (Times (NaN CONJ, b))) :: d -> op2 "VFNMSCONJ" [b] (c :: d)
+    | (Times (NaN I, b)) :: c :: d -> op2 "VFMAI" [b] (c :: d)
+    | c :: (Times (NaN I, b)) :: d -> op2 "VFMAI" [b] (c :: d)
+    | (Times (NaN CONJ, b)) :: (Uminus c) :: d -> op2 "VFMSCONJ" [b] (c :: d)
+    | (Uminus c) :: (Times (NaN CONJ, b)) :: d -> op2 "VFMSCONJ" [b] (c :: d)
+    | (Times (NaN CONJ, b)) :: c :: d -> op2 "VFMACONJ" [b] (c :: d)
+    | c :: (Times (NaN CONJ, b)) :: d -> op2 "VFMACONJ" [b] (c :: d)
+    | (Times (NaN _, b)) :: (Uminus c) :: d -> failwith "VFMS NaN"
+    | (Uminus c) :: (Times (NaN _, b)) :: d -> failwith "VFMS NaN"
+
+    | (Uminus (Times (a, b))) :: c :: d -> op3 "VFNMS" a b (c :: d)
+    | c :: (Uminus (Times (a, b))) :: d -> op3 "VFNMS" a b (c :: d)
+    | (Times (a, b)) :: (Uminus c) :: d -> op3 "VFMS" a b (c :: negate d)
+    | (Uminus c) :: (Times (a, b)) :: d -> op3 "VFMS" a b (c :: negate d)
+    | (Times (a, b)) :: c :: d          -> op3 "VFMA" a b (c :: d)
+    | c :: (Times (a, b)) :: d          -> op3 "VFMA" a b (c :: d)
+
+    | (Uminus a :: b)                   -> op2 "VSUB" b [a]
+    | (b :: Uminus a :: c)              -> op2 "VSUB" (b :: c) [a]
+    | (a :: b)                          -> op2 "VADD" [a] b
+    | [] -> failwith "unparse_plus"
+  and op3 nam a b c =
+    nam ^ "(" ^ (unparse_expr a) ^ ", " ^ (unparse_expr b) ^ ", " ^
+    (unparse_plus c) ^ ")"
+  and op2 nam a b = 
+    nam ^ "(" ^ (unparse_plus a) ^ ", " ^ (unparse_plus b) ^ ")"
+  and op1 nam a = 
+    nam ^ "(" ^ (unparse_expr a) ^ ")"
+  and negate = function
+    | [] -> []
+    | (Uminus x) :: y -> x :: negate y
+    | x :: y -> (Uminus x) :: negate y
+
+  in function
+    | CTimes(Load tw, src) 
+	when Variable.is_constant tw && !Magic.generate_bytw ->
+	unparse_by_twiddle "BYTW" tw src
+    | CTimesJ(Load tw, src) 
+	when Variable.is_constant tw && !Magic.generate_bytw ->
+	unparse_by_twiddle "BYTWJ" tw src
+    | Load v when is_locative(v) ->
+	sprintf "LD(&(%s), %s, &(%s))" (Variable.unparse v) 
+	  (Variable.vstride_of_locative v)
+	  (Variable.unparse_for_alignment alignment_mod v)
+    | Load v when is_constant(v) -> sprintf "LDW(&(%s))" (Variable.unparse v)
+    | Load v  -> Variable.unparse v
+    | Num n -> sprintf "LDK(%s)" (Number.to_konst n)
+    | NaN n -> failwith "NaN in unparse_expr"
+    | Plus [] -> "0.0 /* bug */"
+    | Plus [a] -> " /* bug */ " ^ (unparse_expr a)
+    | Plus a -> unparse_plus a
+    | Times(NaN I,b) -> op1 "VBYI" b
+    | Times(NaN CONJ,b) -> op1 "VCONJ" b
+    | Times(a,b) ->
+	sprintf "VMUL(%s, %s)" (unparse_expr a) (unparse_expr b)
+    | CTimes(a,Times(NaN I, b)) ->
+	sprintf "VZMULI(%s, %s)" (unparse_expr a) (unparse_expr b)
+    | CTimes(a,b) ->
+	sprintf "VZMUL(%s, %s)" (unparse_expr a) (unparse_expr b)
+    | CTimesJ(a,Times(NaN I, b)) ->
+	sprintf "VZMULIJ(%s, %s)" (unparse_expr a) (unparse_expr b)
+    | CTimesJ(a,b) ->
+	sprintf "VZMULJ(%s, %s)" (unparse_expr a) (unparse_expr b)
+    | Uminus a when !Magic.vneg -> op1 "VNEG" a
+    | Uminus a -> failwith "SIMD Uminus"
+    | _ -> failwith "unparse_expr"
+
+and unparse_decl x = C.unparse_decl x
+
+and unparse_ast ast = 
+  let rec unparse_assignment = function
+    | Assign (v, x) when Variable.is_locative v ->
+	unparse_store v x
+    | Assign (v, x) -> 
+	(Variable.unparse v) ^ " = " ^ (unparse_expr x) ^ ";\n"
+
+  and unparse_annotated force_bracket = 
+    let rec unparse_code = function
+      | ADone -> ""
+      | AInstr i -> unparse_assignment i
+      | ASeq (a, b) -> 
+	  (unparse_annotated false a) ^ (unparse_annotated false b)
+    and declare_variables l = 
+      let rec uvar = function
+	  [] -> failwith "uvar"
+	|	[v] -> (Variable.unparse v) ^ ";\n"
+	| a :: b -> (Variable.unparse a) ^ ", " ^ (uvar b)
+      in let rec vvar l = 
+	let s = if !Magic.compact then 15 else 1 in
+	if (List.length l <= s) then
+	  match l with
+	    [] -> ""
+	  | _ -> realtype ^ " " ^ (uvar l)
+	else
+	  (vvar (Util.take s l)) ^ (vvar (Util.drop s l))
+      in vvar (List.filter Variable.is_temporary l)
+    in function
+        Annotate (_, _, decl, _, code) ->
+          if (not force_bracket) && (Util.null decl) then 
+            unparse_code code
+          else "{\n" ^
+            (declare_variables decl) ^
+            (unparse_code code) ^
+	    "}\n"
+
+(* ---- *)
+  and unparse_plus = function
+    | [] -> ""
+    | (CUminus a :: b) -> " - " ^ (parenthesize a) ^ (unparse_plus b)
+    | (a :: b) -> " + " ^ (parenthesize a) ^ (unparse_plus b)
+  and parenthesize x = match x with
+  | (CVar _) -> unparse_ast x
+  | (CCall _) -> unparse_ast x
+  | (Integer _) -> unparse_ast x
+  | _ -> "(" ^ (unparse_ast x) ^ ")"
+
+  in match ast with 
+  | Asch a -> (unparse_annotated true a)
+  | Return x -> "return " ^ unparse_ast x ^ ";"
+  | Simd_leavefun -> "VLEAVE();"
+  | For (a, b, c, d) ->
+      "for (" ^
+      unparse_ast a ^ "; " ^ unparse_ast b ^ "; " ^ unparse_ast c
+      ^ ")" ^ unparse_ast d
+  | If (a, d) ->
+      "if (" ^
+      unparse_ast a 
+      ^ ")" ^ unparse_ast d
+  | Block (d, s) ->
+      if (s == []) then ""
+      else 
+        "{\n"                                      ^ 
+        foldr_string_concat (map unparse_decl d)   ^ 
+        foldr_string_concat (map unparse_ast s)    ^
+        "}\n"      
+  | x -> C.unparse_ast x
+
+and unparse_function = function
+    Fcn (typ, name, args, body) ->
+      let rec unparse_args = function
+          [Decl (a, b)] -> a ^ " " ^ b 
+	| (Decl (a, b)) :: s -> a ^ " " ^ b  ^ ", "
+            ^  unparse_args s
+	| [] -> ""
+	| _ -> failwith "unparse_function"
+      in 
+      (typ ^ " " ^ name ^ "(" ^ unparse_args args ^ ")\n" ^
+       unparse_ast body)
+
+let extract_constants f =
+  let constlist = flatten (map expr_to_constants (C.ast_to_expr_list f))
+  in map
+       (fun n ->
+	  Tdecl 
+	    ("DVK(" ^ (Number.to_konst n) ^ ", " ^ (Number.to_string n) ^ 
+	       ");\n"))
+       (unique_constants constlist)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/simd.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/simd.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val unparse_function : C.c_fcn -> string
+val extract_constants : C.c_ast -> C.c_decl list
+val realtype : string
+val realtypep : string
+val constrealtype : string
+val constrealtypep : string
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/simdmagic.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/simdmagic.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* SIMD magic parameters *)
+let simd_mode = ref false
+let store_multiple = ref 1
+
+open Magic
+
+let speclist = [
+  "-simd", set_bool simd_mode, undocumented;
+  "-store-multiple", set_int store_multiple, undocumented;
+]
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/to_alist.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/to_alist.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,288 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(*************************************************************
+ * Conversion of the dag to an assignment list
+ *************************************************************)
+(*
+ * This function is messy.  The main problem is that we want to
+ * inline dag nodes conditionally, depending on how many times they
+ * are used.  The Right Thing to do would be to modify the
+ * state monad to propagate some of the state backwards, so that
+ * we know whether a given node will be used again in the future.
+ * This modification is trivial in a lazy language, but it is
+ * messy in a strict language like ML.  
+ *
+ * In this implementation, we just do the obvious thing, i.e., visit
+ * the dag twice, the first to count the node usages, and the second to
+ * produce the output.
+ *)
+
+open Monads.StateMonad
+open Monads.MemoMonad
+open Expr
+
+let fresh = Variable.make_temporary
+let node_insert x =  Assoctable.insert Expr.hash x
+let node_lookup x =  Assoctable.lookup Expr.hash (==) x
+let empty = Assoctable.empty
+
+let fetchAl = 
+  fetchState >>= (fun (al, _, _) -> returnM al)
+
+let storeAl al =
+  fetchState >>= (fun (_, visited, visited') ->
+    storeState (al, visited, visited'))
+
+let fetchVisited = fetchState >>= (fun (_, v, _) -> returnM v)
+
+let storeVisited visited =
+  fetchState >>= (fun (al, _, visited') ->
+    storeState (al, visited, visited'))
+
+let fetchVisited' = fetchState >>= (fun (_, _, v') -> returnM v')
+let storeVisited' visited' =
+  fetchState >>= (fun (al, visited, _) ->
+    storeState (al, visited, visited'))
+let lookupVisitedM' key =
+  fetchVisited' >>= fun table ->
+    returnM (node_lookup key table)
+let insertVisitedM' key value =
+  fetchVisited' >>= fun table ->
+    storeVisited' (node_insert key value table)
+
+let counting f x =
+  fetchVisited >>= (fun v ->
+    match node_lookup x v with
+      Some count -> 
+	let incr_cnt = 
+	  fetchVisited >>= (fun v' ->
+	    storeVisited (node_insert x (count + 1) v'))
+	in
+	begin
+	  match x with
+	    (* Uminus is always inlined.  Visit child *)
+	    Uminus y -> f y >> incr_cnt
+	  | _ -> incr_cnt
+	end
+    | None ->
+        f x >> fetchVisited >>= (fun v' ->
+            storeVisited (node_insert x 1 v')))
+
+let with_varM v x = 
+  fetchAl >>= (fun al -> storeAl ((v, x) :: al)) >> returnM (Load v)
+
+let inlineM = returnM
+
+let with_tempM x = match x with
+| Load v when Variable.is_temporary v -> inlineM x (* avoid trivial moves *)
+|  _ -> with_varM (fresh ()) x
+
+(* declare a temporary only if node is used more than once *)
+let with_temp_maybeM node x =
+  fetchVisited >>= (fun v ->
+    match node_lookup node v with
+      Some count -> 
+        if (count = 1 && !Magic.inline_single) then
+          inlineM x
+        else
+          with_tempM x
+    | None ->
+        failwith "with_temp_maybeM")
+type fma = 
+    NO_FMA
+  | FMA of expr * expr * expr   (* FMA (a, b, c) => a + b * c *)
+  | FMS of expr * expr * expr   (* FMS (a, b, c) => -a + b * c *)
+  | FNMS of expr * expr * expr  (* FNMS (a, b, c) => a - b * c *)
+
+let good_for_fma (a, b) = 
+  let good = function
+    | NaN I -> true
+    | NaN CONJ -> true
+    | NaN _ -> false
+    | Times(NaN _, _) -> false
+    | Times(_, NaN _) -> false
+    | _ -> true
+  in good a && good b
+
+let build_fma l = 
+  if (not !Magic.enable_fma) then NO_FMA
+  else match l with
+  | [a; Uminus (Times (b, c))] when good_for_fma (b, c) -> FNMS (a, b, c)
+  | [Uminus (Times (b, c)); a] when good_for_fma (b, c) -> FNMS (a, b, c)
+  | [Uminus a; Times (b, c)] when good_for_fma (b, c) -> FMS (a, b, c)
+  | [Times (b, c); Uminus a] when good_for_fma (b, c) -> FMS (a, b, c)
+  | [a; Times (b, c)] when good_for_fma (b, c) -> FMA (a, b, c)
+  | [Times (b, c); a] when good_for_fma (b, c) -> FMA (a, b, c)
+  | _ -> NO_FMA
+
+let children_fma l = match build_fma l with
+| FMA (a, b, c) -> Some (a, b, c)
+| FMS (a, b, c) -> Some (a, b, c)
+| FNMS (a, b, c) -> Some (a, b, c)
+| NO_FMA -> None
+
+
+let rec visitM x =
+  counting (function
+    | Load v -> returnM ()
+    | Num a -> returnM ()
+    | NaN a -> returnM ()
+    | Store (v, x) -> visitM x
+    | Plus a -> (match children_fma a with
+	None -> mapM visitM a >> returnM ()
+      | Some (a, b, c) -> 
+          (* visit fma's arguments twice to make sure they are not inlined *)
+	  visitM a >> visitM a >>
+	  visitM b >> visitM b >>
+	  visitM c >> visitM c)
+    | Times (a, b) -> visitM a >> visitM b
+    | CTimes (a, b) -> visitM a >> visitM b
+    | CTimesJ (a, b) -> visitM a >> visitM b
+    | Uminus a -> visitM a)
+    x
+
+let visit_rootsM = mapM visitM
+
+
+let rec expr_of_nodeM x =
+  memoizing lookupVisitedM' insertVisitedM'
+    (function x -> match x with
+    | Load v -> 
+	if (Variable.is_temporary v) then
+	  inlineM (Load v)
+	else if (Variable.is_locative v && !Magic.inline_loads) then
+          inlineM (Load v)
+        else if (Variable.is_constant v && !Magic.inline_loads_constants) then
+          inlineM (Load v)
+	else
+          with_tempM (Load v)
+    | Num a ->
+        if !Magic.inline_constants then
+          inlineM (Num a)
+	else
+          with_temp_maybeM x (Num a)
+    | NaN a -> inlineM (NaN a)
+    | Store (v, x) -> 
+        expr_of_nodeM x >>= 
+	(if !Magic.trivial_stores then with_tempM else inlineM) >>=
+        with_varM v 
+
+    | Plus a -> 
+	begin
+	  match build_fma a with
+	    FMA (a, b, c) ->	  
+	      expr_of_nodeM a >>= fun a' ->
+		expr_of_nodeM b >>= fun b' ->
+		  expr_of_nodeM c >>= fun c' ->
+		    with_temp_maybeM x (Plus [a'; Times (b', c')])
+	  | FMS (a, b, c) ->	  
+	      expr_of_nodeM a >>= fun a' ->
+		expr_of_nodeM b >>= fun b' ->
+		  expr_of_nodeM c >>= fun c' ->
+		    with_temp_maybeM x 
+		      (Plus [Times (b', c'); Uminus a'])
+	  | FNMS (a, b, c) ->	  
+	      expr_of_nodeM a >>= fun a' ->
+		expr_of_nodeM b >>= fun b' ->
+		  expr_of_nodeM c >>= fun c' ->
+		    with_temp_maybeM x 
+		      (Plus [a'; Uminus (Times (b', c'))])
+	  | NO_FMA ->
+              mapM expr_of_nodeM a >>= fun a' ->
+		with_temp_maybeM x (Plus a')
+	end
+    | CTimes (Load _ as a, b) when !Magic.generate_bytw ->
+        expr_of_nodeM b >>= fun b' ->
+          with_tempM (CTimes (a, b'))
+    | CTimes (a, b) ->
+        expr_of_nodeM a >>= fun a' ->
+          expr_of_nodeM b >>= fun b' ->
+            with_tempM (CTimes (a', b'))
+    | CTimesJ (Load _ as a, b) when !Magic.generate_bytw ->
+        expr_of_nodeM b >>= fun b' ->
+          with_tempM (CTimesJ (a, b'))
+    | CTimesJ (a, b) ->
+        expr_of_nodeM a >>= fun a' ->
+          expr_of_nodeM b >>= fun b' ->
+            with_tempM (CTimesJ (a', b'))
+    | Times (a, b) ->
+        expr_of_nodeM a >>= fun a' ->
+          expr_of_nodeM b >>= fun b' ->
+	    begin
+	      match a' with
+		Num a'' when !Magic.strength_reduce_mul && Number.is_two a'' ->
+		  (inlineM b' >>= fun b'' ->
+		    with_temp_maybeM x (Plus [b''; b'']))
+	      | _ -> with_temp_maybeM x (Times (a', b'))
+	    end
+    | Uminus a ->
+        expr_of_nodeM a >>= fun a' ->
+          inlineM (Uminus a'))
+    x
+
+let expr_of_rootsM = mapM expr_of_nodeM
+
+let peek_alistM roots =
+  visit_rootsM roots >> expr_of_rootsM roots >> fetchAl
+
+let wrap_assign (a, b) = Expr.Assign (a, b)
+
+let to_assignments dag =
+  let () = Util.info "begin to_alist" in
+  let al = List.rev (runM ([], empty, empty) peek_alistM dag) in
+  let res = List.map wrap_assign al in
+  let () = Util.info "end to_alist" in
+  res
+
+
+(* dump alist in `dot' format *)
+let dump print alist =
+  let vs v = "\"" ^ (Variable.unparse v) ^ "\"" in
+  begin
+    print "digraph G {\n";
+    print "\tsize=\"6,6\";\n";
+
+    (* all input nodes have the same rank *)
+    print "{ rank = same;\n";
+    List.iter (fun (Expr.Assign (v, x)) ->
+      List.iter (fun y -> 
+	if (Variable.is_locative y) then print("\t" ^ (vs y) ^ ";\n"))
+	(Expr.find_vars x))
+      alist;
+    print "}\n";
+
+    (* all output nodes have the same rank *)
+    print "{ rank = same;\n";
+    List.iter (fun (Expr.Assign (v, x)) ->
+      if (Variable.is_locative v) then print("\t" ^ (vs v) ^ ";\n"))
+      alist;
+    print "}\n";
+    
+    (* edges *)
+    List.iter (fun (Expr.Assign (v, x)) ->
+      List.iter (fun y -> print("\t" ^ (vs y) ^ " -> " ^ (vs v) ^ ";\n"))
+	(Expr.find_vars x))
+      alist;
+
+    print "}\n";
+  end
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/to_alist.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/to_alist.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val to_assignments : Expr.expr list -> Expr.assignment list
+val dump : (string -> unit) -> Expr.assignment list -> unit
+val good_for_fma : Expr.expr * Expr.expr -> bool
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/trig.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/trig.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,152 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* trigonometric transforms *)
+open Util
+
+(* DFT of real input *)
+let rdft sign n input =
+  Fft.dft sign n (Complex.real @@ input)
+
+(* DFT of hermitian input *)
+let hdft sign n input =
+  Fft.dft sign n (Complex.hermitian n input)
+
+(* DFT real transform of vectors of two real numbers,
+   multiplication by (NaN I), and summation *)
+let dft_via_rdft sign n input =
+  let f = rdft sign n input
+  in fun i -> 
+    Complex.plus
+      [Complex.real (f i); 
+       Complex.times (Complex.nan Expr.I) (Complex.imag (f i))]
+
+(* Discrete Hartley Transform *)
+let dht sign n input =
+  let f = Fft.dft sign n (Complex.real @@ input) in
+  (fun i ->
+    Complex.plus [Complex.real (f i); Complex.imag (f i)])
+
+let trigI n input = 
+  let twon = 2 * n in
+  let input' = Complex.hermitian twon input
+  in
+  Fft.dft 1 twon input'
+
+let interleave_zero input = fun i -> 
+  if (i mod 2) == 0
+      then Complex.zero
+  else
+    input ((i - 1) / 2)
+
+let trigII n input =
+  let fourn = 4 * n in
+  let input' = Complex.hermitian fourn (interleave_zero input)
+  in
+  Fft.dft 1 fourn input'
+
+let trigIII n input =
+  let fourn = 4 * n in
+  let twon = 2 * n in
+  let input' = Complex.hermitian fourn
+      (fun i -> 
+	if (i == 0) then
+	  Complex.real (input 0)
+	else if (i == twon) then
+	  Complex.uminus (Complex.real (input 0))
+	else
+	  Complex.antihermitian twon input i)
+  in
+  let dft = Fft.dft 1 fourn input'
+  in fun k -> dft (2 * k + 1)
+
+let zero_extend n input = fun i ->
+  if (i >= 0 && i < n)
+  then input i
+  else Complex.zero
+
+let trigIV n input =
+  let fourn = 4 * n
+  and eightn = 8 * n in
+  let input' = Complex.hermitian eightn 
+      (zero_extend fourn (Complex.antihermitian fourn 
+			 (interleave_zero input)))
+  in
+  let dft = Fft.dft 1 eightn input'
+  in fun k -> dft (2 * k + 1)
+
+let make_dct scale nshift trig =
+  fun n input ->
+    trig (n - nshift) (Complex.real @@ (Complex.times scale) @@ 
+		       (zero_extend n input))
+(*
+ * DCT-I:  y[k] = sum x[j] cos(pi * j * k / n)
+ *)
+let dctI = make_dct Complex.one 1 trigI
+
+(*
+ * DCT-II:  y[k] = sum x[j] cos(pi * (j + 1/2) * k / n)
+ *)
+let dctII = make_dct Complex.one 0 trigII
+
+(*
+ * DCT-III:  y[k] = sum x[j] cos(pi * j * (k + 1/2) / n)
+ *)
+let dctIII = make_dct Complex.half 0 trigIII
+
+(*
+ * DCT-IV  y[k] = sum x[j] cos(pi * (j + 1/2) * (k + 1/2) / n)
+ *)
+let dctIV = make_dct Complex.half 0 trigIV
+
+let shift s input = fun i -> input (i - s)
+
+(* DST-x input := TRIG-x (input / i) *)
+let make_dst scale nshift kshift jshift trig =
+  fun n input ->
+    Complex.real @@ 
+    (shift (- jshift)
+       (trig (n + nshift) (Complex.uminus @@
+			   (Complex.times Complex.i) @@
+			   (Complex.times scale) @@ 
+			   Complex.real @@ 
+			   (shift kshift (zero_extend n input)))))
+
+(*
+ * DST-I:  y[k] = sum x[j] sin(pi * j * k / n)
+ *)
+let dstI = make_dst Complex.one 1 1 1 trigI
+
+(*
+ * DST-II:  y[k] = sum x[j] sin(pi * (j + 1/2) * k / n)
+ *)
+let dstII = make_dst Complex.one 0 0 1 trigII
+
+(*
+ * DST-III:  y[k] = sum x[j] sin(pi * j * (k + 1/2) / n)
+ *)
+let dstIII = make_dst Complex.half 0 1 0 trigIII
+
+(*
+ * DST-IV  y[k] = sum x[j] sin(pi * (j + 1/2) * (k + 1/2) / n)
+ *)
+let dstIV = make_dst Complex.half 0 0 0 trigIV
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/trig.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/trig.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val rdft : int -> int -> Complex.signal -> Complex.signal
+val hdft : int -> int -> Complex.signal -> Complex.signal
+val dft_via_rdft : int -> int -> Complex.signal -> Complex.signal
+val dht : int -> int -> Complex.signal -> Complex.signal
+
+val dctI : int -> Complex.signal -> Complex.signal
+val dctII : int -> Complex.signal -> Complex.signal
+val dctIII : int -> Complex.signal -> Complex.signal
+val dctIV : int -> Complex.signal -> Complex.signal
+
+val dstI : int -> Complex.signal -> Complex.signal
+val dstII : int -> Complex.signal -> Complex.signal
+val dstIII : int -> Complex.signal -> Complex.signal
+val dstIV : int -> Complex.signal -> Complex.signal
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/twiddle.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/twiddle.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,188 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* policies for loading/computing twiddle factors *)
+open Complex
+open Util
+
+type twop = TW_FULL | TW_CEXP | TW_NEXT
+
+let optostring = function
+  | TW_CEXP -> "TW_CEXP"
+  | TW_NEXT -> "TW_NEXT"
+  | TW_FULL -> "TW_FULL"
+
+type twinstr = (twop * int * int)
+
+let rec unroll_twfull l = match l with
+| [] -> []
+| (TW_FULL, v, n) :: b ->
+    (forall [] cons 1 n (fun i -> (TW_CEXP, v, i)))
+    @ unroll_twfull b
+| a :: b -> a :: unroll_twfull b
+
+let twinstr_to_c_string l =
+  let one (op, a, b) = Printf.sprintf "{ %s, %d, %d }" (optostring op) a b
+  in let rec loop first = function
+    | [] -> ""
+    | a :: b ->  (if first then "\n" else ",\n") ^ (one a) ^ (loop false b)
+  in "{" ^ (loop true l) ^ "}"
+
+let twinstr_to_simd_string vl l =
+  let one sep = function
+    | (TW_NEXT, 1, 0) -> sep ^ "{TW_NEXT, " ^ vl ^ ", 0}"
+    | (TW_NEXT, _, _) -> failwith "twinstr_to_simd_string"
+    | (TW_CEXP, v, b) -> sep ^ (Printf.sprintf "VTW(%d,%d)" v b)
+    | _ -> failwith "twinstr_to_simd_string"
+  in let rec loop first = function
+    | [] -> ""
+    | a :: b ->  (one (if first then "\n" else ",\n") a) ^ (loop false b)
+  in "{" ^ (loop true (unroll_twfull l)) ^ "}"
+  
+let rec pow m n =
+  if (n = 0) then 1
+  else m * pow m (n - 1)
+
+let rec is_pow m n =
+  n = 1 || ((n mod m) = 0 && is_pow m (n / m))
+
+let rec log m n = if n = 1 then 0 else 1 + log m (n / m)
+
+let rec largest_power_smaller_than m i =
+  if (is_pow m i) then i
+  else largest_power_smaller_than m (i - 1)
+
+let rec smallest_power_larger_than m i =
+  if (is_pow m i) then i
+  else smallest_power_larger_than m (i + 1)
+
+let rec_array n f =
+  let g = ref (fun i -> Complex.zero) in
+  let a = Array.init n (fun i -> lazy (!g i)) in
+  let h i = f (fun i -> Lazy.force a.(i)) i in
+  begin
+    g := h;
+    h
+  end
+
+ 
+let ctimes use_complex_arith a b =
+  if use_complex_arith then
+    Complex.ctimes a b
+  else
+    Complex.times a b
+
+let ctimesj use_complex_arith a b =
+  if use_complex_arith then
+    Complex.ctimesj a b
+  else
+    Complex.times (Complex.conj a) b
+
+let make_bytwiddle sign use_complex_arith g f i =
+  if i = 0 then 
+    f i
+  else if sign = 1 then 
+    ctimes use_complex_arith (g i) (f i)
+  else
+    ctimesj use_complex_arith (g i) (f i)
+
+(* various policies for computing/loading twiddle factors *)
+
+let twiddle_policy_load_all v use_complex_arith =
+  let bytwiddle n sign w f =
+    make_bytwiddle sign use_complex_arith (fun i -> w (i - 1)) f
+  and twidlen n = 2 * (n - 1)
+  and twdesc r = [(TW_FULL, v, r);(TW_NEXT, 1, 0)]
+  in bytwiddle, twidlen, twdesc
+
+(*
+ * if i is a power of two, then load w (log i)
+ * else let x = largest power of 2 less than i in
+ *      let y = i - x in
+ *      compute w^{x+y} = w^x * w^y
+ *)
+let twiddle_policy_log2 v use_complex_arith =
+  let bytwiddle n sign w f =
+    let g = rec_array n (fun self i ->
+      if i = 0 then Complex.one
+      else if is_pow 2 i then w (log 2 i)
+      else let x = largest_power_smaller_than 2 i in
+      let y = i - x in
+	ctimes use_complex_arith (self x) (self y))
+    in make_bytwiddle sign use_complex_arith g f
+  and twidlen n = 2 * (log 2 (largest_power_smaller_than 2 (2 * n - 1)))
+  and twdesc n =
+    (List.flatten 
+       (List.map 
+	  (fun i -> 
+	    if i > 0 && is_pow 2 i then 
+	      [TW_CEXP, v, i] 
+	    else 
+	      [])
+	  (iota n)))
+    @ [(TW_NEXT, 1, 0)]
+  in bytwiddle, twidlen, twdesc
+
+let twiddle_policy_log3 v use_complex_arith =
+  let rec terms_needed i pi s n =
+    if (s >= n - 1) then i
+    else terms_needed (i + 1) (3 * pi) (s + pi) n
+  in
+  let rec bytwiddle n sign w f =
+    let nterms = terms_needed 0 1 0 n in
+    let maxterm = pow 3 (nterms - 1) in
+    let g = rec_array (3 * n) (fun self i ->
+      if i = 0 then Complex.one
+      else if is_pow 3 i then w (log 3 i)
+      else if i = (n - 1) && maxterm >= n then
+	w (nterms - 1)
+      else let x = smallest_power_larger_than 3 i in
+      if (i + i >= x) then
+	let x = min x (n - 1) in
+	  ctimesj use_complex_arith (self (x - i)) (self x)
+      else let x = largest_power_smaller_than 3 i in
+	ctimes use_complex_arith (self (i - x)) (self x))
+    in make_bytwiddle sign use_complex_arith g f
+  and twidlen n = 2 * (terms_needed 0 1 0 n)
+  and twdesc n =
+    (List.map 
+       (fun i -> 
+	  let x = min (pow 3 i) (n - 1) in
+	    TW_CEXP, v, x)
+       (iota ((twidlen n) / 2)))
+    @ [(TW_NEXT, 1, 0)]
+  in bytwiddle, twidlen, twdesc
+    
+let current_twiddle_policy = ref twiddle_policy_load_all
+
+let twiddle_policy use_complex_arith = 
+  !current_twiddle_policy use_complex_arith
+
+let set_policy x = Arg.Unit (fun () -> current_twiddle_policy := x)
+let set_policy_int x = Arg.Int (fun i -> current_twiddle_policy := x i)
+
+let undocumented = " Undocumented twiddle policy"
+
+let speclist = [
+  "-twiddle-load-all", set_policy twiddle_policy_load_all, undocumented;
+  "-twiddle-log2", set_policy twiddle_policy_log2, undocumented;
+  "-twiddle-log3", set_policy twiddle_policy_log3, undocumented;
+] 
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/twiddle.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/twiddle.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val speclist : (string * Arg.spec * string) list
+
+type twinstr
+
+val twiddle_policy :
+  int -> bool ->
+  (int -> int -> (int -> Complex.expr) -> (int -> Complex.expr) ->
+     int -> Complex.expr) *(int -> int) * (int -> twinstr list)
+
+val twinstr_to_c_string : twinstr list -> string
+val twinstr_to_simd_string : string -> twinstr list -> string
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/unique.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/unique.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,38 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* repository of unique tokens *)
+
+type unique = Unique of unit
+
+(* this depends on the compiler not being too smart *)
+let make () =
+  let make_aux x = Unique x in
+  make_aux ()
+
+(* note that the obvious definition
+
+      let make () = Unique ()
+
+   fails *)
+
+let same (a : unique) (b : unique) =
+  (a == b)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/unique.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/unique.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type unique
+val make : unit -> unique
+val same : unique -> unique -> bool
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/util.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/util.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+(* various utility functions *)
+open List
+open Unix 
+
+(*****************************************
+ * Integer operations
+ *****************************************)
+(* fint the inverse of n modulo m *)
+let invmod n m =
+    let rec loop i =
+	if ((i * n) mod m == 1) then i
+	else loop (i + 1)
+    in
+	loop 1
+
+(* Yooklid's algorithm *)
+let rec gcd n m =
+    if (n > m)
+      then gcd m n
+    else
+      let r = m mod n
+      in
+	  if (r == 0) then n
+	  else gcd r n
+
+(* reduce the fraction m/n to lowest terms, modulo factors of n/n *)
+let lowest_terms n m =
+    if (m mod n == 0) then
+      (1,0)
+    else
+      let nn = (abs n) in let mm = m * (n / nn)
+      in let mpos = 
+	  if (mm > 0) then (mm mod nn)
+	  else (mm + (1 + (abs mm) / nn) * nn) mod nn
+      and d = gcd nn (abs mm)
+      in (nn / d, mpos / d)
+
+(* find a generator for the multiplicative group mod p
+   (where p must be prime for a generator to exist!!) *)
+
+exception No_Generator
+
+let find_generator p =
+    let rec period x prod =
+ 	if (prod == 1) then 1
+	else 1 + (period x (prod * x mod p))
+    in let rec findgen x =
+	if (x == 0) then raise No_Generator
+	else if ((period x x) == (p - 1)) then x
+	else findgen ((x + 1) mod p)
+    in findgen 1
+
+(* raise x to a power n modulo p (requires n > 0) (in principle,
+   negative powers would be fine, provided that x and p are relatively
+   prime...we don't need this functionality, though) *)
+
+exception Negative_Power
+
+let rec pow_mod x n p =
+    if (n == 0) then 1
+    else if (n < 0) then raise Negative_Power
+    else if (n mod 2 == 0) then pow_mod (x * x mod p) (n / 2) p
+    else x * (pow_mod x (n - 1) p) mod p
+
+(******************************************
+ * auxiliary functions 
+ ******************************************)
+let rec forall id combiner a b f =
+    if (a >= b) then id
+    else combiner (f a) (forall id combiner (a + 1) b f)
+
+let sum_list l = fold_right (+) l 0
+let max_list l = fold_right (max) l (-999999)
+let min_list l = fold_right (min) l 999999
+let count pred = fold_left 
+    (fun a elem -> if (pred elem) then 1 + a else a) 0
+let remove elem = List.filter (fun e -> (e != elem))
+let cons a b = a :: b
+let null = function 
+    [] -> true
+  | _ -> false
+let for_list l f = List.iter f l
+let rmap l f = List.map f l
+
+(* functional composition *)
+let (@@) f g x = f (g x)
+
+let forall_flat a b = forall [] (@) a b
+
+let identity x = x
+
+let rec minimize f = function
+    [] -> None
+  | elem :: rest ->
+      match minimize f rest with
+	None -> Some elem
+      |	Some x -> if (f x) >= (f elem) then Some elem else Some x
+
+
+let rec find_elem condition = function
+    [] -> None
+  | elem :: rest ->
+      if condition elem then
+	Some elem
+      else
+	find_elem condition rest
+
+
+(* find x, x >= a, such that (p x) is true *)
+let rec suchthat a pred =
+  if (pred a) then a else suchthat (a + 1) pred
+
+(* print an information message *)
+let info string =
+  if !Magic.verbose then begin
+    let now = Unix.times () 
+    and pid = Unix.getpid () in
+    prerr_string ((string_of_int pid) ^ ": " ^
+		  "at t = " ^  (string_of_float now.tms_utime) ^ " : ");
+    prerr_string (string ^ "\n");
+    flush Pervasives.stderr;
+  end
+
+(* iota n produces the list [0; 1; ...; n - 1] *)
+let iota n = forall [] cons 0 n identity
+
+(* interval a b produces the list [a; 1; ...; b - 1] *)
+let interval a b = List.map ((+) a) (iota (b - a))
+
+(*
+ * freeze a function, i.e., compute it only once on demand, and
+ * cache it into an array.
+ *)
+let array n f =
+  let a = Array.init n (fun i -> lazy (f i))
+  in fun i -> Lazy.force a.(i)
+
+
+let rec take n l =
+  match (n, l) with
+    (0, _) -> []
+  | (n, (a :: b)) -> a :: (take (n - 1) b)
+  | _ -> failwith "take"
+
+let rec drop n l =
+  match (n, l) with
+    (0, _) -> l
+  | (n, (_ :: b)) -> drop (n - 1) b
+  | _ -> failwith "drop"
+
+
+let either a b =
+  match a with
+    Some x -> x
+  | _ -> b
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/util.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/util.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,49 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+val invmod : int -> int -> int
+val gcd : int -> int -> int
+val lowest_terms : int -> int -> int * int
+val find_generator : int -> int
+val pow_mod : int -> int -> int -> int
+val forall : 'a -> ('b -> 'a -> 'a) -> int -> int -> (int -> 'b) -> 'a
+val sum_list : int list -> int
+val max_list : int list -> int
+val min_list : int list -> int
+val count : ('a -> bool) -> 'a list -> int
+val remove : 'a -> 'a list -> 'a list
+val for_list : 'a list -> ('a -> unit) -> unit
+val rmap : 'a list -> ('a -> 'b) -> 'b list
+val cons : 'a -> 'a list -> 'a list
+val null : 'a list -> bool
+val (@@) : ('a -> 'b) -> ('c -> 'a) -> 'c -> 'b
+val forall_flat : int -> int -> (int -> 'a list) -> 'a list
+val identity : 'a -> 'a
+val minimize : ('a -> 'b) -> 'a list -> 'a option
+val find_elem : ('a -> bool) -> 'a list -> 'a option
+val suchthat : int -> (int -> bool) -> int
+val info : string -> unit
+val iota : int -> int list
+val interval : int -> int -> int list
+val array : int -> (int -> 'a) -> int -> 'a
+val take : int -> 'a list -> 'a list
+val drop : int -> 'a list -> 'a list
+val either : 'a option -> 'a -> 'a
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/variable.ml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/variable.ml	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,108 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type variable = 
+      (* temporary variables generated automatically *)
+  | Temporary of int
+      (* memory locations, e.g., array elements *)
+  | Locative of (Unique.unique * Unique.unique *
+		   (int -> string) * int * string)
+      (* constant values, e.g., twiddle factors *)
+  | Constant of (Unique.unique * string)
+
+let hash v = Hashtbl.hash v
+
+let same a b = (a == b)
+
+let is_constant = function
+  | Constant _ -> true
+  | _ -> false
+
+let is_temporary = function
+  | Temporary _ -> true
+  | _ -> false
+
+let is_locative = function
+  | Locative _ -> true
+  | _ -> false
+
+let same_location a b = 
+  match (a, b) with
+  | (Locative (location_a, _, _, _, _), Locative (location_b, _, _, _, _)) ->
+      Unique.same location_a location_b
+  | _ -> false
+
+let same_class a b = 
+  match (a, b) with
+  | (Locative (_, class_a, _, _, _), Locative (_, class_b, _, _, _)) ->
+      Unique.same class_a class_b
+  | (Constant (class_a, _), Constant (class_b, _)) ->
+      Unique.same class_a class_b
+  | _ -> false
+
+let make_temporary =
+  let tmp_count = ref 0
+  in fun () -> begin
+    tmp_count := !tmp_count + 1;
+    Temporary !tmp_count
+  end
+
+let make_constant class_token name = 
+  Constant (class_token, name)
+
+let make_locative location_token class_token name i vs =
+  Locative (location_token, class_token, name, i, vs)
+
+let vstride_of_locative = function
+  | Locative (_, _, _, _, vs) -> vs
+  | _ -> failwith "vstride_of_locative"
+
+(* special naming conventions for variables *)
+let rec base62_of_int k = 
+  let x = k mod 62 
+  and y = k / 62 in
+  let c = 
+    if x < 10 then 
+      Char.chr (x + Char.code '0')
+    else if x < 36 then
+      Char.chr (x + Char.code 'a' - 10)
+    else 
+      Char.chr (x + Char.code 'A' - 36)
+  in
+  let s = String.make 1 c in
+  let r = if y == 0 then "" else base62_of_int y in
+  r ^ s
+
+let varname_of_int k =
+  if !Magic.compact then
+    base62_of_int k
+  else
+    string_of_int k
+
+let unparse = function
+  | Temporary k -> "T" ^ (varname_of_int k)
+  | Constant (_, name) -> name
+  | Locative (_, _, name, i, _) -> name i
+
+let unparse_for_alignment m = function
+  | Locative (_, _, name, i, _) -> name (i mod m)
+  | _ -> failwith "unparse_for_alignment"
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/genfft/variable.mli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/genfft/variable.mli	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,38 @@
+(*
+ * Copyright (c) 1997-1999 Massachusetts Institute of Technology
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *)
+
+type variable
+
+val hash : variable -> int
+val same : variable -> variable -> bool
+val is_constant : variable -> bool
+val is_temporary : variable -> bool
+val is_locative : variable -> bool
+val same_location : variable -> variable -> bool
+val same_class : variable -> variable -> bool
+val make_temporary : unit -> variable
+val make_constant : Unique.unique -> string -> variable
+val make_locative :
+  Unique.unique -> Unique.unique -> (int -> string) -> 
+  int -> string -> variable
+val unparse : variable -> string
+val unparse_for_alignment : int -> variable -> string
+val vstride_of_locative : variable -> string
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/install-sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/install-sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,527 @@
+#!/bin/sh
+# install - install a program, script, or datafile
+
+scriptversion=2011-01-19.21; # UTC
+
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
+#
+# Copyright (C) 1994 X Consortium
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+
+nl='
+'
+IFS=" ""	$nl"
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit=${DOITPROG-}
+if test -z "$doit"; then
+  doit_exec=exec
+else
+  doit_exec=$doit
+fi
+
+# Put in absolute file names if you don't have them in your path;
+# or use environment vars.
+
+chgrpprog=${CHGRPPROG-chgrp}
+chmodprog=${CHMODPROG-chmod}
+chownprog=${CHOWNPROG-chown}
+cmpprog=${CMPPROG-cmp}
+cpprog=${CPPROG-cp}
+mkdirprog=${MKDIRPROG-mkdir}
+mvprog=${MVPROG-mv}
+rmprog=${RMPROG-rm}
+stripprog=${STRIPPROG-strip}
+
+posix_glob='?'
+initialize_posix_glob='
+  test "$posix_glob" != "?" || {
+    if (set -f) 2>/dev/null; then
+      posix_glob=
+    else
+      posix_glob=:
+    fi
+  }
+'
+
+posix_mkdir=
+
+# Desired mode of installed file.
+mode=0755
+
+chgrpcmd=
+chmodcmd=$chmodprog
+chowncmd=
+mvcmd=$mvprog
+rmcmd="$rmprog -f"
+stripcmd=
+
+src=
+dst=
+dir_arg=
+dst_arg=
+
+copy_on_change=false
+no_target_directory=
+
+usage="\
+Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+   or: $0 [OPTION]... SRCFILES... DIRECTORY
+   or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+   or: $0 [OPTION]... -d DIRECTORIES...
+
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.
+
+Options:
+     --help     display this help and exit.
+     --version  display version info and exit.
+
+  -c            (ignored)
+  -C            install only if different (preserve the last data modification time)
+  -d            create directories instead of installing files.
+  -g GROUP      $chgrpprog installed files to GROUP.
+  -m MODE       $chmodprog installed files to MODE.
+  -o USER       $chownprog installed files to USER.
+  -s            $stripprog installed files.
+  -t DIRECTORY  install into DIRECTORY.
+  -T            report an error if DSTFILE is a directory.
+
+Environment variables override the default commands:
+  CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG
+  RMPROG STRIPPROG
+"
+
+while test $# -ne 0; do
+  case $1 in
+    -c) ;;
+
+    -C) copy_on_change=true;;
+
+    -d) dir_arg=true;;
+
+    -g) chgrpcmd="$chgrpprog $2"
+	shift;;
+
+    --help) echo "$usage"; exit $?;;
+
+    -m) mode=$2
+	case $mode in
+	  *' '* | *'	'* | *'
+'*	  | *'*'* | *'?'* | *'['*)
+	    echo "$0: invalid mode: $mode" >&2
+	    exit 1;;
+	esac
+	shift;;
+
+    -o) chowncmd="$chownprog $2"
+	shift;;
+
+    -s) stripcmd=$stripprog;;
+
+    -t) dst_arg=$2
+	# Protect names problematic for `test' and other utilities.
+	case $dst_arg in
+	  -* | [=\(\)!]) dst_arg=./$dst_arg;;
+	esac
+	shift;;
+
+    -T) no_target_directory=true;;
+
+    --version) echo "$0 $scriptversion"; exit $?;;
+
+    --)	shift
+	break;;
+
+    -*)	echo "$0: invalid option: $1" >&2
+	exit 1;;
+
+    *)  break;;
+  esac
+  shift
+done
+
+if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
+  # When -d is used, all remaining arguments are directories to create.
+  # When -t is used, the destination is already specified.
+  # Otherwise, the last argument is the destination.  Remove it from $@.
+  for arg
+  do
+    if test -n "$dst_arg"; then
+      # $@ is not empty: it contains at least $arg.
+      set fnord "$@" "$dst_arg"
+      shift # fnord
+    fi
+    shift # arg
+    dst_arg=$arg
+    # Protect names problematic for `test' and other utilities.
+    case $dst_arg in
+      -* | [=\(\)!]) dst_arg=./$dst_arg;;
+    esac
+  done
+fi
+
+if test $# -eq 0; then
+  if test -z "$dir_arg"; then
+    echo "$0: no input file specified." >&2
+    exit 1
+  fi
+  # It's OK to call `install-sh -d' without argument.
+  # This can happen when creating conditional directories.
+  exit 0
+fi
+
+if test -z "$dir_arg"; then
+  do_exit='(exit $ret); exit $ret'
+  trap "ret=129; $do_exit" 1
+  trap "ret=130; $do_exit" 2
+  trap "ret=141; $do_exit" 13
+  trap "ret=143; $do_exit" 15
+
+  # Set umask so as not to create temps with too-generous modes.
+  # However, 'strip' requires both read and write access to temps.
+  case $mode in
+    # Optimize common cases.
+    *644) cp_umask=133;;
+    *755) cp_umask=22;;
+
+    *[0-7])
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw='% 200'
+      fi
+      cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
+    *)
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw=,u+rw
+      fi
+      cp_umask=$mode$u_plus_rw;;
+  esac
+fi
+
+for src
+do
+  # Protect names problematic for `test' and other utilities.
+  case $src in
+    -* | [=\(\)!]) src=./$src;;
+  esac
+
+  if test -n "$dir_arg"; then
+    dst=$src
+    dstdir=$dst
+    test -d "$dstdir"
+    dstdir_status=$?
+  else
+
+    # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+    # might cause directories to be created, which would be especially bad
+    # if $src (and thus $dsttmp) contains '*'.
+    if test ! -f "$src" && test ! -d "$src"; then
+      echo "$0: $src does not exist." >&2
+      exit 1
+    fi
+
+    if test -z "$dst_arg"; then
+      echo "$0: no destination specified." >&2
+      exit 1
+    fi
+    dst=$dst_arg
+
+    # If destination is a directory, append the input filename; won't work
+    # if double slashes aren't ignored.
+    if test -d "$dst"; then
+      if test -n "$no_target_directory"; then
+	echo "$0: $dst_arg: Is a directory" >&2
+	exit 1
+      fi
+      dstdir=$dst
+      dst=$dstdir/`basename "$src"`
+      dstdir_status=0
+    else
+      # Prefer dirname, but fall back on a substitute if dirname fails.
+      dstdir=`
+	(dirname "$dst") 2>/dev/null ||
+	expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	     X"$dst" : 'X\(//\)[^/]' \| \
+	     X"$dst" : 'X\(//\)$' \| \
+	     X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
+	echo X"$dst" |
+	    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)[^/].*/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\).*/{
+		   s//\1/
+		   q
+		 }
+		 s/.*/./; q'
+      `
+
+      test -d "$dstdir"
+      dstdir_status=$?
+    fi
+  fi
+
+  obsolete_mkdir_used=false
+
+  if test $dstdir_status != 0; then
+    case $posix_mkdir in
+      '')
+	# Create intermediate dirs using mode 755 as modified by the umask.
+	# This is like FreeBSD 'install' as of 1997-10-28.
+	umask=`umask`
+	case $stripcmd.$umask in
+	  # Optimize common cases.
+	  *[2367][2367]) mkdir_umask=$umask;;
+	  .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+
+	  *[0-7])
+	    mkdir_umask=`expr $umask + 22 \
+	      - $umask % 100 % 40 + $umask % 20 \
+	      - $umask % 10 % 4 + $umask % 2
+	    `;;
+	  *) mkdir_umask=$umask,go-w;;
+	esac
+
+	# With -d, create the new directory with the user-specified mode.
+	# Otherwise, rely on $mkdir_umask.
+	if test -n "$dir_arg"; then
+	  mkdir_mode=-m$mode
+	else
+	  mkdir_mode=
+	fi
+
+	posix_mkdir=false
+	case $umask in
+	  *[123567][0-7][0-7])
+	    # POSIX mkdir -p sets u+wx bits regardless of umask, which
+	    # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+	    ;;
+	  *)
+	    tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+	    trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+
+	    if (umask $mkdir_umask &&
+		exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+	    then
+	      if test -z "$dir_arg" || {
+		   # Check for POSIX incompatibilities with -m.
+		   # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+		   # other-writeable bit of parent directory when it shouldn't.
+		   # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+		   ls_ld_tmpdir=`ls -ld "$tmpdir"`
+		   case $ls_ld_tmpdir in
+		     d????-?r-*) different_mode=700;;
+		     d????-?--*) different_mode=755;;
+		     *) false;;
+		   esac &&
+		   $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+		     ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+		     test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+		   }
+		 }
+	      then posix_mkdir=:
+	      fi
+	      rmdir "$tmpdir/d" "$tmpdir"
+	    else
+	      # Remove any dirs left behind by ancient mkdir implementations.
+	      rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+	    fi
+	    trap '' 0;;
+	esac;;
+    esac
+
+    if
+      $posix_mkdir && (
+	umask $mkdir_umask &&
+	$doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+      )
+    then :
+    else
+
+      # The umask is ridiculous, or mkdir does not conform to POSIX,
+      # or it failed possibly due to a race condition.  Create the
+      # directory the slow way, step by step, checking for races as we go.
+
+      case $dstdir in
+	/*) prefix='/';;
+	[-=\(\)!]*) prefix='./';;
+	*)  prefix='';;
+      esac
+
+      eval "$initialize_posix_glob"
+
+      oIFS=$IFS
+      IFS=/
+      $posix_glob set -f
+      set fnord $dstdir
+      shift
+      $posix_glob set +f
+      IFS=$oIFS
+
+      prefixes=
+
+      for d
+      do
+	test X"$d" = X && continue
+
+	prefix=$prefix$d
+	if test -d "$prefix"; then
+	  prefixes=
+	else
+	  if $posix_mkdir; then
+	    (umask=$mkdir_umask &&
+	     $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+	    # Don't fail if two instances are running concurrently.
+	    test -d "$prefix" || exit 1
+	  else
+	    case $prefix in
+	      *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+	      *) qprefix=$prefix;;
+	    esac
+	    prefixes="$prefixes '$qprefix'"
+	  fi
+	fi
+	prefix=$prefix/
+      done
+
+      if test -n "$prefixes"; then
+	# Don't fail if two instances are running concurrently.
+	(umask $mkdir_umask &&
+	 eval "\$doit_exec \$mkdirprog $prefixes") ||
+	  test -d "$dstdir" || exit 1
+	obsolete_mkdir_used=true
+      fi
+    fi
+  fi
+
+  if test -n "$dir_arg"; then
+    { test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
+    { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
+    { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
+      test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
+  else
+
+    # Make a couple of temp file names in the proper directory.
+    dsttmp=$dstdir/_inst.$$_
+    rmtmp=$dstdir/_rm.$$_
+
+    # Trap to clean up those temp files at exit.
+    trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+
+    # Copy the file name to the temp name.
+    (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
+
+    # and set any options; do chmod last to preserve setuid bits.
+    #
+    # If any of these fail, we abort the whole thing.  If we want to
+    # ignore errors from any of these, just make sure not to ignore
+    # errors from the above "$doit $cpprog $src $dsttmp" command.
+    #
+    { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } &&
+    { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } &&
+    { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } &&
+    { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
+
+    # If -C, don't bother to copy if it wouldn't change the file.
+    if $copy_on_change &&
+       old=`LC_ALL=C ls -dlL "$dst"	2>/dev/null` &&
+       new=`LC_ALL=C ls -dlL "$dsttmp"	2>/dev/null` &&
+
+       eval "$initialize_posix_glob" &&
+       $posix_glob set -f &&
+       set X $old && old=:$2:$4:$5:$6 &&
+       set X $new && new=:$2:$4:$5:$6 &&
+       $posix_glob set +f &&
+
+       test "$old" = "$new" &&
+       $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
+    then
+      rm -f "$dsttmp"
+    else
+      # Rename the file to the real destination.
+      $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null ||
+
+      # The rename failed, perhaps because mv can't rename something else
+      # to itself, or perhaps because mv is so ancient that it does not
+      # support -f.
+      {
+	# Now remove or move aside any old file at destination location.
+	# We try this two ways since rm can't unlink itself on some
+	# systems and the destination file might be busy for other
+	# reasons.  In this case, the final cleanup might fail but the new
+	# file should still install successfully.
+	{
+	  test ! -f "$dst" ||
+	  $doit $rmcmd -f "$dst" 2>/dev/null ||
+	  { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
+	    { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
+	  } ||
+	  { echo "$0: cannot unlink or rename $dst" >&2
+	    (exit 1); exit 1
+	  }
+	} &&
+
+	# Now rename the file to the real destination.
+	$doit $mvcmd "$dsttmp" "$dst"
+      }
+    fi || exit 1
+
+    trap '' 0
+  fi
+done
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CPPFLAGS = -I$(top_srcdir)/simd
+noinst_LTLIBRARIES = libkernel.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = ifftw.h cycle.h
+
+libkernel_la_SOURCES = align.c alloc.c assert.c awake.c buffered.c	\
+cpy1d.c cpy2d-pair.c cpy2d.c ct.c debug.c extract-reim.c hash.c iabs.c	\
+kalloc.c md5-1.c md5.c minmax.c ops.c pickdim.c plan.c planner.c	\
+primes.c print.c problem.c rader.c scan.c solver.c solvtab.c stride.c	\
+tensor.c tensor1.c tensor2.c tensor3.c tensor4.c tensor5.c tensor7.c	\
+tensor8.c tensor9.c tile2d.c timer.c transpose.c trig.c twiddle.c	\
+cycle.h ifftw.h
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,592 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = kernel
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libkernel_la_LIBADD =
+am_libkernel_la_OBJECTS = align.lo alloc.lo assert.lo awake.lo \
+	buffered.lo cpy1d.lo cpy2d-pair.lo cpy2d.lo ct.lo debug.lo \
+	extract-reim.lo hash.lo iabs.lo kalloc.lo md5-1.lo md5.lo \
+	minmax.lo ops.lo pickdim.lo plan.lo planner.lo primes.lo \
+	print.lo problem.lo rader.lo scan.lo solver.lo solvtab.lo \
+	stride.lo tensor.lo tensor1.lo tensor2.lo tensor3.lo \
+	tensor4.lo tensor5.lo tensor7.lo tensor8.lo tensor9.lo \
+	tile2d.lo timer.lo transpose.lo trig.lo twiddle.lo
+libkernel_la_OBJECTS = $(am_libkernel_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libkernel_la_SOURCES)
+DIST_SOURCES = $(libkernel_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/simd
+noinst_LTLIBRARIES = libkernel.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = ifftw.h cycle.h
+libkernel_la_SOURCES = align.c alloc.c assert.c awake.c buffered.c	\
+cpy1d.c cpy2d-pair.c cpy2d.c ct.c debug.c extract-reim.c hash.c iabs.c	\
+kalloc.c md5-1.c md5.c minmax.c ops.c pickdim.c plan.c planner.c	\
+primes.c print.c problem.c rader.c scan.c solver.c solvtab.c stride.c	\
+tensor.c tensor1.c tensor2.c tensor3.c tensor4.c tensor5.c tensor7.c	\
+tensor8.c tensor9.c tile2d.c timer.c transpose.c trig.c twiddle.c	\
+cycle.h ifftw.h
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu kernel/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu kernel/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libkernel.la: $(libkernel_la_OBJECTS) $(libkernel_la_DEPENDENCIES) $(EXTRA_libkernel_la_DEPENDENCIES) 
+	$(LINK)  $(libkernel_la_OBJECTS) $(libkernel_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/align.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/assert.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/awake.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cpy1d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cpy2d-pair.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cpy2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debug.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/extract-reim.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hash.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iabs.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kalloc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md5-1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/minmax.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ops.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pickdim.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/planner.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/primes.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/print.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solver.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solvtab.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stride.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tile2d.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/timer.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/trig.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twiddle.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/align.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/align.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#if HAVE_SIMD
+#  define ALGN 16
+#else
+   /* disable the alignment machinery, because it will break,
+      e.g., if sizeof(R) == 12 (as in long-double/x86) */
+#  define ALGN 0
+#endif
+
+/* NONPORTABLE */
+int X(alignment_of)(R *p)
+{
+#if ALGN == 0
+     UNUSED(p);
+     return 0;
+#else
+     return (int)(((uintptr_t) p) % ALGN);
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/alloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/alloc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/**********************************************************
+ *   DEBUGGING CODE
+ **********************************************************/
+#if defined(FFTW_DEBUG_MALLOC)
+
+#include <stdio.h>
+
+/*
+  debugging malloc/free. 
+ 
+  1) Initialize every malloced and freed area to random values, just
+  to make sure we are not using uninitialized pointers.
+ 
+  2) check for blocks freed twice.
+ 
+  3) Check for writes past the ends of allocated blocks
+ 
+  4) destroy contents of freed blocks in order to detect incorrect reuse.
+ 
+  5) keep track of who allocates what and report memory leaks
+ 
+  This code is a quick and dirty hack.  May be nonportable. 
+  Use at your own risk.
+ 
+*/
+
+#define MAGIC ((size_t)0xABadCafe)
+#define PAD_FACTOR 2
+#define SZ_HEADER (4 * sizeof(size_t))
+#define HASHSZ 1031
+
+static unsigned int hashaddr(void *p)
+{
+     return ((unsigned long)p) % HASHSZ;
+}
+
+struct mstat {
+     int siz;
+     int maxsiz;
+     int cnt;
+     int maxcnt;
+};
+
+static struct mstat mstat[MALLOC_WHAT_LAST];
+
+struct minfo {
+     const char *file;
+     int line;
+     size_t n;
+     void *p;
+     struct minfo *next;
+};
+
+static struct minfo *minfo[HASHSZ] = {0};
+
+#if defined(HAVE_THREADS) || defined(HAVE_OPENMP)
+int X(in_thread) = 0;
+#endif
+
+void *X(malloc_debug)(size_t n, enum malloc_tag what,
+                      const char *file, int line)
+{
+     char *p;
+     size_t i;
+     struct minfo *info;
+     struct mstat *stat = mstat + what;
+     struct mstat *estat = mstat + EVERYTHING;
+
+     if (n == 0)
+          n = 1;
+
+     if (!IN_THREAD) {
+	  stat->siz += n;
+	  if (stat->siz > stat->maxsiz)
+	       stat->maxsiz = stat->siz;
+	  estat->siz += n;
+	  if (estat->siz > estat->maxsiz)
+	       estat->maxsiz = estat->siz;
+     }
+
+     p = (char *) X(kernel_malloc)(PAD_FACTOR * n + SZ_HEADER);
+     A(p);
+
+     /* store the sz in a known position */
+     ((size_t *) p)[0] = n;
+     ((size_t *) p)[1] = MAGIC;
+     ((size_t *) p)[2] = what;
+
+     /* fill with junk */
+     for (i = 0; i < PAD_FACTOR * n; i++)
+          p[i + SZ_HEADER] = (char) (i ^ 0xEF);
+
+     if (!IN_THREAD) {
+	  ++stat->cnt;
+	  ++estat->cnt;
+	  
+	  if (stat->cnt > stat->maxcnt)
+	       stat->maxcnt = stat->cnt;
+	  if (estat->cnt > estat->maxcnt)
+	       estat->maxcnt = estat->cnt;
+     }
+
+     /* skip the info we stored previously */
+     p = p + SZ_HEADER;
+
+     if (!IN_THREAD) {
+	  unsigned int h = hashaddr(p);
+	  /* record allocation in allocation list */
+	  info = (struct minfo *) malloc(sizeof(struct minfo));
+	  info->n = n;
+	  info->file = file;
+	  info->line = line;
+	  info->p = p;
+	  info->next = minfo[h];
+	  minfo[h] = info;
+     }
+
+     return (void *) p;
+}
+
+void X(ifree)(void *p)
+{
+     char *q;
+
+     A(p);
+
+     q = ((char *) p) - SZ_HEADER;
+     A(q);
+
+     {
+          size_t n = ((size_t *) q)[0];
+          size_t magic = ((size_t *) q)[1];
+          int what = ((size_t *) q)[2];
+          size_t i;
+          struct mstat *stat = mstat + what;
+          struct mstat *estat = mstat + EVERYTHING;
+
+          /* set to zero to detect duplicate free's */
+          ((size_t *) q)[0] = 0;
+
+          A(magic == MAGIC);
+          ((size_t *) q)[1] = ~MAGIC;
+
+	  if (!IN_THREAD) {
+	       stat->siz -= n;
+	       A(stat->siz >= 0);
+	       estat->siz -= n;
+	       A(estat->siz >= 0);
+	  }
+
+          /* check for writing past end of array: */
+          for (i = n; i < PAD_FACTOR * n; ++i)
+               if (q[i + SZ_HEADER] != (char) (i ^ 0xEF)) {
+                    A(0 /* array bounds overwritten */ );
+               }
+          for (i = 0; i < PAD_FACTOR * n; ++i)
+               q[i + SZ_HEADER] = (char) (i ^ 0xAD);
+
+	  if (!IN_THREAD) {
+	       --stat->cnt;
+	       --estat->cnt;
+	       
+	       A(stat->cnt >= 0);
+	       A((stat->cnt == 0 && stat->siz == 0) ||
+		 (stat->cnt > 0 && stat->siz > 0));
+	       A(estat->cnt >= 0);
+	       A((estat->cnt == 0 && estat->siz == 0) ||
+		 (estat->cnt > 0 && estat->siz > 0));
+	  }
+
+          X(kernel_free)(q);
+     }
+
+     if (!IN_THREAD) {
+          /* delete minfo entry */
+	  unsigned int h = hashaddr(p);
+          struct minfo **i;
+
+          for (i = minfo + h; *i; i = &((*i)->next)) {
+               if ((*i)->p == p) {
+                    struct minfo *i0 = (*i)->next;
+                    free(*i);
+                    *i = i0;
+                    return;
+               }
+          }
+
+          A(0 /* no entry in minfo list */ );
+     }
+}
+
+void X(malloc_print_minfo)(int verbose)
+{
+     struct minfo *info;
+     int what;
+     unsigned int h;
+     int leak = 0;
+
+     if (verbose > 2) {
+	  static const char *names[MALLOC_WHAT_LAST] = {
+	       "EVERYTHING",
+	       "PLANS", "SOLVERS", "PROBLEMS", "BUFFERS",
+	       "HASHT", "TENSORS", "PLANNERS", "SLVDSC", "TWIDDLES",
+	       "STRIDES", "OTHER"
+	  };
+
+	  printf("%12s %8s %8s %10s %10s\n",
+		 "what", "cnt", "maxcnt", "siz", "maxsiz");
+
+	  for (what = 0; what < MALLOC_WHAT_LAST; ++what) {
+	       struct mstat *stat = mstat + what;
+	       printf("%12s %8d %8d %10d %10d\n",
+		      names[what], stat->cnt, stat->maxcnt,
+		      stat->siz, stat->maxsiz);
+	  }
+     }
+
+     for (h = 0; h < HASHSZ; ++h) 
+	  if (minfo[h]) {
+	       printf("\nUnfreed allocations:\n");
+	       break;
+	  }
+
+     for (h = 0; h < HASHSZ; ++h) 
+	  for (info = minfo[h]; info; info = info->next) {
+	       leak = 1;
+	       printf("%s:%d:  %zd bytes at %p\n",
+		      info->file, info->line, info->n, info->p);
+	  }
+
+     if (leak)
+	  abort();
+}
+
+#else
+/**********************************************************
+ *   NON DEBUGGING CODE
+ **********************************************************/
+/* production version, no hacks */
+
+void *X(malloc_plain)(size_t n)
+{
+     void *p;
+     if (n == 0)
+          n = 1;
+     p = X(kernel_malloc)(n);
+     CK(p);
+
+#ifdef MIN_ALIGNMENT
+     A((((uintptr_t)p) % MIN_ALIGNMENT) == 0);
+#endif
+
+     return p;
+}
+
+void X(ifree)(void *p)
+{
+     X(kernel_free)(p);
+}
+
+#endif
+
+void X(ifree0)(void *p)
+{
+     /* common pattern */
+     if (p) X(ifree)(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/assert.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/assert.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void X(assertion_failed)(const char *s, int line, const char *file)
+{
+     fflush(stdout);
+     fprintf(stderr, "fftw: %s:%d: assertion failed: %s\n", file, line, s);
+#ifdef HAVE_ABORT
+     abort();
+#else
+     exit(EXIT_FAILURE);
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/awake.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/awake.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+void X(null_awake)(plan *ego, enum wakefulness wakefulness)
+{
+     UNUSED(ego);
+     UNUSED(wakefulness);
+     /* do nothing */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/buffered.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/buffered.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* routines shared by the various buffered solvers */
+
+#include "ifftw.h"
+
+#define DEFAULT_MAXNBUF ((INT)256)
+
+/* approx. 512KB of buffers for complex data */
+#define MAXBUFSZ (256 * 1024 / (INT)(sizeof(R)))
+
+INT X(nbuf)(INT n, INT vl, INT maxnbuf)
+{
+     INT i, nbuf, lb; 
+
+     if (!maxnbuf) 
+	  maxnbuf = DEFAULT_MAXNBUF;
+
+     nbuf = X(imin)(maxnbuf,
+		    X(imin)(vl, X(imax)((INT)1, MAXBUFSZ / n)));
+
+     /*
+      * Look for a buffer number (not too small) that divides the
+      * vector length, in order that we only need one child plan:
+      */
+     lb = X(imax)(1, nbuf / 4);
+     for (i = nbuf; i >= lb; --i)
+          if (vl % i == 0)
+               return i;
+
+     /* whatever... */
+     return nbuf;
+}
+
+#define SKEW 6 /* need to be even for SIMD */
+#define SKEWMOD 8 
+
+INT X(bufdist)(INT n, INT vl)
+{
+     if (vl == 1)
+	  return n;
+     else 
+	  /* return smallest X such that X >= N and X == SKEW (mod SKEWMOD) */
+	  return n + X(modulo)(SKEW - n, SKEWMOD);
+}
+
+int X(toobig)(INT n)
+{
+     return n > MAXBUFSZ;
+}
+
+/* TRUE if there exists i < which such that maxnbuf[i] and
+   maxnbuf[which] yield the same value, in which case we canonicalize
+   on the minimum value */
+int X(nbuf_redundant)(INT n, INT vl, int which, 
+		      const INT *maxnbuf, int nmaxnbuf)
+{
+     int i;
+     (void)nmaxnbuf; /* UNUSED */
+     for (i = 0; i < which; ++i)
+	  if (X(nbuf)(n, vl, maxnbuf[i]) == X(nbuf)(n, vl, maxnbuf[which]))
+	       return 1;
+     return 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/cpy1d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/cpy1d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* out of place 1D copy routine */
+#include "ifftw.h"
+
+void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl)
+{
+     INT i0, v;
+
+     A(I != O);
+     switch (vl) {
+	 case 1:
+	      if ((n0 & 1) || is0 != 1 || os0 != 1) {
+		   for (; n0 > 0; --n0, I += is0, O += os0)
+			*O = *I;
+		   break;
+	      }
+	      n0 /= 2; is0 = 2; os0 = 2;
+	      /* fall through */
+	 case 2:
+	      if ((n0 & 1) || is0 != 2 || os0 != 2) {
+		   for (; n0 > 0; --n0, I += is0, O += os0) {
+			R x0 = I[0];
+			R x1 = I[1];
+			O[0] = x0;
+			O[1] = x1;
+		   }
+		   break;
+	      }
+	      n0 /= 2; is0 = 4; os0 = 4;
+	      /* fall through */
+	 case 4:
+	      for (; n0 > 0; --n0, I += is0, O += os0) {
+		   R x0 = I[0];
+		   R x1 = I[1];
+		   R x2 = I[2];
+		   R x3 = I[3];
+		   O[0] = x0;
+		   O[1] = x1;
+		   O[2] = x2;
+		   O[3] = x3;
+	      }
+	      break;
+	 default:
+	      for (i0 = 0; i0 < n0; ++i0)
+		   for (v = 0; v < vl; ++v) {
+			R x0 = I[i0 * is0 + v];
+			O[i0 * os0 + v] = x0;
+		   }
+	      break;
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/cpy2d-pair.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/cpy2d-pair.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* out of place copy routines for pairs of isomorphic 2D arrays */
+#include "ifftw.h"
+
+void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
+		   INT n0, INT is0, INT os0,
+		   INT n1, INT is1, INT os1)
+{
+     INT i0, i1;
+
+     for (i1 = 0; i1 < n1; ++i1)
+	  for (i0 = 0; i0 < n0; ++i0) {
+	       R x0 = I0[i0 * is0 + i1 * is1];
+	       R x1 = I1[i0 * is0 + i1 * is1];
+	       O0[i0 * os0 + i1 * os1] = x0;
+	       O1[i0 * os0 + i1 * os1] = x1;
+	  }
+}
+
+/* like cpy2d_pair, but read input contiguously if possible */
+void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
+		      INT n0, INT is0, INT os0,
+		      INT n1, INT is1, INT os1)
+{
+     if (IABS(is0) < IABS(is1))	/* inner loop is for n0 */
+	  X(cpy2d_pair) (I0, I1, O0, O1, n0, is0, os0, n1, is1, os1);
+     else
+	  X(cpy2d_pair) (I0, I1, O0, O1, n1, is1, os1, n0, is0, os0);
+}
+
+/* like cpy2d_pair, but write output contiguously if possible */
+void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
+		      INT n0, INT is0, INT os0,
+		      INT n1, INT is1, INT os1)
+{
+     if (IABS(os0) < IABS(os1))	/* inner loop is for n0 */
+	  X(cpy2d_pair) (I0, I1, O0, O1, n0, is0, os0, n1, is1, os1);
+     else
+	  X(cpy2d_pair) (I0, I1, O0, O1, n1, is1, os1, n0, is0, os0);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/cpy2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/cpy2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* out of place 2D copy routines */
+#include "ifftw.h"
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
+#  ifdef HAVE_XMMINTRIN_H
+#    include <xmmintrin.h>
+#    define WIDE_TYPE __m128
+#  endif
+#endif
+
+#ifndef WIDE_TYPE
+/* fall back to double, which means that WIDE_TYPE will be unused */
+#  define WIDE_TYPE double
+#endif
+
+void X(cpy2d)(R *I, R *O,
+	      INT n0, INT is0, INT os0,
+	      INT n1, INT is1, INT os1,
+	      INT vl)
+{
+     INT i0, i1, v;
+
+     switch (vl) {
+	 case 1:
+	      for (i1 = 0; i1 < n1; ++i1)
+		   for (i0 = 0; i0 < n0; ++i0) {
+			R x0 = I[i0 * is0 + i1 * is1];
+			O[i0 * os0 + i1 * os1] = x0;
+		   }
+	      break;
+	 case 2:
+	      if (1
+		  && (2 * sizeof(R) == sizeof(WIDE_TYPE))
+		  && (sizeof(WIDE_TYPE) > sizeof(double))
+		  && (((size_t)I) % sizeof(WIDE_TYPE) == 0)
+		  && (((size_t)O) % sizeof(WIDE_TYPE) == 0)
+		  && ((is0 & 1) == 0)
+		  && ((is1 & 1) == 0)
+		  && ((os0 & 1) == 0)
+		  && ((os1 & 1) == 0)) {
+		   /* copy R[2] as WIDE_TYPE if WIDE_TYPE is large
+		      enough to hold R[2], and if the input is
+		      properly aligned.  This is a win when R==double
+		      and WIDE_TYPE is 128 bits. */
+		   for (i1 = 0; i1 < n1; ++i1)
+			for (i0 = 0; i0 < n0; ++i0) {
+			     *(WIDE_TYPE *)&O[i0 * os0 + i1 * os1] =
+				  *(WIDE_TYPE *)&I[i0 * is0 + i1 * is1];
+			}
+	      } else if (1
+		  && (2 * sizeof(R) == sizeof(double))
+		  && (((size_t)I) % sizeof(double) == 0)
+		  && (((size_t)O) % sizeof(double) == 0)
+		  && ((is0 & 1) == 0)
+		  && ((is1 & 1) == 0)
+		  && ((os0 & 1) == 0)
+		  && ((os1 & 1) == 0)) {
+		   /* copy R[2] as double if double is large enough to
+		      hold R[2], and if the input is properly aligned.
+		      This case applies when R==float */
+		   for (i1 = 0; i1 < n1; ++i1)
+			for (i0 = 0; i0 < n0; ++i0) {
+			     *(double *)&O[i0 * os0 + i1 * os1] =
+				  *(double *)&I[i0 * is0 + i1 * is1];
+			}
+	      } else {
+		   for (i1 = 0; i1 < n1; ++i1)
+			for (i0 = 0; i0 < n0; ++i0) {
+			     R x0 = I[i0 * is0 + i1 * is1];
+			     R x1 = I[i0 * is0 + i1 * is1 + 1];
+			     O[i0 * os0 + i1 * os1] = x0;
+ 			     O[i0 * os0 + i1 * os1 + 1] = x1;
+			}
+	      }
+	      break;
+	 default:
+	      for (i1 = 0; i1 < n1; ++i1)
+		   for (i0 = 0; i0 < n0; ++i0)
+			for (v = 0; v < vl; ++v) {
+			     R x0 = I[i0 * is0 + i1 * is1 + v];
+			     O[i0 * os0 + i1 * os1 + v] = x0;
+			}
+	      break;
+     }
+}
+
+/* like cpy2d, but read input contiguously if possible */
+void X(cpy2d_ci)(R *I, R *O,
+		 INT n0, INT is0, INT os0,
+		 INT n1, INT is1, INT os1,
+		 INT vl)
+{
+     if (IABS(is0) < IABS(is1))	/* inner loop is for n0 */
+	  X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
+     else
+	  X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
+}
+
+/* like cpy2d, but write output contiguously if possible */
+void X(cpy2d_co)(R *I, R *O,
+		 INT n0, INT is0, INT os0,
+		 INT n1, INT is1, INT os1,
+		 INT vl)
+{
+     if (IABS(os0) < IABS(os1))	/* inner loop is for n0 */
+	  X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
+     else
+	  X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
+}
+
+
+/* tiled copy routines */
+struct cpy2d_closure {
+     R *I, *O;
+     INT is0, os0, is1, os1, vl;
+     R *buf;
+};
+
+static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
+{
+     struct cpy2d_closure *k = (struct cpy2d_closure *)args;
+     X(cpy2d)(k->I + n0l * k->is0 + n1l * k->is1,
+	      k->O + n0l * k->os0 + n1l * k->os1,
+	      n0u - n0l, k->is0, k->os0,
+	      n1u - n1l, k->is1, k->os1,
+	      k->vl);
+}
+
+static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
+{
+     struct cpy2d_closure *k = (struct cpy2d_closure *)args;
+
+     /* copy from I to buf */
+     X(cpy2d_ci)(k->I + n0l * k->is0 + n1l * k->is1,
+		 k->buf,
+		 n0u - n0l, k->is0, k->vl,
+		 n1u - n1l, k->is1, k->vl * (n0u - n0l),
+		 k->vl);
+
+     /* copy from buf to O */
+     X(cpy2d_co)(k->buf,
+		 k->O + n0l * k->os0 + n1l * k->os1,
+		 n0u - n0l, k->vl, k->os0,
+		 n1u - n1l, k->vl * (n0u - n0l), k->os1,
+		 k->vl);
+}
+
+
+void X(cpy2d_tiled)(R *I, R *O,
+		    INT n0, INT is0, INT os0,
+		    INT n1, INT is1, INT os1, INT vl)
+{
+     INT tilesz = X(compute_tilesz)(vl,
+				    1 /* input array */
+				    + 1 /* ouput array */);
+     struct cpy2d_closure k;
+     k.I = I;
+     k.O = O;
+     k.is0 = is0;
+     k.os0 = os0;
+     k.is1 = is1;
+     k.os1 = os1;
+     k.vl = vl;
+     k.buf = 0; /* unused */
+     X(tile2d)(0, n0, 0, n1, tilesz, dotile, &k);
+}
+
+void X(cpy2d_tiledbuf)(R *I, R *O,
+		       INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1, INT vl)
+{
+     R buf[CACHESIZE / (2 * sizeof(R))];
+     /* input and buffer in cache, or
+	output and buffer in cache */
+     INT tilesz = X(compute_tilesz)(vl, 2);
+     struct cpy2d_closure k;
+     k.I = I;
+     k.O = O;
+     k.is0 = is0;
+     k.os0 = os0;
+     k.is1 = is1;
+     k.os1 = os1;
+     k.vl = vl;
+     k.buf = buf;
+     A(tilesz * tilesz * vl * sizeof(R) <= sizeof(buf));
+     X(tile2d)(0, n0, 0, n1, tilesz, dotile_buf, &k);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/ct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/ct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* common routines for Cooley-Tukey algorithms */
+
+#include "ifftw.h"
+
+#define POW2P(n) (((n) > 0) && (((n) & ((n) - 1)) == 0))
+
+/* TRUE if radix-r is ugly for size n */
+int X(ct_uglyp)(INT min_n, INT v, INT n, INT r)
+{
+     return (n <= min_n) || (POW2P(n) && (v * (n / r)) <= 4);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/cycle.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/cycle.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to gettick() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+static __inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
+	     ((double)t1.tb_low - (double)t0.tb_low));
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+typedef uint64_t ticks;
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned a, d; 
+     asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+static ticks getticks(void)
+{
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+static inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+static inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+static inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+static inline double elapsed(ticks t1, ticks t0)
+{
+     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+#if HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+static inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return NULL;
+    }
+    addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    close(fd);
+    if (addr == (uint64_t *)-1) {
+      perror("mmap");
+      return NULL;
+    }
+  }
+
+  return *addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/debug.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/debug.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+#ifdef FFTW_DEBUG
+#include <stdio.h>
+
+typedef struct {
+     printer super;
+     FILE *f;
+} P_file;
+
+static void putchr_file(printer *p_, char c)
+{
+     P_file *p = (P_file *) p_;
+     fputc(c, p->f);
+}
+
+static printer *mkprinter_file(FILE *f)
+{
+     P_file *p = (P_file *) X(mkprinter)(sizeof(P_file), putchr_file, 0);
+     p->f = f;
+     return &p->super;
+}
+
+void X(debug)(const char *format, ...)
+{
+     va_list ap;
+     printer *p = mkprinter_file(stderr);
+     va_start(ap, format);
+     p->vprint(p, format, ap);
+     va_end(ap);
+     X(printer_destroy)(p);
+}
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/extract-reim.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/extract-reim.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/* decompose complex pointer into real and imaginary parts.
+   Flip real and imaginary if there the sign does not match
+   FFTW's idea of what the sign should be */
+
+void X(extract_reim)(int sign, R *c, R **r, R **i)
+{
+     if (sign == FFT_SIGN) {
+          *r = c + 0;
+          *i = c + 1;
+     } else {
+          *r = c + 1;
+          *i = c + 0;
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/hash.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/hash.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+unsigned X(hash)(const char *s)
+{
+     unsigned h = 0xDEADBEEFu;
+     do {
+	  h = h * 17 + (int)*s;
+     } while (*s++);
+     return h;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/iabs.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/iabs.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+INT X(iabs)(INT a)
+{
+     return a < 0 ? (0 - a) : a;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/ifftw.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/ifftw.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* FFTW internal header file */
+#ifndef __IFFTW_H__
+#define __IFFTW_H__
+
+#include "config.h"
+
+#include <stdlib.h>		/* size_t */
+#include <stdarg.h>		/* va_list */
+#include <stddef.h>             /* ptrdiff_t */
+
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+
+#if HAVE_STDINT_H
+# include <stdint.h>             /* uintptr_t, maybe */
+#endif
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>           /* uintptr_t, maybe */
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* Windows annoyances -- since tests/hook.c uses some internal
+   FFTW functions, we need to given them the dllexport attribute
+   under Windows when compiling as a DLL (see api/fftw3.h). */
+#if defined(FFTW_EXTERN)
+#  define IFFTW_EXTERN FFTW_EXTERN
+#elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
+ && (defined(_WIN32) || defined(__WIN32__))
+#  define IFFTW_EXTERN extern __declspec(dllexport)
+#else
+#  define IFFTW_EXTERN extern
+#endif
+
+/* determine precision and name-mangling scheme */
+#define CONCAT(prefix, name) prefix ## name
+#if defined(FFTW_SINGLE)
+  typedef float R;
+# define X(name) CONCAT(fftwf_, name)
+#elif defined(FFTW_LDOUBLE)
+  typedef long double R;
+# define X(name) CONCAT(fftwl_, name)
+# define TRIGREAL_IS_LONG_DOUBLE
+#elif defined(FFTW_QUAD)
+  typedef __float128 R;
+# define X(name) CONCAT(fftwq_, name)
+# define TRIGREAL_IS_QUAD
+#else
+  typedef double R;
+# define X(name) CONCAT(fftw_, name)
+#endif
+
+/*
+  integral type large enough to contain a stride (what ``int'' should
+  have been in the first place.
+*/
+typedef ptrdiff_t INT;
+
+/* dummy use of unused parameters to silence compiler warnings */
+#define UNUSED(x) (void)x
+
+#define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0])))
+
+#define FFT_SIGN (-1)  /* sign convention for forward transforms */
+extern void X(extract_reim)(int sign, R *c, R **r, R **i);
+
+#define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
+
+#define STRINGIZEx(x) #x
+#define STRINGIZE(x) STRINGIZEx(x)
+#define CIMPLIES(ante, post) (!(ante) || (post))
+
+/* define HAVE_SIMD if any simd extensions are supported */
+#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
+     defined(HAVE_MIPS_PS) || defined(HAVE_AVX)
+#define HAVE_SIMD 1
+#else
+#define HAVE_SIMD 0
+#endif
+
+extern int X(have_simd_sse2)(void);
+extern int X(have_simd_avx)(void);
+extern int X(have_simd_altivec)(void);
+extern int X(have_simd_neon)(void);
+
+/* forward declarations */
+typedef struct problem_s problem;
+typedef struct plan_s plan;
+typedef struct solver_s solver;
+typedef struct planner_s planner;
+typedef struct printer_s printer;
+typedef struct scanner_s scanner;
+
+/*-----------------------------------------------------------------------*/
+/* alloca: */
+#if HAVE_SIMD
+#  ifdef HAVE_AVX
+#    define MIN_ALIGNMENT 32  /* best alignment for AVX, conservative for
+			       * everything else */
+#  else
+     /* Note that we cannot use 32-byte alignment for all SIMD.  For
+	example, MacOS X malloc is 16-byte aligned, but there was no
+	posix_memalign in MacOS X until version 10.6. */
+#    define MIN_ALIGNMENT 16
+#  endif
+#endif
+
+#if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
+   /* use alloca if available */
+
+#ifndef alloca
+#ifdef __GNUC__
+# define alloca __builtin_alloca
+#else
+# ifdef _MSC_VER
+#  include <malloc.h>
+#  define alloca _alloca
+# else
+#  if HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   ifdef _AIX
+ #pragma alloca
+#   else
+#    ifndef alloca /* predefined by HP cc +Olibcalls */
+void *alloca(size_t);
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+#endif
+
+#  ifdef MIN_ALIGNMENT
+#    define STACK_MALLOC(T, p, n)				\
+     {								\
+         p = (T)alloca((n) + MIN_ALIGNMENT);			\
+         p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) &	\
+               (~(uintptr_t)(MIN_ALIGNMENT - 1)));		\
+     }
+#    define STACK_FREE(n) 
+#  else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
+#    define STACK_MALLOC(T, p, n) p = (T)alloca(n) 
+#    define STACK_FREE(n) 
+#  endif
+
+#else /* ! HAVE_ALLOCA */
+   /* use malloc instead of alloca */
+#  define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
+#  define STACK_FREE(n) X(ifree)(n)
+#endif /* ! HAVE_ALLOCA */
+
+/* allocation of buffers.  If these grow too large use malloc(), else
+   use STACK_MALLOC (hopefully reducing to alloca()). */
+
+/* 64KiB ought to be enough for anybody */
+#define MAX_STACK_ALLOC ((size_t)64 * 1024)
+
+#define BUF_ALLOC(T, p, n)			\
+{						\
+     if (n < MAX_STACK_ALLOC) {			\
+	  STACK_MALLOC(T, p, n);		\
+     } else {					\
+	  p = (T)MALLOC(n, BUFFERS);		\
+     }						\
+}
+
+#define BUF_FREE(p, n)				\
+{						\
+     if (n < MAX_STACK_ALLOC) {			\
+	  STACK_FREE(p);			\
+     } else {					\
+	  X(ifree)(p);				\
+     }						\
+}
+
+/*-----------------------------------------------------------------------*/
+/* define uintptr_t if it is not already defined */
+
+#ifndef HAVE_UINTPTR_T
+#  if SIZEOF_VOID_P == 0
+#    error sizeof void* is unknown!
+#  elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
+     typedef unsigned int uintptr_t;
+#  elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
+     typedef unsigned long uintptr_t;
+#  elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
+     typedef unsigned long long uintptr_t;
+#  else
+#    error no unsigned integer type matches void* sizeof!
+#  endif
+#endif
+
+/*-----------------------------------------------------------------------*/
+/* We can do an optimization for copying pairs of (aligned) floats
+   when in single precision if 2*float = double. */
+
+#define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
+                           && SIZEOF_FLOAT != 0 \
+                           && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
+
+#define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
+
+/*-----------------------------------------------------------------------*/
+/* assert.c: */
+IFFTW_EXTERN void X(assertion_failed)(const char *s, 
+				      int line, const char *file);
+
+/* always check */
+#define CK(ex)						 \
+      (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
+
+#ifdef FFTW_DEBUG
+/* check only if debug enabled */
+#define A(ex)						 \
+      (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
+#else
+#define A(ex) /* nothing */
+#endif
+
+extern void X(debug)(const char *format, ...);
+#define D X(debug)
+
+/*-----------------------------------------------------------------------*/
+/* kalloc.c: */
+extern void *X(kernel_malloc)(size_t n);
+extern void X(kernel_free)(void *p);
+
+/*-----------------------------------------------------------------------*/
+/* alloc.c: */
+
+/* objects allocated by malloc, for statistical purposes */
+enum malloc_tag {
+     EVERYTHING,
+     PLANS,
+     SOLVERS,
+     PROBLEMS,
+     BUFFERS,
+     HASHT,
+     TENSORS,
+     PLANNERS,
+     SLVDESCS,
+     TWIDDLES,
+     STRIDES,
+     OTHER,
+     MALLOC_WHAT_LAST		/* must be last */
+};
+
+IFFTW_EXTERN void X(ifree)(void *ptr);
+extern void X(ifree0)(void *ptr);
+
+#ifdef FFTW_DEBUG_MALLOC
+
+IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
+			     const char *file, int line);
+#define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
+IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
+
+#else /* ! FFTW_DEBUG_MALLOC */
+
+IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
+#define MALLOC(n, what)  X(malloc_plain)(n)
+
+#endif
+
+#if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
+extern int X(in_thread);
+#  define IN_THREAD X(in_thread)
+#  define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
+#  define THREAD_OFF X(in_thread) = in_thread_save; }
+#else
+#  define IN_THREAD 0
+#  define THREAD_ON 
+#  define THREAD_OFF 
+#endif
+
+/*-----------------------------------------------------------------------*/
+/* low-resolution clock */
+
+#ifdef FAKE_CRUDE_TIME
+ typedef int crude_time;
+#else
+# if TIME_WITH_SYS_TIME
+#  include <sys/time.h>
+#  include <time.h>
+# else
+#  if HAVE_SYS_TIME_H
+#   include <sys/time.h>
+#  else
+#   include <time.h>
+#  endif
+# endif
+
+# ifdef HAVE_BSDGETTIMEOFDAY
+# ifndef HAVE_GETTIMEOFDAY
+# define gettimeofday BSDgettimeofday
+# define HAVE_GETTIMEOFDAY 1
+# endif
+# endif
+
+# if defined(HAVE_GETTIMEOFDAY)
+   typedef struct timeval crude_time;
+# else
+   typedef clock_t crude_time;
+# endif
+#endif /* else FAKE_CRUDE_TIME */
+
+crude_time X(get_crude_time)(void);
+double X(elapsed_since)(const planner *plnr, const problem *p,
+			crude_time t0); /* time in seconds since t0 */
+
+/*-----------------------------------------------------------------------*/
+/* ops.c: */
+/*
+ * ops counter.  The total number of additions is add + fma
+ * and the total number of multiplications is mul + fma.
+ * Total flops = add + mul + 2 * fma
+ */
+typedef struct {
+     double add;
+     double mul;
+     double fma;
+     double other;
+} opcnt;
+
+void X(ops_zero)(opcnt *dst);
+void X(ops_other)(INT o, opcnt *dst);
+void X(ops_cpy)(const opcnt *src, opcnt *dst);
+
+void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
+void X(ops_add2)(const opcnt *a, opcnt *dst);
+
+/* dst = m * a + b */
+void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
+
+/* dst += m * a */
+void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
+
+
+/*-----------------------------------------------------------------------*/
+/* minmax.c: */
+INT X(imax)(INT a, INT b);
+INT X(imin)(INT a, INT b);
+
+/*-----------------------------------------------------------------------*/
+/* iabs.c: */
+INT X(iabs)(INT a);
+
+/* inline version */
+#define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
+
+/*-----------------------------------------------------------------------*/
+/* md5.c */
+
+#if SIZEOF_UNSIGNED_INT >= 4
+typedef unsigned int md5uint;
+#else
+typedef unsigned long md5uint; /* at least 32 bits as per C standard */
+#endif
+
+typedef md5uint md5sig[4];
+
+typedef struct {
+     md5sig s; /* state and signature */
+
+     /* fields not meant to be used outside md5.c: */
+     unsigned char c[64]; /* stuff not yet processed */
+     unsigned l;  /* total length.  Should be 64 bits long, but this is
+		     good enough for us */
+} md5;
+
+void X(md5begin)(md5 *p);
+void X(md5putb)(md5 *p, const void *d_, size_t len);
+void X(md5puts)(md5 *p, const char *s);
+void X(md5putc)(md5 *p, unsigned char c);
+void X(md5int)(md5 *p, int i);
+void X(md5INT)(md5 *p, INT i);
+void X(md5unsigned)(md5 *p, unsigned i);
+void X(md5end)(md5 *p);
+
+/*-----------------------------------------------------------------------*/
+/* tensor.c: */
+#define STRUCT_HACK_KR
+#undef STRUCT_HACK_C99
+
+typedef struct {
+     INT n;
+     INT is;			/* input stride */
+     INT os;			/* output stride */
+} iodim;
+
+typedef struct {
+     int rnk;
+#if defined(STRUCT_HACK_KR)
+     iodim dims[1];
+#elif defined(STRUCT_HACK_C99)
+     iodim dims[];
+#else
+     iodim *dims;
+#endif
+} tensor;
+
+/*
+  Definition of rank -infinity.
+  This definition has the property that if you want rank 0 or 1,
+  you can simply test for rank <= 1.  This is a common case.
+ 
+  A tensor of rank -infinity has size 0.
+*/
+#define RNK_MINFTY  ((int)(((unsigned) -1) >> 1))
+#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
+
+typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
+
+tensor *X(mktensor)(int rnk);
+tensor *X(mktensor_0d)(void);
+tensor *X(mktensor_1d)(INT n, INT is, INT os);
+tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1);
+tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2);
+tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2,
+		       INT n3, INT is3, INT os3);
+tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2,
+		       INT n3, INT is3, INT os3,
+		       INT n4, INT is4, INT os4);
+INT X(tensor_sz)(const tensor *sz);
+void X(tensor_md5)(md5 *p, const tensor *t);
+INT X(tensor_max_index)(const tensor *sz);
+INT X(tensor_min_istride)(const tensor *sz);
+INT X(tensor_min_ostride)(const tensor *sz);
+INT X(tensor_min_stride)(const tensor *sz);
+int X(tensor_inplace_strides)(const tensor *sz);
+int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
+int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
+                               inplace_kind k);
+tensor *X(tensor_copy)(const tensor *sz);
+int X(tensor_kosherp)(const tensor *x);
+
+tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
+tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
+tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
+tensor *X(tensor_compress)(const tensor *sz);
+tensor *X(tensor_compress_contiguous)(const tensor *sz);
+tensor *X(tensor_append)(const tensor *a, const tensor *b);
+void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
+int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
+void X(tensor_destroy)(tensor *sz);
+void X(tensor_destroy2)(tensor *a, tensor *b);
+void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
+void X(tensor_print)(const tensor *sz, printer *p);
+int X(dimcmp)(const iodim *a, const iodim *b);
+int X(tensor_equal)(const tensor *a, const tensor *b);
+int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
+
+/*-----------------------------------------------------------------------*/
+/* problem.c: */
+enum { 
+     /* a problem that cannot be solved */
+     PROBLEM_UNSOLVABLE,
+
+     PROBLEM_DFT, 
+     PROBLEM_RDFT,
+     PROBLEM_RDFT2,
+
+     /* for mpi/ subdirectory */
+     PROBLEM_MPI_DFT,
+     PROBLEM_MPI_RDFT,
+     PROBLEM_MPI_RDFT2,
+     PROBLEM_MPI_TRANSPOSE,
+
+     PROBLEM_LAST 
+};
+
+typedef struct {
+     int problem_kind;
+     void (*hash) (const problem *ego, md5 *p);
+     void (*zero) (const problem *ego);
+     void (*print) (const problem *ego, printer *p);
+     void (*destroy) (problem *ego);
+} problem_adt;
+
+struct problem_s {
+     const problem_adt *adt;
+};
+
+problem *X(mkproblem)(size_t sz, const problem_adt *adt);
+void X(problem_destroy)(problem *ego);
+problem *X(mkproblem_unsolvable)(void);
+
+/*-----------------------------------------------------------------------*/
+/* print.c */
+struct printer_s {
+     void (*print)(printer *p, const char *format, ...);
+     void (*vprint)(printer *p, const char *format, va_list ap);
+     void (*putchr)(printer *p, char c);
+     void (*cleanup)(printer *p);
+     int indent;
+     int indent_incr;
+};
+
+printer *X(mkprinter)(size_t size, 
+		      void (*putchr)(printer *p, char c),
+		      void (*cleanup)(printer *p));
+IFFTW_EXTERN void X(printer_destroy)(printer *p);
+
+/*-----------------------------------------------------------------------*/
+/* scan.c */
+struct scanner_s {
+     int (*scan)(scanner *sc, const char *format, ...);
+     int (*vscan)(scanner *sc, const char *format, va_list ap);
+     int (*getchr)(scanner *sc);
+     int ungotc;
+};
+
+scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
+void X(scanner_destroy)(scanner *sc);
+
+/*-----------------------------------------------------------------------*/
+/* plan.c: */
+
+enum wakefulness {
+     SLEEPY,
+     AWAKE_ZERO,
+     AWAKE_SQRTN_TABLE,
+     AWAKE_SINCOS
+};
+
+typedef struct {
+     void (*solve)(const plan *ego, const problem *p);
+     void (*awake)(plan *ego, enum wakefulness wakefulness);
+     void (*print)(const plan *ego, printer *p);
+     void (*destroy)(plan *ego);
+} plan_adt;
+
+struct plan_s {
+     const plan_adt *adt;
+     opcnt ops;
+     double pcost;
+     enum wakefulness wakefulness; /* used for debugging only */
+     int could_prune_now_p;
+};
+
+plan *X(mkplan)(size_t size, const plan_adt *adt);
+void X(plan_destroy_internal)(plan *ego);
+IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
+void X(plan_null_destroy)(plan *ego);
+
+/*-----------------------------------------------------------------------*/
+/* solver.c: */
+typedef struct {
+     int problem_kind;
+     plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
+     void (*destroy)(solver *ego);
+} solver_adt;
+
+struct solver_s {
+     const solver_adt *adt;
+     int refcnt;
+};
+
+solver *X(mksolver)(size_t size, const solver_adt *adt);
+void X(solver_use)(solver *ego);
+void X(solver_destroy)(solver *ego);
+void X(solver_register)(planner *plnr, solver *s);
+
+/* shorthand */
+#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
+
+/*-----------------------------------------------------------------------*/
+/* planner.c */
+
+typedef struct slvdesc_s {
+     solver *slv;
+     const char *reg_nam;
+     unsigned nam_hash;
+     int reg_id;
+     int next_for_same_problem_kind;
+} slvdesc;
+
+typedef struct solution_s solution; /* opaque */
+
+/* interpretation of L and U: 
+
+   - if it returns a plan, the planner guarantees that all applicable
+     plans at least as impatient as U have been tried, and that each
+     plan in the solution is at least as impatient as L.
+   
+   - if it returns 0, the planner guarantees to have tried all solvers
+     at least as impatient as L, and that none of them was applicable.
+
+   The structure is packed to fit into 64 bits.
+*/
+
+typedef struct {
+     unsigned l:20;
+     unsigned hash_info:3;
+#    define BITS_FOR_TIMELIMIT 9
+     unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
+     unsigned u:20;
+     
+     /* abstraction break: we store the solver here to pad the
+	structure to 64 bits.  Otherwise, the struct is padded to 64
+	bits anyway, and another word is allocated for slvndx. */
+#    define BITS_FOR_SLVNDX 12
+     unsigned slvndx:BITS_FOR_SLVNDX;
+} flags_t;
+
+/* impatience flags  */
+enum {
+     BELIEVE_PCOST = 0x0001,
+     ESTIMATE = 0x0002,
+     NO_DFT_R2HC = 0x0004,
+     NO_SLOW = 0x0008,
+     NO_VRECURSE = 0x0010,
+     NO_INDIRECT_OP = 0x0020,
+     NO_LARGE_GENERIC = 0x0040,
+     NO_RANK_SPLITS = 0x0080,
+     NO_VRANK_SPLITS = 0x0100,
+     NO_NONTHREADED = 0x0200,
+     NO_BUFFERING = 0x0400,
+     NO_FIXED_RADIX_LARGE_N = 0x0800,
+     NO_DESTROY_INPUT = 0x1000,
+     NO_SIMD = 0x2000,
+     CONSERVE_MEMORY = 0x4000,
+     NO_DHT_R2HC = 0x8000,
+     NO_UGLY = 0x10000,
+     ALLOW_PRUNING = 0x20000
+};
+
+/* hashtable information */
+enum {
+     BLESSING = 0x1,   /* save this entry */
+     H_VALID = 0x2,    /* valid hastable entry */
+     H_LIVE = 0x4      /* entry is nonempty, implies H_VALID */
+};
+
+#define PLNR_L(plnr) ((plnr)->flags.l)
+#define PLNR_U(plnr) ((plnr)->flags.u)
+#define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
+
+#define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
+#define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
+#define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
+
+#define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
+#define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
+#define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
+#define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
+#define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
+#define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
+#define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
+#define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
+#define NO_FIXED_RADIX_LARGE_NP(plnr) \
+  (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
+#define NO_NONTHREADEDP(plnr) \
+  ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
+
+#define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
+#define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
+#define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
+#define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
+#define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
+
+typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
+
+typedef enum { 
+     /* WISDOM_NORMAL: planner may or may not use wisdom */
+     WISDOM_NORMAL, 
+
+     /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
+     WISDOM_ONLY, 
+
+     /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
+     WISDOM_IS_BOGUS,
+
+     /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
+     WISDOM_IGNORE_INFEASIBLE,
+
+     /* WISDOM_IGNORE_ALL: planner ignores all */
+     WISDOM_IGNORE_ALL
+} wisdom_state_t;
+
+typedef struct {
+     void (*register_solver)(planner *ego, solver *s);
+     plan *(*mkplan)(planner *ego, const problem *p);
+     void (*forget)(planner *ego, amnesia a);
+     void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
+						 word in C++. */
+     int (*imprt)(planner *ego, scanner *sc);
+} planner_adt;
+
+/* hash table of solutions */
+typedef struct {
+     solution *solutions;
+     unsigned hashsiz, nelem;
+
+     /* statistics */
+     int lookup, succ_lookup, lookup_iter;
+     int insert, insert_iter, insert_unknown;
+     int nrehash;
+} hashtab;
+
+typedef enum { COST_SUM, COST_MAX } cost_kind;
+
+struct planner_s {
+     const planner_adt *adt;
+     void (*hook)(struct planner_s *plnr, plan *pln, 
+		  const problem *p, int optimalp);
+     double (*cost_hook)(const problem *p, double t, cost_kind k);
+     int (*wisdom_ok_hook)(const problem *p, flags_t flags);
+     void (*nowisdom_hook)(const problem *p);
+     wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
+
+     /* solver descriptors */
+     slvdesc *slvdescs;
+     unsigned nslvdesc, slvdescsiz;
+     const char *cur_reg_nam;
+     int cur_reg_id;
+     int slvdescs_for_problem_kind[PROBLEM_LAST];
+
+     wisdom_state_t wisdom_state;
+
+     hashtab htab_blessed;
+     hashtab htab_unblessed;
+
+     int nthr;
+     flags_t flags;
+
+     crude_time start_time;
+     double timelimit; /* elapsed_since(start_time) at which to bail out */
+     int timed_out; /* whether most recent search timed out */
+     int need_timeout_check;
+
+     /* various statistics */
+     int nplan;    /* number of plans evaluated */
+     double pcost, epcost; /* total pcost of measured/estimated plans */
+     int nprob;    /* number of problems evaluated */
+};
+
+planner *X(mkplanner)(void);
+void X(planner_destroy)(planner *ego);
+
+/*
+  Iterate over all solvers.   Read:
+ 
+  @article{ baker93iterators,
+  author = "Henry G. Baker, Jr.",
+  title = "Iterators: Signs of Weakness in Object-Oriented Languages",
+  journal = "{ACM} {OOPS} Messenger",
+  volume = "4",
+  number = "3",
+  pages = "18--25"
+  }
+*/
+#define FORALL_SOLVERS(ego, s, p, what)			\
+{							\
+     unsigned _cnt;					\
+     for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) {	\
+	  slvdesc *p = ego->slvdescs + _cnt;		\
+	  solver *s = p->slv;				\
+	  what;						\
+     }							\
+}
+
+#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)		\
+{								\
+     int _cnt = ego->slvdescs_for_problem_kind[kind]; 		\
+     while (_cnt >= 0) {					\
+	  slvdesc *p = ego->slvdescs + _cnt;			\
+	  solver *s = p->slv;					\
+	  what;							\
+	  _cnt = p->next_for_same_problem_kind;			\
+     }								\
+}
+
+
+/* make plan, destroy problem */
+plan *X(mkplan_d)(planner *ego, problem *p);
+plan *X(mkplan_f_d)(planner *ego, problem *p, 
+		    unsigned l_set, unsigned u_set, unsigned u_reset);
+
+/*-----------------------------------------------------------------------*/
+/* stride.c: */
+
+/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
+#if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
+#define PRECOMPUTE_ARRAY_INDICES
+#endif
+
+extern const INT X(an_INT_guaranteed_to_be_zero);
+
+#ifdef PRECOMPUTE_ARRAY_INDICES
+typedef INT *stride;
+#define WS(stride, i)  (stride[i])
+extern stride X(mkstride)(INT n, INT s);
+void X(stride_destroy)(stride p);
+/* hackery to prevent the compiler from copying the strides array
+   onto the stack */
+#define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
+#else
+
+typedef INT stride;
+#define WS(stride, i)  (stride * i)
+#define fftwf_mkstride(n, stride) stride
+#define fftw_mkstride(n, stride) stride
+#define fftwl_mkstride(n, stride) stride
+#define fftwf_stride_destroy(p) ((void) p)
+#define fftw_stride_destroy(p) ((void) p)
+#define fftwl_stride_destroy(p) ((void) p)
+
+/* hackery to prevent the compiler from ``optimizing'' induction
+   variables in codelet loops.  The problem is that for each K and for
+   each expression of the form P[I + STRIDE * K] in a loop, most
+   compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
+   For large values of K this behavior overflows the
+   register set, which is likely worse than doing the index computation
+   in the first place.
+
+   If we guess that there are more than
+   ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
+   the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
+   be 0, but the compiler does not know this. 
+
+   16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
+   seem to imply.
+*/
+#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
+#define MAKE_VOLATILE_STRIDE(nptr, x)                   \
+     (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?     \
+        0 :                                             \
+      ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
+#endif /* PRECOMPUTE_ARRAY_INDICES */
+
+/*-----------------------------------------------------------------------*/
+/* solvtab.c */
+
+struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
+typedef struct solvtab_s solvtab[];
+void X(solvtab_exec)(const solvtab tbl, planner *p);
+#define SOLVTAB(s) { s, STRINGIZE(s) }
+#define SOLVTAB_END { 0, 0 }
+
+/*-----------------------------------------------------------------------*/
+/* pickdim.c */
+int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
+	       const tensor *sz, int oop, int *dp);
+
+/*-----------------------------------------------------------------------*/
+/* twiddle.c */
+/* little language to express twiddle factors computation */
+enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3, 
+       TW_FULL = 4, TW_HALF = 5 };
+
+typedef struct {
+     unsigned char op;
+     signed char v;
+     short i;
+} tw_instr;
+
+typedef struct twid_s {
+     R *W;                     /* array of twiddle factors */
+     INT n, r, m;                /* transform order, radix, # twiddle rows */
+     int refcnt;
+     const tw_instr *instr;
+     struct twid_s *cdr;
+     enum wakefulness wakefulness;
+} twid;
+
+INT X(twiddle_length)(INT r, const tw_instr *p);
+void X(twiddle_awake)(enum wakefulness wakefulness,
+		      twid **pp, const tw_instr *instr, INT n, INT r, INT m);
+
+/*-----------------------------------------------------------------------*/
+/* trig.c */
+#if defined(TRIGREAL_IS_LONG_DOUBLE)
+   typedef long double trigreal;
+#elif defined(TRIGREAL_IS_QUAD)
+   typedef __float128 trigreal;
+#else
+   typedef double trigreal;
+#endif
+
+typedef struct triggen_s triggen;
+
+struct triggen_s {
+     void (*cexp)(triggen *t, INT m, R *result);
+     void (*cexpl)(triggen *t, INT m, trigreal *result);
+     void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
+
+     INT twshft;
+     INT twradix;
+     INT twmsk;
+     trigreal *W0, *W1;
+     INT n;
+};
+
+triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
+void X(triggen_destroy)(triggen *p);
+
+/*-----------------------------------------------------------------------*/
+/* primes.c: */
+
+#define MULMOD(x, y, p) \
+   (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
+
+INT X(safe_mulmod)(INT x, INT y, INT p);
+INT X(power_mod)(INT n, INT m, INT p);
+INT X(find_generator)(INT p);
+INT X(first_divisor)(INT n);
+int X(is_prime)(INT n);
+INT X(next_prime)(INT n);
+int X(factors_into)(INT n, const INT *primes);
+int X(factors_into_small_primes)(INT n);
+INT X(choose_radix)(INT r, INT n);
+INT X(isqrt)(INT n);
+INT X(modulo)(INT a, INT n);
+
+#define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
+
+/* thresholds below which certain solvers are considered SLOW.  These are guesses
+   believed to be conservative */
+#define GENERIC_MAX_SLOW     16
+#define RADER_MAX_SLOW       32
+#define BLUESTEIN_MAX_SLOW   24
+
+/*-----------------------------------------------------------------------*/
+/* rader.c: */
+typedef struct rader_tls rader_tl;
+
+void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
+R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
+void X(rader_tl_delete)(R *W, rader_tl **tl);
+
+/*-----------------------------------------------------------------------*/
+/* copy/transposition routines */
+
+/* lower bound to the cache size, for tiled routines */
+#define CACHESIZE 8192
+
+INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
+
+void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
+	       void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
+	       void *args);
+void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
+void X(cpy2d)(R *I, R *O,
+	      INT n0, INT is0, INT os0,
+	      INT n1, INT is1, INT os1,
+	      INT vl);
+void X(cpy2d_ci)(R *I, R *O,
+		 INT n0, INT is0, INT os0,
+		 INT n1, INT is1, INT os1,
+		 INT vl);
+void X(cpy2d_co)(R *I, R *O,
+		 INT n0, INT is0, INT os0,
+		 INT n1, INT is1, INT os1,
+		 INT vl);
+void X(cpy2d_tiled)(R *I, R *O,
+		    INT n0, INT is0, INT os0,
+		    INT n1, INT is1, INT os1, 
+		    INT vl);
+void X(cpy2d_tiledbuf)(R *I, R *O,
+		       INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1, 
+		       INT vl);
+void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
+		   INT n0, INT is0, INT os0,
+		   INT n1, INT is1, INT os1);
+void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
+		      INT n0, INT is0, INT os0,
+		      INT n1, INT is1, INT os1);
+void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
+		      INT n0, INT is0, INT os0,
+		      INT n1, INT is1, INT os1);
+
+void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
+void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
+void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
+
+typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
+typedef void (*cpy2d_func)(R *I, R *O,
+			   INT n0, INT is0, INT os0,
+			   INT n1, INT is1, INT os1,
+			   INT vl);
+
+/*-----------------------------------------------------------------------*/
+/* misc stuff */
+void X(null_awake)(plan *ego, enum wakefulness wakefulness);
+double X(iestimate_cost)(const planner *, const plan *, const problem *);
+
+#ifdef FFTW_RANDOM_ESTIMATOR
+extern unsigned X(random_estimate_seed);
+#endif
+
+double X(measure_execution_time)(const planner *plnr, 
+				 plan *pln, const problem *p);
+int X(alignment_of)(R *p);
+unsigned X(hash)(const char *s);
+INT X(nbuf)(INT n, INT vl, INT maxnbuf);
+int X(nbuf_redundant)(INT n, INT vl, int which, 
+		      const INT *maxnbuf, int nmaxnbuf);
+INT X(bufdist)(INT n, INT vl);
+int X(toobig)(INT n);
+int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
+
+#if HAVE_SIMD
+R *X(taint)(R *p, INT s);
+R *X(join_taint)(R *p1, R *p2);
+#define TAINT(p, s) X(taint)(p, s)
+#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
+#define TAINTOF(p) (((uintptr_t)(p)) & 3)
+#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
+#else
+#define TAINT(p, s) (p)
+#define UNTAINT(p) (p)
+#define TAINTOF(p) 0
+#define JOIN_TAINT(p1, p2) p1
+#endif
+
+#ifdef FFTW_DEBUG_ALIGNMENT
+#  define ASSERT_ALIGNED_DOUBLE {		\
+     double __foo;				\
+     CK(!(((uintptr_t) &__foo) & 0x7));		\
+}
+#else
+#  define ASSERT_ALIGNED_DOUBLE 
+#endif /* FFTW_DEBUG_ALIGNMENT */
+
+
+
+/*-----------------------------------------------------------------------*/
+/* macros used in codelets to reduce source code size */
+
+typedef R E;  /* internal precision of codelets. */
+
+#if defined(FFTW_LDOUBLE)
+#  define K(x) ((E) x##L)
+#elif defined(FFTW_QUAD)
+#  define K(x) ((E) x##Q)
+#else
+#  define K(x) ((E) x)
+#endif
+#define DK(name, value) const E name = K(value)
+
+/* FMA macros */
+
+#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
+/* The obvious expression a * b + c does not work.  If both x = a * b
+   + c and y = a * b - c appear in the source, gcc computes t = a * b,
+   x = t + c, y = t - c, thus destroying the fma.
+
+   This peculiar coding seems to do the right thing on all of
+   gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3.  It does the right thing
+   on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
+   `x' for the single-assignment form).
+
+   However, gcc-4.0 is a formidable adversary which succeeds in
+   pessimizing two fma's into one multiplication and two additions.
+   It does it very early in the game---before the optimization passes
+   even start.  The only real workaround seems to use fake inline asm
+   such as
+
+     asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
+     return a * b + c;
+     
+   in each of the FMA, FMS, FNMA, and FNMS functions.  However, this
+   does not solve the problem either, because two equal asm statements
+   count as a common subexpression!  One must use *different* fake asm
+   statements:
+
+   in FMA:
+     asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
+
+   in FMS:
+     asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
+
+   etc.
+
+   After these changes, gcc recalcitrantly generates the fma that was
+   in the source to begin with.  However, the extra asm() cruft
+   confuses other passes of gcc, notably the instruction scheduler.
+   (Of course, one could also generate the fma directly via inline
+   asm, but this confuses the scheduler even more.)
+
+   Steven and I have submitted more than one bug report to the gcc
+   mailing list over the past few years, to no effect.  Thus, I give
+   up.  gcc-4.0 can go to hell.  I'll wait at least until gcc-4.3 is
+   out before touching this crap again.
+*/
+static __inline__ E FMA(E a, E b, E c)
+{
+     E x = a * b;
+     x = x + c;
+     return x;
+}
+
+static __inline__ E FMS(E a, E b, E c)
+{
+     E x = a * b;
+     x = x - c;
+     return x;
+}
+
+static __inline__ E FNMA(E a, E b, E c)
+{
+     E x = a * b;
+     x = - (x + c);
+     return x;
+}
+
+static __inline__ E FNMS(E a, E b, E c)
+{
+     E x = a * b;
+     x = - (x - c);
+     return x;
+}
+#else
+#define FMA(a, b, c) (((a) * (b)) + (c))
+#define FMS(a, b, c) (((a) * (b)) - (c))
+#define FNMA(a, b, c) (- (((a) * (b)) + (c)))
+#define FNMS(a, b, c) ((c) - ((a) * (b)))
+#endif
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __IFFTW_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/kalloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/kalloc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+/* ``kernel'' malloc(), with proper memory alignment */
+
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+#  if defined(HAVE_MALLOC_H)
+#    include <malloc.h>
+#  else
+extern void *memalign(size_t, size_t);
+#  endif
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+#if defined(macintosh) /* MacOS 9 */
+#  include <Multiprocessing.h>
+#endif
+
+#define real_free free /* memalign and malloc use ordinary free */
+
+#define IS_POWER_OF_TWO(n) (((n) > 0) && (((n) & ((n) - 1)) == 0))
+#if defined(WITH_OUR_MALLOC) && (MIN_ALIGNMENT >= 8) && IS_POWER_OF_TWO(MIN_ALIGNMENT)
+/* Our own MIN_ALIGNMENT-aligned malloc/free.  Assumes sizeof(void*) is a
+   power of two <= 8 and that malloc is at least sizeof(void*)-aligned.
+
+   The main reason for this routine is that, as of this writing,
+   Windows does not include any aligned allocation routines in its
+   system libraries, and instead provides an implementation with a
+   Visual C++ "Processor Pack" that you have to statically link into
+   your program.  We do not want to require users to have VC++
+   (e.g. gcc/MinGW should be fine).  Our code should be at least as good
+   as the MS _aligned_malloc, in any case, according to second-hand
+   reports of the algorithm it employs (also based on plain malloc). */
+static void *our_malloc(size_t n)
+{
+     void *p0, *p;
+     if (!(p0 = malloc(n + MIN_ALIGNMENT))) return (void *) 0;
+     p = (void *) (((uintptr_t) p0 + MIN_ALIGNMENT) & (~((uintptr_t) (MIN_ALIGNMENT - 1))));
+     *((void **) p - 1) = p0;
+     return p;
+}
+static void our_free(void *p)
+{
+     if (p) free(*((void **) p - 1));
+}
+#endif
+
+void *X(kernel_malloc)(size_t n)
+{
+     void *p;
+
+#if defined(MIN_ALIGNMENT)
+
+#  if defined(WITH_OUR_MALLOC)
+     p = our_malloc(n);
+#    undef real_free
+#    define real_free our_free
+
+#  elif defined(__FreeBSD__) && (MIN_ALIGNMENT <= 16)
+     /* FreeBSD does not have memalign, but its malloc is 16-byte aligned. */
+     p = malloc(n);
+
+#  elif (defined(__MACOSX__) || defined(__APPLE__)) && (MIN_ALIGNMENT <= 16)
+     /* MacOS X malloc is already 16-byte aligned */
+     p = malloc(n);
+
+#  elif defined(HAVE_MEMALIGN)
+     p = memalign(MIN_ALIGNMENT, n);
+
+#  elif defined(HAVE_POSIX_MEMALIGN)
+     /* note: posix_memalign is broken in glibc 2.2.5: it constrains
+	the size, not the alignment, to be (power of two) * sizeof(void*).
+        The bug seems to have been fixed as of glibc 2.3.1. */
+     if (posix_memalign(&p, MIN_ALIGNMENT, n))
+	  p = (void*) 0;
+
+#  elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */
+     p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _mm_free
+
+#  elif defined(_MSC_VER)
+     /* MS Visual C++ 6.0 with a "Processor Pack" supports SIMD
+	and _aligned_malloc/free (uses malloc.h) */
+     p = (void *) _aligned_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _aligned_free
+
+#  elif defined(macintosh) /* MacOS 9 */
+     p = (void *) MPAllocateAligned(n,
+#    if MIN_ALIGNMENT == 8
+				    kMPAllocate8ByteAligned,
+#    elif MIN_ALIGNMENT == 16
+				    kMPAllocate16ByteAligned,
+#    elif MIN_ALIGNMENT == 32
+				    kMPAllocate32ByteAligned,
+#    else
+#      error "Unknown alignment for MPAllocateAligned"
+#    endif
+				    0);
+#    undef real_free
+#    define real_free MPFree
+
+#  else
+     /* Add your machine here and send a patch to fftw@fftw.org 
+        or (e.g. for Windows) configure --with-our-malloc */
+#    error "Don't know how to malloc() aligned memory ... try configuring --with-our-malloc"
+#  endif
+
+#else /* !defined(MIN_ALIGNMENT) */
+     p = malloc(n);
+#endif
+
+     return p;
+}
+
+void X(kernel_free)(void *p)
+{
+     real_free(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/md5-1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/md5-1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+
+void X(md5putb)(md5 *p, const void *d_, size_t len)
+{
+     size_t i;
+     const unsigned char *d = (const unsigned char *)d_;
+     for (i = 0; i < len; ++i)
+	  X(md5putc)(p, d[i]);
+}
+
+void X(md5puts)(md5 *p, const char *s)
+{
+     /* also hash final '\0' */
+     do {
+	  X(md5putc)(p, *s);
+     } while(*s++);
+}
+
+void X(md5int)(md5 *p, int i)
+{
+     X(md5putb)(p, &i, sizeof(i));
+}
+
+void X(md5INT)(md5 *p, INT i)
+{
+     X(md5putb)(p, &i, sizeof(i));
+}
+
+void X(md5unsigned)(md5 *p, unsigned i)
+{
+     X(md5putb)(p, &i, sizeof(i));
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/md5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/md5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* 
+   independent implementation of Ron Rivest's MD5 message-digest
+   algorithm, based on rfc 1321.
+
+   Optimized for small code size, not speed.  Works as long as
+   sizeof(md5uint) >= 4.
+*/
+
+#include "ifftw.h"
+
+/* sintab[i] = 4294967296.0 * abs(sin((double)(i + 1))) */
+static const md5uint sintab[64] = {
+     0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
+     0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+     0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+     0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+     0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
+     0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+     0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+     0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+     0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+     0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+     0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
+     0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+     0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
+     0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+     0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+     0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+}; 
+
+/* see rfc 1321 section 3.4 */
+static const struct roundtab {
+     char k; 
+     char s;
+} roundtab[64] = {
+     {  0,  7}, {  1, 12}, {  2, 17}, {  3, 22},
+     {  4,  7}, {  5, 12}, {  6, 17}, {  7, 22},
+     {  8,  7}, {  9, 12}, { 10, 17}, { 11, 22},
+     { 12,  7}, { 13, 12}, { 14, 17}, { 15, 22},
+     {  1,  5}, {  6,  9}, { 11, 14}, {  0, 20},
+     {  5,  5}, { 10,  9}, { 15, 14}, {  4, 20},
+     {  9,  5}, { 14,  9}, {  3, 14}, {  8, 20},
+     { 13,  5}, {  2,  9}, {  7, 14}, { 12, 20},
+     {  5,  4}, {  8, 11}, { 11, 16}, { 14, 23},
+     {  1,  4}, {  4, 11}, {  7, 16}, { 10, 23},
+     { 13,  4}, {  0, 11}, {  3, 16}, {  6, 23},
+     {  9,  4}, { 12, 11}, { 15, 16}, {  2, 23},
+     {  0,  6}, {  7, 10}, { 14, 15}, {  5, 21},
+     { 12,  6}, {  3, 10}, { 10, 15}, {  1, 21},
+     {  8,  6}, { 15, 10}, {  6, 15}, { 13, 21},
+     {  4,  6}, { 11, 10}, {  2, 15}, {  9, 21}
+};
+
+#define rol(a, s) ((a << (int)(s)) | (a >> (32 - (int)(s))))
+
+static void doblock(md5sig state, const unsigned char *data)
+{
+     md5uint a, b, c, d, t, x[16];
+     const md5uint msk = (md5uint)0xffffffffUL;
+     int i;
+
+     /* encode input bytes into md5uint */
+     for (i = 0; i < 16; ++i) {
+	  const unsigned char *p = data + 4 * i;
+	  x[i] = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
+     }
+
+     a = state[0]; b = state[1]; c = state[2]; d = state[3];
+     for (i = 0; i < 64; ++i) {
+	  const struct roundtab *p = roundtab + i;
+	  switch (i >> 4) {
+	      case 0: a += (b & c) | (~b & d); break;
+	      case 1: a += (b & d) | (c & ~d); break;
+	      case 2: a += b ^ c ^ d; break;
+	      case 3: a += c ^ (b | ~d); break;
+	  }
+	  a += sintab[i];
+	  a += x[(int)(p->k)];
+	  a &= msk;
+	  t = b + rol(a, p->s);
+	  a = d; d = c; c = b; b = t;
+     }
+     state[0] = (state[0] + a) & msk;
+     state[1] = (state[1] + b) & msk;
+     state[2] = (state[2] + c) & msk;
+     state[3] = (state[3] + d) & msk;
+}
+
+
+void X(md5begin)(md5 *p)
+{
+     p->s[0] = 0x67452301;
+     p->s[1] = 0xefcdab89;
+     p->s[2] = 0x98badcfe;
+     p->s[3] = 0x10325476;
+     p->l = 0;
+}
+
+void X(md5putc)(md5 *p, unsigned char c)
+{
+     p->c[p->l % 64] = c;
+     if (((++p->l) % 64) == 0) doblock(p->s, p->c);
+}
+
+void X(md5end)(md5 *p)
+{
+     unsigned l, i;
+
+     l = 8 * p->l; /* length before padding, in bits */
+
+     /* rfc 1321 section 3.1: padding */
+     X(md5putc)(p, 0x80);
+     while ((p->l % 64) != 56) X(md5putc)(p, 0x00);
+
+     /* rfc 1321 section 3.2: length (little endian) */
+     for (i = 0; i < 8; ++i) {
+	  X(md5putc)(p, l & 0xFF);
+	  l = l >> 8;
+     }
+
+     /* Now p->l % 64 == 0 and signature is in p->s */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/minmax.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/minmax.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+INT X(imax)(INT a, INT b)
+{
+     return (a > b) ? a : b;
+}
+
+INT X(imin)(INT a, INT b)
+{
+     return (a < b) ? a : b;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/ops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/ops.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+void X(ops_zero)(opcnt *dst)
+{
+     dst->add = dst->mul = dst->fma = dst->other = 0;
+}
+
+void X(ops_cpy)(const opcnt *src, opcnt *dst)
+{
+     *dst = *src;
+}
+
+void X(ops_other)(INT o, opcnt *dst)
+{
+     X(ops_zero)(dst);
+     dst->other = o;
+}
+
+void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst)
+{
+     dst->add = m * a->add + b->add;
+     dst->mul = m * a->mul + b->mul;
+     dst->fma = m * a->fma + b->fma;
+     dst->other = m * a->other + b->other;
+}
+
+void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst)
+{
+     X(ops_madd)(1, a, b, dst);
+}
+
+void X(ops_add2)(const opcnt *a, opcnt *dst)
+{
+     X(ops_add)(a, dst, dst);
+}
+
+void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst)
+{
+     X(ops_madd)(m, a, dst, dst);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/pickdim.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/pickdim.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+
+/* Given a solver which_dim, a vector sz, and whether or not the
+   transform is out-of-place, return the actual dimension index that
+   it corresponds to.  The basic idea here is that we return the
+   which_dim'th valid dimension, starting from the end if
+   which_dim < 0. */
+static int really_pickdim(int which_dim, const tensor *sz, int oop, int *dp)
+{
+     int i;
+     int count_ok = 0;
+     if (which_dim > 0) {
+          for (i = 0; i < sz->rnk; ++i) {
+               if (oop || sz->dims[i].is == sz->dims[i].os)
+                    if (++count_ok == which_dim) {
+                         *dp = i;
+                         return 1;
+                    }
+          }
+     }
+     else if (which_dim < 0) {
+          for (i = sz->rnk - 1; i >= 0; --i) {
+               if (oop || sz->dims[i].is == sz->dims[i].os)
+                    if (++count_ok == -which_dim) {
+                         *dp = i;
+                         return 1;
+                    }
+          }
+     }
+     else { /* zero: pick the middle, if valid */
+	  i = (sz->rnk - 1) / 2;
+	  if (i >= 0 && (oop || sz->dims[i].is == sz->dims[i].os)) {
+	       *dp = i;
+	       return 1;
+	  }
+     }
+     return 0;
+}
+
+/* Like really_pickdim, but only returns 1 if no previous "buddy"
+   which_dim in the buddies list would give the same dim. */
+int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
+	       const tensor *sz, int oop, int *dp)
+{
+     int i, d1;
+
+     if (!really_pickdim(which_dim, sz, oop, dp))
+          return 0;
+
+     /* check whether some buddy solver would produce the same dim.
+        If so, consider this solver unapplicable and let the buddy
+        take care of it.  The smallest-indexed buddy is applicable. */
+     for (i = 0; i < nbuddies; ++i) {
+          if (buddies[i] == which_dim)
+               break;  /* found self */
+          if (really_pickdim(buddies[i], sz, oop, &d1) && *dp == d1)
+               return 0; /* found equivalent buddy */
+     }
+     return 1;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/plan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/plan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+/* "Plan: To bother about the best method of accomplishing an
+   accidental result."  (Ambrose Bierce, The Enlarged Devil's
+   Dictionary). */
+
+plan *X(mkplan)(size_t size, const plan_adt *adt)
+{
+     plan *p = (plan *)MALLOC(size, PLANS);
+
+     A(adt->destroy);
+     p->adt = adt;
+     X(ops_zero)(&p->ops);
+     p->pcost = 0.0;
+     p->wakefulness = SLEEPY;
+     p->could_prune_now_p = 0;
+     
+     return p;
+}
+
+/*
+ * destroy a plan
+ */
+void X(plan_destroy_internal)(plan *ego)
+{
+     if (ego) {
+	  A(ego->wakefulness == SLEEPY);
+          ego->adt->destroy(ego);
+	  X(ifree)(ego);
+     }
+}
+
+/* dummy destroy routine for plans with no local state */
+void X(plan_null_destroy)(plan *ego)
+{
+     UNUSED(ego);
+     /* nothing */
+}
+
+void X(plan_awake)(plan *ego, enum wakefulness wakefulness)
+{
+     if (ego) {
+	  A(((wakefulness == SLEEPY) ^ (ego->wakefulness == SLEEPY)));
+	  
+	  ego->adt->awake(ego, wakefulness);
+	  ego->wakefulness = wakefulness;
+     }
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/planner.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/planner.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1035 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+#include <string.h>
+
+/* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU
+   program in English, because English is the one language that nearly
+   all programmers in all countries can read."
+
+                    ingemisco tanquam reus
+		    culpa rubet vultus meus
+		    supplicanti parce [rms]
+*/
+
+#define VALIDP(solution) ((solution)->flags.hash_info & H_VALID)
+#define LIVEP(solution) ((solution)->flags.hash_info & H_LIVE)
+#define SLVNDX(solution) ((solution)->flags.slvndx)
+#define BLISS(flags) (((flags).hash_info) & BLESSING)
+#define INFEASIBLE_SLVNDX ((1U<<BITS_FOR_SLVNDX)-1)
+
+
+#define MAXNAM 64  /* maximum length of registrar's name.
+		      Used for reading wisdom.  There is no point
+		      in doing this right */
+
+
+#ifdef FFTW_DEBUG
+static void check(hashtab *ht);
+#endif
+
+/* x <= y */
+#define LEQ(x, y) (((x) & (y)) == (x))
+
+/* A subsumes B */
+static int subsumes(const flags_t *a, unsigned slvndx_a, const flags_t *b)
+{
+     if (slvndx_a != INFEASIBLE_SLVNDX) {
+	  A(a->timelimit_impatience == 0);
+	  return (LEQ(a->u, b->u) && LEQ(b->l, a->l));
+     } else {
+	  return (LEQ(a->l, b->l) 
+		  && a->timelimit_impatience <= b->timelimit_impatience);
+     }
+}
+
+static unsigned addmod(unsigned a, unsigned b, unsigned p)
+{
+     /* gcc-2.95/sparc produces incorrect code for the fast version below. */
+#if defined(__sparc__) && defined(__GNUC__)
+     /* slow version  */
+     return (a + b) % p;
+#else
+     /* faster version */
+     unsigned c = a + b;
+     return c >= p ? c - p : c;
+#endif
+}
+
+/*
+  slvdesc management:
+*/
+static void sgrow(planner *ego)
+{
+     unsigned osiz = ego->slvdescsiz, nsiz = 1 + osiz + osiz / 4;
+     slvdesc *ntab = (slvdesc *)MALLOC(nsiz * sizeof(slvdesc), SLVDESCS);
+     slvdesc *otab = ego->slvdescs;
+     unsigned i;
+
+     ego->slvdescs = ntab;
+     ego->slvdescsiz = nsiz;
+     for (i = 0; i < osiz; ++i)
+	  ntab[i] = otab[i];
+     X(ifree0)(otab);
+}
+
+static void register_solver(planner *ego, solver *s)
+{
+     slvdesc *n;
+     int kind;
+
+     if (s) { /* add s to solver list */
+	  X(solver_use)(s);
+
+	  A(ego->nslvdesc < INFEASIBLE_SLVNDX);
+	  if (ego->nslvdesc >= ego->slvdescsiz)
+	       sgrow(ego);
+
+	  n = ego->slvdescs + ego->nslvdesc;
+
+	  n->slv = s;
+	  n->reg_nam = ego->cur_reg_nam;
+	  n->reg_id = ego->cur_reg_id++;
+	  
+	  A(strlen(n->reg_nam) < MAXNAM);
+	  n->nam_hash = X(hash)(n->reg_nam);
+
+	  kind = s->adt->problem_kind;
+	  n->next_for_same_problem_kind = ego->slvdescs_for_problem_kind[kind];
+	  ego->slvdescs_for_problem_kind[kind] = ego->nslvdesc;
+
+	  ego->nslvdesc++;
+     }
+}
+
+static unsigned slookup(planner *ego, char *nam, int id)
+{
+     unsigned h = X(hash)(nam); /* used to avoid strcmp in the common case */
+     FORALL_SOLVERS(ego, s, sp, {
+	  UNUSED(s);
+	  if (sp->reg_id == id && sp->nam_hash == h
+	      && !strcmp(sp->reg_nam, nam))
+	       return sp - ego->slvdescs;
+     });
+     return INFEASIBLE_SLVNDX;
+}
+
+/* Compute a MD5 hash of the configuration of the planner.
+   We store it into the wisdom file to make absolutely sure that
+   we are reading wisdom that is applicable */
+static void signature_of_configuration(md5 *m, planner *ego)
+{
+     X(md5begin)(m);
+     X(md5unsigned)(m, sizeof(R)); /* so we don't mix different precisions */
+     FORALL_SOLVERS(ego, s, sp, {
+	  UNUSED(s);
+	  X(md5int)(m, sp->reg_id);
+	  X(md5puts)(m, sp->reg_nam);
+     });
+     X(md5end)(m);
+}
+
+/*
+  md5-related stuff:
+*/
+
+/* first hash function */
+static unsigned h1(const hashtab *ht, const md5sig s)
+{
+     unsigned h = s[0] % ht->hashsiz;
+     A(h == (s[0] % ht->hashsiz));
+     return h;
+}
+
+/* second hash function (for double hashing) */
+static unsigned h2(const hashtab *ht, const md5sig s)
+{
+     unsigned h = 1U + s[1] % (ht->hashsiz - 1);
+     A(h == (1U + s[1] % (ht->hashsiz - 1)));
+     return h;
+}
+
+static void md5hash(md5 *m, const problem *p, const planner *plnr)
+{
+     X(md5begin)(m);
+     X(md5unsigned)(m, sizeof(R)); /* so we don't mix different precisions */
+     X(md5int)(m, plnr->nthr);
+     p->adt->hash(p, m);
+     X(md5end)(m);
+}
+
+static int md5eq(const md5sig a, const md5sig b)
+{
+     return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
+}
+
+static void sigcpy(const md5sig a, md5sig b)
+{
+     b[0] = a[0]; b[1] = a[1]; b[2] = a[2]; b[3] = a[3];
+}
+
+/*
+  memoization routines :
+*/
+
+/*
+   liber scriptus proferetur
+   in quo totum continetur
+   unde mundus iudicetur
+*/
+struct solution_s {
+     md5sig s;
+     flags_t flags;
+};
+
+static solution *htab_lookup(hashtab *ht, const md5sig s, 
+			     const flags_t *flagsp)
+{
+     unsigned g, h = h1(ht, s), d = h2(ht, s);
+     solution *best = 0;
+
+     ++ht->lookup;
+
+     /* search all entries that match; select the one with
+	the lowest flags.u */
+     /* This loop may potentially traverse the whole table, since at
+	least one element is guaranteed to be !LIVEP, but all elements
+	may be VALIDP.  Hence, we stop after at the first invalid
+	element or after traversing the whole table. */
+     g = h;
+     do {
+	  solution *l = ht->solutions + g;
+	  ++ht->lookup_iter;
+	  if (VALIDP(l)) {
+	       if (LIVEP(l)
+		   && md5eq(s, l->s)
+		   && subsumes(&l->flags, SLVNDX(l), flagsp) ) { 
+		    if (!best || LEQ(l->flags.u, best->flags.u))
+			 best = l;
+	       }
+	  } else 
+	       break;
+
+	  g = addmod(g, d, ht->hashsiz);
+     } while (g != h);
+
+     if (best) 
+	  ++ht->succ_lookup;
+     return best;
+}
+
+static solution *hlookup(planner *ego, const md5sig s, 
+			 const flags_t *flagsp)
+{
+     solution *sol = htab_lookup(&ego->htab_blessed, s, flagsp);
+     if (!sol) sol = htab_lookup(&ego->htab_unblessed, s, flagsp);
+     return sol;
+}
+
+static void fill_slot(hashtab *ht, const md5sig s, const flags_t *flagsp,
+		      unsigned slvndx, solution *slot)
+{
+     ++ht->insert;
+     ++ht->nelem;
+     A(!LIVEP(slot));
+     slot->flags.u = flagsp->u;
+     slot->flags.l = flagsp->l;
+     slot->flags.timelimit_impatience = flagsp->timelimit_impatience;
+     slot->flags.hash_info |= H_VALID | H_LIVE;
+     SLVNDX(slot) = slvndx;
+
+     /* keep this check enabled in case we add so many solvers
+	that the bitfield overflows */
+     CK(SLVNDX(slot) == slvndx);     
+     sigcpy(s, slot->s);
+}
+
+static void kill_slot(hashtab *ht, solution *slot)
+{
+     A(LIVEP(slot)); /* ==> */ A(VALIDP(slot));
+
+     --ht->nelem;
+     slot->flags.hash_info = H_VALID;
+}
+
+static void hinsert0(hashtab *ht, const md5sig s, const flags_t *flagsp, 
+		     unsigned slvndx)
+{
+     solution *l;
+     unsigned g, h = h1(ht, s), d = h2(ht, s); 
+
+     ++ht->insert_unknown;
+
+     /* search for nonfull slot */
+     for (g = h; ; g = addmod(g, d, ht->hashsiz)) {
+	  ++ht->insert_iter;
+	  l = ht->solutions + g;
+	  if (!LIVEP(l)) break;
+	  A((g + d) % ht->hashsiz != h);
+     }
+
+     fill_slot(ht, s, flagsp, slvndx, l);
+}
+
+static void rehash(hashtab *ht, unsigned nsiz)
+{
+     unsigned osiz = ht->hashsiz, h;
+     solution *osol = ht->solutions, *nsol;
+
+     nsiz = (unsigned)X(next_prime)((INT)nsiz);
+     nsol = (solution *)MALLOC(nsiz * sizeof(solution), HASHT);
+     ++ht->nrehash;
+
+     /* init new table */
+     for (h = 0; h < nsiz; ++h) 
+	  nsol[h].flags.hash_info = 0;
+
+     /* install new table */
+     ht->hashsiz = nsiz;
+     ht->solutions = nsol;
+     ht->nelem = 0;
+
+     /* copy table */
+     for (h = 0; h < osiz; ++h) {
+	  solution *l = osol + h;
+	  if (LIVEP(l))
+	       hinsert0(ht, l->s, &l->flags, SLVNDX(l));
+     }
+
+     X(ifree0)(osol);
+}
+
+static unsigned minsz(unsigned nelem)
+{
+     return 1U + nelem + nelem / 8U;
+}
+
+static unsigned nextsz(unsigned nelem)
+{
+     return minsz(minsz(nelem));
+}
+
+static void hgrow(hashtab *ht)
+{
+     unsigned nelem = ht->nelem;
+     if (minsz(nelem) >= ht->hashsiz)
+	  rehash(ht, nextsz(nelem));
+}
+
+#if 0
+/* shrink the hash table, never used */
+static void hshrink(hashtab *ht)
+{
+     unsigned nelem = ht->nelem;
+     /* always rehash after deletions */
+     rehash(ht, nextsz(nelem));
+}
+#endif
+
+static void htab_insert(hashtab *ht, const md5sig s, const flags_t *flagsp,
+			unsigned slvndx)
+{
+     unsigned g, h = h1(ht, s), d = h2(ht, s);
+     solution *first = 0;
+
+     /* Remove all entries that are subsumed by the new one.  */
+     /* This loop may potentially traverse the whole table, since at
+	least one element is guaranteed to be !LIVEP, but all elements
+	may be VALIDP.  Hence, we stop after at the first invalid
+	element or after traversing the whole table. */
+     g = h;
+     do {
+	  solution *l = ht->solutions + g;
+	  ++ht->insert_iter;
+	  if (VALIDP(l)) {
+	       if (LIVEP(l) && md5eq(s, l->s)) {
+		    if (subsumes(flagsp, slvndx, &l->flags)) {
+			 if (!first) first = l;
+			 kill_slot(ht, l);
+		    } else {
+			 /* It is an error to insert an element that
+			    is subsumed by an existing entry. */
+			 A(!subsumes(&l->flags, SLVNDX(l), flagsp));
+		    }
+	       }
+	  } else 
+	       break;
+
+	  g = addmod(g, d, ht->hashsiz);
+     } while (g != h);
+
+     if (first) {
+	  /* overwrite FIRST */
+	  fill_slot(ht, s, flagsp, slvndx, first);
+     } else {
+	  /* create a new entry */
+ 	  hgrow(ht);
+	  hinsert0(ht, s, flagsp, slvndx);
+     }
+}
+
+static void hinsert(planner *ego, const md5sig s, const flags_t *flagsp, 
+		    unsigned slvndx)
+{
+     htab_insert(BLISS(*flagsp) ? &ego->htab_blessed : &ego->htab_unblessed,
+		 s, flagsp, slvndx );
+}
+
+
+static void invoke_hook(planner *ego, plan *pln, const problem *p, 
+			int optimalp)
+{
+     if (ego->hook)
+	  ego->hook(ego, pln, p, optimalp);
+}
+
+#ifdef FFTW_RANDOM_ESTIMATOR
+/* a "random" estimate, used for debugging to generate "random"
+   plans, albeit from a deterministic seed. */
+
+unsigned X(random_estimate_seed) = 0;
+
+static double random_estimate(const planner *ego, const plan *pln, 
+			      const problem *p)
+{
+     md5 m;
+     X(md5begin)(&m);
+     X(md5unsigned)(&m, X(random_estimate_seed));
+     X(md5int)(&m, ego->nthr);
+     p->adt->hash(p, &m);
+     X(md5putb)(&m, &pln->ops, sizeof(pln->ops));
+     X(md5putb)(&m, &pln->adt, sizeof(pln->adt));
+     X(md5end)(&m);
+     return ego->cost_hook ? ego->cost_hook(p, m.s[0], COST_MAX) : m.s[0];
+}
+
+#endif
+
+double X(iestimate_cost)(const planner *ego, const plan *pln, const problem *p)
+{
+     double cost =
+	  + pln->ops.add
+	  + pln->ops.mul
+	  
+#if HAVE_FMA
+	  + pln->ops.fma
+#else
+	  + 2 * pln->ops.fma
+#endif
+	  
+	  + pln->ops.other;
+     if (ego->cost_hook)
+	  cost = ego->cost_hook(p, cost, COST_MAX);
+     return cost;
+}
+
+static void evaluate_plan(planner *ego, plan *pln, const problem *p)
+{
+     if (ESTIMATEP(ego) || !BELIEVE_PCOSTP(ego) || pln->pcost == 0.0) {
+	  ego->nplan++;
+
+	  if (ESTIMATEP(ego)) {
+	  estimate:
+	       /* heuristic */
+#ifdef FFTW_RANDOM_ESTIMATOR
+	       pln->pcost = random_estimate(ego, pln, p);
+	       ego->epcost += X(iestimate_cost)(ego, pln, p);
+#else
+	       pln->pcost = X(iestimate_cost)(ego, pln, p);
+	       ego->epcost += pln->pcost;
+#endif
+	  } else {
+	       double t = X(measure_execution_time)(ego, pln, p);
+	       
+	       if (t < 0) {  /* unavailable cycle counter */
+		    /* Real programmers can write FORTRAN in any language */
+		    goto estimate;
+	       }
+
+	       pln->pcost = t;
+	       ego->pcost += t;
+	       ego->need_timeout_check = 1;
+	  }
+     }
+     
+     invoke_hook(ego, pln, p, 0);
+}
+
+/* maintain dynamic scoping of flags, nthr: */
+static plan *invoke_solver(planner *ego, const problem *p, solver *s, 
+			   const flags_t *nflags)
+{
+     flags_t flags = ego->flags;
+     int nthr = ego->nthr;
+     plan *pln;
+     ego->flags = *nflags;
+     PLNR_TIMELIMIT_IMPATIENCE(ego) = 0;
+     A(p->adt->problem_kind == s->adt->problem_kind);
+     pln = s->adt->mkplan(s, p, ego);
+     ego->nthr = nthr;
+     ego->flags = flags;
+     return pln;
+}
+
+/* maintain the invariant TIMED_OUT ==> NEED_TIMEOUT_CHECK */
+static int timeout_p(planner *ego, const problem *p)
+{
+     /* do not timeout when estimating.  First, the estimator is the
+	planner of last resort.  Second, calling X(elapsed_since)() is
+	slower than estimating */
+     if (!ESTIMATEP(ego)) {
+	  /* do not assume that X(elapsed_since)() is monotonic */
+	  if (ego->timed_out) {
+	       A(ego->need_timeout_check);
+	       return 1;
+	  }
+
+	  if (ego->timelimit >= 0 &&
+	      X(elapsed_since)(ego, p, ego->start_time) >= ego->timelimit) {
+	       ego->timed_out = 1;
+	       ego->need_timeout_check = 1;
+	       return 1;
+	  }
+     }
+
+     A(!ego->timed_out);
+     ego->need_timeout_check = 0;
+     return 0;
+}
+
+static plan *search0(planner *ego, const problem *p, unsigned *slvndx, 
+		     const flags_t *flagsp)
+{
+     plan *best = 0;
+     int best_not_yet_timed = 1;
+
+     /* Do not start a search if the planner timed out. This check is
+	necessary, lest the relaxation mechanism kick in */
+     if (timeout_p(ego, p))
+	  return 0;
+
+     FORALL_SOLVERS_OF_KIND(p->adt->problem_kind, ego, s, sp, {
+	  plan *pln;
+
+	  pln = invoke_solver(ego, p, s, flagsp);
+
+	  if (ego->need_timeout_check) 
+	       if (timeout_p(ego, p)) {
+		    X(plan_destroy_internal)(pln);
+		    X(plan_destroy_internal)(best);
+		    return 0;
+	       }
+
+	  if (pln) {
+	       /* read COULD_PRUNE_NOW_P because PLN may be destroyed
+		  before we use COULD_PRUNE_NOW_P */
+	       int could_prune_now_p = pln->could_prune_now_p;
+
+	       if (best) {
+		    if (best_not_yet_timed) {
+			 evaluate_plan(ego, best, p);
+			 best_not_yet_timed = 0;
+		    }
+		    evaluate_plan(ego, pln, p);
+		    if (pln->pcost < best->pcost) {
+			 X(plan_destroy_internal)(best);
+			 best = pln;
+			 *slvndx = sp - ego->slvdescs;
+		    } else {
+			 X(plan_destroy_internal)(pln);
+		    }
+	       } else {
+		    best = pln;
+		    *slvndx = sp - ego->slvdescs;
+	       }
+
+	       if (ALLOW_PRUNINGP(ego) && could_prune_now_p) 
+		    break;
+	  }
+     });
+
+     return best;
+}
+
+static plan *search(planner *ego, const problem *p, unsigned *slvndx, 
+		    flags_t *flagsp)
+{
+     plan *pln = 0;
+     unsigned i;
+
+     /* relax impatience in this order: */
+     static const unsigned relax_tab[] = {
+	  0, /* relax nothing */
+	  NO_VRECURSE,
+	  NO_FIXED_RADIX_LARGE_N,
+	  NO_SLOW,
+	  NO_UGLY
+     };
+
+     unsigned l_orig = flagsp->l;
+     unsigned x = flagsp->u;
+
+     /* guaranteed to be different from X */
+     unsigned last_x = ~x; 
+
+     for (i = 0; i < sizeof(relax_tab) / sizeof(relax_tab[0]); ++i) {
+	  if (LEQ(l_orig, x & ~relax_tab[i]))
+	       x = x & ~relax_tab[i];
+
+	  if (x != last_x) {
+	       last_x = x;
+	       flagsp->l = x;
+	       pln = search0(ego, p, slvndx, flagsp);
+	       if (pln) break;
+	  }
+     }
+
+     if (!pln) {
+	  /* search [L_ORIG, U] */
+	  if (l_orig != last_x) {
+	       last_x = l_orig;
+	       flagsp->l = l_orig;
+	       pln = search0(ego, p, slvndx, flagsp);
+	  }
+     }
+
+     return pln;
+}
+
+#define CHECK_FOR_BOGOSITY						\
+     if ((ego->bogosity_hook ?						\
+	  (ego->wisdom_state = ego->bogosity_hook(ego->wisdom_state, p)) \
+	  : ego->wisdom_state) == WISDOM_IS_BOGUS)			\
+	  goto wisdom_is_bogus;
+
+static plan *mkplan(planner *ego, const problem *p)
+{
+     plan *pln;
+     md5 m;
+     unsigned slvndx;
+     flags_t flags_of_solution;
+     solution *sol;
+     solver *s;
+
+     ASSERT_ALIGNED_DOUBLE;
+     A(LEQ(PLNR_L(ego), PLNR_U(ego)));
+
+     if (ESTIMATEP(ego))
+	  PLNR_TIMELIMIT_IMPATIENCE(ego) = 0; /* canonical form */
+
+
+#ifdef FFTW_DEBUG
+     check(&ego->htab_blessed);
+     check(&ego->htab_unblessed);
+#endif
+
+     pln = 0;
+
+     CHECK_FOR_BOGOSITY;
+
+     ego->timed_out = 0;
+
+     ++ego->nprob;
+     md5hash(&m, p, ego);
+
+     flags_of_solution = ego->flags;
+
+     if (ego->wisdom_state != WISDOM_IGNORE_ALL) {
+	  if ((sol = hlookup(ego, m.s, &flags_of_solution))) { 
+	       /* wisdom is acceptable */
+	       wisdom_state_t owisdom_state = ego->wisdom_state;
+	       
+	       /* this hook is mainly for MPI, to make sure that
+		  wisdom is in sync across all processes for MPI problems */
+	       if (ego->wisdom_ok_hook && !ego->wisdom_ok_hook(p, sol->flags))
+		    goto do_search; /* ignore not-ok wisdom */
+	       
+	       slvndx = SLVNDX(sol);
+	       
+	       if (slvndx == INFEASIBLE_SLVNDX) {
+		    if (ego->wisdom_state == WISDOM_IGNORE_INFEASIBLE)
+			 goto do_search;
+		    else
+			 return 0;   /* known to be infeasible */
+	       }
+	       
+	       flags_of_solution = sol->flags;
+	       
+	       /* inherit blessing either from wisdom
+		  or from the planner */
+	       flags_of_solution.hash_info |= BLISS(ego->flags);
+	       
+	       ego->wisdom_state = WISDOM_ONLY;
+	       
+	       s = ego->slvdescs[slvndx].slv;
+	       if (p->adt->problem_kind != s->adt->problem_kind)
+		    goto wisdom_is_bogus;
+	       
+	       pln = invoke_solver(ego, p, s, &flags_of_solution);
+	       
+	       CHECK_FOR_BOGOSITY; 	  /* catch error in child solvers */
+	       
+	       sol = 0; /* Paranoia: SOL may be dangling after
+			   invoke_solver(); make sure we don't accidentally
+			   reuse it. */
+	       
+	       if (!pln)
+		    goto wisdom_is_bogus;
+	       
+	       ego->wisdom_state = owisdom_state;
+	       
+	       goto skip_search;
+	  }
+	  else if (ego->nowisdom_hook) /* for MPI, make sure lack of wisdom */
+	       ego->nowisdom_hook(p);  /*   is in sync across all processes */
+     }
+
+ do_search:
+     /* cannot search in WISDOM_ONLY mode */
+     if (ego->wisdom_state == WISDOM_ONLY)
+	  goto wisdom_is_bogus;
+
+     flags_of_solution = ego->flags;
+     pln = search(ego, p, &slvndx, &flags_of_solution);
+     CHECK_FOR_BOGOSITY; 	  /* catch error in child solvers */
+
+     if (ego->timed_out) {
+	  A(!pln);
+	  if (PLNR_TIMELIMIT_IMPATIENCE(ego) != 0) {
+	       /* record (below) that this plan has failed because of
+		  timeout */
+	       flags_of_solution.hash_info |= BLESSING;
+	  } else {
+	       /* this is not the top-level problem or timeout is not
+		  active: record no wisdom. */
+	       return 0;
+	  }
+     } else {
+	  /* canonicalize to infinite timeout */
+	  flags_of_solution.timelimit_impatience = 0;
+     }
+
+ skip_search:
+     if (ego->wisdom_state == WISDOM_NORMAL ||
+	 ego->wisdom_state == WISDOM_ONLY) {
+	  if (pln) {
+	       hinsert(ego, m.s, &flags_of_solution, slvndx);
+	       invoke_hook(ego, pln, p, 1);
+	  } else {
+	       hinsert(ego, m.s, &flags_of_solution, INFEASIBLE_SLVNDX);
+	  }
+     }
+
+     return pln;
+
+ wisdom_is_bogus:
+     X(plan_destroy_internal)(pln);
+     ego->wisdom_state = WISDOM_IS_BOGUS;
+     return 0;
+}
+
+static void htab_destroy(hashtab *ht)
+{
+     X(ifree)(ht->solutions);
+     ht->solutions = 0;
+     ht->nelem = 0U;
+}
+
+static void mkhashtab(hashtab *ht)
+{
+     ht->nrehash = 0;
+     ht->succ_lookup = ht->lookup = ht->lookup_iter = 0;
+     ht->insert = ht->insert_iter = ht->insert_unknown = 0;
+
+     ht->solutions = 0;
+     ht->hashsiz = ht->nelem = 0U;
+     hgrow(ht);			/* so that hashsiz > 0 */
+}
+
+/* destroy hash table entries.  If FORGET_EVERYTHING, destroy the whole
+   table.  If FORGET_ACCURSED, then destroy entries that are not blessed. */
+static void forget(planner *ego, amnesia a)
+{
+     switch (a) {
+	 case FORGET_EVERYTHING:
+	      htab_destroy(&ego->htab_blessed);
+	      mkhashtab(&ego->htab_blessed);
+	      /* fall through */
+	 case FORGET_ACCURSED:
+	      htab_destroy(&ego->htab_unblessed);
+	      mkhashtab(&ego->htab_unblessed);
+	      break;
+	 default:
+	      break;
+     }
+}
+
+/* FIXME: what sort of version information should we write? */
+#define WISDOM_PREAMBLE PACKAGE "-" VERSION " " STRINGIZE(X(wisdom))
+static const char stimeout[] = "TIMEOUT";
+
+/* tantus labor non sit cassus */
+static void exprt(planner *ego, printer *p)
+{
+     unsigned h;
+     hashtab *ht = &ego->htab_blessed;
+     md5 m;
+
+     signature_of_configuration(&m, ego);
+
+     p->print(p, 
+	      "(" WISDOM_PREAMBLE " #x%M #x%M #x%M #x%M\n",
+	      m.s[0], m.s[1], m.s[2], m.s[3]);
+
+     for (h = 0; h < ht->hashsiz; ++h) {
+	  solution *l = ht->solutions + h;
+	  if (LIVEP(l)) {
+	       const char *reg_nam;
+	       int reg_id;
+
+	       if (SLVNDX(l) == INFEASIBLE_SLVNDX) {
+		    reg_nam = stimeout;
+		    reg_id = 0;
+	       } else {
+		    slvdesc *sp = ego->slvdescs + SLVNDX(l);
+		    reg_nam = sp->reg_nam;
+		    reg_id = sp->reg_id;
+	       }
+
+	       /* qui salvandos salvas gratis
+		  salva me fons pietatis */
+	       p->print(p, "  (%s %d #x%x #x%x #x%x #x%M #x%M #x%M #x%M)\n",
+			reg_nam, reg_id, 
+			l->flags.l, l->flags.u, l->flags.timelimit_impatience, 
+			l->s[0], l->s[1], l->s[2], l->s[3]);
+	  }
+     }
+     p->print(p, ")\n");
+}
+
+/* mors stupebit et natura
+   cum resurget creatura */
+static int imprt(planner *ego, scanner *sc)
+{
+     char buf[MAXNAM + 1];
+     md5uint sig[4];
+     unsigned l, u, timelimit_impatience;
+     flags_t flags;
+     int reg_id;
+     unsigned slvndx;
+     hashtab *ht = &ego->htab_blessed;
+     hashtab old;
+     md5 m;
+
+     if (!sc->scan(sc, 
+		   "(" WISDOM_PREAMBLE " #x%M #x%M #x%M #x%M\n",
+		   sig + 0, sig + 1, sig + 2, sig + 3))
+	  return 0; /* don't need to restore hashtable */
+
+     signature_of_configuration(&m, ego);
+     if (m.s[0] != sig[0] || m.s[1] != sig[1] ||
+	 m.s[2] != sig[2] || m.s[3] != sig[3]) {
+	  /* invalid configuration */
+	  return 0;
+     }
+     
+     /* make a backup copy of the hash table (cache the hash) */
+     {
+	  unsigned h, hsiz = ht->hashsiz;
+	  old = *ht;
+	  old.solutions = (solution *)MALLOC(hsiz * sizeof(solution), HASHT);
+	  for (h = 0; h < hsiz; ++h)
+	       old.solutions[h] = ht->solutions[h];
+     }
+
+     while (1) {
+	  if (sc->scan(sc, ")"))
+	       break;
+
+	  /* qua resurget ex favilla */
+	  if (!sc->scan(sc, "(%*s %d #x%x #x%x #x%x #x%M #x%M #x%M #x%M)",
+			MAXNAM, buf, &reg_id, &l, &u, &timelimit_impatience,
+			sig + 0, sig + 1, sig + 2, sig + 3))
+	       goto bad;
+
+	  if (!strcmp(buf, stimeout) && reg_id == 0) {
+	       slvndx = INFEASIBLE_SLVNDX;
+	  } else {
+	       if (timelimit_impatience != 0)
+		    goto bad;
+
+	       slvndx = slookup(ego, buf, reg_id);
+	       if (slvndx == INFEASIBLE_SLVNDX)
+		    goto bad;
+	  }
+
+	  /* inter oves locum praesta */
+	  flags.l = l;
+	  flags.u = u;
+	  flags.timelimit_impatience = timelimit_impatience;
+	  flags.hash_info = BLESSING;
+
+	  CK(flags.l == l);
+	  CK(flags.u == u);
+	  CK(flags.timelimit_impatience == timelimit_impatience);
+
+	  if (!hlookup(ego, sig, &flags))
+	       hinsert(ego, sig, &flags, slvndx);
+     }
+
+     X(ifree0)(old.solutions);
+     return 1;
+
+ bad:
+     /* ``The wisdom of FFTW must be above suspicion.'' */
+     X(ifree0)(ht->solutions);
+     *ht = old;
+     return 0;
+}
+
+/*
+ * create a planner
+ */
+planner *X(mkplanner)(void)
+{
+     int i;
+
+     static const planner_adt padt = {
+	  register_solver, mkplan, forget, exprt, imprt
+     };
+
+     planner *p = (planner *) MALLOC(sizeof(planner), PLANNERS);
+
+     p->adt = &padt;
+     p->nplan = p->nprob = 0;
+     p->pcost = p->epcost = 0.0;
+     p->hook = 0;
+     p->cost_hook = 0;
+     p->wisdom_ok_hook = 0;
+     p->nowisdom_hook = 0;
+     p->bogosity_hook = 0;
+     p->cur_reg_nam = 0;
+     p->wisdom_state = WISDOM_NORMAL;
+
+     p->slvdescs = 0;
+     p->nslvdesc = p->slvdescsiz = 0;
+
+     p->flags.l = 0;
+     p->flags.u = 0;
+     p->flags.timelimit_impatience = 0;
+     p->flags.hash_info = 0;
+     p->nthr = 1;
+     p->need_timeout_check = 1;
+     p->timelimit = -1;
+
+     mkhashtab(&p->htab_blessed);
+     mkhashtab(&p->htab_unblessed);
+
+     for (i = 0; i < PROBLEM_LAST; ++i)
+	  p->slvdescs_for_problem_kind[i] = -1;
+
+     return p;
+}
+
+void X(planner_destroy)(planner *ego)
+{
+     /* destroy hash table */
+     htab_destroy(&ego->htab_blessed);
+     htab_destroy(&ego->htab_unblessed);
+
+     /* destroy solvdesc table */
+     FORALL_SOLVERS(ego, s, sp, {
+	  UNUSED(sp);
+	  X(solver_destroy)(s);
+     });
+
+     X(ifree0)(ego->slvdescs);
+     X(ifree)(ego); /* dona eis requiem */
+}
+
+plan *X(mkplan_d)(planner *ego, problem *p)
+{
+     plan *pln = ego->adt->mkplan(ego, p);
+     X(problem_destroy)(p);
+     return pln;
+}
+
+/* like X(mkplan_d), but sets/resets flags as well */
+plan *X(mkplan_f_d)(planner *ego, problem *p, 
+		    unsigned l_set, unsigned u_set, unsigned u_reset)
+{
+     flags_t oflags = ego->flags;
+     plan *pln;
+
+     PLNR_U(ego) &= ~u_reset;
+     PLNR_L(ego) &= ~u_reset;
+     PLNR_L(ego) |= l_set;
+     PLNR_U(ego) |= u_set | l_set;
+     pln = X(mkplan_d)(ego, p);
+     ego->flags = oflags;
+     return pln;
+}
+
+/*
+ * Debugging code:
+ */
+#ifdef FFTW_DEBUG
+static void check(hashtab *ht)
+{
+     unsigned live = 0;
+     unsigned i;
+
+     A(ht->nelem < ht->hashsiz);
+
+     for (i = 0; i < ht->hashsiz; ++i) {
+	  solution *l = ht->solutions + i; 
+	  if (LIVEP(l)) 
+	       ++live; 
+     }
+
+     A(ht->nelem == live);
+
+     for (i = 0; i < ht->hashsiz; ++i) {
+	  solution *l1 = ht->solutions + i; 
+	  int foundit = 0;
+	  if (LIVEP(l1)) {
+	       unsigned g, h = h1(ht, l1->s), d = h2(ht, l1->s);
+
+	       g = h;
+	       do {
+		    solution *l = ht->solutions + g;
+		    if (VALIDP(l)) {
+			 if (l1 == l)
+			      foundit = 1;
+			 else if (LIVEP(l) && md5eq(l1->s, l->s)) {
+			      A(!subsumes(&l->flags, SLVNDX(l), &l1->flags));
+			      A(!subsumes(&l1->flags, SLVNDX(l1), &l->flags));
+			 }
+		    } else 
+			 break;
+		    g = addmod(g, d, ht->hashsiz);
+	       } while (g != h);
+
+	       A(foundit);
+	  }
+     }
+}
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/primes.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/primes.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+/***************************************************************************/
+
+/* Rader's algorithm requires lots of modular arithmetic, and if we
+   aren't careful we can have errors due to integer overflows. */
+
+/* Compute (x * y) mod p, but watch out for integer overflows; we must
+   have 0 <= {x, y} < p.
+
+   If overflow is common, this routine is somewhat slower than
+   e.g. using 'long long' arithmetic.  However, it has the advantage
+   of working when INT is 64 bits, and is also faster when overflow is
+   rare.  FFTW calls this via the MULMOD macro, which further
+   optimizes for the case of small integers. 
+*/
+
+#define ADD_MOD(x, y, p) ((x) >= (p) - (y)) ? ((x) + ((y) - (p))) : ((x) + (y))
+
+INT X(safe_mulmod)(INT x, INT y, INT p)
+{
+     INT r;
+
+     if (y > x) 
+	  return X(safe_mulmod)(y, x, p);
+
+     A(0 <= y && x < p);
+
+     r = 0;
+     while (y) {
+	  r = ADD_MOD(r, x*(y&1), p); y >>= 1;
+	  x = ADD_MOD(x, x, p);
+     }
+
+     return r;
+}
+
+/***************************************************************************/
+
+/* Compute n^m mod p, where m >= 0 and p > 0.  If we really cared, we
+   could make this tail-recursive. */
+
+INT X(power_mod)(INT n, INT m, INT p)
+{
+     A(p > 0);
+     if (m == 0)
+	  return 1;
+     else if (m % 2 == 0) {
+	  INT x = X(power_mod)(n, m / 2, p);
+	  return MULMOD(x, x, p);
+     }
+     else
+	  return MULMOD(n, X(power_mod)(n, m - 1, p), p);
+}
+
+/* the following two routines were contributed by Greg Dionne. */
+static INT get_prime_factors(INT n, INT *primef)
+{
+     INT i;
+     INT size = 0;
+
+     A(n % 2 == 0); /* this routine is designed only for even n */
+     primef[size++] = (INT)2;
+     do
+	  n >>= 1;
+     while ((n & 1) == 0);
+
+     if (n == 1)
+	  return size;
+
+     for (i = 3; i * i <= n; i += 2)
+	  if (!(n % i)) {
+	       primef[size++] = i;
+	       do
+		    n /= i;
+	       while (!(n % i));
+	  }
+     if (n == 1)
+	  return size;
+     primef[size++] = n;
+     return size;
+}
+
+INT X(find_generator)(INT p)
+{
+    INT n, i, size;
+    INT primef[16];     /* smallest number = 32589158477190044730 > 2^64 */
+    INT pm1 = p - 1;
+
+    if (p == 2)
+	 return 1;
+
+    size = get_prime_factors(pm1, primef);
+    n = 2;
+    for (i = 0; i < size; i++)
+        if (X(power_mod)(n, pm1 / primef[i], p) == 1) {
+            i = -1;
+            n++;
+        }
+    return n;
+}
+
+/* Return first prime divisor of n  (It would be at best slightly faster to
+   search a static table of primes; there are 6542 primes < 2^16.)  */
+INT X(first_divisor)(INT n)
+{
+     INT i;
+     if (n <= 1)
+	  return n;
+     if (n % 2 == 0)
+	  return 2;
+     for (i = 3; i*i <= n; i += 2)
+	  if (n % i == 0)
+	       return i;
+     return n;
+}
+
+int X(is_prime)(INT n)
+{
+     return(n > 1 && X(first_divisor)(n) == n);
+}
+
+INT X(next_prime)(INT n)
+{
+     while (!X(is_prime)(n)) ++n;
+     return n;
+}
+
+int X(factors_into)(INT n, const INT *primes)
+{
+     for (; *primes != 0; ++primes) 
+	  while ((n % *primes) == 0) 
+	       n /= *primes;
+     return (n == 1);
+}
+
+/* integer square root.  Return floor(sqrt(N)) */
+INT X(isqrt)(INT n)
+{
+     INT guess, iguess;
+
+     A(n >= 0);
+     if (n == 0) return 0;
+
+     guess = n; iguess = 1;
+
+     do {
+          guess = (guess + iguess) / 2;
+	  iguess = n / guess;
+     } while (guess > iguess);
+
+     return guess;
+}
+
+static INT isqrt_maybe(INT n)
+{
+     INT guess = X(isqrt)(n);
+     return guess * guess == n ? guess : 0;
+}
+
+#define divides(a, b) (((b) % (a)) == 0)
+INT X(choose_radix)(INT r, INT n)
+{
+     if (r > 0) {
+	  if (divides(r, n)) return r;
+	  return 0;
+     } else if (r == 0) {
+	  return X(first_divisor)(n);
+     } else {
+	  /* r is negative.  If n = (-r) * q^2, take q as the radix */
+	  r = 0 - r;
+	  return (n > r && divides(r, n)) ? isqrt_maybe(n / r) : 0;
+     }
+}
+
+/* return A mod N, works for all A including A < 0 */
+INT X(modulo)(INT a, INT n)
+{
+     A(n > 0);
+     if (a >= 0)
+	  return a % n;
+     else
+	  return (n - 1) - ((-(a + (INT)1)) % n);
+}
+
+/* TRUE if N factors into small primes */
+int X(factors_into_small_primes)(INT n)
+{
+     static const INT primes[] = { 2, 3, 5, 0 };
+     return X(factors_into)(n, primes);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/print.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/print.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define BSZ 64
+
+static void myputs(printer *p, const char *s)
+{
+     char c;
+     while ((c = *s++))
+          p->putchr(p, c);
+}
+
+static void newline(printer *p)
+{
+     int i;
+
+     p->putchr(p, '\n');
+     for (i = 0; i < p->indent; ++i)
+	  p->putchr(p, ' ');
+}
+
+static const char *digits = "0123456789abcdef";
+
+static void putint(printer *p, INT i)
+{
+     char buf[BSZ];
+     char *f = buf;
+
+     if (i < 0) {
+	  p->putchr(p, '-');
+	  i = -i;
+     }
+     
+     do {
+	  *f++ = digits[i % 10];
+	  i /= 10;
+     } while (i);
+     
+     do {
+	  p->putchr(p, *--f);
+     } while (f != buf);
+}
+
+static void putulong(printer *p, unsigned long i, int base, int width)
+{
+     char buf[BSZ];
+     char *f = buf;
+
+     do {
+	  *f++ = digits[i % base];
+	  i /= base;
+     } while (i);
+
+     while (width > f - buf) {
+	  p->putchr(p, '0');
+	  --width;
+     }
+
+     do {
+	  p->putchr(p, *--f);
+     } while (f != buf);
+}
+
+static void vprint(printer *p, const char *format, va_list ap)
+{
+     const char *s = format;
+     char c;
+     INT ival;
+
+     while ((c = *s++)) {
+          switch (c) {
+	      case '%':
+		   switch ((c = *s++)) {
+		       case 'M': {
+			    /* md5 value */
+			    md5uint x = va_arg(ap, md5uint);
+			    putulong(p, (unsigned long)(0xffffffffUL & x),
+				     16, 8);
+			    break;
+		       }
+		       case 'c': {
+			    int x = va_arg(ap, int);
+			    p->putchr(p, x);
+			    break;
+		       }
+		       case 's': {
+			    char *x = va_arg(ap, char *);
+			    if (x)
+				 myputs(p, x);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       case 'd': {
+			    int x = va_arg(ap, int);
+			    ival = (INT)x;
+			    goto putival;
+		       }
+		       case 'D': {
+			    ival = va_arg(ap, INT);
+			    goto putival;
+		       }
+		       case 'v': {
+			    /* print optional vector length */
+			    ival = va_arg(ap, INT);
+			    if (ival > 1) {
+				 myputs(p, "-x");
+				 goto putival;
+			    }
+			    break;
+		       }
+		       case 'o': {
+			    /* integer option.  Usage: %oNAME= */
+			    ival = va_arg(ap, INT);
+			    if (ival)
+				 p->putchr(p, '/');
+			    while ((c = *s++) != '=')
+				 if (ival)
+				      p->putchr(p, c);
+			    if (ival) {
+				 p->putchr(p, '=');
+				 goto putival;
+			    }
+			    break;
+		       }
+		       case 'u': {
+			    unsigned x = va_arg(ap, unsigned);
+			    putulong(p, (unsigned long)x, 10, 0);
+			    break;
+		       }
+		       case 'x': {
+			    unsigned x = va_arg(ap, unsigned);
+			    putulong(p, (unsigned long)x, 16, 0);
+			    break;
+		       }
+		       case '(': {
+			    /* newline, augment indent level */
+			    p->indent += p->indent_incr;
+			    newline(p);
+			    break;
+		       }
+		       case ')': {
+			    /* decrement indent level */
+			    p->indent -= p->indent_incr;
+			    break;
+		       }
+		       case 'p': {  /* note difference from C's %p */
+			    /* print plan */
+			    plan *x = va_arg(ap, plan *);
+			    if (x) 
+				 x->adt->print(x, p);
+			    else 
+				 goto putnull;
+			    break;
+		       }
+		       case 'P': {
+			    /* print problem */
+			    problem *x = va_arg(ap, problem *);
+			    if (x)
+				 x->adt->print(x, p);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       case 'T': {
+			    /* print tensor */
+			    tensor *x = va_arg(ap, tensor *);
+			    if (x)
+				 X(tensor_print)(x, p);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       default:
+			    A(0 /* unknown format */);
+			    break;
+
+		   putnull:
+			    myputs(p, "(null)");
+			    break;
+
+		   putival:
+			    putint(p, ival);
+			    break;
+		   }
+		   break;
+	      default:
+		   p->putchr(p, c);
+		   break;
+          }
+     }
+}
+
+static void print(printer *p, const char *format, ...)
+{
+     va_list ap;
+     va_start(ap, format);
+     vprint(p, format, ap);
+     va_end(ap);
+}
+
+printer *X(mkprinter)(size_t size, 
+		      void (*putchr)(printer *p, char c),
+		      void (*cleanup)(printer *p))
+{
+     printer *s = (printer *)MALLOC(size, OTHER);
+     s->print = print;
+     s->vprint = vprint;
+     s->putchr = putchr;
+     s->cleanup = cleanup;
+     s->indent = 0;
+     s->indent_incr = 2;
+     return s;
+}
+
+void X(printer_destroy)(printer *p)
+{
+     if (p->cleanup)
+	  p->cleanup(p);
+     X(ifree)(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+/* constructor */
+problem *X(mkproblem)(size_t sz, const problem_adt *adt)
+{
+     problem *p = (problem *)MALLOC(sz, PROBLEMS);
+
+     p->adt = adt;
+     return p;
+}
+
+/* destructor */
+void X(problem_destroy)(problem *ego)
+{
+     if (ego)
+	  ego->adt->destroy(ego);
+}
+
+/* management of unsolvable problems */
+static void unsolvable_destroy(problem *ego)
+{
+     UNUSED(ego);
+}
+
+static void unsolvable_hash(const problem *p, md5 *m)
+{
+     UNUSED(p);
+     X(md5puts)(m, "unsolvable");
+}
+
+static void unsolvable_print(const problem *ego, printer *p)
+{
+     UNUSED(ego);
+     p->print(p, "(unsolvable)");
+}
+
+static void unsolvable_zero(const problem *ego)
+{
+     UNUSED(ego);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_UNSOLVABLE,
+     unsolvable_hash,
+     unsolvable_zero,
+     unsolvable_print,
+     unsolvable_destroy
+};
+
+/* there is no point in malloc'ing this one */
+static problem the_unsolvable_problem = { &padt };
+
+problem *X(mkproblem_unsolvable)(void)
+{
+     return &the_unsolvable_problem;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/rader.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/rader.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/*
+  common routines for Rader solvers 
+*/
+
+
+/* shared twiddle and omega lists, keyed by two/three integers. */
+struct rader_tls {
+     INT k1, k2, k3;
+     R *W;
+     int refcnt;
+     rader_tl *cdr; 
+};
+
+void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl)
+{
+     rader_tl *t = (rader_tl *) MALLOC(sizeof(rader_tl), TWIDDLES);
+     t->k1 = k1; t->k2 = k2; t->k3 = k3; t->W = W;
+     t->refcnt = 1; t->cdr = *tl; *tl = t;
+}
+
+R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t)
+{
+     while (t && (t->k1 != k1 || t->k2 != k2 || t->k3 != k3))
+	  t = t->cdr;
+     if (t) {
+	  ++t->refcnt;
+	  return t->W;
+     } else 
+	  return 0;
+}
+
+void X(rader_tl_delete)(R *W, rader_tl **tl)
+{
+     if (W) {
+	  rader_tl **tp, *t;
+
+	  for (tp = tl; (t = *tp) && t->W != W; tp = &t->cdr)
+	       ;
+
+	  if (t && --t->refcnt <= 0) {
+	       *tp = t->cdr;
+	       X(ifree)(t->W);
+	       X(ifree)(t);
+	  }
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/scan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/scan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+#include <string.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef USE_CTYPE
+#include <ctype.h>
+#else
+/* Screw ctype. On linux, the is* functions call a routine that gets
+   the ctype map in the current locale.  Because this operation is
+   expensive, the map is cached on a per-thread basis.  I am not
+   willing to link this crap with FFTW.  Not over my dead body.
+
+   Sic transit gloria mundi.
+*/
+#undef isspace
+#define isspace(x) ((x) >= 0 && (x) <= ' ')
+#undef isdigit
+#define isdigit(x) ((x) >= '0' && (x) <= '9')
+#undef isupper
+#define isupper(x) ((x) >= 'A' && (x) <= 'Z')
+#undef islower
+#define islower(x) ((x) >= 'a' && (x) <= 'z')
+#endif
+
+static int mygetc(scanner *sc)
+{
+     if (sc->ungotc != EOF) {
+	  int c = sc->ungotc;
+	  sc->ungotc = EOF;
+	  return c;
+     }
+     return(sc->getchr(sc));
+}
+
+#define GETCHR(sc) mygetc(sc)
+
+static void myungetc(scanner *sc, int c)
+{
+     sc->ungotc = c;
+}
+
+#define UNGETCHR(sc, c) myungetc(sc, c)
+
+static void eat_blanks(scanner *sc)
+{
+     int ch;
+     while (ch = GETCHR(sc), isspace(ch))
+          ;
+     UNGETCHR(sc, ch);
+}
+
+static void mygets(scanner *sc, char *s, size_t maxlen)
+{
+     char *s0 = s;
+     int ch;
+
+     A(maxlen > 0);
+     while ((ch = GETCHR(sc)) != EOF && !isspace(ch)
+	    && ch != ')' && ch != '(' && s < s0 + maxlen)
+	  *s++ = ch;
+     *s = 0;
+     UNGETCHR(sc, ch);
+}
+
+static long getlong(scanner *sc, int base, int *ret)
+{
+     int sign = 1, ch, count;
+     long x = 0;     
+
+     ch = GETCHR(sc);
+     if (ch == '-' || ch == '+') {
+	  sign = ch == '-' ? -1 : 1;
+	  ch = GETCHR(sc);
+     }
+     for (count = 0; ; ++count) {
+	  if (isdigit(ch)) 
+	       ch -= '0';
+	  else if (isupper(ch))
+	       ch -= 'A' - 10;
+	  else if (islower(ch))
+	       ch -= 'a' - 10;
+	  else
+	       break;
+	  x = x * base + ch;
+	  ch = GETCHR(sc);
+     }
+     x *= sign;
+     UNGETCHR(sc, ch);
+     *ret = count > 0;
+     return x;
+}
+
+/* vscan is mostly scanf-like, with our additional format specifiers,
+   but with a few twists.  It returns simply 0 or 1 indicating whether
+   the match was successful. '(' and ')' in the format string match
+   those characters preceded by any whitespace.  Finally, if a
+   character match fails, it will ungetchr() the last character back
+   onto the stream. */
+static int vscan(scanner *sc, const char *format, va_list ap)
+{
+     const char *s = format;
+     char c;
+     int ch = 0;
+     size_t fmt_len;
+
+     while ((c = *s++)) {
+	  fmt_len = 0;
+          switch (c) {
+	      case '%':
+	  getformat:
+		   switch ((c = *s++)) {
+		       case 's': {
+			    char *x = va_arg(ap, char *);
+			    mygets(sc, x, fmt_len);
+			    break;
+		       }
+		       case 'd': {
+			    int *x = va_arg(ap, int *);
+			    *x = (int) getlong(sc, 10, &ch);
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case 'x': {
+			    int *x = va_arg(ap, int *);
+			    *x = (int) getlong(sc, 16, &ch);
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case 'M': {
+			    md5uint *x = va_arg(ap, md5uint *);
+			    *x = (md5uint)
+				    (0xffffffffUL & getlong(sc, 16, &ch));
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case '*': {
+			    if ((fmt_len = va_arg(ap, int)) <= 0) return 0;
+			    goto getformat;
+		       }
+		       default:
+			    A(0 /* unknown format */);
+			    break;
+		   }
+		   break;
+	      default:
+		   if (isspace(c) || c == '(' || c == ')')
+			eat_blanks(sc);
+		   if (!isspace(c) && (ch = GETCHR(sc)) != c) {
+			UNGETCHR(sc, ch);
+			return 0;
+		   }
+		   break;
+          }
+     }
+     return 1;
+}
+
+static int scan(scanner *sc, const char *format, ...)
+{
+     int ret;
+     va_list ap;
+     va_start(ap, format);
+     ret = vscan(sc, format, ap);
+     va_end(ap);
+     return ret;
+}
+
+scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc))
+{
+     scanner *s = (scanner *)MALLOC(size, OTHER);
+     s->scan = scan;
+     s->vscan = vscan;
+     s->getchr = getchr;
+     s->ungotc = EOF;
+     return s;
+}
+
+void X(scanner_destroy)(scanner *sc)
+{
+     X(ifree)(sc);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/solver.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/solver.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+solver *X(mksolver)(size_t size, const solver_adt *adt)
+{
+     solver *s = (solver *)MALLOC(size, SOLVERS);
+
+     s->adt = adt;
+     s->refcnt = 0;
+     return s;
+}
+
+void X(solver_use)(solver *ego)
+{
+     ++ego->refcnt;
+}
+
+void X(solver_destroy)(solver *ego)
+{
+     if ((--ego->refcnt) == 0) {
+	  if (ego->adt->destroy)
+	       ego->adt->destroy(ego);
+          X(ifree)(ego);
+     }
+}
+
+void X(solver_register)(planner *plnr, solver *s)
+{
+     plnr->adt->register_solver(plnr, s);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/solvtab.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/solvtab.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+void X(solvtab_exec)(const solvtab tbl, planner *p)
+{
+     for (; tbl->reg_nam; ++tbl) {
+	  p->cur_reg_nam = tbl->reg_nam;
+	  p->cur_reg_id = 0;
+	  tbl->reg(p);
+     }
+     p->cur_reg_nam = 0;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/stride.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/stride.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+const INT X(an_INT_guaranteed_to_be_zero) = 0;
+
+#ifdef PRECOMPUTE_ARRAY_INDICES
+stride X(mkstride)(INT n, INT s)
+{
+     int i;
+     INT *p = (INT *) MALLOC(n * sizeof(INT), STRIDES);
+
+     for (i = 0; i < n; ++i)
+          p[i] = s * i;
+
+     return p;
+}
+
+void X(stride_destroy)(stride p)
+{
+     X(ifree0)(p);
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+tensor *X(mktensor)(int rnk) 
+{
+     tensor *x;
+
+     A(rnk >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     if (FINITE_RNK(rnk) && rnk > 1)
+	  x = (tensor *)MALLOC(sizeof(tensor) + (rnk - 1) * sizeof(iodim),
+				    TENSORS);
+     else
+	  x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+#elif defined(STRUCT_HACK_C99)
+     if (FINITE_RNK(rnk))
+	  x = (tensor *)MALLOC(sizeof(tensor) + rnk * sizeof(iodim),
+				    TENSORS);
+     else
+	  x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+#else
+     x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+     if (FINITE_RNK(rnk) && rnk > 0)
+          x->dims = (iodim *)MALLOC(sizeof(iodim) * rnk, TENSORS);
+     else
+          x->dims = 0;
+#endif
+
+     x->rnk = rnk;
+     return x;
+}
+
+void X(tensor_destroy)(tensor *sz)
+{
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(sz->dims);
+#endif
+     X(ifree)(sz);
+}
+
+INT X(tensor_sz)(const tensor *sz)
+{
+     int i;
+     INT n = 1;
+
+     if (!FINITE_RNK(sz->rnk))
+          return 0;
+
+     for (i = 0; i < sz->rnk; ++i)
+          n *= sz->dims[i].n;
+     return n;
+}
+
+void X(tensor_md5)(md5 *p, const tensor *t)
+{
+     int i;
+     X(md5int)(p, t->rnk);
+     if (FINITE_RNK(t->rnk)) {
+	  for (i = 0; i < t->rnk; ++i) {
+	       const iodim *q = t->dims + i;
+	       X(md5INT)(p, q->n);
+	       X(md5INT)(p, q->is);
+	       X(md5INT)(p, q->os);
+	  }
+     }
+}
+
+/* treat a (rank <= 1)-tensor as a rank-1 tensor, extracting
+   appropriate n, is, and os components */
+int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os)
+{
+     A(t->rnk <= 1);
+     if (t->rnk == 1) {
+	  const iodim *vd = t->dims;
+          *n = vd[0].n;
+          *is = vd[0].is;
+          *os = vd[0].os;
+     } else {
+          *n = 1;
+          *is = *os = 0;
+     }
+     return 1;
+}
+
+void X(tensor_print)(const tensor *x, printer *p)
+{
+     if (FINITE_RNK(x->rnk)) {
+	  int i;
+	  int first = 1;
+	  p->print(p, "(");
+	  for (i = 0; i < x->rnk; ++i) {
+	       const iodim *d = x->dims + i;
+	       p->print(p, "%s(%D %D %D)", 
+			first ? "" : " ",
+			d->n, d->is, d->os);
+	       first = 0;
+	  }
+	  p->print(p, ")");
+     } else {
+	  p->print(p, "rank-minfty"); 
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+tensor *X(mktensor_0d)(void)
+{
+     return X(mktensor(0));
+}
+
+tensor *X(mktensor_1d)(INT n, INT is, INT os)
+{
+     tensor *x = X(mktensor)(1);
+     x->dims[0].n = n;
+     x->dims[0].is = is;
+     x->dims[0].os = os;
+     return x;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1)
+{
+     tensor *x = X(mktensor)(2);
+     x->dims[0].n = n0;
+     x->dims[0].is = is0;
+     x->dims[0].os = os0;
+     x->dims[1].n = n1;
+     x->dims[1].is = is1;
+     x->dims[1].os = os1;
+     return x;
+}
+
+
+tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2)
+{
+     tensor *x = X(mktensor)(3);
+     x->dims[0].n = n0;
+     x->dims[0].is = is0;
+     x->dims[0].os = os0;
+     x->dims[1].n = n1;
+     x->dims[1].is = is1;
+     x->dims[1].os = os1;
+     x->dims[2].n = n2;
+     x->dims[2].is = is2;
+     x->dims[2].os = os2;
+     return x;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+/* Currently, mktensor_4d and mktensor_5d are only used in the MPI
+   routines, where very complicated transpositions are required.
+   Therefore we split them into a separate source file. */
+
+tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2,
+		       INT n3, INT is3, INT os3)
+{
+     tensor *x = X(mktensor)(4);
+     x->dims[0].n = n0;
+     x->dims[0].is = is0;
+     x->dims[0].os = os0;
+     x->dims[1].n = n1;
+     x->dims[1].is = is1;
+     x->dims[1].os = os1;
+     x->dims[2].n = n2;
+     x->dims[2].is = is2;
+     x->dims[2].os = os2;
+     x->dims[3].n = n3;
+     x->dims[3].is = is3;
+     x->dims[3].os = os3;
+     return x;
+}
+
+tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
+		       INT n1, INT is1, INT os1,
+		       INT n2, INT is2, INT os2,
+		       INT n3, INT is3, INT os3,
+		       INT n4, INT is4, INT os4)
+{
+     tensor *x = X(mktensor)(5);
+     x->dims[0].n = n0;
+     x->dims[0].is = is0;
+     x->dims[0].os = os0;
+     x->dims[1].n = n1;
+     x->dims[1].is = is1;
+     x->dims[1].os = os1;
+     x->dims[2].n = n2;
+     x->dims[2].is = is2;
+     x->dims[2].os = os2;
+     x->dims[3].n = n3;
+     x->dims[3].is = is3;
+     x->dims[3].os = os3;
+     x->dims[4].n = n4;
+     x->dims[4].is = is4;
+     x->dims[4].os = os4;
+     return x;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+INT X(tensor_max_index)(const tensor *sz)
+{
+     int i;
+     INT ni = 0, no = 0;
+
+     A(FINITE_RNK(sz->rnk));
+     for (i = 0; i < sz->rnk; ++i) {
+          const iodim *p = sz->dims + i;
+          ni += (p->n - 1) * X(iabs)(p->is);
+          no += (p->n - 1) * X(iabs)(p->os);
+     }
+     return X(imax)(ni, no);
+}
+
+#define tensor_min_xstride(sz, xs) {			\
+     A(FINITE_RNK(sz->rnk));				\
+     if (sz->rnk == 0) return 0;			\
+     else {						\
+          int i;					\
+          INT s = X(iabs)(sz->dims[0].xs);		\
+          for (i = 1; i < sz->rnk; ++i)			\
+               s = X(imin)(s, X(iabs)(sz->dims[i].xs));	\
+          return s;					\
+     }							\
+}
+
+INT X(tensor_min_istride)(const tensor *sz) tensor_min_xstride(sz, is)
+INT X(tensor_min_ostride)(const tensor *sz) tensor_min_xstride(sz, os)
+
+INT X(tensor_min_stride)(const tensor *sz)
+{
+     return X(imin)(X(tensor_min_istride)(sz), X(tensor_min_ostride)(sz));
+}
+
+int X(tensor_inplace_strides)(const tensor *sz)
+{
+     int i;
+     A(FINITE_RNK(sz->rnk));
+     for (i = 0; i < sz->rnk; ++i) {
+          const iodim *p = sz->dims + i;
+          if (p->is != p->os)
+               return 0;
+     }
+     return 1;
+}
+
+int X(tensor_inplace_strides2)(const tensor *a, const tensor *b)
+{
+     return X(tensor_inplace_strides(a)) && X(tensor_inplace_strides(b));
+}
+
+/* return true (1) iff *any* strides of sz decrease when we
+   tensor_inplace_copy(sz, k). */
+static int tensor_strides_decrease(const tensor *sz, inplace_kind k)
+{
+     if (FINITE_RNK(sz->rnk)) {
+          int i;
+          for (i = 0; i < sz->rnk; ++i)
+               if ((sz->dims[i].os - sz->dims[i].is)
+                   * (k == INPLACE_OS ? (INT)1 : (INT)-1) < 0)
+                    return 1;
+     }
+     return 0;
+}
+
+/* Return true (1) iff *any* strides of sz decrease when we
+   tensor_inplace_copy(k) *or* if *all* strides of sz are unchanged
+   but *any* strides of vecsz decrease.  This is used in indirect.c
+   to determine whether to use INPLACE_IS or INPLACE_OS.
+
+   Note: X(tensor_strides_decrease)(sz, vecsz, INPLACE_IS)
+         || X(tensor_strides_decrease)(sz, vecsz, INPLACE_OS)
+         || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+   must always be true. */
+int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
+			       inplace_kind k)
+{
+     return(tensor_strides_decrease(sz, k)
+	    || (X(tensor_inplace_strides)(sz)
+		&& tensor_strides_decrease(vecsz, k)));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+static void dimcpy(iodim *dst, const iodim *src, int rnk)
+{
+     int i;
+     if (FINITE_RNK(rnk))
+          for (i = 0; i < rnk; ++i)
+               dst[i] = src[i];
+}
+
+tensor *X(tensor_copy)(const tensor *sz)
+{
+     tensor *x = X(mktensor)(sz->rnk);
+     dimcpy(x->dims, sz->dims, sz->rnk);
+     return x;
+}
+
+/* like X(tensor_copy), but makes strides in-place by
+   setting os = is if k == INPLACE_IS or is = os if k == INPLACE_OS. */
+tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k)
+{
+     tensor *x = X(tensor_copy)(sz);
+     if (FINITE_RNK(x->rnk)) {
+	  int i;
+	  if (k == INPLACE_OS)
+	       for (i = 0; i < x->rnk; ++i)
+		    x->dims[i].is = x->dims[i].os;
+	  else
+	       for (i = 0; i < x->rnk; ++i)
+		    x->dims[i].os = x->dims[i].is;
+     }
+     return x;
+}
+
+/* Like X(tensor_copy), but copy all of the dimensions *except*
+   except_dim. */
+tensor *X(tensor_copy_except)(const tensor *sz, int except_dim)
+{
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk) && sz->rnk >= 1 && except_dim < sz->rnk);
+     x = X(mktensor)(sz->rnk - 1);
+     dimcpy(x->dims, sz->dims, except_dim);
+     dimcpy(x->dims + except_dim, sz->dims + except_dim + 1,
+            x->rnk - except_dim);
+     return x;
+}
+
+/* Like X(tensor_copy), but copy only rnk dimensions starting
+   with start_dim. */
+tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk)
+{
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk);
+     x = X(mktensor)(rnk);
+     dimcpy(x->dims, sz->dims + start_dim, rnk);
+     return x;
+}
+
+tensor *X(tensor_append)(const tensor *a, const tensor *b)
+{
+     if (!FINITE_RNK(a->rnk) || !FINITE_RNK(b->rnk)) {
+          return X(mktensor)(RNK_MINFTY);
+     } else {
+	  tensor *x = X(mktensor)(a->rnk + b->rnk);
+          dimcpy(x->dims, a->dims, a->rnk);
+          dimcpy(x->dims + a->rnk, b->dims, b->rnk);
+	  return x;
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+static int signof(INT x)
+{
+     if (x < 0) return -1;
+     if (x == 0) return 0;
+     /* if (x > 0) */ return 1;
+}
+
+/* total order among iodim's */
+int X(dimcmp)(const iodim *a, const iodim *b)
+{
+     INT sai = X(iabs)(a->is), sbi = X(iabs)(b->is);
+     INT sao = X(iabs)(a->os), sbo = X(iabs)(b->os);
+     INT sam = X(imin)(sai, sao), sbm = X(imin)(sbi, sbo);
+
+     /* in descending order of min{istride, ostride} */
+     if (sam != sbm)
+	  return signof(sbm - sam);
+
+     /* in case of a tie, in descending order of istride */
+     if (sbi != sai)
+          return signof(sbi - sai);
+
+     /* in case of a tie, in descending order of ostride */
+     if (sbo != sao)
+          return signof(sbo - sao);
+
+     /* in case of a tie, in ascending order of n */
+     return signof(a->n - b->n);
+}
+
+static void canonicalize(tensor *x)
+{
+     if (x->rnk > 1) {
+	  qsort(x->dims, (size_t)x->rnk, sizeof(iodim),
+		(int (*)(const void *, const void *))X(dimcmp));
+     }
+}
+
+static int compare_by_istride(const iodim *a, const iodim *b)
+{
+     INT sai = X(iabs)(a->is), sbi = X(iabs)(b->is);
+
+     /* in descending order of istride */
+     return signof(sbi - sai);
+}
+
+static tensor *really_compress(const tensor *sz)
+{
+     int i, rnk;
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk));
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          A(sz->dims[i].n > 0);
+          if (sz->dims[i].n != 1)
+               ++rnk;
+     }
+
+     x = X(mktensor)(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          if (sz->dims[i].n != 1)
+               x->dims[rnk++] = sz->dims[i];
+     }
+     return x;
+}
+
+/* Like tensor_copy, but eliminate n == 1 dimensions, which
+   never affect any transform or transform vector.
+ 
+   Also, we sort the tensor into a canonical order of decreasing
+   strides (see X(dimcmp) for an exact definition).  In general,
+   processing a loop/array in order of decreasing stride will improve
+   locality.  Both forward and backwards traversal of the tensor are
+   considered e.g. by vrank-geq1, so sorting in increasing
+   vs. decreasing order is not really important. */
+tensor *X(tensor_compress)(const tensor *sz)
+{
+     tensor *x = really_compress(sz);
+     canonicalize(x);
+     return x;
+}
+
+/* Return whether the strides of a and b are such that they form an
+   effective contiguous 1d array.  Assumes that a.is >= b.is. */
+static int strides_contig(iodim *a, iodim *b)
+{
+     return (a->is == b->is * b->n && a->os == b->os * b->n);
+}
+
+/* Like tensor_compress, but also compress into one dimension any
+   group of dimensions that form a contiguous block of indices with
+   some stride.  (This can safely be done for transform vector sizes.) */
+tensor *X(tensor_compress_contiguous)(const tensor *sz)
+{
+     int i, rnk;
+     tensor *sz2, *x;
+
+     if (X(tensor_sz)(sz) == 0) 
+	  return X(mktensor)(RNK_MINFTY);
+
+     sz2 = really_compress(sz);
+     A(FINITE_RNK(sz2->rnk));
+
+     if (sz2->rnk <= 1) { /* nothing to compress. */ 
+	  if (0) {
+	       /* this call is redundant, because "sz->rnk <= 1" implies
+		  that the tensor is already canonical, but I am writing
+		  it explicitly because "logically" we need to canonicalize
+		  the tensor before returning. */
+	       canonicalize(sz2);
+	  }
+          return sz2;
+     }
+
+     /* sort in descending order of |istride|, so that compressible
+	dimensions appear contigously */
+     qsort(sz2->dims, (size_t)sz2->rnk, sizeof(iodim),
+		(int (*)(const void *, const void *))compare_by_istride);
+
+     /* compute what the rank will be after compression */
+     for (i = rnk = 1; i < sz2->rnk; ++i)
+          if (!strides_contig(sz2->dims + i - 1, sz2->dims + i))
+               ++rnk;
+
+     /* merge adjacent dimensions whenever possible */
+     x = X(mktensor)(rnk);
+     x->dims[0] = sz2->dims[0];
+     for (i = rnk = 1; i < sz2->rnk; ++i) {
+          if (strides_contig(sz2->dims + i - 1, sz2->dims + i)) {
+               x->dims[rnk - 1].n *= sz2->dims[i].n;
+               x->dims[rnk - 1].is = sz2->dims[i].is;
+               x->dims[rnk - 1].os = sz2->dims[i].os;
+          } else {
+               A(rnk < x->rnk);
+               x->dims[rnk++] = sz2->dims[i];
+          }
+     }
+
+     X(tensor_destroy)(sz2);
+
+     /* reduce to canonical form */
+     canonicalize(x);
+     return x;
+}
+
+/* The inverse of X(tensor_append): splits the sz tensor into
+   tensor a followed by tensor b, where a's rank is arnk. */
+void X(tensor_split)(const tensor *sz, tensor **a, int arnk, tensor **b)
+{
+     A(FINITE_RNK(sz->rnk) && FINITE_RNK(arnk));
+
+     *a = X(tensor_copy_sub)(sz, 0, arnk);
+     *b = X(tensor_copy_sub)(sz, arnk, sz->rnk - arnk);
+}
+
+/* TRUE if the two tensors are equal */
+int X(tensor_equal)(const tensor *a, const tensor *b)
+{
+     if (a->rnk != b->rnk)
+	  return 0;
+
+     if (FINITE_RNK(a->rnk)) {
+	  int i;
+	  for (i = 0; i < a->rnk; ++i) 
+	       if (0
+		   || a->dims[i].n != b->dims[i].n
+		   || a->dims[i].is != b->dims[i].is
+		   || a->dims[i].os != b->dims[i].os
+		    )
+		    return 0;
+     }
+
+     return 1;
+}
+
+/* TRUE if the sets of input and output locations described by
+   (append sz vecsz) are the same */
+int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz)
+{
+     tensor *t = X(tensor_append)(sz, vecsz);
+     tensor *ti = X(tensor_copy_inplace)(t, INPLACE_IS);
+     tensor *to = X(tensor_copy_inplace)(t, INPLACE_OS);
+     tensor *tic = X(tensor_compress_contiguous)(ti);
+     tensor *toc = X(tensor_compress_contiguous)(to);
+
+     int retval = X(tensor_equal)(tic, toc);
+
+     X(tensor_destroy)(t);
+     X(tensor_destroy4)(ti, to, tic, toc);
+
+     return retval;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+void X(tensor_destroy2)(tensor *a, tensor *b)
+{
+     X(tensor_destroy)(a);
+     X(tensor_destroy)(b);
+}
+
+void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d)
+{
+     X(tensor_destroy2)(a, b);
+     X(tensor_destroy2)(c, d);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tensor9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tensor9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+int X(tensor_kosherp)(const tensor *x)
+{
+     int i;
+
+     if (x->rnk < 0) return 0;
+
+     if (FINITE_RNK(x->rnk)) {
+	  for (i = 0; i < x->rnk; ++i)
+	       if (x->dims[i].n < 0)
+		    return 0;
+     }
+     return 1;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/tile2d.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/tile2d.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* out of place 2D copy routines */
+#include "ifftw.h"
+
+void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
+	       void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
+	       void *args)
+{
+     INT d0, d1;
+
+     A(tilesz > 0); /* infinite loops otherwise */
+     
+ tail:
+     d0 = n0u - n0l;
+     d1 = n1u - n1l;
+
+     if (d0 >= d1 && d0 > tilesz) {
+	  INT n0m = (n0u + n0l) / 2;
+	  X(tile2d)(n0l, n0m, n1l, n1u, tilesz, f, args);
+	  n0l = n0m; goto tail;
+     } else if (/* d1 >= d0 && */ d1 > tilesz) {
+	  INT n1m = (n1u + n1l) / 2;
+	  X(tile2d)(n0l, n0u, n1l, n1m, tilesz, f, args);
+	  n1l = n1m; goto tail;
+     } else {
+	  f(n0l, n0u, n1l, n1u, args);
+     }
+}
+
+INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache)
+{
+     return X(isqrt)(CACHESIZE / 
+		     (((INT)sizeof(R)) * vl * (INT)how_many_tiles_in_cache));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/timer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/timer.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#ifdef HAVE_UNISTD_H
+#  include <unistd.h>
+#endif
+
+#ifndef WITH_SLOW_TIMER
+#  include "cycle.h"
+#endif
+
+#ifndef FFTW_TIME_LIMIT
+#define FFTW_TIME_LIMIT 2.0  /* don't run for more than two seconds */
+#endif
+
+/* the following code is disabled for now, because it seems to
+   require that we #include <windows.h> in ifftw.h to 
+   typedef LARGE_INTEGER crude_time, and this pulls in the whole
+   Windows universe and leads to namespace conflicts (unless
+   we did some hack like assuming sizeof(LARGE_INTEGER) == sizeof(long long).
+   gettimeofday is provided by MinGW, which we use to cross-compile
+   FFTW for Windows, and this seems to work well enough */
+#if 0 && (defined(__WIN32__) || defined(_WIN32) || defined(_WIN64))
+crude_time X(get_crude_time)(void)
+{
+     crude_time tv;
+     QueryPerformanceCounter(&tv);
+     return tv;
+}
+
+static double elapsed_since(crude_time t0)
+{
+     crude_time t1, freq;
+     QueryPerformanceCounter(&t1);
+     QueryPerformanceFrequency(&freq);
+     return (((double) (t1.QuadPart - t0.QuadPart))) /
+	  ((double) freq.QuadPart);
+}
+
+#  define TIME_MIN_SEC 1.0e-2
+
+#elif defined(HAVE_GETTIMEOFDAY)
+crude_time X(get_crude_time)(void)
+{
+     crude_time tv;
+     gettimeofday(&tv, 0);
+     return tv;
+}
+
+#define elapsed_sec(t1,t0) ((double)(t1.tv_sec - t0.tv_sec) +		\
+			    (double)(t1.tv_usec - t0.tv_usec) * 1.0E-6)
+
+static double elapsed_since(crude_time t0)
+{
+     crude_time t1;
+     gettimeofday(&t1, 0);
+     return elapsed_sec(t1, t0);
+}
+
+#  define TIME_MIN_SEC 1.0e-3
+
+#else /* !HAVE_GETTIMEOFDAY */
+
+/* Note that the only system where we are likely to need to fall back
+   on the clock() function is Windows, for which CLOCKS_PER_SEC is 1000
+   and thus the clock wraps once every 50 days.  This should hopefully
+   be longer than the time required to create any single plan! */
+crude_time X(get_crude_time)(void) { return clock(); }
+
+#define elapsed_sec(t1,t0) ((double) ((t1) - (t0)) / CLOCKS_PER_SEC)
+
+static double elapsed_since(crude_time t0)
+{
+     return elapsed_sec(clock(), t0);
+}
+
+#  define TIME_MIN_SEC 2.0e-1 /* from fftw2 */
+
+#endif /* !HAVE_GETTIMEOFDAY */
+
+double X(elapsed_since)(const planner *plnr, const problem *p, crude_time t0)
+{
+     double t = elapsed_since(t0);
+     if (plnr->cost_hook)
+	  t = plnr->cost_hook(p, t, COST_MAX);
+     return t;
+}
+
+#ifdef WITH_SLOW_TIMER
+/* excruciatingly slow; only use this if there is no choice! */
+typedef crude_time ticks;
+#  define getticks X(get_crude_time)
+#  define elapsed(t1,t0) elapsed_sec(t1,t0)
+#  define TIME_MIN TIME_MIN_SEC
+#  define TIME_REPEAT 4 /* from fftw2 */
+#  define HAVE_TICK_COUNTER
+#endif
+
+#ifdef HAVE_TICK_COUNTER
+
+#  ifndef TIME_MIN
+#    define TIME_MIN 100.0
+#  endif
+
+#  ifndef TIME_REPEAT
+#    define TIME_REPEAT 8
+#  endif
+
+  static double measure(plan *pln, const problem *p, int iter)
+  {
+       ticks t0, t1;
+       int i;
+
+       t0 = getticks();
+       for (i = 0; i < iter; ++i) 
+	    pln->adt->solve(pln, p);
+       t1 = getticks();
+       return elapsed(t1, t0);
+  }
+
+
+  double X(measure_execution_time)(const planner *plnr, 
+				   plan *pln, const problem *p)
+  {
+       int iter;
+       int repeat;
+
+       X(plan_awake)(pln, AWAKE_ZERO);
+       p->adt->zero(p);
+
+  start_over:
+       for (iter = 1; iter; iter *= 2) {
+	    double tmin = 0;
+	    int first = 1;
+	    crude_time begin = X(get_crude_time)();
+
+	    /* repeat the measurement TIME_REPEAT times */
+	    for (repeat = 0; repeat < TIME_REPEAT; ++repeat) {
+		 double t = measure(pln, p, iter);
+		 
+		 if (plnr->cost_hook)
+		      t = plnr->cost_hook(p, t, COST_MAX);
+		 if (t < 0)
+		      goto start_over;
+
+		 if (first || t < tmin)
+		      tmin = t;
+		 first = 0;
+
+		 /* do not run for too long */
+		 if (X(elapsed_since)(plnr, p, begin) > FFTW_TIME_LIMIT)
+		      break;
+	    }
+
+	    if (tmin >= TIME_MIN) {
+		 X(plan_awake)(pln, SLEEPY);
+		 return tmin / (double) iter;
+	    }
+       }
+       goto start_over; /* may happen if timer is screwed up */
+  }
+
+#else /* no cycle counter */
+
+  double X(measure_execution_time)(const planner *plnr, 
+				   plan *pln, const problem *p)
+  {
+       UNUSED(plnr);
+       UNUSED(p);
+       UNUSED(pln);
+       return -1.0;
+  }
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/transpose.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/transpose.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/* in place square transposition, iterative */
+void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
+{
+     INT i0, i1, v;
+
+     switch (vl) {
+	 case 1:
+	      for (i1 = 1; i1 < n; ++i1) {
+		   for (i0 = 0; i0 < i1; ++i0) {
+			R x0 = I[i1 * s0 + i0 * s1];
+			R y0 = I[i1 * s1 + i0 * s0];
+			I[i1 * s1 + i0 * s0] = x0;
+			I[i1 * s0 + i0 * s1] = y0;
+		   }
+	      }
+	      break;
+	 case 2:
+	      for (i1 = 1; i1 < n; ++i1) {
+		   for (i0 = 0; i0 < i1; ++i0) {
+			R x0 = I[i1 * s0 + i0 * s1];
+			R x1 = I[i1 * s0 + i0 * s1 + 1];
+			R y0 = I[i1 * s1 + i0 * s0];
+			R y1 = I[i1 * s1 + i0 * s0 + 1];
+			I[i1 * s1 + i0 * s0] = x0;
+			I[i1 * s1 + i0 * s0 + 1] = x1;
+			I[i1 * s0 + i0 * s1] = y0;
+			I[i1 * s0 + i0 * s1 + 1] = y1;
+		   }
+	      }
+	      break;
+	 default:
+	      for (i1 = 1; i1 < n; ++i1) {
+		   for (i0 = 0; i0 < i1; ++i0) {
+			for (v = 0; v < vl; ++v) {
+			     R x0 = I[i1 * s0 + i0 * s1 + v];
+			     R y0 = I[i1 * s1 + i0 * s0 + v];
+			     I[i1 * s1 + i0 * s0 + v] = x0;
+			     I[i1 * s0 + i0 * s1 + v] = y0;
+			}
+		   }
+	      }
+	      break;
+     }
+}
+
+struct transpose_closure {
+     R *I;
+     INT s0, s1, vl, tilesz;
+     R *buf0, *buf1; 
+};
+
+static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
+{
+     struct transpose_closure *k = (struct transpose_closure *)args;
+     R *I = k->I;
+     INT s0 = k->s0, s1 = k->s1, vl = k->vl;
+     INT i0, i1, v;
+
+     switch (vl) {
+	 case 1:
+	      for (i1 = n1l; i1 < n1u; ++i1) {
+		   for (i0 = n0l; i0 < n0u; ++i0) {
+			R x0 = I[i1 * s0 + i0 * s1];
+			R y0 = I[i1 * s1 + i0 * s0];
+			I[i1 * s1 + i0 * s0] = x0;
+			I[i1 * s0 + i0 * s1] = y0;
+		   }
+	      }
+	      break;
+	 case 2:
+	      for (i1 = n1l; i1 < n1u; ++i1) {
+		   for (i0 = n0l; i0 < n0u; ++i0) {
+			R x0 = I[i1 * s0 + i0 * s1];
+			R x1 = I[i1 * s0 + i0 * s1 + 1];
+			R y0 = I[i1 * s1 + i0 * s0];
+			R y1 = I[i1 * s1 + i0 * s0 + 1];
+			I[i1 * s1 + i0 * s0] = x0;
+			I[i1 * s1 + i0 * s0 + 1] = x1;
+			I[i1 * s0 + i0 * s1] = y0;
+			I[i1 * s0 + i0 * s1 + 1] = y1;
+		   }
+	      }
+	      break;
+	 default:
+	      for (i1 = n1l; i1 < n1u; ++i1) {
+		   for (i0 = n0l; i0 < n0u; ++i0) {
+			for (v = 0; v < vl; ++v) {
+			     R x0 = I[i1 * s0 + i0 * s1 + v];
+			     R y0 = I[i1 * s1 + i0 * s0 + v];
+			     I[i1 * s1 + i0 * s0 + v] = x0;
+			     I[i1 * s0 + i0 * s1 + v] = y0;
+			}
+		   }
+	      }
+     }
+}
+
+static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
+{
+     struct transpose_closure *k = (struct transpose_closure *)args;
+     X(cpy2d_ci)(k->I + n0l * k->s0 + n1l * k->s1,
+		 k->buf0,
+		 n0u - n0l, k->s0, k->vl,
+		 n1u - n1l, k->s1, k->vl * (n0u - n0l),
+		 k->vl);
+     X(cpy2d_ci)(k->I + n0l * k->s1 + n1l * k->s0,
+		 k->buf1,
+		 n0u - n0l, k->s1, k->vl,
+		 n1u - n1l, k->s0, k->vl * (n0u - n0l),
+		 k->vl);
+     X(cpy2d_co)(k->buf1,
+		 k->I + n0l * k->s0 + n1l * k->s1,
+		 n0u - n0l, k->vl, k->s0,
+		 n1u - n1l, k->vl * (n0u - n0l), k->s1,
+		 k->vl);
+     X(cpy2d_co)(k->buf0,
+		 k->I + n0l * k->s1 + n1l * k->s0,
+		 n0u - n0l, k->vl, k->s1,
+		 n1u - n1l, k->vl * (n0u - n0l), k->s0,
+		 k->vl);
+}
+
+static void transpose_rec(R *I, INT n,
+			  void (*f)(INT n0l, INT n0u, INT n1l, INT n1u,
+				    void *args),
+			  struct transpose_closure *k)
+{
+   tail:
+     if (n > 1) {
+	  INT n2 = n / 2;
+	  k->I = I;
+	  X(tile2d)(0, n2, n2, n, k->tilesz, f, k);
+	  transpose_rec(I, n2, f, k);
+	  I += n2 * (k->s0 + k->s1); n -= n2; goto tail;
+     }
+}
+
+void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl) 
+{
+     struct transpose_closure k;
+     k.s0 = s0;
+     k.s1 = s1;
+     k.vl = vl;
+     /* two blocks must be in cache, to be swapped */
+     k.tilesz = X(compute_tilesz)(vl, 2);
+     k.buf0 = k.buf1 = 0; /* unused */
+     transpose_rec(I, n, dotile, &k);
+}
+
+void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl) 
+{
+     struct transpose_closure k;
+     /* Assume that the the rows of I conflict into the same cache
+        lines, and therefore we don't need to reserve cache space for
+        the input.  If the rows don't conflict, there is no reason
+	to use tiledbuf at all.*/
+     R buf0[CACHESIZE / (2 * sizeof(R))];
+     R buf1[CACHESIZE / (2 * sizeof(R))];
+     k.s0 = s0;
+     k.s1 = s1;
+     k.vl = vl;
+     k.tilesz = X(compute_tilesz)(vl, 2);
+     k.buf0 = buf0;
+     k.buf1 = buf1;
+     A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf0));
+     A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf1));
+     transpose_rec(I, n, dotile_buf, &k);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/trig.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/trig.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* trigonometric functions */
+#include "ifftw.h"
+#include <math.h>
+
+#if defined(TRIGREAL_IS_LONG_DOUBLE)
+#  define COS cosl
+#  define SIN sinl
+#  define KTRIG(x) (x##L)
+#  ifndef HAVE_DECL_SINL
+     extern long double sinl(long double x);
+#  endif
+#  ifndef HAVE_DECL_COSL
+     extern long double cosl(long double x);
+#  endif
+#elif defined(TRIGREAL_IS_QUAD)
+#  define COS cosq
+#  define SIN sinq
+#  define KTRIG(x) (x##Q)
+   extern __float128 sinq(__float128 x);
+   extern __float128 cosq(__float128 x);
+#else
+#  define COS cos
+#  define SIN sin
+#  define KTRIG(x) (x)
+#endif
+
+static const trigreal K2PI =
+    KTRIG(6.2831853071795864769252867665590057683943388);
+#define by2pi(m, n) ((K2PI * (m)) / (n))
+
+/*
+ * Improve accuracy by reducing x to range [0..1/8]
+ * before multiplication by 2 * PI.
+ */
+
+static void real_cexp(INT m, INT n, trigreal *out)
+{
+     trigreal theta, c, s, t;
+     unsigned octant = 0;
+     INT quarter_n = n;
+
+     n += n; n += n;
+     m += m; m += m;
+
+     if (m < 0) m += n;
+     if (m > n - m) { m = n - m; octant |= 4; }
+     if (m - quarter_n > 0) { m = m - quarter_n; octant |= 2; }
+     if (m > quarter_n - m) { m = quarter_n - m; octant |= 1; }
+
+     theta = by2pi(m, n);
+     c = COS(theta); s = SIN(theta);
+
+     if (octant & 1) { t = c; c = s; s = t; }
+     if (octant & 2) { t = c; c = -s; s = t; }
+     if (octant & 4) { s = -s; }
+
+     out[0] = c; 
+     out[1] = s; 
+}
+
+static INT choose_twshft(INT n)
+{
+     INT log2r = 0;
+     while (n > 0) {
+	  ++log2r;
+	  n /= 4;
+     }
+     return log2r;
+}
+
+static void cexpl_sqrtn_table(triggen *p, INT m, trigreal *res)
+{
+     m += p->n * (m < 0);
+
+     {
+	  INT m0 = m & p->twmsk;
+	  INT m1 = m >> p->twshft;
+	  trigreal wr0 = p->W0[2 * m0];
+	  trigreal wi0 = p->W0[2 * m0 + 1];
+	  trigreal wr1 = p->W1[2 * m1];
+	  trigreal wi1 = p->W1[2 * m1 + 1];
+
+	  res[0] = wr1 * wr0 - wi1 * wi0;
+	  res[1] = wi1 * wr0 + wr1 * wi0;
+     }
+}
+
+/* multiply (xr, xi) by exp(FFT_SIGN * 2*pi*i*m/n) */
+static void rotate_sqrtn_table(triggen *p, INT m, R xr, R xi, R *res)
+{
+     m += p->n * (m < 0);
+
+     {
+	  INT m0 = m & p->twmsk;
+	  INT m1 = m >> p->twshft;
+	  trigreal wr0 = p->W0[2 * m0];
+	  trigreal wi0 = p->W0[2 * m0 + 1];
+	  trigreal wr1 = p->W1[2 * m1];
+	  trigreal wi1 = p->W1[2 * m1 + 1];
+	  trigreal wr = wr1 * wr0 - wi1 * wi0;
+	  trigreal wi = wi1 * wr0 + wr1 * wi0;
+
+#if FFT_SIGN == -1
+	  res[0] = xr * wr + xi * wi;
+	  res[1] = xi * wr - xr * wi;
+#else
+	  res[0] = xr * wr - xi * wi;
+	  res[1] = xi * wr + xr * wi;
+#endif
+     }
+}
+
+static void cexpl_sincos(triggen *p, INT m, trigreal *res)
+{
+     real_cexp(m, p->n, res);
+}
+
+static void cexp_zero(triggen *p, INT m, R *res)
+{
+     UNUSED(p); UNUSED(m);
+     res[0] = 0;
+     res[1] = 0;
+}
+
+static void cexpl_zero(triggen *p, INT m, trigreal *res)
+{
+     UNUSED(p); UNUSED(m);
+     res[0] = 0;
+     res[1] = 0;
+}
+
+static void cexp_generic(triggen *p, INT m, R *res)
+{
+     trigreal resl[2];
+     p->cexpl(p, m, resl);
+     res[0] = (R)resl[0];
+     res[1] = (R)resl[1];
+}
+
+static void rotate_generic(triggen *p, INT m, R xr, R xi, R *res)
+{
+     trigreal w[2];
+     p->cexpl(p, m, w);
+     res[0] = xr * w[0] - xi * (FFT_SIGN * w[1]);
+     res[1] = xi * w[0] + xr * (FFT_SIGN * w[1]);
+}
+
+triggen *X(mktriggen)(enum wakefulness wakefulness, INT n)
+{
+     INT i, n0, n1;
+     triggen *p = (triggen *)MALLOC(sizeof(*p), TWIDDLES);
+
+     p->n = n;
+     p->W0 = p->W1 = 0;
+     p->cexp = 0;
+     p->rotate = 0;
+
+     switch (wakefulness) {
+	 case SLEEPY:
+	      A(0 /* can't happen */);
+	      break;
+
+	 case AWAKE_SQRTN_TABLE: {
+	      INT twshft = choose_twshft(n);
+
+	      p->twshft = twshft;
+	      p->twradix = ((INT)1) << twshft;
+	      p->twmsk = p->twradix - 1;
+
+	      n0 = p->twradix;
+	      n1 = (n + n0 - 1) / n0;
+
+	      p->W0 = (trigreal *)MALLOC(n0 * 2 * sizeof(trigreal), TWIDDLES);
+	      p->W1 = (trigreal *)MALLOC(n1 * 2 * sizeof(trigreal), TWIDDLES);
+
+	      for (i = 0; i < n0; ++i) 
+		   real_cexp(i, n, p->W0 + 2 * i);
+
+	      for (i = 0; i < n1; ++i) 
+		   real_cexp(i * p->twradix, n, p->W1 + 2 * i);
+
+	      p->cexpl = cexpl_sqrtn_table;
+	      p->rotate = rotate_sqrtn_table;
+	      break;
+	 }
+
+	 case AWAKE_SINCOS: 
+	      p->cexpl = cexpl_sincos;
+	      break;
+
+	 case AWAKE_ZERO: 
+	      p->cexp = cexp_zero;
+	      p->cexpl = cexpl_zero;
+	      break;
+     }
+
+     if (!p->cexp) {
+	  if (sizeof(trigreal) == sizeof(R))
+	       p->cexp = (void (*)(triggen *, INT, R *))p->cexpl;
+	  else
+	       p->cexp = cexp_generic;
+     }
+     if (!p->rotate)     
+	       p->rotate = rotate_generic;
+     return p;
+}
+
+void X(triggen_destroy)(triggen *p)
+{
+     X(ifree0)(p->W0);
+     X(ifree0)(p->W1);
+     X(ifree)(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/kernel/twiddle.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/kernel/twiddle.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Twiddle manipulation */
+
+#include "ifftw.h"
+#include <math.h>
+
+#define HASHSZ 109
+
+/* hash table of known twiddle factors */
+static twid *twlist[HASHSZ];
+
+static INT hash(INT n, INT r)
+{
+     INT h = n * 17 + r;
+
+     if (h < 0) h = -h;
+
+     return (h % HASHSZ);
+}
+
+static int equal_instr(const tw_instr *p, const tw_instr *q)
+{
+     if (p == q)
+          return 1;
+
+     for (;; ++p, ++q) {
+          if (p->op != q->op)
+	       return 0;
+
+	  switch (p->op) {
+	      case TW_NEXT:
+		   return (p->v == q->v); /* p->i is ignored */
+
+	      case TW_FULL:
+	      case TW_HALF:
+		   if (p->v != q->v) return 0; /* p->i is ignored */
+		   break;
+
+	      default:
+		   if (p->v != q->v || p->i != q->i) return 0;
+		   break;
+	  }
+     }
+     A(0 /* can't happen */);
+}
+
+static int ok_twid(const twid *t, 
+		   enum wakefulness wakefulness,
+		   const tw_instr *q, INT n, INT r, INT m)
+{
+     return (wakefulness == t->wakefulness &&
+	     n == t->n &&
+	     r == t->r && 
+	     m <= t->m && 
+	     equal_instr(t->instr, q));
+}
+
+static twid *lookup(enum wakefulness wakefulness,
+		    const tw_instr *q, INT n, INT r, INT m)
+{
+     twid *p;
+
+     for (p = twlist[hash(n,r)]; 
+	  p && !ok_twid(p, wakefulness, q, n, r, m); 
+	  p = p->cdr)
+          ;
+     return p;
+}
+
+static INT twlen0(INT r, const tw_instr *p, INT *vl)
+{
+     INT ntwiddle = 0;
+
+     /* compute length of bytecode program */
+     A(r > 0);
+     for ( ; p->op != TW_NEXT; ++p) {
+	  switch (p->op) {
+	      case TW_FULL:
+		   ntwiddle += (r - 1) * 2;
+		   break;
+	      case TW_HALF:
+		   ntwiddle += (r - 1);
+		   break;
+	      case TW_CEXP:
+		   ntwiddle += 2;
+		   break;
+	      case TW_COS:
+	      case TW_SIN:
+		   ntwiddle += 1;
+		   break;
+	  }
+     }
+
+     *vl = (INT)p->v;
+     return ntwiddle;
+}
+
+INT X(twiddle_length)(INT r, const tw_instr *p)
+{
+     INT vl;
+     return twlen0(r, p, &vl);
+}
+
+static R *compute(enum wakefulness wakefulness,
+		  const tw_instr *instr, INT n, INT r, INT m)
+{
+     INT ntwiddle, j, vl;
+     R *W, *W0;
+     const tw_instr *p;
+     triggen *t = X(mktriggen)(wakefulness, n);
+
+     p = instr;
+     ntwiddle = twlen0(r, p, &vl);
+
+     A(m % vl == 0);
+
+     W0 = W = (R *)MALLOC((ntwiddle * (m / vl)) * sizeof(R), TWIDDLES);
+
+     for (j = 0; j < m; j += vl) {
+          for (p = instr; p->op != TW_NEXT; ++p) {
+	       switch (p->op) {
+		   case TW_FULL: {
+			INT i;
+			for (i = 1; i < r; ++i) {
+			     A((j + (INT)p->v) * i < n);
+			     A((j + (INT)p->v) * i > -n);
+			     t->cexp(t, (j + (INT)p->v) * i, W);
+			     W += 2;
+			}
+			break;
+		   }
+
+		   case TW_HALF: {
+			INT i;
+			A((r % 2) == 1);
+			for (i = 1; i + i < r; ++i) {
+			     t->cexp(t, MULMOD(i, (j + (INT)p->v), n), W);
+			     W += 2;
+			}
+			break;
+		   }
+
+		   case TW_COS: {
+			R d[2];
+
+			A((j + (INT)p->v) * p->i < n);
+			A((j + (INT)p->v) * p->i > -n);
+			t->cexp(t, (j + (INT)p->v) * (INT)p->i, d);
+			*W++ = d[0];
+			break;
+		   }
+
+		   case TW_SIN: {
+			R d[2];
+
+			A((j + (INT)p->v) * p->i < n);
+			A((j + (INT)p->v) * p->i > -n);
+			t->cexp(t, (j + (INT)p->v) * (INT)p->i, d);
+			*W++ = d[1];
+			break;
+		   }
+
+		   case TW_CEXP:
+			A((j + (INT)p->v) * p->i < n);
+			A((j + (INT)p->v) * p->i > -n);
+			t->cexp(t, (j + (INT)p->v) * (INT)p->i, W);
+			W += 2;
+			break;
+	       }
+	  }
+     }
+
+     X(triggen_destroy)(t);
+     return W0;
+}
+
+static void mktwiddle(enum wakefulness wakefulness,
+		      twid **pp, const tw_instr *instr, INT n, INT r, INT m)
+{
+     twid *p;
+     INT h;
+
+     if ((p = lookup(wakefulness, instr, n, r, m))) {
+          ++p->refcnt;
+     } else {
+	  p = (twid *) MALLOC(sizeof(twid), TWIDDLES);
+	  p->n = n;
+	  p->r = r;
+	  p->m = m;
+	  p->instr = instr;
+	  p->refcnt = 1;
+	  p->wakefulness = wakefulness;
+	  p->W = compute(wakefulness, instr, n, r, m);
+
+	  /* cons! onto twlist */
+	  h = hash(n, r);
+	  p->cdr = twlist[h];
+	  twlist[h] = p;
+     }
+
+     *pp = p;
+}
+
+static void twiddle_destroy(twid **pp)
+{
+     twid *p = *pp;
+     twid **q;
+
+     if ((--p->refcnt) == 0) {
+	  /* remove p from twiddle list */
+	  for (q = &twlist[hash(p->n, p->r)]; *q; q = &((*q)->cdr)) {
+	       if (*q == p) {
+		    *q = p->cdr;
+		    X(ifree)(p->W);
+		    X(ifree)(p);
+		    *pp = 0;
+		    return;
+	       }
+	  }
+	  A(0 /* can't happen */ );
+     }
+}
+
+
+void X(twiddle_awake)(enum wakefulness wakefulness, twid **pp, 
+		      const tw_instr *instr, INT n, INT r, INT m)
+{
+     switch (wakefulness) {
+	 case SLEEPY: 
+	      twiddle_destroy(pp);
+	      break;
+	 default:
+	      mktwiddle(wakefulness, pp, instr, n, r, m);
+	      break;
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,17 @@
+noinst_LIBRARIES=libbench2.a
+
+libbench2_a_SOURCES=after-ccopy-from.c after-ccopy-to.c			\
+after-hccopy-from.c after-hccopy-to.c after-rcopy-from.c		\
+after-rcopy-to.c allocate.c aset.c bench-cost-postprocess.c		\
+bench-exit.c bench-main.c can-do.c caset.c dotens2.c info.c main.c	\
+mflops.c mp.c ovtpvt.c pow2.c problem.c report.c speed.c tensor.c	\
+timer.c useropt.c util.c verify-dft.c verify-lib.c verify-r2r.c		\
+verify-rdft2.c verify.c zero.c bench-user.h bench.h verify.h		\
+my-getopt.c my-getopt.h
+
+benchmark: all
+	@echo "nothing to benchmark"
+
+accuracy: all
+	@echo "nothing to benchmark"
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,589 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = libbench2
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LIBRARIES = $(noinst_LIBRARIES)
+ARFLAGS = cru
+libbench2_a_AR = $(AR) $(ARFLAGS)
+libbench2_a_LIBADD =
+am_libbench2_a_OBJECTS = after-ccopy-from.$(OBJEXT) \
+	after-ccopy-to.$(OBJEXT) after-hccopy-from.$(OBJEXT) \
+	after-hccopy-to.$(OBJEXT) after-rcopy-from.$(OBJEXT) \
+	after-rcopy-to.$(OBJEXT) allocate.$(OBJEXT) aset.$(OBJEXT) \
+	bench-cost-postprocess.$(OBJEXT) bench-exit.$(OBJEXT) \
+	bench-main.$(OBJEXT) can-do.$(OBJEXT) caset.$(OBJEXT) \
+	dotens2.$(OBJEXT) info.$(OBJEXT) main.$(OBJEXT) \
+	mflops.$(OBJEXT) mp.$(OBJEXT) ovtpvt.$(OBJEXT) pow2.$(OBJEXT) \
+	problem.$(OBJEXT) report.$(OBJEXT) speed.$(OBJEXT) \
+	tensor.$(OBJEXT) timer.$(OBJEXT) useropt.$(OBJEXT) \
+	util.$(OBJEXT) verify-dft.$(OBJEXT) verify-lib.$(OBJEXT) \
+	verify-r2r.$(OBJEXT) verify-rdft2.$(OBJEXT) verify.$(OBJEXT) \
+	zero.$(OBJEXT) my-getopt.$(OBJEXT)
+libbench2_a_OBJECTS = $(am_libbench2_a_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libbench2_a_SOURCES)
+DIST_SOURCES = $(libbench2_a_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+noinst_LIBRARIES = libbench2.a
+libbench2_a_SOURCES = after-ccopy-from.c after-ccopy-to.c			\
+after-hccopy-from.c after-hccopy-to.c after-rcopy-from.c		\
+after-rcopy-to.c allocate.c aset.c bench-cost-postprocess.c		\
+bench-exit.c bench-main.c can-do.c caset.c dotens2.c info.c main.c	\
+mflops.c mp.c ovtpvt.c pow2.c problem.c report.c speed.c tensor.c	\
+timer.c useropt.c util.c verify-dft.c verify-lib.c verify-r2r.c		\
+verify-rdft2.c verify.c zero.c bench-user.h bench.h verify.h		\
+my-getopt.c my-getopt.h
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libbench2/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu libbench2/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLIBRARIES:
+	-test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES)
+libbench2.a: $(libbench2_a_OBJECTS) $(libbench2_a_DEPENDENCIES) $(EXTRA_libbench2_a_DEPENDENCIES) 
+	-rm -f libbench2.a
+	$(libbench2_a_AR) libbench2.a $(libbench2_a_OBJECTS) $(libbench2_a_LIBADD)
+	$(RANLIB) libbench2.a
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-ccopy-from.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-ccopy-to.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-hccopy-from.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-hccopy-to.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-rcopy-from.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-rcopy-to.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/allocate.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aset.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-cost-postprocess.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-exit.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-main.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/can-do.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/caset.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dotens2.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/info.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mflops.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mp.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/my-getopt.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ovtpvt.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pow2.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/report.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/speed.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/timer.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/useropt.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/util.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-dft.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-lib.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-r2r.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-rdft2.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zero.Po@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+benchmark: all
+	@echo "nothing to benchmark"
+
+accuracy: all
+	@echo "nothing to benchmark"
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-ccopy-from.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-ccopy-from.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(p);
+     UNUSED(ri);
+     UNUSED(ii);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-ccopy-to.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-ccopy-to.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(p);
+     UNUSED(ro);
+     UNUSED(io);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-hccopy-from.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-hccopy-from.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(p);
+     UNUSED(ri);
+     UNUSED(ii);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-hccopy-to.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-hccopy-to.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(p);
+     UNUSED(ro);
+     UNUSED(io);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-rcopy-from.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-rcopy-from.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,9 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
+{
+     UNUSED(p);
+     UNUSED(ri);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/after-rcopy-to.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/after-rcopy-to.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,9 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
+{
+     UNUSED(p);
+     UNUSED(ro);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/allocate.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/allocate.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,110 @@
+/* not worth copyrighting */
+
+
+#include "bench.h"
+
+static void bounds(bench_problem *p, int *ilb, int *iub, int *olb, int *oub)
+{
+     bench_tensor *t = tensor_append(p->sz, p->vecsz);
+     tensor_ibounds(t, ilb, iub);
+     tensor_obounds(t, olb, oub);
+     tensor_destroy(t);
+}
+
+/*
+ * Allocate I/O arrays for a problem.
+ *
+ * This is the default routine that can be overridden by the user in
+ * complicated cases.
+ */
+void problem_alloc(bench_problem *p)
+{
+     int ilb, iub, olb, oub;
+     int isz, osz;
+
+     bounds(p, &ilb, &iub, &olb, &oub);
+     isz = iub - ilb;
+     osz = oub - olb;
+
+     if (p->kind == PROBLEM_COMPLEX) {
+	  bench_complex *in, *out;
+
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_complex *) bench_malloc(isz * sizeof(bench_complex));
+	  p->in = in - ilb;
+	  
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_complex *) bench_malloc(osz * sizeof(bench_complex));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_R2R) {
+	  bench_real *in, *out;
+
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_real *) bench_malloc(isz * sizeof(bench_real));
+	  p->in = in - ilb;
+	  
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_real *) bench_malloc(osz * sizeof(bench_real));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_REAL && p->sign < 0) { /* R2HC */
+	  bench_real *in;
+	  bench_complex *out;
+
+	  isz = isz > osz*2 ? isz : osz*2;
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_real *) bench_malloc(p->iphyssz * sizeof(bench_real));
+	  p->in = in - ilb;
+	  
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz / 2;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_complex *) bench_malloc(osz * sizeof(bench_complex));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_REAL && p->sign > 0) { /* HC2R */
+	  bench_real *out;
+	  bench_complex *in;
+
+	  osz = osz > isz*2 ? osz : isz*2;
+	  p->ophyssz = osz;
+	  p->outphys = out = (bench_real *) bench_malloc(p->ophyssz * sizeof(bench_real));
+	  p->out = out - olb;
+	  
+	  if (p->in_place) {
+	       p->in = p->out;
+	       p->inphys = p->outphys;
+	       p->iphyssz = p->ophyssz / 2;
+	  } else {
+	       p->iphyssz = isz;
+	       p->inphys = in = (bench_complex *) bench_malloc(isz * sizeof(bench_complex));
+	       p->in = in - ilb;
+	  }
+     } else {
+	  BENCH_ASSERT(0); /* TODO */
+     }
+}
+
+void problem_free(bench_problem *p)
+{
+     if (p->outphys && p->outphys != p->inphys)
+	  bench_free(p->outphys);
+     if (p->inphys)
+	  bench_free(p->inphys);
+     tensor_destroy(p->sz);
+     tensor_destroy(p->vecsz);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/aset.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/aset.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+
+#include "bench.h"
+
+void aset(bench_real *A, int n, bench_real x)
+{
+     int i;
+     for (i = 0; i < n; ++i)
+	  A[i] = x;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/bench-cost-postprocess.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/bench-cost-postprocess.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,8 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+double bench_cost_postprocess(double cost)
+{
+     return cost;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/bench-exit.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/bench-exit.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,8 @@
+/* not worth copyrighting */
+#include "bench.h"
+
+/* default routine, can be overridden by user */
+void bench_exit(int status)
+{
+     exit(status);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/bench-main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/bench-main.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+#include "my-getopt.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int verbose;
+
+static const struct my_option options[] =
+{
+  {"accuracy", REQARG, 'a'},
+  {"accuracy-rounds", REQARG, 405},
+  {"impulse-accuracy-rounds", REQARG, 406},
+  {"can-do", REQARG, 'd'},
+  {"help", NOARG, 'h'},
+  {"info", REQARG, 'i'},
+  {"info-all", NOARG, 'I'},
+  {"print-precision", NOARG, 402},
+  {"print-time-min", NOARG, 400},
+  {"random-seed", REQARG, 404},
+  {"report-benchmark", NOARG, 320},
+  {"report-mflops", NOARG, 300},
+  {"report-time", NOARG, 310},
+  {"report-verbose", NOARG, 330},
+  {"speed", REQARG, 's'},
+  {"setup-speed", REQARG, 'S'},
+  {"time-min", REQARG, 't'},
+  {"time-repeat", REQARG, 'r'},
+  {"user-option", REQARG, 'o'},
+  {"verbose", OPTARG, 'v'},
+  {"verify", REQARG, 'y'},
+  {"verify-rounds", REQARG, 401},
+  {"verify-tolerance", REQARG, 403},
+  {0, NOARG, 0}
+};
+
+int bench_main(int argc, char *argv[])
+{
+     double tmin = 0.0;
+     double tol;
+     int repeat = 0;
+     int rounds = 10;
+     int iarounds = 0;
+     int arounds = 1; /* this is too low for precise results */
+     int c;
+
+     report = report_verbose; /* default */
+     verbose = 0;
+
+     tol = SINGLE_PRECISION ? 1.0e-3 : (QUAD_PRECISION ? 1e-29 : 1.0e-10);
+
+     main_init(&argc, &argv);
+
+     bench_srand(1);
+
+     while ((c = my_getopt (argc, argv, options)) != -1) {
+	  switch (c) {
+	      case 't' :
+		   tmin = strtod(my_optarg, 0);
+		   break;
+	      case 'r':
+		   repeat = atoi(my_optarg);
+		   break;
+	      case 's':
+		   timer_init(tmin, repeat);
+		   speed(my_optarg, 0);
+		   break;
+	      case 'S':
+		   timer_init(tmin, repeat);
+		   speed(my_optarg, 1);
+		   break;
+	      case 'd':
+		   report_can_do(my_optarg);
+		   break;
+	      case 'o':
+		   useropt(my_optarg);
+		   break;
+	      case 'v':
+		   if (verbose >= 0) { /* verbose < 0 disables output */
+			if (my_optarg)
+			     verbose = atoi(my_optarg);
+			else
+			     ++verbose;
+		   }
+		   break;
+	      case 'y':
+		   verify(my_optarg, rounds, tol);
+		   break;
+	      case 'a':
+		   accuracy(my_optarg, arounds, iarounds);
+		   break;
+	      case 'i':
+		   report_info(my_optarg);
+		   break;
+	      case 'I':
+		   report_info_all();
+		   break;
+	      case 'h':
+		   if (verbose >= 0) my_usage(argv[0], options);
+		   break;
+
+	      case 300: /* --report-mflops */
+		   report = report_mflops;
+		   break;
+
+	      case 310: /* --report-time */
+		   report = report_time;
+		   break;
+
+ 	      case 320: /* --report-benchmark */
+		   report = report_benchmark;
+		   break;
+
+ 	      case 330: /* --report-verbose */
+		   report = report_verbose;
+		   break;
+
+	      case 400: /* --print-time-min */
+		   timer_init(tmin, repeat);
+		   ovtpvt("%g\n", time_min);
+		   break;
+
+	      case 401: /* --verify-rounds */
+		   rounds = atoi(my_optarg);
+		   break;
+
+	      case 402: /* --print-precision */
+		   if (SINGLE_PRECISION)
+			ovtpvt("single\n");
+		   else if (QUAD_PRECISION)
+			ovtpvt("quad\n");
+		   else if (LDOUBLE_PRECISION)
+			ovtpvt("long-double\n");
+		   else if (DOUBLE_PRECISION)
+			ovtpvt("double\n");
+		   else 
+			ovtpvt("unknown %d\n", sizeof(bench_real));
+		   break;
+
+	      case 403: /* --verify-tolerance */
+		   tol = strtod(my_optarg, 0);
+		   break;
+
+	      case 404: /* --random-seed */
+		   bench_srand(atoi(my_optarg));
+		   break;
+
+	      case 405: /* --accuracy-rounds */
+		   arounds = atoi(my_optarg);
+		   break;
+		   
+	      case 406: /* --impulse-accuracy-rounds */
+		   iarounds = atoi(my_optarg);
+		   break;
+		   
+	      case '?':
+		   /* my_getopt() already printed an error message. */
+		   cleanup();
+		   return 1;
+
+	      default:
+		   abort ();
+	  }
+     }
+
+     /* assume that any remaining arguments are problems to be
+        benchmarked */
+     while (my_optind < argc) {
+	  timer_init(tmin, repeat);
+	  speed(argv[my_optind++], 0);
+     }
+
+     cleanup();
+     return 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/bench-user.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/bench-user.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __BENCH_USER_H__
+#define __BENCH_USER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif                          /* __cplusplus */
+
+/* benchmark program definitions for user code */
+#include "config.h"
+
+#if HAVE_STDDEF_H
+#include <stddef.h>
+#endif
+
+#if HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+#if defined(BENCHFFT_SINGLE)
+typedef float bench_real;
+#elif defined(BENCHFFT_LDOUBLE)
+typedef long double bench_real;
+#elif defined(BENCHFFT_QUAD)
+typedef __float128 bench_real;
+#else
+typedef double bench_real;
+#endif
+
+typedef bench_real bench_complex[2];
+
+#define c_re(c)  ((c)[0])
+#define c_im(c)  ((c)[1])
+
+#undef DOUBLE_PRECISION
+#define DOUBLE_PRECISION (sizeof(bench_real) == sizeof(double))
+#undef SINGLE_PRECISION
+#define SINGLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(float))
+#undef LDOUBLE_PRECISION
+#define LDOUBLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(long double))
+
+#undef QUAD_PRECISION
+#ifdef BENCHFFT_QUAD
+#define QUAD_PRECISION (!LDOUBLE_PRECISION && sizeof(bench_real) == sizeof(__float128))
+#else
+#define QUAD_PRECISION 0
+#endif
+
+typedef enum { PROBLEM_COMPLEX, PROBLEM_REAL, PROBLEM_R2R } problem_kind_t;
+
+typedef enum {
+     R2R_R2HC, R2R_HC2R, R2R_DHT,
+     R2R_REDFT00, R2R_REDFT01, R2R_REDFT10, R2R_REDFT11,
+     R2R_RODFT00, R2R_RODFT01, R2R_RODFT10, R2R_RODFT11
+} r2r_kind_t;
+
+typedef struct {
+     int n;
+     int is;			/* input stride */
+     int os;			/* output stride */
+} bench_iodim;
+
+typedef struct {
+     int rnk;
+     bench_iodim *dims;
+} bench_tensor;
+
+bench_tensor *mktensor(int rnk);
+void tensor_destroy(bench_tensor *sz);
+int tensor_sz(const bench_tensor *sz);
+bench_tensor *tensor_compress(const bench_tensor *sz);
+int tensor_unitstridep(bench_tensor *t);
+int tensor_rowmajorp(bench_tensor *t);
+int tensor_real_rowmajorp(bench_tensor *t, int sign, int in_place);
+bench_tensor *tensor_append(const bench_tensor *a, const bench_tensor *b);
+bench_tensor *tensor_copy(const bench_tensor *sz);
+bench_tensor *tensor_copy_sub(const bench_tensor *sz, int start_dim, int rnk);
+bench_tensor *tensor_copy_swapio(const bench_tensor *sz);
+void tensor_ibounds(bench_tensor *t, int *lbp, int *ubp);
+void tensor_obounds(bench_tensor *t, int *lbp, int *ubp);
+
+/*
+  Definition of rank -infinity.
+  This definition has the property that if you want rank 0 or 1,
+  you can simply test for rank <= 1.  This is a common case.
+ 
+  A tensor of rank -infinity has size 0.
+*/
+#define RNK_MINFTY  ((int)(((unsigned) -1) >> 1))
+#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
+
+typedef struct {
+     problem_kind_t kind;
+     r2r_kind_t *k;
+     bench_tensor *sz;
+     bench_tensor *vecsz;
+     int sign;
+     int in_place;
+     int destroy_input;
+     int split;
+     void *in, *out;
+     void *inphys, *outphys;
+     int iphyssz, ophyssz;
+     char *pstring;
+     void *userinfo; /* user can store whatever */
+     int scrambled_in, scrambled_out; /* hack for MPI */
+
+     /* internal hack so that we can use verifier in FFTW test program */
+     void *ini, *outi; /* if nonzero, point to imag. parts for dft */
+
+     /* another internal hack to avoid passing around too many parameters */
+     double setup_time;
+} bench_problem;
+
+extern int verbose;
+
+extern int no_speed_allocation;
+
+extern int always_pad_real;
+
+#define LIBBENCH_TIMER 0
+#define USER_TIMER 1
+#define BENCH_NTIMERS 2
+extern void timer_start(int which_timer);
+extern double timer_stop(int which_timer);
+
+extern int can_do(bench_problem *p);
+extern void setup(bench_problem *p);
+extern void doit(int iter, bench_problem *p);
+extern void done(bench_problem *p);
+extern void main_init(int *argc, char ***argv);
+extern void cleanup(void);
+extern void verify(const char *param, int rounds, double tol);
+extern void useropt(const char *arg);
+
+extern void verify_problem(bench_problem *p, int rounds, double tol);
+
+extern void problem_alloc(bench_problem *p);
+extern void problem_free(bench_problem *p);
+extern void problem_zero(bench_problem *p);
+extern void problem_destroy(bench_problem *p);
+
+extern int power_of_two(int n);
+extern int log_2(int n);
+
+
+#define CASSIGN(out, in) (c_re(out) = c_re(in), c_im(out) = c_im(in))
+
+bench_tensor *verify_pack(const bench_tensor *sz, int s);
+
+typedef struct {
+     double l;
+     double i;
+     double s;
+} errors;
+
+void verify_dft(bench_problem *p, int rounds, double tol, errors *e);
+void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e);
+void verify_r2r(bench_problem *p, int rounds, double tol, errors *e);
+
+/**************************************************************/
+/* routines to override */
+
+extern void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii);
+extern void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io);
+extern void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii);
+extern void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io);
+extern void after_problem_rcopy_from(bench_problem *p, bench_real *ri);
+extern void after_problem_rcopy_to(bench_problem *p, bench_real *ro);
+extern void bench_exit(int status);
+extern double bench_cost_postprocess(double cost);
+
+/**************************************************************
+ * malloc
+ **************************************************************/
+extern void *bench_malloc(size_t size);
+extern void bench_free(void *ptr);
+extern void bench_free0(void *ptr);
+
+/**************************************************************
+ * alloca
+ **************************************************************/
+#ifdef HAVE_ALLOCA_H
+#include <alloca.h>
+#endif
+
+/**************************************************************
+ * assert
+ **************************************************************/
+extern void bench_assertion_failed(const char *s, int line, const char *file);
+#define BENCH_ASSERT(ex)						 \
+      (void)((ex) || (bench_assertion_failed(#ex, __LINE__, __FILE__), 0))
+
+#define UNUSED(x) (void)x
+
+/***************************************
+ * Documentation strings
+ ***************************************/
+struct bench_doc {
+     const char *key;
+     const char *val;
+     const char *(*f)(void);
+};
+
+extern struct bench_doc bench_doc[];
+
+#ifdef CC
+#define CC_DOC BENCH_DOC("cc", CC)
+#elif defined(BENCH_CC)
+#define CC_DOC BENCH_DOC("cc", BENCH_CC)
+#else
+#define CC_DOC /* none */
+#endif
+
+#ifdef CXX
+#define CXX_DOC BENCH_DOC("cxx", CXX)
+#elif defined(BENCH_CXX)
+#define CXX_DOC BENCH_DOC("cxx", BENCH_CXX)
+#else
+#define CXX_DOC /* none */
+#endif
+
+#ifdef F77
+#define F77_DOC BENCH_DOC("f77", F77)
+#elif defined(BENCH_F77)
+#define F77_DOC BENCH_DOC("f77", BENCH_F77)
+#else
+#define F77_DOC /* none */
+#endif
+
+#ifdef F90
+#define F90_DOC BENCH_DOC("f90", F90)
+#elif defined(BENCH_F90)
+#define F90_DOC BENCH_DOC("f90", BENCH_F90)
+#else
+#define F90_DOC /* none */
+#endif
+
+#define BEGIN_BENCH_DOC						\
+struct bench_doc bench_doc[] = {				\
+    CC_DOC							\
+    CXX_DOC							\
+    F77_DOC							\
+    F90_DOC
+
+#define BENCH_DOC(key, val) { key, val, 0 },
+#define BENCH_DOCF(key, f) { key, 0, f },
+
+#define END_BENCH_DOC				\
+     {0, 0, 0}};
+
+#ifdef __cplusplus
+}                               /* extern "C" */
+#endif                          /* __cplusplus */
+    
+#endif /* __BENCH_USER_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/bench.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/bench.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* benchmark program definitions */
+#include "bench-user.h"
+
+extern double time_min;
+extern int time_repeat;
+
+extern void timer_init(double tmin, int repeat);
+
+/* report functions */
+extern void (*report)(const bench_problem *p, double *t, int st);
+
+void report_mflops(const bench_problem *p, double *t, int st);
+void report_time(const bench_problem *p, double *t, int st);
+void report_benchmark(const bench_problem *p, double *t, int st);
+void report_verbose(const bench_problem *p, double *t, int st);
+
+void report_can_do(const char *param);
+void report_info(const char *param);
+void report_info_all(void);
+
+extern int aligned_main(int argc, char *argv[]);
+extern int bench_main(int argc, char *argv[]);
+
+extern void speed(const char *param, int setup_only);
+extern void accuracy(const char *param, int rounds, int impulse_rounds);
+
+extern double mflops(const bench_problem *p, double t);
+
+extern double bench_drand(void);
+extern void bench_srand(int seed);
+
+extern bench_problem *problem_parse(const char *desc);
+
+extern void ovtpvt(const char *format, ...);
+extern void ovtpvt_err(const char *format, ...);
+
+extern void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
+			int sign, double err[6]);
+extern void fftaccuracy_done(void);
+
+extern void caset(bench_complex *A, int n, bench_complex x);
+extern void aset(bench_real *A, int n, bench_real x);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/can-do.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/can-do.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+#include <stdio.h>
+
+void report_can_do(const char *param)
+{
+     bench_problem *p;
+     p = problem_parse(param);
+     ovtpvt("#%c\n", can_do(p) ? 't' : 'f');
+     problem_destroy(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/caset.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/caset.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,12 @@
+/* not worth copyrighting */
+
+#include "bench.h"
+
+void caset(bench_complex *A, int n, bench_complex x)
+{
+     int i;
+     for (i = 0; i < n; ++i) {
+	  c_re(A[i]) = c_re(x);
+	  c_im(A[i]) = c_im(x);
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/dotens2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/dotens2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+static void recur(int rnk, const bench_iodim *dims0, const bench_iodim *dims1,
+		  dotens2_closure *k, 
+		  int indx0, int ondx0, int indx1, int ondx1)
+{
+     if (rnk == 0)
+          k->apply(k, indx0, ondx0, indx1, ondx1);
+     else {
+          int i, n = dims0[0].n;
+          int is0 = dims0[0].is;
+          int os0 = dims0[0].os;
+          int is1 = dims1[0].is;
+          int os1 = dims1[0].os;
+
+	  BENCH_ASSERT(n == dims1[0].n);
+
+          for (i = 0; i < n; ++i) {
+               recur(rnk - 1, dims0 + 1, dims1 + 1, k,
+		     indx0, ondx0, indx1, ondx1);
+	       indx0 += is0; ondx0 += os0;
+	       indx1 += is1; ondx1 += os1;
+	  }
+     }
+}
+
+void bench_dotens2(const bench_tensor *sz0, const bench_tensor *sz1, dotens2_closure *k)
+{
+     BENCH_ASSERT(sz0->rnk == sz1->rnk);
+     if (sz0->rnk == RNK_MINFTY)
+          return;
+     recur(sz0->rnk, sz0->dims, sz1->dims, k, 0, 0, 0, 0);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/info.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/info.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+#include <stdio.h>
+#include <string.h>
+
+void report_info(const char *param)
+{
+     struct bench_doc *p;
+
+     for (p = bench_doc; p->key; ++p) {
+	  if (!strcmp(param, p->key)) {
+	       if (!p->val)
+		    p->val = p->f();
+
+	       ovtpvt("%s\n", p->val);
+	  }
+     }
+}
+
+void report_info_all(void)
+{
+     struct bench_doc *p;
+
+     /*
+      * TODO: escape quotes?  The format is not unambigously
+      * parseable if the info string contains double quotes.
+      */
+     for (p = bench_doc; p->key; ++p) {
+	  if (!p->val)
+	       p->val = p->f();
+	  ovtpvt("(%s \"%s\")\n", p->key, p->val);
+     }
+     ovtpvt("(benchmark-precision \"%s\")\n", 
+	    SINGLE_PRECISION ? "single" : 
+	    (LDOUBLE_PRECISION ? "long-double" : 
+	     (QUAD_PRECISION ? "quad" : "double")));
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/main.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+
+/* On some systems, we are required to define a dummy main-like
+   routine (called "MAIN__" or something similar in order to link a C
+   main() with the Fortran libraries).  This is detected by autoconf;
+   see the autoconf 2.52 or later manual. */
+#ifdef F77_DUMMY_MAIN
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+     int F77_DUMMY_MAIN() { return 1; }
+#endif
+
+/* in a separate file so that the user can override it */
+int main(int argc, char *argv[])
+{
+     return bench_main(argc, argv);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/mflops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/mflops.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/* not worth copyrighting */
+
+#include "bench.h"
+#include <math.h>
+
+double mflops(const bench_problem *p, double t)
+{
+     int size = tensor_sz(p->sz);
+     int vsize = tensor_sz(p->vecsz);
+
+     if (size <= 1) /* a copy: just return reals copied / time */
+	  switch (p->kind) {
+	      case PROBLEM_COMPLEX:
+		   return (2.0 * size * vsize / (t * 1.0e6));
+	      case PROBLEM_REAL:
+	      case PROBLEM_R2R:
+		   return (1.0 * size * vsize / (t * 1.0e6));
+	  }
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX:
+	      return (5.0 * size * vsize * log((double)size) / 
+		      (log(2.0) * t * 1.0e6));
+	 case PROBLEM_REAL:
+	 case PROBLEM_R2R:
+	      return (2.5 * vsize * size * log((double) size) / 
+		      (log(2.0) * t * 1.0e6));
+     }
+     BENCH_ASSERT(0 /* can't happen */);
+     return 0.0;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/mp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/mp.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,641 @@
+#include "config.h"
+#include "bench.h"
+#include <math.h>
+
+#define DG unsigned short
+#define ACC unsigned long
+#define REAL bench_real
+#define BITS_IN_REAL 53 /* mantissa */
+
+#define SHFT 16
+#define RADIX 65536L
+#define IRADIX (1.0 / RADIX)
+#define LO(x) ((x) & (RADIX - 1))
+#define HI(x) ((x) >> SHFT)
+#define HI_SIGNED(x) \
+   ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
+#define ZEROEXP (-32768)
+
+#define LEN 10
+
+typedef struct {
+     short sign;
+     short expt;
+     DG d[LEN]; 
+} N[1];
+
+#define EXA a->expt
+#define EXB b->expt
+#define EXC c->expt
+
+#define AD a->d
+#define BD b->d
+
+#define SGNA a->sign
+#define SGNB b->sign
+
+static const N zero = {{ 1, ZEROEXP, {0} }};
+
+static void cpy(const N a, N b)
+{
+     *b = *a;
+}
+
+static void fromreal(REAL x, N a)
+{
+     int i, e;
+
+     cpy(zero, a);
+     if (x == 0.0) return;
+     
+     if (x >= 0) { SGNA = 1; }
+     else       { SGNA = -1; x = -x; }
+
+     e = 0;
+     while (x >= 1.0) { x *= IRADIX; ++e; }
+     while (x < IRADIX) { x *= RADIX; --e; }
+     EXA = e;
+     
+     for (i = LEN - 1; i >= 0 && x != 0.0; --i) {
+	  REAL y;
+
+	  x *= RADIX;
+	  y = (REAL) ((int) x);
+	  AD[i] = (DG)y;
+	  x -= y;
+     }
+}
+
+static void fromshort(int x, N a)
+{
+     cpy(zero, a);
+
+     if (x < 0) { x = -x; SGNA = -1; } 
+     else { SGNA = 1; }
+     EXA = 1;
+     AD[LEN - 1] = x;
+}
+
+static void pack(DG *d, int e, int s, int l, N a)
+{
+     int i, j;
+
+     for (i = l - 1; i >= 0; --i, --e) 
+	  if (d[i] != 0) 
+	       break;
+
+     if (i < 0) {
+	  /* number is zero */
+	  cpy(zero, a);
+     } else {
+	  EXA = e;
+	  SGNA = s;
+
+	  if (i >= LEN - 1) {
+	       for (j = LEN - 1; j >= 0; --i, --j)
+		    AD[j] = d[i];
+	  } else {
+	       for (j = LEN - 1; i >= 0; --i, --j)
+		    AD[j] = d[i];
+	       for ( ; j >= 0; --j)
+		    AD[j] = 0;
+	  }
+     }
+}
+
+
+/* compare absolute values */
+static int abscmp(const N a, const N b)
+{
+     int i;
+     if (EXA > EXB) return 1;
+     if (EXA < EXB) return -1;
+     for (i = LEN - 1; i >= 0; --i) {
+	  if (AD[i] > BD[i])
+	       return 1;
+	  if (AD[i] < BD[i])
+	       return -1;
+     }
+     return 0;
+}
+
+static int eq(const N a, const N b)
+{
+     return (SGNA == SGNB) && (abscmp(a, b) == 0);
+}
+
+/* add magnitudes, for |a| >= |b| */
+static void addmag0(int s, const N a, const N b, N c)
+{
+     int ia, ib;
+     ACC r = 0;
+     DG d[LEN + 1];
+
+     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
+	  r += (ACC)AD[ia] + (ACC)BD[ib];
+	  d[ia] = LO(r);
+	  r = HI(r);
+     }     
+     for (; ia < LEN; ++ia) {
+	  r += (ACC)AD[ia];
+	  d[ia] = LO(r);
+	  r = HI(r);
+     }
+     d[ia] = LO(r);
+     pack(d, EXA + 1, s * SGNA, LEN + 1, c);
+}
+
+static void addmag(int s, const N a, const N b, N c)
+{
+     if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c);
+}
+
+/* subtract magnitudes, for |a| >= |b| */
+static void submag0(int s, const N a, const N b, N c)
+{
+     int ia, ib;
+     ACC r = 0;
+     DG d[LEN];
+
+     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
+	  r += (ACC)AD[ia] - (ACC)BD[ib];
+	  d[ia] = LO(r);
+	  r = HI_SIGNED(r);
+     }     
+     for (; ia < LEN; ++ia) {
+	  r += (ACC)AD[ia];
+	  d[ia] = LO(r);
+	  r = HI_SIGNED(r);
+     }
+
+     pack(d, EXA, s * SGNA, LEN, c);
+}
+
+static void submag(int s, const N a, const N b, N c)
+{
+     if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c);
+}
+
+/* c = a + b */
+static void add(const N a, const N b, N c)
+{
+     if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c);
+}
+
+static void sub(const N a, const N b, N c)
+{
+     if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c);
+}
+
+static void mul(const N a, const N b, N c)
+{
+     DG d[2 * LEN];
+     int i, j, k;
+     ACC r;
+
+     for (i = 0; i < LEN; ++i)
+	  d[2 * i] = d[2 * i + 1] = 0;
+
+     for (i = 0; i < LEN; ++i) {
+	  ACC ai = AD[i];
+	  if (ai) {
+	       r = 0;
+	       for (j = 0, k = i; j < LEN; ++j, ++k) {
+		    r += ai * (ACC)BD[j] + (ACC)d[k];
+		    d[k] = LO(r);
+		    r = HI(r);
+	       }
+	       d[k] = LO(r);
+	  }
+     }
+
+     pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
+}
+
+static REAL toreal(const N a)
+{
+     REAL h, l, f;
+     int i, bits;
+     ACC r;
+     DG sticky;
+
+     if (EXA != ZEROEXP) {
+	  f = IRADIX;
+	  i = LEN;
+
+	  bits = 0;
+	  h = (r = AD[--i]) * f; f *= IRADIX;
+	  for (bits = 0; r > 0; ++bits)
+	       r >>= 1;
+
+	  /* first digit */
+	  while (bits + SHFT <= BITS_IN_REAL) {
+	       h += AD[--i] * f;  f *= IRADIX; bits += SHFT;
+	  }
+
+	  /* guard digit (leave one bit for sticky bit, hence `<' instead
+	     of `<=') */
+	  bits = 0; l = 0.0;
+	  while (bits + SHFT < BITS_IN_REAL) {
+	       l += AD[--i] * f;  f *= IRADIX; bits += SHFT;
+	  }
+	  
+	  /* sticky bit */
+	  sticky = 0;
+	  while (i > 0) 
+	       sticky |= AD[--i];
+
+	  if (sticky)
+	       l += (RADIX / 2) * f;
+
+	  h += l;
+
+	  for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;
+	  for (i = 0; i > EXA; --i) h *= IRADIX;
+	  if (SGNA == -1) h = -h;
+	  return h;
+     } else {
+	  return 0.0;
+     }
+}
+
+static void neg(N a)
+{
+     SGNA = -SGNA;
+}
+
+static void inv(const N a, N x)
+{
+     N w, z, one, two;
+
+     fromreal(1.0 / toreal(a), x); /* initial guess */
+     fromshort(1, one);
+     fromshort(2, two);
+
+     for (;;) {
+	  /* Newton */
+	  mul(a, x, w);
+	  sub(two, w, z);
+	  if (eq(one, z)) break;
+	  mul(x, z, x);
+     }
+}
+
+
+/* 2 pi */
+static const N n2pi = {{
+     1, 1,
+     {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
+}};
+
+/* 1 / 31! */
+static const N i31fac = {{ 
+     1, -7, 
+     {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
+}};
+
+
+/* 1 / 32! */
+static const N i32fac = {{
+     1, -7,
+     {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
+}};
+
+static void msin(const N a, N b)
+{
+     N a2, g, k;
+     int i;
+
+     cpy(i31fac, g);
+     cpy(g, b);
+     mul(a, a, a2);
+
+     /* Taylor */
+     for (i = 31; i > 1; i -= 2) {
+	  fromshort(i * (i - 1), k);
+	  mul(k, g, g);
+	  mul(a2, b, k);
+	  sub(g, k, b);
+     }
+     mul(a, b, b);
+}
+
+static void mcos(const N a, N b)
+{
+     N a2, g, k;
+     int i;
+
+     cpy(i32fac, g);
+     cpy(g, b);
+     mul(a, a, a2);
+
+     /* Taylor */
+     for (i = 32; i > 0; i -= 2) {
+	  fromshort(i * (i - 1), k);
+	  mul(k, g, g);
+	  mul(a2, b, k);
+	  sub(g, k, b);
+     }
+}
+
+static void by2pi(REAL m, REAL n, N a)
+{
+     N b;
+
+     fromreal(n, b);
+     inv(b, a);
+     fromreal(m, b);
+     mul(a, b, a);
+     mul(n2pi, a, a);
+}
+
+static void sin2pi(REAL m, REAL n, N a);
+static void cos2pi(REAL m, REAL n, N a)
+{
+     N b;
+     if (m < 0) cos2pi(-m, n, a);
+     else if (m > n * 0.5) cos2pi(n - m, n, a);
+     else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);}
+     else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a);
+     else { by2pi(m, n, b); mcos(b, a); }
+}
+
+static void sin2pi(REAL m, REAL n, N a)
+{
+     N b;
+     if (m < 0)  {sin2pi(-m, n, a); neg(a);}
+     else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);}
+     else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);}
+     else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);}
+     else {by2pi(m, n, b); msin(b, a);}
+}
+
+/*----------------------------------------------------------------------*/
+/* FFT stuff */
+
+/* (r0 + i i0)(r1 + i i1) */
+static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
+{
+     N s, t, q;
+     mul(r0, r1, s);
+     mul(i0, i1, t);
+     sub(s, t, q);
+     mul(r0, i1, s);
+     mul(i0, r1, t);
+     add(s, t, i2);
+     cpy(q, r2);
+}
+
+/* (r0 - i i0)(r1 + i i1) */
+static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
+{
+     N s, t, q;
+     mul(r0, r1, s);
+     mul(i0, i1, t);
+     add(s, t, q);
+     mul(r0, i1, s);
+     mul(i0, r1, t);
+     sub(s, t, i2);
+     cpy(q, r2);
+}
+
+static void mcexp(int m, int n, N r, N i)
+{
+     static int cached_n = -1;
+     static N w[64][2];
+     int k, j;
+     if (n != cached_n) {
+	  for (j = 1, k = 0; j < n; j += j, ++k) {
+	       cos2pi(j, n, w[k][0]);
+	       sin2pi(j, n, w[k][1]);
+	  }
+	  cached_n = n;
+     }
+
+     fromshort(1, r);
+     fromshort(0, i);
+     if (m > 0) {
+	  for (k = 0; m; ++k, m >>= 1) 
+	       if (m & 1)
+		    cmul(w[k][0], w[k][1], r, i, r, i);
+     } else {
+	  m = -m;
+	  for (k = 0; m; ++k, m >>= 1) 
+	       if (m & 1)
+		    cmulj(w[k][0], w[k][1], r, i, r, i);
+     }
+}
+
+static void bitrev(int n, N *a)
+{
+     int i, j, m;
+     for (i = j = 0; i < n - 1; ++i) {
+	  if (i < j) {
+	       N t;
+	       cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
+	       cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
+	  }
+
+	  /* bit reversed counter */
+	  m = n; do { m >>= 1; j ^= m; } while (!(j & m));
+     }
+}
+
+static void fft0(int n, N *a, int sign)
+{
+     int i, j, k;
+
+     bitrev(n, a);
+     for (i = 1; i < n; i = 2 * i) {
+	  for (j = 0; j < i; ++j) {
+	       N wr, wi;
+	       mcexp(sign * (int)j, 2 * i, wr, wi);
+	       for (k = j; k < n; k += 2 * i) {
+		    N *a0 = a + 2 * k;
+		    N *a1 = a0 + 2 * i;
+		    N r0, i0, r1, i1, t0, t1, xr, xi;
+		    cpy(a0[0], r0); cpy(a0[1], i0);
+		    cpy(a1[0], r1); cpy(a1[1], i1);
+		    mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
+		    mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
+		    add(r0, xr, a0[0]);  add(i0, xi, a0[1]);
+		    sub(r0, xr, a1[0]);  sub(i0, xi, a1[1]);
+	       }
+	  }
+     }
+}
+
+/* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
+static void bluestein_sequence(int n, N *a)
+{
+     int k, ksq, n2 = 2 * n;
+
+     ksq = 1; /* (-1)^2 */
+     for (k = 0; k < n; ++k) {
+	  /* careful with overflow */
+	  ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
+	  mcexp(ksq, n2, a[2*k], a[2*k+1]);
+     }
+}
+
+static int pow2_atleast(int x)
+{
+     int h;
+     for (h = 1; h < x; h = 2 * h)
+	  ;
+     return h;
+}
+
+static N *cached_bluestein_w = 0;
+static N *cached_bluestein_y = 0;
+static int cached_bluestein_n = -1;
+
+static void bluestein(int n, N *a)
+{
+     int nb = pow2_atleast(2 * n);
+     N *b = (N *)bench_malloc(2 * nb * sizeof(N));
+     N *w = cached_bluestein_w;
+     N *y = cached_bluestein_y;
+     N nbinv;
+     int i;
+
+     fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */
+
+     if (cached_bluestein_n != n) {
+	  if (w) bench_free(w);
+	  if (y) bench_free(y);
+	  w = (N *)bench_malloc(2 * n * sizeof(N));
+	  y = (N *)bench_malloc(2 * nb * sizeof(N));
+	  cached_bluestein_n = n;
+	  cached_bluestein_w = w;
+	  cached_bluestein_y = y;
+
+	  bluestein_sequence(n, w);
+	  for (i = 0; i < 2*nb; ++i)  cpy(zero, y[i]);
+
+	  for (i = 0; i < n; ++i) {
+	       cpy(w[2*i], y[2*i]);
+	       cpy(w[2*i+1], y[2*i+1]);
+	  }
+	  for (i = 1; i < n; ++i) {
+	       cpy(w[2*i], y[2*(nb-i)]);
+	       cpy(w[2*i+1], y[2*(nb-i)+1]);
+	  }
+
+	  fft0(nb, y, -1);
+     }
+
+     for (i = 0; i < 2*nb; ++i)  cpy(zero, b[i]);
+     
+     for (i = 0; i < n; ++i) 
+	  cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);
+
+     /* scaled convolution b * y */
+     fft0(nb, b, -1);
+
+     for (i = 0; i < nb; ++i) 
+	  cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
+     fft0(nb, b, 1);
+
+     for (i = 0; i < n; ++i) {
+	  cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
+	  mul(nbinv, a[2*i], a[2*i]);
+	  mul(nbinv, a[2*i+1], a[2*i+1]);
+     }
+
+     bench_free(b);
+}
+
+static void swapri(int n, N *a)
+{
+     int i;
+     for (i = 0; i < n; ++i) {
+	  N t;
+	  cpy(a[2 * i], t);
+	  cpy(a[2 * i + 1], a[2 * i]);
+	  cpy(t, a[2 * i + 1]);
+     }
+}
+
+static void fft1(int n, N *a, int sign)
+{
+     if (power_of_two(n)) {
+	  fft0(n, a, sign);
+     } else {
+	  if (sign == 1) swapri(n, a);
+	  bluestein(n, a);
+	  if (sign == 1) swapri(n, a);
+     }
+}
+
+static void fromrealv(int n, bench_complex *a, N *b)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  fromreal(c_re(a[i]), b[2 * i]);
+	  fromreal(c_im(a[i]), b[2 * i + 1]);
+     }
+}
+
+static void compare(int n, N *a, N *b, double *err)
+{
+     int i;
+     double e1, e2, einf;
+     double n1, n2, ninf;
+
+     e1 = e2 = einf = 0.0;
+     n1 = n2 = ninf = 0.0;
+
+#    define DO(x1, x2, xinf, var) { 			\
+     double d = var;					\
+     if (d < 0) d = -d;					\
+     x1 += d; x2 += d * d; if (d > xinf) xinf = d;	\
+}
+	  
+     for (i = 0; i < 2 * n; ++i) {
+	  N dd;
+	  sub(a[i], b[i], dd);
+	  DO(n1, n2, ninf, toreal(a[i]));
+	  DO(e1, e2, einf, toreal(dd));
+     }
+
+#    undef DO
+     err[0] = e1 / n1;
+     err[1] = sqrt(e2 / n2);
+     err[2] = einf / ninf;
+}
+
+void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
+		 int sign, double err[6])
+{
+     N *b = (N *)bench_malloc(2 * n * sizeof(N));
+     N *fftb = (N *)bench_malloc(2 * n * sizeof(N));
+     N mn, ninv;
+     int i;
+
+     fromreal(n, mn); inv(mn, ninv);
+
+     /* forward error */
+     fromrealv(n, a, b); fromrealv(n, ffta, fftb);
+     fft1(n, b, sign);
+     compare(n, b, fftb, err);
+
+     /* backward error */
+     fromrealv(n, a, b); fromrealv(n, ffta, fftb);
+     for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]);
+     fft1(n, fftb, -sign);
+     compare(n, b, fftb, err + 3);
+
+     bench_free(fftb);
+     bench_free(b);
+}
+
+void fftaccuracy_done(void)
+{
+     if (cached_bluestein_w) bench_free(cached_bluestein_w);
+     if (cached_bluestein_y) bench_free(cached_bluestein_y);
+     cached_bluestein_w = 0;
+     cached_bluestein_y = 0;
+     cached_bluestein_n = -1;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/my-getopt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/my-getopt.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "my-getopt.h"
+
+int my_optind = 1;
+const char *my_optarg = 0;
+static const char *scan_pointer = 0;
+
+void my_usage(const char *progname, const struct my_option *opt)
+{
+    int i;
+    size_t col = 0;
+
+    fprintf(stdout, "Usage: %s", progname);
+    col += (strlen(progname) + 7);
+    for (i = 0; opt[i].long_name; i++) {
+	size_t option_len;
+
+	option_len = strlen(opt[i].long_name);
+	if (col >= 80 - (option_len + 16)) {
+	    fputs("\n\t", stdout);
+	    col = 8;
+	}
+	fprintf(stdout, " [--%s", opt[i].long_name);
+	col += (option_len + 4);
+	if (opt[i].short_name < 128) {
+	    fprintf(stdout, " | -%c", opt[i].short_name);
+	    col += 5;
+	}
+	switch (opt[i].argtype) {
+	    case REQARG:
+		 fputs(" arg]", stdout);
+		 col += 5;
+		 break;
+	    case OPTARG:
+		 fputs(" [arg]]", stdout);
+		 col += 10;
+		 break;
+	    default:
+		 fputs("]", stdout);
+		 col++;
+	}
+    }
+
+    fputs ("\n", stdout);
+}
+
+int my_getopt(int argc, char *argv[], const struct my_option *optarray)
+{
+     const char *p;
+     const struct my_option *l;
+
+     if (scan_pointer && *scan_pointer) {
+	  /* continue a previously scanned argv[] element */
+	  p = scan_pointer;
+	  goto short_option;
+     } else {
+	  /* new argv[] element */
+	  if (my_optind >= argc)
+	       return -1; /* no more options */
+
+	  p = argv[my_optind];
+     
+	  if (*p++ != '-')  
+	       return (-1); /* not an option */
+
+	  if (!*p) 
+	       return (-1); /* string is exactly '-' */
+	       
+	  ++my_optind;
+     }
+
+     if (*p == '-') {
+	  /* long option */
+	  scan_pointer = 0;
+	  my_optarg = 0;
+
+	  ++p;
+	  
+	  for (l = optarray; l->short_name; ++l) {
+	       size_t len = strlen(l->long_name);
+	       if (!strncmp(l->long_name, p, len) && 
+		   (!p[len] || p[len] == '=')) {
+		    switch (l->argtype) {
+			case NOARG: 
+			     goto ok;
+			case OPTARG: 
+			     if (p[len] == '=')
+				  my_optarg = p + len + 1;
+			     goto ok;
+			case REQARG: 
+			     if (p[len] == '=') {
+				  my_optarg = p + len + 1;
+				  goto ok;
+			     }
+			     if (my_optind >= argc) {
+				  fprintf(stderr, 
+					  "option --%s requires an argument\n",
+					  l->long_name);
+				  return '?';
+			     }
+			     my_optarg = argv[my_optind];
+			     ++my_optind;
+			     goto ok;
+		    }
+	       }
+	  }
+     } else {
+     short_option:
+	  scan_pointer = 0;
+	  my_optarg = 0;
+
+	  for (l = optarray; l->short_name; ++l) {
+	       if (l->short_name == (char)l->short_name &&
+		   *p == l->short_name) {
+		    ++p;
+		    switch (l->argtype) {
+			case NOARG: 
+			     scan_pointer = p;
+			     goto ok;
+			case OPTARG: 
+			     if (*p)
+				  my_optarg = p;
+			     goto ok;
+			case REQARG: 
+			     if (*p) {
+				  my_optarg = p;
+			     } else {
+				  if (my_optind >= argc) {
+				       fprintf(stderr, 
+					  "option -%c requires an argument\n",
+					  l->short_name);
+				       return '?';
+				  }
+				  my_optarg = argv[my_optind];
+				  ++my_optind;
+			     }
+			     goto ok;
+		    }
+	       }
+	  }
+     }
+
+     fprintf(stderr, "unrecognized option %s\n", argv[my_optind - 1]);
+     return '?';
+
+ ok:
+     return l->short_name;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/my-getopt.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/my-getopt.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __MY_GETOPT_H__
+#define __MY_GETOPT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif                          /* __cplusplus */
+
+enum { REQARG, OPTARG, NOARG };
+
+struct my_option {
+     const char *long_name;
+     int argtype;
+     int short_name;
+};
+
+extern int my_optind;
+extern const char *my_optarg;
+
+extern void my_usage(const char *progname, const struct my_option *opt);
+extern int my_getopt(int argc, char *argv[], const struct my_option *optarray);
+
+#ifdef __cplusplus
+}                               /* extern "C" */
+#endif                          /* __cplusplus */
+
+#endif /* __MY_GETOPT_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/ovtpvt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/ovtpvt.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include "bench.h"
+
+void ovtpvt(const char *format, ...)
+{
+     va_list ap;
+     
+     va_start(ap, format);
+     if (verbose >= 0)
+	  vfprintf(stdout, format, ap);
+     va_end(ap);
+     fflush(stdout);
+}
+
+void ovtpvt_err(const char *format, ...)
+{
+     va_list ap;
+     
+     va_start(ap, format);
+     if (verbose >= 0) {
+	  fflush(stdout);
+	  vfprintf(stderr, format, ap);
+     }
+     va_end(ap);
+     fflush(stdout);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/pow2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/pow2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,6 @@
+#include "bench.h"
+
+int power_of_two(int n)
+{
+     return (((n) > 0) && (((n) & ((n) - 1)) == 0));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "config.h"
+#include "bench.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+int always_pad_real = 0; /* by default, only pad in-place case */
+
+typedef enum {
+     SAME, PADDED, HALFISH
+} n_transform;
+
+/* funny transformations for last dimension of PROBLEM_REAL */
+static int transform_n(int n, n_transform nt)
+{
+     switch (nt) {
+	 case SAME: return n;
+	 case PADDED: return 2*(n/2+1);
+	 case HALFISH: return (n/2+1);
+	 default: BENCH_ASSERT(0); return 0;
+     }
+}
+
+/* do what I mean */
+static bench_tensor *dwim(bench_tensor *t, bench_iodim **last_iodim,
+			  n_transform nti, n_transform nto,
+			  bench_iodim *dt)
+{
+     int i;
+     bench_iodim *d, *d1;
+
+     if (!FINITE_RNK(t->rnk) || t->rnk < 1)
+	  return t;
+
+     i = t->rnk;
+     d1 = *last_iodim;
+
+     while (--i >= 0) {
+	  d = t->dims + i;
+	  if (!d->is) 
+	       d->is = d1->is * transform_n(d1->n, d1==dt ? nti : SAME); 
+	  if (!d->os) 
+	       d->os = d1->os * transform_n(d1->n, d1==dt ? nto : SAME); 
+	  d1 = d;
+     }
+
+     *last_iodim = d1;
+     return t;
+}
+
+static void transpose_tensor(bench_tensor *t)
+{
+     if (!FINITE_RNK(t->rnk) || t->rnk < 2)
+          return;
+
+     t->dims[0].os = t->dims[1].os;
+     t->dims[1].os = t->dims[0].os * t->dims[0].n;
+}
+
+static const char *parseint(const char *s, int *n)
+{
+     int sign = 1;
+
+     *n = 0;
+
+     if (*s == '-') { 
+	  sign = -1;
+	  ++s;
+     } else if (*s == '+') { 
+	  sign = +1; 
+	  ++s; 
+     }
+
+     BENCH_ASSERT(isdigit(*s));
+     while (isdigit(*s)) {
+	  *n = *n * 10 + (*s - '0');
+	  ++s;
+     }
+     
+     *n *= sign;
+
+     if (*s == 'k' || *s == 'K') {
+	  *n *= 1024;
+	  ++s;
+     }
+
+     if (*s == 'm' || *s == 'M') {
+	  *n *= 1024 * 1024;
+	  ++s;
+     }
+
+     return s;
+}
+
+struct dimlist { bench_iodim car; r2r_kind_t k; struct dimlist *cdr; };
+
+static const char *parsetensor(const char *s, bench_tensor **tp,
+			       r2r_kind_t **k)
+{
+     struct dimlist *l = 0, *m;
+     bench_tensor *t;
+     int rnk = 0;
+
+ L1:
+     m = (struct dimlist *)bench_malloc(sizeof(struct dimlist));
+     /* nconc onto l */
+     m->cdr = l; l = m;
+     ++rnk; 
+
+     s = parseint(s, &m->car.n);
+
+     if (*s == ':') {
+	  /* read input stride */
+	  ++s;
+	  s = parseint(s, &m->car.is);
+	  if (*s == ':') {
+	       /* read output stride */
+	       ++s;
+	       s = parseint(s, &m->car.os);
+	  } else {
+	       /* default */
+	       m->car.os = m->car.is;
+	  }
+     } else {
+	  m->car.is = 0;
+	  m->car.os = 0;
+     }
+
+     if (*s == 'f' || *s == 'F') {
+	  m->k = R2R_R2HC;
+	  ++s;
+     }
+     else if (*s == 'b' || *s == 'B') {
+	  m->k = R2R_HC2R;
+	  ++s;
+     }
+     else if (*s == 'h' || *s == 'H') {
+	  m->k = R2R_DHT;
+	  ++s;
+     }
+     else if (*s == 'e' || *s == 'E' || *s == 'o' || *s == 'O') {
+	  char c = *(s++);
+	  int ab;
+
+	  s = parseint(s, &ab);
+
+	  if (c == 'e' || c == 'E') {
+	       if (ab == 0)
+		    m->k = R2R_REDFT00;
+	       else if (ab == 1)
+		    m->k = R2R_REDFT01;
+	       else if (ab == 10)
+		    m->k = R2R_REDFT10;
+	       else if (ab == 11)
+		    m->k = R2R_REDFT11;
+	       else
+		    BENCH_ASSERT(0);
+	  }
+	  else {
+	       if (ab == 0)
+		    m->k = R2R_RODFT00;
+	       else if (ab == 1)
+		    m->k = R2R_RODFT01;
+	       else if (ab == 10)
+		    m->k = R2R_RODFT10;
+	       else if (ab == 11)
+		    m->k = R2R_RODFT11;
+	       else
+		    BENCH_ASSERT(0);
+	  }
+     }
+     else
+	  m->k = R2R_R2HC;
+
+     if (*s == 'x' || *s == 'X') {
+	  ++s;
+	  goto L1;
+     }
+     
+     /* now we have a dimlist.  Build bench_tensor, etc. */
+
+     if (k && rnk > 0) {
+	  int i;
+	  *k = (r2r_kind_t *) bench_malloc(sizeof(r2r_kind_t) * rnk);
+	  for (m = l, i = rnk - 1; i >= 0; --i, m = m->cdr) {
+	       BENCH_ASSERT(m);
+	       (*k)[i] = m->k;
+	  }
+     }
+
+     t = mktensor(rnk);
+     while (--rnk >= 0) {
+	  bench_iodim *d = t->dims + rnk;
+	  BENCH_ASSERT(l);
+	  m = l; l = m->cdr;
+	  d->n = m->car.n;
+	  d->is = m->car.is;
+	  d->os = m->car.os;
+	  bench_free(m);
+     }
+
+     *tp = t;
+     return s;
+}
+
+/* parse a problem description, return a problem */
+bench_problem *problem_parse(const char *s)
+{
+     bench_problem *p;
+     bench_iodim last_iodim0 = {1,1,1}, *last_iodim = &last_iodim0;
+     bench_iodim *sz_last_iodim;
+     bench_tensor *sz;
+     n_transform nti = SAME, nto = SAME;
+     int transpose = 0;
+
+     p = (bench_problem *) bench_malloc(sizeof(bench_problem));
+     p->kind = PROBLEM_COMPLEX;
+     p->k = 0;
+     p->sign = -1;
+     p->in = p->out = 0;
+     p->inphys = p->outphys = 0;
+     p->iphyssz = p->ophyssz = 0;
+     p->in_place = 0;
+     p->destroy_input = 0;
+     p->split = 0;
+     p->userinfo = 0;
+     p->scrambled_in = p->scrambled_out = 0;
+     p->sz = p->vecsz = 0;
+     p->ini = p->outi = 0;
+     p->pstring = (char *) bench_malloc(sizeof(char) * (strlen(s) + 1));
+     strcpy(p->pstring, s);
+
+ L1:
+     switch (tolower(*s)) {
+	 case 'i': p->in_place = 1; ++s; goto L1;
+	 case 'o': p->in_place = 0; ++s; goto L1;
+	 case 'd': p->destroy_input = 1; ++s; goto L1;
+	 case '/': p->split = 1; ++s; goto L1;
+	 case 'f': 
+	 case '-': p->sign = -1; ++s; goto L1;
+	 case 'b': 
+	 case '+': p->sign = 1; ++s; goto L1;
+	 case 'r': p->kind = PROBLEM_REAL; ++s; goto L1;
+	 case 'c': p->kind = PROBLEM_COMPLEX; ++s; goto L1;
+	 case 'k': p->kind = PROBLEM_R2R; ++s; goto L1;
+	 case 't': transpose = 1; ++s; goto L1;
+	      
+	 /* hack for MPI: */
+	 case '[': p->scrambled_in = 1; ++s; goto L1;
+	 case ']': p->scrambled_out = 1; ++s; goto L1;
+
+	 default : ;
+     }
+
+     s = parsetensor(s, &sz, p->kind == PROBLEM_R2R ? &p->k : 0);
+
+     if (p->kind == PROBLEM_REAL) {
+	  if (p->sign < 0) {
+	       nti = p->in_place || always_pad_real ? PADDED : SAME;
+	       nto = HALFISH;
+	  }
+	  else {
+	       nti = HALFISH;
+	       nto = p->in_place || always_pad_real ? PADDED : SAME;
+	  }
+     }
+
+     sz_last_iodim = sz->dims + sz->rnk - 1;
+     if (*s == '*') { /* "external" vector */
+	  ++s;
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+	  s = parsetensor(s, &sz, 0);
+	  p->vecsz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+     } else if (*s == 'v' || *s == 'V') { /* "internal" vector */
+	  bench_tensor *vecsz;
+	  ++s;
+	  s = parsetensor(s, &vecsz, 0);
+	  p->vecsz = dwim(vecsz, &last_iodim, nti, nto, sz_last_iodim);
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+     } else {
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+	  p->vecsz = mktensor(0);
+     }
+
+     if (transpose) {
+	  transpose_tensor(p->sz);
+	  transpose_tensor(p->vecsz);
+     }
+
+     if (!p->in_place)
+	  p->out = ((bench_real *) p->in) + (1 << 20);  /* whatever */
+
+     BENCH_ASSERT(p->sz && p->vecsz);
+     BENCH_ASSERT(!*s);
+     return p;
+}
+
+void problem_destroy(bench_problem *p)
+{
+     BENCH_ASSERT(p);
+     problem_free(p);
+     bench_free0(p->k);
+     bench_free0(p->pstring);
+     bench_free(p);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/report.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/report.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+void (*report)(const bench_problem *p, double *t, int st);
+
+#undef min
+#undef max /* you never know */
+
+struct stats {
+     double min;
+     double max;
+     double avg;
+     double median;
+};
+
+static void mkstat(double *t, int st, struct stats *a)
+{
+     int i, j;
+     
+     a->min = t[0];
+     a->max = t[0];
+     a->avg = 0.0;
+
+     for (i = 0; i < st; ++i) {
+	  if (t[i] < a->min)
+	       a->min = t[i];
+	  if (t[i] > a->max)
+	       a->max = t[i];
+	  a->avg += t[i];
+     }
+     a->avg /= (double)st;
+
+     /* compute median --- silly bubblesort algorithm */
+     for (i = st - 1; i > 1; --i) {
+	  for (j = 0; j < i - 1; ++j) {
+	       double t0, t1;
+	       if ((t0 = t[j]) > (t1 = t[j + 1])) {
+		    t[j] = t1;
+		    t[j + 1] = t0;
+	       }
+	  } 
+     }
+     a->median = t[st / 2];
+}
+
+void report_mflops(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     mkstat(t, st, &s);
+     ovtpvt("(%g %g %g %g)\n", 
+	    mflops(p, s.max), mflops(p, s.avg), 
+	    mflops(p, s.min), mflops(p, s.median));
+}
+
+void report_time(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     UNUSED(p);
+     mkstat(t, st, &s);
+     ovtpvt("(%g %g %g %g)\n", s.min, s.avg, s.max, s.median);
+}
+
+void report_benchmark(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     mkstat(t, st, &s);
+     ovtpvt("%.5g %.8g %g\n", mflops(p, s.min), s.min, p->setup_time);
+}
+
+static void sprintf_time(double x, char *buf, int buflen)
+{
+#ifdef HAVE_SNPRINTF
+#  define MY_SPRINTF(a, b) snprintf(buf, buflen, a, b)
+#else
+#  define MY_SPRINTF(a, b) sprintf(buf, a, b)
+#endif
+     if (x < 1.0E-6)
+	  MY_SPRINTF("%.2f ns", x * 1.0E9);
+     else if (x < 1.0E-3)
+	  MY_SPRINTF("%.2f us", x * 1.0E6);
+     else if (x < 1.0)
+	  MY_SPRINTF("%.2f ms", x * 1.0E3);
+     else
+	  MY_SPRINTF("%.2f s", x);
+#undef MY_SPRINTF
+}
+
+void report_verbose(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     char bmin[64], bmax[64], bavg[64], bmedian[64], btmin[64];
+     char bsetup[64];
+     int copyp = tensor_sz(p->sz) == 1;
+
+     mkstat(t, st, &s);
+
+     sprintf_time(s.min, bmin, 64);
+     sprintf_time(s.max, bmax, 64);
+     sprintf_time(s.avg, bavg, 64);
+     sprintf_time(s.median, bmedian, 64);
+     sprintf_time(time_min, btmin, 64);
+     sprintf_time(p->setup_time, bsetup, 64);
+
+     ovtpvt("Problem: %s, setup: %s, time: %s, %s: %.5g\n",
+	    p->pstring, bsetup, bmin, 
+	    copyp ? "fp-move/us" : "``mflops''",
+	    mflops(p, s.min));
+
+     if (verbose) {
+	  ovtpvt("Took %d measurements for at least %s each.\n", st, btmin);
+	  ovtpvt("Time: min %s, max %s, avg %s, median %s\n",
+		 bmin, bmax, bavg, bmedian);
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/speed.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/speed.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+
+int no_speed_allocation = 0; /* 1 to not allocate array data in speed() */
+
+void speed(const char *param, int setup_only)
+{
+     double *t;
+     int iter = 0, k;
+     bench_problem *p;
+     double tmin, y;
+
+     t = (double *) bench_malloc(time_repeat * sizeof(double));
+
+     for (k = 0; k < time_repeat; ++k) 
+	  t[k] = 0;
+
+     p = problem_parse(param);
+     BENCH_ASSERT(can_do(p));
+     if (!no_speed_allocation) {
+	  problem_alloc(p);
+	  problem_zero(p);
+     }
+
+     timer_start(LIBBENCH_TIMER);
+     setup(p);
+     p->setup_time = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER));
+
+     /* reset the input to zero again, because the planner in paranoid
+	mode sets it to random values, thus making the benchmark
+	diverge. */
+     if (!no_speed_allocation) 
+	  problem_zero(p);
+     
+     if (setup_only)
+	  goto done;
+
+ start_over:
+     for (iter = 1; iter < (1<<30); iter *= 2) {
+	  tmin = 1.0e20;
+	  for (k = 0; k < time_repeat; ++k) {
+	       timer_start(LIBBENCH_TIMER);
+	       doit(iter, p);
+	       y = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER));
+	       if (y < 0) /* yes, it happens */
+		    goto start_over;
+	       t[k] = y;
+	       if (y < tmin)
+		    tmin = y;
+	  }
+	  
+	  if (tmin >= time_min)
+	       goto done;
+     }
+
+     goto start_over; /* this also happens */
+
+ done:
+     done(p);
+
+     if (iter) 
+	  for (k = 0; k < time_repeat; ++k) 
+	       t[k] /= iter;
+     else
+	  for (k = 0; k < time_repeat; ++k) 
+	       t[k] = 0;
+
+     report(p, t, time_repeat);
+
+     if (!no_speed_allocation)
+	  problem_destroy(p);
+     bench_free(t);
+     return;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/tensor.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/tensor.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "bench.h"
+#include <stdlib.h>
+
+bench_tensor *mktensor(int rnk) 
+{
+     bench_tensor *x;
+
+     BENCH_ASSERT(rnk >= 0);
+
+     x = (bench_tensor *)bench_malloc(sizeof(bench_tensor));
+     if (FINITE_RNK(rnk) && rnk > 0)
+          x->dims = (bench_iodim *)bench_malloc(sizeof(bench_iodim) * rnk);
+     else
+          x->dims = 0;
+
+     x->rnk = rnk;
+     return x;
+}
+
+void tensor_destroy(bench_tensor *sz)
+{
+     bench_free0(sz->dims);
+     bench_free(sz);
+}
+
+int tensor_sz(const bench_tensor *sz)
+{
+     int i, n = 1;
+
+     if (!FINITE_RNK(sz->rnk))
+          return 0;
+
+     for (i = 0; i < sz->rnk; ++i)
+          n *= sz->dims[i].n;
+     return n;
+}
+
+
+/* total order among bench_iodim's */
+static int dimcmp(const bench_iodim *a, const bench_iodim *b)
+{
+     if (b->is != a->is)
+          return (b->is - a->is);	/* shorter strides go later */
+     if (b->os != a->os)
+          return (b->os - a->os);	/* shorter strides go later */
+     return (int)(a->n - b->n);	        /* larger n's go later */
+}
+
+bench_tensor *tensor_compress(const bench_tensor *sz)
+{
+     int i, rnk;
+     bench_tensor *x;
+
+     BENCH_ASSERT(FINITE_RNK(sz->rnk));
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          BENCH_ASSERT(sz->dims[i].n > 0);
+          if (sz->dims[i].n != 1)
+               ++rnk;
+     }
+
+     x = mktensor(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          if (sz->dims[i].n != 1)
+               x->dims[rnk++] = sz->dims[i];
+     }
+
+     if (rnk) {
+	  /* God knows how qsort() behaves if n==0 */
+	  qsort(x->dims, (size_t)x->rnk, sizeof(bench_iodim),
+		(int (*)(const void *, const void *))dimcmp);
+     }
+
+     return x;
+}
+
+int tensor_unitstridep(bench_tensor *t)
+{
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+     return (t->rnk == 0 ||
+	     (t->dims[t->rnk - 1].is == 1 && t->dims[t->rnk - 1].os == 1));
+}
+
+/* detect screwy real padded rowmajor... ugh */
+int tensor_real_rowmajorp(bench_tensor *t, int sign, int in_place)
+{
+     int i;
+
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+
+     i = t->rnk - 1;
+
+     if (--i >= 0) {
+          bench_iodim *d = t->dims + i;
+	  if (sign < 0) {
+	       if (d[0].is != d[1].is * (in_place ? 2*(d[1].n/2 + 1) : d[1].n))
+		    return 0;
+	       if (d[0].os != d[1].os * (d[1].n/2 + 1))
+		    return 0;
+	  }
+	  else {
+	       if (d[0].is != d[1].is * (d[1].n/2 + 1))
+		    return 0;
+	       if (d[0].os != d[1].os * (in_place ? 2*(d[1].n/2 + 1) : d[1].n))
+		    return 0;
+	  }
+     }
+
+     while (--i >= 0) {
+          bench_iodim *d = t->dims + i;
+          if (d[0].is != d[1].is * d[1].n)
+               return 0;
+          if (d[0].os != d[1].os * d[1].n)
+               return 0;
+     }
+     return 1;
+}
+
+int tensor_rowmajorp(bench_tensor *t)
+{
+     int i;
+
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+
+     i = t->rnk - 1;
+     while (--i >= 0) {
+	  bench_iodim *d = t->dims + i;
+	  if (d[0].is != d[1].is * d[1].n)
+	       return 0;
+	  if (d[0].os != d[1].os * d[1].n)
+	       return 0;
+     }
+     return 1;
+}
+
+static void dimcpy(bench_iodim *dst, const bench_iodim *src, int rnk)
+{
+     int i;
+     if (FINITE_RNK(rnk))
+          for (i = 0; i < rnk; ++i)
+               dst[i] = src[i];
+}
+
+bench_tensor *tensor_append(const bench_tensor *a, const bench_tensor *b)
+{
+     if (!FINITE_RNK(a->rnk) || !FINITE_RNK(b->rnk)) {
+          return mktensor(RNK_MINFTY);
+     } else {
+	  bench_tensor *x = mktensor(a->rnk + b->rnk);
+          dimcpy(x->dims, a->dims, a->rnk);
+          dimcpy(x->dims + a->rnk, b->dims, b->rnk);
+	  return x;
+     }
+}
+
+static int imax(int a, int b)
+{
+     return (a > b) ? a : b;
+}
+
+static int imin(int a, int b)
+{
+     return (a < b) ? a : b;
+}
+
+#define DEFBOUNDS(name, xs)			\
+void name(bench_tensor *t, int *lbp, int *ubp)	\
+{						\
+     int lb = 0;				\
+     int ub = 1;				\
+     int i;					\
+						\
+     BENCH_ASSERT(FINITE_RNK(t->rnk));		\
+						\
+     for (i = 0; i < t->rnk; ++i) {		\
+	  bench_iodim *d = t->dims + i;		\
+	  int n = d->n;				\
+	  int s = d->xs;			\
+	  lb = imin(lb, lb + s * (n - 1));	\
+	  ub = imax(ub, ub + s * (n - 1));	\
+     }						\
+						\
+     *lbp = lb;					\
+     *ubp = ub;					\
+}
+
+DEFBOUNDS(tensor_ibounds, is)
+DEFBOUNDS(tensor_obounds, os)
+
+bench_tensor *tensor_copy(const bench_tensor *sz)
+{
+     bench_tensor *x = mktensor(sz->rnk);
+     dimcpy(x->dims, sz->dims, sz->rnk);
+     return x;
+}
+
+/* Like tensor_copy, but copy only rnk dimensions starting with start_dim. */
+bench_tensor *tensor_copy_sub(const bench_tensor *sz, int start_dim, int rnk)
+{
+     bench_tensor *x;
+
+     BENCH_ASSERT(FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk);
+     x = mktensor(rnk);
+     dimcpy(x->dims, sz->dims + start_dim, rnk);
+     return x;
+}
+
+bench_tensor *tensor_copy_swapio(const bench_tensor *sz)
+{
+     bench_tensor *x = tensor_copy(sz);
+     int i;
+     if (FINITE_RNK(x->rnk))
+	  for (i = 0; i < x->rnk; ++i) {
+	       int s;
+	       s = x->dims[i].is;
+	       x->dims[i].is = x->dims[i].os;
+	       x->dims[i].os = s;
+	  }
+     return x;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/timer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/timer.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+#include <stdio.h>
+
+/* 
+ * System-dependent timing functions:
+ */
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_BSDGETTIMEOFDAY
+#ifndef HAVE_GETTIMEOFDAY
+#define gettimeofday BSDgettimeofday
+#define HAVE_GETTIMEOFDAY 1
+#endif
+#endif
+
+double time_min;
+int time_repeat;
+
+#if !defined(HAVE_TIMER) && (defined(__WIN32__) || defined(_WIN32) || defined(_WINDOWS) || defined(__CYGWIN__))
+#include <windows.h>
+typedef LARGE_INTEGER mytime;
+
+static mytime get_time(void)
+{
+     mytime tv;
+     QueryPerformanceCounter(&tv);
+     return tv;
+}
+
+static double elapsed(mytime t1, mytime t0)
+{
+     LARGE_INTEGER freq;
+     QueryPerformanceFrequency(&freq);
+     return (((double) t1.QuadPart - (double) t0.QuadPart)) /
+	  ((double) freq.QuadPart);
+}
+
+#define HAVE_TIMER
+#endif
+
+
+#if defined(HAVE_GETTIMEOFDAY) && !defined(HAVE_TIMER)
+typedef struct timeval mytime;
+
+static mytime get_time(void)
+{
+     struct timeval tv;
+     gettimeofday(&tv, 0);
+     return tv;
+}
+
+static double elapsed(mytime t1, mytime t0)
+{
+     return ((double) t1.tv_sec - (double) t0.tv_sec) +
+	  ((double) t1.tv_usec - (double) t0.tv_usec) * 1.0E-6;
+}
+
+#define HAVE_TIMER
+#endif
+
+#ifndef HAVE_TIMER
+#error "timer not defined"
+#endif
+
+static double calibrate(void)
+{
+     /* there seems to be no reasonable way to calibrate the
+	clock automatically any longer.  Grrr... */
+
+     return 0.01;
+}
+
+
+void timer_init(double tmin, int repeat)
+{
+     static int inited = 0;
+
+     if (inited)
+	  return;
+     inited = 1;
+
+     if (!repeat)
+	  repeat = 8;
+     time_repeat = repeat;
+
+     if (tmin > 0)
+	  time_min = tmin;
+     else
+	  time_min = calibrate();
+}
+
+static mytime t0[BENCH_NTIMERS];
+
+void timer_start(int n)
+{
+     BENCH_ASSERT(n >= 0 && n < BENCH_NTIMERS);
+     t0[n] = get_time();
+}
+
+double timer_stop(int n)
+{
+     mytime t1;
+     BENCH_ASSERT(n >= 0 && n < BENCH_NTIMERS);
+     t1 = get_time();
+     return elapsed(t1, t0[n]);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/useropt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/useropt.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "bench.h"
+
+void useropt(const char *arg)
+{
+     ovtpvt_err("unknown user option: %s.  Ignoring.\n", arg);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/util.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/util.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "bench.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <math.h>
+
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+#  if defined(HAVE_MALLOC_H)
+#    include <malloc.h>
+#  else
+extern void *memalign(size_t, size_t);
+#  endif
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+void bench_assertion_failed(const char *s, int line, const char *file)
+{
+     ovtpvt_err("bench: %s:%d: assertion failed: %s\n", file, line, s);
+     bench_exit(EXIT_FAILURE);
+}
+
+#ifdef HAVE_DRAND48
+#  if defined(HAVE_DECL_DRAND48) && !HAVE_DECL_DRAND48
+extern double drand48(void);
+#  endif
+double bench_drand(void)
+{
+     return drand48() - 0.5;
+}
+#  if defined(HAVE_DECL_SRAND48) && !HAVE_DECL_SRAND48
+extern void srand48(long);
+#  endif
+void bench_srand(int seed)
+{
+     srand48(seed);
+}
+#else
+double bench_drand(void)
+{
+     double d = rand();
+     return (d / (double) RAND_MAX) - 0.5;
+}
+void bench_srand(int seed)
+{
+     srand(seed);
+}
+#endif
+
+/**********************************************************
+ *   DEBUGGING CODE
+ **********************************************************/
+#ifdef BENCH_DEBUG
+static int bench_malloc_cnt = 0;
+
+/*
+ * debugging malloc/free.  Initialize every malloced and freed area to
+ * random values, just to make sure we are not using uninitialized
+ * pointers.  Also check for writes past the ends of allocated blocks,
+ * and a couple of other things.
+ *
+ * This code is a quick and dirty hack -- use at your own risk.
+ */
+
+static int bench_malloc_total = 0, bench_malloc_max = 0, bench_malloc_cnt_max = 0;
+
+#define MAGIC ((size_t)0xABadCafe)
+#define PAD_FACTOR 2
+#define TWO_SIZE_T (2 * sizeof(size_t))
+
+#define VERBOSE_ALLOCATION 0
+
+#if VERBOSE_ALLOCATION
+#define WHEN_VERBOSE(a) a
+#else
+#define WHEN_VERBOSE(a)
+#endif
+
+void *bench_malloc(size_t n)
+{
+     char *p;
+     size_t i;
+
+     bench_malloc_total += n;
+
+     if (bench_malloc_total > bench_malloc_max)
+	  bench_malloc_max = bench_malloc_total;
+
+     p = (char *) malloc(PAD_FACTOR * n + TWO_SIZE_T);
+     BENCH_ASSERT(p);
+
+     /* store the size in a known position */
+     ((size_t *) p)[0] = n;
+     ((size_t *) p)[1] = MAGIC;
+     for (i = 0; i < PAD_FACTOR * n; i++)
+	  p[i + TWO_SIZE_T] = (char) (i ^ 0xDEADBEEF);
+
+     ++bench_malloc_cnt;
+
+     if (bench_malloc_cnt > bench_malloc_cnt_max)
+	  bench_malloc_cnt_max = bench_malloc_cnt;
+
+     /* skip the size we stored previously */
+     return (void *) (p + TWO_SIZE_T);
+}
+
+void bench_free(void *p)
+{
+     char *q;
+
+     BENCH_ASSERT(p);
+
+     q = ((char *) p) - TWO_SIZE_T;
+     BENCH_ASSERT(q);
+
+     {
+	  size_t n = ((size_t *) q)[0];
+	  size_t magic = ((size_t *) q)[1];
+	  size_t i;
+
+	  ((size_t *) q)[0] = 0; /* set to zero to detect duplicate free's */
+
+	  BENCH_ASSERT(magic == MAGIC);
+	  ((size_t *) q)[1] = ~MAGIC;
+
+	  bench_malloc_total -= n;
+	  BENCH_ASSERT(bench_malloc_total >= 0);
+
+	  /* check for writing past end of array: */
+	  for (i = n; i < PAD_FACTOR * n; ++i)
+	       if (q[i + TWO_SIZE_T] != (char) (i ^ 0xDEADBEEF)) {
+		    BENCH_ASSERT(0 /* array bounds overwritten */);
+	       }
+	  for (i = 0; i < PAD_FACTOR * n; ++i)
+	       q[i + TWO_SIZE_T] = (char) (i ^ 0xBEEFDEAD);
+
+	  --bench_malloc_cnt;
+
+	  BENCH_ASSERT(bench_malloc_cnt >= 0);
+
+	  BENCH_ASSERT(
+	       (bench_malloc_cnt == 0 && bench_malloc_total == 0) ||
+	       (bench_malloc_cnt > 0 && bench_malloc_total > 0));
+
+	  free(q);
+     }
+}
+
+#else
+/**********************************************************
+ *   NON DEBUGGING CODE
+ **********************************************************/
+/* production version, no hacks */
+
+#define MIN_ALIGNMENT 128    /* must be power of two */
+
+#define real_free free /* memalign and malloc use ordinary free */
+
+void *bench_malloc(size_t n)
+{
+     void *p;
+     if (n == 0) n = 1;
+
+#if defined(WITH_OUR_MALLOC)
+     /* Our own aligned malloc/free.  Assumes sizeof(void*) is
+	a power of two <= 8 and that malloc is at least
+	sizeof(void*)-aligned.  Assumes size_t = uintptr_t.  */
+     {
+	  void *p0;
+	  if ((p0 = malloc(n + MIN_ALIGNMENT))) {
+	       p = (void *) (((size_t) p0 + MIN_ALIGNMENT) & (~((size_t) (MIN_ALIGNMENT - 1))));
+	       *((void **) p - 1) = p0;
+	  }
+	  else
+	       p = (void *) 0;
+     }
+#elif defined(HAVE_MEMALIGN)
+     p = memalign(MIN_ALIGNMENT, n);
+#elif defined(HAVE_POSIX_MEMALIGN)
+     /* note: posix_memalign is broken in glibc 2.2.5: it constrains
+	the size, not the alignment, to be (power of two) * sizeof(void*).
+        The bug seems to have been fixed as of glibc 2.3.1. */
+     if (posix_memalign(&p, MIN_ALIGNMENT, n))
+	  p = (void*) 0;
+#elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */
+     p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _mm_free
+#else
+     p = malloc(n);
+#endif
+
+     BENCH_ASSERT(p);
+     return p;
+}
+
+void bench_free(void *p)
+{
+#ifdef WITH_OUR_MALLOC
+     if (p) free(*((void **) p - 1));
+#else
+     real_free(p);
+#endif
+}
+
+#endif
+
+void bench_free0(void *p)
+{
+     if (p) bench_free(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify-dft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify-dft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+/* copy A into B, using output stride of A and input stride of B */
+typedef struct {
+     dotens2_closure k;
+     R *ra; R *ia;
+     R *rb; R *ib;
+     int scalea, scaleb;
+} cpy_closure;
+
+static void cpy0(dotens2_closure *k_, 
+		 int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpy_closure *k = (cpy_closure *)k_;
+     k->rb[indxb * k->scaleb] = k->ra[ondxa * k->scalea];
+     k->ib[indxb * k->scaleb] = k->ia[ondxa * k->scalea];
+     UNUSED(indxa); UNUSED(ondxb);
+}
+
+static void cpy(R *ra, R *ia, const bench_tensor *sza, int scalea,
+		R *rb, R *ib, const bench_tensor *szb, int scaleb)
+{
+     cpy_closure k;
+     k.k.apply = cpy0;
+     k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
+     k.scalea = scalea; k.scaleb = scaleb;
+     bench_dotens2(sza, szb, &k.k);
+}
+
+typedef struct {
+     dofft_closure k;
+     bench_problem *p;
+} dofft_dft_closure;
+
+static void dft_apply(dofft_closure *k_, bench_complex *in, bench_complex *out)
+{
+     dofft_dft_closure *k = (dofft_dft_closure *)k_;
+     bench_problem *p = k->p;
+     bench_tensor *totalsz, *pckdsz;
+     bench_tensor *totalsz_swap, *pckdsz_swap;
+     bench_real *ri, *ii, *ro, *io;
+     int totalscale;
+
+     totalsz = tensor_append(p->vecsz, p->sz);
+     pckdsz = verify_pack(totalsz, 2);
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     totalsz_swap = tensor_copy_swapio(totalsz);
+     pckdsz_swap = tensor_copy_swapio(pckdsz);
+
+     /* confusion: the stride is the distance between complex elements
+	when using interleaved format, but it is the distance between
+	real elements when using split format */
+     if (p->split) {
+	  ii = p->ini ? (bench_real *) p->ini : ri + p->iphyssz;
+	  io = p->outi ? (bench_real *) p->outi : ro + p->ophyssz;
+	  totalscale = 1;
+     } else {
+	  ii = p->ini ? (bench_real *) p->ini : ri + 1;
+	  io = p->outi ? (bench_real *) p->outi : ro + 1;
+	  totalscale = 2;
+     }
+
+     cpy(&c_re(in[0]), &c_im(in[0]), pckdsz, 1,
+	    ri, ii, totalsz, totalscale);
+     after_problem_ccopy_from(p, ri, ii);
+     doit(1, p);
+     after_problem_ccopy_to(p, ro, io);
+     if (k->k.recopy_input)
+	  cpy(ri, ii, totalsz_swap, totalscale,
+	      &c_re(in[0]), &c_im(in[0]), pckdsz_swap, 1);
+     cpy(ro, io, totalsz, totalscale,
+	 &c_re(out[0]), &c_im(out[0]), pckdsz, 1);
+
+     tensor_destroy(totalsz);
+     tensor_destroy(pckdsz);
+     tensor_destroy(totalsz_swap);
+     tensor_destroy(pckdsz_swap);
+}
+
+void verify_dft(bench_problem *p, int rounds, double tol, errors *e)
+{
+     C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     int n, vecn, N;
+     dofft_dft_closure k;
+
+     BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);
+
+     k.k.apply = dft_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+
+     if (rounds == 0)
+	  rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     inA = (C *) bench_malloc(N * sizeof(C));
+     inB = (C *) bench_malloc(N * sizeof(C));
+     inC = (C *) bench_malloc(N * sizeof(C));
+     outA = (C *) bench_malloc(N * sizeof(C));
+     outB = (C *) bench_malloc(N * sizeof(C));
+     outC = (C *) bench_malloc(N * sizeof(C));
+     tmp = (C *) bench_malloc(N * sizeof(C));
+
+     e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC, 
+		    tmp, rounds, tol);
+     e->l = linear(&k.k, 0, N, inA, inB, inC, outA, outB, outC,
+		   tmp, rounds, tol);
+
+     e->s = 0.0;
+     e->s = dmax(e->s, tf_shift(&k.k, 0, p->sz, n, vecn, p->sign,
+				inA, inB, outA, outB, 
+				tmp, rounds, tol, TIME_SHIFT));
+     e->s = dmax(e->s, tf_shift(&k.k, 0, p->sz, n, vecn, p->sign,
+				inA, inB, outA, outB, 
+				tmp, rounds, tol, FREQ_SHIFT));
+
+     if (!p->in_place && !p->destroy_input)
+	  preserves_input(&k.k, 0, N, inA, inB, outB, rounds);
+
+     bench_free(tmp);
+     bench_free(outC);
+     bench_free(outB);
+     bench_free(outA);
+     bench_free(inC);
+     bench_free(inB);
+     bench_free(inA);
+}
+
+
+void accuracy_dft(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6])
+{
+     dofft_dft_closure k;
+     int n;
+     C *a, *b;
+
+     BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     k.k.apply = dft_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+     n = tensor_sz(p->sz);
+
+     a = (C *) bench_malloc(n * sizeof(C));
+     b = (C *) bench_malloc(n * sizeof(C));
+     accuracy_test(&k.k, 0, p->sign, n, a, b, rounds, impulse_rounds, t);
+     bench_free(b);
+     bench_free(a);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify-lib.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify-lib.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Utility functions:
+ */
+static double dabs(double x) { return (x < 0.0) ? -x : x; }
+static double dmin(double x, double y) { return (x < y) ? x : y; }
+static double norm2(double x, double y) { return dmax(dabs(x), dabs(y)); }
+
+double dmax(double x, double y) { return (x > y) ? x : y; }
+
+static double aerror(C *a, C *b, int n)
+{
+     if (n > 0) {
+	  /* compute the relative Linf error */
+	  double e = 0.0, mag = 0.0;
+	  int i;
+
+	  for (i = 0; i < n; ++i) {
+	       e = dmax(e, norm2(c_re(a[i]) - c_re(b[i]),
+				 c_im(a[i]) - c_im(b[i])));
+	       mag = dmax(mag, 
+			  dmin(norm2(c_re(a[i]), c_im(a[i])),
+			       norm2(c_re(b[i]), c_im(b[i]))));
+	  }
+	  e /= mag;
+
+#ifdef HAVE_ISNAN
+	  BENCH_ASSERT(!isnan(e));
+#endif
+	  return e;
+     } else
+	  return 0.0;
+}
+
+#ifdef HAVE_DRAND48
+#  if defined(HAVE_DECL_DRAND48) && !HAVE_DECL_DRAND48
+extern double drand48(void);
+#  endif
+double mydrand(void)
+{
+     return drand48() - 0.5;
+}
+#else
+double mydrand(void)
+{
+     double d = rand();
+     return (d / (double) RAND_MAX) - 0.5;
+}
+#endif
+
+void arand(C *a, int n)
+{
+     int i;
+
+     /* generate random inputs */
+     for (i = 0; i < n; ++i) {
+	  c_re(a[i]) = mydrand();
+	  c_im(a[i]) = mydrand();
+     }
+}
+
+/* make array real */
+void mkreal(C *A, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+          c_im(A[i]) = 0.0;
+     }
+}
+
+static void assign_conj(C *Ac, C *A, int rank, const bench_iodim *dim, int stride)
+{
+     if (rank == 0) {
+          c_re(*Ac) = c_re(*A);
+          c_im(*Ac) = -c_im(*A);
+     }
+     else {
+          int i, n0 = dim[rank - 1].n, s = stride;
+          rank -= 1;
+	  stride *= n0;
+          assign_conj(Ac, A, rank, dim, stride);
+          for (i = 1; i < n0; ++i)
+               assign_conj(Ac + (n0 - i) * s, A + i * s, rank, dim, stride);
+     }
+}
+
+/* make array hermitian */
+void mkhermitian(C *A, int rank, const bench_iodim *dim, int stride)
+{
+     if (rank == 0)
+          c_im(*A) = 0.0;
+     else {
+          int i, n0 = dim[rank - 1].n, s = stride;
+          rank -= 1;
+	  stride *= n0;
+          mkhermitian(A, rank, dim, stride);
+          for (i = 1; 2*i < n0; ++i)
+               assign_conj(A + (n0 - i) * s, A + i * s, rank, dim, stride);
+          if (2*i == n0)
+               mkhermitian(A + i * s, rank, dim, stride);
+     }
+}
+
+void mkhermitian1(C *a, int n)
+{
+     bench_iodim d;
+
+     d.n = n;
+     d.is = d.os = 1;
+     mkhermitian(a, 1, &d, 1);
+}
+
+/* C = A */
+void acopy(C *c, C *a, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  c_re(c[i]) = c_re(a[i]);
+	  c_im(c[i]) = c_im(a[i]);
+     }
+}
+
+/* C = A + B */
+void aadd(C *c, C *a, C *b, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  c_re(c[i]) = c_re(a[i]) + c_re(b[i]);
+	  c_im(c[i]) = c_im(a[i]) + c_im(b[i]);
+     }
+}
+
+/* C = A - B */
+void asub(C *c, C *a, C *b, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  c_re(c[i]) = c_re(a[i]) - c_re(b[i]);
+	  c_im(c[i]) = c_im(a[i]) - c_im(b[i]);
+     }
+}
+
+/* B = rotate left A (complex) */
+void arol(C *b, C *a, int n, int nb, int na)
+{
+     int i, ib, ia;
+
+     for (ib = 0; ib < nb; ++ib) {
+	  for (i = 0; i < n - 1; ++i)
+	       for (ia = 0; ia < na; ++ia) {
+		    C *pb = b + (ib * n + i) * na + ia;
+		    C *pa = a + (ib * n + i + 1) * na + ia;
+		    c_re(*pb) = c_re(*pa);
+		    c_im(*pb) = c_im(*pa);
+	       }
+
+	  for (ia = 0; ia < na; ++ia) {
+	       C *pb = b + (ib * n + n - 1) * na + ia;
+	       C *pa = a + ib * n * na + ia;
+	       c_re(*pb) = c_re(*pa);
+	       c_im(*pb) = c_im(*pa);
+	  }
+     }
+}
+
+void aphase_shift(C *b, C *a, int n, int nb, int na, double sign)
+{
+     int j, jb, ja;
+     trigreal twopin;
+     twopin = K2PI / n;
+
+     for (jb = 0; jb < nb; ++jb)
+	  for (j = 0; j < n; ++j) {
+	       trigreal s = sign * SIN(j * twopin);
+	       trigreal c = COS(j * twopin);
+
+	       for (ja = 0; ja < na; ++ja) {
+		    int k = (jb * n + j) * na + ja;
+		    c_re(b[k]) = c_re(a[k]) * c - c_im(a[k]) * s;
+		    c_im(b[k]) = c_re(a[k]) * s + c_im(a[k]) * c;
+	       }
+	  }
+}
+
+/* A = alpha * A  (complex, in place) */
+void ascale(C *a, C alpha, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  R xr = c_re(a[i]), xi = c_im(a[i]);
+	  c_re(a[i]) = xr * c_re(alpha) - xi * c_im(alpha);
+	  c_im(a[i]) = xr * c_im(alpha) + xi * c_re(alpha);
+     }
+}
+
+
+double acmp(C *a, C *b, int n, const char *test, double tol)
+{
+     double d = aerror(a, b, n);
+     if (d > tol) {
+	  ovtpvt_err("Found relative error %e (%s)\n", d, test);
+
+	  {
+	       int i, N;
+	       N = n > 300 && verbose <= 2 ? 300 : n;
+	       for (i = 0; i < N; ++i) 
+		    ovtpvt_err("%8d %16.12f %16.12f   %16.12f %16.12f\n", i, 
+			       (double) c_re(a[i]), (double) c_im(a[i]),
+			       (double) c_re(b[i]), (double) c_im(b[i]));
+	  }
+
+	  bench_exit(EXIT_FAILURE);
+     }
+     return d;
+}
+
+
+/*
+ * Implementation of the FFT tester described in
+ *
+ * Funda Erg�n. Testing multivariate linear functions: Overcoming the
+ * generator bottleneck. In Proceedings of the Twenty-Seventh Annual
+ * ACM Symposium on the Theory of Computing, pages 407-416, Las Vegas,
+ * Nevada, 29 May--1 June 1995.
+ *
+ * Also: F. Ergun, S. R. Kumar, and D. Sivakumar, "Self-testing without
+ * the generator bottleneck," SIAM J. on Computing 29 (5), 1630-51 (2000).
+ */
+
+static double impulse0(dofft_closure *k,
+		       int n, int vecn, 
+		       C *inA, C *inB, C *inC,
+		       C *outA, C *outB, C *outC,
+		       C *tmp, int rounds, double tol)
+{
+     int N = n * vecn;
+     double e = 0.0;
+     int j;
+
+     k->apply(k, inA, tmp);
+     e = dmax(e, acmp(tmp, outA, N, "impulse 1", tol));
+
+     for (j = 0; j < rounds; ++j) {
+	  arand(inB, N);
+	  asub(inC, inA, inB, N);
+	  k->apply(k, inB, outB);
+	  k->apply(k, inC, outC);
+	  aadd(tmp, outB, outC, N);
+	  e = dmax(e, acmp(tmp, outA, N, "impulse", tol));
+     }
+     return e;
+}
+
+double impulse(dofft_closure *k,
+	       int n, int vecn, 
+	       C *inA, C *inB, C *inC,
+	       C *outA, C *outB, C *outC,
+	       C *tmp, int rounds, double tol)
+{
+     int i, j;
+     double e = 0.0;
+
+     /* check impulsive input */
+     for (i = 0; i < vecn; ++i) {
+	  R x = (sqrt(n)*(i+1)) / (double)(vecn+1);
+	  for (j = 0; j < n; ++j) {
+	       c_re(inA[j + i * n]) = 0;
+	       c_im(inA[j + i * n]) = 0;
+	       c_re(outA[j + i * n]) = x;
+	       c_im(outA[j + i * n]) = 0;
+	  }
+	  c_re(inA[i * n]) = x;
+	  c_im(inA[i * n]) = 0;
+     }
+
+     e = dmax(e, impulse0(k, n, vecn, inA, inB, inC, outA, outB, outC,
+			  tmp, rounds, tol));
+
+     /* check constant input */
+     for (i = 0; i < vecn; ++i) {
+	  R x = (i+1) / ((double)(vecn+1) * sqrt(n));
+	  for (j = 0; j < n; ++j) {
+	       c_re(inA[j + i * n]) = x;
+	       c_im(inA[j + i * n]) = 0;
+	       c_re(outA[j + i * n]) = 0;
+	       c_im(outA[j + i * n]) = 0;
+	  }
+	  c_re(outA[i * n]) = n * x;
+	  c_im(outA[i * n]) = 0;
+     }
+
+     e = dmax(e, impulse0(k, n, vecn, inA, inB, inC, outA, outB, outC,
+			  tmp, rounds, tol));
+     return e;
+}
+
+double linear(dofft_closure *k, int realp,
+	      int n, C *inA, C *inB, C *inC, C *outA,
+	      C *outB, C *outC, C *tmp, int rounds, double tol)
+{
+     int j;
+     double e = 0.0;
+
+     for (j = 0; j < rounds; ++j) {
+	  C alpha, beta;
+	  c_re(alpha) = mydrand();
+	  c_im(alpha) = realp ? 0.0 : mydrand();
+	  c_re(beta) = mydrand();
+	  c_im(beta) = realp ? 0.0 : mydrand();
+	  arand(inA, n);
+	  arand(inB, n);
+	  k->apply(k, inA, outA);
+	  k->apply(k, inB, outB);
+
+	  ascale(outA, alpha, n);
+	  ascale(outB, beta, n);
+	  aadd(tmp, outA, outB, n);
+	  ascale(inA, alpha, n);
+	  ascale(inB, beta, n);
+	  aadd(inC, inA, inB, n);
+	  k->apply(k, inC, outC);
+
+	  e = dmax(e, acmp(outC, tmp, n, "linear", tol));
+     }
+     return e;
+}
+
+
+
+double tf_shift(dofft_closure *k,
+		int realp, const bench_tensor *sz,
+		int n, int vecn, double sign,
+		C *inA, C *inB, C *outA, C *outB, C *tmp,
+		int rounds, double tol, int which_shift)
+{
+     int nb, na, dim, N = n * vecn;
+     int i, j;
+     double e = 0.0;
+
+     /* test 3: check the time-shift property */
+     /* the paper performs more tests, but this code should be fine too */
+
+     nb = 1;
+     na = n;
+
+     /* check shifts across all SZ dimensions */
+     for (dim = 0; dim < sz->rnk; ++dim) {
+	  int ncur = sz->dims[dim].n;
+
+	  na /= ncur;
+
+	  for (j = 0; j < rounds; ++j) {
+	       arand(inA, N);
+
+	       if (which_shift == TIME_SHIFT) {
+		    for (i = 0; i < vecn; ++i) {
+			 if (realp) mkreal(inA + i * n, n);
+			 arol(inB + i * n, inA + i * n, ncur, nb, na);
+		    }
+		    k->apply(k, inA, outA);
+		    k->apply(k, inB, outB);
+		    for (i = 0; i < vecn; ++i) 
+			 aphase_shift(tmp + i * n, outB + i * n, ncur, 
+				      nb, na, sign);
+		    e = dmax(e, acmp(tmp, outA, N, "time shift", tol));
+	       } else {
+		    for (i = 0; i < vecn; ++i) {
+			 if (realp) 
+			      mkhermitian(inA + i * n, sz->rnk, sz->dims, 1);
+			 aphase_shift(inB + i * n, inA + i * n, ncur,
+				      nb, na, -sign);
+		    }
+		    k->apply(k, inA, outA);
+		    k->apply(k, inB, outB);
+		    for (i = 0; i < vecn; ++i) 
+			 arol(tmp + i * n, outB + i * n, ncur, nb, na);
+		    e = dmax(e, acmp(tmp, outA, N, "freq shift", tol));
+	       }
+	  }
+
+	  nb *= ncur;
+     }
+     return e;
+}
+
+
+void preserves_input(dofft_closure *k, aconstrain constrain,
+		     int n, C *inA, C *inB, C *outB, int rounds)
+{
+     int j;
+     int recopy_input = k->recopy_input;
+
+     k->recopy_input = 1;
+     for (j = 0; j < rounds; ++j) {
+	  arand(inA, n);
+	  if (constrain)
+	       constrain(inA, n);
+	  
+	  acopy(inB, inA, n);
+	  k->apply(k, inB, outB);
+	  acmp(inB, inA, n, "preserves_input", 0.0);
+     }
+     k->recopy_input = recopy_input;
+}
+
+
+/* Make a copy of the size tensor, with the same dimensions, but with
+   the strides corresponding to a "packed" row-major array with the
+   given stride. */
+bench_tensor *verify_pack(const bench_tensor *sz, int s)
+{
+     bench_tensor *x = tensor_copy(sz);
+     if (FINITE_RNK(x->rnk) && x->rnk > 0) {
+	  int i;
+	  x->dims[x->rnk - 1].is = s;
+	  x->dims[x->rnk - 1].os = s;
+	  for (i = x->rnk - 1; i > 0; --i) {
+	       x->dims[i - 1].is = x->dims[i].is * x->dims[i].n;
+	       x->dims[i - 1].os = x->dims[i].os * x->dims[i].n;
+	  }
+     }
+     return x;
+}
+
+static int all_zero(C *a, int n)
+{
+     int i;
+     for (i = 0; i < n; ++i)
+	  if (c_re(a[i]) != 0.0 || c_im(a[i]) != 0.0)
+	       return 0;
+     return 1;
+}
+
+static int one_accuracy_test(dofft_closure *k, aconstrain constrain,
+			     int sign, int n, C *a, C *b, 
+			     double t[6])
+{
+     double err[6];
+
+     if (constrain)
+	  constrain(a, n);
+     
+     if (all_zero(a, n))
+	  return 0;
+     
+     k->apply(k, a, b);
+     fftaccuracy(n, a, b, sign, err);
+     
+     t[0] += err[0];
+     t[1] += err[1] * err[1];
+     t[2] = dmax(t[2], err[2]);
+     t[3] += err[3];
+     t[4] += err[4] * err[4];
+     t[5] = dmax(t[5], err[5]);
+
+     return 1;
+}
+
+void accuracy_test(dofft_closure *k, aconstrain constrain,
+		   int sign, int n, C *a, C *b, int rounds, int impulse_rounds,
+		   double t[6])
+{
+     int r, i;
+     int ntests = 0;
+     bench_complex czero = {0, 0};
+
+     for (i = 0; i < 6; ++i) t[i] = 0.0;
+
+     for (r = 0; r < rounds; ++r) {
+	  arand(a, n);
+	  if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+	       ++ntests;
+     }
+
+     /* impulses at beginning of array */
+     for (r = 0; r < impulse_rounds; ++r) {
+	  if (r > n - r - 1)
+	       continue;
+	  
+	  caset(a, n, czero);
+	  c_re(a[r]) = c_im(a[r]) = 1.0;
+	  
+	  if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+	       ++ntests;
+     }
+     
+     /* impulses at end of array */
+     for (r = 0; r < impulse_rounds; ++r) {
+	  if (r <= n - r - 1)
+	       continue;
+	  
+	  caset(a, n, czero);
+	  c_re(a[n - r - 1]) = c_im(a[n - r - 1]) = 1.0;
+	  
+	  if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+	       ++ntests;
+     }
+     
+     /* randomly-located impulses */
+     for (r = 0; r < impulse_rounds; ++r) {
+	  caset(a, n, czero);
+	  i = rand() % n;
+	  c_re(a[i]) = c_im(a[i]) = 1.0;
+	  
+	  if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+	       ++ntests;
+     }
+
+     t[0] /= ntests;
+     t[1] = sqrt(t[1] / ntests);
+     t[3] /= ntests;
+     t[4] = sqrt(t[4] / ntests);
+
+     fftaccuracy_done();
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,964 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Lots of ugly duplication from verify-lib.c, plus lots of ugliness in
+   general for all of the r2r variants...oh well, for now */
+
+#include "verify.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+typedef struct {
+     bench_problem *p;
+     bench_tensor *probsz;
+     bench_tensor *totalsz;
+     bench_tensor *pckdsz;
+     bench_tensor *pckdvecsz;
+} info;
+
+/*
+ * Utility functions:
+ */
+
+static double dabs(double x) { return (x < 0.0) ? -x : x; }
+static double dmin(double x, double y) { return (x < y) ? x : y; }
+
+static double raerror(R *a, R *b, int n)
+{
+     if (n > 0) {
+          /* compute the relative Linf error */
+          double e = 0.0, mag = 0.0;
+          int i;
+
+          for (i = 0; i < n; ++i) {
+               e = dmax(e, dabs(a[i] - b[i]));
+               mag = dmax(mag, dmin(dabs(a[i]), dabs(b[i])));
+          }
+	  if (dabs(mag) < 1e-14 && dabs(e) < 1e-14)
+	       e = 0.0;
+	  else
+	       e /= mag;
+
+#ifdef HAVE_ISNAN
+          BENCH_ASSERT(!isnan(e));
+#endif
+          return e;
+     } else
+          return 0.0;
+}
+
+#define by2pi(m, n) ((K2PI * (m)) / (n))
+
+/*
+ * Improve accuracy by reducing x to range [0..1/8]
+ * before multiplication by 2 * PI.
+ */
+
+static trigreal bench_sincos(trigreal m, trigreal n, int sinp)
+{
+     /* waiting for C to get tail recursion... */
+     trigreal half_n = n * 0.5;
+     trigreal quarter_n = half_n * 0.5;
+     trigreal eighth_n = quarter_n * 0.5;
+     trigreal sgn = 1.0;
+
+     if (sinp) goto sin;
+ cos:
+     if (m < 0) { m = -m; /* goto cos; */ }
+     if (m > half_n) { m = n - m; goto cos; }
+     if (m > eighth_n) { m = quarter_n - m; goto sin; }
+     return sgn * COS(by2pi(m, n));
+
+ msin:
+     sgn = -sgn;
+ sin:
+     if (m < 0) { m = -m; goto msin; }
+     if (m > half_n) { m = n - m; goto msin; }
+     if (m > eighth_n) { m = quarter_n - m; goto cos; }
+     return sgn * SIN(by2pi(m, n));
+}
+
+static trigreal cos2pi(int m, int n)
+{
+     return bench_sincos((trigreal)m, (trigreal)n, 0);
+}
+
+static trigreal sin2pi(int m, int n)
+{
+     return bench_sincos((trigreal)m, (trigreal)n, 1);
+}
+
+static trigreal cos00(int i, int j, int n)
+{
+     return cos2pi(i * j, n);
+}
+
+static trigreal cos01(int i, int j, int n)
+{
+     return cos00(i, 2*j + 1, 2*n);
+}
+
+static trigreal cos10(int i, int j, int n)
+{
+     return cos00(2*i + 1, j, 2*n);
+}
+
+static trigreal cos11(int i, int j, int n)
+{
+     return cos00(2*i + 1, 2*j + 1, 4*n);
+}
+
+static trigreal sin00(int i, int j, int n)
+{
+     return sin2pi(i * j, n);
+}
+
+static trigreal sin01(int i, int j, int n)
+{
+     return sin00(i, 2*j + 1, 2*n);
+}
+
+static trigreal sin10(int i, int j, int n)
+{
+     return sin00(2*i + 1, j, 2*n);
+}
+
+static trigreal sin11(int i, int j, int n)
+{
+     return sin00(2*i + 1, 2*j + 1, 4*n);
+}
+
+static trigreal realhalf(int i, int j, int n)
+{
+     UNUSED(i);
+     if (j <= n - j)
+	  return 1.0;
+     else
+	  return 0.0;
+}
+
+static trigreal coshalf(int i, int j, int n)
+{
+     if (j <= n - j)
+	  return cos00(i, j, n);
+     else
+	  return cos00(i, n - j, n);
+}
+
+static trigreal unity(int i, int j, int n)
+{
+     UNUSED(i);
+     UNUSED(j);
+     UNUSED(n);
+     return 1.0;
+}
+
+typedef trigreal (*trigfun)(int, int, int);
+
+static void rarand(R *a, int n)
+{
+     int i;
+
+     /* generate random inputs */
+     for (i = 0; i < n; ++i) {
+	  a[i] = mydrand();
+     }
+}
+
+/* C = A + B */
+static void raadd(R *c, R *a, R *b, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  c[i] = a[i] + b[i];
+     }
+}
+
+/* C = A - B */
+static void rasub(R *c, R *a, R *b, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  c[i] = a[i] - b[i];
+     }
+}
+
+/* B = rotate left A + rotate right A */
+static void rarolr(R *b, R *a, int n, int nb, int na, 
+		   r2r_kind_t k)
+{
+     int isL0 = 0, isL1 = 0, isR0 = 0, isR1 = 0;
+     int i, ib, ia;
+
+     for (ib = 0; ib < nb; ++ib) {
+	  for (i = 0; i < n - 1; ++i)
+	       for (ia = 0; ia < na; ++ia)
+		    b[(ib * n + i) * na + ia] =
+			 a[(ib * n + i + 1) * na + ia];
+
+	  /* ugly switch to do boundary conditions for various r2r types */
+	  switch (k) {
+	       /* periodic boundaries */
+	      case R2R_DHT:
+	      case R2R_R2HC:
+		   for (ia = 0; ia < na; ++ia) {
+			b[(ib * n + n - 1) * na + ia] = 
+			     a[(ib * n + 0) * na + ia];
+			b[(ib * n + 0) * na + ia] += 
+			     a[(ib * n + n - 1) * na + ia];
+		   }
+		   break;
+		   
+	      case R2R_HC2R: /* ugh (hermitian halfcomplex boundaries) */
+		   if (n > 2) {
+			if (n % 2 == 0)
+			     for (ia = 0; ia < na; ++ia) {
+				  b[(ib * n + n - 1) * na + ia] = 0.0;
+				  b[(ib * n + 0) * na + ia] += 
+				       a[(ib * n + 1) * na + ia];
+				  b[(ib * n + n/2) * na + ia] += 
+				       + a[(ib * n + n/2 - 1) * na + ia]
+				       - a[(ib * n + n/2 + 1) * na + ia];
+				  b[(ib * n + n/2 + 1) * na + ia] += 
+				       - a[(ib * n + n/2) * na + ia];
+			     }
+			else 
+			     for (ia = 0; ia < na; ++ia) {
+				  b[(ib * n + n - 1) * na + ia] = 0.0;
+				  b[(ib * n + 0) * na + ia] += 
+				       a[(ib * n + 1) * na + ia];
+				  b[(ib * n + n/2) * na + ia] += 
+				       + a[(ib * n + n/2) * na + ia]
+				       - a[(ib * n + n/2 + 1) * na + ia];
+				  b[(ib * n + n/2 + 1) * na + ia] += 
+				       - a[(ib * n + n/2 + 1) * na + ia]
+				       - a[(ib * n + n/2) * na + ia];
+			     }
+		   } else /* n <= 2 */ {
+			for (ia = 0; ia < na; ++ia) {
+			     b[(ib * n + n - 1) * na + ia] =
+				  a[(ib * n + 0) * na + ia];
+			     b[(ib * n + 0) * na + ia] += 
+				  a[(ib * n + n - 1) * na + ia];
+			}
+		   }
+		   break;
+		   
+	      /* various even/odd boundary conditions */
+	      case R2R_REDFT00:
+		   isL1 = isR1 = 1;
+		   goto mirrors;
+	      case R2R_REDFT01:
+		   isL1 = 1;
+		   goto mirrors;
+	      case R2R_REDFT10:
+		   isL0 = isR0 = 1;
+		   goto mirrors;
+	      case R2R_REDFT11:
+		   isL0 = 1;
+		   isR0 = -1;
+		   goto mirrors;
+	      case R2R_RODFT00:
+		   goto mirrors;
+	      case R2R_RODFT01:
+		   isR1 = 1;
+		   goto mirrors;
+	      case R2R_RODFT10:
+		   isL0 = isR0 = -1;
+		   goto mirrors;
+	      case R2R_RODFT11:
+		   isL0 = -1;
+		   isR0 = 1;
+		   goto mirrors;
+
+	  mirrors:
+		   
+		   for (ia = 0; ia < na; ++ia)
+			b[(ib * n + n - 1) * na + ia] = 
+			     isR0 * a[(ib * n + n - 1) * na + ia]
+			     + (n > 1 ? isR1 * a[(ib * n + n - 2) * na + ia]
+				: 0);
+		   
+		   for (ia = 0; ia < na; ++ia)
+			b[(ib * n) * na + ia] += 
+			     isL0 * a[(ib * n) * na + ia]
+			     + (n > 1 ? isL1 * a[(ib * n + 1) * na + ia] : 0);
+		   
+	  }
+
+	  for (i = 1; i < n; ++i)
+	       for (ia = 0; ia < na; ++ia)
+		    b[(ib * n + i) * na + ia] +=
+			 a[(ib * n + i - 1) * na + ia];
+     }
+}
+
+static void raphase_shift(R *b, R *a, int n, int nb, int na,
+			 int n0, int k0, trigfun t)
+{
+     int j, jb, ja;
+ 
+     for (jb = 0; jb < nb; ++jb)
+          for (j = 0; j < n; ++j) {
+               trigreal c = 2.0 * t(1, j + k0, n0);
+
+               for (ja = 0; ja < na; ++ja) {
+                    int k = (jb * n + j) * na + ja;
+                    b[k] = a[k] * c;
+               }
+          }
+}
+
+/* A = alpha * A  (real, in place) */
+static void rascale(R *a, R alpha, int n)
+{
+     int i;
+
+     for (i = 0; i < n; ++i) {
+	  a[i] *= alpha;
+     }
+}
+
+/*
+ * compute rdft:
+ */
+
+/* copy real A into real B, using output stride of A and input stride of B */
+typedef struct {
+     dotens2_closure k;
+     R *ra;
+     R *rb;
+} cpyr_closure;
+
+static void cpyr0(dotens2_closure *k_, 
+		  int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyr_closure *k = (cpyr_closure *)k_;
+     k->rb[indxb] = k->ra[ondxa];
+     UNUSED(indxa); UNUSED(ondxb);
+}
+
+static void cpyr(R *ra, bench_tensor *sza, R *rb, bench_tensor *szb)
+{
+     cpyr_closure k;
+     k.k.apply = cpyr0;
+     k.ra = ra; k.rb = rb;
+     bench_dotens2(sza, szb, &k.k);
+}
+
+static void dofft(info *nfo, R *in, R *out)
+{
+     cpyr(in, nfo->pckdsz, (R *) nfo->p->in, nfo->totalsz);
+     after_problem_rcopy_from(nfo->p, (bench_real *)nfo->p->in);
+     doit(1, nfo->p);
+     after_problem_rcopy_to(nfo->p, (bench_real *)nfo->p->out);
+     cpyr((R *) nfo->p->out, nfo->totalsz, out, nfo->pckdsz);
+}
+
+static double racmp(R *a, R *b, int n, const char *test, double tol)
+{
+     double d = raerror(a, b, n);
+     if (d > tol) {
+	  ovtpvt_err("Found relative error %e (%s)\n", d, test);
+	  {
+	       int i, N;
+	       N = n > 300 && verbose <= 2 ? 300 : n;
+	       for (i = 0; i < N; ++i)
+		    ovtpvt_err("%8d %16.12f   %16.12f\n", i, 
+			       (double) a[i],
+			       (double) b[i]);
+	  }
+	  bench_exit(EXIT_FAILURE);
+     }
+     return d;
+}
+
+/***********************************************************************/
+
+typedef struct {
+     int n; /* physical size */
+     int n0; /* "logical" transform size */
+     int i0, k0; /* shifts of input/output */
+     trigfun ti, ts;  /* impulse/shift trig functions */
+} dim_stuff;
+
+static void impulse_response(int rnk, dim_stuff *d, R impulse_amp,
+			     R *A, int N)
+{
+     if (rnk == 0)
+	  A[0] = impulse_amp;
+     else {
+	  int i;
+	  N /= d->n;
+	  for (i = 0; i < d->n; ++i) {
+	       impulse_response(rnk - 1, d + 1,
+				impulse_amp * d->ti(d->i0, d->k0 + i, d->n0),
+				A + i * N, N);
+	  }
+     }
+}
+
+/***************************************************************************/
+
+/*
+ * Implementation of the FFT tester described in
+ *
+ * Funda Erg�n. Testing multivariate linear functions: Overcoming the
+ * generator bottleneck. In Proceedings of the Twenty-Seventh Annual
+ * ACM Symposium on the Theory of Computing, pages 407-416, Las Vegas,
+ * Nevada, 29 May--1 June 1995.
+ *
+ * Also: F. Ergun, S. R. Kumar, and D. Sivakumar, "Self-testing without
+ * the generator bottleneck," SIAM J. on Computing 29 (5), 1630-51 (2000).
+ */
+
+static double rlinear(int n, info *nfo, R *inA, R *inB, R *inC, R *outA,
+		      R *outB, R *outC, R *tmp, int rounds, double tol)
+{
+     double e = 0.0;
+     int j;
+
+     for (j = 0; j < rounds; ++j) {
+	  R alpha, beta;
+	  alpha = mydrand();
+	  beta = mydrand();
+	  rarand(inA, n);
+	  rarand(inB, n);
+	  dofft(nfo, inA, outA);
+	  dofft(nfo, inB, outB);
+
+	  rascale(outA, alpha, n);
+	  rascale(outB, beta, n);
+	  raadd(tmp, outA, outB, n);
+	  rascale(inA, alpha, n);
+	  rascale(inB, beta, n);
+	  raadd(inC, inA, inB, n);
+	  dofft(nfo, inC, outC);
+
+	  e = dmax(e, racmp(outC, tmp, n, "linear", tol));
+     }
+     return e;
+}
+
+static double rimpulse(dim_stuff *d, R impulse_amp,
+		       int n, int vecn, info *nfo, 
+		       R *inA, R *inB, R *inC,
+		       R *outA, R *outB, R *outC,
+		       R *tmp, int rounds, double tol)
+{
+     double e = 0.0;
+     int N = n * vecn;
+     int i;
+     int j;
+
+     /* test 2: check that the unit impulse is transformed properly */
+
+     for (i = 0; i < N; ++i) {
+	  /* pls */
+	  inA[i] = 0.0;
+     }
+     for (i = 0; i < vecn; ++i) {
+	  inA[i * n] = (i+1) / (double)(vecn+1);
+     
+	  /* transform of the pls */
+	  impulse_response(nfo->probsz->rnk, d, impulse_amp * inA[i * n],
+			   outA + i * n, n);
+     }
+
+     dofft(nfo, inA, tmp);
+     e = dmax(e, racmp(tmp, outA, N, "impulse 1", tol));
+
+     for (j = 0; j < rounds; ++j) {
+          rarand(inB, N);
+          rasub(inC, inA, inB, N);
+          dofft(nfo, inB, outB);
+          dofft(nfo, inC, outC);
+          raadd(tmp, outB, outC, N);
+          e = dmax(e, racmp(tmp, outA, N, "impulse", tol));
+     }
+     return e;
+}
+
+static double t_shift(int n, int vecn, info *nfo, 
+		      R *inA, R *inB, R *outA, R *outB, R *tmp,
+		      int rounds, double tol,
+		      dim_stuff *d)
+{
+     double e = 0.0;
+     int nb, na, dim, N = n * vecn;
+     int i, j;
+     bench_tensor *sz = nfo->probsz;
+
+     /* test 3: check the time-shift property */
+     /* the paper performs more tests, but this code should be fine too */
+
+     nb = 1;
+     na = n;
+
+     /* check shifts across all SZ dimensions */
+     for (dim = 0; dim < sz->rnk; ++dim) {
+	  int ncur = sz->dims[dim].n;
+
+	  na /= ncur;
+
+	  for (j = 0; j < rounds; ++j) {
+	       rarand(inA, N);
+
+	       for (i = 0; i < vecn; ++i) {
+		    rarolr(inB + i * n, inA + i*n, ncur, nb,na, 
+			  nfo->p->k[dim]);
+	       }
+	       dofft(nfo, inA, outA);
+	       dofft(nfo, inB, outB);
+	       for (i = 0; i < vecn; ++i) 
+		    raphase_shift(tmp + i * n, outA + i * n, ncur, 
+				 nb, na, d[dim].n0, d[dim].k0, d[dim].ts);
+	       e = dmax(e, racmp(tmp, outB, N, "time shift", tol));
+	  }
+
+	  nb *= ncur;
+     }
+     return e;
+}
+
+/***********************************************************************/
+
+void verify_r2r(bench_problem *p, int rounds, double tol, errors *e)
+{
+     R *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     info nfo;
+     int n, vecn, N;
+     double impulse_amp = 1.0;
+     dim_stuff *d;
+     int i;
+
+     if (rounds == 0)
+	  rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     d = (dim_stuff *) bench_malloc(sizeof(dim_stuff) * p->sz->rnk);
+     for (i = 0; i < p->sz->rnk; ++i) {
+	  int n0, i0, k0;
+	  trigfun ti, ts;
+
+	  d[i].n = n0 = p->sz->dims[i].n;
+	  if (p->k[i] > R2R_DHT)
+	       n0 = 2 * (n0 + (p->k[i] == R2R_REDFT00 ? -1 : 
+			       (p->k[i] == R2R_RODFT00 ? 1 : 0)));
+	  
+	  switch (p->k[i]) {
+	      case R2R_R2HC:
+		   i0 = k0 = 0;
+		   ti = realhalf;
+		   ts = coshalf;
+		   break;
+	      case R2R_DHT:
+		   i0 = k0 = 0;
+		   ti = unity;
+		   ts = cos00;
+		   break;
+	      case R2R_HC2R:
+		   i0 = k0 = 0;
+		   ti = unity;
+		   ts = cos00;
+		   break;
+	      case R2R_REDFT00:
+		   i0 = k0 = 0;
+		   ti = ts = cos00;
+		   break;
+	      case R2R_REDFT01:
+		   i0 = k0 = 0;
+		   ti = ts = cos01;
+		   break;
+	      case R2R_REDFT10:
+		   i0 = k0 = 0;
+		   ti = cos10; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_REDFT11:
+		   i0 = k0 = 0;
+		   ti = cos11; impulse_amp *= 2.0;
+		   ts = cos01;
+		   break;
+	      case R2R_RODFT00:
+		   i0 = k0 = 1;
+		   ti = sin00; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_RODFT01:
+		   i0 = 1; k0 = 0;
+		   ti = sin01; impulse_amp *= n == 1 ? 1.0 : 2.0;
+		   ts = cos01;
+		   break;
+	      case R2R_RODFT10:
+		   i0 = 0; k0 = 1;
+		   ti = sin10; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_RODFT11:
+		   i0 = k0 = 0;
+		   ti = sin11; impulse_amp *= 2.0;
+		   ts = cos01;
+		   break;
+	      default:
+		   BENCH_ASSERT(0);
+		   return;
+	  }
+
+	  d[i].n0 = n0;
+	  d[i].i0 = i0;
+	  d[i].k0 = k0;
+	  d[i].ti = ti;
+	  d[i].ts = ts;
+     }
+
+
+     inA = (R *) bench_malloc(N * sizeof(R));
+     inB = (R *) bench_malloc(N * sizeof(R));
+     inC = (R *) bench_malloc(N * sizeof(R));
+     outA = (R *) bench_malloc(N * sizeof(R));
+     outB = (R *) bench_malloc(N * sizeof(R));
+     outC = (R *) bench_malloc(N * sizeof(R));
+     tmp = (R *) bench_malloc(N * sizeof(R));
+
+     nfo.p = p;
+     nfo.probsz = p->sz;
+     nfo.totalsz = tensor_append(p->vecsz, nfo.probsz);
+     nfo.pckdsz = verify_pack(nfo.totalsz, 1);
+     nfo.pckdvecsz = verify_pack(p->vecsz, tensor_sz(nfo.probsz));
+
+     e->i = rimpulse(d, impulse_amp, n, vecn, &nfo,
+		     inA, inB, inC, outA, outB, outC, tmp, rounds, tol);
+     e->l = rlinear(N, &nfo, inA, inB, inC, outA, outB, outC, tmp, rounds,tol);
+     e->s = t_shift(n, vecn, &nfo, inA, inB, outA, outB, tmp, 
+		    rounds, tol, d);
+
+     /* grr, verify-lib.c:preserves_input() only works for complex */
+     if (!p->in_place && !p->destroy_input) {
+	  bench_tensor *totalsz_swap, *pckdsz_swap;
+	  totalsz_swap = tensor_copy_swapio(nfo.totalsz);
+	  pckdsz_swap = tensor_copy_swapio(nfo.pckdsz);
+
+	  for (i = 0; i < rounds; ++i) {
+	       rarand(inA, N);
+	       dofft(&nfo, inA, outB);
+	       cpyr((R *) nfo.p->in, totalsz_swap, inB, pckdsz_swap);
+	       racmp(inB, inA, N, "preserves_input", 0.0);
+	  }
+
+	  tensor_destroy(totalsz_swap);
+	  tensor_destroy(pckdsz_swap);
+     }
+
+     tensor_destroy(nfo.totalsz);
+     tensor_destroy(nfo.pckdsz);
+     tensor_destroy(nfo.pckdvecsz);
+     bench_free(tmp);
+     bench_free(outC);
+     bench_free(outB);
+     bench_free(outA);
+     bench_free(inC);
+     bench_free(inB);
+     bench_free(inA);
+     bench_free(d);
+}
+
+
+typedef struct {
+     dofft_closure k;
+     bench_problem *p;
+     int n0;
+} dofft_r2r_closure;
+
+static void cpyr1(int n, R *in, int is, R *out, int os, R scale)
+{
+     int i;
+     for (i = 0; i < n; ++i)
+	  out[i * os] = in[i * is] * scale;
+}
+
+static void mke00(C *a, int n, int c)
+{
+     int i;
+     for (i = 1; i + i < n; ++i)
+	  a[n - i][c] = a[i][c];
+}
+
+static void mkre00(C *a, int n)
+{
+     mkreal(a, n);
+     mke00(a, n, 0);
+}
+
+static void mkimag(C *a, int n)
+{
+     int i;
+     for (i = 0; i < n; ++i)
+	  c_re(a[i]) = 0.0;
+}
+
+static void mko00(C *a, int n, int c)
+{
+     int i;
+     a[0][c] = 0.0;
+     for (i = 1; i + i < n; ++i)
+	  a[n - i][c] = -a[i][c];
+     if (i + i == n)
+	  a[i][c] = 0.0;
+}
+
+static void mkro00(C *a, int n)
+{
+     mkreal(a, n);
+     mko00(a, n, 0);
+}
+
+static void mkio00(C *a, int n)
+{
+     mkimag(a, n);
+     mko00(a, n, 1);
+}
+
+static void mkre01(C *a, int n) /* n should be be multiple of 4 */
+{
+     R a0;
+     a0 = c_re(a[0]);
+     mko00(a, n/2, 0);
+     c_re(a[n/2]) = -(c_re(a[0]) = a0);
+     mkre00(a, n);
+}
+
+static void mkro01(C *a, int n) /* n should be be multiple of 4 */
+{
+     c_re(a[0]) = c_im(a[0]) = 0.0;
+     mkre00(a, n/2);
+     mkro00(a, n);
+}
+
+static void mkoddonly(C *a, int n)
+{
+     int i;
+     for (i = 0; i < n; i += 2)
+	  c_re(a[i]) = c_im(a[i]) = 0.0;
+}
+
+static void mkre10(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkre00(a, n);
+}
+
+static void mkio10(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkio00(a, n);
+}
+
+static void mkre11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mko00(a, n/2, 0);
+     mkre00(a, n);
+}
+
+static void mkro11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkre00(a, n/2);
+     mkro00(a, n);
+}
+
+static void mkio11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mke00(a, n/2, 1);
+     mkio00(a, n);
+}
+
+static void r2r_apply(dofft_closure *k_, bench_complex *in, bench_complex *out)
+{
+     dofft_r2r_closure *k = (dofft_r2r_closure *)k_;
+     bench_problem *p = k->p;
+     bench_real *ri, *ro;
+     int n, is, os;
+
+     n = p->sz->dims[0].n;
+     is = p->sz->dims[0].is;
+     os = p->sz->dims[0].os;
+
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     switch (p->k[0]) {
+	 case R2R_R2HC:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_HC2R:
+	      cpyr1(n/2 + 1, &c_re(in[0]), 2, ri, is, 1.0);
+	      cpyr1((n+1)/2 - 1, &c_im(in[n-1]), -2, ri + is*(n-1), -is, 1.0);
+	      break;
+	 case R2R_REDFT00:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT00:
+	      cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT01:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT10:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT01:
+	      cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT10:
+	      cpyr1(n, &c_im(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT11:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT11:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 default:
+	      BENCH_ASSERT(0); /* not yet implemented */
+     }
+
+     after_problem_rcopy_from(p, ri);
+     doit(1, p);
+     after_problem_rcopy_to(p, ro);
+
+     switch (p->k[0]) {
+	 case R2R_R2HC:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      cpyr1(n/2 + 1, ro, os, &c_re(out[0]), 2, 1.0);
+	      cpyr1((n+1)/2 - 1, ro + os*(n-1), -os, &c_im(out[1]), 2, 1.0);
+	      c_im(out[0]) = 0.0;
+	      if (n % 2 == 0)
+		   c_im(out[n/2]) = 0.0;
+	      mkhermitian1(out, n);
+	      break;
+	 case R2R_HC2R:
+	      if (k->k.recopy_input) {
+		   cpyr1(n/2 + 1, ri, is, &c_re(in[0]), 2, 1.0);
+		   cpyr1((n+1)/2 - 1, ri + is*(n-1), -is, &c_im(in[1]), 2,1.0);
+	      }
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkreal(out, n);
+	      break;
+	 case R2R_REDFT00:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkre00(out, k->n0);
+	      break;
+	 case R2R_RODFT00:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 2, -1.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 2, -1.0);
+	      mkio00(out, k->n0);
+	      break;
+	 case R2R_REDFT01:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
+	      mkre10(out, k->n0);
+	      break;
+	 case R2R_REDFT10:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkre01(out, k->n0);
+	      break;
+	 case R2R_RODFT01:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
+	      mkio10(out, k->n0);
+	      break;
+	 case R2R_RODFT10:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 2, 1.0);
+	      mkro01(out, k->n0);
+	      break;
+	 case R2R_REDFT11:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
+	      mkre11(out, k->n0);
+	      break;
+	 case R2R_RODFT11:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
+	      mkio11(out, k->n0);
+	      break;
+	 default:
+	      BENCH_ASSERT(0); /* not yet implemented */
+     }
+}
+
+void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6])
+{
+     dofft_r2r_closure k;
+     int n, n0 = 1;
+     C *a, *b;
+     aconstrain constrain = 0;
+
+     BENCH_ASSERT(p->kind == PROBLEM_R2R);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     k.k.apply = r2r_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+     n = tensor_sz(p->sz);
+     
+     switch (p->k[0]) {
+         case R2R_R2HC: constrain = mkreal; n0 = n; break;
+         case R2R_HC2R: constrain = mkhermitian1; n0 = n; break;
+         case R2R_REDFT00: constrain = mkre00; n0 = 2*(n-1); break;
+         case R2R_RODFT00: constrain = mkro00; n0 = 2*(n+1); break;
+         case R2R_REDFT01: constrain = mkre01; n0 = 4*n; break;
+         case R2R_REDFT10: constrain = mkre10; n0 = 4*n; break;
+         case R2R_RODFT01: constrain = mkro01; n0 = 4*n; break;
+         case R2R_RODFT10: constrain = mkio10; n0 = 4*n; break;
+         case R2R_REDFT11: constrain = mkre11; n0 = 8*n; break;
+         case R2R_RODFT11: constrain = mkro11; n0 = 8*n; break;
+	 default: BENCH_ASSERT(0); /* not yet implemented */
+     }
+     k.n0 = n0;
+
+     a = (C *) bench_malloc(n0 * sizeof(C));
+     b = (C *) bench_malloc(n0 * sizeof(C));
+     accuracy_test(&k.k, constrain, -1, n0, a, b, rounds, impulse_rounds, t);
+     bench_free(b);
+     bench_free(a);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify-rdft2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify-rdft2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+/* copy real A into real B, using output stride of A and input stride of B */
+typedef struct {
+     dotens2_closure k;
+     R *ra;
+     R *rb;
+} cpyr_closure;
+
+static void cpyr0(dotens2_closure *k_,
+                  int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyr_closure *k = (cpyr_closure *)k_;
+     k->rb[indxb] = k->ra[ondxa];
+     UNUSED(indxa); UNUSED(ondxb);
+}
+
+static void cpyr(R *ra, const bench_tensor *sza, 
+		 R *rb, const bench_tensor *szb)
+{
+     cpyr_closure k;
+     k.k.apply = cpyr0;
+     k.ra = ra; k.rb = rb;
+     bench_dotens2(sza, szb, &k.k);
+}
+
+/* copy unpacked halfcomplex A[n] into packed-complex B[n], using output stride
+   of A and input stride of B.  Only copies non-redundant half; other
+   half must be copied via mkhermitian. */
+typedef struct {
+     dotens2_closure k;
+     int n;
+     int as;
+     int scalea;
+     R *ra, *ia;
+     R *rb, *ib;
+} cpyhc2_closure;
+
+static void cpyhc20(dotens2_closure *k_, 
+		    int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyhc2_closure *k = (cpyhc2_closure *)k_;
+     int i, n = k->n;
+     int scalea = k->scalea;
+     int as = k->as * scalea;
+     R *ra = k->ra + ondxa * scalea, *ia = k->ia + ondxa * scalea;
+     R *rb = k->rb + indxb, *ib = k->ib + indxb;
+     UNUSED(indxa); UNUSED(ondxb);
+
+     for (i = 0; i < n/2 + 1; ++i) {
+	  rb[2*i] = ra[as*i];
+	  ib[2*i] = ia[as*i];
+     }
+}
+
+static void cpyhc2(R *ra, R *ia,
+		   const bench_tensor *sza, const bench_tensor *vecsza,
+		   int scalea,
+		   R *rb, R *ib, const bench_tensor *szb)
+{
+     cpyhc2_closure k;
+     BENCH_ASSERT(sza->rnk <= 1);
+     k.k.apply = cpyhc20;
+     k.n = tensor_sz(sza);
+     k.scalea = scalea;
+     if (!FINITE_RNK(sza->rnk) || sza->rnk == 0)
+	  k.as = 0;
+     else
+	  k.as = sza->dims[0].os;
+     k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
+     bench_dotens2(vecsza, szb, &k.k);
+}
+
+/* icpyhc2 is the inverse of cpyhc2 */
+
+static void icpyhc20(dotens2_closure *k_, 
+		     int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyhc2_closure *k = (cpyhc2_closure *)k_;
+     int i, n = k->n;
+     int scalea = k->scalea;
+     int as = k->as * scalea;
+     R *ra = k->ra + indxa * scalea, *ia = k->ia + indxa * scalea;
+     R *rb = k->rb + ondxb, *ib = k->ib + ondxb;
+     UNUSED(ondxa); UNUSED(indxb);
+
+     for (i = 0; i < n/2 + 1; ++i) {
+	  ra[as*i] = rb[2*i];
+	  ia[as*i] = ib[2*i];
+     }
+}
+
+static void icpyhc2(R *ra, R *ia, 
+		    const bench_tensor *sza, const bench_tensor *vecsza,
+		    int scalea,
+		    R *rb, R *ib, const bench_tensor *szb)
+{
+     cpyhc2_closure k;
+     BENCH_ASSERT(sza->rnk <= 1);
+     k.k.apply = icpyhc20;
+     k.n = tensor_sz(sza);
+     k.scalea = scalea;
+     if (!FINITE_RNK(sza->rnk) || sza->rnk == 0)
+	  k.as = 0;
+     else
+	  k.as = sza->dims[0].is;
+     k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
+     bench_dotens2(vecsza, szb, &k.k);
+}
+
+typedef struct {
+     dofft_closure k;
+     bench_problem *p;
+} dofft_rdft2_closure;
+
+static void rdft2_apply(dofft_closure *k_, 
+			bench_complex *in, bench_complex *out)
+{
+     dofft_rdft2_closure *k = (dofft_rdft2_closure *)k_;
+     bench_problem *p = k->p;
+     bench_tensor *totalsz, *pckdsz, *totalsz_swap, *pckdsz_swap;
+     bench_tensor *probsz2, *totalsz2, *pckdsz2;
+     bench_tensor *probsz2_swap, *totalsz2_swap, *pckdsz2_swap;
+     bench_real *ri, *ii, *ro, *io;
+     int n2, totalscale;
+
+     totalsz = tensor_append(p->vecsz, p->sz);
+     pckdsz = verify_pack(totalsz, 2);
+     n2 = tensor_sz(totalsz);
+     if (FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0)
+	  n2 = (n2 / p->sz->dims[p->sz->rnk - 1].n) * 
+	       (p->sz->dims[p->sz->rnk - 1].n / 2 + 1);
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     if (FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0 && n2 > 0) {
+	  probsz2 = tensor_copy_sub(p->sz, p->sz->rnk - 1, 1);
+	  totalsz2 = tensor_copy_sub(totalsz, 0, totalsz->rnk - 1);
+	  pckdsz2 = tensor_copy_sub(pckdsz, 0, pckdsz->rnk - 1);
+     }
+     else {
+	  probsz2 = mktensor(0);
+	  totalsz2 = tensor_copy(totalsz);
+	  pckdsz2 = tensor_copy(pckdsz);
+     }
+
+     totalsz_swap = tensor_copy_swapio(totalsz);
+     pckdsz_swap = tensor_copy_swapio(pckdsz);
+     totalsz2_swap = tensor_copy_swapio(totalsz2);
+     pckdsz2_swap = tensor_copy_swapio(pckdsz2);
+     probsz2_swap = tensor_copy_swapio(probsz2);
+
+     /* confusion: the stride is the distance between complex elements
+	when using interleaved format, but it is the distance between
+	real elements when using split format */
+     if (p->split) {
+	  ii = p->ini ? (bench_real *) p->ini : ri + n2;
+	  io = p->outi ? (bench_real *) p->outi : ro + n2;
+	  totalscale = 1;
+     } else {
+	  ii = p->ini ? (bench_real *) p->ini : ri + 1;
+	  io = p->outi ? (bench_real *) p->outi : ro + 1;
+	  totalscale = 2;
+     }
+
+     if (p->sign < 0) { /* R2HC */
+	  int N, vN, i;
+	  cpyr(&c_re(in[0]), pckdsz, ri, totalsz);
+	  after_problem_rcopy_from(p, ri);
+	  doit(1, p);
+	  after_problem_hccopy_to(p, ro, io);
+	  if (k->k.recopy_input)
+	       cpyr(ri, totalsz_swap, &c_re(in[0]), pckdsz_swap);
+	  cpyhc2(ro, io, probsz2, totalsz2, totalscale,
+		 &c_re(out[0]), &c_im(out[0]), pckdsz2);
+	  N = tensor_sz(p->sz);
+	  vN = tensor_sz(p->vecsz);
+	  for (i = 0; i < vN; ++i)
+	       mkhermitian(out + i*N, p->sz->rnk, p->sz->dims, 1);
+     }
+     else { /* HC2R */
+	  icpyhc2(ri, ii, probsz2, totalsz2, totalscale,
+		  &c_re(in[0]), &c_im(in[0]), pckdsz2);
+	  after_problem_hccopy_from(p, ri, ii);
+	  doit(1, p);
+	  after_problem_rcopy_to(p, ro);
+	  if (k->k.recopy_input)
+	       cpyhc2(ri, ii, probsz2_swap, totalsz2_swap, totalscale,
+		      &c_re(in[0]), &c_im(in[0]), pckdsz2_swap);
+	  mkreal(out, tensor_sz(pckdsz));
+	  cpyr(ro, totalsz, &c_re(out[0]), pckdsz);
+     }
+
+     tensor_destroy(totalsz);
+     tensor_destroy(pckdsz);
+     tensor_destroy(totalsz_swap);
+     tensor_destroy(pckdsz_swap);
+     tensor_destroy(probsz2);
+     tensor_destroy(totalsz2);
+     tensor_destroy(pckdsz2);
+     tensor_destroy(probsz2_swap);
+     tensor_destroy(totalsz2_swap);
+     tensor_destroy(pckdsz2_swap);
+}
+
+void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
+{
+     C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     int n, vecn, N;
+     dofft_rdft2_closure k;
+
+     BENCH_ASSERT(p->kind == PROBLEM_REAL);
+
+     if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
+	  return;      /* give up */
+
+     k.k.apply = rdft2_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+
+     if (rounds == 0)
+	  rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     inA = (C *) bench_malloc(N * sizeof(C));
+     inB = (C *) bench_malloc(N * sizeof(C));
+     inC = (C *) bench_malloc(N * sizeof(C));
+     outA = (C *) bench_malloc(N * sizeof(C));
+     outB = (C *) bench_malloc(N * sizeof(C));
+     outC = (C *) bench_malloc(N * sizeof(C));
+     tmp = (C *) bench_malloc(N * sizeof(C));
+
+     e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC, 
+		    tmp, rounds, tol);
+     e->l = linear(&k.k, 1, N, inA, inB, inC, outA, outB, outC,
+		   tmp, rounds, tol);
+
+     e->s = 0.0;
+     if (p->sign < 0)
+	  e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
+				     inA, inB, outA, outB, 
+				     tmp, rounds, tol, TIME_SHIFT));
+     else
+	  e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
+				     inA, inB, outA, outB, 
+				     tmp, rounds, tol, FREQ_SHIFT));
+     
+     if (!p->in_place && !p->destroy_input)
+	  preserves_input(&k.k, p->sign < 0 ? mkreal : mkhermitian1,
+			  N, inA, inB, outB, rounds);
+
+     bench_free(tmp);
+     bench_free(outC);
+     bench_free(outB);
+     bench_free(outA);
+     bench_free(inC);
+     bench_free(inB);
+     bench_free(inA);
+}
+
+void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
+		    double t[6])
+{
+     dofft_rdft2_closure k;
+     int n;
+     C *a, *b;
+
+     BENCH_ASSERT(p->kind == PROBLEM_REAL);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     k.k.apply = rdft2_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+     n = tensor_sz(p->sz);
+
+     a = (C *) bench_malloc(n * sizeof(C));
+     b = (C *) bench_malloc(n * sizeof(C));
+     accuracy_test(&k.k, p->sign < 0 ? mkreal : mkhermitian1, p->sign, 
+		   n, a, b, rounds, impulse_rounds, t);
+     bench_free(b);
+     bench_free(a);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "verify.h"
+
+void verify_problem(bench_problem *p, int rounds, double tol)
+{
+     errors e;
+     const char *pstring = p->pstring ? p->pstring : "<unknown problem>";
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX: verify_dft(p, rounds, tol, &e); break;
+	 case PROBLEM_REAL: verify_rdft2(p, rounds, tol, &e); break;
+	 case PROBLEM_R2R: verify_r2r(p, rounds, tol, &e); break;
+     }
+
+     if (verbose)
+	  ovtpvt("%s %g %g %g\n", pstring, e.l, e.i, e.s);
+}
+
+void verify(const char *param, int rounds, double tol)
+{
+     bench_problem *p;
+
+     p = problem_parse(param);
+     problem_alloc(p);
+
+     if (!can_do(p)) {
+	  ovtpvt_err("No can_do for %s\n", p->pstring);
+	  BENCH_ASSERT(0);
+     }
+
+     problem_zero(p);
+     setup(p);
+
+     verify_problem(p, rounds, tol);
+
+     done(p);
+     problem_destroy(p);
+}
+
+
+static void do_accuracy(bench_problem *p, int rounds, int impulse_rounds)
+{
+     double t[6];
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX:
+	      accuracy_dft(p, rounds, impulse_rounds, t); break;
+	 case PROBLEM_REAL:
+	      accuracy_rdft2(p, rounds, impulse_rounds, t); break;
+	 case PROBLEM_R2R:
+	      accuracy_r2r(p, rounds, impulse_rounds, t); break;
+     }
+
+     /* t[0] : L1 error
+	t[1] : L2 error
+	t[2] : Linf error
+	t[3..5]: L1, L2, Linf backward error */
+     ovtpvt("%6.2e %6.2e %6.2e %6.2e %6.2e %6.2e\n", 
+	    t[0], t[1], t[2], t[3], t[4], t[5]);
+}
+
+void accuracy(const char *param, int rounds, int impulse_rounds)
+{
+     bench_problem *p;
+     p = problem_parse(param);
+     BENCH_ASSERT(can_do(p));
+     problem_alloc(p);
+     problem_zero(p);
+     setup(p);
+     do_accuracy(p, rounds, impulse_rounds);
+     done(p);
+     problem_destroy(p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/verify.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/verify.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "bench.h"
+
+typedef bench_real R;
+typedef bench_complex C;
+
+typedef struct dofft_closure_s {
+     void (*apply)(struct dofft_closure_s *k,
+		   bench_complex *in, bench_complex *out);
+     int recopy_input;
+} dofft_closure;
+
+double dmax(double x, double y);
+
+typedef void (*aconstrain)(C *a, int n);
+
+void arand(C *a, int n);
+void mkreal(C *A, int n);
+void mkhermitian(C *A, int rank, const bench_iodim *dim, int stride);
+void mkhermitian1(C *a, int n);
+void aadd(C *c, C *a, C *b, int n);
+void asub(C *c, C *a, C *b, int n);
+void arol(C *b, C *a, int n, int nb, int na);
+void aphase_shift(C *b, C *a, int n, int nb, int na, double sign);
+void ascale(C *a, C alpha, int n);
+double acmp(C *a, C *b, int n, const char *test, double tol);
+double mydrand(void);
+double impulse(dofft_closure *k,
+	       int n, int vecn, 
+	       C *inA, C *inB, C *inC,
+	       C *outA, C *outB, C *outC,
+	       C *tmp, int rounds, double tol);
+double linear(dofft_closure *k, int realp,
+	      int n, C *inA, C *inB, C *inC, C *outA,
+	      C *outB, C *outC, C *tmp, int rounds, double tol);
+void preserves_input(dofft_closure *k, aconstrain constrain,
+                     int n, C *inA, C *inB, C *outB, int rounds);
+
+enum { TIME_SHIFT, FREQ_SHIFT };
+double tf_shift(dofft_closure *k, int realp, const bench_tensor *sz,
+		int n, int vecn, double sign,
+		C *inA, C *inB, C *outA, C *outB, C *tmp,
+		int rounds, double tol, int which_shift);
+
+typedef struct dotens2_closure_s {
+     void (*apply)(struct dotens2_closure_s *k, 
+		   int indx0, int ondx0, int indx1, int ondx1);
+} dotens2_closure;
+
+void bench_dotens2(const bench_tensor *sz0, 
+		   const bench_tensor *sz1, dotens2_closure *k);
+
+void accuracy_test(dofft_closure *k, aconstrain constrain,
+		   int sign, int n, C *a, C *b, int rounds, int impulse_rounds,
+		   double t[6]);
+
+void accuracy_dft(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6]);
+void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
+		    double t[6]);
+void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6]);
+
+#if defined(BENCHFFT_LDOUBLE) && HAVE_COSL
+   typedef long double trigreal;
+#  define COS cosl
+#  define SIN sinl
+#  define TAN tanl
+#  define KTRIG(x) (x##L)
+#elif defined(BENCHFFT_QUAD) && HAVE_LIBQUADMATH
+   typedef __float128 trigreal;
+#  define COS cosq
+#  define SIN sinq
+#  define TAN tanq
+#  define KTRIG(x) (x##Q)
+extern trigreal cosq(trigreal);
+extern trigreal sinq(trigreal);
+extern trigreal tanq(trigreal);
+#else
+   typedef double trigreal;
+#  define COS cos
+#  define SIN sin
+#  define TAN tan
+#  define KTRIG(x) (x)
+#endif
+#define K2PI KTRIG(6.2831853071795864769252867665590057683943388)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/libbench2/zero.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/libbench2/zero.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "bench.h"
+
+/* set I/O arrays to zero.  Default routine */
+void problem_zero(bench_problem *p)
+{
+     bench_complex czero = {0, 0};
+     if (p->kind == PROBLEM_COMPLEX) {
+	  caset((bench_complex *) p->inphys, p->iphyssz, czero);
+	  caset((bench_complex *) p->outphys, p->ophyssz, czero);
+     } else if (p->kind == PROBLEM_R2R) {
+	  aset((bench_real *) p->inphys, p->iphyssz, 0.0);
+	  aset((bench_real *) p->outphys, p->ophyssz, 0.0);
+     } else if (p->kind == PROBLEM_REAL && p->sign < 0) {
+	  aset((bench_real *) p->inphys, p->iphyssz, 0.0);
+	  caset((bench_complex *) p->outphys, p->ophyssz, czero);
+     } else if (p->kind == PROBLEM_REAL && p->sign > 0) {
+	  caset((bench_complex *) p->inphys, p->iphyssz, czero);
+	  aset((bench_real *) p->outphys, p->ophyssz, 0.0);
+     } else {
+	  BENCH_ASSERT(0); /* TODO */
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/ltmain.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/ltmain.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,9661 @@
+
+# libtool (GNU libtool) 2.4.2
+# Written by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006,
+# 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions.  There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+# GNU Libtool is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# As a special exception to the GNU General Public License,
+# if you distribute this file as part of a program or library that
+# is built using GNU Libtool, you may include this file under the
+# same distribution terms that you use for the rest of that program.
+#
+# GNU Libtool is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Libtool; see the file COPYING.  If not, a copy
+# can be downloaded from http://www.gnu.org/licenses/gpl.html,
+# or obtained by writing to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# Usage: $progname [OPTION]... [MODE-ARG]...
+#
+# Provide generalized library-building support services.
+#
+#       --config             show all configuration variables
+#       --debug              enable verbose shell tracing
+#   -n, --dry-run            display commands without modifying any files
+#       --features           display basic configuration information and exit
+#       --mode=MODE          use operation mode MODE
+#       --preserve-dup-deps  don't remove duplicate dependency libraries
+#       --quiet, --silent    don't print informational messages
+#       --no-quiet, --no-silent
+#                            print informational messages (default)
+#       --no-warn            don't display warning messages
+#       --tag=TAG            use configuration variables from tag TAG
+#   -v, --verbose            print more informational messages than default
+#       --no-verbose         don't print the extra informational messages
+#       --version            print version information
+#   -h, --help, --help-all   print short, long, or detailed help message
+#
+# MODE must be one of the following:
+#
+#         clean              remove files from the build directory
+#         compile            compile a source file into a libtool object
+#         execute            automatically set library path, then run a program
+#         finish             complete the installation of libtool libraries
+#         install            install libraries or executables
+#         link               create a library or an executable
+#         uninstall          remove libraries from an installed directory
+#
+# MODE-ARGS vary depending on the MODE.  When passed as first option,
+# `--mode=MODE' may be abbreviated as `MODE' or a unique abbreviation of that.
+# Try `$progname --help --mode=MODE' for a more detailed description of MODE.
+#
+# When reporting a bug, please describe a test case to reproduce it and
+# include the following information:
+#
+#         host-triplet:	$host
+#         shell:		$SHELL
+#         compiler:		$LTCC
+#         compiler flags:		$LTCFLAGS
+#         linker:		$LD (gnu? $with_gnu_ld)
+#         $progname:	(GNU libtool) 2.4.2 Debian-2.4.2-1.1
+#         automake:	$automake_version
+#         autoconf:	$autoconf_version
+#
+# Report bugs to <bug-libtool@gnu.org>.
+# GNU libtool home page: <http://www.gnu.org/software/libtool/>.
+# General help using GNU software: <http://www.gnu.org/gethelp/>.
+
+PROGRAM=libtool
+PACKAGE=libtool
+VERSION="2.4.2 Debian-2.4.2-1.1"
+TIMESTAMP=""
+package_revision=1.3337
+
+# Be Bourne compatible
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
+fi
+BIN_SH=xpg4; export BIN_SH # for Tru64
+DUALCASE=1; export DUALCASE # for MKS sh
+
+# A function that is used when there is no print builtin or printf.
+func_fallback_echo ()
+{
+  eval 'cat <<_LTECHO_EOF
+$1
+_LTECHO_EOF'
+}
+
+# NLS nuisances: We save the old values to restore during execute mode.
+lt_user_locale=
+lt_safe_locale=
+for lt_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
+do
+  eval "if test \"\${$lt_var+set}\" = set; then
+          save_$lt_var=\$$lt_var
+          $lt_var=C
+	  export $lt_var
+	  lt_user_locale=\"$lt_var=\\\$save_\$lt_var; \$lt_user_locale\"
+	  lt_safe_locale=\"$lt_var=C; \$lt_safe_locale\"
+	fi"
+done
+LC_ALL=C
+LANGUAGE=C
+export LANGUAGE LC_ALL
+
+$lt_unset CDPATH
+
+
+# Work around backward compatibility issue on IRIX 6.5. On IRIX 6.4+, sh
+# is ksh but when the shell is invoked as "sh" and the current value of
+# the _XPG environment variable is not equal to 1 (one), the special
+# positional parameter $0, within a function call, is the name of the
+# function.
+progpath="$0"
+
+
+
+: ${CP="cp -f"}
+test "${ECHO+set}" = set || ECHO=${as_echo-'printf %s\n'}
+: ${MAKE="make"}
+: ${MKDIR="mkdir"}
+: ${MV="mv -f"}
+: ${RM="rm -f"}
+: ${SHELL="${CONFIG_SHELL-/bin/sh}"}
+: ${Xsed="$SED -e 1s/^X//"}
+
+# Global variables:
+EXIT_SUCCESS=0
+EXIT_FAILURE=1
+EXIT_MISMATCH=63  # $? = 63 is used to indicate version mismatch to missing.
+EXIT_SKIP=77	  # $? = 77 is used to indicate a skipped test to automake.
+
+exit_status=$EXIT_SUCCESS
+
+# Make sure IFS has a sensible default
+lt_nl='
+'
+IFS=" 	$lt_nl"
+
+dirname="s,/[^/]*$,,"
+basename="s,^.*/,,"
+
+# func_dirname file append nondir_replacement
+# Compute the dirname of FILE.  If nonempty, add APPEND to the result,
+# otherwise set result to NONDIR_REPLACEMENT.
+func_dirname ()
+{
+    func_dirname_result=`$ECHO "${1}" | $SED "$dirname"`
+    if test "X$func_dirname_result" = "X${1}"; then
+      func_dirname_result="${3}"
+    else
+      func_dirname_result="$func_dirname_result${2}"
+    fi
+} # func_dirname may be replaced by extended shell implementation
+
+
+# func_basename file
+func_basename ()
+{
+    func_basename_result=`$ECHO "${1}" | $SED "$basename"`
+} # func_basename may be replaced by extended shell implementation
+
+
+# func_dirname_and_basename file append nondir_replacement
+# perform func_basename and func_dirname in a single function
+# call:
+#   dirname:  Compute the dirname of FILE.  If nonempty,
+#             add APPEND to the result, otherwise set result
+#             to NONDIR_REPLACEMENT.
+#             value returned in "$func_dirname_result"
+#   basename: Compute filename of FILE.
+#             value retuned in "$func_basename_result"
+# Implementation must be kept synchronized with func_dirname
+# and func_basename. For efficiency, we do not delegate to
+# those functions but instead duplicate the functionality here.
+func_dirname_and_basename ()
+{
+    # Extract subdirectory from the argument.
+    func_dirname_result=`$ECHO "${1}" | $SED -e "$dirname"`
+    if test "X$func_dirname_result" = "X${1}"; then
+      func_dirname_result="${3}"
+    else
+      func_dirname_result="$func_dirname_result${2}"
+    fi
+    func_basename_result=`$ECHO "${1}" | $SED -e "$basename"`
+} # func_dirname_and_basename may be replaced by extended shell implementation
+
+
+# func_stripname prefix suffix name
+# strip PREFIX and SUFFIX off of NAME.
+# PREFIX and SUFFIX must not contain globbing or regex special
+# characters, hashes, percent signs, but SUFFIX may contain a leading
+# dot (in which case that matches only a dot).
+# func_strip_suffix prefix name
+func_stripname ()
+{
+    case ${2} in
+      .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
+      *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
+    esac
+} # func_stripname may be replaced by extended shell implementation
+
+
+# These SED scripts presuppose an absolute path with a trailing slash.
+pathcar='s,^/\([^/]*\).*$,\1,'
+pathcdr='s,^/[^/]*,,'
+removedotparts=':dotsl
+		s@/\./@/@g
+		t dotsl
+		s,/\.$,/,'
+collapseslashes='s@/\{1,\}@/@g'
+finalslash='s,/*$,/,'
+
+# func_normal_abspath PATH
+# Remove doubled-up and trailing slashes, "." path components,
+# and cancel out any ".." path components in PATH after making
+# it an absolute path.
+#             value returned in "$func_normal_abspath_result"
+func_normal_abspath ()
+{
+  # Start from root dir and reassemble the path.
+  func_normal_abspath_result=
+  func_normal_abspath_tpath=$1
+  func_normal_abspath_altnamespace=
+  case $func_normal_abspath_tpath in
+    "")
+      # Empty path, that just means $cwd.
+      func_stripname '' '/' "`pwd`"
+      func_normal_abspath_result=$func_stripname_result
+      return
+    ;;
+    # The next three entries are used to spot a run of precisely
+    # two leading slashes without using negated character classes;
+    # we take advantage of case's first-match behaviour.
+    ///*)
+      # Unusual form of absolute path, do nothing.
+    ;;
+    //*)
+      # Not necessarily an ordinary path; POSIX reserves leading '//'
+      # and for example Cygwin uses it to access remote file shares
+      # over CIFS/SMB, so we conserve a leading double slash if found.
+      func_normal_abspath_altnamespace=/
+    ;;
+    /*)
+      # Absolute path, do nothing.
+    ;;
+    *)
+      # Relative path, prepend $cwd.
+      func_normal_abspath_tpath=`pwd`/$func_normal_abspath_tpath
+    ;;
+  esac
+  # Cancel out all the simple stuff to save iterations.  We also want
+  # the path to end with a slash for ease of parsing, so make sure
+  # there is one (and only one) here.
+  func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
+        -e "$removedotparts" -e "$collapseslashes" -e "$finalslash"`
+  while :; do
+    # Processed it all yet?
+    if test "$func_normal_abspath_tpath" = / ; then
+      # If we ascended to the root using ".." the result may be empty now.
+      if test -z "$func_normal_abspath_result" ; then
+        func_normal_abspath_result=/
+      fi
+      break
+    fi
+    func_normal_abspath_tcomponent=`$ECHO "$func_normal_abspath_tpath" | $SED \
+        -e "$pathcar"`
+    func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
+        -e "$pathcdr"`
+    # Figure out what to do with it
+    case $func_normal_abspath_tcomponent in
+      "")
+        # Trailing empty path component, ignore it.
+      ;;
+      ..)
+        # Parent dir; strip last assembled component from result.
+        func_dirname "$func_normal_abspath_result"
+        func_normal_abspath_result=$func_dirname_result
+      ;;
+      *)
+        # Actual path component, append it.
+        func_normal_abspath_result=$func_normal_abspath_result/$func_normal_abspath_tcomponent
+      ;;
+    esac
+  done
+  # Restore leading double-slash if one was found on entry.
+  func_normal_abspath_result=$func_normal_abspath_altnamespace$func_normal_abspath_result
+}
+
+# func_relative_path SRCDIR DSTDIR
+# generates a relative path from SRCDIR to DSTDIR, with a trailing
+# slash if non-empty, suitable for immediately appending a filename
+# without needing to append a separator.
+#             value returned in "$func_relative_path_result"
+func_relative_path ()
+{
+  func_relative_path_result=
+  func_normal_abspath "$1"
+  func_relative_path_tlibdir=$func_normal_abspath_result
+  func_normal_abspath "$2"
+  func_relative_path_tbindir=$func_normal_abspath_result
+
+  # Ascend the tree starting from libdir
+  while :; do
+    # check if we have found a prefix of bindir
+    case $func_relative_path_tbindir in
+      $func_relative_path_tlibdir)
+        # found an exact match
+        func_relative_path_tcancelled=
+        break
+        ;;
+      $func_relative_path_tlibdir*)
+        # found a matching prefix
+        func_stripname "$func_relative_path_tlibdir" '' "$func_relative_path_tbindir"
+        func_relative_path_tcancelled=$func_stripname_result
+        if test -z "$func_relative_path_result"; then
+          func_relative_path_result=.
+        fi
+        break
+        ;;
+      *)
+        func_dirname $func_relative_path_tlibdir
+        func_relative_path_tlibdir=${func_dirname_result}
+        if test "x$func_relative_path_tlibdir" = x ; then
+          # Have to descend all the way to the root!
+          func_relative_path_result=../$func_relative_path_result
+          func_relative_path_tcancelled=$func_relative_path_tbindir
+          break
+        fi
+        func_relative_path_result=../$func_relative_path_result
+        ;;
+    esac
+  done
+
+  # Now calculate path; take care to avoid doubling-up slashes.
+  func_stripname '' '/' "$func_relative_path_result"
+  func_relative_path_result=$func_stripname_result
+  func_stripname '/' '/' "$func_relative_path_tcancelled"
+  if test "x$func_stripname_result" != x ; then
+    func_relative_path_result=${func_relative_path_result}/${func_stripname_result}
+  fi
+
+  # Normalisation. If bindir is libdir, return empty string,
+  # else relative path ending with a slash; either way, target
+  # file name can be directly appended.
+  if test ! -z "$func_relative_path_result"; then
+    func_stripname './' '' "$func_relative_path_result/"
+    func_relative_path_result=$func_stripname_result
+  fi
+}
+
+# The name of this program:
+func_dirname_and_basename "$progpath"
+progname=$func_basename_result
+
+# Make sure we have an absolute path for reexecution:
+case $progpath in
+  [\\/]*|[A-Za-z]:\\*) ;;
+  *[\\/]*)
+     progdir=$func_dirname_result
+     progdir=`cd "$progdir" && pwd`
+     progpath="$progdir/$progname"
+     ;;
+  *)
+     save_IFS="$IFS"
+     IFS=${PATH_SEPARATOR-:}
+     for progdir in $PATH; do
+       IFS="$save_IFS"
+       test -x "$progdir/$progname" && break
+     done
+     IFS="$save_IFS"
+     test -n "$progdir" || progdir=`pwd`
+     progpath="$progdir/$progname"
+     ;;
+esac
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed="${SED}"' -e 1s/^X//'
+sed_quote_subst='s/\([`"$\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\(["`\\]\)/\\\1/g'
+
+# Sed substitution that turns a string into a regex matching for the
+# string literally.
+sed_make_literal_regex='s,[].[^$\\*\/],\\&,g'
+
+# Sed substitution that converts a w32 file name or path
+# which contains forward slashes, into one that contains
+# (escaped) backslashes.  A very naive implementation.
+lt_sed_naive_backslashify='s|\\\\*|\\|g;s|/|\\|g;s|\\|\\\\|g'
+
+# Re-`\' parameter expansions in output of double_quote_subst that were
+# `\'-ed in input to the same.  If an odd number of `\' preceded a '$'
+# in input to double_quote_subst, that '$' was protected from expansion.
+# Since each input `\' is now two `\'s, look for any number of runs of
+# four `\'s followed by two `\'s and then a '$'.  `\' that '$'.
+bs='\\'
+bs2='\\\\'
+bs4='\\\\\\\\'
+dollar='\$'
+sed_double_backslash="\
+  s/$bs4/&\\
+/g
+  s/^$bs2$dollar/$bs&/
+  s/\\([^$bs]\\)$bs2$dollar/\\1$bs2$bs$dollar/g
+  s/\n//g"
+
+# Standard options:
+opt_dry_run=false
+opt_help=false
+opt_quiet=false
+opt_verbose=false
+opt_warning=:
+
+# func_echo arg...
+# Echo program name prefixed message, along with the current mode
+# name if it has been set yet.
+func_echo ()
+{
+    $ECHO "$progname: ${opt_mode+$opt_mode: }$*"
+}
+
+# func_verbose arg...
+# Echo program name prefixed message in verbose mode only.
+func_verbose ()
+{
+    $opt_verbose && func_echo ${1+"$@"}
+
+    # A bug in bash halts the script if the last line of a function
+    # fails when set -e is in force, so we need another command to
+    # work around that:
+    :
+}
+
+# func_echo_all arg...
+# Invoke $ECHO with all args, space-separated.
+func_echo_all ()
+{
+    $ECHO "$*"
+}
+
+# func_error arg...
+# Echo program name prefixed message to standard error.
+func_error ()
+{
+    $ECHO "$progname: ${opt_mode+$opt_mode: }"${1+"$@"} 1>&2
+}
+
+# func_warning arg...
+# Echo program name prefixed warning message to standard error.
+func_warning ()
+{
+    $opt_warning && $ECHO "$progname: ${opt_mode+$opt_mode: }warning: "${1+"$@"} 1>&2
+
+    # bash bug again:
+    :
+}
+
+# func_fatal_error arg...
+# Echo program name prefixed message to standard error, and exit.
+func_fatal_error ()
+{
+    func_error ${1+"$@"}
+    exit $EXIT_FAILURE
+}
+
+# func_fatal_help arg...
+# Echo program name prefixed message to standard error, followed by
+# a help hint, and exit.
+func_fatal_help ()
+{
+    func_error ${1+"$@"}
+    func_fatal_error "$help"
+}
+help="Try \`$progname --help' for more information."  ## default
+
+
+# func_grep expression filename
+# Check whether EXPRESSION matches any line of FILENAME, without output.
+func_grep ()
+{
+    $GREP "$1" "$2" >/dev/null 2>&1
+}
+
+
+# func_mkdir_p directory-path
+# Make sure the entire path to DIRECTORY-PATH is available.
+func_mkdir_p ()
+{
+    my_directory_path="$1"
+    my_dir_list=
+
+    if test -n "$my_directory_path" && test "$opt_dry_run" != ":"; then
+
+      # Protect directory names starting with `-'
+      case $my_directory_path in
+        -*) my_directory_path="./$my_directory_path" ;;
+      esac
+
+      # While some portion of DIR does not yet exist...
+      while test ! -d "$my_directory_path"; do
+        # ...make a list in topmost first order.  Use a colon delimited
+	# list incase some portion of path contains whitespace.
+        my_dir_list="$my_directory_path:$my_dir_list"
+
+        # If the last portion added has no slash in it, the list is done
+        case $my_directory_path in */*) ;; *) break ;; esac
+
+        # ...otherwise throw away the child directory and loop
+        my_directory_path=`$ECHO "$my_directory_path" | $SED -e "$dirname"`
+      done
+      my_dir_list=`$ECHO "$my_dir_list" | $SED 's,:*$,,'`
+
+      save_mkdir_p_IFS="$IFS"; IFS=':'
+      for my_dir in $my_dir_list; do
+	IFS="$save_mkdir_p_IFS"
+        # mkdir can fail with a `File exist' error if two processes
+        # try to create one of the directories concurrently.  Don't
+        # stop in that case!
+        $MKDIR "$my_dir" 2>/dev/null || :
+      done
+      IFS="$save_mkdir_p_IFS"
+
+      # Bail out if we (or some other process) failed to create a directory.
+      test -d "$my_directory_path" || \
+        func_fatal_error "Failed to create \`$1'"
+    fi
+}
+
+
+# func_mktempdir [string]
+# Make a temporary directory that won't clash with other running
+# libtool processes, and avoids race conditions if possible.  If
+# given, STRING is the basename for that directory.
+func_mktempdir ()
+{
+    my_template="${TMPDIR-/tmp}/${1-$progname}"
+
+    if test "$opt_dry_run" = ":"; then
+      # Return a directory name, but don't create it in dry-run mode
+      my_tmpdir="${my_template}-$$"
+    else
+
+      # If mktemp works, use that first and foremost
+      my_tmpdir=`mktemp -d "${my_template}-XXXXXXXX" 2>/dev/null`
+
+      if test ! -d "$my_tmpdir"; then
+        # Failing that, at least try and use $RANDOM to avoid a race
+        my_tmpdir="${my_template}-${RANDOM-0}$$"
+
+        save_mktempdir_umask=`umask`
+        umask 0077
+        $MKDIR "$my_tmpdir"
+        umask $save_mktempdir_umask
+      fi
+
+      # If we're not in dry-run mode, bomb out on failure
+      test -d "$my_tmpdir" || \
+        func_fatal_error "cannot create temporary directory \`$my_tmpdir'"
+    fi
+
+    $ECHO "$my_tmpdir"
+}
+
+
+# func_quote_for_eval arg
+# Aesthetically quote ARG to be evaled later.
+# This function returns two values: FUNC_QUOTE_FOR_EVAL_RESULT
+# is double-quoted, suitable for a subsequent eval, whereas
+# FUNC_QUOTE_FOR_EVAL_UNQUOTED_RESULT has merely all characters
+# which are still active within double quotes backslashified.
+func_quote_for_eval ()
+{
+    case $1 in
+      *[\\\`\"\$]*)
+	func_quote_for_eval_unquoted_result=`$ECHO "$1" | $SED "$sed_quote_subst"` ;;
+      *)
+        func_quote_for_eval_unquoted_result="$1" ;;
+    esac
+
+    case $func_quote_for_eval_unquoted_result in
+      # Double-quote args containing shell metacharacters to delay
+      # word splitting, command substitution and and variable
+      # expansion for a subsequent eval.
+      # Many Bourne shells cannot handle close brackets correctly
+      # in scan sets, so we specify it separately.
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+        func_quote_for_eval_result="\"$func_quote_for_eval_unquoted_result\""
+        ;;
+      *)
+        func_quote_for_eval_result="$func_quote_for_eval_unquoted_result"
+    esac
+}
+
+
+# func_quote_for_expand arg
+# Aesthetically quote ARG to be evaled later; same as above,
+# but do not quote variable references.
+func_quote_for_expand ()
+{
+    case $1 in
+      *[\\\`\"]*)
+	my_arg=`$ECHO "$1" | $SED \
+	    -e "$double_quote_subst" -e "$sed_double_backslash"` ;;
+      *)
+        my_arg="$1" ;;
+    esac
+
+    case $my_arg in
+      # Double-quote args containing shell metacharacters to delay
+      # word splitting and command substitution for a subsequent eval.
+      # Many Bourne shells cannot handle close brackets correctly
+      # in scan sets, so we specify it separately.
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+        my_arg="\"$my_arg\""
+        ;;
+    esac
+
+    func_quote_for_expand_result="$my_arg"
+}
+
+
+# func_show_eval cmd [fail_exp]
+# Unless opt_silent is true, then output CMD.  Then, if opt_dryrun is
+# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
+# is given, then evaluate it.
+func_show_eval ()
+{
+    my_cmd="$1"
+    my_fail_exp="${2-:}"
+
+    ${opt_silent-false} || {
+      func_quote_for_expand "$my_cmd"
+      eval "func_echo $func_quote_for_expand_result"
+    }
+
+    if ${opt_dry_run-false}; then :; else
+      eval "$my_cmd"
+      my_status=$?
+      if test "$my_status" -eq 0; then :; else
+	eval "(exit $my_status); $my_fail_exp"
+      fi
+    fi
+}
+
+
+# func_show_eval_locale cmd [fail_exp]
+# Unless opt_silent is true, then output CMD.  Then, if opt_dryrun is
+# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
+# is given, then evaluate it.  Use the saved locale for evaluation.
+func_show_eval_locale ()
+{
+    my_cmd="$1"
+    my_fail_exp="${2-:}"
+
+    ${opt_silent-false} || {
+      func_quote_for_expand "$my_cmd"
+      eval "func_echo $func_quote_for_expand_result"
+    }
+
+    if ${opt_dry_run-false}; then :; else
+      eval "$lt_user_locale
+	    $my_cmd"
+      my_status=$?
+      eval "$lt_safe_locale"
+      if test "$my_status" -eq 0; then :; else
+	eval "(exit $my_status); $my_fail_exp"
+      fi
+    fi
+}
+
+# func_tr_sh
+# Turn $1 into a string suitable for a shell variable name.
+# Result is stored in $func_tr_sh_result.  All characters
+# not in the set a-zA-Z0-9_ are replaced with '_'. Further,
+# if $1 begins with a digit, a '_' is prepended as well.
+func_tr_sh ()
+{
+  case $1 in
+  [0-9]* | *[!a-zA-Z0-9_]*)
+    func_tr_sh_result=`$ECHO "$1" | $SED 's/^\([0-9]\)/_\1/; s/[^a-zA-Z0-9_]/_/g'`
+    ;;
+  * )
+    func_tr_sh_result=$1
+    ;;
+  esac
+}
+
+
+# func_version
+# Echo version message to standard output and exit.
+func_version ()
+{
+    $opt_debug
+
+    $SED -n '/(C)/!b go
+	:more
+	/\./!{
+	  N
+	  s/\n# / /
+	  b more
+	}
+	:go
+	/^# '$PROGRAM' (GNU /,/# warranty; / {
+        s/^# //
+	s/^# *$//
+        s/\((C)\)[ 0-9,-]*\( [1-9][0-9]*\)/\1\2/
+        p
+     }' < "$progpath"
+     exit $?
+}
+
+# func_usage
+# Echo short help message to standard output and exit.
+func_usage ()
+{
+    $opt_debug
+
+    $SED -n '/^# Usage:/,/^#  *.*--help/ {
+        s/^# //
+	s/^# *$//
+	s/\$progname/'$progname'/
+	p
+    }' < "$progpath"
+    echo
+    $ECHO "run \`$progname --help | more' for full usage"
+    exit $?
+}
+
+# func_help [NOEXIT]
+# Echo long help message to standard output and exit,
+# unless 'noexit' is passed as argument.
+func_help ()
+{
+    $opt_debug
+
+    $SED -n '/^# Usage:/,/# Report bugs to/ {
+	:print
+        s/^# //
+	s/^# *$//
+	s*\$progname*'$progname'*
+	s*\$host*'"$host"'*
+	s*\$SHELL*'"$SHELL"'*
+	s*\$LTCC*'"$LTCC"'*
+	s*\$LTCFLAGS*'"$LTCFLAGS"'*
+	s*\$LD*'"$LD"'*
+	s/\$with_gnu_ld/'"$with_gnu_ld"'/
+	s/\$automake_version/'"`(${AUTOMAKE-automake} --version) 2>/dev/null |$SED 1q`"'/
+	s/\$autoconf_version/'"`(${AUTOCONF-autoconf} --version) 2>/dev/null |$SED 1q`"'/
+	p
+	d
+     }
+     /^# .* home page:/b print
+     /^# General help using/b print
+     ' < "$progpath"
+    ret=$?
+    if test -z "$1"; then
+      exit $ret
+    fi
+}
+
+# func_missing_arg argname
+# Echo program name prefixed message to standard error and set global
+# exit_cmd.
+func_missing_arg ()
+{
+    $opt_debug
+
+    func_error "missing argument for $1."
+    exit_cmd=exit
+}
+
+
+# func_split_short_opt shortopt
+# Set func_split_short_opt_name and func_split_short_opt_arg shell
+# variables after splitting SHORTOPT after the 2nd character.
+func_split_short_opt ()
+{
+    my_sed_short_opt='1s/^\(..\).*$/\1/;q'
+    my_sed_short_rest='1s/^..\(.*\)$/\1/;q'
+
+    func_split_short_opt_name=`$ECHO "$1" | $SED "$my_sed_short_opt"`
+    func_split_short_opt_arg=`$ECHO "$1" | $SED "$my_sed_short_rest"`
+} # func_split_short_opt may be replaced by extended shell implementation
+
+
+# func_split_long_opt longopt
+# Set func_split_long_opt_name and func_split_long_opt_arg shell
+# variables after splitting LONGOPT at the `=' sign.
+func_split_long_opt ()
+{
+    my_sed_long_opt='1s/^\(--[^=]*\)=.*/\1/;q'
+    my_sed_long_arg='1s/^--[^=]*=//'
+
+    func_split_long_opt_name=`$ECHO "$1" | $SED "$my_sed_long_opt"`
+    func_split_long_opt_arg=`$ECHO "$1" | $SED "$my_sed_long_arg"`
+} # func_split_long_opt may be replaced by extended shell implementation
+
+exit_cmd=:
+
+
+
+
+
+magic="%%%MAGIC variable%%%"
+magic_exe="%%%MAGIC EXE variable%%%"
+
+# Global variables.
+nonopt=
+preserve_args=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
+extracted_archives=
+extracted_serial=0
+
+# If this variable is set in any of the actions, the command in it
+# will be execed at the end.  This prevents here-documents from being
+# left over by shells.
+exec_cmd=
+
+# func_append var value
+# Append VALUE to the end of shell variable VAR.
+func_append ()
+{
+    eval "${1}=\$${1}\${2}"
+} # func_append may be replaced by extended shell implementation
+
+# func_append_quoted var value
+# Quote VALUE and append to the end of shell variable VAR, separated
+# by a space.
+func_append_quoted ()
+{
+    func_quote_for_eval "${2}"
+    eval "${1}=\$${1}\\ \$func_quote_for_eval_result"
+} # func_append_quoted may be replaced by extended shell implementation
+
+
+# func_arith arithmetic-term...
+func_arith ()
+{
+    func_arith_result=`expr "${@}"`
+} # func_arith may be replaced by extended shell implementation
+
+
+# func_len string
+# STRING may not start with a hyphen.
+func_len ()
+{
+    func_len_result=`expr "${1}" : ".*" 2>/dev/null || echo $max_cmd_len`
+} # func_len may be replaced by extended shell implementation
+
+
+# func_lo2o object
+func_lo2o ()
+{
+    func_lo2o_result=`$ECHO "${1}" | $SED "$lo2o"`
+} # func_lo2o may be replaced by extended shell implementation
+
+
+# func_xform libobj-or-source
+func_xform ()
+{
+    func_xform_result=`$ECHO "${1}" | $SED 's/\.[^.]*$/.lo/'`
+} # func_xform may be replaced by extended shell implementation
+
+
+# func_fatal_configuration arg...
+# Echo program name prefixed message to standard error, followed by
+# a configuration failure hint, and exit.
+func_fatal_configuration ()
+{
+    func_error ${1+"$@"}
+    func_error "See the $PACKAGE documentation for more information."
+    func_fatal_error "Fatal configuration error."
+}
+
+
+# func_config
+# Display the configuration for all the tags in this script.
+func_config ()
+{
+    re_begincf='^# ### BEGIN LIBTOOL'
+    re_endcf='^# ### END LIBTOOL'
+
+    # Default configuration.
+    $SED "1,/$re_begincf CONFIG/d;/$re_endcf CONFIG/,\$d" < "$progpath"
+
+    # Now print the configurations for the tags.
+    for tagname in $taglist; do
+      $SED -n "/$re_begincf TAG CONFIG: $tagname\$/,/$re_endcf TAG CONFIG: $tagname\$/p" < "$progpath"
+    done
+
+    exit $?
+}
+
+# func_features
+# Display the features supported by this script.
+func_features ()
+{
+    echo "host: $host"
+    if test "$build_libtool_libs" = yes; then
+      echo "enable shared libraries"
+    else
+      echo "disable shared libraries"
+    fi
+    if test "$build_old_libs" = yes; then
+      echo "enable static libraries"
+    else
+      echo "disable static libraries"
+    fi
+
+    exit $?
+}
+
+# func_enable_tag tagname
+# Verify that TAGNAME is valid, and either flag an error and exit, or
+# enable the TAGNAME tag.  We also add TAGNAME to the global $taglist
+# variable here.
+func_enable_tag ()
+{
+  # Global variable:
+  tagname="$1"
+
+  re_begincf="^# ### BEGIN LIBTOOL TAG CONFIG: $tagname\$"
+  re_endcf="^# ### END LIBTOOL TAG CONFIG: $tagname\$"
+  sed_extractcf="/$re_begincf/,/$re_endcf/p"
+
+  # Validate tagname.
+  case $tagname in
+    *[!-_A-Za-z0-9,/]*)
+      func_fatal_error "invalid tag name: $tagname"
+      ;;
+  esac
+
+  # Don't test for the "default" C tag, as we know it's
+  # there but not specially marked.
+  case $tagname in
+    CC) ;;
+    *)
+      if $GREP "$re_begincf" "$progpath" >/dev/null 2>&1; then
+	taglist="$taglist $tagname"
+
+	# Evaluate the configuration.  Be careful to quote the path
+	# and the sed script, to avoid splitting on whitespace, but
+	# also don't use non-portable quotes within backquotes within
+	# quotes we have to do it in 2 steps:
+	extractedcf=`$SED -n -e "$sed_extractcf" < "$progpath"`
+	eval "$extractedcf"
+      else
+	func_error "ignoring unknown tag $tagname"
+      fi
+      ;;
+  esac
+}
+
+# func_check_version_match
+# Ensure that we are using m4 macros, and libtool script from the same
+# release of libtool.
+func_check_version_match ()
+{
+  if test "$package_revision" != "$macro_revision"; then
+    if test "$VERSION" != "$macro_version"; then
+      if test -z "$macro_version"; then
+        cat >&2 <<_LT_EOF
+$progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
+$progname: definition of this LT_INIT comes from an older release.
+$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
+$progname: and run autoconf again.
+_LT_EOF
+      else
+        cat >&2 <<_LT_EOF
+$progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
+$progname: definition of this LT_INIT comes from $PACKAGE $macro_version.
+$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
+$progname: and run autoconf again.
+_LT_EOF
+      fi
+    else
+      cat >&2 <<_LT_EOF
+$progname: Version mismatch error.  This is $PACKAGE $VERSION, revision $package_revision,
+$progname: but the definition of this LT_INIT comes from revision $macro_revision.
+$progname: You should recreate aclocal.m4 with macros from revision $package_revision
+$progname: of $PACKAGE $VERSION and run autoconf again.
+_LT_EOF
+    fi
+
+    exit $EXIT_MISMATCH
+  fi
+}
+
+
+# Shorthand for --mode=foo, only valid as the first argument
+case $1 in
+clean|clea|cle|cl)
+  shift; set dummy --mode clean ${1+"$@"}; shift
+  ;;
+compile|compil|compi|comp|com|co|c)
+  shift; set dummy --mode compile ${1+"$@"}; shift
+  ;;
+execute|execut|execu|exec|exe|ex|e)
+  shift; set dummy --mode execute ${1+"$@"}; shift
+  ;;
+finish|finis|fini|fin|fi|f)
+  shift; set dummy --mode finish ${1+"$@"}; shift
+  ;;
+install|instal|insta|inst|ins|in|i)
+  shift; set dummy --mode install ${1+"$@"}; shift
+  ;;
+link|lin|li|l)
+  shift; set dummy --mode link ${1+"$@"}; shift
+  ;;
+uninstall|uninstal|uninsta|uninst|unins|unin|uni|un|u)
+  shift; set dummy --mode uninstall ${1+"$@"}; shift
+  ;;
+esac
+
+
+
+# Option defaults:
+opt_debug=:
+opt_dry_run=false
+opt_config=false
+opt_preserve_dup_deps=false
+opt_features=false
+opt_finish=false
+opt_help=false
+opt_help_all=false
+opt_silent=:
+opt_warning=:
+opt_verbose=:
+opt_silent=false
+opt_verbose=false
+
+
+# Parse options once, thoroughly.  This comes as soon as possible in the
+# script to make things like `--version' happen as quickly as we can.
+{
+  # this just eases exit handling
+  while test $# -gt 0; do
+    opt="$1"
+    shift
+    case $opt in
+      --debug|-x)	opt_debug='set -x'
+			func_echo "enabling shell trace mode"
+			$opt_debug
+			;;
+      --dry-run|--dryrun|-n)
+			opt_dry_run=:
+			;;
+      --config)
+			opt_config=:
+func_config
+			;;
+      --dlopen|-dlopen)
+			optarg="$1"
+			opt_dlopen="${opt_dlopen+$opt_dlopen
+}$optarg"
+			shift
+			;;
+      --preserve-dup-deps)
+			opt_preserve_dup_deps=:
+			;;
+      --features)
+			opt_features=:
+func_features
+			;;
+      --finish)
+			opt_finish=:
+set dummy --mode finish ${1+"$@"}; shift
+			;;
+      --help)
+			opt_help=:
+			;;
+      --help-all)
+			opt_help_all=:
+opt_help=': help-all'
+			;;
+      --mode)
+			test $# = 0 && func_missing_arg $opt && break
+			optarg="$1"
+			opt_mode="$optarg"
+case $optarg in
+  # Valid mode arguments:
+  clean|compile|execute|finish|install|link|relink|uninstall) ;;
+
+  # Catch anything else as an error
+  *) func_error "invalid argument for $opt"
+     exit_cmd=exit
+     break
+     ;;
+esac
+			shift
+			;;
+      --no-silent|--no-quiet)
+			opt_silent=false
+func_append preserve_args " $opt"
+			;;
+      --no-warning|--no-warn)
+			opt_warning=false
+func_append preserve_args " $opt"
+			;;
+      --no-verbose)
+			opt_verbose=false
+func_append preserve_args " $opt"
+			;;
+      --silent|--quiet)
+			opt_silent=:
+func_append preserve_args " $opt"
+        opt_verbose=false
+			;;
+      --verbose|-v)
+			opt_verbose=:
+func_append preserve_args " $opt"
+opt_silent=false
+			;;
+      --tag)
+			test $# = 0 && func_missing_arg $opt && break
+			optarg="$1"
+			opt_tag="$optarg"
+func_append preserve_args " $opt $optarg"
+func_enable_tag "$optarg"
+			shift
+			;;
+
+      -\?|-h)		func_usage				;;
+      --help)		func_help				;;
+      --version)	func_version				;;
+
+      # Separate optargs to long options:
+      --*=*)
+			func_split_long_opt "$opt"
+			set dummy "$func_split_long_opt_name" "$func_split_long_opt_arg" ${1+"$@"}
+			shift
+			;;
+
+      # Separate non-argument short options:
+      -\?*|-h*|-n*|-v*)
+			func_split_short_opt "$opt"
+			set dummy "$func_split_short_opt_name" "-$func_split_short_opt_arg" ${1+"$@"}
+			shift
+			;;
+
+      --)		break					;;
+      -*)		func_fatal_help "unrecognized option \`$opt'" ;;
+      *)		set dummy "$opt" ${1+"$@"};	shift; break  ;;
+    esac
+  done
+
+  # Validate options:
+
+  # save first non-option argument
+  if test "$#" -gt 0; then
+    nonopt="$opt"
+    shift
+  fi
+
+  # preserve --debug
+  test "$opt_debug" = : || func_append preserve_args " --debug"
+
+  case $host in
+    *cygwin* | *mingw* | *pw32* | *cegcc*)
+      # don't eliminate duplications in $postdeps and $predeps
+      opt_duplicate_compiler_generated_deps=:
+      ;;
+    *)
+      opt_duplicate_compiler_generated_deps=$opt_preserve_dup_deps
+      ;;
+  esac
+
+  $opt_help || {
+    # Sanity checks first:
+    func_check_version_match
+
+    if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
+      func_fatal_configuration "not configured to build any kind of library"
+    fi
+
+    # Darwin sucks
+    eval std_shrext=\"$shrext_cmds\"
+
+    # Only execute mode is allowed to have -dlopen flags.
+    if test -n "$opt_dlopen" && test "$opt_mode" != execute; then
+      func_error "unrecognized option \`-dlopen'"
+      $ECHO "$help" 1>&2
+      exit $EXIT_FAILURE
+    fi
+
+    # Change the help message to a mode-specific one.
+    generic_help="$help"
+    help="Try \`$progname --help --mode=$opt_mode' for more information."
+  }
+
+
+  # Bail if the options were screwed
+  $exit_cmd $EXIT_FAILURE
+}
+
+
+
+
+## ----------- ##
+##    Main.    ##
+## ----------- ##
+
+# func_lalib_p file
+# True iff FILE is a libtool `.la' library or `.lo' object file.
+# This function is only a basic sanity check; it will hardly flush out
+# determined imposters.
+func_lalib_p ()
+{
+    test -f "$1" &&
+      $SED -e 4q "$1" 2>/dev/null \
+        | $GREP "^# Generated by .*$PACKAGE" > /dev/null 2>&1
+}
+
+# func_lalib_unsafe_p file
+# True iff FILE is a libtool `.la' library or `.lo' object file.
+# This function implements the same check as func_lalib_p without
+# resorting to external programs.  To this end, it redirects stdin and
+# closes it afterwards, without saving the original file descriptor.
+# As a safety measure, use it only where a negative result would be
+# fatal anyway.  Works if `file' does not exist.
+func_lalib_unsafe_p ()
+{
+    lalib_p=no
+    if test -f "$1" && test -r "$1" && exec 5<&0 <"$1"; then
+	for lalib_p_l in 1 2 3 4
+	do
+	    read lalib_p_line
+	    case "$lalib_p_line" in
+		\#\ Generated\ by\ *$PACKAGE* ) lalib_p=yes; break;;
+	    esac
+	done
+	exec 0<&5 5<&-
+    fi
+    test "$lalib_p" = yes
+}
+
+# func_ltwrapper_script_p file
+# True iff FILE is a libtool wrapper script
+# This function is only a basic sanity check; it will hardly flush out
+# determined imposters.
+func_ltwrapper_script_p ()
+{
+    func_lalib_p "$1"
+}
+
+# func_ltwrapper_executable_p file
+# True iff FILE is a libtool wrapper executable
+# This function is only a basic sanity check; it will hardly flush out
+# determined imposters.
+func_ltwrapper_executable_p ()
+{
+    func_ltwrapper_exec_suffix=
+    case $1 in
+    *.exe) ;;
+    *) func_ltwrapper_exec_suffix=.exe ;;
+    esac
+    $GREP "$magic_exe" "$1$func_ltwrapper_exec_suffix" >/dev/null 2>&1
+}
+
+# func_ltwrapper_scriptname file
+# Assumes file is an ltwrapper_executable
+# uses $file to determine the appropriate filename for a
+# temporary ltwrapper_script.
+func_ltwrapper_scriptname ()
+{
+    func_dirname_and_basename "$1" "" "."
+    func_stripname '' '.exe' "$func_basename_result"
+    func_ltwrapper_scriptname_result="$func_dirname_result/$objdir/${func_stripname_result}_ltshwrapper"
+}
+
+# func_ltwrapper_p file
+# True iff FILE is a libtool wrapper script or wrapper executable
+# This function is only a basic sanity check; it will hardly flush out
+# determined imposters.
+func_ltwrapper_p ()
+{
+    func_ltwrapper_script_p "$1" || func_ltwrapper_executable_p "$1"
+}
+
+
+# func_execute_cmds commands fail_cmd
+# Execute tilde-delimited COMMANDS.
+# If FAIL_CMD is given, eval that upon failure.
+# FAIL_CMD may read-access the current command in variable CMD!
+func_execute_cmds ()
+{
+    $opt_debug
+    save_ifs=$IFS; IFS='~'
+    for cmd in $1; do
+      IFS=$save_ifs
+      eval cmd=\"$cmd\"
+      func_show_eval "$cmd" "${2-:}"
+    done
+    IFS=$save_ifs
+}
+
+
+# func_source file
+# Source FILE, adding directory component if necessary.
+# Note that it is not necessary on cygwin/mingw to append a dot to
+# FILE even if both FILE and FILE.exe exist: automatic-append-.exe
+# behavior happens only for exec(3), not for open(2)!  Also, sourcing
+# `FILE.' does not work on cygwin managed mounts.
+func_source ()
+{
+    $opt_debug
+    case $1 in
+    */* | *\\*)	. "$1" ;;
+    *)		. "./$1" ;;
+    esac
+}
+
+
+# func_resolve_sysroot PATH
+# Replace a leading = in PATH with a sysroot.  Store the result into
+# func_resolve_sysroot_result
+func_resolve_sysroot ()
+{
+  func_resolve_sysroot_result=$1
+  case $func_resolve_sysroot_result in
+  =*)
+    func_stripname '=' '' "$func_resolve_sysroot_result"
+    func_resolve_sysroot_result=$lt_sysroot$func_stripname_result
+    ;;
+  esac
+}
+
+# func_replace_sysroot PATH
+# If PATH begins with the sysroot, replace it with = and
+# store the result into func_replace_sysroot_result.
+func_replace_sysroot ()
+{
+  case "$lt_sysroot:$1" in
+  ?*:"$lt_sysroot"*)
+    func_stripname "$lt_sysroot" '' "$1"
+    func_replace_sysroot_result="=$func_stripname_result"
+    ;;
+  *)
+    # Including no sysroot.
+    func_replace_sysroot_result=$1
+    ;;
+  esac
+}
+
+# func_infer_tag arg
+# Infer tagged configuration to use if any are available and
+# if one wasn't chosen via the "--tag" command line option.
+# Only attempt this if the compiler in the base compile
+# command doesn't match the default compiler.
+# arg is usually of the form 'gcc ...'
+func_infer_tag ()
+{
+    $opt_debug
+    if test -n "$available_tags" && test -z "$tagname"; then
+      CC_quoted=
+      for arg in $CC; do
+	func_append_quoted CC_quoted "$arg"
+      done
+      CC_expanded=`func_echo_all $CC`
+      CC_quoted_expanded=`func_echo_all $CC_quoted`
+      case $@ in
+      # Blanks in the command may have been stripped by the calling shell,
+      # but not from the CC environment variable when configure was run.
+      " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
+      " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*) ;;
+      # Blanks at the start of $base_compile will cause this to fail
+      # if we don't check for them as well.
+      *)
+	for z in $available_tags; do
+	  if $GREP "^# ### BEGIN LIBTOOL TAG CONFIG: $z$" < "$progpath" > /dev/null; then
+	    # Evaluate the configuration.
+	    eval "`${SED} -n -e '/^# ### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^# ### END LIBTOOL TAG CONFIG: '$z'$/p' < $progpath`"
+	    CC_quoted=
+	    for arg in $CC; do
+	      # Double-quote args containing other shell metacharacters.
+	      func_append_quoted CC_quoted "$arg"
+	    done
+	    CC_expanded=`func_echo_all $CC`
+	    CC_quoted_expanded=`func_echo_all $CC_quoted`
+	    case "$@ " in
+	    " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
+	    " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*)
+	      # The compiler in the base compile command matches
+	      # the one in the tagged configuration.
+	      # Assume this is the tagged configuration we want.
+	      tagname=$z
+	      break
+	      ;;
+	    esac
+	  fi
+	done
+	# If $tagname still isn't set, then no tagged configuration
+	# was found and let the user know that the "--tag" command
+	# line option must be used.
+	if test -z "$tagname"; then
+	  func_echo "unable to infer tagged configuration"
+	  func_fatal_error "specify a tag with \`--tag'"
+#	else
+#	  func_verbose "using $tagname tagged configuration"
+	fi
+	;;
+      esac
+    fi
+}
+
+
+
+# func_write_libtool_object output_name pic_name nonpic_name
+# Create a libtool object file (analogous to a ".la" file),
+# but don't create it if we're doing a dry run.
+func_write_libtool_object ()
+{
+    write_libobj=${1}
+    if test "$build_libtool_libs" = yes; then
+      write_lobj=\'${2}\'
+    else
+      write_lobj=none
+    fi
+
+    if test "$build_old_libs" = yes; then
+      write_oldobj=\'${3}\'
+    else
+      write_oldobj=none
+    fi
+
+    $opt_dry_run || {
+      cat >${write_libobj}T <<EOF
+# $write_libobj - a libtool object file
+# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Name of the PIC object.
+pic_object=$write_lobj
+
+# Name of the non-PIC object
+non_pic_object=$write_oldobj
+
+EOF
+      $MV "${write_libobj}T" "${write_libobj}"
+    }
+}
+
+
+##################################################
+# FILE NAME AND PATH CONVERSION HELPER FUNCTIONS #
+##################################################
+
+# func_convert_core_file_wine_to_w32 ARG
+# Helper function used by file name conversion functions when $build is *nix,
+# and $host is mingw, cygwin, or some other w32 environment. Relies on a
+# correctly configured wine environment available, with the winepath program
+# in $build's $PATH.
+#
+# ARG is the $build file name to be converted to w32 format.
+# Result is available in $func_convert_core_file_wine_to_w32_result, and will
+# be empty on error (or when ARG is empty)
+func_convert_core_file_wine_to_w32 ()
+{
+  $opt_debug
+  func_convert_core_file_wine_to_w32_result="$1"
+  if test -n "$1"; then
+    # Unfortunately, winepath does not exit with a non-zero error code, so we
+    # are forced to check the contents of stdout. On the other hand, if the
+    # command is not found, the shell will set an exit code of 127 and print
+    # *an error message* to stdout. So we must check for both error code of
+    # zero AND non-empty stdout, which explains the odd construction:
+    func_convert_core_file_wine_to_w32_tmp=`winepath -w "$1" 2>/dev/null`
+    if test "$?" -eq 0 && test -n "${func_convert_core_file_wine_to_w32_tmp}"; then
+      func_convert_core_file_wine_to_w32_result=`$ECHO "$func_convert_core_file_wine_to_w32_tmp" |
+        $SED -e "$lt_sed_naive_backslashify"`
+    else
+      func_convert_core_file_wine_to_w32_result=
+    fi
+  fi
+}
+# end: func_convert_core_file_wine_to_w32
+
+
+# func_convert_core_path_wine_to_w32 ARG
+# Helper function used by path conversion functions when $build is *nix, and
+# $host is mingw, cygwin, or some other w32 environment. Relies on a correctly
+# configured wine environment available, with the winepath program in $build's
+# $PATH. Assumes ARG has no leading or trailing path separator characters.
+#
+# ARG is path to be converted from $build format to win32.
+# Result is available in $func_convert_core_path_wine_to_w32_result.
+# Unconvertible file (directory) names in ARG are skipped; if no directory names
+# are convertible, then the result may be empty.
+func_convert_core_path_wine_to_w32 ()
+{
+  $opt_debug
+  # unfortunately, winepath doesn't convert paths, only file names
+  func_convert_core_path_wine_to_w32_result=""
+  if test -n "$1"; then
+    oldIFS=$IFS
+    IFS=:
+    for func_convert_core_path_wine_to_w32_f in $1; do
+      IFS=$oldIFS
+      func_convert_core_file_wine_to_w32 "$func_convert_core_path_wine_to_w32_f"
+      if test -n "$func_convert_core_file_wine_to_w32_result" ; then
+        if test -z "$func_convert_core_path_wine_to_w32_result"; then
+          func_convert_core_path_wine_to_w32_result="$func_convert_core_file_wine_to_w32_result"
+        else
+          func_append func_convert_core_path_wine_to_w32_result ";$func_convert_core_file_wine_to_w32_result"
+        fi
+      fi
+    done
+    IFS=$oldIFS
+  fi
+}
+# end: func_convert_core_path_wine_to_w32
+
+
+# func_cygpath ARGS...
+# Wrapper around calling the cygpath program via LT_CYGPATH. This is used when
+# when (1) $build is *nix and Cygwin is hosted via a wine environment; or (2)
+# $build is MSYS and $host is Cygwin, or (3) $build is Cygwin. In case (1) or
+# (2), returns the Cygwin file name or path in func_cygpath_result (input
+# file name or path is assumed to be in w32 format, as previously converted
+# from $build's *nix or MSYS format). In case (3), returns the w32 file name
+# or path in func_cygpath_result (input file name or path is assumed to be in
+# Cygwin format). Returns an empty string on error.
+#
+# ARGS are passed to cygpath, with the last one being the file name or path to
+# be converted.
+#
+# Specify the absolute *nix (or w32) name to cygpath in the LT_CYGPATH
+# environment variable; do not put it in $PATH.
+func_cygpath ()
+{
+  $opt_debug
+  if test -n "$LT_CYGPATH" && test -f "$LT_CYGPATH"; then
+    func_cygpath_result=`$LT_CYGPATH "$@" 2>/dev/null`
+    if test "$?" -ne 0; then
+      # on failure, ensure result is empty
+      func_cygpath_result=
+    fi
+  else
+    func_cygpath_result=
+    func_error "LT_CYGPATH is empty or specifies non-existent file: \`$LT_CYGPATH'"
+  fi
+}
+#end: func_cygpath
+
+
+# func_convert_core_msys_to_w32 ARG
+# Convert file name or path ARG from MSYS format to w32 format.  Return
+# result in func_convert_core_msys_to_w32_result.
+func_convert_core_msys_to_w32 ()
+{
+  $opt_debug
+  # awkward: cmd appends spaces to result
+  func_convert_core_msys_to_w32_result=`( cmd //c echo "$1" ) 2>/dev/null |
+    $SED -e 's/[ ]*$//' -e "$lt_sed_naive_backslashify"`
+}
+#end: func_convert_core_msys_to_w32
+
+
+# func_convert_file_check ARG1 ARG2
+# Verify that ARG1 (a file name in $build format) was converted to $host
+# format in ARG2. Otherwise, emit an error message, but continue (resetting
+# func_to_host_file_result to ARG1).
+func_convert_file_check ()
+{
+  $opt_debug
+  if test -z "$2" && test -n "$1" ; then
+    func_error "Could not determine host file name corresponding to"
+    func_error "  \`$1'"
+    func_error "Continuing, but uninstalled executables may not work."
+    # Fallback:
+    func_to_host_file_result="$1"
+  fi
+}
+# end func_convert_file_check
+
+
+# func_convert_path_check FROM_PATHSEP TO_PATHSEP FROM_PATH TO_PATH
+# Verify that FROM_PATH (a path in $build format) was converted to $host
+# format in TO_PATH. Otherwise, emit an error message, but continue, resetting
+# func_to_host_file_result to a simplistic fallback value (see below).
+func_convert_path_check ()
+{
+  $opt_debug
+  if test -z "$4" && test -n "$3"; then
+    func_error "Could not determine the host path corresponding to"
+    func_error "  \`$3'"
+    func_error "Continuing, but uninstalled executables may not work."
+    # Fallback.  This is a deliberately simplistic "conversion" and
+    # should not be "improved".  See libtool.info.
+    if test "x$1" != "x$2"; then
+      lt_replace_pathsep_chars="s|$1|$2|g"
+      func_to_host_path_result=`echo "$3" |
+        $SED -e "$lt_replace_pathsep_chars"`
+    else
+      func_to_host_path_result="$3"
+    fi
+  fi
+}
+# end func_convert_path_check
+
+
+# func_convert_path_front_back_pathsep FRONTPAT BACKPAT REPL ORIG
+# Modifies func_to_host_path_result by prepending REPL if ORIG matches FRONTPAT
+# and appending REPL if ORIG matches BACKPAT.
+func_convert_path_front_back_pathsep ()
+{
+  $opt_debug
+  case $4 in
+  $1 ) func_to_host_path_result="$3$func_to_host_path_result"
+    ;;
+  esac
+  case $4 in
+  $2 ) func_append func_to_host_path_result "$3"
+    ;;
+  esac
+}
+# end func_convert_path_front_back_pathsep
+
+
+##################################################
+# $build to $host FILE NAME CONVERSION FUNCTIONS #
+##################################################
+# invoked via `$to_host_file_cmd ARG'
+#
+# In each case, ARG is the path to be converted from $build to $host format.
+# Result will be available in $func_to_host_file_result.
+
+
+# func_to_host_file ARG
+# Converts the file name ARG from $build format to $host format. Return result
+# in func_to_host_file_result.
+func_to_host_file ()
+{
+  $opt_debug
+  $to_host_file_cmd "$1"
+}
+# end func_to_host_file
+
+
+# func_to_tool_file ARG LAZY
+# converts the file name ARG from $build format to toolchain format. Return
+# result in func_to_tool_file_result.  If the conversion in use is listed
+# in (the comma separated) LAZY, no conversion takes place.
+func_to_tool_file ()
+{
+  $opt_debug
+  case ,$2, in
+    *,"$to_tool_file_cmd",*)
+      func_to_tool_file_result=$1
+      ;;
+    *)
+      $to_tool_file_cmd "$1"
+      func_to_tool_file_result=$func_to_host_file_result
+      ;;
+  esac
+}
+# end func_to_tool_file
+
+
+# func_convert_file_noop ARG
+# Copy ARG to func_to_host_file_result.
+func_convert_file_noop ()
+{
+  func_to_host_file_result="$1"
+}
+# end func_convert_file_noop
+
+
+# func_convert_file_msys_to_w32 ARG
+# Convert file name ARG from (mingw) MSYS to (mingw) w32 format; automatic
+# conversion to w32 is not available inside the cwrapper.  Returns result in
+# func_to_host_file_result.
+func_convert_file_msys_to_w32 ()
+{
+  $opt_debug
+  func_to_host_file_result="$1"
+  if test -n "$1"; then
+    func_convert_core_msys_to_w32 "$1"
+    func_to_host_file_result="$func_convert_core_msys_to_w32_result"
+  fi
+  func_convert_file_check "$1" "$func_to_host_file_result"
+}
+# end func_convert_file_msys_to_w32
+
+
+# func_convert_file_cygwin_to_w32 ARG
+# Convert file name ARG from Cygwin to w32 format.  Returns result in
+# func_to_host_file_result.
+func_convert_file_cygwin_to_w32 ()
+{
+  $opt_debug
+  func_to_host_file_result="$1"
+  if test -n "$1"; then
+    # because $build is cygwin, we call "the" cygpath in $PATH; no need to use
+    # LT_CYGPATH in this case.
+    func_to_host_file_result=`cygpath -m "$1"`
+  fi
+  func_convert_file_check "$1" "$func_to_host_file_result"
+}
+# end func_convert_file_cygwin_to_w32
+
+
+# func_convert_file_nix_to_w32 ARG
+# Convert file name ARG from *nix to w32 format.  Requires a wine environment
+# and a working winepath. Returns result in func_to_host_file_result.
+func_convert_file_nix_to_w32 ()
+{
+  $opt_debug
+  func_to_host_file_result="$1"
+  if test -n "$1"; then
+    func_convert_core_file_wine_to_w32 "$1"
+    func_to_host_file_result="$func_convert_core_file_wine_to_w32_result"
+  fi
+  func_convert_file_check "$1" "$func_to_host_file_result"
+}
+# end func_convert_file_nix_to_w32
+
+
+# func_convert_file_msys_to_cygwin ARG
+# Convert file name ARG from MSYS to Cygwin format.  Requires LT_CYGPATH set.
+# Returns result in func_to_host_file_result.
+func_convert_file_msys_to_cygwin ()
+{
+  $opt_debug
+  func_to_host_file_result="$1"
+  if test -n "$1"; then
+    func_convert_core_msys_to_w32 "$1"
+    func_cygpath -u "$func_convert_core_msys_to_w32_result"
+    func_to_host_file_result="$func_cygpath_result"
+  fi
+  func_convert_file_check "$1" "$func_to_host_file_result"
+}
+# end func_convert_file_msys_to_cygwin
+
+
+# func_convert_file_nix_to_cygwin ARG
+# Convert file name ARG from *nix to Cygwin format.  Requires Cygwin installed
+# in a wine environment, working winepath, and LT_CYGPATH set.  Returns result
+# in func_to_host_file_result.
+func_convert_file_nix_to_cygwin ()
+{
+  $opt_debug
+  func_to_host_file_result="$1"
+  if test -n "$1"; then
+    # convert from *nix to w32, then use cygpath to convert from w32 to cygwin.
+    func_convert_core_file_wine_to_w32 "$1"
+    func_cygpath -u "$func_convert_core_file_wine_to_w32_result"
+    func_to_host_file_result="$func_cygpath_result"
+  fi
+  func_convert_file_check "$1" "$func_to_host_file_result"
+}
+# end func_convert_file_nix_to_cygwin
+
+
+#############################################
+# $build to $host PATH CONVERSION FUNCTIONS #
+#############################################
+# invoked via `$to_host_path_cmd ARG'
+#
+# In each case, ARG is the path to be converted from $build to $host format.
+# The result will be available in $func_to_host_path_result.
+#
+# Path separators are also converted from $build format to $host format.  If
+# ARG begins or ends with a path separator character, it is preserved (but
+# converted to $host format) on output.
+#
+# All path conversion functions are named using the following convention:
+#   file name conversion function    : func_convert_file_X_to_Y ()
+#   path conversion function         : func_convert_path_X_to_Y ()
+# where, for any given $build/$host combination the 'X_to_Y' value is the
+# same.  If conversion functions are added for new $build/$host combinations,
+# the two new functions must follow this pattern, or func_init_to_host_path_cmd
+# will break.
+
+
+# func_init_to_host_path_cmd
+# Ensures that function "pointer" variable $to_host_path_cmd is set to the
+# appropriate value, based on the value of $to_host_file_cmd.
+to_host_path_cmd=
+func_init_to_host_path_cmd ()
+{
+  $opt_debug
+  if test -z "$to_host_path_cmd"; then
+    func_stripname 'func_convert_file_' '' "$to_host_file_cmd"
+    to_host_path_cmd="func_convert_path_${func_stripname_result}"
+  fi
+}
+
+
+# func_to_host_path ARG
+# Converts the path ARG from $build format to $host format. Return result
+# in func_to_host_path_result.
+func_to_host_path ()
+{
+  $opt_debug
+  func_init_to_host_path_cmd
+  $to_host_path_cmd "$1"
+}
+# end func_to_host_path
+
+
+# func_convert_path_noop ARG
+# Copy ARG to func_to_host_path_result.
+func_convert_path_noop ()
+{
+  func_to_host_path_result="$1"
+}
+# end func_convert_path_noop
+
+
+# func_convert_path_msys_to_w32 ARG
+# Convert path ARG from (mingw) MSYS to (mingw) w32 format; automatic
+# conversion to w32 is not available inside the cwrapper.  Returns result in
+# func_to_host_path_result.
+func_convert_path_msys_to_w32 ()
+{
+  $opt_debug
+  func_to_host_path_result="$1"
+  if test -n "$1"; then
+    # Remove leading and trailing path separator characters from ARG.  MSYS
+    # behavior is inconsistent here; cygpath turns them into '.;' and ';.';
+    # and winepath ignores them completely.
+    func_stripname : : "$1"
+    func_to_host_path_tmp1=$func_stripname_result
+    func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
+    func_to_host_path_result="$func_convert_core_msys_to_w32_result"
+    func_convert_path_check : ";" \
+      "$func_to_host_path_tmp1" "$func_to_host_path_result"
+    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
+  fi
+}
+# end func_convert_path_msys_to_w32
+
+
+# func_convert_path_cygwin_to_w32 ARG
+# Convert path ARG from Cygwin to w32 format.  Returns result in
+# func_to_host_file_result.
+func_convert_path_cygwin_to_w32 ()
+{
+  $opt_debug
+  func_to_host_path_result="$1"
+  if test -n "$1"; then
+    # See func_convert_path_msys_to_w32:
+    func_stripname : : "$1"
+    func_to_host_path_tmp1=$func_stripname_result
+    func_to_host_path_result=`cygpath -m -p "$func_to_host_path_tmp1"`
+    func_convert_path_check : ";" \
+      "$func_to_host_path_tmp1" "$func_to_host_path_result"
+    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
+  fi
+}
+# end func_convert_path_cygwin_to_w32
+
+
+# func_convert_path_nix_to_w32 ARG
+# Convert path ARG from *nix to w32 format.  Requires a wine environment and
+# a working winepath.  Returns result in func_to_host_file_result.
+func_convert_path_nix_to_w32 ()
+{
+  $opt_debug
+  func_to_host_path_result="$1"
+  if test -n "$1"; then
+    # See func_convert_path_msys_to_w32:
+    func_stripname : : "$1"
+    func_to_host_path_tmp1=$func_stripname_result
+    func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
+    func_to_host_path_result="$func_convert_core_path_wine_to_w32_result"
+    func_convert_path_check : ";" \
+      "$func_to_host_path_tmp1" "$func_to_host_path_result"
+    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
+  fi
+}
+# end func_convert_path_nix_to_w32
+
+
+# func_convert_path_msys_to_cygwin ARG
+# Convert path ARG from MSYS to Cygwin format.  Requires LT_CYGPATH set.
+# Returns result in func_to_host_file_result.
+func_convert_path_msys_to_cygwin ()
+{
+  $opt_debug
+  func_to_host_path_result="$1"
+  if test -n "$1"; then
+    # See func_convert_path_msys_to_w32:
+    func_stripname : : "$1"
+    func_to_host_path_tmp1=$func_stripname_result
+    func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
+    func_cygpath -u -p "$func_convert_core_msys_to_w32_result"
+    func_to_host_path_result="$func_cygpath_result"
+    func_convert_path_check : : \
+      "$func_to_host_path_tmp1" "$func_to_host_path_result"
+    func_convert_path_front_back_pathsep ":*" "*:" : "$1"
+  fi
+}
+# end func_convert_path_msys_to_cygwin
+
+
+# func_convert_path_nix_to_cygwin ARG
+# Convert path ARG from *nix to Cygwin format.  Requires Cygwin installed in a
+# a wine environment, working winepath, and LT_CYGPATH set.  Returns result in
+# func_to_host_file_result.
+func_convert_path_nix_to_cygwin ()
+{
+  $opt_debug
+  func_to_host_path_result="$1"
+  if test -n "$1"; then
+    # Remove leading and trailing path separator characters from
+    # ARG. msys behavior is inconsistent here, cygpath turns them
+    # into '.;' and ';.', and winepath ignores them completely.
+    func_stripname : : "$1"
+    func_to_host_path_tmp1=$func_stripname_result
+    func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
+    func_cygpath -u -p "$func_convert_core_path_wine_to_w32_result"
+    func_to_host_path_result="$func_cygpath_result"
+    func_convert_path_check : : \
+      "$func_to_host_path_tmp1" "$func_to_host_path_result"
+    func_convert_path_front_back_pathsep ":*" "*:" : "$1"
+  fi
+}
+# end func_convert_path_nix_to_cygwin
+
+
+# func_mode_compile arg...
+func_mode_compile ()
+{
+    $opt_debug
+    # Get the compilation command and the source file.
+    base_compile=
+    srcfile="$nonopt"  #  always keep a non-empty value in "srcfile"
+    suppress_opt=yes
+    suppress_output=
+    arg_mode=normal
+    libobj=
+    later=
+    pie_flag=
+
+    for arg
+    do
+      case $arg_mode in
+      arg  )
+	# do not "continue".  Instead, add this to base_compile
+	lastarg="$arg"
+	arg_mode=normal
+	;;
+
+      target )
+	libobj="$arg"
+	arg_mode=normal
+	continue
+	;;
+
+      normal )
+	# Accept any command-line options.
+	case $arg in
+	-o)
+	  test -n "$libobj" && \
+	    func_fatal_error "you cannot specify \`-o' more than once"
+	  arg_mode=target
+	  continue
+	  ;;
+
+	-pie | -fpie | -fPIE)
+          func_append pie_flag " $arg"
+	  continue
+	  ;;
+
+	-shared | -static | -prefer-pic | -prefer-non-pic)
+	  func_append later " $arg"
+	  continue
+	  ;;
+
+	-no-suppress)
+	  suppress_opt=no
+	  continue
+	  ;;
+
+	-Xcompiler)
+	  arg_mode=arg  #  the next one goes into the "base_compile" arg list
+	  continue      #  The current "srcfile" will either be retained or
+	  ;;            #  replaced later.  I would guess that would be a bug.
+
+	-Wc,*)
+	  func_stripname '-Wc,' '' "$arg"
+	  args=$func_stripname_result
+	  lastarg=
+	  save_ifs="$IFS"; IFS=','
+	  for arg in $args; do
+	    IFS="$save_ifs"
+	    func_append_quoted lastarg "$arg"
+	  done
+	  IFS="$save_ifs"
+	  func_stripname ' ' '' "$lastarg"
+	  lastarg=$func_stripname_result
+
+	  # Add the arguments to base_compile.
+	  func_append base_compile " $lastarg"
+	  continue
+	  ;;
+
+	*)
+	  # Accept the current argument as the source file.
+	  # The previous "srcfile" becomes the current argument.
+	  #
+	  lastarg="$srcfile"
+	  srcfile="$arg"
+	  ;;
+	esac  #  case $arg
+	;;
+      esac    #  case $arg_mode
+
+      # Aesthetically quote the previous argument.
+      func_append_quoted base_compile "$lastarg"
+    done # for arg
+
+    case $arg_mode in
+    arg)
+      func_fatal_error "you must specify an argument for -Xcompile"
+      ;;
+    target)
+      func_fatal_error "you must specify a target with \`-o'"
+      ;;
+    *)
+      # Get the name of the library object.
+      test -z "$libobj" && {
+	func_basename "$srcfile"
+	libobj="$func_basename_result"
+      }
+      ;;
+    esac
+
+    # Recognize several different file suffixes.
+    # If the user specifies -o file.o, it is replaced with file.lo
+    case $libobj in
+    *.[cCFSifmso] | \
+    *.ada | *.adb | *.ads | *.asm | \
+    *.c++ | *.cc | *.ii | *.class | *.cpp | *.cxx | \
+    *.[fF][09]? | *.for | *.java | *.go | *.obj | *.sx | *.cu | *.cup)
+      func_xform "$libobj"
+      libobj=$func_xform_result
+      ;;
+    esac
+
+    case $libobj in
+    *.lo) func_lo2o "$libobj"; obj=$func_lo2o_result ;;
+    *)
+      func_fatal_error "cannot determine name of library object from \`$libobj'"
+      ;;
+    esac
+
+    func_infer_tag $base_compile
+
+    for arg in $later; do
+      case $arg in
+      -shared)
+	test "$build_libtool_libs" != yes && \
+	  func_fatal_configuration "can not build a shared library"
+	build_old_libs=no
+	continue
+	;;
+
+      -static)
+	build_libtool_libs=no
+	build_old_libs=yes
+	continue
+	;;
+
+      -prefer-pic)
+	pic_mode=yes
+	continue
+	;;
+
+      -prefer-non-pic)
+	pic_mode=no
+	continue
+	;;
+      esac
+    done
+
+    func_quote_for_eval "$libobj"
+    test "X$libobj" != "X$func_quote_for_eval_result" \
+      && $ECHO "X$libobj" | $GREP '[]~#^*{};<>?"'"'"'	 &()|`$[]' \
+      && func_warning "libobj name \`$libobj' may not contain shell special characters."
+    func_dirname_and_basename "$obj" "/" ""
+    objname="$func_basename_result"
+    xdir="$func_dirname_result"
+    lobj=${xdir}$objdir/$objname
+
+    test -z "$base_compile" && \
+      func_fatal_help "you must specify a compilation command"
+
+    # Delete any leftover library objects.
+    if test "$build_old_libs" = yes; then
+      removelist="$obj $lobj $libobj ${libobj}T"
+    else
+      removelist="$lobj $libobj ${libobj}T"
+    fi
+
+    # On Cygwin there's no "real" PIC flag so we must build both object types
+    case $host_os in
+    cygwin* | mingw* | pw32* | os2* | cegcc*)
+      pic_mode=default
+      ;;
+    esac
+    if test "$pic_mode" = no && test "$deplibs_check_method" != pass_all; then
+      # non-PIC code in shared libraries is not supported
+      pic_mode=default
+    fi
+
+    # Calculate the filename of the output object if compiler does
+    # not support -o with -c
+    if test "$compiler_c_o" = no; then
+      output_obj=`$ECHO "$srcfile" | $SED 's%^.*/%%; s%\.[^.]*$%%'`.${objext}
+      lockfile="$output_obj.lock"
+    else
+      output_obj=
+      need_locks=no
+      lockfile=
+    fi
+
+    # Lock this critical section if it is needed
+    # We use this script file to make the link, it avoids creating a new file
+    if test "$need_locks" = yes; then
+      until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do
+	func_echo "Waiting for $lockfile to be removed"
+	sleep 2
+      done
+    elif test "$need_locks" = warn; then
+      if test -f "$lockfile"; then
+	$ECHO "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$opt_dry_run || $RM $removelist
+	exit $EXIT_FAILURE
+      fi
+      func_append removelist " $output_obj"
+      $ECHO "$srcfile" > "$lockfile"
+    fi
+
+    $opt_dry_run || $RM $removelist
+    func_append removelist " $lockfile"
+    trap '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE' 1 2 15
+
+    func_to_tool_file "$srcfile" func_convert_file_msys_to_w32
+    srcfile=$func_to_tool_file_result
+    func_quote_for_eval "$srcfile"
+    qsrcfile=$func_quote_for_eval_result
+
+    # Only build a PIC object if we are building libtool libraries.
+    if test "$build_libtool_libs" = yes; then
+      # Without this assignment, base_compile gets emptied.
+      fbsd_hideous_sh_bug=$base_compile
+
+      if test "$pic_mode" != no; then
+	command="$base_compile $qsrcfile $pic_flag"
+      else
+	# Don't build PIC code
+	command="$base_compile $qsrcfile"
+      fi
+
+      func_mkdir_p "$xdir$objdir"
+
+      if test -z "$output_obj"; then
+	# Place PIC objects in $objdir
+	func_append command " -o $lobj"
+      fi
+
+      func_show_eval_locale "$command"	\
+          'test -n "$output_obj" && $RM $removelist; exit $EXIT_FAILURE'
+
+      if test "$need_locks" = warn &&
+	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
+	$ECHO "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$opt_dry_run || $RM $removelist
+	exit $EXIT_FAILURE
+      fi
+
+      # Just move the object if needed, then go on to compile the next one
+      if test -n "$output_obj" && test "X$output_obj" != "X$lobj"; then
+	func_show_eval '$MV "$output_obj" "$lobj"' \
+	  'error=$?; $opt_dry_run || $RM $removelist; exit $error'
+      fi
+
+      # Allow error messages only from the first compilation.
+      if test "$suppress_opt" = yes; then
+	suppress_output=' >/dev/null 2>&1'
+      fi
+    fi
+
+    # Only build a position-dependent object if we build old libraries.
+    if test "$build_old_libs" = yes; then
+      if test "$pic_mode" != yes; then
+	# Don't build PIC code
+	command="$base_compile $qsrcfile$pie_flag"
+      else
+	command="$base_compile $qsrcfile $pic_flag"
+      fi
+      if test "$compiler_c_o" = yes; then
+	func_append command " -o $obj"
+      fi
+
+      # Suppress compiler output if we already did a PIC compilation.
+      func_append command "$suppress_output"
+      func_show_eval_locale "$command" \
+        '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE'
+
+      if test "$need_locks" = warn &&
+	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
+	$ECHO "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$opt_dry_run || $RM $removelist
+	exit $EXIT_FAILURE
+      fi
+
+      # Just move the object if needed
+      if test -n "$output_obj" && test "X$output_obj" != "X$obj"; then
+	func_show_eval '$MV "$output_obj" "$obj"' \
+	  'error=$?; $opt_dry_run || $RM $removelist; exit $error'
+      fi
+    fi
+
+    $opt_dry_run || {
+      func_write_libtool_object "$libobj" "$objdir/$objname" "$objname"
+
+      # Unlock the critical section if it was locked
+      if test "$need_locks" != no; then
+	removelist=$lockfile
+        $RM "$lockfile"
+      fi
+    }
+
+    exit $EXIT_SUCCESS
+}
+
+$opt_help || {
+  test "$opt_mode" = compile && func_mode_compile ${1+"$@"}
+}
+
+func_mode_help ()
+{
+    # We need to display help for each of the modes.
+    case $opt_mode in
+      "")
+        # Generic help is extracted from the usage comments
+        # at the start of this file.
+        func_help
+        ;;
+
+      clean)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+        ;;
+
+      compile)
+      $ECHO \
+"Usage: $progname [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
+
+Compile a source file into a libtool library object.
+
+This mode accepts the following additional options:
+
+  -o OUTPUT-FILE    set the output file name to OUTPUT-FILE
+  -no-suppress      do not suppress compiler output for multiple passes
+  -prefer-pic       try to build PIC objects only
+  -prefer-non-pic   try to build non-PIC objects only
+  -shared           do not build a \`.o' file suitable for static linking
+  -static           only build a \`.o' file suitable for static linking
+  -Wc,FLAG          pass FLAG directly to the compiler
+
+COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+from the given SOURCEFILE.
+
+The output file name is determined by removing the directory component from
+SOURCEFILE, then substituting the C source code suffix \`.c' with the
+library object suffix, \`.lo'."
+        ;;
+
+      execute)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=execute COMMAND [ARGS]...
+
+Automatically set library path, then run a program.
+
+This mode accepts the following additional options:
+
+  -dlopen FILE      add the directory containing FILE to the library path
+
+This mode sets the library path environment variable according to \`-dlopen'
+flags.
+
+If any of the ARGS are libtool executable wrappers, then they are translated
+into their corresponding uninstalled binary, and any of their required library
+directories are added to the library path.
+
+Then, COMMAND is executed, with ARGS as arguments."
+        ;;
+
+      finish)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=finish [LIBDIR]...
+
+Complete the installation of libtool libraries.
+
+Each LIBDIR is a directory that contains libtool libraries.
+
+The commands that this mode executes may require superuser privileges.  Use
+the \`--dry-run' option if you just want to see what would be executed."
+        ;;
+
+      install)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=install INSTALL-COMMAND...
+
+Install executables or libraries.
+
+INSTALL-COMMAND is the installation command.  The first component should be
+either the \`install' or \`cp' program.
+
+The following components of INSTALL-COMMAND are treated specially:
+
+  -inst-prefix-dir PREFIX-DIR  Use PREFIX-DIR as a staging area for installation
+
+The rest of the components are interpreted as arguments to that command (only
+BSD-compatible install options are recognized)."
+        ;;
+
+      link)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=link LINK-COMMAND...
+
+Link object files or libraries together to form another library, or to
+create an executable program.
+
+LINK-COMMAND is a command using the C compiler that you would use to create
+a program from several object files.
+
+The following components of LINK-COMMAND are treated specially:
+
+  -all-static       do not do any dynamic linking at all
+  -avoid-version    do not add a version suffix if possible
+  -bindir BINDIR    specify path to binaries directory (for systems where
+                    libraries must be found in the PATH setting at runtime)
+  -dlopen FILE      \`-dlpreopen' FILE if it cannot be dlopened at runtime
+  -dlpreopen FILE   link in FILE and add its symbols to lt_preloaded_symbols
+  -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+  -export-symbols SYMFILE
+                    try to export only the symbols listed in SYMFILE
+  -export-symbols-regex REGEX
+                    try to export only the symbols matching REGEX
+  -LLIBDIR          search LIBDIR for required installed libraries
+  -lNAME            OUTPUT-FILE requires the installed library libNAME
+  -module           build a library that can dlopened
+  -no-fast-install  disable the fast-install mode
+  -no-install       link a not-installable executable
+  -no-undefined     declare that a library does not refer to external symbols
+  -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
+  -objectlist FILE  Use a list of object files found in FILE to specify objects
+  -precious-files-regex REGEX
+                    don't remove output files matching REGEX
+  -release RELEASE  specify package release information
+  -rpath LIBDIR     the created library will eventually be installed in LIBDIR
+  -R[ ]LIBDIR       add LIBDIR to the runtime path of programs and libraries
+  -shared           only do dynamic linking of libtool libraries
+  -shrext SUFFIX    override the standard shared library file extension
+  -static           do not do any dynamic linking of uninstalled libtool libraries
+  -static-libtool-libs
+                    do not do any dynamic linking of libtool libraries
+  -version-info CURRENT[:REVISION[:AGE]]
+                    specify library version info [each variable defaults to 0]
+  -weak LIBNAME     declare that the target provides the LIBNAME interface
+  -Wc,FLAG
+  -Xcompiler FLAG   pass linker-specific FLAG directly to the compiler
+  -Wl,FLAG
+  -Xlinker FLAG     pass linker-specific FLAG directly to the linker
+  -XCClinker FLAG   pass link-specific FLAG to the compiler driver (CC)
+
+All other options (arguments beginning with \`-') are ignored.
+
+Every other argument is treated as a filename.  Files ending in \`.la' are
+treated as uninstalled libtool libraries, other files are standard or library
+object files.
+
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
+
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
+
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
+        ;;
+
+      uninstall)
+        $ECHO \
+"Usage: $progname [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
+
+Remove libraries from an installation directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, all the files associated with it are deleted.
+Otherwise, only FILE itself is deleted using RM."
+        ;;
+
+      *)
+        func_fatal_help "invalid operation mode \`$opt_mode'"
+        ;;
+    esac
+
+    echo
+    $ECHO "Try \`$progname --help' for more information about other modes."
+}
+
+# Now that we've collected a possible --mode arg, show help if necessary
+if $opt_help; then
+  if test "$opt_help" = :; then
+    func_mode_help
+  else
+    {
+      func_help noexit
+      for opt_mode in compile link execute install finish uninstall clean; do
+	func_mode_help
+      done
+    } | sed -n '1p; 2,$s/^Usage:/  or: /p'
+    {
+      func_help noexit
+      for opt_mode in compile link execute install finish uninstall clean; do
+	echo
+	func_mode_help
+      done
+    } |
+    sed '1d
+      /^When reporting/,/^Report/{
+	H
+	d
+      }
+      $x
+      /information about other modes/d
+      /more detailed .*MODE/d
+      s/^Usage:.*--mode=\([^ ]*\) .*/Description of \1 mode:/'
+  fi
+  exit $?
+fi
+
+
+# func_mode_execute arg...
+func_mode_execute ()
+{
+    $opt_debug
+    # The first argument is the command name.
+    cmd="$nonopt"
+    test -z "$cmd" && \
+      func_fatal_help "you must specify a COMMAND"
+
+    # Handle -dlopen flags immediately.
+    for file in $opt_dlopen; do
+      test -f "$file" \
+	|| func_fatal_help "\`$file' is not a file"
+
+      dir=
+      case $file in
+      *.la)
+	func_resolve_sysroot "$file"
+	file=$func_resolve_sysroot_result
+
+	# Check to see that this really is a libtool archive.
+	func_lalib_unsafe_p "$file" \
+	  || func_fatal_help "\`$lib' is not a valid libtool archive"
+
+	# Read the libtool library.
+	dlname=
+	library_names=
+	func_source "$file"
+
+	# Skip this library if it cannot be dlopened.
+	if test -z "$dlname"; then
+	  # Warn if it was a shared library.
+	  test -n "$library_names" && \
+	    func_warning "\`$file' was not linked with \`-export-dynamic'"
+	  continue
+	fi
+
+	func_dirname "$file" "" "."
+	dir="$func_dirname_result"
+
+	if test -f "$dir/$objdir/$dlname"; then
+	  func_append dir "/$objdir"
+	else
+	  if test ! -f "$dir/$dlname"; then
+	    func_fatal_error "cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'"
+	  fi
+	fi
+	;;
+
+      *.lo)
+	# Just add the directory containing the .lo file.
+	func_dirname "$file" "" "."
+	dir="$func_dirname_result"
+	;;
+
+      *)
+	func_warning "\`-dlopen' is ignored for non-libtool libraries and objects"
+	continue
+	;;
+      esac
+
+      # Get the absolute pathname.
+      absdir=`cd "$dir" && pwd`
+      test -n "$absdir" && dir="$absdir"
+
+      # Now add the directory to shlibpath_var.
+      if eval "test -z \"\$$shlibpath_var\""; then
+	eval "$shlibpath_var=\"\$dir\""
+      else
+	eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
+      fi
+    done
+
+    # This variable tells wrapper scripts just to set shlibpath_var
+    # rather than running their programs.
+    libtool_execute_magic="$magic"
+
+    # Check if any of the arguments is a wrapper script.
+    args=
+    for file
+    do
+      case $file in
+      -* | *.la | *.lo ) ;;
+      *)
+	# Do a test to see if this is really a libtool program.
+	if func_ltwrapper_script_p "$file"; then
+	  func_source "$file"
+	  # Transform arg to wrapped name.
+	  file="$progdir/$program"
+	elif func_ltwrapper_executable_p "$file"; then
+	  func_ltwrapper_scriptname "$file"
+	  func_source "$func_ltwrapper_scriptname_result"
+	  # Transform arg to wrapped name.
+	  file="$progdir/$program"
+	fi
+	;;
+      esac
+      # Quote arguments (to preserve shell metacharacters).
+      func_append_quoted args "$file"
+    done
+
+    if test "X$opt_dry_run" = Xfalse; then
+      if test -n "$shlibpath_var"; then
+	# Export the shlibpath_var.
+	eval "export $shlibpath_var"
+      fi
+
+      # Restore saved environment variables
+      for lt_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
+      do
+	eval "if test \"\${save_$lt_var+set}\" = set; then
+                $lt_var=\$save_$lt_var; export $lt_var
+	      else
+		$lt_unset $lt_var
+	      fi"
+      done
+
+      # Now prepare to actually exec the command.
+      exec_cmd="\$cmd$args"
+    else
+      # Display what would be done.
+      if test -n "$shlibpath_var"; then
+	eval "\$ECHO \"\$shlibpath_var=\$$shlibpath_var\""
+	echo "export $shlibpath_var"
+      fi
+      $ECHO "$cmd$args"
+      exit $EXIT_SUCCESS
+    fi
+}
+
+test "$opt_mode" = execute && func_mode_execute ${1+"$@"}
+
+
+# func_mode_finish arg...
+func_mode_finish ()
+{
+    $opt_debug
+    libs=
+    libdirs=
+    admincmds=
+
+    for opt in "$nonopt" ${1+"$@"}
+    do
+      if test -d "$opt"; then
+	func_append libdirs " $opt"
+
+      elif test -f "$opt"; then
+	if func_lalib_unsafe_p "$opt"; then
+	  func_append libs " $opt"
+	else
+	  func_warning "\`$opt' is not a valid libtool archive"
+	fi
+
+      else
+	func_fatal_error "invalid argument \`$opt'"
+      fi
+    done
+
+    if test -n "$libs"; then
+      if test -n "$lt_sysroot"; then
+        sysroot_regex=`$ECHO "$lt_sysroot" | $SED "$sed_make_literal_regex"`
+        sysroot_cmd="s/\([ ']\)$sysroot_regex/\1/g;"
+      else
+        sysroot_cmd=
+      fi
+
+      # Remove sysroot references
+      if $opt_dry_run; then
+        for lib in $libs; do
+          echo "removing references to $lt_sysroot and \`=' prefixes from $lib"
+        done
+      else
+        tmpdir=`func_mktempdir`
+        for lib in $libs; do
+	  sed -e "${sysroot_cmd} s/\([ ']-[LR]\)=/\1/g; s/\([ ']\)=/\1/g" $lib \
+	    > $tmpdir/tmp-la
+	  mv -f $tmpdir/tmp-la $lib
+	done
+        ${RM}r "$tmpdir"
+      fi
+    fi
+
+    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+      for libdir in $libdirs; do
+	if test -n "$finish_cmds"; then
+	  # Do each command in the finish commands.
+	  func_execute_cmds "$finish_cmds" 'admincmds="$admincmds
+'"$cmd"'"'
+	fi
+	if test -n "$finish_eval"; then
+	  # Do the single finish_eval.
+	  eval cmds=\"$finish_eval\"
+	  $opt_dry_run || eval "$cmds" || func_append admincmds "
+       $cmds"
+	fi
+      done
+    fi
+
+    # Exit here if they wanted silent mode.
+    $opt_silent && exit $EXIT_SUCCESS
+
+    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+      echo "----------------------------------------------------------------------"
+      echo "Libraries have been installed in:"
+      for libdir in $libdirs; do
+	$ECHO "   $libdir"
+      done
+      echo
+      echo "If you ever happen to want to link against installed libraries"
+      echo "in a given directory, LIBDIR, you must either use libtool, and"
+      echo "specify the full pathname of the library, or use the \`-LLIBDIR'"
+      echo "flag during linking and do at least one of the following:"
+      if test -n "$shlibpath_var"; then
+	echo "   - add LIBDIR to the \`$shlibpath_var' environment variable"
+	echo "     during execution"
+      fi
+      if test -n "$runpath_var"; then
+	echo "   - add LIBDIR to the \`$runpath_var' environment variable"
+	echo "     during linking"
+      fi
+      if test -n "$hardcode_libdir_flag_spec"; then
+	libdir=LIBDIR
+	eval flag=\"$hardcode_libdir_flag_spec\"
+
+	$ECHO "   - use the \`$flag' linker flag"
+      fi
+      if test -n "$admincmds"; then
+	$ECHO "   - have your system administrator run these commands:$admincmds"
+      fi
+      if test -f /etc/ld.so.conf; then
+	echo "   - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+      fi
+      echo
+
+      echo "See any operating system documentation about shared libraries for"
+      case $host in
+	solaris2.[6789]|solaris2.1[0-9])
+	  echo "more information, such as the ld(1), crle(1) and ld.so(8) manual"
+	  echo "pages."
+	  ;;
+	*)
+	  echo "more information, such as the ld(1) and ld.so(8) manual pages."
+	  ;;
+      esac
+      echo "----------------------------------------------------------------------"
+    fi
+    exit $EXIT_SUCCESS
+}
+
+test "$opt_mode" = finish && func_mode_finish ${1+"$@"}
+
+
+# func_mode_install arg...
+func_mode_install ()
+{
+    $opt_debug
+    # There may be an optional sh(1) argument at the beginning of
+    # install_prog (especially on Windows NT).
+    if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh ||
+       # Allow the use of GNU shtool's install command.
+       case $nonopt in *shtool*) :;; *) false;; esac; then
+      # Aesthetically quote it.
+      func_quote_for_eval "$nonopt"
+      install_prog="$func_quote_for_eval_result "
+      arg=$1
+      shift
+    else
+      install_prog=
+      arg=$nonopt
+    fi
+
+    # The real first argument should be the name of the installation program.
+    # Aesthetically quote it.
+    func_quote_for_eval "$arg"
+    func_append install_prog "$func_quote_for_eval_result"
+    install_shared_prog=$install_prog
+    case " $install_prog " in
+      *[\\\ /]cp\ *) install_cp=: ;;
+      *) install_cp=false ;;
+    esac
+
+    # We need to accept at least all the BSD install flags.
+    dest=
+    files=
+    opts=
+    prev=
+    install_type=
+    isdir=no
+    stripme=
+    no_mode=:
+    for arg
+    do
+      arg2=
+      if test -n "$dest"; then
+	func_append files " $dest"
+	dest=$arg
+	continue
+      fi
+
+      case $arg in
+      -d) isdir=yes ;;
+      -f)
+	if $install_cp; then :; else
+	  prev=$arg
+	fi
+	;;
+      -g | -m | -o)
+	prev=$arg
+	;;
+      -s)
+	stripme=" -s"
+	continue
+	;;
+      -*)
+	;;
+      *)
+	# If the previous option needed an argument, then skip it.
+	if test -n "$prev"; then
+	  if test "x$prev" = x-m && test -n "$install_override_mode"; then
+	    arg2=$install_override_mode
+	    no_mode=false
+	  fi
+	  prev=
+	else
+	  dest=$arg
+	  continue
+	fi
+	;;
+      esac
+
+      # Aesthetically quote the argument.
+      func_quote_for_eval "$arg"
+      func_append install_prog " $func_quote_for_eval_result"
+      if test -n "$arg2"; then
+	func_quote_for_eval "$arg2"
+      fi
+      func_append install_shared_prog " $func_quote_for_eval_result"
+    done
+
+    test -z "$install_prog" && \
+      func_fatal_help "you must specify an install program"
+
+    test -n "$prev" && \
+      func_fatal_help "the \`$prev' option requires an argument"
+
+    if test -n "$install_override_mode" && $no_mode; then
+      if $install_cp; then :; else
+	func_quote_for_eval "$install_override_mode"
+	func_append install_shared_prog " -m $func_quote_for_eval_result"
+      fi
+    fi
+
+    if test -z "$files"; then
+      if test -z "$dest"; then
+	func_fatal_help "no file or destination specified"
+      else
+	func_fatal_help "you must specify a destination"
+      fi
+    fi
+
+    # Strip any trailing slash from the destination.
+    func_stripname '' '/' "$dest"
+    dest=$func_stripname_result
+
+    # Check to see that the destination is a directory.
+    test -d "$dest" && isdir=yes
+    if test "$isdir" = yes; then
+      destdir="$dest"
+      destname=
+    else
+      func_dirname_and_basename "$dest" "" "."
+      destdir="$func_dirname_result"
+      destname="$func_basename_result"
+
+      # Not a directory, so check to see that there is only one file specified.
+      set dummy $files; shift
+      test "$#" -gt 1 && \
+	func_fatal_help "\`$dest' is not a directory"
+    fi
+    case $destdir in
+    [\\/]* | [A-Za-z]:[\\/]*) ;;
+    *)
+      for file in $files; do
+	case $file in
+	*.lo) ;;
+	*)
+	  func_fatal_help "\`$destdir' must be an absolute directory name"
+	  ;;
+	esac
+      done
+      ;;
+    esac
+
+    # This variable tells wrapper scripts just to set variables rather
+    # than running their programs.
+    libtool_install_magic="$magic"
+
+    staticlibs=
+    future_libdirs=
+    current_libdirs=
+    for file in $files; do
+
+      # Do each installation.
+      case $file in
+      *.$libext)
+	# Do the static libraries later.
+	func_append staticlibs " $file"
+	;;
+
+      *.la)
+	func_resolve_sysroot "$file"
+	file=$func_resolve_sysroot_result
+
+	# Check to see that this really is a libtool archive.
+	func_lalib_unsafe_p "$file" \
+	  || func_fatal_help "\`$file' is not a valid libtool archive"
+
+	library_names=
+	old_library=
+	relink_command=
+	func_source "$file"
+
+	# Add the libdir to current_libdirs if it is the destination.
+	if test "X$destdir" = "X$libdir"; then
+	  case "$current_libdirs " in
+	  *" $libdir "*) ;;
+	  *) func_append current_libdirs " $libdir" ;;
+	  esac
+	else
+	  # Note the libdir as a future libdir.
+	  case "$future_libdirs " in
+	  *" $libdir "*) ;;
+	  *) func_append future_libdirs " $libdir" ;;
+	  esac
+	fi
+
+	func_dirname "$file" "/" ""
+	dir="$func_dirname_result"
+	func_append dir "$objdir"
+
+	if test -n "$relink_command"; then
+	  # Determine the prefix the user has applied to our future dir.
+	  inst_prefix_dir=`$ECHO "$destdir" | $SED -e "s%$libdir\$%%"`
+
+	  # Don't allow the user to place us outside of our expected
+	  # location b/c this prevents finding dependent libraries that
+	  # are installed to the same prefix.
+	  # At present, this check doesn't affect windows .dll's that
+	  # are installed into $libdir/../bin (currently, that works fine)
+	  # but it's something to keep an eye on.
+	  test "$inst_prefix_dir" = "$destdir" && \
+	    func_fatal_error "error: cannot install \`$file' to a directory not ending in $libdir"
+
+	  if test -n "$inst_prefix_dir"; then
+	    # Stick the inst_prefix_dir data into the link command.
+	    relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%-inst-prefix-dir $inst_prefix_dir%"`
+	  else
+	    relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%%"`
+	  fi
+
+	  func_warning "relinking \`$file'"
+	  func_show_eval "$relink_command" \
+	    'func_fatal_error "error: relink \`$file'\'' with the above command before installing it"'
+	fi
+
+	# See the names of the shared library.
+	set dummy $library_names; shift
+	if test -n "$1"; then
+	  realname="$1"
+	  shift
+
+	  srcname="$realname"
+	  test -n "$relink_command" && srcname="$realname"T
+
+	  # Install the shared library and build the symlinks.
+	  func_show_eval "$install_shared_prog $dir/$srcname $destdir/$realname" \
+	      'exit $?'
+	  tstripme="$stripme"
+	  case $host_os in
+	  cygwin* | mingw* | pw32* | cegcc*)
+	    case $realname in
+	    *.dll.a)
+	      tstripme=""
+	      ;;
+	    esac
+	    ;;
+	  esac
+	  if test -n "$tstripme" && test -n "$striplib"; then
+	    func_show_eval "$striplib $destdir/$realname" 'exit $?'
+	  fi
+
+	  if test "$#" -gt 0; then
+	    # Delete the old symlinks, and create new ones.
+	    # Try `ln -sf' first, because the `ln' binary might depend on
+	    # the symlink we replace!  Solaris /bin/ln does not understand -f,
+	    # so we also need to try rm && ln -s.
+	    for linkname
+	    do
+	      test "$linkname" != "$realname" \
+		&& func_show_eval "(cd $destdir && { $LN_S -f $realname $linkname || { $RM $linkname && $LN_S $realname $linkname; }; })"
+	    done
+	  fi
+
+	  # Do each command in the postinstall commands.
+	  lib="$destdir/$realname"
+	  func_execute_cmds "$postinstall_cmds" 'exit $?'
+	fi
+
+	# Install the pseudo-library for information purposes.
+	func_basename "$file"
+	name="$func_basename_result"
+	instname="$dir/$name"i
+	func_show_eval "$install_prog $instname $destdir/$name" 'exit $?'
+
+	# Maybe install the static library, too.
+	test -n "$old_library" && func_append staticlibs " $dir/$old_library"
+	;;
+
+      *.lo)
+	# Install (i.e. copy) a libtool object.
+
+	# Figure out destination file name, if it wasn't already specified.
+	if test -n "$destname"; then
+	  destfile="$destdir/$destname"
+	else
+	  func_basename "$file"
+	  destfile="$func_basename_result"
+	  destfile="$destdir/$destfile"
+	fi
+
+	# Deduce the name of the destination old-style object file.
+	case $destfile in
+	*.lo)
+	  func_lo2o "$destfile"
+	  staticdest=$func_lo2o_result
+	  ;;
+	*.$objext)
+	  staticdest="$destfile"
+	  destfile=
+	  ;;
+	*)
+	  func_fatal_help "cannot copy a libtool object to \`$destfile'"
+	  ;;
+	esac
+
+	# Install the libtool object if requested.
+	test -n "$destfile" && \
+	  func_show_eval "$install_prog $file $destfile" 'exit $?'
+
+	# Install the old object if enabled.
+	if test "$build_old_libs" = yes; then
+	  # Deduce the name of the old-style object file.
+	  func_lo2o "$file"
+	  staticobj=$func_lo2o_result
+	  func_show_eval "$install_prog \$staticobj \$staticdest" 'exit $?'
+	fi
+	exit $EXIT_SUCCESS
+	;;
+
+      *)
+	# Figure out destination file name, if it wasn't already specified.
+	if test -n "$destname"; then
+	  destfile="$destdir/$destname"
+	else
+	  func_basename "$file"
+	  destfile="$func_basename_result"
+	  destfile="$destdir/$destfile"
+	fi
+
+	# If the file is missing, and there is a .exe on the end, strip it
+	# because it is most likely a libtool script we actually want to
+	# install
+	stripped_ext=""
+	case $file in
+	  *.exe)
+	    if test ! -f "$file"; then
+	      func_stripname '' '.exe' "$file"
+	      file=$func_stripname_result
+	      stripped_ext=".exe"
+	    fi
+	    ;;
+	esac
+
+	# Do a test to see if this is really a libtool program.
+	case $host in
+	*cygwin* | *mingw*)
+	    if func_ltwrapper_executable_p "$file"; then
+	      func_ltwrapper_scriptname "$file"
+	      wrapper=$func_ltwrapper_scriptname_result
+	    else
+	      func_stripname '' '.exe' "$file"
+	      wrapper=$func_stripname_result
+	    fi
+	    ;;
+	*)
+	    wrapper=$file
+	    ;;
+	esac
+	if func_ltwrapper_script_p "$wrapper"; then
+	  notinst_deplibs=
+	  relink_command=
+
+	  func_source "$wrapper"
+
+	  # Check the variables that should have been set.
+	  test -z "$generated_by_libtool_version" && \
+	    func_fatal_error "invalid libtool wrapper script \`$wrapper'"
+
+	  finalize=yes
+	  for lib in $notinst_deplibs; do
+	    # Check to see that each library is installed.
+	    libdir=
+	    if test -f "$lib"; then
+	      func_source "$lib"
+	    fi
+	    libfile="$libdir/"`$ECHO "$lib" | $SED 's%^.*/%%g'` ### testsuite: skip nested quoting test
+	    if test -n "$libdir" && test ! -f "$libfile"; then
+	      func_warning "\`$lib' has not been installed in \`$libdir'"
+	      finalize=no
+	    fi
+	  done
+
+	  relink_command=
+	  func_source "$wrapper"
+
+	  outputname=
+	  if test "$fast_install" = no && test -n "$relink_command"; then
+	    $opt_dry_run || {
+	      if test "$finalize" = yes; then
+	        tmpdir=`func_mktempdir`
+		func_basename "$file$stripped_ext"
+		file="$func_basename_result"
+	        outputname="$tmpdir/$file"
+	        # Replace the output file specification.
+	        relink_command=`$ECHO "$relink_command" | $SED 's%@OUTPUT@%'"$outputname"'%g'`
+
+	        $opt_silent || {
+	          func_quote_for_expand "$relink_command"
+		  eval "func_echo $func_quote_for_expand_result"
+	        }
+	        if eval "$relink_command"; then :
+	          else
+		  func_error "error: relink \`$file' with the above command before installing it"
+		  $opt_dry_run || ${RM}r "$tmpdir"
+		  continue
+	        fi
+	        file="$outputname"
+	      else
+	        func_warning "cannot relink \`$file'"
+	      fi
+	    }
+	  else
+	    # Install the binary that we compiled earlier.
+	    file=`$ECHO "$file$stripped_ext" | $SED "s%\([^/]*\)$%$objdir/\1%"`
+	  fi
+	fi
+
+	# remove .exe since cygwin /usr/bin/install will append another
+	# one anyway
+	case $install_prog,$host in
+	*/usr/bin/install*,*cygwin*)
+	  case $file:$destfile in
+	  *.exe:*.exe)
+	    # this is ok
+	    ;;
+	  *.exe:*)
+	    destfile=$destfile.exe
+	    ;;
+	  *:*.exe)
+	    func_stripname '' '.exe' "$destfile"
+	    destfile=$func_stripname_result
+	    ;;
+	  esac
+	  ;;
+	esac
+	func_show_eval "$install_prog\$stripme \$file \$destfile" 'exit $?'
+	$opt_dry_run || if test -n "$outputname"; then
+	  ${RM}r "$tmpdir"
+	fi
+	;;
+      esac
+    done
+
+    for file in $staticlibs; do
+      func_basename "$file"
+      name="$func_basename_result"
+
+      # Set up the ranlib parameters.
+      oldlib="$destdir/$name"
+      func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
+      tool_oldlib=$func_to_tool_file_result
+
+      func_show_eval "$install_prog \$file \$oldlib" 'exit $?'
+
+      if test -n "$stripme" && test -n "$old_striplib"; then
+	func_show_eval "$old_striplib $tool_oldlib" 'exit $?'
+      fi
+
+      # Do each command in the postinstall commands.
+      func_execute_cmds "$old_postinstall_cmds" 'exit $?'
+    done
+
+    test -n "$future_libdirs" && \
+      func_warning "remember to run \`$progname --finish$future_libdirs'"
+
+    if test -n "$current_libdirs"; then
+      # Maybe just do a dry run.
+      $opt_dry_run && current_libdirs=" -n$current_libdirs"
+      exec_cmd='$SHELL $progpath $preserve_args --finish$current_libdirs'
+    else
+      exit $EXIT_SUCCESS
+    fi
+}
+
+test "$opt_mode" = install && func_mode_install ${1+"$@"}
+
+
+# func_generate_dlsyms outputname originator pic_p
+# Extract symbols from dlprefiles and create ${outputname}S.o with
+# a dlpreopen symbol table.
+func_generate_dlsyms ()
+{
+    $opt_debug
+    my_outputname="$1"
+    my_originator="$2"
+    my_pic_p="${3-no}"
+    my_prefix=`$ECHO "$my_originator" | sed 's%[^a-zA-Z0-9]%_%g'`
+    my_dlsyms=
+
+    if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+      if test -n "$NM" && test -n "$global_symbol_pipe"; then
+	my_dlsyms="${my_outputname}S.c"
+      else
+	func_error "not configured to extract global symbols from dlpreopened files"
+      fi
+    fi
+
+    if test -n "$my_dlsyms"; then
+      case $my_dlsyms in
+      "") ;;
+      *.c)
+	# Discover the nlist of each of the dlfiles.
+	nlist="$output_objdir/${my_outputname}.nm"
+
+	func_show_eval "$RM $nlist ${nlist}S ${nlist}T"
+
+	# Parse the name list into a source file.
+	func_verbose "creating $output_objdir/$my_dlsyms"
+
+	$opt_dry_run || $ECHO > "$output_objdir/$my_dlsyms" "\
+/* $my_dlsyms - symbol resolution table for \`$my_outputname' dlsym emulation. */
+/* Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION */
+
+#ifdef __cplusplus
+extern \"C\" {
+#endif
+
+#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4))
+#pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
+#endif
+
+/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
+/* DATA imports from DLLs on WIN32 con't be const, because runtime
+   relocations are performed -- see ld's documentation on pseudo-relocs.  */
+# define LT_DLSYM_CONST
+#elif defined(__osf__)
+/* This system does not cope well with relocations in const data.  */
+# define LT_DLSYM_CONST
+#else
+# define LT_DLSYM_CONST const
+#endif
+
+/* External symbol declarations for the compiler. */\
+"
+
+	if test "$dlself" = yes; then
+	  func_verbose "generating symbol list for \`$output'"
+
+	  $opt_dry_run || echo ': @PROGRAM@ ' > "$nlist"
+
+	  # Add our own program objects to the symbol list.
+	  progfiles=`$ECHO "$objs$old_deplibs" | $SP2NL | $SED "$lo2o" | $NL2SP`
+	  for progfile in $progfiles; do
+	    func_to_tool_file "$progfile" func_convert_file_msys_to_w32
+	    func_verbose "extracting global C symbols from \`$func_to_tool_file_result'"
+	    $opt_dry_run || eval "$NM $func_to_tool_file_result | $global_symbol_pipe >> '$nlist'"
+	  done
+
+	  if test -n "$exclude_expsyms"; then
+	    $opt_dry_run || {
+	      eval '$EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+	      eval '$MV "$nlist"T "$nlist"'
+	    }
+	  fi
+
+	  if test -n "$export_symbols_regex"; then
+	    $opt_dry_run || {
+	      eval '$EGREP -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+	      eval '$MV "$nlist"T "$nlist"'
+	    }
+	  fi
+
+	  # Prepare the list of exported symbols
+	  if test -z "$export_symbols"; then
+	    export_symbols="$output_objdir/$outputname.exp"
+	    $opt_dry_run || {
+	      $RM $export_symbols
+	      eval "${SED} -n -e '/^: @PROGRAM@ $/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+	      case $host in
+	      *cygwin* | *mingw* | *cegcc* )
+                eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
+                eval 'cat "$export_symbols" >> "$output_objdir/$outputname.def"'
+	        ;;
+	      esac
+	    }
+	  else
+	    $opt_dry_run || {
+	      eval "${SED} -e 's/\([].[*^$]\)/\\\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$outputname.exp"'
+	      eval '$GREP -f "$output_objdir/$outputname.exp" < "$nlist" > "$nlist"T'
+	      eval '$MV "$nlist"T "$nlist"'
+	      case $host in
+	        *cygwin* | *mingw* | *cegcc* )
+	          eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
+	          eval 'cat "$nlist" >> "$output_objdir/$outputname.def"'
+	          ;;
+	      esac
+	    }
+	  fi
+	fi
+
+	for dlprefile in $dlprefiles; do
+	  func_verbose "extracting global C symbols from \`$dlprefile'"
+	  func_basename "$dlprefile"
+	  name="$func_basename_result"
+          case $host in
+	    *cygwin* | *mingw* | *cegcc* )
+	      # if an import library, we need to obtain dlname
+	      if func_win32_import_lib_p "$dlprefile"; then
+	        func_tr_sh "$dlprefile"
+	        eval "curr_lafile=\$libfile_$func_tr_sh_result"
+	        dlprefile_dlbasename=""
+	        if test -n "$curr_lafile" && func_lalib_p "$curr_lafile"; then
+	          # Use subshell, to avoid clobbering current variable values
+	          dlprefile_dlname=`source "$curr_lafile" && echo "$dlname"`
+	          if test -n "$dlprefile_dlname" ; then
+	            func_basename "$dlprefile_dlname"
+	            dlprefile_dlbasename="$func_basename_result"
+	          else
+	            # no lafile. user explicitly requested -dlpreopen <import library>.
+	            $sharedlib_from_linklib_cmd "$dlprefile"
+	            dlprefile_dlbasename=$sharedlib_from_linklib_result
+	          fi
+	        fi
+	        $opt_dry_run || {
+	          if test -n "$dlprefile_dlbasename" ; then
+	            eval '$ECHO ": $dlprefile_dlbasename" >> "$nlist"'
+	          else
+	            func_warning "Could not compute DLL name from $name"
+	            eval '$ECHO ": $name " >> "$nlist"'
+	          fi
+	          func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
+	          eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe |
+	            $SED -e '/I __imp/d' -e 's/I __nm_/D /;s/_nm__//' >> '$nlist'"
+	        }
+	      else # not an import lib
+	        $opt_dry_run || {
+	          eval '$ECHO ": $name " >> "$nlist"'
+	          func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
+	          eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
+	        }
+	      fi
+	    ;;
+	    *)
+	      $opt_dry_run || {
+	        eval '$ECHO ": $name " >> "$nlist"'
+	        func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
+	        eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
+	      }
+	    ;;
+          esac
+	done
+
+	$opt_dry_run || {
+	  # Make sure we have at least an empty file.
+	  test -f "$nlist" || : > "$nlist"
+
+	  if test -n "$exclude_expsyms"; then
+	    $EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+	    $MV "$nlist"T "$nlist"
+	  fi
+
+	  # Try sorting and uniquifying the output.
+	  if $GREP -v "^: " < "$nlist" |
+	      if sort -k 3 </dev/null >/dev/null 2>&1; then
+		sort -k 3
+	      else
+		sort +2
+	      fi |
+	      uniq > "$nlist"S; then
+	    :
+	  else
+	    $GREP -v "^: " < "$nlist" > "$nlist"S
+	  fi
+
+	  if test -f "$nlist"S; then
+	    eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$my_dlsyms"'
+	  else
+	    echo '/* NONE */' >> "$output_objdir/$my_dlsyms"
+	  fi
+
+	  echo >> "$output_objdir/$my_dlsyms" "\
+
+/* The mapping between symbol names and symbols.  */
+typedef struct {
+  const char *name;
+  void *address;
+} lt_dlsymlist;
+extern LT_DLSYM_CONST lt_dlsymlist
+lt_${my_prefix}_LTX_preloaded_symbols[];
+LT_DLSYM_CONST lt_dlsymlist
+lt_${my_prefix}_LTX_preloaded_symbols[] =
+{\
+  { \"$my_originator\", (void *) 0 },"
+
+	  case $need_lib_prefix in
+	  no)
+	    eval "$global_symbol_to_c_name_address" < "$nlist" >> "$output_objdir/$my_dlsyms"
+	    ;;
+	  *)
+	    eval "$global_symbol_to_c_name_address_lib_prefix" < "$nlist" >> "$output_objdir/$my_dlsyms"
+	    ;;
+	  esac
+	  echo >> "$output_objdir/$my_dlsyms" "\
+  {0, (void *) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+  return lt_${my_prefix}_LTX_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif\
+"
+	} # !$opt_dry_run
+
+	pic_flag_for_symtable=
+	case "$compile_command " in
+	*" -static "*) ;;
+	*)
+	  case $host in
+	  # compiling the symbol table file with pic_flag works around
+	  # a FreeBSD bug that causes programs to crash when -lm is
+	  # linked before any other PIC object.  But we must not use
+	  # pic_flag when linking with -static.  The problem exists in
+	  # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+	  *-*-freebsd2.*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+	    pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND" ;;
+	  *-*-hpux*)
+	    pic_flag_for_symtable=" $pic_flag"  ;;
+	  *)
+	    if test "X$my_pic_p" != Xno; then
+	      pic_flag_for_symtable=" $pic_flag"
+	    fi
+	    ;;
+	  esac
+	  ;;
+	esac
+	symtab_cflags=
+	for arg in $LTCFLAGS; do
+	  case $arg in
+	  -pie | -fpie | -fPIE) ;;
+	  *) func_append symtab_cflags " $arg" ;;
+	  esac
+	done
+
+	# Now compile the dynamic symbol file.
+	func_show_eval '(cd $output_objdir && $LTCC$symtab_cflags -c$no_builtin_flag$pic_flag_for_symtable "$my_dlsyms")' 'exit $?'
+
+	# Clean up the generated files.
+	func_show_eval '$RM "$output_objdir/$my_dlsyms" "$nlist" "${nlist}S" "${nlist}T"'
+
+	# Transform the symbol file into the correct name.
+	symfileobj="$output_objdir/${my_outputname}S.$objext"
+	case $host in
+	*cygwin* | *mingw* | *cegcc* )
+	  if test -f "$output_objdir/$my_outputname.def"; then
+	    compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
+	    finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
+	  else
+	    compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
+	    finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
+	  fi
+	  ;;
+	*)
+	  compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
+	  finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
+	  ;;
+	esac
+	;;
+      *)
+	func_fatal_error "unknown suffix for \`$my_dlsyms'"
+	;;
+      esac
+    else
+      # We keep going just in case the user didn't refer to
+      # lt_preloaded_symbols.  The linker will fail if global_symbol_pipe
+      # really was required.
+
+      # Nullify the symbol file.
+      compile_command=`$ECHO "$compile_command" | $SED "s% @SYMFILE@%%"`
+      finalize_command=`$ECHO "$finalize_command" | $SED "s% @SYMFILE@%%"`
+    fi
+}
+
+# func_win32_libid arg
+# return the library type of file 'arg'
+#
+# Need a lot of goo to handle *both* DLLs and import libs
+# Has to be a shell function in order to 'eat' the argument
+# that is supplied when $file_magic_command is called.
+# Despite the name, also deal with 64 bit binaries.
+func_win32_libid ()
+{
+  $opt_debug
+  win32_libid_type="unknown"
+  win32_fileres=`file -L $1 2>/dev/null`
+  case $win32_fileres in
+  *ar\ archive\ import\ library*) # definitely import
+    win32_libid_type="x86 archive import"
+    ;;
+  *ar\ archive*) # could be an import, or static
+    # Keep the egrep pattern in sync with the one in _LT_CHECK_MAGIC_METHOD.
+    if eval $OBJDUMP -f $1 | $SED -e '10q' 2>/dev/null |
+       $EGREP 'file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)' >/dev/null; then
+      func_to_tool_file "$1" func_convert_file_msys_to_w32
+      win32_nmres=`eval $NM -f posix -A \"$func_to_tool_file_result\" |
+	$SED -n -e '
+	    1,100{
+		/ I /{
+		    s,.*,import,
+		    p
+		    q
+		}
+	    }'`
+      case $win32_nmres in
+      import*)  win32_libid_type="x86 archive import";;
+      *)        win32_libid_type="x86 archive static";;
+      esac
+    fi
+    ;;
+  *DLL*)
+    win32_libid_type="x86 DLL"
+    ;;
+  *executable*) # but shell scripts are "executable" too...
+    case $win32_fileres in
+    *MS\ Windows\ PE\ Intel*)
+      win32_libid_type="x86 DLL"
+      ;;
+    esac
+    ;;
+  esac
+  $ECHO "$win32_libid_type"
+}
+
+# func_cygming_dll_for_implib ARG
+#
+# Platform-specific function to extract the
+# name of the DLL associated with the specified
+# import library ARG.
+# Invoked by eval'ing the libtool variable
+#    $sharedlib_from_linklib_cmd
+# Result is available in the variable
+#    $sharedlib_from_linklib_result
+func_cygming_dll_for_implib ()
+{
+  $opt_debug
+  sharedlib_from_linklib_result=`$DLLTOOL --identify-strict --identify "$1"`
+}
+
+# func_cygming_dll_for_implib_fallback_core SECTION_NAME LIBNAMEs
+#
+# The is the core of a fallback implementation of a
+# platform-specific function to extract the name of the
+# DLL associated with the specified import library LIBNAME.
+#
+# SECTION_NAME is either .idata$6 or .idata$7, depending
+# on the platform and compiler that created the implib.
+#
+# Echos the name of the DLL associated with the
+# specified import library.
+func_cygming_dll_for_implib_fallback_core ()
+{
+  $opt_debug
+  match_literal=`$ECHO "$1" | $SED "$sed_make_literal_regex"`
+  $OBJDUMP -s --section "$1" "$2" 2>/dev/null |
+    $SED '/^Contents of section '"$match_literal"':/{
+      # Place marker at beginning of archive member dllname section
+      s/.*/====MARK====/
+      p
+      d
+    }
+    # These lines can sometimes be longer than 43 characters, but
+    # are always uninteresting
+    /:[	 ]*file format pe[i]\{,1\}-/d
+    /^In archive [^:]*:/d
+    # Ensure marker is printed
+    /^====MARK====/p
+    # Remove all lines with less than 43 characters
+    /^.\{43\}/!d
+    # From remaining lines, remove first 43 characters
+    s/^.\{43\}//' |
+    $SED -n '
+      # Join marker and all lines until next marker into a single line
+      /^====MARK====/ b para
+      H
+      $ b para
+      b
+      :para
+      x
+      s/\n//g
+      # Remove the marker
+      s/^====MARK====//
+      # Remove trailing dots and whitespace
+      s/[\. \t]*$//
+      # Print
+      /./p' |
+    # we now have a list, one entry per line, of the stringified
+    # contents of the appropriate section of all members of the
+    # archive which possess that section. Heuristic: eliminate
+    # all those which have a first or second character that is
+    # a '.' (that is, objdump's representation of an unprintable
+    # character.) This should work for all archives with less than
+    # 0x302f exports -- but will fail for DLLs whose name actually
+    # begins with a literal '.' or a single character followed by
+    # a '.'.
+    #
+    # Of those that remain, print the first one.
+    $SED -e '/^\./d;/^.\./d;q'
+}
+
+# func_cygming_gnu_implib_p ARG
+# This predicate returns with zero status (TRUE) if
+# ARG is a GNU/binutils-style import library. Returns
+# with nonzero status (FALSE) otherwise.
+func_cygming_gnu_implib_p ()
+{
+  $opt_debug
+  func_to_tool_file "$1" func_convert_file_msys_to_w32
+  func_cygming_gnu_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $EGREP ' (_head_[A-Za-z0-9_]+_[ad]l*|[A-Za-z0-9_]+_[ad]l*_iname)$'`
+  test -n "$func_cygming_gnu_implib_tmp"
+}
+
+# func_cygming_ms_implib_p ARG
+# This predicate returns with zero status (TRUE) if
+# ARG is an MS-style import library. Returns
+# with nonzero status (FALSE) otherwise.
+func_cygming_ms_implib_p ()
+{
+  $opt_debug
+  func_to_tool_file "$1" func_convert_file_msys_to_w32
+  func_cygming_ms_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $GREP '_NULL_IMPORT_DESCRIPTOR'`
+  test -n "$func_cygming_ms_implib_tmp"
+}
+
+# func_cygming_dll_for_implib_fallback ARG
+# Platform-specific function to extract the
+# name of the DLL associated with the specified
+# import library ARG.
+#
+# This fallback implementation is for use when $DLLTOOL
+# does not support the --identify-strict option.
+# Invoked by eval'ing the libtool variable
+#    $sharedlib_from_linklib_cmd
+# Result is available in the variable
+#    $sharedlib_from_linklib_result
+func_cygming_dll_for_implib_fallback ()
+{
+  $opt_debug
+  if func_cygming_gnu_implib_p "$1" ; then
+    # binutils import library
+    sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$7' "$1"`
+  elif func_cygming_ms_implib_p "$1" ; then
+    # ms-generated import library
+    sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$6' "$1"`
+  else
+    # unknown
+    sharedlib_from_linklib_result=""
+  fi
+}
+
+
+# func_extract_an_archive dir oldlib
+func_extract_an_archive ()
+{
+    $opt_debug
+    f_ex_an_ar_dir="$1"; shift
+    f_ex_an_ar_oldlib="$1"
+    if test "$lock_old_archive_extraction" = yes; then
+      lockfile=$f_ex_an_ar_oldlib.lock
+      until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do
+	func_echo "Waiting for $lockfile to be removed"
+	sleep 2
+      done
+    fi
+    func_show_eval "(cd \$f_ex_an_ar_dir && $AR x \"\$f_ex_an_ar_oldlib\")" \
+		   'stat=$?; rm -f "$lockfile"; exit $stat'
+    if test "$lock_old_archive_extraction" = yes; then
+      $opt_dry_run || rm -f "$lockfile"
+    fi
+    if ($AR t "$f_ex_an_ar_oldlib" | sort | sort -uc >/dev/null 2>&1); then
+     :
+    else
+      func_fatal_error "object name conflicts in archive: $f_ex_an_ar_dir/$f_ex_an_ar_oldlib"
+    fi
+}
+
+
+# func_extract_archives gentop oldlib ...
+func_extract_archives ()
+{
+    $opt_debug
+    my_gentop="$1"; shift
+    my_oldlibs=${1+"$@"}
+    my_oldobjs=""
+    my_xlib=""
+    my_xabs=""
+    my_xdir=""
+
+    for my_xlib in $my_oldlibs; do
+      # Extract the objects.
+      case $my_xlib in
+	[\\/]* | [A-Za-z]:[\\/]*) my_xabs="$my_xlib" ;;
+	*) my_xabs=`pwd`"/$my_xlib" ;;
+      esac
+      func_basename "$my_xlib"
+      my_xlib="$func_basename_result"
+      my_xlib_u=$my_xlib
+      while :; do
+        case " $extracted_archives " in
+	*" $my_xlib_u "*)
+	  func_arith $extracted_serial + 1
+	  extracted_serial=$func_arith_result
+	  my_xlib_u=lt$extracted_serial-$my_xlib ;;
+	*) break ;;
+	esac
+      done
+      extracted_archives="$extracted_archives $my_xlib_u"
+      my_xdir="$my_gentop/$my_xlib_u"
+
+      func_mkdir_p "$my_xdir"
+
+      case $host in
+      *-darwin*)
+	func_verbose "Extracting $my_xabs"
+	# Do not bother doing anything if just a dry run
+	$opt_dry_run || {
+	  darwin_orig_dir=`pwd`
+	  cd $my_xdir || exit $?
+	  darwin_archive=$my_xabs
+	  darwin_curdir=`pwd`
+	  darwin_base_archive=`basename "$darwin_archive"`
+	  darwin_arches=`$LIPO -info "$darwin_archive" 2>/dev/null | $GREP Architectures 2>/dev/null || true`
+	  if test -n "$darwin_arches"; then
+	    darwin_arches=`$ECHO "$darwin_arches" | $SED -e 's/.*are://'`
+	    darwin_arch=
+	    func_verbose "$darwin_base_archive has multiple architectures $darwin_arches"
+	    for darwin_arch in  $darwin_arches ; do
+	      func_mkdir_p "unfat-$$/${darwin_base_archive}-${darwin_arch}"
+	      $LIPO -thin $darwin_arch -output "unfat-$$/${darwin_base_archive}-${darwin_arch}/${darwin_base_archive}" "${darwin_archive}"
+	      cd "unfat-$$/${darwin_base_archive}-${darwin_arch}"
+	      func_extract_an_archive "`pwd`" "${darwin_base_archive}"
+	      cd "$darwin_curdir"
+	      $RM "unfat-$$/${darwin_base_archive}-${darwin_arch}/${darwin_base_archive}"
+	    done # $darwin_arches
+            ## Okay now we've a bunch of thin objects, gotta fatten them up :)
+	    darwin_filelist=`find unfat-$$ -type f -name \*.o -print -o -name \*.lo -print | $SED -e "$basename" | sort -u`
+	    darwin_file=
+	    darwin_files=
+	    for darwin_file in $darwin_filelist; do
+	      darwin_files=`find unfat-$$ -name $darwin_file -print | sort | $NL2SP`
+	      $LIPO -create -output "$darwin_file" $darwin_files
+	    done # $darwin_filelist
+	    $RM -rf unfat-$$
+	    cd "$darwin_orig_dir"
+	  else
+	    cd $darwin_orig_dir
+	    func_extract_an_archive "$my_xdir" "$my_xabs"
+	  fi # $darwin_arches
+	} # !$opt_dry_run
+	;;
+      *)
+        func_extract_an_archive "$my_xdir" "$my_xabs"
+	;;
+      esac
+      my_oldobjs="$my_oldobjs "`find $my_xdir -name \*.$objext -print -o -name \*.lo -print | sort | $NL2SP`
+    done
+
+    func_extract_archives_result="$my_oldobjs"
+}
+
+
+# func_emit_wrapper [arg=no]
+#
+# Emit a libtool wrapper script on stdout.
+# Don't directly open a file because we may want to
+# incorporate the script contents within a cygwin/mingw
+# wrapper executable.  Must ONLY be called from within
+# func_mode_link because it depends on a number of variables
+# set therein.
+#
+# ARG is the value that the WRAPPER_SCRIPT_BELONGS_IN_OBJDIR
+# variable will take.  If 'yes', then the emitted script
+# will assume that the directory in which it is stored is
+# the $objdir directory.  This is a cygwin/mingw-specific
+# behavior.
+func_emit_wrapper ()
+{
+	func_emit_wrapper_arg1=${1-no}
+
+	$ECHO "\
+#! $SHELL
+
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+#
+# The $output program cannot be directly executed until all the libtool
+# libraries that it depends on are installed.
+#
+# This wrapper script should never be moved out of the build directory.
+# If it is, it will not operate correctly.
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+sed_quote_subst='$sed_quote_subst'
+
+# Be Bourne compatible
+if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on \${1+\"\$@\"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '\${1+\"\$@\"}'='\"\$@\"'
+  setopt NO_GLOB_SUBST
+else
+  case \`(set -o) 2>/dev/null\` in *posix*) set -o posix;; esac
+fi
+BIN_SH=xpg4; export BIN_SH # for Tru64
+DUALCASE=1; export DUALCASE # for MKS sh
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+relink_command=\"$relink_command\"
+
+# This environment variable determines our operation mode.
+if test \"\$libtool_install_magic\" = \"$magic\"; then
+  # install mode needs the following variables:
+  generated_by_libtool_version='$macro_version'
+  notinst_deplibs='$notinst_deplibs'
+else
+  # When we are sourced in execute mode, \$file and \$ECHO are already set.
+  if test \"\$libtool_execute_magic\" != \"$magic\"; then
+    file=\"\$0\""
+
+    qECHO=`$ECHO "$ECHO" | $SED "$sed_quote_subst"`
+    $ECHO "\
+
+# A function that is used when there is no print builtin or printf.
+func_fallback_echo ()
+{
+  eval 'cat <<_LTECHO_EOF
+\$1
+_LTECHO_EOF'
+}
+    ECHO=\"$qECHO\"
+  fi
+
+# Very basic option parsing. These options are (a) specific to
+# the libtool wrapper, (b) are identical between the wrapper
+# /script/ and the wrapper /executable/ which is used only on
+# windows platforms, and (c) all begin with the string "--lt-"
+# (application programs are unlikely to have options which match
+# this pattern).
+#
+# There are only two supported options: --lt-debug and
+# --lt-dump-script. There is, deliberately, no --lt-help.
+#
+# The first argument to this parsing function should be the
+# script's $0 value, followed by "$@".
+lt_option_debug=
+func_parse_lt_options ()
+{
+  lt_script_arg0=\$0
+  shift
+  for lt_opt
+  do
+    case \"\$lt_opt\" in
+    --lt-debug) lt_option_debug=1 ;;
+    --lt-dump-script)
+        lt_dump_D=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%/[^/]*$%%'\`
+        test \"X\$lt_dump_D\" = \"X\$lt_script_arg0\" && lt_dump_D=.
+        lt_dump_F=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%^.*/%%'\`
+        cat \"\$lt_dump_D/\$lt_dump_F\"
+        exit 0
+      ;;
+    --lt-*)
+        \$ECHO \"Unrecognized --lt- option: '\$lt_opt'\" 1>&2
+        exit 1
+      ;;
+    esac
+  done
+
+  # Print the debug banner immediately:
+  if test -n \"\$lt_option_debug\"; then
+    echo \"${outputname}:${output}:\${LINENO}: libtool wrapper (GNU $PACKAGE$TIMESTAMP) $VERSION\" 1>&2
+  fi
+}
+
+# Used when --lt-debug. Prints its arguments to stdout
+# (redirection is the responsibility of the caller)
+func_lt_dump_args ()
+{
+  lt_dump_args_N=1;
+  for lt_arg
+  do
+    \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[\$lt_dump_args_N]: \$lt_arg\"
+    lt_dump_args_N=\`expr \$lt_dump_args_N + 1\`
+  done
+}
+
+# Core function for launching the target application
+func_exec_program_core ()
+{
+"
+  case $host in
+  # Backslashes separate directories on plain windows
+  *-*-mingw | *-*-os2* | *-cegcc*)
+    $ECHO "\
+      if test -n \"\$lt_option_debug\"; then
+        \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[0]: \$progdir\\\\\$program\" 1>&2
+        func_lt_dump_args \${1+\"\$@\"} 1>&2
+      fi
+      exec \"\$progdir\\\\\$program\" \${1+\"\$@\"}
+"
+    ;;
+
+  *)
+    $ECHO "\
+      if test -n \"\$lt_option_debug\"; then
+        \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[0]: \$progdir/\$program\" 1>&2
+        func_lt_dump_args \${1+\"\$@\"} 1>&2
+      fi
+      exec \"\$progdir/\$program\" \${1+\"\$@\"}
+"
+    ;;
+  esac
+  $ECHO "\
+      \$ECHO \"\$0: cannot exec \$program \$*\" 1>&2
+      exit 1
+}
+
+# A function to encapsulate launching the target application
+# Strips options in the --lt-* namespace from \$@ and
+# launches target application with the remaining arguments.
+func_exec_program ()
+{
+  case \" \$* \" in
+  *\\ --lt-*)
+    for lt_wr_arg
+    do
+      case \$lt_wr_arg in
+      --lt-*) ;;
+      *) set x \"\$@\" \"\$lt_wr_arg\"; shift;;
+      esac
+      shift
+    done ;;
+  esac
+  func_exec_program_core \${1+\"\$@\"}
+}
+
+  # Parse options
+  func_parse_lt_options \"\$0\" \${1+\"\$@\"}
+
+  # Find the directory that this script lives in.
+  thisdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*$%%'\`
+  test \"x\$thisdir\" = \"x\$file\" && thisdir=.
+
+  # Follow symbolic links until we get to the real thisdir.
+  file=\`ls -ld \"\$file\" | $SED -n 's/.*-> //p'\`
+  while test -n \"\$file\"; do
+    destdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*\$%%'\`
+
+    # If there was a directory component, then change thisdir.
+    if test \"x\$destdir\" != \"x\$file\"; then
+      case \"\$destdir\" in
+      [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;;
+      *) thisdir=\"\$thisdir/\$destdir\" ;;
+      esac
+    fi
+
+    file=\`\$ECHO \"\$file\" | $SED 's%^.*/%%'\`
+    file=\`ls -ld \"\$thisdir/\$file\" | $SED -n 's/.*-> //p'\`
+  done
+
+  # Usually 'no', except on cygwin/mingw when embedded into
+  # the cwrapper.
+  WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=$func_emit_wrapper_arg1
+  if test \"\$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR\" = \"yes\"; then
+    # special case for '.'
+    if test \"\$thisdir\" = \".\"; then
+      thisdir=\`pwd\`
+    fi
+    # remove .libs from thisdir
+    case \"\$thisdir\" in
+    *[\\\\/]$objdir ) thisdir=\`\$ECHO \"\$thisdir\" | $SED 's%[\\\\/][^\\\\/]*$%%'\` ;;
+    $objdir )   thisdir=. ;;
+    esac
+  fi
+
+  # Try to get the absolute directory name.
+  absdir=\`cd \"\$thisdir\" && pwd\`
+  test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
+
+	if test "$fast_install" = yes; then
+	  $ECHO "\
+  program=lt-'$outputname'$exeext
+  progdir=\"\$thisdir/$objdir\"
+
+  if test ! -f \"\$progdir/\$program\" ||
+     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | ${SED} 1q\`; \\
+       test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+    file=\"\$\$-\$program\"
+
+    if test ! -d \"\$progdir\"; then
+      $MKDIR \"\$progdir\"
+    else
+      $RM \"\$progdir/\$file\"
+    fi"
+
+	  $ECHO "\
+
+    # relink executable if necessary
+    if test -n \"\$relink_command\"; then
+      if relink_command_output=\`eval \$relink_command 2>&1\`; then :
+      else
+	$ECHO \"\$relink_command_output\" >&2
+	$RM \"\$progdir/\$file\"
+	exit 1
+      fi
+    fi
+
+    $MV \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+    { $RM \"\$progdir/\$program\";
+      $MV \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+    $RM \"\$progdir/\$file\"
+  fi"
+	else
+	  $ECHO "\
+  program='$outputname'
+  progdir=\"\$thisdir/$objdir\"
+"
+	fi
+
+	$ECHO "\
+
+  if test -f \"\$progdir/\$program\"; then"
+
+	# fixup the dll searchpath if we need to.
+	#
+	# Fix the DLL searchpath if we need to.  Do this before prepending
+	# to shlibpath, because on Windows, both are PATH and uninstalled
+	# libraries must come first.
+	if test -n "$dllsearchpath"; then
+	  $ECHO "\
+    # Add the dll search path components to the executable PATH
+    PATH=$dllsearchpath:\$PATH
+"
+	fi
+
+	# Export our shlibpath_var if we have one.
+	if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+	  $ECHO "\
+    # Add our own library path to $shlibpath_var
+    $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
+
+    # Some systems cannot cope with colon-terminated $shlibpath_var
+    # The second colon is a workaround for a bug in BeOS R4 sed
+    $shlibpath_var=\`\$ECHO \"\$$shlibpath_var\" | $SED 's/::*\$//'\`
+
+    export $shlibpath_var
+"
+	fi
+
+	$ECHO "\
+    if test \"\$libtool_execute_magic\" != \"$magic\"; then
+      # Run the actual program with our arguments.
+      func_exec_program \${1+\"\$@\"}
+    fi
+  else
+    # The program doesn't exist.
+    \$ECHO \"\$0: error: \\\`\$progdir/\$program' does not exist\" 1>&2
+    \$ECHO \"This script is just a wrapper for \$program.\" 1>&2
+    \$ECHO \"See the $PACKAGE documentation for more information.\" 1>&2
+    exit 1
+  fi
+fi\
+"
+}
+
+
+# func_emit_cwrapperexe_src
+# emit the source code for a wrapper executable on stdout
+# Must ONLY be called from within func_mode_link because
+# it depends on a number of variable set therein.
+func_emit_cwrapperexe_src ()
+{
+	cat <<EOF
+
+/* $cwrappersource - temporary wrapper executable for $objdir/$outputname
+   Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+
+   The $output program cannot be directly executed until all the libtool
+   libraries that it depends on are installed.
+
+   This wrapper executable should never be moved out of the build directory.
+   If it is, it will not operate correctly.
+*/
+EOF
+	    cat <<"EOF"
+#ifdef _MSC_VER
+# define _CRT_SECURE_NO_DEPRECATE 1
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+# include <direct.h>
+# include <process.h>
+# include <io.h>
+#else
+# include <unistd.h>
+# include <stdint.h>
+# ifdef __CYGWIN__
+#  include <io.h>
+# endif
+#endif
+#include <malloc.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+/* declarations of non-ANSI functions */
+#if defined(__MINGW32__)
+# ifdef __STRICT_ANSI__
+int _putenv (const char *);
+# endif
+#elif defined(__CYGWIN__)
+# ifdef __STRICT_ANSI__
+char *realpath (const char *, char *);
+int putenv (char *);
+int setenv (const char *, const char *, int);
+# endif
+/* #elif defined (other platforms) ... */
+#endif
+
+/* portability defines, excluding path handling macros */
+#if defined(_MSC_VER)
+# define setmode _setmode
+# define stat    _stat
+# define chmod   _chmod
+# define getcwd  _getcwd
+# define putenv  _putenv
+# define S_IXUSR _S_IEXEC
+# ifndef _INTPTR_T_DEFINED
+#  define _INTPTR_T_DEFINED
+#  define intptr_t int
+# endif
+#elif defined(__MINGW32__)
+# define setmode _setmode
+# define stat    _stat
+# define chmod   _chmod
+# define getcwd  _getcwd
+# define putenv  _putenv
+#elif defined(__CYGWIN__)
+# define HAVE_SETENV
+# define FOPEN_WB "wb"
+/* #elif defined (other platforms) ... */
+#endif
+
+#if defined(PATH_MAX)
+# define LT_PATHMAX PATH_MAX
+#elif defined(MAXPATHLEN)
+# define LT_PATHMAX MAXPATHLEN
+#else
+# define LT_PATHMAX 1024
+#endif
+
+#ifndef S_IXOTH
+# define S_IXOTH 0
+#endif
+#ifndef S_IXGRP
+# define S_IXGRP 0
+#endif
+
+/* path handling portability macros */
+#ifndef DIR_SEPARATOR
+# define DIR_SEPARATOR '/'
+# define PATH_SEPARATOR ':'
+#endif
+
+#if defined (_WIN32) || defined (__MSDOS__) || defined (__DJGPP__) || \
+  defined (__OS2__)
+# define HAVE_DOS_BASED_FILE_SYSTEM
+# define FOPEN_WB "wb"
+# ifndef DIR_SEPARATOR_2
+#  define DIR_SEPARATOR_2 '\\'
+# endif
+# ifndef PATH_SEPARATOR_2
+#  define PATH_SEPARATOR_2 ';'
+# endif
+#endif
+
+#ifndef DIR_SEPARATOR_2
+# define IS_DIR_SEPARATOR(ch) ((ch) == DIR_SEPARATOR)
+#else /* DIR_SEPARATOR_2 */
+# define IS_DIR_SEPARATOR(ch) \
+	(((ch) == DIR_SEPARATOR) || ((ch) == DIR_SEPARATOR_2))
+#endif /* DIR_SEPARATOR_2 */
+
+#ifndef PATH_SEPARATOR_2
+# define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR)
+#else /* PATH_SEPARATOR_2 */
+# define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR_2)
+#endif /* PATH_SEPARATOR_2 */
+
+#ifndef FOPEN_WB
+# define FOPEN_WB "w"
+#endif
+#ifndef _O_BINARY
+# define _O_BINARY 0
+#endif
+
+#define XMALLOC(type, num)      ((type *) xmalloc ((num) * sizeof(type)))
+#define XFREE(stale) do { \
+  if (stale) { free ((void *) stale); stale = 0; } \
+} while (0)
+
+#if defined(LT_DEBUGWRAPPER)
+static int lt_debug = 1;
+#else
+static int lt_debug = 0;
+#endif
+
+const char *program_name = "libtool-wrapper"; /* in case xstrdup fails */
+
+void *xmalloc (size_t num);
+char *xstrdup (const char *string);
+const char *base_name (const char *name);
+char *find_executable (const char *wrapper);
+char *chase_symlinks (const char *pathspec);
+int make_executable (const char *path);
+int check_executable (const char *path);
+char *strendzap (char *str, const char *pat);
+void lt_debugprintf (const char *file, int line, const char *fmt, ...);
+void lt_fatal (const char *file, int line, const char *message, ...);
+static const char *nonnull (const char *s);
+static const char *nonempty (const char *s);
+void lt_setenv (const char *name, const char *value);
+char *lt_extend_str (const char *orig_value, const char *add, int to_end);
+void lt_update_exe_path (const char *name, const char *value);
+void lt_update_lib_path (const char *name, const char *value);
+char **prepare_spawn (char **argv);
+void lt_dump_script (FILE *f);
+EOF
+
+	    cat <<EOF
+volatile const char * MAGIC_EXE = "$magic_exe";
+const char * LIB_PATH_VARNAME = "$shlibpath_var";
+EOF
+
+	    if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+              func_to_host_path "$temp_rpath"
+	      cat <<EOF
+const char * LIB_PATH_VALUE   = "$func_to_host_path_result";
+EOF
+	    else
+	      cat <<"EOF"
+const char * LIB_PATH_VALUE   = "";
+EOF
+	    fi
+
+	    if test -n "$dllsearchpath"; then
+              func_to_host_path "$dllsearchpath:"
+	      cat <<EOF
+const char * EXE_PATH_VARNAME = "PATH";
+const char * EXE_PATH_VALUE   = "$func_to_host_path_result";
+EOF
+	    else
+	      cat <<"EOF"
+const char * EXE_PATH_VARNAME = "";
+const char * EXE_PATH_VALUE   = "";
+EOF
+	    fi
+
+	    if test "$fast_install" = yes; then
+	      cat <<EOF
+const char * TARGET_PROGRAM_NAME = "lt-$outputname"; /* hopefully, no .exe */
+EOF
+	    else
+	      cat <<EOF
+const char * TARGET_PROGRAM_NAME = "$outputname"; /* hopefully, no .exe */
+EOF
+	    fi
+
+
+	    cat <<"EOF"
+
+#define LTWRAPPER_OPTION_PREFIX         "--lt-"
+
+static const char *ltwrapper_option_prefix = LTWRAPPER_OPTION_PREFIX;
+static const char *dumpscript_opt       = LTWRAPPER_OPTION_PREFIX "dump-script";
+static const char *debug_opt            = LTWRAPPER_OPTION_PREFIX "debug";
+
+int
+main (int argc, char *argv[])
+{
+  char **newargz;
+  int  newargc;
+  char *tmp_pathspec;
+  char *actual_cwrapper_path;
+  char *actual_cwrapper_name;
+  char *target_name;
+  char *lt_argv_zero;
+  intptr_t rval = 127;
+
+  int i;
+
+  program_name = (char *) xstrdup (base_name (argv[0]));
+  newargz = XMALLOC (char *, argc + 1);
+
+  /* very simple arg parsing; don't want to rely on getopt
+   * also, copy all non cwrapper options to newargz, except
+   * argz[0], which is handled differently
+   */
+  newargc=0;
+  for (i = 1; i < argc; i++)
+    {
+      if (strcmp (argv[i], dumpscript_opt) == 0)
+	{
+EOF
+	    case "$host" in
+	      *mingw* | *cygwin* )
+		# make stdout use "unix" line endings
+		echo "          setmode(1,_O_BINARY);"
+		;;
+	      esac
+
+	    cat <<"EOF"
+	  lt_dump_script (stdout);
+	  return 0;
+	}
+      if (strcmp (argv[i], debug_opt) == 0)
+	{
+          lt_debug = 1;
+          continue;
+	}
+      if (strcmp (argv[i], ltwrapper_option_prefix) == 0)
+        {
+          /* however, if there is an option in the LTWRAPPER_OPTION_PREFIX
+             namespace, but it is not one of the ones we know about and
+             have already dealt with, above (inluding dump-script), then
+             report an error. Otherwise, targets might begin to believe
+             they are allowed to use options in the LTWRAPPER_OPTION_PREFIX
+             namespace. The first time any user complains about this, we'll
+             need to make LTWRAPPER_OPTION_PREFIX a configure-time option
+             or a configure.ac-settable value.
+           */
+          lt_fatal (__FILE__, __LINE__,
+		    "unrecognized %s option: '%s'",
+                    ltwrapper_option_prefix, argv[i]);
+        }
+      /* otherwise ... */
+      newargz[++newargc] = xstrdup (argv[i]);
+    }
+  newargz[++newargc] = NULL;
+
+EOF
+	    cat <<EOF
+  /* The GNU banner must be the first non-error debug message */
+  lt_debugprintf (__FILE__, __LINE__, "libtool wrapper (GNU $PACKAGE$TIMESTAMP) $VERSION\n");
+EOF
+	    cat <<"EOF"
+  lt_debugprintf (__FILE__, __LINE__, "(main) argv[0]: %s\n", argv[0]);
+  lt_debugprintf (__FILE__, __LINE__, "(main) program_name: %s\n", program_name);
+
+  tmp_pathspec = find_executable (argv[0]);
+  if (tmp_pathspec == NULL)
+    lt_fatal (__FILE__, __LINE__, "couldn't find %s", argv[0]);
+  lt_debugprintf (__FILE__, __LINE__,
+                  "(main) found exe (before symlink chase) at: %s\n",
+		  tmp_pathspec);
+
+  actual_cwrapper_path = chase_symlinks (tmp_pathspec);
+  lt_debugprintf (__FILE__, __LINE__,
+                  "(main) found exe (after symlink chase) at: %s\n",
+		  actual_cwrapper_path);
+  XFREE (tmp_pathspec);
+
+  actual_cwrapper_name = xstrdup (base_name (actual_cwrapper_path));
+  strendzap (actual_cwrapper_path, actual_cwrapper_name);
+
+  /* wrapper name transforms */
+  strendzap (actual_cwrapper_name, ".exe");
+  tmp_pathspec = lt_extend_str (actual_cwrapper_name, ".exe", 1);
+  XFREE (actual_cwrapper_name);
+  actual_cwrapper_name = tmp_pathspec;
+  tmp_pathspec = 0;
+
+  /* target_name transforms -- use actual target program name; might have lt- prefix */
+  target_name = xstrdup (base_name (TARGET_PROGRAM_NAME));
+  strendzap (target_name, ".exe");
+  tmp_pathspec = lt_extend_str (target_name, ".exe", 1);
+  XFREE (target_name);
+  target_name = tmp_pathspec;
+  tmp_pathspec = 0;
+
+  lt_debugprintf (__FILE__, __LINE__,
+		  "(main) libtool target name: %s\n",
+		  target_name);
+EOF
+
+	    cat <<EOF
+  newargz[0] =
+    XMALLOC (char, (strlen (actual_cwrapper_path) +
+		    strlen ("$objdir") + 1 + strlen (actual_cwrapper_name) + 1));
+  strcpy (newargz[0], actual_cwrapper_path);
+  strcat (newargz[0], "$objdir");
+  strcat (newargz[0], "/");
+EOF
+
+	    cat <<"EOF"
+  /* stop here, and copy so we don't have to do this twice */
+  tmp_pathspec = xstrdup (newargz[0]);
+
+  /* do NOT want the lt- prefix here, so use actual_cwrapper_name */
+  strcat (newargz[0], actual_cwrapper_name);
+
+  /* DO want the lt- prefix here if it exists, so use target_name */
+  lt_argv_zero = lt_extend_str (tmp_pathspec, target_name, 1);
+  XFREE (tmp_pathspec);
+  tmp_pathspec = NULL;
+EOF
+
+	    case $host_os in
+	      mingw*)
+	    cat <<"EOF"
+  {
+    char* p;
+    while ((p = strchr (newargz[0], '\\')) != NULL)
+      {
+	*p = '/';
+      }
+    while ((p = strchr (lt_argv_zero, '\\')) != NULL)
+      {
+	*p = '/';
+      }
+  }
+EOF
+	    ;;
+	    esac
+
+	    cat <<"EOF"
+  XFREE (target_name);
+  XFREE (actual_cwrapper_path);
+  XFREE (actual_cwrapper_name);
+
+  lt_setenv ("BIN_SH", "xpg4"); /* for Tru64 */
+  lt_setenv ("DUALCASE", "1");  /* for MSK sh */
+  /* Update the DLL searchpath.  EXE_PATH_VALUE ($dllsearchpath) must
+     be prepended before (that is, appear after) LIB_PATH_VALUE ($temp_rpath)
+     because on Windows, both *_VARNAMEs are PATH but uninstalled
+     libraries must come first. */
+  lt_update_exe_path (EXE_PATH_VARNAME, EXE_PATH_VALUE);
+  lt_update_lib_path (LIB_PATH_VARNAME, LIB_PATH_VALUE);
+
+  lt_debugprintf (__FILE__, __LINE__, "(main) lt_argv_zero: %s\n",
+		  nonnull (lt_argv_zero));
+  for (i = 0; i < newargc; i++)
+    {
+      lt_debugprintf (__FILE__, __LINE__, "(main) newargz[%d]: %s\n",
+		      i, nonnull (newargz[i]));
+    }
+
+EOF
+
+	    case $host_os in
+	      mingw*)
+		cat <<"EOF"
+  /* execv doesn't actually work on mingw as expected on unix */
+  newargz = prepare_spawn (newargz);
+  rval = _spawnv (_P_WAIT, lt_argv_zero, (const char * const *) newargz);
+  if (rval == -1)
+    {
+      /* failed to start process */
+      lt_debugprintf (__FILE__, __LINE__,
+		      "(main) failed to launch target \"%s\": %s\n",
+		      lt_argv_zero, nonnull (strerror (errno)));
+      return 127;
+    }
+  return rval;
+EOF
+		;;
+	      *)
+		cat <<"EOF"
+  execv (lt_argv_zero, newargz);
+  return rval; /* =127, but avoids unused variable warning */
+EOF
+		;;
+	    esac
+
+	    cat <<"EOF"
+}
+
+void *
+xmalloc (size_t num)
+{
+  void *p = (void *) malloc (num);
+  if (!p)
+    lt_fatal (__FILE__, __LINE__, "memory exhausted");
+
+  return p;
+}
+
+char *
+xstrdup (const char *string)
+{
+  return string ? strcpy ((char *) xmalloc (strlen (string) + 1),
+			  string) : NULL;
+}
+
+const char *
+base_name (const char *name)
+{
+  const char *base;
+
+#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+  /* Skip over the disk name in MSDOS pathnames. */
+  if (isalpha ((unsigned char) name[0]) && name[1] == ':')
+    name += 2;
+#endif
+
+  for (base = name; *name; name++)
+    if (IS_DIR_SEPARATOR (*name))
+      base = name + 1;
+  return base;
+}
+
+int
+check_executable (const char *path)
+{
+  struct stat st;
+
+  lt_debugprintf (__FILE__, __LINE__, "(check_executable): %s\n",
+                  nonempty (path));
+  if ((!path) || (!*path))
+    return 0;
+
+  if ((stat (path, &st) >= 0)
+      && (st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
+    return 1;
+  else
+    return 0;
+}
+
+int
+make_executable (const char *path)
+{
+  int rval = 0;
+  struct stat st;
+
+  lt_debugprintf (__FILE__, __LINE__, "(make_executable): %s\n",
+                  nonempty (path));
+  if ((!path) || (!*path))
+    return 0;
+
+  if (stat (path, &st) >= 0)
+    {
+      rval = chmod (path, st.st_mode | S_IXOTH | S_IXGRP | S_IXUSR);
+    }
+  return rval;
+}
+
+/* Searches for the full path of the wrapper.  Returns
+   newly allocated full path name if found, NULL otherwise
+   Does not chase symlinks, even on platforms that support them.
+*/
+char *
+find_executable (const char *wrapper)
+{
+  int has_slash = 0;
+  const char *p;
+  const char *p_next;
+  /* static buffer for getcwd */
+  char tmp[LT_PATHMAX + 1];
+  int tmp_len;
+  char *concat_name;
+
+  lt_debugprintf (__FILE__, __LINE__, "(find_executable): %s\n",
+                  nonempty (wrapper));
+
+  if ((wrapper == NULL) || (*wrapper == '\0'))
+    return NULL;
+
+  /* Absolute path? */
+#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+  if (isalpha ((unsigned char) wrapper[0]) && wrapper[1] == ':')
+    {
+      concat_name = xstrdup (wrapper);
+      if (check_executable (concat_name))
+	return concat_name;
+      XFREE (concat_name);
+    }
+  else
+    {
+#endif
+      if (IS_DIR_SEPARATOR (wrapper[0]))
+	{
+	  concat_name = xstrdup (wrapper);
+	  if (check_executable (concat_name))
+	    return concat_name;
+	  XFREE (concat_name);
+	}
+#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+    }
+#endif
+
+  for (p = wrapper; *p; p++)
+    if (*p == '/')
+      {
+	has_slash = 1;
+	break;
+      }
+  if (!has_slash)
+    {
+      /* no slashes; search PATH */
+      const char *path = getenv ("PATH");
+      if (path != NULL)
+	{
+	  for (p = path; *p; p = p_next)
+	    {
+	      const char *q;
+	      size_t p_len;
+	      for (q = p; *q; q++)
+		if (IS_PATH_SEPARATOR (*q))
+		  break;
+	      p_len = q - p;
+	      p_next = (*q == '\0' ? q : q + 1);
+	      if (p_len == 0)
+		{
+		  /* empty path: current directory */
+		  if (getcwd (tmp, LT_PATHMAX) == NULL)
+		    lt_fatal (__FILE__, __LINE__, "getcwd failed: %s",
+                              nonnull (strerror (errno)));
+		  tmp_len = strlen (tmp);
+		  concat_name =
+		    XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1);
+		  memcpy (concat_name, tmp, tmp_len);
+		  concat_name[tmp_len] = '/';
+		  strcpy (concat_name + tmp_len + 1, wrapper);
+		}
+	      else
+		{
+		  concat_name =
+		    XMALLOC (char, p_len + 1 + strlen (wrapper) + 1);
+		  memcpy (concat_name, p, p_len);
+		  concat_name[p_len] = '/';
+		  strcpy (concat_name + p_len + 1, wrapper);
+		}
+	      if (check_executable (concat_name))
+		return concat_name;
+	      XFREE (concat_name);
+	    }
+	}
+      /* not found in PATH; assume curdir */
+    }
+  /* Relative path | not found in path: prepend cwd */
+  if (getcwd (tmp, LT_PATHMAX) == NULL)
+    lt_fatal (__FILE__, __LINE__, "getcwd failed: %s",
+              nonnull (strerror (errno)));
+  tmp_len = strlen (tmp);
+  concat_name = XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1);
+  memcpy (concat_name, tmp, tmp_len);
+  concat_name[tmp_len] = '/';
+  strcpy (concat_name + tmp_len + 1, wrapper);
+
+  if (check_executable (concat_name))
+    return concat_name;
+  XFREE (concat_name);
+  return NULL;
+}
+
+char *
+chase_symlinks (const char *pathspec)
+{
+#ifndef S_ISLNK
+  return xstrdup (pathspec);
+#else
+  char buf[LT_PATHMAX];
+  struct stat s;
+  char *tmp_pathspec = xstrdup (pathspec);
+  char *p;
+  int has_symlinks = 0;
+  while (strlen (tmp_pathspec) && !has_symlinks)
+    {
+      lt_debugprintf (__FILE__, __LINE__,
+		      "checking path component for symlinks: %s\n",
+		      tmp_pathspec);
+      if (lstat (tmp_pathspec, &s) == 0)
+	{
+	  if (S_ISLNK (s.st_mode) != 0)
+	    {
+	      has_symlinks = 1;
+	      break;
+	    }
+
+	  /* search backwards for last DIR_SEPARATOR */
+	  p = tmp_pathspec + strlen (tmp_pathspec) - 1;
+	  while ((p > tmp_pathspec) && (!IS_DIR_SEPARATOR (*p)))
+	    p--;
+	  if ((p == tmp_pathspec) && (!IS_DIR_SEPARATOR (*p)))
+	    {
+	      /* no more DIR_SEPARATORS left */
+	      break;
+	    }
+	  *p = '\0';
+	}
+      else
+	{
+	  lt_fatal (__FILE__, __LINE__,
+		    "error accessing file \"%s\": %s",
+		    tmp_pathspec, nonnull (strerror (errno)));
+	}
+    }
+  XFREE (tmp_pathspec);
+
+  if (!has_symlinks)
+    {
+      return xstrdup (pathspec);
+    }
+
+  tmp_pathspec = realpath (pathspec, buf);
+  if (tmp_pathspec == 0)
+    {
+      lt_fatal (__FILE__, __LINE__,
+		"could not follow symlinks for %s", pathspec);
+    }
+  return xstrdup (tmp_pathspec);
+#endif
+}
+
+char *
+strendzap (char *str, const char *pat)
+{
+  size_t len, patlen;
+
+  assert (str != NULL);
+  assert (pat != NULL);
+
+  len = strlen (str);
+  patlen = strlen (pat);
+
+  if (patlen <= len)
+    {
+      str += len - patlen;
+      if (strcmp (str, pat) == 0)
+	*str = '\0';
+    }
+  return str;
+}
+
+void
+lt_debugprintf (const char *file, int line, const char *fmt, ...)
+{
+  va_list args;
+  if (lt_debug)
+    {
+      (void) fprintf (stderr, "%s:%s:%d: ", program_name, file, line);
+      va_start (args, fmt);
+      (void) vfprintf (stderr, fmt, args);
+      va_end (args);
+    }
+}
+
+static void
+lt_error_core (int exit_status, const char *file,
+	       int line, const char *mode,
+	       const char *message, va_list ap)
+{
+  fprintf (stderr, "%s:%s:%d: %s: ", program_name, file, line, mode);
+  vfprintf (stderr, message, ap);
+  fprintf (stderr, ".\n");
+
+  if (exit_status >= 0)
+    exit (exit_status);
+}
+
+void
+lt_fatal (const char *file, int line, const char *message, ...)
+{
+  va_list ap;
+  va_start (ap, message);
+  lt_error_core (EXIT_FAILURE, file, line, "FATAL", message, ap);
+  va_end (ap);
+}
+
+static const char *
+nonnull (const char *s)
+{
+  return s ? s : "(null)";
+}
+
+static const char *
+nonempty (const char *s)
+{
+  return (s && !*s) ? "(empty)" : nonnull (s);
+}
+
+void
+lt_setenv (const char *name, const char *value)
+{
+  lt_debugprintf (__FILE__, __LINE__,
+		  "(lt_setenv) setting '%s' to '%s'\n",
+                  nonnull (name), nonnull (value));
+  {
+#ifdef HAVE_SETENV
+    /* always make a copy, for consistency with !HAVE_SETENV */
+    char *str = xstrdup (value);
+    setenv (name, str, 1);
+#else
+    int len = strlen (name) + 1 + strlen (value) + 1;
+    char *str = XMALLOC (char, len);
+    sprintf (str, "%s=%s", name, value);
+    if (putenv (str) != EXIT_SUCCESS)
+      {
+        XFREE (str);
+      }
+#endif
+  }
+}
+
+char *
+lt_extend_str (const char *orig_value, const char *add, int to_end)
+{
+  char *new_value;
+  if (orig_value && *orig_value)
+    {
+      int orig_value_len = strlen (orig_value);
+      int add_len = strlen (add);
+      new_value = XMALLOC (char, add_len + orig_value_len + 1);
+      if (to_end)
+        {
+          strcpy (new_value, orig_value);
+          strcpy (new_value + orig_value_len, add);
+        }
+      else
+        {
+          strcpy (new_value, add);
+          strcpy (new_value + add_len, orig_value);
+        }
+    }
+  else
+    {
+      new_value = xstrdup (add);
+    }
+  return new_value;
+}
+
+void
+lt_update_exe_path (const char *name, const char *value)
+{
+  lt_debugprintf (__FILE__, __LINE__,
+		  "(lt_update_exe_path) modifying '%s' by prepending '%s'\n",
+                  nonnull (name), nonnull (value));
+
+  if (name && *name && value && *value)
+    {
+      char *new_value = lt_extend_str (getenv (name), value, 0);
+      /* some systems can't cope with a ':'-terminated path #' */
+      int len = strlen (new_value);
+      while (((len = strlen (new_value)) > 0) && IS_PATH_SEPARATOR (new_value[len-1]))
+        {
+          new_value[len-1] = '\0';
+        }
+      lt_setenv (name, new_value);
+      XFREE (new_value);
+    }
+}
+
+void
+lt_update_lib_path (const char *name, const char *value)
+{
+  lt_debugprintf (__FILE__, __LINE__,
+		  "(lt_update_lib_path) modifying '%s' by prepending '%s'\n",
+                  nonnull (name), nonnull (value));
+
+  if (name && *name && value && *value)
+    {
+      char *new_value = lt_extend_str (getenv (name), value, 0);
+      lt_setenv (name, new_value);
+      XFREE (new_value);
+    }
+}
+
+EOF
+	    case $host_os in
+	      mingw*)
+		cat <<"EOF"
+
+/* Prepares an argument vector before calling spawn().
+   Note that spawn() does not by itself call the command interpreter
+     (getenv ("COMSPEC") != NULL ? getenv ("COMSPEC") :
+      ({ OSVERSIONINFO v; v.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+         GetVersionEx(&v);
+         v.dwPlatformId == VER_PLATFORM_WIN32_NT;
+      }) ? "cmd.exe" : "command.com").
+   Instead it simply concatenates the arguments, separated by ' ', and calls
+   CreateProcess().  We must quote the arguments since Win32 CreateProcess()
+   interprets characters like ' ', '\t', '\\', '"' (but not '<' and '>') in a
+   special way:
+   - Space and tab are interpreted as delimiters. They are not treated as
+     delimiters if they are surrounded by double quotes: "...".
+   - Unescaped double quotes are removed from the input. Their only effect is
+     that within double quotes, space and tab are treated like normal
+     characters.
+   - Backslashes not followed by double quotes are not special.
+   - But 2*n+1 backslashes followed by a double quote become
+     n backslashes followed by a double quote (n >= 0):
+       \" -> "
+       \\\" -> \"
+       \\\\\" -> \\"
+ */
+#define SHELL_SPECIAL_CHARS "\"\\ \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
+#define SHELL_SPACE_CHARS " \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
+char **
+prepare_spawn (char **argv)
+{
+  size_t argc;
+  char **new_argv;
+  size_t i;
+
+  /* Count number of arguments.  */
+  for (argc = 0; argv[argc] != NULL; argc++)
+    ;
+
+  /* Allocate new argument vector.  */
+  new_argv = XMALLOC (char *, argc + 1);
+
+  /* Put quoted arguments into the new argument vector.  */
+  for (i = 0; i < argc; i++)
+    {
+      const char *string = argv[i];
+
+      if (string[0] == '\0')
+	new_argv[i] = xstrdup ("\"\"");
+      else if (strpbrk (string, SHELL_SPECIAL_CHARS) != NULL)
+	{
+	  int quote_around = (strpbrk (string, SHELL_SPACE_CHARS) != NULL);
+	  size_t length;
+	  unsigned int backslashes;
+	  const char *s;
+	  char *quoted_string;
+	  char *p;
+
+	  length = 0;
+	  backslashes = 0;
+	  if (quote_around)
+	    length++;
+	  for (s = string; *s != '\0'; s++)
+	    {
+	      char c = *s;
+	      if (c == '"')
+		length += backslashes + 1;
+	      length++;
+	      if (c == '\\')
+		backslashes++;
+	      else
+		backslashes = 0;
+	    }
+	  if (quote_around)
+	    length += backslashes + 1;
+
+	  quoted_string = XMALLOC (char, length + 1);
+
+	  p = quoted_string;
+	  backslashes = 0;
+	  if (quote_around)
+	    *p++ = '"';
+	  for (s = string; *s != '\0'; s++)
+	    {
+	      char c = *s;
+	      if (c == '"')
+		{
+		  unsigned int j;
+		  for (j = backslashes + 1; j > 0; j--)
+		    *p++ = '\\';
+		}
+	      *p++ = c;
+	      if (c == '\\')
+		backslashes++;
+	      else
+		backslashes = 0;
+	    }
+	  if (quote_around)
+	    {
+	      unsigned int j;
+	      for (j = backslashes; j > 0; j--)
+		*p++ = '\\';
+	      *p++ = '"';
+	    }
+	  *p = '\0';
+
+	  new_argv[i] = quoted_string;
+	}
+      else
+	new_argv[i] = (char *) string;
+    }
+  new_argv[argc] = NULL;
+
+  return new_argv;
+}
+EOF
+		;;
+	    esac
+
+            cat <<"EOF"
+void lt_dump_script (FILE* f)
+{
+EOF
+	    func_emit_wrapper yes |
+	      $SED -n -e '
+s/^\(.\{79\}\)\(..*\)/\1\
+\2/
+h
+s/\([\\"]\)/\\\1/g
+s/$/\\n/
+s/\([^\n]*\).*/  fputs ("\1", f);/p
+g
+D'
+            cat <<"EOF"
+}
+EOF
+}
+# end: func_emit_cwrapperexe_src
+
+# func_win32_import_lib_p ARG
+# True if ARG is an import lib, as indicated by $file_magic_cmd
+func_win32_import_lib_p ()
+{
+    $opt_debug
+    case `eval $file_magic_cmd \"\$1\" 2>/dev/null | $SED -e 10q` in
+    *import*) : ;;
+    *) false ;;
+    esac
+}
+
+# func_mode_link arg...
+func_mode_link ()
+{
+    $opt_debug
+    case $host in
+    *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
+      # It is impossible to link a dll without this setting, and
+      # we shouldn't force the makefile maintainer to figure out
+      # which system we are compiling for in order to pass an extra
+      # flag for every libtool invocation.
+      # allow_undefined=no
+
+      # FIXME: Unfortunately, there are problems with the above when trying
+      # to make a dll which has undefined symbols, in which case not
+      # even a static library is built.  For now, we need to specify
+      # -no-undefined on the libtool link line when we can be certain
+      # that all symbols are satisfied, otherwise we get a static library.
+      allow_undefined=yes
+      ;;
+    *)
+      allow_undefined=yes
+      ;;
+    esac
+    libtool_args=$nonopt
+    base_compile="$nonopt $@"
+    compile_command=$nonopt
+    finalize_command=$nonopt
+
+    compile_rpath=
+    finalize_rpath=
+    compile_shlibpath=
+    finalize_shlibpath=
+    convenience=
+    old_convenience=
+    deplibs=
+    old_deplibs=
+    compiler_flags=
+    linker_flags=
+    dllsearchpath=
+    lib_search_path=`pwd`
+    inst_prefix_dir=
+    new_inherited_linker_flags=
+
+    avoid_version=no
+    bindir=
+    dlfiles=
+    dlprefiles=
+    dlself=no
+    export_dynamic=no
+    export_symbols=
+    export_symbols_regex=
+    generated=
+    libobjs=
+    ltlibs=
+    module=no
+    no_install=no
+    objs=
+    non_pic_objects=
+    precious_files_regex=
+    prefer_static_libs=no
+    preload=no
+    prev=
+    prevarg=
+    release=
+    rpath=
+    xrpath=
+    perm_rpath=
+    temp_rpath=
+    thread_safe=no
+    vinfo=
+    vinfo_number=no
+    weak_libs=
+    single_module="${wl}-single_module"
+    func_infer_tag $base_compile
+
+    # We need to know -static, to get the right output filenames.
+    for arg
+    do
+      case $arg in
+      -shared)
+	test "$build_libtool_libs" != yes && \
+	  func_fatal_configuration "can not build a shared library"
+	build_old_libs=no
+	break
+	;;
+      -all-static | -static | -static-libtool-libs)
+	case $arg in
+	-all-static)
+	  if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+	    func_warning "complete static linking is impossible in this configuration"
+	  fi
+	  if test -n "$link_static_flag"; then
+	    dlopen_self=$dlopen_self_static
+	  fi
+	  prefer_static_libs=yes
+	  ;;
+	-static)
+	  if test -z "$pic_flag" && test -n "$link_static_flag"; then
+	    dlopen_self=$dlopen_self_static
+	  fi
+	  prefer_static_libs=built
+	  ;;
+	-static-libtool-libs)
+	  if test -z "$pic_flag" && test -n "$link_static_flag"; then
+	    dlopen_self=$dlopen_self_static
+	  fi
+	  prefer_static_libs=yes
+	  ;;
+	esac
+	build_libtool_libs=no
+	build_old_libs=yes
+	break
+	;;
+      esac
+    done
+
+    # See if our shared archives depend on static archives.
+    test -n "$old_archive_from_new_cmds" && build_old_libs=yes
+
+    # Go through the arguments, transforming them on the way.
+    while test "$#" -gt 0; do
+      arg="$1"
+      shift
+      func_quote_for_eval "$arg"
+      qarg=$func_quote_for_eval_unquoted_result
+      func_append libtool_args " $func_quote_for_eval_result"
+
+      # If the previous option needs an argument, assign it.
+      if test -n "$prev"; then
+	case $prev in
+	output)
+	  func_append compile_command " @OUTPUT@"
+	  func_append finalize_command " @OUTPUT@"
+	  ;;
+	esac
+
+	case $prev in
+	bindir)
+	  bindir="$arg"
+	  prev=
+	  continue
+	  ;;
+	dlfiles|dlprefiles)
+	  if test "$preload" = no; then
+	    # Add the symbol object into the linking commands.
+	    func_append compile_command " @SYMFILE@"
+	    func_append finalize_command " @SYMFILE@"
+	    preload=yes
+	  fi
+	  case $arg in
+	  *.la | *.lo) ;;  # We handle these cases below.
+	  force)
+	    if test "$dlself" = no; then
+	      dlself=needless
+	      export_dynamic=yes
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  self)
+	    if test "$prev" = dlprefiles; then
+	      dlself=yes
+	    elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+	      dlself=yes
+	    else
+	      dlself=needless
+	      export_dynamic=yes
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  *)
+	    if test "$prev" = dlfiles; then
+	      func_append dlfiles " $arg"
+	    else
+	      func_append dlprefiles " $arg"
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  esac
+	  ;;
+	expsyms)
+	  export_symbols="$arg"
+	  test -f "$arg" \
+	    || func_fatal_error "symbol file \`$arg' does not exist"
+	  prev=
+	  continue
+	  ;;
+	expsyms_regex)
+	  export_symbols_regex="$arg"
+	  prev=
+	  continue
+	  ;;
+	framework)
+	  case $host in
+	    *-*-darwin*)
+	      case "$deplibs " in
+		*" $qarg.ltframework "*) ;;
+		*) func_append deplibs " $qarg.ltframework" # this is fixed later
+		   ;;
+	      esac
+	      ;;
+	  esac
+	  prev=
+	  continue
+	  ;;
+	inst_prefix)
+	  inst_prefix_dir="$arg"
+	  prev=
+	  continue
+	  ;;
+	objectlist)
+	  if test -f "$arg"; then
+	    save_arg=$arg
+	    moreargs=
+	    for fil in `cat "$save_arg"`
+	    do
+#	      func_append moreargs " $fil"
+	      arg=$fil
+	      # A libtool-controlled object.
+
+	      # Check to see that this really is a libtool object.
+	      if func_lalib_unsafe_p "$arg"; then
+		pic_object=
+		non_pic_object=
+
+		# Read the .lo file
+		func_source "$arg"
+
+		if test -z "$pic_object" ||
+		   test -z "$non_pic_object" ||
+		   test "$pic_object" = none &&
+		   test "$non_pic_object" = none; then
+		  func_fatal_error "cannot find name of object for \`$arg'"
+		fi
+
+		# Extract subdirectory from the argument.
+		func_dirname "$arg" "/" ""
+		xdir="$func_dirname_result"
+
+		if test "$pic_object" != none; then
+		  # Prepend the subdirectory the object is found in.
+		  pic_object="$xdir$pic_object"
+
+		  if test "$prev" = dlfiles; then
+		    if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+		      func_append dlfiles " $pic_object"
+		      prev=
+		      continue
+		    else
+		      # If libtool objects are unsupported, then we need to preload.
+		      prev=dlprefiles
+		    fi
+		  fi
+
+		  # CHECK ME:  I think I busted this.  -Ossama
+		  if test "$prev" = dlprefiles; then
+		    # Preload the old-style object.
+		    func_append dlprefiles " $pic_object"
+		    prev=
+		  fi
+
+		  # A PIC object.
+		  func_append libobjs " $pic_object"
+		  arg="$pic_object"
+		fi
+
+		# Non-PIC object.
+		if test "$non_pic_object" != none; then
+		  # Prepend the subdirectory the object is found in.
+		  non_pic_object="$xdir$non_pic_object"
+
+		  # A standard non-PIC object
+		  func_append non_pic_objects " $non_pic_object"
+		  if test -z "$pic_object" || test "$pic_object" = none ; then
+		    arg="$non_pic_object"
+		  fi
+		else
+		  # If the PIC object exists, use it instead.
+		  # $xdir was prepended to $pic_object above.
+		  non_pic_object="$pic_object"
+		  func_append non_pic_objects " $non_pic_object"
+		fi
+	      else
+		# Only an error if not doing a dry-run.
+		if $opt_dry_run; then
+		  # Extract subdirectory from the argument.
+		  func_dirname "$arg" "/" ""
+		  xdir="$func_dirname_result"
+
+		  func_lo2o "$arg"
+		  pic_object=$xdir$objdir/$func_lo2o_result
+		  non_pic_object=$xdir$func_lo2o_result
+		  func_append libobjs " $pic_object"
+		  func_append non_pic_objects " $non_pic_object"
+	        else
+		  func_fatal_error "\`$arg' is not a valid libtool object"
+		fi
+	      fi
+	    done
+	  else
+	    func_fatal_error "link input file \`$arg' does not exist"
+	  fi
+	  arg=$save_arg
+	  prev=
+	  continue
+	  ;;
+	precious_regex)
+	  precious_files_regex="$arg"
+	  prev=
+	  continue
+	  ;;
+	release)
+	  release="-$arg"
+	  prev=
+	  continue
+	  ;;
+	rpath | xrpath)
+	  # We need an absolute path.
+	  case $arg in
+	  [\\/]* | [A-Za-z]:[\\/]*) ;;
+	  *)
+	    func_fatal_error "only absolute run-paths are allowed"
+	    ;;
+	  esac
+	  if test "$prev" = rpath; then
+	    case "$rpath " in
+	    *" $arg "*) ;;
+	    *) func_append rpath " $arg" ;;
+	    esac
+	  else
+	    case "$xrpath " in
+	    *" $arg "*) ;;
+	    *) func_append xrpath " $arg" ;;
+	    esac
+	  fi
+	  prev=
+	  continue
+	  ;;
+	shrext)
+	  shrext_cmds="$arg"
+	  prev=
+	  continue
+	  ;;
+	weak)
+	  func_append weak_libs " $arg"
+	  prev=
+	  continue
+	  ;;
+	xcclinker)
+	  func_append linker_flags " $qarg"
+	  func_append compiler_flags " $qarg"
+	  prev=
+	  func_append compile_command " $qarg"
+	  func_append finalize_command " $qarg"
+	  continue
+	  ;;
+	xcompiler)
+	  func_append compiler_flags " $qarg"
+	  prev=
+	  func_append compile_command " $qarg"
+	  func_append finalize_command " $qarg"
+	  continue
+	  ;;
+	xlinker)
+	  func_append linker_flags " $qarg"
+	  func_append compiler_flags " $wl$qarg"
+	  prev=
+	  func_append compile_command " $wl$qarg"
+	  func_append finalize_command " $wl$qarg"
+	  continue
+	  ;;
+	*)
+	  eval "$prev=\"\$arg\""
+	  prev=
+	  continue
+	  ;;
+	esac
+      fi # test -n "$prev"
+
+      prevarg="$arg"
+
+      case $arg in
+      -all-static)
+	if test -n "$link_static_flag"; then
+	  # See comment for -static flag below, for more details.
+	  func_append compile_command " $link_static_flag"
+	  func_append finalize_command " $link_static_flag"
+	fi
+	continue
+	;;
+
+      -allow-undefined)
+	# FIXME: remove this flag sometime in the future.
+	func_fatal_error "\`-allow-undefined' must not be used because it is the default"
+	;;
+
+      -avoid-version)
+	avoid_version=yes
+	continue
+	;;
+
+      -bindir)
+	prev=bindir
+	continue
+	;;
+
+      -dlopen)
+	prev=dlfiles
+	continue
+	;;
+
+      -dlpreopen)
+	prev=dlprefiles
+	continue
+	;;
+
+      -export-dynamic)
+	export_dynamic=yes
+	continue
+	;;
+
+      -export-symbols | -export-symbols-regex)
+	if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+	  func_fatal_error "more than one -exported-symbols argument is not allowed"
+	fi
+	if test "X$arg" = "X-export-symbols"; then
+	  prev=expsyms
+	else
+	  prev=expsyms_regex
+	fi
+	continue
+	;;
+
+      -framework)
+	prev=framework
+	continue
+	;;
+
+      -inst-prefix-dir)
+	prev=inst_prefix
+	continue
+	;;
+
+      # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:*
+      # so, if we see these flags be careful not to treat them like -L
+      -L[A-Z][A-Z]*:*)
+	case $with_gcc/$host in
+	no/*-*-irix* | /*-*-irix*)
+	  func_append compile_command " $arg"
+	  func_append finalize_command " $arg"
+	  ;;
+	esac
+	continue
+	;;
+
+      -L*)
+	func_stripname "-L" '' "$arg"
+	if test -z "$func_stripname_result"; then
+	  if test "$#" -gt 0; then
+	    func_fatal_error "require no space between \`-L' and \`$1'"
+	  else
+	    func_fatal_error "need path for \`-L' option"
+	  fi
+	fi
+	func_resolve_sysroot "$func_stripname_result"
+	dir=$func_resolve_sysroot_result
+	# We need an absolute path.
+	case $dir in
+	[\\/]* | [A-Za-z]:[\\/]*) ;;
+	*)
+	  absdir=`cd "$dir" && pwd`
+	  test -z "$absdir" && \
+	    func_fatal_error "cannot determine absolute directory name of \`$dir'"
+	  dir="$absdir"
+	  ;;
+	esac
+	case "$deplibs " in
+	*" -L$dir "* | *" $arg "*)
+	  # Will only happen for absolute or sysroot arguments
+	  ;;
+	*)
+	  # Preserve sysroot, but never include relative directories
+	  case $dir in
+	    [\\/]* | [A-Za-z]:[\\/]* | =*) func_append deplibs " $arg" ;;
+	    *) func_append deplibs " -L$dir" ;;
+	  esac
+	  func_append lib_search_path " $dir"
+	  ;;
+	esac
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
+	  testbindir=`$ECHO "$dir" | $SED 's*/lib$*/bin*'`
+	  case :$dllsearchpath: in
+	  *":$dir:"*) ;;
+	  ::) dllsearchpath=$dir;;
+	  *) func_append dllsearchpath ":$dir";;
+	  esac
+	  case :$dllsearchpath: in
+	  *":$testbindir:"*) ;;
+	  ::) dllsearchpath=$testbindir;;
+	  *) func_append dllsearchpath ":$testbindir";;
+	  esac
+	  ;;
+	esac
+	continue
+	;;
+
+      -l*)
+	if test "X$arg" = "X-lc" || test "X$arg" = "X-lm"; then
+	  case $host in
+	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-beos* | *-cegcc* | *-*-haiku*)
+	    # These systems don't actually have a C or math library (as such)
+	    continue
+	    ;;
+	  *-*-os2*)
+	    # These systems don't actually have a C library (as such)
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+	    # Do not include libc due to us having libc/libc_r.
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  *-*-rhapsody* | *-*-darwin1.[012])
+	    # Rhapsody C and math libraries are in the System framework
+	    func_append deplibs " System.ltframework"
+	    continue
+	    ;;
+	  *-*-sco3.2v5* | *-*-sco5v6*)
+	    # Causes problems with __ctype
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*)
+	    # Compiler inserts libc in the correct place for threads to work
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  esac
+	elif test "X$arg" = "X-lc_r"; then
+	 case $host in
+	 *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+	   # Do not include libc_r directly, use -pthread flag.
+	   continue
+	   ;;
+	 esac
+	fi
+	func_append deplibs " $arg"
+	continue
+	;;
+
+      -module)
+	module=yes
+	continue
+	;;
+
+      # Tru64 UNIX uses -model [arg] to determine the layout of C++
+      # classes, name mangling, and exception handling.
+      # Darwin uses the -arch flag to determine output architecture.
+      -model|-arch|-isysroot|--sysroot)
+	func_append compiler_flags " $arg"
+	func_append compile_command " $arg"
+	func_append finalize_command " $arg"
+	prev=xcompiler
+	continue
+	;;
+
+      -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
+      |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
+	func_append compiler_flags " $arg"
+	func_append compile_command " $arg"
+	func_append finalize_command " $arg"
+	case "$new_inherited_linker_flags " in
+	    *" $arg "*) ;;
+	    * ) func_append new_inherited_linker_flags " $arg" ;;
+	esac
+	continue
+	;;
+
+      -multi_module)
+	single_module="${wl}-multi_module"
+	continue
+	;;
+
+      -no-fast-install)
+	fast_install=no
+	continue
+	;;
+
+      -no-install)
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-darwin* | *-cegcc*)
+	  # The PATH hackery in wrapper scripts is required on Windows
+	  # and Darwin in order for the loader to find any dlls it needs.
+	  func_warning "\`-no-install' is ignored for $host"
+	  func_warning "assuming \`-no-fast-install' instead"
+	  fast_install=no
+	  ;;
+	*) no_install=yes ;;
+	esac
+	continue
+	;;
+
+      -no-undefined)
+	allow_undefined=no
+	continue
+	;;
+
+      -objectlist)
+	prev=objectlist
+	continue
+	;;
+
+      -o) prev=output ;;
+
+      -precious-files-regex)
+	prev=precious_regex
+	continue
+	;;
+
+      -release)
+	prev=release
+	continue
+	;;
+
+      -rpath)
+	prev=rpath
+	continue
+	;;
+
+      -R)
+	prev=xrpath
+	continue
+	;;
+
+      -R*)
+	func_stripname '-R' '' "$arg"
+	dir=$func_stripname_result
+	# We need an absolute path.
+	case $dir in
+	[\\/]* | [A-Za-z]:[\\/]*) ;;
+	=*)
+	  func_stripname '=' '' "$dir"
+	  dir=$lt_sysroot$func_stripname_result
+	  ;;
+	*)
+	  func_fatal_error "only absolute run-paths are allowed"
+	  ;;
+	esac
+	case "$xrpath " in
+	*" $dir "*) ;;
+	*) func_append xrpath " $dir" ;;
+	esac
+	continue
+	;;
+
+      -shared)
+	# The effects of -shared are defined in a previous loop.
+	continue
+	;;
+
+      -shrext)
+	prev=shrext
+	continue
+	;;
+
+      -static | -static-libtool-libs)
+	# The effects of -static are defined in a previous loop.
+	# We used to do the same as -all-static on platforms that
+	# didn't have a PIC flag, but the assumption that the effects
+	# would be equivalent was wrong.  It would break on at least
+	# Digital Unix and AIX.
+	continue
+	;;
+
+      -thread-safe)
+	thread_safe=yes
+	continue
+	;;
+
+      -version-info)
+	prev=vinfo
+	continue
+	;;
+
+      -version-number)
+	prev=vinfo
+	vinfo_number=yes
+	continue
+	;;
+
+      -weak)
+        prev=weak
+	continue
+	;;
+
+      -Wc,*)
+	func_stripname '-Wc,' '' "$arg"
+	args=$func_stripname_result
+	arg=
+	save_ifs="$IFS"; IFS=','
+	for flag in $args; do
+	  IFS="$save_ifs"
+          func_quote_for_eval "$flag"
+	  func_append arg " $func_quote_for_eval_result"
+	  func_append compiler_flags " $func_quote_for_eval_result"
+	done
+	IFS="$save_ifs"
+	func_stripname ' ' '' "$arg"
+	arg=$func_stripname_result
+	;;
+
+      -Wl,*)
+	func_stripname '-Wl,' '' "$arg"
+	args=$func_stripname_result
+	arg=
+	save_ifs="$IFS"; IFS=','
+	for flag in $args; do
+	  IFS="$save_ifs"
+          func_quote_for_eval "$flag"
+	  func_append arg " $wl$func_quote_for_eval_result"
+	  func_append compiler_flags " $wl$func_quote_for_eval_result"
+	  func_append linker_flags " $func_quote_for_eval_result"
+	done
+	IFS="$save_ifs"
+	func_stripname ' ' '' "$arg"
+	arg=$func_stripname_result
+	;;
+
+      -Xcompiler)
+	prev=xcompiler
+	continue
+	;;
+
+      -Xlinker)
+	prev=xlinker
+	continue
+	;;
+
+      -XCClinker)
+	prev=xcclinker
+	continue
+	;;
+
+      # -msg_* for osf cc
+      -msg_*)
+	func_quote_for_eval "$arg"
+	arg="$func_quote_for_eval_result"
+	;;
+
+      # Flags to be passed through unchanged, with rationale:
+      # -64, -mips[0-9]      enable 64-bit mode for the SGI compiler
+      # -r[0-9][0-9]*        specify processor for the SGI compiler
+      # -xarch=*, -xtarget=* enable 64-bit mode for the Sun compiler
+      # +DA*, +DD*           enable 64-bit mode for the HP compiler
+      # -q*                  compiler args for the IBM compiler
+      # -m*, -t[45]*, -txscale* architecture-specific flags for GCC
+      # -F/path              path to uninstalled frameworks, gcc on darwin
+      # -p, -pg, --coverage, -fprofile-*  profiling flags for GCC
+      # @file                GCC response files
+      # -tp=*                Portland pgcc target processor selection
+      # --sysroot=*          for sysroot support
+      # -O*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization
+      -64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \
+      -t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \
+      -O*|-flto*|-fwhopr*|-fuse-linker-plugin)
+        func_quote_for_eval "$arg"
+	arg="$func_quote_for_eval_result"
+        func_append compile_command " $arg"
+        func_append finalize_command " $arg"
+        func_append compiler_flags " $arg"
+        continue
+        ;;
+
+      # Some other compiler flag.
+      -* | +*)
+        func_quote_for_eval "$arg"
+	arg="$func_quote_for_eval_result"
+	;;
+
+      *.$objext)
+	# A standard object.
+	func_append objs " $arg"
+	;;
+
+      *.lo)
+	# A libtool-controlled object.
+
+	# Check to see that this really is a libtool object.
+	if func_lalib_unsafe_p "$arg"; then
+	  pic_object=
+	  non_pic_object=
+
+	  # Read the .lo file
+	  func_source "$arg"
+
+	  if test -z "$pic_object" ||
+	     test -z "$non_pic_object" ||
+	     test "$pic_object" = none &&
+	     test "$non_pic_object" = none; then
+	    func_fatal_error "cannot find name of object for \`$arg'"
+	  fi
+
+	  # Extract subdirectory from the argument.
+	  func_dirname "$arg" "/" ""
+	  xdir="$func_dirname_result"
+
+	  if test "$pic_object" != none; then
+	    # Prepend the subdirectory the object is found in.
+	    pic_object="$xdir$pic_object"
+
+	    if test "$prev" = dlfiles; then
+	      if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+		func_append dlfiles " $pic_object"
+		prev=
+		continue
+	      else
+		# If libtool objects are unsupported, then we need to preload.
+		prev=dlprefiles
+	      fi
+	    fi
+
+	    # CHECK ME:  I think I busted this.  -Ossama
+	    if test "$prev" = dlprefiles; then
+	      # Preload the old-style object.
+	      func_append dlprefiles " $pic_object"
+	      prev=
+	    fi
+
+	    # A PIC object.
+	    func_append libobjs " $pic_object"
+	    arg="$pic_object"
+	  fi
+
+	  # Non-PIC object.
+	  if test "$non_pic_object" != none; then
+	    # Prepend the subdirectory the object is found in.
+	    non_pic_object="$xdir$non_pic_object"
+
+	    # A standard non-PIC object
+	    func_append non_pic_objects " $non_pic_object"
+	    if test -z "$pic_object" || test "$pic_object" = none ; then
+	      arg="$non_pic_object"
+	    fi
+	  else
+	    # If the PIC object exists, use it instead.
+	    # $xdir was prepended to $pic_object above.
+	    non_pic_object="$pic_object"
+	    func_append non_pic_objects " $non_pic_object"
+	  fi
+	else
+	  # Only an error if not doing a dry-run.
+	  if $opt_dry_run; then
+	    # Extract subdirectory from the argument.
+	    func_dirname "$arg" "/" ""
+	    xdir="$func_dirname_result"
+
+	    func_lo2o "$arg"
+	    pic_object=$xdir$objdir/$func_lo2o_result
+	    non_pic_object=$xdir$func_lo2o_result
+	    func_append libobjs " $pic_object"
+	    func_append non_pic_objects " $non_pic_object"
+	  else
+	    func_fatal_error "\`$arg' is not a valid libtool object"
+	  fi
+	fi
+	;;
+
+      *.$libext)
+	# An archive.
+	func_append deplibs " $arg"
+	func_append old_deplibs " $arg"
+	continue
+	;;
+
+      *.la)
+	# A libtool-controlled library.
+
+	func_resolve_sysroot "$arg"
+	if test "$prev" = dlfiles; then
+	  # This library was specified with -dlopen.
+	  func_append dlfiles " $func_resolve_sysroot_result"
+	  prev=
+	elif test "$prev" = dlprefiles; then
+	  # The library was specified with -dlpreopen.
+	  func_append dlprefiles " $func_resolve_sysroot_result"
+	  prev=
+	else
+	  func_append deplibs " $func_resolve_sysroot_result"
+	fi
+	continue
+	;;
+
+      # Some other compiler argument.
+      *)
+	# Unknown arguments in both finalize_command and compile_command need
+	# to be aesthetically quoted because they are evaled later.
+	func_quote_for_eval "$arg"
+	arg="$func_quote_for_eval_result"
+	;;
+      esac # arg
+
+      # Now actually substitute the argument into the commands.
+      if test -n "$arg"; then
+	func_append compile_command " $arg"
+	func_append finalize_command " $arg"
+      fi
+    done # argument parsing loop
+
+    test -n "$prev" && \
+      func_fatal_help "the \`$prevarg' option requires an argument"
+
+    if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+      eval arg=\"$export_dynamic_flag_spec\"
+      func_append compile_command " $arg"
+      func_append finalize_command " $arg"
+    fi
+
+    oldlibs=
+    # calculate the name of the file, without its directory
+    func_basename "$output"
+    outputname="$func_basename_result"
+    libobjs_save="$libobjs"
+
+    if test -n "$shlibpath_var"; then
+      # get the directories listed in $shlibpath_var
+      eval shlib_search_path=\`\$ECHO \"\${$shlibpath_var}\" \| \$SED \'s/:/ /g\'\`
+    else
+      shlib_search_path=
+    fi
+    eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+    eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+
+    func_dirname "$output" "/" ""
+    output_objdir="$func_dirname_result$objdir"
+    func_to_tool_file "$output_objdir/"
+    tool_output_objdir=$func_to_tool_file_result
+    # Create the object directory.
+    func_mkdir_p "$output_objdir"
+
+    # Determine the type of output
+    case $output in
+    "")
+      func_fatal_help "you must specify an output file"
+      ;;
+    *.$libext) linkmode=oldlib ;;
+    *.lo | *.$objext) linkmode=obj ;;
+    *.la) linkmode=lib ;;
+    *) linkmode=prog ;; # Anything else should be a program.
+    esac
+
+    specialdeplibs=
+
+    libs=
+    # Find all interdependent deplibs by searching for libraries
+    # that are linked more than once (e.g. -la -lb -la)
+    for deplib in $deplibs; do
+      if $opt_preserve_dup_deps ; then
+	case "$libs " in
+	*" $deplib "*) func_append specialdeplibs " $deplib" ;;
+	esac
+      fi
+      func_append libs " $deplib"
+    done
+
+    if test "$linkmode" = lib; then
+      libs="$predeps $libs $compiler_lib_search_path $postdeps"
+
+      # Compute libraries that are listed more than once in $predeps
+      # $postdeps and mark them as special (i.e., whose duplicates are
+      # not to be eliminated).
+      pre_post_deps=
+      if $opt_duplicate_compiler_generated_deps; then
+	for pre_post_dep in $predeps $postdeps; do
+	  case "$pre_post_deps " in
+	  *" $pre_post_dep "*) func_append specialdeplibs " $pre_post_deps" ;;
+	  esac
+	  func_append pre_post_deps " $pre_post_dep"
+	done
+      fi
+      pre_post_deps=
+    fi
+
+    deplibs=
+    newdependency_libs=
+    newlib_search_path=
+    need_relink=no # whether we're linking any uninstalled libtool libraries
+    notinst_deplibs= # not-installed libtool libraries
+    notinst_path= # paths that contain not-installed libtool libraries
+
+    case $linkmode in
+    lib)
+	passes="conv dlpreopen link"
+	for file in $dlfiles $dlprefiles; do
+	  case $file in
+	  *.la) ;;
+	  *)
+	    func_fatal_help "libraries can \`-dlopen' only libtool libraries: $file"
+	    ;;
+	  esac
+	done
+	;;
+    prog)
+	compile_deplibs=
+	finalize_deplibs=
+	alldeplibs=no
+	newdlfiles=
+	newdlprefiles=
+	passes="conv scan dlopen dlpreopen link"
+	;;
+    *)  passes="conv"
+	;;
+    esac
+
+    for pass in $passes; do
+      # The preopen pass in lib mode reverses $deplibs; put it back here
+      # so that -L comes before libs that need it for instance...
+      if test "$linkmode,$pass" = "lib,link"; then
+	## FIXME: Find the place where the list is rebuilt in the wrong
+	##        order, and fix it there properly
+        tmp_deplibs=
+	for deplib in $deplibs; do
+	  tmp_deplibs="$deplib $tmp_deplibs"
+	done
+	deplibs="$tmp_deplibs"
+      fi
+
+      if test "$linkmode,$pass" = "lib,link" ||
+	 test "$linkmode,$pass" = "prog,scan"; then
+	libs="$deplibs"
+	deplibs=
+      fi
+      if test "$linkmode" = prog; then
+	case $pass in
+	dlopen) libs="$dlfiles" ;;
+	dlpreopen) libs="$dlprefiles" ;;
+	link)
+	  libs="$deplibs %DEPLIBS%"
+	  test "X$link_all_deplibs" != Xno && libs="$libs $dependency_libs"
+	  ;;
+	esac
+      fi
+      if test "$linkmode,$pass" = "lib,dlpreopen"; then
+	# Collect and forward deplibs of preopened libtool libs
+	for lib in $dlprefiles; do
+	  # Ignore non-libtool-libs
+	  dependency_libs=
+	  func_resolve_sysroot "$lib"
+	  case $lib in
+	  *.la)	func_source "$func_resolve_sysroot_result" ;;
+	  esac
+
+	  # Collect preopened libtool deplibs, except any this library
+	  # has declared as weak libs
+	  for deplib in $dependency_libs; do
+	    func_basename "$deplib"
+            deplib_base=$func_basename_result
+	    case " $weak_libs " in
+	    *" $deplib_base "*) ;;
+	    *) func_append deplibs " $deplib" ;;
+	    esac
+	  done
+	done
+	libs="$dlprefiles"
+      fi
+      if test "$pass" = dlopen; then
+	# Collect dlpreopened libraries
+	save_deplibs="$deplibs"
+	deplibs=
+      fi
+
+      for deplib in $libs; do
+	lib=
+	found=no
+	case $deplib in
+	-mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
+        |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
+	  if test "$linkmode,$pass" = "prog,link"; then
+	    compile_deplibs="$deplib $compile_deplibs"
+	    finalize_deplibs="$deplib $finalize_deplibs"
+	  else
+	    func_append compiler_flags " $deplib"
+	    if test "$linkmode" = lib ; then
+		case "$new_inherited_linker_flags " in
+		    *" $deplib "*) ;;
+		    * ) func_append new_inherited_linker_flags " $deplib" ;;
+		esac
+	    fi
+	  fi
+	  continue
+	  ;;
+	-l*)
+	  if test "$linkmode" != lib && test "$linkmode" != prog; then
+	    func_warning "\`-l' is ignored for archives/objects"
+	    continue
+	  fi
+	  func_stripname '-l' '' "$deplib"
+	  name=$func_stripname_result
+	  if test "$linkmode" = lib; then
+	    searchdirs="$newlib_search_path $lib_search_path $compiler_lib_search_dirs $sys_lib_search_path $shlib_search_path"
+	  else
+	    searchdirs="$newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path"
+	  fi
+	  for searchdir in $searchdirs; do
+	    for search_ext in .la $std_shrext .so .a; do
+	      # Search the libtool library
+	      lib="$searchdir/lib${name}${search_ext}"
+	      if test -f "$lib"; then
+		if test "$search_ext" = ".la"; then
+		  found=yes
+		else
+		  found=no
+		fi
+		break 2
+	      fi
+	    done
+	  done
+	  if test "$found" != yes; then
+	    # deplib doesn't seem to be a libtool library
+	    if test "$linkmode,$pass" = "prog,link"; then
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    else
+	      deplibs="$deplib $deplibs"
+	      test "$linkmode" = lib && newdependency_libs="$deplib $newdependency_libs"
+	    fi
+	    continue
+	  else # deplib is a libtool library
+	    # If $allow_libtool_libs_with_static_runtimes && $deplib is a stdlib,
+	    # We need to do some special things here, and not later.
+	    if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+	      case " $predeps $postdeps " in
+	      *" $deplib "*)
+		if func_lalib_p "$lib"; then
+		  library_names=
+		  old_library=
+		  func_source "$lib"
+		  for l in $old_library $library_names; do
+		    ll="$l"
+		  done
+		  if test "X$ll" = "X$old_library" ; then # only static version available
+		    found=no
+		    func_dirname "$lib" "" "."
+		    ladir="$func_dirname_result"
+		    lib=$ladir/$old_library
+		    if test "$linkmode,$pass" = "prog,link"; then
+		      compile_deplibs="$deplib $compile_deplibs"
+		      finalize_deplibs="$deplib $finalize_deplibs"
+		    else
+		      deplibs="$deplib $deplibs"
+		      test "$linkmode" = lib && newdependency_libs="$deplib $newdependency_libs"
+		    fi
+		    continue
+		  fi
+		fi
+		;;
+	      *) ;;
+	      esac
+	    fi
+	  fi
+	  ;; # -l
+	*.ltframework)
+	  if test "$linkmode,$pass" = "prog,link"; then
+	    compile_deplibs="$deplib $compile_deplibs"
+	    finalize_deplibs="$deplib $finalize_deplibs"
+	  else
+	    deplibs="$deplib $deplibs"
+	    if test "$linkmode" = lib ; then
+		case "$new_inherited_linker_flags " in
+		    *" $deplib "*) ;;
+		    * ) func_append new_inherited_linker_flags " $deplib" ;;
+		esac
+	    fi
+	  fi
+	  continue
+	  ;;
+	-L*)
+	  case $linkmode in
+	  lib)
+	    deplibs="$deplib $deplibs"
+	    test "$pass" = conv && continue
+	    newdependency_libs="$deplib $newdependency_libs"
+	    func_stripname '-L' '' "$deplib"
+	    func_resolve_sysroot "$func_stripname_result"
+	    func_append newlib_search_path " $func_resolve_sysroot_result"
+	    ;;
+	  prog)
+	    if test "$pass" = conv; then
+	      deplibs="$deplib $deplibs"
+	      continue
+	    fi
+	    if test "$pass" = scan; then
+	      deplibs="$deplib $deplibs"
+	    else
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    fi
+	    func_stripname '-L' '' "$deplib"
+	    func_resolve_sysroot "$func_stripname_result"
+	    func_append newlib_search_path " $func_resolve_sysroot_result"
+	    ;;
+	  *)
+	    func_warning "\`-L' is ignored for archives/objects"
+	    ;;
+	  esac # linkmode
+	  continue
+	  ;; # -L
+	-R*)
+	  if test "$pass" = link; then
+	    func_stripname '-R' '' "$deplib"
+	    func_resolve_sysroot "$func_stripname_result"
+	    dir=$func_resolve_sysroot_result
+	    # Make sure the xrpath contains only unique directories.
+	    case "$xrpath " in
+	    *" $dir "*) ;;
+	    *) func_append xrpath " $dir" ;;
+	    esac
+	  fi
+	  deplibs="$deplib $deplibs"
+	  continue
+	  ;;
+	*.la)
+	  func_resolve_sysroot "$deplib"
+	  lib=$func_resolve_sysroot_result
+	  ;;
+	*.$libext)
+	  if test "$pass" = conv; then
+	    deplibs="$deplib $deplibs"
+	    continue
+	  fi
+	  case $linkmode in
+	  lib)
+	    # Linking convenience modules into shared libraries is allowed,
+	    # but linking other static libraries is non-portable.
+	    case " $dlpreconveniencelibs " in
+	    *" $deplib "*) ;;
+	    *)
+	      valid_a_lib=no
+	      case $deplibs_check_method in
+		match_pattern*)
+		  set dummy $deplibs_check_method; shift
+		  match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
+		  if eval "\$ECHO \"$deplib\"" 2>/dev/null | $SED 10q \
+		    | $EGREP "$match_pattern_regex" > /dev/null; then
+		    valid_a_lib=yes
+		  fi
+		;;
+		pass_all)
+		  valid_a_lib=yes
+		;;
+	      esac
+	      if test "$valid_a_lib" != yes; then
+		echo
+		$ECHO "*** Warning: Trying to link with static lib archive $deplib."
+		echo "*** I have the capability to make that library automatically link in when"
+		echo "*** you link to this library.  But I can only do this if you have a"
+		echo "*** shared version of the library, which you do not appear to have"
+		echo "*** because the file extensions .$libext of this argument makes me believe"
+		echo "*** that it is just a static archive that I should not use here."
+	      else
+		echo
+		$ECHO "*** Warning: Linking the shared library $output against the"
+		$ECHO "*** static library $deplib is not portable!"
+		deplibs="$deplib $deplibs"
+	      fi
+	      ;;
+	    esac
+	    continue
+	    ;;
+	  prog)
+	    if test "$pass" != link; then
+	      deplibs="$deplib $deplibs"
+	    else
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    fi
+	    continue
+	    ;;
+	  esac # linkmode
+	  ;; # *.$libext
+	*.lo | *.$objext)
+	  if test "$pass" = conv; then
+	    deplibs="$deplib $deplibs"
+	  elif test "$linkmode" = prog; then
+	    if test "$pass" = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+	      # If there is no dlopen support or we're linking statically,
+	      # we need to preload.
+	      func_append newdlprefiles " $deplib"
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    else
+	      func_append newdlfiles " $deplib"
+	    fi
+	  fi
+	  continue
+	  ;;
+	%DEPLIBS%)
+	  alldeplibs=yes
+	  continue
+	  ;;
+	esac # case $deplib
+
+	if test "$found" = yes || test -f "$lib"; then :
+	else
+	  func_fatal_error "cannot find the library \`$lib' or unhandled argument \`$deplib'"
+	fi
+
+	# Check to see that this really is a libtool archive.
+	func_lalib_unsafe_p "$lib" \
+	  || func_fatal_error "\`$lib' is not a valid libtool archive"
+
+	func_dirname "$lib" "" "."
+	ladir="$func_dirname_result"
+
+	dlname=
+	dlopen=
+	dlpreopen=
+	libdir=
+	library_names=
+	old_library=
+	inherited_linker_flags=
+	# If the library was installed with an old release of libtool,
+	# it will not redefine variables installed, or shouldnotlink
+	installed=yes
+	shouldnotlink=no
+	avoidtemprpath=
+
+
+	# Read the .la file
+	func_source "$lib"
+
+	# Convert "-framework foo" to "foo.ltframework"
+	if test -n "$inherited_linker_flags"; then
+	  tmp_inherited_linker_flags=`$ECHO "$inherited_linker_flags" | $SED 's/-framework \([^ $]*\)/\1.ltframework/g'`
+	  for tmp_inherited_linker_flag in $tmp_inherited_linker_flags; do
+	    case " $new_inherited_linker_flags " in
+	      *" $tmp_inherited_linker_flag "*) ;;
+	      *) func_append new_inherited_linker_flags " $tmp_inherited_linker_flag";;
+	    esac
+	  done
+	fi
+	dependency_libs=`$ECHO " $dependency_libs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	if test "$linkmode,$pass" = "lib,link" ||
+	   test "$linkmode,$pass" = "prog,scan" ||
+	   { test "$linkmode" != prog && test "$linkmode" != lib; }; then
+	  test -n "$dlopen" && func_append dlfiles " $dlopen"
+	  test -n "$dlpreopen" && func_append dlprefiles " $dlpreopen"
+	fi
+
+	if test "$pass" = conv; then
+	  # Only check for convenience libraries
+	  deplibs="$lib $deplibs"
+	  if test -z "$libdir"; then
+	    if test -z "$old_library"; then
+	      func_fatal_error "cannot find name of link library for \`$lib'"
+	    fi
+	    # It is a libtool convenience library, so add in its objects.
+	    func_append convenience " $ladir/$objdir/$old_library"
+	    func_append old_convenience " $ladir/$objdir/$old_library"
+	    tmp_libs=
+	    for deplib in $dependency_libs; do
+	      deplibs="$deplib $deplibs"
+	      if $opt_preserve_dup_deps ; then
+		case "$tmp_libs " in
+		*" $deplib "*) func_append specialdeplibs " $deplib" ;;
+		esac
+	      fi
+	      func_append tmp_libs " $deplib"
+	    done
+	  elif test "$linkmode" != prog && test "$linkmode" != lib; then
+	    func_fatal_error "\`$lib' is not a convenience library"
+	  fi
+	  continue
+	fi # $pass = conv
+
+
+	# Get the name of the library we link against.
+	linklib=
+	if test -n "$old_library" &&
+	   { test "$prefer_static_libs" = yes ||
+	     test "$prefer_static_libs,$installed" = "built,no"; }; then
+	  linklib=$old_library
+	else
+	  for l in $old_library $library_names; do
+	    linklib="$l"
+	  done
+	fi
+	if test -z "$linklib"; then
+	  func_fatal_error "cannot find name of link library for \`$lib'"
+	fi
+
+	# This library was specified with -dlopen.
+	if test "$pass" = dlopen; then
+	  if test -z "$libdir"; then
+	    func_fatal_error "cannot -dlopen a convenience library: \`$lib'"
+	  fi
+	  if test -z "$dlname" ||
+	     test "$dlopen_support" != yes ||
+	     test "$build_libtool_libs" = no; then
+	    # If there is no dlname, no dlopen support or we're linking
+	    # statically, we need to preload.  We also need to preload any
+	    # dependent libraries so libltdl's deplib preloader doesn't
+	    # bomb out in the load deplibs phase.
+	    func_append dlprefiles " $lib $dependency_libs"
+	  else
+	    func_append newdlfiles " $lib"
+	  fi
+	  continue
+	fi # $pass = dlopen
+
+	# We need an absolute path.
+	case $ladir in
+	[\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+	*)
+	  abs_ladir=`cd "$ladir" && pwd`
+	  if test -z "$abs_ladir"; then
+	    func_warning "cannot determine absolute directory name of \`$ladir'"
+	    func_warning "passing it literally to the linker, although it might fail"
+	    abs_ladir="$ladir"
+	  fi
+	  ;;
+	esac
+	func_basename "$lib"
+	laname="$func_basename_result"
+
+	# Find the relevant object directory and library name.
+	if test "X$installed" = Xyes; then
+	  if test ! -f "$lt_sysroot$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+	    func_warning "library \`$lib' was moved."
+	    dir="$ladir"
+	    absdir="$abs_ladir"
+	    libdir="$abs_ladir"
+	  else
+	    dir="$lt_sysroot$libdir"
+	    absdir="$lt_sysroot$libdir"
+	  fi
+	  test "X$hardcode_automatic" = Xyes && avoidtemprpath=yes
+	else
+	  if test ! -f "$ladir/$objdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+	    dir="$ladir"
+	    absdir="$abs_ladir"
+	    # Remove this search path later
+	    func_append notinst_path " $abs_ladir"
+	  else
+	    dir="$ladir/$objdir"
+	    absdir="$abs_ladir/$objdir"
+	    # Remove this search path later
+	    func_append notinst_path " $abs_ladir"
+	  fi
+	fi # $installed = yes
+	func_stripname 'lib' '.la' "$laname"
+	name=$func_stripname_result
+
+	# This library was specified with -dlpreopen.
+	if test "$pass" = dlpreopen; then
+	  if test -z "$libdir" && test "$linkmode" = prog; then
+	    func_fatal_error "only libraries may -dlpreopen a convenience library: \`$lib'"
+	  fi
+	  case "$host" in
+	    # special handling for platforms with PE-DLLs.
+	    *cygwin* | *mingw* | *cegcc* )
+	      # Linker will automatically link against shared library if both
+	      # static and shared are present.  Therefore, ensure we extract
+	      # symbols from the import library if a shared library is present
+	      # (otherwise, the dlopen module name will be incorrect).  We do
+	      # this by putting the import library name into $newdlprefiles.
+	      # We recover the dlopen module name by 'saving' the la file
+	      # name in a special purpose variable, and (later) extracting the
+	      # dlname from the la file.
+	      if test -n "$dlname"; then
+	        func_tr_sh "$dir/$linklib"
+	        eval "libfile_$func_tr_sh_result=\$abs_ladir/\$laname"
+	        func_append newdlprefiles " $dir/$linklib"
+	      else
+	        func_append newdlprefiles " $dir/$old_library"
+	        # Keep a list of preopened convenience libraries to check
+	        # that they are being used correctly in the link pass.
+	        test -z "$libdir" && \
+	          func_append dlpreconveniencelibs " $dir/$old_library"
+	      fi
+	    ;;
+	    * )
+	      # Prefer using a static library (so that no silly _DYNAMIC symbols
+	      # are required to link).
+	      if test -n "$old_library"; then
+	        func_append newdlprefiles " $dir/$old_library"
+	        # Keep a list of preopened convenience libraries to check
+	        # that they are being used correctly in the link pass.
+	        test -z "$libdir" && \
+	          func_append dlpreconveniencelibs " $dir/$old_library"
+	      # Otherwise, use the dlname, so that lt_dlopen finds it.
+	      elif test -n "$dlname"; then
+	        func_append newdlprefiles " $dir/$dlname"
+	      else
+	        func_append newdlprefiles " $dir/$linklib"
+	      fi
+	    ;;
+	  esac
+	fi # $pass = dlpreopen
+
+	if test -z "$libdir"; then
+	  # Link the convenience library
+	  if test "$linkmode" = lib; then
+	    deplibs="$dir/$old_library $deplibs"
+	  elif test "$linkmode,$pass" = "prog,link"; then
+	    compile_deplibs="$dir/$old_library $compile_deplibs"
+	    finalize_deplibs="$dir/$old_library $finalize_deplibs"
+	  else
+	    deplibs="$lib $deplibs" # used for prog,scan pass
+	  fi
+	  continue
+	fi
+
+
+	if test "$linkmode" = prog && test "$pass" != link; then
+	  func_append newlib_search_path " $ladir"
+	  deplibs="$lib $deplibs"
+
+	  linkalldeplibs=no
+	  if test "$link_all_deplibs" != no || test -z "$library_names" ||
+	     test "$build_libtool_libs" = no; then
+	    linkalldeplibs=yes
+	  fi
+
+	  tmp_libs=
+	  for deplib in $dependency_libs; do
+	    case $deplib in
+	    -L*) func_stripname '-L' '' "$deplib"
+	         func_resolve_sysroot "$func_stripname_result"
+	         func_append newlib_search_path " $func_resolve_sysroot_result"
+		 ;;
+	    esac
+	    # Need to link against all dependency_libs?
+	    if test "$linkalldeplibs" = yes; then
+	      deplibs="$deplib $deplibs"
+	    else
+	      # Need to hardcode shared library paths
+	      # or/and link against static libraries
+	      newdependency_libs="$deplib $newdependency_libs"
+	    fi
+	    if $opt_preserve_dup_deps ; then
+	      case "$tmp_libs " in
+	      *" $deplib "*) func_append specialdeplibs " $deplib" ;;
+	      esac
+	    fi
+	    func_append tmp_libs " $deplib"
+	  done # for deplib
+	  continue
+	fi # $linkmode = prog...
+
+	if test "$linkmode,$pass" = "prog,link"; then
+	  if test -n "$library_names" &&
+	     { { test "$prefer_static_libs" = no ||
+	         test "$prefer_static_libs,$installed" = "built,yes"; } ||
+	       test -z "$old_library"; }; then
+	    # We need to hardcode the library path
+	    if test -n "$shlibpath_var" && test -z "$avoidtemprpath" ; then
+	      # Make sure the rpath contains only unique directories.
+	      case "$temp_rpath:" in
+	      *"$absdir:"*) ;;
+	      *) func_append temp_rpath "$absdir:" ;;
+	      esac
+	    fi
+
+	    # Hardcode the library path.
+	    # Skip directories that are in the system default run-time
+	    # search path.
+	    case " $sys_lib_dlsearch_path " in
+	    *" $absdir "*) ;;
+	    *)
+	      case "$compile_rpath " in
+	      *" $absdir "*) ;;
+	      *) func_append compile_rpath " $absdir" ;;
+	      esac
+	      ;;
+	    esac
+	    case " $sys_lib_dlsearch_path " in
+	    *" $libdir "*) ;;
+	    *)
+	      case "$finalize_rpath " in
+	      *" $libdir "*) ;;
+	      *) func_append finalize_rpath " $libdir" ;;
+	      esac
+	      ;;
+	    esac
+	  fi # $linkmode,$pass = prog,link...
+
+	  if test "$alldeplibs" = yes &&
+	     { test "$deplibs_check_method" = pass_all ||
+	       { test "$build_libtool_libs" = yes &&
+		 test -n "$library_names"; }; }; then
+	    # We only need to search for static libraries
+	    continue
+	  fi
+	fi
+
+	link_static=no # Whether the deplib will be linked statically
+	use_static_libs=$prefer_static_libs
+	if test "$use_static_libs" = built && test "$installed" = yes; then
+	  use_static_libs=no
+	fi
+	if test -n "$library_names" &&
+	   { test "$use_static_libs" = no || test -z "$old_library"; }; then
+	  case $host in
+	  *cygwin* | *mingw* | *cegcc*)
+	      # No point in relinking DLLs because paths are not encoded
+	      func_append notinst_deplibs " $lib"
+	      need_relink=no
+	    ;;
+	  *)
+	    if test "$installed" = no; then
+	      func_append notinst_deplibs " $lib"
+	      need_relink=yes
+	    fi
+	    ;;
+	  esac
+	  # This is a shared library
+
+	  # Warn about portability, can't link against -module's on some
+	  # systems (darwin).  Don't bleat about dlopened modules though!
+	  dlopenmodule=""
+	  for dlpremoduletest in $dlprefiles; do
+	    if test "X$dlpremoduletest" = "X$lib"; then
+	      dlopenmodule="$dlpremoduletest"
+	      break
+	    fi
+	  done
+	  if test -z "$dlopenmodule" && test "$shouldnotlink" = yes && test "$pass" = link; then
+	    echo
+	    if test "$linkmode" = prog; then
+	      $ECHO "*** Warning: Linking the executable $output against the loadable module"
+	    else
+	      $ECHO "*** Warning: Linking the shared library $output against the loadable module"
+	    fi
+	    $ECHO "*** $linklib is not portable!"
+	  fi
+	  if test "$linkmode" = lib &&
+	     test "$hardcode_into_libs" = yes; then
+	    # Hardcode the library path.
+	    # Skip directories that are in the system default run-time
+	    # search path.
+	    case " $sys_lib_dlsearch_path " in
+	    *" $absdir "*) ;;
+	    *)
+	      case "$compile_rpath " in
+	      *" $absdir "*) ;;
+	      *) func_append compile_rpath " $absdir" ;;
+	      esac
+	      ;;
+	    esac
+	    case " $sys_lib_dlsearch_path " in
+	    *" $libdir "*) ;;
+	    *)
+	      case "$finalize_rpath " in
+	      *" $libdir "*) ;;
+	      *) func_append finalize_rpath " $libdir" ;;
+	      esac
+	      ;;
+	    esac
+	  fi
+
+	  if test -n "$old_archive_from_expsyms_cmds"; then
+	    # figure out the soname
+	    set dummy $library_names
+	    shift
+	    realname="$1"
+	    shift
+	    libname=`eval "\\$ECHO \"$libname_spec\""`
+	    # use dlname if we got it. it's perfectly good, no?
+	    if test -n "$dlname"; then
+	      soname="$dlname"
+	    elif test -n "$soname_spec"; then
+	      # bleh windows
+	      case $host in
+	      *cygwin* | mingw* | *cegcc*)
+	        func_arith $current - $age
+		major=$func_arith_result
+		versuffix="-$major"
+		;;
+	      esac
+	      eval soname=\"$soname_spec\"
+	    else
+	      soname="$realname"
+	    fi
+
+	    # Make a new name for the extract_expsyms_cmds to use
+	    soroot="$soname"
+	    func_basename "$soroot"
+	    soname="$func_basename_result"
+	    func_stripname 'lib' '.dll' "$soname"
+	    newlib=libimp-$func_stripname_result.a
+
+	    # If the library has no export list, then create one now
+	    if test -f "$output_objdir/$soname-def"; then :
+	    else
+	      func_verbose "extracting exported symbol list from \`$soname'"
+	      func_execute_cmds "$extract_expsyms_cmds" 'exit $?'
+	    fi
+
+	    # Create $newlib
+	    if test -f "$output_objdir/$newlib"; then :; else
+	      func_verbose "generating import library for \`$soname'"
+	      func_execute_cmds "$old_archive_from_expsyms_cmds" 'exit $?'
+	    fi
+	    # make sure the library variables are pointing to the new library
+	    dir=$output_objdir
+	    linklib=$newlib
+	  fi # test -n "$old_archive_from_expsyms_cmds"
+
+	  if test "$linkmode" = prog || test "$opt_mode" != relink; then
+	    add_shlibpath=
+	    add_dir=
+	    add=
+	    lib_linked=yes
+	    case $hardcode_action in
+	    immediate | unsupported)
+	      if test "$hardcode_direct" = no; then
+		add="$dir/$linklib"
+		case $host in
+		  *-*-sco3.2v5.0.[024]*) add_dir="-L$dir" ;;
+		  *-*-sysv4*uw2*) add_dir="-L$dir" ;;
+		  *-*-sysv5OpenUNIX* | *-*-sysv5UnixWare7.[01].[10]* | \
+		    *-*-unixware7*) add_dir="-L$dir" ;;
+		  *-*-darwin* )
+		    # if the lib is a (non-dlopened) module then we can not
+		    # link against it, someone is ignoring the earlier warnings
+		    if /usr/bin/file -L $add 2> /dev/null |
+			 $GREP ": [^:]* bundle" >/dev/null ; then
+		      if test "X$dlopenmodule" != "X$lib"; then
+			$ECHO "*** Warning: lib $linklib is a module, not a shared library"
+			if test -z "$old_library" ; then
+			  echo
+			  echo "*** And there doesn't seem to be a static archive available"
+			  echo "*** The link will probably fail, sorry"
+			else
+			  add="$dir/$old_library"
+			fi
+		      elif test -n "$old_library"; then
+			add="$dir/$old_library"
+		      fi
+		    fi
+		esac
+	      elif test "$hardcode_minus_L" = no; then
+		case $host in
+		*-*-sunos*) add_shlibpath="$dir" ;;
+		esac
+		add_dir="-L$dir"
+		add="-l$name"
+	      elif test "$hardcode_shlibpath_var" = no; then
+		add_shlibpath="$dir"
+		add="-l$name"
+	      else
+		lib_linked=no
+	      fi
+	      ;;
+	    relink)
+	      if test "$hardcode_direct" = yes &&
+	         test "$hardcode_direct_absolute" = no; then
+		add="$dir/$linklib"
+	      elif test "$hardcode_minus_L" = yes; then
+		add_dir="-L$absdir"
+		# Try looking first in the location we're being installed to.
+		if test -n "$inst_prefix_dir"; then
+		  case $libdir in
+		    [\\/]*)
+		      func_append add_dir " -L$inst_prefix_dir$libdir"
+		      ;;
+		  esac
+		fi
+		add="-l$name"
+	      elif test "$hardcode_shlibpath_var" = yes; then
+		add_shlibpath="$dir"
+		add="-l$name"
+	      else
+		lib_linked=no
+	      fi
+	      ;;
+	    *) lib_linked=no ;;
+	    esac
+
+	    if test "$lib_linked" != yes; then
+	      func_fatal_configuration "unsupported hardcode properties"
+	    fi
+
+	    if test -n "$add_shlibpath"; then
+	      case :$compile_shlibpath: in
+	      *":$add_shlibpath:"*) ;;
+	      *) func_append compile_shlibpath "$add_shlibpath:" ;;
+	      esac
+	    fi
+	    if test "$linkmode" = prog; then
+	      test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+	      test -n "$add" && compile_deplibs="$add $compile_deplibs"
+	    else
+	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
+	      test -n "$add" && deplibs="$add $deplibs"
+	      if test "$hardcode_direct" != yes &&
+		 test "$hardcode_minus_L" != yes &&
+		 test "$hardcode_shlibpath_var" = yes; then
+		case :$finalize_shlibpath: in
+		*":$libdir:"*) ;;
+		*) func_append finalize_shlibpath "$libdir:" ;;
+		esac
+	      fi
+	    fi
+	  fi
+
+	  if test "$linkmode" = prog || test "$opt_mode" = relink; then
+	    add_shlibpath=
+	    add_dir=
+	    add=
+	    # Finalize command for both is simple: just hardcode it.
+	    if test "$hardcode_direct" = yes &&
+	       test "$hardcode_direct_absolute" = no; then
+	      add="$libdir/$linklib"
+	    elif test "$hardcode_minus_L" = yes; then
+	      add_dir="-L$libdir"
+	      add="-l$name"
+	    elif test "$hardcode_shlibpath_var" = yes; then
+	      case :$finalize_shlibpath: in
+	      *":$libdir:"*) ;;
+	      *) func_append finalize_shlibpath "$libdir:" ;;
+	      esac
+	      add="-l$name"
+	    elif test "$hardcode_automatic" = yes; then
+	      if test -n "$inst_prefix_dir" &&
+		 test -f "$inst_prefix_dir$libdir/$linklib" ; then
+		add="$inst_prefix_dir$libdir/$linklib"
+	      else
+		add="$libdir/$linklib"
+	      fi
+	    else
+	      # We cannot seem to hardcode it, guess we'll fake it.
+	      add_dir="-L$libdir"
+	      # Try looking first in the location we're being installed to.
+	      if test -n "$inst_prefix_dir"; then
+		case $libdir in
+		  [\\/]*)
+		    func_append add_dir " -L$inst_prefix_dir$libdir"
+		    ;;
+		esac
+	      fi
+	      add="-l$name"
+	    fi
+
+	    if test "$linkmode" = prog; then
+	      test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+	      test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+	    else
+	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
+	      test -n "$add" && deplibs="$add $deplibs"
+	    fi
+	  fi
+	elif test "$linkmode" = prog; then
+	  # Here we assume that one of hardcode_direct or hardcode_minus_L
+	  # is not unsupported.  This is valid on all known static and
+	  # shared platforms.
+	  if test "$hardcode_direct" != unsupported; then
+	    test -n "$old_library" && linklib="$old_library"
+	    compile_deplibs="$dir/$linklib $compile_deplibs"
+	    finalize_deplibs="$dir/$linklib $finalize_deplibs"
+	  else
+	    compile_deplibs="-l$name -L$dir $compile_deplibs"
+	    finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+	  fi
+	elif test "$build_libtool_libs" = yes; then
+	  # Not a shared library
+	  if test "$deplibs_check_method" != pass_all; then
+	    # We're trying link a shared library against a static one
+	    # but the system doesn't support it.
+
+	    # Just print a warning and add the library to dependency_libs so
+	    # that the program can be linked against the static library.
+	    echo
+	    $ECHO "*** Warning: This system can not link to static lib archive $lib."
+	    echo "*** I have the capability to make that library automatically link in when"
+	    echo "*** you link to this library.  But I can only do this if you have a"
+	    echo "*** shared version of the library, which you do not appear to have."
+	    if test "$module" = yes; then
+	      echo "*** But as you try to build a module library, libtool will still create "
+	      echo "*** a static module, that should work as long as the dlopening application"
+	      echo "*** is linked with the -dlopen flag to resolve symbols at runtime."
+	      if test -z "$global_symbol_pipe"; then
+		echo
+		echo "*** However, this would only work if libtool was able to extract symbol"
+		echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+		echo "*** not find such a program.  So, this module is probably useless."
+		echo "*** \`nm' from GNU binutils and a full rebuild may help."
+	      fi
+	      if test "$build_old_libs" = no; then
+		build_libtool_libs=module
+		build_old_libs=yes
+	      else
+		build_libtool_libs=no
+	      fi
+	    fi
+	  else
+	    deplibs="$dir/$old_library $deplibs"
+	    link_static=yes
+	  fi
+	fi # link shared/static library?
+
+	if test "$linkmode" = lib; then
+	  if test -n "$dependency_libs" &&
+	     { test "$hardcode_into_libs" != yes ||
+	       test "$build_old_libs" = yes ||
+	       test "$link_static" = yes; }; then
+	    # Extract -R from dependency_libs
+	    temp_deplibs=
+	    for libdir in $dependency_libs; do
+	      case $libdir in
+	      -R*) func_stripname '-R' '' "$libdir"
+	           temp_xrpath=$func_stripname_result
+		   case " $xrpath " in
+		   *" $temp_xrpath "*) ;;
+		   *) func_append xrpath " $temp_xrpath";;
+		   esac;;
+	      *) func_append temp_deplibs " $libdir";;
+	      esac
+	    done
+	    dependency_libs="$temp_deplibs"
+	  fi
+
+	  func_append newlib_search_path " $absdir"
+	  # Link against this library
+	  test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+	  # ... and its dependency_libs
+	  tmp_libs=
+	  for deplib in $dependency_libs; do
+	    newdependency_libs="$deplib $newdependency_libs"
+	    case $deplib in
+              -L*) func_stripname '-L' '' "$deplib"
+                   func_resolve_sysroot "$func_stripname_result";;
+              *) func_resolve_sysroot "$deplib" ;;
+            esac
+	    if $opt_preserve_dup_deps ; then
+	      case "$tmp_libs " in
+	      *" $func_resolve_sysroot_result "*)
+                func_append specialdeplibs " $func_resolve_sysroot_result" ;;
+	      esac
+	    fi
+	    func_append tmp_libs " $func_resolve_sysroot_result"
+	  done
+
+	  if test "$link_all_deplibs" != no; then
+	    # Add the search paths of all dependency libraries
+	    for deplib in $dependency_libs; do
+	      path=
+	      case $deplib in
+	      -L*) path="$deplib" ;;
+	      *.la)
+	        func_resolve_sysroot "$deplib"
+	        deplib=$func_resolve_sysroot_result
+	        func_dirname "$deplib" "" "."
+		dir=$func_dirname_result
+		# We need an absolute path.
+		case $dir in
+		[\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+		*)
+		  absdir=`cd "$dir" && pwd`
+		  if test -z "$absdir"; then
+		    func_warning "cannot determine absolute directory name of \`$dir'"
+		    absdir="$dir"
+		  fi
+		  ;;
+		esac
+		if $GREP "^installed=no" $deplib > /dev/null; then
+		case $host in
+		*-*-darwin*)
+		  depdepl=
+		  eval deplibrary_names=`${SED} -n -e 's/^library_names=\(.*\)$/\1/p' $deplib`
+		  if test -n "$deplibrary_names" ; then
+		    for tmp in $deplibrary_names ; do
+		      depdepl=$tmp
+		    done
+		    if test -f "$absdir/$objdir/$depdepl" ; then
+		      depdepl="$absdir/$objdir/$depdepl"
+		      darwin_install_name=`${OTOOL} -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'`
+                      if test -z "$darwin_install_name"; then
+                          darwin_install_name=`${OTOOL64} -L $depdepl  | awk '{if (NR == 2) {print $1;exit}}'`
+                      fi
+		      func_append compiler_flags " ${wl}-dylib_file ${wl}${darwin_install_name}:${depdepl}"
+		      func_append linker_flags " -dylib_file ${darwin_install_name}:${depdepl}"
+		      path=
+		    fi
+		  fi
+		  ;;
+		*)
+		  path="-L$absdir/$objdir"
+		  ;;
+		esac
+		else
+		  eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+		  test -z "$libdir" && \
+		    func_fatal_error "\`$deplib' is not a valid libtool archive"
+		  test "$absdir" != "$libdir" && \
+		    func_warning "\`$deplib' seems to be moved"
+
+		  path="-L$absdir"
+		fi
+		;;
+	      esac
+	      case " $deplibs " in
+	      *" $path "*) ;;
+	      *) deplibs="$path $deplibs" ;;
+	      esac
+	    done
+	  fi # link_all_deplibs != no
+	fi # linkmode = lib
+      done # for deplib in $libs
+      if test "$pass" = link; then
+	if test "$linkmode" = "prog"; then
+	  compile_deplibs="$new_inherited_linker_flags $compile_deplibs"
+	  finalize_deplibs="$new_inherited_linker_flags $finalize_deplibs"
+	else
+	  compiler_flags="$compiler_flags "`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	fi
+      fi
+      dependency_libs="$newdependency_libs"
+      if test "$pass" = dlpreopen; then
+	# Link the dlpreopened libraries before other libraries
+	for deplib in $save_deplibs; do
+	  deplibs="$deplib $deplibs"
+	done
+      fi
+      if test "$pass" != dlopen; then
+	if test "$pass" != conv; then
+	  # Make sure lib_search_path contains only unique directories.
+	  lib_search_path=
+	  for dir in $newlib_search_path; do
+	    case "$lib_search_path " in
+	    *" $dir "*) ;;
+	    *) func_append lib_search_path " $dir" ;;
+	    esac
+	  done
+	  newlib_search_path=
+	fi
+
+	if test "$linkmode,$pass" != "prog,link"; then
+	  vars="deplibs"
+	else
+	  vars="compile_deplibs finalize_deplibs"
+	fi
+	for var in $vars dependency_libs; do
+	  # Add libraries to $var in reverse order
+	  eval tmp_libs=\"\$$var\"
+	  new_libs=
+	  for deplib in $tmp_libs; do
+	    # FIXME: Pedantically, this is the right thing to do, so
+	    #        that some nasty dependency loop isn't accidentally
+	    #        broken:
+	    #new_libs="$deplib $new_libs"
+	    # Pragmatically, this seems to cause very few problems in
+	    # practice:
+	    case $deplib in
+	    -L*) new_libs="$deplib $new_libs" ;;
+	    -R*) ;;
+	    *)
+	      # And here is the reason: when a library appears more
+	      # than once as an explicit dependence of a library, or
+	      # is implicitly linked in more than once by the
+	      # compiler, it is considered special, and multiple
+	      # occurrences thereof are not removed.  Compare this
+	      # with having the same library being listed as a
+	      # dependency of multiple other libraries: in this case,
+	      # we know (pedantically, we assume) the library does not
+	      # need to be listed more than once, so we keep only the
+	      # last copy.  This is not always right, but it is rare
+	      # enough that we require users that really mean to play
+	      # such unportable linking tricks to link the library
+	      # using -Wl,-lname, so that libtool does not consider it
+	      # for duplicate removal.
+	      case " $specialdeplibs " in
+	      *" $deplib "*) new_libs="$deplib $new_libs" ;;
+	      *)
+		case " $new_libs " in
+		*" $deplib "*) ;;
+		*) new_libs="$deplib $new_libs" ;;
+		esac
+		;;
+	      esac
+	      ;;
+	    esac
+	  done
+	  tmp_libs=
+	  for deplib in $new_libs; do
+	    case $deplib in
+	    -L*)
+	      case " $tmp_libs " in
+	      *" $deplib "*) ;;
+	      *) func_append tmp_libs " $deplib" ;;
+	      esac
+	      ;;
+	    *) func_append tmp_libs " $deplib" ;;
+	    esac
+	  done
+	  eval $var=\"$tmp_libs\"
+	done # for var
+      fi
+      # Last step: remove runtime libs from dependency_libs
+      # (they stay in deplibs)
+      tmp_libs=
+      for i in $dependency_libs ; do
+	case " $predeps $postdeps $compiler_lib_search_path " in
+	*" $i "*)
+	  i=""
+	  ;;
+	esac
+	if test -n "$i" ; then
+	  func_append tmp_libs " $i"
+	fi
+      done
+      dependency_libs=$tmp_libs
+    done # for pass
+    if test "$linkmode" = prog; then
+      dlfiles="$newdlfiles"
+    fi
+    if test "$linkmode" = prog || test "$linkmode" = lib; then
+      dlprefiles="$newdlprefiles"
+    fi
+
+    case $linkmode in
+    oldlib)
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+	func_warning "\`-dlopen' is ignored for archives"
+      fi
+
+      case " $deplibs" in
+      *\ -l* | *\ -L*)
+	func_warning "\`-l' and \`-L' are ignored for archives" ;;
+      esac
+
+      test -n "$rpath" && \
+	func_warning "\`-rpath' is ignored for archives"
+
+      test -n "$xrpath" && \
+	func_warning "\`-R' is ignored for archives"
+
+      test -n "$vinfo" && \
+	func_warning "\`-version-info/-version-number' is ignored for archives"
+
+      test -n "$release" && \
+	func_warning "\`-release' is ignored for archives"
+
+      test -n "$export_symbols$export_symbols_regex" && \
+	func_warning "\`-export-symbols' is ignored for archives"
+
+      # Now set the variables for building old libraries.
+      build_libtool_libs=no
+      oldlibs="$output"
+      func_append objs "$old_deplibs"
+      ;;
+
+    lib)
+      # Make sure we only generate libraries of the form `libNAME.la'.
+      case $outputname in
+      lib*)
+	func_stripname 'lib' '.la' "$outputname"
+	name=$func_stripname_result
+	eval shared_ext=\"$shrext_cmds\"
+	eval libname=\"$libname_spec\"
+	;;
+      *)
+	test "$module" = no && \
+	  func_fatal_help "libtool library \`$output' must begin with \`lib'"
+
+	if test "$need_lib_prefix" != no; then
+	  # Add the "lib" prefix for modules if required
+	  func_stripname '' '.la' "$outputname"
+	  name=$func_stripname_result
+	  eval shared_ext=\"$shrext_cmds\"
+	  eval libname=\"$libname_spec\"
+	else
+	  func_stripname '' '.la' "$outputname"
+	  libname=$func_stripname_result
+	fi
+	;;
+      esac
+
+      if test -n "$objs"; then
+	if test "$deplibs_check_method" != pass_all; then
+	  func_fatal_error "cannot build libtool library \`$output' from non-libtool objects on this host:$objs"
+	else
+	  echo
+	  $ECHO "*** Warning: Linking the shared library $output against the non-libtool"
+	  $ECHO "*** objects $objs is not portable!"
+	  func_append libobjs " $objs"
+	fi
+      fi
+
+      test "$dlself" != no && \
+	func_warning "\`-dlopen self' is ignored for libtool libraries"
+
+      set dummy $rpath
+      shift
+      test "$#" -gt 1 && \
+	func_warning "ignoring multiple \`-rpath's for a libtool library"
+
+      install_libdir="$1"
+
+      oldlibs=
+      if test -z "$rpath"; then
+	if test "$build_libtool_libs" = yes; then
+	  # Building a libtool convenience library.
+	  # Some compilers have problems with a `.al' extension so
+	  # convenience libraries should have the same extension an
+	  # archive normally would.
+	  oldlibs="$output_objdir/$libname.$libext $oldlibs"
+	  build_libtool_libs=convenience
+	  build_old_libs=yes
+	fi
+
+	test -n "$vinfo" && \
+	  func_warning "\`-version-info/-version-number' is ignored for convenience libraries"
+
+	test -n "$release" && \
+	  func_warning "\`-release' is ignored for convenience libraries"
+      else
+
+	# Parse the version information argument.
+	save_ifs="$IFS"; IFS=':'
+	set dummy $vinfo 0 0 0
+	shift
+	IFS="$save_ifs"
+
+	test -n "$7" && \
+	  func_fatal_help "too many parameters to \`-version-info'"
+
+	# convert absolute version numbers to libtool ages
+	# this retains compatibility with .la files and attempts
+	# to make the code below a bit more comprehensible
+
+	case $vinfo_number in
+	yes)
+	  number_major="$1"
+	  number_minor="$2"
+	  number_revision="$3"
+	  #
+	  # There are really only two kinds -- those that
+	  # use the current revision as the major version
+	  # and those that subtract age and use age as
+	  # a minor version.  But, then there is irix
+	  # which has an extra 1 added just for fun
+	  #
+	  case $version_type in
+	  # correct linux to gnu/linux during the next big refactor
+	  darwin|linux|osf|windows|none)
+	    func_arith $number_major + $number_minor
+	    current=$func_arith_result
+	    age="$number_minor"
+	    revision="$number_revision"
+	    ;;
+	  freebsd-aout|freebsd-elf|qnx|sunos)
+	    current="$number_major"
+	    revision="$number_minor"
+	    age="0"
+	    ;;
+	  irix|nonstopux)
+	    func_arith $number_major + $number_minor
+	    current=$func_arith_result
+	    age="$number_minor"
+	    revision="$number_minor"
+	    lt_irix_increment=no
+	    ;;
+	  *)
+	    func_fatal_configuration "$modename: unknown library version type \`$version_type'"
+	    ;;
+	  esac
+	  ;;
+	no)
+	  current="$1"
+	  revision="$2"
+	  age="$3"
+	  ;;
+	esac
+
+	# Check that each of the things are valid numbers.
+	case $current in
+	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
+	*)
+	  func_error "CURRENT \`$current' must be a nonnegative integer"
+	  func_fatal_error "\`$vinfo' is not valid version information"
+	  ;;
+	esac
+
+	case $revision in
+	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
+	*)
+	  func_error "REVISION \`$revision' must be a nonnegative integer"
+	  func_fatal_error "\`$vinfo' is not valid version information"
+	  ;;
+	esac
+
+	case $age in
+	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
+	*)
+	  func_error "AGE \`$age' must be a nonnegative integer"
+	  func_fatal_error "\`$vinfo' is not valid version information"
+	  ;;
+	esac
+
+	if test "$age" -gt "$current"; then
+	  func_error "AGE \`$age' is greater than the current interface number \`$current'"
+	  func_fatal_error "\`$vinfo' is not valid version information"
+	fi
+
+	# Calculate the version variables.
+	major=
+	versuffix=
+	verstring=
+	case $version_type in
+	none) ;;
+
+	darwin)
+	  # Like Linux, but with the current version available in
+	  # verstring for coding it into the library header
+	  func_arith $current - $age
+	  major=.$func_arith_result
+	  versuffix="$major.$age.$revision"
+	  # Darwin ld doesn't like 0 for these options...
+	  func_arith $current + 1
+	  minor_current=$func_arith_result
+	  xlcverstring="${wl}-compatibility_version ${wl}$minor_current ${wl}-current_version ${wl}$minor_current.$revision"
+	  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+	  ;;
+
+	freebsd-aout)
+	  major=".$current"
+	  versuffix=".$current.$revision";
+	  ;;
+
+	freebsd-elf)
+	  major=".$current"
+	  versuffix=".$current"
+	  ;;
+
+	irix | nonstopux)
+	  if test "X$lt_irix_increment" = "Xno"; then
+	    func_arith $current - $age
+	  else
+	    func_arith $current - $age + 1
+	  fi
+	  major=$func_arith_result
+
+	  case $version_type in
+	    nonstopux) verstring_prefix=nonstopux ;;
+	    *)         verstring_prefix=sgi ;;
+	  esac
+	  verstring="$verstring_prefix$major.$revision"
+
+	  # Add in all the interfaces that we are compatible with.
+	  loop=$revision
+	  while test "$loop" -ne 0; do
+	    func_arith $revision - $loop
+	    iface=$func_arith_result
+	    func_arith $loop - 1
+	    loop=$func_arith_result
+	    verstring="$verstring_prefix$major.$iface:$verstring"
+	  done
+
+	  # Before this point, $major must not contain `.'.
+	  major=.$major
+	  versuffix="$major.$revision"
+	  ;;
+
+	linux) # correct to gnu/linux during the next big refactor
+	  func_arith $current - $age
+	  major=.$func_arith_result
+	  versuffix="$major.$age.$revision"
+	  ;;
+
+	osf)
+	  func_arith $current - $age
+	  major=.$func_arith_result
+	  versuffix=".$current.$age.$revision"
+	  verstring="$current.$age.$revision"
+
+	  # Add in all the interfaces that we are compatible with.
+	  loop=$age
+	  while test "$loop" -ne 0; do
+	    func_arith $current - $loop
+	    iface=$func_arith_result
+	    func_arith $loop - 1
+	    loop=$func_arith_result
+	    verstring="$verstring:${iface}.0"
+	  done
+
+	  # Make executables depend on our current version.
+	  func_append verstring ":${current}.0"
+	  ;;
+
+	qnx)
+	  major=".$current"
+	  versuffix=".$current"
+	  ;;
+
+	sunos)
+	  major=".$current"
+	  versuffix=".$current.$revision"
+	  ;;
+
+	windows)
+	  # Use '-' rather than '.', since we only want one
+	  # extension on DOS 8.3 filesystems.
+	  func_arith $current - $age
+	  major=$func_arith_result
+	  versuffix="-$major"
+	  ;;
+
+	*)
+	  func_fatal_configuration "unknown library version type \`$version_type'"
+	  ;;
+	esac
+
+	# Clear the version info if we defaulted, and they specified a release.
+	if test -z "$vinfo" && test -n "$release"; then
+	  major=
+	  case $version_type in
+	  darwin)
+	    # we can't check for "0.0" in archive_cmds due to quoting
+	    # problems, so we reset it completely
+	    verstring=
+	    ;;
+	  *)
+	    verstring="0.0"
+	    ;;
+	  esac
+	  if test "$need_version" = no; then
+	    versuffix=
+	  else
+	    versuffix=".0.0"
+	  fi
+	fi
+
+	# Remove version info from name if versioning should be avoided
+	if test "$avoid_version" = yes && test "$need_version" = no; then
+	  major=
+	  versuffix=
+	  verstring=""
+	fi
+
+	# Check to see if the archive will have undefined symbols.
+	if test "$allow_undefined" = yes; then
+	  if test "$allow_undefined_flag" = unsupported; then
+	    func_warning "undefined symbols not allowed in $host shared libraries"
+	    build_libtool_libs=no
+	    build_old_libs=yes
+	  fi
+	else
+	  # Don't allow undefined symbols.
+	  allow_undefined_flag="$no_undefined_flag"
+	fi
+
+      fi
+
+      func_generate_dlsyms "$libname" "$libname" "yes"
+      func_append libobjs " $symfileobj"
+      test "X$libobjs" = "X " && libobjs=
+
+      if test "$opt_mode" != relink; then
+	# Remove our outputs, but don't remove object files since they
+	# may have been created when compiling PIC objects.
+	removelist=
+	tempremovelist=`$ECHO "$output_objdir/*"`
+	for p in $tempremovelist; do
+	  case $p in
+	    *.$objext | *.gcno)
+	       ;;
+	    $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/${libname}${release}.*)
+	       if test "X$precious_files_regex" != "X"; then
+		 if $ECHO "$p" | $EGREP -e "$precious_files_regex" >/dev/null 2>&1
+		 then
+		   continue
+		 fi
+	       fi
+	       func_append removelist " $p"
+	       ;;
+	    *) ;;
+	  esac
+	done
+	test -n "$removelist" && \
+	  func_show_eval "${RM}r \$removelist"
+      fi
+
+      # Now set the variables for building old libraries.
+      if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+	func_append oldlibs " $output_objdir/$libname.$libext"
+
+	# Transform .lo files to .o files.
+	oldobjs="$objs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.${libext}$/d; $lo2o" | $NL2SP`
+      fi
+
+      # Eliminate all temporary directories.
+      #for path in $notinst_path; do
+      #	lib_search_path=`$ECHO "$lib_search_path " | $SED "s% $path % %g"`
+      #	deplibs=`$ECHO "$deplibs " | $SED "s% -L$path % %g"`
+      #	dependency_libs=`$ECHO "$dependency_libs " | $SED "s% -L$path % %g"`
+      #done
+
+      if test -n "$xrpath"; then
+	# If the user specified any rpath flags, then add them.
+	temp_xrpath=
+	for libdir in $xrpath; do
+	  func_replace_sysroot "$libdir"
+	  func_append temp_xrpath " -R$func_replace_sysroot_result"
+	  case "$finalize_rpath " in
+	  *" $libdir "*) ;;
+	  *) func_append finalize_rpath " $libdir" ;;
+	  esac
+	done
+	if test "$hardcode_into_libs" != yes || test "$build_old_libs" = yes; then
+	  dependency_libs="$temp_xrpath $dependency_libs"
+	fi
+      fi
+
+      # Make sure dlfiles contains only unique files that won't be dlpreopened
+      old_dlfiles="$dlfiles"
+      dlfiles=
+      for lib in $old_dlfiles; do
+	case " $dlprefiles $dlfiles " in
+	*" $lib "*) ;;
+	*) func_append dlfiles " $lib" ;;
+	esac
+      done
+
+      # Make sure dlprefiles contains only unique files
+      old_dlprefiles="$dlprefiles"
+      dlprefiles=
+      for lib in $old_dlprefiles; do
+	case "$dlprefiles " in
+	*" $lib "*) ;;
+	*) func_append dlprefiles " $lib" ;;
+	esac
+      done
+
+      if test "$build_libtool_libs" = yes; then
+	if test -n "$rpath"; then
+	  case $host in
+	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos* | *-cegcc* | *-*-haiku*)
+	    # these systems don't actually have a c library (as such)!
+	    ;;
+	  *-*-rhapsody* | *-*-darwin1.[012])
+	    # Rhapsody C library is in the System framework
+	    func_append deplibs " System.ltframework"
+	    ;;
+	  *-*-netbsd*)
+	    # Don't link with libc until the a.out ld.so is fixed.
+	    ;;
+	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+	    # Do not include libc due to us having libc/libc_r.
+	    ;;
+	  *-*-sco3.2v5* | *-*-sco5v6*)
+	    # Causes problems with __ctype
+	    ;;
+	  *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*)
+	    # Compiler inserts libc in the correct place for threads to work
+	    ;;
+	  *)
+	    # Add libc to deplibs on all other systems if necessary.
+	    if test "$build_libtool_need_lc" = "yes"; then
+	      func_append deplibs " -lc"
+	    fi
+	    ;;
+	  esac
+	fi
+
+	# Transform deplibs into only deplibs that can be linked in shared.
+	name_save=$name
+	libname_save=$libname
+	release_save=$release
+	versuffix_save=$versuffix
+	major_save=$major
+	# I'm not sure if I'm treating the release correctly.  I think
+	# release should show up in the -l (ie -lgmp5) so we don't want to
+	# add it in twice.  Is that correct?
+	release=""
+	versuffix=""
+	major=""
+	newdeplibs=
+	droppeddeps=no
+	case $deplibs_check_method in
+	pass_all)
+	  # Don't check for shared/static.  Everything works.
+	  # This might be a little naive.  We might want to check
+	  # whether the library exists or not.  But this is on
+	  # osf3 & osf4 and I'm not really sure... Just
+	  # implementing what was already the behavior.
+	  newdeplibs=$deplibs
+	  ;;
+	test_compile)
+	  # This code stresses the "libraries are programs" paradigm to its
+	  # limits. Maybe even breaks it.  We compile a program, linking it
+	  # against the deplibs as a proxy for the library.  Then we can check
+	  # whether they linked in statically or dynamically with ldd.
+	  $opt_dry_run || $RM conftest.c
+	  cat > conftest.c <<EOF
+	  int main() { return 0; }
+EOF
+	  $opt_dry_run || $RM conftest
+	  if $LTCC $LTCFLAGS -o conftest conftest.c $deplibs; then
+	    ldd_output=`ldd conftest`
+	    for i in $deplibs; do
+	      case $i in
+	      -l*)
+		func_stripname -l '' "$i"
+		name=$func_stripname_result
+		if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		  case " $predeps $postdeps " in
+		  *" $i "*)
+		    func_append newdeplibs " $i"
+		    i=""
+		    ;;
+		  esac
+		fi
+		if test -n "$i" ; then
+		  libname=`eval "\\$ECHO \"$libname_spec\""`
+		  deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
+		  set dummy $deplib_matches; shift
+		  deplib_match=$1
+		  if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		    func_append newdeplibs " $i"
+		  else
+		    droppeddeps=yes
+		    echo
+		    $ECHO "*** Warning: dynamic linker does not accept needed library $i."
+		    echo "*** I have the capability to make that library automatically link in when"
+		    echo "*** you link to this library.  But I can only do this if you have a"
+		    echo "*** shared version of the library, which I believe you do not have"
+		    echo "*** because a test_compile did reveal that the linker did not use it for"
+		    echo "*** its dynamic dependency list that programs get resolved with at runtime."
+		  fi
+		fi
+		;;
+	      *)
+		func_append newdeplibs " $i"
+		;;
+	      esac
+	    done
+	  else
+	    # Error occurred in the first compile.  Let's try to salvage
+	    # the situation: Compile a separate program for each library.
+	    for i in $deplibs; do
+	      case $i in
+	      -l*)
+		func_stripname -l '' "$i"
+		name=$func_stripname_result
+		$opt_dry_run || $RM conftest
+		if $LTCC $LTCFLAGS -o conftest conftest.c $i; then
+		  ldd_output=`ldd conftest`
+		  if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		    case " $predeps $postdeps " in
+		    *" $i "*)
+		      func_append newdeplibs " $i"
+		      i=""
+		      ;;
+		    esac
+		  fi
+		  if test -n "$i" ; then
+		    libname=`eval "\\$ECHO \"$libname_spec\""`
+		    deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
+		    set dummy $deplib_matches; shift
+		    deplib_match=$1
+		    if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		      func_append newdeplibs " $i"
+		    else
+		      droppeddeps=yes
+		      echo
+		      $ECHO "*** Warning: dynamic linker does not accept needed library $i."
+		      echo "*** I have the capability to make that library automatically link in when"
+		      echo "*** you link to this library.  But I can only do this if you have a"
+		      echo "*** shared version of the library, which you do not appear to have"
+		      echo "*** because a test_compile did reveal that the linker did not use this one"
+		      echo "*** as a dynamic dependency that programs can get resolved with at runtime."
+		    fi
+		  fi
+		else
+		  droppeddeps=yes
+		  echo
+		  $ECHO "*** Warning!  Library $i is needed by this library but I was not able to"
+		  echo "*** make it link in!  You will probably need to install it or some"
+		  echo "*** library that it depends on before this library will be fully"
+		  echo "*** functional.  Installing it before continuing would be even better."
+		fi
+		;;
+	      *)
+		func_append newdeplibs " $i"
+		;;
+	      esac
+	    done
+	  fi
+	  ;;
+	file_magic*)
+	  set dummy $deplibs_check_method; shift
+	  file_magic_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
+	  for a_deplib in $deplibs; do
+	    case $a_deplib in
+	    -l*)
+	      func_stripname -l '' "$a_deplib"
+	      name=$func_stripname_result
+	      if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		case " $predeps $postdeps " in
+		*" $a_deplib "*)
+		  func_append newdeplibs " $a_deplib"
+		  a_deplib=""
+		  ;;
+		esac
+	      fi
+	      if test -n "$a_deplib" ; then
+		libname=`eval "\\$ECHO \"$libname_spec\""`
+		if test -n "$file_magic_glob"; then
+		  libnameglob=`func_echo_all "$libname" | $SED -e $file_magic_glob`
+		else
+		  libnameglob=$libname
+		fi
+		test "$want_nocaseglob" = yes && nocaseglob=`shopt -p nocaseglob`
+		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+		  if test "$want_nocaseglob" = yes; then
+		    shopt -s nocaseglob
+		    potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null`
+		    $nocaseglob
+		  else
+		    potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null`
+		  fi
+		  for potent_lib in $potential_libs; do
+		      # Follow soft links.
+		      if ls -lLd "$potent_lib" 2>/dev/null |
+			 $GREP " -> " >/dev/null; then
+			continue
+		      fi
+		      # The statement above tries to avoid entering an
+		      # endless loop below, in case of cyclic links.
+		      # We might still enter an endless loop, since a link
+		      # loop can be closed while we follow links,
+		      # but so what?
+		      potlib="$potent_lib"
+		      while test -h "$potlib" 2>/dev/null; do
+			potliblink=`ls -ld $potlib | ${SED} 's/.* -> //'`
+			case $potliblink in
+			[\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+			*) potlib=`$ECHO "$potlib" | $SED 's,[^/]*$,,'`"$potliblink";;
+			esac
+		      done
+		      if eval $file_magic_cmd \"\$potlib\" 2>/dev/null |
+			 $SED -e 10q |
+			 $EGREP "$file_magic_regex" > /dev/null; then
+			func_append newdeplibs " $a_deplib"
+			a_deplib=""
+			break 2
+		      fi
+		  done
+		done
+	      fi
+	      if test -n "$a_deplib" ; then
+		droppeddeps=yes
+		echo
+		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
+		echo "*** I have the capability to make that library automatically link in when"
+		echo "*** you link to this library.  But I can only do this if you have a"
+		echo "*** shared version of the library, which you do not appear to have"
+		echo "*** because I did check the linker path looking for a file starting"
+		if test -z "$potlib" ; then
+		  $ECHO "*** with $libname but no candidates were found. (...for file magic test)"
+		else
+		  $ECHO "*** with $libname and none of the candidates passed a file format test"
+		  $ECHO "*** using a file magic. Last file checked: $potlib"
+		fi
+	      fi
+	      ;;
+	    *)
+	      # Add a -L argument.
+	      func_append newdeplibs " $a_deplib"
+	      ;;
+	    esac
+	  done # Gone through all deplibs.
+	  ;;
+	match_pattern*)
+	  set dummy $deplibs_check_method; shift
+	  match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
+	  for a_deplib in $deplibs; do
+	    case $a_deplib in
+	    -l*)
+	      func_stripname -l '' "$a_deplib"
+	      name=$func_stripname_result
+	      if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		case " $predeps $postdeps " in
+		*" $a_deplib "*)
+		  func_append newdeplibs " $a_deplib"
+		  a_deplib=""
+		  ;;
+		esac
+	      fi
+	      if test -n "$a_deplib" ; then
+		libname=`eval "\\$ECHO \"$libname_spec\""`
+		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+		  potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+		  for potent_lib in $potential_libs; do
+		    potlib="$potent_lib" # see symlink-check above in file_magic test
+		    if eval "\$ECHO \"$potent_lib\"" 2>/dev/null | $SED 10q | \
+		       $EGREP "$match_pattern_regex" > /dev/null; then
+		      func_append newdeplibs " $a_deplib"
+		      a_deplib=""
+		      break 2
+		    fi
+		  done
+		done
+	      fi
+	      if test -n "$a_deplib" ; then
+		droppeddeps=yes
+		echo
+		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
+		echo "*** I have the capability to make that library automatically link in when"
+		echo "*** you link to this library.  But I can only do this if you have a"
+		echo "*** shared version of the library, which you do not appear to have"
+		echo "*** because I did check the linker path looking for a file starting"
+		if test -z "$potlib" ; then
+		  $ECHO "*** with $libname but no candidates were found. (...for regex pattern test)"
+		else
+		  $ECHO "*** with $libname and none of the candidates passed a file format test"
+		  $ECHO "*** using a regex pattern. Last file checked: $potlib"
+		fi
+	      fi
+	      ;;
+	    *)
+	      # Add a -L argument.
+	      func_append newdeplibs " $a_deplib"
+	      ;;
+	    esac
+	  done # Gone through all deplibs.
+	  ;;
+	none | unknown | *)
+	  newdeplibs=""
+	  tmp_deplibs=`$ECHO " $deplibs" | $SED 's/ -lc$//; s/ -[LR][^ ]*//g'`
+	  if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+	    for i in $predeps $postdeps ; do
+	      # can't use Xsed below, because $i might contain '/'
+	      tmp_deplibs=`$ECHO " $tmp_deplibs" | $SED "s,$i,,"`
+	    done
+	  fi
+	  case $tmp_deplibs in
+	  *[!\	\ ]*)
+	    echo
+	    if test "X$deplibs_check_method" = "Xnone"; then
+	      echo "*** Warning: inter-library dependencies are not supported in this platform."
+	    else
+	      echo "*** Warning: inter-library dependencies are not known to be supported."
+	    fi
+	    echo "*** All declared inter-library dependencies are being dropped."
+	    droppeddeps=yes
+	    ;;
+	  esac
+	  ;;
+	esac
+	versuffix=$versuffix_save
+	major=$major_save
+	release=$release_save
+	libname=$libname_save
+	name=$name_save
+
+	case $host in
+	*-*-rhapsody* | *-*-darwin1.[012])
+	  # On Rhapsody replace the C library with the System framework
+	  newdeplibs=`$ECHO " $newdeplibs" | $SED 's/ -lc / System.ltframework /'`
+	  ;;
+	esac
+
+	if test "$droppeddeps" = yes; then
+	  if test "$module" = yes; then
+	    echo
+	    echo "*** Warning: libtool could not satisfy all declared inter-library"
+	    $ECHO "*** dependencies of module $libname.  Therefore, libtool will create"
+	    echo "*** a static module, that should work as long as the dlopening"
+	    echo "*** application is linked with the -dlopen flag."
+	    if test -z "$global_symbol_pipe"; then
+	      echo
+	      echo "*** However, this would only work if libtool was able to extract symbol"
+	      echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+	      echo "*** not find such a program.  So, this module is probably useless."
+	      echo "*** \`nm' from GNU binutils and a full rebuild may help."
+	    fi
+	    if test "$build_old_libs" = no; then
+	      oldlibs="$output_objdir/$libname.$libext"
+	      build_libtool_libs=module
+	      build_old_libs=yes
+	    else
+	      build_libtool_libs=no
+	    fi
+	  else
+	    echo "*** The inter-library dependencies that have been dropped here will be"
+	    echo "*** automatically added whenever a program is linked with this library"
+	    echo "*** or is declared to -dlopen it."
+
+	    if test "$allow_undefined" = no; then
+	      echo
+	      echo "*** Since this library must not contain undefined symbols,"
+	      echo "*** because either the platform does not support them or"
+	      echo "*** it was explicitly requested with -no-undefined,"
+	      echo "*** libtool will only create a static version of it."
+	      if test "$build_old_libs" = no; then
+		oldlibs="$output_objdir/$libname.$libext"
+		build_libtool_libs=module
+		build_old_libs=yes
+	      else
+		build_libtool_libs=no
+	      fi
+	    fi
+	  fi
+	fi
+	# Done checking deplibs!
+	deplibs=$newdeplibs
+      fi
+      # Time to change all our "foo.ltframework" stuff back to "-framework foo"
+      case $host in
+	*-*-darwin*)
+	  newdeplibs=`$ECHO " $newdeplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	  new_inherited_linker_flags=`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	  deplibs=`$ECHO " $deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	  ;;
+      esac
+
+      # move library search paths that coincide with paths to not yet
+      # installed libraries to the beginning of the library search list
+      new_libs=
+      for path in $notinst_path; do
+	case " $new_libs " in
+	*" -L$path/$objdir "*) ;;
+	*)
+	  case " $deplibs " in
+	  *" -L$path/$objdir "*)
+	    func_append new_libs " -L$path/$objdir" ;;
+	  esac
+	  ;;
+	esac
+      done
+      for deplib in $deplibs; do
+	case $deplib in
+	-L*)
+	  case " $new_libs " in
+	  *" $deplib "*) ;;
+	  *) func_append new_libs " $deplib" ;;
+	  esac
+	  ;;
+	*) func_append new_libs " $deplib" ;;
+	esac
+      done
+      deplibs="$new_libs"
+
+      # All the library-specific variables (install_libdir is set above).
+      library_names=
+      old_library=
+      dlname=
+
+      # Test again, we may have decided not to build it any more
+      if test "$build_libtool_libs" = yes; then
+	# Remove ${wl} instances when linking with ld.
+	# FIXME: should test the right _cmds variable.
+	case $archive_cmds in
+	  *\$LD\ *) wl= ;;
+        esac
+	if test "$hardcode_into_libs" = yes; then
+	  # Hardcode the library paths
+	  hardcode_libdirs=
+	  dep_rpath=
+	  rpath="$finalize_rpath"
+	  test "$opt_mode" != relink && rpath="$compile_rpath$rpath"
+	  for libdir in $rpath; do
+	    if test -n "$hardcode_libdir_flag_spec"; then
+	      if test -n "$hardcode_libdir_separator"; then
+		func_replace_sysroot "$libdir"
+		libdir=$func_replace_sysroot_result
+		if test -z "$hardcode_libdirs"; then
+		  hardcode_libdirs="$libdir"
+		else
+		  # Just accumulate the unique libdirs.
+		  case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+		  *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		    ;;
+		  *)
+		    func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
+		    ;;
+		  esac
+		fi
+	      else
+		eval flag=\"$hardcode_libdir_flag_spec\"
+		func_append dep_rpath " $flag"
+	      fi
+	    elif test -n "$runpath_var"; then
+	      case "$perm_rpath " in
+	      *" $libdir "*) ;;
+	      *) func_append perm_rpath " $libdir" ;;
+	      esac
+	    fi
+	  done
+	  # Substitute the hardcoded libdirs into the rpath.
+	  if test -n "$hardcode_libdir_separator" &&
+	     test -n "$hardcode_libdirs"; then
+	    libdir="$hardcode_libdirs"
+	    eval "dep_rpath=\"$hardcode_libdir_flag_spec\""
+	  fi
+	  if test -n "$runpath_var" && test -n "$perm_rpath"; then
+	    # We should set the runpath_var.
+	    rpath=
+	    for dir in $perm_rpath; do
+	      func_append rpath "$dir:"
+	    done
+	    eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+	  fi
+	  test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+	fi
+
+	shlibpath="$finalize_shlibpath"
+	test "$opt_mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+	if test -n "$shlibpath"; then
+	  eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+	fi
+
+	# Get the real and link names of the library.
+	eval shared_ext=\"$shrext_cmds\"
+	eval library_names=\"$library_names_spec\"
+	set dummy $library_names
+	shift
+	realname="$1"
+	shift
+
+	if test -n "$soname_spec"; then
+	  eval soname=\"$soname_spec\"
+	else
+	  soname="$realname"
+	fi
+	if test -z "$dlname"; then
+	  dlname=$soname
+	fi
+
+	lib="$output_objdir/$realname"
+	linknames=
+	for link
+	do
+	  func_append linknames " $link"
+	done
+
+	# Use standard objects if they are pic
+	test -z "$pic_flag" && libobjs=`$ECHO "$libobjs" | $SP2NL | $SED "$lo2o" | $NL2SP`
+	test "X$libobjs" = "X " && libobjs=
+
+	delfiles=
+	if test -n "$export_symbols" && test -n "$include_expsyms"; then
+	  $opt_dry_run || cp "$export_symbols" "$output_objdir/$libname.uexp"
+	  export_symbols="$output_objdir/$libname.uexp"
+	  func_append delfiles " $export_symbols"
+	fi
+
+	orig_export_symbols=
+	case $host_os in
+	cygwin* | mingw* | cegcc*)
+	  if test -n "$export_symbols" && test -z "$export_symbols_regex"; then
+	    # exporting using user supplied symfile
+	    if test "x`$SED 1q $export_symbols`" != xEXPORTS; then
+	      # and it's NOT already a .def file. Must figure out
+	      # which of the given symbols are data symbols and tag
+	      # them as such. So, trigger use of export_symbols_cmds.
+	      # export_symbols gets reassigned inside the "prepare
+	      # the list of exported symbols" if statement, so the
+	      # include_expsyms logic still works.
+	      orig_export_symbols="$export_symbols"
+	      export_symbols=
+	      always_export_symbols=yes
+	    fi
+	  fi
+	  ;;
+	esac
+
+	# Prepare the list of exported symbols
+	if test -z "$export_symbols"; then
+	  if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+	    func_verbose "generating symbol list for \`$libname.la'"
+	    export_symbols="$output_objdir/$libname.exp"
+	    $opt_dry_run || $RM $export_symbols
+	    cmds=$export_symbols_cmds
+	    save_ifs="$IFS"; IFS='~'
+	    for cmd1 in $cmds; do
+	      IFS="$save_ifs"
+	      # Take the normal branch if the nm_file_list_spec branch
+	      # doesn't work or if tool conversion is not needed.
+	      case $nm_file_list_spec~$to_tool_file_cmd in
+		*~func_convert_file_noop | *~func_convert_file_msys_to_w32 | ~*)
+		  try_normal_branch=yes
+		  eval cmd=\"$cmd1\"
+		  func_len " $cmd"
+		  len=$func_len_result
+		  ;;
+		*)
+		  try_normal_branch=no
+		  ;;
+	      esac
+	      if test "$try_normal_branch" = yes \
+		 && { test "$len" -lt "$max_cmd_len" \
+		      || test "$max_cmd_len" -le -1; }
+	      then
+		func_show_eval "$cmd" 'exit $?'
+		skipped_export=false
+	      elif test -n "$nm_file_list_spec"; then
+		func_basename "$output"
+		output_la=$func_basename_result
+		save_libobjs=$libobjs
+		save_output=$output
+		output=${output_objdir}/${output_la}.nm
+		func_to_tool_file "$output"
+		libobjs=$nm_file_list_spec$func_to_tool_file_result
+		func_append delfiles " $output"
+		func_verbose "creating $NM input file list: $output"
+		for obj in $save_libobjs; do
+		  func_to_tool_file "$obj"
+		  $ECHO "$func_to_tool_file_result"
+		done > "$output"
+		eval cmd=\"$cmd1\"
+		func_show_eval "$cmd" 'exit $?'
+		output=$save_output
+		libobjs=$save_libobjs
+		skipped_export=false
+	      else
+		# The command line is too long to execute in one step.
+		func_verbose "using reloadable object file for export list..."
+		skipped_export=:
+		# Break out early, otherwise skipped_export may be
+		# set to false by a later but shorter cmd.
+		break
+	      fi
+	    done
+	    IFS="$save_ifs"
+	    if test -n "$export_symbols_regex" && test "X$skipped_export" != "X:"; then
+	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+	      func_show_eval '$MV "${export_symbols}T" "$export_symbols"'
+	    fi
+	  fi
+	fi
+
+	if test -n "$export_symbols" && test -n "$include_expsyms"; then
+	  tmp_export_symbols="$export_symbols"
+	  test -n "$orig_export_symbols" && tmp_export_symbols="$orig_export_symbols"
+	  $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
+	fi
+
+	if test "X$skipped_export" != "X:" && test -n "$orig_export_symbols"; then
+	  # The given exports_symbols file has to be filtered, so filter it.
+	  func_verbose "filter symbol list for \`$libname.la' to tag DATA exports"
+	  # FIXME: $output_objdir/$libname.filter potentially contains lots of
+	  # 's' commands which not all seds can handle. GNU sed should be fine
+	  # though. Also, the filter scales superlinearly with the number of
+	  # global variables. join(1) would be nice here, but unfortunately
+	  # isn't a blessed tool.
+	  $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter
+	  func_append delfiles " $export_symbols $output_objdir/$libname.filter"
+	  export_symbols=$output_objdir/$libname.def
+	  $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols
+	fi
+
+	tmp_deplibs=
+	for test_deplib in $deplibs; do
+	  case " $convenience " in
+	  *" $test_deplib "*) ;;
+	  *)
+	    func_append tmp_deplibs " $test_deplib"
+	    ;;
+	  esac
+	done
+	deplibs="$tmp_deplibs"
+
+	if test -n "$convenience"; then
+	  if test -n "$whole_archive_flag_spec" &&
+	    test "$compiler_needs_object" = yes &&
+	    test -z "$libobjs"; then
+	    # extract the archives, so we have objects to list.
+	    # TODO: could optimize this to just extract one archive.
+	    whole_archive_flag_spec=
+	  fi
+	  if test -n "$whole_archive_flag_spec"; then
+	    save_libobjs=$libobjs
+	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+	    test "X$libobjs" = "X " && libobjs=
+	  else
+	    gentop="$output_objdir/${outputname}x"
+	    func_append generated " $gentop"
+
+	    func_extract_archives $gentop $convenience
+	    func_append libobjs " $func_extract_archives_result"
+	    test "X$libobjs" = "X " && libobjs=
+	  fi
+	fi
+
+	if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+	  eval flag=\"$thread_safe_flag_spec\"
+	  func_append linker_flags " $flag"
+	fi
+
+	# Make a backup of the uninstalled library when relinking
+	if test "$opt_mode" = relink; then
+	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}U && $MV $realname ${realname}U)' || exit $?
+	fi
+
+	# Do each of the archive commands.
+	if test "$module" = yes && test -n "$module_cmds" ; then
+	  if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
+	    eval test_cmds=\"$module_expsym_cmds\"
+	    cmds=$module_expsym_cmds
+	  else
+	    eval test_cmds=\"$module_cmds\"
+	    cmds=$module_cmds
+	  fi
+	else
+	  if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+	    eval test_cmds=\"$archive_expsym_cmds\"
+	    cmds=$archive_expsym_cmds
+	  else
+	    eval test_cmds=\"$archive_cmds\"
+	    cmds=$archive_cmds
+	  fi
+	fi
+
+	if test "X$skipped_export" != "X:" &&
+	   func_len " $test_cmds" &&
+	   len=$func_len_result &&
+	   test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then
+	  :
+	else
+	  # The command line is too long to link in one step, link piecewise
+	  # or, if using GNU ld and skipped_export is not :, use a linker
+	  # script.
+
+	  # Save the value of $output and $libobjs because we want to
+	  # use them later.  If we have whole_archive_flag_spec, we
+	  # want to use save_libobjs as it was before
+	  # whole_archive_flag_spec was expanded, because we can't
+	  # assume the linker understands whole_archive_flag_spec.
+	  # This may have to be revisited, in case too many
+	  # convenience libraries get linked in and end up exceeding
+	  # the spec.
+	  if test -z "$convenience" || test -z "$whole_archive_flag_spec"; then
+	    save_libobjs=$libobjs
+	  fi
+	  save_output=$output
+	  func_basename "$output"
+	  output_la=$func_basename_result
+
+	  # Clear the reloadable object creation command queue and
+	  # initialize k to one.
+	  test_cmds=
+	  concat_cmds=
+	  objlist=
+	  last_robj=
+	  k=1
+
+	  if test -n "$save_libobjs" && test "X$skipped_export" != "X:" && test "$with_gnu_ld" = yes; then
+	    output=${output_objdir}/${output_la}.lnkscript
+	    func_verbose "creating GNU ld script: $output"
+	    echo 'INPUT (' > $output
+	    for obj in $save_libobjs
+	    do
+	      func_to_tool_file "$obj"
+	      $ECHO "$func_to_tool_file_result" >> $output
+	    done
+	    echo ')' >> $output
+	    func_append delfiles " $output"
+	    func_to_tool_file "$output"
+	    output=$func_to_tool_file_result
+	  elif test -n "$save_libobjs" && test "X$skipped_export" != "X:" && test "X$file_list_spec" != X; then
+	    output=${output_objdir}/${output_la}.lnk
+	    func_verbose "creating linker input file list: $output"
+	    : > $output
+	    set x $save_libobjs
+	    shift
+	    firstobj=
+	    if test "$compiler_needs_object" = yes; then
+	      firstobj="$1 "
+	      shift
+	    fi
+	    for obj
+	    do
+	      func_to_tool_file "$obj"
+	      $ECHO "$func_to_tool_file_result" >> $output
+	    done
+	    func_append delfiles " $output"
+	    func_to_tool_file "$output"
+	    output=$firstobj\"$file_list_spec$func_to_tool_file_result\"
+	  else
+	    if test -n "$save_libobjs"; then
+	      func_verbose "creating reloadable object files..."
+	      output=$output_objdir/$output_la-${k}.$objext
+	      eval test_cmds=\"$reload_cmds\"
+	      func_len " $test_cmds"
+	      len0=$func_len_result
+	      len=$len0
+
+	      # Loop over the list of objects to be linked.
+	      for obj in $save_libobjs
+	      do
+		func_len " $obj"
+		func_arith $len + $func_len_result
+		len=$func_arith_result
+		if test "X$objlist" = X ||
+		   test "$len" -lt "$max_cmd_len"; then
+		  func_append objlist " $obj"
+		else
+		  # The command $test_cmds is almost too long, add a
+		  # command to the queue.
+		  if test "$k" -eq 1 ; then
+		    # The first file doesn't have a previous command to add.
+		    reload_objs=$objlist
+		    eval concat_cmds=\"$reload_cmds\"
+		  else
+		    # All subsequent reloadable object files will link in
+		    # the last one created.
+		    reload_objs="$objlist $last_robj"
+		    eval concat_cmds=\"\$concat_cmds~$reload_cmds~\$RM $last_robj\"
+		  fi
+		  last_robj=$output_objdir/$output_la-${k}.$objext
+		  func_arith $k + 1
+		  k=$func_arith_result
+		  output=$output_objdir/$output_la-${k}.$objext
+		  objlist=" $obj"
+		  func_len " $last_robj"
+		  func_arith $len0 + $func_len_result
+		  len=$func_arith_result
+		fi
+	      done
+	      # Handle the remaining objects by creating one last
+	      # reloadable object file.  All subsequent reloadable object
+	      # files will link in the last one created.
+	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+	      reload_objs="$objlist $last_robj"
+	      eval concat_cmds=\"\${concat_cmds}$reload_cmds\"
+	      if test -n "$last_robj"; then
+	        eval concat_cmds=\"\${concat_cmds}~\$RM $last_robj\"
+	      fi
+	      func_append delfiles " $output"
+
+	    else
+	      output=
+	    fi
+
+	    if ${skipped_export-false}; then
+	      func_verbose "generating symbol list for \`$libname.la'"
+	      export_symbols="$output_objdir/$libname.exp"
+	      $opt_dry_run || $RM $export_symbols
+	      libobjs=$output
+	      # Append the command to create the export file.
+	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+	      eval concat_cmds=\"\$concat_cmds$export_symbols_cmds\"
+	      if test -n "$last_robj"; then
+		eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\"
+	      fi
+	    fi
+
+	    test -n "$save_libobjs" &&
+	      func_verbose "creating a temporary reloadable object file: $output"
+
+	    # Loop through the commands generated above and execute them.
+	    save_ifs="$IFS"; IFS='~'
+	    for cmd in $concat_cmds; do
+	      IFS="$save_ifs"
+	      $opt_silent || {
+		  func_quote_for_expand "$cmd"
+		  eval "func_echo $func_quote_for_expand_result"
+	      }
+	      $opt_dry_run || eval "$cmd" || {
+		lt_exit=$?
+
+		# Restore the uninstalled library and exit
+		if test "$opt_mode" = relink; then
+		  ( cd "$output_objdir" && \
+		    $RM "${realname}T" && \
+		    $MV "${realname}U" "$realname" )
+		fi
+
+		exit $lt_exit
+	      }
+	    done
+	    IFS="$save_ifs"
+
+	    if test -n "$export_symbols_regex" && ${skipped_export-false}; then
+	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+	      func_show_eval '$MV "${export_symbols}T" "$export_symbols"'
+	    fi
+	  fi
+
+          if ${skipped_export-false}; then
+	    if test -n "$export_symbols" && test -n "$include_expsyms"; then
+	      tmp_export_symbols="$export_symbols"
+	      test -n "$orig_export_symbols" && tmp_export_symbols="$orig_export_symbols"
+	      $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
+	    fi
+
+	    if test -n "$orig_export_symbols"; then
+	      # The given exports_symbols file has to be filtered, so filter it.
+	      func_verbose "filter symbol list for \`$libname.la' to tag DATA exports"
+	      # FIXME: $output_objdir/$libname.filter potentially contains lots of
+	      # 's' commands which not all seds can handle. GNU sed should be fine
+	      # though. Also, the filter scales superlinearly with the number of
+	      # global variables. join(1) would be nice here, but unfortunately
+	      # isn't a blessed tool.
+	      $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter
+	      func_append delfiles " $export_symbols $output_objdir/$libname.filter"
+	      export_symbols=$output_objdir/$libname.def
+	      $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols
+	    fi
+	  fi
+
+	  libobjs=$output
+	  # Restore the value of output.
+	  output=$save_output
+
+	  if test -n "$convenience" && test -n "$whole_archive_flag_spec"; then
+	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+	    test "X$libobjs" = "X " && libobjs=
+	  fi
+	  # Expand the library linking commands again to reset the
+	  # value of $libobjs for piecewise linking.
+
+	  # Do each of the archive commands.
+	  if test "$module" = yes && test -n "$module_cmds" ; then
+	    if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
+	      cmds=$module_expsym_cmds
+	    else
+	      cmds=$module_cmds
+	    fi
+	  else
+	    if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+	      cmds=$archive_expsym_cmds
+	    else
+	      cmds=$archive_cmds
+	    fi
+	  fi
+	fi
+
+	if test -n "$delfiles"; then
+	  # Append the command to remove temporary files to $cmds.
+	  eval cmds=\"\$cmds~\$RM $delfiles\"
+	fi
+
+	# Add any objects from preloaded convenience libraries
+	if test -n "$dlprefiles"; then
+	  gentop="$output_objdir/${outputname}x"
+	  func_append generated " $gentop"
+
+	  func_extract_archives $gentop $dlprefiles
+	  func_append libobjs " $func_extract_archives_result"
+	  test "X$libobjs" = "X " && libobjs=
+	fi
+
+	save_ifs="$IFS"; IFS='~'
+	for cmd in $cmds; do
+	  IFS="$save_ifs"
+	  eval cmd=\"$cmd\"
+	  $opt_silent || {
+	    func_quote_for_expand "$cmd"
+	    eval "func_echo $func_quote_for_expand_result"
+	  }
+	  $opt_dry_run || eval "$cmd" || {
+	    lt_exit=$?
+
+	    # Restore the uninstalled library and exit
+	    if test "$opt_mode" = relink; then
+	      ( cd "$output_objdir" && \
+	        $RM "${realname}T" && \
+		$MV "${realname}U" "$realname" )
+	    fi
+
+	    exit $lt_exit
+	  }
+	done
+	IFS="$save_ifs"
+
+	# Restore the uninstalled library and exit
+	if test "$opt_mode" = relink; then
+	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}T && $MV $realname ${realname}T && $MV ${realname}U $realname)' || exit $?
+
+	  if test -n "$convenience"; then
+	    if test -z "$whole_archive_flag_spec"; then
+	      func_show_eval '${RM}r "$gentop"'
+	    fi
+	  fi
+
+	  exit $EXIT_SUCCESS
+	fi
+
+	# Create links to the real library.
+	for linkname in $linknames; do
+	  if test "$realname" != "$linkname"; then
+	    func_show_eval '(cd "$output_objdir" && $RM "$linkname" && $LN_S "$realname" "$linkname")' 'exit $?'
+	  fi
+	done
+
+	# If -module or -export-dynamic was specified, set the dlname.
+	if test "$module" = yes || test "$export_dynamic" = yes; then
+	  # On all known operating systems, these are identical.
+	  dlname="$soname"
+	fi
+      fi
+      ;;
+
+    obj)
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+	func_warning "\`-dlopen' is ignored for objects"
+      fi
+
+      case " $deplibs" in
+      *\ -l* | *\ -L*)
+	func_warning "\`-l' and \`-L' are ignored for objects" ;;
+      esac
+
+      test -n "$rpath" && \
+	func_warning "\`-rpath' is ignored for objects"
+
+      test -n "$xrpath" && \
+	func_warning "\`-R' is ignored for objects"
+
+      test -n "$vinfo" && \
+	func_warning "\`-version-info' is ignored for objects"
+
+      test -n "$release" && \
+	func_warning "\`-release' is ignored for objects"
+
+      case $output in
+      *.lo)
+	test -n "$objs$old_deplibs" && \
+	  func_fatal_error "cannot build library object \`$output' from non-libtool objects"
+
+	libobj=$output
+	func_lo2o "$libobj"
+	obj=$func_lo2o_result
+	;;
+      *)
+	libobj=
+	obj="$output"
+	;;
+      esac
+
+      # Delete the old objects.
+      $opt_dry_run || $RM $obj $libobj
+
+      # Objects from convenience libraries.  This assumes
+      # single-version convenience libraries.  Whenever we create
+      # different ones for PIC/non-PIC, this we'll have to duplicate
+      # the extraction.
+      reload_conv_objs=
+      gentop=
+      # reload_cmds runs $LD directly, so let us get rid of
+      # -Wl from whole_archive_flag_spec and hope we can get by with
+      # turning comma into space..
+      wl=
+
+      if test -n "$convenience"; then
+	if test -n "$whole_archive_flag_spec"; then
+	  eval tmp_whole_archive_flags=\"$whole_archive_flag_spec\"
+	  reload_conv_objs=$reload_objs\ `$ECHO "$tmp_whole_archive_flags" | $SED 's|,| |g'`
+	else
+	  gentop="$output_objdir/${obj}x"
+	  func_append generated " $gentop"
+
+	  func_extract_archives $gentop $convenience
+	  reload_conv_objs="$reload_objs $func_extract_archives_result"
+	fi
+      fi
+
+      # If we're not building shared, we need to use non_pic_objs
+      test "$build_libtool_libs" != yes && libobjs="$non_pic_objects"
+
+      # Create the old-style object.
+      reload_objs="$objs$old_deplibs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.${libext}$/d; /\.lib$/d; $lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+
+      output="$obj"
+      func_execute_cmds "$reload_cmds" 'exit $?'
+
+      # Exit if we aren't doing a library object file.
+      if test -z "$libobj"; then
+	if test -n "$gentop"; then
+	  func_show_eval '${RM}r "$gentop"'
+	fi
+
+	exit $EXIT_SUCCESS
+      fi
+
+      if test "$build_libtool_libs" != yes; then
+	if test -n "$gentop"; then
+	  func_show_eval '${RM}r "$gentop"'
+	fi
+
+	# Create an invalid libtool object if no PIC, so that we don't
+	# accidentally link it into a program.
+	# $show "echo timestamp > $libobj"
+	# $opt_dry_run || eval "echo timestamp > $libobj" || exit $?
+	exit $EXIT_SUCCESS
+      fi
+
+      if test -n "$pic_flag" || test "$pic_mode" != default; then
+	# Only do commands if we really have different PIC objects.
+	reload_objs="$libobjs $reload_conv_objs"
+	output="$libobj"
+	func_execute_cmds "$reload_cmds" 'exit $?'
+      fi
+
+      if test -n "$gentop"; then
+	func_show_eval '${RM}r "$gentop"'
+      fi
+
+      exit $EXIT_SUCCESS
+      ;;
+
+    prog)
+      case $host in
+	*cygwin*) func_stripname '' '.exe' "$output"
+	          output=$func_stripname_result.exe;;
+      esac
+      test -n "$vinfo" && \
+	func_warning "\`-version-info' is ignored for programs"
+
+      test -n "$release" && \
+	func_warning "\`-release' is ignored for programs"
+
+      test "$preload" = yes \
+        && test "$dlopen_support" = unknown \
+	&& test "$dlopen_self" = unknown \
+	&& test "$dlopen_self_static" = unknown && \
+	  func_warning "\`LT_INIT([dlopen])' not used. Assuming no dlopen support."
+
+      case $host in
+      *-*-rhapsody* | *-*-darwin1.[012])
+	# On Rhapsody replace the C library is the System framework
+	compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's/ -lc / System.ltframework /'`
+	finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's/ -lc / System.ltframework /'`
+	;;
+      esac
+
+      case $host in
+      *-*-darwin*)
+	# Don't allow lazy linking, it breaks C++ global constructors
+	# But is supposedly fixed on 10.4 or later (yay!).
+	if test "$tagname" = CXX ; then
+	  case ${MACOSX_DEPLOYMENT_TARGET-10.0} in
+	    10.[0123])
+	      func_append compile_command " ${wl}-bind_at_load"
+	      func_append finalize_command " ${wl}-bind_at_load"
+	    ;;
+	  esac
+	fi
+	# Time to change all our "foo.ltframework" stuff back to "-framework foo"
+	compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
+	;;
+      esac
+
+
+      # move library search paths that coincide with paths to not yet
+      # installed libraries to the beginning of the library search list
+      new_libs=
+      for path in $notinst_path; do
+	case " $new_libs " in
+	*" -L$path/$objdir "*) ;;
+	*)
+	  case " $compile_deplibs " in
+	  *" -L$path/$objdir "*)
+	    func_append new_libs " -L$path/$objdir" ;;
+	  esac
+	  ;;
+	esac
+      done
+      for deplib in $compile_deplibs; do
+	case $deplib in
+	-L*)
+	  case " $new_libs " in
+	  *" $deplib "*) ;;
+	  *) func_append new_libs " $deplib" ;;
+	  esac
+	  ;;
+	*) func_append new_libs " $deplib" ;;
+	esac
+      done
+      compile_deplibs="$new_libs"
+
+
+      func_append compile_command " $compile_deplibs"
+      func_append finalize_command " $finalize_deplibs"
+
+      if test -n "$rpath$xrpath"; then
+	# If the user specified any rpath flags, then add them.
+	for libdir in $rpath $xrpath; do
+	  # This is the magic to use -rpath.
+	  case "$finalize_rpath " in
+	  *" $libdir "*) ;;
+	  *) func_append finalize_rpath " $libdir" ;;
+	  esac
+	done
+      fi
+
+      # Now hardcode the library paths
+      rpath=
+      hardcode_libdirs=
+      for libdir in $compile_rpath $finalize_rpath; do
+	if test -n "$hardcode_libdir_flag_spec"; then
+	  if test -n "$hardcode_libdir_separator"; then
+	    if test -z "$hardcode_libdirs"; then
+	      hardcode_libdirs="$libdir"
+	    else
+	      # Just accumulate the unique libdirs.
+	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		;;
+	      *)
+		func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
+		;;
+	      esac
+	    fi
+	  else
+	    eval flag=\"$hardcode_libdir_flag_spec\"
+	    func_append rpath " $flag"
+	  fi
+	elif test -n "$runpath_var"; then
+	  case "$perm_rpath " in
+	  *" $libdir "*) ;;
+	  *) func_append perm_rpath " $libdir" ;;
+	  esac
+	fi
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
+	  testbindir=`${ECHO} "$libdir" | ${SED} -e 's*/lib$*/bin*'`
+	  case :$dllsearchpath: in
+	  *":$libdir:"*) ;;
+	  ::) dllsearchpath=$libdir;;
+	  *) func_append dllsearchpath ":$libdir";;
+	  esac
+	  case :$dllsearchpath: in
+	  *":$testbindir:"*) ;;
+	  ::) dllsearchpath=$testbindir;;
+	  *) func_append dllsearchpath ":$testbindir";;
+	  esac
+	  ;;
+	esac
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+	 test -n "$hardcode_libdirs"; then
+	libdir="$hardcode_libdirs"
+	eval rpath=\" $hardcode_libdir_flag_spec\"
+      fi
+      compile_rpath="$rpath"
+
+      rpath=
+      hardcode_libdirs=
+      for libdir in $finalize_rpath; do
+	if test -n "$hardcode_libdir_flag_spec"; then
+	  if test -n "$hardcode_libdir_separator"; then
+	    if test -z "$hardcode_libdirs"; then
+	      hardcode_libdirs="$libdir"
+	    else
+	      # Just accumulate the unique libdirs.
+	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		;;
+	      *)
+		func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
+		;;
+	      esac
+	    fi
+	  else
+	    eval flag=\"$hardcode_libdir_flag_spec\"
+	    func_append rpath " $flag"
+	  fi
+	elif test -n "$runpath_var"; then
+	  case "$finalize_perm_rpath " in
+	  *" $libdir "*) ;;
+	  *) func_append finalize_perm_rpath " $libdir" ;;
+	  esac
+	fi
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+	 test -n "$hardcode_libdirs"; then
+	libdir="$hardcode_libdirs"
+	eval rpath=\" $hardcode_libdir_flag_spec\"
+      fi
+      finalize_rpath="$rpath"
+
+      if test -n "$libobjs" && test "$build_old_libs" = yes; then
+	# Transform all the library objects into standard objects.
+	compile_command=`$ECHO "$compile_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
+	finalize_command=`$ECHO "$finalize_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
+      fi
+
+      func_generate_dlsyms "$outputname" "@PROGRAM@" "no"
+
+      # template prelinking step
+      if test -n "$prelink_cmds"; then
+	func_execute_cmds "$prelink_cmds" 'exit $?'
+      fi
+
+      wrappers_required=yes
+      case $host in
+      *cegcc* | *mingw32ce*)
+        # Disable wrappers for cegcc and mingw32ce hosts, we are cross compiling anyway.
+        wrappers_required=no
+        ;;
+      *cygwin* | *mingw* )
+        if test "$build_libtool_libs" != yes; then
+          wrappers_required=no
+        fi
+        ;;
+      *)
+        if test "$need_relink" = no || test "$build_libtool_libs" != yes; then
+          wrappers_required=no
+        fi
+        ;;
+      esac
+      if test "$wrappers_required" = no; then
+	# Replace the output file specification.
+	compile_command=`$ECHO "$compile_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
+	link_command="$compile_command$compile_rpath"
+
+	# We have no uninstalled library dependencies, so finalize right now.
+	exit_status=0
+	func_show_eval "$link_command" 'exit_status=$?'
+
+	if test -n "$postlink_cmds"; then
+	  func_to_tool_file "$output"
+	  postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
+	  func_execute_cmds "$postlink_cmds" 'exit $?'
+	fi
+
+	# Delete the generated files.
+	if test -f "$output_objdir/${outputname}S.${objext}"; then
+	  func_show_eval '$RM "$output_objdir/${outputname}S.${objext}"'
+	fi
+
+	exit $exit_status
+      fi
+
+      if test -n "$compile_shlibpath$finalize_shlibpath"; then
+	compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
+      fi
+      if test -n "$finalize_shlibpath"; then
+	finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+      fi
+
+      compile_var=
+      finalize_var=
+      if test -n "$runpath_var"; then
+	if test -n "$perm_rpath"; then
+	  # We should set the runpath_var.
+	  rpath=
+	  for dir in $perm_rpath; do
+	    func_append rpath "$dir:"
+	  done
+	  compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+	fi
+	if test -n "$finalize_perm_rpath"; then
+	  # We should set the runpath_var.
+	  rpath=
+	  for dir in $finalize_perm_rpath; do
+	    func_append rpath "$dir:"
+	  done
+	  finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+	fi
+      fi
+
+      if test "$no_install" = yes; then
+	# We don't need to create a wrapper script.
+	link_command="$compile_var$compile_command$compile_rpath"
+	# Replace the output file specification.
+	link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
+	# Delete the old output file.
+	$opt_dry_run || $RM $output
+	# Link the executable and exit
+	func_show_eval "$link_command" 'exit $?'
+
+	if test -n "$postlink_cmds"; then
+	  func_to_tool_file "$output"
+	  postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
+	  func_execute_cmds "$postlink_cmds" 'exit $?'
+	fi
+
+	exit $EXIT_SUCCESS
+      fi
+
+      if test "$hardcode_action" = relink; then
+	# Fast installation is not supported
+	link_command="$compile_var$compile_command$compile_rpath"
+	relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+	func_warning "this platform does not like uninstalled shared libraries"
+	func_warning "\`$output' will be relinked during installation"
+      else
+	if test "$fast_install" != no; then
+	  link_command="$finalize_var$compile_command$finalize_rpath"
+	  if test "$fast_install" = yes; then
+	    relink_command=`$ECHO "$compile_var$compile_command$compile_rpath" | $SED 's%@OUTPUT@%\$progdir/\$file%g'`
+	  else
+	    # fast_install is set to needless
+	    relink_command=
+	  fi
+	else
+	  link_command="$compile_var$compile_command$compile_rpath"
+	  relink_command="$finalize_var$finalize_command$finalize_rpath"
+	fi
+      fi
+
+      # Replace the output file specification.
+      link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+      # Delete the old output files.
+      $opt_dry_run || $RM $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+      func_show_eval "$link_command" 'exit $?'
+
+      if test -n "$postlink_cmds"; then
+	func_to_tool_file "$output_objdir/$outputname"
+	postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
+	func_execute_cmds "$postlink_cmds" 'exit $?'
+      fi
+
+      # Now create the wrapper script.
+      func_verbose "creating $output"
+
+      # Quote the relink command for shipping.
+      if test -n "$relink_command"; then
+	# Preserve any variables that may affect compiler behavior
+	for var in $variables_saved_for_relink; do
+	  if eval test -z \"\${$var+set}\"; then
+	    relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command"
+	  elif eval var_value=\$$var; test -z "$var_value"; then
+	    relink_command="$var=; export $var; $relink_command"
+	  else
+	    func_quote_for_eval "$var_value"
+	    relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command"
+	  fi
+	done
+	relink_command="(cd `pwd`; $relink_command)"
+	relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"`
+      fi
+
+      # Only actually do things if not in dry run mode.
+      $opt_dry_run || {
+	# win32 will think the script is a binary if it has
+	# a .exe suffix, so we strip it off here.
+	case $output in
+	  *.exe) func_stripname '' '.exe' "$output"
+	         output=$func_stripname_result ;;
+	esac
+	# test for cygwin because mv fails w/o .exe extensions
+	case $host in
+	  *cygwin*)
+	    exeext=.exe
+	    func_stripname '' '.exe' "$outputname"
+	    outputname=$func_stripname_result ;;
+	  *) exeext= ;;
+	esac
+	case $host in
+	  *cygwin* | *mingw* )
+	    func_dirname_and_basename "$output" "" "."
+	    output_name=$func_basename_result
+	    output_path=$func_dirname_result
+	    cwrappersource="$output_path/$objdir/lt-$output_name.c"
+	    cwrapper="$output_path/$output_name.exe"
+	    $RM $cwrappersource $cwrapper
+	    trap "$RM $cwrappersource $cwrapper; exit $EXIT_FAILURE" 1 2 15
+
+	    func_emit_cwrapperexe_src > $cwrappersource
+
+	    # The wrapper executable is built using the $host compiler,
+	    # because it contains $host paths and files. If cross-
+	    # compiling, it, like the target executable, must be
+	    # executed on the $host or under an emulation environment.
+	    $opt_dry_run || {
+	      $LTCC $LTCFLAGS -o $cwrapper $cwrappersource
+	      $STRIP $cwrapper
+	    }
+
+	    # Now, create the wrapper script for func_source use:
+	    func_ltwrapper_scriptname $cwrapper
+	    $RM $func_ltwrapper_scriptname_result
+	    trap "$RM $func_ltwrapper_scriptname_result; exit $EXIT_FAILURE" 1 2 15
+	    $opt_dry_run || {
+	      # note: this script will not be executed, so do not chmod.
+	      if test "x$build" = "x$host" ; then
+		$cwrapper --lt-dump-script > $func_ltwrapper_scriptname_result
+	      else
+		func_emit_wrapper no > $func_ltwrapper_scriptname_result
+	      fi
+	    }
+	  ;;
+	  * )
+	    $RM $output
+	    trap "$RM $output; exit $EXIT_FAILURE" 1 2 15
+
+	    func_emit_wrapper no > $output
+	    chmod +x $output
+	  ;;
+	esac
+      }
+      exit $EXIT_SUCCESS
+      ;;
+    esac
+
+    # See if we need to build an old-fashioned archive.
+    for oldlib in $oldlibs; do
+
+      if test "$build_libtool_libs" = convenience; then
+	oldobjs="$libobjs_save $symfileobj"
+	addlibs="$convenience"
+	build_libtool_libs=no
+      else
+	if test "$build_libtool_libs" = module; then
+	  oldobjs="$libobjs_save"
+	  build_libtool_libs=no
+	else
+	  oldobjs="$old_deplibs $non_pic_objects"
+	  if test "$preload" = yes && test -f "$symfileobj"; then
+	    func_append oldobjs " $symfileobj"
+	  fi
+	fi
+	addlibs="$old_convenience"
+      fi
+
+      if test -n "$addlibs"; then
+	gentop="$output_objdir/${outputname}x"
+	func_append generated " $gentop"
+
+	func_extract_archives $gentop $addlibs
+	func_append oldobjs " $func_extract_archives_result"
+      fi
+
+      # Do each command in the archive commands.
+      if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+	cmds=$old_archive_from_new_cmds
+      else
+
+	# Add any objects from preloaded convenience libraries
+	if test -n "$dlprefiles"; then
+	  gentop="$output_objdir/${outputname}x"
+	  func_append generated " $gentop"
+
+	  func_extract_archives $gentop $dlprefiles
+	  func_append oldobjs " $func_extract_archives_result"
+	fi
+
+	# POSIX demands no paths to be encoded in archives.  We have
+	# to avoid creating archives with duplicate basenames if we
+	# might have to extract them afterwards, e.g., when creating a
+	# static archive out of a convenience library, or when linking
+	# the entirety of a libtool archive into another (currently
+	# not supported by libtool).
+	if (for obj in $oldobjs
+	    do
+	      func_basename "$obj"
+	      $ECHO "$func_basename_result"
+	    done | sort | sort -uc >/dev/null 2>&1); then
+	  :
+	else
+	  echo "copying selected object files to avoid basename conflicts..."
+	  gentop="$output_objdir/${outputname}x"
+	  func_append generated " $gentop"
+	  func_mkdir_p "$gentop"
+	  save_oldobjs=$oldobjs
+	  oldobjs=
+	  counter=1
+	  for obj in $save_oldobjs
+	  do
+	    func_basename "$obj"
+	    objbase="$func_basename_result"
+	    case " $oldobjs " in
+	    " ") oldobjs=$obj ;;
+	    *[\ /]"$objbase "*)
+	      while :; do
+		# Make sure we don't pick an alternate name that also
+		# overlaps.
+		newobj=lt$counter-$objbase
+		func_arith $counter + 1
+		counter=$func_arith_result
+		case " $oldobjs " in
+		*[\ /]"$newobj "*) ;;
+		*) if test ! -f "$gentop/$newobj"; then break; fi ;;
+		esac
+	      done
+	      func_show_eval "ln $obj $gentop/$newobj || cp $obj $gentop/$newobj"
+	      func_append oldobjs " $gentop/$newobj"
+	      ;;
+	    *) func_append oldobjs " $obj" ;;
+	    esac
+	  done
+	fi
+	func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
+	tool_oldlib=$func_to_tool_file_result
+	eval cmds=\"$old_archive_cmds\"
+
+	func_len " $cmds"
+	len=$func_len_result
+	if test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then
+	  cmds=$old_archive_cmds
+	elif test -n "$archiver_list_spec"; then
+	  func_verbose "using command file archive linking..."
+	  for obj in $oldobjs
+	  do
+	    func_to_tool_file "$obj"
+	    $ECHO "$func_to_tool_file_result"
+	  done > $output_objdir/$libname.libcmd
+	  func_to_tool_file "$output_objdir/$libname.libcmd"
+	  oldobjs=" $archiver_list_spec$func_to_tool_file_result"
+	  cmds=$old_archive_cmds
+	else
+	  # the command line is too long to link in one step, link in parts
+	  func_verbose "using piecewise archive linking..."
+	  save_RANLIB=$RANLIB
+	  RANLIB=:
+	  objlist=
+	  concat_cmds=
+	  save_oldobjs=$oldobjs
+	  oldobjs=
+	  # Is there a better way of finding the last object in the list?
+	  for obj in $save_oldobjs
+	  do
+	    last_oldobj=$obj
+	  done
+	  eval test_cmds=\"$old_archive_cmds\"
+	  func_len " $test_cmds"
+	  len0=$func_len_result
+	  len=$len0
+	  for obj in $save_oldobjs
+	  do
+	    func_len " $obj"
+	    func_arith $len + $func_len_result
+	    len=$func_arith_result
+	    func_append objlist " $obj"
+	    if test "$len" -lt "$max_cmd_len"; then
+	      :
+	    else
+	      # the above command should be used before it gets too long
+	      oldobjs=$objlist
+	      if test "$obj" = "$last_oldobj" ; then
+		RANLIB=$save_RANLIB
+	      fi
+	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+	      eval concat_cmds=\"\${concat_cmds}$old_archive_cmds\"
+	      objlist=
+	      len=$len0
+	    fi
+	  done
+	  RANLIB=$save_RANLIB
+	  oldobjs=$objlist
+	  if test "X$oldobjs" = "X" ; then
+	    eval cmds=\"\$concat_cmds\"
+	  else
+	    eval cmds=\"\$concat_cmds~\$old_archive_cmds\"
+	  fi
+	fi
+      fi
+      func_execute_cmds "$cmds" 'exit $?'
+    done
+
+    test -n "$generated" && \
+      func_show_eval "${RM}r$generated"
+
+    # Now create the libtool archive.
+    case $output in
+    *.la)
+      old_library=
+      test "$build_old_libs" = yes && old_library="$libname.$libext"
+      func_verbose "creating $output"
+
+      # Preserve any variables that may affect compiler behavior
+      for var in $variables_saved_for_relink; do
+	if eval test -z \"\${$var+set}\"; then
+	  relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command"
+	elif eval var_value=\$$var; test -z "$var_value"; then
+	  relink_command="$var=; export $var; $relink_command"
+	else
+	  func_quote_for_eval "$var_value"
+	  relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command"
+	fi
+      done
+      # Quote the link command for shipping.
+      relink_command="(cd `pwd`; $SHELL $progpath $preserve_args --mode=relink $libtool_args @inst_prefix_dir@)"
+      relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"`
+      if test "$hardcode_automatic" = yes ; then
+	relink_command=
+      fi
+
+      # Only create the output if not a dry run.
+      $opt_dry_run || {
+	for installed in no yes; do
+	  if test "$installed" = yes; then
+	    if test -z "$install_libdir"; then
+	      break
+	    fi
+	    output="$output_objdir/$outputname"i
+	    # Replace all uninstalled libtool libraries with the installed ones
+	    newdependency_libs=
+	    for deplib in $dependency_libs; do
+	      case $deplib in
+	      *.la)
+		func_basename "$deplib"
+		name="$func_basename_result"
+		func_resolve_sysroot "$deplib"
+		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result`
+		test -z "$libdir" && \
+		  func_fatal_error "\`$deplib' is not a valid libtool archive"
+		func_append newdependency_libs " ${lt_sysroot:+=}$libdir/$name"
+		;;
+	      -L*)
+		func_stripname -L '' "$deplib"
+		func_replace_sysroot "$func_stripname_result"
+		func_append newdependency_libs " -L$func_replace_sysroot_result"
+		;;
+	      -R*)
+		func_stripname -R '' "$deplib"
+		func_replace_sysroot "$func_stripname_result"
+		func_append newdependency_libs " -R$func_replace_sysroot_result"
+		;;
+	      *) func_append newdependency_libs " $deplib" ;;
+	      esac
+	    done
+	    dependency_libs="$newdependency_libs"
+	    newdlfiles=
+
+	    for lib in $dlfiles; do
+	      case $lib in
+	      *.la)
+	        func_basename "$lib"
+		name="$func_basename_result"
+		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+		test -z "$libdir" && \
+		  func_fatal_error "\`$lib' is not a valid libtool archive"
+		func_append newdlfiles " ${lt_sysroot:+=}$libdir/$name"
+		;;
+	      *) func_append newdlfiles " $lib" ;;
+	      esac
+	    done
+	    dlfiles="$newdlfiles"
+	    newdlprefiles=
+	    for lib in $dlprefiles; do
+	      case $lib in
+	      *.la)
+		# Only pass preopened files to the pseudo-archive (for
+		# eventual linking with the app. that links it) if we
+		# didn't already link the preopened objects directly into
+		# the library:
+		func_basename "$lib"
+		name="$func_basename_result"
+		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+		test -z "$libdir" && \
+		  func_fatal_error "\`$lib' is not a valid libtool archive"
+		func_append newdlprefiles " ${lt_sysroot:+=}$libdir/$name"
+		;;
+	      esac
+	    done
+	    dlprefiles="$newdlprefiles"
+	  else
+	    newdlfiles=
+	    for lib in $dlfiles; do
+	      case $lib in
+		[\\/]* | [A-Za-z]:[\\/]*) abs="$lib" ;;
+		*) abs=`pwd`"/$lib" ;;
+	      esac
+	      func_append newdlfiles " $abs"
+	    done
+	    dlfiles="$newdlfiles"
+	    newdlprefiles=
+	    for lib in $dlprefiles; do
+	      case $lib in
+		[\\/]* | [A-Za-z]:[\\/]*) abs="$lib" ;;
+		*) abs=`pwd`"/$lib" ;;
+	      esac
+	      func_append newdlprefiles " $abs"
+	    done
+	    dlprefiles="$newdlprefiles"
+	  fi
+	  $RM $output
+	  # place dlname in correct position for cygwin
+	  # In fact, it would be nice if we could use this code for all target
+	  # systems that can't hard-code library paths into their executables
+	  # and that have no shared library path variable independent of PATH,
+	  # but it turns out we can't easily determine that from inspecting
+	  # libtool variables, so we have to hard-code the OSs to which it
+	  # applies here; at the moment, that means platforms that use the PE
+	  # object format with DLL files.  See the long comment at the top of
+	  # tests/bindir.at for full details.
+	  tdlname=$dlname
+	  case $host,$output,$installed,$module,$dlname in
+	    *cygwin*,*lai,yes,no,*.dll | *mingw*,*lai,yes,no,*.dll | *cegcc*,*lai,yes,no,*.dll)
+	      # If a -bindir argument was supplied, place the dll there.
+	      if test "x$bindir" != x ;
+	      then
+		func_relative_path "$install_libdir" "$bindir"
+		tdlname=$func_relative_path_result$dlname
+	      else
+		# Otherwise fall back on heuristic.
+		tdlname=../bin/$dlname
+	      fi
+	      ;;
+	  esac
+	  $ECHO > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='$tdlname'
+
+# Names of this library.
+library_names='$library_names'
+
+# The name of the static archive.
+old_library='$old_library'
+
+# Linker flags that can not go in dependency_libs.
+inherited_linker_flags='$new_inherited_linker_flags'
+
+# Libraries that this one depends upon.
+dependency_libs='$dependency_libs'
+
+# Names of additional weak libraries provided by this library
+weak_library_names='$weak_libs'
+
+# Version information for $libname.
+current=$current
+age=$age
+revision=$revision
+
+# Is this an already installed library?
+installed=$installed
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=$module
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
+# Directory that this library needs to be installed in:
+libdir='$install_libdir'"
+	  if test "$installed" = no && test "$need_relink" = yes; then
+	    $ECHO >> $output "\
+relink_command=\"$relink_command\""
+	  fi
+	done
+      }
+
+      # Do a symbolic link so that the libtool archive can be found in
+      # LD_LIBRARY_PATH before the program is installed.
+      func_show_eval '( cd "$output_objdir" && $RM "$outputname" && $LN_S "../$outputname" "$outputname" )' 'exit $?'
+      ;;
+    esac
+    exit $EXIT_SUCCESS
+}
+
+{ test "$opt_mode" = link || test "$opt_mode" = relink; } &&
+    func_mode_link ${1+"$@"}
+
+
+# func_mode_uninstall arg...
+func_mode_uninstall ()
+{
+    $opt_debug
+    RM="$nonopt"
+    files=
+    rmforce=
+    exit_status=0
+
+    # This variable tells wrapper scripts just to set variables rather
+    # than running their programs.
+    libtool_install_magic="$magic"
+
+    for arg
+    do
+      case $arg in
+      -f) func_append RM " $arg"; rmforce=yes ;;
+      -*) func_append RM " $arg" ;;
+      *) func_append files " $arg" ;;
+      esac
+    done
+
+    test -z "$RM" && \
+      func_fatal_help "you must specify an RM program"
+
+    rmdirs=
+
+    for file in $files; do
+      func_dirname "$file" "" "."
+      dir="$func_dirname_result"
+      if test "X$dir" = X.; then
+	odir="$objdir"
+      else
+	odir="$dir/$objdir"
+      fi
+      func_basename "$file"
+      name="$func_basename_result"
+      test "$opt_mode" = uninstall && odir="$dir"
+
+      # Remember odir for removal later, being careful to avoid duplicates
+      if test "$opt_mode" = clean; then
+	case " $rmdirs " in
+	  *" $odir "*) ;;
+	  *) func_append rmdirs " $odir" ;;
+	esac
+      fi
+
+      # Don't error if the file doesn't exist and rm -f was used.
+      if { test -L "$file"; } >/dev/null 2>&1 ||
+	 { test -h "$file"; } >/dev/null 2>&1 ||
+	 test -f "$file"; then
+	:
+      elif test -d "$file"; then
+	exit_status=1
+	continue
+      elif test "$rmforce" = yes; then
+	continue
+      fi
+
+      rmfiles="$file"
+
+      case $name in
+      *.la)
+	# Possibly a libtool archive, so verify it.
+	if func_lalib_p "$file"; then
+	  func_source $dir/$name
+
+	  # Delete the libtool libraries and symlinks.
+	  for n in $library_names; do
+	    func_append rmfiles " $odir/$n"
+	  done
+	  test -n "$old_library" && func_append rmfiles " $odir/$old_library"
+
+	  case "$opt_mode" in
+	  clean)
+	    case " $library_names " in
+	    *" $dlname "*) ;;
+	    *) test -n "$dlname" && func_append rmfiles " $odir/$dlname" ;;
+	    esac
+	    test -n "$libdir" && func_append rmfiles " $odir/$name $odir/${name}i"
+	    ;;
+	  uninstall)
+	    if test -n "$library_names"; then
+	      # Do each command in the postuninstall commands.
+	      func_execute_cmds "$postuninstall_cmds" 'test "$rmforce" = yes || exit_status=1'
+	    fi
+
+	    if test -n "$old_library"; then
+	      # Do each command in the old_postuninstall commands.
+	      func_execute_cmds "$old_postuninstall_cmds" 'test "$rmforce" = yes || exit_status=1'
+	    fi
+	    # FIXME: should reinstall the best remaining shared library.
+	    ;;
+	  esac
+	fi
+	;;
+
+      *.lo)
+	# Possibly a libtool object, so verify it.
+	if func_lalib_p "$file"; then
+
+	  # Read the .lo file
+	  func_source $dir/$name
+
+	  # Add PIC object to the list of files to remove.
+	  if test -n "$pic_object" &&
+	     test "$pic_object" != none; then
+	    func_append rmfiles " $dir/$pic_object"
+	  fi
+
+	  # Add non-PIC object to the list of files to remove.
+	  if test -n "$non_pic_object" &&
+	     test "$non_pic_object" != none; then
+	    func_append rmfiles " $dir/$non_pic_object"
+	  fi
+	fi
+	;;
+
+      *)
+	if test "$opt_mode" = clean ; then
+	  noexename=$name
+	  case $file in
+	  *.exe)
+	    func_stripname '' '.exe' "$file"
+	    file=$func_stripname_result
+	    func_stripname '' '.exe' "$name"
+	    noexename=$func_stripname_result
+	    # $file with .exe has already been added to rmfiles,
+	    # add $file without .exe
+	    func_append rmfiles " $file"
+	    ;;
+	  esac
+	  # Do a test to see if this is a libtool program.
+	  if func_ltwrapper_p "$file"; then
+	    if func_ltwrapper_executable_p "$file"; then
+	      func_ltwrapper_scriptname "$file"
+	      relink_command=
+	      func_source $func_ltwrapper_scriptname_result
+	      func_append rmfiles " $func_ltwrapper_scriptname_result"
+	    else
+	      relink_command=
+	      func_source $dir/$noexename
+	    fi
+
+	    # note $name still contains .exe if it was in $file originally
+	    # as does the version of $file that was added into $rmfiles
+	    func_append rmfiles " $odir/$name $odir/${name}S.${objext}"
+	    if test "$fast_install" = yes && test -n "$relink_command"; then
+	      func_append rmfiles " $odir/lt-$name"
+	    fi
+	    if test "X$noexename" != "X$name" ; then
+	      func_append rmfiles " $odir/lt-${noexename}.c"
+	    fi
+	  fi
+	fi
+	;;
+      esac
+      func_show_eval "$RM $rmfiles" 'exit_status=1'
+    done
+
+    # Try to remove the ${objdir}s in the directories where we deleted files
+    for dir in $rmdirs; do
+      if test -d "$dir"; then
+	func_show_eval "rmdir $dir >/dev/null 2>&1"
+      fi
+    done
+
+    exit $exit_status
+}
+
+{ test "$opt_mode" = uninstall || test "$opt_mode" = clean; } &&
+    func_mode_uninstall ${1+"$@"}
+
+test -z "$opt_mode" && {
+  help="$generic_help"
+  func_fatal_help "you must specify a MODE"
+}
+
+test -z "$exec_cmd" && \
+  func_fatal_help "invalid operation mode \`$opt_mode'"
+
+if test -n "$exec_cmd"; then
+  eval exec "$exec_cmd"
+  exit $EXIT_FAILURE
+fi
+
+exit $exit_status
+
+
+# The TAGs below are defined such that we never get into a situation
+# in which we disable both kinds of libraries.  Given conflicting
+# choices, we go for a static library, that is the most portable,
+# since we can't tell whether shared libraries were disabled because
+# the user asked for that or because the platform doesn't support
+# them.  This is particularly important on AIX, because we don't
+# support having both static and shared libraries enabled at the same
+# time on that platform, so we default to a shared-only configuration.
+# If a disable-shared tag is given, we'll fallback to a static-only
+# configuration.  But we'll never go from static-only to shared-only.
+
+# ### BEGIN LIBTOOL TAG CONFIG: disable-shared
+build_libtool_libs=no
+build_old_libs=yes
+# ### END LIBTOOL TAG CONFIG: disable-shared
+
+# ### BEGIN LIBTOOL TAG CONFIG: disable-static
+build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac`
+# ### END LIBTOOL TAG CONFIG: disable-static
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
+# vi:sw=2
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,8 @@
+EXTRA_DIST = acx_mpi.m4 acx_pthread.m4 ax_cc_maxopt.m4	\
+ax_check_compiler_flags.m4 ax_compiler_vendor.m4	\
+ax_gcc_aligns_stack.m4 ax_gcc_version.m4 ax_openmp.m4
+
+# libtool sticks a bunch of extra .m4 files in this directory,
+# but they don't seem to be needed for the distributed tarball
+# (they aren't needed for configure && make, and boostrapping
+# will regenerate them anyway).
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,425 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = m4
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+EXTRA_DIST = acx_mpi.m4 acx_pthread.m4 ax_cc_maxopt.m4	\
+ax_check_compiler_flags.m4 ax_compiler_vendor.m4	\
+ax_gcc_aligns_stack.m4 ax_gcc_version.m4 ax_openmp.m4
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu m4/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu m4/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	distclean distclean-generic distclean-libtool distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am uninstall uninstall-am
+
+
+# libtool sticks a bunch of extra .m4 files in this directory,
+# but they don't seem to be needed for the distributed tarball
+# (they aren't needed for configure && make, and boostrapping
+# will regenerate them anyway).
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/acx_mpi.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/acx_mpi.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,106 @@
+dnl @synopsis ACX_MPI([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl @summary figure out how to compile/link code with MPI
+dnl @category InstalledPackages
+dnl
+dnl This macro tries to find out how to compile programs that
+dnl use MPI (Message Passing Interface), a standard API for
+dnl parallel process communication (see http://www-unix.mcs.anl.gov/mpi/)
+dnl
+dnl On success, it sets the MPICC, MPICXX, or MPIF77 output variable to
+dnl the name of the MPI compiler, depending upon the current language.
+dnl (This may just be $CC/$CXX/$F77, but is more often something like
+dnl mpicc/mpiCC/mpif77.)  It also sets MPILIBS to any libraries that are
+dnl needed for linking MPI (e.g. -lmpi, if a special MPICC/MPICXX/MPIF77
+dnl was not found).
+dnl
+dnl If you want to compile everything with MPI, you should set:
+dnl
+dnl     CC="$MPICC" #OR# CXX="$MPICXX" #OR# F77="$MPIF77"
+dnl     LIBS="$MPILIBS $LIBS"
+dnl
+dnl NOTE: The above assumes that you will use $CC (or whatever)
+dnl       for linking as well as for compiling.  (This is the
+dnl       default for automake and most Makefiles.)
+dnl
+dnl The user can force a particular library/compiler by setting the
+dnl MPICC/MPICXX/MPIF77 and/or MPILIBS environment variables.
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if an MPI
+dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run it if it is not found.  If ACTION-IF-FOUND is not specified,
+dnl the default action will define HAVE_MPI.
+dnl
+dnl @version 2005-09-02
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+
+AC_DEFUN([ACX_MPI], [
+AC_PREREQ(2.50) dnl for AC_LANG_CASE
+
+AC_LANG_CASE([C], [
+	AC_REQUIRE([AC_PROG_CC])
+	AC_ARG_VAR(MPICC,[MPI C compiler command])
+	AC_CHECK_PROGS(MPICC, mpicc hcc mpcc mpcc_r mpxlc cmpicc, $CC)
+	acx_mpi_save_CC="$CC"
+	CC="$MPICC"
+	AC_SUBST(MPICC)
+],
+[C++], [
+	AC_REQUIRE([AC_PROG_CXX])
+	AC_ARG_VAR(MPICXX,[MPI C++ compiler command])
+	AC_CHECK_PROGS(MPICXX, mpic++ mpiCC mpicxx mpCC hcp mpxlC mpxlC_r cmpic++, $CXX)
+	acx_mpi_save_CXX="$CXX"
+	CXX="$MPICXX"
+	AC_SUBST(MPICXX)
+],
+[Fortran 77], [
+	AC_REQUIRE([AC_PROG_F77])
+	AC_ARG_VAR(MPIF77,[MPI Fortran compiler command])
+	AC_CHECK_PROGS(MPIF77, mpif77 hf77 mpxlf mpf77 mpif90 mpf90 mpxlf90 mpxlf95 mpxlf_r cmpifc cmpif90c, $F77)
+	acx_mpi_save_F77="$F77"
+	F77="$MPIF77"
+	AC_SUBST(MPIF77)
+])
+
+if test x = x"$MPILIBS"; then
+	AC_LANG_CASE([C], [AC_CHECK_FUNC(MPI_Init, [MPILIBS=" "])],
+		[C++], [AC_CHECK_FUNC(MPI_Init, [MPILIBS=" "])],
+		[Fortran 77], [AC_MSG_CHECKING([for MPI_Init])
+			AC_TRY_LINK([],[      call MPI_Init], [MPILIBS=" "
+				AC_MSG_RESULT(yes)], [AC_MSG_RESULT(no)])])
+fi
+if test x = x"$MPILIBS"; then
+	AC_CHECK_LIB(mpi, MPI_Init, [MPILIBS="-lmpi"])
+fi
+if test x = x"$MPILIBS"; then
+	AC_CHECK_LIB(mpich, MPI_Init, [MPILIBS="-lmpich"])
+fi
+
+dnl We have to use AC_TRY_COMPILE and not AC_CHECK_HEADER because the
+dnl latter uses $CPP, not $CC (which may be mpicc).
+AC_LANG_CASE([C], [if test x != x"$MPILIBS"; then
+	AC_MSG_CHECKING([for mpi.h])
+	AC_TRY_COMPILE([#include <mpi.h>],[],[AC_MSG_RESULT(yes)], [MPILIBS=""
+		AC_MSG_RESULT(no)])
+fi],
+[C++], [if test x != x"$MPILIBS"; then
+	AC_MSG_CHECKING([for mpi.h])
+	AC_TRY_COMPILE([#include <mpi.h>],[],[AC_MSG_RESULT(yes)], [MPILIBS=""
+		AC_MSG_RESULT(no)])
+fi])
+
+AC_LANG_CASE([C], [CC="$acx_mpi_save_CC"],
+	[C++], [CXX="$acx_mpi_save_CXX"],
+	[Fortran 77], [F77="$acx_mpi_save_F77"])
+
+AC_SUBST(MPILIBS)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x = x"$MPILIBS"; then
+        $2
+        :
+else
+        ifelse([$1],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$1])
+        :
+fi
+])dnl ACX_MPI
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/acx_pthread.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/acx_pthread.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,245 @@
+dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl @summary figure out how to build C programs using POSIX threads
+dnl @category InstalledPackages
+dnl
+dnl This macro figures out how to build C programs using POSIX
+dnl threads.  It sets the PTHREAD_LIBS output variable to the threads
+dnl library and linker flags, and the PTHREAD_CFLAGS output variable
+dnl to any special C compiler flags that are needed.  (The user can also
+dnl force certain compiler flags/libs to be tested by setting these
+dnl environment variables.)
+dnl
+dnl Also sets PTHREAD_CC to any special C compiler that is needed for
+dnl multi-threaded programs (defaults to the value of CC otherwise).
+dnl (This is necessary on AIX to use the special cc_r compiler alias.)
+dnl
+dnl NOTE: You are assumed to not only compile your program with these
+dnl flags, but also link it with them as well.  e.g. you should link
+dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+dnl
+dnl If you are only building threads programs, you may wish to
+dnl use these variables in your default LIBS, CFLAGS, and CC:
+dnl
+dnl        LIBS="$PTHREAD_LIBS $LIBS"
+dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+dnl        CC="$PTHREAD_CC"
+dnl
+dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
+dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE
+dnl to that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
+dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run it if it is not found.  If ACTION-IF-FOUND is not specified,
+dnl the default action will define HAVE_PTHREAD.
+dnl
+dnl Please let the authors know if this macro fails on any platform,
+dnl or if you have any other suggestions or comments.  This macro was
+dnl based on work by SGJ on autoconf scripts for FFTW (www.fftw.org)
+dnl (with help from M. Frigo), as well as ac_pthread and hb_pthread
+dnl macros posted by Alejandro Forero Cuervo to the autoconf macro
+dnl repository.  We are also grateful for the helpful feedback of
+dnl numerous users.
+dnl
+dnl @version 2006-09-15
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+
+AC_DEFUN([ACX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_LANG_SAVE
+AC_LANG_C
+acx_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on True64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mt -mthreads pthread --thread-safe pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+#           (where it should come before -mthreads to avoid spurious warnings)
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+		pthread-config)
+		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_TRY_LINK([#include <pthread.h>],
+                    [pthread_t th; pthread_join(th, (void**) 0);
+                     pthread_attr_init((pthread_attr_t*) 0);
+                     pthread_cleanup_push((void(*)(void *)) 0, (void*) 0);
+                     pthread_create((pthread_t*) 0, (pthread_attr_t*) 0,
+                                    (void*(*)(void *)) 0, (void*) 0);
+                     pthread_cleanup_pop(0); ],
+                    [acx_pthread_ok=yes])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+	AC_MSG_CHECKING([for joinable pthread attribute])
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
+                        [attr_name=$attr; break])
+	done
+        AC_MSG_RESULT($attr_name)
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
+                               [Define to necessary symbol if this constant
+                                uses a non-standard name on your system.])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case "${host_cpu}-${host_os}" in
+            *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
+            *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
+        esac
+        AC_MSG_RESULT(${flag})
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with xlc_r or cc_r
+	if test x"$GCC" != xyes; then
+          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
+        else
+          PTHREAD_CC=$CC
+	fi
+else
+        PTHREAD_CC="$CC"
+fi
+
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+AC_SUBST(PTHREAD_CC)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        :
+else
+        acx_pthread_ok=no
+        $2
+fi
+AC_LANG_RESTORE
+])dnl ACX_PTHREAD
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_cc_maxopt.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_cc_maxopt.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,128 @@
+dnl @synopsis AX_CC_MAXOPT
+dnl @summary turn on optimization flags for the C compiler
+dnl @category C
+dnl
+dnl Try to turn on "good" C optimization flags for various compilers
+dnl and architectures, for some definition of "good".  (In our case,
+dnl good for FFTW and hopefully for other scientific codes.  Modify 
+dnl as needed.)
+dnl
+dnl The user can override the flags by setting the CFLAGS environment
+dnl variable.  
+dnl
+dnl Note also that the flags assume that ANSI C aliasing rules are
+dnl followed by the code (e.g. for gcc's -fstrict-aliasing), and that
+dnl floating-point computations can be re-ordered as needed.
+dnl
+dnl Requires macros: AX_CHECK_COMPILER_FLAGS, AX_COMPILER_VENDOR,
+dnl
+dnl @version 2011-06-22
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Matteo Frigo.
+AC_DEFUN([AX_CC_MAXOPT],
+[
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AX_COMPILER_VENDOR])
+AC_REQUIRE([AC_CANONICAL_HOST])
+
+# Try to determine "good" native compiler flags if none specified via CFLAGS
+if test "$ac_test_CFLAGS" != "set"; then
+  CFLAGS=""
+  case $ax_cv_c_compiler_vendor in
+    dec) CFLAGS="-newc -w0 -O5 -ansi_alias -ansi_args -fp_reorder -tune host"
+    	 ;;
+
+    sun) CFLAGS="-native -fast -xO5 -dalign"
+    	 ;;
+
+    hp)  CFLAGS="+Oall +Optrs_ansi +DSnative"
+    	 ;;
+
+    ibm) xlc_opt="-qtune=auto"
+         AX_CHECK_COMPILER_FLAGS($xlc_opt,
+         	CFLAGS="-O3 -qansialias -w $xlc_opt",
+               [CFLAGS="-O3 -qansialias -w"
+                echo "******************************************************"
+                echo "*  You seem to have the IBM  C compiler.  It is      *"
+                echo "*  recommended for best performance that you use:    *"
+                echo "*                                                    *"
+                echo "*    CFLAGS=-O3 -qarch=xxx -qtune=xxx -qansialias -w *"
+                echo "*                      ^^^        ^^^                *"
+                echo "*  where xxx is pwr2, pwr3, 604, or whatever kind of *"
+                echo "*  CPU you have.  (Set the CFLAGS environment var.   *"
+                echo "*  and re-run configure.)  For more info, man cc.    *"
+                echo "******************************************************"])
+         ;;
+
+    intel) CFLAGS="-O3"
+        # Intel seems to have changed the spelling of this flag recently
+        icc_ansi_alias="unknown"
+	for flag in -ansi-alias -ansi_alias; do
+	  AX_CHECK_COMPILER_FLAGS($flag, [icc_ansi_alias=$flag; break])
+	done
+ 	if test "x$icc_ansi_alias" != xunknown; then
+            CFLAGS="$CFLAGS $icc_ansi_alias"
+        fi
+	AX_CHECK_COMPILER_FLAGS(-malign-double, CFLAGS="$CFLAGS -malign-double")
+	# We used to check for architecture flags here, e.g. -xHost etc.,
+	# but these flags are problematic.  On icc-12.0.0, "-mavx -xHost"
+	# overrides -mavx with -xHost, generating SSE2 code instead of AVX
+	# code.  ICC does not seem to support -mtune=host or equivalent
+	# non-ABI changing flag.
+	;;
+    
+    gnu) 
+     # Default optimization flags for gcc on all systems.
+     # Somehow -O3 does not imply -fomit-frame-pointer on ia32
+     CFLAGS="-O3 -fomit-frame-pointer"
+
+     # tune for the host by default
+     AX_CHECK_COMPILER_FLAGS(-mtune=native, CFLAGS="$CFLAGS -mtune=native")
+
+     # -malign-double for x86 systems
+     AX_CHECK_COMPILER_FLAGS(-malign-double, CFLAGS="$CFLAGS -malign-double")
+
+     #  -fstrict-aliasing for gcc-2.95+
+     AX_CHECK_COMPILER_FLAGS(-fstrict-aliasing,
+	CFLAGS="$CFLAGS -fstrict-aliasing")
+
+     # -fno-schedule-insns is pretty much required on all risc
+     # processors.
+     # 
+     # gcc performs one pass of instruction scheduling, then a pass of
+     # register allocation, then another pass of instruction
+     # scheduling.  The first pass reorders instructions in a way that
+     # is pretty much the worst possible for the purposes of register
+     # allocation.  We disable the first pass.
+     AX_CHECK_COMPILER_FLAGS(-fno-schedule-insns, CFLAGS="$CFLAGS -fno-schedule-insns")
+
+     # note that we enable "unsafe" fp optimization with other compilers, too
+     AX_CHECK_COMPILER_FLAGS(-ffast-math, CFLAGS="$CFLAGS -ffast-math")
+
+     ;;
+  esac
+
+  if test -z "$CFLAGS"; then
+	echo ""
+	echo "********************************************************"
+        echo "* WARNING: Don't know the best CFLAGS for this system  *"
+        echo "* Use ./configure CFLAGS=... to specify your own flags *"
+	echo "* (otherwise, a default of CFLAGS=-O3 will be used)    *"
+	echo "********************************************************"
+	echo ""
+        CFLAGS="-O3"
+  fi
+
+  AX_CHECK_COMPILER_FLAGS($CFLAGS, [], [
+	echo ""
+        echo "********************************************************"
+        echo "* WARNING: The guessed CFLAGS don't seem to work with  *"
+        echo "* your compiler.                                       *"
+        echo "* Use ./configure CFLAGS=... to specify your own flags *"
+        echo "********************************************************"
+        echo ""
+        CFLAGS=""
+  ])
+
+fi
+])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_check_compiler_flags.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_check_compiler_flags.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,40 @@
+dnl @synopsis AX_CHECK_COMPILER_FLAGS(FLAGS, [ACTION-SUCCESS], [ACTION-FAILURE])
+dnl @summary check whether FLAGS are accepted by the compiler
+dnl @category Misc
+dnl
+dnl Check whether the given compiler FLAGS work with the current language's
+dnl compiler, or whether they give an error.  (Warnings, however, are
+dnl ignored.)
+dnl
+dnl ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+dnl success/failure.
+dnl
+dnl @version 2005-05-30
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Matteo Frigo.
+AC_DEFUN([AX_CHECK_COMPILER_FLAGS],
+[AC_PREREQ(2.59) dnl for _AC_LANG_PREFIX
+AC_MSG_CHECKING([whether _AC_LANG compiler accepts $1])
+dnl Some hackery here since AC_CACHE_VAL can't handle a non-literal varname:
+AS_LITERAL_IF([$1],
+  [AC_CACHE_VAL(AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1), [
+      ax_save_FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
+      _AC_LANG_PREFIX[]FLAGS="$1"
+      AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], 
+        AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=yes,
+        AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=no)
+      _AC_LANG_PREFIX[]FLAGS=$ax_save_FLAGS])],
+  [ax_save_FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
+   _AC_LANG_PREFIX[]FLAGS="$1"
+   AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], 
+     eval AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=yes,
+     eval AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=no)
+   _AC_LANG_PREFIX[]FLAGS=$ax_save_FLAGS])
+eval ax_check_compiler_flags=$AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)
+AC_MSG_RESULT($ax_check_compiler_flags)
+if test "x$ax_check_compiler_flags" = xyes; then
+	m4_default([$2], :)
+else
+	m4_default([$3], :)
+fi
+])dnl AX_CHECK_COMPILER_FLAGS
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_compiler_vendor.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_compiler_vendor.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+dnl @synopsis AX_COMPILER_VENDOR
+dnl @summary find the vendor (gnu, intel, etc.) of the C/C++ compiler
+dnl @category C
+dnl @category C++
+dnl
+dnl Determine the vendor of the C/C++ compiler, e.g., gnu, intel, ibm,
+dnl sun, hp, borland, comeau, dec, cray, kai, lcc, metrowerks, sgi, 
+dnl microsoft, watcom, etc.  The vendor is returned in the cache variable
+dnl $ax_cv_c_compiler_vendor for C and $ax_cv_cxx_compiler_vendor for C++.
+dnl
+dnl @version 2007-08-01
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> with Matteo Frigo
+
+AC_DEFUN([AX_COMPILER_VENDOR],
+[
+AC_CACHE_CHECK([for _AC_LANG compiler vendor], ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor,
+ [ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor=unknown
+  # note: don't check for gcc first since some other compilers define __GNUC__
+  for ventest in intel:__ICC,__ECC,__INTEL_COMPILER ibm:__xlc__,__xlC__,__IBMC__,__IBMCPP__ pathscale:__PATHCC__,__PATHSCALE__ gnu:__GNUC__ sun:__SUNPRO_C,__SUNPRO_CC hp:__HP_cc,__HP_aCC dec:__DECC,__DECCXX,__DECC_VER,__DECCXX_VER borland:__BORLANDC__,__TURBOC__ comeau:__COMO__ cray:_CRAYC kai:__KCC lcc:__LCC__ metrowerks:__MWERKS__ sgi:__sgi,sgi microsoft:_MSC_VER watcom:__WATCOMC__ portland:__PGI; do 
+    vencpp="defined("`echo $ventest | cut -d: -f2 | sed 's/,/) || defined(/g'`")"
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[
+#if !($vencpp)
+      thisisanerror;
+#endif
+])], [ax_cv_]_AC_LANG_ABBREV[_compiler_vendor=`echo $ventest | cut -d: -f1`; break])
+  done
+ ])
+])
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_gcc_aligns_stack.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_gcc_aligns_stack.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+dnl @synopsis AX_GCC_ALIGNS_STACK([ACTION-IF-YES], [ACTION-IF-NO])
+dnl @summary check whether gcc can align stack to 8-byte boundary
+dnl @category Misc
+dnl
+dnl Check to see if we are using a version of gcc that aligns the stack
+dnl (true in gcc-2.95+, which have the -mpreferred-stack-boundary flag).
+dnl Also, however, checks whether main() is correctly aligned by the
+dnl OS/libc/..., as well as for a bug in the stack alignment of gcc-2.95.x
+dnl (see http://gcc.gnu.org/ml/gcc-bugs/1999-11/msg00259.html).
+dnl
+dnl ACTION-IF-YES/ACTION-IF-NO are shell commands to execute if we are
+dnl using gcc and the stack is/isn't aligned, respectively.
+dnl
+dnl Requires macro: AX_CHECK_COMPILER_FLAGS, AX_GCC_VERSION
+dnl
+dnl @version 2005-05-30
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+AC_DEFUN([AX_GCC_ALIGNS_STACK],
+[
+AC_REQUIRE([AC_PROG_CC])
+ax_gcc_aligns_stack=no
+if test "$GCC" = "yes"; then
+AX_CHECK_COMPILER_FLAGS(-mpreferred-stack-boundary=4, [
+	AC_MSG_CHECKING([whether the stack is at least 8-byte aligned by gcc])
+	save_CFLAGS="$CFLAGS"
+	CFLAGS="-O"
+	AX_CHECK_COMPILER_FLAGS(-malign-double, CFLAGS="$CFLAGS -malign-double")
+	AC_TRY_RUN([#include <stdlib.h>
+#       include <stdio.h>
+	struct yuck { int blechh; };
+	int one(void) { return 1; }
+	struct yuck ick(void) { struct yuck y; y.blechh = 3; return y; }
+#       define CHK_ALIGN(x) if ((((long) &(x)) & 0x7)) { fprintf(stderr, "bad alignment of " #x "\n"); exit(1); }
+	void blah(int foo) { double foobar; CHK_ALIGN(foobar); }
+	int main2(void) {double ok1; struct yuck y; double ok2; CHK_ALIGN(ok1);
+                         CHK_ALIGN(ok2); y = ick(); blah(one()); return 0;}
+	int main(void) { if ((((long) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4); return main2(); }
+	], [ax_gcc_aligns_stack=yes; ax_gcc_stack_align_bug=no], 
+	ax_gcc_stack_align_bug=yes, [AX_GCC_VERSION(3,0,0, ax_gcc_stack_align_bug=no, ax_gcc_stack_align_bug=yes)])
+	CFLAGS="$save_CFLAGS"
+	AC_MSG_RESULT($ax_gcc_aligns_stack)
+])
+fi
+if test "$ax_gcc_aligns_stack" = yes; then
+	m4_default([$1], :)
+else
+	m4_default([$2], :)
+fi
+])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_gcc_version.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_gcc_version.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,38 @@
+dnl @synopsis AX_GCC_VERSION(MAJOR, MINOR, PATCHLEVEL, [ACTION-SUCCESS], [ACTION-FAILURE])
+dnl @summary check wither gcc is at least version MAJOR.MINOR.PATCHLEVEL
+dnl @category InstalledPackages
+dnl
+dnl Check whether we are using gcc and, if so, whether its version
+dnl is at least MAJOR.MINOR.PATCHLEVEL
+dnl
+dnl ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+dnl success/failure.
+dnl
+dnl @version 2005-05-30
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Matteo Frigo.
+AC_DEFUN([AX_GCC_VERSION],
+[
+AC_REQUIRE([AC_PROG_CC])
+AC_CACHE_CHECK(whether we are using gcc $1.$2.$3 or later, ax_cv_gcc_$1_$2_$3,
+[
+ax_cv_gcc_$1_$2_$3=no
+if test "$GCC" = "yes"; then
+dnl The semicolon after "yes" below is to pacify NeXT's syntax-checking cpp.
+AC_EGREP_CPP(yes, [
+#ifdef __GNUC__
+#  if (__GNUC__ > $1) || (__GNUC__ == $1 && __GNUC_MINOR__ > $2) \
+   || (__GNUC__ == $1 && __GNUC_MINOR__ == $2 && __GNUC_PATCHLEVEL__ >= $3)
+     yes;
+#  endif
+#endif
+], [ax_cv_gcc_$1_$2_$3=yes])
+fi
+])
+if test "$ax_cv_gcc_$1_$2_$3" = yes; then
+	m4_default([$4], :)
+else
+	m4_default([$5], :)
+fi
+])
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ax_openmp.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ax_openmp.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,66 @@
+dnl @synopsis AX_OPENMP([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl @summary determine how to compile programs using OpenMP
+dnl @category InstalledPackages
+dnl
+dnl This macro tries to find out how to compile programs that
+dnl use OpenMP, a standard API and set of compiler directives for
+dnl parallel programming (see http://www.openmp.org/).
+dnl
+dnl On success, it sets the OPENMP_CFLAGS/OPENMP_CXXFLAGS/OPENMP_FFLAGS
+dnl output variable to the flag (e.g. -omp) used both to compile *and* link
+dnl OpenMP programs in the current language.
+dnl
+dnl NOTE: You are assumed to not only compile your program with these
+dnl flags, but also link it with them as well.
+dnl
+dnl If you want to compile everything with OpenMP, you should set:
+dnl
+dnl     CFLAGS="$CFLAGS $OPENMP_CFLAGS" 
+dnl     #OR#  CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS" 
+dnl     #OR#  FFLAGS="$FFLAGS $OPENMP_FFLAGS" 
+dnl
+dnl (depending on the selected language).
+dnl
+dnl The user can override the default choice by setting the corresponding
+dnl environment variable (e.g. OPENMP_CFLAGS).
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if an OpenMP
+dnl flag is found, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run it if it is not found.  If ACTION-IF-FOUND is not specified,
+dnl the default action will define HAVE_OPENMP.
+dnl
+dnl @version 2006-11-20
+dnl @license GPLWithACException
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+
+AC_DEFUN([AX_OPENMP], [
+AC_PREREQ(2.59) dnl for _AC_LANG_PREFIX
+
+AC_CACHE_CHECK([for OpenMP flag of _AC_LANG compiler], ax_cv_[]_AC_LANG_ABBREV[]_openmp, [save[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
+ax_cv_[]_AC_LANG_ABBREV[]_openmp=unknown
+# Flags to try:  -fopenmp (gcc), -openmp (icc), -mp (SGI & PGI),
+#                -xopenmp (Sun), -omp (Tru64), -qsmp=omp (AIX), none
+ax_openmp_flags="-fopenmp -openmp -mp -xopenmp -omp -qsmp=omp none"
+if test "x$OPENMP_[]_AC_LANG_PREFIX[]FLAGS" != x; then
+  ax_openmp_flags="$OPENMP_[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flags"
+fi
+for ax_openmp_flag in $ax_openmp_flags; do
+  case $ax_openmp_flag in
+    none) []_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[] ;;
+    *) []_AC_LANG_PREFIX[]FLAGS="$save[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flag" ;;
+  esac
+  AC_TRY_LINK_FUNC(omp_set_num_threads,
+	[ax_cv_[]_AC_LANG_ABBREV[]_openmp=$ax_openmp_flag; break])
+done
+[]_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS
+])
+if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" = "xunknown"; then
+  m4_default([$2],:)
+else
+  if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" != "xnone"; then
+    OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ax_cv_[]_AC_LANG_ABBREV[]_openmp
+  fi
+  m4_default([$1], [AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])])
+fi
+AC_SUBST(OPENMP_[]_AC_LANG_PREFIX[]FLAGS)
+])dnl AX_OPENMP
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/libtool.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/libtool.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,7983 @@
+# libtool.m4 - Configure libtool for the host system. -*-Autoconf-*-
+#
+#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
+#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+#                 Foundation, Inc.
+#   Written by Gordon Matzigkeit, 1996
+#
+# This file is free software; the Free Software Foundation gives
+# unlimited permission to copy and/or distribute it, with or without
+# modifications, as long as this notice is preserved.
+
+m4_define([_LT_COPYING], [dnl
+#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
+#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+#                 Foundation, Inc.
+#   Written by Gordon Matzigkeit, 1996
+#
+#   This file is part of GNU Libtool.
+#
+# GNU Libtool is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# As a special exception to the GNU General Public License,
+# if you distribute this file as part of a program or library that
+# is built using GNU Libtool, you may include this file under the
+# same distribution terms that you use for the rest of that program.
+#
+# GNU Libtool is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Libtool; see the file COPYING.  If not, a copy
+# can be downloaded from http://www.gnu.org/licenses/gpl.html, or
+# obtained by writing to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+])
+
+# serial 57 LT_INIT
+
+
+# LT_PREREQ(VERSION)
+# ------------------
+# Complain and exit if this libtool version is less that VERSION.
+m4_defun([LT_PREREQ],
+[m4_if(m4_version_compare(m4_defn([LT_PACKAGE_VERSION]), [$1]), -1,
+       [m4_default([$3],
+		   [m4_fatal([Libtool version $1 or higher is required],
+		             63)])],
+       [$2])])
+
+
+# _LT_CHECK_BUILDDIR
+# ------------------
+# Complain if the absolute build directory name contains unusual characters
+m4_defun([_LT_CHECK_BUILDDIR],
+[case `pwd` in
+  *\ * | *\	*)
+    AC_MSG_WARN([Libtool does not cope well with whitespace in `pwd`]) ;;
+esac
+])
+
+
+# LT_INIT([OPTIONS])
+# ------------------
+AC_DEFUN([LT_INIT],
+[AC_PREREQ([2.58])dnl We use AC_INCLUDES_DEFAULT
+AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+AC_BEFORE([$0], [LT_LANG])dnl
+AC_BEFORE([$0], [LT_OUTPUT])dnl
+AC_BEFORE([$0], [LTDL_INIT])dnl
+m4_require([_LT_CHECK_BUILDDIR])dnl
+
+dnl Autoconf doesn't catch unexpanded LT_ macros by default:
+m4_pattern_forbid([^_?LT_[A-Z_]+$])dnl
+m4_pattern_allow([^(_LT_EOF|LT_DLGLOBAL|LT_DLLAZY_OR_NOW|LT_MULTI_MODULE)$])dnl
+dnl aclocal doesn't pull ltoptions.m4, ltsugar.m4, or ltversion.m4
+dnl unless we require an AC_DEFUNed macro:
+AC_REQUIRE([LTOPTIONS_VERSION])dnl
+AC_REQUIRE([LTSUGAR_VERSION])dnl
+AC_REQUIRE([LTVERSION_VERSION])dnl
+AC_REQUIRE([LTOBSOLETE_VERSION])dnl
+m4_require([_LT_PROG_LTMAIN])dnl
+
+_LT_SHELL_INIT([SHELL=${CONFIG_SHELL-/bin/sh}])
+
+dnl Parse OPTIONS
+_LT_SET_OPTIONS([$0], [$1])
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ltmain"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+_LT_SETUP
+
+# Only expand once:
+m4_define([LT_INIT])
+])# LT_INIT
+
+# Old names:
+AU_ALIAS([AC_PROG_LIBTOOL], [LT_INIT])
+AU_ALIAS([AM_PROG_LIBTOOL], [LT_INIT])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_PROG_LIBTOOL], [])
+dnl AC_DEFUN([AM_PROG_LIBTOOL], [])
+
+
+# _LT_CC_BASENAME(CC)
+# -------------------
+# Calculate cc_basename.  Skip known compiler wrappers and cross-prefix.
+m4_defun([_LT_CC_BASENAME],
+[for cc_temp in $1""; do
+  case $cc_temp in
+    compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;;
+    distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;;
+    \-*) ;;
+    *) break;;
+  esac
+done
+cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+])
+
+
+# _LT_FILEUTILS_DEFAULTS
+# ----------------------
+# It is okay to use these file commands and assume they have been set
+# sensibly after `m4_require([_LT_FILEUTILS_DEFAULTS])'.
+m4_defun([_LT_FILEUTILS_DEFAULTS],
+[: ${CP="cp -f"}
+: ${MV="mv -f"}
+: ${RM="rm -f"}
+])# _LT_FILEUTILS_DEFAULTS
+
+
+# _LT_SETUP
+# ---------
+m4_defun([_LT_SETUP],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([_LT_PREPARE_SED_QUOTE_VARS])dnl
+AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])dnl
+
+_LT_DECL([], [PATH_SEPARATOR], [1], [The PATH separator for the build system])dnl
+dnl
+_LT_DECL([], [host_alias], [0], [The host system])dnl
+_LT_DECL([], [host], [0])dnl
+_LT_DECL([], [host_os], [0])dnl
+dnl
+_LT_DECL([], [build_alias], [0], [The build system])dnl
+_LT_DECL([], [build], [0])dnl
+_LT_DECL([], [build_os], [0])dnl
+dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([LT_PATH_LD])dnl
+AC_REQUIRE([LT_PATH_NM])dnl
+dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+test -z "$LN_S" && LN_S="ln -s"
+_LT_DECL([], [LN_S], [1], [Whether we need soft or hard links])dnl
+dnl
+AC_REQUIRE([LT_CMD_MAX_LEN])dnl
+_LT_DECL([objext], [ac_objext], [0], [Object file suffix (normally "o")])dnl
+_LT_DECL([], [exeext], [0], [Executable file suffix (normally "")])dnl
+dnl
+m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_CHECK_SHELL_FEATURES])dnl
+m4_require([_LT_PATH_CONVERSION_FUNCTIONS])dnl
+m4_require([_LT_CMD_RELOAD])dnl
+m4_require([_LT_CHECK_MAGIC_METHOD])dnl
+m4_require([_LT_CHECK_SHAREDLIB_FROM_LINKLIB])dnl
+m4_require([_LT_CMD_OLD_ARCHIVE])dnl
+m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl
+m4_require([_LT_WITH_SYSROOT])dnl
+
+_LT_CONFIG_LIBTOOL_INIT([
+# See if we are running on zsh, and set the options which allow our
+# commands through without removal of \ escapes INIT.
+if test -n "\${ZSH_VERSION+set}" ; then
+   setopt NO_GLOB_SUBST
+fi
+])
+if test -n "${ZSH_VERSION+set}" ; then
+   setopt NO_GLOB_SUBST
+fi
+
+_LT_CHECK_OBJDIR
+
+m4_require([_LT_TAG_COMPILER])dnl
+
+case $host_os in
+aix3*)
+  # AIX sometimes has problems with the GCC collect2 program.  For some
+  # reason, if we set the COLLECT_NAMES environment variable, the problems
+  # vanish in a puff of smoke.
+  if test "X${COLLECT_NAMES+set}" != Xset; then
+    COLLECT_NAMES=
+    export COLLECT_NAMES
+  fi
+  ;;
+esac
+
+# Global variables:
+ofile=libtool
+can_build_shared=yes
+
+# All known linkers require a `.a' archive for static linking (except MSVC,
+# which needs '.lib').
+libext=a
+
+with_gnu_ld="$lt_cv_prog_gnu_ld"
+
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+
+# Set sane defaults for various variables
+test -z "$CC" && CC=cc
+test -z "$LTCC" && LTCC=$CC
+test -z "$LTCFLAGS" && LTCFLAGS=$CFLAGS
+test -z "$LD" && LD=ld
+test -z "$ac_objext" && ac_objext=o
+
+_LT_CC_BASENAME([$compiler])
+
+# Only perform the check for file, if the check method requires it
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    _LT_PATH_MAGIC
+  fi
+  ;;
+esac
+
+# Use C for the default configuration in the libtool script
+LT_SUPPORTED_TAG([CC])
+_LT_LANG_C_CONFIG
+_LT_LANG_DEFAULT_CONFIG
+_LT_CONFIG_COMMANDS
+])# _LT_SETUP
+
+
+# _LT_PREPARE_SED_QUOTE_VARS
+# --------------------------
+# Define a few sed substitution that help us do robust quoting.
+m4_defun([_LT_PREPARE_SED_QUOTE_VARS],
+[# Backslashify metacharacters that are still active within
+# double-quoted strings.
+sed_quote_subst='s/\([["`$\\]]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([["`\\]]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# Sed substitution to delay expansion of an escaped single quote.
+delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g'
+
+# Sed substitution to avoid accidental globbing in evaled expressions
+no_glob_subst='s/\*/\\\*/g'
+])
+
+# _LT_PROG_LTMAIN
+# ---------------
+# Note that this code is called both from `configure', and `config.status'
+# now that we use AC_CONFIG_COMMANDS to generate libtool.  Notably,
+# `config.status' has no value for ac_aux_dir unless we are using Automake,
+# so we pass a copy along to make sure it has a sensible value anyway.
+m4_defun([_LT_PROG_LTMAIN],
+[m4_ifdef([AC_REQUIRE_AUX_FILE], [AC_REQUIRE_AUX_FILE([ltmain.sh])])dnl
+_LT_CONFIG_LIBTOOL_INIT([ac_aux_dir='$ac_aux_dir'])
+ltmain="$ac_aux_dir/ltmain.sh"
+])# _LT_PROG_LTMAIN
+
+
+## ------------------------------------- ##
+## Accumulate code for creating libtool. ##
+## ------------------------------------- ##
+
+# So that we can recreate a full libtool script including additional
+# tags, we accumulate the chunks of code to send to AC_CONFIG_COMMANDS
+# in macros and then make a single call at the end using the `libtool'
+# label.
+
+
+# _LT_CONFIG_LIBTOOL_INIT([INIT-COMMANDS])
+# ----------------------------------------
+# Register INIT-COMMANDS to be passed to AC_CONFIG_COMMANDS later.
+m4_define([_LT_CONFIG_LIBTOOL_INIT],
+[m4_ifval([$1],
+          [m4_append([_LT_OUTPUT_LIBTOOL_INIT],
+                     [$1
+])])])
+
+# Initialize.
+m4_define([_LT_OUTPUT_LIBTOOL_INIT])
+
+
+# _LT_CONFIG_LIBTOOL([COMMANDS])
+# ------------------------------
+# Register COMMANDS to be passed to AC_CONFIG_COMMANDS later.
+m4_define([_LT_CONFIG_LIBTOOL],
+[m4_ifval([$1],
+          [m4_append([_LT_OUTPUT_LIBTOOL_COMMANDS],
+                     [$1
+])])])
+
+# Initialize.
+m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS])
+
+
+# _LT_CONFIG_SAVE_COMMANDS([COMMANDS], [INIT_COMMANDS])
+# -----------------------------------------------------
+m4_defun([_LT_CONFIG_SAVE_COMMANDS],
+[_LT_CONFIG_LIBTOOL([$1])
+_LT_CONFIG_LIBTOOL_INIT([$2])
+])
+
+
+# _LT_FORMAT_COMMENT([COMMENT])
+# -----------------------------
+# Add leading comment marks to the start of each line, and a trailing
+# full-stop to the whole comment if one is not present already.
+m4_define([_LT_FORMAT_COMMENT],
+[m4_ifval([$1], [
+m4_bpatsubst([m4_bpatsubst([$1], [^ *], [# ])],
+              [['`$\]], [\\\&])]m4_bmatch([$1], [[!?.]$], [], [.])
+)])
+
+
+
+## ------------------------ ##
+## FIXME: Eliminate VARNAME ##
+## ------------------------ ##
+
+
+# _LT_DECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION], [IS-TAGGED?])
+# -------------------------------------------------------------------
+# CONFIGNAME is the name given to the value in the libtool script.
+# VARNAME is the (base) name used in the configure script.
+# VALUE may be 0, 1 or 2 for a computed quote escaped value based on
+# VARNAME.  Any other value will be used directly.
+m4_define([_LT_DECL],
+[lt_if_append_uniq([lt_decl_varnames], [$2], [, ],
+    [lt_dict_add_subkey([lt_decl_dict], [$2], [libtool_name],
+	[m4_ifval([$1], [$1], [$2])])
+    lt_dict_add_subkey([lt_decl_dict], [$2], [value], [$3])
+    m4_ifval([$4],
+	[lt_dict_add_subkey([lt_decl_dict], [$2], [description], [$4])])
+    lt_dict_add_subkey([lt_decl_dict], [$2],
+	[tagged?], [m4_ifval([$5], [yes], [no])])])
+])
+
+
+# _LT_TAGDECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION])
+# --------------------------------------------------------
+m4_define([_LT_TAGDECL], [_LT_DECL([$1], [$2], [$3], [$4], [yes])])
+
+
+# lt_decl_tag_varnames([SEPARATOR], [VARNAME1...])
+# ------------------------------------------------
+m4_define([lt_decl_tag_varnames],
+[_lt_decl_filter([tagged?], [yes], $@)])
+
+
+# _lt_decl_filter(SUBKEY, VALUE, [SEPARATOR], [VARNAME1..])
+# ---------------------------------------------------------
+m4_define([_lt_decl_filter],
+[m4_case([$#],
+  [0], [m4_fatal([$0: too few arguments: $#])],
+  [1], [m4_fatal([$0: too few arguments: $#: $1])],
+  [2], [lt_dict_filter([lt_decl_dict], [$1], [$2], [], lt_decl_varnames)],
+  [3], [lt_dict_filter([lt_decl_dict], [$1], [$2], [$3], lt_decl_varnames)],
+  [lt_dict_filter([lt_decl_dict], $@)])[]dnl
+])
+
+
+# lt_decl_quote_varnames([SEPARATOR], [VARNAME1...])
+# --------------------------------------------------
+m4_define([lt_decl_quote_varnames],
+[_lt_decl_filter([value], [1], $@)])
+
+
+# lt_decl_dquote_varnames([SEPARATOR], [VARNAME1...])
+# ---------------------------------------------------
+m4_define([lt_decl_dquote_varnames],
+[_lt_decl_filter([value], [2], $@)])
+
+
+# lt_decl_varnames_tagged([SEPARATOR], [VARNAME1...])
+# ---------------------------------------------------
+m4_define([lt_decl_varnames_tagged],
+[m4_assert([$# <= 2])dnl
+_$0(m4_quote(m4_default([$1], [[, ]])),
+    m4_ifval([$2], [[$2]], [m4_dquote(lt_decl_tag_varnames)]),
+    m4_split(m4_normalize(m4_quote(_LT_TAGS)), [ ]))])
+m4_define([_lt_decl_varnames_tagged],
+[m4_ifval([$3], [lt_combine([$1], [$2], [_], $3)])])
+
+
+# lt_decl_all_varnames([SEPARATOR], [VARNAME1...])
+# ------------------------------------------------
+m4_define([lt_decl_all_varnames],
+[_$0(m4_quote(m4_default([$1], [[, ]])),
+     m4_if([$2], [],
+	   m4_quote(lt_decl_varnames),
+	m4_quote(m4_shift($@))))[]dnl
+])
+m4_define([_lt_decl_all_varnames],
+[lt_join($@, lt_decl_varnames_tagged([$1],
+			lt_decl_tag_varnames([[, ]], m4_shift($@))))dnl
+])
+
+
+# _LT_CONFIG_STATUS_DECLARE([VARNAME])
+# ------------------------------------
+# Quote a variable value, and forward it to `config.status' so that its
+# declaration there will have the same value as in `configure'.  VARNAME
+# must have a single quote delimited value for this to work.
+m4_define([_LT_CONFIG_STATUS_DECLARE],
+[$1='`$ECHO "$][$1" | $SED "$delay_single_quote_subst"`'])
+
+
+# _LT_CONFIG_STATUS_DECLARATIONS
+# ------------------------------
+# We delimit libtool config variables with single quotes, so when
+# we write them to config.status, we have to be sure to quote all
+# embedded single quotes properly.  In configure, this macro expands
+# each variable declared with _LT_DECL (and _LT_TAGDECL) into:
+#
+#    <var>='`$ECHO "$<var>" | $SED "$delay_single_quote_subst"`'
+m4_defun([_LT_CONFIG_STATUS_DECLARATIONS],
+[m4_foreach([_lt_var], m4_quote(lt_decl_all_varnames),
+    [m4_n([_LT_CONFIG_STATUS_DECLARE(_lt_var)])])])
+
+
+# _LT_LIBTOOL_TAGS
+# ----------------
+# Output comment and list of tags supported by the script
+m4_defun([_LT_LIBTOOL_TAGS],
+[_LT_FORMAT_COMMENT([The names of the tagged configurations supported by this script])dnl
+available_tags="_LT_TAGS"dnl
+])
+
+
+# _LT_LIBTOOL_DECLARE(VARNAME, [TAG])
+# -----------------------------------
+# Extract the dictionary values for VARNAME (optionally with TAG) and
+# expand to a commented shell variable setting:
+#
+#    # Some comment about what VAR is for.
+#    visible_name=$lt_internal_name
+m4_define([_LT_LIBTOOL_DECLARE],
+[_LT_FORMAT_COMMENT(m4_quote(lt_dict_fetch([lt_decl_dict], [$1],
+					   [description])))[]dnl
+m4_pushdef([_libtool_name],
+    m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [libtool_name])))[]dnl
+m4_case(m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [value])),
+    [0], [_libtool_name=[$]$1],
+    [1], [_libtool_name=$lt_[]$1],
+    [2], [_libtool_name=$lt_[]$1],
+    [_libtool_name=lt_dict_fetch([lt_decl_dict], [$1], [value])])[]dnl
+m4_ifval([$2], [_$2])[]m4_popdef([_libtool_name])[]dnl
+])
+
+
+# _LT_LIBTOOL_CONFIG_VARS
+# -----------------------
+# Produce commented declarations of non-tagged libtool config variables
+# suitable for insertion in the LIBTOOL CONFIG section of the `libtool'
+# script.  Tagged libtool config variables (even for the LIBTOOL CONFIG
+# section) are produced by _LT_LIBTOOL_TAG_VARS.
+m4_defun([_LT_LIBTOOL_CONFIG_VARS],
+[m4_foreach([_lt_var],
+    m4_quote(_lt_decl_filter([tagged?], [no], [], lt_decl_varnames)),
+    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var)])])])
+
+
+# _LT_LIBTOOL_TAG_VARS(TAG)
+# -------------------------
+m4_define([_LT_LIBTOOL_TAG_VARS],
+[m4_foreach([_lt_var], m4_quote(lt_decl_tag_varnames),
+    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var, [$1])])])])
+
+
+# _LT_TAGVAR(VARNAME, [TAGNAME])
+# ------------------------------
+m4_define([_LT_TAGVAR], [m4_ifval([$2], [$1_$2], [$1])])
+
+
+# _LT_CONFIG_COMMANDS
+# -------------------
+# Send accumulated output to $CONFIG_STATUS.  Thanks to the lists of
+# variables for single and double quote escaping we saved from calls
+# to _LT_DECL, we can put quote escaped variables declarations
+# into `config.status', and then the shell code to quote escape them in
+# for loops in `config.status'.  Finally, any additional code accumulated
+# from calls to _LT_CONFIG_LIBTOOL_INIT is expanded.
+m4_defun([_LT_CONFIG_COMMANDS],
+[AC_PROVIDE_IFELSE([LT_OUTPUT],
+	dnl If the libtool generation code has been placed in $CONFIG_LT,
+	dnl instead of duplicating it all over again into config.status,
+	dnl then we will have config.status run $CONFIG_LT later, so it
+	dnl needs to know what name is stored there:
+        [AC_CONFIG_COMMANDS([libtool],
+            [$SHELL $CONFIG_LT || AS_EXIT(1)], [CONFIG_LT='$CONFIG_LT'])],
+    dnl If the libtool generation code is destined for config.status,
+    dnl expand the accumulated commands and init code now:
+    [AC_CONFIG_COMMANDS([libtool],
+        [_LT_OUTPUT_LIBTOOL_COMMANDS], [_LT_OUTPUT_LIBTOOL_COMMANDS_INIT])])
+])#_LT_CONFIG_COMMANDS
+
+
+# Initialize.
+m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS_INIT],
+[
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+sed_quote_subst='$sed_quote_subst'
+double_quote_subst='$double_quote_subst'
+delay_variable_subst='$delay_variable_subst'
+_LT_CONFIG_STATUS_DECLARATIONS
+LTCC='$LTCC'
+LTCFLAGS='$LTCFLAGS'
+compiler='$compiler_DEFAULT'
+
+# A function that is used when there is no print builtin or printf.
+func_fallback_echo ()
+{
+  eval 'cat <<_LTECHO_EOF
+\$[]1
+_LTECHO_EOF'
+}
+
+# Quote evaled strings.
+for var in lt_decl_all_varnames([[ \
+]], lt_decl_quote_varnames); do
+    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
+    *[[\\\\\\\`\\"\\\$]]*)
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\""
+      ;;
+    *)
+      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
+      ;;
+    esac
+done
+
+# Double-quote double-evaled strings.
+for var in lt_decl_all_varnames([[ \
+]], lt_decl_dquote_varnames); do
+    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
+    *[[\\\\\\\`\\"\\\$]]*)
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\""
+      ;;
+    *)
+      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
+      ;;
+    esac
+done
+
+_LT_OUTPUT_LIBTOOL_INIT
+])
+
+# _LT_GENERATED_FILE_INIT(FILE, [COMMENT])
+# ------------------------------------
+# Generate a child script FILE with all initialization necessary to
+# reuse the environment learned by the parent script, and make the
+# file executable.  If COMMENT is supplied, it is inserted after the
+# `#!' sequence but before initialization text begins.  After this
+# macro, additional text can be appended to FILE to form the body of
+# the child script.  The macro ends with non-zero status if the
+# file could not be fully written (such as if the disk is full).
+m4_ifdef([AS_INIT_GENERATED],
+[m4_defun([_LT_GENERATED_FILE_INIT],[AS_INIT_GENERATED($@)])],
+[m4_defun([_LT_GENERATED_FILE_INIT],
+[m4_require([AS_PREPARE])]dnl
+[m4_pushdef([AS_MESSAGE_LOG_FD])]dnl
+[lt_write_fail=0
+cat >$1 <<_ASEOF || lt_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+$2
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$1 <<\_ASEOF || lt_write_fail=1
+AS_SHELL_SANITIZE
+_AS_PREPARE
+exec AS_MESSAGE_FD>&1
+_ASEOF
+test $lt_write_fail = 0 && chmod +x $1[]dnl
+m4_popdef([AS_MESSAGE_LOG_FD])])])# _LT_GENERATED_FILE_INIT
+
+# LT_OUTPUT
+# ---------
+# This macro allows early generation of the libtool script (before
+# AC_OUTPUT is called), incase it is used in configure for compilation
+# tests.
+AC_DEFUN([LT_OUTPUT],
+[: ${CONFIG_LT=./config.lt}
+AC_MSG_NOTICE([creating $CONFIG_LT])
+_LT_GENERATED_FILE_INIT(["$CONFIG_LT"],
+[# Run this file to recreate a libtool stub with the current configuration.])
+
+cat >>"$CONFIG_LT" <<\_LTEOF
+lt_cl_silent=false
+exec AS_MESSAGE_LOG_FD>>config.log
+{
+  echo
+  AS_BOX([Running $as_me.])
+} >&AS_MESSAGE_LOG_FD
+
+lt_cl_help="\
+\`$as_me' creates a local libtool stub from the current configuration,
+for use in further configure time tests before the real libtool is
+generated.
+
+Usage: $[0] [[OPTIONS]]
+
+  -h, --help      print this help, then exit
+  -V, --version   print version number, then exit
+  -q, --quiet     do not print progress messages
+  -d, --debug     don't remove temporary files
+
+Report bugs to <bug-libtool@gnu.org>."
+
+lt_cl_version="\
+m4_ifset([AC_PACKAGE_NAME], [AC_PACKAGE_NAME ])config.lt[]dnl
+m4_ifset([AC_PACKAGE_VERSION], [ AC_PACKAGE_VERSION])
+configured by $[0], generated by m4_PACKAGE_STRING.
+
+Copyright (C) 2011 Free Software Foundation, Inc.
+This config.lt script is free software; the Free Software Foundation
+gives unlimited permision to copy, distribute and modify it."
+
+while test $[#] != 0
+do
+  case $[1] in
+    --version | --v* | -V )
+      echo "$lt_cl_version"; exit 0 ;;
+    --help | --h* | -h )
+      echo "$lt_cl_help"; exit 0 ;;
+    --debug | --d* | -d )
+      debug=: ;;
+    --quiet | --q* | --silent | --s* | -q )
+      lt_cl_silent=: ;;
+
+    -*) AC_MSG_ERROR([unrecognized option: $[1]
+Try \`$[0] --help' for more information.]) ;;
+
+    *) AC_MSG_ERROR([unrecognized argument: $[1]
+Try \`$[0] --help' for more information.]) ;;
+  esac
+  shift
+done
+
+if $lt_cl_silent; then
+  exec AS_MESSAGE_FD>/dev/null
+fi
+_LTEOF
+
+cat >>"$CONFIG_LT" <<_LTEOF
+_LT_OUTPUT_LIBTOOL_COMMANDS_INIT
+_LTEOF
+
+cat >>"$CONFIG_LT" <<\_LTEOF
+AC_MSG_NOTICE([creating $ofile])
+_LT_OUTPUT_LIBTOOL_COMMANDS
+AS_EXIT(0)
+_LTEOF
+chmod +x "$CONFIG_LT"
+
+# configure is writing to config.log, but config.lt does its own redirection,
+# appending to config.log, which fails on DOS, as config.log is still kept
+# open by configure.  Here we exec the FD to /dev/null, effectively closing
+# config.log, so it can be properly (re)opened and appended to by config.lt.
+lt_cl_success=:
+test "$silent" = yes &&
+  lt_config_lt_args="$lt_config_lt_args --quiet"
+exec AS_MESSAGE_LOG_FD>/dev/null
+$SHELL "$CONFIG_LT" $lt_config_lt_args || lt_cl_success=false
+exec AS_MESSAGE_LOG_FD>>config.log
+$lt_cl_success || AS_EXIT(1)
+])# LT_OUTPUT
+
+
+# _LT_CONFIG(TAG)
+# ---------------
+# If TAG is the built-in tag, create an initial libtool script with a
+# default configuration from the untagged config vars.  Otherwise add code
+# to config.status for appending the configuration named by TAG from the
+# matching tagged config vars.
+m4_defun([_LT_CONFIG],
+[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+_LT_CONFIG_SAVE_COMMANDS([
+  m4_define([_LT_TAG], m4_if([$1], [], [C], [$1]))dnl
+  m4_if(_LT_TAG, [C], [
+    # See if we are running on zsh, and set the options which allow our
+    # commands through without removal of \ escapes.
+    if test -n "${ZSH_VERSION+set}" ; then
+      setopt NO_GLOB_SUBST
+    fi
+
+    cfgfile="${ofile}T"
+    trap "$RM \"$cfgfile\"; exit 1" 1 2 15
+    $RM "$cfgfile"
+
+    cat <<_LT_EOF >> "$cfgfile"
+#! $SHELL
+
+# `$ECHO "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $as_me ($PACKAGE$TIMESTAMP) $VERSION
+# Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+# NOTE: Changes made to this file will be lost: look at ltmain.sh.
+#
+_LT_COPYING
+_LT_LIBTOOL_TAGS
+
+# ### BEGIN LIBTOOL CONFIG
+_LT_LIBTOOL_CONFIG_VARS
+_LT_LIBTOOL_TAG_VARS
+# ### END LIBTOOL CONFIG
+
+_LT_EOF
+
+  case $host_os in
+  aix3*)
+    cat <<\_LT_EOF >> "$cfgfile"
+# AIX sometimes has problems with the GCC collect2 program.  For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+  COLLECT_NAMES=
+  export COLLECT_NAMES
+fi
+_LT_EOF
+    ;;
+  esac
+
+  _LT_PROG_LTMAIN
+
+  # We use sed instead of cat because bash on DJGPP gets confused if
+  # if finds mixed CR/LF and LF-only lines.  Since sed operates in
+  # text mode, it properly converts lines to CR/LF.  This bash problem
+  # is reportedly fixed, but why not run on old versions too?
+  sed '$q' "$ltmain" >> "$cfgfile" \
+     || (rm -f "$cfgfile"; exit 1)
+
+  _LT_PROG_REPLACE_SHELLFNS
+
+   mv -f "$cfgfile" "$ofile" ||
+    (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
+  chmod +x "$ofile"
+],
+[cat <<_LT_EOF >> "$ofile"
+
+dnl Unfortunately we have to use $1 here, since _LT_TAG is not expanded
+dnl in a comment (ie after a #).
+# ### BEGIN LIBTOOL TAG CONFIG: $1
+_LT_LIBTOOL_TAG_VARS(_LT_TAG)
+# ### END LIBTOOL TAG CONFIG: $1
+_LT_EOF
+])dnl /m4_if
+],
+[m4_if([$1], [], [
+    PACKAGE='$PACKAGE'
+    VERSION='$VERSION'
+    TIMESTAMP='$TIMESTAMP'
+    RM='$RM'
+    ofile='$ofile'], [])
+])dnl /_LT_CONFIG_SAVE_COMMANDS
+])# _LT_CONFIG
+
+
+# LT_SUPPORTED_TAG(TAG)
+# ---------------------
+# Trace this macro to discover what tags are supported by the libtool
+# --tag option, using:
+#    autoconf --trace 'LT_SUPPORTED_TAG:$1'
+AC_DEFUN([LT_SUPPORTED_TAG], [])
+
+
+# C support is built-in for now
+m4_define([_LT_LANG_C_enabled], [])
+m4_define([_LT_TAGS], [])
+
+
+# LT_LANG(LANG)
+# -------------
+# Enable libtool support for the given language if not already enabled.
+AC_DEFUN([LT_LANG],
+[AC_BEFORE([$0], [LT_OUTPUT])dnl
+m4_case([$1],
+  [C],			[_LT_LANG(C)],
+  [C++],		[_LT_LANG(CXX)],
+  [Go],			[_LT_LANG(GO)],
+  [Java],		[_LT_LANG(GCJ)],
+  [Fortran 77],		[_LT_LANG(F77)],
+  [Fortran],		[_LT_LANG(FC)],
+  [Windows Resource],	[_LT_LANG(RC)],
+  [m4_ifdef([_LT_LANG_]$1[_CONFIG],
+    [_LT_LANG($1)],
+    [m4_fatal([$0: unsupported language: "$1"])])])dnl
+])# LT_LANG
+
+
+# _LT_LANG(LANGNAME)
+# ------------------
+m4_defun([_LT_LANG],
+[m4_ifdef([_LT_LANG_]$1[_enabled], [],
+  [LT_SUPPORTED_TAG([$1])dnl
+  m4_append([_LT_TAGS], [$1 ])dnl
+  m4_define([_LT_LANG_]$1[_enabled], [])dnl
+  _LT_LANG_$1_CONFIG($1)])dnl
+])# _LT_LANG
+
+
+m4_ifndef([AC_PROG_GO], [
+############################################################
+# NOTE: This macro has been submitted for inclusion into   #
+#  GNU Autoconf as AC_PROG_GO.  When it is available in    #
+#  a released version of Autoconf we should remove this    #
+#  macro and use it instead.                               #
+############################################################
+m4_defun([AC_PROG_GO],
+[AC_LANG_PUSH(Go)dnl
+AC_ARG_VAR([GOC],     [Go compiler command])dnl
+AC_ARG_VAR([GOFLAGS], [Go compiler flags])dnl
+_AC_ARG_VAR_LDFLAGS()dnl
+AC_CHECK_TOOL(GOC, gccgo)
+if test -z "$GOC"; then
+  if test -n "$ac_tool_prefix"; then
+    AC_CHECK_PROG(GOC, [${ac_tool_prefix}gccgo], [${ac_tool_prefix}gccgo])
+  fi
+fi
+if test -z "$GOC"; then
+  AC_CHECK_PROG(GOC, gccgo, gccgo, false)
+fi
+])#m4_defun
+])#m4_ifndef
+
+
+# _LT_LANG_DEFAULT_CONFIG
+# -----------------------
+m4_defun([_LT_LANG_DEFAULT_CONFIG],
+[AC_PROVIDE_IFELSE([AC_PROG_CXX],
+  [LT_LANG(CXX)],
+  [m4_define([AC_PROG_CXX], defn([AC_PROG_CXX])[LT_LANG(CXX)])])
+
+AC_PROVIDE_IFELSE([AC_PROG_F77],
+  [LT_LANG(F77)],
+  [m4_define([AC_PROG_F77], defn([AC_PROG_F77])[LT_LANG(F77)])])
+
+AC_PROVIDE_IFELSE([AC_PROG_FC],
+  [LT_LANG(FC)],
+  [m4_define([AC_PROG_FC], defn([AC_PROG_FC])[LT_LANG(FC)])])
+
+dnl The call to [A][M_PROG_GCJ] is quoted like that to stop aclocal
+dnl pulling things in needlessly.
+AC_PROVIDE_IFELSE([AC_PROG_GCJ],
+  [LT_LANG(GCJ)],
+  [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
+    [LT_LANG(GCJ)],
+    [AC_PROVIDE_IFELSE([LT_PROG_GCJ],
+      [LT_LANG(GCJ)],
+      [m4_ifdef([AC_PROG_GCJ],
+	[m4_define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[LT_LANG(GCJ)])])
+       m4_ifdef([A][M_PROG_GCJ],
+	[m4_define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[LT_LANG(GCJ)])])
+       m4_ifdef([LT_PROG_GCJ],
+	[m4_define([LT_PROG_GCJ], defn([LT_PROG_GCJ])[LT_LANG(GCJ)])])])])])
+
+AC_PROVIDE_IFELSE([AC_PROG_GO],
+  [LT_LANG(GO)],
+  [m4_define([AC_PROG_GO], defn([AC_PROG_GO])[LT_LANG(GO)])])
+
+AC_PROVIDE_IFELSE([LT_PROG_RC],
+  [LT_LANG(RC)],
+  [m4_define([LT_PROG_RC], defn([LT_PROG_RC])[LT_LANG(RC)])])
+])# _LT_LANG_DEFAULT_CONFIG
+
+# Obsolete macros:
+AU_DEFUN([AC_LIBTOOL_CXX], [LT_LANG(C++)])
+AU_DEFUN([AC_LIBTOOL_F77], [LT_LANG(Fortran 77)])
+AU_DEFUN([AC_LIBTOOL_FC], [LT_LANG(Fortran)])
+AU_DEFUN([AC_LIBTOOL_GCJ], [LT_LANG(Java)])
+AU_DEFUN([AC_LIBTOOL_RC], [LT_LANG(Windows Resource)])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_CXX], [])
+dnl AC_DEFUN([AC_LIBTOOL_F77], [])
+dnl AC_DEFUN([AC_LIBTOOL_FC], [])
+dnl AC_DEFUN([AC_LIBTOOL_GCJ], [])
+dnl AC_DEFUN([AC_LIBTOOL_RC], [])
+
+
+# _LT_TAG_COMPILER
+# ----------------
+m4_defun([_LT_TAG_COMPILER],
+[AC_REQUIRE([AC_PROG_CC])dnl
+
+_LT_DECL([LTCC], [CC], [1], [A C compiler])dnl
+_LT_DECL([LTCFLAGS], [CFLAGS], [1], [LTCC compiler flags])dnl
+_LT_TAGDECL([CC], [compiler], [1], [A language specific compiler])dnl
+_LT_TAGDECL([with_gcc], [GCC], [0], [Is the compiler the GNU compiler?])dnl
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# If no C compiler flags were specified, use CFLAGS.
+LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
+
+# Allow CC to be a program name with arguments.
+compiler=$CC
+])# _LT_TAG_COMPILER
+
+
+# _LT_COMPILER_BOILERPLATE
+# ------------------------
+# Check for compiler boilerplate output or warnings with
+# the simple compiler test code.
+m4_defun([_LT_COMPILER_BOILERPLATE],
+[m4_require([_LT_DECL_SED])dnl
+ac_outfile=conftest.$ac_objext
+echo "$lt_simple_compile_test_code" >conftest.$ac_ext
+eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_compiler_boilerplate=`cat conftest.err`
+$RM conftest*
+])# _LT_COMPILER_BOILERPLATE
+
+
+# _LT_LINKER_BOILERPLATE
+# ----------------------
+# Check for linker boilerplate output or warnings with
+# the simple link test code.
+m4_defun([_LT_LINKER_BOILERPLATE],
+[m4_require([_LT_DECL_SED])dnl
+ac_outfile=conftest.$ac_objext
+echo "$lt_simple_link_test_code" >conftest.$ac_ext
+eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_linker_boilerplate=`cat conftest.err`
+$RM -r conftest*
+])# _LT_LINKER_BOILERPLATE
+
+# _LT_REQUIRED_DARWIN_CHECKS
+# -------------------------
+m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
+  case $host_os in
+    rhapsody* | darwin*)
+    AC_CHECK_TOOL([DSYMUTIL], [dsymutil], [:])
+    AC_CHECK_TOOL([NMEDIT], [nmedit], [:])
+    AC_CHECK_TOOL([LIPO], [lipo], [:])
+    AC_CHECK_TOOL([OTOOL], [otool], [:])
+    AC_CHECK_TOOL([OTOOL64], [otool64], [:])
+    _LT_DECL([], [DSYMUTIL], [1],
+      [Tool to manipulate archived DWARF debug symbol files on Mac OS X])
+    _LT_DECL([], [NMEDIT], [1],
+      [Tool to change global to local symbols on Mac OS X])
+    _LT_DECL([], [LIPO], [1],
+      [Tool to manipulate fat objects and archives on Mac OS X])
+    _LT_DECL([], [OTOOL], [1],
+      [ldd/readelf like tool for Mach-O binaries on Mac OS X])
+    _LT_DECL([], [OTOOL64], [1],
+      [ldd/readelf like tool for 64 bit Mach-O binaries on Mac OS X 10.4])
+
+    AC_CACHE_CHECK([for -single_module linker flag],[lt_cv_apple_cc_single_mod],
+      [lt_cv_apple_cc_single_mod=no
+      if test -z "${LT_MULTI_MODULE}"; then
+	# By default we will add the -single_module flag. You can override
+	# by either setting the environment variable LT_MULTI_MODULE
+	# non-empty at configure time, or by adding -multi_module to the
+	# link flags.
+	rm -rf libconftest.dylib*
+	echo "int foo(void){return 1;}" > conftest.c
+	echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
+-dynamiclib -Wl,-single_module conftest.c" >&AS_MESSAGE_LOG_FD
+	$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
+	  -dynamiclib -Wl,-single_module conftest.c 2>conftest.err
+        _lt_result=$?
+	# If there is a non-empty error log, and "single_module"
+	# appears in it, assume the flag caused a linker warning
+        if test -s conftest.err && $GREP single_module conftest.err; then
+	  cat conftest.err >&AS_MESSAGE_LOG_FD
+	# Otherwise, if the output was created with a 0 exit code from
+	# the compiler, it worked.
+	elif test -f libconftest.dylib && test $_lt_result -eq 0; then
+	  lt_cv_apple_cc_single_mod=yes
+	else
+	  cat conftest.err >&AS_MESSAGE_LOG_FD
+	fi
+	rm -rf libconftest.dylib*
+	rm -f conftest.*
+      fi])
+
+    AC_CACHE_CHECK([for -exported_symbols_list linker flag],
+      [lt_cv_ld_exported_symbols_list],
+      [lt_cv_ld_exported_symbols_list=no
+      save_LDFLAGS=$LDFLAGS
+      echo "_main" > conftest.sym
+      LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym"
+      AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
+	[lt_cv_ld_exported_symbols_list=yes],
+	[lt_cv_ld_exported_symbols_list=no])
+	LDFLAGS="$save_LDFLAGS"
+    ])
+
+    AC_CACHE_CHECK([for -force_load linker flag],[lt_cv_ld_force_load],
+      [lt_cv_ld_force_load=no
+      cat > conftest.c << _LT_EOF
+int forced_loaded() { return 2;}
+_LT_EOF
+      echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&AS_MESSAGE_LOG_FD
+      $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&AS_MESSAGE_LOG_FD
+      echo "$AR cru libconftest.a conftest.o" >&AS_MESSAGE_LOG_FD
+      $AR cru libconftest.a conftest.o 2>&AS_MESSAGE_LOG_FD
+      echo "$RANLIB libconftest.a" >&AS_MESSAGE_LOG_FD
+      $RANLIB libconftest.a 2>&AS_MESSAGE_LOG_FD
+      cat > conftest.c << _LT_EOF
+int main() { return 0;}
+_LT_EOF
+      echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&AS_MESSAGE_LOG_FD
+      $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err
+      _lt_result=$?
+      if test -s conftest.err && $GREP force_load conftest.err; then
+	cat conftest.err >&AS_MESSAGE_LOG_FD
+      elif test -f conftest && test $_lt_result -eq 0 && $GREP forced_load conftest >/dev/null 2>&1 ; then
+	lt_cv_ld_force_load=yes
+      else
+	cat conftest.err >&AS_MESSAGE_LOG_FD
+      fi
+        rm -f conftest.err libconftest.a conftest conftest.c
+        rm -rf conftest.dSYM
+    ])
+    case $host_os in
+    rhapsody* | darwin1.[[012]])
+      _lt_dar_allow_undefined='${wl}-undefined ${wl}suppress' ;;
+    darwin1.*)
+      _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+    darwin*) # darwin 5.x on
+      # if running on 10.5 or later, the deployment target defaults
+      # to the OS version, if on x86, and 10.4, the deployment
+      # target defaults to 10.4. Don't you love it?
+      case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in
+	10.0,*86*-darwin8*|10.0,*-darwin[[91]]*)
+	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
+	10.[[012]]*)
+	  _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+	10.*)
+	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
+      esac
+    ;;
+  esac
+    if test "$lt_cv_apple_cc_single_mod" = "yes"; then
+      _lt_dar_single_mod='$single_module'
+    fi
+    if test "$lt_cv_ld_exported_symbols_list" = "yes"; then
+      _lt_dar_export_syms=' ${wl}-exported_symbols_list,$output_objdir/${libname}-symbols.expsym'
+    else
+      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/${libname}-symbols.expsym ${lib}'
+    fi
+    if test "$DSYMUTIL" != ":" && test "$lt_cv_ld_force_load" = "no"; then
+      _lt_dsymutil='~$DSYMUTIL $lib || :'
+    else
+      _lt_dsymutil=
+    fi
+    ;;
+  esac
+])
+
+
+# _LT_DARWIN_LINKER_FEATURES([TAG])
+# ---------------------------------
+# Checks for linker and compiler features on darwin
+m4_defun([_LT_DARWIN_LINKER_FEATURES],
+[
+  m4_require([_LT_REQUIRED_DARWIN_CHECKS])
+  _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+  _LT_TAGVAR(hardcode_direct, $1)=no
+  _LT_TAGVAR(hardcode_automatic, $1)=yes
+  _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
+  if test "$lt_cv_ld_force_load" = "yes"; then
+    _LT_TAGVAR(whole_archive_flag_spec, $1)='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
+    m4_case([$1], [F77], [_LT_TAGVAR(compiler_needs_object, $1)=yes],
+                  [FC],  [_LT_TAGVAR(compiler_needs_object, $1)=yes])
+  else
+    _LT_TAGVAR(whole_archive_flag_spec, $1)=''
+  fi
+  _LT_TAGVAR(link_all_deplibs, $1)=yes
+  _LT_TAGVAR(allow_undefined_flag, $1)="$_lt_dar_allow_undefined"
+  case $cc_basename in
+     ifort*) _lt_dar_can_shared=yes ;;
+     *) _lt_dar_can_shared=$GCC ;;
+  esac
+  if test "$_lt_dar_can_shared" = "yes"; then
+    output_verbose_link_cmd=func_echo_all
+    _LT_TAGVAR(archive_cmds, $1)="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
+    _LT_TAGVAR(module_cmds, $1)="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
+    _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
+    _LT_TAGVAR(module_expsym_cmds, $1)="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
+    m4_if([$1], [CXX],
+[   if test "$lt_cv_apple_cc_single_mod" != "yes"; then
+      _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
+      _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
+    fi
+],[])
+  else
+  _LT_TAGVAR(ld_shlibs, $1)=no
+  fi
+])
+
+# _LT_SYS_MODULE_PATH_AIX([TAGNAME])
+# ----------------------------------
+# Links a minimal program and checks the executable
+# for the system default hardcoded library path. In most cases,
+# this is /usr/lib:/lib, but when the MPI compilers are used
+# the location of the communication and MPI libs are included too.
+# If we don't find anything, use the default library path according
+# to the aix ld manual.
+# Store the results from the different compilers for each TAGNAME.
+# Allow to override them for all tags through lt_cv_aix_libpath.
+m4_defun([_LT_SYS_MODULE_PATH_AIX],
+[m4_require([_LT_DECL_SED])dnl
+if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  AC_CACHE_VAL([_LT_TAGVAR([lt_cv_aix_libpath_], [$1])],
+  [AC_LINK_IFELSE([AC_LANG_PROGRAM],[
+  lt_aix_libpath_sed='[
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }]'
+  _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
+    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi],[])
+  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
+    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])="/usr/lib:/lib"
+  fi
+  ])
+  aix_libpath=$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])
+fi
+])# _LT_SYS_MODULE_PATH_AIX
+
+
+# _LT_SHELL_INIT(ARG)
+# -------------------
+m4_define([_LT_SHELL_INIT],
+[m4_divert_text([M4SH-INIT], [$1
+])])# _LT_SHELL_INIT
+
+
+
+# _LT_PROG_ECHO_BACKSLASH
+# -----------------------
+# Find how we can fake an echo command that does not interpret backslash.
+# In particular, with Autoconf 2.60 or later we add some code to the start
+# of the generated configure script which will find a shell with a builtin
+# printf (which we can use as an echo command).
+m4_defun([_LT_PROG_ECHO_BACKSLASH],
+[ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
+ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO
+
+AC_MSG_CHECKING([how to print strings])
+# Test print first, because it will be a builtin if present.
+if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \
+   test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then
+  ECHO='print -r --'
+elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then
+  ECHO='printf %s\n'
+else
+  # Use this function as a fallback that always works.
+  func_fallback_echo ()
+  {
+    eval 'cat <<_LTECHO_EOF
+$[]1
+_LTECHO_EOF'
+  }
+  ECHO='func_fallback_echo'
+fi
+
+# func_echo_all arg...
+# Invoke $ECHO with all args, space-separated.
+func_echo_all ()
+{
+    $ECHO "$*" 
+}
+
+case "$ECHO" in
+  printf*) AC_MSG_RESULT([printf]) ;;
+  print*) AC_MSG_RESULT([print -r]) ;;
+  *) AC_MSG_RESULT([cat]) ;;
+esac
+
+m4_ifdef([_AS_DETECT_SUGGESTED],
+[_AS_DETECT_SUGGESTED([
+  test -n "${ZSH_VERSION+set}${BASH_VERSION+set}" || (
+    ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+    ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
+    ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO
+    PATH=/empty FPATH=/empty; export PATH FPATH
+    test "X`printf %s $ECHO`" = "X$ECHO" \
+      || test "X`print -r -- $ECHO`" = "X$ECHO" )])])
+
+_LT_DECL([], [SHELL], [1], [Shell to use when invoking shell scripts])
+_LT_DECL([], [ECHO], [1], [An echo program that protects backslashes])
+])# _LT_PROG_ECHO_BACKSLASH
+
+
+# _LT_WITH_SYSROOT
+# ----------------
+AC_DEFUN([_LT_WITH_SYSROOT],
+[AC_MSG_CHECKING([for sysroot])
+AC_ARG_WITH([sysroot],
+[  --with-sysroot[=DIR] Search for dependent libraries within DIR
+                        (or the compiler's sysroot if not specified).],
+[], [with_sysroot=no])
+
+dnl lt_sysroot will always be passed unquoted.  We quote it here
+dnl in case the user passed a directory name.
+lt_sysroot=
+case ${with_sysroot} in #(
+ yes)
+   if test "$GCC" = yes; then
+     lt_sysroot=`$CC --print-sysroot 2>/dev/null`
+   fi
+   ;; #(
+ /*)
+   lt_sysroot=`echo "$with_sysroot" | sed -e "$sed_quote_subst"`
+   ;; #(
+ no|'')
+   ;; #(
+ *)
+   AC_MSG_RESULT([${with_sysroot}])
+   AC_MSG_ERROR([The sysroot must be an absolute path.])
+   ;;
+esac
+
+ AC_MSG_RESULT([${lt_sysroot:-no}])
+_LT_DECL([], [lt_sysroot], [0], [The root where to search for ]dnl
+[dependent libraries, and in which our libraries should be installed.])])
+
+# _LT_ENABLE_LOCK
+# ---------------
+m4_defun([_LT_ENABLE_LOCK],
+[AC_ARG_ENABLE([libtool-lock],
+  [AS_HELP_STRING([--disable-libtool-lock],
+    [avoid locking (might break parallel builds)])])
+test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+ia64-*-hpux*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.$ac_objext` in
+      *ELF-32*)
+	HPUX_IA64_MODE="32"
+	;;
+      *ELF-64*)
+	HPUX_IA64_MODE="64"
+	;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    if test "$lt_cv_prog_gnu_ld" = yes; then
+      case `/usr/bin/file conftest.$ac_objext` in
+	*32-bit*)
+	  LD="${LD-ld} -melf32bsmip"
+	  ;;
+	*N32*)
+	  LD="${LD-ld} -melf32bmipn32"
+	  ;;
+	*64-bit*)
+	  LD="${LD-ld} -melf64bmip"
+	;;
+      esac
+    else
+      case `/usr/bin/file conftest.$ac_objext` in
+	*32-bit*)
+	  LD="${LD-ld} -32"
+	  ;;
+	*N32*)
+	  LD="${LD-ld} -n32"
+	  ;;
+	*64-bit*)
+	  LD="${LD-ld} -64"
+	  ;;
+      esac
+    fi
+  fi
+  rm -rf conftest*
+  ;;
+
+x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
+s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.o` in
+      *32-bit*)
+	case $host in
+	  x86_64-*kfreebsd*-gnu)
+	    LD="${LD-ld} -m elf_i386_fbsd"
+	    ;;
+	  x86_64-*linux*)
+	    LD="${LD-ld} -m elf_i386"
+	    ;;
+	  ppc64-*linux*|powerpc64-*linux*)
+	    LD="${LD-ld} -m elf32ppclinux"
+	    ;;
+	  s390x-*linux*)
+	    LD="${LD-ld} -m elf_s390"
+	    ;;
+	  sparc64-*linux*)
+	    LD="${LD-ld} -m elf32_sparc"
+	    ;;
+	esac
+	;;
+      *64-bit*)
+	case $host in
+	  x86_64-*kfreebsd*-gnu)
+	    LD="${LD-ld} -m elf_x86_64_fbsd"
+	    ;;
+	  x86_64-*linux*)
+	    LD="${LD-ld} -m elf_x86_64"
+	    ;;
+	  ppc*-*linux*|powerpc*-*linux*)
+	    LD="${LD-ld} -m elf64ppc"
+	    ;;
+	  s390*-*linux*|s390*-*tpf*)
+	    LD="${LD-ld} -m elf64_s390"
+	    ;;
+	  sparc*-*linux*)
+	    LD="${LD-ld} -m elf64_sparc"
+	    ;;
+	esac
+	;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+    [AC_LANG_PUSH(C)
+     AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],[[]])],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+     AC_LANG_POP])
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
+*-*solaris*)
+  # Find out which ABI we are using.
+  echo 'int i;' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.o` in
+    *64-bit*)
+      case $lt_cv_prog_gnu_ld in
+      yes*)
+        case $host in
+        i?86-*-solaris*)
+          LD="${LD-ld} -m elf_x86_64"
+          ;;
+        sparc*-*-solaris*)
+          LD="${LD-ld} -m elf64_sparc"
+          ;;
+        esac
+        # GNU ld 2.21 introduced _sol2 emulations.  Use them if available.
+        if ${LD-ld} -V | grep _sol2 >/dev/null 2>&1; then
+          LD="${LD-ld}_sol2"
+        fi
+        ;;
+      *)
+	if ${LD-ld} -64 -r -o conftest2.o conftest.o >/dev/null 2>&1; then
+	  LD="${LD-ld} -64"
+	fi
+	;;
+      esac
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+esac
+
+need_locks="$enable_libtool_lock"
+])# _LT_ENABLE_LOCK
+
+
+# _LT_PROG_AR
+# -----------
+m4_defun([_LT_PROG_AR],
+[AC_CHECK_TOOLS(AR, [ar], false)
+: ${AR=ar}
+: ${AR_FLAGS=cru}
+_LT_DECL([], [AR], [1], [The archiver])
+_LT_DECL([], [AR_FLAGS], [1], [Flags to create an archive])
+
+AC_CACHE_CHECK([for archiver @FILE support], [lt_cv_ar_at_file],
+  [lt_cv_ar_at_file=no
+   AC_COMPILE_IFELSE([AC_LANG_PROGRAM],
+     [echo conftest.$ac_objext > conftest.lst
+      lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&AS_MESSAGE_LOG_FD'
+      AC_TRY_EVAL([lt_ar_try])
+      if test "$ac_status" -eq 0; then
+	# Ensure the archiver fails upon bogus file names.
+	rm -f conftest.$ac_objext libconftest.a
+	AC_TRY_EVAL([lt_ar_try])
+	if test "$ac_status" -ne 0; then
+          lt_cv_ar_at_file=@
+        fi
+      fi
+      rm -f conftest.* libconftest.a
+     ])
+  ])
+
+if test "x$lt_cv_ar_at_file" = xno; then
+  archiver_list_spec=
+else
+  archiver_list_spec=$lt_cv_ar_at_file
+fi
+_LT_DECL([], [archiver_list_spec], [1],
+  [How to feed a file listing to the archiver])
+])# _LT_PROG_AR
+
+
+# _LT_CMD_OLD_ARCHIVE
+# -------------------
+m4_defun([_LT_CMD_OLD_ARCHIVE],
+[_LT_PROG_AR
+
+AC_CHECK_TOOL(STRIP, strip, :)
+test -z "$STRIP" && STRIP=:
+_LT_DECL([], [STRIP], [1], [A symbol stripping program])
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+test -z "$RANLIB" && RANLIB=:
+_LT_DECL([], [RANLIB], [1],
+    [Commands used to install an old-style archive])
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+  case $host_os in
+  openbsd*)
+    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$tool_oldlib"
+    ;;
+  *)
+    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB \$tool_oldlib"
+    ;;
+  esac
+  old_archive_cmds="$old_archive_cmds~\$RANLIB \$tool_oldlib"
+fi
+
+case $host_os in
+  darwin*)
+    lock_old_archive_extraction=yes ;;
+  *)
+    lock_old_archive_extraction=no ;;
+esac
+_LT_DECL([], [old_postinstall_cmds], [2])
+_LT_DECL([], [old_postuninstall_cmds], [2])
+_LT_TAGDECL([], [old_archive_cmds], [2],
+    [Commands used to build an old-style archive])
+_LT_DECL([], [lock_old_archive_extraction], [0],
+    [Whether to use a lock for old archive extraction])
+])# _LT_CMD_OLD_ARCHIVE
+
+
+# _LT_COMPILER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
+#		[OUTPUT-FILE], [ACTION-SUCCESS], [ACTION-FAILURE])
+# ----------------------------------------------------------------
+# Check whether the given compiler option works
+AC_DEFUN([_LT_COMPILER_OPTION],
+[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_DECL_SED])dnl
+AC_CACHE_CHECK([$1], [$2],
+  [$2=no
+   m4_if([$4], , [ac_outfile=conftest.$ac_objext], [ac_outfile=$4])
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+   lt_compiler_flag="$3"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   # The option is referenced via a variable to avoid confusing sed.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
+   (eval "$lt_compile" 2>conftest.err)
+   ac_status=$?
+   cat conftest.err >&AS_MESSAGE_LOG_FD
+   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
+   if (exit $ac_status) && test -s "$ac_outfile"; then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings other than the usual output.
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
+     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
+       $2=yes
+     fi
+   fi
+   $RM conftest*
+])
+
+if test x"[$]$2" = xyes; then
+    m4_if([$5], , :, [$5])
+else
+    m4_if([$6], , :, [$6])
+fi
+])# _LT_COMPILER_OPTION
+
+# Old name:
+AU_ALIAS([AC_LIBTOOL_COMPILER_OPTION], [_LT_COMPILER_OPTION])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_COMPILER_OPTION], [])
+
+
+# _LT_LINKER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
+#                  [ACTION-SUCCESS], [ACTION-FAILURE])
+# ----------------------------------------------------
+# Check whether the given linker option works
+AC_DEFUN([_LT_LINKER_OPTION],
+[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_DECL_SED])dnl
+AC_CACHE_CHECK([$1], [$2],
+  [$2=no
+   save_LDFLAGS="$LDFLAGS"
+   LDFLAGS="$LDFLAGS $3"
+   echo "$lt_simple_link_test_code" > conftest.$ac_ext
+   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
+     # The linker can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     if test -s conftest.err; then
+       # Append any errors to the config.log.
+       cat conftest.err 1>&AS_MESSAGE_LOG_FD
+       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
+       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+       if diff conftest.exp conftest.er2 >/dev/null; then
+         $2=yes
+       fi
+     else
+       $2=yes
+     fi
+   fi
+   $RM -r conftest*
+   LDFLAGS="$save_LDFLAGS"
+])
+
+if test x"[$]$2" = xyes; then
+    m4_if([$4], , :, [$4])
+else
+    m4_if([$5], , :, [$5])
+fi
+])# _LT_LINKER_OPTION
+
+# Old name:
+AU_ALIAS([AC_LIBTOOL_LINKER_OPTION], [_LT_LINKER_OPTION])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_LINKER_OPTION], [])
+
+
+# LT_CMD_MAX_LEN
+#---------------
+AC_DEFUN([LT_CMD_MAX_LEN],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+# find the maximum length of command line arguments
+AC_MSG_CHECKING([the maximum length of command line arguments])
+AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
+  i=0
+  teststring="ABCD"
+
+  case $build_os in
+  msdosdjgpp*)
+    # On DJGPP, this test can blow up pretty badly due to problems in libc
+    # (any single argument exceeding 2000 bytes causes a buffer overrun
+    # during glob expansion).  Even if it were fixed, the result of this
+    # check would be larger than it should be.
+    lt_cv_sys_max_cmd_len=12288;    # 12K is about right
+    ;;
+
+  gnu*)
+    # Under GNU Hurd, this test is not required because there is
+    # no limit to the length of command line arguments.
+    # Libtool will interpret -1 as no limit whatsoever
+    lt_cv_sys_max_cmd_len=-1;
+    ;;
+
+  cygwin* | mingw* | cegcc*)
+    # On Win9x/ME, this test blows up -- it succeeds, but takes
+    # about 5 minutes as the teststring grows exponentially.
+    # Worse, since 9x/ME are not pre-emptively multitasking,
+    # you end up with a "frozen" computer, even though with patience
+    # the test eventually succeeds (with a max line length of 256k).
+    # Instead, let's just punt: use the minimum linelength reported by
+    # all of the supported platforms: 8192 (on NT/2K/XP).
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  mint*)
+    # On MiNT this can take a long time and run out of memory.
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  amigaos*)
+    # On AmigaOS with pdksh, this test takes hours, literally.
+    # So we just punt and use a minimum line length of 8192.
+    lt_cv_sys_max_cmd_len=8192;
+    ;;
+
+  netbsd* | freebsd* | openbsd* | darwin* | dragonfly*)
+    # This has been around since 386BSD, at least.  Likely further.
+    if test -x /sbin/sysctl; then
+      lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
+    elif test -x /usr/sbin/sysctl; then
+      lt_cv_sys_max_cmd_len=`/usr/sbin/sysctl -n kern.argmax`
+    else
+      lt_cv_sys_max_cmd_len=65536	# usable default for all BSDs
+    fi
+    # And add a safety zone
+    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
+    ;;
+
+  interix*)
+    # We know the value 262144 and hardcode it with a safety zone (like BSD)
+    lt_cv_sys_max_cmd_len=196608
+    ;;
+
+  os2*)
+    # The test takes a long time on OS/2.
+    lt_cv_sys_max_cmd_len=8192
+    ;;
+
+  osf*)
+    # Dr. Hans Ekkehard Plesser reports seeing a kernel panic running configure
+    # due to this test when exec_disable_arg_limit is 1 on Tru64. It is not
+    # nice to cause kernel panics so lets avoid the loop below.
+    # First set a reasonable default.
+    lt_cv_sys_max_cmd_len=16384
+    #
+    if test -x /sbin/sysconfig; then
+      case `/sbin/sysconfig -q proc exec_disable_arg_limit` in
+        *1*) lt_cv_sys_max_cmd_len=-1 ;;
+      esac
+    fi
+    ;;
+  sco3.2v5*)
+    lt_cv_sys_max_cmd_len=102400
+    ;;
+  sysv5* | sco5v6* | sysv4.2uw2*)
+    kargmax=`grep ARG_MAX /etc/conf/cf.d/stune 2>/dev/null`
+    if test -n "$kargmax"; then
+      lt_cv_sys_max_cmd_len=`echo $kargmax | sed 's/.*[[	 ]]//'`
+    else
+      lt_cv_sys_max_cmd_len=32768
+    fi
+    ;;
+  *)
+    lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
+    if test -n "$lt_cv_sys_max_cmd_len"; then
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
+    else
+      # Make teststring a little bigger before we do anything with it.
+      # a 1K string should be a reasonable start.
+      for i in 1 2 3 4 5 6 7 8 ; do
+        teststring=$teststring$teststring
+      done
+      SHELL=${SHELL-${CONFIG_SHELL-/bin/sh}}
+      # If test is not a shell built-in, we'll probably end up computing a
+      # maximum length that is only half of the actual maximum length, but
+      # we can't tell.
+      while { test "X"`env echo "$teststring$teststring" 2>/dev/null` \
+	         = "X$teststring$teststring"; } >/dev/null 2>&1 &&
+	      test $i != 17 # 1/2 MB should be enough
+      do
+        i=`expr $i + 1`
+        teststring=$teststring$teststring
+      done
+      # Only check the string length outside the loop.
+      lt_cv_sys_max_cmd_len=`expr "X$teststring" : ".*" 2>&1`
+      teststring=
+      # Add a significant safety factor because C++ compilers can tack on
+      # massive amounts of additional arguments before passing them to the
+      # linker.  It appears as though 1/2 is a usable value.
+      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 2`
+    fi
+    ;;
+  esac
+])
+if test -n $lt_cv_sys_max_cmd_len ; then
+  AC_MSG_RESULT($lt_cv_sys_max_cmd_len)
+else
+  AC_MSG_RESULT(none)
+fi
+max_cmd_len=$lt_cv_sys_max_cmd_len
+_LT_DECL([], [max_cmd_len], [0],
+    [What is the maximum length of a command?])
+])# LT_CMD_MAX_LEN
+
+# Old name:
+AU_ALIAS([AC_LIBTOOL_SYS_MAX_CMD_LEN], [LT_CMD_MAX_LEN])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_SYS_MAX_CMD_LEN], [])
+
+
+# _LT_HEADER_DLFCN
+# ----------------
+m4_defun([_LT_HEADER_DLFCN],
+[AC_CHECK_HEADERS([dlfcn.h], [], [], [AC_INCLUDES_DEFAULT])dnl
+])# _LT_HEADER_DLFCN
+
+
+# _LT_TRY_DLOPEN_SELF (ACTION-IF-TRUE, ACTION-IF-TRUE-W-USCORE,
+#                      ACTION-IF-FALSE, ACTION-IF-CROSS-COMPILING)
+# ----------------------------------------------------------------
+m4_defun([_LT_TRY_DLOPEN_SELF],
+[m4_require([_LT_HEADER_DLFCN])dnl
+if test "$cross_compiling" = yes; then :
+  [$4]
+else
+  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<_LT_EOF
+[#line $LINENO "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+/* When -fvisbility=hidden is used, assume the code has been annotated
+   correspondingly for the symbols needed.  */
+#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
+int fnord () __attribute__((visibility("default")));
+#endif
+
+int fnord () { return 42; }
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
+      else
+        {
+	  if (dlsym( self,"_fnord"))  status = $lt_dlneed_uscore;
+          else puts (dlerror ());
+	}
+      /* dlclose (self); */
+    }
+  else
+    puts (dlerror ());
+
+  return status;
+}]
+_LT_EOF
+  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) >&AS_MESSAGE_LOG_FD 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) $1 ;;
+      x$lt_dlneed_uscore) $2 ;;
+      x$lt_dlunknown|x*) $3 ;;
+    esac
+  else :
+    # compilation failed
+    $3
+  fi
+fi
+rm -fr conftest*
+])# _LT_TRY_DLOPEN_SELF
+
+
+# LT_SYS_DLOPEN_SELF
+# ------------------
+AC_DEFUN([LT_SYS_DLOPEN_SELF],
+[m4_require([_LT_HEADER_DLFCN])dnl
+if test "x$enable_dlopen" != xyes; then
+  enable_dlopen=unknown
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+else
+  lt_cv_dlopen=no
+  lt_cv_dlopen_libs=
+
+  case $host_os in
+  beos*)
+    lt_cv_dlopen="load_add_on"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+    ;;
+
+  mingw* | pw32* | cegcc*)
+    lt_cv_dlopen="LoadLibrary"
+    lt_cv_dlopen_libs=
+    ;;
+
+  cygwin*)
+    lt_cv_dlopen="dlopen"
+    lt_cv_dlopen_libs=
+    ;;
+
+  darwin*)
+  # if libdl is installed we need to link against it
+    AC_CHECK_LIB([dl], [dlopen],
+		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],[
+    lt_cv_dlopen="dyld"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+    ])
+    ;;
+
+  *)
+    AC_CHECK_FUNC([shl_load],
+	  [lt_cv_dlopen="shl_load"],
+      [AC_CHECK_LIB([dld], [shl_load],
+	    [lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"],
+	[AC_CHECK_FUNC([dlopen],
+	      [lt_cv_dlopen="dlopen"],
+	  [AC_CHECK_LIB([dl], [dlopen],
+		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],
+	    [AC_CHECK_LIB([svld], [dlopen],
+		  [lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"],
+	      [AC_CHECK_LIB([dld], [dld_link],
+		    [lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"])
+	      ])
+	    ])
+	  ])
+	])
+      ])
+    ;;
+  esac
+
+  if test "x$lt_cv_dlopen" != xno; then
+    enable_dlopen=yes
+  else
+    enable_dlopen=no
+  fi
+
+  case $lt_cv_dlopen in
+  dlopen)
+    save_CPPFLAGS="$CPPFLAGS"
+    test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+
+    save_LDFLAGS="$LDFLAGS"
+    wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+
+    save_LIBS="$LIBS"
+    LIBS="$lt_cv_dlopen_libs $LIBS"
+
+    AC_CACHE_CHECK([whether a program can dlopen itself],
+	  lt_cv_dlopen_self, [dnl
+	  _LT_TRY_DLOPEN_SELF(
+	    lt_cv_dlopen_self=yes, lt_cv_dlopen_self=yes,
+	    lt_cv_dlopen_self=no, lt_cv_dlopen_self=cross)
+    ])
+
+    if test "x$lt_cv_dlopen_self" = xyes; then
+      wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $lt_prog_compiler_static\"
+      AC_CACHE_CHECK([whether a statically linked program can dlopen itself],
+	  lt_cv_dlopen_self_static, [dnl
+	  _LT_TRY_DLOPEN_SELF(
+	    lt_cv_dlopen_self_static=yes, lt_cv_dlopen_self_static=yes,
+	    lt_cv_dlopen_self_static=no,  lt_cv_dlopen_self_static=cross)
+      ])
+    fi
+
+    CPPFLAGS="$save_CPPFLAGS"
+    LDFLAGS="$save_LDFLAGS"
+    LIBS="$save_LIBS"
+    ;;
+  esac
+
+  case $lt_cv_dlopen_self in
+  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+  *) enable_dlopen_self=unknown ;;
+  esac
+
+  case $lt_cv_dlopen_self_static in
+  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+  *) enable_dlopen_self_static=unknown ;;
+  esac
+fi
+_LT_DECL([dlopen_support], [enable_dlopen], [0],
+	 [Whether dlopen is supported])
+_LT_DECL([dlopen_self], [enable_dlopen_self], [0],
+	 [Whether dlopen of programs is supported])
+_LT_DECL([dlopen_self_static], [enable_dlopen_self_static], [0],
+	 [Whether dlopen of statically linked programs is supported])
+])# LT_SYS_DLOPEN_SELF
+
+# Old name:
+AU_ALIAS([AC_LIBTOOL_DLOPEN_SELF], [LT_SYS_DLOPEN_SELF])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_DLOPEN_SELF], [])
+
+
+# _LT_COMPILER_C_O([TAGNAME])
+# ---------------------------
+# Check to see if options -c and -o are simultaneously supported by compiler.
+# This macro does not hard code the compiler like AC_PROG_CC_C_O.
+m4_defun([_LT_COMPILER_C_O],
+[m4_require([_LT_DECL_SED])dnl
+m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_TAG_COMPILER])dnl
+AC_CACHE_CHECK([if $compiler supports -c -o file.$ac_objext],
+  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)],
+  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&AS_MESSAGE_LOG_FD
+   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       _LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes
+     fi
+   fi
+   chmod u+w . 2>&AS_MESSAGE_LOG_FD
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+])
+_LT_TAGDECL([compiler_c_o], [lt_cv_prog_compiler_c_o], [1],
+	[Does compiler simultaneously support -c and -o options?])
+])# _LT_COMPILER_C_O
+
+
+# _LT_COMPILER_FILE_LOCKS([TAGNAME])
+# ----------------------------------
+# Check to see if we can do hard links to lock some files if needed
+m4_defun([_LT_COMPILER_FILE_LOCKS],
+[m4_require([_LT_ENABLE_LOCK])dnl
+m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+_LT_COMPILER_C_O([$1])
+
+hard_links="nottested"
+if test "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  AC_MSG_CHECKING([if we can lock with hard links])
+  hard_links=yes
+  $RM conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  AC_MSG_RESULT([$hard_links])
+  if test "$hard_links" = no; then
+    AC_MSG_WARN([`$CC' does not support `-c -o', so `make -j' may be unsafe])
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+_LT_DECL([], [need_locks], [1], [Must we lock files when doing compilation?])
+])# _LT_COMPILER_FILE_LOCKS
+
+
+# _LT_CHECK_OBJDIR
+# ----------------
+m4_defun([_LT_CHECK_OBJDIR],
+[AC_CACHE_CHECK([for objdir], [lt_cv_objdir],
+[rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+  lt_cv_objdir=.libs
+else
+  # MS-DOS does not allow filenames that begin with a dot.
+  lt_cv_objdir=_libs
+fi
+rmdir .libs 2>/dev/null])
+objdir=$lt_cv_objdir
+_LT_DECL([], [objdir], [0],
+         [The name of the directory that contains temporary libtool files])dnl
+m4_pattern_allow([LT_OBJDIR])dnl
+AC_DEFINE_UNQUOTED(LT_OBJDIR, "$lt_cv_objdir/",
+  [Define to the sub-directory in which libtool stores uninstalled libraries.])
+])# _LT_CHECK_OBJDIR
+
+
+# _LT_LINKER_HARDCODE_LIBPATH([TAGNAME])
+# --------------------------------------
+# Check hardcoding attributes.
+m4_defun([_LT_LINKER_HARDCODE_LIBPATH],
+[AC_MSG_CHECKING([how to hardcode library paths into programs])
+_LT_TAGVAR(hardcode_action, $1)=
+if test -n "$_LT_TAGVAR(hardcode_libdir_flag_spec, $1)" ||
+   test -n "$_LT_TAGVAR(runpath_var, $1)" ||
+   test "X$_LT_TAGVAR(hardcode_automatic, $1)" = "Xyes" ; then
+
+  # We can hardcode non-existent directories.
+  if test "$_LT_TAGVAR(hardcode_direct, $1)" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, $1)" != no &&
+     test "$_LT_TAGVAR(hardcode_minus_L, $1)" != no; then
+    # Linking always hardcodes the temporary library directory.
+    _LT_TAGVAR(hardcode_action, $1)=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    _LT_TAGVAR(hardcode_action, $1)=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  _LT_TAGVAR(hardcode_action, $1)=unsupported
+fi
+AC_MSG_RESULT([$_LT_TAGVAR(hardcode_action, $1)])
+
+if test "$_LT_TAGVAR(hardcode_action, $1)" = relink ||
+   test "$_LT_TAGVAR(inherit_rpath, $1)" = yes; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
+_LT_TAGDECL([], [hardcode_action], [0],
+    [How to hardcode a shared library path into an executable])
+])# _LT_LINKER_HARDCODE_LIBPATH
+
+
+# _LT_CMD_STRIPLIB
+# ----------------
+m4_defun([_LT_CMD_STRIPLIB],
+[m4_require([_LT_DECL_EGREP])
+striplib=
+old_striplib=
+AC_MSG_CHECKING([whether stripping libraries is possible])
+if test -n "$STRIP" && $STRIP -V 2>&1 | $GREP "GNU strip" >/dev/null; then
+  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+  AC_MSG_RESULT([yes])
+else
+# FIXME - insert some real tests, host_os isn't really good enough
+  case $host_os in
+  darwin*)
+    if test -n "$STRIP" ; then
+      striplib="$STRIP -x"
+      old_striplib="$STRIP -S"
+      AC_MSG_RESULT([yes])
+    else
+      AC_MSG_RESULT([no])
+    fi
+    ;;
+  *)
+    AC_MSG_RESULT([no])
+    ;;
+  esac
+fi
+_LT_DECL([], [old_striplib], [1], [Commands to strip libraries])
+_LT_DECL([], [striplib], [1])
+])# _LT_CMD_STRIPLIB
+
+
+# _LT_SYS_DYNAMIC_LINKER([TAG])
+# -----------------------------
+# PORTME Fill in your ld.so characteristics
+m4_defun([_LT_SYS_DYNAMIC_LINKER],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+m4_require([_LT_DECL_EGREP])dnl
+m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_DECL_OBJDUMP])dnl
+m4_require([_LT_DECL_SED])dnl
+m4_require([_LT_CHECK_SHELL_FEATURES])dnl
+AC_MSG_CHECKING([dynamic linker characteristics])
+m4_if([$1],
+	[], [
+if test "$GCC" = yes; then
+  case $host_os in
+    darwin*) lt_awk_arg="/^libraries:/,/LR/" ;;
+    *) lt_awk_arg="/^libraries:/" ;;
+  esac
+  case $host_os in
+    mingw* | cegcc*) lt_sed_strip_eq="s,=\([[A-Za-z]]:\),\1,g" ;;
+    *) lt_sed_strip_eq="s,=/,/,g" ;;
+  esac
+  lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq`
+  case $lt_search_path_spec in
+  *\;*)
+    # if the path contains ";" then we assume it to be the separator
+    # otherwise default to the standard path separator (i.e. ":") - it is
+    # assumed that no part of a normal pathname contains ";" but that should
+    # okay in the real world where ";" in dirpaths is itself problematic.
+    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED 's/;/ /g'`
+    ;;
+  *)
+    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED "s/$PATH_SEPARATOR/ /g"`
+    ;;
+  esac
+  # Ok, now we have the path, separated by spaces, we can step through it
+  # and add multilib dir if necessary.
+  lt_tmp_lt_search_path_spec=
+  lt_multi_os_dir=`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null`
+  for lt_sys_path in $lt_search_path_spec; do
+    if test -d "$lt_sys_path/$lt_multi_os_dir"; then
+      lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path/$lt_multi_os_dir"
+    else
+      test -d "$lt_sys_path" && \
+	lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path"
+    fi
+  done
+  lt_search_path_spec=`$ECHO "$lt_tmp_lt_search_path_spec" | awk '
+BEGIN {RS=" "; FS="/|\n";} {
+  lt_foo="";
+  lt_count=0;
+  for (lt_i = NF; lt_i > 0; lt_i--) {
+    if ($lt_i != "" && $lt_i != ".") {
+      if ($lt_i == "..") {
+        lt_count++;
+      } else {
+        if (lt_count == 0) {
+          lt_foo="/" $lt_i lt_foo;
+        } else {
+          lt_count--;
+        }
+      }
+    }
+  }
+  if (lt_foo != "") { lt_freq[[lt_foo]]++; }
+  if (lt_freq[[lt_foo]] == 1) { print lt_foo; }
+}'`
+  # AWK program above erroneously prepends '/' to C:/dos/paths
+  # for these hosts.
+  case $host_os in
+    mingw* | cegcc*) lt_search_path_spec=`$ECHO "$lt_search_path_spec" |\
+      $SED 's,/\([[A-Za-z]]:\),\1,g'` ;;
+  esac
+  sys_lib_search_path_spec=`$ECHO "$lt_search_path_spec" | $lt_NL2SP`
+else
+  sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+fi])
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+shrext_cmds=".so"
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+need_lib_prefix=unknown
+hardcode_into_libs=no
+
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+need_version=unknown
+
+case $host_os in
+aix3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX 3 has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}${shared_ext}$major'
+  ;;
+
+aix[[4-9]]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[[01]] | aix4.[[01]].*)
+      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	   echo ' yes '
+	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
+	:
+      else
+	can_build_shared=no
+      fi
+      ;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # soname into executable. Probably we can add versioning support to
+    # collect2, so additional links can be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}${shared_ext}$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  ;;
+
+amigaos*)
+  case $host_cpu in
+  powerpc)
+    # Since July 2007 AmigaOS4 officially supports .so libraries.
+    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    ;;
+  m68k)
+    library_names_spec='$libname.ixlibrary $libname.a'
+    # Create ${libname}_ixlibrary.a entries in /sys/libs.
+    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
+    ;;
+  esac
+  ;;
+
+beos*)
+  library_names_spec='${libname}${shared_ext}'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi[[45]]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32* | cegcc*)
+  version_type=windows
+  shrext_cmds=".dll"
+  need_version=no
+  need_lib_prefix=no
+
+  case $GCC,$cc_basename in
+  yes,*)
+    # gcc
+    library_names_spec='$libname.dll.a'
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname~
+      chmod a+x \$dldir/$dlname~
+      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
+        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
+      fi'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+
+    case $host_os in
+    cygwin*)
+      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
+      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+m4_if([$1], [],[
+      sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/lib/w32api"])
+      ;;
+    mingw* | cegcc*)
+      # MinGW DLLs use traditional 'lib' prefix
+      soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    pw32*)
+      # pw32 DLLs use 'pw' prefix rather than 'lib'
+      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    esac
+    dynamic_linker='Win32 ld.exe'
+    ;;
+
+  *,cl*)
+    # Native MSVC
+    libname_spec='$name'
+    soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+    library_names_spec='${libname}.dll.lib'
+
+    case $build_os in
+    mingw*)
+      sys_lib_search_path_spec=
+      lt_save_ifs=$IFS
+      IFS=';'
+      for lt_path in $LIB
+      do
+        IFS=$lt_save_ifs
+        # Let DOS variable expansion print the short 8.3 style file name.
+        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
+        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
+      done
+      IFS=$lt_save_ifs
+      # Convert to MSYS style.
+      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([[a-zA-Z]]\\):| /\\1|g' -e 's|^ ||'`
+      ;;
+    cygwin*)
+      # Convert to unix form, then to dos form, then back to unix form
+      # but this time dos style (no spaces!) so that the unix form looks
+      # like /cygdrive/c/PROGRA~1:/cygdr...
+      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
+      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
+      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      ;;
+    *)
+      sys_lib_search_path_spec="$LIB"
+      if $ECHO "$sys_lib_search_path_spec" | [$GREP ';[c-zC-Z]:/' >/dev/null]; then
+        # It is most probably a Windows format PATH.
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
+      else
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      fi
+      # FIXME: find the short name or the path components, as spaces are
+      # common. (e.g. "Program Files" -> "PROGRA~1")
+      ;;
+    esac
+
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+    dynamic_linker='Win32 link.exe'
+    ;;
+
+  *)
+    # Assume MSVC wrapper
+    library_names_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext} $libname.lib'
+    dynamic_linker='Win32 ld.exe'
+    ;;
+  esac
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
+  soname_spec='${libname}${release}${major}$shared_ext'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
+m4_if([$1], [],[
+  sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/local/lib"])
+  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
+  ;;
+
+dgux*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+freebsd* | dragonfly*)
+  # DragonFly does not have aout.  When/if they implement a new
+  # versioning mechanism, adjust this.
+  if test -x /usr/bin/objformat; then
+    objformat=`/usr/bin/objformat`
+  else
+    case $host_os in
+    freebsd[[23]].*) objformat=aout ;;
+    *) objformat=elf ;;
+    esac
+  fi
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2.*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  freebsd3.[[01]]* | freebsdelf3.[[01]]*)
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  freebsd3.[[2-9]]* | freebsdelf3.[[2-9]]* | \
+  freebsd4.[[0-5]] | freebsdelf4.[[0-5]] | freebsd4.1.1 | freebsdelf4.1.1)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  *) # from 4.6 on, and DragonFly
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+haiku*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  dynamic_linker="$host_os runtime_loader"
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  case $host_cpu in
+  ia64*)
+    shrext_cmds='.so'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.so"
+    shlibpath_var=LD_LIBRARY_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    if test "X$HPUX_IA64_MODE" = X32; then
+      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
+    else
+      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
+    fi
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  hppa*64*)
+    shrext_cmds='.sl'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  *)
+    shrext_cmds='.sl'
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=SHLIB_PATH
+    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    ;;
+  esac
+  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
+  postinstall_cmds='chmod 555 $lib'
+  # or fails outright, so override atomically:
+  install_override_mode=555
+  ;;
+
+interix[[3-9]]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)
+	if test "$lt_cv_prog_gnu_ld" = yes; then
+		version_type=linux # correct to gnu/linux during the next big refactor
+	else
+		version_type=irix
+	fi ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
+      libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
+      libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
+      libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  hardcode_into_libs=yes
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux*oldld* | linux*aout* | linux*coff*)
+  dynamic_linker=no
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+
+  # Some binutils ld are patched to set DT_RUNPATH
+  AC_CACHE_VAL([lt_cv_shlibpath_overrides_runpath],
+    [lt_cv_shlibpath_overrides_runpath=no
+    save_LDFLAGS=$LDFLAGS
+    save_libdir=$libdir
+    eval "libdir=/foo; wl=\"$_LT_TAGVAR(lt_prog_compiler_wl, $1)\"; \
+	 LDFLAGS=\"\$LDFLAGS $_LT_TAGVAR(hardcode_libdir_flag_spec, $1)\""
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
+      [AS_IF([ ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null],
+	 [lt_cv_shlibpath_overrides_runpath=yes])])
+    LDFLAGS=$save_LDFLAGS
+    libdir=$save_libdir
+    ])
+  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
+
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # Append ld.so.conf contents to the search path
+  if test -f /etc/ld.so.conf; then
+    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \[$]2)); skip = 1; } { if (!skip) print \[$]0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
+    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
+  fi
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+  ;;
+
+netbsdelf*-gnu)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='NetBSD ld.elf_so'
+  ;;
+
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+*nto* | *qnx*)
+  version_type=qnx
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='ldqnx.so'
+  ;;
+
+openbsd*)
+  version_type=sunos
+  sys_lib_dlsearch_path_spec="/usr/lib"
+  need_lib_prefix=no
+  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
+  case $host_os in
+    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
+    *)				need_version=no  ;;
+  esac
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case $host_os in
+      openbsd2.[[89]] | openbsd2.[[89]].*)
+	shlibpath_overrides_runpath=no
+	;;
+      *)
+	shlibpath_overrides_runpath=yes
+	;;
+      esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  ;;
+
+os2*)
+  libname_spec='$name'
+  shrext_cmds=".dll"
+  need_lib_prefix=no
+  library_names_spec='$libname${shared_ext} $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  ;;
+
+rdos*)
+  dynamic_linker=no
+  ;;
+
+solaris*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux # correct to gnu/linux during the next big refactor
+    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
+    soname_spec='$libname${shared_ext}.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  version_type=freebsd-elf
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  if test "$with_gnu_ld" = yes; then
+    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
+  else
+    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
+    case $host_os in
+      sco3.2v5*)
+        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
+	;;
+    esac
+  fi
+  sys_lib_dlsearch_path_spec='/usr/lib'
+  ;;
+
+tpf*)
+  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
+
+uts4*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+AC_MSG_RESULT([$dynamic_linker])
+test "$dynamic_linker" = no && can_build_shared=no
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$GCC" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
+  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
+fi
+if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
+  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
+fi
+
+_LT_DECL([], [variables_saved_for_relink], [1],
+    [Variables whose values should be saved in libtool wrapper scripts and
+    restored at link time])
+_LT_DECL([], [need_lib_prefix], [0],
+    [Do we need the "lib" prefix for modules?])
+_LT_DECL([], [need_version], [0], [Do we need a version for libraries?])
+_LT_DECL([], [version_type], [0], [Library versioning type])
+_LT_DECL([], [runpath_var], [0],  [Shared library runtime path variable])
+_LT_DECL([], [shlibpath_var], [0],[Shared library path variable])
+_LT_DECL([], [shlibpath_overrides_runpath], [0],
+    [Is shlibpath searched before the hard-coded library search path?])
+_LT_DECL([], [libname_spec], [1], [Format of library name prefix])
+_LT_DECL([], [library_names_spec], [1],
+    [[List of archive names.  First name is the real one, the rest are links.
+    The last name is the one that the linker finds with -lNAME]])
+_LT_DECL([], [soname_spec], [1],
+    [[The coded name of the library, if different from the real name]])
+_LT_DECL([], [install_override_mode], [1],
+    [Permission mode override for installation of shared libraries])
+_LT_DECL([], [postinstall_cmds], [2],
+    [Command to use after installation of a shared archive])
+_LT_DECL([], [postuninstall_cmds], [2],
+    [Command to use after uninstallation of a shared archive])
+_LT_DECL([], [finish_cmds], [2],
+    [Commands used to finish a libtool library installation in a directory])
+_LT_DECL([], [finish_eval], [1],
+    [[As "finish_cmds", except a single script fragment to be evaled but
+    not shown]])
+_LT_DECL([], [hardcode_into_libs], [0],
+    [Whether we should hardcode library paths into libraries])
+_LT_DECL([], [sys_lib_search_path_spec], [2],
+    [Compile-time system search path for libraries])
+_LT_DECL([], [sys_lib_dlsearch_path_spec], [2],
+    [Run-time system search path for libraries])
+])# _LT_SYS_DYNAMIC_LINKER
+
+
+# _LT_PATH_TOOL_PREFIX(TOOL)
+# --------------------------
+# find a file program which can recognize shared library
+AC_DEFUN([_LT_PATH_TOOL_PREFIX],
+[m4_require([_LT_DECL_EGREP])dnl
+AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+[[\\/*] |  ?:[\\/]*])
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+*)
+  lt_save_MAGIC_CMD="$MAGIC_CMD"
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word.  This closes a longstanding sh security hole.
+  ac_dummy="m4_if([$2], , $PATH, [$2])"
+  for ac_dir in $ac_dummy; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$1; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"`
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    $EGREP "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<_LT_EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+_LT_EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$lt_save_ifs"
+  MAGIC_CMD="$lt_save_MAGIC_CMD"
+  ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  AC_MSG_RESULT($MAGIC_CMD)
+else
+  AC_MSG_RESULT(no)
+fi
+_LT_DECL([], [MAGIC_CMD], [0],
+	 [Used to examine libraries when file_magic_cmd begins with "file"])dnl
+])# _LT_PATH_TOOL_PREFIX
+
+# Old name:
+AU_ALIAS([AC_PATH_TOOL_PREFIX], [_LT_PATH_TOOL_PREFIX])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_PATH_TOOL_PREFIX], [])
+
+
+# _LT_PATH_MAGIC
+# --------------
+# find a file program which can recognize a shared library
+m4_defun([_LT_PATH_MAGIC],
+[_LT_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin$PATH_SEPARATOR$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    _LT_PATH_TOOL_PREFIX(file, /usr/bin$PATH_SEPARATOR$PATH)
+  else
+    MAGIC_CMD=:
+  fi
+fi
+])# _LT_PATH_MAGIC
+
+
+# LT_PATH_LD
+# ----------
+# find the pathname to the GNU or non-GNU linker
+AC_DEFUN([LT_PATH_LD],
+[AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+m4_require([_LT_DECL_SED])dnl
+m4_require([_LT_DECL_EGREP])dnl
+m4_require([_LT_PROG_ECHO_BACKSLASH])dnl
+
+AC_ARG_WITH([gnu-ld],
+    [AS_HELP_STRING([--with-gnu-ld],
+	[assume the C compiler uses GNU ld @<:@default=no@:>@])],
+    [test "$withval" = no || with_gnu_ld=yes],
+    [with_gnu_ld=no])dnl
+
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  AC_MSG_CHECKING([for ld used by $CC])
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [[\\/]]* | ?:[[\\/]]*)
+      re_direlt='/[[^/]][[^/]]*/\.\./'
+      # Canonicalize the pathname of ld
+      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
+      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
+	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  AC_MSG_CHECKING([for GNU ld])
+else
+  AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some variants of GNU ld only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
+      *GNU* | *'with BFD'*)
+	test "$with_gnu_ld" != no && break
+	;;
+      *)
+	test "$with_gnu_ld" != yes && break
+	;;
+      esac
+    fi
+  done
+  IFS="$lt_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  AC_MSG_RESULT($LD)
+else
+  AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+_LT_PATH_LD_GNU
+AC_SUBST([LD])
+
+_LT_TAGDECL([], [LD], [1], [The linker used to build libraries])
+])# LT_PATH_LD
+
+# Old names:
+AU_ALIAS([AM_PROG_LD], [LT_PATH_LD])
+AU_ALIAS([AC_PROG_LD], [LT_PATH_LD])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AM_PROG_LD], [])
+dnl AC_DEFUN([AC_PROG_LD], [])
+
+
+# _LT_PATH_LD_GNU
+#- --------------
+m4_defun([_LT_PATH_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU lds only accept -v.
+case `$LD -v 2>&1 </dev/null` in
+*GNU* | *'with BFD'*)
+  lt_cv_prog_gnu_ld=yes
+  ;;
+*)
+  lt_cv_prog_gnu_ld=no
+  ;;
+esac])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])# _LT_PATH_LD_GNU
+
+
+# _LT_CMD_RELOAD
+# --------------
+# find reload flag for linker
+#   -- PORTME Some linkers may need a different reload flag.
+m4_defun([_LT_CMD_RELOAD],
+[AC_CACHE_CHECK([for $LD option to reload object files],
+  lt_cv_ld_reload_flag,
+  [lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+case $reload_flag in
+"" | " "*) ;;
+*) reload_flag=" $reload_flag" ;;
+esac
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+case $host_os in
+  cygwin* | mingw* | pw32* | cegcc*)
+    if test "$GCC" != yes; then
+      reload_cmds=false
+    fi
+    ;;
+  darwin*)
+    if test "$GCC" = yes; then
+      reload_cmds='$LTCC $LTCFLAGS -nostdlib ${wl}-r -o $output$reload_objs'
+    else
+      reload_cmds='$LD$reload_flag -o $output$reload_objs'
+    fi
+    ;;
+esac
+_LT_TAGDECL([], [reload_flag], [1], [How to create reloadable object files])dnl
+_LT_TAGDECL([], [reload_cmds], [2])dnl
+])# _LT_CMD_RELOAD
+
+
+# _LT_CHECK_MAGIC_METHOD
+# ----------------------
+# how to check for library dependencies
+#  -- PORTME fill in with the dynamic library characteristics
+m4_defun([_LT_CHECK_MAGIC_METHOD],
+[m4_require([_LT_DECL_EGREP])
+m4_require([_LT_DECL_OBJDUMP])
+AC_CACHE_CHECK([how to recognize dependent libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [[regex]]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given extended regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix[[4-9]]*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi[[45]]*)
+  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib)'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin*)
+  # func_win32_libid is a shell function defined in ltmain.sh
+  lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
+  lt_cv_file_magic_cmd='func_win32_libid'
+  ;;
+
+mingw* | pw32*)
+  # Base MSYS/MinGW do not provide the 'file' command needed by
+  # func_win32_libid shell function, so use a weaker test based on 'objdump',
+  # unless we find 'file', for example because we are cross-compiling.
+  # func_win32_libid assumes BSD nm, so disallow it if using MS dumpbin.
+  if ( test "$lt_cv_nm_interface" = "BSD nm" && file / ) >/dev/null 2>&1; then
+    lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
+    lt_cv_file_magic_cmd='func_win32_libid'
+  else
+    # Keep this pattern in sync with the one in func_win32_libid.
+    lt_cv_deplibs_check_method='file_magic file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)'
+    lt_cv_file_magic_cmd='$OBJDUMP -f'
+  fi
+  ;;
+
+cegcc*)
+  # use the weaker test based on 'objdump'. See mingw*.
+  lt_cv_deplibs_check_method='file_magic file format pe-arm-.*little(.*architecture: arm)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+freebsd* | dragonfly*)
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD|DragonFly)/i[[3-9]]86 (compact )?demand paged shared library'
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+haiku*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20* | hpux11*)
+  lt_cv_file_magic_cmd=/usr/bin/file
+  case $host_cpu in
+  ia64*)
+    lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|ELF-[[0-9]][[0-9]]) shared object file - IA64'
+    lt_cv_file_magic_test_file=/usr/lib/hpux32/libc.so
+    ;;
+  hppa*64*)
+    [lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|ELF[ -][0-9][0-9])(-bit)?( [LM]SB)? shared object( file)?[, -]* PA-RISC [0-9]\.[0-9]']
+    lt_cv_file_magic_test_file=/usr/lib/pa20_64/libc.sl
+    ;;
+  *)
+    lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|PA-RISC[[0-9]]\.[[0-9]]) shared library'
+    lt_cv_file_magic_test_file=/usr/lib/libc.sl
+    ;;
+  esac
+  ;;
+
+interix[[3-9]]*)
+  # PIC code is broken on Interix 3.x, that's why |\.a not |_pic\.a here
+  lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|\.a)$'
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $LD in
+  *-32|*"-32 ") libmagic=32-bit;;
+  *-n32|*"-n32 ") libmagic=N32;;
+  *-64|*"-64 ") libmagic=64-bit;;
+  *) libmagic=never-match;;
+  esac
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+netbsd* | netbsdelf*-gnu)
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|_pic\.a)$'
+  fi
+  ;;
+
+newos6*)
+  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (executable|dynamic lib)'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+*nto* | *qnx*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+openbsd*)
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|\.so|_pic\.a)$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
+  fi
+  ;;
+
+osf3* | osf4* | osf5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+rdos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.3*)
+  case $host_vendor in
+  motorola)
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib) M[[0-9]][[0-9]]* Version [[0-9]]'
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  sequent)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB (shared object|dynamic lib )'
+    ;;
+  sni)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method="file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB dynamic lib"
+    lt_cv_file_magic_test_file=/lib/libc.so
+    ;;
+  siemens)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  pc)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  esac
+  ;;
+
+tpf*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+esac
+])
+
+file_magic_glob=
+want_nocaseglob=no
+if test "$build" = "$host"; then
+  case $host_os in
+  mingw* | pw32*)
+    if ( shopt | grep nocaseglob ) >/dev/null 2>&1; then
+      want_nocaseglob=yes
+    else
+      file_magic_glob=`echo aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ | $SED -e "s/\(..\)/s\/[[\1]]\/[[\1]]\/g;/g"`
+    fi
+    ;;
+  esac
+fi
+
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+_LT_DECL([], [deplibs_check_method], [1],
+    [Method to check whether dependent libraries are shared objects])
+_LT_DECL([], [file_magic_cmd], [1],
+    [Command to use when deplibs_check_method = "file_magic"])
+_LT_DECL([], [file_magic_glob], [1],
+    [How to find potential files when deplibs_check_method = "file_magic"])
+_LT_DECL([], [want_nocaseglob], [1],
+    [Find potential files using nocaseglob when deplibs_check_method = "file_magic"])
+])# _LT_CHECK_MAGIC_METHOD
+
+
+# LT_PATH_NM
+# ----------
+# find the pathname to a BSD- or MS-compatible name lister
+AC_DEFUN([LT_PATH_NM],
+[AC_REQUIRE([AC_PROG_CC])dnl
+AC_CACHE_CHECK([for BSD- or MS-compatible name lister (nm)], lt_cv_path_NM,
+[if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  lt_nm_to_check="${ac_tool_prefix}nm"
+  if test -n "$ac_tool_prefix" && test "$build" = "$host"; then
+    lt_nm_to_check="$lt_nm_to_check nm"
+  fi
+  for lt_tmp_nm in $lt_nm_to_check; do
+    lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+    for ac_dir in $PATH /usr/ccs/bin/elf /usr/ccs/bin /usr/ucb /bin; do
+      IFS="$lt_save_ifs"
+      test -z "$ac_dir" && ac_dir=.
+      tmp_nm="$ac_dir/$lt_tmp_nm"
+      if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext" ; then
+	# Check to see if the nm accepts a BSD-compat flag.
+	# Adding the `sed 1q' prevents false positives on HP-UX, which says:
+	#   nm: unknown option "B" ignored
+	# Tru64's nm complains that /dev/null is an invalid object file
+	case `"$tmp_nm" -B /dev/null 2>&1 | sed '1q'` in
+	*/dev/null* | *'Invalid file or object type'*)
+	  lt_cv_path_NM="$tmp_nm -B"
+	  break
+	  ;;
+	*)
+	  case `"$tmp_nm" -p /dev/null 2>&1 | sed '1q'` in
+	  */dev/null*)
+	    lt_cv_path_NM="$tmp_nm -p"
+	    break
+	    ;;
+	  *)
+	    lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+	    continue # so that we can try to find one that supports BSD flags
+	    ;;
+	  esac
+	  ;;
+	esac
+      fi
+    done
+    IFS="$lt_save_ifs"
+  done
+  : ${lt_cv_path_NM=no}
+fi])
+if test "$lt_cv_path_NM" != "no"; then
+  NM="$lt_cv_path_NM"
+else
+  # Didn't find any BSD compatible name lister, look for dumpbin.
+  if test -n "$DUMPBIN"; then :
+    # Let the user override the test.
+  else
+    AC_CHECK_TOOLS(DUMPBIN, [dumpbin "link -dump"], :)
+    case `$DUMPBIN -symbols /dev/null 2>&1 | sed '1q'` in
+    *COFF*)
+      DUMPBIN="$DUMPBIN -symbols"
+      ;;
+    *)
+      DUMPBIN=:
+      ;;
+    esac
+  fi
+  AC_SUBST([DUMPBIN])
+  if test "$DUMPBIN" != ":"; then
+    NM="$DUMPBIN"
+  fi
+fi
+test -z "$NM" && NM=nm
+AC_SUBST([NM])
+_LT_DECL([], [NM], [1], [A BSD- or MS-compatible name lister])dnl
+
+AC_CACHE_CHECK([the name lister ($NM) interface], [lt_cv_nm_interface],
+  [lt_cv_nm_interface="BSD nm"
+  echo "int some_variable = 0;" > conftest.$ac_ext
+  (eval echo "\"\$as_me:$LINENO: $ac_compile\"" >&AS_MESSAGE_LOG_FD)
+  (eval "$ac_compile" 2>conftest.err)
+  cat conftest.err >&AS_MESSAGE_LOG_FD
+  (eval echo "\"\$as_me:$LINENO: $NM \\\"conftest.$ac_objext\\\"\"" >&AS_MESSAGE_LOG_FD)
+  (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out)
+  cat conftest.err >&AS_MESSAGE_LOG_FD
+  (eval echo "\"\$as_me:$LINENO: output\"" >&AS_MESSAGE_LOG_FD)
+  cat conftest.out >&AS_MESSAGE_LOG_FD
+  if $GREP 'External.*some_variable' conftest.out > /dev/null; then
+    lt_cv_nm_interface="MS dumpbin"
+  fi
+  rm -f conftest*])
+])# LT_PATH_NM
+
+# Old names:
+AU_ALIAS([AM_PROG_NM], [LT_PATH_NM])
+AU_ALIAS([AC_PROG_NM], [LT_PATH_NM])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AM_PROG_NM], [])
+dnl AC_DEFUN([AC_PROG_NM], [])
+
+# _LT_CHECK_SHAREDLIB_FROM_LINKLIB
+# --------------------------------
+# how to determine the name of the shared library
+# associated with a specific link library.
+#  -- PORTME fill in with the dynamic library characteristics
+m4_defun([_LT_CHECK_SHAREDLIB_FROM_LINKLIB],
+[m4_require([_LT_DECL_EGREP])
+m4_require([_LT_DECL_OBJDUMP])
+m4_require([_LT_DECL_DLLTOOL])
+AC_CACHE_CHECK([how to associate runtime and link libraries],
+lt_cv_sharedlib_from_linklib_cmd,
+[lt_cv_sharedlib_from_linklib_cmd='unknown'
+
+case $host_os in
+cygwin* | mingw* | pw32* | cegcc*)
+  # two different shell functions defined in ltmain.sh
+  # decide which to use based on capabilities of $DLLTOOL
+  case `$DLLTOOL --help 2>&1` in
+  *--identify-strict*)
+    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib
+    ;;
+  *)
+    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib_fallback
+    ;;
+  esac
+  ;;
+*)
+  # fallback: assume linklib IS sharedlib
+  lt_cv_sharedlib_from_linklib_cmd="$ECHO"
+  ;;
+esac
+])
+sharedlib_from_linklib_cmd=$lt_cv_sharedlib_from_linklib_cmd
+test -z "$sharedlib_from_linklib_cmd" && sharedlib_from_linklib_cmd=$ECHO
+
+_LT_DECL([], [sharedlib_from_linklib_cmd], [1],
+    [Command to associate shared and link libraries])
+])# _LT_CHECK_SHAREDLIB_FROM_LINKLIB
+
+
+# _LT_PATH_MANIFEST_TOOL
+# ----------------------
+# locate the manifest tool
+m4_defun([_LT_PATH_MANIFEST_TOOL],
+[AC_CHECK_TOOL(MANIFEST_TOOL, mt, :)
+test -z "$MANIFEST_TOOL" && MANIFEST_TOOL=mt
+AC_CACHE_CHECK([if $MANIFEST_TOOL is a manifest tool], [lt_cv_path_mainfest_tool],
+  [lt_cv_path_mainfest_tool=no
+  echo "$as_me:$LINENO: $MANIFEST_TOOL '-?'" >&AS_MESSAGE_LOG_FD
+  $MANIFEST_TOOL '-?' 2>conftest.err > conftest.out
+  cat conftest.err >&AS_MESSAGE_LOG_FD
+  if $GREP 'Manifest Tool' conftest.out > /dev/null; then
+    lt_cv_path_mainfest_tool=yes
+  fi
+  rm -f conftest*])
+if test "x$lt_cv_path_mainfest_tool" != xyes; then
+  MANIFEST_TOOL=:
+fi
+_LT_DECL([], [MANIFEST_TOOL], [1], [Manifest tool])dnl
+])# _LT_PATH_MANIFEST_TOOL
+
+
+# LT_LIB_M
+# --------
+# check for math library
+AC_DEFUN([LT_LIB_M],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cegcc* | *-*-cygwin* | *-*-haiku* | *-*-pw32* | *-*-darwin*)
+  # These system don't have libm, or don't need it
+  ;;
+*-ncr-sysv4.3*)
+  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+  AC_CHECK_LIB(m, cos, LIBM="$LIBM -lm")
+  ;;
+*)
+  AC_CHECK_LIB(m, cos, LIBM="-lm")
+  ;;
+esac
+AC_SUBST([LIBM])
+])# LT_LIB_M
+
+# Old name:
+AU_ALIAS([AC_CHECK_LIBM], [LT_LIB_M])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_CHECK_LIBM], [])
+
+
+# _LT_COMPILER_NO_RTTI([TAGNAME])
+# -------------------------------
+m4_defun([_LT_COMPILER_NO_RTTI],
+[m4_require([_LT_TAG_COMPILER])dnl
+
+_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
+
+if test "$GCC" = yes; then
+  case $cc_basename in
+  nvcc*)
+    _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -Xcompiler -fno-builtin' ;;
+  *)
+    _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin' ;;
+  esac
+
+  _LT_COMPILER_OPTION([if $compiler supports -fno-rtti -fno-exceptions],
+    lt_cv_prog_compiler_rtti_exceptions,
+    [-fno-rtti -fno-exceptions], [],
+    [_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)="$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1) -fno-rtti -fno-exceptions"])
+fi
+_LT_TAGDECL([no_builtin_flag], [lt_prog_compiler_no_builtin_flag], [1],
+	[Compiler flag to turn off builtin functions])
+])# _LT_COMPILER_NO_RTTI
+
+
+# _LT_CMD_GLOBAL_SYMBOLS
+# ----------------------
+m4_defun([_LT_CMD_GLOBAL_SYMBOLS],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_AWK])dnl
+AC_REQUIRE([LT_PATH_NM])dnl
+AC_REQUIRE([LT_PATH_LD])dnl
+m4_require([_LT_DECL_SED])dnl
+m4_require([_LT_DECL_EGREP])dnl
+m4_require([_LT_TAG_COMPILER])dnl
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+AC_MSG_CHECKING([command to parse $NM output from $compiler object])
+AC_CACHE_VAL([lt_cv_sys_global_symbol_pipe],
+[
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[[BCDEGRST]]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([[_A-Za-z]][[_A-Za-z0-9]]*\)'
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+  symcode='[[BCDT]]'
+  ;;
+cygwin* | mingw* | pw32* | cegcc*)
+  symcode='[[ABCDGISTW]]'
+  ;;
+hpux*)
+  if test "$host_cpu" = ia64; then
+    symcode='[[ABCDEGRST]]'
+  fi
+  ;;
+irix* | nonstopux*)
+  symcode='[[BCDEGRST]]'
+  ;;
+osf*)
+  symcode='[[BCDEGQRST]]'
+  ;;
+solaris*)
+  symcode='[[BDRT]]'
+  ;;
+sco3.2v5*)
+  symcode='[[DT]]'
+  ;;
+sysv4.2uw2*)
+  symcode='[[DT]]'
+  ;;
+sysv5* | sco5v6* | unixware* | OpenUNIX*)
+  symcode='[[ABDT]]'
+  ;;
+sysv4)
+  symcode='[[DFNSTU]]'
+  ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+case `$NM -V 2>&1` in
+*GNU* | *'with BFD'*)
+  symcode='[[ABCDGIRSTW]]' ;;
+esac
+
+# Transform an extracted symbol line into a proper C declaration.
+# Some systems (esp. on ia64) link data and code symbols differently,
+# so use this general approach.
+lt_cv_sys_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern int \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+
+# Transform an extracted symbol line into symbol name and symbol address
+lt_cv_sys_global_symbol_to_c_name_address="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p'"
+lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \(lib[[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"lib\2\", (void *) \&\2},/p'"
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $build_os in
+mingw*)
+  opt_cr=`$ECHO 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+  ;;
+esac
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+  # Transform symcode, sympat, and symprfx into a raw symbol and a C symbol.
+  symxfrm="\\1 $ac_symprfx\\2 \\2"
+
+  # Write the raw and C identifiers.
+  if test "$lt_cv_nm_interface" = "MS dumpbin"; then
+    # Fake it for dumpbin and say T for any non-static function
+    # and D for any global variable.
+    # Also find C++ and __fastcall symbols from MSVC++,
+    # which start with @ or ?.
+    lt_cv_sys_global_symbol_pipe="$AWK ['"\
+"     {last_section=section; section=\$ 3};"\
+"     /^COFF SYMBOL TABLE/{for(i in hide) delete hide[i]};"\
+"     /Section length .*#relocs.*(pick any)/{hide[last_section]=1};"\
+"     \$ 0!~/External *\|/{next};"\
+"     / 0+ UNDEF /{next}; / UNDEF \([^|]\)*()/{next};"\
+"     {if(hide[section]) next};"\
+"     {f=0}; \$ 0~/\(\).*\|/{f=1}; {printf f ? \"T \" : \"D \"};"\
+"     {split(\$ 0, a, /\||\r/); split(a[2], s)};"\
+"     s[1]~/^[@?]/{print s[1], s[1]; next};"\
+"     s[1]~prfx {split(s[1],t,\"@\"); print t[1], substr(t[1],length(prfx))}"\
+"     ' prfx=^$ac_symprfx]"
+  else
+    lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[[	 ]]\($symcode$symcode*\)[[	 ]][[	 ]]*$ac_symprfx$sympat$opt_cr$/$symxfrm/p'"
+  fi
+  lt_cv_sys_global_symbol_pipe="$lt_cv_sys_global_symbol_pipe | sed '/ __gnu_lto/d'"
+
+  # Check to see that the pipe works correctly.
+  pipe_works=no
+
+  rm -f conftest*
+  cat > conftest.$ac_ext <<_LT_EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(void);
+void nm_test_func(void){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+_LT_EOF
+
+  if AC_TRY_EVAL(ac_compile); then
+    # Now try to grab the symbols.
+    nlist=conftest.nm
+    if AC_TRY_EVAL(NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) && test -s "$nlist"; then
+      # Try sorting and uniquifying the output.
+      if sort "$nlist" | uniq > "$nlist"T; then
+	mv -f "$nlist"T "$nlist"
+      else
+	rm -f "$nlist"T
+      fi
+
+      # Make sure that we snagged all the symbols we need.
+      if $GREP ' nm_test_var$' "$nlist" >/dev/null; then
+	if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
+	  cat <<_LT_EOF > conftest.$ac_ext
+/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
+/* DATA imports from DLLs on WIN32 con't be const, because runtime
+   relocations are performed -- see ld's documentation on pseudo-relocs.  */
+# define LT@&t@_DLSYM_CONST
+#elif defined(__osf__)
+/* This system does not cope well with relocations in const data.  */
+# define LT@&t@_DLSYM_CONST
+#else
+# define LT@&t@_DLSYM_CONST const
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+_LT_EOF
+	  # Now generate the symbol file.
+	  eval "$lt_cv_sys_global_symbol_to_cdecl"' < "$nlist" | $GREP -v main >> conftest.$ac_ext'
+
+	  cat <<_LT_EOF >> conftest.$ac_ext
+
+/* The mapping between symbol names and symbols.  */
+LT@&t@_DLSYM_CONST struct {
+  const char *name;
+  void       *address;
+}
+lt__PROGRAM__LTX_preloaded_symbols[[]] =
+{
+  { "@PROGRAM@", (void *) 0 },
+_LT_EOF
+	  $SED "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (void *) \&\2},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext
+	  cat <<\_LT_EOF >> conftest.$ac_ext
+  {0, (void *) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+  return lt__PROGRAM__LTX_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+_LT_EOF
+	  # Now try linking the two files.
+	  mv conftest.$ac_objext conftstm.$ac_objext
+	  lt_globsym_save_LIBS=$LIBS
+	  lt_globsym_save_CFLAGS=$CFLAGS
+	  LIBS="conftstm.$ac_objext"
+	  CFLAGS="$CFLAGS$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)"
+	  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext}; then
+	    pipe_works=yes
+	  fi
+	  LIBS=$lt_globsym_save_LIBS
+	  CFLAGS=$lt_globsym_save_CFLAGS
+	else
+	  echo "cannot find nm_test_func in $nlist" >&AS_MESSAGE_LOG_FD
+	fi
+      else
+	echo "cannot find nm_test_var in $nlist" >&AS_MESSAGE_LOG_FD
+      fi
+    else
+      echo "cannot run $lt_cv_sys_global_symbol_pipe" >&AS_MESSAGE_LOG_FD
+    fi
+  else
+    echo "$progname: failed program was:" >&AS_MESSAGE_LOG_FD
+    cat conftest.$ac_ext >&5
+  fi
+  rm -rf conftest* conftst*
+
+  # Do not use the global_symbol_pipe unless it works.
+  if test "$pipe_works" = yes; then
+    break
+  else
+    lt_cv_sys_global_symbol_pipe=
+  fi
+done
+])
+if test -z "$lt_cv_sys_global_symbol_pipe"; then
+  lt_cv_sys_global_symbol_to_cdecl=
+fi
+if test -z "$lt_cv_sys_global_symbol_pipe$lt_cv_sys_global_symbol_to_cdecl"; then
+  AC_MSG_RESULT(failed)
+else
+  AC_MSG_RESULT(ok)
+fi
+
+# Response file support.
+if test "$lt_cv_nm_interface" = "MS dumpbin"; then
+  nm_file_list_spec='@'
+elif $NM --help 2>/dev/null | grep '[[@]]FILE' >/dev/null; then
+  nm_file_list_spec='@'
+fi
+
+_LT_DECL([global_symbol_pipe], [lt_cv_sys_global_symbol_pipe], [1],
+    [Take the output of nm and produce a listing of raw symbols and C names])
+_LT_DECL([global_symbol_to_cdecl], [lt_cv_sys_global_symbol_to_cdecl], [1],
+    [Transform the output of nm in a proper C declaration])
+_LT_DECL([global_symbol_to_c_name_address],
+    [lt_cv_sys_global_symbol_to_c_name_address], [1],
+    [Transform the output of nm in a C name address pair])
+_LT_DECL([global_symbol_to_c_name_address_lib_prefix],
+    [lt_cv_sys_global_symbol_to_c_name_address_lib_prefix], [1],
+    [Transform the output of nm in a C name address pair when lib prefix is needed])
+_LT_DECL([], [nm_file_list_spec], [1],
+    [Specify filename containing input files for $NM])
+]) # _LT_CMD_GLOBAL_SYMBOLS
+
+
+# _LT_COMPILER_PIC([TAGNAME])
+# ---------------------------
+m4_defun([_LT_COMPILER_PIC],
+[m4_require([_LT_TAG_COMPILER])dnl
+_LT_TAGVAR(lt_prog_compiler_wl, $1)=
+_LT_TAGVAR(lt_prog_compiler_pic, $1)=
+_LT_TAGVAR(lt_prog_compiler_static, $1)=
+
+m4_if([$1], [CXX], [
+  # C++ specific cases for pic, static, wl, etc.
+  if test "$GXX" = yes; then
+    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+
+    case $host_os in
+    aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+        ;;
+      m68k)
+            # FIXME: we need at least 68020 code to build shared libraries, but
+            # adding the `-m68020' flag to GCC prevents building anything better,
+            # like `-m68040'.
+            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
+        ;;
+      esac
+      ;;
+
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    mingw* | cygwin* | os2* | pw32* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      # Although the cygwin gcc ignores -fPIC, still need this for old-style
+      # (--disable-auto-import) libraries
+      m4_if([$1], [GCJ], [],
+	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common'
+      ;;
+    *djgpp*)
+      # DJGPP does not support shared libraries at all
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)=
+      ;;
+    haiku*)
+      # PIC is the default for Haiku.
+      # The "-static" flag exists, but is broken.
+      _LT_TAGVAR(lt_prog_compiler_static, $1)=
+      ;;
+    interix[[3-9]]*)
+      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
+      # Instead, we relocate shared libraries at runtime.
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic
+      fi
+      ;;
+    hpux*)
+      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
+      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
+      # sets the default TLS model and affects inlining.
+      case $host_cpu in
+      hppa*64*)
+	;;
+      *)
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	;;
+      esac
+      ;;
+    *qnx* | *nto*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
+      ;;
+    *)
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+      ;;
+    esac
+  else
+    case $host_os in
+      aix[[4-9]]*)
+	# All AIX code is PIC.
+	if test "$host_cpu" = ia64; then
+	  # AIX 5 now supports IA64 processor
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	else
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp'
+	fi
+	;;
+      chorus*)
+	case $cc_basename in
+	cxch68*)
+	  # Green Hills C++ Compiler
+	  # _LT_TAGVAR(lt_prog_compiler_static, $1)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a"
+	  ;;
+	esac
+	;;
+      mingw* | cygwin* | os2* | pw32* | cegcc*)
+	# This hack is so that the source file can tell whether it is being
+	# built for inclusion in a dll (and should export symbols for example).
+	m4_if([$1], [GCJ], [],
+	  [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+	;;
+      dgux*)
+	case $cc_basename in
+	  ec++*)
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	    ;;
+	  ghcx*)
+	    # Green Hills C++ Compiler
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      freebsd* | dragonfly*)
+	# FreeBSD uses GNU C++
+	;;
+      hpux9* | hpux10* | hpux11*)
+	case $cc_basename in
+	  CC*)
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
+	    if test "$host_cpu" != ia64; then
+	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
+	    fi
+	    ;;
+	  aCC*)
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
+	    case $host_cpu in
+	    hppa*64*|ia64*)
+	      # +Z the default
+	      ;;
+	    *)
+	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
+	      ;;
+	    esac
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      interix*)
+	# This is c89, which is MS Visual C++ (no shared libs)
+	# Anyone wants to do a port?
+	;;
+      irix5* | irix6* | nonstopux*)
+	case $cc_basename in
+	  CC*)
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+	    # CC pic flag -KPIC is the default.
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+	case $cc_basename in
+	  KCC*)
+	    # KAI C++ Compiler
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	    ;;
+	  ecpc* )
+	    # old Intel C++ for x86_64 which still supported -KPIC.
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+	    ;;
+	  icpc* )
+	    # Intel C++, used to be incompatible with GCC.
+	    # ICC 10 doesn't accept -KPIC any more.
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+	    ;;
+	  pgCC* | pgcpp*)
+	    # Portland Group C++ compiler
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	    ;;
+	  cxx*)
+	    # Compaq C++
+	    # Make sure the PIC flag is empty.  It appears that all Alpha
+	    # Linux and Compaq Tru64 Unix objects are PIC.
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+	    ;;
+	  xlc* | xlC* | bgxl[[cC]]* | mpixl[[cC]]*)
+	    # IBM XL 8.0, 9.0 on PPC and BlueGene
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink'
+	    ;;
+	  *)
+	    case `$CC -V 2>&1 | sed 5q` in
+	    *Sun\ C*)
+	      # Sun C++ 5.9
+	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
+	      ;;
+	    esac
+	    ;;
+	esac
+	;;
+      lynxos*)
+	;;
+      m88k*)
+	;;
+      mvs*)
+	case $cc_basename in
+	  cxx*)
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-W c,exportall'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      netbsd* | netbsdelf*-gnu)
+	;;
+      *qnx* | *nto*)
+        # QNX uses GNU C++, but need to define -shared option too, otherwise
+        # it will coredump.
+        _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
+        ;;
+      osf3* | osf4* | osf5*)
+	case $cc_basename in
+	  KCC*)
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,'
+	    ;;
+	  RCC*)
+	    # Rational C++ 2.4.1
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
+	    ;;
+	  cxx*)
+	    # Digital/Compaq C++
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    # Make sure the PIC flag is empty.  It appears that all Alpha
+	    # Linux and Compaq Tru64 Unix objects are PIC.
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      psos*)
+	;;
+      solaris*)
+	case $cc_basename in
+	  CC* | sunCC*)
+	    # Sun C++ 4.2, 5.x and Centerline C++
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
+	    ;;
+	  gcx*)
+	    # Green Hills C++ Compiler
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      sunos4*)
+	case $cc_basename in
+	  CC*)
+	    # Sun C++ 4.x
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	    ;;
+	  lcc*)
+	    # Lucid
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
+	case $cc_basename in
+	  CC*)
+	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	    ;;
+	esac
+	;;
+      tandem*)
+	case $cc_basename in
+	  NCC*)
+	    # NonStop-UX NCC 3.20
+	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	    ;;
+	  *)
+	    ;;
+	esac
+	;;
+      vxworks*)
+	;;
+      *)
+	_LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
+	;;
+    esac
+  fi
+],
+[
+  if test "$GCC" = yes; then
+    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+
+    case $host_os in
+      aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+        ;;
+      m68k)
+            # FIXME: we need at least 68020 code to build shared libraries, but
+            # adding the `-m68020' flag to GCC prevents building anything better,
+            # like `-m68040'.
+            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
+        ;;
+      esac
+      ;;
+
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      # Although the cygwin gcc ignores -fPIC, still need this for old-style
+      # (--disable-auto-import) libraries
+      m4_if([$1], [GCJ], [],
+	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      ;;
+
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common'
+      ;;
+
+    haiku*)
+      # PIC is the default for Haiku.
+      # The "-static" flag exists, but is broken.
+      _LT_TAGVAR(lt_prog_compiler_static, $1)=
+      ;;
+
+    hpux*)
+      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
+      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
+      # sets the default TLS model and affects inlining.
+      case $host_cpu in
+      hppa*64*)
+	# +Z the default
+	;;
+      *)
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	;;
+      esac
+      ;;
+
+    interix[[3-9]]*)
+      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
+      # Instead, we relocate shared libraries at runtime.
+      ;;
+
+    msdosdjgpp*)
+      # Just because we use GCC doesn't mean we suddenly get shared libraries
+      # on systems that don't support them.
+      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
+      enable_shared=no
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic
+      fi
+      ;;
+
+    *)
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+      ;;
+    esac
+
+    case $cc_basename in
+    nvcc*) # Cuda Compiler Driver 2.2
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Xlinker '
+      if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then
+        _LT_TAGVAR(lt_prog_compiler_pic, $1)="-Xcompiler $_LT_TAGVAR(lt_prog_compiler_pic, $1)"
+      fi
+      ;;
+    esac
+  else
+    # PORTME Check for flag to pass linker flags through the system compiler.
+    case $host_os in
+    aix*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      else
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+
+    mingw* | cygwin* | pw32* | os2* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      m4_if([$1], [GCJ], [],
+	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      # PIC is the default for IA64 HP-UX and 64-bit HP-UX, but
+      # not for PA HP-UX.
+      case $host_cpu in
+      hppa*64*|ia64*)
+	# +Z the default
+	;;
+      *)
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
+	;;
+      esac
+      # Is there a better lt_prog_compiler_static that works with the bundled CC?
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      # PIC (with -KPIC) is the default.
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+      ;;
+
+    linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+      case $cc_basename in
+      # old Intel for x86_64 which still supported -KPIC.
+      ecc*)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+        ;;
+      # icc used to be incompatible with GCC.
+      # ICC 10 doesn't accept -KPIC any more.
+      icc* | ifort*)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+        ;;
+      # Lahey Fortran 8.1.
+      lf95*)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='--shared'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='--static'
+	;;
+      nagfor*)
+	# NAG Fortran compiler
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	;;
+      pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)
+        # Portland Group compilers (*not* the Pentium gcc compiler,
+	# which looks to be a dead project)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+        ;;
+      ccc*)
+        _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+        # All Alpha code is PIC.
+        _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+        ;;
+      xl* | bgxl* | bgf* | mpixl*)
+	# IBM XL C 8.0/Fortran 10.1, 11.1 on PPC and BlueGene
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink'
+	;;
+      *)
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ Ceres\ Fortran* | *Sun*Fortran*\ [[1-7]].* | *Sun*Fortran*\ 8.[[0-3]]*)
+	  # Sun Fortran 8.3 passes all unrecognized flags to the linker
+	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	  _LT_TAGVAR(lt_prog_compiler_wl, $1)=''
+	  ;;
+	*Sun\ F* | *Sun*Fortran*)
+	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
+	  ;;
+	*Sun\ C*)
+	  # Sun C 5.9
+	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	  ;;
+        *Intel*\ [[CF]]*Compiler*)
+	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+	  ;;
+	*Portland\ Group*)
+	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
+	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+	  ;;
+	esac
+	;;
+      esac
+      ;;
+
+    newsos6)
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      ;;
+
+    *nto* | *qnx*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      # All OSF/1 code is PIC.
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+      ;;
+
+    rdos*)
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
+      ;;
+
+    solaris*)
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      case $cc_basename in
+      f77* | f90* | f95* | sunf77* | sunf90* | sunf95*)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ';;
+      *)
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,';;
+      esac
+      ;;
+
+    sunos4*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-Kconform_pic'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      fi
+      ;;
+
+    sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      ;;
+
+    unicos*)
+      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
+      ;;
+
+    uts4*)
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+      ;;
+
+    *)
+      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
+      ;;
+    esac
+  fi
+])
+case $host_os in
+  # For platforms which do not support PIC, -DPIC is meaningless:
+  *djgpp*)
+    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
+    ;;
+  *)
+    _LT_TAGVAR(lt_prog_compiler_pic, $1)="$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t@m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])"
+    ;;
+esac
+
+AC_CACHE_CHECK([for $compiler option to produce PIC],
+  [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)],
+  [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_prog_compiler_pic, $1)])
+_LT_TAGVAR(lt_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)
+
+#
+# Check to make sure the PIC flag actually works.
+#
+if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then
+  _LT_COMPILER_OPTION([if $compiler PIC flag $_LT_TAGVAR(lt_prog_compiler_pic, $1) works],
+    [_LT_TAGVAR(lt_cv_prog_compiler_pic_works, $1)],
+    [$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t@m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])], [],
+    [case $_LT_TAGVAR(lt_prog_compiler_pic, $1) in
+     "" | " "*) ;;
+     *) _LT_TAGVAR(lt_prog_compiler_pic, $1)=" $_LT_TAGVAR(lt_prog_compiler_pic, $1)" ;;
+     esac],
+    [_LT_TAGVAR(lt_prog_compiler_pic, $1)=
+     _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no])
+fi
+_LT_TAGDECL([pic_flag], [lt_prog_compiler_pic], [1],
+	[Additional compiler flags for building library objects])
+
+_LT_TAGDECL([wl], [lt_prog_compiler_wl], [1],
+	[How to pass a linker flag through the compiler])
+#
+# Check to make sure the static flag actually works.
+#
+wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1) eval lt_tmp_static_flag=\"$_LT_TAGVAR(lt_prog_compiler_static, $1)\"
+_LT_LINKER_OPTION([if $compiler static flag $lt_tmp_static_flag works],
+  _LT_TAGVAR(lt_cv_prog_compiler_static_works, $1),
+  $lt_tmp_static_flag,
+  [],
+  [_LT_TAGVAR(lt_prog_compiler_static, $1)=])
+_LT_TAGDECL([link_static_flag], [lt_prog_compiler_static], [1],
+	[Compiler flag to prevent dynamic linking])
+])# _LT_COMPILER_PIC
+
+
+# _LT_LINKER_SHLIBS([TAGNAME])
+# ----------------------------
+# See if the linker supports building shared libraries.
+m4_defun([_LT_LINKER_SHLIBS],
+[AC_REQUIRE([LT_PATH_LD])dnl
+AC_REQUIRE([LT_PATH_NM])dnl
+m4_require([_LT_PATH_MANIFEST_TOOL])dnl
+m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_DECL_EGREP])dnl
+m4_require([_LT_DECL_SED])dnl
+m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl
+m4_require([_LT_TAG_COMPILER])dnl
+AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries])
+m4_if([$1], [CXX], [
+  _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+  _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*']
+  case $host_os in
+  aix[[4-9]]*)
+    # If we're using GNU nm, then we don't want the "-C" option.
+    # -C means demangle to AIX nm, but means don't demangle with GNU nm
+    # Also, AIX nm treats weak defined symbols like other global defined
+    # symbols, whereas GNU nm marks them as "W".
+    if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
+      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+    else
+      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+    fi
+    ;;
+  pw32*)
+    _LT_TAGVAR(export_symbols_cmds, $1)="$ltdll_cmds"
+    ;;
+  cygwin* | mingw* | cegcc*)
+    case $cc_basename in
+    cl*)
+      _LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
+      ;;
+    *)
+      _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols'
+      _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname']
+      ;;
+    esac
+    ;;
+  linux* | k*bsd*-gnu | gnu*)
+    _LT_TAGVAR(link_all_deplibs, $1)=no
+    ;;
+  *)
+    _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+    ;;
+  esac
+], [
+  runpath_var=
+  _LT_TAGVAR(allow_undefined_flag, $1)=
+  _LT_TAGVAR(always_export_symbols, $1)=no
+  _LT_TAGVAR(archive_cmds, $1)=
+  _LT_TAGVAR(archive_expsym_cmds, $1)=
+  _LT_TAGVAR(compiler_needs_object, $1)=no
+  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
+  _LT_TAGVAR(export_dynamic_flag_spec, $1)=
+  _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+  _LT_TAGVAR(hardcode_automatic, $1)=no
+  _LT_TAGVAR(hardcode_direct, $1)=no
+  _LT_TAGVAR(hardcode_direct_absolute, $1)=no
+  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
+  _LT_TAGVAR(hardcode_libdir_separator, $1)=
+  _LT_TAGVAR(hardcode_minus_L, $1)=no
+  _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
+  _LT_TAGVAR(inherit_rpath, $1)=no
+  _LT_TAGVAR(link_all_deplibs, $1)=unknown
+  _LT_TAGVAR(module_cmds, $1)=
+  _LT_TAGVAR(module_expsym_cmds, $1)=
+  _LT_TAGVAR(old_archive_from_new_cmds, $1)=
+  _LT_TAGVAR(old_archive_from_expsyms_cmds, $1)=
+  _LT_TAGVAR(thread_safe_flag_spec, $1)=
+  _LT_TAGVAR(whole_archive_flag_spec, $1)=
+  # include_expsyms should be a list of space-separated symbols to be *always*
+  # included in the symbol list
+  _LT_TAGVAR(include_expsyms, $1)=
+  # exclude_expsyms can be an extended regexp of symbols to exclude
+  # it will be wrapped by ` (' and `)$', so one must not match beginning or
+  # end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+  # as well as any symbol that contains `d'.
+  _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*']
+  # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+  # platforms (ab)use it in PIC code, but their linkers get confused if
+  # the symbol is explicitly referenced.  Since portable code cannot
+  # rely on this symbol name, it's probably fine to never include it in
+  # preloaded symbol tables.
+  # Exclude shared library initialization/finalization symbols.
+dnl Note also adjust exclude_expsyms for C++ above.
+  extract_expsyms_cmds=
+
+  case $host_os in
+  cygwin* | mingw* | pw32* | cegcc*)
+    # FIXME: the MSVC++ port hasn't been tested in a loooong time
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    if test "$GCC" != yes; then
+      with_gnu_ld=no
+    fi
+    ;;
+  interix*)
+    # we just hope/assume this is gcc and not c89 (= MSVC++)
+    with_gnu_ld=yes
+    ;;
+  openbsd*)
+    with_gnu_ld=no
+    ;;
+  linux* | k*bsd*-gnu | gnu*)
+    _LT_TAGVAR(link_all_deplibs, $1)=no
+    ;;
+  esac
+
+  _LT_TAGVAR(ld_shlibs, $1)=yes
+
+  # On some targets, GNU ld is compatible enough with the native linker
+  # that we're better off using the native interface for both.
+  lt_use_gnu_ld_interface=no
+  if test "$with_gnu_ld" = yes; then
+    case $host_os in
+      aix*)
+	# The AIX port of GNU ld has always aspired to compatibility
+	# with the native linker.  However, as the warning in the GNU ld
+	# block says, versions before 2.19.5* couldn't really create working
+	# shared libraries, regardless of the interface used.
+	case `$LD -v 2>&1` in
+	  *\ \(GNU\ Binutils\)\ 2.19.5*) ;;
+	  *\ \(GNU\ Binutils\)\ 2.[[2-9]]*) ;;
+	  *\ \(GNU\ Binutils\)\ [[3-9]]*) ;;
+	  *)
+	    lt_use_gnu_ld_interface=yes
+	    ;;
+	esac
+	;;
+      *)
+	lt_use_gnu_ld_interface=yes
+	;;
+    esac
+  fi
+
+  if test "$lt_use_gnu_ld_interface" = yes; then
+    # If archive_cmds runs LD, not CC, wlarc should be empty
+    wlarc='${wl}'
+
+    # Set some defaults for GNU ld with shared library support. These
+    # are reset later if shared libraries are not supported. Putting them
+    # here allows them to be overridden if necessary.
+    runpath_var=LD_RUN_PATH
+    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+    # ancient GNU ld didn't support --whole-archive et. al.
+    if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then
+      _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+    else
+      _LT_TAGVAR(whole_archive_flag_spec, $1)=
+    fi
+    supports_anon_versioning=no
+    case `$LD -v 2>&1` in
+      *GNU\ gold*) supports_anon_versioning=yes ;;
+      *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.10.*) ;; # catch versions < 2.11
+      *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ...
+      *\ 2.11.92.0.12\ *) supports_anon_versioning=yes ;; # Mandrake 8.2 ...
+      *\ 2.11.*) ;; # other 2.11 versions
+      *) supports_anon_versioning=yes ;;
+    esac
+
+    # See if GNU ld supports shared libraries.
+    case $host_os in
+    aix[[3-9]]*)
+      # On AIX/PPC, the GNU linker is very broken
+      if test "$host_cpu" != ia64; then
+	_LT_TAGVAR(ld_shlibs, $1)=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.19, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to install binutils
+*** 2.20 or above, or modify your PATH so that a non-GNU linker is found.
+*** You will then need to restart the configuration process.
+
+_LT_EOF
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            _LT_TAGVAR(archive_expsym_cmds, $1)=''
+        ;;
+      m68k)
+            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+            _LT_TAGVAR(hardcode_minus_L, $1)=yes
+        ;;
+      esac
+      ;;
+
+    beos*)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	# Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+	# support --undefined.  This deserves some investigation.  FIXME
+	_LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      else
+	_LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
+      # as there is no search path for DLLs.
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
+      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+      _LT_TAGVAR(always_export_symbols, $1)=no
+      _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+      _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols'
+      _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname']
+
+      if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
+        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	# If the export-symbols file already is a .def file (1st line
+	# is EXPORTS), use it as is; otherwise, prepend...
+	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	  cp $export_symbols $output_objdir/$soname.def;
+	else
+	  echo EXPORTS > $output_objdir/$soname.def;
+	  cat $export_symbols >> $output_objdir/$soname.def;
+	fi~
+	$CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+      else
+	_LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+
+    haiku*)
+      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      _LT_TAGVAR(link_all_deplibs, $1)=yes
+      ;;
+
+    interix[[3-9]]*)
+      _LT_TAGVAR(hardcode_direct, $1)=no
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+      # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
+      # Instead, shared libraries are loaded at an image base (0x10000000 by
+      # default) and relocated if they conflict, which is a slow very memory
+      # consuming and fragmenting process.  To avoid this, we pick a random,
+      # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
+      # time.  Moving up from 0x10000000 also allows more sbrk(2) space.
+      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      ;;
+
+    gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu)
+      tmp_diet=no
+      if test "$host_os" = linux-dietlibc; then
+	case $cc_basename in
+	  diet\ *) tmp_diet=yes;;	# linux-dietlibc with static linking (!diet-dyn)
+	esac
+      fi
+      if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \
+	 && test "$tmp_diet" = no
+      then
+	tmp_addflag=' $pic_flag'
+	tmp_sharedflag='-shared'
+	case $cc_basename,$host_cpu in
+        pgcc*)				# Portland Group C compiler
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag'
+	  ;;
+	pgf77* | pgf90* | pgf95* | pgfortran*)
+					# Portland Group f77 and f90 compilers
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  tmp_addflag=' $pic_flag -Mnomain' ;;
+	ecc*,ia64* | icc*,ia64*)	# Intel C compiler on ia64
+	  tmp_addflag=' -i_dynamic' ;;
+	efc*,ia64* | ifort*,ia64*)	# Intel Fortran compiler on ia64
+	  tmp_addflag=' -i_dynamic -nofor_main' ;;
+	ifc* | ifort*)			# Intel Fortran compiler
+	  tmp_addflag=' -nofor_main' ;;
+	lf95*)				# Lahey Fortran 8.1
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)=
+	  tmp_sharedflag='--shared' ;;
+	xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below)
+	  tmp_sharedflag='-qmkshrobj'
+	  tmp_addflag= ;;
+	nvcc*)	# Cuda Compiler Driver 2.2
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(compiler_needs_object, $1)=yes
+	  ;;
+	esac
+	case `$CC -V 2>&1 | sed 5q` in
+	*Sun\ C*)			# Sun C 5.9
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(compiler_needs_object, $1)=yes
+	  tmp_sharedflag='-G' ;;
+	*Sun\ F*)			# Sun Fortran 8.3
+	  tmp_sharedflag='-G' ;;
+	esac
+	_LT_TAGVAR(archive_cmds, $1)='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+
+        if test "x$supports_anon_versioning" = xyes; then
+          _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
+	    cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	    echo "local: *; };" >> $output_objdir/$libname.ver~
+	    $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+        fi
+
+	case $cc_basename in
+	xlf* | bgf* | bgxlf* | mpixlf*)
+	  # IBM XL Fortran 10.1 on PPC cannot create shared libs itself
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='--whole-archive$convenience --no-whole-archive'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib'
+	  if test "x$supports_anon_versioning" = xyes; then
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
+	      cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+	      echo "local: *; };" >> $output_objdir/$libname.ver~
+	      $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
+	  fi
+	  ;;
+	esac
+      else
+        _LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+	wlarc=
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      fi
+      ;;
+
+    solaris*)
+      if $LD -v 2>&1 | $GREP 'BFD 2\.8' > /dev/null; then
+	_LT_TAGVAR(ld_shlibs, $1)=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+      elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	_LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*)
+      case `$LD -v 2>&1` in
+        *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.1[[0-5]].*)
+	_LT_TAGVAR(ld_shlibs, $1)=no
+	cat <<_LT_EOF 1>&2
+
+*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 can not
+*** reliably create shared libraries on SCO systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.16.91.0.3 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+_LT_EOF
+	;;
+	*)
+	  # For security reasons, it is highly recommended that you always
+	  # use absolute paths for naming shared libraries, and exclude the
+	  # DT_RUNPATH tag from executables and libraries.  But doing so
+	  # requires that you compile everything twice, which is a pain.
+	  if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	  else
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	  fi
+	;;
+      esac
+      ;;
+
+    sunos4*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      wlarc=
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    *)
+      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      else
+	_LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+    esac
+
+    if test "$_LT_TAGVAR(ld_shlibs, $1)" = no; then
+      runpath_var=
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)=
+      _LT_TAGVAR(whole_archive_flag_spec, $1)=
+    fi
+  else
+    # PORTME fill in a description of your system's linker (not GNU ld)
+    case $host_os in
+    aix3*)
+      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+      _LT_TAGVAR(always_export_symbols, $1)=yes
+      _LT_TAGVAR(archive_expsym_cmds, $1)='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+      # Note: this linker hardcodes the directories in LIBPATH if there
+      # are no directories specified by -L.
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      if test "$GCC" = yes && test -z "$lt_prog_compiler_static"; then
+	# Neither direct hardcoding nor static linking is supported with a
+	# broken collect2.
+	_LT_TAGVAR(hardcode_direct, $1)=unsupported
+      fi
+      ;;
+
+    aix[[4-9]]*)
+      if test "$host_cpu" = ia64; then
+	# On IA64, the linker does run time linking by default, so we don't
+	# have to do anything special.
+	aix_use_runtimelinking=no
+	exp_sym_flag='-Bexport'
+	no_entry_flag=""
+      else
+	# If we're using GNU nm, then we don't want the "-C" option.
+	# -C means demangle to AIX nm, but means don't demangle with GNU nm
+	# Also, AIX nm treats weak defined symbols like other global
+	# defined symbols, whereas GNU nm marks them as "W".
+	if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
+	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	else
+	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	fi
+	aix_use_runtimelinking=no
+
+	# Test if we are trying to use run time linking or normal
+	# AIX style linking. If -brtl is somewhere in LDFLAGS, we
+	# need to do runtime linking.
+	case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
+	  for ld_flag in $LDFLAGS; do
+	  if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then
+	    aix_use_runtimelinking=yes
+	    break
+	  fi
+	  done
+	  ;;
+	esac
+
+	exp_sym_flag='-bexport'
+	no_entry_flag='-bnoentry'
+      fi
+
+      # When large executables or shared objects are built, AIX ld can
+      # have problems creating the table of contents.  If linking a library
+      # or program results in "error TOC overflow" add -mminimal-toc to
+      # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+      # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+      _LT_TAGVAR(archive_cmds, $1)=''
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
+      _LT_TAGVAR(link_all_deplibs, $1)=yes
+      _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
+
+      if test "$GCC" = yes; then
+	case $host_os in aix4.[[012]]|aix4.[[012]].*)
+	# We only want to do this on AIX 4.2 and lower, the check
+	# below for broken collect2 doesn't work under 4.3+
+	  collect2name=`${CC} -print-prog-name=collect2`
+	  if test -f "$collect2name" &&
+	   strings "$collect2name" | $GREP resolve_lib_name >/dev/null
+	  then
+	  # We have reworked collect2
+	  :
+	  else
+	  # We have old collect2
+	  _LT_TAGVAR(hardcode_direct, $1)=unsupported
+	  # It fails to find uninstalled libraries when the uninstalled
+	  # path is not listed in the libpath.  Setting hardcode_minus_L
+	  # to unsupported forces relinking
+	  _LT_TAGVAR(hardcode_minus_L, $1)=yes
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+	  _LT_TAGVAR(hardcode_libdir_separator, $1)=
+	  fi
+	  ;;
+	esac
+	shared_flag='-shared'
+	if test "$aix_use_runtimelinking" = yes; then
+	  shared_flag="$shared_flag "'${wl}-G'
+	fi
+	_LT_TAGVAR(link_all_deplibs, $1)=no
+      else
+	# not using gcc
+	if test "$host_cpu" = ia64; then
+	# VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
+	# chokes on -Wl,-G. The following line is correct:
+	  shared_flag='-G'
+	else
+	  if test "$aix_use_runtimelinking" = yes; then
+	    shared_flag='${wl}-G'
+	  else
+	    shared_flag='${wl}-bM:SRE'
+	  fi
+	fi
+      fi
+
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
+      # It seems that -bexpall does not export symbols beginning with
+      # underscore (_), so it is better to generate a list of symbols to export.
+      _LT_TAGVAR(always_export_symbols, $1)=yes
+      if test "$aix_use_runtimelinking" = yes; then
+	# Warning - without using the other runtime loading flags (-brtl),
+	# -berok will link without error, but may produce a broken library.
+	_LT_TAGVAR(allow_undefined_flag, $1)='-berok'
+        # Determine the default libpath from the value encoded in an
+        # empty executable.
+        _LT_SYS_MODULE_PATH_AIX([$1])
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+      else
+	if test "$host_cpu" = ia64; then
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
+	  _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
+	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+	else
+	 # Determine the default libpath from the value encoded in an
+	 # empty executable.
+	 _LT_SYS_MODULE_PATH_AIX([$1])
+	 _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+	  # Warning - without using the other run time loading flags,
+	  # -berok will link without error, but may produce a broken library.
+	  _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
+	  _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
+	  if test "$with_gnu_ld" = yes; then
+	    # We only use this code for GNU lds that support --whole-archive.
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	  else
+	    # Exported symbols can be pulled into shared objects from archives
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
+	  fi
+	  _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
+	  # This is similar to how AIX traditionally builds its shared libraries.
+	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+	fi
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            _LT_TAGVAR(archive_expsym_cmds, $1)=''
+        ;;
+      m68k)
+            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+            _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+            _LT_TAGVAR(hardcode_minus_L, $1)=yes
+        ;;
+      esac
+      ;;
+
+    bsdi[[45]]*)
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)=-rdynamic
+      ;;
+
+    cygwin* | mingw* | pw32* | cegcc*)
+      # When not using gcc, we currently assume that we are using
+      # Microsoft Visual C++.
+      # hardcode_libdir_flag_spec is actually meaningless, as there is
+      # no search path for DLLs.
+      case $cc_basename in
+      cl*)
+	# Native MSVC
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
+	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	_LT_TAGVAR(always_export_symbols, $1)=yes
+	_LT_TAGVAR(file_list_spec, $1)='@'
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	_LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
+	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	    sed -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
+	  else
+	    sed -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
+	  fi~
+	  $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+	  linknames='
+	# The linker will not automatically build a static lib if we build a DLL.
+	# _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
+	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+	_LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
+	_LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1,DATA/'\'' | $SED -e '\''/^[[AITW]][[ ]]/s/.*[[ ]]//'\'' | sort | uniq > $export_symbols'
+	# Don't use ranlib
+	_LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
+	_LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
+	  lt_tool_outputfile="@TOOL_OUTPUT@"~
+	  case $lt_outputfile in
+	    *.exe|*.EXE) ;;
+	    *)
+	      lt_outputfile="$lt_outputfile.exe"
+	      lt_tool_outputfile="$lt_tool_outputfile.exe"
+	      ;;
+	  esac~
+	  if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
+	    $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+	    $RM "$lt_outputfile.manifest";
+	  fi'
+	;;
+      *)
+	# Assume MSVC wrapper
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
+	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	# Tell ltmain to make .lib files, not .a files.
+	libext=lib
+	# Tell ltmain to make .dll files, not .so files.
+	shrext_cmds=".dll"
+	# FIXME: Setting linknames here is a bad hack.
+	_LT_TAGVAR(archive_cmds, $1)='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames='
+	# The linker will automatically build a .lib file if we build a DLL.
+	_LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
+	# FIXME: Should let the user specify the lib program.
+	_LT_TAGVAR(old_archive_cmds, $1)='lib -OUT:$oldlib$oldobjs$old_deplibs'
+	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+	;;
+      esac
+      ;;
+
+    darwin* | rhapsody*)
+      _LT_DARWIN_LINKER_FEATURES($1)
+      ;;
+
+    dgux*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+    # support.  Future versions do this automatically, but an explicit c++rt0.o
+    # does not break anything, and helps significantly (at the cost of a little
+    # extra space).
+    freebsd2.2*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+    freebsd2.*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+    freebsd* | dragonfly*)
+      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    hpux9*)
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      fi
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+
+      # hardcode_minus_L: Not really in the search PATH,
+      # but as the default location of the library.
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+      ;;
+
+    hpux10*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      if test "$with_gnu_ld" = no; then
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
+	_LT_TAGVAR(hardcode_direct, $1)=yes
+	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	# hardcode_minus_L: Not really in the search PATH,
+	# but as the default location of the library.
+	_LT_TAGVAR(hardcode_minus_L, $1)=yes
+      fi
+      ;;
+
+    hpux11*)
+      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+	case $host_cpu in
+	hppa*64*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	esac
+      else
+	case $host_cpu in
+	hppa*64*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	ia64*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	m4_if($1, [], [
+	  # Older versions of the 11.00 compiler do not understand -b yet
+	  # (HP92453-01 A.11.01.20 doesn't, HP92453-01 B.11.X.35175-35176.GP does)
+	  _LT_LINKER_OPTION([if $CC understands -b],
+	    _LT_TAGVAR(lt_cv_prog_compiler__b, $1), [-b],
+	    [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'],
+	    [_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'])],
+	  [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'])
+	  ;;
+	esac
+      fi
+      if test "$with_gnu_ld" = no; then
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+	case $host_cpu in
+	hppa*64*|ia64*)
+	  _LT_TAGVAR(hardcode_direct, $1)=no
+	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	  ;;
+	*)
+	  _LT_TAGVAR(hardcode_direct, $1)=yes
+	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+
+	  # hardcode_minus_L: Not really in the search PATH,
+	  # but as the default location of the library.
+	  _LT_TAGVAR(hardcode_minus_L, $1)=yes
+	  ;;
+	esac
+      fi
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	# Try to use the -exported_symbol ld option, if it does not
+	# work, assume that -exports_file does not work either and
+	# implicitly export all symbols.
+	# This should be the same for all languages, so no per-tag cache variable.
+	AC_CACHE_CHECK([whether the $host_os linker accepts -exported_symbol],
+	  [lt_cv_irix_exported_symbol],
+	  [save_LDFLAGS="$LDFLAGS"
+	   LDFLAGS="$LDFLAGS -shared ${wl}-exported_symbol ${wl}foo ${wl}-update_registry ${wl}/dev/null"
+	   AC_LINK_IFELSE(
+	     [AC_LANG_SOURCE(
+	        [AC_LANG_CASE([C], [[int foo (void) { return 0; }]],
+			      [C++], [[int foo (void) { return 0; }]],
+			      [Fortran 77], [[
+      subroutine foo
+      end]],
+			      [Fortran], [[
+      subroutine foo
+      end]])])],
+	      [lt_cv_irix_exported_symbol=yes],
+	      [lt_cv_irix_exported_symbol=no])
+           LDFLAGS="$save_LDFLAGS"])
+	if test "$lt_cv_irix_exported_symbol" = yes; then
+          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations ${wl}-exports_file ${wl}$export_symbols -o $lib'
+	fi
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -exports_file $export_symbols -o $lib'
+      fi
+      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+      _LT_TAGVAR(inherit_rpath, $1)=yes
+      _LT_TAGVAR(link_all_deplibs, $1)=yes
+      ;;
+
+    netbsd* | netbsdelf*-gnu)
+      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+      fi
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    newsos6)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    *nto* | *qnx*)
+      ;;
+
+    openbsd*)
+      if test -f /usr/libexec/ld.so; then
+	_LT_TAGVAR(hardcode_direct, $1)=yes
+	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags ${wl}-retain-symbols-file,$export_symbols'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	else
+	  case $host_os in
+	   openbsd[[01]].* | openbsd2.[[0-7]] | openbsd2.[[0-7]].*)
+	     _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+	     ;;
+	   *)
+	     _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	     ;;
+	  esac
+	fi
+      else
+	_LT_TAGVAR(ld_shlibs, $1)=no
+      fi
+      ;;
+
+    os2*)
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+      _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~echo DATA >> $output_objdir/$libname.def~echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+      _LT_TAGVAR(old_archive_from_new_cmds, $1)='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+      ;;
+
+    osf3*)
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      else
+	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+      fi
+      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+      ;;
+
+    osf4* | osf5*)	# as osf3* with the addition of -msym flag
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $pic_flag $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      else
+	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~
+	$CC -shared${allow_undefined_flag} ${wl}-input ${wl}$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~$RM $lib.exp'
+
+	# Both c and cxx compiler support -rpath directly
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
+      fi
+      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+      ;;
+
+    solaris*)
+      _LT_TAGVAR(no_undefined_flag, $1)=' -z defs'
+      if test "$GCC" = yes; then
+	wlarc='${wl}'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-M ${wl}$lib.exp ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+      else
+	case `$CC -V 2>&1` in
+	*"Compilers 5.0"*)
+	  wlarc=''
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
+	  ;;
+	*)
+	  wlarc='${wl}'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	  $CC -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+	  ;;
+	esac
+      fi
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      case $host_os in
+      solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
+      *)
+	# The compiler driver will combine and reorder linker options,
+	# but understands `-z linker_flag'.  GCC discards it without `$wl',
+	# but is careful enough not to reorder.
+	# Supported since Solaris 2.6 (maybe 2.5.1?)
+	if test "$GCC" = yes; then
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+	else
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
+	fi
+	;;
+      esac
+      _LT_TAGVAR(link_all_deplibs, $1)=yes
+      ;;
+
+    sunos4*)
+      if test "x$host_vendor" = xsequent; then
+	# Use $CC to link under sequent, because it throws in some extra .o
+	# files that make .init and .fini sections work.
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+      fi
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(hardcode_direct, $1)=yes
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    sysv4)
+      case $host_vendor in
+	sni)
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  _LT_TAGVAR(hardcode_direct, $1)=yes # is this really true???
+	;;
+	siemens)
+	  ## LD is ld it makes a PLAMLIB
+	  ## CC just makes a GrossModule.
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+	  _LT_TAGVAR(reload_cmds, $1)='$CC -r -o $output$reload_objs'
+	  _LT_TAGVAR(hardcode_direct, $1)=no
+        ;;
+	motorola)
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  _LT_TAGVAR(hardcode_direct, $1)=no #Motorola manual says yes, but my tests say they lie
+	;;
+      esac
+      runpath_var='LD_RUN_PATH'
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    sysv4.3*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='-Bexport'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	_LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	runpath_var=LD_RUN_PATH
+	hardcode_runpath_var=yes
+	_LT_TAGVAR(ld_shlibs, $1)=yes
+      fi
+      ;;
+
+    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
+      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    sysv5* | sco3.2v5* | sco5v6*)
+      # Note: We can NOT use -z defs as we might desire, because we do not
+      # link with -lc, and that would cause any symbols used from libc to
+      # always be unresolved, which means just about no library would
+      # ever link correctly.  If we're not using GNU ld we use -z text
+      # though, which does catch some bad symbols but isn't as heavy-handed
+      # as -z defs.
+      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+      _LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
+      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
+      _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
+      _LT_TAGVAR(link_all_deplibs, $1)=yes
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
+      runpath_var='LD_RUN_PATH'
+
+      if test "$GCC" = yes; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      else
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      fi
+      ;;
+
+    uts4*)
+      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      ;;
+
+    *)
+      _LT_TAGVAR(ld_shlibs, $1)=no
+      ;;
+    esac
+
+    if test x$host_vendor = xsni; then
+      case $host in
+      sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Blargedynsym'
+	;;
+      esac
+    fi
+  fi
+])
+AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
+test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
+
+_LT_TAGVAR(with_gnu_ld, $1)=$with_gnu_ld
+
+_LT_DECL([], [libext], [0], [Old archive suffix (normally "a")])dnl
+_LT_DECL([], [shrext_cmds], [1], [Shared library suffix (normally ".so")])dnl
+_LT_DECL([], [extract_expsyms_cmds], [2],
+    [The commands to extract the exported symbol list from a shared archive])
+
+#
+# Do we need to explicitly link libc?
+#
+case "x$_LT_TAGVAR(archive_cmds_need_lc, $1)" in
+x|xyes)
+  # Assume -lc should be added
+  _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
+
+  if test "$enable_shared" = yes && test "$GCC" = yes; then
+    case $_LT_TAGVAR(archive_cmds, $1) in
+    *'~'*)
+      # FIXME: we may have to deal with multi-command sequences.
+      ;;
+    '$CC '*)
+      # Test whether the compiler implicitly links with -lc since on some
+      # systems, -lgcc has to come before -lc. If gcc already passes -lc
+      # to ld, don't add -lc before -lgcc.
+      AC_CACHE_CHECK([whether -lc should be explicitly linked in],
+	[lt_cv_]_LT_TAGVAR(archive_cmds_need_lc, $1),
+	[$RM conftest*
+	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+	if AC_TRY_EVAL(ac_compile) 2>conftest.err; then
+	  soname=conftest
+	  lib=conftest
+	  libobjs=conftest.$ac_objext
+	  deplibs=
+	  wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1)
+	  pic_flag=$_LT_TAGVAR(lt_prog_compiler_pic, $1)
+	  compiler_flags=-v
+	  linker_flags=-v
+	  verstring=
+	  output_objdir=.
+	  libname=conftest
+	  lt_save_allow_undefined_flag=$_LT_TAGVAR(allow_undefined_flag, $1)
+	  _LT_TAGVAR(allow_undefined_flag, $1)=
+	  if AC_TRY_EVAL(_LT_TAGVAR(archive_cmds, $1) 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1)
+	  then
+	    lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+	  else
+	    lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=yes
+	  fi
+	  _LT_TAGVAR(allow_undefined_flag, $1)=$lt_save_allow_undefined_flag
+	else
+	  cat conftest.err 1>&5
+	fi
+	$RM conftest*
+	])
+      _LT_TAGVAR(archive_cmds_need_lc, $1)=$lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)
+      ;;
+    esac
+  fi
+  ;;
+esac
+
+_LT_TAGDECL([build_libtool_need_lc], [archive_cmds_need_lc], [0],
+    [Whether or not to add -lc for building shared libraries])
+_LT_TAGDECL([allow_libtool_libs_with_static_runtimes],
+    [enable_shared_with_static_runtimes], [0],
+    [Whether or not to disallow shared libs when runtime libs are static])
+_LT_TAGDECL([], [export_dynamic_flag_spec], [1],
+    [Compiler flag to allow reflexive dlopens])
+_LT_TAGDECL([], [whole_archive_flag_spec], [1],
+    [Compiler flag to generate shared objects directly from archives])
+_LT_TAGDECL([], [compiler_needs_object], [1],
+    [Whether the compiler copes with passing no objects directly])
+_LT_TAGDECL([], [old_archive_from_new_cmds], [2],
+    [Create an old-style archive from a shared archive])
+_LT_TAGDECL([], [old_archive_from_expsyms_cmds], [2],
+    [Create a temporary old-style archive to link instead of a shared archive])
+_LT_TAGDECL([], [archive_cmds], [2], [Commands used to build a shared archive])
+_LT_TAGDECL([], [archive_expsym_cmds], [2])
+_LT_TAGDECL([], [module_cmds], [2],
+    [Commands used to build a loadable module if different from building
+    a shared archive.])
+_LT_TAGDECL([], [module_expsym_cmds], [2])
+_LT_TAGDECL([], [with_gnu_ld], [1],
+    [Whether we are building with GNU ld or not])
+_LT_TAGDECL([], [allow_undefined_flag], [1],
+    [Flag that allows shared libraries with undefined symbols to be built])
+_LT_TAGDECL([], [no_undefined_flag], [1],
+    [Flag that enforces no undefined symbols])
+_LT_TAGDECL([], [hardcode_libdir_flag_spec], [1],
+    [Flag to hardcode $libdir into a binary during linking.
+    This must work even if $libdir does not exist])
+_LT_TAGDECL([], [hardcode_libdir_separator], [1],
+    [Whether we need a single "-rpath" flag with a separated argument])
+_LT_TAGDECL([], [hardcode_direct], [0],
+    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
+    DIR into the resulting binary])
+_LT_TAGDECL([], [hardcode_direct_absolute], [0],
+    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
+    DIR into the resulting binary and the resulting library dependency is
+    "absolute", i.e impossible to change by setting ${shlibpath_var} if the
+    library is relocated])
+_LT_TAGDECL([], [hardcode_minus_L], [0],
+    [Set to "yes" if using the -LDIR flag during linking hardcodes DIR
+    into the resulting binary])
+_LT_TAGDECL([], [hardcode_shlibpath_var], [0],
+    [Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR
+    into the resulting binary])
+_LT_TAGDECL([], [hardcode_automatic], [0],
+    [Set to "yes" if building a shared library automatically hardcodes DIR
+    into the library and all subsequent libraries and executables linked
+    against it])
+_LT_TAGDECL([], [inherit_rpath], [0],
+    [Set to yes if linker adds runtime paths of dependent libraries
+    to runtime path list])
+_LT_TAGDECL([], [link_all_deplibs], [0],
+    [Whether libtool must link a program against all its dependency libraries])
+_LT_TAGDECL([], [always_export_symbols], [0],
+    [Set to "yes" if exported symbols are required])
+_LT_TAGDECL([], [export_symbols_cmds], [2],
+    [The commands to list exported symbols])
+_LT_TAGDECL([], [exclude_expsyms], [1],
+    [Symbols that should not be listed in the preloaded symbols])
+_LT_TAGDECL([], [include_expsyms], [1],
+    [Symbols that must always be exported])
+_LT_TAGDECL([], [prelink_cmds], [2],
+    [Commands necessary for linking programs (against libraries) with templates])
+_LT_TAGDECL([], [postlink_cmds], [2],
+    [Commands necessary for finishing linking programs])
+_LT_TAGDECL([], [file_list_spec], [1],
+    [Specify filename containing input files])
+dnl FIXME: Not yet implemented
+dnl _LT_TAGDECL([], [thread_safe_flag_spec], [1],
+dnl    [Compiler flag to generate thread safe objects])
+])# _LT_LINKER_SHLIBS
+
+
+# _LT_LANG_C_CONFIG([TAG])
+# ------------------------
+# Ensure that the configuration variables for a C compiler are suitably
+# defined.  These variables are subsequently used by _LT_CONFIG to write
+# the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_C_CONFIG],
+[m4_require([_LT_DECL_EGREP])dnl
+lt_save_CC="$CC"
+AC_LANG_PUSH(C)
+
+# Source file extension for C test sources.
+ac_ext=c
+
+# Object file extension for compiled C test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="int some_variable = 0;"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='int main(){return(0);}'
+
+_LT_TAG_COMPILER
+# Save the default compiler, since it gets overwritten when the other
+# tags are being tested, and _LT_TAGVAR(compiler, []) is a NOP.
+compiler_DEFAULT=$CC
+
+# save warnings/boilerplate of simple test code
+_LT_COMPILER_BOILERPLATE
+_LT_LINKER_BOILERPLATE
+
+## CAVEAT EMPTOR:
+## There is no encapsulation within the following macros, do not change
+## the running order or otherwise move them around unless you know exactly
+## what you are doing...
+if test -n "$compiler"; then
+  _LT_COMPILER_NO_RTTI($1)
+  _LT_COMPILER_PIC($1)
+  _LT_COMPILER_C_O($1)
+  _LT_COMPILER_FILE_LOCKS($1)
+  _LT_LINKER_SHLIBS($1)
+  _LT_SYS_DYNAMIC_LINKER($1)
+  _LT_LINKER_HARDCODE_LIBPATH($1)
+  LT_SYS_DLOPEN_SELF
+  _LT_CMD_STRIPLIB
+
+  # Report which library types will actually be built
+  AC_MSG_CHECKING([if libtool supports shared libraries])
+  AC_MSG_RESULT([$can_build_shared])
+
+  AC_MSG_CHECKING([whether to build shared libraries])
+  test "$can_build_shared" = "no" && enable_shared=no
+
+  # On AIX, shared libraries and static libraries use the same namespace, and
+  # are all built from PIC.
+  case $host_os in
+  aix3*)
+    test "$enable_shared" = yes && enable_static=no
+    if test -n "$RANLIB"; then
+      archive_cmds="$archive_cmds~\$RANLIB \$lib"
+      postinstall_cmds='$RANLIB $lib'
+    fi
+    ;;
+
+  aix[[4-9]]*)
+    if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+      test "$enable_shared" = yes && enable_static=no
+    fi
+    ;;
+  esac
+  AC_MSG_RESULT([$enable_shared])
+
+  AC_MSG_CHECKING([whether to build static libraries])
+  # Make sure either enable_shared or enable_static is yes.
+  test "$enable_shared" = yes || enable_static=yes
+  AC_MSG_RESULT([$enable_static])
+
+  _LT_CONFIG($1)
+fi
+AC_LANG_POP
+CC="$lt_save_CC"
+])# _LT_LANG_C_CONFIG
+
+
+# _LT_LANG_CXX_CONFIG([TAG])
+# --------------------------
+# Ensure that the configuration variables for a C++ compiler are suitably
+# defined.  These variables are subsequently used by _LT_CONFIG to write
+# the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_CXX_CONFIG],
+[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+m4_require([_LT_DECL_EGREP])dnl
+m4_require([_LT_PATH_MANIFEST_TOOL])dnl
+if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
+    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
+    (test "X$CXX" != "Xg++"))) ; then
+  AC_PROG_CXXCPP
+else
+  _lt_caught_CXX_error=yes
+fi
+
+AC_LANG_PUSH(C++)
+_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+_LT_TAGVAR(allow_undefined_flag, $1)=
+_LT_TAGVAR(always_export_symbols, $1)=no
+_LT_TAGVAR(archive_expsym_cmds, $1)=
+_LT_TAGVAR(compiler_needs_object, $1)=no
+_LT_TAGVAR(export_dynamic_flag_spec, $1)=
+_LT_TAGVAR(hardcode_direct, $1)=no
+_LT_TAGVAR(hardcode_direct_absolute, $1)=no
+_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
+_LT_TAGVAR(hardcode_libdir_separator, $1)=
+_LT_TAGVAR(hardcode_minus_L, $1)=no
+_LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
+_LT_TAGVAR(hardcode_automatic, $1)=no
+_LT_TAGVAR(inherit_rpath, $1)=no
+_LT_TAGVAR(module_cmds, $1)=
+_LT_TAGVAR(module_expsym_cmds, $1)=
+_LT_TAGVAR(link_all_deplibs, $1)=unknown
+_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
+_LT_TAGVAR(reload_flag, $1)=$reload_flag
+_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
+_LT_TAGVAR(no_undefined_flag, $1)=
+_LT_TAGVAR(whole_archive_flag_spec, $1)=
+_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
+
+# Source file extension for C++ test sources.
+ac_ext=cpp
+
+# Object file extension for compiled C++ test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# No sense in running all these tests if we already determined that
+# the CXX compiler isn't working.  Some variables (like enable_shared)
+# are currently assumed to apply to all compilers on this platform,
+# and will be corrupted by setting them based on a non-working compiler.
+if test "$_lt_caught_CXX_error" != yes; then
+  # Code to be used in simple compile tests
+  lt_simple_compile_test_code="int some_variable = 0;"
+
+  # Code to be used in simple link tests
+  lt_simple_link_test_code='int main(int, char *[[]]) { return(0); }'
+
+  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
+  _LT_TAG_COMPILER
+
+  # save warnings/boilerplate of simple test code
+  _LT_COMPILER_BOILERPLATE
+  _LT_LINKER_BOILERPLATE
+
+  # Allow CC to be a program name with arguments.
+  lt_save_CC=$CC
+  lt_save_CFLAGS=$CFLAGS
+  lt_save_LD=$LD
+  lt_save_GCC=$GCC
+  GCC=$GXX
+  lt_save_with_gnu_ld=$with_gnu_ld
+  lt_save_path_LD=$lt_cv_path_LD
+  if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then
+    lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx
+  else
+    $as_unset lt_cv_prog_gnu_ld
+  fi
+  if test -n "${lt_cv_path_LDCXX+set}"; then
+    lt_cv_path_LD=$lt_cv_path_LDCXX
+  else
+    $as_unset lt_cv_path_LD
+  fi
+  test -z "${LDCXX+set}" || LD=$LDCXX
+  CC=${CXX-"c++"}
+  CFLAGS=$CXXFLAGS
+  compiler=$CC
+  _LT_TAGVAR(compiler, $1)=$CC
+  _LT_CC_BASENAME([$compiler])
+
+  if test -n "$compiler"; then
+    # We don't want -fno-exception when compiling C++ code, so set the
+    # no_builtin_flag separately
+    if test "$GXX" = yes; then
+      _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin'
+    else
+      _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
+    fi
+
+    if test "$GXX" = yes; then
+      # Set up default GNU C++ configuration
+
+      LT_PATH_LD
+
+      # Check if GNU C++ uses GNU ld as the underlying linker, since the
+      # archiving commands below assume that GNU ld is being used.
+      if test "$with_gnu_ld" = yes; then
+        _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+
+        # If archive_cmds runs LD, not CC, wlarc should be empty
+        # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
+        #     investigate it a little bit more. (MM)
+        wlarc='${wl}'
+
+        # ancient GNU ld didn't support --whole-archive et. al.
+        if eval "`$CC -print-prog-name=ld` --help 2>&1" |
+	  $GREP 'no-whole-archive' > /dev/null; then
+          _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+        else
+          _LT_TAGVAR(whole_archive_flag_spec, $1)=
+        fi
+      else
+        with_gnu_ld=no
+        wlarc=
+
+        # A generic and very simple default shared library creation
+        # command for GNU C++ for the case where it uses the native
+        # linker, instead of GNU ld.  If possible, this setting should
+        # overridden to take advantage of the native linker features on
+        # the platform it is being used on.
+        _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
+      fi
+
+      # Commands to make compiler produce verbose output that lists
+      # what "hidden" libraries, object files and flags are used when
+      # linking a shared library.
+      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+
+    else
+      GXX=no
+      with_gnu_ld=no
+      wlarc=
+    fi
+
+    # PORTME: fill in a description of your system's C++ link characteristics
+    AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries])
+    _LT_TAGVAR(ld_shlibs, $1)=yes
+    case $host_os in
+      aix3*)
+        # FIXME: insert proper C++ library support
+        _LT_TAGVAR(ld_shlibs, $1)=no
+        ;;
+      aix[[4-9]]*)
+        if test "$host_cpu" = ia64; then
+          # On IA64, the linker does run time linking by default, so we don't
+          # have to do anything special.
+          aix_use_runtimelinking=no
+          exp_sym_flag='-Bexport'
+          no_entry_flag=""
+        else
+          aix_use_runtimelinking=no
+
+          # Test if we are trying to use run time linking or normal
+          # AIX style linking. If -brtl is somewhere in LDFLAGS, we
+          # need to do runtime linking.
+          case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
+	    for ld_flag in $LDFLAGS; do
+	      case $ld_flag in
+	      *-brtl*)
+	        aix_use_runtimelinking=yes
+	        break
+	        ;;
+	      esac
+	    done
+	    ;;
+          esac
+
+          exp_sym_flag='-bexport'
+          no_entry_flag='-bnoentry'
+        fi
+
+        # When large executables or shared objects are built, AIX ld can
+        # have problems creating the table of contents.  If linking a library
+        # or program results in "error TOC overflow" add -mminimal-toc to
+        # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+        # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+        _LT_TAGVAR(archive_cmds, $1)=''
+        _LT_TAGVAR(hardcode_direct, $1)=yes
+        _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+        _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
+        _LT_TAGVAR(link_all_deplibs, $1)=yes
+        _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
+
+        if test "$GXX" = yes; then
+          case $host_os in aix4.[[012]]|aix4.[[012]].*)
+          # We only want to do this on AIX 4.2 and lower, the check
+          # below for broken collect2 doesn't work under 4.3+
+	  collect2name=`${CC} -print-prog-name=collect2`
+	  if test -f "$collect2name" &&
+	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
+	  then
+	    # We have reworked collect2
+	    :
+	  else
+	    # We have old collect2
+	    _LT_TAGVAR(hardcode_direct, $1)=unsupported
+	    # It fails to find uninstalled libraries when the uninstalled
+	    # path is not listed in the libpath.  Setting hardcode_minus_L
+	    # to unsupported forces relinking
+	    _LT_TAGVAR(hardcode_minus_L, $1)=yes
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+	    _LT_TAGVAR(hardcode_libdir_separator, $1)=
+	  fi
+          esac
+          shared_flag='-shared'
+	  if test "$aix_use_runtimelinking" = yes; then
+	    shared_flag="$shared_flag "'${wl}-G'
+	  fi
+        else
+          # not using gcc
+          if test "$host_cpu" = ia64; then
+	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
+	  # chokes on -Wl,-G. The following line is correct:
+	  shared_flag='-G'
+          else
+	    if test "$aix_use_runtimelinking" = yes; then
+	      shared_flag='${wl}-G'
+	    else
+	      shared_flag='${wl}-bM:SRE'
+	    fi
+          fi
+        fi
+
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
+        # It seems that -bexpall does not export symbols beginning with
+        # underscore (_), so it is better to generate a list of symbols to
+	# export.
+        _LT_TAGVAR(always_export_symbols, $1)=yes
+        if test "$aix_use_runtimelinking" = yes; then
+          # Warning - without using the other runtime loading flags (-brtl),
+          # -berok will link without error, but may produce a broken library.
+          _LT_TAGVAR(allow_undefined_flag, $1)='-berok'
+          # Determine the default libpath from the value encoded in an empty
+          # executable.
+          _LT_SYS_MODULE_PATH_AIX([$1])
+          _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+
+          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+        else
+          if test "$host_cpu" = ia64; then
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
+	    _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+          else
+	    # Determine the default libpath from the value encoded in an
+	    # empty executable.
+	    _LT_SYS_MODULE_PATH_AIX([$1])
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+	    # Warning - without using the other run time loading flags,
+	    # -berok will link without error, but may produce a broken library.
+	    _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
+	    _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
+	    if test "$with_gnu_ld" = yes; then
+	      # We only use this code for GNU lds that support --whole-archive.
+	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    else
+	      # Exported symbols can be pulled into shared objects from archives
+	      _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
+	    fi
+	    _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
+	    # This is similar to how AIX traditionally builds its shared
+	    # libraries.
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+          fi
+        fi
+        ;;
+
+      beos*)
+	if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	  # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+	  # support --undefined.  This deserves some investigation.  FIXME
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	else
+	  _LT_TAGVAR(ld_shlibs, $1)=no
+	fi
+	;;
+
+      chorus*)
+        case $cc_basename in
+          *)
+	  # FIXME: insert proper C++ library support
+	  _LT_TAGVAR(ld_shlibs, $1)=no
+	  ;;
+        esac
+        ;;
+
+      cygwin* | mingw* | pw32* | cegcc*)
+	case $GXX,$cc_basename in
+	,cl* | no,cl*)
+	  # Native MSVC
+	  # hardcode_libdir_flag_spec is actually meaningless, as there is
+	  # no search path for DLLs.
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
+	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	  _LT_TAGVAR(always_export_symbols, $1)=yes
+	  _LT_TAGVAR(file_list_spec, $1)='@'
+	  # Tell ltmain to make .lib files, not .a files.
+	  libext=lib
+	  # Tell ltmain to make .dll files, not .so files.
+	  shrext_cmds=".dll"
+	  # FIXME: Setting linknames here is a bad hack.
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	      $SED -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
+	    else
+	      $SED -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
+	    fi~
+	    $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+	    linknames='
+	  # The linker will not automatically build a static lib if we build a DLL.
+	  # _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
+	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+	  # Don't use ranlib
+	  _LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
+	  _LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
+	    lt_tool_outputfile="@TOOL_OUTPUT@"~
+	    case $lt_outputfile in
+	      *.exe|*.EXE) ;;
+	      *)
+		lt_outputfile="$lt_outputfile.exe"
+		lt_tool_outputfile="$lt_tool_outputfile.exe"
+		;;
+	    esac~
+	    func_to_tool_file "$lt_outputfile"~
+	    if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
+	      $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+	      $RM "$lt_outputfile.manifest";
+	    fi'
+	  ;;
+	*)
+	  # g++
+	  # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
+	  # as there is no search path for DLLs.
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
+	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	  _LT_TAGVAR(always_export_symbols, $1)=no
+	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+
+	  if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	    # If the export-symbols file already is a .def file (1st line
+	    # is EXPORTS), use it as is; otherwise, prepend...
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	      cp $export_symbols $output_objdir/$soname.def;
+	    else
+	      echo EXPORTS > $output_objdir/$soname.def;
+	      cat $export_symbols >> $output_objdir/$soname.def;
+	    fi~
+	    $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	  else
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	  fi
+	  ;;
+	esac
+	;;
+      darwin* | rhapsody*)
+        _LT_DARWIN_LINKER_FEATURES($1)
+	;;
+
+      dgux*)
+        case $cc_basename in
+          ec++*)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          ghcx*)
+	    # Green Hills C++ Compiler
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+        esac
+        ;;
+
+      freebsd2.*)
+        # C++ shared libraries reported to be fairly broken before
+	# switch to ELF
+        _LT_TAGVAR(ld_shlibs, $1)=no
+        ;;
+
+      freebsd-elf*)
+        _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+        ;;
+
+      freebsd* | dragonfly*)
+        # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF
+        # conventions
+        _LT_TAGVAR(ld_shlibs, $1)=yes
+        ;;
+
+      haiku*)
+        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+        _LT_TAGVAR(link_all_deplibs, $1)=yes
+        ;;
+
+      hpux9*)
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+        _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+        _LT_TAGVAR(hardcode_direct, $1)=yes
+        _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH,
+				             # but as the default
+				             # location of the library.
+
+        case $cc_basename in
+          CC*)
+            # FIXME: insert proper C++ library support
+            _LT_TAGVAR(ld_shlibs, $1)=no
+            ;;
+          aCC*)
+            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            # Commands to make compiler produce verbose output that lists
+            # what "hidden" libraries, object files and flags are used when
+            # linking a shared library.
+            #
+            # There doesn't appear to be a way to prevent this compiler from
+            # explicitly linking system object files so we need to strip them
+            # from the output so that they don't get included in the library
+            # dependencies.
+            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+            ;;
+          *)
+            if test "$GXX" = yes; then
+              _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            else
+              # FIXME: insert proper C++ library support
+              _LT_TAGVAR(ld_shlibs, $1)=no
+            fi
+            ;;
+        esac
+        ;;
+
+      hpux10*|hpux11*)
+        if test $with_gnu_ld = no; then
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+	  _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+          case $host_cpu in
+            hppa*64*|ia64*)
+              ;;
+            *)
+	      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+              ;;
+          esac
+        fi
+        case $host_cpu in
+          hppa*64*|ia64*)
+            _LT_TAGVAR(hardcode_direct, $1)=no
+            _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+            ;;
+          *)
+            _LT_TAGVAR(hardcode_direct, $1)=yes
+            _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+            _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH,
+					         # but as the default
+					         # location of the library.
+            ;;
+        esac
+
+        case $cc_basename in
+          CC*)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          aCC*)
+	    case $host_cpu in
+	      hppa*64*)
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	      ia64*)
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	      *)
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	    esac
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    ;;
+          *)
+	    if test "$GXX" = yes; then
+	      if test $with_gnu_ld = no; then
+	        case $host_cpu in
+	          hppa*64*)
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	          ia64*)
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	          *)
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	        esac
+	      fi
+	    else
+	      # FIXME: insert proper C++ library support
+	      _LT_TAGVAR(ld_shlibs, $1)=no
+	    fi
+	    ;;
+        esac
+        ;;
+
+      interix[[3-9]]*)
+	_LT_TAGVAR(hardcode_direct, $1)=no
+	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
+	# Instead, shared libraries are loaded at an image base (0x10000000 by
+	# default) and relocated if they conflict, which is a slow very memory
+	# consuming and fragmenting process.  To avoid this, we pick a random,
+	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
+	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	;;
+      irix5* | irix6*)
+        case $cc_basename in
+          CC*)
+	    # SGI C++
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+
+	    # Archives containing C++ object files must be created using
+	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
+	    # necessary to make sure instantiated templates are included
+	    # in the archive.
+	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -ar -WR,-u -o $oldlib $oldobjs'
+	    ;;
+          *)
+	    if test "$GXX" = yes; then
+	      if test "$with_gnu_ld" = no; then
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	      else
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
+	      fi
+	    fi
+	    _LT_TAGVAR(link_all_deplibs, $1)=yes
+	    ;;
+        esac
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+        _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+        _LT_TAGVAR(inherit_rpath, $1)=yes
+        ;;
+
+      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+        case $cc_basename in
+          KCC*)
+	    # Kuck and Associates, Inc. (KAI) C++ Compiler
+
+	    # KCC will only create a shared library if the output file
+	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
+	    # to its proper name (with version) after linking.
+	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+
+	    # Archives containing C++ object files must be created using
+	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
+	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs'
+	    ;;
+	  icpc* | ecpc* )
+	    # Intel C++
+	    with_gnu_ld=yes
+	    # version 8.0 and above of icpc choke on multiply defined symbols
+	    # if we add $predep_objects and $postdep_objects, however 7.1 and
+	    # earlier do not add the objects themselves.
+	    case `$CC -V 2>&1` in
+	      *"Version 7."*)
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+		;;
+	      *)  # Version 8.0 or newer
+	        tmp_idyn=
+	        case $host_cpu in
+		  ia64*) tmp_idyn=' -i_dynamic';;
+		esac
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+		;;
+	    esac
+	    _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    ;;
+          pgCC* | pgcpp*)
+            # Portland Group C++ compiler
+	    case `$CC -V` in
+	    *pgCC\ [[1-5]].* | *pgcpp\ [[1-5]].*)
+	      _LT_TAGVAR(prelink_cmds, $1)='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
+		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
+	      _LT_TAGVAR(old_archive_cmds, $1)='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
+		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
+		$RANLIB $oldlib'
+	      _LT_TAGVAR(archive_cmds, $1)='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+	      ;;
+	    *) # Version 6 and above use weak symbols
+	      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+	      ;;
+	    esac
+
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}--rpath ${wl}$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+            ;;
+	  cxx*)
+	    # Compaq C++
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
+
+	    runpath_var=LD_RUN_PATH
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
+	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
+	    ;;
+	  xl* | mpixl* | bgxl*)
+	    # IBM XL 8.0 on PPC, with GNU ld
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    if test "x$supports_anon_versioning" = xyes; then
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
+		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+		echo "local: *; };" >> $output_objdir/$libname.ver~
+		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+	    fi
+	    ;;
+	  *)
+	    case `$CC -V 2>&1 | sed 5q` in
+	    *Sun\ C*)
+	      # Sun C++ 5.9
+	      _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
+	      _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
+	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	      _LT_TAGVAR(compiler_needs_object, $1)=yes
+
+	      # Not sure whether something based on
+	      # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1
+	      # would be better.
+	      output_verbose_link_cmd='func_echo_all'
+
+	      # Archives containing C++ object files must be created using
+	      # "CC -xar", where "CC" is the Sun C++ compiler.  This is
+	      # necessary to make sure instantiated templates are included
+	      # in the archive.
+	      _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs'
+	      ;;
+	    esac
+	    ;;
+	esac
+	;;
+
+      lynxos*)
+        # FIXME: insert proper C++ library support
+	_LT_TAGVAR(ld_shlibs, $1)=no
+	;;
+
+      m88k*)
+        # FIXME: insert proper C++ library support
+        _LT_TAGVAR(ld_shlibs, $1)=no
+	;;
+
+      mvs*)
+        case $cc_basename in
+          cxx*)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+	  *)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+	esac
+	;;
+
+      netbsd*)
+        if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable  -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags'
+	  wlarc=
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+	  _LT_TAGVAR(hardcode_direct, $1)=yes
+	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	fi
+	# Workaround some broken pre-1.5 toolchains
+	output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"'
+	;;
+
+      *nto* | *qnx*)
+        _LT_TAGVAR(ld_shlibs, $1)=yes
+	;;
+
+      openbsd2*)
+        # C++ shared libraries are fairly broken
+	_LT_TAGVAR(ld_shlibs, $1)=no
+	;;
+
+      openbsd*)
+	if test -f /usr/libexec/ld.so; then
+	  _LT_TAGVAR(hardcode_direct, $1)=yes
+	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+	  fi
+	  output_verbose_link_cmd=func_echo_all
+	else
+	  _LT_TAGVAR(ld_shlibs, $1)=no
+	fi
+	;;
+
+      osf3* | osf4* | osf5*)
+        case $cc_basename in
+          KCC*)
+	    # Kuck and Associates, Inc. (KAI) C++ Compiler
+
+	    # KCC will only create a shared library if the output file
+	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
+	    # to its proper name (with version) after linking.
+	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
+
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+	    # Archives containing C++ object files must be created using
+	    # the KAI C++ compiler.
+	    case $host in
+	      osf3*) _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs' ;;
+	      *) _LT_TAGVAR(old_archive_cmds, $1)='$CC -o $oldlib $oldobjs' ;;
+	    esac
+	    ;;
+          RCC*)
+	    # Rational C++ 2.4.1
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          cxx*)
+	    case $host in
+	      osf3*)
+	        _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+		;;
+	      *)
+	        _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	        _LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
+	          echo "-hidden">> $lib.exp~
+	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
+	          $RM $lib.exp'
+	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
+		;;
+	    esac
+
+	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    ;;
+	  *)
+	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
+	      _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
+	      case $host in
+	        osf3*)
+	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+		  ;;
+	        *)
+	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+		  ;;
+	      esac
+
+	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
+
+	      # Commands to make compiler produce verbose output that lists
+	      # what "hidden" libraries, object files and flags are used when
+	      # linking a shared library.
+	      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+
+	    else
+	      # FIXME: insert proper C++ library support
+	      _LT_TAGVAR(ld_shlibs, $1)=no
+	    fi
+	    ;;
+        esac
+        ;;
+
+      psos*)
+        # FIXME: insert proper C++ library support
+        _LT_TAGVAR(ld_shlibs, $1)=no
+        ;;
+
+      sunos4*)
+        case $cc_basename in
+          CC*)
+	    # Sun C++ 4.x
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          lcc*)
+	    # Lucid
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+        esac
+        ;;
+
+      solaris*)
+        case $cc_basename in
+          CC* | sunCC*)
+	    # Sun C++ 4.2, 5.x and Centerline C++
+            _LT_TAGVAR(archive_cmds_need_lc,$1)=yes
+	    _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
+	    _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	    case $host_os in
+	      solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
+	      *)
+		# The compiler driver will combine and reorder linker options,
+		# but understands `-z linker_flag'.
+	        # Supported since Solaris 2.6 (maybe 2.5.1?)
+		_LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
+	        ;;
+	    esac
+	    _LT_TAGVAR(link_all_deplibs, $1)=yes
+
+	    output_verbose_link_cmd='func_echo_all'
+
+	    # Archives containing C++ object files must be created using
+	    # "CC -xar", where "CC" is the Sun C++ compiler.  This is
+	    # necessary to make sure instantiated templates are included
+	    # in the archive.
+	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs'
+	    ;;
+          gcx*)
+	    # Green Hills C++ Compiler
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+
+	    # The C++ compiler must be used to create the archive.
+	    _LT_TAGVAR(old_archive_cmds, $1)='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
+	    ;;
+          *)
+	    # GNU C++ compiler with Solaris linker
+	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
+	      _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-z ${wl}defs'
+	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+		  $CC -shared $pic_flag -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+
+	        # Commands to make compiler produce verbose output that lists
+	        # what "hidden" libraries, object files and flags are used when
+	        # linking a shared library.
+	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+	      else
+	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
+	        # platform.
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+
+	        # Commands to make compiler produce verbose output that lists
+	        # what "hidden" libraries, object files and flags are used when
+	        # linking a shared library.
+	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+	      fi
+
+	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $wl$libdir'
+	      case $host_os in
+		solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
+		*)
+		  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+		  ;;
+	      esac
+	    fi
+	    ;;
+        esac
+        ;;
+
+    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
+      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
+      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+      runpath_var='LD_RUN_PATH'
+
+      case $cc_basename in
+        CC*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+      esac
+      ;;
+
+      sysv5* | sco3.2v5* | sco5v6*)
+	# Note: We can NOT use -z defs as we might desire, because we do not
+	# link with -lc, and that would cause any symbols used from libc to
+	# always be unresolved, which means just about no library would
+	# ever link correctly.  If we're not using GNU ld we use -z text
+	# though, which does catch some bad symbols but isn't as heavy-handed
+	# as -z defs.
+	_LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+	_LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
+	_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
+	_LT_TAGVAR(hardcode_libdir_separator, $1)=':'
+	_LT_TAGVAR(link_all_deplibs, $1)=yes
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
+	runpath_var='LD_RUN_PATH'
+
+	case $cc_basename in
+          CC*)
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -Tprelink_objects $oldobjs~
+	      '"$_LT_TAGVAR(old_archive_cmds, $1)"
+	    _LT_TAGVAR(reload_cmds, $1)='$CC -Tprelink_objects $reload_objs~
+	      '"$_LT_TAGVAR(reload_cmds, $1)"
+	    ;;
+	  *)
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    ;;
+	esac
+      ;;
+
+      tandem*)
+        case $cc_basename in
+          NCC*)
+	    # NonStop-UX NCC 3.20
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    _LT_TAGVAR(ld_shlibs, $1)=no
+	    ;;
+        esac
+        ;;
+
+      vxworks*)
+        # FIXME: insert proper C++ library support
+        _LT_TAGVAR(ld_shlibs, $1)=no
+        ;;
+
+      *)
+        # FIXME: insert proper C++ library support
+        _LT_TAGVAR(ld_shlibs, $1)=no
+        ;;
+    esac
+
+    AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
+    test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
+
+    _LT_TAGVAR(GCC, $1)="$GXX"
+    _LT_TAGVAR(LD, $1)="$LD"
+
+    ## CAVEAT EMPTOR:
+    ## There is no encapsulation within the following macros, do not change
+    ## the running order or otherwise move them around unless you know exactly
+    ## what you are doing...
+    _LT_SYS_HIDDEN_LIBDEPS($1)
+    _LT_COMPILER_PIC($1)
+    _LT_COMPILER_C_O($1)
+    _LT_COMPILER_FILE_LOCKS($1)
+    _LT_LINKER_SHLIBS($1)
+    _LT_SYS_DYNAMIC_LINKER($1)
+    _LT_LINKER_HARDCODE_LIBPATH($1)
+
+    _LT_CONFIG($1)
+  fi # test -n "$compiler"
+
+  CC=$lt_save_CC
+  CFLAGS=$lt_save_CFLAGS
+  LDCXX=$LD
+  LD=$lt_save_LD
+  GCC=$lt_save_GCC
+  with_gnu_ld=$lt_save_with_gnu_ld
+  lt_cv_path_LDCXX=$lt_cv_path_LD
+  lt_cv_path_LD=$lt_save_path_LD
+  lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
+  lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
+fi # test "$_lt_caught_CXX_error" != yes
+
+AC_LANG_POP
+])# _LT_LANG_CXX_CONFIG
+
+
+# _LT_FUNC_STRIPNAME_CNF
+# ----------------------
+# func_stripname_cnf prefix suffix name
+# strip PREFIX and SUFFIX off of NAME.
+# PREFIX and SUFFIX must not contain globbing or regex special
+# characters, hashes, percent signs, but SUFFIX may contain a leading
+# dot (in which case that matches only a dot).
+#
+# This function is identical to the (non-XSI) version of func_stripname,
+# except this one can be used by m4 code that may be executed by configure,
+# rather than the libtool script.
+m4_defun([_LT_FUNC_STRIPNAME_CNF],[dnl
+AC_REQUIRE([_LT_DECL_SED])
+AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])
+func_stripname_cnf ()
+{
+  case ${2} in
+  .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
+  *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
+  esac
+} # func_stripname_cnf
+])# _LT_FUNC_STRIPNAME_CNF
+
+# _LT_SYS_HIDDEN_LIBDEPS([TAGNAME])
+# ---------------------------------
+# Figure out "hidden" library dependencies from verbose
+# compiler output when linking a shared library.
+# Parse the compiler output and extract the necessary
+# objects, libraries and library flags.
+m4_defun([_LT_SYS_HIDDEN_LIBDEPS],
+[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
+AC_REQUIRE([_LT_FUNC_STRIPNAME_CNF])dnl
+# Dependencies to place before and after the object being linked:
+_LT_TAGVAR(predep_objects, $1)=
+_LT_TAGVAR(postdep_objects, $1)=
+_LT_TAGVAR(predeps, $1)=
+_LT_TAGVAR(postdeps, $1)=
+_LT_TAGVAR(compiler_lib_search_path, $1)=
+
+dnl we can't use the lt_simple_compile_test_code here,
+dnl because it contains code intended for an executable,
+dnl not a library.  It's possible we should let each
+dnl tag define a new lt_????_link_test_code variable,
+dnl but it's only used here...
+m4_if([$1], [], [cat > conftest.$ac_ext <<_LT_EOF
+int a;
+void foo (void) { a = 0; }
+_LT_EOF
+], [$1], [CXX], [cat > conftest.$ac_ext <<_LT_EOF
+class Foo
+{
+public:
+  Foo (void) { a = 0; }
+private:
+  int a;
+};
+_LT_EOF
+], [$1], [F77], [cat > conftest.$ac_ext <<_LT_EOF
+      subroutine foo
+      implicit none
+      integer*4 a
+      a=0
+      return
+      end
+_LT_EOF
+], [$1], [FC], [cat > conftest.$ac_ext <<_LT_EOF
+      subroutine foo
+      implicit none
+      integer a
+      a=0
+      return
+      end
+_LT_EOF
+], [$1], [GCJ], [cat > conftest.$ac_ext <<_LT_EOF
+public class foo {
+  private int a;
+  public void bar (void) {
+    a = 0;
+  }
+};
+_LT_EOF
+], [$1], [GO], [cat > conftest.$ac_ext <<_LT_EOF
+package foo
+func foo() {
+}
+_LT_EOF
+])
+
+_lt_libdeps_save_CFLAGS=$CFLAGS
+case "$CC $CFLAGS " in #(
+*\ -flto*\ *) CFLAGS="$CFLAGS -fno-lto" ;;
+*\ -fwhopr*\ *) CFLAGS="$CFLAGS -fno-whopr" ;;
+*\ -fuse-linker-plugin*\ *) CFLAGS="$CFLAGS -fno-use-linker-plugin" ;;
+esac
+
+dnl Parse the compiler output and extract the necessary
+dnl objects, libraries and library flags.
+if AC_TRY_EVAL(ac_compile); then
+  # Parse the compiler output and extract the necessary
+  # objects, libraries and library flags.
+
+  # Sentinel used to keep track of whether or not we are before
+  # the conftest object file.
+  pre_test_object_deps_done=no
+
+  for p in `eval "$output_verbose_link_cmd"`; do
+    case ${prev}${p} in
+
+    -L* | -R* | -l*)
+       # Some compilers place space between "-{L,R}" and the path.
+       # Remove the space.
+       if test $p = "-L" ||
+          test $p = "-R"; then
+	 prev=$p
+	 continue
+       fi
+
+       # Expand the sysroot to ease extracting the directories later.
+       if test -z "$prev"; then
+         case $p in
+         -L*) func_stripname_cnf '-L' '' "$p"; prev=-L; p=$func_stripname_result ;;
+         -R*) func_stripname_cnf '-R' '' "$p"; prev=-R; p=$func_stripname_result ;;
+         -l*) func_stripname_cnf '-l' '' "$p"; prev=-l; p=$func_stripname_result ;;
+         esac
+       fi
+       case $p in
+       =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
+       esac
+       if test "$pre_test_object_deps_done" = no; then
+	 case ${prev} in
+	 -L | -R)
+	   # Internal compiler library paths should come after those
+	   # provided the user.  The postdeps already come after the
+	   # user supplied libs so there is no need to process them.
+	   if test -z "$_LT_TAGVAR(compiler_lib_search_path, $1)"; then
+	     _LT_TAGVAR(compiler_lib_search_path, $1)="${prev}${p}"
+	   else
+	     _LT_TAGVAR(compiler_lib_search_path, $1)="${_LT_TAGVAR(compiler_lib_search_path, $1)} ${prev}${p}"
+	   fi
+	   ;;
+	 # The "-l" case would never come before the object being
+	 # linked, so don't bother handling this case.
+	 esac
+       else
+	 if test -z "$_LT_TAGVAR(postdeps, $1)"; then
+	   _LT_TAGVAR(postdeps, $1)="${prev}${p}"
+	 else
+	   _LT_TAGVAR(postdeps, $1)="${_LT_TAGVAR(postdeps, $1)} ${prev}${p}"
+	 fi
+       fi
+       prev=
+       ;;
+
+    *.lto.$objext) ;; # Ignore GCC LTO objects
+    *.$objext)
+       # This assumes that the test object file only shows up
+       # once in the compiler output.
+       if test "$p" = "conftest.$objext"; then
+	 pre_test_object_deps_done=yes
+	 continue
+       fi
+
+       if test "$pre_test_object_deps_done" = no; then
+	 if test -z "$_LT_TAGVAR(predep_objects, $1)"; then
+	   _LT_TAGVAR(predep_objects, $1)="$p"
+	 else
+	   _LT_TAGVAR(predep_objects, $1)="$_LT_TAGVAR(predep_objects, $1) $p"
+	 fi
+       else
+	 if test -z "$_LT_TAGVAR(postdep_objects, $1)"; then
+	   _LT_TAGVAR(postdep_objects, $1)="$p"
+	 else
+	   _LT_TAGVAR(postdep_objects, $1)="$_LT_TAGVAR(postdep_objects, $1) $p"
+	 fi
+       fi
+       ;;
+
+    *) ;; # Ignore the rest.
+
+    esac
+  done
+
+  # Clean up.
+  rm -f a.out a.exe
+else
+  echo "libtool.m4: error: problem compiling $1 test program"
+fi
+
+$RM -f confest.$objext
+CFLAGS=$_lt_libdeps_save_CFLAGS
+
+# PORTME: override above test on systems where it is broken
+m4_if([$1], [CXX],
+[case $host_os in
+interix[[3-9]]*)
+  # Interix 3.5 installs completely hosed .la files for C++, so rather than
+  # hack all around it, let's just trust "g++" to DTRT.
+  _LT_TAGVAR(predep_objects,$1)=
+  _LT_TAGVAR(postdep_objects,$1)=
+  _LT_TAGVAR(postdeps,$1)=
+  ;;
+
+linux*)
+  case `$CC -V 2>&1 | sed 5q` in
+  *Sun\ C*)
+    # Sun C++ 5.9
+
+    # The more standards-conforming stlport4 library is
+    # incompatible with the Cstd library. Avoid specifying
+    # it if it's in CXXFLAGS. Ignore libCrun as
+    # -library=stlport4 depends on it.
+    case " $CXX $CXXFLAGS " in
+    *" -library=stlport4 "*)
+      solaris_use_stlport4=yes
+      ;;
+    esac
+
+    if test "$solaris_use_stlport4" != yes; then
+      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
+    fi
+    ;;
+  esac
+  ;;
+
+solaris*)
+  case $cc_basename in
+  CC* | sunCC*)
+    # The more standards-conforming stlport4 library is
+    # incompatible with the Cstd library. Avoid specifying
+    # it if it's in CXXFLAGS. Ignore libCrun as
+    # -library=stlport4 depends on it.
+    case " $CXX $CXXFLAGS " in
+    *" -library=stlport4 "*)
+      solaris_use_stlport4=yes
+      ;;
+    esac
+
+    # Adding this requires a known-good setup of shared libraries for
+    # Sun compiler versions before 5.6, else PIC objects from an old
+    # archive will be linked into the output, leading to subtle bugs.
+    if test "$solaris_use_stlport4" != yes; then
+      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
+    fi
+    ;;
+  esac
+  ;;
+esac
+])
+
+case " $_LT_TAGVAR(postdeps, $1) " in
+*" -lc "*) _LT_TAGVAR(archive_cmds_need_lc, $1)=no ;;
+esac
+ _LT_TAGVAR(compiler_lib_search_dirs, $1)=
+if test -n "${_LT_TAGVAR(compiler_lib_search_path, $1)}"; then
+ _LT_TAGVAR(compiler_lib_search_dirs, $1)=`echo " ${_LT_TAGVAR(compiler_lib_search_path, $1)}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
+fi
+_LT_TAGDECL([], [compiler_lib_search_dirs], [1],
+    [The directories searched by this compiler when creating a shared library])
+_LT_TAGDECL([], [predep_objects], [1],
+    [Dependencies to place before and after the objects being linked to
+    create a shared library])
+_LT_TAGDECL([], [postdep_objects], [1])
+_LT_TAGDECL([], [predeps], [1])
+_LT_TAGDECL([], [postdeps], [1])
+_LT_TAGDECL([], [compiler_lib_search_path], [1],
+    [The library search path used internally by the compiler when linking
+    a shared library])
+])# _LT_SYS_HIDDEN_LIBDEPS
+
+
+# _LT_LANG_F77_CONFIG([TAG])
+# --------------------------
+# Ensure that the configuration variables for a Fortran 77 compiler are
+# suitably defined.  These variables are subsequently used by _LT_CONFIG
+# to write the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_F77_CONFIG],
+[AC_LANG_PUSH(Fortran 77)
+if test -z "$F77" || test "X$F77" = "Xno"; then
+  _lt_disable_F77=yes
+fi
+
+_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+_LT_TAGVAR(allow_undefined_flag, $1)=
+_LT_TAGVAR(always_export_symbols, $1)=no
+_LT_TAGVAR(archive_expsym_cmds, $1)=
+_LT_TAGVAR(export_dynamic_flag_spec, $1)=
+_LT_TAGVAR(hardcode_direct, $1)=no
+_LT_TAGVAR(hardcode_direct_absolute, $1)=no
+_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
+_LT_TAGVAR(hardcode_libdir_separator, $1)=
+_LT_TAGVAR(hardcode_minus_L, $1)=no
+_LT_TAGVAR(hardcode_automatic, $1)=no
+_LT_TAGVAR(inherit_rpath, $1)=no
+_LT_TAGVAR(module_cmds, $1)=
+_LT_TAGVAR(module_expsym_cmds, $1)=
+_LT_TAGVAR(link_all_deplibs, $1)=unknown
+_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
+_LT_TAGVAR(reload_flag, $1)=$reload_flag
+_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
+_LT_TAGVAR(no_undefined_flag, $1)=
+_LT_TAGVAR(whole_archive_flag_spec, $1)=
+_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
+
+# Source file extension for f77 test sources.
+ac_ext=f
+
+# Object file extension for compiled f77 test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# No sense in running all these tests if we already determined that
+# the F77 compiler isn't working.  Some variables (like enable_shared)
+# are currently assumed to apply to all compilers on this platform,
+# and will be corrupted by setting them based on a non-working compiler.
+if test "$_lt_disable_F77" != yes; then
+  # Code to be used in simple compile tests
+  lt_simple_compile_test_code="\
+      subroutine t
+      return
+      end
+"
+
+  # Code to be used in simple link tests
+  lt_simple_link_test_code="\
+      program t
+      end
+"
+
+  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
+  _LT_TAG_COMPILER
+
+  # save warnings/boilerplate of simple test code
+  _LT_COMPILER_BOILERPLATE
+  _LT_LINKER_BOILERPLATE
+
+  # Allow CC to be a program name with arguments.
+  lt_save_CC="$CC"
+  lt_save_GCC=$GCC
+  lt_save_CFLAGS=$CFLAGS
+  CC=${F77-"f77"}
+  CFLAGS=$FFLAGS
+  compiler=$CC
+  _LT_TAGVAR(compiler, $1)=$CC
+  _LT_CC_BASENAME([$compiler])
+  GCC=$G77
+  if test -n "$compiler"; then
+    AC_MSG_CHECKING([if libtool supports shared libraries])
+    AC_MSG_RESULT([$can_build_shared])
+
+    AC_MSG_CHECKING([whether to build shared libraries])
+    test "$can_build_shared" = "no" && enable_shared=no
+
+    # On AIX, shared libraries and static libraries use the same namespace, and
+    # are all built from PIC.
+    case $host_os in
+      aix3*)
+        test "$enable_shared" = yes && enable_static=no
+        if test -n "$RANLIB"; then
+          archive_cmds="$archive_cmds~\$RANLIB \$lib"
+          postinstall_cmds='$RANLIB $lib'
+        fi
+        ;;
+      aix[[4-9]]*)
+	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+	  test "$enable_shared" = yes && enable_static=no
+	fi
+        ;;
+    esac
+    AC_MSG_RESULT([$enable_shared])
+
+    AC_MSG_CHECKING([whether to build static libraries])
+    # Make sure either enable_shared or enable_static is yes.
+    test "$enable_shared" = yes || enable_static=yes
+    AC_MSG_RESULT([$enable_static])
+
+    _LT_TAGVAR(GCC, $1)="$G77"
+    _LT_TAGVAR(LD, $1)="$LD"
+
+    ## CAVEAT EMPTOR:
+    ## There is no encapsulation within the following macros, do not change
+    ## the running order or otherwise move them around unless you know exactly
+    ## what you are doing...
+    _LT_COMPILER_PIC($1)
+    _LT_COMPILER_C_O($1)
+    _LT_COMPILER_FILE_LOCKS($1)
+    _LT_LINKER_SHLIBS($1)
+    _LT_SYS_DYNAMIC_LINKER($1)
+    _LT_LINKER_HARDCODE_LIBPATH($1)
+
+    _LT_CONFIG($1)
+  fi # test -n "$compiler"
+
+  GCC=$lt_save_GCC
+  CC="$lt_save_CC"
+  CFLAGS="$lt_save_CFLAGS"
+fi # test "$_lt_disable_F77" != yes
+
+AC_LANG_POP
+])# _LT_LANG_F77_CONFIG
+
+
+# _LT_LANG_FC_CONFIG([TAG])
+# -------------------------
+# Ensure that the configuration variables for a Fortran compiler are
+# suitably defined.  These variables are subsequently used by _LT_CONFIG
+# to write the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_FC_CONFIG],
+[AC_LANG_PUSH(Fortran)
+
+if test -z "$FC" || test "X$FC" = "Xno"; then
+  _lt_disable_FC=yes
+fi
+
+_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+_LT_TAGVAR(allow_undefined_flag, $1)=
+_LT_TAGVAR(always_export_symbols, $1)=no
+_LT_TAGVAR(archive_expsym_cmds, $1)=
+_LT_TAGVAR(export_dynamic_flag_spec, $1)=
+_LT_TAGVAR(hardcode_direct, $1)=no
+_LT_TAGVAR(hardcode_direct_absolute, $1)=no
+_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
+_LT_TAGVAR(hardcode_libdir_separator, $1)=
+_LT_TAGVAR(hardcode_minus_L, $1)=no
+_LT_TAGVAR(hardcode_automatic, $1)=no
+_LT_TAGVAR(inherit_rpath, $1)=no
+_LT_TAGVAR(module_cmds, $1)=
+_LT_TAGVAR(module_expsym_cmds, $1)=
+_LT_TAGVAR(link_all_deplibs, $1)=unknown
+_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
+_LT_TAGVAR(reload_flag, $1)=$reload_flag
+_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
+_LT_TAGVAR(no_undefined_flag, $1)=
+_LT_TAGVAR(whole_archive_flag_spec, $1)=
+_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
+
+# Source file extension for fc test sources.
+ac_ext=${ac_fc_srcext-f}
+
+# Object file extension for compiled fc test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# No sense in running all these tests if we already determined that
+# the FC compiler isn't working.  Some variables (like enable_shared)
+# are currently assumed to apply to all compilers on this platform,
+# and will be corrupted by setting them based on a non-working compiler.
+if test "$_lt_disable_FC" != yes; then
+  # Code to be used in simple compile tests
+  lt_simple_compile_test_code="\
+      subroutine t
+      return
+      end
+"
+
+  # Code to be used in simple link tests
+  lt_simple_link_test_code="\
+      program t
+      end
+"
+
+  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
+  _LT_TAG_COMPILER
+
+  # save warnings/boilerplate of simple test code
+  _LT_COMPILER_BOILERPLATE
+  _LT_LINKER_BOILERPLATE
+
+  # Allow CC to be a program name with arguments.
+  lt_save_CC="$CC"
+  lt_save_GCC=$GCC
+  lt_save_CFLAGS=$CFLAGS
+  CC=${FC-"f95"}
+  CFLAGS=$FCFLAGS
+  compiler=$CC
+  GCC=$ac_cv_fc_compiler_gnu
+
+  _LT_TAGVAR(compiler, $1)=$CC
+  _LT_CC_BASENAME([$compiler])
+
+  if test -n "$compiler"; then
+    AC_MSG_CHECKING([if libtool supports shared libraries])
+    AC_MSG_RESULT([$can_build_shared])
+
+    AC_MSG_CHECKING([whether to build shared libraries])
+    test "$can_build_shared" = "no" && enable_shared=no
+
+    # On AIX, shared libraries and static libraries use the same namespace, and
+    # are all built from PIC.
+    case $host_os in
+      aix3*)
+        test "$enable_shared" = yes && enable_static=no
+        if test -n "$RANLIB"; then
+          archive_cmds="$archive_cmds~\$RANLIB \$lib"
+          postinstall_cmds='$RANLIB $lib'
+        fi
+        ;;
+      aix[[4-9]]*)
+	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+	  test "$enable_shared" = yes && enable_static=no
+	fi
+        ;;
+    esac
+    AC_MSG_RESULT([$enable_shared])
+
+    AC_MSG_CHECKING([whether to build static libraries])
+    # Make sure either enable_shared or enable_static is yes.
+    test "$enable_shared" = yes || enable_static=yes
+    AC_MSG_RESULT([$enable_static])
+
+    _LT_TAGVAR(GCC, $1)="$ac_cv_fc_compiler_gnu"
+    _LT_TAGVAR(LD, $1)="$LD"
+
+    ## CAVEAT EMPTOR:
+    ## There is no encapsulation within the following macros, do not change
+    ## the running order or otherwise move them around unless you know exactly
+    ## what you are doing...
+    _LT_SYS_HIDDEN_LIBDEPS($1)
+    _LT_COMPILER_PIC($1)
+    _LT_COMPILER_C_O($1)
+    _LT_COMPILER_FILE_LOCKS($1)
+    _LT_LINKER_SHLIBS($1)
+    _LT_SYS_DYNAMIC_LINKER($1)
+    _LT_LINKER_HARDCODE_LIBPATH($1)
+
+    _LT_CONFIG($1)
+  fi # test -n "$compiler"
+
+  GCC=$lt_save_GCC
+  CC=$lt_save_CC
+  CFLAGS=$lt_save_CFLAGS
+fi # test "$_lt_disable_FC" != yes
+
+AC_LANG_POP
+])# _LT_LANG_FC_CONFIG
+
+
+# _LT_LANG_GCJ_CONFIG([TAG])
+# --------------------------
+# Ensure that the configuration variables for the GNU Java Compiler compiler
+# are suitably defined.  These variables are subsequently used by _LT_CONFIG
+# to write the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_GCJ_CONFIG],
+[AC_REQUIRE([LT_PROG_GCJ])dnl
+AC_LANG_SAVE
+
+# Source file extension for Java test sources.
+ac_ext=java
+
+# Object file extension for compiled Java test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="class foo {}"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='public class conftest { public static void main(String[[]] argv) {}; }'
+
+# ltmain only uses $CC for tagged configurations so make sure $CC is set.
+_LT_TAG_COMPILER
+
+# save warnings/boilerplate of simple test code
+_LT_COMPILER_BOILERPLATE
+_LT_LINKER_BOILERPLATE
+
+# Allow CC to be a program name with arguments.
+lt_save_CC=$CC
+lt_save_CFLAGS=$CFLAGS
+lt_save_GCC=$GCC
+GCC=yes
+CC=${GCJ-"gcj"}
+CFLAGS=$GCJFLAGS
+compiler=$CC
+_LT_TAGVAR(compiler, $1)=$CC
+_LT_TAGVAR(LD, $1)="$LD"
+_LT_CC_BASENAME([$compiler])
+
+# GCJ did not exist at the time GCC didn't implicitly link libc in.
+_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+
+_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
+_LT_TAGVAR(reload_flag, $1)=$reload_flag
+_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
+
+## CAVEAT EMPTOR:
+## There is no encapsulation within the following macros, do not change
+## the running order or otherwise move them around unless you know exactly
+## what you are doing...
+if test -n "$compiler"; then
+  _LT_COMPILER_NO_RTTI($1)
+  _LT_COMPILER_PIC($1)
+  _LT_COMPILER_C_O($1)
+  _LT_COMPILER_FILE_LOCKS($1)
+  _LT_LINKER_SHLIBS($1)
+  _LT_LINKER_HARDCODE_LIBPATH($1)
+
+  _LT_CONFIG($1)
+fi
+
+AC_LANG_RESTORE
+
+GCC=$lt_save_GCC
+CC=$lt_save_CC
+CFLAGS=$lt_save_CFLAGS
+])# _LT_LANG_GCJ_CONFIG
+
+
+# _LT_LANG_GO_CONFIG([TAG])
+# --------------------------
+# Ensure that the configuration variables for the GNU Go compiler
+# are suitably defined.  These variables are subsequently used by _LT_CONFIG
+# to write the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_GO_CONFIG],
+[AC_REQUIRE([LT_PROG_GO])dnl
+AC_LANG_SAVE
+
+# Source file extension for Go test sources.
+ac_ext=go
+
+# Object file extension for compiled Go test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="package main; func main() { }"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='package main; func main() { }'
+
+# ltmain only uses $CC for tagged configurations so make sure $CC is set.
+_LT_TAG_COMPILER
+
+# save warnings/boilerplate of simple test code
+_LT_COMPILER_BOILERPLATE
+_LT_LINKER_BOILERPLATE
+
+# Allow CC to be a program name with arguments.
+lt_save_CC=$CC
+lt_save_CFLAGS=$CFLAGS
+lt_save_GCC=$GCC
+GCC=yes
+CC=${GOC-"gccgo"}
+CFLAGS=$GOFLAGS
+compiler=$CC
+_LT_TAGVAR(compiler, $1)=$CC
+_LT_TAGVAR(LD, $1)="$LD"
+_LT_CC_BASENAME([$compiler])
+
+# Go did not exist at the time GCC didn't implicitly link libc in.
+_LT_TAGVAR(archive_cmds_need_lc, $1)=no
+
+_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
+_LT_TAGVAR(reload_flag, $1)=$reload_flag
+_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
+
+## CAVEAT EMPTOR:
+## There is no encapsulation within the following macros, do not change
+## the running order or otherwise move them around unless you know exactly
+## what you are doing...
+if test -n "$compiler"; then
+  _LT_COMPILER_NO_RTTI($1)
+  _LT_COMPILER_PIC($1)
+  _LT_COMPILER_C_O($1)
+  _LT_COMPILER_FILE_LOCKS($1)
+  _LT_LINKER_SHLIBS($1)
+  _LT_LINKER_HARDCODE_LIBPATH($1)
+
+  _LT_CONFIG($1)
+fi
+
+AC_LANG_RESTORE
+
+GCC=$lt_save_GCC
+CC=$lt_save_CC
+CFLAGS=$lt_save_CFLAGS
+])# _LT_LANG_GO_CONFIG
+
+
+# _LT_LANG_RC_CONFIG([TAG])
+# -------------------------
+# Ensure that the configuration variables for the Windows resource compiler
+# are suitably defined.  These variables are subsequently used by _LT_CONFIG
+# to write the compiler configuration to `libtool'.
+m4_defun([_LT_LANG_RC_CONFIG],
+[AC_REQUIRE([LT_PROG_RC])dnl
+AC_LANG_SAVE
+
+# Source file extension for RC test sources.
+ac_ext=rc
+
+# Object file extension for compiled RC test sources.
+objext=o
+_LT_TAGVAR(objext, $1)=$objext
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code='sample MENU { MENUITEM "&Soup", 100, CHECKED }'
+
+# Code to be used in simple link tests
+lt_simple_link_test_code="$lt_simple_compile_test_code"
+
+# ltmain only uses $CC for tagged configurations so make sure $CC is set.
+_LT_TAG_COMPILER
+
+# save warnings/boilerplate of simple test code
+_LT_COMPILER_BOILERPLATE
+_LT_LINKER_BOILERPLATE
+
+# Allow CC to be a program name with arguments.
+lt_save_CC="$CC"
+lt_save_CFLAGS=$CFLAGS
+lt_save_GCC=$GCC
+GCC=
+CC=${RC-"windres"}
+CFLAGS=
+compiler=$CC
+_LT_TAGVAR(compiler, $1)=$CC
+_LT_CC_BASENAME([$compiler])
+_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes
+
+if test -n "$compiler"; then
+  :
+  _LT_CONFIG($1)
+fi
+
+GCC=$lt_save_GCC
+AC_LANG_RESTORE
+CC=$lt_save_CC
+CFLAGS=$lt_save_CFLAGS
+])# _LT_LANG_RC_CONFIG
+
+
+# LT_PROG_GCJ
+# -----------
+AC_DEFUN([LT_PROG_GCJ],
+[m4_ifdef([AC_PROG_GCJ], [AC_PROG_GCJ],
+  [m4_ifdef([A][M_PROG_GCJ], [A][M_PROG_GCJ],
+    [AC_CHECK_TOOL(GCJ, gcj,)
+      test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+      AC_SUBST(GCJFLAGS)])])[]dnl
+])
+
+# Old name:
+AU_ALIAS([LT_AC_PROG_GCJ], [LT_PROG_GCJ])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([LT_AC_PROG_GCJ], [])
+
+
+# LT_PROG_GO
+# ----------
+AC_DEFUN([LT_PROG_GO],
+[AC_CHECK_TOOL(GOC, gccgo,)
+])
+
+
+# LT_PROG_RC
+# ----------
+AC_DEFUN([LT_PROG_RC],
+[AC_CHECK_TOOL(RC, windres,)
+])
+
+# Old name:
+AU_ALIAS([LT_AC_PROG_RC], [LT_PROG_RC])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([LT_AC_PROG_RC], [])
+
+
+# _LT_DECL_EGREP
+# --------------
+# If we don't have a new enough Autoconf to choose the best grep
+# available, choose the one first in the user's PATH.
+m4_defun([_LT_DECL_EGREP],
+[AC_REQUIRE([AC_PROG_EGREP])dnl
+AC_REQUIRE([AC_PROG_FGREP])dnl
+test -z "$GREP" && GREP=grep
+_LT_DECL([], [GREP], [1], [A grep program that handles long lines])
+_LT_DECL([], [EGREP], [1], [An ERE matcher])
+_LT_DECL([], [FGREP], [1], [A literal string matcher])
+dnl Non-bleeding-edge autoconf doesn't subst GREP, so do it here too
+AC_SUBST([GREP])
+])
+
+
+# _LT_DECL_OBJDUMP
+# --------------
+# If we don't have a new enough Autoconf to choose the best objdump
+# available, choose the one first in the user's PATH.
+m4_defun([_LT_DECL_OBJDUMP],
+[AC_CHECK_TOOL(OBJDUMP, objdump, false)
+test -z "$OBJDUMP" && OBJDUMP=objdump
+_LT_DECL([], [OBJDUMP], [1], [An object symbol dumper])
+AC_SUBST([OBJDUMP])
+])
+
+# _LT_DECL_DLLTOOL
+# ----------------
+# Ensure DLLTOOL variable is set.
+m4_defun([_LT_DECL_DLLTOOL],
+[AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+_LT_DECL([], [DLLTOOL], [1], [DLL creation program])
+AC_SUBST([DLLTOOL])
+])
+
+# _LT_DECL_SED
+# ------------
+# Check for a fully-functional sed program, that truncates
+# as few characters as possible.  Prefer GNU sed if found.
+m4_defun([_LT_DECL_SED],
+[AC_PROG_SED
+test -z "$SED" && SED=sed
+Xsed="$SED -e 1s/^X//"
+_LT_DECL([], [SED], [1], [A sed program that does not truncate output])
+_LT_DECL([], [Xsed], ["\$SED -e 1s/^X//"],
+    [Sed that helps us avoid accidentally triggering echo(1) options like -n])
+])# _LT_DECL_SED
+
+m4_ifndef([AC_PROG_SED], [
+############################################################
+# NOTE: This macro has been submitted for inclusion into   #
+#  GNU Autoconf as AC_PROG_SED.  When it is available in   #
+#  a released version of Autoconf we should remove this    #
+#  macro and use it instead.                               #
+############################################################
+
+m4_defun([AC_PROG_SED],
+[AC_MSG_CHECKING([for a sed that does not truncate output])
+AC_CACHE_VAL(lt_cv_path_SED,
+[# Loop through the user's path and test for sed and gsed.
+# Then use that list of sed's as ones to test for truncation.
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for lt_ac_prog in sed gsed; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      if $as_executable_p "$as_dir/$lt_ac_prog$ac_exec_ext"; then
+        lt_ac_sed_list="$lt_ac_sed_list $as_dir/$lt_ac_prog$ac_exec_ext"
+      fi
+    done
+  done
+done
+IFS=$as_save_IFS
+lt_ac_max=0
+lt_ac_count=0
+# Add /usr/xpg4/bin/sed as it is typically found on Solaris
+# along with /bin/sed that truncates output.
+for lt_ac_sed in $lt_ac_sed_list /usr/xpg4/bin/sed; do
+  test ! -f $lt_ac_sed && continue
+  cat /dev/null > conftest.in
+  lt_ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >conftest.in
+  # Check for GNU sed and select it if it is found.
+  if "$lt_ac_sed" --version 2>&1 < /dev/null | grep 'GNU' > /dev/null; then
+    lt_cv_path_SED=$lt_ac_sed
+    break
+  fi
+  while true; do
+    cat conftest.in conftest.in >conftest.tmp
+    mv conftest.tmp conftest.in
+    cp conftest.in conftest.nl
+    echo >>conftest.nl
+    $lt_ac_sed -e 's/a$//' < conftest.nl >conftest.out || break
+    cmp -s conftest.out conftest.nl || break
+    # 10000 chars as input seems more than enough
+    test $lt_ac_count -gt 10 && break
+    lt_ac_count=`expr $lt_ac_count + 1`
+    if test $lt_ac_count -gt $lt_ac_max; then
+      lt_ac_max=$lt_ac_count
+      lt_cv_path_SED=$lt_ac_sed
+    fi
+  done
+done
+])
+SED=$lt_cv_path_SED
+AC_SUBST([SED])
+AC_MSG_RESULT([$SED])
+])#AC_PROG_SED
+])#m4_ifndef
+
+# Old name:
+AU_ALIAS([LT_AC_PROG_SED], [AC_PROG_SED])
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([LT_AC_PROG_SED], [])
+
+
+# _LT_CHECK_SHELL_FEATURES
+# ------------------------
+# Find out whether the shell is Bourne or XSI compatible,
+# or has some other useful features.
+m4_defun([_LT_CHECK_SHELL_FEATURES],
+[AC_MSG_CHECKING([whether the shell understands some XSI constructs])
+# Try some XSI features
+xsi_shell=no
+( _lt_dummy="a/b/c"
+  test "${_lt_dummy##*/},${_lt_dummy%/*},${_lt_dummy#??}"${_lt_dummy%"$_lt_dummy"}, \
+      = c,a/b,b/c, \
+    && eval 'test $(( 1 + 1 )) -eq 2 \
+    && test "${#_lt_dummy}" -eq 5' ) >/dev/null 2>&1 \
+  && xsi_shell=yes
+AC_MSG_RESULT([$xsi_shell])
+_LT_CONFIG_LIBTOOL_INIT([xsi_shell='$xsi_shell'])
+
+AC_MSG_CHECKING([whether the shell understands "+="])
+lt_shell_append=no
+( foo=bar; set foo baz; eval "$[1]+=\$[2]" && test "$foo" = barbaz ) \
+    >/dev/null 2>&1 \
+  && lt_shell_append=yes
+AC_MSG_RESULT([$lt_shell_append])
+_LT_CONFIG_LIBTOOL_INIT([lt_shell_append='$lt_shell_append'])
+
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  lt_unset=unset
+else
+  lt_unset=false
+fi
+_LT_DECL([], [lt_unset], [0], [whether the shell understands "unset"])dnl
+
+# test EBCDIC or ASCII
+case `echo X|tr X '\101'` in
+ A) # ASCII based system
+    # \n is not interpreted correctly by Solaris 8 /usr/ucb/tr
+  lt_SP2NL='tr \040 \012'
+  lt_NL2SP='tr \015\012 \040\040'
+  ;;
+ *) # EBCDIC based system
+  lt_SP2NL='tr \100 \n'
+  lt_NL2SP='tr \r\n \100\100'
+  ;;
+esac
+_LT_DECL([SP2NL], [lt_SP2NL], [1], [turn spaces into newlines])dnl
+_LT_DECL([NL2SP], [lt_NL2SP], [1], [turn newlines into spaces])dnl
+])# _LT_CHECK_SHELL_FEATURES
+
+
+# _LT_PROG_FUNCTION_REPLACE (FUNCNAME, REPLACEMENT-BODY)
+# ------------------------------------------------------
+# In `$cfgfile', look for function FUNCNAME delimited by `^FUNCNAME ()$' and
+# '^} FUNCNAME ', and replace its body with REPLACEMENT-BODY.
+m4_defun([_LT_PROG_FUNCTION_REPLACE],
+[dnl {
+sed -e '/^$1 ()$/,/^} # $1 /c\
+$1 ()\
+{\
+m4_bpatsubsts([$2], [$], [\\], [^\([	 ]\)], [\\\1])
+} # Extended-shell $1 implementation' "$cfgfile" > $cfgfile.tmp \
+  && mv -f "$cfgfile.tmp" "$cfgfile" \
+    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+test 0 -eq $? || _lt_function_replace_fail=:
+])
+
+
+# _LT_PROG_REPLACE_SHELLFNS
+# -------------------------
+# Replace existing portable implementations of several shell functions with
+# equivalent extended shell implementations where those features are available..
+m4_defun([_LT_PROG_REPLACE_SHELLFNS],
+[if test x"$xsi_shell" = xyes; then
+  _LT_PROG_FUNCTION_REPLACE([func_dirname], [dnl
+    case ${1} in
+      */*) func_dirname_result="${1%/*}${2}" ;;
+      *  ) func_dirname_result="${3}" ;;
+    esac])
+
+  _LT_PROG_FUNCTION_REPLACE([func_basename], [dnl
+    func_basename_result="${1##*/}"])
+
+  _LT_PROG_FUNCTION_REPLACE([func_dirname_and_basename], [dnl
+    case ${1} in
+      */*) func_dirname_result="${1%/*}${2}" ;;
+      *  ) func_dirname_result="${3}" ;;
+    esac
+    func_basename_result="${1##*/}"])
+
+  _LT_PROG_FUNCTION_REPLACE([func_stripname], [dnl
+    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
+    # positional parameters, so assign one to ordinary parameter first.
+    func_stripname_result=${3}
+    func_stripname_result=${func_stripname_result#"${1}"}
+    func_stripname_result=${func_stripname_result%"${2}"}])
+
+  _LT_PROG_FUNCTION_REPLACE([func_split_long_opt], [dnl
+    func_split_long_opt_name=${1%%=*}
+    func_split_long_opt_arg=${1#*=}])
+
+  _LT_PROG_FUNCTION_REPLACE([func_split_short_opt], [dnl
+    func_split_short_opt_arg=${1#??}
+    func_split_short_opt_name=${1%"$func_split_short_opt_arg"}])
+
+  _LT_PROG_FUNCTION_REPLACE([func_lo2o], [dnl
+    case ${1} in
+      *.lo) func_lo2o_result=${1%.lo}.${objext} ;;
+      *)    func_lo2o_result=${1} ;;
+    esac])
+
+  _LT_PROG_FUNCTION_REPLACE([func_xform], [    func_xform_result=${1%.*}.lo])
+
+  _LT_PROG_FUNCTION_REPLACE([func_arith], [    func_arith_result=$(( $[*] ))])
+
+  _LT_PROG_FUNCTION_REPLACE([func_len], [    func_len_result=${#1}])
+fi
+
+if test x"$lt_shell_append" = xyes; then
+  _LT_PROG_FUNCTION_REPLACE([func_append], [    eval "${1}+=\\${2}"])
+
+  _LT_PROG_FUNCTION_REPLACE([func_append_quoted], [dnl
+    func_quote_for_eval "${2}"
+dnl m4 expansion turns \\\\ into \\, and then the shell eval turns that into \
+    eval "${1}+=\\\\ \\$func_quote_for_eval_result"])
+
+  # Save a `func_append' function call where possible by direct use of '+='
+  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1+="%g' $cfgfile > $cfgfile.tmp \
+    && mv -f "$cfgfile.tmp" "$cfgfile" \
+      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+  test 0 -eq $? || _lt_function_replace_fail=:
+else
+  # Save a `func_append' function call even when '+=' is not available
+  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1="$\1%g' $cfgfile > $cfgfile.tmp \
+    && mv -f "$cfgfile.tmp" "$cfgfile" \
+      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
+  test 0 -eq $? || _lt_function_replace_fail=:
+fi
+
+if test x"$_lt_function_replace_fail" = x":"; then
+  AC_MSG_WARN([Unable to substitute extended shell functions in $ofile])
+fi
+])
+
+# _LT_PATH_CONVERSION_FUNCTIONS
+# -----------------------------
+# Determine which file name conversion functions should be used by
+# func_to_host_file (and, implicitly, by func_to_host_path).  These are needed
+# for certain cross-compile configurations and native mingw.
+m4_defun([_LT_PATH_CONVERSION_FUNCTIONS],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_MSG_CHECKING([how to convert $build file names to $host format])
+AC_CACHE_VAL(lt_cv_to_host_file_cmd,
+[case $host in
+  *-*-mingw* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_host_file_cmd=func_convert_file_msys_to_w32
+        ;;
+      *-*-cygwin* )
+        lt_cv_to_host_file_cmd=func_convert_file_cygwin_to_w32
+        ;;
+      * ) # otherwise, assume *nix
+        lt_cv_to_host_file_cmd=func_convert_file_nix_to_w32
+        ;;
+    esac
+    ;;
+  *-*-cygwin* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_host_file_cmd=func_convert_file_msys_to_cygwin
+        ;;
+      *-*-cygwin* )
+        lt_cv_to_host_file_cmd=func_convert_file_noop
+        ;;
+      * ) # otherwise, assume *nix
+        lt_cv_to_host_file_cmd=func_convert_file_nix_to_cygwin
+        ;;
+    esac
+    ;;
+  * ) # unhandled hosts (and "normal" native builds)
+    lt_cv_to_host_file_cmd=func_convert_file_noop
+    ;;
+esac
+])
+to_host_file_cmd=$lt_cv_to_host_file_cmd
+AC_MSG_RESULT([$lt_cv_to_host_file_cmd])
+_LT_DECL([to_host_file_cmd], [lt_cv_to_host_file_cmd],
+         [0], [convert $build file names to $host format])dnl
+
+AC_MSG_CHECKING([how to convert $build file names to toolchain format])
+AC_CACHE_VAL(lt_cv_to_tool_file_cmd,
+[#assume ordinary cross tools, or native build.
+lt_cv_to_tool_file_cmd=func_convert_file_noop
+case $host in
+  *-*-mingw* )
+    case $build in
+      *-*-mingw* ) # actually msys
+        lt_cv_to_tool_file_cmd=func_convert_file_msys_to_w32
+        ;;
+    esac
+    ;;
+esac
+])
+to_tool_file_cmd=$lt_cv_to_tool_file_cmd
+AC_MSG_RESULT([$lt_cv_to_tool_file_cmd])
+_LT_DECL([to_tool_file_cmd], [lt_cv_to_tool_file_cmd],
+         [0], [convert $build files to toolchain format])dnl
+])# _LT_PATH_CONVERSION_FUNCTIONS
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ltoptions.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ltoptions.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,384 @@
+# Helper functions for option handling.                    -*- Autoconf -*-
+#
+#   Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation,
+#   Inc.
+#   Written by Gary V. Vaughan, 2004
+#
+# This file is free software; the Free Software Foundation gives
+# unlimited permission to copy and/or distribute it, with or without
+# modifications, as long as this notice is preserved.
+
+# serial 7 ltoptions.m4
+
+# This is to help aclocal find these macros, as it can't see m4_define.
+AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])
+
+
+# _LT_MANGLE_OPTION(MACRO-NAME, OPTION-NAME)
+# ------------------------------------------
+m4_define([_LT_MANGLE_OPTION],
+[[_LT_OPTION_]m4_bpatsubst($1__$2, [[^a-zA-Z0-9_]], [_])])
+
+
+# _LT_SET_OPTION(MACRO-NAME, OPTION-NAME)
+# ---------------------------------------
+# Set option OPTION-NAME for macro MACRO-NAME, and if there is a
+# matching handler defined, dispatch to it.  Other OPTION-NAMEs are
+# saved as a flag.
+m4_define([_LT_SET_OPTION],
+[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
+m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
+        _LT_MANGLE_DEFUN([$1], [$2]),
+    [m4_warning([Unknown $1 option `$2'])])[]dnl
+])
+
+
+# _LT_IF_OPTION(MACRO-NAME, OPTION-NAME, IF-SET, [IF-NOT-SET])
+# ------------------------------------------------------------
+# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
+m4_define([_LT_IF_OPTION],
+[m4_ifdef(_LT_MANGLE_OPTION([$1], [$2]), [$3], [$4])])
+
+
+# _LT_UNLESS_OPTIONS(MACRO-NAME, OPTION-LIST, IF-NOT-SET)
+# -------------------------------------------------------
+# Execute IF-NOT-SET unless all options in OPTION-LIST for MACRO-NAME
+# are set.
+m4_define([_LT_UNLESS_OPTIONS],
+[m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
+	    [m4_ifdef(_LT_MANGLE_OPTION([$1], _LT_Option),
+		      [m4_define([$0_found])])])[]dnl
+m4_ifdef([$0_found], [m4_undefine([$0_found])], [$3
+])[]dnl
+])
+
+
+# _LT_SET_OPTIONS(MACRO-NAME, OPTION-LIST)
+# ----------------------------------------
+# OPTION-LIST is a space-separated list of Libtool options associated
+# with MACRO-NAME.  If any OPTION has a matching handler declared with
+# LT_OPTION_DEFINE, dispatch to that macro; otherwise complain about
+# the unknown option and exit.
+m4_defun([_LT_SET_OPTIONS],
+[# Set options
+m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
+    [_LT_SET_OPTION([$1], _LT_Option)])
+
+m4_if([$1],[LT_INIT],[
+  dnl
+  dnl Simply set some default values (i.e off) if boolean options were not
+  dnl specified:
+  _LT_UNLESS_OPTIONS([LT_INIT], [dlopen], [enable_dlopen=no
+  ])
+  _LT_UNLESS_OPTIONS([LT_INIT], [win32-dll], [enable_win32_dll=no
+  ])
+  dnl
+  dnl If no reference was made to various pairs of opposing options, then
+  dnl we run the default mode handler for the pair.  For example, if neither
+  dnl `shared' nor `disable-shared' was passed, we enable building of shared
+  dnl archives by default:
+  _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
+  _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
+  _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
+  _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
+  		   [_LT_ENABLE_FAST_INSTALL])
+  ])
+])# _LT_SET_OPTIONS
+
+
+## --------------------------------- ##
+## Macros to handle LT_INIT options. ##
+## --------------------------------- ##
+
+# _LT_MANGLE_DEFUN(MACRO-NAME, OPTION-NAME)
+# -----------------------------------------
+m4_define([_LT_MANGLE_DEFUN],
+[[_LT_OPTION_DEFUN_]m4_bpatsubst(m4_toupper([$1__$2]), [[^A-Z0-9_]], [_])])
+
+
+# LT_OPTION_DEFINE(MACRO-NAME, OPTION-NAME, CODE)
+# -----------------------------------------------
+m4_define([LT_OPTION_DEFINE],
+[m4_define(_LT_MANGLE_DEFUN([$1], [$2]), [$3])[]dnl
+])# LT_OPTION_DEFINE
+
+
+# dlopen
+# ------
+LT_OPTION_DEFINE([LT_INIT], [dlopen], [enable_dlopen=yes
+])
+
+AU_DEFUN([AC_LIBTOOL_DLOPEN],
+[_LT_SET_OPTION([LT_INIT], [dlopen])
+AC_DIAGNOSE([obsolete],
+[$0: Remove this warning and the call to _LT_SET_OPTION when you
+put the `dlopen' option into LT_INIT's first parameter.])
+])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_DLOPEN], [])
+
+
+# win32-dll
+# ---------
+# Declare package support for building win32 dll's.
+LT_OPTION_DEFINE([LT_INIT], [win32-dll],
+[enable_win32_dll=yes
+
+case $host in
+*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*)
+  AC_CHECK_TOOL(AS, as, false)
+  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+  AC_CHECK_TOOL(OBJDUMP, objdump, false)
+  ;;
+esac
+
+test -z "$AS" && AS=as
+_LT_DECL([], [AS],      [1], [Assembler program])dnl
+
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+_LT_DECL([], [DLLTOOL], [1], [DLL creation program])dnl
+
+test -z "$OBJDUMP" && OBJDUMP=objdump
+_LT_DECL([], [OBJDUMP], [1], [Object dumper program])dnl
+])# win32-dll
+
+AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+_LT_SET_OPTION([LT_INIT], [win32-dll])
+AC_DIAGNOSE([obsolete],
+[$0: Remove this warning and the call to _LT_SET_OPTION when you
+put the `win32-dll' option into LT_INIT's first parameter.])
+])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])
+
+
+# _LT_ENABLE_SHARED([DEFAULT])
+# ----------------------------
+# implement the --enable-shared flag, and supports the `shared' and
+# `disable-shared' LT_INIT options.
+# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+m4_define([_LT_ENABLE_SHARED],
+[m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl
+AC_ARG_ENABLE([shared],
+    [AS_HELP_STRING([--enable-shared@<:@=PKGS@:>@],
+	[build shared libraries @<:@default=]_LT_ENABLE_SHARED_DEFAULT[@:>@])],
+    [p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_shared=yes ;;
+    no) enable_shared=no ;;
+    *)
+      enable_shared=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_shared=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac],
+    [enable_shared=]_LT_ENABLE_SHARED_DEFAULT)
+
+    _LT_DECL([build_libtool_libs], [enable_shared], [0],
+	[Whether or not to build shared libraries])
+])# _LT_ENABLE_SHARED
+
+LT_OPTION_DEFINE([LT_INIT], [shared], [_LT_ENABLE_SHARED([yes])])
+LT_OPTION_DEFINE([LT_INIT], [disable-shared], [_LT_ENABLE_SHARED([no])])
+
+# Old names:
+AC_DEFUN([AC_ENABLE_SHARED],
+[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[shared])
+])
+
+AC_DEFUN([AC_DISABLE_SHARED],
+[_LT_SET_OPTION([LT_INIT], [disable-shared])
+])
+
+AU_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)])
+AU_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AM_ENABLE_SHARED], [])
+dnl AC_DEFUN([AM_DISABLE_SHARED], [])
+
+
+
+# _LT_ENABLE_STATIC([DEFAULT])
+# ----------------------------
+# implement the --enable-static flag, and support the `static' and
+# `disable-static' LT_INIT options.
+# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+m4_define([_LT_ENABLE_STATIC],
+[m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl
+AC_ARG_ENABLE([static],
+    [AS_HELP_STRING([--enable-static@<:@=PKGS@:>@],
+	[build static libraries @<:@default=]_LT_ENABLE_STATIC_DEFAULT[@:>@])],
+    [p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_static=yes ;;
+    no) enable_static=no ;;
+    *)
+     enable_static=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_static=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac],
+    [enable_static=]_LT_ENABLE_STATIC_DEFAULT)
+
+    _LT_DECL([build_old_libs], [enable_static], [0],
+	[Whether or not to build static libraries])
+])# _LT_ENABLE_STATIC
+
+LT_OPTION_DEFINE([LT_INIT], [static], [_LT_ENABLE_STATIC([yes])])
+LT_OPTION_DEFINE([LT_INIT], [disable-static], [_LT_ENABLE_STATIC([no])])
+
+# Old names:
+AC_DEFUN([AC_ENABLE_STATIC],
+[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[static])
+])
+
+AC_DEFUN([AC_DISABLE_STATIC],
+[_LT_SET_OPTION([LT_INIT], [disable-static])
+])
+
+AU_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)])
+AU_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AM_ENABLE_STATIC], [])
+dnl AC_DEFUN([AM_DISABLE_STATIC], [])
+
+
+
+# _LT_ENABLE_FAST_INSTALL([DEFAULT])
+# ----------------------------------
+# implement the --enable-fast-install flag, and support the `fast-install'
+# and `disable-fast-install' LT_INIT options.
+# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+m4_define([_LT_ENABLE_FAST_INSTALL],
+[m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl
+AC_ARG_ENABLE([fast-install],
+    [AS_HELP_STRING([--enable-fast-install@<:@=PKGS@:>@],
+    [optimize for fast installation @<:@default=]_LT_ENABLE_FAST_INSTALL_DEFAULT[@:>@])],
+    [p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_fast_install=yes ;;
+    no) enable_fast_install=no ;;
+    *)
+      enable_fast_install=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for pkg in $enableval; do
+	IFS="$lt_save_ifs"
+	if test "X$pkg" = "X$p"; then
+	  enable_fast_install=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac],
+    [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT)
+
+_LT_DECL([fast_install], [enable_fast_install], [0],
+	 [Whether or not to optimize for fast installation])dnl
+])# _LT_ENABLE_FAST_INSTALL
+
+LT_OPTION_DEFINE([LT_INIT], [fast-install], [_LT_ENABLE_FAST_INSTALL([yes])])
+LT_OPTION_DEFINE([LT_INIT], [disable-fast-install], [_LT_ENABLE_FAST_INSTALL([no])])
+
+# Old names:
+AU_DEFUN([AC_ENABLE_FAST_INSTALL],
+[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install])
+AC_DIAGNOSE([obsolete],
+[$0: Remove this warning and the call to _LT_SET_OPTION when you put
+the `fast-install' option into LT_INIT's first parameter.])
+])
+
+AU_DEFUN([AC_DISABLE_FAST_INSTALL],
+[_LT_SET_OPTION([LT_INIT], [disable-fast-install])
+AC_DIAGNOSE([obsolete],
+[$0: Remove this warning and the call to _LT_SET_OPTION when you put
+the `disable-fast-install' option into LT_INIT's first parameter.])
+])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], [])
+dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], [])
+
+
+# _LT_WITH_PIC([MODE])
+# --------------------
+# implement the --with-pic flag, and support the `pic-only' and `no-pic'
+# LT_INIT options.
+# MODE is either `yes' or `no'.  If omitted, it defaults to `both'.
+m4_define([_LT_WITH_PIC],
+[AC_ARG_WITH([pic],
+    [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@],
+	[try to use only PIC/non-PIC objects @<:@default=use both@:>@])],
+    [lt_p=${PACKAGE-default}
+    case $withval in
+    yes|no) pic_mode=$withval ;;
+    *)
+      pic_mode=default
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      for lt_pkg in $withval; do
+	IFS="$lt_save_ifs"
+	if test "X$lt_pkg" = "X$lt_p"; then
+	  pic_mode=yes
+	fi
+      done
+      IFS="$lt_save_ifs"
+      ;;
+    esac],
+    [pic_mode=default])
+
+test -z "$pic_mode" && pic_mode=m4_default([$1], [default])
+
+_LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl
+])# _LT_WITH_PIC
+
+LT_OPTION_DEFINE([LT_INIT], [pic-only], [_LT_WITH_PIC([yes])])
+LT_OPTION_DEFINE([LT_INIT], [no-pic], [_LT_WITH_PIC([no])])
+
+# Old name:
+AU_DEFUN([AC_LIBTOOL_PICMODE],
+[_LT_SET_OPTION([LT_INIT], [pic-only])
+AC_DIAGNOSE([obsolete],
+[$0: Remove this warning and the call to _LT_SET_OPTION when you
+put the `pic-only' option into LT_INIT's first parameter.])
+])
+
+dnl aclocal-1.4 backwards compatibility:
+dnl AC_DEFUN([AC_LIBTOOL_PICMODE], [])
+
+## ----------------- ##
+## LTDL_INIT Options ##
+## ----------------- ##
+
+m4_define([_LTDL_MODE], [])
+LT_OPTION_DEFINE([LTDL_INIT], [nonrecursive],
+		 [m4_define([_LTDL_MODE], [nonrecursive])])
+LT_OPTION_DEFINE([LTDL_INIT], [recursive],
+		 [m4_define([_LTDL_MODE], [recursive])])
+LT_OPTION_DEFINE([LTDL_INIT], [subproject],
+		 [m4_define([_LTDL_MODE], [subproject])])
+
+m4_define([_LTDL_TYPE], [])
+LT_OPTION_DEFINE([LTDL_INIT], [installable],
+		 [m4_define([_LTDL_TYPE], [installable])])
+LT_OPTION_DEFINE([LTDL_INIT], [convenience],
+		 [m4_define([_LTDL_TYPE], [convenience])])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ltsugar.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ltsugar.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,123 @@
+# ltsugar.m4 -- libtool m4 base layer.                         -*-Autoconf-*-
+#
+# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+# Written by Gary V. Vaughan, 2004
+#
+# This file is free software; the Free Software Foundation gives
+# unlimited permission to copy and/or distribute it, with or without
+# modifications, as long as this notice is preserved.
+
+# serial 6 ltsugar.m4
+
+# This is to help aclocal find these macros, as it can't see m4_define.
+AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])])
+
+
+# lt_join(SEP, ARG1, [ARG2...])
+# -----------------------------
+# Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their
+# associated separator.
+# Needed until we can rely on m4_join from Autoconf 2.62, since all earlier
+# versions in m4sugar had bugs.
+m4_define([lt_join],
+[m4_if([$#], [1], [],
+       [$#], [2], [[$2]],
+       [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])])
+m4_define([_lt_join],
+[m4_if([$#$2], [2], [],
+       [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])])
+
+
+# lt_car(LIST)
+# lt_cdr(LIST)
+# ------------
+# Manipulate m4 lists.
+# These macros are necessary as long as will still need to support
+# Autoconf-2.59 which quotes differently.
+m4_define([lt_car], [[$1]])
+m4_define([lt_cdr],
+[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
+       [$#], 1, [],
+       [m4_dquote(m4_shift($@))])])
+m4_define([lt_unquote], $1)
+
+
+# lt_append(MACRO-NAME, STRING, [SEPARATOR])
+# ------------------------------------------
+# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
+# Note that neither SEPARATOR nor STRING are expanded; they are appended
+# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
+# No SEPARATOR is output if MACRO-NAME was previously undefined (different
+# than defined and empty).
+#
+# This macro is needed until we can rely on Autoconf 2.62, since earlier
+# versions of m4sugar mistakenly expanded SEPARATOR but not STRING.
+m4_define([lt_append],
+[m4_define([$1],
+	   m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])])
+
+
+
+# lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...])
+# ----------------------------------------------------------
+# Produce a SEP delimited list of all paired combinations of elements of
+# PREFIX-LIST with SUFFIX1 through SUFFIXn.  Each element of the list
+# has the form PREFIXmINFIXSUFFIXn.
+# Needed until we can rely on m4_combine added in Autoconf 2.62.
+m4_define([lt_combine],
+[m4_if(m4_eval([$# > 3]), [1],
+       [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl
+[[m4_foreach([_Lt_prefix], [$2],
+	     [m4_foreach([_Lt_suffix],
+		]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[,
+	[_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])])
+
+
+# lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ])
+# -----------------------------------------------------------------------
+# Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited
+# by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ.
+m4_define([lt_if_append_uniq],
+[m4_ifdef([$1],
+	  [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1],
+		 [lt_append([$1], [$2], [$3])$4],
+		 [$5])],
+	  [lt_append([$1], [$2], [$3])$4])])
+
+
+# lt_dict_add(DICT, KEY, VALUE)
+# -----------------------------
+m4_define([lt_dict_add],
+[m4_define([$1($2)], [$3])])
+
+
+# lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE)
+# --------------------------------------------
+m4_define([lt_dict_add_subkey],
+[m4_define([$1($2:$3)], [$4])])
+
+
+# lt_dict_fetch(DICT, KEY, [SUBKEY])
+# ----------------------------------
+m4_define([lt_dict_fetch],
+[m4_ifval([$3],
+	m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]),
+    m4_ifdef([$1($2)], [m4_defn([$1($2)])]))])
+
+
+# lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE])
+# -----------------------------------------------------------------
+m4_define([lt_if_dict_fetch],
+[m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4],
+	[$5],
+    [$6])])
+
+
+# lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...])
+# --------------------------------------------------------------
+m4_define([lt_dict_filter],
+[m4_if([$5], [], [],
+  [lt_join(m4_quote(m4_default([$4], [[, ]])),
+           lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]),
+		      [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl
+])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/ltversion.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/ltversion.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+# ltversion.m4 -- version numbers			-*- Autoconf -*-
+#
+#   Copyright (C) 2004 Free Software Foundation, Inc.
+#   Written by Scott James Remnant, 2004
+#
+# This file is free software; the Free Software Foundation gives
+# unlimited permission to copy and/or distribute it, with or without
+# modifications, as long as this notice is preserved.
+
+# @configure_input@
+
+# serial 3337 ltversion.m4
+# This file is part of GNU Libtool
+
+m4_define([LT_PACKAGE_VERSION], [2.4.2])
+m4_define([LT_PACKAGE_REVISION], [1.3337])
+
+AC_DEFUN([LTVERSION_VERSION],
+[macro_version='2.4.2'
+macro_revision='1.3337'
+_LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
+_LT_DECL(, macro_revision, 0)
+])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/m4/lt~obsolete.m4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/m4/lt~obsolete.m4	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+# lt~obsolete.m4 -- aclocal satisfying obsolete definitions.    -*-Autoconf-*-
+#
+#   Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
+#   Written by Scott James Remnant, 2004.
+#
+# This file is free software; the Free Software Foundation gives
+# unlimited permission to copy and/or distribute it, with or without
+# modifications, as long as this notice is preserved.
+
+# serial 5 lt~obsolete.m4
+
+# These exist entirely to fool aclocal when bootstrapping libtool.
+#
+# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
+# which have later been changed to m4_define as they aren't part of the
+# exported API, or moved to Autoconf or Automake where they belong.
+#
+# The trouble is, aclocal is a bit thick.  It'll see the old AC_DEFUN
+# in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us
+# using a macro with the same name in our local m4/libtool.m4 it'll
+# pull the old libtool.m4 in (it doesn't see our shiny new m4_define
+# and doesn't know about Autoconf macros at all.)
+#
+# So we provide this file, which has a silly filename so it's always
+# included after everything else.  This provides aclocal with the
+# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
+# because those macros already exist, or will be overwritten later.
+# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 
+#
+# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
+# Yes, that means every name once taken will need to remain here until
+# we give up compatibility with versions before 1.7, at which point
+# we need to keep only those names which we still refer to.
+
+# This is to help aclocal find these macros, as it can't see m4_define.
+AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])])
+
+m4_ifndef([AC_LIBTOOL_LINKER_OPTION],	[AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])])
+m4_ifndef([AC_PROG_EGREP],		[AC_DEFUN([AC_PROG_EGREP])])
+m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])])
+m4_ifndef([_LT_AC_SHELL_INIT],		[AC_DEFUN([_LT_AC_SHELL_INIT])])
+m4_ifndef([_LT_AC_SYS_LIBPATH_AIX],	[AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])])
+m4_ifndef([_LT_PROG_LTMAIN],		[AC_DEFUN([_LT_PROG_LTMAIN])])
+m4_ifndef([_LT_AC_TAGVAR],		[AC_DEFUN([_LT_AC_TAGVAR])])
+m4_ifndef([AC_LTDL_ENABLE_INSTALL],	[AC_DEFUN([AC_LTDL_ENABLE_INSTALL])])
+m4_ifndef([AC_LTDL_PREOPEN],		[AC_DEFUN([AC_LTDL_PREOPEN])])
+m4_ifndef([_LT_AC_SYS_COMPILER],	[AC_DEFUN([_LT_AC_SYS_COMPILER])])
+m4_ifndef([_LT_AC_LOCK],		[AC_DEFUN([_LT_AC_LOCK])])
+m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE],	[AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])])
+m4_ifndef([_LT_AC_TRY_DLOPEN_SELF],	[AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])])
+m4_ifndef([AC_LIBTOOL_PROG_CC_C_O],	[AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])])
+m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])])
+m4_ifndef([AC_LIBTOOL_OBJDIR],		[AC_DEFUN([AC_LIBTOOL_OBJDIR])])
+m4_ifndef([AC_LTDL_OBJDIR],		[AC_DEFUN([AC_LTDL_OBJDIR])])
+m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])])
+m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP],	[AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])])
+m4_ifndef([AC_PATH_MAGIC],		[AC_DEFUN([AC_PATH_MAGIC])])
+m4_ifndef([AC_PROG_LD_GNU],		[AC_DEFUN([AC_PROG_LD_GNU])])
+m4_ifndef([AC_PROG_LD_RELOAD_FLAG],	[AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])])
+m4_ifndef([AC_DEPLIBS_CHECK_METHOD],	[AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])])
+m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])])
+m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])])
+m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])])
+m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS],	[AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])])
+m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP],	[AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])])
+m4_ifndef([LT_AC_PROG_EGREP],		[AC_DEFUN([LT_AC_PROG_EGREP])])
+m4_ifndef([LT_AC_PROG_SED],		[AC_DEFUN([LT_AC_PROG_SED])])
+m4_ifndef([_LT_CC_BASENAME],		[AC_DEFUN([_LT_CC_BASENAME])])
+m4_ifndef([_LT_COMPILER_BOILERPLATE],	[AC_DEFUN([_LT_COMPILER_BOILERPLATE])])
+m4_ifndef([_LT_LINKER_BOILERPLATE],	[AC_DEFUN([_LT_LINKER_BOILERPLATE])])
+m4_ifndef([_AC_PROG_LIBTOOL],		[AC_DEFUN([_AC_PROG_LIBTOOL])])
+m4_ifndef([AC_LIBTOOL_SETUP],		[AC_DEFUN([AC_LIBTOOL_SETUP])])
+m4_ifndef([_LT_AC_CHECK_DLFCN],		[AC_DEFUN([_LT_AC_CHECK_DLFCN])])
+m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER],	[AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])])
+m4_ifndef([_LT_AC_TAGCONFIG],		[AC_DEFUN([_LT_AC_TAGCONFIG])])
+m4_ifndef([AC_DISABLE_FAST_INSTALL],	[AC_DEFUN([AC_DISABLE_FAST_INSTALL])])
+m4_ifndef([_LT_AC_LANG_CXX],		[AC_DEFUN([_LT_AC_LANG_CXX])])
+m4_ifndef([_LT_AC_LANG_F77],		[AC_DEFUN([_LT_AC_LANG_F77])])
+m4_ifndef([_LT_AC_LANG_GCJ],		[AC_DEFUN([_LT_AC_LANG_GCJ])])
+m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])])
+m4_ifndef([_LT_AC_LANG_C_CONFIG],	[AC_DEFUN([_LT_AC_LANG_C_CONFIG])])
+m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])])
+m4_ifndef([_LT_AC_LANG_CXX_CONFIG],	[AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])])
+m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])])
+m4_ifndef([_LT_AC_LANG_F77_CONFIG],	[AC_DEFUN([_LT_AC_LANG_F77_CONFIG])])
+m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])])
+m4_ifndef([_LT_AC_LANG_GCJ_CONFIG],	[AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])])
+m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])])
+m4_ifndef([_LT_AC_LANG_RC_CONFIG],	[AC_DEFUN([_LT_AC_LANG_RC_CONFIG])])
+m4_ifndef([AC_LIBTOOL_CONFIG],		[AC_DEFUN([AC_LIBTOOL_CONFIG])])
+m4_ifndef([_LT_AC_FILE_LTDLL_C],	[AC_DEFUN([_LT_AC_FILE_LTDLL_C])])
+m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS],	[AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])])
+m4_ifndef([_LT_AC_PROG_CXXCPP],		[AC_DEFUN([_LT_AC_PROG_CXXCPP])])
+m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS],	[AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])])
+m4_ifndef([_LT_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])])
+m4_ifndef([_LT_PROG_F77],		[AC_DEFUN([_LT_PROG_F77])])
+m4_ifndef([_LT_PROG_FC],		[AC_DEFUN([_LT_PROG_FC])])
+m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/missing
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/missing	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,331 @@
+#! /bin/sh
+# Common stub for a few missing GNU programs while installing.
+
+scriptversion=2012-01-06.13; # UTC
+
+# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006,
+# 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+if test $# -eq 0; then
+  echo 1>&2 "Try \`$0 --help' for more information"
+  exit 1
+fi
+
+run=:
+sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
+sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
+
+# In the cases where this matters, `missing' is being run in the
+# srcdir already.
+if test -f configure.ac; then
+  configure_ac=configure.ac
+else
+  configure_ac=configure.in
+fi
+
+msg="missing on your system"
+
+case $1 in
+--run)
+  # Try to run requested program, and just exit if it succeeds.
+  run=
+  shift
+  "$@" && exit 0
+  # Exit code 63 means version mismatch.  This often happens
+  # when the user try to use an ancient version of a tool on
+  # a file that requires a minimum version.  In this case we
+  # we should proceed has if the program had been absent, or
+  # if --run hadn't been passed.
+  if test $? = 63; then
+    run=:
+    msg="probably too old"
+  fi
+  ;;
+
+  -h|--h|--he|--hel|--help)
+    echo "\
+$0 [OPTION]... PROGRAM [ARGUMENT]...
+
+Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
+error status if there is no known handling for PROGRAM.
+
+Options:
+  -h, --help      display this help and exit
+  -v, --version   output version information and exit
+  --run           try to run the given command, and emulate it if it fails
+
+Supported PROGRAM values:
+  aclocal      touch file \`aclocal.m4'
+  autoconf     touch file \`configure'
+  autoheader   touch file \`config.h.in'
+  autom4te     touch the output file, or create a stub one
+  automake     touch all \`Makefile.in' files
+  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
+  flex         create \`lex.yy.c', if possible, from existing .c
+  help2man     touch the output file
+  lex          create \`lex.yy.c', if possible, from existing .c
+  makeinfo     touch the output file
+  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
+
+Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and
+\`g' are ignored when checking the name.
+
+Send bug reports to <bug-automake@gnu.org>."
+    exit $?
+    ;;
+
+  -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
+    echo "missing $scriptversion (GNU Automake)"
+    exit $?
+    ;;
+
+  -*)
+    echo 1>&2 "$0: Unknown \`$1' option"
+    echo 1>&2 "Try \`$0 --help' for more information"
+    exit 1
+    ;;
+
+esac
+
+# normalize program name to check for.
+program=`echo "$1" | sed '
+  s/^gnu-//; t
+  s/^gnu//; t
+  s/^g//; t'`
+
+# Now exit if we have it, but it failed.  Also exit now if we
+# don't have it and --version was passed (most likely to detect
+# the program).  This is about non-GNU programs, so use $1 not
+# $program.
+case $1 in
+  lex*|yacc*)
+    # Not GNU programs, they don't have --version.
+    ;;
+
+  *)
+    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
+       # We have it, but it failed.
+       exit 1
+    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
+       # Could not run --version or --help.  This is probably someone
+       # running `$TOOL --version' or `$TOOL --help' to check whether
+       # $TOOL exists and not knowing $TOOL uses missing.
+       exit 1
+    fi
+    ;;
+esac
+
+# If it does not exist, or fails to run (possibly an outdated version),
+# try to emulate it.
+case $program in
+  aclocal*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
+         to install the \`Automake' and \`Perl' packages.  Grab them from
+         any GNU archive site."
+    touch aclocal.m4
+    ;;
+
+  autoconf*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`${configure_ac}'.  You might want to install the
+         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
+         archive site."
+    touch configure
+    ;;
+
+  autoheader*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
+         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
+         from any GNU archive site."
+    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
+    test -z "$files" && files="config.h"
+    touch_files=
+    for f in $files; do
+      case $f in
+      *:*) touch_files="$touch_files "`echo "$f" |
+				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
+      *) touch_files="$touch_files $f.in";;
+      esac
+    done
+    touch $touch_files
+    ;;
+
+  automake*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
+         You might want to install the \`Automake' and \`Perl' packages.
+         Grab them from any GNU archive site."
+    find . -type f -name Makefile.am -print |
+	   sed 's/\.am$/.in/' |
+	   while read f; do touch "$f"; done
+    ;;
+
+  autom4te*)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, but is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.
+         You can get \`$1' as part of \`Autoconf' from any GNU
+         archive site."
+
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch $file
+    else
+	test -z "$file" || exec >$file
+	echo "#! /bin/sh"
+	echo "# Created by GNU Automake missing as a replacement of"
+	echo "#  $ $@"
+	echo "exit 0"
+	chmod +x $file
+	exit 1
+    fi
+    ;;
+
+  bison*|yacc*)
+    echo 1>&2 "\
+WARNING: \`$1' $msg.  You should only need it if
+         you modified a \`.y' file.  You may need the \`Bison' package
+         in order for those modifications to take effect.  You can get
+         \`Bison' from any GNU archive site."
+    rm -f y.tab.c y.tab.h
+    if test $# -ne 1; then
+        eval LASTARG=\${$#}
+	case $LASTARG in
+	*.y)
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.c
+	    fi
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.h
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f y.tab.h; then
+	echo >y.tab.h
+    fi
+    if test ! -f y.tab.c; then
+	echo 'main() { return 0; }' >y.tab.c
+    fi
+    ;;
+
+  lex*|flex*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.l' file.  You may need the \`Flex' package
+         in order for those modifications to take effect.  You can get
+         \`Flex' from any GNU archive site."
+    rm -f lex.yy.c
+    if test $# -ne 1; then
+        eval LASTARG=\${$#}
+	case $LASTARG in
+	*.l)
+	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" lex.yy.c
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f lex.yy.c; then
+	echo 'main() { return 0; }' >lex.yy.c
+    fi
+    ;;
+
+  help2man*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+	 you modified a dependency of a manual page.  You may need the
+	 \`Help2man' package in order for those modifications to take
+	 effect.  You can get \`Help2man' from any GNU archive site."
+
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch $file
+    else
+	test -z "$file" || exec >$file
+	echo ".ab help2man is required to generate this page"
+	exit $?
+    fi
+    ;;
+
+  makeinfo*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.texi' or \`.texinfo' file, or any other file
+         indirectly affecting the aspect of the manual.  The spurious
+         call might also be the consequence of using a buggy \`make' (AIX,
+         DU, IRIX).  You might want to install the \`Texinfo' package or
+         the \`GNU make' package.  Grab either from any GNU archive site."
+    # The file to touch is that specified with -o ...
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -z "$file"; then
+      # ... or it is the one specified with @setfilename ...
+      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
+      file=`sed -n '
+	/^@setfilename/{
+	  s/.* \([^ ]*\) *$/\1/
+	  p
+	  q
+	}' $infile`
+      # ... or it is derived from the source name (dir/f.texi becomes f.info)
+      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
+    fi
+    # If the file does not exist, the user really needs makeinfo;
+    # let's fail without touching anything.
+    test -f $file || exit 1
+    touch $file
+    ;;
+
+  *)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, and is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.  Check the \`README' file,
+         it often tells you about the needed prerequisites for installing
+         this package.  You may also peek at any GNU archive site, in case
+         some other package would contain this missing \`$1' program."
+    exit 1
+    ;;
+esac
+
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,100 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/api -I$(top_srcdir)/tests	\
+-I$(top_srcdir)/libbench2
+
+if MPI
+lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_mpi.la
+include_HEADERS = fftw3-mpi.h
+nodist_include_HEADERS = fftw3-mpi.f03 fftw3l-mpi.f03
+noinst_PROGRAMS = mpi-bench
+endif
+
+CC=@MPICC@
+
+EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
+BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
+CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
+
+TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
+DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
+RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
+RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
+SRC = any-true.c api.c block.c choose-radix.c conf.c dtensor.c fftw3-mpi.h ifftw-mpi.h rearrange.c wisdom-api.c f03-wrap.c
+
+libfftw3@PREC_SUFFIX@_mpi_la_SOURCES = $(SRC) $(TRANSPOSE_SRC) $(DFT_SRC) $(RDFT_SRC) $(RDFT2_SRC)
+
+libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+libfftw3@PREC_SUFFIX@_mpi_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la @MPILIBS@
+
+if THREADS
+mpi_bench_CFLAGS = $(PTHREAD_CFLAGS)
+if !COMBINED_THREADS
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+endif
+else
+if OPENMP
+mpi_bench_CFLAGS = $(OPENMP_CFLAGS)
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+endif
+endif
+
+mpi_bench_SOURCES = mpi-bench.c $(top_srcdir)/tests/fftw-bench.c $(top_srcdir)/tests/hook.c
+mpi_bench_LDADD = libfftw3@PREC_SUFFIX@_mpi.la $(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la $(top_builddir)/libbench2/libbench2.a $(MPILIBS) $(THREADLIBS)
+
+CHECK = $(top_srcdir)/tests/check.pl
+NUMCHECK=10
+CHECKSIZE=10000
+CHECKOPTS = --verbose --random --maxsize=$(CHECKSIZE) -c=$(NUMCHECK) $(CHECK_PL_OPTS)
+
+if MPI
+
+check-local: mpi-bench$(EXEEXT)
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 1 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "     MPI FFTW transforms passed "$(NUMCHECK)" tests, 1 CPU"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 2 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 2 CPUs"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 3 CPUs"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 4 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 4 CPUs"
+	@echo "--------------------------------------------------------------"
+if SMP
+	perl -w $(CHECK) $(CHECKOPTS) --mpi --nthreads=2 "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW threaded transforms passed "$(NUMCHECK)" tests!"
+	@echo "--------------------------------------------------------------"
+endif
+
+bigcheck: mpi-bench$(EXEEXT)
+	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=100 CHECKSIZE=60000 check-local
+
+smallcheck: mpi-bench$(EXEEXT)
+	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=2 check-local
+
+endif
+
+fftw3-mpi.f03: fftw3-mpi.f03.in
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3-mpi.f03.in > $@
+
+fftw3l-mpi.f03: fftw3l-mpi.f03.in
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3l-mpi.f03.in > $@
+
+if MAINTAINER_MODE
+
+fftw3-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+	sh $(srcdir)/f03api.sh d f > $@
+
+fftw3l-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+	sh $(srcdir)/f03api.sh l | grep -v parameter | sed 's/fftw3.f03/fftw3l.f03/' > $@
+
+f03-wrap.c: fftw3-mpi.h f03-wrap.sh genf03-wrap.pl
+	sh $(srcdir)/f03-wrap.sh > $@
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,838 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+@MPI_TRUE@noinst_PROGRAMS = mpi-bench$(EXEEXT)
+subdir = mpi
+DIST_COMMON = $(am__include_HEADERS_DIST) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" \
+	"$(DESTDIR)$(includedir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libfftw3@PREC_SUFFIX@_mpi_la_DEPENDENCIES =  \
+	../libfftw3@PREC_SUFFIX@.la
+am__objects_1 = any-true.lo api.lo block.lo choose-radix.lo conf.lo \
+	dtensor.lo rearrange.lo wisdom-api.lo f03-wrap.lo
+am__objects_2 = transpose-alltoall.lo transpose-pairwise.lo \
+	transpose-recurse.lo transpose-problem.lo transpose-solve.lo
+am__objects_3 = dft-serial.lo dft-rank-geq2.lo \
+	dft-rank-geq2-transposed.lo dft-rank1.lo dft-rank1-bigvec.lo \
+	dft-problem.lo dft-solve.lo
+am__objects_4 = rdft-serial.lo rdft-rank-geq2.lo \
+	rdft-rank-geq2-transposed.lo rdft-rank1-bigvec.lo \
+	rdft-problem.lo rdft-solve.lo
+am__objects_5 = rdft2-serial.lo rdft2-rank-geq2.lo \
+	rdft2-rank-geq2-transposed.lo rdft2-problem.lo rdft2-solve.lo
+am_libfftw3@PREC_SUFFIX@_mpi_la_OBJECTS = $(am__objects_1) \
+	$(am__objects_2) $(am__objects_3) $(am__objects_4) \
+	$(am__objects_5)
+libfftw3@PREC_SUFFIX@_mpi_la_OBJECTS =  \
+	$(am_libfftw3@PREC_SUFFIX@_mpi_la_OBJECTS)
+libfftw3@PREC_SUFFIX@_mpi_la_LINK = $(LIBTOOL) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(AM_CFLAGS) $(CFLAGS) $(libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+@MPI_TRUE@am_libfftw3@PREC_SUFFIX@_mpi_la_rpath = -rpath $(libdir)
+PROGRAMS = $(noinst_PROGRAMS)
+am_mpi_bench_OBJECTS = mpi_bench-mpi-bench.$(OBJEXT) \
+	mpi_bench-fftw-bench.$(OBJEXT) mpi_bench-hook.$(OBJEXT)
+mpi_bench_OBJECTS = $(am_mpi_bench_OBJECTS)
+am__DEPENDENCIES_1 =
+mpi_bench_DEPENDENCIES = libfftw3@PREC_SUFFIX@_mpi.la \
+	$(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la \
+	$(top_builddir)/libbench2/libbench2.a $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
+mpi_bench_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(mpi_bench_CFLAGS) \
+	$(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libfftw3@PREC_SUFFIX@_mpi_la_SOURCES) $(mpi_bench_SOURCES)
+DIST_SOURCES = $(libfftw3@PREC_SUFFIX@_mpi_la_SOURCES) \
+	$(mpi_bench_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__include_HEADERS_DIST = fftw3-mpi.h
+HEADERS = $(include_HEADERS) $(nodist_include_HEADERS)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @MPICC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/api -I$(top_srcdir)/tests	\
+-I$(top_srcdir)/libbench2
+
+@MPI_TRUE@lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_mpi.la
+@MPI_TRUE@include_HEADERS = fftw3-mpi.h
+@MPI_TRUE@nodist_include_HEADERS = fftw3-mpi.f03 fftw3l-mpi.f03
+EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
+BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
+CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
+TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
+DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
+RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
+RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
+SRC = any-true.c api.c block.c choose-radix.c conf.c dtensor.c fftw3-mpi.h ifftw-mpi.h rearrange.c wisdom-api.c f03-wrap.c
+libfftw3@PREC_SUFFIX@_mpi_la_SOURCES = $(SRC) $(TRANSPOSE_SRC) $(DFT_SRC) $(RDFT_SRC) $(RDFT2_SRC)
+libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+libfftw3@PREC_SUFFIX@_mpi_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la @MPILIBS@
+@OPENMP_TRUE@@THREADS_FALSE@mpi_bench_CFLAGS = $(OPENMP_CFLAGS)
+@THREADS_TRUE@mpi_bench_CFLAGS = $(PTHREAD_CFLAGS)
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+@OPENMP_TRUE@@THREADS_FALSE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+mpi_bench_SOURCES = mpi-bench.c $(top_srcdir)/tests/fftw-bench.c $(top_srcdir)/tests/hook.c
+mpi_bench_LDADD = libfftw3@PREC_SUFFIX@_mpi.la $(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la $(top_builddir)/libbench2/libbench2.a $(MPILIBS) $(THREADLIBS)
+CHECK = $(top_srcdir)/tests/check.pl
+NUMCHECK = 10
+CHECKSIZE = 10000
+CHECKOPTS = --verbose --random --maxsize=$(CHECKSIZE) -c=$(NUMCHECK) $(CHECK_PL_OPTS)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu mpi/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu mpi/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libfftw3@PREC_SUFFIX@_mpi.la: $(libfftw3@PREC_SUFFIX@_mpi_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_mpi_la_DEPENDENCIES) $(EXTRA_libfftw3@PREC_SUFFIX@_mpi_la_DEPENDENCIES) 
+	$(libfftw3@PREC_SUFFIX@_mpi_la_LINK) $(am_libfftw3@PREC_SUFFIX@_mpi_la_rpath) $(libfftw3@PREC_SUFFIX@_mpi_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_mpi_la_LIBADD) $(LIBS)
+
+clean-noinstPROGRAMS:
+	@list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \
+	echo " rm -f" $$list; \
+	rm -f $$list || exit $$?; \
+	test -n "$(EXEEXT)" || exit 0; \
+	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
+	echo " rm -f" $$list; \
+	rm -f $$list
+mpi-bench$(EXEEXT): $(mpi_bench_OBJECTS) $(mpi_bench_DEPENDENCIES) $(EXTRA_mpi_bench_DEPENDENCIES) 
+	@rm -f mpi-bench$(EXEEXT)
+	$(mpi_bench_LINK) $(mpi_bench_OBJECTS) $(mpi_bench_LDADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/any-true.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/block.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/choose-radix.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-rank-geq2-transposed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-rank-geq2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-rank1-bigvec.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-rank1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-serial.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dtensor.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/f03-wrap.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpi_bench-fftw-bench.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpi_bench-hook.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpi_bench-mpi-bench.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-rank-geq2-transposed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-rank-geq2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-rank1-bigvec.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-serial.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-rank-geq2-transposed.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-rank-geq2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-serial.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rearrange.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-alltoall.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-pairwise.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-recurse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wisdom-api.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mpi_bench-mpi-bench.o: mpi-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-mpi-bench.o -MD -MP -MF $(DEPDIR)/mpi_bench-mpi-bench.Tpo -c -o mpi_bench-mpi-bench.o `test -f 'mpi-bench.c' || echo '$(srcdir)/'`mpi-bench.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-mpi-bench.Tpo $(DEPDIR)/mpi_bench-mpi-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='mpi-bench.c' object='mpi_bench-mpi-bench.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-mpi-bench.o `test -f 'mpi-bench.c' || echo '$(srcdir)/'`mpi-bench.c
+
+mpi_bench-mpi-bench.obj: mpi-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-mpi-bench.obj -MD -MP -MF $(DEPDIR)/mpi_bench-mpi-bench.Tpo -c -o mpi_bench-mpi-bench.obj `if test -f 'mpi-bench.c'; then $(CYGPATH_W) 'mpi-bench.c'; else $(CYGPATH_W) '$(srcdir)/mpi-bench.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-mpi-bench.Tpo $(DEPDIR)/mpi_bench-mpi-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='mpi-bench.c' object='mpi_bench-mpi-bench.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-mpi-bench.obj `if test -f 'mpi-bench.c'; then $(CYGPATH_W) 'mpi-bench.c'; else $(CYGPATH_W) '$(srcdir)/mpi-bench.c'; fi`
+
+mpi_bench-fftw-bench.o: $(top_srcdir)/tests/fftw-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-fftw-bench.o -MD -MP -MF $(DEPDIR)/mpi_bench-fftw-bench.Tpo -c -o mpi_bench-fftw-bench.o `test -f '$(top_srcdir)/tests/fftw-bench.c' || echo '$(srcdir)/'`$(top_srcdir)/tests/fftw-bench.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-fftw-bench.Tpo $(DEPDIR)/mpi_bench-fftw-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(top_srcdir)/tests/fftw-bench.c' object='mpi_bench-fftw-bench.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-fftw-bench.o `test -f '$(top_srcdir)/tests/fftw-bench.c' || echo '$(srcdir)/'`$(top_srcdir)/tests/fftw-bench.c
+
+mpi_bench-fftw-bench.obj: $(top_srcdir)/tests/fftw-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-fftw-bench.obj -MD -MP -MF $(DEPDIR)/mpi_bench-fftw-bench.Tpo -c -o mpi_bench-fftw-bench.obj `if test -f '$(top_srcdir)/tests/fftw-bench.c'; then $(CYGPATH_W) '$(top_srcdir)/tests/fftw-bench.c'; else $(CYGPATH_W) '$(srcdir)/$(top_srcdir)/tests/fftw-bench.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-fftw-bench.Tpo $(DEPDIR)/mpi_bench-fftw-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(top_srcdir)/tests/fftw-bench.c' object='mpi_bench-fftw-bench.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-fftw-bench.obj `if test -f '$(top_srcdir)/tests/fftw-bench.c'; then $(CYGPATH_W) '$(top_srcdir)/tests/fftw-bench.c'; else $(CYGPATH_W) '$(srcdir)/$(top_srcdir)/tests/fftw-bench.c'; fi`
+
+mpi_bench-hook.o: $(top_srcdir)/tests/hook.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-hook.o -MD -MP -MF $(DEPDIR)/mpi_bench-hook.Tpo -c -o mpi_bench-hook.o `test -f '$(top_srcdir)/tests/hook.c' || echo '$(srcdir)/'`$(top_srcdir)/tests/hook.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-hook.Tpo $(DEPDIR)/mpi_bench-hook.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(top_srcdir)/tests/hook.c' object='mpi_bench-hook.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-hook.o `test -f '$(top_srcdir)/tests/hook.c' || echo '$(srcdir)/'`$(top_srcdir)/tests/hook.c
+
+mpi_bench-hook.obj: $(top_srcdir)/tests/hook.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -MT mpi_bench-hook.obj -MD -MP -MF $(DEPDIR)/mpi_bench-hook.Tpo -c -o mpi_bench-hook.obj `if test -f '$(top_srcdir)/tests/hook.c'; then $(CYGPATH_W) '$(top_srcdir)/tests/hook.c'; else $(CYGPATH_W) '$(srcdir)/$(top_srcdir)/tests/hook.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mpi_bench-hook.Tpo $(DEPDIR)/mpi_bench-hook.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(top_srcdir)/tests/hook.c' object='mpi_bench-hook.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(mpi_bench_CFLAGS) $(CFLAGS) -c -o mpi_bench-hook.obj `if test -f '$(top_srcdir)/tests/hook.c'; then $(CYGPATH_W) '$(top_srcdir)/tests/hook.c'; else $(CYGPATH_W) '$(srcdir)/$(top_srcdir)/tests/hook.c'; fi`
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+install-includeHEADERS: $(include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+install-nodist_includeHEADERS: $(nodist_include_HEADERS)
+	@$(NORMAL_INSTALL)
+	@list='$(nodist_include_HEADERS)'; test -n "$(includedir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(includedir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(includedir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(includedir)'"; \
+	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(includedir)" || exit $$?; \
+	done
+
+uninstall-nodist_includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(nodist_include_HEADERS)'; test -n "$(includedir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+@MPI_FALSE@check-local:
+check-am: all-am
+	$(MAKE) $(AM_MAKEFLAGS) check-local
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(HEADERS)
+installdirs:
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" "$(DESTDIR)$(includedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	clean-noinstPROGRAMS mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS install-nodist_includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-libLTLIBRARIES \
+	uninstall-nodist_includeHEADERS
+
+.MAKE: all check check-am install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am check-local clean \
+	clean-generic clean-libLTLIBRARIES clean-libtool \
+	clean-noinstPROGRAMS ctags distclean distclean-compile \
+	distclean-generic distclean-libtool distclean-tags distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-includeHEADERS install-info install-info-am \
+	install-libLTLIBRARIES install-man \
+	install-nodist_includeHEADERS install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am uninstall-includeHEADERS \
+	uninstall-libLTLIBRARIES uninstall-nodist_includeHEADERS
+
+
+@MPI_TRUE@check-local: mpi-bench$(EXEEXT)
+@MPI_TRUE@	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 1 `pwd`/mpi-bench"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	@echo "     MPI FFTW transforms passed "$(NUMCHECK)" tests, 1 CPU"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 2 `pwd`/mpi-bench"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 2 CPUs"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 3 CPUs"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 4 `pwd`/mpi-bench"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 4 CPUs"
+@MPI_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@@SMP_TRUE@	perl -w $(CHECK) $(CHECKOPTS) --mpi --nthreads=2 "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+@MPI_TRUE@@SMP_TRUE@	@echo "--------------------------------------------------------------"
+@MPI_TRUE@@SMP_TRUE@	@echo "      MPI FFTW threaded transforms passed "$(NUMCHECK)" tests!"
+@MPI_TRUE@@SMP_TRUE@	@echo "--------------------------------------------------------------"
+
+@MPI_TRUE@bigcheck: mpi-bench$(EXEEXT)
+@MPI_TRUE@	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=100 CHECKSIZE=60000 check-local
+
+@MPI_TRUE@smallcheck: mpi-bench$(EXEEXT)
+@MPI_TRUE@	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=2 check-local
+
+fftw3-mpi.f03: fftw3-mpi.f03.in
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3-mpi.f03.in > $@
+
+fftw3l-mpi.f03: fftw3l-mpi.f03.in
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3l-mpi.f03.in > $@
+
+@MAINTAINER_MODE_TRUE@fftw3-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03api.sh d f > $@
+
+@MAINTAINER_MODE_TRUE@fftw3l-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03api.sh l | grep -v parameter | sed 's/fftw3.f03/fftw3l.f03/' > $@
+
+@MAINTAINER_MODE_TRUE@f03-wrap.c: fftw3-mpi.h f03-wrap.sh genf03-wrap.pl
+@MAINTAINER_MODE_TRUE@	sh $(srcdir)/f03-wrap.sh > $@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/any-true.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/any-true.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* During planning, if any process fails to create a plan then
+   all of the processes must fail.  This synchronization is implemented
+   by the following routine.
+
+   Instead of 
+        if (failure) goto nada;
+   we instead do:
+        if (any_true(failure, comm)) goto nada;
+*/
+
+int XM(any_true)(int condition, MPI_Comm comm)
+{
+     int result;
+     MPI_Allreduce(&condition, &result, 1, MPI_INT, MPI_LOR, comm);
+     return result;
+}
+
+/***********************************************************************/
+
+#if defined(FFTW_DEBUG)
+/* for debugging, we include an assertion to make sure that
+   MPI problems all produce equal hashes, as checked by this routine: */
+
+int XM(md5_equal)(md5 m, MPI_Comm comm)
+{
+     unsigned long s0[4];
+     int i, eq_me, eq_all;
+
+     X(md5end)(&m);
+     for (i = 0; i < 4; ++i) s0[i] = m.s[i];
+     MPI_Bcast(s0, 4, MPI_UNSIGNED_LONG, 0, comm);
+     for (i = 0; i < 4 && s0[i] == m.s[i]; ++i) ;
+     eq_me = i == 4;
+     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
+     return eq_all;
+}
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/api.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/api.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+#include "mpi-transpose.h"
+#include "mpi-dft.h"
+#include "mpi-rdft.h"
+#include "mpi-rdft2.h"
+
+/* Convert API flags to internal MPI flags. */
+#define MPI_FLAGS(f) ((f) >> 27)
+
+/*************************************************************************/
+
+static int mpi_inited = 0;
+
+static MPI_Comm problem_comm(const problem *p) {
+     switch (p->adt->problem_kind) {
+	 case PROBLEM_MPI_DFT:
+	      return ((const problem_mpi_dft *) p)->comm;
+	 case PROBLEM_MPI_RDFT:
+	      return ((const problem_mpi_rdft *) p)->comm;
+	 case PROBLEM_MPI_RDFT2:
+	      return ((const problem_mpi_rdft2 *) p)->comm;
+	 case PROBLEM_MPI_TRANSPOSE:
+	      return ((const problem_mpi_transpose *) p)->comm;
+	 default:
+	      return MPI_COMM_NULL;
+     }
+}
+
+/* used to synchronize cost measurements (timing or estimation)
+   across all processes for an MPI problem, which is critical to
+   ensure that all processes decide to use the same MPI plans
+   (whereas serial plans need not be syncronized). */
+static double cost_hook(const problem *p, double t, cost_kind k)
+{
+     MPI_Comm comm = problem_comm(p);
+     double tsum;
+     if (comm == MPI_COMM_NULL) return t;
+     MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE, 
+		   k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
+     return tsum;
+}
+
+/* Used to reject wisdom that is not in sync across all processes
+   for an MPI problem, which is critical to ensure that all processes
+   decide to use the same MPI plans.  (Even though costs are synchronized,
+   above, out-of-sync wisdom may result from plans being produced
+   by communicators that do not span all processes, either from a
+   user-specified communicator or e.g. from transpose-recurse. */
+static int wisdom_ok_hook(const problem *p, flags_t flags)
+{
+     MPI_Comm comm = problem_comm(p);
+     int eq_me, eq_all;
+     /* unpack flags bitfield, since MPI communications may involve
+	byte-order changes and MPI cannot do this for bit fields */
+#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
+     unsigned int f[5];
+#else
+     unsigned long f[5]; /* at least 32 bits as per C standard */
+#endif
+
+     if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */
+
+     if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */
+
+     /* otherwise, check that the flags and solver index are identical
+	on all processes in this problem's communicator.
+
+	TO DO: possibly we can relax strict equality, but it is
+	critical to ensure that any flags which affect what plan is
+	created (and whether the solver is applicable) are the same,
+	e.g. DESTROY_INPUT, NO_UGLY, etcetera.  (If the MPI algorithm
+	differs between processes, deadlocks/crashes generally result.) */
+     f[0] = flags.l;
+     f[1] = flags.hash_info;
+     f[2] = flags.timelimit_impatience;
+     f[3] = flags.u;
+     f[4] = flags.slvndx;
+     MPI_Bcast(f, 5, 
+	       SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
+	       0, comm);
+     eq_me = f[0] == flags.l && f[1] == flags.hash_info
+	  && f[2] == flags.timelimit_impatience
+	  && f[3] == flags.u && f[4] == flags.slvndx;
+     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
+     return eq_all;
+}
+
+/* This hook is called when wisdom is not found.  The any_true here
+   matches up with the any_true in wisdom_ok_hook, in order to handle
+   the case where some processes had wisdom (and called wisdom_ok_hook)
+   and some processes didn't have wisdom (and called nowisdom_hook). */
+static void nowisdom_hook(const problem *p)
+{
+     MPI_Comm comm = problem_comm(p);
+     if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
+     XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
+}
+
+/* needed to synchronize planner bogosity flag, in case non-MPI problems
+   on a subset of processes encountered bogus wisdom */
+static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
+{
+     MPI_Comm comm = problem_comm(p);
+     if (comm != MPI_COMM_NULL /* an MPI problem */
+	 && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
+	  return WISDOM_IS_BOGUS;
+     return state;
+}
+
+void XM(init)(void)
+{
+     if (!mpi_inited) {
+	  planner *plnr = X(the_planner)();
+	  plnr->cost_hook = cost_hook;
+	  plnr->wisdom_ok_hook = wisdom_ok_hook;
+	  plnr->nowisdom_hook = nowisdom_hook;
+	  plnr->bogosity_hook = bogosity_hook;
+          XM(conf_standard)(plnr);
+	  mpi_inited = 1;	  
+     }
+}
+
+void XM(cleanup)(void)
+{
+     X(cleanup)();
+     mpi_inited = 0;
+}
+
+/*************************************************************************/
+
+static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
+{
+     dtensor *x = XM(mkdtensor)(rnk);
+     int i;
+     for (i = 0; i < rnk; ++i) {
+	  x->dims[i].n = dims0[i].n;
+	  x->dims[i].b[IB] = dims0[i].ib;
+	  x->dims[i].b[OB] = dims0[i].ob;
+     }
+     return x;
+}
+
+static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
+			   int rdft2)
+{
+     dtensor *sz = XM(mkdtensor)(rnk);
+     dtensor *sz0 = mkdtensor_api(rnk, dims0);
+     block_kind k;
+     int i;
+
+     for (i = 0; i < rnk; ++i)
+	  sz->dims[i].n = dims0[i].n;
+
+     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
+
+     for (i = 0; i < rnk; ++i) {
+	  sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
+	  sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
+     }
+
+     /* If we haven't used all of the processes yet, and some of the
+	block sizes weren't specified (i.e. 0), then set the
+	unspecified blocks so as to use as many processes as
+	possible with as few distributed dimensions as possible. */
+     FORALL_BLOCK_KIND(k) {
+	  INT nb = XM(num_blocks_total)(sz, k);
+	  INT np = n_pes / nb;
+	  for (i = 0; i < rnk && np > 1; ++i)
+	       if (!sz0->dims[i].b[k]) {
+		    sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
+		    nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
+		    np = n_pes / nb;
+	       }
+     }
+
+     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;
+
+     /* punt for 1d prime */
+     if (rnk == 1 && X(is_prime)(sz->dims[0].n))
+	  sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;
+
+     XM(dtensor_destroy)(sz0);
+     sz0 = XM(dtensor_canonical)(sz, 0);
+     XM(dtensor_destroy)(sz);
+     return sz0;
+}
+
+/* allocate simple local (serial) dims array corresponding to n[rnk] */
+static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
+{
+     XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
+						TENSORS);
+     int i;
+     for (i = 0; i < rnk; ++i)
+	  dims[i].n = dims[i].ib = dims[i].ob = n[i];
+     return dims;
+}
+
+/*************************************************************************/
+
+static void local_size(int my_pe, const dtensor *sz, block_kind k,
+		       ptrdiff_t *local_n, ptrdiff_t *local_start)
+{
+     int i;
+     if (my_pe >= XM(num_blocks_total)(sz, k))
+	  for (i = 0; i < sz->rnk; ++i)
+	       local_n[i] = local_start[i] = 0;
+     else {
+	  XM(block_coords)(sz, k, my_pe, local_start);
+	  for (i = 0; i < sz->rnk; ++i) {
+	       local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
+				      local_start[i]);
+	       local_start[i] *= sz->dims[i].b[k];
+	  }
+     }
+}
+
+static INT prod(int rnk, const ptrdiff_t *local_n) 
+{
+     int i;
+     INT N = 1;
+     for (i = 0; i < rnk; ++i) N *= local_n[i];
+     return N;
+}
+
+ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
+			      ptrdiff_t howmany, MPI_Comm comm,
+			      ptrdiff_t *local_n_in,
+			      ptrdiff_t *local_start_in,
+			      ptrdiff_t *local_n_out, 
+			      ptrdiff_t *local_start_out,
+			      int sign, unsigned flags)
+{
+     INT N;
+     int my_pe, n_pes, i;
+     dtensor *sz;
+
+     if (rnk == 0)
+	  return howmany;
+
+     MPI_Comm_rank(comm, &my_pe);
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     /* Now, we must figure out how much local space the user should
+	allocate (or at least an upper bound).  This depends strongly
+	on the exact algorithms we employ...ugh!  FIXME: get this info
+	from the solvers somehow? */
+     N = 1; /* never return zero allocation size */
+     if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
+	  INT Nafter;
+	  ddim odims[2];
+
+	  /* dft-rank-geq2-transposed */
+	  odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
+	  /* we may need extra space for transposed intermediate data */
+	  for (i = 0; i < 2; ++i)
+	       if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
+		   XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
+		    sz->dims[i].b[IB]
+			 = XM(default_block)(sz->dims[i].n, n_pes);
+		    sz->dims[1-i].b[IB] = sz->dims[1-i].n;
+		    local_size(my_pe, sz, IB, local_n_in, local_start_in);
+		    N = X(imax)(N, prod(rnk, local_n_in));
+		    sz->dims[i] = odims[i];
+		    sz->dims[1-i] = odims[1-i];
+		    break;
+	       }
+
+	  /* dft-rank-geq2 */
+	  Nafter = howmany;
+	  for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
+	  N = X(imax)(N, (sz->dims[0].n
+			  * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
+				      my_pe) + howmany - 1) / howmany);
+
+	  /* dft-rank-geq2 with dimensions swapped */
+	  Nafter = howmany * sz->dims[0].n;
+          for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
+          N = X(imax)(N, (sz->dims[1].n
+                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
+                                      my_pe) + howmany - 1) / howmany);
+     }
+     else if (rnk == 1) {
+	  if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
+	       ptrdiff_t n[2], start[2];
+	       dtensor *sz2 = XM(mkdtensor)(2);
+	       sz2->dims[0] = sz->dims[0];
+	       sz2->dims[0].b[IB] = sz->dims[0].n;
+	       sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
+	       sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
+	       local_size(my_pe, sz2, IB, n, start);
+	       XM(dtensor_destroy)(sz2);
+	       N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
+	  }
+	  else { /* dft-rank1 */
+	       INT r, m, rblock[2], mblock[2];
+
+	       /* Since the 1d transforms are so different, we require
+		  the user to call local_size_1d for this case.  Ugh. */
+	       CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);
+
+	       if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
+					 rblock, mblock))) {
+		    m = sz->dims[0].n / r;
+		    if (flags & FFTW_MPI_SCRAMBLED_IN)
+			 sz->dims[0].b[IB] = rblock[IB] * m;
+		    else { /* !SCRAMBLED_IN */
+			 sz->dims[0].b[IB] = r * mblock[IB];
+			 N = X(imax)(N, rblock[IB] * m);
+		    }
+		    if (flags & FFTW_MPI_SCRAMBLED_OUT)
+			 sz->dims[0].b[OB] = r * mblock[OB];
+		    else { /* !SCRAMBLED_OUT */
+			 N = X(imax)(N, r * mblock[OB]);
+			 sz->dims[0].b[OB] = rblock[OB] * m;
+		    }
+	       }
+	  }
+     }
+
+     local_size(my_pe, sz, IB, local_n_in, local_start_in);
+     local_size(my_pe, sz, OB, local_n_out, local_start_out);
+
+     /* at least, make sure we have enough space to store input & output */
+     N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));
+
+     XM(dtensor_destroy)(sz);
+     return N * howmany;
+}
+
+ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
+					 ptrdiff_t howmany,
+					 ptrdiff_t xblock, ptrdiff_t yblock,
+					 MPI_Comm comm,
+					 ptrdiff_t *local_nx,
+					 ptrdiff_t *local_x_start,
+					 ptrdiff_t *local_ny,
+					 ptrdiff_t *local_y_start)
+{
+     ptrdiff_t N;
+     XM(ddim) *dims; 
+     ptrdiff_t *local;
+
+     if (rnk == 0) {
+	  *local_nx = *local_ny = 1;
+	  *local_x_start = *local_y_start = 0;
+	  return howmany;
+     }
+
+     dims = simple_dims(rnk, n);
+     local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);
+
+     /* default 1d block distribution, with transposed output
+        if yblock < n[1] */
+     dims[0].ib = xblock;
+     if (rnk > 1) {
+	  if (yblock < n[1])
+	       dims[1].ob = yblock;
+	  else
+	       dims[0].ob = xblock;
+     }
+     else
+	  dims[0].ob = xblock; /* FIXME: 1d not really supported here 
+				         since we don't have flags/sign */
+     
+     N = XM(local_size_guru)(rnk, dims, howmany, comm, 
+			     local, local + rnk,
+			     local + 2*rnk, local + 3*rnk,
+			     0, 0);
+     *local_nx = local[0];
+     *local_x_start = local[rnk];
+     if (rnk > 1) {
+	  *local_ny = local[2*rnk + 1];
+	  *local_y_start = local[3*rnk + 1];
+     }
+     else {
+	  *local_ny = *local_nx;
+	  *local_y_start = *local_x_start;
+     }
+     X(ifree)(local);
+     X(ifree)(dims);
+     return N;
+}
+
+ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
+			      ptrdiff_t howmany, 
+			      ptrdiff_t xblock,
+			      MPI_Comm comm,
+			      ptrdiff_t *local_nx,
+			      ptrdiff_t *local_x_start)
+{
+     ptrdiff_t local_ny, local_y_start;
+     return XM(local_size_many_transposed)(rnk, n, howmany,
+					   xblock, rnk > 1 
+					   ? n[1] : FFTW_MPI_DEFAULT_BLOCK,
+					   comm,
+					   local_nx, local_x_start,
+					   &local_ny, &local_y_start);
+}
+
+
+ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
+				    MPI_Comm comm,
+				    ptrdiff_t *local_nx,
+				    ptrdiff_t *local_x_start,
+				    ptrdiff_t *local_ny,
+				    ptrdiff_t *local_y_start)
+{
+     return XM(local_size_many_transposed)(rnk, n, 1,
+					   FFTW_MPI_DEFAULT_BLOCK,
+					   FFTW_MPI_DEFAULT_BLOCK,
+					   comm,
+					   local_nx, local_x_start,
+					   local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
+			 MPI_Comm comm,
+			 ptrdiff_t *local_nx,
+			 ptrdiff_t *local_x_start)
+{
+     return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
+				local_nx, local_x_start);
+}
+
+ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany, 
+				 MPI_Comm comm, int sign, unsigned flags,
+				 ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
+				 ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
+{
+     XM(ddim) d;
+     d.n = nx;
+     d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
+     return XM(local_size_guru)(1, &d, howmany, comm,
+				local_nx, local_x_start,
+				local_ny, local_y_start, sign, flags);
+}
+
+ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
+			    MPI_Comm comm, int sign, unsigned flags,
+			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
+			    ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
+{
+     return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
+				   local_nx, local_x_start,
+				   local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
+				       MPI_Comm comm,
+				       ptrdiff_t *local_nx,
+				       ptrdiff_t *local_x_start,
+				       ptrdiff_t *local_ny, 
+				       ptrdiff_t *local_y_start)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(local_size_transposed)(2, n, comm,
+				      local_nx, local_x_start,
+				      local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
+			       ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(local_size)(2, n, comm, local_nx, local_x_start);
+}
+
+ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
+				       ptrdiff_t nz,
+				       MPI_Comm comm,
+				       ptrdiff_t *local_nx,
+				       ptrdiff_t *local_x_start,
+				       ptrdiff_t *local_ny, 
+				       ptrdiff_t *local_y_start)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(local_size_transposed)(3, n, comm,
+				      local_nx, local_x_start,
+				      local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			    MPI_Comm comm,
+			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(local_size)(3, n, comm, local_nx, local_x_start);
+}
+
+/*************************************************************************/
+/* Transpose API */
+
+X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny, 
+				ptrdiff_t howmany,
+				ptrdiff_t xblock, ptrdiff_t yblock,
+				R *in, R *out, 
+				MPI_Comm comm, unsigned flags)
+{
+     int n_pes;
+     XM(init)();
+
+     if (howmany < 0 || xblock < 0 || yblock < 0 ||
+	 nx <= 0 || ny <= 0) return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     if (!xblock) xblock = XM(default_block)(nx, n_pes);
+     if (!yblock) yblock = XM(default_block)(ny, n_pes);
+     if (n_pes < XM(num_blocks)(nx, xblock)
+	 || n_pes < XM(num_blocks)(ny, yblock))
+	  return 0;
+
+     return 
+	  X(mkapiplan)(FFTW_FORWARD, flags,
+		       XM(mkproblem_transpose)(nx, ny, howmany,
+					       in, out, xblock, yblock,
+					       comm, MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out, 
+			   MPI_Comm comm, unsigned flags)
+			      
+{
+     return XM(plan_many_transpose)(nx, ny, 1,
+				    FFTW_MPI_DEFAULT_BLOCK,
+				    FFTW_MPI_DEFAULT_BLOCK,
+				    in, out, comm, flags);
+}
+
+/*************************************************************************/
+/* Complex DFT API */
+
+X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
+			  ptrdiff_t howmany,
+			  C *in, C *out,
+			  MPI_Comm comm, int sign, unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 1) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+
+     return
+          X(mkapiplan)(sign, flags,
+                       XM(mkproblem_dft_d)(sz, howmany,
+					   (R *) in, (R *) out,
+					   comm, sign, 
+					   MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  C *in, C *out,
+			  MPI_Comm comm, int sign, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
+		     MPI_Comm comm, int sign, unsigned flags)
+{
+     return XM(plan_many_dft)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft)(2, n, in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft)(3, n, in, out, comm, sign, flags);
+}
+
+/*************************************************************************/
+/* R2R API */
+
+X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
+			  ptrdiff_t howmany,
+			  R *in, R *out,
+			  MPI_Comm comm, const X(r2r_kind) *kind,
+			  unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     rdft_kind *k;
+     X(plan) pln;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 1) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     k = X(map_r2r_kind)(rnk, kind);
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+
+     pln = X(mkapiplan)(0, flags,
+			XM(mkproblem_rdft_d)(sz, howmany,
+					     in, out,
+					     comm, k, MPI_FLAGS(flags)));
+     X(ifree0)(k);
+     return pln;
+}
+
+X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  R *in, R *out,
+			  MPI_Comm comm, const X(r2r_kind) *kind,
+			  unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
+		     MPI_Comm comm, 
+		     const X(r2r_kind) *kind,
+		     unsigned flags)
+{
+     return XM(plan_many_r2r)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, kind, flags);
+}
+
+X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
+			MPI_Comm comm,
+			X(r2r_kind) kindx, X(r2r_kind) kindy,
+			unsigned flags)
+{
+     ptrdiff_t n[2];
+     X(r2r_kind) kind[2];
+     n[0] = nx; n[1] = ny;
+     kind[0] = kindx; kind[1] = kindy;
+     return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
+}
+
+X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			R *in, R *out,
+			MPI_Comm comm, 
+			X(r2r_kind) kindx, X(r2r_kind) kindy,
+			X(r2r_kind) kindz,
+			unsigned flags)
+{
+     ptrdiff_t n[3];
+     X(r2r_kind) kind[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
+     return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
+}
+
+/*************************************************************************/
+/* R2C/C2R API */
+
+static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
+			       ptrdiff_t howmany,
+			       R *r, C *c,
+			       MPI_Comm comm, rdft_kind kind, unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     R *cr = (R *) c;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 2) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 1);
+
+     sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+     sz->dims[rnk-1].n = dims0[rnk-1].n;
+
+     if (kind == R2HC)
+	  return X(mkapiplan)(0, flags,
+			      XM(mkproblem_rdft2_d)(sz, howmany,
+						    r, cr, comm, R2HC, 
+						    MPI_FLAGS(flags)));
+     else
+	  return X(mkapiplan)(0, flags,
+			      XM(mkproblem_rdft2_d)(sz, howmany,
+						    cr, r, comm, HC2R, 
+						    MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  R *in, C *out,
+			  MPI_Comm comm, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  C *in, R *out,
+			  MPI_Comm comm, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
+		     MPI_Comm comm, unsigned flags)
+{
+     return XM(plan_many_dft_r2c)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
+			MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			R *in, C *out, MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
+		     MPI_Comm comm, unsigned flags)
+{
+     return XM(plan_many_dft_c2r)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
+			MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			C *in, R *out, MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
+}
+
+/*************************************************************************/
+/* New-array execute functions */
+
+void XM(execute_dft)(const X(plan) p, C *in, C *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, (R*) in, (R*) out);
+}
+
+void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, in, (R*) out);
+}
+
+void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, (R*) in, out);
+}
+
+void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, in, out);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/block.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/block.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+INT XM(num_blocks)(INT n, INT block)
+{
+     return (n + block - 1) / block;
+}
+
+int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm)
+{
+     int n_pes;
+     MPI_Comm_size(comm, &n_pes);
+     return n_pes >= XM(num_blocks)(n, block);
+}
+
+/* Pick a default block size for dividing a problem of size n among
+   n_pes processes.  Divide as equally as possible, while minimizing
+   the maximum block size among the processes as well as the number of
+   processes with nonzero blocks. */
+INT XM(default_block)(INT n, int n_pes)
+{
+     return ((n + n_pes - 1) / n_pes);
+}
+
+/* For a given block size and dimension n, compute the block size 
+   on the given process. */
+INT XM(block)(INT n, INT block, int which_block)
+{
+     INT d = n - which_block * block;
+     return d <= 0 ? 0 : (d > block ? block : d);
+}
+
+static INT num_blocks_kind(const ddim *dim, block_kind k)
+{
+     return XM(num_blocks)(dim->n, dim->b[k]);
+}
+
+INT XM(num_blocks_total)(const dtensor *sz, block_kind k)
+{
+     if (FINITE_RNK(sz->rnk)) {
+	  int i;
+	  INT ntot = 1;
+	  for (i = 0; i < sz->rnk; ++i)
+	       ntot *= num_blocks_kind(sz->dims + i, k);
+	  return ntot;
+     }
+     else
+	  return 0;
+}
+
+int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe)
+{
+     return (which_pe >= XM(num_blocks_total)(sz, k));
+}
+
+/* Given a non-idle process which_pe, computes the coordinate
+   vector coords[rnk] giving the coordinates of a block in the
+   matrix of blocks.  k specifies whether we are talking about
+   the input or output data distribution. */
+void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, 
+		     INT *coords)
+{
+     int i;
+     A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
+     for (i = sz->rnk - 1; i >= 0; --i) {
+	  INT nb = num_blocks_kind(sz->dims + i, k);
+	  coords[i] = which_pe % nb;
+	  which_pe /= nb;
+     }
+}
+
+INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
+{
+     if (XM(idle_process)(sz, k, which_pe))
+	  return 0;
+     else {
+	  int i;
+	  INT N = 1, *coords;
+	  STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
+	  XM(block_coords)(sz, k, which_pe, coords);
+	  for (i = 0; i < sz->rnk; ++i)
+	       N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
+	  STACK_FREE(coords);
+	  return N;
+     }
+}
+
+/* returns whether sz is local for dims >= dim */
+int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
+{
+     if (FINITE_RNK(sz->rnk))
+	  for (; dim < sz->rnk; ++dim)
+	       if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
+		    return 0;
+     return 1;
+}
+
+int XM(is_local)(const dtensor *sz, block_kind k)
+{
+     return XM(is_local_after)(0, sz, k);
+}
+
+/* Return whether sz is distributed for k according to a simple
+   1d block distribution in the first or second dimensions */
+int XM(is_block1d)(const dtensor *sz, block_kind k)
+{
+     int i;
+     if (!FINITE_RNK(sz->rnk)) return 0;
+     for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i) ;
+     return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k));
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/choose-radix.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/choose-radix.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* Return the radix r for a 1d MPI transform of a distributed dimension d,
+   with the given flags and transform size.   That is, decomposes d.n
+   as r * m, Cooley-Tukey style.  Also computes the block sizes rblock
+   and mblock.  Returns 0 if such a decomposition is not feasible.
+   This is unfortunately somewhat complicated.
+
+   A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
+
+   d.n is initially distributed as an m x r array with block size mblock[IB].
+   Then it is internally transposed to an r x m array with block size
+   rblock[IB].  Then it is internally transposed to m x r again with block
+   size mblock[OB].  Finally, it is transposed to r x m with block size
+   rblock[IB].
+
+   If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
+   starts out as r x m).  If flags & SCRAMBLED_OUT, then the last transpose
+   is skipped (the array ends up as m x r).  To make sure the forward
+   and backward transforms use the same "scrambling" format, we swap r
+   and m when sign != FFT_SIGN.
+
+   There are some downsides to this, especially in the case where
+   either m or r is not divisible by n_pes.  For one thing, it means
+   that in general we can't use the same block size for the input and
+   output.  For another thing, it means that we can't in general honor
+   a user's "requested" block sizes in d.b[].  Therefore, for simplicity,
+   we simply ignore d.b[] for now.
+*/
+INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
+		     INT rblock[2], INT mblock[2])
+{
+     INT r, m;
+
+     UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
+
+     /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
+        This not only ensures that the input (the m dimension) is
+        equally distributed if possible, and at the r dimension is
+        maximally equally distributed (if d.n/n_pes >= n_pes), it also
+        makes one of the local transpositions in the algorithm
+        trivial. */
+     if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
+	 && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
+	  r = d.n / n_pes;
+     else {  /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
+	  for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
+	       ;
+     }
+     if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
+
+     if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
+	  m = r;
+	  r = d.n / m;
+     }
+     else
+	  m = d.n / r;
+
+     rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
+     mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
+
+     return r;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/conf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/conf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "mpi-transpose.h"
+#include "mpi-dft.h"
+#include "mpi-rdft.h"
+#include "mpi-rdft2.h"
+
+static const solvtab s =
+{
+     SOLVTAB(XM(transpose_pairwise_register)),
+     SOLVTAB(XM(transpose_alltoall_register)),
+     SOLVTAB(XM(transpose_recurse_register)),
+     SOLVTAB(XM(dft_rank_geq2_register)),
+     SOLVTAB(XM(dft_rank_geq2_transposed_register)),
+     SOLVTAB(XM(dft_serial_register)),
+     SOLVTAB(XM(dft_rank1_bigvec_register)),
+     SOLVTAB(XM(dft_rank1_register)),
+     SOLVTAB(XM(rdft_rank_geq2_register)),
+     SOLVTAB(XM(rdft_rank_geq2_transposed_register)),
+     SOLVTAB(XM(rdft_serial_register)),
+     SOLVTAB(XM(rdft_rank1_bigvec_register)),
+     SOLVTAB(XM(rdft2_rank_geq2_register)),
+     SOLVTAB(XM(rdft2_rank_geq2_transposed_register)),
+     SOLVTAB(XM(rdft2_serial_register)),
+     SOLVTAB_END
+};
+
+void XM(conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-dft.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_dft *ego = (problem_mpi_dft *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int i;
+     X(md5puts)(m, "mpi-dft");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(alignment_of)(p->I));
+	X(md5int)(m, X(alignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     X(md5int)(m, p->sign);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
+     int i;
+     p->print(p, "(mpi-dft %d %d %d ", 
+	      ego->I == ego->O,
+	      X(alignment_of)(ego->I),
+	      X(alignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
+     R *I = ego->I;
+     INT i, N;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = 2 * ego->vn * XM(total_block)(ego->sz, IB, my_pe);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_DFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
+			   R *I, R *O,
+			   MPI_Comm comm,
+			   int sign,
+			   unsigned flags)
+{
+     problem_mpi_dft *ego =
+          (problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt);
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
+     MPI_Comm_size(comm, &n_pes);
+     A(n_pes >= XM(num_blocks_total)(sz, IB)
+       && n_pes >= XM(num_blocks_total)(sz, OB));
+     A(vn >= 0);
+     A(sign == -1 || sign == 1);
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 1);
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->sign = sign;
+
+     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
+        swapping the first two dimensions (for rnk > 1) */
+     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
+	  ddim dim0 = ego->sz->dims[0];
+	  ego->sz->dims[0] = ego->sz->dims[1];
+	  ego->sz->dims[1] = dim0;
+	  flags &= ~TRANSPOSED_IN;
+	  flags ^= TRANSPOSED_OUT;
+     }
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
+			     R *I, R *O,
+			     MPI_Comm comm,
+			     int sign,
+			     unsigned flags)
+{
+     problem *p = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-rank-geq2-transposed.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-rank-geq2-transposed.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   (Note that we don't have to handle the case where the input is
+   transposed, since this is equivalent to transposed output with the
+   first two dimensions swapped, and is automatically canonicalized as
+   such by dft-problem.c. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cld1, *cldt, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld1, *cld2;
+     plan_rdft *cldt;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* DFT local dimensions */
+     cld1 = (plan_dft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* DFT final local dimension */
+     cld2 = (plan_dft *) ego->cld2;
+     cld2->apply(ego->cld2, O+roff, O+ioff, O+roff, O+ioff);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == TRANSPOSED_OUT
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(2, p->sz, OB)
+	     && XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+		 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     X(extract_reim)(p->sign, I = p->I, &ri, &ii);
+     X(extract_reim)(p->sign, O = p->O, &ro, &io);
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     else { 
+	  ro = ri;
+	  io = ii;
+     }
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_dft_d)(sz,
+                                                X(mktensor_2d)(b, is, is,
+                                                               p->vn, 2, 2),
+                                                ri, ii, ro, io));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     cldt = X(mkplan_d)(plnr,
+			XM(mkproblem_transpose)(
+			     p->sz->dims[0].n, p->sz->dims[1].n, nrest * 2,
+			     I, O,
+			     p->sz->dims[0].b[IB], p->sz->dims[1].b[OB], 
+			     p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     X(extract_reim)(p->sign, O, &ro, &io);
+     {
+	  INT is = p->sz->dims[0].n * nrest * 2;
+	  INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_dft_d)(X(mktensor_1d)(
+						     p->sz->dims[0].n,
+						     nrest * 2, nrest * 2),
+						X(mktensor_2d)(b, is, is,
+							       nrest, 2, 2),
+						ro, io, ro, io));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ri - p->I;
+     pln->ioff = ii - p->I;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-rank-geq2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-rank-geq2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-dft.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cld1, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld1;
+     plan_rdft *cld2;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* DFT local dimensions */
+     cld1 = (plan_dft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
+
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+		 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     X(extract_reim)(p->sign, I = p->I, &ri, &ii);
+     X(extract_reim)(p->sign, O = p->O, &ro, &io);
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     else { 
+	  ro = ri;
+	  io = ii;
+     }
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_dft_d)(sz,
+                                                X(mktensor_2d)(b, is, is,
+                                                               p->vn, 2, 2),
+                                                ri, ii, ro, io));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
+						  I, O, p->comm, p->sign, 
+						  RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ri - p->I;
+     pln->ioff = ii - p->I;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-rank1-bigvec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-rank1-bigvec.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank == 1 when the vector length vn is >= # processes.
+   In this case, we don't need to use a six-step type algorithm, and can
+   instead transpose the DFT dimension with the vector dimension to 
+   make the DFT local. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+     rearrangement rearrange;
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cldt_before, *cld, *cldt_after;
+     INT roff, ioff;
+     int preserve_input;
+     rearrangement rearrange;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_rdft *cldt_before, *cldt_after;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* global transpose */
+     cldt_before = (plan_rdft *) ego->cldt_before;
+     cldt_before->apply(ego->cldt_before, I, O);
+     
+     if (ego->preserve_input) I = O;
+	  
+     /* 1d DFT(s) */
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, O+roff, O+ioff, I+roff, I+ioff);
+     
+     /* global transpose */
+     cldt_after = (plan_rdft *) ego->cldt_after;
+     cldt_after->apply(ego->cldt_after, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+	     && !(p->flags & ~RANK1_BIGVEC_ONLY)
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && (p->vn >= n_pes /* TODO: relax this, using more memory? */
+		 || (p->flags & RANK1_BIGVEC_ONLY))
+
+	     && XM(rearrange_applicable)(ego->rearrange,
+					 p->sz->dims[0], p->vn, n_pes)
+
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+                 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt_before, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldt_after, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt_after);
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldt_before);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const char descrip[][16] = { "contig", "discontig", "square-after",
+				  "square-middle", "square-before" };
+     p->print(p, "(mpi-dft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
+	      descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
+	      ego->cldt_before, ego->cld, ego->cldt_after);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     INT yblock, yb, nx, ny, vn;
+     int my_pe, n_pes;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+     
+     nx = p->sz->dims[0].n;
+     if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
+	  return (plan *) 0;
+     vn = p->vn / ny;
+     A(ny * vn == p->vn);
+
+     yblock = XM(default_block)(ny, n_pes);
+     cldt_before = X(mkplan_d)(plnr,
+			       XM(mkproblem_transpose)(
+				    nx, ny, vn*2,
+				    I = p->I, O = p->O,
+				    p->sz->dims[0].b[IB], yblock,
+				    p->comm, 0));
+     if (XM(any_true)(!cldt_before, p->comm)) goto nada;	  
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
+     
+     X(extract_reim)(p->sign, I, &ri, &ii);
+     X(extract_reim)(p->sign, O, &ro, &io);
+
+     yb = XM(block)(ny, yblock, my_pe);
+     cld = X(mkplan_d)(plnr,
+		       X(mkproblem_dft_d)(X(mktensor_1d)(nx, vn*2, vn*2),
+					  X(mktensor_2d)(yb, vn*2*nx, vn*2*nx,
+							 vn, 2, 2),
+					  ro, io, ri, ii));
+     if (XM(any_true)(!cld, p->comm)) goto nada;	  
+     
+     cldt_after = X(mkplan_d)(plnr,
+			      XM(mkproblem_transpose)(
+				   ny, nx, vn*2,
+				   I, O,
+				   yblock, p->sz->dims[0].b[OB], 
+				   p->comm, 0));
+     if (XM(any_true)(!cldt_after, p->comm)) goto nada;	  
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+
+     pln->cldt_before = cldt_before;
+     pln->cld = cld;
+     pln->cldt_after = cldt_after;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     pln->rearrange = ego->rearrange;
+
+     X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt_after);
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldt_before);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rearrangement rearrange, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->rearrange = rearrange;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank1_bigvec_register)(planner *p)
+{
+     rearrangement rearrange;
+     int preserve_input;
+     FORALL_REARRANGE(rearrange)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-rank1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-rank1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank == 1 via six-step algorithm. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     rdftapply apply; /* apply_ddft_first or apply_ddft_last */
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     triggen *t;
+     plan *cldt, *cld_ddft, *cld_dft;
+     INT roff, ioff;
+     int preserve_input;
+     INT vn, xmin, xmax, xs, m, r;
+} P;
+
+static void do_twiddle(triggen *t, INT ir, INT m, INT vn, R *xr, R *xi)
+{
+     void (*rotate)(triggen *, INT, R, R, R *) = t->rotate;
+     INT im, iv;
+     for (im = 0; im < m; ++im)
+	  for (iv = 0; iv < vn; ++iv) {
+	       /* TODO: modify/inline rotate function
+		  so that it can do whole vn vector at once? */
+	       R c[2];
+	       rotate(t, ir * im, *xr, *xi, c);
+	       *xr = c[0]; *xi = c[1];
+	       xr += 2; xi += 2;
+	  }
+}
+
+/* radix-r DFT of size r*m.  This is equivalent to an m x r 2d DFT,
+   plus twiddle factors between the size-m and size-r 1d DFTs, where
+   the m dimension is initially distributed.  The output is transposed
+   to r x m where the r dimension is distributed. 
+
+   This algorithm follows the general sequence:
+        global transpose (m x r -> r x m)
+        DFTs of size m
+	multiply by twiddles + global transpose (r x m -> m x r)
+	DFTs of size r
+	global transpose (m x r -> r x m)
+   where the multiplication by twiddles can come before or after
+   the middle transpose.  The first/last transposes are omitted
+   for SCRAMBLED_IN/OUT formats, respectively.
+
+   However, we wish to exploit our dft-rank1-bigvec solver, which
+   solves a vector of distributed DFTs via transpose+dft+transpose.
+   Therefore, we can group *either* the DFTs of size m *or* the
+   DFTs of size r with their surrounding transposes as a single
+   distributed-DFT (ddft) plan.  These two variations correspond to
+   apply_ddft_first or apply_ddft_last, respectively.
+*/
+
+static void apply_ddft_first(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld_dft;
+     plan_rdft *cldt, *cld_ddft;
+     INT roff, ioff, im, mmax, ms, r, vn;
+     triggen *t;
+     R *dI, *dO;
+
+     /* distributed size-m DFTs, with output in m x r format */
+     cld_ddft = (plan_rdft *) ego->cld_ddft;
+     cld_ddft->apply(ego->cld_ddft, I, O);
+
+     cldt = (plan_rdft *) ego->cldt;
+     if (ego->preserve_input || !cldt) I = O;
+
+     /* twiddle multiplications, followed by 1d DFTs of size-r */
+     cld_dft = (plan_dft *) ego->cld_dft;
+     roff = ego->roff; ioff = ego->ioff;
+     mmax = ego->xmax; ms = ego->xs;
+     t = ego->t; r = ego->r; vn = ego->vn;
+     dI = O; dO = I;
+     for (im = ego->xmin; im <= mmax; ++im) {
+	  do_twiddle(t, im, r, vn, dI+roff, dI+ioff);
+	  cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
+	  dI += ms; dO += ms;
+     }
+
+     /* final global transpose (m x r -> r x m), if not SCRAMBLED_OUT */
+     if (cldt) 
+	  cldt->apply((plan *) cldt, I, O);
+}
+
+static void apply_ddft_last(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld_dft;
+     plan_rdft *cldt, *cld_ddft;
+     INT roff, ioff, ir, rmax, rs, m, vn;
+     triggen *t;
+     R *dI, *dO0, *dO;
+
+     /* initial global transpose (m x r -> r x m), if not SCRAMBLED_IN */
+     cldt = (plan_rdft *) ego->cldt;
+     if (cldt) {
+	  cldt->apply((plan *) cldt, I, O);
+	  dI = O;
+     }
+     else 
+	  dI = I;
+     if (ego->preserve_input) dO = O; else dO = I;
+     dO0 = dO;
+
+     /* 1d DFTs of size m, followed by twiddle multiplications */
+     cld_dft = (plan_dft *) ego->cld_dft;
+     roff = ego->roff; ioff = ego->ioff;
+     rmax = ego->xmax; rs = ego->xs;
+     t = ego->t; m = ego->m; vn = ego->vn;
+     for (ir = ego->xmin; ir <= rmax; ++ir) {
+	  cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
+	  do_twiddle(t, ir, m, vn, dO+roff, dO+ioff);
+	  dI += rs; dO += rs;
+     }
+
+     /* distributed size-r DFTs, with output in r x m format */
+     cld_ddft = (plan_rdft *) ego->cld_ddft;
+     cld_ddft->apply(ego->cld_ddft, dO0, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr,
+		      INT *r, INT rblock[2], INT mblock[2])
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+
+	     && ONLY_SCRAMBLEDP(p->flags)
+
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+                                          && p->I != p->O))
+
+	     && (!(p->flags & SCRAMBLED_IN) || ego->apply == apply_ddft_last)
+	     && (!(p->flags & SCRAMBLED_OUT) || ego->apply == apply_ddft_first)
+
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+                 || !XM(dft_serial_applicable)(p))
+
+	     /* disallow if dft-rank1-bigvec is applicable since the
+		data distribution may be slightly different (ugh!) */
+	     && (p->vn < n_pes || p->flags)
+
+	     && (*r = XM(choose_radix)(p->sz->dims[0], n_pes,
+				       p->flags, p->sign,
+				       rblock, mblock))
+
+	     /* ddft_first or last has substantial advantages in the
+		bigvec transpositions for the common case where
+		n_pes == n/r or r, respectively */
+	     && (!NO_UGLYP(plnr)
+		 || !(*r == n_pes && ego->apply == apply_ddft_first)
+		 || !(p->sz->dims[0].n / *r == n_pes 
+		      && ego->apply == apply_ddft_last))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld_dft, wakefulness);
+     X(plan_awake)(ego->cld_ddft, wakefulness);
+
+     switch (wakefulness) {
+         case SLEEPY:
+              X(triggen_destroy)(ego->t); ego->t = 0;
+              break;
+         default:
+              ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
+              break;
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld_dft);
+     X(plan_destroy_internal)(ego->cld_ddft);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank1/%D%s%s%(%p%)%(%p%)%(%p%))",
+	      ego->r,
+	      ego->super.apply == apply_ddft_first ? "/first" : "/last",
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld_ddft, ego->cld_dft, ego->cldt);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld_dft = 0, *cld_ddft = 0, *cldt = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     INT r, rblock[2], m, mblock[2], rp, mp, mpblock[2], mpb;
+     int my_pe, n_pes, preserve_input, ddft_first;
+     dtensor *sz;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr, &r, rblock, mblock))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     m = p->sz->dims[0].n / r;
+
+     /* some hackery so that we can plan both ddft_first and ddft_last
+	as if they were ddft_first */
+     if ((ddft_first = (ego->apply == apply_ddft_first))) {
+	  rp = r; mp = m;
+	  mpblock[IB] = mblock[IB]; mpblock[OB] = mblock[OB];
+	  mpb = XM(block)(mp, mpblock[OB], my_pe);
+     }
+     else {
+	  rp = m; mp = r;
+	  mpblock[IB] = rblock[IB]; mpblock[OB] = rblock[OB];
+	  mpb = XM(block)(mp, mpblock[IB], my_pe);
+     }
+
+     preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     sz = XM(mkdtensor)(1);
+     sz->dims[0].n = mp;
+     sz->dims[0].b[IB] = mpblock[IB];
+     sz->dims[0].b[OB] = mpblock[OB];
+     I = (ddft_first || !preserve_input) ? p->I : p->O;
+     O = p->O;
+     cld_ddft = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz, rp * p->vn,
+						      I, O, p->comm, p->sign,
+						      RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld_ddft, p->comm)) goto nada;
+
+     I = TAINT((ddft_first || !p->flags) ? p->O : p->I, rp * p->vn * 2);
+     O = TAINT((preserve_input || (ddft_first && p->flags)) ? p->O : p->I, 
+	       rp * p->vn * 2);
+     X(extract_reim)(p->sign, I, &ri, &ii);
+     X(extract_reim)(p->sign, O, &ro, &io);
+     cld_dft = X(mkplan_d)(plnr,
+			X(mkproblem_dft_d)(X(mktensor_1d)(rp, p->vn*2,p->vn*2),
+					   X(mktensor_1d)(p->vn, 2, 2),
+					   ri, ii, ro, io));
+     if (XM(any_true)(!cld_dft, p->comm)) goto nada;
+     
+     if (!p->flags) { /* !(SCRAMBLED_IN or SCRAMBLED_OUT) */
+	  I = (ddft_first && preserve_input) ? p->O : p->I;
+	  O = p->O;
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  m, r, p->vn * 2,
+				  I, O,
+				  ddft_first ? mblock[OB] : mblock[IB],
+				  ddft_first ? rblock[OB] : rblock[IB],
+				  p->comm, 0));
+	  if (XM(any_true)(!cldt, p->comm)) goto nada;	  
+     }
+
+     pln = MKPLAN_MPI_DFT(P, &padt, ego->apply);
+
+     pln->cld_ddft = cld_ddft;
+     pln->cld_dft = cld_dft;
+     pln->cldt = cldt;
+     pln->preserve_input = preserve_input;
+     X(extract_reim)(p->sign, p->O, &ro, &io);
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     pln->vn = p->vn;
+     pln->m = m;
+     pln->r = r;
+     pln->xmin = (ddft_first ? mblock[OB] : rblock[IB]) * my_pe;
+     pln->xmax = pln->xmin + mpb - 1;
+     pln->xs = rp * p->vn * 2;
+     pln->t = 0;
+
+     X(ops_add)(&cld_ddft->ops, &cld_dft->ops, &pln->super.super.ops);
+     if (cldt) X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+     {
+          double n0 = (1 + pln->xmax - pln->xmin) * (mp - 1) * pln->vn;
+          pln->super.super.ops.mul += 8 * n0;
+          pln->super.super.ops.add += 4 * n0;
+          pln->super.super.ops.other += 8 * n0;
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld_dft);
+     X(plan_destroy_internal)(cld_ddft);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rdftapply apply, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->apply = apply;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank1_register)(planner *p)
+{
+     rdftapply apply[] = { apply_ddft_first, apply_ddft_last };
+     unsigned int iapply;
+     int preserve_input;
+     for (iapply = 0; iapply < sizeof(apply) / sizeof(apply[0]); ++iapply)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(apply[iapply], preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-serial.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-serial.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" DFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-dft.h"
+#include "dft.h"
+
+typedef struct {
+     plan_mpi_dft super;
+     plan *cld;
+     INT roff, ioff;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     INT roff = ego->roff, ioff = ego->ioff;
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, I+roff, I+ioff, O+roff, O+ioff);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-serial %(%p%))", ego->cld);
+}
+
+int XM(dft_serial_applicable)(const problem_mpi_dft *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     R *ri, *ii, *ro, *io;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(dft_serial_applicable)(p))
+          return (plan *) 0;
+
+     X(extract_reim)(p->sign, p->I, &ri, &ii);
+     X(extract_reim)(p->sign, p->O, &ro, &io);
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(p->sz->rnk);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_dft_d)(sz,
+					       X(mktensor_1d)(p->vn, 2, 2),
+					       ri, ii, ro, io));
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_dft_d)(X(mktensor_0d)(),
+					       X(mktensor_1d)(0,0,0),
+					       ri, ii, ro, io));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld = cld;
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(dft_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dft-solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dft-solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-dft.h"
+
+/* use the apply() operation for MPI_DFT problems */
+void XM(dft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_dft *ego = (const plan_mpi_dft *) ego_;
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/dtensor.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/dtensor.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+dtensor *XM(mkdtensor)(int rnk) 
+{
+     dtensor *x;
+
+     A(rnk >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     if (FINITE_RNK(rnk) && rnk > 1)
+	  x = (dtensor *)MALLOC(sizeof(dtensor) + (rnk - 1) * sizeof(ddim),
+				    TENSORS);
+     else
+	  x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+#elif defined(STRUCT_HACK_C99)
+     if (FINITE_RNK(rnk))
+	  x = (dtensor *)MALLOC(sizeof(dtensor) + rnk * sizeof(ddim),
+				    TENSORS);
+     else
+	  x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+#else
+     x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+     if (FINITE_RNK(rnk) && rnk > 0)
+          x->dims = (ddim *)MALLOC(sizeof(ddim) * rnk, TENSORS);
+     else
+          x->dims = 0;
+#endif
+
+     x->rnk = rnk;
+     return x;
+}
+
+void XM(dtensor_destroy)(dtensor *sz)
+{
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(sz->dims);
+#endif
+     X(ifree)(sz);
+}
+
+void XM(dtensor_md5)(md5 *p, const dtensor *t)
+{
+     int i;
+     X(md5int)(p, t->rnk);
+     if (FINITE_RNK(t->rnk)) {
+          for (i = 0; i < t->rnk; ++i) {
+               const ddim *q = t->dims + i;
+               X(md5INT)(p, q->n);
+               X(md5INT)(p, q->b[IB]);
+               X(md5INT)(p, q->b[OB]);
+          }
+     }
+}
+
+dtensor *XM(dtensor_copy)(const dtensor *sz)
+{
+     dtensor *x = XM(mkdtensor)(sz->rnk);
+     int i;
+     if (FINITE_RNK(sz->rnk))
+          for (i = 0; i < sz->rnk; ++i)
+               x->dims[i] = sz->dims[i];
+     return x;
+}
+
+dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress)
+{
+     int i, rnk;
+     dtensor *x;
+     block_kind k;
+
+     if (!FINITE_RNK(sz->rnk))
+	  return XM(mkdtensor)(sz->rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+	  if (sz->dims[i].n <= 0)
+	       return XM(mkdtensor)(RNK_MINFTY);
+	  else if (!compress || sz->dims[i].n > 1)
+	       ++rnk;
+     }
+     x = XM(mkdtensor)(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+	  if (!compress || sz->dims[i].n > 1) {
+               x->dims[rnk].n = sz->dims[i].n;
+	       FORALL_BLOCK_KIND(k) {
+		    if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]) == 1)
+			 x->dims[rnk].b[k] = sz->dims[i].n;
+		    else
+			 x->dims[rnk].b[k] = sz->dims[i].b[k];
+	       }
+	       ++rnk;
+	  }
+     }
+     return x;
+}
+
+int XM(dtensor_validp)(const dtensor *sz)
+{
+     int i;
+     if (sz->rnk < 0) return 0;
+     if (FINITE_RNK(sz->rnk))
+	  for (i = 0; i < sz->rnk; ++i)
+	       if (sz->dims[i].n < 0
+		   || sz->dims[i].b[IB] <= 0
+		   || sz->dims[i].b[OB] <= 0)
+		    return 0;
+     return 1;
+}
+
+void XM(dtensor_print)(const dtensor *t, printer *p)
+{
+     if (FINITE_RNK(t->rnk)) {
+          int i;
+          int first = 1;
+          p->print(p, "(");
+          for (i = 0; i < t->rnk; ++i) {
+               const ddim *d = t->dims + i;
+               p->print(p, "%s(%D %D %D)",
+                        first ? "" : " ",
+                        d->n, d->b[IB], d->b[OB]);
+               first = 0;
+          }
+          p->print(p, ")");
+     } else {
+          p->print(p, "rank-minfty");
+     }
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/f03-wrap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/f03-wrap.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,284 @@
+/* Generated automatically.  DO NOT EDIT! */
+
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+
+FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN void XM(gather_wisdom_f03)(MPI_Fint f_comm_);
+FFTW_EXTERN void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_);
+
+ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many_transposed)(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many)(rnk,n,howmany,block0,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_transposed)(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size)(rnk,n,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many_1d)(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
+}
+
+ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_1d)(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
+}
+
+ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_2d)(n0,n1,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_2d_transposed)(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_3d)(n0,n1,n2,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_3d_transposed)(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_transpose)(n0,n1,howmany,block0,block1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_transpose)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft)(rnk,n,howmany,block,tblock,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft)(rnk,n,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_1d)(n0,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_2d)(n0,n1,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_3d)(n0,n1,n2,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_r2r)(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags);
+}
+
+X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r)(rnk,n,in,out,comm,kind,flags);
+}
+
+X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r_2d)(n0,n1,in,out,comm,kind0,kind1,flags);
+}
+
+X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r_3d)(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags);
+}
+
+X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft_r2c)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c)(rnk,n,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c_2d)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c_3d)(n0,n1,n2,in,out,comm,flags);
+}
+
+X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft_c2r)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r)(rnk,n,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r_2d)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r_3d)(n0,n1,n2,in,out,comm,flags);
+}
+
+void XM(gather_wisdom_f03)(MPI_Fint f_comm_)
+{
+     MPI_Comm comm_;
+
+     comm_ = MPI_Comm_f2c(f_comm_);
+     XM(gather_wisdom)(comm_);
+}
+
+void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_)
+{
+     MPI_Comm comm_;
+
+     comm_ = MPI_Comm_f2c(f_comm_);
+     XM(broadcast_wisdom)(comm_);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/f03-wrap.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/f03-wrap.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22 @@
+#! /bin/sh
+
+# Script to generate Fortran 2003 wrappers for FFTW's MPI functions.  This
+# is necessary because MPI provides no way to deal with C MPI_Comm handles
+# from Fortran (where MPI_Comm == integer), but does provide a way to
+# deal with Fortran MPI_Comm handles from C (via MPI_Comm_f2c).  So,
+# every FFTW function that takes an MPI_Comm argument needs a wrapper
+# function that takes a Fortran integer and converts it to MPI_Comm.
+
+echo "/* Generated automatically.  DO NOT EDIT! */"
+echo
+
+echo "#include \"fftw3-mpi.h\""
+echo "#include \"ifftw-mpi.h\""
+echo
+
+# Declare prototypes using FFTW_EXTERN, important for Windows DLLs
+grep -v 'mpi.h' fftw3-mpi.h | gcc -E - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl | grep "MPI_Fint" | sed 's/^/FFTW_EXTERN /;s/$/;/'
+
+grep -v 'mpi.h' fftw3-mpi.h | gcc -E - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/f03api.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/f03api.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+#! /bin/sh
+
+# Script to generate Fortran 2003 interface declarations for FFTW's MPI
+# interface from the fftw3-mpi.h header file.
+
+# This is designed so that the Fortran caller can do:
+#   use, intrinsic :: iso_c_binding
+#   implicit none
+#   include 'fftw3-mpi.f03'
+# and then call the C FFTW MPI functions directly, with type checking.
+#
+# One caveat: because there is no standard way to conver MPI_Comm objects
+# from Fortran (= integer) to C (= opaque type), the Fortran interface
+# technically calls C wrapper functions (also auto-generated) which
+# call MPI_Comm_f2c to convert the communicators as needed.
+
+echo "! Generated automatically.  DO NOT EDIT!"
+echo
+
+echo "  include 'fftw3.f03'"
+echo
+
+# Extract constants
+perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n  integer\(C_INTPTR_T\), parameter :: \1 = \2\n/g' < fftw3-mpi.h | grep 'integer(C_INTPTR_T)'
+perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n  integer\(C_INT\), parameter :: $1 = ",$2 << $3,"\n"; }' < fftw3-mpi.h | grep 'integer(C_INT)'
+
+# Extract function declarations
+for p in $*; do
+    if test "$p" = "d"; then p=""; fi
+
+    echo
+    cat <<EOF
+  type, bind(C) :: fftw${p}_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftw${p}_mpi_ddim
+EOF
+
+    echo
+    echo "  interface"
+    grep -v 'mpi.h' fftw3-mpi.h | gcc -D__GNUC__=5 -D__i386__ -E - |grep "fftw${p}_mpi_init" |tr ';' '\n' | perl ../api/genf03.pl
+    echo "  end interface"
+
+done
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/fftw3-mpi.f03.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/fftw3-mpi.f03.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,810 @@
+! Generated automatically.  DO NOT EDIT!
+
+  include 'fftw3.f03'
+
+  integer(C_INTPTR_T), parameter :: FFTW_MPI_DEFAULT_BLOCK = 0
+  integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_IN = 134217728
+  integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_OUT = 268435456
+  integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_IN = 536870912
+  integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_OUT = 1073741824
+
+  type, bind(C) :: fftw_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftw_mpi_ddim
+
+  interface
+    subroutine fftw_mpi_init() bind(C, name='fftw_mpi_init')
+      import
+    end subroutine fftw_mpi_init
+    
+    subroutine fftw_mpi_cleanup() bind(C, name='fftw_mpi_cleanup')
+      import
+    end subroutine fftw_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                     local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftw_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                             local_o_start) bind(C, name='fftw_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftw_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftw_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftw_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftw_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftw_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_transpose
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft
+    
+    type(C_PTR) function fftw_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft
+    
+    type(C_PTR) function fftw_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftw_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftw_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftw_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r
+    
+    type(C_PTR) function fftw_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftw_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftw_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) bind(C, name='fftw_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r_3d
+    
+    subroutine fftw_mpi_gather_wisdom(comm_) bind(C, name='fftw_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftw_mpi_gather_wisdom
+    
+    subroutine fftw_mpi_broadcast_wisdom(comm_) bind(C, name='fftw_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftw_mpi_broadcast_wisdom
+    
+    subroutine fftw_mpi_execute_dft(p,in,out) bind(C, name='fftw_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft
+    
+    subroutine fftw_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftw_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft_r2c
+    
+    subroutine fftw_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftw_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft_c2r
+    
+    subroutine fftw_mpi_execute_r2r(p,in,out) bind(C, name='fftw_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_r2r
+    
+  end interface
+
+  type, bind(C) :: fftwf_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftwf_mpi_ddim
+
+  interface
+    subroutine fftwf_mpi_init() bind(C, name='fftwf_mpi_init')
+      import
+    end subroutine fftwf_mpi_init
+    
+    subroutine fftwf_mpi_cleanup() bind(C, name='fftwf_mpi_cleanup')
+      import
+    end subroutine fftwf_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                      local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwf_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                              local_o_start) bind(C, name='fftwf_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwf_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftwf_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwf_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftwf_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftwf_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_transpose
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft
+    
+    type(C_PTR) function fftwf_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwf_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwf_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
+                         bind(C, name='fftwf_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r_3d
+    
+    subroutine fftwf_mpi_gather_wisdom(comm_) bind(C, name='fftwf_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwf_mpi_gather_wisdom
+    
+    subroutine fftwf_mpi_broadcast_wisdom(comm_) bind(C, name='fftwf_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwf_mpi_broadcast_wisdom
+    
+    subroutine fftwf_mpi_execute_dft(p,in,out) bind(C, name='fftwf_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft
+    
+    subroutine fftwf_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwf_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft_r2c
+    
+    subroutine fftwf_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwf_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft_c2r
+    
+    subroutine fftwf_mpi_execute_r2r(p,in,out) bind(C, name='fftwf_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_r2r
+    
+  end interface
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/fftw3-mpi.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/fftw3-mpi.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_MPI_H
+#define FFTW3_MPI_H
+
+#include "fftw3.h"
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+struct fftw_mpi_ddim_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t ib;                    /* input block */
+     ptrdiff_t ob;                    /* output block */
+};
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  XM: name-mangling macro (MPI)
+  X: name-mangling macro (serial)
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_MPI_DEFINE_API(XM, X, R, C)			\
+								\
+typedef struct fftw_mpi_ddim_do_not_use_me XM(ddim);		\
+								\
+FFTW_EXTERN void XM(init)(void);				\
+FFTW_EXTERN void XM(cleanup)(void);				\
+								\
+FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed)		\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm,	\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_many)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block0, MPI_Comm comm,				\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_transposed)			\
+     (int rnk, const ptrdiff_t *n, MPI_Comm comm,		\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size)				\
+     (int rnk, const ptrdiff_t *n, MPI_Comm comm,		\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_many_1d)(			\
+     ptrdiff_t n0, ptrdiff_t howmany,				\
+     MPI_Comm comm, int sign, unsigned flags,			\
+     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,		\
+     ptrdiff_t *local_no, ptrdiff_t *local_o_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_1d)(			\
+     ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags,	\
+     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,		\
+     ptrdiff_t *local_no, ptrdiff_t *local_o_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_2d)(			\
+     ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,			\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed)(		\
+     ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,			\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_3d)(			\
+     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,	\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed)(		\
+     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,	\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_transpose)			\
+     (ptrdiff_t n0, ptrdiff_t n1,				\
+      ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1,	\
+      R *in, R *out, MPI_Comm comm, unsigned flags);		\
+FFTW_EXTERN X(plan) XM(plan_transpose)				\
+     (ptrdiff_t n0, ptrdiff_t n1,				\
+      R *in, R *out, MPI_Comm comm, unsigned flags);		\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft)				\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block, ptrdiff_t tblock, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft)				\
+     (int rnk, const ptrdiff_t *n, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_1d)				\
+     (ptrdiff_t n0, C *in, C *out,				\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, C *out,	\
+      MPI_Comm comm, int sign, unsigned flags);			\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_r2r)				\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, R *in, R *out,	\
+      MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r)				\
+     (int rnk, const ptrdiff_t *n, R *in, R *out,		\
+      MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, R *in, R *out, MPI_Comm comm,	\
+      X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,			\
+      R *in, R *out, MPI_Comm comm, X(r2r_kind) kind0,		\
+      X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);	\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft_r2c)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, R *in, C *out,	\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c)				\
+     (int rnk, const ptrdiff_t *n, R *in, C *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, R *in, C *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R *in, C *out,	\
+      MPI_Comm comm, unsigned flags);				\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft_c2r)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, C *in, R *out,	\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r)				\
+     (int rnk, const ptrdiff_t *n, C *in, R *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, C *in, R *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, R *out,	\
+      MPI_Comm comm, unsigned flags);				\
+								\
+FFTW_EXTERN void XM(gather_wisdom)(MPI_Comm comm_);		\
+FFTW_EXTERN void XM(broadcast_wisdom)(MPI_Comm comm_);          \
+								\
+FFTW_EXTERN void XM(execute_dft)(X(plan) p, C *in, C *out);	\
+FFTW_EXTERN void XM(execute_dft_r2c)(X(plan) p, R *in, C *out);	\
+FFTW_EXTERN void XM(execute_dft_c2r)(X(plan) p, C *in, R *out);	\
+FFTW_EXTERN void XM(execute_r2r)(X(plan) p, R *in, R *out); 
+
+
+
+/* end of FFTW_MPI_DEFINE_API macro */
+
+#define FFTW_MPI_MANGLE_DOUBLE(name) FFTW_MANGLE_DOUBLE(FFTW_CONCAT(mpi_,name))
+#define FFTW_MPI_MANGLE_FLOAT(name) FFTW_MANGLE_FLOAT(FFTW_CONCAT(mpi_,name))
+#define FFTW_MPI_MANGLE_LONG_DOUBLE(name) FFTW_MANGLE_LONG_DOUBLE(FFTW_CONCAT(mpi_,name))
+
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_DOUBLE, FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_FLOAT, FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_LONG_DOUBLE, FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+#define FFTW_MPI_DEFAULT_BLOCK (0)
+
+/* MPI-specific flags */
+#define FFTW_MPI_SCRAMBLED_IN (1U << 27)
+#define FFTW_MPI_SCRAMBLED_OUT (1U << 28)
+#define FFTW_MPI_TRANSPOSED_IN (1U << 29)
+#define FFTW_MPI_TRANSPOSED_OUT (1U << 30)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_MPI_H */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/fftw3l-mpi.f03.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/fftw3l-mpi.f03.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,405 @@
+! Generated automatically.  DO NOT EDIT!
+
+  include 'fftw3l.f03'
+
+
+  type, bind(C) :: fftwl_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftwl_mpi_ddim
+
+  interface
+    subroutine fftwl_mpi_init() bind(C, name='fftwl_mpi_init')
+      import
+    end subroutine fftwl_mpi_init
+    
+    subroutine fftwl_mpi_cleanup() bind(C, name='fftwl_mpi_cleanup')
+      import
+    end subroutine fftwl_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                      local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwl_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                              local_o_start) bind(C, name='fftwl_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwl_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftwl_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwl_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftwl_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftwl_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_transpose
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft
+    
+    type(C_PTR) function fftwl_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwl_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwl_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
+                         bind(C, name='fftwl_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r_3d
+    
+    subroutine fftwl_mpi_gather_wisdom(comm_) bind(C, name='fftwl_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwl_mpi_gather_wisdom
+    
+    subroutine fftwl_mpi_broadcast_wisdom(comm_) bind(C, name='fftwl_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwl_mpi_broadcast_wisdom
+    
+    subroutine fftwl_mpi_execute_dft(p,in,out) bind(C, name='fftwl_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft
+    
+    subroutine fftwl_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwl_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft_r2c
+    
+    subroutine fftwl_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwl_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft_c2r
+    
+    subroutine fftwl_mpi_execute_r2r(p,in,out) bind(C, name='fftwl_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_r2r
+    
+  end interface
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/genf03-wrap.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/genf03-wrap.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+# Generate Fortran 2003 wrappers (which translate MPI_Comm from f2c) from
+# function declarations of the form (one per line):
+#     extern <type> fftw_mpi_<name>(...args...)
+#     extern <type> fftw_mpi_<name>(...args...)
+#     ...
+# with no line breaks within a given function.  (It's too much work to
+# write a general parser, since we just have to handle FFTW's header files.)
+# Each declaration has at least one MPI_Comm argument.
+
+sub canonicalize_type {
+    my($type);
+    ($type) = @_;
+    $type =~ s/ +/ /g;
+    $type =~ s/^ //;
+    $type =~ s/ $//;
+    $type =~ s/([^\* ])\*/$1 \*/g;
+    $type =~ s/double/R/;
+    $type =~ s/fftw_([A-Za-z0-9_]+)/X(\1)/;
+    return $type;
+}
+
+while (<>) {
+    next if /^ *$/;
+    if (/^ *extern +([a-zA-Z_0-9 ]+[ \*]) *fftw_mpi_([a-zA-Z_0-9]+) *\((.*)\) *$/) {
+	$ret = &canonicalize_type($1);
+	$name = $2;
+
+	$args = $3;
+
+	
+	print "\n$ret XM(${name}_f03)(";
+
+	$comma = "";
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+	    print $comma;
+	    if ($argtype eq "MPI_Comm") {
+		print "MPI_Fint f_$argname";
+	    }
+	    else {
+		print "$argtype $argname";
+	    }
+	    $comma = ", ";
+        }
+	print ")\n{\n";
+
+	print "     MPI_Comm ";
+	$comma = "";
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+	    if ($argtype eq "MPI_Comm") {
+		print "$comma$argname";
+		$comma = ", ";
+	    }
+        }
+	print ";\n\n";
+
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+            if ($argtype eq "MPI_Comm") {
+                print "     $argname = MPI_Comm_f2c(f_$argname);\n";
+            }
+        }
+
+	$argnames = $args;
+	$argnames =~ s/([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) */$2/g;
+	print "     ";
+	print "return " if ($ret ne "void");
+	print "XM($name)($argnames);\n}\n";
+    }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/ifftw-mpi.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/ifftw-mpi.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* FFTW-MPI internal header file */
+#ifndef __IFFTW_MPI_H__
+#define __IFFTW_MPI_H__
+
+#include "ifftw.h"
+#include "rdft.h"
+
+#include <mpi.h>
+
+/* mpi problem flags: problem-dependent meaning, but in general
+   SCRAMBLED means some reordering *within* the dimensions, while
+   TRANSPOSED means some reordering *of* the dimensions */
+#define SCRAMBLED_IN (1 << 0)
+#define SCRAMBLED_OUT (1 << 1)
+#define TRANSPOSED_IN (1 << 2)
+#define TRANSPOSED_OUT (1 << 3)
+#define RANK1_BIGVEC_ONLY (1 << 4) /* for rank=1, allow only bigvec solver */
+
+#define ONLY_SCRAMBLEDP(flags) (!((flags) & ~(SCRAMBLED_IN|SCRAMBLED_OUT)))
+#define ONLY_TRANSPOSEDP(flags) (!((flags) & ~(TRANSPOSED_IN|TRANSPOSED_OUT)))
+
+#if defined(FFTW_SINGLE)
+#  define FFTW_MPI_TYPE MPI_FLOAT
+#elif defined(FFTW_LDOUBLE)
+#  define FFTW_MPI_TYPE MPI_LONG_DOUBLE
+#elif defined(FFTW_QUAD)
+#  error MPI quad-precision type is unknown
+#else
+#  define FFTW_MPI_TYPE MPI_DOUBLE
+#endif
+
+/* all fftw-mpi identifiers start with fftw_mpi (or fftwf_mpi etc.) */
+#define XM(name) X(CONCAT(mpi_, name))
+
+/***********************************************************************/
+/* block distributions */
+
+/* a distributed dimension of length n with input and output block
+   sizes ib and ob, respectively. */
+typedef enum { IB = 0, OB } block_kind;
+typedef struct {
+     INT n;
+     INT b[2]; /* b[IB], b[OB] */
+} ddim;
+
+/* Loop over k in {IB, OB}.  Note: need explicit casts for C++. */
+#define FORALL_BLOCK_KIND(k) for (k = IB; k <= OB; k = (block_kind) (((int) k) + 1))
+
+/* unlike tensors in the serial FFTW, the ordering of the dtensor
+   dimensions matters - both the array and the block layout are
+   row-major order. */
+typedef struct {
+     int rnk;
+#if defined(STRUCT_HACK_KR)
+     ddim dims[1];
+#elif defined(STRUCT_HACK_C99)
+     ddim dims[];
+#else
+     ddim *dims;
+#endif
+} dtensor;
+
+
+/* dtensor.c: */
+dtensor *XM(mkdtensor)(int rnk);
+void XM(dtensor_destroy)(dtensor *sz);
+dtensor *XM(dtensor_copy)(const dtensor *sz);
+dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress);
+int XM(dtensor_validp)(const dtensor *sz);
+void XM(dtensor_md5)(md5 *p, const dtensor *t);
+void XM(dtensor_print)(const dtensor *t, printer *p);
+
+/* block.c: */
+
+/* for a single distributed dimension: */
+INT XM(num_blocks)(INT n, INT block);
+int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm);
+INT XM(default_block)(INT n, int n_pes);
+INT XM(block)(INT n, INT block, int which_block);
+
+/* for multiple distributed dimensions: */
+INT XM(num_blocks_total)(const dtensor *sz, block_kind k);
+int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe);
+void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, 
+		     INT *coords);
+INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe);
+int XM(is_local_after)(int dim, const dtensor *sz, block_kind k);
+int XM(is_local)(const dtensor *sz, block_kind k);
+int XM(is_block1d)(const dtensor *sz, block_kind k);
+
+/* choose-radix.c */
+INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
+                     INT rblock[2], INT mblock[2]);
+
+/***********************************************************************/
+/* any_true.c */
+int XM(any_true)(int condition, MPI_Comm comm);
+int XM(md5_equal)(md5 m, MPI_Comm comm);
+
+/* conf.c */
+void XM(conf_standard)(planner *p);
+
+/***********************************************************************/
+/* rearrange.c */
+
+/* Different ways to rearrange the vector dimension vn during transposition,
+   reflecting different tradeoffs between ease of transposition and
+   contiguity during the subsequent DFTs.
+
+   TODO: can we pare this down to CONTIG and DISCONTIG, at least
+   in MEASURE mode?  SQUARE_MIDDLE is also used for 1d destroy-input DFTs. */
+typedef enum {
+     CONTIG = 0, /* vn x 1: make subsequent DFTs contiguous */
+     DISCONTIG, /* P x (vn/P) for P processes */
+     SQUARE_BEFORE, /* try to get square transpose at beginning */
+     SQUARE_MIDDLE, /* try to get square transpose in the middle */
+     SQUARE_AFTER /* try to get square transpose at end */
+} rearrangement;
+
+/* skipping SQUARE_AFTER since it doesn't seem to offer any advantage
+   over SQUARE_BEFORE */
+#define FORALL_REARRANGE(rearrange) for (rearrange = CONTIG; rearrange <= SQUARE_MIDDLE; rearrange = (rearrangement) (((int) rearrange) + 1))
+
+int XM(rearrange_applicable)(rearrangement rearrange, 
+			     ddim dim0, INT vn, int n_pes);
+INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes);
+
+/***********************************************************************/
+
+#endif /* __IFFTW_MPI_H__ */
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/mpi-bench.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/mpi-bench.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,844 @@
+/**************************************************************************/
+/* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
+   It is probably NOT a good place to learn FFTW usage, since it has a
+   lot of added complexity in order to exercise and test the full API,
+   etcetera.  We suggest reading the manual. */
+/**************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "fftw3-mpi.h"
+#include "fftw-bench.h"
+
+#if defined(BENCHFFT_SINGLE)
+#  define BENCH_MPI_TYPE MPI_FLOAT
+#elif defined(BENCHFFT_LDOUBLE)
+#  define BENCH_MPI_TYPE MPI_LONG_DOUBLE
+#elif defined(BENCHFFT_QUAD)
+#  error MPI quad-precision type is unknown
+#else
+#  define BENCH_MPI_TYPE MPI_DOUBLE
+#endif
+
+#if SIZEOF_PTRDIFF_T == SIZEOF_INT
+#  define FFTW_MPI_PTRDIFF_T MPI_INT
+#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG
+#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
+#else
+#  error MPI type for ptrdiff_t is unknown
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG
+#endif
+
+static const char *mkversion(void) { return FFTW(version); }
+static const char *mkcc(void) { return FFTW(cc); }
+static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
+static const char *mknproc(void) {
+     static char buf[32];
+     int ncpus;
+     MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
+#ifdef HAVE_SNPRINTF
+     snprintf(buf, 32, "%d", ncpus);
+#else
+     sprintf(buf, "%d", ncpus);
+#endif
+     return buf;
+}
+
+BEGIN_BENCH_DOC
+BENCH_DOC("name", "fftw3_mpi")
+BENCH_DOCF("version", mkversion)
+BENCH_DOCF("cc", mkcc)
+BENCH_DOCF("codelet-optim", mkcodelet_optim)
+BENCH_DOCF("nproc", mknproc)
+END_BENCH_DOC 
+
+static int n_pes = 1, my_pe = 0;
+
+/* global variables describing the shape of the data and its distribution */
+static int rnk;
+static ptrdiff_t vn, iNtot, oNtot;
+static ptrdiff_t *local_ni=0, *local_starti=0;
+static ptrdiff_t *local_no=0, *local_starto=0;
+static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
+static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
+static ptrdiff_t *istrides = 0, *ostrides = 0;
+static ptrdiff_t *total_ni=0, *total_no=0;
+static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
+static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
+
+static bench_real *local_in = 0, *local_out = 0;
+static bench_real *all_local_in = 0, *all_local_out = 0;
+static int all_local_in_alloc = 0, all_local_out_alloc = 0;
+static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
+
+static void alloc_rnk(int rnk_) {
+     rnk = rnk_;
+     bench_free(local_ni);
+     if (rnk == 0)
+	  local_ni = 0;
+     else
+	  local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
+						* (8 + n_pes * 4));
+
+     local_starti = local_ni + rnk;
+     local_no = local_ni + 2 * rnk;
+     local_starto = local_ni + 3 * rnk;
+     istrides = local_ni + 4 * rnk;
+     ostrides = local_ni + 5 * rnk;
+     total_ni = local_ni + 6 * rnk;
+     total_no = local_ni + 7 * rnk;
+     all_local_ni = local_ni + 8 * rnk;
+     all_local_starti = local_ni + (8 + n_pes) * rnk;
+     all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
+     all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
+}
+
+static void setup_gather_scatter(void)
+{
+     int i, j;
+     ptrdiff_t off;
+
+     MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+     MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+
+     MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+     MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+
+     off = 0;
+     for (i = 0; i < n_pes; ++i) {
+	  ptrdiff_t N = vn;
+	  for (j = 0; j < rnk; ++j)
+	       N *= all_local_ni[i * rnk + j];
+	  isend_cnt[i] = N;
+	  isend_off[i] = off;
+	  off += N;
+     }
+     iNtot = off;
+     all_local_in_alloc = 1;
+
+     istrides[rnk - 1] = vn;
+     for (j = rnk - 2; j >= 0; --j)
+	  istrides[j] = total_ni[j + 1] * istrides[j + 1];
+
+     off = 0;
+     for (i = 0; i < n_pes; ++i) {
+	  ptrdiff_t N = vn;
+	  for (j = 0; j < rnk; ++j)
+	       N *= all_local_no[i * rnk + j];
+	  orecv_cnt[i] = N;
+	  orecv_off[i] = off;
+	  off += N;
+     }
+     oNtot = off;
+     all_local_out_alloc = 1;
+
+     ostrides[rnk - 1] = vn;
+     for (j = rnk - 2; j >= 0; --j)
+	  ostrides[j] = total_no[j + 1] * ostrides[j + 1];
+}
+
+static void copy_block_out(const bench_real *in,
+			   int rnk, ptrdiff_t *n, ptrdiff_t *start, 
+			   ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
+			   bench_real *out)
+{
+     ptrdiff_t i;
+     if (rnk == 0) { 
+	  for (i = 0; i < vn; ++i)
+	       out[i] = in[i];
+     }
+     else if (rnk == 1) { /* this case is just an optimization */
+	  ptrdiff_t j;
+	  out += start[0] * os[0];
+	  for (j = 0; j < n[0]; ++j) {
+	       for (i = 0; i < vn; ++i)
+		    out[i] = in[i];
+	       in += is;
+	       out += os[0];
+	  }
+     }
+     else {
+	  /* we should do n[0] for locality, but this way is simpler to code */
+	  for (i = 0; i < n[rnk - 1]; ++i) 
+	       copy_block_out(in + i * is,
+			      rnk - 1, n, start, is * n[rnk - 1], os, vn,
+			      out + (start[rnk - 1] + i) * os[rnk - 1]);
+     }
+}
+
+static void copy_block_in(bench_real *in,
+			  int rnk, ptrdiff_t *n, ptrdiff_t *start, 
+			  ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
+			  const bench_real *out)
+{
+     ptrdiff_t i;
+     if (rnk == 0) { 
+	  for (i = 0; i < vn; ++i)
+	       in[i] = out[i];
+     }
+     else if (rnk == 1) { /* this case is just an optimization */
+	  ptrdiff_t j;
+	  out += start[0] * os[0];
+	  for (j = 0; j < n[0]; ++j) {
+	       for (i = 0; i < vn; ++i)
+		    in[i] = out[i];
+	       in += is;
+	       out += os[0];
+	  }
+     }
+     else {
+	  /* we should do n[0] for locality, but this way is simpler to code */
+	  for (i = 0; i < n[rnk - 1]; ++i) 
+	       copy_block_in(in + i * is,
+			     rnk - 1, n, start, is * n[rnk - 1], os, vn,
+			     out + (start[rnk - 1] + i) * os[rnk - 1]);
+     }
+}
+
+static void do_scatter_in(bench_real *in)
+{
+     bench_real *ali;
+     int i;
+     if (all_local_in_alloc) {
+          bench_free(all_local_in);
+	  all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
+	  all_local_in_alloc = 0;
+     }
+     ali = all_local_in;
+     for (i = 0; i < n_pes; ++i) {
+	  copy_block_in(ali,
+			rnk, all_local_ni + i * rnk, 
+			all_local_starti + i * rnk,
+			vn, istrides, vn,
+			in);
+	  ali += isend_cnt[i];
+     }
+     MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
+		  local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
+		  0, MPI_COMM_WORLD);
+}
+
+static void do_gather_out(bench_real *out)
+{
+     bench_real *alo;
+     int i;
+
+     if (all_local_out_alloc) {
+          bench_free(all_local_out);
+	  all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
+	  all_local_out_alloc = 0;
+     }
+     MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
+		 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
+		 0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
+     alo = all_local_out;
+     for (i = 0; i < n_pes; ++i) {
+	  copy_block_out(alo,
+			 rnk, all_local_no + i * rnk, 
+			 all_local_starto + i * rnk,
+			 vn, ostrides, vn,
+			 out);
+	  alo += orecv_cnt[i];
+     }
+}
+
+static void alloc_local(ptrdiff_t nreal, int inplace)
+{
+     bench_free(local_in);
+     if (local_out != local_in) bench_free(local_out);
+     local_in = local_out = 0;
+     if (nreal > 0) {
+	  ptrdiff_t i;
+	  local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
+	  if (inplace)
+	       local_out = local_in;
+	  else
+	       local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
+	  for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
+     }
+}
+
+void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
+{
+     UNUSED(p);
+     do_scatter_in(ri);
+     if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
+}
+
+void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
+{
+     UNUSED(p);
+     if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
+     do_gather_out(ro);
+}
+
+void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(ii);
+     after_problem_rcopy_from(p, ri);
+}
+
+void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(io);
+     after_problem_rcopy_to(p, ro);
+}
+
+void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(ii);
+     after_problem_rcopy_from(p, ri);
+}
+
+void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(io);
+     after_problem_rcopy_to(p, ro);
+}
+
+static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny, 
+					 ptrdiff_t vn, 
+					 bench_real *in, bench_real *out)
+{
+     FFTW(iodim64) hdims[3];
+     FFTW(r2r_kind) k[3];
+     FFTW(plan) pln;
+
+     hdims[0].n = nx;
+     hdims[0].is = ny * vn;
+     hdims[0].os = vn;
+     hdims[1].n = ny;
+     hdims[1].is = vn;
+     hdims[1].os = nx * vn;
+     hdims[2].n = vn;
+     hdims[2].is = 1;
+     hdims[2].os = 1;
+     k[0] = k[1] = k[2] = FFTW_R2HC;
+     pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
+     BENCH_ASSERT(pln != 0);
+     return pln;
+}
+
+static int tensor_rowmajor_transposedp(bench_tensor *t)
+{
+     bench_iodim *d;
+     int i;
+
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+     if (t->rnk < 2)
+	  return 0;
+
+     d = t->dims;
+     if (d[0].is != d[1].is * d[1].n
+	 || d[0].os != d[1].is
+	 || d[1].os != d[0].os * d[0].n)
+	  return 0;
+     if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
+	  return 0;
+     for (i = 2; i + 1 < t->rnk; ++i) {
+          d = t->dims + i;
+          if (d[0].is != d[1].is * d[1].n
+	      || d[0].os != d[1].os * d[1].n)
+               return 0;
+     }
+
+     if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
+	  return 0;
+     return 1;
+}
+
+static int tensor_contiguousp(bench_tensor *t, int s)
+{
+     return (t->dims[t->rnk-1].is == s
+	     && ((tensor_rowmajorp(t) && 
+		  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
+		 || tensor_rowmajor_transposedp(t)));
+}
+
+static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 1
+	 || p->split
+	 || !tensor_contiguousp(p->sz, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     if (rnk > 1) {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     else if (rnk == 1) {
+	  ntot = FFTW(mpi_local_size_many_1d)
+	       (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
+		local_ni, local_starti, local_no, local_starto);
+     }
+     alloc_local(ntot * 2, p->in == p->out);
+
+     pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn, 
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   (FFTW(complex) *) local_in, 
+				   (FFTW(complex) *) local_out,
+				   MPI_COMM_WORLD, p->sign, flags);
+
+     vn *= 2;
+
+     if (rnk > 1) {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    p->sz->dims[0].n, local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], p->sz->dims[0].n, vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
+{
+     return (t->dims[t->rnk-1].is == s
+	     && ((tensor_real_rowmajorp(t, sign, 1) && 
+		  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
+}
+
+static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 2
+	 || p->split
+	 || !tensor_real_contiguousp(p->sz, p->sign, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1] 
+	  = p->sz->dims[rnk-1].n / 2 + 1;
+     {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     alloc_local(ntot * 2, p->in == p->out);
+
+     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
+     if (p->sign < 0)
+	  pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn, 
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    local_in, 
+					    (FFTW(complex) *) local_out,
+					    MPI_COMM_WORLD, flags);
+     else
+	  pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn, 
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    (FFTW(complex) *) local_in, 
+					    local_out,
+					    MPI_COMM_WORLD, flags);
+
+     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
+     vn *= 2;
+
+     {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    total_ni[0], local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], total_ni[0], vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
+{
+     ptrdiff_t ntot, nx, ny;
+     int ix=0, iy=1, i;
+     const bench_iodim *d = p->vecsz->dims;
+     FFTW(plan) pln;
+
+     if (p->vecsz->rnk == 3) {
+	  for (i = 0; i < 3; ++i)
+	       if (d[i].is == 1 && d[i].os == 1) {
+		    vn = d[i].n;
+		    ix = (i + 1) % 3;
+		    iy = (i + 2) % 3;
+		    break;
+	       }
+	  if (i == 3) return 0;
+     }
+     else {
+	  vn = 1;
+	  ix = 0;
+	  iy = 1;
+     }
+
+     if (d[ix].is == d[iy].n * vn && d[ix].os == vn
+	 && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
+	  nx = d[ix].n;
+	  ny = d[iy].n;
+     }
+     else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
+	      && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
+	  nx = d[iy].n;
+	  ny = d[ix].n;
+     }
+     else
+	  return 0;
+
+     alloc_rnk(2);
+     ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
+						    &local_ni[0], 
+						    &local_starti[0],
+						    &local_no[0], 
+						    &local_starto[0]);
+     local_ni[1] = ny;
+     local_starti[1] = 0;
+     local_no[1] = nx;
+     local_starto[1] = 0;
+     total_ni[0] = nx; total_ni[1] = ny;
+     total_no[1] = nx; total_no[0] = ny;
+     alloc_local(ntot, p->in == p->out);
+
+     pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
+					 FFTW_MPI_DEFAULT_BLOCK,
+					 FFTW_MPI_DEFAULT_BLOCK,
+					 local_in, local_out,
+					 MPI_COMM_WORLD, flags);
+     
+     if (flags & FFTW_MPI_TRANSPOSED_IN)
+	  plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
+						    local_in, local_in);
+     if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	  plan_unscramble_out = mkplan_transpose_local
+	       (nx, local_no[0], vn, local_out, local_out);
+     
+#if 0
+     if (pln && vn == 1) {
+	  int i, j;
+	  bench_real *ri = (bench_real *) p->in;
+	  bench_real *ro = (bench_real *) p->out;
+	  if (!ri || !ro) return pln;
+	  setup_gather_scatter();
+	  for (i = 0; i < nx * ny; ++i)
+	       ri[i] = i;
+	  after_problem_rcopy_from(p, ri);
+	  FFTW(execute)(pln);
+	  after_problem_rcopy_to(p, ro);
+	  if (my_pe == 0) {
+	       for (i = 0; i < nx; ++i) {
+		    for (j = 0; j < ny; ++j)
+			 printf("  %3g", ro[j * nx + i]);
+		    printf("\n");
+	       }
+	  }
+     }
+#endif
+
+     return pln;
+}
+
+static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+     FFTW(r2r_kind) *k;
+
+     if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
+	 && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
+	  return mkplan_transpose(p, flags);
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 1
+	 || p->split
+	 || !tensor_contiguousp(p->sz, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     if (rnk > 1) {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     else if (rnk == 1) {
+	  ntot = FFTW(mpi_local_size_many_1d)
+	       (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
+		local_ni, local_starti, local_no, local_starto);
+     }
+     alloc_local(ntot, p->in == p->out);
+
+     k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
+     for (i = 0; i < p->sz->rnk; ++i)
+	  switch (p->k[i]) {
+	      case R2R_R2HC: k[i] = FFTW_R2HC; break;
+	      case R2R_HC2R: k[i] = FFTW_HC2R; break;
+	      case R2R_DHT: k[i] = FFTW_DHT; break;
+	      case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
+	      case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
+	      case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
+	      case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
+	      case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
+	      case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
+	      case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
+	      case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
+	      default: BENCH_ASSERT(0);
+	  }
+
+     pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn, 
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   local_in, local_out,
+				   MPI_COMM_WORLD, k, flags);
+     bench_free(k);
+
+     if (rnk > 1) {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    p->sz->dims[0].n, local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], p->sz->dims[0].n, vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+FFTW(plan) mkplan(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
+     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
+     if (p->scrambled_in) {
+	  if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) 
+	       flags |= FFTW_MPI_SCRAMBLED_IN;
+	  else
+	       flags |= FFTW_MPI_TRANSPOSED_IN;
+     }
+     if (p->scrambled_out) {
+	  if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) 
+	       flags |= FFTW_MPI_SCRAMBLED_OUT;
+	  else
+	       flags |= FFTW_MPI_TRANSPOSED_OUT;
+     }
+     switch (p->kind) {
+         case PROBLEM_COMPLEX: 
+	      pln =mkplan_complex(p, flags);
+	      break;
+         case PROBLEM_REAL: 
+	      pln = mkplan_real(p, flags);
+	      break;
+         case PROBLEM_R2R:
+	      pln = mkplan_r2r(p, flags);
+	      break;
+         default: BENCH_ASSERT(0);
+     }
+     if (pln) setup_gather_scatter();
+     return pln;
+}
+
+void main_init(int *argc, char ***argv)
+{
+#ifdef HAVE_SMP
+# if MPI_VERSION >= 2 /* for MPI_Init_thread */
+     int provided;
+     MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
+     threads_ok = provided >= MPI_THREAD_FUNNELED;
+# else
+     MPI_Init(argc, argv);
+     threads_ok = 0;
+# endif
+#else
+     MPI_Init(argc, argv);
+#endif
+     MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
+     MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
+     if (my_pe != 0) verbose = -999;
+     no_speed_allocation = 1; /* so we can benchmark transforms > memory */
+     always_pad_real = 1; /* out-of-place real transforms are padded */
+     isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
+     isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
+     orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
+     orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
+
+     /* init_threads must be called before any other FFTW function,
+	including mpi_init, because it has to register the threads hooks
+	before the planner is initalized */
+#ifdef HAVE_SMP
+     if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
+#endif
+     FFTW(mpi_init)();
+}
+
+void initial_cleanup(void)
+{
+     alloc_rnk(0);
+     alloc_local(0, 0);
+     bench_free(all_local_in); all_local_in = 0;
+     bench_free(all_local_out); all_local_out = 0;
+     bench_free(isend_off); isend_off = 0;
+     bench_free(isend_cnt); isend_cnt = 0;
+     bench_free(orecv_off); orecv_off = 0;
+     bench_free(orecv_cnt); orecv_cnt = 0;
+     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
+     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
+}
+
+void final_cleanup(void)
+{
+     MPI_Finalize();
+}
+
+void bench_exit(int status)
+{
+     MPI_Abort(MPI_COMM_WORLD, status);
+}
+
+double bench_cost_postprocess(double cost)
+{
+     double cost_max;
+     MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+     return cost_max;
+}
+
+
+int import_wisdom(FILE *f)
+{
+     int success = 1, sall;
+     if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
+     FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
+     MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
+     return sall;
+}
+
+void export_wisdom(FILE *f)
+{
+     FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
+     if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/mpi-dft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/mpi-dft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     int sign; /* FFTW_FORWARD / FFTW_BACKWARD */
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+} problem_mpi_dft;
+
+problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
+			      R *I, R *O, MPI_Comm comm,
+			      int sign, unsigned flags);
+problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
+			     R *I, R *O, MPI_Comm comm,
+			     int sign, unsigned flags);
+
+/* solve.c: */
+void XM(dft_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_dft;
+#define MKPLAN_MPI_DFT(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(dft_serial_applicable)(const problem_mpi_dft *p);
+
+/* various solvers */
+void XM(dft_rank_geq2_register)(planner *p);
+void XM(dft_rank_geq2_transposed_register)(planner *p);
+void XM(dft_serial_register)(planner *p);
+void XM(dft_rank1_bigvec_register)(planner *p);
+void XM(dft_rank1_register)(planner *p);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/mpi-rdft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/mpi-rdft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+
+#if defined(STRUCT_HACK_KR)
+     rdft_kind kind[1];
+#elif defined(STRUCT_HACK_C99)
+     rdft_kind kind[];
+#else
+     rdft_kind *kind;
+#endif
+} problem_mpi_rdft;
+
+problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
+			    R *I, R *O, MPI_Comm comm, 
+			    const rdft_kind *kind, unsigned flags);
+problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
+			      R *I, R *O, MPI_Comm comm, 
+			      const rdft_kind *kind, unsigned flags);
+
+/* solve.c: */
+void XM(rdft_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_rdft;
+#define MKPLAN_MPI_RDFT(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(rdft_serial_applicable)(const problem_mpi_rdft *p);
+
+/* various solvers */
+void XM(rdft_rank_geq2_register)(planner *p);
+void XM(rdft_rank_geq2_transposed_register)(planner *p);
+void XM(rdft_serial_register)(planner *p);
+void XM(rdft_rank1_bigvec_register)(planner *p);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/mpi-rdft2.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/mpi-rdft2.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* r2c and c2r transforms.  The sz dtensor, as usual, gives the size
+   of the "logical" complex array.  For the last dimension N, however,
+   only N/2+1 complex numbers are stored for the complex data.  Moreover,
+   for the real data, the last dimension is *always* padded to a size
+   2*(N/2+1).  (Contrast this with the serial API, where there is only
+   padding for in-place plans.) */
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     rdft_kind kind; /* assert(kind < DHT) */
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+} problem_mpi_rdft2;
+
+problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
+			     R *I, R *O, MPI_Comm comm,
+			     rdft_kind kind, unsigned flags);
+problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
+			       R *I, R *O, MPI_Comm comm,
+			       rdft_kind kind, unsigned flags);
+
+/* solve.c: */
+void XM(rdft2_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_rdft2;
+#define MKPLAN_MPI_RDFT2(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p);
+
+/* various solvers */
+void XM(rdft2_rank_geq2_register)(planner *p);
+void XM(rdft2_rank_geq2_transposed_register)(planner *p);
+void XM(rdft2_serial_register)(planner *p);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/mpi-transpose.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/mpi-transpose.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* tproblem.c: */
+typedef struct {
+     problem super;
+     INT vn; /* vector length (vector stride 1) */
+     INT nx, ny; /* nx x ny transposed to ny x nx */
+     R *I, *O; /* contiguous real arrays (both same size!) */
+
+     unsigned flags; /* TRANSPOSED_IN: input is *locally* transposed
+			TRANSPOSED_OUT: output is *locally* transposed */
+
+     INT block, tblock; /* block size, slab decomposition;
+			   tblock is for transposed blocks on output */
+
+     MPI_Comm comm;
+} problem_mpi_transpose;
+
+problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
+				 R *I, R *O,
+				 INT block, INT tblock,
+				 MPI_Comm comm,
+				 unsigned flags);
+
+/* tsolve.c: */
+void XM(transpose_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_transpose;
+#define MKPLAN_MPI_TRANSPOSE(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+/* transpose-pairwise.c: */
+int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
+			      R *I, R *O, int my_pe,
+			      plan **cld2, plan **cld2rest, plan **cld3,
+			      INT *rest_Ioff, INT *rest_Ooff);
+/* various solvers */
+void XM(transpose_pairwise_register)(planner *p);
+void XM(transpose_alltoall_register)(planner *p);
+void XM(transpose_recurse_register)(planner *p);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(ego->kind);
+#endif
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     int i;
+     X(md5puts)(m, "mpi-dft");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(alignment_of)(p->I));
+	X(md5int)(m, X(alignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     for (i = 0; i < p->sz->rnk; ++i)
+	  X(md5int)(m, p->kind[i]);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
+     int i;
+     p->print(p, "(mpi-rdft %d %d %d ", 
+	      ego->I == ego->O,
+	      X(alignment_of)(ego->I),
+	      X(alignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     for (i = 0; i < ego->sz->rnk; ++i)
+          p->print(p, " %d", (int)ego->kind[i]);
+     p->print(p, " %D %d", ego->vn, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
+     R *I = ego->I;
+     INT i, N;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = ego->vn * XM(total_block)(ego->sz, IB, my_pe);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_RDFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
+			    R *I, R *O,
+			    MPI_Comm comm,
+			    const rdft_kind *kind, unsigned flags)
+{
+     problem_mpi_rdft *ego;
+     int i, rnk = sz->rnk;
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
+     MPI_Comm_size(comm, &n_pes);
+     A(n_pes >= XM(num_blocks_total)(sz, IB)
+       && n_pes >= XM(num_blocks_total)(sz, OB));
+     A(vn >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+					     + sizeof(rdft_kind)
+					     * (rnk > 0 ? rnk - 1 : 0), &padt);
+#elif defined(STRUCT_HACK_C99)
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+					     + sizeof(rdft_kind) * rnk, &padt);
+#else
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
+     ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
+#endif
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 0);
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     for (i = 0; i< ego->sz->rnk; ++i)
+	  ego->kind[i] = kind[i];
+
+     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
+        swapping the first two dimensions (for rnk > 1) */
+     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
+	  rdft_kind k = ego->kind[0];
+	  ddim dim0 = ego->sz->dims[0];
+	  ego->sz->dims[0] = ego->sz->dims[1];
+	  ego->sz->dims[1] = dim0;
+	  ego->kind[0] = ego->kind[1];
+	  ego->kind[1] = k;
+	  flags &= ~TRANSPOSED_IN;
+	  flags ^= TRANSPOSED_OUT;
+     }
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
+			      R *I, R *O, 
+			      MPI_Comm comm,
+			      const rdft_kind *kind, unsigned flags)
+{
+     problem *p = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-rank-geq2-transposed.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-rank-geq2-transposed.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   (Note that we don't have to handle the case where the input is
+   transposed, since this is equivalent to transposed output with the
+   first two dimensions swapped, and is automatically canonicalized as
+   such by rdft-problem.c. */
+
+#include "mpi-rdft.h"
+#include "mpi-transpose.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cld1, *cldt, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cldt;
+     
+     /* RDFT local dimensions */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, O);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* RDFT final local dimension */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, O, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == TRANSPOSED_OUT
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(2, p->sz, OB)
+	     && XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+		 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *I, *O, *I2;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     I2 = I = p->I;
+     O = p->O;
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft_d)(sz,
+						 X(mktensor_2d)(b, is, is,
+								p->vn, 1, 1),
+						 I2, I, p->kind + 1));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     cldt = X(mkplan_d)(plnr,
+			XM(mkproblem_transpose)(
+			     p->sz->dims[0].n, p->sz->dims[1].n, nrest,
+			     I, O,
+			     p->sz->dims[0].b[IB], p->sz->dims[1].b[OB], 
+			     p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     {
+	  INT is = p->sz->dims[0].n * nrest;
+	  INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_rdft_1_d)(X(mktensor_1d)(
+							p->sz->dims[0].n,
+							nrest, nrest),
+						   X(mktensor_2d)(b, is, is,
+								  nrest, 1, 1),
+						   O, O, p->kind[0]));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-rank-geq2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-rank-geq2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-rdft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cld1, *cld2;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2;
+     
+     /* RDFT local dimensions */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, O);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I);
+
+     /* RDFT non-local dimension (via rdft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+		 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *I, *O, *I2;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     I2 = I = p->I;
+     O = p->O;
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft_d)(sz,
+						 X(mktensor_2d)(b, is, is,
+								p->vn, 1, 1),
+						 I2, I, p->kind + 1));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_rdft_d)(sz2, nrest * p->vn,
+						   I, O,
+						   p->comm, p->kind,
+						   RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-rank1-bigvec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-rank1-bigvec.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank == 1 when the vector length vn is >= # processes.
+   In this case, we don't need to use a six-step type algorithm, and can
+   instead transpose the RDFT dimension with the vector dimension to 
+   make the RDFT local. */
+
+#include "mpi-rdft.h"
+#include "mpi-transpose.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+     rearrangement rearrange;
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cldt_before, *cld, *cldt_after;
+     int preserve_input;
+     rearrangement rearrange;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld, *cldt_before, *cldt_after;
+     
+     /* global transpose */
+     cldt_before = (plan_rdft *) ego->cldt_before;
+     cldt_before->apply(ego->cldt_before, I, O);
+     
+     if (ego->preserve_input) I = O;
+	  
+     /* 1d RDFT(s) */
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, O, I);
+     
+     /* global transpose */
+     cldt_after = (plan_rdft *) ego->cldt_after;
+     cldt_after->apply(ego->cldt_after, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+	     && !(p->flags & ~RANK1_BIGVEC_ONLY)
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+
+#if 0 /* don't need this check since no other rank-1 rdft solver */
+	     && (p->vn >= n_pes /* TODO: relax this, using more memory? */
+		 || (p->flags & RANK1_BIGVEC_ONLY))
+#endif
+
+	     && XM(rearrange_applicable)(ego->rearrange,
+					 p->sz->dims[0], p->vn, n_pes)
+
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+                 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt_before, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldt_after, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt_after);
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldt_before);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const char descrip[][16] = { "contig", "discontig", "square-after",
+				  "square-middle", "square-before" };
+     p->print(p, "(mpi-rdft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
+	      descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
+	      ego->cldt_before, ego->cld, ego->cldt_after);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
+     R *I, *O;
+     INT yblock, yb, nx, ny, vn;
+     int my_pe, n_pes;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+     
+     nx = p->sz->dims[0].n;
+     if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
+	  return (plan *) 0;
+     vn = p->vn / ny;
+     A(ny * vn == p->vn);
+
+     yblock = XM(default_block)(ny, n_pes);
+     cldt_before = X(mkplan_d)(plnr,
+			       XM(mkproblem_transpose)(
+				    nx, ny, vn,
+				    I = p->I, O = p->O,
+				    p->sz->dims[0].b[IB], yblock,
+				    p->comm, 0));
+     if (XM(any_true)(!cldt_before, p->comm)) goto nada;	  
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
+     
+     yb = XM(block)(ny, yblock, my_pe);
+     cld = X(mkplan_d)(plnr,
+		       X(mkproblem_rdft_1_d)(X(mktensor_1d)(nx, vn, vn),
+					     X(mktensor_2d)(yb, vn*nx, vn*nx,
+							    vn, 1, 1),
+					     O, I, p->kind[0]));
+     if (XM(any_true)(!cld, p->comm)) goto nada;	  
+     
+     cldt_after = X(mkplan_d)(plnr,
+			      XM(mkproblem_transpose)(
+				   ny, nx, vn,
+				   I, O,
+				   yblock, p->sz->dims[0].b[OB], 
+				   p->comm, 0));
+     if (XM(any_true)(!cldt_after, p->comm)) goto nada;	  
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+
+     pln->cldt_before = cldt_before;
+     pln->cld = cld;
+     pln->cldt_after = cldt_after;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->rearrange = ego->rearrange;
+
+     X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt_after);
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldt_before);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rearrangement rearrange, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->rearrange = rearrange;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank1_bigvec_register)(planner *p)
+{
+     rearrangement rearrange;
+     int preserve_input;
+     FORALL_REARRANGE(rearrange)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-serial.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-serial.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" RDFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-rdft.h"
+
+typedef struct {
+     plan_mpi_rdft super;
+     plan *cld;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, I, O);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-serial %(%p%))", ego->cld);
+}
+
+int XM(rdft_serial_applicable)(const problem_mpi_rdft *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(rdft_serial_applicable)(p))
+          return (plan *) 0;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(rnk);
+	  rdft_kind *kind 
+	       = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  for (i = 0; i < rnk; ++i)
+	       kind[i] = p->kind[i];
+	  
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft_d)(sz,
+						X(mktensor_1d)(p->vn, 1, 1),
+						p->I, p->O, kind));
+	  X(ifree0)(kind);
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft_0_d)(X(mktensor_1d)(0,0,0),
+						  p->I, p->O));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld = cld;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(rdft_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft-solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft-solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft.h"
+
+/* use the apply() operation for MPI_RDFT problems */
+void XM(rdft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_rdft *ego = (const plan_mpi_rdft *) ego_;
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft2-problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft2-problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft2.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_rdft2 *ego = (problem_mpi_rdft2 *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     int i;
+     X(md5puts)(m, "mpi-rdft2");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(alignment_of)(p->I));
+	X(md5int)(m, X(alignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     X(md5int)(m, p->kind);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
+     int i;
+     p->print(p, "(mpi-rdft2 %d %d %d ", 
+	      ego->I == ego->O,
+	      X(alignment_of)(ego->I),
+	      X(alignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     p->print(p, " %D %d %d", ego->vn, (int) ego->kind, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
+     R *I = ego->I;
+     dtensor *sz;
+     INT i, N;
+     int my_pe;
+
+     sz = XM(dtensor_copy)(ego->sz);
+     sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = 2 * ego->vn * XM(total_block)(sz, IB, my_pe);
+     XM(dtensor_destroy)(sz);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_RDFT2,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
+			   R *I, R *O,
+			   MPI_Comm comm,
+			   rdft_kind kind,
+			   unsigned flags)
+{
+     problem_mpi_rdft2 *ego =
+          (problem_mpi_rdft2 *)X(mkproblem)(sizeof(problem_mpi_rdft2), &padt);
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk) && sz->rnk > 1);
+     MPI_Comm_size(comm, &n_pes);
+     A(vn >= 0);
+     A(kind == R2HC || kind == HC2R);
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 0);
+#ifdef FFTW_DEBUG
+     ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
+     A(n_pes >= XM(num_blocks_total)(ego->sz, IB)
+       && n_pes >= XM(num_blocks_total)(ego->sz, OB));
+     ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n;
+#endif
+
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->kind = kind;
+
+     /* We only support TRANSPOSED_OUT for r2c and TRANSPOSED_IN for
+	c2r transforms. */
+
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
+			     R *I, R *O,
+			     MPI_Comm comm,
+			     rdft_kind kind,
+			     unsigned flags)
+{
+     problem *p = XM(mkproblem_rdft2)(sz, vn, I, O, comm, kind, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft2-rank-geq2-transposed.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft2-rank-geq2-transposed.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Real-input (r2c) DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   Conversely, real-output (c2r) DFTs where the input is transposed.
+
+   We don't currently support transposed-input r2c or transposed-output
+   c2r transforms. */
+
+#include "mpi-rdft2.h"
+#include "mpi-transpose.h"
+#include "rdft.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft2 super;
+
+     plan *cld1, *cldt, *cld2;
+     INT vn;
+     int preserve_input;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_dft *cld2;
+     plan_rdft *cldt;
+     
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* DFT final local dimension */
+     cld2 = (plan_dft *) ego->cld2;
+     cld2->apply(ego->cld2, O, O+1, O, O+1);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_dft *cld2;
+     plan_rdft *cldt;
+     
+     /* IDFT local dimensions */
+     cld2 = (plan_dft *) ego->cld2;
+     if (ego->preserve_input) {
+	  cld2->apply(ego->cld2, I+1, I, O+1, O);
+	  I = O;
+     }
+     else
+	  cld2->apply(ego->cld2, I+1, I, I+1, I);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* RDFT2 final local dimension */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && ((p->flags == TRANSPOSED_OUT && p->kind == R2HC
+		  && XM(is_local_after)(1, p->sz, IB)
+		  && XM(is_local_after)(2, p->sz, OB)
+		  && XM(num_blocks)(p->sz->dims[0].n, 
+				    p->sz->dims[0].b[OB]) == 1)
+		 || 
+		 (p->flags == TRANSPOSED_IN && p->kind == HC2R
+		  && XM(is_local_after)(1, p->sz, OB)
+		  && XM(is_local_after)(2, p->sz, IB)
+		  && XM(num_blocks)(p->sz->dims[0].n, 
+				    p->sz->dims[0].b[IB]) == 1))
+	     && (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
+		 || !XM(rdft2_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft2 *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *r0, *r1, *cr, *ci, *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest, n1, b1;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+     block_kind k1, k2;
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft2 *) p_;
+
+     I = p->I; O = p->O;
+     if (p->kind == R2HC) {
+	  k1 = IB; k2 = OB;
+          r1 = (r0 = I) + p->vn;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ci = (cr = O) + 1;
+	       I = O; 
+	  }
+	  else 
+	       ci = (cr = I) + 1;
+	  io = ii = (ro = ri = O) + 1;
+     }
+     else {
+	  k1 = OB; k2 = IB;
+	  r1 = (r0 = O) + p->vn;
+	  ci = (cr = O) + 1;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ri = (ii = I) + 1;
+	       ro = (io = O) + 1;
+	       I = O;
+	  }
+	  else
+	       ro = ri = (io = ii = I) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[k1], my_pe);
+	  sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft2_d)(sz,
+						  X(mktensor_2d)(b, is, is,
+								p->vn,ivs,ovs),
+						  r0, r1, cr, ci, p->kind));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     n1 = p->sz->dims[1].n;
+     b1 = p->sz->dims[1].b[k2];
+     if (p->sz->rnk == 2) { /* n1 dimension is cut in ~half */
+	  n1 = n1 / 2 + 1;
+	  b1 = b1 == p->sz->dims[1].n ? n1 : b1;
+     }
+
+     if (p->kind == R2HC)
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  p->sz->dims[0].n, n1, nrest * 2,
+				  I, O,
+				  p->sz->dims[0].b[IB], b1,
+				  p->comm, 0));
+     else
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  n1, p->sz->dims[0].n, nrest * 2,
+				  I, O,
+				  b1, p->sz->dims[0].b[OB], 
+				  p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     {
+	  INT is = p->sz->dims[0].n * nrest * 2;
+	  INT b = XM(block)(n1, b1, my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_dft_d)(X(mktensor_1d)(
+						     p->sz->dims[0].n,
+						     nrest * 2, nrest * 2),
+						X(mktensor_2d)(b, is, is,
+							       nrest, 2, 2),
+						ri, ii, ro, io));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->vn = p->vn;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft2_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft2-rank-geq2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft2-rank-geq2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFT2s of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-dft.h"
+#include "mpi-rdft2.h"
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft2 super;
+
+     plan *cld1, *cld2;
+     INT vn;
+     int preserve_input;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_rdft *cld2;
+     
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
+
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_rdft *cld2;
+     
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
+
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O
+					  && p->kind == R2HC))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
+		 || !XM(rdft2_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft2 *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *r0, *r1, *cr, *ci, *I, *O;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft2 *) p_;
+
+     I = p->I; O = p->O;
+     if (p->kind == R2HC) {
+          r1 = (r0 = p->I) + p->vn;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ci = (cr = p->O) + 1;
+	       I = O; 
+	  }
+	  else 
+	       ci = (cr = p->I) + 1;
+     }
+     else {
+          r1 = (r0 = p->O) + p->vn;
+          ci = (cr = p->O) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft2_d)(sz,
+						  X(mktensor_2d)(b, is, is,
+							        p->vn,ivs,ovs),
+						  r0, r1, cr, ci, p->kind));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
+						  I, O, p->comm, 
+						  p->kind == R2HC ?
+						  FFT_SIGN : -FFT_SIGN,
+						  RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->vn = p->vn;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft2_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft2-serial.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft2-serial.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" DFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-rdft2.h"
+#include "rdft.h"
+
+typedef struct {
+     plan_mpi_rdft2 super;
+     plan *cld;
+     INT vn;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld;
+     cld = (plan_rdft2 *) ego->cld;
+     cld->apply(ego->cld, I, I+ego->vn, O, O+1);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld;
+     cld = (plan_rdft2 *) ego->cld;
+     cld->apply(ego->cld, O, O+ego->vn, I, I+1);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-serial %(%p%))", ego->cld);
+}
+
+int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     R *r0, *r1, *cr, *ci;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(rdft2_serial_applicable)(p))
+          return (plan *) 0;
+
+     if (p->kind == R2HC) {
+	  r1 = (r0 = p->I) + p->vn;
+	  ci = (cr = p->O) + 1;
+     }
+     else {
+	  r1 = (r0 = p->O) + p->vn;
+	  ci = (cr = p->I) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(p->sz->rnk);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft2_d)(sz,
+						 X(mktensor_1d)(p->vn,ivs,ovs),
+						 r0, r1, cr, ci, p->kind));
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft2_d)(X(mktensor_0d)(),
+						 X(mktensor_1d)(0,0,0),
+						 cr, ci, cr, ci, HC2R));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld = cld;
+     pln->vn = p->vn;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(rdft2_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rdft2-solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rdft2-solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft2.h"
+
+/* use the apply() operation for MPI_RDFT2 problems */
+void XM(rdft2_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_rdft2 *ego = (const plan_mpi_rdft2 *) ego_;
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/rearrange.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/rearrange.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* common functions for rearrangements of the data for the *-rank1-bigvec
+   solvers */
+
+static int div_mult(INT b, INT a) { 
+     return (a > b && a % b == 0);
+}
+static int div_mult2(INT b, INT a, INT n) { 
+     return (div_mult(b, a) && div_mult(n, b));
+}
+
+int XM(rearrange_applicable)(rearrangement rearrange, 
+			     ddim dim0, INT vn, int n_pes)
+{
+     /* note: it is important that cases other than CONTIG be
+	applicable only when the resulting transpose dimension
+	is divisible by n_pes; otherwise, the allocation size
+	returned by the API will be incorrect */
+     return ((rearrange != DISCONTIG || div_mult(n_pes, vn))
+	     && (rearrange != SQUARE_BEFORE 
+		 || div_mult2(dim0.b[IB], vn, n_pes))
+	     && (rearrange != SQUARE_AFTER
+		 || (dim0.b[IB] != dim0.b[OB]
+		     && div_mult2(dim0.b[OB], vn, n_pes)))
+	     && (rearrange != SQUARE_MIDDLE
+		 || div_mult(dim0.n * n_pes, vn)));
+}
+
+INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes)
+{
+     switch (rearrange) {
+	 case CONTIG:
+	      return vn;
+	 case DISCONTIG:
+	      return n_pes;
+	 case SQUARE_BEFORE:
+	      return dim0.b[IB];
+	 case SQUARE_AFTER:
+	      return dim0.b[OB];
+	 case SQUARE_MIDDLE:
+	      return dim0.n * n_pes;
+     }
+     return 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/testsched.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/testsched.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/**********************************************************************/
+/* This is a modified and combined version of the sched.c and
+   test_sched.c files shipped with FFTW 2, written to implement and
+   test various all-to-all communications scheduling patterns.
+
+   It is not used in FFTW 3, but I keep it around in case we ever want
+   to play with this again or to change algorithms.  In particular, I
+   used it to implement and test the fill1_comm_sched routine in
+   transpose-pairwise.c, which allows us to create a schedule for one
+   process at a time and is much more compact than the FFTW 2 code.
+
+   Note that the scheduling algorithm is somewhat modified from that
+   of FFTW 2.  Originally, I thought that one "stall" in the schedule
+   was unavoidable for odd numbers of processes, since this is the
+   case for the soccer-timetabling problem.  However, because of the
+   self-communication step, we can use the self-communication to fill
+   in the stalls.  (Thanks to Ralf Wildenhues for pointing this out.)
+   This greatly simplifies the process re-sorting algorithm. */
+
+/**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* This file contains routines to compute communications schedules for
+   all-to-all communications (complete exchanges) that are performed
+   in-place.  (That is, the block that processor x sends to processor
+   y gets replaced on processor x by a block received from processor y.)
+
+   A schedule, int **sched, is a two-dimensional array where
+   sched[pe][i] is the processor that pe expects to exchange a message
+   with on the i-th step of the exchange.  sched[pe][i] == -1 for the
+   i after the last exchange scheduled on pe.
+
+   Here, processors (pe's, for processing elements), are numbered from
+   0 to npes-1.
+
+   There are a couple of constraints that a schedule should satisfy
+   (besides the obvious one that every processor has to communicate
+   with every other processor exactly once).
+   
+   * First, and most importantly, there must be no deadlocks.
+   
+   * Second, we would like to overlap communications as much as possible,
+   so that all exchanges occur in parallel.  It turns out that perfect
+   overlap is possible for all number of processes (npes).
+
+   It turns out that this scheduling problem is actually well-studied,
+   and good solutions are known.  The problem is known as a
+   "time-tabling" problem, and is specifically the problem of
+   scheduling a sports competition (where n teams must compete exactly
+   once with every other team).  The problem is discussed and
+   algorithms are presented in:
+
+   [1] J. A. M. Schreuder, "Constructing Timetables for Sport
+   Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
+
+   [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
+   Logic Programming," Proc. of 12th Europ. Conf. on
+   Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
+   http://hermes.dis.uniromal.it/~aschaerf/publications.html
+
+   (These people actually impose a lot of additional constraints that
+   we don't care about, so they are solving harder problems. [1] gives
+   a simple enough algorithm for our purposes, though.)
+   
+   In the timetabling problem, N teams can all play one another in N-1
+   steps if N is even, and N steps if N is odd.  Here, however,
+   there is a "self-communication" step (a team must also "play itself")
+   and so we can always make an optimal N-step schedule regardless of N.
+
+   However, we have to do more: for a particular processor, the
+   communications schedule must be sorted in ascending or descending
+   order of processor index.  (This is necessary so that the data
+   coming in for the transpose does not overwrite data that will be
+   sent later; for that processor the incoming and outgoing blocks are
+   of different non-zero sizes.)  Fortunately, because the schedule
+   is stall free, each parallel step of the schedule is independent
+   of every other step, and we can reorder the steps arbitrarily
+   to achieve any desired order on a particular process.
+*/
+
+void free_comm_schedule(int **sched, int npes)
+{
+     if (sched) {
+	  int i;
+
+	  for (i = 0; i < npes; ++i)
+	       free(sched[i]);
+	  free(sched);
+     }
+}
+
+void empty_comm_schedule(int **sched, int npes)
+{
+     int i;
+     for (i = 0; i < npes; ++i)
+	  sched[i][0] = -1;
+}
+
+extern void fill_comm_schedule(int **sched, int npes);
+
+/* Create a new communications schedule for a given number of processors.
+   The schedule is initialized to a deadlock-free, maximum overlap
+   schedule.  Returns NULL on an error (may print a message to
+   stderr if there is a program bug detected).  */
+int **make_comm_schedule(int npes)
+{
+     int **sched;
+     int i;
+
+     sched = (int **) malloc(sizeof(int *) * npes);
+     if (!sched)
+	  return NULL;
+
+     for (i = 0; i < npes; ++i)
+	  sched[i] = NULL;
+
+     for (i = 0; i < npes; ++i) {
+	  sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
+	  if (!sched[i]) {
+	       free_comm_schedule(sched,npes);
+	       return NULL;
+	  }
+     }
+     
+     empty_comm_schedule(sched,npes);
+     fill_comm_schedule(sched,npes);
+
+     if (!check_comm_schedule(sched,npes)) {
+	  free_comm_schedule(sched,npes);
+	  return NULL;
+     }
+
+     return sched;
+}
+
+static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
+{
+     int i;
+     
+     for (i = 0; sched[pe][i] != -1; ++i)
+	  ;
+
+     sched[pe][i] = dest;
+     sched[pe][i+1] = -1;
+}
+
+static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
+{
+     add_dest_to_comm_schedule(sched, pe1, pe2);
+     if (pe1 != pe2)
+	  add_dest_to_comm_schedule(sched, pe2, pe1);
+}
+
+/* Simplification of algorithm presented in [1] (we have fewer
+   constraints).  Produces a perfect schedule (npes steps).  */
+
+void fill_comm_schedule(int **sched, int npes)
+{
+     int pe, i, n;
+
+     if (npes % 2 == 0) {
+	  n = npes;
+	  for (pe = 0; pe < npes; ++pe)
+	       add_pair_to_comm_schedule(sched,pe,pe);
+     }
+     else
+	  n = npes + 1;
+
+     for (pe = 0; pe < n - 1; ++pe) {
+	  add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
+	  
+	  for (i = 1; i < n/2; ++i) {
+	       int pe_a, pe_b;
+
+	       pe_a = pe - i;
+	       if (pe_a < 0)
+		    pe_a += n - 1;
+
+	       pe_b = (pe + i) % (n - 1);
+
+	       add_pair_to_comm_schedule(sched,pe_a,pe_b);
+	  }
+     }
+}
+
+/* given an array sched[npes], fills it with the communications
+   schedule for process pe. */
+void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     if (npes % 2 == 0) {
+	  n = npes;
+	  sched[s++] = which_pe;
+     }
+     else
+	  n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+	  if (npes % 2 == 0) {
+	       if (pe == which_pe) sched[s++] = npes - 1;
+	       else if (npes - 1 == which_pe) sched[s++] = pe;
+	  }
+	  else if (pe == which_pe) sched[s++] = pe;
+
+	  if (pe != which_pe && which_pe < n - 1) {
+	       i = (pe - which_pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe + i) % (n - 1);
+	       
+	       i = (which_pe - pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
+	  }
+     }
+     if (s != npes) {
+	  fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n", 
+		  s, which_pe, npes);
+	  exit(EXIT_FAILURE);
+     }
+}
+
+/* sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending). */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) malloc(npes * sizeof(int) * 2);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+          for (i = 0; i < npes; ++i)
+               sortsched[npes + sortsched[i]] = sched[i];
+     else
+          for (i = 0; i < npes; ++i)
+               sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+          sched[i] = sortsched[npes + i];
+     free(sortsched);
+}
+
+/* Below, we have various checks in case of bugs: */
+
+/* check for deadlocks by simulating the schedule and looking for
+   cycles in the dependency list; returns 0 if there are deadlocks
+   (or other errors) */
+static int check_schedule_deadlock(int **sched, int npes)
+{
+     int *step, *depend, *visited, pe, pe2, period, done = 0;
+     int counter = 0;
+
+     /* step[pe] is the step in the schedule that a given pe is on */
+     step = (int *) malloc(sizeof(int) * npes);
+
+     /* depend[pe] is the pe' that pe is currently waiting for a message
+	from (-1 if none) */
+     depend = (int *) malloc(sizeof(int) * npes);
+
+     /* visited[pe] tells whether we have visited the current pe already
+	when we are looking for cycles. */
+     visited = (int *) malloc(sizeof(int) * npes);
+
+     if (!step || !depend || !visited) {
+	  free(step); free(depend); free(visited);
+	  return 0;
+     }
+
+     for (pe = 0; pe < npes; ++pe)
+	  step[pe] = 0;
+
+     while (!done) {
+	  ++counter;
+
+	  for (pe = 0; pe < npes; ++pe)
+	       depend[pe] = sched[pe][step[pe]];
+	  
+	  /* now look for cycles in the dependencies with period > 2: */
+	  for (pe = 0; pe < npes; ++pe)
+	       if (depend[pe] != -1) {
+		    for (pe2 = 0; pe2 < npes; ++pe2)
+			 visited[pe2] = 0;
+
+		    period = 0;
+		    pe2 = pe;
+		    do {
+			 visited[pe2] = period + 1;
+			 pe2 = depend[pe2];
+			 period++;
+		    } while (pe2 != -1 && !visited[pe2]);
+
+		    if (pe2 == -1) {
+			 fprintf(stderr,
+				 "BUG: unterminated cycle in schedule!\n");
+			 free(step); free(depend);
+			 free(visited);
+			 return 0;
+		    }
+		    if (period - (visited[pe2] - 1) > 2) {
+			 fprintf(stderr,"BUG: deadlock in schedule!\n");
+			 free(step); free(depend);
+			 free(visited);
+			 return 0;
+		    }
+
+		    if (pe2 == pe)
+			 step[pe]++;
+	       }
+
+	  done = 1;
+	  for (pe = 0; pe < npes; ++pe)
+	       if (sched[pe][step[pe]] != -1) {
+		    done = 0;
+		    break;
+	       }
+     }
+
+     free(step); free(depend); free(visited);
+     return (counter > 0 ? counter : 1);
+}
+
+/* sanity checks; prints message and returns 0 on failure.
+   undocumented feature: the return value on success is actually the
+   number of steps required for the schedule to complete, counting
+   stalls. */
+int check_comm_schedule(int **sched, int npes)
+{
+     int pe, i, comm_pe;
+     
+     for (pe = 0; pe < npes; ++pe) {
+	  for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
+	       for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
+		    ;
+	       if (sched[pe][i] == -1) {
+		    fprintf(stderr,"BUG: schedule never sends message from "
+			    "%d to %d.\n",pe,comm_pe);
+		    return 0;  /* never send message to comm_pe */
+	       }
+	  }
+	  for (i = 0; sched[pe][i] != -1; ++i)
+	       ;
+	  if (i != npes) {
+	       fprintf(stderr,"BUG: schedule sends too many messages from "
+		       "%d\n",pe);
+	       return 0;
+	  }
+     }
+     return check_schedule_deadlock(sched,npes);
+}
+
+/* invert the order of all the schedules; this has no effect on
+   its required properties. */
+void invert_comm_schedule(int **sched, int npes)
+{
+     int pe, i;
+
+     for (pe = 0; pe < npes; ++pe)
+	  for (i = 0; i < npes/2; ++i) {
+	       int dummy = sched[pe][i];
+	       sched[pe][i] = sched[pe][npes-1-i];
+	       sched[pe][npes-1-i] = dummy;
+	  }
+}
+
+/* Sort the schedule for sort_pe in ascending order of processor
+   index.  Unfortunately, for odd npes (when schedule has a stall
+   to begin with) this will introduce an extra stall due to
+   the motion of the self-communication past a stall.  We could
+   fix this if it were really important.  Actually, we don't
+   get an extra stall when sort_pe == 0 or npes-1, which is sufficient
+   for our purposes. */
+void sort_comm_schedule(int **sched, int npes, int sort_pe)
+{
+     int i,j,pe;
+
+     /* Note that we can do this sort in O(npes) swaps because we know
+	that the numbers we are sorting are just 0...npes-1.   But we'll
+	just do a bubble sort for simplicity here. */
+
+     for (i = 0; i < npes - 1; ++i)
+	  for (j = i + 1; j < npes; ++j)
+	       if (sched[sort_pe][i] > sched[sort_pe][j]) {
+		    for (pe = 0; pe < npes; ++pe) {
+			 int s = sched[pe][i];
+			 sched[pe][i] = sched[pe][j];
+			 sched[pe][j] = s;
+		    }
+	       }
+}
+
+/* print the schedule (for debugging purposes) */
+void print_comm_schedule(int **sched, int npes)
+{
+     int pe, i, width;
+
+     if (npes < 10)
+	  width = 1;
+     else if (npes < 100)
+	  width = 2;
+     else
+	  width = 3;
+
+     for (pe = 0; pe < npes; ++pe) {
+	  printf("pe %*d schedule:", width, pe);
+	  for (i = 0; sched[pe][i] != -1; ++i)
+	       printf("  %*d",width,sched[pe][i]);
+	  printf("\n");
+     }
+}
+
+int main(int argc, char **argv)
+{
+     int **sched;
+     int npes = -1, sortpe = -1, steps, i;
+
+     if (argc >= 2) {
+	  npes = atoi(argv[1]);
+	  if (npes <= 0) {
+	       fprintf(stderr,"npes must be positive!");
+	       return 1;
+	  }
+     }
+     if (argc >= 3) {
+	  sortpe = atoi(argv[2]);
+	  if (sortpe < 0 || sortpe >= npes) {
+	       fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
+	       return 1;
+	  }
+     }
+
+     if (npes != -1) {
+	  printf("Computing schedule for npes = %d:\n",npes);
+	  sched = make_comm_schedule(npes);
+	  if (!sched) {
+	       fprintf(stderr,"Out of memory!");
+	       return 6;
+	  }
+	  
+	  if (steps = check_comm_schedule(sched,npes))
+	       printf("schedule OK (takes %d steps to complete).\n", steps);
+	  else
+	       printf("schedule not OK.\n");
+
+	  print_comm_schedule(sched, npes);
+	  
+	  if (sortpe != -1) {
+	       printf("\nRe-creating schedule for pe = %d...\n", sortpe);
+	       int *sched1 = (int*) malloc(sizeof(int) * npes);
+	       for (i = 0; i < npes; ++i) sched1[i] = -1;
+	       fill1_comm_sched(sched1, sortpe, npes);
+	       printf("  =");
+	       for (i = 0; i < npes; ++i) 
+		    printf("  %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
+			   sched1[i]);
+	       printf("\n");
+
+	       printf("\nSorting schedule for sortpe = %d...\n", sortpe);
+	       sort_comm_schedule(sched,npes,sortpe);
+	       
+	       if (steps = check_comm_schedule(sched,npes))
+		    printf("schedule OK (takes %d steps to complete).\n", 
+			   steps);
+	       else
+		    printf("schedule not OK.\n");
+
+	       print_comm_schedule(sched, npes);
+
+	       printf("\nInverting schedule...\n");
+	       invert_comm_schedule(sched,npes);
+	       
+	       if (steps = check_comm_schedule(sched,npes))
+		    printf("schedule OK (takes %d steps to complete).\n", 
+			   steps);
+	       else
+		    printf("schedule not OK.\n");
+
+	       print_comm_schedule(sched, npes);
+	       
+	       free_comm_schedule(sched,npes);
+
+	       free(sched1);
+	  }
+     }
+     else {
+	  printf("Doing infinite tests...\n");
+	  for (npes = 1; ; ++npes) {
+	       int *sched1 = (int*) malloc(sizeof(int) * npes);
+	       printf("npes = %d...",npes);
+	       sched = make_comm_schedule(npes);
+	       if (!sched) {
+		    fprintf(stderr,"Out of memory!\n");
+		    return 5;
+	       }
+	       for (sortpe = 0; sortpe < npes; ++sortpe) {
+		    empty_comm_schedule(sched,npes);
+		    fill_comm_schedule(sched,npes);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- fill error for sortpe = %d!\n",sortpe);
+			 return 2;
+		    }
+
+		    for (i = 0; i < npes; ++i) sched1[i] = -1;
+		    fill1_comm_sched(sched1, sortpe, npes);
+		    for (i = 0; i < npes; ++i)
+			 if (sched1[i] != sched[sortpe][i])
+			      fprintf(stderr,
+				      "\n -- fill1 error for pe = %d!\n",
+				      sortpe);
+
+		    sort_comm_schedule(sched,npes,sortpe);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- sort error for sortpe = %d!\n",sortpe);
+			 return 3;
+		    }
+		    invert_comm_schedule(sched,npes);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- invert error for sortpe = %d!\n",
+				 sortpe);
+			 return 4;
+		    }
+	       }
+	       free_comm_schedule(sched,npes);
+	       printf("OK\n");
+	       if (npes % 50 == 0)
+		    printf("(...Hit Ctrl-C to stop...)\n");
+	       free(sched1);
+	  }
+     }
+
+     return 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/transpose-alltoall.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/transpose-alltoall.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* plans for distributed out-of-place transpose using MPI_Alltoall,
+   and which destroy the input array (unless TRANSPOSED_IN is used) */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int copy_transposed_in; /* whether to copy the input for TRANSPOSED_IN,
+				which makes the final transpose out-of-place
+				but costs an extra copy and requires us
+				to destroy the input */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+
+     MPI_Comm comm;
+     int *send_block_sizes, *send_block_offsets;
+     int *recv_block_sizes, *recv_block_offsets;
+
+     INT rest_Ioff, rest_Ooff;
+
+     int equal_blocks;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cld2rest, *cld3;
+
+     /* transpose locally to get contiguous chunks */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) {
+	  cld1->apply(ego->cld1, I, O);
+	  
+	  /* transpose chunks globally */
+	  if (ego->equal_blocks)
+	       MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+			    I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+			    ego->comm);
+	  else
+	       MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
+			     FFTW_MPI_TYPE,
+			     I, ego->recv_block_sizes, ego->recv_block_offsets,
+			     FFTW_MPI_TYPE,
+			     ego->comm);
+     }
+     else { /* TRANSPOSED_IN, no need to destroy input */
+	  /* transpose chunks globally */
+	  if (ego->equal_blocks)
+	       MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+			    O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+			    ego->comm);
+	  else
+	       MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
+			     FFTW_MPI_TYPE,
+			     O, ego->recv_block_sizes, ego->recv_block_offsets,
+			     FFTW_MPI_TYPE,
+			     ego->comm);
+	  I = O; /* final transpose (if any) is in-place */
+     }
+     
+     /* transpose locally, again, to get ordinary row-major */
+     cld2 = (plan_rdft *) ego->cld2;
+     if (cld2) {
+	  cld2->apply(ego->cld2, I, O);
+	  cld2rest = (plan_rdft *) ego->cld2rest;
+	  if (cld2rest) { /* leftover from unequal block sizes */
+	       cld2rest->apply(ego->cld2rest,
+			       I + ego->rest_Ioff, O + ego->rest_Ooff);
+	       cld3 = (plan_rdft *) ego->cld3;
+	       if (cld3)
+		    cld3->apply(ego->cld3, O, O);
+	       /* else TRANSPOSED_OUT is true and user wants O transposed */
+	  }
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     return (1
+	     && p->I != p->O
+	     && (!NO_DESTROY_INPUTP(plnr) || 
+		 ((p->flags & TRANSPOSED_IN) && !ego->copy_transposed_in))
+	     && ((p->flags & TRANSPOSED_IN) || !ego->copy_transposed_in)
+	     && ONLY_TRANSPOSEDP(p->flags)
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-alltoall%s%(%p%)%(%p%)%(%p%)%(%p%))",
+	      ego->equal_blocks ? "/e" : "",
+	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     R *I;
+     int *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes;
+     int equal_blocks = 1;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     b = XM(block)(p->nx, p->block, my_pe);
+
+     if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
+	  if (ego->copy_transposed_in) {
+	       cld1 = X(mkplan_f_d)(plnr,
+				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
+							(b * p->ny * vn, 1, 1),
+							I = p->I, p->O),
+				    0, 0, NO_SLOW);
+	       if (XM(any_true)(!cld1, p->comm)) goto nada;
+	  }
+	  else
+	       I = p->O; /* final transpose is in-place */
+     }
+     else { /* transpose b x ny x vn -> ny x b x vn */
+	  cld1 = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (b, p->ny * vn, vn,
+						      p->ny, vn, b * vn,
+						      vn, 1, 1),
+						     I = p->I, p->O),
+			       0, 0, NO_SLOW);
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+	  
+     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe,
+						 &cld2, &cld2rest, &cld3,
+						 &rest_Ioff, &rest_Ooff),
+		      p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     /* Compute sizes/offsets of blocks to send for all-to-all command. */
+     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT db, dbt; /* destination block sizes */
+	  db = XM(block)(p->nx, p->block, pe);
+	  dbt = XM(block)(p->ny, p->tblock, pe);
+	  if (db != p->block || dbt != p->tblock)
+	       equal_blocks = 0;
+
+	  /* MPI requires type "int" here; apparently it
+	     has no 64-bit API?  Grrr. */
+	  sbs[pe] = (int) (b * dbt * vn);
+	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
+	  rbs[pe] = (int) (db * bt * vn);
+	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
+     }
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+     pln->equal_blocks = equal_blocks;
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int copy_transposed_in)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->copy_transposed_in = copy_transposed_in;
+     return &(slv->super);
+}
+
+void XM(transpose_alltoall_register)(planner *p)
+{
+     int cti;
+     for (cti = 0; cti <= 1; ++cti)
+	  REGISTER_SOLVER(p, mksolver(cti));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/transpose-pairwise.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/transpose-pairwise.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Distributed transposes using a sequence of carefully scheduled
+   pairwise exchanges.  This has the advantage that it can be done
+   in-place, or out-of-place while preserving the input, using buffer
+   space proportional to the local size divided by the number of
+   processes (i.e. to the total array size divided by the number of
+   processes squared). */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+     INT rest_Ioff, rest_Ooff;
+     
+     int n_pes, my_pe, *sched;
+     INT *send_block_sizes, *send_block_offsets;
+     INT *recv_block_sizes, *recv_block_offsets;
+     MPI_Comm comm;
+     int preserve_input;
+} P;
+
+static void transpose_chunks(int *sched, int n_pes, int my_pe,
+			     INT *sbs, INT *sbo, INT *rbs, INT *rbo,
+			     MPI_Comm comm,
+			     R *I, R *O)
+{
+     if (sched) {
+	  int i;
+	  MPI_Status status;
+
+	  /* TODO: explore non-synchronous send/recv? */
+
+	  if (I == O) {
+	       R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
+	       
+	       for (i = 0; i < n_pes; ++i) {
+		    int pe = sched[i];
+		    if (my_pe == pe) {
+			 if (rbo[pe] != sbo[pe])
+			      memmove(O + rbo[pe], O + sbo[pe],
+				      sbs[pe] * sizeof(R));
+		    }
+		    else {
+			 memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
+			 MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
+				      pe, (my_pe * n_pes + pe) & 0xffff,
+				      O + rbo[pe], (int) (rbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (pe * n_pes + my_pe) & 0xffff,
+				      comm, &status);
+		    }
+	       }
+
+	       X(ifree)(buf);
+	  }
+	  else { /* I != O */
+	       for (i = 0; i < n_pes; ++i) {
+		    int pe = sched[i];
+		    if (my_pe == pe)
+			 memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
+		    else
+			 MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (my_pe * n_pes + pe) & 0xffff,
+				      O + rbo[pe], (int) (rbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (pe * n_pes + my_pe) & 0xffff,
+				      comm, &status);
+	       }
+	  }
+     }
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cld2rest, *cld3;
+
+     /* transpose locally to get contiguous chunks */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) {
+	  cld1->apply(ego->cld1, I, O);
+	  
+	  if (ego->preserve_input) I = O;
+
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, O, I);
+     }
+     else if (ego->preserve_input) {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, O);
+
+	  I = O;
+     }
+     else {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, I);
+     }
+
+     /* transpose locally, again, to get ordinary row-major;
+	this may take two transposes if the block sizes are unequal
+	(3 subplans, two of which operate on disjoint data) */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+     cld2rest = (plan_rdft *) ego->cld2rest;
+     if (cld2rest) {
+	  cld2rest->apply(ego->cld2rest,
+			  I + ego->rest_Ioff, O + ego->rest_Ooff);
+	  cld3 = (plan_rdft *) ego->cld3;
+	  if (cld3)
+	       cld3->apply(ego->cld3, O, O);
+	  /* else TRANSPOSED_OUT is true and user wants O transposed */
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
+	the planner often prefers transpose-pairwise to transpose-alltoall,
+	at least with LAM MPI on my machine. */
+     return (1
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && ONLY_TRANSPOSEDP(p->flags));
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->sched);
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-pairwise%s%(%p%)%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+/* Given a process which_pe and a number of processes npes, fills
+   the array sched[npes] with a sequence of processes to communicate
+   with for a deadlock-free, optimum-overlap all-to-all communication.
+   (All processes must call this routine to get their own schedules.)
+   The schedule can be re-ordered arbitrarily as long as all processes
+   apply the same permutation to their schedules.
+
+   The algorithm here is based upon the one described in:
+       J. A. M. Schreuder, "Constructing timetables for sport
+       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
+   In a sport competition, you have N teams and want every team to
+   play every other team in as short a time as possible (maximum overlap
+   between games).  This timetabling problem is therefore identical
+   to that of an all-to-all communications problem.  In our case, there
+   is one wrinkle: as part of the schedule, the process must do
+   some data transfer with itself (local data movement), analogous
+   to a requirement that each team "play itself" in addition to other
+   teams.  With this wrinkle, it turns out that an optimal timetable
+   (N parallel games) can be constructed for any N, not just for even
+   N as in the original problem described by Schreuder.
+*/
+static void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     A(which_pe >= 0 && which_pe < npes);
+     if (npes % 2 == 0) {
+	  n = npes;
+	  sched[s++] = which_pe;
+     }
+     else
+	  n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+	  if (npes % 2 == 0) {
+	       if (pe == which_pe) sched[s++] = npes - 1;
+	       else if (npes - 1 == which_pe) sched[s++] = pe;
+	  }
+	  else if (pe == which_pe) sched[s++] = pe;
+
+	  if (pe != which_pe && which_pe < n - 1) {
+	       i = (pe - which_pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe + i) % (n - 1);
+	       
+	       i = (which_pe - pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
+	  }
+     }
+     A(s == npes);
+}
+
+/* Sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending).  This is
+   necessary to allow in-place transposes when the problem does not
+   divide equally among the processes.  In this case there is one
+   process where the incoming blocks are bigger/smaller than the
+   outgoing blocks and thus have to be received in
+   descending/ascending order, respectively, to avoid overwriting data
+   before it is sent. */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+	  for (i = 0; i < npes; ++i)
+	       sortsched[npes + sortsched[i]] = sched[i];
+     else
+	  for (i = 0; i < npes; ++i)
+	       sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+	  sched[i] = sortsched[npes + i];
+     X(ifree)(sortsched);
+}
+
+/* make the plans to do the post-MPI transpositions (shared with
+   transpose-alltoall) */
+int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
+			      R *I, R *O, int my_pe,
+			      plan **cld2, plan **cld2rest, plan **cld3,
+			      INT *rest_Ioff, INT *rest_Ooff)
+{
+     INT vn = p->vn;
+     INT b = p->block;
+     INT bt = XM(block)(p->ny, p->tblock, my_pe);
+     INT nxb = p->nx / b; /* number of equal-sized blocks */
+     INT nxr = p->nx - nxb * b; /* leftover rows after equal blocks */
+
+     *cld2 = *cld2rest = *cld3 = NULL;
+     *rest_Ioff = *rest_Ooff = 0;
+
+     if (!(p->flags & TRANSPOSED_OUT) && (nxr == 0 || I != O)) {
+	  INT nx = p->nx * vn;
+	  b *= vn;
+	  *cld2 = X(mkplan_f_d)(plnr, 
+				X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						      (nxb, bt * b, b,
+						       bt, b, nx,
+						       b, 1, 1),
+						      I, O),
+				0, 0, NO_SLOW);
+	  if (!*cld2) goto nada;
+
+	  if (nxr > 0) {
+	       *rest_Ioff = nxb * bt * b;
+	       *rest_Ooff = nxb * b;
+	       b = nxr * vn;
+	       *cld2rest = X(mkplan_f_d)(plnr,
+					 X(mkproblem_rdft_0_d)(X(mktensor_2d)
+							       (bt, b, nx,
+								b, 1, 1),
+							       I + *rest_Ioff,
+							       O + *rest_Ooff),
+                                        0, 0, NO_SLOW);
+               if (!*cld2rest) goto nada;
+	  }
+     }
+     else {
+	  *cld2 = X(mkplan_f_d)(plnr,
+				X(mkproblem_rdft_0_d)(
+				     X(mktensor_4d)
+				     (nxb, bt * b * vn, bt * b * vn,
+				      bt, b * vn, vn,
+				      b, vn, bt * vn,
+				      vn, 1, 1),
+				     I, O),
+				0, 0, NO_SLOW);
+	  if (!*cld2) goto nada;
+
+	  *rest_Ioff = *rest_Ooff = nxb * bt * b * vn;
+	  *cld2rest = X(mkplan_f_d)(plnr,
+				    X(mkproblem_rdft_0_d)(
+					 X(mktensor_3d)
+					 (bt, nxr * vn, vn,
+					  nxr, vn, bt * vn,
+					  vn, 1, 1),
+					 I + *rest_Ioff, O + *rest_Ooff),
+				    0, 0, NO_SLOW);
+	  if (!*cld2rest) goto nada;
+
+	  if (!(p->flags & TRANSPOSED_OUT)) {
+	       *cld3 = X(mkplan_f_d)(plnr,
+				     X(mkproblem_rdft_0_d)(
+					  X(mktensor_3d)
+					  (p->nx, bt * vn, vn,
+					   bt, vn, p->nx * vn,
+					   vn, 1, 1),
+					  O, O),
+				     0, 0, NO_SLOW);
+	       if (!*cld3) goto nada;
+	  }
+     }
+
+     return 1;
+
+nada:
+     X(plan_destroy_internal)(*cld3);
+     X(plan_destroy_internal)(*cld2rest);
+     X(plan_destroy_internal)(*cld2);
+     return 0;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     INT *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
+     R *I, *O;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+     I = p->I; O = p->O;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     b = XM(block)(p->nx, p->block, my_pe);
+     
+     if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
+	  cld1 = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (b, p->ny * vn, vn,
+						      p->ny, vn, b * vn,
+						      vn, 1, 1),
+						     I, O),
+			       0, 0, NO_SLOW);
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
+
+     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
+						 &cld2, &cld2rest, &cld3,
+						 &rest_Ioff, &rest_Ooff),
+		      p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
+			   XM(num_blocks)(p->ny, p->tblock));
+
+     /* Compute sizes/offsets of blocks to exchange between processors */
+     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT db, dbt; /* destination block sizes */
+	  db = XM(block)(p->nx, p->block, pe);
+	  dbt = XM(block)(p->ny, p->tblock, pe);
+
+	  sbs[pe] = b * dbt * vn;
+	  sbo[pe] = pe * (b * p->tblock) * vn;
+	  rbs[pe] = db * bt * vn;
+	  rbo[pe] = pe * (p->block * bt) * vn;
+
+	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
+	       A(sort_pe == -1); /* only one process should need sorting */
+	       sort_pe = pe;
+	       ascending = db * p->tblock > p->block * dbt;
+	  }
+     }
+     pln->n_pes = n_pes;
+     pln->my_pe = my_pe;
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+
+     if (my_pe >= n_pes) {
+	  pln->sched = 0; /* this process is not doing anything */
+     }
+     else {
+	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
+	  fill1_comm_sched(pln->sched, my_pe, n_pes);
+	  if (sort_pe >= 0)
+	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
+     }
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(transpose_pairwise_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/transpose-problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/transpose-problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-transpose.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_transpose *ego = (problem_mpi_transpose *) ego_;
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     int i;
+     X(md5puts)(m, "mpi-transpose");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(alignment_of)(p->I));
+	X(md5int)(m, X(alignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     X(md5INT)(m, p->vn);
+     X(md5INT)(m, p->nx);
+     X(md5INT)(m, p->ny);
+     X(md5INT)(m, p->block);
+     X(md5INT)(m, p->tblock);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
+     int i;
+     MPI_Comm_size(ego->comm, &i);
+     p->print(p, "(mpi-transpose %d %d %d %D %D %D %D %D %d)", 
+	      ego->I == ego->O,
+	      X(alignment_of)(ego->I),
+	      X(alignment_of)(ego->O),
+	      ego->vn,
+	      ego->nx, ego->ny,
+	      ego->block, ego->tblock,
+	      i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
+     R *I = ego->I;
+     INT i, N = ego->vn * ego->ny;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N *= XM(block)(ego->nx, ego->block, my_pe);
+
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_TRANSPOSE,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
+				 R *I, R *O,
+				 INT block, INT tblock,
+				 MPI_Comm comm,
+				 unsigned flags)
+{
+     problem_mpi_transpose *ego =
+          (problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt);
+
+     A(nx > 0 && ny > 0 && vn > 0);
+     A(block > 0 && XM(num_blocks_ok)(nx, block, comm)
+       && tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm));
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->nx = nx;
+     ego->ny = ny;
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->block = block > nx ? nx : block;
+     ego->tblock = tblock > ny ? ny : tblock;
+
+     /* canonicalize flags: we can freely assume that the data is
+	"transposed" if one of the dimensions is 1. */
+     if (ego->block == 1)
+	  flags |= TRANSPOSED_IN;
+     if (ego->tblock == 1)
+	  flags |= TRANSPOSED_OUT;
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/transpose-recurse.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/transpose-recurse.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Recursive "radix-r" distributed transpose, which breaks a transpose
+   over p processes into p/r transposes over r processes plus r
+   transposes over p/r processes.  If performed recursively, this
+   produces a total of O(p log p) messages vs. O(p^2) messages for a
+   direct approach.
+
+   However, this is not necessarily an improvement.  The total size of
+   all the messages is actually increased from O(N) to O(N log p)
+   where N is the total data size.  Also, the amount of local data
+   rearrangement is increased.  So, it's not clear, a priori, what the
+   best algorithm will be, and we'll leave it to the planner.  (In
+   theory and practice, it looks like this becomes advantageous for
+   large p, in the limit where the message sizes are small and
+   latency-dominated.)
+*/
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int (*radix)(int np);
+     const char *nam;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cldtr, *cldtm;
+     int preserve_input;
+
+     int r; /* "radix" */
+     const char *nam;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cldtr, *cldtm;
+
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) cld1->apply((plan *) cld1, I, O);
+
+     if (ego->preserve_input) I = O;
+
+     cldtr = (plan_rdft *) ego->cldtr;
+     if (cldtr) cldtr->apply((plan *) cldtr, O, I);
+
+     cldtm = (plan_rdft *) ego->cldtm;
+     if (cldtm) cldtm->apply((plan *) cldtm, I, O);
+}
+
+static int radix_sqrt(int np)
+{
+     int r;
+     for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
+	  ;
+     return r;
+}
+
+static int radix_first(int np)
+{
+     int r = (int) (X(first_divisor)(np));
+     return (r >= (int) (X(isqrt)(np)) ? 0 : r);
+}
+
+/* the local allocated space on process pe required for the given transpose
+   dimensions and block sizes */
+static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
+{
+     return X(imax)(XM(block)(nx, block, pe) * ny,
+		    nx * XM(block)(ny, tblock, pe));
+}
+
+/* check whether the recursive transposes fit within the space
+   that must have been allocated on each process for this transpose;
+   this must be modified if the subdivision in mkplan is changed! */
+static int enough_space(INT nx, INT ny, INT block, INT tblock,
+			int r, int n_pes)
+{
+     int pe;
+     int m = n_pes / r;
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT space = transpose_space(nx, ny, block, tblock, pe);
+	  INT b1 = XM(block)(nx, r * block, pe / r);
+	  INT b2 = XM(block)(ny, m * tblock, pe % r);
+	  if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
+	      || transpose_space(nx, b2, r*block, tblock, pe / r) > space)
+	       return 0;
+     }
+     return 1;
+}
+
+/* In theory, transpose-recurse becomes advantageous for message sizes
+   below some minimum, assuming that the time is dominated by
+   communications.  In practice, we want to constrain the minimum
+   message size for transpose-recurse to keep the planning time down.
+   I've set this conservatively according to some simple experiments
+   on a Cray XT3 where the crossover message size was 128, although on
+   a larger-latency machine the crossover will be larger. */
+#define SMALL_MESSAGE 2048
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr, int *r)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->tblock * n_pes == p->ny
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+                                          && p->I != p->O))
+	     && (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
+	     && enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
+	     && (!CONSERVE_MEMORYP(plnr) || *r > 8
+		 || !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
+	     && (!NO_SLOWP(plnr) || 
+		 (p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
+	     && ONLY_TRANSPOSEDP(p->flags)
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldtr, wakefulness);
+     X(plan_awake)(ego->cldtm, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldtm);
+     X(plan_destroy_internal)(ego->cldtr);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
+	      ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldtr, ego->cldtm);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
+     R *I, *O;
+     int me, np, r, m;
+     INT b;
+     MPI_Comm comm2;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr, &r))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+
+     MPI_Comm_size(p->comm, &np);
+     MPI_Comm_rank(p->comm, &me);
+     m = np / r;
+     A(r * m == np);
+
+     I = p->I; O = p->O;
+
+     b = XM(block)(p->nx, p->block, me);
+     A(p->tblock * np == p->ny); /* this is currently required for cld1 */
+     if (p->flags & TRANSPOSED_IN) { 
+          /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
+	  INT vn = p->vn * b * p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (m, r*vn, vn,
+						      r, vn, m*vn,
+						      vn, 1, 1),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
+          /* b x m x r x bt x vn -> r x m x bt x b x vn */
+	  INT vn = p->vn;
+	  INT bt = p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_5d)
+						     (b, m*r*bt*vn, vn,
+						      m, r*bt*vn, bt*b*vn,
+						      r, bt*vn, m*bt*b*vn,
+						      bt, vn, b*vn,
+						      vn, 1, 1),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     else { /* TRANSPOSED_IN permutation must be separate for in-place */
+	  /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
+	  INT vn = p->vn * p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_4d)
+						     (m, r*vn, vn,
+						      r, vn, m*vn,
+						      vn, 1, 1,
+						      b, np*vn, np*vn),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     if (XM(any_true)(!cld1, p->comm)) goto nada;
+
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
+
+     b = XM(block)(p->nx, r * p->block, me / r);
+     MPI_Comm_split(p->comm, me / r, me, &comm2);
+     if (b)
+	  cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
+			      (b, p->ny, p->vn,
+			       O, I, p->block, m * p->tblock, comm2, 
+			       p->I != p->O
+			       ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
+     MPI_Comm_free(&comm2);
+     if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
+     
+     b = XM(block)(p->ny, m * p->tblock, me % r);
+     MPI_Comm_split(p->comm, me % r, me, &comm2);
+     if (b)
+	  cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
+			      (p->nx, b, p->vn,
+			       I, O, r * p->block, p->tblock, comm2, 
+			       TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
+     MPI_Comm_free(&comm2);
+     if (XM(any_true)(b && !cldtm, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cldtr = cldtr;
+     pln->cldtm = cldtm;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->r = r;
+     pln->nam = ego->nam;
+
+     pln->super.super.ops = cld1->ops;
+     if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
+     if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldtm);
+     X(plan_destroy_internal)(cldtr);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input,
+			int (*radix)(int np), const char *nam)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     slv->radix = radix;
+     slv->nam = nam;
+     return &(slv->super);
+}
+
+void XM(transpose_recurse_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
+	  REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
+	  REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/transpose-solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/transpose-solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-transpose.h"
+
+/* use the apply() operation for MPI_TRANSPOSE problems */
+void XM(transpose_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_transpose *ego = (const plan_mpi_transpose *) ego_;
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/mpi/wisdom-api.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/mpi/wisdom-api.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+#include <string.h>
+
+#if SIZEOF_SIZE_T == SIZEOF_UNSIGNED_INT
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED
+#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
+#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG_LONG
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG
+#else
+#  error MPI type for size_t is unknown
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
+#endif
+
+/* Import wisdom from all processes to process 0, as prelude to
+   exporting a single wisdom file (this is convenient when we are
+   running on identical processors, to avoid the annoyance of having
+   per-process wisdom files).  In order to make the time for this
+   operation logarithmic in the number of processors (rather than
+   linear), we employ a tree reduction algorithm.  This means that the
+   wisdom is modified on processes other than root, which shouldn't
+   matter in practice. */
+void XM(gather_wisdom)(MPI_Comm comm_)
+{
+     MPI_Comm comm, comm2;
+     int my_pe, n_pes;
+     char *wis;
+     size_t wislen;
+     MPI_Status status;
+
+     MPI_Comm_dup(comm_, &comm);
+     MPI_Comm_rank(comm, &my_pe);
+     MPI_Comm_size(comm, &n_pes);
+
+     if (n_pes > 2) { /* recursively split into even/odd processes */
+	  MPI_Comm_split(comm, my_pe % 2, my_pe, &comm2);
+	  XM(gather_wisdom)(comm2);
+	  MPI_Comm_free(&comm2);
+     }
+     if (n_pes > 1 && my_pe < 2) { /* import process 1 -> 0 */
+	  if (my_pe == 1) {
+	       wis = X(export_wisdom_to_string)();
+	       wislen = strlen(wis) + 1;
+	       MPI_Send(&wislen, 1, FFTW_MPI_SIZE_T, 0, 111, comm);
+	       MPI_Send(wis, wislen, MPI_CHAR, 0, 222, comm);
+	       free(wis);
+	  }
+	  else /* my_pe == 0 */ {
+	       MPI_Recv(&wislen, 1, FFTW_MPI_SIZE_T, 1, 111, comm, &status);
+	       wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
+	       MPI_Recv(wis, wislen, MPI_CHAR, 1, 222, comm, &status);
+	       if (!X(import_wisdom_from_string)(wis))
+		    MPI_Abort(comm, 1);
+	       X(ifree)(wis);
+	  }
+     }
+     MPI_Comm_free(&comm);
+}
+
+/* broadcast wisdom from process 0 to all other processes; this
+   is useful so that we can import wisdom once and not worry
+   about parallel I/O or process-specific wisdom, although of
+   course it assumes that all the processes have identical
+   performance characteristics (i.e. identical hardware). */
+void XM(broadcast_wisdom)(MPI_Comm comm_)
+{
+     MPI_Comm comm;
+     int my_pe;
+     char *wis;
+     size_t wislen;
+
+     MPI_Comm_dup(comm_, &comm);
+     MPI_Comm_rank(comm, &my_pe);
+
+     if (my_pe != 0) {
+	  MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
+	  wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
+	  MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
+	  if (!X(import_wisdom_from_string)(wis))
+	       MPI_Abort(comm, 1);
+	  X(ifree)(wis);
+     }
+     else /* my_pe == 0 */ {
+	  wis = X(export_wisdom_to_string)();
+	  wislen = strlen(wis) + 1;
+	  MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
+	  MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
+	  X(free)(wis);
+     }
+     MPI_Comm_free(&comm);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,18 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft
+SUBDIRS = scalar simd
+
+noinst_LTLIBRARIES = librdft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = codelet-rdft.h rdft.h
+
+RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c	\
+plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c		\
+rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c	\
+khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
+
+librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c	\
+buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c	\
+hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c	\
+plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c		\
+vrank-geq1.c vrank3-transpose.c $(RDFT2)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,746 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = rdft
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_la_LIBADD =
+am__objects_1 = buffered2.lo direct2.lo nop2.lo rank0-rdft2.lo \
+	rank-geq2-rdft2.lo plan2.lo problem2.lo solve2.lo \
+	vrank-geq1-rdft2.lo rdft2-rdft.lo rdft2-tensor-max-index.lo \
+	rdft2-inplace-strides.lo rdft2-strides.lo khc2c.lo ct-hc2c.lo \
+	ct-hc2c-direct.lo
+am_librdft_la_OBJECTS = hc2hc.lo dft-r2hc.lo dht-r2hc.lo dht-rader.lo \
+	buffered.lo conf.lo direct-r2r.lo direct-r2c.lo generic.lo \
+	hc2hc-direct.lo hc2hc-generic.lo khc2hc.lo kr2c.lo kr2r.lo \
+	indirect.lo nop.lo plan.lo problem.lo rank0.lo rank-geq2.lo \
+	rdft-dht.lo solve.lo vrank-geq1.lo vrank3-transpose.lo \
+	$(am__objects_1)
+librdft_la_OBJECTS = $(am_librdft_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_la_SOURCES)
+DIST_SOURCES = $(librdft_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft
+SUBDIRS = scalar simd
+noinst_LTLIBRARIES = librdft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = codelet-rdft.h rdft.h
+RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c	\
+plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c		\
+rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c	\
+khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
+
+librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c	\
+buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c	\
+hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c	\
+plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c		\
+vrank-geq1.c vrank3-transpose.c $(RDFT2)
+
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft.la: $(librdft_la_OBJECTS) $(librdft_la_DEPENDENCIES) $(EXTRA_librdft_la_DEPENDENCIES) 
+	$(LINK)  $(librdft_la_OBJECTS) $(librdft_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c-direct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-r2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-r2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-rader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-direct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-generic.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2r.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2-rdft2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0-rdft2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-dht.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-inplace-strides.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-rdft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-strides.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-tensor-max-index.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1-rdft2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank3-transpose.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES)
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/buffered.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/buffered.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     int maxnbuf_ndx;
+} S;
+
+static const INT maxnbufs[] = { 8, 256 };
+
+typedef struct {
+     plan_rdft super;
+
+     plan *cld, *cldcpy, *cldrest;
+     INT n, vl, nbuf, bufdist;
+     INT ivs_by_nbuf, ovs_by_nbuf;
+} P;
+
+/* transform a vector input with the help of bufs */
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+     plan_rdft *cldrest;
+     INT i, vl = ego->vl, nbuf = ego->nbuf;
+     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
+     R *bufs;
+
+     bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* transform to bufs: */
+          cld->apply((plan *) cld, I, bufs);
+	  I += ivs_by_nbuf;
+
+          /* copy back */
+          cldcpy->apply((plan *) cldcpy, bufs, O);
+	  O += ovs_by_nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, I, O);
+}
+
+/* for hc2r problems, copy the input into buffer, and then
+   transform buffer->output, which allows for destruction of the
+   buffer */
+static void apply_hc2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+     plan_rdft *cldrest;
+     INT i, vl = ego->vl, nbuf = ego->nbuf;
+     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
+     R *bufs;
+
+     bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* copy input into bufs: */
+          cldcpy->apply((plan *) cldcpy, I, bufs);
+	  I += ivs_by_nbuf;
+
+          /* transform to output */
+          cld->apply((plan *) cld, bufs, O);
+	  O += ovs_by_nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, I, O);
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldcpy, wakefulness);
+     X(plan_awake)(ego->cldrest, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldrest);
+     X(plan_destroy_internal)(ego->cldcpy);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rdft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
+              ego->n, ego->nbuf,
+              ego->vl, ego->bufdist % ego->n,
+              ego->cld, ego->cldcpy, ego->cldrest);
+}
+
+static int applicable0(const S *ego, const problem *p_, const planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     iodim *d = p->sz->dims;
+
+     if (1
+	 && p->vecsz->rnk <= 1
+	 && p->sz->rnk == 1
+	  ) {
+	  INT vl, ivs, ovs;
+	  X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+	  if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
+	       return 0;
+
+	  /* if this solver is redundant, in the sense that a solver
+	     of lower index generates the same plan, then prune this
+	     solver */
+	  if (X(nbuf_redundant)(d[0].n, vl,
+				ego->maxnbuf_ndx,
+				maxnbufs, NELEM(maxnbufs)))
+	       return 0;
+
+	  if (p->I != p->O) {
+	       if (p->kind[0] == HC2R) {
+		    /* Allow HC2R problems only if the input is to be
+		       preserved.  This solver sets NO_DESTROY_INPUT,
+		       which prevents infinite loops */
+		    return (NO_DESTROY_INPUTP(plnr));
+	       } else {
+		    /*
+		      In principle, the buffered transforms might be useful
+		      when working out of place.  However, in order to
+		      prevent infinite loops in the planner, we require
+		      that the output stride of the buffered transforms be
+		      greater than 1.
+		    */
+		    return (d[0].os > 1);
+	       }
+	  }
+
+	  /*
+	   * If the problem is in place, the input/output strides must
+	   * be the same or the whole thing must fit in the buffer.
+	   */
+	  if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
+	       return 1;
+
+	  if (/* fits into buffer: */
+	       ((p->vecsz->rnk == 0)
+		||
+		(X(nbuf)(d[0].n, p->vecsz->dims[0].n, 
+			 maxnbufs[ego->maxnbuf_ndx]) 
+		 == p->vecsz->dims[0].n)))
+	       return 1;
+     }
+
+     return 0;
+}
+
+static int applicable(const S *ego, const problem *p_, const planner *plnr)
+{
+     const problem_rdft *p;
+
+     if (NO_BUFFERINGP(plnr)) return 0;
+
+     if (!applicable0(ego, p_, plnr)) return 0;
+
+     p = (const problem_rdft *) p_;
+     if (p->kind[0] == HC2R) {
+	  if (NO_UGLYP(plnr)) {
+	       /* UGLY if in-place and too big, since the problem
+		  could be solved via transpositions */
+	       if (p->I == p->O && X(toobig)(p->sz->dims[0].n)) 
+		    return 0;
+	  }
+     } else {
+	  if (NO_UGLYP(plnr)) {
+	       if (p->I != p->O) return 0;
+	       if (X(toobig)(p->sz->dims[0].n)) return 0;
+	  }
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const S *ego = (const S *)ego_;
+     plan *cld = (plan *) 0;
+     plan *cldcpy = (plan *) 0;
+     plan *cldrest = (plan *) 0;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     R *bufs = (R *) 0;
+     INT nbuf = 0, bufdist, n, vl;
+     INT ivs, ovs;
+     int hc2rp;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          goto nada;
+
+     n = X(tensor_sz)(p->sz);
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+     hc2rp = (p->kind[0] == HC2R);
+
+     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
+     bufdist = X(bufdist)(n, vl);
+     A(nbuf > 0);
+
+     /* initial allocation for the purpose of planning */
+     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
+
+     if (hc2rp) {
+	  /* allow destruction of buffer */
+	  cld = X(mkplan_f_d)(plnr, 
+			      X(mkproblem_rdft_d)(
+				   X(mktensor_1d)(n, 1, p->sz->dims[0].os),
+				   X(mktensor_1d)(nbuf, bufdist, ovs),
+				   bufs, TAINT(p->O, ovs * nbuf), p->kind),
+			      0, 0, NO_DESTROY_INPUT);
+	  if (!cld) goto nada;
+
+	  /* copying input into buffer buffer is a rank-0 transform: */
+	  cldcpy = X(mkplan_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(
+				    X(mktensor_2d)(nbuf, ivs, bufdist,
+						   n, p->sz->dims[0].is, 1),
+				    TAINT(p->I, ivs * nbuf), bufs));
+	  if (!cldcpy) goto nada;
+     } else {
+	  /* allow destruction of input if problem is in place */
+	  cld = X(mkplan_f_d)(plnr, 
+			      X(mkproblem_rdft_d)(
+				   X(mktensor_1d)(n, p->sz->dims[0].is, 1),
+				   X(mktensor_1d)(nbuf, ivs, bufdist),
+				   TAINT(p->I, ivs * nbuf), bufs, p->kind),
+			      0, 0, (p->I == p->O) ? NO_DESTROY_INPUT : 0);
+	  if (!cld) goto nada;
+
+	  /* copying back from the buffer is a rank-0 transform: */
+	  cldcpy = X(mkplan_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(
+				    X(mktensor_2d)(nbuf, bufdist, ovs,
+						   n, 1, p->sz->dims[0].os),
+				    bufs, TAINT(p->O, ovs * nbuf)));
+	  if (!cldcpy) goto nada;
+     }
+
+     /* deallocate buffers, let apply() allocate them for real */
+     X(ifree)(bufs);
+     bufs = 0;
+
+     /* plan the leftover transforms (cldrest): */
+     {
+	  INT id = ivs * (nbuf * (vl / nbuf));
+	  INT od = ovs * (nbuf * (vl / nbuf));
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->I + id, p->O + od, p->kind));
+     }
+     if (!cldrest) goto nada;
+
+     pln = MKPLAN_RDFT(P, &padt, hc2rp ? apply_hc2r : apply);
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->cldrest = cldrest;
+     pln->n = n;
+     pln->vl = vl;
+     pln->ivs_by_nbuf = ivs * nbuf;
+     pln->ovs_by_nbuf = ovs * nbuf;
+
+     pln->nbuf = nbuf;
+     pln->bufdist = bufdist;
+
+     {
+	  opcnt t;
+	  X(ops_add)(&cld->ops, &cldcpy->ops, &t);
+	  X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(bufs);
+     X(plan_destroy_internal)(cldrest);
+     X(plan_destroy_internal)(cldcpy);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int maxnbuf_ndx)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->maxnbuf_ndx = maxnbuf_ndx;
+     return &(slv->super);
+}
+
+void X(rdft_buffered_register)(planner *p)
+{
+     size_t i;
+     for (i = 0; i < NELEM(maxnbufs); ++i)
+	  REGISTER_SOLVER(p, mksolver(i));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/buffered2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/buffered2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* buffering of rdft2.  We always buffer the complex array */
+
+#include "rdft.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int maxnbuf_ndx;
+} S;
+
+static const INT maxnbufs[] = { 8, 256 };
+
+typedef struct {
+     plan_rdft2 super;
+
+     plan *cld, *cldcpy, *cldrest;
+     INT n, vl, nbuf, bufdist;
+     INT ivs_by_nbuf, ovs_by_nbuf;
+     INT ioffset, roffset;
+} P;
+
+/* transform a vector input with the help of bufs */
+static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
+     plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
+     INT i, vl = ego->vl, nbuf = ego->nbuf;
+     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
+     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
+     R *bufr = bufs + ego->roffset;
+     R *bufi = bufs + ego->ioffset;
+     plan_rdft2 *cldrest;
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* transform to bufs: */
+          cld->apply((plan *) cld, r0, r1, bufr, bufi);
+	  r0 += ivs_by_nbuf; r1 += ivs_by_nbuf;
+
+          /* copy back */
+          cldcpy->apply((plan *) cldcpy, bufr, bufi, cr, ci);
+	  cr += ovs_by_nbuf; ci += ovs_by_nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft2 *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
+}
+
+/* for hc2r problems, copy the input into buffer, and then
+   transform buffer->output, which allows for destruction of the
+   buffer */
+static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
+     plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
+     INT i, vl = ego->vl, nbuf = ego->nbuf;
+     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
+     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
+     R *bufr = bufs + ego->roffset;
+     R *bufi = bufs + ego->ioffset;
+     plan_rdft2 *cldrest;
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* copy input into bufs: */
+          cldcpy->apply((plan *) cldcpy, cr, ci, bufr, bufi);
+	  cr += ivs_by_nbuf; ci += ivs_by_nbuf;
+
+          /* transform to output */
+          cld->apply((plan *) cld, r0, r1, bufr, bufi);
+	  r0 += ovs_by_nbuf; r1 += ovs_by_nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft2 *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldcpy, wakefulness);
+     X(plan_awake)(ego->cldrest, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldrest);
+     X(plan_destroy_internal)(ego->cldcpy);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rdft2-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
+              ego->n, ego->nbuf,
+              ego->vl, ego->bufdist % ego->n,
+              ego->cld, ego->cldcpy, ego->cldrest);
+}
+
+static int applicable0(const S *ego, const problem *p_, const planner *plnr)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     iodim *d = p->sz->dims;
+
+     if (1
+	 && p->vecsz->rnk <= 1
+	 && p->sz->rnk == 1
+
+	 /* we assume even n throughout */
+	 && (d[0].n % 2) == 0
+
+	 /* and we only consider these two cases */
+	 && (p->kind == R2HC || p->kind == HC2R)
+
+	  ) {
+	  INT vl, ivs, ovs;
+	  X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+	  if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
+	       return 0;
+
+	  /* if this solver is redundant, in the sense that a solver
+	     of lower index generates the same plan, then prune this
+	     solver */
+	  if (X(nbuf_redundant)(d[0].n, vl,
+				ego->maxnbuf_ndx,
+				maxnbufs, NELEM(maxnbufs)))
+	       return 0;
+
+	  if (p->r0 != p->cr) {
+	       if (p->kind == HC2R) {
+		    /* Allow HC2R problems only if the input is to be
+		       preserved.  This solver sets NO_DESTROY_INPUT,
+		       which prevents infinite loops */
+		    return (NO_DESTROY_INPUTP(plnr));
+	       } else {
+		    /*
+		      In principle, the buffered transforms might be useful
+		      when working out of place.  However, in order to
+		      prevent infinite loops in the planner, we require
+		      that the output stride of the buffered transforms be
+		      greater than 2.
+		    */
+		    return (d[0].os > 2);
+	       }
+	  }
+
+	  /*
+	   * If the problem is in place, the input/output strides must
+	   * be the same or the whole thing must fit in the buffer.
+	   */
+	  if (X(rdft2_inplace_strides(p, RNK_MINFTY)))
+	       return 1;
+
+	  if (/* fits into buffer: */
+	       ((p->vecsz->rnk == 0)
+		||
+		(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
+			 maxnbufs[ego->maxnbuf_ndx])
+		 == p->vecsz->dims[0].n)))
+	       return 1;
+     }
+
+     return 0;
+}
+
+static int applicable(const S *ego, const problem *p_, const planner *plnr)
+{
+     const problem_rdft2 *p;
+
+     if (NO_BUFFERINGP(plnr)) return 0;
+
+     if (!applicable0(ego, p_, plnr)) return 0;
+
+     p = (const problem_rdft2 *) p_;
+     if (p->kind == HC2R) {
+	  if (NO_UGLYP(plnr)) {
+	       /* UGLY if in-place and too big, since the problem
+		  could be solved via transpositions */
+	       if (p->r0 == p->cr && X(toobig)(p->sz->dims[0].n)) 
+		    return 0;
+	  }
+     } else {
+	  if (NO_UGLYP(plnr)) {
+	       if (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n))
+		    return 0;
+	  }
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const S *ego = (const S *)ego_;
+     plan *cld = (plan *) 0;
+     plan *cldcpy = (plan *) 0;
+     plan *cldrest = (plan *) 0;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     R *bufs = (R *) 0;
+     INT nbuf = 0, bufdist, n, vl;
+     INT ivs, ovs, ioffset, roffset, id, od;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          goto nada;
+
+     n = X(tensor_sz)(p->sz);
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
+     bufdist = X(bufdist)(n + 2, vl); /* complex-side rdft2 stores N+2
+					 real numbers */
+     A(nbuf > 0);
+
+     /* attempt to keep real and imaginary part in the same order,
+	so as to allow optimizations in the the copy plan */
+     roffset = (p->cr - p->ci > 0) ? (INT)1 : (INT)0;
+     ioffset = 1 - roffset;
+
+     /* initial allocation for the purpose of planning */
+     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
+
+     id = ivs * (nbuf * (vl / nbuf));
+     od = ovs * (nbuf * (vl / nbuf));
+
+     if (p->kind == R2HC) {
+	  /* allow destruction of input if problem is in place */
+	  cld = X(mkplan_f_d)(
+	       plnr, 
+	       X(mkproblem_rdft2_d)(
+		    X(mktensor_1d)(n, p->sz->dims[0].is, 2),
+		    X(mktensor_1d)(nbuf, ivs, bufdist),
+		    TAINT(p->r0, ivs * nbuf), TAINT(p->r1, ivs * nbuf),
+		    bufs + roffset, bufs + ioffset, p->kind),
+	       0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
+	  if (!cld) goto nada;
+
+	  /* copying back from the buffer is a rank-0 DFT: */
+	  cldcpy = X(mkplan_d)(
+	       plnr, 
+	       X(mkproblem_dft_d)(
+		    X(mktensor_0d)(),
+		    X(mktensor_2d)(nbuf, bufdist, ovs,
+				   n/2+1, 2, p->sz->dims[0].os),
+		    bufs + roffset, bufs + ioffset,
+		    TAINT(p->cr, ovs * nbuf), TAINT(p->ci, ovs * nbuf) ));
+	  if (!cldcpy) goto nada;
+
+	  X(ifree)(bufs); bufs = 0;
+
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft2_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->r0 + id, p->r1 + id, 
+				     p->cr + od, p->ci + od,
+				     p->kind));
+	  if (!cldrest) goto nada;
+	  pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
+     } else {
+	  /* allow destruction of buffer */
+	  cld = X(mkplan_f_d)(
+	       plnr, 
+	       X(mkproblem_rdft2_d)(
+		    X(mktensor_1d)(n, 2, p->sz->dims[0].os),
+		    X(mktensor_1d)(nbuf, bufdist, ovs),
+		    TAINT(p->r0, ovs * nbuf), TAINT(p->r1, ovs * nbuf),
+		    bufs + roffset, bufs + ioffset, p->kind),
+	       0, 0, NO_DESTROY_INPUT);
+	  if (!cld) goto nada;
+
+	  /* copying input into buffer is a rank-0 DFT: */
+	  cldcpy = X(mkplan_d)(
+	       plnr, 
+	       X(mkproblem_dft_d)(
+		    X(mktensor_0d)(),
+		    X(mktensor_2d)(nbuf, ivs, bufdist,
+				   n/2+1, p->sz->dims[0].is, 2),
+		    TAINT(p->cr, ivs * nbuf), TAINT(p->ci, ivs * nbuf), 
+		    bufs + roffset, bufs + ioffset));
+	  if (!cldcpy) goto nada;
+
+	  X(ifree)(bufs); bufs = 0;
+
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft2_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->r0 + od, p->r1 + od, 
+				     p->cr + id, p->ci + id,
+				     p->kind));
+	  if (!cldrest) goto nada;
+
+	  pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);
+     }
+
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->cldrest = cldrest;
+     pln->n = n;
+     pln->vl = vl;
+     pln->ivs_by_nbuf = ivs * nbuf;
+     pln->ovs_by_nbuf = ovs * nbuf;
+     pln->roffset = roffset;
+     pln->ioffset = ioffset;
+
+     pln->nbuf = nbuf;
+     pln->bufdist = bufdist;
+
+     {
+	  opcnt t;
+	  X(ops_add)(&cld->ops, &cldcpy->ops, &t);
+	  X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(bufs);
+     X(plan_destroy_internal)(cldrest);
+     X(plan_destroy_internal)(cldcpy);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int maxnbuf_ndx)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->maxnbuf_ndx = maxnbuf_ndx;
+     return &(slv->super);
+}
+
+void X(rdft2_buffered_register)(planner *p)
+{
+     size_t i;
+     for (i = 0; i < NELEM(maxnbufs); ++i)
+	  REGISTER_SOLVER(p, mksolver(i));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/codelet-rdft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/codelet-rdft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/*
+ * This header file must include every file or define every
+ * type or macro which is required to compile a codelet.
+ */
+
+#ifndef __RDFT_CODELET_H__
+#define __RDFT_CODELET_H__
+
+#include "ifftw.h"
+
+/**************************************************************
+ * types of codelets
+ **************************************************************/
+
+/* FOOab, with a,b in {0,1}, denotes the FOO transform
+   where a/b say whether the input/output are shifted by
+   half a sample/slot. */
+
+typedef enum {
+     R2HC00, R2HC01, R2HC10, R2HC11,
+     HC2R00, HC2R01, HC2R10, HC2R11,
+     DHT, 
+     REDFT00, REDFT01, REDFT10, REDFT11, /* real-even == DCT's */
+     RODFT00, RODFT01, RODFT10, RODFT11  /*  real-odd == DST's */
+} rdft_kind;
+
+/* standard R2HC/HC2R transforms are unshifted */
+#define R2HC R2HC00
+#define HC2R HC2R00
+
+#define R2HCII R2HC01
+#define HC2RIII HC2R10
+
+/* (k) >= R2HC00 produces a warning under gcc because checking x >= 0
+   is superfluous for unsigned values...but it is needed because other
+   compilers (e.g. icc) may define the enum to be a signed int...grrr. */
+#define R2HC_KINDP(k) ((k) >= R2HC00 && (k) <= R2HC11) /* uses kr2hc_genus */
+#define HC2R_KINDP(k) ((k) >= HC2R00 && (k) <= HC2R11) /* uses khc2r_genus */
+
+#define R2R_KINDP(k) ((k) >= DHT) /* uses kr2r_genus */
+
+#define REDFT_KINDP(k) ((k) >= REDFT00 && (k) <= REDFT11)
+#define RODFT_KINDP(k) ((k) >= RODFT00 && (k) <= RODFT11)
+#define REODFT_KINDP(k) ((k) >= REDFT00 && (k) <= RODFT11)
+
+/* codelets with real input (output) and complex output (input) */
+typedef struct kr2c_desc_s kr2c_desc;
+
+typedef struct {
+     rdft_kind kind;
+     INT vl;
+} kr2c_genus;
+
+struct kr2c_desc_s {
+     INT n;    /* size of transform computed */
+     const char *nam;
+     opcnt ops;
+     const kr2c_genus *genus;
+};
+
+typedef void (*kr2c) (R *R0, R *R1, R *Cr, R *Ci,
+		      stride rs, stride csr, stride csi,
+		      INT vl, INT ivs, INT ovs);
+void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc);
+
+/* half-complex to half-complex DIT/DIF codelets: */
+typedef struct hc2hc_desc_s hc2hc_desc;
+
+typedef struct {
+     rdft_kind kind;
+     INT vl;
+} hc2hc_genus;
+
+struct hc2hc_desc_s {
+     INT radix;
+     const char *nam;
+     const tw_instr *tw;
+     const hc2hc_genus *genus;
+     opcnt ops;
+};
+
+typedef void (*khc2hc) (R *rioarray, R *iioarray, const R *W,
+			stride rs, INT mb, INT me, INT ms);
+void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc);
+
+/* half-complex to rdft2-complex DIT/DIF codelets: */
+typedef struct hc2c_desc_s hc2c_desc;
+
+typedef enum {
+     HC2C_VIA_RDFT,
+     HC2C_VIA_DFT
+} hc2c_kind;
+
+typedef struct {
+     int (*okp)(
+	  const R *Rp, const R *Ip, const R *Rm, const R *Im, 
+	  INT rs, INT mb, INT me, INT ms, 
+	  const planner *plnr);
+     rdft_kind kind;
+     INT vl;
+} hc2c_genus;
+
+struct hc2c_desc_s {
+     INT radix;
+     const char *nam;
+     const tw_instr *tw;
+     const hc2c_genus *genus;
+     opcnt ops;
+};
+
+typedef void (*khc2c) (R *Rp, R *Ip, R *Rm, R *Im, const R *W,
+		       stride rs, INT mb, INT me, INT ms);
+void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
+		       hc2c_kind hc2ckind);
+
+extern const solvtab X(solvtab_rdft_r2cf);
+extern const solvtab X(solvtab_rdft_r2cb);
+extern const solvtab X(solvtab_rdft_sse2);
+extern const solvtab X(solvtab_rdft_avx);
+extern const solvtab X(solvtab_rdft_altivec);
+extern const solvtab X(solvtab_rdft_neon);
+
+/* real-input & output DFT-like codelets (DHT, etc.) */
+typedef struct kr2r_desc_s kr2r_desc;
+
+typedef struct {
+     INT vl;
+} kr2r_genus;
+
+struct kr2r_desc_s {
+     INT n;    /* size of transform computed */
+     const char *nam;
+     opcnt ops;
+     const kr2r_genus *genus;
+     rdft_kind kind;
+};
+
+typedef void (*kr2r) (const R *I, R *O, stride is, stride os,
+		      INT vl, INT ivs, INT ovs);
+void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc);
+
+extern const solvtab X(solvtab_rdft_r2r);
+
+#endif				/* __RDFT_CODELET_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/conf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/conf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+static const solvtab s =
+{
+     SOLVTAB(X(rdft_indirect_register)),
+     SOLVTAB(X(rdft_rank0_register)),
+     SOLVTAB(X(rdft_vrank3_transpose_register)),
+     SOLVTAB(X(rdft_vrank_geq1_register)),
+
+     SOLVTAB(X(rdft_nop_register)),
+     SOLVTAB(X(rdft_buffered_register)),
+     SOLVTAB(X(rdft_generic_register)),
+     SOLVTAB(X(rdft_rank_geq2_register)),
+
+     SOLVTAB(X(dft_r2hc_register)),
+
+     SOLVTAB(X(rdft_dht_register)),
+     SOLVTAB(X(dht_r2hc_register)),
+     SOLVTAB(X(dht_rader_register)),
+
+     SOLVTAB(X(rdft2_vrank_geq1_register)),
+     SOLVTAB(X(rdft2_nop_register)),
+     SOLVTAB(X(rdft2_rank0_register)),
+     SOLVTAB(X(rdft2_buffered_register)),
+     SOLVTAB(X(rdft2_rank_geq2_register)),
+     SOLVTAB(X(rdft2_rdft_register)),
+
+     SOLVTAB(X(hc2hc_generic_register)),
+
+     SOLVTAB_END
+};
+
+void X(rdft_conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+     X(solvtab_exec)(X(solvtab_rdft_r2cf), p);
+     X(solvtab_exec)(X(solvtab_rdft_r2cb), p);
+     X(solvtab_exec)(X(solvtab_rdft_r2r), p);
+
+#if HAVE_SSE2
+     if (X(have_simd_sse2)())
+	  X(solvtab_exec)(X(solvtab_rdft_sse2), p);
+#endif
+#if HAVE_AVX
+     if (X(have_simd_avx)())
+	  X(solvtab_exec)(X(solvtab_rdft_avx), p);
+#endif
+#if HAVE_ALTIVEC
+     if (X(have_simd_altivec)())
+	  X(solvtab_exec)(X(solvtab_rdft_altivec), p);
+#endif
+#if HAVE_NEON
+     if (X(have_simd_neon)())
+	  X(solvtab_exec)(X(solvtab_rdft_neon), p);
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/ct-hc2c-direct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/ct-hc2c-direct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct-hc2c.h"
+
+typedef struct {
+     hc2c_solver super;
+     const hc2c_desc *desc;
+     int bufferedp;
+     khc2c k;
+} S;
+
+typedef struct {
+     plan_hc2c super;
+     khc2c k;
+     plan *cld0, *cldm; /* children for 0th and middle butterflies */
+     INT r, m, v, extra_iter;
+     INT ms, vs;
+     stride rs, brs;
+     twid *td;
+     const S *slv;
+} P;
+
+/*************************************************************
+  Nonbuffered code
+ *************************************************************/
+static void apply(const plan *ego_, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
+     plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
+     INT i, m = ego->m, v = ego->v;
+     INT ms = ego->ms, vs = ego->vs;
+
+     for (i = 0; i < v; ++i, cr += vs, ci += vs) {
+	  cld0->apply((plan *) cld0, cr, ci, cr, ci);
+	  ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+		 ego->td->W, ego->rs, 1, (m+1)/2, ms);
+	  cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms, 
+		      cr + (m/2)*ms, ci + (m/2)*ms);
+     }
+}
+
+static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
+     plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
+     INT i, m = ego->m, v = ego->v;
+     INT ms = ego->ms, vs = ego->vs;
+     INT mm = (m-1)/2;
+
+     for (i = 0; i < v; ++i, cr += vs, ci += vs) {
+	  cld0->apply((plan *) cld0, cr, ci, cr, ci);
+
+	  /* for 4-way SIMD when (m+1)/2-1 is odd: iterate over an
+	     even vector length MM-1, and then execute the last
+	     iteration as a 2-vector with vector stride 0.  The
+	     twiddle factors of the second half of the last iteration
+	     are bogus, but we only store the results of the first
+	     half. */
+	  ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+		 ego->td->W, ego->rs, 1, mm, ms);
+	  ego->k(cr + mm*ms, ci + mm*ms, cr + (m-mm)*ms, ci + (m-mm)*ms,
+		 ego->td->W, ego->rs, mm, mm+2, 0);
+	  cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms, 
+		      cr + (m/2)*ms, ci + (m/2)*ms);
+     }
+
+}
+
+/*************************************************************
+  Buffered code
+ *************************************************************/
+
+/* should not be 2^k to avoid associativity conflicts */
+static INT compute_batchsize(INT radix)
+{
+     /* round up to multiple of 4 */
+     radix += 3;
+     radix &= -4;
+
+     return (radix + 2);
+}
+
+static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
+		    INT mb, INT me, INT extra_iter, R *bufp)
+{
+     INT b = WS(ego->brs, 1);
+     INT rs = WS(ego->rs, 1);
+     INT ms = ego->ms;
+     R *bufm = bufp + b - 2;
+
+     X(cpy2d_pair_ci)(Rp + mb * ms, Ip + mb * ms, bufp, bufp + 1,
+		      ego->r / 2, rs, b,
+		      me - mb, ms, 2);
+     X(cpy2d_pair_ci)(Rm - mb * ms, Im - mb * ms, bufm, bufm + 1,
+		      ego->r / 2, rs, b,
+		      me - mb, -ms, -2);
+     ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W, 
+	    ego->brs, mb, me + extra_iter, 2);
+     X(cpy2d_pair_co)(bufp, bufp + 1, Rp + mb * ms, Ip + mb * ms, 
+		      ego->r / 2, b, rs,
+		      me - mb, 2, ms);
+     X(cpy2d_pair_co)(bufm, bufm + 1, Rm - mb * ms, Im - mb * ms,
+		      ego->r / 2, b, rs,
+		      me - mb, -2, -ms);
+}
+
+static void apply_buf(const plan *ego_, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
+     plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
+     INT i, j, ms = ego->ms, v = ego->v;
+     INT batchsz = compute_batchsize(ego->r);
+     R *buf;
+     INT mb = 1, me = (ego->m+1) / 2;
+     size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
+
+     BUF_ALLOC(R *, buf, bufsz);
+
+     for (i = 0; i < v; ++i, cr += ego->vs, ci += ego->vs) {
+	  R *Rp = cr;
+	  R *Ip = ci;
+	  R *Rm = cr + ego->m * ms;
+	  R *Im = ci + ego->m * ms;
+
+	  cld0->apply((plan *) cld0, Rp, Ip, Rp, Ip);
+
+	  for (j = mb; j + batchsz < me; j += batchsz) 
+	       dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);
+
+	  dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);
+
+	  cldm->apply((plan *) cldm, 
+		      Rp + me * ms, Ip + me * ms,
+		      Rp + me * ms, Ip + me * ms);
+
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+/*************************************************************
+  common code
+ *************************************************************/
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld0, wakefulness);
+     X(plan_awake)(ego->cldm, wakefulness);
+     X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw, 
+		      ego->r * ego->m, ego->r, 
+		      (ego->m - 1) / 2 + ego->extra_iter);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld0);
+     X(plan_destroy_internal)(ego->cldm);
+     X(stride_destroy)(ego->rs);
+     X(stride_destroy)(ego->brs);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *slv = ego->slv;
+     const hc2c_desc *e = slv->desc;
+
+     if (slv->bufferedp)
+	  p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
+		   compute_batchsize(ego->r),
+		   ego->r, X(twiddle_length)(ego->r, e->tw),
+		   ego->extra_iter, ego->v, e->nam, 
+		   ego->cld0, ego->cldm);
+     else
+	  p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
+		   ego->r, X(twiddle_length)(ego->r, e->tw), 
+		   ego->extra_iter, ego->v, e->nam, 
+		   ego->cld0, ego->cldm);
+}
+
+static int applicable0(const S *ego, rdft_kind kind,
+		       INT r, INT rs,
+		       INT m, INT ms, 
+		       INT v, INT vs,
+		       const R *cr, const R *ci,
+		       const planner *plnr,
+		       INT *extra_iter)
+{
+     const hc2c_desc *e = ego->desc;
+     UNUSED(v);
+
+     return (
+	  1
+	  && r == e->radix
+	  && kind == e->genus->kind
+
+	  /* first v-loop iteration */
+	  && ((*extra_iter = 0,
+	       e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+			     rs, 1, (m+1)/2, ms, plnr))
+	      ||
+	      (*extra_iter = 1,
+	       ((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+			       rs, 1, (m-1)/2, ms, plnr))
+		&&
+		(e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+			       rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
+	  
+	  /* subsequent v-loop iterations */
+	  && (cr += vs, ci += vs, 1)
+
+	  && e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
+			   rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
+	  );
+}
+
+static int applicable0_buf(const S *ego, rdft_kind kind,
+			   INT r, INT rs,
+			   INT m, INT ms, 
+			   INT v, INT vs,
+			   const R *cr, const R *ci,
+			   const planner *plnr, INT *extra_iter)
+{
+     const hc2c_desc *e = ego->desc;
+     INT batchsz, brs;
+     UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
+
+     return (
+	  1
+	  && r == e->radix
+	  && kind == e->genus->kind
+
+	  /* ignore cr, ci, use buffer */
+	  && (cr = (const R *)0, ci = cr + 1, 
+	      batchsz = compute_batchsize(r), 
+	      brs = 4 * batchsz, 1)
+
+	  && e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2, 
+			   brs, 1, 1+batchsz, 2, plnr)
+
+	  && ((*extra_iter = 0,
+	       e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2, 
+			     brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
+	      ||
+	      (*extra_iter = 1,
+	       e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2, 
+			     brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
+	      
+	  );
+}
+
+static int applicable(const S *ego, rdft_kind kind,
+		      INT r, INT rs,
+		      INT m, INT ms, 
+		      INT v, INT vs,
+		      R *cr, R *ci,
+		      const planner *plnr, INT *extra_iter)
+{
+     if (ego->bufferedp) {
+	  if (!applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
+			       extra_iter))
+	       return 0;
+     } else {
+	  if (!applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
+			   extra_iter))
+	       return 0;
+     }
+
+     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
+				       v, m * r, r))
+	  return 0;
+
+     return 1;
+}
+
+static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
+		    INT r, INT rs,
+		    INT m, INT ms, 
+		    INT v, INT vs,
+		    R *cr, R *ci,
+		    planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const hc2c_desc *e = ego->desc;
+     plan *cld0 = 0, *cldm = 0;
+     INT imid = (m / 2) * ms;
+     INT extra_iter;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr, 
+		     &extra_iter))
+          return (plan *)0;
+
+     cld0 = X(mkplan_d)(
+	  plnr, 
+	  X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
+			       X(mktensor_0d)(),
+			       TAINT(cr, vs), TAINT(ci, vs),
+			       TAINT(cr, vs), TAINT(ci, vs),
+			       kind));
+     if (!cld0) goto nada;
+
+     cldm = X(mkplan_d)(
+	  plnr, 
+	  X(mkproblem_rdft2_d)(((m % 2) ?
+				X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
+			       X(mktensor_0d)(),
+			       TAINT(cr + imid, vs), TAINT(ci + imid, vs),
+			       TAINT(cr + imid, vs), TAINT(ci + imid, vs),
+			       kind == R2HC ? R2HCII : HC2RIII));
+     if (!cldm) goto nada;
+
+     if (ego->bufferedp)
+	  pln = MKPLAN_HC2C(P, &padt, apply_buf);
+     else
+	  pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
+
+     pln->k = ego->k;
+     pln->td = 0;
+     pln->r = r; pln->rs = X(mkstride)(r, rs);
+     pln->m = m; pln->ms = ms;
+     pln->v = v; pln->vs = vs;
+     pln->slv = ego;
+     pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
+     pln->cld0 = cld0;
+     pln->cldm = cldm;
+     pln->extra_iter = extra_iter;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
+		  &e->ops, &pln->super.super.ops);
+     X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
+     X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
+
+     if (ego->bufferedp) 
+	  pln->super.super.ops.other += 4 * r * m * v;
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld0);
+     X(plan_destroy_internal)(cldm);
+     return 0;
+}
+
+static void regone(planner *plnr, khc2c codelet,
+		   const hc2c_desc *desc, 
+		   hc2c_kind hc2ckind, 
+		   int bufferedp)
+{
+     S *slv = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
+     slv->k = codelet;
+     slv->desc = desc;
+     slv->bufferedp = bufferedp;
+     REGISTER_SOLVER(plnr, &(slv->super.super));
+}
+
+void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
+			      const hc2c_desc *desc,
+			      hc2c_kind hc2ckind)
+{
+     regone(plnr, codelet, desc, hc2ckind, /* bufferedp */0);
+     regone(plnr, codelet, desc, hc2ckind, /* bufferedp */1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/ct-hc2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/ct-hc2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ct-hc2c.h"
+#include "dft.h"
+
+typedef struct {
+     plan_rdft2 super;
+     plan *cld;
+     plan *cldw;
+     INT r;
+} P;
+
+static void apply_dit(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+     plan_hc2c *cldw;
+     UNUSED(r1);
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, r0, cr);
+
+     cldw = (plan_hc2c *) ego->cldw;
+     cldw->apply(ego->cldw, cr, ci);
+}
+
+static void apply_dif(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+     plan_hc2c *cldw;
+     UNUSED(r1);
+
+     cldw = (plan_hc2c *) ego->cldw;
+     cldw->apply(ego->cldw, cr, ci);
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, cr, r0);
+}
+
+static void apply_dit_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_hc2c *cldw;
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, r0, r1, cr, ci);
+
+     cldw = (plan_hc2c *) ego->cldw;
+     cldw->apply(ego->cldw, cr, ci);
+}
+
+static void apply_dif_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_hc2c *cldw;
+
+     cldw = (plan_hc2c *) ego->cldw;
+     cldw->apply(ego->cldw, cr, ci);
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, ci, cr, r1, r0);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldw, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldw);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rdft2-ct-%s/%D%(%p%)%(%p%))",
+	      (ego->super.apply == apply_dit || 
+	       ego->super.apply == apply_dit_dft)
+	      ? "dit" : "dif",
+	      ego->r, ego->cldw, ego->cld);
+}
+
+static int applicable0(const hc2c_solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     INT r;
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1 
+
+	     && (/* either the problem is R2HC, which is solved by DIT */
+		  (p->kind == R2HC)
+		  ||
+		  /* or the problem is HC2R, in which case it is solved
+		     by DIF, which destroys the input */
+		  (p->kind == HC2R && 
+		   (p->r0 == p->cr || !NO_DESTROY_INPUTP(plnr))))
+		  
+	     && ((r = X(choose_radix)(ego->r, p->sz->dims[0].n)) > 0)
+	     && p->sz->dims[0].n > r);
+}
+
+int X(hc2c_applicable)(const hc2c_solver *ego, const problem *p_,
+		       planner *plnr)
+{
+     const problem_rdft2 *p;
+
+     if (!applicable0(ego, p_, plnr))
+          return 0;
+
+     p = (const problem_rdft2 *) p_;
+
+     return (0
+	     || p->vecsz->rnk == 0
+	     || !NO_VRECURSEP(plnr)
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const hc2c_solver *ego = (const hc2c_solver *) ego_;
+     const problem_rdft2 *p;
+     P *pln = 0;
+     plan *cld = 0, *cldw = 0;
+     INT n, r, m, v, ivs, ovs;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!X(hc2c_applicable)(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_rdft2 *) p_;
+     d = p->sz->dims;
+     n = d[0].n;
+     r = X(choose_radix)(ego->r, n);
+     A((r % 2) == 0);
+     m = n / r;
+
+     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
+
+     switch (p->kind) {
+	 case R2HC:
+	      cldw = ego->mkcldw(ego, R2HC, 
+				 r, m * d[0].os, 
+				 m, d[0].os,
+				 v, ovs,
+				 p->cr, p->ci, plnr);
+	      if (!cldw) goto nada;
+
+	      switch (ego->hc2ckind) {
+		  case HC2C_VIA_RDFT:
+		       cld = X(mkplan_d)(
+			    plnr, 
+			    X(mkproblem_rdft_1_d)(
+				 X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
+				 X(mktensor_3d)(
+				      2, p->r1 - p->r0, p->ci - p->cr,
+				      r / 2, d[0].is, m * d[0].os,
+				      v, ivs, ovs),
+				 p->r0, p->cr, R2HC) 
+			    );
+		       if (!cld) goto nada;
+
+		       pln = MKPLAN_RDFT2(P, &padt, apply_dit);
+		       break;
+
+		  case HC2C_VIA_DFT:
+		       cld = X(mkplan_d)(
+			    plnr, 
+			    X(mkproblem_dft_d)(
+				 X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
+				 X(mktensor_2d)(
+				      r / 2, d[0].is, m * d[0].os,
+				      v, ivs, ovs),
+				 p->r0, p->r1, p->cr, p->ci) 
+			    );
+		       if (!cld) goto nada;
+
+		       pln = MKPLAN_RDFT2(P, &padt, apply_dit_dft);
+		       break;
+	      }
+	      break;
+
+	 case HC2R:
+	      cldw = ego->mkcldw(ego, HC2R, 
+				 r, m * d[0].is, 
+				 m, d[0].is,
+				 v, ivs,
+				 p->cr, p->ci, plnr);
+	      if (!cldw) goto nada;
+
+	      switch (ego->hc2ckind) {
+		  case HC2C_VIA_RDFT:
+		       cld = X(mkplan_d)(
+			    plnr, 
+			    X(mkproblem_rdft_1_d)(
+				 X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
+				 X(mktensor_3d)(
+				      2, p->ci - p->cr, p->r1 - p->r0, 
+				      r / 2, m * d[0].is, d[0].os,
+				      v, ivs, ovs),
+				 p->cr, p->r0, HC2R) 
+			    );
+		       if (!cld) goto nada;
+
+		       pln = MKPLAN_RDFT2(P, &padt, apply_dif);
+		       break;
+
+		  case HC2C_VIA_DFT:
+		       cld = X(mkplan_d)(
+			    plnr, 
+			    X(mkproblem_dft_d)(
+				 X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
+				 X(mktensor_2d)(
+				      r / 2, m * d[0].is, d[0].os,
+				      v, ivs, ovs),
+				 p->ci, p->cr, p->r1, p->r0) 
+			    );
+		       if (!cld) goto nada;
+
+		       pln = MKPLAN_RDFT2(P, &padt, apply_dif_dft);
+		       break;
+	      }
+	      break;
+
+	 default: 
+	      A(0);
+     }
+
+     pln->cld = cld;
+     pln->cldw = cldw;
+     pln->r = r;
+     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);
+
+     /* inherit could_prune_now_p attribute from cldw */
+     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldw);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+hc2c_solver *X(mksolver_hc2c)(size_t size, INT r, 
+			      hc2c_kind hc2ckind,
+			      hc2c_mkinferior mkcldw)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     hc2c_solver *slv = (hc2c_solver *)X(mksolver)(size, &sadt);
+     slv->r = r;
+     slv->hc2ckind = hc2ckind;
+     slv->mkcldw = mkcldw;
+     return slv;
+}
+
+plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt, hc2capply apply)
+{
+     plan_hc2c *ego;
+
+     ego = (plan_hc2c *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/ct-hc2c.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/ct-hc2c.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "rdft.h"
+
+typedef void (*hc2capply) (const plan *ego, R *cr, R *ci);
+typedef struct hc2c_solver_s hc2c_solver;
+typedef plan *(*hc2c_mkinferior)(const hc2c_solver *ego, rdft_kind kind,
+				 INT r, INT rs,
+				 INT m, INT ms, 
+				 INT v, INT vs,
+				 R *cr, R *ci,
+				 planner *plnr);
+
+typedef struct {
+     plan super;
+     hc2capply apply;
+} plan_hc2c;
+
+extern plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt, 
+			    hc2capply apply);
+
+#define MKPLAN_HC2C(type, adt, apply) \
+  (type *)X(mkplan_hc2c)(sizeof(type), adt, apply)
+
+struct hc2c_solver_s {
+     solver super;
+     INT r;
+
+     hc2c_mkinferior mkcldw;
+     hc2c_kind hc2ckind;
+};
+
+hc2c_solver *X(mksolver_hc2c)(size_t size, INT r,
+			      hc2c_kind hc2ckind,
+			      hc2c_mkinferior mkcldw);
+
+void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet, 
+			      const hc2c_desc *desc,
+			      hc2c_kind hc2ckind);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/dft-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/dft-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Compute the complex DFT by combining R2HC RDFTs on the real
+   and imaginary parts.   This could be useful for people just wanting
+   to link to the real codelets and not the complex ones.  It could
+   also even be faster than the complex algorithms for split (as opposed
+   to interleaved) real/imag complex data. */
+
+#include "rdft.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_dft super;
+     plan *cld;
+     INT ishift, oshift;
+     INT os;
+     INT n;
+} P;
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     INT n;
+
+     UNUSED(ii);
+
+     { /* transform vector of real & imag parts: */
+	  plan_rdft *cld = (plan_rdft *) ego->cld;
+	  cld->apply((plan *) cld, ri + ego->ishift, ro + ego->oshift);
+     }
+
+     n = ego->n;
+     if (n > 1) {
+	  INT i, os = ego->os;
+	  for (i = 1; i < (n + 1)/2; ++i) {
+	       E rop, iop, iom, rom;
+	       rop = ro[os * i];
+	       iop = io[os * i];
+	       rom = ro[os * (n - i)];
+	       iom = io[os * (n - i)];
+	       ro[os * i] = rop - iom;
+	       io[os * i] = iop + rom;
+	       ro[os * (n - i)] = rop + iom;
+	       io[os * (n - i)] = iop - rom;
+	  }
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dft-r2hc-%D%(%p%))", ego->n, ego->cld);
+}
+
+
+static int applicable0(const problem *p_)
+{
+     const problem_dft *p = (const problem_dft *) p_;
+     return ((p->sz->rnk == 1 && p->vecsz->rnk == 0)
+	     || (p->sz->rnk == 0 && FINITE_RNK(p->vecsz->rnk))
+	  );
+}
+
+static int splitp(R *r, R *i, INT n, INT s)
+{
+     return ((r > i ? (r - i) : (i - r)) >= n * (s > 0 ? s : 0-s));
+}
+
+static int applicable(const problem *p_, const planner *plnr)
+{
+     if (!applicable0(p_)) return 0;
+
+     {
+	  const problem_dft *p = (const problem_dft *) p_;
+
+	  /* rank-0 problems are always OK */
+	  if (p->sz->rnk == 0) return 1;
+
+	  /* this solver is ok for split arrays */
+	  if (p->sz->rnk == 1 &&
+	      splitp(p->ri, p->ii, p->sz->dims[0].n, p->sz->dims[0].is) &&
+	      splitp(p->ro, p->io, p->sz->dims[0].n, p->sz->dims[0].os))
+	       return 1;
+
+	  return !(NO_DFT_R2HCP(plnr));
+     }
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_dft *p;
+     plan *cld;
+     INT ishift = 0, oshift = 0;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego_);
+     if (!applicable(p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_dft *) p_;
+
+     {
+	  tensor *ri_vec = X(mktensor_1d)(2, p->ii - p->ri, p->io - p->ro);
+	  tensor *cld_vec = X(tensor_append)(ri_vec, p->vecsz);
+	  int i;
+	  for (i = 0; i < cld_vec->rnk; ++i) { /* make all istrides > 0 */
+	       if (cld_vec->dims[i].is < 0) {
+		    INT nm1 = cld_vec->dims[i].n - 1;
+		    ishift -= nm1 * (cld_vec->dims[i].is *= -1);
+		    oshift -= nm1 * (cld_vec->dims[i].os *= -1);
+	       }
+	  }
+	  cld = X(mkplan_d)(plnr, 
+			    X(mkproblem_rdft_1)(p->sz, cld_vec, 
+						p->ri + ishift, 
+						p->ro + oshift, R2HC));
+	  X(tensor_destroy2)(ri_vec, cld_vec);
+     }
+     if (!cld) return (plan *)0;
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     if (p->sz->rnk == 0) {
+	  pln->n = 1;
+	  pln->os = 0;
+     }
+     else {
+	  pln->n = p->sz->dims[0].n;
+	  pln->os = p->sz->dims[0].os;
+     }
+     pln->ishift = ishift;
+     pln->oshift = oshift;
+
+     pln->cld = cld;
+     
+     pln->super.super.ops = cld->ops;
+     pln->super.super.ops.other += 8 * ((pln->n - 1)/2);
+     pln->super.super.ops.add += 4 * ((pln->n - 1)/2);
+     pln->super.super.ops.other += 1; /* estimator hack for nop plans */
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(dft_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/dht-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/dht-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Solve a DHT problem (Discrete Hartley Transform) via post-processing
+   of an R2HC problem. */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     INT os;
+     INT n;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT os = ego->os;
+     INT i, n = ego->n;
+
+     {
+	  plan_rdft *cld = (plan_rdft *) ego->cld;
+	  cld->apply((plan *) cld, I, O);
+     }
+
+     for (i = 1; i < n - i; ++i) {
+	  E a, b;
+	  a = O[os * i];
+	  b = O[os * (n - i)];
+#if FFT_SIGN == -1
+	  O[os * i] = a - b;
+	  O[os * (n - i)] = a + b;
+#else
+	  O[os * i] = a + b;
+	  O[os * (n - i)] = a - b;
+#endif
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(dht-r2hc-%D%(%p%))", ego->n, ego->cld);
+}
+
+static int applicable0(const problem *p_, const planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     return (1
+	     && !NO_DHT_R2HCP(plnr)
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && p->kind[0] == DHT
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     UNUSED(ego);
+     return (!NO_SLOWP(plnr) && applicable0(p, plnr));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     /* NO_DHT_R2HC stops infinite loops with rdft-dht.c */
+     cld = X(mkplan_f_d)(plnr, 
+			 X(mkproblem_rdft_1)(p->sz, p->vecsz, 
+					     p->I, p->O, R2HC),
+			 NO_DHT_R2HC, 0, 0);
+     if (!cld) return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->n = p->sz->dims[0].n;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     
+     pln->super.super.ops = cld->ops;
+     pln->super.super.ops.other += 4 * ((pln->n - 1)/2);
+     pln->super.super.ops.add += 2 * ((pln->n - 1)/2);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(dht_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/dht-rader.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/dht-rader.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "rdft.h"
+
+/*
+ * Compute DHTs of prime sizes using Rader's trick: turn them
+ * into convolutions of size n - 1, which we then perform via a pair
+ * of FFTs.   (We can then do prime real FFTs via rdft-dht.c.)
+ *
+ * Optionally (determined by the "pad" field of the solver), we can
+ * perform the (cyclic) convolution by zero-padding to a size
+ * >= 2*(n-1) - 1.  This is advantageous if n-1 has large prime factors.
+ *
+ */
+
+typedef struct {
+     solver super;
+     int pad;
+} S;
+
+typedef struct {
+     plan_rdft super;
+
+     plan *cld1, *cld2;
+     R *omega;
+     INT n, npad, g, ginv;
+     INT is, os;
+     plan *cld_omega;
+} P;
+
+static rader_tl *omegas = 0;
+
+/***************************************************************************/
+
+/* If R2HC_ONLY_CONV is 1, we use a trick to perform the convolution
+   purely in terms of R2HC transforms, as opposed to R2HC followed by H2RC.
+   This requires a few more operations, but allows us to share the same
+   plan/codelets for both Rader children. */
+#define R2HC_ONLY_CONV 1
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT n = ego->n; /* prime */
+     INT npad = ego->npad; /* == n - 1 for unpadded Rader; always even */
+     INT is = ego->is, os;
+     INT k, gpower, g;
+     R *buf, *omega;
+     R r0;
+
+     buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
+
+     /* First, permute the input, storing in buf: */
+     g = ego->g; 
+     for (gpower = 1, k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
+	  buf[k] = I[gpower * is];
+     }
+     /* gpower == g^(n-1) mod n == 1 */;
+
+     A(n - 1 <= npad);
+     for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */
+	  buf[k] = 0;
+
+     os = ego->os;
+
+     /* compute RDFT of buf, storing in buf (i.e., in-place): */
+     {
+	    plan_rdft *cld = (plan_rdft *) ego->cld1;
+	    cld->apply((plan *) cld, buf, buf);
+     }
+
+     /* set output DC component: */
+     O[0] = (r0 = I[0]) + buf[0];
+
+     /* now, multiply by omega: */
+     omega = ego->omega;
+     buf[0] *= omega[0];
+     for (k = 1; k < npad/2; ++k) {
+	  E rB, iB, rW, iW, a, b;
+	  rW = omega[k];
+	  iW = omega[npad - k];
+	  rB = buf[k];
+	  iB = buf[npad - k];
+	  a = rW * rB - iW * iB;
+	  b = rW * iB + iW * rB;
+#if R2HC_ONLY_CONV
+	  buf[k] = a + b;
+	  buf[npad - k] = a - b;
+#else
+	  buf[k] = a;
+	  buf[npad - k] = b;
+#endif
+     }
+     /* Nyquist component: */
+     A(k + k == npad); /* since npad is even */
+     buf[k] *= omega[k];
+     
+     /* this will add input[0] to all of the outputs after the ifft */
+     buf[0] += r0;
+
+     /* inverse FFT: */
+     {
+	    plan_rdft *cld = (plan_rdft *) ego->cld2;
+	    cld->apply((plan *) cld, buf, buf);
+     }
+
+     /* do inverse permutation to unshuffle the output: */
+     A(gpower == 1);
+#if R2HC_ONLY_CONV
+     O[os] = buf[0];
+     gpower = g = ego->ginv;
+     A(npad == n - 1 || npad/2 >= n - 1);
+     if (npad == n - 1) {
+	  for (k = 1; k < npad/2; ++k, gpower = MULMOD(gpower, g, n)) {
+	       O[gpower * os] = buf[k] + buf[npad - k];
+	  }
+	  O[gpower * os] = buf[k];
+	  ++k, gpower = MULMOD(gpower, g, n);
+	  for (; k < npad; ++k, gpower = MULMOD(gpower, g, n)) {
+	       O[gpower * os] = buf[npad - k] - buf[k];
+	  }
+     }
+     else {
+	  for (k = 1; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
+	       O[gpower * os] = buf[k] + buf[npad - k];
+	  }
+     }
+#else
+     g = ego->ginv;
+     for (k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
+	  O[gpower * os] = buf[k];
+     }
+#endif
+     A(gpower == 1);
+
+     X(ifree)(buf);
+}
+
+static R *mkomega(enum wakefulness wakefulness,
+		  plan *p_, INT n, INT npad, INT ginv)
+{
+     plan_rdft *p = (plan_rdft *) p_;
+     R *omega;
+     INT i, gpower;
+     trigreal scale;
+     triggen *t;
+
+     if ((omega = X(rader_tl_find)(n, npad + 1, ginv, omegas))) 
+	  return omega;
+
+     omega = (R *)MALLOC(sizeof(R) * npad, TWIDDLES);
+
+     scale = npad; /* normalization for convolution */
+
+     t = X(mktriggen)(wakefulness, n);
+     for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) {
+	  trigreal w[2];
+	  t->cexpl(t, gpower, w);
+	  omega[i] = (w[0] + w[1]) / scale;
+     }
+     X(triggen_destroy)(t);
+     A(gpower == 1);
+
+     A(npad == n - 1 || npad >= 2*(n - 1) - 1);
+
+     for (; i < npad; ++i)
+	  omega[i] = K(0.0);
+     if (npad > n - 1)
+	  for (i = 1; i < n-1; ++i)
+	       omega[npad - i] = omega[n - 1 - i];
+
+     p->apply(p_, omega, omega);
+
+     X(rader_tl_insert)(n, npad + 1, ginv, omega, &omegas);
+     return omega;
+}
+
+static void free_omega(R *omega)
+{
+     X(rader_tl_delete)(omega, &omegas);
+}
+
+/***************************************************************************/
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld_omega, wakefulness);
+
+     switch (wakefulness) {
+	 case SLEEPY:
+	      free_omega(ego->omega);
+	      ego->omega = 0;
+	      break;
+	 default:
+	      ego->g = X(find_generator)(ego->n);
+	      ego->ginv = X(power_mod)(ego->g, ego->n - 2, ego->n);
+	      A(MULMOD(ego->g, ego->ginv, ego->n) == 1);
+
+	      A(!ego->omega);
+	      ego->omega = mkomega(wakefulness, 
+				   ego->cld_omega,ego->n,ego->npad,ego->ginv);
+	      break;
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld_omega);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+
+     p->print(p, "(dht-rader-%D/%D%ois=%oos=%(%p%)",
+              ego->n, ego->npad, ego->is, ego->os, ego->cld1);
+     if (ego->cld2 != ego->cld1)
+          p->print(p, "%(%p%)", ego->cld2);
+     if (ego->cld_omega != ego->cld1 && ego->cld_omega != ego->cld2)
+          p->print(p, "%(%p%)", ego->cld_omega);
+     p->putchr(p, ')');
+}
+
+static int applicable(const solver *ego, const problem *p_, const planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego);
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && p->kind[0] == DHT
+	     && X(is_prime)(p->sz->dims[0].n)
+	     && p->sz->dims[0].n > 2
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
+	     /* proclaim the solver SLOW if p-1 is not easily
+		factorizable.  Unlike in the complex case where
+		Bluestein can solve the problem, in the DHT case we
+		may have no other choice */
+	     && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
+	  );
+}
+
+static INT choose_transform_size(INT minsz)
+{
+     static const INT primes[] = { 2, 3, 5, 0 };
+     while (!X(factors_into)(minsz, primes) || minsz % 2)
+	  ++minsz;
+     return minsz;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     P *pln;
+     INT n, npad;
+     INT is, os;
+     plan *cld1 = (plan *) 0;
+     plan *cld2 = (plan *) 0;
+     plan *cld_omega = (plan *) 0;
+     R *buf = (R *) 0;
+     problem *cldp;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+	  return (plan *) 0;
+
+     n = p->sz->dims[0].n;
+     is = p->sz->dims[0].is;
+     os = p->sz->dims[0].os;
+
+     if (ego->pad)
+	  npad = choose_transform_size(2 * (n - 1) - 1);
+     else
+	  npad = n - 1;
+
+     /* initial allocation for the purpose of planning */
+     buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
+
+     cld1 = X(mkplan_f_d)(plnr, 
+			  X(mkproblem_rdft_1_d)(X(mktensor_1d)(npad, 1, 1),
+						X(mktensor_1d)(1, 0, 0),
+						buf, buf,
+						R2HC),
+			  NO_SLOW, 0, 0);
+     if (!cld1) goto nada;
+
+     cldp =
+          X(mkproblem_rdft_1_d)(
+               X(mktensor_1d)(npad, 1, 1),
+               X(mktensor_1d)(1, 0, 0),
+	       buf, buf, 
+#if R2HC_ONLY_CONV
+	       R2HC
+#else
+	       HC2R
+#endif
+	       );
+     if (!(cld2 = X(mkplan_f_d)(plnr, cldp, NO_SLOW, 0, 0)))
+	  goto nada;
+
+     /* plan for omega */
+     cld_omega = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_1_d)(
+				    X(mktensor_1d)(npad, 1, 1),
+				    X(mktensor_1d)(1, 0, 0),
+				    buf, buf, R2HC),
+			       NO_SLOW, ESTIMATE, 0);
+     if (!cld_omega) goto nada;
+
+     /* deallocate buffers; let awake() or apply() allocate them for real */
+     X(ifree)(buf);
+     buf = 0;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld_omega = cld_omega;
+     pln->omega = 0;
+     pln->n = n;
+     pln->npad = npad;
+     pln->is = is;
+     pln->os = os;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * ego->pad;
+     pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * ego->pad;
+     pln->super.super.ops.mul += (npad/2-1)*4 + 2 + ego->pad;
+#if R2HC_ONLY_CONV
+     pln->super.super.ops.other += n-2 - ego->pad;
+     pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - ego->pad;
+#endif
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(buf);
+     X(plan_destroy_internal)(cld_omega);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return 0;
+}
+
+/* constructors */
+
+static solver *mksolver(int pad)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->pad = pad;
+     return &(slv->super);
+}
+
+void X(dht_rader_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver(0));
+     REGISTER_SOLVER(p, mksolver(1));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/direct-r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/direct-r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* direct RDFT solver, using r2c codelets */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     const kr2c_desc *desc;
+     kr2c k;
+     int bufferedp;
+} S;
+
+typedef struct {
+     plan_rdft super;
+
+     stride rs, csr, csi;
+     stride brs, bcsr, bcsi;
+     INT n, vl, rs0, ivs, ovs, ioffset, bioffset;
+     kr2c k;
+     const S *slv;
+} P;
+
+/*************************************************************
+  Nonbuffered code
+ *************************************************************/
+static void apply_r2hc(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(I, I + ego->rs0, O, O + ego->ioffset, 
+	    ego->rs, ego->csr, ego->csi,
+	    ego->vl, ego->ivs, ego->ovs);
+}
+
+static void apply_hc2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(O, O + ego->rs0, I, I + ego->ioffset, 
+	    ego->rs, ego->csr, ego->csi,
+	    ego->vl, ego->ivs, ego->ovs);
+}
+
+/*************************************************************
+  Buffered code
+ *************************************************************/
+/* should not be 2^k to avoid associativity conflicts */
+static INT compute_batchsize(INT radix)
+{
+     /* round up to multiple of 4 */
+     radix += 3;
+     radix &= -4;
+
+     return (radix + 2);
+}
+
+static void dobatch_r2hc(const P *ego, R *I, R *O, R *buf, INT batchsz)
+{
+     X(cpy2d_ci)(I, buf,
+		 ego->n, ego->rs0, WS(ego->bcsr /* hack */, 1),
+		 batchsz, ego->ivs, 1, 1);
+
+     if (IABS(WS(ego->csr, 1)) < IABS(ego->ovs)) {
+	  /* transform directly to output */
+	  ego->k(buf, buf + WS(ego->bcsr /* hack */, 1), 
+		 O, O + ego->ioffset, 
+		 ego->brs, ego->csr, ego->csi,
+		 batchsz, 1, ego->ovs);
+     } else {
+	  /* transform to buffer and copy back */
+	  ego->k(buf, buf + WS(ego->bcsr /* hack */, 1), 
+		 buf, buf + ego->bioffset, 
+		 ego->brs, ego->bcsr, ego->bcsi,
+		 batchsz, 1, 1);
+	  X(cpy2d_co)(buf, O,
+		      ego->n, WS(ego->bcsr, 1), WS(ego->csr, 1),  
+		      batchsz, 1, ego->ovs, 1);
+     }
+}
+
+static void dobatch_hc2r(const P *ego, R *I, R *O, R *buf, INT batchsz)
+{
+     if (IABS(WS(ego->csr, 1)) < IABS(ego->ivs)) {
+	  /* transform directly from input */
+	  ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
+		 I, I + ego->ioffset, 
+		 ego->brs, ego->csr, ego->csi,
+		 batchsz, ego->ivs, 1);
+     } else {
+	  /* copy into buffer and transform in place */
+	  X(cpy2d_ci)(I, buf,
+		      ego->n, WS(ego->csr, 1), WS(ego->bcsr, 1),
+		      batchsz, ego->ivs, 1, 1);
+	  ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
+		 buf, buf + ego->bioffset, 
+		 ego->brs, ego->bcsr, ego->bcsi,
+		 batchsz, 1, 1);
+     }
+     X(cpy2d_co)(buf, O,
+		 ego->n, WS(ego->bcsr /* hack */, 1), ego->rs0,
+		 batchsz, 1, ego->ovs, 1);
+}
+
+static void iterate(const P *ego, R *I, R *O,
+		    void (*dobatch)(const P *ego, R *I, R *O, 
+				    R *buf, INT batchsz))
+{
+     R *buf;
+     INT vl = ego->vl;
+     INT n = ego->n;
+     INT i;
+     INT batchsz = compute_batchsize(n);
+     size_t bufsz = n * batchsz * sizeof(R);
+
+     BUF_ALLOC(R *, buf, bufsz);
+
+     for (i = 0; i < vl - batchsz; i += batchsz) {
+	  dobatch(ego, I, O, buf, batchsz);
+	  I += batchsz * ego->ivs;
+	  O += batchsz * ego->ovs;
+     }
+     dobatch(ego, I, O, buf, vl - i);
+
+     BUF_FREE(buf, bufsz);
+}
+
+static void apply_buf_r2hc(const plan *ego_, R *I, R *O)
+{
+     iterate((const P *) ego_, I, O, dobatch_r2hc);
+}
+
+static void apply_buf_hc2r(const plan *ego_, R *I, R *O)
+{
+     iterate((const P *) ego_, I, O, dobatch_hc2r);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->rs);
+     X(stride_destroy)(ego->csr);
+     X(stride_destroy)(ego->csi);
+     X(stride_destroy)(ego->brs);
+     X(stride_destroy)(ego->bcsr);
+     X(stride_destroy)(ego->bcsi);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+
+     if (ego->slv->bufferedp)
+	  p->print(p, "(rdft-%s-directbuf/%D-r2c-%D%v \"%s\")", 
+		   X(rdft_kind_str)(s->desc->genus->kind), 
+		   /* hack */ WS(ego->bcsr, 1), ego->n, 
+		   ego->vl, s->desc->nam);
+
+     else 
+	  p->print(p, "(rdft-%s-direct-r2c-%D%v \"%s\")", 
+		   X(rdft_kind_str)(s->desc->genus->kind), ego->n, 
+		   ego->vl, s->desc->nam);
+}
+
+static INT ioffset(rdft_kind kind, INT sz, INT s)
+{
+     return(s * ((kind == R2HC || kind == HC2R) ? sz : (sz - 1)));
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const S *ego = (const S *) ego_;
+     const kr2c_desc *desc = ego->desc;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     INT vl, ivs, ovs;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk <= 1
+	  && p->sz->dims[0].n == desc->n
+	  && p->kind[0] == desc->genus->kind
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->I != p->O
+
+	      /* computing one transform */
+	      || vl == 1
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+	       )
+	  );
+}
+
+static int applicable_buf(const solver *ego_, const problem *p_)
+{
+     const S *ego = (const S *) ego_;
+     const kr2c_desc *desc = ego->desc;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     INT vl, ivs, ovs, batchsz;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk <= 1
+	  && p->sz->dims[0].n == desc->n
+	  && p->kind[0] == desc->genus->kind
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  && (batchsz = compute_batchsize(desc->n), 1)
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->I != p->O
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+
+	      /* can do it if the problem fits in the buffer, no matter
+		 what the strides are */
+	      || vl <= batchsz
+	       )
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const problem_rdft *p;
+     iodim *d;
+     INT rs, cs, b, n;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), X(null_awake), print, destroy
+     };
+
+     UNUSED(plnr);
+
+     if (ego->bufferedp) {
+	  if (!applicable_buf(ego_, p_))
+	       return (plan *)0;
+     } else {
+	  if (!applicable(ego_, p_))
+	       return (plan *)0;
+     }
+
+     p = (const problem_rdft *) p_;
+
+     if (R2HC_KINDP(p->kind[0])) {
+	  rs = p->sz->dims[0].is; cs = p->sz->dims[0].os;
+	  pln = MKPLAN_RDFT(P, &padt, 
+			    ego->bufferedp ? apply_buf_r2hc : apply_r2hc);
+     } else {
+	  rs = p->sz->dims[0].os; cs = p->sz->dims[0].is;
+	  pln = MKPLAN_RDFT(P, &padt, 
+			    ego->bufferedp ? apply_buf_hc2r : apply_hc2r);
+     }
+
+     d = p->sz->dims;
+     n = d[0].n;
+
+     pln->k = ego->k;
+     pln->n = n;
+
+     pln->rs0 = rs;
+     pln->rs = X(mkstride)(n, 2 * rs);
+     pln->csr = X(mkstride)(n, cs);
+     pln->csi = X(mkstride)(n, -cs);
+     pln->ioffset = ioffset(p->kind[0], n, cs);
+
+     b = compute_batchsize(n);
+     pln->brs = X(mkstride)(n, 2 * b);
+     pln->bcsr = X(mkstride)(n, b);
+     pln->bcsi = X(mkstride)(n, -b);
+     pln->bioffset = ioffset(p->kind[0], n, b);
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+
+     pln->slv = ego;
+     X(ops_zero)(&pln->super.super.ops);
+
+     X(ops_madd2)(pln->vl / ego->desc->genus->vl,
+		  &ego->desc->ops,
+		  &pln->super.super.ops);
+
+     if (ego->bufferedp) 
+	  pln->super.super.ops.other += 2 * n * pln->vl;
+
+     pln->super.super.could_prune_now_p = !ego->bufferedp;
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(kr2c k, const kr2c_desc *desc, int bufferedp)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->k = k;
+     slv->desc = desc;
+     slv->bufferedp = bufferedp;
+     return &(slv->super);
+}
+
+solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc)
+{
+     return mksolver(k, desc, 0);
+}
+
+solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc)
+{
+     return mksolver(k, desc, 1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/direct-r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/direct-r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* direct RDFT solver, using r2r codelets */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     const kr2r_desc *desc;
+     kr2r k;
+} S;
+
+typedef struct {
+     plan_rdft super;
+
+     INT vl, ivs, ovs;
+     stride is, os;
+     kr2r k;
+     const S *slv;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(I, O, ego->is, ego->os, ego->vl, ego->ivs, ego->ovs);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->is);
+     X(stride_destroy)(ego->os);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+
+     p->print(p, "(rdft-%s-direct-r2r-%D%v \"%s\")", 
+	      X(rdft_kind_str)(s->desc->kind), s->desc->n,
+	      ego->vl, s->desc->nam);
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     INT vl;
+     INT ivs, ovs;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk <= 1
+	  && p->sz->dims[0].n == ego->desc->n
+	  && p->kind[0] == ego->desc->kind
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->I != p->O
+
+	      /* computing one transform */
+	      || vl == 1
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(tensor_inplace_strides2)(p->sz, p->vecsz)
+	       )
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const problem_rdft *p;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), X(null_awake), print, destroy
+     };
+
+     UNUSED(plnr);
+
+     if (!applicable(ego_, p_))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     d = p->sz->dims;
+
+     pln->k = ego->k;
+
+     pln->is = X(mkstride)(d->n, d->is);
+     pln->os = X(mkstride)(d->n, d->os);
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+
+     pln->slv = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl / ego->desc->genus->vl,
+		  &ego->desc->ops,
+		  &pln->super.super.ops);
+
+     pln->super.super.could_prune_now_p = 1;
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->k = k;
+     slv->desc = desc;
+     return &(slv->super);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/direct2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/direct2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* direct RDFT2 R2HC/HC2R solver, if we have a codelet */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     const kr2c_desc *desc;
+     kr2c k;
+} S;
+
+typedef struct {
+     plan_rdft2 super;
+
+     stride rs, cs;
+     INT vl;
+     INT ivs, ovs;
+     kr2c k;
+     const S *slv;
+     INT ilast;
+} P;
+
+static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(r0, r1, cr, ci,
+	    ego->rs, ego->cs, ego->cs,
+	    ego->vl, ego->ivs, ego->ovs);
+}
+
+static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl, ovs = ego->ovs;
+     ASSERT_ALIGNED_DOUBLE;
+     ego->k(r0, r1, cr, ci,
+	    ego->rs, ego->cs, ego->cs,
+	    vl, ego->ivs, ovs);
+     for (i = 0; i < vl; ++i, ci += ovs)
+	  ci[0] = ci[ego->ilast] = 0;
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(stride_destroy)(ego->rs);
+     X(stride_destroy)(ego->cs);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+
+     p->print(p, "(rdft2-%s-direct-%D%v \"%s\")", 
+	      X(rdft_kind_str)(s->desc->genus->kind), s->desc->n, 
+	      ego->vl, s->desc->nam);
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const S *ego = (const S *) ego_;
+     const kr2c_desc *desc = ego->desc;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     INT vl;
+     INT ivs, ovs;
+
+     return (
+	  1
+	  && p->sz->rnk == 1
+	  && p->vecsz->rnk <= 1
+	  && p->sz->dims[0].n == desc->n
+	  && p->kind == desc->genus->kind
+
+	  /* check strides etc */
+	  && X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
+
+	  && (0
+	      /* can operate out-of-place */
+	      || p->r0 != p->cr
+
+	      /*
+	       * can compute one transform in-place, no matter
+	       * what the strides are.
+	       */
+	      || p->vecsz->rnk == 0
+
+	      /* can operate in-place as long as strides are the same */
+	      || X(rdft2_inplace_strides)(p, RNK_MINFTY)
+	       )
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const problem_rdft2 *p;
+     iodim *d;
+     int r2hc_kindp;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), X(null_awake), print, destroy
+     };
+
+     UNUSED(plnr);
+
+     if (!applicable(ego_, p_))
+          return (plan *)0;
+
+     p = (const problem_rdft2 *) p_;
+
+     r2hc_kindp = R2HC_KINDP(p->kind);
+     A(r2hc_kindp || HC2R_KINDP(p->kind));
+
+     pln = MKPLAN_RDFT2(P, &padt, p->kind == R2HC ? apply_r2hc : apply);
+
+     d = p->sz->dims;
+
+     pln->k = ego->k;
+
+     pln->rs = X(mkstride)(d->n, r2hc_kindp ? d->is : d->os);
+     pln->cs = X(mkstride)(d->n, r2hc_kindp ? d->os : d->is);
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+
+     /* Nyquist freq., if any */
+     pln->ilast = (d->n % 2) ? 0 : (d->n/2) * d->os;
+
+     pln->slv = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl / ego->desc->genus->vl,
+		  &ego->desc->ops,
+		  &pln->super.super.ops);
+     if (p->kind == R2HC)
+	  pln->super.super.ops.other += 2 * pln->vl; /* + 2 stores */
+
+     pln->super.super.could_prune_now_p = 1;
+     return &(pln->super.super);
+}
+
+/* constructor */
+solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->k = k;
+     slv->desc = desc;
+     return &(slv->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/generic.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/generic.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     rdft_kind kind;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     twid *td;
+     INT n, is, os;
+     rdft_kind kind;
+} P;
+
+/***************************************************************************/
+
+static void cdot_r2hc(INT n, const E *x, const R *w, R *or0, R *oi1)
+{
+     INT i;
+
+     E rr = x[0], ri = 0;
+     x += 1;
+     for (i = 1; i + i < n; ++i) {
+	  rr += x[0] * w[0];
+	  ri += x[1] * w[1];
+	  x += 2; w += 2;
+     }
+     *or0 = rr;
+     *oi1 = ri;
+}
+
+static void hartley_r2hc(INT n, const R *xr, INT xs, E *o, R *pr)
+{
+     INT i;
+     E sr;
+     o[0] = sr = xr[0]; o += 1;
+     for (i = 1; i + i < n; ++i) {
+	  R a, b;
+	  a = xr[i * xs];
+	  b =  xr[(n - i) * xs];
+	  sr += (o[0] = a + b);
+#if FFT_SIGN == -1
+	  o[1] = b - a;
+#else
+	  o[1] = a - b;
+#endif
+	  o += 2;
+     }
+     *pr = sr;
+}
+		    
+static void apply_r2hc(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT i;
+     INT n = ego->n, is = ego->is, os = ego->os;
+     const R *W = ego->td->W;
+     E *buf;
+     size_t bufsz = n * sizeof(E);
+
+     BUF_ALLOC(E *, buf, bufsz);
+     hartley_r2hc(n, I, is, buf, O);
+
+     for (i = 1; i + i < n; ++i) {
+	  cdot_r2hc(n, buf, W, O + i * os, O + (n - i) * os);
+	  W += n - 1;
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+
+static void cdot_hc2r(INT n, const E *x, const R *w, R *or0, R *or1)
+{
+     INT i;
+
+     E rr = x[0], ii = 0; 
+     x += 1;
+     for (i = 1; i + i < n; ++i) {
+	  rr += x[0] * w[0];
+	  ii += x[1] * w[1];
+	  x += 2; w += 2;
+     }
+#if FFT_SIGN == -1
+     *or0 = rr - ii;
+     *or1 = rr + ii;
+#else
+     *or0 = rr + ii;
+     *or1 = rr - ii;
+#endif
+}
+
+static void hartley_hc2r(INT n, const R *x, INT xs, E *o, R *pr)
+{
+     INT i;
+     E sr;
+
+     o[0] = sr = x[0]; o += 1;
+     for (i = 1; i + i < n; ++i) {
+	  sr += (o[0] = x[i * xs] + x[i * xs]);
+	  o[1] = x[(n - i) * xs] + x[(n - i) * xs];
+	  o += 2;
+     }
+     *pr = sr;
+}
+
+static void apply_hc2r(const plan *ego_, R *I, R *O)		    
+{
+     const P *ego = (const P *) ego_;
+     INT i;
+     INT n = ego->n, is = ego->is, os = ego->os;
+     const R *W = ego->td->W;
+     E *buf;
+     size_t bufsz = n * sizeof(E);
+
+     BUF_ALLOC(E *, buf, bufsz);
+     hartley_hc2r(n, I, is, buf, O);
+
+     for (i = 1; i + i < n; ++i) {
+	  cdot_hc2r(n, buf, W, O + i * os, O + (n - i) * os);
+	  W += n - 1;
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+
+/***************************************************************************/
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr half_tw[] = {
+	  { TW_HALF, 1, 0 },
+	  { TW_NEXT, 1, 0 }
+     };
+
+     X(twiddle_awake)(wakefulness, &ego->td, half_tw, ego->n, ego->n,
+		      (ego->n - 1) / 2);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+
+     p->print(p, "(rdft-generic-%s-%D)", 
+	      ego->kind == R2HC ? "r2hc" : "hc2r", 
+	      ego->n);
+}
+
+static int applicable(const S *ego, const problem *p_, 
+		      const planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && (p->sz->dims[0].n % 2) == 1 
+	     && CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
+	     && X(is_prime)(p->sz->dims[0].n)
+	     && p->kind[0] == ego->kind
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *)ego_;
+     const problem_rdft *p;
+     P *pln;
+     INT n;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, X(plan_null_destroy)
+     };
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+     pln = MKPLAN_RDFT(P, &padt, 
+		       R2HC_KINDP(p->kind[0]) ? apply_r2hc : apply_hc2r);
+
+     pln->n = n = p->sz->dims[0].n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->td = 0;
+     pln->kind = ego->kind;
+
+     pln->super.super.ops.add = (n-1) * 2.5;
+     pln->super.super.ops.mul = 0;
+     pln->super.super.ops.fma = 0.5 * (n-1) * (n-1) ;
+#if 0 /* these are nice pipelined sequential loads and should cost nothing */
+     pln->super.super.ops.other = (n-1)*(2 + 1 + (n-1));  /* approximate */
+#endif
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(rdft_kind kind)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->kind = kind;
+     return &(slv->super);
+}
+
+void X(rdft_generic_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver(R2HC));
+     REGISTER_SOLVER(p, mksolver(HC2R));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/hc2hc-direct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/hc2hc-direct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "hc2hc.h"
+
+typedef struct {
+     hc2hc_solver super;
+     const hc2hc_desc *desc;
+     khc2hc k;
+     int bufferedp;
+} S;
+
+typedef struct {
+     plan_hc2hc super;
+     khc2hc k;
+     plan *cld0, *cldm; /* children for 0th and middle butterflies */
+     INT r, m, v;
+     INT ms, vs, mb, me;
+     stride rs, brs;
+     twid *td;
+     const S *slv;
+} P;
+
+/*************************************************************
+  Nonbuffered code
+*************************************************************/
+static void apply(const plan *ego_, R *IO)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
+     plan_rdft *cldm = (plan_rdft *) ego->cldm;
+     INT i, m = ego->m, v = ego->v;
+     INT mb = ego->mb, me = ego->me;
+     INT ms = ego->ms, vs = ego->vs;
+
+     for (i = 0; i < v; ++i, IO += vs) {
+	  cld0->apply((plan *) cld0, IO, IO);
+	  ego->k(IO + ms * mb, IO + (m - mb) * ms, 
+		 ego->td->W, ego->rs, mb, me, ms);
+	  cldm->apply((plan *) cldm, IO + (m/2) * ms, IO + (m/2) * ms);
+     }
+}
+
+/*************************************************************
+  Buffered code
+*************************************************************/
+
+/* should not be 2^k to avoid associativity conflicts */
+static INT compute_batchsize(INT radix)
+{
+     /* round up to multiple of 4 */
+     radix += 3;
+     radix &= -4;
+
+     return (radix + 2);
+}
+
+static void dobatch(const P *ego, R *IOp, R *IOm,
+		    INT mb, INT me, R *bufp)
+{
+     INT b = WS(ego->brs, 1);
+     INT rs = WS(ego->rs, 1);
+     INT r = ego->r;
+     INT ms = ego->ms;
+     R *bufm = bufp + b - 1;
+
+     X(cpy2d_ci)(IOp + mb * ms, bufp, r, rs, b, me - mb,  ms,  1, 1);
+     X(cpy2d_ci)(IOm - mb * ms, bufm, r, rs, b, me - mb, -ms, -1, 1);
+
+     ego->k(bufp, bufm, ego->td->W, ego->brs, mb, me, 1);
+
+     X(cpy2d_co)(bufp, IOp + mb * ms, r, b, rs, me - mb,  1,  ms, 1);
+     X(cpy2d_co)(bufm, IOm - mb * ms, r, b, rs, me - mb, -1, -ms, 1);
+}
+
+static void apply_buf(const plan *ego_, R *IO)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
+     plan_rdft *cldm = (plan_rdft *) ego->cldm;
+     INT i, j, m = ego->m, v = ego->v, r = ego->r;
+     INT mb = ego->mb, me = ego->me, ms = ego->ms;
+     INT batchsz = compute_batchsize(r);
+     R *buf;
+     size_t bufsz = r * batchsz * 2 * sizeof(R);
+
+     BUF_ALLOC(R *, buf, bufsz);
+
+     for (i = 0; i < v; ++i, IO += ego->vs) {
+	  R *IOp = IO;
+	  R *IOm = IO + m * ms;
+
+	  cld0->apply((plan *) cld0, IO, IO);
+
+	  for (j = mb; j + batchsz < me; j += batchsz) 	       
+	       dobatch(ego, IOp, IOm, j, j + batchsz, buf);
+
+	  dobatch(ego, IOp, IOm, j, me, buf);
+
+	  cldm->apply((plan *) cldm, IO + ms * (m/2), IO + ms * (m/2));
+     }
+
+     BUF_FREE(buf, bufsz);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld0, wakefulness);
+     X(plan_awake)(ego->cldm, wakefulness);
+     X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw, 
+		      ego->r * ego->m, ego->r, (ego->m - 1) / 2);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld0);
+     X(plan_destroy_internal)(ego->cldm);
+     X(stride_destroy)(ego->rs);
+     X(stride_destroy)(ego->brs);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *slv = ego->slv;
+     const hc2hc_desc *e = slv->desc;
+     INT batchsz = compute_batchsize(ego->r);
+
+     if (slv->bufferedp)
+	  p->print(p, "(hc2hc-directbuf/%D-%D/%D%v \"%s\"%(%p%)%(%p%))",
+		   batchsz, ego->r, X(twiddle_length)(ego->r, e->tw), 
+		   ego->v, e->nam, ego->cld0, ego->cldm);
+     else
+	  p->print(p, "(hc2hc-direct-%D/%D%v \"%s\"%(%p%)%(%p%))",
+		   ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam,
+		   ego->cld0, ego->cldm);
+}
+
+static int applicable0(const S *ego, rdft_kind kind, INT r)
+{
+     const hc2hc_desc *e = ego->desc;
+
+     return (1
+	     && r == e->radix
+	     && kind == e->genus->kind
+	  );
+}
+
+static int applicable(const S *ego, rdft_kind kind, INT r, INT m, INT v,
+		      const planner *plnr)
+{
+     if (!applicable0(ego, kind, r))
+          return 0;
+
+     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
+				       v, m * r, r)) 
+	  return 0;
+
+     return 1;
+}
+
+#define CLDMP(m, mstart, mcount) (2 * ((mstart) + (mcount)) == (m) + 2)
+#define CLD0P(mstart) ((mstart) == 0)
+
+static plan *mkcldw(const hc2hc_solver *ego_, 
+		    rdft_kind kind, INT r, INT m, INT ms, INT v, INT vs, 
+		    INT mstart, INT mcount,
+		    R *IO, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     const hc2hc_desc *e = ego->desc;
+     plan *cld0 = 0, *cldm = 0;
+     INT imid = (m / 2) * ms;
+     INT rs = m * ms;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     if (!applicable(ego, kind, r, m, v, plnr))
+          return (plan *)0;
+
+     cld0 = X(mkplan_d)(
+	  plnr, 
+	  X(mkproblem_rdft_1_d)((CLD0P(mstart) ?
+				 X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
+				X(mktensor_0d)(),
+				TAINT(IO, vs), TAINT(IO, vs), 
+				kind));
+     if (!cld0) goto nada;
+
+     cldm = X(mkplan_d)(
+	  plnr, 
+	  X(mkproblem_rdft_1_d)((CLDMP(m, mstart, mcount) ?
+				 X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
+				X(mktensor_0d)(),
+				TAINT(IO + imid, vs), TAINT(IO + imid, vs),
+				kind == R2HC ? R2HCII : HC2RIII));
+     if (!cldm) goto nada;
+	  
+     pln = MKPLAN_HC2HC(P, &padt, ego->bufferedp ? apply_buf : apply);
+
+     pln->k = ego->k;
+     pln->td = 0;
+     pln->r = r; pln->rs = X(mkstride)(r, rs);
+     pln->m = m; pln->ms = ms;
+     pln->v = v; pln->vs = vs;
+     pln->slv = ego;
+     pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
+     pln->cld0 = cld0;
+     pln->cldm = cldm;
+     pln->mb = mstart + CLD0P(mstart);
+     pln->me = mstart + mcount - CLDMP(m, mstart, mcount);
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(v * ((pln->me - pln->mb) / e->genus->vl),
+		  &e->ops, &pln->super.super.ops);
+     X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
+     X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
+
+     if (ego->bufferedp) 
+	  pln->super.super.ops.other += 4 * r * (pln->me - pln->mb) * v;
+
+     pln->super.super.could_prune_now_p =
+	  (!ego->bufferedp && r >= 5 && r < 64 && m >= r);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld0);
+     X(plan_destroy_internal)(cldm);
+     return 0;
+}
+
+static void regone(planner *plnr, khc2hc codelet, const hc2hc_desc *desc,
+		   int bufferedp)
+{
+     S *slv = (S *)X(mksolver_hc2hc)(sizeof(S), desc->radix, mkcldw);
+     slv->k = codelet;
+     slv->desc = desc;
+     slv->bufferedp = bufferedp;
+     REGISTER_SOLVER(plnr, &(slv->super.super));
+     if (X(mksolver_hc2hc_hook)) {
+	  slv = (S *)X(mksolver_hc2hc_hook)(sizeof(S), desc->radix, mkcldw);
+	  slv->k = codelet;
+	  slv->desc = desc;
+	  slv->bufferedp = bufferedp;
+	  REGISTER_SOLVER(plnr, &(slv->super.super));
+     }
+}
+
+void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
+			       const hc2hc_desc *desc)
+{
+     regone(plnr, codelet, desc, /* bufferedp */0);
+     regone(plnr, codelet, desc, /* bufferedp */1);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/hc2hc-generic.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/hc2hc-generic.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* express a hc2hc problem in terms of rdft + multiplication by
+   twiddle factors */
+
+#include "hc2hc.h"
+
+typedef hc2hc_solver S;
+
+typedef struct {
+     plan_hc2hc super;
+
+     INT r, m, s, vl, vs, mstart1, mcount1;
+     plan *cld0;
+     plan *cld;
+     twid *td;
+} P;
+
+
+/**************************************************************/
+static void mktwiddle(P *ego, enum wakefulness wakefulness)
+{
+     static const tw_instr tw[] = { { TW_HALF, 0, 0 }, { TW_NEXT, 1, 0 } };
+
+     /* note that R and M are swapped, to allow for sequential
+	access both to data and twiddles */
+     X(twiddle_awake)(wakefulness, &ego->td, tw, 
+		      ego->r * ego->m, ego->m, ego->r);
+}
+
+static void bytwiddle(const P *ego, R *IO, R sign)
+{
+     INT i, j, k;
+     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
+     INT ms = m * s;
+     INT mstart1 = ego->mstart1, mcount1 = ego->mcount1;
+     INT wrem = 2 * ((m-1)/2 - mcount1);
+
+     for (i = 0; i < vl; ++i, IO += vs) {
+	  const R *W = ego->td->W;
+
+	  A(m % 2 == 1);
+	  for (k = 1, W += (m - 1) + 2*(mstart1-1); k < r; ++k) {
+	       /* pr := IO + (j + mstart1) * s + k * ms */
+	       R *pr = IO + mstart1 * s + k * ms;
+
+	       /* pi := IO + (m - j - mstart1) * s + k * ms */
+	       R *pi = IO - mstart1 * s + (k + 1) * ms;
+
+	       for (j = 0; j < mcount1; ++j, pr += s, pi -= s) {
+		    E xr = *pr;
+		    E xi = *pi;
+		    E wr = W[0];
+		    E wi = sign * W[1];
+		    *pr = xr * wr - xi * wi;
+		    *pi = xi * wr + xr * wi;
+		    W += 2;
+	       }
+	       W += wrem;
+	  }
+     }
+}
+
+static void swapri(R *IO, INT r, INT m, INT s, INT jstart, INT jend)
+{
+     INT k;
+     INT ms = m * s;
+     INT js = jstart * s;
+     for (k = 0; k + k < r; ++k) {
+	  /* pr := IO + (m - j) * s + k * ms */
+	  R *pr = IO + (k + 1) * ms - js;
+	  /* pi := IO + (m - j) * s + (r - 1 - k) * ms */
+	  R *pi = IO + (r - k) * ms - js;
+	  INT j;
+	  for (j = jstart; j < jend; j += 1, pr -= s, pi -= s) {
+	       R t = *pr;
+	       *pr = *pi;
+	       *pi = t;
+	  }
+     }
+}
+
+static void reorder_dit(const P *ego, R *IO)
+{
+     INT i, k;
+     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
+     INT ms = m * s;
+     INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;
+
+     for (i = 0; i < vl; ++i, IO += vs) {
+	  for (k = 1; k + k < r; ++k) {
+	       R *p0 = IO + k * ms;
+	       R *p1 = IO + (r - k) * ms;
+	       INT j;
+
+	       for (j = mstart1; j < mend1; ++j) {
+		    E rp, ip, im, rm;
+		    rp = p0[j * s];
+		    im = p1[ms - j * s];
+		    rm = p1[j * s];
+		    ip = p0[ms - j * s];
+		    p0[j * s] = rp - im;
+		    p1[ms - j * s] = rp + im;
+		    p1[j * s] = rm - ip;
+		    p0[ms - j * s] = ip + rm;
+	       }
+	  }
+
+	  swapri(IO, r, m, s, mstart1, mend1);
+     }
+}
+
+static void reorder_dif(const P *ego, R *IO)
+{
+     INT i, k;
+     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
+     INT ms = m * s;
+     INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;
+
+     for (i = 0; i < vl; ++i, IO += vs) {
+	  swapri(IO, r, m, s, mstart1, mend1);
+
+	  for (k = 1; k + k < r; ++k) {
+	       R *p0 = IO + k * ms;
+	       R *p1 = IO + (r - k) * ms;
+	       const R half = K(0.5);
+	       INT j;
+
+	       for (j = mstart1; j < mend1; ++j) {
+		    E rp, ip, im, rm;
+		    rp = half * p0[j * s];
+		    im = half * p1[ms - j * s];
+		    rm = half * p1[j * s];
+		    ip = half * p0[ms - j * s];
+		    p0[j * s] = rp + im;
+		    p1[ms - j * s] = im - rp;
+		    p1[j * s] = rm + ip;
+		    p0[ms - j * s] = ip - rm;
+	       }
+	  }
+     }
+}
+
+static int applicable(rdft_kind kind, INT r, INT m, const planner *plnr)
+{
+     return (1 
+	     && (kind == R2HC || kind == HC2R)
+	     && (m % 2)
+	     && (r % 2)
+	     && !NO_SLOWP(plnr)
+	  );
+}
+
+/**************************************************************/
+
+static void apply_dit(const plan *ego_, R *IO)
+{
+     const P *ego = (const P *) ego_;
+     INT start;
+     plan_rdft *cld, *cld0;
+
+     bytwiddle(ego, IO, K(-1.0));
+
+     cld0 = (plan_rdft *) ego->cld0;
+     cld0->apply(ego->cld0, IO, IO);
+
+     start = ego->mstart1 * ego->s;
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, IO + start, IO + start);
+
+     reorder_dit(ego, IO);
+}
+
+static void apply_dif(const plan *ego_, R *IO)
+{
+     const P *ego = (const P *) ego_;
+     INT start;
+     plan_rdft *cld, *cld0;
+
+     reorder_dif(ego, IO);
+
+     cld0 = (plan_rdft *) ego->cld0;
+     cld0->apply(ego->cld0, IO, IO);
+
+     start = ego->mstart1 * ego->s;
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, IO + start, IO + start);
+
+     bytwiddle(ego, IO, K(1.0));
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld0, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     mktwiddle(ego, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cld0);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(hc2hc-generic-%s-%D-%D%v%(%p%)%(%p%))", 
+	      ego->super.apply == apply_dit ? "dit" : "dif",
+	      ego->r, ego->m, ego->vl, ego->cld0, ego->cld);
+}
+
+static plan *mkcldw(const hc2hc_solver *ego_, 
+		    rdft_kind kind, INT r, INT m, INT s, INT vl, INT vs, 
+		    INT mstart, INT mcount,
+		    R *IO, planner *plnr)
+{
+     P *pln;
+     plan *cld0 = 0, *cld = 0;
+     INT mstart1, mcount1, mstride;
+
+     static const plan_adt padt = {
+	  0, awake, print, destroy
+     };
+
+     UNUSED(ego_);
+
+     A(mstart >= 0 && mcount > 0 && mstart + mcount <= (m+2)/2);
+
+     if (!applicable(kind, r, m, plnr))
+          return (plan *)0;
+
+     A(m % 2);
+     mstart1 = mstart + (mstart == 0);
+     mcount1 = mcount - (mstart == 0);
+     mstride = m - (mstart + mcount - 1) - mstart1;
+
+     /* 0th (DC) transform (vl of these), if mstart == 0 */
+     cld0 = X(mkplan_d)(plnr, 
+			X(mkproblem_rdft_1_d)(
+			     mstart == 0 ? X(mktensor_1d)(r, m * s, m * s)
+			     : X(mktensor_0d)(),
+			     X(mktensor_1d)(vl, vs, vs),
+			     IO, IO, kind)
+			);
+     if (!cld0) goto nada;
+
+     /* twiddle transforms: there are 2 x mcount1 x vl of these
+	(where 2 corresponds to the real and imaginary parts) ...
+        the 2 x mcount1 loops are combined if mstart=0 and mcount=(m+2)/2. */
+     cld = X(mkplan_d)(plnr, 
+			X(mkproblem_rdft_1_d)(
+			     X(mktensor_1d)(r, m * s, m * s),
+			     X(mktensor_3d)(2, mstride * s, mstride * s,
+					    mcount1, s, s, 
+					    vl, vs, vs),
+			     IO + s * mstart1, IO + s * mstart1, kind)
+	                );
+     if (!cld) goto nada;
+     
+     pln = MKPLAN_HC2HC(P, &padt, (kind == R2HC) ? apply_dit : apply_dif);
+     pln->cld = cld;
+     pln->cld0 = cld0;
+     pln->r = r;
+     pln->m = m;
+     pln->s = s;
+     pln->vl = vl;
+     pln->vs = vs;
+     pln->td = 0;
+     pln->mstart1 = mstart1;
+     pln->mcount1 = mcount1;
+
+     {
+	  double n0 = 0.5 * (r - 1) * (2 * mcount1) * vl;
+	  pln->super.super.ops = cld->ops;
+	  pln->super.super.ops.mul += (kind == R2HC ? 5.0 : 7.0) * n0;
+	  pln->super.super.ops.add += 4.0 * n0;
+	  pln->super.super.ops.other += 11.0 * n0;
+     }
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cld0);
+     return (plan *) 0;
+}
+
+static void regsolver(planner *plnr, INT r)
+{
+     S *slv = (S *)X(mksolver_hc2hc)(sizeof(S), r, mkcldw);
+     REGISTER_SOLVER(plnr, &(slv->super));
+     if (X(mksolver_hc2hc_hook)) {
+	  slv = (S *)X(mksolver_hc2hc_hook)(sizeof(S), r, mkcldw);
+	  REGISTER_SOLVER(plnr, &(slv->super));
+     }
+}
+
+void X(hc2hc_generic_register)(planner *p)
+{
+     regsolver(p, 0);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/hc2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/hc2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "hc2hc.h"
+
+hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior) = 0;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     plan *cldw;
+     INT r;
+} P;
+
+static void apply_dit(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+     plan_hc2hc *cldw;
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, I, O);
+
+     cldw = (plan_hc2hc *) ego->cldw;
+     cldw->apply(ego->cldw, O);
+}
+
+static void apply_dif(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+     plan_hc2hc *cldw;
+
+     cldw = (plan_hc2hc *) ego->cldw;
+     cldw->apply(ego->cldw, I);
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, I, O);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldw, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldw);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rdft-ct-%s/%D%(%p%)%(%p%))",
+	      ego->super.apply == apply_dit ? "dit" : "dif",
+	      ego->r, ego->cldw, ego->cld);
+}
+
+static int applicable0(const hc2hc_solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     INT r;
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1 
+
+	     && (/* either the problem is R2HC, which is solved by DIT */
+		  (p->kind[0] == R2HC)
+		  ||
+		  /* or the problem is HC2R, in which case it is solved
+		     by DIF, which destroys the input */
+		  (p->kind[0] == HC2R && 
+		   (p->I == p->O || !NO_DESTROY_INPUTP(plnr))))
+		  
+	     && ((r = X(choose_radix)(ego->r, p->sz->dims[0].n)) > 0)
+	     && p->sz->dims[0].n > r);
+}
+
+int X(hc2hc_applicable)(const hc2hc_solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_rdft *p;
+
+     if (!applicable0(ego, p_, plnr))
+          return 0;
+
+     p = (const problem_rdft *) p_;
+
+     return (0
+	     || p->vecsz->rnk == 0
+	     || !NO_VRECURSEP(plnr)
+	  );
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const hc2hc_solver *ego = (const hc2hc_solver *) ego_;
+     const problem_rdft *p;
+     P *pln = 0;
+     plan *cld = 0, *cldw = 0;
+     INT n, r, m, v, ivs, ovs;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (NO_NONTHREADEDP(plnr) || !X(hc2hc_applicable)(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_rdft *) p_;
+     d = p->sz->dims;
+     n = d[0].n;
+     r = X(choose_radix)(ego->r, n);
+     m = n / r;
+
+     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
+
+     switch (p->kind[0]) {
+	 case R2HC:
+	      cldw = ego->mkcldw(ego, 
+				 R2HC, r, m, d[0].os, v, ovs, 0, (m+2)/2, 
+				 p->O, plnr);
+	      if (!cldw) goto nada;
+
+	      cld = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft_d)(
+				     X(mktensor_1d)(m, r * d[0].is, d[0].os),
+				     X(mktensor_2d)(r, d[0].is, m * d[0].os,
+						    v, ivs, ovs),
+				     p->I, p->O, p->kind)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_RDFT(P, &padt, apply_dit);
+	      break;
+
+	 case HC2R:
+	      cldw = ego->mkcldw(ego,
+				 HC2R, r, m, d[0].is, v, ivs, 0, (m+2)/2, 
+				 p->I, plnr);
+	      if (!cldw) goto nada;
+
+	      cld = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft_d)(
+				     X(mktensor_1d)(m, d[0].is, r * d[0].os),
+				     X(mktensor_2d)(r, m * d[0].is, d[0].os,
+						    v, ivs, ovs),
+				     p->I, p->O, p->kind)
+		   );
+	      if (!cld) goto nada;
+	      
+	      pln = MKPLAN_RDFT(P, &padt, apply_dif);
+	      break;
+
+	 default: 
+	      A(0);
+     }
+
+     pln->cld = cld;
+     pln->cldw = cldw;
+     pln->r = r;
+     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);
+
+     /* inherit could_prune_now_p attribute from cldw */
+     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldw);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     hc2hc_solver *slv = (hc2hc_solver *)X(mksolver)(size, &sadt);
+     slv->r = r;
+     slv->mkcldw = mkcldw;
+     return slv;
+}
+
+plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt, hc2hcapply apply)
+{
+     plan_hc2hc *ego;
+
+     ego = (plan_hc2hc *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/hc2hc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/hc2hc.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "rdft.h"
+
+typedef void (*hc2hcapply) (const plan *ego, R *IO);
+typedef struct hc2hc_solver_s hc2hc_solver;
+typedef plan *(*hc2hc_mkinferior)(const hc2hc_solver *ego,
+			    rdft_kind kind, INT r, INT m, INT s, 
+			    INT vl, INT vs, INT mstart, INT mcount,
+			    R *IO, planner *plnr);
+
+typedef struct {
+     plan super;
+     hc2hcapply apply;
+} plan_hc2hc;
+
+extern plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt, 
+			     hc2hcapply apply);
+
+#define MKPLAN_HC2HC(type, adt, apply) \
+  (type *)X(mkplan_hc2hc)(sizeof(type), adt, apply)
+
+struct hc2hc_solver_s {
+     solver super;
+     INT r;
+
+     hc2hc_mkinferior mkcldw;
+};
+
+hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw);
+extern hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior);
+
+void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet, 
+			       const hc2hc_desc *desc);
+
+int X(hc2hc_applicable)(const hc2hc_solver *, const problem *, planner *);
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/indirect.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/indirect.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+/* solvers/plans for vectors of small RDFT's that cannot be done
+   in-place directly.  Use a rank-0 plan to rearrange the data
+   before or after the transform.  Can also change an out-of-place
+   plan into a copy + in-place (where the in-place transform
+   is e.g. unit stride). */
+
+/* FIXME: merge with rank-geq2.c(?), since this is just a special case
+   of a rank split where the first/second transform has rank 0. */
+
+#include "rdft.h"
+
+typedef problem *(*mkcld_t) (const problem_rdft *p);
+
+typedef struct {
+     rdftapply apply;
+     problem *(*mkcld)(const problem_rdft *p);
+     const char *nam;
+} ndrct_adt;
+
+typedef struct {
+     solver super;
+     const ndrct_adt *adt;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cldcpy, *cld;
+     const S *slv;
+} P;
+
+/*-----------------------------------------------------------------------*/
+/* first rearrange, then transform */
+static void apply_before(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+          plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+          cldcpy->apply(ego->cldcpy, I, O);
+     }
+     {
+          plan_rdft *cld = (plan_rdft *) ego->cld;
+          cld->apply(ego->cld, O, O);
+     }
+}
+
+static problem *mkcld_before(const problem_rdft *p)
+{
+     return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
+				X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
+				p->O, p->O, p->kind);
+}
+
+static const ndrct_adt adt_before =
+{
+     apply_before, mkcld_before, "rdft-indirect-before"
+};
+
+/*-----------------------------------------------------------------------*/
+/* first transform, then rearrange */
+
+static void apply_after(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+          plan_rdft *cld = (plan_rdft *) ego->cld;
+          cld->apply(ego->cld, I, I);
+     }
+     {
+          plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+          cldcpy->apply(ego->cldcpy, I, O);
+     }
+}
+
+static problem *mkcld_after(const problem_rdft *p)
+{
+     return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
+				X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
+				p->I, p->I, p->kind);
+}
+
+static const ndrct_adt adt_after =
+{
+     apply_after, mkcld_after, "rdft-indirect-after"
+};
+
+/*-----------------------------------------------------------------------*/
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldcpy);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldcpy, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->slv;
+     p->print(p, "(%s%(%p%)%(%p%))", s->adt->nam, ego->cld, ego->cldcpy);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     return (1
+	     && FINITE_RNK(p->vecsz->rnk)
+
+	     /* problem must be a nontrivial transform, not just a copy */
+	     && p->sz->rnk > 0
+
+	     && (0
+
+		 /* problem must be in-place & require some
+		    rearrangement of the data */
+		 || (p->I == p->O
+		     && !(X(tensor_inplace_strides2)(p->sz, p->vecsz)))
+
+		 /* or problem must be out of place, transforming
+		    from stride 1/2 to bigger stride, for apply_after */
+		 || (p->I != p->O && ego->adt->apply == apply_after
+		     && !NO_DESTROY_INPUTP(plnr)
+		     && X(tensor_min_istride)(p->sz) <= 2
+		     && X(tensor_min_ostride)(p->sz) > 2)
+			  
+		 /* or problem must be out of place, transforming
+		    to stride 1/2 from bigger stride, for apply_before */
+		 || (p->I != p->O && ego->adt->apply == apply_before
+		     && X(tensor_min_ostride)(p->sz) <= 2
+		     && X(tensor_min_istride)(p->sz) > 2)
+			  
+		  )
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr)
+{
+     if (!applicable0(ego_, p_, plnr)) return 0;
+	  
+     if (NO_INDIRECT_OP_P(plnr)) {
+	  const problem_rdft *p = (const problem_rdft *)p_;
+	  if (p->I != p->O) return 0;
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     const S *ego = (const S *) ego_;
+     P *pln;
+     plan *cld = 0, *cldcpy = 0;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *) 0;
+
+     cldcpy = X(mkplan_d)(plnr,
+			  X(mkproblem_rdft_0_d)(
+			       X(tensor_append)(p->vecsz, p->sz),
+			       p->I, p->O));
+     if (!cldcpy) goto nada;
+
+     cld = X(mkplan_f_d)(plnr, ego->adt->mkcld(p), NO_BUFFERING, 0, 0);
+     if (!cld) goto nada;
+
+     pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->slv = ego;
+     X(ops_add)(&cld->ops, &cldcpy->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldcpy);
+     return (plan *)0;
+}
+
+static solver *mksolver(const ndrct_adt *adt)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->adt = adt;
+     return &(slv->super);
+}
+
+void X(rdft_indirect_register)(planner *p)
+{
+     unsigned i;
+     static const ndrct_adt *const adts[] = {
+	  &adt_before, &adt_after
+     };
+
+     for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
+          REGISTER_SOLVER(p, mksolver(adts[i]));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/khc2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/khc2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ct-hc2c.h"
+
+void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
+		       hc2c_kind hc2ckind)
+{
+     X(regsolver_hc2c_direct)(p, codelet, desc, hc2ckind);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/khc2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/khc2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "hc2hc.h"
+
+void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc)
+{
+     X(regsolver_hc2hc_direct)(p, codelet, desc);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/kr2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/kr2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc)
+{
+     REGISTER_SOLVER(p, X(mksolver_rdft_r2c_direct)(codelet, desc));
+     REGISTER_SOLVER(p, X(mksolver_rdft_r2c_directbuf)(codelet, desc));
+     REGISTER_SOLVER(p, X(mksolver_rdft2_direct)(codelet, desc));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/kr2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/kr2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc)
+{
+     REGISTER_SOLVER(p, X(mksolver_rdft_r2r_direct)(codelet, desc));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/nop.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/nop.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for vrank -infty RDFTs (nothing to do) */
+
+#include "rdft.h"
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     UNUSED(ego_);
+     UNUSED(I);
+     UNUSED(O);
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+     return 0
+	  /* case 1 : -infty vector rank */
+	  || (p->vecsz->rnk == RNK_MINFTY)
+
+	  /* case 2 : rank-0 in-place rdft */
+	  || (1
+	      && p->sz->rnk == 0
+	      && FINITE_RNK(p->vecsz->rnk)
+	      && p->O == p->I
+	      && X(tensor_inplace_strides)(p->vecsz)
+	       );
+}
+
+static void print(const plan *ego, printer *p)
+{
+     UNUSED(ego);
+     p->print(p, "(rdft-nop)");
+}
+
+static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
+{
+     static const plan_adt padt = {
+	  X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
+     };
+     plan_rdft *pln;
+
+     UNUSED(plnr);
+
+     if (!applicable(ego, p))
+          return (plan *) 0;
+     pln = MKPLAN_RDFT(plan_rdft, &padt, apply);
+     X(ops_zero)(&pln->super.ops);
+
+     return &(pln->super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void X(rdft_nop_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/nop2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/nop2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for vrank -infty RDFT2s (nothing to do), as well as in-place
+   rank-0 HC2R.  Note that in-place rank-0 R2HC is *not* a no-op, because
+   we have to set the imaginary parts of the output to zero. */
+
+#include "rdft.h"
+
+static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     UNUSED(ego_);
+     UNUSED(r0);
+     UNUSED(r1);
+     UNUSED(cr);
+     UNUSED(ci);
+}
+
+static int applicable(const solver *ego_, const problem *p_)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     UNUSED(ego_);
+
+     return(0
+	    /* case 1 : -infty vector rank */
+	    || (p->vecsz->rnk == RNK_MINFTY)
+		 
+	    /* case 2 : rank-0 in-place rdft, except that
+	       R2HC is not a no-op because it sets the imaginary
+	       part to 0 */
+	    || (1
+		&& p->kind != R2HC
+		&& p->sz->rnk == 0
+		&& FINITE_RNK(p->vecsz->rnk)
+		&& (p->r0 == p->cr)
+		&& X(rdft2_inplace_strides)(p, RNK_MINFTY)
+		 ));
+}
+
+static void print(const plan *ego, printer *p)
+{
+     UNUSED(ego);
+     p->print(p, "(rdft2-nop)");
+}
+
+static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
+{
+     static const plan_adt padt = {
+	  X(rdft2_solve), X(null_awake), print, X(plan_null_destroy)
+     };
+     plan_rdft2 *pln;
+
+     UNUSED(plnr);
+
+     if (!applicable(ego, p))
+          return (plan *) 0;
+     pln = MKPLAN_RDFT2(plan_rdft2, &padt, apply);
+     X(ops_zero)(&pln->super.ops);
+
+     return &(pln->super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void X(rdft2_nop_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/plan.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/plan.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply)
+{
+     plan_rdft *ego;
+
+     ego = (plan_rdft *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/plan2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/plan2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply)
+{
+     plan_rdft2 *ego;
+
+     ego = (plan_rdft2 *) X(mkplan)(size, adt);
+     ego->apply = apply;
+
+     return &(ego->super);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/problem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/problem.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+#include <stddef.h>
+
+static void destroy(problem *ego_)
+{
+     problem_rdft *ego = (problem_rdft *) ego_;
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(ego->kind);
+#endif
+     X(tensor_destroy2)(ego->vecsz, ego->sz);
+     X(ifree)(ego_);
+}
+
+static void kind_hash(md5 *m, const rdft_kind *kind, int rnk)
+{
+     int i;
+     for (i = 0; i < rnk; ++i)
+	  X(md5int)(m, kind[i]);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     X(md5puts)(m, "rdft");
+     X(md5int)(m, p->I == p->O);
+     kind_hash(m, p->kind, p->sz->rnk);
+     X(md5int)(m, X(alignment_of)(p->I));
+     X(md5int)(m, X(alignment_of)(p->O));
+     X(tensor_md5)(m, p->sz);
+     X(tensor_md5)(m, p->vecsz);
+}
+
+static void recur(const iodim *dims, int rnk, R *I)
+{
+     if (rnk == RNK_MINFTY)
+          return;
+     else if (rnk == 0)
+          I[0] = K(0.0);
+     else if (rnk > 0) {
+          INT i, n = dims[0].n, is = dims[0].is;
+
+	  if (rnk == 1) {
+	       /* this case is redundant but faster */
+	       for (i = 0; i < n; ++i)
+		    I[i * is] = K(0.0);
+	  } else {
+	       for (i = 0; i < n; ++i)
+		    recur(dims + 1, rnk - 1, I + i * is);
+	  }
+     }
+}
+
+void X(rdft_zerotens)(tensor *sz, R *I)
+{
+     recur(sz->dims, sz->rnk, I);
+}
+
+#define KSTR_LEN 8
+
+const char *X(rdft_kind_str)(rdft_kind kind)
+{
+     static const char kstr[][KSTR_LEN] = {
+	  "r2hc", "r2hc01", "r2hc10", "r2hc11",
+	  "hc2r", "hc2r01", "hc2r10", "hc2r11",
+	  "dht",
+	  "redft00", "redft01", "redft10", "redft11",
+	  "rodft00", "rodft01", "rodft10", "rodft11"
+     };
+     A(kind >= 0 && kind < sizeof(kstr) / KSTR_LEN);
+     return kstr[kind];
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_rdft *ego = (const problem_rdft *) ego_;
+     int i;
+     p->print(p, "(rdft %d %D %T %T", 
+	      X(alignment_of)(ego->I),
+	      (INT)(ego->O - ego->I), 
+	      ego->sz,
+	      ego->vecsz);
+     for (i = 0; i < ego->sz->rnk; ++i)
+	  p->print(p, " %d", (int)ego->kind[i]);
+     p->print(p, ")");
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_rdft *ego = (const problem_rdft *) ego_;
+     tensor *sz = X(tensor_append)(ego->vecsz, ego->sz);
+     X(rdft_zerotens)(sz, UNTAINT(ego->I));
+     X(tensor_destroy)(sz);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_RDFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+/* Dimensions of size 1 that are not REDFT/RODFT are no-ops and can be
+   eliminated.  REDFT/RODFT unit dimensions often have factors of 2.0
+   and suchlike from normalization and phases, although in principle
+   these constant factors from different dimensions could be combined. */
+static int nontrivial(const iodim *d, rdft_kind kind)
+{
+     return (d->n > 1 || kind == R2HC11 || kind == HC2R11
+	     || (REODFT_KINDP(kind) && kind != REDFT01 && kind != RODFT01));
+}
+
+problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
+			   R *I, R *O, const rdft_kind *kind)
+{
+     problem_rdft *ego;
+     int rnk = sz->rnk;
+     int i;
+
+     A(X(tensor_kosherp)(sz));
+     A(X(tensor_kosherp)(vecsz));
+     A(FINITE_RNK(sz->rnk));
+
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     if (I == O && !X(tensor_inplace_locations)(sz, vecsz))
+	  return X(mkproblem_unsolvable)();
+
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          A(sz->dims[i].n > 0);
+          if (nontrivial(sz->dims + i, kind[i]))
+               ++rnk;
+     }
+
+#if defined(STRUCT_HACK_KR)
+     ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
+					 + sizeof(rdft_kind)
+					 * (rnk > 0 ? rnk - 1 : 0), &padt);
+#elif defined(STRUCT_HACK_C99)
+     ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
+					 + sizeof(rdft_kind) * rnk, &padt);
+#else
+     ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft), &padt);
+     ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
+#endif
+
+     /* do compression and sorting as in X(tensor_compress), but take
+	transform kind into account (sigh) */
+     ego->sz = X(mktensor)(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          if (nontrivial(sz->dims + i, kind[i])) {
+	       ego->kind[rnk] = kind[i];
+               ego->sz->dims[rnk++] = sz->dims[i];
+	  }
+     }
+     for (i = 0; i + 1 < rnk; ++i) {
+	  int j;
+	  for (j = i + 1; j < rnk; ++j)
+	       if (X(dimcmp)(ego->sz->dims + i, ego->sz->dims + j) > 0) {
+		    iodim dswap;
+		    rdft_kind kswap;
+		    dswap = ego->sz->dims[i];
+		    ego->sz->dims[i] = ego->sz->dims[j];
+		    ego->sz->dims[j] = dswap;
+		    kswap = ego->kind[i];
+		    ego->kind[i] = ego->kind[j];
+		    ego->kind[j] = kswap;
+	       }
+     }
+
+     for (i = 0; i < rnk; ++i)
+	  if (ego->sz->dims[i].n == 2 && (ego->kind[i] == REDFT00
+					  || ego->kind[i] == DHT
+					  || ego->kind[i] == HC2R))
+	       ego->kind[i] = R2HC; /* size-2 transforms are equivalent */
+
+     ego->vecsz = X(tensor_compress_contiguous)(vecsz);
+     ego->I = I;
+     ego->O = O;
+
+     A(FINITE_RNK(ego->sz->rnk));
+
+     return &(ego->super);
+}
+
+/* Same as X(mkproblem_rdft), but also destroy input tensors. */
+problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
+			     R *I, R *O, const rdft_kind *kind)
+{
+     problem *p = X(mkproblem_rdft)(sz, vecsz, I, O, kind);
+     X(tensor_destroy2)(vecsz, sz);
+     return p;
+}
+
+/* As above, but for rnk <= 1 only and takes a scalar kind parameter */
+problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
+			     R *I, R *O, rdft_kind kind)
+{
+     A(sz->rnk <= 1);
+     return X(mkproblem_rdft)(sz, vecsz, I, O, &kind);
+}
+
+problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
+			       R *I, R *O, rdft_kind kind)
+{
+     A(sz->rnk <= 1);
+     return X(mkproblem_rdft_d)(sz, vecsz, I, O, &kind);
+}
+
+/* create a zero-dimensional problem */
+problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O)
+{
+     return X(mkproblem_rdft_d)(X(mktensor_0d)(), vecsz, I, O, 
+				(const rdft_kind *)0);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/problem2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/problem2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "dft.h"
+#include "rdft.h"
+#include <stddef.h>
+
+static void destroy(problem *ego_)
+{
+     problem_rdft2 *ego = (problem_rdft2 *) ego_;
+     X(tensor_destroy2)(ego->vecsz, ego->sz);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     X(md5puts)(m, "rdft2");
+     X(md5int)(m, p->r0 == p->cr);
+     X(md5INT)(m, p->r1 - p->r0);
+     X(md5INT)(m, p->ci - p->cr);
+     X(md5int)(m, X(alignment_of)(p->r0));
+     X(md5int)(m, X(alignment_of)(p->r1));
+     X(md5int)(m, X(alignment_of)(p->cr)); 
+     X(md5int)(m, X(alignment_of)(p->ci)); 
+     X(md5int)(m, p->kind);
+     X(tensor_md5)(m, p->sz);
+     X(tensor_md5)(m, p->vecsz);
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_rdft2 *ego = (const problem_rdft2 *) ego_;
+     p->print(p, "(rdft2 %d %d %T %T)", 
+	      (int)(ego->cr == ego->r0), 
+	      (int)(ego->kind),
+	      ego->sz,
+	      ego->vecsz);
+}
+
+static void recur(const iodim *dims, int rnk, R *I0, R *I1)
+{
+     if (rnk == RNK_MINFTY)
+          return;
+     else if (rnk == 0)
+          I0[0] = K(0.0);
+     else if (rnk > 0) {
+          INT i, n = dims[0].n, is = dims[0].is;
+
+	  if (rnk == 1) {
+	       for (i = 0; i < n - 1; i += 2) {
+		    *I0 = *I1 = K(0.0);
+		    I0 += is; I1 += is;
+	       }
+	       if (i < n) 
+		    *I0 = K(0.0);
+	  } else {
+	       for (i = 0; i < n; ++i)
+		    recur(dims + 1, rnk - 1, I0 + i * is, I1 + i * is);
+	  }
+     }
+}
+
+static void vrecur(const iodim *vdims, int vrnk,
+		   const iodim *dims, int rnk, R *I0, R *I1)
+{
+     if (vrnk == RNK_MINFTY)
+          return;
+     else if (vrnk == 0)
+	  recur(dims, rnk, I0, I1);
+     else if (vrnk > 0) {
+          INT i, n = vdims[0].n, is = vdims[0].is;
+
+	  for (i = 0; i < n; ++i)
+	       vrecur(vdims + 1, vrnk - 1, 
+		      dims, rnk, I0 + i * is, I1 + i * is);
+     }
+}
+
+INT X(rdft2_complex_n)(INT real_n, rdft_kind kind)
+{
+     switch (kind) {
+	 case R2HC:
+	 case HC2R:
+	      return (real_n / 2) + 1;
+	 case R2HCII:
+	 case HC2RIII:
+	      return (real_n + 1) / 2;
+	 default:
+	      /* can't happen */
+	      A(0);
+	      return 0;
+     }
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_rdft2 *ego = (const problem_rdft2 *) ego_;
+     if (R2HC_KINDP(ego->kind)) {
+	  /* FIXME: can we avoid the double recursion somehow? */
+	  vrecur(ego->vecsz->dims, ego->vecsz->rnk, 
+		 ego->sz->dims, ego->sz->rnk, 
+		 UNTAINT(ego->r0), UNTAINT(ego->r1));
+     } else {
+	  tensor *sz;
+	  tensor *sz2 = X(tensor_copy)(ego->sz);
+	  int rnk = sz2->rnk;
+	  if (rnk > 0) /* ~half as many complex outputs */
+	       sz2->dims[rnk-1].n = 
+		    X(rdft2_complex_n)(sz2->dims[rnk-1].n, ego->kind);
+	  sz = X(tensor_append)(ego->vecsz, sz2);
+	  X(tensor_destroy)(sz2);
+	  X(dft_zerotens)(sz, UNTAINT(ego->cr), UNTAINT(ego->ci));
+	  X(tensor_destroy)(sz);
+     }
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_RDFT2,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
+			    R *r0, R *r1, R *cr, R *ci,
+			    rdft_kind kind)
+{
+     problem_rdft2 *ego;
+
+     A(kind == R2HC || kind == R2HCII || kind == HC2R || kind == HC2RIII);
+     A(X(tensor_kosherp)(sz));
+     A(X(tensor_kosherp)(vecsz));
+     A(FINITE_RNK(sz->rnk));
+
+     /* require in-place problems to use r0 == cr */
+     if (UNTAINT(r0) == UNTAINT(ci))
+	  return X(mkproblem_unsolvable)();
+
+     /* FIXME: should check UNTAINT(r1) == UNTAINT(cr) but
+	only if odd elements exist, which requires compressing the 
+	tensors first */
+
+     if (UNTAINT(r0) == UNTAINT(cr)) 
+	  r0 = cr = JOIN_TAINT(r0, cr);
+
+     ego = (problem_rdft2 *)X(mkproblem)(sizeof(problem_rdft2), &padt);
+
+     if (sz->rnk > 1) { /* have to compress rnk-1 dims separately, ugh */
+	  tensor *szc = X(tensor_copy_except)(sz, sz->rnk - 1);
+	  tensor *szr = X(tensor_copy_sub)(sz, sz->rnk - 1, 1);
+	  tensor *szcc = X(tensor_compress)(szc);
+	  if (szcc->rnk > 0)
+	       ego->sz = X(tensor_append)(szcc, szr);
+	  else
+	       ego->sz = X(tensor_compress)(szr);
+	  X(tensor_destroy2)(szc, szr); X(tensor_destroy)(szcc);
+     } else {
+	  ego->sz = X(tensor_compress)(sz);
+     }
+     ego->vecsz = X(tensor_compress_contiguous)(vecsz);
+     ego->r0 = r0;
+     ego->r1 = r1;
+     ego->cr = cr;
+     ego->ci = ci;
+     ego->kind = kind;
+
+     A(FINITE_RNK(ego->sz->rnk));
+     return &(ego->super);
+
+}
+
+/* Same as X(mkproblem_rdft2), but also destroy input tensors. */
+problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
+			      R *r0, R *r1, R *cr, R *ci, rdft_kind kind)
+{
+     problem *p = X(mkproblem_rdft2)(sz, vecsz, r0, r1, cr, ci, kind);
+     X(tensor_destroy2)(vecsz, sz);
+     return p;
+}
+
+/* Same as X(mkproblem_rdft2_d), but with only one R pointer.
+   Used by the API. */
+problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
+					R *r0, R *cr, R *ci, rdft_kind kind)
+{
+     problem *p;
+     int rnk = sz->rnk;
+     R *r1;
+
+     if (rnk == 0)
+	  r1 = r0;
+     else if (R2HC_KINDP(kind)) {
+	  r1 = r0 + sz->dims[rnk-1].is;
+	  sz->dims[rnk-1].is *= 2;
+     } else {
+	  r1 = r0 + sz->dims[rnk-1].os;
+	  sz->dims[rnk-1].os *= 2;
+     }
+
+     p = X(mkproblem_rdft2)(sz, vecsz, r0, r1, cr, ci, kind);
+     X(tensor_destroy2)(vecsz, sz);
+     return p;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rank-geq2-rdft2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rank-geq2-rdft2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for RDFT2 of rank >= 2 (multidimensional) */
+
+#include "rdft.h"
+#include "dft.h"
+
+typedef struct {
+     solver super;
+     int spltrnk;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_dft super;
+     plan *cldr, *cldc;
+     const S *solver;
+} P;
+
+static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+	  plan_rdft2 *cldr = (plan_rdft2 *) ego->cldr;
+	  cldr->apply((plan *) cldr, r0, r1, cr, ci);
+     }
+     
+     {
+	  plan_dft *cldc = (plan_dft *) ego->cldc;
+	  cldc->apply((plan *) cldc, cr, ci, cr, ci);
+     }
+}
+
+static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+
+     {
+	  plan_dft *cldc = (plan_dft *) ego->cldc;
+	  cldc->apply((plan *) cldc, ci, cr, ci, cr);
+     }
+
+     {
+	  plan_rdft2 *cldr = (plan_rdft2 *) ego->cldr;
+	  cldr->apply((plan *) cldr, r0, r1, cr, ci);
+     }
+     
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldr, wakefulness);
+     X(plan_awake)(ego->cldc, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldr);
+     X(plan_destroy_internal)(ego->cldc);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(rdft2-rank>=2/%d%(%p%)%(%p%))", 
+	      s->spltrnk, ego->cldr, ego->cldc);
+}
+ 
+static int picksplit(const S *ego, const tensor *sz, int *rp)
+{
+     A(sz->rnk > 1); /* cannot split rnk <= 1 */
+     if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
+          return 0;
+     *rp += 1; /* convert from dim. index to rank */
+     if (*rp >= sz->rnk) /* split must reduce rank */
+          return 0;
+     return 1;
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *rp,
+		       const planner *plnr)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     const S *ego = (const S *)ego_;
+     return (1
+	     && FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
+
+	     /* FIXME: multidimensional R2HCII ? */
+	     && (p->kind == R2HC || p->kind == HC2R)
+
+	     && p->sz->rnk >= 2
+	     && picksplit(ego, p->sz, rp)
+	     && (0
+
+		 /* can work out-of-place, but HC2R destroys input */
+		 || (p->r0 != p->cr && 
+		     (p->kind == R2HC || !NO_DESTROY_INPUTP(plnr)))
+
+		 /* FIXME: what are sufficient conditions for inplace? */
+		 || (p->r0 == p->cr))
+	  );
+}
+
+/* TODO: revise this. */
+static int applicable(const solver *ego_, const problem *p_, 
+		      const planner *plnr, int *rp)
+{
+     const S *ego = (const S *)ego_;
+
+     if (!applicable0(ego_, p_, rp, plnr)) return 0;
+
+     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
+          return 0;
+
+     if (NO_UGLYP(plnr)) {
+	  const problem_rdft2 *p = (const problem_rdft2 *) p_;
+
+	  /* Heuristic: if the vector stride is greater than the transform
+	     size, don't use (prefer to do the vector loop first with a
+	     vrank-geq1 plan). */
+	  if (p->vecsz->rnk > 0 &&
+	      X(tensor_min_stride)(p->vecsz) 
+	      > X(rdft2_tensor_max_index)(p->sz, p->kind))
+	       return 0;
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft2 *p;
+     P *pln;
+     plan *cldr = 0, *cldc = 0;
+     tensor *sz1, *sz2, *vecszi, *sz2i;
+     int spltrnk;
+     inplace_kind k;
+     problem *cldp;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &spltrnk))
+          return (plan *) 0;
+
+     p = (const problem_rdft2 *) p_;
+     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
+
+     k = p->kind == R2HC ? INPLACE_OS : INPLACE_IS;
+     vecszi = X(tensor_copy_inplace)(p->vecsz, k);
+     sz2i = X(tensor_copy_inplace)(sz2, k);
+
+     /* complex data is ~half of real */
+     sz2i->dims[sz2i->rnk - 1].n = sz2i->dims[sz2i->rnk - 1].n/2 + 1;
+
+     cldr = X(mkplan_d)(plnr, 
+		       X(mkproblem_rdft2_d)(X(tensor_copy)(sz2),
+					    X(tensor_append)(p->vecsz, sz1),
+					    p->r0, p->r1,
+					    p->cr, p->ci, p->kind));
+     if (!cldr) goto nada;
+
+     if (p->kind == R2HC)
+	  cldp = X(mkproblem_dft_d)(X(tensor_copy_inplace)(sz1, k),
+				    X(tensor_append)(vecszi, sz2i),
+				    p->cr, p->ci, p->cr, p->ci);
+     else /* HC2R must swap re/im parts to get IDFT */
+	  cldp = X(mkproblem_dft_d)(X(tensor_copy_inplace)(sz1, k),
+				    X(tensor_append)(vecszi, sz2i),
+				    p->ci, p->cr, p->ci, p->cr);
+     cldc = X(mkplan_d)(plnr, cldp);
+     if (!cldc) goto nada;
+
+     pln = MKPLAN_RDFT2(P, &padt, p->kind == R2HC ? apply_r2hc : apply_hc2r);
+
+     pln->cldr = cldr;
+     pln->cldc = cldc;
+
+     pln->solver = ego;
+     X(ops_add)(&cldr->ops, &cldc->ops, &pln->super.super.ops);
+
+     X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldr);
+     X(plan_destroy_internal)(cldc);
+     X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int spltrnk, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->spltrnk = spltrnk;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft2_rank_geq2_register)(planner *p)
+{
+     int i;
+     static const int buddies[] = { 1, 0, -2 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+
+     /* FIXME: Should we try more buddies?  See also dft/rank-geq2. */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rank-geq2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rank-geq2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for RDFT of rank >= 2 (multidimensional) */
+
+/* FIXME: this solver cannot strictly be applied to multidimensional
+   DHTs, since the latter are not separable...up to rnk-1 additional
+   post-processing passes may be required.  See also:
+
+   R. N. Bracewell, O. Buneman, H. Hao, and J. Villasenor, "Fast
+   two-dimensional Hartley transform," Proc. IEEE 74, 1282-1283 (1986).
+
+   H. Hao and R. N. Bracewell, "A three-dimensional DFT algorithm
+   using the fast Hartley transform," Proc. IEEE 75(2), 264-266 (1987).
+*/
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     int spltrnk;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_rdft super;
+
+     plan *cld1, *cld2;
+     const S *solver;
+} P;
+
+/* Compute multi-dimensional RDFT by applying the two cld plans
+   (lower-rnk RDFTs). */
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2;
+
+     cld1 = (plan_rdft *) ego->cld1;
+     cld1->apply(ego->cld1, I, O);
+
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, O, O);
+}
+
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(rdft-rank>=2/%d%(%p%)%(%p%))",
+	      s->spltrnk, ego->cld1, ego->cld2);
+}
+
+static int picksplit(const S *ego, const tensor *sz, int *rp)
+{
+     A(sz->rnk > 1); /* cannot split rnk <= 1 */
+     if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
+	  return 0;
+     *rp += 1; /* convert from dim. index to rank */
+     if (*rp >= sz->rnk) /* split must reduce rank */
+	  return 0;
+     return 1;
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *rp)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     const S *ego = (const S *)ego_;
+     return (1
+	     && FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
+	     && p->sz->rnk >= 2
+	     && picksplit(ego, p->sz, rp)
+	  );
+}
+
+/* TODO: revise this. */
+static int applicable(const solver *ego_, const problem *p_, 
+		      const planner *plnr, int *rp)
+{
+     const S *ego = (const S *)ego_;
+
+     if (!applicable0(ego_, p_, rp)) return 0;
+
+     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
+	  return 0;
+
+     if (NO_UGLYP(plnr)) {
+	  /* Heuristic: if the vector stride is greater than the transform
+	     sz, don't use (prefer to do the vector loop first with a
+	     vrank-geq1 plan). */
+	  const problem_rdft *p = (const problem_rdft *) p_;
+
+	  if (p->vecsz->rnk > 0 &&
+	      X(tensor_min_stride)(p->vecsz) > X(tensor_max_index)(p->sz))
+	       return 0;
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     tensor *sz1, *sz2, *vecszi, *sz2i;
+     int spltrnk;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &spltrnk))
+          return (plan *) 0;
+
+     p = (const problem_rdft *) p_;
+     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
+     vecszi = X(tensor_copy_inplace)(p->vecsz, INPLACE_OS);
+     sz2i = X(tensor_copy_inplace)(sz2, INPLACE_OS);
+
+     cld1 = X(mkplan_d)(plnr, 
+			X(mkproblem_rdft_d)(X(tensor_copy)(sz2),
+					    X(tensor_append)(p->vecsz, sz1),
+					    p->I, p->O, p->kind + spltrnk));
+     if (!cld1) goto nada;
+
+     cld2 = X(mkplan_d)(plnr, 
+			X(mkproblem_rdft_d)(
+			     X(tensor_copy_inplace)(sz1, INPLACE_OS),
+			     X(tensor_append)(vecszi, sz2i),
+			     p->O, p->O, p->kind));
+     if (!cld2) goto nada;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+
+     pln->solver = ego;
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int spltrnk, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->spltrnk = spltrnk;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft_rank_geq2_register)(planner *p)
+{
+     int i;
+     static const int buddies[] = { 1, 0, -2 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+
+     /* FIXME: Should we try more buddies?  See also dft/rank-geq2. */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rank0-rdft2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rank0-rdft2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for rank-0 RDFT2 (copy operations, plus setting 0 imag. parts) */
+
+#include "rdft.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>		/* for memcpy() */
+#endif
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     INT vl;
+     INT ivs, ovs;
+     plan *cldcpy;
+} P;
+
+static int applicable(const problem *p_)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     return (1
+	     && p->sz->rnk == 0
+	     && (p->kind == HC2R
+		 ||
+		 (1
+		  && p->kind == R2HC
+		
+		  && p->vecsz->rnk <= 1
+  
+		  && ((p->r0 != p->cr) 
+		      || 
+		      X(rdft2_inplace_strides)(p, RNK_MINFTY)) ))
+	  );
+}
+
+static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+
+     UNUSED(r1); /* rank-0 has no real odd-index elements */
+
+     for (i = 4; i <= vl; i += 4) {
+          R x0, x1, x2, x3;
+          x0 = *r0; r0 += ivs;
+          x1 = *r0; r0 += ivs;
+          x2 = *r0; r0 += ivs;
+          x3 = *r0; r0 += ivs;
+          *cr = x0; cr += ovs;
+	  *ci = K(0.0); ci += ovs;
+          *cr = x1; cr += ovs;
+	  *ci = K(0.0); ci += ovs;
+          *cr = x2; cr += ovs;
+	  *ci = K(0.0); ci += ovs;
+	  *cr = x3; cr += ovs;
+	  *ci = K(0.0); ci += ovs;
+     }
+     for (; i < vl + 4; ++i) {
+          R x0;
+          x0 = *r0; r0 += ivs;
+          *cr = x0; cr += ovs;
+	  *ci = K(0.0); ci += ovs;
+     }
+}
+
+/* in-place r2hc rank-0: set imaginary parts of output to 0 */
+static void apply_r2hc_inplace(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl;
+     INT ovs = ego->ovs;
+
+     UNUSED(r0); UNUSED(r1); UNUSED(cr);
+
+     for (i = 4; i <= vl; i += 4) {
+	  *ci = K(0.0); ci += ovs;
+	  *ci = K(0.0); ci += ovs;
+	  *ci = K(0.0); ci += ovs;
+	  *ci = K(0.0); ci += ovs;
+     }
+     for (; i < vl + 4; ++i) {
+	  *ci = K(0.0); ci += ovs;
+     }
+}
+
+/* a rank-0 HC2R rdft2 problem is just a copy from cr to r0,
+   so we can use a rank-0 rdft plan */
+static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+     UNUSED(ci);
+     UNUSED(r1);
+     cldcpy->apply((plan *) cldcpy, cr, r0);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     if (ego->cldcpy)
+	  X(plan_awake)(ego->cldcpy, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     if (ego->cldcpy)
+	  X(plan_destroy_internal)(ego->cldcpy);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     if (ego->cldcpy)
+	  p->print(p, "(rdft2-hc2r-rank0%(%p%))", ego->cldcpy);
+     else
+	  p->print(p, "(rdft2-r2hc-rank0%v)", ego->vl);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const problem_rdft2 *p;
+     plan *cldcpy = (plan *) 0;
+     P *pln;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     UNUSED(ego_);
+
+     if (!applicable(p_))
+          return (plan *) 0;
+
+     p = (const problem_rdft2 *) p_;
+
+     if (p->kind == HC2R) {
+	  cldcpy = X(mkplan_d)(plnr,
+			       X(mkproblem_rdft_0_d)(
+				    X(tensor_copy)(p->vecsz),
+				    p->cr, p->r0));
+	  if (!cldcpy) return (plan *) 0;
+     }
+
+     pln = MKPLAN_RDFT2(P, &padt, 
+			p->kind == R2HC ? 
+			(p->r0 == p->cr ? apply_r2hc_inplace : apply_r2hc) 
+			: apply_hc2r);
+     
+     if (p->kind == R2HC)
+	  X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     pln->cldcpy = cldcpy;
+
+     if (p->kind == R2HC) {
+	  /* vl loads, 2*vl stores */
+	  X(ops_other)(3 * pln->vl, &pln->super.super.ops);
+     }
+     else {
+	  pln->super.super.ops = cldcpy->ops;
+     }
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(rdft2_rank0_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rank0.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rank0.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* plans for rank-0 RDFTs (copy operations) */
+
+#include "rdft.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>		/* for memcpy() */
+#endif
+
+#define MAXRNK 32 /* FIXME: should malloc() */
+
+typedef struct {
+     plan_rdft super;
+     INT vl;
+     int rnk;
+     iodim d[MAXRNK];
+     const char *nam;
+} P;
+
+typedef struct {
+     solver super;
+     rdftapply apply;
+     int (*applicable)(const P *pln, const problem_rdft *p);
+     const char *nam;
+} S;
+
+/* copy up to MAXRNK dimensions from problem into plan.  If a
+   contiguous dimension exists, save its length in pln->vl */
+static int fill_iodim(P *pln, const problem_rdft *p)
+{
+     int i;
+     const tensor *vecsz = p->vecsz;
+
+     pln->vl = 1;
+     pln->rnk = 0;
+     for (i = 0; i < vecsz->rnk; ++i) {
+	  /* extract contiguous dimensions */
+	  if (pln->vl == 1 &&
+	      vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) 
+	       pln->vl = vecsz->dims[i].n;
+	  else if (pln->rnk == MAXRNK) 
+	       return 0;
+	  else 
+	       pln->d[pln->rnk++] = vecsz->dims[i];
+     }
+
+     return 1;
+}
+
+/* generic higher-rank copy routine, calls cpy2d() to do the real work */
+static void copy(const iodim *d, int rnk, INT vl,
+		 R *I, R *O,
+		 cpy2d_func cpy2d)
+{
+     A(rnk >= 2);
+     if (rnk == 2)
+	  cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl);
+     else {
+	  INT i;
+	  for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os)
+	       copy(d + 1, rnk - 1, vl, I, O, cpy2d);
+     }
+}
+
+/* FIXME: should be more general */
+static int transposep(const P *pln)
+{
+     int i;
+
+     for (i = 0; i < pln->rnk - 2; ++i) 
+	  if (pln->d[i].is != pln->d[i].os)
+	       return 0;
+     
+     return (pln->d[i].n == pln->d[i+1].n &&
+	     pln->d[i].is == pln->d[i+1].os &&
+	     pln->d[i].os == pln->d[i+1].is);
+}
+
+/* generic higher-rank transpose routine, calls transpose2d() to do
+ * the real work */
+static void transpose(const iodim *d, int rnk, INT vl,
+		      R *I,
+		      transpose_func transpose2d)
+{
+     A(rnk >= 2);
+     if (rnk == 2)
+	  transpose2d(I, d[0].n, d[0].is, d[0].os, vl);
+     else {
+	  INT i;
+	  for (i = 0; i < d[0].n; ++i, I += d[0].is)
+	       transpose(d + 1, rnk - 1, vl, I, transpose2d);
+     }
+}
+
+/**************************************************************/
+/* rank 0,1,2, out of place, iterative */
+static void apply_iter(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+
+     switch (ego->rnk) {
+	 case 0: 
+	      X(cpy1d)(I, O, ego->vl, 1, 1, 1);
+	      break;
+	 case 1:
+	      X(cpy1d)(I, O, 
+		       ego->d[0].n, ego->d[0].is, ego->d[0].os, 
+		       ego->vl);
+	      break;
+	 default:
+	      copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci));
+	      break;
+     }
+}
+
+static int applicable_iter(const P *pln, const problem_rdft *p)
+{
+     UNUSED(pln);
+     return (p->I != p->O);
+}
+
+/**************************************************************/
+/* out of place, write contiguous output */
+static void apply_cpy2dco(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co));
+}
+
+static int applicable_cpy2dco(const P *pln, const problem_rdft *p)
+{
+     int rnk = pln->rnk;
+     return (1
+	     && p->I != p->O
+	     && rnk >= 2
+
+	     /* must not duplicate apply_iter */
+	     && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is)
+		 ||
+		 X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os))
+	  );
+}
+
+/**************************************************************/
+/* out of place, tiled, no buffering */
+static void apply_tiled(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled));
+}
+
+static int applicable_tiled(const P *pln, const problem_rdft *p)
+{
+     return (1
+	     && p->I != p->O
+	     && pln->rnk >= 2
+
+	     /* somewhat arbitrary */
+	     && X(compute_tilesz)(pln->vl, 1) > 4
+	  );
+}
+
+/**************************************************************/
+/* out of place, tiled, with buffer */
+static void apply_tiledbuf(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf));
+}
+
+#define applicable_tiledbuf applicable_tiled
+
+/**************************************************************/
+/* rank 0, out of place, using memcpy */
+static void apply_memcpy(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+
+     A(ego->rnk == 0);
+     memcpy(O, I, ego->vl * sizeof(R));
+}
+
+static int applicable_memcpy(const P *pln, const problem_rdft *p)
+{
+     return (1
+	     && p->I != p->O 
+	     && pln->rnk == 0
+	     && pln->vl > 2 /* do not bother memcpy-ing complex numbers */
+	     );
+}
+
+/**************************************************************/
+/* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place
+   transposes of vl-tuples ... for large vl it should be more
+   efficient to use memcpy than the tiled stuff). */
+
+static void memcpy_loop(INT cpysz, int rnk, const iodim *d, R *I, R *O)
+{
+     INT i, n = d->n, is = d->is, os = d->os;
+     if (rnk == 1)
+	  for (i = 0; i < n; ++i, I += is, O += os)
+	       memcpy(O, I, cpysz);
+     else {
+	  --rnk; ++d;
+	  for (i = 0; i < n; ++i, I += is, O += os)
+	       memcpy_loop(cpysz, rnk, d, I, O);
+     }
+}
+
+static void apply_memcpy_loop(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O);
+}
+
+static int applicable_memcpy_loop(const P *pln, const problem_rdft *p)
+{
+     return (p->I != p->O
+	     && pln->rnk > 0
+             && pln->vl > 2 /* do not bother memcpy-ing complex numbers */);
+}
+
+/**************************************************************/
+/* rank 2, in place, square transpose, iterative */
+static void apply_ip_sq(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     UNUSED(O);
+     transpose(ego->d, ego->rnk, ego->vl, I, X(transpose));
+}
+
+
+static int applicable_ip_sq(const P *pln, const problem_rdft *p)
+{
+     return (1
+	     && p->I == p->O
+	     && pln->rnk >= 2
+	     && transposep(pln));
+}
+
+/**************************************************************/
+/* rank 2, in place, square transpose, tiled */
+static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     UNUSED(O);
+     transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled));
+}
+
+static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p)
+{
+     return (1
+	     && applicable_ip_sq(pln, p)
+
+	     /* somewhat arbitrary */
+	     && X(compute_tilesz)(pln->vl, 2) > 4
+	  );
+}
+
+/**************************************************************/
+/* rank 2, in place, square transpose, tiled, buffered */
+static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     UNUSED(O);
+     transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf));
+}
+
+#define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled
+
+/**************************************************************/
+static int applicable(const S *ego, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     P pln;
+     return (1
+	     && p->sz->rnk == 0
+	     && FINITE_RNK(p->vecsz->rnk)
+	     && fill_iodim(&pln, p)
+	     && ego->applicable(&pln, p)
+	  );
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     int i;
+     p->print(p, "(%s/%D", ego->nam, ego->vl);
+     for (i = 0; i < ego->rnk; ++i)
+	  p->print(p, "%v", ego->d[i].n);
+     p->print(p, ")");
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const problem_rdft *p;
+     const S *ego = (const S *) ego_;
+     P *pln;
+     int retval;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
+     };
+
+     UNUSED(plnr);
+
+     if (!applicable(ego, p_))
+          return (plan *) 0;
+
+     p = (const problem_rdft *) p_;
+     pln = MKPLAN_RDFT(P, &padt, ego->apply);
+
+     retval = fill_iodim(pln, p);
+     (void)retval; /* UNUSED unless DEBUG */
+     A(retval);
+     A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */
+     pln->nam = ego->nam;
+
+     /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */
+     X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+
+void X(rdft_rank0_register)(planner *p)
+{
+     unsigned i;
+     static struct {
+	  rdftapply apply;
+	  int (*applicable)(const P *, const problem_rdft *);
+	  const char *nam;
+     } tab[] = {
+	  { apply_memcpy,   applicable_memcpy,   "rdft-rank0-memcpy" },
+	  { apply_memcpy_loop,   applicable_memcpy_loop,  
+	    "rdft-rank0-memcpy-loop" },
+	  { apply_iter,     applicable_iter,     "rdft-rank0-iter-ci" },
+	  { apply_cpy2dco,  applicable_cpy2dco,  "rdft-rank0-iter-co" },
+	  { apply_tiled,    applicable_tiled,    "rdft-rank0-tiled" },
+	  { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" },
+	  { apply_ip_sq,    applicable_ip_sq,    "rdft-rank0-ip-sq" },
+	  { 
+	       apply_ip_sq_tiled,
+	       applicable_ip_sq_tiled,
+	       "rdft-rank0-ip-sq-tiled" 
+	  },
+	  { 
+	       apply_ip_sq_tiledbuf,
+	       applicable_ip_sq_tiledbuf,
+	       "rdft-rank0-ip-sq-tiledbuf" 
+	  },
+     };
+
+     for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) {
+	  static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+	  S *slv = MKSOLVER(S, &sadt);
+	  slv->apply = tab[i].apply;
+	  slv->applicable = tab[i].applicable;
+	  slv->nam = tab[i].nam;
+	  REGISTER_SOLVER(p, &(slv->super));
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft-dht.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft-dht.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Solve an R2HC/HC2R problem via post/pre processing of a DHT.  This
+   is mainly useful because we can use Rader to compute DHTs of prime
+   sizes.  It also allows us to express hc2r problems in terms of r2hc
+   (via dht-r2hc), and to do hc2r problems without destroying the input. */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     INT is, os;
+     INT n;
+} P;
+
+static void apply_r2hc(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT os;
+     INT i, n;
+
+     {
+	  plan_rdft *cld = (plan_rdft *) ego->cld;
+	  cld->apply((plan *) cld, I, O);
+     }
+
+     n = ego->n;
+     os = ego->os;
+     for (i = 1; i < n - i; ++i) {
+	  E a, b;
+	  a = K(0.5) * O[os * i];
+	  b = K(0.5) * O[os * (n - i)];
+	  O[os * i] = a + b;
+#if FFT_SIGN == -1
+	  O[os * (n - i)] = b - a;
+#else
+	  O[os * (n - i)] = a - b;
+#endif
+     }
+}
+
+/* hc2r, destroying input as usual */
+static void apply_hc2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is;
+     INT i, n = ego->n;
+
+     for (i = 1; i < n - i; ++i) {
+	  E a, b;
+	  a = I[is * i];
+	  b = I[is * (n - i)];
+#if FFT_SIGN == -1
+	  I[is * i] = a - b;
+	  I[is * (n - i)] = a + b;
+#else
+	  I[is * i] = a + b;
+	  I[is * (n - i)] = a - b;
+#endif
+     }
+
+     {
+	  plan_rdft *cld = (plan_rdft *) ego->cld;
+	  cld->apply((plan *) cld, I, O);
+     }
+}
+
+/* hc2r, without destroying input */
+static void apply_hc2r_save(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+
+     O[0] = I[0];
+     for (i = 1; i < n - i; ++i) {
+	  E a, b;
+	  a = I[is * i];
+	  b = I[is * (n - i)];
+#if FFT_SIGN == -1
+	  O[os * i] = a - b;
+	  O[os * (n - i)] = a + b;
+#else
+	  O[os * i] = a + b;
+	  O[os * (n - i)] = a - b;
+#endif
+     }
+     if (i == n - i)
+	  O[os * i] = I[is * i];
+
+     {
+	  plan_rdft *cld = (plan_rdft *) ego->cld;
+	  cld->apply((plan *) cld, O, O);
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%s-dht-%D%(%p%))", 
+	      ego->super.apply == apply_r2hc ? "r2hc" : "hc2r",
+	      ego->n, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     && (p->kind[0] == R2HC || p->kind[0] == HC2R)
+
+	     /* hack: size-2 DHT etc. are defined as being equivalent
+		to size-2 R2HC in problem.c, so we need this to prevent
+		infinite loops for size 2 in EXHAUSTIVE mode: */
+	     && p->sz->dims[0].n > 2
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p_, 
+		      const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p_));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     problem *cldp;
+     plan *cld;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     if (p->kind[0] == R2HC || !NO_DESTROY_INPUTP(plnr))
+	  cldp = X(mkproblem_rdft_1)(p->sz, p->vecsz, p->I, p->O, DHT);
+     else {
+	  tensor *sz = X(tensor_copy_inplace)(p->sz, INPLACE_OS);
+	  cldp = X(mkproblem_rdft_1)(sz, p->vecsz, p->O, p->O, DHT);
+	  X(tensor_destroy)(sz);
+     }
+     cld = X(mkplan_d)(plnr, cldp);
+     if (!cld) return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, p->kind[0] == R2HC ? 
+		       apply_r2hc : (NO_DESTROY_INPUTP(plnr) ?
+				     apply_hc2r_save : apply_hc2r));
+     pln->n = p->sz->dims[0].n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     
+     pln->super.super.ops = cld->ops;
+     pln->super.super.ops.other += 4 * ((pln->n - 1)/2);
+     pln->super.super.ops.add += 2 * ((pln->n - 1)/2);
+     if (p->kind[0] == R2HC)
+	  pln->super.super.ops.mul += 2 * ((pln->n - 1)/2);
+     if (pln->super.apply == apply_hc2r_save)
+	  pln->super.super.ops.other += 2 + (pln->n % 2 ? 0 : 2);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(rdft_dht_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __RDFT_H__
+#define __RDFT_H__
+
+#include "ifftw.h"
+#include "codelet-rdft.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     tensor *sz, *vecsz;
+     R *I, *O;
+#if defined(STRUCT_HACK_KR)
+     rdft_kind kind[1];
+#elif defined(STRUCT_HACK_C99)
+     rdft_kind kind[];
+#else
+     rdft_kind *kind;
+#endif
+} problem_rdft;
+
+void X(rdft_zerotens)(tensor *sz, R *I);
+problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
+			   R *I, R *O, const rdft_kind *kind);
+problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
+			     R *I, R *O, const rdft_kind *kind);
+problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O);
+problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
+			     R *I, R *O, rdft_kind kind);
+problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
+			       R *I, R *O, rdft_kind kind);
+
+const char *X(rdft_kind_str)(rdft_kind kind);
+
+/* solve.c: */
+void X(rdft_solve)(const plan *ego_, const problem *p_);
+
+/* plan.c: */
+typedef void (*rdftapply) (const plan *ego, R *I, R *O);
+
+typedef struct {
+     plan super;
+     rdftapply apply;
+} plan_rdft;
+
+plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply);
+
+#define MKPLAN_RDFT(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+/* various solvers */
+
+solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc);
+solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc);
+solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc);
+
+void X(rdft_rank0_register)(planner *p);
+void X(rdft_vrank3_transpose_register)(planner *p);
+void X(rdft_rank_geq2_register)(planner *p);
+void X(rdft_indirect_register)(planner *p);
+void X(rdft_vrank_geq1_register)(planner *p);
+void X(rdft_buffered_register)(planner *p);
+void X(rdft_generic_register)(planner *p);
+void X(rdft_rader_hc2hc_register)(planner *p);
+void X(rdft_dht_register)(planner *p);
+void X(dht_r2hc_register)(planner *p);
+void X(dht_rader_register)(planner *p);
+void X(dft_r2hc_register)(planner *p);
+void X(rdft_nop_register)(planner *p);
+void X(hc2hc_generic_register)(planner *p);
+
+/****************************************************************************/
+/* problem2.c: */
+/* 
+   An RDFT2 problem transforms a 1d real array r[n] with stride is/os
+   to/from an "unpacked" complex array {rio,iio}[n/2 + 1] with stride
+   os/is.  R0 points to the first even element of the real array.  
+   R1 points to the first odd element of the real array.
+
+   Strides on the real side of the transform express distances
+   between consecutive elements of the same array (even or odd).
+   E.g., for a contiguous input
+
+     R0 R1 R2 R3 ...
+
+   the input stride would be 2, not 1.  This convention is necessary
+   for hc2c codelets to work, since they transpose even/odd with
+   real/imag.
+   
+   Multidimensional transforms use complex DFTs for the
+   noncontiguous dimensions.  vecsz has the usual interpretation.  
+*/
+typedef struct {
+     problem super;
+     tensor *sz;
+     tensor *vecsz;
+     R *r0, *r1;
+     R *cr, *ci;
+     rdft_kind kind; /* assert(kind < DHT) */
+} problem_rdft2;
+
+problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
+			    R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
+problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
+			      R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
+problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
+					R *r, R *cr, R *ci, rdft_kind kind);
+int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim);
+INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k);
+void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs);
+INT X(rdft2_complex_n)(INT real_n, rdft_kind kind);
+
+/* verify.c: */
+void X(rdft2_verify)(plan *pln, const problem_rdft2 *p, int rounds);
+
+/* solve.c: */
+void X(rdft2_solve)(const plan *ego_, const problem *p_);
+
+/* plan.c: */
+typedef void (*rdft2apply) (const plan *ego, R *r0, R *r1, R *cr, R *ci);
+
+typedef struct {
+     plan super;
+     rdft2apply apply;
+} plan_rdft2;
+
+plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply);
+
+#define MKPLAN_RDFT2(type, adt, apply) \
+  (type *)X(mkplan_rdft2)(sizeof(type), adt, apply)
+
+/* various solvers */
+
+solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc);
+
+void X(rdft2_vrank_geq1_register)(planner *p);
+void X(rdft2_buffered_register)(planner *p);
+void X(rdft2_rdft_register)(planner *p);
+void X(rdft2_nop_register)(planner *p);
+void X(rdft2_rank0_register)(planner *p);
+void X(rdft2_rank_geq2_register)(planner *p);
+
+/****************************************************************************/
+
+/* configurations */
+void X(rdft_conf_standard)(planner *p);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __RDFT_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft2-inplace-strides.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft2-inplace-strides.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+/* Check if the vecsz/sz strides are consistent with the problem
+   being in-place for vecsz.dim[vdim], or for all dimensions
+   if vdim == RNK_MINFTY.  We can't just use tensor_inplace_strides
+   because rdft transforms have the unfortunate property of
+   differing input and output sizes.   This routine is not
+   exhaustive; we only return 1 for the most common case.  */
+int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim)
+{
+     INT N, Nc;
+     INT rs, cs;
+     int i;
+     
+     for (i = 0; i + 1 < p->sz->rnk; ++i)
+	  if (p->sz->dims[i].is != p->sz->dims[i].os)
+	       return 0;
+
+     if (!FINITE_RNK(p->vecsz->rnk) || p->vecsz->rnk == 0)
+	  return 1;
+     if (!FINITE_RNK(vdim)) { /* check all vector dimensions */
+	  for (vdim = 0; vdim < p->vecsz->rnk; ++vdim)
+	       if (!X(rdft2_inplace_strides)(p, vdim))
+		    return 0;
+	  return 1;
+     }
+
+     A(vdim < p->vecsz->rnk);
+     if (p->sz->rnk == 0)
+	  return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os);
+
+     N = X(tensor_sz)(p->sz);
+     Nc = (N / p->sz->dims[p->sz->rnk-1].n) *
+	  (p->sz->dims[p->sz->rnk-1].n/2 + 1);
+     X(rdft2_strides)(p->kind, p->sz->dims + p->sz->rnk - 1, &rs, &cs);
+
+     /* the factor of 2 comes from the fact that RS is the stride
+	of p->r0 and p->r1, which is twice as large as the strides
+	in the r2r case */
+     return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os
+	    && (X(iabs)(2 * p->vecsz->dims[vdim].os)
+		>= X(imax)(2 * Nc * X(iabs)(cs), N * X(iabs)(rs))));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft2-rdft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft2-rdft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft2 super;
+
+     plan *cld, *cldrest;
+     INT n, vl, nbuf, bufdist;
+     INT cs, ivs, ovs;
+} P;
+
+/***************************************************************************/
+
+/* FIXME: have alternate copy functions that push a vector loop inside
+   the n loops? */
+
+/* copy halfcomplex array r (contiguous) to complex (strided) array rio/iio. */
+static void hc2c(INT n, R *r, R *rio, R *iio, INT os)
+{
+     INT i;
+
+     rio[0] = r[0];
+     iio[0] = 0;
+
+     for (i = 1; i + i < n; ++i) {
+	  rio[i * os] = r[i];
+	  iio[i * os] = r[n - i];
+     }
+
+     if (i + i == n) {	/* store the Nyquist frequency */
+	  rio[i * os] = r[i];
+	  iio[i * os] = K(0.0);
+     }
+}
+
+/* reverse of hc2c */
+static void c2hc(INT n, R *rio, R *iio, INT is, R *r)
+{
+     INT i;
+
+     r[0] = rio[0];
+
+     for (i = 1; i + i < n; ++i) {
+	  r[i] = rio[i * is];
+	  r[n - i] = iio[i * is];
+     }
+
+     if (i + i == n)		/* store the Nyquist frequency */
+	  r[i] = rio[i * is];
+}
+
+/***************************************************************************/
+
+static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     INT i, j, vl = ego->vl, nbuf = ego->nbuf, bufdist = ego->bufdist;
+     INT n = ego->n;
+     INT ivs = ego->ivs, ovs = ego->ovs, os = ego->cs;
+     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
+     plan_rdft2 *cldrest;
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* transform to bufs: */
+          cld->apply((plan *) cld, r0, bufs);
+	  r0 += ivs * nbuf; r1 += ivs * nbuf;
+
+          /* copy back */
+	  for (j = 0; j < nbuf; ++j, cr += ovs, ci += ovs)
+	       hc2c(n, bufs + j*bufdist, cr, ci, os);
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft2 *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
+}
+
+static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     INT i, j, vl = ego->vl, nbuf = ego->nbuf, bufdist = ego->bufdist;
+     INT n = ego->n;
+     INT ivs = ego->ivs, ovs = ego->ovs, is = ego->cs;
+     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
+     plan_rdft2 *cldrest;
+
+     for (i = nbuf; i <= vl; i += nbuf) {
+          /* copy to bufs */
+	  for (j = 0; j < nbuf; ++j, cr += ivs, ci += ivs)
+	       c2hc(n, cr, ci, is, bufs + j*bufdist);
+
+          /* transform back: */
+          cld->apply((plan *) cld, bufs, r0);
+	  r0 += ovs * nbuf; r1 += ovs * nbuf;
+     }
+
+     X(ifree)(bufs);
+
+     /* Do the remaining transforms, if any: */
+     cldrest = (plan_rdft2 *) ego->cldrest;
+     cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldrest, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldrest);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rdft2-rdft-%s-%D%v/%D-%D%(%p%)%(%p%))",
+	      ego->super.apply == apply_r2hc ? "r2hc" : "hc2r",
+              ego->n, ego->nbuf,
+              ego->vl, ego->bufdist % ego->n,
+              ego->cld, ego->cldrest);
+}
+
+static INT min_nbuf(const problem_rdft2 *p, INT n, INT vl)
+{
+     INT is, os, ivs, ovs;
+
+     if (p->r0 != p->cr)
+	  return 1;
+     if (X(rdft2_inplace_strides(p, RNK_MINFTY)))
+	  return 1;
+     A(p->vecsz->rnk == 1); /*  rank 0 and MINFTY are inplace */
+
+     X(rdft2_strides)(p->kind, p->sz->dims, &is, &os);
+     X(rdft2_strides)(p->kind, p->vecsz->dims, &ivs, &ovs);
+     
+     /* handle one potentially common case: "contiguous" real and
+	complex arrays, which overlap because of the differing sizes. */
+     if (n * X(iabs)(is) <= X(iabs)(ivs)
+	 && (n/2 + 1) * X(iabs)(os) <= X(iabs)(ovs)
+	 && ( ((p->cr - p->ci) <= X(iabs)(os)) || 
+	      ((p->ci - p->cr) <= X(iabs)(os)) )
+	 && ivs > 0 && ovs > 0) {
+	  INT vsmin = X(imin)(ivs, ovs);
+	  INT vsmax = X(imax)(ivs, ovs);
+	  return(((vsmax - vsmin) * vl + vsmin - 1) / vsmin);
+     }
+
+     return vl; /* punt: just buffer the whole vector */
+}
+
+static int applicable0(const problem *p_, const S *ego, const planner *plnr)
+{
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     UNUSED(ego);
+     return(1
+	    && p->vecsz->rnk <= 1
+	    && p->sz->rnk == 1
+
+	    /* FIXME: does it make sense to do R2HCII ? */
+	    && (p->kind == R2HC || p->kind == HC2R)
+
+	    /* real strides must allow for reduction to rdft */
+	    && (2 * (p->r1 - p->r0) ==
+		(((p->kind == R2HC) ? p->sz->dims[0].is : p->sz->dims[0].os)))
+
+	    && !(X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
+	  );
+}
+
+static int applicable(const problem *p_, const S *ego, const planner *plnr)
+{
+     const problem_rdft2 *p;
+
+     if (NO_BUFFERINGP(plnr)) return 0;
+
+     if (!applicable0(p_, ego, plnr)) return 0;
+
+     p = (const problem_rdft2 *) p_;
+     if (NO_UGLYP(plnr)) {
+	  if (p->r0 != p->cr) return 0;
+	  if (X(toobig)(p->sz->dims[0].n)) return 0;
+     }
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     P *pln;
+     plan *cld = (plan *) 0;
+     plan *cldrest = (plan *) 0;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     R *bufs = (R *) 0;
+     INT nbuf = 0, bufdist, n, vl;
+     INT ivs, ovs, rs, id, od;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!applicable(p_, ego, plnr))
+          goto nada;
+
+     n = p->sz->dims[0].n;
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+
+     nbuf = X(imax)(X(nbuf)(n, vl, 0), min_nbuf(p, n, vl));
+     bufdist = X(bufdist)(n, vl);
+     A(nbuf > 0);
+
+     /* initial allocation for the purpose of planning */
+     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
+
+     id = ivs * (nbuf * (vl / nbuf));
+     od = ovs * (nbuf * (vl / nbuf));
+
+     if (p->kind == R2HC) {
+	  cld = X(mkplan_f_d)(
+	       plnr,
+	       X(mkproblem_rdft_d)(
+		    X(mktensor_1d)(n, p->sz->dims[0].is/2, 1),
+		    X(mktensor_1d)(nbuf, ivs, bufdist),
+		    TAINT(p->r0, ivs * nbuf), bufs, &p->kind),
+	       0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
+	  if (!cld) goto nada;
+	  X(ifree)(bufs); bufs = 0;
+
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft2_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->r0 + id, p->r1 + id, 
+				     p->cr + od, p->ci + od,
+				     p->kind));
+	  if (!cldrest) goto nada;
+
+	  pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
+     } else {
+	  A(p->kind == HC2R);
+	  cld = X(mkplan_f_d)(
+	       plnr,
+	       X(mkproblem_rdft_d)(
+		    X(mktensor_1d)(n, 1, p->sz->dims[0].os/2),
+		    X(mktensor_1d)(nbuf, bufdist, ovs),
+		    bufs, TAINT(p->r0, ovs * nbuf), &p->kind),
+	       0, 0, NO_DESTROY_INPUT); /* always ok to destroy bufs */
+	  if (!cld) goto nada;
+	  X(ifree)(bufs); bufs = 0;
+
+	  cldrest = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft2_d)(
+				     X(tensor_copy)(p->sz),
+				     X(mktensor_1d)(vl % nbuf, ivs, ovs),
+				     p->r0 + od, p->r1 + od, 
+				     p->cr + id, p->ci + id,
+				     p->kind));
+	  if (!cldrest) goto nada;
+	  pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);
+     }
+
+     pln->cld = cld;
+     pln->cldrest = cldrest;
+     pln->n = n;
+     pln->vl = vl;
+     pln->ivs = ivs;
+     pln->ovs = ovs;
+     X(rdft2_strides)(p->kind, &p->sz->dims[0], &rs, &pln->cs);
+     pln->nbuf = nbuf;
+     pln->bufdist = bufdist;
+
+     X(ops_madd)(vl / nbuf, &cld->ops, &cldrest->ops,
+		 &pln->super.super.ops);
+     pln->super.super.ops.other += (p->kind == R2HC ? (n + 2) : n) * vl;
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(bufs);
+     X(plan_destroy_internal)(cldrest);
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(rdft2_rdft_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft2-strides.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft2-strides.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "rdft.h"
+
+/* Deal with annoyance because the tensor (is,os) applies to
+   (r,rio/iio) for R2HC and vice-versa for HC2R.  We originally had
+   (is,os) always apply to (r,rio/iio), but this causes other
+   headaches with the tensor functions. */
+void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs)
+{
+     if (kind == R2HC) {
+	  *rs = d->is;
+	  *cs = d->os;
+     }
+     else {
+	  A(kind == HC2R);
+	  *rs = d->os;
+	  *cs = d->is;
+     }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/rdft2-tensor-max-index.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/rdft2-tensor-max-index.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+/* like X(tensor_max_index), but takes into account the special n/2+1
+   final dimension for the complex output/input of an R2HC/HC2R transform. */
+INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k)
+{
+     int i;
+     INT n = 0;
+
+     A(FINITE_RNK(sz->rnk));
+     for (i = 0; i + 1 < sz->rnk; ++i) {
+          const iodim *p = sz->dims + i;
+          n += (p->n - 1) * X(imax)(X(iabs)(p->is), X(iabs)(p->os));
+     }
+     if (i < sz->rnk) {
+	  const iodim *p = sz->dims + i;
+	  INT is, os;
+	  X(rdft2_strides)(k, p, &is, &os);
+	  n += X(imax)((p->n - 1) * X(iabs)(is), (p->n/2) * X(iabs)(os));
+     }
+     return n;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,7 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft
+SUBDIRS = r2cf r2cb r2r
+noinst_LTLIBRARIES = librdft_scalar.la
+
+librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c	\
+r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,689 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = rdft/scalar
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_scalar_la_LIBADD =
+am_librdft_scalar_la_OBJECTS = hfb.lo r2c.lo r2r.lo hc2c.lo
+librdft_scalar_la_OBJECTS = $(am_librdft_scalar_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_scalar_la_SOURCES)
+DIST_SOURCES = $(librdft_scalar_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft
+SUBDIRS = r2cf r2cb r2r
+noinst_LTLIBRARIES = librdft_scalar.la
+librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c	\
+r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h
+
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/scalar/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_scalar.la: $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_DEPENDENCIES) $(EXTRA_librdft_scalar_la_DEPENDENCIES) 
+	$(LINK)  $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hfb.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2r.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES)
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hb.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hb.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_hb_genus)
+extern const hc2hc_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hc2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hc2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-rdft.h"
+#include "hc2cf.h"
+
+static int okp(const R *Rp, const R *Ip, const R *Rm, const R *Im, 
+	       INT rs, INT mb, INT me, INT ms, 
+	       const planner *plnr)
+{
+     UNUSED(Rp); UNUSED(Ip); UNUSED(Rm); UNUSED(Im);
+     UNUSED(rs); UNUSED(mb); UNUSED(me); UNUSED(ms); UNUSED(plnr);
+
+     return 1;
+}
+
+const hc2c_genus GENUS = { okp, R2HC, 1 };
+
+#undef GENUS
+#include "hc2cb.h"
+
+const hc2c_genus GENUS = { okp, HC2R, 1 };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hc2cb.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hc2cb.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_hc2cb_genus)
+extern const hc2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hc2cf.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hc2cf.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_hc2cf_genus)
+extern const hc2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hf.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hf.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_hf_genus)
+extern const hc2hc_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/hfb.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/hfb.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-rdft.h"
+#include "hf.h"
+
+const hc2hc_genus GENUS = { R2HC, 1 };
+
+#undef GENUS
+#include "hb.h"
+
+const hc2hc_genus GENUS = { HC2R, 1 };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2c.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2c.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-rdft.h"
+
+#include "r2cf.h"
+const kr2c_genus GENUS = { R2HC, 1 };
+#undef GENUS
+
+#include "r2cfII.h"
+const kr2c_genus GENUS = { R2HCII, 1 };
+#undef GENUS
+
+#include "r2cb.h"
+const kr2c_genus GENUS = { HC2R, 1 };
+#undef GENUS
+
+#include "r2cbIII.h"
+const kr2c_genus GENUS = { HC2RIII, 1 };
+#undef GENUS
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_r2cb_genus)
+extern const kr2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,110 @@
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+noinst_LTLIBRARIES = librdft_scalar_r2cb.la
+
+###########################################################################
+# r2cb_<n> is a hard-coded complex-to-real FFT of size <n> (base cases
+# of real-output FFT recursion)
+R2CB = r2cb_2.c r2cb_3.c r2cb_4.c r2cb_5.c r2cb_6.c r2cb_7.c r2cb_8.c	\
+r2cb_9.c r2cb_10.c r2cb_11.c r2cb_12.c r2cb_13.c r2cb_14.c r2cb_15.c	\
+r2cb_16.c r2cb_32.c r2cb_64.c r2cb_128.c r2cb_20.c r2cb_25.c
+# r2cb_30.c r2cb_40.c r2cb_50.c
+
+###########################################################################
+# hb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
+# step for a real-output FFT.  Every hb codelet must have a
+# corresponding r2cbIII codelet (see below)!
+HB = hb_2.c hb_3.c hb_4.c hb_5.c hb_6.c hb_7.c hb_8.c hb_9.c	\
+hb_10.c hb_12.c hb_15.c hb_16.c hb_32.c hb_64.c \
+hb_20.c hb_25.c # hb_30.c hb_40.c hb_50.c
+
+# like hb, but generates part of its trig table on the fly (good for large n)
+HB2 = hb2_4.c hb2_8.c hb2_16.c hb2_32.c \
+hb2_5.c hb2_20.c hb2_25.c
+
+# an r2cb transform where the output is shifted by half a sample (input
+# is multiplied by a phase).  This is needed as part of the DIF recursion;
+# every hb_<r> or hb2_<r> codelet should have a corresponding r2cbIII_<r>
+R2CBIII = r2cbIII_2.c r2cbIII_3.c r2cbIII_4.c r2cbIII_5.c r2cbIII_6.c	\
+r2cbIII_7.c r2cbIII_8.c r2cbIII_9.c r2cbIII_10.c r2cbIII_12.c		\
+r2cbIII_15.c r2cbIII_16.c r2cbIII_32.c r2cbIII_64.c \
+r2cbIII_20.c r2cbIII_25.c # r2cbIII_30.c r2cbIII_40.c r2cbIII_50.c
+
+###########################################################################
+# hc2cb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
+# step for a real-input FFT with rdft2-style output.  <r> must be even.
+HC2CB = hc2cb_2.c hc2cb_4.c hc2cb_6.c hc2cb_8.c hc2cb_10.c hc2cb_12.c	\
+hc2cb_16.c hc2cb_32.c \
+hc2cb_20.c # hc2cb_30.c
+
+HC2CBDFT = hc2cbdft_2.c hc2cbdft_4.c hc2cbdft_6.c hc2cbdft_8.c	\
+hc2cbdft_10.c hc2cbdft_12.c hc2cbdft_16.c hc2cbdft_32.c \
+hc2cbdft_20.c # hc2cbdft_30.c
+
+# like hc2cb, but generates part of its trig table on the fly (good
+# for large n)
+HC2CB2 = hc2cb2_4.c hc2cb2_8.c hc2cb2_16.c hc2cb2_32.c \
+hc2cb2_20.c # hc2cb2_30.c
+HC2CBDFT2 = hc2cbdft2_4.c hc2cbdft2_8.c hc2cbdft2_16.c hc2cbdft2_32.c \
+hc2cbdft2_20.c # hc2cbdft2_30.c
+
+###########################################################################
+ALL_CODELETS = $(R2CB) $(HB) $(HB2) $(R2CBIII) $(HC2CB) $(HC2CB2)	\
+$(HC2CBDFT) $(HC2CBDFT2)
+
+BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
+
+librdft_scalar_r2cb_la_SOURCES = $(BUILT_SOURCES)
+
+SOLVTAB_NAME = X(solvtab_rdft_r2cb)
+XRENAME=X
+
+# special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+FLAGS_R2CB=$(RDFT_FLAGS_COMMON) -sign 1
+FLAGS_HB=$(RDFT_FLAGS_COMMON) -sign 1
+FLAGS_HB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
+FLAGS_HC2CB=$(RDFT_FLAGS_COMMON) -sign 1
+FLAGS_HC2CB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
+FLAGS_R2CBIII=$(RDFT_FLAGS_COMMON) -sign 1
+
+r2cb_%.c:  $(CODELET_DEPS) $(GEN_R2CB)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cb_$* -include "r2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hb_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB) -n $* -dif -name hb_$* -include "hb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hb2_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB2) -n $* -dif -name hb2_$* -include "hb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+r2cbIII_%.c:  $(CODELET_DEPS) $(GEN_R2CB)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cbIII_$* -dft-III -include "r2cbIII.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cb_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB) -n $* -dif -name hc2cb_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cb2_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB2) -n $* -dif -name hc2cb2_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cbdft_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cbdft2_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft2_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,833 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = rdft/scalar/r2cb
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_scalar_r2cb_la_LIBADD =
+am__objects_1 = r2cb_2.lo r2cb_3.lo r2cb_4.lo r2cb_5.lo r2cb_6.lo \
+	r2cb_7.lo r2cb_8.lo r2cb_9.lo r2cb_10.lo r2cb_11.lo r2cb_12.lo \
+	r2cb_13.lo r2cb_14.lo r2cb_15.lo r2cb_16.lo r2cb_32.lo \
+	r2cb_64.lo r2cb_128.lo r2cb_20.lo r2cb_25.lo
+am__objects_2 = hb_2.lo hb_3.lo hb_4.lo hb_5.lo hb_6.lo hb_7.lo \
+	hb_8.lo hb_9.lo hb_10.lo hb_12.lo hb_15.lo hb_16.lo hb_32.lo \
+	hb_64.lo hb_20.lo hb_25.lo
+am__objects_3 = hb2_4.lo hb2_8.lo hb2_16.lo hb2_32.lo hb2_5.lo \
+	hb2_20.lo hb2_25.lo
+am__objects_4 = r2cbIII_2.lo r2cbIII_3.lo r2cbIII_4.lo r2cbIII_5.lo \
+	r2cbIII_6.lo r2cbIII_7.lo r2cbIII_8.lo r2cbIII_9.lo \
+	r2cbIII_10.lo r2cbIII_12.lo r2cbIII_15.lo r2cbIII_16.lo \
+	r2cbIII_32.lo r2cbIII_64.lo r2cbIII_20.lo r2cbIII_25.lo
+am__objects_5 = hc2cb_2.lo hc2cb_4.lo hc2cb_6.lo hc2cb_8.lo \
+	hc2cb_10.lo hc2cb_12.lo hc2cb_16.lo hc2cb_32.lo hc2cb_20.lo
+am__objects_6 = hc2cb2_4.lo hc2cb2_8.lo hc2cb2_16.lo hc2cb2_32.lo \
+	hc2cb2_20.lo
+am__objects_7 = hc2cbdft_2.lo hc2cbdft_4.lo hc2cbdft_6.lo \
+	hc2cbdft_8.lo hc2cbdft_10.lo hc2cbdft_12.lo hc2cbdft_16.lo \
+	hc2cbdft_32.lo hc2cbdft_20.lo
+am__objects_8 = hc2cbdft2_4.lo hc2cbdft2_8.lo hc2cbdft2_16.lo \
+	hc2cbdft2_32.lo hc2cbdft2_20.lo
+am__objects_9 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8)
+am__objects_10 = codlist.lo
+am__objects_11 = $(am__objects_9) $(am__objects_10)
+am_librdft_scalar_r2cb_la_OBJECTS = $(am__objects_11)
+librdft_scalar_r2cb_la_OBJECTS = $(am_librdft_scalar_r2cb_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_scalar_r2cb_la_SOURCES)
+DIST_SOURCES = $(librdft_scalar_r2cb_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+
+noinst_LTLIBRARIES = librdft_scalar_r2cb.la
+
+###########################################################################
+# r2cb_<n> is a hard-coded complex-to-real FFT of size <n> (base cases
+# of real-output FFT recursion)
+R2CB = r2cb_2.c r2cb_3.c r2cb_4.c r2cb_5.c r2cb_6.c r2cb_7.c r2cb_8.c	\
+r2cb_9.c r2cb_10.c r2cb_11.c r2cb_12.c r2cb_13.c r2cb_14.c r2cb_15.c	\
+r2cb_16.c r2cb_32.c r2cb_64.c r2cb_128.c r2cb_20.c r2cb_25.c
+
+# r2cb_30.c r2cb_40.c r2cb_50.c
+
+###########################################################################
+# hb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
+# step for a real-output FFT.  Every hb codelet must have a
+# corresponding r2cbIII codelet (see below)!
+HB = hb_2.c hb_3.c hb_4.c hb_5.c hb_6.c hb_7.c hb_8.c hb_9.c	\
+hb_10.c hb_12.c hb_15.c hb_16.c hb_32.c hb_64.c \
+hb_20.c hb_25.c # hb_30.c hb_40.c hb_50.c
+
+
+# like hb, but generates part of its trig table on the fly (good for large n)
+HB2 = hb2_4.c hb2_8.c hb2_16.c hb2_32.c \
+hb2_5.c hb2_20.c hb2_25.c
+
+
+# an r2cb transform where the output is shifted by half a sample (input
+# is multiplied by a phase).  This is needed as part of the DIF recursion;
+# every hb_<r> or hb2_<r> codelet should have a corresponding r2cbIII_<r>
+R2CBIII = r2cbIII_2.c r2cbIII_3.c r2cbIII_4.c r2cbIII_5.c r2cbIII_6.c	\
+r2cbIII_7.c r2cbIII_8.c r2cbIII_9.c r2cbIII_10.c r2cbIII_12.c		\
+r2cbIII_15.c r2cbIII_16.c r2cbIII_32.c r2cbIII_64.c \
+r2cbIII_20.c r2cbIII_25.c # r2cbIII_30.c r2cbIII_40.c r2cbIII_50.c
+
+
+###########################################################################
+# hc2cb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
+# step for a real-input FFT with rdft2-style output.  <r> must be even.
+HC2CB = hc2cb_2.c hc2cb_4.c hc2cb_6.c hc2cb_8.c hc2cb_10.c hc2cb_12.c	\
+hc2cb_16.c hc2cb_32.c \
+hc2cb_20.c # hc2cb_30.c
+
+HC2CBDFT = hc2cbdft_2.c hc2cbdft_4.c hc2cbdft_6.c hc2cbdft_8.c	\
+hc2cbdft_10.c hc2cbdft_12.c hc2cbdft_16.c hc2cbdft_32.c \
+hc2cbdft_20.c # hc2cbdft_30.c
+
+
+# like hc2cb, but generates part of its trig table on the fly (good
+# for large n)
+HC2CB2 = hc2cb2_4.c hc2cb2_8.c hc2cb2_16.c hc2cb2_32.c \
+hc2cb2_20.c # hc2cb2_30.c
+
+HC2CBDFT2 = hc2cbdft2_4.c hc2cbdft2_8.c hc2cbdft2_16.c hc2cbdft2_32.c \
+hc2cbdft2_20.c # hc2cbdft2_30.c
+
+
+###########################################################################
+ALL_CODELETS = $(R2CB) $(HB) $(HB2) $(R2CBIII) $(HC2CB) $(HC2CB2)	\
+$(HC2CBDFT) $(HC2CBDFT2)
+
+BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
+librdft_scalar_r2cb_la_SOURCES = $(BUILT_SOURCES)
+SOLVTAB_NAME = X(solvtab_rdft_r2cb)
+XRENAME = X
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@FLAGS_R2CB = $(RDFT_FLAGS_COMMON) -sign 1
+@MAINTAINER_MODE_TRUE@FLAGS_HB = $(RDFT_FLAGS_COMMON) -sign 1
+@MAINTAINER_MODE_TRUE@FLAGS_HB2 = $(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_HC2CB = $(RDFT_FLAGS_COMMON) -sign 1
+@MAINTAINER_MODE_TRUE@FLAGS_HC2CB2 = $(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_R2CBIII = $(RDFT_FLAGS_COMMON) -sign 1
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/r2cb/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/scalar/r2cb/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_scalar_r2cb.la: $(librdft_scalar_r2cb_la_OBJECTS) $(librdft_scalar_r2cb_la_DEPENDENCIES) $(EXTRA_librdft_scalar_r2cb_la_DEPENDENCIES) 
+	$(LINK)  $(librdft_scalar_r2cb_la_OBJECTS) $(librdft_scalar_r2cb_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hb_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cb_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdft_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cbIII_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cb_9.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic \
+	maintainer-clean-local mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@r2cb_%.c:  $(CODELET_DEPS) $(GEN_R2CB)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cb_$* -include "r2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hb_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB) -n $* -dif -name hb_$* -include "hb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hb2_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB2) -n $* -dif -name hb2_$* -include "hb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@r2cbIII_%.c:  $(CODELET_DEPS) $(GEN_R2CB)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cbIII_$* -dft-III -include "r2cbIII.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cb_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB) -n $* -dif -name hc2cb_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cb2_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB2) -n $* -dif -name hc2cb2_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cbdft_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cbdft2_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft2_$* -include "hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,183 @@
+#include "ifftw.h"
+
+
+extern void X(codelet_r2cb_2)(planner *);
+extern void X(codelet_r2cb_3)(planner *);
+extern void X(codelet_r2cb_4)(planner *);
+extern void X(codelet_r2cb_5)(planner *);
+extern void X(codelet_r2cb_6)(planner *);
+extern void X(codelet_r2cb_7)(planner *);
+extern void X(codelet_r2cb_8)(planner *);
+extern void X(codelet_r2cb_9)(planner *);
+extern void X(codelet_r2cb_10)(planner *);
+extern void X(codelet_r2cb_11)(planner *);
+extern void X(codelet_r2cb_12)(planner *);
+extern void X(codelet_r2cb_13)(planner *);
+extern void X(codelet_r2cb_14)(planner *);
+extern void X(codelet_r2cb_15)(planner *);
+extern void X(codelet_r2cb_16)(planner *);
+extern void X(codelet_r2cb_32)(planner *);
+extern void X(codelet_r2cb_64)(planner *);
+extern void X(codelet_r2cb_128)(planner *);
+extern void X(codelet_r2cb_20)(planner *);
+extern void X(codelet_r2cb_25)(planner *);
+extern void X(codelet_hb_2)(planner *);
+extern void X(codelet_hb_3)(planner *);
+extern void X(codelet_hb_4)(planner *);
+extern void X(codelet_hb_5)(planner *);
+extern void X(codelet_hb_6)(planner *);
+extern void X(codelet_hb_7)(planner *);
+extern void X(codelet_hb_8)(planner *);
+extern void X(codelet_hb_9)(planner *);
+extern void X(codelet_hb_10)(planner *);
+extern void X(codelet_hb_12)(planner *);
+extern void X(codelet_hb_15)(planner *);
+extern void X(codelet_hb_16)(planner *);
+extern void X(codelet_hb_32)(planner *);
+extern void X(codelet_hb_64)(planner *);
+extern void X(codelet_hb_20)(planner *);
+extern void X(codelet_hb_25)(planner *);
+extern void X(codelet_hb2_4)(planner *);
+extern void X(codelet_hb2_8)(planner *);
+extern void X(codelet_hb2_16)(planner *);
+extern void X(codelet_hb2_32)(planner *);
+extern void X(codelet_hb2_5)(planner *);
+extern void X(codelet_hb2_20)(planner *);
+extern void X(codelet_hb2_25)(planner *);
+extern void X(codelet_r2cbIII_2)(planner *);
+extern void X(codelet_r2cbIII_3)(planner *);
+extern void X(codelet_r2cbIII_4)(planner *);
+extern void X(codelet_r2cbIII_5)(planner *);
+extern void X(codelet_r2cbIII_6)(planner *);
+extern void X(codelet_r2cbIII_7)(planner *);
+extern void X(codelet_r2cbIII_8)(planner *);
+extern void X(codelet_r2cbIII_9)(planner *);
+extern void X(codelet_r2cbIII_10)(planner *);
+extern void X(codelet_r2cbIII_12)(planner *);
+extern void X(codelet_r2cbIII_15)(planner *);
+extern void X(codelet_r2cbIII_16)(planner *);
+extern void X(codelet_r2cbIII_32)(planner *);
+extern void X(codelet_r2cbIII_64)(planner *);
+extern void X(codelet_r2cbIII_20)(planner *);
+extern void X(codelet_r2cbIII_25)(planner *);
+extern void X(codelet_hc2cb_2)(planner *);
+extern void X(codelet_hc2cb_4)(planner *);
+extern void X(codelet_hc2cb_6)(planner *);
+extern void X(codelet_hc2cb_8)(planner *);
+extern void X(codelet_hc2cb_10)(planner *);
+extern void X(codelet_hc2cb_12)(planner *);
+extern void X(codelet_hc2cb_16)(planner *);
+extern void X(codelet_hc2cb_32)(planner *);
+extern void X(codelet_hc2cb_20)(planner *);
+extern void X(codelet_hc2cb2_4)(planner *);
+extern void X(codelet_hc2cb2_8)(planner *);
+extern void X(codelet_hc2cb2_16)(planner *);
+extern void X(codelet_hc2cb2_32)(planner *);
+extern void X(codelet_hc2cb2_20)(planner *);
+extern void X(codelet_hc2cbdft_2)(planner *);
+extern void X(codelet_hc2cbdft_4)(planner *);
+extern void X(codelet_hc2cbdft_6)(planner *);
+extern void X(codelet_hc2cbdft_8)(planner *);
+extern void X(codelet_hc2cbdft_10)(planner *);
+extern void X(codelet_hc2cbdft_12)(planner *);
+extern void X(codelet_hc2cbdft_16)(planner *);
+extern void X(codelet_hc2cbdft_32)(planner *);
+extern void X(codelet_hc2cbdft_20)(planner *);
+extern void X(codelet_hc2cbdft2_4)(planner *);
+extern void X(codelet_hc2cbdft2_8)(planner *);
+extern void X(codelet_hc2cbdft2_16)(planner *);
+extern void X(codelet_hc2cbdft2_32)(planner *);
+extern void X(codelet_hc2cbdft2_20)(planner *);
+
+
+extern const solvtab X(solvtab_rdft_r2cb);
+const solvtab X(solvtab_rdft_r2cb) = {
+   SOLVTAB(X(codelet_r2cb_2)),
+   SOLVTAB(X(codelet_r2cb_3)),
+   SOLVTAB(X(codelet_r2cb_4)),
+   SOLVTAB(X(codelet_r2cb_5)),
+   SOLVTAB(X(codelet_r2cb_6)),
+   SOLVTAB(X(codelet_r2cb_7)),
+   SOLVTAB(X(codelet_r2cb_8)),
+   SOLVTAB(X(codelet_r2cb_9)),
+   SOLVTAB(X(codelet_r2cb_10)),
+   SOLVTAB(X(codelet_r2cb_11)),
+   SOLVTAB(X(codelet_r2cb_12)),
+   SOLVTAB(X(codelet_r2cb_13)),
+   SOLVTAB(X(codelet_r2cb_14)),
+   SOLVTAB(X(codelet_r2cb_15)),
+   SOLVTAB(X(codelet_r2cb_16)),
+   SOLVTAB(X(codelet_r2cb_32)),
+   SOLVTAB(X(codelet_r2cb_64)),
+   SOLVTAB(X(codelet_r2cb_128)),
+   SOLVTAB(X(codelet_r2cb_20)),
+   SOLVTAB(X(codelet_r2cb_25)),
+   SOLVTAB(X(codelet_hb_2)),
+   SOLVTAB(X(codelet_hb_3)),
+   SOLVTAB(X(codelet_hb_4)),
+   SOLVTAB(X(codelet_hb_5)),
+   SOLVTAB(X(codelet_hb_6)),
+   SOLVTAB(X(codelet_hb_7)),
+   SOLVTAB(X(codelet_hb_8)),
+   SOLVTAB(X(codelet_hb_9)),
+   SOLVTAB(X(codelet_hb_10)),
+   SOLVTAB(X(codelet_hb_12)),
+   SOLVTAB(X(codelet_hb_15)),
+   SOLVTAB(X(codelet_hb_16)),
+   SOLVTAB(X(codelet_hb_32)),
+   SOLVTAB(X(codelet_hb_64)),
+   SOLVTAB(X(codelet_hb_20)),
+   SOLVTAB(X(codelet_hb_25)),
+   SOLVTAB(X(codelet_hb2_4)),
+   SOLVTAB(X(codelet_hb2_8)),
+   SOLVTAB(X(codelet_hb2_16)),
+   SOLVTAB(X(codelet_hb2_32)),
+   SOLVTAB(X(codelet_hb2_5)),
+   SOLVTAB(X(codelet_hb2_20)),
+   SOLVTAB(X(codelet_hb2_25)),
+   SOLVTAB(X(codelet_r2cbIII_2)),
+   SOLVTAB(X(codelet_r2cbIII_3)),
+   SOLVTAB(X(codelet_r2cbIII_4)),
+   SOLVTAB(X(codelet_r2cbIII_5)),
+   SOLVTAB(X(codelet_r2cbIII_6)),
+   SOLVTAB(X(codelet_r2cbIII_7)),
+   SOLVTAB(X(codelet_r2cbIII_8)),
+   SOLVTAB(X(codelet_r2cbIII_9)),
+   SOLVTAB(X(codelet_r2cbIII_10)),
+   SOLVTAB(X(codelet_r2cbIII_12)),
+   SOLVTAB(X(codelet_r2cbIII_15)),
+   SOLVTAB(X(codelet_r2cbIII_16)),
+   SOLVTAB(X(codelet_r2cbIII_32)),
+   SOLVTAB(X(codelet_r2cbIII_64)),
+   SOLVTAB(X(codelet_r2cbIII_20)),
+   SOLVTAB(X(codelet_r2cbIII_25)),
+   SOLVTAB(X(codelet_hc2cb_2)),
+   SOLVTAB(X(codelet_hc2cb_4)),
+   SOLVTAB(X(codelet_hc2cb_6)),
+   SOLVTAB(X(codelet_hc2cb_8)),
+   SOLVTAB(X(codelet_hc2cb_10)),
+   SOLVTAB(X(codelet_hc2cb_12)),
+   SOLVTAB(X(codelet_hc2cb_16)),
+   SOLVTAB(X(codelet_hc2cb_32)),
+   SOLVTAB(X(codelet_hc2cb_20)),
+   SOLVTAB(X(codelet_hc2cb2_4)),
+   SOLVTAB(X(codelet_hc2cb2_8)),
+   SOLVTAB(X(codelet_hc2cb2_16)),
+   SOLVTAB(X(codelet_hc2cb2_32)),
+   SOLVTAB(X(codelet_hc2cb2_20)),
+   SOLVTAB(X(codelet_hc2cbdft_2)),
+   SOLVTAB(X(codelet_hc2cbdft_4)),
+   SOLVTAB(X(codelet_hc2cbdft_6)),
+   SOLVTAB(X(codelet_hc2cbdft_8)),
+   SOLVTAB(X(codelet_hc2cbdft_10)),
+   SOLVTAB(X(codelet_hc2cbdft_12)),
+   SOLVTAB(X(codelet_hc2cbdft_16)),
+   SOLVTAB(X(codelet_hc2cbdft_32)),
+   SOLVTAB(X(codelet_hc2cbdft_20)),
+   SOLVTAB(X(codelet_hc2cbdft2_4)),
+   SOLVTAB(X(codelet_hc2cbdft2_8)),
+   SOLVTAB(X(codelet_hc2cbdft2_16)),
+   SOLVTAB(X(codelet_hc2cbdft2_32)),
+   SOLVTAB(X(codelet_hc2cbdft2_20)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:24 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include hb.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 114 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tv, TB, TF, Ty, T1J, T1O, T1N, T1K;
+	       {
+		    E Tw, T2z, T2C, Tx, T3f, T3l, T2F, T3r, Tz;
+		    Tv = W[0];
+		    Tw = W[2];
+		    T2z = W[6];
+		    T2C = W[7];
+		    TB = W[4];
+		    Tx = Tv * Tw;
+		    T3f = Tv * T2z;
+		    T3l = Tv * T2C;
+		    T2F = Tv * TB;
+		    T3r = Tw * TB;
+		    TF = W[5];
+		    Ty = W[1];
+		    Tz = W[3];
+		    {
+			 E T2G, T3z, T3m, T3g, T3L, T3s, T1V, TA, T3w, T3Q, T30, T3C, TE, T1X, T1D;
+			 E TG, T1G, T1o, T2p, T1Y, T2u, T2c, T1Z, TL, T1t, T2d, T35, T3n, T3R, T3F;
+			 E T20, T1w, T3M, Tf, T3h, T2L, T2e, TW, T3N, T3I, T2Q, T36, T2V, T37, T1d;
+			 E Tu, T3S, T18, T1z, T1i, T24, T2g, T27, T2h, TQ, TV;
+			 {
+			      E TH, T3, T2I, TU, T32, T1s, T1p, T6, TM, Ta, Tb, T33, TK, T2J, TP;
+			      E Tc, T4, T5;
+			      {
+				   E TS, TT, T1q, T1r;
+				   {
+					E T1, T1n, TC, T2b, T1W, T2, T3v, T2Z, TD;
+					T1 = cr[0];
+					T3v = Tw * TF;
+					T2Z = Tv * TF;
+					T2G = FNMS(Ty, TF, T2F);
+					T3z = FMA(Ty, TF, T2F);
+					T3m = FNMS(Ty, T2z, T3l);
+					T3g = FMA(Ty, T2C, T3f);
+					T3L = FNMS(Tz, TF, T3r);
+					T3s = FMA(Tz, TF, T3r);
+					T1V = FMA(Ty, Tz, Tx);
+					TA = FNMS(Ty, Tz, Tx);
+					TD = Tv * Tz;
+					T3w = FNMS(Tz, TB, T3v);
+					T3Q = FMA(Tz, TB, T3v);
+					T30 = FMA(Ty, TB, T2Z);
+					T3C = FNMS(Ty, TB, T2Z);
+					T1n = TA * TF;
+					TC = TA * TB;
+					T2b = T1V * TF;
+					T1W = T1V * TB;
+					TE = FMA(Ty, Tw, TD);
+					T1X = FNMS(Ty, Tw, TD);
+					T2 = ci[WS(rs, 7)];
+					TS = ci[WS(rs, 9)];
+					T1D = FMA(TE, TF, TC);
+					TG = FNMS(TE, TF, TC);
+					T1G = FNMS(TE, TB, T1n);
+					T1o = FMA(TE, TB, T1n);
+					T2p = FMA(T1X, TF, T1W);
+					T1Y = FNMS(T1X, TF, T1W);
+					T2u = FNMS(T1X, TB, T2b);
+					T2c = FMA(T1X, TB, T2b);
+					TH = T1 - T2;
+					T3 = T1 + T2;
+					TT = cr[WS(rs, 14)];
+				   }
+				   T1q = ci[WS(rs, 15)];
+				   T1r = cr[WS(rs, 8)];
+				   T4 = cr[WS(rs, 4)];
+				   T2I = TS - TT;
+				   TU = TS + TT;
+				   T32 = T1q - T1r;
+				   T1s = T1q + T1r;
+				   T5 = ci[WS(rs, 3)];
+			      }
+			      {
+				   E TI, TJ, T8, T9, TN, TO;
+				   T8 = cr[WS(rs, 2)];
+				   T9 = ci[WS(rs, 5)];
+				   TI = ci[WS(rs, 11)];
+				   T1p = T4 - T5;
+				   T6 = T4 + T5;
+				   TM = T8 - T9;
+				   Ta = T8 + T9;
+				   TJ = cr[WS(rs, 12)];
+				   TN = ci[WS(rs, 13)];
+				   TO = cr[WS(rs, 10)];
+				   Tb = ci[WS(rs, 1)];
+				   T33 = TI - TJ;
+				   TK = TI + TJ;
+				   T2J = TN - TO;
+				   TP = TN + TO;
+				   Tc = cr[WS(rs, 6)];
+			      }
+			      {
+				   E TR, Td, T3D, T34;
+				   T1Z = TH + TK;
+				   TL = TH - TK;
+				   T1t = T1p + T1s;
+				   T2d = T1s - T1p;
+				   TR = Tb - Tc;
+				   Td = Tb + Tc;
+				   T3D = T32 + T33;
+				   T34 = T32 - T33;
+				   {
+					E Te, T2K, T1u, T1v, T31, T3E, T2H, T7;
+					Te = Ta + Td;
+					T31 = Ta - Td;
+					T3E = T2J + T2I;
+					T2K = T2I - T2J;
+					TQ = TM - TP;
+					T1u = TM + TP;
+					T1v = TR + TU;
+					TV = TR - TU;
+					T35 = T31 + T34;
+					T3n = T34 - T31;
+					T3R = T3D - T3E;
+					T3F = T3D + T3E;
+					T2H = T3 - T6;
+					T7 = T3 + T6;
+					T20 = T1u + T1v;
+					T1w = T1u - T1v;
+					T3M = T7 - Te;
+					Tf = T7 + Te;
+					T3h = T2H - T2K;
+					T2L = T2H + T2K;
+				   }
+			      }
+			 }
+			 {
+			      E T1e, Ti, T2N, T1c, T2O, T1h, T19, Tl, T13, Tp, Tq, T2S, T11, T2T, T16;
+			      E Tr, Tj, Tk, Tm, TY, Tt;
+			      {
+				   E T1a, T1b, Tg, Th, T1f, T1g;
+				   Tg = cr[WS(rs, 1)];
+				   Th = ci[WS(rs, 6)];
+				   T1a = ci[WS(rs, 14)];
+				   T2e = TQ - TV;
+				   TW = TQ + TV;
+				   T1e = Tg - Th;
+				   Ti = Tg + Th;
+				   T1b = cr[WS(rs, 9)];
+				   T1f = ci[WS(rs, 10)];
+				   T1g = cr[WS(rs, 13)];
+				   Tj = cr[WS(rs, 5)];
+				   T2N = T1a - T1b;
+				   T1c = T1a + T1b;
+				   T2O = T1f - T1g;
+				   T1h = T1f + T1g;
+				   Tk = ci[WS(rs, 2)];
+			      }
+			      {
+				   E TZ, T10, Tn, To, T14, T15;
+				   Tn = ci[0];
+				   To = cr[WS(rs, 7)];
+				   TZ = ci[WS(rs, 8)];
+				   T19 = Tj - Tk;
+				   Tl = Tj + Tk;
+				   T13 = Tn - To;
+				   Tp = Tn + To;
+				   T10 = cr[WS(rs, 15)];
+				   T14 = ci[WS(rs, 12)];
+				   T15 = cr[WS(rs, 11)];
+				   Tq = cr[WS(rs, 3)];
+				   T2S = TZ - T10;
+				   T11 = TZ + T10;
+				   T2T = T14 - T15;
+				   T16 = T14 + T15;
+				   Tr = ci[WS(rs, 4)];
+			      }
+			      {
+				   E T2P, T2U, T2M, Ts, T3G, T3H, T2R;
+				   T2P = T2N - T2O;
+				   T3G = T2N + T2O;
+				   T3H = T2S + T2T;
+				   T2U = T2S - T2T;
+				   Tm = Ti + Tl;
+				   T2M = Ti - Tl;
+				   TY = Tq - Tr;
+				   Ts = Tq + Tr;
+				   T3N = T3H - T3G;
+				   T3I = T3G + T3H;
+				   Tt = Tp + Ts;
+				   T2R = Tp - Ts;
+				   T2Q = T2M - T2P;
+				   T36 = T2M + T2P;
+				   T2V = T2R + T2U;
+				   T37 = T2U - T2R;
+			      }
+			      {
+				   E T25, T26, T22, T23, T12, T17;
+				   T12 = TY - T11;
+				   T25 = TY + T11;
+				   T26 = T13 + T16;
+				   T17 = T13 - T16;
+				   T22 = T1c - T19;
+				   T1d = T19 + T1c;
+				   Tu = Tm + Tt;
+				   T3S = Tm - Tt;
+				   T18 = FNMS(KP414213562, T17, T12);
+				   T1z = FMA(KP414213562, T12, T17);
+				   T1i = T1e - T1h;
+				   T23 = T1e + T1h;
+				   T24 = FNMS(KP414213562, T23, T22);
+				   T2g = FMA(KP414213562, T22, T23);
+				   T27 = FNMS(KP414213562, T26, T25);
+				   T2h = FMA(KP414213562, T25, T26);
+			      }
+			 }
+			 {
+			      E T1j, T1y, T3V, T3X, T3W, T38, T3i, T3o, T2W, T3K, T3B, T3A;
+			      cr[0] = Tf + Tu;
+			      T3A = Tf - Tu;
+			      T1j = FMA(KP414213562, T1i, T1d);
+			      T1y = FNMS(KP414213562, T1d, T1i);
+			      T3K = T3C * T3A;
+			      T3B = T3z * T3A;
+			      {
+				   E T3O, T3T, T3J, T3P, T3U;
+				   T3O = T3M - T3N;
+				   T3V = T3M + T3N;
+				   T3X = T3S + T3R;
+				   T3T = T3R - T3S;
+				   ci[0] = T3F + T3I;
+				   T3J = T3F - T3I;
+				   T3P = T3L * T3O;
+				   T3U = T3L * T3T;
+				   T3W = TA * T3V;
+				   cr[WS(rs, 8)] = FNMS(T3C, T3J, T3B);
+				   ci[WS(rs, 8)] = FMA(T3z, T3J, T3K);
+				   cr[WS(rs, 12)] = FNMS(T3Q, T3T, T3P);
+				   ci[WS(rs, 12)] = FMA(T3Q, T3O, T3U);
+				   T38 = T36 + T37;
+				   T3i = T37 - T36;
+				   T3o = T2Q - T2V;
+				   T2W = T2Q + T2V;
+			      }
+			      {
+				   E T2q, T21, T28, T2w, T2v, T2f, T2i, T2r;
+				   {
+					E T2Y, T3a, T3c, T3d, T39, T3e, T3b, T2X, T3Y;
+					cr[WS(rs, 4)] = FNMS(TE, T3X, T3W);
+					T3Y = TA * T3X;
+					{
+					     E T3t, T3j, T3x, T3p;
+					     T3t = FMA(KP707106781, T3i, T3h);
+					     T3j = FNMS(KP707106781, T3i, T3h);
+					     T3x = FMA(KP707106781, T3o, T3n);
+					     T3p = FNMS(KP707106781, T3o, T3n);
+					     ci[WS(rs, 4)] = FMA(TE, T3V, T3Y);
+					     {
+						  E T3u, T3k, T3y, T3q;
+						  T3u = T3s * T3t;
+						  T3k = T3g * T3j;
+						  T3y = T3s * T3x;
+						  T3q = T3g * T3p;
+						  cr[WS(rs, 6)] = FNMS(T3w, T3x, T3u);
+						  cr[WS(rs, 14)] = FNMS(T3m, T3p, T3k);
+						  ci[WS(rs, 6)] = FMA(T3w, T3t, T3y);
+						  ci[WS(rs, 14)] = FMA(T3m, T3j, T3q);
+						  T3b = FMA(KP707106781, T2W, T2L);
+						  T2X = FNMS(KP707106781, T2W, T2L);
+					     }
+					}
+					T2Y = T2G * T2X;
+					T3a = T30 * T2X;
+					T3c = T1V * T3b;
+					T3d = FMA(KP707106781, T38, T35);
+					T39 = FNMS(KP707106781, T38, T35);
+					T3e = T1X * T3b;
+					T2q = FMA(KP707106781, T20, T1Z);
+					T21 = FNMS(KP707106781, T20, T1Z);
+					cr[WS(rs, 2)] = FNMS(T1X, T3d, T3c);
+					ci[WS(rs, 10)] = FMA(T2G, T39, T3a);
+					cr[WS(rs, 10)] = FNMS(T30, T39, T2Y);
+					ci[WS(rs, 2)] = FMA(T1V, T3d, T3e);
+					T28 = T24 + T27;
+					T2w = T27 - T24;
+					T2v = FNMS(KP707106781, T2e, T2d);
+					T2f = FMA(KP707106781, T2e, T2d);
+					T2i = T2g - T2h;
+					T2r = T2g + T2h;
+				   }
+				   {
+					E TX, T1k, T1x, T1A;
+					T1J = FMA(KP707106781, TW, TL);
+					TX = FNMS(KP707106781, TW, TL);
+					{
+					     E T2l, T29, T2n, T2j;
+					     T2l = FNMS(KP923879532, T28, T21);
+					     T29 = FMA(KP923879532, T28, T21);
+					     T2n = FMA(KP923879532, T2i, T2f);
+					     T2j = FNMS(KP923879532, T2i, T2f);
+					     {
+						  E T2o, T2m, T2k, T2a;
+						  T2o = Tz * T2l;
+						  T2m = Tw * T2l;
+						  T2k = T2c * T29;
+						  T2a = T1Y * T29;
+						  ci[WS(rs, 3)] = FMA(Tw, T2n, T2o);
+						  cr[WS(rs, 3)] = FNMS(Tz, T2n, T2m);
+						  ci[WS(rs, 11)] = FMA(T1Y, T2j, T2k);
+						  cr[WS(rs, 11)] = FNMS(T2c, T2j, T2a);
+						  T1k = T18 - T1j;
+						  T1O = T1j + T18;
+					     }
+					}
+					T1N = FMA(KP707106781, T1w, T1t);
+					T1x = FNMS(KP707106781, T1w, T1t);
+					T1A = T1y - T1z;
+					T1K = T1y + T1z;
+					{
+					     E T1E, T1l, T1H, T1B;
+					     T1E = FMA(KP923879532, T1k, TX);
+					     T1l = FNMS(KP923879532, T1k, TX);
+					     T1H = FMA(KP923879532, T1A, T1x);
+					     T1B = FNMS(KP923879532, T1A, T1x);
+					     {
+						  E T1I, T1F, T1C, T1m;
+						  T1I = T1G * T1E;
+						  T1F = T1D * T1E;
+						  T1C = T1o * T1l;
+						  T1m = TG * T1l;
+						  ci[WS(rs, 5)] = FMA(T1D, T1H, T1I);
+						  cr[WS(rs, 5)] = FNMS(T1G, T1H, T1F);
+						  ci[WS(rs, 13)] = FMA(TG, T1B, T1C);
+						  cr[WS(rs, 13)] = FNMS(T1o, T1B, T1m);
+					     }
+					}
+					{
+					     E T2A, T2s, T2D, T2x;
+					     T2A = FMA(KP923879532, T2r, T2q);
+					     T2s = FNMS(KP923879532, T2r, T2q);
+					     T2D = FNMS(KP923879532, T2w, T2v);
+					     T2x = FMA(KP923879532, T2w, T2v);
+					     {
+						  E T2B, T2t, T2E, T2y;
+						  T2B = T2z * T2A;
+						  T2t = T2p * T2s;
+						  T2E = T2z * T2D;
+						  T2y = T2p * T2x;
+						  cr[WS(rs, 15)] = FNMS(T2C, T2D, T2B);
+						  cr[WS(rs, 7)] = FNMS(T2u, T2x, T2t);
+						  ci[WS(rs, 15)] = FMA(T2C, T2A, T2E);
+						  ci[WS(rs, 7)] = FMA(T2u, T2s, T2y);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T1L, T1R, T1P, T1T;
+		    T1L = FNMS(KP923879532, T1K, T1J);
+		    T1R = FMA(KP923879532, T1K, T1J);
+		    T1P = FNMS(KP923879532, T1O, T1N);
+		    T1T = FMA(KP923879532, T1O, T1N);
+		    {
+			 E T1S, T1M, T1U, T1Q;
+			 T1S = Tv * T1R;
+			 T1M = TB * T1L;
+			 T1U = Tv * T1T;
+			 T1Q = TB * T1P;
+			 cr[WS(rs, 1)] = FNMS(Ty, T1T, T1S);
+			 cr[WS(rs, 9)] = FNMS(TF, T1P, T1M);
+			 ci[WS(rs, 1)] = FMA(Ty, T1R, T1U);
+			 ci[WS(rs, 9)] = FMA(TF, T1L, T1Q);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
+
+void X(codelet_hb2_16) (planner *p) {
+     X(khc2hc_register) (p, hb2_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include hb.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 80 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
+	       E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
+	       {
+		    E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
+		    {
+			 E T1m, T1s, T1o, T1r;
+			 Tv = W[0];
+			 Ty = W[1];
+			 T1l = W[2];
+			 T1n = W[3];
+			 T1m = Tv * T1l;
+			 T1s = Ty * T1l;
+			 T1o = Ty * T1n;
+			 T1r = Tv * T1n;
+			 T1p = T1m + T1o;
+			 T1t = T1r - T1s;
+			 T27 = T1r + T1s;
+			 T25 = T1m - T1o;
+			 Tz = W[5];
+			 TA = Ty * Tz;
+			 T1J = T1l * Tz;
+			 T15 = Tv * Tz;
+			 T1G = T1n * Tz;
+			 Tw = W[4];
+			 Tx = Tv * Tw;
+			 T1K = T1n * Tw;
+			 T16 = Ty * Tw;
+			 T1F = T1l * Tw;
+		    }
+		    TB = Tx - TA;
+		    T21 = T1J + T1K;
+		    T1P = T15 - T16;
+		    T1H = T1F + T1G;
+		    T1X = T1F - T1G;
+		    T17 = T15 + T16;
+		    T1L = T1J - T1K;
+		    T1N = Tx + TA;
+		    T1v = W[6];
+		    T1w = W[7];
+		    T1x = FMA(Tv, T1v, Ty * T1w);
+		    T1B = FNMS(Ty, T1v, Tv * T1w);
+		    {
+			 E T2D, T2E, T29, T2a;
+			 T2D = T25 * Tz;
+			 T2E = T27 * Tw;
+			 T2F = T2D + T2E;
+			 T2T = T2D - T2E;
+			 T29 = T25 * Tw;
+			 T2a = T27 * Tz;
+			 T2b = T29 - T2a;
+			 T2R = T29 + T2a;
+		    }
+		    {
+			 E T3h, T3i, T33, T34;
+			 T3h = T1p * Tz;
+			 T3i = T1t * Tw;
+			 T3j = T3h + T3i;
+			 T3x = T3h - T3i;
+			 T33 = T1p * Tw;
+			 T34 = T1t * Tz;
+			 T35 = T33 - T34;
+			 T3t = T33 + T34;
+		    }
+	       }
+	       {
+		    E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
+		    E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
+		    E T3e, T3o;
+		    {
+			 E T3, T2c, T1e, T2d, T6, T2G, T1b, T2H;
+			 {
+			      E T1, T2, T1c, T1d;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 7)];
+			      T3 = T1 + T2;
+			      T2c = T1 - T2;
+			      T1c = ci[WS(rs, 11)];
+			      T1d = cr[WS(rs, 12)];
+			      T1e = T1c - T1d;
+			      T2d = T1c + T1d;
+			 }
+			 {
+			      E T4, T5, T19, T1a;
+			      T4 = cr[WS(rs, 4)];
+			      T5 = ci[WS(rs, 3)];
+			      T6 = T4 + T5;
+			      T2G = T4 - T5;
+			      T19 = ci[WS(rs, 15)];
+			      T1a = cr[WS(rs, 8)];
+			      T1b = T19 - T1a;
+			      T2H = T19 + T1a;
+			 }
+			 T7 = T3 + T6;
+			 T36 = T2c + T2d;
+			 T3k = T2H - T2G;
+			 TC = T3 - T6;
+			 T1f = T1b - T1e;
+			 T2e = T2c - T2d;
+			 T2I = T2G + T2H;
+			 T1Q = T1b + T1e;
+		    }
+		    {
+			 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
+			 {
+			      E T8, T9, TG, TH;
+			      T8 = cr[WS(rs, 2)];
+			      T9 = ci[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T2f = T8 - T9;
+			      TG = ci[WS(rs, 13)];
+			      TH = cr[WS(rs, 10)];
+			      TI = TG - TH;
+			      T2g = TG + TH;
+			 }
+			 {
+			      E Tb, Tc, TD, TE;
+			      Tb = ci[WS(rs, 1)];
+			      Tc = cr[WS(rs, 6)];
+			      Td = Tb + Tc;
+			      T2i = Tb - Tc;
+			      TD = ci[WS(rs, 9)];
+			      TE = cr[WS(rs, 14)];
+			      TF = TD - TE;
+			      T2j = TD + TE;
+			 }
+			 Te = Ta + Td;
+			 TJ = TF - TI;
+			 T1R = TI + TF;
+			 T18 = Ta - Td;
+			 {
+			      E T2J, T2K, T2h, T2k;
+			      T2J = T2f + T2g;
+			      T2K = T2i + T2j;
+			      T2L = KP707106781 * (T2J - T2K);
+			      T37 = KP707106781 * (T2J + T2K);
+			      T2h = T2f - T2g;
+			      T2k = T2i - T2j;
+			      T2l = KP707106781 * (T2h + T2k);
+			      T3l = KP707106781 * (T2h - T2k);
+			 }
+		    }
+		    {
+			 E Ti, T2x, TR, T2y, Tl, T2u, TO, T2v, TL, TS;
+			 {
+			      E Tg, Th, TP, TQ;
+			      Tg = cr[WS(rs, 1)];
+			      Th = ci[WS(rs, 6)];
+			      Ti = Tg + Th;
+			      T2x = Tg - Th;
+			      TP = ci[WS(rs, 10)];
+			      TQ = cr[WS(rs, 13)];
+			      TR = TP - TQ;
+			      T2y = TP + TQ;
+			 }
+			 {
+			      E Tj, Tk, TM, TN;
+			      Tj = cr[WS(rs, 5)];
+			      Tk = ci[WS(rs, 2)];
+			      Tl = Tj + Tk;
+			      T2u = Tj - Tk;
+			      TM = ci[WS(rs, 14)];
+			      TN = cr[WS(rs, 9)];
+			      TO = TM - TN;
+			      T2v = TM + TN;
+			 }
+			 Tm = Ti + Tl;
+			 T1T = TO + TR;
+			 TL = Ti - Tl;
+			 TS = TO - TR;
+			 TT = TL - TS;
+			 T1h = TL + TS;
+			 {
+			      E T2w, T2z, T39, T3a;
+			      T2w = T2u + T2v;
+			      T2z = T2x - T2y;
+			      T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
+			      T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
+			      T39 = T2x + T2y;
+			      T3a = T2v - T2u;
+			      T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
+			      T3n = FMA(KP382683432, T3a, KP923879532 * T39);
+			 }
+		    }
+		    {
+			 E Tp, T2q, T10, T2r, Ts, T2n, TX, T2o, TU, T11;
+			 {
+			      E Tn, To, TY, TZ;
+			      Tn = ci[0];
+			      To = cr[WS(rs, 7)];
+			      Tp = Tn + To;
+			      T2q = Tn - To;
+			      TY = ci[WS(rs, 12)];
+			      TZ = cr[WS(rs, 11)];
+			      T10 = TY - TZ;
+			      T2r = TY + TZ;
+			 }
+			 {
+			      E Tq, Tr, TV, TW;
+			      Tq = cr[WS(rs, 3)];
+			      Tr = ci[WS(rs, 4)];
+			      Ts = Tq + Tr;
+			      T2n = Tq - Tr;
+			      TV = ci[WS(rs, 8)];
+			      TW = cr[WS(rs, 15)];
+			      TX = TV - TW;
+			      T2o = TV + TW;
+			 }
+			 Tt = Tp + Ts;
+			 T1U = TX + T10;
+			 TU = Tp - Ts;
+			 T11 = TX - T10;
+			 T12 = TU + T11;
+			 T1i = T11 - TU;
+			 {
+			      E T2p, T2s, T3c, T3d;
+			      T2p = T2n - T2o;
+			      T2s = T2q - T2r;
+			      T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
+			      T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
+			      T3c = T2q + T2r;
+			      T3d = T2n + T2o;
+			      T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
+			      T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
+			 }
+		    }
+		    {
+			 E Tf, Tu, T1O, T1S, T1V, T1W;
+			 Tf = T7 + Te;
+			 Tu = Tm + Tt;
+			 T1O = Tf - Tu;
+			 T1S = T1Q + T1R;
+			 T1V = T1T + T1U;
+			 T1W = T1S - T1V;
+			 cr[0] = Tf + Tu;
+			 ci[0] = T1S + T1V;
+			 cr[WS(rs, 8)] = FNMS(T1P, T1W, T1N * T1O);
+			 ci[WS(rs, 8)] = FMA(T1P, T1O, T1N * T1W);
+		    }
+		    {
+			 E T3g, T3r, T3q, T3s;
+			 {
+			      E T38, T3f, T3m, T3p;
+			      T38 = T36 - T37;
+			      T3f = T3b + T3e;
+			      T3g = T38 - T3f;
+			      T3r = T38 + T3f;
+			      T3m = T3k + T3l;
+			      T3p = T3n - T3o;
+			      T3q = T3m - T3p;
+			      T3s = T3m + T3p;
+			 }
+			 cr[WS(rs, 11)] = FNMS(T3j, T3q, T35 * T3g);
+			 ci[WS(rs, 11)] = FMA(T3j, T3g, T35 * T3q);
+			 cr[WS(rs, 3)] = FNMS(T1n, T3s, T1l * T3r);
+			 ci[WS(rs, 3)] = FMA(T1n, T3r, T1l * T3s);
+		    }
+		    {
+			 E T3w, T3B, T3A, T3C;
+			 {
+			      E T3u, T3v, T3y, T3z;
+			      T3u = T36 + T37;
+			      T3v = T3n + T3o;
+			      T3w = T3u - T3v;
+			      T3B = T3u + T3v;
+			      T3y = T3k - T3l;
+			      T3z = T3b - T3e;
+			      T3A = T3y + T3z;
+			      T3C = T3y - T3z;
+			 }
+			 cr[WS(rs, 7)] = FNMS(T3x, T3A, T3t * T3w);
+			 ci[WS(rs, 7)] = FMA(T3t, T3A, T3x * T3w);
+			 cr[WS(rs, 15)] = FNMS(T1w, T3C, T1v * T3B);
+			 ci[WS(rs, 15)] = FMA(T1v, T3C, T1w * T3B);
+		    }
+		    {
+			 E T14, T1q, T1k, T1u;
+			 {
+			      E TK, T13, T1g, T1j;
+			      TK = TC + TJ;
+			      T13 = KP707106781 * (TT + T12);
+			      T14 = TK - T13;
+			      T1q = TK + T13;
+			      T1g = T18 + T1f;
+			      T1j = KP707106781 * (T1h + T1i);
+			      T1k = T1g - T1j;
+			      T1u = T1g + T1j;
+			 }
+			 cr[WS(rs, 10)] = FNMS(T17, T1k, TB * T14);
+			 ci[WS(rs, 10)] = FMA(T17, T14, TB * T1k);
+			 cr[WS(rs, 2)] = FNMS(T1t, T1u, T1p * T1q);
+			 ci[WS(rs, 2)] = FMA(T1t, T1q, T1p * T1u);
+		    }
+		    {
+			 E T1A, T1I, T1E, T1M;
+			 {
+			      E T1y, T1z, T1C, T1D;
+			      T1y = TC - TJ;
+			      T1z = KP707106781 * (T1i - T1h);
+			      T1A = T1y - T1z;
+			      T1I = T1y + T1z;
+			      T1C = T1f - T18;
+			      T1D = KP707106781 * (TT - T12);
+			      T1E = T1C - T1D;
+			      T1M = T1C + T1D;
+			 }
+			 cr[WS(rs, 14)] = FNMS(T1B, T1E, T1x * T1A);
+			 ci[WS(rs, 14)] = FMA(T1x, T1E, T1B * T1A);
+			 cr[WS(rs, 6)] = FNMS(T1L, T1M, T1H * T1I);
+			 ci[WS(rs, 6)] = FMA(T1H, T1M, T1L * T1I);
+		    }
+		    {
+			 E T2C, T2S, T2Q, T2U;
+			 {
+			      E T2m, T2B, T2M, T2P;
+			      T2m = T2e - T2l;
+			      T2B = T2t - T2A;
+			      T2C = T2m - T2B;
+			      T2S = T2m + T2B;
+			      T2M = T2I - T2L;
+			      T2P = T2N - T2O;
+			      T2Q = T2M - T2P;
+			      T2U = T2M + T2P;
+			 }
+			 cr[WS(rs, 13)] = FNMS(T2F, T2Q, T2b * T2C);
+			 ci[WS(rs, 13)] = FMA(T2F, T2C, T2b * T2Q);
+			 cr[WS(rs, 5)] = FNMS(T2T, T2U, T2R * T2S);
+			 ci[WS(rs, 5)] = FMA(T2T, T2S, T2R * T2U);
+		    }
+		    {
+			 E T2X, T31, T30, T32;
+			 {
+			      E T2V, T2W, T2Y, T2Z;
+			      T2V = T2e + T2l;
+			      T2W = T2N + T2O;
+			      T2X = T2V - T2W;
+			      T31 = T2V + T2W;
+			      T2Y = T2I + T2L;
+			      T2Z = T2A + T2t;
+			      T30 = T2Y - T2Z;
+			      T32 = T2Y + T2Z;
+			 }
+			 cr[WS(rs, 9)] = FNMS(Tz, T30, Tw * T2X);
+			 ci[WS(rs, 9)] = FMA(Tw, T30, Tz * T2X);
+			 cr[WS(rs, 1)] = FNMS(Ty, T32, Tv * T31);
+			 ci[WS(rs, 1)] = FMA(Tv, T32, Ty * T31);
+		    }
+		    {
+			 E T20, T26, T24, T28;
+			 {
+			      E T1Y, T1Z, T22, T23;
+			      T1Y = T7 - Te;
+			      T1Z = T1U - T1T;
+			      T20 = T1Y - T1Z;
+			      T26 = T1Y + T1Z;
+			      T22 = T1Q - T1R;
+			      T23 = Tm - Tt;
+			      T24 = T22 - T23;
+			      T28 = T23 + T22;
+			 }
+			 cr[WS(rs, 12)] = FNMS(T21, T24, T1X * T20);
+			 ci[WS(rs, 12)] = FMA(T1X, T24, T21 * T20);
+			 cr[WS(rs, 4)] = FNMS(T27, T28, T25 * T26);
+			 ci[WS(rs, 4)] = FMA(T25, T28, T27 * T26);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
+
+void X(codelet_hb2_16) (planner *p) {
+     X(khc2hc_register) (p, hb2_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1087 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include hb.h */
+
+/*
+ * This function contains 276 FP additions, 198 FP multiplications,
+ * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
+ * 153 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T1S, T1O, T1s, TI, T24, T1Y, T2g, T2k, TS, TR, T1I, T26, T1o, T20, T1F;
+	       E T25, TT, T1Z;
+	       {
+		    E TD, TH, TE, T1L, T1N, T1X, TG, T1V, T2Y, T2b, T29, T2s, T36, T3e, T31;
+		    E T2o, T3b, T5b, T2c, T2U, T4y, T4u, T2f, T5g, T47, T5p, T4b, T5l;
+		    {
+			 E T1r, TF, T2T, T1M, T1R, T2X, T2r, T4x;
+			 TD = W[0];
+			 TH = W[3];
+			 TE = W[2];
+			 T1L = W[6];
+			 T1N = W[7];
+			 T1r = TD * TH;
+			 TF = TD * TE;
+			 T2T = TE * T1L;
+			 T1M = TD * T1L;
+			 T1R = TD * T1N;
+			 T2X = TE * T1N;
+			 T1X = W[5];
+			 TG = W[1];
+			 T1V = W[4];
+			 T2Y = FNMS(TH, T1L, T2X);
+			 T2r = TD * T1X;
+			 {
+			      E T23, T2n, T1W, T2a;
+			      T23 = TE * T1X;
+			      T1S = FNMS(TG, T1L, T1R);
+			      T1O = FMA(TG, T1N, T1M);
+			      T2b = FMA(TG, TE, T1r);
+			      T1s = FNMS(TG, TE, T1r);
+			      T29 = FNMS(TG, TH, TF);
+			      TI = FMA(TG, TH, TF);
+			      T2n = TD * T1V;
+			      T1W = TE * T1V;
+			      T2s = FMA(TG, T1V, T2r);
+			      T36 = FNMS(TG, T1V, T2r);
+			      T3e = FMA(TH, T1V, T23);
+			      T24 = FNMS(TH, T1V, T23);
+			      T2a = T29 * T1V;
+			      T31 = FMA(TG, T1X, T2n);
+			      T2o = FNMS(TG, T1X, T2n);
+			      T3b = FNMS(TH, T1X, T1W);
+			      T1Y = FMA(TH, T1X, T1W);
+			      T5b = FNMS(T2b, T1X, T2a);
+			      T2c = FMA(T2b, T1X, T2a);
+			      T2U = FMA(TH, T1N, T2T);
+			 }
+			 T4x = T29 * T1N;
+			 {
+			      E T4t, T2d, T2j, T2e;
+			      T4t = T29 * T1L;
+			      T2e = T29 * T1X;
+			      T4y = FNMS(T2b, T1L, T4x);
+			      T4u = FMA(T2b, T1N, T4t);
+			      T2f = FNMS(T2b, T1V, T2e);
+			      T5g = FMA(T2b, T1V, T2e);
+			      T2d = T2c * T1L;
+			      T2j = T2c * T1N;
+			      T47 = TI * T1V;
+			      T2g = FMA(T2f, T1N, T2d);
+			      T2k = FNMS(T2f, T1L, T2j);
+			      T5p = TI * T1N;
+			      T4b = TI * T1X;
+			      T5l = TI * T1L;
+			 }
+		    }
+		    {
+			 E T4f, T48, T4c, T4k, T5m, T5q, T3j, T4B, T7, TJ, T4V, T3V, T1z, T2H, T3x;
+			 E T42, T18, T3q, T43, T1n, T2D, T53, T52, T2A, T1H, T4R, T4X, T4W, T4O, T1G;
+			 E T2O, T3I, T2P, T3P, T2K, T2M, T1C, T1E, TC, T2w, T40, T3Y, T4K, T4I, TQ;
+			 {
+			      E T1y, T3U, T1v, T3T;
+			      {
+				   E T3h, T3, T1t, T3i, T6, T1u;
+				   {
+					E T1w, T1x, T1, T2, T4, T5;
+					T1 = cr[0];
+					T2 = ci[WS(rs, 9)];
+					T1w = ci[WS(rs, 14)];
+					T4f = FNMS(T1s, T1X, T47);
+					T48 = FMA(T1s, T1X, T47);
+					T4c = FNMS(T1s, T1V, T4b);
+					T4k = FMA(T1s, T1V, T4b);
+					T5m = FMA(T1s, T1N, T5l);
+					T5q = FNMS(T1s, T1L, T5p);
+					T3h = T1 - T2;
+					T3 = T1 + T2;
+					T1x = cr[WS(rs, 15)];
+					T4 = cr[WS(rs, 5)];
+					T5 = ci[WS(rs, 4)];
+					T1t = ci[WS(rs, 19)];
+					T3i = T1w + T1x;
+					T1y = T1w - T1x;
+					T3U = T4 - T5;
+					T6 = T4 + T5;
+					T1u = cr[WS(rs, 10)];
+				   }
+				   T3j = T3h + T3i;
+				   T4B = T3h - T3i;
+				   T7 = T3 + T6;
+				   TJ = T3 - T6;
+				   T1v = T1t - T1u;
+				   T3T = T1t + T1u;
+			      }
+			      {
+				   E T3m, T4C, Te, TK, T4M, T3L, T1f, T2y, TO, TA, T4Q, T3H, T3w, T4G, T2C;
+				   E T17, T3p, T4D, Tl, TL, T3O, T4N, T1m, T2z, T3t, T4F, Tt, TN, T3E, T4P;
+				   E T10, T2B;
+				   {
+					E T3u, T13, T3v, T16;
+					{
+					     E T1e, T3K, T1b, T3J;
+					     {
+						  E T3k, Ta, T19, T3l, Td, T1a;
+						  {
+						       E T1c, T1d, T8, T9, Tb, Tc;
+						       T8 = cr[WS(rs, 4)];
+						       T9 = ci[WS(rs, 5)];
+						       T4V = T3U + T3T;
+						       T3V = T3T - T3U;
+						       T1z = T1v - T1y;
+						       T2H = T1v + T1y;
+						       T3k = T8 - T9;
+						       Ta = T8 + T9;
+						       T1c = ci[WS(rs, 10)];
+						       T1d = cr[WS(rs, 19)];
+						       Tb = cr[WS(rs, 9)];
+						       Tc = ci[0];
+						       T19 = ci[WS(rs, 15)];
+						       T3l = T1c + T1d;
+						       T1e = T1c - T1d;
+						       T3K = Tb - Tc;
+						       Td = Tb + Tc;
+						       T1a = cr[WS(rs, 14)];
+						  }
+						  T3m = T3k + T3l;
+						  T4C = T3k - T3l;
+						  Te = Ta + Td;
+						  TK = Ta - Td;
+						  T1b = T19 - T1a;
+						  T3J = T19 + T1a;
+					     }
+					     {
+						  E Tw, T14, T3F, Tz, T3G, T15;
+						  {
+						       E Tx, Ty, Tu, Tv, T11, T12;
+						       Tu = ci[WS(rs, 7)];
+						       Tv = cr[WS(rs, 2)];
+						       T4M = T3K + T3J;
+						       T3L = T3J - T3K;
+						       T1f = T1b - T1e;
+						       T2y = T1b + T1e;
+						       T3u = Tu - Tv;
+						       Tw = Tu + Tv;
+						       Tx = ci[WS(rs, 2)];
+						       Ty = cr[WS(rs, 7)];
+						       T11 = ci[WS(rs, 17)];
+						       T12 = cr[WS(rs, 12)];
+						       T14 = ci[WS(rs, 12)];
+						       T3F = Tx - Ty;
+						       Tz = Tx + Ty;
+						       T3G = T11 + T12;
+						       T13 = T11 - T12;
+						       T15 = cr[WS(rs, 17)];
+						  }
+						  TO = Tw - Tz;
+						  TA = Tw + Tz;
+						  T4Q = T3F - T3G;
+						  T3H = T3F + T3G;
+						  T3v = T14 + T15;
+						  T16 = T14 - T15;
+					     }
+					}
+					{
+					     E Ti, T3n, Th, T3o, T1l, Tj, T1g, T1h;
+					     {
+						  E Tf, Tg, T1j, T1k;
+						  Tf = ci[WS(rs, 3)];
+						  T3w = T3u - T3v;
+						  T4G = T3u + T3v;
+						  T2C = T13 + T16;
+						  T17 = T13 - T16;
+						  Tg = cr[WS(rs, 6)];
+						  T1j = ci[WS(rs, 18)];
+						  T1k = cr[WS(rs, 11)];
+						  Ti = cr[WS(rs, 1)];
+						  T3n = Tf - Tg;
+						  Th = Tf + Tg;
+						  T3o = T1j + T1k;
+						  T1l = T1j - T1k;
+						  Tj = ci[WS(rs, 8)];
+						  T1g = ci[WS(rs, 13)];
+						  T1h = cr[WS(rs, 16)];
+					     }
+					     {
+						  E T3M, Tk, T3N, T1i;
+						  T3p = T3n + T3o;
+						  T4D = T3n - T3o;
+						  T3M = Ti - Tj;
+						  Tk = Ti + Tj;
+						  T3N = T1g + T1h;
+						  T1i = T1g - T1h;
+						  Tl = Th + Tk;
+						  TL = Th - Tk;
+						  T3O = T3M + T3N;
+						  T4N = T3M - T3N;
+						  T1m = T1i - T1l;
+						  T2z = T1i + T1l;
+					     }
+					}
+					{
+					     E Tq, T3r, Tp, T3s, TZ, Tr, TU, TV;
+					     {
+						  E Tn, To, TX, TY;
+						  Tn = cr[WS(rs, 8)];
+						  To = ci[WS(rs, 1)];
+						  TX = ci[WS(rs, 16)];
+						  TY = cr[WS(rs, 13)];
+						  Tq = ci[WS(rs, 6)];
+						  T3r = Tn - To;
+						  Tp = Tn + To;
+						  T3s = TX + TY;
+						  TZ = TX - TY;
+						  Tr = cr[WS(rs, 3)];
+						  TU = ci[WS(rs, 11)];
+						  TV = cr[WS(rs, 18)];
+					     }
+					     {
+						  E T3D, Ts, T3C, TW;
+						  T3t = T3r - T3s;
+						  T4F = T3r + T3s;
+						  T3D = Tq - Tr;
+						  Ts = Tq + Tr;
+						  T3C = TU + TV;
+						  TW = TU - TV;
+						  Tt = Tp + Ts;
+						  TN = Tp - Ts;
+						  T3E = T3C - T3D;
+						  T4P = T3D + T3C;
+						  T10 = TW - TZ;
+						  T2B = TW + TZ;
+					     }
+					}
+				   }
+				   {
+					E T1B, T1A, T2J, T4H, T4E, T2I, TM, TP;
+					T3x = T3t + T3w;
+					T42 = T3t - T3w;
+					T18 = T10 - T17;
+					T1B = T10 + T17;
+					T3q = T3m + T3p;
+					T43 = T3m - T3p;
+					T1n = T1f - T1m;
+					T1A = T1f + T1m;
+					T2J = T2B + T2C;
+					T2D = T2B - T2C;
+					T53 = T4F - T4G;
+					T4H = T4F + T4G;
+					T4E = T4C + T4D;
+					T52 = T4C - T4D;
+					T2A = T2y - T2z;
+					T2I = T2y + T2z;
+					TM = TK + TL;
+					T1H = TK - TL;
+					T4R = T4P - T4Q;
+					T4X = T4P + T4Q;
+					T4W = T4M + T4N;
+					T4O = T4M - T4N;
+					T1G = TN - TO;
+					TP = TN + TO;
+					{
+					     E Tm, T3X, TB, T3W;
+					     Tm = Te + Tl;
+					     T2O = Te - Tl;
+					     T3I = T3E + T3H;
+					     T3X = T3E - T3H;
+					     TB = Tt + TA;
+					     T2P = Tt - TA;
+					     T3P = T3L + T3O;
+					     T3W = T3L - T3O;
+					     T2K = T2I + T2J;
+					     T2M = T2I - T2J;
+					     T1C = T1A + T1B;
+					     T1E = T1A - T1B;
+					     TC = Tm + TB;
+					     T2w = Tm - TB;
+					     T40 = T3W - T3X;
+					     T3Y = T3W + T3X;
+					     T4K = T4E - T4H;
+					     T4I = T4E + T4H;
+					     TS = TM - TP;
+					     TQ = TM + TP;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3A, T3y, T50, T1D, T2t, T2p, T4J, T5t, T5v, T4Z, T4Y;
+			      cr[0] = T7 + TC;
+			      T3A = T3q - T3x;
+			      T3y = T3q + T3x;
+			      T50 = T4W - T4X;
+			      T4Y = T4W + T4X;
+			      ci[0] = T2H + T2K;
+			      T1D = FNMS(KP250000000, T1C, T1z);
+			      T2t = T1z + T1C;
+			      T2p = TJ + TQ;
+			      TR = FNMS(KP250000000, TQ, TJ);
+			      T4J = FNMS(KP250000000, T4I, T4B);
+			      T5t = T4B + T4I;
+			      T5v = T4V + T4Y;
+			      T4Z = FNMS(KP250000000, T4Y, T4V);
+			      {
+				   E T4m, T44, T4i, T4p, T49, T3R, T4j, T4a, T3S, T4l, T41, T4q;
+				   {
+					E T3z, T4v, T4w, T3Z, T4z;
+					T3z = FNMS(KP250000000, T3y, T3j);
+					T4v = T3j + T3y;
+					{
+					     E T2u, T2q, T5u, T5w;
+					     T2u = T2s * T2p;
+					     T2q = T2o * T2p;
+					     T5u = T2c * T5t;
+					     T5w = T2c * T5v;
+					     ci[WS(rs, 10)] = FMA(T2o, T2t, T2u);
+					     cr[WS(rs, 10)] = FNMS(T2s, T2t, T2q);
+					     cr[WS(rs, 5)] = FNMS(T2f, T5v, T5u);
+					     ci[WS(rs, 5)] = FMA(T2f, T5t, T5w);
+					     T4w = T4u * T4v;
+					}
+					T3Z = FNMS(KP250000000, T3Y, T3V);
+					T4z = T3V + T3Y;
+					{
+					     E T3Q, T4h, T4A, T4g, T3B;
+					     T3Q = FNMS(KP618033988, T3P, T3I);
+					     T4h = FMA(KP618033988, T3I, T3P);
+					     cr[WS(rs, 15)] = FNMS(T4y, T4z, T4w);
+					     T4A = T4u * T4z;
+					     T4m = FMA(KP618033988, T42, T43);
+					     T44 = FNMS(KP618033988, T43, T42);
+					     T4g = FMA(KP559016994, T3A, T3z);
+					     T3B = FNMS(KP559016994, T3A, T3z);
+					     ci[WS(rs, 15)] = FMA(T4y, T4v, T4A);
+					     T4i = FNMS(KP951056516, T4h, T4g);
+					     T4p = FMA(KP951056516, T4h, T4g);
+					     T49 = FMA(KP951056516, T3Q, T3B);
+					     T3R = FNMS(KP951056516, T3Q, T3B);
+					}
+					T4j = T4f * T4i;
+					T4a = T48 * T49;
+					T3S = TE * T3R;
+					T4l = FMA(KP559016994, T40, T3Z);
+					T41 = FNMS(KP559016994, T40, T3Z);
+					T4q = T1L * T4p;
+				   }
+				   {
+					E T5d, T4S, T54, T5i, T4L, T5c;
+					T5d = FNMS(KP618033988, T4O, T4R);
+					T4S = FMA(KP618033988, T4R, T4O);
+					{
+					     E T4n, T4r, T4d, T45;
+					     T4n = FMA(KP951056516, T4m, T4l);
+					     T4r = FNMS(KP951056516, T4m, T4l);
+					     T4d = FNMS(KP951056516, T44, T41);
+					     T45 = FMA(KP951056516, T44, T41);
+					     {
+						  E T4o, T4s, T4e, T46;
+						  T4o = T4f * T4n;
+						  cr[WS(rs, 11)] = FNMS(T4k, T4n, T4j);
+						  T4s = T1L * T4r;
+						  cr[WS(rs, 19)] = FNMS(T1N, T4r, T4q);
+						  T4e = T48 * T4d;
+						  cr[WS(rs, 7)] = FNMS(T4c, T4d, T4a);
+						  T46 = TE * T45;
+						  cr[WS(rs, 3)] = FNMS(TH, T45, T3S);
+						  ci[WS(rs, 11)] = FMA(T4k, T4i, T4o);
+						  ci[WS(rs, 19)] = FMA(T1N, T4p, T4s);
+						  ci[WS(rs, 7)] = FMA(T4c, T49, T4e);
+						  ci[WS(rs, 3)] = FMA(TH, T3R, T46);
+					     }
+					}
+					T54 = FMA(KP618033988, T53, T52);
+					T5i = FNMS(KP618033988, T52, T53);
+					T4L = FMA(KP559016994, T4K, T4J);
+					T5c = FNMS(KP559016994, T4K, T4J);
+					{
+					     E T38, T2Q, T33, T2E, T2v, T37, T2N, T5h, T51, T2L, T2x, T32;
+					     T38 = FNMS(KP618033988, T2O, T2P);
+					     T2Q = FMA(KP618033988, T2P, T2O);
+					     T5h = FNMS(KP559016994, T50, T4Z);
+					     T51 = FMA(KP559016994, T50, T4Z);
+					     {
+						  E T5e, T5n, T57, T4T;
+						  T5e = FNMS(KP951056516, T5d, T5c);
+						  T5n = FMA(KP951056516, T5d, T5c);
+						  T57 = FMA(KP951056516, T4S, T4L);
+						  T4T = FNMS(KP951056516, T4S, T4L);
+						  {
+						       E T5j, T5r, T59, T55;
+						       T5j = FMA(KP951056516, T5i, T5h);
+						       T5r = FNMS(KP951056516, T5i, T5h);
+						       T59 = FNMS(KP951056516, T54, T51);
+						       T55 = FMA(KP951056516, T54, T51);
+						       {
+							    E T5f, T5o, T58, T4U;
+							    T5f = T5b * T5e;
+							    T5o = T5m * T5n;
+							    T58 = T1V * T57;
+							    T4U = TD * T4T;
+							    {
+								 E T5k, T5s, T5a, T56;
+								 T5k = T5b * T5j;
+								 T5s = T5m * T5r;
+								 T5a = T1V * T59;
+								 T56 = TD * T55;
+								 cr[WS(rs, 13)] = FNMS(T5g, T5j, T5f);
+								 cr[WS(rs, 17)] = FNMS(T5q, T5r, T5o);
+								 cr[WS(rs, 9)] = FNMS(T1X, T59, T58);
+								 cr[WS(rs, 1)] = FNMS(TG, T55, T4U);
+								 ci[WS(rs, 13)] = FMA(T5g, T5e, T5k);
+								 ci[WS(rs, 17)] = FMA(T5q, T5n, T5s);
+								 ci[WS(rs, 9)] = FMA(T1X, T57, T5a);
+								 ci[WS(rs, 1)] = FMA(TG, T4T, T56);
+							    }
+						       }
+						  }
+					     }
+					     T2L = FNMS(KP250000000, T2K, T2H);
+					     T33 = FNMS(KP618033988, T2A, T2D);
+					     T2E = FMA(KP618033988, T2D, T2A);
+					     T2v = FNMS(KP250000000, TC, T7);
+					     T37 = FNMS(KP559016994, T2M, T2L);
+					     T2N = FMA(KP559016994, T2M, T2L);
+					     T1I = FNMS(KP618033988, T1H, T1G);
+					     T26 = FMA(KP618033988, T1G, T1H);
+					     T2x = FMA(KP559016994, T2w, T2v);
+					     T32 = FNMS(KP559016994, T2w, T2v);
+					     {
+						  E T3f, T39, T2R, T2Z;
+						  T3f = FNMS(KP951056516, T38, T37);
+						  T39 = FMA(KP951056516, T38, T37);
+						  T2R = FNMS(KP951056516, T2Q, T2N);
+						  T2Z = FMA(KP951056516, T2Q, T2N);
+						  {
+						       E T3c, T34, T2F, T2V;
+						       T3c = FMA(KP951056516, T33, T32);
+						       T34 = FNMS(KP951056516, T33, T32);
+						       T2F = FMA(KP951056516, T2E, T2x);
+						       T2V = FNMS(KP951056516, T2E, T2x);
+						       {
+							    E T3a, T35, T3g, T3d;
+							    T3a = T36 * T34;
+							    T35 = T31 * T34;
+							    T3g = T3e * T3c;
+							    T3d = T3b * T3c;
+							    {
+								 E T30, T2W, T2S, T2G;
+								 T30 = T2Y * T2V;
+								 T2W = T2U * T2V;
+								 T2S = T2b * T2F;
+								 T2G = T29 * T2F;
+								 ci[WS(rs, 8)] = FMA(T31, T39, T3a);
+								 cr[WS(rs, 8)] = FNMS(T36, T39, T35);
+								 ci[WS(rs, 12)] = FMA(T3b, T3f, T3g);
+								 cr[WS(rs, 12)] = FNMS(T3e, T3f, T3d);
+								 ci[WS(rs, 16)] = FMA(T2U, T2Z, T30);
+								 cr[WS(rs, 16)] = FNMS(T2Y, T2Z, T2W);
+								 ci[WS(rs, 4)] = FMA(T29, T2R, T2S);
+								 cr[WS(rs, 4)] = FNMS(T2b, T2R, T2G);
+							    }
+						       }
+						  }
+					     }
+					     T1o = FNMS(KP618033988, T1n, T18);
+					     T20 = FMA(KP618033988, T18, T1n);
+					     T1F = FNMS(KP559016994, T1E, T1D);
+					     T25 = FMA(KP559016994, T1E, T1D);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       TT = FNMS(KP559016994, TS, TR);
+	       T1Z = FMA(KP559016994, TS, TR);
+	       {
+		    E T2l, T27, T1J, T1T;
+		    T2l = FNMS(KP951056516, T26, T25);
+		    T27 = FMA(KP951056516, T26, T25);
+		    T1J = FNMS(KP951056516, T1I, T1F);
+		    T1T = FMA(KP951056516, T1I, T1F);
+		    {
+			 E T2h, T21, T1p, T1P;
+			 T2h = FMA(KP951056516, T20, T1Z);
+			 T21 = FNMS(KP951056516, T20, T1Z);
+			 T1p = FMA(KP951056516, T1o, TT);
+			 T1P = FNMS(KP951056516, T1o, TT);
+			 {
+			      E T28, T22, T2m, T2i;
+			      T28 = T24 * T21;
+			      T22 = T1Y * T21;
+			      T2m = T2k * T2h;
+			      T2i = T2g * T2h;
+			      {
+				   E T1U, T1Q, T1K, T1q;
+				   T1U = T1S * T1P;
+				   T1Q = T1O * T1P;
+				   T1K = T1s * T1p;
+				   T1q = TI * T1p;
+				   ci[WS(rs, 6)] = FMA(T1Y, T27, T28);
+				   cr[WS(rs, 6)] = FNMS(T24, T27, T22);
+				   ci[WS(rs, 14)] = FMA(T2g, T2l, T2m);
+				   cr[WS(rs, 14)] = FNMS(T2k, T2l, T2i);
+				   ci[WS(rs, 18)] = FMA(T1O, T1T, T1U);
+				   cr[WS(rs, 18)] = FNMS(T1S, T1T, T1Q);
+				   ci[WS(rs, 2)] = FMA(TI, T1J, T1K);
+				   cr[WS(rs, 2)] = FNMS(T1s, T1J, T1q);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {136, 58, 140, 0} };
+
+void X(codelet_hb2_20) (planner *p) {
+     X(khc2hc_register) (p, hb2_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include hb.h */
+
+/*
+ * This function contains 276 FP additions, 164 FP multiplications,
+ * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
+ * 137 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E TD, TG, TE, TH, TJ, T1t, T27, T25, T1T, T1R, T1V, T2j, T2Z, T21, T2X;
+	       E T2T, T2n, T2P, T3V, T41, T3R, T3X, T29, T2c, T4H, T4L, T1L, T1M, T1N, T2d;
+	       E T4R, T1P, T4P, T49, T2N, T2f, T47, T2L;
+	       {
+		    E T1U, T2l, T1Z, T2i, T1S, T2m, T20, T2h;
+		    {
+			 E TF, T1s, TI, T1r;
+			 TD = W[0];
+			 TG = W[1];
+			 TE = W[2];
+			 TH = W[3];
+			 TF = TD * TE;
+			 T1s = TG * TE;
+			 TI = TG * TH;
+			 T1r = TD * TH;
+			 TJ = TF + TI;
+			 T1t = T1r - T1s;
+			 T27 = T1r + T1s;
+			 T25 = TF - TI;
+			 T1T = W[5];
+			 T1U = TH * T1T;
+			 T2l = TD * T1T;
+			 T1Z = TE * T1T;
+			 T2i = TG * T1T;
+			 T1R = W[4];
+			 T1S = TE * T1R;
+			 T2m = TG * T1R;
+			 T20 = TH * T1R;
+			 T2h = TD * T1R;
+		    }
+		    T1V = T1S + T1U;
+		    T2j = T2h - T2i;
+		    T2Z = T1Z + T20;
+		    T21 = T1Z - T20;
+		    T2X = T1S - T1U;
+		    T2T = T2l - T2m;
+		    T2n = T2l + T2m;
+		    T2P = T2h + T2i;
+		    {
+			 E T3T, T3U, T3P, T3Q;
+			 T3T = TJ * T1T;
+			 T3U = T1t * T1R;
+			 T3V = T3T - T3U;
+			 T41 = T3T + T3U;
+			 T3P = TJ * T1R;
+			 T3Q = T1t * T1T;
+			 T3R = T3P + T3Q;
+			 T3X = T3P - T3Q;
+			 {
+			      E T26, T28, T2a, T2b;
+			      T26 = T25 * T1R;
+			      T28 = T27 * T1T;
+			      T29 = T26 + T28;
+			      T2a = T25 * T1T;
+			      T2b = T27 * T1R;
+			      T2c = T2a - T2b;
+			      T4H = T26 - T28;
+			      T4L = T2a + T2b;
+			      T1L = W[6];
+			      T1M = W[7];
+			      T1N = FMA(TD, T1L, TG * T1M);
+			      T2d = FMA(T29, T1L, T2c * T1M);
+			      T4R = FNMS(T1t, T1L, TJ * T1M);
+			      T1P = FNMS(TG, T1L, TD * T1M);
+			      T4P = FMA(TJ, T1L, T1t * T1M);
+			      T49 = FNMS(T27, T1L, T25 * T1M);
+			      T2N = FNMS(TH, T1L, TE * T1M);
+			      T2f = FNMS(T2c, T1L, T29 * T1M);
+			      T47 = FMA(T25, T1L, T27 * T1M);
+			      T2L = FMA(TE, T1L, TH * T1M);
+			 }
+		    }
+	       }
+	       {
+		    E T7, T4i, T4x, TK, T1D, T3i, T3E, T2D, T19, T3L, T3M, T1o, T2x, T4C, T4B;
+		    E T2u, T1v, T4r, T4o, T1u, T2H, T37, T2I, T3e, T3p, T3w, T3x, Tm, TB, TC;
+		    E T4u, T4v, T4y, T2A, T2B, T2E, T1E, T1F, T1G, T4d, T4g, T4j, T3F, T3G, T3H;
+		    E TN, TQ, TR, T48, T4a;
+		    {
+			 E T3, T3g, T1C, T3h, T6, T3D, T1z, T3C;
+			 {
+			      E T1, T2, T1A, T1B;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 9)];
+			      T3 = T1 + T2;
+			      T3g = T1 - T2;
+			      T1A = ci[WS(rs, 14)];
+			      T1B = cr[WS(rs, 15)];
+			      T1C = T1A - T1B;
+			      T3h = T1A + T1B;
+			 }
+			 {
+			      E T4, T5, T1x, T1y;
+			      T4 = cr[WS(rs, 5)];
+			      T5 = ci[WS(rs, 4)];
+			      T6 = T4 + T5;
+			      T3D = T4 - T5;
+			      T1x = ci[WS(rs, 19)];
+			      T1y = cr[WS(rs, 10)];
+			      T1z = T1x - T1y;
+			      T3C = T1x + T1y;
+			 }
+			 T7 = T3 + T6;
+			 T4i = T3g - T3h;
+			 T4x = T3D + T3C;
+			 TK = T3 - T6;
+			 T1D = T1z - T1C;
+			 T3i = T3g + T3h;
+			 T3E = T3C - T3D;
+			 T2D = T1z + T1C;
+		    }
+		    {
+			 E Te, T4b, T4m, TL, T11, T33, T3l, T2s, TA, T4f, T4q, TP, T1n, T3d, T3v;
+			 E T2w, Tl, T4c, T4n, TM, T18, T36, T3o, T2t, Tt, T4e, T4p, TO, T1g, T3a;
+			 E T3s, T2v;
+			 {
+			      E Ta, T3j, T10, T3k, Td, T32, TX, T31;
+			      {
+				   E T8, T9, TY, TZ;
+				   T8 = cr[WS(rs, 4)];
+				   T9 = ci[WS(rs, 5)];
+				   Ta = T8 + T9;
+				   T3j = T8 - T9;
+				   TY = ci[WS(rs, 10)];
+				   TZ = cr[WS(rs, 19)];
+				   T10 = TY - TZ;
+				   T3k = TY + TZ;
+			      }
+			      {
+				   E Tb, Tc, TV, TW;
+				   Tb = cr[WS(rs, 9)];
+				   Tc = ci[0];
+				   Td = Tb + Tc;
+				   T32 = Tb - Tc;
+				   TV = ci[WS(rs, 15)];
+				   TW = cr[WS(rs, 14)];
+				   TX = TV - TW;
+				   T31 = TV + TW;
+			      }
+			      Te = Ta + Td;
+			      T4b = T3j - T3k;
+			      T4m = T32 + T31;
+			      TL = Ta - Td;
+			      T11 = TX - T10;
+			      T33 = T31 - T32;
+			      T3l = T3j + T3k;
+			      T2s = TX + T10;
+			 }
+			 {
+			      E Tw, T3t, Tz, T3b, T1j, T3c, T1m, T3u;
+			      {
+				   E Tu, Tv, Tx, Ty;
+				   Tu = ci[WS(rs, 7)];
+				   Tv = cr[WS(rs, 2)];
+				   Tw = Tu + Tv;
+				   T3t = Tu - Tv;
+				   Tx = ci[WS(rs, 2)];
+				   Ty = cr[WS(rs, 7)];
+				   Tz = Tx + Ty;
+				   T3b = Tx - Ty;
+			      }
+			      {
+				   E T1h, T1i, T1k, T1l;
+				   T1h = ci[WS(rs, 17)];
+				   T1i = cr[WS(rs, 12)];
+				   T1j = T1h - T1i;
+				   T3c = T1h + T1i;
+				   T1k = ci[WS(rs, 12)];
+				   T1l = cr[WS(rs, 17)];
+				   T1m = T1k - T1l;
+				   T3u = T1k + T1l;
+			      }
+			      TA = Tw + Tz;
+			      T4f = T3t + T3u;
+			      T4q = T3b - T3c;
+			      TP = Tw - Tz;
+			      T1n = T1j - T1m;
+			      T3d = T3b + T3c;
+			      T3v = T3t - T3u;
+			      T2w = T1j + T1m;
+			 }
+			 {
+			      E Th, T3m, T17, T3n, Tk, T34, T14, T35;
+			      {
+				   E Tf, Tg, T15, T16;
+				   Tf = ci[WS(rs, 3)];
+				   Tg = cr[WS(rs, 6)];
+				   Th = Tf + Tg;
+				   T3m = Tf - Tg;
+				   T15 = ci[WS(rs, 18)];
+				   T16 = cr[WS(rs, 11)];
+				   T17 = T15 - T16;
+				   T3n = T15 + T16;
+			      }
+			      {
+				   E Ti, Tj, T12, T13;
+				   Ti = cr[WS(rs, 1)];
+				   Tj = ci[WS(rs, 8)];
+				   Tk = Ti + Tj;
+				   T34 = Ti - Tj;
+				   T12 = ci[WS(rs, 13)];
+				   T13 = cr[WS(rs, 16)];
+				   T14 = T12 - T13;
+				   T35 = T12 + T13;
+			      }
+			      Tl = Th + Tk;
+			      T4c = T3m - T3n;
+			      T4n = T34 - T35;
+			      TM = Th - Tk;
+			      T18 = T14 - T17;
+			      T36 = T34 + T35;
+			      T3o = T3m + T3n;
+			      T2t = T14 + T17;
+			 }
+			 {
+			      E Tp, T3q, T1f, T3r, Ts, T39, T1c, T38;
+			      {
+				   E Tn, To, T1d, T1e;
+				   Tn = cr[WS(rs, 8)];
+				   To = ci[WS(rs, 1)];
+				   Tp = Tn + To;
+				   T3q = Tn - To;
+				   T1d = ci[WS(rs, 16)];
+				   T1e = cr[WS(rs, 13)];
+				   T1f = T1d - T1e;
+				   T3r = T1d + T1e;
+			      }
+			      {
+				   E Tq, Tr, T1a, T1b;
+				   Tq = ci[WS(rs, 6)];
+				   Tr = cr[WS(rs, 3)];
+				   Ts = Tq + Tr;
+				   T39 = Tq - Tr;
+				   T1a = ci[WS(rs, 11)];
+				   T1b = cr[WS(rs, 18)];
+				   T1c = T1a - T1b;
+				   T38 = T1a + T1b;
+			      }
+			      Tt = Tp + Ts;
+			      T4e = T3q + T3r;
+			      T4p = T39 + T38;
+			      TO = Tp - Ts;
+			      T1g = T1c - T1f;
+			      T3a = T38 - T39;
+			      T3s = T3q - T3r;
+			      T2v = T1c + T1f;
+			 }
+			 T19 = T11 - T18;
+			 T3L = T3l - T3o;
+			 T3M = T3s - T3v;
+			 T1o = T1g - T1n;
+			 T2x = T2v - T2w;
+			 T4C = T4e - T4f;
+			 T4B = T4b - T4c;
+			 T2u = T2s - T2t;
+			 T1v = TO - TP;
+			 T4r = T4p - T4q;
+			 T4o = T4m - T4n;
+			 T1u = TL - TM;
+			 T2H = Te - Tl;
+			 T37 = T33 + T36;
+			 T2I = Tt - TA;
+			 T3e = T3a + T3d;
+			 T3p = T3l + T3o;
+			 T3w = T3s + T3v;
+			 T3x = T3p + T3w;
+			 Tm = Te + Tl;
+			 TB = Tt + TA;
+			 TC = Tm + TB;
+			 T4u = T4m + T4n;
+			 T4v = T4p + T4q;
+			 T4y = T4u + T4v;
+			 T2A = T2s + T2t;
+			 T2B = T2v + T2w;
+			 T2E = T2A + T2B;
+			 T1E = T11 + T18;
+			 T1F = T1g + T1n;
+			 T1G = T1E + T1F;
+			 T4d = T4b + T4c;
+			 T4g = T4e + T4f;
+			 T4j = T4d + T4g;
+			 T3F = T33 - T36;
+			 T3G = T3a - T3d;
+			 T3H = T3F + T3G;
+			 TN = TL + TM;
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+		    }
+		    cr[0] = T7 + TC;
+		    ci[0] = T2D + T2E;
+		    {
+			 E T2k, T2o, T4T, T4U;
+			 T2k = TK + TR;
+			 T2o = T1D + T1G;
+			 cr[WS(rs, 10)] = FNMS(T2n, T2o, T2j * T2k);
+			 ci[WS(rs, 10)] = FMA(T2n, T2k, T2j * T2o);
+			 T4T = T4i + T4j;
+			 T4U = T4x + T4y;
+			 cr[WS(rs, 5)] = FNMS(T2c, T4U, T29 * T4T);
+			 ci[WS(rs, 5)] = FMA(T29, T4U, T2c * T4T);
+		    }
+		    T48 = T3i + T3x;
+		    T4a = T3E + T3H;
+		    cr[WS(rs, 15)] = FNMS(T49, T4a, T47 * T48);
+		    ci[WS(rs, 15)] = FMA(T47, T4a, T49 * T48);
+		    {
+			 E T2y, T2J, T2V, T2R, T2G, T2U, T2r, T2Q;
+			 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
+			 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
+			 T2V = FNMS(KP951056516, T2I, KP587785252 * T2H);
+			 T2R = FNMS(KP951056516, T2x, KP587785252 * T2u);
+			 {
+			      E T2C, T2F, T2p, T2q;
+			      T2C = KP559016994 * (T2A - T2B);
+			      T2F = FNMS(KP250000000, T2E, T2D);
+			      T2G = T2C + T2F;
+			      T2U = T2F - T2C;
+			      T2p = KP559016994 * (Tm - TB);
+			      T2q = FNMS(KP250000000, TC, T7);
+			      T2r = T2p + T2q;
+			      T2Q = T2q - T2p;
+			 }
+			 {
+			      E T2z, T2K, T2Y, T30;
+			      T2z = T2r + T2y;
+			      T2K = T2G - T2J;
+			      cr[WS(rs, 4)] = FNMS(T27, T2K, T25 * T2z);
+			      ci[WS(rs, 4)] = FMA(T27, T2z, T25 * T2K);
+			      T2Y = T2Q - T2R;
+			      T30 = T2V + T2U;
+			      cr[WS(rs, 12)] = FNMS(T2Z, T30, T2X * T2Y);
+			      ci[WS(rs, 12)] = FMA(T2Z, T2Y, T2X * T30);
+			 }
+			 {
+			      E T2M, T2O, T2S, T2W;
+			      T2M = T2r - T2y;
+			      T2O = T2J + T2G;
+			      cr[WS(rs, 16)] = FNMS(T2N, T2O, T2L * T2M);
+			      ci[WS(rs, 16)] = FMA(T2N, T2M, T2L * T2O);
+			      T2S = T2Q + T2R;
+			      T2W = T2U - T2V;
+			      cr[WS(rs, 8)] = FNMS(T2T, T2W, T2P * T2S);
+			      ci[WS(rs, 8)] = FMA(T2T, T2S, T2P * T2W);
+			 }
+		    }
+		    {
+			 E T4s, T4D, T4N, T4I, T4A, T4M, T4l, T4J;
+			 T4s = FMA(KP951056516, T4o, KP587785252 * T4r);
+			 T4D = FMA(KP951056516, T4B, KP587785252 * T4C);
+			 T4N = FNMS(KP951056516, T4C, KP587785252 * T4B);
+			 T4I = FNMS(KP951056516, T4r, KP587785252 * T4o);
+			 {
+			      E T4w, T4z, T4h, T4k;
+			      T4w = KP559016994 * (T4u - T4v);
+			      T4z = FNMS(KP250000000, T4y, T4x);
+			      T4A = T4w + T4z;
+			      T4M = T4z - T4w;
+			      T4h = KP559016994 * (T4d - T4g);
+			      T4k = FNMS(KP250000000, T4j, T4i);
+			      T4l = T4h + T4k;
+			      T4J = T4k - T4h;
+			 }
+			 {
+			      E T4t, T4E, T4Q, T4S;
+			      T4t = T4l - T4s;
+			      T4E = T4A + T4D;
+			      cr[WS(rs, 1)] = FNMS(TG, T4E, TD * T4t);
+			      ci[WS(rs, 1)] = FMA(TD, T4E, TG * T4t);
+			      T4Q = T4J - T4I;
+			      T4S = T4M + T4N;
+			      cr[WS(rs, 17)] = FNMS(T4R, T4S, T4P * T4Q);
+			      ci[WS(rs, 17)] = FMA(T4P, T4S, T4R * T4Q);
+			 }
+			 {
+			      E T4F, T4G, T4K, T4O;
+			      T4F = T4s + T4l;
+			      T4G = T4A - T4D;
+			      cr[WS(rs, 9)] = FNMS(T1T, T4G, T1R * T4F);
+			      ci[WS(rs, 9)] = FMA(T1R, T4G, T1T * T4F);
+			      T4K = T4I + T4J;
+			      T4O = T4M - T4N;
+			      cr[WS(rs, 13)] = FNMS(T4L, T4O, T4H * T4K);
+			      ci[WS(rs, 13)] = FMA(T4H, T4O, T4L * T4K);
+			 }
+		    }
+		    {
+			 E T1p, T1w, T22, T1X, T1J, T23, TU, T1W;
+			 T1p = FNMS(KP951056516, T1o, KP587785252 * T19);
+			 T1w = FNMS(KP951056516, T1v, KP587785252 * T1u);
+			 T22 = FMA(KP951056516, T1u, KP587785252 * T1v);
+			 T1X = FMA(KP951056516, T19, KP587785252 * T1o);
+			 {
+			      E T1H, T1I, TS, TT;
+			      T1H = FNMS(KP250000000, T1G, T1D);
+			      T1I = KP559016994 * (T1E - T1F);
+			      T1J = T1H - T1I;
+			      T23 = T1I + T1H;
+			      TS = FNMS(KP250000000, TR, TK);
+			      TT = KP559016994 * (TN - TQ);
+			      TU = TS - TT;
+			      T1W = TT + TS;
+			 }
+			 {
+			      E T1q, T1K, T2e, T2g;
+			      T1q = TU - T1p;
+			      T1K = T1w + T1J;
+			      cr[WS(rs, 2)] = FNMS(T1t, T1K, TJ * T1q);
+			      ci[WS(rs, 2)] = FMA(T1t, T1q, TJ * T1K);
+			      T2e = T1W + T1X;
+			      T2g = T23 - T22;
+			      cr[WS(rs, 14)] = FNMS(T2f, T2g, T2d * T2e);
+			      ci[WS(rs, 14)] = FMA(T2f, T2e, T2d * T2g);
+			 }
+			 {
+			      E T1O, T1Q, T1Y, T24;
+			      T1O = TU + T1p;
+			      T1Q = T1J - T1w;
+			      cr[WS(rs, 18)] = FNMS(T1P, T1Q, T1N * T1O);
+			      ci[WS(rs, 18)] = FMA(T1P, T1O, T1N * T1Q);
+			      T1Y = T1W - T1X;
+			      T24 = T22 + T23;
+			      cr[WS(rs, 6)] = FNMS(T21, T24, T1V * T1Y);
+			      ci[WS(rs, 6)] = FMA(T21, T1Y, T1V * T24);
+			 }
+		    }
+		    {
+			 E T3f, T3N, T43, T3Z, T3K, T42, T3A, T3Y;
+			 T3f = FNMS(KP951056516, T3e, KP587785252 * T37);
+			 T3N = FNMS(KP951056516, T3M, KP587785252 * T3L);
+			 T43 = FMA(KP951056516, T3L, KP587785252 * T3M);
+			 T3Z = FMA(KP951056516, T37, KP587785252 * T3e);
+			 {
+			      E T3I, T3J, T3y, T3z;
+			      T3I = FNMS(KP250000000, T3H, T3E);
+			      T3J = KP559016994 * (T3F - T3G);
+			      T3K = T3I - T3J;
+			      T42 = T3J + T3I;
+			      T3y = FNMS(KP250000000, T3x, T3i);
+			      T3z = KP559016994 * (T3p - T3w);
+			      T3A = T3y - T3z;
+			      T3Y = T3z + T3y;
+			 }
+			 {
+			      E T3B, T3O, T45, T46;
+			      T3B = T3f + T3A;
+			      T3O = T3K - T3N;
+			      cr[WS(rs, 3)] = FNMS(TH, T3O, TE * T3B);
+			      ci[WS(rs, 3)] = FMA(TE, T3O, TH * T3B);
+			      T45 = T3Z + T3Y;
+			      T46 = T42 - T43;
+			      cr[WS(rs, 19)] = FNMS(T1M, T46, T1L * T45);
+			      ci[WS(rs, 19)] = FMA(T1L, T46, T1M * T45);
+			 }
+			 {
+			      E T3S, T3W, T40, T44;
+			      T3S = T3A - T3f;
+			      T3W = T3K + T3N;
+			      cr[WS(rs, 7)] = FNMS(T3V, T3W, T3R * T3S);
+			      ci[WS(rs, 7)] = FMA(T3R, T3W, T3V * T3S);
+			      T40 = T3Y - T3Z;
+			      T44 = T42 + T43;
+			      cr[WS(rs, 11)] = FNMS(T41, T44, T3X * T40);
+			      ci[WS(rs, 11)] = FMA(T3X, T44, T41 * T40);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {204, 92, 72, 0} };
+
+void X(codelet_hb2_20) (planner *p) {
+     X(khc2hc_register) (p, hb2_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1682 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 25 -dif -name hb2_25 -include hb.h */
+
+/*
+ * This function contains 440 FP additions, 434 FP multiplications,
+ * (or, 84 additions, 78 multiplications, 356 fused multiply/add),
+ * 234 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E TN, TQ, T4e, T2y, T4i, T3U, T4u, T4o, T4G, T4C, T2F, T41, T3Q, T4q, T3a;
+	       E T3F, T4a, T4w, T46, T44;
+	       {
+		    E TT, TO, TR, T23, T2d, T2x, TP, TV, T2p, T85, T4d, T25, TX;
+		    TN = W[0];
+		    TT = W[4];
+		    TO = W[2];
+		    TR = W[3];
+		    T23 = W[6];
+		    T2d = TN * TT;
+		    T2x = TO * TT;
+		    TP = TN * TO;
+		    TV = TN * TR;
+		    T2p = TT * T23;
+		    T85 = TN * T23;
+		    T4d = TO * T23;
+		    T25 = W[7];
+		    TQ = W[1];
+		    TX = W[5];
+		    {
+			 E T86, T4n, TW, T4l, TS, T71, T2q, T4z, T2e, T8a, T2u, T76, T2k, T4B, T6E;
+			 E T6U, T6Y, T5T, T8i, T1I, T2a, T26, TY, T8d, T8s, T8o, T5C, T5w, T7g, T7c;
+			 E T5M, T5I, T9, T40, T1R, T3X, T6H, T7F, T5W, T7n, T4N, T68, T1S, T1k, T1T;
+			 E T1D, T1Y, T1Z, T10, TM, T7K, T7A, T6p, T6w, T4X, T56, T3K, T2U, T7x, T7J;
+			 E T6v, T6i, T50, T57, T3L, T39, T4Q, T59, T3O, T3E, T67, T7t, T7H, T6y, T63;
+			 E T4T, T5a, T3N, T3p, T66, T7o;
+			 {
+			      E T2A, T2z, T6G, T2E, T5V, T6F;
+			      {
+				   E T1, T1J, T3Y, T3Z, T8, T2C, T1M, T1P, T2D, T4h, T89, T2t, T3W, T1Q, T3V;
+				   T1 = cr[0];
+				   T4e = FMA(TR, T25, T4d);
+				   T4h = TO * T25;
+				   T89 = TN * T25;
+				   T2t = TT * T25;
+				   T86 = FMA(TQ, T25, T85);
+				   T4n = FNMS(TQ, TO, TV);
+				   TW = FMA(TQ, TO, TV);
+				   T4l = FMA(TQ, TR, TP);
+				   TS = FNMS(TQ, TR, TP);
+				   T71 = FNMS(TR, TX, T2x);
+				   T2y = FMA(TR, TX, T2x);
+				   T2q = FMA(TX, T25, T2p);
+				   T4z = FMA(TQ, TX, T2d);
+				   T2e = FNMS(TQ, TX, T2d);
+				   {
+					E T3T, T2j, T4t, T6T;
+					T3T = TO * TX;
+					T2j = TN * TX;
+					T4i = FNMS(TR, T23, T4h);
+					T8a = FNMS(TQ, T23, T89);
+					T2u = FNMS(TX, T23, T2t);
+					T4t = T4l * TX;
+					T6T = T4l * T23;
+					{
+					     E T6X, T4m, T1H, T29;
+					     T6X = T4l * T25;
+					     T4m = T4l * TT;
+					     T1H = TS * TX;
+					     T29 = TS * T25;
+					     {
+						  E T24, TU, T4F, T4A;
+						  T24 = TS * T23;
+						  TU = TS * TT;
+						  T4F = T4z * T25;
+						  T4A = T4z * T23;
+						  {
+						       E T8r, T8n, T5B, T5v;
+						       T8r = T2y * T25;
+						       T8n = T2y * T23;
+						       T5B = T2e * T25;
+						       T5v = T2e * T23;
+						       T3U = FNMS(TR, TT, T3T);
+						       T76 = FMA(TR, TT, T3T);
+						       T2k = FMA(TQ, TT, T2j);
+						       T4B = FNMS(TQ, TT, T2j);
+						       T4u = FMA(T4n, TT, T4t);
+						       T6E = FNMS(T4n, TT, T4t);
+						       T6U = FMA(T4n, T25, T6T);
+						       T6Y = FNMS(T4n, T23, T6X);
+						       T5T = FMA(T4n, TX, T4m);
+						       T4o = FNMS(T4n, TX, T4m);
+						       T8i = FMA(TW, TT, T1H);
+						       T1I = FNMS(TW, TT, T1H);
+						       T2a = FNMS(TW, T23, T29);
+						       T26 = FMA(TW, T25, T24);
+						       TY = FMA(TW, TX, TU);
+						       T8d = FNMS(TW, TX, TU);
+						       T8s = FNMS(T3U, T23, T8r);
+						       T8o = FMA(T3U, T25, T8n);
+						       T5C = FNMS(T2k, T23, T5B);
+						       T5w = FMA(T2k, T25, T5v);
+						       T4G = FNMS(T4B, T23, T4F);
+						       T4C = FMA(T4B, T25, T4A);
+						       {
+							    E T7f, T7b, T5L, T5H;
+							    T7f = T5T * T25;
+							    T7b = T5T * T23;
+							    T5L = TY * T25;
+							    T5H = TY * T23;
+							    T7g = FNMS(T6E, T23, T7f);
+							    T7c = FMA(T6E, T25, T7b);
+							    T5M = FNMS(T1I, T23, T5L);
+							    T5I = FMA(T1I, T25, T5H);
+							    T1J = ci[WS(rs, 24)];
+						       }
+						  }
+					     }
+					}
+				   }
+				   {
+					E T2, T3, T5, T6;
+					T2 = cr[WS(rs, 5)];
+					T3 = ci[WS(rs, 4)];
+					T5 = cr[WS(rs, 10)];
+					T6 = ci[WS(rs, 9)];
+					{
+					     E T1K, T4, T7, T1L, T1N, T1O;
+					     T1K = ci[WS(rs, 19)];
+					     T3Y = T2 - T3;
+					     T4 = T2 + T3;
+					     T3Z = T5 - T6;
+					     T7 = T5 + T6;
+					     T1L = cr[WS(rs, 20)];
+					     T1N = ci[WS(rs, 14)];
+					     T1O = cr[WS(rs, 15)];
+					     T8 = T4 + T7;
+					     T2A = T4 - T7;
+					     T2C = T1K + T1L;
+					     T1M = T1K - T1L;
+					     T1P = T1N - T1O;
+					     T2D = T1N + T1O;
+					}
+				   }
+				   T2z = FNMS(KP250000000, T8, T1);
+				   T9 = T1 + T8;
+				   T3W = T1M - T1P;
+				   T1Q = T1M + T1P;
+				   T40 = FMA(KP618033988, T3Z, T3Y);
+				   T6G = FNMS(KP618033988, T3Y, T3Z);
+				   T2E = FMA(KP618033988, T2D, T2C);
+				   T5V = FNMS(KP618033988, T2C, T2D);
+				   T1R = T1J + T1Q;
+				   T3V = FNMS(KP250000000, T1Q, T1J);
+				   T6F = FNMS(KP559016994, T3W, T3V);
+				   T3X = FMA(KP559016994, T3W, T3V);
+			      }
+			      {
+				   E T2S, T6n, T2H, T2G, Ti, T5Y, T3C, T3r, TK, T3q, T30, T6d, T33, Tr, T32;
+				   E T3v, T61, T3y, T1C, T3x, T2L, T6k, T2O, T1a, T2N, T6g, T37, T2W, Tt, T1j;
+				   E T2V, Tx, T3g, T3j, Tw, T3l, T1t, T3i, Ty;
+				   {
+					E T1u, T1v, T1A, T3u, T1w;
+					{
+					     E TC, TI, T3B, TD, TE;
+					     {
+						  E Ta, Te, Tf, Tb, Tc, T5U, T2B, T2R, Tg;
+						  Ta = cr[WS(rs, 1)];
+						  T5U = FNMS(KP559016994, T2A, T2z);
+						  T2B = FMA(KP559016994, T2A, T2z);
+						  T6H = FNMS(KP951056516, T6G, T6F);
+						  T7F = FMA(KP951056516, T6G, T6F);
+						  Te = cr[WS(rs, 11)];
+						  T5W = FMA(KP951056516, T5V, T5U);
+						  T7n = FNMS(KP951056516, T5V, T5U);
+						  T4N = FMA(KP951056516, T2E, T2B);
+						  T2F = FNMS(KP951056516, T2E, T2B);
+						  Tf = ci[WS(rs, 8)];
+						  Tb = cr[WS(rs, 6)];
+						  Tc = ci[WS(rs, 3)];
+						  TC = cr[WS(rs, 3)];
+						  T2R = Tf - Te;
+						  Tg = Te + Tf;
+						  {
+						       E T2Q, Td, Th, TG, TH;
+						       T2Q = Tb - Tc;
+						       Td = Tb + Tc;
+						       TG = ci[WS(rs, 11)];
+						       TH = ci[WS(rs, 6)];
+						       T2S = FNMS(KP618033988, T2R, T2Q);
+						       T6n = FMA(KP618033988, T2Q, T2R);
+						       Th = Td + Tg;
+						       T2H = Td - Tg;
+						       TI = TG + TH;
+						       T3B = TG - TH;
+						       T2G = FNMS(KP250000000, Th, Ta);
+						       Ti = Ta + Th;
+						       TD = cr[WS(rs, 8)];
+						       TE = ci[WS(rs, 1)];
+						  }
+					     }
+					     {
+						  E Tj, Tk, Tp, T2Z, TJ, Tl;
+						  Tj = cr[WS(rs, 4)];
+						  {
+						       E Tn, To, T3A, TF;
+						       Tn = ci[WS(rs, 10)];
+						       To = ci[WS(rs, 5)];
+						       T3A = TD - TE;
+						       TF = TD + TE;
+						       Tk = cr[WS(rs, 9)];
+						       Tp = Tn + To;
+						       T2Z = To - Tn;
+						       T5Y = FNMS(KP618033988, T3A, T3B);
+						       T3C = FMA(KP618033988, T3B, T3A);
+						       T3r = TI - TF;
+						       TJ = TF + TI;
+						       Tl = ci[0];
+						  }
+						  T1u = ci[WS(rs, 21)];
+						  TK = TC + TJ;
+						  T3q = FNMS(KP250000000, TJ, TC);
+						  {
+						       E T1y, Tm, T2Y, T1z, Tq;
+						       T1y = cr[WS(rs, 13)];
+						       Tm = Tk + Tl;
+						       T2Y = Tl - Tk;
+						       T1z = cr[WS(rs, 18)];
+						       T1v = ci[WS(rs, 16)];
+						       T30 = FMA(KP618033988, T2Z, T2Y);
+						       T6d = FNMS(KP618033988, T2Y, T2Z);
+						       T33 = Tm - Tp;
+						       Tq = Tm + Tp;
+						       T1A = T1y + T1z;
+						       T3u = T1z - T1y;
+						       Tr = Tj + Tq;
+						       T32 = FMS(KP250000000, Tq, Tj);
+						       T1w = cr[WS(rs, 23)];
+						  }
+					     }
+					}
+					{
+					     E T1b, T1c, T1h, T36, T1d;
+					     {
+						  E T12, T13, T18, T2K, T1B, T14;
+						  T12 = ci[WS(rs, 23)];
+						  {
+						       E T16, T17, T3t, T1x;
+						       T16 = ci[WS(rs, 13)];
+						       T17 = cr[WS(rs, 16)];
+						       T3t = T1v + T1w;
+						       T1x = T1v - T1w;
+						       T13 = ci[WS(rs, 18)];
+						       T18 = T16 - T17;
+						       T2K = T16 + T17;
+						       T3v = FMA(KP618033988, T3u, T3t);
+						       T61 = FNMS(KP618033988, T3t, T3u);
+						       T3y = T1x + T1A;
+						       T1B = T1x - T1A;
+						       T14 = cr[WS(rs, 21)];
+						  }
+						  T1b = ci[WS(rs, 20)];
+						  T1C = T1u + T1B;
+						  T3x = FMS(KP250000000, T1B, T1u);
+						  {
+						       E T1f, T15, T2J, T1g, T19;
+						       T1f = cr[WS(rs, 14)];
+						       T15 = T13 - T14;
+						       T2J = T13 + T14;
+						       T1g = cr[WS(rs, 19)];
+						       T1c = ci[WS(rs, 15)];
+						       T2L = FMA(KP618033988, T2K, T2J);
+						       T6k = FNMS(KP618033988, T2J, T2K);
+						       T2O = T15 - T18;
+						       T19 = T15 + T18;
+						       T1h = T1f + T1g;
+						       T36 = T1g - T1f;
+						       T1a = T12 + T19;
+						       T2N = FNMS(KP250000000, T19, T12);
+						       T1d = cr[WS(rs, 24)];
+						  }
+					     }
+					     {
+						  E T1l, T1p, T1o, T3e, T1i, T1q;
+						  T1l = ci[WS(rs, 22)];
+						  {
+						       E T1m, T1n, T35, T1e;
+						       T1m = ci[WS(rs, 17)];
+						       T1n = cr[WS(rs, 22)];
+						       T35 = T1c + T1d;
+						       T1e = T1c - T1d;
+						       T1p = ci[WS(rs, 12)];
+						       T1o = T1m - T1n;
+						       T3e = T1m + T1n;
+						       T6g = FNMS(KP618033988, T35, T36);
+						       T37 = FMA(KP618033988, T36, T35);
+						       T2W = T1e + T1h;
+						       T1i = T1e - T1h;
+						       T1q = cr[WS(rs, 17)];
+						  }
+						  Tt = cr[WS(rs, 2)];
+						  T1j = T1b + T1i;
+						  T2V = FMS(KP250000000, T1i, T1b);
+						  {
+						       E Tu, T1r, T3f, Tv, T1s;
+						       Tu = cr[WS(rs, 7)];
+						       T1r = T1p - T1q;
+						       T3f = T1p + T1q;
+						       Tv = ci[WS(rs, 2)];
+						       Tx = cr[WS(rs, 12)];
+						       T3g = FMA(KP618033988, T3f, T3e);
+						       T68 = FNMS(KP618033988, T3e, T3f);
+						       T3j = T1o - T1r;
+						       T1s = T1o + T1r;
+						       Tw = Tu + Tv;
+						       T3l = Tu - Tv;
+						       T1t = T1l + T1s;
+						       T3i = FMS(KP250000000, T1s, T1l);
+						       Ty = ci[WS(rs, 7)];
+						  }
+					     }
+					}
+				   }
+				   {
+					E T3n, T65, T3c, T3b, T2P, T2M, T4W;
+					{
+					     E TA, T3m, Tz, TB, Ts;
+					     T3m = Ty - Tx;
+					     Tz = Tx + Ty;
+					     T1S = T1a + T1j;
+					     T1k = T1a - T1j;
+					     T3n = FNMS(KP618033988, T3m, T3l);
+					     T65 = FMA(KP618033988, T3l, T3m);
+					     TA = Tw + Tz;
+					     T3c = Tz - Tw;
+					     T3b = FNMS(KP250000000, TA, Tt);
+					     TB = Tt + TA;
+					     T1T = T1t + T1C;
+					     T1D = T1t - T1C;
+					     T1Y = Ti - Tr;
+					     Ts = Ti + Tr;
+					     {
+						  E T2I, T6j, T6m, TL;
+						  T2I = FMA(KP559016994, T2H, T2G);
+						  T6j = FNMS(KP559016994, T2H, T2G);
+						  T6m = FNMS(KP559016994, T2O, T2N);
+						  T2P = FMA(KP559016994, T2O, T2N);
+						  TL = TB + TK;
+						  T1Z = TB - TK;
+						  {
+						       E T6l, T7y, T6o, T7z;
+						       T6l = FMA(KP951056516, T6k, T6j);
+						       T7y = FNMS(KP951056516, T6k, T6j);
+						       T6o = FMA(KP951056516, T6n, T6m);
+						       T7z = FNMS(KP951056516, T6n, T6m);
+						       T10 = Ts - TL;
+						       TM = Ts + TL;
+						       T2M = FNMS(KP951056516, T2L, T2I);
+						       T4W = FMA(KP951056516, T2L, T2I);
+						       T7K = FMA(KP939062505, T7y, T7z);
+						       T7A = FNMS(KP939062505, T7z, T7y);
+						       T6p = FNMS(KP549754652, T6o, T6l);
+						       T6w = FMA(KP549754652, T6l, T6o);
+						  }
+					     }
+					}
+					{
+					     E T34, T31, T4Y, T60, T3s, T3z, T5X;
+					     {
+						  E T2X, T6c, T6f, T4V, T2T;
+						  T2X = FNMS(KP559016994, T2W, T2V);
+						  T6c = FMA(KP559016994, T2W, T2V);
+						  T6f = FMA(KP559016994, T33, T32);
+						  T34 = FNMS(KP559016994, T33, T32);
+						  T4V = FNMS(KP951056516, T2S, T2P);
+						  T2T = FMA(KP951056516, T2S, T2P);
+						  {
+						       E T7w, T6e, T7v, T6h;
+						       T7w = FMA(KP951056516, T6d, T6c);
+						       T6e = FNMS(KP951056516, T6d, T6c);
+						       T7v = FMA(KP951056516, T6g, T6f);
+						       T6h = FNMS(KP951056516, T6g, T6f);
+						       T4X = FNMS(KP634619297, T4W, T4V);
+						       T56 = FMA(KP634619297, T4V, T4W);
+						       T3K = FMA(KP256756360, T2M, T2T);
+						       T2U = FNMS(KP256756360, T2T, T2M);
+						       T7x = FMA(KP126329378, T7w, T7v);
+						       T7J = FNMS(KP126329378, T7v, T7w);
+						       T6v = FNMS(KP470564281, T6e, T6h);
+						       T6i = FMA(KP470564281, T6h, T6e);
+						       T31 = FMA(KP951056516, T30, T2X);
+						       T4Y = FNMS(KP951056516, T30, T2X);
+						  }
+						  T60 = FMA(KP559016994, T3r, T3q);
+						  T3s = FNMS(KP559016994, T3r, T3q);
+						  T3z = FNMS(KP559016994, T3y, T3x);
+						  T5X = FMA(KP559016994, T3y, T3x);
+					     }
+					     {
+						  E T5Z, T7r, T4Z, T38;
+						  T4Z = FNMS(KP951056516, T37, T34);
+						  T38 = FMA(KP951056516, T37, T34);
+						  {
+						       E T4O, T3w, T4P, T3D;
+						       T4O = FMA(KP951056516, T3v, T3s);
+						       T3w = FNMS(KP951056516, T3v, T3s);
+						       T4P = FMA(KP951056516, T3C, T3z);
+						       T3D = FNMS(KP951056516, T3C, T3z);
+						       T50 = FNMS(KP827271945, T4Z, T4Y);
+						       T57 = FMA(KP827271945, T4Y, T4Z);
+						       T3L = FMA(KP634619297, T31, T38);
+						       T39 = FNMS(KP634619297, T38, T31);
+						       T4Q = FNMS(KP126329378, T4P, T4O);
+						       T59 = FMA(KP126329378, T4O, T4P);
+						       T3O = FNMS(KP939062505, T3w, T3D);
+						       T3E = FMA(KP939062505, T3D, T3w);
+						       T5Z = FMA(KP951056516, T5Y, T5X);
+						       T7r = FNMS(KP951056516, T5Y, T5X);
+						  }
+						  {
+						       E T3d, T3k, T64, T7s, T62;
+						       T67 = FMA(KP559016994, T3c, T3b);
+						       T3d = FNMS(KP559016994, T3c, T3b);
+						       T3k = FNMS(KP559016994, T3j, T3i);
+						       T64 = FMA(KP559016994, T3j, T3i);
+						       T7s = FNMS(KP951056516, T61, T60);
+						       T62 = FMA(KP951056516, T61, T60);
+						       {
+							    E T4S, T3h, T4R, T3o;
+							    T4S = FMA(KP951056516, T3g, T3d);
+							    T3h = FNMS(KP951056516, T3g, T3d);
+							    T4R = FMA(KP951056516, T3n, T3k);
+							    T3o = FNMS(KP951056516, T3n, T3k);
+							    T7t = FNMS(KP827271945, T7s, T7r);
+							    T7H = FMA(KP827271945, T7r, T7s);
+							    T6y = FNMS(KP062914667, T5Z, T62);
+							    T63 = FMA(KP062914667, T62, T5Z);
+							    T4T = FNMS(KP470564281, T4S, T4R);
+							    T5a = FMA(KP470564281, T4R, T4S);
+							    T3N = FNMS(KP549754652, T3h, T3o);
+							    T3p = FMA(KP549754652, T3o, T3h);
+							    T66 = FNMS(KP951056516, T65, T64);
+							    T7o = FMA(KP951056516, T65, T64);
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7q, T7G, T6J, T6I, T6q, T6b, T6B, T73, T6Q, T78, T6z, T6a;
+			      cr[0] = T9 + TM;
+			      {
+				   E T1U, T2l, T1X, T2g, T1E, TZ, T2m, T20, T2v, T2n;
+				   {
+					E T1W, T7p, T69, T1V;
+					T1W = T1S - T1T;
+					T1U = T1S + T1T;
+					T7p = FNMS(KP951056516, T68, T67);
+					T69 = FMA(KP951056516, T68, T67);
+					T1V = FNMS(KP250000000, T1U, T1R);
+					T7q = FMA(KP062914667, T7p, T7o);
+					T7G = FNMS(KP062914667, T7o, T7p);
+					T6z = FNMS(KP634619297, T66, T69);
+					T6a = FMA(KP634619297, T69, T66);
+					T2l = FNMS(KP559016994, T1W, T1V);
+					T1X = FMA(KP559016994, T1W, T1V);
+					T2g = FNMS(KP618033988, T1k, T1D);
+					T1E = FMA(KP618033988, T1D, T1k);
+					TZ = FNMS(KP250000000, TM, T9);
+					T2m = FNMS(KP618033988, T1Y, T1Z);
+					T20 = FMA(KP618033988, T1Z, T1Y);
+				   }
+				   ci[0] = T1R + T1U;
+				   T2v = FMA(KP951056516, T2m, T2l);
+				   T2n = FNMS(KP951056516, T2m, T2l);
+				   {
+					E T2b, T21, T2f, T11;
+					T2b = FNMS(KP951056516, T20, T1X);
+					T21 = FMA(KP951056516, T20, T1X);
+					T2f = FNMS(KP559016994, T10, TZ);
+					T11 = FMA(KP559016994, T10, TZ);
+					{
+					     E T2h, T2r, T27, T1F;
+					     T2h = FMA(KP951056516, T2g, T2f);
+					     T2r = FNMS(KP951056516, T2g, T2f);
+					     T27 = FMA(KP951056516, T1E, T11);
+					     T1F = FNMS(KP951056516, T1E, T11);
+					     {
+						  E T2o, T2i, T2w, T2s;
+						  T2o = T2k * T2h;
+						  T2i = T2e * T2h;
+						  T2w = T2u * T2r;
+						  T2s = T2q * T2r;
+						  {
+						       E T2c, T28, T22, T1G;
+						       T2c = T2a * T27;
+						       T28 = T26 * T27;
+						       T22 = T1I * T1F;
+						       T1G = TY * T1F;
+						       ci[WS(rs, 15)] = FMA(T2q, T2v, T2w);
+						       cr[WS(rs, 15)] = FNMS(T2u, T2v, T2s);
+						       ci[WS(rs, 20)] = FMA(T26, T2b, T2c);
+						       cr[WS(rs, 20)] = FNMS(T2a, T2b, T28);
+						       ci[WS(rs, 5)] = FMA(TY, T21, T22);
+						       cr[WS(rs, 5)] = FNMS(T1I, T21, T1G);
+						       cr[WS(rs, 10)] = FNMS(T2k, T2n, T2i);
+						       ci[WS(rs, 10)] = FMA(T2e, T2n, T2o);
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T6x, T6A, T6O, T6P;
+				   T6x = FMA(KP968479752, T6w, T6v);
+				   T6J = FNMS(KP968479752, T6w, T6v);
+				   T6I = FMA(KP845997307, T6z, T6y);
+				   T6A = FNMS(KP845997307, T6z, T6y);
+				   T6O = FNMS(KP968479752, T6p, T6i);
+				   T6q = FMA(KP968479752, T6p, T6i);
+				   T6b = FMA(KP845997307, T6a, T63);
+				   T6P = FNMS(KP845997307, T6a, T63);
+				   T6B = FNMS(KP681693190, T6A, T6x);
+				   T73 = FMA(KP560319534, T6x, T6A);
+				   T6Q = FMA(KP681693190, T6P, T6O);
+				   T78 = FNMS(KP560319534, T6O, T6P);
+			      }
+			      {
+				   E T7U, T8f, T7B, T7u, T82, T8k, T7Y, T7M;
+				   {
+					E T7L, T7I, T80, T81;
+					{
+					     E T7S, T6r, T6t, T6K, T6M, T7T, T6s, T7j;
+					     T7S = FNMS(KP734762448, T7K, T7J);
+					     T7L = FMA(KP734762448, T7K, T7J);
+					     T6r = FMA(KP906616052, T6q, T6b);
+					     T6t = FNMS(KP906616052, T6q, T6b);
+					     T6K = FNMS(KP906616052, T6J, T6I);
+					     T6M = FMA(KP906616052, T6J, T6I);
+					     T7I = FMA(KP772036680, T7H, T7G);
+					     T7T = FNMS(KP772036680, T7H, T7G);
+					     T6s = FNMS(KP249506682, T6r, T5W);
+					     T7j = FMA(KP998026728, T6r, T5W);
+					     {
+						  E T6L, T7l, T72, T6u;
+						  T6L = FNMS(KP249506682, T6K, T6H);
+						  T7l = FMA(KP998026728, T6K, T6H);
+						  T72 = FMA(KP557913902, T6t, T6s);
+						  T6u = FNMS(KP557913902, T6t, T6s);
+						  {
+						       E T7k, T6N, T77, T7m;
+						       T7k = T4l * T7j;
+						       T6N = FNMS(KP557913902, T6M, T6L);
+						       T77 = FMA(KP557913902, T6M, T6L);
+						       T7m = T4l * T7l;
+						       {
+							    E T74, T7d, T6V, T6C;
+							    T74 = FNMS(KP949179823, T73, T72);
+							    T7d = FMA(KP949179823, T73, T72);
+							    T6V = FMA(KP860541664, T6B, T6u);
+							    T6C = FNMS(KP860541664, T6B, T6u);
+							    cr[WS(rs, 2)] = FNMS(T4n, T7l, T7k);
+							    {
+								 E T7h, T79, T6R, T6Z;
+								 T7h = FNMS(KP949179823, T78, T77);
+								 T79 = FMA(KP949179823, T78, T77);
+								 T6R = FNMS(KP860541664, T6Q, T6N);
+								 T6Z = FMA(KP860541664, T6Q, T6N);
+								 ci[WS(rs, 2)] = FMA(T4n, T7j, T7m);
+								 {
+								      E T75, T7e, T6W, T6D;
+								      T75 = T71 * T74;
+								      T7e = T7c * T7d;
+								      T6W = T6U * T6V;
+								      T6D = T5T * T6C;
+								      {
+									   E T7a, T7i, T70, T6S;
+									   T7a = T71 * T79;
+									   T7i = T7c * T7h;
+									   T70 = T6U * T6Z;
+									   T6S = T5T * T6R;
+									   cr[WS(rs, 12)] = FNMS(T76, T79, T75);
+									   cr[WS(rs, 17)] = FNMS(T7g, T7h, T7e);
+									   cr[WS(rs, 22)] = FNMS(T6Y, T6Z, T6W);
+									   cr[WS(rs, 7)] = FNMS(T6E, T6R, T6D);
+									   ci[WS(rs, 12)] = FMA(T76, T74, T7a);
+									   ci[WS(rs, 17)] = FMA(T7g, T7d, T7i);
+									   ci[WS(rs, 22)] = FMA(T6Y, T6V, T70);
+									   ci[WS(rs, 7)] = FMA(T6E, T6C, T6S);
+									   T7U = FNMS(KP621716863, T7T, T7S);
+									   T8f = FMA(KP614372930, T7S, T7T);
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					T80 = FNMS(KP734762448, T7A, T7x);
+					T7B = FMA(KP734762448, T7A, T7x);
+					T7u = FMA(KP772036680, T7t, T7q);
+					T81 = FNMS(KP772036680, T7t, T7q);
+					T82 = FNMS(KP621716863, T81, T80);
+					T8k = FMA(KP614372930, T80, T81);
+					T7Y = FNMS(KP994076283, T7L, T7I);
+					T7M = FMA(KP994076283, T7L, T7I);
+				   }
+				   {
+					E T5y, T5c, T51, T4U, T5f, T5E, T5o, T5i, T5k;
+					{
+					     E T5h, T5g, T5m, T5n, T58, T5b;
+					     T5h = FMA(KP912575812, T57, T56);
+					     T58 = FNMS(KP912575812, T57, T56);
+					     T5b = FNMS(KP912018591, T5a, T59);
+					     T5g = FMA(KP912018591, T5a, T59);
+					     {
+						  E T7X, T7N, T7C, T7Q;
+						  T7X = FNMS(KP249506682, T7M, T7F);
+						  T7N = FMA(KP998026728, T7M, T7F);
+						  T7C = FMA(KP994076283, T7B, T7u);
+						  T7Q = FNMS(KP994076283, T7B, T7u);
+						  T5y = FMA(KP525970792, T58, T5b);
+						  T5c = FNMS(KP726211448, T5b, T58);
+						  {
+						       E T7Z, T8j, T7P, T7D;
+						       T7Z = FNMS(KP557913902, T7Y, T7X);
+						       T8j = FMA(KP557913902, T7Y, T7X);
+						       T7P = FNMS(KP249506682, T7C, T7n);
+						       T7D = FMA(KP998026728, T7C, T7n);
+						       {
+							    E T8b, T83, T8t, T8l;
+							    T8b = FMA(KP943557151, T82, T7Z);
+							    T83 = FNMS(KP943557151, T82, T7Z);
+							    T8t = FMA(KP949179823, T8k, T8j);
+							    T8l = FNMS(KP949179823, T8k, T8j);
+							    {
+								 E T8e, T7R, T7O, T7E;
+								 T8e = FMA(KP557913902, T7Q, T7P);
+								 T7R = FNMS(KP557913902, T7Q, T7P);
+								 T7O = TR * T7D;
+								 T7E = TO * T7D;
+								 {
+								      E T8g, T8p, T7V, T87;
+								      T8g = FMA(KP949179823, T8f, T8e);
+								      T8p = FNMS(KP949179823, T8f, T8e);
+								      T7V = FMA(KP943557151, T7U, T7R);
+								      T87 = FNMS(KP943557151, T7U, T7R);
+								      ci[WS(rs, 3)] = FMA(TO, T7N, T7O);
+								      cr[WS(rs, 3)] = FNMS(TR, T7N, T7E);
+								      {
+									   E T8m, T8h, T8u, T8q;
+									   T8m = T8i * T8g;
+									   T8h = T8d * T8g;
+									   T8u = T8s * T8p;
+									   T8q = T8o * T8p;
+									   {
+										E T84, T7W, T8c, T88;
+										T84 = T4B * T7V;
+										T7W = T4z * T7V;
+										T8c = T8a * T87;
+										T88 = T86 * T87;
+										ci[WS(rs, 13)] = FMA(T8d, T8l, T8m);
+										cr[WS(rs, 13)] = FNMS(T8i, T8l, T8h);
+										ci[WS(rs, 18)] = FMA(T8o, T8t, T8u);
+										cr[WS(rs, 18)] = FNMS(T8s, T8t, T8q);
+										ci[WS(rs, 8)] = FMA(T4z, T83, T84);
+										cr[WS(rs, 8)] = FNMS(T4B, T83, T7W);
+										ci[WS(rs, 23)] = FMA(T86, T8b, T8c);
+										cr[WS(rs, 23)] = FNMS(T8a, T8b, T88);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T51 = FMA(KP912575812, T50, T4X);
+					     T5m = FNMS(KP912575812, T50, T4X);
+					     T5n = FMA(KP912018591, T4T, T4Q);
+					     T4U = FNMS(KP912018591, T4T, T4Q);
+					     T41 = FMA(KP951056516, T40, T3X);
+					     T5f = FNMS(KP951056516, T40, T3X);
+					     T5E = FMA(KP525970792, T5m, T5n);
+					     T5o = FNMS(KP726211448, T5n, T5m);
+					     T5i = FMA(KP851038619, T5h, T5g);
+					     T5k = FNMS(KP851038619, T5h, T5g);
+					}
+					{
+					     E T42, T43, T48, T49, T3M, T3P;
+					     T3M = FMA(KP871714437, T3L, T3K);
+					     T42 = FNMS(KP871714437, T3L, T3K);
+					     T43 = FMA(KP831864738, T3O, T3N);
+					     T3P = FNMS(KP831864738, T3O, T3N);
+					     {
+						  E T5R, T5j, T54, T52;
+						  T5R = FMA(KP992114701, T5i, T5f);
+						  T5j = FNMS(KP248028675, T5i, T5f);
+						  T54 = FNMS(KP851038619, T51, T4U);
+						  T52 = FMA(KP851038619, T51, T4U);
+						  T3Q = FNMS(KP559154169, T3P, T3M);
+						  T4q = FMA(KP683113946, T3M, T3P);
+						  {
+						       E T5D, T5l, T5P, T53;
+						       T5D = FMA(KP554608978, T5k, T5j);
+						       T5l = FNMS(KP554608978, T5k, T5j);
+						       T5P = FNMS(KP992114701, T52, T4N);
+						       T53 = FMA(KP248028675, T52, T4N);
+						       {
+							    E T5p, T5t, T5F, T5N;
+							    T5p = FNMS(KP803003575, T5o, T5l);
+							    T5t = FMA(KP803003575, T5o, T5l);
+							    T5F = FNMS(KP943557151, T5E, T5D);
+							    T5N = FMA(KP943557151, T5E, T5D);
+							    {
+								 E T55, T5x, T5S, T5Q;
+								 T55 = FMA(KP554608978, T54, T53);
+								 T5x = FNMS(KP554608978, T54, T53);
+								 T5S = TW * T5P;
+								 T5Q = TS * T5P;
+								 {
+								      E T5J, T5z, T5r, T5d;
+								      T5J = FMA(KP943557151, T5y, T5x);
+								      T5z = FNMS(KP943557151, T5y, T5x);
+								      T5r = FMA(KP803003575, T5c, T55);
+								      T5d = FNMS(KP803003575, T5c, T55);
+								      ci[WS(rs, 4)] = FMA(TS, T5R, T5S);
+								      cr[WS(rs, 4)] = FNMS(TW, T5R, T5Q);
+								      {
+									   E T5G, T5A, T5O, T5K;
+									   T5G = T5C * T5z;
+									   T5A = T5w * T5z;
+									   T5O = T5M * T5J;
+									   T5K = T5I * T5J;
+									   {
+										E T5q, T5e, T5u, T5s;
+										T5q = TX * T5d;
+										T5e = TT * T5d;
+										T5u = T25 * T5r;
+										T5s = T23 * T5r;
+										ci[WS(rs, 14)] = FMA(T5w, T5F, T5G);
+										cr[WS(rs, 14)] = FNMS(T5C, T5F, T5A);
+										ci[WS(rs, 19)] = FMA(T5I, T5N, T5O);
+										cr[WS(rs, 19)] = FNMS(T5M, T5N, T5K);
+										ci[WS(rs, 9)] = FMA(TT, T5p, T5q);
+										cr[WS(rs, 9)] = FNMS(TX, T5p, T5e);
+										ci[WS(rs, 24)] = FMA(T23, T5t, T5u);
+										cr[WS(rs, 24)] = FNMS(T25, T5t, T5s);
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T48 = FNMS(KP871714437, T39, T2U);
+					     T3a = FMA(KP871714437, T39, T2U);
+					     T3F = FMA(KP831864738, T3E, T3p);
+					     T49 = FNMS(KP831864738, T3E, T3p);
+					     T4a = FMA(KP559154169, T49, T48);
+					     T4w = FNMS(KP683113946, T48, T49);
+					     T46 = FMA(KP904730450, T43, T42);
+					     T44 = FNMS(KP904730450, T43, T42);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T45, T4L, T3G, T3I;
+		    T45 = FNMS(KP242145790, T44, T41);
+		    T4L = FMA(KP968583161, T44, T41);
+		    T3G = FMA(KP904730450, T3F, T3a);
+		    T3I = FNMS(KP904730450, T3F, T3a);
+		    {
+			 E T4v, T47, T4J, T3H;
+			 T4v = FNMS(KP541454447, T46, T45);
+			 T47 = FMA(KP541454447, T46, T45);
+			 T4J = FMA(KP968583161, T3G, T2F);
+			 T3H = FNMS(KP242145790, T3G, T2F);
+			 {
+			      E T4b, T4j, T4x, T4H;
+			      T4b = FMA(KP921177326, T4a, T47);
+			      T4j = FNMS(KP921177326, T4a, T47);
+			      T4x = FNMS(KP833417178, T4w, T4v);
+			      T4H = FMA(KP833417178, T4w, T4v);
+			      {
+				   E T3J, T4p, T4M, T4K;
+				   T3J = FMA(KP541454447, T3I, T3H);
+				   T4p = FNMS(KP541454447, T3I, T3H);
+				   T4M = TQ * T4J;
+				   T4K = TN * T4J;
+				   {
+					E T4D, T4r, T4f, T3R;
+					T4D = FMA(KP833417178, T4q, T4p);
+					T4r = FNMS(KP833417178, T4q, T4p);
+					T4f = FMA(KP921177326, T3Q, T3J);
+					T3R = FNMS(KP921177326, T3Q, T3J);
+					ci[WS(rs, 1)] = FMA(TN, T4L, T4M);
+					cr[WS(rs, 1)] = FNMS(TQ, T4L, T4K);
+					{
+					     E T4y, T4s, T4I, T4E;
+					     T4y = T4u * T4r;
+					     T4s = T4o * T4r;
+					     T4I = T4G * T4D;
+					     T4E = T4C * T4D;
+					     {
+						  E T4c, T3S, T4k, T4g;
+						  T4c = T3U * T3R;
+						  T3S = T2y * T3R;
+						  T4k = T4i * T4f;
+						  T4g = T4e * T4f;
+						  ci[WS(rs, 11)] = FMA(T4o, T4x, T4y);
+						  cr[WS(rs, 11)] = FNMS(T4u, T4x, T4s);
+						  ci[WS(rs, 16)] = FMA(T4C, T4H, T4I);
+						  cr[WS(rs, 16)] = FNMS(T4G, T4H, T4E);
+						  ci[WS(rs, 6)] = FMA(T2y, T4b, T4c);
+						  cr[WS(rs, 6)] = FNMS(T3U, T4b, T3S);
+						  ci[WS(rs, 21)] = FMA(T4e, T4j, T4k);
+						  cr[WS(rs, 21)] = FNMS(T4i, T4j, T4g);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hb2_25", twinstr, &GENUS, {84, 78, 356, 0} };
+
+void X(codelet_hb2_25) (planner *p) {
+     X(khc2hc_register) (p, hb2_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 25 -dif -name hb2_25 -include hb.h */
+
+/*
+ * This function contains 440 FP additions, 340 FP multiplications,
+ * (or, 280 additions, 180 multiplications, 160 fused multiply/add),
+ * 155 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E TN, TQ, TO, TR, TT, TY, T2t, T2r, TZ, TU, T4f, T4l, T2d, T4v, T5m;
+	       E T2j, T5l, T4X, T2v, T11, T3R, T1L, T5d, T6x, T5h, T6t, T25, T26, T27, T29;
+	       E T6D, T7v, T49, T7l, T7p, T7t, T2p, T2n, T4b, T4p, T5n, T6B, T5b, T5p, T6p;
+	       E T6r, T59, T4r;
+	       {
+		    E T2c, T4j, T2h, T4e, T2b, T4k, T2i, T4d;
+		    {
+			 E TP, TX, TS, TW;
+			 TN = W[0];
+			 TQ = W[1];
+			 TO = W[2];
+			 TR = W[3];
+			 TP = TN * TO;
+			 TX = TQ * TO;
+			 TS = TQ * TR;
+			 TW = TN * TR;
+			 TT = TP - TS;
+			 TY = TW + TX;
+			 T2t = TW - TX;
+			 T2r = TP + TS;
+			 TZ = W[5];
+			 T2c = TQ * TZ;
+			 T4j = TO * TZ;
+			 T2h = TN * TZ;
+			 T4e = TR * TZ;
+			 TU = W[4];
+			 T2b = TN * TU;
+			 T4k = TR * TU;
+			 T2i = TQ * TU;
+			 T4d = TO * TU;
+		    }
+		    T4f = T4d - T4e;
+		    T4l = T4j + T4k;
+		    {
+			 E T2s, T2u, TV, T10, T3P, T3Q, T1J, T1K;
+			 T2d = T2b - T2c;
+			 T4v = T2b + T2c;
+			 T5m = T4j - T4k;
+			 T2j = T2h + T2i;
+			 T5l = T4d + T4e;
+			 T4X = T2h - T2i;
+			 T2s = T2r * TU;
+			 T2u = T2t * TZ;
+			 T2v = T2s + T2u;
+			 TV = TT * TU;
+			 T10 = TY * TZ;
+			 T11 = TV + T10;
+			 T3P = T2r * TZ;
+			 T3Q = T2t * TU;
+			 T3R = T3P - T3Q;
+			 T1J = TT * TZ;
+			 T1K = TY * TU;
+			 T1L = T1J - T1K;
+			 T5d = TV - T10;
+			 T6x = T3P + T3Q;
+			 T5h = T1J + T1K;
+			 T6t = T2s - T2u;
+			 T25 = W[6];
+			 T26 = W[7];
+			 T27 = FMA(TT, T25, TY * T26);
+			 T29 = FNMS(TY, T25, TT * T26);
+			 T6D = FNMS(T4X, T25, T4v * T26);
+			 T7v = FNMS(T1L, T25, T11 * T26);
+			 T49 = FMA(T2r, T25, T2t * T26);
+			 T7l = FMA(T2d, T25, T2j * T26);
+			 T7p = FNMS(T2j, T25, T2d * T26);
+			 T7t = FMA(T11, T25, T1L * T26);
+			 T2p = FNMS(TZ, T25, TU * T26);
+			 T2n = FMA(TU, T25, TZ * T26);
+			 T4b = FNMS(T2t, T25, T2r * T26);
+			 T4p = FMA(T2v, T25, T3R * T26);
+			 T5n = FMA(T5l, T25, T5m * T26);
+			 T6B = FMA(T4v, T25, T4X * T26);
+			 T5b = FNMS(TQ, T25, TN * T26);
+			 T5p = FNMS(T5m, T25, T5l * T26);
+			 T6p = FMA(TO, T25, TR * T26);
+			 T6r = FNMS(TR, T25, TO * T26);
+			 T59 = FMA(TN, T25, TQ * T26);
+			 T4r = FNMS(T3R, T25, T2v * T26);
+		    }
+	       }
+	       {
+		    E T9, T6i, T40, T3z, T5Y, Ti, Tr, Ts, T1d, T1m, T1P, T2K, T4P, T3H, T4y;
+		    E T5G, T71, T65, T6N, T5z, T70, T64, T6K, T2Z, T4Q, T3I, T4B, T20, T5Z, T3C;
+		    E T43, T6j, TB, TK, TL, T1w, T1F, T1Q, T3f, T4S, T3K, T4F, T5V, T74, T68;
+		    E T6U, T5O, T73, T67, T6R, T3u, T4T, T3L, T4I;
+		    {
+			 E T1, T4, T7, T8, T3Z, T3Y, T3x, T3y;
+			 T1 = cr[0];
+			 {
+			      E T2, T3, T5, T6;
+			      T2 = cr[WS(rs, 5)];
+			      T3 = ci[WS(rs, 4)];
+			      T4 = T2 + T3;
+			      T5 = cr[WS(rs, 10)];
+			      T6 = ci[WS(rs, 9)];
+			      T7 = T5 + T6;
+			      T8 = T4 + T7;
+			      T3Z = T5 - T6;
+			      T3Y = T2 - T3;
+			 }
+			 T9 = T1 + T8;
+			 T6i = FMA(KP951056516, T3Y, KP587785252 * T3Z);
+			 T40 = FNMS(KP951056516, T3Z, KP587785252 * T3Y);
+			 T3x = FNMS(KP250000000, T8, T1);
+			 T3y = KP559016994 * (T4 - T7);
+			 T3z = T3x - T3y;
+			 T5Y = T3y + T3x;
+		    }
+		    {
+			 E Ta, T2x, T5w, T2F, Th, T2w, T1e, T2P, T5B, T2X, T1l, T2O, Tj, T2N, T5D;
+			 E T2T, Tq, T2S, T15, T2B, T5u, T2H, T1c, T2G;
+			 {
+			      E Tg, T2E, Td, T2D;
+			      Ta = cr[WS(rs, 1)];
+			      {
+				   E Te, Tf, Tb, Tc;
+				   Te = cr[WS(rs, 11)];
+				   Tf = ci[WS(rs, 8)];
+				   Tg = Te + Tf;
+				   T2E = Te - Tf;
+				   Tb = cr[WS(rs, 6)];
+				   Tc = ci[WS(rs, 3)];
+				   Td = Tb + Tc;
+				   T2D = Tb - Tc;
+			      }
+			      T2x = KP559016994 * (Td - Tg);
+			      T5w = FMA(KP951056516, T2D, KP587785252 * T2E);
+			      T2F = FNMS(KP951056516, T2E, KP587785252 * T2D);
+			      Th = Td + Tg;
+			      T2w = FNMS(KP250000000, Th, Ta);
+			 }
+			 {
+			      E T1k, T2W, T1h, T2V;
+			      T1e = ci[WS(rs, 20)];
+			      {
+				   E T1i, T1j, T1f, T1g;
+				   T1i = cr[WS(rs, 14)];
+				   T1j = cr[WS(rs, 19)];
+				   T1k = T1i + T1j;
+				   T2W = T1j - T1i;
+				   T1f = ci[WS(rs, 15)];
+				   T1g = cr[WS(rs, 24)];
+				   T1h = T1f - T1g;
+				   T2V = T1f + T1g;
+			      }
+			      T2P = KP559016994 * (T1h + T1k);
+			      T5B = FMA(KP951056516, T2V, KP587785252 * T2W);
+			      T2X = FNMS(KP951056516, T2W, KP587785252 * T2V);
+			      T1l = T1h - T1k;
+			      T2O = FNMS(KP250000000, T1l, T1e);
+			 }
+			 {
+			      E Tp, T2M, Tm, T2L;
+			      Tj = cr[WS(rs, 4)];
+			      {
+				   E Tn, To, Tk, Tl;
+				   Tn = ci[WS(rs, 10)];
+				   To = ci[WS(rs, 5)];
+				   Tp = Tn + To;
+				   T2M = Tn - To;
+				   Tk = cr[WS(rs, 9)];
+				   Tl = ci[0];
+				   Tm = Tk + Tl;
+				   T2L = Tk - Tl;
+			      }
+			      T2N = FNMS(KP951056516, T2M, KP587785252 * T2L);
+			      T5D = FMA(KP951056516, T2L, KP587785252 * T2M);
+			      T2T = KP559016994 * (Tm - Tp);
+			      Tq = Tm + Tp;
+			      T2S = FNMS(KP250000000, Tq, Tj);
+			 }
+			 {
+			      E T1b, T2A, T18, T2z;
+			      T15 = ci[WS(rs, 23)];
+			      {
+				   E T19, T1a, T16, T17;
+				   T19 = ci[WS(rs, 13)];
+				   T1a = cr[WS(rs, 16)];
+				   T1b = T19 - T1a;
+				   T2A = T19 + T1a;
+				   T16 = ci[WS(rs, 18)];
+				   T17 = cr[WS(rs, 21)];
+				   T18 = T16 - T17;
+				   T2z = T16 + T17;
+			      }
+			      T2B = FNMS(KP951056516, T2A, KP587785252 * T2z);
+			      T5u = FMA(KP951056516, T2z, KP587785252 * T2A);
+			      T2H = KP559016994 * (T18 - T1b);
+			      T1c = T18 + T1b;
+			      T2G = FNMS(KP250000000, T1c, T15);
+			 }
+			 Ti = Ta + Th;
+			 Tr = Tj + Tq;
+			 Ts = Ti + Tr;
+			 T1d = T15 + T1c;
+			 T1m = T1e + T1l;
+			 T1P = T1d + T1m;
+			 {
+			      E T2C, T4w, T2J, T4x, T2y, T2I;
+			      T2y = T2w - T2x;
+			      T2C = T2y - T2B;
+			      T4w = T2y + T2B;
+			      T2I = T2G - T2H;
+			      T2J = T2F + T2I;
+			      T4x = T2I - T2F;
+			      T2K = FNMS(KP481753674, T2J, KP876306680 * T2C);
+			      T4P = FMA(KP728968627, T4x, KP684547105 * T4w);
+			      T3H = FMA(KP876306680, T2J, KP481753674 * T2C);
+			      T4y = FNMS(KP684547105, T4x, KP728968627 * T4w);
+			 }
+			 {
+			      E T5C, T6M, T5F, T6L, T5A, T5E;
+			      T5A = T2T + T2S;
+			      T5C = T5A - T5B;
+			      T6M = T5A + T5B;
+			      T5E = T2O + T2P;
+			      T5F = T5D + T5E;
+			      T6L = T5E - T5D;
+			      T5G = FNMS(KP844327925, T5F, KP535826794 * T5C);
+			      T71 = FMA(KP637423989, T6L, KP770513242 * T6M);
+			      T65 = FMA(KP535826794, T5F, KP844327925 * T5C);
+			      T6N = FNMS(KP637423989, T6M, KP770513242 * T6L);
+			 }
+			 {
+			      E T5v, T6I, T5y, T6J, T5t, T5x;
+			      T5t = T2x + T2w;
+			      T5v = T5t - T5u;
+			      T6I = T5t + T5u;
+			      T5x = T2H + T2G;
+			      T5y = T5w + T5x;
+			      T6J = T5x - T5w;
+			      T5z = FNMS(KP248689887, T5y, KP968583161 * T5v);
+			      T70 = FMA(KP535826794, T6J, KP844327925 * T6I);
+			      T64 = FMA(KP968583161, T5y, KP248689887 * T5v);
+			      T6K = FNMS(KP844327925, T6J, KP535826794 * T6I);
+			 }
+			 {
+			      E T2R, T4z, T2Y, T4A, T2Q, T2U;
+			      T2Q = T2O - T2P;
+			      T2R = T2N + T2Q;
+			      T4z = T2Q - T2N;
+			      T2U = T2S - T2T;
+			      T2Y = T2U - T2X;
+			      T4A = T2U + T2X;
+			      T2Z = FMA(KP904827052, T2R, KP425779291 * T2Y);
+			      T4Q = FNMS(KP992114701, T4z, KP125333233 * T4A);
+			      T3I = FNMS(KP425779291, T2R, KP904827052 * T2Y);
+			      T4B = FMA(KP125333233, T4z, KP992114701 * T4A);
+			 }
+		    }
+		    {
+			 E T1S, T1V, T1Y, T1Z, T3B, T3A, T41, T42;
+			 T1S = ci[WS(rs, 24)];
+			 {
+			      E T1T, T1U, T1W, T1X;
+			      T1T = ci[WS(rs, 19)];
+			      T1U = cr[WS(rs, 20)];
+			      T1V = T1T - T1U;
+			      T1W = ci[WS(rs, 14)];
+			      T1X = cr[WS(rs, 15)];
+			      T1Y = T1W - T1X;
+			      T1Z = T1V + T1Y;
+			      T3B = T1W + T1X;
+			      T3A = T1T + T1U;
+			 }
+			 T20 = T1S + T1Z;
+			 T5Z = FMA(KP951056516, T3A, KP587785252 * T3B);
+			 T3C = FNMS(KP951056516, T3B, KP587785252 * T3A);
+			 T41 = FNMS(KP250000000, T1Z, T1S);
+			 T42 = KP559016994 * (T1V - T1Y);
+			 T43 = T41 - T42;
+			 T6j = T42 + T41;
+		    }
+		    {
+			 E Tt, T32, T5L, T3a, TA, T31, T1o, T36, T5J, T3c, T1v, T3b, TC, T3h, T5S;
+			 E T3p, TJ, T3g, T1x, T3l, T5Q, T3r, T1E, T3q;
+			 {
+			      E Tw, T38, Tz, T39;
+			      Tt = cr[WS(rs, 2)];
+			      {
+				   E Tu, Tv, Tx, Ty;
+				   Tu = cr[WS(rs, 7)];
+				   Tv = ci[WS(rs, 2)];
+				   Tw = Tu + Tv;
+				   T38 = Tu - Tv;
+				   Tx = cr[WS(rs, 12)];
+				   Ty = ci[WS(rs, 7)];
+				   Tz = Tx + Ty;
+				   T39 = Tx - Ty;
+			      }
+			      T32 = KP559016994 * (Tw - Tz);
+			      T5L = FMA(KP951056516, T38, KP587785252 * T39);
+			      T3a = FNMS(KP951056516, T39, KP587785252 * T38);
+			      TA = Tw + Tz;
+			      T31 = FNMS(KP250000000, TA, Tt);
+			 }
+			 {
+			      E T1r, T34, T1u, T35;
+			      T1o = ci[WS(rs, 22)];
+			      {
+				   E T1p, T1q, T1s, T1t;
+				   T1p = ci[WS(rs, 17)];
+				   T1q = cr[WS(rs, 22)];
+				   T1r = T1p - T1q;
+				   T34 = T1p + T1q;
+				   T1s = ci[WS(rs, 12)];
+				   T1t = cr[WS(rs, 17)];
+				   T1u = T1s - T1t;
+				   T35 = T1s + T1t;
+			      }
+			      T36 = FNMS(KP951056516, T35, KP587785252 * T34);
+			      T5J = FMA(KP951056516, T34, KP587785252 * T35);
+			      T3c = KP559016994 * (T1r - T1u);
+			      T1v = T1r + T1u;
+			      T3b = FNMS(KP250000000, T1v, T1o);
+			 }
+			 {
+			      E TI, T3o, TF, T3n;
+			      TC = cr[WS(rs, 3)];
+			      {
+				   E TG, TH, TD, TE;
+				   TG = ci[WS(rs, 11)];
+				   TH = ci[WS(rs, 6)];
+				   TI = TG + TH;
+				   T3o = TG - TH;
+				   TD = cr[WS(rs, 8)];
+				   TE = ci[WS(rs, 1)];
+				   TF = TD + TE;
+				   T3n = TD - TE;
+			      }
+			      T3h = KP559016994 * (TF - TI);
+			      T5S = FMA(KP951056516, T3n, KP587785252 * T3o);
+			      T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
+			      TJ = TF + TI;
+			      T3g = FNMS(KP250000000, TJ, TC);
+			 }
+			 {
+			      E T1D, T3k, T1A, T3j;
+			      T1x = ci[WS(rs, 21)];
+			      {
+				   E T1B, T1C, T1y, T1z;
+				   T1B = cr[WS(rs, 13)];
+				   T1C = cr[WS(rs, 18)];
+				   T1D = T1B + T1C;
+				   T3k = T1C - T1B;
+				   T1y = ci[WS(rs, 16)];
+				   T1z = cr[WS(rs, 23)];
+				   T1A = T1y - T1z;
+				   T3j = T1y + T1z;
+			      }
+			      T3l = FNMS(KP951056516, T3k, KP587785252 * T3j);
+			      T5Q = FMA(KP951056516, T3j, KP587785252 * T3k);
+			      T3r = KP559016994 * (T1A + T1D);
+			      T1E = T1A - T1D;
+			      T3q = FNMS(KP250000000, T1E, T1x);
+			 }
+			 TB = Tt + TA;
+			 TK = TC + TJ;
+			 TL = TB + TK;
+			 T1w = T1o + T1v;
+			 T1F = T1x + T1E;
+			 T1Q = T1w + T1F;
+			 {
+			      E T37, T4D, T3e, T4E, T33, T3d;
+			      T33 = T31 - T32;
+			      T37 = T33 - T36;
+			      T4D = T33 + T36;
+			      T3d = T3b - T3c;
+			      T3e = T3a + T3d;
+			      T4E = T3d - T3a;
+			      T3f = FNMS(KP844327925, T3e, KP535826794 * T37);
+			      T4S = FMA(KP062790519, T4E, KP998026728 * T4D);
+			      T3K = FMA(KP535826794, T3e, KP844327925 * T37);
+			      T4F = FNMS(KP998026728, T4E, KP062790519 * T4D);
+			 }
+			 {
+			      E T5R, T6T, T5U, T6S, T5P, T5T;
+			      T5P = T3h + T3g;
+			      T5R = T5P - T5Q;
+			      T6T = T5P + T5Q;
+			      T5T = T3q + T3r;
+			      T5U = T5S + T5T;
+			      T6S = T5T - T5S;
+			      T5V = FNMS(KP684547105, T5U, KP728968627 * T5R);
+			      T74 = FNMS(KP992114701, T6S, KP125333233 * T6T);
+			      T68 = FMA(KP728968627, T5U, KP684547105 * T5R);
+			      T6U = FMA(KP125333233, T6S, KP992114701 * T6T);
+			 }
+			 {
+			      E T5K, T6Q, T5N, T6P, T5I, T5M;
+			      T5I = T32 + T31;
+			      T5K = T5I - T5J;
+			      T6Q = T5I + T5J;
+			      T5M = T3c + T3b;
+			      T5N = T5L + T5M;
+			      T6P = T5M - T5L;
+			      T5O = FNMS(KP481753674, T5N, KP876306680 * T5K);
+			      T73 = FNMS(KP425779291, T6P, KP904827052 * T6Q);
+			      T67 = FMA(KP876306680, T5N, KP481753674 * T5K);
+			      T6R = FMA(KP904827052, T6P, KP425779291 * T6Q);
+			 }
+			 {
+			      E T3m, T4H, T3t, T4G, T3i, T3s;
+			      T3i = T3g - T3h;
+			      T3m = T3i - T3l;
+			      T4H = T3i + T3l;
+			      T3s = T3q - T3r;
+			      T3t = T3p + T3s;
+			      T4G = T3s - T3p;
+			      T3u = FNMS(KP998026728, T3t, KP062790519 * T3m);
+			      T4T = FNMS(KP637423989, T4G, KP770513242 * T4H);
+			      T3L = FMA(KP062790519, T3t, KP998026728 * T3m);
+			      T4I = FMA(KP770513242, T4G, KP637423989 * T4H);
+			 }
+		    }
+		    {
+			 E TM, T14, T2e, T21, T23, T2l, T1H, T2f, T1O, T2k;
+			 {
+			      E T12, T13, T1R, T22;
+			      T12 = KP559016994 * (Ts - TL);
+			      TM = Ts + TL;
+			      T13 = FNMS(KP250000000, TM, T9);
+			      T14 = T12 + T13;
+			      T2e = T13 - T12;
+			      T1R = KP559016994 * (T1P - T1Q);
+			      T21 = T1P + T1Q;
+			      T22 = FNMS(KP250000000, T21, T20);
+			      T23 = T1R + T22;
+			      T2l = T22 - T1R;
+			 }
+			 {
+			      E T1n, T1G, T1M, T1N;
+			      T1n = T1d - T1m;
+			      T1G = T1w - T1F;
+			      T1H = FMA(KP951056516, T1n, KP587785252 * T1G);
+			      T2f = FNMS(KP951056516, T1G, KP587785252 * T1n);
+			      T1M = Ti - Tr;
+			      T1N = TB - TK;
+			      T1O = FMA(KP951056516, T1M, KP587785252 * T1N);
+			      T2k = FNMS(KP951056516, T1N, KP587785252 * T1M);
+			 }
+			 {
+			      E T1I, T24, T2o, T2q;
+			      cr[0] = T9 + TM;
+			      ci[0] = T20 + T21;
+			      T1I = T14 - T1H;
+			      T24 = T1O + T23;
+			      cr[WS(rs, 5)] = FNMS(T1L, T24, T11 * T1I);
+			      ci[WS(rs, 5)] = FMA(T1L, T1I, T11 * T24);
+			      T2o = T2e + T2f;
+			      T2q = T2l - T2k;
+			      cr[WS(rs, 15)] = FNMS(T2p, T2q, T2n * T2o);
+			      ci[WS(rs, 15)] = FMA(T2p, T2o, T2n * T2q);
+			      {
+				   E T2g, T2m, T28, T2a;
+				   T2g = T2e - T2f;
+				   T2m = T2k + T2l;
+				   cr[WS(rs, 10)] = FNMS(T2j, T2m, T2d * T2g);
+				   ci[WS(rs, 10)] = FMA(T2j, T2g, T2d * T2m);
+				   T28 = T14 + T1H;
+				   T2a = T23 - T1O;
+				   cr[WS(rs, 20)] = FNMS(T29, T2a, T27 * T28);
+				   ci[WS(rs, 20)] = FMA(T29, T28, T27 * T2a);
+			      }
+			 }
+		    }
+		    {
+			 E T76, T7n, T7a, T7q, T6H, T6W, T6X, T6Y, T7e, T7f, T7d, T7g, T7x, T7y;
+			 {
+			      E T72, T75, T78, T79;
+			      T72 = T70 + T71;
+			      T75 = T73 - T74;
+			      T76 = FMA(KP951056516, T72, KP587785252 * T75);
+			      T7n = FNMS(KP951056516, T75, KP587785252 * T72);
+			      T78 = T6K - T6N;
+			      T79 = T6U - T6R;
+			      T7a = FMA(KP951056516, T78, KP587785252 * T79);
+			      T7q = FNMS(KP951056516, T79, KP587785252 * T78);
+			 }
+			 {
+			      E T6O, T6V, T7b, T7c;
+			      T6H = T5Y + T5Z;
+			      T6O = T6K + T6N;
+			      T6V = T6R + T6U;
+			      T6W = T6O - T6V;
+			      T6X = FNMS(KP250000000, T6W, T6H);
+			      T6Y = KP559016994 * (T6O + T6V);
+			      T7e = T6j - T6i;
+			      T7b = T70 - T71;
+			      T7c = T73 + T74;
+			      T7f = T7b + T7c;
+			      T7d = KP559016994 * (T7b - T7c);
+			      T7g = FNMS(KP250000000, T7f, T7e);
+			 }
+			 T7x = T6H + T6W;
+			 T7y = T7e + T7f;
+			 cr[WS(rs, 4)] = FNMS(TY, T7y, TT * T7x);
+			 ci[WS(rs, 4)] = FMA(TY, T7x, TT * T7y);
+			 {
+			      E T7o, T7u, T7s, T7w, T7m, T7r;
+			      T7m = T6X - T6Y;
+			      T7o = T7m - T7n;
+			      T7u = T7m + T7n;
+			      T7r = T7g - T7d;
+			      T7s = T7q + T7r;
+			      T7w = T7r - T7q;
+			      cr[WS(rs, 14)] = FNMS(T7p, T7s, T7l * T7o);
+			      ci[WS(rs, 14)] = FMA(T7p, T7o, T7l * T7s);
+			      cr[WS(rs, 19)] = FNMS(T7v, T7w, T7t * T7u);
+			      ci[WS(rs, 19)] = FMA(T7v, T7u, T7t * T7w);
+			 }
+			 {
+			      E T77, T7j, T7i, T7k, T6Z, T7h;
+			      T6Z = T6X + T6Y;
+			      T77 = T6Z - T76;
+			      T7j = T6Z + T76;
+			      T7h = T7d + T7g;
+			      T7i = T7a + T7h;
+			      T7k = T7h - T7a;
+			      cr[WS(rs, 9)] = FNMS(TZ, T7i, TU * T77);
+			      ci[WS(rs, 9)] = FMA(TZ, T77, TU * T7i);
+			      cr[WS(rs, 24)] = FNMS(T26, T7k, T25 * T7j);
+			      ci[WS(rs, 24)] = FMA(T26, T7j, T25 * T7k);
+			 }
+		    }
+		    {
+			 E T3N, T4h, T3U, T4m, T3D, T3E, T3w, T3F, T44, T45, T3X, T46, T4t, T4u;
+			 {
+			      E T3J, T3M, T3S, T3T;
+			      T3J = T3H - T3I;
+			      T3M = T3K - T3L;
+			      T3N = FMA(KP951056516, T3J, KP587785252 * T3M);
+			      T4h = FNMS(KP951056516, T3M, KP587785252 * T3J);
+			      T3S = T2K + T2Z;
+			      T3T = T3f - T3u;
+			      T3U = FMA(KP951056516, T3S, KP587785252 * T3T);
+			      T4m = FNMS(KP951056516, T3T, KP587785252 * T3S);
+			 }
+			 {
+			      E T30, T3v, T3V, T3W;
+			      T3D = T3z - T3C;
+			      T30 = T2K - T2Z;
+			      T3v = T3f + T3u;
+			      T3E = T30 + T3v;
+			      T3w = KP559016994 * (T30 - T3v);
+			      T3F = FNMS(KP250000000, T3E, T3D);
+			      T44 = T40 + T43;
+			      T3V = T3H + T3I;
+			      T3W = T3K + T3L;
+			      T45 = T3V + T3W;
+			      T3X = KP559016994 * (T3V - T3W);
+			      T46 = FNMS(KP250000000, T45, T44);
+			 }
+			 T4t = T3D + T3E;
+			 T4u = T44 + T45;
+			 cr[WS(rs, 2)] = FNMS(T2t, T4u, T2r * T4t);
+			 ci[WS(rs, 2)] = FMA(T2t, T4t, T2r * T4u);
+			 {
+			      E T4i, T4q, T4o, T4s, T4g, T4n;
+			      T4g = T3F - T3w;
+			      T4i = T4g - T4h;
+			      T4q = T4g + T4h;
+			      T4n = T46 - T3X;
+			      T4o = T4m + T4n;
+			      T4s = T4n - T4m;
+			      cr[WS(rs, 12)] = FNMS(T4l, T4o, T4f * T4i);
+			      ci[WS(rs, 12)] = FMA(T4l, T4i, T4f * T4o);
+			      cr[WS(rs, 17)] = FNMS(T4r, T4s, T4p * T4q);
+			      ci[WS(rs, 17)] = FMA(T4r, T4q, T4p * T4s);
+			 }
+			 {
+			      E T3O, T4a, T48, T4c, T3G, T47;
+			      T3G = T3w + T3F;
+			      T3O = T3G - T3N;
+			      T4a = T3G + T3N;
+			      T47 = T3X + T46;
+			      T48 = T3U + T47;
+			      T4c = T47 - T3U;
+			      cr[WS(rs, 7)] = FNMS(T3R, T48, T2v * T3O);
+			      ci[WS(rs, 7)] = FMA(T3R, T3O, T2v * T48);
+			      cr[WS(rs, 22)] = FNMS(T4b, T4c, T49 * T4a);
+			      ci[WS(rs, 22)] = FMA(T4b, T4a, T49 * T4c);
+			 }
+		    }
+		    {
+			 E T4V, T5f, T50, T5i, T4L, T4M, T4K, T4N, T54, T55, T53, T56, T5r, T5s;
+			 {
+			      E T4R, T4U, T4Y, T4Z;
+			      T4R = T4P - T4Q;
+			      T4U = T4S - T4T;
+			      T4V = FMA(KP951056516, T4R, KP587785252 * T4U);
+			      T5f = FNMS(KP951056516, T4U, KP587785252 * T4R);
+			      T4Y = T4y + T4B;
+			      T4Z = T4F + T4I;
+			      T50 = FMA(KP951056516, T4Y, KP587785252 * T4Z);
+			      T5i = FNMS(KP951056516, T4Z, KP587785252 * T4Y);
+			 }
+			 {
+			      E T4C, T4J, T51, T52;
+			      T4L = T3z + T3C;
+			      T4C = T4y - T4B;
+			      T4J = T4F - T4I;
+			      T4M = T4C + T4J;
+			      T4K = KP559016994 * (T4C - T4J);
+			      T4N = FNMS(KP250000000, T4M, T4L);
+			      T54 = T43 - T40;
+			      T51 = T4P + T4Q;
+			      T52 = T4S + T4T;
+			      T55 = T51 + T52;
+			      T53 = KP559016994 * (T51 - T52);
+			      T56 = FNMS(KP250000000, T55, T54);
+			 }
+			 T5r = T4L + T4M;
+			 T5s = T54 + T55;
+			 cr[WS(rs, 3)] = FNMS(TR, T5s, TO * T5r);
+			 ci[WS(rs, 3)] = FMA(TR, T5r, TO * T5s);
+			 {
+			      E T5g, T5o, T5k, T5q, T5e, T5j;
+			      T5e = T4N - T4K;
+			      T5g = T5e - T5f;
+			      T5o = T5e + T5f;
+			      T5j = T56 - T53;
+			      T5k = T5i + T5j;
+			      T5q = T5j - T5i;
+			      cr[WS(rs, 13)] = FNMS(T5h, T5k, T5d * T5g);
+			      ci[WS(rs, 13)] = FMA(T5h, T5g, T5d * T5k);
+			      cr[WS(rs, 18)] = FNMS(T5p, T5q, T5n * T5o);
+			      ci[WS(rs, 18)] = FMA(T5p, T5o, T5n * T5q);
+			 }
+			 {
+			      E T4W, T5a, T58, T5c, T4O, T57;
+			      T4O = T4K + T4N;
+			      T4W = T4O - T4V;
+			      T5a = T4O + T4V;
+			      T57 = T53 + T56;
+			      T58 = T50 + T57;
+			      T5c = T57 - T50;
+			      cr[WS(rs, 8)] = FNMS(T4X, T58, T4v * T4W);
+			      ci[WS(rs, 8)] = FMA(T4X, T4W, T4v * T58);
+			      cr[WS(rs, 23)] = FNMS(T5b, T5c, T59 * T5a);
+			      ci[WS(rs, 23)] = FMA(T5b, T5a, T59 * T5c);
+			 }
+		    }
+		    {
+			 E T6a, T6v, T6e, T6y, T60, T61, T5X, T62, T6k, T6l, T6h, T6m, T6F, T6G;
+			 {
+			      E T66, T69, T6c, T6d;
+			      T66 = T64 - T65;
+			      T69 = T67 - T68;
+			      T6a = FMA(KP951056516, T66, KP587785252 * T69);
+			      T6v = FNMS(KP951056516, T69, KP587785252 * T66);
+			      T6c = T5z - T5G;
+			      T6d = T5O - T5V;
+			      T6e = FMA(KP951056516, T6c, KP587785252 * T6d);
+			      T6y = FNMS(KP951056516, T6d, KP587785252 * T6c);
+			 }
+			 {
+			      E T5H, T5W, T6f, T6g;
+			      T60 = T5Y - T5Z;
+			      T5H = T5z + T5G;
+			      T5W = T5O + T5V;
+			      T61 = T5H + T5W;
+			      T5X = KP559016994 * (T5H - T5W);
+			      T62 = FNMS(KP250000000, T61, T60);
+			      T6k = T6i + T6j;
+			      T6f = T64 + T65;
+			      T6g = T67 + T68;
+			      T6l = T6f + T6g;
+			      T6h = KP559016994 * (T6f - T6g);
+			      T6m = FNMS(KP250000000, T6l, T6k);
+			 }
+			 T6F = T60 + T61;
+			 T6G = T6k + T6l;
+			 cr[WS(rs, 1)] = FNMS(TQ, T6G, TN * T6F);
+			 ci[WS(rs, 1)] = FMA(TQ, T6F, TN * T6G);
+			 {
+			      E T6w, T6C, T6A, T6E, T6u, T6z;
+			      T6u = T62 - T5X;
+			      T6w = T6u - T6v;
+			      T6C = T6u + T6v;
+			      T6z = T6m - T6h;
+			      T6A = T6y + T6z;
+			      T6E = T6z - T6y;
+			      cr[WS(rs, 11)] = FNMS(T6x, T6A, T6t * T6w);
+			      ci[WS(rs, 11)] = FMA(T6x, T6w, T6t * T6A);
+			      cr[WS(rs, 16)] = FNMS(T6D, T6E, T6B * T6C);
+			      ci[WS(rs, 16)] = FMA(T6D, T6C, T6B * T6E);
+			 }
+			 {
+			      E T6b, T6q, T6o, T6s, T63, T6n;
+			      T63 = T5X + T62;
+			      T6b = T63 - T6a;
+			      T6q = T63 + T6a;
+			      T6n = T6h + T6m;
+			      T6o = T6e + T6n;
+			      T6s = T6n - T6e;
+			      cr[WS(rs, 6)] = FNMS(T5m, T6o, T5l * T6b);
+			      ci[WS(rs, 6)] = FMA(T5m, T6b, T5l * T6o);
+			      cr[WS(rs, 21)] = FNMS(T6r, T6s, T6p * T6q);
+			      ci[WS(rs, 21)] = FMA(T6r, T6q, T6p * T6s);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hb2_25", twinstr, &GENUS, {280, 180, 160, 0} };
+
+void X(codelet_hb2_25) (planner *p) {
+     X(khc2hc_register) (p, hb2_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1845 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:25 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 32 -dif -name hb2_32 -include hb.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 204 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T5u, T6b, T6e, T5I, T66, T60, T5U, T5R, T67, T5L, T61, T5x, T5A, T5D, T5O;
+	       E T62, T5V, T5P;
+	       {
+		    E T11, T14, T12, T37, T17, T1b, T39, T15, T7C, T8P, T8S, T7I, T98, T7e, T78;
+		    E T8V, T3d, T3x, T3a, T3v, T9s, T3G, T4p, T5X, T16, T9m, T3y, T4b, T3C, T4g;
+		    E T5Z, T1a, T4r, T3J, T2O, T1c, T4W, T4s, T3Y, T3K, T3l, T3e, T3i, T3q, T8K;
+		    E T8E, T8m, T7S, T5k, T5e;
+		    {
+			 E T13, T3c, T38, T3F, T7B, T9l, T77, T7d, T9r, T7H;
+			 T11 = W[2];
+			 T14 = W[3];
+			 T12 = W[4];
+			 T37 = W[0];
+			 T17 = W[6];
+			 T1b = W[7];
+			 T13 = T11 * T12;
+			 T3c = T37 * T14;
+			 T38 = T37 * T11;
+			 T3F = T37 * T12;
+			 T7B = T11 * T17;
+			 T9l = T12 * T17;
+			 T77 = T37 * T17;
+			 T7d = T37 * T1b;
+			 T9r = T12 * T1b;
+			 T7H = T11 * T1b;
+			 T39 = W[1];
+			 T15 = W[5];
+			 {
+			      E T3I, T19, T5d, T3b, T18, T2N;
+			      T7C = FMA(T14, T1b, T7B);
+			      T8P = FNMS(T14, T1b, T7B);
+			      T8S = FMA(T14, T17, T7H);
+			      T7I = FNMS(T14, T17, T7H);
+			      T98 = FNMS(T39, T17, T7d);
+			      T7e = FMA(T39, T17, T7d);
+			      T78 = FNMS(T39, T1b, T77);
+			      T8V = FMA(T39, T1b, T77);
+			      T3d = FMA(T39, T11, T3c);
+			      T3x = FNMS(T39, T11, T3c);
+			      T3a = FNMS(T39, T14, T38);
+			      T3v = FMA(T39, T14, T38);
+			      T9s = FNMS(T15, T17, T9r);
+			      T3G = FNMS(T39, T15, T3F);
+			      T4p = FMA(T39, T15, T3F);
+			      T5X = FNMS(T14, T15, T13);
+			      T16 = FMA(T14, T15, T13);
+			      T3I = T37 * T15;
+			      T19 = T11 * T15;
+			      T5d = T3v * T12;
+			      T3b = T3a * T12;
+			      T9m = FMA(T15, T1b, T9l);
+			      {
+				   E T3w, T3B, T5t, T5H;
+				   T3w = T3v * T17;
+				   T3B = T3v * T1b;
+				   T5t = T3a * T17;
+				   T5H = T3a * T1b;
+				   T3y = FNMS(T3x, T1b, T3w);
+				   T4b = FMA(T3x, T1b, T3w);
+				   T3C = FMA(T3x, T17, T3B);
+				   T4g = FNMS(T3x, T17, T3B);
+				   T5u = FMA(T3d, T1b, T5t);
+				   T6b = FNMS(T3d, T1b, T5t);
+				   T6e = FMA(T3d, T17, T5H);
+				   T5I = FNMS(T3d, T17, T5H);
+				   T18 = T16 * T17;
+				   T2N = T16 * T1b;
+				   T5Z = FMA(T14, T12, T19);
+				   T1a = FNMS(T14, T12, T19);
+			      }
+			      {
+				   E T3H, T3X, T4q, T4V, T5Y, T65;
+				   T4q = T4p * T17;
+				   T4V = T4p * T1b;
+				   T4r = FNMS(T39, T12, T3I);
+				   T3J = FMA(T39, T12, T3I);
+				   T2O = FNMS(T1a, T17, T2N);
+				   T1c = FMA(T1a, T1b, T18);
+				   T3H = T3G * T17;
+				   T4W = FNMS(T4r, T17, T4V);
+				   T4s = FMA(T4r, T1b, T4q);
+				   T3X = T3G * T1b;
+				   T5Y = T5X * T17;
+				   T65 = T5X * T1b;
+				   T3Y = FNMS(T3J, T17, T3X);
+				   T3K = FMA(T3J, T1b, T3H);
+				   {
+					E T8J, T8D, T3h, T5j, T8l, T7R;
+					T3h = T3a * T15;
+					T66 = FNMS(T5Z, T17, T65);
+					T60 = FMA(T5Z, T1b, T5Y);
+					T3l = FNMS(T3d, T15, T3b);
+					T3e = FMA(T3d, T15, T3b);
+					T3i = FNMS(T3d, T12, T3h);
+					T3q = FMA(T3d, T12, T3h);
+					T8J = T3l * T1b;
+					T8D = T3l * T17;
+					T5j = T3v * T15;
+					T8l = T3e * T1b;
+					T7R = T3e * T17;
+					T8K = FNMS(T3q, T17, T8J);
+					T8E = FMA(T3q, T1b, T8D);
+					T8m = FNMS(T3i, T17, T8l);
+					T7S = FMA(T3i, T1b, T7R);
+					T5U = FNMS(T3x, T12, T5j);
+					T5k = FMA(T3x, T12, T5j);
+					T5e = FNMS(T3x, T15, T5d);
+					T5R = FMA(T3x, T15, T5d);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6O, T6i, T7s, T7o, T6j, Tf, T8W, T7V, T99, T8p, T3L, T1t, T3Z, T2X, T5J;
+			 E T4Z, T7t, T6W, T5v, T4v, TZ, T7x, T91, T9d, T28, T3S, T3R, T2h, T5B, T4Q;
+			 E T8v, T8a, T5C, T4N, T6Z, T6J, TK, T7w, T2z, T3P, T94, T9c, T3O, T2I, T5y;
+			 E T4J, T8u, T8h, T5z, T4G, T6Y, T6A, T6p, T6m, T6P, Tu, T9a, T82, T8X, T8s;
+			 E T40, T1Q, T4y, T4B, T3M, T30, T5w, T52;
+			 {
+			      E T6B, T6I, T4L, T4M, T4t, T4u;
+			      {
+				   E T1d, T3, T2P, T6, T6Q, T2S, T6R, T1g, Td, T6U, T1i, Ta, T2V, T1r, T6T;
+				   E T1l;
+				   {
+					E T4, T5, T2Q, T2R, T1, T2, T1e, T1f;
+					T1 = cr[0];
+					T2 = ci[WS(rs, 15)];
+					{
+					     E T6N, T6h, T7r, T7n;
+					     T6N = T5R * T1b;
+					     T6h = T5R * T17;
+					     T7r = T5e * T1b;
+					     T7n = T5e * T17;
+					     T6O = FNMS(T5U, T17, T6N);
+					     T6i = FMA(T5U, T1b, T6h);
+					     T7s = FNMS(T5k, T17, T7r);
+					     T7o = FMA(T5k, T1b, T7n);
+					     T1d = T1 - T2;
+					     T3 = T1 + T2;
+					}
+					T4 = cr[WS(rs, 8)];
+					T5 = ci[WS(rs, 7)];
+					T2Q = ci[WS(rs, 31)];
+					T2R = cr[WS(rs, 16)];
+					T1e = ci[WS(rs, 23)];
+					T2P = T4 - T5;
+					T6 = T4 + T5;
+					T6Q = T2Q - T2R;
+					T2S = T2Q + T2R;
+					T1f = cr[WS(rs, 24)];
+					{
+					     E T1o, T1n, T1p, Tb, Tc;
+					     Tb = ci[WS(rs, 3)];
+					     Tc = cr[WS(rs, 12)];
+					     T1o = ci[WS(rs, 19)];
+					     T6R = T1e - T1f;
+					     T1g = T1e + T1f;
+					     T1n = Tb - Tc;
+					     Td = Tb + Tc;
+					     T1p = cr[WS(rs, 28)];
+					     {
+						  E T1j, T1k, T8, T9, T1q;
+						  T8 = cr[WS(rs, 4)];
+						  T9 = ci[WS(rs, 11)];
+						  T1q = T1o + T1p;
+						  T6U = T1o - T1p;
+						  T1j = ci[WS(rs, 27)];
+						  T1i = T8 - T9;
+						  Ta = T8 + T9;
+						  T1k = cr[WS(rs, 20)];
+						  T2V = T1n + T1q;
+						  T1r = T1n - T1q;
+						  T6T = T1j - T1k;
+						  T1l = T1j + T1k;
+					     }
+					}
+				   }
+				   {
+					E T2U, T6V, T6S, T1h, T1s, T4Y, T4X, T2T, T2W;
+					{
+					     E T7T, T8o, T1m, T7U, T7, Te, T8n;
+					     T7T = T3 - T6;
+					     T7 = T3 + T6;
+					     Te = Ta + Td;
+					     T8o = Ta - Td;
+					     T1m = T1i - T1l;
+					     T2U = T1i + T1l;
+					     T6j = T7 - Te;
+					     Tf = T7 + Te;
+					     T7U = T6U - T6T;
+					     T6V = T6T + T6U;
+					     T6S = T6Q + T6R;
+					     T8n = T6Q - T6R;
+					     T4t = T1d + T1g;
+					     T1h = T1d - T1g;
+					     T8W = T7T + T7U;
+					     T7V = T7T - T7U;
+					     T99 = T8o + T8n;
+					     T8p = T8n - T8o;
+					     T1s = T1m + T1r;
+					     T4Y = T1m - T1r;
+					}
+					T4X = T2S - T2P;
+					T2T = T2P + T2S;
+					T2W = T2U - T2V;
+					T4u = T2U + T2V;
+					T3L = FMA(KP707106781, T1s, T1h);
+					T1t = FNMS(KP707106781, T1s, T1h);
+					T3Z = FMA(KP707106781, T2W, T2T);
+					T2X = FNMS(KP707106781, T2W, T2T);
+					T5J = FNMS(KP707106781, T4Y, T4X);
+					T4Z = FMA(KP707106781, T4Y, T4X);
+					T7t = T6S + T6V;
+					T6W = T6S - T6V;
+				   }
+			      }
+			      {
+				   E T29, T1S, T1V, T87, TR, T2c, T84, T6E, T1X, TU, T1Y, T6G, T25, T22, TX;
+				   E T1Z;
+				   {
+					E TO, TN, TP, TL, TM, T6C, T6D;
+					TL = ci[0];
+					TM = cr[WS(rs, 15)];
+					TO = cr[WS(rs, 7)];
+					T5v = FMA(KP707106781, T4u, T4t);
+					T4v = FNMS(KP707106781, T4u, T4t);
+					TN = TL + TM;
+					T29 = TL - TM;
+					TP = ci[WS(rs, 8)];
+					{
+					     E T2a, T2b, T1T, T1U, TQ;
+					     T1T = ci[WS(rs, 16)];
+					     T1U = cr[WS(rs, 31)];
+					     TQ = TO + TP;
+					     T1S = TO - TP;
+					     T2a = ci[WS(rs, 24)];
+					     T6C = T1T - T1U;
+					     T1V = T1T + T1U;
+					     T2b = cr[WS(rs, 23)];
+					     T87 = TN - TQ;
+					     TR = TN + TQ;
+					     T2c = T2a + T2b;
+					     T6D = T2a - T2b;
+					}
+					{
+					     E T23, T24, TS, TT, TV, TW;
+					     TS = cr[WS(rs, 3)];
+					     TT = ci[WS(rs, 12)];
+					     T84 = T6C - T6D;
+					     T6E = T6C + T6D;
+					     T23 = ci[WS(rs, 20)];
+					     T1X = TS - TT;
+					     TU = TS + TT;
+					     T24 = cr[WS(rs, 27)];
+					     TV = ci[WS(rs, 4)];
+					     TW = cr[WS(rs, 11)];
+					     T1Y = ci[WS(rs, 28)];
+					     T6G = T23 - T24;
+					     T25 = T23 + T24;
+					     T22 = TV - TW;
+					     TX = TV + TW;
+					     T1Z = cr[WS(rs, 19)];
+					}
+				   }
+				   {
+					E T4O, T1W, T2f, T26, T8Z, T86, T2e, T21, T89, T90;
+					{
+					     E T85, TY, T6F, T20, T6H, T88;
+					     T4O = T1S + T1V;
+					     T1W = T1S - T1V;
+					     T2f = T22 - T25;
+					     T26 = T22 + T25;
+					     T85 = TU - TX;
+					     TY = TU + TX;
+					     T6F = T1Y - T1Z;
+					     T20 = T1Y + T1Z;
+					     T8Z = T85 + T84;
+					     T86 = T84 - T85;
+					     T6B = TR - TY;
+					     TZ = TR + TY;
+					     T6H = T6F + T6G;
+					     T88 = T6G - T6F;
+					     T2e = T1X - T20;
+					     T21 = T1X + T20;
+					     T7x = T6E + T6H;
+					     T6I = T6E - T6H;
+					     T89 = T87 - T88;
+					     T90 = T87 + T88;
+					}
+					{
+					     E T4P, T2d, T27, T2g;
+					     T2d = T29 - T2c;
+					     T4L = T29 + T2c;
+					     T4M = T21 + T26;
+					     T27 = T21 - T26;
+					     T2g = T2e + T2f;
+					     T4P = T2e - T2f;
+					     T91 = FNMS(KP414213562, T90, T8Z);
+					     T9d = FMA(KP414213562, T8Z, T90);
+					     T28 = FNMS(KP707106781, T27, T1W);
+					     T3S = FMA(KP707106781, T27, T1W);
+					     T3R = FMA(KP707106781, T2g, T2d);
+					     T2h = FNMS(KP707106781, T2g, T2d);
+					     T5B = FMA(KP707106781, T4P, T4O);
+					     T4Q = FNMS(KP707106781, T4P, T4O);
+					     T8v = FNMS(KP414213562, T86, T89);
+					     T8a = FMA(KP414213562, T89, T86);
+					}
+				   }
+			      }
+			      {
+				   E T6s, T6z, T4F, T4E;
+				   {
+					E T2A, T2j, TC, T8e, T2m, T2D, T6v, T8b, TG, T2o, TF, T6x, T2w, TH, T2p;
+					E T2q;
+					{
+					     E Tw, Tx, Tz, TA, T6t, T6u;
+					     Tw = cr[WS(rs, 1)];
+					     T5C = FMA(KP707106781, T4M, T4L);
+					     T4N = FNMS(KP707106781, T4M, T4L);
+					     T6Z = T6I - T6B;
+					     T6J = T6B + T6I;
+					     Tx = ci[WS(rs, 14)];
+					     Tz = cr[WS(rs, 9)];
+					     TA = ci[WS(rs, 6)];
+					     {
+						  E T2k, Ty, TB, T2l, T2B, T2C;
+						  T2k = ci[WS(rs, 30)];
+						  T2A = Tw - Tx;
+						  Ty = Tw + Tx;
+						  T2j = Tz - TA;
+						  TB = Tz + TA;
+						  T2l = cr[WS(rs, 17)];
+						  T2B = ci[WS(rs, 22)];
+						  T2C = cr[WS(rs, 25)];
+						  TC = Ty + TB;
+						  T8e = Ty - TB;
+						  T2m = T2k + T2l;
+						  T6t = T2k - T2l;
+						  T6u = T2B - T2C;
+						  T2D = T2B + T2C;
+					     }
+					     {
+						  E TD, TE, T2u, T2v;
+						  TD = cr[WS(rs, 5)];
+						  T6v = T6t + T6u;
+						  T8b = T6t - T6u;
+						  TE = ci[WS(rs, 10)];
+						  T2u = ci[WS(rs, 18)];
+						  T2v = cr[WS(rs, 29)];
+						  TG = ci[WS(rs, 2)];
+						  T2o = TD - TE;
+						  TF = TD + TE;
+						  T6x = T2u - T2v;
+						  T2w = T2u + T2v;
+						  TH = cr[WS(rs, 13)];
+						  T2p = ci[WS(rs, 26)];
+						  T2q = cr[WS(rs, 21)];
+					     }
+					}
+					{
+					     E T4H, T2n, T2G, T2F, T92, T8d, T2y, T93, T8g, T4I, T2E, T2H;
+					     {
+						  E T2x, T8c, T8f, T2s, T2t, TI;
+						  T4H = T2m - T2j;
+						  T2n = T2j + T2m;
+						  T2t = TG - TH;
+						  TI = TG + TH;
+						  {
+						       E T6w, T2r, TJ, T6y;
+						       T6w = T2p - T2q;
+						       T2r = T2p + T2q;
+						       T2G = T2t - T2w;
+						       T2x = T2t + T2w;
+						       T8c = TF - TI;
+						       TJ = TF + TI;
+						       T6y = T6w + T6x;
+						       T8f = T6x - T6w;
+						       T2F = T2o - T2r;
+						       T2s = T2o + T2r;
+						       TK = TC + TJ;
+						       T6s = TC - TJ;
+						       T6z = T6v - T6y;
+						       T7w = T6v + T6y;
+						  }
+						  T92 = T8c + T8b;
+						  T8d = T8b - T8c;
+						  T4F = T2s + T2x;
+						  T2y = T2s - T2x;
+						  T93 = T8e + T8f;
+						  T8g = T8e - T8f;
+					     }
+					     T4E = T2A + T2D;
+					     T2E = T2A - T2D;
+					     T2H = T2F + T2G;
+					     T4I = T2G - T2F;
+					     T2z = FNMS(KP707106781, T2y, T2n);
+					     T3P = FMA(KP707106781, T2y, T2n);
+					     T94 = FMA(KP414213562, T93, T92);
+					     T9c = FNMS(KP414213562, T92, T93);
+					     T3O = FMA(KP707106781, T2H, T2E);
+					     T2I = FNMS(KP707106781, T2H, T2E);
+					     T5y = FMA(KP707106781, T4I, T4H);
+					     T4J = FNMS(KP707106781, T4I, T4H);
+					     T8u = FMA(KP414213562, T8d, T8g);
+					     T8h = FNMS(KP414213562, T8g, T8d);
+					}
+				   }
+				   {
+					E T4x, T1O, Tm, T7Z, T80, T4w, T1J, T4A, T1D, Tt, T7X, T7W, T4z, T1y;
+					{
+					     E Tj, T1K, Ti, T6o, T1N, Tk, T1G, T1H;
+					     {
+						  E Tg, Th, T1L, T1M;
+						  Tg = cr[WS(rs, 2)];
+						  T5z = FMA(KP707106781, T4F, T4E);
+						  T4G = FNMS(KP707106781, T4F, T4E);
+						  T6Y = T6s + T6z;
+						  T6A = T6s - T6z;
+						  Th = ci[WS(rs, 13)];
+						  T1L = ci[WS(rs, 21)];
+						  T1M = cr[WS(rs, 26)];
+						  Tj = cr[WS(rs, 10)];
+						  T1K = Tg - Th;
+						  Ti = Tg + Th;
+						  T6o = T1L - T1M;
+						  T1N = T1L + T1M;
+						  Tk = ci[WS(rs, 5)];
+						  T1G = ci[WS(rs, 29)];
+						  T1H = cr[WS(rs, 18)];
+					     }
+					     {
+						  E T1F, Tl, T6n, T1I;
+						  T4x = T1K + T1N;
+						  T1O = T1K - T1N;
+						  T1F = Tj - Tk;
+						  Tl = Tj + Tk;
+						  T6n = T1G - T1H;
+						  T1I = T1G + T1H;
+						  Tm = Ti + Tl;
+						  T7Z = Ti - Tl;
+						  T80 = T6n - T6o;
+						  T6p = T6n + T6o;
+						  T4w = T1I - T1F;
+						  T1J = T1F + T1I;
+					     }
+					}
+					{
+					     E Tq, T1z, Tp, T6l, T1C, Tr, T1v, T1w;
+					     {
+						  E Tn, To, T1A, T1B;
+						  Tn = ci[WS(rs, 1)];
+						  To = cr[WS(rs, 14)];
+						  T1A = ci[WS(rs, 25)];
+						  T1B = cr[WS(rs, 22)];
+						  Tq = cr[WS(rs, 6)];
+						  T1z = Tn - To;
+						  Tp = Tn + To;
+						  T6l = T1A - T1B;
+						  T1C = T1A + T1B;
+						  Tr = ci[WS(rs, 9)];
+						  T1v = ci[WS(rs, 17)];
+						  T1w = cr[WS(rs, 30)];
+					     }
+					     {
+						  E T1u, Ts, T6k, T1x;
+						  T4A = T1z + T1C;
+						  T1D = T1z - T1C;
+						  T1u = Tq - Tr;
+						  Ts = Tq + Tr;
+						  T6k = T1v - T1w;
+						  T1x = T1v + T1w;
+						  Tt = Tp + Ts;
+						  T7X = Tp - Ts;
+						  T7W = T6k - T6l;
+						  T6m = T6k + T6l;
+						  T4z = T1u + T1x;
+						  T1y = T1u - T1x;
+					     }
+					}
+					{
+					     E T8r, T8q, T2Z, T1E, T1P, T2Y, T7Y, T81, T50, T51;
+					     T8r = T7X + T7W;
+					     T7Y = T7W - T7X;
+					     T81 = T7Z + T80;
+					     T8q = T7Z - T80;
+					     T6P = Tm - Tt;
+					     Tu = Tm + Tt;
+					     T9a = T81 + T7Y;
+					     T82 = T7Y - T81;
+					     T2Z = FMA(KP414213562, T1y, T1D);
+					     T1E = FNMS(KP414213562, T1D, T1y);
+					     T1P = FMA(KP414213562, T1O, T1J);
+					     T2Y = FNMS(KP414213562, T1J, T1O);
+					     T8X = T8q + T8r;
+					     T8s = T8q - T8r;
+					     T40 = T1P + T1E;
+					     T1Q = T1E - T1P;
+					     T4y = FNMS(KP414213562, T4x, T4w);
+					     T50 = FMA(KP414213562, T4w, T4x);
+					     T51 = FMA(KP414213562, T4z, T4A);
+					     T4B = FNMS(KP414213562, T4A, T4z);
+					     T3M = T2Y + T2Z;
+					     T30 = T2Y - T2Z;
+					     T5w = T50 + T51;
+					     T52 = T50 - T51;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7D, T5K, T4C, T7K, T7J, T7E, T83, T8w, T8t, T8i, T6r, T70, T6X, T6K;
+			      {
+				   E T6q, T8Y, T9e, T9b, T95, T8L, T8Q, T8H, T8M, T8I, T8R;
+				   {
+					E Tv, T10, T7v, T7y, T7u;
+					T7D = Tf - Tu;
+					Tv = Tf + Tu;
+					T7u = T6p + T6m;
+					T6q = T6m - T6p;
+					T5K = T4B - T4y;
+					T4C = T4y + T4B;
+					T10 = TK + TZ;
+					T7K = TK - TZ;
+					T7J = T7t - T7u;
+					T7v = T7t + T7u;
+					T7y = T7w + T7x;
+					T7E = T7x - T7w;
+					{
+					     E T9t, T9x, T9p, T9u, T9q, T9y;
+					     {
+						  E T9n, T7z, T9o, T7A, T7q, T7p;
+						  T8Y = FNMS(KP707106781, T8X, T8W);
+						  T9n = FMA(KP707106781, T8X, T8W);
+						  cr[0] = Tv + T10;
+						  T7p = Tv - T10;
+						  ci[0] = T7v + T7y;
+						  T7z = T7v - T7y;
+						  T9o = T9c + T9d;
+						  T9e = T9c - T9d;
+						  T7A = T7s * T7p;
+						  T7q = T7o * T7p;
+						  T9b = FNMS(KP707106781, T9a, T99);
+						  T9t = FMA(KP707106781, T9a, T99);
+						  T9x = FMA(KP923879532, T9o, T9n);
+						  T9p = FNMS(KP923879532, T9o, T9n);
+						  ci[WS(rs, 16)] = FMA(T7o, T7z, T7A);
+						  cr[WS(rs, 16)] = FNMS(T7s, T7z, T7q);
+						  T9u = T94 + T91;
+						  T95 = T91 - T94;
+					     }
+					     T9q = T9m * T9p;
+					     T9y = T3v * T9x;
+					     {
+						  E T8F, T9z, T9v, T8G, T9A, T9w;
+						  T83 = FMA(KP707106781, T82, T7V);
+						  T8F = FNMS(KP707106781, T82, T7V);
+						  T9z = FMA(KP923879532, T9u, T9t);
+						  T9v = FNMS(KP923879532, T9u, T9t);
+						  T8G = T8u + T8v;
+						  T8w = T8u - T8v;
+						  T8t = FMA(KP707106781, T8s, T8p);
+						  T8L = FNMS(KP707106781, T8s, T8p);
+						  T9A = T3v * T9z;
+						  cr[WS(rs, 2)] = FNMS(T3x, T9z, T9y);
+						  T9w = T9m * T9v;
+						  cr[WS(rs, 18)] = FNMS(T9s, T9v, T9q);
+						  T8Q = FMA(KP923879532, T8G, T8F);
+						  T8H = FNMS(KP923879532, T8G, T8F);
+						  ci[WS(rs, 2)] = FMA(T3x, T9x, T9A);
+						  ci[WS(rs, 18)] = FMA(T9s, T9p, T9w);
+						  T8M = T8h + T8a;
+						  T8i = T8a - T8h;
+					     }
+					     T8I = T8E * T8H;
+					     T8R = T8P * T8Q;
+					}
+				   }
+				   {
+					E T7f, T7j, T7b, T7g, T7c, T7k;
+					{
+					     E T79, T8T, T8N, T7a, T8U, T8O;
+					     T6r = T6j + T6q;
+					     T79 = T6j - T6q;
+					     T8T = FMA(KP923879532, T8M, T8L);
+					     T8N = FNMS(KP923879532, T8M, T8L);
+					     T7a = T6Z - T6Y;
+					     T70 = T6Y + T6Z;
+					     T6X = T6P + T6W;
+					     T7f = T6W - T6P;
+					     T8U = T8P * T8T;
+					     cr[WS(rs, 30)] = FNMS(T8S, T8T, T8R);
+					     T8O = T8E * T8N;
+					     cr[WS(rs, 14)] = FNMS(T8K, T8N, T8I);
+					     T7j = FMA(KP707106781, T7a, T79);
+					     T7b = FNMS(KP707106781, T7a, T79);
+					     ci[WS(rs, 30)] = FMA(T8S, T8Q, T8U);
+					     ci[WS(rs, 14)] = FMA(T8K, T8H, T8O);
+					     T7g = T6A - T6J;
+					     T6K = T6A + T6J;
+					}
+					T7c = T78 * T7b;
+					T7k = T5X * T7j;
+					{
+					     E T97, T9g, T9i, T9j, T9f, T9k, T9h, T96;
+					     {
+						  E T7l, T7h, T7m, T7i;
+						  T7l = FMA(KP707106781, T7g, T7f);
+						  T7h = FNMS(KP707106781, T7g, T7f);
+						  T7m = T5X * T7l;
+						  cr[WS(rs, 12)] = FNMS(T5Z, T7l, T7k);
+						  T7i = T78 * T7h;
+						  cr[WS(rs, 28)] = FNMS(T7e, T7h, T7c);
+						  T9h = FMA(KP923879532, T95, T8Y);
+						  T96 = FNMS(KP923879532, T95, T8Y);
+						  ci[WS(rs, 12)] = FMA(T5Z, T7j, T7m);
+						  ci[WS(rs, 28)] = FMA(T7e, T7b, T7i);
+					     }
+					     T97 = T8V * T96;
+					     T9g = T98 * T96;
+					     T9i = T3G * T9h;
+					     T9j = FMA(KP923879532, T9e, T9b);
+					     T9f = FNMS(KP923879532, T9e, T9b);
+					     T9k = T3J * T9h;
+					     cr[WS(rs, 10)] = FNMS(T3J, T9j, T9i);
+					     ci[WS(rs, 26)] = FMA(T8V, T9f, T9g);
+					     cr[WS(rs, 26)] = FNMS(T98, T9f, T97);
+					     ci[WS(rs, 10)] = FMA(T3G, T9j, T9k);
+					}
+				   }
+			      }
+			      {
+				   E T31, T3r, T1R, T3m, T33, T32, T3s, T2K, T8z, T8j;
+				   {
+					E T73, T6L, T75, T71;
+					T73 = FMA(KP707106781, T6K, T6r);
+					T6L = FNMS(KP707106781, T6K, T6r);
+					T75 = FMA(KP707106781, T70, T6X);
+					T71 = FNMS(KP707106781, T70, T6X);
+					{
+					     E T76, T74, T72, T6M;
+					     T76 = T3d * T73;
+					     T74 = T3a * T73;
+					     T72 = T6O * T6L;
+					     T6M = T6i * T6L;
+					     ci[WS(rs, 4)] = FMA(T3a, T75, T76);
+					     cr[WS(rs, 4)] = FNMS(T3d, T75, T74);
+					     ci[WS(rs, 20)] = FMA(T6i, T71, T72);
+					     cr[WS(rs, 20)] = FNMS(T6O, T71, T6M);
+					}
+				   }
+				   {
+					E T7N, T7F, T7P, T7L;
+					T7N = T7D + T7E;
+					T7F = T7D - T7E;
+					T7P = T7K + T7J;
+					T7L = T7J - T7K;
+					{
+					     E T7O, T7G, T7Q, T7M;
+					     T7O = T4p * T7N;
+					     T7G = T7C * T7F;
+					     T7Q = T4p * T7P;
+					     T7M = T7C * T7L;
+					     cr[WS(rs, 8)] = FNMS(T4r, T7P, T7O);
+					     cr[WS(rs, 24)] = FNMS(T7I, T7L, T7G);
+					     ci[WS(rs, 8)] = FMA(T4r, T7N, T7Q);
+					     ci[WS(rs, 24)] = FMA(T7I, T7F, T7M);
+					}
+				   }
+				   T31 = FMA(KP923879532, T30, T2X);
+				   T3r = FNMS(KP923879532, T30, T2X);
+				   T8z = FMA(KP923879532, T8i, T83);
+				   T8j = FNMS(KP923879532, T8i, T83);
+				   {
+					E T8B, T8x, T8C, T8A;
+					T8B = FMA(KP923879532, T8w, T8t);
+					T8x = FNMS(KP923879532, T8w, T8t);
+					T8C = T1a * T8z;
+					T8A = T16 * T8z;
+					{
+					     E T8y, T8k, T2i, T2J;
+					     T8y = T8m * T8j;
+					     T8k = T7S * T8j;
+					     ci[WS(rs, 6)] = FMA(T16, T8B, T8C);
+					     cr[WS(rs, 6)] = FNMS(T1a, T8B, T8A);
+					     ci[WS(rs, 22)] = FMA(T7S, T8x, T8y);
+					     cr[WS(rs, 22)] = FNMS(T8m, T8x, T8k);
+					     T1R = FMA(KP923879532, T1Q, T1t);
+					     T3m = FNMS(KP923879532, T1Q, T1t);
+					     T33 = FNMS(KP668178637, T28, T2h);
+					     T2i = FMA(KP668178637, T2h, T28);
+					     T2J = FNMS(KP668178637, T2I, T2z);
+					     T32 = FMA(KP668178637, T2z, T2I);
+					     T3s = T2J + T2i;
+					     T2K = T2i - T2J;
+					}
+				   }
+				   {
+					E T5l, T53, T5f, T4D, T4K, T4R, T56, T5g;
+					T5l = FNMS(KP923879532, T52, T4Z);
+					T53 = FMA(KP923879532, T52, T4Z);
+					{
+					     E T3t, T3D, T3f, T2L;
+					     T3t = FNMS(KP831469612, T3s, T3r);
+					     T3D = FMA(KP831469612, T3s, T3r);
+					     T3f = FMA(KP831469612, T2K, T1R);
+					     T2L = FNMS(KP831469612, T2K, T1R);
+					     {
+						  E T3n, T34, T3g, T2M;
+						  T3n = T32 + T33;
+						  T34 = T32 - T33;
+						  T3g = T3e * T3f;
+						  T2M = T1c * T2L;
+						  {
+						       E T3o, T3z, T3j, T35;
+						       T3o = FNMS(KP831469612, T3n, T3m);
+						       T3z = FMA(KP831469612, T3n, T3m);
+						       T3j = FMA(KP831469612, T34, T31);
+						       T35 = FNMS(KP831469612, T34, T31);
+						       {
+							    E T3u, T3p, T3E, T3A;
+							    T3u = T3q * T3o;
+							    T3p = T3l * T3o;
+							    T3E = T3C * T3z;
+							    T3A = T3y * T3z;
+							    {
+								 E T3k, T36, T54, T55;
+								 T3k = T3e * T3j;
+								 cr[WS(rs, 5)] = FNMS(T3i, T3j, T3g);
+								 T36 = T1c * T35;
+								 cr[WS(rs, 21)] = FNMS(T2O, T35, T2M);
+								 ci[WS(rs, 13)] = FMA(T3l, T3t, T3u);
+								 cr[WS(rs, 13)] = FNMS(T3q, T3t, T3p);
+								 ci[WS(rs, 29)] = FMA(T3y, T3D, T3E);
+								 cr[WS(rs, 29)] = FNMS(T3C, T3D, T3A);
+								 ci[WS(rs, 5)] = FMA(T3i, T3f, T3k);
+								 ci[WS(rs, 21)] = FMA(T2O, T2L, T36);
+								 T5f = FMA(KP923879532, T4C, T4v);
+								 T4D = FNMS(KP923879532, T4C, T4v);
+								 T4K = FNMS(KP668178637, T4J, T4G);
+								 T54 = FMA(KP668178637, T4G, T4J);
+								 T55 = FMA(KP668178637, T4N, T4Q);
+								 T4R = FNMS(KP668178637, T4Q, T4N);
+								 T56 = T54 - T55;
+								 T5g = T54 + T55;
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     E T4h, T41, T4c, T3N, T3Q, T3T, T44, T4d;
+					     T4h = FNMS(KP923879532, T40, T3Z);
+					     T41 = FMA(KP923879532, T40, T3Z);
+					     {
+						  E T57, T5b, T5h, T5p;
+						  T57 = FNMS(KP831469612, T56, T53);
+						  T5b = FMA(KP831469612, T56, T53);
+						  T5h = FNMS(KP831469612, T5g, T5f);
+						  T5p = FMA(KP831469612, T5g, T5f);
+						  {
+						       E T5m, T4S, T5i, T5q;
+						       T5m = T4K - T4R;
+						       T4S = T4K + T4R;
+						       T5i = T5e * T5h;
+						       T5q = T17 * T5p;
+						       {
+							    E T5n, T5r, T59, T4T;
+							    T5n = FMA(KP831469612, T5m, T5l);
+							    T5r = FNMS(KP831469612, T5m, T5l);
+							    T59 = FMA(KP831469612, T4S, T4D);
+							    T4T = FNMS(KP831469612, T4S, T4D);
+							    {
+								 E T5o, T5s, T5c, T5a;
+								 T5o = T5e * T5n;
+								 cr[WS(rs, 11)] = FNMS(T5k, T5n, T5i);
+								 T5s = T17 * T5r;
+								 cr[WS(rs, 27)] = FNMS(T1b, T5r, T5q);
+								 T5c = T14 * T59;
+								 T5a = T11 * T59;
+								 {
+								      E T58, T4U, T42, T43;
+								      T58 = T4W * T4T;
+								      T4U = T4s * T4T;
+								      ci[WS(rs, 11)] = FMA(T5k, T5h, T5o);
+								      ci[WS(rs, 27)] = FMA(T1b, T5p, T5s);
+								      ci[WS(rs, 3)] = FMA(T11, T5b, T5c);
+								      cr[WS(rs, 3)] = FNMS(T14, T5b, T5a);
+								      ci[WS(rs, 19)] = FMA(T4s, T57, T58);
+								      cr[WS(rs, 19)] = FNMS(T4W, T57, T4U);
+								      T4c = FNMS(KP923879532, T3M, T3L);
+								      T3N = FMA(KP923879532, T3M, T3L);
+								      T3Q = FNMS(KP198912367, T3P, T3O);
+								      T42 = FMA(KP198912367, T3O, T3P);
+								      T43 = FNMS(KP198912367, T3R, T3S);
+								      T3T = FMA(KP198912367, T3S, T3R);
+								      T44 = T42 + T43;
+								      T4d = T43 - T42;
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T67 = FNMS(KP923879532, T5K, T5J);
+					     T5L = FMA(KP923879532, T5K, T5J);
+					     {
+						  E T45, T49, T4e, T4l;
+						  T45 = FNMS(KP980785280, T44, T41);
+						  T49 = FMA(KP980785280, T44, T41);
+						  T4e = FNMS(KP980785280, T4d, T4c);
+						  T4l = FMA(KP980785280, T4d, T4c);
+						  {
+						       E T4i, T3U, T4f, T4m;
+						       T4i = T3Q - T3T;
+						       T3U = T3Q + T3T;
+						       T4f = T4b * T4e;
+						       T4m = T12 * T4l;
+						       {
+							    E T4j, T4n, T47, T3V;
+							    T4j = FNMS(KP980785280, T4i, T4h);
+							    T4n = FMA(KP980785280, T4i, T4h);
+							    T47 = FMA(KP980785280, T3U, T3N);
+							    T3V = FNMS(KP980785280, T3U, T3N);
+							    {
+								 E T4k, T4o, T4a, T48;
+								 T4k = T4b * T4j;
+								 cr[WS(rs, 25)] = FNMS(T4g, T4j, T4f);
+								 T4o = T12 * T4n;
+								 cr[WS(rs, 9)] = FNMS(T15, T4n, T4m);
+								 T4a = T39 * T47;
+								 T48 = T37 * T47;
+								 {
+								      E T46, T3W, T5M, T5N;
+								      T46 = T3Y * T3V;
+								      T3W = T3K * T3V;
+								      ci[WS(rs, 25)] = FMA(T4g, T4e, T4k);
+								      ci[WS(rs, 9)] = FMA(T15, T4l, T4o);
+								      ci[WS(rs, 1)] = FMA(T37, T49, T4a);
+								      cr[WS(rs, 1)] = FNMS(T39, T49, T48);
+								      ci[WS(rs, 17)] = FMA(T3K, T45, T46);
+								      cr[WS(rs, 17)] = FNMS(T3Y, T45, T3W);
+								      T61 = FMA(KP923879532, T5w, T5v);
+								      T5x = FNMS(KP923879532, T5w, T5v);
+								      T5A = FNMS(KP198912367, T5z, T5y);
+								      T5M = FMA(KP198912367, T5y, T5z);
+								      T5N = FMA(KP198912367, T5B, T5C);
+								      T5D = FNMS(KP198912367, T5C, T5B);
+								      T5O = T5M - T5N;
+								      T62 = T5M + T5N;
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5V = FMA(KP980785280, T5O, T5L);
+	       T5P = FNMS(KP980785280, T5O, T5L);
+	       {
+		    E T6c, T63, T5E, T68;
+		    T6c = FMA(KP980785280, T62, T61);
+		    T63 = FNMS(KP980785280, T62, T61);
+		    T5E = T5A + T5D;
+		    T68 = T5D - T5A;
+		    {
+			 E T64, T6d, T6f, T69;
+			 T64 = T60 * T63;
+			 T6d = T6b * T6c;
+			 T6f = FNMS(KP980785280, T68, T67);
+			 T69 = FMA(KP980785280, T68, T67);
+			 {
+			      E T5F, T5S, T6a, T6g;
+			      T5F = FMA(KP980785280, T5E, T5x);
+			      T5S = FNMS(KP980785280, T5E, T5x);
+			      T6a = T60 * T69;
+			      cr[WS(rs, 15)] = FNMS(T66, T69, T64);
+			      T6g = T6b * T6f;
+			      cr[WS(rs, 31)] = FNMS(T6e, T6f, T6d);
+			      {
+				   E T5W, T5T, T5Q, T5G;
+				   T5W = T5U * T5S;
+				   T5T = T5R * T5S;
+				   T5Q = T5I * T5F;
+				   T5G = T5u * T5F;
+				   ci[WS(rs, 15)] = FMA(T66, T63, T6a);
+				   ci[WS(rs, 31)] = FMA(T6e, T6c, T6g);
+				   ci[WS(rs, 7)] = FMA(T5R, T5V, T5W);
+				   cr[WS(rs, 7)] = FNMS(T5U, T5V, T5T);
+				   ci[WS(rs, 23)] = FMA(T5u, T5P, T5Q);
+				   cr[WS(rs, 23)] = FNMS(T5I, T5P, T5G);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hb2_32", twinstr, &GENUS, {236, 98, 252, 0} };
+
+void X(codelet_hb2_32) (planner *p) {
+     X(khc2hc_register) (p, hb2_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 32 -dif -name hb2_32 -include hb.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 160 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T11, T14, T12, T15, T17, T2z, T2B, T1c, T18, T1d, T1g, T1k, T2F, T2L, T3t;
+	       E T4H, T3h, T3V, T3b, T4v, T4T, T4X, T6t, T71, T6z, T75, T81, T8x, T8f, T8z;
+	       E T2R, T2V, T8p, T8t, T4r, T4t, T53, T69, T3n, T3r, T7P, T7T, T4P, T4R, T6F;
+	       E T6R, T1f, T2X, T1j, T2Y, T1l, T31, T2d, T2Z, T49, T4h, T4c, T4i, T4d, T4n;
+	       E T4f, T4j;
+	       {
+		    E T2P, T3q, T2U, T3l, T2Q, T3p, T2T, T3m, T2D, T3g, T2K, T39, T2E, T3f, T2J;
+		    E T3a;
+		    {
+			 E T13, T1b, T16, T1a;
+			 T11 = W[0];
+			 T14 = W[1];
+			 T12 = W[2];
+			 T15 = W[3];
+			 T13 = T11 * T12;
+			 T1b = T14 * T12;
+			 T16 = T14 * T15;
+			 T1a = T11 * T15;
+			 T17 = T13 + T16;
+			 T2z = T13 - T16;
+			 T2B = T1a + T1b;
+			 T1c = T1a - T1b;
+			 T18 = W[4];
+			 T2P = T12 * T18;
+			 T3q = T14 * T18;
+			 T2U = T15 * T18;
+			 T3l = T11 * T18;
+			 T1d = W[5];
+			 T2Q = T15 * T1d;
+			 T3p = T11 * T1d;
+			 T2T = T12 * T1d;
+			 T3m = T14 * T1d;
+			 T1g = W[6];
+			 T2D = T11 * T1g;
+			 T3g = T15 * T1g;
+			 T2K = T14 * T1g;
+			 T39 = T12 * T1g;
+			 T1k = W[7];
+			 T2E = T14 * T1k;
+			 T3f = T12 * T1k;
+			 T2J = T11 * T1k;
+			 T3a = T15 * T1k;
+		    }
+		    T2F = T2D - T2E;
+		    T2L = T2J + T2K;
+		    T3t = T39 - T3a;
+		    T4H = T2J - T2K;
+		    T3h = T3f - T3g;
+		    T3V = T3f + T3g;
+		    T3b = T39 + T3a;
+		    T4v = T2D + T2E;
+		    T4T = FMA(T18, T1g, T1d * T1k);
+		    T4X = FNMS(T1d, T1g, T18 * T1k);
+		    {
+			 E T6r, T6s, T6x, T6y;
+			 T6r = T17 * T1g;
+			 T6s = T1c * T1k;
+			 T6t = T6r - T6s;
+			 T71 = T6r + T6s;
+			 T6x = T17 * T1k;
+			 T6y = T1c * T1g;
+			 T6z = T6x + T6y;
+			 T75 = T6x - T6y;
+		    }
+		    {
+			 E T7Z, T80, T8d, T8e;
+			 T7Z = T2z * T1g;
+			 T80 = T2B * T1k;
+			 T81 = T7Z + T80;
+			 T8x = T7Z - T80;
+			 T8d = T2z * T1k;
+			 T8e = T2B * T1g;
+			 T8f = T8d - T8e;
+			 T8z = T8d + T8e;
+			 T2R = T2P - T2Q;
+			 T2V = T2T + T2U;
+			 T8p = FMA(T2R, T1g, T2V * T1k);
+			 T8t = FNMS(T2V, T1g, T2R * T1k);
+		    }
+		    T4r = T2P + T2Q;
+		    T4t = T2T - T2U;
+		    T53 = FMA(T4r, T1g, T4t * T1k);
+		    T69 = FNMS(T4t, T1g, T4r * T1k);
+		    T3n = T3l + T3m;
+		    T3r = T3p - T3q;
+		    T7P = FMA(T3n, T1g, T3r * T1k);
+		    T7T = FNMS(T3r, T1g, T3n * T1k);
+		    T4P = T3l - T3m;
+		    T4R = T3p + T3q;
+		    T6F = FMA(T4P, T1g, T4R * T1k);
+		    T6R = FNMS(T4R, T1g, T4P * T1k);
+		    {
+			 E T19, T1e, T1h, T1i;
+			 T19 = T17 * T18;
+			 T1e = T1c * T1d;
+			 T1f = T19 + T1e;
+			 T2X = T19 - T1e;
+			 T1h = T17 * T1d;
+			 T1i = T1c * T18;
+			 T1j = T1h - T1i;
+			 T2Y = T1h + T1i;
+		    }
+		    T1l = FMA(T1f, T1g, T1j * T1k);
+		    T31 = FNMS(T2Y, T1g, T2X * T1k);
+		    T2d = FNMS(T1j, T1g, T1f * T1k);
+		    T2Z = FMA(T2X, T1g, T2Y * T1k);
+		    {
+			 E T47, T48, T4a, T4b;
+			 T47 = T2z * T18;
+			 T48 = T2B * T1d;
+			 T49 = T47 - T48;
+			 T4h = T47 + T48;
+			 T4a = T2z * T1d;
+			 T4b = T2B * T18;
+			 T4c = T4a + T4b;
+			 T4i = T4a - T4b;
+		    }
+		    T4d = FMA(T49, T1g, T4c * T1k);
+		    T4n = FNMS(T4i, T1g, T4h * T1k);
+		    T4f = FNMS(T4c, T1g, T49 * T1k);
+		    T4j = FMA(T4h, T1g, T4i * T1k);
+	       }
+	       {
+		    E T56, T7b, T7C, T6c, Tf, T1m, T6f, T7c, T3Y, T4I, T2t, T32, T5d, T7D, T3w;
+		    E T4w, Tu, T2e, T7g, T7F, T7j, T7G, T1B, T33, T3z, T40, T5l, T6i, T5s, T6h;
+		    E T3C, T3Z, TK, T1D, T7v, T86, T7y, T85, T1S, T35, T3O, T4C, T5F, T6J, T5M;
+		    E T6K, T3R, T4D, TZ, T1U, T7o, T89, T7r, T88, T29, T36, T3H, T4z, T5Y, T6M;
+		    E T65, T6N, T3K, T4A;
+		    {
+			 E T3, T54, T2o, T58, T2r, T5b, T6, T6a, Ta, T57, T2h, T6b, T2k, T55, Td;
+			 E T5a;
+			 {
+			      E T1, T2, T2m, T2n;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 15)];
+			      T3 = T1 + T2;
+			      T54 = T1 - T2;
+			      T2m = ci[WS(rs, 27)];
+			      T2n = cr[WS(rs, 20)];
+			      T2o = T2m - T2n;
+			      T58 = T2m + T2n;
+			 }
+			 {
+			      E T2p, T2q, T4, T5;
+			      T2p = ci[WS(rs, 19)];
+			      T2q = cr[WS(rs, 28)];
+			      T2r = T2p - T2q;
+			      T5b = T2p + T2q;
+			      T4 = cr[WS(rs, 8)];
+			      T5 = ci[WS(rs, 7)];
+			      T6 = T4 + T5;
+			      T6a = T4 - T5;
+			 }
+			 {
+			      E T8, T9, T2f, T2g;
+			      T8 = cr[WS(rs, 4)];
+			      T9 = ci[WS(rs, 11)];
+			      Ta = T8 + T9;
+			      T57 = T8 - T9;
+			      T2f = ci[WS(rs, 31)];
+			      T2g = cr[WS(rs, 16)];
+			      T2h = T2f - T2g;
+			      T6b = T2f + T2g;
+			 }
+			 {
+			      E T2i, T2j, Tb, Tc;
+			      T2i = ci[WS(rs, 23)];
+			      T2j = cr[WS(rs, 24)];
+			      T2k = T2i - T2j;
+			      T55 = T2i + T2j;
+			      Tb = ci[WS(rs, 3)];
+			      Tc = cr[WS(rs, 12)];
+			      Td = Tb + Tc;
+			      T5a = Tb - Tc;
+			 }
+			 {
+			      E T7, Te, T2l, T2s;
+			      T56 = T54 - T55;
+			      T7b = T54 + T55;
+			      T7C = T6b - T6a;
+			      T6c = T6a + T6b;
+			      T7 = T3 + T6;
+			      Te = Ta + Td;
+			      Tf = T7 + Te;
+			      T1m = T7 - Te;
+			      {
+				   E T6d, T6e, T3W, T3X;
+				   T6d = T57 + T58;
+				   T6e = T5a + T5b;
+				   T6f = KP707106781 * (T6d - T6e);
+				   T7c = KP707106781 * (T6d + T6e);
+				   T3W = T2h - T2k;
+				   T3X = Ta - Td;
+				   T3Y = T3W - T3X;
+				   T4I = T3X + T3W;
+			      }
+			      T2l = T2h + T2k;
+			      T2s = T2o + T2r;
+			      T2t = T2l - T2s;
+			      T32 = T2l + T2s;
+			      {
+				   E T59, T5c, T3u, T3v;
+				   T59 = T57 - T58;
+				   T5c = T5a - T5b;
+				   T5d = KP707106781 * (T59 + T5c);
+				   T7D = KP707106781 * (T59 - T5c);
+				   T3u = T3 - T6;
+				   T3v = T2r - T2o;
+				   T3w = T3u - T3v;
+				   T4w = T3u + T3v;
+			      }
+			 }
+		    }
+		    {
+			 E Ti, T5p, T1w, T5n, T1z, T5q, Tl, T5m, Tp, T5i, T1p, T5g, T1s, T5j, Ts;
+			 E T5f;
+			 {
+			      E Tg, Th, T1u, T1v;
+			      Tg = cr[WS(rs, 2)];
+			      Th = ci[WS(rs, 13)];
+			      Ti = Tg + Th;
+			      T5p = Tg - Th;
+			      T1u = ci[WS(rs, 29)];
+			      T1v = cr[WS(rs, 18)];
+			      T1w = T1u - T1v;
+			      T5n = T1u + T1v;
+			 }
+			 {
+			      E T1x, T1y, Tj, Tk;
+			      T1x = ci[WS(rs, 21)];
+			      T1y = cr[WS(rs, 26)];
+			      T1z = T1x - T1y;
+			      T5q = T1x + T1y;
+			      Tj = cr[WS(rs, 10)];
+			      Tk = ci[WS(rs, 5)];
+			      Tl = Tj + Tk;
+			      T5m = Tj - Tk;
+			 }
+			 {
+			      E Tn, To, T1n, T1o;
+			      Tn = ci[WS(rs, 1)];
+			      To = cr[WS(rs, 14)];
+			      Tp = Tn + To;
+			      T5i = Tn - To;
+			      T1n = ci[WS(rs, 17)];
+			      T1o = cr[WS(rs, 30)];
+			      T1p = T1n - T1o;
+			      T5g = T1n + T1o;
+			 }
+			 {
+			      E T1q, T1r, Tq, Tr;
+			      T1q = ci[WS(rs, 25)];
+			      T1r = cr[WS(rs, 22)];
+			      T1s = T1q - T1r;
+			      T5j = T1q + T1r;
+			      Tq = cr[WS(rs, 6)];
+			      Tr = ci[WS(rs, 9)];
+			      Ts = Tq + Tr;
+			      T5f = Tq - Tr;
+			 }
+			 {
+			      E Tm, Tt, T7e, T7f;
+			      Tm = Ti + Tl;
+			      Tt = Tp + Ts;
+			      Tu = Tm + Tt;
+			      T2e = Tm - Tt;
+			      T7e = T5p + T5q;
+			      T7f = T5n - T5m;
+			      T7g = FNMS(KP923879532, T7f, KP382683432 * T7e);
+			      T7F = FMA(KP382683432, T7f, KP923879532 * T7e);
+			 }
+			 {
+			      E T7h, T7i, T1t, T1A;
+			      T7h = T5i + T5j;
+			      T7i = T5f + T5g;
+			      T7j = FNMS(KP923879532, T7i, KP382683432 * T7h);
+			      T7G = FMA(KP382683432, T7i, KP923879532 * T7h);
+			      T1t = T1p + T1s;
+			      T1A = T1w + T1z;
+			      T1B = T1t - T1A;
+			      T33 = T1A + T1t;
+			 }
+			 {
+			      E T3x, T3y, T5h, T5k;
+			      T3x = T1p - T1s;
+			      T3y = Tp - Ts;
+			      T3z = T3x - T3y;
+			      T40 = T3y + T3x;
+			      T5h = T5f - T5g;
+			      T5k = T5i - T5j;
+			      T5l = FNMS(KP382683432, T5k, KP923879532 * T5h);
+			      T6i = FMA(KP382683432, T5h, KP923879532 * T5k);
+			 }
+			 {
+			      E T5o, T5r, T3A, T3B;
+			      T5o = T5m + T5n;
+			      T5r = T5p - T5q;
+			      T5s = FMA(KP923879532, T5o, KP382683432 * T5r);
+			      T6h = FNMS(KP382683432, T5o, KP923879532 * T5r);
+			      T3A = Ti - Tl;
+			      T3B = T1w - T1z;
+			      T3C = T3A + T3B;
+			      T3Z = T3A - T3B;
+			 }
+		    }
+		    {
+			 E Ty, T5v, TB, T5G, T1J, T5w, T1G, T5H, TI, T5K, T1Q, T5D, TF, T5J, T1N;
+			 E T5A;
+			 {
+			      E Tw, Tx, T1E, T1F;
+			      Tw = cr[WS(rs, 1)];
+			      Tx = ci[WS(rs, 14)];
+			      Ty = Tw + Tx;
+			      T5v = Tw - Tx;
+			      {
+				   E Tz, TA, T1H, T1I;
+				   Tz = cr[WS(rs, 9)];
+				   TA = ci[WS(rs, 6)];
+				   TB = Tz + TA;
+				   T5G = Tz - TA;
+				   T1H = ci[WS(rs, 22)];
+				   T1I = cr[WS(rs, 25)];
+				   T1J = T1H - T1I;
+				   T5w = T1H + T1I;
+			      }
+			      T1E = ci[WS(rs, 30)];
+			      T1F = cr[WS(rs, 17)];
+			      T1G = T1E - T1F;
+			      T5H = T1E + T1F;
+			      {
+				   E TG, TH, T5B, T1O, T1P, T5C;
+				   TG = ci[WS(rs, 2)];
+				   TH = cr[WS(rs, 13)];
+				   T5B = TG - TH;
+				   T1O = ci[WS(rs, 18)];
+				   T1P = cr[WS(rs, 29)];
+				   T5C = T1O + T1P;
+				   TI = TG + TH;
+				   T5K = T5B + T5C;
+				   T1Q = T1O - T1P;
+				   T5D = T5B - T5C;
+			      }
+			      {
+				   E TD, TE, T5y, T1L, T1M, T5z;
+				   TD = cr[WS(rs, 5)];
+				   TE = ci[WS(rs, 10)];
+				   T5y = TD - TE;
+				   T1L = ci[WS(rs, 26)];
+				   T1M = cr[WS(rs, 21)];
+				   T5z = T1L + T1M;
+				   TF = TD + TE;
+				   T5J = T5y + T5z;
+				   T1N = T1L - T1M;
+				   T5A = T5y - T5z;
+			      }
+			 }
+			 {
+			      E TC, TJ, T7t, T7u;
+			      TC = Ty + TB;
+			      TJ = TF + TI;
+			      TK = TC + TJ;
+			      T1D = TC - TJ;
+			      T7t = T5H - T5G;
+			      T7u = KP707106781 * (T5A - T5D);
+			      T7v = T7t + T7u;
+			      T86 = T7t - T7u;
+			 }
+			 {
+			      E T7w, T7x, T1K, T1R;
+			      T7w = T5v + T5w;
+			      T7x = KP707106781 * (T5J + T5K);
+			      T7y = T7w - T7x;
+			      T85 = T7w + T7x;
+			      T1K = T1G + T1J;
+			      T1R = T1N + T1Q;
+			      T1S = T1K - T1R;
+			      T35 = T1K + T1R;
+			 }
+			 {
+			      E T3M, T3N, T5x, T5E;
+			      T3M = T1G - T1J;
+			      T3N = TF - TI;
+			      T3O = T3M - T3N;
+			      T4C = T3N + T3M;
+			      T5x = T5v - T5w;
+			      T5E = KP707106781 * (T5A + T5D);
+			      T5F = T5x - T5E;
+			      T6J = T5x + T5E;
+			 }
+			 {
+			      E T5I, T5L, T3P, T3Q;
+			      T5I = T5G + T5H;
+			      T5L = KP707106781 * (T5J - T5K);
+			      T5M = T5I - T5L;
+			      T6K = T5I + T5L;
+			      T3P = Ty - TB;
+			      T3Q = T1Q - T1N;
+			      T3R = T3P - T3Q;
+			      T4D = T3P + T3Q;
+			 }
+		    }
+		    {
+			 E TN, T5O, TQ, T5Z, T20, T5P, T1X, T60, TX, T63, T27, T5W, TU, T62, T24;
+			 E T5T;
+			 {
+			      E TL, TM, T1V, T1W;
+			      TL = ci[0];
+			      TM = cr[WS(rs, 15)];
+			      TN = TL + TM;
+			      T5O = TL - TM;
+			      {
+				   E TO, TP, T1Y, T1Z;
+				   TO = cr[WS(rs, 7)];
+				   TP = ci[WS(rs, 8)];
+				   TQ = TO + TP;
+				   T5Z = TO - TP;
+				   T1Y = ci[WS(rs, 24)];
+				   T1Z = cr[WS(rs, 23)];
+				   T20 = T1Y - T1Z;
+				   T5P = T1Y + T1Z;
+			      }
+			      T1V = ci[WS(rs, 16)];
+			      T1W = cr[WS(rs, 31)];
+			      T1X = T1V - T1W;
+			      T60 = T1V + T1W;
+			      {
+				   E TV, TW, T5U, T25, T26, T5V;
+				   TV = ci[WS(rs, 4)];
+				   TW = cr[WS(rs, 11)];
+				   T5U = TV - TW;
+				   T25 = ci[WS(rs, 20)];
+				   T26 = cr[WS(rs, 27)];
+				   T5V = T25 + T26;
+				   TX = TV + TW;
+				   T63 = T5U + T5V;
+				   T27 = T25 - T26;
+				   T5W = T5U - T5V;
+			      }
+			      {
+				   E TS, TT, T5R, T22, T23, T5S;
+				   TS = cr[WS(rs, 3)];
+				   TT = ci[WS(rs, 12)];
+				   T5R = TS - TT;
+				   T22 = ci[WS(rs, 28)];
+				   T23 = cr[WS(rs, 19)];
+				   T5S = T22 + T23;
+				   TU = TS + TT;
+				   T62 = T5R + T5S;
+				   T24 = T22 - T23;
+				   T5T = T5R - T5S;
+			      }
+			 }
+			 {
+			      E TR, TY, T7m, T7n;
+			      TR = TN + TQ;
+			      TY = TU + TX;
+			      TZ = TR + TY;
+			      T1U = TR - TY;
+			      T7m = KP707106781 * (T5T - T5W);
+			      T7n = T5Z + T60;
+			      T7o = T7m - T7n;
+			      T89 = T7n + T7m;
+			 }
+			 {
+			      E T7p, T7q, T21, T28;
+			      T7p = T5O + T5P;
+			      T7q = KP707106781 * (T62 + T63);
+			      T7r = T7p - T7q;
+			      T88 = T7p + T7q;
+			      T21 = T1X + T20;
+			      T28 = T24 + T27;
+			      T29 = T21 - T28;
+			      T36 = T21 + T28;
+			 }
+			 {
+			      E T3F, T3G, T5Q, T5X;
+			      T3F = T1X - T20;
+			      T3G = TU - TX;
+			      T3H = T3F - T3G;
+			      T4z = T3G + T3F;
+			      T5Q = T5O - T5P;
+			      T5X = KP707106781 * (T5T + T5W);
+			      T5Y = T5Q - T5X;
+			      T6M = T5Q + T5X;
+			 }
+			 {
+			      E T61, T64, T3I, T3J;
+			      T61 = T5Z - T60;
+			      T64 = KP707106781 * (T62 - T63);
+			      T65 = T61 - T64;
+			      T6N = T61 + T64;
+			      T3I = TN - TQ;
+			      T3J = T27 - T24;
+			      T3K = T3I - T3J;
+			      T4A = T3I + T3J;
+			 }
+		    }
+		    {
+			 E Tv, T10, T30, T34, T37, T38;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T30 = Tv - T10;
+			 T34 = T32 + T33;
+			 T37 = T35 + T36;
+			 T38 = T34 - T37;
+			 cr[0] = Tv + T10;
+			 ci[0] = T34 + T37;
+			 cr[WS(rs, 16)] = FNMS(T31, T38, T2Z * T30);
+			 ci[WS(rs, 16)] = FMA(T31, T30, T2Z * T38);
+		    }
+		    {
+			 E T3e, T3o, T3k, T3s;
+			 {
+			      E T3c, T3d, T3i, T3j;
+			      T3c = Tf - Tu;
+			      T3d = T36 - T35;
+			      T3e = T3c - T3d;
+			      T3o = T3c + T3d;
+			      T3i = T32 - T33;
+			      T3j = TK - TZ;
+			      T3k = T3i - T3j;
+			      T3s = T3j + T3i;
+			 }
+			 cr[WS(rs, 24)] = FNMS(T3h, T3k, T3b * T3e);
+			 ci[WS(rs, 24)] = FMA(T3b, T3k, T3h * T3e);
+			 cr[WS(rs, 8)] = FNMS(T3r, T3s, T3n * T3o);
+			 ci[WS(rs, 8)] = FMA(T3n, T3s, T3r * T3o);
+		    }
+		    {
+			 E T1C, T2u, T2M, T2G, T2x, T2H, T2b, T2N;
+			 T1C = T1m + T1B;
+			 T2u = T2e + T2t;
+			 T2M = T2t - T2e;
+			 T2G = T1m - T1B;
+			 {
+			      E T2v, T2w, T1T, T2a;
+			      T2v = T1D + T1S;
+			      T2w = T29 - T1U;
+			      T2x = KP707106781 * (T2v + T2w);
+			      T2H = KP707106781 * (T2w - T2v);
+			      T1T = T1D - T1S;
+			      T2a = T1U + T29;
+			      T2b = KP707106781 * (T1T + T2a);
+			      T2N = KP707106781 * (T1T - T2a);
+			 }
+			 {
+			      E T2c, T2y, T2S, T2W;
+			      T2c = T1C - T2b;
+			      T2y = T2u - T2x;
+			      cr[WS(rs, 20)] = FNMS(T2d, T2y, T1l * T2c);
+			      ci[WS(rs, 20)] = FMA(T2d, T2c, T1l * T2y);
+			      T2S = T2G + T2H;
+			      T2W = T2M + T2N;
+			      cr[WS(rs, 12)] = FNMS(T2V, T2W, T2R * T2S);
+			      ci[WS(rs, 12)] = FMA(T2R, T2W, T2V * T2S);
+			 }
+			 {
+			      E T2A, T2C, T2I, T2O;
+			      T2A = T1C + T2b;
+			      T2C = T2u + T2x;
+			      cr[WS(rs, 4)] = FNMS(T2B, T2C, T2z * T2A);
+			      ci[WS(rs, 4)] = FMA(T2B, T2A, T2z * T2C);
+			      T2I = T2G - T2H;
+			      T2O = T2M - T2N;
+			      cr[WS(rs, 28)] = FNMS(T2L, T2O, T2F * T2I);
+			      ci[WS(rs, 28)] = FMA(T2F, T2O, T2L * T2I);
+			 }
+		    }
+		    {
+			 E T4y, T4U, T4K, T4Y, T4F, T4Z, T4N, T4V, T4x, T4J;
+			 T4x = KP707106781 * (T3Z + T40);
+			 T4y = T4w - T4x;
+			 T4U = T4w + T4x;
+			 T4J = KP707106781 * (T3C + T3z);
+			 T4K = T4I - T4J;
+			 T4Y = T4I + T4J;
+			 {
+			      E T4B, T4E, T4L, T4M;
+			      T4B = FNMS(KP382683432, T4A, KP923879532 * T4z);
+			      T4E = FMA(KP923879532, T4C, KP382683432 * T4D);
+			      T4F = T4B - T4E;
+			      T4Z = T4E + T4B;
+			      T4L = FNMS(KP382683432, T4C, KP923879532 * T4D);
+			      T4M = FMA(KP382683432, T4z, KP923879532 * T4A);
+			      T4N = T4L - T4M;
+			      T4V = T4L + T4M;
+			 }
+			 {
+			      E T4G, T4O, T51, T52;
+			      T4G = T4y - T4F;
+			      T4O = T4K - T4N;
+			      cr[WS(rs, 26)] = FNMS(T4H, T4O, T4v * T4G);
+			      ci[WS(rs, 26)] = FMA(T4H, T4G, T4v * T4O);
+			      T51 = T4U + T4V;
+			      T52 = T4Y + T4Z;
+			      cr[WS(rs, 2)] = FNMS(T1c, T52, T17 * T51);
+			      ci[WS(rs, 2)] = FMA(T17, T52, T1c * T51);
+			 }
+			 {
+			      E T4Q, T4S, T4W, T50;
+			      T4Q = T4y + T4F;
+			      T4S = T4K + T4N;
+			      cr[WS(rs, 10)] = FNMS(T4R, T4S, T4P * T4Q);
+			      ci[WS(rs, 10)] = FMA(T4R, T4Q, T4P * T4S);
+			      T4W = T4U - T4V;
+			      T50 = T4Y - T4Z;
+			      cr[WS(rs, 18)] = FNMS(T4X, T50, T4T * T4W);
+			      ci[WS(rs, 18)] = FMA(T4T, T50, T4X * T4W);
+			 }
+		    }
+		    {
+			 E T3E, T4k, T42, T4o, T3T, T4p, T45, T4l, T3D, T41;
+			 T3D = KP707106781 * (T3z - T3C);
+			 T3E = T3w - T3D;
+			 T4k = T3w + T3D;
+			 T41 = KP707106781 * (T3Z - T40);
+			 T42 = T3Y - T41;
+			 T4o = T3Y + T41;
+			 {
+			      E T3L, T3S, T43, T44;
+			      T3L = FNMS(KP923879532, T3K, KP382683432 * T3H);
+			      T3S = FMA(KP382683432, T3O, KP923879532 * T3R);
+			      T3T = T3L - T3S;
+			      T4p = T3S + T3L;
+			      T43 = FNMS(KP923879532, T3O, KP382683432 * T3R);
+			      T44 = FMA(KP923879532, T3H, KP382683432 * T3K);
+			      T45 = T43 - T44;
+			      T4l = T43 + T44;
+			 }
+			 {
+			      E T3U, T46, T4s, T4u;
+			      T3U = T3E - T3T;
+			      T46 = T42 - T45;
+			      cr[WS(rs, 30)] = FNMS(T3V, T46, T3t * T3U);
+			      ci[WS(rs, 30)] = FMA(T3V, T3U, T3t * T46);
+			      T4s = T4k + T4l;
+			      T4u = T4o + T4p;
+			      cr[WS(rs, 6)] = FNMS(T4t, T4u, T4r * T4s);
+			      ci[WS(rs, 6)] = FMA(T4r, T4u, T4t * T4s);
+			 }
+			 {
+			      E T4e, T4g, T4m, T4q;
+			      T4e = T3E + T3T;
+			      T4g = T42 + T45;
+			      cr[WS(rs, 14)] = FNMS(T4f, T4g, T4d * T4e);
+			      ci[WS(rs, 14)] = FMA(T4f, T4e, T4d * T4g);
+			      T4m = T4k - T4l;
+			      T4q = T4o - T4p;
+			      cr[WS(rs, 22)] = FNMS(T4n, T4q, T4j * T4m);
+			      ci[WS(rs, 22)] = FMA(T4j, T4q, T4n * T4m);
+			 }
+		    }
+		    {
+			 E T6I, T72, T6X, T73, T6P, T77, T6U, T76;
+			 {
+			      E T6G, T6H, T6V, T6W;
+			      T6G = T56 + T5d;
+			      T6H = T6h + T6i;
+			      T6I = T6G + T6H;
+			      T72 = T6G - T6H;
+			      T6V = FMA(KP195090322, T6J, KP980785280 * T6K);
+			      T6W = FNMS(KP195090322, T6M, KP980785280 * T6N);
+			      T6X = T6V + T6W;
+			      T73 = T6W - T6V;
+			 }
+			 {
+			      E T6L, T6O, T6S, T6T;
+			      T6L = FNMS(KP195090322, T6K, KP980785280 * T6J);
+			      T6O = FMA(KP980785280, T6M, KP195090322 * T6N);
+			      T6P = T6L + T6O;
+			      T77 = T6L - T6O;
+			      T6S = T6c + T6f;
+			      T6T = T5s + T5l;
+			      T6U = T6S + T6T;
+			      T76 = T6S - T6T;
+			 }
+			 {
+			      E T6Q, T6Y, T79, T7a;
+			      T6Q = T6I - T6P;
+			      T6Y = T6U - T6X;
+			      cr[WS(rs, 17)] = FNMS(T6R, T6Y, T6F * T6Q);
+			      ci[WS(rs, 17)] = FMA(T6R, T6Q, T6F * T6Y);
+			      T79 = T72 + T73;
+			      T7a = T76 + T77;
+			      cr[WS(rs, 9)] = FNMS(T1d, T7a, T18 * T79);
+			      ci[WS(rs, 9)] = FMA(T18, T7a, T1d * T79);
+			 }
+			 {
+			      E T6Z, T70, T74, T78;
+			      T6Z = T6I + T6P;
+			      T70 = T6U + T6X;
+			      cr[WS(rs, 1)] = FNMS(T14, T70, T11 * T6Z);
+			      ci[WS(rs, 1)] = FMA(T14, T6Z, T11 * T70);
+			      T74 = T72 - T73;
+			      T78 = T76 - T77;
+			      cr[WS(rs, 25)] = FNMS(T75, T78, T71 * T74);
+			      ci[WS(rs, 25)] = FMA(T71, T78, T75 * T74);
+			 }
+		    }
+		    {
+			 E T84, T8q, T8l, T8r, T8b, T8v, T8i, T8u;
+			 {
+			      E T82, T83, T8j, T8k;
+			      T82 = T7b + T7c;
+			      T83 = T7F + T7G;
+			      T84 = T82 - T83;
+			      T8q = T82 + T83;
+			      T8j = FMA(KP195090322, T86, KP980785280 * T85);
+			      T8k = FMA(KP195090322, T89, KP980785280 * T88);
+			      T8l = T8j - T8k;
+			      T8r = T8j + T8k;
+			 }
+			 {
+			      E T87, T8a, T8g, T8h;
+			      T87 = FNMS(KP980785280, T86, KP195090322 * T85);
+			      T8a = FNMS(KP980785280, T89, KP195090322 * T88);
+			      T8b = T87 + T8a;
+			      T8v = T87 - T8a;
+			      T8g = T7C - T7D;
+			      T8h = T7g - T7j;
+			      T8i = T8g + T8h;
+			      T8u = T8g - T8h;
+			 }
+			 {
+			      E T8c, T8m, T8y, T8A;
+			      T8c = T84 - T8b;
+			      T8m = T8i - T8l;
+			      cr[WS(rs, 23)] = FNMS(T8f, T8m, T81 * T8c);
+			      ci[WS(rs, 23)] = FMA(T8f, T8c, T81 * T8m);
+			      T8y = T8q + T8r;
+			      T8A = T8u - T8v;
+			      cr[WS(rs, 31)] = FNMS(T8z, T8A, T8x * T8y);
+			      ci[WS(rs, 31)] = FMA(T8x, T8A, T8z * T8y);
+			 }
+			 {
+			      E T8n, T8o, T8s, T8w;
+			      T8n = T84 + T8b;
+			      T8o = T8i + T8l;
+			      cr[WS(rs, 7)] = FNMS(T1j, T8o, T1f * T8n);
+			      ci[WS(rs, 7)] = FMA(T1j, T8n, T1f * T8o);
+			      T8s = T8q - T8r;
+			      T8w = T8u + T8v;
+			      cr[WS(rs, 15)] = FNMS(T8t, T8w, T8p * T8s);
+			      ci[WS(rs, 15)] = FMA(T8p, T8w, T8t * T8s);
+			 }
+		    }
+		    {
+			 E T5u, T6u, T6n, T6v, T67, T6B, T6k, T6A;
+			 {
+			      E T5e, T5t, T6l, T6m;
+			      T5e = T56 - T5d;
+			      T5t = T5l - T5s;
+			      T5u = T5e + T5t;
+			      T6u = T5e - T5t;
+			      T6l = FMA(KP831469612, T5F, KP555570233 * T5M);
+			      T6m = FNMS(KP831469612, T5Y, KP555570233 * T65);
+			      T6n = T6l + T6m;
+			      T6v = T6m - T6l;
+			 }
+			 {
+			      E T5N, T66, T6g, T6j;
+			      T5N = FNMS(KP831469612, T5M, KP555570233 * T5F);
+			      T66 = FMA(KP555570233, T5Y, KP831469612 * T65);
+			      T67 = T5N + T66;
+			      T6B = T5N - T66;
+			      T6g = T6c - T6f;
+			      T6j = T6h - T6i;
+			      T6k = T6g + T6j;
+			      T6A = T6g - T6j;
+			 }
+			 {
+			      E T68, T6o, T6D, T6E;
+			      T68 = T5u - T67;
+			      T6o = T6k - T6n;
+			      cr[WS(rs, 21)] = FNMS(T69, T6o, T53 * T68);
+			      ci[WS(rs, 21)] = FMA(T69, T68, T53 * T6o);
+			      T6D = T6u + T6v;
+			      T6E = T6A + T6B;
+			      cr[WS(rs, 13)] = FNMS(T4c, T6E, T49 * T6D);
+			      ci[WS(rs, 13)] = FMA(T49, T6E, T4c * T6D);
+			 }
+			 {
+			      E T6p, T6q, T6w, T6C;
+			      T6p = T5u + T67;
+			      T6q = T6k + T6n;
+			      cr[WS(rs, 5)] = FNMS(T4i, T6q, T4h * T6p);
+			      ci[WS(rs, 5)] = FMA(T4i, T6p, T4h * T6q);
+			      T6w = T6u - T6v;
+			      T6C = T6A - T6B;
+			      cr[WS(rs, 29)] = FNMS(T6z, T6C, T6t * T6w);
+			      ci[WS(rs, 29)] = FMA(T6t, T6C, T6z * T6w);
+			 }
+		    }
+		    {
+			 E T7l, T7Q, T7L, T7R, T7A, T7V, T7I, T7U;
+			 {
+			      E T7d, T7k, T7J, T7K;
+			      T7d = T7b - T7c;
+			      T7k = T7g + T7j;
+			      T7l = T7d - T7k;
+			      T7Q = T7d + T7k;
+			      T7J = FNMS(KP555570233, T7v, KP831469612 * T7y);
+			      T7K = FMA(KP555570233, T7o, KP831469612 * T7r);
+			      T7L = T7J - T7K;
+			      T7R = T7J + T7K;
+			 }
+			 {
+			      E T7s, T7z, T7E, T7H;
+			      T7s = FNMS(KP555570233, T7r, KP831469612 * T7o);
+			      T7z = FMA(KP831469612, T7v, KP555570233 * T7y);
+			      T7A = T7s - T7z;
+			      T7V = T7z + T7s;
+			      T7E = T7C + T7D;
+			      T7H = T7F - T7G;
+			      T7I = T7E - T7H;
+			      T7U = T7E + T7H;
+			 }
+			 {
+			      E T7B, T7M, T7X, T7Y;
+			      T7B = T7l - T7A;
+			      T7M = T7I - T7L;
+			      cr[WS(rs, 27)] = FNMS(T1k, T7M, T1g * T7B);
+			      ci[WS(rs, 27)] = FMA(T1k, T7B, T1g * T7M);
+			      T7X = T7Q + T7R;
+			      T7Y = T7U + T7V;
+			      cr[WS(rs, 3)] = FNMS(T15, T7Y, T12 * T7X);
+			      ci[WS(rs, 3)] = FMA(T12, T7Y, T15 * T7X);
+			 }
+			 {
+			      E T7N, T7O, T7S, T7W;
+			      T7N = T7l + T7A;
+			      T7O = T7I + T7L;
+			      cr[WS(rs, 11)] = FNMS(T2Y, T7O, T2X * T7N);
+			      ci[WS(rs, 11)] = FMA(T2Y, T7N, T2X * T7O);
+			      T7S = T7Q - T7R;
+			      T7W = T7U - T7V;
+			      cr[WS(rs, 19)] = FNMS(T7T, T7W, T7P * T7S);
+			      ci[WS(rs, 19)] = FMA(T7P, T7W, T7T * T7S);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hb2_32", twinstr, &GENUS, {376, 168, 112, 0} };
+
+void X(codelet_hb2_32) (planner *p) {
+     X(khc2hc_register) (p, hb2_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:23 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include hb.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 33 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E Tg, Tc, Te, To, Tn;
+	       {
+		    E T7, Tb, T8, Ta;
+		    T7 = W[0];
+		    Tb = W[3];
+		    T8 = W[2];
+		    Ta = W[1];
+		    {
+			 E Tj, Tm, T3, T6, Tx, Tr, Tz, Tv, Td;
+			 {
+			      E Tu, T4, Tq, T5, Tp, Tt;
+			      {
+				   E Tk, Tl, T1, T2;
+				   {
+					E Th, Tf, T9, Ti;
+					Th = ci[WS(rs, 3)];
+					Tf = T7 * Tb;
+					T9 = T7 * T8;
+					Ti = cr[WS(rs, 2)];
+					Tk = ci[WS(rs, 2)];
+					Tg = FNMS(Ta, T8, Tf);
+					Tc = FMA(Ta, Tb, T9);
+					Tu = Th + Ti;
+					Tj = Th - Ti;
+					Tl = cr[WS(rs, 3)];
+				   }
+				   T1 = cr[0];
+				   T2 = ci[WS(rs, 1)];
+				   T4 = cr[WS(rs, 1)];
+				   Tm = Tk - Tl;
+				   Tq = Tk + Tl;
+				   T5 = ci[0];
+				   T3 = T1 + T2;
+				   Tp = T1 - T2;
+			      }
+			      Tt = T4 - T5;
+			      T6 = T4 + T5;
+			      Tx = Tp + Tq;
+			      Tr = Tp - Tq;
+			      Tz = Tu - Tt;
+			      Tv = Tt + Tu;
+			      Td = T3 - T6;
+			 }
+			 {
+			      E Ts, Tw, TA, Ty;
+			      cr[0] = T3 + T6;
+			      Ts = T7 * Tr;
+			      ci[0] = Tj + Tm;
+			      Tw = T7 * Tv;
+			      TA = T8 * Tz;
+			      cr[WS(rs, 1)] = FNMS(Ta, Tv, Ts);
+			      Ty = T8 * Tx;
+			      ci[WS(rs, 1)] = FMA(Ta, Tr, Tw);
+			      ci[WS(rs, 3)] = FMA(Tb, Tx, TA);
+			      Te = Tc * Td;
+			      cr[WS(rs, 3)] = FNMS(Tb, Tz, Ty);
+			      To = Tg * Td;
+			      Tn = Tj - Tm;
+			 }
+		    }
+	       }
+	       ci[WS(rs, 2)] = FMA(Tc, Tn, To);
+	       cr[WS(rs, 2)] = FNMS(Tg, Tn, Te);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hb2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hb2_4) (planner *p) {
+     X(khc2hc_register) (p, hb2_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include hb.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T7, T9, T8, Ta, Tb, Td;
+	       T7 = W[0];
+	       T9 = W[1];
+	       T8 = W[2];
+	       Ta = W[3];
+	       Tb = FMA(T7, T8, T9 * Ta);
+	       Td = FNMS(T9, T8, T7 * Ta);
+	       {
+		    E T3, Tl, T6, To, Tg, Tp, Tj, Tm, Tc, Tk;
+		    {
+			 E T1, T2, T4, T5;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 1)];
+			 T3 = T1 + T2;
+			 Tl = T1 - T2;
+			 T4 = cr[WS(rs, 1)];
+			 T5 = ci[0];
+			 T6 = T4 + T5;
+			 To = T4 - T5;
+		    }
+		    {
+			 E Te, Tf, Th, Ti;
+			 Te = ci[WS(rs, 3)];
+			 Tf = cr[WS(rs, 2)];
+			 Tg = Te - Tf;
+			 Tp = Te + Tf;
+			 Th = ci[WS(rs, 2)];
+			 Ti = cr[WS(rs, 3)];
+			 Tj = Th - Ti;
+			 Tm = Th + Ti;
+		    }
+		    cr[0] = T3 + T6;
+		    ci[0] = Tg + Tj;
+		    Tc = T3 - T6;
+		    Tk = Tg - Tj;
+		    cr[WS(rs, 2)] = FNMS(Td, Tk, Tb * Tc);
+		    ci[WS(rs, 2)] = FMA(Td, Tc, Tb * Tk);
+		    {
+			 E Tn, Tq, Tr, Ts;
+			 Tn = Tl - Tm;
+			 Tq = To + Tp;
+			 cr[WS(rs, 1)] = FNMS(T9, Tq, T7 * Tn);
+			 ci[WS(rs, 1)] = FMA(T7, Tq, T9 * Tn);
+			 Tr = Tl + Tm;
+			 Ts = Tp - To;
+			 cr[WS(rs, 3)] = FNMS(Ta, Ts, T8 * Tr);
+			 ci[WS(rs, 3)] = FMA(T8, Ts, Ta * Tr);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hb2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hb2_4) (planner *p) {
+     X(khc2hc_register) (p, hb2_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include hb.h */
+
+/*
+ * This function contains 44 FP additions, 40 FP multiplications,
+ * (or, 14 additions, 10 multiplications, 30 fused multiply/add),
+ * 51 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T9, TB, Tz, Tm, T1, TG, TO, TJ, TC, Tn, Tg, To, Tf, Tw, TQ;
+	       E T8, Tb, Th, Ta, Ti, Tp;
+	       T9 = W[0];
+	       TB = W[3];
+	       Tz = W[2];
+	       Tm = W[1];
+	       {
+		    E T4, Tu, T5, T6;
+		    T1 = cr[0];
+		    {
+			 E TF, TA, T2, T3;
+			 TF = T9 * TB;
+			 TA = T9 * Tz;
+			 T2 = cr[WS(rs, 1)];
+			 T3 = ci[0];
+			 TG = FMA(Tm, Tz, TF);
+			 TO = FNMS(Tm, Tz, TF);
+			 TJ = FMA(Tm, TB, TA);
+			 TC = FNMS(Tm, TB, TA);
+			 T4 = T2 + T3;
+			 Tu = T2 - T3;
+			 T5 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 1)];
+		    }
+		    Tn = ci[WS(rs, 4)];
+		    {
+			 E Td, Te, T7, Tv;
+			 Td = ci[WS(rs, 3)];
+			 Te = cr[WS(rs, 4)];
+			 T7 = T5 + T6;
+			 Tv = T5 - T6;
+			 Tg = ci[WS(rs, 2)];
+			 To = Td - Te;
+			 Tf = Td + Te;
+			 Tw = FMA(KP618033988, Tv, Tu);
+			 TQ = FNMS(KP618033988, Tu, Tv);
+			 T8 = T4 + T7;
+			 Tb = T4 - T7;
+			 Th = cr[WS(rs, 3)];
+		    }
+	       }
+	       cr[0] = T1 + T8;
+	       Ta = FNMS(KP250000000, T8, T1);
+	       Ti = Tg + Th;
+	       Tp = Tg - Th;
+	       {
+		    E Tc, TK, Ts, Tq;
+		    Tc = FMA(KP559016994, Tb, Ta);
+		    TK = FNMS(KP559016994, Tb, Ta);
+		    Ts = To - Tp;
+		    Tq = To + Tp;
+		    {
+			 E Tj, TL, Tr, TM, TT;
+			 Tj = FMA(KP618033988, Ti, Tf);
+			 TL = FNMS(KP618033988, Tf, Ti);
+			 ci[0] = Tn + Tq;
+			 Tr = FNMS(KP250000000, Tq, Tn);
+			 TM = FMA(KP951056516, TL, TK);
+			 TT = FNMS(KP951056516, TL, TK);
+			 {
+			      E Tk, TD, Tt, TP;
+			      Tk = FNMS(KP951056516, Tj, Tc);
+			      TD = FMA(KP951056516, Tj, Tc);
+			      Tt = FMA(KP559016994, Ts, Tr);
+			      TP = FNMS(KP559016994, Ts, Tr);
+			      {
+				   E TW, TU, TS, TN;
+				   TW = TB * TT;
+				   TU = Tz * TT;
+				   TS = TO * TM;
+				   TN = TJ * TM;
+				   {
+					E TI, TE, Ty, Tl;
+					TI = TG * TD;
+					TE = TC * TD;
+					Ty = Tm * Tk;
+					Tl = T9 * Tk;
+					{
+					     E TR, TV, Tx, TH;
+					     TR = FNMS(KP951056516, TQ, TP);
+					     TV = FMA(KP951056516, TQ, TP);
+					     Tx = FMA(KP951056516, Tw, Tt);
+					     TH = FNMS(KP951056516, Tw, Tt);
+					     ci[WS(rs, 3)] = FMA(Tz, TV, TW);
+					     cr[WS(rs, 3)] = FNMS(TB, TV, TU);
+					     ci[WS(rs, 2)] = FMA(TJ, TR, TS);
+					     cr[WS(rs, 2)] = FNMS(TO, TR, TN);
+					     ci[WS(rs, 4)] = FMA(TC, TH, TI);
+					     cr[WS(rs, 4)] = FNMS(TG, TH, TE);
+					     ci[WS(rs, 1)] = FMA(T9, Tx, Ty);
+					     cr[WS(rs, 1)] = FNMS(Tm, Tx, Tl);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, {14, 10, 30, 0} };
+
+void X(codelet_hb2_5) (planner *p) {
+     X(khc2hc_register) (p, hb2_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include hb.h */
+
+/*
+ * This function contains 44 FP additions, 32 FP multiplications,
+ * (or, 30 additions, 18 multiplications, 14 fused multiply/add),
+ * 33 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E Th, Tk, Ti, Tl, Tn, TP, Tx, TN;
+	       {
+		    E Tj, Tw, Tm, Tv;
+		    Th = W[0];
+		    Tk = W[1];
+		    Ti = W[2];
+		    Tl = W[3];
+		    Tj = Th * Ti;
+		    Tw = Tk * Ti;
+		    Tm = Tk * Tl;
+		    Tv = Th * Tl;
+		    Tn = Tj + Tm;
+		    TP = Tv + Tw;
+		    Tx = Tv - Tw;
+		    TN = Tj - Tm;
+	       }
+	       {
+		    E T1, Tp, TK, TA, T8, To, T9, Tt, TI, TC, Tg, TB;
+		    {
+			 E T4, Ty, T7, Tz;
+			 T1 = cr[0];
+			 {
+			      E T2, T3, T5, T6;
+			      T2 = cr[WS(rs, 1)];
+			      T3 = ci[0];
+			      T4 = T2 + T3;
+			      Ty = T2 - T3;
+			      T5 = cr[WS(rs, 2)];
+			      T6 = ci[WS(rs, 1)];
+			      T7 = T5 + T6;
+			      Tz = T5 - T6;
+			 }
+			 Tp = KP559016994 * (T4 - T7);
+			 TK = FMA(KP951056516, Ty, KP587785252 * Tz);
+			 TA = FNMS(KP951056516, Tz, KP587785252 * Ty);
+			 T8 = T4 + T7;
+			 To = FNMS(KP250000000, T8, T1);
+		    }
+		    {
+			 E Tc, Tr, Tf, Ts;
+			 T9 = ci[WS(rs, 4)];
+			 {
+			      E Ta, Tb, Td, Te;
+			      Ta = ci[WS(rs, 3)];
+			      Tb = cr[WS(rs, 4)];
+			      Tc = Ta - Tb;
+			      Tr = Ta + Tb;
+			      Td = ci[WS(rs, 2)];
+			      Te = cr[WS(rs, 3)];
+			      Tf = Td - Te;
+			      Ts = Td + Te;
+			 }
+			 Tt = FNMS(KP951056516, Ts, KP587785252 * Tr);
+			 TI = FMA(KP951056516, Tr, KP587785252 * Ts);
+			 TC = KP559016994 * (Tc - Tf);
+			 Tg = Tc + Tf;
+			 TB = FNMS(KP250000000, Tg, T9);
+		    }
+		    cr[0] = T1 + T8;
+		    ci[0] = T9 + Tg;
+		    {
+			 E Tu, TF, TE, TG, Tq, TD;
+			 Tq = To - Tp;
+			 Tu = Tq - Tt;
+			 TF = Tq + Tt;
+			 TD = TB - TC;
+			 TE = TA + TD;
+			 TG = TD - TA;
+			 cr[WS(rs, 2)] = FNMS(Tx, TE, Tn * Tu);
+			 ci[WS(rs, 2)] = FMA(Tn, TE, Tx * Tu);
+			 cr[WS(rs, 3)] = FNMS(Tl, TG, Ti * TF);
+			 ci[WS(rs, 3)] = FMA(Ti, TG, Tl * TF);
+		    }
+		    {
+			 E TJ, TO, TM, TQ, TH, TL;
+			 TH = Tp + To;
+			 TJ = TH - TI;
+			 TO = TH + TI;
+			 TL = TC + TB;
+			 TM = TK + TL;
+			 TQ = TL - TK;
+			 cr[WS(rs, 1)] = FNMS(Tk, TM, Th * TJ);
+			 ci[WS(rs, 1)] = FMA(Th, TM, Tk * TJ);
+			 cr[WS(rs, 4)] = FNMS(TP, TQ, TN * TO);
+			 ci[WS(rs, 4)] = FMA(TN, TQ, TP * TO);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, {30, 18, 14, 0} };
+
+void X(codelet_hb2_5) (planner *p) {
+     X(khc2hc_register) (p, hb2_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:24 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include hb.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 77 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Tf, Tg, Tl, Tp, Ti, Tj, T1o, T1u, Tk, T1b, To, T1e, TK, Tq, T13;
+	       E TP, T1p, T7, T1h, T1v, TZ, Tv, Tw, Ta, Tx, T1j, TE, TB, Td, Ty;
+	       E Th, T1n, T1t;
+	       Tf = W[0];
+	       Tg = W[2];
+	       Tl = W[4];
+	       Tp = W[5];
+	       Ti = W[1];
+	       Th = Tf * Tg;
+	       T1n = Tf * Tl;
+	       T1t = Tf * Tp;
+	       Tj = W[3];
+	       {
+		    E Tr, T3, Ts, T1f, TO, TL, T6, Tt;
+		    {
+			 E TM, TN, T4, T5;
+			 {
+			      E T1, Tn, T2, TJ, Tm;
+			      T1 = cr[0];
+			      T1o = FMA(Ti, Tp, T1n);
+			      T1u = FNMS(Ti, Tl, T1t);
+			      Tk = FMA(Ti, Tj, Th);
+			      T1b = FNMS(Ti, Tj, Th);
+			      Tn = Tf * Tj;
+			      T2 = ci[WS(rs, 3)];
+			      TM = ci[WS(rs, 7)];
+			      TJ = Tk * Tp;
+			      Tm = Tk * Tl;
+			      To = FNMS(Ti, Tg, Tn);
+			      T1e = FMA(Ti, Tg, Tn);
+			      Tr = T1 - T2;
+			      T3 = T1 + T2;
+			      TK = FNMS(To, Tl, TJ);
+			      Tq = FMA(To, Tp, Tm);
+			      TN = cr[WS(rs, 4)];
+			 }
+			 T4 = cr[WS(rs, 2)];
+			 T5 = ci[WS(rs, 1)];
+			 Ts = ci[WS(rs, 5)];
+			 T1f = TM - TN;
+			 TO = TM + TN;
+			 TL = T4 - T5;
+			 T6 = T4 + T5;
+			 Tt = cr[WS(rs, 6)];
+		    }
+		    {
+			 E TC, TD, Tb, Tc;
+			 {
+			      E T8, T1g, Tu, T9;
+			      T8 = cr[WS(rs, 1)];
+			      T13 = TO - TL;
+			      TP = TL + TO;
+			      T1p = T3 - T6;
+			      T7 = T3 + T6;
+			      T1g = Ts - Tt;
+			      Tu = Ts + Tt;
+			      T9 = ci[WS(rs, 2)];
+			      TC = ci[WS(rs, 4)];
+			      T1h = T1f + T1g;
+			      T1v = T1f - T1g;
+			      TZ = Tr + Tu;
+			      Tv = Tr - Tu;
+			      Tw = T8 - T9;
+			      Ta = T8 + T9;
+			      TD = cr[WS(rs, 7)];
+			 }
+			 Tb = ci[0];
+			 Tc = cr[WS(rs, 3)];
+			 Tx = ci[WS(rs, 6)];
+			 T1j = TC - TD;
+			 TE = TC + TD;
+			 TB = Tb - Tc;
+			 Td = Tb + Tc;
+			 Ty = cr[WS(rs, 5)];
+		    }
+	       }
+	       {
+		    E TR, TF, Te, T1w;
+		    TR = TB + TE;
+		    TF = TB - TE;
+		    Te = Ta + Td;
+		    T1w = Ta - Td;
+		    {
+			 E Tz, T1i, T1B, T1x, T1c;
+			 Tz = Tx + Ty;
+			 T1i = Tx - Ty;
+			 T1B = T1w + T1v;
+			 T1x = T1v - T1w;
+			 T1c = T7 - Te;
+			 cr[0] = T7 + Te;
+			 {
+			      E T1k, T1q, TQ, TA;
+			      T1k = T1i + T1j;
+			      T1q = T1j - T1i;
+			      TQ = Tw + Tz;
+			      TA = Tw - Tz;
+			      {
+				   E T1y, T1C, T1m, T1d;
+				   T1y = T1o * T1x;
+				   T1C = Tk * T1B;
+				   T1m = T1e * T1c;
+				   T1d = T1b * T1c;
+				   {
+					E T1z, T1r, T1l, TG, T14;
+					T1z = T1p + T1q;
+					T1r = T1p - T1q;
+					T1l = T1h - T1k;
+					ci[0] = T1h + T1k;
+					TG = TA + TF;
+					T14 = TA - TF;
+					{
+					     E T10, TS, T1s, T1A;
+					     T10 = TQ + TR;
+					     TS = TQ - TR;
+					     ci[WS(rs, 6)] = FMA(T1u, T1r, T1y);
+					     T1s = T1o * T1r;
+					     ci[WS(rs, 2)] = FMA(To, T1z, T1C);
+					     T1A = Tk * T1z;
+					     ci[WS(rs, 4)] = FMA(T1b, T1l, T1m);
+					     cr[WS(rs, 4)] = FNMS(T1e, T1l, T1d);
+					     {
+						  E T15, T19, TV, TH;
+						  T15 = FMA(KP707106781, T14, T13);
+						  T19 = FNMS(KP707106781, T14, T13);
+						  TV = FMA(KP707106781, TG, Tv);
+						  TH = FNMS(KP707106781, TG, Tv);
+						  {
+						       E TT, TX, T11, T17;
+						       TT = FNMS(KP707106781, TS, TP);
+						       TX = FMA(KP707106781, TS, TP);
+						       T11 = FNMS(KP707106781, T10, TZ);
+						       T17 = FMA(KP707106781, T10, TZ);
+						       cr[WS(rs, 6)] = FNMS(T1u, T1x, T1s);
+						       cr[WS(rs, 2)] = FNMS(To, T1B, T1A);
+						       {
+							    E T1a, T16, TU, TI;
+							    T1a = Tl * T19;
+							    T16 = Tg * T15;
+							    TU = TK * TH;
+							    TI = Tq * TH;
+							    {
+								 E TY, TW, T18, T12;
+								 TY = Ti * TV;
+								 TW = Tf * TV;
+								 T18 = Tl * T17;
+								 T12 = Tg * T11;
+								 ci[WS(rs, 7)] = FMA(Tp, T17, T1a);
+								 ci[WS(rs, 3)] = FMA(Tj, T11, T16);
+								 ci[WS(rs, 5)] = FMA(Tq, TT, TU);
+								 cr[WS(rs, 5)] = FNMS(TK, TT, TI);
+								 ci[WS(rs, 1)] = FMA(Tf, TX, TY);
+								 cr[WS(rs, 1)] = FNMS(Ti, TX, TW);
+								 cr[WS(rs, 7)] = FNMS(Tp, T19, T18);
+								 cr[WS(rs, 3)] = FNMS(Tj, T15, T12);
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {44, 20, 30, 0} };
+
+void X(codelet_hb2_8) (planner *p) {
+     X(khc2hc_register) (p, hb2_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include hb.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 46 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hb.h"
+
+static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
+	       {
+		    E Th, To, Tk, Tn;
+		    Tf = W[0];
+		    Ti = W[1];
+		    Tg = W[2];
+		    Tj = W[3];
+		    Th = Tf * Tg;
+		    To = Ti * Tg;
+		    Tk = Ti * Tj;
+		    Tn = Tf * Tj;
+		    Tl = Th - Tk;
+		    Tp = Tn + To;
+		    TP = Th + Tk;
+		    TR = Tn - To;
+		    TF = W[4];
+		    TG = W[5];
+		    TH = FMA(Tf, TF, Ti * TG);
+		    T15 = FNMS(TR, TF, TP * TG);
+		    TL = FNMS(Ti, TF, Tf * TG);
+		    TT = FMA(TP, TF, TR * TG);
+	       }
+	       {
+		    E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
+		    E TN, Tm, TE;
+		    {
+			 E T3, TU, Tv, TV, T6, T16, Ts, T17;
+			 {
+			      E T1, T2, Tt, Tu;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 3)];
+			      T3 = T1 + T2;
+			      TU = T1 - T2;
+			      Tt = ci[WS(rs, 5)];
+			      Tu = cr[WS(rs, 6)];
+			      Tv = Tt - Tu;
+			      TV = Tt + Tu;
+			 }
+			 {
+			      E T4, T5, Tq, Tr;
+			      T4 = cr[WS(rs, 2)];
+			      T5 = ci[WS(rs, 1)];
+			      T6 = T4 + T5;
+			      T16 = T4 - T5;
+			      Tq = ci[WS(rs, 7)];
+			      Tr = cr[WS(rs, 4)];
+			      Ts = Tq - Tr;
+			      T17 = Tq + Tr;
+			 }
+			 T7 = T3 + T6;
+			 T1f = TU + TV;
+			 T1i = T17 - T16;
+			 Tw = Ts + Tv;
+			 TI = T3 - T6;
+			 TW = TU - TV;
+			 T18 = T16 + T17;
+			 TM = Ts - Tv;
+		    }
+		    {
+			 E Ta, TX, TC, T11, Td, T10, Tz, TY;
+			 {
+			      E T8, T9, TA, TB;
+			      T8 = cr[WS(rs, 1)];
+			      T9 = ci[WS(rs, 2)];
+			      Ta = T8 + T9;
+			      TX = T8 - T9;
+			      TA = ci[WS(rs, 4)];
+			      TB = cr[WS(rs, 7)];
+			      TC = TA - TB;
+			      T11 = TA + TB;
+			 }
+			 {
+			      E Tb, Tc, Tx, Ty;
+			      Tb = ci[0];
+			      Tc = cr[WS(rs, 3)];
+			      Td = Tb + Tc;
+			      T10 = Tb - Tc;
+			      Tx = ci[WS(rs, 6)];
+			      Ty = cr[WS(rs, 5)];
+			      Tz = Tx - Ty;
+			      TY = Tx + Ty;
+			 }
+			 Te = Ta + Td;
+			 T19 = TX + TY;
+			 T1a = T10 + T11;
+			 TD = Tz + TC;
+			 TJ = TC - Tz;
+			 TZ = TX - TY;
+			 T12 = T10 - T11;
+			 TN = Ta - Td;
+		    }
+		    cr[0] = T7 + Te;
+		    ci[0] = Tw + TD;
+		    Tm = T7 - Te;
+		    TE = Tw - TD;
+		    cr[WS(rs, 4)] = FNMS(Tp, TE, Tl * Tm);
+		    ci[WS(rs, 4)] = FMA(Tp, Tm, Tl * TE);
+		    {
+			 E TQ, TS, TK, TO;
+			 TQ = TI + TJ;
+			 TS = TN + TM;
+			 cr[WS(rs, 2)] = FNMS(TR, TS, TP * TQ);
+			 ci[WS(rs, 2)] = FMA(TP, TS, TR * TQ);
+			 TK = TI - TJ;
+			 TO = TM - TN;
+			 cr[WS(rs, 6)] = FNMS(TL, TO, TH * TK);
+			 ci[WS(rs, 6)] = FMA(TH, TO, TL * TK);
+		    }
+		    {
+			 E T1h, T1l, T1k, T1m, T1g, T1j;
+			 T1g = KP707106781 * (T19 + T1a);
+			 T1h = T1f - T1g;
+			 T1l = T1f + T1g;
+			 T1j = KP707106781 * (TZ - T12);
+			 T1k = T1i + T1j;
+			 T1m = T1i - T1j;
+			 cr[WS(rs, 3)] = FNMS(Tj, T1k, Tg * T1h);
+			 ci[WS(rs, 3)] = FMA(Tg, T1k, Tj * T1h);
+			 cr[WS(rs, 7)] = FNMS(TG, T1m, TF * T1l);
+			 ci[WS(rs, 7)] = FMA(TF, T1m, TG * T1l);
+		    }
+		    {
+			 E T14, T1d, T1c, T1e, T13, T1b;
+			 T13 = KP707106781 * (TZ + T12);
+			 T14 = TW - T13;
+			 T1d = TW + T13;
+			 T1b = KP707106781 * (T19 - T1a);
+			 T1c = T18 - T1b;
+			 T1e = T18 + T1b;
+			 cr[WS(rs, 5)] = FNMS(T15, T1c, TT * T14);
+			 ci[WS(rs, 5)] = FMA(T15, T14, TT * T1c);
+			 cr[WS(rs, 1)] = FNMS(Ti, T1e, Tf * T1d);
+			 ci[WS(rs, 1)] = FMA(Ti, T1d, Tf * T1e);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {56, 26, 18, 0} };
+
+void X(codelet_hb2_8) (planner *p) {
+     X(khc2hc_register) (p, hb2_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include hb.h */
+
+/*
+ * This function contains 102 FP additions, 72 FP multiplications,
+ * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
+ * 71 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hb.h"
+
+static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T21, T1Y, T1X;
+	       {
+		    E T1B, TH, T1g, T3, T1V, T1x, T1G, T1E, TM, TK, T11, TB, T7, T1m, T1J;
+		    E TO, Th, T1h, T6, T8, TF, TG, T1i, T9;
+		    TF = ci[WS(rs, 9)];
+		    TG = cr[WS(rs, 5)];
+		    {
+			 E T1u, Tp, Tu, T1s, Tz, T1v, Ts, Tv;
+			 {
+			      E Tx, Ty, Tn, To, Tq, Tr;
+			      Tn = ci[WS(rs, 5)];
+			      To = cr[WS(rs, 9)];
+			      Tx = ci[WS(rs, 6)];
+			      T1B = TF + TG;
+			      TH = TF - TG;
+			      T1u = Tn + To;
+			      Tp = Tn - To;
+			      Ty = cr[WS(rs, 8)];
+			      Tq = ci[WS(rs, 8)];
+			      Tr = cr[WS(rs, 6)];
+			      Tu = ci[WS(rs, 7)];
+			      T1s = Tx + Ty;
+			      Tz = Tx - Ty;
+			      T1v = Tq + Tr;
+			      Ts = Tq - Tr;
+			      Tv = cr[WS(rs, 7)];
+			 }
+			 {
+			      E T1, T1w, T1D, TJ, Tt, T1r, Tw, T2;
+			      T1 = cr[0];
+			      T1w = T1u + T1v;
+			      T1D = T1u - T1v;
+			      TJ = Tp + Ts;
+			      Tt = Tp - Ts;
+			      T1r = Tu + Tv;
+			      Tw = Tu - Tv;
+			      T2 = ci[WS(rs, 4)];
+			      {
+				   E Tb, Tc, Te, Tf;
+				   Tb = cr[WS(rs, 4)];
+				   {
+					E T1t, T1C, TI, TA;
+					T1t = T1r + T1s;
+					T1C = T1r - T1s;
+					TI = Tw + Tz;
+					TA = Tw - Tz;
+					T1g = T1 - T2;
+					T3 = T1 + T2;
+					T1V = FNMS(KP618033988, T1t, T1w);
+					T1x = FMA(KP618033988, T1w, T1t);
+					T1G = T1C - T1D;
+					T1E = T1C + T1D;
+					TM = TI - TJ;
+					TK = TI + TJ;
+					T11 = FMA(KP618033988, Tt, TA);
+					TB = FNMS(KP618033988, TA, Tt);
+					Tc = ci[0];
+				   }
+				   Te = ci[WS(rs, 3)];
+				   Tf = cr[WS(rs, 1)];
+				   {
+					E T4, T1k, Td, T1l, Tg, T5;
+					T4 = cr[WS(rs, 2)];
+					T1k = Tb - Tc;
+					Td = Tb + Tc;
+					T1l = Te - Tf;
+					Tg = Te + Tf;
+					T5 = ci[WS(rs, 2)];
+					T7 = ci[WS(rs, 1)];
+					T1m = T1k + T1l;
+					T1J = T1k - T1l;
+					TO = Td - Tg;
+					Th = Td + Tg;
+					T1h = T4 - T5;
+					T6 = T4 + T5;
+					T8 = cr[WS(rs, 3)];
+				   }
+			      }
+			 }
+		    }
+		    ci[0] = TH + TK;
+		    T1i = T7 - T8;
+		    T9 = T7 + T8;
+		    {
+			 E T2d, T1F, T29, T1I, TP, T2c, T1p, Tl, T1o, Tk, T2b, T2e, T17, T14, T13;
+			 T2d = T1B + T1E;
+			 T1F = FNMS(KP250000000, T1E, T1B);
+			 {
+			      E T1j, Ta, T1n, Ti, T2a;
+			      T29 = W[8];
+			      T1I = T1h - T1i;
+			      T1j = T1h + T1i;
+			      TP = T6 - T9;
+			      Ta = T6 + T9;
+			      T2c = W[9];
+			      T1p = T1j - T1m;
+			      T1n = T1j + T1m;
+			      Tl = Ta - Th;
+			      Ti = Ta + Th;
+			      T1o = FNMS(KP250000000, T1n, T1g);
+			      T2a = T1g + T1n;
+			      cr[0] = T3 + Ti;
+			      Tk = FNMS(KP250000000, Ti, T3);
+			      T2b = T29 * T2a;
+			      T2e = T2c * T2a;
+			 }
+			 {
+			      E T16, TQ, T10, Tm, TL;
+			      T16 = FMA(KP618033988, TO, TP);
+			      TQ = FNMS(KP618033988, TP, TO);
+			      cr[WS(rs, 5)] = FNMS(T2c, T2d, T2b);
+			      ci[WS(rs, 5)] = FMA(T29, T2d, T2e);
+			      T10 = FMA(KP559016994, Tl, Tk);
+			      Tm = FNMS(KP559016994, Tl, Tk);
+			      TL = FNMS(KP250000000, TK, TH);
+			      {
+				   E TE, TU, T12, TR, TX, T1d, T1c, T19, TD, T1e, T1b, TW, TT;
+				   {
+					E TC, T15, T1a, TS, Tj, TN;
+					TE = W[3];
+					TC = FMA(KP951056516, TB, Tm);
+					TU = FNMS(KP951056516, TB, Tm);
+					TN = FNMS(KP559016994, TM, TL);
+					T15 = FMA(KP559016994, TM, TL);
+					T12 = FMA(KP951056516, T11, T10);
+					T1a = FNMS(KP951056516, T11, T10);
+					TS = TE * TC;
+					TR = FNMS(KP951056516, TQ, TN);
+					TX = FMA(KP951056516, TQ, TN);
+					Tj = W[2];
+					T1d = FMA(KP951056516, T16, T15);
+					T17 = FNMS(KP951056516, T16, T15);
+					T1c = W[11];
+					T19 = W[10];
+					ci[WS(rs, 2)] = FMA(Tj, TR, TS);
+					TD = Tj * TC;
+					T1e = T1c * T1a;
+					T1b = T19 * T1a;
+				   }
+				   cr[WS(rs, 2)] = FNMS(TE, TR, TD);
+				   ci[WS(rs, 6)] = FMA(T19, T1d, T1e);
+				   cr[WS(rs, 6)] = FNMS(T1c, T1d, T1b);
+				   TW = W[15];
+				   TT = W[14];
+				   {
+					E TZ, T18, TY, TV;
+					T14 = W[7];
+					TY = TW * TU;
+					TV = TT * TU;
+					TZ = W[6];
+					T18 = T14 * T12;
+					ci[WS(rs, 8)] = FMA(TT, TX, TY);
+					cr[WS(rs, 8)] = FNMS(TW, TX, TV);
+					T13 = TZ * T12;
+					ci[WS(rs, 4)] = FMA(TZ, T17, T18);
+				   }
+			      }
+			 }
+			 {
+			      E T20, T1K, T1q, T1U;
+			      T20 = FNMS(KP618033988, T1I, T1J);
+			      T1K = FMA(KP618033988, T1J, T1I);
+			      cr[WS(rs, 4)] = FNMS(T14, T17, T13);
+			      T1q = FMA(KP559016994, T1p, T1o);
+			      T1U = FNMS(KP559016994, T1p, T1o);
+			      {
+				   E T1A, T1O, T1W, T1R, T1L, T27, T26, T23, T1z, T28, T25, T1Q, T1N;
+				   {
+					E T1y, T1Z, T24, T1M, T1f, T1H;
+					T1A = W[1];
+					T1O = FMA(KP951056516, T1x, T1q);
+					T1y = FNMS(KP951056516, T1x, T1q);
+					T1Z = FNMS(KP559016994, T1G, T1F);
+					T1H = FMA(KP559016994, T1G, T1F);
+					T24 = FMA(KP951056516, T1V, T1U);
+					T1W = FNMS(KP951056516, T1V, T1U);
+					T1M = T1A * T1y;
+					T1R = FNMS(KP951056516, T1K, T1H);
+					T1L = FMA(KP951056516, T1K, T1H);
+					T1f = W[0];
+					T21 = FMA(KP951056516, T20, T1Z);
+					T27 = FNMS(KP951056516, T20, T1Z);
+					T26 = W[13];
+					T23 = W[12];
+					ci[WS(rs, 1)] = FMA(T1f, T1L, T1M);
+					T1z = T1f * T1y;
+					T28 = T26 * T24;
+					T25 = T23 * T24;
+				   }
+				   cr[WS(rs, 1)] = FNMS(T1A, T1L, T1z);
+				   ci[WS(rs, 7)] = FMA(T23, T27, T28);
+				   cr[WS(rs, 7)] = FNMS(T26, T27, T25);
+				   T1Q = W[17];
+				   T1N = W[16];
+				   {
+					E T1T, T22, T1S, T1P;
+					T1Y = W[5];
+					T1S = T1Q * T1O;
+					T1P = T1N * T1O;
+					T1T = W[4];
+					T22 = T1Y * T1W;
+					ci[WS(rs, 9)] = FMA(T1N, T1R, T1S);
+					cr[WS(rs, 9)] = FNMS(T1Q, T1R, T1P);
+					T1X = T1T * T1W;
+					ci[WS(rs, 3)] = FMA(T1T, T21, T22);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 3)] = FNMS(T1Y, T21, T1X);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, {48, 18, 54, 0} };
+
+void X(codelet_hb_10) (planner *p) {
+     X(khc2hc_register) (p, hb_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include hb.h */
+
+/*
+ * This function contains 102 FP additions, 60 FP multiplications,
+ * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
+ * 41 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hb.h"
+
+static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T3, T18, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, TJ, T1i, Tt, TA, T1w;
+	       E T1v, T1p, T1E, TM, TO;
+	       {
+		    E T1, T2, TH, TI;
+		    T1 = cr[0];
+		    T2 = ci[WS(rs, 4)];
+		    T3 = T1 + T2;
+		    T18 = T1 - T2;
+		    {
+			 E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
+			 {
+			      E T4, T5, Te, Tf;
+			      T4 = cr[WS(rs, 2)];
+			      T5 = ci[WS(rs, 2)];
+			      T6 = T4 + T5;
+			      T19 = T4 - T5;
+			      Te = ci[WS(rs, 3)];
+			      Tf = cr[WS(rs, 1)];
+			      Tg = Te + Tf;
+			      T1d = Te - Tf;
+			 }
+			 {
+			      E T7, T8, Tb, Tc;
+			      T7 = ci[WS(rs, 1)];
+			      T8 = cr[WS(rs, 3)];
+			      T9 = T7 + T8;
+			      T1a = T7 - T8;
+			      Tb = cr[WS(rs, 4)];
+			      Tc = ci[0];
+			      Td = Tb + Tc;
+			      T1c = Tb - Tc;
+			 }
+			 TE = T6 - T9;
+			 TF = Td - Tg;
+			 T1B = T1c - T1d;
+			 T1A = T19 - T1a;
+			 {
+			      E T1b, T1e, Ta, Th;
+			      T1b = T19 + T1a;
+			      T1e = T1c + T1d;
+			      T1f = T1b + T1e;
+			      T1t = KP559016994 * (T1b - T1e);
+			      Ta = T6 + T9;
+			      Th = Td + Tg;
+			      Ti = Ta + Th;
+			      Tl = KP559016994 * (Ta - Th);
+			 }
+		    }
+		    TH = ci[WS(rs, 9)];
+		    TI = cr[WS(rs, 5)];
+		    TJ = TH - TI;
+		    T1i = TH + TI;
+		    {
+			 E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
+			 {
+			      E Tn, To, Tx, Ty;
+			      Tn = ci[WS(rs, 7)];
+			      To = cr[WS(rs, 7)];
+			      Tp = Tn - To;
+			      T1j = Tn + To;
+			      Tx = ci[WS(rs, 8)];
+			      Ty = cr[WS(rs, 6)];
+			      Tz = Tx - Ty;
+			      T1n = Tx + Ty;
+			 }
+			 {
+			      E Tq, Tr, Tu, Tv;
+			      Tq = ci[WS(rs, 6)];
+			      Tr = cr[WS(rs, 8)];
+			      Ts = Tq - Tr;
+			      T1k = Tq + Tr;
+			      Tu = ci[WS(rs, 5)];
+			      Tv = cr[WS(rs, 9)];
+			      Tw = Tu - Tv;
+			      T1m = Tu + Tv;
+			 }
+			 Tt = Tp - Ts;
+			 TA = Tw - Tz;
+			 T1w = T1m + T1n;
+			 T1v = T1j + T1k;
+			 {
+			      E T1l, T1o, TK, TL;
+			      T1l = T1j - T1k;
+			      T1o = T1m - T1n;
+			      T1p = T1l + T1o;
+			      T1E = KP559016994 * (T1l - T1o);
+			      TK = Tp + Ts;
+			      TL = Tw + Tz;
+			      TM = TK + TL;
+			      TO = KP559016994 * (TK - TL);
+			 }
+		    }
+	       }
+	       cr[0] = T3 + Ti;
+	       ci[0] = TJ + TM;
+	       {
+		    E T1g, T1q, T17, T1h;
+		    T1g = T18 + T1f;
+		    T1q = T1i + T1p;
+		    T17 = W[8];
+		    T1h = W[9];
+		    cr[WS(rs, 5)] = FNMS(T1h, T1q, T17 * T1g);
+		    ci[WS(rs, 5)] = FMA(T1h, T1g, T17 * T1q);
+	       }
+	       {
+		    E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
+		    TB = FNMS(KP951056516, TA, KP587785252 * Tt);
+		    TG = FNMS(KP951056516, TF, KP587785252 * TE);
+		    T11 = FMA(KP951056516, TE, KP587785252 * TF);
+		    TX = FMA(KP951056516, Tt, KP587785252 * TA);
+		    TN = FNMS(KP250000000, TM, TJ);
+		    TP = TN - TO;
+		    T10 = TO + TN;
+		    Tk = FNMS(KP250000000, Ti, T3);
+		    Tm = Tk - Tl;
+		    TW = Tl + Tk;
+		    {
+			 E TC, TQ, Tj, TD;
+			 TC = Tm - TB;
+			 TQ = TG + TP;
+			 Tj = W[2];
+			 TD = W[3];
+			 cr[WS(rs, 2)] = FNMS(TD, TQ, Tj * TC);
+			 ci[WS(rs, 2)] = FMA(TD, TC, Tj * TQ);
+		    }
+		    {
+			 E T14, T16, T13, T15;
+			 T14 = TW - TX;
+			 T16 = T11 + T10;
+			 T13 = W[10];
+			 T15 = W[11];
+			 cr[WS(rs, 6)] = FNMS(T15, T16, T13 * T14);
+			 ci[WS(rs, 6)] = FMA(T15, T14, T13 * T16);
+		    }
+		    {
+			 E TS, TU, TR, TT;
+			 TS = Tm + TB;
+			 TU = TP - TG;
+			 TR = W[14];
+			 TT = W[15];
+			 cr[WS(rs, 8)] = FNMS(TT, TU, TR * TS);
+			 ci[WS(rs, 8)] = FMA(TT, TS, TR * TU);
+		    }
+		    {
+			 E TY, T12, TV, TZ;
+			 TY = TW + TX;
+			 T12 = T10 - T11;
+			 TV = W[6];
+			 TZ = W[7];
+			 cr[WS(rs, 4)] = FNMS(TZ, T12, TV * TY);
+			 ci[WS(rs, 4)] = FMA(TZ, TY, TV * T12);
+		    }
+	       }
+	       {
+		    E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
+		    T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
+		    T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
+		    T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
+		    T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
+		    T1D = FNMS(KP250000000, T1p, T1i);
+		    T1F = T1D - T1E;
+		    T1R = T1E + T1D;
+		    T1s = FNMS(KP250000000, T1f, T18);
+		    T1u = T1s - T1t;
+		    T1M = T1t + T1s;
+		    {
+			 E T1y, T1G, T1r, T1z;
+			 T1y = T1u - T1x;
+			 T1G = T1C + T1F;
+			 T1r = W[12];
+			 T1z = W[13];
+			 cr[WS(rs, 7)] = FNMS(T1z, T1G, T1r * T1y);
+			 ci[WS(rs, 7)] = FMA(T1r, T1G, T1z * T1y);
+		    }
+		    {
+			 E T1U, T1W, T1T, T1V;
+			 T1U = T1M + T1N;
+			 T1W = T1R - T1Q;
+			 T1T = W[16];
+			 T1V = W[17];
+			 cr[WS(rs, 9)] = FNMS(T1V, T1W, T1T * T1U);
+			 ci[WS(rs, 9)] = FMA(T1T, T1W, T1V * T1U);
+		    }
+		    {
+			 E T1I, T1K, T1H, T1J;
+			 T1I = T1u + T1x;
+			 T1K = T1F - T1C;
+			 T1H = W[4];
+			 T1J = W[5];
+			 cr[WS(rs, 3)] = FNMS(T1J, T1K, T1H * T1I);
+			 ci[WS(rs, 3)] = FMA(T1H, T1K, T1J * T1I);
+		    }
+		    {
+			 E T1O, T1S, T1L, T1P;
+			 T1O = T1M - T1N;
+			 T1S = T1Q + T1R;
+			 T1L = W[0];
+			 T1P = W[1];
+			 cr[WS(rs, 1)] = FNMS(T1P, T1S, T1L * T1O);
+			 ci[WS(rs, 1)] = FMA(T1L, T1S, T1P * T1O);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, {72, 30, 30, 0} };
+
+void X(codelet_hb_10) (planner *p) {
+     X(khc2hc_register) (p, hb_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:14 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include hb.h */
+
+/*
+ * This function contains 118 FP additions, 68 FP multiplications,
+ * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
+ * 64 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hb.h"
+
+static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T1U, T1X, T1W, T1Y, T1V;
+	       {
+		    E T18, T20, T2a, T1s, T21, T1b, T29, T1p, TO, T11, To, Tb, Tg, T23, T1f;
+		    E Ty, Tl, Tt, T1z, T2d, T1i, T24, T1w, T2c;
+		    {
+			 E T5, TN, Ta, TI;
+			 {
+			      E T1, TE, TM, T6, TJ, T1o, T4, T17, TH, TK, T7, T8;
+			      T1 = cr[0];
+			      TE = ci[WS(rs, 11)];
+			      TM = cr[WS(rs, 6)];
+			      T6 = ci[WS(rs, 5)];
+			      {
+				   E T2, T3, TF, TG;
+				   T2 = cr[WS(rs, 4)];
+				   T3 = ci[WS(rs, 3)];
+				   TF = ci[WS(rs, 7)];
+				   TG = cr[WS(rs, 8)];
+				   TJ = ci[WS(rs, 9)];
+				   T1o = T2 - T3;
+				   T4 = T2 + T3;
+				   T17 = TF + TG;
+				   TH = TF - TG;
+				   TK = cr[WS(rs, 10)];
+				   T7 = ci[WS(rs, 1)];
+				   T8 = cr[WS(rs, 2)];
+			      }
+			      {
+				   E T1a, T1r, T1q, T19, TL, T9, T16, T1n;
+				   T5 = T1 + T4;
+				   T16 = FNMS(KP500000000, T4, T1);
+				   T1a = TJ + TK;
+				   TL = TJ - TK;
+				   T1r = T7 - T8;
+				   T9 = T7 + T8;
+				   T18 = FNMS(KP866025403, T17, T16);
+				   T20 = FMA(KP866025403, T17, T16);
+				   T1q = FMA(KP500000000, TL, TM);
+				   TN = TL - TM;
+				   Ta = T6 + T9;
+				   T19 = FNMS(KP500000000, T9, T6);
+				   T1n = FNMS(KP500000000, TH, TE);
+				   TI = TE + TH;
+				   T2a = FMA(KP866025403, T1r, T1q);
+				   T1s = FNMS(KP866025403, T1r, T1q);
+				   T21 = FNMS(KP866025403, T1a, T19);
+				   T1b = FMA(KP866025403, T1a, T19);
+				   T29 = FNMS(KP866025403, T1o, T1n);
+				   T1p = FMA(KP866025403, T1o, T1n);
+			      }
+			 }
+			 {
+			      E Tc, Tp, Tx, Th, Tu, Tf, T1v, Ts, T1e, Tv, Ti, Tj;
+			      Tc = cr[WS(rs, 3)];
+			      TO = TI - TN;
+			      T11 = TI + TN;
+			      Tp = ci[WS(rs, 8)];
+			      To = T5 - Ta;
+			      Tb = T5 + Ta;
+			      Tx = cr[WS(rs, 9)];
+			      Th = ci[WS(rs, 2)];
+			      {
+				   E Td, Te, Tq, Tr;
+				   Td = ci[WS(rs, 4)];
+				   Te = ci[0];
+				   Tq = cr[WS(rs, 7)];
+				   Tr = cr[WS(rs, 11)];
+				   Tu = ci[WS(rs, 10)];
+				   Tf = Td + Te;
+				   T1v = Td - Te;
+				   Ts = Tq + Tr;
+				   T1e = Tq - Tr;
+				   Tv = ci[WS(rs, 6)];
+				   Ti = cr[WS(rs, 1)];
+				   Tj = cr[WS(rs, 5)];
+			      }
+			      {
+				   E T1h, T1y, T1x, T1g, Tw, Tk, T1d, T1u;
+				   T1d = FNMS(KP500000000, Tf, Tc);
+				   Tg = Tc + Tf;
+				   Tw = Tu + Tv;
+				   T1h = Tv - Tu;
+				   Tk = Ti + Tj;
+				   T1y = Ti - Tj;
+				   T23 = FNMS(KP866025403, T1e, T1d);
+				   T1f = FMA(KP866025403, T1e, T1d);
+				   Ty = Tw - Tx;
+				   T1x = FMA(KP500000000, Tw, Tx);
+				   T1g = FNMS(KP500000000, Tk, Th);
+				   Tl = Th + Tk;
+				   Tt = Tp - Ts;
+				   T1u = FMA(KP500000000, Ts, Tp);
+				   T1z = FNMS(KP866025403, T1y, T1x);
+				   T2d = FMA(KP866025403, T1y, T1x);
+				   T1i = FMA(KP866025403, T1h, T1g);
+				   T24 = FNMS(KP866025403, T1h, T1g);
+				   T1w = FMA(KP866025403, T1v, T1u);
+				   T2c = FNMS(KP866025403, T1v, T1u);
+			      }
+			 }
+		    }
+		    {
+			 E TY, T13, TX, T10;
+			 {
+			      E Tn, T12, TC, Tm, TD, TS, TA, Tz;
+			      Tn = W[16];
+			      T12 = Tt + Ty;
+			      Tz = Tt - Ty;
+			      TC = W[17];
+			      Tm = Tg + Tl;
+			      TD = Tg - Tl;
+			      TS = To + Tz;
+			      TA = To - Tz;
+			      {
+				   E TV, TU, TW, TT;
+				   {
+					E TQ, TR, TP, TB;
+					TV = TO - TD;
+					TP = TD + TO;
+					cr[0] = Tb + Tm;
+					TB = Tn * TA;
+					TQ = Tn * TP;
+					TR = W[4];
+					cr[WS(rs, 9)] = FNMS(TC, TP, TB);
+					TU = W[5];
+					ci[WS(rs, 9)] = FMA(TC, TA, TQ);
+					TW = TR * TV;
+					TT = TR * TS;
+				   }
+				   ci[WS(rs, 3)] = FMA(TU, TS, TW);
+				   cr[WS(rs, 3)] = FNMS(TU, TV, TT);
+				   TY = Tb - Tm;
+				   T13 = T11 - T12;
+				   TX = W[10];
+				   T10 = W[11];
+				   ci[0] = T11 + T12;
+			      }
+			 }
+			 {
+			      E T1K, T1Q, T1P, T1L, T2o, T2u, T2t, T2p;
+			      {
+				   E T1E, T1D, T1H, T1F, T1G, T1t, T1k, T1A;
+				   {
+					E T1c, TZ, T14, T1j;
+					T1K = T18 - T1b;
+					T1c = T18 + T1b;
+					TZ = TX * TY;
+					T14 = T10 * TY;
+					T1j = T1f + T1i;
+					T1Q = T1f - T1i;
+					T1P = T1p + T1s;
+					T1t = T1p - T1s;
+					cr[WS(rs, 6)] = FNMS(T10, T13, TZ);
+					ci[WS(rs, 6)] = FMA(TX, T13, T14);
+					T1E = T1c + T1j;
+					T1k = T1c - T1j;
+					T1A = T1w - T1z;
+					T1L = T1w + T1z;
+				   }
+				   {
+					E T15, T1m, T1B, T1l, T1C;
+					T15 = W[18];
+					T1m = W[19];
+					T1D = W[6];
+					T1H = T1t + T1A;
+					T1B = T1t - T1A;
+					T1l = T15 * T1k;
+					T1C = T1m * T1k;
+					T1F = T1D * T1E;
+					T1G = W[7];
+					cr[WS(rs, 10)] = FNMS(T1m, T1B, T1l);
+					ci[WS(rs, 10)] = FMA(T15, T1B, T1C);
+				   }
+				   {
+					E T26, T2i, T2l, T2f, T1Z, T28;
+					{
+					     E T22, T1I, T25, T2b, T2e;
+					     T22 = T20 + T21;
+					     T2o = T20 - T21;
+					     cr[WS(rs, 4)] = FNMS(T1G, T1H, T1F);
+					     T1I = T1G * T1E;
+					     T2u = T23 - T24;
+					     T25 = T23 + T24;
+					     T2b = T29 - T2a;
+					     T2t = T29 + T2a;
+					     T2p = T2c + T2d;
+					     T2e = T2c - T2d;
+					     ci[WS(rs, 4)] = FMA(T1D, T1H, T1I);
+					     T26 = T22 - T25;
+					     T2i = T22 + T25;
+					     T2l = T2b + T2e;
+					     T2f = T2b - T2e;
+					}
+					T1Z = W[2];
+					T28 = W[3];
+					{
+					     E T2h, T2k, T27, T2g, T2j, T2m;
+					     T2h = W[14];
+					     T2k = W[15];
+					     T27 = T1Z * T26;
+					     T2g = T28 * T26;
+					     T2j = T2h * T2i;
+					     T2m = T2k * T2i;
+					     cr[WS(rs, 2)] = FNMS(T28, T2f, T27);
+					     ci[WS(rs, 2)] = FMA(T1Z, T2f, T2g);
+					     cr[WS(rs, 8)] = FNMS(T2k, T2l, T2j);
+					     ci[WS(rs, 8)] = FMA(T2h, T2l, T2m);
+					}
+				   }
+			      }
+			      {
+				   E T2y, T2B, T2A, T2C, T2z;
+				   {
+					E T2n, T2q, T2v, T2s, T2r, T2x, T2w;
+					T2n = W[8];
+					T2y = T2o + T2p;
+					T2q = T2o - T2p;
+					T2B = T2t - T2u;
+					T2v = T2t + T2u;
+					T2s = W[9];
+					T2r = T2n * T2q;
+					T2x = W[20];
+					T2w = T2n * T2v;
+					T2A = W[21];
+					cr[WS(rs, 5)] = FNMS(T2s, T2v, T2r);
+					T2C = T2x * T2B;
+					T2z = T2x * T2y;
+					ci[WS(rs, 5)] = FMA(T2s, T2q, T2w);
+				   }
+				   ci[WS(rs, 11)] = FMA(T2A, T2y, T2C);
+				   cr[WS(rs, 11)] = FNMS(T2A, T2B, T2z);
+				   {
+					E T1J, T1M, T1R, T1O, T1N, T1T, T1S;
+					T1J = W[0];
+					T1U = T1K + T1L;
+					T1M = T1K - T1L;
+					T1X = T1P - T1Q;
+					T1R = T1P + T1Q;
+					T1O = W[1];
+					T1N = T1J * T1M;
+					T1T = W[12];
+					T1S = T1J * T1R;
+					T1W = W[13];
+					cr[WS(rs, 1)] = FNMS(T1O, T1R, T1N);
+					T1Y = T1T * T1X;
+					T1V = T1T * T1U;
+					ci[WS(rs, 1)] = FMA(T1O, T1M, T1S);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 7)] = FMA(T1W, T1U, T1Y);
+	       cr[WS(rs, 7)] = FNMS(T1W, T1X, T1V);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, {72, 22, 46, 0} };
+
+void X(codelet_hb_12) (planner *p) {
+     X(khc2hc_register) (p, hb_12, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include hb.h */
+
+/*
+ * This function contains 118 FP additions, 60 FP multiplications,
+ * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
+ * 39 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hb.h"
+
+static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T5, TH, T12, T1M, T1i, T1U, Tg, Tt, T19, T1X, T1p, T1P, Ta, TM, T15;
+	       E T1N, T1l, T1V, Tl, Ty, T1c, T1Y, T1s, T1Q;
+	       {
+		    E T1, TD, T4, T1g, TG, T11, T10, T1h;
+		    T1 = cr[0];
+		    TD = ci[WS(rs, 11)];
+		    {
+			 E T2, T3, TE, TF;
+			 T2 = cr[WS(rs, 4)];
+			 T3 = ci[WS(rs, 3)];
+			 T4 = T2 + T3;
+			 T1g = KP866025403 * (T2 - T3);
+			 TE = ci[WS(rs, 7)];
+			 TF = cr[WS(rs, 8)];
+			 TG = TE - TF;
+			 T11 = KP866025403 * (TE + TF);
+		    }
+		    T5 = T1 + T4;
+		    TH = TD + TG;
+		    T10 = FNMS(KP500000000, T4, T1);
+		    T12 = T10 - T11;
+		    T1M = T10 + T11;
+		    T1h = FNMS(KP500000000, TG, TD);
+		    T1i = T1g + T1h;
+		    T1U = T1h - T1g;
+	       }
+	       {
+		    E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
+		    Tc = cr[WS(rs, 3)];
+		    Tp = ci[WS(rs, 8)];
+		    {
+			 E Td, Te, Tq, Tr;
+			 Td = ci[WS(rs, 4)];
+			 Te = ci[0];
+			 Tf = Td + Te;
+			 T17 = KP866025403 * (Td - Te);
+			 Tq = cr[WS(rs, 7)];
+			 Tr = cr[WS(rs, 11)];
+			 Ts = Tq + Tr;
+			 T1o = KP866025403 * (Tq - Tr);
+		    }
+		    Tg = Tc + Tf;
+		    Tt = Tp - Ts;
+		    T18 = FMA(KP500000000, Ts, Tp);
+		    T19 = T17 + T18;
+		    T1X = T18 - T17;
+		    T1n = FNMS(KP500000000, Tf, Tc);
+		    T1p = T1n + T1o;
+		    T1P = T1n - T1o;
+	       }
+	       {
+		    E T6, TL, T9, T1j, TK, T14, T13, T1k;
+		    T6 = ci[WS(rs, 5)];
+		    TL = cr[WS(rs, 6)];
+		    {
+			 E T7, T8, TI, TJ;
+			 T7 = ci[WS(rs, 1)];
+			 T8 = cr[WS(rs, 2)];
+			 T9 = T7 + T8;
+			 T1j = KP866025403 * (T7 - T8);
+			 TI = ci[WS(rs, 9)];
+			 TJ = cr[WS(rs, 10)];
+			 TK = TI - TJ;
+			 T14 = KP866025403 * (TI + TJ);
+		    }
+		    Ta = T6 + T9;
+		    TM = TK - TL;
+		    T13 = FNMS(KP500000000, T9, T6);
+		    T15 = T13 + T14;
+		    T1N = T13 - T14;
+		    T1k = FMA(KP500000000, TK, TL);
+		    T1l = T1j - T1k;
+		    T1V = T1j + T1k;
+	       }
+	       {
+		    E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
+		    Th = ci[WS(rs, 2)];
+		    Tx = cr[WS(rs, 9)];
+		    {
+			 E Ti, Tj, Tu, Tv;
+			 Ti = cr[WS(rs, 1)];
+			 Tj = cr[WS(rs, 5)];
+			 Tk = Ti + Tj;
+			 T1a = KP866025403 * (Ti - Tj);
+			 Tu = ci[WS(rs, 10)];
+			 Tv = ci[WS(rs, 6)];
+			 Tw = Tu + Tv;
+			 T1r = KP866025403 * (Tv - Tu);
+		    }
+		    Tl = Th + Tk;
+		    Ty = Tw - Tx;
+		    T1b = FMA(KP500000000, Tw, Tx);
+		    T1c = T1a - T1b;
+		    T1Y = T1a + T1b;
+		    T1q = FNMS(KP500000000, Tk, Th);
+		    T1s = T1q + T1r;
+		    T1Q = T1q - T1r;
+	       }
+	       {
+		    E Tb, Tm, TU, TW, TX, TY, TT, TV;
+		    Tb = T5 + Ta;
+		    Tm = Tg + Tl;
+		    TU = Tb - Tm;
+		    TW = TH + TM;
+		    TX = Tt + Ty;
+		    TY = TW - TX;
+		    cr[0] = Tb + Tm;
+		    ci[0] = TW + TX;
+		    TT = W[10];
+		    TV = W[11];
+		    cr[WS(rs, 6)] = FNMS(TV, TY, TT * TU);
+		    ci[WS(rs, 6)] = FMA(TV, TU, TT * TY);
+	       }
+	       {
+		    E TA, TQ, TO, TS;
+		    {
+			 E To, Tz, TC, TN;
+			 To = T5 - Ta;
+			 Tz = Tt - Ty;
+			 TA = To - Tz;
+			 TQ = To + Tz;
+			 TC = Tg - Tl;
+			 TN = TH - TM;
+			 TO = TC + TN;
+			 TS = TN - TC;
+		    }
+		    {
+			 E Tn, TB, TP, TR;
+			 Tn = W[16];
+			 TB = W[17];
+			 cr[WS(rs, 9)] = FNMS(TB, TO, Tn * TA);
+			 ci[WS(rs, 9)] = FMA(Tn, TO, TB * TA);
+			 TP = W[4];
+			 TR = W[5];
+			 cr[WS(rs, 3)] = FNMS(TR, TS, TP * TQ);
+			 ci[WS(rs, 3)] = FMA(TP, TS, TR * TQ);
+		    }
+	       }
+	       {
+		    E T28, T2e, T2c, T2g;
+		    {
+			 E T26, T27, T2a, T2b;
+			 T26 = T1M - T1N;
+			 T27 = T1X + T1Y;
+			 T28 = T26 - T27;
+			 T2e = T26 + T27;
+			 T2a = T1U + T1V;
+			 T2b = T1P - T1Q;
+			 T2c = T2a + T2b;
+			 T2g = T2a - T2b;
+		    }
+		    {
+			 E T25, T29, T2d, T2f;
+			 T25 = W[8];
+			 T29 = W[9];
+			 cr[WS(rs, 5)] = FNMS(T29, T2c, T25 * T28);
+			 ci[WS(rs, 5)] = FMA(T25, T2c, T29 * T28);
+			 T2d = W[20];
+			 T2f = W[21];
+			 cr[WS(rs, 11)] = FNMS(T2f, T2g, T2d * T2e);
+			 ci[WS(rs, 11)] = FMA(T2d, T2g, T2f * T2e);
+		    }
+	       }
+	       {
+		    E T1S, T22, T20, T24;
+		    {
+			 E T1O, T1R, T1W, T1Z;
+			 T1O = T1M + T1N;
+			 T1R = T1P + T1Q;
+			 T1S = T1O - T1R;
+			 T22 = T1O + T1R;
+			 T1W = T1U - T1V;
+			 T1Z = T1X - T1Y;
+			 T20 = T1W - T1Z;
+			 T24 = T1W + T1Z;
+		    }
+		    {
+			 E T1L, T1T, T21, T23;
+			 T1L = W[2];
+			 T1T = W[3];
+			 cr[WS(rs, 2)] = FNMS(T1T, T20, T1L * T1S);
+			 ci[WS(rs, 2)] = FMA(T1T, T1S, T1L * T20);
+			 T21 = W[14];
+			 T23 = W[15];
+			 cr[WS(rs, 8)] = FNMS(T23, T24, T21 * T22);
+			 ci[WS(rs, 8)] = FMA(T23, T22, T21 * T24);
+		    }
+	       }
+	       {
+		    E T1C, T1I, T1G, T1K;
+		    {
+			 E T1A, T1B, T1E, T1F;
+			 T1A = T12 + T15;
+			 T1B = T1p + T1s;
+			 T1C = T1A - T1B;
+			 T1I = T1A + T1B;
+			 T1E = T1i + T1l;
+			 T1F = T19 + T1c;
+			 T1G = T1E - T1F;
+			 T1K = T1E + T1F;
+		    }
+		    {
+			 E T1z, T1D, T1H, T1J;
+			 T1z = W[18];
+			 T1D = W[19];
+			 cr[WS(rs, 10)] = FNMS(T1D, T1G, T1z * T1C);
+			 ci[WS(rs, 10)] = FMA(T1D, T1C, T1z * T1G);
+			 T1H = W[6];
+			 T1J = W[7];
+			 cr[WS(rs, 4)] = FNMS(T1J, T1K, T1H * T1I);
+			 ci[WS(rs, 4)] = FMA(T1J, T1I, T1H * T1K);
+		    }
+	       }
+	       {
+		    E T1e, T1w, T1u, T1y;
+		    {
+			 E T16, T1d, T1m, T1t;
+			 T16 = T12 - T15;
+			 T1d = T19 - T1c;
+			 T1e = T16 - T1d;
+			 T1w = T16 + T1d;
+			 T1m = T1i - T1l;
+			 T1t = T1p - T1s;
+			 T1u = T1m + T1t;
+			 T1y = T1m - T1t;
+		    }
+		    {
+			 E TZ, T1f, T1v, T1x;
+			 TZ = W[0];
+			 T1f = W[1];
+			 cr[WS(rs, 1)] = FNMS(T1f, T1u, TZ * T1e);
+			 ci[WS(rs, 1)] = FMA(TZ, T1u, T1f * T1e);
+			 T1v = W[12];
+			 T1x = W[13];
+			 cr[WS(rs, 7)] = FNMS(T1x, T1y, T1v * T1w);
+			 ci[WS(rs, 7)] = FMA(T1v, T1y, T1x * T1w);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, {88, 30, 30, 0} };
+
+void X(codelet_hb_12) (planner *p) {
+     X(khc2hc_register) (p, hb_12, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:14 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include hb.h */
+
+/*
+ * This function contains 184 FP additions, 140 FP multiplications,
+ * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
+ * 93 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "hb.h"
+
+static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T3v, T3u, T3r, T3w, T3t;
+	       {
+		    E T5, T11, T1C, T2U, T2f, T3f, T19, T18, TS, TH, T14, T16, T3g, T3a, Ts;
+		    E Tv, T37, T3h, T28, T2h, T1M, T21, T2g, T3n, T2X, T1P, T30, T3m, T1J, T2m;
+		    {
+			 E T1, TX, T2, T3, TY, TZ;
+			 T1 = cr[0];
+			 TX = ci[WS(rs, 14)];
+			 T2 = cr[WS(rs, 5)];
+			 T3 = ci[WS(rs, 4)];
+			 TY = ci[WS(rs, 9)];
+			 TZ = cr[WS(rs, 10)];
+			 {
+			      E T1W, T23, T1D, Ta, Tl, T1K, T1Z, T1H, T1G, Tf, TR, T1Y, T26, TI, T1O;
+			      E T1N, Tq, TG, T25, Tx, Ty, Tz, TL, T1E;
+			      {
+				   E Tb, TQ, TN, TO, Te;
+				   {
+					E T6, Th, Ti, Tj, T9, Tc, Td, Tk;
+					{
+					     E T7, T8, T2e, T4;
+					     T6 = cr[WS(rs, 3)];
+					     T2e = T2 - T3;
+					     T4 = T2 + T3;
+					     {
+						  E T1B, T10, T1A, T2d;
+						  T1B = TY + TZ;
+						  T10 = TY - TZ;
+						  T7 = ci[WS(rs, 6)];
+						  T5 = T1 + T4;
+						  T1A = FNMS(KP500000000, T4, T1);
+						  T11 = TX + T10;
+						  T2d = FNMS(KP500000000, T10, TX);
+						  T1C = FNMS(KP866025403, T1B, T1A);
+						  T2U = FMA(KP866025403, T1B, T1A);
+						  T2f = FMA(KP866025403, T2e, T2d);
+						  T3f = FNMS(KP866025403, T2e, T2d);
+						  T8 = ci[WS(rs, 1)];
+					     }
+					     Th = cr[WS(rs, 6)];
+					     Ti = ci[WS(rs, 3)];
+					     Tj = cr[WS(rs, 1)];
+					     T9 = T7 + T8;
+					     T1W = T7 - T8;
+					}
+					Tb = ci[WS(rs, 2)];
+					T23 = Ti - Tj;
+					Tk = Ti + Tj;
+					T1D = FNMS(KP500000000, T9, T6);
+					Ta = T6 + T9;
+					Tc = cr[WS(rs, 2)];
+					Tl = Th + Tk;
+					T1K = FNMS(KP500000000, Tk, Th);
+					Td = cr[WS(rs, 7)];
+					TQ = cr[WS(rs, 12)];
+					TN = ci[WS(rs, 12)];
+					TO = ci[WS(rs, 7)];
+					Te = Tc + Td;
+					T1Z = Tc - Td;
+				   }
+				   {
+					E Tm, TF, TC, TD, Tp, Tn, To, TP, TJ, TK, TE;
+					Tm = ci[WS(rs, 5)];
+					T1H = TO - TN;
+					TP = TN + TO;
+					T1G = FNMS(KP500000000, Te, Tb);
+					Tf = Tb + Te;
+					Tn = ci[0];
+					TR = TP - TQ;
+					T1Y = FMA(KP500000000, TP, TQ);
+					To = cr[WS(rs, 4)];
+					TF = cr[WS(rs, 9)];
+					TC = ci[WS(rs, 10)];
+					TD = cr[WS(rs, 14)];
+					Tp = Tn + To;
+					T26 = Tn - To;
+					TI = ci[WS(rs, 11)];
+					T1O = TC + TD;
+					TE = TC - TD;
+					T1N = FNMS(KP500000000, Tp, Tm);
+					Tq = Tm + Tp;
+					TJ = cr[WS(rs, 8)];
+					TG = TE - TF;
+					T25 = FMA(KP500000000, TE, TF);
+					TK = cr[WS(rs, 13)];
+					Tx = ci[WS(rs, 8)];
+					Ty = ci[WS(rs, 13)];
+					Tz = cr[WS(rs, 11)];
+					TL = TJ + TK;
+					T1E = TJ - TK;
+				   }
+			      }
+			      {
+				   E Tg, T1L, Tr, T22, T12, T1X, T38, T13, T39, T20;
+				   {
+					E TA, T1V, TM, TB;
+					Tg = Ta + Tf;
+					T19 = Ta - Tf;
+					T1L = Ty + Tz;
+					TA = Ty - Tz;
+					T1V = FMA(KP500000000, TL, TI);
+					TM = TI - TL;
+					T18 = Tl - Tq;
+					Tr = Tl + Tq;
+					TB = Tx + TA;
+					T22 = FNMS(KP500000000, TA, Tx);
+					T12 = TM + TR;
+					TS = TM - TR;
+					T1X = FMA(KP866025403, T1W, T1V);
+					T38 = FNMS(KP866025403, T1W, T1V);
+					T13 = TB + TG;
+					TH = TB - TG;
+					T39 = FMA(KP866025403, T1Z, T1Y);
+					T20 = FNMS(KP866025403, T1Z, T1Y);
+				   }
+				   {
+					E T35, T24, T27, T36;
+					T14 = T12 + T13;
+					T16 = T12 - T13;
+					T3g = T38 - T39;
+					T3a = T38 + T39;
+					T35 = FNMS(KP866025403, T23, T22);
+					T24 = FMA(KP866025403, T23, T22);
+					Ts = Tg + Tr;
+					Tv = Tg - Tr;
+					T27 = FNMS(KP866025403, T26, T25);
+					T36 = FMA(KP866025403, T26, T25);
+					T37 = T35 + T36;
+					T3h = T35 - T36;
+					T28 = T24 + T27;
+					T2h = T24 - T27;
+					{
+					     E T1F, T1I, T2Y, T2Z, T2V, T2W;
+					     T2V = FNMS(KP866025403, T1E, T1D);
+					     T1F = FMA(KP866025403, T1E, T1D);
+					     T1I = FMA(KP866025403, T1H, T1G);
+					     T2W = FNMS(KP866025403, T1H, T1G);
+					     T2Y = FNMS(KP866025403, T1L, T1K);
+					     T1M = FMA(KP866025403, T1L, T1K);
+					     T21 = T1X + T20;
+					     T2g = T1X - T20;
+					     T3n = T2V - T2W;
+					     T2X = T2V + T2W;
+					     T2Z = FNMS(KP866025403, T1O, T1N);
+					     T1P = FMA(KP866025403, T1O, T1N);
+					     T30 = T2Y + T2Z;
+					     T3m = T2Y - T2Z;
+					     T1J = T1F + T1I;
+					     T2m = T1F - T1I;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T31, T33, T2n, T1Q;
+			 cr[0] = T5 + Ts;
+			 T31 = T2X + T30;
+			 T33 = T2X - T30;
+			 T2n = T1M - T1P;
+			 T1Q = T1M + T1P;
+			 ci[0] = T11 + T14;
+			 {
+			      E T1T, T1R, T1r, T1o, T1n;
+			      {
+				   E T1q, T1a, TT, T1l, Tu, T17, T1p, T15;
+				   T1q = FMA(KP618033988, T18, T19);
+				   T1a = FNMS(KP618033988, T19, T18);
+				   T1T = T1J - T1Q;
+				   T1R = T1J + T1Q;
+				   T15 = FNMS(KP250000000, T14, T11);
+				   TT = FNMS(KP618033988, TS, TH);
+				   T1l = FMA(KP618033988, TH, TS);
+				   Tu = FNMS(KP250000000, Ts, T5);
+				   T17 = FNMS(KP559016994, T16, T15);
+				   T1p = FMA(KP559016994, T16, T15);
+				   {
+					E T1h, T1m, T1e, T1x, T1w, T1v, T1g, T1d;
+					{
+					     E TW, T1b, Tt, T1u, TU, T1k, Tw;
+					     TW = W[5];
+					     T1k = FMA(KP559016994, Tv, Tu);
+					     Tw = FNMS(KP559016994, Tv, Tu);
+					     T1b = FMA(KP951056516, T1a, T17);
+					     T1h = FNMS(KP951056516, T1a, T17);
+					     Tt = W[4];
+					     T1m = FNMS(KP951056516, T1l, T1k);
+					     T1u = FMA(KP951056516, T1l, T1k);
+					     T1e = FMA(KP951056516, TT, Tw);
+					     TU = FNMS(KP951056516, TT, Tw);
+					     {
+						  E T1t, TV, T1c, T1y;
+						  T1x = FNMS(KP951056516, T1q, T1p);
+						  T1r = FMA(KP951056516, T1q, T1p);
+						  T1w = W[17];
+						  T1t = W[16];
+						  TV = Tt * TU;
+						  T1c = TW * TU;
+						  T1y = T1w * T1u;
+						  T1v = T1t * T1u;
+						  cr[WS(rs, 3)] = FNMS(TW, T1b, TV);
+						  ci[WS(rs, 3)] = FMA(Tt, T1b, T1c);
+						  ci[WS(rs, 9)] = FMA(T1t, T1x, T1y);
+					     }
+					}
+					cr[WS(rs, 9)] = FNMS(T1w, T1x, T1v);
+					T1g = W[23];
+					T1d = W[22];
+					{
+					     E T1j, T1s, T1i, T1f;
+					     T1o = W[11];
+					     T1i = T1g * T1e;
+					     T1f = T1d * T1e;
+					     T1j = W[10];
+					     T1s = T1o * T1m;
+					     ci[WS(rs, 12)] = FMA(T1d, T1h, T1i);
+					     cr[WS(rs, 12)] = FNMS(T1g, T1h, T1f);
+					     T1n = T1j * T1m;
+					     ci[WS(rs, 6)] = FMA(T1j, T1r, T1s);
+					}
+				   }
+			      }
+			      {
+				   E T2v, T2u, T2r, T2w, T2t;
+				   {
+					E T1S, T2N, T2o, T2E, T2Q, T2P, T2k, T2S, T29, T2z, T2R, T2j, T2O, T2i;
+					cr[WS(rs, 6)] = FNMS(T1o, T1r, T1n);
+					T1S = FNMS(KP250000000, T1R, T1C);
+					T2O = T1C + T1R;
+					T2N = W[18];
+					T2o = FMA(KP618033988, T2n, T2m);
+					T2E = FNMS(KP618033988, T2m, T2n);
+					T2Q = W[19];
+					T2P = T2N * T2O;
+					T2i = T2g + T2h;
+					T2k = T2g - T2h;
+					T2S = T2Q * T2O;
+					T29 = FMA(KP618033988, T28, T21);
+					T2z = FNMS(KP618033988, T21, T28);
+					T2R = T2f + T2i;
+					T2j = FNMS(KP250000000, T2i, T2f);
+					{
+					     E T2D, T2p, T2I, T2A, T2a, T2s, T2c, T1z, T2l, T1U, T2y;
+					     cr[WS(rs, 10)] = FNMS(T2Q, T2R, T2P);
+					     T2l = FMA(KP559016994, T2k, T2j);
+					     T2D = FNMS(KP559016994, T2k, T2j);
+					     T1U = FMA(KP559016994, T1T, T1S);
+					     T2y = FNMS(KP559016994, T1T, T1S);
+					     ci[WS(rs, 10)] = FMA(T2N, T2R, T2S);
+					     T2p = FMA(KP951056516, T2o, T2l);
+					     T2v = FNMS(KP951056516, T2o, T2l);
+					     T2I = FNMS(KP951056516, T2z, T2y);
+					     T2A = FMA(KP951056516, T2z, T2y);
+					     T2a = FNMS(KP951056516, T29, T1U);
+					     T2s = FMA(KP951056516, T29, T1U);
+					     T2c = W[1];
+					     T1z = W[0];
+					     {
+						  E T2F, T2L, T2K, T2J;
+						  {
+						       E T2H, T2M, T2q, T2b;
+						       T2F = FNMS(KP951056516, T2E, T2D);
+						       T2L = FMA(KP951056516, T2E, T2D);
+						       T2K = W[25];
+						       T2q = T2c * T2a;
+						       T2b = T1z * T2a;
+						       T2H = W[24];
+						       T2M = T2K * T2I;
+						       ci[WS(rs, 1)] = FMA(T1z, T2p, T2q);
+						       cr[WS(rs, 1)] = FNMS(T2c, T2p, T2b);
+						       T2J = T2H * T2I;
+						       ci[WS(rs, 13)] = FMA(T2H, T2L, T2M);
+						  }
+						  {
+						       E T2C, T2x, T2G, T2B;
+						       T2C = W[13];
+						       cr[WS(rs, 13)] = FNMS(T2K, T2L, T2J);
+						       T2x = W[12];
+						       T2G = T2C * T2A;
+						       T2u = W[7];
+						       T2B = T2x * T2A;
+						       T2r = W[6];
+						       ci[WS(rs, 7)] = FMA(T2x, T2F, T2G);
+						       T2w = T2u * T2s;
+						       cr[WS(rs, 7)] = FNMS(T2C, T2F, T2B);
+						       T2t = T2r * T2s;
+						  }
+					     }
+					}
+				   }
+				   {
+					E T32, T3N, T3E, T3o, T3Q, T3P, T3k, T3S, T3z, T3b, T3j, T3R, T3O, T3i;
+					ci[WS(rs, 4)] = FMA(T2r, T2v, T2w);
+					cr[WS(rs, 4)] = FNMS(T2u, T2v, T2t);
+					T3O = T2U + T31;
+					T32 = FNMS(KP250000000, T31, T2U);
+					T3N = W[8];
+					T3E = FMA(KP618033988, T3m, T3n);
+					T3o = FNMS(KP618033988, T3n, T3m);
+					T3Q = W[9];
+					T3P = T3N * T3O;
+					T3k = T3g - T3h;
+					T3i = T3g + T3h;
+					T3S = T3Q * T3O;
+					T3z = FMA(KP618033988, T37, T3a);
+					T3b = FNMS(KP618033988, T3a, T37);
+					T3j = FNMS(KP250000000, T3i, T3f);
+					T3R = T3f + T3i;
+					{
+					     E T3D, T3p, T3A, T3I, T3s, T3c, T3e, T2T, T3l, T3y, T34;
+					     cr[WS(rs, 5)] = FNMS(T3Q, T3R, T3P);
+					     T3D = FMA(KP559016994, T3k, T3j);
+					     T3l = FNMS(KP559016994, T3k, T3j);
+					     T3y = FMA(KP559016994, T33, T32);
+					     T34 = FNMS(KP559016994, T33, T32);
+					     ci[WS(rs, 5)] = FMA(T3N, T3R, T3S);
+					     T3v = FMA(KP951056516, T3o, T3l);
+					     T3p = FNMS(KP951056516, T3o, T3l);
+					     T3A = FNMS(KP951056516, T3z, T3y);
+					     T3I = FMA(KP951056516, T3z, T3y);
+					     T3s = FNMS(KP951056516, T3b, T34);
+					     T3c = FMA(KP951056516, T3b, T34);
+					     T3e = W[3];
+					     T2T = W[2];
+					     {
+						  E T3L, T3F, T3K, T3J;
+						  {
+						       E T3H, T3M, T3q, T3d;
+						       T3L = FNMS(KP951056516, T3E, T3D);
+						       T3F = FMA(KP951056516, T3E, T3D);
+						       T3K = W[27];
+						       T3q = T3e * T3c;
+						       T3d = T2T * T3c;
+						       T3H = W[26];
+						       T3M = T3K * T3I;
+						       ci[WS(rs, 2)] = FMA(T2T, T3p, T3q);
+						       cr[WS(rs, 2)] = FNMS(T3e, T3p, T3d);
+						       T3J = T3H * T3I;
+						       ci[WS(rs, 14)] = FMA(T3H, T3L, T3M);
+						  }
+						  {
+						       E T3C, T3x, T3G, T3B;
+						       T3C = W[21];
+						       cr[WS(rs, 14)] = FNMS(T3K, T3L, T3J);
+						       T3x = W[20];
+						       T3G = T3C * T3A;
+						       T3u = W[15];
+						       T3B = T3x * T3A;
+						       T3r = W[14];
+						       ci[WS(rs, 11)] = FMA(T3x, T3F, T3G);
+						       T3w = T3u * T3s;
+						       cr[WS(rs, 11)] = FNMS(T3C, T3F, T3B);
+						       T3t = T3r * T3s;
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 8)] = FMA(T3r, T3v, T3w);
+	       cr[WS(rs, 8)] = FNMS(T3u, T3v, T3t);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {72, 28, 112, 0} };
+
+void X(codelet_hb_15) (planner *p) {
+     X(khc2hc_register) (p, hb_15, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include hb.h */
+
+/*
+ * This function contains 184 FP additions, 112 FP multiplications,
+ * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
+ * 75 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "hb.h"
+
+static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T5, T10, T1J, T2C, T2c, T2M, TH, T18, T17, TS, T2Q, T2R, T2S, Tg, Tr;
+	       E Ts, T11, T12, T13, T2N, T2O, T2P, T1u, T1x, T1y, T1W, T1Z, T28, T1P, T1S;
+	       E T27, T1B, T1E, T1F, T2G, T2H, T2I, T2D, T2E, T2F;
+	       {
+		    E T1, TW, T4, T2a, TZ, T1I, T1H, T2b;
+		    T1 = cr[0];
+		    TW = ci[WS(rs, 14)];
+		    {
+			 E T2, T3, TX, TY;
+			 T2 = cr[WS(rs, 5)];
+			 T3 = ci[WS(rs, 4)];
+			 T4 = T2 + T3;
+			 T2a = KP866025403 * (T2 - T3);
+			 TX = ci[WS(rs, 9)];
+			 TY = cr[WS(rs, 10)];
+			 TZ = TX - TY;
+			 T1I = KP866025403 * (TX + TY);
+		    }
+		    T5 = T1 + T4;
+		    T10 = TW + TZ;
+		    T1H = FNMS(KP500000000, T4, T1);
+		    T1J = T1H - T1I;
+		    T2C = T1H + T1I;
+		    T2b = FNMS(KP500000000, TZ, TW);
+		    T2c = T2a + T2b;
+		    T2M = T2b - T2a;
+	       }
+	       {
+		    E Ta, T1N, T1s, Tl, T1U, T1z, Tf, T1Q, T1v, TG, T1R, T1w, Tq, T1X, T1C;
+		    E TM, T1V, T1A, TB, T1O, T1t, TR, T1Y, T1D;
+		    {
+			 E T6, T7, T8, T9;
+			 T6 = cr[WS(rs, 3)];
+			 T7 = ci[WS(rs, 6)];
+			 T8 = ci[WS(rs, 1)];
+			 T9 = T7 + T8;
+			 Ta = T6 + T9;
+			 T1N = KP866025403 * (T7 - T8);
+			 T1s = FNMS(KP500000000, T9, T6);
+		    }
+		    {
+			 E Th, Ti, Tj, Tk;
+			 Th = cr[WS(rs, 6)];
+			 Ti = ci[WS(rs, 3)];
+			 Tj = cr[WS(rs, 1)];
+			 Tk = Ti + Tj;
+			 Tl = Th + Tk;
+			 T1U = KP866025403 * (Ti - Tj);
+			 T1z = FNMS(KP500000000, Tk, Th);
+		    }
+		    {
+			 E Tb, Tc, Td, Te;
+			 Tb = ci[WS(rs, 2)];
+			 Tc = cr[WS(rs, 2)];
+			 Td = cr[WS(rs, 7)];
+			 Te = Tc + Td;
+			 Tf = Tb + Te;
+			 T1Q = KP866025403 * (Tc - Td);
+			 T1v = FNMS(KP500000000, Te, Tb);
+		    }
+		    {
+			 E TF, TC, TD, TE;
+			 TF = cr[WS(rs, 12)];
+			 TC = ci[WS(rs, 12)];
+			 TD = ci[WS(rs, 7)];
+			 TE = TC + TD;
+			 TG = TE - TF;
+			 T1R = FMA(KP500000000, TE, TF);
+			 T1w = KP866025403 * (TD - TC);
+		    }
+		    {
+			 E Tm, Tn, To, Tp;
+			 Tm = ci[WS(rs, 5)];
+			 Tn = ci[0];
+			 To = cr[WS(rs, 4)];
+			 Tp = Tn + To;
+			 Tq = Tm + Tp;
+			 T1X = KP866025403 * (Tn - To);
+			 T1C = FNMS(KP500000000, Tp, Tm);
+		    }
+		    {
+			 E TI, TJ, TK, TL;
+			 TI = ci[WS(rs, 8)];
+			 TJ = ci[WS(rs, 13)];
+			 TK = cr[WS(rs, 11)];
+			 TL = TJ - TK;
+			 TM = TI + TL;
+			 T1V = FNMS(KP500000000, TL, TI);
+			 T1A = KP866025403 * (TJ + TK);
+		    }
+		    {
+			 E Tx, Ty, Tz, TA;
+			 Tx = ci[WS(rs, 11)];
+			 Ty = cr[WS(rs, 8)];
+			 Tz = cr[WS(rs, 13)];
+			 TA = Ty + Tz;
+			 TB = Tx - TA;
+			 T1O = FMA(KP500000000, TA, Tx);
+			 T1t = KP866025403 * (Ty - Tz);
+		    }
+		    {
+			 E TQ, TN, TO, TP;
+			 TQ = cr[WS(rs, 9)];
+			 TN = ci[WS(rs, 10)];
+			 TO = cr[WS(rs, 14)];
+			 TP = TN - TO;
+			 TR = TP - TQ;
+			 T1Y = FMA(KP500000000, TP, TQ);
+			 T1D = KP866025403 * (TN + TO);
+		    }
+		    TH = TB - TG;
+		    T18 = Tl - Tq;
+		    T17 = Ta - Tf;
+		    TS = TM - TR;
+		    T2Q = T1V - T1U;
+		    T2R = T1X + T1Y;
+		    T2S = T2Q - T2R;
+		    Tg = Ta + Tf;
+		    Tr = Tl + Tq;
+		    Ts = Tg + Tr;
+		    T11 = TB + TG;
+		    T12 = TM + TR;
+		    T13 = T11 + T12;
+		    T2N = T1O - T1N;
+		    T2O = T1Q + T1R;
+		    T2P = T2N - T2O;
+		    T1u = T1s + T1t;
+		    T1x = T1v + T1w;
+		    T1y = T1u + T1x;
+		    T1W = T1U + T1V;
+		    T1Z = T1X - T1Y;
+		    T28 = T1W + T1Z;
+		    T1P = T1N + T1O;
+		    T1S = T1Q - T1R;
+		    T27 = T1P + T1S;
+		    T1B = T1z + T1A;
+		    T1E = T1C + T1D;
+		    T1F = T1B + T1E;
+		    T2G = T1z - T1A;
+		    T2H = T1C - T1D;
+		    T2I = T2G + T2H;
+		    T2D = T1s - T1t;
+		    T2E = T1v - T1w;
+		    T2F = T2D + T2E;
+	       }
+	       cr[0] = T5 + Ts;
+	       ci[0] = T10 + T13;
+	       {
+		    E TT, T19, T1k, T1h, T16, T1l, Tw, T1g;
+		    TT = FNMS(KP951056516, TS, KP587785252 * TH);
+		    T19 = FNMS(KP951056516, T18, KP587785252 * T17);
+		    T1k = FMA(KP951056516, T17, KP587785252 * T18);
+		    T1h = FMA(KP951056516, TH, KP587785252 * TS);
+		    {
+			 E T14, T15, Tu, Tv;
+			 T14 = FNMS(KP250000000, T13, T10);
+			 T15 = KP559016994 * (T11 - T12);
+			 T16 = T14 - T15;
+			 T1l = T15 + T14;
+			 Tu = FNMS(KP250000000, Ts, T5);
+			 Tv = KP559016994 * (Tg - Tr);
+			 Tw = Tu - Tv;
+			 T1g = Tv + Tu;
+		    }
+		    {
+			 E TU, T1a, Tt, TV;
+			 TU = Tw + TT;
+			 T1a = T16 - T19;
+			 Tt = W[4];
+			 TV = W[5];
+			 cr[WS(rs, 3)] = FNMS(TV, T1a, Tt * TU);
+			 ci[WS(rs, 3)] = FMA(TV, TU, Tt * T1a);
+		    }
+		    {
+			 E T1o, T1q, T1n, T1p;
+			 T1o = T1g + T1h;
+			 T1q = T1l - T1k;
+			 T1n = W[16];
+			 T1p = W[17];
+			 cr[WS(rs, 9)] = FNMS(T1p, T1q, T1n * T1o);
+			 ci[WS(rs, 9)] = FMA(T1p, T1o, T1n * T1q);
+		    }
+		    {
+			 E T1c, T1e, T1b, T1d;
+			 T1c = Tw - TT;
+			 T1e = T19 + T16;
+			 T1b = W[22];
+			 T1d = W[23];
+			 cr[WS(rs, 12)] = FNMS(T1d, T1e, T1b * T1c);
+			 ci[WS(rs, 12)] = FMA(T1d, T1c, T1b * T1e);
+		    }
+		    {
+			 E T1i, T1m, T1f, T1j;
+			 T1i = T1g - T1h;
+			 T1m = T1k + T1l;
+			 T1f = W[10];
+			 T1j = W[11];
+			 cr[WS(rs, 6)] = FNMS(T1j, T1m, T1f * T1i);
+			 ci[WS(rs, 6)] = FMA(T1j, T1i, T1f * T1m);
+		    }
+	       }
+	       {
+		    E T21, T2n, T26, T2q, T1M, T2y, T2m, T2f, T2A, T2r, T2x, T2z;
+		    {
+			 E T1T, T20, T24, T25;
+			 T1T = T1P - T1S;
+			 T20 = T1W - T1Z;
+			 T21 = FMA(KP951056516, T1T, KP587785252 * T20);
+			 T2n = FNMS(KP951056516, T20, KP587785252 * T1T);
+			 T24 = T1u - T1x;
+			 T25 = T1B - T1E;
+			 T26 = FMA(KP951056516, T24, KP587785252 * T25);
+			 T2q = FNMS(KP951056516, T25, KP587785252 * T24);
+		    }
+		    {
+			 E T1G, T1K, T1L, T29, T2d, T2e;
+			 T1G = KP559016994 * (T1y - T1F);
+			 T1K = T1y + T1F;
+			 T1L = FNMS(KP250000000, T1K, T1J);
+			 T1M = T1G + T1L;
+			 T2y = T1J + T1K;
+			 T2m = T1L - T1G;
+			 T29 = KP559016994 * (T27 - T28);
+			 T2d = T27 + T28;
+			 T2e = FNMS(KP250000000, T2d, T2c);
+			 T2f = T29 + T2e;
+			 T2A = T2c + T2d;
+			 T2r = T2e - T29;
+		    }
+		    T2x = W[18];
+		    T2z = W[19];
+		    cr[WS(rs, 10)] = FNMS(T2z, T2A, T2x * T2y);
+		    ci[WS(rs, 10)] = FMA(T2z, T2y, T2x * T2A);
+		    {
+			 E T2u, T2w, T2t, T2v;
+			 T2u = T2m + T2n;
+			 T2w = T2r - T2q;
+			 T2t = W[24];
+			 T2v = W[25];
+			 cr[WS(rs, 13)] = FNMS(T2v, T2w, T2t * T2u);
+			 ci[WS(rs, 13)] = FMA(T2v, T2u, T2t * T2w);
+		    }
+		    {
+			 E T22, T2g, T1r, T23;
+			 T22 = T1M - T21;
+			 T2g = T26 + T2f;
+			 T1r = W[0];
+			 T23 = W[1];
+			 cr[WS(rs, 1)] = FNMS(T23, T2g, T1r * T22);
+			 ci[WS(rs, 1)] = FMA(T23, T22, T1r * T2g);
+		    }
+		    {
+			 E T2i, T2k, T2h, T2j;
+			 T2i = T1M + T21;
+			 T2k = T2f - T26;
+			 T2h = W[6];
+			 T2j = W[7];
+			 cr[WS(rs, 4)] = FNMS(T2j, T2k, T2h * T2i);
+			 ci[WS(rs, 4)] = FMA(T2j, T2i, T2h * T2k);
+		    }
+		    {
+			 E T2o, T2s, T2l, T2p;
+			 T2o = T2m - T2n;
+			 T2s = T2q + T2r;
+			 T2l = W[12];
+			 T2p = W[13];
+			 cr[WS(rs, 7)] = FNMS(T2p, T2s, T2l * T2o);
+			 ci[WS(rs, 7)] = FMA(T2p, T2o, T2l * T2s);
+		    }
+	       }
+	       {
+		    E T31, T3h, T36, T3k, T2K, T3g, T2Y, T2U, T3l, T39, T2B, T2L;
+		    {
+			 E T2Z, T30, T34, T35;
+			 T2Z = T2N + T2O;
+			 T30 = T2Q + T2R;
+			 T31 = FNMS(KP951056516, T30, KP587785252 * T2Z);
+			 T3h = FMA(KP951056516, T2Z, KP587785252 * T30);
+			 T34 = T2D - T2E;
+			 T35 = T2G - T2H;
+			 T36 = FNMS(KP951056516, T35, KP587785252 * T34);
+			 T3k = FMA(KP951056516, T34, KP587785252 * T35);
+		    }
+		    {
+			 E T2X, T2J, T2W, T38, T2T, T37;
+			 T2X = KP559016994 * (T2F - T2I);
+			 T2J = T2F + T2I;
+			 T2W = FNMS(KP250000000, T2J, T2C);
+			 T2K = T2C + T2J;
+			 T3g = T2X + T2W;
+			 T2Y = T2W - T2X;
+			 T38 = KP559016994 * (T2P - T2S);
+			 T2T = T2P + T2S;
+			 T37 = FNMS(KP250000000, T2T, T2M);
+			 T2U = T2M + T2T;
+			 T3l = T38 + T37;
+			 T39 = T37 - T38;
+		    }
+		    T2B = W[8];
+		    T2L = W[9];
+		    cr[WS(rs, 5)] = FNMS(T2L, T2U, T2B * T2K);
+		    ci[WS(rs, 5)] = FMA(T2L, T2K, T2B * T2U);
+		    {
+			 E T3o, T3q, T3n, T3p;
+			 T3o = T3g + T3h;
+			 T3q = T3l - T3k;
+			 T3n = W[26];
+			 T3p = W[27];
+			 cr[WS(rs, 14)] = FNMS(T3p, T3q, T3n * T3o);
+			 ci[WS(rs, 14)] = FMA(T3n, T3q, T3p * T3o);
+		    }
+		    {
+			 E T32, T3a, T2V, T33;
+			 T32 = T2Y - T31;
+			 T3a = T36 + T39;
+			 T2V = W[2];
+			 T33 = W[3];
+			 cr[WS(rs, 2)] = FNMS(T33, T3a, T2V * T32);
+			 ci[WS(rs, 2)] = FMA(T2V, T3a, T33 * T32);
+		    }
+		    {
+			 E T3c, T3e, T3b, T3d;
+			 T3c = T2Y + T31;
+			 T3e = T39 - T36;
+			 T3b = W[14];
+			 T3d = W[15];
+			 cr[WS(rs, 8)] = FNMS(T3d, T3e, T3b * T3c);
+			 ci[WS(rs, 8)] = FMA(T3b, T3e, T3d * T3c);
+		    }
+		    {
+			 E T3i, T3m, T3f, T3j;
+			 T3i = T3g - T3h;
+			 T3m = T3k + T3l;
+			 T3f = W[20];
+			 T3j = W[21];
+			 cr[WS(rs, 11)] = FNMS(T3j, T3m, T3f * T3i);
+			 ci[WS(rs, 11)] = FMA(T3f, T3m, T3j * T3i);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {128, 56, 56, 0} };
+
+void X(codelet_hb_15) (planner *p) {
+     X(khc2hc_register) (p, hb_15, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:14 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 78 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hb.h"
+
+static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1I, T1L, T1K, T1M, T1J;
+	       {
+		    E T1O, TA, T1h, T21, T3b, T2T, T3D, T3r, T1k, T1P, T3y, Tf, T36, T2A, T22;
+		    E TL, T3z, T3u, T2U, T2F, T2K, T2V, T12, Tu, T3E, TX, T1n, T17, T1T, T24;
+		    E T1W, T25;
+		    {
+			 E T2z, TF, TK, T2w;
+			 {
+			      E Tw, T3, T2x, TJ, T2Q, T1g, T1d, T6, TC, TB, Ta, T2R, Tz, TD, Tb;
+			      E Tc;
+			      {
+				   E T1e, T1f, T4, T5;
+				   {
+					E T1, T2, TH, TI;
+					T1 = cr[0];
+					T2 = ci[WS(rs, 7)];
+					TH = ci[WS(rs, 9)];
+					TI = cr[WS(rs, 14)];
+					T1e = ci[WS(rs, 15)];
+					Tw = T1 - T2;
+					T3 = T1 + T2;
+					T2x = TH - TI;
+					TJ = TH + TI;
+					T1f = cr[WS(rs, 8)];
+					T4 = cr[WS(rs, 4)];
+					T5 = ci[WS(rs, 3)];
+				   }
+				   {
+					E T8, T9, Tx, Ty;
+					T8 = cr[WS(rs, 2)];
+					T2Q = T1e - T1f;
+					T1g = T1e + T1f;
+					T1d = T4 - T5;
+					T6 = T4 + T5;
+					T9 = ci[WS(rs, 5)];
+					Tx = ci[WS(rs, 11)];
+					Ty = cr[WS(rs, 12)];
+					TC = ci[WS(rs, 13)];
+					TB = T8 - T9;
+					Ta = T8 + T9;
+					T2R = Tx - Ty;
+					Tz = Tx + Ty;
+					TD = cr[WS(rs, 10)];
+					Tb = ci[WS(rs, 1)];
+					Tc = cr[WS(rs, 6)];
+				   }
+			      }
+			      {
+				   E T2y, TE, TG, Te, T2P, T2S, T3p, Td;
+				   T1O = Tw + Tz;
+				   TA = Tw - Tz;
+				   T2y = TC - TD;
+				   TE = TC + TD;
+				   TG = Tb - Tc;
+				   Td = Tb + Tc;
+				   T1h = T1d + T1g;
+				   T21 = T1g - T1d;
+				   Te = Ta + Td;
+				   T2P = Ta - Td;
+				   T2S = T2Q - T2R;
+				   T3p = T2Q + T2R;
+				   {
+					E T1i, T1j, T3q, T7;
+					T3q = T2y + T2x;
+					T2z = T2x - T2y;
+					TF = TB - TE;
+					T1i = TB + TE;
+					T3b = T2S - T2P;
+					T2T = T2P + T2S;
+					TK = TG - TJ;
+					T1j = TG + TJ;
+					T3D = T3p - T3q;
+					T3r = T3p + T3q;
+					T2w = T3 - T6;
+					T7 = T3 + T6;
+					T1k = T1i - T1j;
+					T1P = T1i + T1j;
+					T3y = T7 - Te;
+					Tf = T7 + Te;
+				   }
+			      }
+			 }
+			 {
+			      E T13, Ti, T2C, T11, T2D, T16, TY, Tl, TT, TS, Tp, T2H, TQ, TU, Tq;
+			      E Tr;
+			      {
+				   E T14, T15, Tj, Tk;
+				   {
+					E Tg, Th, TZ, T10;
+					Tg = cr[WS(rs, 1)];
+					T36 = T2w - T2z;
+					T2A = T2w + T2z;
+					T22 = TF - TK;
+					TL = TF + TK;
+					Th = ci[WS(rs, 6)];
+					TZ = ci[WS(rs, 14)];
+					T10 = cr[WS(rs, 9)];
+					T14 = ci[WS(rs, 10)];
+					T13 = Tg - Th;
+					Ti = Tg + Th;
+					T2C = TZ - T10;
+					T11 = TZ + T10;
+					T15 = cr[WS(rs, 13)];
+					Tj = cr[WS(rs, 5)];
+					Tk = ci[WS(rs, 2)];
+				   }
+				   {
+					E Tn, To, TO, TP;
+					Tn = ci[0];
+					T2D = T14 - T15;
+					T16 = T14 + T15;
+					TY = Tj - Tk;
+					Tl = Tj + Tk;
+					To = cr[WS(rs, 7)];
+					TO = ci[WS(rs, 8)];
+					TP = cr[WS(rs, 15)];
+					TT = ci[WS(rs, 12)];
+					TS = Tn - To;
+					Tp = Tn + To;
+					T2H = TO - TP;
+					TQ = TO + TP;
+					TU = cr[WS(rs, 11)];
+					Tq = cr[WS(rs, 3)];
+					Tr = ci[WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E TV, TN, Tm, Tt;
+				   {
+					E T2E, T3s, Ts, T3t, T2J, T2B, T2I, T2G;
+					T2E = T2C - T2D;
+					T3s = T2C + T2D;
+					T2I = TT - TU;
+					TV = TT + TU;
+					TN = Tq - Tr;
+					Ts = Tq + Tr;
+					T3t = T2H + T2I;
+					T2J = T2H - T2I;
+					Tm = Ti + Tl;
+					T2B = Ti - Tl;
+					Tt = Tp + Ts;
+					T2G = Tp - Ts;
+					T3z = T3t - T3s;
+					T3u = T3s + T3t;
+					T2U = T2B + T2E;
+					T2F = T2B - T2E;
+					T2K = T2G + T2J;
+					T2V = T2J - T2G;
+				   }
+				   {
+					E T1U, T1V, T1R, T1S, TR, TW;
+					TR = TN - TQ;
+					T1U = TN + TQ;
+					T1V = TS + TV;
+					TW = TS - TV;
+					T1R = T11 - TY;
+					T12 = TY + T11;
+					Tu = Tm + Tt;
+					T3E = Tm - Tt;
+					TX = FNMS(KP414213562, TW, TR);
+					T1n = FMA(KP414213562, TR, TW);
+					T17 = T13 - T16;
+					T1S = T13 + T16;
+					T1T = FNMS(KP414213562, T1S, T1R);
+					T24 = FMA(KP414213562, T1R, T1S);
+					T1W = FNMS(KP414213562, T1V, T1U);
+					T25 = FMA(KP414213562, T1U, T1V);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T18, T1m, T2W, T2L, T3j, T3i, T3h;
+			 {
+			      E T3m, T3v, T3l, T3o;
+			      cr[0] = Tf + Tu;
+			      T18 = FMA(KP414213562, T17, T12);
+			      T1m = FNMS(KP414213562, T12, T17);
+			      T3m = Tf - Tu;
+			      T3v = T3r - T3u;
+			      T3l = W[14];
+			      T3o = W[15];
+			      ci[0] = T3r + T3u;
+			      {
+				   E T3A, T3I, T3L, T3F, T3C, T3G, T3B, T3x, T3n, T3w, T3H, T3K;
+				   T3A = T3y - T3z;
+				   T3I = T3y + T3z;
+				   T3n = T3l * T3m;
+				   T3w = T3o * T3m;
+				   T3L = T3E + T3D;
+				   T3F = T3D - T3E;
+				   T3x = W[22];
+				   cr[WS(rs, 8)] = FNMS(T3o, T3v, T3n);
+				   ci[WS(rs, 8)] = FMA(T3l, T3v, T3w);
+				   T3C = W[23];
+				   T3G = T3x * T3F;
+				   T3B = T3x * T3A;
+				   ci[WS(rs, 12)] = FMA(T3C, T3A, T3G);
+				   cr[WS(rs, 12)] = FNMS(T3C, T3F, T3B);
+				   T3H = W[6];
+				   T3K = W[7];
+				   {
+					E T3g, T38, T3d, T35, T3a;
+					{
+					     E T37, T3c, T3M, T3J;
+					     T37 = T2V - T2U;
+					     T2W = T2U + T2V;
+					     T2L = T2F + T2K;
+					     T3c = T2F - T2K;
+					     T3M = T3H * T3L;
+					     T3J = T3H * T3I;
+					     T3g = FMA(KP707106781, T37, T36);
+					     T38 = FNMS(KP707106781, T37, T36);
+					     ci[WS(rs, 4)] = FMA(T3K, T3I, T3M);
+					     cr[WS(rs, 4)] = FNMS(T3K, T3L, T3J);
+					     T3d = FNMS(KP707106781, T3c, T3b);
+					     T3j = FMA(KP707106781, T3c, T3b);
+					}
+					T35 = W[26];
+					T3a = W[27];
+					{
+					     E T3f, T3e, T39, T3k;
+					     T3f = W[10];
+					     T3i = W[11];
+					     T3e = T35 * T3d;
+					     T39 = T35 * T38;
+					     T3k = T3f * T3j;
+					     T3h = T3f * T3g;
+					     ci[WS(rs, 14)] = FMA(T3a, T38, T3e);
+					     cr[WS(rs, 14)] = FNMS(T3a, T3d, T39);
+					     ci[WS(rs, 6)] = FMA(T3i, T3g, T3k);
+					}
+				   }
+			      }
+			 }
+			 cr[WS(rs, 6)] = FNMS(T3i, T3j, T3h);
+			 {
+			      E T2g, T2m, T2l, T2h, T2d, T29, T2c, T2b, T2e;
+			      {
+				   E T33, T2Z, T32, T31, T34;
+				   {
+					E T2v, T30, T2M, T2X, T2O, T2N, T2Y;
+					T2v = W[18];
+					T30 = FMA(KP707106781, T2L, T2A);
+					T2M = FNMS(KP707106781, T2L, T2A);
+					T33 = FMA(KP707106781, T2W, T2T);
+					T2X = FNMS(KP707106781, T2W, T2T);
+					T2O = W[19];
+					T2N = T2v * T2M;
+					T2Z = W[2];
+					T32 = W[3];
+					T2Y = T2O * T2M;
+					cr[WS(rs, 10)] = FNMS(T2O, T2X, T2N);
+					T31 = T2Z * T30;
+					T34 = T32 * T30;
+					ci[WS(rs, 10)] = FMA(T2v, T2X, T2Y);
+				   }
+				   {
+					E T1Q, T1X, T23, T26;
+					T2g = FMA(KP707106781, T1P, T1O);
+					T1Q = FNMS(KP707106781, T1P, T1O);
+					cr[WS(rs, 2)] = FNMS(T32, T33, T31);
+					ci[WS(rs, 2)] = FMA(T2Z, T33, T34);
+					T1X = T1T + T1W;
+					T2m = T1W - T1T;
+					T2l = FNMS(KP707106781, T22, T21);
+					T23 = FMA(KP707106781, T22, T21);
+					T26 = T24 - T25;
+					T2h = T24 + T25;
+					{
+					     E T1N, T2a, T1Y, T27, T20, T1Z, T28;
+					     T1N = W[20];
+					     T2a = FNMS(KP923879532, T1X, T1Q);
+					     T1Y = FMA(KP923879532, T1X, T1Q);
+					     T2d = FMA(KP923879532, T26, T23);
+					     T27 = FNMS(KP923879532, T26, T23);
+					     T20 = W[21];
+					     T1Z = T1N * T1Y;
+					     T29 = W[4];
+					     T2c = W[5];
+					     T28 = T20 * T1Y;
+					     cr[WS(rs, 11)] = FNMS(T20, T27, T1Z);
+					     T2b = T29 * T2a;
+					     T2e = T2c * T2a;
+					     ci[WS(rs, 11)] = FMA(T1N, T27, T28);
+					}
+				   }
+			      }
+			      {
+				   E T1y, T1E, T1D, T1z, T1v, T1r, T1u, T1t, T1w;
+				   {
+					E TM, T19, T1l, T1o;
+					T1y = FMA(KP707106781, TL, TA);
+					TM = FNMS(KP707106781, TL, TA);
+					cr[WS(rs, 3)] = FNMS(T2c, T2d, T2b);
+					ci[WS(rs, 3)] = FMA(T29, T2d, T2e);
+					T19 = TX - T18;
+					T1E = T18 + TX;
+					T1D = FMA(KP707106781, T1k, T1h);
+					T1l = FNMS(KP707106781, T1k, T1h);
+					T1o = T1m - T1n;
+					T1z = T1m + T1n;
+					{
+					     E Tv, T1s, T1a, T1p, T1c, T1b, T1q;
+					     Tv = W[24];
+					     T1s = FMA(KP923879532, T19, TM);
+					     T1a = FNMS(KP923879532, T19, TM);
+					     T1v = FMA(KP923879532, T1o, T1l);
+					     T1p = FNMS(KP923879532, T1o, T1l);
+					     T1c = W[25];
+					     T1b = Tv * T1a;
+					     T1r = W[8];
+					     T1u = W[9];
+					     T1q = T1c * T1a;
+					     cr[WS(rs, 13)] = FNMS(T1c, T1p, T1b);
+					     T1t = T1r * T1s;
+					     T1w = T1u * T1s;
+					     ci[WS(rs, 13)] = FMA(Tv, T1p, T1q);
+					}
+				   }
+				   {
+					E T2q, T2t, T2s, T2u, T2r;
+					cr[WS(rs, 5)] = FNMS(T1u, T1v, T1t);
+					ci[WS(rs, 5)] = FMA(T1r, T1v, T1w);
+					{
+					     E T2f, T2i, T2n, T2k, T2j, T2p, T2o;
+					     T2f = W[12];
+					     T2q = FMA(KP923879532, T2h, T2g);
+					     T2i = FNMS(KP923879532, T2h, T2g);
+					     T2t = FNMS(KP923879532, T2m, T2l);
+					     T2n = FMA(KP923879532, T2m, T2l);
+					     T2k = W[13];
+					     T2j = T2f * T2i;
+					     T2p = W[28];
+					     T2o = T2f * T2n;
+					     T2s = W[29];
+					     cr[WS(rs, 7)] = FNMS(T2k, T2n, T2j);
+					     T2u = T2p * T2t;
+					     T2r = T2p * T2q;
+					     ci[WS(rs, 7)] = FMA(T2k, T2i, T2o);
+					}
+					ci[WS(rs, 15)] = FMA(T2s, T2q, T2u);
+					cr[WS(rs, 15)] = FNMS(T2s, T2t, T2r);
+					{
+					     E T1x, T1A, T1F, T1C, T1B, T1H, T1G;
+					     T1x = W[16];
+					     T1I = FMA(KP923879532, T1z, T1y);
+					     T1A = FNMS(KP923879532, T1z, T1y);
+					     T1L = FMA(KP923879532, T1E, T1D);
+					     T1F = FNMS(KP923879532, T1E, T1D);
+					     T1C = W[17];
+					     T1B = T1x * T1A;
+					     T1H = W[0];
+					     T1G = T1x * T1F;
+					     T1K = W[1];
+					     cr[WS(rs, 9)] = FNMS(T1C, T1F, T1B);
+					     T1M = T1H * T1L;
+					     T1J = T1H * T1I;
+					     ci[WS(rs, 9)] = FMA(T1C, T1A, T1G);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 1)] = FMA(T1K, T1I, T1M);
+	       cr[WS(rs, 1)] = FNMS(T1K, T1L, T1J);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {104, 30, 70, 0} };
+
+void X(codelet_hb_16) (planner *p) {
+     X(khc2hc_register) (p, hb_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 50 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hb.h"
+
+static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
+	       E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
+	       E T2S, T30;
+	       {
+		    E T3, T1Q, T16, T1R, T6, T2i, T13, T2j;
+		    {
+			 E T1, T2, T14, T15;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 7)];
+			 T3 = T1 + T2;
+			 T1Q = T1 - T2;
+			 T14 = ci[WS(rs, 11)];
+			 T15 = cr[WS(rs, 12)];
+			 T16 = T14 - T15;
+			 T1R = T14 + T15;
+		    }
+		    {
+			 E T4, T5, T11, T12;
+			 T4 = cr[WS(rs, 4)];
+			 T5 = ci[WS(rs, 3)];
+			 T6 = T4 + T5;
+			 T2i = T4 - T5;
+			 T11 = ci[WS(rs, 15)];
+			 T12 = cr[WS(rs, 8)];
+			 T13 = T11 - T12;
+			 T2j = T11 + T12;
+		    }
+		    T7 = T3 + T6;
+		    T2K = T1Q + T1R;
+		    T2W = T2j - T2i;
+		    Tw = T3 - T6;
+		    T17 = T13 - T16;
+		    T1S = T1Q - T1R;
+		    T2k = T2i + T2j;
+		    T1w = T13 + T16;
+	       }
+	       {
+		    E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
+		    {
+			 E T8, T9, TA, TB;
+			 T8 = cr[WS(rs, 2)];
+			 T9 = ci[WS(rs, 5)];
+			 Ta = T8 + T9;
+			 T1T = T8 - T9;
+			 TA = ci[WS(rs, 13)];
+			 TB = cr[WS(rs, 10)];
+			 TC = TA - TB;
+			 T1U = TA + TB;
+		    }
+		    {
+			 E Tb, Tc, Tx, Ty;
+			 Tb = ci[WS(rs, 1)];
+			 Tc = cr[WS(rs, 6)];
+			 Td = Tb + Tc;
+			 T1W = Tb - Tc;
+			 Tx = ci[WS(rs, 9)];
+			 Ty = cr[WS(rs, 14)];
+			 Tz = Tx - Ty;
+			 T1X = Tx + Ty;
+		    }
+		    Te = Ta + Td;
+		    TD = Tz - TC;
+		    T1x = TC + Tz;
+		    T10 = Ta - Td;
+		    {
+			 E T2l, T2m, T1V, T1Y;
+			 T2l = T1T + T1U;
+			 T2m = T1W + T1X;
+			 T2n = KP707106781 * (T2l - T2m);
+			 T2L = KP707106781 * (T2l + T2m);
+			 T1V = T1T - T1U;
+			 T1Y = T1W - T1X;
+			 T1Z = KP707106781 * (T1V + T1Y);
+			 T2X = KP707106781 * (T1V - T1Y);
+		    }
+	       }
+	       {
+		    E Ti, T2b, TL, T2c, Tl, T28, TI, T29, TF, TM;
+		    {
+			 E Tg, Th, TJ, TK;
+			 Tg = cr[WS(rs, 1)];
+			 Th = ci[WS(rs, 6)];
+			 Ti = Tg + Th;
+			 T2b = Tg - Th;
+			 TJ = ci[WS(rs, 10)];
+			 TK = cr[WS(rs, 13)];
+			 TL = TJ - TK;
+			 T2c = TJ + TK;
+		    }
+		    {
+			 E Tj, Tk, TG, TH;
+			 Tj = cr[WS(rs, 5)];
+			 Tk = ci[WS(rs, 2)];
+			 Tl = Tj + Tk;
+			 T28 = Tj - Tk;
+			 TG = ci[WS(rs, 14)];
+			 TH = cr[WS(rs, 9)];
+			 TI = TG - TH;
+			 T29 = TG + TH;
+		    }
+		    Tm = Ti + Tl;
+		    T1z = TI + TL;
+		    TF = Ti - Tl;
+		    TM = TI - TL;
+		    TN = TF - TM;
+		    T19 = TF + TM;
+		    {
+			 E T2a, T2d, T2N, T2O;
+			 T2a = T28 + T29;
+			 T2d = T2b - T2c;
+			 T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
+			 T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
+			 T2N = T2b + T2c;
+			 T2O = T29 - T28;
+			 T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
+			 T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
+		    }
+	       }
+	       {
+		    E Tp, T24, TU, T25, Ts, T21, TR, T22, TO, TV;
+		    {
+			 E Tn, To, TS, TT;
+			 Tn = ci[0];
+			 To = cr[WS(rs, 7)];
+			 Tp = Tn + To;
+			 T24 = Tn - To;
+			 TS = ci[WS(rs, 12)];
+			 TT = cr[WS(rs, 11)];
+			 TU = TS - TT;
+			 T25 = TS + TT;
+		    }
+		    {
+			 E Tq, Tr, TP, TQ;
+			 Tq = cr[WS(rs, 3)];
+			 Tr = ci[WS(rs, 4)];
+			 Ts = Tq + Tr;
+			 T21 = Tq - Tr;
+			 TP = ci[WS(rs, 8)];
+			 TQ = cr[WS(rs, 15)];
+			 TR = TP - TQ;
+			 T22 = TP + TQ;
+		    }
+		    Tt = Tp + Ts;
+		    T1A = TR + TU;
+		    TO = Tp - Ts;
+		    TV = TR - TU;
+		    TW = TO + TV;
+		    T1a = TV - TO;
+		    {
+			 E T23, T26, T2Q, T2R;
+			 T23 = T21 - T22;
+			 T26 = T24 - T25;
+			 T27 = FNMS(KP382683432, T26, KP923879532 * T23);
+			 T2q = FMA(KP382683432, T23, KP923879532 * T26);
+			 T2Q = T24 + T25;
+			 T2R = T21 + T22;
+			 T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
+			 T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
+		    }
+	       }
+	       {
+		    E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
+		    Tf = T7 + Te;
+		    Tu = Tm + Tt;
+		    T1u = Tf - Tu;
+		    T1y = T1w + T1x;
+		    T1B = T1z + T1A;
+		    T1C = T1y - T1B;
+		    cr[0] = Tf + Tu;
+		    ci[0] = T1y + T1B;
+		    T1t = W[14];
+		    T1v = W[15];
+		    cr[WS(rs, 8)] = FNMS(T1v, T1C, T1t * T1u);
+		    ci[WS(rs, 8)] = FMA(T1v, T1u, T1t * T1C);
+	       }
+	       {
+		    E T2U, T34, T32, T36;
+		    {
+			 E T2M, T2T, T2Y, T31;
+			 T2M = T2K - T2L;
+			 T2T = T2P + T2S;
+			 T2U = T2M - T2T;
+			 T34 = T2M + T2T;
+			 T2Y = T2W + T2X;
+			 T31 = T2Z - T30;
+			 T32 = T2Y - T31;
+			 T36 = T2Y + T31;
+		    }
+		    {
+			 E T2J, T2V, T33, T35;
+			 T2J = W[20];
+			 T2V = W[21];
+			 cr[WS(rs, 11)] = FNMS(T2V, T32, T2J * T2U);
+			 ci[WS(rs, 11)] = FMA(T2V, T2U, T2J * T32);
+			 T33 = W[4];
+			 T35 = W[5];
+			 cr[WS(rs, 3)] = FNMS(T35, T36, T33 * T34);
+			 ci[WS(rs, 3)] = FMA(T35, T34, T33 * T36);
+		    }
+	       }
+	       {
+		    E T3a, T3g, T3e, T3i;
+		    {
+			 E T38, T39, T3c, T3d;
+			 T38 = T2K + T2L;
+			 T39 = T2Z + T30;
+			 T3a = T38 - T39;
+			 T3g = T38 + T39;
+			 T3c = T2W - T2X;
+			 T3d = T2P - T2S;
+			 T3e = T3c + T3d;
+			 T3i = T3c - T3d;
+		    }
+		    {
+			 E T37, T3b, T3f, T3h;
+			 T37 = W[12];
+			 T3b = W[13];
+			 cr[WS(rs, 7)] = FNMS(T3b, T3e, T37 * T3a);
+			 ci[WS(rs, 7)] = FMA(T37, T3e, T3b * T3a);
+			 T3f = W[28];
+			 T3h = W[29];
+			 cr[WS(rs, 15)] = FNMS(T3h, T3i, T3f * T3g);
+			 ci[WS(rs, 15)] = FMA(T3f, T3i, T3h * T3g);
+		    }
+	       }
+	       {
+		    E TY, T1e, T1c, T1g;
+		    {
+			 E TE, TX, T18, T1b;
+			 TE = Tw + TD;
+			 TX = KP707106781 * (TN + TW);
+			 TY = TE - TX;
+			 T1e = TE + TX;
+			 T18 = T10 + T17;
+			 T1b = KP707106781 * (T19 + T1a);
+			 T1c = T18 - T1b;
+			 T1g = T18 + T1b;
+		    }
+		    {
+			 E Tv, TZ, T1d, T1f;
+			 Tv = W[18];
+			 TZ = W[19];
+			 cr[WS(rs, 10)] = FNMS(TZ, T1c, Tv * TY);
+			 ci[WS(rs, 10)] = FMA(TZ, TY, Tv * T1c);
+			 T1d = W[2];
+			 T1f = W[3];
+			 cr[WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
+			 ci[WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
+		    }
+	       }
+	       {
+		    E T1k, T1q, T1o, T1s;
+		    {
+			 E T1i, T1j, T1m, T1n;
+			 T1i = Tw - TD;
+			 T1j = KP707106781 * (T1a - T19);
+			 T1k = T1i - T1j;
+			 T1q = T1i + T1j;
+			 T1m = T17 - T10;
+			 T1n = KP707106781 * (TN - TW);
+			 T1o = T1m - T1n;
+			 T1s = T1m + T1n;
+		    }
+		    {
+			 E T1h, T1l, T1p, T1r;
+			 T1h = W[26];
+			 T1l = W[27];
+			 cr[WS(rs, 14)] = FNMS(T1l, T1o, T1h * T1k);
+			 ci[WS(rs, 14)] = FMA(T1h, T1o, T1l * T1k);
+			 T1p = W[10];
+			 T1r = W[11];
+			 cr[WS(rs, 6)] = FNMS(T1r, T1s, T1p * T1q);
+			 ci[WS(rs, 6)] = FMA(T1p, T1s, T1r * T1q);
+		    }
+	       }
+	       {
+		    E T2g, T2u, T2s, T2w;
+		    {
+			 E T20, T2f, T2o, T2r;
+			 T20 = T1S - T1Z;
+			 T2f = T27 - T2e;
+			 T2g = T20 - T2f;
+			 T2u = T20 + T2f;
+			 T2o = T2k - T2n;
+			 T2r = T2p - T2q;
+			 T2s = T2o - T2r;
+			 T2w = T2o + T2r;
+		    }
+		    {
+			 E T1P, T2h, T2t, T2v;
+			 T1P = W[24];
+			 T2h = W[25];
+			 cr[WS(rs, 13)] = FNMS(T2h, T2s, T1P * T2g);
+			 ci[WS(rs, 13)] = FMA(T2h, T2g, T1P * T2s);
+			 T2t = W[8];
+			 T2v = W[9];
+			 cr[WS(rs, 5)] = FNMS(T2v, T2w, T2t * T2u);
+			 ci[WS(rs, 5)] = FMA(T2v, T2u, T2t * T2w);
+		    }
+	       }
+	       {
+		    E T2A, T2G, T2E, T2I;
+		    {
+			 E T2y, T2z, T2C, T2D;
+			 T2y = T1S + T1Z;
+			 T2z = T2p + T2q;
+			 T2A = T2y - T2z;
+			 T2G = T2y + T2z;
+			 T2C = T2k + T2n;
+			 T2D = T2e + T27;
+			 T2E = T2C - T2D;
+			 T2I = T2C + T2D;
+		    }
+		    {
+			 E T2x, T2B, T2F, T2H;
+			 T2x = W[16];
+			 T2B = W[17];
+			 cr[WS(rs, 9)] = FNMS(T2B, T2E, T2x * T2A);
+			 ci[WS(rs, 9)] = FMA(T2x, T2E, T2B * T2A);
+			 T2F = W[0];
+			 T2H = W[1];
+			 cr[WS(rs, 1)] = FNMS(T2H, T2I, T2F * T2G);
+			 ci[WS(rs, 1)] = FMA(T2F, T2I, T2H * T2G);
+		    }
+	       }
+	       {
+		    E T1G, T1M, T1K, T1O;
+		    {
+			 E T1E, T1F, T1I, T1J;
+			 T1E = T7 - Te;
+			 T1F = T1A - T1z;
+			 T1G = T1E - T1F;
+			 T1M = T1E + T1F;
+			 T1I = T1w - T1x;
+			 T1J = Tm - Tt;
+			 T1K = T1I - T1J;
+			 T1O = T1J + T1I;
+		    }
+		    {
+			 E T1D, T1H, T1L, T1N;
+			 T1D = W[22];
+			 T1H = W[23];
+			 cr[WS(rs, 12)] = FNMS(T1H, T1K, T1D * T1G);
+			 ci[WS(rs, 12)] = FMA(T1D, T1K, T1H * T1G);
+			 T1L = W[6];
+			 T1N = W[7];
+			 cr[WS(rs, 4)] = FNMS(T1N, T1O, T1L * T1M);
+			 ci[WS(rs, 4)] = FMA(T1L, T1O, T1N * T1M);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {136, 46, 38, 0} };
+
+void X(codelet_hb_16) (planner *p) {
+     X(khc2hc_register) (p, hb_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include hb.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hb.h"
+
+static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T5, T6, T9, T8, T7, Ta;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = cr[0];
+		    T2 = ci[0];
+		    T3 = ci[WS(rs, 1)];
+		    T4 = cr[WS(rs, 1)];
+		    T5 = W[0];
+		    cr[0] = T1 + T2;
+		    T6 = T1 - T2;
+		    ci[0] = T3 - T4;
+		    T9 = T3 + T4;
+		    T8 = W[1];
+		    T7 = T5 * T6;
+	       }
+	       Ta = T8 * T6;
+	       cr[WS(rs, 1)] = FNMS(T8, T9, T7);
+	       ci[WS(rs, 1)] = FMA(T5, T9, Ta);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hb_2) (planner *p) {
+     X(khc2hc_register) (p, hb_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include hb.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hb.h"
+
+static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T1, T2, T6, T3, T4, T8, T5, T7;
+	       T1 = cr[0];
+	       T2 = ci[0];
+	       T6 = T1 - T2;
+	       T3 = ci[WS(rs, 1)];
+	       T4 = cr[WS(rs, 1)];
+	       T8 = T3 + T4;
+	       cr[0] = T1 + T2;
+	       ci[0] = T3 - T4;
+	       T5 = W[0];
+	       T7 = W[1];
+	       cr[WS(rs, 1)] = FNMS(T7, T8, T5 * T6);
+	       ci[WS(rs, 1)] = FMA(T7, T6, T5 * T8);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hb_2) (planner *p) {
+     X(khc2hc_register) (p, hb_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:22 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include hb.h */
+
+/*
+ * This function contains 246 FP additions, 148 FP multiplications,
+ * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
+ * 101 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hb.h"
+
+static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T1T, T1Q, T1P;
+	       {
+		    E T2W, T4e, T7, TE, T3z, T4z, T1t, T2l, T3a, T3G, T13, T33, T3H, T1i, T2g;
+		    E T4H, T4G, T2d, T1B, T4u, T4B, T4A, T4r, T1A, T2s, T3l, T2t, T3s, T2o, T2q;
+		    E T1w, T1y, TC, T29, T3E, T3C, T4n, T4l, TN, TL;
+		    {
+			 E T4, T2U, T3, T2V, T1s, T5, T1n, T1o;
+			 {
+			      E T1, T2, T1q, T1r;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 9)];
+			      T1q = ci[WS(rs, 14)];
+			      T1r = cr[WS(rs, 15)];
+			      T4 = cr[WS(rs, 5)];
+			      T2U = T1 - T2;
+			      T3 = T1 + T2;
+			      T2V = T1q + T1r;
+			      T1s = T1q - T1r;
+			      T5 = ci[WS(rs, 4)];
+			      T1n = ci[WS(rs, 19)];
+			      T1o = cr[WS(rs, 10)];
+			 }
+			 {
+			      E T3y, T6, T3x, T1p;
+			      T2W = T2U + T2V;
+			      T4e = T2U - T2V;
+			      T3y = T4 - T5;
+			      T6 = T4 + T5;
+			      T3x = T1n + T1o;
+			      T1p = T1n - T1o;
+			      T7 = T3 + T6;
+			      TE = T3 - T6;
+			      T3z = T3x - T3y;
+			      T4z = T3y + T3x;
+			      T1t = T1p - T1s;
+			      T2l = T1p + T1s;
+			 }
+		    }
+		    {
+			 E T2Z, T4f, Te, TF, T3o, T4p, T1a, T2b, TJ, TA, T4t, T3k, T4j, T39, T2f;
+			 E T12, T32, T4g, Tl, TG, T3r, T4q, T1h, T2c, T36, T4i, Tt, TI, T3h, T4s;
+			 E TV, T2e;
+			 {
+			      E Tb, T2X, Ta, T2Y, T19, Tc, T14, T15;
+			      {
+				   E T8, T9, T17, T18;
+				   T8 = cr[WS(rs, 4)];
+				   T9 = ci[WS(rs, 5)];
+				   T17 = ci[WS(rs, 10)];
+				   T18 = cr[WS(rs, 19)];
+				   Tb = cr[WS(rs, 9)];
+				   T2X = T8 - T9;
+				   Ta = T8 + T9;
+				   T2Y = T17 + T18;
+				   T19 = T17 - T18;
+				   Tc = ci[0];
+				   T14 = ci[WS(rs, 15)];
+				   T15 = cr[WS(rs, 14)];
+			      }
+			      {
+				   E T3n, Td, T3m, T16;
+				   T2Z = T2X + T2Y;
+				   T4f = T2X - T2Y;
+				   T3n = Tb - Tc;
+				   Td = Tb + Tc;
+				   T3m = T14 + T15;
+				   T16 = T14 - T15;
+				   Te = Ta + Td;
+				   TF = Ta - Td;
+				   T3o = T3m - T3n;
+				   T4p = T3n + T3m;
+				   T1a = T16 - T19;
+				   T2b = T16 + T19;
+			      }
+			 }
+			 {
+			      E TW, T37, Tw, T3i, Tz, TX, TZ, T10;
+			      {
+				   E Tu, Tv, Tx, Ty;
+				   Tu = ci[WS(rs, 7)];
+				   Tv = cr[WS(rs, 2)];
+				   Tx = ci[WS(rs, 2)];
+				   Ty = cr[WS(rs, 7)];
+				   TW = ci[WS(rs, 17)];
+				   T37 = Tu - Tv;
+				   Tw = Tu + Tv;
+				   T3i = Tx - Ty;
+				   Tz = Tx + Ty;
+				   TX = cr[WS(rs, 12)];
+				   TZ = ci[WS(rs, 12)];
+				   T10 = cr[WS(rs, 17)];
+			      }
+			      {
+				   E TY, T38, T11, T3j;
+				   TJ = Tw - Tz;
+				   TA = Tw + Tz;
+				   T3j = TW + TX;
+				   TY = TW - TX;
+				   T38 = TZ + T10;
+				   T11 = TZ - T10;
+				   T4t = T3i - T3j;
+				   T3k = T3i + T3j;
+				   T4j = T37 + T38;
+				   T39 = T37 - T38;
+				   T2f = TY + T11;
+				   T12 = TY - T11;
+			      }
+			 }
+			 {
+			      E Ti, T30, Th, T31, T1g, Tj, T1b, T1c;
+			      {
+				   E Tf, Tg, T1e, T1f;
+				   Tf = ci[WS(rs, 3)];
+				   Tg = cr[WS(rs, 6)];
+				   T1e = ci[WS(rs, 18)];
+				   T1f = cr[WS(rs, 11)];
+				   Ti = cr[WS(rs, 1)];
+				   T30 = Tf - Tg;
+				   Th = Tf + Tg;
+				   T31 = T1e + T1f;
+				   T1g = T1e - T1f;
+				   Tj = ci[WS(rs, 8)];
+				   T1b = ci[WS(rs, 13)];
+				   T1c = cr[WS(rs, 16)];
+			      }
+			      {
+				   E T3p, Tk, T3q, T1d;
+				   T32 = T30 + T31;
+				   T4g = T30 - T31;
+				   T3p = Ti - Tj;
+				   Tk = Ti + Tj;
+				   T3q = T1b + T1c;
+				   T1d = T1b - T1c;
+				   Tl = Th + Tk;
+				   TG = Th - Tk;
+				   T3r = T3p + T3q;
+				   T4q = T3p - T3q;
+				   T1h = T1d - T1g;
+				   T2c = T1d + T1g;
+			      }
+			 }
+			 {
+			      E Tq, T34, Tp, T35, TU, Tr, TP, TQ;
+			      {
+				   E Tn, To, TS, TT;
+				   Tn = cr[WS(rs, 8)];
+				   To = ci[WS(rs, 1)];
+				   TS = ci[WS(rs, 16)];
+				   TT = cr[WS(rs, 13)];
+				   Tq = ci[WS(rs, 6)];
+				   T34 = Tn - To;
+				   Tp = Tn + To;
+				   T35 = TS + TT;
+				   TU = TS - TT;
+				   Tr = cr[WS(rs, 3)];
+				   TP = ci[WS(rs, 11)];
+				   TQ = cr[WS(rs, 18)];
+			      }
+			      {
+				   E T3g, Ts, T3f, TR;
+				   T36 = T34 - T35;
+				   T4i = T34 + T35;
+				   T3g = Tq - Tr;
+				   Ts = Tq + Tr;
+				   T3f = TP + TQ;
+				   TR = TP - TQ;
+				   Tt = Tp + Ts;
+				   TI = Tp - Ts;
+				   T3h = T3f - T3g;
+				   T4s = T3g + T3f;
+				   TV = TR - TU;
+				   T2e = TR + TU;
+			      }
+			 }
+			 {
+			      E T1v, T1u, T2n, T4k, T4h, T2m, TH, TK;
+			      T3a = T36 + T39;
+			      T3G = T36 - T39;
+			      T13 = TV - T12;
+			      T1v = TV + T12;
+			      T33 = T2Z + T32;
+			      T3H = T2Z - T32;
+			      T1i = T1a - T1h;
+			      T1u = T1a + T1h;
+			      T2n = T2e + T2f;
+			      T2g = T2e - T2f;
+			      T4H = T4i - T4j;
+			      T4k = T4i + T4j;
+			      T4h = T4f + T4g;
+			      T4G = T4f - T4g;
+			      T2d = T2b - T2c;
+			      T2m = T2b + T2c;
+			      TH = TF + TG;
+			      T1B = TF - TG;
+			      T4u = T4s - T4t;
+			      T4B = T4s + T4t;
+			      T4A = T4p + T4q;
+			      T4r = T4p - T4q;
+			      T1A = TI - TJ;
+			      TK = TI + TJ;
+			      {
+				   E Tm, T3B, TB, T3A;
+				   Tm = Te + Tl;
+				   T2s = Te - Tl;
+				   T3l = T3h + T3k;
+				   T3B = T3h - T3k;
+				   TB = Tt + TA;
+				   T2t = Tt - TA;
+				   T3s = T3o + T3r;
+				   T3A = T3o - T3r;
+				   T2o = T2m + T2n;
+				   T2q = T2m - T2n;
+				   T1w = T1u + T1v;
+				   T1y = T1u - T1v;
+				   TC = Tm + TB;
+				   T29 = Tm - TB;
+				   T3E = T3A - T3B;
+				   T3C = T3A + T3B;
+				   T4n = T4h - T4k;
+				   T4l = T4h + T4k;
+				   TN = TH - TK;
+				   TL = TH + TK;
+			      }
+			 }
+		    }
+		    {
+			 E T3d, T3b, T4E, T1x, TM, T4m, T58, T5b, T4D, T5a, T5c, T59, T4C;
+			 cr[0] = T7 + TC;
+			 T3d = T33 - T3a;
+			 T3b = T33 + T3a;
+			 T4E = T4A - T4B;
+			 T4C = T4A + T4B;
+			 ci[0] = T2l + T2o;
+			 {
+			      E T25, T22, T21, T24, T23, T26, T57;
+			      T1x = FNMS(KP250000000, T1w, T1t);
+			      T25 = T1t + T1w;
+			      T22 = TE + TL;
+			      TM = FNMS(KP250000000, TL, TE);
+			      T21 = W[18];
+			      T24 = W[19];
+			      T4m = FNMS(KP250000000, T4l, T4e);
+			      T58 = T4e + T4l;
+			      T5b = T4z + T4C;
+			      T4D = FNMS(KP250000000, T4C, T4z);
+			      T23 = T21 * T22;
+			      T26 = T24 * T22;
+			      T57 = W[8];
+			      T5a = W[9];
+			      cr[WS(rs, 10)] = FNMS(T24, T25, T23);
+			      ci[WS(rs, 10)] = FMA(T21, T25, T26);
+			      T5c = T57 * T5b;
+			      T59 = T57 * T58;
+			 }
+			 {
+			      E T3U, T3Z, T3W, T40, T3V;
+			      {
+				   E T3c, T48, T4b, T3D, T47, T4a;
+				   T3c = FNMS(KP250000000, T3b, T2W);
+				   T48 = T2W + T3b;
+				   T4b = T3z + T3C;
+				   T3D = FNMS(KP250000000, T3C, T3z);
+				   ci[WS(rs, 5)] = FMA(T5a, T58, T5c);
+				   cr[WS(rs, 5)] = FNMS(T5a, T5b, T59);
+				   T47 = W[28];
+				   T4a = W[29];
+				   {
+					E T3I, T3Y, T42, T3u, T3M, T3X, T3F;
+					{
+					     E T3T, T3t, T4c, T49, T3e, T3S;
+					     T3T = FMA(KP618033988, T3l, T3s);
+					     T3t = FNMS(KP618033988, T3s, T3l);
+					     T4c = T47 * T4b;
+					     T49 = T47 * T48;
+					     T3I = FNMS(KP618033988, T3H, T3G);
+					     T3Y = FMA(KP618033988, T3G, T3H);
+					     ci[WS(rs, 15)] = FMA(T4a, T48, T4c);
+					     cr[WS(rs, 15)] = FNMS(T4a, T4b, T49);
+					     T3e = FNMS(KP559016994, T3d, T3c);
+					     T3S = FMA(KP559016994, T3d, T3c);
+					     T42 = FMA(KP951056516, T3T, T3S);
+					     T3U = FNMS(KP951056516, T3T, T3S);
+					     T3u = FNMS(KP951056516, T3t, T3e);
+					     T3M = FMA(KP951056516, T3t, T3e);
+					     T3X = FMA(KP559016994, T3E, T3D);
+					     T3F = FNMS(KP559016994, T3E, T3D);
+					}
+					{
+					     E T3P, T45, T44, T46, T43;
+					     {
+						  E T3w, T3J, T3v, T3K, T2T, T41;
+						  T2T = W[4];
+						  T3w = W[5];
+						  T3J = FMA(KP951056516, T3I, T3F);
+						  T3P = FNMS(KP951056516, T3I, T3F);
+						  T45 = FNMS(KP951056516, T3Y, T3X);
+						  T3Z = FMA(KP951056516, T3Y, T3X);
+						  T3v = T2T * T3u;
+						  T3K = T2T * T3J;
+						  T41 = W[36];
+						  T44 = W[37];
+						  cr[WS(rs, 3)] = FNMS(T3w, T3J, T3v);
+						  ci[WS(rs, 3)] = FMA(T3w, T3u, T3K);
+						  T46 = T41 * T45;
+						  T43 = T41 * T42;
+					     }
+					     {
+						  E T3O, T3Q, T3N, T3L, T3R;
+						  T3L = W[12];
+						  T3O = W[13];
+						  ci[WS(rs, 19)] = FMA(T44, T42, T46);
+						  cr[WS(rs, 19)] = FNMS(T44, T45, T43);
+						  T3Q = T3L * T3P;
+						  T3N = T3L * T3M;
+						  T3R = W[20];
+						  T3W = W[21];
+						  ci[WS(rs, 7)] = FMA(T3O, T3M, T3Q);
+						  cr[WS(rs, 7)] = FNMS(T3O, T3P, T3N);
+						  T40 = T3R * T3Z;
+						  T3V = T3R * T3U;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T4U, T4Z, T4W, T50, T4V, T2L, T2I, T2H;
+				   {
+					E T4T, T4v, T4I, T4Y, T4o, T4S;
+					T4T = FNMS(KP618033988, T4r, T4u);
+					T4v = FMA(KP618033988, T4u, T4r);
+					ci[WS(rs, 11)] = FMA(T3W, T3U, T40);
+					cr[WS(rs, 11)] = FNMS(T3W, T3Z, T3V);
+					T4I = FMA(KP618033988, T4H, T4G);
+					T4Y = FNMS(KP618033988, T4G, T4H);
+					T4o = FMA(KP559016994, T4n, T4m);
+					T4S = FNMS(KP559016994, T4n, T4m);
+					{
+					     E T52, T4M, T55, T4P, T54, T56, T53;
+					     {
+						  E T4d, T4w, T4J, T4x, T4y, T4X, T4F, T51, T4K;
+						  T4d = W[0];
+						  T4X = FNMS(KP559016994, T4E, T4D);
+						  T4F = FMA(KP559016994, T4E, T4D);
+						  T4U = FNMS(KP951056516, T4T, T4S);
+						  T52 = FMA(KP951056516, T4T, T4S);
+						  T4M = FMA(KP951056516, T4v, T4o);
+						  T4w = FNMS(KP951056516, T4v, T4o);
+						  T4Z = FMA(KP951056516, T4Y, T4X);
+						  T55 = FNMS(KP951056516, T4Y, T4X);
+						  T4P = FNMS(KP951056516, T4I, T4F);
+						  T4J = FMA(KP951056516, T4I, T4F);
+						  T4x = T4d * T4w;
+						  T4y = W[1];
+						  T51 = W[32];
+						  T4K = T4d * T4J;
+						  T54 = W[33];
+						  cr[WS(rs, 1)] = FNMS(T4y, T4J, T4x);
+						  T56 = T51 * T55;
+						  T53 = T51 * T52;
+						  ci[WS(rs, 1)] = FMA(T4y, T4w, T4K);
+					     }
+					     {
+						  E T4O, T4Q, T4N, T4L, T4R;
+						  T4L = W[16];
+						  ci[WS(rs, 17)] = FMA(T54, T52, T56);
+						  cr[WS(rs, 17)] = FNMS(T54, T55, T53);
+						  T4O = W[17];
+						  T4Q = T4L * T4P;
+						  T4N = T4L * T4M;
+						  T4R = W[24];
+						  T4W = W[25];
+						  ci[WS(rs, 9)] = FMA(T4O, T4M, T4Q);
+						  cr[WS(rs, 9)] = FNMS(T4O, T4P, T4N);
+						  T50 = T4R * T4Z;
+						  T4V = T4R * T4U;
+					     }
+					}
+				   }
+				   {
+					E T2K, T2u, T2F, T2h, T28, T2J, T2r, T2p;
+					T2K = FNMS(KP618033988, T2s, T2t);
+					T2u = FMA(KP618033988, T2t, T2s);
+					ci[WS(rs, 13)] = FMA(T4W, T4U, T50);
+					cr[WS(rs, 13)] = FNMS(T4W, T4Z, T4V);
+					T2p = FNMS(KP250000000, T2o, T2l);
+					T2F = FNMS(KP618033988, T2d, T2g);
+					T2h = FMA(KP618033988, T2g, T2d);
+					T28 = FNMS(KP250000000, TC, T7);
+					T2J = FNMS(KP559016994, T2q, T2p);
+					T2r = FMA(KP559016994, T2q, T2p);
+					{
+					     E T2B, T2G, T2y, T2R, T2Q, T2P, T2A, T2x;
+					     {
+						  E T2k, T2v, T27, T2O, T2i, T2a, T2E;
+						  T2k = W[7];
+						  T2a = FMA(KP559016994, T29, T28);
+						  T2E = FNMS(KP559016994, T29, T28);
+						  T2B = FMA(KP951056516, T2u, T2r);
+						  T2v = FNMS(KP951056516, T2u, T2r);
+						  T27 = W[6];
+						  T2O = FMA(KP951056516, T2F, T2E);
+						  T2G = FNMS(KP951056516, T2F, T2E);
+						  T2i = FMA(KP951056516, T2h, T2a);
+						  T2y = FNMS(KP951056516, T2h, T2a);
+						  {
+						       E T2N, T2j, T2w, T2S;
+						       T2L = FMA(KP951056516, T2K, T2J);
+						       T2R = FNMS(KP951056516, T2K, T2J);
+						       T2Q = W[23];
+						       T2N = W[22];
+						       T2j = T27 * T2i;
+						       T2w = T2k * T2i;
+						       T2S = T2Q * T2O;
+						       T2P = T2N * T2O;
+						       cr[WS(rs, 4)] = FNMS(T2k, T2v, T2j);
+						       ci[WS(rs, 4)] = FMA(T27, T2v, T2w);
+						       ci[WS(rs, 12)] = FMA(T2N, T2R, T2S);
+						  }
+					     }
+					     cr[WS(rs, 12)] = FNMS(T2Q, T2R, T2P);
+					     T2A = W[31];
+					     T2x = W[30];
+					     {
+						  E T2D, T2M, T2C, T2z;
+						  T2I = W[15];
+						  T2C = T2A * T2y;
+						  T2z = T2x * T2y;
+						  T2D = W[14];
+						  T2M = T2I * T2G;
+						  ci[WS(rs, 16)] = FMA(T2x, T2B, T2C);
+						  cr[WS(rs, 16)] = FNMS(T2A, T2B, T2z);
+						  T2H = T2D * T2G;
+						  ci[WS(rs, 8)] = FMA(T2D, T2L, T2M);
+					     }
+					}
+				   }
+				   {
+					E T1S, T1C, T1j, T1N, T1z, T1R;
+					T1S = FMA(KP618033988, T1A, T1B);
+					T1C = FNMS(KP618033988, T1B, T1A);
+					cr[WS(rs, 8)] = FNMS(T2I, T2L, T2H);
+					T1j = FNMS(KP618033988, T1i, T13);
+					T1N = FMA(KP618033988, T13, T1i);
+					T1z = FNMS(KP559016994, T1y, T1x);
+					T1R = FMA(KP559016994, T1y, T1x);
+					{
+					     E T1J, T1O, T1G, T1Z, T1Y, T1X, T1I, T1F;
+					     {
+						  E T1m, T1D, TD, T1W, T1k, T1M, TO;
+						  T1m = W[3];
+						  T1M = FMA(KP559016994, TN, TM);
+						  TO = FNMS(KP559016994, TN, TM);
+						  T1D = FNMS(KP951056516, T1C, T1z);
+						  T1J = FMA(KP951056516, T1C, T1z);
+						  TD = W[2];
+						  T1O = FNMS(KP951056516, T1N, T1M);
+						  T1W = FMA(KP951056516, T1N, T1M);
+						  T1G = FNMS(KP951056516, T1j, TO);
+						  T1k = FMA(KP951056516, T1j, TO);
+						  {
+						       E T1V, T1l, T1E, T20;
+						       T1Z = FNMS(KP951056516, T1S, T1R);
+						       T1T = FMA(KP951056516, T1S, T1R);
+						       T1Y = W[27];
+						       T1V = W[26];
+						       T1l = TD * T1k;
+						       T1E = T1m * T1k;
+						       T20 = T1Y * T1W;
+						       T1X = T1V * T1W;
+						       cr[WS(rs, 2)] = FNMS(T1m, T1D, T1l);
+						       ci[WS(rs, 2)] = FMA(TD, T1D, T1E);
+						       ci[WS(rs, 14)] = FMA(T1V, T1Z, T20);
+						  }
+					     }
+					     cr[WS(rs, 14)] = FNMS(T1Y, T1Z, T1X);
+					     T1I = W[35];
+					     T1F = W[34];
+					     {
+						  E T1L, T1U, T1K, T1H;
+						  T1Q = W[11];
+						  T1K = T1I * T1G;
+						  T1H = T1F * T1G;
+						  T1L = W[10];
+						  T1U = T1Q * T1O;
+						  ci[WS(rs, 18)] = FMA(T1F, T1J, T1K);
+						  cr[WS(rs, 18)] = FNMS(T1I, T1J, T1H);
+						  T1P = T1L * T1O;
+						  ci[WS(rs, 6)] = FMA(T1L, T1T, T1U);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 6)] = FNMS(T1Q, T1T, T1P);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {136, 38, 110, 0} };
+
+void X(codelet_hb_20) (planner *p) {
+     X(khc2hc_register) (p, hb_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include hb.h */
+
+/*
+ * This function contains 246 FP additions, 124 FP multiplications,
+ * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
+ * 97 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hb.h"
+
+static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d;
+	       E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC;
+	       E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j;
+	       E TH, TK, TL;
+	       {
+		    E T3, T2R, T1u, T2S, T6, T3f, T1r, T3e;
+		    {
+			 E T1, T2, T1s, T1t;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 9)];
+			 T3 = T1 + T2;
+			 T2R = T1 - T2;
+			 T1s = ci[WS(rs, 14)];
+			 T1t = cr[WS(rs, 15)];
+			 T1u = T1s - T1t;
+			 T2S = T1s + T1t;
+		    }
+		    {
+			 E T4, T5, T1p, T1q;
+			 T4 = cr[WS(rs, 5)];
+			 T5 = ci[WS(rs, 4)];
+			 T6 = T4 + T5;
+			 T3f = T4 - T5;
+			 T1p = ci[WS(rs, 19)];
+			 T1q = cr[WS(rs, 10)];
+			 T1r = T1p - T1q;
+			 T3e = T1p + T1q;
+		    }
+		    T7 = T3 + T6;
+		    T3T = T2R - T2S;
+		    T49 = T3f + T3e;
+		    TE = T3 - T6;
+		    T1v = T1r - T1u;
+		    T2T = T2R + T2S;
+		    T3g = T3e - T3f;
+		    T2d = T1r + T1u;
+	       }
+	       {
+		    E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36;
+		    E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L;
+		    E T33, T24;
+		    {
+			 E Ta, T2U, TU, T2V, Td, T2D, TR, T2C;
+			 {
+			      E T8, T9, TS, TT;
+			      T8 = cr[WS(rs, 4)];
+			      T9 = ci[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T2U = T8 - T9;
+			      TS = ci[WS(rs, 10)];
+			      TT = cr[WS(rs, 19)];
+			      TU = TS - TT;
+			      T2V = TS + TT;
+			 }
+			 {
+			      E Tb, Tc, TP, TQ;
+			      Tb = cr[WS(rs, 9)];
+			      Tc = ci[0];
+			      Td = Tb + Tc;
+			      T2D = Tb - Tc;
+			      TP = ci[WS(rs, 15)];
+			      TQ = cr[WS(rs, 14)];
+			      TR = TP - TQ;
+			      T2C = TP + TQ;
+			 }
+			 Te = Ta + Td;
+			 T3M = T2U - T2V;
+			 T3X = T2D + T2C;
+			 TF = Ta - Td;
+			 TV = TR - TU;
+			 T2E = T2C - T2D;
+			 T2W = T2U + T2V;
+			 T21 = TR + TU;
+		    }
+		    {
+			 E Tw, T34, Tz, T2M, T1d, T2N, T1g, T35;
+			 {
+			      E Tu, Tv, Tx, Ty;
+			      Tu = ci[WS(rs, 7)];
+			      Tv = cr[WS(rs, 2)];
+			      Tw = Tu + Tv;
+			      T34 = Tu - Tv;
+			      Tx = ci[WS(rs, 2)];
+			      Ty = cr[WS(rs, 7)];
+			      Tz = Tx + Ty;
+			      T2M = Tx - Ty;
+			 }
+			 {
+			      E T1b, T1c, T1e, T1f;
+			      T1b = ci[WS(rs, 17)];
+			      T1c = cr[WS(rs, 12)];
+			      T1d = T1b - T1c;
+			      T2N = T1b + T1c;
+			      T1e = ci[WS(rs, 12)];
+			      T1f = cr[WS(rs, 17)];
+			      T1g = T1e - T1f;
+			      T35 = T1e + T1f;
+			 }
+			 TA = Tw + Tz;
+			 T3Q = T34 + T35;
+			 T41 = T2M - T2N;
+			 TJ = Tw - Tz;
+			 T1h = T1d - T1g;
+			 T2O = T2M + T2N;
+			 T36 = T34 - T35;
+			 T25 = T1d + T1g;
+		    }
+		    {
+			 E Th, T2X, T11, T2Y, Tk, T2F, TY, T2G;
+			 {
+			      E Tf, Tg, TZ, T10;
+			      Tf = ci[WS(rs, 3)];
+			      Tg = cr[WS(rs, 6)];
+			      Th = Tf + Tg;
+			      T2X = Tf - Tg;
+			      TZ = ci[WS(rs, 18)];
+			      T10 = cr[WS(rs, 11)];
+			      T11 = TZ - T10;
+			      T2Y = TZ + T10;
+			 }
+			 {
+			      E Ti, Tj, TW, TX;
+			      Ti = cr[WS(rs, 1)];
+			      Tj = ci[WS(rs, 8)];
+			      Tk = Ti + Tj;
+			      T2F = Ti - Tj;
+			      TW = ci[WS(rs, 13)];
+			      TX = cr[WS(rs, 16)];
+			      TY = TW - TX;
+			      T2G = TW + TX;
+			 }
+			 Tl = Th + Tk;
+			 T3N = T2X - T2Y;
+			 T3Y = T2F - T2G;
+			 TG = Th - Tk;
+			 T12 = TY - T11;
+			 T2H = T2F + T2G;
+			 T2Z = T2X + T2Y;
+			 T22 = TY + T11;
+		    }
+		    {
+			 E Tp, T31, T19, T32, Ts, T2K, T16, T2J;
+			 {
+			      E Tn, To, T17, T18;
+			      Tn = cr[WS(rs, 8)];
+			      To = ci[WS(rs, 1)];
+			      Tp = Tn + To;
+			      T31 = Tn - To;
+			      T17 = ci[WS(rs, 16)];
+			      T18 = cr[WS(rs, 13)];
+			      T19 = T17 - T18;
+			      T32 = T17 + T18;
+			 }
+			 {
+			      E Tq, Tr, T14, T15;
+			      Tq = ci[WS(rs, 6)];
+			      Tr = cr[WS(rs, 3)];
+			      Ts = Tq + Tr;
+			      T2K = Tq - Tr;
+			      T14 = ci[WS(rs, 11)];
+			      T15 = cr[WS(rs, 18)];
+			      T16 = T14 - T15;
+			      T2J = T14 + T15;
+			 }
+			 Tt = Tp + Ts;
+			 T3P = T31 + T32;
+			 T40 = T2K + T2J;
+			 TI = Tp - Ts;
+			 T1a = T16 - T19;
+			 T2L = T2J - T2K;
+			 T33 = T31 - T32;
+			 T24 = T16 + T19;
+		    }
+		    T13 = TV - T12;
+		    T3n = T2W - T2Z;
+		    T3o = T33 - T36;
+		    T1i = T1a - T1h;
+		    T26 = T24 - T25;
+		    T4e = T3P - T3Q;
+		    T4d = T3M - T3N;
+		    T23 = T21 - T22;
+		    T1n = TI - TJ;
+		    T42 = T40 - T41;
+		    T3Z = T3X - T3Y;
+		    T1m = TF - TG;
+		    T2h = Te - Tl;
+		    T2I = T2E + T2H;
+		    T2i = Tt - TA;
+		    T2P = T2L + T2O;
+		    T30 = T2W + T2Z;
+		    T37 = T33 + T36;
+		    T38 = T30 + T37;
+		    Tm = Te + Tl;
+		    TB = Tt + TA;
+		    TC = Tm + TB;
+		    T46 = T3X + T3Y;
+		    T47 = T40 + T41;
+		    T4a = T46 + T47;
+		    T2a = T21 + T22;
+		    T2b = T24 + T25;
+		    T2e = T2a + T2b;
+		    T1w = TV + T12;
+		    T1x = T1a + T1h;
+		    T1y = T1w + T1x;
+		    T3O = T3M + T3N;
+		    T3R = T3P + T3Q;
+		    T3U = T3O + T3R;
+		    T3h = T2E - T2H;
+		    T3i = T2L - T2O;
+		    T3j = T3h + T3i;
+		    TH = TF + TG;
+		    TK = TI + TJ;
+		    TL = TH + TK;
+	       }
+	       cr[0] = T7 + TC;
+	       ci[0] = T2d + T2e;
+	       {
+		    E T1U, T1W, T1T, T1V;
+		    T1U = TE + TL;
+		    T1W = T1v + T1y;
+		    T1T = W[18];
+		    T1V = W[19];
+		    cr[WS(rs, 10)] = FNMS(T1V, T1W, T1T * T1U);
+		    ci[WS(rs, 10)] = FMA(T1V, T1U, T1T * T1W);
+	       }
+	       {
+		    E T4y, T4A, T4x, T4z;
+		    T4y = T3T + T3U;
+		    T4A = T49 + T4a;
+		    T4x = W[8];
+		    T4z = W[9];
+		    cr[WS(rs, 5)] = FNMS(T4z, T4A, T4x * T4y);
+		    ci[WS(rs, 5)] = FMA(T4x, T4A, T4z * T4y);
+	       }
+	       {
+		    E T3I, T3K, T3H, T3J;
+		    T3I = T2T + T38;
+		    T3K = T3g + T3j;
+		    T3H = W[28];
+		    T3J = W[29];
+		    cr[WS(rs, 15)] = FNMS(T3J, T3K, T3H * T3I);
+		    ci[WS(rs, 15)] = FMA(T3H, T3K, T3J * T3I);
+	       }
+	       {
+		    E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q;
+		    T27 = FMA(KP951056516, T23, KP587785252 * T26);
+		    T2j = FMA(KP951056516, T2h, KP587785252 * T2i);
+		    T2v = FNMS(KP951056516, T2i, KP587785252 * T2h);
+		    T2r = FNMS(KP951056516, T26, KP587785252 * T23);
+		    {
+			 E T2c, T2f, T1Y, T1Z;
+			 T2c = KP559016994 * (T2a - T2b);
+			 T2f = FNMS(KP250000000, T2e, T2d);
+			 T2g = T2c + T2f;
+			 T2u = T2f - T2c;
+			 T1Y = KP559016994 * (Tm - TB);
+			 T1Z = FNMS(KP250000000, TC, T7);
+			 T20 = T1Y + T1Z;
+			 T2q = T1Z - T1Y;
+		    }
+		    {
+			 E T28, T2k, T1X, T29;
+			 T28 = T20 + T27;
+			 T2k = T2g - T2j;
+			 T1X = W[6];
+			 T29 = W[7];
+			 cr[WS(rs, 4)] = FNMS(T29, T2k, T1X * T28);
+			 ci[WS(rs, 4)] = FMA(T29, T28, T1X * T2k);
+		    }
+		    {
+			 E T2y, T2A, T2x, T2z;
+			 T2y = T2q - T2r;
+			 T2A = T2v + T2u;
+			 T2x = W[22];
+			 T2z = W[23];
+			 cr[WS(rs, 12)] = FNMS(T2z, T2A, T2x * T2y);
+			 ci[WS(rs, 12)] = FMA(T2z, T2y, T2x * T2A);
+		    }
+		    {
+			 E T2m, T2o, T2l, T2n;
+			 T2m = T20 - T27;
+			 T2o = T2j + T2g;
+			 T2l = W[30];
+			 T2n = W[31];
+			 cr[WS(rs, 16)] = FNMS(T2n, T2o, T2l * T2m);
+			 ci[WS(rs, 16)] = FMA(T2n, T2m, T2l * T2o);
+		    }
+		    {
+			 E T2s, T2w, T2p, T2t;
+			 T2s = T2q + T2r;
+			 T2w = T2u - T2v;
+			 T2p = W[14];
+			 T2t = W[15];
+			 cr[WS(rs, 8)] = FNMS(T2t, T2w, T2p * T2s);
+			 ci[WS(rs, 8)] = FMA(T2t, T2s, T2p * T2w);
+		    }
+	       }
+	       {
+		    E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n;
+		    T43 = FMA(KP951056516, T3Z, KP587785252 * T42);
+		    T4f = FMA(KP951056516, T4d, KP587785252 * T4e);
+		    T4r = FNMS(KP951056516, T4e, KP587785252 * T4d);
+		    T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
+		    {
+			 E T48, T4b, T3S, T3V;
+			 T48 = KP559016994 * (T46 - T47);
+			 T4b = FNMS(KP250000000, T4a, T49);
+			 T4c = T48 + T4b;
+			 T4q = T4b - T48;
+			 T3S = KP559016994 * (T3O - T3R);
+			 T3V = FNMS(KP250000000, T3U, T3T);
+			 T3W = T3S + T3V;
+			 T4n = T3V - T3S;
+		    }
+		    {
+			 E T44, T4g, T3L, T45;
+			 T44 = T3W - T43;
+			 T4g = T4c + T4f;
+			 T3L = W[0];
+			 T45 = W[1];
+			 cr[WS(rs, 1)] = FNMS(T45, T4g, T3L * T44);
+			 ci[WS(rs, 1)] = FMA(T3L, T4g, T45 * T44);
+		    }
+		    {
+			 E T4u, T4w, T4t, T4v;
+			 T4u = T4n - T4m;
+			 T4w = T4q + T4r;
+			 T4t = W[32];
+			 T4v = W[33];
+			 cr[WS(rs, 17)] = FNMS(T4v, T4w, T4t * T4u);
+			 ci[WS(rs, 17)] = FMA(T4t, T4w, T4v * T4u);
+		    }
+		    {
+			 E T4i, T4k, T4h, T4j;
+			 T4i = T43 + T3W;
+			 T4k = T4c - T4f;
+			 T4h = W[16];
+			 T4j = W[17];
+			 cr[WS(rs, 9)] = FNMS(T4j, T4k, T4h * T4i);
+			 ci[WS(rs, 9)] = FMA(T4h, T4k, T4j * T4i);
+		    }
+		    {
+			 E T4o, T4s, T4l, T4p;
+			 T4o = T4m + T4n;
+			 T4s = T4q - T4r;
+			 T4l = W[24];
+			 T4p = W[25];
+			 cr[WS(rs, 13)] = FNMS(T4p, T4s, T4l * T4o);
+			 ci[WS(rs, 13)] = FMA(T4l, T4s, T4p * T4o);
+		    }
+	       }
+	       {
+		    E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I;
+		    T1j = FNMS(KP951056516, T1i, KP587785252 * T13);
+		    T1o = FNMS(KP951056516, T1n, KP587785252 * T1m);
+		    T1M = FMA(KP951056516, T1m, KP587785252 * T1n);
+		    T1J = FMA(KP951056516, T13, KP587785252 * T1i);
+		    {
+			 E T1z, T1A, TM, TN;
+			 T1z = FNMS(KP250000000, T1y, T1v);
+			 T1A = KP559016994 * (T1w - T1x);
+			 T1B = T1z - T1A;
+			 T1N = T1A + T1z;
+			 TM = FNMS(KP250000000, TL, TE);
+			 TN = KP559016994 * (TH - TK);
+			 TO = TM - TN;
+			 T1I = TN + TM;
+		    }
+		    {
+			 E T1k, T1C, TD, T1l;
+			 T1k = TO - T1j;
+			 T1C = T1o + T1B;
+			 TD = W[2];
+			 T1l = W[3];
+			 cr[WS(rs, 2)] = FNMS(T1l, T1C, TD * T1k);
+			 ci[WS(rs, 2)] = FMA(T1l, T1k, TD * T1C);
+		    }
+		    {
+			 E T1Q, T1S, T1P, T1R;
+			 T1Q = T1I + T1J;
+			 T1S = T1N - T1M;
+			 T1P = W[26];
+			 T1R = W[27];
+			 cr[WS(rs, 14)] = FNMS(T1R, T1S, T1P * T1Q);
+			 ci[WS(rs, 14)] = FMA(T1R, T1Q, T1P * T1S);
+		    }
+		    {
+			 E T1E, T1G, T1D, T1F;
+			 T1E = TO + T1j;
+			 T1G = T1B - T1o;
+			 T1D = W[34];
+			 T1F = W[35];
+			 cr[WS(rs, 18)] = FNMS(T1F, T1G, T1D * T1E);
+			 ci[WS(rs, 18)] = FMA(T1F, T1E, T1D * T1G);
+		    }
+		    {
+			 E T1K, T1O, T1H, T1L;
+			 T1K = T1I - T1J;
+			 T1O = T1M + T1N;
+			 T1H = W[10];
+			 T1L = W[11];
+			 cr[WS(rs, 6)] = FNMS(T1L, T1O, T1H * T1K);
+			 ci[WS(rs, 6)] = FMA(T1L, T1K, T1H * T1O);
+		    }
+	       }
+	       {
+		    E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w;
+		    T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I);
+		    T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
+		    T3B = FMA(KP951056516, T3n, KP587785252 * T3o);
+		    T3x = FMA(KP951056516, T2I, KP587785252 * T2P);
+		    {
+			 E T3k, T3l, T39, T3a;
+			 T3k = FNMS(KP250000000, T3j, T3g);
+			 T3l = KP559016994 * (T3h - T3i);
+			 T3m = T3k - T3l;
+			 T3A = T3l + T3k;
+			 T39 = FNMS(KP250000000, T38, T2T);
+			 T3a = KP559016994 * (T30 - T37);
+			 T3b = T39 - T3a;
+			 T3w = T3a + T39;
+		    }
+		    {
+			 E T3c, T3q, T2B, T3d;
+			 T3c = T2Q + T3b;
+			 T3q = T3m - T3p;
+			 T2B = W[4];
+			 T3d = W[5];
+			 cr[WS(rs, 3)] = FNMS(T3d, T3q, T2B * T3c);
+			 ci[WS(rs, 3)] = FMA(T2B, T3q, T3d * T3c);
+		    }
+		    {
+			 E T3E, T3G, T3D, T3F;
+			 T3E = T3x + T3w;
+			 T3G = T3A - T3B;
+			 T3D = W[36];
+			 T3F = W[37];
+			 cr[WS(rs, 19)] = FNMS(T3F, T3G, T3D * T3E);
+			 ci[WS(rs, 19)] = FMA(T3D, T3G, T3F * T3E);
+		    }
+		    {
+			 E T3s, T3u, T3r, T3t;
+			 T3s = T3b - T2Q;
+			 T3u = T3m + T3p;
+			 T3r = W[12];
+			 T3t = W[13];
+			 cr[WS(rs, 7)] = FNMS(T3t, T3u, T3r * T3s);
+			 ci[WS(rs, 7)] = FMA(T3r, T3u, T3t * T3s);
+		    }
+		    {
+			 E T3y, T3C, T3v, T3z;
+			 T3y = T3w - T3x;
+			 T3C = T3A + T3B;
+			 T3v = W[20];
+			 T3z = W[21];
+			 cr[WS(rs, 11)] = FNMS(T3z, T3C, T3v * T3y);
+			 ci[WS(rs, 11)] = FMA(T3v, T3C, T3z * T3y);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {184, 62, 62, 0} };
+
+void X(codelet_hb_20) (planner *p) {
+     X(khc2hc_register) (p, hb_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1626 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:23 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -dif -name hb_25 -include hb.h */
+
+/*
+ * This function contains 400 FP additions, 364 FP multiplications,
+ * (or, 84 additions, 48 multiplications, 316 fused multiply/add),
+ * 176 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "hb.h"
+
+static void hb_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 48); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T3w, T3P, T2d, T3y, T3x, T3Q;
+	       {
+		    E T9, T3E, T1F, T3B, T6f, T7d, T5u, T6U, T4k, T2k, T5G, T1G, T19, T1H, T1s;
+		    E T1M, T1N, TP, TM, T7i, T77, T5X, T64, T4u, T4D, T3p, T2z, T74, T7h, T63;
+		    E T5Q, T4x, T4E, T3q, T2O, T4n, T4G, T3t, T3j, T5F, T70, T7f, T66, T5B, T4q;
+		    E T4H, T3s, T34, T5E, T6V;
+		    {
+			 E T2f, T2e, T6e, T2j, T5t, T6d;
+			 {
+			      E T1, T1x, T3C, T3D, T8, T2h, T1A, T1D, T2i, T3A, T1E, T3z;
+			      T1 = cr[0];
+			      T1x = ci[WS(rs, 24)];
+			      {
+				   E T2, T3, T5, T6;
+				   T2 = cr[WS(rs, 5)];
+				   T3 = ci[WS(rs, 4)];
+				   T5 = cr[WS(rs, 10)];
+				   T6 = ci[WS(rs, 9)];
+				   {
+					E T1y, T4, T7, T1z, T1B, T1C;
+					T1y = ci[WS(rs, 19)];
+					T3C = T2 - T3;
+					T4 = T2 + T3;
+					T3D = T5 - T6;
+					T7 = T5 + T6;
+					T1z = cr[WS(rs, 20)];
+					T1B = ci[WS(rs, 14)];
+					T1C = cr[WS(rs, 15)];
+					T8 = T4 + T7;
+					T2f = T4 - T7;
+					T2h = T1y + T1z;
+					T1A = T1y - T1z;
+					T1D = T1B - T1C;
+					T2i = T1B + T1C;
+				   }
+			      }
+			      T2e = FNMS(KP250000000, T8, T1);
+			      T9 = T1 + T8;
+			      T3A = T1A - T1D;
+			      T1E = T1A + T1D;
+			      T3E = FMA(KP618033988, T3D, T3C);
+			      T6e = FNMS(KP618033988, T3C, T3D);
+			      T2j = FMA(KP618033988, T2i, T2h);
+			      T5t = FNMS(KP618033988, T2h, T2i);
+			      T1F = T1x + T1E;
+			      T3z = FNMS(KP250000000, T1E, T1x);
+			      T6d = FNMS(KP559016994, T3A, T3z);
+			      T3B = FMA(KP559016994, T3A, T3z);
+			 }
+			 {
+			      E T2x, T5V, T2m, T2l, Ti, T5w, T3h, T36, TK, T35, T2F, T5L, T2I, Tr, T2H;
+			      E T3a, T5z, T3d, T1r, T3c, T2q, T5S, T2t, TZ, T2s, T5O, T2M, T2B, Tt, T18;
+			      E T2A, Tx, T2V, T2Y, Tw, T30, T1i, T2X, Ty;
+			      {
+				   E T1j, T1k, T1p, T39, T1l;
+				   {
+					E TC, TI, T3g, TD, TE;
+					{
+					     E Ta, Te, Tf, Tb, Tc, T5s, T2g, T2w, Tg;
+					     Ta = cr[WS(rs, 1)];
+					     T5s = FNMS(KP559016994, T2f, T2e);
+					     T2g = FMA(KP559016994, T2f, T2e);
+					     T6f = FNMS(KP951056516, T6e, T6d);
+					     T7d = FMA(KP951056516, T6e, T6d);
+					     Te = cr[WS(rs, 11)];
+					     T5u = FMA(KP951056516, T5t, T5s);
+					     T6U = FNMS(KP951056516, T5t, T5s);
+					     T4k = FMA(KP951056516, T2j, T2g);
+					     T2k = FNMS(KP951056516, T2j, T2g);
+					     Tf = ci[WS(rs, 8)];
+					     Tb = cr[WS(rs, 6)];
+					     Tc = ci[WS(rs, 3)];
+					     TC = cr[WS(rs, 3)];
+					     T2w = Tf - Te;
+					     Tg = Te + Tf;
+					     {
+						  E T2v, Td, Th, TG, TH;
+						  T2v = Tb - Tc;
+						  Td = Tb + Tc;
+						  TG = ci[WS(rs, 11)];
+						  TH = ci[WS(rs, 6)];
+						  T2x = FNMS(KP618033988, T2w, T2v);
+						  T5V = FMA(KP618033988, T2v, T2w);
+						  Th = Td + Tg;
+						  T2m = Td - Tg;
+						  TI = TG + TH;
+						  T3g = TG - TH;
+						  T2l = FNMS(KP250000000, Th, Ta);
+						  Ti = Ta + Th;
+						  TD = cr[WS(rs, 8)];
+						  TE = ci[WS(rs, 1)];
+					     }
+					}
+					{
+					     E Tj, Tk, Tp, T2E, TJ, Tl;
+					     Tj = cr[WS(rs, 4)];
+					     {
+						  E Tn, To, T3f, TF;
+						  Tn = ci[WS(rs, 10)];
+						  To = ci[WS(rs, 5)];
+						  T3f = TD - TE;
+						  TF = TD + TE;
+						  Tk = cr[WS(rs, 9)];
+						  Tp = Tn + To;
+						  T2E = To - Tn;
+						  T5w = FNMS(KP618033988, T3f, T3g);
+						  T3h = FMA(KP618033988, T3g, T3f);
+						  T36 = TI - TF;
+						  TJ = TF + TI;
+						  Tl = ci[0];
+					     }
+					     T1j = ci[WS(rs, 21)];
+					     TK = TC + TJ;
+					     T35 = FNMS(KP250000000, TJ, TC);
+					     {
+						  E T1n, Tm, T2D, T1o, Tq;
+						  T1n = cr[WS(rs, 13)];
+						  Tm = Tk + Tl;
+						  T2D = Tl - Tk;
+						  T1o = cr[WS(rs, 18)];
+						  T1k = ci[WS(rs, 16)];
+						  T2F = FMA(KP618033988, T2E, T2D);
+						  T5L = FNMS(KP618033988, T2D, T2E);
+						  T2I = Tm - Tp;
+						  Tq = Tm + Tp;
+						  T1p = T1n + T1o;
+						  T39 = T1o - T1n;
+						  Tr = Tj + Tq;
+						  T2H = FMS(KP250000000, Tq, Tj);
+						  T1l = cr[WS(rs, 23)];
+					     }
+					}
+				   }
+				   {
+					E T10, T11, T16, T2L, T12;
+					{
+					     E TR, TS, TX, T2p, T1q, TT;
+					     TR = ci[WS(rs, 23)];
+					     {
+						  E TV, TW, T38, T1m;
+						  TV = ci[WS(rs, 13)];
+						  TW = cr[WS(rs, 16)];
+						  T38 = T1k + T1l;
+						  T1m = T1k - T1l;
+						  TS = ci[WS(rs, 18)];
+						  TX = TV - TW;
+						  T2p = TV + TW;
+						  T3a = FMA(KP618033988, T39, T38);
+						  T5z = FNMS(KP618033988, T38, T39);
+						  T3d = T1m + T1p;
+						  T1q = T1m - T1p;
+						  TT = cr[WS(rs, 21)];
+					     }
+					     T10 = ci[WS(rs, 20)];
+					     T1r = T1j + T1q;
+					     T3c = FMS(KP250000000, T1q, T1j);
+					     {
+						  E T14, TU, T2o, T15, TY;
+						  T14 = cr[WS(rs, 14)];
+						  TU = TS - TT;
+						  T2o = TS + TT;
+						  T15 = cr[WS(rs, 19)];
+						  T11 = ci[WS(rs, 15)];
+						  T2q = FMA(KP618033988, T2p, T2o);
+						  T5S = FNMS(KP618033988, T2o, T2p);
+						  T2t = TU - TX;
+						  TY = TU + TX;
+						  T16 = T14 + T15;
+						  T2L = T15 - T14;
+						  TZ = TR + TY;
+						  T2s = FNMS(KP250000000, TY, TR);
+						  T12 = cr[WS(rs, 24)];
+					     }
+					}
+					{
+					     E T1a, T1e, T1d, T2T, T17, T1f;
+					     T1a = ci[WS(rs, 22)];
+					     {
+						  E T1b, T1c, T2K, T13;
+						  T1b = ci[WS(rs, 17)];
+						  T1c = cr[WS(rs, 22)];
+						  T2K = T11 + T12;
+						  T13 = T11 - T12;
+						  T1e = ci[WS(rs, 12)];
+						  T1d = T1b - T1c;
+						  T2T = T1b + T1c;
+						  T5O = FNMS(KP618033988, T2K, T2L);
+						  T2M = FMA(KP618033988, T2L, T2K);
+						  T2B = T13 + T16;
+						  T17 = T13 - T16;
+						  T1f = cr[WS(rs, 17)];
+					     }
+					     Tt = cr[WS(rs, 2)];
+					     T18 = T10 + T17;
+					     T2A = FMS(KP250000000, T17, T10);
+					     {
+						  E Tu, T1g, T2U, Tv, T1h;
+						  Tu = cr[WS(rs, 7)];
+						  T1g = T1e - T1f;
+						  T2U = T1e + T1f;
+						  Tv = ci[WS(rs, 2)];
+						  Tx = cr[WS(rs, 12)];
+						  T2V = FMA(KP618033988, T2U, T2T);
+						  T5G = FNMS(KP618033988, T2T, T2U);
+						  T2Y = T1d - T1g;
+						  T1h = T1d + T1g;
+						  Tw = Tu + Tv;
+						  T30 = Tu - Tv;
+						  T1i = T1a + T1h;
+						  T2X = FMS(KP250000000, T1h, T1a);
+						  Ty = ci[WS(rs, 7)];
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T32, T5D, T2R, T2Q, T2u, T2r, T4t;
+				   {
+					E TA, T31, Tz, TB, Ts;
+					T31 = Ty - Tx;
+					Tz = Tx + Ty;
+					T1G = TZ + T18;
+					T19 = TZ - T18;
+					T32 = FNMS(KP618033988, T31, T30);
+					T5D = FMA(KP618033988, T30, T31);
+					TA = Tw + Tz;
+					T2R = Tz - Tw;
+					T2Q = FNMS(KP250000000, TA, Tt);
+					TB = Tt + TA;
+					T1H = T1i + T1r;
+					T1s = T1i - T1r;
+					T1M = Ti - Tr;
+					Ts = Ti + Tr;
+					{
+					     E T2n, T5R, T5U, TL;
+					     T2n = FMA(KP559016994, T2m, T2l);
+					     T5R = FNMS(KP559016994, T2m, T2l);
+					     T5U = FNMS(KP559016994, T2t, T2s);
+					     T2u = FMA(KP559016994, T2t, T2s);
+					     TL = TB + TK;
+					     T1N = TB - TK;
+					     {
+						  E T5T, T75, T5W, T76;
+						  T5T = FMA(KP951056516, T5S, T5R);
+						  T75 = FNMS(KP951056516, T5S, T5R);
+						  T5W = FMA(KP951056516, T5V, T5U);
+						  T76 = FNMS(KP951056516, T5V, T5U);
+						  TP = Ts - TL;
+						  TM = Ts + TL;
+						  T2r = FNMS(KP951056516, T2q, T2n);
+						  T4t = FMA(KP951056516, T2q, T2n);
+						  T7i = FMA(KP939062505, T75, T76);
+						  T77 = FNMS(KP939062505, T76, T75);
+						  T5X = FNMS(KP549754652, T5W, T5T);
+						  T64 = FMA(KP549754652, T5T, T5W);
+					     }
+					}
+				   }
+				   {
+					E T2J, T2G, T4v, T5y, T37, T3e, T5v;
+					{
+					     E T2C, T5K, T5N, T4s, T2y;
+					     T2C = FNMS(KP559016994, T2B, T2A);
+					     T5K = FMA(KP559016994, T2B, T2A);
+					     T5N = FMA(KP559016994, T2I, T2H);
+					     T2J = FNMS(KP559016994, T2I, T2H);
+					     T4s = FNMS(KP951056516, T2x, T2u);
+					     T2y = FMA(KP951056516, T2x, T2u);
+					     {
+						  E T73, T5M, T72, T5P;
+						  T73 = FMA(KP951056516, T5L, T5K);
+						  T5M = FNMS(KP951056516, T5L, T5K);
+						  T72 = FMA(KP951056516, T5O, T5N);
+						  T5P = FNMS(KP951056516, T5O, T5N);
+						  T4u = FNMS(KP634619297, T4t, T4s);
+						  T4D = FMA(KP634619297, T4s, T4t);
+						  T3p = FMA(KP256756360, T2r, T2y);
+						  T2z = FNMS(KP256756360, T2y, T2r);
+						  T74 = FMA(KP126329378, T73, T72);
+						  T7h = FNMS(KP126329378, T72, T73);
+						  T63 = FNMS(KP470564281, T5M, T5P);
+						  T5Q = FMA(KP470564281, T5P, T5M);
+						  T2G = FMA(KP951056516, T2F, T2C);
+						  T4v = FNMS(KP951056516, T2F, T2C);
+					     }
+					     T5y = FMA(KP559016994, T36, T35);
+					     T37 = FNMS(KP559016994, T36, T35);
+					     T3e = FNMS(KP559016994, T3d, T3c);
+					     T5v = FMA(KP559016994, T3d, T3c);
+					}
+					{
+					     E T5x, T6Y, T4w, T2N;
+					     T4w = FNMS(KP951056516, T2M, T2J);
+					     T2N = FMA(KP951056516, T2M, T2J);
+					     {
+						  E T4l, T3b, T4m, T3i;
+						  T4l = FMA(KP951056516, T3a, T37);
+						  T3b = FNMS(KP951056516, T3a, T37);
+						  T4m = FMA(KP951056516, T3h, T3e);
+						  T3i = FNMS(KP951056516, T3h, T3e);
+						  T4x = FNMS(KP827271945, T4w, T4v);
+						  T4E = FMA(KP827271945, T4v, T4w);
+						  T3q = FMA(KP634619297, T2G, T2N);
+						  T2O = FNMS(KP634619297, T2N, T2G);
+						  T4n = FNMS(KP126329378, T4m, T4l);
+						  T4G = FMA(KP126329378, T4l, T4m);
+						  T3t = FNMS(KP939062505, T3b, T3i);
+						  T3j = FMA(KP939062505, T3i, T3b);
+						  T5x = FMA(KP951056516, T5w, T5v);
+						  T6Y = FNMS(KP951056516, T5w, T5v);
+					     }
+					     {
+						  E T2S, T2Z, T5C, T6Z, T5A;
+						  T5F = FMA(KP559016994, T2R, T2Q);
+						  T2S = FNMS(KP559016994, T2R, T2Q);
+						  T2Z = FNMS(KP559016994, T2Y, T2X);
+						  T5C = FMA(KP559016994, T2Y, T2X);
+						  T6Z = FNMS(KP951056516, T5z, T5y);
+						  T5A = FMA(KP951056516, T5z, T5y);
+						  {
+						       E T4p, T2W, T4o, T33;
+						       T4p = FMA(KP951056516, T2V, T2S);
+						       T2W = FNMS(KP951056516, T2V, T2S);
+						       T4o = FMA(KP951056516, T32, T2Z);
+						       T33 = FNMS(KP951056516, T32, T2Z);
+						       T70 = FNMS(KP827271945, T6Z, T6Y);
+						       T7f = FMA(KP827271945, T6Y, T6Z);
+						       T66 = FNMS(KP062914667, T5x, T5A);
+						       T5B = FMA(KP062914667, T5A, T5x);
+						       T4q = FNMS(KP470564281, T4p, T4o);
+						       T4H = FMA(KP470564281, T4o, T4p);
+						       T3s = FNMS(KP549754652, T2W, T33);
+						       T34 = FMA(KP549754652, T33, T2W);
+						       T5E = FNMS(KP951056516, T5D, T5C);
+						       T6V = FMA(KP951056516, T5D, T5C);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6X, T7e, T6A, T6F, T6C, T6G, T6B;
+			 cr[0] = T9 + TM;
+			 {
+			      E T67, T5I, T25, T22, T1X, T26, T21;
+			      {
+				   E T1I, T23, T1L, T1Z, T1t, TO, T24, T1O;
+				   {
+					E T1K, T6W, T5H, T1J;
+					T1K = T1G - T1H;
+					T1I = T1G + T1H;
+					T6W = FNMS(KP951056516, T5G, T5F);
+					T5H = FMA(KP951056516, T5G, T5F);
+					T1J = FNMS(KP250000000, T1I, T1F);
+					T6X = FMA(KP062914667, T6W, T6V);
+					T7e = FNMS(KP062914667, T6V, T6W);
+					T67 = FNMS(KP634619297, T5E, T5H);
+					T5I = FMA(KP634619297, T5H, T5E);
+					T23 = FNMS(KP559016994, T1K, T1J);
+					T1L = FMA(KP559016994, T1K, T1J);
+					T1Z = FNMS(KP618033988, T19, T1s);
+					T1t = FMA(KP618033988, T1s, T19);
+					TO = FNMS(KP250000000, TM, T9);
+					T24 = FNMS(KP618033988, T1M, T1N);
+					T1O = FMA(KP618033988, T1N, T1M);
+				   }
+				   {
+					E T2b, T2a, T1Y, TQ, T27;
+					ci[0] = T1F + T1I;
+					T2b = FMA(KP951056516, T24, T23);
+					T25 = FNMS(KP951056516, T24, T23);
+					T2a = W[29];
+					T1Y = FNMS(KP559016994, TP, TO);
+					TQ = FMA(KP559016994, TP, TO);
+					T27 = W[28];
+					{
+					     E T1V, T1P, T20, T1S, T1w, T1v, TN, T1Q;
+					     T1V = FNMS(KP951056516, T1O, T1L);
+					     T1P = FMA(KP951056516, T1O, T1L);
+					     {
+						  E T28, T1u, T29, T2c;
+						  T20 = FMA(KP951056516, T1Z, T1Y);
+						  T28 = FNMS(KP951056516, T1Z, T1Y);
+						  T1S = FMA(KP951056516, T1t, TQ);
+						  T1u = FNMS(KP951056516, T1t, TQ);
+						  T1w = W[9];
+						  T29 = T27 * T28;
+						  T2c = T2a * T28;
+						  TN = W[8];
+						  T1Q = T1w * T1u;
+						  cr[WS(rs, 15)] = FNMS(T2a, T2b, T29);
+						  ci[WS(rs, 15)] = FMA(T27, T2b, T2c);
+						  T1v = TN * T1u;
+					     }
+					     ci[WS(rs, 5)] = FMA(TN, T1P, T1Q);
+					     {
+						  E T1U, T1R, T1W, T1T;
+						  T1U = W[39];
+						  cr[WS(rs, 5)] = FNMS(T1w, T1P, T1v);
+						  T1R = W[38];
+						  T1W = T1U * T1S;
+						  T22 = W[19];
+						  T1T = T1R * T1S;
+						  T1X = W[18];
+						  ci[WS(rs, 20)] = FMA(T1R, T1V, T1W);
+						  T26 = T22 * T20;
+						  cr[WS(rs, 20)] = FNMS(T1U, T1V, T1T);
+						  T21 = T1X * T20;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T6h, T6g, T5Y, T5J, T6z, T69, T6o, T6E;
+				   {
+					E T6m, T6n, T65, T68;
+					T65 = FMA(KP968479752, T64, T63);
+					T6h = FNMS(KP968479752, T64, T63);
+					ci[WS(rs, 10)] = FMA(T1X, T25, T26);
+					T68 = FNMS(KP845997307, T67, T66);
+					T6g = FMA(KP845997307, T67, T66);
+					cr[WS(rs, 10)] = FNMS(T22, T25, T21);
+					T6m = FNMS(KP968479752, T5X, T5Q);
+					T5Y = FMA(KP968479752, T5X, T5Q);
+					T5J = FMA(KP845997307, T5I, T5B);
+					T6n = FNMS(KP845997307, T5I, T5B);
+					T6z = FMA(KP560319534, T65, T68);
+					T69 = FNMS(KP681693190, T68, T65);
+					T6o = FMA(KP681693190, T6n, T6m);
+					T6E = FNMS(KP560319534, T6m, T6n);
+				   }
+				   {
+					E T62, T6l, T6I, T6L, T6H, T6K;
+					{
+					     E T6Q, T6O, T6y, T6D, T6S;
+					     {
+						  E T6N, T5Z, T61, T6i, T6k;
+						  T6N = W[2];
+						  T5Z = FMA(KP906616052, T5Y, T5J);
+						  T61 = FNMS(KP906616052, T5Y, T5J);
+						  T6i = FNMS(KP906616052, T6h, T6g);
+						  T6k = FMA(KP906616052, T6h, T6g);
+						  T6Q = W[3];
+						  {
+						       E T60, T6j, T6R, T6P;
+						       T60 = FNMS(KP249506682, T5Z, T5u);
+						       T6O = FMA(KP998026728, T5Z, T5u);
+						       T6j = FNMS(KP249506682, T6i, T6f);
+						       T6R = FMA(KP998026728, T6i, T6f);
+						       T6y = FMA(KP557913902, T61, T60);
+						       T62 = FNMS(KP557913902, T61, T60);
+						       T6P = T6N * T6O;
+						       T6l = FNMS(KP557913902, T6k, T6j);
+						       T6D = FMA(KP557913902, T6k, T6j);
+						       T6S = T6N * T6R;
+						       cr[WS(rs, 2)] = FNMS(T6Q, T6R, T6P);
+						  }
+					     }
+					     T6A = FNMS(KP949179823, T6z, T6y);
+					     T6I = FMA(KP949179823, T6z, T6y);
+					     T6L = FNMS(KP949179823, T6E, T6D);
+					     T6F = FMA(KP949179823, T6E, T6D);
+					     ci[WS(rs, 2)] = FMA(T6Q, T6O, T6S);
+					     T6H = W[32];
+					     T6K = W[33];
+					}
+					{
+					     E T6a, T6s, T6v, T6p, T6c, T6q, T6b, T6M, T6J, T5r;
+					     T6a = FNMS(KP860541664, T69, T62);
+					     T6s = FMA(KP860541664, T69, T62);
+					     T6v = FMA(KP860541664, T6o, T6l);
+					     T6p = FNMS(KP860541664, T6o, T6l);
+					     T6M = T6H * T6L;
+					     T6J = T6H * T6I;
+					     T5r = W[12];
+					     T6c = W[13];
+					     ci[WS(rs, 17)] = FMA(T6K, T6I, T6M);
+					     cr[WS(rs, 17)] = FNMS(T6K, T6L, T6J);
+					     T6q = T5r * T6p;
+					     T6b = T5r * T6a;
+					     {
+						  E T6r, T6u, T6w, T6t, T6x;
+						  ci[WS(rs, 7)] = FMA(T6c, T6a, T6q);
+						  cr[WS(rs, 7)] = FNMS(T6c, T6p, T6b);
+						  T6r = W[42];
+						  T6u = W[43];
+						  T6w = T6r * T6v;
+						  T6t = T6r * T6s;
+						  T6x = W[22];
+						  T6C = W[23];
+						  ci[WS(rs, 22)] = FMA(T6u, T6s, T6w);
+						  cr[WS(rs, 22)] = FNMS(T6u, T6v, T6t);
+						  T6G = T6x * T6F;
+						  T6B = T6x * T6A;
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7u, T7D, T7n, T7w, T7v, T7E;
+			      {
+				   E T78, T7t, T7N, T71, T7C, T7S, T7y, T7k;
+				   {
+					E T7j, T7g, T7A, T7B, T7r, T7s;
+					T7r = FNMS(KP734762448, T7i, T7h);
+					T7j = FMA(KP734762448, T7i, T7h);
+					T7g = FMA(KP772036680, T7f, T7e);
+					T7s = FNMS(KP772036680, T7f, T7e);
+					ci[WS(rs, 12)] = FMA(T6C, T6A, T6G);
+					cr[WS(rs, 12)] = FNMS(T6C, T6F, T6B);
+					T7A = FNMS(KP734762448, T77, T74);
+					T78 = FMA(KP734762448, T77, T74);
+					T7t = FNMS(KP621716863, T7s, T7r);
+					T7N = FMA(KP614372930, T7r, T7s);
+					T71 = FMA(KP772036680, T70, T6X);
+					T7B = FNMS(KP772036680, T70, T6X);
+					T7C = FNMS(KP621716863, T7B, T7A);
+					T7S = FMA(KP614372930, T7A, T7B);
+					T7y = FNMS(KP994076283, T7j, T7g);
+					T7k = FMA(KP994076283, T7j, T7g);
+				   }
+				   {
+					E T7c, T6T, T7x, T7l, T79, T7p;
+					T7c = W[5];
+					T6T = W[4];
+					T7x = FNMS(KP249506682, T7k, T7d);
+					T7l = FMA(KP998026728, T7k, T7d);
+					T79 = FMA(KP994076283, T78, T71);
+					T7p = FNMS(KP994076283, T78, T71);
+					{
+					     E T7z, T7Y, T7Z, T7T, T7q, T7O, T7X, T7L, T7Q, T7P, T7U;
+					     {
+						  E T7V, T80, T7b, T7m, T7W;
+						  {
+						       E T7R, T7o, T7a, T7M;
+						       T7V = W[34];
+						       T7R = FMA(KP557913902, T7y, T7x);
+						       T7z = FNMS(KP557913902, T7y, T7x);
+						       T7Y = W[35];
+						       T7o = FNMS(KP249506682, T79, T6U);
+						       T7a = FMA(KP998026728, T79, T6U);
+						       T7Z = FMA(KP949179823, T7S, T7R);
+						       T7T = FNMS(KP949179823, T7S, T7R);
+						       T7M = FMA(KP557913902, T7p, T7o);
+						       T7q = FNMS(KP557913902, T7p, T7o);
+						       T7b = T6T * T7a;
+						       T7m = T7c * T7a;
+						       T7W = FNMS(KP949179823, T7N, T7M);
+						       T7O = FMA(KP949179823, T7N, T7M);
+						  }
+						  cr[WS(rs, 3)] = FNMS(T7c, T7l, T7b);
+						  ci[WS(rs, 3)] = FMA(T6T, T7l, T7m);
+						  T80 = T7Y * T7W;
+						  T7X = T7V * T7W;
+						  T7L = W[24];
+						  T7Q = W[25];
+						  ci[WS(rs, 18)] = FMA(T7V, T7Z, T80);
+					     }
+					     cr[WS(rs, 18)] = FNMS(T7Y, T7Z, T7X);
+					     T7P = T7L * T7O;
+					     T7U = T7Q * T7O;
+					     {
+						  E T7J, T7F, T7I, T7H, T7K, T7G;
+						  T7u = FMA(KP943557151, T7t, T7q);
+						  T7G = FNMS(KP943557151, T7t, T7q);
+						  cr[WS(rs, 13)] = FNMS(T7Q, T7T, T7P);
+						  ci[WS(rs, 13)] = FMA(T7L, T7T, T7U);
+						  T7J = FMA(KP943557151, T7C, T7z);
+						  T7D = FNMS(KP943557151, T7C, T7z);
+						  T7F = W[44];
+						  T7I = W[45];
+						  T7n = W[14];
+						  T7H = T7F * T7G;
+						  T7K = T7I * T7G;
+						  T7w = W[15];
+						  T7v = T7n * T7u;
+						  cr[WS(rs, 23)] = FNMS(T7I, T7J, T7H);
+						  ci[WS(rs, 23)] = FMA(T7F, T7J, T7K);
+					     }
+					}
+				   }
+			      }
+			      T7E = T7w * T7u;
+			      cr[WS(rs, 8)] = FNMS(T7w, T7D, T7v);
+			      {
+				   E T3F, T4K, T4X, T4j, T4M, T4L, T4Y;
+				   {
+					E T4P, T4O, T4y, T4r, T4J, T57, T4N, T5c, T4W;
+					{
+					     E T4U, T4V, T4F, T4I;
+					     T4F = FNMS(KP912575812, T4E, T4D);
+					     T4P = FMA(KP912575812, T4E, T4D);
+					     T4O = FMA(KP912018591, T4H, T4G);
+					     T4I = FNMS(KP912018591, T4H, T4G);
+					     ci[WS(rs, 8)] = FMA(T7n, T7D, T7E);
+					     T4y = FMA(KP912575812, T4x, T4u);
+					     T4U = FNMS(KP912575812, T4x, T4u);
+					     T4V = FMA(KP912018591, T4q, T4n);
+					     T4r = FNMS(KP912018591, T4q, T4n);
+					     T4J = FNMS(KP726211448, T4I, T4F);
+					     T57 = FMA(KP525970792, T4F, T4I);
+					     T3F = FMA(KP951056516, T3E, T3B);
+					     T4N = FNMS(KP951056516, T3E, T3B);
+					     T5c = FMA(KP525970792, T4U, T4V);
+					     T4W = FNMS(KP726211448, T4V, T4U);
+					}
+					{
+					     E T5o, T4S, T4B, T5l, T5p, T4R, T4A, T5m, T4Q, T4z;
+					     T5o = W[7];
+					     T4Q = FMA(KP851038619, T4P, T4O);
+					     T4S = FNMS(KP851038619, T4P, T4O);
+					     T4z = FMA(KP851038619, T4y, T4r);
+					     T4B = FNMS(KP851038619, T4y, T4r);
+					     T5l = W[6];
+					     T5p = FMA(KP992114701, T4Q, T4N);
+					     T4R = FNMS(KP248028675, T4Q, T4N);
+					     T4A = FMA(KP248028675, T4z, T4k);
+					     T5m = FNMS(KP992114701, T4z, T4k);
+					     {
+						  E T4T, T4C, T5d, T58, T55, T5a, T59, T5e;
+						  {
+						       E T5f, T5j, T5i, T5h, T5k, T5g;
+						       T5f = W[36];
+						       {
+							    E T5b, T56, T5n, T5q;
+							    T4T = FNMS(KP554608978, T4S, T4R);
+							    T5b = FMA(KP554608978, T4S, T4R);
+							    T56 = FNMS(KP554608978, T4B, T4A);
+							    T4C = FMA(KP554608978, T4B, T4A);
+							    T5n = T5l * T5m;
+							    T5q = T5o * T5m;
+							    T5j = FMA(KP943557151, T5c, T5b);
+							    T5d = FNMS(KP943557151, T5c, T5b);
+							    T5g = FMA(KP943557151, T57, T56);
+							    T58 = FNMS(KP943557151, T57, T56);
+							    cr[WS(rs, 4)] = FNMS(T5o, T5p, T5n);
+							    ci[WS(rs, 4)] = FMA(T5l, T5p, T5q);
+						       }
+						       T5i = W[37];
+						       T5h = T5f * T5g;
+						       T55 = W[26];
+						       T5k = T5i * T5g;
+						       T5a = W[27];
+						       cr[WS(rs, 19)] = FNMS(T5i, T5j, T5h);
+						       T59 = T55 * T58;
+						       ci[WS(rs, 19)] = FMA(T5f, T5j, T5k);
+						  }
+						  T5e = T5a * T58;
+						  {
+						       E T53, T4Z, T52, T51, T54, T50;
+						       cr[WS(rs, 14)] = FNMS(T5a, T5d, T59);
+						       T4K = FNMS(KP803003575, T4J, T4C);
+						       T50 = FMA(KP803003575, T4J, T4C);
+						       ci[WS(rs, 14)] = FMA(T55, T5d, T5e);
+						       T4X = FNMS(KP803003575, T4W, T4T);
+						       T53 = FMA(KP803003575, T4W, T4T);
+						       T4Z = W[46];
+						       T52 = W[47];
+						       T4j = W[16];
+						       T51 = T4Z * T50;
+						       T54 = T52 * T50;
+						       T4M = W[17];
+						       T4L = T4j * T4K;
+						       cr[WS(rs, 24)] = FNMS(T52, T53, T51);
+						       ci[WS(rs, 24)] = FMA(T4Z, T53, T54);
+						  }
+					     }
+					}
+				   }
+				   T4Y = T4M * T4K;
+				   cr[WS(rs, 9)] = FNMS(T4M, T4X, T4L);
+				   {
+					E T3G, T3H, T2P, T3k, T3Z, T3v, T3O, T44;
+					{
+					     E T3M, T3N, T3r, T3u;
+					     T3G = FNMS(KP871714437, T3q, T3p);
+					     T3r = FMA(KP871714437, T3q, T3p);
+					     T3u = FNMS(KP831864738, T3t, T3s);
+					     T3H = FMA(KP831864738, T3t, T3s);
+					     ci[WS(rs, 9)] = FMA(T4j, T4X, T4Y);
+					     T3M = FNMS(KP871714437, T2O, T2z);
+					     T2P = FMA(KP871714437, T2O, T2z);
+					     T3k = FMA(KP831864738, T3j, T34);
+					     T3N = FNMS(KP831864738, T3j, T34);
+					     T3Z = FMA(KP683113946, T3r, T3u);
+					     T3v = FNMS(KP559154169, T3u, T3r);
+					     T3O = FMA(KP559154169, T3N, T3M);
+					     T44 = FNMS(KP683113946, T3M, T3N);
+					}
+					{
+					     E T4g, T3K, T3n, T4d, T3J, T4h, T4e, T3m, T3I, T3l;
+					     T4g = W[1];
+					     T3K = FMA(KP904730450, T3H, T3G);
+					     T3I = FNMS(KP904730450, T3H, T3G);
+					     T3n = FNMS(KP904730450, T3k, T2P);
+					     T3l = FMA(KP904730450, T3k, T2P);
+					     T4d = W[0];
+					     T3J = FNMS(KP242145790, T3I, T3F);
+					     T4h = FMA(KP968583161, T3I, T3F);
+					     T4e = FMA(KP968583161, T3l, T2k);
+					     T3m = FNMS(KP242145790, T3l, T2k);
+					     {
+						  E T3L, T3o, T45, T40, T3X, T42, T41, T46;
+						  {
+						       E T47, T4b, T4a, T49, T4c, T48;
+						       T47 = W[30];
+						       {
+							    E T43, T3Y, T4f, T4i;
+							    T43 = FNMS(KP541454447, T3K, T3J);
+							    T3L = FMA(KP541454447, T3K, T3J);
+							    T3o = FMA(KP541454447, T3n, T3m);
+							    T3Y = FNMS(KP541454447, T3n, T3m);
+							    T4f = T4d * T4e;
+							    T4i = T4g * T4e;
+							    T45 = FNMS(KP833417178, T44, T43);
+							    T4b = FMA(KP833417178, T44, T43);
+							    T40 = FNMS(KP833417178, T3Z, T3Y);
+							    T48 = FMA(KP833417178, T3Z, T3Y);
+							    cr[WS(rs, 1)] = FNMS(T4g, T4h, T4f);
+							    ci[WS(rs, 1)] = FMA(T4d, T4h, T4i);
+						       }
+						       T4a = W[31];
+						       T49 = T47 * T48;
+						       T3X = W[20];
+						       T4c = T4a * T48;
+						       T42 = W[21];
+						       cr[WS(rs, 16)] = FNMS(T4a, T4b, T49);
+						       T41 = T3X * T40;
+						       ci[WS(rs, 16)] = FMA(T47, T4b, T4c);
+						  }
+						  T46 = T42 * T40;
+						  {
+						       E T3V, T3R, T3U, T3T, T3W, T3S;
+						       cr[WS(rs, 11)] = FNMS(T42, T45, T41);
+						       T3S = FMA(KP921177326, T3v, T3o);
+						       T3w = FNMS(KP921177326, T3v, T3o);
+						       ci[WS(rs, 11)] = FMA(T3X, T45, T46);
+						       T3V = FNMS(KP921177326, T3O, T3L);
+						       T3P = FMA(KP921177326, T3O, T3L);
+						       T3R = W[40];
+						       T3U = W[41];
+						       T2d = W[10];
+						       T3T = T3R * T3S;
+						       T3W = T3U * T3S;
+						       T3y = W[11];
+						       T3x = T2d * T3w;
+						       cr[WS(rs, 21)] = FNMS(T3U, T3V, T3T);
+						       ci[WS(rs, 21)] = FMA(T3R, T3V, T3W);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 6)] = FNMS(T3y, T3P, T3x);
+	       T3Q = T3y * T3w;
+	       ci[WS(rs, 6)] = FMA(T2d, T3P, T3Q);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hb_25", twinstr, &GENUS, {84, 48, 316, 0} };
+
+void X(codelet_hb_25) (planner *p) {
+     X(khc2hc_register) (p, hb_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -dif -name hb_25 -include hb.h */
+
+/*
+ * This function contains 400 FP additions, 280 FP multiplications,
+ * (or, 260 additions, 140 multiplications, 140 fused multiply/add),
+ * 107 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "hb.h"
+
+static void hb_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 48); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T9, T5Q, T3y, T39, T5v, Ti, Tr, Ts, TZ, T18, T1z, T2k, T4l, T3h, T44;
+	       E T5d, T6C, T5C, T6o, T56, T6B, T5B, T6l, T2z, T4m, T3i, T47, T1K, T5w, T3c;
+	       E T3B, T5R, TB, TK, TL, T1i, T1r, T1A, T2P, T4o, T3k, T4b, T5s, T6F, T5F;
+	       E T6v, T5l, T6E, T5E, T6s, T34, T4p, T3l, T4e;
+	       {
+		    E T1, T4, T7, T8, T3x, T3w, T37, T38;
+		    T1 = cr[0];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = cr[WS(rs, 5)];
+			 T3 = ci[WS(rs, 4)];
+			 T4 = T2 + T3;
+			 T5 = cr[WS(rs, 10)];
+			 T6 = ci[WS(rs, 9)];
+			 T7 = T5 + T6;
+			 T8 = T4 + T7;
+			 T3x = T5 - T6;
+			 T3w = T2 - T3;
+		    }
+		    T9 = T1 + T8;
+		    T5Q = FMA(KP951056516, T3w, KP587785252 * T3x);
+		    T3y = FNMS(KP951056516, T3x, KP587785252 * T3w);
+		    T37 = FNMS(KP250000000, T8, T1);
+		    T38 = KP559016994 * (T4 - T7);
+		    T39 = T37 - T38;
+		    T5v = T38 + T37;
+	       }
+	       {
+		    E Ta, T27, T53, T2f, Th, T26, T10, T2p, T58, T2x, T17, T2o, Tj, T2n, T5a;
+		    E T2t, Tq, T2s, TR, T2b, T51, T2h, TY, T2g;
+		    {
+			 E Tg, T2e, Td, T2d;
+			 Ta = cr[WS(rs, 1)];
+			 {
+			      E Te, Tf, Tb, Tc;
+			      Te = cr[WS(rs, 11)];
+			      Tf = ci[WS(rs, 8)];
+			      Tg = Te + Tf;
+			      T2e = Te - Tf;
+			      Tb = cr[WS(rs, 6)];
+			      Tc = ci[WS(rs, 3)];
+			      Td = Tb + Tc;
+			      T2d = Tb - Tc;
+			 }
+			 T27 = KP559016994 * (Td - Tg);
+			 T53 = FMA(KP951056516, T2d, KP587785252 * T2e);
+			 T2f = FNMS(KP951056516, T2e, KP587785252 * T2d);
+			 Th = Td + Tg;
+			 T26 = FNMS(KP250000000, Th, Ta);
+		    }
+		    {
+			 E T16, T2w, T13, T2v;
+			 T10 = ci[WS(rs, 20)];
+			 {
+			      E T14, T15, T11, T12;
+			      T14 = cr[WS(rs, 14)];
+			      T15 = cr[WS(rs, 19)];
+			      T16 = T14 + T15;
+			      T2w = T15 - T14;
+			      T11 = ci[WS(rs, 15)];
+			      T12 = cr[WS(rs, 24)];
+			      T13 = T11 - T12;
+			      T2v = T11 + T12;
+			 }
+			 T2p = KP559016994 * (T13 + T16);
+			 T58 = FMA(KP951056516, T2v, KP587785252 * T2w);
+			 T2x = FNMS(KP951056516, T2w, KP587785252 * T2v);
+			 T17 = T13 - T16;
+			 T2o = FNMS(KP250000000, T17, T10);
+		    }
+		    {
+			 E Tp, T2m, Tm, T2l;
+			 Tj = cr[WS(rs, 4)];
+			 {
+			      E Tn, To, Tk, Tl;
+			      Tn = ci[WS(rs, 10)];
+			      To = ci[WS(rs, 5)];
+			      Tp = Tn + To;
+			      T2m = Tn - To;
+			      Tk = cr[WS(rs, 9)];
+			      Tl = ci[0];
+			      Tm = Tk + Tl;
+			      T2l = Tk - Tl;
+			 }
+			 T2n = FNMS(KP951056516, T2m, KP587785252 * T2l);
+			 T5a = FMA(KP951056516, T2l, KP587785252 * T2m);
+			 T2t = KP559016994 * (Tm - Tp);
+			 Tq = Tm + Tp;
+			 T2s = FNMS(KP250000000, Tq, Tj);
+		    }
+		    {
+			 E TX, T2a, TU, T29;
+			 TR = ci[WS(rs, 23)];
+			 {
+			      E TV, TW, TS, TT;
+			      TV = ci[WS(rs, 13)];
+			      TW = cr[WS(rs, 16)];
+			      TX = TV - TW;
+			      T2a = TV + TW;
+			      TS = ci[WS(rs, 18)];
+			      TT = cr[WS(rs, 21)];
+			      TU = TS - TT;
+			      T29 = TS + TT;
+			 }
+			 T2b = FNMS(KP951056516, T2a, KP587785252 * T29);
+			 T51 = FMA(KP951056516, T29, KP587785252 * T2a);
+			 T2h = KP559016994 * (TU - TX);
+			 TY = TU + TX;
+			 T2g = FNMS(KP250000000, TY, TR);
+		    }
+		    Ti = Ta + Th;
+		    Tr = Tj + Tq;
+		    Ts = Ti + Tr;
+		    TZ = TR + TY;
+		    T18 = T10 + T17;
+		    T1z = TZ + T18;
+		    {
+			 E T2c, T42, T2j, T43, T28, T2i;
+			 T28 = T26 - T27;
+			 T2c = T28 - T2b;
+			 T42 = T28 + T2b;
+			 T2i = T2g - T2h;
+			 T2j = T2f + T2i;
+			 T43 = T2i - T2f;
+			 T2k = FNMS(KP481753674, T2j, KP876306680 * T2c);
+			 T4l = FMA(KP728968627, T43, KP684547105 * T42);
+			 T3h = FMA(KP876306680, T2j, KP481753674 * T2c);
+			 T44 = FNMS(KP684547105, T43, KP728968627 * T42);
+		    }
+		    {
+			 E T59, T6n, T5c, T6m, T57, T5b;
+			 T57 = T2t + T2s;
+			 T59 = T57 - T58;
+			 T6n = T57 + T58;
+			 T5b = T2o + T2p;
+			 T5c = T5a + T5b;
+			 T6m = T5b - T5a;
+			 T5d = FNMS(KP844327925, T5c, KP535826794 * T59);
+			 T6C = FMA(KP637423989, T6m, KP770513242 * T6n);
+			 T5C = FMA(KP535826794, T5c, KP844327925 * T59);
+			 T6o = FNMS(KP637423989, T6n, KP770513242 * T6m);
+		    }
+		    {
+			 E T52, T6j, T55, T6k, T50, T54;
+			 T50 = T27 + T26;
+			 T52 = T50 - T51;
+			 T6j = T50 + T51;
+			 T54 = T2h + T2g;
+			 T55 = T53 + T54;
+			 T6k = T54 - T53;
+			 T56 = FNMS(KP248689887, T55, KP968583161 * T52);
+			 T6B = FMA(KP535826794, T6k, KP844327925 * T6j);
+			 T5B = FMA(KP968583161, T55, KP248689887 * T52);
+			 T6l = FNMS(KP844327925, T6k, KP535826794 * T6j);
+		    }
+		    {
+			 E T2r, T45, T2y, T46, T2q, T2u;
+			 T2q = T2o - T2p;
+			 T2r = T2n + T2q;
+			 T45 = T2q - T2n;
+			 T2u = T2s - T2t;
+			 T2y = T2u - T2x;
+			 T46 = T2u + T2x;
+			 T2z = FMA(KP904827052, T2r, KP425779291 * T2y);
+			 T4m = FNMS(KP992114701, T45, KP125333233 * T46);
+			 T3i = FNMS(KP425779291, T2r, KP904827052 * T2y);
+			 T47 = FMA(KP125333233, T45, KP992114701 * T46);
+		    }
+	       }
+	       {
+		    E T1C, T1F, T1I, T1J, T3b, T3a, T3z, T3A;
+		    T1C = ci[WS(rs, 24)];
+		    {
+			 E T1D, T1E, T1G, T1H;
+			 T1D = ci[WS(rs, 19)];
+			 T1E = cr[WS(rs, 20)];
+			 T1F = T1D - T1E;
+			 T1G = ci[WS(rs, 14)];
+			 T1H = cr[WS(rs, 15)];
+			 T1I = T1G - T1H;
+			 T1J = T1F + T1I;
+			 T3b = T1G + T1H;
+			 T3a = T1D + T1E;
+		    }
+		    T1K = T1C + T1J;
+		    T5w = FMA(KP951056516, T3a, KP587785252 * T3b);
+		    T3c = FNMS(KP951056516, T3b, KP587785252 * T3a);
+		    T3z = FNMS(KP250000000, T1J, T1C);
+		    T3A = KP559016994 * (T1F - T1I);
+		    T3B = T3z - T3A;
+		    T5R = T3A + T3z;
+	       }
+	       {
+		    E Tt, T2C, T5i, T2K, TA, T2B, T1a, T2G, T5g, T2M, T1h, T2L, TC, T2R, T5p;
+		    E T2Z, TJ, T2Q, T1j, T2V, T5n, T31, T1q, T30;
+		    {
+			 E Tw, T2I, Tz, T2J;
+			 Tt = cr[WS(rs, 2)];
+			 {
+			      E Tu, Tv, Tx, Ty;
+			      Tu = cr[WS(rs, 7)];
+			      Tv = ci[WS(rs, 2)];
+			      Tw = Tu + Tv;
+			      T2I = Tu - Tv;
+			      Tx = cr[WS(rs, 12)];
+			      Ty = ci[WS(rs, 7)];
+			      Tz = Tx + Ty;
+			      T2J = Tx - Ty;
+			 }
+			 T2C = KP559016994 * (Tw - Tz);
+			 T5i = FMA(KP951056516, T2I, KP587785252 * T2J);
+			 T2K = FNMS(KP951056516, T2J, KP587785252 * T2I);
+			 TA = Tw + Tz;
+			 T2B = FNMS(KP250000000, TA, Tt);
+		    }
+		    {
+			 E T1d, T2E, T1g, T2F;
+			 T1a = ci[WS(rs, 22)];
+			 {
+			      E T1b, T1c, T1e, T1f;
+			      T1b = ci[WS(rs, 17)];
+			      T1c = cr[WS(rs, 22)];
+			      T1d = T1b - T1c;
+			      T2E = T1b + T1c;
+			      T1e = ci[WS(rs, 12)];
+			      T1f = cr[WS(rs, 17)];
+			      T1g = T1e - T1f;
+			      T2F = T1e + T1f;
+			 }
+			 T2G = FNMS(KP951056516, T2F, KP587785252 * T2E);
+			 T5g = FMA(KP951056516, T2E, KP587785252 * T2F);
+			 T2M = KP559016994 * (T1d - T1g);
+			 T1h = T1d + T1g;
+			 T2L = FNMS(KP250000000, T1h, T1a);
+		    }
+		    {
+			 E TI, T2Y, TF, T2X;
+			 TC = cr[WS(rs, 3)];
+			 {
+			      E TG, TH, TD, TE;
+			      TG = ci[WS(rs, 11)];
+			      TH = ci[WS(rs, 6)];
+			      TI = TG + TH;
+			      T2Y = TG - TH;
+			      TD = cr[WS(rs, 8)];
+			      TE = ci[WS(rs, 1)];
+			      TF = TD + TE;
+			      T2X = TD - TE;
+			 }
+			 T2R = KP559016994 * (TF - TI);
+			 T5p = FMA(KP951056516, T2X, KP587785252 * T2Y);
+			 T2Z = FNMS(KP951056516, T2Y, KP587785252 * T2X);
+			 TJ = TF + TI;
+			 T2Q = FNMS(KP250000000, TJ, TC);
+		    }
+		    {
+			 E T1p, T2U, T1m, T2T;
+			 T1j = ci[WS(rs, 21)];
+			 {
+			      E T1n, T1o, T1k, T1l;
+			      T1n = cr[WS(rs, 13)];
+			      T1o = cr[WS(rs, 18)];
+			      T1p = T1n + T1o;
+			      T2U = T1o - T1n;
+			      T1k = ci[WS(rs, 16)];
+			      T1l = cr[WS(rs, 23)];
+			      T1m = T1k - T1l;
+			      T2T = T1k + T1l;
+			 }
+			 T2V = FNMS(KP951056516, T2U, KP587785252 * T2T);
+			 T5n = FMA(KP951056516, T2T, KP587785252 * T2U);
+			 T31 = KP559016994 * (T1m + T1p);
+			 T1q = T1m - T1p;
+			 T30 = FNMS(KP250000000, T1q, T1j);
+		    }
+		    TB = Tt + TA;
+		    TK = TC + TJ;
+		    TL = TB + TK;
+		    T1i = T1a + T1h;
+		    T1r = T1j + T1q;
+		    T1A = T1i + T1r;
+		    {
+			 E T2H, T49, T2O, T4a, T2D, T2N;
+			 T2D = T2B - T2C;
+			 T2H = T2D - T2G;
+			 T49 = T2D + T2G;
+			 T2N = T2L - T2M;
+			 T2O = T2K + T2N;
+			 T4a = T2N - T2K;
+			 T2P = FNMS(KP844327925, T2O, KP535826794 * T2H);
+			 T4o = FMA(KP062790519, T4a, KP998026728 * T49);
+			 T3k = FMA(KP535826794, T2O, KP844327925 * T2H);
+			 T4b = FNMS(KP998026728, T4a, KP062790519 * T49);
+		    }
+		    {
+			 E T5o, T6u, T5r, T6t, T5m, T5q;
+			 T5m = T2R + T2Q;
+			 T5o = T5m - T5n;
+			 T6u = T5m + T5n;
+			 T5q = T30 + T31;
+			 T5r = T5p + T5q;
+			 T6t = T5q - T5p;
+			 T5s = FNMS(KP684547105, T5r, KP728968627 * T5o);
+			 T6F = FNMS(KP992114701, T6t, KP125333233 * T6u);
+			 T5F = FMA(KP728968627, T5r, KP684547105 * T5o);
+			 T6v = FMA(KP125333233, T6t, KP992114701 * T6u);
+		    }
+		    {
+			 E T5h, T6r, T5k, T6q, T5f, T5j;
+			 T5f = T2C + T2B;
+			 T5h = T5f - T5g;
+			 T6r = T5f + T5g;
+			 T5j = T2M + T2L;
+			 T5k = T5i + T5j;
+			 T6q = T5j - T5i;
+			 T5l = FNMS(KP481753674, T5k, KP876306680 * T5h);
+			 T6E = FNMS(KP425779291, T6q, KP904827052 * T6r);
+			 T5E = FMA(KP876306680, T5k, KP481753674 * T5h);
+			 T6s = FMA(KP904827052, T6q, KP425779291 * T6r);
+		    }
+		    {
+			 E T2W, T4d, T33, T4c, T2S, T32;
+			 T2S = T2Q - T2R;
+			 T2W = T2S - T2V;
+			 T4d = T2S + T2V;
+			 T32 = T30 - T31;
+			 T33 = T2Z + T32;
+			 T4c = T32 - T2Z;
+			 T34 = FNMS(KP998026728, T33, KP062790519 * T2W);
+			 T4p = FNMS(KP637423989, T4c, KP770513242 * T4d);
+			 T3l = FMA(KP062790519, T33, KP998026728 * T2W);
+			 T4e = FMA(KP770513242, T4c, KP637423989 * T4d);
+		    }
+	       }
+	       {
+		    E TM, TQ, T1U, T1L, T1N, T1Z, T1t, T1V, T1y, T1Y;
+		    {
+			 E TO, TP, T1B, T1M;
+			 TO = KP559016994 * (Ts - TL);
+			 TM = Ts + TL;
+			 TP = FNMS(KP250000000, TM, T9);
+			 TQ = TO + TP;
+			 T1U = TP - TO;
+			 T1B = KP559016994 * (T1z - T1A);
+			 T1L = T1z + T1A;
+			 T1M = FNMS(KP250000000, T1L, T1K);
+			 T1N = T1B + T1M;
+			 T1Z = T1M - T1B;
+		    }
+		    {
+			 E T19, T1s, T1w, T1x;
+			 T19 = TZ - T18;
+			 T1s = T1i - T1r;
+			 T1t = FMA(KP951056516, T19, KP587785252 * T1s);
+			 T1V = FNMS(KP951056516, T1s, KP587785252 * T19);
+			 T1w = Ti - Tr;
+			 T1x = TB - TK;
+			 T1y = FMA(KP951056516, T1w, KP587785252 * T1x);
+			 T1Y = FNMS(KP951056516, T1x, KP587785252 * T1w);
+		    }
+		    cr[0] = T9 + TM;
+		    ci[0] = T1K + T1L;
+		    {
+			 E T1u, T1O, TN, T1v;
+			 T1u = TQ - T1t;
+			 T1O = T1y + T1N;
+			 TN = W[8];
+			 T1v = W[9];
+			 cr[WS(rs, 5)] = FNMS(T1v, T1O, TN * T1u);
+			 ci[WS(rs, 5)] = FMA(T1v, T1u, TN * T1O);
+		    }
+		    {
+			 E T22, T24, T21, T23;
+			 T22 = T1U + T1V;
+			 T24 = T1Z - T1Y;
+			 T21 = W[28];
+			 T23 = W[29];
+			 cr[WS(rs, 15)] = FNMS(T23, T24, T21 * T22);
+			 ci[WS(rs, 15)] = FMA(T23, T22, T21 * T24);
+		    }
+		    {
+			 E T1W, T20, T1T, T1X;
+			 T1W = T1U - T1V;
+			 T20 = T1Y + T1Z;
+			 T1T = W[18];
+			 T1X = W[19];
+			 cr[WS(rs, 10)] = FNMS(T1X, T20, T1T * T1W);
+			 ci[WS(rs, 10)] = FMA(T1X, T1W, T1T * T20);
+		    }
+		    {
+			 E T1Q, T1S, T1P, T1R;
+			 T1Q = TQ + T1t;
+			 T1S = T1N - T1y;
+			 T1P = W[38];
+			 T1R = W[39];
+			 cr[WS(rs, 20)] = FNMS(T1R, T1S, T1P * T1Q);
+			 ci[WS(rs, 20)] = FMA(T1R, T1Q, T1P * T1S);
+		    }
+	       }
+	       {
+		    E T6H, T71, T6M, T74, T6i, T6x, T6y, T6z, T6Q, T6R, T6P, T6S;
+		    {
+			 E T6D, T6G, T6K, T6L;
+			 T6D = T6B + T6C;
+			 T6G = T6E - T6F;
+			 T6H = FMA(KP951056516, T6D, KP587785252 * T6G);
+			 T71 = FNMS(KP951056516, T6G, KP587785252 * T6D);
+			 T6K = T6l - T6o;
+			 T6L = T6v - T6s;
+			 T6M = FMA(KP951056516, T6K, KP587785252 * T6L);
+			 T74 = FNMS(KP951056516, T6L, KP587785252 * T6K);
+		    }
+		    {
+			 E T6p, T6w, T6N, T6O;
+			 T6i = T5v + T5w;
+			 T6p = T6l + T6o;
+			 T6w = T6s + T6v;
+			 T6x = T6p - T6w;
+			 T6y = FNMS(KP250000000, T6x, T6i);
+			 T6z = KP559016994 * (T6p + T6w);
+			 T6Q = T5R - T5Q;
+			 T6N = T6B - T6C;
+			 T6O = T6E + T6F;
+			 T6R = T6N + T6O;
+			 T6P = KP559016994 * (T6N - T6O);
+			 T6S = FNMS(KP250000000, T6R, T6Q);
+		    }
+		    {
+			 E T7c, T7e, T7b, T7d;
+			 T7c = T6i + T6x;
+			 T7e = T6Q + T6R;
+			 T7b = W[6];
+			 T7d = W[7];
+			 cr[WS(rs, 4)] = FNMS(T7d, T7e, T7b * T7c);
+			 ci[WS(rs, 4)] = FMA(T7d, T7c, T7b * T7e);
+		    }
+		    {
+			 E T72, T78, T76, T7a, T70, T75;
+			 T70 = T6y - T6z;
+			 T72 = T70 - T71;
+			 T78 = T70 + T71;
+			 T75 = T6S - T6P;
+			 T76 = T74 + T75;
+			 T7a = T75 - T74;
+			 {
+			      E T6Z, T73, T77, T79;
+			      T6Z = W[26];
+			      T73 = W[27];
+			      cr[WS(rs, 14)] = FNMS(T73, T76, T6Z * T72);
+			      ci[WS(rs, 14)] = FMA(T73, T72, T6Z * T76);
+			      T77 = W[36];
+			      T79 = W[37];
+			      cr[WS(rs, 19)] = FNMS(T79, T7a, T77 * T78);
+			      ci[WS(rs, 19)] = FMA(T79, T78, T77 * T7a);
+			 }
+		    }
+		    {
+			 E T6I, T6W, T6U, T6Y, T6A, T6T;
+			 T6A = T6y + T6z;
+			 T6I = T6A - T6H;
+			 T6W = T6A + T6H;
+			 T6T = T6P + T6S;
+			 T6U = T6M + T6T;
+			 T6Y = T6T - T6M;
+			 {
+			      E T6h, T6J, T6V, T6X;
+			      T6h = W[16];
+			      T6J = W[17];
+			      cr[WS(rs, 9)] = FNMS(T6J, T6U, T6h * T6I);
+			      ci[WS(rs, 9)] = FMA(T6J, T6I, T6h * T6U);
+			      T6V = W[46];
+			      T6X = W[47];
+			      cr[WS(rs, 24)] = FNMS(T6X, T6Y, T6V * T6W);
+			      ci[WS(rs, 24)] = FMA(T6X, T6W, T6V * T6Y);
+			 }
+		    }
+	       }
+	       {
+		    E T3n, T3N, T3s, T3Q, T3d, T3e, T36, T3f, T3C, T3D, T3v, T3E;
+		    {
+			 E T3j, T3m, T3q, T3r;
+			 T3j = T3h - T3i;
+			 T3m = T3k - T3l;
+			 T3n = FMA(KP951056516, T3j, KP587785252 * T3m);
+			 T3N = FNMS(KP951056516, T3m, KP587785252 * T3j);
+			 T3q = T2k + T2z;
+			 T3r = T2P - T34;
+			 T3s = FMA(KP951056516, T3q, KP587785252 * T3r);
+			 T3Q = FNMS(KP951056516, T3r, KP587785252 * T3q);
+		    }
+		    {
+			 E T2A, T35, T3t, T3u;
+			 T3d = T39 - T3c;
+			 T2A = T2k - T2z;
+			 T35 = T2P + T34;
+			 T3e = T2A + T35;
+			 T36 = KP559016994 * (T2A - T35);
+			 T3f = FNMS(KP250000000, T3e, T3d);
+			 T3C = T3y + T3B;
+			 T3t = T3h + T3i;
+			 T3u = T3k + T3l;
+			 T3D = T3t + T3u;
+			 T3v = KP559016994 * (T3t - T3u);
+			 T3E = FNMS(KP250000000, T3D, T3C);
+		    }
+		    {
+			 E T3Y, T40, T3X, T3Z;
+			 T3Y = T3d + T3e;
+			 T40 = T3C + T3D;
+			 T3X = W[2];
+			 T3Z = W[3];
+			 cr[WS(rs, 2)] = FNMS(T3Z, T40, T3X * T3Y);
+			 ci[WS(rs, 2)] = FMA(T3Z, T3Y, T3X * T40);
+		    }
+		    {
+			 E T3O, T3U, T3S, T3W, T3M, T3R;
+			 T3M = T3f - T36;
+			 T3O = T3M - T3N;
+			 T3U = T3M + T3N;
+			 T3R = T3E - T3v;
+			 T3S = T3Q + T3R;
+			 T3W = T3R - T3Q;
+			 {
+			      E T3L, T3P, T3T, T3V;
+			      T3L = W[22];
+			      T3P = W[23];
+			      cr[WS(rs, 12)] = FNMS(T3P, T3S, T3L * T3O);
+			      ci[WS(rs, 12)] = FMA(T3P, T3O, T3L * T3S);
+			      T3T = W[32];
+			      T3V = W[33];
+			      cr[WS(rs, 17)] = FNMS(T3V, T3W, T3T * T3U);
+			      ci[WS(rs, 17)] = FMA(T3V, T3U, T3T * T3W);
+			 }
+		    }
+		    {
+			 E T3o, T3I, T3G, T3K, T3g, T3F;
+			 T3g = T36 + T3f;
+			 T3o = T3g - T3n;
+			 T3I = T3g + T3n;
+			 T3F = T3v + T3E;
+			 T3G = T3s + T3F;
+			 T3K = T3F - T3s;
+			 {
+			      E T25, T3p, T3H, T3J;
+			      T25 = W[12];
+			      T3p = W[13];
+			      cr[WS(rs, 7)] = FNMS(T3p, T3G, T25 * T3o);
+			      ci[WS(rs, 7)] = FMA(T3p, T3o, T25 * T3G);
+			      T3H = W[42];
+			      T3J = W[43];
+			      cr[WS(rs, 22)] = FNMS(T3J, T3K, T3H * T3I);
+			      ci[WS(rs, 22)] = FMA(T3J, T3I, T3H * T3K);
+			 }
+		    }
+	       }
+	       {
+		    E T4r, T4L, T4w, T4O, T4h, T4i, T4g, T4j, T4A, T4B, T4z, T4C;
+		    {
+			 E T4n, T4q, T4u, T4v;
+			 T4n = T4l - T4m;
+			 T4q = T4o - T4p;
+			 T4r = FMA(KP951056516, T4n, KP587785252 * T4q);
+			 T4L = FNMS(KP951056516, T4q, KP587785252 * T4n);
+			 T4u = T44 + T47;
+			 T4v = T4b + T4e;
+			 T4w = FMA(KP951056516, T4u, KP587785252 * T4v);
+			 T4O = FNMS(KP951056516, T4v, KP587785252 * T4u);
+		    }
+		    {
+			 E T48, T4f, T4x, T4y;
+			 T4h = T39 + T3c;
+			 T48 = T44 - T47;
+			 T4f = T4b - T4e;
+			 T4i = T48 + T4f;
+			 T4g = KP559016994 * (T48 - T4f);
+			 T4j = FNMS(KP250000000, T4i, T4h);
+			 T4A = T3B - T3y;
+			 T4x = T4l + T4m;
+			 T4y = T4o + T4p;
+			 T4B = T4x + T4y;
+			 T4z = KP559016994 * (T4x - T4y);
+			 T4C = FNMS(KP250000000, T4B, T4A);
+		    }
+		    {
+			 E T4W, T4Y, T4V, T4X;
+			 T4W = T4h + T4i;
+			 T4Y = T4A + T4B;
+			 T4V = W[4];
+			 T4X = W[5];
+			 cr[WS(rs, 3)] = FNMS(T4X, T4Y, T4V * T4W);
+			 ci[WS(rs, 3)] = FMA(T4X, T4W, T4V * T4Y);
+		    }
+		    {
+			 E T4M, T4S, T4Q, T4U, T4K, T4P;
+			 T4K = T4j - T4g;
+			 T4M = T4K - T4L;
+			 T4S = T4K + T4L;
+			 T4P = T4C - T4z;
+			 T4Q = T4O + T4P;
+			 T4U = T4P - T4O;
+			 {
+			      E T4J, T4N, T4R, T4T;
+			      T4J = W[24];
+			      T4N = W[25];
+			      cr[WS(rs, 13)] = FNMS(T4N, T4Q, T4J * T4M);
+			      ci[WS(rs, 13)] = FMA(T4N, T4M, T4J * T4Q);
+			      T4R = W[34];
+			      T4T = W[35];
+			      cr[WS(rs, 18)] = FNMS(T4T, T4U, T4R * T4S);
+			      ci[WS(rs, 18)] = FMA(T4T, T4S, T4R * T4U);
+			 }
+		    }
+		    {
+			 E T4s, T4G, T4E, T4I, T4k, T4D;
+			 T4k = T4g + T4j;
+			 T4s = T4k - T4r;
+			 T4G = T4k + T4r;
+			 T4D = T4z + T4C;
+			 T4E = T4w + T4D;
+			 T4I = T4D - T4w;
+			 {
+			      E T41, T4t, T4F, T4H;
+			      T41 = W[14];
+			      T4t = W[15];
+			      cr[WS(rs, 8)] = FNMS(T4t, T4E, T41 * T4s);
+			      ci[WS(rs, 8)] = FMA(T4t, T4s, T41 * T4E);
+			      T4F = W[44];
+			      T4H = W[45];
+			      cr[WS(rs, 23)] = FNMS(T4H, T4I, T4F * T4G);
+			      ci[WS(rs, 23)] = FMA(T4H, T4G, T4F * T4I);
+			 }
+		    }
+	       }
+	       {
+		    E T5H, T63, T5M, T66, T5x, T5y, T5u, T5z, T5S, T5T, T5P, T5U;
+		    {
+			 E T5D, T5G, T5K, T5L;
+			 T5D = T5B - T5C;
+			 T5G = T5E - T5F;
+			 T5H = FMA(KP951056516, T5D, KP587785252 * T5G);
+			 T63 = FNMS(KP951056516, T5G, KP587785252 * T5D);
+			 T5K = T56 - T5d;
+			 T5L = T5l - T5s;
+			 T5M = FMA(KP951056516, T5K, KP587785252 * T5L);
+			 T66 = FNMS(KP951056516, T5L, KP587785252 * T5K);
+		    }
+		    {
+			 E T5e, T5t, T5N, T5O;
+			 T5x = T5v - T5w;
+			 T5e = T56 + T5d;
+			 T5t = T5l + T5s;
+			 T5y = T5e + T5t;
+			 T5u = KP559016994 * (T5e - T5t);
+			 T5z = FNMS(KP250000000, T5y, T5x);
+			 T5S = T5Q + T5R;
+			 T5N = T5B + T5C;
+			 T5O = T5E + T5F;
+			 T5T = T5N + T5O;
+			 T5P = KP559016994 * (T5N - T5O);
+			 T5U = FNMS(KP250000000, T5T, T5S);
+		    }
+		    {
+			 E T6e, T6g, T6d, T6f;
+			 T6e = T5x + T5y;
+			 T6g = T5S + T5T;
+			 T6d = W[0];
+			 T6f = W[1];
+			 cr[WS(rs, 1)] = FNMS(T6f, T6g, T6d * T6e);
+			 ci[WS(rs, 1)] = FMA(T6f, T6e, T6d * T6g);
+		    }
+		    {
+			 E T64, T6a, T68, T6c, T62, T67;
+			 T62 = T5z - T5u;
+			 T64 = T62 - T63;
+			 T6a = T62 + T63;
+			 T67 = T5U - T5P;
+			 T68 = T66 + T67;
+			 T6c = T67 - T66;
+			 {
+			      E T61, T65, T69, T6b;
+			      T61 = W[20];
+			      T65 = W[21];
+			      cr[WS(rs, 11)] = FNMS(T65, T68, T61 * T64);
+			      ci[WS(rs, 11)] = FMA(T65, T64, T61 * T68);
+			      T69 = W[30];
+			      T6b = W[31];
+			      cr[WS(rs, 16)] = FNMS(T6b, T6c, T69 * T6a);
+			      ci[WS(rs, 16)] = FMA(T6b, T6a, T69 * T6c);
+			 }
+		    }
+		    {
+			 E T5I, T5Y, T5W, T60, T5A, T5V;
+			 T5A = T5u + T5z;
+			 T5I = T5A - T5H;
+			 T5Y = T5A + T5H;
+			 T5V = T5P + T5U;
+			 T5W = T5M + T5V;
+			 T60 = T5V - T5M;
+			 {
+			      E T4Z, T5J, T5X, T5Z;
+			      T4Z = W[10];
+			      T5J = W[11];
+			      cr[WS(rs, 6)] = FNMS(T5J, T5W, T4Z * T5I);
+			      ci[WS(rs, 6)] = FMA(T5J, T5I, T4Z * T5W);
+			      T5X = W[40];
+			      T5Z = W[41];
+			      cr[WS(rs, 21)] = FNMS(T5Z, T60, T5X * T5Y);
+			      ci[WS(rs, 21)] = FMA(T5Z, T5Y, T5X * T60);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hb_25", twinstr, &GENUS, {260, 140, 140, 0} };
+
+void X(codelet_hb_25) (planner *p) {
+     X(khc2hc_register) (p, hb_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include hb.h */
+
+/*
+ * This function contains 16 FP additions, 14 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 10 fused multiply/add),
+ * 27 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hb.h"
+
+static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E Tk, Tj, Tn, Tl, Tm, To;
+	       {
+		    E T1, Td, T7, T8, T4, Tg, T2, T3;
+		    T1 = cr[0];
+		    T2 = cr[WS(rs, 1)];
+		    T3 = ci[0];
+		    Td = ci[WS(rs, 2)];
+		    T7 = ci[WS(rs, 1)];
+		    T8 = cr[WS(rs, 2)];
+		    T4 = T2 + T3;
+		    Tg = T2 - T3;
+		    {
+			 E T5, Tc, Tf, Ta, T9, Te, T6, Th, Ti, Tb;
+			 T5 = W[0];
+			 T9 = T7 + T8;
+			 Te = T7 - T8;
+			 cr[0] = T1 + T4;
+			 T6 = FNMS(KP500000000, T4, T1);
+			 Tc = W[1];
+			 ci[0] = Td + Te;
+			 Tf = FNMS(KP500000000, Te, Td);
+			 Tk = FMA(KP866025403, T9, T6);
+			 Ta = FNMS(KP866025403, T9, T6);
+			 Tj = W[2];
+			 Tn = FNMS(KP866025403, Tg, Tf);
+			 Th = FMA(KP866025403, Tg, Tf);
+			 Ti = Tc * Ta;
+			 Tb = T5 * Ta;
+			 Tl = Tj * Tk;
+			 Tm = W[3];
+			 ci[WS(rs, 1)] = FMA(T5, Th, Ti);
+			 cr[WS(rs, 1)] = FNMS(Tc, Th, Tb);
+		    }
+	       }
+	       cr[WS(rs, 2)] = FNMS(Tm, Tn, Tl);
+	       To = Tm * Tk;
+	       ci[WS(rs, 2)] = FMA(Tj, Tn, To);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, {6, 4, 10, 0} };
+
+void X(codelet_hb_3) (planner *p) {
+     X(khc2hc_register) (p, hb_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include hb.h */
+
+/*
+ * This function contains 16 FP additions, 12 FP multiplications,
+ * (or, 10 additions, 6 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hb.h"
+
+static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E T1, T4, Ta, Te, T5, T8, Tb, Tf;
+	       {
+		    E T2, T3, T6, T7;
+		    T1 = cr[0];
+		    T2 = cr[WS(rs, 1)];
+		    T3 = ci[0];
+		    T4 = T2 + T3;
+		    Ta = FNMS(KP500000000, T4, T1);
+		    Te = KP866025403 * (T2 - T3);
+		    T5 = ci[WS(rs, 2)];
+		    T6 = ci[WS(rs, 1)];
+		    T7 = cr[WS(rs, 2)];
+		    T8 = T6 - T7;
+		    Tb = KP866025403 * (T6 + T7);
+		    Tf = FNMS(KP500000000, T8, T5);
+	       }
+	       cr[0] = T1 + T4;
+	       ci[0] = T5 + T8;
+	       {
+		    E Tc, Tg, T9, Td;
+		    Tc = Ta - Tb;
+		    Tg = Te + Tf;
+		    T9 = W[0];
+		    Td = W[1];
+		    cr[WS(rs, 1)] = FNMS(Td, Tg, T9 * Tc);
+		    ci[WS(rs, 1)] = FMA(T9, Tg, Td * Tc);
+	       }
+	       {
+		    E Ti, Tk, Th, Tj;
+		    Ti = Ta + Tb;
+		    Tk = Tf - Te;
+		    Th = W[2];
+		    Tj = W[3];
+		    cr[WS(rs, 2)] = FNMS(Tj, Tk, Th * Ti);
+		    ci[WS(rs, 2)] = FMA(Th, Tk, Tj * Ti);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, {10, 6, 6, 0} };
+
+void X(codelet_hb_3) (planner *p) {
+     X(khc2hc_register) (p, hb_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1770 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hb_32 -include hb.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 135 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hb.h"
+
+static void hb_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T5o, T5r, T5q, T5n, T5s, T5p;
+	       {
+		    E T5K, Tf, T8k, T7k, T8x, T7N, T3i, T1i, T3v, T2L, T5f, T4v, T6T, T6m, T52;
+		    E T42, TZ, T6X, T1X, T3p, T8p, T8B, T3o, T26, T58, T4n, T7T, T7z, T59, T4k;
+		    E T6p, T6a, TK, T6W, T2o, T3m, T8s, T8A, T3l, T2x, T55, T4g, T7S, T7G, T56;
+		    E T4d, T6o, T61, T5Q, T5N, T6f, Tu, T8y, T7r, T8l, T7Q, T3w, T1F, T45, T48;
+		    E T3j, T2O, T53, T4y;
+		    {
+			 E T62, T69, T4j, T4i;
+			 {
+			      E T6l, T6i, T40, T41;
+			      {
+				   E T12, T3, T2D, T6, T6g, T2G, T6h, T15, Td, T6k, T1g, T2J, Ta, T17, T1a;
+				   E T6j;
+				   {
+					E T2E, T2F, T13, T14;
+					{
+					     E T1, T2, T4, T5;
+					     T1 = cr[0];
+					     T2 = ci[WS(rs, 15)];
+					     T4 = cr[WS(rs, 8)];
+					     T5 = ci[WS(rs, 7)];
+					     T2E = ci[WS(rs, 31)];
+					     T12 = T1 - T2;
+					     T3 = T1 + T2;
+					     T2D = T4 - T5;
+					     T6 = T4 + T5;
+					     T2F = cr[WS(rs, 16)];
+					}
+					T13 = ci[WS(rs, 23)];
+					T14 = cr[WS(rs, 24)];
+					{
+					     E Tb, Tc, T1d, T1e;
+					     Tb = ci[WS(rs, 3)];
+					     T6g = T2E - T2F;
+					     T2G = T2E + T2F;
+					     T6h = T13 - T14;
+					     T15 = T13 + T14;
+					     Tc = cr[WS(rs, 12)];
+					     T1d = ci[WS(rs, 19)];
+					     T1e = cr[WS(rs, 28)];
+					     {
+						  E T8, T1c, T1f, T9, T18, T19;
+						  T8 = cr[WS(rs, 4)];
+						  Td = Tb + Tc;
+						  T1c = Tb - Tc;
+						  T6k = T1d - T1e;
+						  T1f = T1d + T1e;
+						  T9 = ci[WS(rs, 11)];
+						  T18 = ci[WS(rs, 27)];
+						  T19 = cr[WS(rs, 20)];
+						  T1g = T1c - T1f;
+						  T2J = T1c + T1f;
+						  Ta = T8 + T9;
+						  T17 = T8 - T9;
+						  T1a = T18 + T19;
+						  T6j = T18 - T19;
+					     }
+					}
+				   }
+				   {
+					E T2I, T7M, T7L, T16, T1h, T4u, T4t, T2H, T2K;
+					{
+					     E T7i, T7, T1b, Te, T7j;
+					     T7i = T3 - T6;
+					     T7 = T3 + T6;
+					     T2I = T17 + T1a;
+					     T1b = T17 - T1a;
+					     Te = Ta + Td;
+					     T7M = Ta - Td;
+					     T7j = T6k - T6j;
+					     T6l = T6j + T6k;
+					     T6i = T6g + T6h;
+					     T7L = T6g - T6h;
+					     T5K = T7 - Te;
+					     Tf = T7 + Te;
+					     T8k = T7i + T7j;
+					     T7k = T7i - T7j;
+					     T40 = T12 + T15;
+					     T16 = T12 - T15;
+					     T1h = T1b + T1g;
+					     T4u = T1b - T1g;
+					}
+					T4t = T2G - T2D;
+					T2H = T2D + T2G;
+					T8x = T7M + T7L;
+					T7N = T7L - T7M;
+					T3i = FMA(KP707106781, T1h, T16);
+					T1i = FNMS(KP707106781, T1h, T16);
+					T2K = T2I - T2J;
+					T41 = T2I + T2J;
+					T3v = FMA(KP707106781, T2K, T2H);
+					T2L = FNMS(KP707106781, T2K, T2H);
+					T5f = FNMS(KP707106781, T4u, T4t);
+					T4v = FMA(KP707106781, T4u, T4t);
+				   }
+			      }
+			      {
+				   E T1Y, T1H, TR, T7w, T1K, T21, T65, T7t, TV, T1M, TU, T67, T1U, TW, T1N;
+				   E T1O;
+				   {
+					E TL, TM, TO, TP, T63, T64;
+					TL = ci[0];
+					T6T = T6i + T6l;
+					T6m = T6i - T6l;
+					T52 = FMA(KP707106781, T41, T40);
+					T42 = FNMS(KP707106781, T41, T40);
+					TM = cr[WS(rs, 15)];
+					TO = cr[WS(rs, 7)];
+					TP = ci[WS(rs, 8)];
+					{
+					     E T1I, TN, TQ, T1J, T1Z, T20;
+					     T1I = ci[WS(rs, 16)];
+					     T1Y = TL - TM;
+					     TN = TL + TM;
+					     T1H = TO - TP;
+					     TQ = TO + TP;
+					     T1J = cr[WS(rs, 31)];
+					     T1Z = ci[WS(rs, 24)];
+					     T20 = cr[WS(rs, 23)];
+					     TR = TN + TQ;
+					     T7w = TN - TQ;
+					     T1K = T1I + T1J;
+					     T63 = T1I - T1J;
+					     T64 = T1Z - T20;
+					     T21 = T1Z + T20;
+					}
+					{
+					     E TS, TT, T1S, T1T;
+					     TS = cr[WS(rs, 3)];
+					     T65 = T63 + T64;
+					     T7t = T63 - T64;
+					     TT = ci[WS(rs, 12)];
+					     T1S = ci[WS(rs, 20)];
+					     T1T = cr[WS(rs, 27)];
+					     TV = ci[WS(rs, 4)];
+					     T1M = TS - TT;
+					     TU = TS + TT;
+					     T67 = T1S - T1T;
+					     T1U = T1S + T1T;
+					     TW = cr[WS(rs, 11)];
+					     T1N = ci[WS(rs, 28)];
+					     T1O = cr[WS(rs, 19)];
+					}
+				   }
+				   {
+					E T4l, T1L, T24, T23, T8n, T7v, T1W, T8o, T7y, T4m, T22, T25;
+					{
+					     E T1V, T7u, T7x, T1Q, T1R, TX;
+					     T4l = T1H + T1K;
+					     T1L = T1H - T1K;
+					     T1R = TV - TW;
+					     TX = TV + TW;
+					     {
+						  E T66, T1P, TY, T68;
+						  T66 = T1N - T1O;
+						  T1P = T1N + T1O;
+						  T24 = T1R - T1U;
+						  T1V = T1R + T1U;
+						  T7u = TU - TX;
+						  TY = TU + TX;
+						  T68 = T66 + T67;
+						  T7x = T67 - T66;
+						  T23 = T1M - T1P;
+						  T1Q = T1M + T1P;
+						  TZ = TR + TY;
+						  T62 = TR - TY;
+						  T69 = T65 - T68;
+						  T6X = T65 + T68;
+					     }
+					     T8n = T7u + T7t;
+					     T7v = T7t - T7u;
+					     T4j = T1Q + T1V;
+					     T1W = T1Q - T1V;
+					     T8o = T7w + T7x;
+					     T7y = T7w - T7x;
+					}
+					T4i = T1Y + T21;
+					T22 = T1Y - T21;
+					T25 = T23 + T24;
+					T4m = T23 - T24;
+					T1X = FNMS(KP707106781, T1W, T1L);
+					T3p = FMA(KP707106781, T1W, T1L);
+					T8p = FNMS(KP414213562, T8o, T8n);
+					T8B = FMA(KP414213562, T8n, T8o);
+					T3o = FMA(KP707106781, T25, T22);
+					T26 = FNMS(KP707106781, T25, T22);
+					T58 = FMA(KP707106781, T4m, T4l);
+					T4n = FNMS(KP707106781, T4m, T4l);
+					T7T = FNMS(KP414213562, T7v, T7y);
+					T7z = FMA(KP414213562, T7y, T7v);
+				   }
+			      }
+			 }
+			 {
+			      E T5T, T60, T4c, T4b;
+			      {
+				   E T2p, T28, TC, T7D, T2b, T2s, T5W, T7A, TG, T2d, TF, T5Y, T2l, TH, T2e;
+				   E T2f;
+				   {
+					E Tw, Tx, Tz, TA, T5U, T5V;
+					Tw = cr[WS(rs, 1)];
+					T59 = FMA(KP707106781, T4j, T4i);
+					T4k = FNMS(KP707106781, T4j, T4i);
+					T6p = T69 - T62;
+					T6a = T62 + T69;
+					Tx = ci[WS(rs, 14)];
+					Tz = cr[WS(rs, 9)];
+					TA = ci[WS(rs, 6)];
+					{
+					     E T29, Ty, TB, T2a, T2q, T2r;
+					     T29 = ci[WS(rs, 30)];
+					     T2p = Tw - Tx;
+					     Ty = Tw + Tx;
+					     T28 = Tz - TA;
+					     TB = Tz + TA;
+					     T2a = cr[WS(rs, 17)];
+					     T2q = ci[WS(rs, 22)];
+					     T2r = cr[WS(rs, 25)];
+					     TC = Ty + TB;
+					     T7D = Ty - TB;
+					     T2b = T29 + T2a;
+					     T5U = T29 - T2a;
+					     T5V = T2q - T2r;
+					     T2s = T2q + T2r;
+					}
+					{
+					     E TD, TE, T2j, T2k;
+					     TD = cr[WS(rs, 5)];
+					     T5W = T5U + T5V;
+					     T7A = T5U - T5V;
+					     TE = ci[WS(rs, 10)];
+					     T2j = ci[WS(rs, 18)];
+					     T2k = cr[WS(rs, 29)];
+					     TG = ci[WS(rs, 2)];
+					     T2d = TD - TE;
+					     TF = TD + TE;
+					     T5Y = T2j - T2k;
+					     T2l = T2j + T2k;
+					     TH = cr[WS(rs, 13)];
+					     T2e = ci[WS(rs, 26)];
+					     T2f = cr[WS(rs, 21)];
+					}
+				   }
+				   {
+					E T4e, T2c, T2v, T2u, T8q, T7C, T2n, T8r, T7F, T4f, T2t, T2w;
+					{
+					     E T2m, T7B, T7E, T2h, T2i, TI;
+					     T4e = T2b - T28;
+					     T2c = T28 + T2b;
+					     T2i = TG - TH;
+					     TI = TG + TH;
+					     {
+						  E T5X, T2g, TJ, T5Z;
+						  T5X = T2e - T2f;
+						  T2g = T2e + T2f;
+						  T2v = T2i - T2l;
+						  T2m = T2i + T2l;
+						  T7B = TF - TI;
+						  TJ = TF + TI;
+						  T5Z = T5X + T5Y;
+						  T7E = T5Y - T5X;
+						  T2u = T2d - T2g;
+						  T2h = T2d + T2g;
+						  TK = TC + TJ;
+						  T5T = TC - TJ;
+						  T60 = T5W - T5Z;
+						  T6W = T5W + T5Z;
+					     }
+					     T8q = T7B + T7A;
+					     T7C = T7A - T7B;
+					     T4c = T2h + T2m;
+					     T2n = T2h - T2m;
+					     T8r = T7D + T7E;
+					     T7F = T7D - T7E;
+					}
+					T4b = T2p + T2s;
+					T2t = T2p - T2s;
+					T2w = T2u + T2v;
+					T4f = T2v - T2u;
+					T2o = FNMS(KP707106781, T2n, T2c);
+					T3m = FMA(KP707106781, T2n, T2c);
+					T8s = FMA(KP414213562, T8r, T8q);
+					T8A = FNMS(KP414213562, T8q, T8r);
+					T3l = FMA(KP707106781, T2w, T2t);
+					T2x = FNMS(KP707106781, T2w, T2t);
+					T55 = FMA(KP707106781, T4f, T4e);
+					T4g = FNMS(KP707106781, T4f, T4e);
+					T7S = FMA(KP414213562, T7C, T7F);
+					T7G = FNMS(KP414213562, T7F, T7C);
+				   }
+			      }
+			      {
+				   E T44, T1D, Tm, T7o, T7p, T43, T1y, T47, T1s, Tt, T7m, T7l, T46, T1n;
+				   {
+					E Tj, T1z, Ti, T5P, T1C, Tk, T1v, T1w;
+					{
+					     E Tg, Th, T1A, T1B;
+					     Tg = cr[WS(rs, 2)];
+					     T56 = FMA(KP707106781, T4c, T4b);
+					     T4d = FNMS(KP707106781, T4c, T4b);
+					     T6o = T5T + T60;
+					     T61 = T5T - T60;
+					     Th = ci[WS(rs, 13)];
+					     T1A = ci[WS(rs, 21)];
+					     T1B = cr[WS(rs, 26)];
+					     Tj = cr[WS(rs, 10)];
+					     T1z = Tg - Th;
+					     Ti = Tg + Th;
+					     T5P = T1A - T1B;
+					     T1C = T1A + T1B;
+					     Tk = ci[WS(rs, 5)];
+					     T1v = ci[WS(rs, 29)];
+					     T1w = cr[WS(rs, 18)];
+					}
+					{
+					     E T1u, Tl, T5O, T1x;
+					     T44 = T1z + T1C;
+					     T1D = T1z - T1C;
+					     T1u = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T5O = T1v - T1w;
+					     T1x = T1v + T1w;
+					     Tm = Ti + Tl;
+					     T7o = Ti - Tl;
+					     T7p = T5O - T5P;
+					     T5Q = T5O + T5P;
+					     T43 = T1x - T1u;
+					     T1y = T1u + T1x;
+					}
+				   }
+				   {
+					E Tq, T1o, Tp, T5M, T1r, Tr, T1k, T1l;
+					{
+					     E Tn, To, T1p, T1q;
+					     Tn = ci[WS(rs, 1)];
+					     To = cr[WS(rs, 14)];
+					     T1p = ci[WS(rs, 25)];
+					     T1q = cr[WS(rs, 22)];
+					     Tq = cr[WS(rs, 6)];
+					     T1o = Tn - To;
+					     Tp = Tn + To;
+					     T5M = T1p - T1q;
+					     T1r = T1p + T1q;
+					     Tr = ci[WS(rs, 9)];
+					     T1k = ci[WS(rs, 17)];
+					     T1l = cr[WS(rs, 30)];
+					}
+					{
+					     E T1j, Ts, T5L, T1m;
+					     T47 = T1o + T1r;
+					     T1s = T1o - T1r;
+					     T1j = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T5L = T1k - T1l;
+					     T1m = T1k + T1l;
+					     Tt = Tp + Ts;
+					     T7m = Tp - Ts;
+					     T7l = T5L - T5M;
+					     T5N = T5L + T5M;
+					     T46 = T1j + T1m;
+					     T1n = T1j - T1m;
+					}
+				   }
+				   {
+					E T7P, T7O, T2N, T1t, T1E, T2M, T7n, T7q, T4w, T4x;
+					T7P = T7m + T7l;
+					T7n = T7l - T7m;
+					T7q = T7o + T7p;
+					T7O = T7o - T7p;
+					T6f = Tm - Tt;
+					Tu = Tm + Tt;
+					T8y = T7q + T7n;
+					T7r = T7n - T7q;
+					T2N = FMA(KP414213562, T1n, T1s);
+					T1t = FNMS(KP414213562, T1s, T1n);
+					T1E = FMA(KP414213562, T1D, T1y);
+					T2M = FNMS(KP414213562, T1y, T1D);
+					T8l = T7O + T7P;
+					T7Q = T7O - T7P;
+					T3w = T1E + T1t;
+					T1F = T1t - T1E;
+					T45 = FNMS(KP414213562, T44, T43);
+					T4w = FMA(KP414213562, T43, T44);
+					T4x = FMA(KP414213562, T46, T47);
+					T48 = FNMS(KP414213562, T47, T46);
+					T3j = T2M + T2N;
+					T2O = T2M - T2N;
+					T53 = T4w + T4x;
+					T4y = T4w - T4x;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T72, T5g, T49, T78, T77, T73, T7s, T7U, T7R, T7H, T3f, T3e, T3d;
+			 {
+			      E T5R, T8m, T8C, T8z, T8t, T8e, T86, T88, T8h, T8f, T8i, T8c, T8g;
+			      {
+				   E T6P, T6Q, T6Z, T6S, T6R;
+				   {
+					E Tv, T10, T6V, T6Y, T6U;
+					T72 = Tf - Tu;
+					Tv = Tf + Tu;
+					T6U = T5Q + T5N;
+					T5R = T5N - T5Q;
+					T5g = T48 - T45;
+					T49 = T45 + T48;
+					T10 = TK + TZ;
+					T78 = TK - TZ;
+					T77 = T6T - T6U;
+					T6V = T6T + T6U;
+					T6Y = T6W + T6X;
+					T73 = T6X - T6W;
+					T6P = W[30];
+					cr[0] = Tv + T10;
+					T6Q = Tv - T10;
+					ci[0] = T6V + T6Y;
+					T6Z = T6V - T6Y;
+					T6S = W[31];
+					T6R = T6P * T6Q;
+				   }
+				   {
+					E T8O, T8W, T8Q, T8Z, T8X, T90, T8U, T8Y;
+					{
+					     E T8R, T8S, T8M, T8N, T70;
+					     T8M = FMA(KP707106781, T8l, T8k);
+					     T8m = FNMS(KP707106781, T8l, T8k);
+					     T8C = T8A - T8B;
+					     T8N = T8A + T8B;
+					     T70 = T6S * T6Q;
+					     cr[WS(rs, 16)] = FNMS(T6S, T6Z, T6R);
+					     T8R = FMA(KP707106781, T8y, T8x);
+					     T8z = FNMS(KP707106781, T8y, T8x);
+					     T8O = FNMS(KP923879532, T8N, T8M);
+					     T8W = FMA(KP923879532, T8N, T8M);
+					     ci[WS(rs, 16)] = FMA(T6P, T6Z, T70);
+					     T8S = T8s + T8p;
+					     T8t = T8p - T8s;
+					     {
+						  E T8L, T8T, T8P, T8V;
+						  T8L = W[34];
+						  T8Q = W[35];
+						  T8V = W[2];
+						  T8Z = FMA(KP923879532, T8S, T8R);
+						  T8T = FNMS(KP923879532, T8S, T8R);
+						  T8P = T8L * T8O;
+						  T8X = T8V * T8W;
+						  T90 = T8V * T8Z;
+						  T8U = T8L * T8T;
+						  cr[WS(rs, 18)] = FNMS(T8Q, T8T, T8P);
+						  T8Y = W[3];
+					     }
+					}
+					{
+					     E T89, T8a, T84, T85;
+					     T84 = FNMS(KP707106781, T7r, T7k);
+					     T7s = FMA(KP707106781, T7r, T7k);
+					     ci[WS(rs, 18)] = FMA(T8Q, T8O, T8U);
+					     T85 = T7S + T7T;
+					     T7U = T7S - T7T;
+					     ci[WS(rs, 2)] = FMA(T8Y, T8W, T90);
+					     cr[WS(rs, 2)] = FNMS(T8Y, T8Z, T8X);
+					     T7R = FMA(KP707106781, T7Q, T7N);
+					     T89 = FNMS(KP707106781, T7Q, T7N);
+					     T8e = FMA(KP923879532, T85, T84);
+					     T86 = FNMS(KP923879532, T85, T84);
+					     T8a = T7G + T7z;
+					     T7H = T7z - T7G;
+					     {
+						  E T83, T8b, T87, T8d;
+						  T83 = W[26];
+						  T88 = W[27];
+						  T8d = W[58];
+						  T8h = FMA(KP923879532, T8a, T89);
+						  T8b = FNMS(KP923879532, T8a, T89);
+						  T87 = T83 * T86;
+						  T8f = T8d * T8e;
+						  T8i = T8d * T8h;
+						  T8c = T83 * T8b;
+						  cr[WS(rs, 14)] = FNMS(T88, T8b, T87);
+						  T8g = W[59];
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5S, T6q, T6n, T6K, T6C, T6b, T6E, T6N, T6L, T6O, T6I, T6M;
+				   {
+					E T6F, T6G, T6A, T6B;
+					T6A = T5K - T5R;
+					T5S = T5K + T5R;
+					ci[WS(rs, 14)] = FMA(T88, T86, T8c);
+					T6B = T6p - T6o;
+					T6q = T6o + T6p;
+					ci[WS(rs, 30)] = FMA(T8g, T8e, T8i);
+					cr[WS(rs, 30)] = FNMS(T8g, T8h, T8f);
+					T6n = T6f + T6m;
+					T6F = T6m - T6f;
+					T6K = FMA(KP707106781, T6B, T6A);
+					T6C = FNMS(KP707106781, T6B, T6A);
+					T6G = T61 - T6a;
+					T6b = T61 + T6a;
+					{
+					     E T6z, T6H, T6D, T6J;
+					     T6z = W[54];
+					     T6E = W[55];
+					     T6J = W[22];
+					     T6N = FMA(KP707106781, T6G, T6F);
+					     T6H = FNMS(KP707106781, T6G, T6F);
+					     T6D = T6z * T6C;
+					     T6L = T6J * T6K;
+					     T6O = T6J * T6N;
+					     T6I = T6z * T6H;
+					     cr[WS(rs, 28)] = FNMS(T6E, T6H, T6D);
+					     T6M = W[23];
+					}
+				   }
+				   {
+					E T8G, T8F, T8J, T8H, T8I, T8u;
+					ci[WS(rs, 28)] = FMA(T6E, T6C, T6I);
+					ci[WS(rs, 12)] = FMA(T6M, T6K, T6O);
+					cr[WS(rs, 12)] = FNMS(T6M, T6N, T6L);
+					T8G = FMA(KP923879532, T8t, T8m);
+					T8u = FNMS(KP923879532, T8t, T8m);
+					{
+					     E T8j, T8w, T8D, T8v, T8E;
+					     T8j = W[50];
+					     T8w = W[51];
+					     T8F = W[18];
+					     T8J = FMA(KP923879532, T8C, T8z);
+					     T8D = FNMS(KP923879532, T8C, T8z);
+					     T8v = T8j * T8u;
+					     T8E = T8w * T8u;
+					     T8H = T8F * T8G;
+					     T8I = W[19];
+					     cr[WS(rs, 26)] = FNMS(T8w, T8D, T8v);
+					     ci[WS(rs, 26)] = FMA(T8j, T8D, T8E);
+					}
+					{
+					     E T6c, T6u, T6x, T6r, T8K, T5J, T6e;
+					     cr[WS(rs, 10)] = FNMS(T8I, T8J, T8H);
+					     T8K = T8I * T8G;
+					     ci[WS(rs, 10)] = FMA(T8F, T8J, T8K);
+					     T6c = FNMS(KP707106781, T6b, T5S);
+					     T6u = FMA(KP707106781, T6b, T5S);
+					     T6x = FMA(KP707106781, T6q, T6n);
+					     T6r = FNMS(KP707106781, T6q, T6n);
+					     T5J = W[38];
+					     T6e = W[39];
+					     {
+						  E T6t, T6w, T6d, T6s, T6v, T6y;
+						  T6t = W[6];
+						  T6w = W[7];
+						  T6d = T5J * T6c;
+						  T6s = T6e * T6c;
+						  T6v = T6t * T6u;
+						  T6y = T6w * T6u;
+						  cr[WS(rs, 20)] = FNMS(T6e, T6r, T6d);
+						  ci[WS(rs, 20)] = FMA(T5J, T6r, T6s);
+						  cr[WS(rs, 4)] = FNMS(T6w, T6x, T6v);
+						  ci[WS(rs, 4)] = FMA(T6t, T6x, T6y);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7c, T7f, T7e, T7g, T7d;
+			      {
+				   E T71, T74, T79, T76, T75, T7b, T7a;
+				   T71 = W[46];
+				   T7c = T72 + T73;
+				   T74 = T72 - T73;
+				   T7f = T78 + T77;
+				   T79 = T77 - T78;
+				   T76 = W[47];
+				   T75 = T71 * T74;
+				   T7b = W[14];
+				   T7a = T71 * T79;
+				   T7e = W[15];
+				   cr[WS(rs, 24)] = FNMS(T76, T79, T75);
+				   T7g = T7b * T7f;
+				   T7d = T7b * T7c;
+				   ci[WS(rs, 24)] = FMA(T76, T74, T7a);
+			      }
+			      {
+				   E T81, T7X, T80, T7Z, T82;
+				   ci[WS(rs, 8)] = FMA(T7e, T7c, T7g);
+				   cr[WS(rs, 8)] = FNMS(T7e, T7f, T7d);
+				   {
+					E T7h, T7Y, T7I, T7V, T7K, T7J, T7W;
+					T7h = W[42];
+					T7Y = FMA(KP923879532, T7H, T7s);
+					T7I = FNMS(KP923879532, T7H, T7s);
+					T81 = FMA(KP923879532, T7U, T7R);
+					T7V = FNMS(KP923879532, T7U, T7R);
+					T7K = W[43];
+					T7J = T7h * T7I;
+					T7X = W[10];
+					T80 = W[11];
+					T7W = T7K * T7I;
+					cr[WS(rs, 22)] = FNMS(T7K, T7V, T7J);
+					T7Z = T7X * T7Y;
+					T82 = T80 * T7Y;
+					ci[WS(rs, 22)] = FMA(T7h, T7V, T7W);
+				   }
+				   {
+					E T2P, T37, T1G, T32, T2R, T2Q, T38, T2z, T27, T2y;
+					T2P = FMA(KP923879532, T2O, T2L);
+					T37 = FNMS(KP923879532, T2O, T2L);
+					cr[WS(rs, 6)] = FNMS(T80, T81, T7Z);
+					ci[WS(rs, 6)] = FMA(T7X, T81, T82);
+					T1G = FMA(KP923879532, T1F, T1i);
+					T32 = FNMS(KP923879532, T1F, T1i);
+					T2R = FNMS(KP668178637, T1X, T26);
+					T27 = FMA(KP668178637, T26, T1X);
+					T2y = FNMS(KP668178637, T2x, T2o);
+					T2Q = FMA(KP668178637, T2o, T2x);
+					T38 = T2y + T27;
+					T2z = T27 - T2y;
+					{
+					     E T2C, T2A, T3c, T34, T2U, T39, T36, T31;
+					     {
+						  E T11, T2W, T2S, T33;
+						  T11 = W[40];
+						  T2C = W[41];
+						  T2A = FNMS(KP831469612, T2z, T1G);
+						  T2W = FMA(KP831469612, T2z, T1G);
+						  T2S = T2Q - T2R;
+						  T33 = T2Q + T2R;
+						  {
+						       E T2V, T2B, T2T, T2Z, T2X, T2Y, T30;
+						       T2V = W[8];
+						       T2B = T11 * T2A;
+						       T3c = FMA(KP831469612, T33, T32);
+						       T34 = FNMS(KP831469612, T33, T32);
+						       T2T = FNMS(KP831469612, T2S, T2P);
+						       T2Z = FMA(KP831469612, T2S, T2P);
+						       T2X = T2V * T2W;
+						       T2Y = W[9];
+						       T30 = T2V * T2Z;
+						       cr[WS(rs, 21)] = FNMS(T2C, T2T, T2B);
+						       T2U = T11 * T2T;
+						       cr[WS(rs, 5)] = FNMS(T2Y, T2Z, T2X);
+						       ci[WS(rs, 5)] = FMA(T2Y, T2W, T30);
+						  }
+					     }
+					     T39 = FNMS(KP831469612, T38, T37);
+					     T3f = FMA(KP831469612, T38, T37);
+					     ci[WS(rs, 21)] = FMA(T2C, T2A, T2U);
+					     T36 = W[25];
+					     T31 = W[24];
+					     {
+						  E T3b, T3g, T3a, T35;
+						  T3e = W[57];
+						  T3a = T36 * T34;
+						  T35 = T31 * T34;
+						  T3b = W[56];
+						  T3g = T3e * T3c;
+						  ci[WS(rs, 13)] = FMA(T31, T39, T3a);
+						  cr[WS(rs, 13)] = FNMS(T36, T39, T35);
+						  T3d = T3b * T3c;
+						  ci[WS(rs, 29)] = FMA(T3b, T3f, T3g);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4G, T4J, T4I, T4F, T4K;
+			      {
+				   E T4z, T4R, T4a, T4M, T4h, T4o, T4C, T4N, T4A, T4B;
+				   T4z = FMA(KP923879532, T4y, T4v);
+				   T4R = FNMS(KP923879532, T4y, T4v);
+				   T4a = FNMS(KP923879532, T49, T42);
+				   T4M = FMA(KP923879532, T49, T42);
+				   cr[WS(rs, 29)] = FNMS(T3e, T3f, T3d);
+				   T4h = FNMS(KP668178637, T4g, T4d);
+				   T4A = FMA(KP668178637, T4d, T4g);
+				   T4B = FMA(KP668178637, T4k, T4n);
+				   T4o = FNMS(KP668178637, T4n, T4k);
+				   T4C = T4A - T4B;
+				   T4N = T4A + T4B;
+				   {
+					E T4W, T4Z, T4q, T4X, T50, T4Y;
+					{
+					     E T4L, T4Q, T4O, T4p, T4S, T4P, T4U, T4V, T4T;
+					     T4L = W[20];
+					     T4Q = W[21];
+					     T4W = FMA(KP831469612, T4N, T4M);
+					     T4O = FNMS(KP831469612, T4N, T4M);
+					     T4p = T4h + T4o;
+					     T4S = T4h - T4o;
+					     T4P = T4L * T4O;
+					     T4V = W[52];
+					     T4Z = FNMS(KP831469612, T4S, T4R);
+					     T4T = FMA(KP831469612, T4S, T4R);
+					     T4q = FNMS(KP831469612, T4p, T4a);
+					     T4G = FMA(KP831469612, T4p, T4a);
+					     cr[WS(rs, 11)] = FNMS(T4Q, T4T, T4P);
+					     T4U = T4L * T4T;
+					     T4X = T4V * T4W;
+					     T50 = T4V * T4Z;
+					     T4Y = W[53];
+					     ci[WS(rs, 11)] = FMA(T4Q, T4O, T4U);
+					}
+					{
+					     E T4D, T4s, T3Z, T4E, T4r;
+					     T4J = FMA(KP831469612, T4C, T4z);
+					     T4D = FNMS(KP831469612, T4C, T4z);
+					     T4s = W[37];
+					     ci[WS(rs, 27)] = FMA(T4Y, T4W, T50);
+					     cr[WS(rs, 27)] = FNMS(T4Y, T4Z, T4X);
+					     T3Z = W[36];
+					     T4E = T4s * T4q;
+					     T4I = W[5];
+					     T4r = T3Z * T4q;
+					     ci[WS(rs, 19)] = FMA(T3Z, T4D, T4E);
+					     T4F = W[4];
+					     T4K = T4I * T4G;
+					     cr[WS(rs, 19)] = FNMS(T4s, T4D, T4r);
+					}
+				   }
+			      }
+			      {
+				   E T3E, T3H, T3G, T3D, T3I;
+				   {
+					E T3x, T3P, T3k, T3K, T3n, T3q, T3A, T3L, T4H, T3y, T3z;
+					T3x = FMA(KP923879532, T3w, T3v);
+					T3P = FNMS(KP923879532, T3w, T3v);
+					T4H = T4F * T4G;
+					ci[WS(rs, 3)] = FMA(T4F, T4J, T4K);
+					T3k = FMA(KP923879532, T3j, T3i);
+					T3K = FNMS(KP923879532, T3j, T3i);
+					T3y = FMA(KP198912367, T3l, T3m);
+					T3n = FNMS(KP198912367, T3m, T3l);
+					cr[WS(rs, 3)] = FNMS(T4I, T4J, T4H);
+					T3z = FNMS(KP198912367, T3o, T3p);
+					T3q = FMA(KP198912367, T3p, T3o);
+					T3A = T3y + T3z;
+					T3L = T3z - T3y;
+					{
+					     E T3U, T3X, T3s, T3V, T3Y, T3W;
+					     {
+						  E T3J, T3O, T3M, T3r, T3Q, T3N, T3S, T3T, T3R;
+						  T3J = W[48];
+						  T3O = W[49];
+						  T3U = FMA(KP980785280, T3L, T3K);
+						  T3M = FNMS(KP980785280, T3L, T3K);
+						  T3r = T3n + T3q;
+						  T3Q = T3n - T3q;
+						  T3N = T3J * T3M;
+						  T3T = W[16];
+						  T3X = FMA(KP980785280, T3Q, T3P);
+						  T3R = FNMS(KP980785280, T3Q, T3P);
+						  T3s = FNMS(KP980785280, T3r, T3k);
+						  T3E = FMA(KP980785280, T3r, T3k);
+						  cr[WS(rs, 25)] = FNMS(T3O, T3R, T3N);
+						  T3S = T3J * T3R;
+						  T3V = T3T * T3U;
+						  T3Y = T3T * T3X;
+						  T3W = W[17];
+						  ci[WS(rs, 25)] = FMA(T3O, T3M, T3S);
+					     }
+					     {
+						  E T3B, T3u, T3h, T3C, T3t;
+						  T3H = FMA(KP980785280, T3A, T3x);
+						  T3B = FNMS(KP980785280, T3A, T3x);
+						  T3u = W[33];
+						  ci[WS(rs, 9)] = FMA(T3W, T3U, T3Y);
+						  cr[WS(rs, 9)] = FNMS(T3W, T3X, T3V);
+						  T3h = W[32];
+						  T3C = T3u * T3s;
+						  T3G = W[1];
+						  T3t = T3h * T3s;
+						  ci[WS(rs, 17)] = FMA(T3h, T3B, T3C);
+						  T3D = W[0];
+						  T3I = T3G * T3E;
+						  cr[WS(rs, 17)] = FNMS(T3u, T3B, T3t);
+					     }
+					}
+				   }
+				   {
+					E T5h, T5z, T54, T5u, T57, T5a, T5k, T5v, T3F, T5i, T5j;
+					T5h = FMA(KP923879532, T5g, T5f);
+					T5z = FNMS(KP923879532, T5g, T5f);
+					T3F = T3D * T3E;
+					ci[WS(rs, 1)] = FMA(T3D, T3H, T3I);
+					T54 = FNMS(KP923879532, T53, T52);
+					T5u = FMA(KP923879532, T53, T52);
+					T5i = FMA(KP198912367, T55, T56);
+					T57 = FNMS(KP198912367, T56, T55);
+					cr[WS(rs, 1)] = FNMS(T3G, T3H, T3F);
+					T5j = FMA(KP198912367, T58, T59);
+					T5a = FNMS(KP198912367, T59, T58);
+					T5k = T5i - T5j;
+					T5v = T5i + T5j;
+					{
+					     E T5E, T5H, T5c, T5F, T5I, T5G;
+					     {
+						  E T5t, T5y, T5w, T5b, T5A, T5x, T5C, T5D, T5B;
+						  T5t = W[28];
+						  T5y = W[29];
+						  T5E = FMA(KP980785280, T5v, T5u);
+						  T5w = FNMS(KP980785280, T5v, T5u);
+						  T5b = T57 + T5a;
+						  T5A = T5a - T57;
+						  T5x = T5t * T5w;
+						  T5D = W[60];
+						  T5H = FNMS(KP980785280, T5A, T5z);
+						  T5B = FMA(KP980785280, T5A, T5z);
+						  T5c = FMA(KP980785280, T5b, T54);
+						  T5o = FNMS(KP980785280, T5b, T54);
+						  cr[WS(rs, 15)] = FNMS(T5y, T5B, T5x);
+						  T5C = T5t * T5B;
+						  T5F = T5D * T5E;
+						  T5I = T5D * T5H;
+						  T5G = W[61];
+						  ci[WS(rs, 15)] = FMA(T5y, T5w, T5C);
+					     }
+					     {
+						  E T5l, T5e, T51, T5m, T5d;
+						  T5r = FMA(KP980785280, T5k, T5h);
+						  T5l = FNMS(KP980785280, T5k, T5h);
+						  T5e = W[45];
+						  ci[WS(rs, 31)] = FMA(T5G, T5E, T5I);
+						  cr[WS(rs, 31)] = FNMS(T5G, T5H, T5F);
+						  T51 = W[44];
+						  T5m = T5e * T5c;
+						  T5q = W[13];
+						  T5d = T51 * T5c;
+						  ci[WS(rs, 23)] = FMA(T51, T5l, T5m);
+						  T5n = W[12];
+						  T5s = T5q * T5o;
+						  cr[WS(rs, 23)] = FNMS(T5e, T5l, T5d);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5p = T5n * T5o;
+	       ci[WS(rs, 7)] = FMA(T5n, T5r, T5s);
+	       cr[WS(rs, 7)] = FNMS(T5q, T5r, T5p);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hb_32", twinstr, &GENUS, {236, 62, 198, 0} };
+
+void X(codelet_hb_32) (planner *p) {
+     X(khc2hc_register) (p, hb_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hb_32 -include hb.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 98 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hb.h"
+
+static void hb_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T4o, T6y, T70, T5u, Tf, T12, T5x, T6z, T3m, T3Y, T29, T2y, T4v, T71, T2U;
+	       E T3M, Tu, T1U, T6D, T73, T6G, T74, T1h, T2z, T2X, T3o, T4D, T5A, T4K, T5z;
+	       E T30, T3n, TK, T1j, T6S, T7w, T6V, T7v, T1y, T2B, T3c, T3S, T4X, T61, T54;
+	       E T62, T3f, T3T, TZ, T1A, T6L, T7z, T6O, T7y, T1P, T2C, T35, T3P, T5g, T64;
+	       E T5n, T65, T38, T3Q;
+	       {
+		    E T3, T4m, T24, T4q, T27, T4t, T6, T5s, Ta, T4p, T1X, T5t, T20, T4n, Td;
+		    E T4s;
+		    {
+			 E T1, T2, T22, T23;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 15)];
+			 T3 = T1 + T2;
+			 T4m = T1 - T2;
+			 T22 = ci[WS(rs, 27)];
+			 T23 = cr[WS(rs, 20)];
+			 T24 = T22 - T23;
+			 T4q = T22 + T23;
+		    }
+		    {
+			 E T25, T26, T4, T5;
+			 T25 = ci[WS(rs, 19)];
+			 T26 = cr[WS(rs, 28)];
+			 T27 = T25 - T26;
+			 T4t = T25 + T26;
+			 T4 = cr[WS(rs, 8)];
+			 T5 = ci[WS(rs, 7)];
+			 T6 = T4 + T5;
+			 T5s = T4 - T5;
+		    }
+		    {
+			 E T8, T9, T1V, T1W;
+			 T8 = cr[WS(rs, 4)];
+			 T9 = ci[WS(rs, 11)];
+			 Ta = T8 + T9;
+			 T4p = T8 - T9;
+			 T1V = ci[WS(rs, 31)];
+			 T1W = cr[WS(rs, 16)];
+			 T1X = T1V - T1W;
+			 T5t = T1V + T1W;
+		    }
+		    {
+			 E T1Y, T1Z, Tb, Tc;
+			 T1Y = ci[WS(rs, 23)];
+			 T1Z = cr[WS(rs, 24)];
+			 T20 = T1Y - T1Z;
+			 T4n = T1Y + T1Z;
+			 Tb = ci[WS(rs, 3)];
+			 Tc = cr[WS(rs, 12)];
+			 Td = Tb + Tc;
+			 T4s = Tb - Tc;
+		    }
+		    {
+			 E T7, Te, T21, T28;
+			 T4o = T4m - T4n;
+			 T6y = T4m + T4n;
+			 T70 = T5t - T5s;
+			 T5u = T5s + T5t;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T12 = T7 - Te;
+			 {
+			      E T5v, T5w, T3k, T3l;
+			      T5v = T4p + T4q;
+			      T5w = T4s + T4t;
+			      T5x = KP707106781 * (T5v - T5w);
+			      T6z = KP707106781 * (T5v + T5w);
+			      T3k = T1X - T20;
+			      T3l = Ta - Td;
+			      T3m = T3k - T3l;
+			      T3Y = T3l + T3k;
+			 }
+			 T21 = T1X + T20;
+			 T28 = T24 + T27;
+			 T29 = T21 - T28;
+			 T2y = T21 + T28;
+			 {
+			      E T4r, T4u, T2S, T2T;
+			      T4r = T4p - T4q;
+			      T4u = T4s - T4t;
+			      T4v = KP707106781 * (T4r + T4u);
+			      T71 = KP707106781 * (T4r - T4u);
+			      T2S = T3 - T6;
+			      T2T = T27 - T24;
+			      T2U = T2S - T2T;
+			      T3M = T2S + T2T;
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T4H, T1c, T4F, T1f, T4I, Tl, T4E, Tp, T4A, T15, T4y, T18, T4B, Ts;
+		    E T4x;
+		    {
+			 E Tg, Th, T1a, T1b;
+			 Tg = cr[WS(rs, 2)];
+			 Th = ci[WS(rs, 13)];
+			 Ti = Tg + Th;
+			 T4H = Tg - Th;
+			 T1a = ci[WS(rs, 29)];
+			 T1b = cr[WS(rs, 18)];
+			 T1c = T1a - T1b;
+			 T4F = T1a + T1b;
+		    }
+		    {
+			 E T1d, T1e, Tj, Tk;
+			 T1d = ci[WS(rs, 21)];
+			 T1e = cr[WS(rs, 26)];
+			 T1f = T1d - T1e;
+			 T4I = T1d + T1e;
+			 Tj = cr[WS(rs, 10)];
+			 Tk = ci[WS(rs, 5)];
+			 Tl = Tj + Tk;
+			 T4E = Tj - Tk;
+		    }
+		    {
+			 E Tn, To, T13, T14;
+			 Tn = ci[WS(rs, 1)];
+			 To = cr[WS(rs, 14)];
+			 Tp = Tn + To;
+			 T4A = Tn - To;
+			 T13 = ci[WS(rs, 17)];
+			 T14 = cr[WS(rs, 30)];
+			 T15 = T13 - T14;
+			 T4y = T13 + T14;
+		    }
+		    {
+			 E T16, T17, Tq, Tr;
+			 T16 = ci[WS(rs, 25)];
+			 T17 = cr[WS(rs, 22)];
+			 T18 = T16 - T17;
+			 T4B = T16 + T17;
+			 Tq = cr[WS(rs, 6)];
+			 Tr = ci[WS(rs, 9)];
+			 Ts = Tq + Tr;
+			 T4x = Tq - Tr;
+		    }
+		    {
+			 E Tm, Tt, T6B, T6C;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T1U = Tm - Tt;
+			 T6B = T4H + T4I;
+			 T6C = T4F - T4E;
+			 T6D = FNMS(KP923879532, T6C, KP382683432 * T6B);
+			 T73 = FMA(KP382683432, T6C, KP923879532 * T6B);
+		    }
+		    {
+			 E T6E, T6F, T19, T1g;
+			 T6E = T4A + T4B;
+			 T6F = T4x + T4y;
+			 T6G = FNMS(KP923879532, T6F, KP382683432 * T6E);
+			 T74 = FMA(KP382683432, T6F, KP923879532 * T6E);
+			 T19 = T15 + T18;
+			 T1g = T1c + T1f;
+			 T1h = T19 - T1g;
+			 T2z = T1g + T19;
+		    }
+		    {
+			 E T2V, T2W, T4z, T4C;
+			 T2V = T15 - T18;
+			 T2W = Tp - Ts;
+			 T2X = T2V - T2W;
+			 T3o = T2W + T2V;
+			 T4z = T4x - T4y;
+			 T4C = T4A - T4B;
+			 T4D = FNMS(KP382683432, T4C, KP923879532 * T4z);
+			 T5A = FMA(KP382683432, T4z, KP923879532 * T4C);
+		    }
+		    {
+			 E T4G, T4J, T2Y, T2Z;
+			 T4G = T4E + T4F;
+			 T4J = T4H - T4I;
+			 T4K = FMA(KP923879532, T4G, KP382683432 * T4J);
+			 T5z = FNMS(KP382683432, T4G, KP923879532 * T4J);
+			 T2Y = Ti - Tl;
+			 T2Z = T1c - T1f;
+			 T30 = T2Y + T2Z;
+			 T3n = T2Y - T2Z;
+		    }
+	       }
+	       {
+		    E Ty, T4N, TB, T4Y, T1p, T4O, T1m, T4Z, TI, T52, T1w, T4V, TF, T51, T1t;
+		    E T4S;
+		    {
+			 E Tw, Tx, T1k, T1l;
+			 Tw = cr[WS(rs, 1)];
+			 Tx = ci[WS(rs, 14)];
+			 Ty = Tw + Tx;
+			 T4N = Tw - Tx;
+			 {
+			      E Tz, TA, T1n, T1o;
+			      Tz = cr[WS(rs, 9)];
+			      TA = ci[WS(rs, 6)];
+			      TB = Tz + TA;
+			      T4Y = Tz - TA;
+			      T1n = ci[WS(rs, 22)];
+			      T1o = cr[WS(rs, 25)];
+			      T1p = T1n - T1o;
+			      T4O = T1n + T1o;
+			 }
+			 T1k = ci[WS(rs, 30)];
+			 T1l = cr[WS(rs, 17)];
+			 T1m = T1k - T1l;
+			 T4Z = T1k + T1l;
+			 {
+			      E TG, TH, T4T, T1u, T1v, T4U;
+			      TG = ci[WS(rs, 2)];
+			      TH = cr[WS(rs, 13)];
+			      T4T = TG - TH;
+			      T1u = ci[WS(rs, 18)];
+			      T1v = cr[WS(rs, 29)];
+			      T4U = T1u + T1v;
+			      TI = TG + TH;
+			      T52 = T4T + T4U;
+			      T1w = T1u - T1v;
+			      T4V = T4T - T4U;
+			 }
+			 {
+			      E TD, TE, T4Q, T1r, T1s, T4R;
+			      TD = cr[WS(rs, 5)];
+			      TE = ci[WS(rs, 10)];
+			      T4Q = TD - TE;
+			      T1r = ci[WS(rs, 26)];
+			      T1s = cr[WS(rs, 21)];
+			      T4R = T1r + T1s;
+			      TF = TD + TE;
+			      T51 = T4Q + T4R;
+			      T1t = T1r - T1s;
+			      T4S = T4Q - T4R;
+			 }
+		    }
+		    {
+			 E TC, TJ, T6Q, T6R;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T1j = TC - TJ;
+			 T6Q = T4Z - T4Y;
+			 T6R = KP707106781 * (T4S - T4V);
+			 T6S = T6Q + T6R;
+			 T7w = T6Q - T6R;
+		    }
+		    {
+			 E T6T, T6U, T1q, T1x;
+			 T6T = T4N + T4O;
+			 T6U = KP707106781 * (T51 + T52);
+			 T6V = T6T - T6U;
+			 T7v = T6T + T6U;
+			 T1q = T1m + T1p;
+			 T1x = T1t + T1w;
+			 T1y = T1q - T1x;
+			 T2B = T1q + T1x;
+		    }
+		    {
+			 E T3a, T3b, T4P, T4W;
+			 T3a = T1m - T1p;
+			 T3b = TF - TI;
+			 T3c = T3a - T3b;
+			 T3S = T3b + T3a;
+			 T4P = T4N - T4O;
+			 T4W = KP707106781 * (T4S + T4V);
+			 T4X = T4P - T4W;
+			 T61 = T4P + T4W;
+		    }
+		    {
+			 E T50, T53, T3d, T3e;
+			 T50 = T4Y + T4Z;
+			 T53 = KP707106781 * (T51 - T52);
+			 T54 = T50 - T53;
+			 T62 = T50 + T53;
+			 T3d = Ty - TB;
+			 T3e = T1w - T1t;
+			 T3f = T3d - T3e;
+			 T3T = T3d + T3e;
+		    }
+	       }
+	       {
+		    E TN, T56, TQ, T5h, T1G, T57, T1D, T5i, TX, T5l, T1N, T5e, TU, T5k, T1K;
+		    E T5b;
+		    {
+			 E TL, TM, T1B, T1C;
+			 TL = ci[0];
+			 TM = cr[WS(rs, 15)];
+			 TN = TL + TM;
+			 T56 = TL - TM;
+			 {
+			      E TO, TP, T1E, T1F;
+			      TO = cr[WS(rs, 7)];
+			      TP = ci[WS(rs, 8)];
+			      TQ = TO + TP;
+			      T5h = TO - TP;
+			      T1E = ci[WS(rs, 24)];
+			      T1F = cr[WS(rs, 23)];
+			      T1G = T1E - T1F;
+			      T57 = T1E + T1F;
+			 }
+			 T1B = ci[WS(rs, 16)];
+			 T1C = cr[WS(rs, 31)];
+			 T1D = T1B - T1C;
+			 T5i = T1B + T1C;
+			 {
+			      E TV, TW, T5c, T1L, T1M, T5d;
+			      TV = ci[WS(rs, 4)];
+			      TW = cr[WS(rs, 11)];
+			      T5c = TV - TW;
+			      T1L = ci[WS(rs, 20)];
+			      T1M = cr[WS(rs, 27)];
+			      T5d = T1L + T1M;
+			      TX = TV + TW;
+			      T5l = T5c + T5d;
+			      T1N = T1L - T1M;
+			      T5e = T5c - T5d;
+			 }
+			 {
+			      E TS, TT, T59, T1I, T1J, T5a;
+			      TS = cr[WS(rs, 3)];
+			      TT = ci[WS(rs, 12)];
+			      T59 = TS - TT;
+			      T1I = ci[WS(rs, 28)];
+			      T1J = cr[WS(rs, 19)];
+			      T5a = T1I + T1J;
+			      TU = TS + TT;
+			      T5k = T59 + T5a;
+			      T1K = T1I - T1J;
+			      T5b = T59 - T5a;
+			 }
+		    }
+		    {
+			 E TR, TY, T6J, T6K;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 T1A = TR - TY;
+			 T6J = KP707106781 * (T5b - T5e);
+			 T6K = T5h + T5i;
+			 T6L = T6J - T6K;
+			 T7z = T6K + T6J;
+		    }
+		    {
+			 E T6M, T6N, T1H, T1O;
+			 T6M = T56 + T57;
+			 T6N = KP707106781 * (T5k + T5l);
+			 T6O = T6M - T6N;
+			 T7y = T6M + T6N;
+			 T1H = T1D + T1G;
+			 T1O = T1K + T1N;
+			 T1P = T1H - T1O;
+			 T2C = T1H + T1O;
+		    }
+		    {
+			 E T33, T34, T58, T5f;
+			 T33 = T1D - T1G;
+			 T34 = TU - TX;
+			 T35 = T33 - T34;
+			 T3P = T34 + T33;
+			 T58 = T56 - T57;
+			 T5f = KP707106781 * (T5b + T5e);
+			 T5g = T58 - T5f;
+			 T64 = T58 + T5f;
+		    }
+		    {
+			 E T5j, T5m, T36, T37;
+			 T5j = T5h - T5i;
+			 T5m = KP707106781 * (T5k - T5l);
+			 T5n = T5j - T5m;
+			 T65 = T5j + T5m;
+			 T36 = TN - TQ;
+			 T37 = T1N - T1K;
+			 T38 = T36 - T37;
+			 T3Q = T36 + T37;
+		    }
+	       }
+	       {
+		    E Tv, T10, T2w, T2A, T2D, T2E, T2v, T2x;
+		    Tv = Tf + Tu;
+		    T10 = TK + TZ;
+		    T2w = Tv - T10;
+		    T2A = T2y + T2z;
+		    T2D = T2B + T2C;
+		    T2E = T2A - T2D;
+		    cr[0] = Tv + T10;
+		    ci[0] = T2A + T2D;
+		    T2v = W[30];
+		    T2x = W[31];
+		    cr[WS(rs, 16)] = FNMS(T2x, T2E, T2v * T2w);
+		    ci[WS(rs, 16)] = FMA(T2x, T2w, T2v * T2E);
+	       }
+	       {
+		    E T2I, T2O, T2M, T2Q;
+		    {
+			 E T2G, T2H, T2K, T2L;
+			 T2G = Tf - Tu;
+			 T2H = T2C - T2B;
+			 T2I = T2G - T2H;
+			 T2O = T2G + T2H;
+			 T2K = T2y - T2z;
+			 T2L = TK - TZ;
+			 T2M = T2K - T2L;
+			 T2Q = T2L + T2K;
+		    }
+		    {
+			 E T2F, T2J, T2N, T2P;
+			 T2F = W[46];
+			 T2J = W[47];
+			 cr[WS(rs, 24)] = FNMS(T2J, T2M, T2F * T2I);
+			 ci[WS(rs, 24)] = FMA(T2F, T2M, T2J * T2I);
+			 T2N = W[14];
+			 T2P = W[15];
+			 cr[WS(rs, 8)] = FNMS(T2P, T2Q, T2N * T2O);
+			 ci[WS(rs, 8)] = FMA(T2N, T2Q, T2P * T2O);
+		    }
+	       }
+	       {
+		    E T1i, T2a, T2o, T2k, T2d, T2l, T1R, T2p;
+		    T1i = T12 + T1h;
+		    T2a = T1U + T29;
+		    T2o = T29 - T1U;
+		    T2k = T12 - T1h;
+		    {
+			 E T2b, T2c, T1z, T1Q;
+			 T2b = T1j + T1y;
+			 T2c = T1P - T1A;
+			 T2d = KP707106781 * (T2b + T2c);
+			 T2l = KP707106781 * (T2c - T2b);
+			 T1z = T1j - T1y;
+			 T1Q = T1A + T1P;
+			 T1R = KP707106781 * (T1z + T1Q);
+			 T2p = KP707106781 * (T1z - T1Q);
+		    }
+		    {
+			 E T1S, T2e, T11, T1T;
+			 T1S = T1i - T1R;
+			 T2e = T2a - T2d;
+			 T11 = W[38];
+			 T1T = W[39];
+			 cr[WS(rs, 20)] = FNMS(T1T, T2e, T11 * T1S);
+			 ci[WS(rs, 20)] = FMA(T1T, T1S, T11 * T2e);
+		    }
+		    {
+			 E T2s, T2u, T2r, T2t;
+			 T2s = T2k + T2l;
+			 T2u = T2o + T2p;
+			 T2r = W[22];
+			 T2t = W[23];
+			 cr[WS(rs, 12)] = FNMS(T2t, T2u, T2r * T2s);
+			 ci[WS(rs, 12)] = FMA(T2r, T2u, T2t * T2s);
+		    }
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = T1i + T1R;
+			 T2i = T2a + T2d;
+			 T2f = W[6];
+			 T2h = W[7];
+			 cr[WS(rs, 4)] = FNMS(T2h, T2i, T2f * T2g);
+			 ci[WS(rs, 4)] = FMA(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2m, T2q, T2j, T2n;
+			 T2m = T2k - T2l;
+			 T2q = T2o - T2p;
+			 T2j = W[54];
+			 T2n = W[55];
+			 cr[WS(rs, 28)] = FNMS(T2n, T2q, T2j * T2m);
+			 ci[WS(rs, 28)] = FMA(T2j, T2q, T2n * T2m);
+		    }
+	       }
+	       {
+		    E T3O, T4a, T40, T4e, T3V, T4f, T43, T4b, T3N, T3Z;
+		    T3N = KP707106781 * (T3n + T3o);
+		    T3O = T3M - T3N;
+		    T4a = T3M + T3N;
+		    T3Z = KP707106781 * (T30 + T2X);
+		    T40 = T3Y - T3Z;
+		    T4e = T3Y + T3Z;
+		    {
+			 E T3R, T3U, T41, T42;
+			 T3R = FNMS(KP382683432, T3Q, KP923879532 * T3P);
+			 T3U = FMA(KP923879532, T3S, KP382683432 * T3T);
+			 T3V = T3R - T3U;
+			 T4f = T3U + T3R;
+			 T41 = FNMS(KP382683432, T3S, KP923879532 * T3T);
+			 T42 = FMA(KP382683432, T3P, KP923879532 * T3Q);
+			 T43 = T41 - T42;
+			 T4b = T41 + T42;
+		    }
+		    {
+			 E T3W, T44, T3L, T3X;
+			 T3W = T3O - T3V;
+			 T44 = T40 - T43;
+			 T3L = W[50];
+			 T3X = W[51];
+			 cr[WS(rs, 26)] = FNMS(T3X, T44, T3L * T3W);
+			 ci[WS(rs, 26)] = FMA(T3X, T3W, T3L * T44);
+		    }
+		    {
+			 E T4i, T4k, T4h, T4j;
+			 T4i = T4a + T4b;
+			 T4k = T4e + T4f;
+			 T4h = W[2];
+			 T4j = W[3];
+			 cr[WS(rs, 2)] = FNMS(T4j, T4k, T4h * T4i);
+			 ci[WS(rs, 2)] = FMA(T4h, T4k, T4j * T4i);
+		    }
+		    {
+			 E T46, T48, T45, T47;
+			 T46 = T3O + T3V;
+			 T48 = T40 + T43;
+			 T45 = W[18];
+			 T47 = W[19];
+			 cr[WS(rs, 10)] = FNMS(T47, T48, T45 * T46);
+			 ci[WS(rs, 10)] = FMA(T47, T46, T45 * T48);
+		    }
+		    {
+			 E T4c, T4g, T49, T4d;
+			 T4c = T4a - T4b;
+			 T4g = T4e - T4f;
+			 T49 = W[34];
+			 T4d = W[35];
+			 cr[WS(rs, 18)] = FNMS(T4d, T4g, T49 * T4c);
+			 ci[WS(rs, 18)] = FMA(T49, T4g, T4d * T4c);
+		    }
+	       }
+	       {
+		    E T32, T3A, T3q, T3E, T3h, T3F, T3t, T3B, T31, T3p;
+		    T31 = KP707106781 * (T2X - T30);
+		    T32 = T2U - T31;
+		    T3A = T2U + T31;
+		    T3p = KP707106781 * (T3n - T3o);
+		    T3q = T3m - T3p;
+		    T3E = T3m + T3p;
+		    {
+			 E T39, T3g, T3r, T3s;
+			 T39 = FNMS(KP923879532, T38, KP382683432 * T35);
+			 T3g = FMA(KP382683432, T3c, KP923879532 * T3f);
+			 T3h = T39 - T3g;
+			 T3F = T3g + T39;
+			 T3r = FNMS(KP923879532, T3c, KP382683432 * T3f);
+			 T3s = FMA(KP923879532, T35, KP382683432 * T38);
+			 T3t = T3r - T3s;
+			 T3B = T3r + T3s;
+		    }
+		    {
+			 E T3i, T3u, T2R, T3j;
+			 T3i = T32 - T3h;
+			 T3u = T3q - T3t;
+			 T2R = W[58];
+			 T3j = W[59];
+			 cr[WS(rs, 30)] = FNMS(T3j, T3u, T2R * T3i);
+			 ci[WS(rs, 30)] = FMA(T3j, T3i, T2R * T3u);
+		    }
+		    {
+			 E T3I, T3K, T3H, T3J;
+			 T3I = T3A + T3B;
+			 T3K = T3E + T3F;
+			 T3H = W[10];
+			 T3J = W[11];
+			 cr[WS(rs, 6)] = FNMS(T3J, T3K, T3H * T3I);
+			 ci[WS(rs, 6)] = FMA(T3H, T3K, T3J * T3I);
+		    }
+		    {
+			 E T3w, T3y, T3v, T3x;
+			 T3w = T32 + T3h;
+			 T3y = T3q + T3t;
+			 T3v = W[26];
+			 T3x = W[27];
+			 cr[WS(rs, 14)] = FNMS(T3x, T3y, T3v * T3w);
+			 ci[WS(rs, 14)] = FMA(T3x, T3w, T3v * T3y);
+		    }
+		    {
+			 E T3C, T3G, T3z, T3D;
+			 T3C = T3A - T3B;
+			 T3G = T3E - T3F;
+			 T3z = W[42];
+			 T3D = W[43];
+			 cr[WS(rs, 22)] = FNMS(T3D, T3G, T3z * T3C);
+			 ci[WS(rs, 22)] = FMA(T3z, T3G, T3D * T3C);
+		    }
+	       }
+	       {
+		    E T60, T6m, T6f, T6n, T67, T6r, T6c, T6q;
+		    {
+			 E T5Y, T5Z, T6d, T6e;
+			 T5Y = T4o + T4v;
+			 T5Z = T5z + T5A;
+			 T60 = T5Y + T5Z;
+			 T6m = T5Y - T5Z;
+			 T6d = FMA(KP195090322, T61, KP980785280 * T62);
+			 T6e = FNMS(KP195090322, T64, KP980785280 * T65);
+			 T6f = T6d + T6e;
+			 T6n = T6e - T6d;
+		    }
+		    {
+			 E T63, T66, T6a, T6b;
+			 T63 = FNMS(KP195090322, T62, KP980785280 * T61);
+			 T66 = FMA(KP980785280, T64, KP195090322 * T65);
+			 T67 = T63 + T66;
+			 T6r = T63 - T66;
+			 T6a = T5u + T5x;
+			 T6b = T4K + T4D;
+			 T6c = T6a + T6b;
+			 T6q = T6a - T6b;
+		    }
+		    {
+			 E T68, T6g, T5X, T69;
+			 T68 = T60 - T67;
+			 T6g = T6c - T6f;
+			 T5X = W[32];
+			 T69 = W[33];
+			 cr[WS(rs, 17)] = FNMS(T69, T6g, T5X * T68);
+			 ci[WS(rs, 17)] = FMA(T69, T68, T5X * T6g);
+		    }
+		    {
+			 E T6u, T6w, T6t, T6v;
+			 T6u = T6m + T6n;
+			 T6w = T6q + T6r;
+			 T6t = W[16];
+			 T6v = W[17];
+			 cr[WS(rs, 9)] = FNMS(T6v, T6w, T6t * T6u);
+			 ci[WS(rs, 9)] = FMA(T6t, T6w, T6v * T6u);
+		    }
+		    {
+			 E T6i, T6k, T6h, T6j;
+			 T6i = T60 + T67;
+			 T6k = T6c + T6f;
+			 T6h = W[0];
+			 T6j = W[1];
+			 cr[WS(rs, 1)] = FNMS(T6j, T6k, T6h * T6i);
+			 ci[WS(rs, 1)] = FMA(T6j, T6i, T6h * T6k);
+		    }
+		    {
+			 E T6o, T6s, T6l, T6p;
+			 T6o = T6m - T6n;
+			 T6s = T6q - T6r;
+			 T6l = W[48];
+			 T6p = W[49];
+			 cr[WS(rs, 25)] = FNMS(T6p, T6s, T6l * T6o);
+			 ci[WS(rs, 25)] = FMA(T6l, T6s, T6p * T6o);
+		    }
+	       }
+	       {
+		    E T7u, T7Q, T7J, T7R, T7B, T7V, T7G, T7U;
+		    {
+			 E T7s, T7t, T7H, T7I;
+			 T7s = T6y + T6z;
+			 T7t = T73 + T74;
+			 T7u = T7s - T7t;
+			 T7Q = T7s + T7t;
+			 T7H = FMA(KP195090322, T7w, KP980785280 * T7v);
+			 T7I = FMA(KP195090322, T7z, KP980785280 * T7y);
+			 T7J = T7H - T7I;
+			 T7R = T7H + T7I;
+		    }
+		    {
+			 E T7x, T7A, T7E, T7F;
+			 T7x = FNMS(KP980785280, T7w, KP195090322 * T7v);
+			 T7A = FNMS(KP980785280, T7z, KP195090322 * T7y);
+			 T7B = T7x + T7A;
+			 T7V = T7x - T7A;
+			 T7E = T70 - T71;
+			 T7F = T6D - T6G;
+			 T7G = T7E + T7F;
+			 T7U = T7E - T7F;
+		    }
+		    {
+			 E T7C, T7K, T7r, T7D;
+			 T7C = T7u - T7B;
+			 T7K = T7G - T7J;
+			 T7r = W[44];
+			 T7D = W[45];
+			 cr[WS(rs, 23)] = FNMS(T7D, T7K, T7r * T7C);
+			 ci[WS(rs, 23)] = FMA(T7D, T7C, T7r * T7K);
+		    }
+		    {
+			 E T7Y, T80, T7X, T7Z;
+			 T7Y = T7Q + T7R;
+			 T80 = T7U - T7V;
+			 T7X = W[60];
+			 T7Z = W[61];
+			 cr[WS(rs, 31)] = FNMS(T7Z, T80, T7X * T7Y);
+			 ci[WS(rs, 31)] = FMA(T7X, T80, T7Z * T7Y);
+		    }
+		    {
+			 E T7M, T7O, T7L, T7N;
+			 T7M = T7u + T7B;
+			 T7O = T7G + T7J;
+			 T7L = W[12];
+			 T7N = W[13];
+			 cr[WS(rs, 7)] = FNMS(T7N, T7O, T7L * T7M);
+			 ci[WS(rs, 7)] = FMA(T7N, T7M, T7L * T7O);
+		    }
+		    {
+			 E T7S, T7W, T7P, T7T;
+			 T7S = T7Q - T7R;
+			 T7W = T7U + T7V;
+			 T7P = W[28];
+			 T7T = W[29];
+			 cr[WS(rs, 15)] = FNMS(T7T, T7W, T7P * T7S);
+			 ci[WS(rs, 15)] = FMA(T7P, T7W, T7T * T7S);
+		    }
+	       }
+	       {
+		    E T4M, T5M, T5F, T5N, T5p, T5R, T5C, T5Q;
+		    {
+			 E T4w, T4L, T5D, T5E;
+			 T4w = T4o - T4v;
+			 T4L = T4D - T4K;
+			 T4M = T4w + T4L;
+			 T5M = T4w - T4L;
+			 T5D = FMA(KP831469612, T4X, KP555570233 * T54);
+			 T5E = FNMS(KP831469612, T5g, KP555570233 * T5n);
+			 T5F = T5D + T5E;
+			 T5N = T5E - T5D;
+		    }
+		    {
+			 E T55, T5o, T5y, T5B;
+			 T55 = FNMS(KP831469612, T54, KP555570233 * T4X);
+			 T5o = FMA(KP555570233, T5g, KP831469612 * T5n);
+			 T5p = T55 + T5o;
+			 T5R = T55 - T5o;
+			 T5y = T5u - T5x;
+			 T5B = T5z - T5A;
+			 T5C = T5y + T5B;
+			 T5Q = T5y - T5B;
+		    }
+		    {
+			 E T5q, T5G, T4l, T5r;
+			 T5q = T4M - T5p;
+			 T5G = T5C - T5F;
+			 T4l = W[40];
+			 T5r = W[41];
+			 cr[WS(rs, 21)] = FNMS(T5r, T5G, T4l * T5q);
+			 ci[WS(rs, 21)] = FMA(T5r, T5q, T4l * T5G);
+		    }
+		    {
+			 E T5U, T5W, T5T, T5V;
+			 T5U = T5M + T5N;
+			 T5W = T5Q + T5R;
+			 T5T = W[24];
+			 T5V = W[25];
+			 cr[WS(rs, 13)] = FNMS(T5V, T5W, T5T * T5U);
+			 ci[WS(rs, 13)] = FMA(T5T, T5W, T5V * T5U);
+		    }
+		    {
+			 E T5I, T5K, T5H, T5J;
+			 T5I = T4M + T5p;
+			 T5K = T5C + T5F;
+			 T5H = W[8];
+			 T5J = W[9];
+			 cr[WS(rs, 5)] = FNMS(T5J, T5K, T5H * T5I);
+			 ci[WS(rs, 5)] = FMA(T5J, T5I, T5H * T5K);
+		    }
+		    {
+			 E T5O, T5S, T5L, T5P;
+			 T5O = T5M - T5N;
+			 T5S = T5Q - T5R;
+			 T5L = W[56];
+			 T5P = W[57];
+			 cr[WS(rs, 29)] = FNMS(T5P, T5S, T5L * T5O);
+			 ci[WS(rs, 29)] = FMA(T5L, T5S, T5P * T5O);
+		    }
+	       }
+	       {
+		    E T6I, T7g, T79, T7h, T6X, T7l, T76, T7k;
+		    {
+			 E T6A, T6H, T77, T78;
+			 T6A = T6y - T6z;
+			 T6H = T6D + T6G;
+			 T6I = T6A - T6H;
+			 T7g = T6A + T6H;
+			 T77 = FNMS(KP555570233, T6S, KP831469612 * T6V);
+			 T78 = FMA(KP555570233, T6L, KP831469612 * T6O);
+			 T79 = T77 - T78;
+			 T7h = T77 + T78;
+		    }
+		    {
+			 E T6P, T6W, T72, T75;
+			 T6P = FNMS(KP555570233, T6O, KP831469612 * T6L);
+			 T6W = FMA(KP831469612, T6S, KP555570233 * T6V);
+			 T6X = T6P - T6W;
+			 T7l = T6W + T6P;
+			 T72 = T70 + T71;
+			 T75 = T73 - T74;
+			 T76 = T72 - T75;
+			 T7k = T72 + T75;
+		    }
+		    {
+			 E T6Y, T7a, T6x, T6Z;
+			 T6Y = T6I - T6X;
+			 T7a = T76 - T79;
+			 T6x = W[52];
+			 T6Z = W[53];
+			 cr[WS(rs, 27)] = FNMS(T6Z, T7a, T6x * T6Y);
+			 ci[WS(rs, 27)] = FMA(T6Z, T6Y, T6x * T7a);
+		    }
+		    {
+			 E T7o, T7q, T7n, T7p;
+			 T7o = T7g + T7h;
+			 T7q = T7k + T7l;
+			 T7n = W[4];
+			 T7p = W[5];
+			 cr[WS(rs, 3)] = FNMS(T7p, T7q, T7n * T7o);
+			 ci[WS(rs, 3)] = FMA(T7n, T7q, T7p * T7o);
+		    }
+		    {
+			 E T7c, T7e, T7b, T7d;
+			 T7c = T6I + T6X;
+			 T7e = T76 + T79;
+			 T7b = W[20];
+			 T7d = W[21];
+			 cr[WS(rs, 11)] = FNMS(T7d, T7e, T7b * T7c);
+			 ci[WS(rs, 11)] = FMA(T7d, T7c, T7b * T7e);
+		    }
+		    {
+			 E T7i, T7m, T7f, T7j;
+			 T7i = T7g - T7h;
+			 T7m = T7k - T7l;
+			 T7f = W[36];
+			 T7j = W[37];
+			 cr[WS(rs, 19)] = FNMS(T7j, T7m, T7f * T7i);
+			 ci[WS(rs, 19)] = FMA(T7f, T7m, T7j * T7i);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hb_32", twinstr, &GENUS, {340, 114, 94, 0} };
+
+void X(codelet_hb_32) (planner *p) {
+     X(khc2hc_register) (p, hb_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include hb.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 27 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hb.h"
+
+static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T8, Th, Ta, T7, Ti, T9;
+	       {
+		    E Td, Tg, T3, T6, Tu, Tm, Tx, Tr;
+		    {
+			 E Tq, Tl, T4, T5, Tk, Tp;
+			 {
+			      E Tb, Tc, Te, Tf, T1, T2;
+			      Tb = ci[WS(rs, 3)];
+			      Tc = cr[WS(rs, 2)];
+			      Te = ci[WS(rs, 2)];
+			      Tf = cr[WS(rs, 3)];
+			      T1 = cr[0];
+			      Tq = Tb + Tc;
+			      Td = Tb - Tc;
+			      T2 = ci[WS(rs, 1)];
+			      Tl = Te + Tf;
+			      Tg = Te - Tf;
+			      T4 = cr[WS(rs, 1)];
+			      T5 = ci[0];
+			      T3 = T1 + T2;
+			      Tk = T1 - T2;
+			 }
+			 Tp = T4 - T5;
+			 T6 = T4 + T5;
+			 Tu = Tk + Tl;
+			 Tm = Tk - Tl;
+			 Tx = Tq - Tp;
+			 Tr = Tp + Tq;
+			 T8 = T3 - T6;
+		    }
+		    cr[0] = T3 + T6;
+		    {
+			 E Tj, To, Tw, Tv;
+			 Tj = W[0];
+			 ci[0] = Td + Tg;
+			 To = W[1];
+			 {
+			      E Tt, Ts, Tn, Ty;
+			      Tt = W[4];
+			      Ts = Tj * Tr;
+			      Tn = Tj * Tm;
+			      Tw = W[5];
+			      Ty = Tt * Tx;
+			      Tv = Tt * Tu;
+			      ci[WS(rs, 1)] = FMA(To, Tm, Ts);
+			      cr[WS(rs, 1)] = FNMS(To, Tr, Tn);
+			      ci[WS(rs, 3)] = FMA(Tw, Tu, Ty);
+			 }
+			 cr[WS(rs, 3)] = FNMS(Tw, Tx, Tv);
+			 Th = Td - Tg;
+			 Ta = W[3];
+			 T7 = W[2];
+		    }
+	       }
+	       Ti = Ta * T8;
+	       T9 = T7 * T8;
+	       ci[WS(rs, 2)] = FMA(T7, Th, Ti);
+	       cr[WS(rs, 2)] = FNMS(Ta, Th, T9);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hb_4) (planner *p) {
+     X(khc2hc_register) (p, hb_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include hb.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hb.h"
+
+static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T3, Ti, T6, Tm, Tc, Tn, Tf, Tj;
+	       {
+		    E T1, T2, T4, T5;
+		    T1 = cr[0];
+		    T2 = ci[WS(rs, 1)];
+		    T3 = T1 + T2;
+		    Ti = T1 - T2;
+		    T4 = cr[WS(rs, 1)];
+		    T5 = ci[0];
+		    T6 = T4 + T5;
+		    Tm = T4 - T5;
+	       }
+	       {
+		    E Ta, Tb, Td, Te;
+		    Ta = ci[WS(rs, 3)];
+		    Tb = cr[WS(rs, 2)];
+		    Tc = Ta - Tb;
+		    Tn = Ta + Tb;
+		    Td = ci[WS(rs, 2)];
+		    Te = cr[WS(rs, 3)];
+		    Tf = Td - Te;
+		    Tj = Td + Te;
+	       }
+	       cr[0] = T3 + T6;
+	       ci[0] = Tc + Tf;
+	       {
+		    E T8, Tg, T7, T9;
+		    T8 = T3 - T6;
+		    Tg = Tc - Tf;
+		    T7 = W[2];
+		    T9 = W[3];
+		    cr[WS(rs, 2)] = FNMS(T9, Tg, T7 * T8);
+		    ci[WS(rs, 2)] = FMA(T9, T8, T7 * Tg);
+	       }
+	       {
+		    E Tk, To, Th, Tl;
+		    Tk = Ti - Tj;
+		    To = Tm + Tn;
+		    Th = W[0];
+		    Tl = W[1];
+		    cr[WS(rs, 1)] = FNMS(Tl, To, Th * Tk);
+		    ci[WS(rs, 1)] = FMA(Th, To, Tl * Tk);
+	       }
+	       {
+		    E Tq, Ts, Tp, Tr;
+		    Tq = Ti + Tj;
+		    Ts = Tn - Tm;
+		    Tp = W[4];
+		    Tr = W[5];
+		    cr[WS(rs, 3)] = FNMS(Tr, Ts, Tp * Tq);
+		    ci[WS(rs, 3)] = FMA(Tp, Ts, Tr * Tq);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hb_4) (planner *p) {
+     X(khc2hc_register) (p, hb_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include hb.h */
+
+/*
+ * This function contains 40 FP additions, 34 FP multiplications,
+ * (or, 14 additions, 8 multiplications, 26 fused multiply/add),
+ * 42 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hb.h"
+
+static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E TQ, TP, TT, TR, TS, TU;
+	       {
+		    E T1, Tn, TM, Tw, Tb, T8, To, Tf, Ta, Tg, Th;
+		    {
+			 E T2, T3, T5, T6, T4, Tu;
+			 T1 = cr[0];
+			 T2 = cr[WS(rs, 1)];
+			 T3 = ci[0];
+			 T5 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 1)];
+			 Tn = ci[WS(rs, 4)];
+			 T4 = T2 + T3;
+			 Tu = T2 - T3;
+			 {
+			      E T7, Tv, Td, Te;
+			      T7 = T5 + T6;
+			      Tv = T5 - T6;
+			      Td = ci[WS(rs, 3)];
+			      Te = cr[WS(rs, 4)];
+			      TM = FNMS(KP618033988, Tu, Tv);
+			      Tw = FMA(KP618033988, Tv, Tu);
+			      Tb = T4 - T7;
+			      T8 = T4 + T7;
+			      To = Td - Te;
+			      Tf = Td + Te;
+			      Ta = FNMS(KP250000000, T8, T1);
+			      Tg = ci[WS(rs, 2)];
+			      Th = cr[WS(rs, 3)];
+			 }
+		    }
+		    cr[0] = T1 + T8;
+		    {
+			 E TG, T9, Tm, Tz, TH, TC, TA, Tk, Tt, TL, Tc, Ti, Tp, TI;
+			 TG = FNMS(KP559016994, Tb, Ta);
+			 Tc = FMA(KP559016994, Tb, Ta);
+			 T9 = W[0];
+			 Ti = Tg + Th;
+			 Tp = Tg - Th;
+			 Tm = W[1];
+			 {
+			      E Ts, Tj, Tr, Tq;
+			      Tz = W[6];
+			      Ts = To - Tp;
+			      Tq = To + Tp;
+			      Tj = FMA(KP618033988, Ti, Tf);
+			      TH = FNMS(KP618033988, Tf, Ti);
+			      ci[0] = Tn + Tq;
+			      Tr = FNMS(KP250000000, Tq, Tn);
+			      TC = W[7];
+			      TA = FMA(KP951056516, Tj, Tc);
+			      Tk = FNMS(KP951056516, Tj, Tc);
+			      Tt = FMA(KP559016994, Ts, Tr);
+			      TL = FNMS(KP559016994, Ts, Tr);
+			 }
+			 {
+			      E TE, TB, Ty, Tl, TD, Tx;
+			      TE = TC * TA;
+			      TB = Tz * TA;
+			      Ty = Tm * Tk;
+			      Tl = T9 * Tk;
+			      TD = FNMS(KP951056516, Tw, Tt);
+			      Tx = FMA(KP951056516, Tw, Tt);
+			      TI = FMA(KP951056516, TH, TG);
+			      TQ = FNMS(KP951056516, TH, TG);
+			      ci[WS(rs, 4)] = FMA(Tz, TD, TE);
+			      cr[WS(rs, 4)] = FNMS(TC, TD, TB);
+			      ci[WS(rs, 1)] = FMA(T9, Tx, Ty);
+			      cr[WS(rs, 1)] = FNMS(Tm, Tx, Tl);
+			 }
+			 {
+			      E TF, TK, TN, TJ, TO;
+			      TF = W[2];
+			      TK = W[3];
+			      TP = W[4];
+			      TT = FMA(KP951056516, TM, TL);
+			      TN = FNMS(KP951056516, TM, TL);
+			      TJ = TF * TI;
+			      TO = TK * TI;
+			      TR = TP * TQ;
+			      TS = W[5];
+			      cr[WS(rs, 2)] = FNMS(TK, TN, TJ);
+			      ci[WS(rs, 2)] = FMA(TF, TN, TO);
+			 }
+		    }
+	       }
+	       cr[WS(rs, 3)] = FNMS(TS, TT, TR);
+	       TU = TS * TQ;
+	       ci[WS(rs, 3)] = FMA(TP, TT, TU);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, {14, 8, 26, 0} };
+
+void X(codelet_hb_5) (planner *p) {
+     X(khc2hc_register) (p, hb_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include hb.h */
+
+/*
+ * This function contains 40 FP additions, 28 FP multiplications,
+ * (or, 26 additions, 14 multiplications, 14 fused multiply/add),
+ * 27 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hb.h"
+
+static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T1, Tj, TG, Ts, T8, Ti, T9, Tn, TD, Tu, Tg, Tt;
+	       {
+		    E T4, Tq, T7, Tr;
+		    T1 = cr[0];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = cr[WS(rs, 1)];
+			 T3 = ci[0];
+			 T4 = T2 + T3;
+			 Tq = T2 - T3;
+			 T5 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 1)];
+			 T7 = T5 + T6;
+			 Tr = T5 - T6;
+		    }
+		    Tj = KP559016994 * (T4 - T7);
+		    TG = FMA(KP951056516, Tq, KP587785252 * Tr);
+		    Ts = FNMS(KP951056516, Tr, KP587785252 * Tq);
+		    T8 = T4 + T7;
+		    Ti = FNMS(KP250000000, T8, T1);
+	       }
+	       {
+		    E Tc, Tl, Tf, Tm;
+		    T9 = ci[WS(rs, 4)];
+		    {
+			 E Ta, Tb, Td, Te;
+			 Ta = ci[WS(rs, 3)];
+			 Tb = cr[WS(rs, 4)];
+			 Tc = Ta - Tb;
+			 Tl = Ta + Tb;
+			 Td = ci[WS(rs, 2)];
+			 Te = cr[WS(rs, 3)];
+			 Tf = Td - Te;
+			 Tm = Td + Te;
+		    }
+		    Tn = FNMS(KP951056516, Tm, KP587785252 * Tl);
+		    TD = FMA(KP951056516, Tl, KP587785252 * Tm);
+		    Tu = KP559016994 * (Tc - Tf);
+		    Tg = Tc + Tf;
+		    Tt = FNMS(KP250000000, Tg, T9);
+	       }
+	       cr[0] = T1 + T8;
+	       ci[0] = T9 + Tg;
+	       {
+		    E To, Ty, Tw, TA, Tk, Tv;
+		    Tk = Ti - Tj;
+		    To = Tk - Tn;
+		    Ty = Tk + Tn;
+		    Tv = Tt - Tu;
+		    Tw = Ts + Tv;
+		    TA = Tv - Ts;
+		    {
+			 E Th, Tp, Tx, Tz;
+			 Th = W[2];
+			 Tp = W[3];
+			 cr[WS(rs, 2)] = FNMS(Tp, Tw, Th * To);
+			 ci[WS(rs, 2)] = FMA(Th, Tw, Tp * To);
+			 Tx = W[4];
+			 Tz = W[5];
+			 cr[WS(rs, 3)] = FNMS(Tz, TA, Tx * Ty);
+			 ci[WS(rs, 3)] = FMA(Tx, TA, Tz * Ty);
+		    }
+	       }
+	       {
+		    E TE, TK, TI, TM, TC, TH;
+		    TC = Tj + Ti;
+		    TE = TC - TD;
+		    TK = TC + TD;
+		    TH = Tu + Tt;
+		    TI = TG + TH;
+		    TM = TH - TG;
+		    {
+			 E TB, TF, TJ, TL;
+			 TB = W[0];
+			 TF = W[1];
+			 cr[WS(rs, 1)] = FNMS(TF, TI, TB * TE);
+			 ci[WS(rs, 1)] = FMA(TB, TI, TF * TE);
+			 TJ = W[6];
+			 TL = W[7];
+			 cr[WS(rs, 4)] = FNMS(TL, TM, TJ * TK);
+			 ci[WS(rs, 4)] = FMA(TJ, TM, TL * TK);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, {26, 14, 14, 0} };
+
+void X(codelet_hb_5) (planner *p) {
+     X(khc2hc_register) (p, hb_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include hb.h */
+
+/*
+ * This function contains 46 FP additions, 32 FP multiplications,
+ * (or, 24 additions, 10 multiplications, 22 fused multiply/add),
+ * 45 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hb.h"
+
+static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E TK, TR, TB, TM, TL, TS;
+	       {
+		    E Td, TN, TO, TJ, Tn, Tk, TC, T3, Tr, T4, T5, T7, T8;
+		    {
+			 E TH, Tg, Tj, TI, Th, Ti, T1, T2;
+			 {
+			      E Tb, Tc, Te, Tf;
+			      Tb = ci[WS(rs, 5)];
+			      Tc = cr[WS(rs, 3)];
+			      Te = ci[WS(rs, 3)];
+			      Tf = cr[WS(rs, 5)];
+			      Th = ci[WS(rs, 4)];
+			      Td = Tb - Tc;
+			      TN = Tb + Tc;
+			      Ti = cr[WS(rs, 4)];
+			      TH = Te + Tf;
+			      Tg = Te - Tf;
+			 }
+			 Tj = Th - Ti;
+			 TI = Th + Ti;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 2)];
+			 TO = TH - TI;
+			 TJ = TH + TI;
+			 Tn = Tj - Tg;
+			 Tk = Tg + Tj;
+			 TC = T1 - T2;
+			 T3 = T1 + T2;
+			 Tr = FNMS(KP500000000, Tk, Td);
+			 T4 = cr[WS(rs, 2)];
+			 T5 = ci[0];
+			 T7 = ci[WS(rs, 1)];
+			 T8 = cr[WS(rs, 1)];
+		    }
+		    {
+			 E Tl, Tq, TQ, Ts, Ta, T10, TG;
+			 ci[0] = Td + Tk;
+			 {
+			      E T6, TD, T9, TE, TF;
+			      T6 = T4 + T5;
+			      TD = T4 - T5;
+			      T9 = T7 + T8;
+			      TE = T7 - T8;
+			      Tl = W[2];
+			      Tq = W[3];
+			      TQ = TD - TE;
+			      TF = TD + TE;
+			      Ts = T6 - T9;
+			      Ta = T6 + T9;
+			      T10 = TC + TF;
+			      TG = FNMS(KP500000000, TF, TC);
+			 }
+			 {
+			      E T13, TP, Tz, TZ, Tw, T14, Tv, Ty;
+			      {
+				   E Tt, T12, T11, Tp, Tm, To, Tu;
+				   T13 = TN + TO;
+				   TP = FNMS(KP500000000, TO, TN);
+				   cr[0] = T3 + Ta;
+				   Tm = FNMS(KP500000000, Ta, T3);
+				   Tz = FMA(KP866025403, Ts, Tr);
+				   Tt = FNMS(KP866025403, Ts, Tr);
+				   TZ = W[4];
+				   To = FNMS(KP866025403, Tn, Tm);
+				   Tw = FMA(KP866025403, Tn, Tm);
+				   Tu = Tl * Tt;
+				   T12 = W[5];
+				   T11 = TZ * T10;
+				   Tp = Tl * To;
+				   ci[WS(rs, 2)] = FMA(Tq, To, Tu);
+				   T14 = T12 * T10;
+				   cr[WS(rs, 3)] = FNMS(T12, T13, T11);
+				   cr[WS(rs, 2)] = FNMS(Tq, Tt, Tp);
+			      }
+			      ci[WS(rs, 3)] = FMA(TZ, T13, T14);
+			      Tv = W[6];
+			      Ty = W[7];
+			      {
+				   E TX, TT, TW, TV, TY, TU, TA, Tx;
+				   TK = FNMS(KP866025403, TJ, TG);
+				   TU = FMA(KP866025403, TJ, TG);
+				   TA = Tv * Tz;
+				   Tx = Tv * Tw;
+				   TX = FNMS(KP866025403, TQ, TP);
+				   TR = FMA(KP866025403, TQ, TP);
+				   ci[WS(rs, 4)] = FMA(Ty, Tw, TA);
+				   cr[WS(rs, 4)] = FNMS(Ty, Tz, Tx);
+				   TT = W[8];
+				   TW = W[9];
+				   TB = W[0];
+				   TV = TT * TU;
+				   TY = TW * TU;
+				   TM = W[1];
+				   TL = TB * TK;
+				   cr[WS(rs, 5)] = FNMS(TW, TX, TV);
+				   ci[WS(rs, 5)] = FMA(TT, TX, TY);
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 1)] = FNMS(TM, TR, TL);
+	       TS = TM * TK;
+	       ci[WS(rs, 1)] = FMA(TB, TR, TS);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, {24, 10, 22, 0} };
+
+void X(codelet_hb_6) (planner *p) {
+     X(khc2hc_register) (p, hb_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include hb.h */
+
+/*
+ * This function contains 46 FP additions, 28 FP multiplications,
+ * (or, 32 additions, 14 multiplications, 14 fused multiply/add),
+ * 27 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hb.h"
+
+static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E T3, Ty, Ta, TO, Tr, TB, Td, TE, Tk, TL, Tn, TH;
+	       {
+		    E T1, T2, Tb, Tc;
+		    T1 = cr[0];
+		    T2 = ci[WS(rs, 2)];
+		    T3 = T1 + T2;
+		    Ty = T1 - T2;
+		    {
+			 E T6, Tz, T9, TA;
+			 {
+			      E T4, T5, T7, T8;
+			      T4 = cr[WS(rs, 2)];
+			      T5 = ci[0];
+			      T6 = T4 + T5;
+			      Tz = T4 - T5;
+			      T7 = ci[WS(rs, 1)];
+			      T8 = cr[WS(rs, 1)];
+			      T9 = T7 + T8;
+			      TA = T7 - T8;
+			 }
+			 Ta = T6 + T9;
+			 TO = KP866025403 * (Tz - TA);
+			 Tr = KP866025403 * (T6 - T9);
+			 TB = Tz + TA;
+		    }
+		    Tb = ci[WS(rs, 5)];
+		    Tc = cr[WS(rs, 3)];
+		    Td = Tb - Tc;
+		    TE = Tb + Tc;
+		    {
+			 E Tg, TG, Tj, TF;
+			 {
+			      E Te, Tf, Th, Ti;
+			      Te = ci[WS(rs, 3)];
+			      Tf = cr[WS(rs, 5)];
+			      Tg = Te - Tf;
+			      TG = Te + Tf;
+			      Th = ci[WS(rs, 4)];
+			      Ti = cr[WS(rs, 4)];
+			      Tj = Th - Ti;
+			      TF = Th + Ti;
+			 }
+			 Tk = Tg + Tj;
+			 TL = KP866025403 * (TG + TF);
+			 Tn = KP866025403 * (Tj - Tg);
+			 TH = TF - TG;
+		    }
+	       }
+	       cr[0] = T3 + Ta;
+	       ci[0] = Td + Tk;
+	       {
+		    E TC, TI, Tx, TD;
+		    TC = Ty + TB;
+		    TI = TE - TH;
+		    Tx = W[4];
+		    TD = W[5];
+		    cr[WS(rs, 3)] = FNMS(TD, TI, Tx * TC);
+		    ci[WS(rs, 3)] = FMA(TD, TC, Tx * TI);
+	       }
+	       {
+		    E To, Tu, Ts, Tw, Tm, Tq;
+		    Tm = FNMS(KP500000000, Ta, T3);
+		    To = Tm - Tn;
+		    Tu = Tm + Tn;
+		    Tq = FNMS(KP500000000, Tk, Td);
+		    Ts = Tq - Tr;
+		    Tw = Tr + Tq;
+		    {
+			 E Tl, Tp, Tt, Tv;
+			 Tl = W[2];
+			 Tp = W[3];
+			 cr[WS(rs, 2)] = FNMS(Tp, Ts, Tl * To);
+			 ci[WS(rs, 2)] = FMA(Tl, Ts, Tp * To);
+			 Tt = W[6];
+			 Tv = W[7];
+			 cr[WS(rs, 4)] = FNMS(Tv, Tw, Tt * Tu);
+			 ci[WS(rs, 4)] = FMA(Tt, Tw, Tv * Tu);
+		    }
+	       }
+	       {
+		    E TM, TS, TQ, TU, TK, TP;
+		    TK = FNMS(KP500000000, TB, Ty);
+		    TM = TK - TL;
+		    TS = TK + TL;
+		    TP = FMA(KP500000000, TH, TE);
+		    TQ = TO + TP;
+		    TU = TP - TO;
+		    {
+			 E TJ, TN, TR, TT;
+			 TJ = W[0];
+			 TN = W[1];
+			 cr[WS(rs, 1)] = FNMS(TN, TQ, TJ * TM);
+			 ci[WS(rs, 1)] = FMA(TN, TM, TJ * TQ);
+			 TR = W[8];
+			 TT = W[9];
+			 cr[WS(rs, 5)] = FNMS(TT, TU, TR * TS);
+			 ci[WS(rs, 5)] = FMA(TT, TS, TR * TU);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, {32, 14, 14, 0} };
+
+void X(codelet_hb_6) (planner *p) {
+     X(khc2hc_register) (p, hb_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3959 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -dif -name hb_64 -include hb.h */
+
+/*
+ * This function contains 1038 FP additions, 644 FP multiplications,
+ * (or, 520 additions, 126 multiplications, 518 fused multiply/add),
+ * 231 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "hb.h"
+
+static void hb_64(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 126); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tcx, Tcw, Tcv;
+	       {
+		    E Thy, Tv, T7n, T5B, TfP, Tey, Tkl, TjB, T6U, T2k, T7o, T2H, TiH, Tia, Tk8;
+		    E Tj8, T6V, T5E, Tbz, T9N, Tb7, T9Q, Tgh, Tev, Tb6, T8G, TbA, T8N, TfO, TcU;
+		    E Tgi, Td5, Ti3, T10, TjC, Tje, TiI, ThF, TeA, Tds, TjD, Tjb, TeB, Tdh, Tgl;
+		    E TfT, Tgk, TfW, T6Z, T7r, T5H, T39, Tbb, TbC, T9S, T8V, T72, T7q, T5G, T3A;
+		    E Tbe, TbD, T9T, T92, ThH, T1w, Tke, Tjq, Tkf, Tjt, TiK, ThO, Tgb, TgT, Tfc;
+		    E Tec, Tg8, TgU, Tfd, Tel, T77, T83, T6i, T5a, T7a, T82, T6j, T5n, Tbj, Tcc;
+		    E Tas, T9f, Tbm, Tcb, Tar, T9m, ThQ, T21, Tkb, Tjj, Tkc, Tjm, TiL, ThX, Tg4;
+		    E TgW, Tf9, TdL, Tg1, TgX, Tfa, TdU, T7e, T80, T6f, T4h, T9q, Tbr, T7h, T7Z;
+		    E T6g, T4u, T9D, T9C, Tbo, T9B, Tbp, T9x;
+		    {
+			 E T3v, T8Z, T8W, T90, T8X, T3y, T3q, T70;
+			 {
+			      E TcQ, TcT, Td4, TcZ;
+			      {
+				   E T24, T5t, T7, T27, T5w, Ti4, Tet, T2i, T5z, Te, Teu, Ti5, T5y, T2d, T8H;
+				   E T2u, Td0, Tm, Ti7, Td3, T8I, T2p, Tq, T2w, Tp, TcV, T2E, Tr, T2x, T2y;
+				   E Tes, Ter;
+				   {
+					E T1, T2, T4, T5, T5u, T5v;
+					T1 = cr[0];
+					T2 = ci[WS(rs, 31)];
+					T4 = cr[WS(rs, 16)];
+					T5 = ci[WS(rs, 15)];
+					{
+					     E T25, T3, T6, T26;
+					     T25 = ci[WS(rs, 47)];
+					     T24 = T1 - T2;
+					     T3 = T1 + T2;
+					     T5t = T4 - T5;
+					     T6 = T4 + T5;
+					     T26 = cr[WS(rs, 48)];
+					     T5u = ci[WS(rs, 63)];
+					     T5v = cr[WS(rs, 32)];
+					     TcQ = T3 - T6;
+					     T7 = T3 + T6;
+					     Tes = T25 - T26;
+					     T27 = T25 + T26;
+					}
+					Ter = T5u - T5v;
+					T5w = T5u + T5v;
+				   }
+				   {
+					E Ta, T29, Tb, TcR, T2h, Tc, T2a, T2b;
+					{
+					     E T2f, T2g, T8, T9;
+					     T8 = cr[WS(rs, 8)];
+					     T9 = ci[WS(rs, 23)];
+					     Ti4 = Ter + Tes;
+					     Tet = Ter - Tes;
+					     T2f = ci[WS(rs, 39)];
+					     T2g = cr[WS(rs, 56)];
+					     Ta = T8 + T9;
+					     T29 = T8 - T9;
+					     Tb = ci[WS(rs, 7)];
+					     TcR = T2f - T2g;
+					     T2h = T2f + T2g;
+					     Tc = cr[WS(rs, 24)];
+					     T2a = ci[WS(rs, 55)];
+					     T2b = cr[WS(rs, 40)];
+					}
+					{
+					     E Tj, T2l, Ti, Td1, T2t, Tk, T2m, T2n;
+					     {
+						  E Tg, Th, T2r, T2s;
+						  Tg = cr[WS(rs, 4)];
+						  {
+						       E T2e, Td, TcS, T2c;
+						       T2e = Tb - Tc;
+						       Td = Tb + Tc;
+						       TcS = T2a - T2b;
+						       T2c = T2a + T2b;
+						       T2i = T2e - T2h;
+						       T5z = T2e + T2h;
+						       Te = Ta + Td;
+						       Teu = Ta - Td;
+						       TcT = TcR - TcS;
+						       Ti5 = TcS + TcR;
+						       T5y = T29 + T2c;
+						       T2d = T29 - T2c;
+						       Th = ci[WS(rs, 27)];
+						  }
+						  T2r = ci[WS(rs, 59)];
+						  T2s = cr[WS(rs, 36)];
+						  Tj = cr[WS(rs, 20)];
+						  T2l = Tg - Th;
+						  Ti = Tg + Th;
+						  Td1 = T2r - T2s;
+						  T2t = T2r + T2s;
+						  Tk = ci[WS(rs, 11)];
+						  T2m = ci[WS(rs, 43)];
+						  T2n = cr[WS(rs, 52)];
+					     }
+					     {
+						  E Tn, To, T2C, T2D;
+						  Tn = ci[WS(rs, 3)];
+						  {
+						       E T2q, Tl, Td2, T2o;
+						       T2q = Tj - Tk;
+						       Tl = Tj + Tk;
+						       Td2 = T2m - T2n;
+						       T2o = T2m + T2n;
+						       T8H = T2t - T2q;
+						       T2u = T2q + T2t;
+						       Td0 = Ti - Tl;
+						       Tm = Ti + Tl;
+						       Ti7 = Td1 + Td2;
+						       Td3 = Td1 - Td2;
+						       T8I = T2l + T2o;
+						       T2p = T2l - T2o;
+						       To = cr[WS(rs, 28)];
+						  }
+						  T2C = ci[WS(rs, 35)];
+						  T2D = cr[WS(rs, 60)];
+						  Tq = cr[WS(rs, 12)];
+						  T2w = Tn - To;
+						  Tp = Tn + To;
+						  TcV = T2C - T2D;
+						  T2E = T2C + T2D;
+						  Tr = ci[WS(rs, 19)];
+						  T2x = ci[WS(rs, 51)];
+						  T2y = cr[WS(rs, 44)];
+					     }
+					}
+				   }
+				   {
+					E Tj6, T8K, T8L, T9L, T8F, Ti6, T8E, T9M, T5C, T5D, Ti9, Tj7;
+					{
+					     E T2F, Ti8, T2A, TjA, Tew, Tex, Tjz;
+					     {
+						  E Tf, TcY, TcX, Tu, T5x, T5A;
+						  Tj6 = T7 - Te;
+						  Tf = T7 + Te;
+						  {
+						       E T2B, Ts, TcW, T2z, Tt;
+						       T2B = Tq - Tr;
+						       Ts = Tq + Tr;
+						       TcW = T2x - T2y;
+						       T2z = T2x + T2y;
+						       T8K = T2B + T2E;
+						       T2F = T2B - T2E;
+						       TcY = Tp - Ts;
+						       Tt = Tp + Ts;
+						       TcX = TcV - TcW;
+						       Ti8 = TcV + TcW;
+						       T8L = T2w + T2z;
+						       T2A = T2w - T2z;
+						       Tu = Tm + Tt;
+						       TjA = Tm - Tt;
+						  }
+						  T9L = T5w - T5t;
+						  T5x = T5t + T5w;
+						  T5A = T5y - T5z;
+						  T8F = T5y + T5z;
+						  Td4 = Td0 + Td3;
+						  Tew = Td0 - Td3;
+						  Thy = Tf - Tu;
+						  Tv = Tf + Tu;
+						  T7n = FNMS(KP707106781, T5A, T5x);
+						  T5B = FMA(KP707106781, T5A, T5x);
+						  Tex = TcY + TcX;
+						  TcZ = TcX - TcY;
+						  Ti6 = Ti4 + Ti5;
+						  Tjz = Ti4 - Ti5;
+					     }
+					     {
+						  E T28, T2j, T2v, T2G;
+						  T8E = T24 + T27;
+						  T28 = T24 - T27;
+						  TfP = Tew + Tex;
+						  Tey = Tew - Tex;
+						  Tkl = TjA + Tjz;
+						  TjB = Tjz - TjA;
+						  T2j = T2d + T2i;
+						  T9M = T2d - T2i;
+						  T5C = FMA(KP414213562, T2p, T2u);
+						  T2v = FNMS(KP414213562, T2u, T2p);
+						  T2G = FMA(KP414213562, T2F, T2A);
+						  T5D = FNMS(KP414213562, T2A, T2F);
+						  T6U = FNMS(KP707106781, T2j, T28);
+						  T2k = FMA(KP707106781, T2j, T28);
+						  T7o = T2v - T2G;
+						  T2H = T2v + T2G;
+						  Ti9 = Ti7 + Ti8;
+						  Tj7 = Ti8 - Ti7;
+					     }
+					}
+					{
+					     E T8J, T9O, T9P, T8M;
+					     TiH = Ti6 + Ti9;
+					     Tia = Ti6 - Ti9;
+					     Tk8 = Tj6 + Tj7;
+					     Tj8 = Tj6 - Tj7;
+					     T8J = FNMS(KP414213562, T8I, T8H);
+					     T9O = FMA(KP414213562, T8H, T8I);
+					     T6V = T5D - T5C;
+					     T5E = T5C + T5D;
+					     Tbz = FNMS(KP707106781, T9M, T9L);
+					     T9N = FMA(KP707106781, T9M, T9L);
+					     T9P = FMA(KP414213562, T8K, T8L);
+					     T8M = FNMS(KP414213562, T8L, T8K);
+					     Tb7 = T9O + T9P;
+					     T9Q = T9O - T9P;
+					     Tgh = Teu + Tet;
+					     Tev = Tet - Teu;
+					     Tb6 = FMA(KP707106781, T8F, T8E);
+					     T8G = FNMS(KP707106781, T8F, T8E);
+					     TbA = T8M - T8J;
+					     T8N = T8J + T8M;
+					}
+				   }
+			      }
+			      {
+				   E T8S, TC, Tdn, Tdk, ThC, T3e, T8P, T36, T2X, Tda, TY, ThA, Tdf, T35, T2S;
+				   E T3x, T3o, Tdl, TJ, ThD, Tdq, T3w, T3j, T34, TR, Tdc, Td9, Thz, T2N;
+				   {
+					E TV, T2O, TU, Tdd, T2W, TW, T2P, T2Q;
+					{
+					     E Tz, T3r, Ty, Tdj, T3u, TA, T3b, T3c;
+					     {
+						  E Tw, Tx, T3s, T3t;
+						  Tw = cr[WS(rs, 2)];
+						  TfO = TcQ + TcT;
+						  TcU = TcQ - TcT;
+						  Tgi = Td4 + TcZ;
+						  Td5 = TcZ - Td4;
+						  Tx = ci[WS(rs, 29)];
+						  T3s = ci[WS(rs, 45)];
+						  T3t = cr[WS(rs, 50)];
+						  Tz = cr[WS(rs, 18)];
+						  T3r = Tw - Tx;
+						  Ty = Tw + Tx;
+						  Tdj = T3s - T3t;
+						  T3u = T3s + T3t;
+						  TA = ci[WS(rs, 13)];
+						  T3b = ci[WS(rs, 61)];
+						  T3c = cr[WS(rs, 34)];
+					     }
+					     {
+						  E T3a, TB, Tdi, T3d;
+						  T8S = T3r + T3u;
+						  T3v = T3r - T3u;
+						  T3a = Tz - TA;
+						  TB = Tz + TA;
+						  Tdi = T3b - T3c;
+						  T3d = T3b + T3c;
+						  TC = Ty + TB;
+						  Tdn = Ty - TB;
+						  Tdk = Tdi - Tdj;
+						  ThC = Tdi + Tdj;
+						  T3e = T3a + T3d;
+						  T8P = T3d - T3a;
+					     }
+					}
+					{
+					     E TS, TT, T2U, T2V;
+					     TS = cr[WS(rs, 6)];
+					     TT = ci[WS(rs, 25)];
+					     T2U = ci[WS(rs, 41)];
+					     T2V = cr[WS(rs, 54)];
+					     TV = ci[WS(rs, 9)];
+					     T2O = TS - TT;
+					     TU = TS + TT;
+					     Tdd = T2U - T2V;
+					     T2W = T2U + T2V;
+					     TW = cr[WS(rs, 22)];
+					     T2P = ci[WS(rs, 57)];
+					     T2Q = cr[WS(rs, 38)];
+					}
+					{
+					     E TG, T3f, TF, Tdo, T3n, TH, T3g, T3h;
+					     {
+						  E TD, TE, T3l, T3m;
+						  TD = cr[WS(rs, 10)];
+						  {
+						       E T2T, TX, Tde, T2R;
+						       T2T = TV - TW;
+						       TX = TV + TW;
+						       Tde = T2P - T2Q;
+						       T2R = T2P + T2Q;
+						       T36 = T2T - T2W;
+						       T2X = T2T + T2W;
+						       Tda = TU - TX;
+						       TY = TU + TX;
+						       ThA = Tde + Tdd;
+						       Tdf = Tdd - Tde;
+						       T35 = T2O - T2R;
+						       T2S = T2O + T2R;
+						       TE = ci[WS(rs, 21)];
+						  }
+						  T3l = ci[WS(rs, 37)];
+						  T3m = cr[WS(rs, 58)];
+						  TG = ci[WS(rs, 5)];
+						  T3f = TD - TE;
+						  TF = TD + TE;
+						  Tdo = T3l - T3m;
+						  T3n = T3l + T3m;
+						  TH = cr[WS(rs, 26)];
+						  T3g = ci[WS(rs, 53)];
+						  T3h = cr[WS(rs, 42)];
+					     }
+					     {
+						  E TO, T30, TN, Td8, T33, TP, T2K, T2L;
+						  {
+						       E TL, TM, T31, T32;
+						       TL = ci[WS(rs, 1)];
+						       {
+							    E T3k, TI, Tdp, T3i;
+							    T3k = TG - TH;
+							    TI = TG + TH;
+							    Tdp = T3g - T3h;
+							    T3i = T3g + T3h;
+							    T3x = T3k - T3n;
+							    T3o = T3k + T3n;
+							    Tdl = TF - TI;
+							    TJ = TF + TI;
+							    ThD = Tdp + Tdo;
+							    Tdq = Tdo - Tdp;
+							    T3w = T3f - T3i;
+							    T3j = T3f + T3i;
+							    TM = cr[WS(rs, 30)];
+						       }
+						       T31 = ci[WS(rs, 49)];
+						       T32 = cr[WS(rs, 46)];
+						       TO = cr[WS(rs, 14)];
+						       T30 = TL - TM;
+						       TN = TL + TM;
+						       Td8 = T31 - T32;
+						       T33 = T31 + T32;
+						       TP = ci[WS(rs, 17)];
+						       T2K = ci[WS(rs, 33)];
+						       T2L = cr[WS(rs, 62)];
+						  }
+						  {
+						       E T2J, TQ, Td7, T2M;
+						       T8Z = T30 + T33;
+						       T34 = T30 - T33;
+						       T2J = TO - TP;
+						       TQ = TO + TP;
+						       Td7 = T2K - T2L;
+						       T2M = T2K + T2L;
+						       TR = TN + TQ;
+						       Tdc = TN - TQ;
+						       Td9 = Td7 - Td8;
+						       Thz = Td7 + Td8;
+						       T2N = T2J - T2M;
+						       T8W = T2J + T2M;
+						  }
+					     }
+					}
+				   }
+				   {
+					E Tja, Tj9, TfU, TfV, TfR, Tdb, Tdg, TfS;
+					{
+					     E ThE, ThB, Tdm, Tdr;
+					     {
+						  E Tjc, TK, TZ, Tjd;
+						  Tjc = TC - TJ;
+						  TK = TC + TJ;
+						  TZ = TR + TY;
+						  Tja = TR - TY;
+						  Tjd = ThC - ThD;
+						  ThE = ThC + ThD;
+						  Tj9 = Thz - ThA;
+						  ThB = Thz + ThA;
+						  Ti3 = TK - TZ;
+						  T10 = TK + TZ;
+						  TjC = Tjc - Tjd;
+						  Tje = Tjc + Tjd;
+					     }
+					     TfU = Tdl + Tdk;
+					     Tdm = Tdk - Tdl;
+					     Tdr = Tdn - Tdq;
+					     TfV = Tdn + Tdq;
+					     TiI = ThE + ThB;
+					     ThF = ThB - ThE;
+					     TeA = FMA(KP414213562, Tdm, Tdr);
+					     Tds = FNMS(KP414213562, Tdr, Tdm);
+					     TfR = Tda + Td9;
+					     Tdb = Td9 - Tda;
+					     Tdg = Tdc - Tdf;
+					     TfS = Tdc + Tdf;
+					}
+					{
+					     E T2Z, T6X, T37, T2Y;
+					     TjD = Tja + Tj9;
+					     Tjb = Tj9 - Tja;
+					     TeB = FNMS(KP414213562, Tdb, Tdg);
+					     Tdh = FMA(KP414213562, Tdg, Tdb);
+					     T90 = T2S + T2X;
+					     T2Y = T2S - T2X;
+					     Tgl = FMA(KP414213562, TfR, TfS);
+					     TfT = FNMS(KP414213562, TfS, TfR);
+					     Tgk = FNMS(KP414213562, TfU, TfV);
+					     TfW = FMA(KP414213562, TfV, TfU);
+					     T2Z = FMA(KP707106781, T2Y, T2N);
+					     T6X = FNMS(KP707106781, T2Y, T2N);
+					     T37 = T35 + T36;
+					     T8X = T35 - T36;
+					     {
+						  E T8Q, T8T, T3p, T6Y, T38;
+						  T3y = T3w + T3x;
+						  T8Q = T3x - T3w;
+						  T8T = T3j + T3o;
+						  T3p = T3j - T3o;
+						  T6Y = FNMS(KP707106781, T37, T34);
+						  T38 = FMA(KP707106781, T37, T34);
+						  {
+						       E Tb9, T8R, Tba, T8U;
+						       Tb9 = FMA(KP707106781, T8Q, T8P);
+						       T8R = FNMS(KP707106781, T8Q, T8P);
+						       Tba = FMA(KP707106781, T8T, T8S);
+						       T8U = FNMS(KP707106781, T8T, T8S);
+						       T6Z = FMA(KP668178637, T6Y, T6X);
+						       T7r = FNMS(KP668178637, T6X, T6Y);
+						       T5H = FMA(KP198912367, T2Z, T38);
+						       T39 = FNMS(KP198912367, T38, T2Z);
+						       Tbb = FNMS(KP198912367, Tba, Tb9);
+						       TbC = FMA(KP198912367, Tb9, Tba);
+						       T9S = FNMS(KP668178637, T8R, T8U);
+						       T8V = FMA(KP668178637, T8U, T8R);
+						       T3q = FMA(KP707106781, T3p, T3e);
+						       T70 = FNMS(KP707106781, T3p, T3e);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T97, Tbk, T9j, T9k, Tbh, T9i, Tbi, T9e;
+			      {
+				   E T9g, T5f, T18, Ted, TdY, ThI, T4A, T95, T9b, T57, T1u, Te1, Te4, ThM, T52;
+				   E T9c, T5h, T4K, TdZ, T1f, ThJ, Teg, T5g, T4F, T1j, Te8, T98, T4W, T4N, T1m;
+				   E Te7, T4Q, T1n, Te6;
+				   {
+					E T1q, Te3, T4Y, T1t, Te2, T51;
+					{
+					     E T15, T5b, T14, TdX, T5e, T16, T4x, T4y;
+					     {
+						  E T12, T13, T5c, T5d, T71, T3z;
+						  T12 = cr[WS(rs, 1)];
+						  T71 = FNMS(KP707106781, T3y, T3v);
+						  T3z = FMA(KP707106781, T3y, T3v);
+						  {
+						       E Tbc, T8Y, Tbd, T91;
+						       Tbc = FMA(KP707106781, T8X, T8W);
+						       T8Y = FNMS(KP707106781, T8X, T8W);
+						       Tbd = FMA(KP707106781, T90, T8Z);
+						       T91 = FNMS(KP707106781, T90, T8Z);
+						       T72 = FNMS(KP668178637, T71, T70);
+						       T7q = FMA(KP668178637, T70, T71);
+						       T5G = FNMS(KP198912367, T3q, T3z);
+						       T3A = FMA(KP198912367, T3z, T3q);
+						       Tbe = FNMS(KP198912367, Tbd, Tbc);
+						       TbD = FMA(KP198912367, Tbc, Tbd);
+						       T9T = FNMS(KP668178637, T8Y, T91);
+						       T92 = FMA(KP668178637, T91, T8Y);
+						       T13 = ci[WS(rs, 30)];
+						  }
+						  T5c = ci[WS(rs, 46)];
+						  T5d = cr[WS(rs, 49)];
+						  T15 = cr[WS(rs, 17)];
+						  T5b = T12 - T13;
+						  T14 = T12 + T13;
+						  TdX = T5c - T5d;
+						  T5e = T5c + T5d;
+						  T16 = ci[WS(rs, 14)];
+						  T4x = ci[WS(rs, 62)];
+						  T4y = cr[WS(rs, 33)];
+					     }
+					     {
+						  E T4w, T17, TdW, T4z;
+						  T9g = T5b + T5e;
+						  T5f = T5b - T5e;
+						  T4w = T15 - T16;
+						  T17 = T15 + T16;
+						  TdW = T4x - T4y;
+						  T4z = T4x + T4y;
+						  T18 = T14 + T17;
+						  Ted = T14 - T17;
+						  TdY = TdW - TdX;
+						  ThI = TdW + TdX;
+						  T4A = T4w + T4z;
+						  T95 = T4z - T4w;
+					     }
+					}
+					{
+					     E T1r, T53, T56, T1s, T4Z, T50;
+					     {
+						  E T1o, T1p, T54, T55;
+						  T1o = ci[WS(rs, 2)];
+						  T1p = cr[WS(rs, 29)];
+						  T54 = ci[WS(rs, 50)];
+						  T55 = cr[WS(rs, 45)];
+						  T1r = cr[WS(rs, 13)];
+						  T53 = T1o - T1p;
+						  T1q = T1o + T1p;
+						  Te3 = T54 - T55;
+						  T56 = T54 + T55;
+						  T1s = ci[WS(rs, 18)];
+						  T4Z = ci[WS(rs, 34)];
+						  T50 = cr[WS(rs, 61)];
+					     }
+					     T9b = T53 + T56;
+					     T57 = T53 - T56;
+					     T4Y = T1r - T1s;
+					     T1t = T1r + T1s;
+					     Te2 = T4Z - T50;
+					     T51 = T4Z + T50;
+					}
+					T1u = T1q + T1t;
+					Te1 = T1q - T1t;
+					Te4 = Te2 - Te3;
+					ThM = Te2 + Te3;
+					T52 = T4Y - T51;
+					T9c = T4Y + T51;
+					{
+					     E T1c, T4B, T1b, Tee, T4J, T1d, T4C, T4D;
+					     {
+						  E T19, T1a, T4H, T4I;
+						  T19 = cr[WS(rs, 9)];
+						  T1a = ci[WS(rs, 22)];
+						  T4H = ci[WS(rs, 38)];
+						  T4I = cr[WS(rs, 57)];
+						  T1c = ci[WS(rs, 6)];
+						  T4B = T19 - T1a;
+						  T1b = T19 + T1a;
+						  Tee = T4H - T4I;
+						  T4J = T4H + T4I;
+						  T1d = cr[WS(rs, 25)];
+						  T4C = ci[WS(rs, 54)];
+						  T4D = cr[WS(rs, 41)];
+					     }
+					     {
+						  E T1k, T4S, T4V, T1l, T4O, T4P;
+						  {
+						       E T1h, T1i, T4T, T4U;
+						       T1h = cr[WS(rs, 5)];
+						       {
+							    E T4G, T1e, Tef, T4E;
+							    T4G = T1c - T1d;
+							    T1e = T1c + T1d;
+							    Tef = T4C - T4D;
+							    T4E = T4C + T4D;
+							    T5h = T4G - T4J;
+							    T4K = T4G + T4J;
+							    TdZ = T1b - T1e;
+							    T1f = T1b + T1e;
+							    ThJ = Tef + Tee;
+							    Teg = Tee - Tef;
+							    T5g = T4B - T4E;
+							    T4F = T4B + T4E;
+							    T1i = ci[WS(rs, 26)];
+						       }
+						       T4T = ci[WS(rs, 42)];
+						       T4U = cr[WS(rs, 53)];
+						       T1k = cr[WS(rs, 21)];
+						       T4S = T1h - T1i;
+						       T1j = T1h + T1i;
+						       Te8 = T4T - T4U;
+						       T4V = T4T + T4U;
+						       T1l = ci[WS(rs, 10)];
+						       T4O = ci[WS(rs, 58)];
+						       T4P = cr[WS(rs, 37)];
+						  }
+						  T98 = T4S + T4V;
+						  T4W = T4S - T4V;
+						  T4N = T1k - T1l;
+						  T1m = T1k + T1l;
+						  Te7 = T4O - T4P;
+						  T4Q = T4O + T4P;
+					     }
+					}
+				   }
+				   T1n = T1j + T1m;
+				   Te6 = T1j - T1m;
+				   {
+					E Te9, ThL, T4R, T99;
+					Te9 = Te7 - Te8;
+					ThL = Te7 + Te8;
+					T4R = T4N + T4Q;
+					T99 = T4Q - T4N;
+					{
+					     E Tjr, ThK, Tjs, ThN;
+					     {
+						  E T1g, T1v, Tjp, Tjo;
+						  Tjr = T18 - T1f;
+						  T1g = T18 + T1f;
+						  T1v = T1n + T1u;
+						  Tjp = T1n - T1u;
+						  ThK = ThI + ThJ;
+						  Tjo = ThI - ThJ;
+						  ThH = T1g - T1v;
+						  T1w = T1g + T1v;
+						  Tke = Tjp + Tjo;
+						  Tjq = Tjo - Tjp;
+						  Tjs = ThM - ThL;
+						  ThN = ThL + ThM;
+					     }
+					     {
+						  E Tg6, Te0, Tg9, Teh, Tej, Tei, Tga, Teb, Te5, Tea;
+						  Tg6 = TdZ + TdY;
+						  Te0 = TdY - TdZ;
+						  Tkf = Tjr + Tjs;
+						  Tjt = Tjr - Tjs;
+						  TiK = ThK + ThN;
+						  ThO = ThK - ThN;
+						  Tg9 = Ted + Teg;
+						  Teh = Ted - Teg;
+						  Tej = Te4 - Te1;
+						  Te5 = Te1 + Te4;
+						  Tea = Te6 - Te9;
+						  Tei = Te6 + Te9;
+						  Tga = Tea + Te5;
+						  Teb = Te5 - Tea;
+						  {
+						       E T9h, T4M, T78, T96, T5k, T5l, T75, T5j, T76, T59;
+						       {
+							    E T5i, Tg7, Tek, T4L, T4X, T58;
+							    T9h = T4F + T4K;
+							    T4L = T4F - T4K;
+							    Tgb = FNMS(KP707106781, Tga, Tg9);
+							    TgT = FMA(KP707106781, Tga, Tg9);
+							    Tfc = FMA(KP707106781, Teb, Te0);
+							    Tec = FNMS(KP707106781, Teb, Te0);
+							    Tg7 = Tei + Tej;
+							    Tek = Tei - Tej;
+							    T4M = FMA(KP707106781, T4L, T4A);
+							    T78 = FNMS(KP707106781, T4L, T4A);
+							    Tg8 = FNMS(KP707106781, Tg7, Tg6);
+							    TgU = FMA(KP707106781, Tg7, Tg6);
+							    Tfd = FMA(KP707106781, Tek, Teh);
+							    Tel = FNMS(KP707106781, Tek, Teh);
+							    T5i = T5g + T5h;
+							    T96 = T5h - T5g;
+							    T5k = FNMS(KP414213562, T4R, T4W);
+							    T4X = FMA(KP414213562, T4W, T4R);
+							    T58 = FNMS(KP414213562, T57, T52);
+							    T5l = FMA(KP414213562, T52, T57);
+							    T75 = FNMS(KP707106781, T5i, T5f);
+							    T5j = FMA(KP707106781, T5i, T5f);
+							    T76 = T4X - T58;
+							    T59 = T4X + T58;
+						       }
+						       {
+							    E T79, T5m, T9a, T9d;
+							    T77 = FNMS(KP923879532, T76, T75);
+							    T83 = FMA(KP923879532, T76, T75);
+							    T6i = FMA(KP923879532, T59, T4M);
+							    T5a = FNMS(KP923879532, T59, T4M);
+							    T79 = T5l - T5k;
+							    T5m = T5k + T5l;
+							    T97 = FNMS(KP707106781, T96, T95);
+							    Tbk = FMA(KP707106781, T96, T95);
+							    T7a = FNMS(KP923879532, T79, T78);
+							    T82 = FMA(KP923879532, T79, T78);
+							    T6j = FMA(KP923879532, T5m, T5j);
+							    T5n = FNMS(KP923879532, T5m, T5j);
+							    T9j = FNMS(KP414213562, T98, T99);
+							    T9a = FMA(KP414213562, T99, T98);
+							    T9d = FMA(KP414213562, T9c, T9b);
+							    T9k = FNMS(KP414213562, T9b, T9c);
+							    Tbh = FMA(KP707106781, T9h, T9g);
+							    T9i = FNMS(KP707106781, T9h, T9g);
+							    Tbi = T9a + T9d;
+							    T9e = T9a - T9d;
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T9z, T4m, T1D, TdM, ThR, Tdx, T3H, T9o, T9r, T4e, T1Z, TdA, TdD, ThV, T49;
+				   E T9s, T4o, T3R, Tdy, T1K, ThS, TdP, T4n, T3M, T1O, T3V, TdH, T3U, T1R, T3W;
+				   E T9u, T43;
+				   {
+					E T1V, T46, TdC, T45, T1Y, T47, T48, TdB;
+					{
+					     E Tdw, T3D, T3G, Tdv, T4a, T4d;
+					     {
+						  E T4i, T1z, T3E, T4l, T1C, T3F;
+						  {
+						       E T4j, T4k, T1A, T1B;
+						       {
+							    E T1x, Tbl, T9l, T1y;
+							    T1x = ci[0];
+							    Tbj = FNMS(KP923879532, Tbi, Tbh);
+							    Tcc = FMA(KP923879532, Tbi, Tbh);
+							    Tas = FMA(KP923879532, T9e, T97);
+							    T9f = FNMS(KP923879532, T9e, T97);
+							    Tbl = T9j - T9k;
+							    T9l = T9j + T9k;
+							    T1y = cr[WS(rs, 31)];
+							    T4j = ci[WS(rs, 48)];
+							    Tbm = FNMS(KP923879532, Tbl, Tbk);
+							    Tcb = FMA(KP923879532, Tbl, Tbk);
+							    Tar = FNMS(KP923879532, T9l, T9i);
+							    T9m = FMA(KP923879532, T9l, T9i);
+							    T4i = T1x - T1y;
+							    T1z = T1x + T1y;
+							    T4k = cr[WS(rs, 47)];
+						       }
+						       T1A = cr[WS(rs, 15)];
+						       T1B = ci[WS(rs, 16)];
+						       T3E = ci[WS(rs, 32)];
+						       Tdw = T4j - T4k;
+						       T4l = T4j + T4k;
+						       T3D = T1A - T1B;
+						       T1C = T1A + T1B;
+						       T3F = cr[WS(rs, 63)];
+						  }
+						  T9z = T4i + T4l;
+						  T4m = T4i - T4l;
+						  T1D = T1z + T1C;
+						  TdM = T1z - T1C;
+						  T3G = T3E + T3F;
+						  Tdv = T3E - T3F;
+					     }
+					     {
+						  E T4b, T4c, T1T, T1U, T1W, T1X;
+						  T1T = ci[WS(rs, 4)];
+						  T1U = cr[WS(rs, 27)];
+						  ThR = Tdv + Tdw;
+						  Tdx = Tdv - Tdw;
+						  T3H = T3D - T3G;
+						  T9o = T3D + T3G;
+						  T4a = T1T - T1U;
+						  T1V = T1T + T1U;
+						  T4b = ci[WS(rs, 52)];
+						  T4c = cr[WS(rs, 43)];
+						  T1W = cr[WS(rs, 11)];
+						  T1X = ci[WS(rs, 20)];
+						  T46 = ci[WS(rs, 36)];
+						  TdC = T4b - T4c;
+						  T4d = T4b + T4c;
+						  T45 = T1W - T1X;
+						  T1Y = T1W + T1X;
+						  T47 = cr[WS(rs, 59)];
+					     }
+					     T9r = T4a + T4d;
+					     T4e = T4a - T4d;
+					}
+					T1Z = T1V + T1Y;
+					TdA = T1V - T1Y;
+					T48 = T46 + T47;
+					TdB = T46 - T47;
+					{
+					     E T3I, T1G, T3J, TdN, T3Q, T3N, T1J, T3K, T3Z, T42;
+					     {
+						  E T3O, T3P, T1E, T1F, T1H, T1I;
+						  T1E = cr[WS(rs, 7)];
+						  T1F = ci[WS(rs, 24)];
+						  TdD = TdB - TdC;
+						  ThV = TdB + TdC;
+						  T49 = T45 - T48;
+						  T9s = T45 + T48;
+						  T3I = T1E - T1F;
+						  T1G = T1E + T1F;
+						  T3O = ci[WS(rs, 40)];
+						  T3P = cr[WS(rs, 55)];
+						  T1H = ci[WS(rs, 8)];
+						  T1I = cr[WS(rs, 23)];
+						  T3J = ci[WS(rs, 56)];
+						  TdN = T3O - T3P;
+						  T3Q = T3O + T3P;
+						  T3N = T1H - T1I;
+						  T1J = T1H + T1I;
+						  T3K = cr[WS(rs, 39)];
+					     }
+					     {
+						  E T40, T41, T1P, T1Q;
+						  {
+						       E T1M, TdO, T3L, T1N;
+						       T1M = cr[WS(rs, 3)];
+						       T4o = T3N - T3Q;
+						       T3R = T3N + T3Q;
+						       Tdy = T1G - T1J;
+						       T1K = T1G + T1J;
+						       TdO = T3J - T3K;
+						       T3L = T3J + T3K;
+						       T1N = ci[WS(rs, 28)];
+						       T40 = ci[WS(rs, 44)];
+						       ThS = TdO + TdN;
+						       TdP = TdN - TdO;
+						       T4n = T3I - T3L;
+						       T3M = T3I + T3L;
+						       T3Z = T1M - T1N;
+						       T1O = T1M + T1N;
+						       T41 = cr[WS(rs, 51)];
+						  }
+						  T1P = cr[WS(rs, 19)];
+						  T1Q = ci[WS(rs, 12)];
+						  T3V = ci[WS(rs, 60)];
+						  TdH = T40 - T41;
+						  T42 = T40 + T41;
+						  T3U = T1P - T1Q;
+						  T1R = T1P + T1Q;
+						  T3W = cr[WS(rs, 35)];
+					     }
+					     T9u = T3Z + T42;
+					     T43 = T3Z - T42;
+					}
+				   }
+				   {
+					E T1S, TdF, T3X, TdG;
+					T1S = T1O + T1R;
+					TdF = T1O - T1R;
+					T3X = T3V + T3W;
+					TdG = T3V - T3W;
+					{
+					     E TdI, T3Y, T9v, ThT, ThW;
+					     {
+						  E Tjk, Tji, ThU, Tjh, T1L, T20, Tjl;
+						  Tjk = T1D - T1K;
+						  T1L = T1D + T1K;
+						  T20 = T1S + T1Z;
+						  Tji = T1S - T1Z;
+						  TdI = TdG - TdH;
+						  ThU = TdG + TdH;
+						  T3Y = T3U + T3X;
+						  T9v = T3U - T3X;
+						  ThQ = T1L - T20;
+						  T21 = T1L + T20;
+						  ThT = ThR + ThS;
+						  Tjh = ThR - ThS;
+						  Tjl = ThV - ThU;
+						  ThW = ThU + ThV;
+						  Tkb = Tji + Tjh;
+						  Tjj = Tjh - Tji;
+						  Tkc = Tjk + Tjl;
+						  Tjm = Tjk - Tjl;
+					     }
+					     {
+						  E TfZ, Tdz, Tg2, TdQ, TdS, TdR, Tg3, TdK, TdE, TdJ;
+						  TfZ = Tdy + Tdx;
+						  Tdz = Tdx - Tdy;
+						  Tg2 = TdM + TdP;
+						  TdQ = TdM - TdP;
+						  TdS = TdD - TdA;
+						  TdE = TdA + TdD;
+						  TiL = ThT + ThW;
+						  ThX = ThT - ThW;
+						  TdJ = TdF - TdI;
+						  TdR = TdF + TdI;
+						  Tg3 = TdJ + TdE;
+						  TdK = TdE - TdJ;
+						  {
+						       E T9A, T3T, T7f, T9p, T4r, T4s, T7c, T4q, T7d, T4g;
+						       {
+							    E T4p, Tg0, TdT, T3S, T44, T4f;
+							    T9A = T3M + T3R;
+							    T3S = T3M - T3R;
+							    Tg4 = FNMS(KP707106781, Tg3, Tg2);
+							    TgW = FMA(KP707106781, Tg3, Tg2);
+							    Tf9 = FMA(KP707106781, TdK, Tdz);
+							    TdL = FNMS(KP707106781, TdK, Tdz);
+							    Tg0 = TdR + TdS;
+							    TdT = TdR - TdS;
+							    T3T = FMA(KP707106781, T3S, T3H);
+							    T7f = FNMS(KP707106781, T3S, T3H);
+							    Tg1 = FNMS(KP707106781, Tg0, TfZ);
+							    TgX = FMA(KP707106781, Tg0, TfZ);
+							    Tfa = FMA(KP707106781, TdT, TdQ);
+							    TdU = FNMS(KP707106781, TdT, TdQ);
+							    T4p = T4n + T4o;
+							    T9p = T4n - T4o;
+							    T4r = FNMS(KP414213562, T3Y, T43);
+							    T44 = FMA(KP414213562, T43, T3Y);
+							    T4f = FNMS(KP414213562, T4e, T49);
+							    T4s = FMA(KP414213562, T49, T4e);
+							    T7c = FNMS(KP707106781, T4p, T4m);
+							    T4q = FMA(KP707106781, T4p, T4m);
+							    T7d = T44 - T4f;
+							    T4g = T44 + T4f;
+						       }
+						       {
+							    E T7g, T4t, T9t, T9w;
+							    T7e = FNMS(KP923879532, T7d, T7c);
+							    T80 = FMA(KP923879532, T7d, T7c);
+							    T6f = FMA(KP923879532, T4g, T3T);
+							    T4h = FNMS(KP923879532, T4g, T3T);
+							    T7g = T4s - T4r;
+							    T4t = T4r + T4s;
+							    T9q = FNMS(KP707106781, T9p, T9o);
+							    Tbr = FMA(KP707106781, T9p, T9o);
+							    T7h = FNMS(KP923879532, T7g, T7f);
+							    T7Z = FMA(KP923879532, T7g, T7f);
+							    T6g = FMA(KP923879532, T4t, T4q);
+							    T4u = FNMS(KP923879532, T4t, T4q);
+							    T9D = FNMS(KP414213562, T9r, T9s);
+							    T9t = FMA(KP414213562, T9s, T9r);
+							    T9w = FNMS(KP414213562, T9v, T9u);
+							    T9C = FMA(KP414213562, T9u, T9v);
+							    Tbo = FMA(KP707106781, T9A, T9z);
+							    T9B = FNMS(KP707106781, T9A, T9z);
+							    Tbp = T9w + T9t;
+							    T9x = T9t - T9w;
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E Tbq, Tcf, Tav, T9y, Tbt, Tce, Tau, T9F, T6p, T6d, T6c, T6q, Thf, The, Thd;
+			 {
+			      E Tk9, Tkm, TjP, TjO, TjN;
+			      {
+				   E Tj0, TiS, TiU, Tj3, Tj1, Tj4, TiY, Tj2;
+				   {
+					E TiQ, TiW, TiV, TiR, TiD, TiG, TiN, TiF, TiO;
+					{
+					     E T11, T22, TiJ, TiE, TiM, Tbs, T9E;
+					     TiQ = Tv - T10;
+					     T11 = Tv + T10;
+					     Tbq = FNMS(KP923879532, Tbp, Tbo);
+					     Tcf = FMA(KP923879532, Tbp, Tbo);
+					     Tav = FMA(KP923879532, T9x, T9q);
+					     T9y = FNMS(KP923879532, T9x, T9q);
+					     Tbs = T9C + T9D;
+					     T9E = T9C - T9D;
+					     T22 = T1w + T21;
+					     TiW = T1w - T21;
+					     TiV = TiH - TiI;
+					     TiJ = TiH + TiI;
+					     Tbt = FNMS(KP923879532, Tbs, Tbr);
+					     Tce = FMA(KP923879532, Tbs, Tbr);
+					     Tau = FMA(KP923879532, T9E, T9B);
+					     T9F = FNMS(KP923879532, T9E, T9B);
+					     TiE = T11 - T22;
+					     TiR = TiL - TiK;
+					     TiM = TiK + TiL;
+					     cr[0] = T11 + T22;
+					     TiD = W[62];
+					     TiG = W[63];
+					     ci[0] = TiJ + TiM;
+					     TiN = TiJ - TiM;
+					     TiF = TiD * TiE;
+					     TiO = TiG * TiE;
+					}
+					cr[WS(rs, 32)] = FNMS(TiG, TiN, TiF);
+					ci[WS(rs, 32)] = FMA(TiD, TiN, TiO);
+					Tj0 = TiQ + TiR;
+					TiS = TiQ - TiR;
+					{
+					     E TiP, TiX, TiT, TiZ;
+					     TiP = W[94];
+					     TiU = W[95];
+					     TiZ = W[30];
+					     Tj3 = TiW + TiV;
+					     TiX = TiV - TiW;
+					     TiT = TiP * TiS;
+					     Tj1 = TiZ * Tj0;
+					     Tj4 = TiZ * Tj3;
+					     TiY = TiP * TiX;
+					     cr[WS(rs, 48)] = FNMS(TiU, TiX, TiT);
+					     Tj2 = W[31];
+					}
+				   }
+				   {
+					E Tii, Til, Tik, Tih, Tim;
+					{
+					     E Tib, Tit, Tio, ThG, ThP, ThY, Tie, Tip, Tic, Tid;
+					     Tib = Ti3 + Tia;
+					     Tit = Tia - Ti3;
+					     ci[WS(rs, 48)] = FMA(TiU, TiS, TiY);
+					     Tio = Thy - ThF;
+					     ThG = Thy + ThF;
+					     ci[WS(rs, 16)] = FMA(Tj2, Tj0, Tj4);
+					     cr[WS(rs, 16)] = FNMS(Tj2, Tj3, Tj1);
+					     ThP = ThH - ThO;
+					     Tic = ThH + ThO;
+					     Tid = ThX - ThQ;
+					     ThY = ThQ + ThX;
+					     Tie = Tic + Tid;
+					     Tip = Tid - Tic;
+					     {
+						  E Tiy, TiB, Ti0, Tiz, TiC, TiA;
+						  {
+						       E Tin, Tis, Tiq, ThZ, Tiu, Tir, Tiw, Tix, Tiv;
+						       Tin = W[110];
+						       Tis = W[111];
+						       Tiy = FMA(KP707106781, Tip, Tio);
+						       Tiq = FNMS(KP707106781, Tip, Tio);
+						       ThZ = ThP + ThY;
+						       Tiu = ThP - ThY;
+						       Tir = Tin * Tiq;
+						       Tix = W[46];
+						       TiB = FMA(KP707106781, Tiu, Tit);
+						       Tiv = FNMS(KP707106781, Tiu, Tit);
+						       Ti0 = FNMS(KP707106781, ThZ, ThG);
+						       Tii = FMA(KP707106781, ThZ, ThG);
+						       cr[WS(rs, 56)] = FNMS(Tis, Tiv, Tir);
+						       Tiw = Tin * Tiv;
+						       Tiz = Tix * Tiy;
+						       TiC = Tix * TiB;
+						       TiA = W[47];
+						       ci[WS(rs, 56)] = FMA(Tis, Tiq, Tiw);
+						  }
+						  {
+						       E Tif, Ti2, Thx, Tig, Ti1;
+						       Til = FMA(KP707106781, Tie, Tib);
+						       Tif = FNMS(KP707106781, Tie, Tib);
+						       Ti2 = W[79];
+						       ci[WS(rs, 24)] = FMA(TiA, Tiy, TiC);
+						       cr[WS(rs, 24)] = FNMS(TiA, TiB, Tiz);
+						       Thx = W[78];
+						       Tig = Ti2 * Ti0;
+						       Tik = W[15];
+						       Ti1 = Thx * Ti0;
+						       ci[WS(rs, 40)] = FMA(Thx, Tif, Tig);
+						       Tih = W[14];
+						       Tim = Tik * Tii;
+						       cr[WS(rs, 40)] = FNMS(Ti2, Tif, Ti1);
+						  }
+					     }
+					}
+					{
+					     E TjF, TjI, TjU, Tk2, TjZ, Tk5, Tjw, TjM;
+					     {
+						  E TjX, TjG, Tju, Tjg, TjS, Tjn, TjH, Tjf, TjE, Tij, TjT, Tjv, TjY;
+						  TjE = TjC - TjD;
+						  Tk9 = TjC + TjD;
+						  Tij = Tih * Tii;
+						  ci[WS(rs, 8)] = FMA(Tih, Til, Tim);
+						  Tkm = Tje + Tjb;
+						  Tjf = Tjb - Tje;
+						  TjX = FNMS(KP707106781, TjE, TjB);
+						  TjF = FMA(KP707106781, TjE, TjB);
+						  cr[WS(rs, 8)] = FNMS(Tik, Til, Tij);
+						  TjG = FMA(KP414213562, Tjq, Tjt);
+						  Tju = FNMS(KP414213562, Tjt, Tjq);
+						  Tjg = FMA(KP707106781, Tjf, Tj8);
+						  TjS = FNMS(KP707106781, Tjf, Tj8);
+						  Tjn = FMA(KP414213562, Tjm, Tjj);
+						  TjH = FNMS(KP414213562, Tjj, Tjm);
+						  TjI = TjG - TjH;
+						  TjT = TjG + TjH;
+						  Tjv = Tjn - Tju;
+						  TjY = Tju + Tjn;
+						  TjU = FNMS(KP923879532, TjT, TjS);
+						  Tk2 = FMA(KP923879532, TjT, TjS);
+						  TjZ = FNMS(KP923879532, TjY, TjX);
+						  Tk5 = FMA(KP923879532, TjY, TjX);
+						  Tjw = FNMS(KP923879532, Tjv, Tjg);
+						  TjM = FMA(KP923879532, Tjv, Tjg);
+					     }
+					     {
+						  E Tk4, Tk3, TjR, TjW, TjJ, Tjy, Tj5;
+						  TjR = W[54];
+						  TjW = W[55];
+						  {
+						       E Tk1, Tk0, TjV, Tk6;
+						       Tk1 = W[118];
+						       Tk4 = W[119];
+						       Tk0 = TjR * TjZ;
+						       TjV = TjR * TjU;
+						       Tk6 = Tk1 * Tk5;
+						       Tk3 = Tk1 * Tk2;
+						       ci[WS(rs, 28)] = FMA(TjW, TjU, Tk0);
+						       cr[WS(rs, 28)] = FNMS(TjW, TjZ, TjV);
+						       ci[WS(rs, 60)] = FMA(Tk4, Tk2, Tk6);
+						  }
+						  cr[WS(rs, 60)] = FNMS(Tk4, Tk5, Tk3);
+						  TjP = FMA(KP923879532, TjI, TjF);
+						  TjJ = FNMS(KP923879532, TjI, TjF);
+						  Tjy = W[87];
+						  Tj5 = W[86];
+						  {
+						       E TjL, TjQ, TjK, Tjx;
+						       TjO = W[23];
+						       TjK = Tjy * Tjw;
+						       Tjx = Tj5 * Tjw;
+						       TjL = W[22];
+						       TjQ = TjO * TjM;
+						       ci[WS(rs, 44)] = FMA(Tj5, TjJ, TjK);
+						       cr[WS(rs, 44)] = FNMS(Tjy, TjJ, Tjx);
+						       TjN = TjL * TjM;
+						       ci[WS(rs, 12)] = FMA(TjL, TjP, TjQ);
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5T, T5S, T5R, Tkx, Tkw, Tkv;
+				   {
+					E Tkn, Tkq, TkC, TkK, TkH, TkN, Tki, Tku;
+					{
+					     E Tkg, Tko, TkF, Tka, TkA, Tkd, Tkp, TkB, Tkh, TkG;
+					     cr[WS(rs, 12)] = FNMS(TjO, TjP, TjN);
+					     Tkg = FMA(KP414213562, Tkf, Tke);
+					     Tko = FNMS(KP414213562, Tke, Tkf);
+					     TkF = FMA(KP707106781, Tkm, Tkl);
+					     Tkn = FNMS(KP707106781, Tkm, Tkl);
+					     Tka = FNMS(KP707106781, Tk9, Tk8);
+					     TkA = FMA(KP707106781, Tk9, Tk8);
+					     Tkd = FNMS(KP414213562, Tkc, Tkb);
+					     Tkp = FMA(KP414213562, Tkb, Tkc);
+					     Tkq = Tko - Tkp;
+					     TkB = Tko + Tkp;
+					     Tkh = Tkd - Tkg;
+					     TkG = Tkg + Tkd;
+					     TkC = FNMS(KP923879532, TkB, TkA);
+					     TkK = FMA(KP923879532, TkB, TkA);
+					     TkH = FNMS(KP923879532, TkG, TkF);
+					     TkN = FMA(KP923879532, TkG, TkF);
+					     Tki = FNMS(KP923879532, Tkh, Tka);
+					     Tku = FMA(KP923879532, Tkh, Tka);
+					}
+					{
+					     E TkM, TkL, Tkz, TkE, Tkr, Tkk, Tk7;
+					     Tkz = W[70];
+					     TkE = W[71];
+					     {
+						  E TkJ, TkI, TkD, TkO;
+						  TkJ = W[6];
+						  TkM = W[7];
+						  TkI = Tkz * TkH;
+						  TkD = Tkz * TkC;
+						  TkO = TkJ * TkN;
+						  TkL = TkJ * TkK;
+						  ci[WS(rs, 36)] = FMA(TkE, TkC, TkI);
+						  cr[WS(rs, 36)] = FNMS(TkE, TkH, TkD);
+						  ci[WS(rs, 4)] = FMA(TkM, TkK, TkO);
+					     }
+					     cr[WS(rs, 4)] = FNMS(TkM, TkN, TkL);
+					     Tkx = FMA(KP923879532, Tkq, Tkn);
+					     Tkr = FNMS(KP923879532, Tkq, Tkn);
+					     Tkk = W[103];
+					     Tk7 = W[102];
+					     {
+						  E Tkt, Tky, Tks, Tkj;
+						  Tkw = W[39];
+						  Tks = Tkk * Tki;
+						  Tkj = Tk7 * Tki;
+						  Tkt = W[38];
+						  Tky = Tkw * Tku;
+						  ci[WS(rs, 52)] = FMA(Tk7, Tkr, Tks);
+						  cr[WS(rs, 52)] = FNMS(Tkk, Tkr, Tkj);
+						  Tkv = Tkt * Tku;
+						  ci[WS(rs, 20)] = FMA(Tkt, Tkx, Tky);
+					     }
+					}
+				   }
+				   {
+					E T5J, T5M, T66, T5Y, T69, T63, T5Q, T5q;
+					{
+					     E T5o, T4v, T61, T5X, T3C, T5W, T62, T5p;
+					     {
+						  E T5K, T5L, T5F, T5I, T2I, T3B;
+						  T5F = FNMS(KP923879532, T5E, T5B);
+						  T6p = FMA(KP923879532, T5E, T5B);
+						  T6d = T5G + T5H;
+						  T5I = T5G - T5H;
+						  cr[WS(rs, 20)] = FNMS(Tkw, Tkx, Tkv);
+						  T5o = FNMS(KP820678790, T5n, T5a);
+						  T5K = FMA(KP820678790, T5a, T5n);
+						  T5L = FNMS(KP820678790, T4h, T4u);
+						  T4v = FMA(KP820678790, T4u, T4h);
+						  T5J = FMA(KP980785280, T5I, T5F);
+						  T61 = FNMS(KP980785280, T5I, T5F);
+						  T2I = FNMS(KP923879532, T2H, T2k);
+						  T6c = FMA(KP923879532, T2H, T2k);
+						  T6q = T3A + T39;
+						  T3B = T39 - T3A;
+						  T5X = T5K + T5L;
+						  T5M = T5K - T5L;
+						  T3C = FMA(KP980785280, T3B, T2I);
+						  T5W = FNMS(KP980785280, T3B, T2I);
+					     }
+					     T62 = T5o + T4v;
+					     T5p = T4v - T5o;
+					     T66 = FMA(KP773010453, T5X, T5W);
+					     T5Y = FNMS(KP773010453, T5X, T5W);
+					     T69 = FMA(KP773010453, T62, T61);
+					     T63 = FNMS(KP773010453, T62, T61);
+					     T5Q = FMA(KP773010453, T5p, T3C);
+					     T5q = FNMS(KP773010453, T5p, T3C);
+					}
+					{
+					     E T68, T67, T5V, T60, T5N, T5s, T23;
+					     T5V = W[48];
+					     T60 = W[49];
+					     {
+						  E T65, T64, T5Z, T6a;
+						  T65 = W[112];
+						  T68 = W[113];
+						  T64 = T5V * T63;
+						  T5Z = T5V * T5Y;
+						  T6a = T65 * T69;
+						  T67 = T65 * T66;
+						  ci[WS(rs, 25)] = FMA(T60, T5Y, T64);
+						  cr[WS(rs, 25)] = FNMS(T60, T63, T5Z);
+						  ci[WS(rs, 57)] = FMA(T68, T66, T6a);
+					     }
+					     cr[WS(rs, 57)] = FNMS(T68, T69, T67);
+					     T5T = FMA(KP773010453, T5M, T5J);
+					     T5N = FNMS(KP773010453, T5M, T5J);
+					     T5s = W[81];
+					     T23 = W[80];
+					     {
+						  E T5P, T5U, T5O, T5r;
+						  T5S = W[17];
+						  T5O = T5s * T5q;
+						  T5r = T23 * T5q;
+						  T5P = W[16];
+						  T5U = T5S * T5Q;
+						  ci[WS(rs, 41)] = FMA(T23, T5N, T5O);
+						  cr[WS(rs, 41)] = FNMS(T5s, T5N, T5r);
+						  T5R = T5P * T5Q;
+						  ci[WS(rs, 9)] = FMA(T5P, T5T, T5U);
+					     }
+					}
+				   }
+				   {
+					E Th3, TgR, TgQ, Th4, TgN, TgM, TgL;
+					{
+					     E TgG, TgF, Tge, Tgu, TgK, TgC, Tgx, Tgr;
+					     {
+						  E Tgp, Tgo, Tgd, Tgn, TfY, TgA, TgB, Tgq;
+						  {
+						       E Tgj, Tgm, Tg5, Tgc, TfQ, TfX;
+						       Tg5 = FMA(KP668178637, Tg4, Tg1);
+						       Tgp = FNMS(KP668178637, Tg1, Tg4);
+						       Tgo = FMA(KP668178637, Tg8, Tgb);
+						       Tgc = FNMS(KP668178637, Tgb, Tg8);
+						       cr[WS(rs, 9)] = FNMS(T5S, T5T, T5R);
+						       Th3 = FMA(KP707106781, Tgi, Tgh);
+						       Tgj = FNMS(KP707106781, Tgi, Tgh);
+						       Tgm = Tgk - Tgl;
+						       TgR = Tgk + Tgl;
+						       TgG = Tgc + Tg5;
+						       Tgd = Tg5 - Tgc;
+						       TfQ = FNMS(KP707106781, TfP, TfO);
+						       TgQ = FMA(KP707106781, TfP, TfO);
+						       Th4 = TfW + TfT;
+						       TfX = TfT - TfW;
+						       Tgn = FMA(KP923879532, Tgm, Tgj);
+						       TgF = FNMS(KP923879532, Tgm, Tgj);
+						       TfY = FMA(KP923879532, TfX, TfQ);
+						       TgA = FNMS(KP923879532, TfX, TfQ);
+						  }
+						  TgB = Tgo + Tgp;
+						  Tgq = Tgo - Tgp;
+						  Tge = FNMS(KP831469612, Tgd, TfY);
+						  Tgu = FMA(KP831469612, Tgd, TfY);
+						  TgK = FMA(KP831469612, TgB, TgA);
+						  TgC = FNMS(KP831469612, TgB, TgA);
+						  Tgx = FMA(KP831469612, Tgq, Tgn);
+						  Tgr = FNMS(KP831469612, Tgq, Tgn);
+					     }
+					     {
+						  E Tgw, Tgv, TfN, Tgg, TgH, TgE, Tgz;
+						  TfN = W[82];
+						  Tgg = W[83];
+						  {
+						       E Tgt, Tgs, Tgf, Tgy;
+						       Tgt = W[18];
+						       Tgw = W[19];
+						       Tgs = TfN * Tgr;
+						       Tgf = TfN * Tge;
+						       Tgy = Tgt * Tgx;
+						       Tgv = Tgt * Tgu;
+						       ci[WS(rs, 42)] = FMA(Tgg, Tge, Tgs);
+						       cr[WS(rs, 42)] = FNMS(Tgg, Tgr, Tgf);
+						       ci[WS(rs, 10)] = FMA(Tgw, Tgu, Tgy);
+						  }
+						  cr[WS(rs, 10)] = FNMS(Tgw, Tgx, Tgv);
+						  TgN = FMA(KP831469612, TgG, TgF);
+						  TgH = FNMS(KP831469612, TgG, TgF);
+						  TgE = W[51];
+						  Tgz = W[50];
+						  {
+						       E TgJ, TgO, TgI, TgD;
+						       TgM = W[115];
+						       TgI = TgE * TgC;
+						       TgD = Tgz * TgC;
+						       TgJ = W[114];
+						       TgO = TgM * TgK;
+						       ci[WS(rs, 26)] = FMA(Tgz, TgH, TgI);
+						       cr[WS(rs, 26)] = FNMS(TgE, TgH, TgD);
+						       TgL = TgJ * TgK;
+						       ci[WS(rs, 58)] = FMA(TgJ, TgN, TgO);
+						  }
+					     }
+					}
+					{
+					     E Th5, Th8, Ths, Thk, Thv, Thp, Thc, Th0;
+					     {
+						  E TgV, TgY, Thn, Thj, TgS, Thi, Th6, Th7, Tho, TgZ;
+						  cr[WS(rs, 58)] = FNMS(TgM, TgN, TgL);
+						  TgV = FNMS(KP198912367, TgU, TgT);
+						  Th6 = FMA(KP198912367, TgT, TgU);
+						  Th7 = FNMS(KP198912367, TgW, TgX);
+						  TgY = FMA(KP198912367, TgX, TgW);
+						  Th5 = FMA(KP923879532, Th4, Th3);
+						  Thn = FNMS(KP923879532, Th4, Th3);
+						  Thj = Th7 - Th6;
+						  Th8 = Th6 + Th7;
+						  TgS = FMA(KP923879532, TgR, TgQ);
+						  Thi = FNMS(KP923879532, TgR, TgQ);
+						  Tho = TgV - TgY;
+						  TgZ = TgV + TgY;
+						  Ths = FMA(KP980785280, Thj, Thi);
+						  Thk = FNMS(KP980785280, Thj, Thi);
+						  Thv = FMA(KP980785280, Tho, Thn);
+						  Thp = FNMS(KP980785280, Tho, Thn);
+						  Thc = FMA(KP980785280, TgZ, TgS);
+						  Th0 = FNMS(KP980785280, TgZ, TgS);
+					     }
+					     {
+						  E Thu, Tht, Thh, Thm, Th9, Th2, TgP;
+						  Thh = W[98];
+						  Thm = W[99];
+						  {
+						       E Thr, Thq, Thl, Thw;
+						       Thr = W[34];
+						       Thu = W[35];
+						       Thq = Thh * Thp;
+						       Thl = Thh * Thk;
+						       Thw = Thr * Thv;
+						       Tht = Thr * Ths;
+						       ci[WS(rs, 50)] = FMA(Thm, Thk, Thq);
+						       cr[WS(rs, 50)] = FNMS(Thm, Thp, Thl);
+						       ci[WS(rs, 18)] = FMA(Thu, Ths, Thw);
+						  }
+						  cr[WS(rs, 18)] = FNMS(Thu, Thv, Tht);
+						  Thf = FMA(KP980785280, Th8, Th5);
+						  Th9 = FNMS(KP980785280, Th8, Th5);
+						  Th2 = W[67];
+						  TgP = W[66];
+						  {
+						       E Thb, Thg, Tha, Th1;
+						       The = W[3];
+						       Tha = Th2 * Th0;
+						       Th1 = TgP * Th0;
+						       Thb = W[2];
+						       Thg = The * Thc;
+						       ci[WS(rs, 34)] = FMA(TgP, Th9, Tha);
+						       cr[WS(rs, 34)] = FNMS(Th2, Th9, Th1);
+						       Thd = Thb * Thc;
+						       ci[WS(rs, 2)] = FMA(Thb, Thf, Thg);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E Tcl, Tc9, Tc8, Tcm, T9R, T93, T8O, T9U, Tez, Tdt, Td6, TeC, Tfv, Tfu, Tft;
+			      E T8B, T8A, T8z;
+			      {
+				   E TbP, TbO, TbN, T6B, T6A, T6z, TaN, TaM, TaL;
+				   {
+					E T6r, T6u, T6O, T6G, T6R, T6L, T6y, T6m;
+					{
+					     E T6k, T6h, T6J, T6F, T6e, T6E, T6s, T6t, T6K, T6l;
+					     cr[WS(rs, 2)] = FNMS(The, Thf, Thd);
+					     T6k = FMA(KP098491403, T6j, T6i);
+					     T6s = FNMS(KP098491403, T6i, T6j);
+					     T6t = FMA(KP098491403, T6f, T6g);
+					     T6h = FNMS(KP098491403, T6g, T6f);
+					     T6r = FNMS(KP980785280, T6q, T6p);
+					     T6J = FMA(KP980785280, T6q, T6p);
+					     T6F = T6s + T6t;
+					     T6u = T6s - T6t;
+					     T6e = FNMS(KP980785280, T6d, T6c);
+					     T6E = FMA(KP980785280, T6d, T6c);
+					     T6K = T6k + T6h;
+					     T6l = T6h - T6k;
+					     T6O = FMA(KP995184726, T6F, T6E);
+					     T6G = FNMS(KP995184726, T6F, T6E);
+					     T6R = FMA(KP995184726, T6K, T6J);
+					     T6L = FNMS(KP995184726, T6K, T6J);
+					     T6y = FMA(KP995184726, T6l, T6e);
+					     T6m = FNMS(KP995184726, T6l, T6e);
+					}
+					{
+					     E T6Q, T6P, T6D, T6I, T6v, T6o, T6b;
+					     T6D = W[64];
+					     T6I = W[65];
+					     {
+						  E T6N, T6M, T6H, T6S;
+						  T6N = W[0];
+						  T6Q = W[1];
+						  T6M = T6D * T6L;
+						  T6H = T6D * T6G;
+						  T6S = T6N * T6R;
+						  T6P = T6N * T6O;
+						  ci[WS(rs, 33)] = FMA(T6I, T6G, T6M);
+						  cr[WS(rs, 33)] = FNMS(T6I, T6L, T6H);
+						  ci[WS(rs, 1)] = FMA(T6Q, T6O, T6S);
+					     }
+					     cr[WS(rs, 1)] = FNMS(T6Q, T6R, T6P);
+					     T6B = FMA(KP995184726, T6u, T6r);
+					     T6v = FNMS(KP995184726, T6u, T6r);
+					     T6o = W[97];
+					     T6b = W[96];
+					     {
+						  E T6x, T6C, T6w, T6n;
+						  T6A = W[33];
+						  T6w = T6o * T6m;
+						  T6n = T6b * T6m;
+						  T6x = W[32];
+						  T6C = T6A * T6y;
+						  ci[WS(rs, 49)] = FMA(T6b, T6v, T6w);
+						  cr[WS(rs, 49)] = FNMS(T6o, T6v, T6n);
+						  T6z = T6x * T6y;
+						  ci[WS(rs, 17)] = FMA(T6x, T6B, T6C);
+					     }
+					}
+				   }
+				   {
+					E TbF, TbI, Tc2, TbU, Tc5, TbZ, TbM, Tbw;
+					{
+					     E Tbn, Tbu, TbX, TbT, Tbg, TbS, TbY, Tbv;
+					     {
+						  E TbG, TbH, TbB, TbE, Tb8, Tbf;
+						  TbB = FMA(KP923879532, TbA, Tbz);
+						  Tcl = FNMS(KP923879532, TbA, Tbz);
+						  Tc9 = TbC + TbD;
+						  TbE = TbC - TbD;
+						  cr[WS(rs, 17)] = FNMS(T6A, T6B, T6z);
+						  Tbn = FNMS(KP820678790, Tbm, Tbj);
+						  TbG = FMA(KP820678790, Tbj, Tbm);
+						  TbH = FMA(KP820678790, Tbq, Tbt);
+						  Tbu = FNMS(KP820678790, Tbt, Tbq);
+						  TbF = FMA(KP980785280, TbE, TbB);
+						  TbX = FNMS(KP980785280, TbE, TbB);
+						  Tb8 = FNMS(KP923879532, Tb7, Tb6);
+						  Tc8 = FMA(KP923879532, Tb7, Tb6);
+						  Tcm = Tbe - Tbb;
+						  Tbf = Tbb + Tbe;
+						  TbT = TbG + TbH;
+						  TbI = TbG - TbH;
+						  Tbg = FNMS(KP980785280, Tbf, Tb8);
+						  TbS = FMA(KP980785280, Tbf, Tb8);
+					     }
+					     TbY = Tbn - Tbu;
+					     Tbv = Tbn + Tbu;
+					     Tc2 = FMA(KP773010453, TbT, TbS);
+					     TbU = FNMS(KP773010453, TbT, TbS);
+					     Tc5 = FNMS(KP773010453, TbY, TbX);
+					     TbZ = FMA(KP773010453, TbY, TbX);
+					     TbM = FMA(KP773010453, Tbv, Tbg);
+					     Tbw = FNMS(KP773010453, Tbv, Tbg);
+					}
+					{
+					     E Tc4, Tc3, TbR, TbW, TbJ, Tby, Tb5;
+					     TbR = W[44];
+					     TbW = W[45];
+					     {
+						  E Tc1, Tc0, TbV, Tc6;
+						  Tc1 = W[108];
+						  Tc4 = W[109];
+						  Tc0 = TbR * TbZ;
+						  TbV = TbR * TbU;
+						  Tc6 = Tc1 * Tc5;
+						  Tc3 = Tc1 * Tc2;
+						  ci[WS(rs, 23)] = FMA(TbW, TbU, Tc0);
+						  cr[WS(rs, 23)] = FNMS(TbW, TbZ, TbV);
+						  ci[WS(rs, 55)] = FMA(Tc4, Tc2, Tc6);
+					     }
+					     cr[WS(rs, 55)] = FNMS(Tc4, Tc5, Tc3);
+					     TbP = FMA(KP773010453, TbI, TbF);
+					     TbJ = FNMS(KP773010453, TbI, TbF);
+					     Tby = W[77];
+					     Tb5 = W[76];
+					     {
+						  E TbL, TbQ, TbK, Tbx;
+						  TbO = W[13];
+						  TbK = Tby * Tbw;
+						  Tbx = Tb5 * Tbw;
+						  TbL = W[12];
+						  TbQ = TbO * TbM;
+						  ci[WS(rs, 39)] = FMA(Tb5, TbJ, TbK);
+						  cr[WS(rs, 39)] = FNMS(Tby, TbJ, Tbx);
+						  TbN = TbL * TbM;
+						  ci[WS(rs, 7)] = FMA(TbL, TbP, TbQ);
+					     }
+					}
+				   }
+				   {
+					E TaD, TaG, Tb0, TaS, Tb3, TaX, TaK, Tay;
+					{
+					     E Tat, Taw, TaV, TaR, Taq, TaQ, TaW, Tax;
+					     {
+						  E TaE, TaF, TaB, TaC, Tao, Tap;
+						  TaB = FMA(KP923879532, T9Q, T9N);
+						  T9R = FNMS(KP923879532, T9Q, T9N);
+						  T93 = T8V + T92;
+						  TaC = T8V - T92;
+						  cr[WS(rs, 7)] = FNMS(TbO, TbP, TbN);
+						  Tat = FNMS(KP303346683, Tas, Tar);
+						  TaE = FMA(KP303346683, Tar, Tas);
+						  TaF = FMA(KP303346683, Tau, Tav);
+						  Taw = FNMS(KP303346683, Tav, Tau);
+						  TaD = FMA(KP831469612, TaC, TaB);
+						  TaV = FNMS(KP831469612, TaC, TaB);
+						  Tao = FNMS(KP923879532, T8N, T8G);
+						  T8O = FMA(KP923879532, T8N, T8G);
+						  T9U = T9S - T9T;
+						  Tap = T9S + T9T;
+						  TaR = TaE + TaF;
+						  TaG = TaE - TaF;
+						  Taq = FMA(KP831469612, Tap, Tao);
+						  TaQ = FNMS(KP831469612, Tap, Tao);
+					     }
+					     TaW = Tat - Taw;
+					     Tax = Tat + Taw;
+					     Tb0 = FMA(KP956940335, TaR, TaQ);
+					     TaS = FNMS(KP956940335, TaR, TaQ);
+					     Tb3 = FNMS(KP956940335, TaW, TaV);
+					     TaX = FMA(KP956940335, TaW, TaV);
+					     TaK = FMA(KP956940335, Tax, Taq);
+					     Tay = FNMS(KP956940335, Tax, Taq);
+					}
+					{
+					     E Tb2, Tb1, TaP, TaU, TaH, TaA, Tan;
+					     TaP = W[36];
+					     TaU = W[37];
+					     {
+						  E TaZ, TaY, TaT, Tb4;
+						  TaZ = W[100];
+						  Tb2 = W[101];
+						  TaY = TaP * TaX;
+						  TaT = TaP * TaS;
+						  Tb4 = TaZ * Tb3;
+						  Tb1 = TaZ * Tb0;
+						  ci[WS(rs, 19)] = FMA(TaU, TaS, TaY);
+						  cr[WS(rs, 19)] = FNMS(TaU, TaX, TaT);
+						  ci[WS(rs, 51)] = FMA(Tb2, Tb0, Tb4);
+					     }
+					     cr[WS(rs, 51)] = FNMS(Tb2, Tb3, Tb1);
+					     TaN = FMA(KP956940335, TaG, TaD);
+					     TaH = FNMS(KP956940335, TaG, TaD);
+					     TaA = W[69];
+					     Tan = W[68];
+					     {
+						  E TaJ, TaO, TaI, Taz;
+						  TaM = W[5];
+						  TaI = TaA * Tay;
+						  Taz = Tan * Tay;
+						  TaJ = W[4];
+						  TaO = TaM * TaK;
+						  ci[WS(rs, 35)] = FMA(Tan, TaH, TaI);
+						  cr[WS(rs, 35)] = FNMS(TaA, TaH, Taz);
+						  TaL = TaJ * TaK;
+						  ci[WS(rs, 3)] = FMA(TaJ, TaN, TaO);
+					     }
+					}
+				   }
+				   {
+					E Tfl, Tfo, TfI, TfA, TfL, TfF, Tfs, Tfg;
+					{
+					     E Tfe, Tfb, TfD, Tfz, Tf8, Tfy, TfE, Tff;
+					     {
+						  E Tfm, Tfn, Tfj, Tfk, Tf6, Tf7;
+						  Tfj = FNMS(KP707106781, Tey, Tev);
+						  Tez = FMA(KP707106781, Tey, Tev);
+						  Tdt = Tdh - Tds;
+						  Tfk = Tds + Tdh;
+						  cr[WS(rs, 3)] = FNMS(TaM, TaN, TaL);
+						  Tfe = FNMS(KP198912367, Tfd, Tfc);
+						  Tfm = FMA(KP198912367, Tfc, Tfd);
+						  Tfn = FNMS(KP198912367, Tf9, Tfa);
+						  Tfb = FMA(KP198912367, Tfa, Tf9);
+						  Tfl = FNMS(KP923879532, Tfk, Tfj);
+						  TfD = FMA(KP923879532, Tfk, Tfj);
+						  Tf6 = FNMS(KP707106781, Td5, TcU);
+						  Td6 = FMA(KP707106781, Td5, TcU);
+						  TeC = TeA - TeB;
+						  Tf7 = TeA + TeB;
+						  Tfz = Tfm + Tfn;
+						  Tfo = Tfm - Tfn;
+						  Tf8 = FNMS(KP923879532, Tf7, Tf6);
+						  Tfy = FMA(KP923879532, Tf7, Tf6);
+					     }
+					     TfE = Tfe + Tfb;
+					     Tff = Tfb - Tfe;
+					     TfI = FMA(KP980785280, Tfz, Tfy);
+					     TfA = FNMS(KP980785280, Tfz, Tfy);
+					     TfL = FMA(KP980785280, TfE, TfD);
+					     TfF = FNMS(KP980785280, TfE, TfD);
+					     Tfs = FMA(KP980785280, Tff, Tf8);
+					     Tfg = FNMS(KP980785280, Tff, Tf8);
+					}
+					{
+					     E TfK, TfJ, Tfx, TfC, Tfp, Tfi, Tf5;
+					     Tfx = W[58];
+					     TfC = W[59];
+					     {
+						  E TfH, TfG, TfB, TfM;
+						  TfH = W[122];
+						  TfK = W[123];
+						  TfG = Tfx * TfF;
+						  TfB = Tfx * TfA;
+						  TfM = TfH * TfL;
+						  TfJ = TfH * TfI;
+						  ci[WS(rs, 30)] = FMA(TfC, TfA, TfG);
+						  cr[WS(rs, 30)] = FNMS(TfC, TfF, TfB);
+						  ci[WS(rs, 62)] = FMA(TfK, TfI, TfM);
+					     }
+					     cr[WS(rs, 62)] = FNMS(TfK, TfL, TfJ);
+					     Tfv = FMA(KP980785280, Tfo, Tfl);
+					     Tfp = FNMS(KP980785280, Tfo, Tfl);
+					     Tfi = W[91];
+					     Tf5 = W[90];
+					     {
+						  E Tfr, Tfw, Tfq, Tfh;
+						  Tfu = W[27];
+						  Tfq = Tfi * Tfg;
+						  Tfh = Tf5 * Tfg;
+						  Tfr = W[26];
+						  Tfw = Tfu * Tfs;
+						  ci[WS(rs, 46)] = FMA(Tf5, Tfp, Tfq);
+						  cr[WS(rs, 46)] = FNMS(Tfi, Tfp, Tfh);
+						  Tft = Tfr * Tfs;
+						  ci[WS(rs, 14)] = FMA(Tfr, Tfv, Tfw);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T89, T7X, T7W, T8a, T7D, T7C, T7B;
+				   {
+					E T7t, T7w, T7Q, T7I, T7T, T7N, T7A, T7k;
+					{
+					     E T7b, T7i, T7L, T7H, T74, T7G, T7M, T7j;
+					     {
+						  E T7u, T7v, T7p, T7s, T6W, T73;
+						  T7p = FMA(KP923879532, T7o, T7n);
+						  T89 = FNMS(KP923879532, T7o, T7n);
+						  T7X = T7q + T7r;
+						  T7s = T7q - T7r;
+						  cr[WS(rs, 14)] = FNMS(Tfu, Tfv, Tft);
+						  T7b = FNMS(KP534511135, T7a, T77);
+						  T7u = FMA(KP534511135, T77, T7a);
+						  T7v = FNMS(KP534511135, T7e, T7h);
+						  T7i = FMA(KP534511135, T7h, T7e);
+						  T7t = FMA(KP831469612, T7s, T7p);
+						  T7L = FNMS(KP831469612, T7s, T7p);
+						  T6W = FMA(KP923879532, T6V, T6U);
+						  T7W = FNMS(KP923879532, T6V, T6U);
+						  T8a = T72 + T6Z;
+						  T73 = T6Z - T72;
+						  T7H = T7v - T7u;
+						  T7w = T7u + T7v;
+						  T74 = FMA(KP831469612, T73, T6W);
+						  T7G = FNMS(KP831469612, T73, T6W);
+					     }
+					     T7M = T7b - T7i;
+					     T7j = T7b + T7i;
+					     T7Q = FMA(KP881921264, T7H, T7G);
+					     T7I = FNMS(KP881921264, T7H, T7G);
+					     T7T = FMA(KP881921264, T7M, T7L);
+					     T7N = FNMS(KP881921264, T7M, T7L);
+					     T7A = FMA(KP881921264, T7j, T74);
+					     T7k = FNMS(KP881921264, T7j, T74);
+					}
+					{
+					     E T7S, T7R, T7F, T7K, T7x, T7m, T6T;
+					     T7F = W[104];
+					     T7K = W[105];
+					     {
+						  E T7P, T7O, T7J, T7U;
+						  T7P = W[40];
+						  T7S = W[41];
+						  T7O = T7F * T7N;
+						  T7J = T7F * T7I;
+						  T7U = T7P * T7T;
+						  T7R = T7P * T7Q;
+						  ci[WS(rs, 53)] = FMA(T7K, T7I, T7O);
+						  cr[WS(rs, 53)] = FNMS(T7K, T7N, T7J);
+						  ci[WS(rs, 21)] = FMA(T7S, T7Q, T7U);
+					     }
+					     cr[WS(rs, 21)] = FNMS(T7S, T7T, T7R);
+					     T7D = FMA(KP881921264, T7w, T7t);
+					     T7x = FNMS(KP881921264, T7w, T7t);
+					     T7m = W[73];
+					     T6T = W[72];
+					     {
+						  E T7z, T7E, T7y, T7l;
+						  T7C = W[9];
+						  T7y = T7m * T7k;
+						  T7l = T6T * T7k;
+						  T7z = W[8];
+						  T7E = T7C * T7A;
+						  ci[WS(rs, 37)] = FMA(T6T, T7x, T7y);
+						  cr[WS(rs, 37)] = FNMS(T7m, T7x, T7l);
+						  T7B = T7z * T7A;
+						  ci[WS(rs, 5)] = FMA(T7z, T7D, T7E);
+					     }
+					}
+				   }
+				   {
+					E T8u, T8t, T86, T8i, T8y, T8q, T8l, T8f;
+					{
+					     E T8d, T8c, T85, T8b, T7Y, T8o, T81, T84, T8p, T8e;
+					     T81 = FMA(KP303346683, T80, T7Z);
+					     T8d = FNMS(KP303346683, T7Z, T80);
+					     T8c = FMA(KP303346683, T82, T83);
+					     T84 = FNMS(KP303346683, T83, T82);
+					     cr[WS(rs, 5)] = FNMS(T7C, T7D, T7B);
+					     T8u = T84 + T81;
+					     T85 = T81 - T84;
+					     T8b = FNMS(KP831469612, T8a, T89);
+					     T8t = FMA(KP831469612, T8a, T89);
+					     T7Y = FNMS(KP831469612, T7X, T7W);
+					     T8o = FMA(KP831469612, T7X, T7W);
+					     T8p = T8c + T8d;
+					     T8e = T8c - T8d;
+					     T86 = FNMS(KP956940335, T85, T7Y);
+					     T8i = FMA(KP956940335, T85, T7Y);
+					     T8y = FMA(KP956940335, T8p, T8o);
+					     T8q = FNMS(KP956940335, T8p, T8o);
+					     T8l = FMA(KP956940335, T8e, T8b);
+					     T8f = FNMS(KP956940335, T8e, T8b);
+					}
+					{
+					     E T8k, T8j, T7V, T88, T8v, T8s, T8n;
+					     T7V = W[88];
+					     T88 = W[89];
+					     {
+						  E T8h, T8g, T87, T8m;
+						  T8h = W[24];
+						  T8k = W[25];
+						  T8g = T7V * T8f;
+						  T87 = T7V * T86;
+						  T8m = T8h * T8l;
+						  T8j = T8h * T8i;
+						  ci[WS(rs, 45)] = FMA(T88, T86, T8g);
+						  cr[WS(rs, 45)] = FNMS(T88, T8f, T87);
+						  ci[WS(rs, 13)] = FMA(T8k, T8i, T8m);
+					     }
+					     cr[WS(rs, 13)] = FNMS(T8k, T8l, T8j);
+					     T8B = FMA(KP956940335, T8u, T8t);
+					     T8v = FNMS(KP956940335, T8u, T8t);
+					     T8s = W[57];
+					     T8n = W[56];
+					     {
+						  E T8x, T8C, T8w, T8r;
+						  T8A = W[121];
+						  T8w = T8s * T8q;
+						  T8r = T8n * T8q;
+						  T8x = W[120];
+						  T8C = T8A * T8y;
+						  ci[WS(rs, 29)] = FMA(T8n, T8v, T8w);
+						  cr[WS(rs, 29)] = FNMS(T8s, T8v, T8r);
+						  T8z = T8x * T8y;
+						  ci[WS(rs, 61)] = FMA(T8x, T8B, T8C);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E Ta5, Ta4, Ta3, TeN, TeM, TeL;
+				   {
+					E T9V, T9Y, Tai, Taa, Tal, Taf, Ta2, T9I;
+					{
+					     E T9n, T9G, Tad, Ta9, T94, Ta8, T9W, T9X, Tae, T9H;
+					     cr[WS(rs, 61)] = FNMS(T8A, T8B, T8z);
+					     T9n = FNMS(KP534511135, T9m, T9f);
+					     T9W = FMA(KP534511135, T9f, T9m);
+					     T9X = FMA(KP534511135, T9y, T9F);
+					     T9G = FNMS(KP534511135, T9F, T9y);
+					     T9V = FMA(KP831469612, T9U, T9R);
+					     Tad = FNMS(KP831469612, T9U, T9R);
+					     Ta9 = T9W + T9X;
+					     T9Y = T9W - T9X;
+					     T94 = FNMS(KP831469612, T93, T8O);
+					     Ta8 = FMA(KP831469612, T93, T8O);
+					     Tae = T9G - T9n;
+					     T9H = T9n + T9G;
+					     Tai = FMA(KP881921264, Ta9, Ta8);
+					     Taa = FNMS(KP881921264, Ta9, Ta8);
+					     Tal = FNMS(KP881921264, Tae, Tad);
+					     Taf = FMA(KP881921264, Tae, Tad);
+					     Ta2 = FNMS(KP881921264, T9H, T94);
+					     T9I = FMA(KP881921264, T9H, T94);
+					}
+					{
+					     E Tak, Taj, Ta7, Tac, T9Z, T9K, T8D;
+					     Ta7 = W[52];
+					     Tac = W[53];
+					     {
+						  E Tah, Tag, Tab, Tam;
+						  Tah = W[116];
+						  Tak = W[117];
+						  Tag = Ta7 * Taf;
+						  Tab = Ta7 * Taa;
+						  Tam = Tah * Tal;
+						  Taj = Tah * Tai;
+						  ci[WS(rs, 27)] = FMA(Tac, Taa, Tag);
+						  cr[WS(rs, 27)] = FNMS(Tac, Taf, Tab);
+						  ci[WS(rs, 59)] = FMA(Tak, Tai, Tam);
+					     }
+					     cr[WS(rs, 59)] = FNMS(Tak, Tal, Taj);
+					     Ta5 = FMA(KP881921264, T9Y, T9V);
+					     T9Z = FNMS(KP881921264, T9Y, T9V);
+					     T9K = W[85];
+					     T8D = W[84];
+					     {
+						  E Ta1, Ta6, Ta0, T9J;
+						  Ta4 = W[21];
+						  Ta0 = T9K * T9I;
+						  T9J = T8D * T9I;
+						  Ta1 = W[20];
+						  Ta6 = Ta4 * Ta2;
+						  ci[WS(rs, 43)] = FMA(T8D, T9Z, Ta0);
+						  cr[WS(rs, 43)] = FNMS(T9K, T9Z, T9J);
+						  Ta3 = Ta1 * Ta2;
+						  ci[WS(rs, 11)] = FMA(Ta1, Ta5, Ta6);
+					     }
+					}
+				   }
+				   {
+					E TeD, TeG, Tf0, TeS, Tf3, TeX, TeK, Teo;
+					{
+					     E Tem, TdV, TeV, TeR, Tdu, TeQ, TeE, TeF, TeW, Ten;
+					     cr[WS(rs, 11)] = FNMS(Ta4, Ta5, Ta3);
+					     Tem = FMA(KP668178637, Tel, Tec);
+					     TeE = FNMS(KP668178637, Tec, Tel);
+					     TeF = FMA(KP668178637, TdL, TdU);
+					     TdV = FNMS(KP668178637, TdU, TdL);
+					     TeD = FNMS(KP923879532, TeC, Tez);
+					     TeV = FMA(KP923879532, TeC, Tez);
+					     TeR = TeE + TeF;
+					     TeG = TeE - TeF;
+					     Tdu = FNMS(KP923879532, Tdt, Td6);
+					     TeQ = FMA(KP923879532, Tdt, Td6);
+					     TeW = Tem + TdV;
+					     Ten = TdV - Tem;
+					     Tf0 = FMA(KP831469612, TeR, TeQ);
+					     TeS = FNMS(KP831469612, TeR, TeQ);
+					     Tf3 = FMA(KP831469612, TeW, TeV);
+					     TeX = FNMS(KP831469612, TeW, TeV);
+					     TeK = FMA(KP831469612, Ten, Tdu);
+					     Teo = FNMS(KP831469612, Ten, Tdu);
+					}
+					{
+					     E Tf2, Tf1, TeP, TeU, TeH, Teq, TcP;
+					     TeP = W[74];
+					     TeU = W[75];
+					     {
+						  E TeZ, TeY, TeT, Tf4;
+						  TeZ = W[10];
+						  Tf2 = W[11];
+						  TeY = TeP * TeX;
+						  TeT = TeP * TeS;
+						  Tf4 = TeZ * Tf3;
+						  Tf1 = TeZ * Tf0;
+						  ci[WS(rs, 38)] = FMA(TeU, TeS, TeY);
+						  cr[WS(rs, 38)] = FNMS(TeU, TeX, TeT);
+						  ci[WS(rs, 6)] = FMA(Tf2, Tf0, Tf4);
+					     }
+					     cr[WS(rs, 6)] = FNMS(Tf2, Tf3, Tf1);
+					     TeN = FMA(KP831469612, TeG, TeD);
+					     TeH = FNMS(KP831469612, TeG, TeD);
+					     Teq = W[107];
+					     TcP = W[106];
+					     {
+						  E TeJ, TeO, TeI, Tep;
+						  TeM = W[43];
+						  TeI = Teq * Teo;
+						  Tep = TcP * Teo;
+						  TeJ = W[42];
+						  TeO = TeM * TeK;
+						  ci[WS(rs, 54)] = FMA(TcP, TeH, TeI);
+						  cr[WS(rs, 54)] = FNMS(Teq, TeH, Tep);
+						  TeL = TeJ * TeK;
+						  ci[WS(rs, 22)] = FMA(TeJ, TeN, TeO);
+					     }
+					}
+				   }
+				   {
+					E Tcn, Tcq, TcK, TcC, TcN, TcH, Tcu, Tci;
+					{
+					     E Tcd, Tcg, TcF, TcB, Tca, TcA, Tco, Tcp, TcG, Tch;
+					     cr[WS(rs, 22)] = FNMS(TeM, TeN, TeL);
+					     Tcd = FNMS(KP098491403, Tcc, Tcb);
+					     Tco = FMA(KP098491403, Tcb, Tcc);
+					     Tcp = FMA(KP098491403, Tce, Tcf);
+					     Tcg = FNMS(KP098491403, Tcf, Tce);
+					     Tcn = FMA(KP980785280, Tcm, Tcl);
+					     TcF = FNMS(KP980785280, Tcm, Tcl);
+					     TcB = Tco + Tcp;
+					     Tcq = Tco - Tcp;
+					     Tca = FNMS(KP980785280, Tc9, Tc8);
+					     TcA = FMA(KP980785280, Tc9, Tc8);
+					     TcG = Tcg - Tcd;
+					     Tch = Tcd + Tcg;
+					     TcK = FMA(KP995184726, TcB, TcA);
+					     TcC = FNMS(KP995184726, TcB, TcA);
+					     TcN = FNMS(KP995184726, TcG, TcF);
+					     TcH = FMA(KP995184726, TcG, TcF);
+					     Tcu = FNMS(KP995184726, Tch, Tca);
+					     Tci = FMA(KP995184726, Tch, Tca);
+					}
+					{
+					     E TcM, TcL, Tcz, TcE, Tcr, Tck, Tc7;
+					     Tcz = W[60];
+					     TcE = W[61];
+					     {
+						  E TcJ, TcI, TcD, TcO;
+						  TcJ = W[124];
+						  TcM = W[125];
+						  TcI = Tcz * TcH;
+						  TcD = Tcz * TcC;
+						  TcO = TcJ * TcN;
+						  TcL = TcJ * TcK;
+						  ci[WS(rs, 31)] = FMA(TcE, TcC, TcI);
+						  cr[WS(rs, 31)] = FNMS(TcE, TcH, TcD);
+						  ci[WS(rs, 63)] = FMA(TcM, TcK, TcO);
+					     }
+					     cr[WS(rs, 63)] = FNMS(TcM, TcN, TcL);
+					     Tcx = FMA(KP995184726, Tcq, Tcn);
+					     Tcr = FNMS(KP995184726, Tcq, Tcn);
+					     Tck = W[93];
+					     Tc7 = W[92];
+					     {
+						  E Tct, Tcy, Tcs, Tcj;
+						  Tcw = W[29];
+						  Tcs = Tck * Tci;
+						  Tcj = Tc7 * Tci;
+						  Tct = W[28];
+						  Tcy = Tcw * Tcu;
+						  ci[WS(rs, 47)] = FMA(Tc7, Tcr, Tcs);
+						  cr[WS(rs, 47)] = FNMS(Tck, Tcr, Tcj);
+						  Tcv = Tct * Tcu;
+						  ci[WS(rs, 15)] = FMA(Tct, Tcx, Tcy);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 15)] = FNMS(Tcw, Tcx, Tcv);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 64, "hb_64", twinstr, &GENUS, {520, 126, 518, 0} };
+
+void X(codelet_hb_64) (planner *p) {
+     X(khc2hc_register) (p, hb_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -dif -name hb_64 -include hb.h */
+
+/*
+ * This function contains 1038 FP additions, 500 FP multiplications,
+ * (or, 808 additions, 270 multiplications, 230 fused multiply/add),
+ * 196 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "hb.h"
+
+static void hb_64(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 126); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tf, T8C, Tfa, Thk, Tgg, ThM, T2c, T5O, T4K, T6g, Tag, TdE, TcA, Te6, T7P;
+	       E T94, TK, T7o, T38, T4P, Tfv, Thn, T5W, T6j, Tb0, TdK, Tfs, Tho, T8K, T97;
+	       E Tb7, TdL, TZ, T7l, T2P, T4Q, Tfo, Thq, T5T, T6k, TaH, TdH, Tfl, Thr, T8H;
+	       E T98, TaO, TdI, Tu, T95, Tfh, ThN, Tgj, Thl, T2v, T6h, T4N, T5P, Tav, Te7;
+	       E TcD, TdF, T7S, T8D, T1L, T20, T7A, T7D, T7G, T7H, T40, T62, Tg1, Thv, Tg8;
+	       E Thz, Tg5, Thw, T4t, T5Z, T4j, T60, T4w, T63, TbY, TdS, Tcd, TdQ, TfU, Thy;
+	       E T8P, T9z, T8S, T9A, Tcl, TdP, Tco, TdT, T1g, T1v, T7r, T7u, T7x, T7y, T3j;
+	       E T69, TfI, ThD, TfP, ThG, TfM, ThC, T3M, T66, T3C, T67, T3P, T6a, Tbl, TdZ;
+	       E TbA, TdX, TfB, ThF, T8W, T9C, T8Z, T9D, TbI, TdW, TbL, Te0;
+	       {
+		    E T3, Ta6, T6, Tcu, T4I, Ta7, T4F, Tcv, Td, Tcy, T27, Tae, Ta, Tcx, T2a;
+		    E Tab;
+		    {
+			 E T1, T2, T4D, T4E;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 31)];
+			 T3 = T1 + T2;
+			 Ta6 = T1 - T2;
+			 {
+			      E T4, T5, T4G, T4H;
+			      T4 = cr[WS(rs, 16)];
+			      T5 = ci[WS(rs, 15)];
+			      T6 = T4 + T5;
+			      Tcu = T4 - T5;
+			      T4G = ci[WS(rs, 47)];
+			      T4H = cr[WS(rs, 48)];
+			      T4I = T4G - T4H;
+			      Ta7 = T4G + T4H;
+			 }
+			 T4D = ci[WS(rs, 63)];
+			 T4E = cr[WS(rs, 32)];
+			 T4F = T4D - T4E;
+			 Tcv = T4D + T4E;
+			 {
+			      E Tb, Tc, Tac, T25, T26, Tad;
+			      Tb = ci[WS(rs, 7)];
+			      Tc = cr[WS(rs, 24)];
+			      Tac = Tb - Tc;
+			      T25 = ci[WS(rs, 39)];
+			      T26 = cr[WS(rs, 56)];
+			      Tad = T25 + T26;
+			      Td = Tb + Tc;
+			      Tcy = Tac + Tad;
+			      T27 = T25 - T26;
+			      Tae = Tac - Tad;
+			 }
+			 {
+			      E T8, T9, Ta9, T28, T29, Taa;
+			      T8 = cr[WS(rs, 8)];
+			      T9 = ci[WS(rs, 23)];
+			      Ta9 = T8 - T9;
+			      T28 = ci[WS(rs, 55)];
+			      T29 = cr[WS(rs, 40)];
+			      Taa = T28 + T29;
+			      Ta = T8 + T9;
+			      Tcx = Ta9 + Taa;
+			      T2a = T28 - T29;
+			      Tab = Ta9 - Taa;
+			 }
+		    }
+		    {
+			 E T7, Te, Tf8, Tf9;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T8C = T7 - Te;
+			 Tf8 = Ta6 + Ta7;
+			 Tf9 = KP707106781 * (Tcx + Tcy);
+			 Tfa = Tf8 - Tf9;
+			 Thk = Tf8 + Tf9;
+		    }
+		    {
+			 E Tge, Tgf, T24, T2b;
+			 Tge = Tcv - Tcu;
+			 Tgf = KP707106781 * (Tab - Tae);
+			 Tgg = Tge + Tgf;
+			 ThM = Tge - Tgf;
+			 T24 = T3 - T6;
+			 T2b = T27 - T2a;
+			 T2c = T24 + T2b;
+			 T5O = T24 - T2b;
+		    }
+		    {
+			 E T4C, T4J, Ta8, Taf;
+			 T4C = Ta - Td;
+			 T4J = T4F - T4I;
+			 T4K = T4C + T4J;
+			 T6g = T4J - T4C;
+			 Ta8 = Ta6 - Ta7;
+			 Taf = KP707106781 * (Tab + Tae);
+			 Tag = Ta8 - Taf;
+			 TdE = Ta8 + Taf;
+		    }
+		    {
+			 E Tcw, Tcz, T7N, T7O;
+			 Tcw = Tcu + Tcv;
+			 Tcz = KP707106781 * (Tcx - Tcy);
+			 TcA = Tcw - Tcz;
+			 Te6 = Tcw + Tcz;
+			 T7N = T4F + T4I;
+			 T7O = T2a + T27;
+			 T7P = T7N + T7O;
+			 T94 = T7N - T7O;
+		    }
+	       }
+	       {
+		    E TC, Tb1, T2Z, TaQ, T2X, Tb2, T7m, TaR, TJ, Tb4, Tb5, T2Q, T36, TaV, TaY;
+		    E T7n, Tfq, Tfr;
+		    {
+			 E Tw, Tx, Ty, Tz, TA, TB;
+			 Tw = cr[WS(rs, 2)];
+			 Tx = ci[WS(rs, 29)];
+			 Ty = Tw + Tx;
+			 Tz = cr[WS(rs, 18)];
+			 TA = ci[WS(rs, 13)];
+			 TB = Tz + TA;
+			 TC = Ty + TB;
+			 Tb1 = Tz - TA;
+			 T2Z = Ty - TB;
+			 TaQ = Tw - Tx;
+		    }
+		    {
+			 E T2R, T2S, T2T, T2U, T2V, T2W;
+			 T2R = ci[WS(rs, 61)];
+			 T2S = cr[WS(rs, 34)];
+			 T2T = T2R - T2S;
+			 T2U = ci[WS(rs, 45)];
+			 T2V = cr[WS(rs, 50)];
+			 T2W = T2U - T2V;
+			 T2X = T2T - T2W;
+			 Tb2 = T2R + T2S;
+			 T7m = T2T + T2W;
+			 TaR = T2U + T2V;
+		    }
+		    {
+			 E TF, TaT, T35, TaU, TI, TaW, T32, TaX;
+			 {
+			      E TD, TE, T33, T34;
+			      TD = cr[WS(rs, 10)];
+			      TE = ci[WS(rs, 21)];
+			      TF = TD + TE;
+			      TaT = TD - TE;
+			      T33 = ci[WS(rs, 53)];
+			      T34 = cr[WS(rs, 42)];
+			      T35 = T33 - T34;
+			      TaU = T33 + T34;
+			 }
+			 {
+			      E TG, TH, T30, T31;
+			      TG = ci[WS(rs, 5)];
+			      TH = cr[WS(rs, 26)];
+			      TI = TG + TH;
+			      TaW = TG - TH;
+			      T30 = ci[WS(rs, 37)];
+			      T31 = cr[WS(rs, 58)];
+			      T32 = T30 - T31;
+			      TaX = T30 + T31;
+			 }
+			 TJ = TF + TI;
+			 Tb4 = TaT + TaU;
+			 Tb5 = TaW + TaX;
+			 T2Q = TF - TI;
+			 T36 = T32 - T35;
+			 TaV = TaT - TaU;
+			 TaY = TaW - TaX;
+			 T7n = T35 + T32;
+		    }
+		    TK = TC + TJ;
+		    T7o = T7m + T7n;
+		    {
+			 E T2Y, T37, Tft, Tfu;
+			 T2Y = T2Q + T2X;
+			 T37 = T2Z + T36;
+			 T38 = FMA(KP923879532, T2Y, KP382683432 * T37);
+			 T4P = FNMS(KP382683432, T2Y, KP923879532 * T37);
+			 Tft = TaQ + TaR;
+			 Tfu = KP707106781 * (Tb4 + Tb5);
+			 Tfv = Tft - Tfu;
+			 Thn = Tft + Tfu;
+		    }
+		    {
+			 E T5U, T5V, TaS, TaZ;
+			 T5U = T2X - T2Q;
+			 T5V = T2Z - T36;
+			 T5W = FMA(KP382683432, T5U, KP923879532 * T5V);
+			 T6j = FNMS(KP923879532, T5U, KP382683432 * T5V);
+			 TaS = TaQ - TaR;
+			 TaZ = KP707106781 * (TaV + TaY);
+			 Tb0 = TaS - TaZ;
+			 TdK = TaS + TaZ;
+		    }
+		    Tfq = Tb2 - Tb1;
+		    Tfr = KP707106781 * (TaV - TaY);
+		    Tfs = Tfq + Tfr;
+		    Tho = Tfq - Tfr;
+		    {
+			 E T8I, T8J, Tb3, Tb6;
+			 T8I = TC - TJ;
+			 T8J = T7m - T7n;
+			 T8K = T8I + T8J;
+			 T97 = T8I - T8J;
+			 Tb3 = Tb1 + Tb2;
+			 Tb6 = KP707106781 * (Tb4 - Tb5);
+			 Tb7 = Tb3 - Tb6;
+			 TdL = Tb3 + Tb6;
+		    }
+	       }
+	       {
+		    E TR, TaI, T2G, Tax, T2E, TaJ, T7j, Tay, TY, TaL, TaM, T2x, T2N, TaC, TaF;
+		    E T7k, Tfj, Tfk;
+		    {
+			 E TL, TM, TN, TO, TP, TQ;
+			 TL = ci[WS(rs, 1)];
+			 TM = cr[WS(rs, 30)];
+			 TN = TL + TM;
+			 TO = cr[WS(rs, 14)];
+			 TP = ci[WS(rs, 17)];
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+			 TaI = TL - TM;
+			 T2G = TN - TQ;
+			 Tax = TO - TP;
+		    }
+		    {
+			 E T2y, T2z, T2A, T2B, T2C, T2D;
+			 T2y = ci[WS(rs, 33)];
+			 T2z = cr[WS(rs, 62)];
+			 T2A = T2y - T2z;
+			 T2B = ci[WS(rs, 49)];
+			 T2C = cr[WS(rs, 46)];
+			 T2D = T2B - T2C;
+			 T2E = T2A - T2D;
+			 TaJ = T2B + T2C;
+			 T7j = T2A + T2D;
+			 Tay = T2y + T2z;
+		    }
+		    {
+			 E TU, TaA, T2M, TaB, TX, TaD, T2J, TaE;
+			 {
+			      E TS, TT, T2K, T2L;
+			      TS = cr[WS(rs, 6)];
+			      TT = ci[WS(rs, 25)];
+			      TU = TS + TT;
+			      TaA = TS - TT;
+			      T2K = ci[WS(rs, 57)];
+			      T2L = cr[WS(rs, 38)];
+			      T2M = T2K - T2L;
+			      TaB = T2K + T2L;
+			 }
+			 {
+			      E TV, TW, T2H, T2I;
+			      TV = ci[WS(rs, 9)];
+			      TW = cr[WS(rs, 22)];
+			      TX = TV + TW;
+			      TaD = TV - TW;
+			      T2H = ci[WS(rs, 41)];
+			      T2I = cr[WS(rs, 54)];
+			      T2J = T2H - T2I;
+			      TaE = T2H + T2I;
+			 }
+			 TY = TU + TX;
+			 TaL = TaA - TaB;
+			 TaM = TaD - TaE;
+			 T2x = TU - TX;
+			 T2N = T2J - T2M;
+			 TaC = TaA + TaB;
+			 TaF = TaD + TaE;
+			 T7k = T2M + T2J;
+		    }
+		    TZ = TR + TY;
+		    T7l = T7j + T7k;
+		    {
+			 E T2F, T2O, Tfm, Tfn;
+			 T2F = T2x + T2E;
+			 T2O = T2G + T2N;
+			 T2P = FNMS(KP382683432, T2O, KP923879532 * T2F);
+			 T4Q = FMA(KP382683432, T2F, KP923879532 * T2O);
+			 Tfm = TaI + TaJ;
+			 Tfn = KP707106781 * (TaC + TaF);
+			 Tfo = Tfm - Tfn;
+			 Thq = Tfm + Tfn;
+		    }
+		    {
+			 E T5R, T5S, Taz, TaG;
+			 T5R = T2E - T2x;
+			 T5S = T2G - T2N;
+			 T5T = FNMS(KP923879532, T5S, KP382683432 * T5R);
+			 T6k = FMA(KP923879532, T5R, KP382683432 * T5S);
+			 Taz = Tax - Tay;
+			 TaG = KP707106781 * (TaC - TaF);
+			 TaH = Taz - TaG;
+			 TdH = Taz + TaG;
+		    }
+		    Tfj = KP707106781 * (TaL - TaM);
+		    Tfk = Tax + Tay;
+		    Tfl = Tfj - Tfk;
+		    Thr = Tfk + Tfj;
+		    {
+			 E T8F, T8G, TaK, TaN;
+			 T8F = T7j - T7k;
+			 T8G = TR - TY;
+			 T8H = T8F - T8G;
+			 T98 = T8G + T8F;
+			 TaK = TaI - TaJ;
+			 TaN = KP707106781 * (TaL + TaM);
+			 TaO = TaK - TaN;
+			 TdI = TaK + TaN;
+		    }
+	       }
+	       {
+		    E Ti, T2j, Tl, T2g, T2d, T2k, Tfc, Tfb, Tat, Taq, Tp, T2s, Ts, T2p, T2m;
+		    E T2t, Tff, Tfe, Tam, Taj;
+		    {
+			 E Tar, Tas, Tao, Tap;
+			 {
+			      E Tg, Th, T2h, T2i;
+			      Tg = cr[WS(rs, 4)];
+			      Th = ci[WS(rs, 27)];
+			      Ti = Tg + Th;
+			      Tar = Tg - Th;
+			      T2h = ci[WS(rs, 43)];
+			      T2i = cr[WS(rs, 52)];
+			      T2j = T2h - T2i;
+			      Tas = T2h + T2i;
+			 }
+			 {
+			      E Tj, Tk, T2e, T2f;
+			      Tj = cr[WS(rs, 20)];
+			      Tk = ci[WS(rs, 11)];
+			      Tl = Tj + Tk;
+			      Tao = Tj - Tk;
+			      T2e = ci[WS(rs, 59)];
+			      T2f = cr[WS(rs, 36)];
+			      T2g = T2e - T2f;
+			      Tap = T2e + T2f;
+			 }
+			 T2d = Ti - Tl;
+			 T2k = T2g - T2j;
+			 Tfc = Tap - Tao;
+			 Tfb = Tar + Tas;
+			 Tat = Tar - Tas;
+			 Taq = Tao + Tap;
+		    }
+		    {
+			 E Tak, Tal, Tah, Tai;
+			 {
+			      E Tn, To, T2q, T2r;
+			      Tn = ci[WS(rs, 3)];
+			      To = cr[WS(rs, 28)];
+			      Tp = Tn + To;
+			      Tak = Tn - To;
+			      T2q = ci[WS(rs, 51)];
+			      T2r = cr[WS(rs, 44)];
+			      T2s = T2q - T2r;
+			      Tal = T2q + T2r;
+			 }
+			 {
+			      E Tq, Tr, T2n, T2o;
+			      Tq = cr[WS(rs, 12)];
+			      Tr = ci[WS(rs, 19)];
+			      Ts = Tq + Tr;
+			      Tah = Tq - Tr;
+			      T2n = ci[WS(rs, 35)];
+			      T2o = cr[WS(rs, 60)];
+			      T2p = T2n - T2o;
+			      Tai = T2n + T2o;
+			 }
+			 T2m = Tp - Ts;
+			 T2t = T2p - T2s;
+			 Tff = Tah + Tai;
+			 Tfe = Tak + Tal;
+			 Tam = Tak - Tal;
+			 Taj = Tah - Tai;
+		    }
+		    {
+			 E Tm, Tt, Tfd, Tfg;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T95 = Tm - Tt;
+			 Tfd = FNMS(KP923879532, Tfc, KP382683432 * Tfb);
+			 Tfg = FNMS(KP923879532, Tff, KP382683432 * Tfe);
+			 Tfh = Tfd + Tfg;
+			 ThN = Tfd - Tfg;
+		    }
+		    {
+			 E Tgh, Tgi, T2l, T2u;
+			 Tgh = FMA(KP382683432, Tfc, KP923879532 * Tfb);
+			 Tgi = FMA(KP382683432, Tff, KP923879532 * Tfe);
+			 Tgj = Tgh - Tgi;
+			 Thl = Tgh + Tgi;
+			 T2l = T2d - T2k;
+			 T2u = T2m + T2t;
+			 T2v = KP707106781 * (T2l + T2u);
+			 T6h = KP707106781 * (T2l - T2u);
+		    }
+		    {
+			 E T4L, T4M, Tan, Tau;
+			 T4L = T2d + T2k;
+			 T4M = T2t - T2m;
+			 T4N = KP707106781 * (T4L + T4M);
+			 T5P = KP707106781 * (T4M - T4L);
+			 Tan = FNMS(KP382683432, Tam, KP923879532 * Taj);
+			 Tau = FMA(KP923879532, Taq, KP382683432 * Tat);
+			 Tav = Tan - Tau;
+			 Te7 = Tau + Tan;
+		    }
+		    {
+			 E TcB, TcC, T7Q, T7R;
+			 TcB = FNMS(KP382683432, Taq, KP923879532 * Tat);
+			 TcC = FMA(KP382683432, Taj, KP923879532 * Tam);
+			 TcD = TcB - TcC;
+			 TdF = TcB + TcC;
+			 T7Q = T2g + T2j;
+			 T7R = T2p + T2s;
+			 T7S = T7Q + T7R;
+			 T8D = T7R - T7Q;
+		    }
+	       }
+	       {
+		    E T1z, T1C, T1D, Tcf, TbO, T4o, T4r, T7B, Tcg, TbP, T1G, T3Y, T1J, T3V, T1K;
+		    E T7C, Tcj, Tci, TbW, TbT, T1S, TfV, TfW, T41, T48, Tc8, Tcb, T7E, T1Z, TfY;
+		    E TfZ, T4a, T4h, Tc1, Tc4, T7F;
+		    {
+			 E T1x, T1y, T1A, T1B;
+			 T1x = ci[0];
+			 T1y = cr[WS(rs, 31)];
+			 T1z = T1x + T1y;
+			 T1A = cr[WS(rs, 15)];
+			 T1B = ci[WS(rs, 16)];
+			 T1C = T1A + T1B;
+			 T1D = T1z + T1C;
+			 Tcf = T1A - T1B;
+			 TbO = T1x - T1y;
+		    }
+		    {
+			 E T4m, T4n, T4p, T4q;
+			 T4m = ci[WS(rs, 32)];
+			 T4n = cr[WS(rs, 63)];
+			 T4o = T4m - T4n;
+			 T4p = ci[WS(rs, 48)];
+			 T4q = cr[WS(rs, 47)];
+			 T4r = T4p - T4q;
+			 T7B = T4o + T4r;
+			 Tcg = T4m + T4n;
+			 TbP = T4p + T4q;
+		    }
+		    {
+			 E TbR, TbS, TbU, TbV;
+			 {
+			      E T1E, T1F, T3W, T3X;
+			      T1E = cr[WS(rs, 7)];
+			      T1F = ci[WS(rs, 24)];
+			      T1G = T1E + T1F;
+			      TbR = T1E - T1F;
+			      T3W = ci[WS(rs, 56)];
+			      T3X = cr[WS(rs, 39)];
+			      T3Y = T3W - T3X;
+			      TbS = T3W + T3X;
+			 }
+			 {
+			      E T1H, T1I, T3T, T3U;
+			      T1H = ci[WS(rs, 8)];
+			      T1I = cr[WS(rs, 23)];
+			      T1J = T1H + T1I;
+			      TbU = T1H - T1I;
+			      T3T = ci[WS(rs, 40)];
+			      T3U = cr[WS(rs, 55)];
+			      T3V = T3T - T3U;
+			      TbV = T3T + T3U;
+			 }
+			 T1K = T1G + T1J;
+			 T7C = T3Y + T3V;
+			 Tcj = TbU + TbV;
+			 Tci = TbR + TbS;
+			 TbW = TbU - TbV;
+			 TbT = TbR - TbS;
+		    }
+		    {
+			 E T1O, Tc9, T47, Tca, T1R, Tc6, T44, Tc7;
+			 {
+			      E T1M, T1N, T45, T46;
+			      T1M = cr[WS(rs, 3)];
+			      T1N = ci[WS(rs, 28)];
+			      T1O = T1M + T1N;
+			      Tc9 = T1M - T1N;
+			      T45 = ci[WS(rs, 44)];
+			      T46 = cr[WS(rs, 51)];
+			      T47 = T45 - T46;
+			      Tca = T45 + T46;
+			 }
+			 {
+			      E T1P, T1Q, T42, T43;
+			      T1P = cr[WS(rs, 19)];
+			      T1Q = ci[WS(rs, 12)];
+			      T1R = T1P + T1Q;
+			      Tc6 = T1P - T1Q;
+			      T42 = ci[WS(rs, 60)];
+			      T43 = cr[WS(rs, 35)];
+			      T44 = T42 - T43;
+			      Tc7 = T42 + T43;
+			 }
+			 T1S = T1O + T1R;
+			 TfV = Tc9 + Tca;
+			 TfW = Tc7 - Tc6;
+			 T41 = T1O - T1R;
+			 T48 = T44 - T47;
+			 Tc8 = Tc6 + Tc7;
+			 Tcb = Tc9 - Tca;
+			 T7E = T44 + T47;
+		    }
+		    {
+			 E T1V, Tc2, T4g, Tc3, T1Y, TbZ, T4d, Tc0;
+			 {
+			      E T1T, T1U, T4e, T4f;
+			      T1T = ci[WS(rs, 4)];
+			      T1U = cr[WS(rs, 27)];
+			      T1V = T1T + T1U;
+			      Tc2 = T1T - T1U;
+			      T4e = ci[WS(rs, 52)];
+			      T4f = cr[WS(rs, 43)];
+			      T4g = T4e - T4f;
+			      Tc3 = T4e + T4f;
+			 }
+			 {
+			      E T1W, T1X, T4b, T4c;
+			      T1W = cr[WS(rs, 11)];
+			      T1X = ci[WS(rs, 20)];
+			      T1Y = T1W + T1X;
+			      TbZ = T1W - T1X;
+			      T4b = ci[WS(rs, 36)];
+			      T4c = cr[WS(rs, 59)];
+			      T4d = T4b - T4c;
+			      Tc0 = T4b + T4c;
+			 }
+			 T1Z = T1V + T1Y;
+			 TfY = Tc2 + Tc3;
+			 TfZ = TbZ + Tc0;
+			 T4a = T1V - T1Y;
+			 T4h = T4d - T4g;
+			 Tc1 = TbZ - Tc0;
+			 Tc4 = Tc2 - Tc3;
+			 T7F = T4d + T4g;
+		    }
+		    T1L = T1D + T1K;
+		    T20 = T1S + T1Z;
+		    T7A = T1L - T20;
+		    T7D = T7B + T7C;
+		    T7G = T7E + T7F;
+		    T7H = T7D - T7G;
+		    {
+			 E T3S, T3Z, TfX, Tg0;
+			 T3S = T1z - T1C;
+			 T3Z = T3V - T3Y;
+			 T40 = T3S + T3Z;
+			 T62 = T3S - T3Z;
+			 TfX = FNMS(KP923879532, TfW, KP382683432 * TfV);
+			 Tg0 = FNMS(KP923879532, TfZ, KP382683432 * TfY);
+			 Tg1 = TfX + Tg0;
+			 Thv = TfX - Tg0;
+		    }
+		    {
+			 E Tg6, Tg7, Tg3, Tg4;
+			 Tg6 = FMA(KP382683432, TfW, KP923879532 * TfV);
+			 Tg7 = FMA(KP382683432, TfZ, KP923879532 * TfY);
+			 Tg8 = Tg6 - Tg7;
+			 Thz = Tg6 + Tg7;
+			 Tg3 = KP707106781 * (TbT - TbW);
+			 Tg4 = Tcf + Tcg;
+			 Tg5 = Tg3 - Tg4;
+			 Thw = Tg4 + Tg3;
+		    }
+		    {
+			 E T4l, T4s, T49, T4i;
+			 T4l = T1G - T1J;
+			 T4s = T4o - T4r;
+			 T4t = T4l + T4s;
+			 T5Z = T4s - T4l;
+			 T49 = T41 - T48;
+			 T4i = T4a + T4h;
+			 T4j = KP707106781 * (T49 + T4i);
+			 T60 = KP707106781 * (T49 - T4i);
+		    }
+		    {
+			 E T4u, T4v, TbQ, TbX;
+			 T4u = T41 + T48;
+			 T4v = T4h - T4a;
+			 T4w = KP707106781 * (T4u + T4v);
+			 T63 = KP707106781 * (T4v - T4u);
+			 TbQ = TbO - TbP;
+			 TbX = KP707106781 * (TbT + TbW);
+			 TbY = TbQ - TbX;
+			 TdS = TbQ + TbX;
+		    }
+		    {
+			 E Tc5, Tcc, TfS, TfT;
+			 Tc5 = FNMS(KP382683432, Tc4, KP923879532 * Tc1);
+			 Tcc = FMA(KP923879532, Tc8, KP382683432 * Tcb);
+			 Tcd = Tc5 - Tcc;
+			 TdQ = Tcc + Tc5;
+			 TfS = TbO + TbP;
+			 TfT = KP707106781 * (Tci + Tcj);
+			 TfU = TfS - TfT;
+			 Thy = TfS + TfT;
+		    }
+		    {
+			 E T8N, T8O, T8Q, T8R;
+			 T8N = T7B - T7C;
+			 T8O = T1S - T1Z;
+			 T8P = T8N - T8O;
+			 T9z = T8O + T8N;
+			 T8Q = T1D - T1K;
+			 T8R = T7F - T7E;
+			 T8S = T8Q - T8R;
+			 T9A = T8Q + T8R;
+		    }
+		    {
+			 E Tch, Tck, Tcm, Tcn;
+			 Tch = Tcf - Tcg;
+			 Tck = KP707106781 * (Tci - Tcj);
+			 Tcl = Tch - Tck;
+			 TdP = Tch + Tck;
+			 Tcm = FNMS(KP382683432, Tc8, KP923879532 * Tcb);
+			 Tcn = FMA(KP382683432, Tc1, KP923879532 * Tc4);
+			 Tco = Tcm - Tcn;
+			 TdT = Tcm + Tcn;
+		    }
+	       }
+	       {
+		    E T14, T17, T18, TbC, Tbb, T3H, T3K, T7s, TbD, Tbc, T1b, T3h, T1e, T3e, T1f;
+		    E T7t, TbG, TbF, Tbj, Tbg, T1n, TfC, TfD, T3k, T3r, Tbv, Tby, T7v, T1u, TfF;
+		    E TfG, T3t, T3A, Tbo, Tbr, T7w;
+		    {
+			 E T12, T13, T15, T16;
+			 T12 = cr[WS(rs, 1)];
+			 T13 = ci[WS(rs, 30)];
+			 T14 = T12 + T13;
+			 T15 = cr[WS(rs, 17)];
+			 T16 = ci[WS(rs, 14)];
+			 T17 = T15 + T16;
+			 T18 = T14 + T17;
+			 TbC = T15 - T16;
+			 Tbb = T12 - T13;
+		    }
+		    {
+			 E T3F, T3G, T3I, T3J;
+			 T3F = ci[WS(rs, 62)];
+			 T3G = cr[WS(rs, 33)];
+			 T3H = T3F - T3G;
+			 T3I = ci[WS(rs, 46)];
+			 T3J = cr[WS(rs, 49)];
+			 T3K = T3I - T3J;
+			 T7s = T3H + T3K;
+			 TbD = T3F + T3G;
+			 Tbc = T3I + T3J;
+		    }
+		    {
+			 E Tbe, Tbf, Tbh, Tbi;
+			 {
+			      E T19, T1a, T3f, T3g;
+			      T19 = cr[WS(rs, 9)];
+			      T1a = ci[WS(rs, 22)];
+			      T1b = T19 + T1a;
+			      Tbe = T19 - T1a;
+			      T3f = ci[WS(rs, 54)];
+			      T3g = cr[WS(rs, 41)];
+			      T3h = T3f - T3g;
+			      Tbf = T3f + T3g;
+			 }
+			 {
+			      E T1c, T1d, T3c, T3d;
+			      T1c = ci[WS(rs, 6)];
+			      T1d = cr[WS(rs, 25)];
+			      T1e = T1c + T1d;
+			      Tbh = T1c - T1d;
+			      T3c = ci[WS(rs, 38)];
+			      T3d = cr[WS(rs, 57)];
+			      T3e = T3c - T3d;
+			      Tbi = T3c + T3d;
+			 }
+			 T1f = T1b + T1e;
+			 T7t = T3h + T3e;
+			 TbG = Tbh + Tbi;
+			 TbF = Tbe + Tbf;
+			 Tbj = Tbh - Tbi;
+			 Tbg = Tbe - Tbf;
+		    }
+		    {
+			 E T1j, Tbw, T3q, Tbx, T1m, Tbt, T3n, Tbu;
+			 {
+			      E T1h, T1i, T3o, T3p;
+			      T1h = cr[WS(rs, 5)];
+			      T1i = ci[WS(rs, 26)];
+			      T1j = T1h + T1i;
+			      Tbw = T1h - T1i;
+			      T3o = ci[WS(rs, 42)];
+			      T3p = cr[WS(rs, 53)];
+			      T3q = T3o - T3p;
+			      Tbx = T3o + T3p;
+			 }
+			 {
+			      E T1k, T1l, T3l, T3m;
+			      T1k = cr[WS(rs, 21)];
+			      T1l = ci[WS(rs, 10)];
+			      T1m = T1k + T1l;
+			      Tbt = T1k - T1l;
+			      T3l = ci[WS(rs, 58)];
+			      T3m = cr[WS(rs, 37)];
+			      T3n = T3l - T3m;
+			      Tbu = T3l + T3m;
+			 }
+			 T1n = T1j + T1m;
+			 TfC = Tbw + Tbx;
+			 TfD = Tbu - Tbt;
+			 T3k = T1j - T1m;
+			 T3r = T3n - T3q;
+			 Tbv = Tbt + Tbu;
+			 Tby = Tbw - Tbx;
+			 T7v = T3n + T3q;
+		    }
+		    {
+			 E T1q, Tbp, T3z, Tbq, T1t, Tbm, T3w, Tbn;
+			 {
+			      E T1o, T1p, T3x, T3y;
+			      T1o = ci[WS(rs, 2)];
+			      T1p = cr[WS(rs, 29)];
+			      T1q = T1o + T1p;
+			      Tbp = T1o - T1p;
+			      T3x = ci[WS(rs, 50)];
+			      T3y = cr[WS(rs, 45)];
+			      T3z = T3x - T3y;
+			      Tbq = T3x + T3y;
+			 }
+			 {
+			      E T1r, T1s, T3u, T3v;
+			      T1r = cr[WS(rs, 13)];
+			      T1s = ci[WS(rs, 18)];
+			      T1t = T1r + T1s;
+			      Tbm = T1r - T1s;
+			      T3u = ci[WS(rs, 34)];
+			      T3v = cr[WS(rs, 61)];
+			      T3w = T3u - T3v;
+			      Tbn = T3u + T3v;
+			 }
+			 T1u = T1q + T1t;
+			 TfF = Tbp + Tbq;
+			 TfG = Tbm + Tbn;
+			 T3t = T1q - T1t;
+			 T3A = T3w - T3z;
+			 Tbo = Tbm - Tbn;
+			 Tbr = Tbp - Tbq;
+			 T7w = T3w + T3z;
+		    }
+		    T1g = T18 + T1f;
+		    T1v = T1n + T1u;
+		    T7r = T1g - T1v;
+		    T7u = T7s + T7t;
+		    T7x = T7v + T7w;
+		    T7y = T7u - T7x;
+		    {
+			 E T3b, T3i, TfE, TfH;
+			 T3b = T14 - T17;
+			 T3i = T3e - T3h;
+			 T3j = T3b + T3i;
+			 T69 = T3b - T3i;
+			 TfE = FNMS(KP923879532, TfD, KP382683432 * TfC);
+			 TfH = FNMS(KP923879532, TfG, KP382683432 * TfF);
+			 TfI = TfE + TfH;
+			 ThD = TfE - TfH;
+		    }
+		    {
+			 E TfN, TfO, TfK, TfL;
+			 TfN = FMA(KP382683432, TfD, KP923879532 * TfC);
+			 TfO = FMA(KP382683432, TfG, KP923879532 * TfF);
+			 TfP = TfN - TfO;
+			 ThG = TfN + TfO;
+			 TfK = TbD - TbC;
+			 TfL = KP707106781 * (Tbg - Tbj);
+			 TfM = TfK + TfL;
+			 ThC = TfK - TfL;
+		    }
+		    {
+			 E T3E, T3L, T3s, T3B;
+			 T3E = T1b - T1e;
+			 T3L = T3H - T3K;
+			 T3M = T3E + T3L;
+			 T66 = T3L - T3E;
+			 T3s = T3k - T3r;
+			 T3B = T3t + T3A;
+			 T3C = KP707106781 * (T3s + T3B);
+			 T67 = KP707106781 * (T3s - T3B);
+		    }
+		    {
+			 E T3N, T3O, Tbd, Tbk;
+			 T3N = T3k + T3r;
+			 T3O = T3A - T3t;
+			 T3P = KP707106781 * (T3N + T3O);
+			 T6a = KP707106781 * (T3O - T3N);
+			 Tbd = Tbb - Tbc;
+			 Tbk = KP707106781 * (Tbg + Tbj);
+			 Tbl = Tbd - Tbk;
+			 TdZ = Tbd + Tbk;
+		    }
+		    {
+			 E Tbs, Tbz, Tfz, TfA;
+			 Tbs = FNMS(KP382683432, Tbr, KP923879532 * Tbo);
+			 Tbz = FMA(KP923879532, Tbv, KP382683432 * Tby);
+			 TbA = Tbs - Tbz;
+			 TdX = Tbz + Tbs;
+			 Tfz = Tbb + Tbc;
+			 TfA = KP707106781 * (TbF + TbG);
+			 TfB = Tfz - TfA;
+			 ThF = Tfz + TfA;
+		    }
+		    {
+			 E T8U, T8V, T8X, T8Y;
+			 T8U = T7s - T7t;
+			 T8V = T1n - T1u;
+			 T8W = T8U - T8V;
+			 T9C = T8V + T8U;
+			 T8X = T18 - T1f;
+			 T8Y = T7w - T7v;
+			 T8Z = T8X - T8Y;
+			 T9D = T8X + T8Y;
+		    }
+		    {
+			 E TbE, TbH, TbJ, TbK;
+			 TbE = TbC + TbD;
+			 TbH = KP707106781 * (TbF - TbG);
+			 TbI = TbE - TbH;
+			 TdW = TbE + TbH;
+			 TbJ = FNMS(KP382683432, Tbv, KP923879532 * Tby);
+			 TbK = FMA(KP382683432, Tbo, KP923879532 * Tbr);
+			 TbL = TbJ - TbK;
+			 Te0 = TbJ + TbK;
+		    }
+	       }
+	       {
+		    E T11, T8q, T8n, T8r, T22, T8v, T8k, T8u;
+		    {
+			 E Tv, T10, T8l, T8m;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T11 = Tv + T10;
+			 T8q = Tv - T10;
+			 T8l = T7u + T7x;
+			 T8m = T7D + T7G;
+			 T8n = T8l + T8m;
+			 T8r = T8m - T8l;
+		    }
+		    {
+			 E T1w, T21, T8i, T8j;
+			 T1w = T1g + T1v;
+			 T21 = T1L + T20;
+			 T22 = T1w + T21;
+			 T8v = T1w - T21;
+			 T8i = T7P + T7S;
+			 T8j = T7o + T7l;
+			 T8k = T8i + T8j;
+			 T8u = T8i - T8j;
+		    }
+		    cr[0] = T11 + T22;
+		    ci[0] = T8k + T8n;
+		    {
+			 E T8g, T8o, T8f, T8h;
+			 T8g = T11 - T22;
+			 T8o = T8k - T8n;
+			 T8f = W[62];
+			 T8h = W[63];
+			 cr[WS(rs, 32)] = FNMS(T8h, T8o, T8f * T8g);
+			 ci[WS(rs, 32)] = FMA(T8h, T8g, T8f * T8o);
+		    }
+		    {
+			 E T8s, T8w, T8p, T8t;
+			 T8s = T8q - T8r;
+			 T8w = T8u - T8v;
+			 T8p = W[94];
+			 T8t = W[95];
+			 cr[WS(rs, 48)] = FNMS(T8t, T8w, T8p * T8s);
+			 ci[WS(rs, 48)] = FMA(T8p, T8w, T8t * T8s);
+		    }
+		    {
+			 E T8y, T8A, T8x, T8z;
+			 T8y = T8q + T8r;
+			 T8A = T8v + T8u;
+			 T8x = W[30];
+			 T8z = W[31];
+			 cr[WS(rs, 16)] = FNMS(T8z, T8A, T8x * T8y);
+			 ci[WS(rs, 16)] = FMA(T8x, T8A, T8z * T8y);
+		    }
+	       }
+	       {
+		    E T9y, T9U, T9N, T9V, T9F, T9Z, T9K, T9Y;
+		    {
+			 E T9w, T9x, T9L, T9M;
+			 T9w = T8C + T8D;
+			 T9x = KP707106781 * (T97 + T98);
+			 T9y = T9w - T9x;
+			 T9U = T9w + T9x;
+			 T9L = FNMS(KP382683432, T9C, KP923879532 * T9D);
+			 T9M = FMA(KP382683432, T9z, KP923879532 * T9A);
+			 T9N = T9L - T9M;
+			 T9V = T9L + T9M;
+		    }
+		    {
+			 E T9B, T9E, T9I, T9J;
+			 T9B = FNMS(KP382683432, T9A, KP923879532 * T9z);
+			 T9E = FMA(KP923879532, T9C, KP382683432 * T9D);
+			 T9F = T9B - T9E;
+			 T9Z = T9E + T9B;
+			 T9I = T95 + T94;
+			 T9J = KP707106781 * (T8K + T8H);
+			 T9K = T9I - T9J;
+			 T9Y = T9I + T9J;
+		    }
+		    {
+			 E T9G, T9O, T9v, T9H;
+			 T9G = T9y - T9F;
+			 T9O = T9K - T9N;
+			 T9v = W[102];
+			 T9H = W[103];
+			 cr[WS(rs, 52)] = FNMS(T9H, T9O, T9v * T9G);
+			 ci[WS(rs, 52)] = FMA(T9H, T9G, T9v * T9O);
+		    }
+		    {
+			 E Ta2, Ta4, Ta1, Ta3;
+			 Ta2 = T9U + T9V;
+			 Ta4 = T9Y + T9Z;
+			 Ta1 = W[6];
+			 Ta3 = W[7];
+			 cr[WS(rs, 4)] = FNMS(Ta3, Ta4, Ta1 * Ta2);
+			 ci[WS(rs, 4)] = FMA(Ta1, Ta4, Ta3 * Ta2);
+		    }
+		    {
+			 E T9Q, T9S, T9P, T9R;
+			 T9Q = T9y + T9F;
+			 T9S = T9K + T9N;
+			 T9P = W[38];
+			 T9R = W[39];
+			 cr[WS(rs, 20)] = FNMS(T9R, T9S, T9P * T9Q);
+			 ci[WS(rs, 20)] = FMA(T9R, T9Q, T9P * T9S);
+		    }
+		    {
+			 E T9W, Ta0, T9T, T9X;
+			 T9W = T9U - T9V;
+			 Ta0 = T9Y - T9Z;
+			 T9T = W[70];
+			 T9X = W[71];
+			 cr[WS(rs, 36)] = FNMS(T9X, Ta0, T9T * T9W);
+			 ci[WS(rs, 36)] = FMA(T9T, Ta0, T9X * T9W);
+		    }
+	       }
+	       {
+		    E T8M, T9k, T9d, T9l, T91, T9p, T9a, T9o;
+		    {
+			 E T8E, T8L, T9b, T9c;
+			 T8E = T8C - T8D;
+			 T8L = KP707106781 * (T8H - T8K);
+			 T8M = T8E - T8L;
+			 T9k = T8E + T8L;
+			 T9b = FNMS(KP923879532, T8W, KP382683432 * T8Z);
+			 T9c = FMA(KP923879532, T8P, KP382683432 * T8S);
+			 T9d = T9b - T9c;
+			 T9l = T9b + T9c;
+		    }
+		    {
+			 E T8T, T90, T96, T99;
+			 T8T = FNMS(KP923879532, T8S, KP382683432 * T8P);
+			 T90 = FMA(KP382683432, T8W, KP923879532 * T8Z);
+			 T91 = T8T - T90;
+			 T9p = T90 + T8T;
+			 T96 = T94 - T95;
+			 T99 = KP707106781 * (T97 - T98);
+			 T9a = T96 - T99;
+			 T9o = T96 + T99;
+		    }
+		    {
+			 E T92, T9e, T8B, T93;
+			 T92 = T8M - T91;
+			 T9e = T9a - T9d;
+			 T8B = W[118];
+			 T93 = W[119];
+			 cr[WS(rs, 60)] = FNMS(T93, T9e, T8B * T92);
+			 ci[WS(rs, 60)] = FMA(T93, T92, T8B * T9e);
+		    }
+		    {
+			 E T9s, T9u, T9r, T9t;
+			 T9s = T9k + T9l;
+			 T9u = T9o + T9p;
+			 T9r = W[22];
+			 T9t = W[23];
+			 cr[WS(rs, 12)] = FNMS(T9t, T9u, T9r * T9s);
+			 ci[WS(rs, 12)] = FMA(T9r, T9u, T9t * T9s);
+		    }
+		    {
+			 E T9g, T9i, T9f, T9h;
+			 T9g = T8M + T91;
+			 T9i = T9a + T9d;
+			 T9f = W[54];
+			 T9h = W[55];
+			 cr[WS(rs, 28)] = FNMS(T9h, T9i, T9f * T9g);
+			 ci[WS(rs, 28)] = FMA(T9h, T9g, T9f * T9i);
+		    }
+		    {
+			 E T9m, T9q, T9j, T9n;
+			 T9m = T9k - T9l;
+			 T9q = T9o - T9p;
+			 T9j = W[86];
+			 T9n = W[87];
+			 cr[WS(rs, 44)] = FNMS(T9n, T9q, T9j * T9m);
+			 ci[WS(rs, 44)] = FMA(T9j, T9q, T9n * T9m);
+		    }
+	       }
+	       {
+		    E T7q, T84, T7X, T85, T7J, T89, T7U, T88;
+		    {
+			 E T7i, T7p, T7V, T7W;
+			 T7i = Tf - Tu;
+			 T7p = T7l - T7o;
+			 T7q = T7i + T7p;
+			 T84 = T7i - T7p;
+			 T7V = T7r + T7y;
+			 T7W = T7H - T7A;
+			 T7X = KP707106781 * (T7V + T7W);
+			 T85 = KP707106781 * (T7W - T7V);
+		    }
+		    {
+			 E T7z, T7I, T7M, T7T;
+			 T7z = T7r - T7y;
+			 T7I = T7A + T7H;
+			 T7J = KP707106781 * (T7z + T7I);
+			 T89 = KP707106781 * (T7z - T7I);
+			 T7M = TK - TZ;
+			 T7T = T7P - T7S;
+			 T7U = T7M + T7T;
+			 T88 = T7T - T7M;
+		    }
+		    {
+			 E T7K, T7Y, T7h, T7L;
+			 T7K = T7q - T7J;
+			 T7Y = T7U - T7X;
+			 T7h = W[78];
+			 T7L = W[79];
+			 cr[WS(rs, 40)] = FNMS(T7L, T7Y, T7h * T7K);
+			 ci[WS(rs, 40)] = FMA(T7L, T7K, T7h * T7Y);
+		    }
+		    {
+			 E T8c, T8e, T8b, T8d;
+			 T8c = T84 + T85;
+			 T8e = T88 + T89;
+			 T8b = W[46];
+			 T8d = W[47];
+			 cr[WS(rs, 24)] = FNMS(T8d, T8e, T8b * T8c);
+			 ci[WS(rs, 24)] = FMA(T8b, T8e, T8d * T8c);
+		    }
+		    {
+			 E T80, T82, T7Z, T81;
+			 T80 = T7q + T7J;
+			 T82 = T7U + T7X;
+			 T7Z = W[14];
+			 T81 = W[15];
+			 cr[WS(rs, 8)] = FNMS(T81, T82, T7Z * T80);
+			 ci[WS(rs, 8)] = FMA(T81, T80, T7Z * T82);
+		    }
+		    {
+			 E T86, T8a, T83, T87;
+			 T86 = T84 - T85;
+			 T8a = T88 - T89;
+			 T83 = W[110];
+			 T87 = W[111];
+			 cr[WS(rs, 56)] = FNMS(T87, T8a, T83 * T86);
+			 ci[WS(rs, 56)] = FMA(T83, T8a, T87 * T86);
+		    }
+	       }
+	       {
+		    E T6K, T76, T6W, T7a, T6R, T7b, T6Z, T77;
+		    {
+			 E T6I, T6J, T6U, T6V;
+			 T6I = T5O + T5P;
+			 T6J = T6j + T6k;
+			 T6K = T6I - T6J;
+			 T76 = T6I + T6J;
+			 T6U = T6g + T6h;
+			 T6V = T5W + T5T;
+			 T6W = T6U - T6V;
+			 T7a = T6U + T6V;
+			 {
+			      E T6N, T6Y, T6Q, T6X;
+			      {
+				   E T6L, T6M, T6O, T6P;
+				   T6L = T5Z + T60;
+				   T6M = T62 + T63;
+				   T6N = FNMS(KP555570233, T6M, KP831469612 * T6L);
+				   T6Y = FMA(KP555570233, T6L, KP831469612 * T6M);
+				   T6O = T66 + T67;
+				   T6P = T69 + T6a;
+				   T6Q = FMA(KP831469612, T6O, KP555570233 * T6P);
+				   T6X = FNMS(KP555570233, T6O, KP831469612 * T6P);
+			      }
+			      T6R = T6N - T6Q;
+			      T7b = T6Q + T6N;
+			      T6Z = T6X - T6Y;
+			      T77 = T6X + T6Y;
+			 }
+		    }
+		    {
+			 E T6S, T70, T6H, T6T;
+			 T6S = T6K - T6R;
+			 T70 = T6W - T6Z;
+			 T6H = W[106];
+			 T6T = W[107];
+			 cr[WS(rs, 54)] = FNMS(T6T, T70, T6H * T6S);
+			 ci[WS(rs, 54)] = FMA(T6T, T6S, T6H * T70);
+		    }
+		    {
+			 E T7e, T7g, T7d, T7f;
+			 T7e = T76 + T77;
+			 T7g = T7a + T7b;
+			 T7d = W[10];
+			 T7f = W[11];
+			 cr[WS(rs, 6)] = FNMS(T7f, T7g, T7d * T7e);
+			 ci[WS(rs, 6)] = FMA(T7d, T7g, T7f * T7e);
+		    }
+		    {
+			 E T72, T74, T71, T73;
+			 T72 = T6K + T6R;
+			 T74 = T6W + T6Z;
+			 T71 = W[42];
+			 T73 = W[43];
+			 cr[WS(rs, 22)] = FNMS(T73, T74, T71 * T72);
+			 ci[WS(rs, 22)] = FMA(T73, T72, T71 * T74);
+		    }
+		    {
+			 E T78, T7c, T75, T79;
+			 T78 = T76 - T77;
+			 T7c = T7a - T7b;
+			 T75 = W[74];
+			 T79 = W[75];
+			 cr[WS(rs, 38)] = FNMS(T79, T7c, T75 * T78);
+			 ci[WS(rs, 38)] = FMA(T75, T7c, T79 * T78);
+		    }
+	       }
+	       {
+		    E T3a, T52, T4S, T56, T4z, T57, T4V, T53;
+		    {
+			 E T2w, T39, T4O, T4R;
+			 T2w = T2c - T2v;
+			 T39 = T2P - T38;
+			 T3a = T2w + T39;
+			 T52 = T2w - T39;
+			 T4O = T4K - T4N;
+			 T4R = T4P - T4Q;
+			 T4S = T4O + T4R;
+			 T56 = T4O - T4R;
+			 {
+			      E T3R, T4T, T4y, T4U;
+			      {
+				   E T3D, T3Q, T4k, T4x;
+				   T3D = T3j - T3C;
+				   T3Q = T3M - T3P;
+				   T3R = FNMS(KP831469612, T3Q, KP555570233 * T3D);
+				   T4T = FMA(KP831469612, T3D, KP555570233 * T3Q);
+				   T4k = T40 - T4j;
+				   T4x = T4t - T4w;
+				   T4y = FMA(KP555570233, T4k, KP831469612 * T4x);
+				   T4U = FNMS(KP831469612, T4k, KP555570233 * T4x);
+			      }
+			      T4z = T3R + T4y;
+			      T57 = T3R - T4y;
+			      T4V = T4T + T4U;
+			      T53 = T4U - T4T;
+			 }
+		    }
+		    {
+			 E T4A, T4W, T23, T4B;
+			 T4A = T3a - T4z;
+			 T4W = T4S - T4V;
+			 T23 = W[82];
+			 T4B = W[83];
+			 cr[WS(rs, 42)] = FNMS(T4B, T4W, T23 * T4A);
+			 ci[WS(rs, 42)] = FMA(T4B, T4A, T23 * T4W);
+		    }
+		    {
+			 E T5a, T5c, T59, T5b;
+			 T5a = T52 + T53;
+			 T5c = T56 + T57;
+			 T59 = W[50];
+			 T5b = W[51];
+			 cr[WS(rs, 26)] = FNMS(T5b, T5c, T59 * T5a);
+			 ci[WS(rs, 26)] = FMA(T59, T5c, T5b * T5a);
+		    }
+		    {
+			 E T4Y, T50, T4X, T4Z;
+			 T4Y = T3a + T4z;
+			 T50 = T4S + T4V;
+			 T4X = W[18];
+			 T4Z = W[19];
+			 cr[WS(rs, 10)] = FNMS(T4Z, T50, T4X * T4Y);
+			 ci[WS(rs, 10)] = FMA(T4Z, T4Y, T4X * T50);
+		    }
+		    {
+			 E T54, T58, T51, T55;
+			 T54 = T52 - T53;
+			 T58 = T56 - T57;
+			 T51 = W[114];
+			 T55 = W[115];
+			 cr[WS(rs, 58)] = FNMS(T55, T58, T51 * T54);
+			 ci[WS(rs, 58)] = FMA(T51, T58, T55 * T54);
+		    }
+	       }
+	       {
+		    E T5g, T5C, T5s, T5G, T5n, T5H, T5v, T5D;
+		    {
+			 E T5e, T5f, T5q, T5r;
+			 T5e = T2c + T2v;
+			 T5f = T4P + T4Q;
+			 T5g = T5e + T5f;
+			 T5C = T5e - T5f;
+			 T5q = T4K + T4N;
+			 T5r = T38 + T2P;
+			 T5s = T5q + T5r;
+			 T5G = T5q - T5r;
+			 {
+			      E T5j, T5t, T5m, T5u;
+			      {
+				   E T5h, T5i, T5k, T5l;
+				   T5h = T3j + T3C;
+				   T5i = T3M + T3P;
+				   T5j = FNMS(KP195090322, T5i, KP980785280 * T5h);
+				   T5t = FMA(KP195090322, T5h, KP980785280 * T5i);
+				   T5k = T40 + T4j;
+				   T5l = T4t + T4w;
+				   T5m = FMA(KP980785280, T5k, KP195090322 * T5l);
+				   T5u = FNMS(KP195090322, T5k, KP980785280 * T5l);
+			      }
+			      T5n = T5j + T5m;
+			      T5H = T5j - T5m;
+			      T5v = T5t + T5u;
+			      T5D = T5u - T5t;
+			 }
+		    }
+		    {
+			 E T5o, T5w, T5d, T5p;
+			 T5o = T5g - T5n;
+			 T5w = T5s - T5v;
+			 T5d = W[66];
+			 T5p = W[67];
+			 cr[WS(rs, 34)] = FNMS(T5p, T5w, T5d * T5o);
+			 ci[WS(rs, 34)] = FMA(T5p, T5o, T5d * T5w);
+		    }
+		    {
+			 E T5K, T5M, T5J, T5L;
+			 T5K = T5C + T5D;
+			 T5M = T5G + T5H;
+			 T5J = W[34];
+			 T5L = W[35];
+			 cr[WS(rs, 18)] = FNMS(T5L, T5M, T5J * T5K);
+			 ci[WS(rs, 18)] = FMA(T5J, T5M, T5L * T5K);
+		    }
+		    {
+			 E T5y, T5A, T5x, T5z;
+			 T5y = T5g + T5n;
+			 T5A = T5s + T5v;
+			 T5x = W[2];
+			 T5z = W[3];
+			 cr[WS(rs, 2)] = FNMS(T5z, T5A, T5x * T5y);
+			 ci[WS(rs, 2)] = FMA(T5z, T5y, T5x * T5A);
+		    }
+		    {
+			 E T5E, T5I, T5B, T5F;
+			 T5E = T5C - T5D;
+			 T5I = T5G - T5H;
+			 T5B = W[98];
+			 T5F = W[99];
+			 cr[WS(rs, 50)] = FNMS(T5F, T5I, T5B * T5E);
+			 ci[WS(rs, 50)] = FMA(T5B, T5I, T5F * T5E);
+		    }
+	       }
+	       {
+		    E T5Y, T6w, T6m, T6A, T6d, T6B, T6p, T6x;
+		    {
+			 E T5Q, T5X, T6i, T6l;
+			 T5Q = T5O - T5P;
+			 T5X = T5T - T5W;
+			 T5Y = T5Q - T5X;
+			 T6w = T5Q + T5X;
+			 T6i = T6g - T6h;
+			 T6l = T6j - T6k;
+			 T6m = T6i - T6l;
+			 T6A = T6i + T6l;
+			 {
+			      E T65, T6o, T6c, T6n;
+			      {
+				   E T61, T64, T68, T6b;
+				   T61 = T5Z - T60;
+				   T64 = T62 - T63;
+				   T65 = FNMS(KP980785280, T64, KP195090322 * T61);
+				   T6o = FMA(KP980785280, T61, KP195090322 * T64);
+				   T68 = T66 - T67;
+				   T6b = T69 - T6a;
+				   T6c = FMA(KP195090322, T68, KP980785280 * T6b);
+				   T6n = FNMS(KP980785280, T68, KP195090322 * T6b);
+			      }
+			      T6d = T65 - T6c;
+			      T6B = T6c + T65;
+			      T6p = T6n - T6o;
+			      T6x = T6n + T6o;
+			 }
+		    }
+		    {
+			 E T6e, T6q, T5N, T6f;
+			 T6e = T5Y - T6d;
+			 T6q = T6m - T6p;
+			 T5N = W[122];
+			 T6f = W[123];
+			 cr[WS(rs, 62)] = FNMS(T6f, T6q, T5N * T6e);
+			 ci[WS(rs, 62)] = FMA(T6f, T6e, T5N * T6q);
+		    }
+		    {
+			 E T6E, T6G, T6D, T6F;
+			 T6E = T6w + T6x;
+			 T6G = T6A + T6B;
+			 T6D = W[26];
+			 T6F = W[27];
+			 cr[WS(rs, 14)] = FNMS(T6F, T6G, T6D * T6E);
+			 ci[WS(rs, 14)] = FMA(T6D, T6G, T6F * T6E);
+		    }
+		    {
+			 E T6s, T6u, T6r, T6t;
+			 T6s = T5Y + T6d;
+			 T6u = T6m + T6p;
+			 T6r = W[58];
+			 T6t = W[59];
+			 cr[WS(rs, 30)] = FNMS(T6t, T6u, T6r * T6s);
+			 ci[WS(rs, 30)] = FMA(T6t, T6s, T6r * T6u);
+		    }
+		    {
+			 E T6y, T6C, T6v, T6z;
+			 T6y = T6w - T6x;
+			 T6C = T6A - T6B;
+			 T6v = W[90];
+			 T6z = W[91];
+			 cr[WS(rs, 46)] = FNMS(T6z, T6C, T6v * T6y);
+			 ci[WS(rs, 46)] = FMA(T6v, T6C, T6z * T6y);
+		    }
+	       }
+	       {
+		    E Tba, Tdw, TcS, Tdi, TcI, Tds, TcW, Td6, Tcr, TcX, TcL, TcT, Tdd, Tdx, Tdl;
+		    E Tdt;
+		    {
+			 E Taw, Tdg, Tb9, Tdh, TaP, Tb8;
+			 Taw = Tag - Tav;
+			 Tdg = TcA + TcD;
+			 TaP = FNMS(KP831469612, TaO, KP555570233 * TaH);
+			 Tb8 = FMA(KP831469612, Tb0, KP555570233 * Tb7);
+			 Tb9 = TaP - Tb8;
+			 Tdh = Tb8 + TaP;
+			 Tba = Taw + Tb9;
+			 Tdw = Tdg - Tdh;
+			 TcS = Taw - Tb9;
+			 Tdi = Tdg + Tdh;
+		    }
+		    {
+			 E TcE, Td4, TcH, Td5, TcF, TcG;
+			 TcE = TcA - TcD;
+			 Td4 = Tag + Tav;
+			 TcF = FNMS(KP831469612, Tb7, KP555570233 * Tb0);
+			 TcG = FMA(KP555570233, TaO, KP831469612 * TaH);
+			 TcH = TcF - TcG;
+			 Td5 = TcF + TcG;
+			 TcI = TcE + TcH;
+			 Tds = Td4 - Td5;
+			 TcW = TcE - TcH;
+			 Td6 = Td4 + Td5;
+		    }
+		    {
+			 E TbN, TcJ, Tcq, TcK;
+			 {
+			      E TbB, TbM, Tce, Tcp;
+			      TbB = Tbl - TbA;
+			      TbM = TbI - TbL;
+			      TbN = FNMS(KP956940335, TbM, KP290284677 * TbB);
+			      TcJ = FMA(KP956940335, TbB, KP290284677 * TbM);
+			      Tce = TbY - Tcd;
+			      Tcp = Tcl - Tco;
+			      Tcq = FMA(KP290284677, Tce, KP956940335 * Tcp);
+			      TcK = FNMS(KP956940335, Tce, KP290284677 * Tcp);
+			 }
+			 Tcr = TbN + Tcq;
+			 TcX = TbN - Tcq;
+			 TcL = TcJ + TcK;
+			 TcT = TcK - TcJ;
+		    }
+		    {
+			 E Td9, Tdj, Tdc, Tdk;
+			 {
+			      E Td7, Td8, Tda, Tdb;
+			      Td7 = Tbl + TbA;
+			      Td8 = TbI + TbL;
+			      Td9 = FNMS(KP471396736, Td8, KP881921264 * Td7);
+			      Tdj = FMA(KP471396736, Td7, KP881921264 * Td8);
+			      Tda = TbY + Tcd;
+			      Tdb = Tcl + Tco;
+			      Tdc = FMA(KP881921264, Tda, KP471396736 * Tdb);
+			      Tdk = FNMS(KP471396736, Tda, KP881921264 * Tdb);
+			 }
+			 Tdd = Td9 + Tdc;
+			 Tdx = Td9 - Tdc;
+			 Tdl = Tdj + Tdk;
+			 Tdt = Tdk - Tdj;
+		    }
+		    {
+			 E Tcs, TcM, Ta5, Tct;
+			 Tcs = Tba - Tcr;
+			 TcM = TcI - TcL;
+			 Ta5 = W[88];
+			 Tct = W[89];
+			 cr[WS(rs, 45)] = FNMS(Tct, TcM, Ta5 * Tcs);
+			 ci[WS(rs, 45)] = FMA(Tct, Tcs, Ta5 * TcM);
+		    }
+		    {
+			 E Tdu, Tdy, Tdr, Tdv;
+			 Tdu = Tds - Tdt;
+			 Tdy = Tdw - Tdx;
+			 Tdr = W[104];
+			 Tdv = W[105];
+			 cr[WS(rs, 53)] = FNMS(Tdv, Tdy, Tdr * Tdu);
+			 ci[WS(rs, 53)] = FMA(Tdr, Tdy, Tdv * Tdu);
+		    }
+		    {
+			 E TdA, TdC, Tdz, TdB;
+			 TdA = Tds + Tdt;
+			 TdC = Tdw + Tdx;
+			 Tdz = W[40];
+			 TdB = W[41];
+			 cr[WS(rs, 21)] = FNMS(TdB, TdC, Tdz * TdA);
+			 ci[WS(rs, 21)] = FMA(Tdz, TdC, TdB * TdA);
+		    }
+		    {
+			 E TcO, TcQ, TcN, TcP;
+			 TcO = Tba + Tcr;
+			 TcQ = TcI + TcL;
+			 TcN = W[24];
+			 TcP = W[25];
+			 cr[WS(rs, 13)] = FNMS(TcP, TcQ, TcN * TcO);
+			 ci[WS(rs, 13)] = FMA(TcP, TcO, TcN * TcQ);
+		    }
+		    {
+			 E TcU, TcY, TcR, TcV;
+			 TcU = TcS - TcT;
+			 TcY = TcW - TcX;
+			 TcR = W[120];
+			 TcV = W[121];
+			 cr[WS(rs, 61)] = FNMS(TcV, TcY, TcR * TcU);
+			 ci[WS(rs, 61)] = FMA(TcR, TcY, TcV * TcU);
+		    }
+		    {
+			 E Tde, Tdm, Td3, Tdf;
+			 Tde = Td6 - Tdd;
+			 Tdm = Tdi - Tdl;
+			 Td3 = W[72];
+			 Tdf = W[73];
+			 cr[WS(rs, 37)] = FNMS(Tdf, Tdm, Td3 * Tde);
+			 ci[WS(rs, 37)] = FMA(Tdf, Tde, Td3 * Tdm);
+		    }
+		    {
+			 E Tdo, Tdq, Tdn, Tdp;
+			 Tdo = Td6 + Tdd;
+			 Tdq = Tdi + Tdl;
+			 Tdn = W[8];
+			 Tdp = W[9];
+			 cr[WS(rs, 5)] = FNMS(Tdp, Tdq, Tdn * Tdo);
+			 ci[WS(rs, 5)] = FMA(Tdp, Tdo, Tdn * Tdq);
+		    }
+		    {
+			 E Td0, Td2, TcZ, Td1;
+			 Td0 = TcS + TcT;
+			 Td2 = TcW + TcX;
+			 TcZ = W[56];
+			 Td1 = W[57];
+			 cr[WS(rs, 29)] = FNMS(Td1, Td2, TcZ * Td0);
+			 ci[WS(rs, 29)] = FMA(TcZ, Td2, Td1 * Td0);
+		    }
+	       }
+	       {
+		    E Tfy, Thc, Tgy, TgY, Tgo, Th8, TgC, TgM, Tgb, TgD, Tgr, Tgz, TgT, Thd, Th1;
+		    E Th9;
+		    {
+			 E Tfi, TgW, Tfx, TgX, Tfp, Tfw;
+			 Tfi = Tfa - Tfh;
+			 TgW = Tgg + Tgj;
+			 Tfp = FNMS(KP555570233, Tfo, KP831469612 * Tfl);
+			 Tfw = FMA(KP831469612, Tfs, KP555570233 * Tfv);
+			 Tfx = Tfp - Tfw;
+			 TgX = Tfw + Tfp;
+			 Tfy = Tfi + Tfx;
+			 Thc = TgW - TgX;
+			 Tgy = Tfi - Tfx;
+			 TgY = TgW + TgX;
+		    }
+		    {
+			 E Tgk, TgK, Tgn, TgL, Tgl, Tgm;
+			 Tgk = Tgg - Tgj;
+			 TgK = Tfa + Tfh;
+			 Tgl = FNMS(KP555570233, Tfs, KP831469612 * Tfv);
+			 Tgm = FMA(KP555570233, Tfl, KP831469612 * Tfo);
+			 Tgn = Tgl - Tgm;
+			 TgL = Tgl + Tgm;
+			 Tgo = Tgk + Tgn;
+			 Th8 = TgK - TgL;
+			 TgC = Tgk - Tgn;
+			 TgM = TgK + TgL;
+		    }
+		    {
+			 E TfR, Tgp, Tga, Tgq;
+			 {
+			      E TfJ, TfQ, Tg2, Tg9;
+			      TfJ = TfB - TfI;
+			      TfQ = TfM - TfP;
+			      TfR = FNMS(KP881921264, TfQ, KP471396736 * TfJ);
+			      Tgp = FMA(KP881921264, TfJ, KP471396736 * TfQ);
+			      Tg2 = TfU - Tg1;
+			      Tg9 = Tg5 - Tg8;
+			      Tga = FMA(KP471396736, Tg2, KP881921264 * Tg9);
+			      Tgq = FNMS(KP881921264, Tg2, KP471396736 * Tg9);
+			 }
+			 Tgb = TfR + Tga;
+			 TgD = TfR - Tga;
+			 Tgr = Tgp + Tgq;
+			 Tgz = Tgq - Tgp;
+		    }
+		    {
+			 E TgP, TgZ, TgS, Th0;
+			 {
+			      E TgN, TgO, TgQ, TgR;
+			      TgN = TfB + TfI;
+			      TgO = TfM + TfP;
+			      TgP = FNMS(KP290284677, TgO, KP956940335 * TgN);
+			      TgZ = FMA(KP290284677, TgN, KP956940335 * TgO);
+			      TgQ = TfU + Tg1;
+			      TgR = Tg5 + Tg8;
+			      TgS = FMA(KP956940335, TgQ, KP290284677 * TgR);
+			      Th0 = FNMS(KP290284677, TgQ, KP956940335 * TgR);
+			 }
+			 TgT = TgP + TgS;
+			 Thd = TgP - TgS;
+			 Th1 = TgZ + Th0;
+			 Th9 = Th0 - TgZ;
+		    }
+		    {
+			 E Tgc, Tgs, Tf7, Tgd;
+			 Tgc = Tfy - Tgb;
+			 Tgs = Tgo - Tgr;
+			 Tf7 = W[84];
+			 Tgd = W[85];
+			 cr[WS(rs, 43)] = FNMS(Tgd, Tgs, Tf7 * Tgc);
+			 ci[WS(rs, 43)] = FMA(Tgd, Tgc, Tf7 * Tgs);
+		    }
+		    {
+			 E Tha, The, Th7, Thb;
+			 Tha = Th8 - Th9;
+			 The = Thc - Thd;
+			 Th7 = W[100];
+			 Thb = W[101];
+			 cr[WS(rs, 51)] = FNMS(Thb, The, Th7 * Tha);
+			 ci[WS(rs, 51)] = FMA(Th7, The, Thb * Tha);
+		    }
+		    {
+			 E Thg, Thi, Thf, Thh;
+			 Thg = Th8 + Th9;
+			 Thi = Thc + Thd;
+			 Thf = W[36];
+			 Thh = W[37];
+			 cr[WS(rs, 19)] = FNMS(Thh, Thi, Thf * Thg);
+			 ci[WS(rs, 19)] = FMA(Thf, Thi, Thh * Thg);
+		    }
+		    {
+			 E Tgu, Tgw, Tgt, Tgv;
+			 Tgu = Tfy + Tgb;
+			 Tgw = Tgo + Tgr;
+			 Tgt = W[20];
+			 Tgv = W[21];
+			 cr[WS(rs, 11)] = FNMS(Tgv, Tgw, Tgt * Tgu);
+			 ci[WS(rs, 11)] = FMA(Tgv, Tgu, Tgt * Tgw);
+		    }
+		    {
+			 E TgA, TgE, Tgx, TgB;
+			 TgA = Tgy - Tgz;
+			 TgE = TgC - TgD;
+			 Tgx = W[116];
+			 TgB = W[117];
+			 cr[WS(rs, 59)] = FNMS(TgB, TgE, Tgx * TgA);
+			 ci[WS(rs, 59)] = FMA(Tgx, TgE, TgB * TgA);
+		    }
+		    {
+			 E TgU, Th2, TgJ, TgV;
+			 TgU = TgM - TgT;
+			 Th2 = TgY - Th1;
+			 TgJ = W[68];
+			 TgV = W[69];
+			 cr[WS(rs, 35)] = FNMS(TgV, Th2, TgJ * TgU);
+			 ci[WS(rs, 35)] = FMA(TgV, TgU, TgJ * Th2);
+		    }
+		    {
+			 E Th4, Th6, Th3, Th5;
+			 Th4 = TgM + TgT;
+			 Th6 = TgY + Th1;
+			 Th3 = W[4];
+			 Th5 = W[5];
+			 cr[WS(rs, 3)] = FNMS(Th5, Th6, Th3 * Th4);
+			 ci[WS(rs, 3)] = FMA(Th5, Th4, Th3 * Th6);
+		    }
+		    {
+			 E TgG, TgI, TgF, TgH;
+			 TgG = Tgy + Tgz;
+			 TgI = TgC + TgD;
+			 TgF = W[52];
+			 TgH = W[53];
+			 cr[WS(rs, 27)] = FNMS(TgH, TgI, TgF * TgG);
+			 ci[WS(rs, 27)] = FMA(TgF, TgI, TgH * TgG);
+		    }
+	       }
+	       {
+		    E TdO, Tf0, Tem, TeM, Tec, TeW, Teq, TeA, Te3, Ter, Tef, Ten, TeH, Tf1, TeP;
+		    E TeX;
+		    {
+			 E TdG, TeK, TdN, TeL, TdJ, TdM;
+			 TdG = TdE - TdF;
+			 TeK = Te6 + Te7;
+			 TdJ = FNMS(KP195090322, TdI, KP980785280 * TdH);
+			 TdM = FMA(KP195090322, TdK, KP980785280 * TdL);
+			 TdN = TdJ - TdM;
+			 TeL = TdM + TdJ;
+			 TdO = TdG - TdN;
+			 Tf0 = TeK + TeL;
+			 Tem = TdG + TdN;
+			 TeM = TeK - TeL;
+		    }
+		    {
+			 E Te8, Tey, Teb, Tez, Te9, Tea;
+			 Te8 = Te6 - Te7;
+			 Tey = TdE + TdF;
+			 Te9 = FNMS(KP195090322, TdL, KP980785280 * TdK);
+			 Tea = FMA(KP980785280, TdI, KP195090322 * TdH);
+			 Teb = Te9 - Tea;
+			 Tez = Te9 + Tea;
+			 Tec = Te8 - Teb;
+			 TeW = Tey + Tez;
+			 Teq = Te8 + Teb;
+			 TeA = Tey - Tez;
+		    }
+		    {
+			 E TdV, Tee, Te2, Ted;
+			 {
+			      E TdR, TdU, TdY, Te1;
+			      TdR = TdP - TdQ;
+			      TdU = TdS - TdT;
+			      TdV = FNMS(KP773010453, TdU, KP634393284 * TdR);
+			      Tee = FMA(KP773010453, TdR, KP634393284 * TdU);
+			      TdY = TdW - TdX;
+			      Te1 = TdZ - Te0;
+			      Te2 = FMA(KP634393284, TdY, KP773010453 * Te1);
+			      Ted = FNMS(KP773010453, TdY, KP634393284 * Te1);
+			 }
+			 Te3 = TdV - Te2;
+			 Ter = Te2 + TdV;
+			 Tef = Ted - Tee;
+			 Ten = Ted + Tee;
+		    }
+		    {
+			 E TeD, TeO, TeG, TeN;
+			 {
+			      E TeB, TeC, TeE, TeF;
+			      TeB = TdP + TdQ;
+			      TeC = TdS + TdT;
+			      TeD = FNMS(KP098017140, TeC, KP995184726 * TeB);
+			      TeO = FMA(KP098017140, TeB, KP995184726 * TeC);
+			      TeE = TdW + TdX;
+			      TeF = TdZ + Te0;
+			      TeG = FMA(KP995184726, TeE, KP098017140 * TeF);
+			      TeN = FNMS(KP098017140, TeE, KP995184726 * TeF);
+			 }
+			 TeH = TeD - TeG;
+			 Tf1 = TeG + TeD;
+			 TeP = TeN - TeO;
+			 TeX = TeN + TeO;
+		    }
+		    {
+			 E Te4, Teg, TdD, Te5;
+			 Te4 = TdO - Te3;
+			 Teg = Tec - Tef;
+			 TdD = W[112];
+			 Te5 = W[113];
+			 cr[WS(rs, 57)] = FNMS(Te5, Teg, TdD * Te4);
+			 ci[WS(rs, 57)] = FMA(Te5, Te4, TdD * Teg);
+		    }
+		    {
+			 E TeY, Tf2, TeV, TeZ;
+			 TeY = TeW - TeX;
+			 Tf2 = Tf0 - Tf1;
+			 TeV = W[64];
+			 TeZ = W[65];
+			 cr[WS(rs, 33)] = FNMS(TeZ, Tf2, TeV * TeY);
+			 ci[WS(rs, 33)] = FMA(TeV, Tf2, TeZ * TeY);
+		    }
+		    {
+			 E Tf4, Tf6, Tf3, Tf5;
+			 Tf4 = TeW + TeX;
+			 Tf6 = Tf0 + Tf1;
+			 Tf3 = W[0];
+			 Tf5 = W[1];
+			 cr[WS(rs, 1)] = FNMS(Tf5, Tf6, Tf3 * Tf4);
+			 ci[WS(rs, 1)] = FMA(Tf3, Tf6, Tf5 * Tf4);
+		    }
+		    {
+			 E Tei, Tek, Teh, Tej;
+			 Tei = TdO + Te3;
+			 Tek = Tec + Tef;
+			 Teh = W[48];
+			 Tej = W[49];
+			 cr[WS(rs, 25)] = FNMS(Tej, Tek, Teh * Tei);
+			 ci[WS(rs, 25)] = FMA(Tej, Tei, Teh * Tek);
+		    }
+		    {
+			 E Teo, Tes, Tel, Tep;
+			 Teo = Tem - Ten;
+			 Tes = Teq - Ter;
+			 Tel = W[80];
+			 Tep = W[81];
+			 cr[WS(rs, 41)] = FNMS(Tep, Tes, Tel * Teo);
+			 ci[WS(rs, 41)] = FMA(Tel, Tes, Tep * Teo);
+		    }
+		    {
+			 E TeI, TeQ, Tex, TeJ;
+			 TeI = TeA - TeH;
+			 TeQ = TeM - TeP;
+			 Tex = W[96];
+			 TeJ = W[97];
+			 cr[WS(rs, 49)] = FNMS(TeJ, TeQ, Tex * TeI);
+			 ci[WS(rs, 49)] = FMA(TeJ, TeI, Tex * TeQ);
+		    }
+		    {
+			 E TeS, TeU, TeR, TeT;
+			 TeS = TeA + TeH;
+			 TeU = TeM + TeP;
+			 TeR = W[32];
+			 TeT = W[33];
+			 cr[WS(rs, 17)] = FNMS(TeT, TeU, TeR * TeS);
+			 ci[WS(rs, 17)] = FMA(TeT, TeS, TeR * TeU);
+		    }
+		    {
+			 E Teu, Tew, Tet, Tev;
+			 Teu = Tem + Ten;
+			 Tew = Teq + Ter;
+			 Tet = W[16];
+			 Tev = W[17];
+			 cr[WS(rs, 9)] = FNMS(Tev, Tew, Tet * Teu);
+			 ci[WS(rs, 9)] = FMA(Tet, Tew, Tev * Teu);
+		    }
+	       }
+	       {
+		    E Thu, TiG, Ti2, Tis, ThS, TiC, Ti6, Tig, ThJ, Ti7, ThV, Ti3, Tin, TiH, Tiv;
+		    E TiD;
+		    {
+			 E Thm, Tiq, Tht, Tir, Thp, Ths;
+			 Thm = Thk - Thl;
+			 Tiq = ThM - ThN;
+			 Thp = FNMS(KP980785280, Tho, KP195090322 * Thn);
+			 Ths = FNMS(KP980785280, Thr, KP195090322 * Thq);
+			 Tht = Thp + Ths;
+			 Tir = Thp - Ths;
+			 Thu = Thm - Tht;
+			 TiG = Tiq - Tir;
+			 Ti2 = Thm + Tht;
+			 Tis = Tiq + Tir;
+		    }
+		    {
+			 E ThO, Tie, ThR, Tif, ThP, ThQ;
+			 ThO = ThM + ThN;
+			 Tie = Thk + Thl;
+			 ThP = FMA(KP195090322, Tho, KP980785280 * Thn);
+			 ThQ = FMA(KP195090322, Thr, KP980785280 * Thq);
+			 ThR = ThP - ThQ;
+			 Tif = ThP + ThQ;
+			 ThS = ThO - ThR;
+			 TiC = Tie + Tif;
+			 Ti6 = ThO + ThR;
+			 Tig = Tie - Tif;
+		    }
+		    {
+			 E ThB, ThU, ThI, ThT;
+			 {
+			      E Thx, ThA, ThE, ThH;
+			      Thx = Thv - Thw;
+			      ThA = Thy - Thz;
+			      ThB = FNMS(KP634393284, ThA, KP773010453 * Thx);
+			      ThU = FMA(KP634393284, Thx, KP773010453 * ThA);
+			      ThE = ThC + ThD;
+			      ThH = ThF - ThG;
+			      ThI = FMA(KP773010453, ThE, KP634393284 * ThH);
+			      ThT = FNMS(KP634393284, ThE, KP773010453 * ThH);
+			 }
+			 ThJ = ThB - ThI;
+			 Ti7 = ThI + ThB;
+			 ThV = ThT - ThU;
+			 Ti3 = ThT + ThU;
+		    }
+		    {
+			 E Tij, Tit, Tim, Tiu;
+			 {
+			      E Tih, Tii, Tik, Til;
+			      Tih = ThF + ThG;
+			      Tii = ThC - ThD;
+			      Tij = FNMS(KP995184726, Tii, KP098017140 * Tih);
+			      Tit = FMA(KP098017140, Tii, KP995184726 * Tih);
+			      Tik = Thy + Thz;
+			      Til = Thw + Thv;
+			      Tim = FNMS(KP995184726, Til, KP098017140 * Tik);
+			      Tiu = FMA(KP098017140, Til, KP995184726 * Tik);
+			 }
+			 Tin = Tij + Tim;
+			 TiH = Tij - Tim;
+			 Tiv = Tit - Tiu;
+			 TiD = Tit + Tiu;
+		    }
+		    {
+			 E ThK, ThW, Thj, ThL;
+			 ThK = Thu - ThJ;
+			 ThW = ThS - ThV;
+			 Thj = W[108];
+			 ThL = W[109];
+			 cr[WS(rs, 55)] = FNMS(ThL, ThW, Thj * ThK);
+			 ci[WS(rs, 55)] = FMA(ThL, ThK, Thj * ThW);
+		    }
+		    {
+			 E TiE, TiI, TiB, TiF;
+			 TiE = TiC - TiD;
+			 TiI = TiG + TiH;
+			 TiB = W[60];
+			 TiF = W[61];
+			 cr[WS(rs, 31)] = FNMS(TiF, TiI, TiB * TiE);
+			 ci[WS(rs, 31)] = FMA(TiB, TiI, TiF * TiE);
+		    }
+		    {
+			 E TiK, TiM, TiJ, TiL;
+			 TiK = TiC + TiD;
+			 TiM = TiG - TiH;
+			 TiJ = W[124];
+			 TiL = W[125];
+			 cr[WS(rs, 63)] = FNMS(TiL, TiM, TiJ * TiK);
+			 ci[WS(rs, 63)] = FMA(TiJ, TiM, TiL * TiK);
+		    }
+		    {
+			 E ThY, Ti0, ThX, ThZ;
+			 ThY = Thu + ThJ;
+			 Ti0 = ThS + ThV;
+			 ThX = W[44];
+			 ThZ = W[45];
+			 cr[WS(rs, 23)] = FNMS(ThZ, Ti0, ThX * ThY);
+			 ci[WS(rs, 23)] = FMA(ThZ, ThY, ThX * Ti0);
+		    }
+		    {
+			 E Ti4, Ti8, Ti1, Ti5;
+			 Ti4 = Ti2 - Ti3;
+			 Ti8 = Ti6 - Ti7;
+			 Ti1 = W[76];
+			 Ti5 = W[77];
+			 cr[WS(rs, 39)] = FNMS(Ti5, Ti8, Ti1 * Ti4);
+			 ci[WS(rs, 39)] = FMA(Ti1, Ti8, Ti5 * Ti4);
+		    }
+		    {
+			 E Tio, Tiw, Tid, Tip;
+			 Tio = Tig - Tin;
+			 Tiw = Tis - Tiv;
+			 Tid = W[92];
+			 Tip = W[93];
+			 cr[WS(rs, 47)] = FNMS(Tip, Tiw, Tid * Tio);
+			 ci[WS(rs, 47)] = FMA(Tip, Tio, Tid * Tiw);
+		    }
+		    {
+			 E Tiy, TiA, Tix, Tiz;
+			 Tiy = Tig + Tin;
+			 TiA = Tis + Tiv;
+			 Tix = W[28];
+			 Tiz = W[29];
+			 cr[WS(rs, 15)] = FNMS(Tiz, TiA, Tix * Tiy);
+			 ci[WS(rs, 15)] = FMA(Tiz, Tiy, Tix * TiA);
+		    }
+		    {
+			 E Tia, Tic, Ti9, Tib;
+			 Tia = Ti2 + Ti3;
+			 Tic = Ti6 + Ti7;
+			 Ti9 = W[12];
+			 Tib = W[13];
+			 cr[WS(rs, 7)] = FNMS(Tib, Tic, Ti9 * Tia);
+			 ci[WS(rs, 7)] = FMA(Ti9, Tic, Tib * Tia);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 64, "hb_64", twinstr, &GENUS, {808, 270, 230, 0} };
+
+void X(codelet_hb_64) (planner *p) {
+     X(khc2hc_register) (p, hb_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:12 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include hb.h */
+
+/*
+ * This function contains 72 FP additions, 66 FP multiplications,
+ * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
+ * 67 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "hb.h"
+
+static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1q, T1p, T1t, T1r, T1s, T1u;
+	       {
+		    E T1, T4, TC, T7, TB, Tt, TD, Ta, TA, T1l, TZ, T1b, Th, Tw, Td;
+		    E TP, Ti, Tj, Tl, Tm, T8, T9, T1a;
+		    T1 = cr[0];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = cr[WS(rs, 1)];
+			 T3 = ci[0];
+			 T5 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 1)];
+			 T8 = cr[WS(rs, 3)];
+			 T4 = T2 + T3;
+			 TC = T2 - T3;
+			 T7 = T5 + T6;
+			 TB = T5 - T6;
+			 T9 = ci[WS(rs, 2)];
+		    }
+		    Tt = ci[WS(rs, 6)];
+		    TD = FNMS(KP554958132, TC, TB);
+		    T1a = FNMS(KP356895867, T7, T4);
+		    Ta = T8 + T9;
+		    TA = T8 - T9;
+		    {
+			 E Tf, Tg, Tc, TO;
+			 Tf = ci[WS(rs, 3)];
+			 Tg = cr[WS(rs, 4)];
+			 T1l = FMA(KP554958132, TA, TC);
+			 TZ = FMA(KP554958132, TB, TA);
+			 Tc = FNMS(KP356895867, Ta, T7);
+			 TO = FNMS(KP356895867, T4, Ta);
+			 T1b = FNMS(KP692021471, T1a, Ta);
+			 Th = Tf + Tg;
+			 Tw = Tf - Tg;
+			 Td = FNMS(KP692021471, Tc, T4);
+			 TP = FNMS(KP692021471, TO, T7);
+		    }
+		    Ti = ci[WS(rs, 4)];
+		    Tj = cr[WS(rs, 5)];
+		    Tl = ci[WS(rs, 5)];
+		    Tm = cr[WS(rs, 6)];
+		    {
+			 E Ty, TS, TX, T1j, T1e, Tp, Tk, Tv;
+			 cr[0] = T1 + T4 + T7 + Ta;
+			 Tk = Ti + Tj;
+			 Tv = Ti - Tj;
+			 {
+			      E Tn, Tu, Tx, TR;
+			      Tn = Tl + Tm;
+			      Tu = Tl - Tm;
+			      Tx = FNMS(KP356895867, Tw, Tv);
+			      TR = FMA(KP554958132, Tk, Th);
+			      {
+				   E TW, T1i, T1d, To;
+				   TW = FNMS(KP356895867, Tu, Tw);
+				   T1i = FNMS(KP356895867, Tv, Tu);
+				   T1d = FMA(KP554958132, Th, Tn);
+				   To = FNMS(KP554958132, Tn, Tk);
+				   Ty = FNMS(KP692021471, Tx, Tu);
+				   TS = FNMS(KP801937735, TR, Tn);
+				   TX = FNMS(KP692021471, TW, Tv);
+				   T1j = FNMS(KP692021471, T1i, Tw);
+				   T1e = FMA(KP801937735, T1d, Tk);
+				   Tp = FNMS(KP801937735, To, Th);
+				   ci[0] = Tt + Tu + Tv + Tw;
+			      }
+			 }
+			 {
+			      E TL, TH, TK, TJ, TM, Te, Tz, TE;
+			      Te = FNMS(KP900968867, Td, T1);
+			      Tz = FNMS(KP900968867, Ty, Tt);
+			      TE = FNMS(KP801937735, TD, TA);
+			      {
+				   E Tb, TI, Tq, TF, Ts, Tr, TG;
+				   Tb = W[4];
+				   TI = FMA(KP974927912, Tp, Te);
+				   Tq = FNMS(KP974927912, Tp, Te);
+				   TL = FNMS(KP974927912, TE, Tz);
+				   TF = FMA(KP974927912, TE, Tz);
+				   Ts = W[5];
+				   Tr = Tb * Tq;
+				   TH = W[6];
+				   TK = W[7];
+				   TG = Ts * Tq;
+				   cr[WS(rs, 3)] = FNMS(Ts, TF, Tr);
+				   TJ = TH * TI;
+				   TM = TK * TI;
+				   ci[WS(rs, 3)] = FMA(Tb, TF, TG);
+			      }
+			      {
+				   E T14, T13, T17, T15, T16;
+				   {
+					E TY, TT, T10, TQ;
+					TQ = FNMS(KP900968867, TP, T1);
+					cr[WS(rs, 4)] = FNMS(TK, TL, TJ);
+					ci[WS(rs, 4)] = FMA(TH, TL, TM);
+					TY = FNMS(KP900968867, TX, Tt);
+					TT = FNMS(KP974927912, TS, TQ);
+					T14 = FMA(KP974927912, TS, TQ);
+					T10 = FNMS(KP801937735, TZ, TC);
+					{
+					     E TN, TV, T11, TU, T12;
+					     TN = W[2];
+					     TV = W[3];
+					     T13 = W[8];
+					     T11 = FMA(KP974927912, T10, TY);
+					     T17 = FNMS(KP974927912, T10, TY);
+					     TU = TN * TT;
+					     T12 = TV * TT;
+					     T15 = T13 * T14;
+					     T16 = W[9];
+					     cr[WS(rs, 2)] = FNMS(TV, T11, TU);
+					     ci[WS(rs, 2)] = FMA(TN, T11, T12);
+					}
+				   }
+				   {
+					E T1k, T1f, T1m, T1c, T18;
+					T1c = FNMS(KP900968867, T1b, T1);
+					cr[WS(rs, 5)] = FNMS(T16, T17, T15);
+					T18 = T16 * T14;
+					T1k = FNMS(KP900968867, T1j, Tt);
+					T1f = FNMS(KP974927912, T1e, T1c);
+					T1q = FMA(KP974927912, T1e, T1c);
+					ci[WS(rs, 5)] = FMA(T13, T17, T18);
+					T1m = FMA(KP801937735, T1l, TB);
+					{
+					     E T19, T1h, T1n, T1g, T1o;
+					     T19 = W[0];
+					     T1h = W[1];
+					     T1p = W[10];
+					     T1t = FNMS(KP974927912, T1m, T1k);
+					     T1n = FMA(KP974927912, T1m, T1k);
+					     T1g = T19 * T1f;
+					     T1o = T1h * T1f;
+					     T1r = T1p * T1q;
+					     T1s = W[11];
+					     cr[WS(rs, 1)] = FNMS(T1h, T1n, T1g);
+					     ci[WS(rs, 1)] = FMA(T19, T1n, T1o);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 6)] = FNMS(T1s, T1t, T1r);
+	       T1u = T1s * T1q;
+	       ci[WS(rs, 6)] = FMA(T1p, T1t, T1u);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, {18, 12, 54, 0} };
+
+void X(codelet_hb_7) (planner *p) {
+     X(khc2hc_register) (p, hb_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include hb.h */
+
+/*
+ * This function contains 72 FP additions, 60 FP multiplications,
+ * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
+ * 36 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "hb.h"
+
+static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1, T4, T7, Ta, Tx, TI, TV, TQ, TE, Tm, Tb, Te, Th, Tk, Tq;
+	       E TF, TR, TU, TJ, Tt;
+	       {
+		    E Tu, Tw, Tv, T2, T3;
+		    T1 = cr[0];
+		    T2 = cr[WS(rs, 1)];
+		    T3 = ci[0];
+		    T4 = T2 + T3;
+		    Tu = T2 - T3;
+		    {
+			 E T5, T6, T8, T9;
+			 T5 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 1)];
+			 T7 = T5 + T6;
+			 Tw = T5 - T6;
+			 T8 = cr[WS(rs, 3)];
+			 T9 = ci[WS(rs, 2)];
+			 Ta = T8 + T9;
+			 Tv = T8 - T9;
+		    }
+		    Tx = FMA(KP433883739, Tu, KP974927912 * Tv) - (KP781831482 * Tw);
+		    TI = FMA(KP781831482, Tu, KP974927912 * Tw) + (KP433883739 * Tv);
+		    TV = FNMS(KP781831482, Tv, KP974927912 * Tu) - (KP433883739 * Tw);
+		    TQ = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
+		    TE = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
+		    Tm = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
+	       }
+	       {
+		    E Tp, Tn, To, Tc, Td;
+		    Tb = ci[WS(rs, 6)];
+		    Tc = ci[WS(rs, 5)];
+		    Td = cr[WS(rs, 6)];
+		    Te = Tc - Td;
+		    Tp = Tc + Td;
+		    {
+			 E Tf, Tg, Ti, Tj;
+			 Tf = ci[WS(rs, 4)];
+			 Tg = cr[WS(rs, 5)];
+			 Th = Tf - Tg;
+			 Tn = Tf + Tg;
+			 Ti = ci[WS(rs, 3)];
+			 Tj = cr[WS(rs, 4)];
+			 Tk = Ti - Tj;
+			 To = Ti + Tj;
+		    }
+		    Tq = FNMS(KP974927912, To, KP781831482 * Tn) - (KP433883739 * Tp);
+		    TF = FMA(KP781831482, Tp, KP974927912 * Tn) + (KP433883739 * To);
+		    TR = FMA(KP433883739, Tn, KP781831482 * To) - (KP974927912 * Tp);
+		    TU = FMA(KP623489801, Tk, Tb) + FNMA(KP900968867, Th, KP222520933 * Te);
+		    TJ = FMA(KP623489801, Te, Tb) + FNMA(KP900968867, Tk, KP222520933 * Th);
+		    Tt = FMA(KP623489801, Th, Tb) + FNMA(KP222520933, Tk, KP900968867 * Te);
+	       }
+	       cr[0] = T1 + T4 + T7 + Ta;
+	       ci[0] = Tb + Te + Th + Tk;
+	       {
+		    E Tr, Ty, Tl, Ts;
+		    Tr = Tm - Tq;
+		    Ty = Tt - Tx;
+		    Tl = W[6];
+		    Ts = W[7];
+		    cr[WS(rs, 4)] = FNMS(Ts, Ty, Tl * Tr);
+		    ci[WS(rs, 4)] = FMA(Tl, Ty, Ts * Tr);
+	       }
+	       {
+		    E TY, T10, TX, TZ;
+		    TY = TQ + TR;
+		    T10 = TV + TU;
+		    TX = W[2];
+		    TZ = W[3];
+		    cr[WS(rs, 2)] = FNMS(TZ, T10, TX * TY);
+		    ci[WS(rs, 2)] = FMA(TX, T10, TZ * TY);
+	       }
+	       {
+		    E TA, TC, Tz, TB;
+		    TA = Tm + Tq;
+		    TC = Tx + Tt;
+		    Tz = W[4];
+		    TB = W[5];
+		    cr[WS(rs, 3)] = FNMS(TB, TC, Tz * TA);
+		    ci[WS(rs, 3)] = FMA(Tz, TC, TB * TA);
+	       }
+	       {
+		    E TM, TO, TL, TN;
+		    TM = TE + TF;
+		    TO = TJ - TI;
+		    TL = W[10];
+		    TN = W[11];
+		    cr[WS(rs, 6)] = FNMS(TN, TO, TL * TM);
+		    ci[WS(rs, 6)] = FMA(TL, TO, TN * TM);
+	       }
+	       {
+		    E TS, TW, TP, TT;
+		    TS = TQ - TR;
+		    TW = TU - TV;
+		    TP = W[8];
+		    TT = W[9];
+		    cr[WS(rs, 5)] = FNMS(TT, TW, TP * TS);
+		    ci[WS(rs, 5)] = FMA(TP, TW, TT * TS);
+	       }
+	       {
+		    E TG, TK, TD, TH;
+		    TG = TE - TF;
+		    TK = TI + TJ;
+		    TD = W[0];
+		    TH = W[1];
+		    cr[WS(rs, 1)] = FNMS(TH, TK, TD * TG);
+		    ci[WS(rs, 1)] = FMA(TD, TK, TH * TG);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, {36, 24, 36, 0} };
+
+void X(codelet_hb_7) (planner *p) {
+     X(khc2hc_register) (p, hb_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include hb.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 52 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hb.h"
+
+static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Tw, TH, Tf, Ty, Tx, TI;
+	       {
+		    E TV, TD, T1i, T7, T1b, T1n, TQ, Tk, Tb, Tl, Ta, T1d, Tt, Tc, Tm;
+		    E Tn;
+		    {
+			 E T4, Tg, T3, T19, TC, T5, Th, Ti;
+			 {
+			      E T1, T2, TA, TB;
+			      T1 = cr[0];
+			      T2 = ci[WS(rs, 3)];
+			      TA = ci[WS(rs, 7)];
+			      TB = cr[WS(rs, 4)];
+			      T4 = cr[WS(rs, 2)];
+			      Tg = T1 - T2;
+			      T3 = T1 + T2;
+			      T19 = TA - TB;
+			      TC = TA + TB;
+			      T5 = ci[WS(rs, 1)];
+			      Th = ci[WS(rs, 5)];
+			      Ti = cr[WS(rs, 6)];
+			 }
+			 {
+			      E T8, T9, Tr, Ts;
+			      T8 = cr[WS(rs, 1)];
+			      {
+				   E Tz, T6, T1a, Tj;
+				   Tz = T4 - T5;
+				   T6 = T4 + T5;
+				   T1a = Th - Ti;
+				   Tj = Th + Ti;
+				   TV = TC - Tz;
+				   TD = Tz + TC;
+				   T1i = T3 - T6;
+				   T7 = T3 + T6;
+				   T1b = T19 + T1a;
+				   T1n = T19 - T1a;
+				   TQ = Tg + Tj;
+				   Tk = Tg - Tj;
+				   T9 = ci[WS(rs, 2)];
+			      }
+			      Tr = ci[WS(rs, 4)];
+			      Ts = cr[WS(rs, 7)];
+			      Tb = ci[0];
+			      Tl = T8 - T9;
+			      Ta = T8 + T9;
+			      T1d = Tr - Ts;
+			      Tt = Tr + Ts;
+			      Tc = cr[WS(rs, 3)];
+			      Tm = ci[WS(rs, 6)];
+			      Tn = cr[WS(rs, 5)];
+			 }
+		    }
+		    {
+			 E Te, T1e, Tv, TG, T13, T1k, T1s, T10, T1p, T1v, T1u, T1w, T1t;
+			 {
+			      E TP, T1o, T1j, TR, TU, TX, TW;
+			      TP = W[4];
+			      {
+				   E Tq, Td, T1c, To;
+				   Tq = Tb - Tc;
+				   Td = Tb + Tc;
+				   T1c = Tm - Tn;
+				   To = Tm + Tn;
+				   {
+					E Tu, TF, Tp, TE;
+					Tu = Tq - Tt;
+					TF = Tq + Tt;
+					T1o = Ta - Td;
+					Te = Ta + Td;
+					T1j = T1d - T1c;
+					T1e = T1c + T1d;
+					Tp = Tl - To;
+					TE = Tl + To;
+					cr[0] = T7 + Te;
+					ci[0] = T1b + T1e;
+					TW = Tp - Tu;
+					Tv = Tp + Tu;
+					TR = TE + TF;
+					TG = TE - TF;
+				   }
+			      }
+			      TU = W[5];
+			      TX = FMA(KP707106781, TW, TV);
+			      T13 = FNMS(KP707106781, TW, TV);
+			      {
+				   E TS, TY, T1r, TT;
+				   T1k = T1i - T1j;
+				   T1s = T1i + T1j;
+				   TS = FNMS(KP707106781, TR, TQ);
+				   T10 = FMA(KP707106781, TR, TQ);
+				   T1p = T1n - T1o;
+				   T1v = T1o + T1n;
+				   TY = TP * TX;
+				   T1r = W[2];
+				   TT = TP * TS;
+				   T1u = W[3];
+				   ci[WS(rs, 3)] = FMA(TU, TS, TY);
+				   T1w = T1r * T1v;
+				   T1t = T1r * T1s;
+				   cr[WS(rs, 3)] = FNMS(TU, TX, TT);
+			      }
+			 }
+			 {
+			      E T1f, T15, T18, T17, T1g, T1h, T1m;
+			      {
+				   E TZ, T12, T16, T14, T11;
+				   ci[WS(rs, 2)] = FMA(T1u, T1s, T1w);
+				   cr[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
+				   TZ = W[12];
+				   T12 = W[13];
+				   T1f = T1b - T1e;
+				   T16 = T7 - Te;
+				   T14 = TZ * T13;
+				   T11 = TZ * T10;
+				   T15 = W[6];
+				   T18 = W[7];
+				   ci[WS(rs, 7)] = FMA(T12, T10, T14);
+				   cr[WS(rs, 7)] = FNMS(T12, T13, T11);
+				   T17 = T15 * T16;
+				   T1g = T18 * T16;
+			      }
+			      cr[WS(rs, 4)] = FNMS(T18, T1f, T17);
+			      ci[WS(rs, 4)] = FMA(T15, T1f, T1g);
+			      T1h = W[10];
+			      T1m = W[11];
+			      {
+				   E TN, TJ, TM, TL, TO, TK, T1q, T1l;
+				   Tw = FNMS(KP707106781, Tv, Tk);
+				   TK = FMA(KP707106781, Tv, Tk);
+				   T1q = T1h * T1p;
+				   T1l = T1h * T1k;
+				   TN = FMA(KP707106781, TG, TD);
+				   TH = FNMS(KP707106781, TG, TD);
+				   ci[WS(rs, 6)] = FMA(T1m, T1k, T1q);
+				   cr[WS(rs, 6)] = FNMS(T1m, T1p, T1l);
+				   TJ = W[0];
+				   TM = W[1];
+				   Tf = W[8];
+				   TL = TJ * TK;
+				   TO = TM * TK;
+				   Ty = W[9];
+				   Tx = Tf * Tw;
+				   cr[WS(rs, 1)] = FNMS(TM, TN, TL);
+				   ci[WS(rs, 1)] = FMA(TJ, TN, TO);
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 5)] = FNMS(Ty, TH, Tx);
+	       TI = Ty * Tw;
+	       ci[WS(rs, 5)] = FMA(Tf, TH, TI);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, {44, 14, 22, 0} };
+
+void X(codelet_hb_8) (planner *p) {
+     X(khc2hc_register) (p, hb_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include hb.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 30 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hb.h"
+
+static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
+	       E TD;
+	       {
+		    E T3, TK, Tn, TL, T6, TW, Tk, TX;
+		    {
+			 E T1, T2, Tl, Tm;
+			 T1 = cr[0];
+			 T2 = ci[WS(rs, 3)];
+			 T3 = T1 + T2;
+			 TK = T1 - T2;
+			 Tl = ci[WS(rs, 5)];
+			 Tm = cr[WS(rs, 6)];
+			 Tn = Tl - Tm;
+			 TL = Tl + Tm;
+		    }
+		    {
+			 E T4, T5, Ti, Tj;
+			 T4 = cr[WS(rs, 2)];
+			 T5 = ci[WS(rs, 1)];
+			 T6 = T4 + T5;
+			 TW = T4 - T5;
+			 Ti = ci[WS(rs, 7)];
+			 Tj = cr[WS(rs, 4)];
+			 Tk = Ti - Tj;
+			 TX = Ti + Tj;
+		    }
+		    T7 = T3 + T6;
+		    T18 = TK + TL;
+		    T1c = TX - TW;
+		    To = Tk + Tn;
+		    Ty = T3 - T6;
+		    TM = TK - TL;
+		    TY = TW + TX;
+		    TC = Tk - Tn;
+	       }
+	       {
+		    E Ta, TN, Tu, TR, Td, TQ, Tr, TO;
+		    {
+			 E T8, T9, Ts, Tt;
+			 T8 = cr[WS(rs, 1)];
+			 T9 = ci[WS(rs, 2)];
+			 Ta = T8 + T9;
+			 TN = T8 - T9;
+			 Ts = ci[WS(rs, 4)];
+			 Tt = cr[WS(rs, 7)];
+			 Tu = Ts - Tt;
+			 TR = Ts + Tt;
+		    }
+		    {
+			 E Tb, Tc, Tp, Tq;
+			 Tb = ci[0];
+			 Tc = cr[WS(rs, 3)];
+			 Td = Tb + Tc;
+			 TQ = Tb - Tc;
+			 Tp = ci[WS(rs, 6)];
+			 Tq = cr[WS(rs, 5)];
+			 Tr = Tp - Tq;
+			 TO = Tp + Tq;
+		    }
+		    Te = Ta + Td;
+		    TZ = TN + TO;
+		    T10 = TQ + TR;
+		    Tv = Tr + Tu;
+		    Tz = Tu - Tr;
+		    TP = TN - TO;
+		    TS = TQ - TR;
+		    TD = Ta - Td;
+	       }
+	       cr[0] = T7 + Te;
+	       ci[0] = To + Tv;
+	       {
+		    E Tg, Tw, Tf, Th;
+		    Tg = T7 - Te;
+		    Tw = To - Tv;
+		    Tf = W[6];
+		    Th = W[7];
+		    cr[WS(rs, 4)] = FNMS(Th, Tw, Tf * Tg);
+		    ci[WS(rs, 4)] = FMA(Th, Tg, Tf * Tw);
+	       }
+	       {
+		    E TG, TI, TF, TH;
+		    TG = Ty + Tz;
+		    TI = TD + TC;
+		    TF = W[2];
+		    TH = W[3];
+		    cr[WS(rs, 2)] = FNMS(TH, TI, TF * TG);
+		    ci[WS(rs, 2)] = FMA(TF, TI, TH * TG);
+	       }
+	       {
+		    E TA, TE, Tx, TB;
+		    TA = Ty - Tz;
+		    TE = TC - TD;
+		    Tx = W[10];
+		    TB = W[11];
+		    cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
+		    ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
+	       }
+	       {
+		    E T1a, T1g, T1e, T1i, T19, T1d;
+		    T19 = KP707106781 * (TZ + T10);
+		    T1a = T18 - T19;
+		    T1g = T18 + T19;
+		    T1d = KP707106781 * (TP - TS);
+		    T1e = T1c + T1d;
+		    T1i = T1c - T1d;
+		    {
+			 E T17, T1b, T1f, T1h;
+			 T17 = W[4];
+			 T1b = W[5];
+			 cr[WS(rs, 3)] = FNMS(T1b, T1e, T17 * T1a);
+			 ci[WS(rs, 3)] = FMA(T17, T1e, T1b * T1a);
+			 T1f = W[12];
+			 T1h = W[13];
+			 cr[WS(rs, 7)] = FNMS(T1h, T1i, T1f * T1g);
+			 ci[WS(rs, 7)] = FMA(T1f, T1i, T1h * T1g);
+		    }
+	       }
+	       {
+		    E TU, T14, T12, T16, TT, T11;
+		    TT = KP707106781 * (TP + TS);
+		    TU = TM - TT;
+		    T14 = TM + TT;
+		    T11 = KP707106781 * (TZ - T10);
+		    T12 = TY - T11;
+		    T16 = TY + T11;
+		    {
+			 E TJ, TV, T13, T15;
+			 TJ = W[8];
+			 TV = W[9];
+			 cr[WS(rs, 5)] = FNMS(TV, T12, TJ * TU);
+			 ci[WS(rs, 5)] = FMA(TV, TU, TJ * T12);
+			 T13 = W[0];
+			 T15 = W[1];
+			 cr[WS(rs, 1)] = FNMS(T15, T16, T13 * T14);
+			 ci[WS(rs, 1)] = FMA(T15, T14, T13 * T16);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, {52, 18, 14, 0} };
+
+void X(codelet_hb_8) (planner *p) {
+     X(khc2hc_register) (p, hb_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hb_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hb_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include hb.h */
+
+/*
+ * This function contains 96 FP additions, 88 FP multiplications,
+ * (or, 24 additions, 16 multiplications, 72 fused multiply/add),
+ * 69 stack variables, 10 constants, and 36 memory accesses
+ */
+#include "hb.h"
+
+static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP954188894, +0.954188894138671133499268364187245676532219158);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP492403876, +0.492403876506104029683371512294761506835321626);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP777861913, +0.777861913430206160028177977318626690410586096);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP363970234, +0.363970234266202361351047882776834043890471784);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T1X, T1S, T1U, T1P, T1Y, T1T;
+	       {
+		    E T5, Tl, TQ, T1y, T1b, T1J, Tg, TE, TW, T13, T10, Tz, Tw, TT, T1K;
+		    E T1B, T1L, T1E;
+		    {
+			 E T1, Th, T2, T3, Ti, Tj;
+			 T1 = cr[0];
+			 Th = ci[WS(rs, 8)];
+			 T2 = cr[WS(rs, 3)];
+			 T3 = ci[WS(rs, 2)];
+			 Ti = ci[WS(rs, 5)];
+			 Tj = cr[WS(rs, 6)];
+			 {
+			      E T12, Tb, TZ, TY, Ta, Tq, T11, Tr, Ts, TS, Te, Tt;
+			      {
+				   E T6, Tm, Tn, To, T9, Tc, Td, Tp;
+				   {
+					E T7, T8, T1a, T4;
+					T6 = cr[WS(rs, 1)];
+					T1a = T2 - T3;
+					T4 = T2 + T3;
+					{
+					     E TP, Tk, TO, T19;
+					     TP = Ti + Tj;
+					     Tk = Ti - Tj;
+					     T7 = cr[WS(rs, 4)];
+					     T5 = T1 + T4;
+					     TO = FNMS(KP500000000, T4, T1);
+					     Tl = Th + Tk;
+					     T19 = FNMS(KP500000000, Tk, Th);
+					     TQ = FNMS(KP866025403, TP, TO);
+					     T1y = FMA(KP866025403, TP, TO);
+					     T1b = FMA(KP866025403, T1a, T19);
+					     T1J = FNMS(KP866025403, T1a, T19);
+					     T8 = ci[WS(rs, 1)];
+					}
+					Tm = ci[WS(rs, 7)];
+					Tn = ci[WS(rs, 4)];
+					To = cr[WS(rs, 7)];
+					T9 = T7 + T8;
+					T12 = T7 - T8;
+				   }
+				   Tb = cr[WS(rs, 2)];
+				   TZ = Tn + To;
+				   Tp = Tn - To;
+				   TY = FNMS(KP500000000, T9, T6);
+				   Ta = T6 + T9;
+				   Tc = ci[WS(rs, 3)];
+				   Td = ci[0];
+				   Tq = Tm + Tp;
+				   T11 = FMS(KP500000000, Tp, Tm);
+				   Tr = ci[WS(rs, 6)];
+				   Ts = cr[WS(rs, 5)];
+				   TS = Td - Tc;
+				   Te = Tc + Td;
+				   Tt = cr[WS(rs, 8)];
+			      }
+			      {
+				   E T1C, Tv, TR, T1D, T1z, T1A;
+				   {
+					E TU, Tu, TV, Tf;
+					TU = FNMS(KP500000000, Te, Tb);
+					Tf = Tb + Te;
+					Tu = Ts + Tt;
+					TV = Ts - Tt;
+					Tg = Ta + Tf;
+					TE = Ta - Tf;
+					TW = FMA(KP866025403, TV, TU);
+					T1C = FNMS(KP866025403, TV, TU);
+					Tv = Tr - Tu;
+					TR = FMA(KP500000000, Tu, Tr);
+				   }
+				   T1z = FMA(KP866025403, T12, T11);
+				   T13 = FNMS(KP866025403, T12, T11);
+				   T10 = FNMS(KP866025403, TZ, TY);
+				   T1A = FMA(KP866025403, TZ, TY);
+				   Tz = Tv - Tq;
+				   Tw = Tq + Tv;
+				   T1D = FMA(KP866025403, TS, TR);
+				   TT = FNMS(KP866025403, TS, TR);
+				   T1K = FNMS(KP176326980, T1z, T1A);
+				   T1B = FMA(KP176326980, T1A, T1z);
+				   T1L = FNMS(KP363970234, T1C, T1D);
+				   T1E = FMA(KP363970234, T1D, T1C);
+			      }
+			 }
+		    }
+		    {
+			 E T1d, T14, T1c, TX;
+			 cr[0] = T5 + Tg;
+			 T1d = FNMS(KP839099631, T10, T13);
+			 T14 = FMA(KP839099631, T13, T10);
+			 T1c = FMA(KP176326980, TT, TW);
+			 TX = FNMS(KP176326980, TW, TT);
+			 ci[0] = Tl + Tw;
+			 {
+			      E TL, TK, TJ, Ty, TD;
+			      Ty = FNMS(KP500000000, Tg, T5);
+			      TD = FNMS(KP500000000, Tw, Tl);
+			      {
+				   E Tx, TC, TA, TI, TF;
+				   Tx = W[10];
+				   TC = W[11];
+				   TA = FNMS(KP866025403, Tz, Ty);
+				   TI = FMA(KP866025403, Tz, Ty);
+				   TF = FNMS(KP866025403, TE, TD);
+				   TL = FMA(KP866025403, TE, TD);
+				   {
+					E TH, TB, TG, TM;
+					TH = W[4];
+					TB = Tx * TA;
+					TK = W[5];
+					TG = Tx * TF;
+					TM = TH * TL;
+					TJ = TH * TI;
+					cr[WS(rs, 6)] = FNMS(TC, TF, TB);
+					ci[WS(rs, 6)] = FMA(TC, TA, TG);
+					ci[WS(rs, 3)] = FMA(TK, TI, TM);
+				   }
+			      }
+			      cr[WS(rs, 3)] = FNMS(TK, TL, TJ);
+			      {
+				   E T1k, T1p, T1l, T1q, T1m;
+				   {
+					E T1e, T1j, T15, T1o;
+					T1e = FNMS(KP777861913, T1d, T1c);
+					T1j = FMA(KP777861913, T1d, T1c);
+					T15 = FNMS(KP777861913, T14, TX);
+					T1o = FMA(KP777861913, T14, TX);
+					{
+					     E TN, T16, T1f, T17, T1s, T1v, T18, T1i, T1n, T1r, T1u;
+					     TN = W[0];
+					     T16 = FNMS(KP984807753, T15, TQ);
+					     T1i = FMA(KP492403876, T15, TQ);
+					     T1f = FMA(KP984807753, T1e, T1b);
+					     T1n = FNMS(KP492403876, T1e, T1b);
+					     T17 = TN * T16;
+					     T1s = FMA(KP852868531, T1j, T1i);
+					     T1k = FNMS(KP852868531, T1j, T1i);
+					     T1v = FMA(KP852868531, T1o, T1n);
+					     T1p = FNMS(KP852868531, T1o, T1n);
+					     T18 = W[1];
+					     T1r = W[6];
+					     T1u = W[7];
+					     {
+						  E T1h, T1g, T1w, T1t;
+						  T1h = W[12];
+						  cr[WS(rs, 1)] = FNMS(T18, T1f, T17);
+						  T1g = T18 * T16;
+						  T1w = T1r * T1v;
+						  T1t = T1r * T1s;
+						  T1l = T1h * T1k;
+						  ci[WS(rs, 1)] = FMA(TN, T1f, T1g);
+						  ci[WS(rs, 4)] = FMA(T1u, T1s, T1w);
+						  cr[WS(rs, 4)] = FNMS(T1u, T1v, T1t);
+						  T1q = T1h * T1p;
+					     }
+					     T1m = W[13];
+					}
+				   }
+				   {
+					E T1F, T1W, T1R, T1V, T1N, T1M, T1x, T1I;
+					T1F = FNMS(KP954188894, T1E, T1B);
+					T1W = FMA(KP954188894, T1E, T1B);
+					T1M = FNMS(KP954188894, T1L, T1K);
+					T1R = FMA(KP954188894, T1L, T1K);
+					ci[WS(rs, 7)] = FMA(T1m, T1k, T1q);
+					cr[WS(rs, 7)] = FNMS(T1m, T1p, T1l);
+					T1V = FNMS(KP492403876, T1M, T1J);
+					T1N = FMA(KP984807753, T1M, T1J);
+					T1x = W[2];
+					T1I = W[3];
+					{
+					     E T23, T22, T20, T1Z, T24, T21;
+					     T1X = FMA(KP852868531, T1W, T1V);
+					     T23 = FNMS(KP852868531, T1W, T1V);
+					     {
+						  E T1G, T1Q, T1O, T1H;
+						  T1G = FMA(KP984807753, T1F, T1y);
+						  T1Q = FNMS(KP492403876, T1F, T1y);
+						  T1O = T1x * T1N;
+						  T22 = W[15];
+						  T1H = T1x * T1G;
+						  T20 = FMA(KP852868531, T1R, T1Q);
+						  T1S = FNMS(KP852868531, T1R, T1Q);
+						  ci[WS(rs, 2)] = FMA(T1I, T1G, T1O);
+						  cr[WS(rs, 2)] = FNMS(T1I, T1N, T1H);
+						  T1Z = W[14];
+						  T24 = T22 * T20;
+					     }
+					     T1U = W[9];
+					     T21 = T1Z * T20;
+					     ci[WS(rs, 8)] = FMA(T1Z, T23, T24);
+					     T1P = W[8];
+					     T1Y = T1U * T1S;
+					     cr[WS(rs, 8)] = FNMS(T22, T23, T21);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T1T = T1P * T1S;
+	       ci[WS(rs, 5)] = FMA(T1P, T1X, T1Y);
+	       cr[WS(rs, 5)] = FNMS(T1U, T1X, T1T);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, {24, 16, 72, 0} };
+
+void X(codelet_hb_9) (planner *p) {
+     X(khc2hc_register) (p, hb_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include hb.h */
+
+/*
+ * This function contains 96 FP additions, 72 FP multiplications,
+ * (or, 60 additions, 36 multiplications, 36 fused multiply/add),
+ * 53 stack variables, 8 constants, and 36 memory accesses
+ */
+#include "hb.h"
+
+static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T5, Tl, TM, T1o, T16, T1y, Ta, Tf, Tg, Tq, Tv, Tw, TT, T17, T1u;
+	       E T1A, T1r, T1z, T10, T18;
+	       {
+		    E T1, Th, T4, T14, Tk, TL, TK, T15;
+		    T1 = cr[0];
+		    Th = ci[WS(rs, 8)];
+		    {
+			 E T2, T3, Ti, Tj;
+			 T2 = cr[WS(rs, 3)];
+			 T3 = ci[WS(rs, 2)];
+			 T4 = T2 + T3;
+			 T14 = KP866025403 * (T2 - T3);
+			 Ti = ci[WS(rs, 5)];
+			 Tj = cr[WS(rs, 6)];
+			 Tk = Ti - Tj;
+			 TL = KP866025403 * (Ti + Tj);
+		    }
+		    T5 = T1 + T4;
+		    Tl = Th + Tk;
+		    TK = FNMS(KP500000000, T4, T1);
+		    TM = TK - TL;
+		    T1o = TK + TL;
+		    T15 = FNMS(KP500000000, Tk, Th);
+		    T16 = T14 + T15;
+		    T1y = T15 - T14;
+	       }
+	       {
+		    E T6, T9, TN, TQ, Tm, Tp, TO, TR, Tb, Te, TU, TX, Tr, Tu, TV;
+		    E TY;
+		    {
+			 E T7, T8, Tn, To;
+			 T6 = cr[WS(rs, 1)];
+			 T7 = cr[WS(rs, 4)];
+			 T8 = ci[WS(rs, 1)];
+			 T9 = T7 + T8;
+			 TN = FNMS(KP500000000, T9, T6);
+			 TQ = KP866025403 * (T7 - T8);
+			 Tm = ci[WS(rs, 7)];
+			 Tn = ci[WS(rs, 4)];
+			 To = cr[WS(rs, 7)];
+			 Tp = Tn - To;
+			 TO = KP866025403 * (Tn + To);
+			 TR = FNMS(KP500000000, Tp, Tm);
+		    }
+		    {
+			 E Tc, Td, Ts, Tt;
+			 Tb = cr[WS(rs, 2)];
+			 Tc = ci[WS(rs, 3)];
+			 Td = ci[0];
+			 Te = Tc + Td;
+			 TU = FNMS(KP500000000, Te, Tb);
+			 TX = KP866025403 * (Tc - Td);
+			 Tr = ci[WS(rs, 6)];
+			 Ts = cr[WS(rs, 5)];
+			 Tt = cr[WS(rs, 8)];
+			 Tu = Ts + Tt;
+			 TV = KP866025403 * (Ts - Tt);
+			 TY = FMA(KP500000000, Tu, Tr);
+		    }
+		    {
+			 E TP, TS, T1s, T1t;
+			 Ta = T6 + T9;
+			 Tf = Tb + Te;
+			 Tg = Ta + Tf;
+			 Tq = Tm + Tp;
+			 Tv = Tr - Tu;
+			 Tw = Tq + Tv;
+			 TP = TN - TO;
+			 TS = TQ + TR;
+			 TT = FNMS(KP642787609, TS, KP766044443 * TP);
+			 T17 = FMA(KP766044443, TS, KP642787609 * TP);
+			 T1s = TU - TV;
+			 T1t = TY - TX;
+			 T1u = FMA(KP939692620, T1s, KP342020143 * T1t);
+			 T1A = FNMS(KP939692620, T1t, KP342020143 * T1s);
+			 {
+			      E T1p, T1q, TW, TZ;
+			      T1p = TN + TO;
+			      T1q = TR - TQ;
+			      T1r = FNMS(KP984807753, T1q, KP173648177 * T1p);
+			      T1z = FMA(KP173648177, T1q, KP984807753 * T1p);
+			      TW = TU + TV;
+			      TZ = TX + TY;
+			      T10 = FNMS(KP984807753, TZ, KP173648177 * TW);
+			      T18 = FMA(KP984807753, TW, KP173648177 * TZ);
+			 }
+		    }
+	       }
+	       cr[0] = T5 + Tg;
+	       ci[0] = Tl + Tw;
+	       {
+		    E TA, TG, TE, TI;
+		    {
+			 E Ty, Tz, TC, TD;
+			 Ty = FNMS(KP500000000, Tg, T5);
+			 Tz = KP866025403 * (Tv - Tq);
+			 TA = Ty - Tz;
+			 TG = Ty + Tz;
+			 TC = FNMS(KP500000000, Tw, Tl);
+			 TD = KP866025403 * (Ta - Tf);
+			 TE = TC - TD;
+			 TI = TD + TC;
+		    }
+		    {
+			 E Tx, TB, TF, TH;
+			 Tx = W[10];
+			 TB = W[11];
+			 cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
+			 ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
+			 TF = W[4];
+			 TH = W[5];
+			 cr[WS(rs, 3)] = FNMS(TH, TI, TF * TG);
+			 ci[WS(rs, 3)] = FMA(TF, TI, TH * TG);
+		    }
+	       }
+	       {
+		    E T1d, T1h, T12, T1c, T1a, T1g, T11, T19, TJ, T13;
+		    T1d = KP866025403 * (T18 - T17);
+		    T1h = KP866025403 * (TT - T10);
+		    T11 = TT + T10;
+		    T12 = TM + T11;
+		    T1c = FNMS(KP500000000, T11, TM);
+		    T19 = T17 + T18;
+		    T1a = T16 + T19;
+		    T1g = FNMS(KP500000000, T19, T16);
+		    TJ = W[0];
+		    T13 = W[1];
+		    cr[WS(rs, 1)] = FNMS(T13, T1a, TJ * T12);
+		    ci[WS(rs, 1)] = FMA(T13, T12, TJ * T1a);
+		    {
+			 E T1k, T1m, T1j, T1l;
+			 T1k = T1c + T1d;
+			 T1m = T1h + T1g;
+			 T1j = W[6];
+			 T1l = W[7];
+			 cr[WS(rs, 4)] = FNMS(T1l, T1m, T1j * T1k);
+			 ci[WS(rs, 4)] = FMA(T1j, T1m, T1l * T1k);
+		    }
+		    {
+			 E T1e, T1i, T1b, T1f;
+			 T1e = T1c - T1d;
+			 T1i = T1g - T1h;
+			 T1b = W[12];
+			 T1f = W[13];
+			 cr[WS(rs, 7)] = FNMS(T1f, T1i, T1b * T1e);
+			 ci[WS(rs, 7)] = FMA(T1b, T1i, T1f * T1e);
+		    }
+	       }
+	       {
+		    E T1F, T1J, T1w, T1E, T1C, T1I, T1v, T1B, T1n, T1x;
+		    T1F = KP866025403 * (T1A - T1z);
+		    T1J = KP866025403 * (T1r + T1u);
+		    T1v = T1r - T1u;
+		    T1w = T1o + T1v;
+		    T1E = FNMS(KP500000000, T1v, T1o);
+		    T1B = T1z + T1A;
+		    T1C = T1y + T1B;
+		    T1I = FNMS(KP500000000, T1B, T1y);
+		    T1n = W[2];
+		    T1x = W[3];
+		    cr[WS(rs, 2)] = FNMS(T1x, T1C, T1n * T1w);
+		    ci[WS(rs, 2)] = FMA(T1n, T1C, T1x * T1w);
+		    {
+			 E T1M, T1O, T1L, T1N;
+			 T1M = T1F + T1E;
+			 T1O = T1I + T1J;
+			 T1L = W[8];
+			 T1N = W[9];
+			 cr[WS(rs, 5)] = FNMS(T1N, T1O, T1L * T1M);
+			 ci[WS(rs, 5)] = FMA(T1N, T1M, T1L * T1O);
+		    }
+		    {
+			 E T1G, T1K, T1D, T1H;
+			 T1G = T1E - T1F;
+			 T1K = T1I - T1J;
+			 T1D = W[14];
+			 T1H = W[15];
+			 cr[WS(rs, 8)] = FNMS(T1H, T1K, T1D * T1G);
+			 ci[WS(rs, 8)] = FMA(T1H, T1G, T1D * T1K);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, {60, 36, 36, 0} };
+
+void X(codelet_hb_9) (planner *p) {
+     X(khc2hc_register) (p, hb_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:57 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include hc2cb.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 112 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E Tv, TB, TF, Ty, T1J, T1O, T1N, T1K;
+	       {
+		    E Tw, T2z, T2C, Tx, T3f, T3l, T2F, T3r, Tz;
+		    Tv = W[0];
+		    Tw = W[2];
+		    T2z = W[6];
+		    T2C = W[7];
+		    TB = W[4];
+		    Tx = Tv * Tw;
+		    T3f = Tv * T2z;
+		    T3l = Tv * T2C;
+		    T2F = Tv * TB;
+		    T3r = Tw * TB;
+		    TF = W[5];
+		    Ty = W[1];
+		    Tz = W[3];
+		    {
+			 E T2G, T3z, T3m, T3g, T3L, T3s, T1V, TA, T3w, T3Q, T30, T3C, TE, T1X, T1D;
+			 E TG, T1G, T1o, T2p, T1Y, T2u, T2c, T1Z, TL, T1t, T2d, T3n, T35, T3R, T3F;
+			 E T1w, T20, T3M, Tf, T3h, T2L, T2e, TW, T2Q, T36, T3I, T3N, T2V, T37, T1d;
+			 E Tu, T3S, T18, T1z, T1i, T24, T2g, T27, T2h;
+			 {
+			      E T2K, TQ, TV, T2H;
+			      {
+				   E TH, T3, T32, T1s, T1p, T6, T33, TK, TM, Ta, TS, T2J, TP, TR, Td;
+				   E TT, TI, TJ;
+				   {
+					E T1q, T1r, T4, T5;
+					{
+					     E T1, T1n, TC, T2b, T1W, T2, T3v, T2Z, TD;
+					     T1 = Rp[0];
+					     T3v = Tw * TF;
+					     T2Z = Tv * TF;
+					     T2G = FNMS(Ty, TF, T2F);
+					     T3z = FMA(Ty, TF, T2F);
+					     T3m = FNMS(Ty, T2z, T3l);
+					     T3g = FMA(Ty, T2C, T3f);
+					     T3L = FNMS(Tz, TF, T3r);
+					     T3s = FMA(Tz, TF, T3r);
+					     T1V = FMA(Ty, Tz, Tx);
+					     TA = FNMS(Ty, Tz, Tx);
+					     TD = Tv * Tz;
+					     T3w = FNMS(Tz, TB, T3v);
+					     T3Q = FMA(Tz, TB, T3v);
+					     T30 = FMA(Ty, TB, T2Z);
+					     T3C = FNMS(Ty, TB, T2Z);
+					     T1n = TA * TF;
+					     TC = TA * TB;
+					     T2b = T1V * TF;
+					     T1W = T1V * TB;
+					     TE = FMA(Ty, Tw, TD);
+					     T1X = FNMS(Ty, Tw, TD);
+					     T2 = Rm[WS(rs, 7)];
+					     T1q = Ip[0];
+					     T1D = FMA(TE, TF, TC);
+					     TG = FNMS(TE, TF, TC);
+					     T1G = FNMS(TE, TB, T1n);
+					     T1o = FMA(TE, TB, T1n);
+					     T2p = FMA(T1X, TF, T1W);
+					     T1Y = FNMS(T1X, TF, T1W);
+					     T2u = FNMS(T1X, TB, T2b);
+					     T2c = FMA(T1X, TB, T2b);
+					     TH = T1 - T2;
+					     T3 = T1 + T2;
+					     T1r = Im[WS(rs, 7)];
+					}
+					T4 = Rp[WS(rs, 4)];
+					T5 = Rm[WS(rs, 3)];
+					TI = Ip[WS(rs, 4)];
+					T32 = T1q - T1r;
+					T1s = T1q + T1r;
+					T1p = T4 - T5;
+					T6 = T4 + T5;
+					TJ = Im[WS(rs, 3)];
+				   }
+				   {
+					E TN, TO, T8, T9, Tb, Tc;
+					T8 = Rp[WS(rs, 2)];
+					T9 = Rm[WS(rs, 5)];
+					TN = Ip[WS(rs, 2)];
+					T33 = TI - TJ;
+					TK = TI + TJ;
+					TM = T8 - T9;
+					Ta = T8 + T9;
+					TO = Im[WS(rs, 5)];
+					Tb = Rm[WS(rs, 1)];
+					Tc = Rp[WS(rs, 6)];
+					TS = Ip[WS(rs, 6)];
+					T2J = TN - TO;
+					TP = TN + TO;
+					TR = Tb - Tc;
+					Td = Tb + Tc;
+					TT = Im[WS(rs, 1)];
+				   }
+				   {
+					E T2I, TU, Te, T31, T34, T3D;
+					T1Z = TH + TK;
+					TL = TH - TK;
+					T1t = T1p + T1s;
+					T2d = T1s - T1p;
+					T2I = TS - TT;
+					TU = TS + TT;
+					Te = Ta + Td;
+					T31 = Ta - Td;
+					T34 = T32 - T33;
+					T3D = T32 + T33;
+					{
+					     E T1u, T1v, T3E, T7;
+					     T3E = T2J + T2I;
+					     T2K = T2I - T2J;
+					     TQ = TM - TP;
+					     T1u = TM + TP;
+					     T3n = T34 - T31;
+					     T35 = T31 + T34;
+					     TV = TR - TU;
+					     T1v = TR + TU;
+					     T3R = T3D - T3E;
+					     T3F = T3D + T3E;
+					     T2H = T3 - T6;
+					     T7 = T3 + T6;
+					     T1w = T1u - T1v;
+					     T20 = T1u + T1v;
+					     T3M = T7 - Te;
+					     Tf = T7 + Te;
+					}
+				   }
+			      }
+			      {
+				   E T1e, Ti, T2N, T1c, T19, Tl, T2O, T1h, Tq, T13, Tp, T2S, T11, Tr, T14;
+				   E T15;
+				   {
+					E Tj, Tk, T1f, T1g;
+					{
+					     E Tg, Th, T1a, T1b;
+					     Tg = Rp[WS(rs, 1)];
+					     T3h = T2H - T2K;
+					     T2L = T2H + T2K;
+					     T2e = TQ - TV;
+					     TW = TQ + TV;
+					     Th = Rm[WS(rs, 6)];
+					     T1a = Ip[WS(rs, 1)];
+					     T1b = Im[WS(rs, 6)];
+					     Tj = Rp[WS(rs, 5)];
+					     T1e = Tg - Th;
+					     Ti = Tg + Th;
+					     T2N = T1a - T1b;
+					     T1c = T1a + T1b;
+					     Tk = Rm[WS(rs, 2)];
+					     T1f = Ip[WS(rs, 5)];
+					     T1g = Im[WS(rs, 2)];
+					}
+					{
+					     E Tn, To, TZ, T10;
+					     Tn = Rm[0];
+					     T19 = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T2O = T1f - T1g;
+					     T1h = T1f + T1g;
+					     To = Rp[WS(rs, 7)];
+					     TZ = Ip[WS(rs, 7)];
+					     T10 = Im[0];
+					     Tq = Rp[WS(rs, 3)];
+					     T13 = Tn - To;
+					     Tp = Tn + To;
+					     T2S = TZ - T10;
+					     T11 = TZ + T10;
+					     Tr = Rm[WS(rs, 4)];
+					     T14 = Ip[WS(rs, 3)];
+					     T15 = Im[WS(rs, 4)];
+					}
+				   }
+				   {
+					E TY, T16, Tm, Tt;
+					{
+					     E T2P, T3G, Ts, T2M, T3H, T2U, T2T, T2R;
+					     T2P = T2N - T2O;
+					     T3G = T2N + T2O;
+					     TY = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T2T = T14 - T15;
+					     T16 = T14 + T15;
+					     T2M = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T3H = T2S + T2T;
+					     T2U = T2S - T2T;
+					     Tt = Tp + Ts;
+					     T2R = Tp - Ts;
+					     T2Q = T2M - T2P;
+					     T36 = T2M + T2P;
+					     T3I = T3G + T3H;
+					     T3N = T3H - T3G;
+					     T2V = T2R + T2U;
+					     T37 = T2U - T2R;
+					}
+					{
+					     E T25, T26, T22, T23, T12, T17;
+					     T12 = TY - T11;
+					     T25 = TY + T11;
+					     T26 = T13 + T16;
+					     T17 = T13 - T16;
+					     T22 = T1c - T19;
+					     T1d = T19 + T1c;
+					     Tu = Tm + Tt;
+					     T3S = Tm - Tt;
+					     T18 = FNMS(KP414213562, T17, T12);
+					     T1z = FMA(KP414213562, T12, T17);
+					     T1i = T1e - T1h;
+					     T23 = T1e + T1h;
+					     T24 = FNMS(KP414213562, T23, T22);
+					     T2g = FMA(KP414213562, T22, T23);
+					     T27 = FNMS(KP414213562, T26, T25);
+					     T2h = FMA(KP414213562, T25, T26);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T1j, T1y, T3V, T3X, T3W, T38, T3i, T3o, T2W, T3K, T3B, T3A;
+			      Rp[0] = Tf + Tu;
+			      T3A = Tf - Tu;
+			      T1j = FMA(KP414213562, T1i, T1d);
+			      T1y = FNMS(KP414213562, T1d, T1i);
+			      T3K = T3C * T3A;
+			      T3B = T3z * T3A;
+			      {
+				   E T3O, T3T, T3J, T3P, T3U;
+				   T3O = T3M - T3N;
+				   T3V = T3M + T3N;
+				   T3X = T3S + T3R;
+				   T3T = T3R - T3S;
+				   Rm[0] = T3F + T3I;
+				   T3J = T3F - T3I;
+				   T3P = T3L * T3O;
+				   T3U = T3L * T3T;
+				   T3W = TA * T3V;
+				   Rp[WS(rs, 4)] = FNMS(T3C, T3J, T3B);
+				   Rm[WS(rs, 4)] = FMA(T3z, T3J, T3K);
+				   Rp[WS(rs, 6)] = FNMS(T3Q, T3T, T3P);
+				   Rm[WS(rs, 6)] = FMA(T3Q, T3O, T3U);
+				   T38 = T36 + T37;
+				   T3i = T37 - T36;
+				   T3o = T2Q - T2V;
+				   T2W = T2Q + T2V;
+			      }
+			      {
+				   E T2q, T21, T28, T2w, T2v, T2f, T2i, T2r;
+				   {
+					E T2Y, T3a, T3c, T3d, T39, T3e, T3b, T2X, T3Y;
+					Rp[WS(rs, 2)] = FNMS(TE, T3X, T3W);
+					T3Y = TA * T3X;
+					{
+					     E T3t, T3j, T3x, T3p;
+					     T3t = FMA(KP707106781, T3i, T3h);
+					     T3j = FNMS(KP707106781, T3i, T3h);
+					     T3x = FMA(KP707106781, T3o, T3n);
+					     T3p = FNMS(KP707106781, T3o, T3n);
+					     Rm[WS(rs, 2)] = FMA(TE, T3V, T3Y);
+					     {
+						  E T3u, T3k, T3y, T3q;
+						  T3u = T3s * T3t;
+						  T3k = T3g * T3j;
+						  T3y = T3s * T3x;
+						  T3q = T3g * T3p;
+						  Rp[WS(rs, 3)] = FNMS(T3w, T3x, T3u);
+						  Rp[WS(rs, 7)] = FNMS(T3m, T3p, T3k);
+						  Rm[WS(rs, 3)] = FMA(T3w, T3t, T3y);
+						  Rm[WS(rs, 7)] = FMA(T3m, T3j, T3q);
+						  T3b = FMA(KP707106781, T2W, T2L);
+						  T2X = FNMS(KP707106781, T2W, T2L);
+					     }
+					}
+					T2Y = T2G * T2X;
+					T3a = T30 * T2X;
+					T3c = T1V * T3b;
+					T3d = FMA(KP707106781, T38, T35);
+					T39 = FNMS(KP707106781, T38, T35);
+					T3e = T1X * T3b;
+					T2q = FMA(KP707106781, T20, T1Z);
+					T21 = FNMS(KP707106781, T20, T1Z);
+					Rp[WS(rs, 1)] = FNMS(T1X, T3d, T3c);
+					Rm[WS(rs, 5)] = FMA(T2G, T39, T3a);
+					Rp[WS(rs, 5)] = FNMS(T30, T39, T2Y);
+					Rm[WS(rs, 1)] = FMA(T1V, T3d, T3e);
+					T28 = T24 + T27;
+					T2w = T27 - T24;
+					T2v = FNMS(KP707106781, T2e, T2d);
+					T2f = FMA(KP707106781, T2e, T2d);
+					T2i = T2g - T2h;
+					T2r = T2g + T2h;
+				   }
+				   {
+					E TX, T1k, T1x, T1A;
+					T1J = FMA(KP707106781, TW, TL);
+					TX = FNMS(KP707106781, TW, TL);
+					{
+					     E T2l, T29, T2n, T2j;
+					     T2l = FNMS(KP923879532, T28, T21);
+					     T29 = FMA(KP923879532, T28, T21);
+					     T2n = FMA(KP923879532, T2i, T2f);
+					     T2j = FNMS(KP923879532, T2i, T2f);
+					     {
+						  E T2o, T2m, T2k, T2a;
+						  T2o = Tz * T2l;
+						  T2m = Tw * T2l;
+						  T2k = T2c * T29;
+						  T2a = T1Y * T29;
+						  Im[WS(rs, 1)] = FMA(Tw, T2n, T2o);
+						  Ip[WS(rs, 1)] = FNMS(Tz, T2n, T2m);
+						  Im[WS(rs, 5)] = FMA(T1Y, T2j, T2k);
+						  Ip[WS(rs, 5)] = FNMS(T2c, T2j, T2a);
+						  T1k = T18 - T1j;
+						  T1O = T1j + T18;
+					     }
+					}
+					T1N = FMA(KP707106781, T1w, T1t);
+					T1x = FNMS(KP707106781, T1w, T1t);
+					T1A = T1y - T1z;
+					T1K = T1y + T1z;
+					{
+					     E T1E, T1l, T1H, T1B;
+					     T1E = FMA(KP923879532, T1k, TX);
+					     T1l = FNMS(KP923879532, T1k, TX);
+					     T1H = FMA(KP923879532, T1A, T1x);
+					     T1B = FNMS(KP923879532, T1A, T1x);
+					     {
+						  E T1I, T1F, T1C, T1m;
+						  T1I = T1G * T1E;
+						  T1F = T1D * T1E;
+						  T1C = T1o * T1l;
+						  T1m = TG * T1l;
+						  Im[WS(rs, 2)] = FMA(T1D, T1H, T1I);
+						  Ip[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
+						  Im[WS(rs, 6)] = FMA(TG, T1B, T1C);
+						  Ip[WS(rs, 6)] = FNMS(T1o, T1B, T1m);
+					     }
+					}
+					{
+					     E T2A, T2s, T2D, T2x;
+					     T2A = FMA(KP923879532, T2r, T2q);
+					     T2s = FNMS(KP923879532, T2r, T2q);
+					     T2D = FNMS(KP923879532, T2w, T2v);
+					     T2x = FMA(KP923879532, T2w, T2v);
+					     {
+						  E T2B, T2t, T2E, T2y;
+						  T2B = T2z * T2A;
+						  T2t = T2p * T2s;
+						  T2E = T2z * T2D;
+						  T2y = T2p * T2x;
+						  Ip[WS(rs, 7)] = FNMS(T2C, T2D, T2B);
+						  Ip[WS(rs, 3)] = FNMS(T2u, T2x, T2t);
+						  Im[WS(rs, 7)] = FMA(T2C, T2A, T2E);
+						  Im[WS(rs, 3)] = FMA(T2u, T2s, T2y);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T1L, T1R, T1P, T1T;
+		    T1L = FNMS(KP923879532, T1K, T1J);
+		    T1R = FMA(KP923879532, T1K, T1J);
+		    T1P = FNMS(KP923879532, T1O, T1N);
+		    T1T = FMA(KP923879532, T1O, T1N);
+		    {
+			 E T1S, T1M, T1U, T1Q;
+			 T1S = Tv * T1R;
+			 T1M = TB * T1L;
+			 T1U = Tv * T1T;
+			 T1Q = TB * T1P;
+			 Ip[0] = FNMS(Ty, T1T, T1S);
+			 Ip[WS(rs, 4)] = FNMS(TF, T1P, T1M);
+			 Im[0] = FMA(Ty, T1R, T1U);
+			 Im[WS(rs, 4)] = FMA(TF, T1L, T1Q);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
+
+void X(codelet_hc2cb2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include hc2cb.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 80 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
+	       E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
+	       {
+		    E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
+		    {
+			 E T1m, T1s, T1o, T1r;
+			 Tv = W[0];
+			 Ty = W[1];
+			 T1l = W[2];
+			 T1n = W[3];
+			 T1m = Tv * T1l;
+			 T1s = Ty * T1l;
+			 T1o = Ty * T1n;
+			 T1r = Tv * T1n;
+			 T1p = T1m + T1o;
+			 T1t = T1r - T1s;
+			 T27 = T1r + T1s;
+			 T25 = T1m - T1o;
+			 Tz = W[5];
+			 TA = Ty * Tz;
+			 T1J = T1l * Tz;
+			 T15 = Tv * Tz;
+			 T1G = T1n * Tz;
+			 Tw = W[4];
+			 Tx = Tv * Tw;
+			 T1K = T1n * Tw;
+			 T16 = Ty * Tw;
+			 T1F = T1l * Tw;
+		    }
+		    TB = Tx - TA;
+		    T21 = T1J + T1K;
+		    T1P = T15 - T16;
+		    T1H = T1F + T1G;
+		    T1X = T1F - T1G;
+		    T17 = T15 + T16;
+		    T1L = T1J - T1K;
+		    T1N = Tx + TA;
+		    T1v = W[6];
+		    T1w = W[7];
+		    T1x = FMA(Tv, T1v, Ty * T1w);
+		    T1B = FNMS(Ty, T1v, Tv * T1w);
+		    {
+			 E T2D, T2E, T29, T2a;
+			 T2D = T25 * Tz;
+			 T2E = T27 * Tw;
+			 T2F = T2D + T2E;
+			 T2T = T2D - T2E;
+			 T29 = T25 * Tw;
+			 T2a = T27 * Tz;
+			 T2b = T29 - T2a;
+			 T2R = T29 + T2a;
+		    }
+		    {
+			 E T3h, T3i, T33, T34;
+			 T3h = T1p * Tz;
+			 T3i = T1t * Tw;
+			 T3j = T3h + T3i;
+			 T3x = T3h - T3i;
+			 T33 = T1p * Tw;
+			 T34 = T1t * Tz;
+			 T35 = T33 - T34;
+			 T3t = T33 + T34;
+		    }
+	       }
+	       {
+		    E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
+		    E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
+		    E T3e, T3o;
+		    {
+			 E T3, T2c, T1b, T2H, T6, T2G, T1e, T2d;
+			 {
+			      E T1, T2, T19, T1a;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 7)];
+			      T3 = T1 + T2;
+			      T2c = T1 - T2;
+			      T19 = Ip[0];
+			      T1a = Im[WS(rs, 7)];
+			      T1b = T19 - T1a;
+			      T2H = T19 + T1a;
+			 }
+			 {
+			      E T4, T5, T1c, T1d;
+			      T4 = Rp[WS(rs, 4)];
+			      T5 = Rm[WS(rs, 3)];
+			      T6 = T4 + T5;
+			      T2G = T4 - T5;
+			      T1c = Ip[WS(rs, 4)];
+			      T1d = Im[WS(rs, 3)];
+			      T1e = T1c - T1d;
+			      T2d = T1c + T1d;
+			 }
+			 T7 = T3 + T6;
+			 T36 = T2c + T2d;
+			 T3k = T2H - T2G;
+			 TC = T3 - T6;
+			 T1f = T1b - T1e;
+			 T2e = T2c - T2d;
+			 T2I = T2G + T2H;
+			 T1Q = T1b + T1e;
+		    }
+		    {
+			 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
+			 {
+			      E T8, T9, TG, TH;
+			      T8 = Rp[WS(rs, 2)];
+			      T9 = Rm[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T2f = T8 - T9;
+			      TG = Ip[WS(rs, 2)];
+			      TH = Im[WS(rs, 5)];
+			      TI = TG - TH;
+			      T2g = TG + TH;
+			 }
+			 {
+			      E Tb, Tc, TD, TE;
+			      Tb = Rm[WS(rs, 1)];
+			      Tc = Rp[WS(rs, 6)];
+			      Td = Tb + Tc;
+			      T2i = Tb - Tc;
+			      TD = Ip[WS(rs, 6)];
+			      TE = Im[WS(rs, 1)];
+			      TF = TD - TE;
+			      T2j = TD + TE;
+			 }
+			 Te = Ta + Td;
+			 TJ = TF - TI;
+			 T1R = TI + TF;
+			 T18 = Ta - Td;
+			 {
+			      E T2J, T2K, T2h, T2k;
+			      T2J = T2f + T2g;
+			      T2K = T2i + T2j;
+			      T2L = KP707106781 * (T2J - T2K);
+			      T37 = KP707106781 * (T2J + T2K);
+			      T2h = T2f - T2g;
+			      T2k = T2i - T2j;
+			      T2l = KP707106781 * (T2h + T2k);
+			      T3l = KP707106781 * (T2h - T2k);
+			 }
+		    }
+		    {
+			 E Ti, T2x, TO, T2v, Tl, T2u, TR, T2y, TL, TS;
+			 {
+			      E Tg, Th, TM, TN;
+			      Tg = Rp[WS(rs, 1)];
+			      Th = Rm[WS(rs, 6)];
+			      Ti = Tg + Th;
+			      T2x = Tg - Th;
+			      TM = Ip[WS(rs, 1)];
+			      TN = Im[WS(rs, 6)];
+			      TO = TM - TN;
+			      T2v = TM + TN;
+			 }
+			 {
+			      E Tj, Tk, TP, TQ;
+			      Tj = Rp[WS(rs, 5)];
+			      Tk = Rm[WS(rs, 2)];
+			      Tl = Tj + Tk;
+			      T2u = Tj - Tk;
+			      TP = Ip[WS(rs, 5)];
+			      TQ = Im[WS(rs, 2)];
+			      TR = TP - TQ;
+			      T2y = TP + TQ;
+			 }
+			 Tm = Ti + Tl;
+			 T1T = TO + TR;
+			 TL = Ti - Tl;
+			 TS = TO - TR;
+			 TT = TL - TS;
+			 T1h = TL + TS;
+			 {
+			      E T2w, T2z, T39, T3a;
+			      T2w = T2u + T2v;
+			      T2z = T2x - T2y;
+			      T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
+			      T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
+			      T39 = T2x + T2y;
+			      T3a = T2v - T2u;
+			      T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
+			      T3n = FMA(KP382683432, T3a, KP923879532 * T39);
+			 }
+		    }
+		    {
+			 E Tp, T2q, TX, T2o, Ts, T2n, T10, T2r, TU, T11;
+			 {
+			      E Tn, To, TV, TW;
+			      Tn = Rm[0];
+			      To = Rp[WS(rs, 7)];
+			      Tp = Tn + To;
+			      T2q = Tn - To;
+			      TV = Ip[WS(rs, 7)];
+			      TW = Im[0];
+			      TX = TV - TW;
+			      T2o = TV + TW;
+			 }
+			 {
+			      E Tq, Tr, TY, TZ;
+			      Tq = Rp[WS(rs, 3)];
+			      Tr = Rm[WS(rs, 4)];
+			      Ts = Tq + Tr;
+			      T2n = Tq - Tr;
+			      TY = Ip[WS(rs, 3)];
+			      TZ = Im[WS(rs, 4)];
+			      T10 = TY - TZ;
+			      T2r = TY + TZ;
+			 }
+			 Tt = Tp + Ts;
+			 T1U = TX + T10;
+			 TU = Tp - Ts;
+			 T11 = TX - T10;
+			 T12 = TU + T11;
+			 T1i = T11 - TU;
+			 {
+			      E T2p, T2s, T3c, T3d;
+			      T2p = T2n - T2o;
+			      T2s = T2q - T2r;
+			      T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
+			      T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
+			      T3c = T2q + T2r;
+			      T3d = T2n + T2o;
+			      T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
+			      T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
+			 }
+		    }
+		    {
+			 E Tf, Tu, T1O, T1S, T1V, T1W;
+			 Tf = T7 + Te;
+			 Tu = Tm + Tt;
+			 T1O = Tf - Tu;
+			 T1S = T1Q + T1R;
+			 T1V = T1T + T1U;
+			 T1W = T1S - T1V;
+			 Rp[0] = Tf + Tu;
+			 Rm[0] = T1S + T1V;
+			 Rp[WS(rs, 4)] = FNMS(T1P, T1W, T1N * T1O);
+			 Rm[WS(rs, 4)] = FMA(T1P, T1O, T1N * T1W);
+		    }
+		    {
+			 E T3g, T3r, T3q, T3s;
+			 {
+			      E T38, T3f, T3m, T3p;
+			      T38 = T36 - T37;
+			      T3f = T3b + T3e;
+			      T3g = T38 - T3f;
+			      T3r = T38 + T3f;
+			      T3m = T3k + T3l;
+			      T3p = T3n - T3o;
+			      T3q = T3m - T3p;
+			      T3s = T3m + T3p;
+			 }
+			 Ip[WS(rs, 5)] = FNMS(T3j, T3q, T35 * T3g);
+			 Im[WS(rs, 5)] = FMA(T3j, T3g, T35 * T3q);
+			 Ip[WS(rs, 1)] = FNMS(T1n, T3s, T1l * T3r);
+			 Im[WS(rs, 1)] = FMA(T1n, T3r, T1l * T3s);
+		    }
+		    {
+			 E T3w, T3B, T3A, T3C;
+			 {
+			      E T3u, T3v, T3y, T3z;
+			      T3u = T36 + T37;
+			      T3v = T3n + T3o;
+			      T3w = T3u - T3v;
+			      T3B = T3u + T3v;
+			      T3y = T3k - T3l;
+			      T3z = T3b - T3e;
+			      T3A = T3y + T3z;
+			      T3C = T3y - T3z;
+			 }
+			 Ip[WS(rs, 3)] = FNMS(T3x, T3A, T3t * T3w);
+			 Im[WS(rs, 3)] = FMA(T3t, T3A, T3x * T3w);
+			 Ip[WS(rs, 7)] = FNMS(T1w, T3C, T1v * T3B);
+			 Im[WS(rs, 7)] = FMA(T1v, T3C, T1w * T3B);
+		    }
+		    {
+			 E T14, T1q, T1k, T1u;
+			 {
+			      E TK, T13, T1g, T1j;
+			      TK = TC + TJ;
+			      T13 = KP707106781 * (TT + T12);
+			      T14 = TK - T13;
+			      T1q = TK + T13;
+			      T1g = T18 + T1f;
+			      T1j = KP707106781 * (T1h + T1i);
+			      T1k = T1g - T1j;
+			      T1u = T1g + T1j;
+			 }
+			 Rp[WS(rs, 5)] = FNMS(T17, T1k, TB * T14);
+			 Rm[WS(rs, 5)] = FMA(T17, T14, TB * T1k);
+			 Rp[WS(rs, 1)] = FNMS(T1t, T1u, T1p * T1q);
+			 Rm[WS(rs, 1)] = FMA(T1t, T1q, T1p * T1u);
+		    }
+		    {
+			 E T1A, T1I, T1E, T1M;
+			 {
+			      E T1y, T1z, T1C, T1D;
+			      T1y = TC - TJ;
+			      T1z = KP707106781 * (T1i - T1h);
+			      T1A = T1y - T1z;
+			      T1I = T1y + T1z;
+			      T1C = T1f - T18;
+			      T1D = KP707106781 * (TT - T12);
+			      T1E = T1C - T1D;
+			      T1M = T1C + T1D;
+			 }
+			 Rp[WS(rs, 7)] = FNMS(T1B, T1E, T1x * T1A);
+			 Rm[WS(rs, 7)] = FMA(T1x, T1E, T1B * T1A);
+			 Rp[WS(rs, 3)] = FNMS(T1L, T1M, T1H * T1I);
+			 Rm[WS(rs, 3)] = FMA(T1H, T1M, T1L * T1I);
+		    }
+		    {
+			 E T2C, T2S, T2Q, T2U;
+			 {
+			      E T2m, T2B, T2M, T2P;
+			      T2m = T2e - T2l;
+			      T2B = T2t - T2A;
+			      T2C = T2m - T2B;
+			      T2S = T2m + T2B;
+			      T2M = T2I - T2L;
+			      T2P = T2N - T2O;
+			      T2Q = T2M - T2P;
+			      T2U = T2M + T2P;
+			 }
+			 Ip[WS(rs, 6)] = FNMS(T2F, T2Q, T2b * T2C);
+			 Im[WS(rs, 6)] = FMA(T2F, T2C, T2b * T2Q);
+			 Ip[WS(rs, 2)] = FNMS(T2T, T2U, T2R * T2S);
+			 Im[WS(rs, 2)] = FMA(T2T, T2S, T2R * T2U);
+		    }
+		    {
+			 E T2X, T31, T30, T32;
+			 {
+			      E T2V, T2W, T2Y, T2Z;
+			      T2V = T2e + T2l;
+			      T2W = T2N + T2O;
+			      T2X = T2V - T2W;
+			      T31 = T2V + T2W;
+			      T2Y = T2I + T2L;
+			      T2Z = T2A + T2t;
+			      T30 = T2Y - T2Z;
+			      T32 = T2Y + T2Z;
+			 }
+			 Ip[WS(rs, 4)] = FNMS(Tz, T30, Tw * T2X);
+			 Im[WS(rs, 4)] = FMA(Tw, T30, Tz * T2X);
+			 Ip[0] = FNMS(Ty, T32, Tv * T31);
+			 Im[0] = FMA(Tv, T32, Ty * T31);
+		    }
+		    {
+			 E T20, T26, T24, T28;
+			 {
+			      E T1Y, T1Z, T22, T23;
+			      T1Y = T7 - Te;
+			      T1Z = T1U - T1T;
+			      T20 = T1Y - T1Z;
+			      T26 = T1Y + T1Z;
+			      T22 = T1Q - T1R;
+			      T23 = Tm - Tt;
+			      T24 = T22 - T23;
+			      T28 = T23 + T22;
+			 }
+			 Rp[WS(rs, 6)] = FNMS(T21, T24, T1X * T20);
+			 Rm[WS(rs, 6)] = FMA(T1X, T24, T21 * T20);
+			 Rp[WS(rs, 2)] = FNMS(T27, T28, T25 * T26);
+			 Rm[WS(rs, 2)] = FMA(T25, T28, T27 * T26);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
+
+void X(codelet_hc2cb2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1087 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:02 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hc2cb2_20 -include hc2cb.h */
+
+/*
+ * This function contains 276 FP additions, 198 FP multiplications,
+ * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
+ * 160 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T1S, T1O, T1s, TI, T24, T1Y, T2g, T2k, TS, TR, T1I, T26, T1o, T20, T1F;
+	       E T25, TT, T1Z;
+	       {
+		    E TD, TH, TE, T1L, T1N, T1X, TG, T1V, T2Y, T2b, T29, T2s, T36, T3e, T31;
+		    E T2o, T3b, T5b, T2c, T2U, T4y, T4u, T2f, T5g, T47, T5p, T4b, T5l;
+		    {
+			 E T1r, TF, T2T, T1M, T1R, T2X, T2r, T4x;
+			 TD = W[0];
+			 TH = W[3];
+			 TE = W[2];
+			 T1L = W[6];
+			 T1N = W[7];
+			 T1r = TD * TH;
+			 TF = TD * TE;
+			 T2T = TE * T1L;
+			 T1M = TD * T1L;
+			 T1R = TD * T1N;
+			 T2X = TE * T1N;
+			 T1X = W[5];
+			 TG = W[1];
+			 T1V = W[4];
+			 T2Y = FNMS(TH, T1L, T2X);
+			 T2r = TD * T1X;
+			 {
+			      E T23, T2n, T1W, T2a;
+			      T23 = TE * T1X;
+			      T1S = FNMS(TG, T1L, T1R);
+			      T1O = FMA(TG, T1N, T1M);
+			      T2b = FMA(TG, TE, T1r);
+			      T1s = FNMS(TG, TE, T1r);
+			      T29 = FNMS(TG, TH, TF);
+			      TI = FMA(TG, TH, TF);
+			      T2n = TD * T1V;
+			      T1W = TE * T1V;
+			      T2s = FMA(TG, T1V, T2r);
+			      T36 = FNMS(TG, T1V, T2r);
+			      T3e = FMA(TH, T1V, T23);
+			      T24 = FNMS(TH, T1V, T23);
+			      T2a = T29 * T1V;
+			      T31 = FMA(TG, T1X, T2n);
+			      T2o = FNMS(TG, T1X, T2n);
+			      T3b = FNMS(TH, T1X, T1W);
+			      T1Y = FMA(TH, T1X, T1W);
+			      T5b = FNMS(T2b, T1X, T2a);
+			      T2c = FMA(T2b, T1X, T2a);
+			      T2U = FMA(TH, T1N, T2T);
+			 }
+			 T4x = T29 * T1N;
+			 {
+			      E T4t, T2d, T2j, T2e;
+			      T4t = T29 * T1L;
+			      T2e = T29 * T1X;
+			      T4y = FNMS(T2b, T1L, T4x);
+			      T4u = FMA(T2b, T1N, T4t);
+			      T2f = FNMS(T2b, T1V, T2e);
+			      T5g = FMA(T2b, T1V, T2e);
+			      T2d = T2c * T1L;
+			      T2j = T2c * T1N;
+			      T47 = TI * T1V;
+			      T2g = FMA(T2f, T1N, T2d);
+			      T2k = FNMS(T2f, T1L, T2j);
+			      T5p = TI * T1N;
+			      T4b = TI * T1X;
+			      T5l = TI * T1L;
+			 }
+		    }
+		    {
+			 E T4f, T48, T4c, T4k, T5m, T5q, T3V, T4V, TJ, T7, T3j, T4B, T2H, T1z, T3q;
+			 E T43, T1n, T52, T42, T3x, T53, T2D, T18, T2A, T1H, T4R, T4X, T4W, T4O, T1G;
+			 E T2O, T3I, T2P, T3P, T2K, T2M, T1C, T1E, TC, T2w, T40, T3Y, T4K, T4I, TQ;
+			 {
+			      E T3h, T3, T1w, T3T, T1v, T3U, T6, T1x;
+			      {
+				   E T1t, T1u, T1, T2, T4, T5;
+				   T1 = Rp[0];
+				   T2 = Rm[WS(rs, 9)];
+				   T1t = Ip[0];
+				   T4f = FNMS(T1s, T1X, T47);
+				   T48 = FMA(T1s, T1X, T47);
+				   T4c = FNMS(T1s, T1V, T4b);
+				   T4k = FMA(T1s, T1V, T4b);
+				   T5m = FMA(T1s, T1N, T5l);
+				   T5q = FNMS(T1s, T1L, T5p);
+				   T3h = T1 - T2;
+				   T3 = T1 + T2;
+				   T1u = Im[WS(rs, 9)];
+				   T4 = Rp[WS(rs, 5)];
+				   T5 = Rm[WS(rs, 4)];
+				   T1w = Ip[WS(rs, 5)];
+				   T3T = T1t + T1u;
+				   T1v = T1t - T1u;
+				   T3U = T4 - T5;
+				   T6 = T4 + T5;
+				   T1x = Im[WS(rs, 4)];
+			      }
+			      {
+				   E T3L, T4M, TK, Te, T3m, T4C, T2y, T1f, T3H, T4Q, TO, TA, T3w, T4G, T2C;
+				   E T17, T3O, T4N, TL, Tl, T3p, T4D, T2z, T1m, T3r, Tp, TX, T3C, TW, T3D;
+				   E Ts, TY;
+				   {
+					E T3u, Tw, T14, T3G, T13, T3F, Tz, T15;
+					{
+					     E T3k, Ta, T1c, T3J, T1b, T3K, Td, T1d;
+					     {
+						  E T19, T1a, Tb, Tc;
+						  {
+						       E T8, T3i, T1y, T9;
+						       T8 = Rp[WS(rs, 4)];
+						       T3V = T3T - T3U;
+						       T4V = T3U + T3T;
+						       TJ = T3 - T6;
+						       T7 = T3 + T6;
+						       T3i = T1w + T1x;
+						       T1y = T1w - T1x;
+						       T9 = Rm[WS(rs, 5)];
+						       T19 = Ip[WS(rs, 4)];
+						       T3j = T3h + T3i;
+						       T4B = T3h - T3i;
+						       T2H = T1v + T1y;
+						       T1z = T1v - T1y;
+						       T3k = T8 - T9;
+						       Ta = T8 + T9;
+						       T1a = Im[WS(rs, 5)];
+						  }
+						  Tb = Rp[WS(rs, 9)];
+						  Tc = Rm[0];
+						  T1c = Ip[WS(rs, 9)];
+						  T3J = T19 + T1a;
+						  T1b = T19 - T1a;
+						  T3K = Tb - Tc;
+						  Td = Tb + Tc;
+						  T1d = Im[0];
+					     }
+					     {
+						  E T11, T12, Tx, Ty;
+						  {
+						       E Tu, T3l, T1e, Tv;
+						       Tu = Rm[WS(rs, 7)];
+						       T3L = T3J - T3K;
+						       T4M = T3K + T3J;
+						       TK = Ta - Td;
+						       Te = Ta + Td;
+						       T3l = T1c + T1d;
+						       T1e = T1c - T1d;
+						       Tv = Rp[WS(rs, 2)];
+						       T11 = Ip[WS(rs, 2)];
+						       T3m = T3k + T3l;
+						       T4C = T3k - T3l;
+						       T2y = T1b + T1e;
+						       T1f = T1b - T1e;
+						       T3u = Tu - Tv;
+						       Tw = Tu + Tv;
+						       T12 = Im[WS(rs, 7)];
+						  }
+						  Tx = Rm[WS(rs, 2)];
+						  Ty = Rp[WS(rs, 7)];
+						  T14 = Ip[WS(rs, 7)];
+						  T3G = T11 + T12;
+						  T13 = T11 - T12;
+						  T3F = Tx - Ty;
+						  Tz = Tx + Ty;
+						  T15 = Im[WS(rs, 2)];
+					     }
+					}
+					{
+					     E T3n, Th, T1j, T3N, T1i, T3M, Tk, T1k;
+					     {
+						  E T1g, T1h, Ti, Tj;
+						  {
+						       E Tf, T3v, T16, Tg;
+						       Tf = Rm[WS(rs, 3)];
+						       T3H = T3F + T3G;
+						       T4Q = T3F - T3G;
+						       TO = Tw - Tz;
+						       TA = Tw + Tz;
+						       T3v = T14 + T15;
+						       T16 = T14 - T15;
+						       Tg = Rp[WS(rs, 6)];
+						       T1g = Ip[WS(rs, 6)];
+						       T3w = T3u - T3v;
+						       T4G = T3u + T3v;
+						       T2C = T13 + T16;
+						       T17 = T13 - T16;
+						       T3n = Tf - Tg;
+						       Th = Tf + Tg;
+						       T1h = Im[WS(rs, 3)];
+						  }
+						  Ti = Rp[WS(rs, 1)];
+						  Tj = Rm[WS(rs, 8)];
+						  T1j = Ip[WS(rs, 1)];
+						  T3N = T1g + T1h;
+						  T1i = T1g - T1h;
+						  T3M = Ti - Tj;
+						  Tk = Ti + Tj;
+						  T1k = Im[WS(rs, 8)];
+					     }
+					     {
+						  E TU, TV, Tq, Tr;
+						  {
+						       E Tn, T3o, T1l, To;
+						       Tn = Rp[WS(rs, 8)];
+						       T3O = T3M + T3N;
+						       T4N = T3M - T3N;
+						       TL = Th - Tk;
+						       Tl = Th + Tk;
+						       T3o = T1j + T1k;
+						       T1l = T1j - T1k;
+						       To = Rm[WS(rs, 1)];
+						       TU = Ip[WS(rs, 8)];
+						       T3p = T3n + T3o;
+						       T4D = T3n - T3o;
+						       T2z = T1i + T1l;
+						       T1m = T1i - T1l;
+						       T3r = Tn - To;
+						       Tp = Tn + To;
+						       TV = Im[WS(rs, 1)];
+						  }
+						  Tq = Rm[WS(rs, 6)];
+						  Tr = Rp[WS(rs, 3)];
+						  TX = Ip[WS(rs, 3)];
+						  T3C = TU + TV;
+						  TW = TU - TV;
+						  T3D = Tq - Tr;
+						  Ts = Tq + Tr;
+						  TY = Im[WS(rs, 6)];
+					     }
+					}
+				   }
+				   {
+					E T3E, Tt, T1A, T4E, T4H, T2J, T1B, T2I, TM, TP;
+					{
+					     E T4P, TN, T3s, TZ;
+					     T3q = T3m + T3p;
+					     T43 = T3m - T3p;
+					     T3E = T3C - T3D;
+					     T4P = T3D + T3C;
+					     TN = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T3s = TX + TY;
+					     TZ = TX - TY;
+					     T1n = T1f - T1m;
+					     T1A = T1f + T1m;
+					     T4E = T4C + T4D;
+					     T52 = T4C - T4D;
+					     {
+						  E T3t, T4F, T2B, T10;
+						  T3t = T3r - T3s;
+						  T4F = T3r + T3s;
+						  T2B = TW + TZ;
+						  T10 = TW - TZ;
+						  T42 = T3t - T3w;
+						  T3x = T3t + T3w;
+						  T4H = T4F + T4G;
+						  T53 = T4F - T4G;
+						  T2D = T2B - T2C;
+						  T2J = T2B + T2C;
+						  T1B = T10 + T17;
+						  T18 = T10 - T17;
+						  T2A = T2y - T2z;
+						  T2I = T2y + T2z;
+						  TM = TK + TL;
+						  T1H = TK - TL;
+					     }
+					     T4R = T4P - T4Q;
+					     T4X = T4P + T4Q;
+					     T4W = T4M + T4N;
+					     T4O = T4M - T4N;
+					     T1G = TN - TO;
+					     TP = TN + TO;
+					}
+					{
+					     E Tm, T3X, TB, T3W;
+					     Tm = Te + Tl;
+					     T2O = Te - Tl;
+					     T3I = T3E + T3H;
+					     T3X = T3E - T3H;
+					     TB = Tt + TA;
+					     T2P = Tt - TA;
+					     T3P = T3L + T3O;
+					     T3W = T3L - T3O;
+					     T2K = T2I + T2J;
+					     T2M = T2I - T2J;
+					     T1C = T1A + T1B;
+					     T1E = T1A - T1B;
+					     TC = Tm + TB;
+					     T2w = Tm - TB;
+					     T40 = T3W - T3X;
+					     T3Y = T3W + T3X;
+					     T4K = T4E - T4H;
+					     T4I = T4E + T4H;
+					     TS = TM - TP;
+					     TQ = TM + TP;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3A, T3y, T50, T1D, T2t, T2p, T4J, T5t, T5v, T4Z, T4Y;
+			      Rp[0] = T7 + TC;
+			      T3A = T3q - T3x;
+			      T3y = T3q + T3x;
+			      T50 = T4W - T4X;
+			      T4Y = T4W + T4X;
+			      Rm[0] = T2H + T2K;
+			      T1D = FNMS(KP250000000, T1C, T1z);
+			      T2t = T1z + T1C;
+			      T2p = TJ + TQ;
+			      TR = FNMS(KP250000000, TQ, TJ);
+			      T4J = FNMS(KP250000000, T4I, T4B);
+			      T5t = T4B + T4I;
+			      T5v = T4V + T4Y;
+			      T4Z = FNMS(KP250000000, T4Y, T4V);
+			      {
+				   E T4m, T44, T4i, T4p, T49, T3R, T4j, T4a, T3S, T4l, T41, T4q;
+				   {
+					E T3z, T4v, T4w, T3Z, T4z;
+					T3z = FNMS(KP250000000, T3y, T3j);
+					T4v = T3j + T3y;
+					{
+					     E T2u, T2q, T5u, T5w;
+					     T2u = T2s * T2p;
+					     T2q = T2o * T2p;
+					     T5u = T2c * T5t;
+					     T5w = T2c * T5v;
+					     Rm[WS(rs, 5)] = FMA(T2o, T2t, T2u);
+					     Rp[WS(rs, 5)] = FNMS(T2s, T2t, T2q);
+					     Ip[WS(rs, 2)] = FNMS(T2f, T5v, T5u);
+					     Im[WS(rs, 2)] = FMA(T2f, T5t, T5w);
+					     T4w = T4u * T4v;
+					}
+					T3Z = FNMS(KP250000000, T3Y, T3V);
+					T4z = T3V + T3Y;
+					{
+					     E T3Q, T4h, T4A, T4g, T3B;
+					     T3Q = FNMS(KP618033988, T3P, T3I);
+					     T4h = FMA(KP618033988, T3I, T3P);
+					     Ip[WS(rs, 7)] = FNMS(T4y, T4z, T4w);
+					     T4A = T4u * T4z;
+					     T4m = FMA(KP618033988, T42, T43);
+					     T44 = FNMS(KP618033988, T43, T42);
+					     T4g = FMA(KP559016994, T3A, T3z);
+					     T3B = FNMS(KP559016994, T3A, T3z);
+					     Im[WS(rs, 7)] = FMA(T4y, T4v, T4A);
+					     T4i = FNMS(KP951056516, T4h, T4g);
+					     T4p = FMA(KP951056516, T4h, T4g);
+					     T49 = FMA(KP951056516, T3Q, T3B);
+					     T3R = FNMS(KP951056516, T3Q, T3B);
+					}
+					T4j = T4f * T4i;
+					T4a = T48 * T49;
+					T3S = TE * T3R;
+					T4l = FMA(KP559016994, T40, T3Z);
+					T41 = FNMS(KP559016994, T40, T3Z);
+					T4q = T1L * T4p;
+				   }
+				   {
+					E T5d, T4S, T54, T5i, T4L, T5c;
+					T5d = FNMS(KP618033988, T4O, T4R);
+					T4S = FMA(KP618033988, T4R, T4O);
+					{
+					     E T4n, T4r, T4d, T45;
+					     T4n = FMA(KP951056516, T4m, T4l);
+					     T4r = FNMS(KP951056516, T4m, T4l);
+					     T4d = FNMS(KP951056516, T44, T41);
+					     T45 = FMA(KP951056516, T44, T41);
+					     {
+						  E T4o, T4s, T4e, T46;
+						  T4o = T4f * T4n;
+						  Ip[WS(rs, 5)] = FNMS(T4k, T4n, T4j);
+						  T4s = T1L * T4r;
+						  Ip[WS(rs, 9)] = FNMS(T1N, T4r, T4q);
+						  T4e = T48 * T4d;
+						  Ip[WS(rs, 3)] = FNMS(T4c, T4d, T4a);
+						  T46 = TE * T45;
+						  Ip[WS(rs, 1)] = FNMS(TH, T45, T3S);
+						  Im[WS(rs, 5)] = FMA(T4k, T4i, T4o);
+						  Im[WS(rs, 9)] = FMA(T1N, T4p, T4s);
+						  Im[WS(rs, 3)] = FMA(T4c, T49, T4e);
+						  Im[WS(rs, 1)] = FMA(TH, T3R, T46);
+					     }
+					}
+					T54 = FMA(KP618033988, T53, T52);
+					T5i = FNMS(KP618033988, T52, T53);
+					T4L = FMA(KP559016994, T4K, T4J);
+					T5c = FNMS(KP559016994, T4K, T4J);
+					{
+					     E T38, T2Q, T33, T2E, T2v, T37, T2N, T5h, T51, T2L, T2x, T32;
+					     T38 = FNMS(KP618033988, T2O, T2P);
+					     T2Q = FMA(KP618033988, T2P, T2O);
+					     T5h = FNMS(KP559016994, T50, T4Z);
+					     T51 = FMA(KP559016994, T50, T4Z);
+					     {
+						  E T5e, T5n, T57, T4T;
+						  T5e = FNMS(KP951056516, T5d, T5c);
+						  T5n = FMA(KP951056516, T5d, T5c);
+						  T57 = FMA(KP951056516, T4S, T4L);
+						  T4T = FNMS(KP951056516, T4S, T4L);
+						  {
+						       E T5j, T5r, T59, T55;
+						       T5j = FMA(KP951056516, T5i, T5h);
+						       T5r = FNMS(KP951056516, T5i, T5h);
+						       T59 = FNMS(KP951056516, T54, T51);
+						       T55 = FMA(KP951056516, T54, T51);
+						       {
+							    E T5f, T5o, T58, T4U;
+							    T5f = T5b * T5e;
+							    T5o = T5m * T5n;
+							    T58 = T1V * T57;
+							    T4U = TD * T4T;
+							    {
+								 E T5k, T5s, T5a, T56;
+								 T5k = T5b * T5j;
+								 T5s = T5m * T5r;
+								 T5a = T1V * T59;
+								 T56 = TD * T55;
+								 Ip[WS(rs, 6)] = FNMS(T5g, T5j, T5f);
+								 Ip[WS(rs, 8)] = FNMS(T5q, T5r, T5o);
+								 Ip[WS(rs, 4)] = FNMS(T1X, T59, T58);
+								 Ip[0] = FNMS(TG, T55, T4U);
+								 Im[WS(rs, 6)] = FMA(T5g, T5e, T5k);
+								 Im[WS(rs, 8)] = FMA(T5q, T5n, T5s);
+								 Im[WS(rs, 4)] = FMA(T1X, T57, T5a);
+								 Im[0] = FMA(TG, T4T, T56);
+							    }
+						       }
+						  }
+					     }
+					     T2L = FNMS(KP250000000, T2K, T2H);
+					     T33 = FNMS(KP618033988, T2A, T2D);
+					     T2E = FMA(KP618033988, T2D, T2A);
+					     T2v = FNMS(KP250000000, TC, T7);
+					     T37 = FNMS(KP559016994, T2M, T2L);
+					     T2N = FMA(KP559016994, T2M, T2L);
+					     T1I = FNMS(KP618033988, T1H, T1G);
+					     T26 = FMA(KP618033988, T1G, T1H);
+					     T2x = FMA(KP559016994, T2w, T2v);
+					     T32 = FNMS(KP559016994, T2w, T2v);
+					     {
+						  E T3f, T39, T2R, T2Z;
+						  T3f = FNMS(KP951056516, T38, T37);
+						  T39 = FMA(KP951056516, T38, T37);
+						  T2R = FNMS(KP951056516, T2Q, T2N);
+						  T2Z = FMA(KP951056516, T2Q, T2N);
+						  {
+						       E T3c, T34, T2F, T2V;
+						       T3c = FMA(KP951056516, T33, T32);
+						       T34 = FNMS(KP951056516, T33, T32);
+						       T2F = FMA(KP951056516, T2E, T2x);
+						       T2V = FNMS(KP951056516, T2E, T2x);
+						       {
+							    E T3a, T35, T3g, T3d;
+							    T3a = T36 * T34;
+							    T35 = T31 * T34;
+							    T3g = T3e * T3c;
+							    T3d = T3b * T3c;
+							    {
+								 E T30, T2W, T2S, T2G;
+								 T30 = T2Y * T2V;
+								 T2W = T2U * T2V;
+								 T2S = T2b * T2F;
+								 T2G = T29 * T2F;
+								 Rm[WS(rs, 4)] = FMA(T31, T39, T3a);
+								 Rp[WS(rs, 4)] = FNMS(T36, T39, T35);
+								 Rm[WS(rs, 6)] = FMA(T3b, T3f, T3g);
+								 Rp[WS(rs, 6)] = FNMS(T3e, T3f, T3d);
+								 Rm[WS(rs, 8)] = FMA(T2U, T2Z, T30);
+								 Rp[WS(rs, 8)] = FNMS(T2Y, T2Z, T2W);
+								 Rm[WS(rs, 2)] = FMA(T29, T2R, T2S);
+								 Rp[WS(rs, 2)] = FNMS(T2b, T2R, T2G);
+							    }
+						       }
+						  }
+					     }
+					     T1o = FNMS(KP618033988, T1n, T18);
+					     T20 = FMA(KP618033988, T18, T1n);
+					     T1F = FNMS(KP559016994, T1E, T1D);
+					     T25 = FMA(KP559016994, T1E, T1D);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       TT = FNMS(KP559016994, TS, TR);
+	       T1Z = FMA(KP559016994, TS, TR);
+	       {
+		    E T2l, T27, T1J, T1T;
+		    T2l = FNMS(KP951056516, T26, T25);
+		    T27 = FMA(KP951056516, T26, T25);
+		    T1J = FNMS(KP951056516, T1I, T1F);
+		    T1T = FMA(KP951056516, T1I, T1F);
+		    {
+			 E T2h, T21, T1p, T1P;
+			 T2h = FMA(KP951056516, T20, T1Z);
+			 T21 = FNMS(KP951056516, T20, T1Z);
+			 T1p = FMA(KP951056516, T1o, TT);
+			 T1P = FNMS(KP951056516, T1o, TT);
+			 {
+			      E T28, T22, T2m, T2i;
+			      T28 = T24 * T21;
+			      T22 = T1Y * T21;
+			      T2m = T2k * T2h;
+			      T2i = T2g * T2h;
+			      {
+				   E T1U, T1Q, T1K, T1q;
+				   T1U = T1S * T1P;
+				   T1Q = T1O * T1P;
+				   T1K = T1s * T1p;
+				   T1q = TI * T1p;
+				   Rm[WS(rs, 3)] = FMA(T1Y, T27, T28);
+				   Rp[WS(rs, 3)] = FNMS(T24, T27, T22);
+				   Rm[WS(rs, 7)] = FMA(T2g, T2l, T2m);
+				   Rp[WS(rs, 7)] = FNMS(T2k, T2l, T2i);
+				   Rm[WS(rs, 9)] = FMA(T1O, T1T, T1U);
+				   Rp[WS(rs, 9)] = FNMS(T1S, T1T, T1Q);
+				   Rm[WS(rs, 1)] = FMA(TI, T1J, T1K);
+				   Rp[WS(rs, 1)] = FNMS(T1s, T1J, T1q);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cb2_20", twinstr, &GENUS, {136, 58, 140, 0} };
+
+void X(codelet_hc2cb2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_20, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hc2cb2_20 -include hc2cb.h */
+
+/*
+ * This function contains 276 FP additions, 164 FP multiplications,
+ * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
+ * 137 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E TD, TG, TE, TH, TJ, T1t, T27, T25, T1T, T1R, T1V, T2j, T2Z, T21, T2X;
+	       E T2T, T2n, T2P, T3V, T41, T3R, T3X, T29, T2c, T4H, T4L, T1L, T1M, T1N, T2d;
+	       E T4R, T1P, T4P, T49, T2N, T2f, T47, T2L;
+	       {
+		    E T1U, T2l, T1Z, T2i, T1S, T2m, T20, T2h;
+		    {
+			 E TF, T1s, TI, T1r;
+			 TD = W[0];
+			 TG = W[1];
+			 TE = W[2];
+			 TH = W[3];
+			 TF = TD * TE;
+			 T1s = TG * TE;
+			 TI = TG * TH;
+			 T1r = TD * TH;
+			 TJ = TF + TI;
+			 T1t = T1r - T1s;
+			 T27 = T1r + T1s;
+			 T25 = TF - TI;
+			 T1T = W[5];
+			 T1U = TH * T1T;
+			 T2l = TD * T1T;
+			 T1Z = TE * T1T;
+			 T2i = TG * T1T;
+			 T1R = W[4];
+			 T1S = TE * T1R;
+			 T2m = TG * T1R;
+			 T20 = TH * T1R;
+			 T2h = TD * T1R;
+		    }
+		    T1V = T1S + T1U;
+		    T2j = T2h - T2i;
+		    T2Z = T1Z + T20;
+		    T21 = T1Z - T20;
+		    T2X = T1S - T1U;
+		    T2T = T2l - T2m;
+		    T2n = T2l + T2m;
+		    T2P = T2h + T2i;
+		    {
+			 E T3T, T3U, T3P, T3Q;
+			 T3T = TJ * T1T;
+			 T3U = T1t * T1R;
+			 T3V = T3T - T3U;
+			 T41 = T3T + T3U;
+			 T3P = TJ * T1R;
+			 T3Q = T1t * T1T;
+			 T3R = T3P + T3Q;
+			 T3X = T3P - T3Q;
+			 {
+			      E T26, T28, T2a, T2b;
+			      T26 = T25 * T1R;
+			      T28 = T27 * T1T;
+			      T29 = T26 + T28;
+			      T2a = T25 * T1T;
+			      T2b = T27 * T1R;
+			      T2c = T2a - T2b;
+			      T4H = T26 - T28;
+			      T4L = T2a + T2b;
+			      T1L = W[6];
+			      T1M = W[7];
+			      T1N = FMA(TD, T1L, TG * T1M);
+			      T2d = FMA(T29, T1L, T2c * T1M);
+			      T4R = FNMS(T1t, T1L, TJ * T1M);
+			      T1P = FNMS(TG, T1L, TD * T1M);
+			      T4P = FMA(TJ, T1L, T1t * T1M);
+			      T49 = FNMS(T27, T1L, T25 * T1M);
+			      T2N = FNMS(TH, T1L, TE * T1M);
+			      T2f = FNMS(T2c, T1L, T29 * T1M);
+			      T47 = FMA(T25, T1L, T27 * T1M);
+			      T2L = FMA(TE, T1L, TH * T1M);
+			 }
+		    }
+	       }
+	       {
+		    E T7, T4i, T4x, TK, T1D, T3i, T3E, T2D, T19, T3L, T3M, T1o, T2x, T4C, T4B;
+		    E T2u, T1v, T4r, T4o, T1u, T2H, T37, T2I, T3e, T3p, T3w, T3x, Tm, TB, TC;
+		    E T4u, T4v, T4y, T2A, T2B, T2E, T1E, T1F, T1G, T4d, T4g, T4j, T3F, T3G, T3H;
+		    E TN, TQ, TR, T48, T4a;
+		    {
+			 E T3, T3g, T1z, T3C, T6, T3D, T1C, T3h;
+			 {
+			      E T1, T2, T1x, T1y;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 9)];
+			      T3 = T1 + T2;
+			      T3g = T1 - T2;
+			      T1x = Ip[0];
+			      T1y = Im[WS(rs, 9)];
+			      T1z = T1x - T1y;
+			      T3C = T1x + T1y;
+			 }
+			 {
+			      E T4, T5, T1A, T1B;
+			      T4 = Rp[WS(rs, 5)];
+			      T5 = Rm[WS(rs, 4)];
+			      T6 = T4 + T5;
+			      T3D = T4 - T5;
+			      T1A = Ip[WS(rs, 5)];
+			      T1B = Im[WS(rs, 4)];
+			      T1C = T1A - T1B;
+			      T3h = T1A + T1B;
+			 }
+			 T7 = T3 + T6;
+			 T4i = T3g - T3h;
+			 T4x = T3D + T3C;
+			 TK = T3 - T6;
+			 T1D = T1z - T1C;
+			 T3i = T3g + T3h;
+			 T3E = T3C - T3D;
+			 T2D = T1z + T1C;
+		    }
+		    {
+			 E Te, T4b, T4m, TL, T11, T33, T3l, T2s, TA, T4f, T4q, TP, T1n, T3d, T3v;
+			 E T2w, Tl, T4c, T4n, TM, T18, T36, T3o, T2t, Tt, T4e, T4p, TO, T1g, T3a;
+			 E T3s, T2v;
+			 {
+			      E Ta, T3j, TX, T31, Td, T32, T10, T3k;
+			      {
+				   E T8, T9, TV, TW;
+				   T8 = Rp[WS(rs, 4)];
+				   T9 = Rm[WS(rs, 5)];
+				   Ta = T8 + T9;
+				   T3j = T8 - T9;
+				   TV = Ip[WS(rs, 4)];
+				   TW = Im[WS(rs, 5)];
+				   TX = TV - TW;
+				   T31 = TV + TW;
+			      }
+			      {
+				   E Tb, Tc, TY, TZ;
+				   Tb = Rp[WS(rs, 9)];
+				   Tc = Rm[0];
+				   Td = Tb + Tc;
+				   T32 = Tb - Tc;
+				   TY = Ip[WS(rs, 9)];
+				   TZ = Im[0];
+				   T10 = TY - TZ;
+				   T3k = TY + TZ;
+			      }
+			      Te = Ta + Td;
+			      T4b = T3j - T3k;
+			      T4m = T32 + T31;
+			      TL = Ta - Td;
+			      T11 = TX - T10;
+			      T33 = T31 - T32;
+			      T3l = T3j + T3k;
+			      T2s = TX + T10;
+			 }
+			 {
+			      E Tw, T3t, T1j, T3c, Tz, T3b, T1m, T3u;
+			      {
+				   E Tu, Tv, T1h, T1i;
+				   Tu = Rm[WS(rs, 7)];
+				   Tv = Rp[WS(rs, 2)];
+				   Tw = Tu + Tv;
+				   T3t = Tu - Tv;
+				   T1h = Ip[WS(rs, 2)];
+				   T1i = Im[WS(rs, 7)];
+				   T1j = T1h - T1i;
+				   T3c = T1h + T1i;
+			      }
+			      {
+				   E Tx, Ty, T1k, T1l;
+				   Tx = Rm[WS(rs, 2)];
+				   Ty = Rp[WS(rs, 7)];
+				   Tz = Tx + Ty;
+				   T3b = Tx - Ty;
+				   T1k = Ip[WS(rs, 7)];
+				   T1l = Im[WS(rs, 2)];
+				   T1m = T1k - T1l;
+				   T3u = T1k + T1l;
+			      }
+			      TA = Tw + Tz;
+			      T4f = T3t + T3u;
+			      T4q = T3b - T3c;
+			      TP = Tw - Tz;
+			      T1n = T1j - T1m;
+			      T3d = T3b + T3c;
+			      T3v = T3t - T3u;
+			      T2w = T1j + T1m;
+			 }
+			 {
+			      E Th, T3m, T14, T35, Tk, T34, T17, T3n;
+			      {
+				   E Tf, Tg, T12, T13;
+				   Tf = Rm[WS(rs, 3)];
+				   Tg = Rp[WS(rs, 6)];
+				   Th = Tf + Tg;
+				   T3m = Tf - Tg;
+				   T12 = Ip[WS(rs, 6)];
+				   T13 = Im[WS(rs, 3)];
+				   T14 = T12 - T13;
+				   T35 = T12 + T13;
+			      }
+			      {
+				   E Ti, Tj, T15, T16;
+				   Ti = Rp[WS(rs, 1)];
+				   Tj = Rm[WS(rs, 8)];
+				   Tk = Ti + Tj;
+				   T34 = Ti - Tj;
+				   T15 = Ip[WS(rs, 1)];
+				   T16 = Im[WS(rs, 8)];
+				   T17 = T15 - T16;
+				   T3n = T15 + T16;
+			      }
+			      Tl = Th + Tk;
+			      T4c = T3m - T3n;
+			      T4n = T34 - T35;
+			      TM = Th - Tk;
+			      T18 = T14 - T17;
+			      T36 = T34 + T35;
+			      T3o = T3m + T3n;
+			      T2t = T14 + T17;
+			 }
+			 {
+			      E Tp, T3q, T1c, T38, Ts, T39, T1f, T3r;
+			      {
+				   E Tn, To, T1a, T1b;
+				   Tn = Rp[WS(rs, 8)];
+				   To = Rm[WS(rs, 1)];
+				   Tp = Tn + To;
+				   T3q = Tn - To;
+				   T1a = Ip[WS(rs, 8)];
+				   T1b = Im[WS(rs, 1)];
+				   T1c = T1a - T1b;
+				   T38 = T1a + T1b;
+			      }
+			      {
+				   E Tq, Tr, T1d, T1e;
+				   Tq = Rm[WS(rs, 6)];
+				   Tr = Rp[WS(rs, 3)];
+				   Ts = Tq + Tr;
+				   T39 = Tq - Tr;
+				   T1d = Ip[WS(rs, 3)];
+				   T1e = Im[WS(rs, 6)];
+				   T1f = T1d - T1e;
+				   T3r = T1d + T1e;
+			      }
+			      Tt = Tp + Ts;
+			      T4e = T3q + T3r;
+			      T4p = T39 + T38;
+			      TO = Tp - Ts;
+			      T1g = T1c - T1f;
+			      T3a = T38 - T39;
+			      T3s = T3q - T3r;
+			      T2v = T1c + T1f;
+			 }
+			 T19 = T11 - T18;
+			 T3L = T3l - T3o;
+			 T3M = T3s - T3v;
+			 T1o = T1g - T1n;
+			 T2x = T2v - T2w;
+			 T4C = T4e - T4f;
+			 T4B = T4b - T4c;
+			 T2u = T2s - T2t;
+			 T1v = TO - TP;
+			 T4r = T4p - T4q;
+			 T4o = T4m - T4n;
+			 T1u = TL - TM;
+			 T2H = Te - Tl;
+			 T37 = T33 + T36;
+			 T2I = Tt - TA;
+			 T3e = T3a + T3d;
+			 T3p = T3l + T3o;
+			 T3w = T3s + T3v;
+			 T3x = T3p + T3w;
+			 Tm = Te + Tl;
+			 TB = Tt + TA;
+			 TC = Tm + TB;
+			 T4u = T4m + T4n;
+			 T4v = T4p + T4q;
+			 T4y = T4u + T4v;
+			 T2A = T2s + T2t;
+			 T2B = T2v + T2w;
+			 T2E = T2A + T2B;
+			 T1E = T11 + T18;
+			 T1F = T1g + T1n;
+			 T1G = T1E + T1F;
+			 T4d = T4b + T4c;
+			 T4g = T4e + T4f;
+			 T4j = T4d + T4g;
+			 T3F = T33 - T36;
+			 T3G = T3a - T3d;
+			 T3H = T3F + T3G;
+			 TN = TL + TM;
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+		    }
+		    Rp[0] = T7 + TC;
+		    Rm[0] = T2D + T2E;
+		    {
+			 E T2k, T2o, T4T, T4U;
+			 T2k = TK + TR;
+			 T2o = T1D + T1G;
+			 Rp[WS(rs, 5)] = FNMS(T2n, T2o, T2j * T2k);
+			 Rm[WS(rs, 5)] = FMA(T2n, T2k, T2j * T2o);
+			 T4T = T4i + T4j;
+			 T4U = T4x + T4y;
+			 Ip[WS(rs, 2)] = FNMS(T2c, T4U, T29 * T4T);
+			 Im[WS(rs, 2)] = FMA(T29, T4U, T2c * T4T);
+		    }
+		    T48 = T3i + T3x;
+		    T4a = T3E + T3H;
+		    Ip[WS(rs, 7)] = FNMS(T49, T4a, T47 * T48);
+		    Im[WS(rs, 7)] = FMA(T47, T4a, T49 * T48);
+		    {
+			 E T2y, T2J, T2V, T2R, T2G, T2U, T2r, T2Q;
+			 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
+			 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
+			 T2V = FNMS(KP951056516, T2I, KP587785252 * T2H);
+			 T2R = FNMS(KP951056516, T2x, KP587785252 * T2u);
+			 {
+			      E T2C, T2F, T2p, T2q;
+			      T2C = KP559016994 * (T2A - T2B);
+			      T2F = FNMS(KP250000000, T2E, T2D);
+			      T2G = T2C + T2F;
+			      T2U = T2F - T2C;
+			      T2p = KP559016994 * (Tm - TB);
+			      T2q = FNMS(KP250000000, TC, T7);
+			      T2r = T2p + T2q;
+			      T2Q = T2q - T2p;
+			 }
+			 {
+			      E T2z, T2K, T2Y, T30;
+			      T2z = T2r + T2y;
+			      T2K = T2G - T2J;
+			      Rp[WS(rs, 2)] = FNMS(T27, T2K, T25 * T2z);
+			      Rm[WS(rs, 2)] = FMA(T27, T2z, T25 * T2K);
+			      T2Y = T2Q - T2R;
+			      T30 = T2V + T2U;
+			      Rp[WS(rs, 6)] = FNMS(T2Z, T30, T2X * T2Y);
+			      Rm[WS(rs, 6)] = FMA(T2Z, T2Y, T2X * T30);
+			 }
+			 {
+			      E T2M, T2O, T2S, T2W;
+			      T2M = T2r - T2y;
+			      T2O = T2J + T2G;
+			      Rp[WS(rs, 8)] = FNMS(T2N, T2O, T2L * T2M);
+			      Rm[WS(rs, 8)] = FMA(T2N, T2M, T2L * T2O);
+			      T2S = T2Q + T2R;
+			      T2W = T2U - T2V;
+			      Rp[WS(rs, 4)] = FNMS(T2T, T2W, T2P * T2S);
+			      Rm[WS(rs, 4)] = FMA(T2T, T2S, T2P * T2W);
+			 }
+		    }
+		    {
+			 E T4s, T4D, T4N, T4I, T4A, T4M, T4l, T4J;
+			 T4s = FMA(KP951056516, T4o, KP587785252 * T4r);
+			 T4D = FMA(KP951056516, T4B, KP587785252 * T4C);
+			 T4N = FNMS(KP951056516, T4C, KP587785252 * T4B);
+			 T4I = FNMS(KP951056516, T4r, KP587785252 * T4o);
+			 {
+			      E T4w, T4z, T4h, T4k;
+			      T4w = KP559016994 * (T4u - T4v);
+			      T4z = FNMS(KP250000000, T4y, T4x);
+			      T4A = T4w + T4z;
+			      T4M = T4z - T4w;
+			      T4h = KP559016994 * (T4d - T4g);
+			      T4k = FNMS(KP250000000, T4j, T4i);
+			      T4l = T4h + T4k;
+			      T4J = T4k - T4h;
+			 }
+			 {
+			      E T4t, T4E, T4Q, T4S;
+			      T4t = T4l - T4s;
+			      T4E = T4A + T4D;
+			      Ip[0] = FNMS(TG, T4E, TD * T4t);
+			      Im[0] = FMA(TD, T4E, TG * T4t);
+			      T4Q = T4J - T4I;
+			      T4S = T4M + T4N;
+			      Ip[WS(rs, 8)] = FNMS(T4R, T4S, T4P * T4Q);
+			      Im[WS(rs, 8)] = FMA(T4P, T4S, T4R * T4Q);
+			 }
+			 {
+			      E T4F, T4G, T4K, T4O;
+			      T4F = T4s + T4l;
+			      T4G = T4A - T4D;
+			      Ip[WS(rs, 4)] = FNMS(T1T, T4G, T1R * T4F);
+			      Im[WS(rs, 4)] = FMA(T1R, T4G, T1T * T4F);
+			      T4K = T4I + T4J;
+			      T4O = T4M - T4N;
+			      Ip[WS(rs, 6)] = FNMS(T4L, T4O, T4H * T4K);
+			      Im[WS(rs, 6)] = FMA(T4H, T4O, T4L * T4K);
+			 }
+		    }
+		    {
+			 E T1p, T1w, T22, T1X, T1J, T23, TU, T1W;
+			 T1p = FNMS(KP951056516, T1o, KP587785252 * T19);
+			 T1w = FNMS(KP951056516, T1v, KP587785252 * T1u);
+			 T22 = FMA(KP951056516, T1u, KP587785252 * T1v);
+			 T1X = FMA(KP951056516, T19, KP587785252 * T1o);
+			 {
+			      E T1H, T1I, TS, TT;
+			      T1H = FNMS(KP250000000, T1G, T1D);
+			      T1I = KP559016994 * (T1E - T1F);
+			      T1J = T1H - T1I;
+			      T23 = T1I + T1H;
+			      TS = FNMS(KP250000000, TR, TK);
+			      TT = KP559016994 * (TN - TQ);
+			      TU = TS - TT;
+			      T1W = TT + TS;
+			 }
+			 {
+			      E T1q, T1K, T2e, T2g;
+			      T1q = TU - T1p;
+			      T1K = T1w + T1J;
+			      Rp[WS(rs, 1)] = FNMS(T1t, T1K, TJ * T1q);
+			      Rm[WS(rs, 1)] = FMA(T1t, T1q, TJ * T1K);
+			      T2e = T1W + T1X;
+			      T2g = T23 - T22;
+			      Rp[WS(rs, 7)] = FNMS(T2f, T2g, T2d * T2e);
+			      Rm[WS(rs, 7)] = FMA(T2f, T2e, T2d * T2g);
+			 }
+			 {
+			      E T1O, T1Q, T1Y, T24;
+			      T1O = TU + T1p;
+			      T1Q = T1J - T1w;
+			      Rp[WS(rs, 9)] = FNMS(T1P, T1Q, T1N * T1O);
+			      Rm[WS(rs, 9)] = FMA(T1P, T1O, T1N * T1Q);
+			      T1Y = T1W - T1X;
+			      T24 = T22 + T23;
+			      Rp[WS(rs, 3)] = FNMS(T21, T24, T1V * T1Y);
+			      Rm[WS(rs, 3)] = FMA(T21, T1Y, T1V * T24);
+			 }
+		    }
+		    {
+			 E T3f, T3N, T43, T3Z, T3K, T42, T3A, T3Y;
+			 T3f = FNMS(KP951056516, T3e, KP587785252 * T37);
+			 T3N = FNMS(KP951056516, T3M, KP587785252 * T3L);
+			 T43 = FMA(KP951056516, T3L, KP587785252 * T3M);
+			 T3Z = FMA(KP951056516, T37, KP587785252 * T3e);
+			 {
+			      E T3I, T3J, T3y, T3z;
+			      T3I = FNMS(KP250000000, T3H, T3E);
+			      T3J = KP559016994 * (T3F - T3G);
+			      T3K = T3I - T3J;
+			      T42 = T3J + T3I;
+			      T3y = FNMS(KP250000000, T3x, T3i);
+			      T3z = KP559016994 * (T3p - T3w);
+			      T3A = T3y - T3z;
+			      T3Y = T3z + T3y;
+			 }
+			 {
+			      E T3B, T3O, T45, T46;
+			      T3B = T3f + T3A;
+			      T3O = T3K - T3N;
+			      Ip[WS(rs, 1)] = FNMS(TH, T3O, TE * T3B);
+			      Im[WS(rs, 1)] = FMA(TE, T3O, TH * T3B);
+			      T45 = T3Z + T3Y;
+			      T46 = T42 - T43;
+			      Ip[WS(rs, 9)] = FNMS(T1M, T46, T1L * T45);
+			      Im[WS(rs, 9)] = FMA(T1L, T46, T1M * T45);
+			 }
+			 {
+			      E T3S, T3W, T40, T44;
+			      T3S = T3A - T3f;
+			      T3W = T3K + T3N;
+			      Ip[WS(rs, 3)] = FNMS(T3V, T3W, T3R * T3S);
+			      Im[WS(rs, 3)] = FMA(T3R, T3W, T3V * T3S);
+			      T40 = T3Y - T3Z;
+			      T44 = T42 + T43;
+			      Ip[WS(rs, 5)] = FNMS(T41, T44, T3X * T40);
+			      Im[WS(rs, 5)] = FMA(T3X, T44, T41 * T40);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cb2_20", twinstr, &GENUS, {204, 92, 72, 0} };
+
+void X(codelet_hc2cb2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_20, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1855 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:58 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 32 -dif -name hc2cb2_32 -include hc2cb.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 204 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T5u, T6b, T6e, T5I, T66, T60, T5U, T5R, T67, T5L, T61, T5x, T5A, T5D, T5O;
+	       E T62, T5V, T5P;
+	       {
+		    E T11, T14, T12, T37, T17, T1b, T39, T15, T7C, T8P, T8S, T7I, T98, T7e, T78;
+		    E T8V, T3d, T3x, T3a, T3v, T9s, T3G, T4p, T5X, T16, T9m, T3y, T4b, T3C, T4g;
+		    E T5Z, T1a, T4r, T3J, T2O, T1c, T4W, T4s, T3Y, T3K, T3l, T3e, T3i, T3q, T8K;
+		    E T8E, T8m, T7S, T5k, T5e;
+		    {
+			 E T13, T3c, T38, T3F, T7B, T9l, T77, T7d, T9r, T7H;
+			 T11 = W[2];
+			 T14 = W[3];
+			 T12 = W[4];
+			 T37 = W[0];
+			 T17 = W[6];
+			 T1b = W[7];
+			 T13 = T11 * T12;
+			 T3c = T37 * T14;
+			 T38 = T37 * T11;
+			 T3F = T37 * T12;
+			 T7B = T11 * T17;
+			 T9l = T12 * T17;
+			 T77 = T37 * T17;
+			 T7d = T37 * T1b;
+			 T9r = T12 * T1b;
+			 T7H = T11 * T1b;
+			 T39 = W[1];
+			 T15 = W[5];
+			 {
+			      E T3I, T19, T5d, T3b, T18, T2N;
+			      T7C = FMA(T14, T1b, T7B);
+			      T8P = FNMS(T14, T1b, T7B);
+			      T8S = FMA(T14, T17, T7H);
+			      T7I = FNMS(T14, T17, T7H);
+			      T98 = FNMS(T39, T17, T7d);
+			      T7e = FMA(T39, T17, T7d);
+			      T78 = FNMS(T39, T1b, T77);
+			      T8V = FMA(T39, T1b, T77);
+			      T3d = FMA(T39, T11, T3c);
+			      T3x = FNMS(T39, T11, T3c);
+			      T3a = FNMS(T39, T14, T38);
+			      T3v = FMA(T39, T14, T38);
+			      T9s = FNMS(T15, T17, T9r);
+			      T3G = FNMS(T39, T15, T3F);
+			      T4p = FMA(T39, T15, T3F);
+			      T5X = FNMS(T14, T15, T13);
+			      T16 = FMA(T14, T15, T13);
+			      T3I = T37 * T15;
+			      T19 = T11 * T15;
+			      T5d = T3v * T12;
+			      T3b = T3a * T12;
+			      T9m = FMA(T15, T1b, T9l);
+			      {
+				   E T3w, T3B, T5t, T5H;
+				   T3w = T3v * T17;
+				   T3B = T3v * T1b;
+				   T5t = T3a * T17;
+				   T5H = T3a * T1b;
+				   T3y = FNMS(T3x, T1b, T3w);
+				   T4b = FMA(T3x, T1b, T3w);
+				   T3C = FMA(T3x, T17, T3B);
+				   T4g = FNMS(T3x, T17, T3B);
+				   T5u = FMA(T3d, T1b, T5t);
+				   T6b = FNMS(T3d, T1b, T5t);
+				   T6e = FMA(T3d, T17, T5H);
+				   T5I = FNMS(T3d, T17, T5H);
+				   T18 = T16 * T17;
+				   T2N = T16 * T1b;
+				   T5Z = FMA(T14, T12, T19);
+				   T1a = FNMS(T14, T12, T19);
+			      }
+			      {
+				   E T3H, T3X, T4q, T4V, T5Y, T65;
+				   T4q = T4p * T17;
+				   T4V = T4p * T1b;
+				   T4r = FNMS(T39, T12, T3I);
+				   T3J = FMA(T39, T12, T3I);
+				   T2O = FNMS(T1a, T17, T2N);
+				   T1c = FMA(T1a, T1b, T18);
+				   T3H = T3G * T17;
+				   T4W = FNMS(T4r, T17, T4V);
+				   T4s = FMA(T4r, T1b, T4q);
+				   T3X = T3G * T1b;
+				   T5Y = T5X * T17;
+				   T65 = T5X * T1b;
+				   T3Y = FNMS(T3J, T17, T3X);
+				   T3K = FMA(T3J, T1b, T3H);
+				   {
+					E T8J, T8D, T3h, T5j, T8l, T7R;
+					T3h = T3a * T15;
+					T66 = FNMS(T5Z, T17, T65);
+					T60 = FMA(T5Z, T1b, T5Y);
+					T3l = FNMS(T3d, T15, T3b);
+					T3e = FMA(T3d, T15, T3b);
+					T3i = FNMS(T3d, T12, T3h);
+					T3q = FMA(T3d, T12, T3h);
+					T8J = T3l * T1b;
+					T8D = T3l * T17;
+					T5j = T3v * T15;
+					T8l = T3e * T1b;
+					T7R = T3e * T17;
+					T8K = FNMS(T3q, T17, T8J);
+					T8E = FMA(T3q, T1b, T8D);
+					T8m = FNMS(T3i, T17, T8l);
+					T7S = FMA(T3i, T1b, T7R);
+					T5U = FNMS(T3x, T12, T5j);
+					T5k = FMA(T3x, T12, T5j);
+					T5e = FNMS(T3x, T15, T5d);
+					T5R = FMA(T3x, T15, T5d);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6O, T6i, T7s, T7o, T6j, Tf, T8W, T7V, T99, T8p, T3L, T1t, T3Z, T2X, T5J;
+			 E T4Z, T7t, T6W, T5v, T4v, TZ, T7x, T91, T9d, T28, T3S, T3R, T2h, T5B, T4Q;
+			 E T8v, T8a, T5C, T4N, T6Z, T6J, TK, T7w, T3P, T2z, T9c, T94, T3O, T2I, T5y;
+			 E T4J, T8u, T8h, T5z, T4G, T6Y, T6A, T6P, Tu, T9a, T82, T8X, T8s, T4y, T40;
+			 E T1Q, T3M, T30, T4B, T5w, T52, T7u, T6q;
+			 {
+			      E T6B, T6I, T4M, T4L, T4t, T4u, T6s, T6z;
+			      {
+				   E T1d, T3, T6Q, T2S, T2P, T6, T6R, T1g, Td, T6U, T1i, Ta, T2V, T1r, T6T;
+				   E T1l;
+				   {
+					E T2Q, T2R, T4, T5, T1, T2, T1e, T1f;
+					T1 = Rp[0];
+					T2 = Rm[WS(rs, 15)];
+					{
+					     E T6N, T6h, T7r, T7n;
+					     T6N = T5R * T1b;
+					     T6h = T5R * T17;
+					     T7r = T5e * T1b;
+					     T7n = T5e * T17;
+					     T6O = FNMS(T5U, T17, T6N);
+					     T6i = FMA(T5U, T1b, T6h);
+					     T7s = FNMS(T5k, T17, T7r);
+					     T7o = FMA(T5k, T1b, T7n);
+					     T1d = T1 - T2;
+					     T3 = T1 + T2;
+					}
+					T2Q = Ip[0];
+					T2R = Im[WS(rs, 15)];
+					T4 = Rp[WS(rs, 8)];
+					T5 = Rm[WS(rs, 7)];
+					T1e = Ip[WS(rs, 8)];
+					T6Q = T2Q - T2R;
+					T2S = T2Q + T2R;
+					T2P = T4 - T5;
+					T6 = T4 + T5;
+					T1f = Im[WS(rs, 7)];
+					{
+					     E T1o, T1n, T1p, Tb, Tc;
+					     Tb = Rm[WS(rs, 3)];
+					     Tc = Rp[WS(rs, 12)];
+					     T1o = Ip[WS(rs, 12)];
+					     T6R = T1e - T1f;
+					     T1g = T1e + T1f;
+					     T1n = Tb - Tc;
+					     Td = Tb + Tc;
+					     T1p = Im[WS(rs, 3)];
+					     {
+						  E T1j, T1k, T8, T9, T1q;
+						  T8 = Rp[WS(rs, 4)];
+						  T9 = Rm[WS(rs, 11)];
+						  T1q = T1o + T1p;
+						  T6U = T1o - T1p;
+						  T1j = Ip[WS(rs, 4)];
+						  T1i = T8 - T9;
+						  Ta = T8 + T9;
+						  T1k = Im[WS(rs, 11)];
+						  T2V = T1n + T1q;
+						  T1r = T1n - T1q;
+						  T6T = T1j - T1k;
+						  T1l = T1j + T1k;
+					     }
+					}
+				   }
+				   {
+					E T2U, T6V, T6S, T1h, T1s, T4Y, T4X, T2T, T2W;
+					{
+					     E T7T, T8o, T1m, T7U, T7, Te, T8n;
+					     T7T = T3 - T6;
+					     T7 = T3 + T6;
+					     Te = Ta + Td;
+					     T8o = Ta - Td;
+					     T1m = T1i - T1l;
+					     T2U = T1i + T1l;
+					     T6j = T7 - Te;
+					     Tf = T7 + Te;
+					     T7U = T6U - T6T;
+					     T6V = T6T + T6U;
+					     T6S = T6Q + T6R;
+					     T8n = T6Q - T6R;
+					     T4t = T1d + T1g;
+					     T1h = T1d - T1g;
+					     T8W = T7T + T7U;
+					     T7V = T7T - T7U;
+					     T99 = T8o + T8n;
+					     T8p = T8n - T8o;
+					     T1s = T1m + T1r;
+					     T4Y = T1m - T1r;
+					}
+					T4X = T2S - T2P;
+					T2T = T2P + T2S;
+					T2W = T2U - T2V;
+					T4u = T2U + T2V;
+					T3L = FMA(KP707106781, T1s, T1h);
+					T1t = FNMS(KP707106781, T1s, T1h);
+					T3Z = FMA(KP707106781, T2W, T2T);
+					T2X = FNMS(KP707106781, T2W, T2T);
+					T5J = FNMS(KP707106781, T4Y, T4X);
+					T4Z = FMA(KP707106781, T4Y, T4X);
+					T7t = T6S + T6V;
+					T6W = T6S - T6V;
+				   }
+			      }
+			      {
+				   E T29, T1S, T1V, T87, TR, T2c, T84, T6E, TU, T23, T6F, T22, TX, T24, T2e;
+				   E T21;
+				   {
+					E TO, TN, TP, TL, TM;
+					TL = Rm[0];
+					TM = Rp[WS(rs, 15)];
+					TO = Rp[WS(rs, 7)];
+					T5v = FMA(KP707106781, T4u, T4t);
+					T4v = FNMS(KP707106781, T4u, T4t);
+					TN = TL + TM;
+					T29 = TL - TM;
+					TP = Rm[WS(rs, 8)];
+					{
+					     E T6C, T6D, T1X, T20;
+					     {
+						  E T2a, T2b, T1T, T1U, TQ;
+						  T1T = Ip[WS(rs, 15)];
+						  T1U = Im[0];
+						  TQ = TO + TP;
+						  T1S = TO - TP;
+						  T2a = Ip[WS(rs, 7)];
+						  T6C = T1T - T1U;
+						  T1V = T1T + T1U;
+						  T2b = Im[WS(rs, 8)];
+						  T87 = TN - TQ;
+						  TR = TN + TQ;
+						  T2c = T2a + T2b;
+						  T6D = T2a - T2b;
+					     }
+					     {
+						  E T1Y, T1Z, TS, TT, TV, TW;
+						  TS = Rp[WS(rs, 3)];
+						  TT = Rm[WS(rs, 12)];
+						  T84 = T6C - T6D;
+						  T6E = T6C + T6D;
+						  T1Y = Ip[WS(rs, 3)];
+						  T1X = TS - TT;
+						  TU = TS + TT;
+						  T1Z = Im[WS(rs, 12)];
+						  TV = Rm[WS(rs, 4)];
+						  TW = Rp[WS(rs, 11)];
+						  T23 = Ip[WS(rs, 11)];
+						  T6F = T1Y - T1Z;
+						  T20 = T1Y + T1Z;
+						  T22 = TV - TW;
+						  TX = TV + TW;
+						  T24 = Im[WS(rs, 4)];
+					     }
+					     T2e = T1X - T20;
+					     T21 = T1X + T20;
+					}
+				   }
+				   {
+					E TY, T85, T25, T6G;
+					TY = TU + TX;
+					T85 = TU - TX;
+					T25 = T23 + T24;
+					T6G = T23 - T24;
+					{
+					     E T4O, T1W, T2f, T8Z, T86, T89, T90, T27, T88, T26, T6H, T4P, T2d, T2g;
+					     T4O = T1S + T1V;
+					     T1W = T1S - T1V;
+					     TZ = TR + TY;
+					     T6B = TR - TY;
+					     T88 = T6G - T6F;
+					     T6H = T6F + T6G;
+					     T26 = T22 + T25;
+					     T2f = T22 - T25;
+					     T6I = T6E - T6H;
+					     T7x = T6E + T6H;
+					     T8Z = T85 + T84;
+					     T86 = T84 - T85;
+					     T89 = T87 - T88;
+					     T90 = T87 + T88;
+					     T27 = T21 - T26;
+					     T4M = T21 + T26;
+					     T4L = T29 + T2c;
+					     T2d = T29 - T2c;
+					     T2g = T2e + T2f;
+					     T4P = T2e - T2f;
+					     T91 = FNMS(KP414213562, T90, T8Z);
+					     T9d = FMA(KP414213562, T8Z, T90);
+					     T28 = FNMS(KP707106781, T27, T1W);
+					     T3S = FMA(KP707106781, T27, T1W);
+					     T3R = FMA(KP707106781, T2g, T2d);
+					     T2h = FNMS(KP707106781, T2g, T2d);
+					     T5B = FMA(KP707106781, T4P, T4O);
+					     T4Q = FNMS(KP707106781, T4P, T4O);
+					     T8v = FNMS(KP414213562, T86, T89);
+					     T8a = FMA(KP414213562, T89, T86);
+					}
+				   }
+			      }
+			      {
+				   E T2A, T2j, TC, T8e, T2m, T2D, T6v, T8b, TF, T6w, T2F, T2s, T2t, TI, T6x;
+				   E T2w, TJ, T8c;
+				   {
+					E Tw, Tx, Tz, TA, T6t, T6u;
+					Tw = Rp[WS(rs, 1)];
+					T5C = FMA(KP707106781, T4M, T4L);
+					T4N = FNMS(KP707106781, T4M, T4L);
+					T6Z = T6I - T6B;
+					T6J = T6B + T6I;
+					Tx = Rm[WS(rs, 14)];
+					Tz = Rp[WS(rs, 9)];
+					TA = Rm[WS(rs, 6)];
+					{
+					     E T2k, Ty, TB, T2l, T2B, T2C;
+					     T2k = Ip[WS(rs, 1)];
+					     T2A = Tw - Tx;
+					     Ty = Tw + Tx;
+					     T2j = Tz - TA;
+					     TB = Tz + TA;
+					     T2l = Im[WS(rs, 14)];
+					     T2B = Ip[WS(rs, 9)];
+					     T2C = Im[WS(rs, 6)];
+					     TC = Ty + TB;
+					     T8e = Ty - TB;
+					     T2m = T2k + T2l;
+					     T6t = T2k - T2l;
+					     T6u = T2B - T2C;
+					     T2D = T2B + T2C;
+					}
+					{
+					     E TG, T2o, T2r, TH, T2u, T2v;
+					     {
+						  E TD, TE, T2p, T2q;
+						  TD = Rp[WS(rs, 5)];
+						  T6v = T6t + T6u;
+						  T8b = T6t - T6u;
+						  TE = Rm[WS(rs, 10)];
+						  T2p = Ip[WS(rs, 5)];
+						  T2q = Im[WS(rs, 10)];
+						  TG = Rm[WS(rs, 2)];
+						  T2o = TD - TE;
+						  TF = TD + TE;
+						  T6w = T2p - T2q;
+						  T2r = T2p + T2q;
+						  TH = Rp[WS(rs, 13)];
+						  T2u = Ip[WS(rs, 13)];
+						  T2v = Im[WS(rs, 2)];
+					     }
+					     T2F = T2o - T2r;
+					     T2s = T2o + T2r;
+					     T2t = TG - TH;
+					     TI = TG + TH;
+					     T6x = T2u - T2v;
+					     T2w = T2u + T2v;
+					}
+				   }
+				   TJ = TF + TI;
+				   T8c = TF - TI;
+				   {
+					E T8f, T6y, T2x, T2G;
+					T8f = T6x - T6w;
+					T6y = T6w + T6x;
+					T2x = T2t + T2w;
+					T2G = T2t - T2w;
+					{
+					     E T4H, T2n, T2y, T4F, T8d, T92, T93, T8g;
+					     T6s = TC - TJ;
+					     TK = TC + TJ;
+					     T7w = T6v + T6y;
+					     T6z = T6v - T6y;
+					     T4H = T2m - T2j;
+					     T2n = T2j + T2m;
+					     T2y = T2s - T2x;
+					     T4F = T2s + T2x;
+					     T8d = T8b - T8c;
+					     T92 = T8c + T8b;
+					     T93 = T8e + T8f;
+					     T8g = T8e - T8f;
+					     {
+						  E T4E, T2E, T2H, T4I;
+						  T4E = T2A + T2D;
+						  T2E = T2A - T2D;
+						  T3P = FMA(KP707106781, T2y, T2n);
+						  T2z = FNMS(KP707106781, T2y, T2n);
+						  T9c = FNMS(KP414213562, T92, T93);
+						  T94 = FMA(KP414213562, T93, T92);
+						  T2H = T2F + T2G;
+						  T4I = T2G - T2F;
+						  T3O = FMA(KP707106781, T2H, T2E);
+						  T2I = FNMS(KP707106781, T2H, T2E);
+						  T5y = FMA(KP707106781, T4I, T4H);
+						  T4J = FNMS(KP707106781, T4I, T4H);
+						  T8u = FMA(KP414213562, T8d, T8g);
+						  T8h = FNMS(KP414213562, T8g, T8d);
+						  T5z = FMA(KP707106781, T4F, T4E);
+						  T4G = FNMS(KP707106781, T4F, T4E);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T4w, T1J, T7Z, Tm, T6p, T80, T4x, T1O, T1z, Tp, T1A, T6k, T1x, T1u, Ts;
+				   E T1B;
+				   {
+					E T1K, Ti, T1L, T6n, T1I, T1F, Tl, T1M;
+					{
+					     E T1G, T1H, Tg, Th, Tj, Tk;
+					     Tg = Rp[WS(rs, 2)];
+					     Th = Rm[WS(rs, 13)];
+					     T1G = Ip[WS(rs, 2)];
+					     T6Y = T6s + T6z;
+					     T6A = T6s - T6z;
+					     T1K = Tg - Th;
+					     Ti = Tg + Th;
+					     T1H = Im[WS(rs, 13)];
+					     Tj = Rp[WS(rs, 10)];
+					     Tk = Rm[WS(rs, 5)];
+					     T1L = Ip[WS(rs, 10)];
+					     T6n = T1G - T1H;
+					     T1I = T1G + T1H;
+					     T1F = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T1M = Im[WS(rs, 5)];
+					}
+					{
+					     E T1v, T1w, Tq, Tr;
+					     {
+						  E Tn, T1N, T6o, To;
+						  Tn = Rm[WS(rs, 1)];
+						  T4w = T1I - T1F;
+						  T1J = T1F + T1I;
+						  T7Z = Ti - Tl;
+						  Tm = Ti + Tl;
+						  T1N = T1L + T1M;
+						  T6o = T1L - T1M;
+						  To = Rp[WS(rs, 14)];
+						  T1v = Ip[WS(rs, 14)];
+						  T6p = T6n + T6o;
+						  T80 = T6n - T6o;
+						  T4x = T1K + T1N;
+						  T1O = T1K - T1N;
+						  T1z = Tn - To;
+						  Tp = Tn + To;
+						  T1w = Im[WS(rs, 1)];
+					     }
+					     Tq = Rp[WS(rs, 6)];
+					     Tr = Rm[WS(rs, 9)];
+					     T1A = Ip[WS(rs, 6)];
+					     T6k = T1v - T1w;
+					     T1x = T1v + T1w;
+					     T1u = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T1B = Im[WS(rs, 9)];
+					}
+				   }
+				   {
+					E T4z, T6m, T4A, T2Z, T1E, T1P, T2Y, T50, T51;
+					{
+					     E T1y, T81, T8q, T1D, T7Y, T8r;
+					     {
+						  E T7X, Tt, T1C, T6l, T7W;
+						  T4z = T1u + T1x;
+						  T1y = T1u - T1x;
+						  T7X = Tp - Ts;
+						  Tt = Tp + Ts;
+						  T1C = T1A + T1B;
+						  T6l = T1A - T1B;
+						  T81 = T7Z + T80;
+						  T8q = T7Z - T80;
+						  T6m = T6k + T6l;
+						  T7W = T6k - T6l;
+						  T4A = T1z + T1C;
+						  T1D = T1z - T1C;
+						  T6P = Tm - Tt;
+						  Tu = Tm + Tt;
+						  T7Y = T7W - T7X;
+						  T8r = T7X + T7W;
+					     }
+					     T2Z = FMA(KP414213562, T1y, T1D);
+					     T1E = FNMS(KP414213562, T1D, T1y);
+					     T9a = T81 + T7Y;
+					     T82 = T7Y - T81;
+					     T8X = T8q + T8r;
+					     T8s = T8q - T8r;
+					     T1P = FMA(KP414213562, T1O, T1J);
+					     T2Y = FNMS(KP414213562, T1J, T1O);
+					}
+					T4y = FNMS(KP414213562, T4x, T4w);
+					T50 = FMA(KP414213562, T4w, T4x);
+					T40 = T1P + T1E;
+					T1Q = T1E - T1P;
+					T3M = T2Y + T2Z;
+					T30 = T2Y - T2Z;
+					T51 = FMA(KP414213562, T4z, T4A);
+					T4B = FNMS(KP414213562, T4A, T4z);
+					T5w = T50 + T51;
+					T52 = T50 - T51;
+					T7u = T6p + T6m;
+					T6q = T6m - T6p;
+				   }
+			      }
+			 }
+			 {
+			      E T7D, T7K, T7J, T5K, T4C, T7E, T83, T8w, T8t, T8i, T6r, T70, T6X, T6K;
+			      {
+				   E T8Y, T9e, T9b, T95, T8F, T8G, T8L, T8M;
+				   {
+					E T7v, T7p, T7y, Tv, T10;
+					T7D = Tf - Tu;
+					Tv = Tf + Tu;
+					T10 = TK + TZ;
+					T7K = TK - TZ;
+					T7J = T7t - T7u;
+					T7v = T7t + T7u;
+					T5K = T4B - T4y;
+					T4C = T4y + T4B;
+					T7p = Tv - T10;
+					T7E = T7x - T7w;
+					T7y = T7w + T7x;
+					Rp[0] = Tv + T10;
+					{
+					     E T9p, T9x, T9z, T9v;
+					     {
+						  E T9n, T7A, T7q, T7z, T9o, T9t, T9u;
+						  T8Y = FNMS(KP707106781, T8X, T8W);
+						  T9n = FMA(KP707106781, T8X, T8W);
+						  T7A = T7s * T7p;
+						  T7q = T7o * T7p;
+						  Rm[0] = T7v + T7y;
+						  T7z = T7v - T7y;
+						  T9o = T9c + T9d;
+						  T9e = T9c - T9d;
+						  T9b = FNMS(KP707106781, T9a, T99);
+						  T9t = FMA(KP707106781, T9a, T99);
+						  T9u = T94 + T91;
+						  T95 = T91 - T94;
+						  Rm[WS(rs, 8)] = FMA(T7o, T7z, T7A);
+						  Rp[WS(rs, 8)] = FNMS(T7s, T7z, T7q);
+						  T9p = FNMS(KP923879532, T9o, T9n);
+						  T9x = FMA(KP923879532, T9o, T9n);
+						  T9z = FMA(KP923879532, T9u, T9t);
+						  T9v = FNMS(KP923879532, T9u, T9t);
+					     }
+					     {
+						  E T9y, T9q, T9w, T9A;
+						  T9y = T3v * T9x;
+						  T9q = T9m * T9p;
+						  T9w = T9m * T9v;
+						  T9A = T3v * T9z;
+						  Rp[WS(rs, 1)] = FNMS(T3x, T9z, T9y);
+						  Rp[WS(rs, 9)] = FNMS(T9s, T9v, T9q);
+						  Rm[WS(rs, 9)] = FMA(T9s, T9p, T9w);
+						  Rm[WS(rs, 1)] = FMA(T3x, T9x, T9A);
+					     }
+					}
+					T83 = FMA(KP707106781, T82, T7V);
+					T8F = FNMS(KP707106781, T82, T7V);
+					T8G = T8u + T8v;
+					T8w = T8u - T8v;
+					T8t = FMA(KP707106781, T8s, T8p);
+					T8L = FNMS(KP707106781, T8s, T8p);
+					T8M = T8h + T8a;
+					T8i = T8a - T8h;
+				   }
+				   {
+					E T79, T7a, T7f, T7g;
+					T6r = T6j + T6q;
+					T79 = T6j - T6q;
+					{
+					     E T8Q, T8H, T8T, T8N;
+					     T8Q = FMA(KP923879532, T8G, T8F);
+					     T8H = FNMS(KP923879532, T8G, T8F);
+					     T8T = FMA(KP923879532, T8M, T8L);
+					     T8N = FNMS(KP923879532, T8M, T8L);
+					     {
+						  E T8R, T8I, T8U, T8O;
+						  T8R = T8P * T8Q;
+						  T8I = T8E * T8H;
+						  T8U = T8P * T8T;
+						  T8O = T8E * T8N;
+						  Rp[WS(rs, 15)] = FNMS(T8S, T8T, T8R);
+						  Rp[WS(rs, 7)] = FNMS(T8K, T8N, T8I);
+						  Rm[WS(rs, 15)] = FMA(T8S, T8Q, T8U);
+						  Rm[WS(rs, 7)] = FMA(T8K, T8H, T8O);
+						  T7a = T6Z - T6Y;
+						  T70 = T6Y + T6Z;
+					     }
+					}
+					T6X = T6P + T6W;
+					T7f = T6W - T6P;
+					T7g = T6A - T6J;
+					T6K = T6A + T6J;
+					{
+					     E T7j, T7b, T7l, T7h;
+					     T7j = FMA(KP707106781, T7a, T79);
+					     T7b = FNMS(KP707106781, T7a, T79);
+					     T7l = FMA(KP707106781, T7g, T7f);
+					     T7h = FNMS(KP707106781, T7g, T7f);
+					     {
+						  E T7k, T7c, T7m, T7i;
+						  T7k = T5X * T7j;
+						  T7c = T78 * T7b;
+						  T7m = T5X * T7l;
+						  T7i = T78 * T7h;
+						  Rp[WS(rs, 6)] = FNMS(T5Z, T7l, T7k);
+						  Rp[WS(rs, 14)] = FNMS(T7e, T7h, T7c);
+						  Rm[WS(rs, 6)] = FMA(T5Z, T7j, T7m);
+						  Rm[WS(rs, 14)] = FMA(T7e, T7b, T7i);
+					     }
+					}
+					{
+					     E T9h, T96, T9j, T9f;
+					     T9h = FMA(KP923879532, T95, T8Y);
+					     T96 = FNMS(KP923879532, T95, T8Y);
+					     T9j = FMA(KP923879532, T9e, T9b);
+					     T9f = FNMS(KP923879532, T9e, T9b);
+					     {
+						  E T9k, T9i, T9g, T97;
+						  T9k = T3J * T9h;
+						  T9i = T3G * T9h;
+						  T9g = T98 * T96;
+						  T97 = T8V * T96;
+						  Rm[WS(rs, 5)] = FMA(T3G, T9j, T9k);
+						  Rp[WS(rs, 5)] = FNMS(T3J, T9j, T9i);
+						  Rm[WS(rs, 13)] = FMA(T8V, T9f, T9g);
+						  Rp[WS(rs, 13)] = FNMS(T98, T9f, T97);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T31, T3r, T1R, T3m, T33, T32, T3s, T2K, T8z, T8j;
+				   {
+					E T73, T6L, T75, T71;
+					T73 = FMA(KP707106781, T6K, T6r);
+					T6L = FNMS(KP707106781, T6K, T6r);
+					T75 = FMA(KP707106781, T70, T6X);
+					T71 = FNMS(KP707106781, T70, T6X);
+					{
+					     E T76, T74, T72, T6M;
+					     T76 = T3d * T73;
+					     T74 = T3a * T73;
+					     T72 = T6O * T6L;
+					     T6M = T6i * T6L;
+					     Rm[WS(rs, 2)] = FMA(T3a, T75, T76);
+					     Rp[WS(rs, 2)] = FNMS(T3d, T75, T74);
+					     Rm[WS(rs, 10)] = FMA(T6i, T71, T72);
+					     Rp[WS(rs, 10)] = FNMS(T6O, T71, T6M);
+					}
+				   }
+				   {
+					E T7N, T7F, T7P, T7L;
+					T7N = T7D + T7E;
+					T7F = T7D - T7E;
+					T7P = T7K + T7J;
+					T7L = T7J - T7K;
+					{
+					     E T7O, T7G, T7Q, T7M;
+					     T7O = T4p * T7N;
+					     T7G = T7C * T7F;
+					     T7Q = T4p * T7P;
+					     T7M = T7C * T7L;
+					     Rp[WS(rs, 4)] = FNMS(T4r, T7P, T7O);
+					     Rp[WS(rs, 12)] = FNMS(T7I, T7L, T7G);
+					     Rm[WS(rs, 4)] = FMA(T4r, T7N, T7Q);
+					     Rm[WS(rs, 12)] = FMA(T7I, T7F, T7M);
+					}
+				   }
+				   T31 = FMA(KP923879532, T30, T2X);
+				   T3r = FNMS(KP923879532, T30, T2X);
+				   T8z = FMA(KP923879532, T8i, T83);
+				   T8j = FNMS(KP923879532, T8i, T83);
+				   {
+					E T8B, T8x, T8C, T8A;
+					T8B = FMA(KP923879532, T8w, T8t);
+					T8x = FNMS(KP923879532, T8w, T8t);
+					T8C = T1a * T8z;
+					T8A = T16 * T8z;
+					{
+					     E T8y, T8k, T2i, T2J;
+					     T8y = T8m * T8j;
+					     T8k = T7S * T8j;
+					     Rm[WS(rs, 3)] = FMA(T16, T8B, T8C);
+					     Rp[WS(rs, 3)] = FNMS(T1a, T8B, T8A);
+					     Rm[WS(rs, 11)] = FMA(T7S, T8x, T8y);
+					     Rp[WS(rs, 11)] = FNMS(T8m, T8x, T8k);
+					     T1R = FMA(KP923879532, T1Q, T1t);
+					     T3m = FNMS(KP923879532, T1Q, T1t);
+					     T33 = FNMS(KP668178637, T28, T2h);
+					     T2i = FMA(KP668178637, T2h, T28);
+					     T2J = FNMS(KP668178637, T2I, T2z);
+					     T32 = FMA(KP668178637, T2z, T2I);
+					     T3s = T2J + T2i;
+					     T2K = T2i - T2J;
+					}
+				   }
+				   {
+					E T5l, T53, T5f, T4D, T4K, T4R, T56, T5g;
+					T5l = FNMS(KP923879532, T52, T4Z);
+					T53 = FMA(KP923879532, T52, T4Z);
+					{
+					     E T3t, T3D, T3f, T2L;
+					     T3t = FNMS(KP831469612, T3s, T3r);
+					     T3D = FMA(KP831469612, T3s, T3r);
+					     T3f = FMA(KP831469612, T2K, T1R);
+					     T2L = FNMS(KP831469612, T2K, T1R);
+					     {
+						  E T3n, T34, T3g, T2M;
+						  T3n = T32 + T33;
+						  T34 = T32 - T33;
+						  T3g = T3e * T3f;
+						  T2M = T1c * T2L;
+						  {
+						       E T3o, T3z, T3j, T35;
+						       T3o = FNMS(KP831469612, T3n, T3m);
+						       T3z = FMA(KP831469612, T3n, T3m);
+						       T3j = FMA(KP831469612, T34, T31);
+						       T35 = FNMS(KP831469612, T34, T31);
+						       {
+							    E T3u, T3p, T3E, T3A;
+							    T3u = T3q * T3o;
+							    T3p = T3l * T3o;
+							    T3E = T3C * T3z;
+							    T3A = T3y * T3z;
+							    {
+								 E T3k, T36, T54, T55;
+								 T3k = T3e * T3j;
+								 Ip[WS(rs, 2)] = FNMS(T3i, T3j, T3g);
+								 T36 = T1c * T35;
+								 Ip[WS(rs, 10)] = FNMS(T2O, T35, T2M);
+								 Im[WS(rs, 6)] = FMA(T3l, T3t, T3u);
+								 Ip[WS(rs, 6)] = FNMS(T3q, T3t, T3p);
+								 Im[WS(rs, 14)] = FMA(T3y, T3D, T3E);
+								 Ip[WS(rs, 14)] = FNMS(T3C, T3D, T3A);
+								 Im[WS(rs, 2)] = FMA(T3i, T3f, T3k);
+								 Im[WS(rs, 10)] = FMA(T2O, T2L, T36);
+								 T5f = FMA(KP923879532, T4C, T4v);
+								 T4D = FNMS(KP923879532, T4C, T4v);
+								 T4K = FNMS(KP668178637, T4J, T4G);
+								 T54 = FMA(KP668178637, T4G, T4J);
+								 T55 = FMA(KP668178637, T4N, T4Q);
+								 T4R = FNMS(KP668178637, T4Q, T4N);
+								 T56 = T54 - T55;
+								 T5g = T54 + T55;
+							    }
+						       }
+						  }
+					     }
+					}
+					{
+					     E T4h, T41, T4c, T3N, T3Q, T3T, T44, T4d;
+					     T4h = FNMS(KP923879532, T40, T3Z);
+					     T41 = FMA(KP923879532, T40, T3Z);
+					     {
+						  E T57, T5b, T5h, T5p;
+						  T57 = FNMS(KP831469612, T56, T53);
+						  T5b = FMA(KP831469612, T56, T53);
+						  T5h = FNMS(KP831469612, T5g, T5f);
+						  T5p = FMA(KP831469612, T5g, T5f);
+						  {
+						       E T5m, T4S, T5i, T5q;
+						       T5m = T4K - T4R;
+						       T4S = T4K + T4R;
+						       T5i = T5e * T5h;
+						       T5q = T17 * T5p;
+						       {
+							    E T5n, T5r, T59, T4T;
+							    T5n = FMA(KP831469612, T5m, T5l);
+							    T5r = FNMS(KP831469612, T5m, T5l);
+							    T59 = FMA(KP831469612, T4S, T4D);
+							    T4T = FNMS(KP831469612, T4S, T4D);
+							    {
+								 E T5o, T5s, T5c, T5a;
+								 T5o = T5e * T5n;
+								 Ip[WS(rs, 5)] = FNMS(T5k, T5n, T5i);
+								 T5s = T17 * T5r;
+								 Ip[WS(rs, 13)] = FNMS(T1b, T5r, T5q);
+								 T5c = T14 * T59;
+								 T5a = T11 * T59;
+								 {
+								      E T58, T4U, T42, T43;
+								      T58 = T4W * T4T;
+								      T4U = T4s * T4T;
+								      Im[WS(rs, 5)] = FMA(T5k, T5h, T5o);
+								      Im[WS(rs, 13)] = FMA(T1b, T5p, T5s);
+								      Im[WS(rs, 1)] = FMA(T11, T5b, T5c);
+								      Ip[WS(rs, 1)] = FNMS(T14, T5b, T5a);
+								      Im[WS(rs, 9)] = FMA(T4s, T57, T58);
+								      Ip[WS(rs, 9)] = FNMS(T4W, T57, T4U);
+								      T4c = FNMS(KP923879532, T3M, T3L);
+								      T3N = FMA(KP923879532, T3M, T3L);
+								      T3Q = FNMS(KP198912367, T3P, T3O);
+								      T42 = FMA(KP198912367, T3O, T3P);
+								      T43 = FNMS(KP198912367, T3R, T3S);
+								      T3T = FMA(KP198912367, T3S, T3R);
+								      T44 = T42 + T43;
+								      T4d = T43 - T42;
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T67 = FNMS(KP923879532, T5K, T5J);
+					     T5L = FMA(KP923879532, T5K, T5J);
+					     {
+						  E T45, T49, T4e, T4l;
+						  T45 = FNMS(KP980785280, T44, T41);
+						  T49 = FMA(KP980785280, T44, T41);
+						  T4e = FNMS(KP980785280, T4d, T4c);
+						  T4l = FMA(KP980785280, T4d, T4c);
+						  {
+						       E T4i, T3U, T4f, T4m;
+						       T4i = T3Q - T3T;
+						       T3U = T3Q + T3T;
+						       T4f = T4b * T4e;
+						       T4m = T12 * T4l;
+						       {
+							    E T4j, T4n, T47, T3V;
+							    T4j = FNMS(KP980785280, T4i, T4h);
+							    T4n = FMA(KP980785280, T4i, T4h);
+							    T47 = FMA(KP980785280, T3U, T3N);
+							    T3V = FNMS(KP980785280, T3U, T3N);
+							    {
+								 E T4k, T4o, T4a, T48;
+								 T4k = T4b * T4j;
+								 Ip[WS(rs, 12)] = FNMS(T4g, T4j, T4f);
+								 T4o = T12 * T4n;
+								 Ip[WS(rs, 4)] = FNMS(T15, T4n, T4m);
+								 T4a = T39 * T47;
+								 T48 = T37 * T47;
+								 {
+								      E T46, T3W, T5M, T5N;
+								      T46 = T3Y * T3V;
+								      T3W = T3K * T3V;
+								      Im[WS(rs, 12)] = FMA(T4g, T4e, T4k);
+								      Im[WS(rs, 4)] = FMA(T15, T4l, T4o);
+								      Im[0] = FMA(T37, T49, T4a);
+								      Ip[0] = FNMS(T39, T49, T48);
+								      Im[WS(rs, 8)] = FMA(T3K, T45, T46);
+								      Ip[WS(rs, 8)] = FNMS(T3Y, T45, T3W);
+								      T61 = FMA(KP923879532, T5w, T5v);
+								      T5x = FNMS(KP923879532, T5w, T5v);
+								      T5A = FNMS(KP198912367, T5z, T5y);
+								      T5M = FMA(KP198912367, T5y, T5z);
+								      T5N = FMA(KP198912367, T5B, T5C);
+								      T5D = FNMS(KP198912367, T5C, T5B);
+								      T5O = T5M - T5N;
+								      T62 = T5M + T5N;
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5V = FMA(KP980785280, T5O, T5L);
+	       T5P = FNMS(KP980785280, T5O, T5L);
+	       {
+		    E T6c, T63, T5E, T68;
+		    T6c = FMA(KP980785280, T62, T61);
+		    T63 = FNMS(KP980785280, T62, T61);
+		    T5E = T5A + T5D;
+		    T68 = T5D - T5A;
+		    {
+			 E T64, T6d, T6f, T69;
+			 T64 = T60 * T63;
+			 T6d = T6b * T6c;
+			 T6f = FNMS(KP980785280, T68, T67);
+			 T69 = FMA(KP980785280, T68, T67);
+			 {
+			      E T5F, T5S, T6a, T6g;
+			      T5F = FMA(KP980785280, T5E, T5x);
+			      T5S = FNMS(KP980785280, T5E, T5x);
+			      T6a = T60 * T69;
+			      Ip[WS(rs, 7)] = FNMS(T66, T69, T64);
+			      T6g = T6b * T6f;
+			      Ip[WS(rs, 15)] = FNMS(T6e, T6f, T6d);
+			      {
+				   E T5W, T5T, T5Q, T5G;
+				   T5W = T5U * T5S;
+				   T5T = T5R * T5S;
+				   T5Q = T5I * T5F;
+				   T5G = T5u * T5F;
+				   Im[WS(rs, 7)] = FMA(T66, T63, T6a);
+				   Im[WS(rs, 15)] = FMA(T6e, T6c, T6g);
+				   Im[WS(rs, 3)] = FMA(T5R, T5V, T5W);
+				   Ip[WS(rs, 3)] = FNMS(T5U, T5V, T5T);
+				   Im[WS(rs, 11)] = FMA(T5u, T5P, T5Q);
+				   Ip[WS(rs, 11)] = FNMS(T5I, T5P, T5G);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cb2_32", twinstr, &GENUS, {236, 98, 252, 0} };
+
+void X(codelet_hc2cb2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_32, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 32 -dif -name hc2cb2_32 -include hc2cb.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 160 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T11, T14, T12, T15, T17, T2z, T2B, T1c, T18, T1d, T1g, T1k, T2F, T2L, T3t;
+	       E T4H, T3h, T3V, T3b, T4v, T4T, T4X, T6t, T71, T6z, T75, T81, T8x, T8f, T8z;
+	       E T2R, T2V, T8p, T8t, T4r, T4t, T53, T69, T3n, T3r, T7P, T7T, T4P, T4R, T6F;
+	       E T6R, T1f, T2X, T1j, T2Y, T1l, T31, T2d, T2Z, T49, T4h, T4c, T4i, T4d, T4n;
+	       E T4f, T4j;
+	       {
+		    E T2P, T3q, T2U, T3l, T2Q, T3p, T2T, T3m, T2D, T3g, T2K, T39, T2E, T3f, T2J;
+		    E T3a;
+		    {
+			 E T13, T1b, T16, T1a;
+			 T11 = W[0];
+			 T14 = W[1];
+			 T12 = W[2];
+			 T15 = W[3];
+			 T13 = T11 * T12;
+			 T1b = T14 * T12;
+			 T16 = T14 * T15;
+			 T1a = T11 * T15;
+			 T17 = T13 + T16;
+			 T2z = T13 - T16;
+			 T2B = T1a + T1b;
+			 T1c = T1a - T1b;
+			 T18 = W[4];
+			 T2P = T12 * T18;
+			 T3q = T14 * T18;
+			 T2U = T15 * T18;
+			 T3l = T11 * T18;
+			 T1d = W[5];
+			 T2Q = T15 * T1d;
+			 T3p = T11 * T1d;
+			 T2T = T12 * T1d;
+			 T3m = T14 * T1d;
+			 T1g = W[6];
+			 T2D = T11 * T1g;
+			 T3g = T15 * T1g;
+			 T2K = T14 * T1g;
+			 T39 = T12 * T1g;
+			 T1k = W[7];
+			 T2E = T14 * T1k;
+			 T3f = T12 * T1k;
+			 T2J = T11 * T1k;
+			 T3a = T15 * T1k;
+		    }
+		    T2F = T2D - T2E;
+		    T2L = T2J + T2K;
+		    T3t = T39 - T3a;
+		    T4H = T2J - T2K;
+		    T3h = T3f - T3g;
+		    T3V = T3f + T3g;
+		    T3b = T39 + T3a;
+		    T4v = T2D + T2E;
+		    T4T = FMA(T18, T1g, T1d * T1k);
+		    T4X = FNMS(T1d, T1g, T18 * T1k);
+		    {
+			 E T6r, T6s, T6x, T6y;
+			 T6r = T17 * T1g;
+			 T6s = T1c * T1k;
+			 T6t = T6r - T6s;
+			 T71 = T6r + T6s;
+			 T6x = T17 * T1k;
+			 T6y = T1c * T1g;
+			 T6z = T6x + T6y;
+			 T75 = T6x - T6y;
+		    }
+		    {
+			 E T7Z, T80, T8d, T8e;
+			 T7Z = T2z * T1g;
+			 T80 = T2B * T1k;
+			 T81 = T7Z + T80;
+			 T8x = T7Z - T80;
+			 T8d = T2z * T1k;
+			 T8e = T2B * T1g;
+			 T8f = T8d - T8e;
+			 T8z = T8d + T8e;
+			 T2R = T2P - T2Q;
+			 T2V = T2T + T2U;
+			 T8p = FMA(T2R, T1g, T2V * T1k);
+			 T8t = FNMS(T2V, T1g, T2R * T1k);
+		    }
+		    T4r = T2P + T2Q;
+		    T4t = T2T - T2U;
+		    T53 = FMA(T4r, T1g, T4t * T1k);
+		    T69 = FNMS(T4t, T1g, T4r * T1k);
+		    T3n = T3l + T3m;
+		    T3r = T3p - T3q;
+		    T7P = FMA(T3n, T1g, T3r * T1k);
+		    T7T = FNMS(T3r, T1g, T3n * T1k);
+		    T4P = T3l - T3m;
+		    T4R = T3p + T3q;
+		    T6F = FMA(T4P, T1g, T4R * T1k);
+		    T6R = FNMS(T4R, T1g, T4P * T1k);
+		    {
+			 E T19, T1e, T1h, T1i;
+			 T19 = T17 * T18;
+			 T1e = T1c * T1d;
+			 T1f = T19 + T1e;
+			 T2X = T19 - T1e;
+			 T1h = T17 * T1d;
+			 T1i = T1c * T18;
+			 T1j = T1h - T1i;
+			 T2Y = T1h + T1i;
+		    }
+		    T1l = FMA(T1f, T1g, T1j * T1k);
+		    T31 = FNMS(T2Y, T1g, T2X * T1k);
+		    T2d = FNMS(T1j, T1g, T1f * T1k);
+		    T2Z = FMA(T2X, T1g, T2Y * T1k);
+		    {
+			 E T47, T48, T4a, T4b;
+			 T47 = T2z * T18;
+			 T48 = T2B * T1d;
+			 T49 = T47 - T48;
+			 T4h = T47 + T48;
+			 T4a = T2z * T1d;
+			 T4b = T2B * T18;
+			 T4c = T4a + T4b;
+			 T4i = T4a - T4b;
+		    }
+		    T4d = FMA(T49, T1g, T4c * T1k);
+		    T4n = FNMS(T4i, T1g, T4h * T1k);
+		    T4f = FNMS(T4c, T1g, T49 * T1k);
+		    T4j = FMA(T4h, T1g, T4i * T1k);
+	       }
+	       {
+		    E T56, T7b, T7C, T6c, Tf, T1m, T6f, T7c, T3Y, T4I, T2t, T32, T5d, T7D, T3w;
+		    E T4w, Tu, T2e, T7g, T7F, T7j, T7G, T1B, T33, T3z, T40, T5l, T6i, T5s, T6h;
+		    E T3C, T3Z, TK, T1D, T7v, T86, T7y, T85, T1S, T35, T3O, T4C, T5F, T6J, T5M;
+		    E T6K, T3R, T4D, TZ, T1U, T7o, T89, T7r, T88, T29, T36, T3H, T4z, T5Y, T6M;
+		    E T65, T6N, T3K, T4A;
+		    {
+			 E T3, T54, T2h, T6b, T6, T6a, T2k, T55, Ta, T57, T2o, T58, Td, T5a, T2r;
+			 E T5b;
+			 {
+			      E T1, T2, T2f, T2g;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 15)];
+			      T3 = T1 + T2;
+			      T54 = T1 - T2;
+			      T2f = Ip[0];
+			      T2g = Im[WS(rs, 15)];
+			      T2h = T2f - T2g;
+			      T6b = T2f + T2g;
+			 }
+			 {
+			      E T4, T5, T2i, T2j;
+			      T4 = Rp[WS(rs, 8)];
+			      T5 = Rm[WS(rs, 7)];
+			      T6 = T4 + T5;
+			      T6a = T4 - T5;
+			      T2i = Ip[WS(rs, 8)];
+			      T2j = Im[WS(rs, 7)];
+			      T2k = T2i - T2j;
+			      T55 = T2i + T2j;
+			 }
+			 {
+			      E T8, T9, T2m, T2n;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 11)];
+			      Ta = T8 + T9;
+			      T57 = T8 - T9;
+			      T2m = Ip[WS(rs, 4)];
+			      T2n = Im[WS(rs, 11)];
+			      T2o = T2m - T2n;
+			      T58 = T2m + T2n;
+			 }
+			 {
+			      E Tb, Tc, T2p, T2q;
+			      Tb = Rm[WS(rs, 3)];
+			      Tc = Rp[WS(rs, 12)];
+			      Td = Tb + Tc;
+			      T5a = Tb - Tc;
+			      T2p = Ip[WS(rs, 12)];
+			      T2q = Im[WS(rs, 3)];
+			      T2r = T2p - T2q;
+			      T5b = T2p + T2q;
+			 }
+			 {
+			      E T7, Te, T2l, T2s;
+			      T56 = T54 - T55;
+			      T7b = T54 + T55;
+			      T7C = T6b - T6a;
+			      T6c = T6a + T6b;
+			      T7 = T3 + T6;
+			      Te = Ta + Td;
+			      Tf = T7 + Te;
+			      T1m = T7 - Te;
+			      {
+				   E T6d, T6e, T3W, T3X;
+				   T6d = T57 + T58;
+				   T6e = T5a + T5b;
+				   T6f = KP707106781 * (T6d - T6e);
+				   T7c = KP707106781 * (T6d + T6e);
+				   T3W = T2h - T2k;
+				   T3X = Ta - Td;
+				   T3Y = T3W - T3X;
+				   T4I = T3X + T3W;
+			      }
+			      T2l = T2h + T2k;
+			      T2s = T2o + T2r;
+			      T2t = T2l - T2s;
+			      T32 = T2l + T2s;
+			      {
+				   E T59, T5c, T3u, T3v;
+				   T59 = T57 - T58;
+				   T5c = T5a - T5b;
+				   T5d = KP707106781 * (T59 + T5c);
+				   T7D = KP707106781 * (T59 - T5c);
+				   T3u = T3 - T6;
+				   T3v = T2r - T2o;
+				   T3w = T3u - T3v;
+				   T4w = T3u + T3v;
+			      }
+			 }
+		    }
+		    {
+			 E Ti, T5p, T1w, T5n, Tl, T5m, T1z, T5q, Tp, T5i, T1p, T5g, Ts, T5f, T1s;
+			 E T5j;
+			 {
+			      E Tg, Th, T1u, T1v;
+			      Tg = Rp[WS(rs, 2)];
+			      Th = Rm[WS(rs, 13)];
+			      Ti = Tg + Th;
+			      T5p = Tg - Th;
+			      T1u = Ip[WS(rs, 2)];
+			      T1v = Im[WS(rs, 13)];
+			      T1w = T1u - T1v;
+			      T5n = T1u + T1v;
+			 }
+			 {
+			      E Tj, Tk, T1x, T1y;
+			      Tj = Rp[WS(rs, 10)];
+			      Tk = Rm[WS(rs, 5)];
+			      Tl = Tj + Tk;
+			      T5m = Tj - Tk;
+			      T1x = Ip[WS(rs, 10)];
+			      T1y = Im[WS(rs, 5)];
+			      T1z = T1x - T1y;
+			      T5q = T1x + T1y;
+			 }
+			 {
+			      E Tn, To, T1n, T1o;
+			      Tn = Rm[WS(rs, 1)];
+			      To = Rp[WS(rs, 14)];
+			      Tp = Tn + To;
+			      T5i = Tn - To;
+			      T1n = Ip[WS(rs, 14)];
+			      T1o = Im[WS(rs, 1)];
+			      T1p = T1n - T1o;
+			      T5g = T1n + T1o;
+			 }
+			 {
+			      E Tq, Tr, T1q, T1r;
+			      Tq = Rp[WS(rs, 6)];
+			      Tr = Rm[WS(rs, 9)];
+			      Ts = Tq + Tr;
+			      T5f = Tq - Tr;
+			      T1q = Ip[WS(rs, 6)];
+			      T1r = Im[WS(rs, 9)];
+			      T1s = T1q - T1r;
+			      T5j = T1q + T1r;
+			 }
+			 {
+			      E Tm, Tt, T7e, T7f;
+			      Tm = Ti + Tl;
+			      Tt = Tp + Ts;
+			      Tu = Tm + Tt;
+			      T2e = Tm - Tt;
+			      T7e = T5p + T5q;
+			      T7f = T5n - T5m;
+			      T7g = FNMS(KP923879532, T7f, KP382683432 * T7e);
+			      T7F = FMA(KP382683432, T7f, KP923879532 * T7e);
+			 }
+			 {
+			      E T7h, T7i, T1t, T1A;
+			      T7h = T5i + T5j;
+			      T7i = T5f + T5g;
+			      T7j = FNMS(KP923879532, T7i, KP382683432 * T7h);
+			      T7G = FMA(KP382683432, T7i, KP923879532 * T7h);
+			      T1t = T1p + T1s;
+			      T1A = T1w + T1z;
+			      T1B = T1t - T1A;
+			      T33 = T1A + T1t;
+			 }
+			 {
+			      E T3x, T3y, T5h, T5k;
+			      T3x = T1p - T1s;
+			      T3y = Tp - Ts;
+			      T3z = T3x - T3y;
+			      T40 = T3y + T3x;
+			      T5h = T5f - T5g;
+			      T5k = T5i - T5j;
+			      T5l = FNMS(KP382683432, T5k, KP923879532 * T5h);
+			      T6i = FMA(KP382683432, T5h, KP923879532 * T5k);
+			 }
+			 {
+			      E T5o, T5r, T3A, T3B;
+			      T5o = T5m + T5n;
+			      T5r = T5p - T5q;
+			      T5s = FMA(KP923879532, T5o, KP382683432 * T5r);
+			      T6h = FNMS(KP382683432, T5o, KP923879532 * T5r);
+			      T3A = Ti - Tl;
+			      T3B = T1w - T1z;
+			      T3C = T3A + T3B;
+			      T3Z = T3A - T3B;
+			 }
+		    }
+		    {
+			 E Ty, T5v, T1G, T5H, TB, T5G, T1J, T5w, TI, T5K, T1Q, T5D, TF, T5J, T1N;
+			 E T5A;
+			 {
+			      E Tw, Tx, T1H, T1I;
+			      Tw = Rp[WS(rs, 1)];
+			      Tx = Rm[WS(rs, 14)];
+			      Ty = Tw + Tx;
+			      T5v = Tw - Tx;
+			      {
+				   E T1E, T1F, Tz, TA;
+				   T1E = Ip[WS(rs, 1)];
+				   T1F = Im[WS(rs, 14)];
+				   T1G = T1E - T1F;
+				   T5H = T1E + T1F;
+				   Tz = Rp[WS(rs, 9)];
+				   TA = Rm[WS(rs, 6)];
+				   TB = Tz + TA;
+				   T5G = Tz - TA;
+			      }
+			      T1H = Ip[WS(rs, 9)];
+			      T1I = Im[WS(rs, 6)];
+			      T1J = T1H - T1I;
+			      T5w = T1H + T1I;
+			      {
+				   E TG, TH, T5B, T1O, T1P, T5C;
+				   TG = Rm[WS(rs, 2)];
+				   TH = Rp[WS(rs, 13)];
+				   T5B = TG - TH;
+				   T1O = Ip[WS(rs, 13)];
+				   T1P = Im[WS(rs, 2)];
+				   T5C = T1O + T1P;
+				   TI = TG + TH;
+				   T5K = T5B + T5C;
+				   T1Q = T1O - T1P;
+				   T5D = T5B - T5C;
+			      }
+			      {
+				   E TD, TE, T5y, T1L, T1M, T5z;
+				   TD = Rp[WS(rs, 5)];
+				   TE = Rm[WS(rs, 10)];
+				   T5y = TD - TE;
+				   T1L = Ip[WS(rs, 5)];
+				   T1M = Im[WS(rs, 10)];
+				   T5z = T1L + T1M;
+				   TF = TD + TE;
+				   T5J = T5y + T5z;
+				   T1N = T1L - T1M;
+				   T5A = T5y - T5z;
+			      }
+			 }
+			 {
+			      E TC, TJ, T7t, T7u;
+			      TC = Ty + TB;
+			      TJ = TF + TI;
+			      TK = TC + TJ;
+			      T1D = TC - TJ;
+			      T7t = T5H - T5G;
+			      T7u = KP707106781 * (T5A - T5D);
+			      T7v = T7t + T7u;
+			      T86 = T7t - T7u;
+			 }
+			 {
+			      E T7w, T7x, T1K, T1R;
+			      T7w = T5v + T5w;
+			      T7x = KP707106781 * (T5J + T5K);
+			      T7y = T7w - T7x;
+			      T85 = T7w + T7x;
+			      T1K = T1G + T1J;
+			      T1R = T1N + T1Q;
+			      T1S = T1K - T1R;
+			      T35 = T1K + T1R;
+			 }
+			 {
+			      E T3M, T3N, T5x, T5E;
+			      T3M = T1G - T1J;
+			      T3N = TF - TI;
+			      T3O = T3M - T3N;
+			      T4C = T3N + T3M;
+			      T5x = T5v - T5w;
+			      T5E = KP707106781 * (T5A + T5D);
+			      T5F = T5x - T5E;
+			      T6J = T5x + T5E;
+			 }
+			 {
+			      E T5I, T5L, T3P, T3Q;
+			      T5I = T5G + T5H;
+			      T5L = KP707106781 * (T5J - T5K);
+			      T5M = T5I - T5L;
+			      T6K = T5I + T5L;
+			      T3P = Ty - TB;
+			      T3Q = T1Q - T1N;
+			      T3R = T3P - T3Q;
+			      T4D = T3P + T3Q;
+			 }
+		    }
+		    {
+			 E TN, T5O, T1X, T60, TQ, T5Z, T20, T5P, TX, T63, T27, T5W, TU, T62, T24;
+			 E T5T;
+			 {
+			      E TL, TM, T1Y, T1Z;
+			      TL = Rm[0];
+			      TM = Rp[WS(rs, 15)];
+			      TN = TL + TM;
+			      T5O = TL - TM;
+			      {
+				   E T1V, T1W, TO, TP;
+				   T1V = Ip[WS(rs, 15)];
+				   T1W = Im[0];
+				   T1X = T1V - T1W;
+				   T60 = T1V + T1W;
+				   TO = Rp[WS(rs, 7)];
+				   TP = Rm[WS(rs, 8)];
+				   TQ = TO + TP;
+				   T5Z = TO - TP;
+			      }
+			      T1Y = Ip[WS(rs, 7)];
+			      T1Z = Im[WS(rs, 8)];
+			      T20 = T1Y - T1Z;
+			      T5P = T1Y + T1Z;
+			      {
+				   E TV, TW, T5U, T25, T26, T5V;
+				   TV = Rm[WS(rs, 4)];
+				   TW = Rp[WS(rs, 11)];
+				   T5U = TV - TW;
+				   T25 = Ip[WS(rs, 11)];
+				   T26 = Im[WS(rs, 4)];
+				   T5V = T25 + T26;
+				   TX = TV + TW;
+				   T63 = T5U + T5V;
+				   T27 = T25 - T26;
+				   T5W = T5U - T5V;
+			      }
+			      {
+				   E TS, TT, T5R, T22, T23, T5S;
+				   TS = Rp[WS(rs, 3)];
+				   TT = Rm[WS(rs, 12)];
+				   T5R = TS - TT;
+				   T22 = Ip[WS(rs, 3)];
+				   T23 = Im[WS(rs, 12)];
+				   T5S = T22 + T23;
+				   TU = TS + TT;
+				   T62 = T5R + T5S;
+				   T24 = T22 - T23;
+				   T5T = T5R - T5S;
+			      }
+			 }
+			 {
+			      E TR, TY, T7m, T7n;
+			      TR = TN + TQ;
+			      TY = TU + TX;
+			      TZ = TR + TY;
+			      T1U = TR - TY;
+			      T7m = KP707106781 * (T5T - T5W);
+			      T7n = T5Z + T60;
+			      T7o = T7m - T7n;
+			      T89 = T7n + T7m;
+			 }
+			 {
+			      E T7p, T7q, T21, T28;
+			      T7p = T5O + T5P;
+			      T7q = KP707106781 * (T62 + T63);
+			      T7r = T7p - T7q;
+			      T88 = T7p + T7q;
+			      T21 = T1X + T20;
+			      T28 = T24 + T27;
+			      T29 = T21 - T28;
+			      T36 = T21 + T28;
+			 }
+			 {
+			      E T3F, T3G, T5Q, T5X;
+			      T3F = T1X - T20;
+			      T3G = TU - TX;
+			      T3H = T3F - T3G;
+			      T4z = T3G + T3F;
+			      T5Q = T5O - T5P;
+			      T5X = KP707106781 * (T5T + T5W);
+			      T5Y = T5Q - T5X;
+			      T6M = T5Q + T5X;
+			 }
+			 {
+			      E T61, T64, T3I, T3J;
+			      T61 = T5Z - T60;
+			      T64 = KP707106781 * (T62 - T63);
+			      T65 = T61 - T64;
+			      T6N = T61 + T64;
+			      T3I = TN - TQ;
+			      T3J = T27 - T24;
+			      T3K = T3I - T3J;
+			      T4A = T3I + T3J;
+			 }
+		    }
+		    {
+			 E Tv, T10, T30, T34, T37, T38;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T30 = Tv - T10;
+			 T34 = T32 + T33;
+			 T37 = T35 + T36;
+			 T38 = T34 - T37;
+			 Rp[0] = Tv + T10;
+			 Rm[0] = T34 + T37;
+			 Rp[WS(rs, 8)] = FNMS(T31, T38, T2Z * T30);
+			 Rm[WS(rs, 8)] = FMA(T31, T30, T2Z * T38);
+		    }
+		    {
+			 E T3e, T3o, T3k, T3s;
+			 {
+			      E T3c, T3d, T3i, T3j;
+			      T3c = Tf - Tu;
+			      T3d = T36 - T35;
+			      T3e = T3c - T3d;
+			      T3o = T3c + T3d;
+			      T3i = T32 - T33;
+			      T3j = TK - TZ;
+			      T3k = T3i - T3j;
+			      T3s = T3j + T3i;
+			 }
+			 Rp[WS(rs, 12)] = FNMS(T3h, T3k, T3b * T3e);
+			 Rm[WS(rs, 12)] = FMA(T3b, T3k, T3h * T3e);
+			 Rp[WS(rs, 4)] = FNMS(T3r, T3s, T3n * T3o);
+			 Rm[WS(rs, 4)] = FMA(T3n, T3s, T3r * T3o);
+		    }
+		    {
+			 E T1C, T2u, T2M, T2G, T2x, T2H, T2b, T2N;
+			 T1C = T1m + T1B;
+			 T2u = T2e + T2t;
+			 T2M = T2t - T2e;
+			 T2G = T1m - T1B;
+			 {
+			      E T2v, T2w, T1T, T2a;
+			      T2v = T1D + T1S;
+			      T2w = T29 - T1U;
+			      T2x = KP707106781 * (T2v + T2w);
+			      T2H = KP707106781 * (T2w - T2v);
+			      T1T = T1D - T1S;
+			      T2a = T1U + T29;
+			      T2b = KP707106781 * (T1T + T2a);
+			      T2N = KP707106781 * (T1T - T2a);
+			 }
+			 {
+			      E T2c, T2y, T2S, T2W;
+			      T2c = T1C - T2b;
+			      T2y = T2u - T2x;
+			      Rp[WS(rs, 10)] = FNMS(T2d, T2y, T1l * T2c);
+			      Rm[WS(rs, 10)] = FMA(T2d, T2c, T1l * T2y);
+			      T2S = T2G + T2H;
+			      T2W = T2M + T2N;
+			      Rp[WS(rs, 6)] = FNMS(T2V, T2W, T2R * T2S);
+			      Rm[WS(rs, 6)] = FMA(T2R, T2W, T2V * T2S);
+			 }
+			 {
+			      E T2A, T2C, T2I, T2O;
+			      T2A = T1C + T2b;
+			      T2C = T2u + T2x;
+			      Rp[WS(rs, 2)] = FNMS(T2B, T2C, T2z * T2A);
+			      Rm[WS(rs, 2)] = FMA(T2B, T2A, T2z * T2C);
+			      T2I = T2G - T2H;
+			      T2O = T2M - T2N;
+			      Rp[WS(rs, 14)] = FNMS(T2L, T2O, T2F * T2I);
+			      Rm[WS(rs, 14)] = FMA(T2F, T2O, T2L * T2I);
+			 }
+		    }
+		    {
+			 E T4y, T4U, T4K, T4Y, T4F, T4Z, T4N, T4V, T4x, T4J;
+			 T4x = KP707106781 * (T3Z + T40);
+			 T4y = T4w - T4x;
+			 T4U = T4w + T4x;
+			 T4J = KP707106781 * (T3C + T3z);
+			 T4K = T4I - T4J;
+			 T4Y = T4I + T4J;
+			 {
+			      E T4B, T4E, T4L, T4M;
+			      T4B = FNMS(KP382683432, T4A, KP923879532 * T4z);
+			      T4E = FMA(KP923879532, T4C, KP382683432 * T4D);
+			      T4F = T4B - T4E;
+			      T4Z = T4E + T4B;
+			      T4L = FNMS(KP382683432, T4C, KP923879532 * T4D);
+			      T4M = FMA(KP382683432, T4z, KP923879532 * T4A);
+			      T4N = T4L - T4M;
+			      T4V = T4L + T4M;
+			 }
+			 {
+			      E T4G, T4O, T51, T52;
+			      T4G = T4y - T4F;
+			      T4O = T4K - T4N;
+			      Rp[WS(rs, 13)] = FNMS(T4H, T4O, T4v * T4G);
+			      Rm[WS(rs, 13)] = FMA(T4H, T4G, T4v * T4O);
+			      T51 = T4U + T4V;
+			      T52 = T4Y + T4Z;
+			      Rp[WS(rs, 1)] = FNMS(T1c, T52, T17 * T51);
+			      Rm[WS(rs, 1)] = FMA(T17, T52, T1c * T51);
+			 }
+			 {
+			      E T4Q, T4S, T4W, T50;
+			      T4Q = T4y + T4F;
+			      T4S = T4K + T4N;
+			      Rp[WS(rs, 5)] = FNMS(T4R, T4S, T4P * T4Q);
+			      Rm[WS(rs, 5)] = FMA(T4R, T4Q, T4P * T4S);
+			      T4W = T4U - T4V;
+			      T50 = T4Y - T4Z;
+			      Rp[WS(rs, 9)] = FNMS(T4X, T50, T4T * T4W);
+			      Rm[WS(rs, 9)] = FMA(T4T, T50, T4X * T4W);
+			 }
+		    }
+		    {
+			 E T3E, T4k, T42, T4o, T3T, T4p, T45, T4l, T3D, T41;
+			 T3D = KP707106781 * (T3z - T3C);
+			 T3E = T3w - T3D;
+			 T4k = T3w + T3D;
+			 T41 = KP707106781 * (T3Z - T40);
+			 T42 = T3Y - T41;
+			 T4o = T3Y + T41;
+			 {
+			      E T3L, T3S, T43, T44;
+			      T3L = FNMS(KP923879532, T3K, KP382683432 * T3H);
+			      T3S = FMA(KP382683432, T3O, KP923879532 * T3R);
+			      T3T = T3L - T3S;
+			      T4p = T3S + T3L;
+			      T43 = FNMS(KP923879532, T3O, KP382683432 * T3R);
+			      T44 = FMA(KP923879532, T3H, KP382683432 * T3K);
+			      T45 = T43 - T44;
+			      T4l = T43 + T44;
+			 }
+			 {
+			      E T3U, T46, T4s, T4u;
+			      T3U = T3E - T3T;
+			      T46 = T42 - T45;
+			      Rp[WS(rs, 15)] = FNMS(T3V, T46, T3t * T3U);
+			      Rm[WS(rs, 15)] = FMA(T3V, T3U, T3t * T46);
+			      T4s = T4k + T4l;
+			      T4u = T4o + T4p;
+			      Rp[WS(rs, 3)] = FNMS(T4t, T4u, T4r * T4s);
+			      Rm[WS(rs, 3)] = FMA(T4r, T4u, T4t * T4s);
+			 }
+			 {
+			      E T4e, T4g, T4m, T4q;
+			      T4e = T3E + T3T;
+			      T4g = T42 + T45;
+			      Rp[WS(rs, 7)] = FNMS(T4f, T4g, T4d * T4e);
+			      Rm[WS(rs, 7)] = FMA(T4f, T4e, T4d * T4g);
+			      T4m = T4k - T4l;
+			      T4q = T4o - T4p;
+			      Rp[WS(rs, 11)] = FNMS(T4n, T4q, T4j * T4m);
+			      Rm[WS(rs, 11)] = FMA(T4j, T4q, T4n * T4m);
+			 }
+		    }
+		    {
+			 E T6I, T72, T6X, T73, T6P, T77, T6U, T76;
+			 {
+			      E T6G, T6H, T6V, T6W;
+			      T6G = T56 + T5d;
+			      T6H = T6h + T6i;
+			      T6I = T6G + T6H;
+			      T72 = T6G - T6H;
+			      T6V = FMA(KP195090322, T6J, KP980785280 * T6K);
+			      T6W = FNMS(KP195090322, T6M, KP980785280 * T6N);
+			      T6X = T6V + T6W;
+			      T73 = T6W - T6V;
+			 }
+			 {
+			      E T6L, T6O, T6S, T6T;
+			      T6L = FNMS(KP195090322, T6K, KP980785280 * T6J);
+			      T6O = FMA(KP980785280, T6M, KP195090322 * T6N);
+			      T6P = T6L + T6O;
+			      T77 = T6L - T6O;
+			      T6S = T6c + T6f;
+			      T6T = T5s + T5l;
+			      T6U = T6S + T6T;
+			      T76 = T6S - T6T;
+			 }
+			 {
+			      E T6Q, T6Y, T79, T7a;
+			      T6Q = T6I - T6P;
+			      T6Y = T6U - T6X;
+			      Ip[WS(rs, 8)] = FNMS(T6R, T6Y, T6F * T6Q);
+			      Im[WS(rs, 8)] = FMA(T6R, T6Q, T6F * T6Y);
+			      T79 = T72 + T73;
+			      T7a = T76 + T77;
+			      Ip[WS(rs, 4)] = FNMS(T1d, T7a, T18 * T79);
+			      Im[WS(rs, 4)] = FMA(T18, T7a, T1d * T79);
+			 }
+			 {
+			      E T6Z, T70, T74, T78;
+			      T6Z = T6I + T6P;
+			      T70 = T6U + T6X;
+			      Ip[0] = FNMS(T14, T70, T11 * T6Z);
+			      Im[0] = FMA(T14, T6Z, T11 * T70);
+			      T74 = T72 - T73;
+			      T78 = T76 - T77;
+			      Ip[WS(rs, 12)] = FNMS(T75, T78, T71 * T74);
+			      Im[WS(rs, 12)] = FMA(T71, T78, T75 * T74);
+			 }
+		    }
+		    {
+			 E T84, T8q, T8l, T8r, T8b, T8v, T8i, T8u;
+			 {
+			      E T82, T83, T8j, T8k;
+			      T82 = T7b + T7c;
+			      T83 = T7F + T7G;
+			      T84 = T82 - T83;
+			      T8q = T82 + T83;
+			      T8j = FMA(KP195090322, T86, KP980785280 * T85);
+			      T8k = FMA(KP195090322, T89, KP980785280 * T88);
+			      T8l = T8j - T8k;
+			      T8r = T8j + T8k;
+			 }
+			 {
+			      E T87, T8a, T8g, T8h;
+			      T87 = FNMS(KP980785280, T86, KP195090322 * T85);
+			      T8a = FNMS(KP980785280, T89, KP195090322 * T88);
+			      T8b = T87 + T8a;
+			      T8v = T87 - T8a;
+			      T8g = T7C - T7D;
+			      T8h = T7g - T7j;
+			      T8i = T8g + T8h;
+			      T8u = T8g - T8h;
+			 }
+			 {
+			      E T8c, T8m, T8y, T8A;
+			      T8c = T84 - T8b;
+			      T8m = T8i - T8l;
+			      Ip[WS(rs, 11)] = FNMS(T8f, T8m, T81 * T8c);
+			      Im[WS(rs, 11)] = FMA(T8f, T8c, T81 * T8m);
+			      T8y = T8q + T8r;
+			      T8A = T8u - T8v;
+			      Ip[WS(rs, 15)] = FNMS(T8z, T8A, T8x * T8y);
+			      Im[WS(rs, 15)] = FMA(T8x, T8A, T8z * T8y);
+			 }
+			 {
+			      E T8n, T8o, T8s, T8w;
+			      T8n = T84 + T8b;
+			      T8o = T8i + T8l;
+			      Ip[WS(rs, 3)] = FNMS(T1j, T8o, T1f * T8n);
+			      Im[WS(rs, 3)] = FMA(T1j, T8n, T1f * T8o);
+			      T8s = T8q - T8r;
+			      T8w = T8u + T8v;
+			      Ip[WS(rs, 7)] = FNMS(T8t, T8w, T8p * T8s);
+			      Im[WS(rs, 7)] = FMA(T8p, T8w, T8t * T8s);
+			 }
+		    }
+		    {
+			 E T5u, T6u, T6n, T6v, T67, T6B, T6k, T6A;
+			 {
+			      E T5e, T5t, T6l, T6m;
+			      T5e = T56 - T5d;
+			      T5t = T5l - T5s;
+			      T5u = T5e + T5t;
+			      T6u = T5e - T5t;
+			      T6l = FMA(KP831469612, T5F, KP555570233 * T5M);
+			      T6m = FNMS(KP831469612, T5Y, KP555570233 * T65);
+			      T6n = T6l + T6m;
+			      T6v = T6m - T6l;
+			 }
+			 {
+			      E T5N, T66, T6g, T6j;
+			      T5N = FNMS(KP831469612, T5M, KP555570233 * T5F);
+			      T66 = FMA(KP555570233, T5Y, KP831469612 * T65);
+			      T67 = T5N + T66;
+			      T6B = T5N - T66;
+			      T6g = T6c - T6f;
+			      T6j = T6h - T6i;
+			      T6k = T6g + T6j;
+			      T6A = T6g - T6j;
+			 }
+			 {
+			      E T68, T6o, T6D, T6E;
+			      T68 = T5u - T67;
+			      T6o = T6k - T6n;
+			      Ip[WS(rs, 10)] = FNMS(T69, T6o, T53 * T68);
+			      Im[WS(rs, 10)] = FMA(T69, T68, T53 * T6o);
+			      T6D = T6u + T6v;
+			      T6E = T6A + T6B;
+			      Ip[WS(rs, 6)] = FNMS(T4c, T6E, T49 * T6D);
+			      Im[WS(rs, 6)] = FMA(T49, T6E, T4c * T6D);
+			 }
+			 {
+			      E T6p, T6q, T6w, T6C;
+			      T6p = T5u + T67;
+			      T6q = T6k + T6n;
+			      Ip[WS(rs, 2)] = FNMS(T4i, T6q, T4h * T6p);
+			      Im[WS(rs, 2)] = FMA(T4i, T6p, T4h * T6q);
+			      T6w = T6u - T6v;
+			      T6C = T6A - T6B;
+			      Ip[WS(rs, 14)] = FNMS(T6z, T6C, T6t * T6w);
+			      Im[WS(rs, 14)] = FMA(T6t, T6C, T6z * T6w);
+			 }
+		    }
+		    {
+			 E T7l, T7Q, T7L, T7R, T7A, T7V, T7I, T7U;
+			 {
+			      E T7d, T7k, T7J, T7K;
+			      T7d = T7b - T7c;
+			      T7k = T7g + T7j;
+			      T7l = T7d - T7k;
+			      T7Q = T7d + T7k;
+			      T7J = FNMS(KP555570233, T7v, KP831469612 * T7y);
+			      T7K = FMA(KP555570233, T7o, KP831469612 * T7r);
+			      T7L = T7J - T7K;
+			      T7R = T7J + T7K;
+			 }
+			 {
+			      E T7s, T7z, T7E, T7H;
+			      T7s = FNMS(KP555570233, T7r, KP831469612 * T7o);
+			      T7z = FMA(KP831469612, T7v, KP555570233 * T7y);
+			      T7A = T7s - T7z;
+			      T7V = T7z + T7s;
+			      T7E = T7C + T7D;
+			      T7H = T7F - T7G;
+			      T7I = T7E - T7H;
+			      T7U = T7E + T7H;
+			 }
+			 {
+			      E T7B, T7M, T7X, T7Y;
+			      T7B = T7l - T7A;
+			      T7M = T7I - T7L;
+			      Ip[WS(rs, 13)] = FNMS(T1k, T7M, T1g * T7B);
+			      Im[WS(rs, 13)] = FMA(T1k, T7B, T1g * T7M);
+			      T7X = T7Q + T7R;
+			      T7Y = T7U + T7V;
+			      Ip[WS(rs, 1)] = FNMS(T15, T7Y, T12 * T7X);
+			      Im[WS(rs, 1)] = FMA(T12, T7Y, T15 * T7X);
+			 }
+			 {
+			      E T7N, T7O, T7S, T7W;
+			      T7N = T7l + T7A;
+			      T7O = T7I + T7L;
+			      Ip[WS(rs, 5)] = FNMS(T2Y, T7O, T2X * T7N);
+			      Im[WS(rs, 5)] = FMA(T2Y, T7N, T2X * T7O);
+			      T7S = T7Q - T7R;
+			      T7W = T7U - T7V;
+			      Ip[WS(rs, 9)] = FNMS(T7T, T7W, T7P * T7S);
+			      Im[WS(rs, 9)] = FMA(T7P, T7W, T7T * T7S);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cb2_32", twinstr, &GENUS, {376, 168, 112, 0} };
+
+void X(codelet_hc2cb2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_32, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:57 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include hc2cb.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 30 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Tg, Tc, Te, To, Tn;
+	       {
+		    E T7, Tb, T8, Ta;
+		    T7 = W[0];
+		    Tb = W[3];
+		    T8 = W[2];
+		    Ta = W[1];
+		    {
+			 E Tu, Tj, T3, Tm, Tx, Tr, T6, Tt;
+			 {
+			      E T4, Tp, Tq, T5;
+			      {
+				   E T1, T2, Tk, Tl;
+				   {
+					E Th, Tf, T9, Ti;
+					Th = Ip[0];
+					Tf = T7 * Tb;
+					T9 = T7 * T8;
+					Ti = Im[WS(rs, 1)];
+					T1 = Rp[0];
+					Tg = FNMS(Ta, T8, Tf);
+					Tc = FMA(Ta, Tb, T9);
+					Tu = Th + Ti;
+					Tj = Th - Ti;
+					T2 = Rm[WS(rs, 1)];
+				   }
+				   Tk = Ip[WS(rs, 1)];
+				   Tl = Im[0];
+				   T4 = Rp[WS(rs, 1)];
+				   T3 = T1 + T2;
+				   Tp = T1 - T2;
+				   Tm = Tk - Tl;
+				   Tq = Tk + Tl;
+				   T5 = Rm[0];
+			      }
+			      Tx = Tp + Tq;
+			      Tr = Tp - Tq;
+			      T6 = T4 + T5;
+			      Tt = T4 - T5;
+			 }
+			 {
+			      E Tz, Tv, Td, Ts, Tw, TA, Ty;
+			      Rm[0] = Tj + Tm;
+			      Ts = T7 * Tr;
+			      Tz = Tu - Tt;
+			      Tv = Tt + Tu;
+			      Rp[0] = T3 + T6;
+			      Td = T3 - T6;
+			      Ip[0] = FNMS(Ta, Tv, Ts);
+			      Tw = T7 * Tv;
+			      TA = T8 * Tz;
+			      Ty = T8 * Tx;
+			      Te = Tc * Td;
+			      Im[0] = FMA(Ta, Tr, Tw);
+			      Im[WS(rs, 1)] = FMA(Tb, Tx, TA);
+			      Ip[WS(rs, 1)] = FNMS(Tb, Tz, Ty);
+			      To = Tg * Td;
+			      Tn = Tj - Tm;
+			 }
+		    }
+	       }
+	       Rm[WS(rs, 1)] = FMA(Tc, Tn, To);
+	       Rp[WS(rs, 1)] = FNMS(Tg, Tn, Te);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hc2cb2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include hc2cb.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T7, T9, T8, Ta, Tb, Td;
+	       T7 = W[0];
+	       T9 = W[1];
+	       T8 = W[2];
+	       Ta = W[3];
+	       Tb = FMA(T7, T8, T9 * Ta);
+	       Td = FNMS(T9, T8, T7 * Ta);
+	       {
+		    E T3, Tl, Tg, Tp, T6, To, Tj, Tm, Tc, Tk;
+		    {
+			 E T1, T2, Te, Tf;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 1)];
+			 T3 = T1 + T2;
+			 Tl = T1 - T2;
+			 Te = Ip[0];
+			 Tf = Im[WS(rs, 1)];
+			 Tg = Te - Tf;
+			 Tp = Te + Tf;
+		    }
+		    {
+			 E T4, T5, Th, Ti;
+			 T4 = Rp[WS(rs, 1)];
+			 T5 = Rm[0];
+			 T6 = T4 + T5;
+			 To = T4 - T5;
+			 Th = Ip[WS(rs, 1)];
+			 Ti = Im[0];
+			 Tj = Th - Ti;
+			 Tm = Th + Ti;
+		    }
+		    Rp[0] = T3 + T6;
+		    Rm[0] = Tg + Tj;
+		    Tc = T3 - T6;
+		    Tk = Tg - Tj;
+		    Rp[WS(rs, 1)] = FNMS(Td, Tk, Tb * Tc);
+		    Rm[WS(rs, 1)] = FMA(Td, Tc, Tb * Tk);
+		    {
+			 E Tn, Tq, Tr, Ts;
+			 Tn = Tl - Tm;
+			 Tq = To + Tp;
+			 Ip[0] = FNMS(T9, Tq, T7 * Tn);
+			 Im[0] = FMA(T7, Tq, T9 * Tn);
+			 Tr = Tl + Tm;
+			 Ts = Tp - To;
+			 Ip[WS(rs, 1)] = FNMS(Ta, Ts, T8 * Tr);
+			 Im[WS(rs, 1)] = FMA(T8, Ts, Ta * Tr);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hc2cb2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:57 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include hc2cb.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 64 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tf, Ti, TK, Tq, TH, TT, TX, TW, TY, TU, TI;
+	       {
+		    E Tg, Tl, Tp, Th, T1n, T1t, Tj;
+		    Tf = W[0];
+		    Tg = W[2];
+		    Tl = W[4];
+		    Tp = W[5];
+		    Ti = W[1];
+		    Th = Tf * Tg;
+		    T1n = Tf * Tl;
+		    T1t = Tf * Tp;
+		    Tj = W[3];
+		    {
+			 E T1o, T1u, Tk, T1b, To, T1e, T13, TP, T1p, T7, T1h, T1v, TZ, Tv, T1i;
+			 E TB, TA, TQ, Te, T1w, TE, T1j;
+			 {
+			      E Tr, T3, Ts, T1f, TO, TL, T6, Tt;
+			      {
+				   E TM, TN, T4, T5;
+				   {
+					E T1, Tn, T2, TJ, Tm;
+					T1 = Rp[0];
+					T1o = FMA(Ti, Tp, T1n);
+					T1u = FNMS(Ti, Tl, T1t);
+					Tk = FMA(Ti, Tj, Th);
+					T1b = FNMS(Ti, Tj, Th);
+					Tn = Tf * Tj;
+					T2 = Rm[WS(rs, 3)];
+					TM = Ip[0];
+					TJ = Tk * Tp;
+					Tm = Tk * Tl;
+					To = FNMS(Ti, Tg, Tn);
+					T1e = FMA(Ti, Tg, Tn);
+					Tr = T1 - T2;
+					T3 = T1 + T2;
+					TK = FNMS(To, Tl, TJ);
+					Tq = FMA(To, Tp, Tm);
+					TN = Im[WS(rs, 3)];
+				   }
+				   T4 = Rp[WS(rs, 2)];
+				   T5 = Rm[WS(rs, 1)];
+				   Ts = Ip[WS(rs, 2)];
+				   T1f = TM - TN;
+				   TO = TM + TN;
+				   TL = T4 - T5;
+				   T6 = T4 + T5;
+				   Tt = Im[WS(rs, 1)];
+			      }
+			      {
+				   E Tw, Ta, TC, Tz, Td, TD;
+				   {
+					E Tx, Ty, Tb, Tc;
+					{
+					     E T8, T1g, Tu, T9;
+					     T8 = Rp[WS(rs, 1)];
+					     T13 = TO - TL;
+					     TP = TL + TO;
+					     T1p = T3 - T6;
+					     T7 = T3 + T6;
+					     T1g = Ts - Tt;
+					     Tu = Ts + Tt;
+					     T9 = Rm[WS(rs, 2)];
+					     Tx = Ip[WS(rs, 1)];
+					     T1h = T1f + T1g;
+					     T1v = T1f - T1g;
+					     TZ = Tr + Tu;
+					     Tv = Tr - Tu;
+					     Tw = T8 - T9;
+					     Ta = T8 + T9;
+					     Ty = Im[WS(rs, 2)];
+					}
+					Tb = Rm[0];
+					Tc = Rp[WS(rs, 3)];
+					TC = Ip[WS(rs, 3)];
+					T1i = Tx - Ty;
+					Tz = Tx + Ty;
+					TB = Tb - Tc;
+					Td = Tb + Tc;
+					TD = Im[0];
+				   }
+				   TA = Tw - Tz;
+				   TQ = Tw + Tz;
+				   Te = Ta + Td;
+				   T1w = Ta - Td;
+				   TE = TC + TD;
+				   T1j = TC - TD;
+			      }
+			 }
+			 {
+			      E T1x, T1k, T1r, TG, TS, T19, T15, T17, T11, T16, T12;
+			      {
+				   E T1B, T1z, T10, T1A, T1C;
+				   T1x = T1v - T1w;
+				   T1B = T1w + T1v;
+				   Rp[0] = T7 + Te;
+				   {
+					E T1q, TR, TF, T14;
+					T1k = T1i + T1j;
+					T1q = T1j - T1i;
+					TR = TB + TE;
+					TF = TB - TE;
+					T1r = T1p - T1q;
+					T1z = T1p + T1q;
+					Rm[0] = T1h + T1k;
+					TG = TA + TF;
+					T14 = TA - TF;
+					TS = TQ - TR;
+					T10 = TQ + TR;
+					T1A = Tk * T1z;
+					T19 = FNMS(KP707106781, T14, T13);
+					T15 = FMA(KP707106781, T14, T13);
+					T1C = Tk * T1B;
+				   }
+				   T17 = FMA(KP707106781, T10, TZ);
+				   T11 = FNMS(KP707106781, T10, TZ);
+				   Rp[WS(rs, 1)] = FNMS(To, T1B, T1A);
+				   T16 = Tg * T15;
+				   Rm[WS(rs, 1)] = FMA(To, T1z, T1C);
+			      }
+			      T12 = Tg * T11;
+			      {
+				   E T1l, T1a, T1c, T18;
+				   Im[WS(rs, 1)] = FMA(Tj, T11, T16);
+				   Ip[WS(rs, 1)] = FNMS(Tj, T15, T12);
+				   T18 = Tl * T17;
+				   T1l = T1h - T1k;
+				   T1a = Tl * T19;
+				   T1c = T7 - Te;
+				   Ip[WS(rs, 3)] = FNMS(Tp, T19, T18);
+				   {
+					E T1s, T1m, T1d, T1y, TV;
+					Im[WS(rs, 3)] = FMA(Tp, T17, T1a);
+					T1m = T1e * T1c;
+					T1d = T1b * T1c;
+					T1s = T1o * T1r;
+					Rm[WS(rs, 2)] = FMA(T1b, T1l, T1m);
+					Rp[WS(rs, 2)] = FNMS(T1e, T1l, T1d);
+					Rp[WS(rs, 3)] = FNMS(T1u, T1x, T1s);
+					T1y = T1o * T1x;
+					TV = FMA(KP707106781, TG, Tv);
+					TH = FNMS(KP707106781, TG, Tv);
+					TT = FNMS(KP707106781, TS, TP);
+					TX = FMA(KP707106781, TS, TP);
+					Rm[WS(rs, 3)] = FMA(T1u, T1r, T1y);
+					TW = Tf * TV;
+					TY = Ti * TV;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[0] = FNMS(Ti, TX, TW);
+	       Im[0] = FMA(Tf, TX, TY);
+	       TU = TK * TH;
+	       TI = Tq * TH;
+	       Im[WS(rs, 2)] = FMA(Tq, TT, TU);
+	       Ip[WS(rs, 2)] = FNMS(TK, TT, TI);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, {44, 20, 30, 0} };
+
+void X(codelet_hc2cb2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include hc2cb.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 46 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
+	       {
+		    E Th, To, Tk, Tn;
+		    Tf = W[0];
+		    Ti = W[1];
+		    Tg = W[2];
+		    Tj = W[3];
+		    Th = Tf * Tg;
+		    To = Ti * Tg;
+		    Tk = Ti * Tj;
+		    Tn = Tf * Tj;
+		    Tl = Th - Tk;
+		    Tp = Tn + To;
+		    TP = Th + Tk;
+		    TR = Tn - To;
+		    TF = W[4];
+		    TG = W[5];
+		    TH = FMA(Tf, TF, Ti * TG);
+		    T15 = FNMS(TR, TF, TP * TG);
+		    TL = FNMS(Ti, TF, Tf * TG);
+		    TT = FMA(TP, TF, TR * TG);
+	       }
+	       {
+		    E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
+		    E TN, Tm, TE;
+		    {
+			 E T3, TU, Ts, T17, T6, T16, Tv, TV;
+			 {
+			      E T1, T2, Tq, Tr;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 3)];
+			      T3 = T1 + T2;
+			      TU = T1 - T2;
+			      Tq = Ip[0];
+			      Tr = Im[WS(rs, 3)];
+			      Ts = Tq - Tr;
+			      T17 = Tq + Tr;
+			 }
+			 {
+			      E T4, T5, Tt, Tu;
+			      T4 = Rp[WS(rs, 2)];
+			      T5 = Rm[WS(rs, 1)];
+			      T6 = T4 + T5;
+			      T16 = T4 - T5;
+			      Tt = Ip[WS(rs, 2)];
+			      Tu = Im[WS(rs, 1)];
+			      Tv = Tt - Tu;
+			      TV = Tt + Tu;
+			 }
+			 T7 = T3 + T6;
+			 T1f = TU + TV;
+			 T1i = T17 - T16;
+			 Tw = Ts + Tv;
+			 TI = T3 - T6;
+			 TW = TU - TV;
+			 T18 = T16 + T17;
+			 TM = Ts - Tv;
+		    }
+		    {
+			 E Ta, TX, Tz, TY, Td, T10, TC, T11;
+			 {
+			      E T8, T9, Tx, Ty;
+			      T8 = Rp[WS(rs, 1)];
+			      T9 = Rm[WS(rs, 2)];
+			      Ta = T8 + T9;
+			      TX = T8 - T9;
+			      Tx = Ip[WS(rs, 1)];
+			      Ty = Im[WS(rs, 2)];
+			      Tz = Tx - Ty;
+			      TY = Tx + Ty;
+			 }
+			 {
+			      E Tb, Tc, TA, TB;
+			      Tb = Rm[0];
+			      Tc = Rp[WS(rs, 3)];
+			      Td = Tb + Tc;
+			      T10 = Tb - Tc;
+			      TA = Ip[WS(rs, 3)];
+			      TB = Im[0];
+			      TC = TA - TB;
+			      T11 = TA + TB;
+			 }
+			 Te = Ta + Td;
+			 T19 = TX + TY;
+			 T1a = T10 + T11;
+			 TD = Tz + TC;
+			 TJ = TC - Tz;
+			 TZ = TX - TY;
+			 T12 = T10 - T11;
+			 TN = Ta - Td;
+		    }
+		    Rp[0] = T7 + Te;
+		    Rm[0] = Tw + TD;
+		    Tm = T7 - Te;
+		    TE = Tw - TD;
+		    Rp[WS(rs, 2)] = FNMS(Tp, TE, Tl * Tm);
+		    Rm[WS(rs, 2)] = FMA(Tp, Tm, Tl * TE);
+		    {
+			 E TQ, TS, TK, TO;
+			 TQ = TI + TJ;
+			 TS = TN + TM;
+			 Rp[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
+			 Rm[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
+			 TK = TI - TJ;
+			 TO = TM - TN;
+			 Rp[WS(rs, 3)] = FNMS(TL, TO, TH * TK);
+			 Rm[WS(rs, 3)] = FMA(TH, TO, TL * TK);
+		    }
+		    {
+			 E T1h, T1l, T1k, T1m, T1g, T1j;
+			 T1g = KP707106781 * (T19 + T1a);
+			 T1h = T1f - T1g;
+			 T1l = T1f + T1g;
+			 T1j = KP707106781 * (TZ - T12);
+			 T1k = T1i + T1j;
+			 T1m = T1i - T1j;
+			 Ip[WS(rs, 1)] = FNMS(Tj, T1k, Tg * T1h);
+			 Im[WS(rs, 1)] = FMA(Tg, T1k, Tj * T1h);
+			 Ip[WS(rs, 3)] = FNMS(TG, T1m, TF * T1l);
+			 Im[WS(rs, 3)] = FMA(TF, T1m, TG * T1l);
+		    }
+		    {
+			 E T14, T1d, T1c, T1e, T13, T1b;
+			 T13 = KP707106781 * (TZ + T12);
+			 T14 = TW - T13;
+			 T1d = TW + T13;
+			 T1b = KP707106781 * (T19 - T1a);
+			 T1c = T18 - T1b;
+			 T1e = T18 + T1b;
+			 Ip[WS(rs, 2)] = FNMS(T15, T1c, TT * T14);
+			 Im[WS(rs, 2)] = FMA(T15, T14, TT * T1c);
+			 Ip[0] = FNMS(Ti, T1e, Tf * T1d);
+			 Im[0] = FMA(Ti, T1d, Tf * T1e);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, {56, 26, 18, 0} };
+
+void X(codelet_hc2cb2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:53 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include hc2cb.h */
+
+/*
+ * This function contains 102 FP additions, 72 FP multiplications,
+ * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
+ * 71 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T21, T1Y, T1X;
+	       {
+		    E T1B, TH, T1g, T3, T1V, T1x, T1G, T1E, TM, TK, T11, TB, T7, T1m, T1J;
+		    E TO, Th, T1h, T6, T8, TF, TG, T1i, T9;
+		    TF = Ip[0];
+		    TG = Im[WS(rs, 4)];
+		    {
+			 E T1u, Tp, Tu, T1s, Tz, T1v, Ts, Tv;
+			 {
+			      E Tx, Ty, Tn, To, Tq, Tr;
+			      Tn = Ip[WS(rs, 4)];
+			      To = Im[0];
+			      Tx = Ip[WS(rs, 3)];
+			      T1B = TF + TG;
+			      TH = TF - TG;
+			      T1u = Tn + To;
+			      Tp = Tn - To;
+			      Ty = Im[WS(rs, 1)];
+			      Tq = Ip[WS(rs, 1)];
+			      Tr = Im[WS(rs, 3)];
+			      Tu = Ip[WS(rs, 2)];
+			      T1s = Tx + Ty;
+			      Tz = Tx - Ty;
+			      T1v = Tq + Tr;
+			      Ts = Tq - Tr;
+			      Tv = Im[WS(rs, 2)];
+			 }
+			 {
+			      E T1, T1w, T1D, TJ, Tt, T1r, Tw, T2;
+			      T1 = Rp[0];
+			      T1w = T1u + T1v;
+			      T1D = T1u - T1v;
+			      TJ = Tp + Ts;
+			      Tt = Tp - Ts;
+			      T1r = Tu + Tv;
+			      Tw = Tu - Tv;
+			      T2 = Rm[WS(rs, 4)];
+			      {
+				   E Tb, Tc, Te, Tf;
+				   Tb = Rp[WS(rs, 4)];
+				   {
+					E T1t, T1C, TI, TA;
+					T1t = T1r + T1s;
+					T1C = T1r - T1s;
+					TI = Tw + Tz;
+					TA = Tw - Tz;
+					T1g = T1 - T2;
+					T3 = T1 + T2;
+					T1V = FNMS(KP618033988, T1t, T1w);
+					T1x = FMA(KP618033988, T1w, T1t);
+					T1G = T1C - T1D;
+					T1E = T1C + T1D;
+					TM = TI - TJ;
+					TK = TI + TJ;
+					T11 = FMA(KP618033988, Tt, TA);
+					TB = FNMS(KP618033988, TA, Tt);
+					Tc = Rm[0];
+				   }
+				   Te = Rm[WS(rs, 3)];
+				   Tf = Rp[WS(rs, 1)];
+				   {
+					E T4, T1k, Td, T1l, Tg, T5;
+					T4 = Rp[WS(rs, 2)];
+					T1k = Tb - Tc;
+					Td = Tb + Tc;
+					T1l = Te - Tf;
+					Tg = Te + Tf;
+					T5 = Rm[WS(rs, 2)];
+					T7 = Rm[WS(rs, 1)];
+					T1m = T1k + T1l;
+					T1J = T1k - T1l;
+					TO = Td - Tg;
+					Th = Td + Tg;
+					T1h = T4 - T5;
+					T6 = T4 + T5;
+					T8 = Rp[WS(rs, 3)];
+				   }
+			      }
+			 }
+		    }
+		    Rm[0] = TH + TK;
+		    T1i = T7 - T8;
+		    T9 = T7 + T8;
+		    {
+			 E T2d, T1F, T29, T1I, TP, T2c, T1p, Tl, T1o, Tk, T2b, T2e, T17, T14, T13;
+			 T2d = T1B + T1E;
+			 T1F = FNMS(KP250000000, T1E, T1B);
+			 {
+			      E T1j, Ta, T1n, Ti, T2a;
+			      T29 = W[8];
+			      T1I = T1h - T1i;
+			      T1j = T1h + T1i;
+			      TP = T6 - T9;
+			      Ta = T6 + T9;
+			      T2c = W[9];
+			      T1p = T1j - T1m;
+			      T1n = T1j + T1m;
+			      Tl = Ta - Th;
+			      Ti = Ta + Th;
+			      T1o = FNMS(KP250000000, T1n, T1g);
+			      T2a = T1g + T1n;
+			      Rp[0] = T3 + Ti;
+			      Tk = FNMS(KP250000000, Ti, T3);
+			      T2b = T29 * T2a;
+			      T2e = T2c * T2a;
+			 }
+			 {
+			      E T16, TQ, T10, Tm, TL;
+			      T16 = FMA(KP618033988, TO, TP);
+			      TQ = FNMS(KP618033988, TP, TO);
+			      Ip[WS(rs, 2)] = FNMS(T2c, T2d, T2b);
+			      Im[WS(rs, 2)] = FMA(T29, T2d, T2e);
+			      T10 = FMA(KP559016994, Tl, Tk);
+			      Tm = FNMS(KP559016994, Tl, Tk);
+			      TL = FNMS(KP250000000, TK, TH);
+			      {
+				   E TE, TU, T12, TR, TX, T1d, T1c, T19, TD, T1e, T1b, TW, TT;
+				   {
+					E TC, T15, T1a, TS, Tj, TN;
+					TE = W[3];
+					TC = FMA(KP951056516, TB, Tm);
+					TU = FNMS(KP951056516, TB, Tm);
+					TN = FNMS(KP559016994, TM, TL);
+					T15 = FMA(KP559016994, TM, TL);
+					T12 = FMA(KP951056516, T11, T10);
+					T1a = FNMS(KP951056516, T11, T10);
+					TS = TE * TC;
+					TR = FNMS(KP951056516, TQ, TN);
+					TX = FMA(KP951056516, TQ, TN);
+					Tj = W[2];
+					T1d = FMA(KP951056516, T16, T15);
+					T17 = FNMS(KP951056516, T16, T15);
+					T1c = W[11];
+					T19 = W[10];
+					Rm[WS(rs, 1)] = FMA(Tj, TR, TS);
+					TD = Tj * TC;
+					T1e = T1c * T1a;
+					T1b = T19 * T1a;
+				   }
+				   Rp[WS(rs, 1)] = FNMS(TE, TR, TD);
+				   Rm[WS(rs, 3)] = FMA(T19, T1d, T1e);
+				   Rp[WS(rs, 3)] = FNMS(T1c, T1d, T1b);
+				   TW = W[15];
+				   TT = W[14];
+				   {
+					E TZ, T18, TY, TV;
+					T14 = W[7];
+					TY = TW * TU;
+					TV = TT * TU;
+					TZ = W[6];
+					T18 = T14 * T12;
+					Rm[WS(rs, 4)] = FMA(TT, TX, TY);
+					Rp[WS(rs, 4)] = FNMS(TW, TX, TV);
+					T13 = TZ * T12;
+					Rm[WS(rs, 2)] = FMA(TZ, T17, T18);
+				   }
+			      }
+			 }
+			 {
+			      E T20, T1K, T1q, T1U;
+			      T20 = FNMS(KP618033988, T1I, T1J);
+			      T1K = FMA(KP618033988, T1J, T1I);
+			      Rp[WS(rs, 2)] = FNMS(T14, T17, T13);
+			      T1q = FMA(KP559016994, T1p, T1o);
+			      T1U = FNMS(KP559016994, T1p, T1o);
+			      {
+				   E T1A, T1O, T1W, T1R, T1L, T27, T26, T23, T1z, T28, T25, T1Q, T1N;
+				   {
+					E T1y, T1Z, T24, T1M, T1f, T1H;
+					T1A = W[1];
+					T1O = FMA(KP951056516, T1x, T1q);
+					T1y = FNMS(KP951056516, T1x, T1q);
+					T1Z = FNMS(KP559016994, T1G, T1F);
+					T1H = FMA(KP559016994, T1G, T1F);
+					T24 = FMA(KP951056516, T1V, T1U);
+					T1W = FNMS(KP951056516, T1V, T1U);
+					T1M = T1A * T1y;
+					T1R = FNMS(KP951056516, T1K, T1H);
+					T1L = FMA(KP951056516, T1K, T1H);
+					T1f = W[0];
+					T21 = FMA(KP951056516, T20, T1Z);
+					T27 = FNMS(KP951056516, T20, T1Z);
+					T26 = W[13];
+					T23 = W[12];
+					Im[0] = FMA(T1f, T1L, T1M);
+					T1z = T1f * T1y;
+					T28 = T26 * T24;
+					T25 = T23 * T24;
+				   }
+				   Ip[0] = FNMS(T1A, T1L, T1z);
+				   Im[WS(rs, 3)] = FMA(T23, T27, T28);
+				   Ip[WS(rs, 3)] = FNMS(T26, T27, T25);
+				   T1Q = W[17];
+				   T1N = W[16];
+				   {
+					E T1T, T22, T1S, T1P;
+					T1Y = W[5];
+					T1S = T1Q * T1O;
+					T1P = T1N * T1O;
+					T1T = W[4];
+					T22 = T1Y * T1W;
+					Im[WS(rs, 4)] = FMA(T1N, T1R, T1S);
+					Ip[WS(rs, 4)] = FNMS(T1Q, T1R, T1P);
+					T1X = T1T * T1W;
+					Im[WS(rs, 1)] = FMA(T1T, T21, T22);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 1)] = FNMS(T1Y, T21, T1X);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, {48, 18, 54, 0} };
+
+void X(codelet_hc2cb_10) (planner *p) {
+     X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include hc2cb.h */
+
+/*
+ * This function contains 102 FP additions, 60 FP multiplications,
+ * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
+ * 39 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T3, T18, TJ, T1i, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, Tt, TA, T1w;
+	       E T1v, T1p, T1E, TM, TO;
+	       {
+		    E T1, T2, TH, TI;
+		    T1 = Rp[0];
+		    T2 = Rm[WS(rs, 4)];
+		    T3 = T1 + T2;
+		    T18 = T1 - T2;
+		    TH = Ip[0];
+		    TI = Im[WS(rs, 4)];
+		    TJ = TH - TI;
+		    T1i = TH + TI;
+	       }
+	       {
+		    E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
+		    {
+			 E T4, T5, Te, Tf;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 2)];
+			 T6 = T4 + T5;
+			 T19 = T4 - T5;
+			 Te = Rm[WS(rs, 3)];
+			 Tf = Rp[WS(rs, 1)];
+			 Tg = Te + Tf;
+			 T1d = Te - Tf;
+		    }
+		    {
+			 E T7, T8, Tb, Tc;
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 3)];
+			 T9 = T7 + T8;
+			 T1a = T7 - T8;
+			 Tb = Rp[WS(rs, 4)];
+			 Tc = Rm[0];
+			 Td = Tb + Tc;
+			 T1c = Tb - Tc;
+		    }
+		    TE = T6 - T9;
+		    TF = Td - Tg;
+		    T1B = T1c - T1d;
+		    T1A = T19 - T1a;
+		    {
+			 E T1b, T1e, Ta, Th;
+			 T1b = T19 + T1a;
+			 T1e = T1c + T1d;
+			 T1f = T1b + T1e;
+			 T1t = KP559016994 * (T1b - T1e);
+			 Ta = T6 + T9;
+			 Th = Td + Tg;
+			 Ti = Ta + Th;
+			 Tl = KP559016994 * (Ta - Th);
+		    }
+	       }
+	       {
+		    E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
+		    {
+			 E Tn, To, Tx, Ty;
+			 Tn = Ip[WS(rs, 2)];
+			 To = Im[WS(rs, 2)];
+			 Tp = Tn - To;
+			 T1j = Tn + To;
+			 Tx = Ip[WS(rs, 1)];
+			 Ty = Im[WS(rs, 3)];
+			 Tz = Tx - Ty;
+			 T1n = Tx + Ty;
+		    }
+		    {
+			 E Tq, Tr, Tu, Tv;
+			 Tq = Ip[WS(rs, 3)];
+			 Tr = Im[WS(rs, 1)];
+			 Ts = Tq - Tr;
+			 T1k = Tq + Tr;
+			 Tu = Ip[WS(rs, 4)];
+			 Tv = Im[0];
+			 Tw = Tu - Tv;
+			 T1m = Tu + Tv;
+		    }
+		    Tt = Tp - Ts;
+		    TA = Tw - Tz;
+		    T1w = T1m + T1n;
+		    T1v = T1j + T1k;
+		    {
+			 E T1l, T1o, TK, TL;
+			 T1l = T1j - T1k;
+			 T1o = T1m - T1n;
+			 T1p = T1l + T1o;
+			 T1E = KP559016994 * (T1l - T1o);
+			 TK = Tp + Ts;
+			 TL = Tw + Tz;
+			 TM = TK + TL;
+			 TO = KP559016994 * (TK - TL);
+		    }
+	       }
+	       Rp[0] = T3 + Ti;
+	       Rm[0] = TJ + TM;
+	       {
+		    E T1g, T1q, T17, T1h;
+		    T1g = T18 + T1f;
+		    T1q = T1i + T1p;
+		    T17 = W[8];
+		    T1h = W[9];
+		    Ip[WS(rs, 2)] = FNMS(T1h, T1q, T17 * T1g);
+		    Im[WS(rs, 2)] = FMA(T1h, T1g, T17 * T1q);
+	       }
+	       {
+		    E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
+		    TB = FNMS(KP951056516, TA, KP587785252 * Tt);
+		    TG = FNMS(KP951056516, TF, KP587785252 * TE);
+		    T11 = FMA(KP951056516, TE, KP587785252 * TF);
+		    TX = FMA(KP951056516, Tt, KP587785252 * TA);
+		    TN = FNMS(KP250000000, TM, TJ);
+		    TP = TN - TO;
+		    T10 = TO + TN;
+		    Tk = FNMS(KP250000000, Ti, T3);
+		    Tm = Tk - Tl;
+		    TW = Tl + Tk;
+		    {
+			 E TC, TQ, Tj, TD;
+			 TC = Tm - TB;
+			 TQ = TG + TP;
+			 Tj = W[2];
+			 TD = W[3];
+			 Rp[WS(rs, 1)] = FNMS(TD, TQ, Tj * TC);
+			 Rm[WS(rs, 1)] = FMA(TD, TC, Tj * TQ);
+		    }
+		    {
+			 E T14, T16, T13, T15;
+			 T14 = TW - TX;
+			 T16 = T11 + T10;
+			 T13 = W[10];
+			 T15 = W[11];
+			 Rp[WS(rs, 3)] = FNMS(T15, T16, T13 * T14);
+			 Rm[WS(rs, 3)] = FMA(T15, T14, T13 * T16);
+		    }
+		    {
+			 E TS, TU, TR, TT;
+			 TS = Tm + TB;
+			 TU = TP - TG;
+			 TR = W[14];
+			 TT = W[15];
+			 Rp[WS(rs, 4)] = FNMS(TT, TU, TR * TS);
+			 Rm[WS(rs, 4)] = FMA(TT, TS, TR * TU);
+		    }
+		    {
+			 E TY, T12, TV, TZ;
+			 TY = TW + TX;
+			 T12 = T10 - T11;
+			 TV = W[6];
+			 TZ = W[7];
+			 Rp[WS(rs, 2)] = FNMS(TZ, T12, TV * TY);
+			 Rm[WS(rs, 2)] = FMA(TZ, TY, TV * T12);
+		    }
+	       }
+	       {
+		    E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
+		    T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
+		    T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
+		    T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
+		    T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
+		    T1D = FNMS(KP250000000, T1p, T1i);
+		    T1F = T1D - T1E;
+		    T1R = T1E + T1D;
+		    T1s = FNMS(KP250000000, T1f, T18);
+		    T1u = T1s - T1t;
+		    T1M = T1t + T1s;
+		    {
+			 E T1y, T1G, T1r, T1z;
+			 T1y = T1u - T1x;
+			 T1G = T1C + T1F;
+			 T1r = W[12];
+			 T1z = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T1z, T1G, T1r * T1y);
+			 Im[WS(rs, 3)] = FMA(T1r, T1G, T1z * T1y);
+		    }
+		    {
+			 E T1U, T1W, T1T, T1V;
+			 T1U = T1M + T1N;
+			 T1W = T1R - T1Q;
+			 T1T = W[16];
+			 T1V = W[17];
+			 Ip[WS(rs, 4)] = FNMS(T1V, T1W, T1T * T1U);
+			 Im[WS(rs, 4)] = FMA(T1T, T1W, T1V * T1U);
+		    }
+		    {
+			 E T1I, T1K, T1H, T1J;
+			 T1I = T1u + T1x;
+			 T1K = T1F - T1C;
+			 T1H = W[4];
+			 T1J = W[5];
+			 Ip[WS(rs, 1)] = FNMS(T1J, T1K, T1H * T1I);
+			 Im[WS(rs, 1)] = FMA(T1H, T1K, T1J * T1I);
+		    }
+		    {
+			 E T1O, T1S, T1L, T1P;
+			 T1O = T1M - T1N;
+			 T1S = T1Q + T1R;
+			 T1L = W[0];
+			 T1P = W[1];
+			 Ip[0] = FNMS(T1P, T1S, T1L * T1O);
+			 Im[0] = FMA(T1L, T1S, T1P * T1O);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, {72, 30, 30, 0} };
+
+void X(codelet_hc2cb_10) (planner *p) {
+     X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:53 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include hc2cb.h */
+
+/*
+ * This function contains 118 FP additions, 68 FP multiplications,
+ * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
+ * 64 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T1U, T1X, T1W, T1Y, T1V;
+	       {
+		    E T18, T20, T21, T1b, T2a, T1s, T29, T1p, TO, T11, To, Tb, Tg, T23, T1f;
+		    E Tl, Ty, Tt, T1i, T24, T1z, T2d, T1w, T2c;
+		    {
+			 E T5, Ta, TN, TI;
+			 {
+			      E T1, TE, T6, TM, T7, T1o, T4, T17, TH, T8, TJ, TK;
+			      T1 = Rp[0];
+			      TE = Ip[0];
+			      T6 = Rm[WS(rs, 5)];
+			      TM = Im[WS(rs, 5)];
+			      {
+				   E T2, T3, TF, TG;
+				   T2 = Rp[WS(rs, 4)];
+				   T3 = Rm[WS(rs, 3)];
+				   TF = Ip[WS(rs, 4)];
+				   TG = Im[WS(rs, 3)];
+				   T7 = Rm[WS(rs, 1)];
+				   T1o = T2 - T3;
+				   T4 = T2 + T3;
+				   T17 = TF + TG;
+				   TH = TF - TG;
+				   T8 = Rp[WS(rs, 2)];
+				   TJ = Ip[WS(rs, 2)];
+				   TK = Im[WS(rs, 1)];
+			      }
+			      {
+				   E T1r, T1a, T19, T1q, T9, TL, T16, T1n;
+				   T5 = T1 + T4;
+				   T16 = FNMS(KP500000000, T4, T1);
+				   T1r = T7 - T8;
+				   T9 = T7 + T8;
+				   T1a = TJ + TK;
+				   TL = TJ - TK;
+				   T18 = FNMS(KP866025403, T17, T16);
+				   T20 = FMA(KP866025403, T17, T16);
+				   T19 = FNMS(KP500000000, T9, T6);
+				   Ta = T6 + T9;
+				   TN = TL - TM;
+				   T1q = FMA(KP500000000, TL, TM);
+				   T1n = FNMS(KP500000000, TH, TE);
+				   TI = TE + TH;
+				   T21 = FNMS(KP866025403, T1a, T19);
+				   T1b = FMA(KP866025403, T1a, T19);
+				   T2a = FMA(KP866025403, T1r, T1q);
+				   T1s = FNMS(KP866025403, T1r, T1q);
+				   T29 = FNMS(KP866025403, T1o, T1n);
+				   T1p = FMA(KP866025403, T1o, T1n);
+			      }
+			 }
+			 {
+			      E Tc, Tp, Th, Tx, Ti, Tf, T1v, Ts, T1e, Tj, Tu, Tv;
+			      Tc = Rp[WS(rs, 3)];
+			      TO = TI - TN;
+			      T11 = TI + TN;
+			      Tp = Ip[WS(rs, 3)];
+			      To = T5 - Ta;
+			      Tb = T5 + Ta;
+			      Th = Rm[WS(rs, 2)];
+			      Tx = Im[WS(rs, 2)];
+			      {
+				   E Td, Te, Tq, Tr;
+				   Td = Rm[WS(rs, 4)];
+				   Te = Rm[0];
+				   Tq = Im[WS(rs, 4)];
+				   Tr = Im[0];
+				   Ti = Rp[WS(rs, 1)];
+				   Tf = Td + Te;
+				   T1v = Td - Te;
+				   Ts = Tq + Tr;
+				   T1e = Tq - Tr;
+				   Tj = Rp[WS(rs, 5)];
+				   Tu = Ip[WS(rs, 1)];
+				   Tv = Ip[WS(rs, 5)];
+			      }
+			      {
+				   E T1y, T1h, T1g, T1x, Tk, Tw, T1d, T1u;
+				   T1d = FNMS(KP500000000, Tf, Tc);
+				   Tg = Tc + Tf;
+				   Tk = Ti + Tj;
+				   T1y = Ti - Tj;
+				   Tw = Tu + Tv;
+				   T1h = Tv - Tu;
+				   T23 = FNMS(KP866025403, T1e, T1d);
+				   T1f = FMA(KP866025403, T1e, T1d);
+				   Tl = Th + Tk;
+				   T1g = FNMS(KP500000000, Tk, Th);
+				   T1x = FMA(KP500000000, Tw, Tx);
+				   Ty = Tw - Tx;
+				   Tt = Tp - Ts;
+				   T1u = FMA(KP500000000, Ts, Tp);
+				   T1i = FMA(KP866025403, T1h, T1g);
+				   T24 = FNMS(KP866025403, T1h, T1g);
+				   T1z = FNMS(KP866025403, T1y, T1x);
+				   T2d = FMA(KP866025403, T1y, T1x);
+				   T1w = FMA(KP866025403, T1v, T1u);
+				   T2c = FNMS(KP866025403, T1v, T1u);
+			      }
+			 }
+		    }
+		    {
+			 E TY, T13, TX, T10;
+			 {
+			      E Tn, T12, TC, Tm, TD, TS, TA, Tz;
+			      Tn = W[16];
+			      T12 = Tt + Ty;
+			      Tz = Tt - Ty;
+			      TC = W[17];
+			      Tm = Tg + Tl;
+			      TD = Tg - Tl;
+			      TS = To + Tz;
+			      TA = To - Tz;
+			      {
+				   E TV, TU, TW, TT;
+				   {
+					E TQ, TR, TP, TB;
+					TV = TO - TD;
+					TP = TD + TO;
+					Rp[0] = Tb + Tm;
+					TB = Tn * TA;
+					TQ = Tn * TP;
+					TR = W[4];
+					Ip[WS(rs, 4)] = FNMS(TC, TP, TB);
+					TU = W[5];
+					Im[WS(rs, 4)] = FMA(TC, TA, TQ);
+					TW = TR * TV;
+					TT = TR * TS;
+				   }
+				   Im[WS(rs, 1)] = FMA(TU, TS, TW);
+				   Ip[WS(rs, 1)] = FNMS(TU, TV, TT);
+				   TY = Tb - Tm;
+				   T13 = T11 - T12;
+				   TX = W[10];
+				   T10 = W[11];
+				   Rm[0] = T11 + T12;
+			      }
+			 }
+			 {
+			      E T1K, T1Q, T1P, T1L, T2o, T2u, T2t, T2p;
+			      {
+				   E T1E, T1D, T1H, T1F, T1G, T1t, T1k, T1A;
+				   {
+					E T1c, TZ, T14, T1j;
+					T1K = T18 - T1b;
+					T1c = T18 + T1b;
+					TZ = TX * TY;
+					T14 = T10 * TY;
+					T1j = T1f + T1i;
+					T1Q = T1f - T1i;
+					T1P = T1p + T1s;
+					T1t = T1p - T1s;
+					Rp[WS(rs, 3)] = FNMS(T10, T13, TZ);
+					Rm[WS(rs, 3)] = FMA(TX, T13, T14);
+					T1E = T1c + T1j;
+					T1k = T1c - T1j;
+					T1A = T1w - T1z;
+					T1L = T1w + T1z;
+				   }
+				   {
+					E T15, T1m, T1B, T1l, T1C;
+					T15 = W[18];
+					T1m = W[19];
+					T1D = W[6];
+					T1H = T1t + T1A;
+					T1B = T1t - T1A;
+					T1l = T15 * T1k;
+					T1C = T1m * T1k;
+					T1F = T1D * T1E;
+					T1G = W[7];
+					Rp[WS(rs, 5)] = FNMS(T1m, T1B, T1l);
+					Rm[WS(rs, 5)] = FMA(T15, T1B, T1C);
+				   }
+				   {
+					E T26, T2i, T2l, T2f, T1Z, T28;
+					{
+					     E T22, T1I, T25, T2b, T2e;
+					     T22 = T20 + T21;
+					     T2o = T20 - T21;
+					     Rp[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
+					     T1I = T1G * T1E;
+					     T2u = T23 - T24;
+					     T25 = T23 + T24;
+					     T2b = T29 - T2a;
+					     T2t = T29 + T2a;
+					     T2p = T2c + T2d;
+					     T2e = T2c - T2d;
+					     Rm[WS(rs, 2)] = FMA(T1D, T1H, T1I);
+					     T26 = T22 - T25;
+					     T2i = T22 + T25;
+					     T2l = T2b + T2e;
+					     T2f = T2b - T2e;
+					}
+					T1Z = W[2];
+					T28 = W[3];
+					{
+					     E T2h, T2k, T27, T2g, T2j, T2m;
+					     T2h = W[14];
+					     T2k = W[15];
+					     T27 = T1Z * T26;
+					     T2g = T28 * T26;
+					     T2j = T2h * T2i;
+					     T2m = T2k * T2i;
+					     Rp[WS(rs, 1)] = FNMS(T28, T2f, T27);
+					     Rm[WS(rs, 1)] = FMA(T1Z, T2f, T2g);
+					     Rp[WS(rs, 4)] = FNMS(T2k, T2l, T2j);
+					     Rm[WS(rs, 4)] = FMA(T2h, T2l, T2m);
+					}
+				   }
+			      }
+			      {
+				   E T2y, T2B, T2A, T2C, T2z;
+				   {
+					E T2n, T2q, T2v, T2s, T2r, T2x, T2w;
+					T2n = W[8];
+					T2y = T2o + T2p;
+					T2q = T2o - T2p;
+					T2B = T2t - T2u;
+					T2v = T2t + T2u;
+					T2s = W[9];
+					T2r = T2n * T2q;
+					T2x = W[20];
+					T2w = T2n * T2v;
+					T2A = W[21];
+					Ip[WS(rs, 2)] = FNMS(T2s, T2v, T2r);
+					T2C = T2x * T2B;
+					T2z = T2x * T2y;
+					Im[WS(rs, 2)] = FMA(T2s, T2q, T2w);
+				   }
+				   Im[WS(rs, 5)] = FMA(T2A, T2y, T2C);
+				   Ip[WS(rs, 5)] = FNMS(T2A, T2B, T2z);
+				   {
+					E T1J, T1M, T1R, T1O, T1N, T1T, T1S;
+					T1J = W[0];
+					T1U = T1K + T1L;
+					T1M = T1K - T1L;
+					T1X = T1P - T1Q;
+					T1R = T1P + T1Q;
+					T1O = W[1];
+					T1N = T1J * T1M;
+					T1T = W[12];
+					T1S = T1J * T1R;
+					T1W = W[13];
+					Ip[0] = FNMS(T1O, T1R, T1N);
+					T1Y = T1T * T1X;
+					T1V = T1T * T1U;
+					Im[0] = FMA(T1O, T1M, T1S);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[WS(rs, 3)] = FMA(T1W, T1U, T1Y);
+	       Ip[WS(rs, 3)] = FNMS(T1W, T1X, T1V);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, {72, 22, 46, 0} };
+
+void X(codelet_hc2cb_12) (planner *p) {
+     X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include hc2cb.h */
+
+/*
+ * This function contains 118 FP additions, 60 FP multiplications,
+ * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
+ * 39 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T5, TH, T12, T1M, T1i, T1U, Tl, Ty, T1c, T1Y, T1s, T1Q, Ta, TM, T15;
+	       E T1N, T1l, T1V, Tg, Tt, T19, T1X, T1p, T1P;
+	       {
+		    E T1, TD, T4, T1g, TG, T11, T10, T1h;
+		    T1 = Rp[0];
+		    TD = Ip[0];
+		    {
+			 E T2, T3, TE, TF;
+			 T2 = Rp[WS(rs, 4)];
+			 T3 = Rm[WS(rs, 3)];
+			 T4 = T2 + T3;
+			 T1g = KP866025403 * (T2 - T3);
+			 TE = Ip[WS(rs, 4)];
+			 TF = Im[WS(rs, 3)];
+			 TG = TE - TF;
+			 T11 = KP866025403 * (TE + TF);
+		    }
+		    T5 = T1 + T4;
+		    TH = TD + TG;
+		    T10 = FNMS(KP500000000, T4, T1);
+		    T12 = T10 - T11;
+		    T1M = T10 + T11;
+		    T1h = FNMS(KP500000000, TG, TD);
+		    T1i = T1g + T1h;
+		    T1U = T1h - T1g;
+	       }
+	       {
+		    E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
+		    Th = Rm[WS(rs, 2)];
+		    Tx = Im[WS(rs, 2)];
+		    {
+			 E Ti, Tj, Tu, Tv;
+			 Ti = Rp[WS(rs, 1)];
+			 Tj = Rp[WS(rs, 5)];
+			 Tk = Ti + Tj;
+			 T1a = KP866025403 * (Ti - Tj);
+			 Tu = Ip[WS(rs, 1)];
+			 Tv = Ip[WS(rs, 5)];
+			 Tw = Tu + Tv;
+			 T1r = KP866025403 * (Tv - Tu);
+		    }
+		    Tl = Th + Tk;
+		    Ty = Tw - Tx;
+		    T1b = FMA(KP500000000, Tw, Tx);
+		    T1c = T1a - T1b;
+		    T1Y = T1a + T1b;
+		    T1q = FNMS(KP500000000, Tk, Th);
+		    T1s = T1q + T1r;
+		    T1Q = T1q - T1r;
+	       }
+	       {
+		    E T6, TL, T9, T1j, TK, T14, T13, T1k;
+		    T6 = Rm[WS(rs, 5)];
+		    TL = Im[WS(rs, 5)];
+		    {
+			 E T7, T8, TI, TJ;
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 2)];
+			 T9 = T7 + T8;
+			 T1j = KP866025403 * (T7 - T8);
+			 TI = Ip[WS(rs, 2)];
+			 TJ = Im[WS(rs, 1)];
+			 TK = TI - TJ;
+			 T14 = KP866025403 * (TI + TJ);
+		    }
+		    Ta = T6 + T9;
+		    TM = TK - TL;
+		    T13 = FNMS(KP500000000, T9, T6);
+		    T15 = T13 + T14;
+		    T1N = T13 - T14;
+		    T1k = FMA(KP500000000, TK, TL);
+		    T1l = T1j - T1k;
+		    T1V = T1j + T1k;
+	       }
+	       {
+		    E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
+		    Tc = Rp[WS(rs, 3)];
+		    Tp = Ip[WS(rs, 3)];
+		    {
+			 E Td, Te, Tq, Tr;
+			 Td = Rm[WS(rs, 4)];
+			 Te = Rm[0];
+			 Tf = Td + Te;
+			 T17 = KP866025403 * (Td - Te);
+			 Tq = Im[WS(rs, 4)];
+			 Tr = Im[0];
+			 Ts = Tq + Tr;
+			 T1o = KP866025403 * (Tq - Tr);
+		    }
+		    Tg = Tc + Tf;
+		    Tt = Tp - Ts;
+		    T18 = FMA(KP500000000, Ts, Tp);
+		    T19 = T17 + T18;
+		    T1X = T18 - T17;
+		    T1n = FNMS(KP500000000, Tf, Tc);
+		    T1p = T1n + T1o;
+		    T1P = T1n - T1o;
+	       }
+	       {
+		    E Tb, Tm, TU, TW, TX, TY, TT, TV;
+		    Tb = T5 + Ta;
+		    Tm = Tg + Tl;
+		    TU = Tb - Tm;
+		    TW = TH + TM;
+		    TX = Tt + Ty;
+		    TY = TW - TX;
+		    Rp[0] = Tb + Tm;
+		    Rm[0] = TW + TX;
+		    TT = W[10];
+		    TV = W[11];
+		    Rp[WS(rs, 3)] = FNMS(TV, TY, TT * TU);
+		    Rm[WS(rs, 3)] = FMA(TV, TU, TT * TY);
+	       }
+	       {
+		    E TA, TQ, TO, TS;
+		    {
+			 E To, Tz, TC, TN;
+			 To = T5 - Ta;
+			 Tz = Tt - Ty;
+			 TA = To - Tz;
+			 TQ = To + Tz;
+			 TC = Tg - Tl;
+			 TN = TH - TM;
+			 TO = TC + TN;
+			 TS = TN - TC;
+		    }
+		    {
+			 E Tn, TB, TP, TR;
+			 Tn = W[16];
+			 TB = W[17];
+			 Ip[WS(rs, 4)] = FNMS(TB, TO, Tn * TA);
+			 Im[WS(rs, 4)] = FMA(Tn, TO, TB * TA);
+			 TP = W[4];
+			 TR = W[5];
+			 Ip[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
+			 Im[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
+		    }
+	       }
+	       {
+		    E T28, T2e, T2c, T2g;
+		    {
+			 E T26, T27, T2a, T2b;
+			 T26 = T1M - T1N;
+			 T27 = T1X + T1Y;
+			 T28 = T26 - T27;
+			 T2e = T26 + T27;
+			 T2a = T1U + T1V;
+			 T2b = T1P - T1Q;
+			 T2c = T2a + T2b;
+			 T2g = T2a - T2b;
+		    }
+		    {
+			 E T25, T29, T2d, T2f;
+			 T25 = W[8];
+			 T29 = W[9];
+			 Ip[WS(rs, 2)] = FNMS(T29, T2c, T25 * T28);
+			 Im[WS(rs, 2)] = FMA(T25, T2c, T29 * T28);
+			 T2d = W[20];
+			 T2f = W[21];
+			 Ip[WS(rs, 5)] = FNMS(T2f, T2g, T2d * T2e);
+			 Im[WS(rs, 5)] = FMA(T2d, T2g, T2f * T2e);
+		    }
+	       }
+	       {
+		    E T1S, T22, T20, T24;
+		    {
+			 E T1O, T1R, T1W, T1Z;
+			 T1O = T1M + T1N;
+			 T1R = T1P + T1Q;
+			 T1S = T1O - T1R;
+			 T22 = T1O + T1R;
+			 T1W = T1U - T1V;
+			 T1Z = T1X - T1Y;
+			 T20 = T1W - T1Z;
+			 T24 = T1W + T1Z;
+		    }
+		    {
+			 E T1L, T1T, T21, T23;
+			 T1L = W[2];
+			 T1T = W[3];
+			 Rp[WS(rs, 1)] = FNMS(T1T, T20, T1L * T1S);
+			 Rm[WS(rs, 1)] = FMA(T1T, T1S, T1L * T20);
+			 T21 = W[14];
+			 T23 = W[15];
+			 Rp[WS(rs, 4)] = FNMS(T23, T24, T21 * T22);
+			 Rm[WS(rs, 4)] = FMA(T23, T22, T21 * T24);
+		    }
+	       }
+	       {
+		    E T1C, T1I, T1G, T1K;
+		    {
+			 E T1A, T1B, T1E, T1F;
+			 T1A = T12 + T15;
+			 T1B = T1p + T1s;
+			 T1C = T1A - T1B;
+			 T1I = T1A + T1B;
+			 T1E = T1i + T1l;
+			 T1F = T19 + T1c;
+			 T1G = T1E - T1F;
+			 T1K = T1E + T1F;
+		    }
+		    {
+			 E T1z, T1D, T1H, T1J;
+			 T1z = W[18];
+			 T1D = W[19];
+			 Rp[WS(rs, 5)] = FNMS(T1D, T1G, T1z * T1C);
+			 Rm[WS(rs, 5)] = FMA(T1D, T1C, T1z * T1G);
+			 T1H = W[6];
+			 T1J = W[7];
+			 Rp[WS(rs, 2)] = FNMS(T1J, T1K, T1H * T1I);
+			 Rm[WS(rs, 2)] = FMA(T1J, T1I, T1H * T1K);
+		    }
+	       }
+	       {
+		    E T1e, T1w, T1u, T1y;
+		    {
+			 E T16, T1d, T1m, T1t;
+			 T16 = T12 - T15;
+			 T1d = T19 - T1c;
+			 T1e = T16 - T1d;
+			 T1w = T16 + T1d;
+			 T1m = T1i - T1l;
+			 T1t = T1p - T1s;
+			 T1u = T1m + T1t;
+			 T1y = T1m - T1t;
+		    }
+		    {
+			 E TZ, T1f, T1v, T1x;
+			 TZ = W[0];
+			 T1f = W[1];
+			 Ip[0] = FNMS(T1f, T1u, TZ * T1e);
+			 Im[0] = FMA(TZ, T1u, T1f * T1e);
+			 T1v = W[12];
+			 T1x = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T1x, T1y, T1v * T1w);
+			 Im[WS(rs, 3)] = FMA(T1v, T1y, T1x * T1w);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, {88, 30, 30, 0} };
+
+void X(codelet_hc2cb_12) (planner *p) {
+     X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:53 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include hc2cb.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 78 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T1I, T1L, T1K, T1M, T1J;
+	       {
+		    E T1O, TA, T1h, T21, T3b, T2T, T3D, T3r, T1k, T1P, T3y, Tf, T36, T2A, T22;
+		    E TL, T2F, T2U, T3u, T3z, T2K, T2V, T12, Tu, T3E, TX, T1n, T17, T1T, T24;
+		    E T1W, T25;
+		    {
+			 E T2z, TF, TK, T2w;
+			 {
+			      E Tw, T3, T2Q, T1g, T1d, T6, T2R, Tz, Tb, TB, Ta, T2y, TE, Tc, TH;
+			      E TI;
+			      {
+				   E T4, T5, Tx, Ty;
+				   {
+					E T1, T2, T1e, T1f;
+					T1 = Rp[0];
+					T2 = Rm[WS(rs, 7)];
+					T1e = Ip[0];
+					T1f = Im[WS(rs, 7)];
+					T4 = Rp[WS(rs, 4)];
+					Tw = T1 - T2;
+					T3 = T1 + T2;
+					T2Q = T1e - T1f;
+					T1g = T1e + T1f;
+					T5 = Rm[WS(rs, 3)];
+					Tx = Ip[WS(rs, 4)];
+					Ty = Im[WS(rs, 3)];
+				   }
+				   {
+					E T8, T9, TC, TD;
+					T8 = Rp[WS(rs, 2)];
+					T1d = T4 - T5;
+					T6 = T4 + T5;
+					T2R = Tx - Ty;
+					Tz = Tx + Ty;
+					T9 = Rm[WS(rs, 5)];
+					TC = Ip[WS(rs, 2)];
+					TD = Im[WS(rs, 5)];
+					Tb = Rm[WS(rs, 1)];
+					TB = T8 - T9;
+					Ta = T8 + T9;
+					T2y = TC - TD;
+					TE = TC + TD;
+					Tc = Rp[WS(rs, 6)];
+					TH = Ip[WS(rs, 6)];
+					TI = Im[WS(rs, 1)];
+				   }
+			      }
+			      {
+				   E TG, T2x, TJ, Te, T2P, T2S, T3p, Td;
+				   T1O = Tw + Tz;
+				   TA = Tw - Tz;
+				   TG = Tb - Tc;
+				   Td = Tb + Tc;
+				   T2x = TH - TI;
+				   TJ = TH + TI;
+				   T1h = T1d + T1g;
+				   T21 = T1g - T1d;
+				   Te = Ta + Td;
+				   T2P = Ta - Td;
+				   T2S = T2Q - T2R;
+				   T3p = T2Q + T2R;
+				   {
+					E T1i, T1j, T3q, T7;
+					T3q = T2y + T2x;
+					T2z = T2x - T2y;
+					TF = TB - TE;
+					T1i = TB + TE;
+					T3b = T2S - T2P;
+					T2T = T2P + T2S;
+					TK = TG - TJ;
+					T1j = TG + TJ;
+					T3D = T3p - T3q;
+					T3r = T3p + T3q;
+					T2w = T3 - T6;
+					T7 = T3 + T6;
+					T1k = T1i - T1j;
+					T1P = T1i + T1j;
+					T3y = T7 - Te;
+					Tf = T7 + Te;
+				   }
+			      }
+			 }
+			 {
+			      E T13, Ti, T2C, T11, TY, Tl, T2D, T16, Tq, TS, Tp, T2H, TQ, Tr, TT;
+			      E TU;
+			      {
+				   E Tj, Tk, T14, T15;
+				   {
+					E Tg, Th, TZ, T10;
+					Tg = Rp[WS(rs, 1)];
+					T36 = T2w - T2z;
+					T2A = T2w + T2z;
+					T22 = TF - TK;
+					TL = TF + TK;
+					Th = Rm[WS(rs, 6)];
+					TZ = Ip[WS(rs, 1)];
+					T10 = Im[WS(rs, 6)];
+					Tj = Rp[WS(rs, 5)];
+					T13 = Tg - Th;
+					Ti = Tg + Th;
+					T2C = TZ - T10;
+					T11 = TZ + T10;
+					Tk = Rm[WS(rs, 2)];
+					T14 = Ip[WS(rs, 5)];
+					T15 = Im[WS(rs, 2)];
+				   }
+				   {
+					E Tn, To, TO, TP;
+					Tn = Rm[0];
+					TY = Tj - Tk;
+					Tl = Tj + Tk;
+					T2D = T14 - T15;
+					T16 = T14 + T15;
+					To = Rp[WS(rs, 7)];
+					TO = Ip[WS(rs, 7)];
+					TP = Im[0];
+					Tq = Rp[WS(rs, 3)];
+					TS = Tn - To;
+					Tp = Tn + To;
+					T2H = TO - TP;
+					TQ = TO + TP;
+					Tr = Rm[WS(rs, 4)];
+					TT = Ip[WS(rs, 3)];
+					TU = Im[WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E TN, TV, Tm, Tt;
+				   {
+					E T2E, T3s, Ts, T2B, T3t, T2J, T2I, T2G;
+					T2E = T2C - T2D;
+					T3s = T2C + T2D;
+					TN = Tq - Tr;
+					Ts = Tq + Tr;
+					T2I = TT - TU;
+					TV = TT + TU;
+					T2B = Ti - Tl;
+					Tm = Ti + Tl;
+					T3t = T2H + T2I;
+					T2J = T2H - T2I;
+					Tt = Tp + Ts;
+					T2G = Tp - Ts;
+					T2F = T2B - T2E;
+					T2U = T2B + T2E;
+					T3u = T3s + T3t;
+					T3z = T3t - T3s;
+					T2K = T2G + T2J;
+					T2V = T2J - T2G;
+				   }
+				   {
+					E T1U, T1V, T1R, T1S, TR, TW;
+					TR = TN - TQ;
+					T1U = TN + TQ;
+					T1V = TS + TV;
+					TW = TS - TV;
+					T1R = T11 - TY;
+					T12 = TY + T11;
+					Tu = Tm + Tt;
+					T3E = Tm - Tt;
+					TX = FNMS(KP414213562, TW, TR);
+					T1n = FMA(KP414213562, TR, TW);
+					T17 = T13 - T16;
+					T1S = T13 + T16;
+					T1T = FNMS(KP414213562, T1S, T1R);
+					T24 = FMA(KP414213562, T1R, T1S);
+					T1W = FNMS(KP414213562, T1V, T1U);
+					T25 = FMA(KP414213562, T1U, T1V);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T18, T1m, T2W, T2L, T3j, T3i, T3h;
+			 {
+			      E T3m, T3v, T3l, T3o;
+			      Rp[0] = Tf + Tu;
+			      T18 = FMA(KP414213562, T17, T12);
+			      T1m = FNMS(KP414213562, T12, T17);
+			      T3m = Tf - Tu;
+			      T3v = T3r - T3u;
+			      T3l = W[14];
+			      T3o = W[15];
+			      Rm[0] = T3r + T3u;
+			      {
+				   E T3A, T3I, T3L, T3F, T3C, T3G, T3B, T3x, T3n, T3w, T3H, T3K;
+				   T3A = T3y - T3z;
+				   T3I = T3y + T3z;
+				   T3n = T3l * T3m;
+				   T3w = T3o * T3m;
+				   T3L = T3E + T3D;
+				   T3F = T3D - T3E;
+				   T3x = W[22];
+				   Rp[WS(rs, 4)] = FNMS(T3o, T3v, T3n);
+				   Rm[WS(rs, 4)] = FMA(T3l, T3v, T3w);
+				   T3C = W[23];
+				   T3G = T3x * T3F;
+				   T3B = T3x * T3A;
+				   Rm[WS(rs, 6)] = FMA(T3C, T3A, T3G);
+				   Rp[WS(rs, 6)] = FNMS(T3C, T3F, T3B);
+				   T3H = W[6];
+				   T3K = W[7];
+				   {
+					E T3g, T38, T3d, T35, T3a;
+					{
+					     E T37, T3c, T3M, T3J;
+					     T37 = T2V - T2U;
+					     T2W = T2U + T2V;
+					     T2L = T2F + T2K;
+					     T3c = T2F - T2K;
+					     T3M = T3H * T3L;
+					     T3J = T3H * T3I;
+					     T3g = FMA(KP707106781, T37, T36);
+					     T38 = FNMS(KP707106781, T37, T36);
+					     Rm[WS(rs, 2)] = FMA(T3K, T3I, T3M);
+					     Rp[WS(rs, 2)] = FNMS(T3K, T3L, T3J);
+					     T3d = FNMS(KP707106781, T3c, T3b);
+					     T3j = FMA(KP707106781, T3c, T3b);
+					}
+					T35 = W[26];
+					T3a = W[27];
+					{
+					     E T3f, T3e, T39, T3k;
+					     T3f = W[10];
+					     T3i = W[11];
+					     T3e = T35 * T3d;
+					     T39 = T35 * T38;
+					     T3k = T3f * T3j;
+					     T3h = T3f * T3g;
+					     Rm[WS(rs, 7)] = FMA(T3a, T38, T3e);
+					     Rp[WS(rs, 7)] = FNMS(T3a, T3d, T39);
+					     Rm[WS(rs, 3)] = FMA(T3i, T3g, T3k);
+					}
+				   }
+			      }
+			 }
+			 Rp[WS(rs, 3)] = FNMS(T3i, T3j, T3h);
+			 {
+			      E T2g, T2m, T2l, T2h, T2d, T29, T2c, T2b, T2e;
+			      {
+				   E T33, T2Z, T32, T31, T34;
+				   {
+					E T2v, T30, T2M, T2X, T2O, T2N, T2Y;
+					T2v = W[18];
+					T30 = FMA(KP707106781, T2L, T2A);
+					T2M = FNMS(KP707106781, T2L, T2A);
+					T33 = FMA(KP707106781, T2W, T2T);
+					T2X = FNMS(KP707106781, T2W, T2T);
+					T2O = W[19];
+					T2N = T2v * T2M;
+					T2Z = W[2];
+					T32 = W[3];
+					T2Y = T2O * T2M;
+					Rp[WS(rs, 5)] = FNMS(T2O, T2X, T2N);
+					T31 = T2Z * T30;
+					T34 = T32 * T30;
+					Rm[WS(rs, 5)] = FMA(T2v, T2X, T2Y);
+				   }
+				   {
+					E T1Q, T1X, T23, T26;
+					T2g = FMA(KP707106781, T1P, T1O);
+					T1Q = FNMS(KP707106781, T1P, T1O);
+					Rp[WS(rs, 1)] = FNMS(T32, T33, T31);
+					Rm[WS(rs, 1)] = FMA(T2Z, T33, T34);
+					T1X = T1T + T1W;
+					T2m = T1W - T1T;
+					T2l = FNMS(KP707106781, T22, T21);
+					T23 = FMA(KP707106781, T22, T21);
+					T26 = T24 - T25;
+					T2h = T24 + T25;
+					{
+					     E T1N, T2a, T1Y, T27, T20, T1Z, T28;
+					     T1N = W[20];
+					     T2a = FNMS(KP923879532, T1X, T1Q);
+					     T1Y = FMA(KP923879532, T1X, T1Q);
+					     T2d = FMA(KP923879532, T26, T23);
+					     T27 = FNMS(KP923879532, T26, T23);
+					     T20 = W[21];
+					     T1Z = T1N * T1Y;
+					     T29 = W[4];
+					     T2c = W[5];
+					     T28 = T20 * T1Y;
+					     Ip[WS(rs, 5)] = FNMS(T20, T27, T1Z);
+					     T2b = T29 * T2a;
+					     T2e = T2c * T2a;
+					     Im[WS(rs, 5)] = FMA(T1N, T27, T28);
+					}
+				   }
+			      }
+			      {
+				   E T1y, T1E, T1D, T1z, T1v, T1r, T1u, T1t, T1w;
+				   {
+					E TM, T19, T1l, T1o;
+					T1y = FMA(KP707106781, TL, TA);
+					TM = FNMS(KP707106781, TL, TA);
+					Ip[WS(rs, 1)] = FNMS(T2c, T2d, T2b);
+					Im[WS(rs, 1)] = FMA(T29, T2d, T2e);
+					T19 = TX - T18;
+					T1E = T18 + TX;
+					T1D = FMA(KP707106781, T1k, T1h);
+					T1l = FNMS(KP707106781, T1k, T1h);
+					T1o = T1m - T1n;
+					T1z = T1m + T1n;
+					{
+					     E Tv, T1s, T1a, T1p, T1c, T1b, T1q;
+					     Tv = W[24];
+					     T1s = FMA(KP923879532, T19, TM);
+					     T1a = FNMS(KP923879532, T19, TM);
+					     T1v = FMA(KP923879532, T1o, T1l);
+					     T1p = FNMS(KP923879532, T1o, T1l);
+					     T1c = W[25];
+					     T1b = Tv * T1a;
+					     T1r = W[8];
+					     T1u = W[9];
+					     T1q = T1c * T1a;
+					     Ip[WS(rs, 6)] = FNMS(T1c, T1p, T1b);
+					     T1t = T1r * T1s;
+					     T1w = T1u * T1s;
+					     Im[WS(rs, 6)] = FMA(Tv, T1p, T1q);
+					}
+				   }
+				   {
+					E T2q, T2t, T2s, T2u, T2r;
+					Ip[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
+					Im[WS(rs, 2)] = FMA(T1r, T1v, T1w);
+					{
+					     E T2f, T2i, T2n, T2k, T2j, T2p, T2o;
+					     T2f = W[12];
+					     T2q = FMA(KP923879532, T2h, T2g);
+					     T2i = FNMS(KP923879532, T2h, T2g);
+					     T2t = FNMS(KP923879532, T2m, T2l);
+					     T2n = FMA(KP923879532, T2m, T2l);
+					     T2k = W[13];
+					     T2j = T2f * T2i;
+					     T2p = W[28];
+					     T2o = T2f * T2n;
+					     T2s = W[29];
+					     Ip[WS(rs, 3)] = FNMS(T2k, T2n, T2j);
+					     T2u = T2p * T2t;
+					     T2r = T2p * T2q;
+					     Im[WS(rs, 3)] = FMA(T2k, T2i, T2o);
+					}
+					Im[WS(rs, 7)] = FMA(T2s, T2q, T2u);
+					Ip[WS(rs, 7)] = FNMS(T2s, T2t, T2r);
+					{
+					     E T1x, T1A, T1F, T1C, T1B, T1H, T1G;
+					     T1x = W[16];
+					     T1I = FMA(KP923879532, T1z, T1y);
+					     T1A = FNMS(KP923879532, T1z, T1y);
+					     T1L = FMA(KP923879532, T1E, T1D);
+					     T1F = FNMS(KP923879532, T1E, T1D);
+					     T1C = W[17];
+					     T1B = T1x * T1A;
+					     T1H = W[0];
+					     T1G = T1x * T1F;
+					     T1K = W[1];
+					     Ip[WS(rs, 4)] = FNMS(T1C, T1F, T1B);
+					     T1M = T1H * T1L;
+					     T1J = T1H * T1I;
+					     Im[WS(rs, 4)] = FMA(T1C, T1A, T1G);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[0] = FMA(T1K, T1I, T1M);
+	       Ip[0] = FNMS(T1K, T1L, T1J);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cb_16", twinstr, &GENUS, {104, 30, 70, 0} };
+
+void X(codelet_hc2cb_16) (planner *p) {
+     X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include hc2cb.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 50 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
+	       E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
+	       E T2S, T30;
+	       {
+		    E T3, T1Q, T13, T2j, T6, T2i, T16, T1R;
+		    {
+			 E T1, T2, T11, T12;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 7)];
+			 T3 = T1 + T2;
+			 T1Q = T1 - T2;
+			 T11 = Ip[0];
+			 T12 = Im[WS(rs, 7)];
+			 T13 = T11 - T12;
+			 T2j = T11 + T12;
+		    }
+		    {
+			 E T4, T5, T14, T15;
+			 T4 = Rp[WS(rs, 4)];
+			 T5 = Rm[WS(rs, 3)];
+			 T6 = T4 + T5;
+			 T2i = T4 - T5;
+			 T14 = Ip[WS(rs, 4)];
+			 T15 = Im[WS(rs, 3)];
+			 T16 = T14 - T15;
+			 T1R = T14 + T15;
+		    }
+		    T7 = T3 + T6;
+		    T2K = T1Q + T1R;
+		    T2W = T2j - T2i;
+		    Tw = T3 - T6;
+		    T17 = T13 - T16;
+		    T1S = T1Q - T1R;
+		    T2k = T2i + T2j;
+		    T1w = T13 + T16;
+	       }
+	       {
+		    E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
+		    {
+			 E T8, T9, TA, TB;
+			 T8 = Rp[WS(rs, 2)];
+			 T9 = Rm[WS(rs, 5)];
+			 Ta = T8 + T9;
+			 T1T = T8 - T9;
+			 TA = Ip[WS(rs, 2)];
+			 TB = Im[WS(rs, 5)];
+			 TC = TA - TB;
+			 T1U = TA + TB;
+		    }
+		    {
+			 E Tb, Tc, Tx, Ty;
+			 Tb = Rm[WS(rs, 1)];
+			 Tc = Rp[WS(rs, 6)];
+			 Td = Tb + Tc;
+			 T1W = Tb - Tc;
+			 Tx = Ip[WS(rs, 6)];
+			 Ty = Im[WS(rs, 1)];
+			 Tz = Tx - Ty;
+			 T1X = Tx + Ty;
+		    }
+		    Te = Ta + Td;
+		    TD = Tz - TC;
+		    T1x = TC + Tz;
+		    T10 = Ta - Td;
+		    {
+			 E T2l, T2m, T1V, T1Y;
+			 T2l = T1T + T1U;
+			 T2m = T1W + T1X;
+			 T2n = KP707106781 * (T2l - T2m);
+			 T2L = KP707106781 * (T2l + T2m);
+			 T1V = T1T - T1U;
+			 T1Y = T1W - T1X;
+			 T1Z = KP707106781 * (T1V + T1Y);
+			 T2X = KP707106781 * (T1V - T1Y);
+		    }
+	       }
+	       {
+		    E Ti, T2b, TI, T29, Tl, T28, TL, T2c, TF, TM;
+		    {
+			 E Tg, Th, TG, TH;
+			 Tg = Rp[WS(rs, 1)];
+			 Th = Rm[WS(rs, 6)];
+			 Ti = Tg + Th;
+			 T2b = Tg - Th;
+			 TG = Ip[WS(rs, 1)];
+			 TH = Im[WS(rs, 6)];
+			 TI = TG - TH;
+			 T29 = TG + TH;
+		    }
+		    {
+			 E Tj, Tk, TJ, TK;
+			 Tj = Rp[WS(rs, 5)];
+			 Tk = Rm[WS(rs, 2)];
+			 Tl = Tj + Tk;
+			 T28 = Tj - Tk;
+			 TJ = Ip[WS(rs, 5)];
+			 TK = Im[WS(rs, 2)];
+			 TL = TJ - TK;
+			 T2c = TJ + TK;
+		    }
+		    Tm = Ti + Tl;
+		    T1z = TI + TL;
+		    TF = Ti - Tl;
+		    TM = TI - TL;
+		    TN = TF - TM;
+		    T19 = TF + TM;
+		    {
+			 E T2a, T2d, T2N, T2O;
+			 T2a = T28 + T29;
+			 T2d = T2b - T2c;
+			 T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
+			 T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
+			 T2N = T2b + T2c;
+			 T2O = T29 - T28;
+			 T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
+			 T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
+		    }
+	       }
+	       {
+		    E Tp, T24, TR, T22, Ts, T21, TU, T25, TO, TV;
+		    {
+			 E Tn, To, TP, TQ;
+			 Tn = Rm[0];
+			 To = Rp[WS(rs, 7)];
+			 Tp = Tn + To;
+			 T24 = Tn - To;
+			 TP = Ip[WS(rs, 7)];
+			 TQ = Im[0];
+			 TR = TP - TQ;
+			 T22 = TP + TQ;
+		    }
+		    {
+			 E Tq, Tr, TS, TT;
+			 Tq = Rp[WS(rs, 3)];
+			 Tr = Rm[WS(rs, 4)];
+			 Ts = Tq + Tr;
+			 T21 = Tq - Tr;
+			 TS = Ip[WS(rs, 3)];
+			 TT = Im[WS(rs, 4)];
+			 TU = TS - TT;
+			 T25 = TS + TT;
+		    }
+		    Tt = Tp + Ts;
+		    T1A = TR + TU;
+		    TO = Tp - Ts;
+		    TV = TR - TU;
+		    TW = TO + TV;
+		    T1a = TV - TO;
+		    {
+			 E T23, T26, T2Q, T2R;
+			 T23 = T21 - T22;
+			 T26 = T24 - T25;
+			 T27 = FNMS(KP382683432, T26, KP923879532 * T23);
+			 T2q = FMA(KP382683432, T23, KP923879532 * T26);
+			 T2Q = T24 + T25;
+			 T2R = T21 + T22;
+			 T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
+			 T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
+		    }
+	       }
+	       {
+		    E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
+		    Tf = T7 + Te;
+		    Tu = Tm + Tt;
+		    T1u = Tf - Tu;
+		    T1y = T1w + T1x;
+		    T1B = T1z + T1A;
+		    T1C = T1y - T1B;
+		    Rp[0] = Tf + Tu;
+		    Rm[0] = T1y + T1B;
+		    T1t = W[14];
+		    T1v = W[15];
+		    Rp[WS(rs, 4)] = FNMS(T1v, T1C, T1t * T1u);
+		    Rm[WS(rs, 4)] = FMA(T1v, T1u, T1t * T1C);
+	       }
+	       {
+		    E T2U, T34, T32, T36;
+		    {
+			 E T2M, T2T, T2Y, T31;
+			 T2M = T2K - T2L;
+			 T2T = T2P + T2S;
+			 T2U = T2M - T2T;
+			 T34 = T2M + T2T;
+			 T2Y = T2W + T2X;
+			 T31 = T2Z - T30;
+			 T32 = T2Y - T31;
+			 T36 = T2Y + T31;
+		    }
+		    {
+			 E T2J, T2V, T33, T35;
+			 T2J = W[20];
+			 T2V = W[21];
+			 Ip[WS(rs, 5)] = FNMS(T2V, T32, T2J * T2U);
+			 Im[WS(rs, 5)] = FMA(T2V, T2U, T2J * T32);
+			 T33 = W[4];
+			 T35 = W[5];
+			 Ip[WS(rs, 1)] = FNMS(T35, T36, T33 * T34);
+			 Im[WS(rs, 1)] = FMA(T35, T34, T33 * T36);
+		    }
+	       }
+	       {
+		    E T3a, T3g, T3e, T3i;
+		    {
+			 E T38, T39, T3c, T3d;
+			 T38 = T2K + T2L;
+			 T39 = T2Z + T30;
+			 T3a = T38 - T39;
+			 T3g = T38 + T39;
+			 T3c = T2W - T2X;
+			 T3d = T2P - T2S;
+			 T3e = T3c + T3d;
+			 T3i = T3c - T3d;
+		    }
+		    {
+			 E T37, T3b, T3f, T3h;
+			 T37 = W[12];
+			 T3b = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T3b, T3e, T37 * T3a);
+			 Im[WS(rs, 3)] = FMA(T37, T3e, T3b * T3a);
+			 T3f = W[28];
+			 T3h = W[29];
+			 Ip[WS(rs, 7)] = FNMS(T3h, T3i, T3f * T3g);
+			 Im[WS(rs, 7)] = FMA(T3f, T3i, T3h * T3g);
+		    }
+	       }
+	       {
+		    E TY, T1e, T1c, T1g;
+		    {
+			 E TE, TX, T18, T1b;
+			 TE = Tw + TD;
+			 TX = KP707106781 * (TN + TW);
+			 TY = TE - TX;
+			 T1e = TE + TX;
+			 T18 = T10 + T17;
+			 T1b = KP707106781 * (T19 + T1a);
+			 T1c = T18 - T1b;
+			 T1g = T18 + T1b;
+		    }
+		    {
+			 E Tv, TZ, T1d, T1f;
+			 Tv = W[18];
+			 TZ = W[19];
+			 Rp[WS(rs, 5)] = FNMS(TZ, T1c, Tv * TY);
+			 Rm[WS(rs, 5)] = FMA(TZ, TY, Tv * T1c);
+			 T1d = W[2];
+			 T1f = W[3];
+			 Rp[WS(rs, 1)] = FNMS(T1f, T1g, T1d * T1e);
+			 Rm[WS(rs, 1)] = FMA(T1f, T1e, T1d * T1g);
+		    }
+	       }
+	       {
+		    E T1k, T1q, T1o, T1s;
+		    {
+			 E T1i, T1j, T1m, T1n;
+			 T1i = Tw - TD;
+			 T1j = KP707106781 * (T1a - T19);
+			 T1k = T1i - T1j;
+			 T1q = T1i + T1j;
+			 T1m = T17 - T10;
+			 T1n = KP707106781 * (TN - TW);
+			 T1o = T1m - T1n;
+			 T1s = T1m + T1n;
+		    }
+		    {
+			 E T1h, T1l, T1p, T1r;
+			 T1h = W[26];
+			 T1l = W[27];
+			 Rp[WS(rs, 7)] = FNMS(T1l, T1o, T1h * T1k);
+			 Rm[WS(rs, 7)] = FMA(T1h, T1o, T1l * T1k);
+			 T1p = W[10];
+			 T1r = W[11];
+			 Rp[WS(rs, 3)] = FNMS(T1r, T1s, T1p * T1q);
+			 Rm[WS(rs, 3)] = FMA(T1p, T1s, T1r * T1q);
+		    }
+	       }
+	       {
+		    E T2g, T2u, T2s, T2w;
+		    {
+			 E T20, T2f, T2o, T2r;
+			 T20 = T1S - T1Z;
+			 T2f = T27 - T2e;
+			 T2g = T20 - T2f;
+			 T2u = T20 + T2f;
+			 T2o = T2k - T2n;
+			 T2r = T2p - T2q;
+			 T2s = T2o - T2r;
+			 T2w = T2o + T2r;
+		    }
+		    {
+			 E T1P, T2h, T2t, T2v;
+			 T1P = W[24];
+			 T2h = W[25];
+			 Ip[WS(rs, 6)] = FNMS(T2h, T2s, T1P * T2g);
+			 Im[WS(rs, 6)] = FMA(T2h, T2g, T1P * T2s);
+			 T2t = W[8];
+			 T2v = W[9];
+			 Ip[WS(rs, 2)] = FNMS(T2v, T2w, T2t * T2u);
+			 Im[WS(rs, 2)] = FMA(T2v, T2u, T2t * T2w);
+		    }
+	       }
+	       {
+		    E T2A, T2G, T2E, T2I;
+		    {
+			 E T2y, T2z, T2C, T2D;
+			 T2y = T1S + T1Z;
+			 T2z = T2p + T2q;
+			 T2A = T2y - T2z;
+			 T2G = T2y + T2z;
+			 T2C = T2k + T2n;
+			 T2D = T2e + T27;
+			 T2E = T2C - T2D;
+			 T2I = T2C + T2D;
+		    }
+		    {
+			 E T2x, T2B, T2F, T2H;
+			 T2x = W[16];
+			 T2B = W[17];
+			 Ip[WS(rs, 4)] = FNMS(T2B, T2E, T2x * T2A);
+			 Im[WS(rs, 4)] = FMA(T2x, T2E, T2B * T2A);
+			 T2F = W[0];
+			 T2H = W[1];
+			 Ip[0] = FNMS(T2H, T2I, T2F * T2G);
+			 Im[0] = FMA(T2F, T2I, T2H * T2G);
+		    }
+	       }
+	       {
+		    E T1G, T1M, T1K, T1O;
+		    {
+			 E T1E, T1F, T1I, T1J;
+			 T1E = T7 - Te;
+			 T1F = T1A - T1z;
+			 T1G = T1E - T1F;
+			 T1M = T1E + T1F;
+			 T1I = T1w - T1x;
+			 T1J = Tm - Tt;
+			 T1K = T1I - T1J;
+			 T1O = T1J + T1I;
+		    }
+		    {
+			 E T1D, T1H, T1L, T1N;
+			 T1D = W[22];
+			 T1H = W[23];
+			 Rp[WS(rs, 6)] = FNMS(T1H, T1K, T1D * T1G);
+			 Rm[WS(rs, 6)] = FMA(T1D, T1K, T1H * T1G);
+			 T1L = W[6];
+			 T1N = W[7];
+			 Rp[WS(rs, 2)] = FNMS(T1N, T1O, T1L * T1M);
+			 Rm[WS(rs, 2)] = FMA(T1L, T1O, T1N * T1M);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cb_16", twinstr, &GENUS, {136, 46, 38, 0} };
+
+void X(codelet_hc2cb_16) (planner *p) {
+     X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include hc2cb.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T5, T6, T9, T8, T7, Ta;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = Rp[0];
+		    T2 = Rm[0];
+		    T3 = Ip[0];
+		    T4 = Im[0];
+		    T5 = W[0];
+		    Rp[0] = T1 + T2;
+		    T6 = T1 - T2;
+		    Rm[0] = T3 - T4;
+		    T9 = T3 + T4;
+		    T8 = W[1];
+		    T7 = T5 * T6;
+	       }
+	       Ta = T8 * T6;
+	       Ip[0] = FNMS(T8, T9, T7);
+	       Im[0] = FMA(T5, T9, Ta);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hc2cb_2) (planner *p) {
+     X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include hc2cb.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T1, T2, T6, T3, T4, T8, T5, T7;
+	       T1 = Rp[0];
+	       T2 = Rm[0];
+	       T6 = T1 - T2;
+	       T3 = Ip[0];
+	       T4 = Im[0];
+	       T8 = T3 + T4;
+	       Rp[0] = T1 + T2;
+	       Rm[0] = T3 - T4;
+	       T5 = W[0];
+	       T7 = W[1];
+	       Ip[0] = FNMS(T7, T8, T5 * T6);
+	       Im[0] = FMA(T7, T6, T5 * T8);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hc2cb_2) (planner *p) {
+     X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:55 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cb_20 -include hc2cb.h */
+
+/*
+ * This function contains 246 FP additions, 148 FP multiplications,
+ * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
+ * 112 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T1T, T1Q, T1P;
+	       {
+		    E T3z, T4z, TE, T7, T2W, T4e, T2l, T1t, T33, T3H, T3G, T3a, T1i, T2g, T13;
+		    E T4H, T4G, T2d, T1B, T4u, T4B, T4A, T4r, T1A, T2s, T3l, T2t, T3s, T2o, T2q;
+		    E T1w, T1y, TC, T29, T3E, T3C, T4n, T4l, TN, TL;
+		    {
+			 E T4, T2U, T3, T3x, T1p, T5, T1q, T1r;
+			 {
+			      E T1, T2, T1n, T1o;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 9)];
+			      T1n = Ip[0];
+			      T1o = Im[WS(rs, 9)];
+			      T4 = Rp[WS(rs, 5)];
+			      T2U = T1 - T2;
+			      T3 = T1 + T2;
+			      T3x = T1n + T1o;
+			      T1p = T1n - T1o;
+			      T5 = Rm[WS(rs, 4)];
+			      T1q = Ip[WS(rs, 5)];
+			      T1r = Im[WS(rs, 4)];
+			 }
+			 {
+			      E T3o, T4p, TF, Te, T2Z, T4f, T2b, T1a, T3k, T4t, TJ, TA, T39, T4j, T2f;
+			      E T12, T3r, T4q, TG, Tl, T32, T4g, T2c, T1h, Tq, T34, Tp, T3f, TR, Tr;
+			      E TS, TT;
+			      {
+				   E Tx, T37, Tw, T3j, TY, Ty, TZ, T10;
+				   {
+					E Tb, T2X, Ta, T3m, T16, Tc, T17, T18;
+					{
+					     E T8, T9, T14, T15;
+					     T8 = Rp[WS(rs, 4)];
+					     {
+						  E T3y, T6, T2V, T1s;
+						  T3y = T4 - T5;
+						  T6 = T4 + T5;
+						  T2V = T1q + T1r;
+						  T1s = T1q - T1r;
+						  T3z = T3x - T3y;
+						  T4z = T3y + T3x;
+						  TE = T3 - T6;
+						  T7 = T3 + T6;
+						  T2W = T2U + T2V;
+						  T4e = T2U - T2V;
+						  T2l = T1p + T1s;
+						  T1t = T1p - T1s;
+						  T9 = Rm[WS(rs, 5)];
+					     }
+					     T14 = Ip[WS(rs, 4)];
+					     T15 = Im[WS(rs, 5)];
+					     Tb = Rp[WS(rs, 9)];
+					     T2X = T8 - T9;
+					     Ta = T8 + T9;
+					     T3m = T14 + T15;
+					     T16 = T14 - T15;
+					     Tc = Rm[0];
+					     T17 = Ip[WS(rs, 9)];
+					     T18 = Im[0];
+					}
+					{
+					     E Tu, Tv, TW, TX;
+					     Tu = Rm[WS(rs, 7)];
+					     {
+						  E T3n, Td, T2Y, T19;
+						  T3n = Tb - Tc;
+						  Td = Tb + Tc;
+						  T2Y = T17 + T18;
+						  T19 = T17 - T18;
+						  T3o = T3m - T3n;
+						  T4p = T3n + T3m;
+						  TF = Ta - Td;
+						  Te = Ta + Td;
+						  T2Z = T2X + T2Y;
+						  T4f = T2X - T2Y;
+						  T2b = T16 + T19;
+						  T1a = T16 - T19;
+						  Tv = Rp[WS(rs, 2)];
+					     }
+					     TW = Ip[WS(rs, 2)];
+					     TX = Im[WS(rs, 7)];
+					     Tx = Rm[WS(rs, 2)];
+					     T37 = Tu - Tv;
+					     Tw = Tu + Tv;
+					     T3j = TW + TX;
+					     TY = TW - TX;
+					     Ty = Rp[WS(rs, 7)];
+					     TZ = Ip[WS(rs, 7)];
+					     T10 = Im[WS(rs, 2)];
+					}
+				   }
+				   {
+					E Ti, T30, Th, T3q, T1d, Tj, T1e, T1f;
+					{
+					     E Tf, Tg, T1b, T1c;
+					     Tf = Rm[WS(rs, 3)];
+					     {
+						  E T3i, Tz, T38, T11;
+						  T3i = Tx - Ty;
+						  Tz = Tx + Ty;
+						  T38 = TZ + T10;
+						  T11 = TZ - T10;
+						  T3k = T3i + T3j;
+						  T4t = T3i - T3j;
+						  TJ = Tw - Tz;
+						  TA = Tw + Tz;
+						  T39 = T37 - T38;
+						  T4j = T37 + T38;
+						  T2f = TY + T11;
+						  T12 = TY - T11;
+						  Tg = Rp[WS(rs, 6)];
+					     }
+					     T1b = Ip[WS(rs, 6)];
+					     T1c = Im[WS(rs, 3)];
+					     Ti = Rp[WS(rs, 1)];
+					     T30 = Tf - Tg;
+					     Th = Tf + Tg;
+					     T3q = T1b + T1c;
+					     T1d = T1b - T1c;
+					     Tj = Rm[WS(rs, 8)];
+					     T1e = Ip[WS(rs, 1)];
+					     T1f = Im[WS(rs, 8)];
+					}
+					{
+					     E Tn, To, TP, TQ;
+					     Tn = Rp[WS(rs, 8)];
+					     {
+						  E T3p, Tk, T31, T1g;
+						  T3p = Ti - Tj;
+						  Tk = Ti + Tj;
+						  T31 = T1e + T1f;
+						  T1g = T1e - T1f;
+						  T3r = T3p + T3q;
+						  T4q = T3p - T3q;
+						  TG = Th - Tk;
+						  Tl = Th + Tk;
+						  T32 = T30 + T31;
+						  T4g = T30 - T31;
+						  T2c = T1d + T1g;
+						  T1h = T1d - T1g;
+						  To = Rm[WS(rs, 1)];
+					     }
+					     TP = Ip[WS(rs, 8)];
+					     TQ = Im[WS(rs, 1)];
+					     Tq = Rm[WS(rs, 6)];
+					     T34 = Tn - To;
+					     Tp = Tn + To;
+					     T3f = TP + TQ;
+					     TR = TP - TQ;
+					     Tr = Rp[WS(rs, 3)];
+					     TS = Ip[WS(rs, 3)];
+					     TT = Im[WS(rs, 6)];
+					}
+				   }
+			      }
+			      {
+				   E T3h, Tt, T1u, T2n, T1v, T4k, T4h, T2m, TH, TK, T4s, TI;
+				   T33 = T2Z + T32;
+				   T3H = T2Z - T32;
+				   {
+					E T3g, Ts, T35, TU;
+					T3g = Tq - Tr;
+					Ts = Tq + Tr;
+					T35 = TS + TT;
+					TU = TS - TT;
+					T3h = T3f - T3g;
+					T4s = T3g + T3f;
+					TI = Tp - Ts;
+					Tt = Tp + Ts;
+					{
+					     E T36, T4i, T2e, TV;
+					     T36 = T34 - T35;
+					     T4i = T34 + T35;
+					     T2e = TR + TU;
+					     TV = TR - TU;
+					     T3G = T36 - T39;
+					     T3a = T36 + T39;
+					     T1u = T1a + T1h;
+					     T1i = T1a - T1h;
+					     T2g = T2e - T2f;
+					     T2n = T2e + T2f;
+					     T1v = TV + T12;
+					     T13 = TV - T12;
+					     T4H = T4i - T4j;
+					     T4k = T4i + T4j;
+					}
+				   }
+				   T4h = T4f + T4g;
+				   T4G = T4f - T4g;
+				   T2d = T2b - T2c;
+				   T2m = T2b + T2c;
+				   TH = TF + TG;
+				   T1B = TF - TG;
+				   T4u = T4s - T4t;
+				   T4B = T4s + T4t;
+				   T4A = T4p + T4q;
+				   T4r = T4p - T4q;
+				   T1A = TI - TJ;
+				   TK = TI + TJ;
+				   {
+					E Tm, T3B, TB, T3A;
+					Tm = Te + Tl;
+					T2s = Te - Tl;
+					T3l = T3h + T3k;
+					T3B = T3h - T3k;
+					TB = Tt + TA;
+					T2t = Tt - TA;
+					T3s = T3o + T3r;
+					T3A = T3o - T3r;
+					T2o = T2m + T2n;
+					T2q = T2m - T2n;
+					T1w = T1u + T1v;
+					T1y = T1u - T1v;
+					TC = Tm + TB;
+					T29 = Tm - TB;
+					T3E = T3A - T3B;
+					T3C = T3A + T3B;
+					T4n = T4h - T4k;
+					T4l = T4h + T4k;
+					TN = TH - TK;
+					TL = TH + TK;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3d, T3b, T4E, T1x, TM, T4m, T58, T5b, T4D, T5a, T5c, T59, T4C;
+			 Rp[0] = T7 + TC;
+			 T3d = T33 - T3a;
+			 T3b = T33 + T3a;
+			 T4E = T4A - T4B;
+			 T4C = T4A + T4B;
+			 Rm[0] = T2l + T2o;
+			 {
+			      E T25, T22, T21, T24, T23, T26, T57;
+			      T1x = FNMS(KP250000000, T1w, T1t);
+			      T25 = T1t + T1w;
+			      T22 = TE + TL;
+			      TM = FNMS(KP250000000, TL, TE);
+			      T21 = W[18];
+			      T24 = W[19];
+			      T4m = FNMS(KP250000000, T4l, T4e);
+			      T58 = T4e + T4l;
+			      T5b = T4z + T4C;
+			      T4D = FNMS(KP250000000, T4C, T4z);
+			      T23 = T21 * T22;
+			      T26 = T24 * T22;
+			      T57 = W[8];
+			      T5a = W[9];
+			      Rp[WS(rs, 5)] = FNMS(T24, T25, T23);
+			      Rm[WS(rs, 5)] = FMA(T21, T25, T26);
+			      T5c = T57 * T5b;
+			      T59 = T57 * T58;
+			 }
+			 {
+			      E T3U, T3Z, T3W, T40, T3V;
+			      {
+				   E T3c, T48, T4b, T3D, T47, T4a;
+				   T3c = FNMS(KP250000000, T3b, T2W);
+				   T48 = T2W + T3b;
+				   T4b = T3z + T3C;
+				   T3D = FNMS(KP250000000, T3C, T3z);
+				   Im[WS(rs, 2)] = FMA(T5a, T58, T5c);
+				   Ip[WS(rs, 2)] = FNMS(T5a, T5b, T59);
+				   T47 = W[28];
+				   T4a = W[29];
+				   {
+					E T3I, T3Y, T42, T3u, T3M, T3X, T3F;
+					{
+					     E T3T, T3t, T4c, T49, T3e, T3S;
+					     T3T = FMA(KP618033988, T3l, T3s);
+					     T3t = FNMS(KP618033988, T3s, T3l);
+					     T4c = T47 * T4b;
+					     T49 = T47 * T48;
+					     T3I = FNMS(KP618033988, T3H, T3G);
+					     T3Y = FMA(KP618033988, T3G, T3H);
+					     Im[WS(rs, 7)] = FMA(T4a, T48, T4c);
+					     Ip[WS(rs, 7)] = FNMS(T4a, T4b, T49);
+					     T3e = FNMS(KP559016994, T3d, T3c);
+					     T3S = FMA(KP559016994, T3d, T3c);
+					     T42 = FMA(KP951056516, T3T, T3S);
+					     T3U = FNMS(KP951056516, T3T, T3S);
+					     T3u = FNMS(KP951056516, T3t, T3e);
+					     T3M = FMA(KP951056516, T3t, T3e);
+					     T3X = FMA(KP559016994, T3E, T3D);
+					     T3F = FNMS(KP559016994, T3E, T3D);
+					}
+					{
+					     E T3P, T45, T44, T46, T43;
+					     {
+						  E T3w, T3J, T3v, T3K, T2T, T41;
+						  T2T = W[4];
+						  T3w = W[5];
+						  T3J = FMA(KP951056516, T3I, T3F);
+						  T3P = FNMS(KP951056516, T3I, T3F);
+						  T45 = FNMS(KP951056516, T3Y, T3X);
+						  T3Z = FMA(KP951056516, T3Y, T3X);
+						  T3v = T2T * T3u;
+						  T3K = T2T * T3J;
+						  T41 = W[36];
+						  T44 = W[37];
+						  Ip[WS(rs, 1)] = FNMS(T3w, T3J, T3v);
+						  Im[WS(rs, 1)] = FMA(T3w, T3u, T3K);
+						  T46 = T41 * T45;
+						  T43 = T41 * T42;
+					     }
+					     {
+						  E T3O, T3Q, T3N, T3L, T3R;
+						  T3L = W[12];
+						  T3O = W[13];
+						  Im[WS(rs, 9)] = FMA(T44, T42, T46);
+						  Ip[WS(rs, 9)] = FNMS(T44, T45, T43);
+						  T3Q = T3L * T3P;
+						  T3N = T3L * T3M;
+						  T3R = W[20];
+						  T3W = W[21];
+						  Im[WS(rs, 3)] = FMA(T3O, T3M, T3Q);
+						  Ip[WS(rs, 3)] = FNMS(T3O, T3P, T3N);
+						  T40 = T3R * T3Z;
+						  T3V = T3R * T3U;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T4U, T4Z, T4W, T50, T4V, T2L, T2I, T2H;
+				   {
+					E T4T, T4v, T4I, T4Y, T4o, T4S;
+					T4T = FNMS(KP618033988, T4r, T4u);
+					T4v = FMA(KP618033988, T4u, T4r);
+					Im[WS(rs, 5)] = FMA(T3W, T3U, T40);
+					Ip[WS(rs, 5)] = FNMS(T3W, T3Z, T3V);
+					T4I = FMA(KP618033988, T4H, T4G);
+					T4Y = FNMS(KP618033988, T4G, T4H);
+					T4o = FMA(KP559016994, T4n, T4m);
+					T4S = FNMS(KP559016994, T4n, T4m);
+					{
+					     E T52, T4M, T55, T4P, T54, T56, T53;
+					     {
+						  E T4d, T4w, T4J, T4x, T4y, T4X, T4F, T51, T4K;
+						  T4d = W[0];
+						  T4X = FNMS(KP559016994, T4E, T4D);
+						  T4F = FMA(KP559016994, T4E, T4D);
+						  T4U = FNMS(KP951056516, T4T, T4S);
+						  T52 = FMA(KP951056516, T4T, T4S);
+						  T4M = FMA(KP951056516, T4v, T4o);
+						  T4w = FNMS(KP951056516, T4v, T4o);
+						  T4Z = FMA(KP951056516, T4Y, T4X);
+						  T55 = FNMS(KP951056516, T4Y, T4X);
+						  T4P = FNMS(KP951056516, T4I, T4F);
+						  T4J = FMA(KP951056516, T4I, T4F);
+						  T4x = T4d * T4w;
+						  T4y = W[1];
+						  T51 = W[32];
+						  T4K = T4d * T4J;
+						  T54 = W[33];
+						  Ip[0] = FNMS(T4y, T4J, T4x);
+						  T56 = T51 * T55;
+						  T53 = T51 * T52;
+						  Im[0] = FMA(T4y, T4w, T4K);
+					     }
+					     {
+						  E T4O, T4Q, T4N, T4L, T4R;
+						  T4L = W[16];
+						  Im[WS(rs, 8)] = FMA(T54, T52, T56);
+						  Ip[WS(rs, 8)] = FNMS(T54, T55, T53);
+						  T4O = W[17];
+						  T4Q = T4L * T4P;
+						  T4N = T4L * T4M;
+						  T4R = W[24];
+						  T4W = W[25];
+						  Im[WS(rs, 4)] = FMA(T4O, T4M, T4Q);
+						  Ip[WS(rs, 4)] = FNMS(T4O, T4P, T4N);
+						  T50 = T4R * T4Z;
+						  T4V = T4R * T4U;
+					     }
+					}
+				   }
+				   {
+					E T2K, T2u, T2F, T2h, T28, T2J, T2r, T2p;
+					T2K = FNMS(KP618033988, T2s, T2t);
+					T2u = FMA(KP618033988, T2t, T2s);
+					Im[WS(rs, 6)] = FMA(T4W, T4U, T50);
+					Ip[WS(rs, 6)] = FNMS(T4W, T4Z, T4V);
+					T2p = FNMS(KP250000000, T2o, T2l);
+					T2F = FNMS(KP618033988, T2d, T2g);
+					T2h = FMA(KP618033988, T2g, T2d);
+					T28 = FNMS(KP250000000, TC, T7);
+					T2J = FNMS(KP559016994, T2q, T2p);
+					T2r = FMA(KP559016994, T2q, T2p);
+					{
+					     E T2B, T2G, T2y, T2R, T2Q, T2P, T2A, T2x;
+					     {
+						  E T2k, T2v, T27, T2O, T2i, T2a, T2E;
+						  T2k = W[7];
+						  T2a = FMA(KP559016994, T29, T28);
+						  T2E = FNMS(KP559016994, T29, T28);
+						  T2B = FMA(KP951056516, T2u, T2r);
+						  T2v = FNMS(KP951056516, T2u, T2r);
+						  T27 = W[6];
+						  T2O = FMA(KP951056516, T2F, T2E);
+						  T2G = FNMS(KP951056516, T2F, T2E);
+						  T2i = FMA(KP951056516, T2h, T2a);
+						  T2y = FNMS(KP951056516, T2h, T2a);
+						  {
+						       E T2N, T2j, T2w, T2S;
+						       T2L = FMA(KP951056516, T2K, T2J);
+						       T2R = FNMS(KP951056516, T2K, T2J);
+						       T2Q = W[23];
+						       T2N = W[22];
+						       T2j = T27 * T2i;
+						       T2w = T2k * T2i;
+						       T2S = T2Q * T2O;
+						       T2P = T2N * T2O;
+						       Rp[WS(rs, 2)] = FNMS(T2k, T2v, T2j);
+						       Rm[WS(rs, 2)] = FMA(T27, T2v, T2w);
+						       Rm[WS(rs, 6)] = FMA(T2N, T2R, T2S);
+						  }
+					     }
+					     Rp[WS(rs, 6)] = FNMS(T2Q, T2R, T2P);
+					     T2A = W[31];
+					     T2x = W[30];
+					     {
+						  E T2D, T2M, T2C, T2z;
+						  T2I = W[15];
+						  T2C = T2A * T2y;
+						  T2z = T2x * T2y;
+						  T2D = W[14];
+						  T2M = T2I * T2G;
+						  Rm[WS(rs, 8)] = FMA(T2x, T2B, T2C);
+						  Rp[WS(rs, 8)] = FNMS(T2A, T2B, T2z);
+						  T2H = T2D * T2G;
+						  Rm[WS(rs, 4)] = FMA(T2D, T2L, T2M);
+					     }
+					}
+				   }
+				   {
+					E T1S, T1C, T1j, T1N, T1z, T1R;
+					T1S = FMA(KP618033988, T1A, T1B);
+					T1C = FNMS(KP618033988, T1B, T1A);
+					Rp[WS(rs, 4)] = FNMS(T2I, T2L, T2H);
+					T1j = FNMS(KP618033988, T1i, T13);
+					T1N = FMA(KP618033988, T13, T1i);
+					T1z = FNMS(KP559016994, T1y, T1x);
+					T1R = FMA(KP559016994, T1y, T1x);
+					{
+					     E T1J, T1O, T1G, T1Z, T1Y, T1X, T1I, T1F;
+					     {
+						  E T1m, T1D, TD, T1W, T1k, T1M, TO;
+						  T1m = W[3];
+						  T1M = FMA(KP559016994, TN, TM);
+						  TO = FNMS(KP559016994, TN, TM);
+						  T1D = FNMS(KP951056516, T1C, T1z);
+						  T1J = FMA(KP951056516, T1C, T1z);
+						  TD = W[2];
+						  T1O = FNMS(KP951056516, T1N, T1M);
+						  T1W = FMA(KP951056516, T1N, T1M);
+						  T1G = FNMS(KP951056516, T1j, TO);
+						  T1k = FMA(KP951056516, T1j, TO);
+						  {
+						       E T1V, T1l, T1E, T20;
+						       T1Z = FNMS(KP951056516, T1S, T1R);
+						       T1T = FMA(KP951056516, T1S, T1R);
+						       T1Y = W[27];
+						       T1V = W[26];
+						       T1l = TD * T1k;
+						       T1E = T1m * T1k;
+						       T20 = T1Y * T1W;
+						       T1X = T1V * T1W;
+						       Rp[WS(rs, 1)] = FNMS(T1m, T1D, T1l);
+						       Rm[WS(rs, 1)] = FMA(TD, T1D, T1E);
+						       Rm[WS(rs, 7)] = FMA(T1V, T1Z, T20);
+						  }
+					     }
+					     Rp[WS(rs, 7)] = FNMS(T1Y, T1Z, T1X);
+					     T1I = W[35];
+					     T1F = W[34];
+					     {
+						  E T1L, T1U, T1K, T1H;
+						  T1Q = W[11];
+						  T1K = T1I * T1G;
+						  T1H = T1F * T1G;
+						  T1L = W[10];
+						  T1U = T1Q * T1O;
+						  Rm[WS(rs, 9)] = FMA(T1F, T1J, T1K);
+						  Rp[WS(rs, 9)] = FNMS(T1I, T1J, T1H);
+						  T1P = T1L * T1O;
+						  Rm[WS(rs, 3)] = FMA(T1L, T1T, T1U);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Rp[WS(rs, 3)] = FNMS(T1Q, T1T, T1P);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cb_20", twinstr, &GENUS, {136, 38, 110, 0} };
+
+void X(codelet_hc2cb_20) (planner *p) {
+     X(khc2c_register) (p, hc2cb_20, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cb_20 -include hc2cb.h */
+
+/*
+ * This function contains 246 FP additions, 124 FP multiplications,
+ * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
+ * 97 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d;
+	       E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC;
+	       E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j;
+	       E TH, TK, TL;
+	       {
+		    E T3, T2R, T1r, T3e, T6, T3f, T1u, T2S;
+		    {
+			 E T1, T2, T1p, T1q;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 9)];
+			 T3 = T1 + T2;
+			 T2R = T1 - T2;
+			 T1p = Ip[0];
+			 T1q = Im[WS(rs, 9)];
+			 T1r = T1p - T1q;
+			 T3e = T1p + T1q;
+		    }
+		    {
+			 E T4, T5, T1s, T1t;
+			 T4 = Rp[WS(rs, 5)];
+			 T5 = Rm[WS(rs, 4)];
+			 T6 = T4 + T5;
+			 T3f = T4 - T5;
+			 T1s = Ip[WS(rs, 5)];
+			 T1t = Im[WS(rs, 4)];
+			 T1u = T1s - T1t;
+			 T2S = T1s + T1t;
+		    }
+		    T7 = T3 + T6;
+		    T3T = T2R - T2S;
+		    T49 = T3f + T3e;
+		    TE = T3 - T6;
+		    T1v = T1r - T1u;
+		    T2T = T2R + T2S;
+		    T3g = T3e - T3f;
+		    T2d = T1r + T1u;
+	       }
+	       {
+		    E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36;
+		    E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L;
+		    E T33, T24;
+		    {
+			 E Ta, T2U, TR, T2C, Td, T2D, TU, T2V;
+			 {
+			      E T8, T9, TP, TQ;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T2U = T8 - T9;
+			      TP = Ip[WS(rs, 4)];
+			      TQ = Im[WS(rs, 5)];
+			      TR = TP - TQ;
+			      T2C = TP + TQ;
+			 }
+			 {
+			      E Tb, Tc, TS, TT;
+			      Tb = Rp[WS(rs, 9)];
+			      Tc = Rm[0];
+			      Td = Tb + Tc;
+			      T2D = Tb - Tc;
+			      TS = Ip[WS(rs, 9)];
+			      TT = Im[0];
+			      TU = TS - TT;
+			      T2V = TS + TT;
+			 }
+			 Te = Ta + Td;
+			 T3M = T2U - T2V;
+			 T3X = T2D + T2C;
+			 TF = Ta - Td;
+			 TV = TR - TU;
+			 T2E = T2C - T2D;
+			 T2W = T2U + T2V;
+			 T21 = TR + TU;
+		    }
+		    {
+			 E Tw, T34, T1d, T2N, Tz, T2M, T1g, T35;
+			 {
+			      E Tu, Tv, T1b, T1c;
+			      Tu = Rm[WS(rs, 7)];
+			      Tv = Rp[WS(rs, 2)];
+			      Tw = Tu + Tv;
+			      T34 = Tu - Tv;
+			      T1b = Ip[WS(rs, 2)];
+			      T1c = Im[WS(rs, 7)];
+			      T1d = T1b - T1c;
+			      T2N = T1b + T1c;
+			 }
+			 {
+			      E Tx, Ty, T1e, T1f;
+			      Tx = Rm[WS(rs, 2)];
+			      Ty = Rp[WS(rs, 7)];
+			      Tz = Tx + Ty;
+			      T2M = Tx - Ty;
+			      T1e = Ip[WS(rs, 7)];
+			      T1f = Im[WS(rs, 2)];
+			      T1g = T1e - T1f;
+			      T35 = T1e + T1f;
+			 }
+			 TA = Tw + Tz;
+			 T3Q = T34 + T35;
+			 T41 = T2M - T2N;
+			 TJ = Tw - Tz;
+			 T1h = T1d - T1g;
+			 T2O = T2M + T2N;
+			 T36 = T34 - T35;
+			 T25 = T1d + T1g;
+		    }
+		    {
+			 E Th, T2X, TY, T2G, Tk, T2F, T11, T2Y;
+			 {
+			      E Tf, Tg, TW, TX;
+			      Tf = Rm[WS(rs, 3)];
+			      Tg = Rp[WS(rs, 6)];
+			      Th = Tf + Tg;
+			      T2X = Tf - Tg;
+			      TW = Ip[WS(rs, 6)];
+			      TX = Im[WS(rs, 3)];
+			      TY = TW - TX;
+			      T2G = TW + TX;
+			 }
+			 {
+			      E Ti, Tj, TZ, T10;
+			      Ti = Rp[WS(rs, 1)];
+			      Tj = Rm[WS(rs, 8)];
+			      Tk = Ti + Tj;
+			      T2F = Ti - Tj;
+			      TZ = Ip[WS(rs, 1)];
+			      T10 = Im[WS(rs, 8)];
+			      T11 = TZ - T10;
+			      T2Y = TZ + T10;
+			 }
+			 Tl = Th + Tk;
+			 T3N = T2X - T2Y;
+			 T3Y = T2F - T2G;
+			 TG = Th - Tk;
+			 T12 = TY - T11;
+			 T2H = T2F + T2G;
+			 T2Z = T2X + T2Y;
+			 T22 = TY + T11;
+		    }
+		    {
+			 E Tp, T31, T16, T2J, Ts, T2K, T19, T32;
+			 {
+			      E Tn, To, T14, T15;
+			      Tn = Rp[WS(rs, 8)];
+			      To = Rm[WS(rs, 1)];
+			      Tp = Tn + To;
+			      T31 = Tn - To;
+			      T14 = Ip[WS(rs, 8)];
+			      T15 = Im[WS(rs, 1)];
+			      T16 = T14 - T15;
+			      T2J = T14 + T15;
+			 }
+			 {
+			      E Tq, Tr, T17, T18;
+			      Tq = Rm[WS(rs, 6)];
+			      Tr = Rp[WS(rs, 3)];
+			      Ts = Tq + Tr;
+			      T2K = Tq - Tr;
+			      T17 = Ip[WS(rs, 3)];
+			      T18 = Im[WS(rs, 6)];
+			      T19 = T17 - T18;
+			      T32 = T17 + T18;
+			 }
+			 Tt = Tp + Ts;
+			 T3P = T31 + T32;
+			 T40 = T2K + T2J;
+			 TI = Tp - Ts;
+			 T1a = T16 - T19;
+			 T2L = T2J - T2K;
+			 T33 = T31 - T32;
+			 T24 = T16 + T19;
+		    }
+		    T13 = TV - T12;
+		    T3n = T2W - T2Z;
+		    T3o = T33 - T36;
+		    T1i = T1a - T1h;
+		    T26 = T24 - T25;
+		    T4e = T3P - T3Q;
+		    T4d = T3M - T3N;
+		    T23 = T21 - T22;
+		    T1n = TI - TJ;
+		    T42 = T40 - T41;
+		    T3Z = T3X - T3Y;
+		    T1m = TF - TG;
+		    T2h = Te - Tl;
+		    T2I = T2E + T2H;
+		    T2i = Tt - TA;
+		    T2P = T2L + T2O;
+		    T30 = T2W + T2Z;
+		    T37 = T33 + T36;
+		    T38 = T30 + T37;
+		    Tm = Te + Tl;
+		    TB = Tt + TA;
+		    TC = Tm + TB;
+		    T46 = T3X + T3Y;
+		    T47 = T40 + T41;
+		    T4a = T46 + T47;
+		    T2a = T21 + T22;
+		    T2b = T24 + T25;
+		    T2e = T2a + T2b;
+		    T1w = TV + T12;
+		    T1x = T1a + T1h;
+		    T1y = T1w + T1x;
+		    T3O = T3M + T3N;
+		    T3R = T3P + T3Q;
+		    T3U = T3O + T3R;
+		    T3h = T2E - T2H;
+		    T3i = T2L - T2O;
+		    T3j = T3h + T3i;
+		    TH = TF + TG;
+		    TK = TI + TJ;
+		    TL = TH + TK;
+	       }
+	       Rp[0] = T7 + TC;
+	       Rm[0] = T2d + T2e;
+	       {
+		    E T1U, T1W, T1T, T1V;
+		    T1U = TE + TL;
+		    T1W = T1v + T1y;
+		    T1T = W[18];
+		    T1V = W[19];
+		    Rp[WS(rs, 5)] = FNMS(T1V, T1W, T1T * T1U);
+		    Rm[WS(rs, 5)] = FMA(T1V, T1U, T1T * T1W);
+	       }
+	       {
+		    E T4y, T4A, T4x, T4z;
+		    T4y = T3T + T3U;
+		    T4A = T49 + T4a;
+		    T4x = W[8];
+		    T4z = W[9];
+		    Ip[WS(rs, 2)] = FNMS(T4z, T4A, T4x * T4y);
+		    Im[WS(rs, 2)] = FMA(T4x, T4A, T4z * T4y);
+	       }
+	       {
+		    E T3I, T3K, T3H, T3J;
+		    T3I = T2T + T38;
+		    T3K = T3g + T3j;
+		    T3H = W[28];
+		    T3J = W[29];
+		    Ip[WS(rs, 7)] = FNMS(T3J, T3K, T3H * T3I);
+		    Im[WS(rs, 7)] = FMA(T3H, T3K, T3J * T3I);
+	       }
+	       {
+		    E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q;
+		    T27 = FMA(KP951056516, T23, KP587785252 * T26);
+		    T2j = FMA(KP951056516, T2h, KP587785252 * T2i);
+		    T2v = FNMS(KP951056516, T2i, KP587785252 * T2h);
+		    T2r = FNMS(KP951056516, T26, KP587785252 * T23);
+		    {
+			 E T2c, T2f, T1Y, T1Z;
+			 T2c = KP559016994 * (T2a - T2b);
+			 T2f = FNMS(KP250000000, T2e, T2d);
+			 T2g = T2c + T2f;
+			 T2u = T2f - T2c;
+			 T1Y = KP559016994 * (Tm - TB);
+			 T1Z = FNMS(KP250000000, TC, T7);
+			 T20 = T1Y + T1Z;
+			 T2q = T1Z - T1Y;
+		    }
+		    {
+			 E T28, T2k, T1X, T29;
+			 T28 = T20 + T27;
+			 T2k = T2g - T2j;
+			 T1X = W[6];
+			 T29 = W[7];
+			 Rp[WS(rs, 2)] = FNMS(T29, T2k, T1X * T28);
+			 Rm[WS(rs, 2)] = FMA(T29, T28, T1X * T2k);
+		    }
+		    {
+			 E T2y, T2A, T2x, T2z;
+			 T2y = T2q - T2r;
+			 T2A = T2v + T2u;
+			 T2x = W[22];
+			 T2z = W[23];
+			 Rp[WS(rs, 6)] = FNMS(T2z, T2A, T2x * T2y);
+			 Rm[WS(rs, 6)] = FMA(T2z, T2y, T2x * T2A);
+		    }
+		    {
+			 E T2m, T2o, T2l, T2n;
+			 T2m = T20 - T27;
+			 T2o = T2j + T2g;
+			 T2l = W[30];
+			 T2n = W[31];
+			 Rp[WS(rs, 8)] = FNMS(T2n, T2o, T2l * T2m);
+			 Rm[WS(rs, 8)] = FMA(T2n, T2m, T2l * T2o);
+		    }
+		    {
+			 E T2s, T2w, T2p, T2t;
+			 T2s = T2q + T2r;
+			 T2w = T2u - T2v;
+			 T2p = W[14];
+			 T2t = W[15];
+			 Rp[WS(rs, 4)] = FNMS(T2t, T2w, T2p * T2s);
+			 Rm[WS(rs, 4)] = FMA(T2t, T2s, T2p * T2w);
+		    }
+	       }
+	       {
+		    E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n;
+		    T43 = FMA(KP951056516, T3Z, KP587785252 * T42);
+		    T4f = FMA(KP951056516, T4d, KP587785252 * T4e);
+		    T4r = FNMS(KP951056516, T4e, KP587785252 * T4d);
+		    T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
+		    {
+			 E T48, T4b, T3S, T3V;
+			 T48 = KP559016994 * (T46 - T47);
+			 T4b = FNMS(KP250000000, T4a, T49);
+			 T4c = T48 + T4b;
+			 T4q = T4b - T48;
+			 T3S = KP559016994 * (T3O - T3R);
+			 T3V = FNMS(KP250000000, T3U, T3T);
+			 T3W = T3S + T3V;
+			 T4n = T3V - T3S;
+		    }
+		    {
+			 E T44, T4g, T3L, T45;
+			 T44 = T3W - T43;
+			 T4g = T4c + T4f;
+			 T3L = W[0];
+			 T45 = W[1];
+			 Ip[0] = FNMS(T45, T4g, T3L * T44);
+			 Im[0] = FMA(T3L, T4g, T45 * T44);
+		    }
+		    {
+			 E T4u, T4w, T4t, T4v;
+			 T4u = T4n - T4m;
+			 T4w = T4q + T4r;
+			 T4t = W[32];
+			 T4v = W[33];
+			 Ip[WS(rs, 8)] = FNMS(T4v, T4w, T4t * T4u);
+			 Im[WS(rs, 8)] = FMA(T4t, T4w, T4v * T4u);
+		    }
+		    {
+			 E T4i, T4k, T4h, T4j;
+			 T4i = T43 + T3W;
+			 T4k = T4c - T4f;
+			 T4h = W[16];
+			 T4j = W[17];
+			 Ip[WS(rs, 4)] = FNMS(T4j, T4k, T4h * T4i);
+			 Im[WS(rs, 4)] = FMA(T4h, T4k, T4j * T4i);
+		    }
+		    {
+			 E T4o, T4s, T4l, T4p;
+			 T4o = T4m + T4n;
+			 T4s = T4q - T4r;
+			 T4l = W[24];
+			 T4p = W[25];
+			 Ip[WS(rs, 6)] = FNMS(T4p, T4s, T4l * T4o);
+			 Im[WS(rs, 6)] = FMA(T4l, T4s, T4p * T4o);
+		    }
+	       }
+	       {
+		    E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I;
+		    T1j = FNMS(KP951056516, T1i, KP587785252 * T13);
+		    T1o = FNMS(KP951056516, T1n, KP587785252 * T1m);
+		    T1M = FMA(KP951056516, T1m, KP587785252 * T1n);
+		    T1J = FMA(KP951056516, T13, KP587785252 * T1i);
+		    {
+			 E T1z, T1A, TM, TN;
+			 T1z = FNMS(KP250000000, T1y, T1v);
+			 T1A = KP559016994 * (T1w - T1x);
+			 T1B = T1z - T1A;
+			 T1N = T1A + T1z;
+			 TM = FNMS(KP250000000, TL, TE);
+			 TN = KP559016994 * (TH - TK);
+			 TO = TM - TN;
+			 T1I = TN + TM;
+		    }
+		    {
+			 E T1k, T1C, TD, T1l;
+			 T1k = TO - T1j;
+			 T1C = T1o + T1B;
+			 TD = W[2];
+			 T1l = W[3];
+			 Rp[WS(rs, 1)] = FNMS(T1l, T1C, TD * T1k);
+			 Rm[WS(rs, 1)] = FMA(T1l, T1k, TD * T1C);
+		    }
+		    {
+			 E T1Q, T1S, T1P, T1R;
+			 T1Q = T1I + T1J;
+			 T1S = T1N - T1M;
+			 T1P = W[26];
+			 T1R = W[27];
+			 Rp[WS(rs, 7)] = FNMS(T1R, T1S, T1P * T1Q);
+			 Rm[WS(rs, 7)] = FMA(T1R, T1Q, T1P * T1S);
+		    }
+		    {
+			 E T1E, T1G, T1D, T1F;
+			 T1E = TO + T1j;
+			 T1G = T1B - T1o;
+			 T1D = W[34];
+			 T1F = W[35];
+			 Rp[WS(rs, 9)] = FNMS(T1F, T1G, T1D * T1E);
+			 Rm[WS(rs, 9)] = FMA(T1F, T1E, T1D * T1G);
+		    }
+		    {
+			 E T1K, T1O, T1H, T1L;
+			 T1K = T1I - T1J;
+			 T1O = T1M + T1N;
+			 T1H = W[10];
+			 T1L = W[11];
+			 Rp[WS(rs, 3)] = FNMS(T1L, T1O, T1H * T1K);
+			 Rm[WS(rs, 3)] = FMA(T1L, T1K, T1H * T1O);
+		    }
+	       }
+	       {
+		    E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w;
+		    T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I);
+		    T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
+		    T3B = FMA(KP951056516, T3n, KP587785252 * T3o);
+		    T3x = FMA(KP951056516, T2I, KP587785252 * T2P);
+		    {
+			 E T3k, T3l, T39, T3a;
+			 T3k = FNMS(KP250000000, T3j, T3g);
+			 T3l = KP559016994 * (T3h - T3i);
+			 T3m = T3k - T3l;
+			 T3A = T3l + T3k;
+			 T39 = FNMS(KP250000000, T38, T2T);
+			 T3a = KP559016994 * (T30 - T37);
+			 T3b = T39 - T3a;
+			 T3w = T3a + T39;
+		    }
+		    {
+			 E T3c, T3q, T2B, T3d;
+			 T3c = T2Q + T3b;
+			 T3q = T3m - T3p;
+			 T2B = W[4];
+			 T3d = W[5];
+			 Ip[WS(rs, 1)] = FNMS(T3d, T3q, T2B * T3c);
+			 Im[WS(rs, 1)] = FMA(T2B, T3q, T3d * T3c);
+		    }
+		    {
+			 E T3E, T3G, T3D, T3F;
+			 T3E = T3x + T3w;
+			 T3G = T3A - T3B;
+			 T3D = W[36];
+			 T3F = W[37];
+			 Ip[WS(rs, 9)] = FNMS(T3F, T3G, T3D * T3E);
+			 Im[WS(rs, 9)] = FMA(T3D, T3G, T3F * T3E);
+		    }
+		    {
+			 E T3s, T3u, T3r, T3t;
+			 T3s = T3b - T2Q;
+			 T3u = T3m + T3p;
+			 T3r = W[12];
+			 T3t = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T3t, T3u, T3r * T3s);
+			 Im[WS(rs, 3)] = FMA(T3r, T3u, T3t * T3s);
+		    }
+		    {
+			 E T3y, T3C, T3v, T3z;
+			 T3y = T3w - T3x;
+			 T3C = T3A + T3B;
+			 T3v = W[20];
+			 T3z = W[21];
+			 Ip[WS(rs, 5)] = FNMS(T3z, T3C, T3v * T3y);
+			 Im[WS(rs, 5)] = FMA(T3v, T3C, T3z * T3y);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cb_20", twinstr, &GENUS, {184, 62, 62, 0} };
+
+void X(codelet_hc2cb_20) (planner *p) {
+     X(khc2c_register) (p, hc2cb_20, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1770 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:55 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cb_32 -include hc2cb.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 137 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T5o, T5r, T5q, T5n, T5s, T5p;
+	       {
+		    E T5K, Tf, T8k, T7k, T8x, T7N, T3i, T1i, T3v, T2L, T5f, T4v, T6T, T6m, T52;
+		    E T42, TZ, T6X, T3p, T1X, T8B, T8p, T3o, T26, T58, T4n, T7T, T7z, T59, T4k;
+		    E T6p, T6a, TK, T6W, T8s, T8A, T2o, T3m, T3l, T2x, T55, T4g, T7S, T7G, T56;
+		    E T4d, T6o, T61, T5Q, T5N, T6f, Tu, T8y, T7r, T8l, T7Q, T3w, T1F, T45, T48;
+		    E T3j, T2O, T53, T4y, T62, T69;
+		    {
+			 E T6l, T6i, T40, T41;
+			 {
+			      E T12, T3, T6g, T2G, T2D, T6, T6h, T15, Td, T6k, T1g, T2J, Ta, T17, T1a;
+			      E T6j;
+			      {
+				   E T4, T5, T13, T14;
+				   {
+					E T1, T2, T2E, T2F;
+					T1 = Rp[0];
+					T2 = Rm[WS(rs, 15)];
+					T2E = Ip[0];
+					T2F = Im[WS(rs, 15)];
+					T4 = Rp[WS(rs, 8)];
+					T12 = T1 - T2;
+					T3 = T1 + T2;
+					T6g = T2E - T2F;
+					T2G = T2E + T2F;
+					T5 = Rm[WS(rs, 7)];
+				   }
+				   T13 = Ip[WS(rs, 8)];
+				   T14 = Im[WS(rs, 7)];
+				   {
+					E Tb, Tc, T1d, T1e;
+					Tb = Rm[WS(rs, 3)];
+					T2D = T4 - T5;
+					T6 = T4 + T5;
+					T6h = T13 - T14;
+					T15 = T13 + T14;
+					Tc = Rp[WS(rs, 12)];
+					T1d = Ip[WS(rs, 12)];
+					T1e = Im[WS(rs, 3)];
+					{
+					     E T8, T1c, T1f, T9, T18, T19;
+					     T8 = Rp[WS(rs, 4)];
+					     Td = Tb + Tc;
+					     T1c = Tb - Tc;
+					     T6k = T1d - T1e;
+					     T1f = T1d + T1e;
+					     T9 = Rm[WS(rs, 11)];
+					     T18 = Ip[WS(rs, 4)];
+					     T19 = Im[WS(rs, 11)];
+					     T1g = T1c - T1f;
+					     T2J = T1c + T1f;
+					     Ta = T8 + T9;
+					     T17 = T8 - T9;
+					     T1a = T18 + T19;
+					     T6j = T18 - T19;
+					}
+				   }
+			      }
+			      {
+				   E T2I, T7M, T7L, T16, T1h, T4u, T4t, T2H, T2K;
+				   {
+					E T7i, T7, T1b, Te, T7j;
+					T7i = T3 - T6;
+					T7 = T3 + T6;
+					T2I = T17 + T1a;
+					T1b = T17 - T1a;
+					Te = Ta + Td;
+					T7M = Ta - Td;
+					T7j = T6k - T6j;
+					T6l = T6j + T6k;
+					T6i = T6g + T6h;
+					T7L = T6g - T6h;
+					T5K = T7 - Te;
+					Tf = T7 + Te;
+					T8k = T7i + T7j;
+					T7k = T7i - T7j;
+					T40 = T12 + T15;
+					T16 = T12 - T15;
+					T1h = T1b + T1g;
+					T4u = T1b - T1g;
+				   }
+				   T4t = T2G - T2D;
+				   T2H = T2D + T2G;
+				   T8x = T7M + T7L;
+				   T7N = T7L - T7M;
+				   T3i = FMA(KP707106781, T1h, T16);
+				   T1i = FNMS(KP707106781, T1h, T16);
+				   T2K = T2I - T2J;
+				   T41 = T2I + T2J;
+				   T3v = FMA(KP707106781, T2K, T2H);
+				   T2L = FNMS(KP707106781, T2K, T2H);
+				   T5f = FNMS(KP707106781, T4u, T4t);
+				   T4v = FMA(KP707106781, T4u, T4t);
+			      }
+			 }
+			 {
+			      E T1Y, T1H, TR, T7w, T1K, T21, T65, T7t, TU, T66, T23, T1Q, T1R, TX, T67;
+			      E T1U, TY, T7u;
+			      {
+				   E TL, TM, TO, TP, T63, T64;
+				   TL = Rm[0];
+				   T6T = T6i + T6l;
+				   T6m = T6i - T6l;
+				   T52 = FMA(KP707106781, T41, T40);
+				   T42 = FNMS(KP707106781, T41, T40);
+				   TM = Rp[WS(rs, 15)];
+				   TO = Rp[WS(rs, 7)];
+				   TP = Rm[WS(rs, 8)];
+				   {
+					E T1I, TN, TQ, T1J, T1Z, T20;
+					T1I = Ip[WS(rs, 15)];
+					T1Y = TL - TM;
+					TN = TL + TM;
+					T1H = TO - TP;
+					TQ = TO + TP;
+					T1J = Im[0];
+					T1Z = Ip[WS(rs, 7)];
+					T20 = Im[WS(rs, 8)];
+					TR = TN + TQ;
+					T7w = TN - TQ;
+					T1K = T1I + T1J;
+					T63 = T1I - T1J;
+					T64 = T1Z - T20;
+					T21 = T1Z + T20;
+				   }
+				   {
+					E TV, T1M, T1P, TW, T1S, T1T;
+					{
+					     E TS, TT, T1N, T1O;
+					     TS = Rp[WS(rs, 3)];
+					     T65 = T63 + T64;
+					     T7t = T63 - T64;
+					     TT = Rm[WS(rs, 12)];
+					     T1N = Ip[WS(rs, 3)];
+					     T1O = Im[WS(rs, 12)];
+					     TV = Rm[WS(rs, 4)];
+					     T1M = TS - TT;
+					     TU = TS + TT;
+					     T66 = T1N - T1O;
+					     T1P = T1N + T1O;
+					     TW = Rp[WS(rs, 11)];
+					     T1S = Ip[WS(rs, 11)];
+					     T1T = Im[WS(rs, 4)];
+					}
+					T23 = T1M - T1P;
+					T1Q = T1M + T1P;
+					T1R = TV - TW;
+					TX = TV + TW;
+					T67 = T1S - T1T;
+					T1U = T1S + T1T;
+				   }
+			      }
+			      TY = TU + TX;
+			      T7u = TU - TX;
+			      {
+				   E T7x, T68, T1V, T24;
+				   T7x = T67 - T66;
+				   T68 = T66 + T67;
+				   T1V = T1R + T1U;
+				   T24 = T1R - T1U;
+				   {
+					E T4l, T1L, T1W, T4j, T7v, T8n, T8o, T7y;
+					T62 = TR - TY;
+					TZ = TR + TY;
+					T6X = T65 + T68;
+					T69 = T65 - T68;
+					T4l = T1H + T1K;
+					T1L = T1H - T1K;
+					T1W = T1Q - T1V;
+					T4j = T1Q + T1V;
+					T7v = T7t - T7u;
+					T8n = T7u + T7t;
+					T8o = T7w + T7x;
+					T7y = T7w - T7x;
+					{
+					     E T4i, T22, T25, T4m;
+					     T4i = T1Y + T21;
+					     T22 = T1Y - T21;
+					     T3p = FMA(KP707106781, T1W, T1L);
+					     T1X = FNMS(KP707106781, T1W, T1L);
+					     T8B = FMA(KP414213562, T8n, T8o);
+					     T8p = FNMS(KP414213562, T8o, T8n);
+					     T25 = T23 + T24;
+					     T4m = T23 - T24;
+					     T3o = FMA(KP707106781, T25, T22);
+					     T26 = FNMS(KP707106781, T25, T22);
+					     T58 = FMA(KP707106781, T4m, T4l);
+					     T4n = FNMS(KP707106781, T4m, T4l);
+					     T7T = FNMS(KP414213562, T7v, T7y);
+					     T7z = FMA(KP414213562, T7y, T7v);
+					     T59 = FMA(KP707106781, T4j, T4i);
+					     T4k = FNMS(KP707106781, T4j, T4i);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T5T, T60, T4c, T4b;
+			 {
+			      E T2p, T28, T2b, T7D, TC, T2s, T7A, T5W, TF, T2j, T5X, T2i, TI, T2k, T2u;
+			      E T2h;
+			      {
+				   E Tz, Ty, TA, Tw, Tx;
+				   Tw = Rp[WS(rs, 1)];
+				   Tx = Rm[WS(rs, 14)];
+				   Tz = Rp[WS(rs, 9)];
+				   T6p = T69 - T62;
+				   T6a = T62 + T69;
+				   Ty = Tw + Tx;
+				   T2p = Tw - Tx;
+				   TA = Rm[WS(rs, 6)];
+				   {
+					E T5U, T5V, T2d, T2g;
+					{
+					     E T2q, T2r, T29, T2a, TB;
+					     T29 = Ip[WS(rs, 1)];
+					     T2a = Im[WS(rs, 14)];
+					     TB = Tz + TA;
+					     T28 = Tz - TA;
+					     T2q = Ip[WS(rs, 9)];
+					     T5U = T29 - T2a;
+					     T2b = T29 + T2a;
+					     T2r = Im[WS(rs, 6)];
+					     T7D = Ty - TB;
+					     TC = Ty + TB;
+					     T2s = T2q + T2r;
+					     T5V = T2q - T2r;
+					}
+					{
+					     E T2e, T2f, TD, TE, TG, TH;
+					     TD = Rp[WS(rs, 5)];
+					     TE = Rm[WS(rs, 10)];
+					     T7A = T5U - T5V;
+					     T5W = T5U + T5V;
+					     T2e = Ip[WS(rs, 5)];
+					     T2d = TD - TE;
+					     TF = TD + TE;
+					     T2f = Im[WS(rs, 10)];
+					     TG = Rm[WS(rs, 2)];
+					     TH = Rp[WS(rs, 13)];
+					     T2j = Ip[WS(rs, 13)];
+					     T5X = T2e - T2f;
+					     T2g = T2e + T2f;
+					     T2i = TG - TH;
+					     TI = TG + TH;
+					     T2k = Im[WS(rs, 2)];
+					}
+					T2u = T2d - T2g;
+					T2h = T2d + T2g;
+				   }
+			      }
+			      {
+				   E TJ, T7B, T2l, T5Y;
+				   TJ = TF + TI;
+				   T7B = TF - TI;
+				   T2l = T2j + T2k;
+				   T5Y = T2j - T2k;
+				   {
+					E T4e, T2c, T2v, T8q, T7C, T7F, T8r, T2n, T7E, T2m, T5Z, T4f, T2t, T2w;
+					T4e = T2b - T28;
+					T2c = T28 + T2b;
+					TK = TC + TJ;
+					T5T = TC - TJ;
+					T7E = T5Y - T5X;
+					T5Z = T5X + T5Y;
+					T2m = T2i + T2l;
+					T2v = T2i - T2l;
+					T60 = T5W - T5Z;
+					T6W = T5W + T5Z;
+					T8q = T7B + T7A;
+					T7C = T7A - T7B;
+					T7F = T7D - T7E;
+					T8r = T7D + T7E;
+					T2n = T2h - T2m;
+					T4c = T2h + T2m;
+					T4b = T2p + T2s;
+					T2t = T2p - T2s;
+					T2w = T2u + T2v;
+					T4f = T2v - T2u;
+					T8s = FMA(KP414213562, T8r, T8q);
+					T8A = FNMS(KP414213562, T8q, T8r);
+					T2o = FNMS(KP707106781, T2n, T2c);
+					T3m = FMA(KP707106781, T2n, T2c);
+					T3l = FMA(KP707106781, T2w, T2t);
+					T2x = FNMS(KP707106781, T2w, T2t);
+					T55 = FMA(KP707106781, T4f, T4e);
+					T4g = FNMS(KP707106781, T4f, T4e);
+					T7S = FMA(KP414213562, T7C, T7F);
+					T7G = FNMS(KP414213562, T7F, T7C);
+				   }
+			      }
+			 }
+			 {
+			      E T43, T1y, T7o, Tm, T7p, T44, T1D, Tq, T1o, Tp, T5L, T1m, Tr, T1p, T1q;
+			      {
+				   E Tj, T1z, Ti, T5O, T1x, Tk, T1A, T1B;
+				   {
+					E Tg, Th, T1v, T1w;
+					Tg = Rp[WS(rs, 2)];
+					T56 = FMA(KP707106781, T4c, T4b);
+					T4d = FNMS(KP707106781, T4c, T4b);
+					T6o = T5T + T60;
+					T61 = T5T - T60;
+					Th = Rm[WS(rs, 13)];
+					T1v = Ip[WS(rs, 2)];
+					T1w = Im[WS(rs, 13)];
+					Tj = Rp[WS(rs, 10)];
+					T1z = Tg - Th;
+					Ti = Tg + Th;
+					T5O = T1v - T1w;
+					T1x = T1v + T1w;
+					Tk = Rm[WS(rs, 5)];
+					T1A = Ip[WS(rs, 10)];
+					T1B = Im[WS(rs, 5)];
+				   }
+				   {
+					E Tn, To, T1k, T1l;
+					Tn = Rm[WS(rs, 1)];
+					{
+					     E T1u, Tl, T5P, T1C;
+					     T1u = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T5P = T1A - T1B;
+					     T1C = T1A + T1B;
+					     T43 = T1x - T1u;
+					     T1y = T1u + T1x;
+					     T7o = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T5Q = T5O + T5P;
+					     T7p = T5O - T5P;
+					     T44 = T1z + T1C;
+					     T1D = T1z - T1C;
+					     To = Rp[WS(rs, 14)];
+					}
+					T1k = Ip[WS(rs, 14)];
+					T1l = Im[WS(rs, 1)];
+					Tq = Rp[WS(rs, 6)];
+					T1o = Tn - To;
+					Tp = Tn + To;
+					T5L = T1k - T1l;
+					T1m = T1k + T1l;
+					Tr = Rm[WS(rs, 9)];
+					T1p = Ip[WS(rs, 6)];
+					T1q = Im[WS(rs, 9)];
+				   }
+			      }
+			      {
+				   E T46, T47, T7P, T7O, T2N, T1t, T1E, T2M, T4w, T4x;
+				   {
+					E T1n, Tt, T1s, T7n, T7q, T7m, T7l;
+					{
+					     E T1j, Ts, T5M, T1r;
+					     T1j = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T5M = T1p - T1q;
+					     T1r = T1p + T1q;
+					     T46 = T1j + T1m;
+					     T1n = T1j - T1m;
+					     T7m = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T5N = T5L + T5M;
+					     T7l = T5L - T5M;
+					     T47 = T1o + T1r;
+					     T1s = T1o - T1r;
+					}
+					T7P = T7m + T7l;
+					T7n = T7l - T7m;
+					T7q = T7o + T7p;
+					T7O = T7o - T7p;
+					T6f = Tm - Tt;
+					Tu = Tm + Tt;
+					T8y = T7q + T7n;
+					T7r = T7n - T7q;
+					T2N = FMA(KP414213562, T1n, T1s);
+					T1t = FNMS(KP414213562, T1s, T1n);
+					T1E = FMA(KP414213562, T1D, T1y);
+					T2M = FNMS(KP414213562, T1y, T1D);
+				   }
+				   T8l = T7O + T7P;
+				   T7Q = T7O - T7P;
+				   T3w = T1E + T1t;
+				   T1F = T1t - T1E;
+				   T45 = FNMS(KP414213562, T44, T43);
+				   T4w = FMA(KP414213562, T43, T44);
+				   T4x = FMA(KP414213562, T46, T47);
+				   T48 = FNMS(KP414213562, T47, T46);
+				   T3j = T2M + T2N;
+				   T2O = T2M - T2N;
+				   T53 = T4w + T4x;
+				   T4y = T4w - T4x;
+			      }
+			 }
+		    }
+		    {
+			 E T72, T5g, T49, T78, T77, T73, T7s, T7U, T7R, T7H, T3f, T3e, T3d;
+			 {
+			      E T5R, T8m, T8C, T8z, T8t, T8e, T86, T88, T8h, T8f, T8i, T8c, T8g;
+			      {
+				   E T6P, T6Q, T6Z, T6S, T6R;
+				   {
+					E Tv, T10, T6V, T6Y, T6U;
+					T72 = Tf - Tu;
+					Tv = Tf + Tu;
+					T6U = T5Q + T5N;
+					T5R = T5N - T5Q;
+					T5g = T48 - T45;
+					T49 = T45 + T48;
+					T10 = TK + TZ;
+					T78 = TK - TZ;
+					T77 = T6T - T6U;
+					T6V = T6T + T6U;
+					T6Y = T6W + T6X;
+					T73 = T6X - T6W;
+					T6P = W[30];
+					Rp[0] = Tv + T10;
+					T6Q = Tv - T10;
+					Rm[0] = T6V + T6Y;
+					T6Z = T6V - T6Y;
+					T6S = W[31];
+					T6R = T6P * T6Q;
+				   }
+				   {
+					E T8O, T8W, T8Q, T8Z, T8X, T90, T8U, T8Y;
+					{
+					     E T8R, T8S, T8M, T8N, T70;
+					     T8M = FMA(KP707106781, T8l, T8k);
+					     T8m = FNMS(KP707106781, T8l, T8k);
+					     T8C = T8A - T8B;
+					     T8N = T8A + T8B;
+					     T70 = T6S * T6Q;
+					     Rp[WS(rs, 8)] = FNMS(T6S, T6Z, T6R);
+					     T8R = FMA(KP707106781, T8y, T8x);
+					     T8z = FNMS(KP707106781, T8y, T8x);
+					     T8O = FNMS(KP923879532, T8N, T8M);
+					     T8W = FMA(KP923879532, T8N, T8M);
+					     Rm[WS(rs, 8)] = FMA(T6P, T6Z, T70);
+					     T8S = T8s + T8p;
+					     T8t = T8p - T8s;
+					     {
+						  E T8L, T8T, T8P, T8V;
+						  T8L = W[34];
+						  T8Q = W[35];
+						  T8V = W[2];
+						  T8Z = FMA(KP923879532, T8S, T8R);
+						  T8T = FNMS(KP923879532, T8S, T8R);
+						  T8P = T8L * T8O;
+						  T8X = T8V * T8W;
+						  T90 = T8V * T8Z;
+						  T8U = T8L * T8T;
+						  Rp[WS(rs, 9)] = FNMS(T8Q, T8T, T8P);
+						  T8Y = W[3];
+					     }
+					}
+					{
+					     E T89, T8a, T84, T85;
+					     T84 = FNMS(KP707106781, T7r, T7k);
+					     T7s = FMA(KP707106781, T7r, T7k);
+					     Rm[WS(rs, 9)] = FMA(T8Q, T8O, T8U);
+					     T85 = T7S + T7T;
+					     T7U = T7S - T7T;
+					     Rm[WS(rs, 1)] = FMA(T8Y, T8W, T90);
+					     Rp[WS(rs, 1)] = FNMS(T8Y, T8Z, T8X);
+					     T7R = FMA(KP707106781, T7Q, T7N);
+					     T89 = FNMS(KP707106781, T7Q, T7N);
+					     T8e = FMA(KP923879532, T85, T84);
+					     T86 = FNMS(KP923879532, T85, T84);
+					     T8a = T7G + T7z;
+					     T7H = T7z - T7G;
+					     {
+						  E T83, T8b, T87, T8d;
+						  T83 = W[26];
+						  T88 = W[27];
+						  T8d = W[58];
+						  T8h = FMA(KP923879532, T8a, T89);
+						  T8b = FNMS(KP923879532, T8a, T89);
+						  T87 = T83 * T86;
+						  T8f = T8d * T8e;
+						  T8i = T8d * T8h;
+						  T8c = T83 * T8b;
+						  Rp[WS(rs, 7)] = FNMS(T88, T8b, T87);
+						  T8g = W[59];
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5S, T6q, T6n, T6K, T6C, T6b, T6E, T6N, T6L, T6O, T6I, T6M;
+				   {
+					E T6F, T6G, T6A, T6B;
+					T6A = T5K - T5R;
+					T5S = T5K + T5R;
+					Rm[WS(rs, 7)] = FMA(T88, T86, T8c);
+					T6B = T6p - T6o;
+					T6q = T6o + T6p;
+					Rm[WS(rs, 15)] = FMA(T8g, T8e, T8i);
+					Rp[WS(rs, 15)] = FNMS(T8g, T8h, T8f);
+					T6n = T6f + T6m;
+					T6F = T6m - T6f;
+					T6K = FMA(KP707106781, T6B, T6A);
+					T6C = FNMS(KP707106781, T6B, T6A);
+					T6G = T61 - T6a;
+					T6b = T61 + T6a;
+					{
+					     E T6z, T6H, T6D, T6J;
+					     T6z = W[54];
+					     T6E = W[55];
+					     T6J = W[22];
+					     T6N = FMA(KP707106781, T6G, T6F);
+					     T6H = FNMS(KP707106781, T6G, T6F);
+					     T6D = T6z * T6C;
+					     T6L = T6J * T6K;
+					     T6O = T6J * T6N;
+					     T6I = T6z * T6H;
+					     Rp[WS(rs, 14)] = FNMS(T6E, T6H, T6D);
+					     T6M = W[23];
+					}
+				   }
+				   {
+					E T8G, T8F, T8J, T8H, T8I, T8u;
+					Rm[WS(rs, 14)] = FMA(T6E, T6C, T6I);
+					Rm[WS(rs, 6)] = FMA(T6M, T6K, T6O);
+					Rp[WS(rs, 6)] = FNMS(T6M, T6N, T6L);
+					T8G = FMA(KP923879532, T8t, T8m);
+					T8u = FNMS(KP923879532, T8t, T8m);
+					{
+					     E T8j, T8w, T8D, T8v, T8E;
+					     T8j = W[50];
+					     T8w = W[51];
+					     T8F = W[18];
+					     T8J = FMA(KP923879532, T8C, T8z);
+					     T8D = FNMS(KP923879532, T8C, T8z);
+					     T8v = T8j * T8u;
+					     T8E = T8w * T8u;
+					     T8H = T8F * T8G;
+					     T8I = W[19];
+					     Rp[WS(rs, 13)] = FNMS(T8w, T8D, T8v);
+					     Rm[WS(rs, 13)] = FMA(T8j, T8D, T8E);
+					}
+					{
+					     E T6c, T6u, T6x, T6r, T8K, T5J, T6e;
+					     Rp[WS(rs, 5)] = FNMS(T8I, T8J, T8H);
+					     T8K = T8I * T8G;
+					     Rm[WS(rs, 5)] = FMA(T8F, T8J, T8K);
+					     T6c = FNMS(KP707106781, T6b, T5S);
+					     T6u = FMA(KP707106781, T6b, T5S);
+					     T6x = FMA(KP707106781, T6q, T6n);
+					     T6r = FNMS(KP707106781, T6q, T6n);
+					     T5J = W[38];
+					     T6e = W[39];
+					     {
+						  E T6t, T6w, T6d, T6s, T6v, T6y;
+						  T6t = W[6];
+						  T6w = W[7];
+						  T6d = T5J * T6c;
+						  T6s = T6e * T6c;
+						  T6v = T6t * T6u;
+						  T6y = T6w * T6u;
+						  Rp[WS(rs, 10)] = FNMS(T6e, T6r, T6d);
+						  Rm[WS(rs, 10)] = FMA(T5J, T6r, T6s);
+						  Rp[WS(rs, 2)] = FNMS(T6w, T6x, T6v);
+						  Rm[WS(rs, 2)] = FMA(T6t, T6x, T6y);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7c, T7f, T7e, T7g, T7d;
+			      {
+				   E T71, T74, T79, T76, T75, T7b, T7a;
+				   T71 = W[46];
+				   T7c = T72 + T73;
+				   T74 = T72 - T73;
+				   T7f = T78 + T77;
+				   T79 = T77 - T78;
+				   T76 = W[47];
+				   T75 = T71 * T74;
+				   T7b = W[14];
+				   T7a = T71 * T79;
+				   T7e = W[15];
+				   Rp[WS(rs, 12)] = FNMS(T76, T79, T75);
+				   T7g = T7b * T7f;
+				   T7d = T7b * T7c;
+				   Rm[WS(rs, 12)] = FMA(T76, T74, T7a);
+			      }
+			      {
+				   E T81, T7X, T80, T7Z, T82;
+				   Rm[WS(rs, 4)] = FMA(T7e, T7c, T7g);
+				   Rp[WS(rs, 4)] = FNMS(T7e, T7f, T7d);
+				   {
+					E T7h, T7Y, T7I, T7V, T7K, T7J, T7W;
+					T7h = W[42];
+					T7Y = FMA(KP923879532, T7H, T7s);
+					T7I = FNMS(KP923879532, T7H, T7s);
+					T81 = FMA(KP923879532, T7U, T7R);
+					T7V = FNMS(KP923879532, T7U, T7R);
+					T7K = W[43];
+					T7J = T7h * T7I;
+					T7X = W[10];
+					T80 = W[11];
+					T7W = T7K * T7I;
+					Rp[WS(rs, 11)] = FNMS(T7K, T7V, T7J);
+					T7Z = T7X * T7Y;
+					T82 = T80 * T7Y;
+					Rm[WS(rs, 11)] = FMA(T7h, T7V, T7W);
+				   }
+				   {
+					E T2P, T37, T1G, T32, T2R, T2Q, T38, T2z, T27, T2y;
+					T2P = FMA(KP923879532, T2O, T2L);
+					T37 = FNMS(KP923879532, T2O, T2L);
+					Rp[WS(rs, 3)] = FNMS(T80, T81, T7Z);
+					Rm[WS(rs, 3)] = FMA(T7X, T81, T82);
+					T1G = FMA(KP923879532, T1F, T1i);
+					T32 = FNMS(KP923879532, T1F, T1i);
+					T2R = FNMS(KP668178637, T1X, T26);
+					T27 = FMA(KP668178637, T26, T1X);
+					T2y = FNMS(KP668178637, T2x, T2o);
+					T2Q = FMA(KP668178637, T2o, T2x);
+					T38 = T2y + T27;
+					T2z = T27 - T2y;
+					{
+					     E T2C, T2A, T3c, T34, T2U, T39, T36, T31;
+					     {
+						  E T11, T2W, T2S, T33;
+						  T11 = W[40];
+						  T2C = W[41];
+						  T2A = FNMS(KP831469612, T2z, T1G);
+						  T2W = FMA(KP831469612, T2z, T1G);
+						  T2S = T2Q - T2R;
+						  T33 = T2Q + T2R;
+						  {
+						       E T2V, T2B, T2T, T2Z, T2X, T2Y, T30;
+						       T2V = W[8];
+						       T2B = T11 * T2A;
+						       T3c = FMA(KP831469612, T33, T32);
+						       T34 = FNMS(KP831469612, T33, T32);
+						       T2T = FNMS(KP831469612, T2S, T2P);
+						       T2Z = FMA(KP831469612, T2S, T2P);
+						       T2X = T2V * T2W;
+						       T2Y = W[9];
+						       T30 = T2V * T2Z;
+						       Ip[WS(rs, 10)] = FNMS(T2C, T2T, T2B);
+						       T2U = T11 * T2T;
+						       Ip[WS(rs, 2)] = FNMS(T2Y, T2Z, T2X);
+						       Im[WS(rs, 2)] = FMA(T2Y, T2W, T30);
+						  }
+					     }
+					     T39 = FNMS(KP831469612, T38, T37);
+					     T3f = FMA(KP831469612, T38, T37);
+					     Im[WS(rs, 10)] = FMA(T2C, T2A, T2U);
+					     T36 = W[25];
+					     T31 = W[24];
+					     {
+						  E T3b, T3g, T3a, T35;
+						  T3e = W[57];
+						  T3a = T36 * T34;
+						  T35 = T31 * T34;
+						  T3b = W[56];
+						  T3g = T3e * T3c;
+						  Im[WS(rs, 6)] = FMA(T31, T39, T3a);
+						  Ip[WS(rs, 6)] = FNMS(T36, T39, T35);
+						  T3d = T3b * T3c;
+						  Im[WS(rs, 14)] = FMA(T3b, T3f, T3g);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4G, T4J, T4I, T4F, T4K;
+			      {
+				   E T4z, T4R, T4a, T4M, T4h, T4o, T4C, T4N, T4A, T4B;
+				   T4z = FMA(KP923879532, T4y, T4v);
+				   T4R = FNMS(KP923879532, T4y, T4v);
+				   T4a = FNMS(KP923879532, T49, T42);
+				   T4M = FMA(KP923879532, T49, T42);
+				   Ip[WS(rs, 14)] = FNMS(T3e, T3f, T3d);
+				   T4h = FNMS(KP668178637, T4g, T4d);
+				   T4A = FMA(KP668178637, T4d, T4g);
+				   T4B = FMA(KP668178637, T4k, T4n);
+				   T4o = FNMS(KP668178637, T4n, T4k);
+				   T4C = T4A - T4B;
+				   T4N = T4A + T4B;
+				   {
+					E T4W, T4Z, T4q, T4X, T50, T4Y;
+					{
+					     E T4L, T4Q, T4O, T4p, T4S, T4P, T4U, T4V, T4T;
+					     T4L = W[20];
+					     T4Q = W[21];
+					     T4W = FMA(KP831469612, T4N, T4M);
+					     T4O = FNMS(KP831469612, T4N, T4M);
+					     T4p = T4h + T4o;
+					     T4S = T4h - T4o;
+					     T4P = T4L * T4O;
+					     T4V = W[52];
+					     T4Z = FNMS(KP831469612, T4S, T4R);
+					     T4T = FMA(KP831469612, T4S, T4R);
+					     T4q = FNMS(KP831469612, T4p, T4a);
+					     T4G = FMA(KP831469612, T4p, T4a);
+					     Ip[WS(rs, 5)] = FNMS(T4Q, T4T, T4P);
+					     T4U = T4L * T4T;
+					     T4X = T4V * T4W;
+					     T50 = T4V * T4Z;
+					     T4Y = W[53];
+					     Im[WS(rs, 5)] = FMA(T4Q, T4O, T4U);
+					}
+					{
+					     E T4D, T4s, T3Z, T4E, T4r;
+					     T4J = FMA(KP831469612, T4C, T4z);
+					     T4D = FNMS(KP831469612, T4C, T4z);
+					     T4s = W[37];
+					     Im[WS(rs, 13)] = FMA(T4Y, T4W, T50);
+					     Ip[WS(rs, 13)] = FNMS(T4Y, T4Z, T4X);
+					     T3Z = W[36];
+					     T4E = T4s * T4q;
+					     T4I = W[5];
+					     T4r = T3Z * T4q;
+					     Im[WS(rs, 9)] = FMA(T3Z, T4D, T4E);
+					     T4F = W[4];
+					     T4K = T4I * T4G;
+					     Ip[WS(rs, 9)] = FNMS(T4s, T4D, T4r);
+					}
+				   }
+			      }
+			      {
+				   E T3E, T3H, T3G, T3D, T3I;
+				   {
+					E T3x, T3P, T3k, T3K, T3n, T3q, T3A, T3L, T4H, T3y, T3z;
+					T3x = FMA(KP923879532, T3w, T3v);
+					T3P = FNMS(KP923879532, T3w, T3v);
+					T4H = T4F * T4G;
+					Im[WS(rs, 1)] = FMA(T4F, T4J, T4K);
+					T3k = FMA(KP923879532, T3j, T3i);
+					T3K = FNMS(KP923879532, T3j, T3i);
+					T3y = FMA(KP198912367, T3l, T3m);
+					T3n = FNMS(KP198912367, T3m, T3l);
+					Ip[WS(rs, 1)] = FNMS(T4I, T4J, T4H);
+					T3z = FNMS(KP198912367, T3o, T3p);
+					T3q = FMA(KP198912367, T3p, T3o);
+					T3A = T3y + T3z;
+					T3L = T3z - T3y;
+					{
+					     E T3U, T3X, T3s, T3V, T3Y, T3W;
+					     {
+						  E T3J, T3O, T3M, T3r, T3Q, T3N, T3S, T3T, T3R;
+						  T3J = W[48];
+						  T3O = W[49];
+						  T3U = FMA(KP980785280, T3L, T3K);
+						  T3M = FNMS(KP980785280, T3L, T3K);
+						  T3r = T3n + T3q;
+						  T3Q = T3n - T3q;
+						  T3N = T3J * T3M;
+						  T3T = W[16];
+						  T3X = FMA(KP980785280, T3Q, T3P);
+						  T3R = FNMS(KP980785280, T3Q, T3P);
+						  T3s = FNMS(KP980785280, T3r, T3k);
+						  T3E = FMA(KP980785280, T3r, T3k);
+						  Ip[WS(rs, 12)] = FNMS(T3O, T3R, T3N);
+						  T3S = T3J * T3R;
+						  T3V = T3T * T3U;
+						  T3Y = T3T * T3X;
+						  T3W = W[17];
+						  Im[WS(rs, 12)] = FMA(T3O, T3M, T3S);
+					     }
+					     {
+						  E T3B, T3u, T3h, T3C, T3t;
+						  T3H = FMA(KP980785280, T3A, T3x);
+						  T3B = FNMS(KP980785280, T3A, T3x);
+						  T3u = W[33];
+						  Im[WS(rs, 4)] = FMA(T3W, T3U, T3Y);
+						  Ip[WS(rs, 4)] = FNMS(T3W, T3X, T3V);
+						  T3h = W[32];
+						  T3C = T3u * T3s;
+						  T3G = W[1];
+						  T3t = T3h * T3s;
+						  Im[WS(rs, 8)] = FMA(T3h, T3B, T3C);
+						  T3D = W[0];
+						  T3I = T3G * T3E;
+						  Ip[WS(rs, 8)] = FNMS(T3u, T3B, T3t);
+					     }
+					}
+				   }
+				   {
+					E T5h, T5z, T54, T5u, T57, T5a, T5k, T5v, T3F, T5i, T5j;
+					T5h = FMA(KP923879532, T5g, T5f);
+					T5z = FNMS(KP923879532, T5g, T5f);
+					T3F = T3D * T3E;
+					Im[0] = FMA(T3D, T3H, T3I);
+					T54 = FNMS(KP923879532, T53, T52);
+					T5u = FMA(KP923879532, T53, T52);
+					T5i = FMA(KP198912367, T55, T56);
+					T57 = FNMS(KP198912367, T56, T55);
+					Ip[0] = FNMS(T3G, T3H, T3F);
+					T5j = FMA(KP198912367, T58, T59);
+					T5a = FNMS(KP198912367, T59, T58);
+					T5k = T5i - T5j;
+					T5v = T5i + T5j;
+					{
+					     E T5E, T5H, T5c, T5F, T5I, T5G;
+					     {
+						  E T5t, T5y, T5w, T5b, T5A, T5x, T5C, T5D, T5B;
+						  T5t = W[28];
+						  T5y = W[29];
+						  T5E = FMA(KP980785280, T5v, T5u);
+						  T5w = FNMS(KP980785280, T5v, T5u);
+						  T5b = T57 + T5a;
+						  T5A = T5a - T57;
+						  T5x = T5t * T5w;
+						  T5D = W[60];
+						  T5H = FNMS(KP980785280, T5A, T5z);
+						  T5B = FMA(KP980785280, T5A, T5z);
+						  T5c = FMA(KP980785280, T5b, T54);
+						  T5o = FNMS(KP980785280, T5b, T54);
+						  Ip[WS(rs, 7)] = FNMS(T5y, T5B, T5x);
+						  T5C = T5t * T5B;
+						  T5F = T5D * T5E;
+						  T5I = T5D * T5H;
+						  T5G = W[61];
+						  Im[WS(rs, 7)] = FMA(T5y, T5w, T5C);
+					     }
+					     {
+						  E T5l, T5e, T51, T5m, T5d;
+						  T5r = FMA(KP980785280, T5k, T5h);
+						  T5l = FNMS(KP980785280, T5k, T5h);
+						  T5e = W[45];
+						  Im[WS(rs, 15)] = FMA(T5G, T5E, T5I);
+						  Ip[WS(rs, 15)] = FNMS(T5G, T5H, T5F);
+						  T51 = W[44];
+						  T5m = T5e * T5c;
+						  T5q = W[13];
+						  T5d = T51 * T5c;
+						  Im[WS(rs, 11)] = FMA(T51, T5l, T5m);
+						  T5n = W[12];
+						  T5s = T5q * T5o;
+						  Ip[WS(rs, 11)] = FNMS(T5e, T5l, T5d);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5p = T5n * T5o;
+	       Im[WS(rs, 3)] = FMA(T5n, T5r, T5s);
+	       Ip[WS(rs, 3)] = FNMS(T5q, T5r, T5p);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cb_32", twinstr, &GENUS, {236, 62, 198, 0} };
+
+void X(codelet_hc2cb_32) (planner *p) {
+     X(khc2c_register) (p, hc2cb_32, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cb_32 -include hc2cb.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 98 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T4o, T6y, T70, T5u, Tf, T12, T5x, T6z, T3m, T3Y, T29, T2y, T4v, T71, T2U;
+	       E T3M, Tu, T1U, T6D, T73, T6G, T74, T1h, T2z, T2X, T3o, T4D, T5A, T4K, T5z;
+	       E T30, T3n, TK, T1j, T6S, T7w, T6V, T7v, T1y, T2B, T3c, T3S, T4X, T61, T54;
+	       E T62, T3f, T3T, TZ, T1A, T6L, T7z, T6O, T7y, T1P, T2C, T35, T3P, T5g, T64;
+	       E T5n, T65, T38, T3Q;
+	       {
+		    E T3, T4m, T1X, T5t, T6, T5s, T20, T4n, Ta, T4p, T24, T4q, Td, T4s, T27;
+		    E T4t;
+		    {
+			 E T1, T2, T1V, T1W;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 15)];
+			 T3 = T1 + T2;
+			 T4m = T1 - T2;
+			 T1V = Ip[0];
+			 T1W = Im[WS(rs, 15)];
+			 T1X = T1V - T1W;
+			 T5t = T1V + T1W;
+		    }
+		    {
+			 E T4, T5, T1Y, T1Z;
+			 T4 = Rp[WS(rs, 8)];
+			 T5 = Rm[WS(rs, 7)];
+			 T6 = T4 + T5;
+			 T5s = T4 - T5;
+			 T1Y = Ip[WS(rs, 8)];
+			 T1Z = Im[WS(rs, 7)];
+			 T20 = T1Y - T1Z;
+			 T4n = T1Y + T1Z;
+		    }
+		    {
+			 E T8, T9, T22, T23;
+			 T8 = Rp[WS(rs, 4)];
+			 T9 = Rm[WS(rs, 11)];
+			 Ta = T8 + T9;
+			 T4p = T8 - T9;
+			 T22 = Ip[WS(rs, 4)];
+			 T23 = Im[WS(rs, 11)];
+			 T24 = T22 - T23;
+			 T4q = T22 + T23;
+		    }
+		    {
+			 E Tb, Tc, T25, T26;
+			 Tb = Rm[WS(rs, 3)];
+			 Tc = Rp[WS(rs, 12)];
+			 Td = Tb + Tc;
+			 T4s = Tb - Tc;
+			 T25 = Ip[WS(rs, 12)];
+			 T26 = Im[WS(rs, 3)];
+			 T27 = T25 - T26;
+			 T4t = T25 + T26;
+		    }
+		    {
+			 E T7, Te, T21, T28;
+			 T4o = T4m - T4n;
+			 T6y = T4m + T4n;
+			 T70 = T5t - T5s;
+			 T5u = T5s + T5t;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T12 = T7 - Te;
+			 {
+			      E T5v, T5w, T3k, T3l;
+			      T5v = T4p + T4q;
+			      T5w = T4s + T4t;
+			      T5x = KP707106781 * (T5v - T5w);
+			      T6z = KP707106781 * (T5v + T5w);
+			      T3k = T1X - T20;
+			      T3l = Ta - Td;
+			      T3m = T3k - T3l;
+			      T3Y = T3l + T3k;
+			 }
+			 T21 = T1X + T20;
+			 T28 = T24 + T27;
+			 T29 = T21 - T28;
+			 T2y = T21 + T28;
+			 {
+			      E T4r, T4u, T2S, T2T;
+			      T4r = T4p - T4q;
+			      T4u = T4s - T4t;
+			      T4v = KP707106781 * (T4r + T4u);
+			      T71 = KP707106781 * (T4r - T4u);
+			      T2S = T3 - T6;
+			      T2T = T27 - T24;
+			      T2U = T2S - T2T;
+			      T3M = T2S + T2T;
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T4H, T1c, T4F, Tl, T4E, T1f, T4I, Tp, T4A, T15, T4y, Ts, T4x, T18;
+		    E T4B;
+		    {
+			 E Tg, Th, T1a, T1b;
+			 Tg = Rp[WS(rs, 2)];
+			 Th = Rm[WS(rs, 13)];
+			 Ti = Tg + Th;
+			 T4H = Tg - Th;
+			 T1a = Ip[WS(rs, 2)];
+			 T1b = Im[WS(rs, 13)];
+			 T1c = T1a - T1b;
+			 T4F = T1a + T1b;
+		    }
+		    {
+			 E Tj, Tk, T1d, T1e;
+			 Tj = Rp[WS(rs, 10)];
+			 Tk = Rm[WS(rs, 5)];
+			 Tl = Tj + Tk;
+			 T4E = Tj - Tk;
+			 T1d = Ip[WS(rs, 10)];
+			 T1e = Im[WS(rs, 5)];
+			 T1f = T1d - T1e;
+			 T4I = T1d + T1e;
+		    }
+		    {
+			 E Tn, To, T13, T14;
+			 Tn = Rm[WS(rs, 1)];
+			 To = Rp[WS(rs, 14)];
+			 Tp = Tn + To;
+			 T4A = Tn - To;
+			 T13 = Ip[WS(rs, 14)];
+			 T14 = Im[WS(rs, 1)];
+			 T15 = T13 - T14;
+			 T4y = T13 + T14;
+		    }
+		    {
+			 E Tq, Tr, T16, T17;
+			 Tq = Rp[WS(rs, 6)];
+			 Tr = Rm[WS(rs, 9)];
+			 Ts = Tq + Tr;
+			 T4x = Tq - Tr;
+			 T16 = Ip[WS(rs, 6)];
+			 T17 = Im[WS(rs, 9)];
+			 T18 = T16 - T17;
+			 T4B = T16 + T17;
+		    }
+		    {
+			 E Tm, Tt, T6B, T6C;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T1U = Tm - Tt;
+			 T6B = T4H + T4I;
+			 T6C = T4F - T4E;
+			 T6D = FNMS(KP923879532, T6C, KP382683432 * T6B);
+			 T73 = FMA(KP382683432, T6C, KP923879532 * T6B);
+		    }
+		    {
+			 E T6E, T6F, T19, T1g;
+			 T6E = T4A + T4B;
+			 T6F = T4x + T4y;
+			 T6G = FNMS(KP923879532, T6F, KP382683432 * T6E);
+			 T74 = FMA(KP382683432, T6F, KP923879532 * T6E);
+			 T19 = T15 + T18;
+			 T1g = T1c + T1f;
+			 T1h = T19 - T1g;
+			 T2z = T1g + T19;
+		    }
+		    {
+			 E T2V, T2W, T4z, T4C;
+			 T2V = T15 - T18;
+			 T2W = Tp - Ts;
+			 T2X = T2V - T2W;
+			 T3o = T2W + T2V;
+			 T4z = T4x - T4y;
+			 T4C = T4A - T4B;
+			 T4D = FNMS(KP382683432, T4C, KP923879532 * T4z);
+			 T5A = FMA(KP382683432, T4z, KP923879532 * T4C);
+		    }
+		    {
+			 E T4G, T4J, T2Y, T2Z;
+			 T4G = T4E + T4F;
+			 T4J = T4H - T4I;
+			 T4K = FMA(KP923879532, T4G, KP382683432 * T4J);
+			 T5z = FNMS(KP382683432, T4G, KP923879532 * T4J);
+			 T2Y = Ti - Tl;
+			 T2Z = T1c - T1f;
+			 T30 = T2Y + T2Z;
+			 T3n = T2Y - T2Z;
+		    }
+	       }
+	       {
+		    E Ty, T4N, T1m, T4Z, TB, T4Y, T1p, T4O, TI, T52, T1w, T4V, TF, T51, T1t;
+		    E T4S;
+		    {
+			 E Tw, Tx, T1n, T1o;
+			 Tw = Rp[WS(rs, 1)];
+			 Tx = Rm[WS(rs, 14)];
+			 Ty = Tw + Tx;
+			 T4N = Tw - Tx;
+			 {
+			      E T1k, T1l, Tz, TA;
+			      T1k = Ip[WS(rs, 1)];
+			      T1l = Im[WS(rs, 14)];
+			      T1m = T1k - T1l;
+			      T4Z = T1k + T1l;
+			      Tz = Rp[WS(rs, 9)];
+			      TA = Rm[WS(rs, 6)];
+			      TB = Tz + TA;
+			      T4Y = Tz - TA;
+			 }
+			 T1n = Ip[WS(rs, 9)];
+			 T1o = Im[WS(rs, 6)];
+			 T1p = T1n - T1o;
+			 T4O = T1n + T1o;
+			 {
+			      E TG, TH, T4T, T1u, T1v, T4U;
+			      TG = Rm[WS(rs, 2)];
+			      TH = Rp[WS(rs, 13)];
+			      T4T = TG - TH;
+			      T1u = Ip[WS(rs, 13)];
+			      T1v = Im[WS(rs, 2)];
+			      T4U = T1u + T1v;
+			      TI = TG + TH;
+			      T52 = T4T + T4U;
+			      T1w = T1u - T1v;
+			      T4V = T4T - T4U;
+			 }
+			 {
+			      E TD, TE, T4Q, T1r, T1s, T4R;
+			      TD = Rp[WS(rs, 5)];
+			      TE = Rm[WS(rs, 10)];
+			      T4Q = TD - TE;
+			      T1r = Ip[WS(rs, 5)];
+			      T1s = Im[WS(rs, 10)];
+			      T4R = T1r + T1s;
+			      TF = TD + TE;
+			      T51 = T4Q + T4R;
+			      T1t = T1r - T1s;
+			      T4S = T4Q - T4R;
+			 }
+		    }
+		    {
+			 E TC, TJ, T6Q, T6R;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T1j = TC - TJ;
+			 T6Q = T4Z - T4Y;
+			 T6R = KP707106781 * (T4S - T4V);
+			 T6S = T6Q + T6R;
+			 T7w = T6Q - T6R;
+		    }
+		    {
+			 E T6T, T6U, T1q, T1x;
+			 T6T = T4N + T4O;
+			 T6U = KP707106781 * (T51 + T52);
+			 T6V = T6T - T6U;
+			 T7v = T6T + T6U;
+			 T1q = T1m + T1p;
+			 T1x = T1t + T1w;
+			 T1y = T1q - T1x;
+			 T2B = T1q + T1x;
+		    }
+		    {
+			 E T3a, T3b, T4P, T4W;
+			 T3a = T1m - T1p;
+			 T3b = TF - TI;
+			 T3c = T3a - T3b;
+			 T3S = T3b + T3a;
+			 T4P = T4N - T4O;
+			 T4W = KP707106781 * (T4S + T4V);
+			 T4X = T4P - T4W;
+			 T61 = T4P + T4W;
+		    }
+		    {
+			 E T50, T53, T3d, T3e;
+			 T50 = T4Y + T4Z;
+			 T53 = KP707106781 * (T51 - T52);
+			 T54 = T50 - T53;
+			 T62 = T50 + T53;
+			 T3d = Ty - TB;
+			 T3e = T1w - T1t;
+			 T3f = T3d - T3e;
+			 T3T = T3d + T3e;
+		    }
+	       }
+	       {
+		    E TN, T56, T1D, T5i, TQ, T5h, T1G, T57, TX, T5l, T1N, T5e, TU, T5k, T1K;
+		    E T5b;
+		    {
+			 E TL, TM, T1E, T1F;
+			 TL = Rm[0];
+			 TM = Rp[WS(rs, 15)];
+			 TN = TL + TM;
+			 T56 = TL - TM;
+			 {
+			      E T1B, T1C, TO, TP;
+			      T1B = Ip[WS(rs, 15)];
+			      T1C = Im[0];
+			      T1D = T1B - T1C;
+			      T5i = T1B + T1C;
+			      TO = Rp[WS(rs, 7)];
+			      TP = Rm[WS(rs, 8)];
+			      TQ = TO + TP;
+			      T5h = TO - TP;
+			 }
+			 T1E = Ip[WS(rs, 7)];
+			 T1F = Im[WS(rs, 8)];
+			 T1G = T1E - T1F;
+			 T57 = T1E + T1F;
+			 {
+			      E TV, TW, T5c, T1L, T1M, T5d;
+			      TV = Rm[WS(rs, 4)];
+			      TW = Rp[WS(rs, 11)];
+			      T5c = TV - TW;
+			      T1L = Ip[WS(rs, 11)];
+			      T1M = Im[WS(rs, 4)];
+			      T5d = T1L + T1M;
+			      TX = TV + TW;
+			      T5l = T5c + T5d;
+			      T1N = T1L - T1M;
+			      T5e = T5c - T5d;
+			 }
+			 {
+			      E TS, TT, T59, T1I, T1J, T5a;
+			      TS = Rp[WS(rs, 3)];
+			      TT = Rm[WS(rs, 12)];
+			      T59 = TS - TT;
+			      T1I = Ip[WS(rs, 3)];
+			      T1J = Im[WS(rs, 12)];
+			      T5a = T1I + T1J;
+			      TU = TS + TT;
+			      T5k = T59 + T5a;
+			      T1K = T1I - T1J;
+			      T5b = T59 - T5a;
+			 }
+		    }
+		    {
+			 E TR, TY, T6J, T6K;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 T1A = TR - TY;
+			 T6J = KP707106781 * (T5b - T5e);
+			 T6K = T5h + T5i;
+			 T6L = T6J - T6K;
+			 T7z = T6K + T6J;
+		    }
+		    {
+			 E T6M, T6N, T1H, T1O;
+			 T6M = T56 + T57;
+			 T6N = KP707106781 * (T5k + T5l);
+			 T6O = T6M - T6N;
+			 T7y = T6M + T6N;
+			 T1H = T1D + T1G;
+			 T1O = T1K + T1N;
+			 T1P = T1H - T1O;
+			 T2C = T1H + T1O;
+		    }
+		    {
+			 E T33, T34, T58, T5f;
+			 T33 = T1D - T1G;
+			 T34 = TU - TX;
+			 T35 = T33 - T34;
+			 T3P = T34 + T33;
+			 T58 = T56 - T57;
+			 T5f = KP707106781 * (T5b + T5e);
+			 T5g = T58 - T5f;
+			 T64 = T58 + T5f;
+		    }
+		    {
+			 E T5j, T5m, T36, T37;
+			 T5j = T5h - T5i;
+			 T5m = KP707106781 * (T5k - T5l);
+			 T5n = T5j - T5m;
+			 T65 = T5j + T5m;
+			 T36 = TN - TQ;
+			 T37 = T1N - T1K;
+			 T38 = T36 - T37;
+			 T3Q = T36 + T37;
+		    }
+	       }
+	       {
+		    E Tv, T10, T2w, T2A, T2D, T2E, T2v, T2x;
+		    Tv = Tf + Tu;
+		    T10 = TK + TZ;
+		    T2w = Tv - T10;
+		    T2A = T2y + T2z;
+		    T2D = T2B + T2C;
+		    T2E = T2A - T2D;
+		    Rp[0] = Tv + T10;
+		    Rm[0] = T2A + T2D;
+		    T2v = W[30];
+		    T2x = W[31];
+		    Rp[WS(rs, 8)] = FNMS(T2x, T2E, T2v * T2w);
+		    Rm[WS(rs, 8)] = FMA(T2x, T2w, T2v * T2E);
+	       }
+	       {
+		    E T2I, T2O, T2M, T2Q;
+		    {
+			 E T2G, T2H, T2K, T2L;
+			 T2G = Tf - Tu;
+			 T2H = T2C - T2B;
+			 T2I = T2G - T2H;
+			 T2O = T2G + T2H;
+			 T2K = T2y - T2z;
+			 T2L = TK - TZ;
+			 T2M = T2K - T2L;
+			 T2Q = T2L + T2K;
+		    }
+		    {
+			 E T2F, T2J, T2N, T2P;
+			 T2F = W[46];
+			 T2J = W[47];
+			 Rp[WS(rs, 12)] = FNMS(T2J, T2M, T2F * T2I);
+			 Rm[WS(rs, 12)] = FMA(T2F, T2M, T2J * T2I);
+			 T2N = W[14];
+			 T2P = W[15];
+			 Rp[WS(rs, 4)] = FNMS(T2P, T2Q, T2N * T2O);
+			 Rm[WS(rs, 4)] = FMA(T2N, T2Q, T2P * T2O);
+		    }
+	       }
+	       {
+		    E T1i, T2a, T2o, T2k, T2d, T2l, T1R, T2p;
+		    T1i = T12 + T1h;
+		    T2a = T1U + T29;
+		    T2o = T29 - T1U;
+		    T2k = T12 - T1h;
+		    {
+			 E T2b, T2c, T1z, T1Q;
+			 T2b = T1j + T1y;
+			 T2c = T1P - T1A;
+			 T2d = KP707106781 * (T2b + T2c);
+			 T2l = KP707106781 * (T2c - T2b);
+			 T1z = T1j - T1y;
+			 T1Q = T1A + T1P;
+			 T1R = KP707106781 * (T1z + T1Q);
+			 T2p = KP707106781 * (T1z - T1Q);
+		    }
+		    {
+			 E T1S, T2e, T11, T1T;
+			 T1S = T1i - T1R;
+			 T2e = T2a - T2d;
+			 T11 = W[38];
+			 T1T = W[39];
+			 Rp[WS(rs, 10)] = FNMS(T1T, T2e, T11 * T1S);
+			 Rm[WS(rs, 10)] = FMA(T1T, T1S, T11 * T2e);
+		    }
+		    {
+			 E T2s, T2u, T2r, T2t;
+			 T2s = T2k + T2l;
+			 T2u = T2o + T2p;
+			 T2r = W[22];
+			 T2t = W[23];
+			 Rp[WS(rs, 6)] = FNMS(T2t, T2u, T2r * T2s);
+			 Rm[WS(rs, 6)] = FMA(T2r, T2u, T2t * T2s);
+		    }
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = T1i + T1R;
+			 T2i = T2a + T2d;
+			 T2f = W[6];
+			 T2h = W[7];
+			 Rp[WS(rs, 2)] = FNMS(T2h, T2i, T2f * T2g);
+			 Rm[WS(rs, 2)] = FMA(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2m, T2q, T2j, T2n;
+			 T2m = T2k - T2l;
+			 T2q = T2o - T2p;
+			 T2j = W[54];
+			 T2n = W[55];
+			 Rp[WS(rs, 14)] = FNMS(T2n, T2q, T2j * T2m);
+			 Rm[WS(rs, 14)] = FMA(T2j, T2q, T2n * T2m);
+		    }
+	       }
+	       {
+		    E T3O, T4a, T40, T4e, T3V, T4f, T43, T4b, T3N, T3Z;
+		    T3N = KP707106781 * (T3n + T3o);
+		    T3O = T3M - T3N;
+		    T4a = T3M + T3N;
+		    T3Z = KP707106781 * (T30 + T2X);
+		    T40 = T3Y - T3Z;
+		    T4e = T3Y + T3Z;
+		    {
+			 E T3R, T3U, T41, T42;
+			 T3R = FNMS(KP382683432, T3Q, KP923879532 * T3P);
+			 T3U = FMA(KP923879532, T3S, KP382683432 * T3T);
+			 T3V = T3R - T3U;
+			 T4f = T3U + T3R;
+			 T41 = FNMS(KP382683432, T3S, KP923879532 * T3T);
+			 T42 = FMA(KP382683432, T3P, KP923879532 * T3Q);
+			 T43 = T41 - T42;
+			 T4b = T41 + T42;
+		    }
+		    {
+			 E T3W, T44, T3L, T3X;
+			 T3W = T3O - T3V;
+			 T44 = T40 - T43;
+			 T3L = W[50];
+			 T3X = W[51];
+			 Rp[WS(rs, 13)] = FNMS(T3X, T44, T3L * T3W);
+			 Rm[WS(rs, 13)] = FMA(T3X, T3W, T3L * T44);
+		    }
+		    {
+			 E T4i, T4k, T4h, T4j;
+			 T4i = T4a + T4b;
+			 T4k = T4e + T4f;
+			 T4h = W[2];
+			 T4j = W[3];
+			 Rp[WS(rs, 1)] = FNMS(T4j, T4k, T4h * T4i);
+			 Rm[WS(rs, 1)] = FMA(T4h, T4k, T4j * T4i);
+		    }
+		    {
+			 E T46, T48, T45, T47;
+			 T46 = T3O + T3V;
+			 T48 = T40 + T43;
+			 T45 = W[18];
+			 T47 = W[19];
+			 Rp[WS(rs, 5)] = FNMS(T47, T48, T45 * T46);
+			 Rm[WS(rs, 5)] = FMA(T47, T46, T45 * T48);
+		    }
+		    {
+			 E T4c, T4g, T49, T4d;
+			 T4c = T4a - T4b;
+			 T4g = T4e - T4f;
+			 T49 = W[34];
+			 T4d = W[35];
+			 Rp[WS(rs, 9)] = FNMS(T4d, T4g, T49 * T4c);
+			 Rm[WS(rs, 9)] = FMA(T49, T4g, T4d * T4c);
+		    }
+	       }
+	       {
+		    E T32, T3A, T3q, T3E, T3h, T3F, T3t, T3B, T31, T3p;
+		    T31 = KP707106781 * (T2X - T30);
+		    T32 = T2U - T31;
+		    T3A = T2U + T31;
+		    T3p = KP707106781 * (T3n - T3o);
+		    T3q = T3m - T3p;
+		    T3E = T3m + T3p;
+		    {
+			 E T39, T3g, T3r, T3s;
+			 T39 = FNMS(KP923879532, T38, KP382683432 * T35);
+			 T3g = FMA(KP382683432, T3c, KP923879532 * T3f);
+			 T3h = T39 - T3g;
+			 T3F = T3g + T39;
+			 T3r = FNMS(KP923879532, T3c, KP382683432 * T3f);
+			 T3s = FMA(KP923879532, T35, KP382683432 * T38);
+			 T3t = T3r - T3s;
+			 T3B = T3r + T3s;
+		    }
+		    {
+			 E T3i, T3u, T2R, T3j;
+			 T3i = T32 - T3h;
+			 T3u = T3q - T3t;
+			 T2R = W[58];
+			 T3j = W[59];
+			 Rp[WS(rs, 15)] = FNMS(T3j, T3u, T2R * T3i);
+			 Rm[WS(rs, 15)] = FMA(T3j, T3i, T2R * T3u);
+		    }
+		    {
+			 E T3I, T3K, T3H, T3J;
+			 T3I = T3A + T3B;
+			 T3K = T3E + T3F;
+			 T3H = W[10];
+			 T3J = W[11];
+			 Rp[WS(rs, 3)] = FNMS(T3J, T3K, T3H * T3I);
+			 Rm[WS(rs, 3)] = FMA(T3H, T3K, T3J * T3I);
+		    }
+		    {
+			 E T3w, T3y, T3v, T3x;
+			 T3w = T32 + T3h;
+			 T3y = T3q + T3t;
+			 T3v = W[26];
+			 T3x = W[27];
+			 Rp[WS(rs, 7)] = FNMS(T3x, T3y, T3v * T3w);
+			 Rm[WS(rs, 7)] = FMA(T3x, T3w, T3v * T3y);
+		    }
+		    {
+			 E T3C, T3G, T3z, T3D;
+			 T3C = T3A - T3B;
+			 T3G = T3E - T3F;
+			 T3z = W[42];
+			 T3D = W[43];
+			 Rp[WS(rs, 11)] = FNMS(T3D, T3G, T3z * T3C);
+			 Rm[WS(rs, 11)] = FMA(T3z, T3G, T3D * T3C);
+		    }
+	       }
+	       {
+		    E T60, T6m, T6f, T6n, T67, T6r, T6c, T6q;
+		    {
+			 E T5Y, T5Z, T6d, T6e;
+			 T5Y = T4o + T4v;
+			 T5Z = T5z + T5A;
+			 T60 = T5Y + T5Z;
+			 T6m = T5Y - T5Z;
+			 T6d = FMA(KP195090322, T61, KP980785280 * T62);
+			 T6e = FNMS(KP195090322, T64, KP980785280 * T65);
+			 T6f = T6d + T6e;
+			 T6n = T6e - T6d;
+		    }
+		    {
+			 E T63, T66, T6a, T6b;
+			 T63 = FNMS(KP195090322, T62, KP980785280 * T61);
+			 T66 = FMA(KP980785280, T64, KP195090322 * T65);
+			 T67 = T63 + T66;
+			 T6r = T63 - T66;
+			 T6a = T5u + T5x;
+			 T6b = T4K + T4D;
+			 T6c = T6a + T6b;
+			 T6q = T6a - T6b;
+		    }
+		    {
+			 E T68, T6g, T5X, T69;
+			 T68 = T60 - T67;
+			 T6g = T6c - T6f;
+			 T5X = W[32];
+			 T69 = W[33];
+			 Ip[WS(rs, 8)] = FNMS(T69, T6g, T5X * T68);
+			 Im[WS(rs, 8)] = FMA(T69, T68, T5X * T6g);
+		    }
+		    {
+			 E T6u, T6w, T6t, T6v;
+			 T6u = T6m + T6n;
+			 T6w = T6q + T6r;
+			 T6t = W[16];
+			 T6v = W[17];
+			 Ip[WS(rs, 4)] = FNMS(T6v, T6w, T6t * T6u);
+			 Im[WS(rs, 4)] = FMA(T6t, T6w, T6v * T6u);
+		    }
+		    {
+			 E T6i, T6k, T6h, T6j;
+			 T6i = T60 + T67;
+			 T6k = T6c + T6f;
+			 T6h = W[0];
+			 T6j = W[1];
+			 Ip[0] = FNMS(T6j, T6k, T6h * T6i);
+			 Im[0] = FMA(T6j, T6i, T6h * T6k);
+		    }
+		    {
+			 E T6o, T6s, T6l, T6p;
+			 T6o = T6m - T6n;
+			 T6s = T6q - T6r;
+			 T6l = W[48];
+			 T6p = W[49];
+			 Ip[WS(rs, 12)] = FNMS(T6p, T6s, T6l * T6o);
+			 Im[WS(rs, 12)] = FMA(T6l, T6s, T6p * T6o);
+		    }
+	       }
+	       {
+		    E T7u, T7Q, T7J, T7R, T7B, T7V, T7G, T7U;
+		    {
+			 E T7s, T7t, T7H, T7I;
+			 T7s = T6y + T6z;
+			 T7t = T73 + T74;
+			 T7u = T7s - T7t;
+			 T7Q = T7s + T7t;
+			 T7H = FMA(KP195090322, T7w, KP980785280 * T7v);
+			 T7I = FMA(KP195090322, T7z, KP980785280 * T7y);
+			 T7J = T7H - T7I;
+			 T7R = T7H + T7I;
+		    }
+		    {
+			 E T7x, T7A, T7E, T7F;
+			 T7x = FNMS(KP980785280, T7w, KP195090322 * T7v);
+			 T7A = FNMS(KP980785280, T7z, KP195090322 * T7y);
+			 T7B = T7x + T7A;
+			 T7V = T7x - T7A;
+			 T7E = T70 - T71;
+			 T7F = T6D - T6G;
+			 T7G = T7E + T7F;
+			 T7U = T7E - T7F;
+		    }
+		    {
+			 E T7C, T7K, T7r, T7D;
+			 T7C = T7u - T7B;
+			 T7K = T7G - T7J;
+			 T7r = W[44];
+			 T7D = W[45];
+			 Ip[WS(rs, 11)] = FNMS(T7D, T7K, T7r * T7C);
+			 Im[WS(rs, 11)] = FMA(T7D, T7C, T7r * T7K);
+		    }
+		    {
+			 E T7Y, T80, T7X, T7Z;
+			 T7Y = T7Q + T7R;
+			 T80 = T7U - T7V;
+			 T7X = W[60];
+			 T7Z = W[61];
+			 Ip[WS(rs, 15)] = FNMS(T7Z, T80, T7X * T7Y);
+			 Im[WS(rs, 15)] = FMA(T7X, T80, T7Z * T7Y);
+		    }
+		    {
+			 E T7M, T7O, T7L, T7N;
+			 T7M = T7u + T7B;
+			 T7O = T7G + T7J;
+			 T7L = W[12];
+			 T7N = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T7N, T7O, T7L * T7M);
+			 Im[WS(rs, 3)] = FMA(T7N, T7M, T7L * T7O);
+		    }
+		    {
+			 E T7S, T7W, T7P, T7T;
+			 T7S = T7Q - T7R;
+			 T7W = T7U + T7V;
+			 T7P = W[28];
+			 T7T = W[29];
+			 Ip[WS(rs, 7)] = FNMS(T7T, T7W, T7P * T7S);
+			 Im[WS(rs, 7)] = FMA(T7P, T7W, T7T * T7S);
+		    }
+	       }
+	       {
+		    E T4M, T5M, T5F, T5N, T5p, T5R, T5C, T5Q;
+		    {
+			 E T4w, T4L, T5D, T5E;
+			 T4w = T4o - T4v;
+			 T4L = T4D - T4K;
+			 T4M = T4w + T4L;
+			 T5M = T4w - T4L;
+			 T5D = FMA(KP831469612, T4X, KP555570233 * T54);
+			 T5E = FNMS(KP831469612, T5g, KP555570233 * T5n);
+			 T5F = T5D + T5E;
+			 T5N = T5E - T5D;
+		    }
+		    {
+			 E T55, T5o, T5y, T5B;
+			 T55 = FNMS(KP831469612, T54, KP555570233 * T4X);
+			 T5o = FMA(KP555570233, T5g, KP831469612 * T5n);
+			 T5p = T55 + T5o;
+			 T5R = T55 - T5o;
+			 T5y = T5u - T5x;
+			 T5B = T5z - T5A;
+			 T5C = T5y + T5B;
+			 T5Q = T5y - T5B;
+		    }
+		    {
+			 E T5q, T5G, T4l, T5r;
+			 T5q = T4M - T5p;
+			 T5G = T5C - T5F;
+			 T4l = W[40];
+			 T5r = W[41];
+			 Ip[WS(rs, 10)] = FNMS(T5r, T5G, T4l * T5q);
+			 Im[WS(rs, 10)] = FMA(T5r, T5q, T4l * T5G);
+		    }
+		    {
+			 E T5U, T5W, T5T, T5V;
+			 T5U = T5M + T5N;
+			 T5W = T5Q + T5R;
+			 T5T = W[24];
+			 T5V = W[25];
+			 Ip[WS(rs, 6)] = FNMS(T5V, T5W, T5T * T5U);
+			 Im[WS(rs, 6)] = FMA(T5T, T5W, T5V * T5U);
+		    }
+		    {
+			 E T5I, T5K, T5H, T5J;
+			 T5I = T4M + T5p;
+			 T5K = T5C + T5F;
+			 T5H = W[8];
+			 T5J = W[9];
+			 Ip[WS(rs, 2)] = FNMS(T5J, T5K, T5H * T5I);
+			 Im[WS(rs, 2)] = FMA(T5J, T5I, T5H * T5K);
+		    }
+		    {
+			 E T5O, T5S, T5L, T5P;
+			 T5O = T5M - T5N;
+			 T5S = T5Q - T5R;
+			 T5L = W[56];
+			 T5P = W[57];
+			 Ip[WS(rs, 14)] = FNMS(T5P, T5S, T5L * T5O);
+			 Im[WS(rs, 14)] = FMA(T5L, T5S, T5P * T5O);
+		    }
+	       }
+	       {
+		    E T6I, T7g, T79, T7h, T6X, T7l, T76, T7k;
+		    {
+			 E T6A, T6H, T77, T78;
+			 T6A = T6y - T6z;
+			 T6H = T6D + T6G;
+			 T6I = T6A - T6H;
+			 T7g = T6A + T6H;
+			 T77 = FNMS(KP555570233, T6S, KP831469612 * T6V);
+			 T78 = FMA(KP555570233, T6L, KP831469612 * T6O);
+			 T79 = T77 - T78;
+			 T7h = T77 + T78;
+		    }
+		    {
+			 E T6P, T6W, T72, T75;
+			 T6P = FNMS(KP555570233, T6O, KP831469612 * T6L);
+			 T6W = FMA(KP831469612, T6S, KP555570233 * T6V);
+			 T6X = T6P - T6W;
+			 T7l = T6W + T6P;
+			 T72 = T70 + T71;
+			 T75 = T73 - T74;
+			 T76 = T72 - T75;
+			 T7k = T72 + T75;
+		    }
+		    {
+			 E T6Y, T7a, T6x, T6Z;
+			 T6Y = T6I - T6X;
+			 T7a = T76 - T79;
+			 T6x = W[52];
+			 T6Z = W[53];
+			 Ip[WS(rs, 13)] = FNMS(T6Z, T7a, T6x * T6Y);
+			 Im[WS(rs, 13)] = FMA(T6Z, T6Y, T6x * T7a);
+		    }
+		    {
+			 E T7o, T7q, T7n, T7p;
+			 T7o = T7g + T7h;
+			 T7q = T7k + T7l;
+			 T7n = W[4];
+			 T7p = W[5];
+			 Ip[WS(rs, 1)] = FNMS(T7p, T7q, T7n * T7o);
+			 Im[WS(rs, 1)] = FMA(T7n, T7q, T7p * T7o);
+		    }
+		    {
+			 E T7c, T7e, T7b, T7d;
+			 T7c = T6I + T6X;
+			 T7e = T76 + T79;
+			 T7b = W[20];
+			 T7d = W[21];
+			 Ip[WS(rs, 5)] = FNMS(T7d, T7e, T7b * T7c);
+			 Im[WS(rs, 5)] = FMA(T7d, T7c, T7b * T7e);
+		    }
+		    {
+			 E T7i, T7m, T7f, T7j;
+			 T7i = T7g - T7h;
+			 T7m = T7k - T7l;
+			 T7f = W[36];
+			 T7j = W[37];
+			 Ip[WS(rs, 9)] = FNMS(T7j, T7m, T7f * T7i);
+			 Im[WS(rs, 9)] = FMA(T7f, T7m, T7j * T7i);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cb_32", twinstr, &GENUS, {340, 114, 94, 0} };
+
+void X(codelet_hc2cb_32) (planner *p) {
+     X(khc2c_register) (p, hc2cb_32, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include hc2cb.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 25 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Th, Ta, T7, Ti, T9;
+	       {
+		    E Tq, Td, T3, Tg, Tu, Tm, T6, Tp;
+		    {
+			 E Tk, T4, Tl, T5;
+			 {
+			      E Tb, Tc, T1, T2, Te, Tf;
+			      Tb = Ip[0];
+			      Tc = Im[WS(rs, 1)];
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 1)];
+			      Te = Ip[WS(rs, 1)];
+			      Tq = Tb + Tc;
+			      Td = Tb - Tc;
+			      Tf = Im[0];
+			      Tk = T1 - T2;
+			      T3 = T1 + T2;
+			      T4 = Rp[WS(rs, 1)];
+			      Tg = Te - Tf;
+			      Tl = Te + Tf;
+			      T5 = Rm[0];
+			 }
+			 Tu = Tk + Tl;
+			 Tm = Tk - Tl;
+			 T6 = T4 + T5;
+			 Tp = T4 - T5;
+		    }
+		    Rm[0] = Td + Tg;
+		    {
+			 E Tx, Tr, T8, Tn, Ts, To, Tj;
+			 Tj = W[0];
+			 Tx = Tq - Tp;
+			 Tr = Tp + Tq;
+			 Rp[0] = T3 + T6;
+			 T8 = T3 - T6;
+			 Tn = Tj * Tm;
+			 Ts = Tj * Tr;
+			 To = W[1];
+			 {
+			      E Tt, Tw, Ty, Tv;
+			      Tt = W[4];
+			      Tw = W[5];
+			      Th = Td - Tg;
+			      Im[0] = FMA(To, Tm, Ts);
+			      Ip[0] = FNMS(To, Tr, Tn);
+			      Ty = Tt * Tx;
+			      Tv = Tt * Tu;
+			      Ta = W[3];
+			      T7 = W[2];
+			      Im[WS(rs, 1)] = FMA(Tw, Tu, Ty);
+			      Ip[WS(rs, 1)] = FNMS(Tw, Tx, Tv);
+			      Ti = Ta * T8;
+			      T9 = T7 * T8;
+			 }
+		    }
+	       }
+	       Rm[WS(rs, 1)] = FMA(T7, Th, Ti);
+	       Rp[WS(rs, 1)] = FNMS(Ta, Th, T9);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hc2cb_4) (planner *p) {
+     X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include hc2cb.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T3, Ti, Tc, Tn, T6, Tm, Tf, Tj;
+	       {
+		    E T1, T2, Ta, Tb;
+		    T1 = Rp[0];
+		    T2 = Rm[WS(rs, 1)];
+		    T3 = T1 + T2;
+		    Ti = T1 - T2;
+		    Ta = Ip[0];
+		    Tb = Im[WS(rs, 1)];
+		    Tc = Ta - Tb;
+		    Tn = Ta + Tb;
+	       }
+	       {
+		    E T4, T5, Td, Te;
+		    T4 = Rp[WS(rs, 1)];
+		    T5 = Rm[0];
+		    T6 = T4 + T5;
+		    Tm = T4 - T5;
+		    Td = Ip[WS(rs, 1)];
+		    Te = Im[0];
+		    Tf = Td - Te;
+		    Tj = Td + Te;
+	       }
+	       Rp[0] = T3 + T6;
+	       Rm[0] = Tc + Tf;
+	       {
+		    E T8, Tg, T7, T9;
+		    T8 = T3 - T6;
+		    Tg = Tc - Tf;
+		    T7 = W[2];
+		    T9 = W[3];
+		    Rp[WS(rs, 1)] = FNMS(T9, Tg, T7 * T8);
+		    Rm[WS(rs, 1)] = FMA(T9, T8, T7 * Tg);
+	       }
+	       {
+		    E Tk, To, Th, Tl;
+		    Tk = Ti - Tj;
+		    To = Tm + Tn;
+		    Th = W[0];
+		    Tl = W[1];
+		    Ip[0] = FNMS(Tl, To, Th * Tk);
+		    Im[0] = FMA(Th, To, Tl * Tk);
+	       }
+	       {
+		    E Tq, Ts, Tp, Tr;
+		    Tq = Ti + Tj;
+		    Ts = Tn - Tm;
+		    Tp = W[4];
+		    Tr = W[5];
+		    Ip[WS(rs, 1)] = FNMS(Tr, Ts, Tp * Tq);
+		    Im[WS(rs, 1)] = FMA(Tp, Ts, Tr * Tq);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hc2cb_4) (planner *p) {
+     X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include hc2cb.h */
+
+/*
+ * This function contains 46 FP additions, 32 FP multiplications,
+ * (or, 24 additions, 10 multiplications, 22 fused multiply/add),
+ * 45 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E TK, TR, TB, TM, TL, TS;
+	       {
+		    E Td, TN, TO, TJ, Tn, Tk, TC, T3, Tr, T7, T8, T4, T5;
+		    {
+			 E TI, Tj, Tg, TH, Te, Tf, T1, T2;
+			 {
+			      E Tb, Tc, Th, Ti;
+			      Tb = Ip[0];
+			      Tc = Im[WS(rs, 2)];
+			      Th = Ip[WS(rs, 1)];
+			      Ti = Im[WS(rs, 1)];
+			      Te = Ip[WS(rs, 2)];
+			      Td = Tb - Tc;
+			      TN = Tb + Tc;
+			      Tf = Im[0];
+			      TI = Th + Ti;
+			      Tj = Th - Ti;
+			 }
+			 Tg = Te - Tf;
+			 TH = Te + Tf;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 2)];
+			 TO = TH - TI;
+			 TJ = TH + TI;
+			 Tn = Tj - Tg;
+			 Tk = Tg + Tj;
+			 TC = T1 - T2;
+			 T3 = T1 + T2;
+			 Tr = FNMS(KP500000000, Tk, Td);
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 1)];
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[0];
+		    }
+		    {
+			 E Tl, Tq, TQ, Ts, Ta, T10, TG;
+			 Rm[0] = Td + Tk;
+			 {
+			      E T9, TE, T6, TD, TF;
+			      T9 = T7 + T8;
+			      TE = T7 - T8;
+			      T6 = T4 + T5;
+			      TD = T4 - T5;
+			      Tl = W[2];
+			      Tq = W[3];
+			      TQ = TD - TE;
+			      TF = TD + TE;
+			      Ts = T6 - T9;
+			      Ta = T6 + T9;
+			      T10 = TC + TF;
+			      TG = FNMS(KP500000000, TF, TC);
+			 }
+			 {
+			      E T13, TP, Tz, TZ, Tw, T14, Tv, Ty;
+			      {
+				   E Tt, T12, T11, Tp, Tm, To, Tu;
+				   T13 = TN + TO;
+				   TP = FNMS(KP500000000, TO, TN);
+				   Rp[0] = T3 + Ta;
+				   Tm = FNMS(KP500000000, Ta, T3);
+				   Tz = FMA(KP866025403, Ts, Tr);
+				   Tt = FNMS(KP866025403, Ts, Tr);
+				   TZ = W[4];
+				   To = FNMS(KP866025403, Tn, Tm);
+				   Tw = FMA(KP866025403, Tn, Tm);
+				   Tu = Tl * Tt;
+				   T12 = W[5];
+				   T11 = TZ * T10;
+				   Tp = Tl * To;
+				   Rm[WS(rs, 1)] = FMA(Tq, To, Tu);
+				   T14 = T12 * T10;
+				   Ip[WS(rs, 1)] = FNMS(T12, T13, T11);
+				   Rp[WS(rs, 1)] = FNMS(Tq, Tt, Tp);
+			      }
+			      Im[WS(rs, 1)] = FMA(TZ, T13, T14);
+			      Tv = W[6];
+			      Ty = W[7];
+			      {
+				   E TX, TT, TW, TV, TY, TU, TA, Tx;
+				   TK = FNMS(KP866025403, TJ, TG);
+				   TU = FMA(KP866025403, TJ, TG);
+				   TA = Tv * Tz;
+				   Tx = Tv * Tw;
+				   TX = FNMS(KP866025403, TQ, TP);
+				   TR = FMA(KP866025403, TQ, TP);
+				   Rm[WS(rs, 2)] = FMA(Ty, Tw, TA);
+				   Rp[WS(rs, 2)] = FNMS(Ty, Tz, Tx);
+				   TT = W[8];
+				   TW = W[9];
+				   TB = W[0];
+				   TV = TT * TU;
+				   TY = TW * TU;
+				   TM = W[1];
+				   TL = TB * TK;
+				   Ip[WS(rs, 2)] = FNMS(TW, TX, TV);
+				   Im[WS(rs, 2)] = FMA(TT, TX, TY);
+			      }
+			 }
+		    }
+	       }
+	       Ip[0] = FNMS(TM, TR, TL);
+	       TS = TM * TK;
+	       Im[0] = FMA(TB, TR, TS);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, {24, 10, 22, 0} };
+
+void X(codelet_hc2cb_6) (planner *p) {
+     X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include hc2cb.h */
+
+/*
+ * This function contains 46 FP additions, 28 FP multiplications,
+ * (or, 32 additions, 14 multiplications, 14 fused multiply/add),
+ * 25 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T3, Ty, Td, TE, Ta, TO, Tr, TB, Tk, TL, Tn, TH;
+	       {
+		    E T1, T2, Tb, Tc;
+		    T1 = Rp[0];
+		    T2 = Rm[WS(rs, 2)];
+		    T3 = T1 + T2;
+		    Ty = T1 - T2;
+		    Tb = Ip[0];
+		    Tc = Im[WS(rs, 2)];
+		    Td = Tb - Tc;
+		    TE = Tb + Tc;
+	       }
+	       {
+		    E T6, Tz, T9, TA;
+		    {
+			 E T4, T5, T7, T8;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[0];
+			 T6 = T4 + T5;
+			 Tz = T4 - T5;
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 1)];
+			 T9 = T7 + T8;
+			 TA = T7 - T8;
+		    }
+		    Ta = T6 + T9;
+		    TO = KP866025403 * (Tz - TA);
+		    Tr = KP866025403 * (T6 - T9);
+		    TB = Tz + TA;
+	       }
+	       {
+		    E Tg, TG, Tj, TF;
+		    {
+			 E Te, Tf, Th, Ti;
+			 Te = Ip[WS(rs, 2)];
+			 Tf = Im[0];
+			 Tg = Te - Tf;
+			 TG = Te + Tf;
+			 Th = Ip[WS(rs, 1)];
+			 Ti = Im[WS(rs, 1)];
+			 Tj = Th - Ti;
+			 TF = Th + Ti;
+		    }
+		    Tk = Tg + Tj;
+		    TL = KP866025403 * (TG + TF);
+		    Tn = KP866025403 * (Tj - Tg);
+		    TH = TF - TG;
+	       }
+	       Rp[0] = T3 + Ta;
+	       Rm[0] = Td + Tk;
+	       {
+		    E TC, TI, Tx, TD;
+		    TC = Ty + TB;
+		    TI = TE - TH;
+		    Tx = W[4];
+		    TD = W[5];
+		    Ip[WS(rs, 1)] = FNMS(TD, TI, Tx * TC);
+		    Im[WS(rs, 1)] = FMA(TD, TC, Tx * TI);
+	       }
+	       {
+		    E To, Tu, Ts, Tw, Tm, Tq;
+		    Tm = FNMS(KP500000000, Ta, T3);
+		    To = Tm - Tn;
+		    Tu = Tm + Tn;
+		    Tq = FNMS(KP500000000, Tk, Td);
+		    Ts = Tq - Tr;
+		    Tw = Tr + Tq;
+		    {
+			 E Tl, Tp, Tt, Tv;
+			 Tl = W[2];
+			 Tp = W[3];
+			 Rp[WS(rs, 1)] = FNMS(Tp, Ts, Tl * To);
+			 Rm[WS(rs, 1)] = FMA(Tl, Ts, Tp * To);
+			 Tt = W[6];
+			 Tv = W[7];
+			 Rp[WS(rs, 2)] = FNMS(Tv, Tw, Tt * Tu);
+			 Rm[WS(rs, 2)] = FMA(Tt, Tw, Tv * Tu);
+		    }
+	       }
+	       {
+		    E TM, TS, TQ, TU, TK, TP;
+		    TK = FNMS(KP500000000, TB, Ty);
+		    TM = TK - TL;
+		    TS = TK + TL;
+		    TP = FMA(KP500000000, TH, TE);
+		    TQ = TO + TP;
+		    TU = TP - TO;
+		    {
+			 E TJ, TN, TR, TT;
+			 TJ = W[0];
+			 TN = W[1];
+			 Ip[0] = FNMS(TN, TQ, TJ * TM);
+			 Im[0] = FMA(TN, TM, TJ * TQ);
+			 TR = W[8];
+			 TT = W[9];
+			 Ip[WS(rs, 2)] = FNMS(TT, TU, TR * TS);
+			 Im[WS(rs, 2)] = FMA(TT, TS, TR * TU);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, {32, 14, 14, 0} };
+
+void X(codelet_hc2cb_6) (planner *p) {
+     X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cb_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include hc2cb.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 52 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tw, TH, Tf, Ty, Tx, TI;
+	       {
+		    E TV, TD, T1i, T7, T1b, T1n, TQ, Tk, Tp, TE, Te, T1o, T1e, T1j, Tu;
+		    E TF;
+		    {
+			 E T4, Tg, T3, T19, TC, T5, Th, Ti;
+			 {
+			      E T1, T2, TA, TB;
+			      T1 = Rp[0];
+			      T2 = Rm[WS(rs, 3)];
+			      TA = Ip[0];
+			      TB = Im[WS(rs, 3)];
+			      T4 = Rp[WS(rs, 2)];
+			      Tg = T1 - T2;
+			      T3 = T1 + T2;
+			      T19 = TA - TB;
+			      TC = TA + TB;
+			      T5 = Rm[WS(rs, 1)];
+			      Th = Ip[WS(rs, 2)];
+			      Ti = Im[WS(rs, 1)];
+			 }
+			 {
+			      E Tb, Tl, Ta, T1c, To, Tc, Tr, Ts;
+			      {
+				   E T8, T9, Tm, Tn;
+				   T8 = Rp[WS(rs, 1)];
+				   {
+					E Tz, T6, T1a, Tj;
+					Tz = T4 - T5;
+					T6 = T4 + T5;
+					T1a = Th - Ti;
+					Tj = Th + Ti;
+					TV = TC - Tz;
+					TD = Tz + TC;
+					T1i = T3 - T6;
+					T7 = T3 + T6;
+					T1b = T19 + T1a;
+					T1n = T19 - T1a;
+					TQ = Tg + Tj;
+					Tk = Tg - Tj;
+					T9 = Rm[WS(rs, 2)];
+				   }
+				   Tm = Ip[WS(rs, 1)];
+				   Tn = Im[WS(rs, 2)];
+				   Tb = Rm[0];
+				   Tl = T8 - T9;
+				   Ta = T8 + T9;
+				   T1c = Tm - Tn;
+				   To = Tm + Tn;
+				   Tc = Rp[WS(rs, 3)];
+				   Tr = Ip[WS(rs, 3)];
+				   Ts = Im[0];
+			      }
+			      {
+				   E Tq, Td, T1d, Tt;
+				   Tp = Tl - To;
+				   TE = Tl + To;
+				   Tq = Tb - Tc;
+				   Td = Tb + Tc;
+				   T1d = Tr - Ts;
+				   Tt = Tr + Ts;
+				   Te = Ta + Td;
+				   T1o = Ta - Td;
+				   T1e = T1c + T1d;
+				   T1j = T1d - T1c;
+				   Tu = Tq - Tt;
+				   TF = Tq + Tt;
+			      }
+			 }
+		    }
+		    {
+			 E TG, Tv, T10, T13, T1s, T1k, T1p, T1v, T1u, T1w, T1t, TR, TW;
+			 Rp[0] = T7 + Te;
+			 Rm[0] = T1b + T1e;
+			 TG = TE - TF;
+			 TR = TE + TF;
+			 TW = Tp - Tu;
+			 Tv = Tp + Tu;
+			 {
+			      E TP, TS, TX, TU, T1r, TT, TY;
+			      TP = W[4];
+			      T10 = FMA(KP707106781, TR, TQ);
+			      TS = FNMS(KP707106781, TR, TQ);
+			      TX = FMA(KP707106781, TW, TV);
+			      T13 = FNMS(KP707106781, TW, TV);
+			      TU = W[5];
+			      T1s = T1i + T1j;
+			      T1k = T1i - T1j;
+			      TT = TP * TS;
+			      TY = TP * TX;
+			      T1p = T1n - T1o;
+			      T1v = T1o + T1n;
+			      T1r = W[2];
+			      Ip[WS(rs, 1)] = FNMS(TU, TX, TT);
+			      Im[WS(rs, 1)] = FMA(TU, TS, TY);
+			      T1u = W[3];
+			      T1w = T1r * T1v;
+			      T1t = T1r * T1s;
+			 }
+			 {
+			      E T1f, T15, T18, T17, T1g, T1h, T1m;
+			      {
+				   E TZ, T12, T16, T14, T11;
+				   Rm[WS(rs, 1)] = FMA(T1u, T1s, T1w);
+				   Rp[WS(rs, 1)] = FNMS(T1u, T1v, T1t);
+				   TZ = W[12];
+				   T12 = W[13];
+				   T1f = T1b - T1e;
+				   T16 = T7 - Te;
+				   T14 = TZ * T13;
+				   T11 = TZ * T10;
+				   T15 = W[6];
+				   T18 = W[7];
+				   Im[WS(rs, 3)] = FMA(T12, T10, T14);
+				   Ip[WS(rs, 3)] = FNMS(T12, T13, T11);
+				   T17 = T15 * T16;
+				   T1g = T18 * T16;
+			      }
+			      Rp[WS(rs, 2)] = FNMS(T18, T1f, T17);
+			      Rm[WS(rs, 2)] = FMA(T15, T1f, T1g);
+			      T1h = W[10];
+			      T1m = W[11];
+			      {
+				   E TN, TJ, TM, TL, TO, TK, T1q, T1l;
+				   Tw = FNMS(KP707106781, Tv, Tk);
+				   TK = FMA(KP707106781, Tv, Tk);
+				   T1q = T1h * T1p;
+				   T1l = T1h * T1k;
+				   TN = FMA(KP707106781, TG, TD);
+				   TH = FNMS(KP707106781, TG, TD);
+				   Rm[WS(rs, 3)] = FMA(T1m, T1k, T1q);
+				   Rp[WS(rs, 3)] = FNMS(T1m, T1p, T1l);
+				   TJ = W[0];
+				   TM = W[1];
+				   Tf = W[8];
+				   TL = TJ * TK;
+				   TO = TM * TK;
+				   Ty = W[9];
+				   Tx = Tf * Tw;
+				   Ip[0] = FNMS(TM, TN, TL);
+				   Im[0] = FMA(TJ, TN, TO);
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 2)] = FNMS(Ty, TH, Tx);
+	       TI = Ty * Tw;
+	       Im[WS(rs, 2)] = FMA(Tf, TH, TI);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {44, 14, 22, 0} };
+
+void X(codelet_hc2cb_8) (planner *p) {
+     X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include hc2cb.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 30 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
+	       E TD;
+	       {
+		    E T3, TK, Tk, TX, T6, TW, Tn, TL;
+		    {
+			 E T1, T2, Ti, Tj;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 3)];
+			 T3 = T1 + T2;
+			 TK = T1 - T2;
+			 Ti = Ip[0];
+			 Tj = Im[WS(rs, 3)];
+			 Tk = Ti - Tj;
+			 TX = Ti + Tj;
+		    }
+		    {
+			 E T4, T5, Tl, Tm;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 1)];
+			 T6 = T4 + T5;
+			 TW = T4 - T5;
+			 Tl = Ip[WS(rs, 2)];
+			 Tm = Im[WS(rs, 1)];
+			 Tn = Tl - Tm;
+			 TL = Tl + Tm;
+		    }
+		    T7 = T3 + T6;
+		    T18 = TK + TL;
+		    T1c = TX - TW;
+		    To = Tk + Tn;
+		    Ty = T3 - T6;
+		    TM = TK - TL;
+		    TY = TW + TX;
+		    TC = Tk - Tn;
+	       }
+	       {
+		    E Ta, TN, Tr, TO, Td, TQ, Tu, TR;
+		    {
+			 E T8, T9, Tp, Tq;
+			 T8 = Rp[WS(rs, 1)];
+			 T9 = Rm[WS(rs, 2)];
+			 Ta = T8 + T9;
+			 TN = T8 - T9;
+			 Tp = Ip[WS(rs, 1)];
+			 Tq = Im[WS(rs, 2)];
+			 Tr = Tp - Tq;
+			 TO = Tp + Tq;
+		    }
+		    {
+			 E Tb, Tc, Ts, Tt;
+			 Tb = Rm[0];
+			 Tc = Rp[WS(rs, 3)];
+			 Td = Tb + Tc;
+			 TQ = Tb - Tc;
+			 Ts = Ip[WS(rs, 3)];
+			 Tt = Im[0];
+			 Tu = Ts - Tt;
+			 TR = Ts + Tt;
+		    }
+		    Te = Ta + Td;
+		    TZ = TN + TO;
+		    T10 = TQ + TR;
+		    Tv = Tr + Tu;
+		    Tz = Tu - Tr;
+		    TP = TN - TO;
+		    TS = TQ - TR;
+		    TD = Ta - Td;
+	       }
+	       Rp[0] = T7 + Te;
+	       Rm[0] = To + Tv;
+	       {
+		    E Tg, Tw, Tf, Th;
+		    Tg = T7 - Te;
+		    Tw = To - Tv;
+		    Tf = W[6];
+		    Th = W[7];
+		    Rp[WS(rs, 2)] = FNMS(Th, Tw, Tf * Tg);
+		    Rm[WS(rs, 2)] = FMA(Th, Tg, Tf * Tw);
+	       }
+	       {
+		    E TG, TI, TF, TH;
+		    TG = Ty + Tz;
+		    TI = TD + TC;
+		    TF = W[2];
+		    TH = W[3];
+		    Rp[WS(rs, 1)] = FNMS(TH, TI, TF * TG);
+		    Rm[WS(rs, 1)] = FMA(TF, TI, TH * TG);
+	       }
+	       {
+		    E TA, TE, Tx, TB;
+		    TA = Ty - Tz;
+		    TE = TC - TD;
+		    Tx = W[10];
+		    TB = W[11];
+		    Rp[WS(rs, 3)] = FNMS(TB, TE, Tx * TA);
+		    Rm[WS(rs, 3)] = FMA(Tx, TE, TB * TA);
+	       }
+	       {
+		    E T1a, T1g, T1e, T1i, T19, T1d;
+		    T19 = KP707106781 * (TZ + T10);
+		    T1a = T18 - T19;
+		    T1g = T18 + T19;
+		    T1d = KP707106781 * (TP - TS);
+		    T1e = T1c + T1d;
+		    T1i = T1c - T1d;
+		    {
+			 E T17, T1b, T1f, T1h;
+			 T17 = W[4];
+			 T1b = W[5];
+			 Ip[WS(rs, 1)] = FNMS(T1b, T1e, T17 * T1a);
+			 Im[WS(rs, 1)] = FMA(T17, T1e, T1b * T1a);
+			 T1f = W[12];
+			 T1h = W[13];
+			 Ip[WS(rs, 3)] = FNMS(T1h, T1i, T1f * T1g);
+			 Im[WS(rs, 3)] = FMA(T1f, T1i, T1h * T1g);
+		    }
+	       }
+	       {
+		    E TU, T14, T12, T16, TT, T11;
+		    TT = KP707106781 * (TP + TS);
+		    TU = TM - TT;
+		    T14 = TM + TT;
+		    T11 = KP707106781 * (TZ - T10);
+		    T12 = TY - T11;
+		    T16 = TY + T11;
+		    {
+			 E TJ, TV, T13, T15;
+			 TJ = W[8];
+			 TV = W[9];
+			 Ip[WS(rs, 2)] = FNMS(TV, T12, TJ * TU);
+			 Im[WS(rs, 2)] = FMA(TV, TU, TJ * T12);
+			 T13 = W[0];
+			 T15 = W[1];
+			 Ip[0] = FNMS(T15, T16, T13 * T14);
+			 Im[0] = FMA(T15, T14, T13 * T16);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {52, 18, 14, 0} };
+
+void X(codelet_hc2cb_8) (planner *p) {
+     X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include hc2cb.h */
+
+/*
+ * This function contains 206 FP additions, 100 FP multiplications,
+ * (or, 136 additions, 30 multiplications, 70 fused multiply/add),
+ * 97 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T3w, T3z, T2Y, T3D, T3x, T3m, T3u, T3C, T3y, T3o, T3k, T3E, T3A;
+	       {
+		    E T20, Tf, T3Q, T32, T3V, T3f, T2a, TN, T2f, T1m, T3G, T2G, T3L, T2T, T26;
+		    E T1F, T3M, T2N, T3H, T2W, T25, Tu, T1n, T1o, T3R, T3i, T2g, T1a, T21, T1y;
+		    E T3W, T39;
+		    {
+			 E T2R, T1B, T2S, T1E;
+			 {
+			      E T1e, T3, T1C, TA, Tx, T6, T1D, T1h, Td, T1A, TL, T1k, Ta, TC, TF;
+			      E T1z;
+			      {
+				   E T4, T5, T1f, T1g;
+				   {
+					E T1, T2, Ty, Tz;
+					T1 = Rp[0];
+					T2 = Rm[WS(rs, 7)];
+					Ty = Ip[0];
+					Tz = Im[WS(rs, 7)];
+					T4 = Rp[WS(rs, 4)];
+					T1e = T1 - T2;
+					T3 = T1 + T2;
+					T1C = Ty - Tz;
+					TA = Ty + Tz;
+					T5 = Rm[WS(rs, 3)];
+				   }
+				   T1f = Ip[WS(rs, 4)];
+				   T1g = Im[WS(rs, 3)];
+				   {
+					E Tb, Tc, TI, TJ;
+					Tb = Rm[WS(rs, 1)];
+					Tx = T4 - T5;
+					T6 = T4 + T5;
+					T1D = T1f - T1g;
+					T1h = T1f + T1g;
+					Tc = Rp[WS(rs, 6)];
+					TI = Im[WS(rs, 1)];
+					TJ = Ip[WS(rs, 6)];
+					{
+					     E T8, TH, TK, T9, TD, TE;
+					     T8 = Rp[WS(rs, 2)];
+					     Td = Tb + Tc;
+					     TH = Tb - Tc;
+					     T1A = TJ - TI;
+					     TK = TI + TJ;
+					     T9 = Rm[WS(rs, 5)];
+					     TD = Ip[WS(rs, 2)];
+					     TE = Im[WS(rs, 5)];
+					     TL = TH + TK;
+					     T1k = TH - TK;
+					     Ta = T8 + T9;
+					     TC = T8 - T9;
+					     TF = TD + TE;
+					     T1z = TD - TE;
+					}
+				   }
+			      }
+			      {
+				   E T2E, TB, T1l, T1i, T3d, T3e, TM, T2F;
+				   {
+					E T7, TG, Te, T30, T31, T1j;
+					T2E = T3 - T6;
+					T7 = T3 + T6;
+					T1j = TC - TF;
+					TG = TC + TF;
+					Te = Ta + Td;
+					T2R = Ta - Td;
+					TB = Tx + TA;
+					T30 = TA - Tx;
+					T31 = T1j - T1k;
+					T1l = T1j + T1k;
+					T1i = T1e - T1h;
+					T3d = T1e + T1h;
+					T20 = T7 - Te;
+					Tf = T7 + Te;
+					T3Q = FNMS(KP707106781, T31, T30);
+					T32 = FMA(KP707106781, T31, T30);
+					T3e = TG + TL;
+					TM = TG - TL;
+				   }
+				   T3V = FMA(KP707106781, T3e, T3d);
+				   T3f = FNMS(KP707106781, T3e, T3d);
+				   T2a = FNMS(KP707106781, TM, TB);
+				   TN = FMA(KP707106781, TM, TB);
+				   T2F = T1A - T1z;
+				   T1B = T1z + T1A;
+				   T2f = FNMS(KP707106781, T1l, T1i);
+				   T1m = FMA(KP707106781, T1l, T1i);
+				   T3G = T2E - T2F;
+				   T2G = T2E + T2F;
+				   T2S = T1C - T1D;
+				   T1E = T1C + T1D;
+			      }
+			 }
+			 {
+			      E T34, TS, T2H, Tm, T1u, T2I, T33, TX, Tq, T14, Tp, T1v, T12, Tr, T15;
+			      E T16;
+			      {
+				   E Tj, TT, Ti, T1s, TR, Tk, TU, TV;
+				   {
+					E Tg, Th, TP, TQ;
+					Tg = Rp[WS(rs, 1)];
+					T3L = T2S - T2R;
+					T2T = T2R + T2S;
+					T26 = T1E - T1B;
+					T1F = T1B + T1E;
+					Th = Rm[WS(rs, 6)];
+					TP = Ip[WS(rs, 1)];
+					TQ = Im[WS(rs, 6)];
+					Tj = Rp[WS(rs, 5)];
+					TT = Tg - Th;
+					Ti = Tg + Th;
+					T1s = TP - TQ;
+					TR = TP + TQ;
+					Tk = Rm[WS(rs, 2)];
+					TU = Ip[WS(rs, 5)];
+					TV = Im[WS(rs, 2)];
+				   }
+				   {
+					E Tn, To, T10, T11;
+					Tn = Rm[0];
+					{
+					     E TO, Tl, T1t, TW;
+					     TO = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T1t = TU - TV;
+					     TW = TU + TV;
+					     T34 = TR - TO;
+					     TS = TO + TR;
+					     T2H = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T1u = T1s + T1t;
+					     T2I = T1s - T1t;
+					     T33 = TT + TW;
+					     TX = TT - TW;
+					     To = Rp[WS(rs, 7)];
+					}
+					T10 = Im[0];
+					T11 = Ip[WS(rs, 7)];
+					Tq = Rp[WS(rs, 3)];
+					T14 = Tn - To;
+					Tp = Tn + To;
+					T1v = T11 - T10;
+					T12 = T10 + T11;
+					Tr = Rm[WS(rs, 4)];
+					T15 = Ip[WS(rs, 3)];
+					T16 = Im[WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E T13, T1x, T18, T35, T3g, T3h, T38, TY, T19;
+				   {
+					E T2U, T2J, T37, Tt, T36, T2V, T2M, T2K, T2L;
+					T2U = T2H + T2I;
+					T2J = T2H - T2I;
+					{
+					     E TZ, Ts, T1w, T17;
+					     TZ = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T1w = T15 - T16;
+					     T17 = T15 + T16;
+					     T37 = TZ + T12;
+					     T13 = TZ - T12;
+					     T2K = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T1x = T1v + T1w;
+					     T2L = T1v - T1w;
+					     T36 = T14 + T17;
+					     T18 = T14 - T17;
+					}
+					T2V = T2L - T2K;
+					T2M = T2K + T2L;
+					T3M = T2J - T2M;
+					T2N = T2J + T2M;
+					T3H = T2V - T2U;
+					T2W = T2U + T2V;
+					T35 = FMA(KP414213562, T34, T33);
+					T3g = FNMS(KP414213562, T33, T34);
+					T25 = Tm - Tt;
+					Tu = Tm + Tt;
+					T3h = FNMS(KP414213562, T36, T37);
+					T38 = FMA(KP414213562, T37, T36);
+				   }
+				   T1n = FNMS(KP414213562, TS, TX);
+				   TY = FMA(KP414213562, TX, TS);
+				   T19 = FNMS(KP414213562, T18, T13);
+				   T1o = FMA(KP414213562, T13, T18);
+				   T3R = T3h - T3g;
+				   T3i = T3g + T3h;
+				   T2g = T19 - TY;
+				   T1a = TY + T19;
+				   T21 = T1x - T1u;
+				   T1y = T1u + T1x;
+				   T3W = T35 + T38;
+				   T39 = T35 - T38;
+			      }
+			 }
+		    }
+		    {
+			 E T27, T22, T2c, T2u, T2x, T2h, T2s, T2A, T2w, T2B, T2v;
+			 {
+			      E T1K, Tv, T1G, T1N, T1Q, T1b, T2b, T1p, Tw, T1d;
+			      T1K = Tf - Tu;
+			      Tv = Tf + Tu;
+			      T1G = T1y + T1F;
+			      T1N = T1F - T1y;
+			      T1Q = FNMS(KP923879532, T1a, TN);
+			      T1b = FMA(KP923879532, T1a, TN);
+			      T2b = T1n - T1o;
+			      T1p = T1n + T1o;
+			      Tw = W[0];
+			      T1d = W[1];
+			      {
+				   E T1T, T1O, T1W, T1S, T1X, T1R;
+				   {
+					E T1J, T1M, T1L, T1V, T1P, T1q;
+					T1T = FNMS(KP923879532, T1p, T1m);
+					T1q = FMA(KP923879532, T1p, T1m);
+					{
+					     E T1c, T1I, T1H, T1r;
+					     T1c = Tw * T1b;
+					     T1J = W[14];
+					     T1H = Tw * T1q;
+					     T1r = FMA(T1d, T1q, T1c);
+					     T1M = W[15];
+					     T1L = T1J * T1K;
+					     T1I = FNMS(T1d, T1b, T1H);
+					     Rm[0] = Tv + T1r;
+					     Rp[0] = Tv - T1r;
+					     T1V = T1M * T1K;
+					     Im[0] = T1I - T1G;
+					     Ip[0] = T1G + T1I;
+					     T1P = W[16];
+					}
+					T1O = FNMS(T1M, T1N, T1L);
+					T1W = FMA(T1J, T1N, T1V);
+					T1S = W[17];
+					T1X = T1P * T1T;
+					T1R = T1P * T1Q;
+				   }
+				   {
+					E T2r, T2n, T2q, T2p, T2z, T2t, T2o, T1Y, T1U;
+					T27 = T25 + T26;
+					T2r = T26 - T25;
+					T2o = T20 - T21;
+					T22 = T20 + T21;
+					T1Y = FNMS(T1S, T1Q, T1X);
+					T1U = FMA(T1S, T1T, T1R);
+					T2n = W[22];
+					T2q = W[23];
+					Im[WS(rs, 4)] = T1Y - T1W;
+					Ip[WS(rs, 4)] = T1W + T1Y;
+					Rm[WS(rs, 4)] = T1O + T1U;
+					Rp[WS(rs, 4)] = T1O - T1U;
+					T2p = T2n * T2o;
+					T2z = T2q * T2o;
+					T2c = FMA(KP923879532, T2b, T2a);
+					T2u = FNMS(KP923879532, T2b, T2a);
+					T2x = FNMS(KP923879532, T2g, T2f);
+					T2h = FMA(KP923879532, T2g, T2f);
+					T2t = W[24];
+					T2s = FNMS(T2q, T2r, T2p);
+					T2A = FMA(T2n, T2r, T2z);
+					T2w = W[25];
+					T2B = T2t * T2x;
+					T2v = T2t * T2u;
+				   }
+			      }
+			 }
+			 {
+			      E T28, T2k, T2e, T2l, T2d;
+			      {
+				   E T1Z, T24, T23, T2j, T29, T2C, T2y;
+				   T2C = FNMS(T2w, T2u, T2B);
+				   T2y = FMA(T2w, T2x, T2v);
+				   T1Z = W[6];
+				   T24 = W[7];
+				   Im[WS(rs, 6)] = T2C - T2A;
+				   Ip[WS(rs, 6)] = T2A + T2C;
+				   Rm[WS(rs, 6)] = T2s + T2y;
+				   Rp[WS(rs, 6)] = T2s - T2y;
+				   T23 = T1Z * T22;
+				   T2j = T24 * T22;
+				   T29 = W[8];
+				   T28 = FNMS(T24, T27, T23);
+				   T2k = FMA(T1Z, T27, T2j);
+				   T2e = W[9];
+				   T2l = T29 * T2h;
+				   T2d = T29 * T2c;
+			      }
+			      {
+				   E T4a, T4d, T3O, T4h, T4b, T40, T48, T4g, T4c, T42, T3Y;
+				   {
+					E T3N, T47, T43, T46, T3F, T45, T4f, T3K, T3J, T3S, T3X, T3Z, T49, T41, T3T;
+					E T3U;
+					{
+					     E T44, T3I, T2m, T2i, T3P;
+					     T44 = FNMS(KP707106781, T3H, T3G);
+					     T3I = FMA(KP707106781, T3H, T3G);
+					     T2m = FNMS(T2e, T2c, T2l);
+					     T2i = FMA(T2e, T2h, T2d);
+					     T3N = FMA(KP707106781, T3M, T3L);
+					     T47 = FNMS(KP707106781, T3M, T3L);
+					     Im[WS(rs, 2)] = T2m - T2k;
+					     Ip[WS(rs, 2)] = T2k + T2m;
+					     Rm[WS(rs, 2)] = T28 + T2i;
+					     Rp[WS(rs, 2)] = T28 - T2i;
+					     T43 = W[26];
+					     T46 = W[27];
+					     T3F = W[10];
+					     T45 = T43 * T44;
+					     T4f = T46 * T44;
+					     T3K = W[11];
+					     T3J = T3F * T3I;
+					     T4a = FNMS(KP923879532, T3R, T3Q);
+					     T3S = FMA(KP923879532, T3R, T3Q);
+					     T3X = FNMS(KP923879532, T3W, T3V);
+					     T4d = FMA(KP923879532, T3W, T3V);
+					     T3Z = T3K * T3I;
+					     T3P = W[12];
+					     T49 = W[28];
+					     T41 = T3P * T3X;
+					     T3T = T3P * T3S;
+					}
+					T3O = FNMS(T3K, T3N, T3J);
+					T4h = T49 * T4d;
+					T4b = T49 * T4a;
+					T40 = FMA(T3F, T3N, T3Z);
+					T3U = W[13];
+					T48 = FNMS(T46, T47, T45);
+					T4g = FMA(T43, T47, T4f);
+					T4c = W[29];
+					T42 = FNMS(T3U, T3S, T41);
+					T3Y = FMA(T3U, T3X, T3T);
+				   }
+				   {
+					E T3t, T2X, T3p, T3s, T2D, T3r, T3B, T2Q, T2P, T3a, T3j, T3l, T3v, T3n, T3b;
+					E T3c;
+					{
+					     E T2O, T3q, T4i, T4e, T2Z;
+					     T4i = FNMS(T4c, T4a, T4h);
+					     T4e = FMA(T4c, T4d, T4b);
+					     Im[WS(rs, 3)] = T42 - T40;
+					     Ip[WS(rs, 3)] = T40 + T42;
+					     Rm[WS(rs, 3)] = T3O + T3Y;
+					     Rp[WS(rs, 3)] = T3O - T3Y;
+					     Im[WS(rs, 7)] = T4i - T4g;
+					     Ip[WS(rs, 7)] = T4g + T4i;
+					     Rm[WS(rs, 7)] = T48 + T4e;
+					     Rp[WS(rs, 7)] = T48 - T4e;
+					     T3t = FNMS(KP707106781, T2W, T2T);
+					     T2X = FMA(KP707106781, T2W, T2T);
+					     T2O = FMA(KP707106781, T2N, T2G);
+					     T3q = FNMS(KP707106781, T2N, T2G);
+					     T3p = W[18];
+					     T3s = W[19];
+					     T2D = W[2];
+					     T3r = T3p * T3q;
+					     T3B = T3s * T3q;
+					     T2Q = W[3];
+					     T2P = T2D * T2O;
+					     T3a = FMA(KP923879532, T39, T32);
+					     T3w = FNMS(KP923879532, T39, T32);
+					     T3z = FMA(KP923879532, T3i, T3f);
+					     T3j = FNMS(KP923879532, T3i, T3f);
+					     T3l = T2Q * T2O;
+					     T2Z = W[4];
+					     T3v = W[20];
+					     T3n = T2Z * T3j;
+					     T3b = T2Z * T3a;
+					}
+					T2Y = FNMS(T2Q, T2X, T2P);
+					T3D = T3v * T3z;
+					T3x = T3v * T3w;
+					T3m = FMA(T2D, T2X, T3l);
+					T3c = W[5];
+					T3u = FNMS(T3s, T3t, T3r);
+					T3C = FMA(T3p, T3t, T3B);
+					T3y = W[21];
+					T3o = FNMS(T3c, T3a, T3n);
+					T3k = FMA(T3c, T3j, T3b);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T3E = FNMS(T3y, T3w, T3D);
+	       T3A = FMA(T3y, T3z, T3x);
+	       Im[WS(rs, 1)] = T3o - T3m;
+	       Ip[WS(rs, 1)] = T3m + T3o;
+	       Rm[WS(rs, 1)] = T2Y + T3k;
+	       Rp[WS(rs, 1)] = T2Y - T3k;
+	       Im[WS(rs, 5)] = T3E - T3C;
+	       Ip[WS(rs, 5)] = T3C + T3E;
+	       Rm[WS(rs, 5)] = T3u + T3A;
+	       Rp[WS(rs, 5)] = T3u - T3A;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, {136, 30, 70, 0} };
+
+void X(codelet_hc2cbdft2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include hc2cb.h */
+
+/*
+ * This function contains 206 FP additions, 84 FP multiplications,
+ * (or, 168 additions, 46 multiplications, 38 fused multiply/add),
+ * 60 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
+	       E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
+	       E T1x, T1V;
+	       {
+		    E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
+		    E T1z;
+		    {
+			 E T1, T2, Ty, Tz;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 7)];
+			 T3 = T1 + T2;
+			 T1j = T1 - T2;
+			 Ty = Ip[0];
+			 Tz = Im[WS(rs, 7)];
+			 TA = Ty + Tz;
+			 T1B = Ty - Tz;
+		    }
+		    {
+			 E T4, T5, T1k, T1l;
+			 T4 = Rp[WS(rs, 4)];
+			 T5 = Rm[WS(rs, 3)];
+			 T6 = T4 + T5;
+			 Tx = T4 - T5;
+			 T1k = Ip[WS(rs, 4)];
+			 T1l = Im[WS(rs, 3)];
+			 T1m = T1k + T1l;
+			 T1C = T1k - T1l;
+		    }
+		    {
+			 E T8, T9, TD, TE;
+			 T8 = Rp[WS(rs, 2)];
+			 T9 = Rm[WS(rs, 5)];
+			 Ta = T8 + T9;
+			 TC = T8 - T9;
+			 TD = Ip[WS(rs, 2)];
+			 TE = Im[WS(rs, 5)];
+			 TF = TD + TE;
+			 T1y = TD - TE;
+		    }
+		    {
+			 E Tb, Tc, TI, TJ;
+			 Tb = Rm[WS(rs, 1)];
+			 Tc = Rp[WS(rs, 6)];
+			 Td = Tb + Tc;
+			 TH = Tb - Tc;
+			 TI = Im[WS(rs, 1)];
+			 TJ = Ip[WS(rs, 6)];
+			 TK = TI + TJ;
+			 T1z = TJ - TI;
+		    }
+		    {
+			 E T7, Te, TG, TL;
+			 TB = Tx + TA;
+			 T2L = TA - Tx;
+			 T30 = T1j + T1m;
+			 T1n = T1j - T1m;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T1U = T7 - Te;
+			 {
+			      E T2F, T2G, T1A, T1D;
+			      T2F = Ta - Td;
+			      T2G = T1B - T1C;
+			      T2H = T2F + T2G;
+			      T3p = T2G - T2F;
+			      T1A = T1y + T1z;
+			      T1D = T1B + T1C;
+			      T1E = T1A + T1D;
+			      T1Z = T1D - T1A;
+			 }
+			 TG = TC + TF;
+			 TL = TH + TK;
+			 TM = KP707106781 * (TG - TL);
+			 T31 = KP707106781 * (TG + TL);
+			 {
+			      E T2q, T2r, T1g, T1h;
+			      T2q = T3 - T6;
+			      T2r = T1z - T1y;
+			      T2s = T2q + T2r;
+			      T3k = T2q - T2r;
+			      T1g = TC - TF;
+			      T1h = TH - TK;
+			      T1i = KP707106781 * (T1g + T1h);
+			      T2M = KP707106781 * (T1g - T1h);
+			 }
+		    }
+	       }
+	       {
+		    E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
+		    E T1v;
+		    {
+			 E Tg, Th, TP, TQ;
+			 Tg = Rp[WS(rs, 1)];
+			 Th = Rm[WS(rs, 6)];
+			 Ti = Tg + Th;
+			 TT = Tg - Th;
+			 TP = Ip[WS(rs, 1)];
+			 TQ = Im[WS(rs, 6)];
+			 TR = TP + TQ;
+			 T1r = TP - TQ;
+		    }
+		    {
+			 E Tj, Tk, TU, TV;
+			 Tj = Rp[WS(rs, 5)];
+			 Tk = Rm[WS(rs, 2)];
+			 Tl = Tj + Tk;
+			 TO = Tj - Tk;
+			 TU = Ip[WS(rs, 5)];
+			 TV = Im[WS(rs, 2)];
+			 TW = TU + TV;
+			 T1s = TU - TV;
+		    }
+		    {
+			 E Tn, To, T10, T11;
+			 Tn = Rm[0];
+			 To = Rp[WS(rs, 7)];
+			 Tp = Tn + To;
+			 T14 = Tn - To;
+			 T10 = Im[0];
+			 T11 = Ip[WS(rs, 7)];
+			 T12 = T10 + T11;
+			 T1u = T11 - T10;
+		    }
+		    {
+			 E Tq, Tr, T15, T16;
+			 Tq = Rp[WS(rs, 3)];
+			 Tr = Rm[WS(rs, 4)];
+			 Ts = Tq + Tr;
+			 TZ = Tq - Tr;
+			 T15 = Ip[WS(rs, 3)];
+			 T16 = Im[WS(rs, 4)];
+			 T17 = T15 + T16;
+			 T1v = T15 - T16;
+		    }
+		    {
+			 E Tm, Tt, T2O, T2P;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T1Y = Tm - Tt;
+			 T2O = TR - TO;
+			 T2P = TT + TW;
+			 T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
+			 T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
+		    }
+		    {
+			 E T2R, T2S, TS, TX;
+			 T2R = TZ + T12;
+			 T2S = T14 + T17;
+			 T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
+			 T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
+			 TS = TO + TR;
+			 TX = TT - TW;
+			 TY = FMA(KP923879532, TS, KP382683432 * TX);
+			 T1d = FNMS(KP382683432, TS, KP923879532 * TX);
+		    }
+		    {
+			 E T13, T18, T2t, T2u;
+			 T13 = TZ - T12;
+			 T18 = T14 - T17;
+			 T19 = FNMS(KP382683432, T18, KP923879532 * T13);
+			 T1e = FMA(KP382683432, T13, KP923879532 * T18);
+			 T2t = Ti - Tl;
+			 T2u = T1r - T1s;
+			 T2v = T2t - T2u;
+			 T2C = T2t + T2u;
+		    }
+		    {
+			 E T2w, T2x, T1t, T1w;
+			 T2w = Tp - Ts;
+			 T2x = T1u - T1v;
+			 T2y = T2w + T2x;
+			 T2D = T2x - T2w;
+			 T1t = T1r + T1s;
+			 T1w = T1u + T1v;
+			 T1x = T1t + T1w;
+			 T1V = T1w - T1t;
+		    }
+	       }
+	       {
+		    E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
+		    Tv = Tf + Tu;
+		    T1F = T1x + T1E;
+		    {
+			 E TN, T1a, T1f, T1o;
+			 TN = TB + TM;
+			 T1a = TY + T19;
+			 T1b = TN + T1a;
+			 T1N = TN - T1a;
+			 T1f = T1d + T1e;
+			 T1o = T1i + T1n;
+			 T1p = T1f + T1o;
+			 T1P = T1o - T1f;
+			 {
+			      E T1I, T1K, T1H, T1J;
+			      T1I = Tf - Tu;
+			      T1K = T1E - T1x;
+			      T1H = W[14];
+			      T1J = W[15];
+			      T1L = FNMS(T1J, T1K, T1H * T1I);
+			      T1R = FMA(T1J, T1I, T1H * T1K);
+			 }
+		    }
+		    {
+			 E T1q, T1G, Tw, T1c;
+			 Tw = W[0];
+			 T1c = W[1];
+			 T1q = FMA(Tw, T1b, T1c * T1p);
+			 T1G = FNMS(T1c, T1b, Tw * T1p);
+			 Rp[0] = Tv - T1q;
+			 Ip[0] = T1F + T1G;
+			 Rm[0] = Tv + T1q;
+			 Im[0] = T1G - T1F;
+		    }
+		    {
+			 E T1Q, T1S, T1M, T1O;
+			 T1M = W[16];
+			 T1O = W[17];
+			 T1Q = FMA(T1M, T1N, T1O * T1P);
+			 T1S = FNMS(T1O, T1N, T1M * T1P);
+			 Rp[WS(rs, 4)] = T1L - T1Q;
+			 Ip[WS(rs, 4)] = T1R + T1S;
+			 Rm[WS(rs, 4)] = T1L + T1Q;
+			 Im[WS(rs, 4)] = T1S - T1R;
+		    }
+	       }
+	       {
+		    E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
+		    {
+			 E T23, T24, T27, T28;
+			 T23 = TB - TM;
+			 T24 = T1d - T1e;
+			 T25 = T23 + T24;
+			 T2j = T23 - T24;
+			 T27 = T19 - TY;
+			 T28 = T1n - T1i;
+			 T29 = T27 + T28;
+			 T2l = T28 - T27;
+		    }
+		    {
+			 E T1W, T20, T1T, T1X;
+			 T1W = T1U + T1V;
+			 T20 = T1Y + T1Z;
+			 T1T = W[6];
+			 T1X = W[7];
+			 T21 = FNMS(T1X, T20, T1T * T1W);
+			 T2b = FMA(T1X, T1W, T1T * T20);
+		    }
+		    {
+			 E T2e, T2g, T2d, T2f;
+			 T2e = T1U - T1V;
+			 T2g = T1Z - T1Y;
+			 T2d = W[22];
+			 T2f = W[23];
+			 T2h = FNMS(T2f, T2g, T2d * T2e);
+			 T2n = FMA(T2f, T2e, T2d * T2g);
+		    }
+		    {
+			 E T2a, T2c, T22, T26;
+			 T22 = W[8];
+			 T26 = W[9];
+			 T2a = FMA(T22, T25, T26 * T29);
+			 T2c = FNMS(T26, T25, T22 * T29);
+			 Rp[WS(rs, 2)] = T21 - T2a;
+			 Ip[WS(rs, 2)] = T2b + T2c;
+			 Rm[WS(rs, 2)] = T21 + T2a;
+			 Im[WS(rs, 2)] = T2c - T2b;
+		    }
+		    {
+			 E T2m, T2o, T2i, T2k;
+			 T2i = W[24];
+			 T2k = W[25];
+			 T2m = FMA(T2i, T2j, T2k * T2l);
+			 T2o = FNMS(T2k, T2j, T2i * T2l);
+			 Rp[WS(rs, 6)] = T2h - T2m;
+			 Ip[WS(rs, 6)] = T2n + T2o;
+			 Rm[WS(rs, 6)] = T2h + T2m;
+			 Im[WS(rs, 6)] = T2o - T2n;
+		    }
+	       }
+	       {
+		    E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
+		    T2z = KP707106781 * (T2v + T2y);
+		    T2A = T2s + T2z;
+		    T38 = T2s - T2z;
+		    T2E = KP707106781 * (T2C + T2D);
+		    T2I = T2E + T2H;
+		    T3a = T2H - T2E;
+		    {
+			 E T2N, T2U, T2Z, T32;
+			 T2N = T2L + T2M;
+			 T2U = T2Q - T2T;
+			 T2V = T2N + T2U;
+			 T3d = T2N - T2U;
+			 T2Z = T2X + T2Y;
+			 T32 = T30 - T31;
+			 T33 = T2Z + T32;
+			 T3f = T32 - T2Z;
+		    }
+		    {
+			 E T2J, T35, T34, T36;
+			 {
+			      E T2p, T2B, T2K, T2W;
+			      T2p = W[2];
+			      T2B = W[3];
+			      T2J = FNMS(T2B, T2I, T2p * T2A);
+			      T35 = FMA(T2B, T2A, T2p * T2I);
+			      T2K = W[4];
+			      T2W = W[5];
+			      T34 = FMA(T2K, T2V, T2W * T33);
+			      T36 = FNMS(T2W, T2V, T2K * T33);
+			 }
+			 Rp[WS(rs, 1)] = T2J - T34;
+			 Ip[WS(rs, 1)] = T35 + T36;
+			 Rm[WS(rs, 1)] = T2J + T34;
+			 Im[WS(rs, 1)] = T36 - T35;
+		    }
+		    {
+			 E T3b, T3h, T3g, T3i;
+			 {
+			      E T37, T39, T3c, T3e;
+			      T37 = W[18];
+			      T39 = W[19];
+			      T3b = FNMS(T39, T3a, T37 * T38);
+			      T3h = FMA(T39, T38, T37 * T3a);
+			      T3c = W[20];
+			      T3e = W[21];
+			      T3g = FMA(T3c, T3d, T3e * T3f);
+			      T3i = FNMS(T3e, T3d, T3c * T3f);
+			 }
+			 Rp[WS(rs, 5)] = T3b - T3g;
+			 Ip[WS(rs, 5)] = T3h + T3i;
+			 Rm[WS(rs, 5)] = T3b + T3g;
+			 Im[WS(rs, 5)] = T3i - T3h;
+		    }
+	       }
+	       {
+		    E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
+		    T3l = KP707106781 * (T2D - T2C);
+		    T3m = T3k + T3l;
+		    T3E = T3k - T3l;
+		    T3o = KP707106781 * (T2v - T2y);
+		    T3q = T3o + T3p;
+		    T3G = T3p - T3o;
+		    {
+			 E T3t, T3u, T3x, T3y;
+			 T3t = T2L - T2M;
+			 T3u = T2X - T2Y;
+			 T3v = T3t + T3u;
+			 T3J = T3t - T3u;
+			 T3x = T31 + T30;
+			 T3y = T2Q + T2T;
+			 T3z = T3x - T3y;
+			 T3L = T3y + T3x;
+		    }
+		    {
+			 E T3r, T3B, T3A, T3C;
+			 {
+			      E T3j, T3n, T3s, T3w;
+			      T3j = W[10];
+			      T3n = W[11];
+			      T3r = FNMS(T3n, T3q, T3j * T3m);
+			      T3B = FMA(T3n, T3m, T3j * T3q);
+			      T3s = W[12];
+			      T3w = W[13];
+			      T3A = FMA(T3s, T3v, T3w * T3z);
+			      T3C = FNMS(T3w, T3v, T3s * T3z);
+			 }
+			 Rp[WS(rs, 3)] = T3r - T3A;
+			 Ip[WS(rs, 3)] = T3B + T3C;
+			 Rm[WS(rs, 3)] = T3r + T3A;
+			 Im[WS(rs, 3)] = T3C - T3B;
+		    }
+		    {
+			 E T3H, T3N, T3M, T3O;
+			 {
+			      E T3D, T3F, T3I, T3K;
+			      T3D = W[26];
+			      T3F = W[27];
+			      T3H = FNMS(T3F, T3G, T3D * T3E);
+			      T3N = FMA(T3F, T3E, T3D * T3G);
+			      T3I = W[28];
+			      T3K = W[29];
+			      T3M = FMA(T3I, T3J, T3K * T3L);
+			      T3O = FNMS(T3K, T3J, T3I * T3L);
+			 }
+			 Rp[WS(rs, 7)] = T3H - T3M;
+			 Ip[WS(rs, 7)] = T3N + T3O;
+			 Rm[WS(rs, 7)] = T3H + T3M;
+			 Im[WS(rs, 7)] = T3O - T3N;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, {168, 46, 38, 0} };
+
+void X(codelet_hc2cbdft2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1135 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include hc2cb.h */
+
+/*
+ * This function contains 286 FP additions, 148 FP multiplications,
+ * (or, 176 additions, 38 multiplications, 110 fused multiply/add),
+ * 122 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T5s, T5v, T5t, T5z, T5q, T5y, T5u, T5A, T5w;
+	       {
+		    E T3T, T27, T2o, T41, T2p, T40, TU, T15, T2Q, T1N, T2L, T1w, T59, T4n, T5e;
+		    E T4A, T2m, T24, T2Z, T2h, T4J, T3P, T3Y, T3W, T2d, TJ, T3H, T2c, TD, T52;
+		    E T3G, T1E, T4f, T5I, T4e, T4w, T5L, T4v, T1J, T1H;
+		    {
+			 E T1A, T3, T25, TI, TF, T6, T26, T1D, TO, T47, T3z, Te, T1S, T3M, T1e;
+			 E T4k, TZ, T4a, T3C, Tt, T1Z, T3J, T1p, T4h, T14, T4b, T3D, TA, T22, T3K;
+			 E T1u, T4i, Ti, T1f, Th, T1T, TS, Tj, T1g, T1h;
+			 {
+			      E T4, T5, T1B, T1C;
+			      {
+				   E T1, T2, TG, TH;
+				   T1 = Rp[0];
+				   T2 = Rm[WS(rs, 9)];
+				   TG = Ip[0];
+				   TH = Im[WS(rs, 9)];
+				   T4 = Rp[WS(rs, 5)];
+				   T1A = T1 - T2;
+				   T3 = T1 + T2;
+				   T25 = TG - TH;
+				   TI = TG + TH;
+				   T5 = Rm[WS(rs, 4)];
+				   T1B = Ip[WS(rs, 5)];
+				   T1C = Im[WS(rs, 4)];
+			      }
+			      {
+				   E Tq, T1l, Tp, T1X, TY, Tr, T1m, T1n;
+				   {
+					E Tb, T1a, Ta, T1Q, TN, Tc, T1b, T1c;
+					{
+					     E T8, T9, TL, TM;
+					     T8 = Rp[WS(rs, 4)];
+					     TF = T4 - T5;
+					     T6 = T4 + T5;
+					     T26 = T1B - T1C;
+					     T1D = T1B + T1C;
+					     T9 = Rm[WS(rs, 5)];
+					     TL = Ip[WS(rs, 4)];
+					     TM = Im[WS(rs, 5)];
+					     Tb = Rp[WS(rs, 9)];
+					     T1a = T8 - T9;
+					     Ta = T8 + T9;
+					     T1Q = TL - TM;
+					     TN = TL + TM;
+					     Tc = Rm[0];
+					     T1b = Ip[WS(rs, 9)];
+					     T1c = Im[0];
+					}
+					{
+					     E Tn, To, TW, TX;
+					     Tn = Rp[WS(rs, 8)];
+					     {
+						  E TK, Td, T1R, T1d;
+						  TK = Tb - Tc;
+						  Td = Tb + Tc;
+						  T1R = T1b - T1c;
+						  T1d = T1b + T1c;
+						  TO = TK + TN;
+						  T47 = TN - TK;
+						  T3z = Ta - Td;
+						  Te = Ta + Td;
+						  T1S = T1Q + T1R;
+						  T3M = T1Q - T1R;
+						  T1e = T1a - T1d;
+						  T4k = T1a + T1d;
+						  To = Rm[WS(rs, 1)];
+					     }
+					     TW = Ip[WS(rs, 8)];
+					     TX = Im[WS(rs, 1)];
+					     Tq = Rm[WS(rs, 6)];
+					     T1l = Tn - To;
+					     Tp = Tn + To;
+					     T1X = TW - TX;
+					     TY = TW + TX;
+					     Tr = Rp[WS(rs, 3)];
+					     T1m = Im[WS(rs, 6)];
+					     T1n = Ip[WS(rs, 3)];
+					}
+				   }
+				   {
+					E Tx, T1q, Tw, T20, T13, Ty, T1r, T1s;
+					{
+					     E Tu, Tv, T11, T12;
+					     Tu = Rm[WS(rs, 7)];
+					     {
+						  E TV, Ts, T1Y, T1o;
+						  TV = Tq - Tr;
+						  Ts = Tq + Tr;
+						  T1Y = T1n - T1m;
+						  T1o = T1m + T1n;
+						  TZ = TV + TY;
+						  T4a = TY - TV;
+						  T3C = Tp - Ts;
+						  Tt = Tp + Ts;
+						  T1Z = T1X + T1Y;
+						  T3J = T1X - T1Y;
+						  T1p = T1l + T1o;
+						  T4h = T1l - T1o;
+						  Tv = Rp[WS(rs, 2)];
+					     }
+					     T11 = Im[WS(rs, 7)];
+					     T12 = Ip[WS(rs, 2)];
+					     Tx = Rm[WS(rs, 2)];
+					     T1q = Tu - Tv;
+					     Tw = Tu + Tv;
+					     T20 = T12 - T11;
+					     T13 = T11 + T12;
+					     Ty = Rp[WS(rs, 7)];
+					     T1r = Im[WS(rs, 2)];
+					     T1s = Ip[WS(rs, 7)];
+					}
+					{
+					     E Tf, Tg, TQ, TR;
+					     Tf = Rm[WS(rs, 3)];
+					     {
+						  E T10, Tz, T21, T1t;
+						  T10 = Tx - Ty;
+						  Tz = Tx + Ty;
+						  T21 = T1s - T1r;
+						  T1t = T1r + T1s;
+						  T14 = T10 - T13;
+						  T4b = T10 + T13;
+						  T3D = Tw - Tz;
+						  TA = Tw + Tz;
+						  T22 = T20 + T21;
+						  T3K = T20 - T21;
+						  T1u = T1q + T1t;
+						  T4i = T1q - T1t;
+						  Tg = Rp[WS(rs, 6)];
+					     }
+					     TQ = Im[WS(rs, 3)];
+					     TR = Ip[WS(rs, 6)];
+					     Ti = Rp[WS(rs, 1)];
+					     T1f = Tf - Tg;
+					     Th = Tf + Tg;
+					     T1T = TR - TQ;
+					     TS = TQ + TR;
+					     Tj = Rm[WS(rs, 8)];
+					     T1g = Ip[WS(rs, 1)];
+					     T1h = Im[WS(rs, 8)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T1V, T3N, TB, T3B, Tm, T3E, T1F, T1G, T4t, T4j, T4m, T4s, T4c, T4y, T4z;
+			      E T49, T3y, T7;
+			      {
+				   E TT, T48, T1j, T4l, T3A, Tl;
+				   T3T = T25 - T26;
+				   T27 = T25 + T26;
+				   {
+					E TP, Tk, T1U, T1i;
+					TP = Ti - Tj;
+					Tk = Ti + Tj;
+					T1U = T1g - T1h;
+					T1i = T1g + T1h;
+					TT = TP - TS;
+					T48 = TP + TS;
+					T3A = Th - Tk;
+					Tl = Th + Tk;
+					T1V = T1T + T1U;
+					T3N = T1T - T1U;
+					T1j = T1f - T1i;
+					T4l = T1f + T1i;
+					T2o = Tt - TA;
+					TB = Tt + TA;
+				   }
+				   T41 = T3z - T3A;
+				   T3B = T3z + T3A;
+				   Tm = Te + Tl;
+				   T2p = Te - Tl;
+				   {
+					E T1L, T1M, T1k, T1v;
+					T40 = T3C - T3D;
+					T3E = T3C + T3D;
+					TU = TO + TT;
+					T1L = TO - TT;
+					T1M = TZ - T14;
+					T15 = TZ + T14;
+					T1F = T1e + T1j;
+					T1k = T1e - T1j;
+					T1v = T1p - T1u;
+					T1G = T1p + T1u;
+					T4t = T4h + T4i;
+					T4j = T4h - T4i;
+					T2Q = FNMS(KP618033988, T1L, T1M);
+					T1N = FMA(KP618033988, T1M, T1L);
+					T2L = FNMS(KP618033988, T1k, T1v);
+					T1w = FMA(KP618033988, T1v, T1k);
+					T4m = T4k - T4l;
+					T4s = T4k + T4l;
+					T4c = T4a - T4b;
+					T4y = T4a + T4b;
+					T4z = T47 + T48;
+					T49 = T47 - T48;
+				   }
+			      }
+			      {
+				   E T2g, T1W, T23, T2f;
+				   T2g = T1S - T1V;
+				   T1W = T1S + T1V;
+				   T59 = FMA(KP618033988, T4j, T4m);
+				   T4n = FNMS(KP618033988, T4m, T4j);
+				   T5e = FMA(KP618033988, T4y, T4z);
+				   T4A = FNMS(KP618033988, T4z, T4y);
+				   T23 = T1Z + T22;
+				   T2f = T1Z - T22;
+				   {
+					E T3V, T3L, T3O, T3U;
+					T3V = T3J + T3K;
+					T3L = T3J - T3K;
+					T2m = T1W - T23;
+					T24 = T1W + T23;
+					T2Z = FMA(KP618033988, T2f, T2g);
+					T2h = FNMS(KP618033988, T2g, T2f);
+					T3O = T3M - T3N;
+					T3U = T3M + T3N;
+					T3y = T3 - T6;
+					T7 = T3 + T6;
+					T4J = FMA(KP618033988, T3L, T3O);
+					T3P = FNMS(KP618033988, T3O, T3L);
+					T3Y = T3U - T3V;
+					T3W = T3U + T3V;
+				   }
+			      }
+			      {
+				   E T46, TC, T3F, T4r, T4d, T4u;
+				   TC = Tm + TB;
+				   T2d = Tm - TB;
+				   TJ = TF + TI;
+				   T46 = TI - TF;
+				   T3H = T3B - T3E;
+				   T3F = T3B + T3E;
+				   T2c = FNMS(KP250000000, TC, T7);
+				   TD = T7 + TC;
+				   T52 = T3y + T3F;
+				   T3G = FNMS(KP250000000, T3F, T3y);
+				   T4r = T1A + T1D;
+				   T1E = T1A - T1D;
+				   T4f = T49 - T4c;
+				   T4d = T49 + T4c;
+				   T5I = T46 + T4d;
+				   T4e = FNMS(KP250000000, T4d, T46);
+				   T4w = T4s - T4t;
+				   T4u = T4s + T4t;
+				   T5L = T4u + T4r;
+				   T4v = FNMS(KP250000000, T4u, T4r);
+				   T1J = T1F - T1G;
+				   T1H = T1F + T1G;
+			      }
+			 }
+		    }
+		    {
+			 E T38, T3b, T39, T3f, T36, T3e, T3a;
+			 {
+			      E T28, T3r, T3o, T3v, T3p, T2b, T2k, T35, T3l, T2H, T2r, T2j, T2z, T2D, T2G;
+			      E T2X, T2F, T2T, T32, T3h, T3k, T31, T3d, T3j, T3t, T1x, T2u, T1O, T2x, T2v;
+			      E T1y, T2B, T29, T2J, T2M, T2R, T2N, T2V;
+			      {
+				   E T2l, T1I, T18, T2q, T34, T17, T16, T3n;
+				   T28 = T24 + T27;
+				   T2l = FNMS(KP250000000, T24, T27);
+				   T3r = T1H + T1E;
+				   T1I = FNMS(KP250000000, T1H, T1E);
+				   T18 = TU - T15;
+				   T16 = TU + T15;
+				   T3n = W[8];
+				   T2q = FNMS(KP618033988, T2p, T2o);
+				   T34 = FMA(KP618033988, T2o, T2p);
+				   T17 = FNMS(KP250000000, T16, TJ);
+				   T3o = TJ + T16;
+				   T3v = T3n * T3r;
+				   T3p = T3n * T3o;
+				   {
+					E T2Y, T2E, T3i, T30;
+					{
+					     E T2e, T33, T2n, T2i;
+					     T2Y = FMA(KP559016994, T2d, T2c);
+					     T2e = FNMS(KP559016994, T2d, T2c);
+					     T2b = W[14];
+					     T2k = W[15];
+					     T33 = FMA(KP559016994, T2m, T2l);
+					     T2n = FNMS(KP559016994, T2m, T2l);
+					     T2E = FMA(KP951056516, T2h, T2e);
+					     T2i = FNMS(KP951056516, T2h, T2e);
+					     T35 = FMA(KP951056516, T34, T33);
+					     T3l = FNMS(KP951056516, T34, T33);
+					     T2H = FNMS(KP951056516, T2q, T2n);
+					     T2r = FMA(KP951056516, T2q, T2n);
+					     T2j = T2b * T2i;
+					     T2z = T2k * T2i;
+					     T2D = W[22];
+					     T2G = W[23];
+					}
+					T2X = W[30];
+					T2F = T2D * T2E;
+					T2T = T2G * T2E;
+					T3i = FMA(KP951056516, T2Z, T2Y);
+					T30 = FNMS(KP951056516, T2Z, T2Y);
+					T32 = W[31];
+					T3h = W[6];
+					T3k = W[7];
+					T31 = T2X * T30;
+					T3d = T32 * T30;
+					T3j = T3h * T3i;
+					T3t = T3k * T3i;
+				   }
+				   {
+					E T2K, T2P, TE, T19, T1K, T2t, T37;
+					T2K = FNMS(KP559016994, T18, T17);
+					T19 = FMA(KP559016994, T18, T17);
+					T1K = FMA(KP559016994, T1J, T1I);
+					T2P = FNMS(KP559016994, T1J, T1I);
+					TE = W[0];
+					T2t = W[16];
+					T1x = FMA(KP951056516, T1w, T19);
+					T2u = FNMS(KP951056516, T1w, T19);
+					T1O = FNMS(KP951056516, T1N, T1K);
+					T2x = FMA(KP951056516, T1N, T1K);
+					T2v = T2t * T2u;
+					T1y = TE * T1x;
+					T2B = T2t * T2x;
+					T29 = TE * T1O;
+					T2J = W[24];
+					T37 = W[32];
+					T2M = FMA(KP951056516, T2L, T2K);
+					T38 = FNMS(KP951056516, T2L, T2K);
+					T2R = FNMS(KP951056516, T2Q, T2P);
+					T3b = FMA(KP951056516, T2Q, T2P);
+					T39 = T37 * T38;
+					T2N = T2J * T2M;
+					T3f = T37 * T3b;
+				   }
+			      }
+			      T2V = T2J * T2R;
+			      {
+				   E T3m, T3u, T3q, T2a, T1P, T1z;
+				   T1z = W[1];
+				   T3m = FNMS(T3k, T3l, T3j);
+				   T3u = FMA(T3h, T3l, T3t);
+				   T3q = W[9];
+				   T2a = FNMS(T1z, T1x, T29);
+				   T1P = FMA(T1z, T1O, T1y);
+				   {
+					E T2s, T2A, T2w, T3w, T3s;
+					T2s = FNMS(T2k, T2r, T2j);
+					T3w = FNMS(T3q, T3o, T3v);
+					T3s = FMA(T3q, T3r, T3p);
+					Im[0] = T2a - T28;
+					Ip[0] = T28 + T2a;
+					Rm[0] = TD + T1P;
+					Rp[0] = TD - T1P;
+					Im[WS(rs, 2)] = T3w - T3u;
+					Ip[WS(rs, 2)] = T3u + T3w;
+					Rm[WS(rs, 2)] = T3m + T3s;
+					Rp[WS(rs, 2)] = T3m - T3s;
+					T2A = FMA(T2b, T2r, T2z);
+					T2w = W[17];
+					{
+					     E T2I, T2U, T2O, T2C, T2y, T2W, T2S;
+					     T2I = FNMS(T2G, T2H, T2F);
+					     T2U = FMA(T2D, T2H, T2T);
+					     T2O = W[25];
+					     T2C = FNMS(T2w, T2u, T2B);
+					     T2y = FMA(T2w, T2x, T2v);
+					     T36 = FNMS(T32, T35, T31);
+					     T2W = FNMS(T2O, T2M, T2V);
+					     T2S = FMA(T2O, T2R, T2N);
+					     Im[WS(rs, 4)] = T2C - T2A;
+					     Ip[WS(rs, 4)] = T2A + T2C;
+					     Rm[WS(rs, 4)] = T2s + T2y;
+					     Rp[WS(rs, 4)] = T2s - T2y;
+					     Im[WS(rs, 6)] = T2W - T2U;
+					     Ip[WS(rs, 6)] = T2U + T2W;
+					     Rm[WS(rs, 6)] = T2I + T2S;
+					     Rp[WS(rs, 6)] = T2I - T2S;
+					     T3e = FMA(T2X, T35, T3d);
+					     T3a = W[33];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T55, T51, T54, T53, T5h, T5P, T5J, T3x, T4P, T5F, T5p, T43, T3R, T3S, T5l;
+			      E T5o, T4D, T5n, T5x, T4H, T4M, T5B, T5E, T4L, T4X, T5D, T5N, T4S, T4o, T4V;
+			      E T4B, T4T, T4p, T4Z, T4F, T57, T5a, T5f, T5b, T5j;
+			      {
+				   E T3X, T4O, T42, T3g, T3c, T5H;
+				   T55 = T3W + T3T;
+				   T3X = FNMS(KP250000000, T3W, T3T);
+				   T51 = W[18];
+				   T3g = FNMS(T3a, T38, T3f);
+				   T3c = FMA(T3a, T3b, T39);
+				   T54 = W[19];
+				   T53 = T51 * T52;
+				   Im[WS(rs, 8)] = T3g - T3e;
+				   Ip[WS(rs, 8)] = T3e + T3g;
+				   Rm[WS(rs, 8)] = T36 + T3c;
+				   Rp[WS(rs, 8)] = T36 - T3c;
+				   T5h = T54 * T52;
+				   T5H = W[28];
+				   T4O = FMA(KP618033988, T40, T41);
+				   T42 = FNMS(KP618033988, T41, T40);
+				   T5P = T5H * T5L;
+				   T5J = T5H * T5I;
+				   {
+					E T4I, T5m, T3Q, T3I, T3Z, T4N, T4K, T5C;
+					T3I = FNMS(KP559016994, T3H, T3G);
+					T4I = FMA(KP559016994, T3H, T3G);
+					T3Z = FNMS(KP559016994, T3Y, T3X);
+					T4N = FMA(KP559016994, T3Y, T3X);
+					T3x = W[2];
+					T5m = FNMS(KP951056516, T3P, T3I);
+					T3Q = FMA(KP951056516, T3P, T3I);
+					T4P = FMA(KP951056516, T4O, T4N);
+					T5F = FNMS(KP951056516, T4O, T4N);
+					T5p = FMA(KP951056516, T42, T3Z);
+					T43 = FNMS(KP951056516, T42, T3Z);
+					T3R = T3x * T3Q;
+					T3S = W[3];
+					T5l = W[34];
+					T5o = W[35];
+					T4D = T3S * T3Q;
+					T5n = T5l * T5m;
+					T5x = T5o * T5m;
+					T4K = FNMS(KP951056516, T4J, T4I);
+					T5C = FMA(KP951056516, T4J, T4I);
+					T4H = W[10];
+					T4M = W[11];
+					T5B = W[26];
+					T5E = W[27];
+					T4L = T4H * T4K;
+					T4X = T4M * T4K;
+					T5D = T5B * T5C;
+					T5N = T5E * T5C;
+				   }
+				   {
+					E T58, T5d, T45, T4g, T4x, T4R, T5r;
+					T4g = FNMS(KP559016994, T4f, T4e);
+					T58 = FMA(KP559016994, T4f, T4e);
+					T5d = FMA(KP559016994, T4w, T4v);
+					T4x = FNMS(KP559016994, T4w, T4v);
+					T45 = W[4];
+					T4R = W[12];
+					T4S = FNMS(KP951056516, T4n, T4g);
+					T4o = FMA(KP951056516, T4n, T4g);
+					T4V = FMA(KP951056516, T4A, T4x);
+					T4B = FNMS(KP951056516, T4A, T4x);
+					T4T = T4R * T4S;
+					T4p = T45 * T4o;
+					T4Z = T4R * T4V;
+					T4F = T45 * T4B;
+					T57 = W[20];
+					T5r = W[36];
+					T5s = FNMS(KP951056516, T59, T58);
+					T5a = FMA(KP951056516, T59, T58);
+					T5v = FMA(KP951056516, T5e, T5d);
+					T5f = FNMS(KP951056516, T5e, T5d);
+					T5t = T5r * T5s;
+					T5b = T57 * T5a;
+					T5z = T5r * T5v;
+				   }
+			      }
+			      T5j = T57 * T5f;
+			      {
+				   E T44, T4E, T5G, T5O, T5K, T4G, T4C, T4q;
+				   T44 = FNMS(T3S, T43, T3R);
+				   T4E = FMA(T3x, T43, T4D);
+				   T4q = W[5];
+				   T5G = FNMS(T5E, T5F, T5D);
+				   T5O = FMA(T5B, T5F, T5N);
+				   T5K = W[29];
+				   T4G = FNMS(T4q, T4o, T4F);
+				   T4C = FMA(T4q, T4B, T4p);
+				   {
+					E T4Q, T4Y, T4U, T5Q, T5M;
+					T4Q = FNMS(T4M, T4P, T4L);
+					T5Q = FNMS(T5K, T5I, T5P);
+					T5M = FMA(T5K, T5L, T5J);
+					Im[WS(rs, 1)] = T4G - T4E;
+					Ip[WS(rs, 1)] = T4E + T4G;
+					Rm[WS(rs, 1)] = T44 + T4C;
+					Rp[WS(rs, 1)] = T44 - T4C;
+					Im[WS(rs, 7)] = T5Q - T5O;
+					Ip[WS(rs, 7)] = T5O + T5Q;
+					Rm[WS(rs, 7)] = T5G + T5M;
+					Rp[WS(rs, 7)] = T5G - T5M;
+					T4Y = FMA(T4H, T4P, T4X);
+					T4U = W[13];
+					{
+					     E T56, T5i, T5c, T50, T4W, T5k, T5g;
+					     T56 = FNMS(T54, T55, T53);
+					     T5i = FMA(T51, T55, T5h);
+					     T5c = W[21];
+					     T50 = FNMS(T4U, T4S, T4Z);
+					     T4W = FMA(T4U, T4V, T4T);
+					     T5q = FNMS(T5o, T5p, T5n);
+					     T5k = FNMS(T5c, T5a, T5j);
+					     T5g = FMA(T5c, T5f, T5b);
+					     Im[WS(rs, 3)] = T50 - T4Y;
+					     Ip[WS(rs, 3)] = T4Y + T50;
+					     Rm[WS(rs, 3)] = T4Q + T4W;
+					     Rp[WS(rs, 3)] = T4Q - T4W;
+					     Im[WS(rs, 5)] = T5k - T5i;
+					     Ip[WS(rs, 5)] = T5i + T5k;
+					     Rm[WS(rs, 5)] = T56 + T5g;
+					     Rp[WS(rs, 5)] = T56 - T5g;
+					     T5y = FMA(T5l, T5p, T5x);
+					     T5u = W[37];
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5A = FNMS(T5u, T5s, T5z);
+	       T5w = FMA(T5u, T5v, T5t);
+	       Im[WS(rs, 9)] = T5A - T5y;
+	       Ip[WS(rs, 9)] = T5y + T5A;
+	       Rm[WS(rs, 9)] = T5q + T5w;
+	       Rp[WS(rs, 9)] = T5q - T5w;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {176, 38, 110, 0} };
+
+void X(codelet_hc2cbdft2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include hc2cb.h */
+
+/*
+ * This function contains 286 FP additions, 124 FP multiplications,
+ * (or, 224 additions, 62 multiplications, 62 fused multiply/add),
+ * 89 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T7, T3N, T4a, T16, T1G, T3g, T3D, T26, T1k, T3A, T3B, T1v, T2e, T48, T47;
+	       E T2d, T1L, T43, T40, T1K, T2l, T3t, T2m, T3w, T3n, T3p, TC, T2b, T4d, T4f;
+	       E T23, T2j, T1B, T1H, T3U, T3W, T3G, T3I, T11, T17;
+	       {
+		    E T3, T1C, T15, T24, T6, T12, T1F, T25;
+		    {
+			 E T1, T2, T13, T14;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 9)];
+			 T3 = T1 + T2;
+			 T1C = T1 - T2;
+			 T13 = Ip[0];
+			 T14 = Im[WS(rs, 9)];
+			 T15 = T13 + T14;
+			 T24 = T13 - T14;
+		    }
+		    {
+			 E T4, T5, T1D, T1E;
+			 T4 = Rp[WS(rs, 5)];
+			 T5 = Rm[WS(rs, 4)];
+			 T6 = T4 + T5;
+			 T12 = T4 - T5;
+			 T1D = Ip[WS(rs, 5)];
+			 T1E = Im[WS(rs, 4)];
+			 T1F = T1D + T1E;
+			 T25 = T1D - T1E;
+		    }
+		    T7 = T3 + T6;
+		    T3N = T15 - T12;
+		    T4a = T1C + T1F;
+		    T16 = T12 + T15;
+		    T1G = T1C - T1F;
+		    T3g = T3 - T6;
+		    T3D = T24 - T25;
+		    T26 = T24 + T25;
+	       }
+	       {
+		    E Te, T3O, T3Y, TJ, T1e, T3h, T3r, T1R, TA, T3S, T42, TZ, T1u, T3l, T3v;
+		    E T21, Tl, T3P, T3Z, TO, T1j, T3i, T3s, T1U, Tt, T3R, T41, TU, T1p, T3k;
+		    E T3u, T1Y;
+		    {
+			 E Ta, T1a, TI, T1P, Td, TF, T1d, T1Q;
+			 {
+			      E T8, T9, TG, TH;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T1a = T8 - T9;
+			      TG = Ip[WS(rs, 4)];
+			      TH = Im[WS(rs, 5)];
+			      TI = TG + TH;
+			      T1P = TG - TH;
+			 }
+			 {
+			      E Tb, Tc, T1b, T1c;
+			      Tb = Rp[WS(rs, 9)];
+			      Tc = Rm[0];
+			      Td = Tb + Tc;
+			      TF = Tb - Tc;
+			      T1b = Ip[WS(rs, 9)];
+			      T1c = Im[0];
+			      T1d = T1b + T1c;
+			      T1Q = T1b - T1c;
+			 }
+			 Te = Ta + Td;
+			 T3O = TI - TF;
+			 T3Y = T1a + T1d;
+			 TJ = TF + TI;
+			 T1e = T1a - T1d;
+			 T3h = Ta - Td;
+			 T3r = T1P - T1Q;
+			 T1R = T1P + T1Q;
+		    }
+		    {
+			 E Tw, T1q, TY, T1Z, Tz, TV, T1t, T20;
+			 {
+			      E Tu, Tv, TW, TX;
+			      Tu = Rm[WS(rs, 7)];
+			      Tv = Rp[WS(rs, 2)];
+			      Tw = Tu + Tv;
+			      T1q = Tu - Tv;
+			      TW = Im[WS(rs, 7)];
+			      TX = Ip[WS(rs, 2)];
+			      TY = TW + TX;
+			      T1Z = TX - TW;
+			 }
+			 {
+			      E Tx, Ty, T1r, T1s;
+			      Tx = Rm[WS(rs, 2)];
+			      Ty = Rp[WS(rs, 7)];
+			      Tz = Tx + Ty;
+			      TV = Tx - Ty;
+			      T1r = Im[WS(rs, 2)];
+			      T1s = Ip[WS(rs, 7)];
+			      T1t = T1r + T1s;
+			      T20 = T1s - T1r;
+			 }
+			 TA = Tw + Tz;
+			 T3S = TV + TY;
+			 T42 = T1q - T1t;
+			 TZ = TV - TY;
+			 T1u = T1q + T1t;
+			 T3l = Tw - Tz;
+			 T3v = T1Z - T20;
+			 T21 = T1Z + T20;
+		    }
+		    {
+			 E Th, T1f, TN, T1S, Tk, TK, T1i, T1T;
+			 {
+			      E Tf, Tg, TL, TM;
+			      Tf = Rm[WS(rs, 3)];
+			      Tg = Rp[WS(rs, 6)];
+			      Th = Tf + Tg;
+			      T1f = Tf - Tg;
+			      TL = Im[WS(rs, 3)];
+			      TM = Ip[WS(rs, 6)];
+			      TN = TL + TM;
+			      T1S = TM - TL;
+			 }
+			 {
+			      E Ti, Tj, T1g, T1h;
+			      Ti = Rp[WS(rs, 1)];
+			      Tj = Rm[WS(rs, 8)];
+			      Tk = Ti + Tj;
+			      TK = Ti - Tj;
+			      T1g = Ip[WS(rs, 1)];
+			      T1h = Im[WS(rs, 8)];
+			      T1i = T1g + T1h;
+			      T1T = T1g - T1h;
+			 }
+			 Tl = Th + Tk;
+			 T3P = TK + TN;
+			 T3Z = T1f + T1i;
+			 TO = TK - TN;
+			 T1j = T1f - T1i;
+			 T3i = Th - Tk;
+			 T3s = T1S - T1T;
+			 T1U = T1S + T1T;
+		    }
+		    {
+			 E Tp, T1l, TT, T1W, Ts, TQ, T1o, T1X;
+			 {
+			      E Tn, To, TR, TS;
+			      Tn = Rp[WS(rs, 8)];
+			      To = Rm[WS(rs, 1)];
+			      Tp = Tn + To;
+			      T1l = Tn - To;
+			      TR = Ip[WS(rs, 8)];
+			      TS = Im[WS(rs, 1)];
+			      TT = TR + TS;
+			      T1W = TR - TS;
+			 }
+			 {
+			      E Tq, Tr, T1m, T1n;
+			      Tq = Rm[WS(rs, 6)];
+			      Tr = Rp[WS(rs, 3)];
+			      Ts = Tq + Tr;
+			      TQ = Tq - Tr;
+			      T1m = Im[WS(rs, 6)];
+			      T1n = Ip[WS(rs, 3)];
+			      T1o = T1m + T1n;
+			      T1X = T1n - T1m;
+			 }
+			 Tt = Tp + Ts;
+			 T3R = TT - TQ;
+			 T41 = T1l - T1o;
+			 TU = TQ + TT;
+			 T1p = T1l + T1o;
+			 T3k = Tp - Ts;
+			 T3u = T1W - T1X;
+			 T1Y = T1W + T1X;
+		    }
+		    T1k = T1e - T1j;
+		    T3A = T3h - T3i;
+		    T3B = T3k - T3l;
+		    T1v = T1p - T1u;
+		    T2e = T1Y - T21;
+		    T48 = T3R + T3S;
+		    T47 = T3O + T3P;
+		    T2d = T1R - T1U;
+		    T1L = TU - TZ;
+		    T43 = T41 - T42;
+		    T40 = T3Y - T3Z;
+		    T1K = TJ - TO;
+		    T2l = Te - Tl;
+		    T3t = T3r - T3s;
+		    T2m = Tt - TA;
+		    T3w = T3u - T3v;
+		    {
+			 E T3j, T3m, Tm, TB;
+			 T3j = T3h + T3i;
+			 T3m = T3k + T3l;
+			 T3n = T3j + T3m;
+			 T3p = KP559016994 * (T3j - T3m);
+			 Tm = Te + Tl;
+			 TB = Tt + TA;
+			 TC = Tm + TB;
+			 T2b = KP559016994 * (Tm - TB);
+		    }
+		    {
+			 E T4b, T4c, T3Q, T3T;
+			 T4b = T3Y + T3Z;
+			 T4c = T41 + T42;
+			 T4d = T4b + T4c;
+			 T4f = KP559016994 * (T4b - T4c);
+			 {
+			      E T1V, T22, T1z, T1A;
+			      T1V = T1R + T1U;
+			      T22 = T1Y + T21;
+			      T23 = T1V + T22;
+			      T2j = KP559016994 * (T1V - T22);
+			      T1z = T1e + T1j;
+			      T1A = T1p + T1u;
+			      T1B = KP559016994 * (T1z - T1A);
+			      T1H = T1z + T1A;
+			 }
+			 T3Q = T3O - T3P;
+			 T3T = T3R - T3S;
+			 T3U = T3Q + T3T;
+			 T3W = KP559016994 * (T3Q - T3T);
+			 {
+			      E T3E, T3F, TP, T10;
+			      T3E = T3r + T3s;
+			      T3F = T3u + T3v;
+			      T3G = T3E + T3F;
+			      T3I = KP559016994 * (T3E - T3F);
+			      TP = TJ + TO;
+			      T10 = TU + TZ;
+			      T11 = KP559016994 * (TP - T10);
+			      T17 = TP + T10;
+			 }
+		    }
+	       }
+	       {
+		    E TD, T27, T3c, T3e, T2o, T36, T2A, T2U, T1N, T2Z, T2t, T2J, T1x, T2X, T2r;
+		    E T2F, T2g, T34, T2y, T2Q;
+		    TD = T7 + TC;
+		    T27 = T23 + T26;
+		    {
+			 E T39, T3b, T38, T3a;
+			 T39 = T16 + T17;
+			 T3b = T1H + T1G;
+			 T38 = W[8];
+			 T3a = W[9];
+			 T3c = FMA(T38, T39, T3a * T3b);
+			 T3e = FNMS(T3a, T39, T38 * T3b);
+		    }
+		    {
+			 E T2n, T2S, T2k, T2T, T2i;
+			 T2n = FNMS(KP951056516, T2m, KP587785252 * T2l);
+			 T2S = FMA(KP951056516, T2l, KP587785252 * T2m);
+			 T2i = FNMS(KP250000000, T23, T26);
+			 T2k = T2i - T2j;
+			 T2T = T2j + T2i;
+			 T2o = T2k - T2n;
+			 T36 = T2T - T2S;
+			 T2A = T2n + T2k;
+			 T2U = T2S + T2T;
+		    }
+		    {
+			 E T1M, T2H, T1J, T2I, T1I;
+			 T1M = FMA(KP951056516, T1K, KP587785252 * T1L);
+			 T2H = FNMS(KP951056516, T1L, KP587785252 * T1K);
+			 T1I = FNMS(KP250000000, T1H, T1G);
+			 T1J = T1B + T1I;
+			 T2I = T1I - T1B;
+			 T1N = T1J - T1M;
+			 T2Z = T2I - T2H;
+			 T2t = T1M + T1J;
+			 T2J = T2H + T2I;
+		    }
+		    {
+			 E T1w, T2E, T19, T2D, T18;
+			 T1w = FMA(KP951056516, T1k, KP587785252 * T1v);
+			 T2E = FNMS(KP951056516, T1v, KP587785252 * T1k);
+			 T18 = FNMS(KP250000000, T17, T16);
+			 T19 = T11 + T18;
+			 T2D = T18 - T11;
+			 T1x = T19 + T1w;
+			 T2X = T2D + T2E;
+			 T2r = T19 - T1w;
+			 T2F = T2D - T2E;
+		    }
+		    {
+			 E T2f, T2P, T2c, T2O, T2a;
+			 T2f = FNMS(KP951056516, T2e, KP587785252 * T2d);
+			 T2P = FMA(KP951056516, T2d, KP587785252 * T2e);
+			 T2a = FNMS(KP250000000, TC, T7);
+			 T2c = T2a - T2b;
+			 T2O = T2b + T2a;
+			 T2g = T2c + T2f;
+			 T34 = T2O + T2P;
+			 T2y = T2c - T2f;
+			 T2Q = T2O - T2P;
+		    }
+		    {
+			 E T1O, T28, TE, T1y;
+			 TE = W[0];
+			 T1y = W[1];
+			 T1O = FMA(TE, T1x, T1y * T1N);
+			 T28 = FNMS(T1y, T1x, TE * T1N);
+			 Rp[0] = TD - T1O;
+			 Ip[0] = T27 + T28;
+			 Rm[0] = TD + T1O;
+			 Im[0] = T28 - T27;
+		    }
+		    {
+			 E T37, T3d, T33, T35;
+			 T33 = W[6];
+			 T35 = W[7];
+			 T37 = FNMS(T35, T36, T33 * T34);
+			 T3d = FMA(T35, T34, T33 * T36);
+			 Rp[WS(rs, 2)] = T37 - T3c;
+			 Ip[WS(rs, 2)] = T3d + T3e;
+			 Rm[WS(rs, 2)] = T37 + T3c;
+			 Im[WS(rs, 2)] = T3e - T3d;
+		    }
+		    {
+			 E T2p, T2v, T2u, T2w;
+			 {
+			      E T29, T2h, T2q, T2s;
+			      T29 = W[14];
+			      T2h = W[15];
+			      T2p = FNMS(T2h, T2o, T29 * T2g);
+			      T2v = FMA(T2h, T2g, T29 * T2o);
+			      T2q = W[16];
+			      T2s = W[17];
+			      T2u = FMA(T2q, T2r, T2s * T2t);
+			      T2w = FNMS(T2s, T2r, T2q * T2t);
+			 }
+			 Rp[WS(rs, 4)] = T2p - T2u;
+			 Ip[WS(rs, 4)] = T2v + T2w;
+			 Rm[WS(rs, 4)] = T2p + T2u;
+			 Im[WS(rs, 4)] = T2w - T2v;
+		    }
+		    {
+			 E T2B, T2L, T2K, T2M;
+			 {
+			      E T2x, T2z, T2C, T2G;
+			      T2x = W[22];
+			      T2z = W[23];
+			      T2B = FNMS(T2z, T2A, T2x * T2y);
+			      T2L = FMA(T2z, T2y, T2x * T2A);
+			      T2C = W[24];
+			      T2G = W[25];
+			      T2K = FMA(T2C, T2F, T2G * T2J);
+			      T2M = FNMS(T2G, T2F, T2C * T2J);
+			 }
+			 Rp[WS(rs, 6)] = T2B - T2K;
+			 Ip[WS(rs, 6)] = T2L + T2M;
+			 Rm[WS(rs, 6)] = T2B + T2K;
+			 Im[WS(rs, 6)] = T2M - T2L;
+		    }
+		    {
+			 E T2V, T31, T30, T32;
+			 {
+			      E T2N, T2R, T2W, T2Y;
+			      T2N = W[30];
+			      T2R = W[31];
+			      T2V = FNMS(T2R, T2U, T2N * T2Q);
+			      T31 = FMA(T2R, T2Q, T2N * T2U);
+			      T2W = W[32];
+			      T2Y = W[33];
+			      T30 = FMA(T2W, T2X, T2Y * T2Z);
+			      T32 = FNMS(T2Y, T2X, T2W * T2Z);
+			 }
+			 Rp[WS(rs, 8)] = T2V - T30;
+			 Ip[WS(rs, 8)] = T31 + T32;
+			 Rm[WS(rs, 8)] = T2V + T30;
+			 Im[WS(rs, 8)] = T32 - T31;
+		    }
+	       }
+	       {
+		    E T4F, T4P, T5c, T5e, T3y, T54, T4o, T4S, T4h, T4Z, T4x, T4N, T45, T4X, T4v;
+		    E T4J, T3K, T56, T4s, T4U;
+		    {
+			 E T4C, T4E, T4B, T4D;
+			 T4C = T3g + T3n;
+			 T4E = T3G + T3D;
+			 T4B = W[18];
+			 T4D = W[19];
+			 T4F = FNMS(T4D, T4E, T4B * T4C);
+			 T4P = FMA(T4D, T4C, T4B * T4E);
+		    }
+		    {
+			 E T59, T5b, T58, T5a;
+			 T59 = T3N + T3U;
+			 T5b = T4d + T4a;
+			 T58 = W[28];
+			 T5a = W[29];
+			 T5c = FMA(T58, T59, T5a * T5b);
+			 T5e = FNMS(T5a, T59, T58 * T5b);
+		    }
+		    {
+			 E T3x, T4n, T3q, T4m, T3o;
+			 T3x = FNMS(KP951056516, T3w, KP587785252 * T3t);
+			 T4n = FMA(KP951056516, T3t, KP587785252 * T3w);
+			 T3o = FNMS(KP250000000, T3n, T3g);
+			 T3q = T3o - T3p;
+			 T4m = T3p + T3o;
+			 T3y = T3q - T3x;
+			 T54 = T4m + T4n;
+			 T4o = T4m - T4n;
+			 T4S = T3q + T3x;
+		    }
+		    {
+			 E T49, T4M, T4g, T4L, T4e;
+			 T49 = FNMS(KP951056516, T48, KP587785252 * T47);
+			 T4M = FMA(KP951056516, T47, KP587785252 * T48);
+			 T4e = FNMS(KP250000000, T4d, T4a);
+			 T4g = T4e - T4f;
+			 T4L = T4f + T4e;
+			 T4h = T49 + T4g;
+			 T4Z = T4M + T4L;
+			 T4x = T4g - T49;
+			 T4N = T4L - T4M;
+		    }
+		    {
+			 E T44, T4I, T3X, T4H, T3V;
+			 T44 = FNMS(KP951056516, T43, KP587785252 * T40);
+			 T4I = FMA(KP951056516, T40, KP587785252 * T43);
+			 T3V = FNMS(KP250000000, T3U, T3N);
+			 T3X = T3V - T3W;
+			 T4H = T3W + T3V;
+			 T45 = T3X - T44;
+			 T4X = T4H - T4I;
+			 T4v = T3X + T44;
+			 T4J = T4H + T4I;
+		    }
+		    {
+			 E T3C, T4q, T3J, T4r, T3H;
+			 T3C = FNMS(KP951056516, T3B, KP587785252 * T3A);
+			 T4q = FMA(KP951056516, T3A, KP587785252 * T3B);
+			 T3H = FNMS(KP250000000, T3G, T3D);
+			 T3J = T3H - T3I;
+			 T4r = T3I + T3H;
+			 T3K = T3C + T3J;
+			 T56 = T4r - T4q;
+			 T4s = T4q + T4r;
+			 T4U = T3J - T3C;
+		    }
+		    {
+			 E T4O, T4Q, T4G, T4K;
+			 T4G = W[20];
+			 T4K = W[21];
+			 T4O = FMA(T4G, T4J, T4K * T4N);
+			 T4Q = FNMS(T4K, T4J, T4G * T4N);
+			 Rp[WS(rs, 5)] = T4F - T4O;
+			 Ip[WS(rs, 5)] = T4P + T4Q;
+			 Rm[WS(rs, 5)] = T4F + T4O;
+			 Im[WS(rs, 5)] = T4Q - T4P;
+		    }
+		    {
+			 E T57, T5d, T53, T55;
+			 T53 = W[26];
+			 T55 = W[27];
+			 T57 = FNMS(T55, T56, T53 * T54);
+			 T5d = FMA(T55, T54, T53 * T56);
+			 Rp[WS(rs, 7)] = T57 - T5c;
+			 Ip[WS(rs, 7)] = T5d + T5e;
+			 Rm[WS(rs, 7)] = T57 + T5c;
+			 Im[WS(rs, 7)] = T5e - T5d;
+		    }
+		    {
+			 E T3L, T4j, T4i, T4k;
+			 {
+			      E T3f, T3z, T3M, T46;
+			      T3f = W[2];
+			      T3z = W[3];
+			      T3L = FNMS(T3z, T3K, T3f * T3y);
+			      T4j = FMA(T3z, T3y, T3f * T3K);
+			      T3M = W[4];
+			      T46 = W[5];
+			      T4i = FMA(T3M, T45, T46 * T4h);
+			      T4k = FNMS(T46, T45, T3M * T4h);
+			 }
+			 Rp[WS(rs, 1)] = T3L - T4i;
+			 Ip[WS(rs, 1)] = T4j + T4k;
+			 Rm[WS(rs, 1)] = T3L + T4i;
+			 Im[WS(rs, 1)] = T4k - T4j;
+		    }
+		    {
+			 E T4t, T4z, T4y, T4A;
+			 {
+			      E T4l, T4p, T4u, T4w;
+			      T4l = W[10];
+			      T4p = W[11];
+			      T4t = FNMS(T4p, T4s, T4l * T4o);
+			      T4z = FMA(T4p, T4o, T4l * T4s);
+			      T4u = W[12];
+			      T4w = W[13];
+			      T4y = FMA(T4u, T4v, T4w * T4x);
+			      T4A = FNMS(T4w, T4v, T4u * T4x);
+			 }
+			 Rp[WS(rs, 3)] = T4t - T4y;
+			 Ip[WS(rs, 3)] = T4z + T4A;
+			 Rm[WS(rs, 3)] = T4t + T4y;
+			 Im[WS(rs, 3)] = T4A - T4z;
+		    }
+		    {
+			 E T4V, T51, T50, T52;
+			 {
+			      E T4R, T4T, T4W, T4Y;
+			      T4R = W[34];
+			      T4T = W[35];
+			      T4V = FNMS(T4T, T4U, T4R * T4S);
+			      T51 = FMA(T4T, T4S, T4R * T4U);
+			      T4W = W[36];
+			      T4Y = W[37];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T52 = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 Rp[WS(rs, 9)] = T4V - T50;
+			 Ip[WS(rs, 9)] = T51 + T52;
+			 Rm[WS(rs, 9)] = T4V + T50;
+			 Im[WS(rs, 9)] = T52 - T51;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {224, 62, 62, 0} };
+
+void X(codelet_hc2cbdft2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1888 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:08 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cbdft2_32 -include hc2cb.h */
+
+/*
+ * This function contains 498 FP additions, 260 FP multiplications,
+ * (or, 300 additions, 62 multiplications, 198 fused multiply/add),
+ * 165 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T8e, T8h, T7S, T8l, T8f, T84, T8c, T8k, T8g, T86, T82, T8m, T8i;
+	       {
+		    E T4B, T3h, T3K, Tv, T8Y, T6T, T8L, T7i, T8X, T7f, T4Y, T1G, T4K, T1j, T4X;
+		    E T2M, T8C, T6d, T8o, T66, T8K, T6M, T4L, T2P, T4C, T3o, T5q, T4q, T8p, T6C;
+		    E T8B, T6z, T72, T2u, T75, T10, T3P, T3a, T3L, T4t, T4E, T8F, T8t, T4F, T4w;
+		    E T8E, T8w, T6E, T6l, T6F, T6s, T76, T4P, T51, T2R, T28, T8P, T90, T7k, T71;
+		    E T2p, T4R, T2x, T73, T6x, T6y;
+		    {
+			 E T3l, T16, T3m, T2H, T2E, T13, T64, T7, T3i, T2J, T1c, T3j, T1h, T2K, Te;
+			 E T1z, T6R, T6a, Tt, T3g, T6b, T1E, T6Q, Tj, T1p, Ti, T3b, T1n, Tk, T1q;
+			 E T1r;
+			 {
+			      E T1, T2, T4, T5;
+			      {
+				   E T14, T15, T2F, T2G;
+				   T14 = Ip[0];
+				   T15 = Im[WS(rs, 15)];
+				   T2F = Ip[WS(rs, 8)];
+				   T2G = Im[WS(rs, 7)];
+				   T1 = Rp[0];
+				   T3l = T14 - T15;
+				   T16 = T14 + T15;
+				   T3m = T2F - T2G;
+				   T2H = T2F + T2G;
+				   T2 = Rm[WS(rs, 15)];
+				   T4 = Rp[WS(rs, 8)];
+				   T5 = Rm[WS(rs, 7)];
+			      }
+			      {
+				   E T1b, T1e, T18, Ta, T1f, Tb, Tc, T8, T9, T1g, T1d, Td;
+				   {
+					E T19, T3, T6, T1a;
+					T19 = Ip[WS(rs, 4)];
+					T2E = T1 - T2;
+					T3 = T1 + T2;
+					T13 = T4 - T5;
+					T6 = T4 + T5;
+					T1a = Im[WS(rs, 11)];
+					T8 = Rp[WS(rs, 4)];
+					T9 = Rm[WS(rs, 11)];
+					T64 = T3 - T6;
+					T7 = T3 + T6;
+					T1b = T19 + T1a;
+					T3i = T19 - T1a;
+				   }
+				   T1e = Im[WS(rs, 3)];
+				   T18 = T8 - T9;
+				   Ta = T8 + T9;
+				   T1f = Ip[WS(rs, 12)];
+				   Tb = Rm[WS(rs, 3)];
+				   Tc = Rp[WS(rs, 12)];
+				   T2J = T18 - T1b;
+				   T1c = T18 + T1b;
+				   T1g = T1e + T1f;
+				   T3j = T1f - T1e;
+				   T1d = Tb - Tc;
+				   Td = Tb + Tc;
+				   T1h = T1d + T1g;
+				   T2K = T1d - T1g;
+				   T6x = Ta - Td;
+				   Te = Ta + Td;
+			      }
+			      {
+				   E Tq, T1A, Tp, T3e, T1y, Tr, T1B, T1C;
+				   {
+					E Tn, To, T1w, T1x;
+					Tn = Rm[WS(rs, 1)];
+					To = Rp[WS(rs, 14)];
+					T1w = Im[WS(rs, 1)];
+					T1x = Ip[WS(rs, 14)];
+					Tq = Rp[WS(rs, 6)];
+					T1A = Tn - To;
+					Tp = Tn + To;
+					T3e = T1x - T1w;
+					T1y = T1w + T1x;
+					Tr = Rm[WS(rs, 9)];
+					T1B = Ip[WS(rs, 6)];
+					T1C = Im[WS(rs, 9)];
+				   }
+				   {
+					E Tg, Th, T1l, T1m;
+					Tg = Rp[WS(rs, 2)];
+					{
+					     E T1v, Ts, T3f, T1D;
+					     T1v = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T3f = T1B - T1C;
+					     T1D = T1B + T1C;
+					     T1z = T1v - T1y;
+					     T6R = T1v + T1y;
+					     T6a = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T3g = T3e + T3f;
+					     T6b = T3e - T3f;
+					     T1E = T1A - T1D;
+					     T6Q = T1A + T1D;
+					     Th = Rm[WS(rs, 13)];
+					}
+					T1l = Ip[WS(rs, 2)];
+					T1m = Im[WS(rs, 13)];
+					Tj = Rp[WS(rs, 10)];
+					T1p = Tg - Th;
+					Ti = Tg + Th;
+					T3b = T1l - T1m;
+					T1n = T1l + T1m;
+					Tk = Rm[WS(rs, 5)];
+					T1q = Ip[WS(rs, 10)];
+					T1r = Im[WS(rs, 5)];
+				   }
+			      }
+			 }
+			 {
+			      E T4o, T67, T68, T4p, T2I, T1i, T2N, T1u, T1F, T2O, T6K, T17;
+			      {
+				   E Tf, T1o, T1t, Tu, T7g, T6P, T6S, T7h, T7d, T7e;
+				   {
+					E T6O, T6N, T1k, Tl;
+					T4o = T7 - Te;
+					Tf = T7 + Te;
+					T1k = Tj - Tk;
+					Tl = Tj + Tk;
+					{
+					     E T3c, T1s, Tm, T3d;
+					     T3c = T1q - T1r;
+					     T1s = T1q + T1r;
+					     T1o = T1k + T1n;
+					     T6O = T1n - T1k;
+					     T67 = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T3d = T3b + T3c;
+					     T68 = T3b - T3c;
+					     T1t = T1p - T1s;
+					     T6N = T1p + T1s;
+					     T4B = Tm - Tt;
+					     Tu = Tm + Tt;
+					     T4p = T3g - T3d;
+					     T3h = T3d + T3g;
+					}
+					T7g = FNMS(KP414213562, T6N, T6O);
+					T6P = FMA(KP414213562, T6O, T6N);
+					T6S = FMA(KP414213562, T6R, T6Q);
+					T7h = FNMS(KP414213562, T6Q, T6R);
+				   }
+				   T3K = Tf - Tu;
+				   Tv = Tf + Tu;
+				   T8Y = T6P + T6S;
+				   T6T = T6P - T6S;
+				   T2I = T2E - T2H;
+				   T7d = T2E + T2H;
+				   T7e = T1c + T1h;
+				   T1i = T1c - T1h;
+				   T2N = FNMS(KP414213562, T1o, T1t);
+				   T1u = FMA(KP414213562, T1t, T1o);
+				   T8L = T7h - T7g;
+				   T7i = T7g + T7h;
+				   T8X = FMA(KP707106781, T7e, T7d);
+				   T7f = FNMS(KP707106781, T7e, T7d);
+				   T1F = FNMS(KP414213562, T1E, T1z);
+				   T2O = FMA(KP414213562, T1z, T1E);
+				   T6K = T16 - T13;
+				   T17 = T13 + T16;
+			      }
+			      {
+				   E T6L, T6A, T6B, T65, T3k, T2L, T69, T6c, T3n;
+				   T4Y = T1F - T1u;
+				   T1G = T1u + T1F;
+				   T4K = FNMS(KP707106781, T1i, T17);
+				   T1j = FMA(KP707106781, T1i, T17);
+				   T2L = T2J + T2K;
+				   T6L = T2J - T2K;
+				   T6A = T67 + T68;
+				   T69 = T67 - T68;
+				   T6c = T6a + T6b;
+				   T6B = T6b - T6a;
+				   T4X = FNMS(KP707106781, T2L, T2I);
+				   T2M = FMA(KP707106781, T2L, T2I);
+				   T8C = T69 - T6c;
+				   T6d = T69 + T6c;
+				   T65 = T3j - T3i;
+				   T3k = T3i + T3j;
+				   T8o = T64 - T65;
+				   T66 = T64 + T65;
+				   T8K = FNMS(KP707106781, T6L, T6K);
+				   T6M = FMA(KP707106781, T6L, T6K);
+				   T3n = T3l + T3m;
+				   T6y = T3l - T3m;
+				   T4L = T2N - T2O;
+				   T2P = T2N + T2O;
+				   T4C = T3n - T3k;
+				   T3o = T3k + T3n;
+				   T5q = T4o - T4p;
+				   T4q = T4o + T4p;
+				   T8p = T6B - T6A;
+				   T6C = T6A + T6B;
+			      }
+			 }
+		    }
+		    {
+			 E T1M, T6V, T6f, TC, T31, T6j, T23, T6Y, T2v, T2i, TY, T6p, T6n, T35, T2n;
+			 E T2w, T24, T1R, TJ, T6i, T6g, T2Y, T1W, T25, T2q, TN, T2r, T36, T2c, T29;
+			 E TQ, T2s;
+			 {
+			      E TU, T2k, T33, T2j, TX, T2l, T2m, T34;
+			      {
+				   E T1Z, Ty, T20, T2Z, T1L, T1I, TB, T21, T2e, T2h;
+				   {
+					E T1J, T1K, Tw, Tx, Tz, TA;
+					Tw = Rp[WS(rs, 1)];
+					Tx = Rm[WS(rs, 14)];
+					T1J = Ip[WS(rs, 1)];
+					T8B = T6y - T6x;
+					T6z = T6x + T6y;
+					T1Z = Tw - Tx;
+					Ty = Tw + Tx;
+					T1K = Im[WS(rs, 14)];
+					Tz = Rp[WS(rs, 9)];
+					TA = Rm[WS(rs, 6)];
+					T20 = Ip[WS(rs, 9)];
+					T2Z = T1J - T1K;
+					T1L = T1J + T1K;
+					T1I = Tz - TA;
+					TB = Tz + TA;
+					T21 = Im[WS(rs, 6)];
+				   }
+				   {
+					E T2f, T2g, TV, TW;
+					{
+					     E TS, T30, T22, TT;
+					     TS = Rp[WS(rs, 3)];
+					     T1M = T1I + T1L;
+					     T6V = T1L - T1I;
+					     T6f = Ty - TB;
+					     TC = Ty + TB;
+					     T30 = T20 - T21;
+					     T22 = T20 + T21;
+					     TT = Rm[WS(rs, 12)];
+					     T2f = Ip[WS(rs, 3)];
+					     T31 = T2Z + T30;
+					     T6j = T2Z - T30;
+					     T23 = T1Z - T22;
+					     T6Y = T1Z + T22;
+					     T2e = TS - TT;
+					     TU = TS + TT;
+					     T2g = Im[WS(rs, 12)];
+					}
+					TV = Rm[WS(rs, 4)];
+					TW = Rp[WS(rs, 11)];
+					T2k = Im[WS(rs, 4)];
+					T33 = T2f - T2g;
+					T2h = T2f + T2g;
+					T2j = TV - TW;
+					TX = TV + TW;
+					T2l = Ip[WS(rs, 11)];
+				   }
+				   T2v = T2e - T2h;
+				   T2i = T2e + T2h;
+			      }
+			      TY = TU + TX;
+			      T6p = TU - TX;
+			      T2m = T2k + T2l;
+			      T34 = T2l - T2k;
+			      {
+				   E TF, T1T, T2W, T1S, TI, T1U, T1N, T1Q, T1V, T2X;
+				   {
+					E T1O, T1P, TD, TE, TG, TH;
+					TD = Rp[WS(rs, 5)];
+					TE = Rm[WS(rs, 10)];
+					T6n = T34 - T33;
+					T35 = T33 + T34;
+					T2n = T2j + T2m;
+					T2w = T2j - T2m;
+					T1N = TD - TE;
+					TF = TD + TE;
+					T1O = Ip[WS(rs, 5)];
+					T1P = Im[WS(rs, 10)];
+					TG = Rm[WS(rs, 2)];
+					TH = Rp[WS(rs, 13)];
+					T1T = Im[WS(rs, 2)];
+					T2W = T1O - T1P;
+					T1Q = T1O + T1P;
+					T1S = TG - TH;
+					TI = TG + TH;
+					T1U = Ip[WS(rs, 13)];
+				   }
+				   T24 = T1N - T1Q;
+				   T1R = T1N + T1Q;
+				   TJ = TF + TI;
+				   T6i = TF - TI;
+				   T1V = T1T + T1U;
+				   T2X = T1U - T1T;
+				   {
+					E T2a, T2b, TL, TM, TO, TP;
+					TL = Rm[0];
+					TM = Rp[WS(rs, 15)];
+					T6g = T2X - T2W;
+					T2Y = T2W + T2X;
+					T1W = T1S + T1V;
+					T25 = T1S - T1V;
+					T2q = TL - TM;
+					TN = TL + TM;
+					T2a = Im[0];
+					T2b = Ip[WS(rs, 15)];
+					TO = Rp[WS(rs, 7)];
+					TP = Rm[WS(rs, 8)];
+					T2r = Ip[WS(rs, 7)];
+					T36 = T2b - T2a;
+					T2c = T2a + T2b;
+					T29 = TO - TP;
+					TQ = TO + TP;
+					T2s = Im[WS(rs, 8)];
+				   }
+			      }
+			 }
+			 {
+			      E T2d, T4u, T4v, T6r, T6o, T6k, T8u, T8v, T6h;
+			      {
+				   E T4r, T6m, T32, T4s, T6q, T39, T8r, T8s;
+				   {
+					E TK, TR, T37, T2t, TZ, T38;
+					T4r = TC - TJ;
+					TK = TC + TJ;
+					T2d = T29 - T2c;
+					T72 = T29 + T2c;
+					T6m = TN - TQ;
+					TR = TN + TQ;
+					T37 = T2r - T2s;
+					T2t = T2r + T2s;
+					T32 = T2Y + T31;
+					T4s = T31 - T2Y;
+					T4u = TR - TY;
+					TZ = TR + TY;
+					T38 = T36 + T37;
+					T6q = T36 - T37;
+					T2u = T2q - T2t;
+					T75 = T2q + T2t;
+					T10 = TK + TZ;
+					T3P = TK - TZ;
+					T4v = T38 - T35;
+					T39 = T35 + T38;
+				   }
+				   T8r = T6q - T6p;
+				   T6r = T6p + T6q;
+				   T3a = T32 + T39;
+				   T3L = T39 - T32;
+				   T8s = T6m - T6n;
+				   T6o = T6m + T6n;
+				   T4t = T4r - T4s;
+				   T4E = T4r + T4s;
+				   T8F = FNMS(KP414213562, T8r, T8s);
+				   T8t = FMA(KP414213562, T8s, T8r);
+				   T6k = T6i + T6j;
+				   T8u = T6j - T6i;
+				   T8v = T6f - T6g;
+				   T6h = T6f + T6g;
+			      }
+			      {
+				   E T6Z, T1Y, T4O, T26, T6W, T1X, T2o, T4N, T27;
+				   T4F = T4v - T4u;
+				   T4w = T4u + T4v;
+				   T8E = FMA(KP414213562, T8u, T8v);
+				   T8w = FNMS(KP414213562, T8v, T8u);
+				   T6Z = T1R + T1W;
+				   T1X = T1R - T1W;
+				   T6E = FMA(KP414213562, T6h, T6k);
+				   T6l = FNMS(KP414213562, T6k, T6h);
+				   T6F = FNMS(KP414213562, T6o, T6r);
+				   T6s = FMA(KP414213562, T6r, T6o);
+				   T1Y = FMA(KP707106781, T1X, T1M);
+				   T4O = FNMS(KP707106781, T1X, T1M);
+				   T26 = T24 + T25;
+				   T6W = T25 - T24;
+				   T76 = T2i + T2n;
+				   T2o = T2i - T2n;
+				   T4N = FNMS(KP707106781, T26, T23);
+				   T27 = FMA(KP707106781, T26, T23);
+				   {
+					E T8O, T6X, T8N, T70;
+					T8O = FMA(KP707106781, T6W, T6V);
+					T6X = FNMS(KP707106781, T6W, T6V);
+					T8N = FMA(KP707106781, T6Z, T6Y);
+					T70 = FNMS(KP707106781, T6Z, T6Y);
+					T4P = FMA(KP668178637, T4O, T4N);
+					T51 = FNMS(KP668178637, T4N, T4O);
+					T2R = FNMS(KP198912367, T1Y, T27);
+					T28 = FMA(KP198912367, T27, T1Y);
+					T8P = FMA(KP198912367, T8O, T8N);
+					T90 = FNMS(KP198912367, T8N, T8O);
+					T7k = FNMS(KP668178637, T6X, T70);
+					T71 = FMA(KP668178637, T70, T6X);
+					T2p = FMA(KP707106781, T2o, T2d);
+					T4R = FNMS(KP707106781, T2o, T2d);
+				   }
+				   T2x = T2v + T2w;
+				   T73 = T2v - T2w;
+			      }
+			 }
+		    }
+		    {
+			 E T8S, T91, T7l, T78, T5U, T5X, T5y, T61, T5V, T5K, T5S, T60, T5W, T5M, T5I;
+			 {
+			      E T4S, T50, T4e, T4h, T3S, T4l, T4f, T44, T4c, T4k, T4g, T46, T42;
+			      {
+				   E T3Q, T3U, T40, T3Z, T3V, T3A, T3D, T3H, T3B, T3y, T3G, T3C;
+				   {
+					E T11, T3t, T3w, T3q, T3x, T3v, T3F, T12, T2B, T2U, T3z, T2C;
+					{
+					     E T3u, T2S, T2z, T3p, T4Q, T2y;
+					     T3u = Tv - T10;
+					     T11 = Tv + T10;
+					     T4Q = FNMS(KP707106781, T2x, T2u);
+					     T2y = FMA(KP707106781, T2x, T2u);
+					     {
+						  E T8R, T74, T8Q, T77;
+						  T8R = FMA(KP707106781, T73, T72);
+						  T74 = FNMS(KP707106781, T73, T72);
+						  T8Q = FMA(KP707106781, T76, T75);
+						  T77 = FNMS(KP707106781, T76, T75);
+						  T4S = FNMS(KP668178637, T4R, T4Q);
+						  T50 = FMA(KP668178637, T4Q, T4R);
+						  T2S = FMA(KP198912367, T2p, T2y);
+						  T2z = FNMS(KP198912367, T2y, T2p);
+						  T8S = FMA(KP198912367, T8R, T8Q);
+						  T91 = FNMS(KP198912367, T8Q, T8R);
+						  T7l = FNMS(KP668178637, T74, T77);
+						  T78 = FMA(KP668178637, T77, T74);
+						  T3Q = T3o - T3h;
+						  T3p = T3h + T3o;
+					     }
+					     T3t = W[30];
+					     T3w = W[31];
+					     T3q = T3a + T3p;
+					     T3x = T3p - T3a;
+					     T3v = T3t * T3u;
+					     T3F = T3w * T3u;
+					     {
+						  E T1H, T2A, T2Q, T2T;
+						  T3U = FNMS(KP923879532, T1G, T1j);
+						  T1H = FMA(KP923879532, T1G, T1j);
+						  T2A = T28 + T2z;
+						  T40 = T2z - T28;
+						  T3Z = FNMS(KP923879532, T2P, T2M);
+						  T2Q = FMA(KP923879532, T2P, T2M);
+						  T2T = T2R + T2S;
+						  T3V = T2R - T2S;
+						  T12 = W[0];
+						  T3A = FNMS(KP980785280, T2A, T1H);
+						  T2B = FMA(KP980785280, T2A, T1H);
+						  T3D = FNMS(KP980785280, T2T, T2Q);
+						  T2U = FMA(KP980785280, T2T, T2Q);
+						  T3z = W[32];
+						  T2C = T12 * T2B;
+					     }
+					}
+					{
+					     E T2V, T3s, T2D, T3r;
+					     T2D = W[1];
+					     T3r = T12 * T2U;
+					     T3H = T3z * T3D;
+					     T3B = T3z * T3A;
+					     T2V = FMA(T2D, T2U, T2C);
+					     T3s = FNMS(T2D, T2B, T3r);
+					     T3y = FNMS(T3w, T3x, T3v);
+					     T3G = FMA(T3t, T3x, T3F);
+					     Rm[0] = T11 + T2V;
+					     Rp[0] = T11 - T2V;
+					     Im[0] = T3s - T3q;
+					     Ip[0] = T3q + T3s;
+					     T3C = W[33];
+					}
+				   }
+				   {
+					E T4b, T3R, T47, T4a, T3J, T49, T4j, T3O, T3N, T43, T3W, T3T, T41, T4d, T3X;
+					E T45, T3Y;
+					{
+					     E T3M, T48, T3I, T3E;
+					     T3M = T3K + T3L;
+					     T48 = T3K - T3L;
+					     T3I = FNMS(T3C, T3A, T3H);
+					     T3E = FMA(T3C, T3D, T3B);
+					     T4b = T3Q - T3P;
+					     T3R = T3P + T3Q;
+					     Im[WS(rs, 8)] = T3I - T3G;
+					     Ip[WS(rs, 8)] = T3G + T3I;
+					     Rm[WS(rs, 8)] = T3y + T3E;
+					     Rp[WS(rs, 8)] = T3y - T3E;
+					     T47 = W[46];
+					     T4a = W[47];
+					     T3J = W[14];
+					     T49 = T47 * T48;
+					     T4j = T4a * T48;
+					     T3O = W[15];
+					     T3N = T3J * T3M;
+					     T43 = T3O * T3M;
+					     T3W = FMA(KP980785280, T3V, T3U);
+					     T4e = FNMS(KP980785280, T3V, T3U);
+					     T3T = W[16];
+					     T4h = FNMS(KP980785280, T40, T3Z);
+					     T41 = FMA(KP980785280, T40, T3Z);
+					     T4d = W[48];
+					     T3X = T3T * T3W;
+					}
+					T3S = FNMS(T3O, T3R, T3N);
+					T45 = T3T * T41;
+					T4l = T4d * T4h;
+					T4f = T4d * T4e;
+					T44 = FMA(T3J, T3R, T43);
+					T3Y = W[17];
+					T4c = FNMS(T4a, T4b, T49);
+					T4k = FMA(T47, T4b, T4j);
+					T4g = W[49];
+					T46 = FNMS(T3Y, T3W, T45);
+					T42 = FMA(T3Y, T41, T3X);
+				   }
+			      }
+			      {
+				   E T5v, T5r, T5w, T5A, T5G, T5F, T5B, T5g, T5j, T4I, T5n, T5h, T56, T5e, T5m;
+				   E T5i, T58, T54;
+				   {
+					E T4n, T4A, T5d, T4H, T59, T5c, T55, T4z, T5b, T5l, T4J, T4U, T53, T5f, T4V;
+					E T57, T4W;
+					{
+					     E T4D, T4G, T4m, T4i, T5a, T4y, T4x;
+					     T5v = T4C - T4B;
+					     T4D = T4B + T4C;
+					     T4m = FNMS(T4g, T4e, T4l);
+					     T4i = FMA(T4g, T4h, T4f);
+					     Im[WS(rs, 4)] = T46 - T44;
+					     Ip[WS(rs, 4)] = T44 + T46;
+					     Rm[WS(rs, 4)] = T3S + T42;
+					     Rp[WS(rs, 4)] = T3S - T42;
+					     Im[WS(rs, 12)] = T4m - T4k;
+					     Ip[WS(rs, 12)] = T4k + T4m;
+					     Rm[WS(rs, 12)] = T4c + T4i;
+					     Rp[WS(rs, 12)] = T4c - T4i;
+					     T4G = T4E + T4F;
+					     T5r = T4F - T4E;
+					     T5w = T4t - T4w;
+					     T4x = T4t + T4w;
+					     T4n = W[6];
+					     T4A = W[7];
+					     T5d = FNMS(KP707106781, T4G, T4D);
+					     T4H = FMA(KP707106781, T4G, T4D);
+					     T5a = FNMS(KP707106781, T4x, T4q);
+					     T4y = FMA(KP707106781, T4x, T4q);
+					     T59 = W[38];
+					     T5c = W[39];
+					     {
+						  E T4M, T4T, T4Z, T52;
+						  T4M = FMA(KP923879532, T4L, T4K);
+						  T5A = FNMS(KP923879532, T4L, T4K);
+						  T55 = T4A * T4y;
+						  T4z = T4n * T4y;
+						  T5b = T59 * T5a;
+						  T5l = T5c * T5a;
+						  T5G = T4P + T4S;
+						  T4T = T4P - T4S;
+						  T4Z = FMA(KP923879532, T4Y, T4X);
+						  T5F = FNMS(KP923879532, T4Y, T4X);
+						  T5B = T51 + T50;
+						  T52 = T50 - T51;
+						  T4J = W[8];
+						  T4U = FMA(KP831469612, T4T, T4M);
+						  T5g = FNMS(KP831469612, T4T, T4M);
+						  T53 = FMA(KP831469612, T52, T4Z);
+						  T5j = FNMS(KP831469612, T52, T4Z);
+						  T5f = W[40];
+						  T4V = T4J * T4U;
+					     }
+					}
+					T4I = FNMS(T4A, T4H, T4z);
+					T57 = T4J * T53;
+					T5n = T5f * T5j;
+					T5h = T5f * T5g;
+					T56 = FMA(T4n, T4H, T55);
+					T4W = W[9];
+					T5e = FNMS(T5c, T5d, T5b);
+					T5m = FMA(T59, T5d, T5l);
+					T5i = W[41];
+					T58 = FNMS(T4W, T4U, T57);
+					T54 = FMA(T4W, T53, T4V);
+				   }
+				   {
+					E T5p, T5u, T5x, T5R, T5N, T5Q, T5J, T5t, T5P, T5Z, T5z, T5C, T5H, T5T, T5D;
+					E T5L, T5E;
+					{
+					     E T5o, T5k, T5s, T5O;
+					     T5o = FNMS(T5i, T5g, T5n);
+					     T5k = FMA(T5i, T5j, T5h);
+					     Im[WS(rs, 2)] = T58 - T56;
+					     Ip[WS(rs, 2)] = T56 + T58;
+					     Rm[WS(rs, 2)] = T4I + T54;
+					     Rp[WS(rs, 2)] = T4I - T54;
+					     Im[WS(rs, 10)] = T5o - T5m;
+					     Ip[WS(rs, 10)] = T5m + T5o;
+					     Rm[WS(rs, 10)] = T5e + T5k;
+					     Rp[WS(rs, 10)] = T5e - T5k;
+					     T5p = W[22];
+					     T5u = W[23];
+					     T5x = FMA(KP707106781, T5w, T5v);
+					     T5R = FNMS(KP707106781, T5w, T5v);
+					     T5s = FMA(KP707106781, T5r, T5q);
+					     T5O = FNMS(KP707106781, T5r, T5q);
+					     T5N = W[54];
+					     T5Q = W[55];
+					     T5J = T5u * T5s;
+					     T5t = T5p * T5s;
+					     T5P = T5N * T5O;
+					     T5Z = T5Q * T5O;
+					     T5z = W[24];
+					     T5U = FMA(KP831469612, T5B, T5A);
+					     T5C = FNMS(KP831469612, T5B, T5A);
+					     T5X = FMA(KP831469612, T5G, T5F);
+					     T5H = FNMS(KP831469612, T5G, T5F);
+					     T5T = W[56];
+					     T5D = T5z * T5C;
+					}
+					T5y = FNMS(T5u, T5x, T5t);
+					T5L = T5z * T5H;
+					T61 = T5T * T5X;
+					T5V = T5T * T5U;
+					T5K = FMA(T5p, T5x, T5J);
+					T5E = W[25];
+					T5S = FNMS(T5Q, T5R, T5P);
+					T60 = FMA(T5N, T5R, T5Z);
+					T5W = W[57];
+					T5M = FNMS(T5E, T5C, T5L);
+					T5I = FMA(T5E, T5H, T5D);
+				   }
+			      }
+			 }
+			 {
+			      E T7P, T7L, T7K, T7Q, T7U, T80, T7Z, T7V, T9v, T9r, T9q, T9w, T9A, T9G, T9F;
+			      E T9B, T9g, T9j, T8I, T9n, T9h, T96, T9e, T9m, T9i, T98, T94;
+			      {
+				   E T7A, T7D, T6I, T7H, T7B, T7q, T7y, T7G, T7C, T7s, T7o;
+				   {
+					E T63, T7x, T6H, T6w, T7t, T7w, T6v, T7p, T7v, T7F, T6J, T7a, T7n, T7z, T7b;
+					E T7r, T7c;
+					{
+					     E T6D, T6G, T62, T5Y;
+					     T7P = FNMS(KP707106781, T6C, T6z);
+					     T6D = FMA(KP707106781, T6C, T6z);
+					     T62 = FNMS(T5W, T5U, T61);
+					     T5Y = FMA(T5W, T5X, T5V);
+					     Im[WS(rs, 6)] = T5M - T5K;
+					     Ip[WS(rs, 6)] = T5K + T5M;
+					     Rm[WS(rs, 6)] = T5y + T5I;
+					     Rp[WS(rs, 6)] = T5y - T5I;
+					     Im[WS(rs, 14)] = T62 - T60;
+					     Ip[WS(rs, 14)] = T60 + T62;
+					     Rm[WS(rs, 14)] = T5S + T5Y;
+					     Rp[WS(rs, 14)] = T5S - T5Y;
+					     T6G = T6E + T6F;
+					     T7L = T6F - T6E;
+					     {
+						  E T6e, T6t, T7u, T6u;
+						  T7K = FNMS(KP707106781, T6d, T66);
+						  T6e = FMA(KP707106781, T6d, T66);
+						  T6t = T6l + T6s;
+						  T7Q = T6l - T6s;
+						  T63 = W[2];
+						  T7x = FNMS(KP923879532, T6G, T6D);
+						  T6H = FMA(KP923879532, T6G, T6D);
+						  T7u = FNMS(KP923879532, T6t, T6e);
+						  T6u = FMA(KP923879532, T6t, T6e);
+						  T6w = W[3];
+						  T7t = W[34];
+						  T7w = W[35];
+						  T6v = T63 * T6u;
+						  T7p = T6w * T6u;
+						  T7v = T7t * T7u;
+						  T7F = T7w * T7u;
+					     }
+					     {
+						  E T6U, T79, T7j, T7m;
+						  T7U = FNMS(KP923879532, T6T, T6M);
+						  T6U = FMA(KP923879532, T6T, T6M);
+						  T79 = T71 - T78;
+						  T80 = T71 + T78;
+						  T7Z = FMA(KP923879532, T7i, T7f);
+						  T7j = FNMS(KP923879532, T7i, T7f);
+						  T7m = T7k + T7l;
+						  T7V = T7k - T7l;
+						  T6J = W[4];
+						  T7A = FNMS(KP831469612, T79, T6U);
+						  T7a = FMA(KP831469612, T79, T6U);
+						  T7D = FNMS(KP831469612, T7m, T7j);
+						  T7n = FMA(KP831469612, T7m, T7j);
+						  T7z = W[36];
+						  T7b = T6J * T7a;
+					     }
+					}
+					T6I = FNMS(T6w, T6H, T6v);
+					T7r = T6J * T7n;
+					T7H = T7z * T7D;
+					T7B = T7z * T7A;
+					T7q = FMA(T63, T6H, T7p);
+					T7c = W[5];
+					T7y = FNMS(T7w, T7x, T7v);
+					T7G = FMA(T7t, T7x, T7F);
+					T7C = W[37];
+					T7s = FNMS(T7c, T7a, T7r);
+					T7o = FMA(T7c, T7n, T7b);
+				   }
+				   {
+					E T8n, T9d, T8H, T8A, T99, T9c, T8z, T95, T9b, T9l, T8J, T8U, T93, T9f, T8V;
+					E T97, T8W;
+					{
+					     E T8D, T8G, T7I, T7E;
+					     T9v = FNMS(KP707106781, T8C, T8B);
+					     T8D = FMA(KP707106781, T8C, T8B);
+					     T7I = FNMS(T7C, T7A, T7H);
+					     T7E = FMA(T7C, T7D, T7B);
+					     Im[WS(rs, 1)] = T7s - T7q;
+					     Ip[WS(rs, 1)] = T7q + T7s;
+					     Rm[WS(rs, 1)] = T6I + T7o;
+					     Rp[WS(rs, 1)] = T6I - T7o;
+					     Im[WS(rs, 9)] = T7I - T7G;
+					     Ip[WS(rs, 9)] = T7G + T7I;
+					     Rm[WS(rs, 9)] = T7y + T7E;
+					     Rp[WS(rs, 9)] = T7y - T7E;
+					     T8G = T8E - T8F;
+					     T9r = T8E + T8F;
+					     {
+						  E T8q, T8x, T9a, T8y;
+						  T9q = FNMS(KP707106781, T8p, T8o);
+						  T8q = FMA(KP707106781, T8p, T8o);
+						  T8x = T8t - T8w;
+						  T9w = T8w + T8t;
+						  T8n = W[10];
+						  T9d = FNMS(KP923879532, T8G, T8D);
+						  T8H = FMA(KP923879532, T8G, T8D);
+						  T9a = FNMS(KP923879532, T8x, T8q);
+						  T8y = FMA(KP923879532, T8x, T8q);
+						  T8A = W[11];
+						  T99 = W[42];
+						  T9c = W[43];
+						  T8z = T8n * T8y;
+						  T95 = T8A * T8y;
+						  T9b = T99 * T9a;
+						  T9l = T9c * T9a;
+					     }
+					     {
+						  E T8M, T8T, T8Z, T92;
+						  T9A = FNMS(KP923879532, T8L, T8K);
+						  T8M = FMA(KP923879532, T8L, T8K);
+						  T8T = T8P - T8S;
+						  T9G = T8P + T8S;
+						  T9F = FMA(KP923879532, T8Y, T8X);
+						  T8Z = FNMS(KP923879532, T8Y, T8X);
+						  T92 = T90 + T91;
+						  T9B = T91 - T90;
+						  T8J = W[12];
+						  T9g = FNMS(KP980785280, T8T, T8M);
+						  T8U = FMA(KP980785280, T8T, T8M);
+						  T9j = FMA(KP980785280, T92, T8Z);
+						  T93 = FNMS(KP980785280, T92, T8Z);
+						  T9f = W[44];
+						  T8V = T8J * T8U;
+					     }
+					}
+					T8I = FNMS(T8A, T8H, T8z);
+					T97 = T8J * T93;
+					T9n = T9f * T9j;
+					T9h = T9f * T9g;
+					T96 = FMA(T8n, T8H, T95);
+					T8W = W[13];
+					T9e = FNMS(T9c, T9d, T9b);
+					T9m = FMA(T99, T9d, T9l);
+					T9i = W[45];
+					T98 = FNMS(T8W, T8U, T97);
+					T94 = FMA(T8W, T93, T8V);
+				   }
+			      }
+			      {
+				   E T9U, T9X, T9y, Ta1, T9V, T9K, T9S, Ta0, T9W, T9M, T9I;
+				   {
+					E T9p, T9R, T9x, T9u, T9N, T9Q, T9t, T9J, T9P, T9Z, T9z, T9C, T9H, T9T, T9D;
+					E T9L, T9E;
+					{
+					     E T9o, T9k, T9O, T9s;
+					     T9o = FNMS(T9i, T9g, T9n);
+					     T9k = FMA(T9i, T9j, T9h);
+					     Im[WS(rs, 3)] = T98 - T96;
+					     Ip[WS(rs, 3)] = T96 + T98;
+					     Rm[WS(rs, 3)] = T8I + T94;
+					     Rp[WS(rs, 3)] = T8I - T94;
+					     Im[WS(rs, 11)] = T9o - T9m;
+					     Ip[WS(rs, 11)] = T9m + T9o;
+					     Rm[WS(rs, 11)] = T9e + T9k;
+					     Rp[WS(rs, 11)] = T9e - T9k;
+					     T9p = W[26];
+					     T9R = FMA(KP923879532, T9w, T9v);
+					     T9x = FNMS(KP923879532, T9w, T9v);
+					     T9O = FMA(KP923879532, T9r, T9q);
+					     T9s = FNMS(KP923879532, T9r, T9q);
+					     T9u = W[27];
+					     T9N = W[58];
+					     T9Q = W[59];
+					     T9t = T9p * T9s;
+					     T9J = T9u * T9s;
+					     T9P = T9N * T9O;
+					     T9Z = T9Q * T9O;
+					     T9z = W[28];
+					     T9U = FNMS(KP980785280, T9B, T9A);
+					     T9C = FMA(KP980785280, T9B, T9A);
+					     T9X = FMA(KP980785280, T9G, T9F);
+					     T9H = FNMS(KP980785280, T9G, T9F);
+					     T9T = W[60];
+					     T9D = T9z * T9C;
+					}
+					T9y = FNMS(T9u, T9x, T9t);
+					T9L = T9z * T9H;
+					Ta1 = T9T * T9X;
+					T9V = T9T * T9U;
+					T9K = FMA(T9p, T9x, T9J);
+					T9E = W[29];
+					T9S = FNMS(T9Q, T9R, T9P);
+					Ta0 = FMA(T9N, T9R, T9Z);
+					T9W = W[61];
+					T9M = FNMS(T9E, T9C, T9L);
+					T9I = FMA(T9E, T9H, T9D);
+				   }
+				   {
+					E T7J, T8b, T7R, T7O, T87, T8a, T7N, T83, T89, T8j, T7T, T7W, T81, T8d, T7X;
+					E T85, T7Y;
+					{
+					     E Ta2, T9Y, T88, T7M;
+					     Ta2 = FNMS(T9W, T9U, Ta1);
+					     T9Y = FMA(T9W, T9X, T9V);
+					     Im[WS(rs, 7)] = T9M - T9K;
+					     Ip[WS(rs, 7)] = T9K + T9M;
+					     Rm[WS(rs, 7)] = T9y + T9I;
+					     Rp[WS(rs, 7)] = T9y - T9I;
+					     Im[WS(rs, 15)] = Ta2 - Ta0;
+					     Ip[WS(rs, 15)] = Ta0 + Ta2;
+					     Rm[WS(rs, 15)] = T9S + T9Y;
+					     Rp[WS(rs, 15)] = T9S - T9Y;
+					     T7J = W[18];
+					     T8b = FNMS(KP923879532, T7Q, T7P);
+					     T7R = FMA(KP923879532, T7Q, T7P);
+					     T88 = FNMS(KP923879532, T7L, T7K);
+					     T7M = FMA(KP923879532, T7L, T7K);
+					     T7O = W[19];
+					     T87 = W[50];
+					     T8a = W[51];
+					     T7N = T7J * T7M;
+					     T83 = T7O * T7M;
+					     T89 = T87 * T88;
+					     T8j = T8a * T88;
+					     T7T = W[20];
+					     T8e = FNMS(KP831469612, T7V, T7U);
+					     T7W = FMA(KP831469612, T7V, T7U);
+					     T8h = FMA(KP831469612, T80, T7Z);
+					     T81 = FNMS(KP831469612, T80, T7Z);
+					     T8d = W[52];
+					     T7X = T7T * T7W;
+					}
+					T7S = FNMS(T7O, T7R, T7N);
+					T85 = T7T * T81;
+					T8l = T8d * T8h;
+					T8f = T8d * T8e;
+					T84 = FMA(T7J, T7R, T83);
+					T7Y = W[21];
+					T8c = FNMS(T8a, T8b, T89);
+					T8k = FMA(T87, T8b, T8j);
+					T8g = W[53];
+					T86 = FNMS(T7Y, T7W, T85);
+					T82 = FMA(T7Y, T81, T7X);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T8m = FNMS(T8g, T8e, T8l);
+	       T8i = FMA(T8g, T8h, T8f);
+	       Im[WS(rs, 5)] = T86 - T84;
+	       Ip[WS(rs, 5)] = T84 + T86;
+	       Rm[WS(rs, 5)] = T7S + T82;
+	       Rp[WS(rs, 5)] = T7S - T82;
+	       Im[WS(rs, 13)] = T8m - T8k;
+	       Ip[WS(rs, 13)] = T8k + T8m;
+	       Rm[WS(rs, 13)] = T8c + T8i;
+	       Rp[WS(rs, 13)] = T8c - T8i;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cbdft2_32", twinstr, &GENUS, {300, 62, 198, 0} };
+
+void X(codelet_hc2cbdft2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cbdft2_32 -include hc2cb.h */
+
+/*
+ * This function contains 498 FP additions, 208 FP multiplications,
+ * (or, 404 additions, 114 multiplications, 94 fused multiply/add),
+ * 102 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tf, T4a, T6h, T7Z, T6P, T8e, T1j, T4v, T2R, T4L, T5C, T7E, T6a, T7U, T3n;
+	       E T4q, TZ, T38, T2p, T4B, T7M, T7R, T2y, T4C, T5Y, T63, T6C, T86, T4i, T4n;
+	       E T6z, T85, TK, T31, T1Y, T4y, T7J, T7Q, T27, T4z, T5R, T62, T6v, T83, T4f;
+	       E T4m, T6s, T82, Tu, T4p, T6o, T8f, T6M, T80, T1G, T4K, T2I, T4w, T5J, T7T;
+	       E T67, T7F, T3g, T4b;
+	       {
+		    E T3, T2M, T16, T3k, T6, T13, T2P, T3l, Td, T3i, T1h, T2K, Ta, T3h, T1c;
+		    E T2J;
+		    {
+			 E T1, T2, T2N, T2O;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 15)];
+			 T3 = T1 + T2;
+			 T2M = T1 - T2;
+			 {
+			      E T14, T15, T4, T5;
+			      T14 = Ip[0];
+			      T15 = Im[WS(rs, 15)];
+			      T16 = T14 + T15;
+			      T3k = T14 - T15;
+			      T4 = Rp[WS(rs, 8)];
+			      T5 = Rm[WS(rs, 7)];
+			      T6 = T4 + T5;
+			      T13 = T4 - T5;
+			 }
+			 T2N = Ip[WS(rs, 8)];
+			 T2O = Im[WS(rs, 7)];
+			 T2P = T2N + T2O;
+			 T3l = T2N - T2O;
+			 {
+			      E Tb, Tc, T1d, T1e, T1f, T1g;
+			      Tb = Rm[WS(rs, 3)];
+			      Tc = Rp[WS(rs, 12)];
+			      T1d = Tb - Tc;
+			      T1e = Im[WS(rs, 3)];
+			      T1f = Ip[WS(rs, 12)];
+			      T1g = T1e + T1f;
+			      Td = Tb + Tc;
+			      T3i = T1f - T1e;
+			      T1h = T1d + T1g;
+			      T2K = T1d - T1g;
+			 }
+			 {
+			      E T8, T9, T18, T19, T1a, T1b;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 11)];
+			      T18 = T8 - T9;
+			      T19 = Ip[WS(rs, 4)];
+			      T1a = Im[WS(rs, 11)];
+			      T1b = T19 + T1a;
+			      Ta = T8 + T9;
+			      T3h = T19 - T1a;
+			      T1c = T18 + T1b;
+			      T2J = T18 - T1b;
+			 }
+		    }
+		    {
+			 E T7, Te, T6f, T6g;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T4a = T7 - Te;
+			 T6f = T16 - T13;
+			 T6g = KP707106781 * (T2J - T2K);
+			 T6h = T6f + T6g;
+			 T7Z = T6f - T6g;
+		    }
+		    {
+			 E T6N, T6O, T17, T1i;
+			 T6N = T2M + T2P;
+			 T6O = KP707106781 * (T1c + T1h);
+			 T6P = T6N - T6O;
+			 T8e = T6O + T6N;
+			 T17 = T13 + T16;
+			 T1i = KP707106781 * (T1c - T1h);
+			 T1j = T17 + T1i;
+			 T4v = T17 - T1i;
+		    }
+		    {
+			 E T2L, T2Q, T5A, T5B;
+			 T2L = KP707106781 * (T2J + T2K);
+			 T2Q = T2M - T2P;
+			 T2R = T2L + T2Q;
+			 T4L = T2Q - T2L;
+			 T5A = T3 - T6;
+			 T5B = T3i - T3h;
+			 T5C = T5A + T5B;
+			 T7E = T5A - T5B;
+		    }
+		    {
+			 E T68, T69, T3j, T3m;
+			 T68 = Ta - Td;
+			 T69 = T3k - T3l;
+			 T6a = T68 + T69;
+			 T7U = T69 - T68;
+			 T3j = T3h + T3i;
+			 T3m = T3k + T3l;
+			 T3n = T3j + T3m;
+			 T4q = T3m - T3j;
+		    }
+	       }
+	       {
+		    E TR, T5S, T29, T2t, T2c, T5W, T2w, T37, TY, T5T, T5V, T2i, T2n, T2r, T34;
+		    E T2q, T6A, T6B;
+		    {
+			 E TL, TM, TN, TO, TP, TQ;
+			 TL = Rm[0];
+			 TM = Rp[WS(rs, 15)];
+			 TN = TL + TM;
+			 TO = Rp[WS(rs, 7)];
+			 TP = Rm[WS(rs, 8)];
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+			 T5S = TN - TQ;
+			 T29 = TO - TP;
+			 T2t = TL - TM;
+		    }
+		    {
+			 E T2a, T2b, T35, T2u, T2v, T36;
+			 T2a = Im[0];
+			 T2b = Ip[WS(rs, 15)];
+			 T35 = T2b - T2a;
+			 T2u = Ip[WS(rs, 7)];
+			 T2v = Im[WS(rs, 8)];
+			 T36 = T2u - T2v;
+			 T2c = T2a + T2b;
+			 T5W = T35 - T36;
+			 T2w = T2u + T2v;
+			 T37 = T35 + T36;
+		    }
+		    {
+			 E TU, T2e, T2h, T32, TX, T2j, T2m, T33;
+			 {
+			      E TS, TT, T2f, T2g;
+			      TS = Rp[WS(rs, 3)];
+			      TT = Rm[WS(rs, 12)];
+			      TU = TS + TT;
+			      T2e = TS - TT;
+			      T2f = Ip[WS(rs, 3)];
+			      T2g = Im[WS(rs, 12)];
+			      T2h = T2f + T2g;
+			      T32 = T2f - T2g;
+			 }
+			 {
+			      E TV, TW, T2k, T2l;
+			      TV = Rm[WS(rs, 4)];
+			      TW = Rp[WS(rs, 11)];
+			      TX = TV + TW;
+			      T2j = TV - TW;
+			      T2k = Im[WS(rs, 4)];
+			      T2l = Ip[WS(rs, 11)];
+			      T2m = T2k + T2l;
+			      T33 = T2l - T2k;
+			 }
+			 TY = TU + TX;
+			 T5T = T33 - T32;
+			 T5V = TU - TX;
+			 T2i = T2e + T2h;
+			 T2n = T2j + T2m;
+			 T2r = T2j - T2m;
+			 T34 = T32 + T33;
+			 T2q = T2e - T2h;
+		    }
+		    TZ = TR + TY;
+		    T38 = T34 + T37;
+		    {
+			 E T2d, T2o, T7K, T7L;
+			 T2d = T29 - T2c;
+			 T2o = KP707106781 * (T2i - T2n);
+			 T2p = T2d + T2o;
+			 T4B = T2d - T2o;
+			 T7K = T5S - T5T;
+			 T7L = T5W - T5V;
+			 T7M = FMA(KP382683432, T7K, KP923879532 * T7L);
+			 T7R = FNMS(KP923879532, T7K, KP382683432 * T7L);
+		    }
+		    {
+			 E T2s, T2x, T5U, T5X;
+			 T2s = KP707106781 * (T2q + T2r);
+			 T2x = T2t - T2w;
+			 T2y = T2s + T2x;
+			 T4C = T2x - T2s;
+			 T5U = T5S + T5T;
+			 T5X = T5V + T5W;
+			 T5Y = FMA(KP923879532, T5U, KP382683432 * T5X);
+			 T63 = FNMS(KP382683432, T5U, KP923879532 * T5X);
+		    }
+		    T6A = T2t + T2w;
+		    T6B = KP707106781 * (T2i + T2n);
+		    T6C = T6A - T6B;
+		    T86 = T6B + T6A;
+		    {
+			 E T4g, T4h, T6x, T6y;
+			 T4g = TR - TY;
+			 T4h = T37 - T34;
+			 T4i = T4g + T4h;
+			 T4n = T4h - T4g;
+			 T6x = KP707106781 * (T2q - T2r);
+			 T6y = T29 + T2c;
+			 T6z = T6x - T6y;
+			 T85 = T6y + T6x;
+		    }
+	       }
+	       {
+		    E TC, T5L, T1I, T22, T1L, T5P, T25, T30, TJ, T5M, T5O, T1R, T1W, T20, T2X;
+		    E T1Z, T6t, T6u;
+		    {
+			 E Tw, Tx, Ty, Tz, TA, TB;
+			 Tw = Rp[WS(rs, 1)];
+			 Tx = Rm[WS(rs, 14)];
+			 Ty = Tw + Tx;
+			 Tz = Rp[WS(rs, 9)];
+			 TA = Rm[WS(rs, 6)];
+			 TB = Tz + TA;
+			 TC = Ty + TB;
+			 T5L = Ty - TB;
+			 T1I = Tz - TA;
+			 T22 = Tw - Tx;
+		    }
+		    {
+			 E T1J, T1K, T2Y, T23, T24, T2Z;
+			 T1J = Ip[WS(rs, 1)];
+			 T1K = Im[WS(rs, 14)];
+			 T2Y = T1J - T1K;
+			 T23 = Ip[WS(rs, 9)];
+			 T24 = Im[WS(rs, 6)];
+			 T2Z = T23 - T24;
+			 T1L = T1J + T1K;
+			 T5P = T2Y - T2Z;
+			 T25 = T23 + T24;
+			 T30 = T2Y + T2Z;
+		    }
+		    {
+			 E TF, T1N, T1Q, T2V, TI, T1S, T1V, T2W;
+			 {
+			      E TD, TE, T1O, T1P;
+			      TD = Rp[WS(rs, 5)];
+			      TE = Rm[WS(rs, 10)];
+			      TF = TD + TE;
+			      T1N = TD - TE;
+			      T1O = Ip[WS(rs, 5)];
+			      T1P = Im[WS(rs, 10)];
+			      T1Q = T1O + T1P;
+			      T2V = T1O - T1P;
+			 }
+			 {
+			      E TG, TH, T1T, T1U;
+			      TG = Rm[WS(rs, 2)];
+			      TH = Rp[WS(rs, 13)];
+			      TI = TG + TH;
+			      T1S = TG - TH;
+			      T1T = Im[WS(rs, 2)];
+			      T1U = Ip[WS(rs, 13)];
+			      T1V = T1T + T1U;
+			      T2W = T1U - T1T;
+			 }
+			 TJ = TF + TI;
+			 T5M = T2W - T2V;
+			 T5O = TF - TI;
+			 T1R = T1N + T1Q;
+			 T1W = T1S + T1V;
+			 T20 = T1S - T1V;
+			 T2X = T2V + T2W;
+			 T1Z = T1N - T1Q;
+		    }
+		    TK = TC + TJ;
+		    T31 = T2X + T30;
+		    {
+			 E T1M, T1X, T7H, T7I;
+			 T1M = T1I + T1L;
+			 T1X = KP707106781 * (T1R - T1W);
+			 T1Y = T1M + T1X;
+			 T4y = T1M - T1X;
+			 T7H = T5L - T5M;
+			 T7I = T5P - T5O;
+			 T7J = FNMS(KP923879532, T7I, KP382683432 * T7H);
+			 T7Q = FMA(KP923879532, T7H, KP382683432 * T7I);
+		    }
+		    {
+			 E T21, T26, T5N, T5Q;
+			 T21 = KP707106781 * (T1Z + T20);
+			 T26 = T22 - T25;
+			 T27 = T21 + T26;
+			 T4z = T26 - T21;
+			 T5N = T5L + T5M;
+			 T5Q = T5O + T5P;
+			 T5R = FNMS(KP382683432, T5Q, KP923879532 * T5N);
+			 T62 = FMA(KP382683432, T5N, KP923879532 * T5Q);
+		    }
+		    T6t = T22 + T25;
+		    T6u = KP707106781 * (T1R + T1W);
+		    T6v = T6t - T6u;
+		    T83 = T6u + T6t;
+		    {
+			 E T4d, T4e, T6q, T6r;
+			 T4d = TC - TJ;
+			 T4e = T30 - T2X;
+			 T4f = T4d - T4e;
+			 T4m = T4d + T4e;
+			 T6q = T1L - T1I;
+			 T6r = KP707106781 * (T1Z - T20);
+			 T6s = T6q + T6r;
+			 T82 = T6q - T6r;
+		    }
+	       }
+	       {
+		    E Ti, T3a, Tl, T3b, T1o, T1t, T6j, T6i, T5E, T5D, Tp, T3d, Ts, T3e, T1z;
+		    E T1E, T6m, T6l, T5H, T5G;
+		    {
+			 E T1p, T1n, T1k, T1s;
+			 {
+			      E Tg, Th, T1l, T1m;
+			      Tg = Rp[WS(rs, 2)];
+			      Th = Rm[WS(rs, 13)];
+			      Ti = Tg + Th;
+			      T1p = Tg - Th;
+			      T1l = Ip[WS(rs, 2)];
+			      T1m = Im[WS(rs, 13)];
+			      T1n = T1l + T1m;
+			      T3a = T1l - T1m;
+			 }
+			 {
+			      E Tj, Tk, T1q, T1r;
+			      Tj = Rp[WS(rs, 10)];
+			      Tk = Rm[WS(rs, 5)];
+			      Tl = Tj + Tk;
+			      T1k = Tj - Tk;
+			      T1q = Ip[WS(rs, 10)];
+			      T1r = Im[WS(rs, 5)];
+			      T1s = T1q + T1r;
+			      T3b = T1q - T1r;
+			 }
+			 T1o = T1k + T1n;
+			 T1t = T1p - T1s;
+			 T6j = T1p + T1s;
+			 T6i = T1n - T1k;
+			 T5E = T3a - T3b;
+			 T5D = Ti - Tl;
+		    }
+		    {
+			 E T1A, T1y, T1v, T1D;
+			 {
+			      E Tn, To, T1w, T1x;
+			      Tn = Rm[WS(rs, 1)];
+			      To = Rp[WS(rs, 14)];
+			      Tp = Tn + To;
+			      T1A = Tn - To;
+			      T1w = Im[WS(rs, 1)];
+			      T1x = Ip[WS(rs, 14)];
+			      T1y = T1w + T1x;
+			      T3d = T1x - T1w;
+			 }
+			 {
+			      E Tq, Tr, T1B, T1C;
+			      Tq = Rp[WS(rs, 6)];
+			      Tr = Rm[WS(rs, 9)];
+			      Ts = Tq + Tr;
+			      T1v = Tq - Tr;
+			      T1B = Ip[WS(rs, 6)];
+			      T1C = Im[WS(rs, 9)];
+			      T1D = T1B + T1C;
+			      T3e = T1B - T1C;
+			 }
+			 T1z = T1v - T1y;
+			 T1E = T1A - T1D;
+			 T6m = T1A + T1D;
+			 T6l = T1v + T1y;
+			 T5H = T3d - T3e;
+			 T5G = Tp - Ts;
+		    }
+		    {
+			 E Tm, Tt, T6k, T6n;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T4p = Tm - Tt;
+			 T6k = FMA(KP382683432, T6i, KP923879532 * T6j);
+			 T6n = FMA(KP382683432, T6l, KP923879532 * T6m);
+			 T6o = T6k - T6n;
+			 T8f = T6k + T6n;
+		    }
+		    {
+			 E T6K, T6L, T1u, T1F;
+			 T6K = FNMS(KP923879532, T6i, KP382683432 * T6j);
+			 T6L = FNMS(KP923879532, T6l, KP382683432 * T6m);
+			 T6M = T6K + T6L;
+			 T80 = T6K - T6L;
+			 T1u = FMA(KP923879532, T1o, KP382683432 * T1t);
+			 T1F = FNMS(KP382683432, T1E, KP923879532 * T1z);
+			 T1G = T1u + T1F;
+			 T4K = T1F - T1u;
+		    }
+		    {
+			 E T2G, T2H, T5F, T5I;
+			 T2G = FNMS(KP382683432, T1o, KP923879532 * T1t);
+			 T2H = FMA(KP382683432, T1z, KP923879532 * T1E);
+			 T2I = T2G + T2H;
+			 T4w = T2G - T2H;
+			 T5F = T5D - T5E;
+			 T5I = T5G + T5H;
+			 T5J = KP707106781 * (T5F + T5I);
+			 T7T = KP707106781 * (T5F - T5I);
+		    }
+		    {
+			 E T65, T66, T3c, T3f;
+			 T65 = T5D + T5E;
+			 T66 = T5H - T5G;
+			 T67 = KP707106781 * (T65 + T66);
+			 T7F = KP707106781 * (T66 - T65);
+			 T3c = T3a + T3b;
+			 T3f = T3d + T3e;
+			 T3g = T3c + T3f;
+			 T4b = T3f - T3c;
+		    }
+	       }
+	       {
+		    E T11, T3s, T3p, T3u, T3K, T40, T3G, T3Y, T2T, T43, T3z, T3P, T2B, T45, T3x;
+		    E T3T;
+		    {
+			 E Tv, T10, T3E, T3F;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T11 = Tv + T10;
+			 T3s = Tv - T10;
+			 {
+			      E T39, T3o, T3I, T3J;
+			      T39 = T31 + T38;
+			      T3o = T3g + T3n;
+			      T3p = T39 + T3o;
+			      T3u = T3o - T39;
+			      T3I = TK - TZ;
+			      T3J = T3n - T3g;
+			      T3K = T3I + T3J;
+			      T40 = T3J - T3I;
+			 }
+			 T3E = Tf - Tu;
+			 T3F = T38 - T31;
+			 T3G = T3E + T3F;
+			 T3Y = T3E - T3F;
+			 {
+			      E T2S, T3N, T2F, T3O, T2D, T2E;
+			      T2S = T2I + T2R;
+			      T3N = T1j - T1G;
+			      T2D = FNMS(KP195090322, T1Y, KP980785280 * T27);
+			      T2E = FMA(KP195090322, T2p, KP980785280 * T2y);
+			      T2F = T2D + T2E;
+			      T3O = T2D - T2E;
+			      T2T = T2F + T2S;
+			      T43 = T3N - T3O;
+			      T3z = T2S - T2F;
+			      T3P = T3N + T3O;
+			 }
+			 {
+			      E T1H, T3S, T2A, T3R, T28, T2z;
+			      T1H = T1j + T1G;
+			      T3S = T2R - T2I;
+			      T28 = FMA(KP980785280, T1Y, KP195090322 * T27);
+			      T2z = FNMS(KP195090322, T2y, KP980785280 * T2p);
+			      T2A = T28 + T2z;
+			      T3R = T2z - T28;
+			      T2B = T1H + T2A;
+			      T45 = T3S - T3R;
+			      T3x = T1H - T2A;
+			      T3T = T3R + T3S;
+			 }
+		    }
+		    {
+			 E T2U, T3q, T12, T2C;
+			 T12 = W[0];
+			 T2C = W[1];
+			 T2U = FMA(T12, T2B, T2C * T2T);
+			 T3q = FNMS(T2C, T2B, T12 * T2T);
+			 Rp[0] = T11 - T2U;
+			 Ip[0] = T3p + T3q;
+			 Rm[0] = T11 + T2U;
+			 Im[0] = T3q - T3p;
+		    }
+		    {
+			 E T41, T47, T46, T48;
+			 {
+			      E T3X, T3Z, T42, T44;
+			      T3X = W[46];
+			      T3Z = W[47];
+			      T41 = FNMS(T3Z, T40, T3X * T3Y);
+			      T47 = FMA(T3Z, T3Y, T3X * T40);
+			      T42 = W[48];
+			      T44 = W[49];
+			      T46 = FMA(T42, T43, T44 * T45);
+			      T48 = FNMS(T44, T43, T42 * T45);
+			 }
+			 Rp[WS(rs, 12)] = T41 - T46;
+			 Ip[WS(rs, 12)] = T47 + T48;
+			 Rm[WS(rs, 12)] = T41 + T46;
+			 Im[WS(rs, 12)] = T48 - T47;
+		    }
+		    {
+			 E T3v, T3B, T3A, T3C;
+			 {
+			      E T3r, T3t, T3w, T3y;
+			      T3r = W[30];
+			      T3t = W[31];
+			      T3v = FNMS(T3t, T3u, T3r * T3s);
+			      T3B = FMA(T3t, T3s, T3r * T3u);
+			      T3w = W[32];
+			      T3y = W[33];
+			      T3A = FMA(T3w, T3x, T3y * T3z);
+			      T3C = FNMS(T3y, T3x, T3w * T3z);
+			 }
+			 Rp[WS(rs, 8)] = T3v - T3A;
+			 Ip[WS(rs, 8)] = T3B + T3C;
+			 Rm[WS(rs, 8)] = T3v + T3A;
+			 Im[WS(rs, 8)] = T3C - T3B;
+		    }
+		    {
+			 E T3L, T3V, T3U, T3W;
+			 {
+			      E T3D, T3H, T3M, T3Q;
+			      T3D = W[14];
+			      T3H = W[15];
+			      T3L = FNMS(T3H, T3K, T3D * T3G);
+			      T3V = FMA(T3H, T3G, T3D * T3K);
+			      T3M = W[16];
+			      T3Q = W[17];
+			      T3U = FMA(T3M, T3P, T3Q * T3T);
+			      T3W = FNMS(T3Q, T3P, T3M * T3T);
+			 }
+			 Rp[WS(rs, 4)] = T3L - T3U;
+			 Ip[WS(rs, 4)] = T3V + T3W;
+			 Rm[WS(rs, 4)] = T3L + T3U;
+			 Im[WS(rs, 4)] = T3W - T3V;
+		    }
+	       }
+	       {
+		    E T7O, T8m, T7W, T8o, T8E, T8U, T8A, T8S, T8h, T8X, T8t, T8J, T89, T8Z, T8r;
+		    E T8N;
+		    {
+			 E T7G, T7N, T8y, T8z;
+			 T7G = T7E + T7F;
+			 T7N = T7J + T7M;
+			 T7O = T7G + T7N;
+			 T8m = T7G - T7N;
+			 {
+			      E T7S, T7V, T8C, T8D;
+			      T7S = T7Q + T7R;
+			      T7V = T7T + T7U;
+			      T7W = T7S + T7V;
+			      T8o = T7V - T7S;
+			      T8C = T7J - T7M;
+			      T8D = T7U - T7T;
+			      T8E = T8C + T8D;
+			      T8U = T8D - T8C;
+			 }
+			 T8y = T7E - T7F;
+			 T8z = T7R - T7Q;
+			 T8A = T8y + T8z;
+			 T8S = T8y - T8z;
+			 {
+			      E T8g, T8H, T8d, T8I, T8b, T8c;
+			      T8g = T8e - T8f;
+			      T8H = T7Z - T80;
+			      T8b = FNMS(KP980785280, T82, KP195090322 * T83);
+			      T8c = FNMS(KP980785280, T85, KP195090322 * T86);
+			      T8d = T8b + T8c;
+			      T8I = T8b - T8c;
+			      T8h = T8d + T8g;
+			      T8X = T8H - T8I;
+			      T8t = T8g - T8d;
+			      T8J = T8H + T8I;
+			 }
+			 {
+			      E T81, T8L, T88, T8M, T84, T87;
+			      T81 = T7Z + T80;
+			      T8L = T8f + T8e;
+			      T84 = FMA(KP195090322, T82, KP980785280 * T83);
+			      T87 = FMA(KP195090322, T85, KP980785280 * T86);
+			      T88 = T84 - T87;
+			      T8M = T84 + T87;
+			      T89 = T81 + T88;
+			      T8Z = T8M + T8L;
+			      T8r = T81 - T88;
+			      T8N = T8L - T8M;
+			 }
+		    }
+		    {
+			 E T7X, T8j, T8i, T8k;
+			 {
+			      E T7D, T7P, T7Y, T8a;
+			      T7D = W[10];
+			      T7P = W[11];
+			      T7X = FNMS(T7P, T7W, T7D * T7O);
+			      T8j = FMA(T7P, T7O, T7D * T7W);
+			      T7Y = W[12];
+			      T8a = W[13];
+			      T8i = FMA(T7Y, T89, T8a * T8h);
+			      T8k = FNMS(T8a, T89, T7Y * T8h);
+			 }
+			 Rp[WS(rs, 3)] = T7X - T8i;
+			 Ip[WS(rs, 3)] = T8j + T8k;
+			 Rm[WS(rs, 3)] = T7X + T8i;
+			 Im[WS(rs, 3)] = T8k - T8j;
+		    }
+		    {
+			 E T8V, T91, T90, T92;
+			 {
+			      E T8R, T8T, T8W, T8Y;
+			      T8R = W[58];
+			      T8T = W[59];
+			      T8V = FNMS(T8T, T8U, T8R * T8S);
+			      T91 = FMA(T8T, T8S, T8R * T8U);
+			      T8W = W[60];
+			      T8Y = W[61];
+			      T90 = FMA(T8W, T8X, T8Y * T8Z);
+			      T92 = FNMS(T8Y, T8X, T8W * T8Z);
+			 }
+			 Rp[WS(rs, 15)] = T8V - T90;
+			 Ip[WS(rs, 15)] = T91 + T92;
+			 Rm[WS(rs, 15)] = T8V + T90;
+			 Im[WS(rs, 15)] = T92 - T91;
+		    }
+		    {
+			 E T8p, T8v, T8u, T8w;
+			 {
+			      E T8l, T8n, T8q, T8s;
+			      T8l = W[42];
+			      T8n = W[43];
+			      T8p = FNMS(T8n, T8o, T8l * T8m);
+			      T8v = FMA(T8n, T8m, T8l * T8o);
+			      T8q = W[44];
+			      T8s = W[45];
+			      T8u = FMA(T8q, T8r, T8s * T8t);
+			      T8w = FNMS(T8s, T8r, T8q * T8t);
+			 }
+			 Rp[WS(rs, 11)] = T8p - T8u;
+			 Ip[WS(rs, 11)] = T8v + T8w;
+			 Rm[WS(rs, 11)] = T8p + T8u;
+			 Im[WS(rs, 11)] = T8w - T8v;
+		    }
+		    {
+			 E T8F, T8P, T8O, T8Q;
+			 {
+			      E T8x, T8B, T8G, T8K;
+			      T8x = W[26];
+			      T8B = W[27];
+			      T8F = FNMS(T8B, T8E, T8x * T8A);
+			      T8P = FMA(T8B, T8A, T8x * T8E);
+			      T8G = W[28];
+			      T8K = W[29];
+			      T8O = FMA(T8G, T8J, T8K * T8N);
+			      T8Q = FNMS(T8K, T8J, T8G * T8N);
+			 }
+			 Rp[WS(rs, 7)] = T8F - T8O;
+			 Ip[WS(rs, 7)] = T8P + T8Q;
+			 Rm[WS(rs, 7)] = T8F + T8O;
+			 Im[WS(rs, 7)] = T8Q - T8P;
+		    }
+	       }
+	       {
+		    E T4k, T4S, T4s, T4U, T5a, T5q, T56, T5o, T4N, T5t, T4Z, T5f, T4F, T5v, T4X;
+		    E T5j;
+		    {
+			 E T4c, T4j, T54, T55;
+			 T4c = T4a + T4b;
+			 T4j = KP707106781 * (T4f + T4i);
+			 T4k = T4c + T4j;
+			 T4S = T4c - T4j;
+			 {
+			      E T4o, T4r, T58, T59;
+			      T4o = KP707106781 * (T4m + T4n);
+			      T4r = T4p + T4q;
+			      T4s = T4o + T4r;
+			      T4U = T4r - T4o;
+			      T58 = KP707106781 * (T4f - T4i);
+			      T59 = T4q - T4p;
+			      T5a = T58 + T59;
+			      T5q = T59 - T58;
+			 }
+			 T54 = T4a - T4b;
+			 T55 = KP707106781 * (T4n - T4m);
+			 T56 = T54 + T55;
+			 T5o = T54 - T55;
+			 {
+			      E T4M, T5d, T4J, T5e, T4H, T4I;
+			      T4M = T4K + T4L;
+			      T5d = T4v - T4w;
+			      T4H = FNMS(KP831469612, T4y, KP555570233 * T4z);
+			      T4I = FMA(KP831469612, T4B, KP555570233 * T4C);
+			      T4J = T4H + T4I;
+			      T5e = T4H - T4I;
+			      T4N = T4J + T4M;
+			      T5t = T5d - T5e;
+			      T4Z = T4M - T4J;
+			      T5f = T5d + T5e;
+			 }
+			 {
+			      E T4x, T5i, T4E, T5h, T4A, T4D;
+			      T4x = T4v + T4w;
+			      T5i = T4L - T4K;
+			      T4A = FMA(KP555570233, T4y, KP831469612 * T4z);
+			      T4D = FNMS(KP831469612, T4C, KP555570233 * T4B);
+			      T4E = T4A + T4D;
+			      T5h = T4D - T4A;
+			      T4F = T4x + T4E;
+			      T5v = T5i - T5h;
+			      T4X = T4x - T4E;
+			      T5j = T5h + T5i;
+			 }
+		    }
+		    {
+			 E T4t, T4P, T4O, T4Q;
+			 {
+			      E T49, T4l, T4u, T4G;
+			      T49 = W[6];
+			      T4l = W[7];
+			      T4t = FNMS(T4l, T4s, T49 * T4k);
+			      T4P = FMA(T4l, T4k, T49 * T4s);
+			      T4u = W[8];
+			      T4G = W[9];
+			      T4O = FMA(T4u, T4F, T4G * T4N);
+			      T4Q = FNMS(T4G, T4F, T4u * T4N);
+			 }
+			 Rp[WS(rs, 2)] = T4t - T4O;
+			 Ip[WS(rs, 2)] = T4P + T4Q;
+			 Rm[WS(rs, 2)] = T4t + T4O;
+			 Im[WS(rs, 2)] = T4Q - T4P;
+		    }
+		    {
+			 E T5r, T5x, T5w, T5y;
+			 {
+			      E T5n, T5p, T5s, T5u;
+			      T5n = W[54];
+			      T5p = W[55];
+			      T5r = FNMS(T5p, T5q, T5n * T5o);
+			      T5x = FMA(T5p, T5o, T5n * T5q);
+			      T5s = W[56];
+			      T5u = W[57];
+			      T5w = FMA(T5s, T5t, T5u * T5v);
+			      T5y = FNMS(T5u, T5t, T5s * T5v);
+			 }
+			 Rp[WS(rs, 14)] = T5r - T5w;
+			 Ip[WS(rs, 14)] = T5x + T5y;
+			 Rm[WS(rs, 14)] = T5r + T5w;
+			 Im[WS(rs, 14)] = T5y - T5x;
+		    }
+		    {
+			 E T4V, T51, T50, T52;
+			 {
+			      E T4R, T4T, T4W, T4Y;
+			      T4R = W[38];
+			      T4T = W[39];
+			      T4V = FNMS(T4T, T4U, T4R * T4S);
+			      T51 = FMA(T4T, T4S, T4R * T4U);
+			      T4W = W[40];
+			      T4Y = W[41];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T52 = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 Rp[WS(rs, 10)] = T4V - T50;
+			 Ip[WS(rs, 10)] = T51 + T52;
+			 Rm[WS(rs, 10)] = T4V + T50;
+			 Im[WS(rs, 10)] = T52 - T51;
+		    }
+		    {
+			 E T5b, T5l, T5k, T5m;
+			 {
+			      E T53, T57, T5c, T5g;
+			      T53 = W[22];
+			      T57 = W[23];
+			      T5b = FNMS(T57, T5a, T53 * T56);
+			      T5l = FMA(T57, T56, T53 * T5a);
+			      T5c = W[24];
+			      T5g = W[25];
+			      T5k = FMA(T5c, T5f, T5g * T5j);
+			      T5m = FNMS(T5g, T5f, T5c * T5j);
+			 }
+			 Rp[WS(rs, 6)] = T5b - T5k;
+			 Ip[WS(rs, 6)] = T5l + T5m;
+			 Rm[WS(rs, 6)] = T5b + T5k;
+			 Im[WS(rs, 6)] = T5m - T5l;
+		    }
+	       }
+	       {
+		    E T60, T6W, T6c, T6Y, T7e, T7u, T7a, T7s, T6R, T7x, T73, T7j, T6F, T7z, T71;
+		    E T7n;
+		    {
+			 E T5K, T5Z, T78, T79;
+			 T5K = T5C + T5J;
+			 T5Z = T5R + T5Y;
+			 T60 = T5K + T5Z;
+			 T6W = T5K - T5Z;
+			 {
+			      E T64, T6b, T7c, T7d;
+			      T64 = T62 + T63;
+			      T6b = T67 + T6a;
+			      T6c = T64 + T6b;
+			      T6Y = T6b - T64;
+			      T7c = T5R - T5Y;
+			      T7d = T6a - T67;
+			      T7e = T7c + T7d;
+			      T7u = T7d - T7c;
+			 }
+			 T78 = T5C - T5J;
+			 T79 = T63 - T62;
+			 T7a = T78 + T79;
+			 T7s = T78 - T79;
+			 {
+			      E T6Q, T7h, T6J, T7i, T6H, T6I;
+			      T6Q = T6M + T6P;
+			      T7h = T6h - T6o;
+			      T6H = FNMS(KP555570233, T6s, KP831469612 * T6v);
+			      T6I = FMA(KP555570233, T6z, KP831469612 * T6C);
+			      T6J = T6H + T6I;
+			      T7i = T6H - T6I;
+			      T6R = T6J + T6Q;
+			      T7x = T7h - T7i;
+			      T73 = T6Q - T6J;
+			      T7j = T7h + T7i;
+			 }
+			 {
+			      E T6p, T7m, T6E, T7l, T6w, T6D;
+			      T6p = T6h + T6o;
+			      T7m = T6P - T6M;
+			      T6w = FMA(KP831469612, T6s, KP555570233 * T6v);
+			      T6D = FNMS(KP555570233, T6C, KP831469612 * T6z);
+			      T6E = T6w + T6D;
+			      T7l = T6D - T6w;
+			      T6F = T6p + T6E;
+			      T7z = T7m - T7l;
+			      T71 = T6p - T6E;
+			      T7n = T7l + T7m;
+			 }
+		    }
+		    {
+			 E T6d, T6T, T6S, T6U;
+			 {
+			      E T5z, T61, T6e, T6G;
+			      T5z = W[2];
+			      T61 = W[3];
+			      T6d = FNMS(T61, T6c, T5z * T60);
+			      T6T = FMA(T61, T60, T5z * T6c);
+			      T6e = W[4];
+			      T6G = W[5];
+			      T6S = FMA(T6e, T6F, T6G * T6R);
+			      T6U = FNMS(T6G, T6F, T6e * T6R);
+			 }
+			 Rp[WS(rs, 1)] = T6d - T6S;
+			 Ip[WS(rs, 1)] = T6T + T6U;
+			 Rm[WS(rs, 1)] = T6d + T6S;
+			 Im[WS(rs, 1)] = T6U - T6T;
+		    }
+		    {
+			 E T7v, T7B, T7A, T7C;
+			 {
+			      E T7r, T7t, T7w, T7y;
+			      T7r = W[50];
+			      T7t = W[51];
+			      T7v = FNMS(T7t, T7u, T7r * T7s);
+			      T7B = FMA(T7t, T7s, T7r * T7u);
+			      T7w = W[52];
+			      T7y = W[53];
+			      T7A = FMA(T7w, T7x, T7y * T7z);
+			      T7C = FNMS(T7y, T7x, T7w * T7z);
+			 }
+			 Rp[WS(rs, 13)] = T7v - T7A;
+			 Ip[WS(rs, 13)] = T7B + T7C;
+			 Rm[WS(rs, 13)] = T7v + T7A;
+			 Im[WS(rs, 13)] = T7C - T7B;
+		    }
+		    {
+			 E T6Z, T75, T74, T76;
+			 {
+			      E T6V, T6X, T70, T72;
+			      T6V = W[34];
+			      T6X = W[35];
+			      T6Z = FNMS(T6X, T6Y, T6V * T6W);
+			      T75 = FMA(T6X, T6W, T6V * T6Y);
+			      T70 = W[36];
+			      T72 = W[37];
+			      T74 = FMA(T70, T71, T72 * T73);
+			      T76 = FNMS(T72, T71, T70 * T73);
+			 }
+			 Rp[WS(rs, 9)] = T6Z - T74;
+			 Ip[WS(rs, 9)] = T75 + T76;
+			 Rm[WS(rs, 9)] = T6Z + T74;
+			 Im[WS(rs, 9)] = T76 - T75;
+		    }
+		    {
+			 E T7f, T7p, T7o, T7q;
+			 {
+			      E T77, T7b, T7g, T7k;
+			      T77 = W[18];
+			      T7b = W[19];
+			      T7f = FNMS(T7b, T7e, T77 * T7a);
+			      T7p = FMA(T7b, T7a, T77 * T7e);
+			      T7g = W[20];
+			      T7k = W[21];
+			      T7o = FMA(T7g, T7j, T7k * T7n);
+			      T7q = FNMS(T7k, T7j, T7g * T7n);
+			 }
+			 Rp[WS(rs, 5)] = T7f - T7o;
+			 Ip[WS(rs, 5)] = T7p + T7q;
+			 Rm[WS(rs, 5)] = T7f + T7o;
+			 Im[WS(rs, 5)] = T7q - T7p;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cbdft2_32", twinstr, &GENUS, {404, 114, 94, 0} };
+
+void X(codelet_hc2cbdft2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include hc2cb.h */
+
+/*
+ * This function contains 30 FP additions, 12 FP multiplications,
+ * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
+ * 35 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Ty, TB, Tw, TE, TA, TF, Tz, TG, TC;
+	       {
+		    E T4, Tg, T3, Tm, Tc, T5, Th, Ti;
+		    {
+			 E T1, T2, Ta, Tb;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 1)];
+			 Ta = Ip[0];
+			 Tb = Im[WS(rs, 1)];
+			 T4 = Rp[WS(rs, 1)];
+			 Tg = T1 - T2;
+			 T3 = T1 + T2;
+			 Tm = Ta - Tb;
+			 Tc = Ta + Tb;
+			 T5 = Rm[0];
+			 Th = Ip[WS(rs, 1)];
+			 Ti = Im[0];
+		    }
+		    {
+			 E T8, Td, T7, Ts, To, Tv, Tk, Te, Tf;
+			 T8 = W[0];
+			 {
+			      E T9, T6, Tn, Tj;
+			      T9 = T4 - T5;
+			      T6 = T4 + T5;
+			      Tn = Th - Ti;
+			      Tj = Th + Ti;
+			      Ty = Tc - T9;
+			      Td = T9 + Tc;
+			      T7 = T3 + T6;
+			      Ts = T3 - T6;
+			      To = Tm + Tn;
+			      Tv = Tm - Tn;
+			      TB = Tg + Tj;
+			      Tk = Tg - Tj;
+			      Te = T8 * Td;
+			 }
+			 Tf = W[1];
+			 {
+			      E Tr, Tu, Tt, TD, Tx, Tp, Tl, Tq;
+			      Tr = W[2];
+			      Tp = T8 * Tk;
+			      Tu = W[3];
+			      Tl = FMA(Tf, Tk, Te);
+			      Tt = Tr * Ts;
+			      Tq = FNMS(Tf, Td, Tp);
+			      TD = Tu * Ts;
+			      Rm[0] = T7 + Tl;
+			      Rp[0] = T7 - Tl;
+			      Im[0] = Tq - To;
+			      Ip[0] = To + Tq;
+			      Tx = W[4];
+			      Tw = FNMS(Tu, Tv, Tt);
+			      TE = FMA(Tr, Tv, TD);
+			      TA = W[5];
+			      TF = Tx * TB;
+			      Tz = Tx * Ty;
+			 }
+		    }
+	       }
+	       TG = FNMS(TA, Ty, TF);
+	       TC = FMA(TA, TB, Tz);
+	       Im[WS(rs, 1)] = TG - TE;
+	       Ip[WS(rs, 1)] = TE + TG;
+	       Rm[WS(rs, 1)] = Tw + TC;
+	       Rp[WS(rs, 1)] = Tw - TC;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, {24, 6, 6, 0} };
+
+void X(codelet_hc2cbdft2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include hc2cb.h */
+
+/*
+ * This function contains 30 FP additions, 12 FP multiplications,
+ * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
+ * 19 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T3, Tl, T6, Tm, Td, Tj, Tx, Tv, Ts, Tq;
+	       {
+		    E Tf, Tc, T9, Ti;
+		    {
+			 E T1, T2, Ta, Tb;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 1)];
+			 T3 = T1 + T2;
+			 Tf = T1 - T2;
+			 Ta = Ip[0];
+			 Tb = Im[WS(rs, 1)];
+			 Tc = Ta + Tb;
+			 Tl = Ta - Tb;
+		    }
+		    {
+			 E T4, T5, Tg, Th;
+			 T4 = Rp[WS(rs, 1)];
+			 T5 = Rm[0];
+			 T6 = T4 + T5;
+			 T9 = T4 - T5;
+			 Tg = Ip[WS(rs, 1)];
+			 Th = Im[0];
+			 Ti = Tg + Th;
+			 Tm = Tg - Th;
+		    }
+		    Td = T9 + Tc;
+		    Tj = Tf - Ti;
+		    Tx = Tf + Ti;
+		    Tv = Tc - T9;
+		    Ts = Tl - Tm;
+		    Tq = T3 - T6;
+	       }
+	       {
+		    E T7, Tn, Tk, To, T8, Te;
+		    T7 = T3 + T6;
+		    Tn = Tl + Tm;
+		    T8 = W[0];
+		    Te = W[1];
+		    Tk = FMA(T8, Td, Te * Tj);
+		    To = FNMS(Te, Td, T8 * Tj);
+		    Rp[0] = T7 - Tk;
+		    Ip[0] = Tn + To;
+		    Rm[0] = T7 + Tk;
+		    Im[0] = To - Tn;
+	       }
+	       {
+		    E Tt, Tz, Ty, TA;
+		    {
+			 E Tp, Tr, Tu, Tw;
+			 Tp = W[2];
+			 Tr = W[3];
+			 Tt = FNMS(Tr, Ts, Tp * Tq);
+			 Tz = FMA(Tr, Tq, Tp * Ts);
+			 Tu = W[4];
+			 Tw = W[5];
+			 Ty = FMA(Tu, Tv, Tw * Tx);
+			 TA = FNMS(Tw, Tv, Tu * Tx);
+		    }
+		    Rp[WS(rs, 1)] = Tt - Ty;
+		    Ip[WS(rs, 1)] = Tz + TA;
+		    Rm[WS(rs, 1)] = Tt + Ty;
+		    Im[WS(rs, 1)] = TA - Tz;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, {24, 6, 6, 0} };
+
+void X(codelet_hc2cbdft2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include hc2cb.h */
+
+/*
+ * This function contains 82 FP additions, 36 FP multiplications,
+ * (or, 60 additions, 14 multiplications, 22 fused multiply/add),
+ * 55 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1m, T1r, T1i, T1u, T1o, T1v, T1n, T1w, T1s;
+	       {
+		    E T1k, Tl, T1p, TE, TP, T1g, TM, T1b, T1f, T1a, TU, Tf, T1l, TH, Tw;
+		    E T1q;
+		    {
+			 E TA, T3, TN, Tk, Th, T6, TO, TD, Tb, Tm, Ta, TK, Tp, Tc, Ts;
+			 E Tt;
+			 {
+			      E T4, T5, TB, TC;
+			      {
+				   E T1, T2, Ti, Tj;
+				   T1 = Rp[0];
+				   T2 = Rm[WS(rs, 3)];
+				   Ti = Ip[0];
+				   Tj = Im[WS(rs, 3)];
+				   T4 = Rp[WS(rs, 2)];
+				   TA = T1 - T2;
+				   T3 = T1 + T2;
+				   TN = Ti - Tj;
+				   Tk = Ti + Tj;
+				   T5 = Rm[WS(rs, 1)];
+				   TB = Ip[WS(rs, 2)];
+				   TC = Im[WS(rs, 1)];
+			      }
+			      {
+				   E T8, T9, Tn, To;
+				   T8 = Rp[WS(rs, 1)];
+				   Th = T4 - T5;
+				   T6 = T4 + T5;
+				   TO = TB - TC;
+				   TD = TB + TC;
+				   T9 = Rm[WS(rs, 2)];
+				   Tn = Ip[WS(rs, 1)];
+				   To = Im[WS(rs, 2)];
+				   Tb = Rm[0];
+				   Tm = T8 - T9;
+				   Ta = T8 + T9;
+				   TK = Tn - To;
+				   Tp = Tn + To;
+				   Tc = Rp[WS(rs, 3)];
+				   Ts = Im[0];
+				   Tt = Ip[WS(rs, 3)];
+			      }
+			 }
+			 {
+			      E Tr, Td, Tu, TL, Te, T7;
+			      T1k = Tk - Th;
+			      Tl = Th + Tk;
+			      Tr = Tb - Tc;
+			      Td = Tb + Tc;
+			      TL = Tt - Ts;
+			      Tu = Ts + Tt;
+			      T1p = TA + TD;
+			      TE = TA - TD;
+			      TP = TN + TO;
+			      T1g = TN - TO;
+			      TM = TK + TL;
+			      T1b = TL - TK;
+			      T1f = Ta - Td;
+			      Te = Ta + Td;
+			      T1a = T3 - T6;
+			      T7 = T3 + T6;
+			      {
+				   E Tq, TF, TG, Tv;
+				   Tq = Tm + Tp;
+				   TF = Tm - Tp;
+				   TG = Tr - Tu;
+				   Tv = Tr + Tu;
+				   TU = T7 - Te;
+				   Tf = T7 + Te;
+				   T1l = TF - TG;
+				   TH = TF + TG;
+				   Tw = Tq - Tv;
+				   T1q = Tq + Tv;
+			      }
+			 }
+		    }
+		    {
+			 E TX, T10, T1c, T13, T1h, T1E, T1H, T1C, T1K, T1G, T1L, T1F;
+			 {
+			      E TQ, Tx, T1y, TI, Tg, Tz;
+			      TX = TP - TM;
+			      TQ = TM + TP;
+			      Tx = FMA(KP707106781, Tw, Tl);
+			      T10 = FNMS(KP707106781, Tw, Tl);
+			      T1c = T1a + T1b;
+			      T1y = T1a - T1b;
+			      T13 = FNMS(KP707106781, TH, TE);
+			      TI = FMA(KP707106781, TH, TE);
+			      Tg = W[0];
+			      Tz = W[1];
+			      {
+				   E T1B, T1A, T1x, T1J, T1z, T1D;
+				   {
+					E TR, Ty, TS, TJ;
+					T1B = T1g - T1f;
+					T1h = T1f + T1g;
+					T1A = W[11];
+					TR = Tg * TI;
+					Ty = Tg * Tx;
+					T1x = W[10];
+					T1J = T1A * T1y;
+					TS = FNMS(Tz, Tx, TR);
+					TJ = FMA(Tz, TI, Ty);
+					T1z = T1x * T1y;
+					T1m = FMA(KP707106781, T1l, T1k);
+					T1E = FNMS(KP707106781, T1l, T1k);
+					Im[0] = TS - TQ;
+					Ip[0] = TQ + TS;
+					Rm[0] = Tf + TJ;
+					Rp[0] = Tf - TJ;
+					T1H = FMA(KP707106781, T1q, T1p);
+					T1r = FNMS(KP707106781, T1q, T1p);
+					T1D = W[12];
+				   }
+				   T1C = FNMS(T1A, T1B, T1z);
+				   T1K = FMA(T1x, T1B, T1J);
+				   T1G = W[13];
+				   T1L = T1D * T1H;
+				   T1F = T1D * T1E;
+			      }
+			 }
+			 {
+			      E TY, T16, T12, T17, T11;
+			      {
+				   E TW, TT, T15, TV, TZ, T1M, T1I;
+				   TW = W[7];
+				   T1M = FNMS(T1G, T1E, T1L);
+				   T1I = FMA(T1G, T1H, T1F);
+				   TT = W[6];
+				   T15 = TW * TU;
+				   Im[WS(rs, 3)] = T1M - T1K;
+				   Ip[WS(rs, 3)] = T1K + T1M;
+				   Rm[WS(rs, 3)] = T1C + T1I;
+				   Rp[WS(rs, 3)] = T1C - T1I;
+				   TV = TT * TU;
+				   TZ = W[8];
+				   TY = FNMS(TW, TX, TV);
+				   T16 = FMA(TT, TX, T15);
+				   T12 = W[9];
+				   T17 = TZ * T13;
+				   T11 = TZ * T10;
+			      }
+			      {
+				   E T1e, T19, T1t, T1d, T1j, T18, T14;
+				   T1e = W[3];
+				   T18 = FNMS(T12, T10, T17);
+				   T14 = FMA(T12, T13, T11);
+				   T19 = W[2];
+				   T1t = T1e * T1c;
+				   Im[WS(rs, 2)] = T18 - T16;
+				   Ip[WS(rs, 2)] = T16 + T18;
+				   Rm[WS(rs, 2)] = TY + T14;
+				   Rp[WS(rs, 2)] = TY - T14;
+				   T1d = T19 * T1c;
+				   T1j = W[4];
+				   T1i = FNMS(T1e, T1h, T1d);
+				   T1u = FMA(T19, T1h, T1t);
+				   T1o = W[5];
+				   T1v = T1j * T1r;
+				   T1n = T1j * T1m;
+			      }
+			 }
+		    }
+	       }
+	       T1w = FNMS(T1o, T1m, T1v);
+	       T1s = FMA(T1o, T1r, T1n);
+	       Im[WS(rs, 1)] = T1w - T1u;
+	       Ip[WS(rs, 1)] = T1u + T1w;
+	       Rm[WS(rs, 1)] = T1i + T1s;
+	       Rp[WS(rs, 1)] = T1i - T1s;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {60, 14, 22, 0} };
+
+void X(codelet_hc2cbdft2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include hc2cb.h */
+
+/*
+ * This function contains 82 FP additions, 32 FP multiplications,
+ * (or, 68 additions, 18 multiplications, 14 fused multiply/add),
+ * 30 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
+	       E T1i;
+	       {
+		    E T3, TC, Tk, TM, T6, Th, TF, TN;
+		    {
+			 E T1, T2, Ti, Tj;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 3)];
+			 T3 = T1 + T2;
+			 TC = T1 - T2;
+			 Ti = Ip[0];
+			 Tj = Im[WS(rs, 3)];
+			 Tk = Ti + Tj;
+			 TM = Ti - Tj;
+		    }
+		    {
+			 E T4, T5, TD, TE;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 1)];
+			 T6 = T4 + T5;
+			 Th = T4 - T5;
+			 TD = Ip[WS(rs, 2)];
+			 TE = Im[WS(rs, 1)];
+			 TF = TD + TE;
+			 TN = TD - TE;
+		    }
+		    T7 = T3 + T6;
+		    T1d = Tk - Th;
+		    T1h = TC + TF;
+		    Tl = Th + Tk;
+		    TG = TC - TF;
+		    T14 = T3 - T6;
+		    T19 = TM - TN;
+		    TO = TM + TN;
+	       }
+	       {
+		    E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
+		    {
+			 E T8, T9, Tn, To;
+			 T8 = Rp[WS(rs, 1)];
+			 T9 = Rm[WS(rs, 2)];
+			 Ta = T8 + T9;
+			 Tm = T8 - T9;
+			 Tn = Ip[WS(rs, 1)];
+			 To = Im[WS(rs, 2)];
+			 Tp = Tn + To;
+			 TJ = Tn - To;
+		    }
+		    {
+			 E Tb, Tc, Ts, Tt;
+			 Tb = Rm[0];
+			 Tc = Rp[WS(rs, 3)];
+			 Td = Tb + Tc;
+			 Tr = Tb - Tc;
+			 Ts = Im[0];
+			 Tt = Ip[WS(rs, 3)];
+			 Tu = Ts + Tt;
+			 TK = Tt - Ts;
+		    }
+		    Te = Ta + Td;
+		    TL = TJ + TK;
+		    T18 = Ta - Td;
+		    T15 = TK - TJ;
+		    {
+			 E Tz, TA, Tq, Tv;
+			 Tz = Tm - Tp;
+			 TA = Tr - Tu;
+			 TB = KP707106781 * (Tz + TA);
+			 T1e = KP707106781 * (Tz - TA);
+			 Tq = Tm + Tp;
+			 Tv = Tr + Tu;
+			 Tw = KP707106781 * (Tq - Tv);
+			 T1i = KP707106781 * (Tq + Tv);
+		    }
+	       }
+	       {
+		    E Tf, TP, TI, TQ;
+		    Tf = T7 + Te;
+		    TP = TL + TO;
+		    {
+			 E Tx, TH, Tg, Ty;
+			 Tx = Tl + Tw;
+			 TH = TB + TG;
+			 Tg = W[0];
+			 Ty = W[1];
+			 TI = FMA(Tg, Tx, Ty * TH);
+			 TQ = FNMS(Ty, Tx, Tg * TH);
+		    }
+		    Rp[0] = Tf - TI;
+		    Ip[0] = TP + TQ;
+		    Rm[0] = Tf + TI;
+		    Im[0] = TQ - TP;
+	       }
+	       {
+		    E T1r, T1x, T1w, T1y;
+		    {
+			 E T1o, T1q, T1n, T1p;
+			 T1o = T14 - T15;
+			 T1q = T19 - T18;
+			 T1n = W[10];
+			 T1p = W[11];
+			 T1r = FNMS(T1p, T1q, T1n * T1o);
+			 T1x = FMA(T1p, T1o, T1n * T1q);
+		    }
+		    {
+			 E T1t, T1v, T1s, T1u;
+			 T1t = T1d - T1e;
+			 T1v = T1i + T1h;
+			 T1s = W[12];
+			 T1u = W[13];
+			 T1w = FMA(T1s, T1t, T1u * T1v);
+			 T1y = FNMS(T1u, T1t, T1s * T1v);
+		    }
+		    Rp[WS(rs, 3)] = T1r - T1w;
+		    Ip[WS(rs, 3)] = T1x + T1y;
+		    Rm[WS(rs, 3)] = T1r + T1w;
+		    Im[WS(rs, 3)] = T1y - T1x;
+	       }
+	       {
+		    E TV, T11, T10, T12;
+		    {
+			 E TS, TU, TR, TT;
+			 TS = T7 - Te;
+			 TU = TO - TL;
+			 TR = W[6];
+			 TT = W[7];
+			 TV = FNMS(TT, TU, TR * TS);
+			 T11 = FMA(TT, TS, TR * TU);
+		    }
+		    {
+			 E TX, TZ, TW, TY;
+			 TX = Tl - Tw;
+			 TZ = TG - TB;
+			 TW = W[8];
+			 TY = W[9];
+			 T10 = FMA(TW, TX, TY * TZ);
+			 T12 = FNMS(TY, TX, TW * TZ);
+		    }
+		    Rp[WS(rs, 2)] = TV - T10;
+		    Ip[WS(rs, 2)] = T11 + T12;
+		    Rm[WS(rs, 2)] = TV + T10;
+		    Im[WS(rs, 2)] = T12 - T11;
+	       }
+	       {
+		    E T1b, T1l, T1k, T1m;
+		    {
+			 E T16, T1a, T13, T17;
+			 T16 = T14 + T15;
+			 T1a = T18 + T19;
+			 T13 = W[2];
+			 T17 = W[3];
+			 T1b = FNMS(T17, T1a, T13 * T16);
+			 T1l = FMA(T17, T16, T13 * T1a);
+		    }
+		    {
+			 E T1f, T1j, T1c, T1g;
+			 T1f = T1d + T1e;
+			 T1j = T1h - T1i;
+			 T1c = W[4];
+			 T1g = W[5];
+			 T1k = FMA(T1c, T1f, T1g * T1j);
+			 T1m = FNMS(T1g, T1f, T1c * T1j);
+		    }
+		    Rp[WS(rs, 1)] = T1b - T1k;
+		    Ip[WS(rs, 1)] = T1l + T1m;
+		    Rm[WS(rs, 1)] = T1b + T1k;
+		    Im[WS(rs, 1)] = T1m - T1l;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {68, 18, 14, 0} };
+
+void X(codelet_hc2cbdft2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:04 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include hc2cb.h */
+
+/*
+ * This function contains 122 FP additions, 72 FP multiplications,
+ * (or, 68 additions, 18 multiplications, 54 fused multiply/add),
+ * 95 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T2d, T2f;
+	       {
+		    E T1g, TQ, T1z, TZ, Tu, T23, T1p, T14, Tt, T27, T13, Tj, Tz, T1i, T18;
+		    E TJ, TS, T19, Ty, TA;
+		    {
+			 E Tl, T3, T7, Tm, T6, Tr, TY, T1n, Th, T8, T1, T2;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 4)];
+			 {
+			      E Te, Tp, Td, Tf, Tb, Tc;
+			      Tb = Rp[WS(rs, 4)];
+			      Tc = Rm[0];
+			      Te = Rm[WS(rs, 3)];
+			      Tl = T1 - T2;
+			      T3 = T1 + T2;
+			      Tp = Tb - Tc;
+			      Td = Tb + Tc;
+			      Tf = Rp[WS(rs, 1)];
+			      {
+				   E T4, T5, Tq, Tg;
+				   T4 = Rp[WS(rs, 2)];
+				   T5 = Rm[WS(rs, 2)];
+				   T7 = Rm[WS(rs, 1)];
+				   Tq = Te - Tf;
+				   Tg = Te + Tf;
+				   Tm = T4 - T5;
+				   T6 = T4 + T5;
+				   Tr = Tp + Tq;
+				   TY = Tp - Tq;
+				   T1n = Td - Tg;
+				   Th = Td + Tg;
+				   T8 = Rp[WS(rs, 3)];
+			      }
+			 }
+			 {
+			      E TO, Tn, T9, TP;
+			      TO = Ip[0];
+			      Tn = T7 - T8;
+			      T9 = T7 + T8;
+			      TP = Im[WS(rs, 4)];
+			      {
+				   E TG, TH, TF, T16, TD, TE, Ti;
+				   TD = Ip[WS(rs, 4)];
+				   {
+					E TX, To, T1o, Ta, Ts;
+					TX = Tm - Tn;
+					To = Tm + Tn;
+					T1o = T6 - T9;
+					Ta = T6 + T9;
+					T1g = TO - TP;
+					TQ = TO + TP;
+					T1z = FNMS(KP618033988, TX, TY);
+					TZ = FMA(KP618033988, TY, TX);
+					Ts = To + Tr;
+					Tu = To - Tr;
+					T23 = FMA(KP618033988, T1n, T1o);
+					T1p = FNMS(KP618033988, T1o, T1n);
+					Ti = Ta + Th;
+					T14 = Ta - Th;
+					Tt = FNMS(KP250000000, Ts, Tl);
+					T27 = Tl + Ts;
+					TE = Im[0];
+				   }
+				   T13 = FNMS(KP250000000, Ti, T3);
+				   Tj = T3 + Ti;
+				   TG = Im[WS(rs, 3)];
+				   TH = Ip[WS(rs, 1)];
+				   TF = TD + TE;
+				   T16 = TD - TE;
+				   {
+					E Tw, T17, TI, Tx;
+					Tw = Ip[WS(rs, 2)];
+					T17 = TH - TG;
+					TI = TG + TH;
+					Tx = Im[WS(rs, 2)];
+					Tz = Im[WS(rs, 1)];
+					T1i = T16 + T17;
+					T18 = T16 - T17;
+					TJ = TF + TI;
+					TS = TF - TI;
+					T19 = Tw - Tx;
+					Ty = Tw + Tx;
+					TA = Ip[WS(rs, 3)];
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T26, T2y, T2a, T28, T1q, T1K, T24, T2k, T10, T1Q, T1A, T2q, T29, Tk, TN;
+			 E T2c, T1M, T1P, T2w, TM, T1O, T1S, T1s, T1x, T2m, T2p, T1w, T1C, T2o, T2s;
+			 E T12, T1f, T1G, T1J, T1I, T1E, T1e, T1U, T1W, T21, T2g, T2j, T20, T2e, T2i;
+			 E T2u, T1a, TB;
+			 T1a = TA - Tz;
+			 TB = Tz + TA;
+			 {
+			      E T1Y, T1c, T1u, T1t, T1N, TL, TK, Tv, T2n, T1v;
+			      {
+				   E T1l, TV, T1k, TU, T1b, T1h;
+				   T26 = W[9];
+				   T1b = T19 - T1a;
+				   T1h = T19 + T1a;
+				   {
+					E TC, TR, T1j, TT;
+					TC = Ty + TB;
+					TR = Ty - TB;
+					T1Y = FMA(KP618033988, T18, T1b);
+					T1c = FNMS(KP618033988, T1b, T18);
+					T1j = T1h + T1i;
+					T1l = T1h - T1i;
+					T1u = FNMS(KP618033988, TC, TJ);
+					TK = FMA(KP618033988, TJ, TC);
+					TT = TR + TS;
+					TV = TR - TS;
+					T2y = T1g + T1j;
+					T1k = FNMS(KP250000000, T1j, T1g);
+					T2a = TQ + TT;
+					TU = FNMS(KP250000000, TT, TQ);
+					T28 = T26 * T27;
+				   }
+				   {
+					E T22, T1m, T1y, TW;
+					T22 = FMA(KP559016994, T1l, T1k);
+					T1m = FNMS(KP559016994, T1l, T1k);
+					T1y = FNMS(KP559016994, TV, TU);
+					TW = FMA(KP559016994, TV, TU);
+					T1q = FNMS(KP951056516, T1p, T1m);
+					T1K = FMA(KP951056516, T1p, T1m);
+					T24 = FNMS(KP951056516, T23, T22);
+					T2k = FMA(KP951056516, T23, T22);
+					T10 = FMA(KP951056516, TZ, TW);
+					T1Q = FNMS(KP951056516, TZ, TW);
+					T1A = FMA(KP951056516, T1z, T1y);
+					T2q = FNMS(KP951056516, T1z, T1y);
+					T29 = W[8];
+				   }
+			      }
+			      Tv = FMA(KP559016994, Tu, Tt);
+			      T1t = FNMS(KP559016994, Tu, Tt);
+			      Tk = W[1];
+			      TN = W[0];
+			      T2c = T29 * T27;
+			      T1N = FMA(KP951056516, TK, Tv);
+			      TL = FNMS(KP951056516, TK, Tv);
+			      T1M = W[17];
+			      T1P = W[16];
+			      T2w = TN * TL;
+			      TM = Tk * TL;
+			      T1O = T1M * T1N;
+			      T1S = T1P * T1N;
+			      T2n = FMA(KP951056516, T1u, T1t);
+			      T1v = FNMS(KP951056516, T1u, T1t);
+			      T1s = W[5];
+			      T1x = W[4];
+			      T2m = W[13];
+			      T2p = W[12];
+			      T1w = T1s * T1v;
+			      T1C = T1x * T1v;
+			      T2o = T2m * T2n;
+			      T2s = T2p * T2n;
+			      {
+				   E T1X, T1d, T1H, T15, T2h, T1Z;
+				   T1X = FMA(KP559016994, T14, T13);
+				   T15 = FNMS(KP559016994, T14, T13);
+				   T12 = W[2];
+				   T1f = W[3];
+				   T1G = W[14];
+				   T1d = FMA(KP951056516, T1c, T15);
+				   T1H = FNMS(KP951056516, T1c, T15);
+				   T1J = W[15];
+				   T1I = T1G * T1H;
+				   T1E = T1f * T1d;
+				   T1e = T12 * T1d;
+				   T1U = T1J * T1H;
+				   T2h = FNMS(KP951056516, T1Y, T1X);
+				   T1Z = FMA(KP951056516, T1Y, T1X);
+				   T1W = W[6];
+				   T21 = W[7];
+				   T2g = W[10];
+				   T2j = W[11];
+				   T20 = T1W * T1Z;
+				   T2e = T21 * T1Z;
+				   T2i = T2g * T2h;
+				   T2u = T2j * T2h;
+			      }
+			 }
+			 {
+			      E T1D, T1F, T1L, T1R;
+			      {
+				   E T11, T2x, T1r, T1B;
+				   T11 = FMA(TN, T10, TM);
+				   T2x = FNMS(Tk, T10, T2w);
+				   T1r = FNMS(T1f, T1q, T1e);
+				   T1B = FMA(T1x, T1A, T1w);
+				   Rm[0] = Tj + T11;
+				   Rp[0] = Tj - T11;
+				   Ip[0] = T2x + T2y;
+				   Im[0] = T2x - T2y;
+				   Rp[WS(rs, 1)] = T1r - T1B;
+				   Rm[WS(rs, 1)] = T1B + T1r;
+				   T1D = FNMS(T1s, T1A, T1C);
+				   T1F = FMA(T12, T1q, T1E);
+				   T1L = FNMS(T1J, T1K, T1I);
+				   T1R = FMA(T1P, T1Q, T1O);
+			      }
+			      {
+				   E T1T, T1V, T2t, T2v;
+				   T1T = FNMS(T1M, T1Q, T1S);
+				   Ip[WS(rs, 1)] = T1D + T1F;
+				   Im[WS(rs, 1)] = T1D - T1F;
+				   Rm[WS(rs, 4)] = T1R + T1L;
+				   Rp[WS(rs, 4)] = T1L - T1R;
+				   T1V = FMA(T1G, T1K, T1U);
+				   T2t = FNMS(T2m, T2q, T2s);
+				   T2v = FMA(T2g, T2k, T2u);
+				   {
+					E T2l, T2r, T25, T2b;
+					T2l = FNMS(T2j, T2k, T2i);
+					Ip[WS(rs, 4)] = T1T + T1V;
+					Im[WS(rs, 4)] = T1T - T1V;
+					Ip[WS(rs, 3)] = T2t + T2v;
+					Im[WS(rs, 3)] = T2t - T2v;
+					T2r = FMA(T2p, T2q, T2o);
+					T25 = FNMS(T21, T24, T20);
+					T2b = FMA(T29, T2a, T28);
+					T2d = FNMS(T26, T2a, T2c);
+					Rm[WS(rs, 3)] = T2r + T2l;
+					Rp[WS(rs, 3)] = T2l - T2r;
+					Rm[WS(rs, 2)] = T2b + T25;
+					Rp[WS(rs, 2)] = T25 - T2b;
+					T2f = FMA(T1W, T24, T2e);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 2)] = T2d + T2f;
+	       Im[WS(rs, 2)] = T2d - T2f;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, {68, 18, 54, 0} };
+
+void X(codelet_hc2cbdft_10) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include hc2cb.h */
+
+/*
+ * This function contains 122 FP additions, 60 FP multiplications,
+ * (or, 92 additions, 30 multiplications, 30 fused multiply/add),
+ * 61 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T3, TS, TR, T13, Ti, T12, TT, TU, T1g, T1T, Tr, T1s, TJ, T1h, TG;
+	       E T1m, TK, TL, T1k, T1l, T1b, T1P, TY, T1w;
+	       {
+		    E Td, To, Tg, Tp, Th, TQ, T6, Tl, T9, Tm, Ta, TP, T1, T2;
+		    T1 = Rp[0];
+		    T2 = Rm[WS(rs, 4)];
+		    T3 = T1 + T2;
+		    TS = T1 - T2;
+		    {
+			 E Tb, Tc, Te, Tf;
+			 Tb = Rp[WS(rs, 4)];
+			 Tc = Rm[0];
+			 Td = Tb + Tc;
+			 To = Tb - Tc;
+			 Te = Rm[WS(rs, 3)];
+			 Tf = Rp[WS(rs, 1)];
+			 Tg = Te + Tf;
+			 Tp = Te - Tf;
+		    }
+		    Th = Td + Tg;
+		    TQ = To + Tp;
+		    {
+			 E T4, T5, T7, T8;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 2)];
+			 T6 = T4 + T5;
+			 Tl = T4 - T5;
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 3)];
+			 T9 = T7 + T8;
+			 Tm = T7 - T8;
+		    }
+		    Ta = T6 + T9;
+		    TP = Tl + Tm;
+		    TR = KP559016994 * (TP - TQ);
+		    T13 = KP559016994 * (Ta - Th);
+		    Ti = Ta + Th;
+		    T12 = FNMS(KP250000000, Ti, T3);
+		    TT = TP + TQ;
+		    TU = FNMS(KP250000000, TT, TS);
+		    {
+			 E T1e, T1f, Tn, Tq;
+			 T1e = T6 - T9;
+			 T1f = Td - Tg;
+			 T1g = FNMS(KP951056516, T1f, KP587785252 * T1e);
+			 T1T = FMA(KP951056516, T1e, KP587785252 * T1f);
+			 Tn = Tl - Tm;
+			 Tq = To - Tp;
+			 Tr = FMA(KP951056516, Tn, KP587785252 * Tq);
+			 T1s = FNMS(KP951056516, Tq, KP587785252 * Tn);
+		    }
+	       }
+	       {
+		    E TB, T18, TE, T19, TF, T1j, Tu, T15, Tx, T16, Ty, T1i, TH, TI;
+		    TH = Ip[0];
+		    TI = Im[WS(rs, 4)];
+		    TJ = TH + TI;
+		    T1h = TH - TI;
+		    {
+			 E Tz, TA, TC, TD;
+			 Tz = Ip[WS(rs, 4)];
+			 TA = Im[0];
+			 TB = Tz + TA;
+			 T18 = Tz - TA;
+			 TC = Im[WS(rs, 3)];
+			 TD = Ip[WS(rs, 1)];
+			 TE = TC + TD;
+			 T19 = TD - TC;
+		    }
+		    TF = TB - TE;
+		    T1j = T18 + T19;
+		    {
+			 E Ts, Tt, Tv, Tw;
+			 Ts = Ip[WS(rs, 2)];
+			 Tt = Im[WS(rs, 2)];
+			 Tu = Ts + Tt;
+			 T15 = Ts - Tt;
+			 Tv = Im[WS(rs, 1)];
+			 Tw = Ip[WS(rs, 3)];
+			 Tx = Tv + Tw;
+			 T16 = Tw - Tv;
+		    }
+		    Ty = Tu - Tx;
+		    T1i = T15 + T16;
+		    TG = KP559016994 * (Ty - TF);
+		    T1m = KP559016994 * (T1i - T1j);
+		    TK = Ty + TF;
+		    TL = FNMS(KP250000000, TK, TJ);
+		    T1k = T1i + T1j;
+		    T1l = FNMS(KP250000000, T1k, T1h);
+		    {
+			 E T17, T1a, TW, TX;
+			 T17 = T15 - T16;
+			 T1a = T18 - T19;
+			 T1b = FNMS(KP951056516, T1a, KP587785252 * T17);
+			 T1P = FMA(KP951056516, T17, KP587785252 * T1a);
+			 TW = Tu + Tx;
+			 TX = TB + TE;
+			 TY = FMA(KP951056516, TW, KP587785252 * TX);
+			 T1w = FNMS(KP951056516, TX, KP587785252 * TW);
+		    }
+	       }
+	       {
+		    E Tj, T2g, TN, T1H, T1U, T26, TZ, T1J, T1Q, T24, T1c, T1C, T1t, T29, T1o;
+		    E T1E, T1x, T2b, T20, T21, TM, T1S, TV;
+		    Tj = T3 + Ti;
+		    T2g = T1h + T1k;
+		    TM = TG + TL;
+		    TN = Tr + TM;
+		    T1H = TM - Tr;
+		    T1S = T1m + T1l;
+		    T1U = T1S - T1T;
+		    T26 = T1T + T1S;
+		    TV = TR + TU;
+		    TZ = TV - TY;
+		    T1J = TV + TY;
+		    {
+			 E T1O, T14, T1r, T1n, T1v;
+			 T1O = T13 + T12;
+			 T1Q = T1O + T1P;
+			 T24 = T1O - T1P;
+			 T14 = T12 - T13;
+			 T1c = T14 - T1b;
+			 T1C = T14 + T1b;
+			 T1r = TL - TG;
+			 T1t = T1r - T1s;
+			 T29 = T1s + T1r;
+			 T1n = T1l - T1m;
+			 T1o = T1g + T1n;
+			 T1E = T1n - T1g;
+			 T1v = TU - TR;
+			 T1x = T1v + T1w;
+			 T2b = T1v - T1w;
+			 {
+			      E T1X, T1Z, T1W, T1Y;
+			      T1X = TS + TT;
+			      T1Z = TJ + TK;
+			      T1W = W[9];
+			      T1Y = W[8];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T21 = FNMS(T1W, T1Z, T1Y * T1X);
+			 }
+		    }
+		    {
+			 E T10, T2f, Tk, TO;
+			 Tk = W[0];
+			 TO = W[1];
+			 T10 = FMA(Tk, TN, TO * TZ);
+			 T2f = FNMS(TO, TN, Tk * TZ);
+			 Rp[0] = Tj - T10;
+			 Ip[0] = T2f + T2g;
+			 Rm[0] = Tj + T10;
+			 Im[0] = T2f - T2g;
+		    }
+		    {
+			 E T1V, T22, T1N, T1R;
+			 T1N = W[6];
+			 T1R = W[7];
+			 T1V = FNMS(T1R, T1U, T1N * T1Q);
+			 T22 = FMA(T1R, T1Q, T1N * T1U);
+			 Rp[WS(rs, 2)] = T1V - T20;
+			 Ip[WS(rs, 2)] = T21 + T22;
+			 Rm[WS(rs, 2)] = T20 + T1V;
+			 Im[WS(rs, 2)] = T21 - T22;
+		    }
+		    {
+			 E T1p, T1A, T1y, T1z;
+			 {
+			      E T11, T1d, T1q, T1u;
+			      T11 = W[2];
+			      T1d = W[3];
+			      T1p = FNMS(T1d, T1o, T11 * T1c);
+			      T1A = FMA(T1d, T1c, T11 * T1o);
+			      T1q = W[4];
+			      T1u = W[5];
+			      T1y = FMA(T1q, T1t, T1u * T1x);
+			      T1z = FNMS(T1u, T1t, T1q * T1x);
+			 }
+			 Rp[WS(rs, 1)] = T1p - T1y;
+			 Ip[WS(rs, 1)] = T1z + T1A;
+			 Rm[WS(rs, 1)] = T1y + T1p;
+			 Im[WS(rs, 1)] = T1z - T1A;
+		    }
+		    {
+			 E T1F, T1M, T1K, T1L;
+			 {
+			      E T1B, T1D, T1G, T1I;
+			      T1B = W[14];
+			      T1D = W[15];
+			      T1F = FNMS(T1D, T1E, T1B * T1C);
+			      T1M = FMA(T1D, T1C, T1B * T1E);
+			      T1G = W[16];
+			      T1I = W[17];
+			      T1K = FMA(T1G, T1H, T1I * T1J);
+			      T1L = FNMS(T1I, T1H, T1G * T1J);
+			 }
+			 Rp[WS(rs, 4)] = T1F - T1K;
+			 Ip[WS(rs, 4)] = T1L + T1M;
+			 Rm[WS(rs, 4)] = T1K + T1F;
+			 Im[WS(rs, 4)] = T1L - T1M;
+		    }
+		    {
+			 E T27, T2e, T2c, T2d;
+			 {
+			      E T23, T25, T28, T2a;
+			      T23 = W[10];
+			      T25 = W[11];
+			      T27 = FNMS(T25, T26, T23 * T24);
+			      T2e = FMA(T25, T24, T23 * T26);
+			      T28 = W[12];
+			      T2a = W[13];
+			      T2c = FMA(T28, T29, T2a * T2b);
+			      T2d = FNMS(T2a, T29, T28 * T2b);
+			 }
+			 Rp[WS(rs, 3)] = T27 - T2c;
+			 Ip[WS(rs, 3)] = T2d + T2e;
+			 Rm[WS(rs, 3)] = T2c + T27;
+			 Im[WS(rs, 3)] = T2d - T2e;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, {92, 30, 30, 0} };
+
+void X(codelet_hc2cbdft_10) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,635 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:04 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include hc2cb.h */
+
+/*
+ * This function contains 142 FP additions, 68 FP multiplications,
+ * (or, 96 additions, 22 multiplications, 46 fused multiply/add),
+ * 81 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T2S, T2V, T2w, T2Z, T2T, T2I, T2Q, T2Y, T2U, T2K, T2G, T30, T2W;
+	       {
+		    E Tb, T1Z, T2D, T1E, T1N, T2y, TD, T2t, T1U, T1e, T2o, TY, T1f, TI, T1g;
+		    E TN, Tm, T1V, T2z, T1H, T1Q, T2E, T19, T2u;
+		    {
+			 E T1c, TU, T1d, TX;
+			 {
+			      E Tu, T6, TT, TS, T5, Tt, Tw, Tx, TB, T9, Ty;
+			      {
+				   E T1, Tp, Tq, Tr, T4, T2, T3, T7, T8, Ts;
+				   T1 = Rp[0];
+				   T2 = Rp[WS(rs, 4)];
+				   T3 = Rm[WS(rs, 3)];
+				   Tp = Ip[0];
+				   Tq = Ip[WS(rs, 4)];
+				   Tr = Im[WS(rs, 3)];
+				   T4 = T2 + T3;
+				   Tu = T2 - T3;
+				   T6 = Rm[WS(rs, 5)];
+				   TT = Tr + Tq;
+				   Ts = Tq - Tr;
+				   TS = FNMS(KP500000000, T4, T1);
+				   T5 = T1 + T4;
+				   T7 = Rm[WS(rs, 1)];
+				   T8 = Rp[WS(rs, 2)];
+				   T1c = Tp + Ts;
+				   Tt = FNMS(KP500000000, Ts, Tp);
+				   Tw = Im[WS(rs, 5)];
+				   Tx = Im[WS(rs, 1)];
+				   TB = T7 - T8;
+				   T9 = T7 + T8;
+				   Ty = Ip[WS(rs, 2)];
+			      }
+			      {
+				   E T1L, Tv, Ta, TV, TW, Tz;
+				   T1L = FNMS(KP866025403, Tu, Tt);
+				   Tv = FMA(KP866025403, Tu, Tt);
+				   Ta = T6 + T9;
+				   TV = FNMS(KP500000000, T9, T6);
+				   TW = Tx + Ty;
+				   Tz = Tx - Ty;
+				   {
+					E TC, T1M, T1C, TA, T1D;
+					T1C = FMA(KP866025403, TT, TS);
+					TU = FNMS(KP866025403, TT, TS);
+					T1d = Tw + Tz;
+					TA = FNMS(KP500000000, Tz, Tw);
+					T1D = FNMS(KP866025403, TW, TV);
+					TX = FMA(KP866025403, TW, TV);
+					Tb = T5 + Ta;
+					T1Z = T5 - Ta;
+					TC = FNMS(KP866025403, TB, TA);
+					T1M = FMA(KP866025403, TB, TA);
+					T2D = T1C - T1D;
+					T1E = T1C + T1D;
+					T1N = T1L - T1M;
+					T2y = T1L + T1M;
+					TD = Tv + TC;
+					T2t = Tv - TC;
+				   }
+			      }
+			 }
+			 {
+			      E T12, Th, TH, TE, Tg, T11, T14, TK, T17, Tk, TL;
+			      {
+				   E Tc, TZ, TF, TG, Tf, Td, Te, Ti, Tj, T10;
+				   Tc = Rp[WS(rs, 3)];
+				   T1U = T1c + T1d;
+				   T1e = T1c - T1d;
+				   T2o = TU + TX;
+				   TY = TU - TX;
+				   Td = Rm[WS(rs, 4)];
+				   Te = Rm[0];
+				   TZ = Ip[WS(rs, 3)];
+				   TF = Im[WS(rs, 4)];
+				   TG = Im[0];
+				   Tf = Td + Te;
+				   T12 = Td - Te;
+				   Th = Rm[WS(rs, 2)];
+				   TH = TF - TG;
+				   T10 = TF + TG;
+				   TE = FNMS(KP500000000, Tf, Tc);
+				   Tg = Tc + Tf;
+				   Ti = Rp[WS(rs, 1)];
+				   Tj = Rp[WS(rs, 5)];
+				   T1f = TZ - T10;
+				   T11 = FMA(KP500000000, T10, TZ);
+				   T14 = Im[WS(rs, 2)];
+				   TK = Ip[WS(rs, 5)];
+				   T17 = Ti - Tj;
+				   Tk = Ti + Tj;
+				   TL = Ip[WS(rs, 1)];
+			      }
+			      {
+				   E T1O, T13, Tl, TJ, TM, T15;
+				   T1O = FNMS(KP866025403, T12, T11);
+				   T13 = FMA(KP866025403, T12, T11);
+				   Tl = Th + Tk;
+				   TJ = FNMS(KP500000000, Tk, Th);
+				   TM = TK - TL;
+				   T15 = TK + TL;
+				   {
+					E T18, T1P, T1F, T16, T1G;
+					T1F = FNMS(KP866025403, TH, TE);
+					TI = FMA(KP866025403, TH, TE);
+					T1g = T15 - T14;
+					T16 = FMA(KP500000000, T15, T14);
+					T1G = FNMS(KP866025403, TM, TJ);
+					TN = FMA(KP866025403, TM, TJ);
+					Tm = Tg + Tl;
+					T1V = Tg - Tl;
+					T18 = FNMS(KP866025403, T17, T16);
+					T1P = FMA(KP866025403, T17, T16);
+					T2z = T1F - T1G;
+					T1H = T1F + T1G;
+					T1Q = T1O - T1P;
+					T2E = T1O + T1P;
+					T19 = T13 + T18;
+					T2u = T13 - T18;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T20, T2p, T1v, T1s, T1q, T1y, T1u, T1z, T1t;
+			 {
+			      E T1m, Tn, T1a, T1p, T1i, To, TP, TR, T1h, TO;
+			      T1m = Tb - Tm;
+			      Tn = Tb + Tm;
+			      T20 = T1f - T1g;
+			      T1h = T1f + T1g;
+			      T2p = TI + TN;
+			      TO = TI - TN;
+			      T1a = TY - T19;
+			      T1v = TY + T19;
+			      T1p = T1e - T1h;
+			      T1i = T1e + T1h;
+			      To = W[0];
+			      T1s = TD - TO;
+			      TP = TD + TO;
+			      TR = W[1];
+			      {
+				   E T1l, T1o, T1n, T1x, T1r;
+				   {
+					E T1j, TQ, T1k, T1b;
+					T1j = To * T1a;
+					TQ = To * TP;
+					T1l = W[10];
+					T1k = FNMS(TR, TP, T1j);
+					T1b = FMA(TR, T1a, TQ);
+					T1o = W[11];
+					T1n = T1l * T1m;
+					Im[0] = T1k - T1i;
+					Ip[0] = T1i + T1k;
+					Rm[0] = Tn + T1b;
+					Rp[0] = Tn - T1b;
+					T1x = T1o * T1m;
+					T1r = W[12];
+				   }
+				   T1q = FNMS(T1o, T1p, T1n);
+				   T1y = FMA(T1l, T1p, T1x);
+				   T1u = W[13];
+				   T1z = T1r * T1v;
+				   T1t = T1r * T1s;
+			      }
+			 }
+			 {
+			      E T2e, T2h, T1S, T2j, T2f, T26, T2c, T2m, T2g, T24, T22;
+			      {
+				   E T2b, T1R, T27, T2a, T1B, T29, T2l, T1K, T1J, T1W, T21, T25, T2d, T23, T1X;
+				   E T1Y;
+				   {
+					E T1I, T28, T1A, T1w, T1T;
+					T1A = FNMS(T1u, T1s, T1z);
+					T1w = FMA(T1u, T1v, T1t);
+					T1I = T1E - T1H;
+					T28 = T1E + T1H;
+					T2b = T1N + T1Q;
+					T1R = T1N - T1Q;
+					Im[WS(rs, 3)] = T1A - T1y;
+					Ip[WS(rs, 3)] = T1y + T1A;
+					Rm[WS(rs, 3)] = T1q + T1w;
+					Rp[WS(rs, 3)] = T1q - T1w;
+					T27 = W[14];
+					T2a = W[15];
+					T1B = W[2];
+					T29 = T27 * T28;
+					T2l = T2a * T28;
+					T1K = W[3];
+					T1J = T1B * T1I;
+					T1W = T1U - T1V;
+					T2e = T1V + T1U;
+					T2h = T1Z - T20;
+					T21 = T1Z + T20;
+					T25 = T1K * T1I;
+					T1T = W[4];
+					T2d = W[16];
+					T23 = T1T * T21;
+					T1X = T1T * T1W;
+				   }
+				   T1S = FNMS(T1K, T1R, T1J);
+				   T2j = T2d * T2h;
+				   T2f = T2d * T2e;
+				   T26 = FMA(T1B, T1R, T25);
+				   T1Y = W[5];
+				   T2c = FNMS(T2a, T2b, T29);
+				   T2m = FMA(T27, T2b, T2l);
+				   T2g = W[17];
+				   T24 = FNMS(T1Y, T1W, T23);
+				   T22 = FMA(T1Y, T21, T1X);
+			      }
+			      {
+				   E T2L, T2O, T2P, T2v, T2N, T2X, T2n, T2s, T2A, T2F, T2r, T2H, T2R, T2J, T2B;
+				   E T2C;
+				   {
+					E T2q, T2k, T2i, T2M, T2x;
+					T2k = FNMS(T2g, T2e, T2j);
+					T2i = FMA(T2g, T2h, T2f);
+					Im[WS(rs, 1)] = T24 - T26;
+					Ip[WS(rs, 1)] = T24 + T26;
+					Rm[WS(rs, 1)] = T22 + T1S;
+					Rp[WS(rs, 1)] = T1S - T22;
+					Im[WS(rs, 4)] = T2k - T2m;
+					Ip[WS(rs, 4)] = T2k + T2m;
+					Rm[WS(rs, 4)] = T2i + T2c;
+					Rp[WS(rs, 4)] = T2c - T2i;
+					T2q = T2o + T2p;
+					T2M = T2o - T2p;
+					T2L = W[18];
+					T2O = W[19];
+					T2P = T2t - T2u;
+					T2v = T2t + T2u;
+					T2N = T2L * T2M;
+					T2X = T2O * T2M;
+					T2n = W[6];
+					T2s = W[7];
+					T2S = T2y - T2z;
+					T2A = T2y + T2z;
+					T2F = T2D - T2E;
+					T2V = T2D + T2E;
+					T2r = T2n * T2q;
+					T2H = T2s * T2q;
+					T2x = W[8];
+					T2R = W[20];
+					T2J = T2x * T2F;
+					T2B = T2x * T2A;
+				   }
+				   T2w = FNMS(T2s, T2v, T2r);
+				   T2Z = T2R * T2V;
+				   T2T = T2R * T2S;
+				   T2I = FMA(T2n, T2v, T2H);
+				   T2C = W[9];
+				   T2Q = FNMS(T2O, T2P, T2N);
+				   T2Y = FMA(T2L, T2P, T2X);
+				   T2U = W[21];
+				   T2K = FNMS(T2C, T2A, T2J);
+				   T2G = FMA(T2C, T2F, T2B);
+			      }
+			 }
+		    }
+	       }
+	       T30 = FNMS(T2U, T2S, T2Z);
+	       T2W = FMA(T2U, T2V, T2T);
+	       Im[WS(rs, 2)] = T2K - T2I;
+	       Ip[WS(rs, 2)] = T2I + T2K;
+	       Rm[WS(rs, 2)] = T2w + T2G;
+	       Rp[WS(rs, 2)] = T2w - T2G;
+	       Im[WS(rs, 5)] = T30 - T2Y;
+	       Ip[WS(rs, 5)] = T2Y + T30;
+	       Rm[WS(rs, 5)] = T2Q + T2W;
+	       Rp[WS(rs, 5)] = T2Q - T2W;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, {96, 22, 46, 0} };
+
+void X(codelet_hc2cbdft_12) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include hc2cb.h */
+
+/*
+ * This function contains 142 FP additions, 60 FP multiplications,
+ * (or, 112 additions, 30 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E Tv, T1E, TC, T1F, TW, T1x, TT, T1w, T1d, T1N, Tb, T1R, TI, T1z, TN;
+	       E T1A, T17, T1I, T12, T1H, T1g, T1S, Tm, T1O;
+	       {
+		    E T1, Tq, T6, TA, T4, Tp, Tt, TS, T9, Tw, Tz, TV;
+		    T1 = Rp[0];
+		    Tq = Ip[0];
+		    T6 = Rm[WS(rs, 5)];
+		    TA = Im[WS(rs, 5)];
+		    {
+			 E T2, T3, Tr, Ts;
+			 T2 = Rp[WS(rs, 4)];
+			 T3 = Rm[WS(rs, 3)];
+			 T4 = T2 + T3;
+			 Tp = KP866025403 * (T2 - T3);
+			 Tr = Im[WS(rs, 3)];
+			 Ts = Ip[WS(rs, 4)];
+			 Tt = Tr - Ts;
+			 TS = KP866025403 * (Tr + Ts);
+		    }
+		    {
+			 E T7, T8, Tx, Ty;
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = Rp[WS(rs, 2)];
+			 T9 = T7 + T8;
+			 Tw = KP866025403 * (T7 - T8);
+			 Tx = Im[WS(rs, 1)];
+			 Ty = Ip[WS(rs, 2)];
+			 Tz = Tx - Ty;
+			 TV = KP866025403 * (Tx + Ty);
+		    }
+		    {
+			 E Tu, TB, TU, TR;
+			 Tu = FMA(KP500000000, Tt, Tq);
+			 Tv = Tp + Tu;
+			 T1E = Tu - Tp;
+			 TB = FMS(KP500000000, Tz, TA);
+			 TC = Tw + TB;
+			 T1F = TB - Tw;
+			 TU = FNMS(KP500000000, T9, T6);
+			 TW = TU + TV;
+			 T1x = TU - TV;
+			 TR = FNMS(KP500000000, T4, T1);
+			 TT = TR - TS;
+			 T1w = TR + TS;
+			 {
+			      E T1b, T1c, T5, Ta;
+			      T1b = Tq - Tt;
+			      T1c = Tz + TA;
+			      T1d = T1b - T1c;
+			      T1N = T1b + T1c;
+			      T5 = T1 + T4;
+			      Ta = T6 + T9;
+			      Tb = T5 + Ta;
+			      T1R = T5 - Ta;
+			 }
+		    }
+	       }
+	       {
+		    E Tc, T10, Th, T15, Tf, TY, TH, TZ, Tk, T13, TM, T14;
+		    Tc = Rp[WS(rs, 3)];
+		    T10 = Ip[WS(rs, 3)];
+		    Th = Rm[WS(rs, 2)];
+		    T15 = Im[WS(rs, 2)];
+		    {
+			 E Td, Te, TF, TG;
+			 Td = Rm[WS(rs, 4)];
+			 Te = Rm[0];
+			 Tf = Td + Te;
+			 TY = KP866025403 * (Td - Te);
+			 TF = Im[WS(rs, 4)];
+			 TG = Im[0];
+			 TH = KP866025403 * (TF - TG);
+			 TZ = TF + TG;
+		    }
+		    {
+			 E Ti, Tj, TK, TL;
+			 Ti = Rp[WS(rs, 1)];
+			 Tj = Rp[WS(rs, 5)];
+			 Tk = Ti + Tj;
+			 T13 = KP866025403 * (Ti - Tj);
+			 TK = Ip[WS(rs, 5)];
+			 TL = Ip[WS(rs, 1)];
+			 TM = KP866025403 * (TK - TL);
+			 T14 = TK + TL;
+		    }
+		    {
+			 E TE, TJ, T16, T11;
+			 TE = FNMS(KP500000000, Tf, Tc);
+			 TI = TE + TH;
+			 T1z = TE - TH;
+			 TJ = FNMS(KP500000000, Tk, Th);
+			 TN = TJ + TM;
+			 T1A = TJ - TM;
+			 T16 = FMA(KP500000000, T14, T15);
+			 T17 = T13 - T16;
+			 T1I = T13 + T16;
+			 T11 = FMA(KP500000000, TZ, T10);
+			 T12 = TY + T11;
+			 T1H = T11 - TY;
+			 {
+			      E T1e, T1f, Tg, Tl;
+			      T1e = T10 - TZ;
+			      T1f = T14 - T15;
+			      T1g = T1e + T1f;
+			      T1S = T1e - T1f;
+			      Tg = Tc + Tf;
+			      Tl = Th + Tk;
+			      Tm = Tg + Tl;
+			      T1O = Tg - Tl;
+			 }
+		    }
+	       }
+	       {
+		    E Tn, T1h, TP, T1p, T19, T1r, T1n, T1t;
+		    Tn = Tb + Tm;
+		    T1h = T1d + T1g;
+		    {
+			 E TD, TO, TX, T18;
+			 TD = Tv - TC;
+			 TO = TI - TN;
+			 TP = TD + TO;
+			 T1p = TD - TO;
+			 TX = TT - TW;
+			 T18 = T12 - T17;
+			 T19 = TX - T18;
+			 T1r = TX + T18;
+			 {
+			      E T1k, T1m, T1j, T1l;
+			      T1k = Tb - Tm;
+			      T1m = T1d - T1g;
+			      T1j = W[10];
+			      T1l = W[11];
+			      T1n = FNMS(T1l, T1m, T1j * T1k);
+			      T1t = FMA(T1l, T1k, T1j * T1m);
+			 }
+		    }
+		    {
+			 E T1a, T1i, To, TQ;
+			 To = W[0];
+			 TQ = W[1];
+			 T1a = FMA(To, TP, TQ * T19);
+			 T1i = FNMS(TQ, TP, To * T19);
+			 Rp[0] = Tn - T1a;
+			 Ip[0] = T1h + T1i;
+			 Rm[0] = Tn + T1a;
+			 Im[0] = T1i - T1h;
+		    }
+		    {
+			 E T1s, T1u, T1o, T1q;
+			 T1o = W[12];
+			 T1q = W[13];
+			 T1s = FMA(T1o, T1p, T1q * T1r);
+			 T1u = FNMS(T1q, T1p, T1o * T1r);
+			 Rp[WS(rs, 3)] = T1n - T1s;
+			 Ip[WS(rs, 3)] = T1t + T1u;
+			 Rm[WS(rs, 3)] = T1n + T1s;
+			 Im[WS(rs, 3)] = T1u - T1t;
+		    }
+	       }
+	       {
+		    E T1C, T1Y, T1K, T20, T1U, T1V, T26, T27;
+		    {
+			 E T1y, T1B, T1G, T1J;
+			 T1y = T1w + T1x;
+			 T1B = T1z + T1A;
+			 T1C = T1y - T1B;
+			 T1Y = T1y + T1B;
+			 T1G = T1E + T1F;
+			 T1J = T1H - T1I;
+			 T1K = T1G - T1J;
+			 T20 = T1G + T1J;
+		    }
+		    {
+			 E T1P, T1T, T1M, T1Q;
+			 T1P = T1N - T1O;
+			 T1T = T1R + T1S;
+			 T1M = W[4];
+			 T1Q = W[5];
+			 T1U = FMA(T1M, T1P, T1Q * T1T);
+			 T1V = FNMS(T1Q, T1P, T1M * T1T);
+		    }
+		    {
+			 E T23, T25, T22, T24;
+			 T23 = T1O + T1N;
+			 T25 = T1R - T1S;
+			 T22 = W[16];
+			 T24 = W[17];
+			 T26 = FMA(T22, T23, T24 * T25);
+			 T27 = FNMS(T24, T23, T22 * T25);
+		    }
+		    {
+			 E T1L, T1W, T1v, T1D;
+			 T1v = W[2];
+			 T1D = W[3];
+			 T1L = FNMS(T1D, T1K, T1v * T1C);
+			 T1W = FMA(T1D, T1C, T1v * T1K);
+			 Rp[WS(rs, 1)] = T1L - T1U;
+			 Ip[WS(rs, 1)] = T1V + T1W;
+			 Rm[WS(rs, 1)] = T1U + T1L;
+			 Im[WS(rs, 1)] = T1V - T1W;
+		    }
+		    {
+			 E T21, T28, T1X, T1Z;
+			 T1X = W[14];
+			 T1Z = W[15];
+			 T21 = FNMS(T1Z, T20, T1X * T1Y);
+			 T28 = FMA(T1Z, T1Y, T1X * T20);
+			 Rp[WS(rs, 4)] = T21 - T26;
+			 Ip[WS(rs, 4)] = T27 + T28;
+			 Rm[WS(rs, 4)] = T26 + T21;
+			 Im[WS(rs, 4)] = T27 - T28;
+		    }
+	       }
+	       {
+		    E T2c, T2u, T2p, T2B, T2g, T2w, T2l, T2z;
+		    {
+			 E T2a, T2b, T2n, T2o;
+			 T2a = TT + TW;
+			 T2b = TI + TN;
+			 T2c = T2a + T2b;
+			 T2u = T2a - T2b;
+			 T2n = T1w - T1x;
+			 T2o = T1H + T1I;
+			 T2p = T2n - T2o;
+			 T2B = T2n + T2o;
+		    }
+		    {
+			 E T2e, T2f, T2j, T2k;
+			 T2e = Tv + TC;
+			 T2f = T12 + T17;
+			 T2g = T2e + T2f;
+			 T2w = T2e - T2f;
+			 T2j = T1E - T1F;
+			 T2k = T1z - T1A;
+			 T2l = T2j + T2k;
+			 T2z = T2j - T2k;
+		    }
+		    {
+			 E T2h, T2r, T2q, T2s;
+			 {
+			      E T29, T2d, T2i, T2m;
+			      T29 = W[6];
+			      T2d = W[7];
+			      T2h = FNMS(T2d, T2g, T29 * T2c);
+			      T2r = FMA(T2d, T2c, T29 * T2g);
+			      T2i = W[8];
+			      T2m = W[9];
+			      T2q = FMA(T2i, T2l, T2m * T2p);
+			      T2s = FNMS(T2m, T2l, T2i * T2p);
+			 }
+			 Rp[WS(rs, 2)] = T2h - T2q;
+			 Ip[WS(rs, 2)] = T2r + T2s;
+			 Rm[WS(rs, 2)] = T2h + T2q;
+			 Im[WS(rs, 2)] = T2s - T2r;
+		    }
+		    {
+			 E T2x, T2D, T2C, T2E;
+			 {
+			      E T2t, T2v, T2y, T2A;
+			      T2t = W[18];
+			      T2v = W[19];
+			      T2x = FNMS(T2v, T2w, T2t * T2u);
+			      T2D = FMA(T2v, T2u, T2t * T2w);
+			      T2y = W[20];
+			      T2A = W[21];
+			      T2C = FMA(T2y, T2z, T2A * T2B);
+			      T2E = FNMS(T2A, T2z, T2y * T2B);
+			 }
+			 Rp[WS(rs, 5)] = T2x - T2C;
+			 Ip[WS(rs, 5)] = T2D + T2E;
+			 Rm[WS(rs, 5)] = T2x + T2C;
+			 Im[WS(rs, 5)] = T2E - T2D;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, {112, 30, 30, 0} };
+
+void X(codelet_hc2cbdft_12) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:05 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
+
+/*
+ * This function contains 206 FP additions, 100 FP multiplications,
+ * (or, 136 additions, 30 multiplications, 70 fused multiply/add),
+ * 97 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T3w, T3z, T2Y, T3D, T3x, T3m, T3u, T3C, T3y, T3o, T3k, T3E, T3A;
+	       {
+		    E T20, Tf, T3Q, T32, T3V, T3f, T2a, TN, T2f, T1m, T3G, T2G, T3L, T2T, T26;
+		    E T1F, T3M, T2N, T3H, T2W, T25, Tu, T1n, T1o, T3R, T3i, T2g, T1a, T21, T1y;
+		    E T3W, T39;
+		    {
+			 E T2R, T1B, T2S, T1E;
+			 {
+			      E T1e, T3, T1C, TA, Tx, T6, T1D, T1h, Td, T1A, TL, T1k, Ta, TC, TF;
+			      E T1z;
+			      {
+				   E T4, T5, T1f, T1g;
+				   {
+					E T1, T2, Ty, Tz;
+					T1 = Rp[0];
+					T2 = Rm[WS(rs, 7)];
+					Ty = Ip[0];
+					Tz = Im[WS(rs, 7)];
+					T4 = Rp[WS(rs, 4)];
+					T1e = T1 - T2;
+					T3 = T1 + T2;
+					T1C = Ty - Tz;
+					TA = Ty + Tz;
+					T5 = Rm[WS(rs, 3)];
+				   }
+				   T1f = Ip[WS(rs, 4)];
+				   T1g = Im[WS(rs, 3)];
+				   {
+					E Tb, Tc, TI, TJ;
+					Tb = Rm[WS(rs, 1)];
+					Tx = T4 - T5;
+					T6 = T4 + T5;
+					T1D = T1f - T1g;
+					T1h = T1f + T1g;
+					Tc = Rp[WS(rs, 6)];
+					TI = Im[WS(rs, 1)];
+					TJ = Ip[WS(rs, 6)];
+					{
+					     E T8, TH, TK, T9, TD, TE;
+					     T8 = Rp[WS(rs, 2)];
+					     Td = Tb + Tc;
+					     TH = Tb - Tc;
+					     T1A = TJ - TI;
+					     TK = TI + TJ;
+					     T9 = Rm[WS(rs, 5)];
+					     TD = Ip[WS(rs, 2)];
+					     TE = Im[WS(rs, 5)];
+					     TL = TH + TK;
+					     T1k = TH - TK;
+					     Ta = T8 + T9;
+					     TC = T8 - T9;
+					     TF = TD + TE;
+					     T1z = TD - TE;
+					}
+				   }
+			      }
+			      {
+				   E T2E, TB, T1l, T1i, T3d, T3e, TM, T2F;
+				   {
+					E T7, TG, Te, T30, T31, T1j;
+					T2E = T3 - T6;
+					T7 = T3 + T6;
+					T1j = TC - TF;
+					TG = TC + TF;
+					Te = Ta + Td;
+					T2R = Ta - Td;
+					TB = Tx + TA;
+					T30 = TA - Tx;
+					T31 = T1j - T1k;
+					T1l = T1j + T1k;
+					T1i = T1e - T1h;
+					T3d = T1e + T1h;
+					T20 = T7 - Te;
+					Tf = T7 + Te;
+					T3Q = FNMS(KP707106781, T31, T30);
+					T32 = FMA(KP707106781, T31, T30);
+					T3e = TG + TL;
+					TM = TG - TL;
+				   }
+				   T3V = FMA(KP707106781, T3e, T3d);
+				   T3f = FNMS(KP707106781, T3e, T3d);
+				   T2a = FNMS(KP707106781, TM, TB);
+				   TN = FMA(KP707106781, TM, TB);
+				   T2F = T1A - T1z;
+				   T1B = T1z + T1A;
+				   T2f = FNMS(KP707106781, T1l, T1i);
+				   T1m = FMA(KP707106781, T1l, T1i);
+				   T3G = T2E - T2F;
+				   T2G = T2E + T2F;
+				   T2S = T1C - T1D;
+				   T1E = T1C + T1D;
+			      }
+			 }
+			 {
+			      E T34, TS, T2H, Tm, T1u, T2I, T33, TX, Tq, T14, Tp, T1v, T12, Tr, T15;
+			      E T16;
+			      {
+				   E Tj, TT, Ti, T1s, TR, Tk, TU, TV;
+				   {
+					E Tg, Th, TP, TQ;
+					Tg = Rp[WS(rs, 1)];
+					T3L = T2S - T2R;
+					T2T = T2R + T2S;
+					T26 = T1E - T1B;
+					T1F = T1B + T1E;
+					Th = Rm[WS(rs, 6)];
+					TP = Ip[WS(rs, 1)];
+					TQ = Im[WS(rs, 6)];
+					Tj = Rp[WS(rs, 5)];
+					TT = Tg - Th;
+					Ti = Tg + Th;
+					T1s = TP - TQ;
+					TR = TP + TQ;
+					Tk = Rm[WS(rs, 2)];
+					TU = Ip[WS(rs, 5)];
+					TV = Im[WS(rs, 2)];
+				   }
+				   {
+					E Tn, To, T10, T11;
+					Tn = Rm[0];
+					{
+					     E TO, Tl, T1t, TW;
+					     TO = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T1t = TU - TV;
+					     TW = TU + TV;
+					     T34 = TR - TO;
+					     TS = TO + TR;
+					     T2H = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T1u = T1s + T1t;
+					     T2I = T1s - T1t;
+					     T33 = TT + TW;
+					     TX = TT - TW;
+					     To = Rp[WS(rs, 7)];
+					}
+					T10 = Im[0];
+					T11 = Ip[WS(rs, 7)];
+					Tq = Rp[WS(rs, 3)];
+					T14 = Tn - To;
+					Tp = Tn + To;
+					T1v = T11 - T10;
+					T12 = T10 + T11;
+					Tr = Rm[WS(rs, 4)];
+					T15 = Ip[WS(rs, 3)];
+					T16 = Im[WS(rs, 4)];
+				   }
+			      }
+			      {
+				   E T13, T1x, T18, T35, T3g, T3h, T38, TY, T19;
+				   {
+					E T2U, T2J, T37, Tt, T36, T2V, T2M, T2K, T2L;
+					T2U = T2H + T2I;
+					T2J = T2H - T2I;
+					{
+					     E TZ, Ts, T1w, T17;
+					     TZ = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T1w = T15 - T16;
+					     T17 = T15 + T16;
+					     T37 = TZ + T12;
+					     T13 = TZ - T12;
+					     T2K = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T1x = T1v + T1w;
+					     T2L = T1v - T1w;
+					     T36 = T14 + T17;
+					     T18 = T14 - T17;
+					}
+					T2V = T2L - T2K;
+					T2M = T2K + T2L;
+					T3M = T2J - T2M;
+					T2N = T2J + T2M;
+					T3H = T2V - T2U;
+					T2W = T2U + T2V;
+					T35 = FMA(KP414213562, T34, T33);
+					T3g = FNMS(KP414213562, T33, T34);
+					T25 = Tm - Tt;
+					Tu = Tm + Tt;
+					T3h = FNMS(KP414213562, T36, T37);
+					T38 = FMA(KP414213562, T37, T36);
+				   }
+				   T1n = FNMS(KP414213562, TS, TX);
+				   TY = FMA(KP414213562, TX, TS);
+				   T19 = FNMS(KP414213562, T18, T13);
+				   T1o = FMA(KP414213562, T13, T18);
+				   T3R = T3h - T3g;
+				   T3i = T3g + T3h;
+				   T2g = T19 - TY;
+				   T1a = TY + T19;
+				   T21 = T1x - T1u;
+				   T1y = T1u + T1x;
+				   T3W = T35 + T38;
+				   T39 = T35 - T38;
+			      }
+			 }
+		    }
+		    {
+			 E T27, T22, T2c, T2u, T2x, T2h, T2s, T2A, T2w, T2B, T2v;
+			 {
+			      E T1K, Tv, T1G, T1N, T1Q, T1b, T2b, T1p, Tw, T1d;
+			      T1K = Tf - Tu;
+			      Tv = Tf + Tu;
+			      T1G = T1y + T1F;
+			      T1N = T1F - T1y;
+			      T1Q = FNMS(KP923879532, T1a, TN);
+			      T1b = FMA(KP923879532, T1a, TN);
+			      T2b = T1n - T1o;
+			      T1p = T1n + T1o;
+			      Tw = W[0];
+			      T1d = W[1];
+			      {
+				   E T1T, T1O, T1W, T1S, T1X, T1R;
+				   {
+					E T1J, T1M, T1L, T1V, T1P, T1q;
+					T1T = FNMS(KP923879532, T1p, T1m);
+					T1q = FMA(KP923879532, T1p, T1m);
+					{
+					     E T1c, T1I, T1H, T1r;
+					     T1c = Tw * T1b;
+					     T1J = W[14];
+					     T1H = Tw * T1q;
+					     T1r = FMA(T1d, T1q, T1c);
+					     T1M = W[15];
+					     T1L = T1J * T1K;
+					     T1I = FNMS(T1d, T1b, T1H);
+					     Rm[0] = Tv + T1r;
+					     Rp[0] = Tv - T1r;
+					     T1V = T1M * T1K;
+					     Im[0] = T1I - T1G;
+					     Ip[0] = T1G + T1I;
+					     T1P = W[16];
+					}
+					T1O = FNMS(T1M, T1N, T1L);
+					T1W = FMA(T1J, T1N, T1V);
+					T1S = W[17];
+					T1X = T1P * T1T;
+					T1R = T1P * T1Q;
+				   }
+				   {
+					E T2r, T2n, T2q, T2p, T2z, T2t, T2o, T1Y, T1U;
+					T27 = T25 + T26;
+					T2r = T26 - T25;
+					T2o = T20 - T21;
+					T22 = T20 + T21;
+					T1Y = FNMS(T1S, T1Q, T1X);
+					T1U = FMA(T1S, T1T, T1R);
+					T2n = W[22];
+					T2q = W[23];
+					Im[WS(rs, 4)] = T1Y - T1W;
+					Ip[WS(rs, 4)] = T1W + T1Y;
+					Rm[WS(rs, 4)] = T1O + T1U;
+					Rp[WS(rs, 4)] = T1O - T1U;
+					T2p = T2n * T2o;
+					T2z = T2q * T2o;
+					T2c = FMA(KP923879532, T2b, T2a);
+					T2u = FNMS(KP923879532, T2b, T2a);
+					T2x = FNMS(KP923879532, T2g, T2f);
+					T2h = FMA(KP923879532, T2g, T2f);
+					T2t = W[24];
+					T2s = FNMS(T2q, T2r, T2p);
+					T2A = FMA(T2n, T2r, T2z);
+					T2w = W[25];
+					T2B = T2t * T2x;
+					T2v = T2t * T2u;
+				   }
+			      }
+			 }
+			 {
+			      E T28, T2k, T2e, T2l, T2d;
+			      {
+				   E T1Z, T24, T23, T2j, T29, T2C, T2y;
+				   T2C = FNMS(T2w, T2u, T2B);
+				   T2y = FMA(T2w, T2x, T2v);
+				   T1Z = W[6];
+				   T24 = W[7];
+				   Im[WS(rs, 6)] = T2C - T2A;
+				   Ip[WS(rs, 6)] = T2A + T2C;
+				   Rm[WS(rs, 6)] = T2s + T2y;
+				   Rp[WS(rs, 6)] = T2s - T2y;
+				   T23 = T1Z * T22;
+				   T2j = T24 * T22;
+				   T29 = W[8];
+				   T28 = FNMS(T24, T27, T23);
+				   T2k = FMA(T1Z, T27, T2j);
+				   T2e = W[9];
+				   T2l = T29 * T2h;
+				   T2d = T29 * T2c;
+			      }
+			      {
+				   E T4a, T4d, T3O, T4h, T4b, T40, T48, T4g, T4c, T42, T3Y;
+				   {
+					E T3N, T47, T43, T46, T3F, T45, T4f, T3K, T3J, T3S, T3X, T3Z, T49, T41, T3T;
+					E T3U;
+					{
+					     E T44, T3I, T2m, T2i, T3P;
+					     T44 = FNMS(KP707106781, T3H, T3G);
+					     T3I = FMA(KP707106781, T3H, T3G);
+					     T2m = FNMS(T2e, T2c, T2l);
+					     T2i = FMA(T2e, T2h, T2d);
+					     T3N = FMA(KP707106781, T3M, T3L);
+					     T47 = FNMS(KP707106781, T3M, T3L);
+					     Im[WS(rs, 2)] = T2m - T2k;
+					     Ip[WS(rs, 2)] = T2k + T2m;
+					     Rm[WS(rs, 2)] = T28 + T2i;
+					     Rp[WS(rs, 2)] = T28 - T2i;
+					     T43 = W[26];
+					     T46 = W[27];
+					     T3F = W[10];
+					     T45 = T43 * T44;
+					     T4f = T46 * T44;
+					     T3K = W[11];
+					     T3J = T3F * T3I;
+					     T4a = FNMS(KP923879532, T3R, T3Q);
+					     T3S = FMA(KP923879532, T3R, T3Q);
+					     T3X = FNMS(KP923879532, T3W, T3V);
+					     T4d = FMA(KP923879532, T3W, T3V);
+					     T3Z = T3K * T3I;
+					     T3P = W[12];
+					     T49 = W[28];
+					     T41 = T3P * T3X;
+					     T3T = T3P * T3S;
+					}
+					T3O = FNMS(T3K, T3N, T3J);
+					T4h = T49 * T4d;
+					T4b = T49 * T4a;
+					T40 = FMA(T3F, T3N, T3Z);
+					T3U = W[13];
+					T48 = FNMS(T46, T47, T45);
+					T4g = FMA(T43, T47, T4f);
+					T4c = W[29];
+					T42 = FNMS(T3U, T3S, T41);
+					T3Y = FMA(T3U, T3X, T3T);
+				   }
+				   {
+					E T3t, T2X, T3p, T3s, T2D, T3r, T3B, T2Q, T2P, T3a, T3j, T3l, T3v, T3n, T3b;
+					E T3c;
+					{
+					     E T2O, T3q, T4i, T4e, T2Z;
+					     T4i = FNMS(T4c, T4a, T4h);
+					     T4e = FMA(T4c, T4d, T4b);
+					     Im[WS(rs, 3)] = T42 - T40;
+					     Ip[WS(rs, 3)] = T40 + T42;
+					     Rm[WS(rs, 3)] = T3O + T3Y;
+					     Rp[WS(rs, 3)] = T3O - T3Y;
+					     Im[WS(rs, 7)] = T4i - T4g;
+					     Ip[WS(rs, 7)] = T4g + T4i;
+					     Rm[WS(rs, 7)] = T48 + T4e;
+					     Rp[WS(rs, 7)] = T48 - T4e;
+					     T3t = FNMS(KP707106781, T2W, T2T);
+					     T2X = FMA(KP707106781, T2W, T2T);
+					     T2O = FMA(KP707106781, T2N, T2G);
+					     T3q = FNMS(KP707106781, T2N, T2G);
+					     T3p = W[18];
+					     T3s = W[19];
+					     T2D = W[2];
+					     T3r = T3p * T3q;
+					     T3B = T3s * T3q;
+					     T2Q = W[3];
+					     T2P = T2D * T2O;
+					     T3a = FMA(KP923879532, T39, T32);
+					     T3w = FNMS(KP923879532, T39, T32);
+					     T3z = FMA(KP923879532, T3i, T3f);
+					     T3j = FNMS(KP923879532, T3i, T3f);
+					     T3l = T2Q * T2O;
+					     T2Z = W[4];
+					     T3v = W[20];
+					     T3n = T2Z * T3j;
+					     T3b = T2Z * T3a;
+					}
+					T2Y = FNMS(T2Q, T2X, T2P);
+					T3D = T3v * T3z;
+					T3x = T3v * T3w;
+					T3m = FMA(T2D, T2X, T3l);
+					T3c = W[5];
+					T3u = FNMS(T3s, T3t, T3r);
+					T3C = FMA(T3p, T3t, T3B);
+					T3y = W[21];
+					T3o = FNMS(T3c, T3a, T3n);
+					T3k = FMA(T3c, T3j, T3b);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T3E = FNMS(T3y, T3w, T3D);
+	       T3A = FMA(T3y, T3z, T3x);
+	       Im[WS(rs, 1)] = T3o - T3m;
+	       Ip[WS(rs, 1)] = T3m + T3o;
+	       Rm[WS(rs, 1)] = T2Y + T3k;
+	       Rp[WS(rs, 1)] = T2Y - T3k;
+	       Im[WS(rs, 5)] = T3E - T3C;
+	       Ip[WS(rs, 5)] = T3C + T3E;
+	       Rm[WS(rs, 5)] = T3u + T3A;
+	       Rp[WS(rs, 5)] = T3u - T3A;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {136, 30, 70, 0} };
+
+void X(codelet_hc2cbdft_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
+
+/*
+ * This function contains 206 FP additions, 84 FP multiplications,
+ * (or, 168 additions, 46 multiplications, 38 fused multiply/add),
+ * 60 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
+	       E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
+	       E T1x, T1V;
+	       {
+		    E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
+		    E T1z;
+		    {
+			 E T1, T2, Ty, Tz;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 7)];
+			 T3 = T1 + T2;
+			 T1j = T1 - T2;
+			 Ty = Ip[0];
+			 Tz = Im[WS(rs, 7)];
+			 TA = Ty + Tz;
+			 T1B = Ty - Tz;
+		    }
+		    {
+			 E T4, T5, T1k, T1l;
+			 T4 = Rp[WS(rs, 4)];
+			 T5 = Rm[WS(rs, 3)];
+			 T6 = T4 + T5;
+			 Tx = T4 - T5;
+			 T1k = Ip[WS(rs, 4)];
+			 T1l = Im[WS(rs, 3)];
+			 T1m = T1k + T1l;
+			 T1C = T1k - T1l;
+		    }
+		    {
+			 E T8, T9, TD, TE;
+			 T8 = Rp[WS(rs, 2)];
+			 T9 = Rm[WS(rs, 5)];
+			 Ta = T8 + T9;
+			 TC = T8 - T9;
+			 TD = Ip[WS(rs, 2)];
+			 TE = Im[WS(rs, 5)];
+			 TF = TD + TE;
+			 T1y = TD - TE;
+		    }
+		    {
+			 E Tb, Tc, TI, TJ;
+			 Tb = Rm[WS(rs, 1)];
+			 Tc = Rp[WS(rs, 6)];
+			 Td = Tb + Tc;
+			 TH = Tb - Tc;
+			 TI = Im[WS(rs, 1)];
+			 TJ = Ip[WS(rs, 6)];
+			 TK = TI + TJ;
+			 T1z = TJ - TI;
+		    }
+		    {
+			 E T7, Te, TG, TL;
+			 TB = Tx + TA;
+			 T2L = TA - Tx;
+			 T30 = T1j + T1m;
+			 T1n = T1j - T1m;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T1U = T7 - Te;
+			 {
+			      E T2F, T2G, T1A, T1D;
+			      T2F = Ta - Td;
+			      T2G = T1B - T1C;
+			      T2H = T2F + T2G;
+			      T3p = T2G - T2F;
+			      T1A = T1y + T1z;
+			      T1D = T1B + T1C;
+			      T1E = T1A + T1D;
+			      T1Z = T1D - T1A;
+			 }
+			 TG = TC + TF;
+			 TL = TH + TK;
+			 TM = KP707106781 * (TG - TL);
+			 T31 = KP707106781 * (TG + TL);
+			 {
+			      E T2q, T2r, T1g, T1h;
+			      T2q = T3 - T6;
+			      T2r = T1z - T1y;
+			      T2s = T2q + T2r;
+			      T3k = T2q - T2r;
+			      T1g = TC - TF;
+			      T1h = TH - TK;
+			      T1i = KP707106781 * (T1g + T1h);
+			      T2M = KP707106781 * (T1g - T1h);
+			 }
+		    }
+	       }
+	       {
+		    E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
+		    E T1v;
+		    {
+			 E Tg, Th, TP, TQ;
+			 Tg = Rp[WS(rs, 1)];
+			 Th = Rm[WS(rs, 6)];
+			 Ti = Tg + Th;
+			 TT = Tg - Th;
+			 TP = Ip[WS(rs, 1)];
+			 TQ = Im[WS(rs, 6)];
+			 TR = TP + TQ;
+			 T1r = TP - TQ;
+		    }
+		    {
+			 E Tj, Tk, TU, TV;
+			 Tj = Rp[WS(rs, 5)];
+			 Tk = Rm[WS(rs, 2)];
+			 Tl = Tj + Tk;
+			 TO = Tj - Tk;
+			 TU = Ip[WS(rs, 5)];
+			 TV = Im[WS(rs, 2)];
+			 TW = TU + TV;
+			 T1s = TU - TV;
+		    }
+		    {
+			 E Tn, To, T10, T11;
+			 Tn = Rm[0];
+			 To = Rp[WS(rs, 7)];
+			 Tp = Tn + To;
+			 T14 = Tn - To;
+			 T10 = Im[0];
+			 T11 = Ip[WS(rs, 7)];
+			 T12 = T10 + T11;
+			 T1u = T11 - T10;
+		    }
+		    {
+			 E Tq, Tr, T15, T16;
+			 Tq = Rp[WS(rs, 3)];
+			 Tr = Rm[WS(rs, 4)];
+			 Ts = Tq + Tr;
+			 TZ = Tq - Tr;
+			 T15 = Ip[WS(rs, 3)];
+			 T16 = Im[WS(rs, 4)];
+			 T17 = T15 + T16;
+			 T1v = T15 - T16;
+		    }
+		    {
+			 E Tm, Tt, T2O, T2P;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T1Y = Tm - Tt;
+			 T2O = TR - TO;
+			 T2P = TT + TW;
+			 T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
+			 T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
+		    }
+		    {
+			 E T2R, T2S, TS, TX;
+			 T2R = TZ + T12;
+			 T2S = T14 + T17;
+			 T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
+			 T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
+			 TS = TO + TR;
+			 TX = TT - TW;
+			 TY = FMA(KP923879532, TS, KP382683432 * TX);
+			 T1d = FNMS(KP382683432, TS, KP923879532 * TX);
+		    }
+		    {
+			 E T13, T18, T2t, T2u;
+			 T13 = TZ - T12;
+			 T18 = T14 - T17;
+			 T19 = FNMS(KP382683432, T18, KP923879532 * T13);
+			 T1e = FMA(KP382683432, T13, KP923879532 * T18);
+			 T2t = Ti - Tl;
+			 T2u = T1r - T1s;
+			 T2v = T2t - T2u;
+			 T2C = T2t + T2u;
+		    }
+		    {
+			 E T2w, T2x, T1t, T1w;
+			 T2w = Tp - Ts;
+			 T2x = T1u - T1v;
+			 T2y = T2w + T2x;
+			 T2D = T2x - T2w;
+			 T1t = T1r + T1s;
+			 T1w = T1u + T1v;
+			 T1x = T1t + T1w;
+			 T1V = T1w - T1t;
+		    }
+	       }
+	       {
+		    E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
+		    Tv = Tf + Tu;
+		    T1F = T1x + T1E;
+		    {
+			 E TN, T1a, T1f, T1o;
+			 TN = TB + TM;
+			 T1a = TY + T19;
+			 T1b = TN + T1a;
+			 T1N = TN - T1a;
+			 T1f = T1d + T1e;
+			 T1o = T1i + T1n;
+			 T1p = T1f + T1o;
+			 T1P = T1o - T1f;
+			 {
+			      E T1I, T1K, T1H, T1J;
+			      T1I = Tf - Tu;
+			      T1K = T1E - T1x;
+			      T1H = W[14];
+			      T1J = W[15];
+			      T1L = FNMS(T1J, T1K, T1H * T1I);
+			      T1R = FMA(T1J, T1I, T1H * T1K);
+			 }
+		    }
+		    {
+			 E T1q, T1G, Tw, T1c;
+			 Tw = W[0];
+			 T1c = W[1];
+			 T1q = FMA(Tw, T1b, T1c * T1p);
+			 T1G = FNMS(T1c, T1b, Tw * T1p);
+			 Rp[0] = Tv - T1q;
+			 Ip[0] = T1F + T1G;
+			 Rm[0] = Tv + T1q;
+			 Im[0] = T1G - T1F;
+		    }
+		    {
+			 E T1Q, T1S, T1M, T1O;
+			 T1M = W[16];
+			 T1O = W[17];
+			 T1Q = FMA(T1M, T1N, T1O * T1P);
+			 T1S = FNMS(T1O, T1N, T1M * T1P);
+			 Rp[WS(rs, 4)] = T1L - T1Q;
+			 Ip[WS(rs, 4)] = T1R + T1S;
+			 Rm[WS(rs, 4)] = T1L + T1Q;
+			 Im[WS(rs, 4)] = T1S - T1R;
+		    }
+	       }
+	       {
+		    E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
+		    {
+			 E T23, T24, T27, T28;
+			 T23 = TB - TM;
+			 T24 = T1d - T1e;
+			 T25 = T23 + T24;
+			 T2j = T23 - T24;
+			 T27 = T19 - TY;
+			 T28 = T1n - T1i;
+			 T29 = T27 + T28;
+			 T2l = T28 - T27;
+		    }
+		    {
+			 E T1W, T20, T1T, T1X;
+			 T1W = T1U + T1V;
+			 T20 = T1Y + T1Z;
+			 T1T = W[6];
+			 T1X = W[7];
+			 T21 = FNMS(T1X, T20, T1T * T1W);
+			 T2b = FMA(T1X, T1W, T1T * T20);
+		    }
+		    {
+			 E T2e, T2g, T2d, T2f;
+			 T2e = T1U - T1V;
+			 T2g = T1Z - T1Y;
+			 T2d = W[22];
+			 T2f = W[23];
+			 T2h = FNMS(T2f, T2g, T2d * T2e);
+			 T2n = FMA(T2f, T2e, T2d * T2g);
+		    }
+		    {
+			 E T2a, T2c, T22, T26;
+			 T22 = W[8];
+			 T26 = W[9];
+			 T2a = FMA(T22, T25, T26 * T29);
+			 T2c = FNMS(T26, T25, T22 * T29);
+			 Rp[WS(rs, 2)] = T21 - T2a;
+			 Ip[WS(rs, 2)] = T2b + T2c;
+			 Rm[WS(rs, 2)] = T21 + T2a;
+			 Im[WS(rs, 2)] = T2c - T2b;
+		    }
+		    {
+			 E T2m, T2o, T2i, T2k;
+			 T2i = W[24];
+			 T2k = W[25];
+			 T2m = FMA(T2i, T2j, T2k * T2l);
+			 T2o = FNMS(T2k, T2j, T2i * T2l);
+			 Rp[WS(rs, 6)] = T2h - T2m;
+			 Ip[WS(rs, 6)] = T2n + T2o;
+			 Rm[WS(rs, 6)] = T2h + T2m;
+			 Im[WS(rs, 6)] = T2o - T2n;
+		    }
+	       }
+	       {
+		    E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
+		    T2z = KP707106781 * (T2v + T2y);
+		    T2A = T2s + T2z;
+		    T38 = T2s - T2z;
+		    T2E = KP707106781 * (T2C + T2D);
+		    T2I = T2E + T2H;
+		    T3a = T2H - T2E;
+		    {
+			 E T2N, T2U, T2Z, T32;
+			 T2N = T2L + T2M;
+			 T2U = T2Q - T2T;
+			 T2V = T2N + T2U;
+			 T3d = T2N - T2U;
+			 T2Z = T2X + T2Y;
+			 T32 = T30 - T31;
+			 T33 = T2Z + T32;
+			 T3f = T32 - T2Z;
+		    }
+		    {
+			 E T2J, T35, T34, T36;
+			 {
+			      E T2p, T2B, T2K, T2W;
+			      T2p = W[2];
+			      T2B = W[3];
+			      T2J = FNMS(T2B, T2I, T2p * T2A);
+			      T35 = FMA(T2B, T2A, T2p * T2I);
+			      T2K = W[4];
+			      T2W = W[5];
+			      T34 = FMA(T2K, T2V, T2W * T33);
+			      T36 = FNMS(T2W, T2V, T2K * T33);
+			 }
+			 Rp[WS(rs, 1)] = T2J - T34;
+			 Ip[WS(rs, 1)] = T35 + T36;
+			 Rm[WS(rs, 1)] = T2J + T34;
+			 Im[WS(rs, 1)] = T36 - T35;
+		    }
+		    {
+			 E T3b, T3h, T3g, T3i;
+			 {
+			      E T37, T39, T3c, T3e;
+			      T37 = W[18];
+			      T39 = W[19];
+			      T3b = FNMS(T39, T3a, T37 * T38);
+			      T3h = FMA(T39, T38, T37 * T3a);
+			      T3c = W[20];
+			      T3e = W[21];
+			      T3g = FMA(T3c, T3d, T3e * T3f);
+			      T3i = FNMS(T3e, T3d, T3c * T3f);
+			 }
+			 Rp[WS(rs, 5)] = T3b - T3g;
+			 Ip[WS(rs, 5)] = T3h + T3i;
+			 Rm[WS(rs, 5)] = T3b + T3g;
+			 Im[WS(rs, 5)] = T3i - T3h;
+		    }
+	       }
+	       {
+		    E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
+		    T3l = KP707106781 * (T2D - T2C);
+		    T3m = T3k + T3l;
+		    T3E = T3k - T3l;
+		    T3o = KP707106781 * (T2v - T2y);
+		    T3q = T3o + T3p;
+		    T3G = T3p - T3o;
+		    {
+			 E T3t, T3u, T3x, T3y;
+			 T3t = T2L - T2M;
+			 T3u = T2X - T2Y;
+			 T3v = T3t + T3u;
+			 T3J = T3t - T3u;
+			 T3x = T31 + T30;
+			 T3y = T2Q + T2T;
+			 T3z = T3x - T3y;
+			 T3L = T3y + T3x;
+		    }
+		    {
+			 E T3r, T3B, T3A, T3C;
+			 {
+			      E T3j, T3n, T3s, T3w;
+			      T3j = W[10];
+			      T3n = W[11];
+			      T3r = FNMS(T3n, T3q, T3j * T3m);
+			      T3B = FMA(T3n, T3m, T3j * T3q);
+			      T3s = W[12];
+			      T3w = W[13];
+			      T3A = FMA(T3s, T3v, T3w * T3z);
+			      T3C = FNMS(T3w, T3v, T3s * T3z);
+			 }
+			 Rp[WS(rs, 3)] = T3r - T3A;
+			 Ip[WS(rs, 3)] = T3B + T3C;
+			 Rm[WS(rs, 3)] = T3r + T3A;
+			 Im[WS(rs, 3)] = T3C - T3B;
+		    }
+		    {
+			 E T3H, T3N, T3M, T3O;
+			 {
+			      E T3D, T3F, T3I, T3K;
+			      T3D = W[26];
+			      T3F = W[27];
+			      T3H = FNMS(T3F, T3G, T3D * T3E);
+			      T3N = FMA(T3F, T3E, T3D * T3G);
+			      T3I = W[28];
+			      T3K = W[29];
+			      T3M = FMA(T3I, T3J, T3K * T3L);
+			      T3O = FNMS(T3K, T3J, T3I * T3L);
+			 }
+			 Rp[WS(rs, 7)] = T3H - T3M;
+			 Ip[WS(rs, 7)] = T3N + T3O;
+			 Rm[WS(rs, 7)] = T3H + T3M;
+			 Im[WS(rs, 7)] = T3O - T3N;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {168, 46, 38, 0} };
+
+void X(codelet_hc2cbdft_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:03 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include hc2cb.h */
+
+/*
+ * This function contains 10 FP additions, 4 FP multiplications,
+ * (or, 8 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T9, Ta, T3, Tc, T7, T4;
+	       {
+		    E T1, T2, T5, T6;
+		    T1 = Ip[0];
+		    T2 = Im[0];
+		    T5 = Rp[0];
+		    T6 = Rm[0];
+		    T9 = W[1];
+		    Ta = T1 + T2;
+		    T3 = T1 - T2;
+		    Tc = T5 + T6;
+		    T7 = T5 - T6;
+		    T4 = W[0];
+	       }
+	       {
+		    E Td, T8, Te, Tb;
+		    Td = T9 * T7;
+		    T8 = T4 * T7;
+		    Te = FMA(T4, Ta, Td);
+		    Tb = FNMS(T9, Ta, T8);
+		    Rm[0] = Tc + Te;
+		    Rp[0] = Tc - Te;
+		    Im[0] = Tb - T3;
+		    Ip[0] = T3 + Tb;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cbdft_2", twinstr, &GENUS, {8, 2, 2, 0} };
+
+void X(codelet_hc2cbdft_2) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include hc2cb.h */
+
+/*
+ * This function contains 10 FP additions, 4 FP multiplications,
+ * (or, 8 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T3, T9, T7, Tb;
+	       {
+		    E T1, T2, T5, T6;
+		    T1 = Ip[0];
+		    T2 = Im[0];
+		    T3 = T1 - T2;
+		    T9 = T1 + T2;
+		    T5 = Rp[0];
+		    T6 = Rm[0];
+		    T7 = T5 - T6;
+		    Tb = T5 + T6;
+	       }
+	       {
+		    E Ta, Tc, T4, T8;
+		    T4 = W[0];
+		    T8 = W[1];
+		    Ta = FNMS(T8, T9, T4 * T7);
+		    Tc = FMA(T8, T7, T4 * T9);
+		    Ip[0] = T3 + Ta;
+		    Rp[0] = Tb - Tc;
+		    Im[0] = Ta - T3;
+		    Rm[0] = Tb + Tc;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cbdft_2", twinstr, &GENUS, {8, 2, 2, 0} };
+
+void X(codelet_hc2cbdft_2) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1135 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft_20 -include hc2cb.h */
+
+/*
+ * This function contains 286 FP additions, 148 FP multiplications,
+ * (or, 176 additions, 38 multiplications, 110 fused multiply/add),
+ * 122 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T5s, T5v, T5t, T5z, T5q, T5y, T5u, T5A, T5w;
+	       {
+		    E T3T, T27, T2o, T41, T2p, T40, TU, T15, T2Q, T1N, T2L, T1w, T59, T4n, T5e;
+		    E T4A, T2m, T24, T2Z, T2h, T4J, T3P, T3Y, T3W, T2d, TJ, T3H, T2c, TD, T52;
+		    E T3G, T1E, T4f, T5I, T4e, T4w, T5L, T4v, T1J, T1H;
+		    {
+			 E T1A, T3, T25, TI, TF, T6, T26, T1D, TO, T47, T3z, Te, T1S, T3M, T1e;
+			 E T4k, TZ, T4a, T3C, Tt, T1Z, T3J, T1p, T4h, T14, T4b, T3D, TA, T22, T3K;
+			 E T1u, T4i, Ti, T1f, Th, T1T, TS, Tj, T1g, T1h;
+			 {
+			      E T4, T5, T1B, T1C;
+			      {
+				   E T1, T2, TG, TH;
+				   T1 = Rp[0];
+				   T2 = Rm[WS(rs, 9)];
+				   TG = Ip[0];
+				   TH = Im[WS(rs, 9)];
+				   T4 = Rp[WS(rs, 5)];
+				   T1A = T1 - T2;
+				   T3 = T1 + T2;
+				   T25 = TG - TH;
+				   TI = TG + TH;
+				   T5 = Rm[WS(rs, 4)];
+				   T1B = Ip[WS(rs, 5)];
+				   T1C = Im[WS(rs, 4)];
+			      }
+			      {
+				   E Tq, T1l, Tp, T1X, TY, Tr, T1m, T1n;
+				   {
+					E Tb, T1a, Ta, T1Q, TN, Tc, T1b, T1c;
+					{
+					     E T8, T9, TL, TM;
+					     T8 = Rp[WS(rs, 4)];
+					     TF = T4 - T5;
+					     T6 = T4 + T5;
+					     T26 = T1B - T1C;
+					     T1D = T1B + T1C;
+					     T9 = Rm[WS(rs, 5)];
+					     TL = Ip[WS(rs, 4)];
+					     TM = Im[WS(rs, 5)];
+					     Tb = Rp[WS(rs, 9)];
+					     T1a = T8 - T9;
+					     Ta = T8 + T9;
+					     T1Q = TL - TM;
+					     TN = TL + TM;
+					     Tc = Rm[0];
+					     T1b = Ip[WS(rs, 9)];
+					     T1c = Im[0];
+					}
+					{
+					     E Tn, To, TW, TX;
+					     Tn = Rp[WS(rs, 8)];
+					     {
+						  E TK, Td, T1R, T1d;
+						  TK = Tb - Tc;
+						  Td = Tb + Tc;
+						  T1R = T1b - T1c;
+						  T1d = T1b + T1c;
+						  TO = TK + TN;
+						  T47 = TN - TK;
+						  T3z = Ta - Td;
+						  Te = Ta + Td;
+						  T1S = T1Q + T1R;
+						  T3M = T1Q - T1R;
+						  T1e = T1a - T1d;
+						  T4k = T1a + T1d;
+						  To = Rm[WS(rs, 1)];
+					     }
+					     TW = Ip[WS(rs, 8)];
+					     TX = Im[WS(rs, 1)];
+					     Tq = Rm[WS(rs, 6)];
+					     T1l = Tn - To;
+					     Tp = Tn + To;
+					     T1X = TW - TX;
+					     TY = TW + TX;
+					     Tr = Rp[WS(rs, 3)];
+					     T1m = Im[WS(rs, 6)];
+					     T1n = Ip[WS(rs, 3)];
+					}
+				   }
+				   {
+					E Tx, T1q, Tw, T20, T13, Ty, T1r, T1s;
+					{
+					     E Tu, Tv, T11, T12;
+					     Tu = Rm[WS(rs, 7)];
+					     {
+						  E TV, Ts, T1Y, T1o;
+						  TV = Tq - Tr;
+						  Ts = Tq + Tr;
+						  T1Y = T1n - T1m;
+						  T1o = T1m + T1n;
+						  TZ = TV + TY;
+						  T4a = TY - TV;
+						  T3C = Tp - Ts;
+						  Tt = Tp + Ts;
+						  T1Z = T1X + T1Y;
+						  T3J = T1X - T1Y;
+						  T1p = T1l + T1o;
+						  T4h = T1l - T1o;
+						  Tv = Rp[WS(rs, 2)];
+					     }
+					     T11 = Im[WS(rs, 7)];
+					     T12 = Ip[WS(rs, 2)];
+					     Tx = Rm[WS(rs, 2)];
+					     T1q = Tu - Tv;
+					     Tw = Tu + Tv;
+					     T20 = T12 - T11;
+					     T13 = T11 + T12;
+					     Ty = Rp[WS(rs, 7)];
+					     T1r = Im[WS(rs, 2)];
+					     T1s = Ip[WS(rs, 7)];
+					}
+					{
+					     E Tf, Tg, TQ, TR;
+					     Tf = Rm[WS(rs, 3)];
+					     {
+						  E T10, Tz, T21, T1t;
+						  T10 = Tx - Ty;
+						  Tz = Tx + Ty;
+						  T21 = T1s - T1r;
+						  T1t = T1r + T1s;
+						  T14 = T10 - T13;
+						  T4b = T10 + T13;
+						  T3D = Tw - Tz;
+						  TA = Tw + Tz;
+						  T22 = T20 + T21;
+						  T3K = T20 - T21;
+						  T1u = T1q + T1t;
+						  T4i = T1q - T1t;
+						  Tg = Rp[WS(rs, 6)];
+					     }
+					     TQ = Im[WS(rs, 3)];
+					     TR = Ip[WS(rs, 6)];
+					     Ti = Rp[WS(rs, 1)];
+					     T1f = Tf - Tg;
+					     Th = Tf + Tg;
+					     T1T = TR - TQ;
+					     TS = TQ + TR;
+					     Tj = Rm[WS(rs, 8)];
+					     T1g = Ip[WS(rs, 1)];
+					     T1h = Im[WS(rs, 8)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T1V, T3N, TB, T3B, Tm, T3E, T1F, T1G, T4t, T4j, T4m, T4s, T4c, T4y, T4z;
+			      E T49, T3y, T7;
+			      {
+				   E TT, T48, T1j, T4l, T3A, Tl;
+				   T3T = T25 - T26;
+				   T27 = T25 + T26;
+				   {
+					E TP, Tk, T1U, T1i;
+					TP = Ti - Tj;
+					Tk = Ti + Tj;
+					T1U = T1g - T1h;
+					T1i = T1g + T1h;
+					TT = TP - TS;
+					T48 = TP + TS;
+					T3A = Th - Tk;
+					Tl = Th + Tk;
+					T1V = T1T + T1U;
+					T3N = T1T - T1U;
+					T1j = T1f - T1i;
+					T4l = T1f + T1i;
+					T2o = Tt - TA;
+					TB = Tt + TA;
+				   }
+				   T41 = T3z - T3A;
+				   T3B = T3z + T3A;
+				   Tm = Te + Tl;
+				   T2p = Te - Tl;
+				   {
+					E T1L, T1M, T1k, T1v;
+					T40 = T3C - T3D;
+					T3E = T3C + T3D;
+					TU = TO + TT;
+					T1L = TO - TT;
+					T1M = TZ - T14;
+					T15 = TZ + T14;
+					T1F = T1e + T1j;
+					T1k = T1e - T1j;
+					T1v = T1p - T1u;
+					T1G = T1p + T1u;
+					T4t = T4h + T4i;
+					T4j = T4h - T4i;
+					T2Q = FNMS(KP618033988, T1L, T1M);
+					T1N = FMA(KP618033988, T1M, T1L);
+					T2L = FNMS(KP618033988, T1k, T1v);
+					T1w = FMA(KP618033988, T1v, T1k);
+					T4m = T4k - T4l;
+					T4s = T4k + T4l;
+					T4c = T4a - T4b;
+					T4y = T4a + T4b;
+					T4z = T47 + T48;
+					T49 = T47 - T48;
+				   }
+			      }
+			      {
+				   E T2g, T1W, T23, T2f;
+				   T2g = T1S - T1V;
+				   T1W = T1S + T1V;
+				   T59 = FMA(KP618033988, T4j, T4m);
+				   T4n = FNMS(KP618033988, T4m, T4j);
+				   T5e = FMA(KP618033988, T4y, T4z);
+				   T4A = FNMS(KP618033988, T4z, T4y);
+				   T23 = T1Z + T22;
+				   T2f = T1Z - T22;
+				   {
+					E T3V, T3L, T3O, T3U;
+					T3V = T3J + T3K;
+					T3L = T3J - T3K;
+					T2m = T1W - T23;
+					T24 = T1W + T23;
+					T2Z = FMA(KP618033988, T2f, T2g);
+					T2h = FNMS(KP618033988, T2g, T2f);
+					T3O = T3M - T3N;
+					T3U = T3M + T3N;
+					T3y = T3 - T6;
+					T7 = T3 + T6;
+					T4J = FMA(KP618033988, T3L, T3O);
+					T3P = FNMS(KP618033988, T3O, T3L);
+					T3Y = T3U - T3V;
+					T3W = T3U + T3V;
+				   }
+			      }
+			      {
+				   E T46, TC, T3F, T4r, T4d, T4u;
+				   TC = Tm + TB;
+				   T2d = Tm - TB;
+				   TJ = TF + TI;
+				   T46 = TI - TF;
+				   T3H = T3B - T3E;
+				   T3F = T3B + T3E;
+				   T2c = FNMS(KP250000000, TC, T7);
+				   TD = T7 + TC;
+				   T52 = T3y + T3F;
+				   T3G = FNMS(KP250000000, T3F, T3y);
+				   T4r = T1A + T1D;
+				   T1E = T1A - T1D;
+				   T4f = T49 - T4c;
+				   T4d = T49 + T4c;
+				   T5I = T46 + T4d;
+				   T4e = FNMS(KP250000000, T4d, T46);
+				   T4w = T4s - T4t;
+				   T4u = T4s + T4t;
+				   T5L = T4u + T4r;
+				   T4v = FNMS(KP250000000, T4u, T4r);
+				   T1J = T1F - T1G;
+				   T1H = T1F + T1G;
+			      }
+			 }
+		    }
+		    {
+			 E T38, T3b, T39, T3f, T36, T3e, T3a;
+			 {
+			      E T28, T3r, T3o, T3v, T3p, T2b, T2k, T35, T3l, T2H, T2r, T2j, T2z, T2D, T2G;
+			      E T2X, T2F, T2T, T32, T3h, T3k, T31, T3d, T3j, T3t, T1x, T2u, T1O, T2x, T2v;
+			      E T1y, T2B, T29, T2J, T2M, T2R, T2N, T2V;
+			      {
+				   E T2l, T1I, T18, T2q, T34, T17, T16, T3n;
+				   T28 = T24 + T27;
+				   T2l = FNMS(KP250000000, T24, T27);
+				   T3r = T1H + T1E;
+				   T1I = FNMS(KP250000000, T1H, T1E);
+				   T18 = TU - T15;
+				   T16 = TU + T15;
+				   T3n = W[8];
+				   T2q = FNMS(KP618033988, T2p, T2o);
+				   T34 = FMA(KP618033988, T2o, T2p);
+				   T17 = FNMS(KP250000000, T16, TJ);
+				   T3o = TJ + T16;
+				   T3v = T3n * T3r;
+				   T3p = T3n * T3o;
+				   {
+					E T2Y, T2E, T3i, T30;
+					{
+					     E T2e, T33, T2n, T2i;
+					     T2Y = FMA(KP559016994, T2d, T2c);
+					     T2e = FNMS(KP559016994, T2d, T2c);
+					     T2b = W[14];
+					     T2k = W[15];
+					     T33 = FMA(KP559016994, T2m, T2l);
+					     T2n = FNMS(KP559016994, T2m, T2l);
+					     T2E = FMA(KP951056516, T2h, T2e);
+					     T2i = FNMS(KP951056516, T2h, T2e);
+					     T35 = FMA(KP951056516, T34, T33);
+					     T3l = FNMS(KP951056516, T34, T33);
+					     T2H = FNMS(KP951056516, T2q, T2n);
+					     T2r = FMA(KP951056516, T2q, T2n);
+					     T2j = T2b * T2i;
+					     T2z = T2k * T2i;
+					     T2D = W[22];
+					     T2G = W[23];
+					}
+					T2X = W[30];
+					T2F = T2D * T2E;
+					T2T = T2G * T2E;
+					T3i = FMA(KP951056516, T2Z, T2Y);
+					T30 = FNMS(KP951056516, T2Z, T2Y);
+					T32 = W[31];
+					T3h = W[6];
+					T3k = W[7];
+					T31 = T2X * T30;
+					T3d = T32 * T30;
+					T3j = T3h * T3i;
+					T3t = T3k * T3i;
+				   }
+				   {
+					E T2K, T2P, TE, T19, T1K, T2t, T37;
+					T2K = FNMS(KP559016994, T18, T17);
+					T19 = FMA(KP559016994, T18, T17);
+					T1K = FMA(KP559016994, T1J, T1I);
+					T2P = FNMS(KP559016994, T1J, T1I);
+					TE = W[0];
+					T2t = W[16];
+					T1x = FMA(KP951056516, T1w, T19);
+					T2u = FNMS(KP951056516, T1w, T19);
+					T1O = FNMS(KP951056516, T1N, T1K);
+					T2x = FMA(KP951056516, T1N, T1K);
+					T2v = T2t * T2u;
+					T1y = TE * T1x;
+					T2B = T2t * T2x;
+					T29 = TE * T1O;
+					T2J = W[24];
+					T37 = W[32];
+					T2M = FMA(KP951056516, T2L, T2K);
+					T38 = FNMS(KP951056516, T2L, T2K);
+					T2R = FNMS(KP951056516, T2Q, T2P);
+					T3b = FMA(KP951056516, T2Q, T2P);
+					T39 = T37 * T38;
+					T2N = T2J * T2M;
+					T3f = T37 * T3b;
+				   }
+			      }
+			      T2V = T2J * T2R;
+			      {
+				   E T3m, T3u, T3q, T2a, T1P, T1z;
+				   T1z = W[1];
+				   T3m = FNMS(T3k, T3l, T3j);
+				   T3u = FMA(T3h, T3l, T3t);
+				   T3q = W[9];
+				   T2a = FNMS(T1z, T1x, T29);
+				   T1P = FMA(T1z, T1O, T1y);
+				   {
+					E T2s, T2A, T2w, T3w, T3s;
+					T2s = FNMS(T2k, T2r, T2j);
+					T3w = FNMS(T3q, T3o, T3v);
+					T3s = FMA(T3q, T3r, T3p);
+					Im[0] = T2a - T28;
+					Ip[0] = T28 + T2a;
+					Rm[0] = TD + T1P;
+					Rp[0] = TD - T1P;
+					Im[WS(rs, 2)] = T3w - T3u;
+					Ip[WS(rs, 2)] = T3u + T3w;
+					Rm[WS(rs, 2)] = T3m + T3s;
+					Rp[WS(rs, 2)] = T3m - T3s;
+					T2A = FMA(T2b, T2r, T2z);
+					T2w = W[17];
+					{
+					     E T2I, T2U, T2O, T2C, T2y, T2W, T2S;
+					     T2I = FNMS(T2G, T2H, T2F);
+					     T2U = FMA(T2D, T2H, T2T);
+					     T2O = W[25];
+					     T2C = FNMS(T2w, T2u, T2B);
+					     T2y = FMA(T2w, T2x, T2v);
+					     T36 = FNMS(T32, T35, T31);
+					     T2W = FNMS(T2O, T2M, T2V);
+					     T2S = FMA(T2O, T2R, T2N);
+					     Im[WS(rs, 4)] = T2C - T2A;
+					     Ip[WS(rs, 4)] = T2A + T2C;
+					     Rm[WS(rs, 4)] = T2s + T2y;
+					     Rp[WS(rs, 4)] = T2s - T2y;
+					     Im[WS(rs, 6)] = T2W - T2U;
+					     Ip[WS(rs, 6)] = T2U + T2W;
+					     Rm[WS(rs, 6)] = T2I + T2S;
+					     Rp[WS(rs, 6)] = T2I - T2S;
+					     T3e = FMA(T2X, T35, T3d);
+					     T3a = W[33];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T55, T51, T54, T53, T5h, T5P, T5J, T3x, T4P, T5F, T5p, T43, T3R, T3S, T5l;
+			      E T5o, T4D, T5n, T5x, T4H, T4M, T5B, T5E, T4L, T4X, T5D, T5N, T4S, T4o, T4V;
+			      E T4B, T4T, T4p, T4Z, T4F, T57, T5a, T5f, T5b, T5j;
+			      {
+				   E T3X, T4O, T42, T3g, T3c, T5H;
+				   T55 = T3W + T3T;
+				   T3X = FNMS(KP250000000, T3W, T3T);
+				   T51 = W[18];
+				   T3g = FNMS(T3a, T38, T3f);
+				   T3c = FMA(T3a, T3b, T39);
+				   T54 = W[19];
+				   T53 = T51 * T52;
+				   Im[WS(rs, 8)] = T3g - T3e;
+				   Ip[WS(rs, 8)] = T3e + T3g;
+				   Rm[WS(rs, 8)] = T36 + T3c;
+				   Rp[WS(rs, 8)] = T36 - T3c;
+				   T5h = T54 * T52;
+				   T5H = W[28];
+				   T4O = FMA(KP618033988, T40, T41);
+				   T42 = FNMS(KP618033988, T41, T40);
+				   T5P = T5H * T5L;
+				   T5J = T5H * T5I;
+				   {
+					E T4I, T5m, T3Q, T3I, T3Z, T4N, T4K, T5C;
+					T3I = FNMS(KP559016994, T3H, T3G);
+					T4I = FMA(KP559016994, T3H, T3G);
+					T3Z = FNMS(KP559016994, T3Y, T3X);
+					T4N = FMA(KP559016994, T3Y, T3X);
+					T3x = W[2];
+					T5m = FNMS(KP951056516, T3P, T3I);
+					T3Q = FMA(KP951056516, T3P, T3I);
+					T4P = FMA(KP951056516, T4O, T4N);
+					T5F = FNMS(KP951056516, T4O, T4N);
+					T5p = FMA(KP951056516, T42, T3Z);
+					T43 = FNMS(KP951056516, T42, T3Z);
+					T3R = T3x * T3Q;
+					T3S = W[3];
+					T5l = W[34];
+					T5o = W[35];
+					T4D = T3S * T3Q;
+					T5n = T5l * T5m;
+					T5x = T5o * T5m;
+					T4K = FNMS(KP951056516, T4J, T4I);
+					T5C = FMA(KP951056516, T4J, T4I);
+					T4H = W[10];
+					T4M = W[11];
+					T5B = W[26];
+					T5E = W[27];
+					T4L = T4H * T4K;
+					T4X = T4M * T4K;
+					T5D = T5B * T5C;
+					T5N = T5E * T5C;
+				   }
+				   {
+					E T58, T5d, T45, T4g, T4x, T4R, T5r;
+					T4g = FNMS(KP559016994, T4f, T4e);
+					T58 = FMA(KP559016994, T4f, T4e);
+					T5d = FMA(KP559016994, T4w, T4v);
+					T4x = FNMS(KP559016994, T4w, T4v);
+					T45 = W[4];
+					T4R = W[12];
+					T4S = FNMS(KP951056516, T4n, T4g);
+					T4o = FMA(KP951056516, T4n, T4g);
+					T4V = FMA(KP951056516, T4A, T4x);
+					T4B = FNMS(KP951056516, T4A, T4x);
+					T4T = T4R * T4S;
+					T4p = T45 * T4o;
+					T4Z = T4R * T4V;
+					T4F = T45 * T4B;
+					T57 = W[20];
+					T5r = W[36];
+					T5s = FNMS(KP951056516, T59, T58);
+					T5a = FMA(KP951056516, T59, T58);
+					T5v = FMA(KP951056516, T5e, T5d);
+					T5f = FNMS(KP951056516, T5e, T5d);
+					T5t = T5r * T5s;
+					T5b = T57 * T5a;
+					T5z = T5r * T5v;
+				   }
+			      }
+			      T5j = T57 * T5f;
+			      {
+				   E T44, T4E, T5G, T5O, T5K, T4G, T4C, T4q;
+				   T44 = FNMS(T3S, T43, T3R);
+				   T4E = FMA(T3x, T43, T4D);
+				   T4q = W[5];
+				   T5G = FNMS(T5E, T5F, T5D);
+				   T5O = FMA(T5B, T5F, T5N);
+				   T5K = W[29];
+				   T4G = FNMS(T4q, T4o, T4F);
+				   T4C = FMA(T4q, T4B, T4p);
+				   {
+					E T4Q, T4Y, T4U, T5Q, T5M;
+					T4Q = FNMS(T4M, T4P, T4L);
+					T5Q = FNMS(T5K, T5I, T5P);
+					T5M = FMA(T5K, T5L, T5J);
+					Im[WS(rs, 1)] = T4G - T4E;
+					Ip[WS(rs, 1)] = T4E + T4G;
+					Rm[WS(rs, 1)] = T44 + T4C;
+					Rp[WS(rs, 1)] = T44 - T4C;
+					Im[WS(rs, 7)] = T5Q - T5O;
+					Ip[WS(rs, 7)] = T5O + T5Q;
+					Rm[WS(rs, 7)] = T5G + T5M;
+					Rp[WS(rs, 7)] = T5G - T5M;
+					T4Y = FMA(T4H, T4P, T4X);
+					T4U = W[13];
+					{
+					     E T56, T5i, T5c, T50, T4W, T5k, T5g;
+					     T56 = FNMS(T54, T55, T53);
+					     T5i = FMA(T51, T55, T5h);
+					     T5c = W[21];
+					     T50 = FNMS(T4U, T4S, T4Z);
+					     T4W = FMA(T4U, T4V, T4T);
+					     T5q = FNMS(T5o, T5p, T5n);
+					     T5k = FNMS(T5c, T5a, T5j);
+					     T5g = FMA(T5c, T5f, T5b);
+					     Im[WS(rs, 3)] = T50 - T4Y;
+					     Ip[WS(rs, 3)] = T4Y + T50;
+					     Rm[WS(rs, 3)] = T4Q + T4W;
+					     Rp[WS(rs, 3)] = T4Q - T4W;
+					     Im[WS(rs, 5)] = T5k - T5i;
+					     Ip[WS(rs, 5)] = T5i + T5k;
+					     Rm[WS(rs, 5)] = T56 + T5g;
+					     Rp[WS(rs, 5)] = T56 - T5g;
+					     T5y = FMA(T5l, T5p, T5x);
+					     T5u = W[37];
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T5A = FNMS(T5u, T5s, T5z);
+	       T5w = FMA(T5u, T5v, T5t);
+	       Im[WS(rs, 9)] = T5A - T5y;
+	       Ip[WS(rs, 9)] = T5y + T5A;
+	       Rm[WS(rs, 9)] = T5q + T5w;
+	       Rp[WS(rs, 9)] = T5q - T5w;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cbdft_20", twinstr, &GENUS, {176, 38, 110, 0} };
+
+void X(codelet_hc2cbdft_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft_20 -include hc2cb.h */
+
+/*
+ * This function contains 286 FP additions, 124 FP multiplications,
+ * (or, 224 additions, 62 multiplications, 62 fused multiply/add),
+ * 89 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T7, T3N, T4a, T16, T1G, T3g, T3D, T26, T1k, T3A, T3B, T1v, T2e, T48, T47;
+	       E T2d, T1L, T43, T40, T1K, T2l, T3t, T2m, T3w, T3n, T3p, TC, T2b, T4d, T4f;
+	       E T23, T2j, T1B, T1H, T3U, T3W, T3G, T3I, T11, T17;
+	       {
+		    E T3, T1C, T15, T24, T6, T12, T1F, T25;
+		    {
+			 E T1, T2, T13, T14;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 9)];
+			 T3 = T1 + T2;
+			 T1C = T1 - T2;
+			 T13 = Ip[0];
+			 T14 = Im[WS(rs, 9)];
+			 T15 = T13 + T14;
+			 T24 = T13 - T14;
+		    }
+		    {
+			 E T4, T5, T1D, T1E;
+			 T4 = Rp[WS(rs, 5)];
+			 T5 = Rm[WS(rs, 4)];
+			 T6 = T4 + T5;
+			 T12 = T4 - T5;
+			 T1D = Ip[WS(rs, 5)];
+			 T1E = Im[WS(rs, 4)];
+			 T1F = T1D + T1E;
+			 T25 = T1D - T1E;
+		    }
+		    T7 = T3 + T6;
+		    T3N = T15 - T12;
+		    T4a = T1C + T1F;
+		    T16 = T12 + T15;
+		    T1G = T1C - T1F;
+		    T3g = T3 - T6;
+		    T3D = T24 - T25;
+		    T26 = T24 + T25;
+	       }
+	       {
+		    E Te, T3O, T3Y, TJ, T1e, T3h, T3r, T1R, TA, T3S, T42, TZ, T1u, T3l, T3v;
+		    E T21, Tl, T3P, T3Z, TO, T1j, T3i, T3s, T1U, Tt, T3R, T41, TU, T1p, T3k;
+		    E T3u, T1Y;
+		    {
+			 E Ta, T1a, TI, T1P, Td, TF, T1d, T1Q;
+			 {
+			      E T8, T9, TG, TH;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 5)];
+			      Ta = T8 + T9;
+			      T1a = T8 - T9;
+			      TG = Ip[WS(rs, 4)];
+			      TH = Im[WS(rs, 5)];
+			      TI = TG + TH;
+			      T1P = TG - TH;
+			 }
+			 {
+			      E Tb, Tc, T1b, T1c;
+			      Tb = Rp[WS(rs, 9)];
+			      Tc = Rm[0];
+			      Td = Tb + Tc;
+			      TF = Tb - Tc;
+			      T1b = Ip[WS(rs, 9)];
+			      T1c = Im[0];
+			      T1d = T1b + T1c;
+			      T1Q = T1b - T1c;
+			 }
+			 Te = Ta + Td;
+			 T3O = TI - TF;
+			 T3Y = T1a + T1d;
+			 TJ = TF + TI;
+			 T1e = T1a - T1d;
+			 T3h = Ta - Td;
+			 T3r = T1P - T1Q;
+			 T1R = T1P + T1Q;
+		    }
+		    {
+			 E Tw, T1q, TY, T1Z, Tz, TV, T1t, T20;
+			 {
+			      E Tu, Tv, TW, TX;
+			      Tu = Rm[WS(rs, 7)];
+			      Tv = Rp[WS(rs, 2)];
+			      Tw = Tu + Tv;
+			      T1q = Tu - Tv;
+			      TW = Im[WS(rs, 7)];
+			      TX = Ip[WS(rs, 2)];
+			      TY = TW + TX;
+			      T1Z = TX - TW;
+			 }
+			 {
+			      E Tx, Ty, T1r, T1s;
+			      Tx = Rm[WS(rs, 2)];
+			      Ty = Rp[WS(rs, 7)];
+			      Tz = Tx + Ty;
+			      TV = Tx - Ty;
+			      T1r = Im[WS(rs, 2)];
+			      T1s = Ip[WS(rs, 7)];
+			      T1t = T1r + T1s;
+			      T20 = T1s - T1r;
+			 }
+			 TA = Tw + Tz;
+			 T3S = TV + TY;
+			 T42 = T1q - T1t;
+			 TZ = TV - TY;
+			 T1u = T1q + T1t;
+			 T3l = Tw - Tz;
+			 T3v = T1Z - T20;
+			 T21 = T1Z + T20;
+		    }
+		    {
+			 E Th, T1f, TN, T1S, Tk, TK, T1i, T1T;
+			 {
+			      E Tf, Tg, TL, TM;
+			      Tf = Rm[WS(rs, 3)];
+			      Tg = Rp[WS(rs, 6)];
+			      Th = Tf + Tg;
+			      T1f = Tf - Tg;
+			      TL = Im[WS(rs, 3)];
+			      TM = Ip[WS(rs, 6)];
+			      TN = TL + TM;
+			      T1S = TM - TL;
+			 }
+			 {
+			      E Ti, Tj, T1g, T1h;
+			      Ti = Rp[WS(rs, 1)];
+			      Tj = Rm[WS(rs, 8)];
+			      Tk = Ti + Tj;
+			      TK = Ti - Tj;
+			      T1g = Ip[WS(rs, 1)];
+			      T1h = Im[WS(rs, 8)];
+			      T1i = T1g + T1h;
+			      T1T = T1g - T1h;
+			 }
+			 Tl = Th + Tk;
+			 T3P = TK + TN;
+			 T3Z = T1f + T1i;
+			 TO = TK - TN;
+			 T1j = T1f - T1i;
+			 T3i = Th - Tk;
+			 T3s = T1S - T1T;
+			 T1U = T1S + T1T;
+		    }
+		    {
+			 E Tp, T1l, TT, T1W, Ts, TQ, T1o, T1X;
+			 {
+			      E Tn, To, TR, TS;
+			      Tn = Rp[WS(rs, 8)];
+			      To = Rm[WS(rs, 1)];
+			      Tp = Tn + To;
+			      T1l = Tn - To;
+			      TR = Ip[WS(rs, 8)];
+			      TS = Im[WS(rs, 1)];
+			      TT = TR + TS;
+			      T1W = TR - TS;
+			 }
+			 {
+			      E Tq, Tr, T1m, T1n;
+			      Tq = Rm[WS(rs, 6)];
+			      Tr = Rp[WS(rs, 3)];
+			      Ts = Tq + Tr;
+			      TQ = Tq - Tr;
+			      T1m = Im[WS(rs, 6)];
+			      T1n = Ip[WS(rs, 3)];
+			      T1o = T1m + T1n;
+			      T1X = T1n - T1m;
+			 }
+			 Tt = Tp + Ts;
+			 T3R = TT - TQ;
+			 T41 = T1l - T1o;
+			 TU = TQ + TT;
+			 T1p = T1l + T1o;
+			 T3k = Tp - Ts;
+			 T3u = T1W - T1X;
+			 T1Y = T1W + T1X;
+		    }
+		    T1k = T1e - T1j;
+		    T3A = T3h - T3i;
+		    T3B = T3k - T3l;
+		    T1v = T1p - T1u;
+		    T2e = T1Y - T21;
+		    T48 = T3R + T3S;
+		    T47 = T3O + T3P;
+		    T2d = T1R - T1U;
+		    T1L = TU - TZ;
+		    T43 = T41 - T42;
+		    T40 = T3Y - T3Z;
+		    T1K = TJ - TO;
+		    T2l = Te - Tl;
+		    T3t = T3r - T3s;
+		    T2m = Tt - TA;
+		    T3w = T3u - T3v;
+		    {
+			 E T3j, T3m, Tm, TB;
+			 T3j = T3h + T3i;
+			 T3m = T3k + T3l;
+			 T3n = T3j + T3m;
+			 T3p = KP559016994 * (T3j - T3m);
+			 Tm = Te + Tl;
+			 TB = Tt + TA;
+			 TC = Tm + TB;
+			 T2b = KP559016994 * (Tm - TB);
+		    }
+		    {
+			 E T4b, T4c, T3Q, T3T;
+			 T4b = T3Y + T3Z;
+			 T4c = T41 + T42;
+			 T4d = T4b + T4c;
+			 T4f = KP559016994 * (T4b - T4c);
+			 {
+			      E T1V, T22, T1z, T1A;
+			      T1V = T1R + T1U;
+			      T22 = T1Y + T21;
+			      T23 = T1V + T22;
+			      T2j = KP559016994 * (T1V - T22);
+			      T1z = T1e + T1j;
+			      T1A = T1p + T1u;
+			      T1B = KP559016994 * (T1z - T1A);
+			      T1H = T1z + T1A;
+			 }
+			 T3Q = T3O - T3P;
+			 T3T = T3R - T3S;
+			 T3U = T3Q + T3T;
+			 T3W = KP559016994 * (T3Q - T3T);
+			 {
+			      E T3E, T3F, TP, T10;
+			      T3E = T3r + T3s;
+			      T3F = T3u + T3v;
+			      T3G = T3E + T3F;
+			      T3I = KP559016994 * (T3E - T3F);
+			      TP = TJ + TO;
+			      T10 = TU + TZ;
+			      T11 = KP559016994 * (TP - T10);
+			      T17 = TP + T10;
+			 }
+		    }
+	       }
+	       {
+		    E TD, T27, T3c, T3e, T2o, T36, T2A, T2U, T1N, T2Z, T2t, T2J, T1x, T2X, T2r;
+		    E T2F, T2g, T34, T2y, T2Q;
+		    TD = T7 + TC;
+		    T27 = T23 + T26;
+		    {
+			 E T39, T3b, T38, T3a;
+			 T39 = T16 + T17;
+			 T3b = T1H + T1G;
+			 T38 = W[8];
+			 T3a = W[9];
+			 T3c = FMA(T38, T39, T3a * T3b);
+			 T3e = FNMS(T3a, T39, T38 * T3b);
+		    }
+		    {
+			 E T2n, T2S, T2k, T2T, T2i;
+			 T2n = FNMS(KP951056516, T2m, KP587785252 * T2l);
+			 T2S = FMA(KP951056516, T2l, KP587785252 * T2m);
+			 T2i = FNMS(KP250000000, T23, T26);
+			 T2k = T2i - T2j;
+			 T2T = T2j + T2i;
+			 T2o = T2k - T2n;
+			 T36 = T2T - T2S;
+			 T2A = T2n + T2k;
+			 T2U = T2S + T2T;
+		    }
+		    {
+			 E T1M, T2H, T1J, T2I, T1I;
+			 T1M = FMA(KP951056516, T1K, KP587785252 * T1L);
+			 T2H = FNMS(KP951056516, T1L, KP587785252 * T1K);
+			 T1I = FNMS(KP250000000, T1H, T1G);
+			 T1J = T1B + T1I;
+			 T2I = T1I - T1B;
+			 T1N = T1J - T1M;
+			 T2Z = T2I - T2H;
+			 T2t = T1M + T1J;
+			 T2J = T2H + T2I;
+		    }
+		    {
+			 E T1w, T2E, T19, T2D, T18;
+			 T1w = FMA(KP951056516, T1k, KP587785252 * T1v);
+			 T2E = FNMS(KP951056516, T1v, KP587785252 * T1k);
+			 T18 = FNMS(KP250000000, T17, T16);
+			 T19 = T11 + T18;
+			 T2D = T18 - T11;
+			 T1x = T19 + T1w;
+			 T2X = T2D + T2E;
+			 T2r = T19 - T1w;
+			 T2F = T2D - T2E;
+		    }
+		    {
+			 E T2f, T2P, T2c, T2O, T2a;
+			 T2f = FNMS(KP951056516, T2e, KP587785252 * T2d);
+			 T2P = FMA(KP951056516, T2d, KP587785252 * T2e);
+			 T2a = FNMS(KP250000000, TC, T7);
+			 T2c = T2a - T2b;
+			 T2O = T2b + T2a;
+			 T2g = T2c + T2f;
+			 T34 = T2O + T2P;
+			 T2y = T2c - T2f;
+			 T2Q = T2O - T2P;
+		    }
+		    {
+			 E T1O, T28, TE, T1y;
+			 TE = W[0];
+			 T1y = W[1];
+			 T1O = FMA(TE, T1x, T1y * T1N);
+			 T28 = FNMS(T1y, T1x, TE * T1N);
+			 Rp[0] = TD - T1O;
+			 Ip[0] = T27 + T28;
+			 Rm[0] = TD + T1O;
+			 Im[0] = T28 - T27;
+		    }
+		    {
+			 E T37, T3d, T33, T35;
+			 T33 = W[6];
+			 T35 = W[7];
+			 T37 = FNMS(T35, T36, T33 * T34);
+			 T3d = FMA(T35, T34, T33 * T36);
+			 Rp[WS(rs, 2)] = T37 - T3c;
+			 Ip[WS(rs, 2)] = T3d + T3e;
+			 Rm[WS(rs, 2)] = T37 + T3c;
+			 Im[WS(rs, 2)] = T3e - T3d;
+		    }
+		    {
+			 E T2p, T2v, T2u, T2w;
+			 {
+			      E T29, T2h, T2q, T2s;
+			      T29 = W[14];
+			      T2h = W[15];
+			      T2p = FNMS(T2h, T2o, T29 * T2g);
+			      T2v = FMA(T2h, T2g, T29 * T2o);
+			      T2q = W[16];
+			      T2s = W[17];
+			      T2u = FMA(T2q, T2r, T2s * T2t);
+			      T2w = FNMS(T2s, T2r, T2q * T2t);
+			 }
+			 Rp[WS(rs, 4)] = T2p - T2u;
+			 Ip[WS(rs, 4)] = T2v + T2w;
+			 Rm[WS(rs, 4)] = T2p + T2u;
+			 Im[WS(rs, 4)] = T2w - T2v;
+		    }
+		    {
+			 E T2B, T2L, T2K, T2M;
+			 {
+			      E T2x, T2z, T2C, T2G;
+			      T2x = W[22];
+			      T2z = W[23];
+			      T2B = FNMS(T2z, T2A, T2x * T2y);
+			      T2L = FMA(T2z, T2y, T2x * T2A);
+			      T2C = W[24];
+			      T2G = W[25];
+			      T2K = FMA(T2C, T2F, T2G * T2J);
+			      T2M = FNMS(T2G, T2F, T2C * T2J);
+			 }
+			 Rp[WS(rs, 6)] = T2B - T2K;
+			 Ip[WS(rs, 6)] = T2L + T2M;
+			 Rm[WS(rs, 6)] = T2B + T2K;
+			 Im[WS(rs, 6)] = T2M - T2L;
+		    }
+		    {
+			 E T2V, T31, T30, T32;
+			 {
+			      E T2N, T2R, T2W, T2Y;
+			      T2N = W[30];
+			      T2R = W[31];
+			      T2V = FNMS(T2R, T2U, T2N * T2Q);
+			      T31 = FMA(T2R, T2Q, T2N * T2U);
+			      T2W = W[32];
+			      T2Y = W[33];
+			      T30 = FMA(T2W, T2X, T2Y * T2Z);
+			      T32 = FNMS(T2Y, T2X, T2W * T2Z);
+			 }
+			 Rp[WS(rs, 8)] = T2V - T30;
+			 Ip[WS(rs, 8)] = T31 + T32;
+			 Rm[WS(rs, 8)] = T2V + T30;
+			 Im[WS(rs, 8)] = T32 - T31;
+		    }
+	       }
+	       {
+		    E T4F, T4P, T5c, T5e, T3y, T54, T4o, T4S, T4h, T4Z, T4x, T4N, T45, T4X, T4v;
+		    E T4J, T3K, T56, T4s, T4U;
+		    {
+			 E T4C, T4E, T4B, T4D;
+			 T4C = T3g + T3n;
+			 T4E = T3G + T3D;
+			 T4B = W[18];
+			 T4D = W[19];
+			 T4F = FNMS(T4D, T4E, T4B * T4C);
+			 T4P = FMA(T4D, T4C, T4B * T4E);
+		    }
+		    {
+			 E T59, T5b, T58, T5a;
+			 T59 = T3N + T3U;
+			 T5b = T4d + T4a;
+			 T58 = W[28];
+			 T5a = W[29];
+			 T5c = FMA(T58, T59, T5a * T5b);
+			 T5e = FNMS(T5a, T59, T58 * T5b);
+		    }
+		    {
+			 E T3x, T4n, T3q, T4m, T3o;
+			 T3x = FNMS(KP951056516, T3w, KP587785252 * T3t);
+			 T4n = FMA(KP951056516, T3t, KP587785252 * T3w);
+			 T3o = FNMS(KP250000000, T3n, T3g);
+			 T3q = T3o - T3p;
+			 T4m = T3p + T3o;
+			 T3y = T3q - T3x;
+			 T54 = T4m + T4n;
+			 T4o = T4m - T4n;
+			 T4S = T3q + T3x;
+		    }
+		    {
+			 E T49, T4M, T4g, T4L, T4e;
+			 T49 = FNMS(KP951056516, T48, KP587785252 * T47);
+			 T4M = FMA(KP951056516, T47, KP587785252 * T48);
+			 T4e = FNMS(KP250000000, T4d, T4a);
+			 T4g = T4e - T4f;
+			 T4L = T4f + T4e;
+			 T4h = T49 + T4g;
+			 T4Z = T4M + T4L;
+			 T4x = T4g - T49;
+			 T4N = T4L - T4M;
+		    }
+		    {
+			 E T44, T4I, T3X, T4H, T3V;
+			 T44 = FNMS(KP951056516, T43, KP587785252 * T40);
+			 T4I = FMA(KP951056516, T40, KP587785252 * T43);
+			 T3V = FNMS(KP250000000, T3U, T3N);
+			 T3X = T3V - T3W;
+			 T4H = T3W + T3V;
+			 T45 = T3X - T44;
+			 T4X = T4H - T4I;
+			 T4v = T3X + T44;
+			 T4J = T4H + T4I;
+		    }
+		    {
+			 E T3C, T4q, T3J, T4r, T3H;
+			 T3C = FNMS(KP951056516, T3B, KP587785252 * T3A);
+			 T4q = FMA(KP951056516, T3A, KP587785252 * T3B);
+			 T3H = FNMS(KP250000000, T3G, T3D);
+			 T3J = T3H - T3I;
+			 T4r = T3I + T3H;
+			 T3K = T3C + T3J;
+			 T56 = T4r - T4q;
+			 T4s = T4q + T4r;
+			 T4U = T3J - T3C;
+		    }
+		    {
+			 E T4O, T4Q, T4G, T4K;
+			 T4G = W[20];
+			 T4K = W[21];
+			 T4O = FMA(T4G, T4J, T4K * T4N);
+			 T4Q = FNMS(T4K, T4J, T4G * T4N);
+			 Rp[WS(rs, 5)] = T4F - T4O;
+			 Ip[WS(rs, 5)] = T4P + T4Q;
+			 Rm[WS(rs, 5)] = T4F + T4O;
+			 Im[WS(rs, 5)] = T4Q - T4P;
+		    }
+		    {
+			 E T57, T5d, T53, T55;
+			 T53 = W[26];
+			 T55 = W[27];
+			 T57 = FNMS(T55, T56, T53 * T54);
+			 T5d = FMA(T55, T54, T53 * T56);
+			 Rp[WS(rs, 7)] = T57 - T5c;
+			 Ip[WS(rs, 7)] = T5d + T5e;
+			 Rm[WS(rs, 7)] = T57 + T5c;
+			 Im[WS(rs, 7)] = T5e - T5d;
+		    }
+		    {
+			 E T3L, T4j, T4i, T4k;
+			 {
+			      E T3f, T3z, T3M, T46;
+			      T3f = W[2];
+			      T3z = W[3];
+			      T3L = FNMS(T3z, T3K, T3f * T3y);
+			      T4j = FMA(T3z, T3y, T3f * T3K);
+			      T3M = W[4];
+			      T46 = W[5];
+			      T4i = FMA(T3M, T45, T46 * T4h);
+			      T4k = FNMS(T46, T45, T3M * T4h);
+			 }
+			 Rp[WS(rs, 1)] = T3L - T4i;
+			 Ip[WS(rs, 1)] = T4j + T4k;
+			 Rm[WS(rs, 1)] = T3L + T4i;
+			 Im[WS(rs, 1)] = T4k - T4j;
+		    }
+		    {
+			 E T4t, T4z, T4y, T4A;
+			 {
+			      E T4l, T4p, T4u, T4w;
+			      T4l = W[10];
+			      T4p = W[11];
+			      T4t = FNMS(T4p, T4s, T4l * T4o);
+			      T4z = FMA(T4p, T4o, T4l * T4s);
+			      T4u = W[12];
+			      T4w = W[13];
+			      T4y = FMA(T4u, T4v, T4w * T4x);
+			      T4A = FNMS(T4w, T4v, T4u * T4x);
+			 }
+			 Rp[WS(rs, 3)] = T4t - T4y;
+			 Ip[WS(rs, 3)] = T4z + T4A;
+			 Rm[WS(rs, 3)] = T4t + T4y;
+			 Im[WS(rs, 3)] = T4A - T4z;
+		    }
+		    {
+			 E T4V, T51, T50, T52;
+			 {
+			      E T4R, T4T, T4W, T4Y;
+			      T4R = W[34];
+			      T4T = W[35];
+			      T4V = FNMS(T4T, T4U, T4R * T4S);
+			      T51 = FMA(T4T, T4S, T4R * T4U);
+			      T4W = W[36];
+			      T4Y = W[37];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T52 = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 Rp[WS(rs, 9)] = T4V - T50;
+			 Ip[WS(rs, 9)] = T51 + T52;
+			 Rm[WS(rs, 9)] = T4V + T50;
+			 Im[WS(rs, 9)] = T52 - T51;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cbdft_20", twinstr, &GENUS, {224, 62, 62, 0} };
+
+void X(codelet_hc2cbdft_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1888 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:06 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cbdft_32 -include hc2cb.h */
+
+/*
+ * This function contains 498 FP additions, 260 FP multiplications,
+ * (or, 300 additions, 62 multiplications, 198 fused multiply/add),
+ * 165 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T8e, T8h, T7S, T8l, T8f, T84, T8c, T8k, T8g, T86, T82, T8m, T8i;
+	       {
+		    E T4B, T3h, T3K, Tv, T8Y, T6T, T8L, T7i, T8X, T7f, T4Y, T1G, T4K, T1j, T4X;
+		    E T2M, T8C, T6d, T8o, T66, T8K, T6M, T4L, T2P, T4C, T3o, T5q, T4q, T8p, T6C;
+		    E T8B, T6z, T72, T2u, T75, T10, T3P, T3a, T3L, T4t, T4E, T8F, T8t, T4F, T4w;
+		    E T8E, T8w, T6E, T6l, T6F, T6s, T76, T4P, T51, T2R, T28, T8P, T90, T7k, T71;
+		    E T2p, T4R, T2x, T73, T6x, T6y;
+		    {
+			 E T3l, T16, T3m, T2H, T2E, T13, T64, T7, T3i, T2J, T1c, T3j, T1h, T2K, Te;
+			 E T1z, T6R, T6a, Tt, T3g, T6b, T1E, T6Q, Tj, T1p, Ti, T3b, T1n, Tk, T1q;
+			 E T1r;
+			 {
+			      E T1, T2, T4, T5;
+			      {
+				   E T14, T15, T2F, T2G;
+				   T14 = Ip[0];
+				   T15 = Im[WS(rs, 15)];
+				   T2F = Ip[WS(rs, 8)];
+				   T2G = Im[WS(rs, 7)];
+				   T1 = Rp[0];
+				   T3l = T14 - T15;
+				   T16 = T14 + T15;
+				   T3m = T2F - T2G;
+				   T2H = T2F + T2G;
+				   T2 = Rm[WS(rs, 15)];
+				   T4 = Rp[WS(rs, 8)];
+				   T5 = Rm[WS(rs, 7)];
+			      }
+			      {
+				   E T1b, T1e, T18, Ta, T1f, Tb, Tc, T8, T9, T1g, T1d, Td;
+				   {
+					E T19, T3, T6, T1a;
+					T19 = Ip[WS(rs, 4)];
+					T2E = T1 - T2;
+					T3 = T1 + T2;
+					T13 = T4 - T5;
+					T6 = T4 + T5;
+					T1a = Im[WS(rs, 11)];
+					T8 = Rp[WS(rs, 4)];
+					T9 = Rm[WS(rs, 11)];
+					T64 = T3 - T6;
+					T7 = T3 + T6;
+					T1b = T19 + T1a;
+					T3i = T19 - T1a;
+				   }
+				   T1e = Im[WS(rs, 3)];
+				   T18 = T8 - T9;
+				   Ta = T8 + T9;
+				   T1f = Ip[WS(rs, 12)];
+				   Tb = Rm[WS(rs, 3)];
+				   Tc = Rp[WS(rs, 12)];
+				   T2J = T18 - T1b;
+				   T1c = T18 + T1b;
+				   T1g = T1e + T1f;
+				   T3j = T1f - T1e;
+				   T1d = Tb - Tc;
+				   Td = Tb + Tc;
+				   T1h = T1d + T1g;
+				   T2K = T1d - T1g;
+				   T6x = Ta - Td;
+				   Te = Ta + Td;
+			      }
+			      {
+				   E Tq, T1A, Tp, T3e, T1y, Tr, T1B, T1C;
+				   {
+					E Tn, To, T1w, T1x;
+					Tn = Rm[WS(rs, 1)];
+					To = Rp[WS(rs, 14)];
+					T1w = Im[WS(rs, 1)];
+					T1x = Ip[WS(rs, 14)];
+					Tq = Rp[WS(rs, 6)];
+					T1A = Tn - To;
+					Tp = Tn + To;
+					T3e = T1x - T1w;
+					T1y = T1w + T1x;
+					Tr = Rm[WS(rs, 9)];
+					T1B = Ip[WS(rs, 6)];
+					T1C = Im[WS(rs, 9)];
+				   }
+				   {
+					E Tg, Th, T1l, T1m;
+					Tg = Rp[WS(rs, 2)];
+					{
+					     E T1v, Ts, T3f, T1D;
+					     T1v = Tq - Tr;
+					     Ts = Tq + Tr;
+					     T3f = T1B - T1C;
+					     T1D = T1B + T1C;
+					     T1z = T1v - T1y;
+					     T6R = T1v + T1y;
+					     T6a = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T3g = T3e + T3f;
+					     T6b = T3e - T3f;
+					     T1E = T1A - T1D;
+					     T6Q = T1A + T1D;
+					     Th = Rm[WS(rs, 13)];
+					}
+					T1l = Ip[WS(rs, 2)];
+					T1m = Im[WS(rs, 13)];
+					Tj = Rp[WS(rs, 10)];
+					T1p = Tg - Th;
+					Ti = Tg + Th;
+					T3b = T1l - T1m;
+					T1n = T1l + T1m;
+					Tk = Rm[WS(rs, 5)];
+					T1q = Ip[WS(rs, 10)];
+					T1r = Im[WS(rs, 5)];
+				   }
+			      }
+			 }
+			 {
+			      E T4o, T67, T68, T4p, T2I, T1i, T2N, T1u, T1F, T2O, T6K, T17;
+			      {
+				   E Tf, T1o, T1t, Tu, T7g, T6P, T6S, T7h, T7d, T7e;
+				   {
+					E T6O, T6N, T1k, Tl;
+					T4o = T7 - Te;
+					Tf = T7 + Te;
+					T1k = Tj - Tk;
+					Tl = Tj + Tk;
+					{
+					     E T3c, T1s, Tm, T3d;
+					     T3c = T1q - T1r;
+					     T1s = T1q + T1r;
+					     T1o = T1k + T1n;
+					     T6O = T1n - T1k;
+					     T67 = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T3d = T3b + T3c;
+					     T68 = T3b - T3c;
+					     T1t = T1p - T1s;
+					     T6N = T1p + T1s;
+					     T4B = Tm - Tt;
+					     Tu = Tm + Tt;
+					     T4p = T3g - T3d;
+					     T3h = T3d + T3g;
+					}
+					T7g = FNMS(KP414213562, T6N, T6O);
+					T6P = FMA(KP414213562, T6O, T6N);
+					T6S = FMA(KP414213562, T6R, T6Q);
+					T7h = FNMS(KP414213562, T6Q, T6R);
+				   }
+				   T3K = Tf - Tu;
+				   Tv = Tf + Tu;
+				   T8Y = T6P + T6S;
+				   T6T = T6P - T6S;
+				   T2I = T2E - T2H;
+				   T7d = T2E + T2H;
+				   T7e = T1c + T1h;
+				   T1i = T1c - T1h;
+				   T2N = FNMS(KP414213562, T1o, T1t);
+				   T1u = FMA(KP414213562, T1t, T1o);
+				   T8L = T7h - T7g;
+				   T7i = T7g + T7h;
+				   T8X = FMA(KP707106781, T7e, T7d);
+				   T7f = FNMS(KP707106781, T7e, T7d);
+				   T1F = FNMS(KP414213562, T1E, T1z);
+				   T2O = FMA(KP414213562, T1z, T1E);
+				   T6K = T16 - T13;
+				   T17 = T13 + T16;
+			      }
+			      {
+				   E T6L, T6A, T6B, T65, T3k, T2L, T69, T6c, T3n;
+				   T4Y = T1F - T1u;
+				   T1G = T1u + T1F;
+				   T4K = FNMS(KP707106781, T1i, T17);
+				   T1j = FMA(KP707106781, T1i, T17);
+				   T2L = T2J + T2K;
+				   T6L = T2J - T2K;
+				   T6A = T67 + T68;
+				   T69 = T67 - T68;
+				   T6c = T6a + T6b;
+				   T6B = T6b - T6a;
+				   T4X = FNMS(KP707106781, T2L, T2I);
+				   T2M = FMA(KP707106781, T2L, T2I);
+				   T8C = T69 - T6c;
+				   T6d = T69 + T6c;
+				   T65 = T3j - T3i;
+				   T3k = T3i + T3j;
+				   T8o = T64 - T65;
+				   T66 = T64 + T65;
+				   T8K = FNMS(KP707106781, T6L, T6K);
+				   T6M = FMA(KP707106781, T6L, T6K);
+				   T3n = T3l + T3m;
+				   T6y = T3l - T3m;
+				   T4L = T2N - T2O;
+				   T2P = T2N + T2O;
+				   T4C = T3n - T3k;
+				   T3o = T3k + T3n;
+				   T5q = T4o - T4p;
+				   T4q = T4o + T4p;
+				   T8p = T6B - T6A;
+				   T6C = T6A + T6B;
+			      }
+			 }
+		    }
+		    {
+			 E T1M, T6V, T6f, TC, T31, T6j, T23, T6Y, T2v, T2i, TY, T6p, T6n, T35, T2n;
+			 E T2w, T24, T1R, TJ, T6i, T6g, T2Y, T1W, T25, T2q, TN, T2r, T36, T2c, T29;
+			 E TQ, T2s;
+			 {
+			      E TU, T2k, T33, T2j, TX, T2l, T2m, T34;
+			      {
+				   E T1Z, Ty, T20, T2Z, T1L, T1I, TB, T21, T2e, T2h;
+				   {
+					E T1J, T1K, Tw, Tx, Tz, TA;
+					Tw = Rp[WS(rs, 1)];
+					Tx = Rm[WS(rs, 14)];
+					T1J = Ip[WS(rs, 1)];
+					T8B = T6y - T6x;
+					T6z = T6x + T6y;
+					T1Z = Tw - Tx;
+					Ty = Tw + Tx;
+					T1K = Im[WS(rs, 14)];
+					Tz = Rp[WS(rs, 9)];
+					TA = Rm[WS(rs, 6)];
+					T20 = Ip[WS(rs, 9)];
+					T2Z = T1J - T1K;
+					T1L = T1J + T1K;
+					T1I = Tz - TA;
+					TB = Tz + TA;
+					T21 = Im[WS(rs, 6)];
+				   }
+				   {
+					E T2f, T2g, TV, TW;
+					{
+					     E TS, T30, T22, TT;
+					     TS = Rp[WS(rs, 3)];
+					     T1M = T1I + T1L;
+					     T6V = T1L - T1I;
+					     T6f = Ty - TB;
+					     TC = Ty + TB;
+					     T30 = T20 - T21;
+					     T22 = T20 + T21;
+					     TT = Rm[WS(rs, 12)];
+					     T2f = Ip[WS(rs, 3)];
+					     T31 = T2Z + T30;
+					     T6j = T2Z - T30;
+					     T23 = T1Z - T22;
+					     T6Y = T1Z + T22;
+					     T2e = TS - TT;
+					     TU = TS + TT;
+					     T2g = Im[WS(rs, 12)];
+					}
+					TV = Rm[WS(rs, 4)];
+					TW = Rp[WS(rs, 11)];
+					T2k = Im[WS(rs, 4)];
+					T33 = T2f - T2g;
+					T2h = T2f + T2g;
+					T2j = TV - TW;
+					TX = TV + TW;
+					T2l = Ip[WS(rs, 11)];
+				   }
+				   T2v = T2e - T2h;
+				   T2i = T2e + T2h;
+			      }
+			      TY = TU + TX;
+			      T6p = TU - TX;
+			      T2m = T2k + T2l;
+			      T34 = T2l - T2k;
+			      {
+				   E TF, T1T, T2W, T1S, TI, T1U, T1N, T1Q, T1V, T2X;
+				   {
+					E T1O, T1P, TD, TE, TG, TH;
+					TD = Rp[WS(rs, 5)];
+					TE = Rm[WS(rs, 10)];
+					T6n = T34 - T33;
+					T35 = T33 + T34;
+					T2n = T2j + T2m;
+					T2w = T2j - T2m;
+					T1N = TD - TE;
+					TF = TD + TE;
+					T1O = Ip[WS(rs, 5)];
+					T1P = Im[WS(rs, 10)];
+					TG = Rm[WS(rs, 2)];
+					TH = Rp[WS(rs, 13)];
+					T1T = Im[WS(rs, 2)];
+					T2W = T1O - T1P;
+					T1Q = T1O + T1P;
+					T1S = TG - TH;
+					TI = TG + TH;
+					T1U = Ip[WS(rs, 13)];
+				   }
+				   T24 = T1N - T1Q;
+				   T1R = T1N + T1Q;
+				   TJ = TF + TI;
+				   T6i = TF - TI;
+				   T1V = T1T + T1U;
+				   T2X = T1U - T1T;
+				   {
+					E T2a, T2b, TL, TM, TO, TP;
+					TL = Rm[0];
+					TM = Rp[WS(rs, 15)];
+					T6g = T2X - T2W;
+					T2Y = T2W + T2X;
+					T1W = T1S + T1V;
+					T25 = T1S - T1V;
+					T2q = TL - TM;
+					TN = TL + TM;
+					T2a = Im[0];
+					T2b = Ip[WS(rs, 15)];
+					TO = Rp[WS(rs, 7)];
+					TP = Rm[WS(rs, 8)];
+					T2r = Ip[WS(rs, 7)];
+					T36 = T2b - T2a;
+					T2c = T2a + T2b;
+					T29 = TO - TP;
+					TQ = TO + TP;
+					T2s = Im[WS(rs, 8)];
+				   }
+			      }
+			 }
+			 {
+			      E T2d, T4u, T4v, T6r, T6o, T6k, T8u, T8v, T6h;
+			      {
+				   E T4r, T6m, T32, T4s, T6q, T39, T8r, T8s;
+				   {
+					E TK, TR, T37, T2t, TZ, T38;
+					T4r = TC - TJ;
+					TK = TC + TJ;
+					T2d = T29 - T2c;
+					T72 = T29 + T2c;
+					T6m = TN - TQ;
+					TR = TN + TQ;
+					T37 = T2r - T2s;
+					T2t = T2r + T2s;
+					T32 = T2Y + T31;
+					T4s = T31 - T2Y;
+					T4u = TR - TY;
+					TZ = TR + TY;
+					T38 = T36 + T37;
+					T6q = T36 - T37;
+					T2u = T2q - T2t;
+					T75 = T2q + T2t;
+					T10 = TK + TZ;
+					T3P = TK - TZ;
+					T4v = T38 - T35;
+					T39 = T35 + T38;
+				   }
+				   T8r = T6q - T6p;
+				   T6r = T6p + T6q;
+				   T3a = T32 + T39;
+				   T3L = T39 - T32;
+				   T8s = T6m - T6n;
+				   T6o = T6m + T6n;
+				   T4t = T4r - T4s;
+				   T4E = T4r + T4s;
+				   T8F = FNMS(KP414213562, T8r, T8s);
+				   T8t = FMA(KP414213562, T8s, T8r);
+				   T6k = T6i + T6j;
+				   T8u = T6j - T6i;
+				   T8v = T6f - T6g;
+				   T6h = T6f + T6g;
+			      }
+			      {
+				   E T6Z, T1Y, T4O, T26, T6W, T1X, T2o, T4N, T27;
+				   T4F = T4v - T4u;
+				   T4w = T4u + T4v;
+				   T8E = FMA(KP414213562, T8u, T8v);
+				   T8w = FNMS(KP414213562, T8v, T8u);
+				   T6Z = T1R + T1W;
+				   T1X = T1R - T1W;
+				   T6E = FMA(KP414213562, T6h, T6k);
+				   T6l = FNMS(KP414213562, T6k, T6h);
+				   T6F = FNMS(KP414213562, T6o, T6r);
+				   T6s = FMA(KP414213562, T6r, T6o);
+				   T1Y = FMA(KP707106781, T1X, T1M);
+				   T4O = FNMS(KP707106781, T1X, T1M);
+				   T26 = T24 + T25;
+				   T6W = T25 - T24;
+				   T76 = T2i + T2n;
+				   T2o = T2i - T2n;
+				   T4N = FNMS(KP707106781, T26, T23);
+				   T27 = FMA(KP707106781, T26, T23);
+				   {
+					E T8O, T6X, T8N, T70;
+					T8O = FMA(KP707106781, T6W, T6V);
+					T6X = FNMS(KP707106781, T6W, T6V);
+					T8N = FMA(KP707106781, T6Z, T6Y);
+					T70 = FNMS(KP707106781, T6Z, T6Y);
+					T4P = FMA(KP668178637, T4O, T4N);
+					T51 = FNMS(KP668178637, T4N, T4O);
+					T2R = FNMS(KP198912367, T1Y, T27);
+					T28 = FMA(KP198912367, T27, T1Y);
+					T8P = FMA(KP198912367, T8O, T8N);
+					T90 = FNMS(KP198912367, T8N, T8O);
+					T7k = FNMS(KP668178637, T6X, T70);
+					T71 = FMA(KP668178637, T70, T6X);
+					T2p = FMA(KP707106781, T2o, T2d);
+					T4R = FNMS(KP707106781, T2o, T2d);
+				   }
+				   T2x = T2v + T2w;
+				   T73 = T2v - T2w;
+			      }
+			 }
+		    }
+		    {
+			 E T8S, T91, T7l, T78, T5U, T5X, T5y, T61, T5V, T5K, T5S, T60, T5W, T5M, T5I;
+			 {
+			      E T4S, T50, T4e, T4h, T3S, T4l, T4f, T44, T4c, T4k, T4g, T46, T42;
+			      {
+				   E T3Q, T3U, T40, T3Z, T3V, T3A, T3D, T3H, T3B, T3y, T3G, T3C;
+				   {
+					E T11, T3t, T3w, T3q, T3x, T3v, T3F, T12, T2B, T2U, T3z, T2C;
+					{
+					     E T3u, T2S, T2z, T3p, T4Q, T2y;
+					     T3u = Tv - T10;
+					     T11 = Tv + T10;
+					     T4Q = FNMS(KP707106781, T2x, T2u);
+					     T2y = FMA(KP707106781, T2x, T2u);
+					     {
+						  E T8R, T74, T8Q, T77;
+						  T8R = FMA(KP707106781, T73, T72);
+						  T74 = FNMS(KP707106781, T73, T72);
+						  T8Q = FMA(KP707106781, T76, T75);
+						  T77 = FNMS(KP707106781, T76, T75);
+						  T4S = FNMS(KP668178637, T4R, T4Q);
+						  T50 = FMA(KP668178637, T4Q, T4R);
+						  T2S = FMA(KP198912367, T2p, T2y);
+						  T2z = FNMS(KP198912367, T2y, T2p);
+						  T8S = FMA(KP198912367, T8R, T8Q);
+						  T91 = FNMS(KP198912367, T8Q, T8R);
+						  T7l = FNMS(KP668178637, T74, T77);
+						  T78 = FMA(KP668178637, T77, T74);
+						  T3Q = T3o - T3h;
+						  T3p = T3h + T3o;
+					     }
+					     T3t = W[30];
+					     T3w = W[31];
+					     T3q = T3a + T3p;
+					     T3x = T3p - T3a;
+					     T3v = T3t * T3u;
+					     T3F = T3w * T3u;
+					     {
+						  E T1H, T2A, T2Q, T2T;
+						  T3U = FNMS(KP923879532, T1G, T1j);
+						  T1H = FMA(KP923879532, T1G, T1j);
+						  T2A = T28 + T2z;
+						  T40 = T2z - T28;
+						  T3Z = FNMS(KP923879532, T2P, T2M);
+						  T2Q = FMA(KP923879532, T2P, T2M);
+						  T2T = T2R + T2S;
+						  T3V = T2R - T2S;
+						  T12 = W[0];
+						  T3A = FNMS(KP980785280, T2A, T1H);
+						  T2B = FMA(KP980785280, T2A, T1H);
+						  T3D = FNMS(KP980785280, T2T, T2Q);
+						  T2U = FMA(KP980785280, T2T, T2Q);
+						  T3z = W[32];
+						  T2C = T12 * T2B;
+					     }
+					}
+					{
+					     E T2V, T3s, T2D, T3r;
+					     T2D = W[1];
+					     T3r = T12 * T2U;
+					     T3H = T3z * T3D;
+					     T3B = T3z * T3A;
+					     T2V = FMA(T2D, T2U, T2C);
+					     T3s = FNMS(T2D, T2B, T3r);
+					     T3y = FNMS(T3w, T3x, T3v);
+					     T3G = FMA(T3t, T3x, T3F);
+					     Rm[0] = T11 + T2V;
+					     Rp[0] = T11 - T2V;
+					     Im[0] = T3s - T3q;
+					     Ip[0] = T3q + T3s;
+					     T3C = W[33];
+					}
+				   }
+				   {
+					E T4b, T3R, T47, T4a, T3J, T49, T4j, T3O, T3N, T43, T3W, T3T, T41, T4d, T3X;
+					E T45, T3Y;
+					{
+					     E T3M, T48, T3I, T3E;
+					     T3M = T3K + T3L;
+					     T48 = T3K - T3L;
+					     T3I = FNMS(T3C, T3A, T3H);
+					     T3E = FMA(T3C, T3D, T3B);
+					     T4b = T3Q - T3P;
+					     T3R = T3P + T3Q;
+					     Im[WS(rs, 8)] = T3I - T3G;
+					     Ip[WS(rs, 8)] = T3G + T3I;
+					     Rm[WS(rs, 8)] = T3y + T3E;
+					     Rp[WS(rs, 8)] = T3y - T3E;
+					     T47 = W[46];
+					     T4a = W[47];
+					     T3J = W[14];
+					     T49 = T47 * T48;
+					     T4j = T4a * T48;
+					     T3O = W[15];
+					     T3N = T3J * T3M;
+					     T43 = T3O * T3M;
+					     T3W = FMA(KP980785280, T3V, T3U);
+					     T4e = FNMS(KP980785280, T3V, T3U);
+					     T3T = W[16];
+					     T4h = FNMS(KP980785280, T40, T3Z);
+					     T41 = FMA(KP980785280, T40, T3Z);
+					     T4d = W[48];
+					     T3X = T3T * T3W;
+					}
+					T3S = FNMS(T3O, T3R, T3N);
+					T45 = T3T * T41;
+					T4l = T4d * T4h;
+					T4f = T4d * T4e;
+					T44 = FMA(T3J, T3R, T43);
+					T3Y = W[17];
+					T4c = FNMS(T4a, T4b, T49);
+					T4k = FMA(T47, T4b, T4j);
+					T4g = W[49];
+					T46 = FNMS(T3Y, T3W, T45);
+					T42 = FMA(T3Y, T41, T3X);
+				   }
+			      }
+			      {
+				   E T5v, T5r, T5w, T5A, T5G, T5F, T5B, T5g, T5j, T4I, T5n, T5h, T56, T5e, T5m;
+				   E T5i, T58, T54;
+				   {
+					E T4n, T4A, T5d, T4H, T59, T5c, T55, T4z, T5b, T5l, T4J, T4U, T53, T5f, T4V;
+					E T57, T4W;
+					{
+					     E T4D, T4G, T4m, T4i, T5a, T4y, T4x;
+					     T5v = T4C - T4B;
+					     T4D = T4B + T4C;
+					     T4m = FNMS(T4g, T4e, T4l);
+					     T4i = FMA(T4g, T4h, T4f);
+					     Im[WS(rs, 4)] = T46 - T44;
+					     Ip[WS(rs, 4)] = T44 + T46;
+					     Rm[WS(rs, 4)] = T3S + T42;
+					     Rp[WS(rs, 4)] = T3S - T42;
+					     Im[WS(rs, 12)] = T4m - T4k;
+					     Ip[WS(rs, 12)] = T4k + T4m;
+					     Rm[WS(rs, 12)] = T4c + T4i;
+					     Rp[WS(rs, 12)] = T4c - T4i;
+					     T4G = T4E + T4F;
+					     T5r = T4F - T4E;
+					     T5w = T4t - T4w;
+					     T4x = T4t + T4w;
+					     T4n = W[6];
+					     T4A = W[7];
+					     T5d = FNMS(KP707106781, T4G, T4D);
+					     T4H = FMA(KP707106781, T4G, T4D);
+					     T5a = FNMS(KP707106781, T4x, T4q);
+					     T4y = FMA(KP707106781, T4x, T4q);
+					     T59 = W[38];
+					     T5c = W[39];
+					     {
+						  E T4M, T4T, T4Z, T52;
+						  T4M = FMA(KP923879532, T4L, T4K);
+						  T5A = FNMS(KP923879532, T4L, T4K);
+						  T55 = T4A * T4y;
+						  T4z = T4n * T4y;
+						  T5b = T59 * T5a;
+						  T5l = T5c * T5a;
+						  T5G = T4P + T4S;
+						  T4T = T4P - T4S;
+						  T4Z = FMA(KP923879532, T4Y, T4X);
+						  T5F = FNMS(KP923879532, T4Y, T4X);
+						  T5B = T51 + T50;
+						  T52 = T50 - T51;
+						  T4J = W[8];
+						  T4U = FMA(KP831469612, T4T, T4M);
+						  T5g = FNMS(KP831469612, T4T, T4M);
+						  T53 = FMA(KP831469612, T52, T4Z);
+						  T5j = FNMS(KP831469612, T52, T4Z);
+						  T5f = W[40];
+						  T4V = T4J * T4U;
+					     }
+					}
+					T4I = FNMS(T4A, T4H, T4z);
+					T57 = T4J * T53;
+					T5n = T5f * T5j;
+					T5h = T5f * T5g;
+					T56 = FMA(T4n, T4H, T55);
+					T4W = W[9];
+					T5e = FNMS(T5c, T5d, T5b);
+					T5m = FMA(T59, T5d, T5l);
+					T5i = W[41];
+					T58 = FNMS(T4W, T4U, T57);
+					T54 = FMA(T4W, T53, T4V);
+				   }
+				   {
+					E T5p, T5u, T5x, T5R, T5N, T5Q, T5J, T5t, T5P, T5Z, T5z, T5C, T5H, T5T, T5D;
+					E T5L, T5E;
+					{
+					     E T5o, T5k, T5s, T5O;
+					     T5o = FNMS(T5i, T5g, T5n);
+					     T5k = FMA(T5i, T5j, T5h);
+					     Im[WS(rs, 2)] = T58 - T56;
+					     Ip[WS(rs, 2)] = T56 + T58;
+					     Rm[WS(rs, 2)] = T4I + T54;
+					     Rp[WS(rs, 2)] = T4I - T54;
+					     Im[WS(rs, 10)] = T5o - T5m;
+					     Ip[WS(rs, 10)] = T5m + T5o;
+					     Rm[WS(rs, 10)] = T5e + T5k;
+					     Rp[WS(rs, 10)] = T5e - T5k;
+					     T5p = W[22];
+					     T5u = W[23];
+					     T5x = FMA(KP707106781, T5w, T5v);
+					     T5R = FNMS(KP707106781, T5w, T5v);
+					     T5s = FMA(KP707106781, T5r, T5q);
+					     T5O = FNMS(KP707106781, T5r, T5q);
+					     T5N = W[54];
+					     T5Q = W[55];
+					     T5J = T5u * T5s;
+					     T5t = T5p * T5s;
+					     T5P = T5N * T5O;
+					     T5Z = T5Q * T5O;
+					     T5z = W[24];
+					     T5U = FMA(KP831469612, T5B, T5A);
+					     T5C = FNMS(KP831469612, T5B, T5A);
+					     T5X = FMA(KP831469612, T5G, T5F);
+					     T5H = FNMS(KP831469612, T5G, T5F);
+					     T5T = W[56];
+					     T5D = T5z * T5C;
+					}
+					T5y = FNMS(T5u, T5x, T5t);
+					T5L = T5z * T5H;
+					T61 = T5T * T5X;
+					T5V = T5T * T5U;
+					T5K = FMA(T5p, T5x, T5J);
+					T5E = W[25];
+					T5S = FNMS(T5Q, T5R, T5P);
+					T60 = FMA(T5N, T5R, T5Z);
+					T5W = W[57];
+					T5M = FNMS(T5E, T5C, T5L);
+					T5I = FMA(T5E, T5H, T5D);
+				   }
+			      }
+			 }
+			 {
+			      E T7P, T7L, T7K, T7Q, T7U, T80, T7Z, T7V, T9v, T9r, T9q, T9w, T9A, T9G, T9F;
+			      E T9B, T9g, T9j, T8I, T9n, T9h, T96, T9e, T9m, T9i, T98, T94;
+			      {
+				   E T7A, T7D, T6I, T7H, T7B, T7q, T7y, T7G, T7C, T7s, T7o;
+				   {
+					E T63, T7x, T6H, T6w, T7t, T7w, T6v, T7p, T7v, T7F, T6J, T7a, T7n, T7z, T7b;
+					E T7r, T7c;
+					{
+					     E T6D, T6G, T62, T5Y;
+					     T7P = FNMS(KP707106781, T6C, T6z);
+					     T6D = FMA(KP707106781, T6C, T6z);
+					     T62 = FNMS(T5W, T5U, T61);
+					     T5Y = FMA(T5W, T5X, T5V);
+					     Im[WS(rs, 6)] = T5M - T5K;
+					     Ip[WS(rs, 6)] = T5K + T5M;
+					     Rm[WS(rs, 6)] = T5y + T5I;
+					     Rp[WS(rs, 6)] = T5y - T5I;
+					     Im[WS(rs, 14)] = T62 - T60;
+					     Ip[WS(rs, 14)] = T60 + T62;
+					     Rm[WS(rs, 14)] = T5S + T5Y;
+					     Rp[WS(rs, 14)] = T5S - T5Y;
+					     T6G = T6E + T6F;
+					     T7L = T6F - T6E;
+					     {
+						  E T6e, T6t, T7u, T6u;
+						  T7K = FNMS(KP707106781, T6d, T66);
+						  T6e = FMA(KP707106781, T6d, T66);
+						  T6t = T6l + T6s;
+						  T7Q = T6l - T6s;
+						  T63 = W[2];
+						  T7x = FNMS(KP923879532, T6G, T6D);
+						  T6H = FMA(KP923879532, T6G, T6D);
+						  T7u = FNMS(KP923879532, T6t, T6e);
+						  T6u = FMA(KP923879532, T6t, T6e);
+						  T6w = W[3];
+						  T7t = W[34];
+						  T7w = W[35];
+						  T6v = T63 * T6u;
+						  T7p = T6w * T6u;
+						  T7v = T7t * T7u;
+						  T7F = T7w * T7u;
+					     }
+					     {
+						  E T6U, T79, T7j, T7m;
+						  T7U = FNMS(KP923879532, T6T, T6M);
+						  T6U = FMA(KP923879532, T6T, T6M);
+						  T79 = T71 - T78;
+						  T80 = T71 + T78;
+						  T7Z = FMA(KP923879532, T7i, T7f);
+						  T7j = FNMS(KP923879532, T7i, T7f);
+						  T7m = T7k + T7l;
+						  T7V = T7k - T7l;
+						  T6J = W[4];
+						  T7A = FNMS(KP831469612, T79, T6U);
+						  T7a = FMA(KP831469612, T79, T6U);
+						  T7D = FNMS(KP831469612, T7m, T7j);
+						  T7n = FMA(KP831469612, T7m, T7j);
+						  T7z = W[36];
+						  T7b = T6J * T7a;
+					     }
+					}
+					T6I = FNMS(T6w, T6H, T6v);
+					T7r = T6J * T7n;
+					T7H = T7z * T7D;
+					T7B = T7z * T7A;
+					T7q = FMA(T63, T6H, T7p);
+					T7c = W[5];
+					T7y = FNMS(T7w, T7x, T7v);
+					T7G = FMA(T7t, T7x, T7F);
+					T7C = W[37];
+					T7s = FNMS(T7c, T7a, T7r);
+					T7o = FMA(T7c, T7n, T7b);
+				   }
+				   {
+					E T8n, T9d, T8H, T8A, T99, T9c, T8z, T95, T9b, T9l, T8J, T8U, T93, T9f, T8V;
+					E T97, T8W;
+					{
+					     E T8D, T8G, T7I, T7E;
+					     T9v = FNMS(KP707106781, T8C, T8B);
+					     T8D = FMA(KP707106781, T8C, T8B);
+					     T7I = FNMS(T7C, T7A, T7H);
+					     T7E = FMA(T7C, T7D, T7B);
+					     Im[WS(rs, 1)] = T7s - T7q;
+					     Ip[WS(rs, 1)] = T7q + T7s;
+					     Rm[WS(rs, 1)] = T6I + T7o;
+					     Rp[WS(rs, 1)] = T6I - T7o;
+					     Im[WS(rs, 9)] = T7I - T7G;
+					     Ip[WS(rs, 9)] = T7G + T7I;
+					     Rm[WS(rs, 9)] = T7y + T7E;
+					     Rp[WS(rs, 9)] = T7y - T7E;
+					     T8G = T8E - T8F;
+					     T9r = T8E + T8F;
+					     {
+						  E T8q, T8x, T9a, T8y;
+						  T9q = FNMS(KP707106781, T8p, T8o);
+						  T8q = FMA(KP707106781, T8p, T8o);
+						  T8x = T8t - T8w;
+						  T9w = T8w + T8t;
+						  T8n = W[10];
+						  T9d = FNMS(KP923879532, T8G, T8D);
+						  T8H = FMA(KP923879532, T8G, T8D);
+						  T9a = FNMS(KP923879532, T8x, T8q);
+						  T8y = FMA(KP923879532, T8x, T8q);
+						  T8A = W[11];
+						  T99 = W[42];
+						  T9c = W[43];
+						  T8z = T8n * T8y;
+						  T95 = T8A * T8y;
+						  T9b = T99 * T9a;
+						  T9l = T9c * T9a;
+					     }
+					     {
+						  E T8M, T8T, T8Z, T92;
+						  T9A = FNMS(KP923879532, T8L, T8K);
+						  T8M = FMA(KP923879532, T8L, T8K);
+						  T8T = T8P - T8S;
+						  T9G = T8P + T8S;
+						  T9F = FMA(KP923879532, T8Y, T8X);
+						  T8Z = FNMS(KP923879532, T8Y, T8X);
+						  T92 = T90 + T91;
+						  T9B = T91 - T90;
+						  T8J = W[12];
+						  T9g = FNMS(KP980785280, T8T, T8M);
+						  T8U = FMA(KP980785280, T8T, T8M);
+						  T9j = FMA(KP980785280, T92, T8Z);
+						  T93 = FNMS(KP980785280, T92, T8Z);
+						  T9f = W[44];
+						  T8V = T8J * T8U;
+					     }
+					}
+					T8I = FNMS(T8A, T8H, T8z);
+					T97 = T8J * T93;
+					T9n = T9f * T9j;
+					T9h = T9f * T9g;
+					T96 = FMA(T8n, T8H, T95);
+					T8W = W[13];
+					T9e = FNMS(T9c, T9d, T9b);
+					T9m = FMA(T99, T9d, T9l);
+					T9i = W[45];
+					T98 = FNMS(T8W, T8U, T97);
+					T94 = FMA(T8W, T93, T8V);
+				   }
+			      }
+			      {
+				   E T9U, T9X, T9y, Ta1, T9V, T9K, T9S, Ta0, T9W, T9M, T9I;
+				   {
+					E T9p, T9R, T9x, T9u, T9N, T9Q, T9t, T9J, T9P, T9Z, T9z, T9C, T9H, T9T, T9D;
+					E T9L, T9E;
+					{
+					     E T9o, T9k, T9O, T9s;
+					     T9o = FNMS(T9i, T9g, T9n);
+					     T9k = FMA(T9i, T9j, T9h);
+					     Im[WS(rs, 3)] = T98 - T96;
+					     Ip[WS(rs, 3)] = T96 + T98;
+					     Rm[WS(rs, 3)] = T8I + T94;
+					     Rp[WS(rs, 3)] = T8I - T94;
+					     Im[WS(rs, 11)] = T9o - T9m;
+					     Ip[WS(rs, 11)] = T9m + T9o;
+					     Rm[WS(rs, 11)] = T9e + T9k;
+					     Rp[WS(rs, 11)] = T9e - T9k;
+					     T9p = W[26];
+					     T9R = FMA(KP923879532, T9w, T9v);
+					     T9x = FNMS(KP923879532, T9w, T9v);
+					     T9O = FMA(KP923879532, T9r, T9q);
+					     T9s = FNMS(KP923879532, T9r, T9q);
+					     T9u = W[27];
+					     T9N = W[58];
+					     T9Q = W[59];
+					     T9t = T9p * T9s;
+					     T9J = T9u * T9s;
+					     T9P = T9N * T9O;
+					     T9Z = T9Q * T9O;
+					     T9z = W[28];
+					     T9U = FNMS(KP980785280, T9B, T9A);
+					     T9C = FMA(KP980785280, T9B, T9A);
+					     T9X = FMA(KP980785280, T9G, T9F);
+					     T9H = FNMS(KP980785280, T9G, T9F);
+					     T9T = W[60];
+					     T9D = T9z * T9C;
+					}
+					T9y = FNMS(T9u, T9x, T9t);
+					T9L = T9z * T9H;
+					Ta1 = T9T * T9X;
+					T9V = T9T * T9U;
+					T9K = FMA(T9p, T9x, T9J);
+					T9E = W[29];
+					T9S = FNMS(T9Q, T9R, T9P);
+					Ta0 = FMA(T9N, T9R, T9Z);
+					T9W = W[61];
+					T9M = FNMS(T9E, T9C, T9L);
+					T9I = FMA(T9E, T9H, T9D);
+				   }
+				   {
+					E T7J, T8b, T7R, T7O, T87, T8a, T7N, T83, T89, T8j, T7T, T7W, T81, T8d, T7X;
+					E T85, T7Y;
+					{
+					     E Ta2, T9Y, T88, T7M;
+					     Ta2 = FNMS(T9W, T9U, Ta1);
+					     T9Y = FMA(T9W, T9X, T9V);
+					     Im[WS(rs, 7)] = T9M - T9K;
+					     Ip[WS(rs, 7)] = T9K + T9M;
+					     Rm[WS(rs, 7)] = T9y + T9I;
+					     Rp[WS(rs, 7)] = T9y - T9I;
+					     Im[WS(rs, 15)] = Ta2 - Ta0;
+					     Ip[WS(rs, 15)] = Ta0 + Ta2;
+					     Rm[WS(rs, 15)] = T9S + T9Y;
+					     Rp[WS(rs, 15)] = T9S - T9Y;
+					     T7J = W[18];
+					     T8b = FNMS(KP923879532, T7Q, T7P);
+					     T7R = FMA(KP923879532, T7Q, T7P);
+					     T88 = FNMS(KP923879532, T7L, T7K);
+					     T7M = FMA(KP923879532, T7L, T7K);
+					     T7O = W[19];
+					     T87 = W[50];
+					     T8a = W[51];
+					     T7N = T7J * T7M;
+					     T83 = T7O * T7M;
+					     T89 = T87 * T88;
+					     T8j = T8a * T88;
+					     T7T = W[20];
+					     T8e = FNMS(KP831469612, T7V, T7U);
+					     T7W = FMA(KP831469612, T7V, T7U);
+					     T8h = FMA(KP831469612, T80, T7Z);
+					     T81 = FNMS(KP831469612, T80, T7Z);
+					     T8d = W[52];
+					     T7X = T7T * T7W;
+					}
+					T7S = FNMS(T7O, T7R, T7N);
+					T85 = T7T * T81;
+					T8l = T8d * T8h;
+					T8f = T8d * T8e;
+					T84 = FMA(T7J, T7R, T83);
+					T7Y = W[21];
+					T8c = FNMS(T8a, T8b, T89);
+					T8k = FMA(T87, T8b, T8j);
+					T8g = W[53];
+					T86 = FNMS(T7Y, T7W, T85);
+					T82 = FMA(T7Y, T81, T7X);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       T8m = FNMS(T8g, T8e, T8l);
+	       T8i = FMA(T8g, T8h, T8f);
+	       Im[WS(rs, 5)] = T86 - T84;
+	       Ip[WS(rs, 5)] = T84 + T86;
+	       Rm[WS(rs, 5)] = T7S + T82;
+	       Rp[WS(rs, 5)] = T7S - T82;
+	       Im[WS(rs, 13)] = T8m - T8k;
+	       Ip[WS(rs, 13)] = T8k + T8m;
+	       Rm[WS(rs, 13)] = T8c + T8i;
+	       Rp[WS(rs, 13)] = T8c - T8i;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cbdft_32", twinstr, &GENUS, {300, 62, 198, 0} };
+
+void X(codelet_hc2cbdft_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hc2cbdft_32 -include hc2cb.h */
+
+/*
+ * This function contains 498 FP additions, 208 FP multiplications,
+ * (or, 404 additions, 114 multiplications, 94 fused multiply/add),
+ * 102 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tf, T4a, T6h, T7Z, T6P, T8e, T1j, T4v, T2R, T4L, T5C, T7E, T6a, T7U, T3n;
+	       E T4q, TZ, T38, T2p, T4B, T7M, T7R, T2y, T4C, T5Y, T63, T6C, T86, T4i, T4n;
+	       E T6z, T85, TK, T31, T1Y, T4y, T7J, T7Q, T27, T4z, T5R, T62, T6v, T83, T4f;
+	       E T4m, T6s, T82, Tu, T4p, T6o, T8f, T6M, T80, T1G, T4K, T2I, T4w, T5J, T7T;
+	       E T67, T7F, T3g, T4b;
+	       {
+		    E T3, T2M, T16, T3k, T6, T13, T2P, T3l, Td, T3i, T1h, T2K, Ta, T3h, T1c;
+		    E T2J;
+		    {
+			 E T1, T2, T2N, T2O;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 15)];
+			 T3 = T1 + T2;
+			 T2M = T1 - T2;
+			 {
+			      E T14, T15, T4, T5;
+			      T14 = Ip[0];
+			      T15 = Im[WS(rs, 15)];
+			      T16 = T14 + T15;
+			      T3k = T14 - T15;
+			      T4 = Rp[WS(rs, 8)];
+			      T5 = Rm[WS(rs, 7)];
+			      T6 = T4 + T5;
+			      T13 = T4 - T5;
+			 }
+			 T2N = Ip[WS(rs, 8)];
+			 T2O = Im[WS(rs, 7)];
+			 T2P = T2N + T2O;
+			 T3l = T2N - T2O;
+			 {
+			      E Tb, Tc, T1d, T1e, T1f, T1g;
+			      Tb = Rm[WS(rs, 3)];
+			      Tc = Rp[WS(rs, 12)];
+			      T1d = Tb - Tc;
+			      T1e = Im[WS(rs, 3)];
+			      T1f = Ip[WS(rs, 12)];
+			      T1g = T1e + T1f;
+			      Td = Tb + Tc;
+			      T3i = T1f - T1e;
+			      T1h = T1d + T1g;
+			      T2K = T1d - T1g;
+			 }
+			 {
+			      E T8, T9, T18, T19, T1a, T1b;
+			      T8 = Rp[WS(rs, 4)];
+			      T9 = Rm[WS(rs, 11)];
+			      T18 = T8 - T9;
+			      T19 = Ip[WS(rs, 4)];
+			      T1a = Im[WS(rs, 11)];
+			      T1b = T19 + T1a;
+			      Ta = T8 + T9;
+			      T3h = T19 - T1a;
+			      T1c = T18 + T1b;
+			      T2J = T18 - T1b;
+			 }
+		    }
+		    {
+			 E T7, Te, T6f, T6g;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T4a = T7 - Te;
+			 T6f = T16 - T13;
+			 T6g = KP707106781 * (T2J - T2K);
+			 T6h = T6f + T6g;
+			 T7Z = T6f - T6g;
+		    }
+		    {
+			 E T6N, T6O, T17, T1i;
+			 T6N = T2M + T2P;
+			 T6O = KP707106781 * (T1c + T1h);
+			 T6P = T6N - T6O;
+			 T8e = T6O + T6N;
+			 T17 = T13 + T16;
+			 T1i = KP707106781 * (T1c - T1h);
+			 T1j = T17 + T1i;
+			 T4v = T17 - T1i;
+		    }
+		    {
+			 E T2L, T2Q, T5A, T5B;
+			 T2L = KP707106781 * (T2J + T2K);
+			 T2Q = T2M - T2P;
+			 T2R = T2L + T2Q;
+			 T4L = T2Q - T2L;
+			 T5A = T3 - T6;
+			 T5B = T3i - T3h;
+			 T5C = T5A + T5B;
+			 T7E = T5A - T5B;
+		    }
+		    {
+			 E T68, T69, T3j, T3m;
+			 T68 = Ta - Td;
+			 T69 = T3k - T3l;
+			 T6a = T68 + T69;
+			 T7U = T69 - T68;
+			 T3j = T3h + T3i;
+			 T3m = T3k + T3l;
+			 T3n = T3j + T3m;
+			 T4q = T3m - T3j;
+		    }
+	       }
+	       {
+		    E TR, T5S, T29, T2t, T2c, T5W, T2w, T37, TY, T5T, T5V, T2i, T2n, T2r, T34;
+		    E T2q, T6A, T6B;
+		    {
+			 E TL, TM, TN, TO, TP, TQ;
+			 TL = Rm[0];
+			 TM = Rp[WS(rs, 15)];
+			 TN = TL + TM;
+			 TO = Rp[WS(rs, 7)];
+			 TP = Rm[WS(rs, 8)];
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+			 T5S = TN - TQ;
+			 T29 = TO - TP;
+			 T2t = TL - TM;
+		    }
+		    {
+			 E T2a, T2b, T35, T2u, T2v, T36;
+			 T2a = Im[0];
+			 T2b = Ip[WS(rs, 15)];
+			 T35 = T2b - T2a;
+			 T2u = Ip[WS(rs, 7)];
+			 T2v = Im[WS(rs, 8)];
+			 T36 = T2u - T2v;
+			 T2c = T2a + T2b;
+			 T5W = T35 - T36;
+			 T2w = T2u + T2v;
+			 T37 = T35 + T36;
+		    }
+		    {
+			 E TU, T2e, T2h, T32, TX, T2j, T2m, T33;
+			 {
+			      E TS, TT, T2f, T2g;
+			      TS = Rp[WS(rs, 3)];
+			      TT = Rm[WS(rs, 12)];
+			      TU = TS + TT;
+			      T2e = TS - TT;
+			      T2f = Ip[WS(rs, 3)];
+			      T2g = Im[WS(rs, 12)];
+			      T2h = T2f + T2g;
+			      T32 = T2f - T2g;
+			 }
+			 {
+			      E TV, TW, T2k, T2l;
+			      TV = Rm[WS(rs, 4)];
+			      TW = Rp[WS(rs, 11)];
+			      TX = TV + TW;
+			      T2j = TV - TW;
+			      T2k = Im[WS(rs, 4)];
+			      T2l = Ip[WS(rs, 11)];
+			      T2m = T2k + T2l;
+			      T33 = T2l - T2k;
+			 }
+			 TY = TU + TX;
+			 T5T = T33 - T32;
+			 T5V = TU - TX;
+			 T2i = T2e + T2h;
+			 T2n = T2j + T2m;
+			 T2r = T2j - T2m;
+			 T34 = T32 + T33;
+			 T2q = T2e - T2h;
+		    }
+		    TZ = TR + TY;
+		    T38 = T34 + T37;
+		    {
+			 E T2d, T2o, T7K, T7L;
+			 T2d = T29 - T2c;
+			 T2o = KP707106781 * (T2i - T2n);
+			 T2p = T2d + T2o;
+			 T4B = T2d - T2o;
+			 T7K = T5S - T5T;
+			 T7L = T5W - T5V;
+			 T7M = FMA(KP382683432, T7K, KP923879532 * T7L);
+			 T7R = FNMS(KP923879532, T7K, KP382683432 * T7L);
+		    }
+		    {
+			 E T2s, T2x, T5U, T5X;
+			 T2s = KP707106781 * (T2q + T2r);
+			 T2x = T2t - T2w;
+			 T2y = T2s + T2x;
+			 T4C = T2x - T2s;
+			 T5U = T5S + T5T;
+			 T5X = T5V + T5W;
+			 T5Y = FMA(KP923879532, T5U, KP382683432 * T5X);
+			 T63 = FNMS(KP382683432, T5U, KP923879532 * T5X);
+		    }
+		    T6A = T2t + T2w;
+		    T6B = KP707106781 * (T2i + T2n);
+		    T6C = T6A - T6B;
+		    T86 = T6B + T6A;
+		    {
+			 E T4g, T4h, T6x, T6y;
+			 T4g = TR - TY;
+			 T4h = T37 - T34;
+			 T4i = T4g + T4h;
+			 T4n = T4h - T4g;
+			 T6x = KP707106781 * (T2q - T2r);
+			 T6y = T29 + T2c;
+			 T6z = T6x - T6y;
+			 T85 = T6y + T6x;
+		    }
+	       }
+	       {
+		    E TC, T5L, T1I, T22, T1L, T5P, T25, T30, TJ, T5M, T5O, T1R, T1W, T20, T2X;
+		    E T1Z, T6t, T6u;
+		    {
+			 E Tw, Tx, Ty, Tz, TA, TB;
+			 Tw = Rp[WS(rs, 1)];
+			 Tx = Rm[WS(rs, 14)];
+			 Ty = Tw + Tx;
+			 Tz = Rp[WS(rs, 9)];
+			 TA = Rm[WS(rs, 6)];
+			 TB = Tz + TA;
+			 TC = Ty + TB;
+			 T5L = Ty - TB;
+			 T1I = Tz - TA;
+			 T22 = Tw - Tx;
+		    }
+		    {
+			 E T1J, T1K, T2Y, T23, T24, T2Z;
+			 T1J = Ip[WS(rs, 1)];
+			 T1K = Im[WS(rs, 14)];
+			 T2Y = T1J - T1K;
+			 T23 = Ip[WS(rs, 9)];
+			 T24 = Im[WS(rs, 6)];
+			 T2Z = T23 - T24;
+			 T1L = T1J + T1K;
+			 T5P = T2Y - T2Z;
+			 T25 = T23 + T24;
+			 T30 = T2Y + T2Z;
+		    }
+		    {
+			 E TF, T1N, T1Q, T2V, TI, T1S, T1V, T2W;
+			 {
+			      E TD, TE, T1O, T1P;
+			      TD = Rp[WS(rs, 5)];
+			      TE = Rm[WS(rs, 10)];
+			      TF = TD + TE;
+			      T1N = TD - TE;
+			      T1O = Ip[WS(rs, 5)];
+			      T1P = Im[WS(rs, 10)];
+			      T1Q = T1O + T1P;
+			      T2V = T1O - T1P;
+			 }
+			 {
+			      E TG, TH, T1T, T1U;
+			      TG = Rm[WS(rs, 2)];
+			      TH = Rp[WS(rs, 13)];
+			      TI = TG + TH;
+			      T1S = TG - TH;
+			      T1T = Im[WS(rs, 2)];
+			      T1U = Ip[WS(rs, 13)];
+			      T1V = T1T + T1U;
+			      T2W = T1U - T1T;
+			 }
+			 TJ = TF + TI;
+			 T5M = T2W - T2V;
+			 T5O = TF - TI;
+			 T1R = T1N + T1Q;
+			 T1W = T1S + T1V;
+			 T20 = T1S - T1V;
+			 T2X = T2V + T2W;
+			 T1Z = T1N - T1Q;
+		    }
+		    TK = TC + TJ;
+		    T31 = T2X + T30;
+		    {
+			 E T1M, T1X, T7H, T7I;
+			 T1M = T1I + T1L;
+			 T1X = KP707106781 * (T1R - T1W);
+			 T1Y = T1M + T1X;
+			 T4y = T1M - T1X;
+			 T7H = T5L - T5M;
+			 T7I = T5P - T5O;
+			 T7J = FNMS(KP923879532, T7I, KP382683432 * T7H);
+			 T7Q = FMA(KP923879532, T7H, KP382683432 * T7I);
+		    }
+		    {
+			 E T21, T26, T5N, T5Q;
+			 T21 = KP707106781 * (T1Z + T20);
+			 T26 = T22 - T25;
+			 T27 = T21 + T26;
+			 T4z = T26 - T21;
+			 T5N = T5L + T5M;
+			 T5Q = T5O + T5P;
+			 T5R = FNMS(KP382683432, T5Q, KP923879532 * T5N);
+			 T62 = FMA(KP382683432, T5N, KP923879532 * T5Q);
+		    }
+		    T6t = T22 + T25;
+		    T6u = KP707106781 * (T1R + T1W);
+		    T6v = T6t - T6u;
+		    T83 = T6u + T6t;
+		    {
+			 E T4d, T4e, T6q, T6r;
+			 T4d = TC - TJ;
+			 T4e = T30 - T2X;
+			 T4f = T4d - T4e;
+			 T4m = T4d + T4e;
+			 T6q = T1L - T1I;
+			 T6r = KP707106781 * (T1Z - T20);
+			 T6s = T6q + T6r;
+			 T82 = T6q - T6r;
+		    }
+	       }
+	       {
+		    E Ti, T3a, Tl, T3b, T1o, T1t, T6j, T6i, T5E, T5D, Tp, T3d, Ts, T3e, T1z;
+		    E T1E, T6m, T6l, T5H, T5G;
+		    {
+			 E T1p, T1n, T1k, T1s;
+			 {
+			      E Tg, Th, T1l, T1m;
+			      Tg = Rp[WS(rs, 2)];
+			      Th = Rm[WS(rs, 13)];
+			      Ti = Tg + Th;
+			      T1p = Tg - Th;
+			      T1l = Ip[WS(rs, 2)];
+			      T1m = Im[WS(rs, 13)];
+			      T1n = T1l + T1m;
+			      T3a = T1l - T1m;
+			 }
+			 {
+			      E Tj, Tk, T1q, T1r;
+			      Tj = Rp[WS(rs, 10)];
+			      Tk = Rm[WS(rs, 5)];
+			      Tl = Tj + Tk;
+			      T1k = Tj - Tk;
+			      T1q = Ip[WS(rs, 10)];
+			      T1r = Im[WS(rs, 5)];
+			      T1s = T1q + T1r;
+			      T3b = T1q - T1r;
+			 }
+			 T1o = T1k + T1n;
+			 T1t = T1p - T1s;
+			 T6j = T1p + T1s;
+			 T6i = T1n - T1k;
+			 T5E = T3a - T3b;
+			 T5D = Ti - Tl;
+		    }
+		    {
+			 E T1A, T1y, T1v, T1D;
+			 {
+			      E Tn, To, T1w, T1x;
+			      Tn = Rm[WS(rs, 1)];
+			      To = Rp[WS(rs, 14)];
+			      Tp = Tn + To;
+			      T1A = Tn - To;
+			      T1w = Im[WS(rs, 1)];
+			      T1x = Ip[WS(rs, 14)];
+			      T1y = T1w + T1x;
+			      T3d = T1x - T1w;
+			 }
+			 {
+			      E Tq, Tr, T1B, T1C;
+			      Tq = Rp[WS(rs, 6)];
+			      Tr = Rm[WS(rs, 9)];
+			      Ts = Tq + Tr;
+			      T1v = Tq - Tr;
+			      T1B = Ip[WS(rs, 6)];
+			      T1C = Im[WS(rs, 9)];
+			      T1D = T1B + T1C;
+			      T3e = T1B - T1C;
+			 }
+			 T1z = T1v - T1y;
+			 T1E = T1A - T1D;
+			 T6m = T1A + T1D;
+			 T6l = T1v + T1y;
+			 T5H = T3d - T3e;
+			 T5G = Tp - Ts;
+		    }
+		    {
+			 E Tm, Tt, T6k, T6n;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T4p = Tm - Tt;
+			 T6k = FMA(KP382683432, T6i, KP923879532 * T6j);
+			 T6n = FMA(KP382683432, T6l, KP923879532 * T6m);
+			 T6o = T6k - T6n;
+			 T8f = T6k + T6n;
+		    }
+		    {
+			 E T6K, T6L, T1u, T1F;
+			 T6K = FNMS(KP923879532, T6i, KP382683432 * T6j);
+			 T6L = FNMS(KP923879532, T6l, KP382683432 * T6m);
+			 T6M = T6K + T6L;
+			 T80 = T6K - T6L;
+			 T1u = FMA(KP923879532, T1o, KP382683432 * T1t);
+			 T1F = FNMS(KP382683432, T1E, KP923879532 * T1z);
+			 T1G = T1u + T1F;
+			 T4K = T1F - T1u;
+		    }
+		    {
+			 E T2G, T2H, T5F, T5I;
+			 T2G = FNMS(KP382683432, T1o, KP923879532 * T1t);
+			 T2H = FMA(KP382683432, T1z, KP923879532 * T1E);
+			 T2I = T2G + T2H;
+			 T4w = T2G - T2H;
+			 T5F = T5D - T5E;
+			 T5I = T5G + T5H;
+			 T5J = KP707106781 * (T5F + T5I);
+			 T7T = KP707106781 * (T5F - T5I);
+		    }
+		    {
+			 E T65, T66, T3c, T3f;
+			 T65 = T5D + T5E;
+			 T66 = T5H - T5G;
+			 T67 = KP707106781 * (T65 + T66);
+			 T7F = KP707106781 * (T66 - T65);
+			 T3c = T3a + T3b;
+			 T3f = T3d + T3e;
+			 T3g = T3c + T3f;
+			 T4b = T3f - T3c;
+		    }
+	       }
+	       {
+		    E T11, T3s, T3p, T3u, T3K, T40, T3G, T3Y, T2T, T43, T3z, T3P, T2B, T45, T3x;
+		    E T3T;
+		    {
+			 E Tv, T10, T3E, T3F;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T11 = Tv + T10;
+			 T3s = Tv - T10;
+			 {
+			      E T39, T3o, T3I, T3J;
+			      T39 = T31 + T38;
+			      T3o = T3g + T3n;
+			      T3p = T39 + T3o;
+			      T3u = T3o - T39;
+			      T3I = TK - TZ;
+			      T3J = T3n - T3g;
+			      T3K = T3I + T3J;
+			      T40 = T3J - T3I;
+			 }
+			 T3E = Tf - Tu;
+			 T3F = T38 - T31;
+			 T3G = T3E + T3F;
+			 T3Y = T3E - T3F;
+			 {
+			      E T2S, T3N, T2F, T3O, T2D, T2E;
+			      T2S = T2I + T2R;
+			      T3N = T1j - T1G;
+			      T2D = FNMS(KP195090322, T1Y, KP980785280 * T27);
+			      T2E = FMA(KP195090322, T2p, KP980785280 * T2y);
+			      T2F = T2D + T2E;
+			      T3O = T2D - T2E;
+			      T2T = T2F + T2S;
+			      T43 = T3N - T3O;
+			      T3z = T2S - T2F;
+			      T3P = T3N + T3O;
+			 }
+			 {
+			      E T1H, T3S, T2A, T3R, T28, T2z;
+			      T1H = T1j + T1G;
+			      T3S = T2R - T2I;
+			      T28 = FMA(KP980785280, T1Y, KP195090322 * T27);
+			      T2z = FNMS(KP195090322, T2y, KP980785280 * T2p);
+			      T2A = T28 + T2z;
+			      T3R = T2z - T28;
+			      T2B = T1H + T2A;
+			      T45 = T3S - T3R;
+			      T3x = T1H - T2A;
+			      T3T = T3R + T3S;
+			 }
+		    }
+		    {
+			 E T2U, T3q, T12, T2C;
+			 T12 = W[0];
+			 T2C = W[1];
+			 T2U = FMA(T12, T2B, T2C * T2T);
+			 T3q = FNMS(T2C, T2B, T12 * T2T);
+			 Rp[0] = T11 - T2U;
+			 Ip[0] = T3p + T3q;
+			 Rm[0] = T11 + T2U;
+			 Im[0] = T3q - T3p;
+		    }
+		    {
+			 E T41, T47, T46, T48;
+			 {
+			      E T3X, T3Z, T42, T44;
+			      T3X = W[46];
+			      T3Z = W[47];
+			      T41 = FNMS(T3Z, T40, T3X * T3Y);
+			      T47 = FMA(T3Z, T3Y, T3X * T40);
+			      T42 = W[48];
+			      T44 = W[49];
+			      T46 = FMA(T42, T43, T44 * T45);
+			      T48 = FNMS(T44, T43, T42 * T45);
+			 }
+			 Rp[WS(rs, 12)] = T41 - T46;
+			 Ip[WS(rs, 12)] = T47 + T48;
+			 Rm[WS(rs, 12)] = T41 + T46;
+			 Im[WS(rs, 12)] = T48 - T47;
+		    }
+		    {
+			 E T3v, T3B, T3A, T3C;
+			 {
+			      E T3r, T3t, T3w, T3y;
+			      T3r = W[30];
+			      T3t = W[31];
+			      T3v = FNMS(T3t, T3u, T3r * T3s);
+			      T3B = FMA(T3t, T3s, T3r * T3u);
+			      T3w = W[32];
+			      T3y = W[33];
+			      T3A = FMA(T3w, T3x, T3y * T3z);
+			      T3C = FNMS(T3y, T3x, T3w * T3z);
+			 }
+			 Rp[WS(rs, 8)] = T3v - T3A;
+			 Ip[WS(rs, 8)] = T3B + T3C;
+			 Rm[WS(rs, 8)] = T3v + T3A;
+			 Im[WS(rs, 8)] = T3C - T3B;
+		    }
+		    {
+			 E T3L, T3V, T3U, T3W;
+			 {
+			      E T3D, T3H, T3M, T3Q;
+			      T3D = W[14];
+			      T3H = W[15];
+			      T3L = FNMS(T3H, T3K, T3D * T3G);
+			      T3V = FMA(T3H, T3G, T3D * T3K);
+			      T3M = W[16];
+			      T3Q = W[17];
+			      T3U = FMA(T3M, T3P, T3Q * T3T);
+			      T3W = FNMS(T3Q, T3P, T3M * T3T);
+			 }
+			 Rp[WS(rs, 4)] = T3L - T3U;
+			 Ip[WS(rs, 4)] = T3V + T3W;
+			 Rm[WS(rs, 4)] = T3L + T3U;
+			 Im[WS(rs, 4)] = T3W - T3V;
+		    }
+	       }
+	       {
+		    E T7O, T8m, T7W, T8o, T8E, T8U, T8A, T8S, T8h, T8X, T8t, T8J, T89, T8Z, T8r;
+		    E T8N;
+		    {
+			 E T7G, T7N, T8y, T8z;
+			 T7G = T7E + T7F;
+			 T7N = T7J + T7M;
+			 T7O = T7G + T7N;
+			 T8m = T7G - T7N;
+			 {
+			      E T7S, T7V, T8C, T8D;
+			      T7S = T7Q + T7R;
+			      T7V = T7T + T7U;
+			      T7W = T7S + T7V;
+			      T8o = T7V - T7S;
+			      T8C = T7J - T7M;
+			      T8D = T7U - T7T;
+			      T8E = T8C + T8D;
+			      T8U = T8D - T8C;
+			 }
+			 T8y = T7E - T7F;
+			 T8z = T7R - T7Q;
+			 T8A = T8y + T8z;
+			 T8S = T8y - T8z;
+			 {
+			      E T8g, T8H, T8d, T8I, T8b, T8c;
+			      T8g = T8e - T8f;
+			      T8H = T7Z - T80;
+			      T8b = FNMS(KP980785280, T82, KP195090322 * T83);
+			      T8c = FNMS(KP980785280, T85, KP195090322 * T86);
+			      T8d = T8b + T8c;
+			      T8I = T8b - T8c;
+			      T8h = T8d + T8g;
+			      T8X = T8H - T8I;
+			      T8t = T8g - T8d;
+			      T8J = T8H + T8I;
+			 }
+			 {
+			      E T81, T8L, T88, T8M, T84, T87;
+			      T81 = T7Z + T80;
+			      T8L = T8f + T8e;
+			      T84 = FMA(KP195090322, T82, KP980785280 * T83);
+			      T87 = FMA(KP195090322, T85, KP980785280 * T86);
+			      T88 = T84 - T87;
+			      T8M = T84 + T87;
+			      T89 = T81 + T88;
+			      T8Z = T8M + T8L;
+			      T8r = T81 - T88;
+			      T8N = T8L - T8M;
+			 }
+		    }
+		    {
+			 E T7X, T8j, T8i, T8k;
+			 {
+			      E T7D, T7P, T7Y, T8a;
+			      T7D = W[10];
+			      T7P = W[11];
+			      T7X = FNMS(T7P, T7W, T7D * T7O);
+			      T8j = FMA(T7P, T7O, T7D * T7W);
+			      T7Y = W[12];
+			      T8a = W[13];
+			      T8i = FMA(T7Y, T89, T8a * T8h);
+			      T8k = FNMS(T8a, T89, T7Y * T8h);
+			 }
+			 Rp[WS(rs, 3)] = T7X - T8i;
+			 Ip[WS(rs, 3)] = T8j + T8k;
+			 Rm[WS(rs, 3)] = T7X + T8i;
+			 Im[WS(rs, 3)] = T8k - T8j;
+		    }
+		    {
+			 E T8V, T91, T90, T92;
+			 {
+			      E T8R, T8T, T8W, T8Y;
+			      T8R = W[58];
+			      T8T = W[59];
+			      T8V = FNMS(T8T, T8U, T8R * T8S);
+			      T91 = FMA(T8T, T8S, T8R * T8U);
+			      T8W = W[60];
+			      T8Y = W[61];
+			      T90 = FMA(T8W, T8X, T8Y * T8Z);
+			      T92 = FNMS(T8Y, T8X, T8W * T8Z);
+			 }
+			 Rp[WS(rs, 15)] = T8V - T90;
+			 Ip[WS(rs, 15)] = T91 + T92;
+			 Rm[WS(rs, 15)] = T8V + T90;
+			 Im[WS(rs, 15)] = T92 - T91;
+		    }
+		    {
+			 E T8p, T8v, T8u, T8w;
+			 {
+			      E T8l, T8n, T8q, T8s;
+			      T8l = W[42];
+			      T8n = W[43];
+			      T8p = FNMS(T8n, T8o, T8l * T8m);
+			      T8v = FMA(T8n, T8m, T8l * T8o);
+			      T8q = W[44];
+			      T8s = W[45];
+			      T8u = FMA(T8q, T8r, T8s * T8t);
+			      T8w = FNMS(T8s, T8r, T8q * T8t);
+			 }
+			 Rp[WS(rs, 11)] = T8p - T8u;
+			 Ip[WS(rs, 11)] = T8v + T8w;
+			 Rm[WS(rs, 11)] = T8p + T8u;
+			 Im[WS(rs, 11)] = T8w - T8v;
+		    }
+		    {
+			 E T8F, T8P, T8O, T8Q;
+			 {
+			      E T8x, T8B, T8G, T8K;
+			      T8x = W[26];
+			      T8B = W[27];
+			      T8F = FNMS(T8B, T8E, T8x * T8A);
+			      T8P = FMA(T8B, T8A, T8x * T8E);
+			      T8G = W[28];
+			      T8K = W[29];
+			      T8O = FMA(T8G, T8J, T8K * T8N);
+			      T8Q = FNMS(T8K, T8J, T8G * T8N);
+			 }
+			 Rp[WS(rs, 7)] = T8F - T8O;
+			 Ip[WS(rs, 7)] = T8P + T8Q;
+			 Rm[WS(rs, 7)] = T8F + T8O;
+			 Im[WS(rs, 7)] = T8Q - T8P;
+		    }
+	       }
+	       {
+		    E T4k, T4S, T4s, T4U, T5a, T5q, T56, T5o, T4N, T5t, T4Z, T5f, T4F, T5v, T4X;
+		    E T5j;
+		    {
+			 E T4c, T4j, T54, T55;
+			 T4c = T4a + T4b;
+			 T4j = KP707106781 * (T4f + T4i);
+			 T4k = T4c + T4j;
+			 T4S = T4c - T4j;
+			 {
+			      E T4o, T4r, T58, T59;
+			      T4o = KP707106781 * (T4m + T4n);
+			      T4r = T4p + T4q;
+			      T4s = T4o + T4r;
+			      T4U = T4r - T4o;
+			      T58 = KP707106781 * (T4f - T4i);
+			      T59 = T4q - T4p;
+			      T5a = T58 + T59;
+			      T5q = T59 - T58;
+			 }
+			 T54 = T4a - T4b;
+			 T55 = KP707106781 * (T4n - T4m);
+			 T56 = T54 + T55;
+			 T5o = T54 - T55;
+			 {
+			      E T4M, T5d, T4J, T5e, T4H, T4I;
+			      T4M = T4K + T4L;
+			      T5d = T4v - T4w;
+			      T4H = FNMS(KP831469612, T4y, KP555570233 * T4z);
+			      T4I = FMA(KP831469612, T4B, KP555570233 * T4C);
+			      T4J = T4H + T4I;
+			      T5e = T4H - T4I;
+			      T4N = T4J + T4M;
+			      T5t = T5d - T5e;
+			      T4Z = T4M - T4J;
+			      T5f = T5d + T5e;
+			 }
+			 {
+			      E T4x, T5i, T4E, T5h, T4A, T4D;
+			      T4x = T4v + T4w;
+			      T5i = T4L - T4K;
+			      T4A = FMA(KP555570233, T4y, KP831469612 * T4z);
+			      T4D = FNMS(KP831469612, T4C, KP555570233 * T4B);
+			      T4E = T4A + T4D;
+			      T5h = T4D - T4A;
+			      T4F = T4x + T4E;
+			      T5v = T5i - T5h;
+			      T4X = T4x - T4E;
+			      T5j = T5h + T5i;
+			 }
+		    }
+		    {
+			 E T4t, T4P, T4O, T4Q;
+			 {
+			      E T49, T4l, T4u, T4G;
+			      T49 = W[6];
+			      T4l = W[7];
+			      T4t = FNMS(T4l, T4s, T49 * T4k);
+			      T4P = FMA(T4l, T4k, T49 * T4s);
+			      T4u = W[8];
+			      T4G = W[9];
+			      T4O = FMA(T4u, T4F, T4G * T4N);
+			      T4Q = FNMS(T4G, T4F, T4u * T4N);
+			 }
+			 Rp[WS(rs, 2)] = T4t - T4O;
+			 Ip[WS(rs, 2)] = T4P + T4Q;
+			 Rm[WS(rs, 2)] = T4t + T4O;
+			 Im[WS(rs, 2)] = T4Q - T4P;
+		    }
+		    {
+			 E T5r, T5x, T5w, T5y;
+			 {
+			      E T5n, T5p, T5s, T5u;
+			      T5n = W[54];
+			      T5p = W[55];
+			      T5r = FNMS(T5p, T5q, T5n * T5o);
+			      T5x = FMA(T5p, T5o, T5n * T5q);
+			      T5s = W[56];
+			      T5u = W[57];
+			      T5w = FMA(T5s, T5t, T5u * T5v);
+			      T5y = FNMS(T5u, T5t, T5s * T5v);
+			 }
+			 Rp[WS(rs, 14)] = T5r - T5w;
+			 Ip[WS(rs, 14)] = T5x + T5y;
+			 Rm[WS(rs, 14)] = T5r + T5w;
+			 Im[WS(rs, 14)] = T5y - T5x;
+		    }
+		    {
+			 E T4V, T51, T50, T52;
+			 {
+			      E T4R, T4T, T4W, T4Y;
+			      T4R = W[38];
+			      T4T = W[39];
+			      T4V = FNMS(T4T, T4U, T4R * T4S);
+			      T51 = FMA(T4T, T4S, T4R * T4U);
+			      T4W = W[40];
+			      T4Y = W[41];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T52 = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 Rp[WS(rs, 10)] = T4V - T50;
+			 Ip[WS(rs, 10)] = T51 + T52;
+			 Rm[WS(rs, 10)] = T4V + T50;
+			 Im[WS(rs, 10)] = T52 - T51;
+		    }
+		    {
+			 E T5b, T5l, T5k, T5m;
+			 {
+			      E T53, T57, T5c, T5g;
+			      T53 = W[22];
+			      T57 = W[23];
+			      T5b = FNMS(T57, T5a, T53 * T56);
+			      T5l = FMA(T57, T56, T53 * T5a);
+			      T5c = W[24];
+			      T5g = W[25];
+			      T5k = FMA(T5c, T5f, T5g * T5j);
+			      T5m = FNMS(T5g, T5f, T5c * T5j);
+			 }
+			 Rp[WS(rs, 6)] = T5b - T5k;
+			 Ip[WS(rs, 6)] = T5l + T5m;
+			 Rm[WS(rs, 6)] = T5b + T5k;
+			 Im[WS(rs, 6)] = T5m - T5l;
+		    }
+	       }
+	       {
+		    E T60, T6W, T6c, T6Y, T7e, T7u, T7a, T7s, T6R, T7x, T73, T7j, T6F, T7z, T71;
+		    E T7n;
+		    {
+			 E T5K, T5Z, T78, T79;
+			 T5K = T5C + T5J;
+			 T5Z = T5R + T5Y;
+			 T60 = T5K + T5Z;
+			 T6W = T5K - T5Z;
+			 {
+			      E T64, T6b, T7c, T7d;
+			      T64 = T62 + T63;
+			      T6b = T67 + T6a;
+			      T6c = T64 + T6b;
+			      T6Y = T6b - T64;
+			      T7c = T5R - T5Y;
+			      T7d = T6a - T67;
+			      T7e = T7c + T7d;
+			      T7u = T7d - T7c;
+			 }
+			 T78 = T5C - T5J;
+			 T79 = T63 - T62;
+			 T7a = T78 + T79;
+			 T7s = T78 - T79;
+			 {
+			      E T6Q, T7h, T6J, T7i, T6H, T6I;
+			      T6Q = T6M + T6P;
+			      T7h = T6h - T6o;
+			      T6H = FNMS(KP555570233, T6s, KP831469612 * T6v);
+			      T6I = FMA(KP555570233, T6z, KP831469612 * T6C);
+			      T6J = T6H + T6I;
+			      T7i = T6H - T6I;
+			      T6R = T6J + T6Q;
+			      T7x = T7h - T7i;
+			      T73 = T6Q - T6J;
+			      T7j = T7h + T7i;
+			 }
+			 {
+			      E T6p, T7m, T6E, T7l, T6w, T6D;
+			      T6p = T6h + T6o;
+			      T7m = T6P - T6M;
+			      T6w = FMA(KP831469612, T6s, KP555570233 * T6v);
+			      T6D = FNMS(KP555570233, T6C, KP831469612 * T6z);
+			      T6E = T6w + T6D;
+			      T7l = T6D - T6w;
+			      T6F = T6p + T6E;
+			      T7z = T7m - T7l;
+			      T71 = T6p - T6E;
+			      T7n = T7l + T7m;
+			 }
+		    }
+		    {
+			 E T6d, T6T, T6S, T6U;
+			 {
+			      E T5z, T61, T6e, T6G;
+			      T5z = W[2];
+			      T61 = W[3];
+			      T6d = FNMS(T61, T6c, T5z * T60);
+			      T6T = FMA(T61, T60, T5z * T6c);
+			      T6e = W[4];
+			      T6G = W[5];
+			      T6S = FMA(T6e, T6F, T6G * T6R);
+			      T6U = FNMS(T6G, T6F, T6e * T6R);
+			 }
+			 Rp[WS(rs, 1)] = T6d - T6S;
+			 Ip[WS(rs, 1)] = T6T + T6U;
+			 Rm[WS(rs, 1)] = T6d + T6S;
+			 Im[WS(rs, 1)] = T6U - T6T;
+		    }
+		    {
+			 E T7v, T7B, T7A, T7C;
+			 {
+			      E T7r, T7t, T7w, T7y;
+			      T7r = W[50];
+			      T7t = W[51];
+			      T7v = FNMS(T7t, T7u, T7r * T7s);
+			      T7B = FMA(T7t, T7s, T7r * T7u);
+			      T7w = W[52];
+			      T7y = W[53];
+			      T7A = FMA(T7w, T7x, T7y * T7z);
+			      T7C = FNMS(T7y, T7x, T7w * T7z);
+			 }
+			 Rp[WS(rs, 13)] = T7v - T7A;
+			 Ip[WS(rs, 13)] = T7B + T7C;
+			 Rm[WS(rs, 13)] = T7v + T7A;
+			 Im[WS(rs, 13)] = T7C - T7B;
+		    }
+		    {
+			 E T6Z, T75, T74, T76;
+			 {
+			      E T6V, T6X, T70, T72;
+			      T6V = W[34];
+			      T6X = W[35];
+			      T6Z = FNMS(T6X, T6Y, T6V * T6W);
+			      T75 = FMA(T6X, T6W, T6V * T6Y);
+			      T70 = W[36];
+			      T72 = W[37];
+			      T74 = FMA(T70, T71, T72 * T73);
+			      T76 = FNMS(T72, T71, T70 * T73);
+			 }
+			 Rp[WS(rs, 9)] = T6Z - T74;
+			 Ip[WS(rs, 9)] = T75 + T76;
+			 Rm[WS(rs, 9)] = T6Z + T74;
+			 Im[WS(rs, 9)] = T76 - T75;
+		    }
+		    {
+			 E T7f, T7p, T7o, T7q;
+			 {
+			      E T77, T7b, T7g, T7k;
+			      T77 = W[18];
+			      T7b = W[19];
+			      T7f = FNMS(T7b, T7e, T77 * T7a);
+			      T7p = FMA(T7b, T7a, T77 * T7e);
+			      T7g = W[20];
+			      T7k = W[21];
+			      T7o = FMA(T7g, T7j, T7k * T7n);
+			      T7q = FNMS(T7k, T7j, T7g * T7n);
+			 }
+			 Rp[WS(rs, 5)] = T7f - T7o;
+			 Ip[WS(rs, 5)] = T7p + T7q;
+			 Rm[WS(rs, 5)] = T7f + T7o;
+			 Im[WS(rs, 5)] = T7q - T7p;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cbdft_32", twinstr, &GENUS, {404, 114, 94, 0} };
+
+void X(codelet_hc2cbdft_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:03 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft_4 -include hc2cb.h */
+
+/*
+ * This function contains 30 FP additions, 12 FP multiplications,
+ * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
+ * 35 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Ty, TB, Tw, TE, TA, TF, Tz, TG, TC;
+	       {
+		    E T4, Tg, T3, Tm, Tc, T5, Th, Ti;
+		    {
+			 E T1, T2, Ta, Tb;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 1)];
+			 Ta = Ip[0];
+			 Tb = Im[WS(rs, 1)];
+			 T4 = Rp[WS(rs, 1)];
+			 Tg = T1 - T2;
+			 T3 = T1 + T2;
+			 Tm = Ta - Tb;
+			 Tc = Ta + Tb;
+			 T5 = Rm[0];
+			 Th = Ip[WS(rs, 1)];
+			 Ti = Im[0];
+		    }
+		    {
+			 E T8, Td, T7, Ts, To, Tv, Tk, Te, Tf;
+			 T8 = W[0];
+			 {
+			      E T9, T6, Tn, Tj;
+			      T9 = T4 - T5;
+			      T6 = T4 + T5;
+			      Tn = Th - Ti;
+			      Tj = Th + Ti;
+			      Ty = Tc - T9;
+			      Td = T9 + Tc;
+			      T7 = T3 + T6;
+			      Ts = T3 - T6;
+			      To = Tm + Tn;
+			      Tv = Tm - Tn;
+			      TB = Tg + Tj;
+			      Tk = Tg - Tj;
+			      Te = T8 * Td;
+			 }
+			 Tf = W[1];
+			 {
+			      E Tr, Tu, Tt, TD, Tx, Tp, Tl, Tq;
+			      Tr = W[2];
+			      Tp = T8 * Tk;
+			      Tu = W[3];
+			      Tl = FMA(Tf, Tk, Te);
+			      Tt = Tr * Ts;
+			      Tq = FNMS(Tf, Td, Tp);
+			      TD = Tu * Ts;
+			      Rm[0] = T7 + Tl;
+			      Rp[0] = T7 - Tl;
+			      Im[0] = Tq - To;
+			      Ip[0] = To + Tq;
+			      Tx = W[4];
+			      Tw = FNMS(Tu, Tv, Tt);
+			      TE = FMA(Tr, Tv, TD);
+			      TA = W[5];
+			      TF = Tx * TB;
+			      Tz = Tx * Ty;
+			 }
+		    }
+	       }
+	       TG = FNMS(TA, Ty, TF);
+	       TC = FMA(TA, TB, Tz);
+	       Im[WS(rs, 1)] = TG - TE;
+	       Ip[WS(rs, 1)] = TE + TG;
+	       Rm[WS(rs, 1)] = Tw + TC;
+	       Rp[WS(rs, 1)] = Tw - TC;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cbdft_4", twinstr, &GENUS, {24, 6, 6, 0} };
+
+void X(codelet_hc2cbdft_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft_4 -include hc2cb.h */
+
+/*
+ * This function contains 30 FP additions, 12 FP multiplications,
+ * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
+ * 19 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T3, Tl, T6, Tm, Td, Tj, Tx, Tv, Ts, Tq;
+	       {
+		    E Tf, Tc, T9, Ti;
+		    {
+			 E T1, T2, Ta, Tb;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 1)];
+			 T3 = T1 + T2;
+			 Tf = T1 - T2;
+			 Ta = Ip[0];
+			 Tb = Im[WS(rs, 1)];
+			 Tc = Ta + Tb;
+			 Tl = Ta - Tb;
+		    }
+		    {
+			 E T4, T5, Tg, Th;
+			 T4 = Rp[WS(rs, 1)];
+			 T5 = Rm[0];
+			 T6 = T4 + T5;
+			 T9 = T4 - T5;
+			 Tg = Ip[WS(rs, 1)];
+			 Th = Im[0];
+			 Ti = Tg + Th;
+			 Tm = Tg - Th;
+		    }
+		    Td = T9 + Tc;
+		    Tj = Tf - Ti;
+		    Tx = Tf + Ti;
+		    Tv = Tc - T9;
+		    Ts = Tl - Tm;
+		    Tq = T3 - T6;
+	       }
+	       {
+		    E T7, Tn, Tk, To, T8, Te;
+		    T7 = T3 + T6;
+		    Tn = Tl + Tm;
+		    T8 = W[0];
+		    Te = W[1];
+		    Tk = FMA(T8, Td, Te * Tj);
+		    To = FNMS(Te, Td, T8 * Tj);
+		    Rp[0] = T7 - Tk;
+		    Ip[0] = Tn + To;
+		    Rm[0] = T7 + Tk;
+		    Im[0] = To - Tn;
+	       }
+	       {
+		    E Tt, Tz, Ty, TA;
+		    {
+			 E Tp, Tr, Tu, Tw;
+			 Tp = W[2];
+			 Tr = W[3];
+			 Tt = FNMS(Tr, Ts, Tp * Tq);
+			 Tz = FMA(Tr, Tq, Tp * Ts);
+			 Tu = W[4];
+			 Tw = W[5];
+			 Ty = FMA(Tu, Tv, Tw * Tx);
+			 TA = FNMS(Tw, Tv, Tu * Tx);
+		    }
+		    Rp[WS(rs, 1)] = Tt - Ty;
+		    Ip[WS(rs, 1)] = Tz + TA;
+		    Rm[WS(rs, 1)] = Tt + Ty;
+		    Im[WS(rs, 1)] = TA - Tz;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cbdft_4", twinstr, &GENUS, {24, 6, 6, 0} };
+
+void X(codelet_hc2cbdft_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:04 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include hc2cb.h */
+
+/*
+ * This function contains 58 FP additions, 32 FP multiplications,
+ * (or, 36 additions, 10 multiplications, 22 fused multiply/add),
+ * 52 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T18, T1b, T16, T1e, T1a, T1f, T19, T1g, T1c;
+	       {
+		    E Tw, T4, TV, Tj, TP, TH, Tr, TY, T5, T6, Ta, Ty;
+		    {
+			 E Tg, TF, Tf, TD, Tp, Th;
+			 {
+			      E Td, Te, Tn, To;
+			      Td = Ip[WS(rs, 1)];
+			      Te = Im[WS(rs, 1)];
+			      Tn = Ip[0];
+			      To = Im[WS(rs, 2)];
+			      Tg = Ip[WS(rs, 2)];
+			      TF = Te + Td;
+			      Tf = Td - Te;
+			      TD = Tn + To;
+			      Tp = Tn - To;
+			      Th = Im[0];
+			 }
+			 {
+			      E T2, T3, T8, T9;
+			      T2 = Rp[0];
+			      T3 = Rm[WS(rs, 2)];
+			      {
+				   E Tq, TE, Ti, TG;
+				   T8 = Rm[WS(rs, 1)];
+				   TE = Tg + Th;
+				   Ti = Tg - Th;
+				   Tw = T2 - T3;
+				   T4 = T2 + T3;
+				   TG = TE - TF;
+				   TV = TF + TE;
+				   Tq = Tf + Ti;
+				   Tj = Tf - Ti;
+				   TP = FNMS(KP500000000, TG, TD);
+				   TH = TD + TG;
+				   T9 = Rp[WS(rs, 1)];
+				   Tr = FNMS(KP500000000, Tq, Tp);
+				   TY = Tp + Tq;
+			      }
+			      T5 = Rp[WS(rs, 2)];
+			      T6 = Rm[0];
+			      Ta = T8 + T9;
+			      Ty = T8 - T9;
+			 }
+		    }
+		    {
+			 E TO, TT, Ts, TA, TR, Tc, TN, TW, TS, Tx, T7;
+			 Tx = T5 - T6;
+			 T7 = T5 + T6;
+			 TO = W[0];
+			 TT = W[1];
+			 {
+			      E Tz, TQ, Tb, TU;
+			      Tz = Tx + Ty;
+			      TQ = Tx - Ty;
+			      Tb = T7 + Ta;
+			      Ts = T7 - Ta;
+			      TU = FNMS(KP500000000, Tz, Tw);
+			      TA = Tw + Tz;
+			      TR = FMA(KP866025403, TQ, TP);
+			      T18 = FNMS(KP866025403, TQ, TP);
+			      Tc = FNMS(KP500000000, Tb, T4);
+			      TN = T4 + Tb;
+			      T1b = FMA(KP866025403, TV, TU);
+			      TW = FNMS(KP866025403, TV, TU);
+			      TS = TO * TR;
+			 }
+			 {
+			      E T15, Tt, T12, T1, Tm, TI, TM, Tl, TJ;
+			      {
+				   E Tv, TC, TB, TL, Tk, TZ, TX, T10;
+				   T15 = FMA(KP866025403, Ts, Tr);
+				   Tt = FNMS(KP866025403, Ts, Tr);
+				   TZ = TO * TW;
+				   TX = FMA(TT, TW, TS);
+				   Tv = W[4];
+				   TC = W[5];
+				   T10 = FNMS(TT, TR, TZ);
+				   Rm[0] = TN + TX;
+				   Rp[0] = TN - TX;
+				   TB = Tv * TA;
+				   Im[0] = T10 - TY;
+				   Ip[0] = TY + T10;
+				   TL = TC * TA;
+				   Tk = FNMS(KP866025403, Tj, Tc);
+				   T12 = FMA(KP866025403, Tj, Tc);
+				   T1 = W[3];
+				   Tm = W[2];
+				   TI = FNMS(TC, TH, TB);
+				   TM = FMA(Tv, TH, TL);
+				   Tl = T1 * Tk;
+				   TJ = Tm * Tk;
+			      }
+			      {
+				   E T11, T14, T13, T1d, T17, Tu, TK;
+				   Tu = FMA(Tm, Tt, Tl);
+				   TK = FNMS(T1, Tt, TJ);
+				   T11 = W[6];
+				   T14 = W[7];
+				   Im[WS(rs, 1)] = TI - Tu;
+				   Ip[WS(rs, 1)] = Tu + TI;
+				   Rm[WS(rs, 1)] = TK + TM;
+				   Rp[WS(rs, 1)] = TK - TM;
+				   T13 = T11 * T12;
+				   T1d = T14 * T12;
+				   T17 = W[8];
+				   T16 = FNMS(T14, T15, T13);
+				   T1e = FMA(T11, T15, T1d);
+				   T1a = W[9];
+				   T1f = T17 * T1b;
+				   T19 = T17 * T18;
+			      }
+			 }
+		    }
+	       }
+	       T1g = FNMS(T1a, T18, T1f);
+	       T1c = FMA(T1a, T1b, T19);
+	       Im[WS(rs, 2)] = T1g - T1e;
+	       Ip[WS(rs, 2)] = T1e + T1g;
+	       Rm[WS(rs, 2)] = T16 + T1c;
+	       Rp[WS(rs, 2)] = T16 - T1c;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {36, 10, 22, 0} };
+
+void X(codelet_hc2cbdft_6) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include hc2cb.h */
+
+/*
+ * This function contains 58 FP additions, 28 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 14 fused multiply/add),
+ * 29 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T4, Tv, Tr, TL, Tb, Tc, Ty, TP, To, TB, Tj, TQ, Tp, Tq, TE;
+	       E TM;
+	       {
+		    E Ta, Tx, T7, Tw, T2, T3;
+		    T2 = Rp[0];
+		    T3 = Rm[WS(rs, 2)];
+		    T4 = T2 + T3;
+		    Tv = T2 - T3;
+		    {
+			 E T8, T9, T5, T6;
+			 T8 = Rm[WS(rs, 1)];
+			 T9 = Rp[WS(rs, 1)];
+			 Ta = T8 + T9;
+			 Tx = T8 - T9;
+			 T5 = Rp[WS(rs, 2)];
+			 T6 = Rm[0];
+			 T7 = T5 + T6;
+			 Tw = T5 - T6;
+		    }
+		    Tr = KP866025403 * (T7 - Ta);
+		    TL = KP866025403 * (Tw - Tx);
+		    Tb = T7 + Ta;
+		    Tc = FNMS(KP500000000, Tb, T4);
+		    Ty = Tw + Tx;
+		    TP = FNMS(KP500000000, Ty, Tv);
+	       }
+	       {
+		    E Tf, TC, Ti, TD, Td, Te;
+		    Td = Ip[WS(rs, 1)];
+		    Te = Im[WS(rs, 1)];
+		    Tf = Td - Te;
+		    TC = Te + Td;
+		    {
+			 E Tm, Tn, Tg, Th;
+			 Tm = Ip[0];
+			 Tn = Im[WS(rs, 2)];
+			 To = Tm - Tn;
+			 TB = Tm + Tn;
+			 Tg = Ip[WS(rs, 2)];
+			 Th = Im[0];
+			 Ti = Tg - Th;
+			 TD = Tg + Th;
+		    }
+		    Tj = KP866025403 * (Tf - Ti);
+		    TQ = KP866025403 * (TC + TD);
+		    Tp = Tf + Ti;
+		    Tq = FNMS(KP500000000, Tp, To);
+		    TE = TC - TD;
+		    TM = FMA(KP500000000, TE, TB);
+	       }
+	       {
+		    E TJ, TT, TS, TU;
+		    TJ = T4 + Tb;
+		    TT = To + Tp;
+		    {
+			 E TN, TR, TK, TO;
+			 TN = TL + TM;
+			 TR = TP - TQ;
+			 TK = W[0];
+			 TO = W[1];
+			 TS = FMA(TK, TN, TO * TR);
+			 TU = FNMS(TO, TN, TK * TR);
+		    }
+		    Rp[0] = TJ - TS;
+		    Ip[0] = TT + TU;
+		    Rm[0] = TJ + TS;
+		    Im[0] = TU - TT;
+	       }
+	       {
+		    E TZ, T15, T14, T16;
+		    {
+			 E TW, TY, TV, TX;
+			 TW = Tc + Tj;
+			 TY = Tr + Tq;
+			 TV = W[6];
+			 TX = W[7];
+			 TZ = FNMS(TX, TY, TV * TW);
+			 T15 = FMA(TX, TW, TV * TY);
+		    }
+		    {
+			 E T11, T13, T10, T12;
+			 T11 = TM - TL;
+			 T13 = TP + TQ;
+			 T10 = W[8];
+			 T12 = W[9];
+			 T14 = FMA(T10, T11, T12 * T13);
+			 T16 = FNMS(T12, T11, T10 * T13);
+		    }
+		    Rp[WS(rs, 2)] = TZ - T14;
+		    Ip[WS(rs, 2)] = T15 + T16;
+		    Rm[WS(rs, 2)] = TZ + T14;
+		    Im[WS(rs, 2)] = T16 - T15;
+	       }
+	       {
+		    E Tt, TH, TG, TI;
+		    {
+			 E Tk, Ts, T1, Tl;
+			 Tk = Tc - Tj;
+			 Ts = Tq - Tr;
+			 T1 = W[3];
+			 Tl = W[2];
+			 Tt = FMA(T1, Tk, Tl * Ts);
+			 TH = FNMS(T1, Ts, Tl * Tk);
+		    }
+		    {
+			 E Tz, TF, Tu, TA;
+			 Tz = Tv + Ty;
+			 TF = TB - TE;
+			 Tu = W[4];
+			 TA = W[5];
+			 TG = FNMS(TA, TF, Tu * Tz);
+			 TI = FMA(TA, Tz, Tu * TF);
+		    }
+		    Ip[WS(rs, 1)] = Tt + TG;
+		    Rp[WS(rs, 1)] = TH - TI;
+		    Im[WS(rs, 1)] = TG - Tt;
+		    Rm[WS(rs, 1)] = TH + TI;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {44, 14, 14, 0} };
+
+void X(codelet_hc2cbdft_6) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:04 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft_8 -include hc2cb.h */
+
+/*
+ * This function contains 82 FP additions, 36 FP multiplications,
+ * (or, 60 additions, 14 multiplications, 22 fused multiply/add),
+ * 55 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1m, T1r, T1i, T1u, T1o, T1v, T1n, T1w, T1s;
+	       {
+		    E T1k, Tl, T1p, TE, TP, T1g, TM, T1b, T1f, T1a, TU, Tf, T1l, TH, Tw;
+		    E T1q;
+		    {
+			 E TA, T3, TN, Tk, Th, T6, TO, TD, Tb, Tm, Ta, TK, Tp, Tc, Ts;
+			 E Tt;
+			 {
+			      E T4, T5, TB, TC;
+			      {
+				   E T1, T2, Ti, Tj;
+				   T1 = Rp[0];
+				   T2 = Rm[WS(rs, 3)];
+				   Ti = Ip[0];
+				   Tj = Im[WS(rs, 3)];
+				   T4 = Rp[WS(rs, 2)];
+				   TA = T1 - T2;
+				   T3 = T1 + T2;
+				   TN = Ti - Tj;
+				   Tk = Ti + Tj;
+				   T5 = Rm[WS(rs, 1)];
+				   TB = Ip[WS(rs, 2)];
+				   TC = Im[WS(rs, 1)];
+			      }
+			      {
+				   E T8, T9, Tn, To;
+				   T8 = Rp[WS(rs, 1)];
+				   Th = T4 - T5;
+				   T6 = T4 + T5;
+				   TO = TB - TC;
+				   TD = TB + TC;
+				   T9 = Rm[WS(rs, 2)];
+				   Tn = Ip[WS(rs, 1)];
+				   To = Im[WS(rs, 2)];
+				   Tb = Rm[0];
+				   Tm = T8 - T9;
+				   Ta = T8 + T9;
+				   TK = Tn - To;
+				   Tp = Tn + To;
+				   Tc = Rp[WS(rs, 3)];
+				   Ts = Im[0];
+				   Tt = Ip[WS(rs, 3)];
+			      }
+			 }
+			 {
+			      E Tr, Td, Tu, TL, Te, T7;
+			      T1k = Tk - Th;
+			      Tl = Th + Tk;
+			      Tr = Tb - Tc;
+			      Td = Tb + Tc;
+			      TL = Tt - Ts;
+			      Tu = Ts + Tt;
+			      T1p = TA + TD;
+			      TE = TA - TD;
+			      TP = TN + TO;
+			      T1g = TN - TO;
+			      TM = TK + TL;
+			      T1b = TL - TK;
+			      T1f = Ta - Td;
+			      Te = Ta + Td;
+			      T1a = T3 - T6;
+			      T7 = T3 + T6;
+			      {
+				   E Tq, TF, TG, Tv;
+				   Tq = Tm + Tp;
+				   TF = Tm - Tp;
+				   TG = Tr - Tu;
+				   Tv = Tr + Tu;
+				   TU = T7 - Te;
+				   Tf = T7 + Te;
+				   T1l = TF - TG;
+				   TH = TF + TG;
+				   Tw = Tq - Tv;
+				   T1q = Tq + Tv;
+			      }
+			 }
+		    }
+		    {
+			 E TX, T10, T1c, T13, T1h, T1E, T1H, T1C, T1K, T1G, T1L, T1F;
+			 {
+			      E TQ, Tx, T1y, TI, Tg, Tz;
+			      TX = TP - TM;
+			      TQ = TM + TP;
+			      Tx = FMA(KP707106781, Tw, Tl);
+			      T10 = FNMS(KP707106781, Tw, Tl);
+			      T1c = T1a + T1b;
+			      T1y = T1a - T1b;
+			      T13 = FNMS(KP707106781, TH, TE);
+			      TI = FMA(KP707106781, TH, TE);
+			      Tg = W[0];
+			      Tz = W[1];
+			      {
+				   E T1B, T1A, T1x, T1J, T1z, T1D;
+				   {
+					E TR, Ty, TS, TJ;
+					T1B = T1g - T1f;
+					T1h = T1f + T1g;
+					T1A = W[11];
+					TR = Tg * TI;
+					Ty = Tg * Tx;
+					T1x = W[10];
+					T1J = T1A * T1y;
+					TS = FNMS(Tz, Tx, TR);
+					TJ = FMA(Tz, TI, Ty);
+					T1z = T1x * T1y;
+					T1m = FMA(KP707106781, T1l, T1k);
+					T1E = FNMS(KP707106781, T1l, T1k);
+					Im[0] = TS - TQ;
+					Ip[0] = TQ + TS;
+					Rm[0] = Tf + TJ;
+					Rp[0] = Tf - TJ;
+					T1H = FMA(KP707106781, T1q, T1p);
+					T1r = FNMS(KP707106781, T1q, T1p);
+					T1D = W[12];
+				   }
+				   T1C = FNMS(T1A, T1B, T1z);
+				   T1K = FMA(T1x, T1B, T1J);
+				   T1G = W[13];
+				   T1L = T1D * T1H;
+				   T1F = T1D * T1E;
+			      }
+			 }
+			 {
+			      E TY, T16, T12, T17, T11;
+			      {
+				   E TW, TT, T15, TV, TZ, T1M, T1I;
+				   TW = W[7];
+				   T1M = FNMS(T1G, T1E, T1L);
+				   T1I = FMA(T1G, T1H, T1F);
+				   TT = W[6];
+				   T15 = TW * TU;
+				   Im[WS(rs, 3)] = T1M - T1K;
+				   Ip[WS(rs, 3)] = T1K + T1M;
+				   Rm[WS(rs, 3)] = T1C + T1I;
+				   Rp[WS(rs, 3)] = T1C - T1I;
+				   TV = TT * TU;
+				   TZ = W[8];
+				   TY = FNMS(TW, TX, TV);
+				   T16 = FMA(TT, TX, T15);
+				   T12 = W[9];
+				   T17 = TZ * T13;
+				   T11 = TZ * T10;
+			      }
+			      {
+				   E T1e, T19, T1t, T1d, T1j, T18, T14;
+				   T1e = W[3];
+				   T18 = FNMS(T12, T10, T17);
+				   T14 = FMA(T12, T13, T11);
+				   T19 = W[2];
+				   T1t = T1e * T1c;
+				   Im[WS(rs, 2)] = T18 - T16;
+				   Ip[WS(rs, 2)] = T16 + T18;
+				   Rm[WS(rs, 2)] = TY + T14;
+				   Rp[WS(rs, 2)] = TY - T14;
+				   T1d = T19 * T1c;
+				   T1j = W[4];
+				   T1i = FNMS(T1e, T1h, T1d);
+				   T1u = FMA(T19, T1h, T1t);
+				   T1o = W[5];
+				   T1v = T1j * T1r;
+				   T1n = T1j * T1m;
+			      }
+			 }
+		    }
+	       }
+	       T1w = FNMS(T1o, T1m, T1v);
+	       T1s = FMA(T1o, T1r, T1n);
+	       Im[WS(rs, 1)] = T1w - T1u;
+	       Ip[WS(rs, 1)] = T1u + T1w;
+	       Rm[WS(rs, 1)] = T1i + T1s;
+	       Rp[WS(rs, 1)] = T1i - T1s;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cbdft_8", twinstr, &GENUS, {60, 14, 22, 0} };
+
+void X(codelet_hc2cbdft_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft_8 -include hc2cb.h */
+
+/*
+ * This function contains 82 FP additions, 32 FP multiplications,
+ * (or, 68 additions, 18 multiplications, 14 fused multiply/add),
+ * 30 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cb.h"
+
+static void hc2cbdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
+	       E T1i;
+	       {
+		    E T3, TC, Tk, TM, T6, Th, TF, TN;
+		    {
+			 E T1, T2, Ti, Tj;
+			 T1 = Rp[0];
+			 T2 = Rm[WS(rs, 3)];
+			 T3 = T1 + T2;
+			 TC = T1 - T2;
+			 Ti = Ip[0];
+			 Tj = Im[WS(rs, 3)];
+			 Tk = Ti + Tj;
+			 TM = Ti - Tj;
+		    }
+		    {
+			 E T4, T5, TD, TE;
+			 T4 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 1)];
+			 T6 = T4 + T5;
+			 Th = T4 - T5;
+			 TD = Ip[WS(rs, 2)];
+			 TE = Im[WS(rs, 1)];
+			 TF = TD + TE;
+			 TN = TD - TE;
+		    }
+		    T7 = T3 + T6;
+		    T1d = Tk - Th;
+		    T1h = TC + TF;
+		    Tl = Th + Tk;
+		    TG = TC - TF;
+		    T14 = T3 - T6;
+		    T19 = TM - TN;
+		    TO = TM + TN;
+	       }
+	       {
+		    E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
+		    {
+			 E T8, T9, Tn, To;
+			 T8 = Rp[WS(rs, 1)];
+			 T9 = Rm[WS(rs, 2)];
+			 Ta = T8 + T9;
+			 Tm = T8 - T9;
+			 Tn = Ip[WS(rs, 1)];
+			 To = Im[WS(rs, 2)];
+			 Tp = Tn + To;
+			 TJ = Tn - To;
+		    }
+		    {
+			 E Tb, Tc, Ts, Tt;
+			 Tb = Rm[0];
+			 Tc = Rp[WS(rs, 3)];
+			 Td = Tb + Tc;
+			 Tr = Tb - Tc;
+			 Ts = Im[0];
+			 Tt = Ip[WS(rs, 3)];
+			 Tu = Ts + Tt;
+			 TK = Tt - Ts;
+		    }
+		    Te = Ta + Td;
+		    TL = TJ + TK;
+		    T18 = Ta - Td;
+		    T15 = TK - TJ;
+		    {
+			 E Tz, TA, Tq, Tv;
+			 Tz = Tm - Tp;
+			 TA = Tr - Tu;
+			 TB = KP707106781 * (Tz + TA);
+			 T1e = KP707106781 * (Tz - TA);
+			 Tq = Tm + Tp;
+			 Tv = Tr + Tu;
+			 Tw = KP707106781 * (Tq - Tv);
+			 T1i = KP707106781 * (Tq + Tv);
+		    }
+	       }
+	       {
+		    E Tf, TP, TI, TQ;
+		    Tf = T7 + Te;
+		    TP = TL + TO;
+		    {
+			 E Tx, TH, Tg, Ty;
+			 Tx = Tl + Tw;
+			 TH = TB + TG;
+			 Tg = W[0];
+			 Ty = W[1];
+			 TI = FMA(Tg, Tx, Ty * TH);
+			 TQ = FNMS(Ty, Tx, Tg * TH);
+		    }
+		    Rp[0] = Tf - TI;
+		    Ip[0] = TP + TQ;
+		    Rm[0] = Tf + TI;
+		    Im[0] = TQ - TP;
+	       }
+	       {
+		    E T1r, T1x, T1w, T1y;
+		    {
+			 E T1o, T1q, T1n, T1p;
+			 T1o = T14 - T15;
+			 T1q = T19 - T18;
+			 T1n = W[10];
+			 T1p = W[11];
+			 T1r = FNMS(T1p, T1q, T1n * T1o);
+			 T1x = FMA(T1p, T1o, T1n * T1q);
+		    }
+		    {
+			 E T1t, T1v, T1s, T1u;
+			 T1t = T1d - T1e;
+			 T1v = T1i + T1h;
+			 T1s = W[12];
+			 T1u = W[13];
+			 T1w = FMA(T1s, T1t, T1u * T1v);
+			 T1y = FNMS(T1u, T1t, T1s * T1v);
+		    }
+		    Rp[WS(rs, 3)] = T1r - T1w;
+		    Ip[WS(rs, 3)] = T1x + T1y;
+		    Rm[WS(rs, 3)] = T1r + T1w;
+		    Im[WS(rs, 3)] = T1y - T1x;
+	       }
+	       {
+		    E TV, T11, T10, T12;
+		    {
+			 E TS, TU, TR, TT;
+			 TS = T7 - Te;
+			 TU = TO - TL;
+			 TR = W[6];
+			 TT = W[7];
+			 TV = FNMS(TT, TU, TR * TS);
+			 T11 = FMA(TT, TS, TR * TU);
+		    }
+		    {
+			 E TX, TZ, TW, TY;
+			 TX = Tl - Tw;
+			 TZ = TG - TB;
+			 TW = W[8];
+			 TY = W[9];
+			 T10 = FMA(TW, TX, TY * TZ);
+			 T12 = FNMS(TY, TX, TW * TZ);
+		    }
+		    Rp[WS(rs, 2)] = TV - T10;
+		    Ip[WS(rs, 2)] = T11 + T12;
+		    Rm[WS(rs, 2)] = TV + T10;
+		    Im[WS(rs, 2)] = T12 - T11;
+	       }
+	       {
+		    E T1b, T1l, T1k, T1m;
+		    {
+			 E T16, T1a, T13, T17;
+			 T16 = T14 + T15;
+			 T1a = T18 + T19;
+			 T13 = W[2];
+			 T17 = W[3];
+			 T1b = FNMS(T17, T1a, T13 * T16);
+			 T1l = FMA(T17, T16, T13 * T1a);
+		    }
+		    {
+			 E T1f, T1j, T1c, T1g;
+			 T1f = T1d + T1e;
+			 T1j = T1h - T1i;
+			 T1c = W[4];
+			 T1g = W[5];
+			 T1k = FMA(T1c, T1f, T1g * T1j);
+			 T1m = FNMS(T1g, T1f, T1c * T1j);
+		    }
+		    Rp[WS(rs, 1)] = T1b - T1k;
+		    Ip[WS(rs, 1)] = T1l + T1m;
+		    Rm[WS(rs, 1)] = T1b + T1k;
+		    Im[WS(rs, 1)] = T1m - T1l;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cbdft_8", twinstr, &GENUS, {68, 18, 14, 0} };
+
+void X(codelet_hc2cbdft_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdft_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:35 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -name r2cbIII_10 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 32 FP additions, 28 FP multiplications,
+ * (or, 14 additions, 10 multiplications, 18 fused multiply/add),
+ * 38 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
+	       {
+		    E T1, To, Ts, Tt, T8, Ta, Te, Tl, Tm, Th, Tn, T9;
+		    T1 = Cr[WS(csr, 2)];
+		    To = Ci[WS(csi, 2)];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = Cr[WS(csr, 4)];
+			 T3 = Cr[0];
+			 T5 = Cr[WS(csr, 3)];
+			 T6 = Cr[WS(csr, 1)];
+			 {
+			      E Tc, T4, T7, Td, Tf, Tg;
+			      Tc = Ci[WS(csi, 3)];
+			      Ts = T2 - T3;
+			      T4 = T2 + T3;
+			      Tt = T5 - T6;
+			      T7 = T5 + T6;
+			      Td = Ci[WS(csi, 1)];
+			      Tf = Ci[WS(csi, 4)];
+			      Tg = Ci[0];
+			      T8 = T4 + T7;
+			      Ta = T7 - T4;
+			      Te = Tc - Td;
+			      Tl = Tc + Td;
+			      Tm = Tf + Tg;
+			      Th = Tf - Tg;
+			 }
+		    }
+		    R0[0] = KP2_000000000 * (T1 + T8);
+		    Tn = Tl - Tm;
+		    Tq = Tl + Tm;
+		    Ti = FMA(KP618033988, Th, Te);
+		    Tk = FNMS(KP618033988, Te, Th);
+		    R1[WS(rs, 2)] = KP2_000000000 * (Tn - To);
+		    T9 = FMS(KP250000000, T8, T1);
+		    Tu = FMA(KP618033988, Tt, Ts);
+		    Tw = FNMS(KP618033988, Ts, Tt);
+		    Tp = FMA(KP250000000, Tn, To);
+		    Tb = FNMS(KP559016994, Ta, T9);
+		    Tj = FMA(KP559016994, Ta, T9);
+	       }
+	       Tr = FMA(KP559016994, Tq, Tp);
+	       Tv = FNMS(KP559016994, Tq, Tp);
+	       R0[WS(rs, 2)] = -(KP2_000000000 * (FNMS(KP951056516, Tk, Tj)));
+	       R0[WS(rs, 3)] = KP2_000000000 * (FMA(KP951056516, Tk, Tj));
+	       R0[WS(rs, 4)] = -(KP2_000000000 * (FNMS(KP951056516, Ti, Tb)));
+	       R0[WS(rs, 1)] = KP2_000000000 * (FMA(KP951056516, Ti, Tb));
+	       R1[WS(rs, 1)] = KP2_000000000 * (FMA(KP951056516, Tw, Tv));
+	       R1[WS(rs, 3)] = KP2_000000000 * (FNMS(KP951056516, Tw, Tv));
+	       R1[WS(rs, 4)] = -(KP2_000000000 * (FNMS(KP951056516, Tu, Tr)));
+	       R1[0] = -(KP2_000000000 * (FMA(KP951056516, Tu, Tr)));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cbIII_10", {14, 10, 18, 0}, &GENUS };
+
+void X(codelet_r2cbIII_10) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -name r2cbIII_10 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 32 FP additions, 16 FP multiplications,
+ * (or, 26 additions, 10 multiplications, 6 fused multiply/add),
+ * 22 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E T1, To, T8, Tq, Ta, Tp, Te, Ts, Th, Tn;
+	       T1 = Cr[WS(csr, 2)];
+	       To = Ci[WS(csi, 2)];
+	       {
+		    E T2, T3, T4, T5, T6, T7;
+		    T2 = Cr[WS(csr, 4)];
+		    T3 = Cr[0];
+		    T4 = T2 + T3;
+		    T5 = Cr[WS(csr, 3)];
+		    T6 = Cr[WS(csr, 1)];
+		    T7 = T5 + T6;
+		    T8 = T4 + T7;
+		    Tq = T5 - T6;
+		    Ta = KP1_118033988 * (T7 - T4);
+		    Tp = T2 - T3;
+	       }
+	       {
+		    E Tc, Td, Tm, Tf, Tg, Tl;
+		    Tc = Ci[WS(csi, 4)];
+		    Td = Ci[0];
+		    Tm = Tc + Td;
+		    Tf = Ci[WS(csi, 1)];
+		    Tg = Ci[WS(csi, 3)];
+		    Tl = Tg + Tf;
+		    Te = Tc - Td;
+		    Ts = KP1_118033988 * (Tl + Tm);
+		    Th = Tf - Tg;
+		    Tn = Tl - Tm;
+	       }
+	       R0[0] = KP2_000000000 * (T1 + T8);
+	       R1[WS(rs, 2)] = KP2_000000000 * (Tn - To);
+	       {
+		    E Ti, Tj, Tb, Tk, T9;
+		    Ti = FNMS(KP1_902113032, Th, KP1_175570504 * Te);
+		    Tj = FMA(KP1_175570504, Th, KP1_902113032 * Te);
+		    T9 = FNMS(KP2_000000000, T1, KP500000000 * T8);
+		    Tb = T9 - Ta;
+		    Tk = T9 + Ta;
+		    R0[WS(rs, 1)] = Tb + Ti;
+		    R0[WS(rs, 3)] = Tk + Tj;
+		    R0[WS(rs, 4)] = Ti - Tb;
+		    R0[WS(rs, 2)] = Tj - Tk;
+	       }
+	       {
+		    E Tr, Tv, Tu, Tw, Tt;
+		    Tr = FMA(KP1_902113032, Tp, KP1_175570504 * Tq);
+		    Tv = FNMS(KP1_175570504, Tp, KP1_902113032 * Tq);
+		    Tt = FMA(KP500000000, Tn, KP2_000000000 * To);
+		    Tu = Ts + Tt;
+		    Tw = Tt - Ts;
+		    R1[0] = -(Tr + Tu);
+		    R1[WS(rs, 3)] = Tw - Tv;
+		    R1[WS(rs, 4)] = Tr - Tu;
+		    R1[WS(rs, 1)] = Tv + Tw;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cbIII_10", {26, 10, 6, 0}, &GENUS };
+
+void X(codelet_r2cbIII_10) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:36 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -name r2cbIII_12 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 42 FP additions, 20 FP multiplications,
+ * (or, 30 additions, 8 multiplications, 12 fused multiply/add),
+ * 37 stack variables, 4 constants, and 24 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E TE, TD, TF, TG;
+	       {
+		    E Tx, T6, Te, Tb, T5, Tw, Ts, To, Th, Ti, T9, TA;
+		    {
+			 E T1, Tq, Tc, Td, T4, T2, T3, T7, T8, Tr;
+			 T1 = Cr[WS(csr, 1)];
+			 T2 = Cr[WS(csr, 5)];
+			 T3 = Cr[WS(csr, 2)];
+			 Tq = Ci[WS(csi, 1)];
+			 Tc = Ci[WS(csi, 5)];
+			 Td = Ci[WS(csi, 2)];
+			 T4 = T2 + T3;
+			 Tx = T2 - T3;
+			 T6 = Cr[WS(csr, 4)];
+			 Te = Tc + Td;
+			 Tr = Td - Tc;
+			 Tb = FNMS(KP2_000000000, T1, T4);
+			 T5 = T1 + T4;
+			 T7 = Cr[0];
+			 Tw = FMA(KP2_000000000, Tq, Tr);
+			 Ts = Tq - Tr;
+			 T8 = Cr[WS(csr, 3)];
+			 To = Ci[WS(csi, 4)];
+			 Th = Ci[0];
+			 Ti = Ci[WS(csi, 3)];
+			 T9 = T7 + T8;
+			 TA = T7 - T8;
+		    }
+		    {
+			 E Tl, Tm, Tv, TC;
+			 {
+			      E Tf, Ty, Tk, TB;
+			      {
+				   E Tj, Tn, Tg, Ta;
+				   Tl = FNMS(KP1_732050807, Te, Tb);
+				   Tf = FMA(KP1_732050807, Te, Tb);
+				   Tj = Th + Ti;
+				   Tn = Ti - Th;
+				   Tg = FNMS(KP2_000000000, T6, T9);
+				   Ta = T6 + T9;
+				   {
+					E Tu, Tt, Tz, Tp;
+					Ty = FMA(KP1_732050807, Tx, Tw);
+					TE = FNMS(KP1_732050807, Tx, Tw);
+					Tz = FMA(KP2_000000000, To, Tn);
+					Tp = Tn - To;
+					Tm = FMA(KP1_732050807, Tj, Tg);
+					Tk = FNMS(KP1_732050807, Tj, Tg);
+					Tu = T5 - Ta;
+					R0[0] = KP2_000000000 * (T5 + Ta);
+					Tt = Tp - Ts;
+					R0[WS(rs, 3)] = KP2_000000000 * (Ts + Tp);
+					Tv = Tk - Tf;
+					TD = FMA(KP1_732050807, TA, Tz);
+					TB = FNMS(KP1_732050807, TA, Tz);
+					R1[WS(rs, 4)] = KP1_414213562 * (Tu + Tt);
+					R1[WS(rs, 1)] = KP1_414213562 * (Tt - Tu);
+				   }
+			      }
+			      R0[WS(rs, 2)] = Tf + Tk;
+			      TC = Ty + TB;
+			      R0[WS(rs, 5)] = TB - Ty;
+			 }
+			 R1[WS(rs, 3)] = KP707106781 * (Tv + TC);
+			 R1[0] = KP707106781 * (Tv - TC);
+			 TF = Tl - Tm;
+			 R0[WS(rs, 4)] = -(Tl + Tm);
+		    }
+	       }
+	       R0[WS(rs, 1)] = TD - TE;
+	       TG = TE + TD;
+	       R1[WS(rs, 5)] = KP707106781 * (TF - TG);
+	       R1[WS(rs, 2)] = KP707106781 * (TF + TG);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cbIII_12", {30, 8, 12, 0}, &GENUS };
+
+void X(codelet_r2cbIII_12) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -name r2cbIII_12 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 42 FP additions, 20 FP multiplications,
+ * (or, 38 additions, 16 multiplications, 4 fused multiply/add),
+ * 25 stack variables, 4 constants, and 24 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E T5, Tw, Tb, Te, Tx, Ts, Ta, TA, Tg, Tj, Tz, Tp, Tt, Tu;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = Cr[WS(csr, 1)];
+		    T2 = Cr[WS(csr, 5)];
+		    T3 = Cr[WS(csr, 2)];
+		    T4 = T2 + T3;
+		    T5 = T1 + T4;
+		    Tw = KP866025403 * (T2 - T3);
+		    Tb = FNMS(KP500000000, T4, T1);
+	       }
+	       {
+		    E Tq, Tc, Td, Tr;
+		    Tq = Ci[WS(csi, 1)];
+		    Tc = Ci[WS(csi, 5)];
+		    Td = Ci[WS(csi, 2)];
+		    Tr = Td - Tc;
+		    Te = KP866025403 * (Tc + Td);
+		    Tx = FMA(KP500000000, Tr, Tq);
+		    Ts = Tq - Tr;
+	       }
+	       {
+		    E T6, T7, T8, T9;
+		    T6 = Cr[WS(csr, 4)];
+		    T7 = Cr[0];
+		    T8 = Cr[WS(csr, 3)];
+		    T9 = T7 + T8;
+		    Ta = T6 + T9;
+		    TA = KP866025403 * (T7 - T8);
+		    Tg = FNMS(KP500000000, T9, T6);
+	       }
+	       {
+		    E To, Th, Ti, Tn;
+		    To = Ci[WS(csi, 4)];
+		    Th = Ci[0];
+		    Ti = Ci[WS(csi, 3)];
+		    Tn = Ti - Th;
+		    Tj = KP866025403 * (Th + Ti);
+		    Tz = FMA(KP500000000, Tn, To);
+		    Tp = Tn - To;
+	       }
+	       R0[0] = KP2_000000000 * (T5 + Ta);
+	       R0[WS(rs, 3)] = KP2_000000000 * (Ts + Tp);
+	       Tt = Tp - Ts;
+	       Tu = T5 - Ta;
+	       R1[WS(rs, 1)] = KP1_414213562 * (Tt - Tu);
+	       R1[WS(rs, 4)] = KP1_414213562 * (Tu + Tt);
+	       {
+		    E Tf, Tk, Tv, Ty, TB, TC;
+		    Tf = Tb - Te;
+		    Tk = Tg + Tj;
+		    Tv = Tf - Tk;
+		    Ty = Tw + Tx;
+		    TB = Tz - TA;
+		    TC = Ty + TB;
+		    R0[WS(rs, 2)] = -(KP2_000000000 * (Tf + Tk));
+		    R0[WS(rs, 5)] = KP2_000000000 * (TB - Ty);
+		    R1[0] = KP1_414213562 * (Tv - TC);
+		    R1[WS(rs, 3)] = KP1_414213562 * (Tv + TC);
+	       }
+	       {
+		    E Tl, Tm, TF, TD, TE, TG;
+		    Tl = Tb + Te;
+		    Tm = Tg - Tj;
+		    TF = Tm - Tl;
+		    TD = TA + Tz;
+		    TE = Tx - Tw;
+		    TG = TE + TD;
+		    R0[WS(rs, 4)] = KP2_000000000 * (Tl + Tm);
+		    R1[WS(rs, 2)] = KP1_414213562 * (TF + TG);
+		    R0[WS(rs, 1)] = KP2_000000000 * (TD - TE);
+		    R1[WS(rs, 5)] = KP1_414213562 * (TF - TG);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cbIII_12", {38, 16, 4, 0}, &GENUS };
+
+void X(codelet_r2cbIII_12) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:36 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 64 FP additions, 43 FP multiplications,
+ * (or, 21 additions, 0 multiplications, 43 fused multiply/add),
+ * 48 stack variables, 9 constants, and 30 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E TX, Tv, To, TW, Tl, Tx, Ty, Tw;
+	       {
+		    E TA, Tk, T6, T5, Tz, Th, TI, Tp, Tu, TK, TR, Tn, Td, Tq;
+		    {
+			 E T1, T2, T3, Ti, Tj;
+			 Ti = Ci[WS(csi, 4)];
+			 Tj = Ci[WS(csi, 1)];
+			 T1 = Cr[WS(csr, 7)];
+			 T2 = Cr[WS(csr, 4)];
+			 T3 = Cr[WS(csr, 1)];
+			 TA = FNMS(KP618033988, Ti, Tj);
+			 Tk = FMA(KP618033988, Tj, Ti);
+			 {
+			      E T7, TP, Tc, T8;
+			      T6 = Cr[WS(csr, 2)];
+			      {
+				   E T4, Tg, Ta, Tb, Tf;
+				   T4 = T2 + T3;
+				   Tg = T2 - T3;
+				   Ta = Cr[WS(csr, 3)];
+				   Tb = Cr[WS(csr, 6)];
+				   T7 = Cr[0];
+				   Tf = FNMS(KP500000000, T4, T1);
+				   T5 = FMA(KP2_000000000, T4, T1);
+				   TP = Ta - Tb;
+				   Tc = Ta + Tb;
+				   Tz = FNMS(KP1_118033988, Tg, Tf);
+				   Th = FMA(KP1_118033988, Tg, Tf);
+				   T8 = Cr[WS(csr, 5)];
+			      }
+			      TI = Ci[WS(csi, 2)];
+			      {
+				   E Ts, Tt, TQ, T9;
+				   Ts = Ci[WS(csi, 3)];
+				   Tt = Ci[WS(csi, 6)];
+				   TQ = T7 - T8;
+				   T9 = T7 + T8;
+				   Tp = Ci[0];
+				   Tu = Ts - Tt;
+				   TK = Ts + Tt;
+				   TX = FMA(KP618033988, TP, TQ);
+				   TR = FNMS(KP618033988, TQ, TP);
+				   Tn = T9 - Tc;
+				   Td = T9 + Tc;
+				   Tq = Ci[WS(csi, 5)];
+			      }
+			 }
+		    }
+		    {
+			 E TB, TF, TO, TG, TE;
+			 {
+			      E Tm, T11, TN, TD, TM, T12, TC;
+			      TB = FNMS(KP1_902113032, TA, Tz);
+			      TF = FMA(KP1_902113032, TA, Tz);
+			      {
+				   E Te, Tr, TJ, TL;
+				   Tm = FNMS(KP250000000, Td, T6);
+				   Te = T6 + Td;
+				   Tr = Tp + Tq;
+				   TJ = Tq - Tp;
+				   R0[0] = FMA(KP2_000000000, Te, T5);
+				   T11 = Te - T5;
+				   TN = TJ + TK;
+				   TL = TJ - TK;
+				   Tv = FMA(KP618033988, Tu, Tr);
+				   TD = FNMS(KP618033988, Tr, Tu);
+				   TM = FNMS(KP250000000, TL, TI);
+				   T12 = TL + TI;
+			      }
+			      TC = FNMS(KP559016994, Tn, Tm);
+			      To = FMA(KP559016994, Tn, Tm);
+			      R1[WS(rs, 2)] = FMA(KP1_732050807, T12, T11);
+			      R0[WS(rs, 5)] = FMS(KP1_732050807, T12, T11);
+			      TW = FMA(KP559016994, TN, TM);
+			      TO = FNMS(KP559016994, TN, TM);
+			      TG = FNMS(KP951056516, TD, TC);
+			      TE = FMA(KP951056516, TD, TC);
+			 }
+			 Tl = FNMS(KP1_902113032, Tk, Th);
+			 Tx = FMA(KP1_902113032, Tk, Th);
+			 {
+			      E TS, TU, TT, TH;
+			      TS = FMA(KP951056516, TR, TO);
+			      TU = FNMS(KP951056516, TR, TO);
+			      TT = TF - TG;
+			      R1[WS(rs, 1)] = -(FMA(KP2_000000000, TG, TF));
+			      TH = TB - TE;
+			      R0[WS(rs, 6)] = FMA(KP2_000000000, TE, TB);
+			      R1[WS(rs, 6)] = -(FMA(KP1_732050807, TU, TT));
+			      R0[WS(rs, 4)] = FNMS(KP1_732050807, TU, TT);
+			      R1[WS(rs, 3)] = -(FMA(KP1_732050807, TS, TH));
+			      R0[WS(rs, 1)] = FNMS(KP1_732050807, TS, TH);
+			 }
+		    }
+	       }
+	       Ty = FNMS(KP951056516, Tv, To);
+	       Tw = FMA(KP951056516, Tv, To);
+	       {
+		    E T10, TY, TV, TZ;
+		    T10 = FMA(KP951056516, TX, TW);
+		    TY = FNMS(KP951056516, TX, TW);
+		    TV = Ty - Tx;
+		    R0[WS(rs, 3)] = FMA(KP2_000000000, Ty, Tx);
+		    TZ = Tl - Tw;
+		    R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tw, Tl));
+		    R1[WS(rs, 5)] = FMA(KP1_732050807, TY, TV);
+		    R1[0] = FNMS(KP1_732050807, TY, TV);
+		    R0[WS(rs, 2)] = FMA(KP1_732050807, T10, TZ);
+		    R0[WS(rs, 7)] = FNMS(KP1_732050807, T10, TZ);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cbIII_15", {21, 0, 43, 0}, &GENUS };
+
+void X(codelet_r2cbIII_15) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 64 FP additions, 26 FP multiplications,
+ * (or, 49 additions, 11 multiplications, 15 fused multiply/add),
+ * 47 stack variables, 14 constants, and 30 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP433012701, +0.433012701892219323381861585376468091735701313);
+     DK(KP968245836, +0.968245836551854221294816349945599902708230426);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP1_647278207, +1.647278207092663851754840078556380006059321028);
+     DK(KP1_018073920, +1.018073920910254366901961726787815297021466329);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E Tv, TD, T5, Ts, TC, T6, Tf, TW, TK, Td, Tg, TP, To, TN, TA;
+	       E TO, TQ, Tt, Tu, T12, Te, T11;
+	       Tt = Ci[WS(csi, 4)];
+	       Tu = Ci[WS(csi, 1)];
+	       Tv = FMA(KP1_902113032, Tt, KP1_175570504 * Tu);
+	       TD = FNMS(KP1_175570504, Tt, KP1_902113032 * Tu);
+	       {
+		    E T1, T4, Tq, T2, T3, Tr;
+		    T1 = Cr[WS(csr, 7)];
+		    T2 = Cr[WS(csr, 4)];
+		    T3 = Cr[WS(csr, 1)];
+		    T4 = T2 + T3;
+		    Tq = KP1_118033988 * (T2 - T3);
+		    T5 = FMA(KP2_000000000, T4, T1);
+		    Tr = FNMS(KP500000000, T4, T1);
+		    Ts = Tq + Tr;
+		    TC = Tr - Tq;
+	       }
+	       {
+		    E Tc, TJ, T9, TI;
+		    T6 = Cr[WS(csr, 2)];
+		    {
+			 E Ta, Tb, T7, T8;
+			 Ta = Cr[WS(csr, 3)];
+			 Tb = Cr[WS(csr, 6)];
+			 Tc = Ta + Tb;
+			 TJ = Ta - Tb;
+			 T7 = Cr[0];
+			 T8 = Cr[WS(csr, 5)];
+			 T9 = T7 + T8;
+			 TI = T7 - T8;
+		    }
+		    Tf = KP559016994 * (T9 - Tc);
+		    TW = FNMS(KP1_647278207, TJ, KP1_018073920 * TI);
+		    TK = FMA(KP1_647278207, TI, KP1_018073920 * TJ);
+		    Td = T9 + Tc;
+		    Tg = FNMS(KP250000000, Td, T6);
+	       }
+	       {
+		    E Tn, TM, Tk, TL;
+		    TP = Ci[WS(csi, 2)];
+		    {
+			 E Tl, Tm, Ti, Tj;
+			 Tl = Ci[WS(csi, 3)];
+			 Tm = Ci[WS(csi, 6)];
+			 Tn = Tl - Tm;
+			 TM = Tl + Tm;
+			 Ti = Ci[0];
+			 Tj = Ci[WS(csi, 5)];
+			 Tk = Ti + Tj;
+			 TL = Ti - Tj;
+		    }
+		    To = FMA(KP951056516, Tk, KP587785252 * Tn);
+		    TN = KP968245836 * (TL - TM);
+		    TA = FNMS(KP587785252, Tk, KP951056516 * Tn);
+		    TO = TL + TM;
+		    TQ = FMA(KP433012701, TO, KP1_732050807 * TP);
+	       }
+	       T12 = KP1_732050807 * (TP - TO);
+	       Te = T6 + Td;
+	       T11 = Te - T5;
+	       R0[0] = FMA(KP2_000000000, Te, T5);
+	       R0[WS(rs, 5)] = T12 - T11;
+	       R1[WS(rs, 2)] = T11 + T12;
+	       {
+		    E TE, TG, TB, TF, TY, T10, Tz, TX, TV, TZ;
+		    TE = TC - TD;
+		    TG = TC + TD;
+		    Tz = Tg - Tf;
+		    TB = Tz + TA;
+		    TF = TA - Tz;
+		    TX = TN + TQ;
+		    TY = TW - TX;
+		    T10 = TW + TX;
+		    R0[WS(rs, 6)] = FMA(KP2_000000000, TB, TE);
+		    R1[WS(rs, 1)] = FMS(KP2_000000000, TF, TG);
+		    TV = TE - TB;
+		    R0[WS(rs, 1)] = TV + TY;
+		    R1[WS(rs, 3)] = TY - TV;
+		    TZ = TF + TG;
+		    R0[WS(rs, 4)] = TZ - T10;
+		    R1[WS(rs, 6)] = -(TZ + T10);
+	       }
+	       {
+		    E Tw, Ty, Tp, Tx, TS, TU, Th, TR, TH, TT;
+		    Tw = Ts - Tv;
+		    Ty = Ts + Tv;
+		    Th = Tf + Tg;
+		    Tp = Th + To;
+		    Tx = Th - To;
+		    TR = TN - TQ;
+		    TS = TK + TR;
+		    TU = TR - TK;
+		    R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tp, Tw));
+		    R0[WS(rs, 3)] = FMA(KP2_000000000, Tx, Ty);
+		    TH = Tx - Ty;
+		    R1[WS(rs, 5)] = TH - TS;
+		    R1[0] = TH + TS;
+		    TT = Tw - Tp;
+		    R0[WS(rs, 2)] = TT - TU;
+		    R0[WS(rs, 7)] = TT + TU;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cbIII_15", {49, 11, 15, 0}, &GENUS };
+
+void X(codelet_r2cbIII_15) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:36 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 46 additions, 16 multiplications, 20 fused multiply/add),
+ * 55 stack variables, 9 constants, and 32 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E TA, TD, Tv, TG, TE, TF;
+	       {
+		    E TK, TP, T7, T13, TW, TH, Tj, TC, To, Te, TX, TS, T12, Tt, TB;
+		    {
+			 E T4, Tf, T3, TU, Tz, T5, Tg, Th;
+			 {
+			      E T1, T2, Tx, Ty;
+			      T1 = Cr[0];
+			      T2 = Cr[WS(csr, 7)];
+			      Tx = Ci[0];
+			      Ty = Ci[WS(csi, 7)];
+			      T4 = Cr[WS(csr, 4)];
+			      Tf = T1 - T2;
+			      T3 = T1 + T2;
+			      TU = Ty - Tx;
+			      Tz = Tx + Ty;
+			      T5 = Cr[WS(csr, 3)];
+			      Tg = Ci[WS(csi, 4)];
+			      Th = Ci[WS(csi, 3)];
+			 }
+			 {
+			      E Tb, Tk, Ta, TR, Tn, Tc, Tq, Tr;
+			      {
+				   E T8, T9, Tl, Tm;
+				   T8 = Cr[WS(csr, 2)];
+				   {
+					E Tw, T6, TV, Ti;
+					Tw = T4 - T5;
+					T6 = T4 + T5;
+					TV = Th - Tg;
+					Ti = Tg + Th;
+					TK = Tw - Tz;
+					TA = Tw + Tz;
+					TP = T3 - T6;
+					T7 = T3 + T6;
+					T13 = TV + TU;
+					TW = TU - TV;
+					TH = Tf + Ti;
+					Tj = Tf - Ti;
+					T9 = Cr[WS(csr, 5)];
+				   }
+				   Tl = Ci[WS(csi, 2)];
+				   Tm = Ci[WS(csi, 5)];
+				   Tb = Cr[WS(csr, 1)];
+				   Tk = T8 - T9;
+				   Ta = T8 + T9;
+				   TR = Tl - Tm;
+				   Tn = Tl + Tm;
+				   Tc = Cr[WS(csr, 6)];
+				   Tq = Ci[WS(csi, 1)];
+				   Tr = Ci[WS(csi, 6)];
+			      }
+			      TC = Tk + Tn;
+			      To = Tk - Tn;
+			      {
+				   E Tp, Td, TQ, Ts;
+				   Tp = Tb - Tc;
+				   Td = Tb + Tc;
+				   TQ = Tr - Tq;
+				   Ts = Tq + Tr;
+				   Te = Ta + Td;
+				   TX = Ta - Td;
+				   TS = TQ - TR;
+				   T12 = TR + TQ;
+				   Tt = Tp - Ts;
+				   TB = Tp + Ts;
+			      }
+			 }
+		    }
+		    {
+			 E T10, TT, TY, TZ;
+			 R0[0] = KP2_000000000 * (T7 + Te);
+			 R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
+			 T10 = TP - TS;
+			 TT = TP + TS;
+			 TY = TW - TX;
+			 TZ = TX + TW;
+			 {
+			      E T11, T14, TI, TL, Tu;
+			      T11 = T7 - Te;
+			      T14 = T12 + T13;
+			      R0[WS(rs, 5)] = KP1_847759065 * (FNMS(KP414213562, TT, TY));
+			      R0[WS(rs, 1)] = KP1_847759065 * (FMA(KP414213562, TY, TT));
+			      R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
+			      R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
+			      TD = TB - TC;
+			      TI = TC + TB;
+			      TL = To - Tt;
+			      Tu = To + Tt;
+			      {
+				   E TO, TJ, TN, TM;
+				   R0[WS(rs, 7)] = -(KP1_847759065 * (FNMS(KP414213562, TZ, T10)));
+				   R0[WS(rs, 3)] = KP1_847759065 * (FMA(KP414213562, T10, TZ));
+				   TO = FMA(KP707106781, TI, TH);
+				   TJ = FNMS(KP707106781, TI, TH);
+				   TN = FMA(KP707106781, TL, TK);
+				   TM = FNMS(KP707106781, TL, TK);
+				   Tv = FMA(KP707106781, Tu, Tj);
+				   TG = FNMS(KP707106781, Tu, Tj);
+				   R1[WS(rs, 3)] = KP1_961570560 * (FMA(KP198912367, TO, TN));
+				   R1[WS(rs, 7)] = -(KP1_961570560 * (FNMS(KP198912367, TN, TO)));
+				   R1[WS(rs, 5)] = KP1_662939224 * (FNMS(KP668178637, TJ, TM));
+				   R1[WS(rs, 1)] = KP1_662939224 * (FMA(KP668178637, TM, TJ));
+			      }
+			 }
+		    }
+	       }
+	       TE = FNMS(KP707106781, TD, TA);
+	       TF = FMA(KP707106781, TD, TA);
+	       R1[WS(rs, 2)] = -(KP1_662939224 * (FNMS(KP668178637, TG, TF)));
+	       R1[WS(rs, 6)] = -(KP1_662939224 * (FMA(KP668178637, TF, TG)));
+	       R1[WS(rs, 4)] = -(KP1_961570560 * (FMA(KP198912367, Tv, TE)));
+	       R1[0] = KP1_961570560 * (FNMS(KP198912367, TE, Tv));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cbIII_16", {46, 16, 20, 0}, &GENUS };
+
+void X(codelet_r2cbIII_16) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 54 additions, 20 multiplications, 12 fused multiply/add),
+ * 40 stack variables, 9 constants, and 32 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E T7, TW, T13, Tj, TD, TK, TP, TH, Te, TX, T12, To, Tt, Tx, TS;
+	       E Tw, TT, TY;
+	       {
+		    E T3, Tf, TC, TV, T6, Tz, Ti, TU;
+		    {
+			 E T1, T2, TA, TB;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 7)];
+			 T3 = T1 + T2;
+			 Tf = T1 - T2;
+			 TA = Ci[0];
+			 TB = Ci[WS(csi, 7)];
+			 TC = TA + TB;
+			 TV = TB - TA;
+		    }
+		    {
+			 E T4, T5, Tg, Th;
+			 T4 = Cr[WS(csr, 4)];
+			 T5 = Cr[WS(csr, 3)];
+			 T6 = T4 + T5;
+			 Tz = T4 - T5;
+			 Tg = Ci[WS(csi, 4)];
+			 Th = Ci[WS(csi, 3)];
+			 Ti = Tg + Th;
+			 TU = Tg - Th;
+		    }
+		    T7 = T3 + T6;
+		    TW = TU + TV;
+		    T13 = TV - TU;
+		    Tj = Tf - Ti;
+		    TD = Tz + TC;
+		    TK = Tz - TC;
+		    TP = T3 - T6;
+		    TH = Tf + Ti;
+	       }
+	       {
+		    E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ;
+		    {
+			 E T8, T9, Tl, Tm;
+			 T8 = Cr[WS(csr, 2)];
+			 T9 = Cr[WS(csr, 5)];
+			 Ta = T8 + T9;
+			 Tk = T8 - T9;
+			 Tl = Ci[WS(csi, 2)];
+			 Tm = Ci[WS(csi, 5)];
+			 Tn = Tl + Tm;
+			 TR = Tl - Tm;
+		    }
+		    {
+			 E Tb, Tc, Tq, Tr;
+			 Tb = Cr[WS(csr, 1)];
+			 Tc = Cr[WS(csr, 6)];
+			 Td = Tb + Tc;
+			 Tp = Tb - Tc;
+			 Tq = Ci[WS(csi, 1)];
+			 Tr = Ci[WS(csi, 6)];
+			 Ts = Tq + Tr;
+			 TQ = Tr - Tq;
+		    }
+		    Te = Ta + Td;
+		    TX = Ta - Td;
+		    T12 = TR + TQ;
+		    To = Tk - Tn;
+		    Tt = Tp - Ts;
+		    Tx = Tp + Ts;
+		    TS = TQ - TR;
+		    Tw = Tk + Tn;
+	       }
+	       R0[0] = KP2_000000000 * (T7 + Te);
+	       R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
+	       TT = TP + TS;
+	       TY = TW - TX;
+	       R0[WS(rs, 1)] = FMA(KP1_847759065, TT, KP765366864 * TY);
+	       R0[WS(rs, 5)] = FNMS(KP765366864, TT, KP1_847759065 * TY);
+	       {
+		    E T11, T14, TZ, T10;
+		    T11 = T7 - Te;
+		    T14 = T12 + T13;
+		    R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
+		    R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
+		    TZ = TP - TS;
+		    T10 = TX + TW;
+		    R0[WS(rs, 3)] = FMA(KP765366864, TZ, KP1_847759065 * T10);
+		    R0[WS(rs, 7)] = FNMS(KP1_847759065, TZ, KP765366864 * T10);
+	       }
+	       {
+		    E TJ, TN, TM, TO, TI, TL;
+		    TI = KP707106781 * (Tw + Tx);
+		    TJ = TH - TI;
+		    TN = TH + TI;
+		    TL = KP707106781 * (To - Tt);
+		    TM = TK - TL;
+		    TO = TL + TK;
+		    R1[WS(rs, 1)] = FMA(KP1_662939224, TJ, KP1_111140466 * TM);
+		    R1[WS(rs, 7)] = FNMS(KP1_961570560, TN, KP390180644 * TO);
+		    R1[WS(rs, 5)] = FNMS(KP1_111140466, TJ, KP1_662939224 * TM);
+		    R1[WS(rs, 3)] = FMA(KP390180644, TN, KP1_961570560 * TO);
+	       }
+	       {
+		    E Tv, TF, TE, TG, Tu, Ty;
+		    Tu = KP707106781 * (To + Tt);
+		    Tv = Tj + Tu;
+		    TF = Tj - Tu;
+		    Ty = KP707106781 * (Tw - Tx);
+		    TE = Ty + TD;
+		    TG = Ty - TD;
+		    R1[0] = FNMS(KP390180644, TE, KP1_961570560 * Tv);
+		    R1[WS(rs, 6)] = FNMS(KP1_662939224, TF, KP1_111140466 * TG);
+		    R1[WS(rs, 4)] = -(FMA(KP390180644, Tv, KP1_961570560 * TE));
+		    R1[WS(rs, 2)] = FMA(KP1_111140466, TF, KP1_662939224 * TG);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cbIII_16", {54, 20, 12, 0}, &GENUS };
+
+void X(codelet_r2cbIII_16) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:32 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -name r2cbIII_2 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 0 FP additions, 2 FP multiplications,
+ * (or, 0 additions, 2 multiplications, 0 fused multiply/add),
+ * 4 stack variables, 1 constants, and 4 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = Cr[0];
+	       T2 = Ci[0];
+	       R0[0] = KP2_000000000 * T1;
+	       R1[0] = -(KP2_000000000 * T2);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cbIII_2", {0, 2, 0, 0}, &GENUS };
+
+void X(codelet_r2cbIII_2) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -name r2cbIII_2 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 0 FP additions, 2 FP multiplications,
+ * (or, 0 additions, 2 multiplications, 0 fused multiply/add),
+ * 4 stack variables, 1 constants, and 4 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = Cr[0];
+	       T2 = Ci[0];
+	       R0[0] = KP2_000000000 * T1;
+	       R1[0] = -(KP2_000000000 * T2);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cbIII_2", {0, 2, 0, 0}, &GENUS };
+
+void X(codelet_r2cbIII_2) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cbIII_20 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 94 FP additions, 56 FP multiplications,
+ * (or, 58 additions, 20 multiplications, 36 fused multiply/add),
+ * 59 stack variables, 6 constants, and 40 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E TZ, TD, TW, Tw, Tt, TF, T1f, T1b;
+	       {
+		    E T1l, Tk, T9, Tj, Ta, TV, TI, Ts, TU, T1t, T11, Tx, T13, TC, T1a;
+		    E T1i, Th, Tv, Ty;
+		    {
+			 E TQ, TS, Tr, Tm, Tn;
+			 {
+			      E T1, T5, T6, T2, T3, T7, TY;
+			      T1 = Cr[WS(csr, 2)];
+			      T5 = Cr[WS(csr, 9)];
+			      T6 = Cr[WS(csr, 5)];
+			      T2 = Cr[WS(csr, 6)];
+			      T3 = Cr[WS(csr, 1)];
+			      TQ = Ci[WS(csi, 2)];
+			      T7 = T5 + T6;
+			      TY = T5 - T6;
+			      {
+				   E T4, TX, T8, Tp, Tq;
+				   T4 = T2 + T3;
+				   TX = T2 - T3;
+				   Tp = Ci[WS(csi, 5)];
+				   Tq = Ci[WS(csi, 9)];
+				   T1l = FNMS(KP618033988, TX, TY);
+				   TZ = FMA(KP618033988, TY, TX);
+				   Tk = T4 - T7;
+				   T8 = T4 + T7;
+				   TS = Tp + Tq;
+				   Tr = Tp - Tq;
+				   T9 = T1 + T8;
+				   Tj = FNMS(KP250000000, T8, T1);
+				   Tm = Ci[WS(csi, 6)];
+				   Tn = Ci[WS(csi, 1)];
+			      }
+			 }
+			 {
+			      E Tb, T19, Tg, Tc;
+			      Ta = Cr[WS(csr, 7)];
+			      {
+				   E Te, Tf, To, TR, TT;
+				   Te = Cr[0];
+				   Tf = Cr[WS(csr, 4)];
+				   To = Tm + Tn;
+				   TR = Tm - Tn;
+				   Tb = Cr[WS(csr, 3)];
+				   T19 = Te - Tf;
+				   Tg = Te + Tf;
+				   TT = TR - TS;
+				   TV = TR + TS;
+				   TI = FNMS(KP618033988, To, Tr);
+				   Ts = FMA(KP618033988, Tr, To);
+				   TU = FNMS(KP250000000, TT, TQ);
+				   T1t = TT + TQ;
+				   Tc = Cr[WS(csr, 8)];
+			      }
+			      T11 = Ci[WS(csi, 7)];
+			      {
+				   E TA, TB, Td, T18;
+				   TA = Ci[WS(csi, 4)];
+				   TB = Ci[0];
+				   Td = Tb + Tc;
+				   T18 = Tb - Tc;
+				   Tx = Ci[WS(csi, 3)];
+				   T13 = TB + TA;
+				   TC = TA - TB;
+				   T1a = FMA(KP618033988, T19, T18);
+				   T1i = FNMS(KP618033988, T18, T19);
+				   Th = Td + Tg;
+				   Tv = Td - Tg;
+				   Ty = Ci[WS(csi, 8)];
+			      }
+			 }
+		    }
+		    {
+			 E Tu, T1w, T16, TL, T15, T1u;
+			 {
+			      E Ti, T12, Tz, T14;
+			      Tu = FNMS(KP250000000, Th, Ta);
+			      Ti = Ta + Th;
+			      T12 = Tx - Ty;
+			      Tz = Tx + Ty;
+			      T1w = T9 - Ti;
+			      T14 = T12 - T13;
+			      T16 = T12 + T13;
+			      TL = FNMS(KP618033988, Tz, TC);
+			      TD = FMA(KP618033988, TC, Tz);
+			      T15 = FNMS(KP250000000, T14, T11);
+			      T1u = T14 + T11;
+			      R0[0] = KP2_000000000 * (T9 + Ti);
+			 }
+			 {
+			      E Tl, TJ, TN, T1q, T1m, TK, T1h, T17, TH, T1k, T1v;
+			      Tl = FMA(KP559016994, Tk, Tj);
+			      TH = FNMS(KP559016994, Tk, Tj);
+			      T1k = FNMS(KP559016994, TV, TU);
+			      TW = FMA(KP559016994, TV, TU);
+			      R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t);
+			      T1v = T1t + T1u;
+			      TJ = FNMS(KP951056516, TI, TH);
+			      TN = FMA(KP951056516, TI, TH);
+			      T1q = FMA(KP951056516, T1l, T1k);
+			      T1m = FNMS(KP951056516, T1l, T1k);
+			      R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v);
+			      R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w);
+			      Tw = FMA(KP559016994, Tv, Tu);
+			      TK = FNMS(KP559016994, Tv, Tu);
+			      T1h = FNMS(KP559016994, T16, T15);
+			      T17 = FMA(KP559016994, T16, T15);
+			      {
+				   E TM, TO, T1j, T1r;
+				   TM = FMA(KP951056516, TL, TK);
+				   TO = FNMS(KP951056516, TL, TK);
+				   T1j = FMA(KP951056516, T1i, T1h);
+				   T1r = FNMS(KP951056516, T1i, T1h);
+				   Tt = FNMS(KP951056516, Ts, Tl);
+				   TF = FMA(KP951056516, Ts, Tl);
+				   {
+					E T1n, T1p, T1s, T1o;
+					T1n = TN - TO;
+					R0[WS(rs, 6)] = -(KP2_000000000 * (TN + TO));
+					T1p = TM - TJ;
+					R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM);
+					T1s = T1q + T1r;
+					R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q);
+					T1o = T1m + T1j;
+					R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m);
+					R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s);
+					R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s);
+					R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o);
+					R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o);
+					T1f = FMA(KP951056516, T1a, T17);
+					T1b = FNMS(KP951056516, T1a, T17);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E TE, TG, T10, T1e;
+		    TE = FMA(KP951056516, TD, Tw);
+		    TG = FNMS(KP951056516, TD, Tw);
+		    T10 = FMA(KP951056516, TZ, TW);
+		    T1e = FNMS(KP951056516, TZ, TW);
+		    {
+			 E T1d, TP, T1g, T1c;
+			 T1d = TF - TG;
+			 R0[WS(rs, 2)] = -(KP2_000000000 * (TF + TG));
+			 TP = Tt - TE;
+			 R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE);
+			 T1g = T1e + T1f;
+			 R0[WS(rs, 7)] = KP2_000000000 * (T1e - T1f);
+			 T1c = T10 + T1b;
+			 R0[WS(rs, 3)] = KP2_000000000 * (T10 - T1b);
+			 R1[WS(rs, 9)] = -(KP1_414213562 * (T1d + T1g));
+			 R1[WS(rs, 4)] = KP1_414213562 * (T1d - T1g);
+			 R1[WS(rs, 5)] = -(KP1_414213562 * (TP + T1c));
+			 R1[0] = KP1_414213562 * (TP - T1c);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cbIII_20", {58, 20, 36, 0}, &GENUS };
+
+void X(codelet_r2cbIII_20) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cbIII_20 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 94 FP additions, 44 FP multiplications,
+ * (or, 82 additions, 32 multiplications, 12 fused multiply/add),
+ * 43 stack variables, 6 constants, and 40 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E T1, Tj, T1k, T13, T8, Tk, T17, Ts, T16, TI, T18, T19, Ta, Tu, T1i;
+	       E TS, Th, Tv, TX, TD, TV, TL, TW, TY;
+	       {
+		    E T7, T12, T4, T11;
+		    T1 = Cr[WS(csr, 2)];
+		    {
+			 E T5, T6, T2, T3;
+			 T5 = Cr[WS(csr, 9)];
+			 T6 = Cr[WS(csr, 5)];
+			 T7 = T5 + T6;
+			 T12 = T5 - T6;
+			 T2 = Cr[WS(csr, 6)];
+			 T3 = Cr[WS(csr, 1)];
+			 T4 = T2 + T3;
+			 T11 = T2 - T3;
+		    }
+		    Tj = KP559016994 * (T4 - T7);
+		    T1k = FNMS(KP951056516, T12, KP587785252 * T11);
+		    T13 = FMA(KP951056516, T11, KP587785252 * T12);
+		    T8 = T4 + T7;
+		    Tk = FNMS(KP250000000, T8, T1);
+	       }
+	       {
+		    E Tr, T15, To, T14;
+		    T17 = Ci[WS(csi, 2)];
+		    {
+			 E Tp, Tq, Tm, Tn;
+			 Tp = Ci[WS(csi, 5)];
+			 Tq = Ci[WS(csi, 9)];
+			 Tr = Tp - Tq;
+			 T15 = Tp + Tq;
+			 Tm = Ci[WS(csi, 6)];
+			 Tn = Ci[WS(csi, 1)];
+			 To = Tm + Tn;
+			 T14 = Tm - Tn;
+		    }
+		    Ts = FMA(KP951056516, To, KP587785252 * Tr);
+		    T16 = KP559016994 * (T14 + T15);
+		    TI = FNMS(KP951056516, Tr, KP587785252 * To);
+		    T18 = T14 - T15;
+		    T19 = FNMS(KP250000000, T18, T17);
+	       }
+	       {
+		    E Tg, TR, Td, TQ;
+		    Ta = Cr[WS(csr, 7)];
+		    {
+			 E Te, Tf, Tb, Tc;
+			 Te = Cr[0];
+			 Tf = Cr[WS(csr, 4)];
+			 Tg = Te + Tf;
+			 TR = Te - Tf;
+			 Tb = Cr[WS(csr, 3)];
+			 Tc = Cr[WS(csr, 8)];
+			 Td = Tb + Tc;
+			 TQ = Tb - Tc;
+		    }
+		    Tu = KP559016994 * (Td - Tg);
+		    T1i = FNMS(KP951056516, TR, KP587785252 * TQ);
+		    TS = FMA(KP951056516, TQ, KP587785252 * TR);
+		    Th = Td + Tg;
+		    Tv = FNMS(KP250000000, Th, Ta);
+	       }
+	       {
+		    E TC, TU, Tz, TT;
+		    TX = Ci[WS(csi, 7)];
+		    {
+			 E TA, TB, Tx, Ty;
+			 TA = Ci[WS(csi, 4)];
+			 TB = Ci[0];
+			 TC = TA - TB;
+			 TU = TB + TA;
+			 Tx = Ci[WS(csi, 3)];
+			 Ty = Ci[WS(csi, 8)];
+			 Tz = Tx + Ty;
+			 TT = Ty - Tx;
+		    }
+		    TD = FMA(KP951056516, Tz, KP587785252 * TC);
+		    TV = KP559016994 * (TT - TU);
+		    TL = FNMS(KP587785252, Tz, KP951056516 * TC);
+		    TW = TT + TU;
+		    TY = FMA(KP250000000, TW, TX);
+	       }
+	       {
+		    E T9, Ti, T1w, T1t, T1u, T1v;
+		    T9 = T1 + T8;
+		    Ti = Ta + Th;
+		    T1w = T9 - Ti;
+		    T1t = T18 + T17;
+		    T1u = TX - TW;
+		    T1v = T1t + T1u;
+		    R0[0] = KP2_000000000 * (T9 + Ti);
+		    R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t);
+		    R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w);
+		    R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v);
+	       }
+	       {
+		    E TJ, TO, T1m, T1q, TM, TN, T1j, T1r;
+		    {
+			 E TH, T1l, TK, T1h;
+			 TH = Tk - Tj;
+			 TJ = TH + TI;
+			 TO = TH - TI;
+			 T1l = T19 - T16;
+			 T1m = T1k + T1l;
+			 T1q = T1l - T1k;
+			 TK = Tv - Tu;
+			 TM = TK + TL;
+			 TN = TL - TK;
+			 T1h = TV + TY;
+			 T1j = T1h - T1i;
+			 T1r = T1i + T1h;
+		    }
+		    R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM);
+		    R0[WS(rs, 6)] = KP2_000000000 * (TN - TO);
+		    R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q);
+		    R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m);
+		    {
+			 E T1p, T1s, T1n, T1o;
+			 T1p = TM - TJ;
+			 T1s = T1q + T1r;
+			 R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s);
+			 R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s);
+			 T1n = TO + TN;
+			 T1o = T1m + T1j;
+			 R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o);
+			 R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o);
+		    }
+	       }
+	       {
+		    E Tt, TG, T1b, T1f, TE, TF, T10, T1e;
+		    {
+			 E Tl, T1a, Tw, TZ;
+			 Tl = Tj + Tk;
+			 Tt = Tl - Ts;
+			 TG = Tl + Ts;
+			 T1a = T16 + T19;
+			 T1b = T13 + T1a;
+			 T1f = T1a - T13;
+			 Tw = Tu + Tv;
+			 TE = Tw + TD;
+			 TF = TD - Tw;
+			 TZ = TV - TY;
+			 T10 = TS + TZ;
+			 T1e = TZ - TS;
+		    }
+		    R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE);
+		    R0[WS(rs, 2)] = KP2_000000000 * (TF - TG);
+		    R0[WS(rs, 7)] = KP2_000000000 * (T1f + T1e);
+		    R0[WS(rs, 3)] = KP2_000000000 * (T1b + T10);
+		    {
+			 E T1d, T1g, TP, T1c;
+			 T1d = TG + TF;
+			 T1g = T1e - T1f;
+			 R1[WS(rs, 4)] = KP1_414213562 * (T1d + T1g);
+			 R1[WS(rs, 9)] = KP1_414213562 * (T1g - T1d);
+			 TP = Tt - TE;
+			 T1c = T10 - T1b;
+			 R1[0] = KP1_414213562 * (TP + T1c);
+			 R1[WS(rs, 5)] = KP1_414213562 * (T1c - TP);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cbIII_20", {82, 32, 12, 0}, &GENUS };
+
+void X(codelet_r2cbIII_20) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -name r2cbIII_25 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 152 FP additions, 120 FP multiplications,
+ * (or, 32 additions, 0 multiplications, 120 fused multiply/add),
+ * 115 stack variables, 44 constants, and 50 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP979740652, +0.979740652857618686258237536568998933733477632);
+     DK(KP438153340, +0.438153340021931793654057951961031291699532119);
+     DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DK(KP1_721083328, +1.721083328735889354196523361841037632825608373);
+     DK(KP1_606007150, +1.606007150877320829666881187140752009270929701);
+     DK(KP1_011627398, +1.011627398597394192215998921771049272931807941);
+     DK(KP641441904, +0.641441904830606407298806329068862424939687989);
+     DK(KP595480289, +0.595480289600000014706716770488118292997907308);
+     DK(KP452413526, +0.452413526233009763856834323966348796985206956);
+     DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DK(KP933137358, +0.933137358350283770603023973254446451924190884);
+     DK(KP1_666834356, +1.666834356657377354817925100486477686277992119);
+     DK(KP1_842354653, +1.842354653930286640500894870830132058718564461);
+     DK(KP1_082908895, +1.082908895072625554092571180165639018104066379);
+     DK(KP576710603, +0.576710603632765877371579268136471017090111488);
+     DK(KP662318342, +0.662318342759882818626911127577439236802190210);
+     DK(KP484291580, +0.484291580564315559745084187732367906918006201);
+     DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DK(KP1_898359647, +1.898359647016882523151110931686726543423167685);
+     DK(KP1_386580726, +1.386580726567734802700860150804827247498955921);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP1_115827804, +1.115827804063668528375399296931134075984874304);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP499013364, +0.499013364214135780976168403431725276668452610);
+     DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP730409924, +0.730409924561256563751459444999838399157094302);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP451418159, +0.451418159099103183892477933432151804893354132);
+     DK(KP846146756, +0.846146756728608505452954290121135880883743802);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E T1P, T2c, T2a, T24, T26, T25, T27, T2b;
+	       {
+		    E T1O, TS, T5, T1N, TP, Te, TA, T2i, T1V, T17, T1B, T2h, T1S, T10, T1C;
+		    E T1a, T19, Tn, T1h, T1l, T1Y, T1e, T21, TJ, T1g;
+		    {
+			 E T1, T2, T3, TQ, TR;
+			 TQ = Ci[WS(csi, 7)];
+			 TR = Ci[WS(csi, 2)];
+			 T1 = Cr[WS(csr, 12)];
+			 T2 = Cr[WS(csr, 7)];
+			 T3 = Cr[WS(csr, 2)];
+			 T1O = FNMS(KP618033988, TQ, TR);
+			 TS = FMA(KP618033988, TR, TQ);
+			 {
+			      E TV, TU, T1U, T16, T12, T1R, TZ, T11;
+			      {
+				   E T6, Tz, T14, T15, TX, Tu, Td, Tx, TY, T4, TO, Ty;
+				   T6 = Cr[WS(csr, 11)];
+				   T4 = T2 + T3;
+				   TO = T3 - T2;
+				   Tz = Ci[WS(csi, 11)];
+				   {
+					E Ta, T9, Tb, T7, T8, TN;
+					T7 = Cr[WS(csr, 6)];
+					T8 = Cr[WS(csr, 8)];
+					TN = FNMS(KP500000000, T4, T1);
+					T5 = FMA(KP2_000000000, T4, T1);
+					Ta = Cr[WS(csr, 1)];
+					T14 = T8 - T7;
+					T9 = T7 + T8;
+					T1N = FMA(KP1_118033988, TO, TN);
+					TP = FNMS(KP1_118033988, TO, TN);
+					Tb = Cr[WS(csr, 3)];
+					{
+					     E Tv, Tw, Ts, Tt, Tc;
+					     Ts = Ci[WS(csi, 8)];
+					     Tt = Ci[WS(csi, 6)];
+					     T15 = Tb - Ta;
+					     Tc = Ta + Tb;
+					     Tv = Ci[WS(csi, 3)];
+					     TX = Tt + Ts;
+					     Tu = Ts - Tt;
+					     Tw = Ci[WS(csi, 1)];
+					     Td = T9 + Tc;
+					     TV = Tc - T9;
+					     Tx = Tv - Tw;
+					     TY = Tw + Tv;
+					}
+				   }
+				   Te = T6 + Td;
+				   TU = FMS(KP250000000, Td, T6);
+				   T1U = FNMS(KP618033988, T14, T15);
+				   T16 = FMA(KP618033988, T15, T14);
+				   T12 = Tx - Tu;
+				   Ty = Tu + Tx;
+				   T1R = FNMS(KP618033988, TX, TY);
+				   TZ = FMA(KP618033988, TY, TX);
+				   TA = Ty - Tz;
+				   T11 = FMA(KP250000000, Ty, Tz);
+			      }
+			      {
+				   E Tf, TI, T1j, T1k, Tm, T1c, TD, TG, T1d, TH;
+				   Tf = Cr[WS(csr, 10)];
+				   TI = Ci[WS(csi, 10)];
+				   {
+					E T13, T1T, TW, T1Q;
+					T13 = FMA(KP559016994, T12, T11);
+					T1T = FNMS(KP559016994, T12, T11);
+					TW = FMA(KP559016994, TV, TU);
+					T1Q = FNMS(KP559016994, TV, TU);
+					T2i = FMA(KP951056516, T1U, T1T);
+					T1V = FNMS(KP951056516, T1U, T1T);
+					T17 = FMA(KP951056516, T16, T13);
+					T1B = FNMS(KP951056516, T16, T13);
+					T2h = FNMS(KP951056516, T1R, T1Q);
+					T1S = FMA(KP951056516, T1R, T1Q);
+					T10 = FNMS(KP951056516, TZ, TW);
+					T1C = FMA(KP951056516, TZ, TW);
+					{
+					     E Tg, Th, Tj, Tk;
+					     Tg = Cr[WS(csr, 5)];
+					     Th = Cr[WS(csr, 9)];
+					     Tj = Cr[0];
+					     Tk = Cr[WS(csr, 4)];
+					     {
+						  E TB, Ti, Tl, TC, TE, TF;
+						  TB = Ci[WS(csi, 9)];
+						  T1j = Tg - Th;
+						  Ti = Tg + Th;
+						  T1k = Tk - Tj;
+						  Tl = Tj + Tk;
+						  TC = Ci[WS(csi, 5)];
+						  TE = Ci[WS(csi, 4)];
+						  TF = Ci[0];
+						  Tm = Ti + Tl;
+						  T1a = Ti - Tl;
+						  T1c = TC + TB;
+						  TD = TB - TC;
+						  TG = TE - TF;
+						  T1d = TF + TE;
+					     }
+					}
+				   }
+				   T19 = FMS(KP250000000, Tm, Tf);
+				   Tn = Tf + Tm;
+				   T1h = TD - TG;
+				   TH = TD + TG;
+				   T1l = FNMS(KP618033988, T1k, T1j);
+				   T1Y = FMA(KP618033988, T1j, T1k);
+				   T1e = FMA(KP618033988, T1d, T1c);
+				   T21 = FNMS(KP618033988, T1c, T1d);
+				   TJ = TH - TI;
+				   T1g = FMA(KP250000000, TH, TI);
+			      }
+			 }
+		    }
+		    {
+			 E T1Z, T1m, T1y, T22, T1f, T1z, T2j, T2g, T2d, T2q, T2s;
+			 {
+			      E Tq, To, T2e, T2f;
+			      Tq = Tn - Te;
+			      To = Te + Tn;
+			      {
+				   E T1i, T1X, T1b, T20;
+				   T1i = FNMS(KP559016994, T1h, T1g);
+				   T1X = FMA(KP559016994, T1h, T1g);
+				   T1b = FNMS(KP559016994, T1a, T19);
+				   T20 = FMA(KP559016994, T1a, T19);
+				   T2e = FMA(KP951056516, T1Y, T1X);
+				   T1Z = FNMS(KP951056516, T1Y, T1X);
+				   T1m = FNMS(KP951056516, T1l, T1i);
+				   T1y = FMA(KP951056516, T1l, T1i);
+				   T2f = FNMS(KP951056516, T21, T20);
+				   T22 = FMA(KP951056516, T21, T20);
+				   T1f = FNMS(KP951056516, T1e, T1b);
+				   T1z = FMA(KP951056516, T1e, T1b);
+			      }
+			      {
+				   E T2o, TK, TM, T2p, Tr, TL, Tp;
+				   T2o = FMA(KP939062505, T2h, T2i);
+				   T2j = FNMS(KP939062505, T2i, T2h);
+				   R0[0] = FMA(KP2_000000000, To, T5);
+				   Tp = FNMS(KP500000000, To, T5);
+				   TK = FMA(KP618033988, TJ, TA);
+				   TM = FNMS(KP618033988, TA, TJ);
+				   T2g = FNMS(KP062914667, T2f, T2e);
+				   T2p = FMA(KP062914667, T2e, T2f);
+				   Tr = FNMS(KP1_118033988, Tq, Tp);
+				   TL = FMA(KP1_118033988, Tq, Tp);
+				   T2d = FMA(KP1_902113032, T1O, T1N);
+				   T1P = FNMS(KP1_902113032, T1O, T1N);
+				   T2q = FMA(KP846146756, T2p, T2o);
+				   T2s = FNMS(KP451418159, T2o, T2p);
+				   R0[WS(rs, 10)] = FMA(KP1_902113032, TK, Tr);
+				   R1[WS(rs, 2)] = FMS(KP1_902113032, TK, Tr);
+				   R1[WS(rs, 7)] = FMS(KP1_902113032, TM, TL);
+				   R0[WS(rs, 5)] = FMA(KP1_902113032, TM, TL);
+			      }
+			 }
+			 {
+			      E T18, T1n, T1x, TT, T2m, T1w, T1u, T2l, T1s, T1t, T2k;
+			      T18 = FNMS(KP256756360, T17, T10);
+			      T1s = FMA(KP256756360, T10, T17);
+			      T1t = FMA(KP549754652, T1f, T1m);
+			      T1n = FNMS(KP549754652, T1m, T1f);
+			      T1x = FNMS(KP1_902113032, TS, TP);
+			      TT = FMA(KP1_902113032, TS, TP);
+			      T2m = FMA(KP730409924, T2j, T2g);
+			      T2k = FNMS(KP730409924, T2j, T2g);
+			      T1w = FNMS(KP683113946, T1s, T1t);
+			      T1u = FMA(KP559154169, T1t, T1s);
+			      R1[WS(rs, 1)] = -(FMA(KP1_996053456, T2k, T2d));
+			      T2l = FNMS(KP499013364, T2k, T2d);
+			      {
+				   E T1K, T1M, T1G, T1E;
+				   {
+					E T1D, T1A, T1q, T1p, T1v, T1r;
+					{
+					     E T1I, T1J, T2n, T2r, T1o;
+					     T1I = FMA(KP634619297, T1B, T1C);
+					     T1D = FNMS(KP634619297, T1C, T1B);
+					     T1A = FMA(KP470564281, T1z, T1y);
+					     T1J = FNMS(KP470564281, T1y, T1z);
+					     T2n = FNMS(KP1_115827804, T2m, T2l);
+					     T2r = FMA(KP1_115827804, T2m, T2l);
+					     T1q = FNMS(KP904730450, T1n, T18);
+					     T1o = FMA(KP904730450, T1n, T18);
+					     R1[WS(rs, 11)] = FMS(KP1_386580726, T2q, T2n);
+					     R0[WS(rs, 4)] = FMA(KP1_386580726, T2q, T2n);
+					     R0[WS(rs, 9)] = FMA(KP1_898359647, T2s, T2r);
+					     R1[WS(rs, 6)] = FMS(KP1_898359647, T2s, T2r);
+					     R1[0] = FMS(KP1_937166322, T1o, TT);
+					     T1p = FMA(KP484291580, T1o, TT);
+					     T1K = FMA(KP662318342, T1J, T1I);
+					     T1M = FNMS(KP576710603, T1I, T1J);
+					}
+					T1v = FMA(KP1_082908895, T1q, T1p);
+					T1r = FNMS(KP1_082908895, T1q, T1p);
+					R1[WS(rs, 10)] = FMS(KP1_842354653, T1u, T1r);
+					R0[WS(rs, 3)] = FMA(KP1_842354653, T1u, T1r);
+					R0[WS(rs, 8)] = FMA(KP1_666834356, T1w, T1v);
+					R1[WS(rs, 5)] = FMS(KP1_666834356, T1w, T1v);
+					T1G = FNMS(KP933137358, T1D, T1A);
+					T1E = FMA(KP933137358, T1D, T1A);
+				   }
+				   {
+					E T23, T28, T29, T1W, T1F, T1H, T1L;
+					T23 = FNMS(KP634619297, T22, T1Z);
+					T28 = FMA(KP634619297, T1Z, T22);
+					T29 = FMA(KP549754652, T1S, T1V);
+					T1W = FNMS(KP549754652, T1V, T1S);
+					R0[WS(rs, 2)] = FMA(KP1_809654104, T1E, T1x);
+					T1F = FNMS(KP452413526, T1E, T1x);
+					T2c = FMA(KP595480289, T28, T29);
+					T2a = FNMS(KP641441904, T29, T28);
+					T1H = FNMS(KP1_011627398, T1G, T1F);
+					T1L = FMA(KP1_011627398, T1G, T1F);
+					R0[WS(rs, 12)] = FNMS(KP1_606007150, T1K, T1H);
+					R1[WS(rs, 4)] = -(FMA(KP1_606007150, T1K, T1H));
+					R1[WS(rs, 9)] = -(FMA(KP1_721083328, T1M, T1L));
+					R0[WS(rs, 7)] = FNMS(KP1_721083328, T1M, T1L);
+					T24 = FNMS(KP963507348, T23, T1W);
+					T26 = FMA(KP963507348, T23, T1W);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       R0[WS(rs, 1)] = FNMS(KP1_752613360, T24, T1P);
+	       T25 = FMA(KP438153340, T24, T1P);
+	       T27 = FMA(KP979740652, T26, T25);
+	       T2b = FNMS(KP979740652, T26, T25);
+	       R1[WS(rs, 8)] = -(FMA(KP1_606007150, T2a, T27));
+	       R0[WS(rs, 6)] = FNMS(KP1_606007150, T2a, T27);
+	       R1[WS(rs, 3)] = -(FMA(KP1_666834356, T2c, T2b));
+	       R0[WS(rs, 11)] = FNMS(KP1_666834356, T2c, T2b);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cbIII_25", {32, 0, 120, 0}, &GENUS };
+
+void X(codelet_r2cbIII_25) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -name r2cbIII_25 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 152 FP additions, 98 FP multiplications,
+ * (or, 100 additions, 46 multiplications, 52 fused multiply/add),
+ * 65 stack variables, 21 constants, and 50 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E TS, T1O, T5, TP, T1N, TI, TH, Te, T17, T2h, T1y, T1V, T10, T2g, T1x;
+	       E T1S, Tz, Ty, Tn, T1m, T2e, T1B, T22, T1f, T2d, T1A, T1Z, TQ, TR;
+	       TQ = Ci[WS(csi, 2)];
+	       TR = Ci[WS(csi, 7)];
+	       TS = FNMS(KP1_175570504, TR, KP1_902113032 * TQ);
+	       T1O = FMA(KP1_902113032, TR, KP1_175570504 * TQ);
+	       {
+		    E T1, T4, TN, T2, T3, TO;
+		    T1 = Cr[WS(csr, 12)];
+		    T2 = Cr[WS(csr, 7)];
+		    T3 = Cr[WS(csr, 2)];
+		    T4 = T2 + T3;
+		    TN = KP1_118033988 * (T3 - T2);
+		    T5 = FMA(KP2_000000000, T4, T1);
+		    TO = FMS(KP500000000, T4, T1);
+		    TP = TN - TO;
+		    T1N = TO + TN;
+	       }
+	       {
+		    E T6, Td, T15, TU, T14, T11, TX, TY;
+		    T6 = Cr[WS(csr, 11)];
+		    TI = Ci[WS(csi, 11)];
+		    {
+			 E T7, T8, T9, Ta, Tb, Tc;
+			 T7 = Cr[WS(csr, 6)];
+			 T8 = Cr[WS(csr, 8)];
+			 T9 = T7 + T8;
+			 Ta = Cr[WS(csr, 1)];
+			 Tb = Cr[WS(csr, 3)];
+			 Tc = Ta + Tb;
+			 Td = T9 + Tc;
+			 T15 = Ta - Tb;
+			 TU = KP559016994 * (Tc - T9);
+			 T14 = T8 - T7;
+		    }
+		    {
+			 E TB, TC, TD, TE, TF, TG;
+			 TB = Ci[WS(csi, 6)];
+			 TC = Ci[WS(csi, 8)];
+			 TD = TB - TC;
+			 TE = Ci[WS(csi, 1)];
+			 TF = Ci[WS(csi, 3)];
+			 TG = TE - TF;
+			 TH = TD + TG;
+			 T11 = KP559016994 * (TD - TG);
+			 TX = TB + TC;
+			 TY = TE + TF;
+		    }
+		    Te = T6 + Td;
+		    {
+			 E T16, T1T, T13, T1U, T12;
+			 T16 = FMA(KP587785252, T14, KP951056516 * T15);
+			 T1T = FNMS(KP587785252, T15, KP951056516 * T14);
+			 T12 = FNMS(KP250000000, TH, TI);
+			 T13 = T11 - T12;
+			 T1U = T11 + T12;
+			 T17 = T13 - T16;
+			 T2h = T1T - T1U;
+			 T1y = T16 + T13;
+			 T1V = T1T + T1U;
+		    }
+		    {
+			 E TZ, T1R, TW, T1Q, TV;
+			 TZ = FNMS(KP951056516, TY, KP587785252 * TX);
+			 T1R = FMA(KP951056516, TX, KP587785252 * TY);
+			 TV = FMS(KP250000000, Td, T6);
+			 TW = TU - TV;
+			 T1Q = TV + TU;
+			 T10 = TW + TZ;
+			 T2g = T1Q + T1R;
+			 T1x = TZ - TW;
+			 T1S = T1Q - T1R;
+		    }
+	       }
+	       {
+		    E Tf, Tm, T1k, T19, T1j, T1g, T1c, T1d;
+		    Tf = Cr[WS(csr, 10)];
+		    Tz = Ci[WS(csi, 10)];
+		    {
+			 E Tg, Th, Ti, Tj, Tk, Tl;
+			 Tg = Cr[WS(csr, 5)];
+			 Th = Cr[WS(csr, 9)];
+			 Ti = Tg + Th;
+			 Tj = Cr[0];
+			 Tk = Cr[WS(csr, 4)];
+			 Tl = Tj + Tk;
+			 Tm = Ti + Tl;
+			 T1k = Tj - Tk;
+			 T19 = KP559016994 * (Tl - Ti);
+			 T1j = Th - Tg;
+		    }
+		    {
+			 E Ts, Tt, Tu, Tv, Tw, Tx;
+			 Ts = Ci[WS(csi, 4)];
+			 Tt = Ci[0];
+			 Tu = Ts - Tt;
+			 Tv = Ci[WS(csi, 5)];
+			 Tw = Ci[WS(csi, 9)];
+			 Tx = Tv - Tw;
+			 Ty = Tu - Tx;
+			 T1g = KP559016994 * (Tx + Tu);
+			 T1c = Tv + Tw;
+			 T1d = Tt + Ts;
+		    }
+		    Tn = Tf + Tm;
+		    {
+			 E T1l, T20, T1i, T21, T1h;
+			 T1l = FMA(KP587785252, T1j, KP951056516 * T1k);
+			 T20 = FNMS(KP587785252, T1k, KP951056516 * T1j);
+			 T1h = FMA(KP250000000, Ty, Tz);
+			 T1i = T1g - T1h;
+			 T21 = T1g + T1h;
+			 T1m = T1i - T1l;
+			 T2e = T21 - T20;
+			 T1B = T1l + T1i;
+			 T22 = T20 + T21;
+		    }
+		    {
+			 E T1e, T1Y, T1b, T1X, T1a;
+			 T1e = FNMS(KP951056516, T1d, KP587785252 * T1c);
+			 T1Y = FMA(KP951056516, T1c, KP587785252 * T1d);
+			 T1a = FMS(KP250000000, Tm, Tf);
+			 T1b = T19 - T1a;
+			 T1X = T1a + T19;
+			 T1f = T1b + T1e;
+			 T2d = T1X + T1Y;
+			 T1A = T1e - T1b;
+			 T1Z = T1X - T1Y;
+		    }
+	       }
+	       {
+		    E Tq, To, Tp, TK, TM, TA, TJ, TL, Tr;
+		    Tq = KP1_118033988 * (Tn - Te);
+		    To = Te + Tn;
+		    Tp = FMS(KP500000000, To, T5);
+		    TA = Ty - Tz;
+		    TJ = TH + TI;
+		    TK = FNMS(KP1_902113032, TJ, KP1_175570504 * TA);
+		    TM = FMA(KP1_175570504, TJ, KP1_902113032 * TA);
+		    R0[0] = FMA(KP2_000000000, To, T5);
+		    TL = Tq - Tp;
+		    R0[WS(rs, 5)] = TL + TM;
+		    R1[WS(rs, 7)] = TM - TL;
+		    Tr = Tp + Tq;
+		    R1[WS(rs, 2)] = Tr + TK;
+		    R0[WS(rs, 10)] = TK - Tr;
+	       }
+	       {
+		    E T2q, T2s, T2k, T2j, T2l, T2m, T2r, T2n;
+		    {
+			 E T2o, T2p, T2f, T2i;
+			 T2o = FNMS(KP904827052, T2d, KP425779291 * T2e);
+			 T2p = FNMS(KP535826794, T2h, KP844327925 * T2g);
+			 T2q = FNMS(KP1_902113032, T2p, KP1_175570504 * T2o);
+			 T2s = FMA(KP1_175570504, T2p, KP1_902113032 * T2o);
+			 T2k = T1N + T1O;
+			 T2f = FMA(KP425779291, T2d, KP904827052 * T2e);
+			 T2i = FMA(KP535826794, T2g, KP844327925 * T2h);
+			 T2j = T2f - T2i;
+			 T2l = FMA(KP500000000, T2j, T2k);
+			 T2m = KP1_118033988 * (T2i + T2f);
+		    }
+		    R0[WS(rs, 2)] = FMS(KP2_000000000, T2j, T2k);
+		    T2r = T2m - T2l;
+		    R0[WS(rs, 7)] = T2r + T2s;
+		    R1[WS(rs, 9)] = T2s - T2r;
+		    T2n = T2l + T2m;
+		    R1[WS(rs, 4)] = T2n + T2q;
+		    R0[WS(rs, 12)] = T2q - T2n;
+	       }
+	       {
+		    E T1u, T1w, TT, T1o, T1p, T1q, T1v, T1r;
+		    {
+			 E T1s, T1t, T18, T1n;
+			 T1s = FMA(KP481753674, T10, KP876306680 * T17);
+			 T1t = FMA(KP844327925, T1f, KP535826794 * T1m);
+			 T1u = FMA(KP1_902113032, T1s, KP1_175570504 * T1t);
+			 T1w = FNMS(KP1_175570504, T1s, KP1_902113032 * T1t);
+			 TT = TP - TS;
+			 T18 = FNMS(KP481753674, T17, KP876306680 * T10);
+			 T1n = FNMS(KP844327925, T1m, KP535826794 * T1f);
+			 T1o = T18 + T1n;
+			 T1p = FMS(KP500000000, T1o, TT);
+			 T1q = KP1_118033988 * (T1n - T18);
+		    }
+		    R0[WS(rs, 1)] = FMA(KP2_000000000, T1o, TT);
+		    T1v = T1q - T1p;
+		    R0[WS(rs, 6)] = T1v + T1w;
+		    R1[WS(rs, 8)] = T1w - T1v;
+		    T1r = T1p + T1q;
+		    R1[WS(rs, 3)] = T1r + T1u;
+		    R0[WS(rs, 11)] = T1u - T1r;
+	       }
+	       {
+		    E T1H, T1L, T1E, T1D, T1I, T1J, T1M, T1K;
+		    {
+			 E T1F, T1G, T1z, T1C;
+			 T1F = FNMS(KP062790519, T1B, KP998026728 * T1A);
+			 T1G = FNMS(KP684547105, T1x, KP728968627 * T1y);
+			 T1H = FNMS(KP1_902113032, T1G, KP1_175570504 * T1F);
+			 T1L = FMA(KP1_175570504, T1G, KP1_902113032 * T1F);
+			 T1E = TP + TS;
+			 T1z = FMA(KP728968627, T1x, KP684547105 * T1y);
+			 T1C = FMA(KP062790519, T1A, KP998026728 * T1B);
+			 T1D = T1z + T1C;
+			 T1I = FMA(KP500000000, T1D, T1E);
+			 T1J = KP1_118033988 * (T1C - T1z);
+		    }
+		    R1[WS(rs, 1)] = FMS(KP2_000000000, T1D, T1E);
+		    T1M = T1J - T1I;
+		    R0[WS(rs, 9)] = T1L - T1M;
+		    R1[WS(rs, 6)] = T1L + T1M;
+		    T1K = T1I + T1J;
+		    R1[WS(rs, 11)] = T1H - T1K;
+		    R0[WS(rs, 4)] = T1H + T1K;
+	       }
+	       {
+		    E T2a, T2c, T1P, T24, T25, T26, T2b, T27;
+		    {
+			 E T28, T29, T1W, T23;
+			 T28 = FMA(KP248689887, T1S, KP968583161 * T1V);
+			 T29 = FMA(KP481753674, T1Z, KP876306680 * T22);
+			 T2a = FMA(KP1_902113032, T28, KP1_175570504 * T29);
+			 T2c = FNMS(KP1_175570504, T28, KP1_902113032 * T29);
+			 T1P = T1N - T1O;
+			 T1W = FNMS(KP248689887, T1V, KP968583161 * T1S);
+			 T23 = FNMS(KP481753674, T22, KP876306680 * T1Z);
+			 T24 = T1W + T23;
+			 T25 = FMS(KP500000000, T24, T1P);
+			 T26 = KP1_118033988 * (T23 - T1W);
+		    }
+		    R1[0] = FMA(KP2_000000000, T24, T1P);
+		    T2b = T26 - T25;
+		    R1[WS(rs, 5)] = T2b + T2c;
+		    R0[WS(rs, 8)] = T2c - T2b;
+		    T27 = T25 + T26;
+		    R0[WS(rs, 3)] = T27 + T2a;
+		    R1[WS(rs, 10)] = T2a - T27;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cbIII_25", {100, 46, 52, 0}, &GENUS };
+
+void X(codelet_r2cbIII_25) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:32 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -name r2cbIII_3 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 4 FP additions, 3 FP multiplications,
+ * (or, 1 additions, 0 multiplications, 3 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T4, T1, T2, T3;
+	       T4 = Ci[0];
+	       T1 = Cr[WS(csr, 1)];
+	       T2 = Cr[0];
+	       R0[0] = FMA(KP2_000000000, T2, T1);
+	       T3 = T2 - T1;
+	       R1[0] = FNMS(KP1_732050807, T4, T3);
+	       R0[WS(rs, 1)] = -(FMA(KP1_732050807, T4, T3));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cbIII_3", {1, 0, 3, 0}, &GENUS };
+
+void X(codelet_r2cbIII_3) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -name r2cbIII_3 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 8 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T5, T1, T2, T3, T4;
+	       T4 = Ci[0];
+	       T5 = KP1_732050807 * T4;
+	       T1 = Cr[WS(csr, 1)];
+	       T2 = Cr[0];
+	       T3 = T2 - T1;
+	       R0[0] = FMA(KP2_000000000, T2, T1);
+	       R0[WS(rs, 1)] = -(T3 + T5);
+	       R1[0] = T3 - T5;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cbIII_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cbIII_3) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:37 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cbIII_32 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 106 additions, 32 multiplications, 68 fused multiply/add),
+ * 101 stack variables, 18 constants, and 64 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T1N, T1K, T1Q, T1H, T1O, T1P;
+	       {
+		    E T1I, T1e, T1Z, T7, T2E, T2i, T1x, Tz, Te, T2j, T22, T2F, T1h, T1y, TK;
+		    E T1J, Tm, T2B, TX, Tp, T2m, T28, T1M, T1C, T1k, TW, TY, T2a, T14, T15;
+		    E Ts, TZ;
+		    {
+			 E TE, T1g, TJ, T1f;
+			 {
+			      E T4, Tv, T3, T2g, T1d, T5, Tw, Tx;
+			      {
+				   E T1, T2, T1b, T1c;
+				   T1 = Cr[0];
+				   T2 = Cr[WS(csr, 15)];
+				   T1b = Ci[0];
+				   T1c = Ci[WS(csi, 15)];
+				   T4 = Cr[WS(csr, 8)];
+				   Tv = T1 - T2;
+				   T3 = T1 + T2;
+				   T2g = T1c - T1b;
+				   T1d = T1b + T1c;
+				   T5 = Cr[WS(csr, 7)];
+				   Tw = Ci[WS(csi, 8)];
+				   Tx = Ci[WS(csi, 7)];
+			      }
+			      {
+				   E Tb, TA, Ta, T20, TD, Tc, TG, TH;
+				   {
+					E T8, T9, TB, TC;
+					T8 = Cr[WS(csr, 4)];
+					{
+					     E T1a, T6, T2h, Ty;
+					     T1a = T4 - T5;
+					     T6 = T4 + T5;
+					     T2h = Tx - Tw;
+					     Ty = Tw + Tx;
+					     T1I = T1a - T1d;
+					     T1e = T1a + T1d;
+					     T1Z = T3 - T6;
+					     T7 = T3 + T6;
+					     T2E = T2h + T2g;
+					     T2i = T2g - T2h;
+					     T1x = Tv + Ty;
+					     Tz = Tv - Ty;
+					     T9 = Cr[WS(csr, 11)];
+					}
+					TB = Ci[WS(csi, 4)];
+					TC = Ci[WS(csi, 11)];
+					Tb = Cr[WS(csr, 3)];
+					TA = T8 - T9;
+					Ta = T8 + T9;
+					T20 = TC - TB;
+					TD = TB + TC;
+					Tc = Cr[WS(csr, 12)];
+					TG = Ci[WS(csi, 3)];
+					TH = Ci[WS(csi, 12)];
+				   }
+				   {
+					E TF, Td, T21, TI;
+					TE = TA - TD;
+					T1g = TA + TD;
+					TF = Tb - Tc;
+					Td = Tb + Tc;
+					T21 = TG - TH;
+					TI = TG + TH;
+					Te = Ta + Td;
+					T2j = Ta - Td;
+					T22 = T20 - T21;
+					T2F = T20 + T21;
+					TJ = TF - TI;
+					T1f = TF + TI;
+				   }
+			      }
+			 }
+			 {
+			      E TM, Ti, TN, T25, TU, TR, Tl, TO;
+			      {
+				   E TS, TT, Tg, Th, Tj, Tk;
+				   Tg = Cr[WS(csr, 2)];
+				   Th = Cr[WS(csr, 13)];
+				   T1h = T1f - T1g;
+				   T1y = T1g + T1f;
+				   TK = TE + TJ;
+				   T1J = TE - TJ;
+				   TM = Tg - Th;
+				   Ti = Tg + Th;
+				   TS = Ci[WS(csi, 2)];
+				   TT = Ci[WS(csi, 13)];
+				   Tj = Cr[WS(csr, 10)];
+				   Tk = Cr[WS(csr, 5)];
+				   TN = Ci[WS(csi, 10)];
+				   T25 = TS - TT;
+				   TU = TS + TT;
+				   TR = Tj - Tk;
+				   Tl = Tj + Tk;
+				   TO = Ci[WS(csi, 5)];
+			      }
+			      {
+				   E T12, T13, Tq, Tr;
+				   {
+					E Tn, T1A, TV, T24, T26, TP, To, T27, T1B, TQ;
+					Tn = Cr[WS(csr, 1)];
+					T1A = TR - TU;
+					TV = TR + TU;
+					T24 = Ti - Tl;
+					Tm = Ti + Tl;
+					T26 = TN - TO;
+					TP = TN + TO;
+					To = Cr[WS(csr, 14)];
+					T12 = Ci[WS(csi, 1)];
+					T27 = T25 - T26;
+					T2B = T26 + T25;
+					T1B = TM + TP;
+					TQ = TM - TP;
+					TX = Tn - To;
+					Tp = Tn + To;
+					T2m = T24 + T27;
+					T28 = T24 - T27;
+					T1M = FNMS(KP414213562, T1A, T1B);
+					T1C = FMA(KP414213562, T1B, T1A);
+					T1k = FMA(KP414213562, TQ, TV);
+					TW = FNMS(KP414213562, TV, TQ);
+					T13 = Ci[WS(csi, 14)];
+				   }
+				   Tq = Cr[WS(csr, 6)];
+				   Tr = Cr[WS(csr, 9)];
+				   TY = Ci[WS(csi, 6)];
+				   T2a = T13 - T12;
+				   T14 = T12 + T13;
+				   T15 = Tq - Tr;
+				   Ts = Tq + Tr;
+				   TZ = Ci[WS(csi, 9)];
+			      }
+			 }
+		    }
+		    {
+			 E T1L, T1F, T23, T2n, T2k, T2e, T1p, T1t, T1s, T1i, T1o, T19, T1l, T1q;
+			 {
+			      E T2z, T2G, T2H, T2C, T1j, T17, T2r, T2s, T2u, T2v, T2K, T2D;
+			      {
+				   E T2L, T2d, T2l, T2O;
+				   {
+					E Tf, T2N, Tu, T2M;
+					{
+					     E T1D, T16, T29, Tt, T2b, T10;
+					     T2z = T7 - Te;
+					     Tf = T7 + Te;
+					     T1D = T15 + T14;
+					     T16 = T14 - T15;
+					     T29 = Tp - Ts;
+					     Tt = Tp + Ts;
+					     T2b = TY - TZ;
+					     T10 = TY + TZ;
+					     T2N = T2F + T2E;
+					     T2G = T2E - T2F;
+					     T2H = Tm - Tt;
+					     Tu = Tm + Tt;
+					     {
+						  E T2c, T2A, T1E, T11;
+						  T2c = T2a - T2b;
+						  T2A = T2b + T2a;
+						  T1E = TX + T10;
+						  T11 = TX - T10;
+						  T2L = Tf - Tu;
+						  T2d = T29 + T2c;
+						  T2l = T29 - T2c;
+						  T2C = T2A - T2B;
+						  T2M = T2B + T2A;
+						  T1L = FMA(KP414213562, T1D, T1E);
+						  T1F = FNMS(KP414213562, T1E, T1D);
+						  T1j = FMA(KP414213562, T11, T16);
+						  T17 = FNMS(KP414213562, T16, T11);
+						  T2O = T2M + T2N;
+					     }
+					}
+					R0[0] = KP2_000000000 * (Tf + Tu);
+					R0[WS(rs, 8)] = KP2_000000000 * (T2N - T2M);
+				   }
+				   T23 = T1Z + T22;
+				   T2r = T1Z - T22;
+				   R0[WS(rs, 12)] = KP1_414213562 * (T2O - T2L);
+				   R0[WS(rs, 4)] = KP1_414213562 * (T2L + T2O);
+				   T2s = T2m + T2l;
+				   T2n = T2l - T2m;
+				   T2k = T2i - T2j;
+				   T2u = T2j + T2i;
+				   T2v = T28 - T2d;
+				   T2e = T28 + T2d;
+			      }
+			      {
+				   E T2y, T2t, T2x, T2w;
+				   T2y = FMA(KP707106781, T2s, T2r);
+				   T2t = FNMS(KP707106781, T2s, T2r);
+				   T2x = FMA(KP707106781, T2v, T2u);
+				   T2w = FNMS(KP707106781, T2v, T2u);
+				   R0[WS(rs, 7)] = KP1_961570560 * (FMA(KP198912367, T2y, T2x));
+				   R0[WS(rs, 15)] = -(KP1_961570560 * (FNMS(KP198912367, T2x, T2y)));
+				   R0[WS(rs, 11)] = KP1_662939224 * (FNMS(KP668178637, T2t, T2w));
+				   R0[WS(rs, 3)] = KP1_662939224 * (FMA(KP668178637, T2w, T2t));
+				   T2K = T2z - T2C;
+				   T2D = T2z + T2C;
+			      }
+			      {
+				   E TL, T18, T2J, T2I;
+				   T1p = FNMS(KP707106781, TK, Tz);
+				   TL = FMA(KP707106781, TK, Tz);
+				   T18 = TW + T17;
+				   T1t = TW - T17;
+				   T1s = FMA(KP707106781, T1h, T1e);
+				   T1i = FNMS(KP707106781, T1h, T1e);
+				   T2J = T2H + T2G;
+				   T2I = T2G - T2H;
+				   T1o = FNMS(KP923879532, T18, TL);
+				   T19 = FMA(KP923879532, T18, TL);
+				   R0[WS(rs, 6)] = KP1_847759065 * (FMA(KP414213562, T2K, T2J));
+				   R0[WS(rs, 14)] = -(KP1_847759065 * (FNMS(KP414213562, T2J, T2K)));
+				   R0[WS(rs, 10)] = KP1_847759065 * (FNMS(KP414213562, T2D, T2I));
+				   R0[WS(rs, 2)] = KP1_847759065 * (FMA(KP414213562, T2I, T2D));
+				   T1l = T1j - T1k;
+				   T1q = T1k + T1j;
+			      }
+			 }
+			 {
+			      E T1z, T1U, T1Y, T1T, T1V, T1G;
+			      {
+				   E T1w, T1r, T1n, T1m;
+				   T1n = FMA(KP923879532, T1l, T1i);
+				   T1m = FNMS(KP923879532, T1l, T1i);
+				   T1w = FMA(KP923879532, T1q, T1p);
+				   T1r = FNMS(KP923879532, T1q, T1p);
+				   R1[WS(rs, 4)] = -(KP1_546020906 * (FNMS(KP820678790, T1o, T1n)));
+				   R1[WS(rs, 12)] = -(KP1_546020906 * (FMA(KP820678790, T1n, T1o)));
+				   R1[WS(rs, 8)] = -(KP1_990369453 * (FMA(KP098491403, T19, T1m)));
+				   R1[0] = KP1_990369453 * (FNMS(KP098491403, T1m, T19));
+				   {
+					E T1R, T1S, T1v, T1u;
+					T1z = FNMS(KP707106781, T1y, T1x);
+					T1R = FMA(KP707106781, T1y, T1x);
+					T1S = T1M + T1L;
+					T1N = T1L - T1M;
+					T1K = FNMS(KP707106781, T1J, T1I);
+					T1U = FMA(KP707106781, T1J, T1I);
+					T1v = FNMS(KP923879532, T1t, T1s);
+					T1u = FMA(KP923879532, T1t, T1s);
+					T1Y = FMA(KP923879532, T1S, T1R);
+					T1T = FNMS(KP923879532, T1S, T1R);
+					R1[WS(rs, 6)] = -(KP1_913880671 * (FNMS(KP303346683, T1w, T1v)));
+					R1[WS(rs, 14)] = -(KP1_913880671 * (FMA(KP303346683, T1v, T1w)));
+					R1[WS(rs, 10)] = -(KP1_763842528 * (FMA(KP534511135, T1r, T1u)));
+					R1[WS(rs, 2)] = KP1_763842528 * (FNMS(KP534511135, T1u, T1r));
+					T1V = T1C + T1F;
+					T1G = T1C - T1F;
+				   }
+			      }
+			      {
+				   E T2q, T2f, T1X, T1W, T2p, T2o;
+				   T1X = FMA(KP923879532, T1V, T1U);
+				   T1W = FNMS(KP923879532, T1V, T1U);
+				   T2q = FNMS(KP707106781, T2e, T23);
+				   T2f = FMA(KP707106781, T2e, T23);
+				   R1[WS(rs, 7)] = KP1_990369453 * (FMA(KP098491403, T1Y, T1X));
+				   R1[WS(rs, 15)] = -(KP1_990369453 * (FNMS(KP098491403, T1X, T1Y)));
+				   R1[WS(rs, 11)] = KP1_546020906 * (FNMS(KP820678790, T1T, T1W));
+				   R1[WS(rs, 3)] = KP1_546020906 * (FMA(KP820678790, T1W, T1T));
+				   T2p = FNMS(KP707106781, T2n, T2k);
+				   T2o = FMA(KP707106781, T2n, T2k);
+				   T1Q = FNMS(KP923879532, T1G, T1z);
+				   T1H = FMA(KP923879532, T1G, T1z);
+				   R0[WS(rs, 5)] = KP1_662939224 * (FMA(KP668178637, T2q, T2p));
+				   R0[WS(rs, 13)] = -(KP1_662939224 * (FNMS(KP668178637, T2p, T2q)));
+				   R0[WS(rs, 9)] = KP1_961570560 * (FNMS(KP198912367, T2f, T2o));
+				   R0[WS(rs, 1)] = KP1_961570560 * (FMA(KP198912367, T2o, T2f));
+			      }
+			 }
+		    }
+	       }
+	       T1O = FMA(KP923879532, T1N, T1K);
+	       T1P = FNMS(KP923879532, T1N, T1K);
+	       R1[WS(rs, 5)] = KP1_763842528 * (FMA(KP534511135, T1Q, T1P));
+	       R1[WS(rs, 13)] = -(KP1_763842528 * (FNMS(KP534511135, T1P, T1Q)));
+	       R1[WS(rs, 9)] = KP1_913880671 * (FNMS(KP303346683, T1H, T1O));
+	       R1[WS(rs, 1)] = KP1_913880671 * (FMA(KP303346683, T1O, T1H));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cbIII_32", {106, 32, 68, 0}, &GENUS };
+
+void X(codelet_r2cbIII_32) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cbIII_32 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 138 additions, 48 multiplications, 36 fused multiply/add),
+ * 66 stack variables, 19 constants, and 64 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP580569354, +0.580569354508924735272384751634790549382952557);
+     DK(KP942793473, +0.942793473651995297112775251810508755314920638);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP1_268786568, +1.268786568327290996430343226450986741351374190);
+     DK(KP196034280, +0.196034280659121203988391127777283691722273346);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T7, T2i, T2F, Tz, T1k, T1I, T1Z, T1x, Te, T22, T2E, T2j, T1f, T1y, TK;
+	       E T1J, Tm, T2B, TW, T1a, T1C, T1L, T28, T2l, Tt, T2A, T17, T1b, T1F, T1M;
+	       E T2d, T2m;
+	       {
+		    E T3, Tv, T1j, T2h, T6, T1g, Ty, T2g;
+		    {
+			 E T1, T2, T1h, T1i;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 15)];
+			 T3 = T1 + T2;
+			 Tv = T1 - T2;
+			 T1h = Ci[0];
+			 T1i = Ci[WS(csi, 15)];
+			 T1j = T1h + T1i;
+			 T2h = T1i - T1h;
+		    }
+		    {
+			 E T4, T5, Tw, Tx;
+			 T4 = Cr[WS(csr, 8)];
+			 T5 = Cr[WS(csr, 7)];
+			 T6 = T4 + T5;
+			 T1g = T4 - T5;
+			 Tw = Ci[WS(csi, 8)];
+			 Tx = Ci[WS(csi, 7)];
+			 Ty = Tw + Tx;
+			 T2g = Tw - Tx;
+		    }
+		    T7 = T3 + T6;
+		    T2i = T2g + T2h;
+		    T2F = T2h - T2g;
+		    Tz = Tv - Ty;
+		    T1k = T1g + T1j;
+		    T1I = T1g - T1j;
+		    T1Z = T3 - T6;
+		    T1x = Tv + Ty;
+	       }
+	       {
+		    E Ta, TA, TD, T21, Td, TF, TI, T20;
+		    {
+			 E T8, T9, TB, TC;
+			 T8 = Cr[WS(csr, 4)];
+			 T9 = Cr[WS(csr, 11)];
+			 Ta = T8 + T9;
+			 TA = T8 - T9;
+			 TB = Ci[WS(csi, 4)];
+			 TC = Ci[WS(csi, 11)];
+			 TD = TB + TC;
+			 T21 = TB - TC;
+		    }
+		    {
+			 E Tb, Tc, TG, TH;
+			 Tb = Cr[WS(csr, 3)];
+			 Tc = Cr[WS(csr, 12)];
+			 Td = Tb + Tc;
+			 TF = Tb - Tc;
+			 TG = Ci[WS(csi, 3)];
+			 TH = Ci[WS(csi, 12)];
+			 TI = TG + TH;
+			 T20 = TH - TG;
+		    }
+		    Te = Ta + Td;
+		    T22 = T20 - T21;
+		    T2E = T21 + T20;
+		    T2j = Ta - Td;
+		    {
+			 E T1d, T1e, TE, TJ;
+			 T1d = TA + TD;
+			 T1e = TF + TI;
+			 T1f = KP707106781 * (T1d - T1e);
+			 T1y = KP707106781 * (T1d + T1e);
+			 TE = TA - TD;
+			 TJ = TF - TI;
+			 TK = KP707106781 * (TE + TJ);
+			 T1J = KP707106781 * (TE - TJ);
+		    }
+	       }
+	       {
+		    E Ti, TM, TU, T25, Tl, TR, TP, T26, TQ, TV;
+		    {
+			 E Tg, Th, TS, TT;
+			 Tg = Cr[WS(csr, 2)];
+			 Th = Cr[WS(csr, 13)];
+			 Ti = Tg + Th;
+			 TM = Tg - Th;
+			 TS = Ci[WS(csi, 2)];
+			 TT = Ci[WS(csi, 13)];
+			 TU = TS + TT;
+			 T25 = TS - TT;
+		    }
+		    {
+			 E Tj, Tk, TN, TO;
+			 Tj = Cr[WS(csr, 10)];
+			 Tk = Cr[WS(csr, 5)];
+			 Tl = Tj + Tk;
+			 TR = Tj - Tk;
+			 TN = Ci[WS(csi, 10)];
+			 TO = Ci[WS(csi, 5)];
+			 TP = TN + TO;
+			 T26 = TN - TO;
+		    }
+		    Tm = Ti + Tl;
+		    T2B = T26 + T25;
+		    TQ = TM - TP;
+		    TV = TR + TU;
+		    TW = FNMS(KP382683432, TV, KP923879532 * TQ);
+		    T1a = FMA(KP382683432, TQ, KP923879532 * TV);
+		    {
+			 E T1A, T1B, T24, T27;
+			 T1A = TM + TP;
+			 T1B = TU - TR;
+			 T1C = FNMS(KP923879532, T1B, KP382683432 * T1A);
+			 T1L = FMA(KP923879532, T1A, KP382683432 * T1B);
+			 T24 = Ti - Tl;
+			 T27 = T25 - T26;
+			 T28 = T24 - T27;
+			 T2l = T24 + T27;
+		    }
+	       }
+	       {
+		    E Tp, TX, T15, T2a, Ts, T12, T10, T2b, T11, T16;
+		    {
+			 E Tn, To, T13, T14;
+			 Tn = Cr[WS(csr, 1)];
+			 To = Cr[WS(csr, 14)];
+			 Tp = Tn + To;
+			 TX = Tn - To;
+			 T13 = Ci[WS(csi, 1)];
+			 T14 = Ci[WS(csi, 14)];
+			 T15 = T13 + T14;
+			 T2a = T14 - T13;
+		    }
+		    {
+			 E Tq, Tr, TY, TZ;
+			 Tq = Cr[WS(csr, 6)];
+			 Tr = Cr[WS(csr, 9)];
+			 Ts = Tq + Tr;
+			 T12 = Tq - Tr;
+			 TY = Ci[WS(csi, 6)];
+			 TZ = Ci[WS(csi, 9)];
+			 T10 = TY + TZ;
+			 T2b = TY - TZ;
+		    }
+		    Tt = Tp + Ts;
+		    T2A = T2b + T2a;
+		    T11 = TX - T10;
+		    T16 = T12 - T15;
+		    T17 = FMA(KP923879532, T11, KP382683432 * T16);
+		    T1b = FNMS(KP382683432, T11, KP923879532 * T16);
+		    {
+			 E T1D, T1E, T29, T2c;
+			 T1D = TX + T10;
+			 T1E = T12 + T15;
+			 T1F = FNMS(KP923879532, T1E, KP382683432 * T1D);
+			 T1M = FMA(KP923879532, T1D, KP382683432 * T1E);
+			 T29 = Tp - Ts;
+			 T2c = T2a - T2b;
+			 T2d = T29 + T2c;
+			 T2m = T2c - T29;
+		    }
+	       }
+	       {
+		    E Tf, Tu, T2L, T2M, T2N, T2O;
+		    Tf = T7 + Te;
+		    Tu = Tm + Tt;
+		    T2L = Tf - Tu;
+		    T2M = T2B + T2A;
+		    T2N = T2F - T2E;
+		    T2O = T2M + T2N;
+		    R0[0] = KP2_000000000 * (Tf + Tu);
+		    R0[WS(rs, 8)] = KP2_000000000 * (T2N - T2M);
+		    R0[WS(rs, 4)] = KP1_414213562 * (T2L + T2O);
+		    R0[WS(rs, 12)] = KP1_414213562 * (T2O - T2L);
+	       }
+	       {
+		    E T2t, T2x, T2w, T2y;
+		    {
+			 E T2r, T2s, T2u, T2v;
+			 T2r = T1Z - T22;
+			 T2s = KP707106781 * (T2m - T2l);
+			 T2t = T2r + T2s;
+			 T2x = T2r - T2s;
+			 T2u = T2j + T2i;
+			 T2v = KP707106781 * (T28 - T2d);
+			 T2w = T2u - T2v;
+			 T2y = T2v + T2u;
+		    }
+		    R0[WS(rs, 3)] = FMA(KP1_662939224, T2t, KP1_111140466 * T2w);
+		    R0[WS(rs, 15)] = FNMS(KP1_961570560, T2x, KP390180644 * T2y);
+		    R0[WS(rs, 11)] = FNMS(KP1_111140466, T2t, KP1_662939224 * T2w);
+		    R0[WS(rs, 7)] = FMA(KP390180644, T2x, KP1_961570560 * T2y);
+	       }
+	       {
+		    E T2D, T2J, T2I, T2K;
+		    {
+			 E T2z, T2C, T2G, T2H;
+			 T2z = T7 - Te;
+			 T2C = T2A - T2B;
+			 T2D = T2z + T2C;
+			 T2J = T2z - T2C;
+			 T2G = T2E + T2F;
+			 T2H = Tm - Tt;
+			 T2I = T2G - T2H;
+			 T2K = T2H + T2G;
+		    }
+		    R0[WS(rs, 2)] = FMA(KP1_847759065, T2D, KP765366864 * T2I);
+		    R0[WS(rs, 14)] = FNMS(KP1_847759065, T2J, KP765366864 * T2K);
+		    R0[WS(rs, 10)] = FNMS(KP765366864, T2D, KP1_847759065 * T2I);
+		    R0[WS(rs, 6)] = FMA(KP765366864, T2J, KP1_847759065 * T2K);
+	       }
+	       {
+		    E T19, T1n, T1m, T1o;
+		    {
+			 E TL, T18, T1c, T1l;
+			 TL = Tz + TK;
+			 T18 = TW + T17;
+			 T19 = TL + T18;
+			 T1n = TL - T18;
+			 T1c = T1a + T1b;
+			 T1l = T1f + T1k;
+			 T1m = T1c + T1l;
+			 T1o = T1c - T1l;
+		    }
+		    R1[0] = FNMS(KP196034280, T1m, KP1_990369453 * T19);
+		    R1[WS(rs, 12)] = FNMS(KP1_546020906, T1n, KP1_268786568 * T1o);
+		    R1[WS(rs, 8)] = -(FMA(KP196034280, T19, KP1_990369453 * T1m));
+		    R1[WS(rs, 4)] = FMA(KP1_268786568, T1n, KP1_546020906 * T1o);
+	       }
+	       {
+		    E T1r, T1v, T1u, T1w;
+		    {
+			 E T1p, T1q, T1s, T1t;
+			 T1p = Tz - TK;
+			 T1q = T1b - T1a;
+			 T1r = T1p + T1q;
+			 T1v = T1p - T1q;
+			 T1s = T1f - T1k;
+			 T1t = TW - T17;
+			 T1u = T1s - T1t;
+			 T1w = T1t + T1s;
+		    }
+		    R1[WS(rs, 2)] = FMA(KP1_763842528, T1r, KP942793473 * T1u);
+		    R1[WS(rs, 14)] = FNMS(KP1_913880671, T1v, KP580569354 * T1w);
+		    R1[WS(rs, 10)] = FNMS(KP942793473, T1r, KP1_763842528 * T1u);
+		    R1[WS(rs, 6)] = FMA(KP580569354, T1v, KP1_913880671 * T1w);
+	       }
+	       {
+		    E T1T, T1X, T1W, T1Y;
+		    {
+			 E T1R, T1S, T1U, T1V;
+			 T1R = T1x + T1y;
+			 T1S = T1L + T1M;
+			 T1T = T1R - T1S;
+			 T1X = T1R + T1S;
+			 T1U = T1J + T1I;
+			 T1V = T1C - T1F;
+			 T1W = T1U - T1V;
+			 T1Y = T1V + T1U;
+		    }
+		    R1[WS(rs, 3)] = FMA(KP1_546020906, T1T, KP1_268786568 * T1W);
+		    R1[WS(rs, 15)] = FNMS(KP1_990369453, T1X, KP196034280 * T1Y);
+		    R1[WS(rs, 11)] = FNMS(KP1_268786568, T1T, KP1_546020906 * T1W);
+		    R1[WS(rs, 7)] = FMA(KP196034280, T1X, KP1_990369453 * T1Y);
+	       }
+	       {
+		    E T2f, T2p, T2o, T2q;
+		    {
+			 E T23, T2e, T2k, T2n;
+			 T23 = T1Z + T22;
+			 T2e = KP707106781 * (T28 + T2d);
+			 T2f = T23 + T2e;
+			 T2p = T23 - T2e;
+			 T2k = T2i - T2j;
+			 T2n = KP707106781 * (T2l + T2m);
+			 T2o = T2k - T2n;
+			 T2q = T2n + T2k;
+		    }
+		    R0[WS(rs, 1)] = FMA(KP1_961570560, T2f, KP390180644 * T2o);
+		    R0[WS(rs, 13)] = FNMS(KP1_662939224, T2p, KP1_111140466 * T2q);
+		    R0[WS(rs, 9)] = FNMS(KP390180644, T2f, KP1_961570560 * T2o);
+		    R0[WS(rs, 5)] = FMA(KP1_111140466, T2p, KP1_662939224 * T2q);
+	       }
+	       {
+		    E T1H, T1P, T1O, T1Q;
+		    {
+			 E T1z, T1G, T1K, T1N;
+			 T1z = T1x - T1y;
+			 T1G = T1C + T1F;
+			 T1H = T1z + T1G;
+			 T1P = T1z - T1G;
+			 T1K = T1I - T1J;
+			 T1N = T1L - T1M;
+			 T1O = T1K - T1N;
+			 T1Q = T1N + T1K;
+		    }
+		    R1[WS(rs, 1)] = FMA(KP1_913880671, T1H, KP580569354 * T1O);
+		    R1[WS(rs, 13)] = FNMS(KP1_763842528, T1P, KP942793473 * T1Q);
+		    R1[WS(rs, 9)] = FNMS(KP580569354, T1H, KP1_913880671 * T1O);
+		    R1[WS(rs, 5)] = FMA(KP942793473, T1P, KP1_763842528 * T1Q);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cbIII_32", {138, 48, 36, 0}, &GENUS };
+
+void X(codelet_r2cbIII_32) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:33 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -name r2cbIII_4 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 9 stack variables, 2 constants, and 8 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T2, T4, T5, T3, T6;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       T4 = Ci[0];
+	       T5 = Ci[WS(csi, 1)];
+	       R0[0] = KP2_000000000 * (T1 + T2);
+	       T3 = T1 - T2;
+	       R0[WS(rs, 1)] = KP2_000000000 * (T5 - T4);
+	       T6 = T4 + T5;
+	       R1[WS(rs, 1)] = -(KP1_414213562 * (T3 + T6));
+	       R1[0] = KP1_414213562 * (T3 - T6);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cbIII_4", {6, 4, 0, 0}, &GENUS };
+
+void X(codelet_r2cbIII_4) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -name r2cbIII_4 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 0 fused multiply/add),
+ * 9 stack variables, 2 constants, and 8 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T2, T3, T4, T5, T6;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       T3 = T1 - T2;
+	       T4 = Ci[0];
+	       T5 = Ci[WS(csi, 1)];
+	       T6 = T4 + T5;
+	       R0[0] = KP2_000000000 * (T1 + T2);
+	       R0[WS(rs, 1)] = KP2_000000000 * (T5 - T4);
+	       R1[0] = KP1_414213562 * (T3 - T6);
+	       R1[WS(rs, 1)] = -(KP1_414213562 * (T3 + T6));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cbIII_4", {6, 4, 0, 0}, &GENUS };
+
+void X(codelet_r2cbIII_4) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:33 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -name r2cbIII_5 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 12 FP additions, 10 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 10 fused multiply/add),
+ * 18 stack variables, 5 constants, and 10 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E T1, T2, T3, Tc, Ta, T8, T9;
+	       T8 = Ci[WS(csi, 1)];
+	       T9 = Ci[0];
+	       T1 = Cr[WS(csr, 2)];
+	       T2 = Cr[WS(csr, 1)];
+	       T3 = Cr[0];
+	       Tc = FMS(KP618033988, T8, T9);
+	       Ta = FMA(KP618033988, T9, T8);
+	       {
+		    E T6, T4, T5, T7, Tb;
+		    T6 = T3 - T2;
+		    T4 = T2 + T3;
+		    R0[0] = FMA(KP2_000000000, T4, T1);
+		    T5 = FNMS(KP500000000, T4, T1);
+		    T7 = FNMS(KP1_118033988, T6, T5);
+		    Tb = FMA(KP1_118033988, T6, T5);
+		    R0[WS(rs, 2)] = FNMS(KP1_902113032, Ta, T7);
+		    R1[0] = -(FMA(KP1_902113032, Ta, T7));
+		    R1[WS(rs, 1)] = FMS(KP1_902113032, Tc, Tb);
+		    R0[WS(rs, 1)] = FMA(KP1_902113032, Tc, Tb);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cbIII_5", {2, 0, 10, 0}, &GENUS };
+
+void X(codelet_r2cbIII_5) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -name r2cbIII_5 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 12 FP additions, 7 FP multiplications,
+ * (or, 8 additions, 3 multiplications, 4 fused multiply/add),
+ * 18 stack variables, 5 constants, and 10 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E Ta, Tc, T1, T4, T5, T6, Tb, T7;
+	       {
+		    E T8, T9, T2, T3;
+		    T8 = Ci[WS(csi, 1)];
+		    T9 = Ci[0];
+		    Ta = FMA(KP1_902113032, T8, KP1_175570504 * T9);
+		    Tc = FNMS(KP1_902113032, T9, KP1_175570504 * T8);
+		    T1 = Cr[WS(csr, 2)];
+		    T2 = Cr[WS(csr, 1)];
+		    T3 = Cr[0];
+		    T4 = T2 + T3;
+		    T5 = FMS(KP500000000, T4, T1);
+		    T6 = KP1_118033988 * (T3 - T2);
+	       }
+	       R0[0] = FMA(KP2_000000000, T4, T1);
+	       Tb = T6 - T5;
+	       R0[WS(rs, 1)] = Tb + Tc;
+	       R1[WS(rs, 1)] = Tc - Tb;
+	       T7 = T5 + T6;
+	       R1[0] = T7 - Ta;
+	       R0[WS(rs, 2)] = -(T7 + Ta);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cbIII_5", {8, 3, 4, 0}, &GENUS };
+
+void X(codelet_r2cbIII_5) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:33 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -name r2cbIII_6 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 12 FP additions, 8 FP multiplications,
+ * (or, 6 additions, 2 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T1, T8, T2, T3, T5, T6;
+	       T1 = Cr[WS(csr, 1)];
+	       T8 = Ci[WS(csi, 1)];
+	       T2 = Cr[WS(csr, 2)];
+	       T3 = Cr[0];
+	       T5 = Ci[WS(csi, 2)];
+	       T6 = Ci[0];
+	       {
+		    E T4, Ta, T7, Tc, Tb, T9;
+		    T4 = T2 + T3;
+		    Ta = T2 - T3;
+		    T7 = T5 + T6;
+		    Tc = T5 - T6;
+		    Tb = FNMS(KP2_000000000, T1, T4);
+		    R0[0] = KP2_000000000 * (T1 + T4);
+		    T9 = FMA(KP2_000000000, T8, T7);
+		    R1[WS(rs, 1)] = KP2_000000000 * (T8 - T7);
+		    R0[WS(rs, 2)] = FMS(KP1_732050807, Tc, Tb);
+		    R0[WS(rs, 1)] = FMA(KP1_732050807, Tc, Tb);
+		    R1[WS(rs, 2)] = FMS(KP1_732050807, Ta, T9);
+		    R1[0] = -(FMA(KP1_732050807, Ta, T9));
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cbIII_6", {6, 2, 6, 0}, &GENUS };
+
+void X(codelet_r2cbIII_6) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -name r2cbIII_6 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 12 FP additions, 6 FP multiplications,
+ * (or, 10 additions, 4 multiplications, 2 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T1, T6, T4, T5, T9, Tb, Ta, Tc;
+	       T1 = Cr[WS(csr, 1)];
+	       T6 = Ci[WS(csi, 1)];
+	       {
+		    E T2, T3, T7, T8;
+		    T2 = Cr[WS(csr, 2)];
+		    T3 = Cr[0];
+		    T4 = T2 + T3;
+		    T5 = KP1_732050807 * (T2 - T3);
+		    T7 = Ci[WS(csi, 2)];
+		    T8 = Ci[0];
+		    T9 = T7 + T8;
+		    Tb = KP1_732050807 * (T7 - T8);
+	       }
+	       R0[0] = KP2_000000000 * (T1 + T4);
+	       R1[WS(rs, 1)] = KP2_000000000 * (T6 - T9);
+	       Ta = FMA(KP2_000000000, T6, T9);
+	       R1[0] = -(T5 + Ta);
+	       R1[WS(rs, 2)] = T5 - Ta;
+	       Tc = FMS(KP2_000000000, T1, T4);
+	       R0[WS(rs, 1)] = Tb - Tc;
+	       R0[WS(rs, 2)] = Tc + Tb;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cbIII_6", {10, 4, 2, 0}, &GENUS };
+
+void X(codelet_r2cbIII_6) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1545 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:38 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -name r2cbIII_64 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 238 additions, 64 multiplications, 196 fused multiply/add),
+ * 165 stack variables, 36 constants, and 128 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DK(KP1_883088130, +1.883088130366041556825018805199004714371179592);
+     DK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DK(KP1_807978586, +1.807978586246886663172400594461074097420264050);
+     DK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DK(KP1_997590912, +1.997590912410344785429543209518201388886407229);
+     DK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DK(KP1_481902250, +1.481902250709918182351233794990325459457910619);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DK(KP1_940062506, +1.940062506389087985207968414572200502913731924);
+     DK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DK(KP1_715457220, +1.715457220000544139804539968569540274084981599);
+     DK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DK(KP1_978353019, +1.978353019929561946903347476032486127967379067);
+     DK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DK(KP1_606415062, +1.606415062961289819613353025926283847759138854);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T43, T4b, T49, T4e, T3T, T46, T40, T4a;
+	       {
+		    E T3t, T15, T2E, T3U, T6b, Tf, T6Q, T6u, T5J, T4L, T3V, T1g, T5U, T5q, T3u;
+		    E T2H, T6v, Tu, T5r, T4V, T6R, T6e, T2K, T1s, T2J, T1D, T3X, T3B, T5s, T4Q;
+		    E T3Y, T3y, T6g, TK, T5M, T57, T6N, T6j, T35, T1W, T34, T25, T4i, T3J, T5N;
+		    E T52, T4j, T3G, T6l, TZ, T3L, T5P, T5i, T6M, T6o, T3M, T38, T2n, T37, T2w;
+		    E T4l, T3Q, T5Q, T5d;
+		    {
+			 E T3x, T3w, T3E, T3F;
+			 {
+			      E T5p, T5o, T2G, T2F;
+			      {
+				   E T11, T3, T5m, T2D, T2A, T6, T5n, T14, Tb, T16, Ta, T4I, T19, Tc, T1c;
+				   E T1d;
+				   {
+					E T4, T5, T12, T13;
+					{
+					     E T1, T2, T2B, T2C;
+					     T1 = Cr[0];
+					     T2 = Cr[WS(csr, 31)];
+					     T2B = Ci[0];
+					     T2C = Ci[WS(csi, 31)];
+					     T4 = Cr[WS(csr, 16)];
+					     T11 = T1 - T2;
+					     T3 = T1 + T2;
+					     T5m = T2C - T2B;
+					     T2D = T2B + T2C;
+					     T5 = Cr[WS(csr, 15)];
+					     T12 = Ci[WS(csi, 16)];
+					     T13 = Ci[WS(csi, 15)];
+					}
+					{
+					     E T8, T9, T17, T18;
+					     T8 = Cr[WS(csr, 8)];
+					     T2A = T4 - T5;
+					     T6 = T4 + T5;
+					     T5n = T13 - T12;
+					     T14 = T12 + T13;
+					     T9 = Cr[WS(csr, 23)];
+					     T17 = Ci[WS(csi, 8)];
+					     T18 = Ci[WS(csi, 23)];
+					     Tb = Cr[WS(csr, 7)];
+					     T16 = T8 - T9;
+					     Ta = T8 + T9;
+					     T4I = T18 - T17;
+					     T19 = T17 + T18;
+					     Tc = Cr[WS(csr, 24)];
+					     T1c = Ci[WS(csi, 7)];
+					     T1d = Ci[WS(csi, 24)];
+					}
+				   }
+				   {
+					E T1b, T4J, T1e, T4H, T7, Te, Td;
+					T3t = T11 + T14;
+					T15 = T11 - T14;
+					T1b = Tb - Tc;
+					Td = Tb + Tc;
+					T4J = T1c - T1d;
+					T1e = T1c + T1d;
+					T2E = T2A + T2D;
+					T3U = T2A - T2D;
+					T4H = T3 - T6;
+					T7 = T3 + T6;
+					Te = Ta + Td;
+					T5p = Ta - Td;
+					{
+					     E T4K, T6s, T6t, T1a, T1f;
+					     T5o = T5m - T5n;
+					     T6s = T5n + T5m;
+					     T6t = T4I + T4J;
+					     T4K = T4I - T4J;
+					     T6b = T7 - Te;
+					     Tf = T7 + Te;
+					     T6Q = T6t + T6s;
+					     T6u = T6s - T6t;
+					     T2G = T16 + T19;
+					     T1a = T16 - T19;
+					     T1f = T1b - T1e;
+					     T2F = T1b + T1e;
+					     T5J = T4H - T4K;
+					     T4L = T4H + T4K;
+					     T3V = T1a - T1f;
+					     T1g = T1a + T1f;
+					}
+				   }
+			      }
+			      {
+				   E T1i, Ti, T4O, T1q, T1n, Tl, T4N, T1l, Tq, T1t, Tp, T4T, T1A, Tr, T1u;
+				   E T1v;
+				   {
+					E Tj, Tk, T1j, T1k;
+					{
+					     E Tg, Th, T1o, T1p;
+					     Tg = Cr[WS(csr, 4)];
+					     T5U = T5p + T5o;
+					     T5q = T5o - T5p;
+					     T3u = T2G + T2F;
+					     T2H = T2F - T2G;
+					     Th = Cr[WS(csr, 27)];
+					     T1o = Ci[WS(csi, 4)];
+					     T1p = Ci[WS(csi, 27)];
+					     Tj = Cr[WS(csr, 20)];
+					     T1i = Tg - Th;
+					     Ti = Tg + Th;
+					     T4O = T1p - T1o;
+					     T1q = T1o + T1p;
+					     Tk = Cr[WS(csr, 11)];
+					     T1j = Ci[WS(csi, 20)];
+					     T1k = Ci[WS(csi, 11)];
+					}
+					{
+					     E Tn, To, T1y, T1z;
+					     Tn = Cr[WS(csr, 3)];
+					     T1n = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T4N = T1k - T1j;
+					     T1l = T1j + T1k;
+					     To = Cr[WS(csr, 28)];
+					     T1y = Ci[WS(csi, 3)];
+					     T1z = Ci[WS(csi, 28)];
+					     Tq = Cr[WS(csr, 12)];
+					     T1t = Tn - To;
+					     Tp = Tn + To;
+					     T4T = T1y - T1z;
+					     T1A = T1y + T1z;
+					     Tr = Cr[WS(csr, 19)];
+					     T1u = Ci[WS(csi, 12)];
+					     T1v = Ci[WS(csi, 19)];
+					}
+				   }
+				   {
+					E T4M, T1B, T1w, T4P, T1m, T1r, Tm, Ts, T4S;
+					T4M = Ti - Tl;
+					Tm = Ti + Tl;
+					T1B = Tq - Tr;
+					Ts = Tq + Tr;
+					T4S = T1v - T1u;
+					T1w = T1u + T1v;
+					{
+					     E T6c, Tt, T4R, T6d, T4U;
+					     T6c = T4N + T4O;
+					     T4P = T4N - T4O;
+					     Tt = Tp + Ts;
+					     T4R = Tp - Ts;
+					     T6d = T4S + T4T;
+					     T4U = T4S - T4T;
+					     T3x = T1i + T1l;
+					     T1m = T1i - T1l;
+					     T6v = Tm - Tt;
+					     Tu = Tm + Tt;
+					     T5r = T4R - T4U;
+					     T4V = T4R + T4U;
+					     T6R = T6c + T6d;
+					     T6e = T6c - T6d;
+					     T1r = T1n + T1q;
+					     T3w = T1n - T1q;
+					}
+					{
+					     E T3A, T3z, T1x, T1C;
+					     T3A = T1t + T1w;
+					     T1x = T1t - T1w;
+					     T1C = T1A - T1B;
+					     T3z = T1B + T1A;
+					     T2K = FMA(KP414213562, T1m, T1r);
+					     T1s = FNMS(KP414213562, T1r, T1m);
+					     T2J = FMA(KP414213562, T1x, T1C);
+					     T1D = FNMS(KP414213562, T1C, T1x);
+					     T3X = FMA(KP414213562, T3z, T3A);
+					     T3B = FNMS(KP414213562, T3A, T3z);
+					     T5s = T4M + T4P;
+					     T4Q = T4M - T4P;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T1G, Ty, T54, T20, T1X, TB, T53, T1J, TI, T4Z, T1L, TF, T22, T1U, T50;
+			      E T1O;
+			      {
+				   E T1Y, T1Z, Tz, TA, Tw, Tx, T1H, T1I;
+				   Tw = Cr[WS(csr, 2)];
+				   Tx = Cr[WS(csr, 29)];
+				   T1Y = Ci[WS(csi, 2)];
+				   T3Y = FNMS(KP414213562, T3w, T3x);
+				   T3y = FMA(KP414213562, T3x, T3w);
+				   T1G = Tw - Tx;
+				   Ty = Tw + Tx;
+				   T1Z = Ci[WS(csi, 29)];
+				   Tz = Cr[WS(csr, 18)];
+				   TA = Cr[WS(csr, 13)];
+				   T1H = Ci[WS(csi, 18)];
+				   T54 = T1Y - T1Z;
+				   T20 = T1Y + T1Z;
+				   T1X = Tz - TA;
+				   TB = Tz + TA;
+				   T1I = Ci[WS(csi, 13)];
+				   {
+					E T1R, T1Q, T1S, TG, TH;
+					TG = Cr[WS(csr, 5)];
+					TH = Cr[WS(csr, 26)];
+					T1R = Ci[WS(csi, 5)];
+					T53 = T1H - T1I;
+					T1J = T1H + T1I;
+					T1Q = TG - TH;
+					TI = TG + TH;
+					T1S = Ci[WS(csi, 26)];
+					{
+					     E T1M, T1N, TD, TE, T1T;
+					     TD = Cr[WS(csr, 10)];
+					     TE = Cr[WS(csr, 21)];
+					     T1T = T1R + T1S;
+					     T4Z = T1S - T1R;
+					     T1M = Ci[WS(csi, 10)];
+					     T1L = TD - TE;
+					     TF = TD + TE;
+					     T1N = Ci[WS(csi, 21)];
+					     T22 = T1Q + T1T;
+					     T1U = T1Q - T1T;
+					     T50 = T1M - T1N;
+					     T1O = T1M + T1N;
+					}
+				   }
+			      }
+			      {
+				   E T4Y, T23, T51, T1K, T1V, T3I, T3H, T21, T24;
+				   {
+					E T56, T1P, T6h, T55, TC, TJ, T6i;
+					T4Y = Ty - TB;
+					TC = Ty + TB;
+					TJ = TF + TI;
+					T56 = TF - TI;
+					T1P = T1L - T1O;
+					T23 = T1L + T1O;
+					T6h = T53 + T54;
+					T55 = T53 - T54;
+					T6g = TC - TJ;
+					TK = TC + TJ;
+					T6i = T50 + T4Z;
+					T51 = T4Z - T50;
+					T3E = T1G + T1J;
+					T1K = T1G - T1J;
+					T5M = T56 + T55;
+					T57 = T55 - T56;
+					T6N = T6i + T6h;
+					T6j = T6h - T6i;
+					T1V = T1P + T1U;
+					T3I = T1P - T1U;
+				   }
+				   T3H = T1X - T20;
+				   T21 = T1X + T20;
+				   T24 = T22 - T23;
+				   T3F = T23 + T22;
+				   T35 = FNMS(KP707106781, T1V, T1K);
+				   T1W = FMA(KP707106781, T1V, T1K);
+				   T34 = FMA(KP707106781, T24, T21);
+				   T25 = FNMS(KP707106781, T24, T21);
+				   T4i = FMA(KP707106781, T3I, T3H);
+				   T3J = FNMS(KP707106781, T3I, T3H);
+				   T5N = T4Y - T51;
+				   T52 = T4Y + T51;
+			      }
+			 }
+			 {
+			      E T27, TN, T5f, T2q, T2r, TQ, T5e, T2a, TX, T5a, T2c, TU, T2t, T2l, T5b;
+			      E T2f;
+			      {
+				   E T2o, T2p, TO, TP, TL, TM, T28, T29;
+				   TL = Cr[WS(csr, 1)];
+				   TM = Cr[WS(csr, 30)];
+				   T2o = Ci[WS(csi, 1)];
+				   T4j = FMA(KP707106781, T3F, T3E);
+				   T3G = FNMS(KP707106781, T3F, T3E);
+				   T27 = TL - TM;
+				   TN = TL + TM;
+				   T2p = Ci[WS(csi, 30)];
+				   TO = Cr[WS(csr, 14)];
+				   TP = Cr[WS(csr, 17)];
+				   T28 = Ci[WS(csi, 14)];
+				   T5f = T2p - T2o;
+				   T2q = T2o + T2p;
+				   T2r = TO - TP;
+				   TQ = TO + TP;
+				   T29 = Ci[WS(csi, 17)];
+				   {
+					E T2i, T2h, T2j, TV, TW;
+					TV = Cr[WS(csr, 9)];
+					TW = Cr[WS(csr, 22)];
+					T2i = Ci[WS(csi, 9)];
+					T5e = T28 - T29;
+					T2a = T28 + T29;
+					T2h = TV - TW;
+					TX = TV + TW;
+					T2j = Ci[WS(csi, 22)];
+					{
+					     E T2d, T2e, TS, TT, T2k;
+					     TS = Cr[WS(csr, 6)];
+					     TT = Cr[WS(csr, 25)];
+					     T2k = T2i + T2j;
+					     T5a = T2j - T2i;
+					     T2d = Ci[WS(csi, 6)];
+					     T2c = TS - TT;
+					     TU = TS + TT;
+					     T2e = Ci[WS(csi, 25)];
+					     T2t = T2h + T2k;
+					     T2l = T2h - T2k;
+					     T5b = T2d - T2e;
+					     T2f = T2d + T2e;
+					}
+				   }
+			      }
+			      {
+				   E T59, T2u, T5c, T2b, T2m, T3P, T3O, T2s, T2v;
+				   {
+					E T5h, T2g, T6m, T5g, TR, TY, T6n;
+					T59 = TN - TQ;
+					TR = TN + TQ;
+					TY = TU + TX;
+					T5h = TU - TX;
+					T2g = T2c - T2f;
+					T2u = T2c + T2f;
+					T6m = T5e + T5f;
+					T5g = T5e - T5f;
+					T6l = TR - TY;
+					TZ = TR + TY;
+					T6n = T5b + T5a;
+					T5c = T5a - T5b;
+					T3L = T27 + T2a;
+					T2b = T27 - T2a;
+					T5P = T5h + T5g;
+					T5i = T5g - T5h;
+					T6M = T6n + T6m;
+					T6o = T6m - T6n;
+					T2m = T2g + T2l;
+					T3P = T2g - T2l;
+				   }
+				   T3O = T2r + T2q;
+				   T2s = T2q - T2r;
+				   T2v = T2t - T2u;
+				   T3M = T2u + T2t;
+				   T38 = FNMS(KP707106781, T2m, T2b);
+				   T2n = FMA(KP707106781, T2m, T2b);
+				   T37 = FNMS(KP707106781, T2v, T2s);
+				   T2w = FMA(KP707106781, T2v, T2s);
+				   T4l = FMA(KP707106781, T3P, T3O);
+				   T3Q = FNMS(KP707106781, T3P, T3O);
+				   T5Q = T59 - T5c;
+				   T5d = T59 + T5c;
+			      }
+			 }
+		    }
+		    {
+			 E T4m, T3N, T5t, T5L, T63, T4W, T5Y, T5X, T66, T5W, T67, T5S;
+			 {
+			      E T6T, T6S, T6W, T6P;
+			      {
+				   E T6L, T6O, T6Y, T6X, T6Z, Tv, T10, T70;
+				   T6L = Tf - Tu;
+				   Tv = Tf + Tu;
+				   T10 = TK + TZ;
+				   T6T = TK - TZ;
+				   T6O = T6M - T6N;
+				   T6Y = T6N + T6M;
+				   T4m = FMA(KP707106781, T3M, T3L);
+				   T3N = FNMS(KP707106781, T3M, T3L);
+				   T6X = Tv - T10;
+				   T6S = T6Q - T6R;
+				   T6Z = T6R + T6Q;
+				   R0[0] = KP2_000000000 * (Tv + T10);
+				   R0[WS(rs, 16)] = KP2_000000000 * (T6Z - T6Y);
+				   T70 = T6Y + T6Z;
+				   T6W = T6L - T6O;
+				   T6P = T6L + T6O;
+				   R0[WS(rs, 24)] = KP1_414213562 * (T70 - T6X);
+				   R0[WS(rs, 8)] = KP1_414213562 * (T6X + T70);
+			      }
+			      {
+				   E T6D, T6f, T6w, T6G, T6p, T6x, T6y, T6k, T6V, T6U;
+				   T6D = T6b - T6e;
+				   T6f = T6b + T6e;
+				   T6w = T6u - T6v;
+				   T6G = T6v + T6u;
+				   T6V = T6T + T6S;
+				   T6U = T6S - T6T;
+				   T6p = T6l + T6o;
+				   T6x = T6l - T6o;
+				   R0[WS(rs, 12)] = KP1_847759065 * (FMA(KP414213562, T6W, T6V));
+				   R0[WS(rs, 28)] = -(KP1_847759065 * (FNMS(KP414213562, T6V, T6W)));
+				   R0[WS(rs, 20)] = KP1_847759065 * (FNMS(KP414213562, T6P, T6U));
+				   R0[WS(rs, 4)] = KP1_847759065 * (FMA(KP414213562, T6U, T6P));
+				   T6y = T6g + T6j;
+				   T6k = T6g - T6j;
+				   {
+					E T5V, T5K, T5O, T5R;
+					T5t = T5r - T5s;
+					T5K = T5s + T5r;
+					{
+					     E T6E, T6z, T6H, T6q;
+					     T6E = T6y + T6x;
+					     T6z = T6x - T6y;
+					     T6H = T6k - T6p;
+					     T6q = T6k + T6p;
+					     {
+						  E T6F, T6K, T6B, T6A;
+						  T6F = FNMS(KP707106781, T6E, T6D);
+						  T6K = FMA(KP707106781, T6E, T6D);
+						  T6B = FNMS(KP707106781, T6z, T6w);
+						  T6A = FMA(KP707106781, T6z, T6w);
+						  {
+						       E T6I, T6J, T6C, T6r;
+						       T6I = FNMS(KP707106781, T6H, T6G);
+						       T6J = FMA(KP707106781, T6H, T6G);
+						       T6C = FNMS(KP707106781, T6q, T6f);
+						       T6r = FMA(KP707106781, T6q, T6f);
+						       R0[WS(rs, 22)] = KP1_662939224 * (FNMS(KP668178637, T6F, T6I));
+						       R0[WS(rs, 6)] = KP1_662939224 * (FMA(KP668178637, T6I, T6F));
+						       R0[WS(rs, 30)] = -(KP1_961570560 * (FNMS(KP198912367, T6J, T6K)));
+						       R0[WS(rs, 14)] = KP1_961570560 * (FMA(KP198912367, T6K, T6J));
+						       R0[WS(rs, 26)] = -(KP1_662939224 * (FNMS(KP668178637, T6B, T6C)));
+						       R0[WS(rs, 10)] = KP1_662939224 * (FMA(KP668178637, T6C, T6B));
+						       R0[WS(rs, 18)] = KP1_961570560 * (FNMS(KP198912367, T6r, T6A));
+						       R0[WS(rs, 2)] = KP1_961570560 * (FMA(KP198912367, T6A, T6r));
+						       T5L = FNMS(KP707106781, T5K, T5J);
+						       T63 = FMA(KP707106781, T5K, T5J);
+						  }
+					     }
+					}
+					T5V = T4Q - T4V;
+					T4W = T4Q + T4V;
+					T5Y = FNMS(KP414213562, T5M, T5N);
+					T5O = FMA(KP414213562, T5N, T5M);
+					T5R = FNMS(KP414213562, T5Q, T5P);
+					T5X = FMA(KP414213562, T5P, T5Q);
+					T66 = FMA(KP707106781, T5V, T5U);
+					T5W = FNMS(KP707106781, T5V, T5U);
+					T67 = T5O + T5R;
+					T5S = T5O - T5R;
+				   }
+			      }
+			 }
+			 {
+			      E T1h, T2L, T2I, T3h, T3p, T1E, T3n, T3s, T3b, T3k, T3e, T3o;
+			      {
+				   E T4X, T5B, T5v, T5w, T5E, T5u, T5F, T5k, T58, T5j;
+				   {
+					E T68, T69, T62, T5T, T64, T5Z;
+					T68 = FNMS(KP923879532, T67, T66);
+					T69 = FMA(KP923879532, T67, T66);
+					T62 = FNMS(KP923879532, T5S, T5L);
+					T5T = FMA(KP923879532, T5S, T5L);
+					T64 = T5Y + T5X;
+					T5Z = T5X - T5Y;
+					T4X = FMA(KP707106781, T4W, T4L);
+					T5B = FNMS(KP707106781, T4W, T4L);
+					{
+					     E T65, T6a, T61, T60;
+					     T65 = FNMS(KP923879532, T64, T63);
+					     T6a = FMA(KP923879532, T64, T63);
+					     T61 = FNMS(KP923879532, T5Z, T5W);
+					     T60 = FMA(KP923879532, T5Z, T5W);
+					     R0[WS(rs, 23)] = KP1_546020906 * (FNMS(KP820678790, T65, T68));
+					     R0[WS(rs, 7)] = KP1_546020906 * (FMA(KP820678790, T68, T65));
+					     R0[WS(rs, 31)] = -(KP1_990369453 * (FNMS(KP098491403, T69, T6a)));
+					     R0[WS(rs, 15)] = KP1_990369453 * (FMA(KP098491403, T6a, T69));
+					     R0[WS(rs, 27)] = -(KP1_763842528 * (FNMS(KP534511135, T61, T62)));
+					     R0[WS(rs, 11)] = KP1_763842528 * (FMA(KP534511135, T62, T61));
+					     R0[WS(rs, 19)] = KP1_913880671 * (FNMS(KP303346683, T5T, T60));
+					     R0[WS(rs, 3)] = KP1_913880671 * (FMA(KP303346683, T60, T5T));
+					}
+				   }
+				   T5v = FNMS(KP414213562, T52, T57);
+				   T58 = FMA(KP414213562, T57, T52);
+				   T5j = FNMS(KP414213562, T5i, T5d);
+				   T5w = FMA(KP414213562, T5d, T5i);
+				   T5E = FNMS(KP707106781, T5t, T5q);
+				   T5u = FMA(KP707106781, T5t, T5q);
+				   T5F = T58 - T5j;
+				   T5k = T58 + T5j;
+				   {
+					E T3l, T33, T3c, T3m, T3a, T3d;
+					{
+					     E T39, T3f, T3g, T36;
+					     {
+						  E T31, T5G, T5H, T5A, T5l, T5C, T5x, T32;
+						  T1h = FMA(KP707106781, T1g, T15);
+						  T31 = FNMS(KP707106781, T1g, T15);
+						  T5G = FNMS(KP923879532, T5F, T5E);
+						  T5H = FMA(KP923879532, T5F, T5E);
+						  T5A = FNMS(KP923879532, T5k, T4X);
+						  T5l = FMA(KP923879532, T5k, T4X);
+						  T5C = T5w - T5v;
+						  T5x = T5v + T5w;
+						  T32 = T2K + T2J;
+						  T2L = T2J - T2K;
+						  T39 = FNMS(KP668178637, T38, T37);
+						  T3f = FMA(KP668178637, T37, T38);
+						  {
+						       E T5D, T5I, T5z, T5y;
+						       T5D = FNMS(KP923879532, T5C, T5B);
+						       T5I = FMA(KP923879532, T5C, T5B);
+						       T5z = FNMS(KP923879532, T5x, T5u);
+						       T5y = FMA(KP923879532, T5x, T5u);
+						       T3l = FMA(KP923879532, T32, T31);
+						       T33 = FNMS(KP923879532, T32, T31);
+						       R0[WS(rs, 21)] = KP1_763842528 * (FNMS(KP534511135, T5D, T5G));
+						       R0[WS(rs, 5)] = KP1_763842528 * (FMA(KP534511135, T5G, T5D));
+						       R0[WS(rs, 29)] = -(KP1_913880671 * (FNMS(KP303346683, T5H, T5I)));
+						       R0[WS(rs, 13)] = KP1_913880671 * (FMA(KP303346683, T5I, T5H));
+						       R0[WS(rs, 25)] = -(KP1_546020906 * (FNMS(KP820678790, T5z, T5A)));
+						       R0[WS(rs, 9)] = KP1_546020906 * (FMA(KP820678790, T5A, T5z));
+						       R0[WS(rs, 17)] = KP1_990369453 * (FNMS(KP098491403, T5l, T5y));
+						       R0[WS(rs, 1)] = KP1_990369453 * (FMA(KP098491403, T5y, T5l));
+						       T3g = FMA(KP668178637, T34, T35);
+						       T36 = FNMS(KP668178637, T35, T34);
+						  }
+					     }
+					     T2I = FNMS(KP707106781, T2H, T2E);
+					     T3c = FMA(KP707106781, T2H, T2E);
+					     T3m = T3g + T3f;
+					     T3h = T3f - T3g;
+					     T3p = T39 - T36;
+					     T3a = T36 + T39;
+					     T3d = T1s - T1D;
+					     T1E = T1s + T1D;
+					}
+					T3n = FNMS(KP831469612, T3m, T3l);
+					T3s = FMA(KP831469612, T3m, T3l);
+					T3b = FNMS(KP831469612, T3a, T33);
+					T3k = FMA(KP831469612, T3a, T33);
+					T3e = FMA(KP923879532, T3d, T3c);
+					T3o = FNMS(KP923879532, T3d, T3c);
+				   }
+			      }
+			      {
+				   E T3v, T3Z, T3W, T4v, T4D, T3C, T4B, T4G, T4p, T4y, T4s, T4C;
+				   {
+					E T4z, T4h, T4q, T4A, T4o, T4r;
+					{
+					     E T4n, T4t, T4u, T4k, T4f, T4g;
+					     T3v = FNMS(KP707106781, T3u, T3t);
+					     T4f = FMA(KP707106781, T3u, T3t);
+					     T4g = T3Y + T3X;
+					     T3Z = T3X - T3Y;
+					     {
+						  E T3r, T3q, T3i, T3j;
+						  T3r = FNMS(KP831469612, T3p, T3o);
+						  T3q = FMA(KP831469612, T3p, T3o);
+						  T3i = FNMS(KP831469612, T3h, T3e);
+						  T3j = FMA(KP831469612, T3h, T3e);
+						  R1[WS(rs, 22)] = -(KP1_606415062 * (FMA(KP741650546, T3n, T3q)));
+						  R1[WS(rs, 6)] = KP1_606415062 * (FNMS(KP741650546, T3q, T3n));
+						  R1[WS(rs, 30)] = -(KP1_978353019 * (FMA(KP148335987, T3r, T3s)));
+						  R1[WS(rs, 14)] = -(KP1_978353019 * (FNMS(KP148335987, T3s, T3r)));
+						  R1[WS(rs, 26)] = -(KP1_715457220 * (FMA(KP599376933, T3j, T3k)));
+						  R1[WS(rs, 10)] = -(KP1_715457220 * (FNMS(KP599376933, T3k, T3j)));
+						  R1[WS(rs, 18)] = -(KP1_940062506 * (FMA(KP250486960, T3b, T3i)));
+						  R1[WS(rs, 2)] = KP1_940062506 * (FNMS(KP250486960, T3i, T3b));
+						  T4z = FMA(KP923879532, T4g, T4f);
+						  T4h = FNMS(KP923879532, T4g, T4f);
+					     }
+					     T4n = FNMS(KP198912367, T4m, T4l);
+					     T4t = FMA(KP198912367, T4l, T4m);
+					     T4u = FNMS(KP198912367, T4i, T4j);
+					     T4k = FMA(KP198912367, T4j, T4i);
+					     T3W = FNMS(KP707106781, T3V, T3U);
+					     T4q = FMA(KP707106781, T3V, T3U);
+					     T4A = T4u + T4t;
+					     T4v = T4t - T4u;
+					     T4D = T4k + T4n;
+					     T4o = T4k - T4n;
+					     T4r = T3y + T3B;
+					     T3C = T3y - T3B;
+					}
+					T4B = FNMS(KP980785280, T4A, T4z);
+					T4G = FMA(KP980785280, T4A, T4z);
+					T4p = FMA(KP980785280, T4o, T4h);
+					T4y = FNMS(KP980785280, T4o, T4h);
+					T4s = FNMS(KP923879532, T4r, T4q);
+					T4C = FMA(KP923879532, T4r, T4q);
+				   }
+				   {
+					E T2P, T2X, T2V, T30, T2z, T2S, T2M, T2W;
+					{
+					     E T2T, T1F, T2U, T2y;
+					     {
+						  E T2x, T2N, T2O, T26;
+						  {
+						       E T4F, T4E, T4w, T4x;
+						       T4F = FMA(KP980785280, T4D, T4C);
+						       T4E = FNMS(KP980785280, T4D, T4C);
+						       T4w = FMA(KP980785280, T4v, T4s);
+						       T4x = FNMS(KP980785280, T4v, T4s);
+						       R1[WS(rs, 23)] = KP1_481902250 * (FNMS(KP906347169, T4B, T4E));
+						       R1[WS(rs, 7)] = KP1_481902250 * (FMA(KP906347169, T4E, T4B));
+						       R1[WS(rs, 31)] = -(KP1_997590912 * (FNMS(KP049126849, T4F, T4G)));
+						       R1[WS(rs, 15)] = KP1_997590912 * (FMA(KP049126849, T4G, T4F));
+						       R1[WS(rs, 27)] = -(KP1_807978586 * (FNMS(KP472964775, T4x, T4y)));
+						       R1[WS(rs, 11)] = KP1_807978586 * (FMA(KP472964775, T4y, T4x));
+						       R1[WS(rs, 19)] = KP1_883088130 * (FNMS(KP357805721, T4p, T4w));
+						       R1[WS(rs, 3)] = KP1_883088130 * (FMA(KP357805721, T4w, T4p));
+						       T2T = FNMS(KP923879532, T1E, T1h);
+						       T1F = FMA(KP923879532, T1E, T1h);
+						  }
+						  T2x = FNMS(KP198912367, T2w, T2n);
+						  T2N = FMA(KP198912367, T2n, T2w);
+						  T2O = FMA(KP198912367, T1W, T25);
+						  T26 = FNMS(KP198912367, T25, T1W);
+						  T2U = T2O + T2N;
+						  T2P = T2N - T2O;
+						  T2X = T26 - T2x;
+						  T2y = T26 + T2x;
+					     }
+					     T2V = FNMS(KP980785280, T2U, T2T);
+					     T30 = FMA(KP980785280, T2U, T2T);
+					     T2z = FMA(KP980785280, T2y, T1F);
+					     T2S = FNMS(KP980785280, T2y, T1F);
+					     T2M = FNMS(KP923879532, T2L, T2I);
+					     T2W = FMA(KP923879532, T2L, T2I);
+					}
+					{
+					     E T47, T3D, T48, T3S;
+					     {
+						  E T3K, T41, T42, T3R;
+						  {
+						       E T2Z, T2Y, T2Q, T2R;
+						       T2Z = FNMS(KP980785280, T2X, T2W);
+						       T2Y = FMA(KP980785280, T2X, T2W);
+						       T2Q = FNMS(KP980785280, T2P, T2M);
+						       T2R = FMA(KP980785280, T2P, T2M);
+						       R1[WS(rs, 20)] = -(KP1_807978586 * (FMA(KP472964775, T2V, T2Y)));
+						       R1[WS(rs, 4)] = KP1_807978586 * (FNMS(KP472964775, T2Y, T2V));
+						       R1[WS(rs, 28)] = -(KP1_883088130 * (FMA(KP357805721, T2Z, T30)));
+						       R1[WS(rs, 12)] = -(KP1_883088130 * (FNMS(KP357805721, T30, T2Z)));
+						       R1[WS(rs, 24)] = -(KP1_481902250 * (FMA(KP906347169, T2R, T2S)));
+						       R1[WS(rs, 8)] = -(KP1_481902250 * (FNMS(KP906347169, T2S, T2R)));
+						       R1[WS(rs, 16)] = -(KP1_997590912 * (FMA(KP049126849, T2z, T2Q)));
+						       R1[0] = KP1_997590912 * (FNMS(KP049126849, T2Q, T2z));
+						       T47 = FNMS(KP923879532, T3C, T3v);
+						       T3D = FMA(KP923879532, T3C, T3v);
+						  }
+						  T3K = FMA(KP668178637, T3J, T3G);
+						  T41 = FNMS(KP668178637, T3G, T3J);
+						  T42 = FMA(KP668178637, T3N, T3Q);
+						  T3R = FNMS(KP668178637, T3Q, T3N);
+						  T48 = T42 - T41;
+						  T43 = T41 + T42;
+						  T4b = T3K - T3R;
+						  T3S = T3K + T3R;
+					     }
+					     T49 = FNMS(KP831469612, T48, T47);
+					     T4e = FMA(KP831469612, T48, T47);
+					     T3T = FMA(KP831469612, T3S, T3D);
+					     T46 = FNMS(KP831469612, T3S, T3D);
+					     T40 = FMA(KP923879532, T3Z, T3W);
+					     T4a = FNMS(KP923879532, T3Z, T3W);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T4d, T4c, T44, T45;
+		    T4d = FMA(KP831469612, T4b, T4a);
+		    T4c = FNMS(KP831469612, T4b, T4a);
+		    T44 = FMA(KP831469612, T43, T40);
+		    T45 = FNMS(KP831469612, T43, T40);
+		    R1[WS(rs, 21)] = KP1_715457220 * (FNMS(KP599376933, T49, T4c));
+		    R1[WS(rs, 5)] = KP1_715457220 * (FMA(KP599376933, T4c, T49));
+		    R1[WS(rs, 29)] = -(KP1_940062506 * (FNMS(KP250486960, T4d, T4e)));
+		    R1[WS(rs, 13)] = KP1_940062506 * (FMA(KP250486960, T4e, T4d));
+		    R1[WS(rs, 25)] = -(KP1_606415062 * (FNMS(KP741650546, T45, T46)));
+		    R1[WS(rs, 9)] = KP1_606415062 * (FMA(KP741650546, T46, T45));
+		    R1[WS(rs, 17)] = KP1_978353019 * (FNMS(KP148335987, T3T, T44));
+		    R1[WS(rs, 1)] = KP1_978353019 * (FMA(KP148335987, T44, T3T));
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cbIII_64", {238, 64, 196, 0}, &GENUS };
+
+void X(codelet_r2cbIII_64) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -name r2cbIII_64 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 342 additions, 116 multiplications, 92 fused multiply/add),
+ * 130 stack variables, 39 constants, and 128 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_343117909, +1.343117909694036801250753700854843606457501264);
+     DK(KP1_481902250, +1.481902250709918182351233794990325459457910619);
+     DK(KP1_807978586, +1.807978586246886663172400594461074097420264050);
+     DK(KP855110186, +0.855110186860564188641933713777597068609157259);
+     DK(KP1_997590912, +1.997590912410344785429543209518201388886407229);
+     DK(KP098135348, +0.098135348654836028509909953885365316629490726);
+     DK(KP673779706, +0.673779706784440101378506425238295140955533559);
+     DK(KP1_883088130, +1.883088130366041556825018805199004714371179592);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP1_191398608, +1.191398608984866686934073057659939779023852677);
+     DK(KP1_606415062, +1.606415062961289819613353025926283847759138854);
+     DK(KP1_715457220, +1.715457220000544139804539968569540274084981599);
+     DK(KP1_028205488, +1.028205488386443453187387677937631545216098241);
+     DK(KP1_978353019, +1.978353019929561946903347476032486127967379067);
+     DK(KP293460948, +0.293460948910723503317700259293435639412430633);
+     DK(KP485960359, +0.485960359806527779896548324154942236641981567);
+     DK(KP1_940062506, +1.940062506389087985207968414572200502913731924);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP1_268786568, +1.268786568327290996430343226450986741351374190);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP942793473, +0.942793473651995297112775251810508755314920638);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP196034280, +0.196034280659121203988391127777283691722273346);
+     DK(KP580569354, +0.580569354508924735272384751634790549382952557);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T15, T3t, T3U, T2N, Tf, T6b, T6u, T6R, T4L, T5J, T1g, T3V, T5q, T5U, T2I;
+	       E T3u, Tu, T6v, T4V, T5s, T6e, T6Q, T1s, T2D, T1D, T2E, T3B, T3Y, T4Q, T5r;
+	       E T3y, T3X, TK, T6g, T57, T5N, T6j, T6N, T1W, T34, T25, T35, T3J, T4j, T52;
+	       E T5M, T3G, T4i, TZ, T6l, T5i, T5Q, T6o, T6M, T2n, T37, T2w, T38, T3Q, T4m;
+	       E T5d, T5P, T3N, T4l;
+	       {
+		    E T3, T11, T2M, T5n, T6, T2J, T14, T5m, Ta, T16, T19, T4J, Td, T1b, T1e;
+		    E T4I;
+		    {
+			 E T1, T2, T2K, T2L;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 31)];
+			 T3 = T1 + T2;
+			 T11 = T1 - T2;
+			 T2K = Ci[0];
+			 T2L = Ci[WS(csi, 31)];
+			 T2M = T2K + T2L;
+			 T5n = T2L - T2K;
+		    }
+		    {
+			 E T4, T5, T12, T13;
+			 T4 = Cr[WS(csr, 16)];
+			 T5 = Cr[WS(csr, 15)];
+			 T6 = T4 + T5;
+			 T2J = T4 - T5;
+			 T12 = Ci[WS(csi, 16)];
+			 T13 = Ci[WS(csi, 15)];
+			 T14 = T12 + T13;
+			 T5m = T12 - T13;
+		    }
+		    {
+			 E T8, T9, T17, T18;
+			 T8 = Cr[WS(csr, 8)];
+			 T9 = Cr[WS(csr, 23)];
+			 Ta = T8 + T9;
+			 T16 = T8 - T9;
+			 T17 = Ci[WS(csi, 8)];
+			 T18 = Ci[WS(csi, 23)];
+			 T19 = T17 + T18;
+			 T4J = T17 - T18;
+		    }
+		    {
+			 E Tb, Tc, T1c, T1d;
+			 Tb = Cr[WS(csr, 7)];
+			 Tc = Cr[WS(csr, 24)];
+			 Td = Tb + Tc;
+			 T1b = Tb - Tc;
+			 T1c = Ci[WS(csi, 7)];
+			 T1d = Ci[WS(csi, 24)];
+			 T1e = T1c + T1d;
+			 T4I = T1d - T1c;
+		    }
+		    {
+			 E T7, Te, T1a, T1f;
+			 T15 = T11 - T14;
+			 T3t = T11 + T14;
+			 T3U = T2J - T2M;
+			 T2N = T2J + T2M;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 T6b = T7 - Te;
+			 {
+			      E T6s, T6t, T4H, T4K;
+			      T6s = T4J + T4I;
+			      T6t = T5n - T5m;
+			      T6u = T6s + T6t;
+			      T6R = T6t - T6s;
+			      T4H = T3 - T6;
+			      T4K = T4I - T4J;
+			      T4L = T4H + T4K;
+			      T5J = T4H - T4K;
+			 }
+			 T1a = T16 - T19;
+			 T1f = T1b - T1e;
+			 T1g = KP707106781 * (T1a + T1f);
+			 T3V = KP707106781 * (T1a - T1f);
+			 {
+			      E T5o, T5p, T2G, T2H;
+			      T5o = T5m + T5n;
+			      T5p = Ta - Td;
+			      T5q = T5o - T5p;
+			      T5U = T5p + T5o;
+			      T2G = T16 + T19;
+			      T2H = T1b + T1e;
+			      T2I = KP707106781 * (T2G - T2H);
+			      T3u = KP707106781 * (T2G + T2H);
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T1i, T1q, T4N, Tl, T1n, T1l, T4O, Tp, T1t, T1B, T4S, Ts, T1y, T1w;
+		    E T4T;
+		    {
+			 E Tg, Th, T1o, T1p;
+			 Tg = Cr[WS(csr, 4)];
+			 Th = Cr[WS(csr, 27)];
+			 Ti = Tg + Th;
+			 T1i = Tg - Th;
+			 T1o = Ci[WS(csi, 4)];
+			 T1p = Ci[WS(csi, 27)];
+			 T1q = T1o + T1p;
+			 T4N = T1o - T1p;
+		    }
+		    {
+			 E Tj, Tk, T1j, T1k;
+			 Tj = Cr[WS(csr, 20)];
+			 Tk = Cr[WS(csr, 11)];
+			 Tl = Tj + Tk;
+			 T1n = Tj - Tk;
+			 T1j = Ci[WS(csi, 20)];
+			 T1k = Ci[WS(csi, 11)];
+			 T1l = T1j + T1k;
+			 T4O = T1j - T1k;
+		    }
+		    {
+			 E Tn, To, T1z, T1A;
+			 Tn = Cr[WS(csr, 3)];
+			 To = Cr[WS(csr, 28)];
+			 Tp = Tn + To;
+			 T1t = Tn - To;
+			 T1z = Ci[WS(csi, 3)];
+			 T1A = Ci[WS(csi, 28)];
+			 T1B = T1z + T1A;
+			 T4S = T1A - T1z;
+		    }
+		    {
+			 E Tq, Tr, T1u, T1v;
+			 Tq = Cr[WS(csr, 12)];
+			 Tr = Cr[WS(csr, 19)];
+			 Ts = Tq + Tr;
+			 T1y = Tq - Tr;
+			 T1u = Ci[WS(csi, 12)];
+			 T1v = Ci[WS(csi, 19)];
+			 T1w = T1u + T1v;
+			 T4T = T1u - T1v;
+		    }
+		    {
+			 E Tm, Tt, T4R, T4U;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 T6v = Tm - Tt;
+			 T4R = Tp - Ts;
+			 T4U = T4S - T4T;
+			 T4V = T4R + T4U;
+			 T5s = T4U - T4R;
+		    }
+		    {
+			 E T6c, T6d, T1m, T1r;
+			 T6c = T4T + T4S;
+			 T6d = T4O + T4N;
+			 T6e = T6c - T6d;
+			 T6Q = T6d + T6c;
+			 T1m = T1i - T1l;
+			 T1r = T1n + T1q;
+			 T1s = FNMS(KP382683432, T1r, KP923879532 * T1m);
+			 T2D = FMA(KP382683432, T1m, KP923879532 * T1r);
+		    }
+		    {
+			 E T1x, T1C, T3z, T3A;
+			 T1x = T1t - T1w;
+			 T1C = T1y - T1B;
+			 T1D = FMA(KP923879532, T1x, KP382683432 * T1C);
+			 T2E = FNMS(KP382683432, T1x, KP923879532 * T1C);
+			 T3z = T1t + T1w;
+			 T3A = T1y + T1B;
+			 T3B = FNMS(KP923879532, T3A, KP382683432 * T3z);
+			 T3Y = FMA(KP923879532, T3z, KP382683432 * T3A);
+		    }
+		    {
+			 E T4M, T4P, T3w, T3x;
+			 T4M = Ti - Tl;
+			 T4P = T4N - T4O;
+			 T4Q = T4M - T4P;
+			 T5r = T4M + T4P;
+			 T3w = T1i + T1l;
+			 T3x = T1q - T1n;
+			 T3y = FNMS(KP923879532, T3x, KP382683432 * T3w);
+			 T3X = FMA(KP923879532, T3w, KP382683432 * T3x);
+		    }
+	       }
+	       {
+		    E Ty, T1G, T23, T54, TB, T20, T1J, T55, TI, T4Z, T1U, T1Y, TF, T50, T1P;
+		    E T1X;
+		    {
+			 E Tw, Tx, T1H, T1I;
+			 Tw = Cr[WS(csr, 2)];
+			 Tx = Cr[WS(csr, 29)];
+			 Ty = Tw + Tx;
+			 T1G = Tw - Tx;
+			 {
+			      E T21, T22, Tz, TA;
+			      T21 = Ci[WS(csi, 2)];
+			      T22 = Ci[WS(csi, 29)];
+			      T23 = T21 + T22;
+			      T54 = T21 - T22;
+			      Tz = Cr[WS(csr, 18)];
+			      TA = Cr[WS(csr, 13)];
+			      TB = Tz + TA;
+			      T20 = Tz - TA;
+			 }
+			 T1H = Ci[WS(csi, 18)];
+			 T1I = Ci[WS(csi, 13)];
+			 T1J = T1H + T1I;
+			 T55 = T1H - T1I;
+			 {
+			      E TG, TH, T1Q, T1R, T1S, T1T;
+			      TG = Cr[WS(csr, 5)];
+			      TH = Cr[WS(csr, 26)];
+			      T1Q = TG - TH;
+			      T1R = Ci[WS(csi, 5)];
+			      T1S = Ci[WS(csi, 26)];
+			      T1T = T1R + T1S;
+			      TI = TG + TH;
+			      T4Z = T1S - T1R;
+			      T1U = T1Q - T1T;
+			      T1Y = T1Q + T1T;
+			 }
+			 {
+			      E TD, TE, T1L, T1M, T1N, T1O;
+			      TD = Cr[WS(csr, 10)];
+			      TE = Cr[WS(csr, 21)];
+			      T1L = TD - TE;
+			      T1M = Ci[WS(csi, 10)];
+			      T1N = Ci[WS(csi, 21)];
+			      T1O = T1M + T1N;
+			      TF = TD + TE;
+			      T50 = T1M - T1N;
+			      T1P = T1L - T1O;
+			      T1X = T1L + T1O;
+			 }
+		    }
+		    {
+			 E TC, TJ, T53, T56;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T6g = TC - TJ;
+			 T53 = TF - TI;
+			 T56 = T54 - T55;
+			 T57 = T53 + T56;
+			 T5N = T56 - T53;
+		    }
+		    {
+			 E T6h, T6i, T1K, T1V;
+			 T6h = T55 + T54;
+			 T6i = T50 + T4Z;
+			 T6j = T6h - T6i;
+			 T6N = T6i + T6h;
+			 T1K = T1G - T1J;
+			 T1V = KP707106781 * (T1P + T1U);
+			 T1W = T1K + T1V;
+			 T34 = T1K - T1V;
+		    }
+		    {
+			 E T1Z, T24, T3H, T3I;
+			 T1Z = KP707106781 * (T1X - T1Y);
+			 T24 = T20 + T23;
+			 T25 = T1Z + T24;
+			 T35 = T24 - T1Z;
+			 T3H = KP707106781 * (T1P - T1U);
+			 T3I = T23 - T20;
+			 T3J = T3H + T3I;
+			 T4j = T3I - T3H;
+		    }
+		    {
+			 E T4Y, T51, T3E, T3F;
+			 T4Y = Ty - TB;
+			 T51 = T4Z - T50;
+			 T52 = T4Y + T51;
+			 T5M = T4Y - T51;
+			 T3E = T1G + T1J;
+			 T3F = KP707106781 * (T1X + T1Y);
+			 T3G = T3E - T3F;
+			 T4i = T3E + T3F;
+		    }
+	       }
+	       {
+		    E TN, T27, T2u, T5f, TQ, T2r, T2a, T5g, TX, T5a, T2l, T2p, TU, T5b, T2g;
+		    E T2o;
+		    {
+			 E TL, TM, T28, T29;
+			 TL = Cr[WS(csr, 1)];
+			 TM = Cr[WS(csr, 30)];
+			 TN = TL + TM;
+			 T27 = TL - TM;
+			 {
+			      E T2s, T2t, TO, TP;
+			      T2s = Ci[WS(csi, 1)];
+			      T2t = Ci[WS(csi, 30)];
+			      T2u = T2s + T2t;
+			      T5f = T2t - T2s;
+			      TO = Cr[WS(csr, 14)];
+			      TP = Cr[WS(csr, 17)];
+			      TQ = TO + TP;
+			      T2r = TO - TP;
+			 }
+			 T28 = Ci[WS(csi, 14)];
+			 T29 = Ci[WS(csi, 17)];
+			 T2a = T28 + T29;
+			 T5g = T28 - T29;
+			 {
+			      E TV, TW, T2h, T2i, T2j, T2k;
+			      TV = Cr[WS(csr, 9)];
+			      TW = Cr[WS(csr, 22)];
+			      T2h = TV - TW;
+			      T2i = Ci[WS(csi, 9)];
+			      T2j = Ci[WS(csi, 22)];
+			      T2k = T2i + T2j;
+			      TX = TV + TW;
+			      T5a = T2j - T2i;
+			      T2l = T2h - T2k;
+			      T2p = T2h + T2k;
+			 }
+			 {
+			      E TS, TT, T2c, T2d, T2e, T2f;
+			      TS = Cr[WS(csr, 6)];
+			      TT = Cr[WS(csr, 25)];
+			      T2c = TS - TT;
+			      T2d = Ci[WS(csi, 6)];
+			      T2e = Ci[WS(csi, 25)];
+			      T2f = T2d + T2e;
+			      TU = TS + TT;
+			      T5b = T2d - T2e;
+			      T2g = T2c - T2f;
+			      T2o = T2c + T2f;
+			 }
+		    }
+		    {
+			 E TR, TY, T5e, T5h;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 T6l = TR - TY;
+			 T5e = TU - TX;
+			 T5h = T5f - T5g;
+			 T5i = T5e + T5h;
+			 T5Q = T5h - T5e;
+		    }
+		    {
+			 E T6m, T6n, T2b, T2m;
+			 T6m = T5g + T5f;
+			 T6n = T5b + T5a;
+			 T6o = T6m - T6n;
+			 T6M = T6n + T6m;
+			 T2b = T27 - T2a;
+			 T2m = KP707106781 * (T2g + T2l);
+			 T2n = T2b + T2m;
+			 T37 = T2b - T2m;
+		    }
+		    {
+			 E T2q, T2v, T3O, T3P;
+			 T2q = KP707106781 * (T2o - T2p);
+			 T2v = T2r - T2u;
+			 T2w = T2q + T2v;
+			 T38 = T2v - T2q;
+			 T3O = KP707106781 * (T2g - T2l);
+			 T3P = T2r + T2u;
+			 T3Q = T3O - T3P;
+			 T4m = T3O + T3P;
+		    }
+		    {
+			 E T59, T5c, T3L, T3M;
+			 T59 = TN - TQ;
+			 T5c = T5a - T5b;
+			 T5d = T59 + T5c;
+			 T5P = T59 - T5c;
+			 T3L = T27 + T2a;
+			 T3M = KP707106781 * (T2o + T2p);
+			 T3N = T3L - T3M;
+			 T4l = T3L + T3M;
+		    }
+	       }
+	       {
+		    E Tv, T10, T6X, T6Y, T6Z, T70;
+		    Tv = Tf + Tu;
+		    T10 = TK + TZ;
+		    T6X = Tv - T10;
+		    T6Y = T6N + T6M;
+		    T6Z = T6R - T6Q;
+		    T70 = T6Y + T6Z;
+		    R0[0] = KP2_000000000 * (Tv + T10);
+		    R0[WS(rs, 16)] = KP2_000000000 * (T6Z - T6Y);
+		    R0[WS(rs, 8)] = KP1_414213562 * (T6X + T70);
+		    R0[WS(rs, 24)] = KP1_414213562 * (T70 - T6X);
+	       }
+	       {
+		    E T6P, T6V, T6U, T6W;
+		    {
+			 E T6L, T6O, T6S, T6T;
+			 T6L = Tf - Tu;
+			 T6O = T6M - T6N;
+			 T6P = T6L + T6O;
+			 T6V = T6L - T6O;
+			 T6S = T6Q + T6R;
+			 T6T = TK - TZ;
+			 T6U = T6S - T6T;
+			 T6W = T6T + T6S;
+		    }
+		    R0[WS(rs, 4)] = FMA(KP1_847759065, T6P, KP765366864 * T6U);
+		    R0[WS(rs, 28)] = FNMS(KP1_847759065, T6V, KP765366864 * T6W);
+		    R0[WS(rs, 20)] = FNMS(KP765366864, T6P, KP1_847759065 * T6U);
+		    R0[WS(rs, 12)] = FMA(KP765366864, T6V, KP1_847759065 * T6W);
+	       }
+	       {
+		    E T6f, T6w, T6G, T6D, T6z, T6E, T6q, T6H;
+		    T6f = T6b + T6e;
+		    T6w = T6u - T6v;
+		    T6G = T6v + T6u;
+		    T6D = T6b - T6e;
+		    {
+			 E T6x, T6y, T6k, T6p;
+			 T6x = T6g + T6j;
+			 T6y = T6o - T6l;
+			 T6z = KP707106781 * (T6x + T6y);
+			 T6E = KP707106781 * (T6y - T6x);
+			 T6k = T6g - T6j;
+			 T6p = T6l + T6o;
+			 T6q = KP707106781 * (T6k + T6p);
+			 T6H = KP707106781 * (T6k - T6p);
+		    }
+		    {
+			 E T6r, T6A, T6J, T6K;
+			 T6r = T6f + T6q;
+			 T6A = T6w - T6z;
+			 R0[WS(rs, 2)] = FMA(KP1_961570560, T6r, KP390180644 * T6A);
+			 R0[WS(rs, 18)] = FNMS(KP390180644, T6r, KP1_961570560 * T6A);
+			 T6J = T6D - T6E;
+			 T6K = T6H + T6G;
+			 R0[WS(rs, 14)] = FMA(KP390180644, T6J, KP1_961570560 * T6K);
+			 R0[WS(rs, 30)] = FNMS(KP1_961570560, T6J, KP390180644 * T6K);
+		    }
+		    {
+			 E T6B, T6C, T6F, T6I;
+			 T6B = T6f - T6q;
+			 T6C = T6z + T6w;
+			 R0[WS(rs, 10)] = FMA(KP1_111140466, T6B, KP1_662939224 * T6C);
+			 R0[WS(rs, 26)] = FNMS(KP1_662939224, T6B, KP1_111140466 * T6C);
+			 T6F = T6D + T6E;
+			 T6I = T6G - T6H;
+			 R0[WS(rs, 6)] = FMA(KP1_662939224, T6F, KP1_111140466 * T6I);
+			 R0[WS(rs, 22)] = FNMS(KP1_111140466, T6F, KP1_662939224 * T6I);
+		    }
+	       }
+	       {
+		    E T5L, T63, T5W, T66, T5S, T67, T5Z, T64, T5K, T5V;
+		    T5K = KP707106781 * (T5s - T5r);
+		    T5L = T5J + T5K;
+		    T63 = T5J - T5K;
+		    T5V = KP707106781 * (T4Q - T4V);
+		    T5W = T5U - T5V;
+		    T66 = T5V + T5U;
+		    {
+			 E T5O, T5R, T5X, T5Y;
+			 T5O = FNMS(KP923879532, T5N, KP382683432 * T5M);
+			 T5R = FMA(KP382683432, T5P, KP923879532 * T5Q);
+			 T5S = T5O + T5R;
+			 T67 = T5O - T5R;
+			 T5X = FMA(KP923879532, T5M, KP382683432 * T5N);
+			 T5Y = FNMS(KP923879532, T5P, KP382683432 * T5Q);
+			 T5Z = T5X + T5Y;
+			 T64 = T5Y - T5X;
+		    }
+		    {
+			 E T5T, T60, T69, T6a;
+			 T5T = T5L + T5S;
+			 T60 = T5W - T5Z;
+			 R0[WS(rs, 3)] = FMA(KP1_913880671, T5T, KP580569354 * T60);
+			 R0[WS(rs, 19)] = FNMS(KP580569354, T5T, KP1_913880671 * T60);
+			 T69 = T63 - T64;
+			 T6a = T67 + T66;
+			 R0[WS(rs, 15)] = FMA(KP196034280, T69, KP1_990369453 * T6a);
+			 R0[WS(rs, 31)] = FNMS(KP1_990369453, T69, KP196034280 * T6a);
+		    }
+		    {
+			 E T61, T62, T65, T68;
+			 T61 = T5L - T5S;
+			 T62 = T5Z + T5W;
+			 R0[WS(rs, 11)] = FMA(KP942793473, T61, KP1_763842528 * T62);
+			 R0[WS(rs, 27)] = FNMS(KP1_763842528, T61, KP942793473 * T62);
+			 T65 = T63 + T64;
+			 T68 = T66 - T67;
+			 R0[WS(rs, 7)] = FMA(KP1_546020906, T65, KP1_268786568 * T68);
+			 R0[WS(rs, 23)] = FNMS(KP1_268786568, T65, KP1_546020906 * T68);
+		    }
+	       }
+	       {
+		    E T4X, T5B, T5u, T5E, T5k, T5F, T5x, T5C, T4W, T5t;
+		    T4W = KP707106781 * (T4Q + T4V);
+		    T4X = T4L + T4W;
+		    T5B = T4L - T4W;
+		    T5t = KP707106781 * (T5r + T5s);
+		    T5u = T5q - T5t;
+		    T5E = T5t + T5q;
+		    {
+			 E T58, T5j, T5v, T5w;
+			 T58 = FNMS(KP382683432, T57, KP923879532 * T52);
+			 T5j = FMA(KP923879532, T5d, KP382683432 * T5i);
+			 T5k = T58 + T5j;
+			 T5F = T58 - T5j;
+			 T5v = FMA(KP382683432, T52, KP923879532 * T57);
+			 T5w = FNMS(KP382683432, T5d, KP923879532 * T5i);
+			 T5x = T5v + T5w;
+			 T5C = T5w - T5v;
+		    }
+		    {
+			 E T5l, T5y, T5H, T5I;
+			 T5l = T4X + T5k;
+			 T5y = T5u - T5x;
+			 R0[WS(rs, 1)] = FMA(KP1_990369453, T5l, KP196034280 * T5y);
+			 R0[WS(rs, 17)] = FNMS(KP196034280, T5l, KP1_990369453 * T5y);
+			 T5H = T5B - T5C;
+			 T5I = T5F + T5E;
+			 R0[WS(rs, 13)] = FMA(KP580569354, T5H, KP1_913880671 * T5I);
+			 R0[WS(rs, 29)] = FNMS(KP1_913880671, T5H, KP580569354 * T5I);
+		    }
+		    {
+			 E T5z, T5A, T5D, T5G;
+			 T5z = T4X - T5k;
+			 T5A = T5x + T5u;
+			 R0[WS(rs, 9)] = FMA(KP1_268786568, T5z, KP1_546020906 * T5A);
+			 R0[WS(rs, 25)] = FNMS(KP1_546020906, T5z, KP1_268786568 * T5A);
+			 T5D = T5B + T5C;
+			 T5G = T5E - T5F;
+			 R0[WS(rs, 5)] = FMA(KP1_763842528, T5D, KP942793473 * T5G);
+			 R0[WS(rs, 21)] = FNMS(KP942793473, T5D, KP1_763842528 * T5G);
+		    }
+	       }
+	       {
+		    E T33, T3l, T3h, T3m, T3a, T3p, T3e, T3o;
+		    {
+			 E T31, T32, T3f, T3g;
+			 T31 = T15 - T1g;
+			 T32 = T2E - T2D;
+			 T33 = T31 + T32;
+			 T3l = T31 - T32;
+			 T3f = FMA(KP831469612, T34, KP555570233 * T35);
+			 T3g = FNMS(KP831469612, T37, KP555570233 * T38);
+			 T3h = T3f + T3g;
+			 T3m = T3g - T3f;
+		    }
+		    {
+			 E T36, T39, T3c, T3d;
+			 T36 = FNMS(KP831469612, T35, KP555570233 * T34);
+			 T39 = FMA(KP555570233, T37, KP831469612 * T38);
+			 T3a = T36 + T39;
+			 T3p = T36 - T39;
+			 T3c = T2I - T2N;
+			 T3d = T1s - T1D;
+			 T3e = T3c - T3d;
+			 T3o = T3d + T3c;
+		    }
+		    {
+			 E T3b, T3i, T3r, T3s;
+			 T3b = T33 + T3a;
+			 T3i = T3e - T3h;
+			 R1[WS(rs, 2)] = FMA(KP1_940062506, T3b, KP485960359 * T3i);
+			 R1[WS(rs, 18)] = FNMS(KP485960359, T3b, KP1_940062506 * T3i);
+			 T3r = T3l - T3m;
+			 T3s = T3p + T3o;
+			 R1[WS(rs, 14)] = FMA(KP293460948, T3r, KP1_978353019 * T3s);
+			 R1[WS(rs, 30)] = FNMS(KP1_978353019, T3r, KP293460948 * T3s);
+		    }
+		    {
+			 E T3j, T3k, T3n, T3q;
+			 T3j = T33 - T3a;
+			 T3k = T3h + T3e;
+			 R1[WS(rs, 10)] = FMA(KP1_028205488, T3j, KP1_715457220 * T3k);
+			 R1[WS(rs, 26)] = FNMS(KP1_715457220, T3j, KP1_028205488 * T3k);
+			 T3n = T3l + T3m;
+			 T3q = T3o - T3p;
+			 R1[WS(rs, 6)] = FMA(KP1_606415062, T3n, KP1_191398608 * T3q);
+			 R1[WS(rs, 22)] = FNMS(KP1_191398608, T3n, KP1_606415062 * T3q);
+		    }
+	       }
+	       {
+		    E T4h, T4z, T4v, T4A, T4o, T4D, T4s, T4C;
+		    {
+			 E T4f, T4g, T4t, T4u;
+			 T4f = T3t + T3u;
+			 T4g = T3X + T3Y;
+			 T4h = T4f - T4g;
+			 T4z = T4f + T4g;
+			 T4t = FMA(KP980785280, T4i, KP195090322 * T4j);
+			 T4u = FMA(KP980785280, T4l, KP195090322 * T4m);
+			 T4v = T4t - T4u;
+			 T4A = T4t + T4u;
+		    }
+		    {
+			 E T4k, T4n, T4q, T4r;
+			 T4k = FNMS(KP980785280, T4j, KP195090322 * T4i);
+			 T4n = FNMS(KP980785280, T4m, KP195090322 * T4l);
+			 T4o = T4k + T4n;
+			 T4D = T4k - T4n;
+			 T4q = T3V + T3U;
+			 T4r = T3y - T3B;
+			 T4s = T4q - T4r;
+			 T4C = T4r + T4q;
+		    }
+		    {
+			 E T4p, T4w, T4F, T4G;
+			 T4p = T4h + T4o;
+			 T4w = T4s - T4v;
+			 R1[WS(rs, 3)] = FMA(KP1_883088130, T4p, KP673779706 * T4w);
+			 R1[WS(rs, 19)] = FNMS(KP673779706, T4p, KP1_883088130 * T4w);
+			 T4F = T4z + T4A;
+			 T4G = T4D + T4C;
+			 R1[WS(rs, 15)] = FMA(KP098135348, T4F, KP1_997590912 * T4G);
+			 R1[WS(rs, 31)] = FNMS(KP1_997590912, T4F, KP098135348 * T4G);
+		    }
+		    {
+			 E T4x, T4y, T4B, T4E;
+			 T4x = T4h - T4o;
+			 T4y = T4v + T4s;
+			 R1[WS(rs, 11)] = FMA(KP855110186, T4x, KP1_807978586 * T4y);
+			 R1[WS(rs, 27)] = FNMS(KP1_807978586, T4x, KP855110186 * T4y);
+			 T4B = T4z - T4A;
+			 T4E = T4C - T4D;
+			 R1[WS(rs, 7)] = FMA(KP1_481902250, T4B, KP1_343117909 * T4E);
+			 R1[WS(rs, 23)] = FNMS(KP1_343117909, T4B, KP1_481902250 * T4E);
+		    }
+	       }
+	       {
+		    E T1F, T2T, T2P, T2W, T2y, T2X, T2C, T2U;
+		    {
+			 E T1h, T1E, T2F, T2O;
+			 T1h = T15 + T1g;
+			 T1E = T1s + T1D;
+			 T1F = T1h + T1E;
+			 T2T = T1h - T1E;
+			 T2F = T2D + T2E;
+			 T2O = T2I + T2N;
+			 T2P = T2F + T2O;
+			 T2W = T2F - T2O;
+		    }
+		    {
+			 E T26, T2x, T2A, T2B;
+			 T26 = FNMS(KP195090322, T25, KP980785280 * T1W);
+			 T2x = FMA(KP980785280, T2n, KP195090322 * T2w);
+			 T2y = T26 + T2x;
+			 T2X = T26 - T2x;
+			 T2A = FMA(KP195090322, T1W, KP980785280 * T25);
+			 T2B = FNMS(KP195090322, T2n, KP980785280 * T2w);
+			 T2C = T2A + T2B;
+			 T2U = T2B - T2A;
+		    }
+		    {
+			 E T2z, T2Q, T2Z, T30;
+			 T2z = T1F + T2y;
+			 T2Q = T2C + T2P;
+			 R1[0] = FNMS(KP098135348, T2Q, KP1_997590912 * T2z);
+			 R1[WS(rs, 16)] = -(FMA(KP098135348, T2z, KP1_997590912 * T2Q));
+			 T2Z = T2T - T2U;
+			 T30 = T2X + T2W;
+			 R1[WS(rs, 12)] = FMA(KP673779706, T2Z, KP1_883088130 * T30);
+			 R1[WS(rs, 28)] = FNMS(KP1_883088130, T2Z, KP673779706 * T30);
+		    }
+		    {
+			 E T2R, T2S, T2V, T2Y;
+			 T2R = T1F - T2y;
+			 T2S = T2C - T2P;
+			 R1[WS(rs, 8)] = FMA(KP1_343117909, T2R, KP1_481902250 * T2S);
+			 R1[WS(rs, 24)] = FNMS(KP1_481902250, T2R, KP1_343117909 * T2S);
+			 T2V = T2T + T2U;
+			 T2Y = T2W - T2X;
+			 R1[WS(rs, 4)] = FMA(KP1_807978586, T2V, KP855110186 * T2Y);
+			 R1[WS(rs, 20)] = FNMS(KP855110186, T2V, KP1_807978586 * T2Y);
+		    }
+	       }
+	       {
+		    E T3D, T47, T43, T48, T3S, T4b, T40, T4a;
+		    {
+			 E T3v, T3C, T41, T42;
+			 T3v = T3t - T3u;
+			 T3C = T3y + T3B;
+			 T3D = T3v + T3C;
+			 T47 = T3v - T3C;
+			 T41 = FMA(KP555570233, T3G, KP831469612 * T3J);
+			 T42 = FNMS(KP555570233, T3N, KP831469612 * T3Q);
+			 T43 = T41 + T42;
+			 T48 = T42 - T41;
+		    }
+		    {
+			 E T3K, T3R, T3W, T3Z;
+			 T3K = FNMS(KP555570233, T3J, KP831469612 * T3G);
+			 T3R = FMA(KP831469612, T3N, KP555570233 * T3Q);
+			 T3S = T3K + T3R;
+			 T4b = T3K - T3R;
+			 T3W = T3U - T3V;
+			 T3Z = T3X - T3Y;
+			 T40 = T3W - T3Z;
+			 T4a = T3Z + T3W;
+		    }
+		    {
+			 E T3T, T44, T4d, T4e;
+			 T3T = T3D + T3S;
+			 T44 = T40 - T43;
+			 R1[WS(rs, 1)] = FMA(KP1_978353019, T3T, KP293460948 * T44);
+			 R1[WS(rs, 17)] = FNMS(KP293460948, T3T, KP1_978353019 * T44);
+			 T4d = T47 - T48;
+			 T4e = T4b + T4a;
+			 R1[WS(rs, 13)] = FMA(KP485960359, T4d, KP1_940062506 * T4e);
+			 R1[WS(rs, 29)] = FNMS(KP1_940062506, T4d, KP485960359 * T4e);
+		    }
+		    {
+			 E T45, T46, T49, T4c;
+			 T45 = T3D - T3S;
+			 T46 = T43 + T40;
+			 R1[WS(rs, 9)] = FMA(KP1_191398608, T45, KP1_606415062 * T46);
+			 R1[WS(rs, 25)] = FNMS(KP1_606415062, T45, KP1_191398608 * T46);
+			 T49 = T47 + T48;
+			 T4c = T4a - T4b;
+			 R1[WS(rs, 5)] = FMA(KP1_715457220, T49, KP1_028205488 * T4c);
+			 R1[WS(rs, 21)] = FNMS(KP1_028205488, T49, KP1_715457220 * T4c);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cbIII_64", {342, 116, 92, 0}, &GENUS };
+
+void X(codelet_r2cbIII_64) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:33 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -name r2cbIII_7 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 24 FP additions, 22 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 22 fused multiply/add),
+ * 31 stack variables, 7 constants, and 14 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E Tn, Td, Tg, Ti, Tl, T8;
+	       {
+		    E T1, T9, Tb, Ta, T2, T4, Th, Tm, Tc, T3, Te;
+		    T1 = Cr[WS(csr, 3)];
+		    T9 = Ci[WS(csi, 1)];
+		    Tb = Ci[0];
+		    Ta = Ci[WS(csi, 2)];
+		    T2 = Cr[WS(csr, 2)];
+		    T4 = Cr[0];
+		    Th = FMA(KP554958132, T9, Tb);
+		    Tm = FNMS(KP554958132, Ta, T9);
+		    Tc = FMA(KP554958132, Tb, Ta);
+		    T3 = Cr[WS(csr, 1)];
+		    Te = FNMS(KP356895867, T2, T4);
+		    Tn = FNMS(KP801937735, Tm, Tb);
+		    {
+			 E Tf, Tk, T7, T5, Tj, T6;
+			 Td = FMA(KP801937735, Tc, T9);
+			 T5 = T2 + T3 + T4;
+			 Tj = FNMS(KP356895867, T4, T3);
+			 T6 = FNMS(KP356895867, T3, T2);
+			 Tf = FNMS(KP692021471, Te, T3);
+			 R0[0] = FMA(KP2_000000000, T5, T1);
+			 Tk = FNMS(KP692021471, Tj, T2);
+			 T7 = FNMS(KP692021471, T6, T4);
+			 Tg = FNMS(KP1_801937735, Tf, T1);
+			 Ti = FNMS(KP801937735, Th, Ta);
+			 Tl = FNMS(KP1_801937735, Tk, T1);
+			 T8 = FNMS(KP1_801937735, T7, T1);
+		    }
+	       }
+	       R1[WS(rs, 2)] = FMS(KP1_949855824, Ti, Tg);
+	       R0[WS(rs, 1)] = FMA(KP1_949855824, Ti, Tg);
+	       R0[WS(rs, 2)] = FNMS(KP1_949855824, Tn, Tl);
+	       R1[WS(rs, 1)] = -(FMA(KP1_949855824, Tn, Tl));
+	       R0[WS(rs, 3)] = FNMS(KP1_949855824, Td, T8);
+	       R1[0] = -(FMA(KP1_949855824, Td, T8));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cbIII_7", {2, 0, 22, 0}, &GENUS };
+
+void X(codelet_r2cbIII_7) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -name r2cbIII_7 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 24 FP additions, 19 FP multiplications,
+ * (or, 9 additions, 4 multiplications, 15 fused multiply/add),
+ * 21 stack variables, 7 constants, and 14 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_246979603, +1.246979603717467061050009768008479621264549462);
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP445041867, +0.445041867912628808577805128993589518932711138);
+     DK(KP867767478, +0.867767478235116240951536665696717509219981456);
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_563662964, +1.563662964936059617416889053348115500464669037);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E T9, Td, Tb, T1, T4, T2, T3, T5, Tc, Ta, T6, T8, T7;
+	       T6 = Ci[WS(csi, 2)];
+	       T8 = Ci[0];
+	       T7 = Ci[WS(csi, 1)];
+	       T9 = FMA(KP1_563662964, T6, KP1_949855824 * T7) + (KP867767478 * T8);
+	       Td = FNMS(KP1_949855824, T8, KP1_563662964 * T7) - (KP867767478 * T6);
+	       Tb = FNMS(KP1_563662964, T8, KP1_949855824 * T6) - (KP867767478 * T7);
+	       T1 = Cr[WS(csr, 3)];
+	       T4 = Cr[0];
+	       T2 = Cr[WS(csr, 2)];
+	       T3 = Cr[WS(csr, 1)];
+	       T5 = FMA(KP445041867, T3, KP1_801937735 * T4) + FNMA(KP1_246979603, T2, T1);
+	       Tc = FMA(KP1_801937735, T2, KP445041867 * T4) + FNMA(KP1_246979603, T3, T1);
+	       Ta = FMA(KP1_246979603, T4, T1) + FNMA(KP1_801937735, T3, KP445041867 * T2);
+	       R1[0] = T5 - T9;
+	       R0[WS(rs, 3)] = -(T5 + T9);
+	       R0[WS(rs, 2)] = Td - Tc;
+	       R1[WS(rs, 1)] = Tc + Td;
+	       R1[WS(rs, 2)] = Tb - Ta;
+	       R0[WS(rs, 1)] = Ta + Tb;
+	       R0[0] = FMA(KP2_000000000, T2 + T3 + T4, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cbIII_7", {9, 4, 15, 0}, &GENUS };
+
+void X(codelet_r2cbIII_7) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:34 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -name r2cbIII_8 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 18 additions, 8 multiplications, 4 fused multiply/add),
+ * 23 stack variables, 4 constants, and 16 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T4, T7, T3, Tl, Tf, T5, T8, T9, T6, Tc;
+	       {
+		    E T1, T2, Td, Te;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 3)];
+		    Td = Ci[0];
+		    Te = Ci[WS(csi, 3)];
+		    T4 = Cr[WS(csr, 2)];
+		    T7 = T1 - T2;
+		    T3 = T1 + T2;
+		    Tl = Te - Td;
+		    Tf = Td + Te;
+		    T5 = Cr[WS(csr, 1)];
+		    T8 = Ci[WS(csi, 2)];
+		    T9 = Ci[WS(csi, 1)];
+	       }
+	       T6 = T4 + T5;
+	       Tc = T4 - T5;
+	       {
+		    E Ta, Tk, Tg, Th;
+		    Ta = T8 + T9;
+		    Tk = T8 - T9;
+		    Tg = Tc + Tf;
+		    Th = Tc - Tf;
+		    {
+			 E Tj, Tm, Tb, Ti;
+			 Tj = T3 - T6;
+			 R0[0] = KP2_000000000 * (T3 + T6);
+			 Tm = Tk + Tl;
+			 R0[WS(rs, 2)] = KP2_000000000 * (Tl - Tk);
+			 Tb = T7 - Ta;
+			 Ti = T7 + Ta;
+			 R0[WS(rs, 3)] = KP1_414213562 * (Tm - Tj);
+			 R0[WS(rs, 1)] = KP1_414213562 * (Tj + Tm);
+			 R1[WS(rs, 3)] = -(KP1_847759065 * (FNMS(KP414213562, Th, Ti)));
+			 R1[WS(rs, 1)] = KP1_847759065 * (FMA(KP414213562, Ti, Th));
+			 R1[WS(rs, 2)] = -(KP1_847759065 * (FMA(KP414213562, Tb, Tg)));
+			 R1[0] = KP1_847759065 * (FNMS(KP414213562, Tg, Tb));
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cbIII_8", {18, 8, 4, 0}, &GENUS };
+
+void X(codelet_r2cbIII_8) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -name r2cbIII_8 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 18 additions, 8 multiplications, 4 fused multiply/add),
+ * 19 stack variables, 4 constants, and 16 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T3, T7, Tf, Tl, T6, Tc, Ta, Tk, Tb, Tg;
+	       {
+		    E T1, T2, Td, Te;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 3)];
+		    T3 = T1 + T2;
+		    T7 = T1 - T2;
+		    Td = Ci[0];
+		    Te = Ci[WS(csi, 3)];
+		    Tf = Td + Te;
+		    Tl = Te - Td;
+	       }
+	       {
+		    E T4, T5, T8, T9;
+		    T4 = Cr[WS(csr, 2)];
+		    T5 = Cr[WS(csr, 1)];
+		    T6 = T4 + T5;
+		    Tc = T4 - T5;
+		    T8 = Ci[WS(csi, 2)];
+		    T9 = Ci[WS(csi, 1)];
+		    Ta = T8 + T9;
+		    Tk = T8 - T9;
+	       }
+	       R0[0] = KP2_000000000 * (T3 + T6);
+	       R0[WS(rs, 2)] = KP2_000000000 * (Tl - Tk);
+	       Tb = T7 - Ta;
+	       Tg = Tc + Tf;
+	       R1[0] = FNMS(KP765366864, Tg, KP1_847759065 * Tb);
+	       R1[WS(rs, 2)] = -(FMA(KP765366864, Tb, KP1_847759065 * Tg));
+	       {
+		    E Th, Ti, Tj, Tm;
+		    Th = T7 + Ta;
+		    Ti = Tc - Tf;
+		    R1[WS(rs, 1)] = FMA(KP765366864, Th, KP1_847759065 * Ti);
+		    R1[WS(rs, 3)] = FNMS(KP1_847759065, Th, KP765366864 * Ti);
+		    Tj = T3 - T6;
+		    Tm = Tk + Tl;
+		    R0[WS(rs, 1)] = KP1_414213562 * (Tj + Tm);
+		    R0[WS(rs, 3)] = KP1_414213562 * (Tm - Tj);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cbIII_8", {18, 8, 4, 0}, &GENUS };
+
+void X(codelet_r2cbIII_8) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cbIII_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:35 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cbIII_9 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 32 FP additions, 24 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 24 fused multiply/add),
+ * 40 stack variables, 12 constants, and 18 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
+     DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP1_532088886, +1.532088886237956070404785301110833347871664914);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP1_969615506, +1.969615506024416118733486049179046027341286503);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T4, Td, T3, Th, Tr, Tm, T7, Tc, Tj, Tg, T1, T2;
+	       Tg = Ci[WS(csi, 1)];
+	       T1 = Cr[WS(csr, 4)];
+	       T2 = Cr[WS(csr, 1)];
+	       T4 = Cr[WS(csr, 3)];
+	       Td = Ci[WS(csi, 3)];
+	       {
+		    E T5, Tf, T6, Ta, Tb;
+		    T5 = Cr[0];
+		    Tf = T2 - T1;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    T6 = Cr[WS(csr, 2)];
+		    Ta = Ci[WS(csi, 2)];
+		    Tb = Ci[0];
+		    Th = FNMS(KP1_732050807, Tg, Tf);
+		    Tr = FMA(KP1_732050807, Tg, Tf);
+		    Tm = T5 - T6;
+		    T7 = T5 + T6;
+		    Tc = Ta - Tb;
+		    Tj = Tb + Ta;
+	       }
+	       {
+		    E Tw, Tq, Tv, Tp, Ti, T8;
+		    Ti = FNMS(KP500000000, T7, T4);
+		    T8 = T4 + T7;
+		    {
+			 E Te, Tl, Tt, Tk, T9;
+			 Te = Tc - Td;
+			 Tl = FMA(KP500000000, Tc, Td);
+			 Tt = FNMS(KP866025403, Tj, Ti);
+			 Tk = FMA(KP866025403, Tj, Ti);
+			 T9 = T8 - T3;
+			 R0[0] = FMA(KP2_000000000, T8, T3);
+			 {
+			      E Ts, Tn, Tu, To;
+			      Ts = FMA(KP866025403, Tm, Tl);
+			      Tn = FNMS(KP866025403, Tm, Tl);
+			      R0[WS(rs, 3)] = FMS(KP1_732050807, Te, T9);
+			      R1[WS(rs, 1)] = FMA(KP1_732050807, Te, T9);
+			      Tu = FMA(KP176326980, Tt, Ts);
+			      Tw = FNMS(KP176326980, Ts, Tt);
+			      To = FMA(KP839099631, Tn, Tk);
+			      Tq = FNMS(KP839099631, Tk, Tn);
+			      R0[WS(rs, 1)] = FMS(KP1_969615506, Tu, Tr);
+			      Tv = FMA(KP984807753, Tu, Tr);
+			      R1[0] = FNMS(KP1_532088886, To, Th);
+			      Tp = FMA(KP766044443, To, Th);
+			 }
+		    }
+		    R0[WS(rs, 4)] = FMS(KP1_705737063, Tw, Tv);
+		    R1[WS(rs, 2)] = FMA(KP1_705737063, Tw, Tv);
+		    R0[WS(rs, 2)] = FMS(KP1_326827896, Tq, Tp);
+		    R1[WS(rs, 3)] = FMA(KP1_326827896, Tq, Tp);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cbIII_9", {8, 0, 24, 0}, &GENUS };
+
+void X(codelet_r2cbIII_9) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cbIII_9 -dft-III -include r2cbIII.h */
+
+/*
+ * This function contains 32 FP additions, 18 FP multiplications,
+ * (or, 22 additions, 8 multiplications, 10 fused multiply/add),
+ * 35 stack variables, 12 constants, and 18 memory accesses
+ */
+#include "r2cbIII.h"
+
+static void r2cbIII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
+     DK(KP1_113340798, +1.113340798452838732905825904094046265936583811);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
+     DK(KP300767466, +0.300767466360870593278543795225003852144476517);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T3, Ts, Ti, Td, Tc, T8, To, Tu, Tl, Tt, T9, Te;
+	       {
+		    E Th, T1, T2, Tf, Tg;
+		    Tg = Ci[WS(csi, 1)];
+		    Th = KP1_732050807 * Tg;
+		    T1 = Cr[WS(csr, 4)];
+		    T2 = Cr[WS(csr, 1)];
+		    Tf = T2 - T1;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    Ts = Tf - Th;
+		    Ti = Tf + Th;
+	       }
+	       {
+		    E T4, T7, Tm, Tk, Tn, Tj;
+		    T4 = Cr[WS(csr, 3)];
+		    Td = Ci[WS(csi, 3)];
+		    {
+			 E T5, T6, Ta, Tb;
+			 T5 = Cr[0];
+			 T6 = Cr[WS(csr, 2)];
+			 T7 = T5 + T6;
+			 Tm = KP866025403 * (T6 - T5);
+			 Ta = Ci[WS(csi, 2)];
+			 Tb = Ci[0];
+			 Tc = Ta - Tb;
+			 Tk = KP866025403 * (Tb + Ta);
+		    }
+		    T8 = T4 + T7;
+		    Tn = FMA(KP500000000, Tc, Td);
+		    To = Tm - Tn;
+		    Tu = Tm + Tn;
+		    Tj = FMS(KP500000000, T7, T4);
+		    Tl = Tj + Tk;
+		    Tt = Tj - Tk;
+	       }
+	       R0[0] = FMA(KP2_000000000, T8, T3);
+	       T9 = T8 - T3;
+	       Te = KP1_732050807 * (Tc - Td);
+	       R1[WS(rs, 1)] = T9 + Te;
+	       R0[WS(rs, 3)] = Te - T9;
+	       {
+		    E Tr, Tp, Tq, Tx, Tv, Tw;
+		    Tr = FNMS(KP1_705737063, Tl, KP300767466 * To);
+		    Tp = FMA(KP173648177, Tl, KP984807753 * To);
+		    Tq = Ti - Tp;
+		    R0[WS(rs, 1)] = -(FMA(KP2_000000000, Tp, Ti));
+		    R0[WS(rs, 4)] = Tr - Tq;
+		    R1[WS(rs, 2)] = Tq + Tr;
+		    Tx = FMA(KP1_113340798, Tt, KP1_326827896 * Tu);
+		    Tv = FNMS(KP642787609, Tu, KP766044443 * Tt);
+		    Tw = Tv - Ts;
+		    R1[0] = FMA(KP2_000000000, Tv, Ts);
+		    R1[WS(rs, 3)] = Tx - Tw;
+		    R0[WS(rs, 2)] = Tw + Tx;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cbIII_9", {22, 8, 10, 0}, &GENUS };
+
+void X(codelet_r2cbIII_9) (planner *p) {
+     X(kr2c_register) (p, r2cbIII_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -name r2cb_10 -include r2cb.h */
+
+/*
+ * This function contains 34 FP additions, 20 FP multiplications,
+ * (or, 14 additions, 0 multiplications, 20 fused multiply/add),
+ * 30 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E Tb, T3, Tc, T6, Tq, To, Ty, Tw, Td, T9;
+	       {
+		    E Tu, Tn, T7, Tv, Tk, T8;
+		    {
+			 E T1, T2, Tl, Tm;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 5)];
+			 Tl = Ci[WS(csi, 2)];
+			 Tm = Ci[WS(csi, 3)];
+			 {
+			      E Ti, Tj, T4, T5;
+			      Ti = Ci[WS(csi, 4)];
+			      Tb = T1 + T2;
+			      T3 = T1 - T2;
+			      Tu = Tl + Tm;
+			      Tn = Tl - Tm;
+			      Tj = Ci[WS(csi, 1)];
+			      T4 = Cr[WS(csr, 2)];
+			      T5 = Cr[WS(csr, 3)];
+			      T7 = Cr[WS(csr, 4)];
+			      Tv = Ti + Tj;
+			      Tk = Ti - Tj;
+			      Tc = T4 + T5;
+			      T6 = T4 - T5;
+			      T8 = Cr[WS(csr, 1)];
+			 }
+		    }
+		    Tq = FMA(KP618033988, Tk, Tn);
+		    To = FNMS(KP618033988, Tn, Tk);
+		    Ty = FNMS(KP618033988, Tu, Tv);
+		    Tw = FMA(KP618033988, Tv, Tu);
+		    Td = T7 + T8;
+		    T9 = T7 - T8;
+	       }
+	       {
+		    E Te, Tg, Ta, Ts, Tf, Tr;
+		    Te = Tc + Td;
+		    Tg = Tc - Td;
+		    Ta = T6 + T9;
+		    Ts = T6 - T9;
+		    Tf = FNMS(KP500000000, Te, Tb);
+		    R0[0] = FMA(KP2_000000000, Te, Tb);
+		    Tr = FNMS(KP500000000, Ta, T3);
+		    R1[WS(rs, 2)] = FMA(KP2_000000000, Ta, T3);
+		    {
+			 E Th, Tp, Tt, Tx;
+			 Th = FNMS(KP1_118033988, Tg, Tf);
+			 Tp = FMA(KP1_118033988, Tg, Tf);
+			 Tt = FMA(KP1_118033988, Ts, Tr);
+			 Tx = FNMS(KP1_118033988, Ts, Tr);
+			 R0[WS(rs, 3)] = FNMS(KP1_902113032, Tq, Tp);
+			 R0[WS(rs, 2)] = FMA(KP1_902113032, Tq, Tp);
+			 R0[WS(rs, 1)] = FMA(KP1_902113032, To, Th);
+			 R0[WS(rs, 4)] = FNMS(KP1_902113032, To, Th);
+			 R1[WS(rs, 1)] = FNMS(KP1_902113032, Ty, Tx);
+			 R1[WS(rs, 3)] = FMA(KP1_902113032, Ty, Tx);
+			 R1[WS(rs, 4)] = FMA(KP1_902113032, Tw, Tt);
+			 R1[0] = FNMS(KP1_902113032, Tw, Tt);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cb_10", {14, 0, 20, 0}, &GENUS };
+
+void X(codelet_r2cb_10) (planner *p) {
+     X(kr2c_register) (p, r2cb_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -name r2cb_10 -include r2cb.h */
+
+/*
+ * This function contains 34 FP additions, 14 FP multiplications,
+ * (or, 26 additions, 6 multiplications, 8 fused multiply/add),
+ * 26 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E T3, Tb, Tn, Tv, Tk, Tu, Ta, Ts, Te, Tg, Ti, Tj;
+	       {
+		    E T1, T2, Tl, Tm;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 5)];
+		    T3 = T1 - T2;
+		    Tb = T1 + T2;
+		    Tl = Ci[WS(csi, 4)];
+		    Tm = Ci[WS(csi, 1)];
+		    Tn = Tl - Tm;
+		    Tv = Tl + Tm;
+	       }
+	       Ti = Ci[WS(csi, 2)];
+	       Tj = Ci[WS(csi, 3)];
+	       Tk = Ti - Tj;
+	       Tu = Ti + Tj;
+	       {
+		    E T6, Tc, T9, Td;
+		    {
+			 E T4, T5, T7, T8;
+			 T4 = Cr[WS(csr, 2)];
+			 T5 = Cr[WS(csr, 3)];
+			 T6 = T4 - T5;
+			 Tc = T4 + T5;
+			 T7 = Cr[WS(csr, 4)];
+			 T8 = Cr[WS(csr, 1)];
+			 T9 = T7 - T8;
+			 Td = T7 + T8;
+		    }
+		    Ta = T6 + T9;
+		    Ts = KP1_118033988 * (T6 - T9);
+		    Te = Tc + Td;
+		    Tg = KP1_118033988 * (Tc - Td);
+	       }
+	       R1[WS(rs, 2)] = FMA(KP2_000000000, Ta, T3);
+	       R0[0] = FMA(KP2_000000000, Te, Tb);
+	       {
+		    E To, Tq, Th, Tp, Tf;
+		    To = FNMS(KP1_902113032, Tn, KP1_175570504 * Tk);
+		    Tq = FMA(KP1_902113032, Tk, KP1_175570504 * Tn);
+		    Tf = FNMS(KP500000000, Te, Tb);
+		    Th = Tf - Tg;
+		    Tp = Tg + Tf;
+		    R0[WS(rs, 1)] = Th - To;
+		    R0[WS(rs, 2)] = Tp + Tq;
+		    R0[WS(rs, 4)] = Th + To;
+		    R0[WS(rs, 3)] = Tp - Tq;
+	       }
+	       {
+		    E Tw, Ty, Tt, Tx, Tr;
+		    Tw = FNMS(KP1_902113032, Tv, KP1_175570504 * Tu);
+		    Ty = FMA(KP1_902113032, Tu, KP1_175570504 * Tv);
+		    Tr = FNMS(KP500000000, Ta, T3);
+		    Tt = Tr - Ts;
+		    Tx = Ts + Tr;
+		    R1[WS(rs, 3)] = Tt - Tw;
+		    R1[WS(rs, 4)] = Tx + Ty;
+		    R1[WS(rs, 1)] = Tt + Tw;
+		    R1[0] = Tx - Ty;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cb_10", {26, 6, 8, 0}, &GENUS };
+
+void X(codelet_r2cb_10) (planner *p) {
+     X(kr2c_register) (p, r2cb_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 11 -name r2cb_11 -include r2cb.h */
+
+/*
+ * This function contains 60 FP additions, 56 FP multiplications,
+ * (or, 4 additions, 0 multiplications, 56 fused multiply/add),
+ * 53 stack variables, 11 constants, and 22 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_979642883, +1.979642883761865464752184075553437574753038744);
+     DK(KP1_918985947, +1.918985947228994779780736114132655398124909697);
+     DK(KP876768831, +0.876768831002589333891339807079336796764054852);
+     DK(KP918985947, +0.918985947228994779780736114132655398124909697);
+     DK(KP778434453, +0.778434453334651800608337670740821884709317477);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP634356270, +0.634356270682424498893150776899916060542806975);
+     DK(KP342584725, +0.342584725681637509502641509861112333758894680);
+     DK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DK(KP715370323, +0.715370323453429719112414662767260662417897278);
+     DK(KP521108558, +0.521108558113202722944698153526659300680427422);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
+	       E Tf, Tq, Tt, Tu;
+	       {
+		    E T1, Td, Th, Te, Tg, T2, Ts, TK, TB, TT, Tj, T6, T3, T4, T5;
+		    E Tr;
+		    T1 = Cr[0];
+		    Td = Ci[WS(csi, 3)];
+		    Th = Ci[WS(csi, 5)];
+		    Te = Ci[WS(csi, 2)];
+		    Tf = Ci[WS(csi, 4)];
+		    Tg = Ci[WS(csi, 1)];
+		    Tr = FMA(KP521108558, Td, Th);
+		    T2 = Cr[WS(csr, 1)];
+		    {
+			 E TJ, TA, TS, Ti;
+			 TJ = FMA(KP521108558, Tf, Td);
+			 TA = FNMS(KP521108558, Te, Tf);
+			 TS = FMS(KP521108558, Tg, Te);
+			 Ti = FMA(KP521108558, Th, Tg);
+			 Ts = FNMS(KP715370323, Tr, Te);
+			 TK = FMA(KP715370323, TJ, Tg);
+			 TB = FMA(KP715370323, TA, Th);
+			 TT = FMA(KP715370323, TS, Td);
+			 Tj = FMA(KP715370323, Ti, Tf);
+			 T6 = Cr[WS(csr, 5)];
+		    }
+		    T3 = Cr[WS(csr, 2)];
+		    T4 = Cr[WS(csr, 3)];
+		    T5 = Cr[WS(csr, 4)];
+		    {
+			 E TG, Tx, To, Tl, Tb, TU, TQ, TP, Ta;
+			 {
+			      E Tk, TE, Tv, T8;
+			      Tk = FMA(KP830830026, Tj, Te);
+			      TE = FNMS(KP342584725, T3, T6);
+			      Tv = FNMS(KP342584725, T2, T4);
+			      T8 = FNMS(KP342584725, T4, T3);
+			      {
+				   E T7, Tm, TN, TF;
+				   T7 = T2 + T3 + T4 + T5 + T6;
+				   Tm = FNMS(KP342584725, T5, T2);
+				   TN = FNMS(KP342584725, T6, T5);
+				   TF = FNMS(KP634356270, TE, T2);
+				   {
+					E Tw, T9, Tn, TO;
+					Tw = FNMS(KP634356270, Tv, T6);
+					T9 = FNMS(KP634356270, T8, T5);
+					R0[0] = FMA(KP2_000000000, T7, T1);
+					Tn = FNMS(KP634356270, Tm, T3);
+					TO = FNMS(KP634356270, TN, T4);
+					TG = FNMS(KP778434453, TF, T4);
+					Tx = FNMS(KP778434453, Tw, T5);
+					Ta = FNMS(KP778434453, T9, T2);
+					To = FNMS(KP778434453, Tn, T6);
+					TP = FNMS(KP778434453, TO, T3);
+					Tl = FMA(KP918985947, Tk, Td);
+				   }
+			      }
+			 }
+			 Tb = FNMS(KP876768831, Ta, T6);
+			 TU = FNMS(KP830830026, TT, Tf);
+			 TQ = FNMS(KP876768831, TP, T2);
+			 {
+			      E TI, TL, Ty, TC;
+			      {
+				   E Tc, TV, TR, TH;
+				   TH = FNMS(KP876768831, TG, T5);
+				   Tc = FNMS(KP1_918985947, Tb, T1);
+				   TV = FNMS(KP918985947, TU, Th);
+				   TR = FNMS(KP1_918985947, TQ, T1);
+				   TI = FNMS(KP1_918985947, TH, T1);
+				   R0[WS(rs, 5)] = FMA(KP1_979642883, Tl, Tc);
+				   R1[0] = FNMS(KP1_979642883, Tl, Tc);
+				   R0[WS(rs, 3)] = FMA(KP1_979642883, TV, TR);
+				   R1[WS(rs, 2)] = FNMS(KP1_979642883, TV, TR);
+				   TL = FNMS(KP830830026, TK, Th);
+			      }
+			      Ty = FNMS(KP876768831, Tx, T3);
+			      TC = FNMS(KP830830026, TB, Td);
+			      {
+				   E TM, Tz, TD, Tp;
+				   Tp = FNMS(KP876768831, To, T4);
+				   TM = FMA(KP918985947, TL, Te);
+				   Tz = FNMS(KP1_918985947, Ty, T1);
+				   TD = FNMS(KP918985947, TC, Tg);
+				   Tq = FNMS(KP1_918985947, Tp, T1);
+				   R0[WS(rs, 2)] = FMA(KP1_979642883, TM, TI);
+				   R1[WS(rs, 3)] = FNMS(KP1_979642883, TM, TI);
+				   R0[WS(rs, 4)] = FMA(KP1_979642883, TD, Tz);
+				   R1[WS(rs, 1)] = FNMS(KP1_979642883, TD, Tz);
+				   Tt = FMA(KP830830026, Ts, Tg);
+			      }
+			 }
+		    }
+	       }
+	       Tu = FNMS(KP918985947, Tt, Tf);
+	       R0[WS(rs, 1)] = FMA(KP1_979642883, Tu, Tq);
+	       R1[WS(rs, 4)] = FNMS(KP1_979642883, Tu, Tq);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 11, "r2cb_11", {4, 0, 56, 0}, &GENUS };
+
+void X(codelet_r2cb_11) (planner *p) {
+     X(kr2c_register) (p, r2cb_11, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 11 -name r2cb_11 -include r2cb.h */
+
+/*
+ * This function contains 60 FP additions, 51 FP multiplications,
+ * (or, 19 additions, 10 multiplications, 41 fused multiply/add),
+ * 33 stack variables, 11 constants, and 22 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_918985947, +1.918985947228994779780736114132655398124909697);
+     DK(KP1_309721467, +1.309721467890570128113850144932587106367582399);
+     DK(KP284629676, +0.284629676546570280887585337232739337582102722);
+     DK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DK(KP1_682507065, +1.682507065662362337723623297838735435026584997);
+     DK(KP563465113, +0.563465113682859395422835830693233798071555798);
+     DK(KP1_511499148, +1.511499148708516567548071687944688840359434890);
+     DK(KP1_979642883, +1.979642883761865464752184075553437574753038744);
+     DK(KP1_819263990, +1.819263990709036742823430766158056920120482102);
+     DK(KP1_081281634, +1.081281634911195164215271908637383390863541216);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
+	       E Td, Tl, Tf, Th, Tj, T1, T2, T6, T5, T4, T3, T7, Tk, Te, Tg;
+	       E Ti;
+	       {
+		    E T8, Tc, T9, Ta, Tb;
+		    T8 = Ci[WS(csi, 2)];
+		    Tc = Ci[WS(csi, 1)];
+		    T9 = Ci[WS(csi, 4)];
+		    Ta = Ci[WS(csi, 5)];
+		    Tb = Ci[WS(csi, 3)];
+		    Td = FMA(KP1_081281634, T8, KP1_819263990 * T9) + FNMA(KP1_979642883, Ta, KP1_511499148 * Tb) - (KP563465113 * Tc);
+		    Tl = FMA(KP1_979642883, T8, KP1_819263990 * Ta) + FNMA(KP563465113, T9, KP1_081281634 * Tb) - (KP1_511499148 * Tc);
+		    Tf = FMA(KP563465113, T8, KP1_819263990 * Tb) + FNMA(KP1_511499148, Ta, KP1_081281634 * T9) - (KP1_979642883 * Tc);
+		    Th = FMA(KP1_081281634, Tc, KP1_819263990 * T8) + FMA(KP1_979642883, Tb, KP1_511499148 * T9) + (KP563465113 * Ta);
+		    Tj = FMA(KP563465113, Tb, KP1_979642883 * T9) + FNMS(KP1_511499148, T8, KP1_081281634 * Ta) - (KP1_819263990 * Tc);
+	       }
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       T6 = Cr[WS(csr, 5)];
+	       T5 = Cr[WS(csr, 4)];
+	       T4 = Cr[WS(csr, 3)];
+	       T3 = Cr[WS(csr, 2)];
+	       T7 = FMA(KP1_682507065, T3, T1) + FNMS(KP284629676, T6, KP830830026 * T5) + FNMA(KP1_309721467, T4, KP1_918985947 * T2);
+	       Tk = FMA(KP1_682507065, T4, T1) + FNMS(KP1_918985947, T5, KP830830026 * T6) + FNMA(KP284629676, T3, KP1_309721467 * T2);
+	       Te = FMA(KP830830026, T4, T1) + FNMS(KP1_309721467, T6, KP1_682507065 * T5) + FNMA(KP1_918985947, T3, KP284629676 * T2);
+	       Tg = FMA(KP1_682507065, T2, T1) + FNMS(KP1_918985947, T6, KP830830026 * T3) + FNMA(KP1_309721467, T5, KP284629676 * T4);
+	       Ti = FMA(KP830830026, T2, T1) + FNMS(KP284629676, T5, KP1_682507065 * T6) + FNMA(KP1_918985947, T4, KP1_309721467 * T3);
+	       R0[WS(rs, 3)] = T7 - Td;
+	       R0[WS(rs, 4)] = Te - Tf;
+	       R0[WS(rs, 2)] = Tk + Tl;
+	       R1[WS(rs, 2)] = T7 + Td;
+	       R1[WS(rs, 3)] = Tk - Tl;
+	       R0[WS(rs, 1)] = Ti + Tj;
+	       R1[WS(rs, 1)] = Te + Tf;
+	       R0[WS(rs, 5)] = Tg + Th;
+	       R1[0] = Tg - Th;
+	       R1[WS(rs, 4)] = Ti - Tj;
+	       R0[0] = FMA(KP2_000000000, T2 + T3 + T4 + T5 + T6, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 11, "r2cb_11", {19, 10, 41, 0}, &GENUS };
+
+void X(codelet_r2cb_11) (planner *p) {
+     X(kr2c_register) (p, r2cb_11, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -name r2cb_12 -include r2cb.h */
+
+/*
+ * This function contains 38 FP additions, 16 FP multiplications,
+ * (or, 22 additions, 0 multiplications, 16 fused multiply/add),
+ * 31 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E Ts, Tr;
+	       {
+		    E Tz, Te, Tn, Tk, Tc, Tw, Ty, Th, T4, T3, Td, T5;
+		    {
+			 E T8, Tu, Tl, Tm, Tb, T9, Ta, T1, T2, Tv;
+			 T8 = Cr[WS(csr, 3)];
+			 T9 = Cr[WS(csr, 5)];
+			 Ta = Cr[WS(csr, 1)];
+			 Tu = Ci[WS(csi, 3)];
+			 Tl = Ci[WS(csi, 5)];
+			 Tm = Ci[WS(csi, 1)];
+			 Tb = T9 + Ta;
+			 Tz = T9 - Ta;
+			 Te = Ci[WS(csi, 4)];
+			 Tn = Tl - Tm;
+			 Tv = Tl + Tm;
+			 Tk = FNMS(KP2_000000000, T8, Tb);
+			 Tc = T8 + Tb;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 4)];
+			 Tw = Tu - Tv;
+			 Ty = FMA(KP2_000000000, Tu, Tv);
+			 Th = Ci[WS(csi, 2)];
+			 T4 = Cr[WS(csr, 6)];
+			 T3 = FMA(KP2_000000000, T2, T1);
+			 Td = T1 - T2;
+			 T5 = Cr[WS(csr, 2)];
+		    }
+		    {
+			 E To, Tp, Tf, Tg, T6, TA, TC;
+			 To = FMA(KP1_732050807, Tn, Tk);
+			 Ts = FNMS(KP1_732050807, Tn, Tk);
+			 Tp = FNMS(KP1_732050807, Te, Td);
+			 Tf = FMA(KP1_732050807, Te, Td);
+			 Tg = T4 - T5;
+			 T6 = FMA(KP2_000000000, T5, T4);
+			 TA = FMA(KP1_732050807, Tz, Ty);
+			 TC = FNMS(KP1_732050807, Tz, Ty);
+			 {
+			      E Tt, T7, Ti, Tq, Tj, TB, Tx;
+			      Tt = T3 - T6;
+			      T7 = T3 + T6;
+			      Ti = FNMS(KP1_732050807, Th, Tg);
+			      Tq = FMA(KP1_732050807, Th, Tg);
+			      R0[0] = FMA(KP2_000000000, Tc, T7);
+			      R0[WS(rs, 3)] = FNMS(KP2_000000000, Tc, T7);
+			      Tj = Tf + Ti;
+			      TB = Tf - Ti;
+			      Tr = Tp + Tq;
+			      Tx = Tp - Tq;
+			      R1[WS(rs, 5)] = TB + TC;
+			      R1[WS(rs, 2)] = TB - TC;
+			      R0[WS(rs, 4)] = Tj - To;
+			      R0[WS(rs, 1)] = Tj + To;
+			      R1[WS(rs, 3)] = Tx + TA;
+			      R1[0] = Tx - TA;
+			      R1[WS(rs, 4)] = FNMS(KP2_000000000, Tw, Tt);
+			      R1[WS(rs, 1)] = FMA(KP2_000000000, Tw, Tt);
+			 }
+		    }
+	       }
+	       R0[WS(rs, 2)] = Tr - Ts;
+	       R0[WS(rs, 5)] = Tr + Ts;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cb_12", {22, 0, 16, 0}, &GENUS };
+
+void X(codelet_r2cb_12) (planner *p) {
+     X(kr2c_register) (p, r2cb_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -name r2cb_12 -include r2cb.h */
+
+/*
+ * This function contains 38 FP additions, 10 FP multiplications,
+ * (or, 34 additions, 6 multiplications, 4 fused multiply/add),
+ * 25 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E T8, Tb, Tm, TA, Tw, Tx, Tp, TB, T3, Tr, Tg, T6, Ts, Tk;
+	       {
+		    E T9, Ta, Tn, To;
+		    T8 = Cr[WS(csr, 3)];
+		    T9 = Cr[WS(csr, 5)];
+		    Ta = Cr[WS(csr, 1)];
+		    Tb = T9 + Ta;
+		    Tm = FMS(KP2_000000000, T8, Tb);
+		    TA = KP1_732050807 * (T9 - Ta);
+		    Tw = Ci[WS(csi, 3)];
+		    Tn = Ci[WS(csi, 5)];
+		    To = Ci[WS(csi, 1)];
+		    Tx = Tn + To;
+		    Tp = KP1_732050807 * (Tn - To);
+		    TB = FMA(KP2_000000000, Tw, Tx);
+	       }
+	       {
+		    E Tf, T1, T2, Td, Te;
+		    Te = Ci[WS(csi, 4)];
+		    Tf = KP1_732050807 * Te;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 4)];
+		    Td = T1 - T2;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    Tr = Td - Tf;
+		    Tg = Td + Tf;
+	       }
+	       {
+		    E Tj, T4, T5, Th, Ti;
+		    Ti = Ci[WS(csi, 2)];
+		    Tj = KP1_732050807 * Ti;
+		    T4 = Cr[WS(csr, 6)];
+		    T5 = Cr[WS(csr, 2)];
+		    Th = T4 - T5;
+		    T6 = FMA(KP2_000000000, T5, T4);
+		    Ts = Th + Tj;
+		    Tk = Th - Tj;
+	       }
+	       {
+		    E T7, Tc, Tz, TC;
+		    T7 = T3 + T6;
+		    Tc = KP2_000000000 * (T8 + Tb);
+		    R0[WS(rs, 3)] = T7 - Tc;
+		    R0[0] = T7 + Tc;
+		    {
+			 E Tl, Tq, TD, TE;
+			 Tl = Tg + Tk;
+			 Tq = Tm - Tp;
+			 R0[WS(rs, 1)] = Tl - Tq;
+			 R0[WS(rs, 4)] = Tl + Tq;
+			 TD = Tg - Tk;
+			 TE = TB - TA;
+			 R1[WS(rs, 2)] = TD - TE;
+			 R1[WS(rs, 5)] = TD + TE;
+		    }
+		    Tz = Tr - Ts;
+		    TC = TA + TB;
+		    R1[0] = Tz - TC;
+		    R1[WS(rs, 3)] = Tz + TC;
+		    {
+			 E Tv, Ty, Tt, Tu;
+			 Tv = T3 - T6;
+			 Ty = KP2_000000000 * (Tw - Tx);
+			 R1[WS(rs, 4)] = Tv - Ty;
+			 R1[WS(rs, 1)] = Tv + Ty;
+			 Tt = Tr + Ts;
+			 Tu = Tm + Tp;
+			 R0[WS(rs, 5)] = Tt - Tu;
+			 R0[WS(rs, 2)] = Tt + Tu;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cb_12", {34, 6, 4, 0}, &GENUS };
+
+void X(codelet_r2cb_12) (planner *p) {
+     X(kr2c_register) (p, r2cb_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3181 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 128 -name r2cb_128 -include r2cb.h */
+
+/*
+ * This function contains 956 FP additions, 540 FP multiplications,
+ * (or, 416 additions, 0 multiplications, 540 fused multiply/add),
+ * 242 stack variables, 36 constants, and 256 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_128(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_715457220, +1.715457220000544139804539968569540274084981599);
+     DK(KP1_606415062, +1.606415062961289819613353025926283847759138854);
+     DK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DK(KP1_978353019, +1.978353019929561946903347476032486127967379067);
+     DK(KP1_940062506, +1.940062506389087985207968414572200502913731924);
+     DK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP1_807978586, +1.807978586246886663172400594461074097420264050);
+     DK(KP1_481902250, +1.481902250709918182351233794990325459457910619);
+     DK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DK(KP1_997590912, +1.997590912410344785429543209518201388886407229);
+     DK(KP1_883088130, +1.883088130366041556825018805199004714371179592);
+     DK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(512, rs), MAKE_VOLATILE_STRIDE(512, csr), MAKE_VOLATILE_STRIDE(512, csi)) {
+	       E T9H, T9I, T9X, T9Y;
+	       {
+		    E Tdr, T9, Tcl, Ta9, T6b, T2d, T91, T7j, Tg, Tds, Tcm, Tae, T92, T7m, T6c;
+		    E T2o, Tdu, Tw, Tco, Tap, TeM, Tdx, T6f, T2G, T6e, T2P, T94, T7t, Tcp, Tak;
+		    E T95, T7q, TdM, T1i, TcL, TbD, Tf0, Te6, T6q, T42, T6B, T5t, T9r, T8j, TcA;
+		    E TaY, T9g, T7S, TdA, TM, Tcv, TaN, TeP, TdI, T6i, T38, T6l, T3F, T9b, T7J;
+		    E Tcs, Taw, T98, T7y, T1N, TeW, T6x, T4H, Te8, TdV, T6w, T4Q, T9j, T86, TcO;
+		    E TcI, T9k, T83, TbI, Tbl, T22, TeV, Te0, Te9, T58, T6u, T6t, T5h, T9m, T8d;
+		    E TcP, TcF, T9n, T8a, TbJ, Tbw, Te3, T1x, TcB, TbG, Tf1, TdP, T6C, T4p, T6r;
+		    E T5w, T9h, T8m, TcM, Tb9, T9s, T7Z, TaB, TaG, TdF, T11, Tct, TaQ, TeQ, TdD;
+		    E T6m, T3v, T7B, T7E, T6j, T3I, T99, T7M;
+		    {
+			 E TaU, TaX, T7Q, T7R, Tbk, Tbf;
+			 {
+			      E Td, T2e, Tc, Tab, T2m, Te, T2f, T2g;
+			      {
+				   E T7h, T27, T2c, T7i;
+				   {
+					E T4, T26, T29, T25, T3, T28, T8, T2a;
+					T4 = Cr[WS(csr, 32)];
+					T26 = Ci[WS(csi, 32)];
+					{
+					     E T1, T2, T6, T7;
+					     T1 = Cr[0];
+					     T2 = Cr[WS(csr, 64)];
+					     T6 = Cr[WS(csr, 16)];
+					     T7 = Cr[WS(csr, 48)];
+					     T29 = Ci[WS(csi, 16)];
+					     T25 = T1 - T2;
+					     T3 = T1 + T2;
+					     T28 = T6 - T7;
+					     T8 = T6 + T7;
+					     T2a = Ci[WS(csi, 48)];
+					}
+					{
+					     E Ta7, T5, Ta8, T2b;
+					     Ta7 = FNMS(KP2_000000000, T4, T3);
+					     T5 = FMA(KP2_000000000, T4, T3);
+					     T7h = FMA(KP2_000000000, T26, T25);
+					     T27 = FNMS(KP2_000000000, T26, T25);
+					     Ta8 = T29 - T2a;
+					     T2b = T29 + T2a;
+					     Tdr = FNMS(KP2_000000000, T8, T5);
+					     T9 = FMA(KP2_000000000, T8, T5);
+					     Tcl = FMA(KP2_000000000, Ta8, Ta7);
+					     Ta9 = FNMS(KP2_000000000, Ta8, Ta7);
+					     T2c = T28 - T2b;
+					     T7i = T28 + T2b;
+					}
+				   }
+				   {
+					E Ta, Tb, T2k, T2l;
+					Ta = Cr[WS(csr, 8)];
+					T6b = FNMS(KP1_414213562, T2c, T27);
+					T2d = FMA(KP1_414213562, T2c, T27);
+					T91 = FMA(KP1_414213562, T7i, T7h);
+					T7j = FNMS(KP1_414213562, T7i, T7h);
+					Tb = Cr[WS(csr, 56)];
+					T2k = Ci[WS(csi, 8)];
+					T2l = Ci[WS(csi, 56)];
+					Td = Cr[WS(csr, 40)];
+					T2e = Ta - Tb;
+					Tc = Ta + Tb;
+					Tab = T2k - T2l;
+					T2m = T2k + T2l;
+					Te = Cr[WS(csr, 24)];
+					T2f = Ci[WS(csi, 40)];
+					T2g = Ci[WS(csi, 24)];
+				   }
+			      }
+			      {
+				   E Tag, Taj, T7o, T7p;
+				   {
+					E T2q, Tk, Tam, T2K, T2H, Tn, Tan, T2t, Tu, Tah, T2E, T2N, Tr, T2v, T2y;
+					E Tai;
+					{
+					     E Tl, Tm, T2r, T2s;
+					     {
+						  E Ti, Tj, T2j, Tf, T2I, T2J;
+						  Ti = Cr[WS(csr, 4)];
+						  T2j = Td - Te;
+						  Tf = Td + Te;
+						  {
+						       E Tac, T2h, T7k, T2n;
+						       Tac = T2f - T2g;
+						       T2h = T2f + T2g;
+						       T7k = T2m - T2j;
+						       T2n = T2j + T2m;
+						       {
+							    E Taa, Tad, T7l, T2i;
+							    Taa = Tc - Tf;
+							    Tg = Tc + Tf;
+							    Tad = Tab - Tac;
+							    Tds = Tac + Tab;
+							    T7l = T2e + T2h;
+							    T2i = T2e - T2h;
+							    Tcm = Taa + Tad;
+							    Tae = Taa - Tad;
+							    T92 = FMA(KP414213562, T7k, T7l);
+							    T7m = FNMS(KP414213562, T7l, T7k);
+							    T6c = FMA(KP414213562, T2i, T2n);
+							    T2o = FNMS(KP414213562, T2n, T2i);
+							    Tj = Cr[WS(csr, 60)];
+						       }
+						  }
+						  T2I = Ci[WS(csi, 4)];
+						  T2J = Ci[WS(csi, 60)];
+						  Tl = Cr[WS(csr, 36)];
+						  T2q = Ti - Tj;
+						  Tk = Ti + Tj;
+						  Tam = T2I - T2J;
+						  T2K = T2I + T2J;
+						  Tm = Cr[WS(csr, 28)];
+					     }
+					     T2r = Ci[WS(csi, 36)];
+					     T2s = Ci[WS(csi, 28)];
+					     {
+						  E Ts, Tt, T2B, T2C;
+						  Ts = Cr[WS(csr, 12)];
+						  T2H = Tl - Tm;
+						  Tn = Tl + Tm;
+						  Tan = T2r - T2s;
+						  T2t = T2r + T2s;
+						  Tt = Cr[WS(csr, 52)];
+						  T2B = Ci[WS(csi, 12)];
+						  T2C = Ci[WS(csi, 52)];
+						  {
+						       E Tp, T2A, T2D, Tq, T2w, T2x;
+						       Tp = Cr[WS(csr, 20)];
+						       Tu = Ts + Tt;
+						       T2A = Ts - Tt;
+						       Tah = T2C - T2B;
+						       T2D = T2B + T2C;
+						       Tq = Cr[WS(csr, 44)];
+						       T2w = Ci[WS(csi, 20)];
+						       T2x = Ci[WS(csi, 44)];
+						       T2E = T2A - T2D;
+						       T2N = T2A + T2D;
+						       Tr = Tp + Tq;
+						       T2v = Tp - Tq;
+						       T2y = T2w + T2x;
+						       Tai = T2w - T2x;
+						  }
+					     }
+					}
+					{
+					     E T2M, Tdv, Tdw, T2u, T2F, T7s, T7r, T2L, T2O;
+					     {
+						  E To, T2z, Tv, Tal, Tao;
+						  Tag = Tk - Tn;
+						  To = Tk + Tn;
+						  T2M = T2v + T2y;
+						  T2z = T2v - T2y;
+						  Tv = Tr + Tu;
+						  Tal = Tr - Tu;
+						  Tao = Tam - Tan;
+						  Tdv = Tan + Tam;
+						  Tdu = To - Tv;
+						  Tw = To + Tv;
+						  Tco = Tao - Tal;
+						  Tap = Tal + Tao;
+						  Tdw = Tai + Tah;
+						  Taj = Tah - Tai;
+						  T7o = T2q + T2t;
+						  T2u = T2q - T2t;
+						  T2F = T2z + T2E;
+						  T7s = T2E - T2z;
+					     }
+					     T7r = T2K - T2H;
+					     T2L = T2H + T2K;
+					     TeM = Tdw + Tdv;
+					     Tdx = Tdv - Tdw;
+					     T6f = FNMS(KP707106781, T2F, T2u);
+					     T2G = FMA(KP707106781, T2F, T2u);
+					     T2O = T2M - T2N;
+					     T7p = T2M + T2N;
+					     T6e = FNMS(KP707106781, T2O, T2L);
+					     T2P = FMA(KP707106781, T2O, T2L);
+					     T94 = FMA(KP707106781, T7s, T7r);
+					     T7t = FNMS(KP707106781, T7s, T7r);
+					}
+				   }
+				   {
+					E T3M, T16, TbA, T5o, T5l, T19, TbB, T3P, T1g, TaV, T40, T5r, T1d, T3R, T3U;
+					E TaW;
+					{
+					     E T17, T18, T3N, T3O;
+					     {
+						  E T14, T15, T5m, T5n;
+						  T14 = Cr[WS(csr, 1)];
+						  Tcp = Tag - Taj;
+						  Tak = Tag + Taj;
+						  T95 = FMA(KP707106781, T7p, T7o);
+						  T7q = FNMS(KP707106781, T7p, T7o);
+						  T15 = Cr[WS(csr, 63)];
+						  T5m = Ci[WS(csi, 1)];
+						  T5n = Ci[WS(csi, 63)];
+						  T17 = Cr[WS(csr, 33)];
+						  T3M = T14 - T15;
+						  T16 = T14 + T15;
+						  TbA = T5m - T5n;
+						  T5o = T5m + T5n;
+						  T18 = Cr[WS(csr, 31)];
+					     }
+					     T3N = Ci[WS(csi, 33)];
+					     T3O = Ci[WS(csi, 31)];
+					     {
+						  E T1e, T1f, T3X, T3Y;
+						  T1e = Cr[WS(csr, 15)];
+						  T5l = T17 - T18;
+						  T19 = T17 + T18;
+						  TbB = T3N - T3O;
+						  T3P = T3N + T3O;
+						  T1f = Cr[WS(csr, 49)];
+						  T3X = Ci[WS(csi, 15)];
+						  T3Y = Ci[WS(csi, 49)];
+						  {
+						       E T1b, T3W, T3Z, T1c, T3S, T3T;
+						       T1b = Cr[WS(csr, 17)];
+						       T1g = T1e + T1f;
+						       T3W = T1e - T1f;
+						       TaV = T3Y - T3X;
+						       T3Z = T3X + T3Y;
+						       T1c = Cr[WS(csr, 47)];
+						       T3S = Ci[WS(csi, 17)];
+						       T3T = Ci[WS(csi, 47)];
+						       T40 = T3W - T3Z;
+						       T5r = T3W + T3Z;
+						       T1d = T1b + T1c;
+						       T3R = T1b - T1c;
+						       T3U = T3S + T3T;
+						       TaW = T3S - T3T;
+						  }
+					     }
+					}
+					{
+					     E T5q, Te4, Te5, T3Q, T41, T8i, T8h, T5p, T5s;
+					     {
+						  E T1a, T3V, T1h, Tbz, TbC;
+						  TaU = T16 - T19;
+						  T1a = T16 + T19;
+						  T5q = T3R + T3U;
+						  T3V = T3R - T3U;
+						  T1h = T1d + T1g;
+						  Tbz = T1d - T1g;
+						  TbC = TbA - TbB;
+						  Te4 = TbB + TbA;
+						  TdM = T1a - T1h;
+						  T1i = T1a + T1h;
+						  TcL = TbC - Tbz;
+						  TbD = Tbz + TbC;
+						  Te5 = TaW + TaV;
+						  TaX = TaV - TaW;
+						  T7Q = T3M + T3P;
+						  T3Q = T3M - T3P;
+						  T41 = T3V + T40;
+						  T8i = T40 - T3V;
+					     }
+					     T8h = T5o - T5l;
+					     T5p = T5l + T5o;
+					     Tf0 = Te5 + Te4;
+					     Te6 = Te4 - Te5;
+					     T6q = FNMS(KP707106781, T41, T3Q);
+					     T42 = FMA(KP707106781, T41, T3Q);
+					     T5s = T5q - T5r;
+					     T7R = T5q + T5r;
+					     T6B = FNMS(KP707106781, T5s, T5p);
+					     T5t = FMA(KP707106781, T5s, T5p);
+					     T9r = FMA(KP707106781, T8i, T8h);
+					     T8j = FNMS(KP707106781, T8i, T8h);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E Tas, Tav, T7w, T7x;
+			      {
+				   E T2S, TA, TaK, T3A, T3x, TD, TaL, T2V, TK, Tat, T36, T3D, TH, T2X, T30;
+				   E Tau;
+				   {
+					E TB, TC, T2T, T2U;
+					{
+					     E Ty, Tz, T3y, T3z;
+					     Ty = Cr[WS(csr, 2)];
+					     TcA = TaU - TaX;
+					     TaY = TaU + TaX;
+					     T9g = FMA(KP707106781, T7R, T7Q);
+					     T7S = FNMS(KP707106781, T7R, T7Q);
+					     Tz = Cr[WS(csr, 62)];
+					     T3y = Ci[WS(csi, 2)];
+					     T3z = Ci[WS(csi, 62)];
+					     TB = Cr[WS(csr, 34)];
+					     T2S = Ty - Tz;
+					     TA = Ty + Tz;
+					     TaK = T3y - T3z;
+					     T3A = T3y + T3z;
+					     TC = Cr[WS(csr, 30)];
+					}
+					T2T = Ci[WS(csi, 34)];
+					T2U = Ci[WS(csi, 30)];
+					{
+					     E TI, TJ, T33, T34;
+					     TI = Cr[WS(csr, 14)];
+					     T3x = TB - TC;
+					     TD = TB + TC;
+					     TaL = T2T - T2U;
+					     T2V = T2T + T2U;
+					     TJ = Cr[WS(csr, 50)];
+					     T33 = Ci[WS(csi, 14)];
+					     T34 = Ci[WS(csi, 50)];
+					     {
+						  E TF, T32, T35, TG, T2Y, T2Z;
+						  TF = Cr[WS(csr, 18)];
+						  TK = TI + TJ;
+						  T32 = TI - TJ;
+						  Tat = T34 - T33;
+						  T35 = T33 + T34;
+						  TG = Cr[WS(csr, 46)];
+						  T2Y = Ci[WS(csi, 18)];
+						  T2Z = Ci[WS(csi, 46)];
+						  T36 = T32 - T35;
+						  T3D = T32 + T35;
+						  TH = TF + TG;
+						  T2X = TF - TG;
+						  T30 = T2Y + T2Z;
+						  Tau = T2Y - T2Z;
+					     }
+					}
+				   }
+				   {
+					E T3C, TdG, TdH, T2W, T37, T7I, T7H, T3B, T3E;
+					{
+					     E TE, T31, TL, TaJ, TaM;
+					     Tas = TA - TD;
+					     TE = TA + TD;
+					     T3C = T2X + T30;
+					     T31 = T2X - T30;
+					     TL = TH + TK;
+					     TaJ = TH - TK;
+					     TaM = TaK - TaL;
+					     TdG = TaL + TaK;
+					     TdA = TE - TL;
+					     TM = TE + TL;
+					     Tcv = TaM - TaJ;
+					     TaN = TaJ + TaM;
+					     TdH = Tau + Tat;
+					     Tav = Tat - Tau;
+					     T7w = T2S + T2V;
+					     T2W = T2S - T2V;
+					     T37 = T31 + T36;
+					     T7I = T36 - T31;
+					}
+					T7H = T3A - T3x;
+					T3B = T3x + T3A;
+					TeP = TdH + TdG;
+					TdI = TdG - TdH;
+					T6i = FNMS(KP707106781, T37, T2W);
+					T38 = FMA(KP707106781, T37, T2W);
+					T3E = T3C - T3D;
+					T7x = T3C + T3D;
+					T6l = FNMS(KP707106781, T3E, T3B);
+					T3F = FMA(KP707106781, T3E, T3B);
+					T9b = FMA(KP707106781, T7I, T7H);
+					T7J = FNMS(KP707106781, T7I, T7H);
+				   }
+			      }
+			      {
+				   E T4r, T4I, T1F, Tbb, T4u, T4L, Tbj, TdS, T1I, Tbd, T4N, T4A, T4B, T1L, Tbc;
+				   E T4E, T1M, Tbg;
+				   {
+					E T1z, T1A, T1C, T1D, Tbi, Tbh;
+					T1z = Cr[WS(csr, 5)];
+					Tcs = Tas - Tav;
+					Taw = Tas + Tav;
+					T98 = FMA(KP707106781, T7x, T7w);
+					T7y = FNMS(KP707106781, T7x, T7w);
+					T1A = Cr[WS(csr, 59)];
+					T1C = Cr[WS(csr, 37)];
+					T1D = Cr[WS(csr, 27)];
+					{
+					     E T4s, T1B, T1E, T4t, T4J, T4K;
+					     T4s = Ci[WS(csi, 37)];
+					     T4r = T1z - T1A;
+					     T1B = T1z + T1A;
+					     T4I = T1C - T1D;
+					     T1E = T1C + T1D;
+					     T4t = Ci[WS(csi, 27)];
+					     T4J = Ci[WS(csi, 5)];
+					     T4K = Ci[WS(csi, 59)];
+					     T1F = T1B + T1E;
+					     Tbb = T1B - T1E;
+					     T4u = T4s + T4t;
+					     Tbi = T4s - T4t;
+					     Tbh = T4J - T4K;
+					     T4L = T4J + T4K;
+					}
+					{
+					     E T1J, T4w, T4z, T1K, T4C, T4D;
+					     {
+						  E T1G, T1H, T4x, T4y;
+						  T1G = Cr[WS(csr, 21)];
+						  Tbj = Tbh - Tbi;
+						  TdS = Tbi + Tbh;
+						  T1H = Cr[WS(csr, 43)];
+						  T4x = Ci[WS(csi, 21)];
+						  T4y = Ci[WS(csi, 43)];
+						  T1J = Cr[WS(csr, 11)];
+						  T4w = T1G - T1H;
+						  T1I = T1G + T1H;
+						  Tbd = T4x - T4y;
+						  T4z = T4x + T4y;
+						  T1K = Cr[WS(csr, 53)];
+						  T4C = Ci[WS(csi, 11)];
+						  T4D = Ci[WS(csi, 53)];
+					     }
+					     T4N = T4w + T4z;
+					     T4A = T4w - T4z;
+					     T4B = T1J - T1K;
+					     T1L = T1J + T1K;
+					     Tbc = T4D - T4C;
+					     T4E = T4C + T4D;
+					}
+				   }
+				   T1M = T1I + T1L;
+				   Tbg = T1I - T1L;
+				   {
+					E TdT, Tbe, T4F, T4O;
+					TdT = Tbd + Tbc;
+					Tbe = Tbc - Tbd;
+					T4F = T4B - T4E;
+					T4O = T4B + T4E;
+					{
+					     E TdR, TdU, T81, T4v, T4G, T85;
+					     TdR = T1F - T1M;
+					     T1N = T1F + T1M;
+					     TeW = TdT + TdS;
+					     TdU = TdS - TdT;
+					     T81 = T4r + T4u;
+					     T4v = T4r - T4u;
+					     T4G = T4A + T4F;
+					     T85 = T4F - T4A;
+					     {
+						  E T84, T4M, T4P, T82, TcG, TcH;
+						  T84 = T4L - T4I;
+						  T4M = T4I + T4L;
+						  T6x = FNMS(KP707106781, T4G, T4v);
+						  T4H = FMA(KP707106781, T4G, T4v);
+						  Te8 = TdR + TdU;
+						  TdV = TdR - TdU;
+						  T4P = T4N - T4O;
+						  T82 = T4N + T4O;
+						  Tbk = Tbg + Tbj;
+						  TcG = Tbj - Tbg;
+						  T6w = FNMS(KP707106781, T4P, T4M);
+						  T4Q = FMA(KP707106781, T4P, T4M);
+						  T9j = FMA(KP707106781, T85, T84);
+						  T86 = FNMS(KP707106781, T85, T84);
+						  TcH = Tbb - Tbe;
+						  Tbf = Tbb + Tbe;
+						  TcO = FMA(KP414213562, TcG, TcH);
+						  TcI = FNMS(KP414213562, TcH, TcG);
+						  T9k = FMA(KP707106781, T82, T81);
+						  T83 = FNMS(KP707106781, T82, T81);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T88, T89, Tbv, Tbq;
+			      {
+				   E T4S, T59, T4V, Tbm, T1U, T5c, TdX, Tbu, T1X, T53, Tbo, T52, T20, T54, T5e;
+				   E T51;
+				   {
+					E T1R, T1Q, T1S, T1O, T1P;
+					T1O = Cr[WS(csr, 3)];
+					T1P = Cr[WS(csr, 61)];
+					T1R = Cr[WS(csr, 29)];
+					TbI = FMA(KP414213562, Tbf, Tbk);
+					Tbl = FNMS(KP414213562, Tbk, Tbf);
+					T1Q = T1O + T1P;
+					T4S = T1O - T1P;
+					T1S = Cr[WS(csr, 35)];
+					{
+					     E Tbt, Tbs, T4X, T50;
+					     {
+						  E T5a, T5b, T4T, T4U, T1T;
+						  T4T = Ci[WS(csi, 29)];
+						  T4U = Ci[WS(csi, 35)];
+						  T1T = T1R + T1S;
+						  T59 = T1R - T1S;
+						  T5a = Ci[WS(csi, 3)];
+						  Tbt = T4T - T4U;
+						  T4V = T4T + T4U;
+						  T5b = Ci[WS(csi, 61)];
+						  Tbm = T1Q - T1T;
+						  T1U = T1Q + T1T;
+						  T5c = T5a + T5b;
+						  Tbs = T5b - T5a;
+					     }
+					     {
+						  E T4Y, T4Z, T1V, T1W, T1Y, T1Z;
+						  T1V = Cr[WS(csr, 13)];
+						  T1W = Cr[WS(csr, 51)];
+						  TdX = Tbt + Tbs;
+						  Tbu = Tbs - Tbt;
+						  T4Y = Ci[WS(csi, 13)];
+						  T4X = T1V - T1W;
+						  T1X = T1V + T1W;
+						  T4Z = Ci[WS(csi, 51)];
+						  T1Y = Cr[WS(csr, 19)];
+						  T1Z = Cr[WS(csr, 45)];
+						  T53 = Ci[WS(csi, 19)];
+						  Tbo = T4Y - T4Z;
+						  T50 = T4Y + T4Z;
+						  T52 = T1Y - T1Z;
+						  T20 = T1Y + T1Z;
+						  T54 = Ci[WS(csi, 45)];
+					     }
+					     T5e = T4X + T50;
+					     T51 = T4X - T50;
+					}
+				   }
+				   {
+					E T21, Tbr, T55, Tbn;
+					T21 = T1X + T20;
+					Tbr = T1X - T20;
+					T55 = T53 + T54;
+					Tbn = T54 - T53;
+					{
+					     E T4W, TdW, Tbp, T5f, TdZ, T57, T8c, TdY, T56;
+					     T88 = T4S + T4V;
+					     T4W = T4S - T4V;
+					     T22 = T1U + T21;
+					     TdW = T1U - T21;
+					     TdY = Tbo + Tbn;
+					     Tbp = Tbn - Tbo;
+					     T56 = T52 - T55;
+					     T5f = T52 + T55;
+					     TeV = TdY + TdX;
+					     TdZ = TdX - TdY;
+					     T57 = T51 + T56;
+					     T8c = T56 - T51;
+					     {
+						  E T8b, T5d, T5g, TcD, TcE;
+						  T8b = T59 + T5c;
+						  T5d = T59 - T5c;
+						  T5g = T5e - T5f;
+						  T89 = T5e + T5f;
+						  Te0 = TdW + TdZ;
+						  Te9 = TdZ - TdW;
+						  T58 = FMA(KP707106781, T57, T4W);
+						  T6u = FNMS(KP707106781, T57, T4W);
+						  T6t = FNMS(KP707106781, T5g, T5d);
+						  T5h = FMA(KP707106781, T5g, T5d);
+						  Tbv = Tbr + Tbu;
+						  TcD = Tbu - Tbr;
+						  TcE = Tbm - Tbp;
+						  Tbq = Tbm + Tbp;
+						  T9m = FNMS(KP707106781, T8c, T8b);
+						  T8d = FMA(KP707106781, T8c, T8b);
+						  TcP = FNMS(KP414213562, TcD, TcE);
+						  TcF = FMA(KP414213562, TcE, TcD);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E Tb3, Tb8, T7V, T7Y;
+				   {
+					E T7T, T4c, TaZ, T1p, TdO, Tb2, T7U, T47, T1t, T4e, T1s, Tb5, T4m, T1u, T4f;
+					E T4g;
+					{
+					     E T1m, T43, T1l, Tb0, T4b, T1n, T44, T45;
+					     {
+						  E T1j, T1k, T49, T4a;
+						  T1j = Cr[WS(csr, 9)];
+						  T9n = FMA(KP707106781, T89, T88);
+						  T8a = FNMS(KP707106781, T89, T88);
+						  TbJ = FNMS(KP414213562, Tbq, Tbv);
+						  Tbw = FMA(KP414213562, Tbv, Tbq);
+						  T1k = Cr[WS(csr, 55)];
+						  T49 = Ci[WS(csi, 9)];
+						  T4a = Ci[WS(csi, 55)];
+						  T1m = Cr[WS(csr, 41)];
+						  T43 = T1j - T1k;
+						  T1l = T1j + T1k;
+						  Tb0 = T49 - T4a;
+						  T4b = T49 + T4a;
+						  T1n = Cr[WS(csr, 23)];
+						  T44 = Ci[WS(csi, 41)];
+						  T45 = Ci[WS(csi, 23)];
+					     }
+					     {
+						  E T1q, T1r, T4k, T4l;
+						  T1q = Cr[WS(csr, 7)];
+						  {
+						       E T48, T1o, Tb1, T46;
+						       T48 = T1m - T1n;
+						       T1o = T1m + T1n;
+						       Tb1 = T44 - T45;
+						       T46 = T44 + T45;
+						       T7T = T4b - T48;
+						       T4c = T48 + T4b;
+						       TaZ = T1l - T1o;
+						       T1p = T1l + T1o;
+						       TdO = Tb1 + Tb0;
+						       Tb2 = Tb0 - Tb1;
+						       T7U = T43 + T46;
+						       T47 = T43 - T46;
+						       T1r = Cr[WS(csr, 57)];
+						  }
+						  T4k = Ci[WS(csi, 7)];
+						  T4l = Ci[WS(csi, 57)];
+						  T1t = Cr[WS(csr, 25)];
+						  T4e = T1q - T1r;
+						  T1s = T1q + T1r;
+						  Tb5 = T4l - T4k;
+						  T4m = T4k + T4l;
+						  T1u = Cr[WS(csr, 39)];
+						  T4f = Ci[WS(csi, 25)];
+						  T4g = Ci[WS(csi, 39)];
+					     }
+					}
+					{
+					     E T7W, TdN, T7X, T5u, T4d, T4o, T5v, T8k, T8l;
+					     {
+						  E T4n, T1w, T4i, TbE, TbF, Tb4, Tb7;
+						  {
+						       E T4j, T1v, Tb6, T4h;
+						       T4j = T1t - T1u;
+						       T1v = T1t + T1u;
+						       Tb6 = T4f - T4g;
+						       T4h = T4f + T4g;
+						       T7W = T4j + T4m;
+						       T4n = T4j - T4m;
+						       Tb4 = T1s - T1v;
+						       T1w = T1s + T1v;
+						       TdN = Tb6 + Tb5;
+						       Tb7 = Tb5 - Tb6;
+						       T7X = T4e + T4h;
+						       T4i = T4e - T4h;
+						  }
+						  Tb3 = TaZ - Tb2;
+						  TbE = TaZ + Tb2;
+						  TbF = Tb7 - Tb4;
+						  Tb8 = Tb4 + Tb7;
+						  Te3 = T1p - T1w;
+						  T1x = T1p + T1w;
+						  TcB = TbE - TbF;
+						  TbG = TbE + TbF;
+						  T5u = FMA(KP414213562, T47, T4c);
+						  T4d = FNMS(KP414213562, T4c, T47);
+						  T4o = FMA(KP414213562, T4n, T4i);
+						  T5v = FNMS(KP414213562, T4i, T4n);
+					     }
+					     Tf1 = TdO + TdN;
+					     TdP = TdN - TdO;
+					     T6C = T4o - T4d;
+					     T4p = T4d + T4o;
+					     T7V = FNMS(KP414213562, T7U, T7T);
+					     T8k = FMA(KP414213562, T7T, T7U);
+					     T8l = FMA(KP414213562, T7W, T7X);
+					     T7Y = FNMS(KP414213562, T7X, T7W);
+					     T6r = T5u - T5v;
+					     T5w = T5u + T5v;
+					     T9h = T8k + T8l;
+					     T8m = T8k - T8l;
+					}
+				   }
+				   {
+					E T7z, T3i, Tax, TT, TdC, TaA, T7A, T3d, TX, T3k, TW, TaD, T3s, TY, T3l;
+					E T3m;
+					{
+					     E TQ, T39, TP, Tay, T3h, TR, T3a, T3b;
+					     {
+						  E TN, TO, T3f, T3g;
+						  TN = Cr[WS(csr, 10)];
+						  TcM = Tb8 - Tb3;
+						  Tb9 = Tb3 + Tb8;
+						  T9s = T7V - T7Y;
+						  T7Z = T7V + T7Y;
+						  TO = Cr[WS(csr, 54)];
+						  T3f = Ci[WS(csi, 10)];
+						  T3g = Ci[WS(csi, 54)];
+						  TQ = Cr[WS(csr, 42)];
+						  T39 = TN - TO;
+						  TP = TN + TO;
+						  Tay = T3f - T3g;
+						  T3h = T3f + T3g;
+						  TR = Cr[WS(csr, 22)];
+						  T3a = Ci[WS(csi, 42)];
+						  T3b = Ci[WS(csi, 22)];
+					     }
+					     {
+						  E TU, TV, T3q, T3r;
+						  TU = Cr[WS(csr, 6)];
+						  {
+						       E T3e, TS, Taz, T3c;
+						       T3e = TQ - TR;
+						       TS = TQ + TR;
+						       Taz = T3a - T3b;
+						       T3c = T3a + T3b;
+						       T7z = T3h - T3e;
+						       T3i = T3e + T3h;
+						       Tax = TP - TS;
+						       TT = TP + TS;
+						       TdC = Taz + Tay;
+						       TaA = Tay - Taz;
+						       T7A = T39 + T3c;
+						       T3d = T39 - T3c;
+						       TV = Cr[WS(csr, 58)];
+						  }
+						  T3q = Ci[WS(csi, 6)];
+						  T3r = Ci[WS(csi, 58)];
+						  TX = Cr[WS(csr, 26)];
+						  T3k = TU - TV;
+						  TW = TU + TV;
+						  TaD = T3r - T3q;
+						  T3s = T3q + T3r;
+						  TY = Cr[WS(csr, 38)];
+						  T3l = Ci[WS(csi, 26)];
+						  T3m = Ci[WS(csi, 38)];
+					     }
+					}
+					{
+					     E T7C, TdB, T7D, T3G, T3j, T3u, T3H, T7K, T7L;
+					     {
+						  E T3t, T10, T3o, TaO, TaP, TaC, TaF;
+						  {
+						       E T3p, TZ, TaE, T3n;
+						       T3p = TX - TY;
+						       TZ = TX + TY;
+						       TaE = T3l - T3m;
+						       T3n = T3l + T3m;
+						       T7C = T3p + T3s;
+						       T3t = T3p - T3s;
+						       TaC = TW - TZ;
+						       T10 = TW + TZ;
+						       TdB = TaE + TaD;
+						       TaF = TaD - TaE;
+						       T7D = T3k + T3n;
+						       T3o = T3k - T3n;
+						  }
+						  TaB = Tax - TaA;
+						  TaO = Tax + TaA;
+						  TaP = TaF - TaC;
+						  TaG = TaC + TaF;
+						  TdF = TT - T10;
+						  T11 = TT + T10;
+						  Tct = TaO - TaP;
+						  TaQ = TaO + TaP;
+						  T3G = FMA(KP414213562, T3d, T3i);
+						  T3j = FNMS(KP414213562, T3i, T3d);
+						  T3u = FMA(KP414213562, T3t, T3o);
+						  T3H = FNMS(KP414213562, T3o, T3t);
+					     }
+					     TeQ = TdC + TdB;
+					     TdD = TdB - TdC;
+					     T6m = T3u - T3j;
+					     T3v = T3j + T3u;
+					     T7B = FNMS(KP414213562, T7A, T7z);
+					     T7K = FMA(KP414213562, T7z, T7A);
+					     T7L = FMA(KP414213562, T7C, T7D);
+					     T7E = FNMS(KP414213562, T7D, T7C);
+					     T6j = T3G - T3H;
+					     T3I = T3G + T3H;
+					     T99 = T7K + T7L;
+					     T7M = T7K - T7L;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E Tcw, T9c, T7F, Tev, Teu, TeD, Tep, TeG, Tez, TeE, Tes;
+			 {
+			      E TbX, TbY, Tc7, TbP, Tar, Tc5, Tc1, Tc0, Tc4, Tba, TbS, TbL, TbQ, TaS, Tbx;
+			      E Tc8;
+			      {
+				   E TeO, TaH, TeR, TeL, TeU, TeZ, Tf2, TeX, Tfh, Tfn, Tfo, Tfm;
+				   {
+					E T12, Tfg, Tfj, Tx, Tff, T24, Tfi, Tfk, Th, T1y, T23;
+					TeO = TM - T11;
+					T12 = TM + T11;
+					Tcw = TaG - TaB;
+					TaH = TaB + TaG;
+					T9c = T7B - T7E;
+					T7F = T7B + T7E;
+					Tfg = TeQ + TeP;
+					TeR = TeP - TeQ;
+					TeL = FNMS(KP2_000000000, Tg, T9);
+					Th = FMA(KP2_000000000, Tg, T9);
+					T1y = T1i + T1x;
+					TeU = T1i - T1x;
+					TeZ = T1N - T22;
+					T23 = T1N + T22;
+					Tfj = Tf1 + Tf0;
+					Tf2 = Tf0 - Tf1;
+					Tx = FMA(KP2_000000000, Tw, Th);
+					Tff = FNMS(KP2_000000000, Tw, Th);
+					T24 = T1y + T23;
+					Tfi = T1y - T23;
+					TeX = TeV - TeW;
+					Tfk = TeW + TeV;
+					{
+					     E T13, Tfp, Tfl, Tfq;
+					     T13 = FMA(KP2_000000000, T12, Tx);
+					     Tfp = FNMS(KP2_000000000, T12, Tx);
+					     Tfh = FNMS(KP2_000000000, Tfg, Tff);
+					     Tfn = FMA(KP2_000000000, Tfg, Tff);
+					     Tfl = Tfj - Tfk;
+					     Tfq = Tfk + Tfj;
+					     R0[0] = FMA(KP2_000000000, T24, T13);
+					     R0[WS(rs, 32)] = FNMS(KP2_000000000, T24, T13);
+					     R0[WS(rs, 48)] = FMA(KP2_000000000, Tfq, Tfp);
+					     R0[WS(rs, 16)] = FNMS(KP2_000000000, Tfq, Tfp);
+					     Tfo = Tfi + Tfl;
+					     Tfm = Tfi - Tfl;
+					}
+				   }
+				   {
+					E Tf7, TeN, Tfa, Tf3, Tf8, TeS;
+					R0[WS(rs, 8)] = FMA(KP1_414213562, Tfm, Tfh);
+					R0[WS(rs, 40)] = FNMS(KP1_414213562, Tfm, Tfh);
+					R0[WS(rs, 56)] = FMA(KP1_414213562, Tfo, Tfn);
+					R0[WS(rs, 24)] = FNMS(KP1_414213562, Tfo, Tfn);
+					Tf7 = FMA(KP2_000000000, TeM, TeL);
+					TeN = FNMS(KP2_000000000, TeM, TeL);
+					Tfa = Tf2 - TeZ;
+					Tf3 = TeZ + Tf2;
+					Tf8 = TeO + TeR;
+					TeS = TeO - TeR;
+					{
+					     E TbH, TbK, TaI, TaR;
+					     {
+						  E Taf, Tf9, Tfd, Tf5, TeT, Tfb, TeY, Taq;
+						  TbX = FNMS(KP1_414213562, Tae, Ta9);
+						  Taf = FMA(KP1_414213562, Tae, Ta9);
+						  Tf9 = FNMS(KP1_414213562, Tf8, Tf7);
+						  Tfd = FMA(KP1_414213562, Tf8, Tf7);
+						  Tf5 = FNMS(KP1_414213562, TeS, TeN);
+						  TeT = FMA(KP1_414213562, TeS, TeN);
+						  Tfb = TeU - TeX;
+						  TeY = TeU + TeX;
+						  Taq = FNMS(KP414213562, Tap, Tak);
+						  TbY = FMA(KP414213562, Tak, Tap);
+						  Tc7 = FNMS(KP707106781, TbG, TbD);
+						  TbH = FMA(KP707106781, TbG, TbD);
+						  {
+						       E Tfc, Tfe, Tf6, Tf4;
+						       Tfc = FNMS(KP414213562, Tfb, Tfa);
+						       Tfe = FMA(KP414213562, Tfa, Tfb);
+						       Tf6 = FMA(KP414213562, TeY, Tf3);
+						       Tf4 = FNMS(KP414213562, Tf3, TeY);
+						       TbP = FNMS(KP1_847759065, Taq, Taf);
+						       Tar = FMA(KP1_847759065, Taq, Taf);
+						       R0[WS(rs, 44)] = FMA(KP1_847759065, Tfc, Tf9);
+						       R0[WS(rs, 12)] = FNMS(KP1_847759065, Tfc, Tf9);
+						       R0[WS(rs, 60)] = FMA(KP1_847759065, Tfe, Tfd);
+						       R0[WS(rs, 28)] = FNMS(KP1_847759065, Tfe, Tfd);
+						       R0[WS(rs, 52)] = FMA(KP1_847759065, Tf6, Tf5);
+						       R0[WS(rs, 20)] = FNMS(KP1_847759065, Tf6, Tf5);
+						       R0[WS(rs, 4)] = FMA(KP1_847759065, Tf4, TeT);
+						       R0[WS(rs, 36)] = FNMS(KP1_847759065, Tf4, TeT);
+						       TbK = TbI + TbJ;
+						       Tc5 = TbI - TbJ;
+						  }
+					     }
+					     Tc1 = FNMS(KP707106781, TaH, Taw);
+					     TaI = FMA(KP707106781, TaH, Taw);
+					     TaR = FMA(KP707106781, TaQ, TaN);
+					     Tc0 = FNMS(KP707106781, TaQ, TaN);
+					     Tc4 = FNMS(KP707106781, Tb9, TaY);
+					     Tba = FMA(KP707106781, Tb9, TaY);
+					     TbS = FNMS(KP923879532, TbK, TbH);
+					     TbL = FMA(KP923879532, TbK, TbH);
+					     TbQ = FMA(KP198912367, TaI, TaR);
+					     TaS = FNMS(KP198912367, TaR, TaI);
+					     Tbx = Tbl + Tbw;
+					     Tc8 = Tbw - Tbl;
+					}
+				   }
+			      }
+			      {
+				   E Ten, Teo, Tex, Tef, Tdz, Ter, Teq, TdQ, Tei, Teb, Teg, TdK, Te1, Tey;
+				   {
+					E Te7, Tea, TdE, TdJ;
+					{
+					     E Tdt, TbR, TbV, TbN, TaT, TbT, Tby, Tdy;
+					     Ten = FMA(KP2_000000000, Tds, Tdr);
+					     Tdt = FNMS(KP2_000000000, Tds, Tdr);
+					     TbR = FNMS(KP1_961570560, TbQ, TbP);
+					     TbV = FMA(KP1_961570560, TbQ, TbP);
+					     TbN = FNMS(KP1_961570560, TaS, Tar);
+					     TaT = FMA(KP1_961570560, TaS, Tar);
+					     TbT = FNMS(KP923879532, Tbx, Tba);
+					     Tby = FMA(KP923879532, Tbx, Tba);
+					     Tdy = Tdu - Tdx;
+					     Teo = Tdu + Tdx;
+					     Tex = Te6 - Te3;
+					     Te7 = Te3 + Te6;
+					     {
+						  E TbU, TbW, TbO, TbM;
+						  TbU = FNMS(KP820678790, TbT, TbS);
+						  TbW = FMA(KP820678790, TbS, TbT);
+						  TbO = FMA(KP098491403, Tby, TbL);
+						  TbM = FNMS(KP098491403, TbL, Tby);
+						  Tef = FNMS(KP1_414213562, Tdy, Tdt);
+						  Tdz = FMA(KP1_414213562, Tdy, Tdt);
+						  R0[WS(rs, 41)] = FMA(KP1_546020906, TbU, TbR);
+						  R0[WS(rs, 9)] = FNMS(KP1_546020906, TbU, TbR);
+						  R0[WS(rs, 57)] = FMA(KP1_546020906, TbW, TbV);
+						  R0[WS(rs, 25)] = FNMS(KP1_546020906, TbW, TbV);
+						  R0[WS(rs, 49)] = FMA(KP1_990369453, TbO, TbN);
+						  R0[WS(rs, 17)] = FNMS(KP1_990369453, TbO, TbN);
+						  R0[WS(rs, 1)] = FMA(KP1_990369453, TbM, TaT);
+						  R0[WS(rs, 33)] = FNMS(KP1_990369453, TbM, TaT);
+						  Tea = Te8 + Te9;
+						  Tev = Te8 - Te9;
+					     }
+					}
+					Ter = TdA - TdD;
+					TdE = TdA + TdD;
+					TdJ = TdF + TdI;
+					Teq = TdI - TdF;
+					Teu = TdM - TdP;
+					TdQ = TdM + TdP;
+					Tei = FNMS(KP707106781, Tea, Te7);
+					Teb = FMA(KP707106781, Tea, Te7);
+					Teg = FMA(KP414213562, TdE, TdJ);
+					TdK = FNMS(KP414213562, TdJ, TdE);
+					Te1 = TdV + Te0;
+					Tey = Te0 - TdV;
+				   }
+				   {
+					E Tcd, TbZ, Tcg, Tc9, Tce, Tc2;
+					{
+					     E Teh, Tel, Ted, TdL, Tej, Te2;
+					     Teh = FNMS(KP1_847759065, Teg, Tef);
+					     Tel = FMA(KP1_847759065, Teg, Tef);
+					     Ted = FNMS(KP1_847759065, TdK, Tdz);
+					     TdL = FMA(KP1_847759065, TdK, Tdz);
+					     Tej = FNMS(KP707106781, Te1, TdQ);
+					     Te2 = FMA(KP707106781, Te1, TdQ);
+					     {
+						  E Tek, Tem, Tee, Tec;
+						  Tek = FNMS(KP668178637, Tej, Tei);
+						  Tem = FMA(KP668178637, Tei, Tej);
+						  Tee = FMA(KP198912367, Te2, Teb);
+						  Tec = FNMS(KP198912367, Teb, Te2);
+						  Tcd = FMA(KP1_847759065, TbY, TbX);
+						  TbZ = FNMS(KP1_847759065, TbY, TbX);
+						  R0[WS(rs, 42)] = FMA(KP1_662939224, Tek, Teh);
+						  R0[WS(rs, 10)] = FNMS(KP1_662939224, Tek, Teh);
+						  R0[WS(rs, 58)] = FMA(KP1_662939224, Tem, Tel);
+						  R0[WS(rs, 26)] = FNMS(KP1_662939224, Tem, Tel);
+						  R0[WS(rs, 50)] = FMA(KP1_961570560, Tee, Ted);
+						  R0[WS(rs, 18)] = FNMS(KP1_961570560, Tee, Ted);
+						  R0[WS(rs, 2)] = FMA(KP1_961570560, Tec, TdL);
+						  R0[WS(rs, 34)] = FNMS(KP1_961570560, Tec, TdL);
+					     }
+					}
+					Tcg = FMA(KP923879532, Tc8, Tc7);
+					Tc9 = FNMS(KP923879532, Tc8, Tc7);
+					Tce = FMA(KP668178637, Tc0, Tc1);
+					Tc2 = FNMS(KP668178637, Tc1, Tc0);
+					{
+					     E Tcf, Tcj, Tcb, Tc3, Tch, Tc6;
+					     Tcf = FNMS(KP1_662939224, Tce, Tcd);
+					     Tcj = FMA(KP1_662939224, Tce, Tcd);
+					     Tcb = FMA(KP1_662939224, Tc2, TbZ);
+					     Tc3 = FNMS(KP1_662939224, Tc2, TbZ);
+					     Tch = FMA(KP923879532, Tc5, Tc4);
+					     Tc6 = FNMS(KP923879532, Tc5, Tc4);
+					     {
+						  E Tci, Tck, Tcc, Tca;
+						  Tci = FNMS(KP303346683, Tch, Tcg);
+						  Tck = FMA(KP303346683, Tcg, Tch);
+						  Tcc = FMA(KP534511135, Tc6, Tc9);
+						  Tca = FNMS(KP534511135, Tc9, Tc6);
+						  TeD = FMA(KP1_414213562, Teo, Ten);
+						  Tep = FNMS(KP1_414213562, Teo, Ten);
+						  R0[WS(rs, 45)] = FMA(KP1_913880671, Tci, Tcf);
+						  R0[WS(rs, 13)] = FNMS(KP1_913880671, Tci, Tcf);
+						  R0[WS(rs, 61)] = FMA(KP1_913880671, Tck, Tcj);
+						  R0[WS(rs, 29)] = FNMS(KP1_913880671, Tck, Tcj);
+						  R0[WS(rs, 53)] = FMA(KP1_763842528, Tcc, Tcb);
+						  R0[WS(rs, 21)] = FNMS(KP1_763842528, Tcc, Tcb);
+						  R0[WS(rs, 5)] = FMA(KP1_763842528, Tca, Tc3);
+						  R0[WS(rs, 37)] = FNMS(KP1_763842528, Tca, Tc3);
+					     }
+					}
+					TeG = FMA(KP707106781, Tey, Tex);
+					Tez = FNMS(KP707106781, Tey, Tex);
+					TeE = FMA(KP414213562, Teq, Ter);
+					Tes = FNMS(KP414213562, Ter, Teq);
+				   }
+			      }
+			 }
+			 {
+			      E T5L, T5M, T61, T62;
+			      {
+				   E Td3, Td4, Tdd, TcV, Tcr, Tdb, Td7, Td6, Tda, TcC, TcY, TcR, TcW, Tcy, TcJ;
+				   E Tde;
+				   {
+					E TcN, TcQ, Tcu, Tcx;
+					{
+					     E Tcn, TeF, TeJ, TeB, Tet, TeH, Tew, Tcq;
+					     Td3 = FMA(KP1_414213562, Tcm, Tcl);
+					     Tcn = FNMS(KP1_414213562, Tcm, Tcl);
+					     TeF = FNMS(KP1_847759065, TeE, TeD);
+					     TeJ = FMA(KP1_847759065, TeE, TeD);
+					     TeB = FMA(KP1_847759065, Tes, Tep);
+					     Tet = FNMS(KP1_847759065, Tes, Tep);
+					     TeH = FMA(KP707106781, Tev, Teu);
+					     Tew = FNMS(KP707106781, Tev, Teu);
+					     Tcq = FNMS(KP414213562, Tcp, Tco);
+					     Td4 = FMA(KP414213562, Tco, Tcp);
+					     Tdd = FMA(KP707106781, TcM, TcL);
+					     TcN = FNMS(KP707106781, TcM, TcL);
+					     {
+						  E TeI, TeK, TeC, TeA;
+						  TeI = FNMS(KP198912367, TeH, TeG);
+						  TeK = FMA(KP198912367, TeG, TeH);
+						  TeC = FMA(KP668178637, Tew, Tez);
+						  TeA = FNMS(KP668178637, Tez, Tew);
+						  TcV = FMA(KP1_847759065, Tcq, Tcn);
+						  Tcr = FNMS(KP1_847759065, Tcq, Tcn);
+						  R0[WS(rs, 46)] = FMA(KP1_961570560, TeI, TeF);
+						  R0[WS(rs, 14)] = FNMS(KP1_961570560, TeI, TeF);
+						  R0[WS(rs, 62)] = FMA(KP1_961570560, TeK, TeJ);
+						  R0[WS(rs, 30)] = FNMS(KP1_961570560, TeK, TeJ);
+						  R0[WS(rs, 54)] = FMA(KP1_662939224, TeC, TeB);
+						  R0[WS(rs, 22)] = FNMS(KP1_662939224, TeC, TeB);
+						  R0[WS(rs, 6)] = FMA(KP1_662939224, TeA, Tet);
+						  R0[WS(rs, 38)] = FNMS(KP1_662939224, TeA, Tet);
+						  TcQ = TcO - TcP;
+						  Tdb = TcO + TcP;
+					     }
+					}
+					Td7 = FMA(KP707106781, Tct, Tcs);
+					Tcu = FNMS(KP707106781, Tct, Tcs);
+					Tcx = FNMS(KP707106781, Tcw, Tcv);
+					Td6 = FMA(KP707106781, Tcw, Tcv);
+					Tda = FMA(KP707106781, TcB, TcA);
+					TcC = FNMS(KP707106781, TcB, TcA);
+					TcY = FNMS(KP923879532, TcQ, TcN);
+					TcR = FMA(KP923879532, TcQ, TcN);
+					TcW = FMA(KP668178637, Tcu, Tcx);
+					Tcy = FNMS(KP668178637, Tcx, Tcu);
+					TcJ = TcF - TcI;
+					Tde = TcI + TcF;
+				   }
+				   {
+					E Tdj, Td5, Tdm, Tdf, Tdk, Td8;
+					{
+					     E TcX, Td1, TcT, Tcz, TcZ, TcK;
+					     TcX = FNMS(KP1_662939224, TcW, TcV);
+					     Td1 = FMA(KP1_662939224, TcW, TcV);
+					     TcT = FNMS(KP1_662939224, Tcy, Tcr);
+					     Tcz = FMA(KP1_662939224, Tcy, Tcr);
+					     TcZ = FNMS(KP923879532, TcJ, TcC);
+					     TcK = FMA(KP923879532, TcJ, TcC);
+					     {
+						  E Td0, Td2, TcU, TcS;
+						  Td0 = FNMS(KP534511135, TcZ, TcY);
+						  Td2 = FMA(KP534511135, TcY, TcZ);
+						  TcU = FMA(KP303346683, TcK, TcR);
+						  TcS = FNMS(KP303346683, TcR, TcK);
+						  Tdj = FMA(KP1_847759065, Td4, Td3);
+						  Td5 = FNMS(KP1_847759065, Td4, Td3);
+						  R0[WS(rs, 43)] = FMA(KP1_763842528, Td0, TcX);
+						  R0[WS(rs, 11)] = FNMS(KP1_763842528, Td0, TcX);
+						  R0[WS(rs, 59)] = FMA(KP1_763842528, Td2, Td1);
+						  R0[WS(rs, 27)] = FNMS(KP1_763842528, Td2, Td1);
+						  R0[WS(rs, 51)] = FMA(KP1_913880671, TcU, TcT);
+						  R0[WS(rs, 19)] = FNMS(KP1_913880671, TcU, TcT);
+						  R0[WS(rs, 3)] = FMA(KP1_913880671, TcS, Tcz);
+						  R0[WS(rs, 35)] = FNMS(KP1_913880671, TcS, Tcz);
+					     }
+					}
+					Tdm = FMA(KP923879532, Tde, Tdd);
+					Tdf = FNMS(KP923879532, Tde, Tdd);
+					Tdk = FMA(KP198912367, Td6, Td7);
+					Td8 = FNMS(KP198912367, Td7, Td6);
+					{
+					     E T5F, T2R, T5G, T3K, T64, T5S, T5X, T5x, T5U, T4q, T4R, T63, T5P, T5i, T5V;
+					     E T5A;
+					     {
+						  E T5N, T5O, T5R, T3w, T3J, T5Q, T5y, T5z;
+						  {
+						       E T2p, Tdl, Tdp, Tdh, Td9, Tdn, Tdc, T2Q;
+						       T5N = FNMS(KP1_847759065, T2o, T2d);
+						       T2p = FMA(KP1_847759065, T2o, T2d);
+						       Tdl = FNMS(KP1_961570560, Tdk, Tdj);
+						       Tdp = FMA(KP1_961570560, Tdk, Tdj);
+						       Tdh = FMA(KP1_961570560, Td8, Td5);
+						       Td9 = FNMS(KP1_961570560, Td8, Td5);
+						       Tdn = FMA(KP923879532, Tdb, Tda);
+						       Tdc = FNMS(KP923879532, Tdb, Tda);
+						       T2Q = FNMS(KP198912367, T2P, T2G);
+						       T5O = FMA(KP198912367, T2G, T2P);
+						       T5R = FNMS(KP923879532, T3v, T38);
+						       T3w = FMA(KP923879532, T3v, T38);
+						       {
+							    E Tdo, Tdq, Tdi, Tdg;
+							    Tdo = FNMS(KP098491403, Tdn, Tdm);
+							    Tdq = FMA(KP098491403, Tdm, Tdn);
+							    Tdi = FMA(KP820678790, Tdc, Tdf);
+							    Tdg = FNMS(KP820678790, Tdf, Tdc);
+							    T5F = FNMS(KP1_961570560, T2Q, T2p);
+							    T2R = FMA(KP1_961570560, T2Q, T2p);
+							    R0[WS(rs, 47)] = FMA(KP1_990369453, Tdo, Tdl);
+							    R0[WS(rs, 15)] = FNMS(KP1_990369453, Tdo, Tdl);
+							    R0[WS(rs, 63)] = FMA(KP1_990369453, Tdq, Tdp);
+							    R0[WS(rs, 31)] = FNMS(KP1_990369453, Tdq, Tdp);
+							    R0[WS(rs, 55)] = FMA(KP1_546020906, Tdi, Tdh);
+							    R0[WS(rs, 23)] = FNMS(KP1_546020906, Tdi, Tdh);
+							    R0[WS(rs, 7)] = FMA(KP1_546020906, Tdg, Td9);
+							    R0[WS(rs, 39)] = FNMS(KP1_546020906, Tdg, Td9);
+							    T3J = FMA(KP923879532, T3I, T3F);
+							    T5Q = FNMS(KP923879532, T3I, T3F);
+						       }
+						  }
+						  T5G = FMA(KP098491403, T3w, T3J);
+						  T3K = FNMS(KP098491403, T3J, T3w);
+						  T64 = FMA(KP820678790, T5Q, T5R);
+						  T5S = FNMS(KP820678790, T5R, T5Q);
+						  T5X = FNMS(KP923879532, T5w, T5t);
+						  T5x = FMA(KP923879532, T5w, T5t);
+						  T5U = FNMS(KP923879532, T4p, T42);
+						  T4q = FMA(KP923879532, T4p, T42);
+						  T4R = FNMS(KP198912367, T4Q, T4H);
+						  T5y = FMA(KP198912367, T4H, T4Q);
+						  T63 = FMA(KP1_961570560, T5O, T5N);
+						  T5P = FNMS(KP1_961570560, T5O, T5N);
+						  T5z = FNMS(KP198912367, T58, T5h);
+						  T5i = FMA(KP198912367, T5h, T58);
+						  T5V = T5y - T5z;
+						  T5A = T5y + T5z;
+					     }
+					     {
+						  E T5W, T5I, T5Z, T5J;
+						  {
+						       E T5D, T3L, T67, T5B, T5Y, T5j, T65, T69, T66, T5k;
+						       T5D = FNMS(KP1_990369453, T3K, T2R);
+						       T3L = FMA(KP1_990369453, T3K, T2R);
+						       T5W = FNMS(KP980785280, T5V, T5U);
+						       T67 = FMA(KP980785280, T5V, T5U);
+						       T5I = FNMS(KP980785280, T5A, T5x);
+						       T5B = FMA(KP980785280, T5A, T5x);
+						       T5Y = T5i - T4R;
+						       T5j = T4R + T5i;
+						       T65 = FNMS(KP1_546020906, T64, T63);
+						       T69 = FMA(KP1_546020906, T64, T63);
+						       T5Z = FNMS(KP980785280, T5Y, T5X);
+						       T66 = FMA(KP980785280, T5Y, T5X);
+						       T5J = FNMS(KP980785280, T5j, T4q);
+						       T5k = FMA(KP980785280, T5j, T4q);
+						       {
+							    E T68, T6a, T5E, T5C;
+							    T68 = FNMS(KP357805721, T67, T66);
+							    T6a = FMA(KP357805721, T66, T67);
+							    T5E = FMA(KP049126849, T5k, T5B);
+							    T5C = FNMS(KP049126849, T5B, T5k);
+							    R1[WS(rs, 60)] = FMA(KP1_883088130, T6a, T69);
+							    R1[WS(rs, 28)] = FNMS(KP1_883088130, T6a, T69);
+							    R1[WS(rs, 44)] = FMA(KP1_883088130, T68, T65);
+							    R1[WS(rs, 12)] = FNMS(KP1_883088130, T68, T65);
+							    R1[0] = FMA(KP1_997590912, T5C, T3L);
+							    R1[WS(rs, 32)] = FNMS(KP1_997590912, T5C, T3L);
+							    R1[WS(rs, 16)] = FNMS(KP1_997590912, T5E, T5D);
+							    R1[WS(rs, 48)] = FMA(KP1_997590912, T5E, T5D);
+						       }
+						  }
+						  {
+						       E T5H, T5K, T5T, T60;
+						       T5L = FMA(KP1_990369453, T5G, T5F);
+						       T5H = FNMS(KP1_990369453, T5G, T5F);
+						       T5K = FNMS(KP906347169, T5J, T5I);
+						       T5M = FMA(KP906347169, T5I, T5J);
+						       T61 = FMA(KP1_546020906, T5S, T5P);
+						       T5T = FNMS(KP1_546020906, T5S, T5P);
+						       T60 = FNMS(KP472964775, T5Z, T5W);
+						       T62 = FMA(KP472964775, T5W, T5Z);
+						       R1[WS(rs, 40)] = FMA(KP1_481902250, T5K, T5H);
+						       R1[WS(rs, 8)] = FNMS(KP1_481902250, T5K, T5H);
+						       R1[WS(rs, 4)] = FMA(KP1_807978586, T60, T5T);
+						       R1[WS(rs, 36)] = FNMS(KP1_807978586, T60, T5T);
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T8B, T8C, T8R, T8S;
+				   {
+					E T8v, T7v, T8w, T7O, T8N, T8n, T8U, T8I, T8T, T8F, T8K, T80, T87, T8e, T8L;
+					E T8q;
+					{
+					     E T8D, T8E, T8H, T8G, T8o, T8p;
+					     {
+						  E T7n, T7u, T7G, T7N;
+						  T8D = FMA(KP1_847759065, T7m, T7j);
+						  T7n = FNMS(KP1_847759065, T7m, T7j);
+						  R1[WS(rs, 52)] = FMA(KP1_807978586, T62, T61);
+						  R1[WS(rs, 20)] = FNMS(KP1_807978586, T62, T61);
+						  R1[WS(rs, 56)] = FMA(KP1_481902250, T5M, T5L);
+						  R1[WS(rs, 24)] = FNMS(KP1_481902250, T5M, T5L);
+						  T7u = FNMS(KP668178637, T7t, T7q);
+						  T8E = FMA(KP668178637, T7q, T7t);
+						  T8H = FMA(KP923879532, T7F, T7y);
+						  T7G = FNMS(KP923879532, T7F, T7y);
+						  T7N = FMA(KP923879532, T7M, T7J);
+						  T8G = FNMS(KP923879532, T7M, T7J);
+						  T8v = FNMS(KP1_662939224, T7u, T7n);
+						  T7v = FMA(KP1_662939224, T7u, T7n);
+						  T8w = FMA(KP303346683, T7G, T7N);
+						  T7O = FNMS(KP303346683, T7N, T7G);
+					     }
+					     T8N = FNMS(KP923879532, T8m, T8j);
+					     T8n = FMA(KP923879532, T8m, T8j);
+					     T8U = FMA(KP534511135, T8G, T8H);
+					     T8I = FNMS(KP534511135, T8H, T8G);
+					     T8T = FMA(KP1_662939224, T8E, T8D);
+					     T8F = FNMS(KP1_662939224, T8E, T8D);
+					     T8K = FMA(KP923879532, T7Z, T7S);
+					     T80 = FNMS(KP923879532, T7Z, T7S);
+					     T87 = FNMS(KP668178637, T86, T83);
+					     T8o = FMA(KP668178637, T83, T86);
+					     T8p = FMA(KP668178637, T8a, T8d);
+					     T8e = FNMS(KP668178637, T8d, T8a);
+					     T8L = T8o + T8p;
+					     T8q = T8o - T8p;
+					}
+					{
+					     E T8M, T8y, T8P, T8z;
+					     {
+						  E T8t, T7P, T8X, T8r, T8O, T8f, T8V, T8Z, T8W, T8g;
+						  T8t = FNMS(KP1_913880671, T7O, T7v);
+						  T7P = FMA(KP1_913880671, T7O, T7v);
+						  T8M = FNMS(KP831469612, T8L, T8K);
+						  T8X = FMA(KP831469612, T8L, T8K);
+						  T8y = FNMS(KP831469612, T8q, T8n);
+						  T8r = FMA(KP831469612, T8q, T8n);
+						  T8O = T8e - T87;
+						  T8f = T87 + T8e;
+						  T8V = FNMS(KP1_763842528, T8U, T8T);
+						  T8Z = FMA(KP1_763842528, T8U, T8T);
+						  T8P = FNMS(KP831469612, T8O, T8N);
+						  T8W = FMA(KP831469612, T8O, T8N);
+						  T8z = FNMS(KP831469612, T8f, T80);
+						  T8g = FMA(KP831469612, T8f, T80);
+						  {
+						       E T8Y, T90, T8u, T8s;
+						       T8Y = FNMS(KP250486960, T8X, T8W);
+						       T90 = FMA(KP250486960, T8W, T8X);
+						       T8u = FMA(KP148335987, T8g, T8r);
+						       T8s = FNMS(KP148335987, T8r, T8g);
+						       R1[WS(rs, 61)] = FMA(KP1_940062506, T90, T8Z);
+						       R1[WS(rs, 29)] = FNMS(KP1_940062506, T90, T8Z);
+						       R1[WS(rs, 45)] = FMA(KP1_940062506, T8Y, T8V);
+						       R1[WS(rs, 13)] = FNMS(KP1_940062506, T8Y, T8V);
+						       R1[WS(rs, 1)] = FMA(KP1_978353019, T8s, T7P);
+						       R1[WS(rs, 33)] = FNMS(KP1_978353019, T8s, T7P);
+						       R1[WS(rs, 17)] = FNMS(KP1_978353019, T8u, T8t);
+						       R1[WS(rs, 49)] = FMA(KP1_978353019, T8u, T8t);
+						  }
+					     }
+					     {
+						  E T8x, T8A, T8J, T8Q;
+						  T8B = FMA(KP1_913880671, T8w, T8v);
+						  T8x = FNMS(KP1_913880671, T8w, T8v);
+						  T8A = FNMS(KP741650546, T8z, T8y);
+						  T8C = FMA(KP741650546, T8y, T8z);
+						  T8R = FMA(KP1_763842528, T8I, T8F);
+						  T8J = FNMS(KP1_763842528, T8I, T8F);
+						  T8Q = FNMS(KP599376933, T8P, T8M);
+						  T8S = FMA(KP599376933, T8M, T8P);
+						  R1[WS(rs, 41)] = FMA(KP1_606415062, T8A, T8x);
+						  R1[WS(rs, 9)] = FNMS(KP1_606415062, T8A, T8x);
+						  R1[WS(rs, 5)] = FMA(KP1_715457220, T8Q, T8J);
+						  R1[WS(rs, 37)] = FNMS(KP1_715457220, T8Q, T8J);
+					     }
+					}
+				   }
+				   {
+					E T6R, T6S, T77, T78;
+					{
+					     E T6L, T6h, T6M, T6o, T73, T6D, T7a, T6Y, T79, T6V, T70, T6s, T6y, T6v, T71;
+					     E T6G;
+					     {
+						  E T6T, T6U, T6X, T6W, T6E, T6F;
+						  {
+						       E T6d, T6g, T6k, T6n;
+						       T6T = FMA(KP1_847759065, T6c, T6b);
+						       T6d = FNMS(KP1_847759065, T6c, T6b);
+						       R1[WS(rs, 53)] = FMA(KP1_715457220, T8S, T8R);
+						       R1[WS(rs, 21)] = FNMS(KP1_715457220, T8S, T8R);
+						       R1[WS(rs, 57)] = FMA(KP1_606415062, T8C, T8B);
+						       R1[WS(rs, 25)] = FNMS(KP1_606415062, T8C, T8B);
+						       T6g = FNMS(KP668178637, T6f, T6e);
+						       T6U = FMA(KP668178637, T6e, T6f);
+						       T6X = FMA(KP923879532, T6j, T6i);
+						       T6k = FNMS(KP923879532, T6j, T6i);
+						       T6n = FNMS(KP923879532, T6m, T6l);
+						       T6W = FMA(KP923879532, T6m, T6l);
+						       T6L = FMA(KP1_662939224, T6g, T6d);
+						       T6h = FNMS(KP1_662939224, T6g, T6d);
+						       T6M = FMA(KP534511135, T6k, T6n);
+						       T6o = FNMS(KP534511135, T6n, T6k);
+						  }
+						  T73 = FMA(KP923879532, T6C, T6B);
+						  T6D = FNMS(KP923879532, T6C, T6B);
+						  T7a = FMA(KP303346683, T6W, T6X);
+						  T6Y = FNMS(KP303346683, T6X, T6W);
+						  T79 = FMA(KP1_662939224, T6U, T6T);
+						  T6V = FNMS(KP1_662939224, T6U, T6T);
+						  T70 = FMA(KP923879532, T6r, T6q);
+						  T6s = FNMS(KP923879532, T6r, T6q);
+						  T6y = FNMS(KP668178637, T6x, T6w);
+						  T6E = FMA(KP668178637, T6w, T6x);
+						  T6F = FNMS(KP668178637, T6t, T6u);
+						  T6v = FMA(KP668178637, T6u, T6t);
+						  T71 = T6E + T6F;
+						  T6G = T6E - T6F;
+					     }
+					     {
+						  E T72, T6O, T75, T6P;
+						  {
+						       E T6J, T6p, T7d, T6H, T74, T6z, T7b, T7f, T7c, T6A;
+						       T6J = FNMS(KP1_763842528, T6o, T6h);
+						       T6p = FMA(KP1_763842528, T6o, T6h);
+						       T72 = FNMS(KP831469612, T71, T70);
+						       T7d = FMA(KP831469612, T71, T70);
+						       T6O = FNMS(KP831469612, T6G, T6D);
+						       T6H = FMA(KP831469612, T6G, T6D);
+						       T74 = T6y + T6v;
+						       T6z = T6v - T6y;
+						       T7b = FNMS(KP1_913880671, T7a, T79);
+						       T7f = FMA(KP1_913880671, T7a, T79);
+						       T75 = FNMS(KP831469612, T74, T73);
+						       T7c = FMA(KP831469612, T74, T73);
+						       T6P = FNMS(KP831469612, T6z, T6s);
+						       T6A = FMA(KP831469612, T6z, T6s);
+						       {
+							    E T7e, T7g, T6K, T6I;
+							    T7e = FNMS(KP148335987, T7d, T7c);
+							    T7g = FMA(KP148335987, T7c, T7d);
+							    T6K = FMA(KP250486960, T6A, T6H);
+							    T6I = FNMS(KP250486960, T6H, T6A);
+							    R1[WS(rs, 62)] = FMA(KP1_978353019, T7g, T7f);
+							    R1[WS(rs, 30)] = FNMS(KP1_978353019, T7g, T7f);
+							    R1[WS(rs, 46)] = FMA(KP1_978353019, T7e, T7b);
+							    R1[WS(rs, 14)] = FNMS(KP1_978353019, T7e, T7b);
+							    R1[WS(rs, 2)] = FMA(KP1_940062506, T6I, T6p);
+							    R1[WS(rs, 34)] = FNMS(KP1_940062506, T6I, T6p);
+							    R1[WS(rs, 18)] = FNMS(KP1_940062506, T6K, T6J);
+							    R1[WS(rs, 50)] = FMA(KP1_940062506, T6K, T6J);
+						       }
+						  }
+						  {
+						       E T6N, T6Q, T6Z, T76;
+						       T6R = FMA(KP1_763842528, T6M, T6L);
+						       T6N = FNMS(KP1_763842528, T6M, T6L);
+						       T6Q = FNMS(KP599376933, T6P, T6O);
+						       T6S = FMA(KP599376933, T6O, T6P);
+						       T77 = FMA(KP1_913880671, T6Y, T6V);
+						       T6Z = FNMS(KP1_913880671, T6Y, T6V);
+						       T76 = FNMS(KP741650546, T75, T72);
+						       T78 = FMA(KP741650546, T72, T75);
+						       R1[WS(rs, 42)] = FMA(KP1_715457220, T6Q, T6N);
+						       R1[WS(rs, 10)] = FNMS(KP1_715457220, T6Q, T6N);
+						       R1[WS(rs, 6)] = FMA(KP1_606415062, T76, T6Z);
+						       R1[WS(rs, 38)] = FNMS(KP1_606415062, T76, T6Z);
+						  }
+					     }
+					}
+					{
+					     E T9B, T97, T9C, T9e, T9T, T9t, Ta0, T9O, T9Z, T9L, T9Q, T9i, T9l, T9o, T9R;
+					     E T9w;
+					     {
+						  E T9J, T9K, T9N, T9M, T9u, T9v;
+						  {
+						       E T93, T96, T9a, T9d;
+						       T9J = FMA(KP1_847759065, T92, T91);
+						       T93 = FNMS(KP1_847759065, T92, T91);
+						       R1[WS(rs, 54)] = FMA(KP1_606415062, T78, T77);
+						       R1[WS(rs, 22)] = FNMS(KP1_606415062, T78, T77);
+						       R1[WS(rs, 58)] = FMA(KP1_715457220, T6S, T6R);
+						       R1[WS(rs, 26)] = FNMS(KP1_715457220, T6S, T6R);
+						       T96 = FNMS(KP198912367, T95, T94);
+						       T9K = FMA(KP198912367, T94, T95);
+						       T9N = FMA(KP923879532, T99, T98);
+						       T9a = FNMS(KP923879532, T99, T98);
+						       T9d = FNMS(KP923879532, T9c, T9b);
+						       T9M = FMA(KP923879532, T9c, T9b);
+						       T9B = FMA(KP1_961570560, T96, T93);
+						       T97 = FNMS(KP1_961570560, T96, T93);
+						       T9C = FMA(KP820678790, T9a, T9d);
+						       T9e = FNMS(KP820678790, T9d, T9a);
+						  }
+						  T9T = FMA(KP923879532, T9s, T9r);
+						  T9t = FNMS(KP923879532, T9s, T9r);
+						  Ta0 = FMA(KP098491403, T9M, T9N);
+						  T9O = FNMS(KP098491403, T9N, T9M);
+						  T9Z = FMA(KP1_961570560, T9K, T9J);
+						  T9L = FNMS(KP1_961570560, T9K, T9J);
+						  T9Q = FMA(KP923879532, T9h, T9g);
+						  T9i = FNMS(KP923879532, T9h, T9g);
+						  T9l = FNMS(KP198912367, T9k, T9j);
+						  T9u = FMA(KP198912367, T9j, T9k);
+						  T9v = FMA(KP198912367, T9m, T9n);
+						  T9o = FNMS(KP198912367, T9n, T9m);
+						  T9R = T9u + T9v;
+						  T9w = T9u - T9v;
+					     }
+					     {
+						  E T9S, T9E, T9V, T9F;
+						  {
+						       E T9z, T9f, Ta3, T9x, T9U, T9p, Ta1, Ta5, Ta2, T9q;
+						       T9z = FNMS(KP1_546020906, T9e, T97);
+						       T9f = FMA(KP1_546020906, T9e, T97);
+						       T9S = FNMS(KP980785280, T9R, T9Q);
+						       Ta3 = FMA(KP980785280, T9R, T9Q);
+						       T9E = FNMS(KP980785280, T9w, T9t);
+						       T9x = FMA(KP980785280, T9w, T9t);
+						       T9U = T9l - T9o;
+						       T9p = T9l + T9o;
+						       Ta1 = FNMS(KP1_990369453, Ta0, T9Z);
+						       Ta5 = FMA(KP1_990369453, Ta0, T9Z);
+						       T9V = FNMS(KP980785280, T9U, T9T);
+						       Ta2 = FMA(KP980785280, T9U, T9T);
+						       T9F = FMA(KP980785280, T9p, T9i);
+						       T9q = FNMS(KP980785280, T9p, T9i);
+						       {
+							    E Ta4, Ta6, T9A, T9y;
+							    Ta4 = FNMS(KP049126849, Ta3, Ta2);
+							    Ta6 = FMA(KP049126849, Ta2, Ta3);
+							    T9A = FMA(KP357805721, T9q, T9x);
+							    T9y = FNMS(KP357805721, T9x, T9q);
+							    R1[WS(rs, 63)] = FMA(KP1_997590912, Ta6, Ta5);
+							    R1[WS(rs, 31)] = FNMS(KP1_997590912, Ta6, Ta5);
+							    R1[WS(rs, 47)] = FMA(KP1_997590912, Ta4, Ta1);
+							    R1[WS(rs, 15)] = FNMS(KP1_997590912, Ta4, Ta1);
+							    R1[WS(rs, 3)] = FMA(KP1_883088130, T9y, T9f);
+							    R1[WS(rs, 35)] = FNMS(KP1_883088130, T9y, T9f);
+							    R1[WS(rs, 19)] = FNMS(KP1_883088130, T9A, T9z);
+							    R1[WS(rs, 51)] = FMA(KP1_883088130, T9A, T9z);
+						       }
+						  }
+						  {
+						       E T9D, T9G, T9P, T9W;
+						       T9H = FMA(KP1_546020906, T9C, T9B);
+						       T9D = FNMS(KP1_546020906, T9C, T9B);
+						       T9G = FNMS(KP472964775, T9F, T9E);
+						       T9I = FMA(KP472964775, T9E, T9F);
+						       T9X = FMA(KP1_990369453, T9O, T9L);
+						       T9P = FNMS(KP1_990369453, T9O, T9L);
+						       T9W = FNMS(KP906347169, T9V, T9S);
+						       T9Y = FMA(KP906347169, T9S, T9V);
+						       R1[WS(rs, 43)] = FMA(KP1_807978586, T9G, T9D);
+						       R1[WS(rs, 11)] = FNMS(KP1_807978586, T9G, T9D);
+						       R1[WS(rs, 7)] = FMA(KP1_481902250, T9W, T9P);
+						       R1[WS(rs, 39)] = FNMS(KP1_481902250, T9W, T9P);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       R1[WS(rs, 55)] = FMA(KP1_481902250, T9Y, T9X);
+	       R1[WS(rs, 23)] = FNMS(KP1_481902250, T9Y, T9X);
+	       R1[WS(rs, 59)] = FMA(KP1_807978586, T9I, T9H);
+	       R1[WS(rs, 27)] = FNMS(KP1_807978586, T9I, T9H);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 128, "r2cb_128", {416, 0, 540, 0}, &GENUS };
+
+void X(codelet_r2cb_128) (planner *p) {
+     X(kr2c_register) (p, r2cb_128, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 128 -name r2cb_128 -include r2cb.h */
+
+/*
+ * This function contains 956 FP additions, 342 FP multiplications,
+ * (or, 812 additions, 198 multiplications, 144 fused multiply/add),
+ * 198 stack variables, 39 constants, and 256 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_128(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_028205488, +1.028205488386443453187387677937631545216098241);
+     DK(KP1_715457220, +1.715457220000544139804539968569540274084981599);
+     DK(KP1_606415062, +1.606415062961289819613353025926283847759138854);
+     DK(KP1_191398608, +1.191398608984866686934073057659939779023852677);
+     DK(KP1_940062506, +1.940062506389087985207968414572200502913731924);
+     DK(KP485960359, +0.485960359806527779896548324154942236641981567);
+     DK(KP293460948, +0.293460948910723503317700259293435639412430633);
+     DK(KP1_978353019, +1.978353019929561946903347476032486127967379067);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP855110186, +0.855110186860564188641933713777597068609157259);
+     DK(KP1_807978586, +1.807978586246886663172400594461074097420264050);
+     DK(KP1_481902250, +1.481902250709918182351233794990325459457910619);
+     DK(KP1_343117909, +1.343117909694036801250753700854843606457501264);
+     DK(KP1_883088130, +1.883088130366041556825018805199004714371179592);
+     DK(KP673779706, +0.673779706784440101378506425238295140955533559);
+     DK(KP098135348, +0.098135348654836028509909953885365316629490726);
+     DK(KP1_997590912, +1.997590912410344785429543209518201388886407229);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP580569354, +0.580569354508924735272384751634790549382952557);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP942793473, +0.942793473651995297112775251810508755314920638);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_268786568, +1.268786568327290996430343226450986741351374190);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP196034280, +0.196034280659121203988391127777283691722273346);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(512, rs), MAKE_VOLATILE_STRIDE(512, csr), MAKE_VOLATILE_STRIDE(512, csi)) {
+	       E Ta, T6q, T2a, T5k, T8x, Tbx, TcF, Ten, Th, T6r, T2j, T5l, T8E, Tby, TcI;
+	       E Teo, Tx, T6t, TcM, Teq, TcP, Ter, T2t, T5n, T2C, T5o, T8Q, TbA, T8X, TbB;
+	       E T6w, T7L, T1j, T6L, Tde, TeC, TdL, TeR, T3v, T5z, T4I, T5O, T9O, TbM, TaV;
+	       E Tc1, T78, T7Z, TN, T6z, TcU, Teu, Td8, Tey, T2N, T5r, T3j, T5v, T9a, TbE;
+	       E T9A, TbI, T6H, T7O, T1O, T7V, T48, T4u, Tds, TeG, T5E, T5K, Taf, TbP, Tdp;
+	       E TeF, T6U, T72, Tam, TbQ, T23, T7U, T4r, T4v, Tdz, TeJ, T5H, T5L, Tay, TbS;
+	       E Tdw, TeI, T6Z, T73, TaF, TbT, T1y, T75, Tdl, TeQ, TdI, TeD, T3O, T5N, T4z;
+	       E T5A, Ta3, Tc0, TaO, TbN, T6O, T80, T12, T6E, Td1, Tex, Td5, Tev, T36, T5u;
+	       E T3a, T5s, T9p, TbH, T9t, TbF, T6C, T7P;
+	       {
+		    E T5, T8s, T3, T8q, T9, T8u, T29, T8v, T6, T26;
+		    {
+			 E T4, T8r, T1, T2;
+			 T4 = Cr[WS(csr, 32)];
+			 T5 = KP2_000000000 * T4;
+			 T8r = Ci[WS(csi, 32)];
+			 T8s = KP2_000000000 * T8r;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 64)];
+			 T3 = T1 + T2;
+			 T8q = T1 - T2;
+			 {
+			      E T7, T8, T27, T28;
+			      T7 = Cr[WS(csr, 16)];
+			      T8 = Cr[WS(csr, 48)];
+			      T9 = KP2_000000000 * (T7 + T8);
+			      T8u = T7 - T8;
+			      T27 = Ci[WS(csi, 16)];
+			      T28 = Ci[WS(csi, 48)];
+			      T29 = KP2_000000000 * (T27 - T28);
+			      T8v = T27 + T28;
+			 }
+		    }
+		    T6 = T3 + T5;
+		    Ta = T6 + T9;
+		    T6q = T6 - T9;
+		    T26 = T3 - T5;
+		    T2a = T26 - T29;
+		    T5k = T26 + T29;
+		    {
+			 E T8t, T8w, TcD, TcE;
+			 T8t = T8q - T8s;
+			 T8w = KP1_414213562 * (T8u - T8v);
+			 T8x = T8t + T8w;
+			 Tbx = T8t - T8w;
+			 TcD = T8q + T8s;
+			 TcE = KP1_414213562 * (T8u + T8v);
+			 TcF = TcD - TcE;
+			 Ten = TcD + TcE;
+		    }
+	       }
+	       {
+		    E Td, T8y, T2e, T8C, Tg, T8B, T2h, T8z, T2b, T2i;
+		    {
+			 E Tb, Tc, T2c, T2d;
+			 Tb = Cr[WS(csr, 8)];
+			 Tc = Cr[WS(csr, 56)];
+			 Td = Tb + Tc;
+			 T8y = Tb - Tc;
+			 T2c = Ci[WS(csi, 8)];
+			 T2d = Ci[WS(csi, 56)];
+			 T2e = T2c - T2d;
+			 T8C = T2c + T2d;
+		    }
+		    {
+			 E Te, Tf, T2f, T2g;
+			 Te = Cr[WS(csr, 40)];
+			 Tf = Cr[WS(csr, 24)];
+			 Tg = Te + Tf;
+			 T8B = Te - Tf;
+			 T2f = Ci[WS(csi, 40)];
+			 T2g = Ci[WS(csi, 24)];
+			 T2h = T2f - T2g;
+			 T8z = T2f + T2g;
+		    }
+		    Th = KP2_000000000 * (Td + Tg);
+		    T6r = KP2_000000000 * (T2h + T2e);
+		    T2b = Td - Tg;
+		    T2i = T2e - T2h;
+		    T2j = KP1_414213562 * (T2b - T2i);
+		    T5l = KP1_414213562 * (T2b + T2i);
+		    {
+			 E T8A, T8D, TcG, TcH;
+			 T8A = T8y - T8z;
+			 T8D = T8B + T8C;
+			 T8E = FNMS(KP765366864, T8D, KP1_847759065 * T8A);
+			 Tby = FMA(KP765366864, T8A, KP1_847759065 * T8D);
+			 TcG = T8y + T8z;
+			 TcH = T8C - T8B;
+			 TcI = FNMS(KP1_847759065, TcH, KP765366864 * TcG);
+			 Teo = FMA(KP1_847759065, TcG, KP765366864 * TcH);
+		    }
+	       }
+	       {
+		    E Tl, T8G, T2x, T8V, To, T8U, T2A, T8H, Tv, T8S, T2o, T8O, Ts, T8R, T2r;
+		    E T8L;
+		    {
+			 E Tj, Tk, T2y, T2z;
+			 Tj = Cr[WS(csr, 4)];
+			 Tk = Cr[WS(csr, 60)];
+			 Tl = Tj + Tk;
+			 T8G = Tj - Tk;
+			 {
+			      E T2v, T2w, Tm, Tn;
+			      T2v = Ci[WS(csi, 4)];
+			      T2w = Ci[WS(csi, 60)];
+			      T2x = T2v - T2w;
+			      T8V = T2v + T2w;
+			      Tm = Cr[WS(csr, 36)];
+			      Tn = Cr[WS(csr, 28)];
+			      To = Tm + Tn;
+			      T8U = Tm - Tn;
+			 }
+			 T2y = Ci[WS(csi, 36)];
+			 T2z = Ci[WS(csi, 28)];
+			 T2A = T2y - T2z;
+			 T8H = T2y + T2z;
+			 {
+			      E Tt, Tu, T8M, T2m, T2n, T8N;
+			      Tt = Cr[WS(csr, 12)];
+			      Tu = Cr[WS(csr, 52)];
+			      T8M = Tt - Tu;
+			      T2m = Ci[WS(csi, 52)];
+			      T2n = Ci[WS(csi, 12)];
+			      T8N = T2n + T2m;
+			      Tv = Tt + Tu;
+			      T8S = T8M + T8N;
+			      T2o = T2m - T2n;
+			      T8O = T8M - T8N;
+			 }
+			 {
+			      E Tq, Tr, T8J, T2p, T2q, T8K;
+			      Tq = Cr[WS(csr, 20)];
+			      Tr = Cr[WS(csr, 44)];
+			      T8J = Tq - Tr;
+			      T2p = Ci[WS(csi, 20)];
+			      T2q = Ci[WS(csi, 44)];
+			      T8K = T2p + T2q;
+			      Ts = Tq + Tr;
+			      T8R = T8J + T8K;
+			      T2r = T2p - T2q;
+			      T8L = T8J - T8K;
+			 }
+		    }
+		    {
+			 E Tp, Tw, TcK, TcL;
+			 Tp = Tl + To;
+			 Tw = Ts + Tv;
+			 Tx = KP2_000000000 * (Tp + Tw);
+			 T6t = Tp - Tw;
+			 TcK = T8G + T8H;
+			 TcL = KP707106781 * (T8R + T8S);
+			 TcM = TcK - TcL;
+			 Teq = TcK + TcL;
+		    }
+		    {
+			 E TcN, TcO, T2l, T2s;
+			 TcN = KP707106781 * (T8L - T8O);
+			 TcO = T8V - T8U;
+			 TcP = TcN + TcO;
+			 Ter = TcO - TcN;
+			 T2l = Tl - To;
+			 T2s = T2o - T2r;
+			 T2t = T2l + T2s;
+			 T5n = T2l - T2s;
+		    }
+		    {
+			 E T2u, T2B, T8I, T8P;
+			 T2u = Ts - Tv;
+			 T2B = T2x - T2A;
+			 T2C = T2u + T2B;
+			 T5o = T2B - T2u;
+			 T8I = T8G - T8H;
+			 T8P = KP707106781 * (T8L + T8O);
+			 T8Q = T8I + T8P;
+			 TbA = T8I - T8P;
+		    }
+		    {
+			 E T8T, T8W, T6u, T6v;
+			 T8T = KP707106781 * (T8R - T8S);
+			 T8W = T8U + T8V;
+			 T8X = T8T + T8W;
+			 TbB = T8W - T8T;
+			 T6u = T2A + T2x;
+			 T6v = T2r + T2o;
+			 T6w = T6u - T6v;
+			 T7L = KP2_000000000 * (T6v + T6u);
+		    }
+	       }
+	       {
+		    E T17, T9E, T4D, TaT, T1a, TaS, T4G, T9F, T1h, TaQ, T3q, T9M, T1e, TaP, T3t;
+		    E T9J;
+		    {
+			 E T15, T16, T4E, T4F;
+			 T15 = Cr[WS(csr, 1)];
+			 T16 = Cr[WS(csr, 63)];
+			 T17 = T15 + T16;
+			 T9E = T15 - T16;
+			 {
+			      E T4B, T4C, T18, T19;
+			      T4B = Ci[WS(csi, 1)];
+			      T4C = Ci[WS(csi, 63)];
+			      T4D = T4B - T4C;
+			      TaT = T4B + T4C;
+			      T18 = Cr[WS(csr, 33)];
+			      T19 = Cr[WS(csr, 31)];
+			      T1a = T18 + T19;
+			      TaS = T18 - T19;
+			 }
+			 T4E = Ci[WS(csi, 33)];
+			 T4F = Ci[WS(csi, 31)];
+			 T4G = T4E - T4F;
+			 T9F = T4E + T4F;
+			 {
+			      E T1f, T1g, T9K, T3o, T3p, T9L;
+			      T1f = Cr[WS(csr, 15)];
+			      T1g = Cr[WS(csr, 49)];
+			      T9K = T1f - T1g;
+			      T3o = Ci[WS(csi, 49)];
+			      T3p = Ci[WS(csi, 15)];
+			      T9L = T3p + T3o;
+			      T1h = T1f + T1g;
+			      TaQ = T9K + T9L;
+			      T3q = T3o - T3p;
+			      T9M = T9K - T9L;
+			 }
+			 {
+			      E T1c, T1d, T9H, T3r, T3s, T9I;
+			      T1c = Cr[WS(csr, 17)];
+			      T1d = Cr[WS(csr, 47)];
+			      T9H = T1c - T1d;
+			      T3r = Ci[WS(csi, 17)];
+			      T3s = Ci[WS(csi, 47)];
+			      T9I = T3r + T3s;
+			      T1e = T1c + T1d;
+			      TaP = T9H + T9I;
+			      T3t = T3r - T3s;
+			      T9J = T9H - T9I;
+			 }
+		    }
+		    {
+			 E T1b, T1i, Tdc, Tdd;
+			 T1b = T17 + T1a;
+			 T1i = T1e + T1h;
+			 T1j = T1b + T1i;
+			 T6L = T1b - T1i;
+			 Tdc = T9E + T9F;
+			 Tdd = KP707106781 * (TaP + TaQ);
+			 Tde = Tdc - Tdd;
+			 TeC = Tdc + Tdd;
+		    }
+		    {
+			 E TdJ, TdK, T3n, T3u;
+			 TdJ = KP707106781 * (T9J - T9M);
+			 TdK = TaT - TaS;
+			 TdL = TdJ + TdK;
+			 TeR = TdK - TdJ;
+			 T3n = T17 - T1a;
+			 T3u = T3q - T3t;
+			 T3v = T3n + T3u;
+			 T5z = T3n - T3u;
+		    }
+		    {
+			 E T4A, T4H, T9G, T9N;
+			 T4A = T1e - T1h;
+			 T4H = T4D - T4G;
+			 T4I = T4A + T4H;
+			 T5O = T4H - T4A;
+			 T9G = T9E - T9F;
+			 T9N = KP707106781 * (T9J + T9M);
+			 T9O = T9G + T9N;
+			 TbM = T9G - T9N;
+		    }
+		    {
+			 E TaR, TaU, T76, T77;
+			 TaR = KP707106781 * (TaP - TaQ);
+			 TaU = TaS + TaT;
+			 TaV = TaR + TaU;
+			 Tc1 = TaU - TaR;
+			 T76 = T4G + T4D;
+			 T77 = T3t + T3q;
+			 T78 = T76 - T77;
+			 T7Z = T77 + T76;
+		    }
+	       }
+	       {
+		    E TB, T90, T3e, T9y, TE, T9x, T3h, T91, TL, T9v, T2I, T98, TI, T9u, T2L;
+		    E T95;
+		    {
+			 E Tz, TA, T3f, T3g;
+			 Tz = Cr[WS(csr, 2)];
+			 TA = Cr[WS(csr, 62)];
+			 TB = Tz + TA;
+			 T90 = Tz - TA;
+			 {
+			      E T3c, T3d, TC, TD;
+			      T3c = Ci[WS(csi, 2)];
+			      T3d = Ci[WS(csi, 62)];
+			      T3e = T3c - T3d;
+			      T9y = T3c + T3d;
+			      TC = Cr[WS(csr, 34)];
+			      TD = Cr[WS(csr, 30)];
+			      TE = TC + TD;
+			      T9x = TC - TD;
+			 }
+			 T3f = Ci[WS(csi, 34)];
+			 T3g = Ci[WS(csi, 30)];
+			 T3h = T3f - T3g;
+			 T91 = T3f + T3g;
+			 {
+			      E TJ, TK, T96, T2G, T2H, T97;
+			      TJ = Cr[WS(csr, 14)];
+			      TK = Cr[WS(csr, 50)];
+			      T96 = TJ - TK;
+			      T2G = Ci[WS(csi, 50)];
+			      T2H = Ci[WS(csi, 14)];
+			      T97 = T2H + T2G;
+			      TL = TJ + TK;
+			      T9v = T96 + T97;
+			      T2I = T2G - T2H;
+			      T98 = T96 - T97;
+			 }
+			 {
+			      E TG, TH, T93, T2J, T2K, T94;
+			      TG = Cr[WS(csr, 18)];
+			      TH = Cr[WS(csr, 46)];
+			      T93 = TG - TH;
+			      T2J = Ci[WS(csi, 18)];
+			      T2K = Ci[WS(csi, 46)];
+			      T94 = T2J + T2K;
+			      TI = TG + TH;
+			      T9u = T93 + T94;
+			      T2L = T2J - T2K;
+			      T95 = T93 - T94;
+			 }
+		    }
+		    {
+			 E TF, TM, TcS, TcT;
+			 TF = TB + TE;
+			 TM = TI + TL;
+			 TN = TF + TM;
+			 T6z = TF - TM;
+			 TcS = T90 + T91;
+			 TcT = KP707106781 * (T9u + T9v);
+			 TcU = TcS - TcT;
+			 Teu = TcS + TcT;
+		    }
+		    {
+			 E Td6, Td7, T2F, T2M;
+			 Td6 = KP707106781 * (T95 - T98);
+			 Td7 = T9y - T9x;
+			 Td8 = Td6 + Td7;
+			 Tey = Td7 - Td6;
+			 T2F = TB - TE;
+			 T2M = T2I - T2L;
+			 T2N = T2F + T2M;
+			 T5r = T2F - T2M;
+		    }
+		    {
+			 E T3b, T3i, T92, T99;
+			 T3b = TI - TL;
+			 T3i = T3e - T3h;
+			 T3j = T3b + T3i;
+			 T5v = T3i - T3b;
+			 T92 = T90 - T91;
+			 T99 = KP707106781 * (T95 + T98);
+			 T9a = T92 + T99;
+			 TbE = T92 - T99;
+		    }
+		    {
+			 E T9w, T9z, T6F, T6G;
+			 T9w = KP707106781 * (T9u - T9v);
+			 T9z = T9x + T9y;
+			 T9A = T9w + T9z;
+			 TbI = T9z - T9w;
+			 T6F = T3h + T3e;
+			 T6G = T2L + T2I;
+			 T6H = T6F - T6G;
+			 T7O = T6G + T6F;
+		    }
+	       }
+	       {
+		    E T1G, Taj, T3Q, Ta5, T46, Tak, T6R, Ta6, T1N, Tag, Tah, T3X, T3Z, Taa, Tad;
+		    E T6S, Tdn, Tdo;
+		    {
+			 E T1A, T1B, T1C, T1D, T1E, T1F;
+			 T1A = Cr[WS(csr, 5)];
+			 T1B = Cr[WS(csr, 59)];
+			 T1C = T1A + T1B;
+			 T1D = Cr[WS(csr, 37)];
+			 T1E = Cr[WS(csr, 27)];
+			 T1F = T1D + T1E;
+			 T1G = T1C + T1F;
+			 Taj = T1D - T1E;
+			 T3Q = T1C - T1F;
+			 Ta5 = T1A - T1B;
+		    }
+		    {
+			 E T40, T41, T42, T43, T44, T45;
+			 T40 = Ci[WS(csi, 5)];
+			 T41 = Ci[WS(csi, 59)];
+			 T42 = T40 - T41;
+			 T43 = Ci[WS(csi, 37)];
+			 T44 = Ci[WS(csi, 27)];
+			 T45 = T43 - T44;
+			 T46 = T42 - T45;
+			 Tak = T40 + T41;
+			 T6R = T45 + T42;
+			 Ta6 = T43 + T44;
+		    }
+		    {
+			 E T1J, Ta8, T3W, Ta9, T1M, Tab, T3T, Tac;
+			 {
+			      E T1H, T1I, T3U, T3V;
+			      T1H = Cr[WS(csr, 21)];
+			      T1I = Cr[WS(csr, 43)];
+			      T1J = T1H + T1I;
+			      Ta8 = T1H - T1I;
+			      T3U = Ci[WS(csi, 21)];
+			      T3V = Ci[WS(csi, 43)];
+			      T3W = T3U - T3V;
+			      Ta9 = T3U + T3V;
+			 }
+			 {
+			      E T1K, T1L, T3R, T3S;
+			      T1K = Cr[WS(csr, 11)];
+			      T1L = Cr[WS(csr, 53)];
+			      T1M = T1K + T1L;
+			      Tab = T1K - T1L;
+			      T3R = Ci[WS(csi, 53)];
+			      T3S = Ci[WS(csi, 11)];
+			      T3T = T3R - T3S;
+			      Tac = T3S + T3R;
+			 }
+			 T1N = T1J + T1M;
+			 Tag = Ta8 + Ta9;
+			 Tah = Tab + Tac;
+			 T3X = T3T - T3W;
+			 T3Z = T1J - T1M;
+			 Taa = Ta8 - Ta9;
+			 Tad = Tab - Tac;
+			 T6S = T3W + T3T;
+		    }
+		    T1O = T1G + T1N;
+		    T7V = T6S + T6R;
+		    {
+			 E T3Y, T47, Tdq, Tdr;
+			 T3Y = T3Q + T3X;
+			 T47 = T3Z + T46;
+			 T48 = FNMS(KP382683432, T47, KP923879532 * T3Y);
+			 T4u = FMA(KP382683432, T3Y, KP923879532 * T47);
+			 Tdq = KP707106781 * (Taa - Tad);
+			 Tdr = Tak - Taj;
+			 Tds = Tdq + Tdr;
+			 TeG = Tdr - Tdq;
+		    }
+		    {
+			 E T5C, T5D, Ta7, Tae;
+			 T5C = T3Q - T3X;
+			 T5D = T46 - T3Z;
+			 T5E = FNMS(KP923879532, T5D, KP382683432 * T5C);
+			 T5K = FMA(KP923879532, T5C, KP382683432 * T5D);
+			 Ta7 = Ta5 - Ta6;
+			 Tae = KP707106781 * (Taa + Tad);
+			 Taf = Ta7 + Tae;
+			 TbP = Ta7 - Tae;
+		    }
+		    Tdn = Ta5 + Ta6;
+		    Tdo = KP707106781 * (Tag + Tah);
+		    Tdp = Tdn - Tdo;
+		    TeF = Tdn + Tdo;
+		    {
+			 E T6Q, T6T, Tai, Tal;
+			 T6Q = T1G - T1N;
+			 T6T = T6R - T6S;
+			 T6U = T6Q - T6T;
+			 T72 = T6Q + T6T;
+			 Tai = KP707106781 * (Tag - Tah);
+			 Tal = Taj + Tak;
+			 Tam = Tai + Tal;
+			 TbQ = Tal - Tai;
+		    }
+	       }
+	       {
+		    E T1V, TaC, T49, Tao, T4p, TaD, T6W, Tap, T22, Taz, TaA, T4g, T4i, Tat, Taw;
+		    E T6X, Tdu, Tdv;
+		    {
+			 E T1P, T1Q, T1R, T1S, T1T, T1U;
+			 T1P = Cr[WS(csr, 3)];
+			 T1Q = Cr[WS(csr, 61)];
+			 T1R = T1P + T1Q;
+			 T1S = Cr[WS(csr, 29)];
+			 T1T = Cr[WS(csr, 35)];
+			 T1U = T1S + T1T;
+			 T1V = T1R + T1U;
+			 TaC = T1S - T1T;
+			 T49 = T1R - T1U;
+			 Tao = T1P - T1Q;
+		    }
+		    {
+			 E T4j, T4k, T4l, T4m, T4n, T4o;
+			 T4j = Ci[WS(csi, 61)];
+			 T4k = Ci[WS(csi, 3)];
+			 T4l = T4j - T4k;
+			 T4m = Ci[WS(csi, 29)];
+			 T4n = Ci[WS(csi, 35)];
+			 T4o = T4m - T4n;
+			 T4p = T4l - T4o;
+			 TaD = T4k + T4j;
+			 T6W = T4o + T4l;
+			 Tap = T4m + T4n;
+		    }
+		    {
+			 E T1Y, Tar, T4f, Tas, T21, Tau, T4c, Tav;
+			 {
+			      E T1W, T1X, T4d, T4e;
+			      T1W = Cr[WS(csr, 13)];
+			      T1X = Cr[WS(csr, 51)];
+			      T1Y = T1W + T1X;
+			      Tar = T1W - T1X;
+			      T4d = Ci[WS(csi, 13)];
+			      T4e = Ci[WS(csi, 51)];
+			      T4f = T4d - T4e;
+			      Tas = T4d + T4e;
+			 }
+			 {
+			      E T1Z, T20, T4a, T4b;
+			      T1Z = Cr[WS(csr, 19)];
+			      T20 = Cr[WS(csr, 45)];
+			      T21 = T1Z + T20;
+			      Tau = T1Z - T20;
+			      T4a = Ci[WS(csi, 45)];
+			      T4b = Ci[WS(csi, 19)];
+			      T4c = T4a - T4b;
+			      Tav = T4b + T4a;
+			 }
+			 T22 = T1Y + T21;
+			 Taz = Tar + Tas;
+			 TaA = Tau + Tav;
+			 T4g = T4c - T4f;
+			 T4i = T1Y - T21;
+			 Tat = Tar - Tas;
+			 Taw = Tau - Tav;
+			 T6X = T4f + T4c;
+		    }
+		    T23 = T1V + T22;
+		    T7U = T6X + T6W;
+		    {
+			 E T4h, T4q, Tdx, Tdy;
+			 T4h = T49 + T4g;
+			 T4q = T4i + T4p;
+			 T4r = FMA(KP923879532, T4h, KP382683432 * T4q);
+			 T4v = FNMS(KP382683432, T4h, KP923879532 * T4q);
+			 Tdx = KP707106781 * (Tat - Taw);
+			 Tdy = TaC + TaD;
+			 Tdz = Tdx - Tdy;
+			 TeJ = Tdx + Tdy;
+		    }
+		    {
+			 E T5F, T5G, Taq, Tax;
+			 T5F = T49 - T4g;
+			 T5G = T4p - T4i;
+			 T5H = FMA(KP382683432, T5F, KP923879532 * T5G);
+			 T5L = FNMS(KP923879532, T5F, KP382683432 * T5G);
+			 Taq = Tao - Tap;
+			 Tax = KP707106781 * (Tat + Taw);
+			 Tay = Taq + Tax;
+			 TbS = Taq - Tax;
+		    }
+		    Tdu = Tao + Tap;
+		    Tdv = KP707106781 * (Taz + TaA);
+		    Tdw = Tdu - Tdv;
+		    TeI = Tdu + Tdv;
+		    {
+			 E T6V, T6Y, TaB, TaE;
+			 T6V = T1V - T22;
+			 T6Y = T6W - T6X;
+			 T6Z = T6V + T6Y;
+			 T73 = T6Y - T6V;
+			 TaB = KP707106781 * (Taz - TaA);
+			 TaE = TaC - TaD;
+			 TaF = TaB + TaE;
+			 TbT = TaE - TaB;
+		    }
+	       }
+	       {
+		    E T1m, T3z, T1p, T3C, T3w, T3D, Tdg, Tdf, T9U, T9R, T1t, T3I, T1w, T3L, T3F;
+		    E T3M, Tdj, Tdi, Ta1, T9Y;
+		    {
+			 E T9P, T9T, T9S, T9Q;
+			 {
+			      E T1k, T1l, T3x, T3y;
+			      T1k = Cr[WS(csr, 9)];
+			      T1l = Cr[WS(csr, 55)];
+			      T1m = T1k + T1l;
+			      T9P = T1k - T1l;
+			      T3x = Ci[WS(csi, 9)];
+			      T3y = Ci[WS(csi, 55)];
+			      T3z = T3x - T3y;
+			      T9T = T3x + T3y;
+			 }
+			 {
+			      E T1n, T1o, T3A, T3B;
+			      T1n = Cr[WS(csr, 41)];
+			      T1o = Cr[WS(csr, 23)];
+			      T1p = T1n + T1o;
+			      T9S = T1n - T1o;
+			      T3A = Ci[WS(csi, 41)];
+			      T3B = Ci[WS(csi, 23)];
+			      T3C = T3A - T3B;
+			      T9Q = T3A + T3B;
+			 }
+			 T3w = T1m - T1p;
+			 T3D = T3z - T3C;
+			 Tdg = T9T - T9S;
+			 Tdf = T9P + T9Q;
+			 T9U = T9S + T9T;
+			 T9R = T9P - T9Q;
+		    }
+		    {
+			 E T9W, Ta0, T9Z, T9X;
+			 {
+			      E T1r, T1s, T3G, T3H;
+			      T1r = Cr[WS(csr, 7)];
+			      T1s = Cr[WS(csr, 57)];
+			      T1t = T1r + T1s;
+			      T9W = T1r - T1s;
+			      T3G = Ci[WS(csi, 57)];
+			      T3H = Ci[WS(csi, 7)];
+			      T3I = T3G - T3H;
+			      Ta0 = T3H + T3G;
+			 }
+			 {
+			      E T1u, T1v, T3J, T3K;
+			      T1u = Cr[WS(csr, 25)];
+			      T1v = Cr[WS(csr, 39)];
+			      T1w = T1u + T1v;
+			      T9Z = T1u - T1v;
+			      T3J = Ci[WS(csi, 25)];
+			      T3K = Ci[WS(csi, 39)];
+			      T3L = T3J - T3K;
+			      T9X = T3J + T3K;
+			 }
+			 T3F = T1t - T1w;
+			 T3M = T3I - T3L;
+			 Tdj = T9Z + Ta0;
+			 Tdi = T9W + T9X;
+			 Ta1 = T9Z - Ta0;
+			 T9Y = T9W - T9X;
+		    }
+		    {
+			 E T1q, T1x, Tdh, Tdk;
+			 T1q = T1m + T1p;
+			 T1x = T1t + T1w;
+			 T1y = T1q + T1x;
+			 T75 = T1q - T1x;
+			 Tdh = FNMS(KP923879532, Tdg, KP382683432 * Tdf);
+			 Tdk = FNMS(KP923879532, Tdj, KP382683432 * Tdi);
+			 Tdl = Tdh + Tdk;
+			 TeQ = Tdh - Tdk;
+		    }
+		    {
+			 E TdG, TdH, T3E, T3N;
+			 TdG = FMA(KP923879532, Tdf, KP382683432 * Tdg);
+			 TdH = FMA(KP923879532, Tdi, KP382683432 * Tdj);
+			 TdI = TdG - TdH;
+			 TeD = TdG + TdH;
+			 T3E = T3w - T3D;
+			 T3N = T3F + T3M;
+			 T3O = KP707106781 * (T3E + T3N);
+			 T5N = KP707106781 * (T3E - T3N);
+		    }
+		    {
+			 E T4x, T4y, T9V, Ta2;
+			 T4x = T3w + T3D;
+			 T4y = T3M - T3F;
+			 T4z = KP707106781 * (T4x + T4y);
+			 T5A = KP707106781 * (T4y - T4x);
+			 T9V = FNMS(KP382683432, T9U, KP923879532 * T9R);
+			 Ta2 = FMA(KP923879532, T9Y, KP382683432 * Ta1);
+			 Ta3 = T9V + Ta2;
+			 Tc0 = T9V - Ta2;
+		    }
+		    {
+			 E TaM, TaN, T6M, T6N;
+			 TaM = FMA(KP382683432, T9R, KP923879532 * T9U);
+			 TaN = FNMS(KP382683432, T9Y, KP923879532 * Ta1);
+			 TaO = TaM + TaN;
+			 TbN = TaN - TaM;
+			 T6M = T3L + T3I;
+			 T6N = T3C + T3z;
+			 T6O = T6M - T6N;
+			 T80 = T6N + T6M;
+		    }
+	       }
+	       {
+		    E TQ, T2R, TT, T2U, T2O, T2V, TcW, TcV, T9g, T9d, TX, T30, T10, T33, T2X;
+		    E T34, TcZ, TcY, T9n, T9k;
+		    {
+			 E T9b, T9f, T9e, T9c;
+			 {
+			      E TO, TP, T2P, T2Q;
+			      TO = Cr[WS(csr, 10)];
+			      TP = Cr[WS(csr, 54)];
+			      TQ = TO + TP;
+			      T9b = TO - TP;
+			      T2P = Ci[WS(csi, 10)];
+			      T2Q = Ci[WS(csi, 54)];
+			      T2R = T2P - T2Q;
+			      T9f = T2P + T2Q;
+			 }
+			 {
+			      E TR, TS, T2S, T2T;
+			      TR = Cr[WS(csr, 42)];
+			      TS = Cr[WS(csr, 22)];
+			      TT = TR + TS;
+			      T9e = TR - TS;
+			      T2S = Ci[WS(csi, 42)];
+			      T2T = Ci[WS(csi, 22)];
+			      T2U = T2S - T2T;
+			      T9c = T2S + T2T;
+			 }
+			 T2O = TQ - TT;
+			 T2V = T2R - T2U;
+			 TcW = T9f - T9e;
+			 TcV = T9b + T9c;
+			 T9g = T9e + T9f;
+			 T9d = T9b - T9c;
+		    }
+		    {
+			 E T9i, T9m, T9l, T9j;
+			 {
+			      E TV, TW, T2Y, T2Z;
+			      TV = Cr[WS(csr, 6)];
+			      TW = Cr[WS(csr, 58)];
+			      TX = TV + TW;
+			      T9i = TV - TW;
+			      T2Y = Ci[WS(csi, 58)];
+			      T2Z = Ci[WS(csi, 6)];
+			      T30 = T2Y - T2Z;
+			      T9m = T2Z + T2Y;
+			 }
+			 {
+			      E TY, TZ, T31, T32;
+			      TY = Cr[WS(csr, 26)];
+			      TZ = Cr[WS(csr, 38)];
+			      T10 = TY + TZ;
+			      T9l = TY - TZ;
+			      T31 = Ci[WS(csi, 26)];
+			      T32 = Ci[WS(csi, 38)];
+			      T33 = T31 - T32;
+			      T9j = T31 + T32;
+			 }
+			 T2X = TX - T10;
+			 T34 = T30 - T33;
+			 TcZ = T9l + T9m;
+			 TcY = T9i + T9j;
+			 T9n = T9l - T9m;
+			 T9k = T9i - T9j;
+		    }
+		    {
+			 E TU, T11, TcX, Td0;
+			 TU = TQ + TT;
+			 T11 = TX + T10;
+			 T12 = TU + T11;
+			 T6E = TU - T11;
+			 TcX = FNMS(KP923879532, TcW, KP382683432 * TcV);
+			 Td0 = FNMS(KP923879532, TcZ, KP382683432 * TcY);
+			 Td1 = TcX + Td0;
+			 Tex = TcX - Td0;
+		    }
+		    {
+			 E Td3, Td4, T2W, T35;
+			 Td3 = FMA(KP923879532, TcV, KP382683432 * TcW);
+			 Td4 = FMA(KP923879532, TcY, KP382683432 * TcZ);
+			 Td5 = Td3 - Td4;
+			 Tev = Td3 + Td4;
+			 T2W = T2O - T2V;
+			 T35 = T2X + T34;
+			 T36 = KP707106781 * (T2W + T35);
+			 T5u = KP707106781 * (T2W - T35);
+		    }
+		    {
+			 E T38, T39, T9h, T9o;
+			 T38 = T2O + T2V;
+			 T39 = T34 - T2X;
+			 T3a = KP707106781 * (T38 + T39);
+			 T5s = KP707106781 * (T39 - T38);
+			 T9h = FNMS(KP382683432, T9g, KP923879532 * T9d);
+			 T9o = FMA(KP923879532, T9k, KP382683432 * T9n);
+			 T9p = T9h + T9o;
+			 TbH = T9h - T9o;
+		    }
+		    {
+			 E T9r, T9s, T6A, T6B;
+			 T9r = FMA(KP382683432, T9d, KP923879532 * T9g);
+			 T9s = FNMS(KP382683432, T9k, KP923879532 * T9n);
+			 T9t = T9r + T9s;
+			 TbF = T9s - T9r;
+			 T6A = T33 + T30;
+			 T6B = T2U + T2R;
+			 T6C = T6A - T6B;
+			 T7P = T6B + T6A;
+		    }
+	       }
+	       {
+		    E T13, T8f, Ty, T8e, T25, T8h, T8k, T8p, Ti, T14, T8o;
+		    T13 = KP2_000000000 * (TN + T12);
+		    T8f = KP2_000000000 * (T7P + T7O);
+		    Ti = Ta + Th;
+		    Ty = Ti + Tx;
+		    T8e = Ti - Tx;
+		    {
+			 E T1z, T24, T8i, T8j;
+			 T1z = T1j + T1y;
+			 T24 = T1O + T23;
+			 T25 = KP2_000000000 * (T1z + T24);
+			 T8h = T1z - T24;
+			 T8i = T80 + T7Z;
+			 T8j = T7V + T7U;
+			 T8k = T8i - T8j;
+			 T8p = KP2_000000000 * (T8j + T8i);
+		    }
+		    T14 = Ty + T13;
+		    R0[WS(rs, 32)] = T14 - T25;
+		    R0[0] = T14 + T25;
+		    T8o = Ty - T13;
+		    R0[WS(rs, 16)] = T8o - T8p;
+		    R0[WS(rs, 48)] = T8o + T8p;
+		    {
+			 E T8g, T8l, T8m, T8n;
+			 T8g = T8e - T8f;
+			 T8l = KP1_414213562 * (T8h - T8k);
+			 R0[WS(rs, 40)] = T8g - T8l;
+			 R0[WS(rs, 8)] = T8g + T8l;
+			 T8m = T8e + T8f;
+			 T8n = KP1_414213562 * (T8h + T8k);
+			 R0[WS(rs, 24)] = T8m - T8n;
+			 R0[WS(rs, 56)] = T8m + T8n;
+		    }
+	       }
+	       {
+		    E T7M, T86, T82, T8a, T7R, T87, T7X, T89, T7K, T7Y, T81;
+		    T7K = Ta - Th;
+		    T7M = T7K - T7L;
+		    T86 = T7K + T7L;
+		    T7Y = T1O - T23;
+		    T81 = T7Z - T80;
+		    T82 = T7Y + T81;
+		    T8a = T81 - T7Y;
+		    {
+			 E T7N, T7Q, T7T, T7W;
+			 T7N = TN - T12;
+			 T7Q = T7O - T7P;
+			 T7R = KP1_414213562 * (T7N - T7Q);
+			 T87 = KP1_414213562 * (T7N + T7Q);
+			 T7T = T1j - T1y;
+			 T7W = T7U - T7V;
+			 T7X = T7T + T7W;
+			 T89 = T7T - T7W;
+		    }
+		    {
+			 E T7S, T83, T8c, T8d;
+			 T7S = T7M + T7R;
+			 T83 = FNMS(KP765366864, T82, KP1_847759065 * T7X);
+			 R0[WS(rs, 36)] = T7S - T83;
+			 R0[WS(rs, 4)] = T7S + T83;
+			 T8c = T86 + T87;
+			 T8d = FMA(KP1_847759065, T89, KP765366864 * T8a);
+			 R0[WS(rs, 28)] = T8c - T8d;
+			 R0[WS(rs, 60)] = T8c + T8d;
+		    }
+		    {
+			 E T84, T85, T88, T8b;
+			 T84 = T7M - T7R;
+			 T85 = FMA(KP765366864, T7X, KP1_847759065 * T82);
+			 R0[WS(rs, 20)] = T84 - T85;
+			 R0[WS(rs, 52)] = T84 + T85;
+			 T88 = T86 - T87;
+			 T8b = FNMS(KP1_847759065, T8a, KP765366864 * T89);
+			 R0[WS(rs, 44)] = T88 - T8b;
+			 R0[WS(rs, 12)] = T88 + T8b;
+		    }
+	       }
+	       {
+		    E T2E, T4O, T4K, T4S, T3l, T4P, T4t, T4R;
+		    {
+			 E T2k, T2D, T4w, T4J;
+			 T2k = T2a + T2j;
+			 T2D = FNMS(KP765366864, T2C, KP1_847759065 * T2t);
+			 T2E = T2k + T2D;
+			 T4O = T2k - T2D;
+			 T4w = T4u + T4v;
+			 T4J = T4z + T4I;
+			 T4K = T4w + T4J;
+			 T4S = T4J - T4w;
+		    }
+		    {
+			 E T37, T3k, T3P, T4s;
+			 T37 = T2N + T36;
+			 T3k = T3a + T3j;
+			 T3l = FNMS(KP390180644, T3k, KP1_961570560 * T37);
+			 T4P = FMA(KP390180644, T37, KP1_961570560 * T3k);
+			 T3P = T3v + T3O;
+			 T4s = T48 + T4r;
+			 T4t = T3P + T4s;
+			 T4R = T3P - T4s;
+		    }
+		    {
+			 E T3m, T4L, T4U, T4V;
+			 T3m = T2E + T3l;
+			 T4L = FNMS(KP196034280, T4K, KP1_990369453 * T4t);
+			 R0[WS(rs, 33)] = T3m - T4L;
+			 R0[WS(rs, 1)] = T3m + T4L;
+			 T4U = T4O + T4P;
+			 T4V = FMA(KP1_546020906, T4R, KP1_268786568 * T4S);
+			 R0[WS(rs, 25)] = T4U - T4V;
+			 R0[WS(rs, 57)] = T4U + T4V;
+		    }
+		    {
+			 E T4M, T4N, T4Q, T4T;
+			 T4M = T2E - T3l;
+			 T4N = FMA(KP196034280, T4t, KP1_990369453 * T4K);
+			 R0[WS(rs, 17)] = T4M - T4N;
+			 R0[WS(rs, 49)] = T4M + T4N;
+			 T4Q = T4O - T4P;
+			 T4T = FNMS(KP1_546020906, T4S, KP1_268786568 * T4R);
+			 R0[WS(rs, 41)] = T4Q - T4T;
+			 R0[WS(rs, 9)] = T4Q + T4T;
+		    }
+	       }
+	       {
+		    E T6y, T7e, T7a, T7i, T6J, T7f, T71, T7h;
+		    {
+			 E T6s, T6x, T74, T79;
+			 T6s = T6q - T6r;
+			 T6x = KP1_414213562 * (T6t - T6w);
+			 T6y = T6s + T6x;
+			 T7e = T6s - T6x;
+			 T74 = KP707106781 * (T72 + T73);
+			 T79 = T75 + T78;
+			 T7a = T74 + T79;
+			 T7i = T79 - T74;
+		    }
+		    {
+			 E T6D, T6I, T6P, T70;
+			 T6D = T6z + T6C;
+			 T6I = T6E + T6H;
+			 T6J = FNMS(KP765366864, T6I, KP1_847759065 * T6D);
+			 T7f = FMA(KP765366864, T6D, KP1_847759065 * T6I);
+			 T6P = T6L + T6O;
+			 T70 = KP707106781 * (T6U + T6Z);
+			 T71 = T6P + T70;
+			 T7h = T6P - T70;
+		    }
+		    {
+			 E T6K, T7b, T7k, T7l;
+			 T6K = T6y + T6J;
+			 T7b = FNMS(KP390180644, T7a, KP1_961570560 * T71);
+			 R0[WS(rs, 34)] = T6K - T7b;
+			 R0[WS(rs, 2)] = T6K + T7b;
+			 T7k = T7e + T7f;
+			 T7l = FMA(KP1_662939224, T7h, KP1_111140466 * T7i);
+			 R0[WS(rs, 26)] = T7k - T7l;
+			 R0[WS(rs, 58)] = T7k + T7l;
+		    }
+		    {
+			 E T7c, T7d, T7g, T7j;
+			 T7c = T6y - T6J;
+			 T7d = FMA(KP390180644, T71, KP1_961570560 * T7a);
+			 R0[WS(rs, 18)] = T7c - T7d;
+			 R0[WS(rs, 50)] = T7c + T7d;
+			 T7g = T7e - T7f;
+			 T7j = FNMS(KP1_662939224, T7i, KP1_111140466 * T7h);
+			 R0[WS(rs, 42)] = T7g - T7j;
+			 R0[WS(rs, 10)] = T7g + T7j;
+		    }
+	       }
+	       {
+		    E T4Y, T5c, T58, T5g, T51, T5d, T55, T5f;
+		    {
+			 E T4W, T4X, T56, T57;
+			 T4W = T2a - T2j;
+			 T4X = FMA(KP765366864, T2t, KP1_847759065 * T2C);
+			 T4Y = T4W - T4X;
+			 T5c = T4W + T4X;
+			 T56 = T48 - T4r;
+			 T57 = T4I - T4z;
+			 T58 = T56 + T57;
+			 T5g = T57 - T56;
+		    }
+		    {
+			 E T4Z, T50, T53, T54;
+			 T4Z = T2N - T36;
+			 T50 = T3j - T3a;
+			 T51 = FNMS(KP1_662939224, T50, KP1_111140466 * T4Z);
+			 T5d = FMA(KP1_662939224, T4Z, KP1_111140466 * T50);
+			 T53 = T3v - T3O;
+			 T54 = T4v - T4u;
+			 T55 = T53 + T54;
+			 T5f = T53 - T54;
+		    }
+		    {
+			 E T52, T59, T5i, T5j;
+			 T52 = T4Y + T51;
+			 T59 = FNMS(KP942793473, T58, KP1_763842528 * T55);
+			 R0[WS(rs, 37)] = T52 - T59;
+			 R0[WS(rs, 5)] = T52 + T59;
+			 T5i = T5c + T5d;
+			 T5j = FMA(KP1_913880671, T5f, KP580569354 * T5g);
+			 R0[WS(rs, 29)] = T5i - T5j;
+			 R0[WS(rs, 61)] = T5i + T5j;
+		    }
+		    {
+			 E T5a, T5b, T5e, T5h;
+			 T5a = T4Y - T51;
+			 T5b = FMA(KP942793473, T55, KP1_763842528 * T58);
+			 R0[WS(rs, 21)] = T5a - T5b;
+			 R0[WS(rs, 53)] = T5a + T5b;
+			 T5e = T5c - T5d;
+			 T5h = FNMS(KP1_913880671, T5g, KP580569354 * T5f);
+			 R0[WS(rs, 45)] = T5e - T5h;
+			 R0[WS(rs, 13)] = T5e + T5h;
+		    }
+	       }
+	       {
+		    E T7o, T7C, T7y, T7G, T7r, T7D, T7v, T7F;
+		    {
+			 E T7m, T7n, T7w, T7x;
+			 T7m = T6q + T6r;
+			 T7n = KP1_414213562 * (T6t + T6w);
+			 T7o = T7m - T7n;
+			 T7C = T7m + T7n;
+			 T7w = KP707106781 * (T6U - T6Z);
+			 T7x = T78 - T75;
+			 T7y = T7w + T7x;
+			 T7G = T7x - T7w;
+		    }
+		    {
+			 E T7p, T7q, T7t, T7u;
+			 T7p = T6z - T6C;
+			 T7q = T6H - T6E;
+			 T7r = FNMS(KP1_847759065, T7q, KP765366864 * T7p);
+			 T7D = FMA(KP1_847759065, T7p, KP765366864 * T7q);
+			 T7t = T6L - T6O;
+			 T7u = KP707106781 * (T73 - T72);
+			 T7v = T7t + T7u;
+			 T7F = T7t - T7u;
+		    }
+		    {
+			 E T7s, T7z, T7I, T7J;
+			 T7s = T7o + T7r;
+			 T7z = FNMS(KP1_111140466, T7y, KP1_662939224 * T7v);
+			 R0[WS(rs, 38)] = T7s - T7z;
+			 R0[WS(rs, 6)] = T7s + T7z;
+			 T7I = T7C + T7D;
+			 T7J = FMA(KP1_961570560, T7F, KP390180644 * T7G);
+			 R0[WS(rs, 30)] = T7I - T7J;
+			 R0[WS(rs, 62)] = T7I + T7J;
+		    }
+		    {
+			 E T7A, T7B, T7E, T7H;
+			 T7A = T7o - T7r;
+			 T7B = FMA(KP1_111140466, T7v, KP1_662939224 * T7y);
+			 R0[WS(rs, 22)] = T7A - T7B;
+			 R0[WS(rs, 54)] = T7A + T7B;
+			 T7E = T7C - T7D;
+			 T7H = FNMS(KP1_961570560, T7G, KP390180644 * T7F);
+			 R0[WS(rs, 46)] = T7E - T7H;
+			 R0[WS(rs, 14)] = T7E + T7H;
+		    }
+	       }
+	       {
+		    E T5q, T5U, T5Q, T5Y, T5x, T5V, T5J, T5X;
+		    {
+			 E T5m, T5p, T5M, T5P;
+			 T5m = T5k - T5l;
+			 T5p = FNMS(KP1_847759065, T5o, KP765366864 * T5n);
+			 T5q = T5m + T5p;
+			 T5U = T5m - T5p;
+			 T5M = T5K + T5L;
+			 T5P = T5N + T5O;
+			 T5Q = T5M + T5P;
+			 T5Y = T5P - T5M;
+		    }
+		    {
+			 E T5t, T5w, T5B, T5I;
+			 T5t = T5r + T5s;
+			 T5w = T5u + T5v;
+			 T5x = FNMS(KP1_111140466, T5w, KP1_662939224 * T5t);
+			 T5V = FMA(KP1_111140466, T5t, KP1_662939224 * T5w);
+			 T5B = T5z + T5A;
+			 T5I = T5E + T5H;
+			 T5J = T5B + T5I;
+			 T5X = T5B - T5I;
+		    }
+		    {
+			 E T5y, T5R, T60, T61;
+			 T5y = T5q + T5x;
+			 T5R = FNMS(KP580569354, T5Q, KP1_913880671 * T5J);
+			 R0[WS(rs, 35)] = T5y - T5R;
+			 R0[WS(rs, 3)] = T5y + T5R;
+			 T60 = T5U + T5V;
+			 T61 = FMA(KP1_763842528, T5X, KP942793473 * T5Y);
+			 R0[WS(rs, 27)] = T60 - T61;
+			 R0[WS(rs, 59)] = T60 + T61;
+		    }
+		    {
+			 E T5S, T5T, T5W, T5Z;
+			 T5S = T5q - T5x;
+			 T5T = FMA(KP580569354, T5J, KP1_913880671 * T5Q);
+			 R0[WS(rs, 19)] = T5S - T5T;
+			 R0[WS(rs, 51)] = T5S + T5T;
+			 T5W = T5U - T5V;
+			 T5Z = FNMS(KP1_763842528, T5Y, KP942793473 * T5X);
+			 R0[WS(rs, 43)] = T5W - T5Z;
+			 R0[WS(rs, 11)] = T5W + T5Z;
+		    }
+	       }
+	       {
+		    E T64, T6i, T6e, T6m, T67, T6j, T6b, T6l;
+		    {
+			 E T62, T63, T6c, T6d;
+			 T62 = T5k + T5l;
+			 T63 = FMA(KP1_847759065, T5n, KP765366864 * T5o);
+			 T64 = T62 - T63;
+			 T6i = T62 + T63;
+			 T6c = T5E - T5H;
+			 T6d = T5O - T5N;
+			 T6e = T6c + T6d;
+			 T6m = T6d - T6c;
+		    }
+		    {
+			 E T65, T66, T69, T6a;
+			 T65 = T5r - T5s;
+			 T66 = T5v - T5u;
+			 T67 = FNMS(KP1_961570560, T66, KP390180644 * T65);
+			 T6j = FMA(KP1_961570560, T65, KP390180644 * T66);
+			 T69 = T5z - T5A;
+			 T6a = T5L - T5K;
+			 T6b = T69 + T6a;
+			 T6l = T69 - T6a;
+		    }
+		    {
+			 E T68, T6f, T6o, T6p;
+			 T68 = T64 + T67;
+			 T6f = FNMS(KP1_268786568, T6e, KP1_546020906 * T6b);
+			 R0[WS(rs, 39)] = T68 - T6f;
+			 R0[WS(rs, 7)] = T68 + T6f;
+			 T6o = T6i + T6j;
+			 T6p = FMA(KP1_990369453, T6l, KP196034280 * T6m);
+			 R0[WS(rs, 31)] = T6o - T6p;
+			 R0[WS(rs, 63)] = T6o + T6p;
+		    }
+		    {
+			 E T6g, T6h, T6k, T6n;
+			 T6g = T64 - T67;
+			 T6h = FMA(KP1_268786568, T6b, KP1_546020906 * T6e);
+			 R0[WS(rs, 23)] = T6g - T6h;
+			 R0[WS(rs, 55)] = T6g + T6h;
+			 T6k = T6i - T6j;
+			 T6n = FNMS(KP1_990369453, T6m, KP196034280 * T6l);
+			 R0[WS(rs, 47)] = T6k - T6n;
+			 R0[WS(rs, 15)] = T6k + T6n;
+		    }
+	       }
+	       {
+		    E T8Z, Tb1, T9C, Tb2, Tbe, Tbq, Tbb, Tbp, TaX, Tbs, Tb5, Tbi, TaI, Tbt, Tb4;
+		    E Tbl;
+		    {
+			 E T8F, T8Y, Tb9, Tba;
+			 T8F = T8x + T8E;
+			 T8Y = FNMS(KP390180644, T8X, KP1_961570560 * T8Q);
+			 T8Z = T8F + T8Y;
+			 Tb1 = T8F - T8Y;
+			 {
+			      E T9q, T9B, Tbc, Tbd;
+			      T9q = T9a + T9p;
+			      T9B = T9t + T9A;
+			      T9C = FNMS(KP196034280, T9B, KP1_990369453 * T9q);
+			      Tb2 = FMA(KP196034280, T9q, KP1_990369453 * T9B);
+			      Tbc = T9a - T9p;
+			      Tbd = T9A - T9t;
+			      Tbe = FNMS(KP1_546020906, Tbd, KP1_268786568 * Tbc);
+			      Tbq = FMA(KP1_546020906, Tbc, KP1_268786568 * Tbd);
+			 }
+			 Tb9 = T8x - T8E;
+			 Tba = FMA(KP390180644, T8Q, KP1_961570560 * T8X);
+			 Tbb = Tb9 - Tba;
+			 Tbp = Tb9 + Tba;
+			 {
+			      E TaW, Tbg, TaL, Tbh, TaJ, TaK;
+			      TaW = TaO + TaV;
+			      Tbg = T9O - Ta3;
+			      TaJ = FMA(KP195090322, Taf, KP980785280 * Tam);
+			      TaK = FNMS(KP195090322, Tay, KP980785280 * TaF);
+			      TaL = TaJ + TaK;
+			      Tbh = TaK - TaJ;
+			      TaX = TaL + TaW;
+			      Tbs = Tbg - Tbh;
+			      Tb5 = TaW - TaL;
+			      Tbi = Tbg + Tbh;
+			 }
+			 {
+			      E Ta4, Tbk, TaH, Tbj, Tan, TaG;
+			      Ta4 = T9O + Ta3;
+			      Tbk = TaV - TaO;
+			      Tan = FNMS(KP195090322, Tam, KP980785280 * Taf);
+			      TaG = FMA(KP980785280, Tay, KP195090322 * TaF);
+			      TaH = Tan + TaG;
+			      Tbj = Tan - TaG;
+			      TaI = Ta4 + TaH;
+			      Tbt = Tbk - Tbj;
+			      Tb4 = Ta4 - TaH;
+			      Tbl = Tbj + Tbk;
+			 }
+		    }
+		    {
+			 E T9D, TaY, Tbr, Tbu;
+			 T9D = T8Z + T9C;
+			 TaY = FNMS(KP098135348, TaX, KP1_997590912 * TaI);
+			 R1[WS(rs, 32)] = T9D - TaY;
+			 R1[0] = T9D + TaY;
+			 Tbr = Tbp - Tbq;
+			 Tbu = FNMS(KP1_883088130, Tbt, KP673779706 * Tbs);
+			 R1[WS(rs, 44)] = Tbr - Tbu;
+			 R1[WS(rs, 12)] = Tbr + Tbu;
+		    }
+		    {
+			 E Tbv, Tbw, TaZ, Tb0;
+			 Tbv = Tbp + Tbq;
+			 Tbw = FMA(KP1_883088130, Tbs, KP673779706 * Tbt);
+			 R1[WS(rs, 28)] = Tbv - Tbw;
+			 R1[WS(rs, 60)] = Tbv + Tbw;
+			 TaZ = T8Z - T9C;
+			 Tb0 = FMA(KP098135348, TaI, KP1_997590912 * TaX);
+			 R1[WS(rs, 16)] = TaZ - Tb0;
+			 R1[WS(rs, 48)] = TaZ + Tb0;
+		    }
+		    {
+			 E Tb3, Tb6, Tbf, Tbm;
+			 Tb3 = Tb1 - Tb2;
+			 Tb6 = FNMS(KP1_481902250, Tb5, KP1_343117909 * Tb4);
+			 R1[WS(rs, 40)] = Tb3 - Tb6;
+			 R1[WS(rs, 8)] = Tb3 + Tb6;
+			 Tbf = Tbb + Tbe;
+			 Tbm = FNMS(KP855110186, Tbl, KP1_807978586 * Tbi);
+			 R1[WS(rs, 36)] = Tbf - Tbm;
+			 R1[WS(rs, 4)] = Tbf + Tbm;
+		    }
+		    {
+			 E Tbn, Tbo, Tb7, Tb8;
+			 Tbn = Tbb - Tbe;
+			 Tbo = FMA(KP855110186, Tbi, KP1_807978586 * Tbl);
+			 R1[WS(rs, 20)] = Tbn - Tbo;
+			 R1[WS(rs, 52)] = Tbn + Tbo;
+			 Tb7 = Tb1 + Tb2;
+			 Tb8 = FMA(KP1_481902250, Tb4, KP1_343117909 * Tb5);
+			 R1[WS(rs, 24)] = Tb7 - Tb8;
+			 R1[WS(rs, 56)] = Tb7 + Tb8;
+		    }
+	       }
+	       {
+		    E TcR, TdR, Tda, TdS, Te4, Teg, Te1, Tef, TdN, Tei, TdV, Te8, TdC, Tej, TdU;
+		    E Teb;
+		    {
+			 E TcJ, TcQ, TdZ, Te0;
+			 TcJ = TcF + TcI;
+			 TcQ = FNMS(KP1_111140466, TcP, KP1_662939224 * TcM);
+			 TcR = TcJ + TcQ;
+			 TdR = TcJ - TcQ;
+			 {
+			      E Td2, Td9, Te2, Te3;
+			      Td2 = TcU + Td1;
+			      Td9 = Td5 + Td8;
+			      Tda = FNMS(KP580569354, Td9, KP1_913880671 * Td2);
+			      TdS = FMA(KP580569354, Td2, KP1_913880671 * Td9);
+			      Te2 = TcU - Td1;
+			      Te3 = Td8 - Td5;
+			      Te4 = FNMS(KP1_763842528, Te3, KP942793473 * Te2);
+			      Teg = FMA(KP1_763842528, Te2, KP942793473 * Te3);
+			 }
+			 TdZ = TcF - TcI;
+			 Te0 = FMA(KP1_111140466, TcM, KP1_662939224 * TcP);
+			 Te1 = TdZ - Te0;
+			 Tef = TdZ + Te0;
+			 {
+			      E TdM, Te6, TdF, Te7, TdD, TdE;
+			      TdM = TdI + TdL;
+			      Te6 = Tde - Tdl;
+			      TdD = FMA(KP555570233, Tdp, KP831469612 * Tds);
+			      TdE = FNMS(KP555570233, Tdw, KP831469612 * Tdz);
+			      TdF = TdD + TdE;
+			      Te7 = TdE - TdD;
+			      TdN = TdF + TdM;
+			      Tei = Te6 - Te7;
+			      TdV = TdM - TdF;
+			      Te8 = Te6 + Te7;
+			 }
+			 {
+			      E Tdm, Tea, TdB, Te9, Tdt, TdA;
+			      Tdm = Tde + Tdl;
+			      Tea = TdL - TdI;
+			      Tdt = FNMS(KP555570233, Tds, KP831469612 * Tdp);
+			      TdA = FMA(KP831469612, Tdw, KP555570233 * Tdz);
+			      TdB = Tdt + TdA;
+			      Te9 = Tdt - TdA;
+			      TdC = Tdm + TdB;
+			      Tej = Tea - Te9;
+			      TdU = Tdm - TdB;
+			      Teb = Te9 + Tea;
+			 }
+		    }
+		    {
+			 E Tdb, TdO, Teh, Tek;
+			 Tdb = TcR + Tda;
+			 TdO = FNMS(KP293460948, TdN, KP1_978353019 * TdC);
+			 R1[WS(rs, 33)] = Tdb - TdO;
+			 R1[WS(rs, 1)] = Tdb + TdO;
+			 Teh = Tef - Teg;
+			 Tek = FNMS(KP1_940062506, Tej, KP485960359 * Tei);
+			 R1[WS(rs, 45)] = Teh - Tek;
+			 R1[WS(rs, 13)] = Teh + Tek;
+		    }
+		    {
+			 E Tel, Tem, TdP, TdQ;
+			 Tel = Tef + Teg;
+			 Tem = FMA(KP1_940062506, Tei, KP485960359 * Tej);
+			 R1[WS(rs, 29)] = Tel - Tem;
+			 R1[WS(rs, 61)] = Tel + Tem;
+			 TdP = TcR - Tda;
+			 TdQ = FMA(KP293460948, TdC, KP1_978353019 * TdN);
+			 R1[WS(rs, 17)] = TdP - TdQ;
+			 R1[WS(rs, 49)] = TdP + TdQ;
+		    }
+		    {
+			 E TdT, TdW, Te5, Tec;
+			 TdT = TdR - TdS;
+			 TdW = FNMS(KP1_606415062, TdV, KP1_191398608 * TdU);
+			 R1[WS(rs, 41)] = TdT - TdW;
+			 R1[WS(rs, 9)] = TdT + TdW;
+			 Te5 = Te1 + Te4;
+			 Tec = FNMS(KP1_028205488, Teb, KP1_715457220 * Te8);
+			 R1[WS(rs, 37)] = Te5 - Tec;
+			 R1[WS(rs, 5)] = Te5 + Tec;
+		    }
+		    {
+			 E Ted, Tee, TdX, TdY;
+			 Ted = Te1 - Te4;
+			 Tee = FMA(KP1_028205488, Te8, KP1_715457220 * Teb);
+			 R1[WS(rs, 21)] = Ted - Tee;
+			 R1[WS(rs, 53)] = Ted + Tee;
+			 TdX = TdR + TdS;
+			 TdY = FMA(KP1_606415062, TdU, KP1_191398608 * TdV);
+			 R1[WS(rs, 25)] = TdX - TdY;
+			 R1[WS(rs, 57)] = TdX + TdY;
+		    }
+	       }
+	       {
+		    E TbD, Tc7, TbK, Tc8, Tck, Tcw, Tch, Tcv, Tc3, Tcy, Tcb, Tco, TbW, Tcz, Tca;
+		    E Tcr;
+		    {
+			 E Tbz, TbC, Tcf, Tcg;
+			 Tbz = Tbx - Tby;
+			 TbC = FNMS(KP1_662939224, TbB, KP1_111140466 * TbA);
+			 TbD = Tbz + TbC;
+			 Tc7 = Tbz - TbC;
+			 {
+			      E TbG, TbJ, Tci, Tcj;
+			      TbG = TbE + TbF;
+			      TbJ = TbH + TbI;
+			      TbK = FNMS(KP942793473, TbJ, KP1_763842528 * TbG);
+			      Tc8 = FMA(KP942793473, TbG, KP1_763842528 * TbJ);
+			      Tci = TbE - TbF;
+			      Tcj = TbI - TbH;
+			      Tck = FNMS(KP1_913880671, Tcj, KP580569354 * Tci);
+			      Tcw = FMA(KP1_913880671, Tci, KP580569354 * Tcj);
+			 }
+			 Tcf = Tbx + Tby;
+			 Tcg = FMA(KP1_662939224, TbA, KP1_111140466 * TbB);
+			 Tch = Tcf - Tcg;
+			 Tcv = Tcf + Tcg;
+			 {
+			      E Tc2, Tcm, TbZ, Tcn, TbX, TbY;
+			      Tc2 = Tc0 + Tc1;
+			      Tcm = TbM - TbN;
+			      TbX = FMA(KP831469612, TbP, KP555570233 * TbQ);
+			      TbY = FNMS(KP831469612, TbS, KP555570233 * TbT);
+			      TbZ = TbX + TbY;
+			      Tcn = TbY - TbX;
+			      Tc3 = TbZ + Tc2;
+			      Tcy = Tcm - Tcn;
+			      Tcb = Tc2 - TbZ;
+			      Tco = Tcm + Tcn;
+			 }
+			 {
+			      E TbO, Tcq, TbV, Tcp, TbR, TbU;
+			      TbO = TbM + TbN;
+			      Tcq = Tc1 - Tc0;
+			      TbR = FNMS(KP831469612, TbQ, KP555570233 * TbP);
+			      TbU = FMA(KP555570233, TbS, KP831469612 * TbT);
+			      TbV = TbR + TbU;
+			      Tcp = TbR - TbU;
+			      TbW = TbO + TbV;
+			      Tcz = Tcq - Tcp;
+			      Tca = TbO - TbV;
+			      Tcr = Tcp + Tcq;
+			 }
+		    }
+		    {
+			 E TbL, Tc4, Tcx, TcA;
+			 TbL = TbD + TbK;
+			 Tc4 = FNMS(KP485960359, Tc3, KP1_940062506 * TbW);
+			 R1[WS(rs, 34)] = TbL - Tc4;
+			 R1[WS(rs, 2)] = TbL + Tc4;
+			 Tcx = Tcv - Tcw;
+			 TcA = FNMS(KP1_978353019, Tcz, KP293460948 * Tcy);
+			 R1[WS(rs, 46)] = Tcx - TcA;
+			 R1[WS(rs, 14)] = Tcx + TcA;
+		    }
+		    {
+			 E TcB, TcC, Tc5, Tc6;
+			 TcB = Tcv + Tcw;
+			 TcC = FMA(KP1_978353019, Tcy, KP293460948 * Tcz);
+			 R1[WS(rs, 30)] = TcB - TcC;
+			 R1[WS(rs, 62)] = TcB + TcC;
+			 Tc5 = TbD - TbK;
+			 Tc6 = FMA(KP485960359, TbW, KP1_940062506 * Tc3);
+			 R1[WS(rs, 18)] = Tc5 - Tc6;
+			 R1[WS(rs, 50)] = Tc5 + Tc6;
+		    }
+		    {
+			 E Tc9, Tcc, Tcl, Tcs;
+			 Tc9 = Tc7 - Tc8;
+			 Tcc = FNMS(KP1_715457220, Tcb, KP1_028205488 * Tca);
+			 R1[WS(rs, 42)] = Tc9 - Tcc;
+			 R1[WS(rs, 10)] = Tc9 + Tcc;
+			 Tcl = Tch + Tck;
+			 Tcs = FNMS(KP1_191398608, Tcr, KP1_606415062 * Tco);
+			 R1[WS(rs, 38)] = Tcl - Tcs;
+			 R1[WS(rs, 6)] = Tcl + Tcs;
+		    }
+		    {
+			 E Tct, Tcu, Tcd, Tce;
+			 Tct = Tch - Tck;
+			 Tcu = FMA(KP1_191398608, Tco, KP1_606415062 * Tcr);
+			 R1[WS(rs, 22)] = Tct - Tcu;
+			 R1[WS(rs, 54)] = Tct + Tcu;
+			 Tcd = Tc7 + Tc8;
+			 Tce = FMA(KP1_715457220, Tca, KP1_028205488 * Tcb);
+			 R1[WS(rs, 26)] = Tcd - Tce;
+			 R1[WS(rs, 58)] = Tcd + Tce;
+		    }
+	       }
+	       {
+		    E Tet, TeX, TeA, TeY, Tfa, Tfm, Tf7, Tfl, TeT, Tfo, Tf1, Tfe, TeM, Tfp, Tf0;
+		    E Tfh;
+		    {
+			 E Tep, Tes, Tf5, Tf6;
+			 Tep = Ten - Teo;
+			 Tes = FNMS(KP1_961570560, Ter, KP390180644 * Teq);
+			 Tet = Tep + Tes;
+			 TeX = Tep - Tes;
+			 {
+			      E Tew, Tez, Tf8, Tf9;
+			      Tew = Teu - Tev;
+			      Tez = Tex + Tey;
+			      TeA = FNMS(KP1_268786568, Tez, KP1_546020906 * Tew);
+			      TeY = FMA(KP1_268786568, Tew, KP1_546020906 * Tez);
+			      Tf8 = Teu + Tev;
+			      Tf9 = Tey - Tex;
+			      Tfa = FNMS(KP1_990369453, Tf9, KP196034280 * Tf8);
+			      Tfm = FMA(KP1_990369453, Tf8, KP196034280 * Tf9);
+			 }
+			 Tf5 = Ten + Teo;
+			 Tf6 = FMA(KP1_961570560, Teq, KP390180644 * Ter);
+			 Tf7 = Tf5 - Tf6;
+			 Tfl = Tf5 + Tf6;
+			 {
+			      E TeS, Tfc, TeP, Tfd, TeN, TeO;
+			      TeS = TeQ + TeR;
+			      Tfc = TeC + TeD;
+			      TeN = FMA(KP980785280, TeF, KP195090322 * TeG);
+			      TeO = FMA(KP980785280, TeI, KP195090322 * TeJ);
+			      TeP = TeN - TeO;
+			      Tfd = TeN + TeO;
+			      TeT = TeP + TeS;
+			      Tfo = Tfc + Tfd;
+			      Tf1 = TeS - TeP;
+			      Tfe = Tfc - Tfd;
+			 }
+			 {
+			      E TeE, Tfg, TeL, Tff, TeH, TeK;
+			      TeE = TeC - TeD;
+			      Tfg = TeR - TeQ;
+			      TeH = FNMS(KP980785280, TeG, KP195090322 * TeF);
+			      TeK = FNMS(KP980785280, TeJ, KP195090322 * TeI);
+			      TeL = TeH + TeK;
+			      Tff = TeH - TeK;
+			      TeM = TeE + TeL;
+			      Tfp = Tfg - Tff;
+			      Tf0 = TeE - TeL;
+			      Tfh = Tff + Tfg;
+			 }
+		    }
+		    {
+			 E TeB, TeU, Tfn, Tfq;
+			 TeB = Tet + TeA;
+			 TeU = FNMS(KP673779706, TeT, KP1_883088130 * TeM);
+			 R1[WS(rs, 35)] = TeB - TeU;
+			 R1[WS(rs, 3)] = TeB + TeU;
+			 Tfn = Tfl - Tfm;
+			 Tfq = FNMS(KP1_997590912, Tfp, KP098135348 * Tfo);
+			 R1[WS(rs, 47)] = Tfn - Tfq;
+			 R1[WS(rs, 15)] = Tfn + Tfq;
+		    }
+		    {
+			 E Tfr, Tfs, TeV, TeW;
+			 Tfr = Tfl + Tfm;
+			 Tfs = FMA(KP1_997590912, Tfo, KP098135348 * Tfp);
+			 R1[WS(rs, 31)] = Tfr - Tfs;
+			 R1[WS(rs, 63)] = Tfr + Tfs;
+			 TeV = Tet - TeA;
+			 TeW = FMA(KP673779706, TeM, KP1_883088130 * TeT);
+			 R1[WS(rs, 19)] = TeV - TeW;
+			 R1[WS(rs, 51)] = TeV + TeW;
+		    }
+		    {
+			 E TeZ, Tf2, Tfb, Tfi;
+			 TeZ = TeX - TeY;
+			 Tf2 = FNMS(KP1_807978586, Tf1, KP855110186 * Tf0);
+			 R1[WS(rs, 43)] = TeZ - Tf2;
+			 R1[WS(rs, 11)] = TeZ + Tf2;
+			 Tfb = Tf7 + Tfa;
+			 Tfi = FNMS(KP1_343117909, Tfh, KP1_481902250 * Tfe);
+			 R1[WS(rs, 39)] = Tfb - Tfi;
+			 R1[WS(rs, 7)] = Tfb + Tfi;
+		    }
+		    {
+			 E Tfj, Tfk, Tf3, Tf4;
+			 Tfj = Tf7 - Tfa;
+			 Tfk = FMA(KP1_343117909, Tfe, KP1_481902250 * Tfh);
+			 R1[WS(rs, 23)] = Tfj - Tfk;
+			 R1[WS(rs, 55)] = Tfj + Tfk;
+			 Tf3 = TeX + TeY;
+			 Tf4 = FMA(KP1_807978586, Tf0, KP855110186 * Tf1);
+			 R1[WS(rs, 27)] = Tf3 - Tf4;
+			 R1[WS(rs, 59)] = Tf3 + Tf4;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 128, "r2cb_128", {812, 198, 144, 0}, &GENUS };
+
+void X(codelet_r2cb_128) (planner *p) {
+     X(kr2c_register) (p, r2cb_128, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:08 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 13 -name r2cb_13 -include r2cb.h */
+
+/*
+ * This function contains 76 FP additions, 58 FP multiplications,
+ * (or, 18 additions, 0 multiplications, 58 fused multiply/add),
+ * 76 stack variables, 26 constants, and 26 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP968287244, +0.968287244361984016049539446938120421179794516);
+     DK(KP875502302, +0.875502302409147941146295545768755143177842006);
+     DK(KP1_150281458, +1.150281458948006242736771094910906776922003215);
+     DK(KP1_040057143, +1.040057143777729238234261000998465604986476278);
+     DK(KP1_200954543, +1.200954543865330565851538506669526018704025697);
+     DK(KP769338817, +0.769338817572980603471413688209101117038278899);
+     DK(KP600925212, +0.600925212577331548853203544578415991041882762);
+     DK(KP1_033041561, +1.033041561246979445681802577138034271410067244);
+     DK(KP1_007074065, +1.007074065727533254493747707736933954186697125);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP581704778, +0.581704778510515730456870384989698884939833902);
+     DK(KP859542535, +0.859542535098774820163672132761689612766401925);
+     DK(KP166666666, +0.166666666666666666666666666666666666666666667);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP301479260, +0.301479260047709873958013540496673347309208464);
+     DK(KP226109445, +0.226109445035782405468510155372505010481906348);
+     DK(KP686558370, +0.686558370781754340655719594850823015421401653);
+     DK(KP514918778, +0.514918778086315755491789696138117261566051239);
+     DK(KP957805992, +0.957805992594665126462521754605754580515587217);
+     DK(KP522026385, +0.522026385161275033714027226654165028300441940);
+     DK(KP853480001, +0.853480001859823990758994934970528322872359049);
+     DK(KP038632954, +0.038632954644348171955506895830342264440241080);
+     DK(KP612264650, +0.612264650376756543746494474777125408779395514);
+     DK(KP302775637, +0.302775637731994646559610633735247973125648287);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
+	       E TW, T14, TS, TO, T18, T1e, TY, TX, TQ, Tq, TP, Tl, T1d, Tr;
+	       {
+		    E T1, TN, T16, TJ, TV, TG, TU, Tf, T2, T3, Tb, Ti, T4;
+		    {
+			 E Ts, TB, Tx, Ty, Tv, TE, Tt, Tu, Tz, TC;
+			 Ts = Ci[WS(csi, 5)];
+			 Tt = Ci[WS(csi, 2)];
+			 Tu = Ci[WS(csi, 6)];
+			 TB = Ci[WS(csi, 1)];
+			 Tx = Ci[WS(csi, 3)];
+			 Ty = Ci[WS(csi, 4)];
+			 Tv = Tt + Tu;
+			 TE = Tu - Tt;
+			 T1 = Cr[0];
+			 Tz = Tx + Ty;
+			 TC = Tx - Ty;
+			 {
+			      E TL, Tw, T7, Ta;
+			      TL = Ts + Tv;
+			      Tw = FNMS(KP500000000, Tv, Ts);
+			      T7 = Cr[WS(csr, 5)];
+			      {
+				   E TD, TM, TA, TH;
+				   TD = FNMS(KP500000000, TC, TB);
+				   TM = TB + TC;
+				   TA = FMA(KP866025403, Tz, Tw);
+				   TH = FNMS(KP866025403, Tz, Tw);
+				   TN = FMA(KP302775637, TM, TL);
+				   T16 = FNMS(KP302775637, TL, TM);
+				   {
+					E TF, TI, T8, T9;
+					TF = FMA(KP866025403, TE, TD);
+					TI = FNMS(KP866025403, TE, TD);
+					T8 = Cr[WS(csr, 2)];
+					T9 = Cr[WS(csr, 6)];
+					TJ = FNMS(KP612264650, TI, TH);
+					TV = FMA(KP612264650, TH, TI);
+					TG = FNMS(KP038632954, TF, TA);
+					TU = FMA(KP038632954, TA, TF);
+					Tf = T8 - T9;
+					Ta = T8 + T9;
+				   }
+			      }
+			      T2 = Cr[WS(csr, 1)];
+			      T3 = Cr[WS(csr, 3)];
+			      Tb = T7 + Ta;
+			      Ti = FMS(KP500000000, Ta, T7);
+			      T4 = Cr[WS(csr, 4)];
+			 }
+		    }
+		    {
+			 E T17, TK, T5, Te, Tk, Td;
+			 TW = FMA(KP853480001, TV, TU);
+			 T17 = FNMS(KP853480001, TV, TU);
+			 TK = FNMS(KP853480001, TJ, TG);
+			 T14 = FMA(KP853480001, TJ, TG);
+			 T5 = T3 + T4;
+			 Te = T3 - T4;
+			 {
+			      E Tn, Tg, Th, T6;
+			      TS = FNMS(KP522026385, TK, TN);
+			      TO = FMA(KP957805992, TN, TK);
+			      Tn = Te - Tf;
+			      Tg = Te + Tf;
+			      Th = FNMS(KP500000000, T5, T2);
+			      T6 = T2 + T5;
+			      T18 = FNMS(KP522026385, T17, T16);
+			      T1e = FMA(KP957805992, T16, T17);
+			      {
+				   E Tm, Tj, Tc, Tp, To;
+				   Tm = Th + Ti;
+				   Tj = Th - Ti;
+				   Tc = T6 + Tb;
+				   Tp = T6 - Tb;
+				   To = FNMS(KP514918778, Tn, Tm);
+				   TY = FMA(KP686558370, Tm, Tn);
+				   TX = FNMS(KP226109445, Tg, Tj);
+				   Tk = FMA(KP301479260, Tj, Tg);
+				   R0[0] = FMA(KP2_000000000, Tc, T1);
+				   Td = FNMS(KP166666666, Tc, T1);
+				   TQ = FNMS(KP859542535, To, Tp);
+				   Tq = FMA(KP581704778, Tp, To);
+			      }
+			 }
+			 TP = FNMS(KP503537032, Tk, Td);
+			 Tl = FMA(KP1_007074065, Tk, Td);
+		    }
+	       }
+	       T1d = FNMS(KP1_033041561, Tq, Tl);
+	       Tr = FMA(KP1_033041561, Tq, Tl);
+	       {
+		    E T13, TR, T19, TZ;
+		    T13 = FNMS(KP600925212, TQ, TP);
+		    TR = FMA(KP600925212, TQ, TP);
+		    T19 = FMA(KP769338817, TY, TX);
+		    TZ = FNMS(KP769338817, TY, TX);
+		    R0[WS(rs, 4)] = FMA(KP1_200954543, T1e, T1d);
+		    R1[WS(rs, 2)] = FNMS(KP1_200954543, T1e, T1d);
+		    R0[WS(rs, 6)] = FMA(KP1_200954543, TO, Tr);
+		    R1[0] = FNMS(KP1_200954543, TO, Tr);
+		    {
+			 E T1b, T15, T11, TT;
+			 T1b = FNMS(KP1_040057143, T14, T13);
+			 T15 = FMA(KP1_040057143, T14, T13);
+			 T11 = FMA(KP1_150281458, TS, TR);
+			 TT = FNMS(KP1_150281458, TS, TR);
+			 {
+			      E T1c, T1a, T12, T10;
+			      T1c = FMA(KP875502302, T19, T18);
+			      T1a = FNMS(KP875502302, T19, T18);
+			      T12 = FMA(KP968287244, TZ, TW);
+			      T10 = FNMS(KP968287244, TZ, TW);
+			      R1[WS(rs, 5)] = FMA(KP1_150281458, T1c, T1b);
+			      R0[WS(rs, 3)] = FNMS(KP1_150281458, T1c, T1b);
+			      R1[WS(rs, 3)] = FMA(KP1_150281458, T1a, T15);
+			      R0[WS(rs, 1)] = FNMS(KP1_150281458, T1a, T15);
+			      R0[WS(rs, 5)] = FMA(KP1_040057143, T12, T11);
+			      R0[WS(rs, 2)] = FNMS(KP1_040057143, T12, T11);
+			      R1[WS(rs, 4)] = FMA(KP1_040057143, T10, TT);
+			      R1[WS(rs, 1)] = FNMS(KP1_040057143, T10, TT);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 13, "r2cb_13", {18, 0, 58, 0}, &GENUS };
+
+void X(codelet_r2cb_13) (planner *p) {
+     X(kr2c_register) (p, r2cb_13, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 13 -name r2cb_13 -include r2cb.h */
+
+/*
+ * This function contains 76 FP additions, 35 FP multiplications,
+ * (or, 56 additions, 15 multiplications, 20 fused multiply/add),
+ * 56 stack variables, 19 constants, and 26 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_007074065, +1.007074065727533254493747707736933954186697125);
+     DK(KP227708958, +0.227708958111581597949308691735310621069285120);
+     DK(KP531932498, +0.531932498429674575175042127684371897596660533);
+     DK(KP774781170, +0.774781170935234584261351932853525703557550433);
+     DK(KP265966249, +0.265966249214837287587521063842185948798330267);
+     DK(KP516520780, +0.516520780623489722840901288569017135705033622);
+     DK(KP151805972, +0.151805972074387731966205794490207080712856746);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP166666666, +0.166666666666666666666666666666666666666666667);
+     DK(KP600925212, +0.600925212577331548853203544578415991041882762);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP256247671, +0.256247671582936600958684654061725059144125175);
+     DK(KP156891391, +0.156891391051584611046832726756003269660212636);
+     DK(KP348277202, +0.348277202304271810011321589858529485233929352);
+     DK(KP1_150281458, +1.150281458948006242736771094910906776922003215);
+     DK(KP300238635, +0.300238635966332641462884626667381504676006424);
+     DK(KP011599105, +0.011599105605768290721655456654083252189827041);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
+	       E TG, TS, TR, T15, TJ, TT, T1, Tm, Tc, Td, Tg, Tj, Tk, Tn, To;
+	       E Tp;
+	       {
+		    E Ts, Tv, Tw, TE, TC, TB, Tz, TD, TA, TF;
+		    {
+			 E Tt, Tu, Tx, Ty;
+			 Ts = Ci[WS(csi, 1)];
+			 Tt = Ci[WS(csi, 3)];
+			 Tu = Ci[WS(csi, 4)];
+			 Tv = Tt - Tu;
+			 Tw = FMS(KP2_000000000, Ts, Tv);
+			 TE = KP1_732050807 * (Tt + Tu);
+			 TC = Ci[WS(csi, 5)];
+			 Tx = Ci[WS(csi, 6)];
+			 Ty = Ci[WS(csi, 2)];
+			 TB = Tx + Ty;
+			 Tz = KP1_732050807 * (Tx - Ty);
+			 TD = FNMS(KP2_000000000, TC, TB);
+		    }
+		    TA = Tw + Tz;
+		    TF = TD - TE;
+		    TG = FMA(KP011599105, TA, KP300238635 * TF);
+		    TS = FNMS(KP011599105, TF, KP300238635 * TA);
+		    {
+			 E TP, TQ, TH, TI;
+			 TP = Ts + Tv;
+			 TQ = TB + TC;
+			 TR = FNMS(KP348277202, TQ, KP1_150281458 * TP);
+			 T15 = FMA(KP348277202, TP, KP1_150281458 * TQ);
+			 TH = Tw - Tz;
+			 TI = TE + TD;
+			 TJ = FMA(KP156891391, TH, KP256247671 * TI);
+			 TT = FNMS(KP256247671, TH, KP156891391 * TI);
+		    }
+	       }
+	       {
+		    E Tb, Ti, Tf, T6, Th, Te;
+		    T1 = Cr[0];
+		    {
+			 E T7, T8, T9, Ta;
+			 T7 = Cr[WS(csr, 5)];
+			 T8 = Cr[WS(csr, 2)];
+			 T9 = Cr[WS(csr, 6)];
+			 Ta = T8 + T9;
+			 Tb = T7 + Ta;
+			 Ti = FNMS(KP500000000, Ta, T7);
+			 Tf = T8 - T9;
+		    }
+		    {
+			 E T2, T3, T4, T5;
+			 T2 = Cr[WS(csr, 1)];
+			 T3 = Cr[WS(csr, 3)];
+			 T4 = Cr[WS(csr, 4)];
+			 T5 = T3 + T4;
+			 T6 = T2 + T5;
+			 Th = FNMS(KP500000000, T5, T2);
+			 Te = T3 - T4;
+		    }
+		    Tm = KP600925212 * (T6 - Tb);
+		    Tc = T6 + Tb;
+		    Td = FNMS(KP166666666, Tc, T1);
+		    Tg = Te + Tf;
+		    Tj = Th + Ti;
+		    Tk = FMA(KP503537032, Tg, KP151805972 * Tj);
+		    Tn = Th - Ti;
+		    To = Te - Tf;
+		    Tp = FNMS(KP265966249, To, KP516520780 * Tn);
+	       }
+	       R0[0] = FMA(KP2_000000000, Tc, T1);
+	       {
+		    E TK, T1b, TV, T12, T16, T18, TO, T1a, Tr, T17, T11, T13;
+		    {
+			 E TU, T14, TM, TN;
+			 TK = KP1_732050807 * (TG + TJ);
+			 T1b = KP1_732050807 * (TS - TT);
+			 TU = TS + TT;
+			 TV = TR - TU;
+			 T12 = FMA(KP2_000000000, TU, TR);
+			 T14 = TG - TJ;
+			 T16 = FMS(KP2_000000000, T14, T15);
+			 T18 = T14 + T15;
+			 TM = FMA(KP774781170, To, KP531932498 * Tn);
+			 TN = FNMS(KP1_007074065, Tj, KP227708958 * Tg);
+			 TO = TM - TN;
+			 T1a = TM + TN;
+			 {
+			      E Tl, Tq, TZ, T10;
+			      Tl = Td - Tk;
+			      Tq = Tm - Tp;
+			      Tr = Tl - Tq;
+			      T17 = Tq + Tl;
+			      TZ = FMA(KP2_000000000, Tk, Td);
+			      T10 = FMA(KP2_000000000, Tp, Tm);
+			      T11 = TZ - T10;
+			      T13 = T10 + TZ;
+			 }
+		    }
+		    R1[WS(rs, 2)] = T11 - T12;
+		    R0[WS(rs, 6)] = T13 - T16;
+		    R1[0] = T13 + T16;
+		    R0[WS(rs, 4)] = T11 + T12;
+		    {
+			 E TL, TW, T19, T1c;
+			 TL = Tr - TK;
+			 TW = TO - TV;
+			 R1[WS(rs, 3)] = TL - TW;
+			 R0[WS(rs, 1)] = TL + TW;
+			 T19 = T17 - T18;
+			 T1c = T1a + T1b;
+			 R1[WS(rs, 1)] = T19 - T1c;
+			 R1[WS(rs, 4)] = T1c + T19;
+		    }
+		    {
+			 E T1d, T1e, TX, TY;
+			 T1d = T1a - T1b;
+			 T1e = T17 + T18;
+			 R0[WS(rs, 2)] = T1d + T1e;
+			 R0[WS(rs, 5)] = T1e - T1d;
+			 TX = Tr + TK;
+			 TY = TO + TV;
+			 R0[WS(rs, 3)] = TX - TY;
+			 R1[WS(rs, 5)] = TX + TY;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 13, "r2cb_13", {56, 15, 20, 0}, &GENUS };
+
+void X(codelet_r2cb_13) (planner *p) {
+     X(kr2c_register) (p, r2cb_13, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:08 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 14 -name r2cb_14 -include r2cb.h */
+
+/*
+ * This function contains 62 FP additions, 44 FP multiplications,
+ * (or, 18 additions, 0 multiplications, 44 fused multiply/add),
+ * 58 stack variables, 7 constants, and 28 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
+	       E Te, TO, TT, TG, TJ, TD, TR, TE;
+	       {
+		    E T3, TK, To, TM, Tu, TL, Tr, TS, TA, TN, TX, TF, Tv, T7, Tf;
+		    E T6, Th, Tc, T8, T1, T2;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 7)];
+		    {
+			 E Ts, Tt, Tp, Tq, Tm, Tn;
+			 Tm = Ci[WS(csi, 4)];
+			 Tn = Ci[WS(csi, 3)];
+			 Ts = Ci[WS(csi, 6)];
+			 Te = T1 + T2;
+			 T3 = T1 - T2;
+			 TK = Tm + Tn;
+			 To = Tm - Tn;
+			 Tt = Ci[WS(csi, 1)];
+			 Tp = Ci[WS(csi, 2)];
+			 Tq = Ci[WS(csi, 5)];
+			 {
+			      E T4, T5, Ta, Tb;
+			      T4 = Cr[WS(csr, 2)];
+			      TM = Ts + Tt;
+			      Tu = Ts - Tt;
+			      TL = Tp + Tq;
+			      Tr = Tp - Tq;
+			      TS = FMA(KP554958132, TK, TM);
+			      TA = FMA(KP554958132, To, Tu);
+			      TN = FMA(KP554958132, TM, TL);
+			      TX = FNMS(KP554958132, TL, TK);
+			      TF = FNMS(KP554958132, Tr, To);
+			      Tv = FMA(KP554958132, Tu, Tr);
+			      T5 = Cr[WS(csr, 5)];
+			      Ta = Cr[WS(csr, 6)];
+			      Tb = Cr[WS(csr, 1)];
+			      T7 = Cr[WS(csr, 4)];
+			      Tf = T4 + T5;
+			      T6 = T4 - T5;
+			      Th = Ta + Tb;
+			      Tc = Ta - Tb;
+			      T8 = Cr[WS(csr, 3)];
+			 }
+		    }
+		    {
+			 E Tw, Tx, TP, Tg, T9, TY, TC, TI, TQ;
+			 Tw = FMA(KP801937735, Tv, To);
+			 Tx = FNMS(KP356895867, Tf, Th);
+			 TP = FNMS(KP356895867, T6, Tc);
+			 Tg = T7 + T8;
+			 T9 = T7 - T8;
+			 TY = FNMS(KP801937735, TX, TM);
+			 {
+			      E TB, TH, TV, Ty, Tl, Ti, TW, Tz;
+			      TB = FNMS(KP801937735, TA, Tr);
+			      Ti = Tf + Tg + Th;
+			      TC = FNMS(KP356895867, Th, Tg);
+			      {
+				   E Tj, Td, TU, Tk;
+				   Tj = FNMS(KP356895867, Tg, Tf);
+				   Td = T6 + T9 + Tc;
+				   TH = FNMS(KP356895867, T9, T6);
+				   TU = FNMS(KP356895867, Tc, T9);
+				   R0[0] = FMA(KP2_000000000, Ti, Te);
+				   Tk = FNMS(KP692021471, Tj, Th);
+				   R1[WS(rs, 3)] = FMA(KP2_000000000, Td, T3);
+				   TV = FNMS(KP692021471, TU, T6);
+				   Ty = FNMS(KP692021471, Tx, Tg);
+				   Tl = FNMS(KP1_801937735, Tk, Te);
+			      }
+			      TO = FMA(KP801937735, TN, TK);
+			      TW = FNMS(KP1_801937735, TV, T3);
+			      Tz = FNMS(KP1_801937735, Ty, Te);
+			      R0[WS(rs, 3)] = FMA(KP1_949855824, Tw, Tl);
+			      R0[WS(rs, 4)] = FNMS(KP1_949855824, Tw, Tl);
+			      R1[WS(rs, 5)] = FMA(KP1_949855824, TY, TW);
+			      R1[WS(rs, 1)] = FNMS(KP1_949855824, TY, TW);
+			      R0[WS(rs, 6)] = FMA(KP1_949855824, TB, Tz);
+			      R0[WS(rs, 1)] = FNMS(KP1_949855824, TB, Tz);
+			      TI = FNMS(KP692021471, TH, Tc);
+			 }
+			 TT = FNMS(KP801937735, TS, TL);
+			 TQ = FNMS(KP692021471, TP, T9);
+			 TG = FNMS(KP801937735, TF, Tu);
+			 TJ = FNMS(KP1_801937735, TI, T3);
+			 TD = FNMS(KP692021471, TC, Tf);
+			 TR = FNMS(KP1_801937735, TQ, T3);
+		    }
+	       }
+	       R1[WS(rs, 6)] = FMA(KP1_949855824, TO, TJ);
+	       R1[0] = FNMS(KP1_949855824, TO, TJ);
+	       TE = FNMS(KP1_801937735, TD, Te);
+	       R1[WS(rs, 2)] = FMA(KP1_949855824, TT, TR);
+	       R1[WS(rs, 4)] = FNMS(KP1_949855824, TT, TR);
+	       R0[WS(rs, 2)] = FMA(KP1_949855824, TG, TE);
+	       R0[WS(rs, 5)] = FNMS(KP1_949855824, TG, TE);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 14, "r2cb_14", {18, 0, 44, 0}, &GENUS };
+
+void X(codelet_r2cb_14) (planner *p) {
+     X(kr2c_register) (p, r2cb_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 14 -name r2cb_14 -include r2cb.h */
+
+/*
+ * This function contains 62 FP additions, 38 FP multiplications,
+ * (or, 36 additions, 12 multiplications, 26 fused multiply/add),
+ * 28 stack variables, 7 constants, and 28 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP445041867, +0.445041867912628808577805128993589518932711138);
+     DK(KP1_246979603, +1.246979603717467061050009768008479621264549462);
+     DK(KP867767478, +0.867767478235116240951536665696717509219981456);
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_563662964, +1.563662964936059617416889053348115500464669037);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
+	       E T3, Td, T6, Te, Tq, Tz, Tn, Ty, Tc, Tg, Tk, Tx, T9, Tf, T1;
+	       E T2;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 7)];
+	       T3 = T1 - T2;
+	       Td = T1 + T2;
+	       {
+		    E T4, T5, To, Tp;
+		    T4 = Cr[WS(csr, 2)];
+		    T5 = Cr[WS(csr, 5)];
+		    T6 = T4 - T5;
+		    Te = T4 + T5;
+		    To = Ci[WS(csi, 2)];
+		    Tp = Ci[WS(csi, 5)];
+		    Tq = To - Tp;
+		    Tz = To + Tp;
+	       }
+	       {
+		    E Tl, Tm, Ta, Tb;
+		    Tl = Ci[WS(csi, 6)];
+		    Tm = Ci[WS(csi, 1)];
+		    Tn = Tl - Tm;
+		    Ty = Tl + Tm;
+		    Ta = Cr[WS(csr, 6)];
+		    Tb = Cr[WS(csr, 1)];
+		    Tc = Ta - Tb;
+		    Tg = Ta + Tb;
+	       }
+	       {
+		    E Ti, Tj, T7, T8;
+		    Ti = Ci[WS(csi, 4)];
+		    Tj = Ci[WS(csi, 3)];
+		    Tk = Ti - Tj;
+		    Tx = Ti + Tj;
+		    T7 = Cr[WS(csr, 4)];
+		    T8 = Cr[WS(csr, 3)];
+		    T9 = T7 - T8;
+		    Tf = T7 + T8;
+	       }
+	       R1[WS(rs, 3)] = FMA(KP2_000000000, T6 + T9 + Tc, T3);
+	       R0[0] = FMA(KP2_000000000, Te + Tf + Tg, Td);
+	       {
+		    E Tr, Th, TE, TD;
+		    Tr = FNMS(KP1_949855824, Tn, KP1_563662964 * Tk) - (KP867767478 * Tq);
+		    Th = FMA(KP1_246979603, Tf, Td) + FNMA(KP445041867, Tg, KP1_801937735 * Te);
+		    R0[WS(rs, 2)] = Th - Tr;
+		    R0[WS(rs, 5)] = Th + Tr;
+		    TE = FMA(KP867767478, Tx, KP1_563662964 * Ty) - (KP1_949855824 * Tz);
+		    TD = FMA(KP1_246979603, Tc, T3) + FNMA(KP1_801937735, T9, KP445041867 * T6);
+		    R1[WS(rs, 2)] = TD - TE;
+		    R1[WS(rs, 4)] = TD + TE;
+	       }
+	       {
+		    E Tt, Ts, TA, Tw;
+		    Tt = FMA(KP867767478, Tk, KP1_563662964 * Tn) - (KP1_949855824 * Tq);
+		    Ts = FMA(KP1_246979603, Tg, Td) + FNMA(KP1_801937735, Tf, KP445041867 * Te);
+		    R0[WS(rs, 6)] = Ts - Tt;
+		    R0[WS(rs, 1)] = Ts + Tt;
+		    TA = FNMS(KP1_949855824, Ty, KP1_563662964 * Tx) - (KP867767478 * Tz);
+		    Tw = FMA(KP1_246979603, T9, T3) + FNMA(KP445041867, Tc, KP1_801937735 * T6);
+		    R1[WS(rs, 5)] = Tw - TA;
+		    R1[WS(rs, 1)] = Tw + TA;
+	       }
+	       {
+		    E TC, TB, Tv, Tu;
+		    TC = FMA(KP1_563662964, Tz, KP1_949855824 * Tx) + (KP867767478 * Ty);
+		    TB = FMA(KP1_246979603, T6, T3) + FNMA(KP1_801937735, Tc, KP445041867 * T9);
+		    R1[0] = TB - TC;
+		    R1[WS(rs, 6)] = TB + TC;
+		    Tv = FMA(KP1_563662964, Tq, KP1_949855824 * Tk) + (KP867767478 * Tn);
+		    Tu = FMA(KP1_246979603, Te, Td) + FNMA(KP1_801937735, Tg, KP445041867 * Tf);
+		    R0[WS(rs, 4)] = Tu - Tv;
+		    R0[WS(rs, 3)] = Tu + Tv;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 14, "r2cb_14", {36, 12, 26, 0}, &GENUS };
+
+void X(codelet_r2cb_14) (planner *p) {
+     X(kr2c_register) (p, r2cb_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:08 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cb_15 -include r2cb.h */
+
+/*
+ * This function contains 64 FP additions, 43 FP multiplications,
+ * (or, 21 additions, 0 multiplications, 43 fused multiply/add),
+ * 54 stack variables, 9 constants, and 30 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E TL, Tz, TM, TK;
+	       {
+		    E T3, Th, Tt, TD, TI, TH, TY, TC, TZ, Tu, Tm, Tv, Tr, Te, TW;
+		    E Tg, T1, T2, T12, T10, TV;
+		    Tg = Ci[WS(csi, 5)];
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 5)];
+		    {
+			 E T4, TA, T9, TF, T7, Tj, Tc, Tk, TG, Tq, Tf, Tl, TB;
+			 T4 = Cr[WS(csr, 3)];
+			 TA = Ci[WS(csi, 3)];
+			 T9 = Cr[WS(csr, 6)];
+			 Tf = T1 - T2;
+			 T3 = FMA(KP2_000000000, T2, T1);
+			 TF = Ci[WS(csi, 6)];
+			 {
+			      E Ta, Tb, T5, T6, To, Tp;
+			      T5 = Cr[WS(csr, 7)];
+			      T6 = Cr[WS(csr, 2)];
+			      Th = FMA(KP1_732050807, Tg, Tf);
+			      Tt = FNMS(KP1_732050807, Tg, Tf);
+			      Ta = Cr[WS(csr, 4)];
+			      TD = T5 - T6;
+			      T7 = T5 + T6;
+			      Tb = Cr[WS(csr, 1)];
+			      To = Ci[WS(csi, 4)];
+			      Tp = Ci[WS(csi, 1)];
+			      Tj = Ci[WS(csi, 7)];
+			      Tc = Ta + Tb;
+			      TI = Ta - Tb;
+			      Tk = Ci[WS(csi, 2)];
+			      TG = Tp - To;
+			      Tq = To + Tp;
+			 }
+			 Tl = Tj - Tk;
+			 TB = Tj + Tk;
+			 TH = FNMS(KP500000000, TG, TF);
+			 TY = TG + TF;
+			 TC = FMA(KP500000000, TB, TA);
+			 TZ = TA - TB;
+			 {
+			      E Ti, T8, Td, Tn;
+			      Ti = FNMS(KP2_000000000, T4, T7);
+			      T8 = T4 + T7;
+			      Td = T9 + Tc;
+			      Tn = FNMS(KP2_000000000, T9, Tc);
+			      Tu = FNMS(KP1_732050807, Tl, Ti);
+			      Tm = FMA(KP1_732050807, Tl, Ti);
+			      Tv = FNMS(KP1_732050807, Tq, Tn);
+			      Tr = FMA(KP1_732050807, Tq, Tn);
+			      Te = T8 + Td;
+			      TW = T8 - Td;
+			 }
+		    }
+		    T12 = FMA(KP618033988, TY, TZ);
+		    T10 = FNMS(KP618033988, TZ, TY);
+		    TV = FNMS(KP500000000, Te, T3);
+		    R0[0] = FMA(KP2_000000000, Te, T3);
+		    {
+			 E TJ, TE, TT, TP, TU, TS, Ty, Tw, Tx;
+			 {
+			      E TO, Ts, TQ, TN, TR, T11, TX;
+			      TO = Tr - Tm;
+			      Ts = Tm + Tr;
+			      T11 = FMA(KP1_118033988, TW, TV);
+			      TX = FNMS(KP1_118033988, TW, TV);
+			      TQ = FNMS(KP866025403, TI, TH);
+			      TJ = FMA(KP866025403, TI, TH);
+			      TN = FMA(KP250000000, Ts, Th);
+			      R0[WS(rs, 3)] = FNMS(KP1_902113032, T12, T11);
+			      R1[WS(rs, 4)] = FMA(KP1_902113032, T12, T11);
+			      R0[WS(rs, 6)] = FMA(KP1_902113032, T10, TX);
+			      R1[WS(rs, 1)] = FNMS(KP1_902113032, T10, TX);
+			      TR = FNMS(KP866025403, TD, TC);
+			      TE = FMA(KP866025403, TD, TC);
+			      R1[WS(rs, 2)] = Th - Ts;
+			      TT = FMA(KP559016994, TO, TN);
+			      TP = FNMS(KP559016994, TO, TN);
+			      TU = FMA(KP618033988, TQ, TR);
+			      TS = FNMS(KP618033988, TR, TQ);
+			 }
+			 Ty = Tv - Tu;
+			 Tw = Tu + Tv;
+			 R0[WS(rs, 7)] = FMA(KP1_902113032, TU, TT);
+			 R1[WS(rs, 5)] = FNMS(KP1_902113032, TU, TT);
+			 R0[WS(rs, 1)] = FMA(KP1_902113032, TS, TP);
+			 R0[WS(rs, 4)] = FNMS(KP1_902113032, TS, TP);
+			 Tx = FMA(KP250000000, Tw, Tt);
+			 R0[WS(rs, 5)] = Tt - Tw;
+			 TL = FNMS(KP559016994, Ty, Tx);
+			 Tz = FMA(KP559016994, Ty, Tx);
+			 TM = FNMS(KP618033988, TE, TJ);
+			 TK = FMA(KP618033988, TJ, TE);
+		    }
+	       }
+	       R1[WS(rs, 3)] = FMA(KP1_902113032, TM, TL);
+	       R1[WS(rs, 6)] = FNMS(KP1_902113032, TM, TL);
+	       R0[WS(rs, 2)] = FMA(KP1_902113032, TK, Tz);
+	       R1[0] = FNMS(KP1_902113032, TK, Tz);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cb_15", {21, 0, 43, 0}, &GENUS };
+
+void X(codelet_r2cb_15) (planner *p) {
+     X(kr2c_register) (p, r2cb_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cb_15 -include r2cb.h */
+
+/*
+ * This function contains 64 FP additions, 31 FP multiplications,
+ * (or, 47 additions, 14 multiplications, 17 fused multiply/add),
+ * 44 stack variables, 7 constants, and 30 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E T3, Tu, Ti, TB, TZ, T10, TE, TG, TJ, Tn, Tv, Ts, Tw, T8, Td;
+	       E Te;
+	       {
+		    E Th, T1, T2, Tf, Tg;
+		    Tg = Ci[WS(csi, 5)];
+		    Th = KP1_732050807 * Tg;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 5)];
+		    Tf = T1 - T2;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    Tu = Tf - Th;
+		    Ti = Tf + Th;
+	       }
+	       {
+		    E T4, TD, T9, TI, T5, T6, T7, Ta, Tb, Tc, Tr, TH, Tm, TC, Tj;
+		    E To;
+		    T4 = Cr[WS(csr, 3)];
+		    TD = Ci[WS(csi, 3)];
+		    T9 = Cr[WS(csr, 6)];
+		    TI = Ci[WS(csi, 6)];
+		    T5 = Cr[WS(csr, 7)];
+		    T6 = Cr[WS(csr, 2)];
+		    T7 = T5 + T6;
+		    Ta = Cr[WS(csr, 4)];
+		    Tb = Cr[WS(csr, 1)];
+		    Tc = Ta + Tb;
+		    {
+			 E Tp, Tq, Tk, Tl;
+			 Tp = Ci[WS(csi, 4)];
+			 Tq = Ci[WS(csi, 1)];
+			 Tr = KP866025403 * (Tp + Tq);
+			 TH = Tp - Tq;
+			 Tk = Ci[WS(csi, 7)];
+			 Tl = Ci[WS(csi, 2)];
+			 Tm = KP866025403 * (Tk - Tl);
+			 TC = Tk + Tl;
+		    }
+		    TB = KP866025403 * (T5 - T6);
+		    TZ = TD - TC;
+		    T10 = TI - TH;
+		    TE = FMA(KP500000000, TC, TD);
+		    TG = KP866025403 * (Ta - Tb);
+		    TJ = FMA(KP500000000, TH, TI);
+		    Tj = FNMS(KP500000000, T7, T4);
+		    Tn = Tj - Tm;
+		    Tv = Tj + Tm;
+		    To = FNMS(KP500000000, Tc, T9);
+		    Ts = To - Tr;
+		    Tw = To + Tr;
+		    T8 = T4 + T7;
+		    Td = T9 + Tc;
+		    Te = T8 + Td;
+	       }
+	       R0[0] = FMA(KP2_000000000, Te, T3);
+	       {
+		    E T11, T13, TY, T12, TW, TX;
+		    T11 = FNMS(KP1_902113032, T10, KP1_175570504 * TZ);
+		    T13 = FMA(KP1_902113032, TZ, KP1_175570504 * T10);
+		    TW = FNMS(KP500000000, Te, T3);
+		    TX = KP1_118033988 * (T8 - Td);
+		    TY = TW - TX;
+		    T12 = TX + TW;
+		    R0[WS(rs, 6)] = TY - T11;
+		    R1[WS(rs, 4)] = T12 + T13;
+		    R1[WS(rs, 1)] = TY + T11;
+		    R0[WS(rs, 3)] = T12 - T13;
+	       }
+	       {
+		    E TP, Tt, TO, TT, TV, TR, TS, TU, TQ;
+		    TP = KP1_118033988 * (Tn - Ts);
+		    Tt = Tn + Ts;
+		    TO = FNMS(KP500000000, Tt, Ti);
+		    TR = TE - TB;
+		    TS = TJ - TG;
+		    TT = FNMS(KP1_902113032, TS, KP1_175570504 * TR);
+		    TV = FMA(KP1_902113032, TR, KP1_175570504 * TS);
+		    R1[WS(rs, 2)] = FMA(KP2_000000000, Tt, Ti);
+		    TU = TP + TO;
+		    R1[WS(rs, 5)] = TU - TV;
+		    R0[WS(rs, 7)] = TU + TV;
+		    TQ = TO - TP;
+		    R0[WS(rs, 1)] = TQ - TT;
+		    R0[WS(rs, 4)] = TQ + TT;
+	       }
+	       {
+		    E Tz, Tx, Ty, TL, TN, TF, TK, TM, TA;
+		    Tz = KP1_118033988 * (Tv - Tw);
+		    Tx = Tv + Tw;
+		    Ty = FNMS(KP500000000, Tx, Tu);
+		    TF = TB + TE;
+		    TK = TG + TJ;
+		    TL = FNMS(KP1_902113032, TK, KP1_175570504 * TF);
+		    TN = FMA(KP1_902113032, TF, KP1_175570504 * TK);
+		    R0[WS(rs, 5)] = FMA(KP2_000000000, Tx, Tu);
+		    TM = Tz + Ty;
+		    R1[0] = TM - TN;
+		    R0[WS(rs, 2)] = TM + TN;
+		    TA = Ty - Tz;
+		    R1[WS(rs, 3)] = TA - TL;
+		    R1[WS(rs, 6)] = TA + TL;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cb_15", {47, 14, 17, 0}, &GENUS };
+
+void X(codelet_r2cb_15) (planner *p) {
+     X(kr2c_register) (p, r2cb_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:08 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include r2cb.h */
+
+/*
+ * This function contains 58 FP additions, 32 FP multiplications,
+ * (or, 26 additions, 0 multiplications, 32 fused multiply/add),
+ * 47 stack variables, 4 constants, and 32 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E TN, TS, TF, TI;
+	       {
+		    E T8, TD, Tj, TL, T5, TM, TE, To, Td, Tq, Tc, TP, Ty, Te, Tr;
+		    E Ts;
+		    {
+			 E T4, Ti, T1, T2;
+			 T4 = Cr[WS(csr, 4)];
+			 Ti = Ci[WS(csi, 4)];
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 8)];
+			 {
+			      E Tk, Tn, T6, T7;
+			      T6 = Cr[WS(csr, 2)];
+			      T7 = Cr[WS(csr, 6)];
+			      {
+				   E Tl, Th, T3, Tm;
+				   Tl = Ci[WS(csi, 2)];
+				   Th = T1 - T2;
+				   T3 = T1 + T2;
+				   Tk = T6 - T7;
+				   T8 = T6 + T7;
+				   Tm = Ci[WS(csi, 6)];
+				   TD = FMA(KP2_000000000, Ti, Th);
+				   Tj = FNMS(KP2_000000000, Ti, Th);
+				   TL = FNMS(KP2_000000000, T4, T3);
+				   T5 = FMA(KP2_000000000, T4, T3);
+				   Tn = Tl + Tm;
+				   TM = Tl - Tm;
+			      }
+			      {
+				   E Ta, Tb, Tw, Tx;
+				   Ta = Cr[WS(csr, 1)];
+				   TE = Tk + Tn;
+				   To = Tk - Tn;
+				   Tb = Cr[WS(csr, 7)];
+				   Tw = Ci[WS(csi, 1)];
+				   Tx = Ci[WS(csi, 7)];
+				   Td = Cr[WS(csr, 5)];
+				   Tq = Ta - Tb;
+				   Tc = Ta + Tb;
+				   TP = Tw - Tx;
+				   Ty = Tw + Tx;
+				   Te = Cr[WS(csr, 3)];
+				   Tr = Ci[WS(csi, 5)];
+				   Ts = Ci[WS(csi, 3)];
+			      }
+			 }
+		    }
+		    {
+			 E TV, TG, TW, TH, TB, Tp, TA, TC, TJ, TK;
+			 {
+			      E T9, Tz, Tg, Tu, TT, TU, TO, TR;
+			      TV = FNMS(KP2_000000000, T8, T5);
+			      T9 = FMA(KP2_000000000, T8, T5);
+			      {
+				   E Tv, Tf, TQ, Tt;
+				   Tv = Td - Te;
+				   Tf = Td + Te;
+				   TQ = Tr - Ts;
+				   Tt = Tr + Ts;
+				   TG = Ty - Tv;
+				   Tz = Tv + Ty;
+				   TO = Tc - Tf;
+				   Tg = Tc + Tf;
+				   TW = TQ + TP;
+				   TR = TP - TQ;
+				   TH = Tq + Tt;
+				   Tu = Tq - Tt;
+			      }
+			      TN = FNMS(KP2_000000000, TM, TL);
+			      TT = FMA(KP2_000000000, TM, TL);
+			      TU = TO + TR;
+			      TS = TO - TR;
+			      R0[0] = FMA(KP2_000000000, Tg, T9);
+			      R0[WS(rs, 4)] = FNMS(KP2_000000000, Tg, T9);
+			      R0[WS(rs, 7)] = FMA(KP1_414213562, TU, TT);
+			      R0[WS(rs, 3)] = FNMS(KP1_414213562, TU, TT);
+			      TB = FNMS(KP1_414213562, To, Tj);
+			      Tp = FMA(KP1_414213562, To, Tj);
+			      TA = FNMS(KP414213562, Tz, Tu);
+			      TC = FMA(KP414213562, Tu, Tz);
+			 }
+			 R0[WS(rs, 6)] = FMA(KP2_000000000, TW, TV);
+			 R0[WS(rs, 2)] = FNMS(KP2_000000000, TW, TV);
+			 R1[0] = FMA(KP1_847759065, TA, Tp);
+			 R1[WS(rs, 4)] = FNMS(KP1_847759065, TA, Tp);
+			 TF = FNMS(KP1_414213562, TE, TD);
+			 TJ = FMA(KP1_414213562, TE, TD);
+			 TK = FMA(KP414213562, TG, TH);
+			 TI = FNMS(KP414213562, TH, TG);
+			 R1[WS(rs, 6)] = FMA(KP1_847759065, TC, TB);
+			 R1[WS(rs, 2)] = FNMS(KP1_847759065, TC, TB);
+			 R1[WS(rs, 7)] = FMA(KP1_847759065, TK, TJ);
+			 R1[WS(rs, 3)] = FNMS(KP1_847759065, TK, TJ);
+		    }
+	       }
+	       R0[WS(rs, 1)] = FMA(KP1_414213562, TS, TN);
+	       R0[WS(rs, 5)] = FNMS(KP1_414213562, TS, TN);
+	       R1[WS(rs, 5)] = FMA(KP1_847759065, TI, TF);
+	       R1[WS(rs, 1)] = FNMS(KP1_847759065, TI, TF);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cb_16", {26, 0, 32, 0}, &GENUS };
+
+void X(codelet_r2cb_16) (planner *p) {
+     X(kr2c_register) (p, r2cb_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include r2cb.h */
+
+/*
+ * This function contains 58 FP additions, 18 FP multiplications,
+ * (or, 54 additions, 14 multiplications, 4 fused multiply/add),
+ * 31 stack variables, 4 constants, and 32 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E T9, TS, Tl, TG, T6, TR, Ti, TD, Td, Tq, Tg, Tt, Tn, Tu, TV;
+	       E TU, TN, TK;
+	       {
+		    E T7, T8, TE, Tj, Tk, TF;
+		    T7 = Cr[WS(csr, 2)];
+		    T8 = Cr[WS(csr, 6)];
+		    TE = T7 - T8;
+		    Tj = Ci[WS(csi, 2)];
+		    Tk = Ci[WS(csi, 6)];
+		    TF = Tj + Tk;
+		    T9 = KP2_000000000 * (T7 + T8);
+		    TS = KP1_414213562 * (TE + TF);
+		    Tl = KP2_000000000 * (Tj - Tk);
+		    TG = KP1_414213562 * (TE - TF);
+	       }
+	       {
+		    E T5, TC, T3, TA;
+		    {
+			 E T4, TB, T1, T2;
+			 T4 = Cr[WS(csr, 4)];
+			 T5 = KP2_000000000 * T4;
+			 TB = Ci[WS(csi, 4)];
+			 TC = KP2_000000000 * TB;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 8)];
+			 T3 = T1 + T2;
+			 TA = T1 - T2;
+		    }
+		    T6 = T3 + T5;
+		    TR = TA + TC;
+		    Ti = T3 - T5;
+		    TD = TA - TC;
+	       }
+	       {
+		    E TI, TM, TL, TJ;
+		    {
+			 E Tb, Tc, To, Tp;
+			 Tb = Cr[WS(csr, 1)];
+			 Tc = Cr[WS(csr, 7)];
+			 Td = Tb + Tc;
+			 TI = Tb - Tc;
+			 To = Ci[WS(csi, 1)];
+			 Tp = Ci[WS(csi, 7)];
+			 Tq = To - Tp;
+			 TM = To + Tp;
+		    }
+		    {
+			 E Te, Tf, Tr, Ts;
+			 Te = Cr[WS(csr, 5)];
+			 Tf = Cr[WS(csr, 3)];
+			 Tg = Te + Tf;
+			 TL = Te - Tf;
+			 Tr = Ci[WS(csi, 5)];
+			 Ts = Ci[WS(csi, 3)];
+			 Tt = Tr - Ts;
+			 TJ = Tr + Ts;
+		    }
+		    Tn = Td - Tg;
+		    Tu = Tq - Tt;
+		    TV = TM - TL;
+		    TU = TI + TJ;
+		    TN = TL + TM;
+		    TK = TI - TJ;
+	       }
+	       {
+		    E Ta, Th, TT, TW;
+		    Ta = T6 + T9;
+		    Th = KP2_000000000 * (Td + Tg);
+		    R0[WS(rs, 4)] = Ta - Th;
+		    R0[0] = Ta + Th;
+		    TT = TR - TS;
+		    TW = FNMS(KP1_847759065, TV, KP765366864 * TU);
+		    R1[WS(rs, 5)] = TT - TW;
+		    R1[WS(rs, 1)] = TT + TW;
+	       }
+	       {
+		    E TX, TY, Tm, Tv;
+		    TX = TR + TS;
+		    TY = FMA(KP1_847759065, TU, KP765366864 * TV);
+		    R1[WS(rs, 3)] = TX - TY;
+		    R1[WS(rs, 7)] = TX + TY;
+		    Tm = Ti - Tl;
+		    Tv = KP1_414213562 * (Tn - Tu);
+		    R0[WS(rs, 5)] = Tm - Tv;
+		    R0[WS(rs, 1)] = Tm + Tv;
+	       }
+	       {
+		    E Tw, Tx, TH, TO;
+		    Tw = Ti + Tl;
+		    Tx = KP1_414213562 * (Tn + Tu);
+		    R0[WS(rs, 3)] = Tw - Tx;
+		    R0[WS(rs, 7)] = Tw + Tx;
+		    TH = TD + TG;
+		    TO = FNMS(KP765366864, TN, KP1_847759065 * TK);
+		    R1[WS(rs, 4)] = TH - TO;
+		    R1[0] = TH + TO;
+	       }
+	       {
+		    E TP, TQ, Ty, Tz;
+		    TP = TD - TG;
+		    TQ = FMA(KP765366864, TK, KP1_847759065 * TN);
+		    R1[WS(rs, 2)] = TP - TQ;
+		    R1[WS(rs, 6)] = TP + TQ;
+		    Ty = T6 - T9;
+		    Tz = KP2_000000000 * (Tt + Tq);
+		    R0[WS(rs, 2)] = Ty - Tz;
+		    R0[WS(rs, 6)] = Ty + Tz;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cb_16", {54, 14, 4, 0}, &GENUS };
+
+void X(codelet_r2cb_16) (planner *p) {
+     X(kr2c_register) (p, r2cb_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -name r2cb_2 -include r2cb.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       R0[0] = T1 + T2;
+	       R1[0] = T1 - T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cb_2", {2, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cb_2) (planner *p) {
+     X(kr2c_register) (p, r2cb_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -name r2cb_2 -include r2cb.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       R1[0] = T1 - T2;
+	       R0[0] = T1 + T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cb_2", {2, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cb_2) (planner *p) {
+     X(kr2c_register) (p, r2cb_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cb_20 -include r2cb.h */
+
+/*
+ * This function contains 86 FP additions, 44 FP multiplications,
+ * (or, 42 additions, 0 multiplications, 44 fused multiply/add),
+ * 69 stack variables, 5 constants, and 40 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E TY, T1o, T1m, T14, T12, TX, T1n, T1j, TZ, T13;
+	       {
+		    E Tr, TD, Tl, T5, T1a, T1l, T1d, T1k, TT, T10, TO, T11, TE, TF, Tk;
+		    E TI, TC, T1i, To, TG, T16;
+		    {
+			 E T4, Tq, T1, T2;
+			 T4 = Cr[WS(csr, 5)];
+			 Tq = Ci[WS(csi, 5)];
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 10)];
+			 {
+			      E Ts, T8, T19, TR, T18, Tb, TS, Tv, Tx, Tf, Ty, T1c, TM, T1b, Ti;
+			      E Tz, Tt, Tu, TN, TA;
+			      {
+				   E TP, TQ, T9, Ta;
+				   {
+					E T6, T7, Tp, T3;
+					T6 = Cr[WS(csr, 4)];
+					T7 = Cr[WS(csr, 6)];
+					TP = Ci[WS(csi, 4)];
+					Tp = T1 - T2;
+					T3 = T1 + T2;
+					Ts = T6 - T7;
+					T8 = T6 + T7;
+					Tr = FMA(KP2_000000000, Tq, Tp);
+					TD = FNMS(KP2_000000000, Tq, Tp);
+					Tl = FMA(KP2_000000000, T4, T3);
+					T5 = FNMS(KP2_000000000, T4, T3);
+					TQ = Ci[WS(csi, 6)];
+				   }
+				   T9 = Cr[WS(csr, 9)];
+				   Ta = Cr[WS(csr, 1)];
+				   Tt = Ci[WS(csi, 9)];
+				   T19 = TP + TQ;
+				   TR = TP - TQ;
+				   T18 = T9 - Ta;
+				   Tb = T9 + Ta;
+				   Tu = Ci[WS(csi, 1)];
+			      }
+			      {
+				   E TK, TL, Td, Te, Tg, Th;
+				   Td = Cr[WS(csr, 8)];
+				   Te = Cr[WS(csr, 2)];
+				   TK = Ci[WS(csi, 8)];
+				   TS = Tt - Tu;
+				   Tv = Tt + Tu;
+				   Tx = Td - Te;
+				   Tf = Td + Te;
+				   TL = Ci[WS(csi, 2)];
+				   Tg = Cr[WS(csr, 7)];
+				   Th = Cr[WS(csr, 3)];
+				   Ty = Ci[WS(csi, 7)];
+				   T1c = TK + TL;
+				   TM = TK - TL;
+				   T1b = Tg - Th;
+				   Ti = Tg + Th;
+				   Tz = Ci[WS(csi, 3)];
+			      }
+			      T1a = T18 + T19;
+			      T1l = T19 - T18;
+			      T1d = T1b + T1c;
+			      T1k = T1c - T1b;
+			      TT = TR - TS;
+			      T10 = TS + TR;
+			      TN = Tz - Ty;
+			      TA = Ty + Tz;
+			      TO = TM - TN;
+			      T11 = TN + TM;
+			      {
+				   E Tm, Tc, Tj, Tn, Tw, TB;
+				   Tm = T8 + Tb;
+				   Tc = T8 - Tb;
+				   Tj = Tf - Ti;
+				   Tn = Tf + Ti;
+				   TE = Ts - Tv;
+				   Tw = Ts + Tv;
+				   TB = Tx - TA;
+				   TF = Tx + TA;
+				   Tk = Tc + Tj;
+				   TI = Tc - Tj;
+				   TC = Tw + TB;
+				   T1i = Tw - TB;
+				   TY = Tm - Tn;
+				   To = Tm + Tn;
+			      }
+			 }
+		    }
+		    R0[WS(rs, 5)] = FMA(KP2_000000000, Tk, T5);
+		    R1[WS(rs, 7)] = FMA(KP2_000000000, TC, Tr);
+		    TG = TE + TF;
+		    T16 = TE - TF;
+		    R0[0] = FMA(KP2_000000000, To, Tl);
+		    {
+			 E TU, TW, T1g, T1e, T15, TV, TJ, TH, T1h, T1f, T17;
+			 TU = FNMS(KP618033988, TT, TO);
+			 TW = FMA(KP618033988, TO, TT);
+			 R1[WS(rs, 2)] = FMA(KP2_000000000, TG, TD);
+			 TH = FNMS(KP500000000, Tk, T5);
+			 T1g = FNMS(KP618033988, T1a, T1d);
+			 T1e = FMA(KP618033988, T1d, T1a);
+			 T15 = FNMS(KP500000000, TG, TD);
+			 TV = FMA(KP1_118033988, TI, TH);
+			 TJ = FNMS(KP1_118033988, TI, TH);
+			 T1o = FMA(KP618033988, T1k, T1l);
+			 T1m = FNMS(KP618033988, T1l, T1k);
+			 R0[WS(rs, 3)] = FNMS(KP1_902113032, TW, TV);
+			 R0[WS(rs, 7)] = FMA(KP1_902113032, TW, TV);
+			 R0[WS(rs, 1)] = FMA(KP1_902113032, TU, TJ);
+			 R0[WS(rs, 9)] = FNMS(KP1_902113032, TU, TJ);
+			 T1f = FNMS(KP1_118033988, T16, T15);
+			 T17 = FMA(KP1_118033988, T16, T15);
+			 T1h = FNMS(KP500000000, TC, Tr);
+			 R1[WS(rs, 6)] = FNMS(KP1_902113032, T1g, T1f);
+			 R1[WS(rs, 8)] = FMA(KP1_902113032, T1g, T1f);
+			 R1[WS(rs, 4)] = FMA(KP1_902113032, T1e, T17);
+			 R1[0] = FNMS(KP1_902113032, T1e, T17);
+			 T14 = FNMS(KP618033988, T10, T11);
+			 T12 = FMA(KP618033988, T11, T10);
+			 TX = FNMS(KP500000000, To, Tl);
+			 T1n = FMA(KP1_118033988, T1i, T1h);
+			 T1j = FNMS(KP1_118033988, T1i, T1h);
+		    }
+	       }
+	       R1[WS(rs, 5)] = FNMS(KP1_902113032, T1o, T1n);
+	       R1[WS(rs, 9)] = FMA(KP1_902113032, T1o, T1n);
+	       R1[WS(rs, 3)] = FMA(KP1_902113032, T1m, T1j);
+	       R1[WS(rs, 1)] = FNMS(KP1_902113032, T1m, T1j);
+	       TZ = FMA(KP1_118033988, TY, TX);
+	       T13 = FNMS(KP1_118033988, TY, TX);
+	       R0[WS(rs, 4)] = FNMS(KP1_902113032, T14, T13);
+	       R0[WS(rs, 6)] = FMA(KP1_902113032, T14, T13);
+	       R0[WS(rs, 2)] = FMA(KP1_902113032, T12, TZ);
+	       R0[WS(rs, 8)] = FNMS(KP1_902113032, T12, TZ);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cb_20", {42, 0, 44, 0}, &GENUS };
+
+void X(codelet_r2cb_20) (planner *p) {
+     X(kr2c_register) (p, r2cb_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cb_20 -include r2cb.h */
+
+/*
+ * This function contains 86 FP additions, 30 FP multiplications,
+ * (or, 70 additions, 14 multiplications, 16 fused multiply/add),
+ * 50 stack variables, 5 constants, and 40 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E T6, TF, Tm, Tt, TQ, T1n, T1f, T12, T1m, TV, T13, T1c, Td, Tk, Tl;
+	       E Ty, TD, TE, Tn, To, Tp, TG, TH, TI;
+	       {
+		    E T5, Ts, T3, Tq;
+		    {
+			 E T4, Tr, T1, T2;
+			 T4 = Cr[WS(csr, 5)];
+			 T5 = KP2_000000000 * T4;
+			 Tr = Ci[WS(csi, 5)];
+			 Ts = KP2_000000000 * Tr;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 10)];
+			 T3 = T1 + T2;
+			 Tq = T1 - T2;
+		    }
+		    T6 = T3 - T5;
+		    TF = Tq - Ts;
+		    Tm = T3 + T5;
+		    Tt = Tq + Ts;
+	       }
+	       {
+		    E T9, Tu, TO, T1b, Tc, T1a, Tx, TP, Tg, Tz, TT, T1e, Tj, T1d, TC;
+		    E TU;
+		    {
+			 E T7, T8, TM, TN;
+			 T7 = Cr[WS(csr, 4)];
+			 T8 = Cr[WS(csr, 6)];
+			 T9 = T7 + T8;
+			 Tu = T7 - T8;
+			 TM = Ci[WS(csi, 4)];
+			 TN = Ci[WS(csi, 6)];
+			 TO = TM - TN;
+			 T1b = TM + TN;
+		    }
+		    {
+			 E Ta, Tb, Tv, Tw;
+			 Ta = Cr[WS(csr, 9)];
+			 Tb = Cr[WS(csr, 1)];
+			 Tc = Ta + Tb;
+			 T1a = Ta - Tb;
+			 Tv = Ci[WS(csi, 9)];
+			 Tw = Ci[WS(csi, 1)];
+			 Tx = Tv + Tw;
+			 TP = Tv - Tw;
+		    }
+		    {
+			 E Te, Tf, TR, TS;
+			 Te = Cr[WS(csr, 8)];
+			 Tf = Cr[WS(csr, 2)];
+			 Tg = Te + Tf;
+			 Tz = Te - Tf;
+			 TR = Ci[WS(csi, 8)];
+			 TS = Ci[WS(csi, 2)];
+			 TT = TR - TS;
+			 T1e = TR + TS;
+		    }
+		    {
+			 E Th, Ti, TA, TB;
+			 Th = Cr[WS(csr, 7)];
+			 Ti = Cr[WS(csr, 3)];
+			 Tj = Th + Ti;
+			 T1d = Th - Ti;
+			 TA = Ci[WS(csi, 7)];
+			 TB = Ci[WS(csi, 3)];
+			 TC = TA + TB;
+			 TU = TB - TA;
+		    }
+		    TQ = TO - TP;
+		    T1n = T1e - T1d;
+		    T1f = T1d + T1e;
+		    T12 = TP + TO;
+		    T1m = T1b - T1a;
+		    TV = TT - TU;
+		    T13 = TU + TT;
+		    T1c = T1a + T1b;
+		    Td = T9 - Tc;
+		    Tk = Tg - Tj;
+		    Tl = Td + Tk;
+		    Ty = Tu + Tx;
+		    TD = Tz - TC;
+		    TE = Ty + TD;
+		    Tn = T9 + Tc;
+		    To = Tg + Tj;
+		    Tp = Tn + To;
+		    TG = Tu - Tx;
+		    TH = Tz + TC;
+		    TI = TG + TH;
+	       }
+	       R0[WS(rs, 5)] = FMA(KP2_000000000, Tl, T6);
+	       R1[WS(rs, 7)] = FMA(KP2_000000000, TE, Tt);
+	       R1[WS(rs, 2)] = FMA(KP2_000000000, TI, TF);
+	       R0[0] = FMA(KP2_000000000, Tp, Tm);
+	       {
+		    E TW, TY, TL, TX, TJ, TK;
+		    TW = FNMS(KP1_902113032, TV, KP1_175570504 * TQ);
+		    TY = FMA(KP1_902113032, TQ, KP1_175570504 * TV);
+		    TJ = FNMS(KP500000000, Tl, T6);
+		    TK = KP1_118033988 * (Td - Tk);
+		    TL = TJ - TK;
+		    TX = TK + TJ;
+		    R0[WS(rs, 1)] = TL - TW;
+		    R0[WS(rs, 7)] = TX + TY;
+		    R0[WS(rs, 9)] = TL + TW;
+		    R0[WS(rs, 3)] = TX - TY;
+	       }
+	       {
+		    E T1g, T1i, T19, T1h, T17, T18;
+		    T1g = FNMS(KP1_902113032, T1f, KP1_175570504 * T1c);
+		    T1i = FMA(KP1_902113032, T1c, KP1_175570504 * T1f);
+		    T17 = FNMS(KP500000000, TI, TF);
+		    T18 = KP1_118033988 * (TG - TH);
+		    T19 = T17 - T18;
+		    T1h = T18 + T17;
+		    R1[WS(rs, 8)] = T19 - T1g;
+		    R1[WS(rs, 4)] = T1h + T1i;
+		    R1[WS(rs, 6)] = T19 + T1g;
+		    R1[0] = T1h - T1i;
+	       }
+	       {
+		    E T1o, T1q, T1l, T1p, T1j, T1k;
+		    T1o = FNMS(KP1_902113032, T1n, KP1_175570504 * T1m);
+		    T1q = FMA(KP1_902113032, T1m, KP1_175570504 * T1n);
+		    T1j = FNMS(KP500000000, TE, Tt);
+		    T1k = KP1_118033988 * (Ty - TD);
+		    T1l = T1j - T1k;
+		    T1p = T1k + T1j;
+		    R1[WS(rs, 3)] = T1l - T1o;
+		    R1[WS(rs, 9)] = T1p + T1q;
+		    R1[WS(rs, 1)] = T1l + T1o;
+		    R1[WS(rs, 5)] = T1p - T1q;
+	       }
+	       {
+		    E T14, T16, T11, T15, TZ, T10;
+		    T14 = FNMS(KP1_902113032, T13, KP1_175570504 * T12);
+		    T16 = FMA(KP1_902113032, T12, KP1_175570504 * T13);
+		    TZ = FNMS(KP500000000, Tp, Tm);
+		    T10 = KP1_118033988 * (Tn - To);
+		    T11 = TZ - T10;
+		    T15 = T10 + TZ;
+		    R0[WS(rs, 6)] = T11 - T14;
+		    R0[WS(rs, 2)] = T15 + T16;
+		    R0[WS(rs, 4)] = T11 + T14;
+		    R0[WS(rs, 8)] = T15 - T16;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cb_20", {70, 14, 16, 0}, &GENUS };
+
+void X(codelet_r2cb_20) (planner *p) {
+     X(kr2c_register) (p, r2cb_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:10 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -name r2cb_25 -include r2cb.h */
+
+/*
+ * This function contains 152 FP additions, 120 FP multiplications,
+ * (or, 32 additions, 0 multiplications, 120 fused multiply/add),
+ * 115 stack variables, 44 constants, and 50 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP979740652, +0.979740652857618686258237536568998933733477632);
+     DK(KP438153340, +0.438153340021931793654057951961031291699532119);
+     DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DK(KP1_606007150, +1.606007150877320829666881187140752009270929701);
+     DK(KP1_721083328, +1.721083328735889354196523361841037632825608373);
+     DK(KP1_011627398, +1.011627398597394192215998921771049272931807941);
+     DK(KP595480289, +0.595480289600000014706716770488118292997907308);
+     DK(KP641441904, +0.641441904830606407298806329068862424939687989);
+     DK(KP452413526, +0.452413526233009763856834323966348796985206956);
+     DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DK(KP933137358, +0.933137358350283770603023973254446451924190884);
+     DK(KP1_666834356, +1.666834356657377354817925100486477686277992119);
+     DK(KP1_842354653, +1.842354653930286640500894870830132058718564461);
+     DK(KP1_082908895, +1.082908895072625554092571180165639018104066379);
+     DK(KP662318342, +0.662318342759882818626911127577439236802190210);
+     DK(KP576710603, +0.576710603632765877371579268136471017090111488);
+     DK(KP484291580, +0.484291580564315559745084187732367906918006201);
+     DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DK(KP1_898359647, +1.898359647016882523151110931686726543423167685);
+     DK(KP1_386580726, +1.386580726567734802700860150804827247498955921);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP1_115827804, +1.115827804063668528375399296931134075984874304);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP499013364, +0.499013364214135780976168403431725276668452610);
+     DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP730409924, +0.730409924561256563751459444999838399157094302);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP451418159, +0.451418159099103183892477933432151804893354132);
+     DK(KP846146756, +0.846146756728608505452954290121135880883743802);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E T1H, T24, T22, T1W, T1Y, T1X, T1Z, T23;
+	       {
+		    E T1G, Tu, T5, T1F, Tr, Te, T2o, T1N, T2a, T1t, TR, T1K, T29, T1u, TG;
+		    E TU, TT, Tn, T1d, T1Q, T2p, T1T, T12, T1P, T1a;
+		    {
+			 E T1, T2, T3, Ts, Tt;
+			 Ts = Ci[WS(csi, 5)];
+			 Tt = Ci[WS(csi, 10)];
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 5)];
+			 T3 = Cr[WS(csr, 10)];
+			 T1G = FMS(KP618033988, Ts, Tt);
+			 Tu = FMA(KP618033988, Tt, Ts);
+			 {
+			      E Tx, Tw, T1M, TQ, TM, T1J, TF, TL;
+			      {
+				   E T6, TH, TO, TP, TB, TI, Td, TJ, TE, T4, Tq, TK;
+				   T6 = Cr[WS(csr, 1)];
+				   T4 = T2 + T3;
+				   Tq = T2 - T3;
+				   TH = Ci[WS(csi, 1)];
+				   {
+					E Ta, T9, Tb, T7, T8, Tp;
+					T7 = Cr[WS(csr, 6)];
+					T8 = Cr[WS(csr, 4)];
+					Tp = FNMS(KP500000000, T4, T1);
+					T5 = FMA(KP2_000000000, T4, T1);
+					Ta = Cr[WS(csr, 11)];
+					TO = T7 - T8;
+					T9 = T7 + T8;
+					T1F = FNMS(KP1_118033988, Tq, Tp);
+					Tr = FMA(KP1_118033988, Tq, Tp);
+					Tb = Cr[WS(csr, 9)];
+					{
+					     E TC, TD, Tz, TA, Tc;
+					     Tz = Ci[WS(csi, 6)];
+					     TA = Ci[WS(csi, 4)];
+					     TP = Tb - Ta;
+					     Tc = Ta + Tb;
+					     TC = Ci[WS(csi, 11)];
+					     TB = Tz + TA;
+					     TI = Tz - TA;
+					     TD = Ci[WS(csi, 9)];
+					     Td = T9 + Tc;
+					     Tx = T9 - Tc;
+					     TJ = TC - TD;
+					     TE = TC + TD;
+					}
+				   }
+				   Te = T6 + Td;
+				   Tw = FNMS(KP250000000, Td, T6);
+				   T1M = FMA(KP618033988, TO, TP);
+				   TQ = FNMS(KP618033988, TP, TO);
+				   TK = TI + TJ;
+				   TM = TI - TJ;
+				   T1J = FNMS(KP618033988, TB, TE);
+				   TF = FMA(KP618033988, TE, TB);
+				   TL = FNMS(KP250000000, TK, TH);
+				   T2o = TK + TH;
+			      }
+			      {
+				   E Tf, T14, T1b, T1c, Tm, TY, T15, T16, T11, T17, T19, T18;
+				   Tf = Cr[WS(csr, 2)];
+				   {
+					E T1L, TN, T1I, Ty;
+					T1L = FNMS(KP559016994, TM, TL);
+					TN = FMA(KP559016994, TM, TL);
+					T1I = FNMS(KP559016994, Tx, Tw);
+					Ty = FMA(KP559016994, Tx, Tw);
+					T1N = FMA(KP951056516, T1M, T1L);
+					T2a = FNMS(KP951056516, T1M, T1L);
+					T1t = FNMS(KP951056516, TQ, TN);
+					TR = FMA(KP951056516, TQ, TN);
+					T1K = FMA(KP951056516, T1J, T1I);
+					T29 = FNMS(KP951056516, T1J, T1I);
+					T1u = FMA(KP951056516, TF, Ty);
+					TG = FNMS(KP951056516, TF, Ty);
+					T14 = Ci[WS(csi, 2)];
+				   }
+				   {
+					E Tg, Th, Tj, Tk;
+					Tg = Cr[WS(csr, 7)];
+					Th = Cr[WS(csr, 3)];
+					Tj = Cr[WS(csr, 12)];
+					Tk = Cr[WS(csr, 8)];
+					{
+					     E TW, Ti, Tl, TX, TZ, T10;
+					     TW = Ci[WS(csi, 7)];
+					     T1b = Th - Tg;
+					     Ti = Tg + Th;
+					     T1c = Tj - Tk;
+					     Tl = Tj + Tk;
+					     TX = Ci[WS(csi, 3)];
+					     TZ = Ci[WS(csi, 12)];
+					     T10 = Ci[WS(csi, 8)];
+					     Tm = Ti + Tl;
+					     TU = Tl - Ti;
+					     TY = TW + TX;
+					     T15 = TW - TX;
+					     T16 = TZ - T10;
+					     T11 = TZ + T10;
+					}
+				   }
+				   TT = FNMS(KP250000000, Tm, Tf);
+				   Tn = Tf + Tm;
+				   T17 = T15 + T16;
+				   T19 = T16 - T15;
+				   T1d = FNMS(KP618033988, T1c, T1b);
+				   T1Q = FMA(KP618033988, T1b, T1c);
+				   T18 = FNMS(KP250000000, T17, T14);
+				   T2p = T17 + T14;
+				   T1T = FNMS(KP618033988, TY, T11);
+				   T12 = FMA(KP618033988, T11, TY);
+				   T1P = FMA(KP559016994, T19, T18);
+				   T1a = FNMS(KP559016994, T19, T18);
+			      }
+			 }
+		    }
+		    {
+			 E T1R, T1e, T1q, T1U, T13, T1r, T2b, T28, T25, T2i, T2k;
+			 {
+			      E T2m, To, T26, T27, TV, T1S;
+			      T2m = Te - Tn;
+			      To = Te + Tn;
+			      TV = FNMS(KP559016994, TU, TT);
+			      T1S = FMA(KP559016994, TU, TT);
+			      T26 = FMA(KP951056516, T1Q, T1P);
+			      T1R = FNMS(KP951056516, T1Q, T1P);
+			      T1e = FNMS(KP951056516, T1d, T1a);
+			      T1q = FMA(KP951056516, T1d, T1a);
+			      T27 = FNMS(KP951056516, T1T, T1S);
+			      T1U = FMA(KP951056516, T1T, T1S);
+			      T13 = FNMS(KP951056516, T12, TV);
+			      T1r = FMA(KP951056516, T12, TV);
+			      {
+				   E T2g, T2q, T2s, T2h, T2n, T2r, T2l;
+				   T2g = FMA(KP939062505, T29, T2a);
+				   T2b = FNMS(KP939062505, T2a, T29);
+				   R0[0] = FMA(KP2_000000000, To, T5);
+				   T2l = FNMS(KP500000000, To, T5);
+				   T2q = FMA(KP618033988, T2p, T2o);
+				   T2s = FNMS(KP618033988, T2o, T2p);
+				   T28 = FNMS(KP062914667, T27, T26);
+				   T2h = FMA(KP062914667, T26, T27);
+				   T2n = FMA(KP1_118033988, T2m, T2l);
+				   T2r = FNMS(KP1_118033988, T2m, T2l);
+				   T25 = FMA(KP1_902113032, T1G, T1F);
+				   T1H = FNMS(KP1_902113032, T1G, T1F);
+				   T2i = FMA(KP846146756, T2h, T2g);
+				   T2k = FNMS(KP451418159, T2g, T2h);
+				   R0[WS(rs, 10)] = FMA(KP1_902113032, T2q, T2n);
+				   R1[WS(rs, 2)] = FNMS(KP1_902113032, T2q, T2n);
+				   R0[WS(rs, 5)] = FMA(KP1_902113032, T2s, T2r);
+				   R1[WS(rs, 7)] = FNMS(KP1_902113032, T2s, T2r);
+			      }
+			 }
+			 {
+			      E TS, T1f, T1p, Tv, T2e, T1o, T1m, T2d, T1k, T1l, T2c;
+			      TS = FNMS(KP256756360, TR, TG);
+			      T1k = FMA(KP256756360, TG, TR);
+			      T1l = FMA(KP549754652, T13, T1e);
+			      T1f = FNMS(KP549754652, T1e, T13);
+			      T1p = FMA(KP1_902113032, Tu, Tr);
+			      Tv = FNMS(KP1_902113032, Tu, Tr);
+			      T2e = FMA(KP730409924, T2b, T28);
+			      T2c = FNMS(KP730409924, T2b, T28);
+			      T1o = FNMS(KP683113946, T1k, T1l);
+			      T1m = FMA(KP559154169, T1l, T1k);
+			      R1[WS(rs, 1)] = FNMS(KP1_996053456, T2c, T25);
+			      T2d = FMA(KP499013364, T2c, T25);
+			      {
+				   E T1C, T1E, T1y, T1w;
+				   {
+					E T1s, T1v, T1i, T1h, T1n, T1j;
+					{
+					     E T1A, T1B, T2f, T2j, T1g;
+					     T1A = FNMS(KP470564281, T1q, T1r);
+					     T1s = FMA(KP470564281, T1r, T1q);
+					     T1v = FNMS(KP634619297, T1u, T1t);
+					     T1B = FMA(KP634619297, T1t, T1u);
+					     T2f = FMA(KP1_115827804, T2e, T2d);
+					     T2j = FNMS(KP1_115827804, T2e, T2d);
+					     T1i = FNMS(KP904730450, T1f, TS);
+					     T1g = FMA(KP904730450, T1f, TS);
+					     R1[WS(rs, 11)] = FMA(KP1_386580726, T2i, T2f);
+					     R0[WS(rs, 4)] = FNMS(KP1_386580726, T2i, T2f);
+					     R1[WS(rs, 6)] = FMA(KP1_898359647, T2k, T2j);
+					     R0[WS(rs, 9)] = FNMS(KP1_898359647, T2k, T2j);
+					     R1[0] = FMA(KP1_937166322, T1g, Tv);
+					     T1h = FNMS(KP484291580, T1g, Tv);
+					     T1C = FNMS(KP576710603, T1B, T1A);
+					     T1E = FMA(KP662318342, T1A, T1B);
+					}
+					T1n = FNMS(KP1_082908895, T1i, T1h);
+					T1j = FMA(KP1_082908895, T1i, T1h);
+					R1[WS(rs, 10)] = FMA(KP1_842354653, T1m, T1j);
+					R0[WS(rs, 3)] = FNMS(KP1_842354653, T1m, T1j);
+					R1[WS(rs, 5)] = FMA(KP1_666834356, T1o, T1n);
+					R0[WS(rs, 8)] = FNMS(KP1_666834356, T1o, T1n);
+					T1y = FNMS(KP933137358, T1v, T1s);
+					T1w = FMA(KP933137358, T1v, T1s);
+				   }
+				   {
+					E T1O, T20, T21, T1V, T1x, T1z, T1D;
+					T1O = FNMS(KP549754652, T1N, T1K);
+					T20 = FMA(KP549754652, T1K, T1N);
+					T21 = FMA(KP634619297, T1R, T1U);
+					T1V = FNMS(KP634619297, T1U, T1R);
+					R0[WS(rs, 2)] = FNMS(KP1_809654104, T1w, T1p);
+					T1x = FMA(KP452413526, T1w, T1p);
+					T24 = FNMS(KP641441904, T20, T21);
+					T22 = FMA(KP595480289, T21, T20);
+					T1z = FNMS(KP1_011627398, T1y, T1x);
+					T1D = FMA(KP1_011627398, T1y, T1x);
+					R1[WS(rs, 9)] = FNMS(KP1_721083328, T1C, T1z);
+					R0[WS(rs, 7)] = FMA(KP1_721083328, T1C, T1z);
+					R0[WS(rs, 12)] = FMA(KP1_606007150, T1E, T1D);
+					R1[WS(rs, 4)] = FNMS(KP1_606007150, T1E, T1D);
+					T1W = FNMS(KP963507348, T1V, T1O);
+					T1Y = FMA(KP963507348, T1V, T1O);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       R0[WS(rs, 1)] = FMA(KP1_752613360, T1W, T1H);
+	       T1X = FNMS(KP438153340, T1W, T1H);
+	       T1Z = FMA(KP979740652, T1Y, T1X);
+	       T23 = FNMS(KP979740652, T1Y, T1X);
+	       R0[WS(rs, 11)] = FMA(KP1_666834356, T22, T1Z);
+	       R1[WS(rs, 3)] = FNMS(KP1_666834356, T22, T1Z);
+	       R1[WS(rs, 8)] = FNMS(KP1_606007150, T24, T23);
+	       R0[WS(rs, 6)] = FMA(KP1_606007150, T24, T23);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cb_25", {32, 0, 120, 0}, &GENUS };
+
+void X(codelet_r2cb_25) (planner *p) {
+     X(kr2c_register) (p, r2cb_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 25 -name r2cb_25 -include r2cb.h */
+
+/*
+ * This function contains 152 FP additions, 98 FP multiplications,
+ * (or, 100 additions, 46 multiplications, 52 fused multiply/add),
+ * 65 stack variables, 21 constants, and 50 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E Tu, T1G, T5, Tr, T1F, TN, TO, Te, TR, T27, T1r, T1N, TG, T26, T1q;
+	       E T1K, T1a, T1b, Tn, T1e, T2a, T1u, T1U, T13, T29, T1t, T1R, Ts, Tt;
+	       Ts = Ci[WS(csi, 5)];
+	       Tt = Ci[WS(csi, 10)];
+	       Tu = FMA(KP1_902113032, Ts, KP1_175570504 * Tt);
+	       T1G = FNMS(KP1_902113032, Tt, KP1_175570504 * Ts);
+	       {
+		    E T1, T4, Tp, T2, T3, Tq;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 5)];
+		    T3 = Cr[WS(csr, 10)];
+		    T4 = T2 + T3;
+		    Tp = KP1_118033988 * (T2 - T3);
+		    T5 = FMA(KP2_000000000, T4, T1);
+		    Tq = FNMS(KP500000000, T4, T1);
+		    Tr = Tp + Tq;
+		    T1F = Tq - Tp;
+	       }
+	       {
+		    E T6, Td, TI, Tw, TH, TB, TE, TM;
+		    T6 = Cr[WS(csr, 1)];
+		    TN = Ci[WS(csi, 1)];
+		    {
+			 E T7, T8, T9, Ta, Tb, Tc;
+			 T7 = Cr[WS(csr, 6)];
+			 T8 = Cr[WS(csr, 4)];
+			 T9 = T7 + T8;
+			 Ta = Cr[WS(csr, 11)];
+			 Tb = Cr[WS(csr, 9)];
+			 Tc = Ta + Tb;
+			 Td = T9 + Tc;
+			 TI = Ta - Tb;
+			 Tw = KP559016994 * (T9 - Tc);
+			 TH = T7 - T8;
+		    }
+		    {
+			 E Tz, TA, TK, TC, TD, TL;
+			 Tz = Ci[WS(csi, 6)];
+			 TA = Ci[WS(csi, 4)];
+			 TK = Tz - TA;
+			 TC = Ci[WS(csi, 11)];
+			 TD = Ci[WS(csi, 9)];
+			 TL = TC - TD;
+			 TB = Tz + TA;
+			 TO = TK + TL;
+			 TE = TC + TD;
+			 TM = KP559016994 * (TK - TL);
+		    }
+		    Te = T6 + Td;
+		    {
+			 E TJ, T1L, TQ, T1M, TP;
+			 TJ = FMA(KP951056516, TH, KP587785252 * TI);
+			 T1L = FNMS(KP951056516, TI, KP587785252 * TH);
+			 TP = FNMS(KP250000000, TO, TN);
+			 TQ = TM + TP;
+			 T1M = TP - TM;
+			 TR = TJ + TQ;
+			 T27 = T1M - T1L;
+			 T1r = TQ - TJ;
+			 T1N = T1L + T1M;
+		    }
+		    {
+			 E TF, T1J, Ty, T1I, Tx;
+			 TF = FMA(KP951056516, TB, KP587785252 * TE);
+			 T1J = FNMS(KP951056516, TE, KP587785252 * TB);
+			 Tx = FNMS(KP250000000, Td, T6);
+			 Ty = Tw + Tx;
+			 T1I = Tx - Tw;
+			 TG = Ty - TF;
+			 T26 = T1I + T1J;
+			 T1q = Ty + TF;
+			 T1K = T1I - T1J;
+		    }
+	       }
+	       {
+		    E Tf, Tm, T15, TT, T14, TY, T11, T19;
+		    Tf = Cr[WS(csr, 2)];
+		    T1a = Ci[WS(csi, 2)];
+		    {
+			 E Tg, Th, Ti, Tj, Tk, Tl;
+			 Tg = Cr[WS(csr, 7)];
+			 Th = Cr[WS(csr, 3)];
+			 Ti = Tg + Th;
+			 Tj = Cr[WS(csr, 12)];
+			 Tk = Cr[WS(csr, 8)];
+			 Tl = Tj + Tk;
+			 Tm = Ti + Tl;
+			 T15 = Tj - Tk;
+			 TT = KP559016994 * (Ti - Tl);
+			 T14 = Tg - Th;
+		    }
+		    {
+			 E TW, TX, T17, TZ, T10, T18;
+			 TW = Ci[WS(csi, 7)];
+			 TX = Ci[WS(csi, 3)];
+			 T17 = TW - TX;
+			 TZ = Ci[WS(csi, 12)];
+			 T10 = Ci[WS(csi, 8)];
+			 T18 = TZ - T10;
+			 TY = TW + TX;
+			 T1b = T17 + T18;
+			 T11 = TZ + T10;
+			 T19 = KP559016994 * (T17 - T18);
+		    }
+		    Tn = Tf + Tm;
+		    {
+			 E T16, T1S, T1d, T1T, T1c;
+			 T16 = FMA(KP951056516, T14, KP587785252 * T15);
+			 T1S = FNMS(KP951056516, T15, KP587785252 * T14);
+			 T1c = FNMS(KP250000000, T1b, T1a);
+			 T1d = T19 + T1c;
+			 T1T = T1c - T19;
+			 T1e = T16 + T1d;
+			 T2a = T1T - T1S;
+			 T1u = T1d - T16;
+			 T1U = T1S + T1T;
+		    }
+		    {
+			 E T12, T1Q, TV, T1P, TU;
+			 T12 = FMA(KP951056516, TY, KP587785252 * T11);
+			 T1Q = FNMS(KP951056516, T11, KP587785252 * TY);
+			 TU = FNMS(KP250000000, Tm, Tf);
+			 TV = TT + TU;
+			 T1P = TU - TT;
+			 T13 = TV - T12;
+			 T29 = T1P + T1Q;
+			 T1t = TV + T12;
+			 T1R = T1P - T1Q;
+		    }
+	       }
+	       {
+		    E T2m, To, T2l, T2q, T2s, T2o, T2p, T2r, T2n;
+		    T2m = KP1_118033988 * (Te - Tn);
+		    To = Te + Tn;
+		    T2l = FNMS(KP500000000, To, T5);
+		    T2o = TO + TN;
+		    T2p = T1b + T1a;
+		    T2q = FNMS(KP1_902113032, T2p, KP1_175570504 * T2o);
+		    T2s = FMA(KP1_902113032, T2o, KP1_175570504 * T2p);
+		    R0[0] = FMA(KP2_000000000, To, T5);
+		    T2r = T2m + T2l;
+		    R1[WS(rs, 2)] = T2r - T2s;
+		    R0[WS(rs, 10)] = T2r + T2s;
+		    T2n = T2l - T2m;
+		    R0[WS(rs, 5)] = T2n - T2q;
+		    R1[WS(rs, 7)] = T2n + T2q;
+	       }
+	       {
+		    E T2i, T2k, T25, T2c, T2d, T2e, T2j, T2f;
+		    {
+			 E T2g, T2h, T28, T2b;
+			 T2g = FMA(KP684547105, T26, KP728968627 * T27);
+			 T2h = FMA(KP998026728, T29, KP062790519 * T2a);
+			 T2i = FNMS(KP1_902113032, T2h, KP1_175570504 * T2g);
+			 T2k = FMA(KP1_902113032, T2g, KP1_175570504 * T2h);
+			 T25 = T1F + T1G;
+			 T28 = FNMS(KP684547105, T27, KP728968627 * T26);
+			 T2b = FNMS(KP998026728, T2a, KP062790519 * T29);
+			 T2c = T28 + T2b;
+			 T2d = FNMS(KP500000000, T2c, T25);
+			 T2e = KP1_118033988 * (T28 - T2b);
+		    }
+		    R1[WS(rs, 1)] = FMA(KP2_000000000, T2c, T25);
+		    T2j = T2e + T2d;
+		    R0[WS(rs, 4)] = T2j - T2k;
+		    R1[WS(rs, 11)] = T2j + T2k;
+		    T2f = T2d - T2e;
+		    R1[WS(rs, 6)] = T2f - T2i;
+		    R0[WS(rs, 9)] = T2f + T2i;
+	       }
+	       {
+		    E T1m, T1o, Tv, T1g, T1h, T1i, T1n, T1j;
+		    {
+			 E T1k, T1l, TS, T1f;
+			 T1k = FMA(KP248689887, TG, KP968583161 * TR);
+			 T1l = FMA(KP481753674, T13, KP876306680 * T1e);
+			 T1m = FNMS(KP1_902113032, T1l, KP1_175570504 * T1k);
+			 T1o = FMA(KP1_902113032, T1k, KP1_175570504 * T1l);
+			 Tv = Tr - Tu;
+			 TS = FNMS(KP248689887, TR, KP968583161 * TG);
+			 T1f = FNMS(KP481753674, T1e, KP876306680 * T13);
+			 T1g = TS + T1f;
+			 T1h = FNMS(KP500000000, T1g, Tv);
+			 T1i = KP1_118033988 * (TS - T1f);
+		    }
+		    R1[0] = FMA(KP2_000000000, T1g, Tv);
+		    T1n = T1i + T1h;
+		    R0[WS(rs, 3)] = T1n - T1o;
+		    R1[WS(rs, 10)] = T1n + T1o;
+		    T1j = T1h - T1i;
+		    R1[WS(rs, 5)] = T1j - T1m;
+		    R0[WS(rs, 8)] = T1j + T1m;
+	       }
+	       {
+		    E T1C, T1E, T1p, T1w, T1x, T1y, T1D, T1z;
+		    {
+			 E T1A, T1B, T1s, T1v;
+			 T1A = FMA(KP844327925, T1q, KP535826794 * T1r);
+			 T1B = FNMS(KP425779291, T1u, KP904827052 * T1t);
+			 T1C = FNMS(KP1_902113032, T1B, KP1_175570504 * T1A);
+			 T1E = FMA(KP1_902113032, T1A, KP1_175570504 * T1B);
+			 T1p = Tr + Tu;
+			 T1s = FNMS(KP844327925, T1r, KP535826794 * T1q);
+			 T1v = FMA(KP425779291, T1t, KP904827052 * T1u);
+			 T1w = T1s - T1v;
+			 T1x = FNMS(KP500000000, T1w, T1p);
+			 T1y = KP1_118033988 * (T1s + T1v);
+		    }
+		    R0[WS(rs, 2)] = FMA(KP2_000000000, T1w, T1p);
+		    T1D = T1x + T1y;
+		    R1[WS(rs, 4)] = T1D - T1E;
+		    R0[WS(rs, 12)] = T1E + T1D;
+		    T1z = T1x - T1y;
+		    R0[WS(rs, 7)] = T1z - T1C;
+		    R1[WS(rs, 9)] = T1C + T1z;
+	       }
+	       {
+		    E T22, T24, T1H, T1W, T1X, T1Y, T23, T1Z;
+		    {
+			 E T20, T21, T1O, T1V;
+			 T20 = FMA(KP481753674, T1K, KP876306680 * T1N);
+			 T21 = FMA(KP844327925, T1R, KP535826794 * T1U);
+			 T22 = FNMS(KP1_902113032, T21, KP1_175570504 * T20);
+			 T24 = FMA(KP1_902113032, T20, KP1_175570504 * T21);
+			 T1H = T1F - T1G;
+			 T1O = FNMS(KP481753674, T1N, KP876306680 * T1K);
+			 T1V = FNMS(KP844327925, T1U, KP535826794 * T1R);
+			 T1W = T1O + T1V;
+			 T1X = FNMS(KP500000000, T1W, T1H);
+			 T1Y = KP1_118033988 * (T1O - T1V);
+		    }
+		    R0[WS(rs, 1)] = FMA(KP2_000000000, T1W, T1H);
+		    T23 = T1Y + T1X;
+		    R1[WS(rs, 3)] = T23 - T24;
+		    R0[WS(rs, 11)] = T23 + T24;
+		    T1Z = T1X - T1Y;
+		    R0[WS(rs, 6)] = T1Z - T22;
+		    R1[WS(rs, 8)] = T1Z + T22;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cb_25", {100, 46, 52, 0}, &GENUS };
+
+void X(codelet_r2cb_25) (planner *p) {
+     X(kr2c_register) (p, r2cb_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -name r2cb_3 -include r2cb.h */
+
+/*
+ * This function contains 4 FP additions, 3 FP multiplications,
+ * (or, 1 additions, 0 multiplications, 3 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T4, T1, T2, T3;
+	       T4 = Ci[WS(csi, 1)];
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       R0[0] = FMA(KP2_000000000, T2, T1);
+	       T3 = T1 - T2;
+	       R1[0] = FNMS(KP1_732050807, T4, T3);
+	       R0[WS(rs, 1)] = FMA(KP1_732050807, T4, T3);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cb_3", {1, 0, 3, 0}, &GENUS };
+
+void X(codelet_r2cb_3) (planner *p) {
+     X(kr2c_register) (p, r2cb_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -name r2cb_3 -include r2cb.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 8 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T5, T1, T2, T3, T4;
+	       T4 = Ci[WS(csi, 1)];
+	       T5 = KP1_732050807 * T4;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       T3 = T1 - T2;
+	       R0[0] = FMA(KP2_000000000, T2, T1);
+	       R0[WS(rs, 1)] = T3 + T5;
+	       R1[0] = T3 - T5;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cb_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cb_3) (planner *p) {
+     X(kr2c_register) (p, r2cb_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include r2cb.h */
+
+/*
+ * This function contains 156 FP additions, 84 FP multiplications,
+ * (or, 72 additions, 0 multiplications, 84 fused multiply/add),
+ * 82 stack variables, 9 constants, and 64 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T1F, T1C, T1H, T1z, T1G, T1I;
+	       {
+		    E T8, T1t, Tz, T1R, T5, T1S, T1u, TE, T1w, TP, T1U, Tg, T2m, T1X, T1x;
+		    E TK, T1D, T1d, T20, To, T2p, T28, T1A, TW, T11, T1e, Tv, T25, T23, T2q;
+		    E T16, T1f, TA, TD;
+		    {
+			 E T4, Ty, T1, T2, T6, T7;
+			 T4 = Cr[WS(csr, 8)];
+			 Ty = Ci[WS(csi, 8)];
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 16)];
+			 T6 = Cr[WS(csr, 4)];
+			 T7 = Cr[WS(csr, 12)];
+			 {
+			      E TB, Tx, T3, TC;
+			      TB = Ci[WS(csi, 4)];
+			      Tx = T1 - T2;
+			      T3 = T1 + T2;
+			      TA = T6 - T7;
+			      T8 = T6 + T7;
+			      TC = Ci[WS(csi, 12)];
+			      T1t = FMA(KP2_000000000, Ty, Tx);
+			      Tz = FNMS(KP2_000000000, Ty, Tx);
+			      T1R = FNMS(KP2_000000000, T4, T3);
+			      T5 = FMA(KP2_000000000, T4, T3);
+			      TD = TB + TC;
+			      T1S = TB - TC;
+			 }
+		    }
+		    {
+			 E Td, TG, Tc, T1V, TO, Te, TH, TI;
+			 {
+			      E Ta, Tb, TM, TN;
+			      Ta = Cr[WS(csr, 2)];
+			      T1u = TA + TD;
+			      TE = TA - TD;
+			      Tb = Cr[WS(csr, 14)];
+			      TM = Ci[WS(csi, 2)];
+			      TN = Ci[WS(csi, 14)];
+			      Td = Cr[WS(csr, 10)];
+			      TG = Ta - Tb;
+			      Tc = Ta + Tb;
+			      T1V = TM - TN;
+			      TO = TM + TN;
+			      Te = Cr[WS(csr, 6)];
+			      TH = Ci[WS(csi, 10)];
+			      TI = Ci[WS(csi, 6)];
+			 }
+			 {
+			      E Tl, TS, Tk, T26, T1c, Tm, TT, TU;
+			      {
+				   E Ti, Tj, T1a, T1b;
+				   Ti = Cr[WS(csr, 1)];
+				   {
+					E TL, Tf, T1W, TJ;
+					TL = Td - Te;
+					Tf = Td + Te;
+					T1W = TH - TI;
+					TJ = TH + TI;
+					T1w = TO - TL;
+					TP = TL + TO;
+					T1U = Tc - Tf;
+					Tg = Tc + Tf;
+					T2m = T1W + T1V;
+					T1X = T1V - T1W;
+					T1x = TG + TJ;
+					TK = TG - TJ;
+					Tj = Cr[WS(csr, 15)];
+				   }
+				   T1a = Ci[WS(csi, 1)];
+				   T1b = Ci[WS(csi, 15)];
+				   Tl = Cr[WS(csr, 9)];
+				   TS = Ti - Tj;
+				   Tk = Ti + Tj;
+				   T26 = T1a - T1b;
+				   T1c = T1a + T1b;
+				   Tm = Cr[WS(csr, 7)];
+				   TT = Ci[WS(csi, 9)];
+				   TU = Ci[WS(csi, 7)];
+			      }
+			      {
+				   E Ts, TX, Tr, T22, T10, Tt, T13, T14;
+				   {
+					E Tp, Tq, TY, TZ;
+					Tp = Cr[WS(csr, 5)];
+					{
+					     E T19, Tn, T27, TV;
+					     T19 = Tl - Tm;
+					     Tn = Tl + Tm;
+					     T27 = TT - TU;
+					     TV = TT + TU;
+					     T1D = T1c - T19;
+					     T1d = T19 + T1c;
+					     T20 = Tk - Tn;
+					     To = Tk + Tn;
+					     T2p = T27 + T26;
+					     T28 = T26 - T27;
+					     T1A = TS + TV;
+					     TW = TS - TV;
+					     Tq = Cr[WS(csr, 11)];
+					}
+					TY = Ci[WS(csi, 5)];
+					TZ = Ci[WS(csi, 11)];
+					Ts = Cr[WS(csr, 3)];
+					TX = Tp - Tq;
+					Tr = Tp + Tq;
+					T22 = TY - TZ;
+					T10 = TY + TZ;
+					Tt = Cr[WS(csr, 13)];
+					T13 = Ci[WS(csi, 3)];
+					T14 = Ci[WS(csi, 13)];
+				   }
+				   {
+					E T12, Tu, T21, T15;
+					T11 = TX - T10;
+					T1e = TX + T10;
+					T12 = Ts - Tt;
+					Tu = Ts + Tt;
+					T21 = T14 - T13;
+					T15 = T13 + T14;
+					Tv = Tr + Tu;
+					T25 = Tr - Tu;
+					T23 = T21 - T22;
+					T2q = T22 + T21;
+					T16 = T12 - T15;
+					T1f = T12 + T15;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1B, T1E, T1l, T1m, T1p, T1o, T1T, T1Y, T29, T2g, T2j, T2f, T2h, T24;
+			 {
+			      E T1g, T17, T2n, T2t, T2u, T2s;
+			      {
+				   E T2o, Tw, T2w, T2r, T2l, T9, Th, T2v;
+				   T2o = To - Tv;
+				   Tw = To + Tv;
+				   T2w = T2q + T2p;
+				   T2r = T2p - T2q;
+				   T1g = T1e - T1f;
+				   T1B = T1e + T1f;
+				   T17 = T11 + T16;
+				   T1E = T16 - T11;
+				   T2l = FNMS(KP2_000000000, T8, T5);
+				   T9 = FMA(KP2_000000000, T8, T5);
+				   Th = FMA(KP2_000000000, Tg, T9);
+				   T2v = FNMS(KP2_000000000, Tg, T9);
+				   T2n = FNMS(KP2_000000000, T2m, T2l);
+				   T2t = FMA(KP2_000000000, T2m, T2l);
+				   R0[WS(rs, 4)] = FNMS(KP2_000000000, T2w, T2v);
+				   R0[WS(rs, 12)] = FMA(KP2_000000000, T2w, T2v);
+				   R0[0] = FMA(KP2_000000000, Tw, Th);
+				   R0[WS(rs, 8)] = FNMS(KP2_000000000, Tw, Th);
+				   T2u = T2o + T2r;
+				   T2s = T2o - T2r;
+			      }
+			      {
+				   E T1j, TR, T18, T1h, TF, TQ;
+				   T1l = FNMS(KP1_414213562, TE, Tz);
+				   TF = FMA(KP1_414213562, TE, Tz);
+				   TQ = FNMS(KP414213562, TP, TK);
+				   T1m = FMA(KP414213562, TK, TP);
+				   R0[WS(rs, 2)] = FMA(KP1_414213562, T2s, T2n);
+				   R0[WS(rs, 10)] = FNMS(KP1_414213562, T2s, T2n);
+				   R0[WS(rs, 6)] = FNMS(KP1_414213562, T2u, T2t);
+				   R0[WS(rs, 14)] = FMA(KP1_414213562, T2u, T2t);
+				   T1j = FNMS(KP1_847759065, TQ, TF);
+				   TR = FMA(KP1_847759065, TQ, TF);
+				   T1p = FNMS(KP707106781, T17, TW);
+				   T18 = FMA(KP707106781, T17, TW);
+				   T1h = FMA(KP707106781, T1g, T1d);
+				   T1o = FNMS(KP707106781, T1g, T1d);
+				   {
+					E T2d, T2e, T1k, T1i;
+					T1T = FNMS(KP2_000000000, T1S, T1R);
+					T2d = FMA(KP2_000000000, T1S, T1R);
+					T2e = T1U + T1X;
+					T1Y = T1U - T1X;
+					T29 = T25 + T28;
+					T2g = T28 - T25;
+					T1k = FMA(KP198912367, T18, T1h);
+					T1i = FNMS(KP198912367, T1h, T18);
+					T2j = FMA(KP1_414213562, T2e, T2d);
+					T2f = FNMS(KP1_414213562, T2e, T2d);
+					R1[WS(rs, 4)] = FNMS(KP1_961570560, T1k, T1j);
+					R1[WS(rs, 12)] = FMA(KP1_961570560, T1k, T1j);
+					R1[0] = FMA(KP1_961570560, T1i, TR);
+					R1[WS(rs, 8)] = FNMS(KP1_961570560, T1i, TR);
+					T2h = T20 - T23;
+					T24 = T20 + T23;
+				   }
+			      }
+			 }
+			 {
+			      E T1v, T1y, T1M, T1P, T1L, T1N;
+			      {
+				   E T1r, T1n, T2k, T2i;
+				   T2k = FMA(KP414213562, T2g, T2h);
+				   T2i = FNMS(KP414213562, T2h, T2g);
+				   T1r = FMA(KP1_847759065, T1m, T1l);
+				   T1n = FNMS(KP1_847759065, T1m, T1l);
+				   R0[WS(rs, 7)] = FNMS(KP1_847759065, T2k, T2j);
+				   R0[WS(rs, 15)] = FMA(KP1_847759065, T2k, T2j);
+				   R0[WS(rs, 11)] = FMA(KP1_847759065, T2i, T2f);
+				   R0[WS(rs, 3)] = FNMS(KP1_847759065, T2i, T2f);
+				   {
+					E T1J, T1K, T1s, T1q;
+					T1v = FNMS(KP1_414213562, T1u, T1t);
+					T1J = FMA(KP1_414213562, T1u, T1t);
+					T1K = FMA(KP414213562, T1w, T1x);
+					T1y = FNMS(KP414213562, T1x, T1w);
+					T1F = FNMS(KP707106781, T1E, T1D);
+					T1M = FMA(KP707106781, T1E, T1D);
+					T1s = FMA(KP668178637, T1o, T1p);
+					T1q = FNMS(KP668178637, T1p, T1o);
+					T1P = FMA(KP1_847759065, T1K, T1J);
+					T1L = FNMS(KP1_847759065, T1K, T1J);
+					R1[WS(rs, 6)] = FNMS(KP1_662939224, T1s, T1r);
+					R1[WS(rs, 14)] = FMA(KP1_662939224, T1s, T1r);
+					R1[WS(rs, 10)] = FMA(KP1_662939224, T1q, T1n);
+					R1[WS(rs, 2)] = FNMS(KP1_662939224, T1q, T1n);
+					T1N = FMA(KP707106781, T1B, T1A);
+					T1C = FNMS(KP707106781, T1B, T1A);
+				   }
+			      }
+			      {
+				   E T2b, T1Z, T1Q, T1O, T2c, T2a;
+				   T1Q = FMA(KP198912367, T1M, T1N);
+				   T1O = FNMS(KP198912367, T1N, T1M);
+				   T2b = FNMS(KP1_414213562, T1Y, T1T);
+				   T1Z = FMA(KP1_414213562, T1Y, T1T);
+				   R1[WS(rs, 7)] = FNMS(KP1_961570560, T1Q, T1P);
+				   R1[WS(rs, 15)] = FMA(KP1_961570560, T1Q, T1P);
+				   R1[WS(rs, 11)] = FMA(KP1_961570560, T1O, T1L);
+				   R1[WS(rs, 3)] = FNMS(KP1_961570560, T1O, T1L);
+				   T2c = FMA(KP414213562, T24, T29);
+				   T2a = FNMS(KP414213562, T29, T24);
+				   T1H = FMA(KP1_847759065, T1y, T1v);
+				   T1z = FNMS(KP1_847759065, T1y, T1v);
+				   R0[WS(rs, 5)] = FNMS(KP1_847759065, T2c, T2b);
+				   R0[WS(rs, 13)] = FMA(KP1_847759065, T2c, T2b);
+				   R0[WS(rs, 1)] = FMA(KP1_847759065, T2a, T1Z);
+				   R0[WS(rs, 9)] = FNMS(KP1_847759065, T2a, T1Z);
+			      }
+			 }
+		    }
+	       }
+	       T1G = FNMS(KP668178637, T1F, T1C);
+	       T1I = FMA(KP668178637, T1C, T1F);
+	       R1[WS(rs, 5)] = FNMS(KP1_662939224, T1I, T1H);
+	       R1[WS(rs, 13)] = FMA(KP1_662939224, T1I, T1H);
+	       R1[WS(rs, 1)] = FMA(KP1_662939224, T1G, T1z);
+	       R1[WS(rs, 9)] = FNMS(KP1_662939224, T1G, T1z);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cb_32", {72, 0, 84, 0}, &GENUS };
+
+void X(codelet_r2cb_32) (planner *p) {
+     X(kr2c_register) (p, r2cb_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include r2cb.h */
+
+/*
+ * This function contains 156 FP additions, 50 FP multiplications,
+ * (or, 140 additions, 34 multiplications, 16 fused multiply/add),
+ * 54 stack variables, 9 constants, and 64 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T9, T2c, TB, T1y, T6, T2b, Ty, T1v, Th, T2e, T2f, TD, TK, T1C, T1F;
+	       E T1h, Tp, T2i, T2m, TN, T13, T1K, T1Y, T1k, Tw, TU, T1l, TW, T1V, T2j;
+	       E T1R, T2l;
+	       {
+		    E T7, T8, T1w, Tz, TA, T1x;
+		    T7 = Cr[WS(csr, 4)];
+		    T8 = Cr[WS(csr, 12)];
+		    T1w = T7 - T8;
+		    Tz = Ci[WS(csi, 4)];
+		    TA = Ci[WS(csi, 12)];
+		    T1x = Tz + TA;
+		    T9 = KP2_000000000 * (T7 + T8);
+		    T2c = KP1_414213562 * (T1w + T1x);
+		    TB = KP2_000000000 * (Tz - TA);
+		    T1y = KP1_414213562 * (T1w - T1x);
+	       }
+	       {
+		    E T5, T1u, T3, T1s;
+		    {
+			 E T4, T1t, T1, T2;
+			 T4 = Cr[WS(csr, 8)];
+			 T5 = KP2_000000000 * T4;
+			 T1t = Ci[WS(csi, 8)];
+			 T1u = KP2_000000000 * T1t;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 16)];
+			 T3 = T1 + T2;
+			 T1s = T1 - T2;
+		    }
+		    T6 = T3 + T5;
+		    T2b = T1s + T1u;
+		    Ty = T3 - T5;
+		    T1v = T1s - T1u;
+	       }
+	       {
+		    E Td, T1A, TG, T1E, Tg, T1D, TJ, T1B;
+		    {
+			 E Tb, Tc, TE, TF;
+			 Tb = Cr[WS(csr, 2)];
+			 Tc = Cr[WS(csr, 14)];
+			 Td = Tb + Tc;
+			 T1A = Tb - Tc;
+			 TE = Ci[WS(csi, 2)];
+			 TF = Ci[WS(csi, 14)];
+			 TG = TE - TF;
+			 T1E = TE + TF;
+		    }
+		    {
+			 E Te, Tf, TH, TI;
+			 Te = Cr[WS(csr, 10)];
+			 Tf = Cr[WS(csr, 6)];
+			 Tg = Te + Tf;
+			 T1D = Te - Tf;
+			 TH = Ci[WS(csi, 10)];
+			 TI = Ci[WS(csi, 6)];
+			 TJ = TH - TI;
+			 T1B = TH + TI;
+		    }
+		    Th = KP2_000000000 * (Td + Tg);
+		    T2e = T1A + T1B;
+		    T2f = T1E - T1D;
+		    TD = Td - Tg;
+		    TK = TG - TJ;
+		    T1C = T1A - T1B;
+		    T1F = T1D + T1E;
+		    T1h = KP2_000000000 * (TJ + TG);
+	       }
+	       {
+		    E Tl, T1I, TZ, T1X, To, T1W, T12, T1J;
+		    {
+			 E Tj, Tk, TX, TY;
+			 Tj = Cr[WS(csr, 1)];
+			 Tk = Cr[WS(csr, 15)];
+			 Tl = Tj + Tk;
+			 T1I = Tj - Tk;
+			 TX = Ci[WS(csi, 1)];
+			 TY = Ci[WS(csi, 15)];
+			 TZ = TX - TY;
+			 T1X = TX + TY;
+		    }
+		    {
+			 E Tm, Tn, T10, T11;
+			 Tm = Cr[WS(csr, 9)];
+			 Tn = Cr[WS(csr, 7)];
+			 To = Tm + Tn;
+			 T1W = Tm - Tn;
+			 T10 = Ci[WS(csi, 9)];
+			 T11 = Ci[WS(csi, 7)];
+			 T12 = T10 - T11;
+			 T1J = T10 + T11;
+		    }
+		    Tp = Tl + To;
+		    T2i = T1I + T1J;
+		    T2m = T1X - T1W;
+		    TN = Tl - To;
+		    T13 = TZ - T12;
+		    T1K = T1I - T1J;
+		    T1Y = T1W + T1X;
+		    T1k = T12 + TZ;
+	       }
+	       {
+		    E Ts, T1L, TT, T1M, Tv, T1O, TQ, T1P;
+		    {
+			 E Tq, Tr, TR, TS;
+			 Tq = Cr[WS(csr, 5)];
+			 Tr = Cr[WS(csr, 11)];
+			 Ts = Tq + Tr;
+			 T1L = Tq - Tr;
+			 TR = Ci[WS(csi, 5)];
+			 TS = Ci[WS(csi, 11)];
+			 TT = TR - TS;
+			 T1M = TR + TS;
+		    }
+		    {
+			 E Tt, Tu, TO, TP;
+			 Tt = Cr[WS(csr, 3)];
+			 Tu = Cr[WS(csr, 13)];
+			 Tv = Tt + Tu;
+			 T1O = Tt - Tu;
+			 TO = Ci[WS(csi, 13)];
+			 TP = Ci[WS(csi, 3)];
+			 TQ = TO - TP;
+			 T1P = TP + TO;
+		    }
+		    Tw = Ts + Tv;
+		    TU = TQ - TT;
+		    T1l = TT + TQ;
+		    TW = Ts - Tv;
+		    {
+			 E T1T, T1U, T1N, T1Q;
+			 T1T = T1L + T1M;
+			 T1U = T1O + T1P;
+			 T1V = KP707106781 * (T1T - T1U);
+			 T2j = KP707106781 * (T1T + T1U);
+			 T1N = T1L - T1M;
+			 T1Q = T1O - T1P;
+			 T1R = KP707106781 * (T1N + T1Q);
+			 T2l = KP707106781 * (T1N - T1Q);
+		    }
+	       }
+	       {
+		    E Tx, T1r, Ti, T1q, Ta;
+		    Tx = KP2_000000000 * (Tp + Tw);
+		    T1r = KP2_000000000 * (T1l + T1k);
+		    Ta = T6 + T9;
+		    Ti = Ta + Th;
+		    T1q = Ta - Th;
+		    R0[WS(rs, 8)] = Ti - Tx;
+		    R0[WS(rs, 12)] = T1q + T1r;
+		    R0[0] = Ti + Tx;
+		    R0[WS(rs, 4)] = T1q - T1r;
+	       }
+	       {
+		    E T1i, T1o, T1n, T1p, T1g, T1j, T1m;
+		    T1g = T6 - T9;
+		    T1i = T1g - T1h;
+		    T1o = T1g + T1h;
+		    T1j = Tp - Tw;
+		    T1m = T1k - T1l;
+		    T1n = KP1_414213562 * (T1j - T1m);
+		    T1p = KP1_414213562 * (T1j + T1m);
+		    R0[WS(rs, 10)] = T1i - T1n;
+		    R0[WS(rs, 14)] = T1o + T1p;
+		    R0[WS(rs, 2)] = T1i + T1n;
+		    R0[WS(rs, 6)] = T1o - T1p;
+	       }
+	       {
+		    E TM, T16, T15, T17;
+		    {
+			 E TC, TL, TV, T14;
+			 TC = Ty - TB;
+			 TL = KP1_414213562 * (TD - TK);
+			 TM = TC + TL;
+			 T16 = TC - TL;
+			 TV = TN + TU;
+			 T14 = TW + T13;
+			 T15 = FNMS(KP765366864, T14, KP1_847759065 * TV);
+			 T17 = FMA(KP765366864, TV, KP1_847759065 * T14);
+		    }
+		    R0[WS(rs, 9)] = TM - T15;
+		    R0[WS(rs, 13)] = T16 + T17;
+		    R0[WS(rs, 1)] = TM + T15;
+		    R0[WS(rs, 5)] = T16 - T17;
+	       }
+	       {
+		    E T2t, T2x, T2w, T2y;
+		    {
+			 E T2r, T2s, T2u, T2v;
+			 T2r = T2b + T2c;
+			 T2s = FMA(KP1_847759065, T2e, KP765366864 * T2f);
+			 T2t = T2r - T2s;
+			 T2x = T2r + T2s;
+			 T2u = T2i + T2j;
+			 T2v = T2m - T2l;
+			 T2w = FNMS(KP1_961570560, T2v, KP390180644 * T2u);
+			 T2y = FMA(KP1_961570560, T2u, KP390180644 * T2v);
+		    }
+		    R1[WS(rs, 11)] = T2t - T2w;
+		    R1[WS(rs, 15)] = T2x + T2y;
+		    R1[WS(rs, 3)] = T2t + T2w;
+		    R1[WS(rs, 7)] = T2x - T2y;
+	       }
+	       {
+		    E T1a, T1e, T1d, T1f;
+		    {
+			 E T18, T19, T1b, T1c;
+			 T18 = Ty + TB;
+			 T19 = KP1_414213562 * (TD + TK);
+			 T1a = T18 - T19;
+			 T1e = T18 + T19;
+			 T1b = TN - TU;
+			 T1c = T13 - TW;
+			 T1d = FNMS(KP1_847759065, T1c, KP765366864 * T1b);
+			 T1f = FMA(KP1_847759065, T1b, KP765366864 * T1c);
+		    }
+		    R0[WS(rs, 11)] = T1a - T1d;
+		    R0[WS(rs, 15)] = T1e + T1f;
+		    R0[WS(rs, 3)] = T1a + T1d;
+		    R0[WS(rs, 7)] = T1e - T1f;
+	       }
+	       {
+		    E T25, T29, T28, T2a;
+		    {
+			 E T23, T24, T26, T27;
+			 T23 = T1v - T1y;
+			 T24 = FMA(KP765366864, T1C, KP1_847759065 * T1F);
+			 T25 = T23 - T24;
+			 T29 = T23 + T24;
+			 T26 = T1K - T1R;
+			 T27 = T1Y - T1V;
+			 T28 = FNMS(KP1_662939224, T27, KP1_111140466 * T26);
+			 T2a = FMA(KP1_662939224, T26, KP1_111140466 * T27);
+		    }
+		    R1[WS(rs, 10)] = T25 - T28;
+		    R1[WS(rs, 14)] = T29 + T2a;
+		    R1[WS(rs, 2)] = T25 + T28;
+		    R1[WS(rs, 6)] = T29 - T2a;
+	       }
+	       {
+		    E T2h, T2p, T2o, T2q;
+		    {
+			 E T2d, T2g, T2k, T2n;
+			 T2d = T2b - T2c;
+			 T2g = FNMS(KP1_847759065, T2f, KP765366864 * T2e);
+			 T2h = T2d + T2g;
+			 T2p = T2d - T2g;
+			 T2k = T2i - T2j;
+			 T2n = T2l + T2m;
+			 T2o = FNMS(KP1_111140466, T2n, KP1_662939224 * T2k);
+			 T2q = FMA(KP1_111140466, T2k, KP1_662939224 * T2n);
+		    }
+		    R1[WS(rs, 9)] = T2h - T2o;
+		    R1[WS(rs, 13)] = T2p + T2q;
+		    R1[WS(rs, 1)] = T2h + T2o;
+		    R1[WS(rs, 5)] = T2p - T2q;
+	       }
+	       {
+		    E T1H, T21, T20, T22;
+		    {
+			 E T1z, T1G, T1S, T1Z;
+			 T1z = T1v + T1y;
+			 T1G = FNMS(KP765366864, T1F, KP1_847759065 * T1C);
+			 T1H = T1z + T1G;
+			 T21 = T1z - T1G;
+			 T1S = T1K + T1R;
+			 T1Z = T1V + T1Y;
+			 T20 = FNMS(KP390180644, T1Z, KP1_961570560 * T1S);
+			 T22 = FMA(KP390180644, T1S, KP1_961570560 * T1Z);
+		    }
+		    R1[WS(rs, 8)] = T1H - T20;
+		    R1[WS(rs, 12)] = T21 + T22;
+		    R1[0] = T1H + T20;
+		    R1[WS(rs, 4)] = T21 - T22;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cb_32", {140, 34, 16, 0}, &GENUS };
+
+void X(codelet_r2cb_32) (planner *p) {
+     X(kr2c_register) (p, r2cb_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -name r2cb_4 -include r2cb.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 4 fused multiply/add),
+ * 8 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T4, T6, T1, T2, T3, T5;
+	       T4 = Cr[WS(csr, 1)];
+	       T6 = Ci[WS(csi, 1)];
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 2)];
+	       T3 = T1 + T2;
+	       T5 = T1 - T2;
+	       R1[0] = FNMS(KP2_000000000, T6, T5);
+	       R1[WS(rs, 1)] = FMA(KP2_000000000, T6, T5);
+	       R0[0] = FMA(KP2_000000000, T4, T3);
+	       R0[WS(rs, 1)] = FNMS(KP2_000000000, T4, T3);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cb_4", {2, 0, 4, 0}, &GENUS };
+
+void X(codelet_r2cb_4) (planner *p) {
+     X(kr2c_register) (p, r2cb_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -name r2cb_4 -include r2cb.h */
+
+/*
+ * This function contains 6 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 2 multiplications, 0 fused multiply/add),
+ * 10 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T5, T8, T3, T6;
+	       {
+		    E T4, T7, T1, T2;
+		    T4 = Cr[WS(csr, 1)];
+		    T5 = KP2_000000000 * T4;
+		    T7 = Ci[WS(csi, 1)];
+		    T8 = KP2_000000000 * T7;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 2)];
+		    T3 = T1 + T2;
+		    T6 = T1 - T2;
+	       }
+	       R0[WS(rs, 1)] = T3 - T5;
+	       R1[WS(rs, 1)] = T6 + T8;
+	       R0[0] = T3 + T5;
+	       R1[0] = T6 - T8;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cb_4", {6, 2, 0, 0}, &GENUS };
+
+void X(codelet_r2cb_4) (planner *p) {
+     X(kr2c_register) (p, r2cb_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -name r2cb_5 -include r2cb.h */
+
+/*
+ * This function contains 12 FP additions, 10 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 10 fused multiply/add),
+ * 18 stack variables, 5 constants, and 10 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E T1, T2, T3, Tc, Ta, T8, T9;
+	       T8 = Ci[WS(csi, 1)];
+	       T9 = Ci[WS(csi, 2)];
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 1)];
+	       T3 = Cr[WS(csr, 2)];
+	       Tc = FMS(KP618033988, T8, T9);
+	       Ta = FMA(KP618033988, T9, T8);
+	       {
+		    E T6, T4, T5, T7, Tb;
+		    T6 = T2 - T3;
+		    T4 = T2 + T3;
+		    R0[0] = FMA(KP2_000000000, T4, T1);
+		    T5 = FNMS(KP500000000, T4, T1);
+		    T7 = FMA(KP1_118033988, T6, T5);
+		    Tb = FNMS(KP1_118033988, T6, T5);
+		    R0[WS(rs, 2)] = FMA(KP1_902113032, Ta, T7);
+		    R1[0] = FNMS(KP1_902113032, Ta, T7);
+		    R1[WS(rs, 1)] = FMA(KP1_902113032, Tc, Tb);
+		    R0[WS(rs, 1)] = FNMS(KP1_902113032, Tc, Tb);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cb_5", {2, 0, 10, 0}, &GENUS };
+
+void X(codelet_r2cb_5) (planner *p) {
+     X(kr2c_register) (p, r2cb_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -name r2cb_5 -include r2cb.h */
+
+/*
+ * This function contains 12 FP additions, 7 FP multiplications,
+ * (or, 8 additions, 3 multiplications, 4 fused multiply/add),
+ * 18 stack variables, 5 constants, and 10 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
+     DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E Ta, Tc, T1, T4, T5, T6, Tb, T7;
+	       {
+		    E T8, T9, T2, T3;
+		    T8 = Ci[WS(csi, 1)];
+		    T9 = Ci[WS(csi, 2)];
+		    Ta = FNMS(KP1_902113032, T9, KP1_175570504 * T8);
+		    Tc = FMA(KP1_902113032, T8, KP1_175570504 * T9);
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 1)];
+		    T3 = Cr[WS(csr, 2)];
+		    T4 = T2 + T3;
+		    T5 = FNMS(KP500000000, T4, T1);
+		    T6 = KP1_118033988 * (T2 - T3);
+	       }
+	       R0[0] = FMA(KP2_000000000, T4, T1);
+	       Tb = T6 + T5;
+	       R1[0] = Tb - Tc;
+	       R0[WS(rs, 2)] = Tb + Tc;
+	       T7 = T5 - T6;
+	       R0[WS(rs, 1)] = T7 - Ta;
+	       R1[WS(rs, 1)] = T7 + Ta;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cb_5", {8, 3, 4, 0}, &GENUS };
+
+void X(codelet_r2cb_5) (planner *p) {
+     X(kr2c_register) (p, r2cb_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -name r2cb_6 -include r2cb.h */
+
+/*
+ * This function contains 14 FP additions, 6 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T4, T7, T3, Te, Tc, T5;
+	       {
+		    E T1, T2, Ta, Tb;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 3)];
+		    Ta = Ci[WS(csi, 2)];
+		    Tb = Ci[WS(csi, 1)];
+		    T4 = Cr[WS(csr, 2)];
+		    T7 = T1 - T2;
+		    T3 = T1 + T2;
+		    Te = Ta + Tb;
+		    Tc = Ta - Tb;
+		    T5 = Cr[WS(csr, 1)];
+	       }
+	       {
+		    E T6, T8, Td, T9;
+		    T6 = T4 + T5;
+		    T8 = T5 - T4;
+		    Td = T7 + T8;
+		    R1[WS(rs, 1)] = FNMS(KP2_000000000, T8, T7);
+		    T9 = T3 - T6;
+		    R0[0] = FMA(KP2_000000000, T6, T3);
+		    R1[WS(rs, 2)] = FMA(KP1_732050807, Te, Td);
+		    R1[0] = FNMS(KP1_732050807, Te, Td);
+		    R0[WS(rs, 1)] = FMA(KP1_732050807, Tc, T9);
+		    R0[WS(rs, 2)] = FNMS(KP1_732050807, Tc, T9);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cb_6", {8, 0, 6, 0}, &GENUS };
+
+void X(codelet_r2cb_6) (planner *p) {
+     X(kr2c_register) (p, r2cb_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -name r2cb_6 -include r2cb.h */
+
+/*
+ * This function contains 14 FP additions, 4 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 2 fused multiply/add),
+ * 17 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T3, T7, Tc, Te, T6, T8, T1, T2, T9, Td;
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 3)];
+	       T3 = T1 - T2;
+	       T7 = T1 + T2;
+	       {
+		    E Ta, Tb, T4, T5;
+		    Ta = Ci[WS(csi, 2)];
+		    Tb = Ci[WS(csi, 1)];
+		    Tc = KP1_732050807 * (Ta - Tb);
+		    Te = KP1_732050807 * (Ta + Tb);
+		    T4 = Cr[WS(csr, 2)];
+		    T5 = Cr[WS(csr, 1)];
+		    T6 = T4 - T5;
+		    T8 = T4 + T5;
+	       }
+	       R1[WS(rs, 1)] = FMA(KP2_000000000, T6, T3);
+	       R0[0] = FMA(KP2_000000000, T8, T7);
+	       T9 = T7 - T8;
+	       R0[WS(rs, 2)] = T9 - Tc;
+	       R0[WS(rs, 1)] = T9 + Tc;
+	       Td = T3 - T6;
+	       R1[0] = Td - Te;
+	       R1[WS(rs, 2)] = Td + Te;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cb_6", {12, 2, 2, 0}, &GENUS };
+
+void X(codelet_r2cb_6) (planner *p) {
+     X(kr2c_register) (p, r2cb_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1392 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -name r2cb_64 -include r2cb.h */
+
+/*
+ * This function contains 394 FP additions, 216 FP multiplications,
+ * (or, 178 additions, 0 multiplications, 216 fused multiply/add),
+ * 143 stack variables, 18 constants, and 128 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T3d, T32, T37, T2Z, T3f, T3b, T3c, T35;
+	       {
+		    E T5H, T9, T5j, T4p, T2T, T1b, T3Z, T3j, Tg, T5I, T5k, T4u, T40, T3m, T2U;
+		    E T1m, T3o, T1s, T1J, T3r, T5K, Tw, T5N, T6c, T4A, T5n, T3s, T1D, T5m, T4F;
+		    E T3p, T1M, T3w, T1U, T2z, T3H, T5Q, TM, T6f, T5Y, T5q, T4M, T3I, T25, T5t;
+		    E T53, T3x, T2C, T3A, T5V, T11, T6g, T5T, T55, T4W, T3z, T2E, T2h, T2F, T2s;
+		    E T3L, T3E, T54, T4R;
+		    {
+			 E Td, T1c, Tc, T4r, T1k, Te, T1d, T1e;
+			 {
+			      E T3h, T15, T1a, T3i;
+			      {
+				   E T4, T14, T17, T13, T3, T16, T8, T18;
+				   T4 = Cr[WS(csr, 16)];
+				   T14 = Ci[WS(csi, 16)];
+				   {
+					E T1, T2, T6, T7;
+					T1 = Cr[0];
+					T2 = Cr[WS(csr, 32)];
+					T6 = Cr[WS(csr, 8)];
+					T7 = Cr[WS(csr, 24)];
+					T17 = Ci[WS(csi, 8)];
+					T13 = T1 - T2;
+					T3 = T1 + T2;
+					T16 = T6 - T7;
+					T8 = T6 + T7;
+					T18 = Ci[WS(csi, 24)];
+				   }
+				   {
+					E T4n, T5, T4o, T19;
+					T4n = FNMS(KP2_000000000, T4, T3);
+					T5 = FMA(KP2_000000000, T4, T3);
+					T3h = FMA(KP2_000000000, T14, T13);
+					T15 = FNMS(KP2_000000000, T14, T13);
+					T4o = T17 - T18;
+					T19 = T17 + T18;
+					T5H = FNMS(KP2_000000000, T8, T5);
+					T9 = FMA(KP2_000000000, T8, T5);
+					T5j = FMA(KP2_000000000, T4o, T4n);
+					T4p = FNMS(KP2_000000000, T4o, T4n);
+					T1a = T16 - T19;
+					T3i = T16 + T19;
+				   }
+			      }
+			      {
+				   E Ta, Tb, T1i, T1j;
+				   Ta = Cr[WS(csr, 4)];
+				   T2T = FNMS(KP1_414213562, T1a, T15);
+				   T1b = FMA(KP1_414213562, T1a, T15);
+				   T3Z = FMA(KP1_414213562, T3i, T3h);
+				   T3j = FNMS(KP1_414213562, T3i, T3h);
+				   Tb = Cr[WS(csr, 28)];
+				   T1i = Ci[WS(csi, 4)];
+				   T1j = Ci[WS(csi, 28)];
+				   Td = Cr[WS(csr, 20)];
+				   T1c = Ta - Tb;
+				   Tc = Ta + Tb;
+				   T4r = T1i - T1j;
+				   T1k = T1i + T1j;
+				   Te = Cr[WS(csr, 12)];
+				   T1d = Ci[WS(csi, 20)];
+				   T1e = Ci[WS(csi, 12)];
+			      }
+			 }
+			 {
+			      E T4B, T4E, T1K, T1L;
+			      {
+				   E T1o, Tk, T4C, T1I, T1F, Tn, T4D, T1r, Ts, T1t, Tr, T4y, T1w, Tt, T1z;
+				   E T1A;
+				   {
+					E Tl, Tm, T1p, T1q;
+					{
+					     E Ti, Tj, T1G, T1H, T1h, Tf;
+					     Ti = Cr[WS(csr, 2)];
+					     T1h = Td - Te;
+					     Tf = Td + Te;
+					     {
+						  E T4s, T1f, T3k, T1l;
+						  T4s = T1d - T1e;
+						  T1f = T1d + T1e;
+						  T3k = T1k - T1h;
+						  T1l = T1h + T1k;
+						  {
+						       E T4q, T4t, T3l, T1g;
+						       T4q = Tc - Tf;
+						       Tg = Tc + Tf;
+						       T4t = T4r - T4s;
+						       T5I = T4s + T4r;
+						       T3l = T1c + T1f;
+						       T1g = T1c - T1f;
+						       T5k = T4q + T4t;
+						       T4u = T4q - T4t;
+						       T40 = FMA(KP414213562, T3k, T3l);
+						       T3m = FNMS(KP414213562, T3l, T3k);
+						       T2U = FMA(KP414213562, T1g, T1l);
+						       T1m = FNMS(KP414213562, T1l, T1g);
+						       Tj = Cr[WS(csr, 30)];
+						  }
+					     }
+					     T1G = Ci[WS(csi, 2)];
+					     T1H = Ci[WS(csi, 30)];
+					     Tl = Cr[WS(csr, 18)];
+					     T1o = Ti - Tj;
+					     Tk = Ti + Tj;
+					     T4C = T1G - T1H;
+					     T1I = T1G + T1H;
+					     Tm = Cr[WS(csr, 14)];
+					     T1p = Ci[WS(csi, 18)];
+					     T1q = Ci[WS(csi, 14)];
+					}
+					{
+					     E Tp, Tq, T1u, T1v;
+					     Tp = Cr[WS(csr, 10)];
+					     T1F = Tl - Tm;
+					     Tn = Tl + Tm;
+					     T4D = T1p - T1q;
+					     T1r = T1p + T1q;
+					     Tq = Cr[WS(csr, 22)];
+					     T1u = Ci[WS(csi, 10)];
+					     T1v = Ci[WS(csi, 22)];
+					     Ts = Cr[WS(csr, 6)];
+					     T1t = Tp - Tq;
+					     Tr = Tp + Tq;
+					     T4y = T1u - T1v;
+					     T1w = T1u + T1v;
+					     Tt = Cr[WS(csr, 26)];
+					     T1z = Ci[WS(csi, 6)];
+					     T1A = Ci[WS(csi, 26)];
+					}
+				   }
+				   {
+					E T1y, T4x, T1B, T4w, To, Tv, Tu;
+					T3o = T1o + T1r;
+					T1s = T1o - T1r;
+					T1y = Ts - Tt;
+					Tu = Ts + Tt;
+					T4x = T1A - T1z;
+					T1B = T1z + T1A;
+					T1J = T1F + T1I;
+					T3r = T1I - T1F;
+					T4w = Tk - Tn;
+					To = Tk + Tn;
+					Tv = Tr + Tu;
+					T4B = Tr - Tu;
+					{
+					     E T4z, T5L, T5M, T1x, T1C;
+					     T4E = T4C - T4D;
+					     T5L = T4D + T4C;
+					     T5M = T4y + T4x;
+					     T4z = T4x - T4y;
+					     T5K = To - Tv;
+					     Tw = To + Tv;
+					     T5N = T5L - T5M;
+					     T6c = T5M + T5L;
+					     T1K = T1t + T1w;
+					     T1x = T1t - T1w;
+					     T1C = T1y - T1B;
+					     T1L = T1y + T1B;
+					     T4A = T4w + T4z;
+					     T5n = T4w - T4z;
+					     T3s = T1C - T1x;
+					     T1D = T1x + T1C;
+					}
+				   }
+			      }
+			      {
+				   E T4Z, T52, T2A, T2B;
+				   {
+					E T1Q, TA, T50, T2y, T2v, TD, T51, T1T, TI, T1V, TH, T4K, T1Y, TJ, T21;
+					E T22;
+					{
+					     E TB, TC, T1R, T1S;
+					     {
+						  E Ty, Tz, T2w, T2x;
+						  Ty = Cr[WS(csr, 1)];
+						  T5m = T4E - T4B;
+						  T4F = T4B + T4E;
+						  T3p = T1K + T1L;
+						  T1M = T1K - T1L;
+						  Tz = Cr[WS(csr, 31)];
+						  T2w = Ci[WS(csi, 1)];
+						  T2x = Ci[WS(csi, 31)];
+						  TB = Cr[WS(csr, 17)];
+						  T1Q = Ty - Tz;
+						  TA = Ty + Tz;
+						  T50 = T2w - T2x;
+						  T2y = T2w + T2x;
+						  TC = Cr[WS(csr, 15)];
+						  T1R = Ci[WS(csi, 17)];
+						  T1S = Ci[WS(csi, 15)];
+					     }
+					     {
+						  E TF, TG, T1W, T1X;
+						  TF = Cr[WS(csr, 9)];
+						  T2v = TB - TC;
+						  TD = TB + TC;
+						  T51 = T1R - T1S;
+						  T1T = T1R + T1S;
+						  TG = Cr[WS(csr, 23)];
+						  T1W = Ci[WS(csi, 9)];
+						  T1X = Ci[WS(csi, 23)];
+						  TI = Cr[WS(csr, 7)];
+						  T1V = TF - TG;
+						  TH = TF + TG;
+						  T4K = T1W - T1X;
+						  T1Y = T1W + T1X;
+						  TJ = Cr[WS(csr, 25)];
+						  T21 = Ci[WS(csi, 7)];
+						  T22 = Ci[WS(csi, 25)];
+					     }
+					}
+					{
+					     E T20, T4J, T23, T4I, TE, TL, TK;
+					     T3w = T1Q + T1T;
+					     T1U = T1Q - T1T;
+					     T20 = TI - TJ;
+					     TK = TI + TJ;
+					     T4J = T22 - T21;
+					     T23 = T21 + T22;
+					     T2z = T2v + T2y;
+					     T3H = T2y - T2v;
+					     T4I = TA - TD;
+					     TE = TA + TD;
+					     TL = TH + TK;
+					     T4Z = TH - TK;
+					     {
+						  E T4L, T5W, T5X, T1Z, T24;
+						  T52 = T50 - T51;
+						  T5W = T51 + T50;
+						  T5X = T4K + T4J;
+						  T4L = T4J - T4K;
+						  T5Q = TE - TL;
+						  TM = TE + TL;
+						  T6f = T5X + T5W;
+						  T5Y = T5W - T5X;
+						  T2A = T1V + T1Y;
+						  T1Z = T1V - T1Y;
+						  T24 = T20 - T23;
+						  T2B = T20 + T23;
+						  T5q = T4I - T4L;
+						  T4M = T4I + T4L;
+						  T3I = T24 - T1Z;
+						  T25 = T1Z + T24;
+					     }
+					}
+				   }
+				   {
+					E T27, TP, T4O, T2f, T2c, TS, T4P, T2a, TX, T2i, TW, T4T, T2q, TY, T2j;
+					E T2k;
+					{
+					     E TQ, TR, T28, T29;
+					     {
+						  E TN, TO, T2d, T2e;
+						  TN = Cr[WS(csr, 5)];
+						  T5t = T52 - T4Z;
+						  T53 = T4Z + T52;
+						  T3x = T2A + T2B;
+						  T2C = T2A - T2B;
+						  TO = Cr[WS(csr, 27)];
+						  T2d = Ci[WS(csi, 5)];
+						  T2e = Ci[WS(csi, 27)];
+						  TQ = Cr[WS(csr, 21)];
+						  T27 = TN - TO;
+						  TP = TN + TO;
+						  T4O = T2d - T2e;
+						  T2f = T2d + T2e;
+						  TR = Cr[WS(csr, 11)];
+						  T28 = Ci[WS(csi, 21)];
+						  T29 = Ci[WS(csi, 11)];
+					     }
+					     {
+						  E TU, TV, T2o, T2p;
+						  TU = Cr[WS(csr, 3)];
+						  T2c = TQ - TR;
+						  TS = TQ + TR;
+						  T4P = T28 - T29;
+						  T2a = T28 + T29;
+						  TV = Cr[WS(csr, 29)];
+						  T2o = Ci[WS(csi, 3)];
+						  T2p = Ci[WS(csi, 29)];
+						  TX = Cr[WS(csr, 13)];
+						  T2i = TU - TV;
+						  TW = TU + TV;
+						  T4T = T2p - T2o;
+						  T2q = T2o + T2p;
+						  TY = Cr[WS(csr, 19)];
+						  T2j = Ci[WS(csi, 13)];
+						  T2k = Ci[WS(csi, 19)];
+					     }
+					}
+					{
+					     E T4N, T2n, T2l, T4Q, T2b, T2g, TT, TZ, T4U;
+					     T4N = TP - TS;
+					     TT = TP + TS;
+					     T2n = TX - TY;
+					     TZ = TX + TY;
+					     T4U = T2j - T2k;
+					     T2l = T2j + T2k;
+					     {
+						  E T5S, T10, T4S, T4V, T5R;
+						  T5S = T4P + T4O;
+						  T4Q = T4O - T4P;
+						  T10 = TW + TZ;
+						  T4S = TW - TZ;
+						  T4V = T4T - T4U;
+						  T5R = T4U + T4T;
+						  T3A = T27 + T2a;
+						  T2b = T27 - T2a;
+						  T5V = TT - T10;
+						  T11 = TT + T10;
+						  T6g = T5S + T5R;
+						  T5T = T5R - T5S;
+						  T55 = T4V - T4S;
+						  T4W = T4S + T4V;
+						  T2g = T2c + T2f;
+						  T3z = T2f - T2c;
+					     }
+					     {
+						  E T3D, T3C, T2m, T2r;
+						  T3D = T2i + T2l;
+						  T2m = T2i - T2l;
+						  T2r = T2n - T2q;
+						  T3C = T2n + T2q;
+						  T2E = FMA(KP414213562, T2b, T2g);
+						  T2h = FNMS(KP414213562, T2g, T2b);
+						  T2F = FNMS(KP414213562, T2m, T2r);
+						  T2s = FMA(KP414213562, T2r, T2m);
+						  T3L = FMA(KP414213562, T3C, T3D);
+						  T3E = FNMS(KP414213562, T3D, T3C);
+						  T54 = T4N + T4Q;
+						  T4R = T4N - T4Q;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3K, T3B, T5u, T5r, T5d, T5g;
+			 {
+			      E T6e, T6h, T6b, T5J, T5O, T5Z, T66, T69, T65, T67, T5U, T12, T6m, Th;
+			      T6e = TM - T11;
+			      T12 = TM + T11;
+			      T6m = T6g + T6f;
+			      T6h = T6f - T6g;
+			      T6b = FNMS(KP2_000000000, Tg, T9);
+			      Th = FMA(KP2_000000000, Tg, T9);
+			      T3K = FMA(KP414213562, T3z, T3A);
+			      T3B = FNMS(KP414213562, T3A, T3z);
+			      {
+				   E T63, T64, T6l, Tx;
+				   T5J = FNMS(KP2_000000000, T5I, T5H);
+				   T63 = FMA(KP2_000000000, T5I, T5H);
+				   T64 = T5K + T5N;
+				   T5O = T5K - T5N;
+				   T5Z = T5V + T5Y;
+				   T66 = T5Y - T5V;
+				   T6l = FNMS(KP2_000000000, Tw, Th);
+				   Tx = FMA(KP2_000000000, Tw, Th);
+				   T69 = FMA(KP1_414213562, T64, T63);
+				   T65 = FNMS(KP1_414213562, T64, T63);
+				   R0[WS(rs, 8)] = FNMS(KP2_000000000, T6m, T6l);
+				   R0[WS(rs, 24)] = FMA(KP2_000000000, T6m, T6l);
+				   R0[0] = FMA(KP2_000000000, T12, Tx);
+				   R0[WS(rs, 16)] = FNMS(KP2_000000000, T12, Tx);
+				   T67 = T5Q - T5T;
+				   T5U = T5Q + T5T;
+			      }
+			      {
+				   E T6j, T6d, T6a, T68;
+				   T6a = FMA(KP414213562, T66, T67);
+				   T68 = FNMS(KP414213562, T67, T66);
+				   T6j = FMA(KP2_000000000, T6c, T6b);
+				   T6d = FNMS(KP2_000000000, T6c, T6b);
+				   R0[WS(rs, 14)] = FNMS(KP1_847759065, T6a, T69);
+				   R0[WS(rs, 30)] = FMA(KP1_847759065, T6a, T69);
+				   R0[WS(rs, 22)] = FMA(KP1_847759065, T68, T65);
+				   R0[WS(rs, 6)] = FNMS(KP1_847759065, T68, T65);
+				   {
+					E T61, T5P, T6k, T6i;
+					T6k = T6e + T6h;
+					T6i = T6e - T6h;
+					T61 = FNMS(KP1_414213562, T5O, T5J);
+					T5P = FMA(KP1_414213562, T5O, T5J);
+					R0[WS(rs, 12)] = FNMS(KP1_414213562, T6k, T6j);
+					R0[WS(rs, 28)] = FMA(KP1_414213562, T6k, T6j);
+					R0[WS(rs, 4)] = FMA(KP1_414213562, T6i, T6d);
+					R0[WS(rs, 20)] = FNMS(KP1_414213562, T6i, T6d);
+					{
+					     E T5b, T4v, T5f, T4Y, T5e, T57, T4G, T5c;
+					     {
+						  E T4X, T56, T62, T60;
+						  T5u = T4W - T4R;
+						  T4X = T4R + T4W;
+						  T56 = T54 + T55;
+						  T5r = T54 - T55;
+						  T5b = FNMS(KP1_414213562, T4u, T4p);
+						  T4v = FMA(KP1_414213562, T4u, T4p);
+						  T62 = FMA(KP414213562, T5U, T5Z);
+						  T60 = FNMS(KP414213562, T5Z, T5U);
+						  T5f = FNMS(KP707106781, T4X, T4M);
+						  T4Y = FMA(KP707106781, T4X, T4M);
+						  T5e = FNMS(KP707106781, T56, T53);
+						  T57 = FMA(KP707106781, T56, T53);
+						  R0[WS(rs, 10)] = FNMS(KP1_847759065, T62, T61);
+						  R0[WS(rs, 26)] = FMA(KP1_847759065, T62, T61);
+						  R0[WS(rs, 2)] = FMA(KP1_847759065, T60, T5P);
+						  R0[WS(rs, 18)] = FNMS(KP1_847759065, T60, T5P);
+						  T4G = FNMS(KP414213562, T4F, T4A);
+						  T5c = FMA(KP414213562, T4A, T4F);
+					     }
+					     {
+						  E T5a, T59, T5h, T5i, T58, T4H;
+						  T5a = FMA(KP198912367, T4Y, T57);
+						  T58 = FNMS(KP198912367, T57, T4Y);
+						  T59 = FNMS(KP1_847759065, T4G, T4v);
+						  T4H = FMA(KP1_847759065, T4G, T4v);
+						  T5h = FMA(KP1_847759065, T5c, T5b);
+						  T5d = FNMS(KP1_847759065, T5c, T5b);
+						  T5i = FMA(KP668178637, T5e, T5f);
+						  T5g = FNMS(KP668178637, T5f, T5e);
+						  R0[WS(rs, 1)] = FMA(KP1_961570560, T58, T4H);
+						  R0[WS(rs, 17)] = FNMS(KP1_961570560, T58, T4H);
+						  R0[WS(rs, 29)] = FMA(KP1_662939224, T5i, T5h);
+						  R0[WS(rs, 13)] = FNMS(KP1_662939224, T5i, T5h);
+						  R0[WS(rs, 25)] = FMA(KP1_961570560, T5a, T59);
+						  R0[WS(rs, 9)] = FNMS(KP1_961570560, T5a, T59);
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T43, T42, T46, T4a, T49, T3V, T3G, T47, T3P, T3v, T3X, T3T, T3U, T3N, T5B;
+			      E T5E;
+			      {
+				   E T5s, T5D, T5z, T5l, T5C, T5v, T5o, T5A;
+				   R0[WS(rs, 21)] = FMA(KP1_662939224, T5g, T5d);
+				   R0[WS(rs, 5)] = FNMS(KP1_662939224, T5g, T5d);
+				   T5s = FNMS(KP707106781, T5r, T5q);
+				   T5D = FMA(KP707106781, T5r, T5q);
+				   T5z = FMA(KP1_414213562, T5k, T5j);
+				   T5l = FNMS(KP1_414213562, T5k, T5j);
+				   T5C = FMA(KP707106781, T5u, T5t);
+				   T5v = FNMS(KP707106781, T5u, T5t);
+				   T5o = FNMS(KP414213562, T5n, T5m);
+				   T5A = FMA(KP414213562, T5m, T5n);
+				   {
+					E T5y, T5x, T5F, T5G, T5w, T5p;
+					T5y = FMA(KP668178637, T5s, T5v);
+					T5w = FNMS(KP668178637, T5v, T5s);
+					T5x = FMA(KP1_847759065, T5o, T5l);
+					T5p = FNMS(KP1_847759065, T5o, T5l);
+					T5F = FMA(KP1_847759065, T5A, T5z);
+					T5B = FNMS(KP1_847759065, T5A, T5z);
+					T5G = FMA(KP198912367, T5C, T5D);
+					T5E = FNMS(KP198912367, T5D, T5C);
+					R0[WS(rs, 3)] = FMA(KP1_662939224, T5w, T5p);
+					R0[WS(rs, 19)] = FNMS(KP1_662939224, T5w, T5p);
+					R0[WS(rs, 31)] = FMA(KP1_961570560, T5G, T5F);
+					R0[WS(rs, 15)] = FNMS(KP1_961570560, T5G, T5F);
+					R0[WS(rs, 27)] = FMA(KP1_662939224, T5y, T5x);
+					R0[WS(rs, 11)] = FNMS(KP1_662939224, T5y, T5x);
+				   }
+			      }
+			      {
+				   E T3R, T3n, T3J, T3S, T3u, T3M;
+				   T3R = FMA(KP1_847759065, T3m, T3j);
+				   T3n = FNMS(KP1_847759065, T3m, T3j);
+				   R0[WS(rs, 23)] = FMA(KP1_961570560, T5E, T5B);
+				   R0[WS(rs, 7)] = FNMS(KP1_961570560, T5E, T5B);
+				   {
+					E T3q, T3t, T3y, T3F;
+					T43 = FMA(KP707106781, T3p, T3o);
+					T3q = FNMS(KP707106781, T3p, T3o);
+					T3t = FNMS(KP707106781, T3s, T3r);
+					T42 = FMA(KP707106781, T3s, T3r);
+					T46 = FMA(KP707106781, T3x, T3w);
+					T3y = FNMS(KP707106781, T3x, T3w);
+					T3F = T3B + T3E;
+					T4a = T3B - T3E;
+					T49 = FMA(KP707106781, T3I, T3H);
+					T3J = FNMS(KP707106781, T3I, T3H);
+					T3S = FMA(KP668178637, T3q, T3t);
+					T3u = FNMS(KP668178637, T3t, T3q);
+					T3V = FMA(KP923879532, T3F, T3y);
+					T3G = FNMS(KP923879532, T3F, T3y);
+					T3M = T3K - T3L;
+					T47 = T3K + T3L;
+				   }
+				   T3P = FNMS(KP1_662939224, T3u, T3n);
+				   T3v = FMA(KP1_662939224, T3u, T3n);
+				   T3X = FMA(KP1_662939224, T3S, T3R);
+				   T3T = FNMS(KP1_662939224, T3S, T3R);
+				   T3U = FNMS(KP923879532, T3M, T3J);
+				   T3N = FMA(KP923879532, T3M, T3J);
+			      }
+			      {
+				   E T2X, T2W, T30, T34, T33, T2P, T2u, T31, T2J, T1P, T2R, T2N, T2O, T2H;
+				   {
+					E T2L, T1n, T2D, T2M, T1O, T2G;
+					T2L = FNMS(KP1_847759065, T1m, T1b);
+					T1n = FMA(KP1_847759065, T1m, T1b);
+					{
+					     E T3W, T3Y, T3Q, T3O;
+					     T3W = FNMS(KP534511135, T3V, T3U);
+					     T3Y = FMA(KP534511135, T3U, T3V);
+					     T3Q = FMA(KP303346683, T3G, T3N);
+					     T3O = FNMS(KP303346683, T3N, T3G);
+					     R1[WS(rs, 21)] = FMA(KP1_763842528, T3W, T3T);
+					     R1[WS(rs, 5)] = FNMS(KP1_763842528, T3W, T3T);
+					     R1[WS(rs, 29)] = FMA(KP1_763842528, T3Y, T3X);
+					     R1[WS(rs, 13)] = FNMS(KP1_763842528, T3Y, T3X);
+					     R1[WS(rs, 25)] = FMA(KP1_913880671, T3Q, T3P);
+					     R1[WS(rs, 9)] = FNMS(KP1_913880671, T3Q, T3P);
+					     R1[WS(rs, 1)] = FMA(KP1_913880671, T3O, T3v);
+					     R1[WS(rs, 17)] = FNMS(KP1_913880671, T3O, T3v);
+					}
+					{
+					     E T1E, T1N, T26, T2t;
+					     T2X = FNMS(KP707106781, T1D, T1s);
+					     T1E = FMA(KP707106781, T1D, T1s);
+					     T1N = FMA(KP707106781, T1M, T1J);
+					     T2W = FNMS(KP707106781, T1M, T1J);
+					     T30 = FNMS(KP707106781, T25, T1U);
+					     T26 = FMA(KP707106781, T25, T1U);
+					     T2t = T2h + T2s;
+					     T34 = T2s - T2h;
+					     T33 = FNMS(KP707106781, T2C, T2z);
+					     T2D = FMA(KP707106781, T2C, T2z);
+					     T2M = FMA(KP198912367, T1E, T1N);
+					     T1O = FNMS(KP198912367, T1N, T1E);
+					     T2P = FNMS(KP923879532, T2t, T26);
+					     T2u = FMA(KP923879532, T2t, T26);
+					     T2G = T2E + T2F;
+					     T31 = T2E - T2F;
+					}
+					T2J = FNMS(KP1_961570560, T1O, T1n);
+					T1P = FMA(KP1_961570560, T1O, T1n);
+					T2R = FMA(KP1_961570560, T2M, T2L);
+					T2N = FNMS(KP1_961570560, T2M, T2L);
+					T2O = FNMS(KP923879532, T2G, T2D);
+					T2H = FMA(KP923879532, T2G, T2D);
+				   }
+				   {
+					E T4j, T48, T4d, T45, T4l, T4h, T4i, T4b;
+					{
+					     E T4f, T41, T4g, T44;
+					     T4f = FMA(KP1_847759065, T40, T3Z);
+					     T41 = FNMS(KP1_847759065, T40, T3Z);
+					     {
+						  E T2Q, T2S, T2K, T2I;
+						  T2Q = FNMS(KP820678790, T2P, T2O);
+						  T2S = FMA(KP820678790, T2O, T2P);
+						  T2K = FMA(KP098491403, T2u, T2H);
+						  T2I = FNMS(KP098491403, T2H, T2u);
+						  R1[WS(rs, 20)] = FMA(KP1_546020906, T2Q, T2N);
+						  R1[WS(rs, 4)] = FNMS(KP1_546020906, T2Q, T2N);
+						  R1[WS(rs, 28)] = FMA(KP1_546020906, T2S, T2R);
+						  R1[WS(rs, 12)] = FNMS(KP1_546020906, T2S, T2R);
+						  R1[WS(rs, 24)] = FMA(KP1_990369453, T2K, T2J);
+						  R1[WS(rs, 8)] = FNMS(KP1_990369453, T2K, T2J);
+						  R1[0] = FMA(KP1_990369453, T2I, T1P);
+						  R1[WS(rs, 16)] = FNMS(KP1_990369453, T2I, T1P);
+					     }
+					     T4g = FMA(KP198912367, T42, T43);
+					     T44 = FNMS(KP198912367, T43, T42);
+					     T4j = FMA(KP923879532, T47, T46);
+					     T48 = FNMS(KP923879532, T47, T46);
+					     T4d = FMA(KP1_961570560, T44, T41);
+					     T45 = FNMS(KP1_961570560, T44, T41);
+					     T4l = FMA(KP1_961570560, T4g, T4f);
+					     T4h = FNMS(KP1_961570560, T4g, T4f);
+					     T4i = FMA(KP923879532, T4a, T49);
+					     T4b = FNMS(KP923879532, T4a, T49);
+					}
+					{
+					     E T39, T2V, T3a, T2Y;
+					     T39 = FMA(KP1_847759065, T2U, T2T);
+					     T2V = FNMS(KP1_847759065, T2U, T2T);
+					     {
+						  E T4k, T4m, T4e, T4c;
+						  T4k = FNMS(KP098491403, T4j, T4i);
+						  T4m = FMA(KP098491403, T4i, T4j);
+						  T4e = FMA(KP820678790, T48, T4b);
+						  T4c = FNMS(KP820678790, T4b, T48);
+						  R1[WS(rs, 23)] = FMA(KP1_990369453, T4k, T4h);
+						  R1[WS(rs, 7)] = FNMS(KP1_990369453, T4k, T4h);
+						  R1[WS(rs, 31)] = FMA(KP1_990369453, T4m, T4l);
+						  R1[WS(rs, 15)] = FNMS(KP1_990369453, T4m, T4l);
+						  R1[WS(rs, 27)] = FMA(KP1_546020906, T4e, T4d);
+						  R1[WS(rs, 11)] = FNMS(KP1_546020906, T4e, T4d);
+						  R1[WS(rs, 3)] = FMA(KP1_546020906, T4c, T45);
+						  R1[WS(rs, 19)] = FNMS(KP1_546020906, T4c, T45);
+					     }
+					     T3a = FMA(KP668178637, T2W, T2X);
+					     T2Y = FNMS(KP668178637, T2X, T2W);
+					     T3d = FMA(KP923879532, T31, T30);
+					     T32 = FNMS(KP923879532, T31, T30);
+					     T37 = FMA(KP1_662939224, T2Y, T2V);
+					     T2Z = FNMS(KP1_662939224, T2Y, T2V);
+					     T3f = FMA(KP1_662939224, T3a, T39);
+					     T3b = FNMS(KP1_662939224, T3a, T39);
+					     T3c = FMA(KP923879532, T34, T33);
+					     T35 = FNMS(KP923879532, T34, T33);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T3g, T3e, T36, T38;
+		    T3g = FMA(KP303346683, T3c, T3d);
+		    T3e = FNMS(KP303346683, T3d, T3c);
+		    T36 = FNMS(KP534511135, T35, T32);
+		    T38 = FMA(KP534511135, T32, T35);
+		    R1[WS(rs, 22)] = FMA(KP1_913880671, T3e, T3b);
+		    R1[WS(rs, 6)] = FNMS(KP1_913880671, T3e, T3b);
+		    R1[WS(rs, 30)] = FMA(KP1_913880671, T3g, T3f);
+		    R1[WS(rs, 14)] = FNMS(KP1_913880671, T3g, T3f);
+		    R1[WS(rs, 26)] = FMA(KP1_763842528, T38, T37);
+		    R1[WS(rs, 10)] = FNMS(KP1_763842528, T38, T37);
+		    R1[WS(rs, 2)] = FMA(KP1_763842528, T36, T2Z);
+		    R1[WS(rs, 18)] = FNMS(KP1_763842528, T36, T2Z);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cb_64", {178, 0, 216, 0}, &GENUS };
+
+void X(codelet_r2cb_64) (planner *p) {
+     X(kr2c_register) (p, r2cb_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -name r2cb_64 -include r2cb.h */
+
+/*
+ * This function contains 394 FP additions, 134 FP multiplications,
+ * (or, 342 additions, 82 multiplications, 52 fused multiply/add),
+ * 110 stack variables, 19 constants, and 128 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_268786568, +1.268786568327290996430343226450986741351374190);
+     DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
+     DK(KP196034280, +0.196034280659121203988391127777283691722273346);
+     DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
+     DK(KP942793473, +0.942793473651995297112775251810508755314920638);
+     DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
+     DK(KP580569354, +0.580569354508924735272384751634790549382952557);
+     DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E Ta, T2S, T18, T2u, T3F, T4V, T5l, T61, Th, T2T, T1h, T2v, T3M, T4W, T5o;
+	       E T62, T3Q, T5q, T5u, T44, Tp, Tw, T2V, T2W, T2X, T2Y, T3X, T5t, T1r, T2x;
+	       E T41, T5r, T1A, T2y, T4a, T5y, T5N, T4H, TN, T31, T4E, T5z, T39, T3q, T1L;
+	       E T2B, T4h, T5M, T2h, T2F, T12, T36, T5D, T5J, T5G, T5K, T1U, T26, T23, T27;
+	       E T4p, T4z, T4w, T4A, T34, T3r;
+	       {
+		    E T5, T3A, T3, T3y, T9, T3C, T17, T3D, T6, T14;
+		    {
+			 E T4, T3z, T1, T2;
+			 T4 = Cr[WS(csr, 16)];
+			 T5 = KP2_000000000 * T4;
+			 T3z = Ci[WS(csi, 16)];
+			 T3A = KP2_000000000 * T3z;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 32)];
+			 T3 = T1 + T2;
+			 T3y = T1 - T2;
+			 {
+			      E T7, T8, T15, T16;
+			      T7 = Cr[WS(csr, 8)];
+			      T8 = Cr[WS(csr, 24)];
+			      T9 = KP2_000000000 * (T7 + T8);
+			      T3C = T7 - T8;
+			      T15 = Ci[WS(csi, 8)];
+			      T16 = Ci[WS(csi, 24)];
+			      T17 = KP2_000000000 * (T15 - T16);
+			      T3D = T15 + T16;
+			 }
+		    }
+		    T6 = T3 + T5;
+		    Ta = T6 + T9;
+		    T2S = T6 - T9;
+		    T14 = T3 - T5;
+		    T18 = T14 - T17;
+		    T2u = T14 + T17;
+		    {
+			 E T3B, T3E, T5j, T5k;
+			 T3B = T3y - T3A;
+			 T3E = KP1_414213562 * (T3C - T3D);
+			 T3F = T3B + T3E;
+			 T4V = T3B - T3E;
+			 T5j = T3y + T3A;
+			 T5k = KP1_414213562 * (T3C + T3D);
+			 T5l = T5j - T5k;
+			 T61 = T5j + T5k;
+		    }
+	       }
+	       {
+		    E Td, T3G, T1c, T3K, Tg, T3J, T1f, T3H, T19, T1g;
+		    {
+			 E Tb, Tc, T1a, T1b;
+			 Tb = Cr[WS(csr, 4)];
+			 Tc = Cr[WS(csr, 28)];
+			 Td = Tb + Tc;
+			 T3G = Tb - Tc;
+			 T1a = Ci[WS(csi, 4)];
+			 T1b = Ci[WS(csi, 28)];
+			 T1c = T1a - T1b;
+			 T3K = T1a + T1b;
+		    }
+		    {
+			 E Te, Tf, T1d, T1e;
+			 Te = Cr[WS(csr, 20)];
+			 Tf = Cr[WS(csr, 12)];
+			 Tg = Te + Tf;
+			 T3J = Te - Tf;
+			 T1d = Ci[WS(csi, 20)];
+			 T1e = Ci[WS(csi, 12)];
+			 T1f = T1d - T1e;
+			 T3H = T1d + T1e;
+		    }
+		    Th = KP2_000000000 * (Td + Tg);
+		    T2T = KP2_000000000 * (T1f + T1c);
+		    T19 = Td - Tg;
+		    T1g = T1c - T1f;
+		    T1h = KP1_414213562 * (T19 - T1g);
+		    T2v = KP1_414213562 * (T19 + T1g);
+		    {
+			 E T3I, T3L, T5m, T5n;
+			 T3I = T3G - T3H;
+			 T3L = T3J + T3K;
+			 T3M = FNMS(KP765366864, T3L, KP1_847759065 * T3I);
+			 T4W = FMA(KP765366864, T3I, KP1_847759065 * T3L);
+			 T5m = T3G + T3H;
+			 T5n = T3K - T3J;
+			 T5o = FNMS(KP1_847759065, T5n, KP765366864 * T5m);
+			 T62 = FMA(KP1_847759065, T5m, KP765366864 * T5n);
+		    }
+	       }
+	       {
+		    E Tl, T3O, T1v, T43, To, T42, T1y, T3P, Ts, T3R, T1p, T3S, Tv, T3U, T1m;
+		    E T3V;
+		    {
+			 E Tj, Tk, T1t, T1u;
+			 Tj = Cr[WS(csr, 2)];
+			 Tk = Cr[WS(csr, 30)];
+			 Tl = Tj + Tk;
+			 T3O = Tj - Tk;
+			 T1t = Ci[WS(csi, 2)];
+			 T1u = Ci[WS(csi, 30)];
+			 T1v = T1t - T1u;
+			 T43 = T1t + T1u;
+		    }
+		    {
+			 E Tm, Tn, T1w, T1x;
+			 Tm = Cr[WS(csr, 18)];
+			 Tn = Cr[WS(csr, 14)];
+			 To = Tm + Tn;
+			 T42 = Tm - Tn;
+			 T1w = Ci[WS(csi, 18)];
+			 T1x = Ci[WS(csi, 14)];
+			 T1y = T1w - T1x;
+			 T3P = T1w + T1x;
+		    }
+		    {
+			 E Tq, Tr, T1n, T1o;
+			 Tq = Cr[WS(csr, 10)];
+			 Tr = Cr[WS(csr, 22)];
+			 Ts = Tq + Tr;
+			 T3R = Tq - Tr;
+			 T1n = Ci[WS(csi, 10)];
+			 T1o = Ci[WS(csi, 22)];
+			 T1p = T1n - T1o;
+			 T3S = T1n + T1o;
+		    }
+		    {
+			 E Tt, Tu, T1k, T1l;
+			 Tt = Cr[WS(csr, 6)];
+			 Tu = Cr[WS(csr, 26)];
+			 Tv = Tt + Tu;
+			 T3U = Tt - Tu;
+			 T1k = Ci[WS(csi, 26)];
+			 T1l = Ci[WS(csi, 6)];
+			 T1m = T1k - T1l;
+			 T3V = T1l + T1k;
+		    }
+		    T3Q = T3O - T3P;
+		    T5q = T3O + T3P;
+		    T5u = T43 - T42;
+		    T44 = T42 + T43;
+		    Tp = Tl + To;
+		    Tw = Ts + Tv;
+		    T2V = Tp - Tw;
+		    {
+			 E T3T, T3W, T1j, T1q;
+			 T2W = T1y + T1v;
+			 T2X = T1p + T1m;
+			 T2Y = T2W - T2X;
+			 T3T = T3R - T3S;
+			 T3W = T3U - T3V;
+			 T3X = KP707106781 * (T3T + T3W);
+			 T5t = KP707106781 * (T3T - T3W);
+			 T1j = Tl - To;
+			 T1q = T1m - T1p;
+			 T1r = T1j + T1q;
+			 T2x = T1j - T1q;
+			 {
+			      E T3Z, T40, T1s, T1z;
+			      T3Z = T3R + T3S;
+			      T40 = T3U + T3V;
+			      T41 = KP707106781 * (T3Z - T40);
+			      T5r = KP707106781 * (T3Z + T40);
+			      T1s = Ts - Tv;
+			      T1z = T1v - T1y;
+			      T1A = T1s + T1z;
+			      T2y = T1z - T1s;
+			 }
+		    }
+	       }
+	       {
+		    E TB, T48, T2c, T4G, TE, T4F, T2f, T49, TI, T4b, T1J, T4c, TL, T4e, T1G;
+		    E T4f;
+		    {
+			 E Tz, TA, T2a, T2b;
+			 Tz = Cr[WS(csr, 1)];
+			 TA = Cr[WS(csr, 31)];
+			 TB = Tz + TA;
+			 T48 = Tz - TA;
+			 T2a = Ci[WS(csi, 1)];
+			 T2b = Ci[WS(csi, 31)];
+			 T2c = T2a - T2b;
+			 T4G = T2a + T2b;
+		    }
+		    {
+			 E TC, TD, T2d, T2e;
+			 TC = Cr[WS(csr, 17)];
+			 TD = Cr[WS(csr, 15)];
+			 TE = TC + TD;
+			 T4F = TC - TD;
+			 T2d = Ci[WS(csi, 17)];
+			 T2e = Ci[WS(csi, 15)];
+			 T2f = T2d - T2e;
+			 T49 = T2d + T2e;
+		    }
+		    {
+			 E TG, TH, T1H, T1I;
+			 TG = Cr[WS(csr, 9)];
+			 TH = Cr[WS(csr, 23)];
+			 TI = TG + TH;
+			 T4b = TG - TH;
+			 T1H = Ci[WS(csi, 9)];
+			 T1I = Ci[WS(csi, 23)];
+			 T1J = T1H - T1I;
+			 T4c = T1H + T1I;
+		    }
+		    {
+			 E TJ, TK, T1E, T1F;
+			 TJ = Cr[WS(csr, 7)];
+			 TK = Cr[WS(csr, 25)];
+			 TL = TJ + TK;
+			 T4e = TJ - TK;
+			 T1E = Ci[WS(csi, 25)];
+			 T1F = Ci[WS(csi, 7)];
+			 T1G = T1E - T1F;
+			 T4f = T1F + T1E;
+		    }
+		    {
+			 E TF, TM, T1D, T1K;
+			 T4a = T48 - T49;
+			 T5y = T48 + T49;
+			 T5N = T4G - T4F;
+			 T4H = T4F + T4G;
+			 TF = TB + TE;
+			 TM = TI + TL;
+			 TN = TF + TM;
+			 T31 = TF - TM;
+			 {
+			      E T4C, T4D, T37, T38;
+			      T4C = T4b + T4c;
+			      T4D = T4e + T4f;
+			      T4E = KP707106781 * (T4C - T4D);
+			      T5z = KP707106781 * (T4C + T4D);
+			      T37 = T2f + T2c;
+			      T38 = T1J + T1G;
+			      T39 = T37 - T38;
+			      T3q = T38 + T37;
+			 }
+			 T1D = TB - TE;
+			 T1K = T1G - T1J;
+			 T1L = T1D + T1K;
+			 T2B = T1D - T1K;
+			 {
+			      E T4d, T4g, T29, T2g;
+			      T4d = T4b - T4c;
+			      T4g = T4e - T4f;
+			      T4h = KP707106781 * (T4d + T4g);
+			      T5M = KP707106781 * (T4d - T4g);
+			      T29 = TI - TL;
+			      T2g = T2c - T2f;
+			      T2h = T29 + T2g;
+			      T2F = T2g - T29;
+			 }
+		    }
+	       }
+	       {
+		    E TQ, T4j, T1P, T4n, TT, T4m, T1S, T4k, TX, T4q, T1Y, T4u, T10, T4t, T21;
+		    E T4r;
+		    {
+			 E TO, TP, T1N, T1O;
+			 TO = Cr[WS(csr, 5)];
+			 TP = Cr[WS(csr, 27)];
+			 TQ = TO + TP;
+			 T4j = TO - TP;
+			 T1N = Ci[WS(csi, 5)];
+			 T1O = Ci[WS(csi, 27)];
+			 T1P = T1N - T1O;
+			 T4n = T1N + T1O;
+		    }
+		    {
+			 E TR, TS, T1Q, T1R;
+			 TR = Cr[WS(csr, 21)];
+			 TS = Cr[WS(csr, 11)];
+			 TT = TR + TS;
+			 T4m = TR - TS;
+			 T1Q = Ci[WS(csi, 21)];
+			 T1R = Ci[WS(csi, 11)];
+			 T1S = T1Q - T1R;
+			 T4k = T1Q + T1R;
+		    }
+		    {
+			 E TV, TW, T1W, T1X;
+			 TV = Cr[WS(csr, 3)];
+			 TW = Cr[WS(csr, 29)];
+			 TX = TV + TW;
+			 T4q = TV - TW;
+			 T1W = Ci[WS(csi, 29)];
+			 T1X = Ci[WS(csi, 3)];
+			 T1Y = T1W - T1X;
+			 T4u = T1X + T1W;
+		    }
+		    {
+			 E TY, TZ, T1Z, T20;
+			 TY = Cr[WS(csr, 13)];
+			 TZ = Cr[WS(csr, 19)];
+			 T10 = TY + TZ;
+			 T4t = TY - TZ;
+			 T1Z = Ci[WS(csi, 13)];
+			 T20 = Ci[WS(csi, 19)];
+			 T21 = T1Z - T20;
+			 T4r = T1Z + T20;
+		    }
+		    {
+			 E TU, T11, T5B, T5C;
+			 TU = TQ + TT;
+			 T11 = TX + T10;
+			 T12 = TU + T11;
+			 T36 = TU - T11;
+			 T5B = T4j + T4k;
+			 T5C = T4n - T4m;
+			 T5D = FNMS(KP923879532, T5C, KP382683432 * T5B);
+			 T5J = FMA(KP923879532, T5B, KP382683432 * T5C);
+		    }
+		    {
+			 E T5E, T5F, T1M, T1T;
+			 T5E = T4q + T4r;
+			 T5F = T4t + T4u;
+			 T5G = FNMS(KP923879532, T5F, KP382683432 * T5E);
+			 T5K = FMA(KP923879532, T5E, KP382683432 * T5F);
+			 T1M = TQ - TT;
+			 T1T = T1P - T1S;
+			 T1U = T1M - T1T;
+			 T26 = T1M + T1T;
+		    }
+		    {
+			 E T1V, T22, T4l, T4o;
+			 T1V = TX - T10;
+			 T22 = T1Y - T21;
+			 T23 = T1V + T22;
+			 T27 = T22 - T1V;
+			 T4l = T4j - T4k;
+			 T4o = T4m + T4n;
+			 T4p = FNMS(KP382683432, T4o, KP923879532 * T4l);
+			 T4z = FMA(KP382683432, T4l, KP923879532 * T4o);
+		    }
+		    {
+			 E T4s, T4v, T32, T33;
+			 T4s = T4q - T4r;
+			 T4v = T4t - T4u;
+			 T4w = FMA(KP923879532, T4s, KP382683432 * T4v);
+			 T4A = FNMS(KP382683432, T4s, KP923879532 * T4v);
+			 T32 = T21 + T1Y;
+			 T33 = T1S + T1P;
+			 T34 = T32 - T33;
+			 T3r = T33 + T32;
+		    }
+	       }
+	       {
+		    E T13, T3x, Ty, T3w, Ti, Tx;
+		    T13 = KP2_000000000 * (TN + T12);
+		    T3x = KP2_000000000 * (T3r + T3q);
+		    Ti = Ta + Th;
+		    Tx = KP2_000000000 * (Tp + Tw);
+		    Ty = Ti + Tx;
+		    T3w = Ti - Tx;
+		    R0[WS(rs, 16)] = Ty - T13;
+		    R0[WS(rs, 24)] = T3w + T3x;
+		    R0[0] = Ty + T13;
+		    R0[WS(rs, 8)] = T3w - T3x;
+	       }
+	       {
+		    E T3g, T3k, T3j, T3l;
+		    {
+			 E T3e, T3f, T3h, T3i;
+			 T3e = T2S + T2T;
+			 T3f = KP1_414213562 * (T2V + T2Y);
+			 T3g = T3e - T3f;
+			 T3k = T3e + T3f;
+			 T3h = T31 - T34;
+			 T3i = T39 - T36;
+			 T3j = FNMS(KP1_847759065, T3i, KP765366864 * T3h);
+			 T3l = FMA(KP1_847759065, T3h, KP765366864 * T3i);
+		    }
+		    R0[WS(rs, 22)] = T3g - T3j;
+		    R0[WS(rs, 30)] = T3k + T3l;
+		    R0[WS(rs, 6)] = T3g + T3j;
+		    R0[WS(rs, 14)] = T3k - T3l;
+	       }
+	       {
+		    E T3o, T3u, T3t, T3v;
+		    {
+			 E T3m, T3n, T3p, T3s;
+			 T3m = Ta - Th;
+			 T3n = KP2_000000000 * (T2X + T2W);
+			 T3o = T3m - T3n;
+			 T3u = T3m + T3n;
+			 T3p = TN - T12;
+			 T3s = T3q - T3r;
+			 T3t = KP1_414213562 * (T3p - T3s);
+			 T3v = KP1_414213562 * (T3p + T3s);
+		    }
+		    R0[WS(rs, 20)] = T3o - T3t;
+		    R0[WS(rs, 28)] = T3u + T3v;
+		    R0[WS(rs, 4)] = T3o + T3t;
+		    R0[WS(rs, 12)] = T3u - T3v;
+	       }
+	       {
+		    E T30, T3c, T3b, T3d;
+		    {
+			 E T2U, T2Z, T35, T3a;
+			 T2U = T2S - T2T;
+			 T2Z = KP1_414213562 * (T2V - T2Y);
+			 T30 = T2U + T2Z;
+			 T3c = T2U - T2Z;
+			 T35 = T31 + T34;
+			 T3a = T36 + T39;
+			 T3b = FNMS(KP765366864, T3a, KP1_847759065 * T35);
+			 T3d = FMA(KP765366864, T35, KP1_847759065 * T3a);
+		    }
+		    R0[WS(rs, 18)] = T30 - T3b;
+		    R0[WS(rs, 26)] = T3c + T3d;
+		    R0[WS(rs, 2)] = T30 + T3b;
+		    R0[WS(rs, 10)] = T3c - T3d;
+	       }
+	       {
+		    E T25, T2p, T2i, T2q, T1C, T2k, T2o, T2s, T24, T28;
+		    T24 = KP707106781 * (T1U + T23);
+		    T25 = T1L + T24;
+		    T2p = T1L - T24;
+		    T28 = KP707106781 * (T26 + T27);
+		    T2i = T28 + T2h;
+		    T2q = T2h - T28;
+		    {
+			 E T1i, T1B, T2m, T2n;
+			 T1i = T18 + T1h;
+			 T1B = FNMS(KP765366864, T1A, KP1_847759065 * T1r);
+			 T1C = T1i + T1B;
+			 T2k = T1i - T1B;
+			 T2m = T18 - T1h;
+			 T2n = FMA(KP765366864, T1r, KP1_847759065 * T1A);
+			 T2o = T2m - T2n;
+			 T2s = T2m + T2n;
+		    }
+		    {
+			 E T2j, T2t, T2l, T2r;
+			 T2j = FNMS(KP390180644, T2i, KP1_961570560 * T25);
+			 R0[WS(rs, 17)] = T1C - T2j;
+			 R0[WS(rs, 1)] = T1C + T2j;
+			 T2t = FMA(KP1_662939224, T2p, KP1_111140466 * T2q);
+			 R0[WS(rs, 13)] = T2s - T2t;
+			 R0[WS(rs, 29)] = T2s + T2t;
+			 T2l = FMA(KP390180644, T25, KP1_961570560 * T2i);
+			 R0[WS(rs, 9)] = T2k - T2l;
+			 R0[WS(rs, 25)] = T2k + T2l;
+			 T2r = FNMS(KP1_662939224, T2q, KP1_111140466 * T2p);
+			 R0[WS(rs, 21)] = T2o - T2r;
+			 R0[WS(rs, 5)] = T2o + T2r;
+		    }
+	       }
+	       {
+		    E T2D, T2N, T2G, T2O, T2A, T2I, T2M, T2Q, T2C, T2E;
+		    T2C = KP707106781 * (T27 - T26);
+		    T2D = T2B + T2C;
+		    T2N = T2B - T2C;
+		    T2E = KP707106781 * (T1U - T23);
+		    T2G = T2E + T2F;
+		    T2O = T2F - T2E;
+		    {
+			 E T2w, T2z, T2K, T2L;
+			 T2w = T2u - T2v;
+			 T2z = FNMS(KP1_847759065, T2y, KP765366864 * T2x);
+			 T2A = T2w + T2z;
+			 T2I = T2w - T2z;
+			 T2K = T2u + T2v;
+			 T2L = FMA(KP1_847759065, T2x, KP765366864 * T2y);
+			 T2M = T2K - T2L;
+			 T2Q = T2K + T2L;
+		    }
+		    {
+			 E T2H, T2R, T2J, T2P;
+			 T2H = FNMS(KP1_111140466, T2G, KP1_662939224 * T2D);
+			 R0[WS(rs, 19)] = T2A - T2H;
+			 R0[WS(rs, 3)] = T2A + T2H;
+			 T2R = FMA(KP1_961570560, T2N, KP390180644 * T2O);
+			 R0[WS(rs, 15)] = T2Q - T2R;
+			 R0[WS(rs, 31)] = T2Q + T2R;
+			 T2J = FMA(KP1_111140466, T2D, KP1_662939224 * T2G);
+			 R0[WS(rs, 11)] = T2I - T2J;
+			 R0[WS(rs, 27)] = T2I + T2J;
+			 T2P = FNMS(KP1_961570560, T2O, KP390180644 * T2N);
+			 R0[WS(rs, 23)] = T2M - T2P;
+			 R0[WS(rs, 7)] = T2M + T2P;
+		    }
+	       }
+	       {
+		    E T5p, T5T, T5w, T5U, T5I, T5W, T5P, T5X, T5s, T5v;
+		    T5p = T5l + T5o;
+		    T5T = T5l - T5o;
+		    T5s = T5q - T5r;
+		    T5v = T5t + T5u;
+		    T5w = FNMS(KP1_111140466, T5v, KP1_662939224 * T5s);
+		    T5U = FMA(KP1_111140466, T5s, KP1_662939224 * T5v);
+		    {
+			 E T5A, T5H, T5L, T5O;
+			 T5A = T5y - T5z;
+			 T5H = T5D + T5G;
+			 T5I = T5A + T5H;
+			 T5W = T5A - T5H;
+			 T5L = T5J - T5K;
+			 T5O = T5M + T5N;
+			 T5P = T5L + T5O;
+			 T5X = T5O - T5L;
+		    }
+		    {
+			 E T5x, T5Q, T5Z, T60;
+			 T5x = T5p + T5w;
+			 T5Q = FNMS(KP580569354, T5P, KP1_913880671 * T5I);
+			 R1[WS(rs, 17)] = T5x - T5Q;
+			 R1[WS(rs, 1)] = T5x + T5Q;
+			 T5Z = T5T + T5U;
+			 T60 = FMA(KP1_763842528, T5W, KP942793473 * T5X);
+			 R1[WS(rs, 13)] = T5Z - T60;
+			 R1[WS(rs, 29)] = T5Z + T60;
+		    }
+		    {
+			 E T5R, T5S, T5V, T5Y;
+			 T5R = T5p - T5w;
+			 T5S = FMA(KP580569354, T5I, KP1_913880671 * T5P);
+			 R1[WS(rs, 9)] = T5R - T5S;
+			 R1[WS(rs, 25)] = T5R + T5S;
+			 T5V = T5T - T5U;
+			 T5Y = FNMS(KP1_763842528, T5X, KP942793473 * T5W);
+			 R1[WS(rs, 21)] = T5V - T5Y;
+			 R1[WS(rs, 5)] = T5V + T5Y;
+		    }
+	       }
+	       {
+		    E T3N, T4N, T46, T4O, T4y, T4Q, T4J, T4R, T3Y, T45;
+		    T3N = T3F + T3M;
+		    T4N = T3F - T3M;
+		    T3Y = T3Q + T3X;
+		    T45 = T41 + T44;
+		    T46 = FNMS(KP390180644, T45, KP1_961570560 * T3Y);
+		    T4O = FMA(KP390180644, T3Y, KP1_961570560 * T45);
+		    {
+			 E T4i, T4x, T4B, T4I;
+			 T4i = T4a + T4h;
+			 T4x = T4p + T4w;
+			 T4y = T4i + T4x;
+			 T4Q = T4i - T4x;
+			 T4B = T4z + T4A;
+			 T4I = T4E + T4H;
+			 T4J = T4B + T4I;
+			 T4R = T4I - T4B;
+		    }
+		    {
+			 E T47, T4K, T4T, T4U;
+			 T47 = T3N + T46;
+			 T4K = FNMS(KP196034280, T4J, KP1_990369453 * T4y);
+			 R1[WS(rs, 16)] = T47 - T4K;
+			 R1[0] = T47 + T4K;
+			 T4T = T4N + T4O;
+			 T4U = FMA(KP1_546020906, T4Q, KP1_268786568 * T4R);
+			 R1[WS(rs, 12)] = T4T - T4U;
+			 R1[WS(rs, 28)] = T4T + T4U;
+		    }
+		    {
+			 E T4L, T4M, T4P, T4S;
+			 T4L = T3N - T46;
+			 T4M = FMA(KP196034280, T4y, KP1_990369453 * T4J);
+			 R1[WS(rs, 8)] = T4L - T4M;
+			 R1[WS(rs, 24)] = T4L + T4M;
+			 T4P = T4N - T4O;
+			 T4S = FNMS(KP1_546020906, T4R, KP1_268786568 * T4Q);
+			 R1[WS(rs, 20)] = T4P - T4S;
+			 R1[WS(rs, 4)] = T4P + T4S;
+		    }
+	       }
+	       {
+		    E T63, T6h, T66, T6i, T6a, T6k, T6d, T6l, T64, T65;
+		    T63 = T61 - T62;
+		    T6h = T61 + T62;
+		    T64 = T5q + T5r;
+		    T65 = T5u - T5t;
+		    T66 = FNMS(KP1_961570560, T65, KP390180644 * T64);
+		    T6i = FMA(KP1_961570560, T64, KP390180644 * T65);
+		    {
+			 E T68, T69, T6b, T6c;
+			 T68 = T5y + T5z;
+			 T69 = T5J + T5K;
+			 T6a = T68 - T69;
+			 T6k = T68 + T69;
+			 T6b = T5D - T5G;
+			 T6c = T5N - T5M;
+			 T6d = T6b + T6c;
+			 T6l = T6c - T6b;
+		    }
+		    {
+			 E T67, T6e, T6n, T6o;
+			 T67 = T63 + T66;
+			 T6e = FNMS(KP1_268786568, T6d, KP1_546020906 * T6a);
+			 R1[WS(rs, 19)] = T67 - T6e;
+			 R1[WS(rs, 3)] = T67 + T6e;
+			 T6n = T6h + T6i;
+			 T6o = FMA(KP1_990369453, T6k, KP196034280 * T6l);
+			 R1[WS(rs, 15)] = T6n - T6o;
+			 R1[WS(rs, 31)] = T6n + T6o;
+		    }
+		    {
+			 E T6f, T6g, T6j, T6m;
+			 T6f = T63 - T66;
+			 T6g = FMA(KP1_268786568, T6a, KP1_546020906 * T6d);
+			 R1[WS(rs, 11)] = T6f - T6g;
+			 R1[WS(rs, 27)] = T6f + T6g;
+			 T6j = T6h - T6i;
+			 T6m = FNMS(KP1_990369453, T6l, KP196034280 * T6k);
+			 R1[WS(rs, 23)] = T6j - T6m;
+			 R1[WS(rs, 7)] = T6j + T6m;
+		    }
+	       }
+	       {
+		    E T4X, T5b, T50, T5c, T54, T5e, T57, T5f, T4Y, T4Z;
+		    T4X = T4V - T4W;
+		    T5b = T4V + T4W;
+		    T4Y = T3Q - T3X;
+		    T4Z = T44 - T41;
+		    T50 = FNMS(KP1_662939224, T4Z, KP1_111140466 * T4Y);
+		    T5c = FMA(KP1_662939224, T4Y, KP1_111140466 * T4Z);
+		    {
+			 E T52, T53, T55, T56;
+			 T52 = T4a - T4h;
+			 T53 = T4A - T4z;
+			 T54 = T52 + T53;
+			 T5e = T52 - T53;
+			 T55 = T4p - T4w;
+			 T56 = T4H - T4E;
+			 T57 = T55 + T56;
+			 T5f = T56 - T55;
+		    }
+		    {
+			 E T51, T58, T5h, T5i;
+			 T51 = T4X + T50;
+			 T58 = FNMS(KP942793473, T57, KP1_763842528 * T54);
+			 R1[WS(rs, 18)] = T51 - T58;
+			 R1[WS(rs, 2)] = T51 + T58;
+			 T5h = T5b + T5c;
+			 T5i = FMA(KP1_913880671, T5e, KP580569354 * T5f);
+			 R1[WS(rs, 14)] = T5h - T5i;
+			 R1[WS(rs, 30)] = T5h + T5i;
+		    }
+		    {
+			 E T59, T5a, T5d, T5g;
+			 T59 = T4X - T50;
+			 T5a = FMA(KP942793473, T54, KP1_763842528 * T57);
+			 R1[WS(rs, 10)] = T59 - T5a;
+			 R1[WS(rs, 26)] = T59 + T5a;
+			 T5d = T5b - T5c;
+			 T5g = FNMS(KP1_913880671, T5f, KP580569354 * T5e);
+			 R1[WS(rs, 22)] = T5d - T5g;
+			 R1[WS(rs, 6)] = T5d + T5g;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cb_64", {342, 82, 52, 0}, &GENUS };
+
+void X(codelet_r2cb_64) (planner *p) {
+     X(kr2c_register) (p, r2cb_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -name r2cb_7 -include r2cb.h */
+
+/*
+ * This function contains 24 FP additions, 22 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 22 fused multiply/add),
+ * 31 stack variables, 7 constants, and 14 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E Tn, Td, Tg, Ti, Tl, T8;
+	       {
+		    E T1, T9, Tb, Ta, T2, T4, Th, Tm, Tc, T3, Te;
+		    T1 = Cr[0];
+		    T9 = Ci[WS(csi, 2)];
+		    Tb = Ci[WS(csi, 3)];
+		    Ta = Ci[WS(csi, 1)];
+		    T2 = Cr[WS(csr, 1)];
+		    T4 = Cr[WS(csr, 3)];
+		    Th = FMA(KP554958132, T9, Tb);
+		    Tm = FMS(KP554958132, Ta, T9);
+		    Tc = FMA(KP554958132, Tb, Ta);
+		    T3 = Cr[WS(csr, 2)];
+		    Te = FNMS(KP356895867, T2, T4);
+		    Tn = FMA(KP801937735, Tm, Tb);
+		    {
+			 E Tf, Tk, T7, T5, Tj, T6;
+			 Td = FMA(KP801937735, Tc, T9);
+			 T5 = T2 + T3 + T4;
+			 Tj = FNMS(KP356895867, T4, T3);
+			 T6 = FNMS(KP356895867, T3, T2);
+			 Tf = FNMS(KP692021471, Te, T3);
+			 R0[0] = FMA(KP2_000000000, T5, T1);
+			 Tk = FNMS(KP692021471, Tj, T2);
+			 T7 = FNMS(KP692021471, T6, T4);
+			 Tg = FNMS(KP1_801937735, Tf, T1);
+			 Ti = FNMS(KP801937735, Th, Ta);
+			 Tl = FNMS(KP1_801937735, Tk, T1);
+			 T8 = FNMS(KP1_801937735, T7, T1);
+		    }
+	       }
+	       R1[WS(rs, 2)] = FMA(KP1_949855824, Ti, Tg);
+	       R0[WS(rs, 1)] = FNMS(KP1_949855824, Ti, Tg);
+	       R0[WS(rs, 2)] = FMA(KP1_949855824, Tn, Tl);
+	       R1[WS(rs, 1)] = FNMS(KP1_949855824, Tn, Tl);
+	       R0[WS(rs, 3)] = FMA(KP1_949855824, Td, T8);
+	       R1[0] = FNMS(KP1_949855824, Td, T8);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cb_7", {2, 0, 22, 0}, &GENUS };
+
+void X(codelet_r2cb_7) (planner *p) {
+     X(kr2c_register) (p, r2cb_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -name r2cb_7 -include r2cb.h */
+
+/*
+ * This function contains 24 FP additions, 19 FP multiplications,
+ * (or, 11 additions, 6 multiplications, 13 fused multiply/add),
+ * 21 stack variables, 7 constants, and 14 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
+     DK(KP445041867, +0.445041867912628808577805128993589518932711138);
+     DK(KP1_246979603, +1.246979603717467061050009768008479621264549462);
+     DK(KP867767478, +0.867767478235116240951536665696717509219981456);
+     DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
+     DK(KP1_563662964, +1.563662964936059617416889053348115500464669037);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E T9, Td, Tb, T1, T4, T2, T3, T5, Tc, Ta, T6, T8, T7;
+	       T6 = Ci[WS(csi, 2)];
+	       T8 = Ci[WS(csi, 1)];
+	       T7 = Ci[WS(csi, 3)];
+	       T9 = FNMS(KP1_949855824, T7, KP1_563662964 * T6) - (KP867767478 * T8);
+	       Td = FMA(KP867767478, T6, KP1_563662964 * T7) - (KP1_949855824 * T8);
+	       Tb = FMA(KP1_563662964, T8, KP1_949855824 * T6) + (KP867767478 * T7);
+	       T1 = Cr[0];
+	       T4 = Cr[WS(csr, 3)];
+	       T2 = Cr[WS(csr, 1)];
+	       T3 = Cr[WS(csr, 2)];
+	       T5 = FMA(KP1_246979603, T3, T1) + FNMA(KP445041867, T4, KP1_801937735 * T2);
+	       Tc = FMA(KP1_246979603, T4, T1) + FNMA(KP1_801937735, T3, KP445041867 * T2);
+	       Ta = FMA(KP1_246979603, T2, T1) + FNMA(KP1_801937735, T4, KP445041867 * T3);
+	       R0[WS(rs, 2)] = T5 - T9;
+	       R1[WS(rs, 1)] = T5 + T9;
+	       R0[WS(rs, 1)] = Tc + Td;
+	       R1[WS(rs, 2)] = Tc - Td;
+	       R0[WS(rs, 3)] = Ta + Tb;
+	       R1[0] = Ta - Tb;
+	       R0[0] = FMA(KP2_000000000, T2 + T3 + T4, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cb_7", {11, 6, 13, 0}, &GENUS };
+
+void X(codelet_r2cb_7) (planner *p) {
+     X(kr2c_register) (p, r2cb_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -name r2cb_8 -include r2cb.h */
+
+/*
+ * This function contains 20 FP additions, 12 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 12 fused multiply/add),
+ * 19 stack variables, 2 constants, and 16 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E Th, Tb, Tg, Ti;
+	       {
+		    E T4, Ta, Td, T9, T3, Tc, T8, Te;
+		    T4 = Cr[WS(csr, 2)];
+		    Ta = Ci[WS(csi, 2)];
+		    {
+			 E T1, T2, T6, T7;
+			 T1 = Cr[0];
+			 T2 = Cr[WS(csr, 4)];
+			 T6 = Cr[WS(csr, 1)];
+			 T7 = Cr[WS(csr, 3)];
+			 Td = Ci[WS(csi, 1)];
+			 T9 = T1 - T2;
+			 T3 = T1 + T2;
+			 Tc = T6 - T7;
+			 T8 = T6 + T7;
+			 Te = Ci[WS(csi, 3)];
+		    }
+		    {
+			 E Tj, T5, Tk, Tf;
+			 Tj = FNMS(KP2_000000000, T4, T3);
+			 T5 = FMA(KP2_000000000, T4, T3);
+			 Th = FMA(KP2_000000000, Ta, T9);
+			 Tb = FNMS(KP2_000000000, Ta, T9);
+			 Tk = Td - Te;
+			 Tf = Td + Te;
+			 R0[0] = FMA(KP2_000000000, T8, T5);
+			 R0[WS(rs, 2)] = FNMS(KP2_000000000, T8, T5);
+			 R0[WS(rs, 3)] = FMA(KP2_000000000, Tk, Tj);
+			 R0[WS(rs, 1)] = FNMS(KP2_000000000, Tk, Tj);
+			 Tg = Tc - Tf;
+			 Ti = Tc + Tf;
+		    }
+	       }
+	       R1[0] = FMA(KP1_414213562, Tg, Tb);
+	       R1[WS(rs, 2)] = FNMS(KP1_414213562, Tg, Tb);
+	       R1[WS(rs, 3)] = FMA(KP1_414213562, Ti, Th);
+	       R1[WS(rs, 1)] = FNMS(KP1_414213562, Ti, Th);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cb_8", {8, 0, 12, 0}, &GENUS };
+
+void X(codelet_r2cb_8) (planner *p) {
+     X(kr2c_register) (p, r2cb_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -name r2cb_8 -include r2cb.h */
+
+/*
+ * This function contains 20 FP additions, 6 FP multiplications,
+ * (or, 20 additions, 6 multiplications, 0 fused multiply/add),
+ * 21 stack variables, 2 constants, and 16 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T5, Tg, T3, Te, T9, Ti, Td, Tj, T6, Ta;
+	       {
+		    E T4, Tf, T1, T2;
+		    T4 = Cr[WS(csr, 2)];
+		    T5 = KP2_000000000 * T4;
+		    Tf = Ci[WS(csi, 2)];
+		    Tg = KP2_000000000 * Tf;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 4)];
+		    T3 = T1 + T2;
+		    Te = T1 - T2;
+		    {
+			 E T7, T8, Tb, Tc;
+			 T7 = Cr[WS(csr, 1)];
+			 T8 = Cr[WS(csr, 3)];
+			 T9 = KP2_000000000 * (T7 + T8);
+			 Ti = T7 - T8;
+			 Tb = Ci[WS(csi, 1)];
+			 Tc = Ci[WS(csi, 3)];
+			 Td = KP2_000000000 * (Tb - Tc);
+			 Tj = Tb + Tc;
+		    }
+	       }
+	       T6 = T3 + T5;
+	       R0[WS(rs, 2)] = T6 - T9;
+	       R0[0] = T6 + T9;
+	       Ta = T3 - T5;
+	       R0[WS(rs, 1)] = Ta - Td;
+	       R0[WS(rs, 3)] = Ta + Td;
+	       {
+		    E Th, Tk, Tl, Tm;
+		    Th = Te - Tg;
+		    Tk = KP1_414213562 * (Ti - Tj);
+		    R1[WS(rs, 2)] = Th - Tk;
+		    R1[0] = Th + Tk;
+		    Tl = Te + Tg;
+		    Tm = KP1_414213562 * (Ti + Tj);
+		    R1[WS(rs, 1)] = Tl - Tm;
+		    R1[WS(rs, 3)] = Tl + Tm;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cb_8", {20, 6, 0, 0}, &GENUS };
+
+void X(codelet_r2cb_8) (planner *p) {
+     X(kr2c_register) (p, r2cb_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cb/r2cb_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:41:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cb_9 -include r2cb.h */
+
+/*
+ * This function contains 32 FP additions, 24 FP multiplications,
+ * (or, 8 additions, 0 multiplications, 24 fused multiply/add),
+ * 40 stack variables, 12 constants, and 18 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
+     DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP1_532088886, +1.532088886237956070404785301110833347871664914);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP1_969615506, +1.969615506024416118733486049179046027341286503);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T4, Th, T3, Tb, Tp, Tk, T7, Tf, Ti, Ta, T1, T2;
+	       Ta = Ci[WS(csi, 3)];
+	       T1 = Cr[0];
+	       T2 = Cr[WS(csr, 3)];
+	       T4 = Cr[WS(csr, 1)];
+	       Th = Ci[WS(csi, 1)];
+	       {
+		    E T5, T9, T6, Td, Te;
+		    T5 = Cr[WS(csr, 4)];
+		    T9 = T1 - T2;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    T6 = Cr[WS(csr, 2)];
+		    Td = Ci[WS(csi, 4)];
+		    Te = Ci[WS(csi, 2)];
+		    Tb = FNMS(KP1_732050807, Ta, T9);
+		    Tp = FMA(KP1_732050807, Ta, T9);
+		    Tk = T6 - T5;
+		    T7 = T5 + T6;
+		    Tf = Td + Te;
+		    Ti = Td - Te;
+	       }
+	       {
+		    E Tu, To, Tt, Tn, Tc, T8;
+		    Tc = FNMS(KP500000000, T7, T4);
+		    T8 = T4 + T7;
+		    {
+			 E Tw, Tj, Tr, Tg, Tv;
+			 Tw = Ti + Th;
+			 Tj = FNMS(KP500000000, Ti, Th);
+			 Tr = FMA(KP866025403, Tf, Tc);
+			 Tg = FNMS(KP866025403, Tf, Tc);
+			 Tv = T3 - T8;
+			 R0[0] = FMA(KP2_000000000, T8, T3);
+			 {
+			      E Tq, Tl, Ts, Tm;
+			      Tq = FMA(KP866025403, Tk, Tj);
+			      Tl = FNMS(KP866025403, Tk, Tj);
+			      R0[WS(rs, 3)] = FMA(KP1_732050807, Tw, Tv);
+			      R1[WS(rs, 1)] = FNMS(KP1_732050807, Tw, Tv);
+			      Ts = FNMS(KP176326980, Tr, Tq);
+			      Tu = FMA(KP176326980, Tq, Tr);
+			      Tm = FNMS(KP839099631, Tl, Tg);
+			      To = FMA(KP839099631, Tg, Tl);
+			      R0[WS(rs, 1)] = FNMS(KP1_969615506, Ts, Tp);
+			      Tt = FMA(KP984807753, Ts, Tp);
+			      R1[0] = FMA(KP1_532088886, Tm, Tb);
+			      Tn = FNMS(KP766044443, Tm, Tb);
+			 }
+		    }
+		    R1[WS(rs, 2)] = FNMS(KP1_705737063, Tu, Tt);
+		    R0[WS(rs, 4)] = FMA(KP1_705737063, Tu, Tt);
+		    R0[WS(rs, 2)] = FNMS(KP1_326827896, To, Tn);
+		    R1[WS(rs, 3)] = FMA(KP1_326827896, To, Tn);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cb_9", {8, 0, 24, 0}, &GENUS };
+
+void X(codelet_r2cb_9) (planner *p) {
+     X(kr2c_register) (p, r2cb_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cb_9 -include r2cb.h */
+
+/*
+ * This function contains 32 FP additions, 18 FP multiplications,
+ * (or, 22 additions, 8 multiplications, 10 fused multiply/add),
+ * 35 stack variables, 12 constants, and 18 memory accesses
+ */
+#include "r2cb.h"
+
+static void r2cb_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP300767466, +0.300767466360870593278543795225003852144476517);
+     DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
+     DK(KP1_113340798, +1.113340798452838732905825904094046265936583811);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T3, Tq, Tc, Tk, Tj, T8, Tm, Ts, Th, Tr, Tw, Tx;
+	       {
+		    E Tb, T1, T2, T9, Ta;
+		    Ta = Ci[WS(csi, 3)];
+		    Tb = KP1_732050807 * Ta;
+		    T1 = Cr[0];
+		    T2 = Cr[WS(csr, 3)];
+		    T9 = T1 - T2;
+		    T3 = FMA(KP2_000000000, T2, T1);
+		    Tq = T9 + Tb;
+		    Tc = T9 - Tb;
+	       }
+	       {
+		    E T4, T7, Ti, Tg, Tl, Td;
+		    T4 = Cr[WS(csr, 1)];
+		    Tk = Ci[WS(csi, 1)];
+		    {
+			 E T5, T6, Te, Tf;
+			 T5 = Cr[WS(csr, 4)];
+			 T6 = Cr[WS(csr, 2)];
+			 T7 = T5 + T6;
+			 Ti = KP866025403 * (T5 - T6);
+			 Te = Ci[WS(csi, 4)];
+			 Tf = Ci[WS(csi, 2)];
+			 Tg = KP866025403 * (Te + Tf);
+			 Tj = Tf - Te;
+		    }
+		    T8 = T4 + T7;
+		    Tl = FMA(KP500000000, Tj, Tk);
+		    Tm = Ti + Tl;
+		    Ts = Tl - Ti;
+		    Td = FNMS(KP500000000, T7, T4);
+		    Th = Td - Tg;
+		    Tr = Td + Tg;
+	       }
+	       R0[0] = FMA(KP2_000000000, T8, T3);
+	       Tw = T3 - T8;
+	       Tx = KP1_732050807 * (Tk - Tj);
+	       R1[WS(rs, 1)] = Tw - Tx;
+	       R0[WS(rs, 3)] = Tw + Tx;
+	       {
+		    E Tp, Tn, To, Tv, Tt, Tu;
+		    Tp = FMA(KP1_113340798, Th, KP1_326827896 * Tm);
+		    Tn = FNMS(KP642787609, Tm, KP766044443 * Th);
+		    To = Tc - Tn;
+		    R1[0] = FMA(KP2_000000000, Tn, Tc);
+		    R1[WS(rs, 3)] = To + Tp;
+		    R0[WS(rs, 2)] = To - Tp;
+		    Tv = FMA(KP1_705737063, Tr, KP300767466 * Ts);
+		    Tt = FNMS(KP984807753, Ts, KP173648177 * Tr);
+		    Tu = Tq - Tt;
+		    R0[WS(rs, 1)] = FMA(KP2_000000000, Tt, Tq);
+		    R0[WS(rs, 4)] = Tu + Tv;
+		    R1[WS(rs, 2)] = Tu - Tv;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cb_9", {22, 8, 10, 0}, &GENUS };
+
+void X(codelet_r2cb_9) (planner *p) {
+     X(kr2c_register) (p, r2cb_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cbIII.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cbIII.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_r2cbIII_genus)
+extern const kr2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_r2cf_genus)
+extern const kr2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,110 @@
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+noinst_LTLIBRARIES = librdft_scalar_r2cf.la
+
+###########################################################################
+# r2cf_<n> is a hard-coded real-to-complex FFT of size <n> (base cases
+# of real-input FFT recursion)
+R2CF = r2cf_2.c r2cf_3.c r2cf_4.c r2cf_5.c r2cf_6.c r2cf_7.c r2cf_8.c	\
+r2cf_9.c r2cf_10.c r2cf_11.c r2cf_12.c r2cf_13.c r2cf_14.c r2cf_15.c	\
+r2cf_16.c r2cf_32.c r2cf_64.c r2cf_128.c \
+r2cf_20.c r2cf_25.c # r2cf_30.c r2cf_40.c r2cf_50.c
+
+###########################################################################
+# hf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
+# step for a real-input FFT.  Every hf codelet must have a
+# corresponding r2cfII codelet (see below)!
+HF = hf_2.c hf_3.c hf_4.c hf_5.c hf_6.c hf_7.c hf_8.c hf_9.c	\
+hf_10.c hf_12.c hf_15.c hf_16.c hf_32.c hf_64.c \
+hf_20.c hf_25.c # hf_30.c hf_40.c hf_50.c
+
+# like hf, but generates part of its trig table on the fly (good for large n)
+HF2 = hf2_4.c hf2_8.c hf2_16.c hf2_32.c \
+hf2_5.c hf2_20.c hf2_25.c
+
+# an r2cf transform where the input is shifted by half a sample (output
+# is multiplied by a phase).  This is needed as part of the DIT recursion;
+# every hf_<r> or hf2_<r> codelet should have a corresponding r2cfII_<r>
+R2CFII = r2cfII_2.c r2cfII_3.c r2cfII_4.c r2cfII_5.c r2cfII_6.c		\
+r2cfII_7.c r2cfII_8.c r2cfII_9.c r2cfII_10.c r2cfII_12.c r2cfII_15.c	\
+r2cfII_16.c r2cfII_32.c r2cfII_64.c \
+r2cfII_20.c r2cfII_25.c # r2cfII_30.c r2cfII_40.c r2cfII_50.c
+
+###########################################################################
+# hc2cf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
+# step for a real-input FFT with rdft2-style output.  <r> must be even.
+HC2CF = hc2cf_2.c hc2cf_4.c hc2cf_6.c hc2cf_8.c hc2cf_10.c hc2cf_12.c	\
+hc2cf_16.c hc2cf_32.c \
+hc2cf_20.c # hc2cf_30.c
+
+HC2CFDFT = hc2cfdft_2.c hc2cfdft_4.c hc2cfdft_6.c hc2cfdft_8.c	\
+hc2cfdft_10.c hc2cfdft_12.c hc2cfdft_16.c hc2cfdft_32.c \
+hc2cfdft_20.c # hc2cfdft_30.c
+
+# like hc2cf, but generates part of its trig table on the fly (good
+# for large n)
+HC2CF2 = hc2cf2_4.c hc2cf2_8.c hc2cf2_16.c hc2cf2_32.c \
+hc2cf2_20.c # hc2cf2_30.c
+HC2CFDFT2 = hc2cfdft2_4.c hc2cfdft2_8.c hc2cfdft2_16.c hc2cfdft2_32.c \
+hc2cfdft2_20.c # hc2cfdft2_30.c
+
+###########################################################################
+ALL_CODELETS = $(R2CF) $(HF) $(HF2) $(R2CFII) $(HC2CF) $(HC2CF2)	\
+$(HC2CFDFT) $(HC2CFDFT2)
+
+BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
+
+librdft_scalar_r2cf_la_SOURCES = $(BUILT_SOURCES)
+
+SOLVTAB_NAME = X(solvtab_rdft_r2cf)
+XRENAME=X
+
+# special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+FLAGS_R2CF=$(RDFT_FLAGS_COMMON)
+FLAGS_HF=$(RDFT_FLAGS_COMMON)
+FLAGS_HF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+FLAGS_HC2CF=$(RDFT_FLAGS_COMMON)
+FLAGS_HC2CF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+FLAGS_R2CFII=$(RDFT_FLAGS_COMMON)
+
+r2cf_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cf_$* -include "r2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hf_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF) -n $* -dit -name hf_$* -include "hf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hf2_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF2) -n $* -dit -name hf2_$* -include "hf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+r2cfII_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cfII_$* -dft-II -include "r2cfII.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cf_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF) -n $* -dit -name hc2cf_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cf2_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF2) -n $* -dit -name hc2cf2_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cfdft_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF) -n $* -dit -name hc2cfdft_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cfdft2_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF2) -n $* -dit -name hc2cfdft2_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,833 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = rdft/scalar/r2cf
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_scalar_r2cf_la_LIBADD =
+am__objects_1 = r2cf_2.lo r2cf_3.lo r2cf_4.lo r2cf_5.lo r2cf_6.lo \
+	r2cf_7.lo r2cf_8.lo r2cf_9.lo r2cf_10.lo r2cf_11.lo r2cf_12.lo \
+	r2cf_13.lo r2cf_14.lo r2cf_15.lo r2cf_16.lo r2cf_32.lo \
+	r2cf_64.lo r2cf_128.lo r2cf_20.lo r2cf_25.lo
+am__objects_2 = hf_2.lo hf_3.lo hf_4.lo hf_5.lo hf_6.lo hf_7.lo \
+	hf_8.lo hf_9.lo hf_10.lo hf_12.lo hf_15.lo hf_16.lo hf_32.lo \
+	hf_64.lo hf_20.lo hf_25.lo
+am__objects_3 = hf2_4.lo hf2_8.lo hf2_16.lo hf2_32.lo hf2_5.lo \
+	hf2_20.lo hf2_25.lo
+am__objects_4 = r2cfII_2.lo r2cfII_3.lo r2cfII_4.lo r2cfII_5.lo \
+	r2cfII_6.lo r2cfII_7.lo r2cfII_8.lo r2cfII_9.lo r2cfII_10.lo \
+	r2cfII_12.lo r2cfII_15.lo r2cfII_16.lo r2cfII_32.lo \
+	r2cfII_64.lo r2cfII_20.lo r2cfII_25.lo
+am__objects_5 = hc2cf_2.lo hc2cf_4.lo hc2cf_6.lo hc2cf_8.lo \
+	hc2cf_10.lo hc2cf_12.lo hc2cf_16.lo hc2cf_32.lo hc2cf_20.lo
+am__objects_6 = hc2cf2_4.lo hc2cf2_8.lo hc2cf2_16.lo hc2cf2_32.lo \
+	hc2cf2_20.lo
+am__objects_7 = hc2cfdft_2.lo hc2cfdft_4.lo hc2cfdft_6.lo \
+	hc2cfdft_8.lo hc2cfdft_10.lo hc2cfdft_12.lo hc2cfdft_16.lo \
+	hc2cfdft_32.lo hc2cfdft_20.lo
+am__objects_8 = hc2cfdft2_4.lo hc2cfdft2_8.lo hc2cfdft2_16.lo \
+	hc2cfdft2_32.lo hc2cfdft2_20.lo
+am__objects_9 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_4) $(am__objects_5) $(am__objects_6) \
+	$(am__objects_7) $(am__objects_8)
+am__objects_10 = codlist.lo
+am__objects_11 = $(am__objects_9) $(am__objects_10)
+am_librdft_scalar_r2cf_la_OBJECTS = $(am__objects_11)
+librdft_scalar_r2cf_la_OBJECTS = $(am_librdft_scalar_r2cf_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_scalar_r2cf_la_SOURCES)
+DIST_SOURCES = $(librdft_scalar_r2cf_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+
+noinst_LTLIBRARIES = librdft_scalar_r2cf.la
+
+###########################################################################
+# r2cf_<n> is a hard-coded real-to-complex FFT of size <n> (base cases
+# of real-input FFT recursion)
+R2CF = r2cf_2.c r2cf_3.c r2cf_4.c r2cf_5.c r2cf_6.c r2cf_7.c r2cf_8.c	\
+r2cf_9.c r2cf_10.c r2cf_11.c r2cf_12.c r2cf_13.c r2cf_14.c r2cf_15.c	\
+r2cf_16.c r2cf_32.c r2cf_64.c r2cf_128.c \
+r2cf_20.c r2cf_25.c # r2cf_30.c r2cf_40.c r2cf_50.c
+
+
+###########################################################################
+# hf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
+# step for a real-input FFT.  Every hf codelet must have a
+# corresponding r2cfII codelet (see below)!
+HF = hf_2.c hf_3.c hf_4.c hf_5.c hf_6.c hf_7.c hf_8.c hf_9.c	\
+hf_10.c hf_12.c hf_15.c hf_16.c hf_32.c hf_64.c \
+hf_20.c hf_25.c # hf_30.c hf_40.c hf_50.c
+
+
+# like hf, but generates part of its trig table on the fly (good for large n)
+HF2 = hf2_4.c hf2_8.c hf2_16.c hf2_32.c \
+hf2_5.c hf2_20.c hf2_25.c
+
+
+# an r2cf transform where the input is shifted by half a sample (output
+# is multiplied by a phase).  This is needed as part of the DIT recursion;
+# every hf_<r> or hf2_<r> codelet should have a corresponding r2cfII_<r>
+R2CFII = r2cfII_2.c r2cfII_3.c r2cfII_4.c r2cfII_5.c r2cfII_6.c		\
+r2cfII_7.c r2cfII_8.c r2cfII_9.c r2cfII_10.c r2cfII_12.c r2cfII_15.c	\
+r2cfII_16.c r2cfII_32.c r2cfII_64.c \
+r2cfII_20.c r2cfII_25.c # r2cfII_30.c r2cfII_40.c r2cfII_50.c
+
+
+###########################################################################
+# hc2cf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
+# step for a real-input FFT with rdft2-style output.  <r> must be even.
+HC2CF = hc2cf_2.c hc2cf_4.c hc2cf_6.c hc2cf_8.c hc2cf_10.c hc2cf_12.c	\
+hc2cf_16.c hc2cf_32.c \
+hc2cf_20.c # hc2cf_30.c
+
+HC2CFDFT = hc2cfdft_2.c hc2cfdft_4.c hc2cfdft_6.c hc2cfdft_8.c	\
+hc2cfdft_10.c hc2cfdft_12.c hc2cfdft_16.c hc2cfdft_32.c \
+hc2cfdft_20.c # hc2cfdft_30.c
+
+
+# like hc2cf, but generates part of its trig table on the fly (good
+# for large n)
+HC2CF2 = hc2cf2_4.c hc2cf2_8.c hc2cf2_16.c hc2cf2_32.c \
+hc2cf2_20.c # hc2cf2_30.c
+
+HC2CFDFT2 = hc2cfdft2_4.c hc2cfdft2_8.c hc2cfdft2_16.c hc2cfdft2_32.c \
+hc2cfdft2_20.c # hc2cfdft2_30.c
+
+
+###########################################################################
+ALL_CODELETS = $(R2CF) $(HF) $(HF2) $(R2CFII) $(HC2CF) $(HC2CF2)	\
+$(HC2CFDFT) $(HC2CFDFT2)
+
+BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
+librdft_scalar_r2cf_la_SOURCES = $(BUILT_SOURCES)
+SOLVTAB_NAME = X(solvtab_rdft_r2cf)
+XRENAME = X
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@FLAGS_R2CF = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_HF = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_HF2 = $(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_HC2CF = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_HC2CF2 = $(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
+@MAINTAINER_MODE_TRUE@FLAGS_R2CFII = $(RDFT_FLAGS_COMMON)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/r2cf/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/scalar/r2cf/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_scalar_r2cf.la: $(librdft_scalar_r2cf_la_OBJECTS) $(librdft_scalar_r2cf_la_DEPENDENCIES) $(EXTRA_librdft_scalar_r2cf_la_DEPENDENCIES) 
+	$(LINK)  $(librdft_scalar_r2cf_la_OBJECTS) $(librdft_scalar_r2cf_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cf_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdft_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf2_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hf_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cfII_9.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_11.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_128.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_13.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_14.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_15.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_25.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_3.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_5.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_7.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2cf_9.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic \
+	maintainer-clean-local mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@r2cf_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cf_$* -include "r2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hf_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF) -n $* -dit -name hf_$* -include "hf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hf2_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF2) -n $* -dit -name hf2_$* -include "hf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@r2cfII_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cfII_$* -dft-II -include "r2cfII.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cf_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF) -n $* -dit -name hc2cf_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cf2_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF2) -n $* -dit -name hc2cf2_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cfdft_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF) -n $* -dit -name hc2cfdft_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cfdft2_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF2) -n $* -dit -name hc2cfdft2_$* -include "hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,183 @@
+#include "ifftw.h"
+
+
+extern void X(codelet_r2cf_2)(planner *);
+extern void X(codelet_r2cf_3)(planner *);
+extern void X(codelet_r2cf_4)(planner *);
+extern void X(codelet_r2cf_5)(planner *);
+extern void X(codelet_r2cf_6)(planner *);
+extern void X(codelet_r2cf_7)(planner *);
+extern void X(codelet_r2cf_8)(planner *);
+extern void X(codelet_r2cf_9)(planner *);
+extern void X(codelet_r2cf_10)(planner *);
+extern void X(codelet_r2cf_11)(planner *);
+extern void X(codelet_r2cf_12)(planner *);
+extern void X(codelet_r2cf_13)(planner *);
+extern void X(codelet_r2cf_14)(planner *);
+extern void X(codelet_r2cf_15)(planner *);
+extern void X(codelet_r2cf_16)(planner *);
+extern void X(codelet_r2cf_32)(planner *);
+extern void X(codelet_r2cf_64)(planner *);
+extern void X(codelet_r2cf_128)(planner *);
+extern void X(codelet_r2cf_20)(planner *);
+extern void X(codelet_r2cf_25)(planner *);
+extern void X(codelet_hf_2)(planner *);
+extern void X(codelet_hf_3)(planner *);
+extern void X(codelet_hf_4)(planner *);
+extern void X(codelet_hf_5)(planner *);
+extern void X(codelet_hf_6)(planner *);
+extern void X(codelet_hf_7)(planner *);
+extern void X(codelet_hf_8)(planner *);
+extern void X(codelet_hf_9)(planner *);
+extern void X(codelet_hf_10)(planner *);
+extern void X(codelet_hf_12)(planner *);
+extern void X(codelet_hf_15)(planner *);
+extern void X(codelet_hf_16)(planner *);
+extern void X(codelet_hf_32)(planner *);
+extern void X(codelet_hf_64)(planner *);
+extern void X(codelet_hf_20)(planner *);
+extern void X(codelet_hf_25)(planner *);
+extern void X(codelet_hf2_4)(planner *);
+extern void X(codelet_hf2_8)(planner *);
+extern void X(codelet_hf2_16)(planner *);
+extern void X(codelet_hf2_32)(planner *);
+extern void X(codelet_hf2_5)(planner *);
+extern void X(codelet_hf2_20)(planner *);
+extern void X(codelet_hf2_25)(planner *);
+extern void X(codelet_r2cfII_2)(planner *);
+extern void X(codelet_r2cfII_3)(planner *);
+extern void X(codelet_r2cfII_4)(planner *);
+extern void X(codelet_r2cfII_5)(planner *);
+extern void X(codelet_r2cfII_6)(planner *);
+extern void X(codelet_r2cfII_7)(planner *);
+extern void X(codelet_r2cfII_8)(planner *);
+extern void X(codelet_r2cfII_9)(planner *);
+extern void X(codelet_r2cfII_10)(planner *);
+extern void X(codelet_r2cfII_12)(planner *);
+extern void X(codelet_r2cfII_15)(planner *);
+extern void X(codelet_r2cfII_16)(planner *);
+extern void X(codelet_r2cfII_32)(planner *);
+extern void X(codelet_r2cfII_64)(planner *);
+extern void X(codelet_r2cfII_20)(planner *);
+extern void X(codelet_r2cfII_25)(planner *);
+extern void X(codelet_hc2cf_2)(planner *);
+extern void X(codelet_hc2cf_4)(planner *);
+extern void X(codelet_hc2cf_6)(planner *);
+extern void X(codelet_hc2cf_8)(planner *);
+extern void X(codelet_hc2cf_10)(planner *);
+extern void X(codelet_hc2cf_12)(planner *);
+extern void X(codelet_hc2cf_16)(planner *);
+extern void X(codelet_hc2cf_32)(planner *);
+extern void X(codelet_hc2cf_20)(planner *);
+extern void X(codelet_hc2cf2_4)(planner *);
+extern void X(codelet_hc2cf2_8)(planner *);
+extern void X(codelet_hc2cf2_16)(planner *);
+extern void X(codelet_hc2cf2_32)(planner *);
+extern void X(codelet_hc2cf2_20)(planner *);
+extern void X(codelet_hc2cfdft_2)(planner *);
+extern void X(codelet_hc2cfdft_4)(planner *);
+extern void X(codelet_hc2cfdft_6)(planner *);
+extern void X(codelet_hc2cfdft_8)(planner *);
+extern void X(codelet_hc2cfdft_10)(planner *);
+extern void X(codelet_hc2cfdft_12)(planner *);
+extern void X(codelet_hc2cfdft_16)(planner *);
+extern void X(codelet_hc2cfdft_32)(planner *);
+extern void X(codelet_hc2cfdft_20)(planner *);
+extern void X(codelet_hc2cfdft2_4)(planner *);
+extern void X(codelet_hc2cfdft2_8)(planner *);
+extern void X(codelet_hc2cfdft2_16)(planner *);
+extern void X(codelet_hc2cfdft2_32)(planner *);
+extern void X(codelet_hc2cfdft2_20)(planner *);
+
+
+extern const solvtab X(solvtab_rdft_r2cf);
+const solvtab X(solvtab_rdft_r2cf) = {
+   SOLVTAB(X(codelet_r2cf_2)),
+   SOLVTAB(X(codelet_r2cf_3)),
+   SOLVTAB(X(codelet_r2cf_4)),
+   SOLVTAB(X(codelet_r2cf_5)),
+   SOLVTAB(X(codelet_r2cf_6)),
+   SOLVTAB(X(codelet_r2cf_7)),
+   SOLVTAB(X(codelet_r2cf_8)),
+   SOLVTAB(X(codelet_r2cf_9)),
+   SOLVTAB(X(codelet_r2cf_10)),
+   SOLVTAB(X(codelet_r2cf_11)),
+   SOLVTAB(X(codelet_r2cf_12)),
+   SOLVTAB(X(codelet_r2cf_13)),
+   SOLVTAB(X(codelet_r2cf_14)),
+   SOLVTAB(X(codelet_r2cf_15)),
+   SOLVTAB(X(codelet_r2cf_16)),
+   SOLVTAB(X(codelet_r2cf_32)),
+   SOLVTAB(X(codelet_r2cf_64)),
+   SOLVTAB(X(codelet_r2cf_128)),
+   SOLVTAB(X(codelet_r2cf_20)),
+   SOLVTAB(X(codelet_r2cf_25)),
+   SOLVTAB(X(codelet_hf_2)),
+   SOLVTAB(X(codelet_hf_3)),
+   SOLVTAB(X(codelet_hf_4)),
+   SOLVTAB(X(codelet_hf_5)),
+   SOLVTAB(X(codelet_hf_6)),
+   SOLVTAB(X(codelet_hf_7)),
+   SOLVTAB(X(codelet_hf_8)),
+   SOLVTAB(X(codelet_hf_9)),
+   SOLVTAB(X(codelet_hf_10)),
+   SOLVTAB(X(codelet_hf_12)),
+   SOLVTAB(X(codelet_hf_15)),
+   SOLVTAB(X(codelet_hf_16)),
+   SOLVTAB(X(codelet_hf_32)),
+   SOLVTAB(X(codelet_hf_64)),
+   SOLVTAB(X(codelet_hf_20)),
+   SOLVTAB(X(codelet_hf_25)),
+   SOLVTAB(X(codelet_hf2_4)),
+   SOLVTAB(X(codelet_hf2_8)),
+   SOLVTAB(X(codelet_hf2_16)),
+   SOLVTAB(X(codelet_hf2_32)),
+   SOLVTAB(X(codelet_hf2_5)),
+   SOLVTAB(X(codelet_hf2_20)),
+   SOLVTAB(X(codelet_hf2_25)),
+   SOLVTAB(X(codelet_r2cfII_2)),
+   SOLVTAB(X(codelet_r2cfII_3)),
+   SOLVTAB(X(codelet_r2cfII_4)),
+   SOLVTAB(X(codelet_r2cfII_5)),
+   SOLVTAB(X(codelet_r2cfII_6)),
+   SOLVTAB(X(codelet_r2cfII_7)),
+   SOLVTAB(X(codelet_r2cfII_8)),
+   SOLVTAB(X(codelet_r2cfII_9)),
+   SOLVTAB(X(codelet_r2cfII_10)),
+   SOLVTAB(X(codelet_r2cfII_12)),
+   SOLVTAB(X(codelet_r2cfII_15)),
+   SOLVTAB(X(codelet_r2cfII_16)),
+   SOLVTAB(X(codelet_r2cfII_32)),
+   SOLVTAB(X(codelet_r2cfII_64)),
+   SOLVTAB(X(codelet_r2cfII_20)),
+   SOLVTAB(X(codelet_r2cfII_25)),
+   SOLVTAB(X(codelet_hc2cf_2)),
+   SOLVTAB(X(codelet_hc2cf_4)),
+   SOLVTAB(X(codelet_hc2cf_6)),
+   SOLVTAB(X(codelet_hc2cf_8)),
+   SOLVTAB(X(codelet_hc2cf_10)),
+   SOLVTAB(X(codelet_hc2cf_12)),
+   SOLVTAB(X(codelet_hc2cf_16)),
+   SOLVTAB(X(codelet_hc2cf_32)),
+   SOLVTAB(X(codelet_hc2cf_20)),
+   SOLVTAB(X(codelet_hc2cf2_4)),
+   SOLVTAB(X(codelet_hc2cf2_8)),
+   SOLVTAB(X(codelet_hc2cf2_16)),
+   SOLVTAB(X(codelet_hc2cf2_32)),
+   SOLVTAB(X(codelet_hc2cf2_20)),
+   SOLVTAB(X(codelet_hc2cfdft_2)),
+   SOLVTAB(X(codelet_hc2cfdft_4)),
+   SOLVTAB(X(codelet_hc2cfdft_6)),
+   SOLVTAB(X(codelet_hc2cfdft_8)),
+   SOLVTAB(X(codelet_hc2cfdft_10)),
+   SOLVTAB(X(codelet_hc2cfdft_12)),
+   SOLVTAB(X(codelet_hc2cfdft_16)),
+   SOLVTAB(X(codelet_hc2cfdft_32)),
+   SOLVTAB(X(codelet_hc2cfdft_20)),
+   SOLVTAB(X(codelet_hc2cfdft2_4)),
+   SOLVTAB(X(codelet_hc2cfdft2_8)),
+   SOLVTAB(X(codelet_hc2cfdft2_16)),
+   SOLVTAB(X(codelet_hc2cfdft2_32)),
+   SOLVTAB(X(codelet_hc2cfdft2_20)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,827 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:42 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include hc2cf.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 100 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T3S, T3R;
+	       {
+		    E T2, Tf, TM, TO, T3, Tg, TN, TS, T4, Tp, T6, T5, Th;
+		    T2 = W[0];
+		    Tf = W[2];
+		    TM = W[6];
+		    TO = W[7];
+		    T3 = W[4];
+		    Tg = T2 * Tf;
+		    TN = T2 * TM;
+		    TS = T2 * TO;
+		    T4 = T2 * T3;
+		    Tp = Tf * T3;
+		    T6 = W[5];
+		    T5 = W[1];
+		    Th = W[3];
+		    {
+			 E TZ, Te, T1U, T3A, T3L, T2D, T1G, T2B, T3h, T1R, T2w, T2I, T3i, Tx, T3M;
+			 E T1Z, T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, TX;
+			 E T10, TV, T2a, TY, T2b;
+			 {
+			      E TF, TP, TT, Tq, TW, Tz, Tu, TI, TC, T1m, T1f, T1p, T1j, Tr, Ts;
+			      E Tv, To, T1W;
+			      {
+				   E Ti, Tm, T1L, T1O, T1D, T1A, T1x, T2z, T1F, T2y;
+				   {
+					E T1, T7, Tb, T3z, T8, T1z, T9, Tc;
+					{
+					     E T1i, T1e, T1C, T1y, Tt, Ta, Tl;
+					     T1 = Rp[0];
+					     Tt = Tf * T6;
+					     Ta = T2 * T6;
+					     T7 = FMA(T5, T6, T4);
+					     TF = FNMS(T5, T6, T4);
+					     TP = FMA(T5, TO, TN);
+					     TT = FNMS(T5, TM, TS);
+					     Tq = FNMS(Th, T6, Tp);
+					     TW = FMA(Th, T6, Tp);
+					     Tz = FMA(T5, Th, Tg);
+					     Ti = FNMS(T5, Th, Tg);
+					     Tl = T2 * Th;
+					     Tu = FMA(Th, T3, Tt);
+					     TZ = FNMS(Th, T3, Tt);
+					     TI = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     T1i = Ti * T6;
+					     T1e = Ti * T3;
+					     T1C = Tz * T6;
+					     T1y = Tz * T3;
+					     Tm = FMA(T5, Tf, Tl);
+					     TC = FNMS(T5, Tf, Tl);
+					     T3z = Rm[0];
+					     T8 = Rp[WS(rs, 4)];
+					     T1m = FNMS(Tm, T6, T1e);
+					     T1f = FMA(Tm, T6, T1e);
+					     T1p = FMA(Tm, T3, T1i);
+					     T1j = FNMS(Tm, T3, T1i);
+					     T1L = FNMS(TC, T6, T1y);
+					     T1z = FMA(TC, T6, T1y);
+					     T1O = FMA(TC, T3, T1C);
+					     T1D = FNMS(TC, T3, T1C);
+					     T9 = T7 * T8;
+					     Tc = Rm[WS(rs, 4)];
+					}
+					{
+					     E T1u, T1w, T1v, T2x, T3y, T1B, T1E, Td, T3x;
+					     T1u = Ip[WS(rs, 7)];
+					     T1w = Im[WS(rs, 7)];
+					     T1A = Ip[WS(rs, 3)];
+					     Td = FMA(Tb, Tc, T9);
+					     T3x = T7 * Tc;
+					     T1v = TM * T1u;
+					     T2x = TM * T1w;
+					     Te = T1 + Td;
+					     T1U = T1 - Td;
+					     T3y = FNMS(Tb, T8, T3x);
+					     T1B = T1z * T1A;
+					     T1E = Im[WS(rs, 3)];
+					     T1x = FMA(TO, T1w, T1v);
+					     T3A = T3y + T3z;
+					     T3L = T3z - T3y;
+					     T2z = T1z * T1E;
+					     T1F = FMA(T1D, T1E, T1B);
+					     T2y = FNMS(TO, T1u, T2x);
+					}
+				   }
+				   {
+					E T1H, T1I, T1J, T1M, T1P, T2A;
+					T1H = Ip[WS(rs, 1)];
+					T2A = FNMS(T1D, T1A, T2z);
+					T2D = T1x - T1F;
+					T1G = T1x + T1F;
+					T1I = Tf * T1H;
+					T2B = T2y - T2A;
+					T3h = T2y + T2A;
+					T1J = Im[WS(rs, 1)];
+					T1M = Ip[WS(rs, 5)];
+					T1P = Im[WS(rs, 5)];
+					{
+					     E Tj, Tk, Tn, T1V;
+					     {
+						  E T1K, T2F, T1Q, T2H, T2E, T1N, T2G;
+						  Tj = Rp[WS(rs, 2)];
+						  T1K = FMA(Th, T1J, T1I);
+						  T2E = Tf * T1J;
+						  T1N = T1L * T1M;
+						  T2G = T1L * T1P;
+						  Tk = Ti * Tj;
+						  T2F = FNMS(Th, T1H, T2E);
+						  T1Q = FMA(T1O, T1P, T1N);
+						  T2H = FNMS(T1O, T1M, T2G);
+						  Tn = Rm[WS(rs, 2)];
+						  Tr = Rp[WS(rs, 6)];
+						  T1R = T1K + T1Q;
+						  T2w = T1Q - T1K;
+						  T2I = T2F - T2H;
+						  T3i = T2F + T2H;
+						  T1V = Ti * Tn;
+						  Ts = Tq * Tr;
+						  Tv = Rm[WS(rs, 6)];
+					     }
+					     To = FMA(Tm, Tn, Tk);
+					     T1W = FNMS(Tm, Tj, T1V);
+					}
+				   }
+			      }
+			      {
+				   E T19, T1b, T18, T2i, T1a, T2j;
+				   {
+					E TE, T22, TK, T24;
+					{
+					     E TA, TD, TB, T21, TG, TJ, TH, T23, T1Y, Tw, T1X;
+					     TA = Rp[WS(rs, 1)];
+					     Tw = FMA(Tu, Tv, Ts);
+					     T1X = Tq * Tv;
+					     TD = Rm[WS(rs, 1)];
+					     TB = Tz * TA;
+					     Tx = To + Tw;
+					     T3M = To - Tw;
+					     T1Y = FNMS(Tu, Tr, T1X);
+					     T21 = Tz * TD;
+					     TG = Rp[WS(rs, 5)];
+					     TJ = Rm[WS(rs, 5)];
+					     T1Z = T1W - T1Y;
+					     T3w = T1W + T1Y;
+					     TH = TF * TG;
+					     T23 = TF * TJ;
+					     TE = FMA(TC, TD, TB);
+					     T22 = FNMS(TC, TA, T21);
+					     TK = FMA(TI, TJ, TH);
+					     T24 = FNMS(TI, TG, T23);
+					}
+					{
+					     E T15, T17, T16, T2h;
+					     T15 = Ip[0];
+					     T17 = Im[0];
+					     TL = TE + TK;
+					     T26 = TE - TK;
+					     T25 = T22 - T24;
+					     T37 = T22 + T24;
+					     T16 = T2 * T15;
+					     T2h = T2 * T17;
+					     T19 = Ip[WS(rs, 4)];
+					     T1b = Im[WS(rs, 4)];
+					     T18 = FMA(T5, T17, T16);
+					     T2i = FNMS(T5, T15, T2h);
+					     T1a = T3 * T19;
+					     T2j = T3 * T1b;
+					}
+				   }
+				   {
+					E T1n, T1q, T1l, T2q, T1o, T2r;
+					{
+					     E T1g, T1k, T1h, T2p, T1c, T2k;
+					     T1g = Ip[WS(rs, 2)];
+					     T1k = Im[WS(rs, 2)];
+					     T1c = FMA(T6, T1b, T1a);
+					     T2k = FNMS(T6, T19, T2j);
+					     T1h = T1f * T1g;
+					     T2p = T1f * T1k;
+					     T1d = T18 + T1c;
+					     T2o = T18 - T1c;
+					     T2l = T2i - T2k;
+					     T3c = T2i + T2k;
+					     T1n = Ip[WS(rs, 6)];
+					     T1q = Im[WS(rs, 6)];
+					     T1l = FMA(T1j, T1k, T1h);
+					     T2q = FNMS(T1j, T1g, T2p);
+					     T1o = T1m * T1n;
+					     T2r = T1m * T1q;
+					}
+					{
+					     E TQ, TU, TR, T29, T1r, T2s;
+					     TQ = Rp[WS(rs, 7)];
+					     TU = Rm[WS(rs, 7)];
+					     T1r = FMA(T1p, T1q, T1o);
+					     T2s = FNMS(T1p, T1n, T2r);
+					     TR = TP * TQ;
+					     T29 = TP * TU;
+					     T1s = T1l + T1r;
+					     T2m = T1l - T1r;
+					     T2t = T2q - T2s;
+					     T3d = T2q + T2s;
+					     TX = Rp[WS(rs, 3)];
+					     T10 = Rm[WS(rs, 3)];
+					     TV = FMA(TT, TU, TR);
+					     T2a = FNMS(TT, TQ, T29);
+					     TY = TW * TX;
+					     T2b = TW * T10;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T36, T3G, T3b, T3g, T28, T2d, T3F, T39, T3e, T3q, T3C, T3j, T3u, T3t;
+			      {
+				   E T3D, T1T, T3r, T14, T3E, T3s;
+				   {
+					E Ty, T3B, T11, T2c, T13, T3v;
+					T36 = Te - Tx;
+					Ty = Te + Tx;
+					T3B = T3w + T3A;
+					T3G = T3A - T3w;
+					T11 = FMA(TZ, T10, TY);
+					T2c = FNMS(TZ, TX, T2b);
+					{
+					     E T1t, T1S, T12, T38;
+					     T3b = T1d - T1s;
+					     T1t = T1d + T1s;
+					     T1S = T1G + T1R;
+					     T3g = T1G - T1R;
+					     T12 = TV + T11;
+					     T28 = TV - T11;
+					     T2d = T2a - T2c;
+					     T38 = T2a + T2c;
+					     T3D = T1S - T1t;
+					     T1T = T1t + T1S;
+					     T13 = TL + T12;
+					     T3F = T12 - TL;
+					     T39 = T37 - T38;
+					     T3v = T37 + T38;
+					}
+					T3e = T3c - T3d;
+					T3r = T3c + T3d;
+					T3q = Ty - T13;
+					T14 = Ty + T13;
+					T3E = T3B - T3v;
+					T3C = T3v + T3B;
+					T3s = T3h + T3i;
+					T3j = T3h - T3i;
+				   }
+				   Rm[WS(rs, 7)] = T14 - T1T;
+				   Rp[0] = T14 + T1T;
+				   Im[WS(rs, 3)] = T3D - T3E;
+				   T3u = T3r + T3s;
+				   T3t = T3r - T3s;
+				   Ip[WS(rs, 4)] = T3D + T3E;
+			      }
+			      {
+				   E T3m, T3a, T3J, T3H;
+				   Ip[0] = T3u + T3C;
+				   Im[WS(rs, 7)] = T3u - T3C;
+				   Rp[WS(rs, 4)] = T3q + T3t;
+				   Rm[WS(rs, 3)] = T3q - T3t;
+				   T3m = T36 - T39;
+				   T3a = T36 + T39;
+				   T3J = T3G - T3F;
+				   T3H = T3F + T3G;
+				   {
+					E T2Q, T20, T3N, T3T, T2J, T2C, T3O, T2f, T34, T30, T2W, T2V, T3U, T2T, T2N;
+					E T2v;
+					{
+					     E T2R, T27, T2e, T2S;
+					     {
+						  E T3n, T3f, T3o, T3k;
+						  T2Q = T1U + T1Z;
+						  T20 = T1U - T1Z;
+						  T3n = T3e - T3b;
+						  T3f = T3b + T3e;
+						  T3o = T3g + T3j;
+						  T3k = T3g - T3j;
+						  T3N = T3L - T3M;
+						  T3T = T3M + T3L;
+						  {
+						       E T3p, T3I, T3K, T3l;
+						       T3p = T3n - T3o;
+						       T3I = T3n + T3o;
+						       T3K = T3k - T3f;
+						       T3l = T3f + T3k;
+						       Rp[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
+						       Rm[WS(rs, 1)] = FNMS(KP707106781, T3p, T3m);
+						       Ip[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
+						       Im[WS(rs, 5)] = FMS(KP707106781, T3I, T3H);
+						       Ip[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
+						       Im[WS(rs, 1)] = FMS(KP707106781, T3K, T3J);
+						       Rp[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
+						       Rm[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
+						       T2R = T26 + T25;
+						       T27 = T25 - T26;
+						       T2e = T28 + T2d;
+						       T2S = T28 - T2d;
+						  }
+					     }
+					     {
+						  E T2Y, T2Z, T2n, T2u;
+						  T2J = T2D - T2I;
+						  T2Y = T2D + T2I;
+						  T2Z = T2B + T2w;
+						  T2C = T2w - T2B;
+						  T3O = T27 + T2e;
+						  T2f = T27 - T2e;
+						  T34 = FMA(KP414213562, T2Y, T2Z);
+						  T30 = FNMS(KP414213562, T2Z, T2Y);
+						  T2W = T2l - T2m;
+						  T2n = T2l + T2m;
+						  T2u = T2o - T2t;
+						  T2V = T2o + T2t;
+						  T3U = T2S - T2R;
+						  T2T = T2R + T2S;
+						  T2N = FNMS(KP414213562, T2n, T2u);
+						  T2v = FMA(KP414213562, T2u, T2n);
+					     }
+					}
+					{
+					     E T33, T2X, T3X, T3Y;
+					     {
+						  E T2M, T2g, T2O, T2K, T3V, T3W, T2P, T2L;
+						  T2M = FNMS(KP707106781, T2f, T20);
+						  T2g = FMA(KP707106781, T2f, T20);
+						  T33 = FNMS(KP414213562, T2V, T2W);
+						  T2X = FMA(KP414213562, T2W, T2V);
+						  T2O = FNMS(KP414213562, T2C, T2J);
+						  T2K = FMA(KP414213562, T2J, T2C);
+						  T3V = FMA(KP707106781, T3U, T3T);
+						  T3X = FNMS(KP707106781, T3U, T3T);
+						  T3W = T2O - T2N;
+						  T2P = T2N + T2O;
+						  T3Y = T2K - T2v;
+						  T2L = T2v + T2K;
+						  Ip[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
+						  Im[WS(rs, 4)] = FMS(KP923879532, T3W, T3V);
+						  Rp[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
+						  Rm[WS(rs, 4)] = FNMS(KP923879532, T2L, T2g);
+						  Rm[0] = FMA(KP923879532, T2P, T2M);
+						  Rp[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
+					     }
+					     {
+						  E T32, T3P, T3Q, T35, T2U, T31;
+						  T32 = FNMS(KP707106781, T2T, T2Q);
+						  T2U = FMA(KP707106781, T2T, T2Q);
+						  T31 = T2X + T30;
+						  T3S = T30 - T2X;
+						  T3R = FNMS(KP707106781, T3O, T3N);
+						  T3P = FMA(KP707106781, T3O, T3N);
+						  Ip[WS(rs, 7)] = FMA(KP923879532, T3Y, T3X);
+						  Im[0] = FMS(KP923879532, T3Y, T3X);
+						  Rp[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
+						  Rm[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
+						  T3Q = T33 + T34;
+						  T35 = T33 - T34;
+						  Ip[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
+						  Im[WS(rs, 6)] = FMS(KP923879532, T3Q, T3P);
+						  Rp[WS(rs, 5)] = FMA(KP923879532, T35, T32);
+						  Rm[WS(rs, 2)] = FNMS(KP923879532, T35, T32);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
+	       Im[WS(rs, 2)] = FMS(KP923879532, T3S, T3R);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, {104, 42, 92, 0} };
+
+void X(codelet_hc2cf2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include hc2cf.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 82 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
+	       E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
+	       {
+		    E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 TE = Tm - Tn;
+			 TC = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 Tv = Tg * T6;
+			 Ta = T2 * T6;
+			 Ts = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 Tw = Ti * T3;
+			 Tb = T5 * T3;
+			 Tr = Tg * T3;
+		    }
+		    T8 = T4 + T7;
+		    TW = Tv - Tw;
+		    TJ = Ta + Tb;
+		    Tt = Tr - Ts;
+		    TU = Tr + Ts;
+		    Tc = Ta - Tb;
+		    Tx = Tv + Tw;
+		    TH = T4 - T7;
+		    TN = W[6];
+		    TO = W[7];
+		    TP = FMA(T2, TN, T5 * TO);
+		    TR = FNMS(T5, TN, T2 * TO);
+		    {
+			 E T1d, T1e, T19, T1a;
+			 T1d = Tk * T6;
+			 T1e = To * T3;
+			 T1f = T1d - T1e;
+			 T1k = T1d + T1e;
+			 T19 = Tk * T3;
+			 T1a = To * T6;
+			 T1b = T19 + T1a;
+			 T1i = T19 - T1a;
+		    }
+		    {
+			 E T1w, T1x, T1s, T1t;
+			 T1w = TC * T6;
+			 T1x = TE * T3;
+			 T1y = T1w - T1x;
+			 T1H = T1w + T1x;
+			 T1s = TC * T3;
+			 T1t = TE * T6;
+			 T1u = T1s + T1t;
+			 T1F = T1s - T1t;
+		    }
+	       }
+	       {
+		    E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
+		    E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
+		    E T2S, T2T, T28, T2A, T2d, T2B;
+		    {
+			 E T1, T3d, Te, T3c, T9, Td;
+			 T1 = Rp[0];
+			 T3d = Rm[0];
+			 T9 = Rp[WS(rs, 4)];
+			 Td = Rm[WS(rs, 4)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T3c = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T3r = T3d - T3c;
+			 T1N = T1 - Te;
+			 T3e = T3c + T3d;
+		    }
+		    {
+			 E Tq, T1O, Tz, T1P;
+			 {
+			      E Tl, Tp, Tu, Ty;
+			      Tl = Rp[WS(rs, 2)];
+			      Tp = Rm[WS(rs, 2)];
+			      Tq = FMA(Tk, Tl, To * Tp);
+			      T1O = FNMS(To, Tl, Tk * Tp);
+			      Tu = Rp[WS(rs, 6)];
+			      Ty = Rm[WS(rs, 6)];
+			      Tz = FMA(Tt, Tu, Tx * Ty);
+			      T1P = FNMS(Tx, Tu, Tt * Ty);
+			 }
+			 TA = Tq + Tz;
+			 T3s = Tq - Tz;
+			 T1Q = T1O - T1P;
+			 T3b = T1O + T1P;
+		    }
+		    {
+			 E TG, T1S, TL, T1T, T1U, T1V;
+			 {
+			      E TD, TF, TI, TK;
+			      TD = Rp[WS(rs, 1)];
+			      TF = Rm[WS(rs, 1)];
+			      TG = FMA(TC, TD, TE * TF);
+			      T1S = FNMS(TE, TD, TC * TF);
+			      TI = Rp[WS(rs, 5)];
+			      TK = Rm[WS(rs, 5)];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T1T = FNMS(TJ, TI, TH * TK);
+			 }
+			 TM = TG + TL;
+			 T2M = T1S + T1T;
+			 T1U = T1S - T1T;
+			 T1V = TG - TL;
+			 T1W = T1U - T1V;
+			 T2w = T1V + T1U;
+		    }
+		    {
+			 E TT, T1Y, TY, T1Z, T1X, T20;
+			 {
+			      E TQ, TS, TV, TX;
+			      TQ = Rp[WS(rs, 7)];
+			      TS = Rm[WS(rs, 7)];
+			      TT = FMA(TP, TQ, TR * TS);
+			      T1Y = FNMS(TR, TQ, TP * TS);
+			      TV = Rp[WS(rs, 3)];
+			      TX = Rm[WS(rs, 3)];
+			      TY = FMA(TU, TV, TW * TX);
+			      T1Z = FNMS(TW, TV, TU * TX);
+			 }
+			 TZ = TT + TY;
+			 T2N = T1Y + T1Z;
+			 T1X = TT - TY;
+			 T20 = T1Y - T1Z;
+			 T21 = T1X + T20;
+			 T2x = T1X - T20;
+		    }
+		    {
+			 E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
+			 {
+			      E T1p, T1q, T1G, T1I;
+			      T1p = Ip[WS(rs, 7)];
+			      T1q = Im[WS(rs, 7)];
+			      T1r = FMA(TN, T1p, TO * T1q);
+			      T2k = FNMS(TO, T1p, TN * T1q);
+			      T1G = Ip[WS(rs, 5)];
+			      T1I = Im[WS(rs, 5)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T2h = FNMS(T1H, T1G, T1F * T1I);
+			 }
+			 {
+			      E T1v, T1z, T1C, T1D;
+			      T1v = Ip[WS(rs, 3)];
+			      T1z = Im[WS(rs, 3)];
+			      T1A = FMA(T1u, T1v, T1y * T1z);
+			      T2l = FNMS(T1y, T1v, T1u * T1z);
+			      T1C = Ip[WS(rs, 1)];
+			      T1D = Im[WS(rs, 1)];
+			      T1E = FMA(Tg, T1C, Ti * T1D);
+			      T2g = FNMS(Ti, T1C, Tg * T1D);
+			 }
+			 T1B = T1r + T1A;
+			 T1K = T1E + T1J;
+			 T2V = T1B - T1K;
+			 T2W = T2k + T2l;
+			 T2X = T2g + T2h;
+			 T2Y = T2W - T2X;
+			 {
+			      E T2f, T2i, T2m, T2n;
+			      T2f = T1r - T1A;
+			      T2i = T2g - T2h;
+			      T2j = T2f - T2i;
+			      T2D = T2f + T2i;
+			      T2m = T2k - T2l;
+			      T2n = T1E - T1J;
+			      T2o = T2m + T2n;
+			      T2E = T2m - T2n;
+			 }
+		    }
+		    {
+			 E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
+			 {
+			      E T12, T13, T1j, T1l;
+			      T12 = Ip[0];
+			      T13 = Im[0];
+			      T14 = FMA(T2, T12, T5 * T13);
+			      T24 = FNMS(T5, T12, T2 * T13);
+			      T1j = Ip[WS(rs, 6)];
+			      T1l = Im[WS(rs, 6)];
+			      T1m = FMA(T1i, T1j, T1k * T1l);
+			      T2b = FNMS(T1k, T1j, T1i * T1l);
+			 }
+			 {
+			      E T15, T16, T1c, T1g;
+			      T15 = Ip[WS(rs, 4)];
+			      T16 = Im[WS(rs, 4)];
+			      T17 = FMA(T3, T15, T6 * T16);
+			      T25 = FNMS(T6, T15, T3 * T16);
+			      T1c = Ip[WS(rs, 2)];
+			      T1g = Im[WS(rs, 2)];
+			      T1h = FMA(T1b, T1c, T1f * T1g);
+			      T2a = FNMS(T1f, T1c, T1b * T1g);
+			 }
+			 T18 = T14 + T17;
+			 T1n = T1h + T1m;
+			 T2Q = T18 - T1n;
+			 T2R = T24 + T25;
+			 T2S = T2a + T2b;
+			 T2T = T2R - T2S;
+			 {
+			      E T26, T27, T29, T2c;
+			      T26 = T24 - T25;
+			      T27 = T1h - T1m;
+			      T28 = T26 + T27;
+			      T2A = T26 - T27;
+			      T29 = T14 - T17;
+			      T2c = T2a - T2b;
+			      T2d = T29 - T2c;
+			      T2B = T29 + T2c;
+			 }
+		    }
+		    {
+			 E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
+			 {
+			      E T1R, T22, T3y, T3z;
+			      T1R = T1N - T1Q;
+			      T22 = KP707106781 * (T1W - T21);
+			      T23 = T1R + T22;
+			      T2r = T1R - T22;
+			      T3y = KP707106781 * (T2x - T2w);
+			      T3z = T3s + T3r;
+			      T3A = T3y + T3z;
+			      T3C = T3z - T3y;
+			 }
+			 {
+			      E T2e, T2p, T2s, T2t;
+			      T2e = FMA(KP923879532, T28, KP382683432 * T2d);
+			      T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
+			      T2q = T2e + T2p;
+			      T3B = T2p - T2e;
+			      T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
+			      T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
+			      T2u = T2s - T2t;
+			      T3x = T2s + T2t;
+			 }
+			 Rm[WS(rs, 4)] = T23 - T2q;
+			 Im[WS(rs, 4)] = T3x - T3A;
+			 Rp[WS(rs, 3)] = T23 + T2q;
+			 Ip[WS(rs, 3)] = T3x + T3A;
+			 Rm[0] = T2r - T2u;
+			 Im[0] = T3B - T3C;
+			 Rp[WS(rs, 7)] = T2r + T2u;
+			 Ip[WS(rs, 7)] = T3B + T3C;
+		    }
+		    {
+			 E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
+			 {
+			      E T2L, T2O, T3k, T3l;
+			      T2L = Tf - TA;
+			      T2O = T2M - T2N;
+			      T2P = T2L + T2O;
+			      T31 = T2L - T2O;
+			      T3k = TZ - TM;
+			      T3l = T3e - T3b;
+			      T3m = T3k + T3l;
+			      T3o = T3l - T3k;
+			 }
+			 {
+			      E T2U, T2Z, T32, T33;
+			      T2U = T2Q + T2T;
+			      T2Z = T2V - T2Y;
+			      T30 = KP707106781 * (T2U + T2Z);
+			      T3n = KP707106781 * (T2Z - T2U);
+			      T32 = T2T - T2Q;
+			      T33 = T2V + T2Y;
+			      T34 = KP707106781 * (T32 - T33);
+			      T3j = KP707106781 * (T32 + T33);
+			 }
+			 Rm[WS(rs, 5)] = T2P - T30;
+			 Im[WS(rs, 5)] = T3j - T3m;
+			 Rp[WS(rs, 2)] = T2P + T30;
+			 Ip[WS(rs, 2)] = T3j + T3m;
+			 Rm[WS(rs, 1)] = T31 - T34;
+			 Im[WS(rs, 1)] = T3n - T3o;
+			 Rp[WS(rs, 6)] = T31 + T34;
+			 Ip[WS(rs, 6)] = T3n + T3o;
+		    }
+		    {
+			 E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
+			 {
+			      E T2v, T2y, T3q, T3t;
+			      T2v = T1N + T1Q;
+			      T2y = KP707106781 * (T2w + T2x);
+			      T2z = T2v + T2y;
+			      T2H = T2v - T2y;
+			      T3q = KP707106781 * (T1W + T21);
+			      T3t = T3r - T3s;
+			      T3u = T3q + T3t;
+			      T3w = T3t - T3q;
+			 }
+			 {
+			      E T2C, T2F, T2I, T2J;
+			      T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
+			      T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
+			      T2G = T2C + T2F;
+			      T3v = T2F - T2C;
+			      T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
+			      T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
+			      T2K = T2I - T2J;
+			      T3p = T2I + T2J;
+			 }
+			 Rm[WS(rs, 6)] = T2z - T2G;
+			 Im[WS(rs, 6)] = T3p - T3u;
+			 Rp[WS(rs, 1)] = T2z + T2G;
+			 Ip[WS(rs, 1)] = T3p + T3u;
+			 Rm[WS(rs, 2)] = T2H - T2K;
+			 Im[WS(rs, 2)] = T3v - T3w;
+			 Rp[WS(rs, 5)] = T2H + T2K;
+			 Ip[WS(rs, 5)] = T3v + T3w;
+		    }
+		    {
+			 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
+			 {
+			      E TB, T10, T3a, T3f;
+			      TB = Tf + TA;
+			      T10 = TM + TZ;
+			      T11 = TB + T10;
+			      T35 = TB - T10;
+			      T3a = T2M + T2N;
+			      T3f = T3b + T3e;
+			      T3g = T3a + T3f;
+			      T3i = T3f - T3a;
+			 }
+			 {
+			      E T1o, T1L, T36, T37;
+			      T1o = T18 + T1n;
+			      T1L = T1B + T1K;
+			      T1M = T1o + T1L;
+			      T3h = T1L - T1o;
+			      T36 = T2R + T2S;
+			      T37 = T2W + T2X;
+			      T38 = T36 - T37;
+			      T39 = T36 + T37;
+			 }
+			 Rm[WS(rs, 7)] = T11 - T1M;
+			 Im[WS(rs, 7)] = T39 - T3g;
+			 Rp[0] = T11 + T1M;
+			 Ip[0] = T39 + T3g;
+			 Rm[WS(rs, 3)] = T35 - T38;
+			 Im[WS(rs, 3)] = T3h - T3i;
+			 Rp[WS(rs, 4)] = T35 + T38;
+			 Ip[WS(rs, 4)] = T3h + T3i;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, {156, 68, 40, 0} };
+
+void X(codelet_hc2cf2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cf2_20 -include hc2cf.h */
+
+/*
+ * This function contains 276 FP additions, 198 FP multiplications,
+ * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
+ * 142 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T59, T5i, T5k, T5e, T5c, T5d, T5j, T5f;
+	       {
+		    E T2, Th, Tf, T6, T5, Tl, T1p, T1n, Ti, T3, Tt, Tv, T24, T1f, T1D;
+		    E Tb, T1P, Tm, T21, T1b, T7, T1A, Tw, T1H, T13, TA, T1L, T17, T1S, Tq;
+		    E T1o, T2g, T1t, T2c, TO, TK;
+		    {
+			 E T1e, Ta, Tk, Tg;
+			 T2 = W[0];
+			 Th = W[3];
+			 Tf = W[2];
+			 T6 = W[5];
+			 T5 = W[1];
+			 Tk = T2 * Th;
+			 Tg = T2 * Tf;
+			 T1e = Tf * T6;
+			 Ta = T2 * T6;
+			 Tl = FMA(T5, Tf, Tk);
+			 T1p = FNMS(T5, Tf, Tk);
+			 T1n = FMA(T5, Th, Tg);
+			 Ti = FNMS(T5, Th, Tg);
+			 T3 = W[4];
+			 Tt = W[6];
+			 Tv = W[7];
+			 {
+			      E Tp, Tj, TN, TJ;
+			      Tp = Ti * T6;
+			      T24 = FMA(Th, T3, T1e);
+			      T1f = FNMS(Th, T3, T1e);
+			      T1D = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      Tj = Ti * T3;
+			      {
+				   E T1a, T4, Tu, T1G;
+				   T1a = Tf * T3;
+				   T4 = T2 * T3;
+				   Tu = Ti * Tt;
+				   T1G = T2 * Tt;
+				   {
+					E T12, Tz, T1K, T16;
+					T12 = Tf * Tt;
+					Tz = Ti * Tv;
+					T1K = T2 * Tv;
+					T16 = Tf * Tv;
+					T1P = FNMS(Tl, T6, Tj);
+					Tm = FMA(Tl, T6, Tj);
+					T21 = FNMS(Th, T6, T1a);
+					T1b = FMA(Th, T6, T1a);
+					T7 = FNMS(T5, T6, T4);
+					T1A = FMA(T5, T6, T4);
+					Tw = FMA(Tl, Tv, Tu);
+					T1H = FMA(T5, Tv, T1G);
+					T13 = FMA(Th, Tv, T12);
+					TA = FNMS(Tl, Tt, Tz);
+					T1L = FNMS(T5, Tt, T1K);
+					T17 = FNMS(Th, Tt, T16);
+					T1S = FMA(Tl, T3, Tp);
+					Tq = FNMS(Tl, T3, Tp);
+				   }
+			      }
+			      T1o = T1n * T3;
+			      T2g = T1n * Tv;
+			      TN = Tm * Tv;
+			      TJ = Tm * Tt;
+			      T1t = T1n * T6;
+			      T2c = T1n * Tt;
+			      TO = FNMS(Tq, Tt, TN);
+			      TK = FMA(Tq, Tv, TJ);
+			 }
+		    }
+		    {
+			 E Te, T2C, T4L, T57, T58, TD, T2H, T4H, T3J, T3Z, T11, T2v, T2P, T3P, T4d;
+			 E T4z, T3n, T43, T2r, T2z, T3b, T3T, T4n, T4v, T3u, T42, T20, T2y, T34, T3S;
+			 E T4k, T4w, T1c, T19, T1d, T3y, T1w, T2U, T1g, T1j, T1l;
+			 {
+			      E T2d, T2h, T2k, T1q, T1u, T2n, TL, TI, TM, T3F, TZ, T2N, TP, TS, TU;
+			      {
+				   E T1, T4K, T8, T9, Tc;
+				   T1 = Rp[0];
+				   T4K = Rm[0];
+				   T8 = Rp[WS(rs, 5)];
+				   T2d = FMA(T1p, Tv, T2c);
+				   T2h = FNMS(T1p, Tt, T2g);
+				   T2k = FMA(T1p, T6, T1o);
+				   T1q = FNMS(T1p, T6, T1o);
+				   T1u = FMA(T1p, T3, T1t);
+				   T2n = FNMS(T1p, T3, T1t);
+				   T9 = T7 * T8;
+				   Tc = Rm[WS(rs, 5)];
+				   {
+					E Tx, Ts, T2F, TC, T2E;
+					{
+					     E Tn, Tr, To, T2D, T4J, Ty, TB, Td, T4I;
+					     Tn = Ip[WS(rs, 2)];
+					     Tr = Im[WS(rs, 2)];
+					     Tx = Ip[WS(rs, 7)];
+					     Td = FMA(Tb, Tc, T9);
+					     T4I = T7 * Tc;
+					     To = Tm * Tn;
+					     T2D = Tm * Tr;
+					     Te = T1 + Td;
+					     T2C = T1 - Td;
+					     T4J = FNMS(Tb, T8, T4I);
+					     Ty = Tw * Tx;
+					     TB = Im[WS(rs, 7)];
+					     Ts = FMA(Tq, Tr, To);
+					     T4L = T4J + T4K;
+					     T57 = T4K - T4J;
+					     T2F = Tw * TB;
+					     TC = FMA(TA, TB, Ty);
+					     T2E = FNMS(Tq, Tn, T2D);
+					}
+					{
+					     E TF, TG, TH, TW, TY, T2G, T3E, TX, T2M;
+					     TF = Rp[WS(rs, 2)];
+					     T2G = FNMS(TA, Tx, T2F);
+					     T58 = Ts - TC;
+					     TD = Ts + TC;
+					     TG = Ti * TF;
+					     T2H = T2E - T2G;
+					     T4H = T2E + T2G;
+					     TH = Rm[WS(rs, 2)];
+					     TW = Ip[WS(rs, 9)];
+					     TY = Im[WS(rs, 9)];
+					     TL = Rp[WS(rs, 7)];
+					     TI = FMA(Tl, TH, TG);
+					     T3E = Ti * TH;
+					     TX = Tt * TW;
+					     T2M = Tt * TY;
+					     TM = TK * TL;
+					     T3F = FNMS(Tl, TF, T3E);
+					     TZ = FMA(Tv, TY, TX);
+					     T2N = FNMS(Tv, TW, T2M);
+					     TP = Rm[WS(rs, 7)];
+					     TS = Ip[WS(rs, 4)];
+					     TU = Im[WS(rs, 4)];
+					}
+				   }
+			      }
+			      {
+				   E T27, T26, T28, T3j, T2p, T39, T29, T2e, T2i;
+				   {
+					E T22, T23, T25, T2l, T2o, T3i, T2m, T38;
+					{
+					     E TR, T2J, T3H, TV, T2L, T4b, T3I;
+					     T22 = Rp[WS(rs, 6)];
+					     {
+						  E TQ, T3G, TT, T2K;
+						  TQ = FMA(TO, TP, TM);
+						  T3G = TK * TP;
+						  TT = T3 * TS;
+						  T2K = T3 * TU;
+						  TR = TI + TQ;
+						  T2J = TI - TQ;
+						  T3H = FNMS(TO, TL, T3G);
+						  TV = FMA(T6, TU, TT);
+						  T2L = FNMS(T6, TS, T2K);
+						  T23 = T21 * T22;
+					     }
+					     T4b = T3F + T3H;
+					     T3I = T3F - T3H;
+					     {
+						  E T10, T3D, T4c, T2O;
+						  T10 = TV + TZ;
+						  T3D = TZ - TV;
+						  T4c = T2L + T2N;
+						  T2O = T2L - T2N;
+						  T3J = T3D - T3I;
+						  T3Z = T3I + T3D;
+						  T11 = TR - T10;
+						  T2v = TR + T10;
+						  T2P = T2J - T2O;
+						  T3P = T2J + T2O;
+						  T4d = T4b + T4c;
+						  T4z = T4c - T4b;
+						  T25 = Rm[WS(rs, 6)];
+					     }
+					}
+					T2l = Ip[WS(rs, 3)];
+					T2o = Im[WS(rs, 3)];
+					T27 = Rp[WS(rs, 1)];
+					T26 = FMA(T24, T25, T23);
+					T3i = T21 * T25;
+					T2m = T2k * T2l;
+					T38 = T2k * T2o;
+					T28 = T1n * T27;
+					T3j = FNMS(T24, T22, T3i);
+					T2p = FMA(T2n, T2o, T2m);
+					T39 = FNMS(T2n, T2l, T38);
+					T29 = Rm[WS(rs, 1)];
+					T2e = Ip[WS(rs, 8)];
+					T2i = Im[WS(rs, 8)];
+				   }
+				   {
+					E T1I, T1F, T1J, T3q, T1Y, T32, T1M, T1Q, T1T;
+					{
+					     E T1B, T1C, T1E, T1V, T1X, T3p, T1W, T31;
+					     {
+						  E T2b, T35, T3l, T2j, T37, T4l, T3m;
+						  T1B = Rp[WS(rs, 4)];
+						  {
+						       E T2a, T3k, T2f, T36;
+						       T2a = FMA(T1p, T29, T28);
+						       T3k = T1n * T29;
+						       T2f = T2d * T2e;
+						       T36 = T2d * T2i;
+						       T2b = T26 + T2a;
+						       T35 = T26 - T2a;
+						       T3l = FNMS(T1p, T27, T3k);
+						       T2j = FMA(T2h, T2i, T2f);
+						       T37 = FNMS(T2h, T2e, T36);
+						       T1C = T1A * T1B;
+						  }
+						  T4l = T3j + T3l;
+						  T3m = T3j - T3l;
+						  {
+						       E T2q, T3h, T4m, T3a;
+						       T2q = T2j + T2p;
+						       T3h = T2p - T2j;
+						       T4m = T37 + T39;
+						       T3a = T37 - T39;
+						       T3n = T3h - T3m;
+						       T43 = T3m + T3h;
+						       T2r = T2b - T2q;
+						       T2z = T2b + T2q;
+						       T3b = T35 - T3a;
+						       T3T = T35 + T3a;
+						       T4n = T4l + T4m;
+						       T4v = T4m - T4l;
+						       T1E = Rm[WS(rs, 4)];
+						  }
+					     }
+					     T1V = Ip[WS(rs, 1)];
+					     T1X = Im[WS(rs, 1)];
+					     T1I = Rp[WS(rs, 9)];
+					     T1F = FMA(T1D, T1E, T1C);
+					     T3p = T1A * T1E;
+					     T1W = Tf * T1V;
+					     T31 = Tf * T1X;
+					     T1J = T1H * T1I;
+					     T3q = FNMS(T1D, T1B, T3p);
+					     T1Y = FMA(Th, T1X, T1W);
+					     T32 = FNMS(Th, T1V, T31);
+					     T1M = Rm[WS(rs, 9)];
+					     T1Q = Ip[WS(rs, 6)];
+					     T1T = Im[WS(rs, 6)];
+					}
+					{
+					     E T14, T15, T18, T1r, T1v, T3x, T1s, T2T;
+					     {
+						  E T1O, T2Y, T3s, T1U, T30, T4i, T3t;
+						  T14 = Rp[WS(rs, 8)];
+						  {
+						       E T1N, T3r, T1R, T2Z;
+						       T1N = FMA(T1L, T1M, T1J);
+						       T3r = T1H * T1M;
+						       T1R = T1P * T1Q;
+						       T2Z = T1P * T1T;
+						       T1O = T1F + T1N;
+						       T2Y = T1F - T1N;
+						       T3s = FNMS(T1L, T1I, T3r);
+						       T1U = FMA(T1S, T1T, T1R);
+						       T30 = FNMS(T1S, T1Q, T2Z);
+						       T15 = T13 * T14;
+						  }
+						  T4i = T3q + T3s;
+						  T3t = T3q - T3s;
+						  {
+						       E T1Z, T3o, T4j, T33;
+						       T1Z = T1U + T1Y;
+						       T3o = T1Y - T1U;
+						       T4j = T30 + T32;
+						       T33 = T30 - T32;
+						       T3u = T3o - T3t;
+						       T42 = T3t + T3o;
+						       T20 = T1O - T1Z;
+						       T2y = T1O + T1Z;
+						       T34 = T2Y - T33;
+						       T3S = T2Y + T33;
+						       T4k = T4i + T4j;
+						       T4w = T4j - T4i;
+						       T18 = Rm[WS(rs, 8)];
+						  }
+					     }
+					     T1r = Ip[WS(rs, 5)];
+					     T1v = Im[WS(rs, 5)];
+					     T1c = Rp[WS(rs, 3)];
+					     T19 = FMA(T17, T18, T15);
+					     T3x = T13 * T18;
+					     T1s = T1q * T1r;
+					     T2T = T1q * T1v;
+					     T1d = T1b * T1c;
+					     T3y = FNMS(T17, T14, T3x);
+					     T1w = FMA(T1u, T1v, T1s);
+					     T2U = FNMS(T1u, T1r, T2T);
+					     T1g = Rm[WS(rs, 3)];
+					     T1j = Ip[0];
+					     T1l = Im[0];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3C, T40, T2W, T3Q, T4M, T4E, T4F, T4U, T4S;
+			      {
+				   E T4W, T2u, T2w, T4g, T4V, T4D, T4B, T54, T56, T4Y, T4u, T4C;
+				   {
+					E T4x, TE, T53, T1z, T2s, T52, T4A, T4t, T4s, T2t;
+					{
+					     E T1i, T2Q, T3A, T1m, T2S;
+					     T4x = T4v - T4w;
+					     T4W = T4w + T4v;
+					     {
+						  E T1h, T3z, T1k, T2R;
+						  T1h = FMA(T1f, T1g, T1d);
+						  T3z = T1b * T1g;
+						  T1k = T2 * T1j;
+						  T2R = T2 * T1l;
+						  T1i = T19 + T1h;
+						  T2Q = T19 - T1h;
+						  T3A = FNMS(T1f, T1c, T3z);
+						  T1m = FMA(T5, T1l, T1k);
+						  T2S = FNMS(T5, T1j, T2R);
+					     }
+					     TE = Te - TD;
+					     T2u = Te + TD;
+					     {
+						  E T4e, T3B, T1x, T3w;
+						  T4e = T3y + T3A;
+						  T3B = T3y - T3A;
+						  T1x = T1m + T1w;
+						  T3w = T1w - T1m;
+						  {
+						       E T4f, T2V, T1y, T4y;
+						       T4f = T2S + T2U;
+						       T2V = T2S - T2U;
+						       T3C = T3w - T3B;
+						       T40 = T3B + T3w;
+						       T1y = T1i - T1x;
+						       T2w = T1i + T1x;
+						       T2W = T2Q - T2V;
+						       T3Q = T2Q + T2V;
+						       T4g = T4e + T4f;
+						       T4y = T4f - T4e;
+						       T53 = T1y - T11;
+						       T1z = T11 + T1y;
+						       T2s = T20 + T2r;
+						       T52 = T20 - T2r;
+						       T4V = T4z + T4y;
+						       T4A = T4y - T4z;
+						  }
+					     }
+					}
+					T4t = T1z - T2s;
+					T2t = T1z + T2s;
+					T4D = FMA(KP618033988, T4x, T4A);
+					T4B = FNMS(KP618033988, T4A, T4x);
+					T54 = FMA(KP618033988, T53, T52);
+					T56 = FNMS(KP618033988, T52, T53);
+					Rm[WS(rs, 9)] = TE + T2t;
+					T4s = FNMS(KP250000000, T2t, TE);
+					T4Y = T4L - T4H;
+					T4M = T4H + T4L;
+					T4u = FNMS(KP559016994, T4t, T4s);
+					T4C = FMA(KP559016994, T4t, T4s);
+				   }
+				   {
+					E T2x, T4Q, T4p, T4r, T4R, T2A, T51, T55;
+					{
+					     E T4h, T50, T4X, T4o, T4Z;
+					     T4E = T4d + T4g;
+					     T4h = T4d - T4g;
+					     Rm[WS(rs, 1)] = FMA(KP951056516, T4B, T4u);
+					     Rp[WS(rs, 2)] = FNMS(KP951056516, T4B, T4u);
+					     Rp[WS(rs, 6)] = FMA(KP951056516, T4D, T4C);
+					     Rm[WS(rs, 5)] = FNMS(KP951056516, T4D, T4C);
+					     T50 = T4W - T4V;
+					     T4X = T4V + T4W;
+					     T4o = T4k - T4n;
+					     T4F = T4k + T4n;
+					     T2x = T2v + T2w;
+					     T4Q = T2v - T2w;
+					     Im[WS(rs, 9)] = T4X - T4Y;
+					     T4Z = FMA(KP250000000, T4X, T4Y);
+					     T4p = FMA(KP618033988, T4o, T4h);
+					     T4r = FNMS(KP618033988, T4h, T4o);
+					     T4R = T2z - T2y;
+					     T2A = T2y + T2z;
+					     T51 = FNMS(KP559016994, T50, T4Z);
+					     T55 = FMA(KP559016994, T50, T4Z);
+					}
+					{
+					     E T49, T48, T2B, T4a, T4q;
+					     T2B = T2x + T2A;
+					     T49 = T2x - T2A;
+					     Ip[WS(rs, 2)] = FMA(KP951056516, T54, T51);
+					     Im[WS(rs, 1)] = FMS(KP951056516, T54, T51);
+					     Ip[WS(rs, 6)] = FMA(KP951056516, T56, T55);
+					     Im[WS(rs, 5)] = FMS(KP951056516, T56, T55);
+					     Rp[0] = T2u + T2B;
+					     T48 = FNMS(KP250000000, T2B, T2u);
+					     T4a = FMA(KP559016994, T49, T48);
+					     T4q = FNMS(KP559016994, T49, T48);
+					     T4U = FMA(KP618033988, T4Q, T4R);
+					     T4S = FNMS(KP618033988, T4R, T4Q);
+					     Rm[WS(rs, 3)] = FMA(KP951056516, T4p, T4a);
+					     Rp[WS(rs, 4)] = FNMS(KP951056516, T4p, T4a);
+					     Rp[WS(rs, 8)] = FMA(KP951056516, T4r, T4q);
+					     Rm[WS(rs, 7)] = FNMS(KP951056516, T4r, T4q);
+					}
+				   }
+			      }
+			      {
+				   E T3O, T5u, T5w, T5o, T5q, T5n;
+				   {
+					E T5m, T5l, T2I, T4O, T3N, T3L, T2X, T5s, T4N, T5t, T3c, T3v, T3K, T4G;
+					T5m = T3u + T3n;
+					T3v = T3n - T3u;
+					T3K = T3C - T3J;
+					T5l = T3J + T3C;
+					T3O = T2C + T2H;
+					T2I = T2C - T2H;
+					T4O = T4E - T4F;
+					T4G = T4E + T4F;
+					T3N = FMA(KP618033988, T3v, T3K);
+					T3L = FNMS(KP618033988, T3K, T3v);
+					T2X = T2P + T2W;
+					T5s = T2P - T2W;
+					Ip[0] = T4G + T4M;
+					T4N = FNMS(KP250000000, T4G, T4M);
+					T5t = T34 - T3b;
+					T3c = T34 + T3b;
+					{
+					     E T3f, T3e, T4P, T4T, T3d, T3M, T3g;
+					     T4P = FMA(KP559016994, T4O, T4N);
+					     T4T = FNMS(KP559016994, T4O, T4N);
+					     T3f = T2X - T3c;
+					     T3d = T2X + T3c;
+					     Ip[WS(rs, 4)] = FMA(KP951056516, T4S, T4P);
+					     Im[WS(rs, 3)] = FMS(KP951056516, T4S, T4P);
+					     Ip[WS(rs, 8)] = FMA(KP951056516, T4U, T4T);
+					     Im[WS(rs, 7)] = FMS(KP951056516, T4U, T4T);
+					     Rm[WS(rs, 4)] = T2I + T3d;
+					     T3e = FNMS(KP250000000, T3d, T2I);
+					     T5u = FMA(KP618033988, T5t, T5s);
+					     T5w = FNMS(KP618033988, T5s, T5t);
+					     T5o = T58 + T57;
+					     T59 = T57 - T58;
+					     T3M = FMA(KP559016994, T3f, T3e);
+					     T3g = FNMS(KP559016994, T3f, T3e);
+					     Rp[WS(rs, 7)] = FNMS(KP951056516, T3L, T3g);
+					     Rp[WS(rs, 3)] = FMA(KP951056516, T3L, T3g);
+					     Rm[0] = FNMS(KP951056516, T3N, T3M);
+					     Rm[WS(rs, 8)] = FMA(KP951056516, T3N, T3M);
+					     T5q = T5l - T5m;
+					     T5n = T5l + T5m;
+					}
+				   }
+				   {
+					E T5a, T5b, T47, T45, T5h, T5g, T3V, T3X, T41, T44, T5p, T3W, T46, T3Y;
+					T5a = T3Z + T40;
+					T41 = T3Z - T40;
+					T44 = T42 - T43;
+					T5b = T42 + T43;
+					Im[WS(rs, 4)] = T5n - T5o;
+					T5p = FMA(KP250000000, T5n, T5o);
+					T47 = FNMS(KP618033988, T41, T44);
+					T45 = FMA(KP618033988, T44, T41);
+					{
+					     E T5r, T5v, T3R, T3U;
+					     T5r = FNMS(KP559016994, T5q, T5p);
+					     T5v = FMA(KP559016994, T5q, T5p);
+					     T3R = T3P + T3Q;
+					     T5h = T3P - T3Q;
+					     T5g = T3S - T3T;
+					     T3U = T3S + T3T;
+					     Im[0] = -(FMA(KP951056516, T5u, T5r));
+					     Im[WS(rs, 8)] = FMS(KP951056516, T5u, T5r);
+					     Ip[WS(rs, 7)] = FMA(KP951056516, T5w, T5v);
+					     Ip[WS(rs, 3)] = FNMS(KP951056516, T5w, T5v);
+					     T3V = T3R + T3U;
+					     T3X = T3R - T3U;
+					}
+					Rp[WS(rs, 5)] = T3O + T3V;
+					T3W = FNMS(KP250000000, T3V, T3O);
+					T5i = FNMS(KP618033988, T5h, T5g);
+					T5k = FMA(KP618033988, T5g, T5h);
+					T46 = FNMS(KP559016994, T3X, T3W);
+					T3Y = FMA(KP559016994, T3X, T3W);
+					Rp[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
+					Rp[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
+					Rm[WS(rs, 2)] = FNMS(KP951056516, T47, T46);
+					Rm[WS(rs, 6)] = FMA(KP951056516, T47, T46);
+					T5e = T5a - T5b;
+					T5c = T5a + T5b;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 5)] = T5c + T59;
+	       T5d = FNMS(KP250000000, T5c, T59);
+	       T5j = FMA(KP559016994, T5e, T5d);
+	       T5f = FNMS(KP559016994, T5e, T5d);
+	       Im[WS(rs, 2)] = -(FMA(KP951056516, T5i, T5f));
+	       Im[WS(rs, 6)] = FMS(KP951056516, T5i, T5f);
+	       Ip[WS(rs, 9)] = FMA(KP951056516, T5k, T5j);
+	       Ip[WS(rs, 1)] = FNMS(KP951056516, T5k, T5j);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cf2_20", twinstr, &GENUS, {136, 58, 140, 0} };
+
+void X(codelet_hc2cf2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_20, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cf2_20 -include hc2cf.h */
+
+/*
+ * This function contains 276 FP additions, 164 FP multiplications,
+ * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
+ * 123 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
+	       E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
+	       E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
+	       {
+		    E T7, T16, Ta, T13, T4, T17, Tb, T12;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 T1h = Tm - Tn;
+			 T1f = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 T16 = Tg * T6;
+			 Ta = T2 * T6;
+			 T13 = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 T17 = Ti * T3;
+			 Tb = T5 * T3;
+			 T12 = Tg * T3;
+		    }
+		    T8 = T4 - T7;
+		    T14 = T12 + T13;
+		    T1Q = T16 + T17;
+		    Tc = Ta + Tb;
+		    T1O = T12 - T13;
+		    T1v = Ta - Tb;
+		    T18 = T16 - T17;
+		    T1t = T4 + T7;
+		    {
+			 E T1l, T1m, T1g, T1i;
+			 T1l = T1f * T6;
+			 T1m = T1h * T3;
+			 T1n = T1l + T1m;
+			 T24 = T1l - T1m;
+			 T1g = T1f * T3;
+			 T1i = T1h * T6;
+			 T1j = T1g - T1i;
+			 T22 = T1g + T1i;
+			 {
+			      E Tl, Tp, Ts, Tt;
+			      Tl = Tk * T3;
+			      Tp = To * T6;
+			      Tq = Tl + Tp;
+			      Ts = Tk * T6;
+			      Tt = To * T3;
+			      Tu = Ts - Tt;
+			      T1E = Tl - Tp;
+			      T1G = Ts + Tt;
+			      Tx = W[6];
+			      Ty = W[7];
+			      Tz = FMA(Tk, Tx, To * Ty);
+			      TJ = FMA(Tq, Tx, Tu * Ty);
+			      T1Z = FNMS(T1h, Tx, T1f * Ty);
+			      TB = FNMS(To, Tx, Tk * Ty);
+			      T1X = FMA(T1f, Tx, T1h * Ty);
+			      T1A = FNMS(T5, Tx, T2 * Ty);
+			      TZ = FNMS(Ti, Tx, Tg * Ty);
+			      TL = FNMS(Tu, Tx, Tq * Ty);
+			      T1y = FMA(T2, Tx, T5 * Ty);
+			      TX = FMA(Tg, Tx, Ti * Ty);
+			 }
+		    }
+	       }
+	       {
+		    E TF, T2b, T4D, T4M, T2K, T3r, T4a, T4m, T1N, T28, T29, T3J, T3M, T44, T3U;
+		    E T3V, T4j, T2f, T2g, T2h, T2n, T2s, T4K, T3g, T3h, T4z, T3n, T3o, T3p, T30;
+		    E T35, T36, TW, T1r, T1s, T3C, T3F, T43, T3X, T3Y, T4k, T2c, T2d, T2e, T2y;
+		    E T2D, T4J, T3d, T3e, T4y, T3k, T3l, T3m, T2P, T2U, T2V;
+		    {
+			 E T1, T48, Te, T47, Tw, T2H, TD, T2I, T9, Td;
+			 T1 = Rp[0];
+			 T48 = Rm[0];
+			 T9 = Rp[WS(rs, 5)];
+			 Td = Rm[WS(rs, 5)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T47 = FNMS(Tc, T9, T8 * Td);
+			 {
+			      E Tr, Tv, TA, TC;
+			      Tr = Ip[WS(rs, 2)];
+			      Tv = Im[WS(rs, 2)];
+			      Tw = FMA(Tq, Tr, Tu * Tv);
+			      T2H = FNMS(Tu, Tr, Tq * Tv);
+			      TA = Ip[WS(rs, 7)];
+			      TC = Im[WS(rs, 7)];
+			      TD = FMA(Tz, TA, TB * TC);
+			      T2I = FNMS(TB, TA, Tz * TC);
+			 }
+			 {
+			      E Tf, TE, T4B, T4C;
+			      Tf = T1 + Te;
+			      TE = Tw + TD;
+			      TF = Tf - TE;
+			      T2b = Tf + TE;
+			      T4B = T48 - T47;
+			      T4C = Tw - TD;
+			      T4D = T4B - T4C;
+			      T4M = T4C + T4B;
+			 }
+			 {
+			      E T2G, T2J, T46, T49;
+			      T2G = T1 - Te;
+			      T2J = T2H - T2I;
+			      T2K = T2G - T2J;
+			      T3r = T2G + T2J;
+			      T46 = T2H + T2I;
+			      T49 = T47 + T48;
+			      T4a = T46 + T49;
+			      T4m = T49 - T46;
+			 }
+		    }
+		    {
+			 E T1D, T3H, T2l, T2W, T27, T3L, T2r, T34, T1M, T3I, T2m, T2Z, T1W, T3K, T2q;
+			 E T31;
+			 {
+			      E T1x, T2j, T1C, T2k;
+			      {
+				   E T1u, T1w, T1z, T1B;
+				   T1u = Rp[WS(rs, 4)];
+				   T1w = Rm[WS(rs, 4)];
+				   T1x = FMA(T1t, T1u, T1v * T1w);
+				   T2j = FNMS(T1v, T1u, T1t * T1w);
+				   T1z = Rp[WS(rs, 9)];
+				   T1B = Rm[WS(rs, 9)];
+				   T1C = FMA(T1y, T1z, T1A * T1B);
+				   T2k = FNMS(T1A, T1z, T1y * T1B);
+			      }
+			      T1D = T1x + T1C;
+			      T3H = T2j + T2k;
+			      T2l = T2j - T2k;
+			      T2W = T1x - T1C;
+			 }
+			 {
+			      E T21, T32, T26, T33;
+			      {
+				   E T1Y, T20, T23, T25;
+				   T1Y = Ip[WS(rs, 8)];
+				   T20 = Im[WS(rs, 8)];
+				   T21 = FMA(T1X, T1Y, T1Z * T20);
+				   T32 = FNMS(T1Z, T1Y, T1X * T20);
+				   T23 = Ip[WS(rs, 3)];
+				   T25 = Im[WS(rs, 3)];
+				   T26 = FMA(T22, T23, T24 * T25);
+				   T33 = FNMS(T24, T23, T22 * T25);
+			      }
+			      T27 = T21 + T26;
+			      T3L = T32 + T33;
+			      T2r = T21 - T26;
+			      T34 = T32 - T33;
+			 }
+			 {
+			      E T1I, T2X, T1L, T2Y;
+			      {
+				   E T1F, T1H, T1J, T1K;
+				   T1F = Ip[WS(rs, 6)];
+				   T1H = Im[WS(rs, 6)];
+				   T1I = FMA(T1E, T1F, T1G * T1H);
+				   T2X = FNMS(T1G, T1F, T1E * T1H);
+				   T1J = Ip[WS(rs, 1)];
+				   T1K = Im[WS(rs, 1)];
+				   T1L = FMA(Tg, T1J, Ti * T1K);
+				   T2Y = FNMS(Ti, T1J, Tg * T1K);
+			      }
+			      T1M = T1I + T1L;
+			      T3I = T2X + T2Y;
+			      T2m = T1I - T1L;
+			      T2Z = T2X - T2Y;
+			 }
+			 {
+			      E T1S, T2o, T1V, T2p;
+			      {
+				   E T1P, T1R, T1T, T1U;
+				   T1P = Rp[WS(rs, 6)];
+				   T1R = Rm[WS(rs, 6)];
+				   T1S = FMA(T1O, T1P, T1Q * T1R);
+				   T2o = FNMS(T1Q, T1P, T1O * T1R);
+				   T1T = Rp[WS(rs, 1)];
+				   T1U = Rm[WS(rs, 1)];
+				   T1V = FMA(T1f, T1T, T1h * T1U);
+				   T2p = FNMS(T1h, T1T, T1f * T1U);
+			      }
+			      T1W = T1S + T1V;
+			      T3K = T2o + T2p;
+			      T2q = T2o - T2p;
+			      T31 = T1S - T1V;
+			 }
+			 T1N = T1D - T1M;
+			 T28 = T1W - T27;
+			 T29 = T1N + T28;
+			 T3J = T3H + T3I;
+			 T3M = T3K + T3L;
+			 T44 = T3J + T3M;
+			 T3U = T3H - T3I;
+			 T3V = T3L - T3K;
+			 T4j = T3V - T3U;
+			 T2f = T1D + T1M;
+			 T2g = T1W + T27;
+			 T2h = T2f + T2g;
+			 T2n = T2l + T2m;
+			 T2s = T2q + T2r;
+			 T4K = T2n + T2s;
+			 T3g = T2l - T2m;
+			 T3h = T2q - T2r;
+			 T4z = T3g + T3h;
+			 T3n = T2W + T2Z;
+			 T3o = T31 + T34;
+			 T3p = T3n + T3o;
+			 T30 = T2W - T2Z;
+			 T35 = T31 - T34;
+			 T36 = T30 + T35;
+		    }
+		    {
+			 E TO, T3A, T2w, T2L, T1q, T3E, T2z, T2T, TV, T3B, T2x, T2O, T1b, T3D, T2C;
+			 E T2Q;
+			 {
+			      E TI, T2u, TN, T2v;
+			      {
+				   E TG, TH, TK, TM;
+				   TG = Rp[WS(rs, 2)];
+				   TH = Rm[WS(rs, 2)];
+				   TI = FMA(Tk, TG, To * TH);
+				   T2u = FNMS(To, TG, Tk * TH);
+				   TK = Rp[WS(rs, 7)];
+				   TM = Rm[WS(rs, 7)];
+				   TN = FMA(TJ, TK, TL * TM);
+				   T2v = FNMS(TL, TK, TJ * TM);
+			      }
+			      TO = TI + TN;
+			      T3A = T2u + T2v;
+			      T2w = T2u - T2v;
+			      T2L = TI - TN;
+			 }
+			 {
+			      E T1e, T2R, T1p, T2S;
+			      {
+				   E T1c, T1d, T1k, T1o;
+				   T1c = Ip[0];
+				   T1d = Im[0];
+				   T1e = FMA(T2, T1c, T5 * T1d);
+				   T2R = FNMS(T5, T1c, T2 * T1d);
+				   T1k = Ip[WS(rs, 5)];
+				   T1o = Im[WS(rs, 5)];
+				   T1p = FMA(T1j, T1k, T1n * T1o);
+				   T2S = FNMS(T1n, T1k, T1j * T1o);
+			      }
+			      T1q = T1e + T1p;
+			      T3E = T2R + T2S;
+			      T2z = T1p - T1e;
+			      T2T = T2R - T2S;
+			 }
+			 {
+			      E TR, T2M, TU, T2N;
+			      {
+				   E TP, TQ, TS, TT;
+				   TP = Ip[WS(rs, 4)];
+				   TQ = Im[WS(rs, 4)];
+				   TR = FMA(T3, TP, T6 * TQ);
+				   T2M = FNMS(T6, TP, T3 * TQ);
+				   TS = Ip[WS(rs, 9)];
+				   TT = Im[WS(rs, 9)];
+				   TU = FMA(Tx, TS, Ty * TT);
+				   T2N = FNMS(Ty, TS, Tx * TT);
+			      }
+			      TV = TR + TU;
+			      T3B = T2M + T2N;
+			      T2x = TR - TU;
+			      T2O = T2M - T2N;
+			 }
+			 {
+			      E T11, T2A, T1a, T2B;
+			      {
+				   E TY, T10, T15, T19;
+				   TY = Rp[WS(rs, 8)];
+				   T10 = Rm[WS(rs, 8)];
+				   T11 = FMA(TX, TY, TZ * T10);
+				   T2A = FNMS(TZ, TY, TX * T10);
+				   T15 = Rp[WS(rs, 3)];
+				   T19 = Rm[WS(rs, 3)];
+				   T1a = FMA(T14, T15, T18 * T19);
+				   T2B = FNMS(T18, T15, T14 * T19);
+			      }
+			      T1b = T11 + T1a;
+			      T3D = T2A + T2B;
+			      T2C = T2A - T2B;
+			      T2Q = T11 - T1a;
+			 }
+			 TW = TO - TV;
+			 T1r = T1b - T1q;
+			 T1s = TW + T1r;
+			 T3C = T3A + T3B;
+			 T3F = T3D + T3E;
+			 T43 = T3C + T3F;
+			 T3X = T3A - T3B;
+			 T3Y = T3D - T3E;
+			 T4k = T3X + T3Y;
+			 T2c = TO + TV;
+			 T2d = T1b + T1q;
+			 T2e = T2c + T2d;
+			 T2y = T2w + T2x;
+			 T2D = T2z - T2C;
+			 T4J = T2D - T2y;
+			 T3d = T2w - T2x;
+			 T3e = T2C + T2z;
+			 T4y = T3d + T3e;
+			 T3k = T2L + T2O;
+			 T3l = T2Q + T2T;
+			 T3m = T3k + T3l;
+			 T2P = T2L - T2O;
+			 T2U = T2Q - T2T;
+			 T2V = T2P + T2U;
+		    }
+		    {
+			 E T3S, T2a, T3R, T40, T42, T3W, T3Z, T41, T3T;
+			 T3S = KP559016994 * (T1s - T29);
+			 T2a = T1s + T29;
+			 T3R = FNMS(KP250000000, T2a, TF);
+			 T3W = T3U + T3V;
+			 T3Z = T3X - T3Y;
+			 T40 = FNMS(KP587785252, T3Z, KP951056516 * T3W);
+			 T42 = FMA(KP951056516, T3Z, KP587785252 * T3W);
+			 Rm[WS(rs, 9)] = TF + T2a;
+			 T41 = T3S + T3R;
+			 Rm[WS(rs, 5)] = T41 - T42;
+			 Rp[WS(rs, 6)] = T41 + T42;
+			 T3T = T3R - T3S;
+			 Rp[WS(rs, 2)] = T3T - T40;
+			 Rm[WS(rs, 1)] = T3T + T40;
+		    }
+		    {
+			 E T4r, T4l, T4q, T4p, T4t, T4n, T4o, T4u, T4s;
+			 T4r = KP559016994 * (T4k + T4j);
+			 T4l = T4j - T4k;
+			 T4q = FMA(KP250000000, T4l, T4m);
+			 T4n = T1r - TW;
+			 T4o = T1N - T28;
+			 T4p = FMA(KP587785252, T4n, KP951056516 * T4o);
+			 T4t = FNMS(KP587785252, T4o, KP951056516 * T4n);
+			 Im[WS(rs, 9)] = T4l - T4m;
+			 T4u = T4r + T4q;
+			 Im[WS(rs, 5)] = T4t - T4u;
+			 Ip[WS(rs, 6)] = T4t + T4u;
+			 T4s = T4q - T4r;
+			 Im[WS(rs, 1)] = T4p - T4s;
+			 Ip[WS(rs, 2)] = T4p + T4s;
+		    }
+		    {
+			 E T3x, T2i, T3y, T3O, T3Q, T3G, T3N, T3P, T3z;
+			 T3x = KP559016994 * (T2e - T2h);
+			 T2i = T2e + T2h;
+			 T3y = FNMS(KP250000000, T2i, T2b);
+			 T3G = T3C - T3F;
+			 T3N = T3J - T3M;
+			 T3O = FMA(KP951056516, T3G, KP587785252 * T3N);
+			 T3Q = FNMS(KP587785252, T3G, KP951056516 * T3N);
+			 Rp[0] = T2b + T2i;
+			 T3P = T3y - T3x;
+			 Rm[WS(rs, 7)] = T3P - T3Q;
+			 Rp[WS(rs, 8)] = T3P + T3Q;
+			 T3z = T3x + T3y;
+			 Rp[WS(rs, 4)] = T3z - T3O;
+			 Rm[WS(rs, 3)] = T3z + T3O;
+		    }
+		    {
+			 E T4e, T45, T4f, T4d, T4h, T4b, T4c, T4i, T4g;
+			 T4e = KP559016994 * (T43 - T44);
+			 T45 = T43 + T44;
+			 T4f = FNMS(KP250000000, T45, T4a);
+			 T4b = T2c - T2d;
+			 T4c = T2f - T2g;
+			 T4d = FMA(KP951056516, T4b, KP587785252 * T4c);
+			 T4h = FNMS(KP951056516, T4c, KP587785252 * T4b);
+			 Ip[0] = T45 + T4a;
+			 T4i = T4f - T4e;
+			 Im[WS(rs, 7)] = T4h - T4i;
+			 Ip[WS(rs, 8)] = T4h + T4i;
+			 T4g = T4e + T4f;
+			 Im[WS(rs, 3)] = T4d - T4g;
+			 Ip[WS(rs, 4)] = T4d + T4g;
+		    }
+		    {
+			 E T39, T37, T38, T2F, T3b, T2t, T2E, T3c, T3a;
+			 T39 = KP559016994 * (T2V - T36);
+			 T37 = T2V + T36;
+			 T38 = FNMS(KP250000000, T37, T2K);
+			 T2t = T2n - T2s;
+			 T2E = T2y + T2D;
+			 T2F = FNMS(KP587785252, T2E, KP951056516 * T2t);
+			 T3b = FMA(KP951056516, T2E, KP587785252 * T2t);
+			 Rm[WS(rs, 4)] = T2K + T37;
+			 T3c = T39 + T38;
+			 Rm[WS(rs, 8)] = T3b + T3c;
+			 Rm[0] = T3c - T3b;
+			 T3a = T38 - T39;
+			 Rp[WS(rs, 3)] = T2F + T3a;
+			 Rp[WS(rs, 7)] = T3a - T2F;
+		    }
+		    {
+			 E T4Q, T4L, T4R, T4P, T4U, T4N, T4O, T4T, T4S;
+			 T4Q = KP559016994 * (T4J + T4K);
+			 T4L = T4J - T4K;
+			 T4R = FMA(KP250000000, T4L, T4M);
+			 T4N = T2P - T2U;
+			 T4O = T30 - T35;
+			 T4P = FMA(KP951056516, T4N, KP587785252 * T4O);
+			 T4U = FNMS(KP587785252, T4N, KP951056516 * T4O);
+			 Im[WS(rs, 4)] = T4L - T4M;
+			 T4T = T4Q + T4R;
+			 Ip[WS(rs, 3)] = T4T - T4U;
+			 Ip[WS(rs, 7)] = T4U + T4T;
+			 T4S = T4Q - T4R;
+			 Im[WS(rs, 8)] = T4P + T4S;
+			 Im[0] = T4S - T4P;
+		    }
+		    {
+			 E T3q, T3s, T3t, T3j, T3v, T3f, T3i, T3w, T3u;
+			 T3q = KP559016994 * (T3m - T3p);
+			 T3s = T3m + T3p;
+			 T3t = FNMS(KP250000000, T3s, T3r);
+			 T3f = T3d - T3e;
+			 T3i = T3g - T3h;
+			 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
+			 T3v = FNMS(KP587785252, T3f, KP951056516 * T3i);
+			 Rp[WS(rs, 5)] = T3r + T3s;
+			 T3w = T3t - T3q;
+			 Rm[WS(rs, 6)] = T3v + T3w;
+			 Rm[WS(rs, 2)] = T3w - T3v;
+			 T3u = T3q + T3t;
+			 Rp[WS(rs, 1)] = T3j + T3u;
+			 Rp[WS(rs, 9)] = T3u - T3j;
+		    }
+		    {
+			 E T4A, T4E, T4F, T4x, T4I, T4v, T4w, T4H, T4G;
+			 T4A = KP559016994 * (T4y - T4z);
+			 T4E = T4y + T4z;
+			 T4F = FNMS(KP250000000, T4E, T4D);
+			 T4v = T3n - T3o;
+			 T4w = T3k - T3l;
+			 T4x = FNMS(KP587785252, T4w, KP951056516 * T4v);
+			 T4I = FMA(KP951056516, T4w, KP587785252 * T4v);
+			 Ip[WS(rs, 5)] = T4E + T4D;
+			 T4H = T4A + T4F;
+			 Ip[WS(rs, 1)] = T4H - T4I;
+			 Ip[WS(rs, 9)] = T4I + T4H;
+			 T4G = T4A - T4F;
+			 Im[WS(rs, 6)] = T4x + T4G;
+			 Im[WS(rs, 2)] = T4G - T4x;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cf2_20", twinstr, &GENUS, {204, 92, 72, 0} };
+
+void X(codelet_hc2cf2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_20, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1841 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:43 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hc2cf2_32 -include hc2cf.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 181 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T9A, T9z;
+	       {
+		    E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc;
+		    T2 = W[0];
+		    T8 = W[4];
+		    T3 = W[2];
+		    T6 = W[3];
+		    Te = W[6];
+		    Tr = T2 * T8;
+		    T18 = T3 * T8;
+		    T4 = T2 * T3;
+		    Ta = T2 * T6;
+		    Tz = T3 * Te;
+		    T1n = T8 * Te;
+		    T10 = T2 * Te;
+		    Ti = W[7];
+		    T5 = W[1];
+		    Tc = W[5];
+		    {
+			 E T34, T31, T2X, T2T, Tq, T46, T8H, T97, TH, T98, T4b, T8D, TZ, T7f, T4j;
+			 E T6t, T1g, T7g, T4q, T6u, T1J, T7m, T6y, T4z, T7l, T8d, T6x, T4G, T2k, T7o;
+			 E T7r, T8e, T6B, T4O, T6A, T4V, T7L, T3G, T6P, T61, T6M, T5E, T8n, T7J, T5s;
+			 E T6I, T2N, T7A, T55, T6F, T7x, T8i, T5L, T62, T43, T7G, T5S, T63, T7O, T8o;
+			 E T2U, T2R, T2V, T58, T3a, T5h, T2Y, T32, T35;
+			 {
+			      E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J;
+			      E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c;
+			      {
+				   E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW;
+				   E TS, Ty, T48, TG, T4a;
+				   {
+					E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14;
+					T1 = Rp[0];
+					TA = FMA(T6, Ti, Tz);
+					T1K = FNMS(T6, Ti, Tz);
+					T14 = T2 * Ti;
+					{
+					     E T1r, TD, T1c, Tv;
+					     T1r = T8 * Ti;
+					     TD = T3 * Ti;
+					     T11 = FNMS(T5, Ti, T10);
+					     T1C = FMA(T5, Ti, T10);
+					     TM = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     TJ = FNMS(T5, T6, T4);
+					     T7 = FMA(T5, T6, T4);
+					     T1o = FMA(Tc, Ti, T1n);
+					     T23 = FMA(T6, Tc, T18);
+					     T19 = FNMS(T6, Tc, T18);
+					     T1w = FNMS(T5, Tc, Tr);
+					     Ts = FMA(T5, Tc, Tr);
+					     T1c = T3 * Tc;
+					     Tv = T2 * Tc;
+					     T1F = FNMS(T5, Te, T14);
+					     T15 = FMA(T5, Te, T14);
+					     T1s = FNMS(Tc, Te, T1r);
+					     T1N = FMA(T6, Te, TD);
+					     TE = FNMS(T6, Te, TD);
+					     {
+						  E T1T, T3i, T3e, T1Q;
+						  T1T = TJ * Tc;
+						  T3i = TJ * Ti;
+						  T3e = TJ * Te;
+						  T1Q = TJ * T8;
+						  {
+						       E Tg, T2I, T2E, T9;
+						       Tg = T7 * Tc;
+						       T2I = T7 * Ti;
+						       T2E = T7 * Te;
+						       T9 = T7 * T8;
+						       {
+							    E T3q, T3m, T2v, T2r;
+							    T3q = T19 * Ti;
+							    T3m = T19 * Te;
+							    T2v = T1w * Ti;
+							    T2r = T1w * Te;
+							    {
+								 E T2W, T2S, T3P, T3L;
+								 T2W = T23 * Ti;
+								 T2S = T23 * Te;
+								 T3P = Ts * Ti;
+								 T3L = Ts * Te;
+								 T26 = FNMS(T6, T8, T1c);
+								 T1d = FMA(T6, T8, T1c);
+								 T1z = FMA(T5, T8, Tv);
+								 Tw = FNMS(T5, T8, Tv);
+								 T2b = FNMS(TM, T8, T1T);
+								 T1U = FMA(TM, T8, T1T);
+								 T3C = FNMS(TM, Te, T3i);
+								 T3j = FMA(TM, Te, T3i);
+								 T3z = FMA(TM, Ti, T3e);
+								 T3f = FNMS(TM, Ti, T3e);
+								 T1R = FNMS(TM, Tc, T1Q);
+								 T29 = FMA(TM, Tc, T1Q);
+								 TR = FNMS(Tb, T8, Tg);
+								 Th = FMA(Tb, T8, Tg);
+								 T34 = FMA(Tb, Te, T2I);
+								 T2J = FNMS(Tb, Te, T2I);
+								 T31 = FNMS(Tb, Ti, T2E);
+								 T2F = FMA(Tb, Ti, T2E);
+								 Td = FNMS(Tb, Tc, T9);
+								 TP = FMA(Tb, Tc, T9);
+								 T2X = FNMS(T26, Te, T2W);
+								 T2T = FMA(T26, Ti, T2S);
+								 T3r = FNMS(T1d, Te, T3q);
+								 T3n = FMA(T1d, Ti, T3m);
+								 T2w = FNMS(T1z, Te, T2v);
+								 T2s = FMA(T1z, Ti, T2r);
+								 T3Q = FNMS(Tw, Te, T3P);
+								 T3M = FMA(Tw, Ti, T3L);
+								 {
+								      E T1Y, T1S, T2f, T2a;
+								      T1Y = T1R * Ti;
+								      T1S = T1R * Te;
+								      T2f = T29 * Ti;
+								      T2a = T29 * Te;
+								      {
+									   E Tm, Tf, TV, TQ;
+									   Tm = Td * Ti;
+									   Tf = Td * Te;
+									   TV = TP * Ti;
+									   TQ = TP * Te;
+									   T1Z = FNMS(T1U, Te, T1Y);
+									   T1V = FMA(T1U, Ti, T1S);
+									   T2g = FNMS(T2b, Te, T2f);
+									   T2c = FMA(T2b, Ti, T2a);
+									   Tn = FNMS(Th, Te, Tm);
+									   Tj = FMA(Th, Ti, Tf);
+									   TW = FNMS(TR, Te, TV);
+									   TS = FMA(TR, Ti, TQ);
+									   T8G = Rm[0];
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					Tk = Rp[WS(rs, 8)];
+					To = Rm[WS(rs, 8)];
+					{
+					     E Tt, Tx, Tu, T47, TB, TF, TC, T49;
+					     {
+						  E Tl, T8E, Tp, T8F;
+						  Tt = Rp[WS(rs, 4)];
+						  Tx = Rm[WS(rs, 4)];
+						  Tl = Tj * Tk;
+						  T8E = Tj * To;
+						  Tu = Ts * Tt;
+						  T47 = Ts * Tx;
+						  Tp = FMA(Tn, To, Tl);
+						  T8F = FNMS(Tn, Tk, T8E);
+						  TB = Rp[WS(rs, 12)];
+						  TF = Rm[WS(rs, 12)];
+						  Tq = T1 + Tp;
+						  T46 = T1 - Tp;
+						  T8H = T8F + T8G;
+						  T97 = T8G - T8F;
+						  TC = TA * TB;
+						  T49 = TA * TF;
+					     }
+					     Ty = FMA(Tw, Tx, Tu);
+					     T48 = FNMS(Tw, Tt, T47);
+					     TG = FMA(TE, TF, TC);
+					     T4a = FNMS(TE, TB, T49);
+					}
+				   }
+				   {
+					E TT, TX, TO, T4f, TU, T4g;
+					{
+					     E TK, TN, TL, T4e;
+					     TK = Rp[WS(rs, 2)];
+					     TN = Rm[WS(rs, 2)];
+					     TH = Ty + TG;
+					     T98 = Ty - TG;
+					     T4b = T48 - T4a;
+					     T8D = T48 + T4a;
+					     TL = TJ * TK;
+					     T4e = TJ * TN;
+					     TT = Rp[WS(rs, 10)];
+					     TX = Rm[WS(rs, 10)];
+					     TO = FMA(TM, TN, TL);
+					     T4f = FNMS(TM, TK, T4e);
+					     TU = TS * TT;
+					     T4g = TS * TX;
+					}
+					{
+					     E T17, T4m, T1a, T1e, T4d, T4i;
+					     {
+						  E T12, T16, TY, T4h, T13, T4l;
+						  T12 = Rp[WS(rs, 14)];
+						  T16 = Rm[WS(rs, 14)];
+						  TY = FMA(TW, TX, TU);
+						  T4h = FNMS(TW, TT, T4g);
+						  T13 = T11 * T12;
+						  T4l = T11 * T16;
+						  TZ = TO + TY;
+						  T4d = TO - TY;
+						  T7f = T4f + T4h;
+						  T4i = T4f - T4h;
+						  T17 = FMA(T15, T16, T13);
+						  T4m = FNMS(T15, T12, T4l);
+					     }
+					     T4j = T4d + T4i;
+					     T6t = T4i - T4d;
+					     T1a = Rp[WS(rs, 6)];
+					     T1e = Rm[WS(rs, 6)];
+					     {
+						  E T1m, T4B, T1H, T4x, T1x, T1A, T1u, T4D, T1y, T4u;
+						  {
+						       E T1D, T1G, T1E, T4w;
+						       {
+							    E T1f, T4o, T4k, T4p;
+							    {
+								 E T1j, T1l, T1b, T4n, T1k, T4A;
+								 T1j = Rp[WS(rs, 1)];
+								 T1l = Rm[WS(rs, 1)];
+								 T1b = T19 * T1a;
+								 T4n = T19 * T1e;
+								 T1k = T7 * T1j;
+								 T4A = T7 * T1l;
+								 T1f = FMA(T1d, T1e, T1b);
+								 T4o = FNMS(T1d, T1a, T4n);
+								 T1m = FMA(Tb, T1l, T1k);
+								 T4B = FNMS(Tb, T1j, T4A);
+							    }
+							    T1g = T17 + T1f;
+							    T4k = T17 - T1f;
+							    T7g = T4m + T4o;
+							    T4p = T4m - T4o;
+							    T1D = Rp[WS(rs, 13)];
+							    T1G = Rm[WS(rs, 13)];
+							    T4q = T4k - T4p;
+							    T6u = T4k + T4p;
+							    T1E = T1C * T1D;
+							    T4w = T1C * T1G;
+						       }
+						       {
+							    E T1p, T1t, T1q, T4C;
+							    T1p = Rp[WS(rs, 9)];
+							    T1t = Rm[WS(rs, 9)];
+							    T1H = FMA(T1F, T1G, T1E);
+							    T4x = FNMS(T1F, T1D, T4w);
+							    T1q = T1o * T1p;
+							    T4C = T1o * T1t;
+							    T1x = Rp[WS(rs, 5)];
+							    T1A = Rm[WS(rs, 5)];
+							    T1u = FMA(T1s, T1t, T1q);
+							    T4D = FNMS(T1s, T1p, T4C);
+							    T1y = T1w * T1x;
+							    T4u = T1w * T1A;
+						       }
+						  }
+						  {
+						       E T4t, T1v, T7j, T4E, T1B, T4v;
+						       T4t = T1m - T1u;
+						       T1v = T1m + T1u;
+						       T7j = T4B + T4D;
+						       T4E = T4B - T4D;
+						       T1B = FMA(T1z, T1A, T1y);
+						       T4v = FNMS(T1z, T1x, T4u);
+						       {
+							    E T4F, T1I, T4y, T7k;
+							    T4F = T1B - T1H;
+							    T1I = T1B + T1H;
+							    T4y = T4v - T4x;
+							    T7k = T4v + T4x;
+							    T1J = T1v + T1I;
+							    T7m = T1v - T1I;
+							    T6y = T4t - T4y;
+							    T4z = T4t + T4y;
+							    T7l = T7j - T7k;
+							    T8d = T7j + T7k;
+							    T6x = T4E + T4F;
+							    T4G = T4E - T4F;
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5C, T3u, T5y, T7H, T5Z, T3F, T60, T5A, T4T, T4U;
+				   {
+					E T1P, T4Q, T2i, T4M, T21, T4S, T28, T4K;
+					{
+					     E T1L, T1O, T1W, T20;
+					     T1L = Rp[WS(rs, 15)];
+					     T1O = Rm[WS(rs, 15)];
+					     {
+						  E T2d, T2h, T1M, T4P, T2e, T4L;
+						  T2d = Rp[WS(rs, 11)];
+						  T2h = Rm[WS(rs, 11)];
+						  T1M = T1K * T1L;
+						  T4P = T1K * T1O;
+						  T2e = T2c * T2d;
+						  T4L = T2c * T2h;
+						  T1P = FMA(T1N, T1O, T1M);
+						  T4Q = FNMS(T1N, T1L, T4P);
+						  T2i = FMA(T2g, T2h, T2e);
+						  T4M = FNMS(T2g, T2d, T4L);
+					     }
+					     T1W = Rp[WS(rs, 7)];
+					     T20 = Rm[WS(rs, 7)];
+					     {
+						  E T24, T27, T1X, T4R, T25, T4J;
+						  T24 = Rp[WS(rs, 3)];
+						  T27 = Rm[WS(rs, 3)];
+						  T1X = T1V * T1W;
+						  T4R = T1V * T20;
+						  T25 = T23 * T24;
+						  T4J = T23 * T27;
+						  T21 = FMA(T1Z, T20, T1X);
+						  T4S = FNMS(T1Z, T1W, T4R);
+						  T28 = FMA(T26, T27, T25);
+						  T4K = FNMS(T26, T24, T4J);
+					     }
+					}
+					{
+					     E T4I, T22, T7p, T2j, T7q, T4N;
+					     T4I = T1P - T21;
+					     T22 = T1P + T21;
+					     T7p = T4Q + T4S;
+					     T4T = T4Q - T4S;
+					     T4U = T28 - T2i;
+					     T2j = T28 + T2i;
+					     T7q = T4K + T4M;
+					     T4N = T4K - T4M;
+					     T2k = T22 + T2j;
+					     T7o = T22 - T2j;
+					     T7r = T7p - T7q;
+					     T8e = T7p + T7q;
+					     T6B = T4I - T4N;
+					     T4O = T4I + T4N;
+					}
+				   }
+				   {
+					E T3l, T5W, T3E, T3v, T3t, T3w, T3x, T5Y, T3A, T3B, T3D, T3y, T5z;
+					{
+					     E T3g, T3k, T3h, T5V;
+					     T3g = Ip[WS(rs, 15)];
+					     T3k = Im[WS(rs, 15)];
+					     T3A = Ip[WS(rs, 11)];
+					     T6A = T4T + T4U;
+					     T4V = T4T - T4U;
+					     T3h = T3f * T3g;
+					     T5V = T3f * T3k;
+					     T3B = T3z * T3A;
+					     T3D = Im[WS(rs, 11)];
+					     T3l = FMA(T3j, T3k, T3h);
+					     T5W = FNMS(T3j, T3g, T5V);
+					}
+					{
+					     E T3o, T5B, T3s, T3p, T5X;
+					     T3o = Ip[WS(rs, 7)];
+					     T3E = FMA(T3C, T3D, T3B);
+					     T5B = T3z * T3D;
+					     T3s = Im[WS(rs, 7)];
+					     T3p = T3n * T3o;
+					     T3v = Ip[WS(rs, 3)];
+					     T5C = FNMS(T3C, T3A, T5B);
+					     T5X = T3n * T3s;
+					     T3t = FMA(T3r, T3s, T3p);
+					     T3w = TP * T3v;
+					     T3x = Im[WS(rs, 3)];
+					     T5Y = FNMS(T3r, T3o, T5X);
+					}
+					T3u = T3l + T3t;
+					T5y = T3l - T3t;
+					T3y = FMA(TR, T3x, T3w);
+					T5z = TP * T3x;
+					T7H = T5W + T5Y;
+					T5Z = T5W - T5Y;
+					T3F = T3y + T3E;
+					T60 = T3E - T3y;
+					T5A = FNMS(TR, T3v, T5z);
+				   }
+				   {
+					E T2t, T2q, T2u, T5n, T2L, T53, T2x, T2A, T2C;
+					{
+					     E T2n, T2o, T2p, T2G, T2K, T5D, T7I, T5m, T2H, T52;
+					     T2n = Ip[0];
+					     T7L = T3u - T3F;
+					     T3G = T3u + T3F;
+					     T5D = T5A - T5C;
+					     T7I = T5A + T5C;
+					     T6P = T60 - T5Z;
+					     T61 = T5Z + T60;
+					     T6M = T5y - T5D;
+					     T5E = T5y + T5D;
+					     T8n = T7H + T7I;
+					     T7J = T7H - T7I;
+					     T2o = T2 * T2n;
+					     T2p = Im[0];
+					     T2G = Ip[WS(rs, 12)];
+					     T2K = Im[WS(rs, 12)];
+					     T2t = Ip[WS(rs, 8)];
+					     T2q = FMA(T5, T2p, T2o);
+					     T5m = T2 * T2p;
+					     T2H = T2F * T2G;
+					     T52 = T2F * T2K;
+					     T2u = T2s * T2t;
+					     T5n = FNMS(T5, T2n, T5m);
+					     T2L = FMA(T2J, T2K, T2H);
+					     T53 = FNMS(T2J, T2G, T52);
+					     T2x = Im[WS(rs, 8)];
+					     T2A = Ip[WS(rs, 4)];
+					     T2C = Im[WS(rs, 4)];
+					}
+					{
+					     E T3N, T3K, T3O, T5H, T41, T5Q, T3R, T3U, T3W;
+					     {
+						  E T3H, T3I, T3J, T3Y, T40, T5G, T3Z, T5P;
+						  {
+						       E T2z, T4Z, T5p, T2D, T51, T7v, T5q;
+						       T3H = Ip[WS(rs, 1)];
+						       {
+							    E T2y, T5o, T2B, T50;
+							    T2y = FMA(T2w, T2x, T2u);
+							    T5o = T2s * T2x;
+							    T2B = T8 * T2A;
+							    T50 = T8 * T2C;
+							    T2z = T2q + T2y;
+							    T4Z = T2q - T2y;
+							    T5p = FNMS(T2w, T2t, T5o);
+							    T2D = FMA(Tc, T2C, T2B);
+							    T51 = FNMS(Tc, T2A, T50);
+							    T3I = T3 * T3H;
+						       }
+						       T7v = T5n + T5p;
+						       T5q = T5n - T5p;
+						       {
+							    E T2M, T5r, T7w, T54;
+							    T2M = T2D + T2L;
+							    T5r = T2D - T2L;
+							    T7w = T51 + T53;
+							    T54 = T51 - T53;
+							    T5s = T5q - T5r;
+							    T6I = T5q + T5r;
+							    T2N = T2z + T2M;
+							    T7A = T2z - T2M;
+							    T55 = T4Z + T54;
+							    T6F = T4Z - T54;
+							    T7x = T7v - T7w;
+							    T8i = T7v + T7w;
+							    T3J = Im[WS(rs, 1)];
+						       }
+						  }
+						  T3Y = Ip[WS(rs, 5)];
+						  T40 = Im[WS(rs, 5)];
+						  T3N = Ip[WS(rs, 9)];
+						  T3K = FMA(T6, T3J, T3I);
+						  T5G = T3 * T3J;
+						  T3Z = Td * T3Y;
+						  T5P = Td * T40;
+						  T3O = T3M * T3N;
+						  T5H = FNMS(T6, T3H, T5G);
+						  T41 = FMA(Th, T40, T3Z);
+						  T5Q = FNMS(Th, T3Y, T5P);
+						  T3R = Im[WS(rs, 9)];
+						  T3U = Ip[WS(rs, 13)];
+						  T3W = Im[WS(rs, 13)];
+					     }
+					     {
+						  E T2O, T2P, T2Q, T37, T39, T57, T38, T5g;
+						  {
+						       E T3T, T5F, T5J, T3X, T5O, T7M, T5K;
+						       T2O = Ip[WS(rs, 2)];
+						       {
+							    E T3S, T5I, T3V, T5N;
+							    T3S = FMA(T3Q, T3R, T3O);
+							    T5I = T3M * T3R;
+							    T3V = Te * T3U;
+							    T5N = Te * T3W;
+							    T3T = T3K + T3S;
+							    T5F = T3K - T3S;
+							    T5J = FNMS(T3Q, T3N, T5I);
+							    T3X = FMA(Ti, T3W, T3V);
+							    T5O = FNMS(Ti, T3U, T5N);
+							    T2P = T29 * T2O;
+						       }
+						       T7M = T5H + T5J;
+						       T5K = T5H - T5J;
+						       {
+							    E T42, T5M, T7N, T5R;
+							    T42 = T3X + T41;
+							    T5M = T3X - T41;
+							    T7N = T5O + T5Q;
+							    T5R = T5O - T5Q;
+							    T5L = T5F + T5K;
+							    T62 = T5K - T5F;
+							    T43 = T3T + T42;
+							    T7G = T42 - T3T;
+							    T5S = T5M - T5R;
+							    T63 = T5M + T5R;
+							    T7O = T7M - T7N;
+							    T8o = T7M + T7N;
+							    T2Q = Im[WS(rs, 2)];
+						       }
+						  }
+						  T37 = Ip[WS(rs, 6)];
+						  T39 = Im[WS(rs, 6)];
+						  T2U = Ip[WS(rs, 10)];
+						  T2R = FMA(T2b, T2Q, T2P);
+						  T57 = T29 * T2Q;
+						  T38 = T1R * T37;
+						  T5g = T1R * T39;
+						  T2V = T2T * T2U;
+						  T58 = FNMS(T2b, T2O, T57);
+						  T3a = FMA(T1U, T39, T38);
+						  T5h = FNMS(T1U, T37, T5g);
+						  T2Y = Im[WS(rs, 10)];
+						  T32 = Ip[WS(rs, 14)];
+						  T35 = Im[WS(rs, 14)];
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5c, T5t, T5j, T5u, T88, T90, T8Z, T8b;
+			      {
+				   E T7e, T8T, T7y, T7D, T7h, T8U, T8S, T8R;
+				   {
+					E T8c, T1i, T8A, T8z, T8O, T8J, T8N, T2l, T8L, T45, T8t, T8l, T8u, T8q, T3c;
+					E T8k, T8p, T8w, T2m;
+					{
+					     E T8x, T8y, T8j, T8C, T8I;
+					     {
+						  E TI, T30, T56, T5a, T36, T5f, T1h, T7B, T5b;
+						  TI = Tq + TH;
+						  T7e = Tq - TH;
+						  {
+						       E T2Z, T59, T33, T5e;
+						       T2Z = FMA(T2X, T2Y, T2V);
+						       T59 = T2T * T2Y;
+						       T33 = T31 * T32;
+						       T5e = T31 * T35;
+						       T30 = T2R + T2Z;
+						       T56 = T2R - T2Z;
+						       T5a = FNMS(T2X, T2U, T59);
+						       T36 = FMA(T34, T35, T33);
+						       T5f = FNMS(T34, T32, T5e);
+						       T1h = TZ + T1g;
+						       T8T = T1g - TZ;
+						  }
+						  T7B = T58 + T5a;
+						  T5b = T58 - T5a;
+						  {
+						       E T3b, T5d, T7C, T5i;
+						       T3b = T36 + T3a;
+						       T5d = T36 - T3a;
+						       T7C = T5f + T5h;
+						       T5i = T5f - T5h;
+						       T5c = T56 + T5b;
+						       T5t = T5b - T56;
+						       T3c = T30 + T3b;
+						       T7y = T3b - T30;
+						       T5j = T5d - T5i;
+						       T5u = T5d + T5i;
+						       T7D = T7B - T7C;
+						       T8j = T7B + T7C;
+						       T8c = TI - T1h;
+						       T1i = TI + T1h;
+						  }
+					     }
+					     T8k = T8i - T8j;
+					     T8x = T8i + T8j;
+					     T8y = T8n + T8o;
+					     T8p = T8n - T8o;
+					     T7h = T7f - T7g;
+					     T8C = T7f + T7g;
+					     T8I = T8D + T8H;
+					     T8U = T8H - T8D;
+					     T8A = T8x + T8y;
+					     T8z = T8x - T8y;
+					     T8O = T8I - T8C;
+					     T8J = T8C + T8I;
+					}
+					{
+					     E T8h, T8m, T3d, T44;
+					     T8h = T2N - T3c;
+					     T3d = T2N + T3c;
+					     T44 = T3G + T43;
+					     T8m = T3G - T43;
+					     T8N = T2k - T1J;
+					     T2l = T1J + T2k;
+					     T8L = T44 - T3d;
+					     T45 = T3d + T44;
+					     T8t = T8k - T8h;
+					     T8l = T8h + T8k;
+					     T8u = T8m + T8p;
+					     T8q = T8m - T8p;
+					}
+					T8w = T1i - T2l;
+					T2m = T1i + T2l;
+					{
+					     E T8s, T8P, T8Q, T8v;
+					     {
+						  E T8r, T8M, T8K, T8g, T8B, T8f;
+						  T8S = T8q - T8l;
+						  T8r = T8l + T8q;
+						  T8B = T8d + T8e;
+						  T8f = T8d - T8e;
+						  Rp[0] = T2m + T45;
+						  Rm[WS(rs, 15)] = T2m - T45;
+						  Rp[WS(rs, 8)] = T8w + T8z;
+						  Rm[WS(rs, 7)] = T8w - T8z;
+						  T8M = T8J - T8B;
+						  T8K = T8B + T8J;
+						  T8g = T8c + T8f;
+						  T8s = T8c - T8f;
+						  T8R = T8O - T8N;
+						  T8P = T8N + T8O;
+						  Ip[WS(rs, 8)] = T8L + T8M;
+						  Im[WS(rs, 7)] = T8L - T8M;
+						  Ip[0] = T8A + T8K;
+						  Im[WS(rs, 15)] = T8A - T8K;
+						  Rp[WS(rs, 4)] = FMA(KP707106781, T8r, T8g);
+						  Rm[WS(rs, 11)] = FNMS(KP707106781, T8r, T8g);
+						  T8Q = T8t + T8u;
+						  T8v = T8t - T8u;
+					     }
+					     Ip[WS(rs, 4)] = FMA(KP707106781, T8Q, T8P);
+					     Im[WS(rs, 11)] = FMS(KP707106781, T8Q, T8P);
+					     Rp[WS(rs, 12)] = FMA(KP707106781, T8v, T8s);
+					     Rm[WS(rs, 3)] = FNMS(KP707106781, T8v, T8s);
+					}
+				   }
+				   {
+					E T7P, T7W, T7i, T7K, T8a, T86, T91, T8V, T8W, T7t, T7T, T7F, T92, T7Z, T89;
+					E T83;
+					{
+					     E T7X, T7n, T7s, T7Y, T84, T85;
+					     T7P = T7L - T7O;
+					     T84 = T7L + T7O;
+					     Ip[WS(rs, 12)] = FMA(KP707106781, T8S, T8R);
+					     Im[WS(rs, 3)] = FMS(KP707106781, T8S, T8R);
+					     T7W = T7e + T7h;
+					     T7i = T7e - T7h;
+					     T85 = T7J + T7G;
+					     T7K = T7G - T7J;
+					     T7X = T7m + T7l;
+					     T7n = T7l - T7m;
+					     T8a = FMA(KP414213562, T84, T85);
+					     T86 = FNMS(KP414213562, T85, T84);
+					     T91 = T8U - T8T;
+					     T8V = T8T + T8U;
+					     T7s = T7o + T7r;
+					     T7Y = T7o - T7r;
+					     {
+						  E T82, T81, T7z, T7E;
+						  T82 = T7x + T7y;
+						  T7z = T7x - T7y;
+						  T7E = T7A - T7D;
+						  T81 = T7A + T7D;
+						  T8W = T7n + T7s;
+						  T7t = T7n - T7s;
+						  T7T = FNMS(KP414213562, T7z, T7E);
+						  T7F = FMA(KP414213562, T7E, T7z);
+						  T92 = T7Y - T7X;
+						  T7Z = T7X + T7Y;
+						  T89 = FNMS(KP414213562, T81, T82);
+						  T83 = FMA(KP414213562, T82, T81);
+					     }
+					}
+					{
+					     E T7S, T7u, T93, T95, T7U, T7Q;
+					     T7S = FNMS(KP707106781, T7t, T7i);
+					     T7u = FMA(KP707106781, T7t, T7i);
+					     T93 = FMA(KP707106781, T92, T91);
+					     T95 = FNMS(KP707106781, T92, T91);
+					     T7U = FNMS(KP414213562, T7K, T7P);
+					     T7Q = FMA(KP414213562, T7P, T7K);
+					     {
+						  E T80, T87, T8X, T8Y;
+						  T88 = FNMS(KP707106781, T7Z, T7W);
+						  T80 = FMA(KP707106781, T7Z, T7W);
+						  {
+						       E T7V, T94, T96, T7R;
+						       T7V = T7T + T7U;
+						       T94 = T7U - T7T;
+						       T96 = T7Q - T7F;
+						       T7R = T7F + T7Q;
+						       Rm[WS(rs, 1)] = FMA(KP923879532, T7V, T7S);
+						       Rp[WS(rs, 14)] = FNMS(KP923879532, T7V, T7S);
+						       Ip[WS(rs, 6)] = FMA(KP923879532, T94, T93);
+						       Im[WS(rs, 9)] = FMS(KP923879532, T94, T93);
+						       Ip[WS(rs, 14)] = FMA(KP923879532, T96, T95);
+						       Im[WS(rs, 1)] = FMS(KP923879532, T96, T95);
+						       Rp[WS(rs, 6)] = FMA(KP923879532, T7R, T7u);
+						       Rm[WS(rs, 9)] = FNMS(KP923879532, T7R, T7u);
+						       T87 = T83 + T86;
+						       T90 = T86 - T83;
+						  }
+						  T8Z = FNMS(KP707106781, T8W, T8V);
+						  T8X = FMA(KP707106781, T8W, T8V);
+						  T8Y = T89 + T8a;
+						  T8b = T89 - T8a;
+						  Rp[WS(rs, 2)] = FMA(KP923879532, T87, T80);
+						  Rm[WS(rs, 13)] = FNMS(KP923879532, T87, T80);
+						  Ip[WS(rs, 2)] = FMA(KP923879532, T8Y, T8X);
+						  Im[WS(rs, 13)] = FMS(KP923879532, T8Y, T8X);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T6s, T9o, T9n, T6v, T6Q, T6N, T6J, T6G, T9k, T9j;
+				   {
+					E T6c, T4s, T9i, T4X, T9h, T9b, T9c, T6f, T5U, T6k, T64, T5k, T5v;
+					{
+					     E T6d, T6e, T99, T9a, T5T;
+					     {
+						  E T4c, T4r, T4H, T4W;
+						  T6s = T46 - T4b;
+						  T4c = T46 + T4b;
+						  Rp[WS(rs, 10)] = FMA(KP923879532, T8b, T88);
+						  Rm[WS(rs, 5)] = FNMS(KP923879532, T8b, T88);
+						  Ip[WS(rs, 10)] = FMA(KP923879532, T90, T8Z);
+						  Im[WS(rs, 5)] = FMS(KP923879532, T90, T8Z);
+						  T4r = T4j + T4q;
+						  T9o = T4q - T4j;
+						  T6d = FNMS(KP414213562, T4z, T4G);
+						  T4H = FMA(KP414213562, T4G, T4z);
+						  T4W = FNMS(KP414213562, T4V, T4O);
+						  T6e = FMA(KP414213562, T4O, T4V);
+						  T9n = T98 + T97;
+						  T99 = T97 - T98;
+						  T6c = FNMS(KP707106781, T4r, T4c);
+						  T4s = FMA(KP707106781, T4r, T4c);
+						  T9i = T4W - T4H;
+						  T4X = T4H + T4W;
+						  T9a = T6t + T6u;
+						  T6v = T6t - T6u;
+					     }
+					     T6Q = T5S - T5L;
+					     T5T = T5L + T5S;
+					     T9h = FNMS(KP707106781, T9a, T99);
+					     T9b = FMA(KP707106781, T9a, T99);
+					     T9c = T6d + T6e;
+					     T6f = T6d - T6e;
+					     T5U = FMA(KP707106781, T5T, T5E);
+					     T6k = FNMS(KP707106781, T5T, T5E);
+					     T64 = T62 + T63;
+					     T6N = T63 - T62;
+					     T6J = T5c - T5j;
+					     T5k = T5c + T5j;
+					     T5v = T5t + T5u;
+					     T6G = T5u - T5t;
+					}
+					{
+					     E T6m, T6q, T6j, T6p, T9f, T9g;
+					     {
+						  E T68, T4Y, T6a, T66, T69, T5x, T9d, T6l, T65, T9e, T6b, T67;
+						  T68 = FNMS(KP923879532, T4X, T4s);
+						  T4Y = FMA(KP923879532, T4X, T4s);
+						  T6l = FNMS(KP707106781, T64, T61);
+						  T65 = FMA(KP707106781, T64, T61);
+						  {
+						       E T6h, T5l, T6i, T5w;
+						       T6h = FNMS(KP707106781, T5k, T55);
+						       T5l = FMA(KP707106781, T5k, T55);
+						       T6i = FNMS(KP707106781, T5v, T5s);
+						       T5w = FMA(KP707106781, T5v, T5s);
+						       T6m = FMA(KP668178637, T6l, T6k);
+						       T6q = FNMS(KP668178637, T6k, T6l);
+						       T6a = FMA(KP198912367, T5U, T65);
+						       T66 = FNMS(KP198912367, T65, T5U);
+						       T6j = FNMS(KP668178637, T6i, T6h);
+						       T6p = FMA(KP668178637, T6h, T6i);
+						       T69 = FNMS(KP198912367, T5l, T5w);
+						       T5x = FMA(KP198912367, T5w, T5l);
+						  }
+						  T9d = FMA(KP923879532, T9c, T9b);
+						  T9f = FNMS(KP923879532, T9c, T9b);
+						  T9e = T69 + T6a;
+						  T6b = T69 - T6a;
+						  T9g = T66 - T5x;
+						  T67 = T5x + T66;
+						  Ip[WS(rs, 1)] = FMA(KP980785280, T9e, T9d);
+						  Im[WS(rs, 14)] = FMS(KP980785280, T9e, T9d);
+						  Rp[WS(rs, 1)] = FMA(KP980785280, T67, T4Y);
+						  Rm[WS(rs, 14)] = FNMS(KP980785280, T67, T4Y);
+						  Rp[WS(rs, 9)] = FMA(KP980785280, T6b, T68);
+						  Rm[WS(rs, 6)] = FNMS(KP980785280, T6b, T68);
+					     }
+					     {
+						  E T6o, T9l, T9m, T6r, T6g, T6n;
+						  T6o = FMA(KP923879532, T6f, T6c);
+						  T6g = FNMS(KP923879532, T6f, T6c);
+						  T6n = T6j + T6m;
+						  T9k = T6m - T6j;
+						  T9j = FMA(KP923879532, T9i, T9h);
+						  T9l = FNMS(KP923879532, T9i, T9h);
+						  Ip[WS(rs, 9)] = FMA(KP980785280, T9g, T9f);
+						  Im[WS(rs, 6)] = FMS(KP980785280, T9g, T9f);
+						  Rm[WS(rs, 2)] = FMA(KP831469612, T6n, T6g);
+						  Rp[WS(rs, 13)] = FNMS(KP831469612, T6n, T6g);
+						  T9m = T6p + T6q;
+						  T6r = T6p - T6q;
+						  Ip[WS(rs, 13)] = FNMS(KP831469612, T9m, T9l);
+						  Im[WS(rs, 2)] = -(FMA(KP831469612, T9m, T9l));
+						  Rp[WS(rs, 5)] = FMA(KP831469612, T6r, T6o);
+						  Rm[WS(rs, 10)] = FNMS(KP831469612, T6r, T6o);
+					     }
+					}
+				   }
+				   {
+					E T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T6H, T74, T78, T7c, T6W, T6S;
+					{
+					     E T6Z, T6z, T6C, T70;
+					     T6Z = FNMS(KP414213562, T6x, T6y);
+					     T6z = FMA(KP414213562, T6y, T6x);
+					     Ip[WS(rs, 5)] = FMA(KP831469612, T9k, T9j);
+					     Im[WS(rs, 10)] = FMS(KP831469612, T9k, T9j);
+					     T6Y = FNMS(KP707106781, T6v, T6s);
+					     T6w = FMA(KP707106781, T6v, T6s);
+					     T6C = FNMS(KP414213562, T6B, T6A);
+					     T70 = FMA(KP414213562, T6A, T6B);
+					     T9w = T6z + T6C;
+					     T6D = T6z - T6C;
+					     T9v = FNMS(KP707106781, T9o, T9n);
+					     T9p = FMA(KP707106781, T9o, T9n);
+					     {
+						  E T77, T6O, T76, T6R;
+						  T9q = T70 - T6Z;
+						  T71 = T6Z + T70;
+						  T77 = FMA(KP707106781, T6N, T6M);
+						  T6O = FNMS(KP707106781, T6N, T6M);
+						  T76 = FMA(KP707106781, T6Q, T6P);
+						  T6R = FNMS(KP707106781, T6Q, T6P);
+						  T6H = FNMS(KP707106781, T6G, T6F);
+						  T74 = FMA(KP707106781, T6G, T6F);
+						  T78 = FMA(KP198912367, T77, T76);
+						  T7c = FNMS(KP198912367, T76, T77);
+						  T6W = FNMS(KP668178637, T6O, T6R);
+						  T6S = FMA(KP668178637, T6R, T6O);
+					     }
+					}
+					{
+					     E T6U, T6E, T9r, T9t, T73, T6K;
+					     T6U = FNMS(KP923879532, T6D, T6w);
+					     T6E = FMA(KP923879532, T6D, T6w);
+					     T9r = FMA(KP923879532, T9q, T9p);
+					     T9t = FNMS(KP923879532, T9q, T9p);
+					     T73 = FMA(KP707106781, T6J, T6I);
+					     T6K = FNMS(KP707106781, T6J, T6I);
+					     {
+						  E T7a, T9x, T9y, T7d;
+						  {
+						       E T72, T7b, T6V, T6L, T79, T75;
+						       T7a = FMA(KP923879532, T71, T6Y);
+						       T72 = FNMS(KP923879532, T71, T6Y);
+						       T75 = FMA(KP198912367, T74, T73);
+						       T7b = FNMS(KP198912367, T73, T74);
+						       T6V = FNMS(KP668178637, T6H, T6K);
+						       T6L = FMA(KP668178637, T6K, T6H);
+						       T79 = T75 + T78;
+						       T9A = T78 - T75;
+						       T9z = FMA(KP923879532, T9w, T9v);
+						       T9x = FNMS(KP923879532, T9w, T9v);
+						       {
+							    E T6X, T9s, T9u, T6T;
+							    T6X = T6V + T6W;
+							    T9s = T6V - T6W;
+							    T9u = T6S - T6L;
+							    T6T = T6L + T6S;
+							    Rp[WS(rs, 7)] = FMA(KP980785280, T79, T72);
+							    Rm[WS(rs, 8)] = FNMS(KP980785280, T79, T72);
+							    Rp[WS(rs, 11)] = FMA(KP831469612, T6X, T6U);
+							    Rm[WS(rs, 4)] = FNMS(KP831469612, T6X, T6U);
+							    Ip[WS(rs, 3)] = FMA(KP831469612, T9s, T9r);
+							    Im[WS(rs, 12)] = FMS(KP831469612, T9s, T9r);
+							    Ip[WS(rs, 11)] = FMA(KP831469612, T9u, T9t);
+							    Im[WS(rs, 4)] = FMS(KP831469612, T9u, T9t);
+							    Rp[WS(rs, 3)] = FMA(KP831469612, T6T, T6E);
+							    Rm[WS(rs, 12)] = FNMS(KP831469612, T6T, T6E);
+							    T9y = T7c - T7b;
+							    T7d = T7b + T7c;
+						       }
+						  }
+						  Ip[WS(rs, 7)] = FMA(KP980785280, T9y, T9x);
+						  Im[WS(rs, 8)] = FMS(KP980785280, T9y, T9x);
+						  Rm[0] = FMA(KP980785280, T7d, T7a);
+						  Rp[WS(rs, 15)] = FNMS(KP980785280, T7d, T7a);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 15)] = FMA(KP980785280, T9A, T9z);
+	       Im[0] = FMS(KP980785280, T9A, T9z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cf2_32", twinstr, &GENUS, {236, 98, 252, 0} };
+
+void X(codelet_hc2cf2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_32, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hc2cf2_32 -include hc2cf.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 158 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
+	       E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
+	       E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
+	       E T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
+	       E T1S, T23;
+	       {
+		    E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
+		    E T10;
+		    {
+			 E T4, Tc, T7, Tb;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 T4 = T2 * T3;
+			 Tc = T5 * T3;
+			 T7 = T5 * T6;
+			 Tb = T2 * T6;
+			 T8 = T4 + T7;
+			 TM = T4 - T7;
+			 TO = Tb + Tc;
+			 Td = Tb - Tc;
+			 T9 = W[4];
+			 Ts = T2 * T9;
+			 T1d = T6 * T9;
+			 Tx = T5 * T9;
+			 T18 = T3 * T9;
+			 Te = W[5];
+			 Tt = T5 * Te;
+			 T1c = T3 * Te;
+			 Tw = T2 * Te;
+			 T19 = T6 * Te;
+			 Th = W[6];
+			 TB = T3 * Th;
+			 T14 = T5 * Th;
+			 TG = T6 * Th;
+			 TZ = T2 * Th;
+			 Tl = W[7];
+			 TC = T6 * Tl;
+			 T13 = T2 * Tl;
+			 TF = T3 * Tl;
+			 T10 = T5 * Tl;
+		    }
+		    TD = TB + TC;
+		    TH = TF - TG;
+		    T1y = TZ + T10;
+		    T1H = TF + TG;
+		    T15 = T13 + T14;
+		    T1A = T13 - T14;
+		    T11 = TZ - T10;
+		    T1F = TB - TC;
+		    T1n = FMA(T9, Th, Te * Tl);
+		    T1p = FNMS(Te, Th, T9 * Tl);
+		    {
+			 E T2o, T2p, T2s, T2t;
+			 T2o = T8 * Th;
+			 T2p = Td * Tl;
+			 T2q = T2o + T2p;
+			 T2I = T2o - T2p;
+			 T2s = T8 * Tl;
+			 T2t = Td * Th;
+			 T2u = T2s - T2t;
+			 T2K = T2s + T2t;
+		    }
+		    {
+			 E T2T, T2U, T2X, T2Y;
+			 T2T = TM * Th;
+			 T2U = TO * Tl;
+			 T2V = T2T - T2U;
+			 T3b = T2T + T2U;
+			 T2X = TM * Tl;
+			 T2Y = TO * Th;
+			 T2Z = T2X + T2Y;
+			 T3d = T2X - T2Y;
+			 Tu = Ts + Tt;
+			 Ty = Tw - Tx;
+			 T3l = FMA(Tu, Th, Ty * Tl);
+			 T3n = FNMS(Ty, Th, Tu * Tl);
+		    }
+		    T1t = Ts - Tt;
+		    T1v = Tw + Tx;
+		    T2f = FMA(T1t, Th, T1v * Tl);
+		    T2h = FNMS(T1v, Th, T1t * Tl);
+		    T1a = T18 - T19;
+		    T1e = T1c + T1d;
+		    T32 = FMA(T1a, Th, T1e * Tl);
+		    T34 = FNMS(T1e, Th, T1a * Tl);
+		    T1W = T18 + T19;
+		    T1Y = T1c - T1d;
+		    T2C = FMA(T1W, Th, T1Y * Tl);
+		    T2E = FNMS(T1Y, Th, T1W * Tl);
+		    {
+			 E Ta, Tf, Ti, Tj;
+			 Ta = T8 * T9;
+			 Tf = Td * Te;
+			 Tg = Ta - Tf;
+			 TR = Ta + Tf;
+			 Ti = T8 * Te;
+			 Tj = Td * T9;
+			 Tk = Ti + Tj;
+			 TS = Ti - Tj;
+		    }
+		    Tm = FMA(Tg, Th, Tk * Tl);
+		    TV = FNMS(TS, Th, TR * Tl);
+		    To = FNMS(Tk, Th, Tg * Tl);
+		    TT = FMA(TR, Th, TS * Tl);
+		    {
+			 E T1K, T1L, T1N, T1O;
+			 T1K = TM * T9;
+			 T1L = TO * Te;
+			 T1M = T1K - T1L;
+			 T21 = T1K + T1L;
+			 T1N = TM * Te;
+			 T1O = TO * T9;
+			 T1P = T1N + T1O;
+			 T22 = T1N - T1O;
+		    }
+		    T1Q = FMA(T1M, Th, T1P * Tl);
+		    T25 = FNMS(T22, Th, T21 * Tl);
+		    T1S = FNMS(T1P, Th, T1M * Tl);
+		    T23 = FMA(T21, Th, T22 * Tl);
+	       }
+	       {
+		    E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
+		    E T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
+		    E T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
+		    E T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
+		    E T4W, T5R, T55, T5O;
+		    {
+			 E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
+			 T1 = Rp[0];
+			 T7G = Rm[0];
+			 Tn = Rp[WS(rs, 8)];
+			 Tp = Rm[WS(rs, 8)];
+			 Tq = FMA(Tm, Tn, To * Tp);
+			 T7F = FNMS(To, Tn, Tm * Tp);
+			 {
+			      E Tv, Tz, TE, TI;
+			      Tv = Rp[WS(rs, 4)];
+			      Tz = Rm[WS(rs, 4)];
+			      TA = FMA(Tu, Tv, Ty * Tz);
+			      T3C = FNMS(Ty, Tv, Tu * Tz);
+			      TE = Rp[WS(rs, 12)];
+			      TI = Rm[WS(rs, 12)];
+			      TJ = FMA(TD, TE, TH * TI);
+			      T3D = FNMS(TH, TE, TD * TI);
+			 }
+			 {
+			      E Tr, TK, T8a, T8b;
+			      Tr = T1 + Tq;
+			      TK = TA + TJ;
+			      TL = Tr + TK;
+			      T6f = Tr - TK;
+			      T8a = T7G - T7F;
+			      T8b = TA - TJ;
+			      T8c = T8a - T8b;
+			      T8q = T8b + T8a;
+			 }
+			 {
+			      E T3B, T3E, T7E, T7H;
+			      T3B = T1 - Tq;
+			      T3E = T3C - T3D;
+			      T3F = T3B - T3E;
+			      T5t = T3B + T3E;
+			      T7E = T3C + T3D;
+			      T7H = T7F + T7G;
+			      T7I = T7E + T7H;
+			      T7W = T7H - T7E;
+			 }
+		    }
+		    {
+			 E T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
+			 {
+			      E T2c, T2d, T2r, T2v;
+			      T2c = Ip[0];
+			      T2d = Im[0];
+			      T2e = FMA(T2, T2c, T5 * T2d);
+			      T4g = FNMS(T5, T2c, T2 * T2d);
+			      T2r = Ip[WS(rs, 12)];
+			      T2v = Im[WS(rs, 12)];
+			      T2w = FMA(T2q, T2r, T2u * T2v);
+			      T4z = FNMS(T2u, T2r, T2q * T2v);
+			 }
+			 {
+			      E T2g, T2i, T2l, T2m;
+			      T2g = Ip[WS(rs, 8)];
+			      T2i = Im[WS(rs, 8)];
+			      T2j = FMA(T2f, T2g, T2h * T2i);
+			      T4h = FNMS(T2h, T2g, T2f * T2i);
+			      T2l = Ip[WS(rs, 4)];
+			      T2m = Im[WS(rs, 4)];
+			      T2n = FMA(T9, T2l, Te * T2m);
+			      T4y = FNMS(Te, T2l, T9 * T2m);
+			 }
+			 {
+			      E T2k, T2x, T6w, T6x;
+			      T2k = T2e + T2j;
+			      T2x = T2n + T2w;
+			      T2y = T2k + T2x;
+			      T6B = T2k - T2x;
+			      T6w = T4g + T4h;
+			      T6x = T4y + T4z;
+			      T6y = T6w - T6x;
+			      T7j = T6w + T6x;
+			 }
+			 {
+			      E T4i, T4j, T4x, T4A;
+			      T4i = T4g - T4h;
+			      T4j = T2n - T2w;
+			      T4k = T4i + T4j;
+			      T5J = T4i - T4j;
+			      T4x = T2e - T2j;
+			      T4A = T4y - T4z;
+			      T4B = T4x - T4A;
+			      T5G = T4x + T4A;
+			 }
+		    }
+		    {
+			 E T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
+			 {
+			      E T2W, T30, T3c, T3e;
+			      T2W = Ip[WS(rs, 15)];
+			      T30 = Im[WS(rs, 15)];
+			      T31 = FMA(T2V, T2W, T2Z * T30);
+			      T4Y = FNMS(T2Z, T2W, T2V * T30);
+			      T3c = Ip[WS(rs, 11)];
+			      T3e = Im[WS(rs, 11)];
+			      T3f = FMA(T3b, T3c, T3d * T3e);
+			      T4J = FNMS(T3d, T3c, T3b * T3e);
+			 }
+			 {
+			      E T33, T35, T38, T39;
+			      T33 = Ip[WS(rs, 7)];
+			      T35 = Im[WS(rs, 7)];
+			      T36 = FMA(T32, T33, T34 * T35);
+			      T4Z = FNMS(T34, T33, T32 * T35);
+			      T38 = Ip[WS(rs, 3)];
+			      T39 = Im[WS(rs, 3)];
+			      T3a = FMA(TR, T38, TS * T39);
+			      T4I = FNMS(TS, T38, TR * T39);
+			 }
+			 {
+			      E T37, T3g, T6M, T6N;
+			      T37 = T31 + T36;
+			      T3g = T3a + T3f;
+			      T3h = T37 + T3g;
+			      T6H = T37 - T3g;
+			      T6M = T4Y + T4Z;
+			      T6N = T4I + T4J;
+			      T6O = T6M - T6N;
+			      T7o = T6M + T6N;
+			 }
+			 {
+			      E T4H, T4K, T50, T51;
+			      T4H = T31 - T36;
+			      T4K = T4I - T4J;
+			      T4L = T4H - T4K;
+			      T5N = T4H + T4K;
+			      T50 = T4Y - T4Z;
+			      T51 = T3a - T3f;
+			      T52 = T50 + T51;
+			      T5Q = T50 - T51;
+			 }
+		    }
+		    {
+			 E TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
+			 {
+			      E TN, TP, T1b, T1f;
+			      TN = Rp[WS(rs, 2)];
+			      TP = Rm[WS(rs, 2)];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T3G = FNMS(TO, TN, TM * TP);
+			      T1b = Rp[WS(rs, 6)];
+			      T1f = Rm[WS(rs, 6)];
+			      T1g = FMA(T1a, T1b, T1e * T1f);
+			      T3N = FNMS(T1e, T1b, T1a * T1f);
+			 }
+			 {
+			      E TU, TW, T12, T16;
+			      TU = Rp[WS(rs, 10)];
+			      TW = Rm[WS(rs, 10)];
+			      TX = FMA(TT, TU, TV * TW);
+			      T3H = FNMS(TV, TU, TT * TW);
+			      T12 = Rp[WS(rs, 14)];
+			      T16 = Rm[WS(rs, 14)];
+			      T17 = FMA(T11, T12, T15 * T16);
+			      T3M = FNMS(T15, T12, T11 * T16);
+			 }
+			 {
+			      E TY, T1h, T6g, T6h;
+			      TY = TQ + TX;
+			      T1h = T17 + T1g;
+			      T1i = TY + T1h;
+			      T7V = T1h - TY;
+			      T6g = T3G + T3H;
+			      T6h = T3M + T3N;
+			      T6i = T6g - T6h;
+			      T7D = T6g + T6h;
+			 }
+			 {
+			      E T3I, T3J, T3L, T3O;
+			      T3I = T3G - T3H;
+			      T3J = TQ - TX;
+			      T3K = T3I - T3J;
+			      T5u = T3J + T3I;
+			      T3L = T17 - T1g;
+			      T3O = T3M - T3N;
+			      T3P = T3L + T3O;
+			      T5v = T3L - T3O;
+			 }
+		    }
+		    {
+			 E T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
+			 {
+			      E T1k, T1l, T1z, T1B;
+			      T1k = Rp[WS(rs, 1)];
+			      T1l = Rm[WS(rs, 1)];
+			      T1m = FMA(T8, T1k, Td * T1l);
+			      T3S = FNMS(Td, T1k, T8 * T1l);
+			      T1z = Rp[WS(rs, 13)];
+			      T1B = Rm[WS(rs, 13)];
+			      T1C = FMA(T1y, T1z, T1A * T1B);
+			      T3Z = FNMS(T1A, T1z, T1y * T1B);
+			 }
+			 {
+			      E T1o, T1q, T1u, T1w;
+			      T1o = Rp[WS(rs, 9)];
+			      T1q = Rm[WS(rs, 9)];
+			      T1r = FMA(T1n, T1o, T1p * T1q);
+			      T3T = FNMS(T1p, T1o, T1n * T1q);
+			      T1u = Rp[WS(rs, 5)];
+			      T1w = Rm[WS(rs, 5)];
+			      T1x = FMA(T1t, T1u, T1v * T1w);
+			      T3Y = FNMS(T1v, T1u, T1t * T1w);
+			 }
+			 {
+			      E T1s, T1D, T6k, T6l;
+			      T1s = T1m + T1r;
+			      T1D = T1x + T1C;
+			      T1E = T1s + T1D;
+			      T6n = T1s - T1D;
+			      T6k = T3S + T3T;
+			      T6l = T3Y + T3Z;
+			      T6m = T6k - T6l;
+			      T7e = T6k + T6l;
+			 }
+			 {
+			      E T3U, T3V, T3X, T40;
+			      T3U = T3S - T3T;
+			      T3V = T1x - T1C;
+			      T3W = T3U + T3V;
+			      T5y = T3U - T3V;
+			      T3X = T1m - T1r;
+			      T40 = T3Y - T3Z;
+			      T41 = T3X - T40;
+			      T5z = T3X + T40;
+			 }
+		    }
+		    {
+			 E T1J, T43, T27, T4a, T1U, T44, T20, T49;
+			 {
+			      E T1G, T1I, T24, T26;
+			      T1G = Rp[WS(rs, 15)];
+			      T1I = Rm[WS(rs, 15)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T43 = FNMS(T1H, T1G, T1F * T1I);
+			      T24 = Rp[WS(rs, 11)];
+			      T26 = Rm[WS(rs, 11)];
+			      T27 = FMA(T23, T24, T25 * T26);
+			      T4a = FNMS(T25, T24, T23 * T26);
+			 }
+			 {
+			      E T1R, T1T, T1X, T1Z;
+			      T1R = Rp[WS(rs, 7)];
+			      T1T = Rm[WS(rs, 7)];
+			      T1U = FMA(T1Q, T1R, T1S * T1T);
+			      T44 = FNMS(T1S, T1R, T1Q * T1T);
+			      T1X = Rp[WS(rs, 3)];
+			      T1Z = Rm[WS(rs, 3)];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T49 = FNMS(T1Y, T1X, T1W * T1Z);
+			 }
+			 {
+			      E T1V, T28, T6q, T6r;
+			      T1V = T1J + T1U;
+			      T28 = T20 + T27;
+			      T29 = T1V + T28;
+			      T6p = T1V - T28;
+			      T6q = T43 + T44;
+			      T6r = T49 + T4a;
+			      T6s = T6q - T6r;
+			      T7f = T6q + T6r;
+			 }
+			 {
+			      E T45, T46, T48, T4b;
+			      T45 = T43 - T44;
+			      T46 = T20 - T27;
+			      T47 = T45 + T46;
+			      T5B = T45 - T46;
+			      T48 = T1J - T1U;
+			      T4b = T49 - T4a;
+			      T4c = T48 - T4b;
+			      T5C = T48 + T4b;
+			 }
+		    }
+		    {
+			 E T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
+			 {
+			      E T2z, T2A, T2D, T2F;
+			      T2z = Ip[WS(rs, 2)];
+			      T2A = Im[WS(rs, 2)];
+			      T2B = FMA(T21, T2z, T22 * T2A);
+			      T4r = FNMS(T22, T2z, T21 * T2A);
+			      T2D = Ip[WS(rs, 10)];
+			      T2F = Im[WS(rs, 10)];
+			      T2G = FMA(T2C, T2D, T2E * T2F);
+			      T4s = FNMS(T2E, T2D, T2C * T2F);
+			 }
+			 T4q = T2B - T2G;
+			 T4t = T4r - T4s;
+			 {
+			      E T2J, T2L, T2N, T2O;
+			      T2J = Ip[WS(rs, 14)];
+			      T2L = Im[WS(rs, 14)];
+			      T2M = FMA(T2I, T2J, T2K * T2L);
+			      T4m = FNMS(T2K, T2J, T2I * T2L);
+			      T2N = Ip[WS(rs, 6)];
+			      T2O = Im[WS(rs, 6)];
+			      T2P = FMA(T1M, T2N, T1P * T2O);
+			      T4n = FNMS(T1P, T2N, T1M * T2O);
+			 }
+			 T4l = T2M - T2P;
+			 T4o = T4m - T4n;
+			 {
+			      E T2H, T2Q, T6C, T6D;
+			      T2H = T2B + T2G;
+			      T2Q = T2M + T2P;
+			      T2R = T2H + T2Q;
+			      T6z = T2Q - T2H;
+			      T6C = T4r + T4s;
+			      T6D = T4m + T4n;
+			      T6E = T6C - T6D;
+			      T7k = T6C + T6D;
+			 }
+			 {
+			      E T4p, T4u, T4C, T4D;
+			      T4p = T4l - T4o;
+			      T4u = T4q + T4t;
+			      T4v = KP707106781 * (T4p - T4u);
+			      T5H = KP707106781 * (T4u + T4p);
+			      T4C = T4t - T4q;
+			      T4D = T4l + T4o;
+			      T4E = KP707106781 * (T4C - T4D);
+			      T5K = KP707106781 * (T4C + T4D);
+			 }
+		    }
+		    {
+			 E T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
+			 {
+			      E T3i, T3j, T3m, T3o;
+			      T3i = Ip[WS(rs, 1)];
+			      T3j = Im[WS(rs, 1)];
+			      T3k = FMA(T3, T3i, T6 * T3j);
+			      T4M = FNMS(T6, T3i, T3 * T3j);
+			      T3m = Ip[WS(rs, 9)];
+			      T3o = Im[WS(rs, 9)];
+			      T3p = FMA(T3l, T3m, T3n * T3o);
+			      T4N = FNMS(T3n, T3m, T3l * T3o);
+			 }
+			 T4O = T4M - T4N;
+			 T4P = T3k - T3p;
+			 {
+			      E T3r, T3s, T3u, T3v;
+			      T3r = Ip[WS(rs, 13)];
+			      T3s = Im[WS(rs, 13)];
+			      T3t = FMA(Th, T3r, Tl * T3s);
+			      T4S = FNMS(Tl, T3r, Th * T3s);
+			      T3u = Ip[WS(rs, 5)];
+			      T3v = Im[WS(rs, 5)];
+			      T3w = FMA(Tg, T3u, Tk * T3v);
+			      T4T = FNMS(Tk, T3u, Tg * T3v);
+			 }
+			 T4R = T3t - T3w;
+			 T4U = T4S - T4T;
+			 {
+			      E T3q, T3x, T6I, T6J;
+			      T3q = T3k + T3p;
+			      T3x = T3t + T3w;
+			      T3y = T3q + T3x;
+			      T6P = T3x - T3q;
+			      T6I = T4M + T4N;
+			      T6J = T4S + T4T;
+			      T6K = T6I - T6J;
+			      T7p = T6I + T6J;
+			 }
+			 {
+			      E T4Q, T4V, T53, T54;
+			      T4Q = T4O - T4P;
+			      T4V = T4R + T4U;
+			      T4W = KP707106781 * (T4Q - T4V);
+			      T5R = KP707106781 * (T4Q + T4V);
+			      T53 = T4R - T4U;
+			      T54 = T4P + T4O;
+			      T55 = KP707106781 * (T53 - T54);
+			      T5O = KP707106781 * (T54 + T53);
+			 }
+		    }
+		    {
+			 E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
+			 {
+			      E T1j, T2a, T7C, T7J;
+			      T1j = TL + T1i;
+			      T2a = T1E + T29;
+			      T2b = T1j + T2a;
+			      T7x = T1j - T2a;
+			      T7C = T7e + T7f;
+			      T7J = T7D + T7I;
+			      T7K = T7C + T7J;
+			      T7M = T7J - T7C;
+			 }
+			 {
+			      E T2S, T3z, T7y, T7z;
+			      T2S = T2y + T2R;
+			      T3z = T3h + T3y;
+			      T3A = T2S + T3z;
+			      T7L = T3z - T2S;
+			      T7y = T7j + T7k;
+			      T7z = T7o + T7p;
+			      T7A = T7y - T7z;
+			      T7B = T7y + T7z;
+			 }
+			 Rm[WS(rs, 15)] = T2b - T3A;
+			 Im[WS(rs, 15)] = T7B - T7K;
+			 Rp[0] = T2b + T3A;
+			 Ip[0] = T7B + T7K;
+			 Rm[WS(rs, 7)] = T7x - T7A;
+			 Im[WS(rs, 7)] = T7L - T7M;
+			 Rp[WS(rs, 8)] = T7x + T7A;
+			 Ip[WS(rs, 8)] = T7L + T7M;
+		    }
+		    {
+			 E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
+			 {
+			      E T7d, T7g, T7O, T7P;
+			      T7d = TL - T1i;
+			      T7g = T7e - T7f;
+			      T7h = T7d + T7g;
+			      T7t = T7d - T7g;
+			      T7O = T29 - T1E;
+			      T7P = T7I - T7D;
+			      T7Q = T7O + T7P;
+			      T7S = T7P - T7O;
+			 }
+			 {
+			      E T7i, T7l, T7n, T7q;
+			      T7i = T2y - T2R;
+			      T7l = T7j - T7k;
+			      T7m = T7i + T7l;
+			      T7u = T7l - T7i;
+			      T7n = T3h - T3y;
+			      T7q = T7o - T7p;
+			      T7r = T7n - T7q;
+			      T7v = T7n + T7q;
+			 }
+			 {
+			      E T7s, T7N, T7w, T7R;
+			      T7s = KP707106781 * (T7m + T7r);
+			      Rm[WS(rs, 11)] = T7h - T7s;
+			      Rp[WS(rs, 4)] = T7h + T7s;
+			      T7N = KP707106781 * (T7u + T7v);
+			      Im[WS(rs, 11)] = T7N - T7Q;
+			      Ip[WS(rs, 4)] = T7N + T7Q;
+			      T7w = KP707106781 * (T7u - T7v);
+			      Rm[WS(rs, 3)] = T7t - T7w;
+			      Rp[WS(rs, 12)] = T7t + T7w;
+			      T7R = KP707106781 * (T7r - T7m);
+			      Im[WS(rs, 3)] = T7R - T7S;
+			      Ip[WS(rs, 12)] = T7R + T7S;
+			 }
+		    }
+		    {
+			 E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
+			 E T6V;
+			 {
+			      E T6o, T6t, T6A, T6F;
+			      T6j = T6f - T6i;
+			      T7X = T7V + T7W;
+			      T83 = T7W - T7V;
+			      T6X = T6f + T6i;
+			      T6o = T6m - T6n;
+			      T6t = T6p + T6s;
+			      T6u = KP707106781 * (T6o - T6t);
+			      T7U = KP707106781 * (T6o + T6t);
+			      {
+				   E T75, T76, T6Y, T6Z;
+				   T75 = T6H + T6K;
+				   T76 = T6O + T6P;
+				   T77 = FNMS(KP382683432, T76, KP923879532 * T75);
+				   T7b = FMA(KP923879532, T76, KP382683432 * T75);
+				   T6Y = T6n + T6m;
+				   T6Z = T6p - T6s;
+				   T70 = KP707106781 * (T6Y + T6Z);
+				   T82 = KP707106781 * (T6Z - T6Y);
+			      }
+			      T6A = T6y - T6z;
+			      T6F = T6B - T6E;
+			      T6G = FMA(KP923879532, T6A, KP382683432 * T6F);
+			      T6U = FNMS(KP923879532, T6F, KP382683432 * T6A);
+			      {
+				   E T72, T73, T6L, T6Q;
+				   T72 = T6y + T6z;
+				   T73 = T6B + T6E;
+				   T74 = FMA(KP382683432, T72, KP923879532 * T73);
+				   T7a = FNMS(KP382683432, T73, KP923879532 * T72);
+				   T6L = T6H - T6K;
+				   T6Q = T6O - T6P;
+				   T6R = FNMS(KP923879532, T6Q, KP382683432 * T6L);
+				   T6V = FMA(KP382683432, T6Q, KP923879532 * T6L);
+			      }
+			 }
+			 {
+			      E T6v, T6S, T81, T84;
+			      T6v = T6j + T6u;
+			      T6S = T6G + T6R;
+			      Rm[WS(rs, 9)] = T6v - T6S;
+			      Rp[WS(rs, 6)] = T6v + T6S;
+			      T81 = T6U + T6V;
+			      T84 = T82 + T83;
+			      Im[WS(rs, 9)] = T81 - T84;
+			      Ip[WS(rs, 6)] = T81 + T84;
+			 }
+			 {
+			      E T6T, T6W, T85, T86;
+			      T6T = T6j - T6u;
+			      T6W = T6U - T6V;
+			      Rm[WS(rs, 1)] = T6T - T6W;
+			      Rp[WS(rs, 14)] = T6T + T6W;
+			      T85 = T6R - T6G;
+			      T86 = T83 - T82;
+			      Im[WS(rs, 1)] = T85 - T86;
+			      Ip[WS(rs, 14)] = T85 + T86;
+			 }
+			 {
+			      E T71, T78, T7T, T7Y;
+			      T71 = T6X + T70;
+			      T78 = T74 + T77;
+			      Rm[WS(rs, 13)] = T71 - T78;
+			      Rp[WS(rs, 2)] = T71 + T78;
+			      T7T = T7a + T7b;
+			      T7Y = T7U + T7X;
+			      Im[WS(rs, 13)] = T7T - T7Y;
+			      Ip[WS(rs, 2)] = T7T + T7Y;
+			 }
+			 {
+			      E T79, T7c, T7Z, T80;
+			      T79 = T6X - T70;
+			      T7c = T7a - T7b;
+			      Rm[WS(rs, 5)] = T79 - T7c;
+			      Rp[WS(rs, 10)] = T79 + T7c;
+			      T7Z = T77 - T74;
+			      T80 = T7X - T7U;
+			      Im[WS(rs, 5)] = T7Z - T80;
+			      Ip[WS(rs, 10)] = T7Z + T80;
+			 }
+		    }
+		    {
+			 E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
+			 E T5b, T3Q, T8p;
+			 T3Q = KP707106781 * (T3K - T3P);
+			 T3R = T3F - T3Q;
+			 T5d = T3F + T3Q;
+			 T8p = KP707106781 * (T5v - T5u);
+			 T8r = T8p + T8q;
+			 T8x = T8q - T8p;
+			 {
+			      E T42, T4d, T5l, T5m;
+			      T42 = FNMS(KP923879532, T41, KP382683432 * T3W);
+			      T4d = FMA(KP382683432, T47, KP923879532 * T4c);
+			      T4e = T42 - T4d;
+			      T8o = T42 + T4d;
+			      T5l = T4L + T4W;
+			      T5m = T52 + T55;
+			      T5n = FNMS(KP555570233, T5m, KP831469612 * T5l);
+			      T5r = FMA(KP831469612, T5m, KP555570233 * T5l);
+			 }
+			 {
+			      E T4w, T4F, T5e, T5f;
+			      T4w = T4k - T4v;
+			      T4F = T4B - T4E;
+			      T4G = FMA(KP980785280, T4w, KP195090322 * T4F);
+			      T5a = FNMS(KP980785280, T4F, KP195090322 * T4w);
+			      T5e = FMA(KP923879532, T3W, KP382683432 * T41);
+			      T5f = FNMS(KP923879532, T47, KP382683432 * T4c);
+			      T5g = T5e + T5f;
+			      T8w = T5f - T5e;
+			 }
+			 {
+			      E T5i, T5j, T4X, T56;
+			      T5i = T4k + T4v;
+			      T5j = T4B + T4E;
+			      T5k = FMA(KP555570233, T5i, KP831469612 * T5j);
+			      T5q = FNMS(KP555570233, T5j, KP831469612 * T5i);
+			      T4X = T4L - T4W;
+			      T56 = T52 - T55;
+			      T57 = FNMS(KP980785280, T56, KP195090322 * T4X);
+			      T5b = FMA(KP195090322, T56, KP980785280 * T4X);
+			 }
+			 {
+			      E T4f, T58, T8v, T8y;
+			      T4f = T3R + T4e;
+			      T58 = T4G + T57;
+			      Rm[WS(rs, 8)] = T4f - T58;
+			      Rp[WS(rs, 7)] = T4f + T58;
+			      T8v = T5a + T5b;
+			      T8y = T8w + T8x;
+			      Im[WS(rs, 8)] = T8v - T8y;
+			      Ip[WS(rs, 7)] = T8v + T8y;
+			 }
+			 {
+			      E T59, T5c, T8z, T8A;
+			      T59 = T3R - T4e;
+			      T5c = T5a - T5b;
+			      Rm[0] = T59 - T5c;
+			      Rp[WS(rs, 15)] = T59 + T5c;
+			      T8z = T57 - T4G;
+			      T8A = T8x - T8w;
+			      Im[0] = T8z - T8A;
+			      Ip[WS(rs, 15)] = T8z + T8A;
+			 }
+			 {
+			      E T5h, T5o, T8n, T8s;
+			      T5h = T5d + T5g;
+			      T5o = T5k + T5n;
+			      Rm[WS(rs, 12)] = T5h - T5o;
+			      Rp[WS(rs, 3)] = T5h + T5o;
+			      T8n = T5q + T5r;
+			      T8s = T8o + T8r;
+			      Im[WS(rs, 12)] = T8n - T8s;
+			      Ip[WS(rs, 3)] = T8n + T8s;
+			 }
+			 {
+			      E T5p, T5s, T8t, T8u;
+			      T5p = T5d - T5g;
+			      T5s = T5q - T5r;
+			      Rm[WS(rs, 4)] = T5p - T5s;
+			      Rp[WS(rs, 11)] = T5p + T5s;
+			      T8t = T5n - T5k;
+			      T8u = T8r - T8o;
+			      Im[WS(rs, 4)] = T8t - T8u;
+			      Ip[WS(rs, 11)] = T8t + T8u;
+			 }
+		    }
+		    {
+			 E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
+			 E T5X, T5w, T89;
+			 T5w = KP707106781 * (T5u + T5v);
+			 T5x = T5t - T5w;
+			 T5Z = T5t + T5w;
+			 T89 = KP707106781 * (T3K + T3P);
+			 T8d = T89 + T8c;
+			 T8j = T8c - T89;
+			 {
+			      E T5A, T5D, T67, T68;
+			      T5A = FNMS(KP382683432, T5z, KP923879532 * T5y);
+			      T5D = FMA(KP923879532, T5B, KP382683432 * T5C);
+			      T5E = T5A - T5D;
+			      T88 = T5A + T5D;
+			      T67 = T5N + T5O;
+			      T68 = T5Q + T5R;
+			      T69 = FNMS(KP195090322, T68, KP980785280 * T67);
+			      T6d = FMA(KP195090322, T67, KP980785280 * T68);
+			 }
+			 {
+			      E T5I, T5L, T60, T61;
+			      T5I = T5G - T5H;
+			      T5L = T5J - T5K;
+			      T5M = FMA(KP555570233, T5I, KP831469612 * T5L);
+			      T5W = FNMS(KP831469612, T5I, KP555570233 * T5L);
+			      T60 = FMA(KP382683432, T5y, KP923879532 * T5z);
+			      T61 = FNMS(KP382683432, T5B, KP923879532 * T5C);
+			      T62 = T60 + T61;
+			      T8i = T61 - T60;
+			 }
+			 {
+			      E T64, T65, T5P, T5S;
+			      T64 = T5G + T5H;
+			      T65 = T5J + T5K;
+			      T66 = FMA(KP980785280, T64, KP195090322 * T65);
+			      T6c = FNMS(KP195090322, T64, KP980785280 * T65);
+			      T5P = T5N - T5O;
+			      T5S = T5Q - T5R;
+			      T5T = FNMS(KP831469612, T5S, KP555570233 * T5P);
+			      T5X = FMA(KP831469612, T5P, KP555570233 * T5S);
+			 }
+			 {
+			      E T5F, T5U, T8h, T8k;
+			      T5F = T5x + T5E;
+			      T5U = T5M + T5T;
+			      Rm[WS(rs, 10)] = T5F - T5U;
+			      Rp[WS(rs, 5)] = T5F + T5U;
+			      T8h = T5W + T5X;
+			      T8k = T8i + T8j;
+			      Im[WS(rs, 10)] = T8h - T8k;
+			      Ip[WS(rs, 5)] = T8h + T8k;
+			 }
+			 {
+			      E T5V, T5Y, T8l, T8m;
+			      T5V = T5x - T5E;
+			      T5Y = T5W - T5X;
+			      Rm[WS(rs, 2)] = T5V - T5Y;
+			      Rp[WS(rs, 13)] = T5V + T5Y;
+			      T8l = T5T - T5M;
+			      T8m = T8j - T8i;
+			      Im[WS(rs, 2)] = T8l - T8m;
+			      Ip[WS(rs, 13)] = T8l + T8m;
+			 }
+			 {
+			      E T63, T6a, T87, T8e;
+			      T63 = T5Z + T62;
+			      T6a = T66 + T69;
+			      Rm[WS(rs, 14)] = T63 - T6a;
+			      Rp[WS(rs, 1)] = T63 + T6a;
+			      T87 = T6c + T6d;
+			      T8e = T88 + T8d;
+			      Im[WS(rs, 14)] = T87 - T8e;
+			      Ip[WS(rs, 1)] = T87 + T8e;
+			 }
+			 {
+			      E T6b, T6e, T8f, T8g;
+			      T6b = T5Z - T62;
+			      T6e = T6c - T6d;
+			      Rm[WS(rs, 6)] = T6b - T6e;
+			      Rp[WS(rs, 9)] = T6b + T6e;
+			      T8f = T69 - T66;
+			      T8g = T8d - T88;
+			      Im[WS(rs, 6)] = T8f - T8g;
+			      Ip[WS(rs, 9)] = T8f + T8g;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cf2_32", twinstr, &GENUS, {376, 168, 112, 0} };
+
+void X(codelet_hc2cf2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_32, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:42 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include hc2cf.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 33 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Ti, Tq, To, Te, Ty, TA, Tm, Ts;
+	       {
+		    E T2, T6, T3, T5;
+		    T2 = W[0];
+		    T6 = W[3];
+		    T3 = W[2];
+		    T5 = W[1];
+		    {
+			 E T1, Tx, Td, Tw, Tj, Tl, Ta, T4, Tk, Tr;
+			 T1 = Rp[0];
+			 Ta = T2 * T6;
+			 T4 = T2 * T3;
+			 Tx = Rm[0];
+			 {
+			      E T8, Tb, T7, Tc;
+			      T8 = Rp[WS(rs, 1)];
+			      Tb = FNMS(T5, T3, Ta);
+			      T7 = FMA(T5, T6, T4);
+			      Tc = Rm[WS(rs, 1)];
+			      {
+				   E Tf, Th, T9, Tv, Tg, Tp;
+				   Tf = Ip[0];
+				   Th = Im[0];
+				   T9 = T7 * T8;
+				   Tv = T7 * Tc;
+				   Tg = T2 * Tf;
+				   Tp = T2 * Th;
+				   Td = FMA(Tb, Tc, T9);
+				   Tw = FNMS(Tb, T8, Tv);
+				   Ti = FMA(T5, Th, Tg);
+				   Tq = FNMS(T5, Tf, Tp);
+			      }
+			      Tj = Ip[WS(rs, 1)];
+			      Tl = Im[WS(rs, 1)];
+			 }
+			 To = T1 - Td;
+			 Te = T1 + Td;
+			 Ty = Tw + Tx;
+			 TA = Tx - Tw;
+			 Tk = T3 * Tj;
+			 Tr = T3 * Tl;
+			 Tm = FMA(T6, Tl, Tk);
+			 Ts = FNMS(T6, Tj, Tr);
+		    }
+	       }
+	       {
+		    E Tn, Tz, Tu, Tt;
+		    Tn = Ti + Tm;
+		    Tz = Tm - Ti;
+		    Tu = Tq + Ts;
+		    Tt = Tq - Ts;
+		    Ip[WS(rs, 1)] = Tz + TA;
+		    Im[0] = Tz - TA;
+		    Rp[0] = Te + Tn;
+		    Rm[WS(rs, 1)] = Te - Tn;
+		    Rp[WS(rs, 1)] = To + Tt;
+		    Rm[0] = To - Tt;
+		    Ip[0] = Tu + Ty;
+		    Im[WS(rs, 1)] = Tu - Ty;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hc2cf2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include hc2cf.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T2, T4, T3, T5, T6, T8;
+	       T2 = W[0];
+	       T4 = W[1];
+	       T3 = W[2];
+	       T5 = W[3];
+	       T6 = FMA(T2, T3, T4 * T5);
+	       T8 = FNMS(T4, T3, T2 * T5);
+	       {
+		    E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
+		    T1 = Rp[0];
+		    Tp = Rm[0];
+		    T7 = Rp[WS(rs, 1)];
+		    T9 = Rm[WS(rs, 1)];
+		    Ta = FMA(T6, T7, T8 * T9);
+		    To = FNMS(T8, T7, T6 * T9);
+		    {
+			 E Tc, Td, Tf, Tg;
+			 Tc = Ip[0];
+			 Td = Im[0];
+			 Te = FMA(T2, Tc, T4 * Td);
+			 Tk = FNMS(T4, Tc, T2 * Td);
+			 Tf = Ip[WS(rs, 1)];
+			 Tg = Im[WS(rs, 1)];
+			 Th = FMA(T3, Tf, T5 * Tg);
+			 Tl = FNMS(T5, Tf, T3 * Tg);
+		    }
+		    {
+			 E Tb, Ti, Tn, Tq;
+			 Tb = T1 + Ta;
+			 Ti = Te + Th;
+			 Rm[WS(rs, 1)] = Tb - Ti;
+			 Rp[0] = Tb + Ti;
+			 Tn = Tk + Tl;
+			 Tq = To + Tp;
+			 Im[WS(rs, 1)] = Tn - Tq;
+			 Ip[0] = Tn + Tq;
+		    }
+		    {
+			 E Tj, Tm, Tr, Ts;
+			 Tj = T1 - Ta;
+			 Tm = Tk - Tl;
+			 Rm[0] = Tj - Tm;
+			 Rp[WS(rs, 1)] = Tj + Tm;
+			 Tr = Th - Te;
+			 Ts = Tp - To;
+			 Im[0] = Tr - Ts;
+			 Ip[WS(rs, 1)] = Tr + Ts;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hc2cf2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:42 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include hc2cf.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 64 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E TS, T1m, TJ, T1l, T1k, Tw, T1w, T1u;
+	       {
+		    E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
+		    T2 = W[0];
+		    T3 = W[2];
+		    Tl = W[4];
+		    Tn = W[5];
+		    T5 = W[1];
+		    T4 = T2 * T3;
+		    Tm = T2 * Tl;
+		    Tr = T2 * Tn;
+		    T6 = W[3];
+		    {
+			 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
+			 E TE, T14;
+			 {
+			      E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
+			      E Tj;
+			      T1 = Rp[0];
+			      To = FMA(T5, Tn, Tm);
+			      Ts = FNMS(T5, Tl, Tr);
+			      Tf = FMA(T5, T6, T4);
+			      T7 = FNMS(T5, T6, T4);
+			      Ta = T2 * T6;
+			      T1s = Rm[0];
+			      T8 = Rp[WS(rs, 2)];
+			      TF = Tf * Tn;
+			      TB = Tf * Tl;
+			      Ti = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      T9 = T7 * T8;
+			      Tc = Rm[WS(rs, 2)];
+			      TG = FNMS(Ti, Tl, TF);
+			      TC = FMA(Ti, Tn, TB);
+			      {
+				   E Tp, T1q, Tt, Tq, TX;
+				   Tp = Rp[WS(rs, 3)];
+				   Td = FMA(Tb, Tc, T9);
+				   T1q = T7 * Tc;
+				   Tt = Rm[WS(rs, 3)];
+				   Tq = To * Tp;
+				   Tg = Rp[WS(rs, 1)];
+				   T1r = FNMS(Tb, T8, T1q);
+				   TX = To * Tt;
+				   Tu = FMA(Ts, Tt, Tq);
+				   Th = Tf * Tg;
+				   Tj = Rm[WS(rs, 1)];
+				   TY = FNMS(Ts, Tp, TX);
+			      }
+			      {
+				   E TO, TQ, TN, TP, T1a, T1b;
+				   {
+					E TK, TM, TL, T19, TV;
+					TK = Ip[WS(rs, 3)];
+					TM = Im[WS(rs, 3)];
+					Tk = FMA(Ti, Tj, Th);
+					TV = Tf * Tj;
+					TL = Tl * TK;
+					T19 = Tl * TM;
+					TO = Ip[WS(rs, 1)];
+					TW = FNMS(Ti, Tg, TV);
+					TQ = Im[WS(rs, 1)];
+					TN = FMA(Tn, TM, TL);
+					TP = T3 * TO;
+					T1a = FNMS(Tn, TK, T19);
+					T1b = T3 * TQ;
+				   }
+				   {
+					E Tx, Tz, Ty, T12, T1c, TR;
+					Tx = Ip[0];
+					TR = FMA(T6, TQ, TP);
+					Tz = Im[0];
+					T1c = FNMS(T6, TO, T1b);
+					Ty = T2 * Tx;
+					T18 = TN - TR;
+					TS = TN + TR;
+					T12 = T2 * Tz;
+					T1d = T1a - T1c;
+					T1m = T1a + T1c;
+					TD = Ip[WS(rs, 2)];
+					TH = Im[WS(rs, 2)];
+					TA = FMA(T5, Tz, Ty);
+					T13 = FNMS(T5, Tx, T12);
+					TE = TC * TD;
+					T14 = TC * TH;
+				   }
+			      }
+			 }
+			 {
+			      E Te, T1p, T1t, Tv;
+			      {
+				   E T1g, T10, T1z, T1B, T1A, T1j, T1C, T1f;
+				   {
+					E T1x, T11, T16, T1y;
+					{
+					     E TU, TZ, TI, T15;
+					     Te = T1 + Td;
+					     TU = T1 - Td;
+					     TZ = TW - TY;
+					     T1p = TW + TY;
+					     TI = FMA(TG, TH, TE);
+					     T15 = FNMS(TG, TD, T14);
+					     T1t = T1r + T1s;
+					     T1x = T1s - T1r;
+					     T1g = TU - TZ;
+					     T10 = TU + TZ;
+					     T11 = TA - TI;
+					     TJ = TA + TI;
+					     T1l = T13 + T15;
+					     T16 = T13 - T15;
+					     T1y = Tk - Tu;
+					     Tv = Tk + Tu;
+					}
+					{
+					     E T1i, T1e, T17, T1h;
+					     T1i = T18 + T1d;
+					     T1e = T18 - T1d;
+					     T17 = T11 + T16;
+					     T1h = T16 - T11;
+					     T1z = T1x - T1y;
+					     T1B = T1y + T1x;
+					     T1A = T1h + T1i;
+					     T1j = T1h - T1i;
+					     T1C = T1e - T17;
+					     T1f = T17 + T1e;
+					}
+				   }
+				   Rm[0] = FNMS(KP707106781, T1j, T1g);
+				   Im[0] = FMS(KP707106781, T1C, T1B);
+				   Rp[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
+				   Rm[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
+				   Ip[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
+				   Im[WS(rs, 2)] = FMS(KP707106781, T1A, T1z);
+				   Rp[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
+				   Ip[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
+			      }
+			      T1k = Te - Tv;
+			      Tw = Te + Tv;
+			      T1w = T1t - T1p;
+			      T1u = T1p + T1t;
+			 }
+		    }
+	       }
+	       {
+		    E TT, T1v, T1n, T1o;
+		    TT = TJ + TS;
+		    T1v = TS - TJ;
+		    T1n = T1l - T1m;
+		    T1o = T1l + T1m;
+		    Ip[WS(rs, 2)] = T1v + T1w;
+		    Im[WS(rs, 1)] = T1v - T1w;
+		    Rp[0] = Tw + TT;
+		    Rm[WS(rs, 3)] = Tw - TT;
+		    Ip[0] = T1o + T1u;
+		    Im[WS(rs, 3)] = T1o - T1u;
+		    Rp[WS(rs, 2)] = T1k + T1n;
+		    Rm[WS(rs, 1)] = T1k - T1n;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {44, 20, 30, 0} };
+
+void X(codelet_hc2cf2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include hc2cf.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 42 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
+	       {
+		    E T4, Tb, T7, Ta;
+		    T2 = W[0];
+		    T5 = W[1];
+		    T3 = W[2];
+		    T6 = W[3];
+		    T4 = T2 * T3;
+		    Tb = T5 * T3;
+		    T7 = T5 * T6;
+		    Ta = T2 * T6;
+		    T8 = T4 - T7;
+		    Tc = Ta + Tb;
+		    Tg = T4 + T7;
+		    Ti = Ta - Tb;
+		    Tl = W[4];
+		    Tm = W[5];
+		    Tn = FMA(T2, Tl, T5 * Tm);
+		    Tz = FNMS(Ti, Tl, Tg * Tm);
+		    Tp = FNMS(T5, Tl, T2 * Tm);
+		    Tx = FMA(Tg, Tl, Ti * Tm);
+	       }
+	       {
+		    E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
+		    E TT;
+		    {
+			 E T1, T1c, Te, T1b, T9, Td;
+			 T1 = Rp[0];
+			 T1c = Rm[0];
+			 T9 = Rp[WS(rs, 2)];
+			 Td = Rm[WS(rs, 2)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T1b = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T1i = T1c - T1b;
+			 TL = T1 - Te;
+			 T1d = T1b + T1c;
+		    }
+		    {
+			 E TF, TW, TI, TX;
+			 {
+			      E TD, TE, TG, TH;
+			      TD = Ip[WS(rs, 3)];
+			      TE = Im[WS(rs, 3)];
+			      TF = FMA(Tl, TD, Tm * TE);
+			      TW = FNMS(Tm, TD, Tl * TE);
+			      TG = Ip[WS(rs, 1)];
+			      TH = Im[WS(rs, 1)];
+			      TI = FMA(T3, TG, T6 * TH);
+			      TX = FNMS(T6, TG, T3 * TH);
+			 }
+			 TJ = TF + TI;
+			 T17 = TW + TX;
+			 TV = TF - TI;
+			 TY = TW - TX;
+		    }
+		    {
+			 E Tk, TM, Tr, TN;
+			 {
+			      E Th, Tj, To, Tq;
+			      Th = Rp[WS(rs, 1)];
+			      Tj = Rm[WS(rs, 1)];
+			      Tk = FMA(Tg, Th, Ti * Tj);
+			      TM = FNMS(Ti, Th, Tg * Tj);
+			      To = Rp[WS(rs, 3)];
+			      Tq = Rm[WS(rs, 3)];
+			      Tr = FMA(Tn, To, Tp * Tq);
+			      TN = FNMS(Tp, To, Tn * Tq);
+			 }
+			 Ts = Tk + Tr;
+			 T1j = Tk - Tr;
+			 TO = TM - TN;
+			 T1a = TM + TN;
+		    }
+		    {
+			 E Tw, TR, TB, TS;
+			 {
+			      E Tu, Tv, Ty, TA;
+			      Tu = Ip[0];
+			      Tv = Im[0];
+			      Tw = FMA(T2, Tu, T5 * Tv);
+			      TR = FNMS(T5, Tu, T2 * Tv);
+			      Ty = Ip[WS(rs, 2)];
+			      TA = Im[WS(rs, 2)];
+			      TB = FMA(Tx, Ty, Tz * TA);
+			      TS = FNMS(Tz, Ty, Tx * TA);
+			 }
+			 TC = Tw + TB;
+			 T16 = TR + TS;
+			 TQ = Tw - TB;
+			 TT = TR - TS;
+		    }
+		    {
+			 E Tt, TK, T1f, T1g;
+			 Tt = Tf + Ts;
+			 TK = TC + TJ;
+			 Rm[WS(rs, 3)] = Tt - TK;
+			 Rp[0] = Tt + TK;
+			 {
+			      E T19, T1e, T15, T18;
+			      T19 = T16 + T17;
+			      T1e = T1a + T1d;
+			      Im[WS(rs, 3)] = T19 - T1e;
+			      Ip[0] = T19 + T1e;
+			      T15 = Tf - Ts;
+			      T18 = T16 - T17;
+			      Rm[WS(rs, 1)] = T15 - T18;
+			      Rp[WS(rs, 2)] = T15 + T18;
+			 }
+			 T1f = TJ - TC;
+			 T1g = T1d - T1a;
+			 Im[WS(rs, 1)] = T1f - T1g;
+			 Ip[WS(rs, 2)] = T1f + T1g;
+			 {
+			      E T11, T1k, T14, T1h, T12, T13;
+			      T11 = TL - TO;
+			      T1k = T1i - T1j;
+			      T12 = TT - TQ;
+			      T13 = TV + TY;
+			      T14 = KP707106781 * (T12 - T13);
+			      T1h = KP707106781 * (T12 + T13);
+			      Rm[0] = T11 - T14;
+			      Ip[WS(rs, 1)] = T1h + T1k;
+			      Rp[WS(rs, 3)] = T11 + T14;
+			      Im[WS(rs, 2)] = T1h - T1k;
+			 }
+			 {
+			      E TP, T1m, T10, T1l, TU, TZ;
+			      TP = TL + TO;
+			      T1m = T1j + T1i;
+			      TU = TQ + TT;
+			      TZ = TV - TY;
+			      T10 = KP707106781 * (TU + TZ);
+			      T1l = KP707106781 * (TZ - TU);
+			      Rm[WS(rs, 2)] = TP - T10;
+			      Ip[WS(rs, 3)] = T1l + T1m;
+			      Rp[WS(rs, 1)] = TP + T10;
+			      Im[0] = T1l - T1m;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {56, 26, 18, 0} };
+
+void X(codelet_hc2cf2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include hc2cf.h */
+
+/*
+ * This function contains 102 FP additions, 72 FP multiplications,
+ * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
+ * 70 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T1X, T21, T20, T22;
+	       {
+		    E T26, T1U, T8, T12, T1n, T1P, T24, T1K, T1Y, T18, T10, T2b, T1H, T23, T15;
+		    E T1Z, T2a, Tz, T1O, T1y;
+		    {
+			 E T1, T1T, T3, T6, T2, T5;
+			 T1 = Rp[0];
+			 T1T = Rm[0];
+			 T3 = Ip[WS(rs, 2)];
+			 T6 = Im[WS(rs, 2)];
+			 T2 = W[8];
+			 T5 = W[9];
+			 {
+			      E T1l, TY, T1h, T1J, TM, T16, T1j, TS;
+			      {
+				   E TF, T1e, TO, TR, T1g, TL, TN, TQ, T1i, TP;
+				   {
+					E TU, TX, TT, TW;
+					{
+					     E TB, TE, T1R, T4, TA, TD;
+					     TB = Rp[WS(rs, 2)];
+					     TE = Rm[WS(rs, 2)];
+					     T1R = T2 * T6;
+					     T4 = T2 * T3;
+					     TA = W[6];
+					     TD = W[7];
+					     {
+						  E T1S, T7, T1d, TC;
+						  T1S = FNMS(T5, T3, T1R);
+						  T7 = FMA(T5, T6, T4);
+						  T1d = TA * TE;
+						  TC = TA * TB;
+						  T26 = T1T - T1S;
+						  T1U = T1S + T1T;
+						  T8 = T1 - T7;
+						  T12 = T1 + T7;
+						  TF = FMA(TD, TE, TC);
+						  T1e = FNMS(TD, TB, T1d);
+					     }
+					}
+					TU = Ip[0];
+					TX = Im[0];
+					TT = W[0];
+					TW = W[1];
+					{
+					     E TH, TK, TJ, T1f, TI, T1k, TV, TG;
+					     TH = Ip[WS(rs, 4)];
+					     TK = Im[WS(rs, 4)];
+					     T1k = TT * TX;
+					     TV = TT * TU;
+					     TG = W[16];
+					     TJ = W[17];
+					     T1l = FNMS(TW, TU, T1k);
+					     TY = FMA(TW, TX, TV);
+					     T1f = TG * TK;
+					     TI = TG * TH;
+					     TO = Rp[WS(rs, 3)];
+					     TR = Rm[WS(rs, 3)];
+					     T1g = FNMS(TJ, TH, T1f);
+					     TL = FMA(TJ, TK, TI);
+					     TN = W[10];
+					     TQ = W[11];
+					}
+				   }
+				   T1h = T1e + T1g;
+				   T1J = T1g - T1e;
+				   TM = TF - TL;
+				   T16 = TF + TL;
+				   T1i = TN * TR;
+				   TP = TN * TO;
+				   T1j = FNMS(TQ, TO, T1i);
+				   TS = FMA(TQ, TR, TP);
+			      }
+			      {
+				   E T1p, Te, T1w, Tx, Tn, Tq, Tp, T1r, Tk, T1t, To;
+				   {
+					E Tt, Tw, Tv, T1v, Tu;
+					{
+					     E Ta, Td, T9, Tc, T1o, Tb, Ts;
+					     Ta = Rp[WS(rs, 1)];
+					     Td = Rm[WS(rs, 1)];
+					     {
+						  E T1I, T1m, TZ, T17;
+						  T1I = T1l - T1j;
+						  T1m = T1j + T1l;
+						  TZ = TS - TY;
+						  T17 = TS + TY;
+						  T1n = T1h - T1m;
+						  T1P = T1h + T1m;
+						  T24 = T1J + T1I;
+						  T1K = T1I - T1J;
+						  T1Y = T16 - T17;
+						  T18 = T16 + T17;
+						  T10 = TM + TZ;
+						  T2b = TZ - TM;
+						  T9 = W[2];
+					     }
+					     Tc = W[3];
+					     Tt = Ip[WS(rs, 1)];
+					     Tw = Im[WS(rs, 1)];
+					     T1o = T9 * Td;
+					     Tb = T9 * Ta;
+					     Ts = W[4];
+					     Tv = W[5];
+					     T1p = FNMS(Tc, Ta, T1o);
+					     Te = FMA(Tc, Td, Tb);
+					     T1v = Ts * Tw;
+					     Tu = Ts * Tt;
+					}
+					{
+					     E Tg, Tj, Tf, Ti, T1q, Th, Tm;
+					     Tg = Ip[WS(rs, 3)];
+					     Tj = Im[WS(rs, 3)];
+					     T1w = FNMS(Tv, Tt, T1v);
+					     Tx = FMA(Tv, Tw, Tu);
+					     Tf = W[12];
+					     Ti = W[13];
+					     Tn = Rp[WS(rs, 4)];
+					     Tq = Rm[WS(rs, 4)];
+					     T1q = Tf * Tj;
+					     Th = Tf * Tg;
+					     Tm = W[14];
+					     Tp = W[15];
+					     T1r = FNMS(Ti, Tg, T1q);
+					     Tk = FMA(Ti, Tj, Th);
+					     T1t = Tm * Tq;
+					     To = Tm * Tn;
+					}
+				   }
+				   {
+					E T1s, T1G, Tl, T13, T1u, Tr;
+					T1s = T1p + T1r;
+					T1G = T1r - T1p;
+					Tl = Te - Tk;
+					T13 = Te + Tk;
+					T1u = FNMS(Tp, Tn, T1t);
+					Tr = FMA(Tp, Tq, To);
+					{
+					     E T1x, T1F, T14, Ty;
+					     T1x = T1u + T1w;
+					     T1F = T1w - T1u;
+					     T14 = Tr + Tx;
+					     Ty = Tr - Tx;
+					     T1H = T1F - T1G;
+					     T23 = T1G + T1F;
+					     T15 = T13 + T14;
+					     T1Z = T13 - T14;
+					     T2a = Ty - Tl;
+					     Tz = Tl + Ty;
+					     T1O = T1s + T1x;
+					     T1y = T1s - T1x;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2c, T2e, T29, T2d;
+			 {
+			      E T1D, T11, T25, T28, T27;
+			      T1D = Tz - T10;
+			      T11 = Tz + T10;
+			      T25 = T23 + T24;
+			      T28 = T24 - T23;
+			      {
+				   E T1N, T1L, T1C, T1M, T1E;
+				   T1N = FNMS(KP618033988, T1H, T1K);
+				   T1L = FMA(KP618033988, T1K, T1H);
+				   Rm[WS(rs, 4)] = T8 + T11;
+				   T1C = FNMS(KP250000000, T11, T8);
+				   T1M = FNMS(KP559016994, T1D, T1C);
+				   T1E = FMA(KP559016994, T1D, T1C);
+				   T27 = FMA(KP250000000, T25, T26);
+				   T2c = FMA(KP618033988, T2b, T2a);
+				   T2e = FNMS(KP618033988, T2a, T2b);
+				   Rp[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
+				   Rm[0] = FNMS(KP951056516, T1L, T1E);
+				   Rp[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
+				   Rm[WS(rs, 2)] = FNMS(KP951056516, T1N, T1M);
+			      }
+			      Im[WS(rs, 4)] = T25 - T26;
+			      T29 = FMA(KP559016994, T28, T27);
+			      T2d = FNMS(KP559016994, T28, T27);
+			 }
+			 {
+			      E T1c, T1A, T1z, T1B, T19, T1b, T1a, T1Q, T1W, T1V;
+			      T19 = T15 + T18;
+			      T1b = T15 - T18;
+			      Ip[WS(rs, 3)] = FMA(KP951056516, T2e, T2d);
+			      Im[WS(rs, 2)] = FMS(KP951056516, T2e, T2d);
+			      Ip[WS(rs, 1)] = FMA(KP951056516, T2c, T29);
+			      Im[0] = FMS(KP951056516, T2c, T29);
+			      T1a = FNMS(KP250000000, T19, T12);
+			      Rp[0] = T12 + T19;
+			      T1c = FNMS(KP559016994, T1b, T1a);
+			      T1A = FMA(KP559016994, T1b, T1a);
+			      T1z = FNMS(KP618033988, T1y, T1n);
+			      T1B = FMA(KP618033988, T1n, T1y);
+			      T1Q = T1O + T1P;
+			      T1W = T1O - T1P;
+			      Rm[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
+			      Rp[WS(rs, 4)] = FNMS(KP951056516, T1B, T1A);
+			      Rm[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
+			      Rp[WS(rs, 2)] = FNMS(KP951056516, T1z, T1c);
+			      T1V = FNMS(KP250000000, T1Q, T1U);
+			      Ip[0] = T1Q + T1U;
+			      T1X = FNMS(KP559016994, T1W, T1V);
+			      T21 = FMA(KP559016994, T1W, T1V);
+			      T20 = FNMS(KP618033988, T1Z, T1Y);
+			      T22 = FMA(KP618033988, T1Y, T1Z);
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 4)] = FMA(KP951056516, T22, T21);
+	       Im[WS(rs, 3)] = FMS(KP951056516, T22, T21);
+	       Ip[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
+	       Im[WS(rs, 1)] = FMS(KP951056516, T20, T1X);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, {48, 18, 54, 0} };
+
+void X(codelet_hc2cf_10) (planner *p) {
+     X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include hc2cf.h */
+
+/*
+ * This function contains 102 FP additions, 60 FP multiplications,
+ * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
+ * 45 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T7, T1O, TT, T1C, TF, TQ, TR, T1r, T1s, T1L, TX, TY, TZ, T16, T19;
+	       E T1y, Ti, Tt, Tu, T1o, T1p, T1M, TU, TV, TW, T1d, T1g, T1x;
+	       {
+		    E T1, T1B, T6, T1A;
+		    T1 = Rp[0];
+		    T1B = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Ip[WS(rs, 2)];
+			 T5 = Im[WS(rs, 2)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T1A = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    T1O = T1B - T1A;
+		    TT = T1 + T6;
+		    T1C = T1A + T1B;
+	       }
+	       {
+		    E Tz, T14, TP, T18, TE, T15, TK, T17;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = Rp[WS(rs, 2)];
+			 Ty = Rm[WS(rs, 2)];
+			 Tv = W[6];
+			 Tx = W[7];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T14 = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TM, TO, TL, TN;
+			 TM = Ip[0];
+			 TO = Im[0];
+			 TL = W[0];
+			 TN = W[1];
+			 TP = FMA(TL, TM, TN * TO);
+			 T18 = FNMS(TN, TM, TL * TO);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = Ip[WS(rs, 4)];
+			 TD = Im[WS(rs, 4)];
+			 TA = W[16];
+			 TC = W[17];
+			 TE = FMA(TA, TB, TC * TD);
+			 T15 = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = Rp[WS(rs, 3)];
+			 TJ = Rm[WS(rs, 3)];
+			 TG = W[10];
+			 TI = W[11];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T17 = FNMS(TI, TH, TG * TJ);
+		    }
+		    TF = Tz - TE;
+		    TQ = TK - TP;
+		    TR = TF + TQ;
+		    T1r = T14 - T15;
+		    T1s = T18 - T17;
+		    T1L = T1s - T1r;
+		    TX = Tz + TE;
+		    TY = TK + TP;
+		    TZ = TX + TY;
+		    T16 = T14 + T15;
+		    T19 = T17 + T18;
+		    T1y = T16 + T19;
+	       }
+	       {
+		    E Tc, T1b, Ts, T1f, Th, T1c, Tn, T1e;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Rp[WS(rs, 1)];
+			 Tb = Rm[WS(rs, 1)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T1b = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = Ip[WS(rs, 1)];
+			 Tr = Im[WS(rs, 1)];
+			 To = W[4];
+			 Tq = W[5];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1f = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Ip[WS(rs, 3)];
+			 Tg = Im[WS(rs, 3)];
+			 Td = W[12];
+			 Tf = W[13];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T1c = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = Rp[WS(rs, 4)];
+			 Tm = Rm[WS(rs, 4)];
+			 Tj = W[14];
+			 Tl = W[15];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1e = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    Ti = Tc - Th;
+		    Tt = Tn - Ts;
+		    Tu = Ti + Tt;
+		    T1o = T1b - T1c;
+		    T1p = T1e - T1f;
+		    T1M = T1o + T1p;
+		    TU = Tc + Th;
+		    TV = Tn + Ts;
+		    TW = TU + TV;
+		    T1d = T1b + T1c;
+		    T1g = T1e + T1f;
+		    T1x = T1d + T1g;
+	       }
+	       {
+		    E T1l, TS, T1m, T1u, T1w, T1q, T1t, T1v, T1n;
+		    T1l = KP559016994 * (Tu - TR);
+		    TS = Tu + TR;
+		    T1m = FNMS(KP250000000, TS, T7);
+		    T1q = T1o - T1p;
+		    T1t = T1r + T1s;
+		    T1u = FMA(KP951056516, T1q, KP587785252 * T1t);
+		    T1w = FNMS(KP587785252, T1q, KP951056516 * T1t);
+		    Rm[WS(rs, 4)] = T7 + TS;
+		    T1v = T1m - T1l;
+		    Rm[WS(rs, 2)] = T1v - T1w;
+		    Rp[WS(rs, 3)] = T1v + T1w;
+		    T1n = T1l + T1m;
+		    Rm[0] = T1n - T1u;
+		    Rp[WS(rs, 1)] = T1n + T1u;
+	       }
+	       {
+		    E T1S, T1N, T1T, T1R, T1V, T1P, T1Q, T1W, T1U;
+		    T1S = KP559016994 * (T1M + T1L);
+		    T1N = T1L - T1M;
+		    T1T = FMA(KP250000000, T1N, T1O);
+		    T1P = TQ - TF;
+		    T1Q = Ti - Tt;
+		    T1R = FNMS(KP951056516, T1Q, KP587785252 * T1P);
+		    T1V = FMA(KP587785252, T1Q, KP951056516 * T1P);
+		    Im[WS(rs, 4)] = T1N - T1O;
+		    T1W = T1T - T1S;
+		    Im[WS(rs, 2)] = T1V - T1W;
+		    Ip[WS(rs, 3)] = T1V + T1W;
+		    T1U = T1S + T1T;
+		    Im[0] = T1R - T1U;
+		    Ip[WS(rs, 1)] = T1R + T1U;
+	       }
+	       {
+		    E T12, T10, T11, T1i, T1k, T1a, T1h, T1j, T13;
+		    T12 = KP559016994 * (TW - TZ);
+		    T10 = TW + TZ;
+		    T11 = FNMS(KP250000000, T10, TT);
+		    T1a = T16 - T19;
+		    T1h = T1d - T1g;
+		    T1i = FNMS(KP587785252, T1h, KP951056516 * T1a);
+		    T1k = FMA(KP951056516, T1h, KP587785252 * T1a);
+		    Rp[0] = TT + T10;
+		    T1j = T12 + T11;
+		    Rp[WS(rs, 4)] = T1j - T1k;
+		    Rm[WS(rs, 3)] = T1j + T1k;
+		    T13 = T11 - T12;
+		    Rp[WS(rs, 2)] = T13 - T1i;
+		    Rm[WS(rs, 1)] = T13 + T1i;
+	       }
+	       {
+		    E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
+		    T1H = KP559016994 * (T1x - T1y);
+		    T1z = T1x + T1y;
+		    T1G = FNMS(KP250000000, T1z, T1C);
+		    T1D = TX - TY;
+		    T1E = TU - TV;
+		    T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
+		    T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
+		    Ip[0] = T1z + T1C;
+		    T1K = T1H + T1G;
+		    Im[WS(rs, 3)] = T1J - T1K;
+		    Ip[WS(rs, 4)] = T1J + T1K;
+		    T1I = T1G - T1H;
+		    Im[WS(rs, 1)] = T1F - T1I;
+		    Ip[WS(rs, 2)] = T1F + T1I;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, {72, 30, 30, 0} };
+
+void X(codelet_hc2cf_10) (planner *p) {
+     X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:31 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include hc2cf.h */
+
+/*
+ * This function contains 118 FP additions, 68 FP multiplications,
+ * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
+ * 84 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T2n, T2u;
+	       {
+		    E T1, T2i, T2e, Tl, T1Y, T10, T1S, TG, T2f, T1s, T2s, Ty, T1Z, T1H, T21;
+		    E T1d, TI, TL, T2h, T1l, T2p, Te, TJ, T1w, TO, TR, TN, TK, TQ;
+		    {
+			 E TW, TZ, TY, T1X, TX;
+			 T1 = Rp[0];
+			 T2i = Rm[0];
+			 {
+			      E Th, Tk, Tg, Tj, T2d, Ti, TV;
+			      Th = Rp[WS(rs, 3)];
+			      Tk = Rm[WS(rs, 3)];
+			      Tg = W[10];
+			      Tj = W[11];
+			      TW = Ip[WS(rs, 4)];
+			      TZ = Im[WS(rs, 4)];
+			      T2d = Tg * Tk;
+			      Ti = Tg * Th;
+			      TV = W[16];
+			      TY = W[17];
+			      T2e = FNMS(Tj, Th, T2d);
+			      Tl = FMA(Tj, Tk, Ti);
+			      T1X = TV * TZ;
+			      TX = TV * TW;
+			 }
+			 {
+			      E Tn, Tq, Tt, T1o, To, Tw, Ts, Tp, Tv;
+			      {
+				   E TC, TF, TB, TE, T1R, TD, Tm;
+				   TC = Ip[WS(rs, 1)];
+				   TF = Im[WS(rs, 1)];
+				   T1Y = FNMS(TY, TW, T1X);
+				   T10 = FMA(TY, TZ, TX);
+				   TB = W[4];
+				   TE = W[5];
+				   Tn = Rp[WS(rs, 5)];
+				   Tq = Rm[WS(rs, 5)];
+				   T1R = TB * TF;
+				   TD = TB * TC;
+				   Tm = W[18];
+				   Tt = Rp[WS(rs, 1)];
+				   T1S = FNMS(TE, TC, T1R);
+				   TG = FMA(TE, TF, TD);
+				   T1o = Tm * Tq;
+				   To = Tm * Tn;
+				   Tw = Rm[WS(rs, 1)];
+				   Ts = W[2];
+				   Tp = W[19];
+				   Tv = W[3];
+			      }
+			      {
+				   E T12, T15, T13, T1D, T18, T1b, T17, T14, T1a;
+				   {
+					E T1p, Tr, T1r, Tx, T1q, Tu, T11;
+					T12 = Ip[0];
+					T1q = Ts * Tw;
+					Tu = Ts * Tt;
+					T1p = FNMS(Tp, Tn, T1o);
+					Tr = FMA(Tp, Tq, To);
+					T1r = FNMS(Tv, Tt, T1q);
+					Tx = FMA(Tv, Tw, Tu);
+					T15 = Im[0];
+					T11 = W[0];
+					T2f = T1p + T1r;
+					T1s = T1p - T1r;
+					T2s = Tx - Tr;
+					Ty = Tr + Tx;
+					T13 = T11 * T12;
+					T1D = T11 * T15;
+				   }
+				   T18 = Ip[WS(rs, 2)];
+				   T1b = Im[WS(rs, 2)];
+				   T17 = W[8];
+				   T14 = W[1];
+				   T1a = W[9];
+				   {
+					E T3, T6, T4, T1h, T9, Tc, T8, T5, Tb;
+					{
+					     E T1E, T16, T1G, T1c, T1F, T19, T2;
+					     T3 = Rp[WS(rs, 2)];
+					     T1F = T17 * T1b;
+					     T19 = T17 * T18;
+					     T1E = FNMS(T14, T12, T1D);
+					     T16 = FMA(T14, T15, T13);
+					     T1G = FNMS(T1a, T18, T1F);
+					     T1c = FMA(T1a, T1b, T19);
+					     T6 = Rm[WS(rs, 2)];
+					     T2 = W[6];
+					     T1Z = T1E + T1G;
+					     T1H = T1E - T1G;
+					     T21 = T1c - T16;
+					     T1d = T16 + T1c;
+					     T4 = T2 * T3;
+					     T1h = T2 * T6;
+					}
+					T9 = Rp[WS(rs, 4)];
+					Tc = Rm[WS(rs, 4)];
+					T8 = W[14];
+					T5 = W[7];
+					Tb = W[15];
+					{
+					     E T1i, T7, T1k, Td, T1j, Ta, TH;
+					     TI = Ip[WS(rs, 3)];
+					     T1j = T8 * Tc;
+					     Ta = T8 * T9;
+					     T1i = FNMS(T5, T3, T1h);
+					     T7 = FMA(T5, T6, T4);
+					     T1k = FNMS(Tb, T9, T1j);
+					     Td = FMA(Tb, Tc, Ta);
+					     TL = Im[WS(rs, 3)];
+					     TH = W[12];
+					     T2h = T1i + T1k;
+					     T1l = T1i - T1k;
+					     T2p = Td - T7;
+					     Te = T7 + Td;
+					     TJ = TH * TI;
+					     T1w = TH * TL;
+					}
+					TO = Ip[WS(rs, 5)];
+					TR = Im[WS(rs, 5)];
+					TN = W[20];
+					TK = W[13];
+					TQ = W[21];
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1g, T1n, T2r, T1A, T1V, T28, TA, T2o, T1v, T1C, T1U, T29, T2m, T2k, T2l;
+			 E T1f, T2a, T20;
+			 {
+			      E T2g, T1T, TT, T2j, TU, T1e;
+			      {
+				   E Tf, T1x, TM, T1z, TS, Tz, T1y, TP;
+				   T1g = FNMS(KP500000000, Te, T1);
+				   Tf = T1 + Te;
+				   T1y = TN * TR;
+				   TP = TN * TO;
+				   T1x = FNMS(TK, TI, T1w);
+				   TM = FMA(TK, TL, TJ);
+				   T1z = FNMS(TQ, TO, T1y);
+				   TS = FMA(TQ, TR, TP);
+				   Tz = Tl + Ty;
+				   T1n = FNMS(KP500000000, Ty, Tl);
+				   T2r = FNMS(KP500000000, T2f, T2e);
+				   T2g = T2e + T2f;
+				   T1T = T1x + T1z;
+				   T1A = T1x - T1z;
+				   T1V = TS - TM;
+				   TT = TM + TS;
+				   T28 = Tf - Tz;
+				   TA = Tf + Tz;
+				   T2j = T2h + T2i;
+				   T2o = FNMS(KP500000000, T2h, T2i);
+			      }
+			      T1v = FNMS(KP500000000, TT, TG);
+			      TU = TG + TT;
+			      T1e = T10 + T1d;
+			      T1C = FNMS(KP500000000, T1d, T10);
+			      T1U = FNMS(KP500000000, T1T, T1S);
+			      T29 = T1S + T1T;
+			      T2m = T2j - T2g;
+			      T2k = T2g + T2j;
+			      T2l = TU - T1e;
+			      T1f = TU + T1e;
+			      T2a = T1Y + T1Z;
+			      T20 = FNMS(KP500000000, T1Z, T1Y);
+			 }
+			 {
+			      E T1m, T1K, T2z, T2q, T2y, T2t, T1L, T1t, T1B, T1N, T2c, T2b;
+			      Im[WS(rs, 2)] = T2l - T2m;
+			      Ip[WS(rs, 3)] = T2l + T2m;
+			      Rp[0] = TA + T1f;
+			      Rm[WS(rs, 5)] = TA - T1f;
+			      T2c = T29 + T2a;
+			      T2b = T29 - T2a;
+			      T1m = FNMS(KP866025403, T1l, T1g);
+			      T1K = FMA(KP866025403, T1l, T1g);
+			      Ip[0] = T2c + T2k;
+			      Im[WS(rs, 5)] = T2c - T2k;
+			      Rm[WS(rs, 2)] = T28 + T2b;
+			      Rp[WS(rs, 3)] = T28 - T2b;
+			      T2z = FNMS(KP866025403, T2p, T2o);
+			      T2q = FMA(KP866025403, T2p, T2o);
+			      T2y = FNMS(KP866025403, T2s, T2r);
+			      T2t = FMA(KP866025403, T2s, T2r);
+			      T1L = FMA(KP866025403, T1s, T1n);
+			      T1t = FNMS(KP866025403, T1s, T1n);
+			      T1B = FNMS(KP866025403, T1A, T1v);
+			      T1N = FMA(KP866025403, T1A, T1v);
+			      {
+				   E T1Q, T2C, T23, T24, T2B, T27, T2v, T2w;
+				   {
+					E T1u, T25, T26, T1O, T1I, T2A, T2x, T1W, T22, T1M, T1J, T1P;
+					T1Q = T1m - T1t;
+					T1u = T1m + T1t;
+					T25 = FMA(KP866025403, T1V, T1U);
+					T1W = FNMS(KP866025403, T1V, T1U);
+					T26 = FMA(KP866025403, T21, T20);
+					T22 = FNMS(KP866025403, T21, T20);
+					T1O = FMA(KP866025403, T1H, T1C);
+					T1I = FNMS(KP866025403, T1H, T1C);
+					T2A = T2y + T2z;
+					T2C = T2z - T2y;
+					T23 = T1W - T22;
+					T2x = T1W + T22;
+					T1M = T1K + T1L;
+					T24 = T1K - T1L;
+					T2B = T1I - T1B;
+					T1J = T1B + T1I;
+					T1P = T1N + T1O;
+					T2n = T1O - T1N;
+					Ip[WS(rs, 2)] = T2A - T2x;
+					Im[WS(rs, 3)] = -(T2x + T2A);
+					Rm[WS(rs, 3)] = T1u + T1J;
+					Rp[WS(rs, 2)] = T1u - T1J;
+					Rm[WS(rs, 1)] = T1M - T1P;
+					Rp[WS(rs, 4)] = T1M + T1P;
+					T27 = T25 - T26;
+					T2v = T25 + T26;
+					T2w = T2t + T2q;
+					T2u = T2q - T2t;
+				   }
+				   Ip[WS(rs, 4)] = T2v + T2w;
+				   Im[WS(rs, 1)] = T2v - T2w;
+				   Rp[WS(rs, 5)] = T1Q + T23;
+				   Rm[0] = T1Q - T23;
+				   Ip[WS(rs, 5)] = T2B + T2C;
+				   Im[0] = T2B - T2C;
+				   Rp[WS(rs, 1)] = T24 + T27;
+				   Rm[WS(rs, 4)] = T24 - T27;
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 1)] = T2n + T2u;
+	       Im[WS(rs, 4)] = T2n - T2u;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, {72, 22, 46, 0} };
+
+void X(codelet_hc2cf_12) (planner *p) {
+     X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include hc2cf.h */
+
+/*
+ * This function contains 118 FP additions, 60 FP multiplications,
+ * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T1, T1W, T18, T22, Tc, T15, T1V, T23, TR, T1E, T1o, T1D, T12, T1l, T1F;
+	       E T1G, Ti, T1S, T1d, T25, Tt, T1a, T1T, T26, TA, T1y, T1j, T1B, TL, T1g;
+	       E T1z, T1A;
+	       {
+		    E T6, T16, Tb, T17;
+		    T1 = Rp[0];
+		    T1W = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 2)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T16 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = Rp[WS(rs, 4)];
+			 Ta = Rm[WS(rs, 4)];
+			 T7 = W[14];
+			 T9 = W[15];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T17 = FNMS(T9, T8, T7 * Ta);
+		    }
+		    T18 = KP866025403 * (T16 - T17);
+		    T22 = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    T15 = FNMS(KP500000000, Tc, T1);
+		    T1V = T16 + T17;
+		    T23 = FNMS(KP500000000, T1V, T1W);
+	       }
+	       {
+		    E T11, T1n, TW, T1m;
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = Ip[WS(rs, 4)];
+			 TQ = Im[WS(rs, 4)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1E = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TY, T10, TX, TZ;
+			 TY = Ip[WS(rs, 2)];
+			 T10 = Im[WS(rs, 2)];
+			 TX = W[8];
+			 TZ = W[9];
+			 T11 = FMA(TX, TY, TZ * T10);
+			 T1n = FNMS(TZ, TY, TX * T10);
+		    }
+		    {
+			 E TT, TV, TS, TU;
+			 TT = Ip[0];
+			 TV = Im[0];
+			 TS = W[0];
+			 TU = W[1];
+			 TW = FMA(TS, TT, TU * TV);
+			 T1m = FNMS(TU, TT, TS * TV);
+		    }
+		    T1o = KP866025403 * (T1m - T1n);
+		    T1D = KP866025403 * (T11 - TW);
+		    T12 = TW + T11;
+		    T1l = FNMS(KP500000000, T12, TR);
+		    T1F = T1m + T1n;
+		    T1G = FNMS(KP500000000, T1F, T1E);
+	       }
+	       {
+		    E Ts, T1c, Tn, T1b;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = Rp[WS(rs, 3)];
+			 Th = Rm[WS(rs, 3)];
+			 Te = W[10];
+			 Tg = W[11];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T1S = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = Rp[WS(rs, 1)];
+			 Tr = Rm[WS(rs, 1)];
+			 To = W[2];
+			 Tq = W[3];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1c = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = Rp[WS(rs, 5)];
+			 Tm = Rm[WS(rs, 5)];
+			 Tj = W[18];
+			 Tl = W[19];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1b = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    T1d = KP866025403 * (T1b - T1c);
+		    T25 = KP866025403 * (Ts - Tn);
+		    Tt = Tn + Ts;
+		    T1a = FNMS(KP500000000, Tt, Ti);
+		    T1T = T1b + T1c;
+		    T26 = FNMS(KP500000000, T1T, T1S);
+	       }
+	       {
+		    E TK, T1i, TF, T1h;
+		    {
+			 E Tx, Tz, Tw, Ty;
+			 Tx = Ip[WS(rs, 1)];
+			 Tz = Im[WS(rs, 1)];
+			 Tw = W[4];
+			 Ty = W[5];
+			 TA = FMA(Tw, Tx, Ty * Tz);
+			 T1y = FNMS(Ty, Tx, Tw * Tz);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = Ip[WS(rs, 5)];
+			 TJ = Im[WS(rs, 5)];
+			 TG = W[20];
+			 TI = W[21];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T1i = FNMS(TI, TH, TG * TJ);
+		    }
+		    {
+			 E TC, TE, TB, TD;
+			 TC = Ip[WS(rs, 3)];
+			 TE = Im[WS(rs, 3)];
+			 TB = W[12];
+			 TD = W[13];
+			 TF = FMA(TB, TC, TD * TE);
+			 T1h = FNMS(TD, TC, TB * TE);
+		    }
+		    T1j = KP866025403 * (T1h - T1i);
+		    T1B = KP866025403 * (TK - TF);
+		    TL = TF + TK;
+		    T1g = FNMS(KP500000000, TL, TA);
+		    T1z = T1h + T1i;
+		    T1A = FNMS(KP500000000, T1z, T1y);
+	       }
+	       {
+		    E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
+		    {
+			 E Td, Tu, T1U, T1X;
+			 Td = T1 + Tc;
+			 Tu = Ti + Tt;
+			 Tv = Td + Tu;
+			 T1N = Td - Tu;
+			 T1U = T1S + T1T;
+			 T1X = T1V + T1W;
+			 T1Y = T1U + T1X;
+			 T20 = T1X - T1U;
+		    }
+		    {
+			 E TM, T13, T1O, T1P;
+			 TM = TA + TL;
+			 T13 = TR + T12;
+			 T14 = TM + T13;
+			 T1Z = TM - T13;
+			 T1O = T1y + T1z;
+			 T1P = T1E + T1F;
+			 T1Q = T1O - T1P;
+			 T1R = T1O + T1P;
+		    }
+		    Rm[WS(rs, 5)] = Tv - T14;
+		    Im[WS(rs, 5)] = T1R - T1Y;
+		    Rp[0] = Tv + T14;
+		    Ip[0] = T1R + T1Y;
+		    Rp[WS(rs, 3)] = T1N - T1Q;
+		    Ip[WS(rs, 3)] = T1Z + T20;
+		    Rm[WS(rs, 2)] = T1N + T1Q;
+		    Im[WS(rs, 2)] = T1Z - T20;
+	       }
+	       {
+		    E T1t, T1J, T28, T2a, T1w, T21, T1M, T29;
+		    {
+			 E T1r, T1s, T24, T27;
+			 T1r = T15 + T18;
+			 T1s = T1a + T1d;
+			 T1t = T1r + T1s;
+			 T1J = T1r - T1s;
+			 T24 = T22 + T23;
+			 T27 = T25 + T26;
+			 T28 = T24 - T27;
+			 T2a = T27 + T24;
+		    }
+		    {
+			 E T1u, T1v, T1K, T1L;
+			 T1u = T1g + T1j;
+			 T1v = T1l + T1o;
+			 T1w = T1u + T1v;
+			 T21 = T1v - T1u;
+			 T1K = T1B + T1A;
+			 T1L = T1D + T1G;
+			 T1M = T1K - T1L;
+			 T29 = T1K + T1L;
+		    }
+		    Rm[WS(rs, 1)] = T1t - T1w;
+		    Im[WS(rs, 1)] = T29 - T2a;
+		    Rp[WS(rs, 4)] = T1t + T1w;
+		    Ip[WS(rs, 4)] = T29 + T2a;
+		    Rm[WS(rs, 4)] = T1J - T1M;
+		    Im[WS(rs, 4)] = T21 - T28;
+		    Rp[WS(rs, 1)] = T1J + T1M;
+		    Ip[WS(rs, 1)] = T21 + T28;
+	       }
+	       {
+		    E T1f, T1x, T2e, T2g, T1q, T2f, T1I, T2b;
+		    {
+			 E T19, T1e, T2c, T2d;
+			 T19 = T15 - T18;
+			 T1e = T1a - T1d;
+			 T1f = T19 + T1e;
+			 T1x = T19 - T1e;
+			 T2c = T26 - T25;
+			 T2d = T23 - T22;
+			 T2e = T2c + T2d;
+			 T2g = T2d - T2c;
+		    }
+		    {
+			 E T1k, T1p, T1C, T1H;
+			 T1k = T1g - T1j;
+			 T1p = T1l - T1o;
+			 T1q = T1k + T1p;
+			 T2f = T1p - T1k;
+			 T1C = T1A - T1B;
+			 T1H = T1D - T1G;
+			 T1I = T1C + T1H;
+			 T2b = T1H - T1C;
+		    }
+		    Rp[WS(rs, 2)] = T1f - T1q;
+		    Ip[WS(rs, 2)] = T2b + T2e;
+		    Rm[WS(rs, 3)] = T1f + T1q;
+		    Im[WS(rs, 3)] = T2b - T2e;
+		    Rm[0] = T1x - T1I;
+		    Im[0] = T2f - T2g;
+		    Rp[WS(rs, 5)] = T1x + T1I;
+		    Ip[WS(rs, 5)] = T2f + T2g;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, {88, 30, 30, 0} };
+
+void X(codelet_hc2cf_12) (planner *p) {
+     X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:31 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include hc2cf.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 97 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T3G, T3F;
+	       {
+		    E T3z, T3o, T8, T1I, T2p, T35, T2r, T1s, T2w, T36, T2k, T1F, T3k, T1N, T3A;
+		    E Tl, T1T, T2V, T1U, Tz, T29, T30, T2c, T11, TB, TE, T2h, T31, T2a, T1e;
+		    E TC, T1X, TH, TK, TG, TD, TJ;
+		    {
+			 E Ta, Td, Tb, T1J, Tg, Tj, Tf, Tc, Ti;
+			 {
+			      E T1h, T1k, T1n, T2l, T1i, T1q, T1m, T1j, T1p;
+			      {
+				   E T1, T3n, T3, T6, T2, T5;
+				   T1 = Rp[0];
+				   T3n = Rm[0];
+				   T3 = Rp[WS(rs, 4)];
+				   T6 = Rm[WS(rs, 4)];
+				   T2 = W[14];
+				   T5 = W[15];
+				   {
+					E T3l, T4, T1g, T3m, T7;
+					T1h = Ip[WS(rs, 7)];
+					T1k = Im[WS(rs, 7)];
+					T3l = T2 * T6;
+					T4 = T2 * T3;
+					T1g = W[28];
+					T1n = Ip[WS(rs, 3)];
+					T3m = FNMS(T5, T3, T3l);
+					T7 = FMA(T5, T6, T4);
+					T2l = T1g * T1k;
+					T1i = T1g * T1h;
+					T3z = T3n - T3m;
+					T3o = T3m + T3n;
+					T8 = T1 + T7;
+					T1I = T1 - T7;
+					T1q = Im[WS(rs, 3)];
+					T1m = W[12];
+				   }
+				   T1j = W[29];
+				   T1p = W[13];
+			      }
+			      {
+				   E T1u, T1x, T1v, T2s, T1A, T1D, T1z, T1w, T1C;
+				   {
+					E T2m, T1l, T2o, T1r, T2n, T1o, T1t;
+					T1u = Ip[WS(rs, 1)];
+					T2n = T1m * T1q;
+					T1o = T1m * T1n;
+					T2m = FNMS(T1j, T1h, T2l);
+					T1l = FMA(T1j, T1k, T1i);
+					T2o = FNMS(T1p, T1n, T2n);
+					T1r = FMA(T1p, T1q, T1o);
+					T1x = Im[WS(rs, 1)];
+					T1t = W[4];
+					T2p = T2m - T2o;
+					T35 = T2m + T2o;
+					T2r = T1l - T1r;
+					T1s = T1l + T1r;
+					T1v = T1t * T1u;
+					T2s = T1t * T1x;
+				   }
+				   T1A = Ip[WS(rs, 5)];
+				   T1D = Im[WS(rs, 5)];
+				   T1z = W[20];
+				   T1w = W[5];
+				   T1C = W[21];
+				   {
+					E T2t, T1y, T2v, T1E, T2u, T1B, T9;
+					Ta = Rp[WS(rs, 2)];
+					T2u = T1z * T1D;
+					T1B = T1z * T1A;
+					T2t = FNMS(T1w, T1u, T2s);
+					T1y = FMA(T1w, T1x, T1v);
+					T2v = FNMS(T1C, T1A, T2u);
+					T1E = FMA(T1C, T1D, T1B);
+					Td = Rm[WS(rs, 2)];
+					T9 = W[6];
+					T2w = T2t - T2v;
+					T36 = T2t + T2v;
+					T2k = T1E - T1y;
+					T1F = T1y + T1E;
+					Tb = T9 * Ta;
+					T1J = T9 * Td;
+				   }
+				   Tg = Rp[WS(rs, 6)];
+				   Tj = Rm[WS(rs, 6)];
+				   Tf = W[22];
+				   Tc = W[7];
+				   Ti = W[23];
+			      }
+			 }
+			 {
+			      E TQ, TT, TR, T25, TW, TZ, TV, TS, TY;
+			      {
+				   E To, Tr, Tp, T1P, Tu, Tx, Tt, Tq, Tw;
+				   {
+					E T1K, Te, T1M, Tk, T1L, Th, Tn;
+					To = Rp[WS(rs, 1)];
+					T1L = Tf * Tj;
+					Th = Tf * Tg;
+					T1K = FNMS(Tc, Ta, T1J);
+					Te = FMA(Tc, Td, Tb);
+					T1M = FNMS(Ti, Tg, T1L);
+					Tk = FMA(Ti, Tj, Th);
+					Tr = Rm[WS(rs, 1)];
+					Tn = W[2];
+					T3k = T1K + T1M;
+					T1N = T1K - T1M;
+					T3A = Te - Tk;
+					Tl = Te + Tk;
+					Tp = Tn * To;
+					T1P = Tn * Tr;
+				   }
+				   Tu = Rp[WS(rs, 5)];
+				   Tx = Rm[WS(rs, 5)];
+				   Tt = W[18];
+				   Tq = W[3];
+				   Tw = W[19];
+				   {
+					E T1Q, Ts, T1S, Ty, T1R, Tv, TP;
+					TQ = Ip[0];
+					T1R = Tt * Tx;
+					Tv = Tt * Tu;
+					T1Q = FNMS(Tq, To, T1P);
+					Ts = FMA(Tq, Tr, Tp);
+					T1S = FNMS(Tw, Tu, T1R);
+					Ty = FMA(Tw, Tx, Tv);
+					TT = Im[0];
+					TP = W[0];
+					T1T = T1Q - T1S;
+					T2V = T1Q + T1S;
+					T1U = Ts - Ty;
+					Tz = Ts + Ty;
+					TR = TP * TQ;
+					T25 = TP * TT;
+				   }
+				   TW = Ip[WS(rs, 4)];
+				   TZ = Im[WS(rs, 4)];
+				   TV = W[16];
+				   TS = W[1];
+				   TY = W[17];
+			      }
+			      {
+				   E T13, T16, T14, T2d, T19, T1c, T18, T15, T1b;
+				   {
+					E T26, TU, T28, T10, T27, TX, T12;
+					T13 = Ip[WS(rs, 2)];
+					T27 = TV * TZ;
+					TX = TV * TW;
+					T26 = FNMS(TS, TQ, T25);
+					TU = FMA(TS, TT, TR);
+					T28 = FNMS(TY, TW, T27);
+					T10 = FMA(TY, TZ, TX);
+					T16 = Im[WS(rs, 2)];
+					T12 = W[8];
+					T29 = T26 - T28;
+					T30 = T26 + T28;
+					T2c = TU - T10;
+					T11 = TU + T10;
+					T14 = T12 * T13;
+					T2d = T12 * T16;
+				   }
+				   T19 = Ip[WS(rs, 6)];
+				   T1c = Im[WS(rs, 6)];
+				   T18 = W[24];
+				   T15 = W[9];
+				   T1b = W[25];
+				   {
+					E T2e, T17, T2g, T1d, T2f, T1a, TA;
+					TB = Rp[WS(rs, 7)];
+					T2f = T18 * T1c;
+					T1a = T18 * T19;
+					T2e = FNMS(T15, T13, T2d);
+					T17 = FMA(T15, T16, T14);
+					T2g = FNMS(T1b, T19, T2f);
+					T1d = FMA(T1b, T1c, T1a);
+					TE = Rm[WS(rs, 7)];
+					TA = W[26];
+					T2h = T2e - T2g;
+					T31 = T2e + T2g;
+					T2a = T17 - T1d;
+					T1e = T17 + T1d;
+					TC = TA * TB;
+					T1X = TA * TE;
+				   }
+				   TH = Rp[WS(rs, 3)];
+				   TK = Rm[WS(rs, 3)];
+				   TG = W[10];
+				   TD = W[27];
+				   TJ = W[11];
+			      }
+			 }
+		    }
+		    {
+			 E T2U, T3u, T2Z, T21, T1W, T34, T2X, T3f, T32, T3t, T1H, T3q, T3e, TO, T3g;
+			 E T37, T3r, T3s, T3h, T3i;
+			 {
+			      E Tm, T1Y, TF, T20, TL, T3p, T1Z, TI;
+			      T2U = T8 - Tl;
+			      Tm = T8 + Tl;
+			      T1Z = TG * TK;
+			      TI = TG * TH;
+			      T1Y = FNMS(TD, TB, T1X);
+			      TF = FMA(TD, TE, TC);
+			      T20 = FNMS(TJ, TH, T1Z);
+			      TL = FMA(TJ, TK, TI);
+			      T3p = T3k + T3o;
+			      T3u = T3o - T3k;
+			      {
+				   E T1f, TM, T1G, T3j, T2W, TN;
+				   T2Z = T11 - T1e;
+				   T1f = T11 + T1e;
+				   T21 = T1Y - T20;
+				   T2W = T1Y + T20;
+				   T1W = TF - TL;
+				   TM = TF + TL;
+				   T1G = T1s + T1F;
+				   T34 = T1s - T1F;
+				   T2X = T2V - T2W;
+				   T3j = T2V + T2W;
+				   T3f = T30 + T31;
+				   T32 = T30 - T31;
+				   T3t = TM - Tz;
+				   TN = Tz + TM;
+				   T3r = T1G - T1f;
+				   T1H = T1f + T1G;
+				   T3s = T3p - T3j;
+				   T3q = T3j + T3p;
+				   T3e = Tm - TN;
+				   TO = Tm + TN;
+				   T3g = T35 + T36;
+				   T37 = T35 - T36;
+			      }
+			 }
+			 Im[WS(rs, 3)] = T3r - T3s;
+			 Ip[WS(rs, 4)] = T3r + T3s;
+			 Rp[0] = TO + T1H;
+			 Rm[WS(rs, 7)] = TO - T1H;
+			 T3h = T3f - T3g;
+			 T3i = T3f + T3g;
+			 {
+			      E T3a, T2Y, T3x, T3v, T3b, T33;
+			      Ip[0] = T3i + T3q;
+			      Im[WS(rs, 7)] = T3i - T3q;
+			      Rp[WS(rs, 4)] = T3e + T3h;
+			      Rm[WS(rs, 3)] = T3e - T3h;
+			      T3a = T2U - T2X;
+			      T2Y = T2U + T2X;
+			      T3x = T3u - T3t;
+			      T3v = T3t + T3u;
+			      T3b = T32 - T2Z;
+			      T33 = T2Z + T32;
+			      {
+				   E T2E, T1O, T3B, T3H, T2x, T2q, T3C, T23, T2S, T2O, T2K, T2J, T3I, T2H, T2B;
+				   E T2j;
+				   {
+					E T2F, T1V, T22, T2G, T3c, T38;
+					T2E = T1I + T1N;
+					T1O = T1I - T1N;
+					T3B = T3z - T3A;
+					T3H = T3A + T3z;
+					T3c = T34 + T37;
+					T38 = T34 - T37;
+					T2F = T1U + T1T;
+					T1V = T1T - T1U;
+					{
+					     E T3d, T3w, T3y, T39;
+					     T3d = T3b - T3c;
+					     T3w = T3b + T3c;
+					     T3y = T38 - T33;
+					     T39 = T33 + T38;
+					     Rp[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
+					     Rm[WS(rs, 1)] = FNMS(KP707106781, T3d, T3a);
+					     Ip[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
+					     Im[WS(rs, 5)] = FMS(KP707106781, T3w, T3v);
+					     Ip[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
+					     Im[WS(rs, 1)] = FMS(KP707106781, T3y, T3x);
+					     Rp[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
+					     Rm[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
+					     T22 = T1W + T21;
+					     T2G = T1W - T21;
+					}
+					{
+					     E T2M, T2N, T2b, T2i;
+					     T2x = T2r - T2w;
+					     T2M = T2r + T2w;
+					     T2N = T2p + T2k;
+					     T2q = T2k - T2p;
+					     T3C = T1V + T22;
+					     T23 = T1V - T22;
+					     T2S = FMA(KP414213562, T2M, T2N);
+					     T2O = FNMS(KP414213562, T2N, T2M);
+					     T2K = T29 - T2a;
+					     T2b = T29 + T2a;
+					     T2i = T2c - T2h;
+					     T2J = T2c + T2h;
+					     T3I = T2G - T2F;
+					     T2H = T2F + T2G;
+					     T2B = FNMS(KP414213562, T2b, T2i);
+					     T2j = FMA(KP414213562, T2i, T2b);
+					}
+				   }
+				   {
+					E T2R, T2L, T3L, T3M;
+					{
+					     E T2A, T24, T2C, T2y, T3J, T3K, T2D, T2z;
+					     T2A = FNMS(KP707106781, T23, T1O);
+					     T24 = FMA(KP707106781, T23, T1O);
+					     T2R = FNMS(KP414213562, T2J, T2K);
+					     T2L = FMA(KP414213562, T2K, T2J);
+					     T2C = FNMS(KP414213562, T2q, T2x);
+					     T2y = FMA(KP414213562, T2x, T2q);
+					     T3J = FMA(KP707106781, T3I, T3H);
+					     T3L = FNMS(KP707106781, T3I, T3H);
+					     T3K = T2C - T2B;
+					     T2D = T2B + T2C;
+					     T3M = T2y - T2j;
+					     T2z = T2j + T2y;
+					     Ip[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
+					     Im[WS(rs, 4)] = FMS(KP923879532, T3K, T3J);
+					     Rp[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
+					     Rm[WS(rs, 4)] = FNMS(KP923879532, T2z, T24);
+					     Rm[0] = FMA(KP923879532, T2D, T2A);
+					     Rp[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
+					}
+					{
+					     E T2Q, T3D, T3E, T2T, T2I, T2P;
+					     T2Q = FNMS(KP707106781, T2H, T2E);
+					     T2I = FMA(KP707106781, T2H, T2E);
+					     T2P = T2L + T2O;
+					     T3G = T2O - T2L;
+					     T3F = FNMS(KP707106781, T3C, T3B);
+					     T3D = FMA(KP707106781, T3C, T3B);
+					     Ip[WS(rs, 7)] = FMA(KP923879532, T3M, T3L);
+					     Im[0] = FMS(KP923879532, T3M, T3L);
+					     Rp[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
+					     Rm[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
+					     T3E = T2R + T2S;
+					     T2T = T2R - T2S;
+					     Ip[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
+					     Im[WS(rs, 6)] = FMS(KP923879532, T3E, T3D);
+					     Rp[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
+					     Rm[WS(rs, 2)] = FNMS(KP923879532, T2T, T2Q);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
+	       Im[WS(rs, 2)] = FMS(KP923879532, T3G, T3F);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, {104, 30, 70, 0} };
+
+void X(codelet_hc2cf_16) (planner *p) {
+     X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include hc2cf.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 52 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
+	       E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
+	       E T2y, T2z, T1O, T2g, T1T, T2h;
+	       {
+		    E T1, T2T, T6, T2S;
+		    T1 = Rp[0];
+		    T2T = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Rp[WS(rs, 4)];
+			 T5 = Rm[WS(rs, 4)];
+			 T2 = W[14];
+			 T4 = W[15];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T2S = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T37 = T2T - T2S;
+		    T1t = T1 - T6;
+		    T2U = T2S + T2T;
+	       }
+	       {
+		    E Tc, T1u, Th, T1v;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Rp[WS(rs, 2)];
+			 Tb = Rm[WS(rs, 2)];
+			 T8 = W[6];
+			 Ta = W[7];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T1u = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Rp[WS(rs, 6)];
+			 Tg = Rm[WS(rs, 6)];
+			 Td = W[22];
+			 Tf = W[23];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T1v = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T38 = Tc - Th;
+		    T1w = T1u - T1v;
+		    T2R = T1u + T1v;
+	       }
+	       {
+		    E To, T1y, Tt, T1z, T1A, T1B;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = Rp[WS(rs, 1)];
+			 Tn = Rm[WS(rs, 1)];
+			 Tk = W[2];
+			 Tm = W[3];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T1y = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = Rp[WS(rs, 5)];
+			 Ts = Rm[WS(rs, 5)];
+			 Tp = W[18];
+			 Tr = W[19];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T1z = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T2s = T1y + T1z;
+		    T1A = T1y - T1z;
+		    T1B = To - Tt;
+		    T1C = T1A - T1B;
+		    T2c = T1B + T1A;
+	       }
+	       {
+		    E Tz, T1E, TE, T1F, T1D, T1G;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = Rp[WS(rs, 7)];
+			 Ty = Rm[WS(rs, 7)];
+			 Tv = W[26];
+			 Tx = W[27];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T1E = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = Rp[WS(rs, 3)];
+			 TD = Rm[WS(rs, 3)];
+			 TA = W[10];
+			 TC = W[11];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1F = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T2t = T1E + T1F;
+		    T1D = Tz - TE;
+		    T1G = T1E - T1F;
+		    T1H = T1D + T1G;
+		    T2d = T1D - T1G;
+	       }
+	       {
+		    E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = Ip[WS(rs, 7)];
+			 T18 = Im[WS(rs, 7)];
+			 T15 = W[28];
+			 T17 = W[29];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T20 = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = Ip[WS(rs, 5)];
+			 T1o = Im[WS(rs, 5)];
+			 T1l = W[20];
+			 T1n = W[21];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T1X = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = Ip[WS(rs, 3)];
+			 T1d = Im[WS(rs, 3)];
+			 T1a = W[12];
+			 T1c = W[13];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T21 = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = Ip[WS(rs, 1)];
+			 T1j = Im[WS(rs, 1)];
+			 T1g = W[4];
+			 T1i = W[5];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T1W = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    T1f = T19 + T1e;
+		    T1q = T1k + T1p;
+		    T2B = T1f - T1q;
+		    T2C = T20 + T21;
+		    T2D = T1W + T1X;
+		    T2E = T2C - T2D;
+		    {
+			 E T1V, T1Y, T22, T23;
+			 T1V = T19 - T1e;
+			 T1Y = T1W - T1X;
+			 T1Z = T1V - T1Y;
+			 T2j = T1V + T1Y;
+			 T22 = T20 - T21;
+			 T23 = T1k - T1p;
+			 T24 = T22 + T23;
+			 T2k = T22 - T23;
+		    }
+	       }
+	       {
+		    E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = Ip[0];
+			 TL = Im[0];
+			 TI = W[0];
+			 TK = W[1];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T1K = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = Ip[WS(rs, 6)];
+			 T11 = Im[WS(rs, 6)];
+			 TY = W[24];
+			 T10 = W[25];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T1R = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = Ip[WS(rs, 4)];
+			 TQ = Im[WS(rs, 4)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1L = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = Ip[WS(rs, 2)];
+			 TW = Im[WS(rs, 2)];
+			 TT = W[8];
+			 TV = W[9];
+			 TX = FMA(TT, TU, TV * TW);
+			 T1Q = FNMS(TV, TU, TT * TW);
+		    }
+		    TS = TM + TR;
+		    T13 = TX + T12;
+		    T2w = TS - T13;
+		    T2x = T1K + T1L;
+		    T2y = T1Q + T1R;
+		    T2z = T2x - T2y;
+		    {
+			 E T1M, T1N, T1P, T1S;
+			 T1M = T1K - T1L;
+			 T1N = TX - T12;
+			 T1O = T1M + T1N;
+			 T2g = T1M - T1N;
+			 T1P = TM - TR;
+			 T1S = T1Q - T1R;
+			 T1T = T1P - T1S;
+			 T2h = T1P + T1S;
+		    }
+	       }
+	       {
+		    E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
+		    {
+			 E T1x, T1I, T3e, T3f;
+			 T1x = T1t - T1w;
+			 T1I = KP707106781 * (T1C - T1H);
+			 T1J = T1x + T1I;
+			 T27 = T1x - T1I;
+			 T3e = KP707106781 * (T2d - T2c);
+			 T3f = T38 + T37;
+			 T3g = T3e + T3f;
+			 T3i = T3f - T3e;
+		    }
+		    {
+			 E T1U, T25, T28, T29;
+			 T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
+			 T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
+			 T26 = T1U + T25;
+			 T3h = T25 - T1U;
+			 T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
+			 T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
+			 T2a = T28 - T29;
+			 T3d = T28 + T29;
+		    }
+		    Rm[WS(rs, 4)] = T1J - T26;
+		    Im[WS(rs, 4)] = T3d - T3g;
+		    Rp[WS(rs, 3)] = T1J + T26;
+		    Ip[WS(rs, 3)] = T3d + T3g;
+		    Rm[0] = T27 - T2a;
+		    Im[0] = T3h - T3i;
+		    Rp[WS(rs, 7)] = T27 + T2a;
+		    Ip[WS(rs, 7)] = T3h + T3i;
+	       }
+	       {
+		    E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
+		    {
+			 E T2r, T2u, T30, T31;
+			 T2r = T7 - Ti;
+			 T2u = T2s - T2t;
+			 T2v = T2r + T2u;
+			 T2H = T2r - T2u;
+			 T30 = TF - Tu;
+			 T31 = T2U - T2R;
+			 T32 = T30 + T31;
+			 T34 = T31 - T30;
+		    }
+		    {
+			 E T2A, T2F, T2I, T2J;
+			 T2A = T2w + T2z;
+			 T2F = T2B - T2E;
+			 T2G = KP707106781 * (T2A + T2F);
+			 T33 = KP707106781 * (T2F - T2A);
+			 T2I = T2z - T2w;
+			 T2J = T2B + T2E;
+			 T2K = KP707106781 * (T2I - T2J);
+			 T2Z = KP707106781 * (T2I + T2J);
+		    }
+		    Rm[WS(rs, 5)] = T2v - T2G;
+		    Im[WS(rs, 5)] = T2Z - T32;
+		    Rp[WS(rs, 2)] = T2v + T2G;
+		    Ip[WS(rs, 2)] = T2Z + T32;
+		    Rm[WS(rs, 1)] = T2H - T2K;
+		    Im[WS(rs, 1)] = T33 - T34;
+		    Rp[WS(rs, 6)] = T2H + T2K;
+		    Ip[WS(rs, 6)] = T33 + T34;
+	       }
+	       {
+		    E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
+		    {
+			 E T2b, T2e, T36, T39;
+			 T2b = T1t + T1w;
+			 T2e = KP707106781 * (T2c + T2d);
+			 T2f = T2b + T2e;
+			 T2n = T2b - T2e;
+			 T36 = KP707106781 * (T1C + T1H);
+			 T39 = T37 - T38;
+			 T3a = T36 + T39;
+			 T3c = T39 - T36;
+		    }
+		    {
+			 E T2i, T2l, T2o, T2p;
+			 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
+			 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
+			 T2m = T2i + T2l;
+			 T3b = T2l - T2i;
+			 T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
+			 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
+			 T2q = T2o - T2p;
+			 T35 = T2o + T2p;
+		    }
+		    Rm[WS(rs, 6)] = T2f - T2m;
+		    Im[WS(rs, 6)] = T35 - T3a;
+		    Rp[WS(rs, 1)] = T2f + T2m;
+		    Ip[WS(rs, 1)] = T35 + T3a;
+		    Rm[WS(rs, 2)] = T2n - T2q;
+		    Im[WS(rs, 2)] = T3b - T3c;
+		    Rp[WS(rs, 5)] = T2n + T2q;
+		    Ip[WS(rs, 5)] = T3b + T3c;
+	       }
+	       {
+		    E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
+		    {
+			 E Tj, TG, T2Q, T2V;
+			 Tj = T7 + Ti;
+			 TG = Tu + TF;
+			 TH = Tj + TG;
+			 T2L = Tj - TG;
+			 T2Q = T2s + T2t;
+			 T2V = T2R + T2U;
+			 T2W = T2Q + T2V;
+			 T2Y = T2V - T2Q;
+		    }
+		    {
+			 E T14, T1r, T2M, T2N;
+			 T14 = TS + T13;
+			 T1r = T1f + T1q;
+			 T1s = T14 + T1r;
+			 T2X = T1r - T14;
+			 T2M = T2x + T2y;
+			 T2N = T2C + T2D;
+			 T2O = T2M - T2N;
+			 T2P = T2M + T2N;
+		    }
+		    Rm[WS(rs, 7)] = TH - T1s;
+		    Im[WS(rs, 7)] = T2P - T2W;
+		    Rp[0] = TH + T1s;
+		    Ip[0] = T2P + T2W;
+		    Rm[WS(rs, 3)] = T2L - T2O;
+		    Im[WS(rs, 3)] = T2X - T2Y;
+		    Rp[WS(rs, 4)] = T2L + T2O;
+		    Ip[WS(rs, 4)] = T2X + T2Y;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, {136, 46, 38, 0} };
+
+void X(codelet_hc2cf_16) (planner *p) {
+     X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include hc2cf.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T1, Ta, T3, T6, T2, T5;
+	       T1 = Rp[0];
+	       Ta = Rm[0];
+	       T3 = Ip[0];
+	       T6 = Im[0];
+	       T2 = W[0];
+	       T5 = W[1];
+	       {
+		    E T8, T4, T9, T7;
+		    T8 = T2 * T6;
+		    T4 = T2 * T3;
+		    T9 = FNMS(T5, T3, T8);
+		    T7 = FMA(T5, T6, T4);
+		    Ip[0] = T9 + Ta;
+		    Im[0] = T9 - Ta;
+		    Rp[0] = T1 + T7;
+		    Rm[0] = T1 - T7;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hc2cf_2) (planner *p) {
+     X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include hc2cf.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T1, T8, T6, T7;
+	       T1 = Rp[0];
+	       T8 = Rm[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = Ip[0];
+		    T5 = Im[0];
+		    T2 = W[0];
+		    T4 = W[1];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    T7 = FNMS(T4, T3, T2 * T5);
+	       }
+	       Rm[0] = T1 - T6;
+	       Im[0] = T7 - T8;
+	       Rp[0] = T1 + T6;
+	       Ip[0] = T7 + T8;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hc2cf_2) (planner *p) {
+     X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:41 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cf_20 -include hc2cf.h */
+
+/*
+ * This function contains 246 FP additions, 148 FP multiplications,
+ * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
+ * 97 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T4P, T4Y, T50, T4U, T4S, T4T, T4Z, T4V;
+	       {
+		    E T4N, T4r, T8, T2i, T4n, T2n, T4O, Tl, T2v, T3v, T3T, T4f, TN, T2b, T3F;
+		    E T3p, T2R, T3z, T43, T4b, T27, T2f, T3J, T33, T2K, T3y, T40, T4c, T1G, T2e;
+		    E T3I, T3a, T2C, T3w, T3W, T4e, T1e, T2c, T3G, T3i;
+		    {
+			 E T1, T4q, T3, T6, T2, T5;
+			 T1 = Rp[0];
+			 T4q = Rm[0];
+			 T3 = Rp[WS(rs, 5)];
+			 T6 = Rm[WS(rs, 5)];
+			 T2 = W[18];
+			 T5 = W[19];
+			 {
+			      E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T4o, T4, T9, T4p, T7;
+				   Ta = Ip[WS(rs, 2)];
+				   Td = Im[WS(rs, 2)];
+				   T4o = T2 * T6;
+				   T4 = T2 * T3;
+				   T9 = W[8];
+				   Tg = Ip[WS(rs, 7)];
+				   T4p = FNMS(T5, T3, T4o);
+				   T7 = FMA(T5, T6, T4);
+				   T2j = T9 * Td;
+				   Tb = T9 * Ta;
+				   T4N = T4q - T4p;
+				   T4r = T4p + T4q;
+				   T8 = T1 + T7;
+				   T2i = T1 - T7;
+				   Tj = Im[WS(rs, 7)];
+				   Tf = W[28];
+			      }
+			      Tc = W[9];
+			      Ti = W[29];
+			      {
+				   E T3l, Ts, T2t, TL, TB, TE, TD, T3n, Ty, T2q, TC;
+				   {
+					E TH, TK, TJ, T2s, TI;
+					{
+					     E To, Tr, Tp, T3k, Tq, TG;
+					     {
+						  E T2k, Te, T2m, Tk, T2l, Th, Tn;
+						  To = Rp[WS(rs, 2)];
+						  T2l = Tf * Tj;
+						  Th = Tf * Tg;
+						  T2k = FNMS(Tc, Ta, T2j);
+						  Te = FMA(Tc, Td, Tb);
+						  T2m = FNMS(Ti, Tg, T2l);
+						  Tk = FMA(Ti, Tj, Th);
+						  Tr = Rm[WS(rs, 2)];
+						  Tn = W[6];
+						  T4n = T2k + T2m;
+						  T2n = T2k - T2m;
+						  T4O = Te - Tk;
+						  Tl = Te + Tk;
+						  Tp = Tn * To;
+						  T3k = Tn * Tr;
+					     }
+					     Tq = W[7];
+					     TH = Ip[WS(rs, 9)];
+					     TK = Im[WS(rs, 9)];
+					     TG = W[36];
+					     T3l = FNMS(Tq, To, T3k);
+					     Ts = FMA(Tq, Tr, Tp);
+					     TJ = W[37];
+					     T2s = TG * TK;
+					     TI = TG * TH;
+					}
+					{
+					     E Tu, Tx, Tt, Tw, T3m, Tv, TA;
+					     Tu = Rp[WS(rs, 7)];
+					     Tx = Rm[WS(rs, 7)];
+					     T2t = FNMS(TJ, TH, T2s);
+					     TL = FMA(TJ, TK, TI);
+					     Tt = W[26];
+					     Tw = W[27];
+					     TB = Ip[WS(rs, 4)];
+					     TE = Im[WS(rs, 4)];
+					     T3m = Tt * Tx;
+					     Tv = Tt * Tu;
+					     TA = W[16];
+					     TD = W[17];
+					     T3n = FNMS(Tw, Tu, T3m);
+					     Ty = FMA(Tw, Tx, Tv);
+					     T2q = TA * TE;
+					     TC = TA * TB;
+					}
+				   }
+				   {
+					E T3o, T3R, Tz, T2p, T2r, TF;
+					T3o = T3l - T3n;
+					T3R = T3l + T3n;
+					Tz = Ts + Ty;
+					T2p = Ts - Ty;
+					T2r = FNMS(TD, TB, T2q);
+					TF = FMA(TD, TE, TC);
+					{
+					     E T3S, T2u, TM, T3j;
+					     T3S = T2r + T2t;
+					     T2u = T2r - T2t;
+					     TM = TF + TL;
+					     T3j = TL - TF;
+					     T2v = T2p - T2u;
+					     T3v = T2p + T2u;
+					     T3T = T3R + T3S;
+					     T4f = T3S - T3R;
+					     TN = Tz - TM;
+					     T2b = Tz + TM;
+					     T3F = T3o + T3j;
+					     T3p = T3j - T3o;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2Z, T1M, T2P, T25, T1V, T1Y, T1X, T31, T1S, T2M, T1W;
+			 {
+			      E T21, T24, T23, T2O, T22;
+			      {
+				   E T1I, T1L, T1H, T1K, T2Y, T1J, T20;
+				   T1I = Rp[WS(rs, 6)];
+				   T1L = Rm[WS(rs, 6)];
+				   T1H = W[22];
+				   T1K = W[23];
+				   T21 = Ip[WS(rs, 3)];
+				   T24 = Im[WS(rs, 3)];
+				   T2Y = T1H * T1L;
+				   T1J = T1H * T1I;
+				   T20 = W[12];
+				   T23 = W[13];
+				   T2Z = FNMS(T1K, T1I, T2Y);
+				   T1M = FMA(T1K, T1L, T1J);
+				   T2O = T20 * T24;
+				   T22 = T20 * T21;
+			      }
+			      {
+				   E T1O, T1R, T1N, T1Q, T30, T1P, T1U;
+				   T1O = Rp[WS(rs, 1)];
+				   T1R = Rm[WS(rs, 1)];
+				   T2P = FNMS(T23, T21, T2O);
+				   T25 = FMA(T23, T24, T22);
+				   T1N = W[2];
+				   T1Q = W[3];
+				   T1V = Ip[WS(rs, 8)];
+				   T1Y = Im[WS(rs, 8)];
+				   T30 = T1N * T1R;
+				   T1P = T1N * T1O;
+				   T1U = W[32];
+				   T1X = W[33];
+				   T31 = FNMS(T1Q, T1O, T30);
+				   T1S = FMA(T1Q, T1R, T1P);
+				   T2M = T1U * T1Y;
+				   T1W = T1U * T1V;
+			      }
+			 }
+			 {
+			      E T32, T41, T1T, T2L, T2N, T1Z;
+			      T32 = T2Z - T31;
+			      T41 = T2Z + T31;
+			      T1T = T1M + T1S;
+			      T2L = T1M - T1S;
+			      T2N = FNMS(T1X, T1V, T2M);
+			      T1Z = FMA(T1X, T1Y, T1W);
+			      {
+				   E T42, T2Q, T26, T2X;
+				   T42 = T2N + T2P;
+				   T2Q = T2N - T2P;
+				   T26 = T1Z + T25;
+				   T2X = T25 - T1Z;
+				   T2R = T2L - T2Q;
+				   T3z = T2L + T2Q;
+				   T43 = T41 + T42;
+				   T4b = T42 - T41;
+				   T27 = T1T - T26;
+				   T2f = T1T + T26;
+				   T3J = T32 + T2X;
+				   T33 = T2X - T32;
+			      }
+			 }
+		    }
+		    {
+			 E T36, T1l, T2I, T1E, T1u, T1x, T1w, T38, T1r, T2F, T1v;
+			 {
+			      E T1A, T1D, T1C, T2H, T1B;
+			      {
+				   E T1h, T1k, T1g, T1j, T35, T1i, T1z;
+				   T1h = Rp[WS(rs, 4)];
+				   T1k = Rm[WS(rs, 4)];
+				   T1g = W[14];
+				   T1j = W[15];
+				   T1A = Ip[WS(rs, 1)];
+				   T1D = Im[WS(rs, 1)];
+				   T35 = T1g * T1k;
+				   T1i = T1g * T1h;
+				   T1z = W[4];
+				   T1C = W[5];
+				   T36 = FNMS(T1j, T1h, T35);
+				   T1l = FMA(T1j, T1k, T1i);
+				   T2H = T1z * T1D;
+				   T1B = T1z * T1A;
+			      }
+			      {
+				   E T1n, T1q, T1m, T1p, T37, T1o, T1t;
+				   T1n = Rp[WS(rs, 9)];
+				   T1q = Rm[WS(rs, 9)];
+				   T2I = FNMS(T1C, T1A, T2H);
+				   T1E = FMA(T1C, T1D, T1B);
+				   T1m = W[34];
+				   T1p = W[35];
+				   T1u = Ip[WS(rs, 6)];
+				   T1x = Im[WS(rs, 6)];
+				   T37 = T1m * T1q;
+				   T1o = T1m * T1n;
+				   T1t = W[24];
+				   T1w = W[25];
+				   T38 = FNMS(T1p, T1n, T37);
+				   T1r = FMA(T1p, T1q, T1o);
+				   T2F = T1t * T1x;
+				   T1v = T1t * T1u;
+			      }
+			 }
+			 {
+			      E T39, T3Y, T1s, T2E, T2G, T1y;
+			      T39 = T36 - T38;
+			      T3Y = T36 + T38;
+			      T1s = T1l + T1r;
+			      T2E = T1l - T1r;
+			      T2G = FNMS(T1w, T1u, T2F);
+			      T1y = FMA(T1w, T1x, T1v);
+			      {
+				   E T3Z, T2J, T1F, T34;
+				   T3Z = T2G + T2I;
+				   T2J = T2G - T2I;
+				   T1F = T1y + T1E;
+				   T34 = T1E - T1y;
+				   T2K = T2E - T2J;
+				   T3y = T2E + T2J;
+				   T40 = T3Y + T3Z;
+				   T4c = T3Z - T3Y;
+				   T1G = T1s - T1F;
+				   T2e = T1s + T1F;
+				   T3I = T39 + T34;
+				   T3a = T34 - T39;
+			      }
+			 }
+		    }
+		    {
+			 E T3e, TT, T2A, T1c, T12, T15, T14, T3g, TZ, T2x, T13;
+			 {
+			      E T18, T1b, T1a, T2z, T19;
+			      {
+				   E TP, TS, TO, TR, T3d, TQ, T17;
+				   TP = Rp[WS(rs, 8)];
+				   TS = Rm[WS(rs, 8)];
+				   TO = W[30];
+				   TR = W[31];
+				   T18 = Ip[WS(rs, 5)];
+				   T1b = Im[WS(rs, 5)];
+				   T3d = TO * TS;
+				   TQ = TO * TP;
+				   T17 = W[20];
+				   T1a = W[21];
+				   T3e = FNMS(TR, TP, T3d);
+				   TT = FMA(TR, TS, TQ);
+				   T2z = T17 * T1b;
+				   T19 = T17 * T18;
+			      }
+			      {
+				   E TV, TY, TU, TX, T3f, TW, T11;
+				   TV = Rp[WS(rs, 3)];
+				   TY = Rm[WS(rs, 3)];
+				   T2A = FNMS(T1a, T18, T2z);
+				   T1c = FMA(T1a, T1b, T19);
+				   TU = W[10];
+				   TX = W[11];
+				   T12 = Ip[0];
+				   T15 = Im[0];
+				   T3f = TU * TY;
+				   TW = TU * TV;
+				   T11 = W[0];
+				   T14 = W[1];
+				   T3g = FNMS(TX, TV, T3f);
+				   TZ = FMA(TX, TY, TW);
+				   T2x = T11 * T15;
+				   T13 = T11 * T12;
+			      }
+			 }
+			 {
+			      E T3h, T3U, T10, T2w, T2y, T16;
+			      T3h = T3e - T3g;
+			      T3U = T3e + T3g;
+			      T10 = TT + TZ;
+			      T2w = TT - TZ;
+			      T2y = FNMS(T14, T12, T2x);
+			      T16 = FMA(T14, T15, T13);
+			      {
+				   E T3V, T2B, T1d, T3c;
+				   T3V = T2y + T2A;
+				   T2B = T2y - T2A;
+				   T1d = T16 + T1c;
+				   T3c = T1c - T16;
+				   T2C = T2w - T2B;
+				   T3w = T2w + T2B;
+				   T3W = T3U + T3V;
+				   T4e = T3V - T3U;
+				   T1e = T10 - T1d;
+				   T2c = T10 + T1d;
+				   T3G = T3h + T3c;
+				   T3i = T3c - T3h;
+			      }
+			 }
+		    }
+		    {
+			 E T4s, T4k, T4l, T45, T47, T3P, T4y, T4A, T3O;
+			 {
+			      E T4C, T4B, T2a, T4j, T4h, T4E, T4M, T4K, T4i, T4a;
+			      {
+				   E Tm, T1f, T4J, T4I, T28, T4d, T4g, T29, T49, T48;
+				   T4C = T4c + T4b;
+				   T4d = T4b - T4c;
+				   T4g = T4e - T4f;
+				   T4B = T4f + T4e;
+				   T2a = T8 + Tl;
+				   Tm = T8 - Tl;
+				   T1f = TN + T1e;
+				   T4J = T1e - TN;
+				   T4I = T1G - T27;
+				   T28 = T1G + T27;
+				   T4j = FMA(KP618033988, T4d, T4g);
+				   T4h = FNMS(KP618033988, T4g, T4d);
+				   T29 = T1f + T28;
+				   T49 = T1f - T28;
+				   T4E = T4r - T4n;
+				   T4s = T4n + T4r;
+				   Rm[WS(rs, 9)] = Tm + T29;
+				   T48 = FNMS(KP250000000, T29, Tm);
+				   T4M = FNMS(KP618033988, T4I, T4J);
+				   T4K = FMA(KP618033988, T4J, T4I);
+				   T4i = FMA(KP559016994, T49, T48);
+				   T4a = FNMS(KP559016994, T49, T48);
+			      }
+			      {
+				   E T2d, T4w, T4x, T2g, T2h;
+				   {
+					E T3X, T4G, T4F, T44, T4D, T4L, T4H;
+					T4k = T3T + T3W;
+					T3X = T3T - T3W;
+					T4G = T4C - T4B;
+					T4D = T4B + T4C;
+					Rm[WS(rs, 1)] = FMA(KP951056516, T4h, T4a);
+					Rp[WS(rs, 2)] = FNMS(KP951056516, T4h, T4a);
+					Rp[WS(rs, 6)] = FMA(KP951056516, T4j, T4i);
+					Rm[WS(rs, 5)] = FNMS(KP951056516, T4j, T4i);
+					Im[WS(rs, 9)] = T4D - T4E;
+					T4F = FMA(KP250000000, T4D, T4E);
+					T44 = T40 - T43;
+					T4l = T40 + T43;
+					T2d = T2b + T2c;
+					T4w = T2b - T2c;
+					T4L = FMA(KP559016994, T4G, T4F);
+					T4H = FNMS(KP559016994, T4G, T4F);
+					T45 = FMA(KP618033988, T44, T3X);
+					T47 = FNMS(KP618033988, T3X, T44);
+					Ip[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
+					Im[WS(rs, 1)] = FMS(KP951056516, T4K, T4H);
+					Ip[WS(rs, 6)] = FMA(KP951056516, T4M, T4L);
+					Im[WS(rs, 5)] = FMS(KP951056516, T4M, T4L);
+					T4x = T2f - T2e;
+					T2g = T2e + T2f;
+				   }
+				   T2h = T2d + T2g;
+				   T3P = T2d - T2g;
+				   T4y = FNMS(KP618033988, T4x, T4w);
+				   T4A = FMA(KP618033988, T4w, T4x);
+				   Rp[0] = T2a + T2h;
+				   T3O = FNMS(KP250000000, T2h, T2a);
+			      }
+			 }
+			 {
+			      E T3u, T54, T5a, T5c, T56, T53;
+			      {
+				   E T52, T51, T3t, T3r, T2o, T58, T59, T2T, T2V, T4u, T4t, T2U, T3s, T2W;
+				   {
+					E T3b, T3q, T46, T3Q, T4m;
+					T52 = T3a + T33;
+					T3b = T33 - T3a;
+					T3q = T3i - T3p;
+					T51 = T3p + T3i;
+					T46 = FNMS(KP559016994, T3P, T3O);
+					T3Q = FMA(KP559016994, T3P, T3O);
+					T4m = T4k + T4l;
+					T4u = T4k - T4l;
+					Rm[WS(rs, 3)] = FMA(KP951056516, T45, T3Q);
+					Rp[WS(rs, 4)] = FNMS(KP951056516, T45, T3Q);
+					Rp[WS(rs, 8)] = FMA(KP951056516, T47, T46);
+					Rm[WS(rs, 7)] = FNMS(KP951056516, T47, T46);
+					Ip[0] = T4m + T4s;
+					T4t = FNMS(KP250000000, T4m, T4s);
+					T3t = FMA(KP618033988, T3b, T3q);
+					T3r = FNMS(KP618033988, T3q, T3b);
+				   }
+				   T3u = T2i + T2n;
+				   T2o = T2i - T2n;
+				   {
+					E T4v, T4z, T2D, T2S;
+					T4v = FMA(KP559016994, T4u, T4t);
+					T4z = FNMS(KP559016994, T4u, T4t);
+					T2D = T2v + T2C;
+					T58 = T2v - T2C;
+					T59 = T2K - T2R;
+					T2S = T2K + T2R;
+					Ip[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
+					Im[WS(rs, 3)] = FMS(KP951056516, T4y, T4v);
+					Ip[WS(rs, 8)] = FMA(KP951056516, T4A, T4z);
+					Im[WS(rs, 7)] = FMS(KP951056516, T4A, T4z);
+					T2T = T2D + T2S;
+					T2V = T2D - T2S;
+				   }
+				   Rm[WS(rs, 4)] = T2o + T2T;
+				   T2U = FNMS(KP250000000, T2T, T2o);
+				   T54 = T4O + T4N;
+				   T4P = T4N - T4O;
+				   T5a = FMA(KP618033988, T59, T58);
+				   T5c = FNMS(KP618033988, T58, T59);
+				   T3s = FMA(KP559016994, T2V, T2U);
+				   T2W = FNMS(KP559016994, T2V, T2U);
+				   Rp[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
+				   Rp[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
+				   Rm[0] = FNMS(KP951056516, T3t, T3s);
+				   Rm[WS(rs, 8)] = FMA(KP951056516, T3t, T3s);
+				   T56 = T51 - T52;
+				   T53 = T51 + T52;
+			      }
+			      {
+				   E T4Q, T4R, T3N, T3L, T4X, T4W, T3B, T3D, T3H, T3K, T55, T3C, T3M, T3E;
+				   T4Q = T3F + T3G;
+				   T3H = T3F - T3G;
+				   T3K = T3I - T3J;
+				   T4R = T3I + T3J;
+				   Im[WS(rs, 4)] = T53 - T54;
+				   T55 = FMA(KP250000000, T53, T54);
+				   T3N = FNMS(KP618033988, T3H, T3K);
+				   T3L = FMA(KP618033988, T3K, T3H);
+				   {
+					E T57, T5b, T3x, T3A;
+					T57 = FNMS(KP559016994, T56, T55);
+					T5b = FMA(KP559016994, T56, T55);
+					T3x = T3v + T3w;
+					T4X = T3v - T3w;
+					T4W = T3y - T3z;
+					T3A = T3y + T3z;
+					Im[0] = -(FMA(KP951056516, T5a, T57));
+					Im[WS(rs, 8)] = FMS(KP951056516, T5a, T57);
+					Ip[WS(rs, 7)] = FMA(KP951056516, T5c, T5b);
+					Ip[WS(rs, 3)] = FNMS(KP951056516, T5c, T5b);
+					T3B = T3x + T3A;
+					T3D = T3x - T3A;
+				   }
+				   Rp[WS(rs, 5)] = T3u + T3B;
+				   T3C = FNMS(KP250000000, T3B, T3u);
+				   T4Y = FNMS(KP618033988, T4X, T4W);
+				   T50 = FMA(KP618033988, T4W, T4X);
+				   T3M = FNMS(KP559016994, T3D, T3C);
+				   T3E = FMA(KP559016994, T3D, T3C);
+				   Rp[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
+				   Rp[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
+				   Rm[WS(rs, 2)] = FNMS(KP951056516, T3N, T3M);
+				   Rm[WS(rs, 6)] = FMA(KP951056516, T3N, T3M);
+				   T4U = T4Q - T4R;
+				   T4S = T4Q + T4R;
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 5)] = T4S + T4P;
+	       T4T = FNMS(KP250000000, T4S, T4P);
+	       T4Z = FMA(KP559016994, T4U, T4T);
+	       T4V = FNMS(KP559016994, T4U, T4T);
+	       Im[WS(rs, 2)] = -(FMA(KP951056516, T4Y, T4V));
+	       Im[WS(rs, 6)] = FMS(KP951056516, T4Y, T4V);
+	       Ip[WS(rs, 9)] = FMA(KP951056516, T50, T4Z);
+	       Ip[WS(rs, 1)] = FNMS(KP951056516, T50, T4Z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cf_20", twinstr, &GENUS, {136, 38, 110, 0} };
+
+void X(codelet_hc2cf_20) (planner *p) {
+     X(khc2c_register) (p, hc2cf_20, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cf_20 -include hc2cf.h */
+
+/*
+ * This function contains 246 FP additions, 124 FP multiplications,
+ * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
+ * 85 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E Tj, T1R, T4j, T4s, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3p, T3s, T3K, T3A;
+	       E T3B, T3Z, T1V, T1W, T1X, T23, T28, T4q, T2W, T2X, T4f, T33, T34, T35, T2G;
+	       E T2L, T2M, TG, T13, T14, T3i, T3l, T3J, T3D, T3E, T40, T1S, T1T, T1U, T2e;
+	       E T2j, T4p, T2T, T2U, T4e, T30, T31, T32, T2v, T2A, T2B;
+	       {
+		    E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
+		    T1 = Rp[0];
+		    T3O = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Rp[WS(rs, 5)];
+			 T5 = Rm[WS(rs, 5)];
+			 T2 = W[18];
+			 T4 = W[19];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T3N = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Ip[WS(rs, 2)];
+			 Tb = Im[WS(rs, 2)];
+			 T8 = W[8];
+			 Ta = W[9];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T2n = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Ip[WS(rs, 7)];
+			 Tg = Im[WS(rs, 7)];
+			 Td = W[28];
+			 Tf = W[29];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T2o = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T4h, T4i;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 - Ti;
+			 T1R = T7 + Ti;
+			 T4h = T3O - T3N;
+			 T4i = Tc - Th;
+			 T4j = T4h - T4i;
+			 T4s = T4i + T4h;
+		    }
+		    {
+			 E T2m, T2p, T3M, T3P;
+			 T2m = T1 - T6;
+			 T2p = T2n - T2o;
+			 T2q = T2m - T2p;
+			 T37 = T2m + T2p;
+			 T3M = T2n + T2o;
+			 T3P = T3N + T3O;
+			 T3Q = T3M + T3P;
+			 T42 = T3P - T3M;
+		    }
+	       }
+	       {
+		    E T1f, T3n, T21, T2C, T1N, T3r, T27, T2K, T1q, T3o, T22, T2F, T1C, T3q, T26;
+		    E T2H;
+		    {
+			 E T19, T1Z, T1e, T20;
+			 {
+			      E T16, T18, T15, T17;
+			      T16 = Rp[WS(rs, 4)];
+			      T18 = Rm[WS(rs, 4)];
+			      T15 = W[14];
+			      T17 = W[15];
+			      T19 = FMA(T15, T16, T17 * T18);
+			      T1Z = FNMS(T17, T16, T15 * T18);
+			 }
+			 {
+			      E T1b, T1d, T1a, T1c;
+			      T1b = Rp[WS(rs, 9)];
+			      T1d = Rm[WS(rs, 9)];
+			      T1a = W[34];
+			      T1c = W[35];
+			      T1e = FMA(T1a, T1b, T1c * T1d);
+			      T20 = FNMS(T1c, T1b, T1a * T1d);
+			 }
+			 T1f = T19 + T1e;
+			 T3n = T1Z + T20;
+			 T21 = T1Z - T20;
+			 T2C = T19 - T1e;
+		    }
+		    {
+			 E T1H, T2I, T1M, T2J;
+			 {
+			      E T1E, T1G, T1D, T1F;
+			      T1E = Ip[WS(rs, 8)];
+			      T1G = Im[WS(rs, 8)];
+			      T1D = W[32];
+			      T1F = W[33];
+			      T1H = FMA(T1D, T1E, T1F * T1G);
+			      T2I = FNMS(T1F, T1E, T1D * T1G);
+			 }
+			 {
+			      E T1J, T1L, T1I, T1K;
+			      T1J = Ip[WS(rs, 3)];
+			      T1L = Im[WS(rs, 3)];
+			      T1I = W[12];
+			      T1K = W[13];
+			      T1M = FMA(T1I, T1J, T1K * T1L);
+			      T2J = FNMS(T1K, T1J, T1I * T1L);
+			 }
+			 T1N = T1H + T1M;
+			 T3r = T2I + T2J;
+			 T27 = T1H - T1M;
+			 T2K = T2I - T2J;
+		    }
+		    {
+			 E T1k, T2D, T1p, T2E;
+			 {
+			      E T1h, T1j, T1g, T1i;
+			      T1h = Ip[WS(rs, 6)];
+			      T1j = Im[WS(rs, 6)];
+			      T1g = W[24];
+			      T1i = W[25];
+			      T1k = FMA(T1g, T1h, T1i * T1j);
+			      T2D = FNMS(T1i, T1h, T1g * T1j);
+			 }
+			 {
+			      E T1m, T1o, T1l, T1n;
+			      T1m = Ip[WS(rs, 1)];
+			      T1o = Im[WS(rs, 1)];
+			      T1l = W[4];
+			      T1n = W[5];
+			      T1p = FMA(T1l, T1m, T1n * T1o);
+			      T2E = FNMS(T1n, T1m, T1l * T1o);
+			 }
+			 T1q = T1k + T1p;
+			 T3o = T2D + T2E;
+			 T22 = T1k - T1p;
+			 T2F = T2D - T2E;
+		    }
+		    {
+			 E T1w, T24, T1B, T25;
+			 {
+			      E T1t, T1v, T1s, T1u;
+			      T1t = Rp[WS(rs, 6)];
+			      T1v = Rm[WS(rs, 6)];
+			      T1s = W[22];
+			      T1u = W[23];
+			      T1w = FMA(T1s, T1t, T1u * T1v);
+			      T24 = FNMS(T1u, T1t, T1s * T1v);
+			 }
+			 {
+			      E T1y, T1A, T1x, T1z;
+			      T1y = Rp[WS(rs, 1)];
+			      T1A = Rm[WS(rs, 1)];
+			      T1x = W[2];
+			      T1z = W[3];
+			      T1B = FMA(T1x, T1y, T1z * T1A);
+			      T25 = FNMS(T1z, T1y, T1x * T1A);
+			 }
+			 T1C = T1w + T1B;
+			 T3q = T24 + T25;
+			 T26 = T24 - T25;
+			 T2H = T1w - T1B;
+		    }
+		    T1r = T1f - T1q;
+		    T1O = T1C - T1N;
+		    T1P = T1r + T1O;
+		    T3p = T3n + T3o;
+		    T3s = T3q + T3r;
+		    T3K = T3p + T3s;
+		    T3A = T3n - T3o;
+		    T3B = T3r - T3q;
+		    T3Z = T3B - T3A;
+		    T1V = T1f + T1q;
+		    T1W = T1C + T1N;
+		    T1X = T1V + T1W;
+		    T23 = T21 + T22;
+		    T28 = T26 + T27;
+		    T4q = T23 + T28;
+		    T2W = T21 - T22;
+		    T2X = T26 - T27;
+		    T4f = T2W + T2X;
+		    T33 = T2C + T2F;
+		    T34 = T2H + T2K;
+		    T35 = T33 + T34;
+		    T2G = T2C - T2F;
+		    T2L = T2H - T2K;
+		    T2M = T2G + T2L;
+	       }
+	       {
+		    E Tu, T3g, T2c, T2r, T12, T3k, T2f, T2z, TF, T3h, T2d, T2u, TR, T3j, T2i;
+		    E T2w;
+		    {
+			 E To, T2a, Tt, T2b;
+			 {
+			      E Tl, Tn, Tk, Tm;
+			      Tl = Rp[WS(rs, 2)];
+			      Tn = Rm[WS(rs, 2)];
+			      Tk = W[6];
+			      Tm = W[7];
+			      To = FMA(Tk, Tl, Tm * Tn);
+			      T2a = FNMS(Tm, Tl, Tk * Tn);
+			 }
+			 {
+			      E Tq, Ts, Tp, Tr;
+			      Tq = Rp[WS(rs, 7)];
+			      Ts = Rm[WS(rs, 7)];
+			      Tp = W[26];
+			      Tr = W[27];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T2b = FNMS(Tr, Tq, Tp * Ts);
+			 }
+			 Tu = To + Tt;
+			 T3g = T2a + T2b;
+			 T2c = T2a - T2b;
+			 T2r = To - Tt;
+		    }
+		    {
+			 E TW, T2x, T11, T2y;
+			 {
+			      E TT, TV, TS, TU;
+			      TT = Ip[0];
+			      TV = Im[0];
+			      TS = W[0];
+			      TU = W[1];
+			      TW = FMA(TS, TT, TU * TV);
+			      T2x = FNMS(TU, TT, TS * TV);
+			 }
+			 {
+			      E TY, T10, TX, TZ;
+			      TY = Ip[WS(rs, 5)];
+			      T10 = Im[WS(rs, 5)];
+			      TX = W[20];
+			      TZ = W[21];
+			      T11 = FMA(TX, TY, TZ * T10);
+			      T2y = FNMS(TZ, TY, TX * T10);
+			 }
+			 T12 = TW + T11;
+			 T3k = T2x + T2y;
+			 T2f = T11 - TW;
+			 T2z = T2x - T2y;
+		    }
+		    {
+			 E Tz, T2s, TE, T2t;
+			 {
+			      E Tw, Ty, Tv, Tx;
+			      Tw = Ip[WS(rs, 4)];
+			      Ty = Im[WS(rs, 4)];
+			      Tv = W[16];
+			      Tx = W[17];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T2s = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 {
+			      E TB, TD, TA, TC;
+			      TB = Ip[WS(rs, 9)];
+			      TD = Im[WS(rs, 9)];
+			      TA = W[36];
+			      TC = W[37];
+			      TE = FMA(TA, TB, TC * TD);
+			      T2t = FNMS(TC, TB, TA * TD);
+			 }
+			 TF = Tz + TE;
+			 T3h = T2s + T2t;
+			 T2d = Tz - TE;
+			 T2u = T2s - T2t;
+		    }
+		    {
+			 E TL, T2g, TQ, T2h;
+			 {
+			      E TI, TK, TH, TJ;
+			      TI = Rp[WS(rs, 8)];
+			      TK = Rm[WS(rs, 8)];
+			      TH = W[30];
+			      TJ = W[31];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T2g = FNMS(TJ, TI, TH * TK);
+			 }
+			 {
+			      E TN, TP, TM, TO;
+			      TN = Rp[WS(rs, 3)];
+			      TP = Rm[WS(rs, 3)];
+			      TM = W[10];
+			      TO = W[11];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T2h = FNMS(TO, TN, TM * TP);
+			 }
+			 TR = TL + TQ;
+			 T3j = T2g + T2h;
+			 T2i = T2g - T2h;
+			 T2w = TL - TQ;
+		    }
+		    TG = Tu - TF;
+		    T13 = TR - T12;
+		    T14 = TG + T13;
+		    T3i = T3g + T3h;
+		    T3l = T3j + T3k;
+		    T3J = T3i + T3l;
+		    T3D = T3g - T3h;
+		    T3E = T3j - T3k;
+		    T40 = T3D + T3E;
+		    T1S = Tu + TF;
+		    T1T = TR + T12;
+		    T1U = T1S + T1T;
+		    T2e = T2c + T2d;
+		    T2j = T2f - T2i;
+		    T4p = T2j - T2e;
+		    T2T = T2c - T2d;
+		    T2U = T2i + T2f;
+		    T4e = T2T + T2U;
+		    T30 = T2r + T2u;
+		    T31 = T2w + T2z;
+		    T32 = T30 + T31;
+		    T2v = T2r - T2u;
+		    T2A = T2w - T2z;
+		    T2B = T2v + T2A;
+	       }
+	       {
+		    E T3y, T1Q, T3x, T3G, T3I, T3C, T3F, T3H, T3z;
+		    T3y = KP559016994 * (T14 - T1P);
+		    T1Q = T14 + T1P;
+		    T3x = FNMS(KP250000000, T1Q, Tj);
+		    T3C = T3A + T3B;
+		    T3F = T3D - T3E;
+		    T3G = FNMS(KP587785252, T3F, KP951056516 * T3C);
+		    T3I = FMA(KP951056516, T3F, KP587785252 * T3C);
+		    Rm[WS(rs, 9)] = Tj + T1Q;
+		    T3H = T3y + T3x;
+		    Rm[WS(rs, 5)] = T3H - T3I;
+		    Rp[WS(rs, 6)] = T3H + T3I;
+		    T3z = T3x - T3y;
+		    Rp[WS(rs, 2)] = T3z - T3G;
+		    Rm[WS(rs, 1)] = T3z + T3G;
+	       }
+	       {
+		    E T47, T41, T46, T45, T49, T43, T44, T4a, T48;
+		    T47 = KP559016994 * (T40 + T3Z);
+		    T41 = T3Z - T40;
+		    T46 = FMA(KP250000000, T41, T42);
+		    T43 = T13 - TG;
+		    T44 = T1r - T1O;
+		    T45 = FMA(KP587785252, T43, KP951056516 * T44);
+		    T49 = FNMS(KP587785252, T44, KP951056516 * T43);
+		    Im[WS(rs, 9)] = T41 - T42;
+		    T4a = T47 + T46;
+		    Im[WS(rs, 5)] = T49 - T4a;
+		    Ip[WS(rs, 6)] = T49 + T4a;
+		    T48 = T46 - T47;
+		    Im[WS(rs, 1)] = T45 - T48;
+		    Ip[WS(rs, 2)] = T45 + T48;
+	       }
+	       {
+		    E T3d, T1Y, T3e, T3u, T3w, T3m, T3t, T3v, T3f;
+		    T3d = KP559016994 * (T1U - T1X);
+		    T1Y = T1U + T1X;
+		    T3e = FNMS(KP250000000, T1Y, T1R);
+		    T3m = T3i - T3l;
+		    T3t = T3p - T3s;
+		    T3u = FMA(KP951056516, T3m, KP587785252 * T3t);
+		    T3w = FNMS(KP587785252, T3m, KP951056516 * T3t);
+		    Rp[0] = T1R + T1Y;
+		    T3v = T3e - T3d;
+		    Rm[WS(rs, 7)] = T3v - T3w;
+		    Rp[WS(rs, 8)] = T3v + T3w;
+		    T3f = T3d + T3e;
+		    Rp[WS(rs, 4)] = T3f - T3u;
+		    Rm[WS(rs, 3)] = T3f + T3u;
+	       }
+	       {
+		    E T3U, T3L, T3V, T3T, T3X, T3R, T3S, T3Y, T3W;
+		    T3U = KP559016994 * (T3J - T3K);
+		    T3L = T3J + T3K;
+		    T3V = FNMS(KP250000000, T3L, T3Q);
+		    T3R = T1S - T1T;
+		    T3S = T1V - T1W;
+		    T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
+		    T3X = FNMS(KP951056516, T3S, KP587785252 * T3R);
+		    Ip[0] = T3L + T3Q;
+		    T3Y = T3V - T3U;
+		    Im[WS(rs, 7)] = T3X - T3Y;
+		    Ip[WS(rs, 8)] = T3X + T3Y;
+		    T3W = T3U + T3V;
+		    Im[WS(rs, 3)] = T3T - T3W;
+		    Ip[WS(rs, 4)] = T3T + T3W;
+	       }
+	       {
+		    E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
+		    T2P = KP559016994 * (T2B - T2M);
+		    T2N = T2B + T2M;
+		    T2O = FNMS(KP250000000, T2N, T2q);
+		    T29 = T23 - T28;
+		    T2k = T2e + T2j;
+		    T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
+		    T2R = FMA(KP951056516, T2k, KP587785252 * T29);
+		    Rm[WS(rs, 4)] = T2q + T2N;
+		    T2S = T2P + T2O;
+		    Rm[WS(rs, 8)] = T2R + T2S;
+		    Rm[0] = T2S - T2R;
+		    T2Q = T2O - T2P;
+		    Rp[WS(rs, 3)] = T2l + T2Q;
+		    Rp[WS(rs, 7)] = T2Q - T2l;
+	       }
+	       {
+		    E T4w, T4r, T4x, T4v, T4A, T4t, T4u, T4z, T4y;
+		    T4w = KP559016994 * (T4p + T4q);
+		    T4r = T4p - T4q;
+		    T4x = FMA(KP250000000, T4r, T4s);
+		    T4t = T2v - T2A;
+		    T4u = T2G - T2L;
+		    T4v = FMA(KP951056516, T4t, KP587785252 * T4u);
+		    T4A = FNMS(KP587785252, T4t, KP951056516 * T4u);
+		    Im[WS(rs, 4)] = T4r - T4s;
+		    T4z = T4w + T4x;
+		    Ip[WS(rs, 3)] = T4z - T4A;
+		    Ip[WS(rs, 7)] = T4A + T4z;
+		    T4y = T4w - T4x;
+		    Im[WS(rs, 8)] = T4v + T4y;
+		    Im[0] = T4y - T4v;
+	       }
+	       {
+		    E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
+		    T36 = KP559016994 * (T32 - T35);
+		    T38 = T32 + T35;
+		    T39 = FNMS(KP250000000, T38, T37);
+		    T2V = T2T - T2U;
+		    T2Y = T2W - T2X;
+		    T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
+		    T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
+		    Rp[WS(rs, 5)] = T37 + T38;
+		    T3c = T39 - T36;
+		    Rm[WS(rs, 6)] = T3b + T3c;
+		    Rm[WS(rs, 2)] = T3c - T3b;
+		    T3a = T36 + T39;
+		    Rp[WS(rs, 1)] = T2Z + T3a;
+		    Rp[WS(rs, 9)] = T3a - T2Z;
+	       }
+	       {
+		    E T4g, T4k, T4l, T4d, T4o, T4b, T4c, T4n, T4m;
+		    T4g = KP559016994 * (T4e - T4f);
+		    T4k = T4e + T4f;
+		    T4l = FNMS(KP250000000, T4k, T4j);
+		    T4b = T33 - T34;
+		    T4c = T30 - T31;
+		    T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
+		    T4o = FMA(KP951056516, T4c, KP587785252 * T4b);
+		    Ip[WS(rs, 5)] = T4k + T4j;
+		    T4n = T4g + T4l;
+		    Ip[WS(rs, 1)] = T4n - T4o;
+		    Ip[WS(rs, 9)] = T4o + T4n;
+		    T4m = T4g - T4l;
+		    Im[WS(rs, 6)] = T4d + T4m;
+		    Im[WS(rs, 2)] = T4m - T4d;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cf_20", twinstr, &GENUS, {184, 62, 62, 0} };
+
+void X(codelet_hc2cf_20) (planner *p) {
+     X(khc2c_register) (p, hc2cf_20, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1771 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:33 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hc2cf_32 -include hc2cf.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 135 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T90, T8Z;
+	       {
+		    E T8x, T87, T8, T3w, T83, T3B, T8y, Tl, T6F, Tz, T3J, T5T, T6G, TM, T3Q;
+		    E T5U, T3Z, T5Y, T7D, T6L, T5X, T46, T6M, T1f, T4e, T61, T7E, T6R, T6O, T1G;
+		    E T60, T4l, T54, T6c, T79, T7N, T32, T7b, T6f, T5r, T4v, T65, T6X, T7I, T29;
+		    E T70, T68, T4S, T5s, T5b, T7O, T7e, T76, T3t, T5t, T5i, T4H, T2y, T4B, T71;
+		    E T2m, T4w, T4F, T2s;
+		    {
+			 E T3X, T1d, T44, T6J, T11, T3T, T3V, T17, T5h, T5c;
+			 {
+			      E Ta, Td, Tg, T3x, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T1, T86, T3, T6, T2, T5;
+				   T1 = Rp[0];
+				   T86 = Rm[0];
+				   T3 = Rp[WS(rs, 8)];
+				   T6 = Rm[WS(rs, 8)];
+				   T2 = W[30];
+				   T5 = W[31];
+				   {
+					E T84, T4, T9, T85, T7;
+					Ta = Rp[WS(rs, 4)];
+					Td = Rm[WS(rs, 4)];
+					T84 = T2 * T6;
+					T4 = T2 * T3;
+					T9 = W[14];
+					Tg = Rp[WS(rs, 12)];
+					T85 = FNMS(T5, T3, T84);
+					T7 = FMA(T5, T6, T4);
+					T3x = T9 * Td;
+					Tb = T9 * Ta;
+					T8x = T86 - T85;
+					T87 = T85 + T86;
+					T8 = T1 + T7;
+					T3w = T1 - T7;
+					Tj = Rm[WS(rs, 12)];
+					Tf = W[46];
+				   }
+				   Tc = W[15];
+				   Ti = W[47];
+			      }
+			      {
+				   E Tu, Tx, T3F, Ts, Tw, T3G, Tv;
+				   {
+					E To, Tr, Tp, T3E, Tq, Tt;
+					{
+					     E T3y, Te, T3A, Tk, T3z, Th, Tn;
+					     To = Rp[WS(rs, 2)];
+					     T3z = Tf * Tj;
+					     Th = Tf * Tg;
+					     T3y = FNMS(Tc, Ta, T3x);
+					     Te = FMA(Tc, Td, Tb);
+					     T3A = FNMS(Ti, Tg, T3z);
+					     Tk = FMA(Ti, Tj, Th);
+					     Tr = Rm[WS(rs, 2)];
+					     Tn = W[6];
+					     T83 = T3y + T3A;
+					     T3B = T3y - T3A;
+					     T8y = Te - Tk;
+					     Tl = Te + Tk;
+					     Tp = Tn * To;
+					     T3E = Tn * Tr;
+					}
+					Tq = W[7];
+					Tu = Rp[WS(rs, 10)];
+					Tx = Rm[WS(rs, 10)];
+					Tt = W[38];
+					T3F = FNMS(Tq, To, T3E);
+					Ts = FMA(Tq, Tr, Tp);
+					Tw = W[39];
+					T3G = Tt * Tx;
+					Tv = Tt * Tu;
+				   }
+				   {
+					E T3M, TF, TH, TK, TG, TJ, TE, TD, TC;
+					{
+					     E TB, T3H, Ty, TA, T3I, T3D, T3L;
+					     TB = Rp[WS(rs, 14)];
+					     TE = Rm[WS(rs, 14)];
+					     T3H = FNMS(Tw, Tu, T3G);
+					     Ty = FMA(Tw, Tx, Tv);
+					     TA = W[54];
+					     TD = W[55];
+					     T6F = T3F + T3H;
+					     T3I = T3F - T3H;
+					     Tz = Ts + Ty;
+					     T3D = Ts - Ty;
+					     T3L = TA * TE;
+					     TC = TA * TB;
+					     T3J = T3D + T3I;
+					     T5T = T3I - T3D;
+					     T3M = FNMS(TD, TB, T3L);
+					}
+					TF = FMA(TD, TE, TC);
+					TH = Rp[WS(rs, 6)];
+					TK = Rm[WS(rs, 6)];
+					TG = W[22];
+					TJ = W[23];
+					{
+					     E TU, T41, T13, T16, T43, T10, T12, T15, T3U, T14;
+					     {
+						  E T19, T1c, T18, T1b, T3P, T3K;
+						  {
+						       E TQ, TT, T3N, TI, TP, TS;
+						       TQ = Rp[WS(rs, 1)];
+						       TT = Rm[WS(rs, 1)];
+						       T3N = TG * TK;
+						       TI = TG * TH;
+						       TP = W[2];
+						       TS = W[3];
+						       {
+							    E T3O, TL, T40, TR;
+							    T3O = FNMS(TJ, TH, T3N);
+							    TL = FMA(TJ, TK, TI);
+							    T40 = TP * TT;
+							    TR = TP * TQ;
+							    T6G = T3M + T3O;
+							    T3P = T3M - T3O;
+							    TM = TF + TL;
+							    T3K = TF - TL;
+							    TU = FMA(TS, TT, TR);
+							    T41 = FNMS(TS, TQ, T40);
+						       }
+						  }
+						  T3Q = T3K - T3P;
+						  T5U = T3K + T3P;
+						  T19 = Rp[WS(rs, 13)];
+						  T1c = Rm[WS(rs, 13)];
+						  T18 = W[50];
+						  T1b = W[51];
+						  {
+						       E TW, TZ, TY, T42, TX, T3W, T1a, TV;
+						       TW = Rp[WS(rs, 9)];
+						       TZ = Rm[WS(rs, 9)];
+						       T3W = T18 * T1c;
+						       T1a = T18 * T19;
+						       TV = W[34];
+						       TY = W[35];
+						       T3X = FNMS(T1b, T19, T3W);
+						       T1d = FMA(T1b, T1c, T1a);
+						       T42 = TV * TZ;
+						       TX = TV * TW;
+						       T13 = Rp[WS(rs, 5)];
+						       T16 = Rm[WS(rs, 5)];
+						       T43 = FNMS(TY, TW, T42);
+						       T10 = FMA(TY, TZ, TX);
+						       T12 = W[18];
+						       T15 = W[19];
+						  }
+					     }
+					     T44 = T41 - T43;
+					     T6J = T41 + T43;
+					     T11 = TU + T10;
+					     T3T = TU - T10;
+					     T3U = T12 * T16;
+					     T14 = T12 * T13;
+					     T3V = FNMS(T15, T13, T3U);
+					     T17 = FMA(T15, T16, T14);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4g, T1l, T4c, T1E, T1u, T1x, T1w, T4i, T1r, T49, T1v;
+			      {
+				   E T1A, T1D, T1C, T4b, T1B;
+				   {
+					E T1h, T1k, T1g, T1j, T4f, T1i, T1z;
+					T1h = Rp[WS(rs, 15)];
+					T1k = Rm[WS(rs, 15)];
+					{
+					     E T6K, T3Y, T1e, T45;
+					     T6K = T3V + T3X;
+					     T3Y = T3V - T3X;
+					     T1e = T17 + T1d;
+					     T45 = T17 - T1d;
+					     T3Z = T3T + T3Y;
+					     T5Y = T3T - T3Y;
+					     T7D = T6J + T6K;
+					     T6L = T6J - T6K;
+					     T5X = T44 + T45;
+					     T46 = T44 - T45;
+					     T6M = T11 - T1e;
+					     T1f = T11 + T1e;
+					     T1g = W[58];
+					}
+					T1j = W[59];
+					T1A = Rp[WS(rs, 11)];
+					T1D = Rm[WS(rs, 11)];
+					T4f = T1g * T1k;
+					T1i = T1g * T1h;
+					T1z = W[42];
+					T1C = W[43];
+					T4g = FNMS(T1j, T1h, T4f);
+					T1l = FMA(T1j, T1k, T1i);
+					T4b = T1z * T1D;
+					T1B = T1z * T1A;
+				   }
+				   {
+					E T1n, T1q, T1m, T1p, T4h, T1o, T1t;
+					T1n = Rp[WS(rs, 7)];
+					T1q = Rm[WS(rs, 7)];
+					T4c = FNMS(T1C, T1A, T4b);
+					T1E = FMA(T1C, T1D, T1B);
+					T1m = W[26];
+					T1p = W[27];
+					T1u = Rp[WS(rs, 3)];
+					T1x = Rm[WS(rs, 3)];
+					T4h = T1m * T1q;
+					T1o = T1m * T1n;
+					T1t = W[10];
+					T1w = W[11];
+					T4i = FNMS(T1p, T1n, T4h);
+					T1r = FMA(T1p, T1q, T1o);
+					T49 = T1t * T1x;
+					T1v = T1t * T1u;
+				   }
+			      }
+			      {
+				   E T4j, T6P, T1s, T48, T4a, T1y;
+				   T4j = T4g - T4i;
+				   T6P = T4g + T4i;
+				   T1s = T1l + T1r;
+				   T48 = T1l - T1r;
+				   T4a = FNMS(T1w, T1u, T49);
+				   T1y = FMA(T1w, T1x, T1v);
+				   {
+					E T6Q, T4d, T4k, T1F;
+					T6Q = T4a + T4c;
+					T4d = T4a - T4c;
+					T4k = T1y - T1E;
+					T1F = T1y + T1E;
+					T4e = T48 + T4d;
+					T61 = T48 - T4d;
+					T7E = T6P + T6Q;
+					T6R = T6P - T6Q;
+					T6O = T1s - T1F;
+					T1G = T1s + T1F;
+					T60 = T4j + T4k;
+					T4l = T4j - T4k;
+				   }
+			      }
+			 }
+			 {
+			      E T5m, T2H, T52, T30, T2Q, T2T, T2S, T5o, T2N, T4Z, T2R;
+			      {
+				   E T2W, T2Z, T2Y, T51, T2X;
+				   {
+					E T2D, T2G, T2C, T2F, T5l, T2E, T2V;
+					T2D = Ip[WS(rs, 15)];
+					T2G = Im[WS(rs, 15)];
+					T2C = W[60];
+					T2F = W[61];
+					T2W = Ip[WS(rs, 11)];
+					T2Z = Im[WS(rs, 11)];
+					T5l = T2C * T2G;
+					T2E = T2C * T2D;
+					T2V = W[44];
+					T2Y = W[45];
+					T5m = FNMS(T2F, T2D, T5l);
+					T2H = FMA(T2F, T2G, T2E);
+					T51 = T2V * T2Z;
+					T2X = T2V * T2W;
+				   }
+				   {
+					E T2J, T2M, T2I, T2L, T5n, T2K, T2P;
+					T2J = Ip[WS(rs, 7)];
+					T2M = Im[WS(rs, 7)];
+					T52 = FNMS(T2Y, T2W, T51);
+					T30 = FMA(T2Y, T2Z, T2X);
+					T2I = W[28];
+					T2L = W[29];
+					T2Q = Ip[WS(rs, 3)];
+					T2T = Im[WS(rs, 3)];
+					T5n = T2I * T2M;
+					T2K = T2I * T2J;
+					T2P = W[12];
+					T2S = W[13];
+					T5o = FNMS(T2L, T2J, T5n);
+					T2N = FMA(T2L, T2M, T2K);
+					T4Z = T2P * T2T;
+					T2R = T2P * T2Q;
+				   }
+			      }
+			      {
+				   E T5p, T77, T2O, T4Y, T50, T2U;
+				   T5p = T5m - T5o;
+				   T77 = T5m + T5o;
+				   T2O = T2H + T2N;
+				   T4Y = T2H - T2N;
+				   T50 = FNMS(T2S, T2Q, T4Z);
+				   T2U = FMA(T2S, T2T, T2R);
+				   {
+					E T78, T53, T5q, T31;
+					T78 = T50 + T52;
+					T53 = T50 - T52;
+					T5q = T30 - T2U;
+					T31 = T2U + T30;
+					T54 = T4Y + T53;
+					T6c = T4Y - T53;
+					T79 = T77 - T78;
+					T7N = T77 + T78;
+					T32 = T2O + T31;
+					T7b = T2O - T31;
+					T6f = T5q - T5p;
+					T5r = T5p + T5q;
+				   }
+			      }
+			 }
+			 {
+			      E T4N, T1O, T4t, T27, T1X, T20, T1Z, T4P, T1U, T4q, T1Y;
+			      {
+				   E T23, T26, T25, T4s, T24;
+				   {
+					E T1K, T1N, T1J, T1M, T4M, T1L, T22;
+					T1K = Ip[0];
+					T1N = Im[0];
+					T1J = W[0];
+					T1M = W[1];
+					T23 = Ip[WS(rs, 12)];
+					T26 = Im[WS(rs, 12)];
+					T4M = T1J * T1N;
+					T1L = T1J * T1K;
+					T22 = W[48];
+					T25 = W[49];
+					T4N = FNMS(T1M, T1K, T4M);
+					T1O = FMA(T1M, T1N, T1L);
+					T4s = T22 * T26;
+					T24 = T22 * T23;
+				   }
+				   {
+					E T1Q, T1T, T1P, T1S, T4O, T1R, T1W;
+					T1Q = Ip[WS(rs, 8)];
+					T1T = Im[WS(rs, 8)];
+					T4t = FNMS(T25, T23, T4s);
+					T27 = FMA(T25, T26, T24);
+					T1P = W[32];
+					T1S = W[33];
+					T1X = Ip[WS(rs, 4)];
+					T20 = Im[WS(rs, 4)];
+					T4O = T1P * T1T;
+					T1R = T1P * T1Q;
+					T1W = W[16];
+					T1Z = W[17];
+					T4P = FNMS(T1S, T1Q, T4O);
+					T1U = FMA(T1S, T1T, T1R);
+					T4q = T1W * T20;
+					T1Y = T1W * T1X;
+				   }
+			      }
+			      {
+				   E T4Q, T6V, T1V, T4p, T4r, T21;
+				   T4Q = T4N - T4P;
+				   T6V = T4N + T4P;
+				   T1V = T1O + T1U;
+				   T4p = T1O - T1U;
+				   T4r = FNMS(T1Z, T1X, T4q);
+				   T21 = FMA(T1Z, T20, T1Y);
+				   {
+					E T6W, T4u, T4R, T28;
+					T6W = T4r + T4t;
+					T4u = T4r - T4t;
+					T4R = T21 - T27;
+					T28 = T21 + T27;
+					T4v = T4p + T4u;
+					T65 = T4p - T4u;
+					T6X = T6V - T6W;
+					T7I = T6V + T6W;
+					T29 = T1V + T28;
+					T70 = T1V - T28;
+					T68 = T4Q + T4R;
+					T4S = T4Q - T4R;
+				   }
+			      }
+			 }
+			 {
+			      E T57, T38, T5g, T3r, T3h, T3k, T3j, T59, T3e, T5d, T3i;
+			      {
+				   E T3n, T3q, T3p, T5f, T3o;
+				   {
+					E T34, T37, T33, T36, T56, T35, T3m;
+					T34 = Ip[WS(rs, 1)];
+					T37 = Im[WS(rs, 1)];
+					T33 = W[4];
+					T36 = W[5];
+					T3n = Ip[WS(rs, 5)];
+					T3q = Im[WS(rs, 5)];
+					T56 = T33 * T37;
+					T35 = T33 * T34;
+					T3m = W[20];
+					T3p = W[21];
+					T57 = FNMS(T36, T34, T56);
+					T38 = FMA(T36, T37, T35);
+					T5f = T3m * T3q;
+					T3o = T3m * T3n;
+				   }
+				   {
+					E T3a, T3d, T39, T3c, T58, T3b, T3g;
+					T3a = Ip[WS(rs, 9)];
+					T3d = Im[WS(rs, 9)];
+					T5g = FNMS(T3p, T3n, T5f);
+					T3r = FMA(T3p, T3q, T3o);
+					T39 = W[36];
+					T3c = W[37];
+					T3h = Ip[WS(rs, 13)];
+					T3k = Im[WS(rs, 13)];
+					T58 = T39 * T3d;
+					T3b = T39 * T3a;
+					T3g = W[52];
+					T3j = W[53];
+					T59 = FNMS(T3c, T3a, T58);
+					T3e = FMA(T3c, T3d, T3b);
+					T5d = T3g * T3k;
+					T3i = T3g * T3h;
+				   }
+			      }
+			      {
+				   E T5a, T7c, T3f, T55, T5e, T3l, T7d, T3s;
+				   T5a = T57 - T59;
+				   T7c = T57 + T59;
+				   T3f = T38 + T3e;
+				   T55 = T38 - T3e;
+				   T5e = FNMS(T3j, T3h, T5d);
+				   T3l = FMA(T3j, T3k, T3i);
+				   T5h = T5e - T5g;
+				   T7d = T5e + T5g;
+				   T3s = T3l + T3r;
+				   T5c = T3l - T3r;
+				   T5s = T5a - T55;
+				   T5b = T55 + T5a;
+				   T7O = T7c + T7d;
+				   T7e = T7c - T7d;
+				   T76 = T3s - T3f;
+				   T3t = T3f + T3s;
+			      }
+			 }
+			 {
+			      E T4y, T2f, T2o, T2r, T4A, T2l, T2n, T2q, T4E, T2p;
+			      {
+				   E T2u, T2x, T2t, T2w;
+				   {
+					E T2b, T2e, T2d, T4x, T2c, T2a;
+					T2b = Ip[WS(rs, 2)];
+					T2e = Im[WS(rs, 2)];
+					T2a = W[8];
+					T5t = T5c + T5h;
+					T5i = T5c - T5h;
+					T2d = W[9];
+					T4x = T2a * T2e;
+					T2c = T2a * T2b;
+					T2u = Ip[WS(rs, 6)];
+					T2x = Im[WS(rs, 6)];
+					T4y = FNMS(T2d, T2b, T4x);
+					T2f = FMA(T2d, T2e, T2c);
+					T2t = W[24];
+					T2w = W[25];
+				   }
+				   {
+					E T2h, T2k, T2j, T4z, T2i, T4G, T2v, T2g;
+					T2h = Ip[WS(rs, 10)];
+					T2k = Im[WS(rs, 10)];
+					T4G = T2t * T2x;
+					T2v = T2t * T2u;
+					T2g = W[40];
+					T2j = W[41];
+					T4H = FNMS(T2w, T2u, T4G);
+					T2y = FMA(T2w, T2x, T2v);
+					T4z = T2g * T2k;
+					T2i = T2g * T2h;
+					T2o = Ip[WS(rs, 14)];
+					T2r = Im[WS(rs, 14)];
+					T4A = FNMS(T2j, T2h, T4z);
+					T2l = FMA(T2j, T2k, T2i);
+					T2n = W[56];
+					T2q = W[57];
+				   }
+			      }
+			      T4B = T4y - T4A;
+			      T71 = T4y + T4A;
+			      T2m = T2f + T2l;
+			      T4w = T2f - T2l;
+			      T4E = T2n * T2r;
+			      T2p = T2n * T2o;
+			      T4F = FNMS(T2q, T2o, T4E);
+			      T2s = FMA(T2q, T2r, T2p);
+			 }
+		    }
+		    {
+			 E T4T, T4C, T4J, T4U, T7y, T8q, T8p, T7B;
+			 {
+			      E T6E, T8j, T73, T6Y, T6H, T8k, T8i, T8h;
+			      {
+				   E T7C, TO, T80, T7Z, T8e, T89, T8d, T1H, T8b, T3v, T7T, T7L, T7U, T7Q, T2A;
+				   E T7K, T7P, T7W, T1I;
+				   {
+					E T7X, T7Y, T7J, T82, T88;
+					{
+					     E Tm, T4I, T72, T4D, T2z, TN;
+					     T6E = T8 - Tl;
+					     Tm = T8 + Tl;
+					     T4T = T4B - T4w;
+					     T4C = T4w + T4B;
+					     T4I = T4F - T4H;
+					     T72 = T4F + T4H;
+					     T4D = T2s - T2y;
+					     T2z = T2s + T2y;
+					     TN = Tz + TM;
+					     T8j = TM - Tz;
+					     T73 = T71 - T72;
+					     T7J = T71 + T72;
+					     T4J = T4D - T4I;
+					     T4U = T4D + T4I;
+					     T2A = T2m + T2z;
+					     T6Y = T2z - T2m;
+					     T7C = Tm - TN;
+					     TO = Tm + TN;
+					}
+					T7K = T7I - T7J;
+					T7X = T7I + T7J;
+					T7Y = T7N + T7O;
+					T7P = T7N - T7O;
+					T6H = T6F - T6G;
+					T82 = T6F + T6G;
+					T88 = T83 + T87;
+					T8k = T87 - T83;
+					T80 = T7X + T7Y;
+					T7Z = T7X - T7Y;
+					T8e = T88 - T82;
+					T89 = T82 + T88;
+				   }
+				   {
+					E T7H, T7M, T2B, T3u;
+					T7H = T29 - T2A;
+					T2B = T29 + T2A;
+					T3u = T32 + T3t;
+					T7M = T32 - T3t;
+					T8d = T1G - T1f;
+					T1H = T1f + T1G;
+					T8b = T3u - T2B;
+					T3v = T2B + T3u;
+					T7T = T7K - T7H;
+					T7L = T7H + T7K;
+					T7U = T7M + T7P;
+					T7Q = T7M - T7P;
+				   }
+				   T7W = TO - T1H;
+				   T1I = TO + T1H;
+				   {
+					E T7S, T8f, T8g, T7V;
+					{
+					     E T7R, T8c, T8a, T7G, T81, T7F;
+					     T8i = T7Q - T7L;
+					     T7R = T7L + T7Q;
+					     T81 = T7D + T7E;
+					     T7F = T7D - T7E;
+					     Rp[0] = T1I + T3v;
+					     Rm[WS(rs, 15)] = T1I - T3v;
+					     Rp[WS(rs, 8)] = T7W + T7Z;
+					     Rm[WS(rs, 7)] = T7W - T7Z;
+					     T8c = T89 - T81;
+					     T8a = T81 + T89;
+					     T7G = T7C + T7F;
+					     T7S = T7C - T7F;
+					     T8h = T8e - T8d;
+					     T8f = T8d + T8e;
+					     Ip[WS(rs, 8)] = T8b + T8c;
+					     Im[WS(rs, 7)] = T8b - T8c;
+					     Ip[0] = T80 + T8a;
+					     Im[WS(rs, 15)] = T80 - T8a;
+					     Rp[WS(rs, 4)] = FMA(KP707106781, T7R, T7G);
+					     Rm[WS(rs, 11)] = FNMS(KP707106781, T7R, T7G);
+					     T8g = T7T + T7U;
+					     T7V = T7T - T7U;
+					}
+					Ip[WS(rs, 4)] = FMA(KP707106781, T8g, T8f);
+					Im[WS(rs, 11)] = FMS(KP707106781, T8g, T8f);
+					Rp[WS(rs, 12)] = FMA(KP707106781, T7V, T7S);
+					Rm[WS(rs, 3)] = FNMS(KP707106781, T7V, T7S);
+				   }
+			      }
+			      {
+				   E T7f, T7m, T6I, T7a, T7A, T7w, T8r, T8l, T8m, T6T, T7j, T75, T8s, T7p, T7z;
+				   E T7t;
+				   {
+					E T7n, T6N, T6S, T7o, T7u, T7v;
+					T7f = T7b - T7e;
+					T7u = T7b + T7e;
+					Ip[WS(rs, 12)] = FMA(KP707106781, T8i, T8h);
+					Im[WS(rs, 3)] = FMS(KP707106781, T8i, T8h);
+					T7m = T6E + T6H;
+					T6I = T6E - T6H;
+					T7v = T79 + T76;
+					T7a = T76 - T79;
+					T7n = T6M + T6L;
+					T6N = T6L - T6M;
+					T7A = FMA(KP414213562, T7u, T7v);
+					T7w = FNMS(KP414213562, T7v, T7u);
+					T8r = T8k - T8j;
+					T8l = T8j + T8k;
+					T6S = T6O + T6R;
+					T7o = T6O - T6R;
+					{
+					     E T7s, T7r, T6Z, T74;
+					     T7s = T6X + T6Y;
+					     T6Z = T6X - T6Y;
+					     T74 = T70 - T73;
+					     T7r = T70 + T73;
+					     T8m = T6N + T6S;
+					     T6T = T6N - T6S;
+					     T7j = FNMS(KP414213562, T6Z, T74);
+					     T75 = FMA(KP414213562, T74, T6Z);
+					     T8s = T7o - T7n;
+					     T7p = T7n + T7o;
+					     T7z = FNMS(KP414213562, T7r, T7s);
+					     T7t = FMA(KP414213562, T7s, T7r);
+					}
+				   }
+				   {
+					E T7i, T6U, T8t, T8v, T7k, T7g;
+					T7i = FNMS(KP707106781, T6T, T6I);
+					T6U = FMA(KP707106781, T6T, T6I);
+					T8t = FMA(KP707106781, T8s, T8r);
+					T8v = FNMS(KP707106781, T8s, T8r);
+					T7k = FNMS(KP414213562, T7a, T7f);
+					T7g = FMA(KP414213562, T7f, T7a);
+					{
+					     E T7q, T7x, T8n, T8o;
+					     T7y = FNMS(KP707106781, T7p, T7m);
+					     T7q = FMA(KP707106781, T7p, T7m);
+					     {
+						  E T7l, T8u, T8w, T7h;
+						  T7l = T7j + T7k;
+						  T8u = T7k - T7j;
+						  T8w = T7g - T75;
+						  T7h = T75 + T7g;
+						  Rm[WS(rs, 1)] = FMA(KP923879532, T7l, T7i);
+						  Rp[WS(rs, 14)] = FNMS(KP923879532, T7l, T7i);
+						  Ip[WS(rs, 6)] = FMA(KP923879532, T8u, T8t);
+						  Im[WS(rs, 9)] = FMS(KP923879532, T8u, T8t);
+						  Ip[WS(rs, 14)] = FMA(KP923879532, T8w, T8v);
+						  Im[WS(rs, 1)] = FMS(KP923879532, T8w, T8v);
+						  Rp[WS(rs, 6)] = FMA(KP923879532, T7h, T6U);
+						  Rm[WS(rs, 9)] = FNMS(KP923879532, T7h, T6U);
+						  T7x = T7t + T7w;
+						  T8q = T7w - T7t;
+					     }
+					     T8p = FNMS(KP707106781, T8m, T8l);
+					     T8n = FMA(KP707106781, T8m, T8l);
+					     T8o = T7z + T7A;
+					     T7B = T7z - T7A;
+					     Rp[WS(rs, 2)] = FMA(KP923879532, T7x, T7q);
+					     Rm[WS(rs, 13)] = FNMS(KP923879532, T7x, T7q);
+					     Ip[WS(rs, 2)] = FMA(KP923879532, T8o, T8n);
+					     Im[WS(rs, 13)] = FMS(KP923879532, T8o, T8n);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T5S, T8O, T8N, T5V, T6g, T6d, T69, T66, T8K, T8J;
+			      {
+				   E T5C, T3S, T8I, T4n, T8H, T8B, T8C, T5F, T5k, T5K, T5u, T4K, T4V;
+				   {
+					E T5D, T5E, T8z, T8A, T5j;
+					{
+					     E T3C, T3R, T47, T4m;
+					     T5S = T3w - T3B;
+					     T3C = T3w + T3B;
+					     Rp[WS(rs, 10)] = FMA(KP923879532, T7B, T7y);
+					     Rm[WS(rs, 5)] = FNMS(KP923879532, T7B, T7y);
+					     Ip[WS(rs, 10)] = FMA(KP923879532, T8q, T8p);
+					     Im[WS(rs, 5)] = FMS(KP923879532, T8q, T8p);
+					     T3R = T3J + T3Q;
+					     T8O = T3Q - T3J;
+					     T5D = FNMS(KP414213562, T3Z, T46);
+					     T47 = FMA(KP414213562, T46, T3Z);
+					     T4m = FNMS(KP414213562, T4l, T4e);
+					     T5E = FMA(KP414213562, T4e, T4l);
+					     T8N = T8y + T8x;
+					     T8z = T8x - T8y;
+					     T5C = FNMS(KP707106781, T3R, T3C);
+					     T3S = FMA(KP707106781, T3R, T3C);
+					     T8I = T4m - T47;
+					     T4n = T47 + T4m;
+					     T8A = T5T + T5U;
+					     T5V = T5T - T5U;
+					}
+					T6g = T5i - T5b;
+					T5j = T5b + T5i;
+					T8H = FNMS(KP707106781, T8A, T8z);
+					T8B = FMA(KP707106781, T8A, T8z);
+					T8C = T5D + T5E;
+					T5F = T5D - T5E;
+					T5k = FMA(KP707106781, T5j, T54);
+					T5K = FNMS(KP707106781, T5j, T54);
+					T5u = T5s + T5t;
+					T6d = T5t - T5s;
+					T69 = T4C - T4J;
+					T4K = T4C + T4J;
+					T4V = T4T + T4U;
+					T66 = T4U - T4T;
+				   }
+				   {
+					E T5M, T5Q, T5J, T5P, T8F, T8G;
+					{
+					     E T5y, T4o, T5A, T5w, T5z, T4X, T8D, T5L, T5v, T8E, T5B, T5x;
+					     T5y = FNMS(KP923879532, T4n, T3S);
+					     T4o = FMA(KP923879532, T4n, T3S);
+					     T5L = FNMS(KP707106781, T5u, T5r);
+					     T5v = FMA(KP707106781, T5u, T5r);
+					     {
+						  E T5H, T4L, T5I, T4W;
+						  T5H = FNMS(KP707106781, T4K, T4v);
+						  T4L = FMA(KP707106781, T4K, T4v);
+						  T5I = FNMS(KP707106781, T4V, T4S);
+						  T4W = FMA(KP707106781, T4V, T4S);
+						  T5M = FMA(KP668178637, T5L, T5K);
+						  T5Q = FNMS(KP668178637, T5K, T5L);
+						  T5A = FMA(KP198912367, T5k, T5v);
+						  T5w = FNMS(KP198912367, T5v, T5k);
+						  T5J = FNMS(KP668178637, T5I, T5H);
+						  T5P = FMA(KP668178637, T5H, T5I);
+						  T5z = FNMS(KP198912367, T4L, T4W);
+						  T4X = FMA(KP198912367, T4W, T4L);
+					     }
+					     T8D = FMA(KP923879532, T8C, T8B);
+					     T8F = FNMS(KP923879532, T8C, T8B);
+					     T8E = T5z + T5A;
+					     T5B = T5z - T5A;
+					     T8G = T5w - T4X;
+					     T5x = T4X + T5w;
+					     Ip[WS(rs, 1)] = FMA(KP980785280, T8E, T8D);
+					     Im[WS(rs, 14)] = FMS(KP980785280, T8E, T8D);
+					     Rp[WS(rs, 1)] = FMA(KP980785280, T5x, T4o);
+					     Rm[WS(rs, 14)] = FNMS(KP980785280, T5x, T4o);
+					     Rp[WS(rs, 9)] = FMA(KP980785280, T5B, T5y);
+					     Rm[WS(rs, 6)] = FNMS(KP980785280, T5B, T5y);
+					}
+					{
+					     E T5O, T8L, T8M, T5R, T5G, T5N;
+					     T5O = FMA(KP923879532, T5F, T5C);
+					     T5G = FNMS(KP923879532, T5F, T5C);
+					     T5N = T5J + T5M;
+					     T8K = T5M - T5J;
+					     T8J = FMA(KP923879532, T8I, T8H);
+					     T8L = FNMS(KP923879532, T8I, T8H);
+					     Ip[WS(rs, 9)] = FMA(KP980785280, T8G, T8F);
+					     Im[WS(rs, 6)] = FMS(KP980785280, T8G, T8F);
+					     Rm[WS(rs, 2)] = FMA(KP831469612, T5N, T5G);
+					     Rp[WS(rs, 13)] = FNMS(KP831469612, T5N, T5G);
+					     T8M = T5P + T5Q;
+					     T5R = T5P - T5Q;
+					     Ip[WS(rs, 13)] = FNMS(KP831469612, T8M, T8L);
+					     Im[WS(rs, 2)] = -(FMA(KP831469612, T8M, T8L));
+					     Rp[WS(rs, 5)] = FMA(KP831469612, T5R, T5O);
+					     Rm[WS(rs, 10)] = FNMS(KP831469612, T5R, T5O);
+					}
+				   }
+			      }
+			      {
+				   E T6o, T5W, T8W, T63, T8V, T8P, T8Q, T6r, T67, T6u, T6y, T6C, T6m, T6i;
+				   {
+					E T6p, T5Z, T62, T6q;
+					T6p = FNMS(KP414213562, T5X, T5Y);
+					T5Z = FMA(KP414213562, T5Y, T5X);
+					Ip[WS(rs, 5)] = FMA(KP831469612, T8K, T8J);
+					Im[WS(rs, 10)] = FMS(KP831469612, T8K, T8J);
+					T6o = FNMS(KP707106781, T5V, T5S);
+					T5W = FMA(KP707106781, T5V, T5S);
+					T62 = FNMS(KP414213562, T61, T60);
+					T6q = FMA(KP414213562, T60, T61);
+					T8W = T5Z + T62;
+					T63 = T5Z - T62;
+					T8V = FNMS(KP707106781, T8O, T8N);
+					T8P = FMA(KP707106781, T8O, T8N);
+					{
+					     E T6x, T6e, T6w, T6h;
+					     T8Q = T6q - T6p;
+					     T6r = T6p + T6q;
+					     T6x = FMA(KP707106781, T6d, T6c);
+					     T6e = FNMS(KP707106781, T6d, T6c);
+					     T6w = FMA(KP707106781, T6g, T6f);
+					     T6h = FNMS(KP707106781, T6g, T6f);
+					     T67 = FNMS(KP707106781, T66, T65);
+					     T6u = FMA(KP707106781, T66, T65);
+					     T6y = FMA(KP198912367, T6x, T6w);
+					     T6C = FNMS(KP198912367, T6w, T6x);
+					     T6m = FNMS(KP668178637, T6e, T6h);
+					     T6i = FMA(KP668178637, T6h, T6e);
+					}
+				   }
+				   {
+					E T6k, T64, T8R, T8T, T6t, T6a;
+					T6k = FNMS(KP923879532, T63, T5W);
+					T64 = FMA(KP923879532, T63, T5W);
+					T8R = FMA(KP923879532, T8Q, T8P);
+					T8T = FNMS(KP923879532, T8Q, T8P);
+					T6t = FMA(KP707106781, T69, T68);
+					T6a = FNMS(KP707106781, T69, T68);
+					{
+					     E T6A, T8X, T8Y, T6D;
+					     {
+						  E T6s, T6B, T6l, T6b, T6z, T6v;
+						  T6A = FMA(KP923879532, T6r, T6o);
+						  T6s = FNMS(KP923879532, T6r, T6o);
+						  T6v = FMA(KP198912367, T6u, T6t);
+						  T6B = FNMS(KP198912367, T6t, T6u);
+						  T6l = FNMS(KP668178637, T67, T6a);
+						  T6b = FMA(KP668178637, T6a, T67);
+						  T6z = T6v + T6y;
+						  T90 = T6y - T6v;
+						  T8Z = FMA(KP923879532, T8W, T8V);
+						  T8X = FNMS(KP923879532, T8W, T8V);
+						  {
+						       E T6n, T8S, T8U, T6j;
+						       T6n = T6l + T6m;
+						       T8S = T6l - T6m;
+						       T8U = T6i - T6b;
+						       T6j = T6b + T6i;
+						       Rp[WS(rs, 7)] = FMA(KP980785280, T6z, T6s);
+						       Rm[WS(rs, 8)] = FNMS(KP980785280, T6z, T6s);
+						       Rp[WS(rs, 11)] = FMA(KP831469612, T6n, T6k);
+						       Rm[WS(rs, 4)] = FNMS(KP831469612, T6n, T6k);
+						       Ip[WS(rs, 3)] = FMA(KP831469612, T8S, T8R);
+						       Im[WS(rs, 12)] = FMS(KP831469612, T8S, T8R);
+						       Ip[WS(rs, 11)] = FMA(KP831469612, T8U, T8T);
+						       Im[WS(rs, 4)] = FMS(KP831469612, T8U, T8T);
+						       Rp[WS(rs, 3)] = FMA(KP831469612, T6j, T64);
+						       Rm[WS(rs, 12)] = FNMS(KP831469612, T6j, T64);
+						       T8Y = T6C - T6B;
+						       T6D = T6B + T6C;
+						  }
+					     }
+					     Ip[WS(rs, 7)] = FMA(KP980785280, T8Y, T8X);
+					     Im[WS(rs, 8)] = FMS(KP980785280, T8Y, T8X);
+					     Rm[0] = FMA(KP980785280, T6D, T6A);
+					     Rp[WS(rs, 15)] = FNMS(KP980785280, T6D, T6A);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ip[WS(rs, 15)] = FMA(KP980785280, T90, T8Z);
+	       Im[0] = FMS(KP980785280, T90, T8Z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cf_32", twinstr, &GENUS, {236, 62, 198, 0} };
+
+void X(codelet_hc2cf_32) (planner *p) {
+     X(khc2c_register) (p, hc2cf_32, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hc2cf_32 -include hc2cf.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 96 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
+	       E T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
+	       E T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
+	       E T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
+	       E T4m, T5h, T4v, T5e;
+	       {
+		    E T1, T76, T6, T75, Tc, T32, Th, T33;
+		    T1 = Rp[0];
+		    T76 = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Rp[WS(rs, 8)];
+			 T5 = Rm[WS(rs, 8)];
+			 T2 = W[30];
+			 T4 = W[31];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T75 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Rp[WS(rs, 4)];
+			 Tb = Rm[WS(rs, 4)];
+			 T8 = W[14];
+			 Ta = W[15];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T32 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Rp[WS(rs, 12)];
+			 Tg = Rm[WS(rs, 12)];
+			 Td = W[46];
+			 Tf = W[47];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T33 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T7A, T7B;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 + Ti;
+			 T5F = T7 - Ti;
+			 T7A = T76 - T75;
+			 T7B = Tc - Th;
+			 T7C = T7A - T7B;
+			 T7Q = T7B + T7A;
+		    }
+		    {
+			 E T31, T34, T74, T77;
+			 T31 = T1 - T6;
+			 T34 = T32 - T33;
+			 T35 = T31 - T34;
+			 T4T = T31 + T34;
+			 T74 = T32 + T33;
+			 T77 = T75 + T76;
+			 T78 = T74 + T77;
+			 T7m = T77 - T74;
+		    }
+	       }
+	       {
+		    E T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
+		    {
+			 E T1v, T1x, T1u, T1w;
+			 T1v = Ip[0];
+			 T1x = Im[0];
+			 T1u = W[0];
+			 T1w = W[1];
+			 T1y = FMA(T1u, T1v, T1w * T1x);
+			 T3G = FNMS(T1w, T1v, T1u * T1x);
+		    }
+		    {
+			 E T1L, T1N, T1K, T1M;
+			 T1L = Ip[WS(rs, 12)];
+			 T1N = Im[WS(rs, 12)];
+			 T1K = W[48];
+			 T1M = W[49];
+			 T1O = FMA(T1K, T1L, T1M * T1N);
+			 T3Z = FNMS(T1M, T1L, T1K * T1N);
+		    }
+		    {
+			 E T1A, T1C, T1z, T1B;
+			 T1A = Ip[WS(rs, 8)];
+			 T1C = Im[WS(rs, 8)];
+			 T1z = W[32];
+			 T1B = W[33];
+			 T1D = FMA(T1z, T1A, T1B * T1C);
+			 T3H = FNMS(T1B, T1A, T1z * T1C);
+		    }
+		    {
+			 E T1G, T1I, T1F, T1H;
+			 T1G = Ip[WS(rs, 4)];
+			 T1I = Im[WS(rs, 4)];
+			 T1F = W[16];
+			 T1H = W[17];
+			 T1J = FMA(T1F, T1G, T1H * T1I);
+			 T3Y = FNMS(T1H, T1G, T1F * T1I);
+		    }
+		    {
+			 E T1E, T1P, T5W, T5X;
+			 T1E = T1y + T1D;
+			 T1P = T1J + T1O;
+			 T1Q = T1E + T1P;
+			 T61 = T1E - T1P;
+			 T5W = T3G + T3H;
+			 T5X = T3Y + T3Z;
+			 T5Y = T5W - T5X;
+			 T6J = T5W + T5X;
+		    }
+		    {
+			 E T3I, T3J, T3X, T40;
+			 T3I = T3G - T3H;
+			 T3J = T1J - T1O;
+			 T3K = T3I + T3J;
+			 T59 = T3I - T3J;
+			 T3X = T1y - T1D;
+			 T40 = T3Y - T3Z;
+			 T41 = T3X - T40;
+			 T56 = T3X + T40;
+		    }
+	       }
+	       {
+		    E T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = Ip[WS(rs, 15)];
+			 T2i = Im[WS(rs, 15)];
+			 T2f = W[60];
+			 T2h = W[61];
+			 T2j = FMA(T2f, T2g, T2h * T2i);
+			 T4o = FNMS(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2w, T2y, T2v, T2x;
+			 T2w = Ip[WS(rs, 11)];
+			 T2y = Im[WS(rs, 11)];
+			 T2v = W[44];
+			 T2x = W[45];
+			 T2z = FMA(T2v, T2w, T2x * T2y);
+			 T49 = FNMS(T2x, T2w, T2v * T2y);
+		    }
+		    {
+			 E T2l, T2n, T2k, T2m;
+			 T2l = Ip[WS(rs, 7)];
+			 T2n = Im[WS(rs, 7)];
+			 T2k = W[28];
+			 T2m = W[29];
+			 T2o = FMA(T2k, T2l, T2m * T2n);
+			 T4p = FNMS(T2m, T2l, T2k * T2n);
+		    }
+		    {
+			 E T2r, T2t, T2q, T2s;
+			 T2r = Ip[WS(rs, 3)];
+			 T2t = Im[WS(rs, 3)];
+			 T2q = W[12];
+			 T2s = W[13];
+			 T2u = FMA(T2q, T2r, T2s * T2t);
+			 T48 = FNMS(T2s, T2r, T2q * T2t);
+		    }
+		    {
+			 E T2p, T2A, T6c, T6d;
+			 T2p = T2j + T2o;
+			 T2A = T2u + T2z;
+			 T2B = T2p + T2A;
+			 T67 = T2p - T2A;
+			 T6c = T4o + T4p;
+			 T6d = T48 + T49;
+			 T6e = T6c - T6d;
+			 T6O = T6c + T6d;
+		    }
+		    {
+			 E T47, T4a, T4q, T4r;
+			 T47 = T2j - T2o;
+			 T4a = T48 - T49;
+			 T4b = T47 - T4a;
+			 T5d = T47 + T4a;
+			 T4q = T4o - T4p;
+			 T4r = T2u - T2z;
+			 T4s = T4q + T4r;
+			 T5g = T4q - T4r;
+		    }
+	       }
+	       {
+		    E To, T36, TE, T3d, Tt, T37, Tz, T3c;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = Rp[WS(rs, 2)];
+			 Tn = Rm[WS(rs, 2)];
+			 Tk = W[6];
+			 Tm = W[7];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T36 = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = Rp[WS(rs, 6)];
+			 TD = Rm[WS(rs, 6)];
+			 TA = W[22];
+			 TC = W[23];
+			 TE = FMA(TA, TB, TC * TD);
+			 T3d = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = Rp[WS(rs, 10)];
+			 Ts = Rm[WS(rs, 10)];
+			 Tp = W[38];
+			 Tr = W[39];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T37 = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = Rp[WS(rs, 14)];
+			 Ty = Rm[WS(rs, 14)];
+			 Tv = W[54];
+			 Tx = W[55];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T3c = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E Tu, TF, T5G, T5H;
+			 Tu = To + Tt;
+			 TF = Tz + TE;
+			 TG = Tu + TF;
+			 T7l = TF - Tu;
+			 T5G = T36 + T37;
+			 T5H = T3c + T3d;
+			 T5I = T5G - T5H;
+			 T73 = T5G + T5H;
+		    }
+		    {
+			 E T38, T39, T3b, T3e;
+			 T38 = T36 - T37;
+			 T39 = To - Tt;
+			 T3a = T38 - T39;
+			 T4U = T39 + T38;
+			 T3b = Tz - TE;
+			 T3e = T3c - T3d;
+			 T3f = T3b + T3e;
+			 T4V = T3b - T3e;
+		    }
+	       }
+	       {
+		    E TM, T3i, T12, T3p, TR, T3j, TX, T3o;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = Rp[WS(rs, 1)];
+			 TL = Rm[WS(rs, 1)];
+			 TI = W[2];
+			 TK = W[3];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T3i = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = Rp[WS(rs, 13)];
+			 T11 = Rm[WS(rs, 13)];
+			 TY = W[50];
+			 T10 = W[51];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T3p = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = Rp[WS(rs, 9)];
+			 TQ = Rm[WS(rs, 9)];
+			 TN = W[34];
+			 TP = W[35];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T3j = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = Rp[WS(rs, 5)];
+			 TW = Rm[WS(rs, 5)];
+			 TT = W[18];
+			 TV = W[19];
+			 TX = FMA(TT, TU, TV * TW);
+			 T3o = FNMS(TV, TU, TT * TW);
+		    }
+		    {
+			 E TS, T13, T5K, T5L;
+			 TS = TM + TR;
+			 T13 = TX + T12;
+			 T14 = TS + T13;
+			 T5N = TS - T13;
+			 T5K = T3i + T3j;
+			 T5L = T3o + T3p;
+			 T5M = T5K - T5L;
+			 T6E = T5K + T5L;
+		    }
+		    {
+			 E T3k, T3l, T3n, T3q;
+			 T3k = T3i - T3j;
+			 T3l = TX - T12;
+			 T3m = T3k + T3l;
+			 T4Y = T3k - T3l;
+			 T3n = TM - TR;
+			 T3q = T3o - T3p;
+			 T3r = T3n - T3q;
+			 T4Z = T3n + T3q;
+		    }
+	       }
+	       {
+		    E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = Rp[WS(rs, 15)];
+			 T18 = Rm[WS(rs, 15)];
+			 T15 = W[58];
+			 T17 = W[59];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T3t = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = Rp[WS(rs, 11)];
+			 T1o = Rm[WS(rs, 11)];
+			 T1l = W[42];
+			 T1n = W[43];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T3A = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = Rp[WS(rs, 7)];
+			 T1d = Rm[WS(rs, 7)];
+			 T1a = W[26];
+			 T1c = W[27];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T3u = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = Rp[WS(rs, 3)];
+			 T1j = Rm[WS(rs, 3)];
+			 T1g = W[10];
+			 T1i = W[11];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T3z = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    {
+			 E T1f, T1q, T5Q, T5R;
+			 T1f = T19 + T1e;
+			 T1q = T1k + T1p;
+			 T1r = T1f + T1q;
+			 T5P = T1f - T1q;
+			 T5Q = T3t + T3u;
+			 T5R = T3z + T3A;
+			 T5S = T5Q - T5R;
+			 T6F = T5Q + T5R;
+		    }
+		    {
+			 E T3v, T3w, T3y, T3B;
+			 T3v = T3t - T3u;
+			 T3w = T1k - T1p;
+			 T3x = T3v + T3w;
+			 T51 = T3v - T3w;
+			 T3y = T19 - T1e;
+			 T3B = T3z - T3A;
+			 T3C = T3y - T3B;
+			 T52 = T3y + T3B;
+		    }
+	       }
+	       {
+		    E T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
+		    {
+			 E T1S, T1U, T1R, T1T;
+			 T1S = Ip[WS(rs, 2)];
+			 T1U = Im[WS(rs, 2)];
+			 T1R = W[8];
+			 T1T = W[9];
+			 T1V = FMA(T1R, T1S, T1T * T1U);
+			 T3R = FNMS(T1T, T1S, T1R * T1U);
+		    }
+		    {
+			 E T1X, T1Z, T1W, T1Y;
+			 T1X = Ip[WS(rs, 10)];
+			 T1Z = Im[WS(rs, 10)];
+			 T1W = W[40];
+			 T1Y = W[41];
+			 T20 = FMA(T1W, T1X, T1Y * T1Z);
+			 T3S = FNMS(T1Y, T1X, T1W * T1Z);
+		    }
+		    T3Q = T1V - T20;
+		    T3T = T3R - T3S;
+		    {
+			 E T23, T25, T22, T24;
+			 T23 = Ip[WS(rs, 14)];
+			 T25 = Im[WS(rs, 14)];
+			 T22 = W[56];
+			 T24 = W[57];
+			 T26 = FMA(T22, T23, T24 * T25);
+			 T3M = FNMS(T24, T23, T22 * T25);
+		    }
+		    {
+			 E T28, T2a, T27, T29;
+			 T28 = Ip[WS(rs, 6)];
+			 T2a = Im[WS(rs, 6)];
+			 T27 = W[24];
+			 T29 = W[25];
+			 T2b = FMA(T27, T28, T29 * T2a);
+			 T3N = FNMS(T29, T28, T27 * T2a);
+		    }
+		    T3L = T26 - T2b;
+		    T3O = T3M - T3N;
+		    {
+			 E T21, T2c, T62, T63;
+			 T21 = T1V + T20;
+			 T2c = T26 + T2b;
+			 T2d = T21 + T2c;
+			 T5Z = T2c - T21;
+			 T62 = T3R + T3S;
+			 T63 = T3M + T3N;
+			 T64 = T62 - T63;
+			 T6K = T62 + T63;
+		    }
+		    {
+			 E T3P, T3U, T42, T43;
+			 T3P = T3L - T3O;
+			 T3U = T3Q + T3T;
+			 T3V = KP707106781 * (T3P - T3U);
+			 T57 = KP707106781 * (T3U + T3P);
+			 T42 = T3T - T3Q;
+			 T43 = T3L + T3O;
+			 T44 = KP707106781 * (T42 - T43);
+			 T5a = KP707106781 * (T42 + T43);
+		    }
+	       }
+	       {
+		    E T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
+		    {
+			 E T2D, T2F, T2C, T2E;
+			 T2D = Ip[WS(rs, 1)];
+			 T2F = Im[WS(rs, 1)];
+			 T2C = W[4];
+			 T2E = W[5];
+			 T2G = FMA(T2C, T2D, T2E * T2F);
+			 T4c = FNMS(T2E, T2D, T2C * T2F);
+		    }
+		    {
+			 E T2I, T2K, T2H, T2J;
+			 T2I = Ip[WS(rs, 9)];
+			 T2K = Im[WS(rs, 9)];
+			 T2H = W[36];
+			 T2J = W[37];
+			 T2L = FMA(T2H, T2I, T2J * T2K);
+			 T4d = FNMS(T2J, T2I, T2H * T2K);
+		    }
+		    T4e = T4c - T4d;
+		    T4f = T2G - T2L;
+		    {
+			 E T2O, T2Q, T2N, T2P;
+			 T2O = Ip[WS(rs, 13)];
+			 T2Q = Im[WS(rs, 13)];
+			 T2N = W[52];
+			 T2P = W[53];
+			 T2R = FMA(T2N, T2O, T2P * T2Q);
+			 T4i = FNMS(T2P, T2O, T2N * T2Q);
+		    }
+		    {
+			 E T2T, T2V, T2S, T2U;
+			 T2T = Ip[WS(rs, 5)];
+			 T2V = Im[WS(rs, 5)];
+			 T2S = W[20];
+			 T2U = W[21];
+			 T2W = FMA(T2S, T2T, T2U * T2V);
+			 T4j = FNMS(T2U, T2T, T2S * T2V);
+		    }
+		    T4h = T2R - T2W;
+		    T4k = T4i - T4j;
+		    {
+			 E T2M, T2X, T68, T69;
+			 T2M = T2G + T2L;
+			 T2X = T2R + T2W;
+			 T2Y = T2M + T2X;
+			 T6f = T2X - T2M;
+			 T68 = T4c + T4d;
+			 T69 = T4i + T4j;
+			 T6a = T68 - T69;
+			 T6P = T68 + T69;
+		    }
+		    {
+			 E T4g, T4l, T4t, T4u;
+			 T4g = T4e - T4f;
+			 T4l = T4h + T4k;
+			 T4m = KP707106781 * (T4g - T4l);
+			 T5h = KP707106781 * (T4g + T4l);
+			 T4t = T4h - T4k;
+			 T4u = T4f + T4e;
+			 T4v = KP707106781 * (T4t - T4u);
+			 T5e = KP707106781 * (T4u + T4t);
+		    }
+	       }
+	       {
+		    E T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
+		    {
+			 E TH, T1s, T72, T79;
+			 TH = Tj + TG;
+			 T1s = T14 + T1r;
+			 T1t = TH + T1s;
+			 T6X = TH - T1s;
+			 T72 = T6E + T6F;
+			 T79 = T73 + T78;
+			 T7a = T72 + T79;
+			 T7c = T79 - T72;
+		    }
+		    {
+			 E T2e, T2Z, T6Y, T6Z;
+			 T2e = T1Q + T2d;
+			 T2Z = T2B + T2Y;
+			 T30 = T2e + T2Z;
+			 T7b = T2Z - T2e;
+			 T6Y = T6J + T6K;
+			 T6Z = T6O + T6P;
+			 T70 = T6Y - T6Z;
+			 T71 = T6Y + T6Z;
+		    }
+		    Rm[WS(rs, 15)] = T1t - T30;
+		    Im[WS(rs, 15)] = T71 - T7a;
+		    Rp[0] = T1t + T30;
+		    Ip[0] = T71 + T7a;
+		    Rm[WS(rs, 7)] = T6X - T70;
+		    Im[WS(rs, 7)] = T7b - T7c;
+		    Rp[WS(rs, 8)] = T6X + T70;
+		    Ip[WS(rs, 8)] = T7b + T7c;
+	       }
+	       {
+		    E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
+		    {
+			 E T6D, T6G, T7e, T7f;
+			 T6D = Tj - TG;
+			 T6G = T6E - T6F;
+			 T6H = T6D + T6G;
+			 T6T = T6D - T6G;
+			 T7e = T1r - T14;
+			 T7f = T78 - T73;
+			 T7g = T7e + T7f;
+			 T7i = T7f - T7e;
+		    }
+		    {
+			 E T6I, T6L, T6N, T6Q;
+			 T6I = T1Q - T2d;
+			 T6L = T6J - T6K;
+			 T6M = T6I + T6L;
+			 T6U = T6L - T6I;
+			 T6N = T2B - T2Y;
+			 T6Q = T6O - T6P;
+			 T6R = T6N - T6Q;
+			 T6V = T6N + T6Q;
+		    }
+		    {
+			 E T6S, T7d, T6W, T7h;
+			 T6S = KP707106781 * (T6M + T6R);
+			 Rm[WS(rs, 11)] = T6H - T6S;
+			 Rp[WS(rs, 4)] = T6H + T6S;
+			 T7d = KP707106781 * (T6U + T6V);
+			 Im[WS(rs, 11)] = T7d - T7g;
+			 Ip[WS(rs, 4)] = T7d + T7g;
+			 T6W = KP707106781 * (T6U - T6V);
+			 Rm[WS(rs, 3)] = T6T - T6W;
+			 Rp[WS(rs, 12)] = T6T + T6W;
+			 T7h = KP707106781 * (T6R - T6M);
+			 Im[WS(rs, 3)] = T7h - T7i;
+			 Ip[WS(rs, 12)] = T7h + T7i;
+		    }
+	       }
+	       {
+		    E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
+		    E T6l;
+		    {
+			 E T5O, T5T, T60, T65;
+			 T5J = T5F - T5I;
+			 T7n = T7l + T7m;
+			 T7t = T7m - T7l;
+			 T6n = T5F + T5I;
+			 T5O = T5M - T5N;
+			 T5T = T5P + T5S;
+			 T5U = KP707106781 * (T5O - T5T);
+			 T7k = KP707106781 * (T5O + T5T);
+			 {
+			      E T6v, T6w, T6o, T6p;
+			      T6v = T67 + T6a;
+			      T6w = T6e + T6f;
+			      T6x = FNMS(KP382683432, T6w, KP923879532 * T6v);
+			      T6B = FMA(KP923879532, T6w, KP382683432 * T6v);
+			      T6o = T5N + T5M;
+			      T6p = T5P - T5S;
+			      T6q = KP707106781 * (T6o + T6p);
+			      T7s = KP707106781 * (T6p - T6o);
+			 }
+			 T60 = T5Y - T5Z;
+			 T65 = T61 - T64;
+			 T66 = FMA(KP923879532, T60, KP382683432 * T65);
+			 T6k = FNMS(KP923879532, T65, KP382683432 * T60);
+			 {
+			      E T6s, T6t, T6b, T6g;
+			      T6s = T5Y + T5Z;
+			      T6t = T61 + T64;
+			      T6u = FMA(KP382683432, T6s, KP923879532 * T6t);
+			      T6A = FNMS(KP382683432, T6t, KP923879532 * T6s);
+			      T6b = T67 - T6a;
+			      T6g = T6e - T6f;
+			      T6h = FNMS(KP923879532, T6g, KP382683432 * T6b);
+			      T6l = FMA(KP382683432, T6g, KP923879532 * T6b);
+			 }
+		    }
+		    {
+			 E T5V, T6i, T7r, T7u;
+			 T5V = T5J + T5U;
+			 T6i = T66 + T6h;
+			 Rm[WS(rs, 9)] = T5V - T6i;
+			 Rp[WS(rs, 6)] = T5V + T6i;
+			 T7r = T6k + T6l;
+			 T7u = T7s + T7t;
+			 Im[WS(rs, 9)] = T7r - T7u;
+			 Ip[WS(rs, 6)] = T7r + T7u;
+		    }
+		    {
+			 E T6j, T6m, T7v, T7w;
+			 T6j = T5J - T5U;
+			 T6m = T6k - T6l;
+			 Rm[WS(rs, 1)] = T6j - T6m;
+			 Rp[WS(rs, 14)] = T6j + T6m;
+			 T7v = T6h - T66;
+			 T7w = T7t - T7s;
+			 Im[WS(rs, 1)] = T7v - T7w;
+			 Ip[WS(rs, 14)] = T7v + T7w;
+		    }
+		    {
+			 E T6r, T6y, T7j, T7o;
+			 T6r = T6n + T6q;
+			 T6y = T6u + T6x;
+			 Rm[WS(rs, 13)] = T6r - T6y;
+			 Rp[WS(rs, 2)] = T6r + T6y;
+			 T7j = T6A + T6B;
+			 T7o = T7k + T7n;
+			 Im[WS(rs, 13)] = T7j - T7o;
+			 Ip[WS(rs, 2)] = T7j + T7o;
+		    }
+		    {
+			 E T6z, T6C, T7p, T7q;
+			 T6z = T6n - T6q;
+			 T6C = T6A - T6B;
+			 Rm[WS(rs, 5)] = T6z - T6C;
+			 Rp[WS(rs, 10)] = T6z + T6C;
+			 T7p = T6x - T6u;
+			 T7q = T7n - T7k;
+			 Im[WS(rs, 5)] = T7p - T7q;
+			 Ip[WS(rs, 10)] = T7p + T7q;
+		    }
+	       }
+	       {
+		    E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
+		    E T4B, T3g, T7P;
+		    T3g = KP707106781 * (T3a - T3f);
+		    T3h = T35 - T3g;
+		    T4D = T35 + T3g;
+		    T7P = KP707106781 * (T4V - T4U);
+		    T7R = T7P + T7Q;
+		    T7X = T7Q - T7P;
+		    {
+			 E T3s, T3D, T4L, T4M;
+			 T3s = FNMS(KP923879532, T3r, KP382683432 * T3m);
+			 T3D = FMA(KP382683432, T3x, KP923879532 * T3C);
+			 T3E = T3s - T3D;
+			 T7O = T3s + T3D;
+			 T4L = T4b + T4m;
+			 T4M = T4s + T4v;
+			 T4N = FNMS(KP555570233, T4M, KP831469612 * T4L);
+			 T4R = FMA(KP831469612, T4M, KP555570233 * T4L);
+		    }
+		    {
+			 E T3W, T45, T4E, T4F;
+			 T3W = T3K - T3V;
+			 T45 = T41 - T44;
+			 T46 = FMA(KP980785280, T3W, KP195090322 * T45);
+			 T4A = FNMS(KP980785280, T45, KP195090322 * T3W);
+			 T4E = FMA(KP923879532, T3m, KP382683432 * T3r);
+			 T4F = FNMS(KP923879532, T3x, KP382683432 * T3C);
+			 T4G = T4E + T4F;
+			 T7W = T4F - T4E;
+		    }
+		    {
+			 E T4I, T4J, T4n, T4w;
+			 T4I = T3K + T3V;
+			 T4J = T41 + T44;
+			 T4K = FMA(KP555570233, T4I, KP831469612 * T4J);
+			 T4Q = FNMS(KP555570233, T4J, KP831469612 * T4I);
+			 T4n = T4b - T4m;
+			 T4w = T4s - T4v;
+			 T4x = FNMS(KP980785280, T4w, KP195090322 * T4n);
+			 T4B = FMA(KP195090322, T4w, KP980785280 * T4n);
+		    }
+		    {
+			 E T3F, T4y, T7V, T7Y;
+			 T3F = T3h + T3E;
+			 T4y = T46 + T4x;
+			 Rm[WS(rs, 8)] = T3F - T4y;
+			 Rp[WS(rs, 7)] = T3F + T4y;
+			 T7V = T4A + T4B;
+			 T7Y = T7W + T7X;
+			 Im[WS(rs, 8)] = T7V - T7Y;
+			 Ip[WS(rs, 7)] = T7V + T7Y;
+		    }
+		    {
+			 E T4z, T4C, T7Z, T80;
+			 T4z = T3h - T3E;
+			 T4C = T4A - T4B;
+			 Rm[0] = T4z - T4C;
+			 Rp[WS(rs, 15)] = T4z + T4C;
+			 T7Z = T4x - T46;
+			 T80 = T7X - T7W;
+			 Im[0] = T7Z - T80;
+			 Ip[WS(rs, 15)] = T7Z + T80;
+		    }
+		    {
+			 E T4H, T4O, T7N, T7S;
+			 T4H = T4D + T4G;
+			 T4O = T4K + T4N;
+			 Rm[WS(rs, 12)] = T4H - T4O;
+			 Rp[WS(rs, 3)] = T4H + T4O;
+			 T7N = T4Q + T4R;
+			 T7S = T7O + T7R;
+			 Im[WS(rs, 12)] = T7N - T7S;
+			 Ip[WS(rs, 3)] = T7N + T7S;
+		    }
+		    {
+			 E T4P, T4S, T7T, T7U;
+			 T4P = T4D - T4G;
+			 T4S = T4Q - T4R;
+			 Rm[WS(rs, 4)] = T4P - T4S;
+			 Rp[WS(rs, 11)] = T4P + T4S;
+			 T7T = T4N - T4K;
+			 T7U = T7R - T7O;
+			 Im[WS(rs, 4)] = T7T - T7U;
+			 Ip[WS(rs, 11)] = T7T + T7U;
+		    }
+	       }
+	       {
+		    E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
+		    E T5n, T4W, T7z;
+		    T4W = KP707106781 * (T4U + T4V);
+		    T4X = T4T - T4W;
+		    T5p = T4T + T4W;
+		    T7z = KP707106781 * (T3a + T3f);
+		    T7D = T7z + T7C;
+		    T7J = T7C - T7z;
+		    {
+			 E T50, T53, T5x, T5y;
+			 T50 = FNMS(KP382683432, T4Z, KP923879532 * T4Y);
+			 T53 = FMA(KP923879532, T51, KP382683432 * T52);
+			 T54 = T50 - T53;
+			 T7y = T50 + T53;
+			 T5x = T5d + T5e;
+			 T5y = T5g + T5h;
+			 T5z = FNMS(KP195090322, T5y, KP980785280 * T5x);
+			 T5D = FMA(KP195090322, T5x, KP980785280 * T5y);
+		    }
+		    {
+			 E T58, T5b, T5q, T5r;
+			 T58 = T56 - T57;
+			 T5b = T59 - T5a;
+			 T5c = FMA(KP555570233, T58, KP831469612 * T5b);
+			 T5m = FNMS(KP831469612, T58, KP555570233 * T5b);
+			 T5q = FMA(KP382683432, T4Y, KP923879532 * T4Z);
+			 T5r = FNMS(KP382683432, T51, KP923879532 * T52);
+			 T5s = T5q + T5r;
+			 T7I = T5r - T5q;
+		    }
+		    {
+			 E T5u, T5v, T5f, T5i;
+			 T5u = T56 + T57;
+			 T5v = T59 + T5a;
+			 T5w = FMA(KP980785280, T5u, KP195090322 * T5v);
+			 T5C = FNMS(KP195090322, T5u, KP980785280 * T5v);
+			 T5f = T5d - T5e;
+			 T5i = T5g - T5h;
+			 T5j = FNMS(KP831469612, T5i, KP555570233 * T5f);
+			 T5n = FMA(KP831469612, T5f, KP555570233 * T5i);
+		    }
+		    {
+			 E T55, T5k, T7H, T7K;
+			 T55 = T4X + T54;
+			 T5k = T5c + T5j;
+			 Rm[WS(rs, 10)] = T55 - T5k;
+			 Rp[WS(rs, 5)] = T55 + T5k;
+			 T7H = T5m + T5n;
+			 T7K = T7I + T7J;
+			 Im[WS(rs, 10)] = T7H - T7K;
+			 Ip[WS(rs, 5)] = T7H + T7K;
+		    }
+		    {
+			 E T5l, T5o, T7L, T7M;
+			 T5l = T4X - T54;
+			 T5o = T5m - T5n;
+			 Rm[WS(rs, 2)] = T5l - T5o;
+			 Rp[WS(rs, 13)] = T5l + T5o;
+			 T7L = T5j - T5c;
+			 T7M = T7J - T7I;
+			 Im[WS(rs, 2)] = T7L - T7M;
+			 Ip[WS(rs, 13)] = T7L + T7M;
+		    }
+		    {
+			 E T5t, T5A, T7x, T7E;
+			 T5t = T5p + T5s;
+			 T5A = T5w + T5z;
+			 Rm[WS(rs, 14)] = T5t - T5A;
+			 Rp[WS(rs, 1)] = T5t + T5A;
+			 T7x = T5C + T5D;
+			 T7E = T7y + T7D;
+			 Im[WS(rs, 14)] = T7x - T7E;
+			 Ip[WS(rs, 1)] = T7x + T7E;
+		    }
+		    {
+			 E T5B, T5E, T7F, T7G;
+			 T5B = T5p - T5s;
+			 T5E = T5C - T5D;
+			 Rm[WS(rs, 6)] = T5B - T5E;
+			 Rp[WS(rs, 9)] = T5B + T5E;
+			 T7F = T5z - T5w;
+			 T7G = T7D - T7y;
+			 Im[WS(rs, 6)] = T7F - T7G;
+			 Ip[WS(rs, 9)] = T7F + T7G;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cf_32", twinstr, &GENUS, {340, 114, 94, 0} };
+
+void X(codelet_hc2cf_32) (planner *p) {
+     X(khc2c_register) (p, hc2cf_32, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include hc2cf.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 31 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E To, Te, Tm, T8, Tw, Ty, Tq, Tk;
+	       {
+		    E T1, Tv, Tu, T7, Tg, Tj, Tf, Ti, Tp, Th;
+		    T1 = Rp[0];
+		    Tv = Rm[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = Rp[WS(rs, 1)];
+			 T6 = Rm[WS(rs, 1)];
+			 T2 = W[2];
+			 T5 = W[3];
+			 {
+			      E Ta, Td, Tc, Tn, Tb, Tt, T4, T9;
+			      Ta = Ip[0];
+			      Td = Im[0];
+			      Tt = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[0];
+			      Tc = W[1];
+			      Tu = FNMS(T5, T3, Tt);
+			      T7 = FMA(T5, T6, T4);
+			      Tn = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tg = Ip[WS(rs, 1)];
+			      Tj = Im[WS(rs, 1)];
+			      To = FNMS(Tc, Ta, Tn);
+			      Te = FMA(Tc, Td, Tb);
+			      Tf = W[4];
+			      Ti = W[5];
+			 }
+		    }
+		    Tm = T1 - T7;
+		    T8 = T1 + T7;
+		    Tw = Tu + Tv;
+		    Ty = Tv - Tu;
+		    Tp = Tf * Tj;
+		    Th = Tf * Tg;
+		    Tq = FNMS(Ti, Tg, Tp);
+		    Tk = FMA(Ti, Tj, Th);
+	       }
+	       {
+		    E Ts, Tr, Tl, Tx;
+		    Ts = To + Tq;
+		    Tr = To - Tq;
+		    Tl = Te + Tk;
+		    Tx = Tk - Te;
+		    Rp[WS(rs, 1)] = Tm + Tr;
+		    Rm[0] = Tm - Tr;
+		    Ip[0] = Ts + Tw;
+		    Im[WS(rs, 1)] = Ts - Tw;
+		    Ip[WS(rs, 1)] = Tx + Ty;
+		    Im[0] = Tx - Ty;
+		    Rp[0] = T8 + Tl;
+		    Rm[WS(rs, 1)] = T8 - Tl;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hc2cf_4) (planner *p) {
+     X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include hc2cf.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T1, Tp, T6, To, Tc, Tk, Th, Tl;
+	       T1 = Rp[0];
+	       Tp = Rm[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = Rp[WS(rs, 1)];
+		    T5 = Rm[WS(rs, 1)];
+		    T2 = W[2];
+		    T4 = W[3];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    To = FNMS(T4, T3, T2 * T5);
+	       }
+	       {
+		    E T9, Tb, T8, Ta;
+		    T9 = Ip[0];
+		    Tb = Im[0];
+		    T8 = W[0];
+		    Ta = W[1];
+		    Tc = FMA(T8, T9, Ta * Tb);
+		    Tk = FNMS(Ta, T9, T8 * Tb);
+	       }
+	       {
+		    E Te, Tg, Td, Tf;
+		    Te = Ip[WS(rs, 1)];
+		    Tg = Im[WS(rs, 1)];
+		    Td = W[4];
+		    Tf = W[5];
+		    Th = FMA(Td, Te, Tf * Tg);
+		    Tl = FNMS(Tf, Te, Td * Tg);
+	       }
+	       {
+		    E T7, Ti, Tn, Tq;
+		    T7 = T1 + T6;
+		    Ti = Tc + Th;
+		    Rm[WS(rs, 1)] = T7 - Ti;
+		    Rp[0] = T7 + Ti;
+		    Tn = Tk + Tl;
+		    Tq = To + Tp;
+		    Im[WS(rs, 1)] = Tn - Tq;
+		    Ip[0] = Tn + Tq;
+	       }
+	       {
+		    E Tj, Tm, Tr, Ts;
+		    Tj = T1 - T6;
+		    Tm = Tk - Tl;
+		    Rm[0] = Tj - Tm;
+		    Rp[WS(rs, 1)] = Tj + Tm;
+		    Tr = Th - Tc;
+		    Ts = Tp - To;
+		    Im[0] = Tr - Ts;
+		    Ip[WS(rs, 1)] = Tr + Ts;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hc2cf_4) (planner *p) {
+     X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include hc2cf.h */
+
+/*
+ * This function contains 46 FP additions, 32 FP multiplications,
+ * (or, 24 additions, 10 multiplications, 22 fused multiply/add),
+ * 47 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E TY, TU, T10, TZ;
+	       {
+		    E T1, TX, TW, T7, Tn, Tq, TJ, TS, TB, Tl, To, TK, Tt, Tw, Ts;
+		    E Tp, Tv;
+		    T1 = Rp[0];
+		    TX = Rm[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = Ip[WS(rs, 1)];
+			 T6 = Im[WS(rs, 1)];
+			 T2 = W[4];
+			 T5 = W[5];
+			 {
+			      E Ta, Td, Tg, TF, Tb, Tj, Tf, Tc, Ti, TV, T4, T9;
+			      Ta = Rp[WS(rs, 1)];
+			      Td = Rm[WS(rs, 1)];
+			      TV = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[2];
+			      Tg = Ip[WS(rs, 2)];
+			      TW = FNMS(T5, T3, TV);
+			      T7 = FMA(T5, T6, T4);
+			      TF = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tj = Im[WS(rs, 2)];
+			      Tf = W[8];
+			      Tc = W[3];
+			      Ti = W[9];
+			      {
+				   E TG, Te, TI, Tk, TH, Th, Tm;
+				   Tn = Rp[WS(rs, 2)];
+				   TH = Tf * Tj;
+				   Th = Tf * Tg;
+				   TG = FNMS(Tc, Ta, TF);
+				   Te = FMA(Tc, Td, Tb);
+				   TI = FNMS(Ti, Tg, TH);
+				   Tk = FMA(Ti, Tj, Th);
+				   Tq = Rm[WS(rs, 2)];
+				   Tm = W[6];
+				   TJ = TG + TI;
+				   TS = TI - TG;
+				   TB = Te + Tk;
+				   Tl = Te - Tk;
+				   To = Tm * Tn;
+				   TK = Tm * Tq;
+			      }
+			      Tt = Ip[0];
+			      Tw = Im[0];
+			      Ts = W[0];
+			      Tp = W[7];
+			      Tv = W[1];
+			 }
+		    }
+		    {
+			 E TA, T8, TL, Tr, TN, Tx, T12, TM, Tu;
+			 TA = T1 + T7;
+			 T8 = T1 - T7;
+			 TM = Ts * Tw;
+			 Tu = Ts * Tt;
+			 TL = FNMS(Tp, Tn, TK);
+			 Tr = FMA(Tp, Tq, To);
+			 TN = FNMS(Tv, Tt, TM);
+			 Tx = FMA(Tv, Tw, Tu);
+			 T12 = TX - TW;
+			 TY = TW + TX;
+			 {
+			      E TP, TT, TD, TQ, TE, Tz, T14, T13;
+			      {
+				   E TO, TR, TC, Ty, T11;
+				   TO = TL + TN;
+				   TR = TN - TL;
+				   TC = Tr + Tx;
+				   Ty = Tr - Tx;
+				   TP = TJ - TO;
+				   TU = TJ + TO;
+				   TT = TR - TS;
+				   T11 = TS + TR;
+				   Tz = Tl + Ty;
+				   T14 = Ty - Tl;
+				   Im[WS(rs, 2)] = T11 - T12;
+				   T13 = FMA(KP500000000, T11, T12);
+				   T10 = TB - TC;
+				   TD = TB + TC;
+			      }
+			      Rm[WS(rs, 2)] = T8 + Tz;
+			      TQ = FNMS(KP500000000, Tz, T8);
+			      Im[0] = FMS(KP866025403, T14, T13);
+			      Ip[WS(rs, 1)] = FMA(KP866025403, T14, T13);
+			      TE = FNMS(KP500000000, TD, TA);
+			      Rm[0] = FNMS(KP866025403, TT, TQ);
+			      Rp[WS(rs, 1)] = FMA(KP866025403, TT, TQ);
+			      Rp[0] = TA + TD;
+			      Rm[WS(rs, 1)] = FMA(KP866025403, TP, TE);
+			      Rp[WS(rs, 2)] = FNMS(KP866025403, TP, TE);
+			 }
+		    }
+	       }
+	       Ip[0] = TU + TY;
+	       TZ = FNMS(KP500000000, TU, TY);
+	       Im[WS(rs, 1)] = FMS(KP866025403, T10, TZ);
+	       Ip[WS(rs, 2)] = FMA(KP866025403, T10, TZ);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, {24, 10, 22, 0} };
+
+void X(codelet_hc2cf_6) (planner *p) {
+     X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include hc2cf.h */
+
+/*
+ * This function contains 46 FP additions, 28 FP multiplications,
+ * (or, 32 additions, 14 multiplications, 14 fused multiply/add),
+ * 23 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
+	       {
+		    E T1, TN, T6, TM;
+		    T1 = Rp[0];
+		    TN = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Ip[WS(rs, 1)];
+			 T5 = Im[WS(rs, 1)];
+			 T2 = W[4];
+			 T4 = W[5];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TM = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    TS = TN - TM;
+		    Tv = T1 + T6;
+		    TO = TM + TN;
+	       }
+	       {
+		    E Tn, TD, Ts, TE;
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = Rp[WS(rs, 2)];
+			 Tm = Rm[WS(rs, 2)];
+			 Tj = W[6];
+			 Tl = W[7];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 TD = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = Ip[0];
+			 Tr = Im[0];
+			 To = W[0];
+			 Tq = W[1];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TE = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn - Ts;
+		    TJ = TE - TD;
+		    Tx = Tn + Ts;
+		    TF = TD + TE;
+	       }
+	       {
+		    E Tc, TA, Th, TB;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Rp[WS(rs, 1)];
+			 Tb = Rm[WS(rs, 1)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TA = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Ip[WS(rs, 2)];
+			 Tg = Im[WS(rs, 2)];
+			 Td = W[8];
+			 Tf = W[9];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TB = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc - Th;
+		    TI = TA - TB;
+		    Tw = Tc + Th;
+		    TC = TA + TB;
+	       }
+	       {
+		    E TK, Tu, TH, TT, TR, TU;
+		    TK = KP866025403 * (TI + TJ);
+		    Tu = Ti + Tt;
+		    TH = FNMS(KP500000000, Tu, T7);
+		    Rm[WS(rs, 2)] = T7 + Tu;
+		    Rp[WS(rs, 1)] = TH + TK;
+		    Rm[0] = TH - TK;
+		    TT = KP866025403 * (Tt - Ti);
+		    TR = TJ - TI;
+		    TU = FMA(KP500000000, TR, TS);
+		    Im[WS(rs, 2)] = TR - TS;
+		    Ip[WS(rs, 1)] = TT + TU;
+		    Im[0] = TT - TU;
+	       }
+	       {
+		    E TG, Ty, Tz, TP, TL, TQ;
+		    TG = KP866025403 * (TC - TF);
+		    Ty = Tw + Tx;
+		    Tz = FNMS(KP500000000, Ty, Tv);
+		    Rp[0] = Tv + Ty;
+		    Rm[WS(rs, 1)] = Tz + TG;
+		    Rp[WS(rs, 2)] = Tz - TG;
+		    TP = KP866025403 * (Tw - Tx);
+		    TL = TC + TF;
+		    TQ = FNMS(KP500000000, TL, TO);
+		    Ip[0] = TL + TO;
+		    Ip[WS(rs, 2)] = TP + TQ;
+		    Im[WS(rs, 1)] = TP - TQ;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, {32, 14, 14, 0} };
+
+void X(codelet_hc2cf_6) (planner *p) {
+     X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cf_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include hc2cf.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 61 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1g, T1f, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
+	       {
+		    E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
+		    E Tp, Tx, Tt, Tq, Tw;
+		    {
+			 E T3, T6, T2, T5;
+			 T1 = Rp[0];
+			 T1m = Rm[0];
+			 T3 = Rp[WS(rs, 2)];
+			 T6 = Rm[WS(rs, 2)];
+			 T2 = W[6];
+			 T5 = W[7];
+			 {
+			      E Ta, Td, T9, Tc;
+			      {
+				   E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
+				   Tg = Rp[WS(rs, 3)];
+				   Tj = Rm[WS(rs, 3)];
+				   T1k = T2 * T6;
+				   T4 = T2 * T3;
+				   Tf = W[10];
+				   Ti = W[11];
+				   T1l = FNMS(T5, T3, T1k);
+				   T7 = FMA(T5, T6, T4);
+				   TR = Tf * Tj;
+				   Th = Tf * Tg;
+				   Ta = Rp[WS(rs, 1)];
+				   Td = Rm[WS(rs, 1)];
+				   TS = FNMS(Ti, Tg, TR);
+				   Tk = FMA(Ti, Tj, Th);
+				   T9 = W[2];
+				   Tc = W[3];
+			      }
+			      {
+				   E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
+				   TB = Ip[WS(rs, 3)];
+				   TE = Im[WS(rs, 3)];
+				   TP = T9 * Td;
+				   Tb = T9 * Ta;
+				   TA = W[12];
+				   TH = Ip[WS(rs, 1)];
+				   TQ = FNMS(Tc, Ta, TP);
+				   Te = FMA(Tc, Td, Tb);
+				   T13 = TA * TE;
+				   TC = TA * TB;
+				   TK = Im[WS(rs, 1)];
+				   TG = W[4];
+				   TD = W[13];
+				   TJ = W[5];
+				   {
+					E T14, TF, T16, TL, T15, TI;
+					To = Ip[0];
+					T15 = TG * TK;
+					TI = TG * TH;
+					T14 = FNMS(TD, TB, T13);
+					TF = FMA(TD, TE, TC);
+					T16 = FNMS(TJ, TH, T15);
+					TL = FMA(TJ, TK, TI);
+					Tr = Im[0];
+					Tn = W[0];
+					T17 = T14 - T16;
+					T1g = T14 + T16;
+					TM = TF + TL;
+					T12 = TF - TL;
+				   }
+				   Tu = Ip[WS(rs, 2)];
+				   TW = Tn * Tr;
+				   Tp = Tn * To;
+				   Tx = Im[WS(rs, 2)];
+				   Tt = W[8];
+				   Tq = W[1];
+				   Tw = W[9];
+			      }
+			 }
+		    }
+		    {
+			 E T8, T1j, T1n, Tz, T1a, TU, Tl, T1b, T1c, T1v, T1t, T1w, T19, T1u, T1d;
+			 {
+			      E T1r, T10, TV, T1s, T11, T18;
+			      {
+				   E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
+				   T8 = T1 + T7;
+				   TO = T1 - T7;
+				   TY = Tt * Tx;
+				   Tv = Tt * Tu;
+				   TX = FNMS(Tq, To, TW);
+				   Ts = FMA(Tq, Tr, Tp);
+				   TZ = FNMS(Tw, Tu, TY);
+				   Ty = FMA(Tw, Tx, Tv);
+				   TT = TQ - TS;
+				   T1j = TQ + TS;
+				   T1n = T1l + T1m;
+				   T1r = T1m - T1l;
+				   T10 = TX - TZ;
+				   T1f = TX + TZ;
+				   Tz = Ts + Ty;
+				   TV = Ts - Ty;
+				   T1a = TO - TT;
+				   TU = TO + TT;
+				   T1s = Te - Tk;
+				   Tl = Te + Tk;
+			      }
+			      T1b = T10 - TV;
+			      T11 = TV + T10;
+			      T18 = T12 - T17;
+			      T1c = T12 + T17;
+			      T1v = T1s + T1r;
+			      T1t = T1r - T1s;
+			      T1w = T18 - T11;
+			      T19 = T11 + T18;
+			 }
+			 Ip[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
+			 Im[0] = FMS(KP707106781, T1w, T1v);
+			 Rp[WS(rs, 1)] = FMA(KP707106781, T19, TU);
+			 Rm[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
+			 T1u = T1b + T1c;
+			 T1d = T1b - T1c;
+			 Ip[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
+			 Im[WS(rs, 2)] = FMS(KP707106781, T1u, T1t);
+			 Rp[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
+			 Rm[0] = FNMS(KP707106781, T1d, T1a);
+			 T1e = T8 - Tl;
+			 Tm = T8 + Tl;
+			 T1q = T1n - T1j;
+			 T1o = T1j + T1n;
+			 T1p = TM - Tz;
+			 TN = Tz + TM;
+		    }
+	       }
+	       Ip[WS(rs, 2)] = T1p + T1q;
+	       Im[WS(rs, 1)] = T1p - T1q;
+	       Rp[0] = Tm + TN;
+	       Rm[WS(rs, 3)] = Tm - TN;
+	       T1h = T1f - T1g;
+	       T1i = T1f + T1g;
+	       Ip[0] = T1i + T1o;
+	       Im[WS(rs, 3)] = T1i - T1o;
+	       Rp[WS(rs, 2)] = T1e + T1h;
+	       Rm[WS(rs, 1)] = T1e - T1h;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {44, 14, 22, 0} };
+
+void X(codelet_hc2cf_8) (planner *p) {
+     X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include hc2cf.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 28 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
+	       E TP;
+	       {
+		    E T1, T18, T6, T17;
+		    T1 = Rp[0];
+		    T18 = Rm[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = Rp[WS(rs, 2)];
+			 T5 = Rm[WS(rs, 2)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T17 = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T1e = T18 - T17;
+		    TH = T1 - T6;
+		    T19 = T17 + T18;
+	       }
+	       {
+		    E Tz, TS, TE, TT;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = Ip[WS(rs, 3)];
+			 Ty = Im[WS(rs, 3)];
+			 Tv = W[12];
+			 Tx = W[13];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 TS = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = Ip[WS(rs, 1)];
+			 TD = Im[WS(rs, 1)];
+			 TA = W[4];
+			 TC = W[5];
+			 TE = FMA(TA, TB, TC * TD);
+			 TT = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T13 = TS + TT;
+		    TR = Tz - TE;
+		    TU = TS - TT;
+	       }
+	       {
+		    E Tc, TI, Th, TJ;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = Rp[WS(rs, 1)];
+			 Tb = Rm[WS(rs, 1)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TI = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = Rp[WS(rs, 3)];
+			 Tg = Rm[WS(rs, 3)];
+			 Td = W[10];
+			 Tf = W[11];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TJ = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T1f = Tc - Th;
+		    TK = TI - TJ;
+		    T16 = TI + TJ;
+	       }
+	       {
+		    E To, TN, Tt, TO;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = Ip[0];
+			 Tn = Im[0];
+			 Tk = W[0];
+			 Tm = W[1];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 TN = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = Ip[WS(rs, 2)];
+			 Ts = Im[WS(rs, 2)];
+			 Tp = W[8];
+			 Tr = W[9];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 TO = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T12 = TN + TO;
+		    TM = To - Tt;
+		    TP = TN - TO;
+	       }
+	       {
+		    E Tj, TG, T1b, T1c;
+		    Tj = T7 + Ti;
+		    TG = Tu + TF;
+		    Rm[WS(rs, 3)] = Tj - TG;
+		    Rp[0] = Tj + TG;
+		    {
+			 E T15, T1a, T11, T14;
+			 T15 = T12 + T13;
+			 T1a = T16 + T19;
+			 Im[WS(rs, 3)] = T15 - T1a;
+			 Ip[0] = T15 + T1a;
+			 T11 = T7 - Ti;
+			 T14 = T12 - T13;
+			 Rm[WS(rs, 1)] = T11 - T14;
+			 Rp[WS(rs, 2)] = T11 + T14;
+		    }
+		    T1b = TF - Tu;
+		    T1c = T19 - T16;
+		    Im[WS(rs, 1)] = T1b - T1c;
+		    Ip[WS(rs, 2)] = T1b + T1c;
+		    {
+			 E TX, T1g, T10, T1d, TY, TZ;
+			 TX = TH - TK;
+			 T1g = T1e - T1f;
+			 TY = TP - TM;
+			 TZ = TR + TU;
+			 T10 = KP707106781 * (TY - TZ);
+			 T1d = KP707106781 * (TY + TZ);
+			 Rm[0] = TX - T10;
+			 Ip[WS(rs, 1)] = T1d + T1g;
+			 Rp[WS(rs, 3)] = TX + T10;
+			 Im[WS(rs, 2)] = T1d - T1g;
+		    }
+		    {
+			 E TL, T1i, TW, T1h, TQ, TV;
+			 TL = TH + TK;
+			 T1i = T1f + T1e;
+			 TQ = TM + TP;
+			 TV = TR - TU;
+			 TW = KP707106781 * (TQ + TV);
+			 T1h = KP707106781 * (TV - TQ);
+			 Rm[WS(rs, 2)] = TL - TW;
+			 Ip[WS(rs, 3)] = T1h + T1i;
+			 Rp[WS(rs, 1)] = TL + TW;
+			 Im[0] = T1h - T1i;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {52, 18, 14, 0} };
+
+void X(codelet_hc2cf_8) (planner *p) {
+     X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include hc2cf.h */
+
+/*
+ * This function contains 228 FP additions, 166 FP multiplications,
+ * (or, 136 additions, 74 multiplications, 92 fused multiply/add),
+ * 103 stack variables, 4 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T4p, T4o, T4n, T4s;
+	       {
+		    E T1, T2, Tw, Ty, Th, T3, Tx, TE, Ti, TK, Tj, T4, T5;
+		    T1 = W[0];
+		    T2 = W[2];
+		    Tw = W[6];
+		    Ty = W[7];
+		    Th = W[4];
+		    T3 = T1 * T2;
+		    Tx = T1 * Tw;
+		    TE = T1 * Ty;
+		    Ti = T1 * Th;
+		    TK = T2 * Th;
+		    Tj = W[5];
+		    T4 = W[1];
+		    T5 = W[3];
+		    {
+			 E T1v, T2q, T1s, T2s, T38, T3T, T1Y, T3P, T17, T1h, T2x, T2v, T33, T3Q, T3S;
+			 E T1N, Tv, T3A, T2E, T3B, T3L, T2c, T3I, T2S, TW, T3E, T3J, T2n, T3D, T2J;
+			 E T3M, T2X;
+			 {
+			      E TF, Tk, Tz, TL, T6, TR, Tq, Tc, T2h, T25, T2k, T29, T1G, T1M, T2P;
+			      E T2R;
+			      {
+				   E T18, TY, T1d, T13, T1H, T1A, T1K, T1E, T37, T1R, T35, T1X;
+				   {
+					E T1j, T1o, T1W, T1p, T1m, T1Q, T1U, T1q;
+					{
+					     E T1k, T1l, T1S, T1T;
+					     {
+						  E T1t, T28, T24, T1D, T1z, T1u, TQ, Tp, Tb;
+						  T1t = Ip[0];
+						  TQ = T2 * Tj;
+						  Tp = T1 * Tj;
+						  TF = FNMS(T4, Tw, TE);
+						  T1j = FMA(T4, Tj, Ti);
+						  Tk = FNMS(T4, Tj, Ti);
+						  Tz = FMA(T4, Ty, Tx);
+						  T18 = FNMS(T5, Tj, TK);
+						  TL = FMA(T5, Tj, TK);
+						  TY = FNMS(T4, T5, T3);
+						  T6 = FMA(T4, T5, T3);
+						  Tb = T1 * T5;
+						  TR = FNMS(T5, Th, TQ);
+						  T1d = FMA(T5, Th, TQ);
+						  Tq = FMA(T4, Th, Tp);
+						  T1o = FNMS(T4, Th, Tp);
+						  T28 = T6 * Tj;
+						  T24 = T6 * Th;
+						  T1D = TY * Tj;
+						  T1z = TY * Th;
+						  Tc = FNMS(T4, T2, Tb);
+						  T13 = FMA(T4, T2, Tb);
+						  T1u = Im[0];
+						  T1k = Ip[WS(rs, 4)];
+						  T2h = FMA(Tc, Tj, T24);
+						  T25 = FNMS(Tc, Tj, T24);
+						  T2k = FNMS(Tc, Th, T28);
+						  T29 = FMA(Tc, Th, T28);
+						  T1H = FNMS(T13, Tj, T1z);
+						  T1A = FMA(T13, Tj, T1z);
+						  T1K = FMA(T13, Th, T1D);
+						  T1E = FNMS(T13, Th, T1D);
+						  T1W = T1t + T1u;
+						  T1v = T1t - T1u;
+						  T1l = Im[WS(rs, 4)];
+					     }
+					     T1S = Rm[0];
+					     T1T = Rp[0];
+					     T1p = Rp[WS(rs, 4)];
+					     T1m = T1k - T1l;
+					     T1Q = T1k + T1l;
+					     T2q = T1T + T1S;
+					     T1U = T1S - T1T;
+					     T1q = Rm[WS(rs, 4)];
+					}
+					{
+					     E T36, T1V, T1O, T1r, T1n, T1P, T34, T2r;
+					     T36 = T4 * T1U;
+					     T1V = T1 * T1U;
+					     T1O = T1q - T1p;
+					     T1r = T1p + T1q;
+					     T1n = T1j * T1m;
+					     T37 = FMA(T1, T1W, T36);
+					     T2r = T1j * T1r;
+					     T1P = Th * T1O;
+					     T34 = Tj * T1O;
+					     T1s = FNMS(T1o, T1r, T1n);
+					     T2s = FMA(T1o, T1m, T2r);
+					     T1R = FNMS(Tj, T1Q, T1P);
+					     T35 = FMA(Th, T1Q, T34);
+					     T1X = FNMS(T4, T1W, T1V);
+					}
+				   }
+				   {
+					E T1F, T11, T1e, T16, T1L, T1b, T1f, T1C, T2Z;
+					{
+					     E T14, T15, TZ, T10, T19, T1a, T1B;
+					     TZ = Ip[WS(rs, 2)];
+					     T10 = Im[WS(rs, 2)];
+					     T38 = T35 + T37;
+					     T3T = T37 - T35;
+					     T1Y = T1R + T1X;
+					     T3P = T1X - T1R;
+					     T1F = TZ + T10;
+					     T11 = TZ - T10;
+					     T14 = Rp[WS(rs, 2)];
+					     T15 = Rm[WS(rs, 2)];
+					     T19 = Ip[WS(rs, 6)];
+					     T1a = Im[WS(rs, 6)];
+					     T1e = Rp[WS(rs, 6)];
+					     T16 = T14 + T15;
+					     T1B = T15 - T14;
+					     T1L = T19 + T1a;
+					     T1b = T19 - T1a;
+					     T1f = Rm[WS(rs, 6)];
+					     T1C = T1A * T1B;
+					     T2Z = T1E * T1B;
+					}
+					{
+					     E T1J, T31, T2u, T30, T32;
+					     {
+						  E T12, T1g, T1I, T1c, T2w;
+						  T12 = TY * T11;
+						  T1g = T1e + T1f;
+						  T1I = T1f - T1e;
+						  T1c = T18 * T1b;
+						  T17 = FNMS(T13, T16, T12);
+						  T2w = T18 * T1g;
+						  T1J = T1H * T1I;
+						  T31 = T1K * T1I;
+						  T1h = FNMS(T1d, T1g, T1c);
+						  T2x = FMA(T1d, T1b, T2w);
+					     }
+					     T2u = TY * T16;
+					     T30 = FMA(T1A, T1F, T2Z);
+					     T32 = FMA(T1H, T1L, T31);
+					     T1G = FNMS(T1E, T1F, T1C);
+					     T2v = FMA(T13, T11, T2u);
+					     T1M = FNMS(T1K, T1L, T1J);
+					     T33 = T30 + T32;
+					     T3Q = T30 - T32;
+					}
+				   }
+			      }
+			      {
+				   E Tl, T22, T9, T20, Tf, T2O, Ta, T21, T2A, Tm, Tr, Ts;
+				   {
+					E T7, T8, Td, Te;
+					T7 = Ip[WS(rs, 1)];
+					T3S = T1G - T1M;
+					T1N = T1G + T1M;
+					T8 = Im[WS(rs, 1)];
+					Td = Rp[WS(rs, 1)];
+					Te = Rm[WS(rs, 1)];
+					Tl = Ip[WS(rs, 5)];
+					T22 = T7 + T8;
+					T9 = T7 - T8;
+					T20 = Td - Te;
+					Tf = Td + Te;
+					T2O = T2 * T22;
+					Ta = T6 * T9;
+					T21 = T2 * T20;
+					T2A = T6 * Tf;
+					Tm = Im[WS(rs, 5)];
+					Tr = Rp[WS(rs, 5)];
+					Ts = Rm[WS(rs, 5)];
+				   }
+				   {
+					E Tg, T2a, Tn, T26, T2Q, T27, T2C, T2B, Tu, Tt, To, T23, T2D, T2b;
+					Tg = FNMS(Tc, Tf, Ta);
+					T2a = Tl + Tm;
+					Tn = Tl - Tm;
+					T26 = Tr - Ts;
+					Tt = Tr + Ts;
+					T2Q = T25 * T2a;
+					To = Tk * Tn;
+					T27 = T25 * T26;
+					T2C = Tk * Tt;
+					T2B = FMA(Tc, T9, T2A);
+					Tu = FNMS(Tq, Tt, To);
+					T23 = FMA(T5, T22, T21);
+					T2D = FMA(Tq, Tn, T2C);
+					T2b = FMA(T29, T2a, T27);
+					Tv = Tg + Tu;
+					T3A = Tg - Tu;
+					T2P = FNMS(T5, T20, T2O);
+					T2E = T2B + T2D;
+					T3B = T2B - T2D;
+					T3L = T2b - T23;
+					T2c = T23 + T2b;
+					T2R = FNMS(T29, T26, T2Q);
+				   }
+			      }
+			      {
+				   E T2f, TC, T2T, TD, T2d, TI, TS, T2e, T2F, T2l, TO, TT;
+				   {
+					E TG, TH, TA, TB, TM, TN;
+					TA = Ip[WS(rs, 7)];
+					TB = Im[WS(rs, 7)];
+					TG = Rp[WS(rs, 7)];
+					T3I = T2R - T2P;
+					T2S = T2P + T2R;
+					T2f = TA + TB;
+					TC = TA - TB;
+					TH = Rm[WS(rs, 7)];
+					TM = Ip[WS(rs, 3)];
+					T2T = Tw * T2f;
+					TD = Tz * TC;
+					T2d = TG - TH;
+					TI = TG + TH;
+					TN = Im[WS(rs, 3)];
+					TS = Rp[WS(rs, 3)];
+					T2e = Tw * T2d;
+					T2F = Tz * TI;
+					T2l = TM + TN;
+					TO = TM - TN;
+					TT = Rm[WS(rs, 3)];
+				   }
+				   {
+					E TJ, T2V, TP, T2i, TU, T2G;
+					TJ = FNMS(TF, TI, TD);
+					T2V = T2h * T2l;
+					TP = TL * TO;
+					T2i = TS - TT;
+					TU = TS + TT;
+					T2G = FMA(TF, TC, T2F);
+					{
+					     E T2g, T2j, TV, T2H;
+					     T2g = FMA(Ty, T2f, T2e);
+					     T2j = T2h * T2i;
+					     TV = FNMS(TR, TU, TP);
+					     T2H = TL * TU;
+					     {
+						  E T2U, T2m, T2I, T2W;
+						  T2U = FNMS(Ty, T2d, T2T);
+						  T2m = FMA(T2k, T2l, T2j);
+						  TW = TJ + TV;
+						  T3E = TJ - TV;
+						  T2I = FMA(TR, TO, T2H);
+						  T2W = FNMS(T2k, T2i, T2V);
+						  T3J = T2m - T2g;
+						  T2n = T2g + T2m;
+						  T3D = T2G - T2I;
+						  T2J = T2G + T2I;
+						  T3M = T2U - T2W;
+						  T2X = T2U + T2W;
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3Y, T3x, T3X, T3y, T3r, T3q, T3p, T3u;
+			      {
+				   E T2Y, T3o, TX, T3s, T3i, T39, T3t, T3l, T3e, T1x, T2M, T2p, T3d, T2K, T2t;
+				   E T2y;
+				   {
+					E T2o, T1Z, T3j, T3k, T1i, T1w, T3g, T3h;
+					T2Y = T2S + T2X;
+					T3g = T2X - T2S;
+					T3h = T2c - T2n;
+					T2o = T2c + T2n;
+					T1Z = T1N + T1Y;
+					T3j = T1Y - T1N;
+					T3o = Tv - TW;
+					TX = Tv + TW;
+					T3s = T3g - T3h;
+					T3i = T3g + T3h;
+					T3k = T38 - T33;
+					T39 = T33 + T38;
+					T3Y = T17 - T1h;
+					T1i = T17 + T1h;
+					T1w = T1s + T1v;
+					T3x = T1v - T1s;
+					T3t = T3j + T3k;
+					T3l = T3j - T3k;
+					T3e = T1w - T1i;
+					T1x = T1i + T1w;
+					T2M = T2o + T1Z;
+					T2p = T1Z - T2o;
+					T3d = T2J - T2E;
+					T2K = T2E + T2J;
+					T3X = T2q - T2s;
+					T2t = T2q + T2s;
+					T2y = T2v + T2x;
+					T3y = T2v - T2x;
+				   }
+				   {
+					E T2N, T3c, T3a, T3n, T3b, T2L, T2z, T1y;
+					T2N = T1x - TX;
+					T1y = TX + T1x;
+					T3c = T2Y + T39;
+					T3a = T2Y - T39;
+					T3n = T2t - T2y;
+					T2z = T2t + T2y;
+					Ip[0] = KP500000000 * (T1y + T2p);
+					Im[WS(rs, 7)] = KP500000000 * (T2p - T1y);
+					T3b = T2z + T2K;
+					T2L = T2z - T2K;
+					{
+					     E T3f, T3m, T3v, T3w;
+					     T3r = T3e - T3d;
+					     T3f = T3d + T3e;
+					     Im[WS(rs, 3)] = KP500000000 * (T3a - T2N);
+					     Ip[WS(rs, 4)] = KP500000000 * (T2N + T3a);
+					     Rp[WS(rs, 4)] = KP500000000 * (T2L + T2M);
+					     Rm[WS(rs, 3)] = KP500000000 * (T2L - T2M);
+					     Rp[0] = KP500000000 * (T3b + T3c);
+					     Rm[WS(rs, 7)] = KP500000000 * (T3b - T3c);
+					     T3m = T3i + T3l;
+					     T3q = T3l - T3i;
+					     T3p = T3n - T3o;
+					     T3v = T3n + T3o;
+					     T3w = T3s + T3t;
+					     T3u = T3s - T3t;
+					     Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3m, T3f)));
+					     Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3m, T3f));
+					     Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3w, T3v));
+					     Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3w, T3v));
+					}
+				   }
+			      }
+			      {
+				   E T3R, T4b, T3z, T4q, T4g, T3U, T40, T41, T4r, T4j, T4m, T3G, T46, T3O, T4l;
+				   E T3Z, T4c;
+				   {
+					E T3K, T3N, T4h, T4i, T3C, T3F, T4e, T4f;
+					Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3q, T3p));
+					Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3q, T3p));
+					Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3u, T3r)));
+					Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3u, T3r));
+					T3K = T3I + T3J;
+					T4e = T3I - T3J;
+					T4f = T3M - T3L;
+					T3N = T3L + T3M;
+					T3R = T3P - T3Q;
+					T4h = T3Q + T3P;
+					T4b = T3y + T3x;
+					T3z = T3x - T3y;
+					T4q = FNMS(KP414213562, T4e, T4f);
+					T4g = FMA(KP414213562, T4f, T4e);
+					T4i = T3T - T3S;
+					T3U = T3S + T3T;
+					T40 = T3B + T3A;
+					T3C = T3A - T3B;
+					T3F = T3D + T3E;
+					T41 = T3D - T3E;
+					T4r = FNMS(KP414213562, T4h, T4i);
+					T4j = FMA(KP414213562, T4i, T4h);
+					T4m = T3C - T3F;
+					T3G = T3C + T3F;
+					T46 = FNMS(KP414213562, T3K, T3N);
+					T3O = FMA(KP414213562, T3N, T3K);
+					T4l = T3X - T3Y;
+					T3Z = T3X + T3Y;
+				   }
+				   {
+					E T45, T3H, T42, T47, T3V;
+					T45 = FNMS(KP707106781, T3G, T3z);
+					T3H = FMA(KP707106781, T3G, T3z);
+					T4c = T41 - T40;
+					T42 = T40 + T41;
+					T47 = FMA(KP414213562, T3R, T3U);
+					T3V = FNMS(KP414213562, T3U, T3R);
+					{
+					     E T49, T43, T48, T4a, T44, T3W;
+					     T49 = FMA(KP707106781, T42, T3Z);
+					     T43 = FNMS(KP707106781, T42, T3Z);
+					     T48 = T46 - T47;
+					     T4a = T46 + T47;
+					     T44 = T3V - T3O;
+					     T3W = T3O + T3V;
+					     Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T4a, T49));
+					     Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T4a, T49));
+					     Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T44, T43));
+					     Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T44, T43));
+					     Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3H)));
+					     Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3W, T3H));
+					     Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T48, T45));
+					     Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T48, T45)));
+					}
+				   }
+				   {
+					E T4d, T4k, T4t, T4u;
+					T4p = FMA(KP707106781, T4c, T4b);
+					T4d = FNMS(KP707106781, T4c, T4b);
+					T4k = T4g - T4j;
+					T4o = T4g + T4j;
+					T4n = FMA(KP707106781, T4m, T4l);
+					T4t = FNMS(KP707106781, T4m, T4l);
+					T4u = T4q + T4r;
+					T4s = T4q - T4r;
+					Im[0] = -(KP500000000 * (FNMS(KP923879532, T4k, T4d)));
+					Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T4k, T4d));
+					Rm[0] = KP500000000 * (FMA(KP923879532, T4u, T4t));
+					Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4u, T4t));
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4o, T4n));
+	       Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4o, T4n));
+	       Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4s, T4p)));
+	       Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4s, T4p));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {136, 74, 92, 0} };
+
+void X(codelet_hc2cfdft2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include hc2cf.h */
+
+/*
+ * This function contains 228 FP additions, 124 FP multiplications,
+ * (or, 188 additions, 84 multiplications, 40 fused multiply/add),
+ * 91 stack variables, 4 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP461939766, +0.461939766255643378064091594698394143411208313);
+     DK(KP191341716, +0.191341716182544885864229992015199433380672281);
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T1, T4, T2, T5, T7, Td, T12, TY, Tk, Ti, Tm, T1l, T1b, TL, T1h;
+	       E Ts, TR, T17, Ty, Tz, TA, TE, T1L, T1Q, T1H, T1O, T24, T2d, T20, T2b;
+	       {
+		    E Tl, TP, Tq, TK, Tj, TQ, Tr, TJ;
+		    {
+			 E T3, Tc, T6, Tb;
+			 T1 = W[0];
+			 T4 = W[1];
+			 T2 = W[2];
+			 T5 = W[3];
+			 T3 = T1 * T2;
+			 Tc = T4 * T2;
+			 T6 = T4 * T5;
+			 Tb = T1 * T5;
+			 T7 = T3 + T6;
+			 Td = Tb - Tc;
+			 T12 = Tb + Tc;
+			 TY = T3 - T6;
+			 Tk = W[5];
+			 Tl = T4 * Tk;
+			 TP = T2 * Tk;
+			 Tq = T1 * Tk;
+			 TK = T5 * Tk;
+			 Ti = W[4];
+			 Tj = T1 * Ti;
+			 TQ = T5 * Ti;
+			 Tr = T4 * Ti;
+			 TJ = T2 * Ti;
+		    }
+		    Tm = Tj - Tl;
+		    T1l = Tq - Tr;
+		    T1b = TP + TQ;
+		    TL = TJ + TK;
+		    T1h = Tj + Tl;
+		    Ts = Tq + Tr;
+		    TR = TP - TQ;
+		    T17 = TJ - TK;
+		    Ty = W[6];
+		    Tz = W[7];
+		    TA = FMA(T1, Ty, T4 * Tz);
+		    TE = FNMS(T4, Ty, T1 * Tz);
+		    {
+			 E T1J, T1K, T1F, T1G;
+			 T1J = TY * Tk;
+			 T1K = T12 * Ti;
+			 T1L = T1J - T1K;
+			 T1Q = T1J + T1K;
+			 T1F = TY * Ti;
+			 T1G = T12 * Tk;
+			 T1H = T1F + T1G;
+			 T1O = T1F - T1G;
+		    }
+		    {
+			 E T22, T23, T1Y, T1Z;
+			 T22 = T7 * Tk;
+			 T23 = Td * Ti;
+			 T24 = T22 + T23;
+			 T2d = T22 - T23;
+			 T1Y = T7 * Ti;
+			 T1Z = Td * Tk;
+			 T20 = T1Y - T1Z;
+			 T2b = T1Y + T1Z;
+		    }
+	       }
+	       {
+		    E T1t, T3i, T2l, T3B, T1E, T3t, T2M, T3x, T1g, T3C, T2J, T3u, T1T, T3w, T2o;
+		    E T3j, Tx, T3b, T2C, T3q, T27, T3m, T2s, T3c, TW, T3f, T2F, T3n, T2g, T3p;
+		    E T2v, T3e;
+		    {
+			 E T1k, T1C, T1o, T1B, T1s, T1z, T1y, T2j, T1p, T2k;
+			 {
+			      E T1i, T1j, T1m, T1n;
+			      T1i = Ip[WS(rs, 4)];
+			      T1j = Im[WS(rs, 4)];
+			      T1k = T1i - T1j;
+			      T1C = T1i + T1j;
+			      T1m = Rp[WS(rs, 4)];
+			      T1n = Rm[WS(rs, 4)];
+			      T1o = T1m + T1n;
+			      T1B = T1m - T1n;
+			 }
+			 {
+			      E T1q, T1r, T1w, T1x;
+			      T1q = Ip[0];
+			      T1r = Im[0];
+			      T1s = T1q - T1r;
+			      T1z = T1q + T1r;
+			      T1w = Rm[0];
+			      T1x = Rp[0];
+			      T1y = T1w - T1x;
+			      T2j = T1x + T1w;
+			 }
+			 T1p = FNMS(T1l, T1o, T1h * T1k);
+			 T1t = T1p + T1s;
+			 T3i = T1s - T1p;
+			 T2k = FMA(T1h, T1o, T1l * T1k);
+			 T2l = T2j + T2k;
+			 T3B = T2j - T2k;
+			 {
+			      E T1A, T1D, T2K, T2L;
+			      T1A = FNMS(T4, T1z, T1 * T1y);
+			      T1D = FMA(Ti, T1B, Tk * T1C);
+			      T1E = T1A - T1D;
+			      T3t = T1D + T1A;
+			      T2K = FNMS(Tk, T1B, Ti * T1C);
+			      T2L = FMA(T4, T1y, T1 * T1z);
+			      T2M = T2K + T2L;
+			      T3x = T2L - T2K;
+			 }
+		    }
+		    {
+			 E T11, T1M, T15, T1I, T1a, T1R, T1e, T1P;
+			 {
+			      E TZ, T10, T13, T14;
+			      TZ = Ip[WS(rs, 2)];
+			      T10 = Im[WS(rs, 2)];
+			      T11 = TZ - T10;
+			      T1M = TZ + T10;
+			      T13 = Rp[WS(rs, 2)];
+			      T14 = Rm[WS(rs, 2)];
+			      T15 = T13 + T14;
+			      T1I = T13 - T14;
+			 }
+			 {
+			      E T18, T19, T1c, T1d;
+			      T18 = Ip[WS(rs, 6)];
+			      T19 = Im[WS(rs, 6)];
+			      T1a = T18 - T19;
+			      T1R = T18 + T19;
+			      T1c = Rp[WS(rs, 6)];
+			      T1d = Rm[WS(rs, 6)];
+			      T1e = T1c + T1d;
+			      T1P = T1c - T1d;
+			 }
+			 {
+			      E T16, T1f, T2H, T2I;
+			      T16 = FNMS(T12, T15, TY * T11);
+			      T1f = FNMS(T1b, T1e, T17 * T1a);
+			      T1g = T16 + T1f;
+			      T3C = T16 - T1f;
+			      T2H = FNMS(T1L, T1I, T1H * T1M);
+			      T2I = FNMS(T1Q, T1P, T1O * T1R);
+			      T2J = T2H + T2I;
+			      T3u = T2H - T2I;
+			 }
+			 {
+			      E T1N, T1S, T2m, T2n;
+			      T1N = FMA(T1H, T1I, T1L * T1M);
+			      T1S = FMA(T1O, T1P, T1Q * T1R);
+			      T1T = T1N + T1S;
+			      T3w = T1S - T1N;
+			      T2m = FMA(TY, T15, T12 * T11);
+			      T2n = FMA(T17, T1e, T1b * T1a);
+			      T2o = T2m + T2n;
+			      T3j = T2m - T2n;
+			 }
+		    }
+		    {
+			 E Ta, T1W, Tg, T1V, Tp, T25, Tv, T21;
+			 {
+			      E T8, T9, Te, Tf;
+			      T8 = Ip[WS(rs, 1)];
+			      T9 = Im[WS(rs, 1)];
+			      Ta = T8 - T9;
+			      T1W = T8 + T9;
+			      Te = Rp[WS(rs, 1)];
+			      Tf = Rm[WS(rs, 1)];
+			      Tg = Te + Tf;
+			      T1V = Te - Tf;
+			 }
+			 {
+			      E Tn, To, Tt, Tu;
+			      Tn = Ip[WS(rs, 5)];
+			      To = Im[WS(rs, 5)];
+			      Tp = Tn - To;
+			      T25 = Tn + To;
+			      Tt = Rp[WS(rs, 5)];
+			      Tu = Rm[WS(rs, 5)];
+			      Tv = Tt + Tu;
+			      T21 = Tt - Tu;
+			 }
+			 {
+			      E Th, Tw, T2A, T2B;
+			      Th = FNMS(Td, Tg, T7 * Ta);
+			      Tw = FNMS(Ts, Tv, Tm * Tp);
+			      Tx = Th + Tw;
+			      T3b = Th - Tw;
+			      T2A = FNMS(T5, T1V, T2 * T1W);
+			      T2B = FNMS(T24, T21, T20 * T25);
+			      T2C = T2A + T2B;
+			      T3q = T2A - T2B;
+			 }
+			 {
+			      E T1X, T26, T2q, T2r;
+			      T1X = FMA(T2, T1V, T5 * T1W);
+			      T26 = FMA(T20, T21, T24 * T25);
+			      T27 = T1X + T26;
+			      T3m = T26 - T1X;
+			      T2q = FMA(T7, Tg, Td * Ta);
+			      T2r = FMA(Tm, Tv, Ts * Tp);
+			      T2s = T2q + T2r;
+			      T3c = T2q - T2r;
+			 }
+		    }
+		    {
+			 E TD, T29, TH, T28, TO, T2e, TU, T2c;
+			 {
+			      E TB, TC, TF, TG;
+			      TB = Ip[WS(rs, 7)];
+			      TC = Im[WS(rs, 7)];
+			      TD = TB - TC;
+			      T29 = TB + TC;
+			      TF = Rp[WS(rs, 7)];
+			      TG = Rm[WS(rs, 7)];
+			      TH = TF + TG;
+			      T28 = TF - TG;
+			 }
+			 {
+			      E TM, TN, TS, TT;
+			      TM = Ip[WS(rs, 3)];
+			      TN = Im[WS(rs, 3)];
+			      TO = TM - TN;
+			      T2e = TM + TN;
+			      TS = Rp[WS(rs, 3)];
+			      TT = Rm[WS(rs, 3)];
+			      TU = TS + TT;
+			      T2c = TS - TT;
+			 }
+			 {
+			      E TI, TV, T2D, T2E;
+			      TI = FNMS(TE, TH, TA * TD);
+			      TV = FNMS(TR, TU, TL * TO);
+			      TW = TI + TV;
+			      T3f = TI - TV;
+			      T2D = FNMS(Tz, T28, Ty * T29);
+			      T2E = FNMS(T2d, T2c, T2b * T2e);
+			      T2F = T2D + T2E;
+			      T3n = T2D - T2E;
+			 }
+			 {
+			      E T2a, T2f, T2t, T2u;
+			      T2a = FMA(Ty, T28, Tz * T29);
+			      T2f = FMA(T2b, T2c, T2d * T2e);
+			      T2g = T2a + T2f;
+			      T3p = T2f - T2a;
+			      T2t = FMA(TA, TH, TE * TD);
+			      T2u = FMA(TL, TU, TR * TO);
+			      T2v = T2t + T2u;
+			      T3e = T2t - T2u;
+			 }
+		    }
+		    {
+			 E T1v, T2z, T2O, T2Q, T2i, T2y, T2x, T2P;
+			 {
+			      E TX, T1u, T2G, T2N;
+			      TX = Tx + TW;
+			      T1u = T1g + T1t;
+			      T1v = TX + T1u;
+			      T2z = T1u - TX;
+			      T2G = T2C + T2F;
+			      T2N = T2J + T2M;
+			      T2O = T2G - T2N;
+			      T2Q = T2G + T2N;
+			 }
+			 {
+			      E T1U, T2h, T2p, T2w;
+			      T1U = T1E - T1T;
+			      T2h = T27 + T2g;
+			      T2i = T1U - T2h;
+			      T2y = T2h + T1U;
+			      T2p = T2l + T2o;
+			      T2w = T2s + T2v;
+			      T2x = T2p - T2w;
+			      T2P = T2p + T2w;
+			 }
+			 Ip[0] = KP500000000 * (T1v + T2i);
+			 Rp[0] = KP500000000 * (T2P + T2Q);
+			 Im[WS(rs, 7)] = KP500000000 * (T2i - T1v);
+			 Rm[WS(rs, 7)] = KP500000000 * (T2P - T2Q);
+			 Rm[WS(rs, 3)] = KP500000000 * (T2x - T2y);
+			 Im[WS(rs, 3)] = KP500000000 * (T2O - T2z);
+			 Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
+			 Ip[WS(rs, 4)] = KP500000000 * (T2z + T2O);
+		    }
+		    {
+			 E T2T, T35, T33, T39, T2W, T36, T2Z, T37;
+			 {
+			      E T2R, T2S, T31, T32;
+			      T2R = T2v - T2s;
+			      T2S = T1t - T1g;
+			      T2T = KP500000000 * (T2R + T2S);
+			      T35 = KP500000000 * (T2S - T2R);
+			      T31 = T2l - T2o;
+			      T32 = Tx - TW;
+			      T33 = KP500000000 * (T31 - T32);
+			      T39 = KP500000000 * (T31 + T32);
+			 }
+			 {
+			      E T2U, T2V, T2X, T2Y;
+			      T2U = T2F - T2C;
+			      T2V = T27 - T2g;
+			      T2W = T2U + T2V;
+			      T36 = T2U - T2V;
+			      T2X = T1T + T1E;
+			      T2Y = T2M - T2J;
+			      T2Z = T2X - T2Y;
+			      T37 = T2X + T2Y;
+			 }
+			 {
+			      E T30, T3a, T34, T38;
+			      T30 = KP353553390 * (T2W + T2Z);
+			      Ip[WS(rs, 2)] = T2T + T30;
+			      Im[WS(rs, 5)] = T30 - T2T;
+			      T3a = KP353553390 * (T36 + T37);
+			      Rm[WS(rs, 5)] = T39 - T3a;
+			      Rp[WS(rs, 2)] = T39 + T3a;
+			      T34 = KP353553390 * (T2Z - T2W);
+			      Rm[WS(rs, 1)] = T33 - T34;
+			      Rp[WS(rs, 6)] = T33 + T34;
+			      T38 = KP353553390 * (T36 - T37);
+			      Ip[WS(rs, 6)] = T35 + T38;
+			      Im[WS(rs, 1)] = T38 - T35;
+			 }
+		    }
+		    {
+			 E T3k, T3Q, T3Z, T3D, T3h, T40, T3X, T45, T3G, T3P, T3s, T3K, T3U, T44, T3z;
+			 E T3L;
+			 {
+			      E T3d, T3g, T3o, T3r;
+			      T3k = KP500000000 * (T3i - T3j);
+			      T3Q = KP500000000 * (T3j + T3i);
+			      T3Z = KP500000000 * (T3B - T3C);
+			      T3D = KP500000000 * (T3B + T3C);
+			      T3d = T3b - T3c;
+			      T3g = T3e + T3f;
+			      T3h = KP353553390 * (T3d + T3g);
+			      T40 = KP353553390 * (T3d - T3g);
+			      {
+				   E T3V, T3W, T3E, T3F;
+				   T3V = T3u + T3t;
+				   T3W = T3x - T3w;
+				   T3X = FNMS(KP461939766, T3W, KP191341716 * T3V);
+				   T45 = FMA(KP461939766, T3V, KP191341716 * T3W);
+				   T3E = T3c + T3b;
+				   T3F = T3e - T3f;
+				   T3G = KP353553390 * (T3E + T3F);
+				   T3P = KP353553390 * (T3F - T3E);
+			      }
+			      T3o = T3m + T3n;
+			      T3r = T3p - T3q;
+			      T3s = FMA(KP191341716, T3o, KP461939766 * T3r);
+			      T3K = FNMS(KP191341716, T3r, KP461939766 * T3o);
+			      {
+				   E T3S, T3T, T3v, T3y;
+				   T3S = T3n - T3m;
+				   T3T = T3q + T3p;
+				   T3U = FMA(KP461939766, T3S, KP191341716 * T3T);
+				   T44 = FNMS(KP461939766, T3T, KP191341716 * T3S);
+				   T3v = T3t - T3u;
+				   T3y = T3w + T3x;
+				   T3z = FNMS(KP191341716, T3y, KP461939766 * T3v);
+				   T3L = FMA(KP191341716, T3v, KP461939766 * T3y);
+			      }
+			 }
+			 {
+			      E T3l, T3A, T3N, T3O;
+			      T3l = T3h + T3k;
+			      T3A = T3s + T3z;
+			      Ip[WS(rs, 1)] = T3l + T3A;
+			      Im[WS(rs, 6)] = T3A - T3l;
+			      T3N = T3D + T3G;
+			      T3O = T3K + T3L;
+			      Rm[WS(rs, 6)] = T3N - T3O;
+			      Rp[WS(rs, 1)] = T3N + T3O;
+			 }
+			 {
+			      E T3H, T3I, T3J, T3M;
+			      T3H = T3D - T3G;
+			      T3I = T3z - T3s;
+			      Rm[WS(rs, 2)] = T3H - T3I;
+			      Rp[WS(rs, 5)] = T3H + T3I;
+			      T3J = T3k - T3h;
+			      T3M = T3K - T3L;
+			      Ip[WS(rs, 5)] = T3J + T3M;
+			      Im[WS(rs, 2)] = T3M - T3J;
+			 }
+			 {
+			      E T3R, T3Y, T47, T48;
+			      T3R = T3P + T3Q;
+			      T3Y = T3U + T3X;
+			      Ip[WS(rs, 3)] = T3R + T3Y;
+			      Im[WS(rs, 4)] = T3Y - T3R;
+			      T47 = T3Z + T40;
+			      T48 = T44 + T45;
+			      Rm[WS(rs, 4)] = T47 - T48;
+			      Rp[WS(rs, 3)] = T47 + T48;
+			 }
+			 {
+			      E T41, T42, T43, T46;
+			      T41 = T3Z - T40;
+			      T42 = T3X - T3U;
+			      Rm[0] = T41 - T42;
+			      Rp[WS(rs, 7)] = T41 + T42;
+			      T43 = T3Q - T3P;
+			      T46 = T44 - T45;
+			      Ip[WS(rs, 7)] = T43 + T46;
+			      Im[0] = T46 - T43;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {188, 84, 40, 0} };
+
+void X(codelet_hc2cfdft2_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1191 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:53 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cfdft2_20 -include hc2cf.h */
+
+/*
+ * This function contains 316 FP additions, 238 FP multiplications,
+ * (or, 176 additions, 98 multiplications, 140 fused multiply/add),
+ * 180 stack variables, 5 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T5h, T5C, T5E, T5y, T5w, T5x, T5D, T5z;
+	       {
+		    E Tm, Tq, Tn, T1, T6, Tg, Tp, Tb, T1i, TU, Tr, TW, Tx, T2B, T1A;
+		    E T1u, T2y, T33, T26, T1o, T30, T22, TD, T1Q, T2a, T2e, T2V, T2R, TG, T1V;
+		    E TV, TH, TN, T2t, T12, T2p;
+		    {
+			 E Tw, To, T29, T1h, T1n, T2d, TC, T2U;
+			 Tm = W[0];
+			 Tq = W[3];
+			 Tn = W[2];
+			 T1 = W[6];
+			 T6 = W[7];
+			 Tw = Tm * Tq;
+			 To = Tm * Tn;
+			 T29 = Tm * T1;
+			 T1h = Tn * T1;
+			 T1n = Tn * T6;
+			 T2d = Tm * T6;
+			 Tg = W[5];
+			 Tp = W[1];
+			 Tb = W[4];
+			 {
+			      E T21, T25, T1t, T1z;
+			      T1i = FMA(Tq, T6, T1h);
+			      T25 = Tm * Tg;
+			      T1z = Tn * Tg;
+			      TU = FMA(Tp, Tq, To);
+			      Tr = FNMS(Tp, Tq, To);
+			      TW = FNMS(Tp, Tn, Tw);
+			      Tx = FMA(Tp, Tn, Tw);
+			      T1t = Tn * Tb;
+			      T21 = Tm * Tb;
+			      T2B = FMA(Tq, Tb, T1z);
+			      T1A = FNMS(Tq, Tb, T1z);
+			      TC = Tr * Tb;
+			      T1u = FMA(Tq, Tg, T1t);
+			      T2y = FNMS(Tq, Tg, T1t);
+			      T33 = FMA(Tp, Tb, T25);
+			      T26 = FNMS(Tp, Tb, T25);
+			      T1o = FNMS(Tq, T1, T1n);
+			      T30 = FNMS(Tp, Tg, T21);
+			      T22 = FMA(Tp, Tg, T21);
+			 }
+			 TD = FMA(Tx, Tg, TC);
+			 T1Q = FNMS(Tx, Tg, TC);
+			 T2a = FMA(Tp, T6, T29);
+			 T2e = FNMS(Tp, T1, T2d);
+			 T2U = Tr * T6;
+			 {
+			      E T2Q, TE, TM, TF;
+			      T2Q = Tr * T1;
+			      TF = Tr * Tg;
+			      T2V = FNMS(Tx, T1, T2U);
+			      T2R = FMA(Tx, T6, T2Q);
+			      TG = FNMS(Tx, Tb, TF);
+			      T1V = FMA(Tx, Tb, TF);
+			      TE = TD * T1;
+			      TM = TD * T6;
+			      TV = TU * Tb;
+			      TH = FMA(TG, T6, TE);
+			      TN = FNMS(TG, T1, TM);
+			      T2t = TU * T1;
+			      T12 = TU * Tg;
+			      T2p = TU * T6;
+			 }
+		    }
+		    {
+			 E T36, T3Q, T5f, T4D, T5g, T2Y, T4E, T3P, T5R, T5k, T39, TT, T3T, T3m, T49;
+			 E T4X, T5T, T5r, T3c, T2i, T3W, T3B, T4o, T4U, T5U, T5u, T3d, T2J, T3X, T3I;
+			 E T4v, T4V, T5Q, T5n, T3a, T1G, T3U, T3t, T4g, T4Y;
+			 {
+			      E T13, T2m, T2q, T2u, T2f, T9, T2O, TA, T2c, T4k, T3i, T5, T2Z, T1e, T2G;
+			      E T1O, T2W, TQ, T2C, T1Y, T3v, T27, Tj, T1l, T2v, T3g, T1m, T1D, T2n, T1x;
+			      E T2k, T3E, T4c, T2l, T1y, T10, T31, T16, T34, T32, T11, T4B, T3p, T4A, T1T;
+			      E T3n, T1b, T2A, T4q, T1U, Te, Tf, T24, T4i, T1r, T4a, T3C, T2s, T43, Tv;
+			      E T3L, T2N, T45, TL, T3N, T2T, T2E, T1K;
+			      {
+				   E T2j, TX, T1B, T1C;
+				   {
+					E T1c, T1d, T1M, T1N;
+					{
+					     E T2, T3, T7, T8;
+					     T7 = Rp[WS(rs, 9)];
+					     T8 = Rm[WS(rs, 9)];
+					     T2 = Ip[WS(rs, 9)];
+					     T2j = FMA(TW, Tg, TV);
+					     TX = FNMS(TW, Tg, TV);
+					     T13 = FMA(TW, Tb, T12);
+					     T2m = FNMS(TW, Tb, T12);
+					     T2q = FNMS(TW, T1, T2p);
+					     T2u = FMA(TW, T6, T2t);
+					     T2f = T7 + T8;
+					     T9 = T7 - T8;
+					     T3 = Im[WS(rs, 9)];
+					     {
+						  E Ty, Tz, T2b, T4;
+						  Ty = Rp[WS(rs, 2)];
+						  Tz = Rm[WS(rs, 2)];
+						  T1c = Ip[0];
+						  T2b = T2 - T3;
+						  T4 = T2 + T3;
+						  T2O = Ty - Tz;
+						  TA = Ty + Tz;
+						  T2c = T2a * T2b;
+						  T4k = T2e * T2b;
+						  T3i = T6 * T4;
+						  T5 = T1 * T4;
+						  T1d = Im[0];
+						  T1M = Rp[WS(rs, 1)];
+						  T1N = Rm[WS(rs, 1)];
+					     }
+					}
+					{
+					     E TO, TP, T1W, T1X;
+					     TO = Rp[WS(rs, 7)];
+					     T2Z = T1c - T1d;
+					     T1e = T1c + T1d;
+					     T2G = T1M + T1N;
+					     T1O = T1M - T1N;
+					     TP = Rm[WS(rs, 7)];
+					     T1W = Rm[WS(rs, 6)];
+					     T1X = Rp[WS(rs, 6)];
+					     {
+						  E Th, Ti, T1j, T1k;
+						  Th = Rm[WS(rs, 4)];
+						  T2W = TO - TP;
+						  TQ = TO + TP;
+						  T2C = T1X + T1W;
+						  T1Y = T1W - T1X;
+						  Ti = Rp[WS(rs, 4)];
+						  T1j = Ip[WS(rs, 8)];
+						  T1k = Im[WS(rs, 8)];
+						  T3v = T1Q * T1Y;
+						  T27 = Ti + Th;
+						  Tj = Th - Ti;
+						  T1l = T1j - T1k;
+						  T2v = T1j + T1k;
+						  T1B = Rp[WS(rs, 3)];
+						  T3g = Tb * Tj;
+						  T1m = T1i * T1l;
+						  T1C = Rm[WS(rs, 3)];
+					     }
+					}
+				   }
+				   {
+					E T18, T19, T1R, T1S;
+					{
+					     E TY, TZ, T1v, T1w, T14, T15;
+					     T1v = Ip[WS(rs, 3)];
+					     T1w = Im[WS(rs, 3)];
+					     TY = Ip[WS(rs, 5)];
+					     T1D = T1B + T1C;
+					     T2n = T1B - T1C;
+					     T1x = T1v - T1w;
+					     T2k = T1v + T1w;
+					     T3E = T2j * T2n;
+					     T4c = T1u * T1D;
+					     T2l = T2j * T2k;
+					     T1y = T1u * T1x;
+					     TZ = Im[WS(rs, 5)];
+					     T14 = Rp[WS(rs, 5)];
+					     T15 = Rm[WS(rs, 5)];
+					     T18 = Rm[0];
+					     T10 = TY + TZ;
+					     T31 = TY - TZ;
+					     T16 = T14 - T15;
+					     T34 = T14 + T15;
+					     T32 = T30 * T31;
+					     T11 = TX * T10;
+					     T4B = T30 * T34;
+					     T3p = TX * T16;
+					     T19 = Rp[0];
+					     T1R = Ip[WS(rs, 6)];
+					     T1S = Im[WS(rs, 6)];
+					}
+					{
+					     E T2r, T23, T1p, T1q;
+					     {
+						  E Tc, T1a, T2z, Td;
+						  Tc = Ip[WS(rs, 4)];
+						  T1a = T18 - T19;
+						  T4A = T19 + T18;
+						  T1T = T1R + T1S;
+						  T2z = T1R - T1S;
+						  Td = Im[WS(rs, 4)];
+						  T3n = Tm * T1a;
+						  T1b = Tp * T1a;
+						  T2A = T2y * T2z;
+						  T4q = T2B * T2z;
+						  T1U = T1Q * T1T;
+						  T23 = Tc - Td;
+						  Te = Tc + Td;
+					     }
+					     T1p = Rp[WS(rs, 8)];
+					     T1q = Rm[WS(rs, 8)];
+					     Tf = Tb * Te;
+					     T24 = T22 * T23;
+					     T4i = T26 * T23;
+					     T1r = T1p + T1q;
+					     T2r = T1q - T1p;
+					     {
+						  E T2M, Tu, Ts, Tt;
+						  Ts = Ip[WS(rs, 2)];
+						  Tt = Im[WS(rs, 2)];
+						  T4a = T1i * T1r;
+						  T3C = T2u * T2r;
+						  T2s = T2q * T2r;
+						  T2M = Ts + Tt;
+						  Tu = Ts - Tt;
+						  {
+						       E T2S, TK, TI, TJ, T1I, T1J;
+						       TI = Ip[WS(rs, 7)];
+						       TJ = Im[WS(rs, 7)];
+						       T43 = Tx * Tu;
+						       Tv = Tr * Tu;
+						       T3L = TG * T2M;
+						       T2N = TD * T2M;
+						       T2S = TI + TJ;
+						       TK = TI - TJ;
+						       T1I = Ip[WS(rs, 1)];
+						       T1J = Im[WS(rs, 1)];
+						       T45 = TN * TK;
+						       TL = TH * TK;
+						       T3N = T2V * T2S;
+						       T2T = T2R * T2S;
+						       T2E = T1I - T1J;
+						       T1K = T1I + T1J;
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T3x, T1L, T2F, T4s, T2P, T2X, T3M, T3O, T35, T4C;
+				   T35 = FNMS(T33, T34, T32);
+				   T4C = FMA(T33, T31, T4B);
+				   T3x = Tq * T1K;
+				   T1L = Tn * T1K;
+				   T2F = TU * T2E;
+				   T4s = TW * T2E;
+				   T36 = T2Z - T35;
+				   T3Q = T35 + T2Z;
+				   T5f = T4A + T4C;
+				   T4D = T4A - T4C;
+				   T2P = FNMS(TG, T2O, T2N);
+				   T2X = FNMS(T2V, T2W, T2T);
+				   T3M = FMA(TD, T2O, T3L);
+				   T3O = FMA(T2R, T2W, T3N);
+				   {
+					E TB, T5j, Tl, T5i, T47, TR, T3h, T3j;
+					{
+					     E Ta, Tk, T44, T46;
+					     Ta = FNMS(T6, T9, T5);
+					     T5g = T2P + T2X;
+					     T2Y = T2P - T2X;
+					     T4E = T3O - T3M;
+					     T3P = T3M + T3O;
+					     Tk = FMA(Tg, Tj, Tf);
+					     T44 = FMA(Tr, TA, T43);
+					     T46 = FMA(TH, TQ, T45);
+					     TB = FNMS(Tx, TA, Tv);
+					     T5j = Tk + Ta;
+					     Tl = Ta - Tk;
+					     T5i = T44 + T46;
+					     T47 = T44 - T46;
+					     TR = FNMS(TN, TQ, TL);
+					     T3h = FNMS(Tg, Te, T3g);
+					     T3j = FMA(T1, T9, T3i);
+					}
+					{
+					     E T3l, T48, T3k, TS;
+					     T5R = T5i - T5j;
+					     T5k = T5i + T5j;
+					     T3l = TB + TR;
+					     TS = TB - TR;
+					     T48 = T3h + T3j;
+					     T3k = T3h - T3j;
+					     T39 = TS + Tl;
+					     TT = Tl - TS;
+					     T3T = T3l + T3k;
+					     T3m = T3k - T3l;
+					     T49 = T47 + T48;
+					     T4X = T47 - T48;
+					}
+				   }
+				   {
+					E T28, T5q, T20, T5p, T4m, T2g, T3w, T3y;
+					{
+					     E T1P, T1Z, T4j, T4l;
+					     T1P = FNMS(Tq, T1O, T1L);
+					     T1Z = FMA(T1V, T1Y, T1U);
+					     T4j = FMA(T22, T27, T4i);
+					     T4l = FMA(T2a, T2f, T4k);
+					     T28 = FNMS(T26, T27, T24);
+					     T5q = T1Z + T1P;
+					     T20 = T1P - T1Z;
+					     T5p = T4j + T4l;
+					     T4m = T4j - T4l;
+					     T2g = FNMS(T2e, T2f, T2c);
+					     T3w = FNMS(T1V, T1T, T3v);
+					     T3y = FMA(Tn, T1O, T3x);
+					}
+					{
+					     E T3A, T4n, T3z, T2h;
+					     T5T = T5p - T5q;
+					     T5r = T5p + T5q;
+					     T3A = T28 + T2g;
+					     T2h = T28 - T2g;
+					     T4n = T3w + T3y;
+					     T3z = T3w - T3y;
+					     T3c = T2h + T20;
+					     T2i = T20 - T2h;
+					     T3W = T3A + T3z;
+					     T3B = T3z - T3A;
+					     T4o = T4m + T4n;
+					     T4U = T4m - T4n;
+					}
+				   }
+				   {
+					E T2D, T5s, T2x, T5t, T4u, T2H, T3D, T3F;
+					{
+					     E T2o, T2w, T4r, T4t;
+					     T2o = FNMS(T2m, T2n, T2l);
+					     T2w = FMA(T2u, T2v, T2s);
+					     T4r = FMA(T2y, T2C, T4q);
+					     T4t = FMA(TU, T2G, T4s);
+					     T2D = FNMS(T2B, T2C, T2A);
+					     T5s = T2w + T2o;
+					     T2x = T2o - T2w;
+					     T5t = T4r + T4t;
+					     T4u = T4r - T4t;
+					     T2H = FNMS(TW, T2G, T2F);
+					     T3D = FNMS(T2q, T2v, T3C);
+					     T3F = FMA(T2m, T2k, T3E);
+					}
+					{
+					     E T3H, T4p, T3G, T2I;
+					     T5U = T5t - T5s;
+					     T5u = T5s + T5t;
+					     T3H = T2D + T2H;
+					     T2I = T2D - T2H;
+					     T4p = T3D + T3F;
+					     T3G = T3D - T3F;
+					     T3d = T2x + T2I;
+					     T2J = T2x - T2I;
+					     T3X = T3G + T3H;
+					     T3I = T3G - T3H;
+					     T4v = T4p + T4u;
+					     T4V = T4u - T4p;
+					}
+				   }
+				   {
+					E T1s, T5m, T1g, T5l, T4e, T1E, T3o, T3q;
+					{
+					     E T17, T1f, T4b, T4d;
+					     T17 = FNMS(T13, T16, T11);
+					     T1f = FMA(Tm, T1e, T1b);
+					     T4b = FMA(T1o, T1l, T4a);
+					     T4d = FMA(T1A, T1x, T4c);
+					     T1s = FNMS(T1o, T1r, T1m);
+					     T5m = T17 + T1f;
+					     T1g = T17 - T1f;
+					     T5l = T4b + T4d;
+					     T4e = T4b - T4d;
+					     T1E = FNMS(T1A, T1D, T1y);
+					     T3o = FNMS(Tp, T1e, T3n);
+					     T3q = FMA(T13, T10, T3p);
+					}
+					{
+					     E T3s, T4f, T3r, T1F;
+					     T5Q = T5l - T5m;
+					     T5n = T5l + T5m;
+					     T3s = T1s + T1E;
+					     T1F = T1s - T1E;
+					     T4f = T3q + T3o;
+					     T3r = T3o - T3q;
+					     T3a = T1F + T1g;
+					     T1G = T1g - T1F;
+					     T3U = T3s + T3r;
+					     T3t = T3r - T3s;
+					     T4g = T4e + T4f;
+					     T4Y = T4e - T4f;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4F, T4G, T4H, T4x, T4z, T41, T4O, T4Q, T40;
+			      {
+				   E T55, T38, T54, T50, T52, T53, T5e, T5c, T51, T4T;
+				   {
+					E T4W, T37, T4Z, T1H, T5b, T5a, T2K, T2L, T4S, T4R;
+					T55 = T4U + T4V;
+					T4W = T4U - T4V;
+					T37 = T2Y + T36;
+					T38 = T36 - T2Y;
+					T54 = T4X + T4Y;
+					T4Z = T4X - T4Y;
+					T1H = TT + T1G;
+					T5b = T1G - TT;
+					T5a = T2J - T2i;
+					T2K = T2i + T2J;
+					T50 = FNMS(KP618033988, T4Z, T4W);
+					T52 = FMA(KP618033988, T4W, T4Z);
+					T2L = T1H + T2K;
+					T4S = T1H - T2K;
+					T53 = T4D - T4E;
+					T4F = T4D + T4E;
+					Im[WS(rs, 4)] = KP500000000 * (T2L - T37);
+					T4R = FMA(KP250000000, T2L, T37);
+					T5e = FMA(KP618033988, T5a, T5b);
+					T5c = FNMS(KP618033988, T5b, T5a);
+					T51 = FNMS(KP559016994, T4S, T4R);
+					T4T = FMA(KP559016994, T4S, T4R);
+				   }
+				   {
+					E T3b, T4M, T4N, T3e, T3f;
+					{
+					     E T4h, T58, T57, T4w, T56, T5d, T59;
+					     T4G = T49 + T4g;
+					     T4h = T49 - T4g;
+					     T58 = T54 - T55;
+					     T56 = T54 + T55;
+					     Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T50, T4T));
+					     Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T50, T4T));
+					     Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T52, T51)));
+					     Im[0] = -(KP500000000 * (FMA(KP951056516, T52, T51)));
+					     Rm[WS(rs, 4)] = KP500000000 * (T53 + T56);
+					     T57 = FNMS(KP250000000, T56, T53);
+					     T4w = T4o - T4v;
+					     T4H = T4o + T4v;
+					     T3b = T39 + T3a;
+					     T4M = T39 - T3a;
+					     T5d = FMA(KP559016994, T58, T57);
+					     T59 = FNMS(KP559016994, T58, T57);
+					     T4x = FMA(KP618033988, T4w, T4h);
+					     T4z = FNMS(KP618033988, T4h, T4w);
+					     Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5c, T59));
+					     Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5c, T59));
+					     Rm[0] = KP500000000 * (FNMS(KP951056516, T5e, T5d));
+					     Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5e, T5d));
+					     T4N = T3c - T3d;
+					     T3e = T3c + T3d;
+					}
+					T3f = T3b + T3e;
+					T41 = T3b - T3e;
+					T4O = FMA(KP618033988, T4N, T4M);
+					T4Q = FNMS(KP618033988, T4M, T4N);
+					Ip[WS(rs, 5)] = KP500000000 * (T38 + T3f);
+					T40 = FNMS(KP250000000, T3f, T38);
+				   }
+			      }
+			      {
+				   E T3S, T5Z, T68, T6a, T64, T62;
+				   {
+					E T60, T61, T5Y, T5W, T3R, T67, T66, T3K, T5O, T4K, T4J, T5N, T5X, T5P;
+					{
+					     E T5S, T5V, T4y, T42, T4I;
+					     T60 = T5R + T5Q;
+					     T5S = T5Q - T5R;
+					     T5V = T5T - T5U;
+					     T61 = T5T + T5U;
+					     T4y = FNMS(KP559016994, T41, T40);
+					     T42 = FMA(KP559016994, T41, T40);
+					     T4I = T4G + T4H;
+					     T4K = T4G - T4H;
+					     Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4x, T42));
+					     Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4x, T42));
+					     Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4z, T4y)));
+					     Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4z, T4y)));
+					     Rp[WS(rs, 5)] = KP500000000 * (T4F + T4I);
+					     T4J = FNMS(KP250000000, T4I, T4F);
+					     T5Y = FMA(KP618033988, T5S, T5V);
+					     T5W = FNMS(KP618033988, T5V, T5S);
+					}
+					T3S = T3Q - T3P;
+					T3R = T3P + T3Q;
+					{
+					     E T4L, T4P, T3u, T3J;
+					     T4L = FMA(KP559016994, T4K, T4J);
+					     T4P = FNMS(KP559016994, T4K, T4J);
+					     T3u = T3m + T3t;
+					     T67 = T3t - T3m;
+					     T66 = T3I - T3B;
+					     T3J = T3B + T3I;
+					     Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4O, T4L));
+					     Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4O, T4L));
+					     Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4Q, T4P));
+					     Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4Q, T4P));
+					     T3K = T3u + T3J;
+					     T5O = T3J - T3u;
+					}
+					Im[WS(rs, 9)] = KP500000000 * (T3K - T3R);
+					T5N = FMA(KP250000000, T3K, T3R);
+					T5Z = T5f - T5g;
+					T5h = T5f + T5g;
+					T68 = FNMS(KP618033988, T67, T66);
+					T6a = FMA(KP618033988, T66, T67);
+					T5X = FNMS(KP559016994, T5O, T5N);
+					T5P = FMA(KP559016994, T5O, T5N);
+					Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5W, T5P)));
+					Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5W, T5P));
+					Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5Y, T5X)));
+					Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5Y, T5X));
+					T64 = T60 - T61;
+					T62 = T60 + T61;
+				   }
+				   {
+					E T5o, T5v, T5M, T5K, T5A, T5B, T3Z, T5G, T5I, T5J, T63, T5F, T5L, T5H;
+					T5o = T5k + T5n;
+					T5I = T5k - T5n;
+					T5J = T5u - T5r;
+					T5v = T5r + T5u;
+					Rm[WS(rs, 9)] = KP500000000 * (T5Z + T62);
+					T63 = FNMS(KP250000000, T62, T5Z);
+					T5M = FMA(KP618033988, T5I, T5J);
+					T5K = FNMS(KP618033988, T5J, T5I);
+					{
+					     E T65, T69, T3V, T3Y;
+					     T65 = FNMS(KP559016994, T64, T63);
+					     T69 = FMA(KP559016994, T64, T63);
+					     T3V = T3T + T3U;
+					     T5A = T3T - T3U;
+					     T5B = T3W - T3X;
+					     T3Y = T3W + T3X;
+					     Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T68, T65));
+					     Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T68, T65));
+					     Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T6a, T69));
+					     Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T6a, T69));
+					     T3Z = T3V + T3Y;
+					     T5G = T3V - T3Y;
+					}
+					Ip[0] = KP500000000 * (T3S + T3Z);
+					T5F = FNMS(KP250000000, T3Z, T3S);
+					T5C = FMA(KP618033988, T5B, T5A);
+					T5E = FNMS(KP618033988, T5A, T5B);
+					T5L = FNMS(KP559016994, T5G, T5F);
+					T5H = FMA(KP559016994, T5G, T5F);
+					Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5K, T5H)));
+					Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5K, T5H));
+					Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5M, T5L)));
+					Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5M, T5L));
+					T5y = T5o - T5v;
+					T5w = T5o + T5v;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Rp[0] = KP500000000 * (T5h + T5w);
+	       T5x = FNMS(KP250000000, T5w, T5h);
+	       T5D = FNMS(KP559016994, T5y, T5x);
+	       T5z = FMA(KP559016994, T5y, T5x);
+	       Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5C, T5z));
+	       Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5C, T5z));
+	       Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5E, T5D));
+	       Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5E, T5D));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cfdft2_20", twinstr, &GENUS, {176, 98, 140, 0} };
+
+void X(codelet_hc2cfdft2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cfdft2_20 -include hc2cf.h */
+
+/*
+ * This function contains 316 FP additions, 180 FP multiplications,
+ * (or, 244 additions, 108 multiplications, 72 fused multiply/add),
+ * 134 stack variables, 5 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP125000000, +0.125000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP279508497, +0.279508497187473712051146708591409529430077295);
+     DK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T4, T7, Tm, To, Tq, Tu, T1I, T1G, T8, T5, Ta, T1u, T2u, Tg, T2s;
+	       E T21, T1A, T1Z, T1O, T2I, T1K, T2G, Tw, TC, T2a, T2e, TH, TI, TJ, TX;
+	       E T2D, TN, T2B, T26, T1n, TZ, T24, T1j;
+	       {
+		    E T9, T1y, Te, T1t, T6, T1z, Tf, T1s;
+		    {
+			 E Tn, Tt, Tp, Ts;
+			 T4 = W[0];
+			 T7 = W[1];
+			 Tm = W[2];
+			 To = W[3];
+			 Tn = T4 * Tm;
+			 Tt = T7 * Tm;
+			 Tp = T7 * To;
+			 Ts = T4 * To;
+			 Tq = Tn - Tp;
+			 Tu = Ts + Tt;
+			 T1I = Ts - Tt;
+			 T1G = Tn + Tp;
+			 T8 = W[5];
+			 T9 = T7 * T8;
+			 T1y = Tm * T8;
+			 Te = T4 * T8;
+			 T1t = To * T8;
+			 T5 = W[4];
+			 T6 = T4 * T5;
+			 T1z = To * T5;
+			 Tf = T7 * T5;
+			 T1s = Tm * T5;
+		    }
+		    Ta = T6 - T9;
+		    T1u = T1s + T1t;
+		    T2u = T1y + T1z;
+		    Tg = Te + Tf;
+		    T2s = T1s - T1t;
+		    T21 = Te - Tf;
+		    T1A = T1y - T1z;
+		    T1Z = T6 + T9;
+		    {
+			 E T1M, T1N, T1H, T1J;
+			 T1M = T1G * T8;
+			 T1N = T1I * T5;
+			 T1O = T1M + T1N;
+			 T2I = T1M - T1N;
+			 T1H = T1G * T5;
+			 T1J = T1I * T8;
+			 T1K = T1H - T1J;
+			 T2G = T1H + T1J;
+			 {
+			      E Tr, Tv, TA, TB;
+			      Tr = Tq * T5;
+			      Tv = Tu * T8;
+			      Tw = Tr + Tv;
+			      TA = Tq * T8;
+			      TB = Tu * T5;
+			      TC = TA - TB;
+			      T2a = Tr - Tv;
+			      T2e = TA + TB;
+			      TH = W[6];
+			      TI = W[7];
+			      TJ = FMA(Tq, TH, Tu * TI);
+			      TX = FMA(Tw, TH, TC * TI);
+			      T2D = FMA(T1G, TH, T1I * TI);
+			      TN = FNMS(Tu, TH, Tq * TI);
+			      T2B = FNMS(T1I, TH, T1G * TI);
+			      T26 = FNMS(T7, TH, T4 * TI);
+			      T1n = FNMS(To, TH, Tm * TI);
+			      TZ = FNMS(TC, TH, Tw * TI);
+			      T24 = FMA(T4, TH, T7 * TI);
+			      T1j = FMA(Tm, TH, To * TI);
+			 }
+		    }
+	       }
+	       {
+		    E Tl, T3n, T1i, T2Q, T47, T50, T4S, T5i, T2M, T2T, T4I, T5f, T4L, T5e, T4P;
+		    E T5h, T2r, T2S, T1X, T2P, T31, T3u, T36, T3t, T3E, T4l, T3U, T4j, T3h, T3r;
+		    E T3J, T4m, T3c, T3q, T3P, T4i, TS, T51, T3m, T48;
+		    {
+			 E T3, T45, T1V, T3f, Tz, TF, TW, T3A, TM, TQ, T11, T3B, Td, Tj, T1Q;
+			 E T3e, T19, T3L, T23, T39, T2p, T3S, T2z, T34, T1E, T3G, T2K, T2Y, T1g, T3M;
+			 E T28, T3a, T2i, T3R, T2w, T33, T1r, T3F, T2F, T2X, T4N, T4O;
+			 {
+			      E T1, T2, T1R, T1S, T1T, T1U;
+			      T1 = Ip[0];
+			      T2 = Im[0];
+			      T1R = T1 + T2;
+			      T1S = Rp[0];
+			      T1T = Rm[0];
+			      T1U = T1S - T1T;
+			      T3 = T1 - T2;
+			      T45 = T1S + T1T;
+			      T1V = FNMS(T7, T1U, T4 * T1R);
+			      T3f = FMA(T4, T1U, T7 * T1R);
+			 }
+			 {
+			      E Tx, Ty, TU, TD, TE, TV;
+			      Tx = Ip[WS(rs, 2)];
+			      Ty = Im[WS(rs, 2)];
+			      TU = Tx - Ty;
+			      TD = Rp[WS(rs, 2)];
+			      TE = Rm[WS(rs, 2)];
+			      TV = TD + TE;
+			      Tz = Tx + Ty;
+			      TF = TD - TE;
+			      TW = FNMS(Tu, TV, Tq * TU);
+			      T3A = FMA(Tu, TU, Tq * TV);
+			 }
+			 {
+			      E TK, TL, TY, TO, TP, T10;
+			      TK = Ip[WS(rs, 7)];
+			      TL = Im[WS(rs, 7)];
+			      TY = TK - TL;
+			      TO = Rp[WS(rs, 7)];
+			      TP = Rm[WS(rs, 7)];
+			      T10 = TO + TP;
+			      TM = TK + TL;
+			      TQ = TO - TP;
+			      T11 = FNMS(TZ, T10, TX * TY);
+			      T3B = FMA(TZ, TY, TX * T10);
+			 }
+			 {
+			      E Tb, Tc, T1L, Th, Ti, T1P;
+			      Tb = Ip[WS(rs, 5)];
+			      Tc = Im[WS(rs, 5)];
+			      T1L = Tb + Tc;
+			      Th = Rp[WS(rs, 5)];
+			      Ti = Rm[WS(rs, 5)];
+			      T1P = Th - Ti;
+			      Td = Tb - Tc;
+			      Tj = Th + Ti;
+			      T1Q = FNMS(T1O, T1P, T1K * T1L);
+			      T3e = FMA(T1K, T1P, T1O * T1L);
+			 }
+			 {
+			      E T15, T20, T18, T22;
+			      {
+				   E T13, T14, T16, T17;
+				   T13 = Ip[WS(rs, 4)];
+				   T14 = Im[WS(rs, 4)];
+				   T15 = T13 + T14;
+				   T20 = T13 - T14;
+				   T16 = Rp[WS(rs, 4)];
+				   T17 = Rm[WS(rs, 4)];
+				   T18 = T16 - T17;
+				   T22 = T16 + T17;
+			      }
+			      T19 = FNMS(T8, T18, T5 * T15);
+			      T3L = FMA(T21, T20, T1Z * T22);
+			      T23 = FNMS(T21, T22, T1Z * T20);
+			      T39 = FMA(T8, T15, T5 * T18);
+			 }
+			 {
+			      E T2l, T2x, T2o, T2y;
+			      {
+				   E T2j, T2k, T2m, T2n;
+				   T2j = Ip[WS(rs, 1)];
+				   T2k = Im[WS(rs, 1)];
+				   T2l = T2j + T2k;
+				   T2x = T2j - T2k;
+				   T2m = Rp[WS(rs, 1)];
+				   T2n = Rm[WS(rs, 1)];
+				   T2o = T2m - T2n;
+				   T2y = T2m + T2n;
+			      }
+			      T2p = FNMS(To, T2o, Tm * T2l);
+			      T3S = FMA(T1I, T2x, T1G * T2y);
+			      T2z = FNMS(T1I, T2y, T1G * T2x);
+			      T34 = FMA(To, T2l, Tm * T2o);
+			 }
+			 {
+			      E T1x, T2H, T1D, T2J;
+			      {
+				   E T1v, T1w, T1B, T1C;
+				   T1v = Ip[WS(rs, 3)];
+				   T1w = Im[WS(rs, 3)];
+				   T1x = T1v - T1w;
+				   T2H = T1v + T1w;
+				   T1B = Rp[WS(rs, 3)];
+				   T1C = Rm[WS(rs, 3)];
+				   T1D = T1B + T1C;
+				   T2J = T1B - T1C;
+			      }
+			      T1E = FNMS(T1A, T1D, T1u * T1x);
+			      T3G = FMA(T1u, T1D, T1A * T1x);
+			      T2K = FNMS(T2I, T2J, T2G * T2H);
+			      T2Y = FMA(T2G, T2J, T2I * T2H);
+			 }
+			 {
+			      E T1c, T25, T1f, T27;
+			      {
+				   E T1a, T1b, T1d, T1e;
+				   T1a = Ip[WS(rs, 9)];
+				   T1b = Im[WS(rs, 9)];
+				   T1c = T1a + T1b;
+				   T25 = T1a - T1b;
+				   T1d = Rp[WS(rs, 9)];
+				   T1e = Rm[WS(rs, 9)];
+				   T1f = T1d - T1e;
+				   T27 = T1d + T1e;
+			      }
+			      T1g = FNMS(TI, T1f, TH * T1c);
+			      T3M = FMA(T26, T25, T24 * T27);
+			      T28 = FNMS(T26, T27, T24 * T25);
+			      T3a = FMA(TI, T1c, TH * T1f);
+			 }
+			 {
+			      E T2d, T2t, T2h, T2v;
+			      {
+				   E T2b, T2c, T2f, T2g;
+				   T2b = Ip[WS(rs, 6)];
+				   T2c = Im[WS(rs, 6)];
+				   T2d = T2b + T2c;
+				   T2t = T2b - T2c;
+				   T2f = Rp[WS(rs, 6)];
+				   T2g = Rm[WS(rs, 6)];
+				   T2h = T2f - T2g;
+				   T2v = T2f + T2g;
+			      }
+			      T2i = FNMS(T2e, T2h, T2a * T2d);
+			      T3R = FMA(T2u, T2t, T2s * T2v);
+			      T2w = FNMS(T2u, T2v, T2s * T2t);
+			      T33 = FMA(T2e, T2d, T2a * T2h);
+			 }
+			 {
+			      E T1m, T2E, T1q, T2C;
+			      {
+				   E T1k, T1l, T1o, T1p;
+				   T1k = Ip[WS(rs, 8)];
+				   T1l = Im[WS(rs, 8)];
+				   T1m = T1k - T1l;
+				   T2E = T1k + T1l;
+				   T1o = Rp[WS(rs, 8)];
+				   T1p = Rm[WS(rs, 8)];
+				   T1q = T1o + T1p;
+				   T2C = T1p - T1o;
+			      }
+			      T1r = FNMS(T1n, T1q, T1j * T1m);
+			      T3F = FMA(T1j, T1q, T1n * T1m);
+			      T2F = FMA(T2B, T2C, T2D * T2E);
+			      T2X = FNMS(T2B, T2E, T2D * T2C);
+			 }
+			 {
+			      E Tk, T12, T1h, T46;
+			      Tk = FNMS(Tg, Tj, Ta * Td);
+			      Tl = T3 - Tk;
+			      T3n = Tk + T3;
+			      T12 = TW - T11;
+			      T1h = T19 - T1g;
+			      T1i = T12 - T1h;
+			      T2Q = T12 + T1h;
+			      T46 = FMA(Ta, Tj, Tg * Td);
+			      T47 = T45 - T46;
+			      T50 = T45 + T46;
+			      {
+				   E T4Q, T4R, T2A, T2L;
+				   T4Q = T2F + T2K;
+				   T4R = T3R + T3S;
+				   T4S = T4Q + T4R;
+				   T5i = T4R - T4Q;
+				   T2A = T2w - T2z;
+				   T2L = T2F - T2K;
+				   T2M = T2A - T2L;
+				   T2T = T2L + T2A;
+			      }
+			 }
+			 {
+			      E T4G, T4H, T4J, T4K;
+			      T4G = T3A + T3B;
+			      T4H = T19 + T1g;
+			      T4I = T4G + T4H;
+			      T5f = T4G - T4H;
+			      T4J = T3F + T3G;
+			      T4K = T1Q + T1V;
+			      T4L = T4J + T4K;
+			      T5e = T4J - T4K;
+			 }
+			 T4N = T3L + T3M;
+			 T4O = T2i + T2p;
+			 T4P = T4N + T4O;
+			 T5h = T4N - T4O;
+			 {
+			      E T29, T2q, T1F, T1W;
+			      T29 = T23 - T28;
+			      T2q = T2i - T2p;
+			      T2r = T29 - T2q;
+			      T2S = T29 + T2q;
+			      T1F = T1r - T1E;
+			      T1W = T1Q - T1V;
+			      T1X = T1F + T1W;
+			      T2P = T1W - T1F;
+			 }
+			 {
+			      E T3C, T3D, T3N, T3O;
+			      {
+				   E T2Z, T30, T32, T35;
+				   T2Z = T2X - T2Y;
+				   T30 = T2w + T2z;
+				   T31 = T2Z - T30;
+				   T3u = T2Z + T30;
+				   T32 = T23 + T28;
+				   T35 = T33 + T34;
+				   T36 = T32 + T35;
+				   T3t = T32 - T35;
+			      }
+			      T3C = T3A - T3B;
+			      T3D = T3a - T39;
+			      T3E = T3C + T3D;
+			      T4l = T3C - T3D;
+			      {
+				   E T3Q, T3T, T3d, T3g;
+				   T3Q = T2X + T2Y;
+				   T3T = T3R - T3S;
+				   T3U = T3Q + T3T;
+				   T4j = T3T - T3Q;
+				   T3d = T1r + T1E;
+				   T3g = T3e + T3f;
+				   T3h = T3d + T3g;
+				   T3r = T3d - T3g;
+			      }
+			      {
+				   E T3H, T3I, T38, T3b;
+				   T3H = T3F - T3G;
+				   T3I = T3e - T3f;
+				   T3J = T3H + T3I;
+				   T4m = T3H - T3I;
+				   T38 = TW + T11;
+				   T3b = T39 + T3a;
+				   T3c = T38 + T3b;
+				   T3q = T38 - T3b;
+			      }
+			      T3N = T3L - T3M;
+			      T3O = T34 - T33;
+			      T3P = T3N + T3O;
+			      T4i = T3N - T3O;
+			      {
+				   E TG, TR, T3k, T3l;
+				   TG = FNMS(TC, TF, Tw * Tz);
+				   TR = FNMS(TN, TQ, TJ * TM);
+				   TS = TG - TR;
+				   T51 = TG + TR;
+				   T3k = FMA(TC, Tz, Tw * TF);
+				   T3l = FMA(TN, TM, TJ * TQ);
+				   T3m = T3k + T3l;
+				   T48 = T3l - T3k;
+			      }
+			 }
+		    }
+		    {
+			 E T3W, T3Y, TT, T2O, T3x, T3y, T3X, T3z;
+			 {
+			      E T3K, T3V, T1Y, T2N;
+			      T3K = T3E - T3J;
+			      T3V = T3P - T3U;
+			      T3W = FMA(KP475528258, T3K, KP293892626 * T3V);
+			      T3Y = FNMS(KP293892626, T3K, KP475528258 * T3V);
+			      TT = Tl - TS;
+			      T1Y = T1i + T1X;
+			      T2N = T2r + T2M;
+			      T2O = T1Y + T2N;
+			      T3x = KP279508497 * (T1Y - T2N);
+			      T3y = FNMS(KP125000000, T2O, KP500000000 * TT);
+			 }
+			 Ip[WS(rs, 5)] = KP500000000 * (TT + T2O);
+			 T3X = T3x - T3y;
+			 Im[WS(rs, 2)] = T3X - T3Y;
+			 Im[WS(rs, 6)] = T3X + T3Y;
+			 T3z = T3x + T3y;
+			 Ip[WS(rs, 1)] = T3z - T3W;
+			 Ip[WS(rs, 9)] = T3z + T3W;
+		    }
+		    {
+			 E T41, T4d, T49, T4a, T44, T4b, T4e, T4c;
+			 {
+			      E T3Z, T40, T42, T43;
+			      T3Z = T1i - T1X;
+			      T40 = T2r - T2M;
+			      T41 = FMA(KP475528258, T3Z, KP293892626 * T40);
+			      T4d = FNMS(KP293892626, T3Z, KP475528258 * T40);
+			      T49 = T47 + T48;
+			      T42 = T3E + T3J;
+			      T43 = T3P + T3U;
+			      T4a = T42 + T43;
+			      T44 = KP279508497 * (T42 - T43);
+			      T4b = FNMS(KP125000000, T4a, KP500000000 * T49);
+			 }
+			 Rp[WS(rs, 5)] = KP500000000 * (T49 + T4a);
+			 T4e = T4b - T44;
+			 Rm[WS(rs, 6)] = T4d + T4e;
+			 Rm[WS(rs, 2)] = T4e - T4d;
+			 T4c = T44 + T4b;
+			 Rp[WS(rs, 1)] = T41 + T4c;
+			 Rp[WS(rs, 9)] = T4c - T41;
+		    }
+		    {
+			 E T4o, T4q, T2W, T2V, T4f, T4g, T4p, T4h;
+			 {
+			      E T4k, T4n, T2R, T2U;
+			      T4k = T4i - T4j;
+			      T4n = T4l - T4m;
+			      T4o = FNMS(KP293892626, T4n, KP475528258 * T4k);
+			      T4q = FMA(KP475528258, T4n, KP293892626 * T4k);
+			      T2W = TS + Tl;
+			      T2R = T2P - T2Q;
+			      T2U = T2S + T2T;
+			      T2V = T2R - T2U;
+			      T4f = FMA(KP500000000, T2W, KP125000000 * T2V);
+			      T4g = KP279508497 * (T2R + T2U);
+			 }
+			 Im[WS(rs, 4)] = KP500000000 * (T2V - T2W);
+			 T4p = T4g - T4f;
+			 Im[0] = T4p - T4q;
+			 Im[WS(rs, 8)] = T4p + T4q;
+			 T4h = T4f + T4g;
+			 Ip[WS(rs, 3)] = T4h - T4o;
+			 Ip[WS(rs, 7)] = T4h + T4o;
+		    }
+		    {
+			 E T4t, T4B, T4u, T4x, T4y, T4z, T4C, T4A;
+			 {
+			      E T4r, T4s, T4v, T4w;
+			      T4r = T2S - T2T;
+			      T4s = T2Q + T2P;
+			      T4t = FNMS(KP293892626, T4s, KP475528258 * T4r);
+			      T4B = FMA(KP475528258, T4s, KP293892626 * T4r);
+			      T4u = T47 - T48;
+			      T4v = T4l + T4m;
+			      T4w = T4i + T4j;
+			      T4x = T4v + T4w;
+			      T4y = FNMS(KP125000000, T4x, KP500000000 * T4u);
+			      T4z = KP279508497 * (T4v - T4w);
+			 }
+			 Rm[WS(rs, 4)] = KP500000000 * (T4u + T4x);
+			 T4C = T4z + T4y;
+			 Rm[WS(rs, 8)] = T4B + T4C;
+			 Rm[0] = T4C - T4B;
+			 T4A = T4y - T4z;
+			 Rp[WS(rs, 3)] = T4t + T4A;
+			 Rp[WS(rs, 7)] = T4A - T4t;
+		    }
+		    {
+			 E T5k, T5m, T3o, T3j, T5b, T5c, T5l, T5d;
+			 {
+			      E T5g, T5j, T37, T3i;
+			      T5g = T5e - T5f;
+			      T5j = T5h - T5i;
+			      T5k = FNMS(KP293892626, T5j, KP475528258 * T5g);
+			      T5m = FMA(KP293892626, T5g, KP475528258 * T5j);
+			      T3o = T3m + T3n;
+			      T37 = T31 - T36;
+			      T3i = T3c + T3h;
+			      T3j = T37 - T3i;
+			      T5b = FMA(KP500000000, T3o, KP125000000 * T3j);
+			      T5c = KP279508497 * (T3i + T37);
+			 }
+			 Im[WS(rs, 9)] = KP500000000 * (T3j - T3o);
+			 T5l = T5b - T5c;
+			 Ip[WS(rs, 2)] = T5l + T5m;
+			 Im[WS(rs, 1)] = T5m - T5l;
+			 T5d = T5b + T5c;
+			 Ip[WS(rs, 6)] = T5d + T5k;
+			 Im[WS(rs, 5)] = T5k - T5d;
+		    }
+		    {
+			 E T5w, T5x, T5n, T5q, T5r, T5s, T5y, T5t;
+			 {
+			      E T5u, T5v, T5o, T5p;
+			      T5u = T36 + T31;
+			      T5v = T3c - T3h;
+			      T5w = FNMS(KP293892626, T5v, KP475528258 * T5u);
+			      T5x = FMA(KP475528258, T5v, KP293892626 * T5u);
+			      T5n = T50 - T51;
+			      T5o = T5f + T5e;
+			      T5p = T5h + T5i;
+			      T5q = T5o + T5p;
+			      T5r = FNMS(KP125000000, T5q, KP500000000 * T5n);
+			      T5s = KP279508497 * (T5o - T5p);
+			 }
+			 Rm[WS(rs, 9)] = KP500000000 * (T5n + T5q);
+			 T5y = T5s + T5r;
+			 Rp[WS(rs, 6)] = T5x + T5y;
+			 Rm[WS(rs, 5)] = T5y - T5x;
+			 T5t = T5r - T5s;
+			 Rp[WS(rs, 2)] = T5t - T5w;
+			 Rm[WS(rs, 1)] = T5w + T5t;
+		    }
+		    {
+			 E T4U, T4W, T3p, T3w, T4D, T4E, T4V, T4F;
+			 {
+			      E T4M, T4T, T3s, T3v;
+			      T4M = T4I - T4L;
+			      T4T = T4P - T4S;
+			      T4U = FNMS(KP475528258, T4T, KP293892626 * T4M);
+			      T4W = FMA(KP475528258, T4M, KP293892626 * T4T);
+			      T3p = T3n - T3m;
+			      T3s = T3q + T3r;
+			      T3v = T3t + T3u;
+			      T3w = T3s + T3v;
+			      T4D = FNMS(KP125000000, T3w, KP500000000 * T3p);
+			      T4E = KP279508497 * (T3s - T3v);
+			 }
+			 Ip[0] = KP500000000 * (T3p + T3w);
+			 T4V = T4E + T4D;
+			 Ip[WS(rs, 4)] = T4V + T4W;
+			 Im[WS(rs, 3)] = T4W - T4V;
+			 T4F = T4D - T4E;
+			 Ip[WS(rs, 8)] = T4F + T4U;
+			 Im[WS(rs, 7)] = T4U - T4F;
+		    }
+		    {
+			 E T58, T59, T52, T53, T4Z, T54, T5a, T55;
+			 {
+			      E T56, T57, T4X, T4Y;
+			      T56 = T3q - T3r;
+			      T57 = T3t - T3u;
+			      T58 = FMA(KP475528258, T56, KP293892626 * T57);
+			      T59 = FNMS(KP293892626, T56, KP475528258 * T57);
+			      T52 = T50 + T51;
+			      T4X = T4I + T4L;
+			      T4Y = T4P + T4S;
+			      T53 = T4X + T4Y;
+			      T4Z = KP279508497 * (T4X - T4Y);
+			      T54 = FNMS(KP125000000, T53, KP500000000 * T52);
+			 }
+			 Rp[0] = KP500000000 * (T52 + T53);
+			 T5a = T54 - T4Z;
+			 Rp[WS(rs, 8)] = T59 + T5a;
+			 Rm[WS(rs, 7)] = T5a - T59;
+			 T55 = T4Z + T54;
+			 Rp[WS(rs, 4)] = T55 - T58;
+			 Rm[WS(rs, 3)] = T58 + T55;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cfdft2_20", twinstr, &GENUS, {244, 108, 72, 0} };
+
+void X(codelet_hc2cfdft2_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2012 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hc2cfdft2_32 -include hc2cf.h */
+
+/*
+ * This function contains 552 FP additions, 414 FP multiplications,
+ * (or, 300 additions, 162 multiplications, 252 fused multiply/add),
+ * 196 stack variables, 8 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tax, TaA;
+	       {
+		    E T1, Th, T2, T5, Ti, Ty, T1t, T3, Tb, Tj, TY, TK, Tl, T4, Tk;
+		    T1 = W[0];
+		    Th = W[4];
+		    T2 = W[2];
+		    T5 = W[3];
+		    Ti = W[6];
+		    Ty = T1 * Th;
+		    T1t = T2 * Th;
+		    T3 = T1 * T2;
+		    Tb = T1 * T5;
+		    Tj = Th * Ti;
+		    TY = T2 * Ti;
+		    TK = T1 * Ti;
+		    Tl = W[7];
+		    T4 = W[1];
+		    Tk = W[5];
+		    {
+			 E T3j, T7Z, T5b, T93, T6B, T8V, T4d, T8J, T8r, T6e, T8l, T1T, T8C, T54, T8i;
+			 E T5O, T94, T31, T8K, T6w, T8U, T3Y, T80, T5g, T8B, T69, T8h, T1s, T8q, T4T;
+			 E T8k, T5J, Tx, T8a, T5y, T8d, T4s, T5Y, T8v, T8E, T2k, T82, T6l, T3z, T83;
+			 E T5m, T8X, T8O, T2F, T86, T6q, T3M, T85, T5r, T8Y, T8R, TW, T8e, T8x, T4B;
+			 E T5D, T8b, T63, T8w;
+			 {
+			      E TL, T2l, T1c, Tc, T1a, T6, Tm, T2v, Tz, T2q, TR, Ts, T2A, TF, T1H;
+			      E T1g, T1d, T1F, T34, T3F, T3B, T32, T3w, T3s, T4p, T4l, T2f, T29, T4K, T4S;
+			      E T5G, T5I;
+			      {
+				   E TZ, T2R, T2H, T15, T2W, T2M, T4I, T4E, T3V, T3S, T4Q, T4M, T1n, T1h, T4X;
+				   E T53, T5L, T5N, T5d, T5f;
+				   {
+					E T1u, T1A, T51, T4Y, T28, T25, T44, T40, T1O, T1I, T3b, T35, T4b, T3i, T45;
+					E T38, T39, T58, T49, T3e, T41;
+					{
+					     E T3g, T3h, T36, T37, TQ;
+					     T3g = Ip[0];
+					     TZ = FNMS(T5, Tl, TY);
+					     T2R = FMA(T5, Tl, TY);
+					     TQ = T1 * Tl;
+					     {
+						  E T14, Tr, T1z, TE;
+						  T14 = T2 * Tl;
+						  Tr = Th * Tl;
+						  TL = FMA(T4, Tl, TK);
+						  T2l = FNMS(T4, Tl, TK);
+						  T1c = FMA(T4, T2, Tb);
+						  Tc = FNMS(T4, T2, Tb);
+						  T1a = FNMS(T4, T5, T3);
+						  T6 = FMA(T4, T5, T3);
+						  Tm = FMA(Tk, Tl, Tj);
+						  T2v = FNMS(T5, Tk, T1t);
+						  T1u = FMA(T5, Tk, T1t);
+						  Tz = FNMS(T4, Tk, Ty);
+						  T2H = FMA(T4, Tk, Ty);
+						  T1z = T2 * Tk;
+						  TE = T1 * Tk;
+						  T2q = FMA(T4, Ti, TQ);
+						  TR = FNMS(T4, Ti, TQ);
+						  T15 = FMA(T5, Ti, T14);
+						  T2W = FNMS(T5, Ti, T14);
+						  Ts = FNMS(Tk, Ti, Tr);
+						  {
+						       E T1f, T4H, T4D, T1b;
+						       T1f = T1a * Tk;
+						       T4H = T1a * Tl;
+						       T4D = T1a * Ti;
+						       T1b = T1a * Th;
+						       {
+							    E T27, T3E, T3A, T24;
+							    T27 = T6 * Tk;
+							    T3E = T6 * Tl;
+							    T3A = T6 * Ti;
+							    T24 = T6 * Th;
+							    {
+								 E T3v, T3r, T4P, T4L;
+								 T3v = T1u * Tl;
+								 T3r = T1u * Ti;
+								 T4P = T2v * Tl;
+								 T4L = T2v * Ti;
+								 {
+								      E T4o, T4k, T43, T3Z;
+								      T4o = T2H * Tl;
+								      T4k = T2H * Ti;
+								      T43 = Tz * Tl;
+								      T3Z = Tz * Ti;
+								      T1A = FNMS(T5, Th, T1z);
+								      T2A = FMA(T5, Th, T1z);
+								      T2M = FNMS(T4, Th, TE);
+								      TF = FMA(T4, Th, TE);
+								      T1H = FNMS(T1c, Th, T1f);
+								      T1g = FMA(T1c, Th, T1f);
+								      T51 = FNMS(T1c, Ti, T4H);
+								      T4I = FMA(T1c, Ti, T4H);
+								      T4Y = FMA(T1c, Tl, T4D);
+								      T4E = FNMS(T1c, Tl, T4D);
+								      T1d = FNMS(T1c, Tk, T1b);
+								      T1F = FMA(T1c, Tk, T1b);
+								      T34 = FMA(Tc, Th, T27);
+								      T28 = FNMS(Tc, Th, T27);
+								      T3V = FNMS(Tc, Ti, T3E);
+								      T3F = FMA(Tc, Ti, T3E);
+								      T3S = FMA(Tc, Tl, T3A);
+								      T3B = FNMS(Tc, Tl, T3A);
+								      T25 = FMA(Tc, Tk, T24);
+								      T32 = FNMS(Tc, Tk, T24);
+								      T3w = FNMS(T1A, Ti, T3v);
+								      T3s = FMA(T1A, Tl, T3r);
+								      T4Q = FNMS(T2A, Ti, T4P);
+								      T4M = FMA(T2A, Tl, T4L);
+								      T4p = FNMS(T2M, Ti, T4o);
+								      T4l = FMA(T2M, Tl, T4k);
+								      T44 = FNMS(TF, Ti, T43);
+								      T40 = FMA(TF, Tl, T3Z);
+								      {
+									   E T1m, T1e, T1N, T1G;
+									   T1m = T1d * Tl;
+									   T1e = T1d * Ti;
+									   T1N = T1F * Tl;
+									   T1G = T1F * Ti;
+									   {
+										E T2e, T26, T3a, T33;
+										T2e = T25 * Tl;
+										T26 = T25 * Ti;
+										T3a = T32 * Tl;
+										T33 = T32 * Ti;
+										T1n = FNMS(T1g, Ti, T1m);
+										T1h = FMA(T1g, Tl, T1e);
+										T1O = FNMS(T1H, Ti, T1N);
+										T1I = FMA(T1H, Tl, T1G);
+										T2f = FNMS(T28, Ti, T2e);
+										T29 = FMA(T28, Tl, T26);
+										T3b = FNMS(T34, Ti, T3a);
+										T35 = FMA(T34, Tl, T33);
+										T3h = Im[0];
+									   }
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					     T36 = Ip[WS(rs, 8)];
+					     T37 = Im[WS(rs, 8)];
+					     {
+						  E T47, T48, T3c, T3d;
+						  T47 = Rm[0];
+						  T4b = T3g + T3h;
+						  T3i = T3g - T3h;
+						  T45 = T36 + T37;
+						  T38 = T36 - T37;
+						  T48 = Rp[0];
+						  T3c = Rp[WS(rs, 8)];
+						  T3d = Rm[WS(rs, 8)];
+						  T39 = T35 * T38;
+						  T58 = T48 + T47;
+						  T49 = T47 - T48;
+						  T3e = T3c + T3d;
+						  T41 = T3d - T3c;
+					     }
+					}
+					{
+					     E T4W, T1x, T1y, T6a, T4U, T1D, T1P, T4V, T5K, T52, T1L, T1Q;
+					     {
+						  E T1B, T1C, T1J, T1K;
+						  {
+						       E T1v, T6A, T4c, T5a, T6y, T46, T1w, T6z, T4a;
+						       T1v = Ip[WS(rs, 3)];
+						       T6z = T4 * T49;
+						       T4a = T1 * T49;
+						       {
+							    E T3f, T59, T6x, T42;
+							    T3f = FNMS(T3b, T3e, T39);
+							    T59 = T35 * T3e;
+							    T6x = T44 * T41;
+							    T42 = T40 * T41;
+							    T6A = FMA(T1, T4b, T6z);
+							    T4c = FNMS(T4, T4b, T4a);
+							    T3j = T3f + T3i;
+							    T7Z = T3i - T3f;
+							    T5a = FMA(T3b, T38, T59);
+							    T6y = FMA(T40, T45, T6x);
+							    T46 = FNMS(T44, T45, T42);
+							    T1w = Im[WS(rs, 3)];
+						       }
+						       T5b = T58 + T5a;
+						       T93 = T58 - T5a;
+						       T6B = T6y + T6A;
+						       T8V = T6A - T6y;
+						       T4d = T46 + T4c;
+						       T8J = T4c - T46;
+						       T4W = T1v + T1w;
+						       T1x = T1v - T1w;
+						  }
+						  T1B = Rp[WS(rs, 3)];
+						  T1C = Rm[WS(rs, 3)];
+						  T1y = T1u * T1x;
+						  T6a = T25 * T4W;
+						  T1J = Ip[WS(rs, 11)];
+						  T4U = T1B - T1C;
+						  T1D = T1B + T1C;
+						  T1K = Im[WS(rs, 11)];
+						  T1P = Rp[WS(rs, 11)];
+						  T4V = T25 * T4U;
+						  T5K = T1u * T1D;
+						  T52 = T1J + T1K;
+						  T1L = T1J - T1K;
+						  T1Q = Rm[WS(rs, 11)];
+					     }
+					     {
+						  E T1E, T6c, T1M, T4Z, T1R, T6b;
+						  T1E = FNMS(T1A, T1D, T1y);
+						  T6c = T4Y * T52;
+						  T1M = T1I * T1L;
+						  T4Z = T1P - T1Q;
+						  T1R = T1P + T1Q;
+						  T6b = FNMS(T28, T4U, T6a);
+						  {
+						       E T5M, T6d, T50, T1S;
+						       T4X = FMA(T28, T4W, T4V);
+						       T6d = FNMS(T51, T4Z, T6c);
+						       T50 = T4Y * T4Z;
+						       T1S = FNMS(T1O, T1R, T1M);
+						       T5M = T1I * T1R;
+						       T8r = T6d - T6b;
+						       T6e = T6b + T6d;
+						       T8l = T1E - T1S;
+						       T1T = T1E + T1S;
+						       T53 = FMA(T51, T52, T50);
+						       T5L = FMA(T1A, T1x, T5K);
+						       T5N = FMA(T1O, T1L, T5M);
+						  }
+					     }
+					}
+				   }
+				   {
+					E T3Q, T2K, T2P, T2L, T6s, T3P, T5c, T3W, T2U, T2X, T2Y, T2V;
+					{
+					     E T2I, T2J, T2N, T2O, T2S, T3O, T2T;
+					     T2I = Ip[WS(rs, 4)];
+					     T8C = T53 - T4X;
+					     T54 = T4X + T53;
+					     T8i = T5L - T5N;
+					     T5O = T5L + T5N;
+					     T2J = Im[WS(rs, 4)];
+					     T2N = Rp[WS(rs, 4)];
+					     T2O = Rm[WS(rs, 4)];
+					     T2S = Ip[WS(rs, 12)];
+					     T3Q = T2I + T2J;
+					     T2K = T2I - T2J;
+					     T3O = T2O - T2N;
+					     T2P = T2N + T2O;
+					     T2T = Im[WS(rs, 12)];
+					     T2L = T2H * T2K;
+					     T6s = Tk * T3O;
+					     T3P = Th * T3O;
+					     T5c = T2H * T2P;
+					     T3W = T2S + T2T;
+					     T2U = T2S - T2T;
+					     T2X = Rp[WS(rs, 12)];
+					     T2Y = Rm[WS(rs, 12)];
+					     T2V = T2R * T2U;
+					}
+					{
+					     E T2Q, T6t, T3T, T2Z, T3R, T6u, T3U;
+					     T2Q = FNMS(T2M, T2P, T2L);
+					     T6t = FMA(Th, T3Q, T6s);
+					     T3T = T2Y - T2X;
+					     T2Z = T2X + T2Y;
+					     T3R = FNMS(Tk, T3Q, T3P);
+					     T5d = FMA(T2M, T2K, T5c);
+					     T6u = T3V * T3T;
+					     T3U = T3S * T3T;
+					     {
+						  E T30, T5e, T6v, T3X;
+						  T30 = FNMS(T2W, T2Z, T2V);
+						  T5e = T2R * T2Z;
+						  T6v = FMA(T3S, T3W, T6u);
+						  T3X = FNMS(T3V, T3W, T3U);
+						  T94 = T2Q - T30;
+						  T31 = T2Q + T30;
+						  T8K = T6t - T6v;
+						  T6w = T6t + T6v;
+						  T8U = T3R - T3X;
+						  T3Y = T3R + T3X;
+						  T5f = FMA(T2W, T2U, T5e);
+					     }
+					}
+				   }
+				   {
+					E T4J, T12, T65, T13, T4F, T18, T1o, T4G, T5F, T4R, T1k, T1p;
+					{
+					     E T16, T17, T10, T11, T1i, T1j;
+					     T10 = Ip[WS(rs, 15)];
+					     T11 = Im[WS(rs, 15)];
+					     T16 = Rp[WS(rs, 15)];
+					     T80 = T5d - T5f;
+					     T5g = T5d + T5f;
+					     T4J = T10 + T11;
+					     T12 = T10 - T11;
+					     T17 = Rm[WS(rs, 15)];
+					     T1i = Ip[WS(rs, 7)];
+					     T65 = T4E * T4J;
+					     T13 = TZ * T12;
+					     T4F = T16 - T17;
+					     T18 = T16 + T17;
+					     T1j = Im[WS(rs, 7)];
+					     T1o = Rp[WS(rs, 7)];
+					     T4G = T4E * T4F;
+					     T5F = TZ * T18;
+					     T4R = T1i + T1j;
+					     T1k = T1i - T1j;
+					     T1p = Rm[WS(rs, 7)];
+					}
+					{
+					     E T19, T67, T1l, T4N, T1q, T66;
+					     T19 = FNMS(T15, T18, T13);
+					     T67 = T4M * T4R;
+					     T1l = T1h * T1k;
+					     T4N = T1o - T1p;
+					     T1q = T1o + T1p;
+					     T66 = FNMS(T4I, T4F, T65);
+					     {
+						  E T5H, T68, T4O, T1r;
+						  T4K = FMA(T4I, T4J, T4G);
+						  T68 = FNMS(T4Q, T4N, T67);
+						  T4O = T4M * T4N;
+						  T1r = FNMS(T1n, T1q, T1l);
+						  T5H = T1h * T1q;
+						  T8B = T66 - T68;
+						  T69 = T66 + T68;
+						  T8h = T19 - T1r;
+						  T1s = T19 + T1r;
+						  T4S = FMA(T4Q, T4R, T4O);
+						  T5G = FMA(T15, T12, T5F);
+						  T5I = FMA(T1n, T1k, T5H);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T2c, T3x, T2d, T23, T5j, T3q, T2i, T3t, T6i, T8t, T5V, T5X;
+				   {
+					E Tn, T4i, T9, T4g, Tf, T5U, Ta, T4h, T5u, To, Tt, Tu;
+					{
+					     E T7, T8, Td, Te;
+					     T7 = Ip[WS(rs, 1)];
+					     T8q = T4S - T4K;
+					     T4T = T4K + T4S;
+					     T8k = T5G - T5I;
+					     T5J = T5G + T5I;
+					     T8 = Im[WS(rs, 1)];
+					     Td = Rp[WS(rs, 1)];
+					     Te = Rm[WS(rs, 1)];
+					     Tn = Ip[WS(rs, 9)];
+					     T4i = T7 + T8;
+					     T9 = T7 - T8;
+					     T4g = Td - Te;
+					     Tf = Td + Te;
+					     T5U = T2 * T4i;
+					     Ta = T6 * T9;
+					     T4h = T2 * T4g;
+					     T5u = T6 * Tf;
+					     To = Im[WS(rs, 9)];
+					     Tt = Rp[WS(rs, 9)];
+					     Tu = Rm[WS(rs, 9)];
+					}
+					{
+					     E Tg, T4q, Tp, T4m, Tv, T5W, Tq, T4n, T5w;
+					     Tg = FNMS(Tc, Tf, Ta);
+					     T4q = Tn + To;
+					     Tp = Tn - To;
+					     T4m = Tt - Tu;
+					     Tv = Tt + Tu;
+					     T5W = T4l * T4q;
+					     Tq = Tm * Tp;
+					     T4n = T4l * T4m;
+					     T5w = Tm * Tv;
+					     {
+						  E T5v, Tw, T4j, T5x, T4r;
+						  T5v = FMA(Tc, T9, T5u);
+						  Tw = FNMS(Ts, Tv, Tq);
+						  T4j = FMA(T5, T4i, T4h);
+						  T5x = FMA(Ts, Tp, T5w);
+						  T4r = FMA(T4p, T4q, T4n);
+						  Tx = Tg + Tw;
+						  T8a = Tg - Tw;
+						  T5y = T5v + T5x;
+						  T8d = T5v - T5x;
+						  T4s = T4j + T4r;
+						  T8t = T4r - T4j;
+						  T5V = FNMS(T5, T4g, T5U);
+						  T5X = FNMS(T4p, T4m, T5W);
+					     }
+					}
+				   }
+				   {
+					E T3p, T1Y, T1Z, T22, T2g, T6h, T3o, T5i, T2h;
+					{
+					     E T20, T21, T1W, T1X, T8u, T2a, T2b, T3n;
+					     T1W = Ip[WS(rs, 2)];
+					     T1X = Im[WS(rs, 2)];
+					     T8u = T5V - T5X;
+					     T5Y = T5V + T5X;
+					     T20 = Rp[WS(rs, 2)];
+					     T3p = T1W + T1X;
+					     T1Y = T1W - T1X;
+					     T8v = T8t - T8u;
+					     T8E = T8u + T8t;
+					     T21 = Rm[WS(rs, 2)];
+					     T1Z = T1a * T1Y;
+					     T2a = Ip[WS(rs, 10)];
+					     T2b = Im[WS(rs, 10)];
+					     T3n = T21 - T20;
+					     T22 = T20 + T21;
+					     T2g = Rp[WS(rs, 10)];
+					     T2c = T2a - T2b;
+					     T3x = T2a + T2b;
+					     T6h = T1H * T3n;
+					     T3o = T1F * T3n;
+					     T5i = T1a * T22;
+					     T2d = T29 * T2c;
+					     T2h = Rm[WS(rs, 10)];
+					}
+					T23 = FNMS(T1c, T22, T1Z);
+					T5j = FMA(T1c, T1Y, T5i);
+					T3q = FNMS(T1H, T3p, T3o);
+					T2i = T2g + T2h;
+					T3t = T2h - T2g;
+					T6i = FMA(T1F, T3p, T6h);
+				   }
+				   {
+					E T2y, T3K, T2z, T2u, T5o, T3H, T2D, T3I, T6n;
+					{
+					     E T3G, T2o, T2p, T2t, T6m, T3D, T5n, T2B, T2C;
+					     {
+						  E T2r, T2s, T2m, T2n, T3C, T2w, T2x;
+						  {
+						       E T8N, T8M, T6j, T3u, T2j;
+						       T2m = Ip[WS(rs, 14)];
+						       T6j = T3w * T3t;
+						       T3u = T3s * T3t;
+						       T2j = FNMS(T2f, T2i, T2d);
+						       {
+							    E T5k, T6k, T3y, T5l;
+							    T5k = T29 * T2i;
+							    T6k = FMA(T3s, T3x, T6j);
+							    T3y = FNMS(T3w, T3x, T3u);
+							    T2k = T23 + T2j;
+							    T82 = T23 - T2j;
+							    T5l = FMA(T2f, T2c, T5k);
+							    T6l = T6i + T6k;
+							    T8N = T6i - T6k;
+							    T3z = T3q + T3y;
+							    T8M = T3q - T3y;
+							    T83 = T5j - T5l;
+							    T5m = T5j + T5l;
+							    T2n = Im[WS(rs, 14)];
+						       }
+						       T8X = T8M + T8N;
+						       T8O = T8M - T8N;
+						  }
+						  T2r = Rp[WS(rs, 14)];
+						  T3G = T2m + T2n;
+						  T2o = T2m - T2n;
+						  T2s = Rm[WS(rs, 14)];
+						  T2w = Ip[WS(rs, 6)];
+						  T2x = Im[WS(rs, 6)];
+						  T2p = T2l * T2o;
+						  T3C = T2s - T2r;
+						  T2t = T2r + T2s;
+						  T2y = T2w - T2x;
+						  T3K = T2w + T2x;
+						  T6m = T3F * T3C;
+						  T3D = T3B * T3C;
+						  T5n = T2l * T2t;
+						  T2z = T2v * T2y;
+						  T2B = Rp[WS(rs, 6)];
+						  T2C = Rm[WS(rs, 6)];
+					     }
+					     T2u = FNMS(T2q, T2t, T2p);
+					     T5o = FMA(T2q, T2o, T5n);
+					     T3H = FNMS(T3F, T3G, T3D);
+					     T2D = T2B + T2C;
+					     T3I = T2C - T2B;
+					     T6n = FMA(T3B, T3G, T6m);
+					}
+					{
+					     E T4v, TC, T5Z, TD, T4t, TI, TS, T4u, T5z, T4z, TO, TT;
+					     {
+						  E TG, TH, TA, TB, TM, TN;
+						  {
+						       E T8Q, T8P, T6o, T3J, T2E;
+						       TA = Ip[WS(rs, 5)];
+						       T6o = T1g * T3I;
+						       T3J = T1d * T3I;
+						       T2E = FNMS(T2A, T2D, T2z);
+						       {
+							    E T5p, T6p, T3L, T5q;
+							    T5p = T2v * T2D;
+							    T6p = FMA(T1d, T3K, T6o);
+							    T3L = FNMS(T1g, T3K, T3J);
+							    T2F = T2u + T2E;
+							    T86 = T2u - T2E;
+							    T5q = FMA(T2A, T2y, T5p);
+							    T6q = T6n + T6p;
+							    T8Q = T6n - T6p;
+							    T3M = T3H + T3L;
+							    T8P = T3H - T3L;
+							    T85 = T5o - T5q;
+							    T5r = T5o + T5q;
+							    TB = Im[WS(rs, 5)];
+						       }
+						       T8Y = T8Q - T8P;
+						       T8R = T8P + T8Q;
+						  }
+						  TG = Rp[WS(rs, 5)];
+						  T4v = TA + TB;
+						  TC = TA - TB;
+						  TH = Rm[WS(rs, 5)];
+						  TM = Ip[WS(rs, 13)];
+						  T5Z = T32 * T4v;
+						  TD = Tz * TC;
+						  T4t = TG - TH;
+						  TI = TG + TH;
+						  TN = Im[WS(rs, 13)];
+						  TS = Rp[WS(rs, 13)];
+						  T4u = T32 * T4t;
+						  T5z = Tz * TI;
+						  T4z = TM + TN;
+						  TO = TM - TN;
+						  TT = Rm[WS(rs, 13)];
+					     }
+					     {
+						  E TJ, T61, TP, T4x, TU;
+						  TJ = FNMS(TF, TI, TD);
+						  T61 = Ti * T4z;
+						  TP = TL * TO;
+						  T4x = TS - TT;
+						  TU = TS + TT;
+						  {
+						       E T5A, T60, T5C, T62;
+						       T5A = FMA(TF, TC, T5z);
+						       {
+							    E T4w, T4y, TV, T5B, T4A;
+							    T4w = FMA(T34, T4v, T4u);
+							    T4y = Ti * T4x;
+							    TV = FNMS(TR, TU, TP);
+							    T5B = TL * TU;
+							    T60 = FNMS(T34, T4t, T5Z);
+							    T4A = FMA(Tl, T4z, T4y);
+							    TW = TJ + TV;
+							    T8e = TJ - TV;
+							    T5C = FMA(TR, TO, T5B);
+							    T8x = T4w - T4A;
+							    T4B = T4w + T4A;
+							    T62 = FNMS(Tl, T4x, T61);
+						       }
+						       T5D = T5A + T5C;
+						       T8b = T5A - T5C;
+						       T63 = T60 + T62;
+						       T8w = T62 - T60;
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T74, T78, T8F, T8y, T7s, T72, T75, T77, T7r, T71, T7f, T7d, T7c, T7g, T7m;
+			      E T7k, T7j, T7n, T6V, T6Y, T7T, T7W;
+			      {
+				   E T6S, T1V, T6I, T3l, T6H, T5Q, T6R, T5t, T56, T6g, T6N, T4f, T6M, T6W, T6D;
+				   E T6O;
+				   {
+					E T2G, T3k, T5E, T5P, TX, T1U, T5h, T5s;
+					T74 = Tx - TW;
+					TX = Tx + TW;
+					T1U = T1s + T1T;
+					T78 = T1s - T1T;
+					T8F = T8w - T8x;
+					T8y = T8w + T8x;
+					T7s = T2k - T2F;
+					T2G = T2k + T2F;
+					T6S = TX - T1U;
+					T1V = TX + T1U;
+					T3k = T31 + T3j;
+					T72 = T3j - T31;
+					T75 = T5y - T5D;
+					T5E = T5y + T5D;
+					T5P = T5J + T5O;
+					T77 = T5J - T5O;
+					T7r = T5b - T5g;
+					T5h = T5b + T5g;
+					T6I = T3k - T2G;
+					T3l = T2G + T3k;
+					T6H = T5P - T5E;
+					T5Q = T5E + T5P;
+					T5s = T5m + T5r;
+					T71 = T5r - T5m;
+					{
+					     E T64, T6L, T6f, T4C, T55;
+					     T7f = T4B - T4s;
+					     T4C = T4s + T4B;
+					     T55 = T4T + T54;
+					     T7d = T54 - T4T;
+					     T7c = T63 - T5Y;
+					     T64 = T5Y + T63;
+					     T6R = T5h - T5s;
+					     T5t = T5h + T5s;
+					     T6L = T4C - T55;
+					     T56 = T4C + T55;
+					     T7g = T69 - T6e;
+					     T6f = T69 + T6e;
+					     {
+						  E T6r, T6C, T3N, T4e, T6K;
+						  T7m = T3z - T3M;
+						  T3N = T3z + T3M;
+						  T4e = T3Y + T4d;
+						  T7k = T4d - T3Y;
+						  T6K = T6f - T64;
+						  T6g = T64 + T6f;
+						  T7j = T6q - T6l;
+						  T6r = T6l + T6q;
+						  T6N = T4e - T3N;
+						  T4f = T3N + T4e;
+						  T7n = T6B - T6w;
+						  T6C = T6w + T6B;
+						  T6M = T6K + T6L;
+						  T6W = T6K - T6L;
+						  T6D = T6r + T6C;
+						  T6O = T6C - T6r;
+					     }
+					}
+				   }
+				   {
+					E T5T, T6X, T6P, T6E;
+					{
+					     E T5S, T5R, T6F, T6G, T3m, T57;
+					     T5T = T3l - T1V;
+					     T3m = T1V + T3l;
+					     T57 = T4f - T56;
+					     T5S = T56 + T4f;
+					     T6X = T6N + T6O;
+					     T6P = T6N - T6O;
+					     T5R = T5t - T5Q;
+					     T6F = T5t + T5Q;
+					     Im[WS(rs, 15)] = KP500000000 * (T57 - T3m);
+					     Ip[0] = KP500000000 * (T3m + T57);
+					     T6G = T6g + T6D;
+					     T6E = T6g - T6D;
+					     Rp[0] = KP500000000 * (T6F + T6G);
+					     Rm[WS(rs, 15)] = KP500000000 * (T6F - T6G);
+					     Rp[WS(rs, 8)] = KP500000000 * (T5R + T5S);
+					     Rm[WS(rs, 7)] = KP500000000 * (T5R - T5S);
+					}
+					{
+					     E T6U, T6T, T6Z, T70, T6J, T6Q;
+					     T6V = T6I - T6H;
+					     T6J = T6H + T6I;
+					     T6Q = T6M + T6P;
+					     T6U = T6P - T6M;
+					     T6T = T6R - T6S;
+					     T6Z = T6R + T6S;
+					     Im[WS(rs, 7)] = KP500000000 * (T6E - T5T);
+					     Ip[WS(rs, 8)] = KP500000000 * (T5T + T6E);
+					     Im[WS(rs, 11)] = -(KP500000000 * (FNMS(KP707106781, T6Q, T6J)));
+					     Ip[WS(rs, 4)] = KP500000000 * (FMA(KP707106781, T6Q, T6J));
+					     T70 = T6W + T6X;
+					     T6Y = T6W - T6X;
+					     Rp[WS(rs, 4)] = KP500000000 * (FMA(KP707106781, T70, T6Z));
+					     Rm[WS(rs, 11)] = KP500000000 * (FNMS(KP707106781, T70, T6Z));
+					     Rp[WS(rs, 12)] = KP500000000 * (FMA(KP707106781, T6U, T6T));
+					     Rm[WS(rs, 3)] = KP500000000 * (FNMS(KP707106781, T6U, T6T));
+					}
+				   }
+			      }
+			      {
+				   E T7F, T73, T7P, T7t, T7G, T7w, T7Q, T7a, T7L, T7l, T7K, T7U, T7A, T7i, T7u;
+				   E T76;
+				   Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP707106781, T6Y, T6V)));
+				   Ip[WS(rs, 12)] = KP500000000 * (FMA(KP707106781, T6Y, T6V));
+				   T7F = T72 - T71;
+				   T73 = T71 + T72;
+				   T7P = T7r - T7s;
+				   T7t = T7r + T7s;
+				   T7u = T75 + T74;
+				   T76 = T74 - T75;
+				   {
+					E T7I, T7e, T7v, T79, T7J, T7h;
+					T7v = T77 - T78;
+					T79 = T77 + T78;
+					T7I = T7c - T7d;
+					T7e = T7c + T7d;
+					T7G = T7v - T7u;
+					T7w = T7u + T7v;
+					T7Q = T76 - T79;
+					T7a = T76 + T79;
+					T7J = T7g - T7f;
+					T7h = T7f + T7g;
+					T7L = T7k - T7j;
+					T7l = T7j + T7k;
+					T7K = FMA(KP414213562, T7J, T7I);
+					T7U = FNMS(KP414213562, T7I, T7J);
+					T7A = FNMS(KP414213562, T7e, T7h);
+					T7i = FMA(KP414213562, T7h, T7e);
+				   }
+				   {
+					E T7z, T7b, T7D, T7x, T7M, T7o;
+					T7z = FNMS(KP707106781, T7a, T73);
+					T7b = FMA(KP707106781, T7a, T73);
+					T7D = FMA(KP707106781, T7w, T7t);
+					T7x = FNMS(KP707106781, T7w, T7t);
+					T7M = T7n - T7m;
+					T7o = T7m + T7n;
+					{
+					     E T7S, T7R, T7X, T7Y;
+					     {
+						  E T7H, T7V, T7B, T7p, T7O, T7N;
+						  T7T = FMA(KP707106781, T7G, T7F);
+						  T7H = FNMS(KP707106781, T7G, T7F);
+						  T7N = FMA(KP414213562, T7M, T7L);
+						  T7V = FNMS(KP414213562, T7L, T7M);
+						  T7B = FMA(KP414213562, T7l, T7o);
+						  T7p = FNMS(KP414213562, T7o, T7l);
+						  T7O = T7K - T7N;
+						  T7S = T7K + T7N;
+						  T7R = FMA(KP707106781, T7Q, T7P);
+						  T7X = FNMS(KP707106781, T7Q, T7P);
+						  {
+						       E T7C, T7E, T7y, T7q;
+						       T7C = T7A - T7B;
+						       T7E = T7A + T7B;
+						       T7y = T7p - T7i;
+						       T7q = T7i + T7p;
+						       Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP923879532, T7O, T7H)));
+						       Ip[WS(rs, 14)] = KP500000000 * (FMA(KP923879532, T7O, T7H));
+						       Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP923879532, T7C, T7z)));
+						       Ip[WS(rs, 10)] = KP500000000 * (FMA(KP923879532, T7C, T7z));
+						       Rp[WS(rs, 2)] = KP500000000 * (FMA(KP923879532, T7E, T7D));
+						       Rm[WS(rs, 13)] = KP500000000 * (FNMS(KP923879532, T7E, T7D));
+						       Rp[WS(rs, 10)] = KP500000000 * (FMA(KP923879532, T7y, T7x));
+						       Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP923879532, T7y, T7x));
+						       Im[WS(rs, 13)] = -(KP500000000 * (FNMS(KP923879532, T7q, T7b)));
+						       Ip[WS(rs, 2)] = KP500000000 * (FMA(KP923879532, T7q, T7b));
+						       T7Y = T7U + T7V;
+						       T7W = T7U - T7V;
+						  }
+					     }
+					     Rm[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T7Y, T7X));
+					     Rp[WS(rs, 14)] = KP500000000 * (FNMS(KP923879532, T7Y, T7X));
+					     Rp[WS(rs, 6)] = KP500000000 * (FMA(KP923879532, T7S, T7R));
+					     Rm[WS(rs, 9)] = KP500000000 * (FNMS(KP923879532, T7S, T7R));
+					}
+				   }
+			      }
+			      {
+				   E Ta7, Tat, T9l, T89, T9H, Taj, T9v, T99, T9m, T9c, T9w, T8o, Tao, Tay, Tae;
+				   E Ta3, T9q, T9A, T9g, T8I, T8Z, T8W, Tak, Taa, Tau, T9O, T9r, T8T, Tar, Taz;
+				   E Taf, T9W;
+				   {
+					E T9M, T9L, T9J, T9I, T8s, T8G, T8D, Ta0, Tam, T9Z, Ta1, T8z, Ta9, T9K;
+					{
+					     E T9F, T81, Ta5, T95, T96, T97, Ta6, T88, T84, T87;
+					     T9F = T80 + T7Z;
+					     T81 = T7Z - T80;
+					     Ta5 = T93 - T94;
+					     T95 = T93 + T94;
+					     T96 = T83 + T82;
+					     T84 = T82 - T83;
+					     Im[WS(rs, 9)] = -(KP500000000 * (FNMS(KP923879532, T7W, T7T)));
+					     Ip[WS(rs, 6)] = KP500000000 * (FMA(KP923879532, T7W, T7T));
+					     T87 = T85 + T86;
+					     T97 = T85 - T86;
+					     Ta6 = T84 - T87;
+					     T88 = T84 + T87;
+					     {
+						  E T8j, T9a, T8g, T8m;
+						  {
+						       E T8c, T9G, T98, T8f;
+						       T9M = T8a + T8b;
+						       T8c = T8a - T8b;
+						       Ta7 = FMA(KP707106781, Ta6, Ta5);
+						       Tat = FNMS(KP707106781, Ta6, Ta5);
+						       T9l = FNMS(KP707106781, T88, T81);
+						       T89 = FMA(KP707106781, T88, T81);
+						       T9G = T97 - T96;
+						       T98 = T96 + T97;
+						       T8f = T8d + T8e;
+						       T9L = T8d - T8e;
+						       T9J = T8h + T8i;
+						       T8j = T8h - T8i;
+						       T9H = FMA(KP707106781, T9G, T9F);
+						       Taj = FNMS(KP707106781, T9G, T9F);
+						       T9v = FNMS(KP707106781, T98, T95);
+						       T99 = FMA(KP707106781, T98, T95);
+						       T9a = FMA(KP414213562, T8c, T8f);
+						       T8g = FNMS(KP414213562, T8f, T8c);
+						       T8m = T8k + T8l;
+						       T9I = T8k - T8l;
+						  }
+						  {
+						       E T9X, T9Y, T9b, T8n;
+						       T8s = T8q + T8r;
+						       T9X = T8r - T8q;
+						       T9Y = T8F - T8E;
+						       T8G = T8E + T8F;
+						       T8D = T8B + T8C;
+						       Ta0 = T8B - T8C;
+						       T9b = FNMS(KP414213562, T8j, T8m);
+						       T8n = FMA(KP414213562, T8m, T8j);
+						       Tam = FMA(KP707106781, T9Y, T9X);
+						       T9Z = FNMS(KP707106781, T9Y, T9X);
+						       T9m = T9b - T9a;
+						       T9c = T9a + T9b;
+						       T9w = T8g - T8n;
+						       T8o = T8g + T8n;
+						       Ta1 = T8y - T8v;
+						       T8z = T8v + T8y;
+						  }
+					     }
+					}
+					{
+					     E T9o, T8A, Tan, Ta2, T9p, T8H;
+					     Tan = FMA(KP707106781, Ta1, Ta0);
+					     Ta2 = FNMS(KP707106781, Ta1, Ta0);
+					     T9o = FNMS(KP707106781, T8z, T8s);
+					     T8A = FMA(KP707106781, T8z, T8s);
+					     Tao = FMA(KP198912367, Tan, Tam);
+					     Tay = FNMS(KP198912367, Tam, Tan);
+					     Tae = FMA(KP668178637, T9Z, Ta2);
+					     Ta3 = FNMS(KP668178637, Ta2, T9Z);
+					     T9p = FNMS(KP707106781, T8G, T8D);
+					     T8H = FMA(KP707106781, T8G, T8D);
+					     Ta9 = FNMS(KP414213562, T9I, T9J);
+					     T9K = FMA(KP414213562, T9J, T9I);
+					     T9q = FNMS(KP668178637, T9p, T9o);
+					     T9A = FMA(KP668178637, T9o, T9p);
+					     T9g = FNMS(KP198912367, T8A, T8H);
+					     T8I = FMA(KP198912367, T8H, T8A);
+					}
+					{
+					     E T8L, T9T, Tap, T9S, T9U, T8S, Taq, T9V;
+					     {
+						  E T9Q, T9R, Ta8, T9N;
+						  T8L = T8J - T8K;
+						  T9Q = T8K + T8J;
+						  T9R = T8X - T8Y;
+						  T8Z = T8X + T8Y;
+						  T8W = T8U + T8V;
+						  T9T = T8V - T8U;
+						  Ta8 = FMA(KP414213562, T9L, T9M);
+						  T9N = FNMS(KP414213562, T9M, T9L);
+						  Tap = FMA(KP707106781, T9R, T9Q);
+						  T9S = FNMS(KP707106781, T9R, T9Q);
+						  Tak = Ta8 + Ta9;
+						  Taa = Ta8 - Ta9;
+						  Tau = T9N + T9K;
+						  T9O = T9K - T9N;
+						  T9U = T8R - T8O;
+						  T8S = T8O + T8R;
+					     }
+					     Taq = FMA(KP707106781, T9U, T9T);
+					     T9V = FNMS(KP707106781, T9U, T9T);
+					     T9r = FNMS(KP707106781, T8S, T8L);
+					     T8T = FMA(KP707106781, T8S, T8L);
+					     Tar = FMA(KP198912367, Taq, Tap);
+					     Taz = FNMS(KP198912367, Tap, Taq);
+					     Taf = FMA(KP668178637, T9S, T9V);
+					     T9W = FNMS(KP668178637, T9V, T9S);
+					}
+				   }
+				   {
+					E T9z, T9C, Tad, Tag;
+					{
+					     E T9f, T8p, T9j, T9d, T9s, T90;
+					     T9f = FNMS(KP923879532, T8o, T89);
+					     T8p = FMA(KP923879532, T8o, T89);
+					     T9j = FMA(KP923879532, T9c, T99);
+					     T9d = FNMS(KP923879532, T9c, T99);
+					     T9s = FNMS(KP707106781, T8Z, T8W);
+					     T90 = FMA(KP707106781, T8Z, T8W);
+					     {
+						  E T9y, T9x, T9D, T9E;
+						  {
+						       E T9n, T9B, T9h, T91, T9u, T9t;
+						       T9z = FMA(KP923879532, T9m, T9l);
+						       T9n = FNMS(KP923879532, T9m, T9l);
+						       T9t = FMA(KP668178637, T9s, T9r);
+						       T9B = FNMS(KP668178637, T9r, T9s);
+						       T9h = FMA(KP198912367, T8T, T90);
+						       T91 = FNMS(KP198912367, T90, T8T);
+						       T9u = T9q + T9t;
+						       T9y = T9t - T9q;
+						       T9x = FMA(KP923879532, T9w, T9v);
+						       T9D = FNMS(KP923879532, T9w, T9v);
+						       {
+							    E T9i, T9k, T9e, T92;
+							    T9i = T9g - T9h;
+							    T9k = T9g + T9h;
+							    T9e = T91 - T8I;
+							    T92 = T8I + T91;
+							    Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP831469612, T9u, T9n)));
+							    Ip[WS(rs, 13)] = KP500000000 * (FNMS(KP831469612, T9u, T9n));
+							    Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP980785280, T9i, T9f)));
+							    Ip[WS(rs, 9)] = KP500000000 * (FMA(KP980785280, T9i, T9f));
+							    Rp[WS(rs, 1)] = KP500000000 * (FMA(KP980785280, T9k, T9j));
+							    Rm[WS(rs, 14)] = KP500000000 * (FNMS(KP980785280, T9k, T9j));
+							    Rp[WS(rs, 9)] = KP500000000 * (FMA(KP980785280, T9e, T9d));
+							    Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP980785280, T9e, T9d));
+							    Im[WS(rs, 14)] = -(KP500000000 * (FNMS(KP980785280, T92, T8p)));
+							    Ip[WS(rs, 1)] = KP500000000 * (FMA(KP980785280, T92, T8p));
+							    T9E = T9A + T9B;
+							    T9C = T9A - T9B;
+						       }
+						  }
+						  Rm[WS(rs, 2)] = KP500000000 * (FMA(KP831469612, T9E, T9D));
+						  Rp[WS(rs, 13)] = KP500000000 * (FNMS(KP831469612, T9E, T9D));
+						  Rp[WS(rs, 5)] = KP500000000 * (FMA(KP831469612, T9y, T9x));
+						  Rm[WS(rs, 10)] = KP500000000 * (FNMS(KP831469612, T9y, T9x));
+					     }
+					}
+					{
+					     E Tac, Tab, Tah, Tai, T9P, Ta4;
+					     Tad = FNMS(KP923879532, T9O, T9H);
+					     T9P = FMA(KP923879532, T9O, T9H);
+					     Ta4 = T9W - Ta3;
+					     Tac = Ta3 + T9W;
+					     Tab = FNMS(KP923879532, Taa, Ta7);
+					     Tah = FMA(KP923879532, Taa, Ta7);
+					     Im[WS(rs, 10)] = -(KP500000000 * (FNMS(KP831469612, T9C, T9z)));
+					     Ip[WS(rs, 5)] = KP500000000 * (FMA(KP831469612, T9C, T9z));
+					     Im[WS(rs, 12)] = -(KP500000000 * (FNMS(KP831469612, Ta4, T9P)));
+					     Ip[WS(rs, 3)] = KP500000000 * (FMA(KP831469612, Ta4, T9P));
+					     Tai = Tae + Taf;
+					     Tag = Tae - Taf;
+					     Rp[WS(rs, 3)] = KP500000000 * (FMA(KP831469612, Tai, Tah));
+					     Rm[WS(rs, 12)] = KP500000000 * (FNMS(KP831469612, Tai, Tah));
+					     Rp[WS(rs, 11)] = KP500000000 * (FMA(KP831469612, Tac, Tab));
+					     Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP831469612, Tac, Tab));
+					}
+					{
+					     E Taw, Tav, TaB, TaC, Tal, Tas;
+					     Tax = FNMS(KP923879532, Tak, Taj);
+					     Tal = FMA(KP923879532, Tak, Taj);
+					     Tas = Tao - Tar;
+					     Taw = Tao + Tar;
+					     Tav = FNMS(KP923879532, Tau, Tat);
+					     TaB = FMA(KP923879532, Tau, Tat);
+					     Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP831469612, Tag, Tad)));
+					     Ip[WS(rs, 11)] = KP500000000 * (FMA(KP831469612, Tag, Tad));
+					     Im[0] = -(KP500000000 * (FNMS(KP980785280, Tas, Tal)));
+					     Ip[WS(rs, 15)] = KP500000000 * (FMA(KP980785280, Tas, Tal));
+					     TaC = Tay + Taz;
+					     TaA = Tay - Taz;
+					     Rm[0] = KP500000000 * (FMA(KP980785280, TaC, TaB));
+					     Rp[WS(rs, 15)] = KP500000000 * (FNMS(KP980785280, TaC, TaB));
+					     Rp[WS(rs, 7)] = KP500000000 * (FMA(KP980785280, Taw, Tav));
+					     Rm[WS(rs, 8)] = KP500000000 * (FNMS(KP980785280, Taw, Tav));
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP980785280, TaA, Tax)));
+	       Ip[WS(rs, 7)] = KP500000000 * (FMA(KP980785280, TaA, Tax));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cfdft2_32", twinstr, &GENUS, {300, 162, 252, 0} };
+
+void X(codelet_hc2cfdft2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hc2cfdft2_32 -include hc2cf.h */
+
+/*
+ * This function contains 552 FP additions, 300 FP multiplications,
+ * (or, 440 additions, 188 multiplications, 112 fused multiply/add),
+ * 166 stack variables, 9 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP277785116, +0.277785116509801112371415406974266437187468595);
+     DK(KP415734806, +0.415734806151272618539394188808952878369280406);
+     DK(KP097545161, +0.097545161008064133924142434238511120463845809);
+     DK(KP490392640, +0.490392640201615224563091118067119518486966865);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP191341716, +0.191341716182544885864229992015199433380672281);
+     DK(KP461939766, +0.461939766255643378064091594698394143411208313);
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T1, T4, T2, T5, T7, T1b, T1d, Td, Ti, Tk, Tj, Tl, TL, TR, T2h;
+	       E T2O, T16, T2l, T10, T2K, Tm, Tq, T3s, T3K, T3w, T3M, T4e, T4u, T4i, T4w;
+	       E Ty, TE, T3h, T3j, T2q, T2u, T4l, T4n, T1v, T1B, T3E, T3G, T2B, T2F, T3Y;
+	       E T40, T1f, T1G, T1i, T1H, T1j, T1M, T1n, T1I, T23, T2U, T26, T2V, T27, T30;
+	       E T2b, T2W;
+	       {
+		    E Tw, T1A, TD, T1t, Tx, T1z, TC, T1u, TJ, T15, TQ, TY, TK, T14, TP;
+		    E TZ;
+		    {
+			 E T3, Tc, T6, Tb;
+			 T1 = W[0];
+			 T4 = W[1];
+			 T2 = W[2];
+			 T5 = W[3];
+			 T3 = T1 * T2;
+			 Tc = T4 * T2;
+			 T6 = T4 * T5;
+			 Tb = T1 * T5;
+			 T7 = T3 + T6;
+			 T1b = T3 - T6;
+			 T1d = Tb + Tc;
+			 Td = Tb - Tc;
+			 Ti = W[4];
+			 Tw = T1 * Ti;
+			 T1A = T5 * Ti;
+			 TD = T4 * Ti;
+			 T1t = T2 * Ti;
+			 Tk = W[5];
+			 Tx = T4 * Tk;
+			 T1z = T2 * Tk;
+			 TC = T1 * Tk;
+			 T1u = T5 * Tk;
+			 Tj = W[6];
+			 TJ = T1 * Tj;
+			 T15 = T5 * Tj;
+			 TQ = T4 * Tj;
+			 TY = T2 * Tj;
+			 Tl = W[7];
+			 TK = T4 * Tl;
+			 T14 = T2 * Tl;
+			 TP = T1 * Tl;
+			 TZ = T5 * Tl;
+		    }
+		    TL = TJ + TK;
+		    TR = TP - TQ;
+		    T2h = TJ - TK;
+		    T2O = T14 - T15;
+		    T16 = T14 + T15;
+		    T2l = TP + TQ;
+		    T10 = TY - TZ;
+		    T2K = TY + TZ;
+		    Tm = FMA(Ti, Tj, Tk * Tl);
+		    Tq = FNMS(Tk, Tj, Ti * Tl);
+		    {
+			 E T3q, T3r, T3u, T3v;
+			 T3q = T7 * Tj;
+			 T3r = Td * Tl;
+			 T3s = T3q + T3r;
+			 T3K = T3q - T3r;
+			 T3u = T7 * Tl;
+			 T3v = Td * Tj;
+			 T3w = T3u - T3v;
+			 T3M = T3u + T3v;
+		    }
+		    {
+			 E T4c, T4d, T4g, T4h;
+			 T4c = T1b * Tj;
+			 T4d = T1d * Tl;
+			 T4e = T4c - T4d;
+			 T4u = T4c + T4d;
+			 T4g = T1b * Tl;
+			 T4h = T1d * Tj;
+			 T4i = T4g + T4h;
+			 T4w = T4g - T4h;
+			 Ty = Tw - Tx;
+			 TE = TC + TD;
+			 T3h = FMA(Ty, Tj, TE * Tl);
+			 T3j = FNMS(TE, Tj, Ty * Tl);
+		    }
+		    T2q = T1t - T1u;
+		    T2u = T1z + T1A;
+		    T4l = FMA(T2q, Tj, T2u * Tl);
+		    T4n = FNMS(T2u, Tj, T2q * Tl);
+		    T1v = T1t + T1u;
+		    T1B = T1z - T1A;
+		    T3E = FMA(T1v, Tj, T1B * Tl);
+		    T3G = FNMS(T1B, Tj, T1v * Tl);
+		    T2B = Tw + Tx;
+		    T2F = TC - TD;
+		    T3Y = FMA(T2B, Tj, T2F * Tl);
+		    T40 = FNMS(T2F, Tj, T2B * Tl);
+		    {
+			 E T1c, T1e, T1g, T1h;
+			 T1c = T1b * Ti;
+			 T1e = T1d * Tk;
+			 T1f = T1c - T1e;
+			 T1G = T1c + T1e;
+			 T1g = T1b * Tk;
+			 T1h = T1d * Ti;
+			 T1i = T1g + T1h;
+			 T1H = T1g - T1h;
+		    }
+		    T1j = FMA(T1f, Tj, T1i * Tl);
+		    T1M = FNMS(T1H, Tj, T1G * Tl);
+		    T1n = FNMS(T1i, Tj, T1f * Tl);
+		    T1I = FMA(T1G, Tj, T1H * Tl);
+		    {
+			 E T21, T22, T24, T25;
+			 T21 = T7 * Ti;
+			 T22 = Td * Tk;
+			 T23 = T21 + T22;
+			 T2U = T21 - T22;
+			 T24 = T7 * Tk;
+			 T25 = Td * Ti;
+			 T26 = T24 - T25;
+			 T2V = T24 + T25;
+		    }
+		    T27 = FMA(T23, Tj, T26 * Tl);
+		    T30 = FNMS(T2V, Tj, T2U * Tl);
+		    T2b = FNMS(T26, Tj, T23 * Tl);
+		    T2W = FMA(T2U, Tj, T2V * Tl);
+	       }
+	       {
+		    E T38, T7l, T7S, T8Y, T7Z, T91, T3A, T6k, T4F, T83, T5C, T6n, T2T, T84, T4I;
+		    E T7m, T2g, T4M, T4P, T2z, T3T, T6m, T7O, T7V, T7j, T87, T5v, T6j, T7L, T7U;
+		    E T7g, T86, Tv, TW, T61, T4U, T4X, T62, T4b, T6c, T7v, T7C, T5g, T6f, T74;
+		    E T8G, T7s, T7B, T71, T8F, T1s, T1R, T65, T51, T54, T64, T4A, T6g, T7G, T8U;
+		    E T5n, T6d, T7b, T8J, T7z, T8R, T78, T8I;
+		    {
+			 E T2E, T2I, T3p, T5w, T37, T4D, T3g, T5A, T2N, T2R, T3y, T5x, T2Z, T33, T3l;
+			 E T5z;
+			 {
+			      E T2C, T2D, T3o, T2G, T2H, T3n;
+			      T2C = Ip[WS(rs, 4)];
+			      T2D = Im[WS(rs, 4)];
+			      T3o = T2C + T2D;
+			      T2G = Rp[WS(rs, 4)];
+			      T2H = Rm[WS(rs, 4)];
+			      T3n = T2G - T2H;
+			      T2E = T2C - T2D;
+			      T2I = T2G + T2H;
+			      T3p = FMA(Ti, T3n, Tk * T3o);
+			      T5w = FNMS(Tk, T3n, Ti * T3o);
+			 }
+			 {
+			      E T35, T36, T3f, T3c, T3d, T3e;
+			      T35 = Ip[0];
+			      T36 = Im[0];
+			      T3f = T35 + T36;
+			      T3c = Rm[0];
+			      T3d = Rp[0];
+			      T3e = T3c - T3d;
+			      T37 = T35 - T36;
+			      T4D = T3d + T3c;
+			      T3g = FNMS(T4, T3f, T1 * T3e);
+			      T5A = FMA(T4, T3e, T1 * T3f);
+			 }
+			 {
+			      E T2L, T2M, T3x, T2P, T2Q, T3t;
+			      T2L = Ip[WS(rs, 12)];
+			      T2M = Im[WS(rs, 12)];
+			      T3x = T2L + T2M;
+			      T2P = Rp[WS(rs, 12)];
+			      T2Q = Rm[WS(rs, 12)];
+			      T3t = T2P - T2Q;
+			      T2N = T2L - T2M;
+			      T2R = T2P + T2Q;
+			      T3y = FMA(T3s, T3t, T3w * T3x);
+			      T5x = FNMS(T3w, T3t, T3s * T3x);
+			 }
+			 {
+			      E T2X, T2Y, T3k, T31, T32, T3i;
+			      T2X = Ip[WS(rs, 8)];
+			      T2Y = Im[WS(rs, 8)];
+			      T3k = T2X + T2Y;
+			      T31 = Rp[WS(rs, 8)];
+			      T32 = Rm[WS(rs, 8)];
+			      T3i = T31 - T32;
+			      T2Z = T2X - T2Y;
+			      T33 = T31 + T32;
+			      T3l = FMA(T3h, T3i, T3j * T3k);
+			      T5z = FNMS(T3j, T3i, T3h * T3k);
+			 }
+			 {
+			      E T34, T7Q, T7R, T4E, T5y, T5B;
+			      T34 = FNMS(T30, T33, T2W * T2Z);
+			      T38 = T34 + T37;
+			      T7l = T37 - T34;
+			      T7Q = T3l + T3g;
+			      T7R = T5w - T5x;
+			      T7S = T7Q - T7R;
+			      T8Y = T7R + T7Q;
+			      {
+				   E T7X, T7Y, T3m, T3z;
+				   T7X = T3y - T3p;
+				   T7Y = T5A - T5z;
+				   T7Z = T7X + T7Y;
+				   T91 = T7Y - T7X;
+				   T3m = T3g - T3l;
+				   T3z = T3p + T3y;
+				   T3A = T3m - T3z;
+				   T6k = T3z + T3m;
+			      }
+			      T4E = FMA(T2W, T33, T30 * T2Z);
+			      T4F = T4D + T4E;
+			      T83 = T4D - T4E;
+			      T5y = T5w + T5x;
+			      T5B = T5z + T5A;
+			      T5C = T5y + T5B;
+			      T6n = T5B - T5y;
+			      {
+				   E T2J, T2S, T4G, T4H;
+				   T2J = FNMS(T2F, T2I, T2B * T2E);
+				   T2S = FNMS(T2O, T2R, T2K * T2N);
+				   T2T = T2J + T2S;
+				   T84 = T2J - T2S;
+				   T4G = FMA(T2B, T2I, T2F * T2E);
+				   T4H = FMA(T2K, T2R, T2O * T2N);
+				   T4I = T4G + T4H;
+				   T7m = T4G - T4H;
+			      }
+			 }
+		    }
+		    {
+			 E T20, T5p, T3D, T4K, T2y, T5t, T3R, T4O, T2f, T5q, T3I, T4L, T2p, T5s, T3O;
+			 E T4N;
+			 {
+			      E T1W, T3C, T1Z, T3B;
+			      {
+				   E T1U, T1V, T1X, T1Y;
+				   T1U = Ip[WS(rs, 2)];
+				   T1V = Im[WS(rs, 2)];
+				   T1W = T1U - T1V;
+				   T3C = T1U + T1V;
+				   T1X = Rp[WS(rs, 2)];
+				   T1Y = Rm[WS(rs, 2)];
+				   T1Z = T1X + T1Y;
+				   T3B = T1X - T1Y;
+			      }
+			      T20 = FNMS(T1d, T1Z, T1b * T1W);
+			      T5p = FNMS(T1H, T3B, T1G * T3C);
+			      T3D = FMA(T1G, T3B, T1H * T3C);
+			      T4K = FMA(T1b, T1Z, T1d * T1W);
+			 }
+			 {
+			      E T2t, T3Q, T2x, T3P;
+			      {
+				   E T2r, T2s, T2v, T2w;
+				   T2r = Ip[WS(rs, 6)];
+				   T2s = Im[WS(rs, 6)];
+				   T2t = T2r - T2s;
+				   T3Q = T2r + T2s;
+				   T2v = Rp[WS(rs, 6)];
+				   T2w = Rm[WS(rs, 6)];
+				   T2x = T2v + T2w;
+				   T3P = T2v - T2w;
+			      }
+			      T2y = FNMS(T2u, T2x, T2q * T2t);
+			      T5t = FNMS(T1i, T3P, T1f * T3Q);
+			      T3R = FMA(T1f, T3P, T1i * T3Q);
+			      T4O = FMA(T2q, T2x, T2u * T2t);
+			 }
+			 {
+			      E T2a, T3H, T2e, T3F;
+			      {
+				   E T28, T29, T2c, T2d;
+				   T28 = Ip[WS(rs, 10)];
+				   T29 = Im[WS(rs, 10)];
+				   T2a = T28 - T29;
+				   T3H = T28 + T29;
+				   T2c = Rp[WS(rs, 10)];
+				   T2d = Rm[WS(rs, 10)];
+				   T2e = T2c + T2d;
+				   T3F = T2c - T2d;
+			      }
+			      T2f = FNMS(T2b, T2e, T27 * T2a);
+			      T5q = FNMS(T3G, T3F, T3E * T3H);
+			      T3I = FMA(T3E, T3F, T3G * T3H);
+			      T4L = FMA(T27, T2e, T2b * T2a);
+			 }
+			 {
+			      E T2k, T3N, T2o, T3L;
+			      {
+				   E T2i, T2j, T2m, T2n;
+				   T2i = Ip[WS(rs, 14)];
+				   T2j = Im[WS(rs, 14)];
+				   T2k = T2i - T2j;
+				   T3N = T2i + T2j;
+				   T2m = Rp[WS(rs, 14)];
+				   T2n = Rm[WS(rs, 14)];
+				   T2o = T2m + T2n;
+				   T3L = T2m - T2n;
+			      }
+			      T2p = FNMS(T2l, T2o, T2h * T2k);
+			      T5s = FNMS(T3M, T3L, T3K * T3N);
+			      T3O = FMA(T3K, T3L, T3M * T3N);
+			      T4N = FMA(T2h, T2o, T2l * T2k);
+			 }
+			 {
+			      E T3J, T3S, T5r, T5u;
+			      T2g = T20 + T2f;
+			      T4M = T4K + T4L;
+			      T4P = T4N + T4O;
+			      T2z = T2p + T2y;
+			      T3J = T3D + T3I;
+			      T3S = T3O + T3R;
+			      T3T = T3J + T3S;
+			      T6m = T3S - T3J;
+			      {
+				   E T7M, T7N, T7h, T7i;
+				   T7M = T5s - T5t;
+				   T7N = T3R - T3O;
+				   T7O = T7M + T7N;
+				   T7V = T7M - T7N;
+				   T7h = T4N - T4O;
+				   T7i = T2p - T2y;
+				   T7j = T7h + T7i;
+				   T87 = T7h - T7i;
+			      }
+			      T5r = T5p + T5q;
+			      T5u = T5s + T5t;
+			      T5v = T5r + T5u;
+			      T6j = T5u - T5r;
+			      {
+				   E T7J, T7K, T7e, T7f;
+				   T7J = T3I - T3D;
+				   T7K = T5p - T5q;
+				   T7L = T7J - T7K;
+				   T7U = T7K + T7J;
+				   T7e = T20 - T2f;
+				   T7f = T4K - T4L;
+				   T7g = T7e - T7f;
+				   T86 = T7f + T7e;
+			      }
+			 }
+		    }
+		    {
+			 E Th, T5a, T3X, T4S, TV, T5e, T49, T4W, Tu, T5b, T42, T4T, TI, T5d, T46;
+			 E T4V;
+			 {
+			      E Ta, T3W, Tg, T3V;
+			      {
+				   E T8, T9, Te, Tf;
+				   T8 = Ip[WS(rs, 1)];
+				   T9 = Im[WS(rs, 1)];
+				   Ta = T8 - T9;
+				   T3W = T8 + T9;
+				   Te = Rp[WS(rs, 1)];
+				   Tf = Rm[WS(rs, 1)];
+				   Tg = Te + Tf;
+				   T3V = Te - Tf;
+			      }
+			      Th = FNMS(Td, Tg, T7 * Ta);
+			      T5a = FNMS(T5, T3V, T2 * T3W);
+			      T3X = FMA(T2, T3V, T5 * T3W);
+			      T4S = FMA(T7, Tg, Td * Ta);
+			 }
+			 {
+			      E TO, T48, TU, T47;
+			      {
+				   E TM, TN, TS, TT;
+				   TM = Ip[WS(rs, 13)];
+				   TN = Im[WS(rs, 13)];
+				   TO = TM - TN;
+				   T48 = TM + TN;
+				   TS = Rp[WS(rs, 13)];
+				   TT = Rm[WS(rs, 13)];
+				   TU = TS + TT;
+				   T47 = TS - TT;
+			      }
+			      TV = FNMS(TR, TU, TL * TO);
+			      T5e = FNMS(Tl, T47, Tj * T48);
+			      T49 = FMA(Tj, T47, Tl * T48);
+			      T4W = FMA(TL, TU, TR * TO);
+			 }
+			 {
+			      E Tp, T41, Tt, T3Z;
+			      {
+				   E Tn, To, Tr, Ts;
+				   Tn = Ip[WS(rs, 9)];
+				   To = Im[WS(rs, 9)];
+				   Tp = Tn - To;
+				   T41 = Tn + To;
+				   Tr = Rp[WS(rs, 9)];
+				   Ts = Rm[WS(rs, 9)];
+				   Tt = Tr + Ts;
+				   T3Z = Tr - Ts;
+			      }
+			      Tu = FNMS(Tq, Tt, Tm * Tp);
+			      T5b = FNMS(T40, T3Z, T3Y * T41);
+			      T42 = FMA(T3Y, T3Z, T40 * T41);
+			      T4T = FMA(Tm, Tt, Tq * Tp);
+			 }
+			 {
+			      E TB, T45, TH, T44;
+			      {
+				   E Tz, TA, TF, TG;
+				   Tz = Ip[WS(rs, 5)];
+				   TA = Im[WS(rs, 5)];
+				   TB = Tz - TA;
+				   T45 = Tz + TA;
+				   TF = Rp[WS(rs, 5)];
+				   TG = Rm[WS(rs, 5)];
+				   TH = TF + TG;
+				   T44 = TF - TG;
+			      }
+			      TI = FNMS(TE, TH, Ty * TB);
+			      T5d = FNMS(T2V, T44, T2U * T45);
+			      T46 = FMA(T2U, T44, T2V * T45);
+			      T4V = FMA(Ty, TH, TE * TB);
+			 }
+			 Tv = Th + Tu;
+			 TW = TI + TV;
+			 T61 = Tv - TW;
+			 T4U = T4S + T4T;
+			 T4X = T4V + T4W;
+			 T62 = T4U - T4X;
+			 {
+			      E T43, T4a, T7t, T7u;
+			      T43 = T3X + T42;
+			      T4a = T46 + T49;
+			      T4b = T43 + T4a;
+			      T6c = T4a - T43;
+			      T7t = T5e - T5d;
+			      T7u = T46 - T49;
+			      T7v = T7t + T7u;
+			      T7C = T7t - T7u;
+			 }
+			 {
+			      E T5c, T5f, T72, T73;
+			      T5c = T5a + T5b;
+			      T5f = T5d + T5e;
+			      T5g = T5c + T5f;
+			      T6f = T5f - T5c;
+			      T72 = T4S - T4T;
+			      T73 = TI - TV;
+			      T74 = T72 + T73;
+			      T8G = T72 - T73;
+			 }
+			 {
+			      E T7q, T7r, T6Z, T70;
+			      T7q = T42 - T3X;
+			      T7r = T5a - T5b;
+			      T7s = T7q - T7r;
+			      T7B = T7r + T7q;
+			      T6Z = Th - Tu;
+			      T70 = T4V - T4W;
+			      T71 = T6Z - T70;
+			      T8F = T6Z + T70;
+			 }
+		    }
+		    {
+			 E T1a, T5h, T4k, T4Z, T1Q, T5l, T4y, T53, T1r, T5i, T4p, T50, T1F, T5k, T4t;
+			 E T52;
+			 {
+			      E T13, T4j, T19, T4f;
+			      {
+				   E T11, T12, T17, T18;
+				   T11 = Ip[WS(rs, 15)];
+				   T12 = Im[WS(rs, 15)];
+				   T13 = T11 - T12;
+				   T4j = T11 + T12;
+				   T17 = Rp[WS(rs, 15)];
+				   T18 = Rm[WS(rs, 15)];
+				   T19 = T17 + T18;
+				   T4f = T17 - T18;
+			      }
+			      T1a = FNMS(T16, T19, T10 * T13);
+			      T5h = FNMS(T4i, T4f, T4e * T4j);
+			      T4k = FMA(T4e, T4f, T4i * T4j);
+			      T4Z = FMA(T10, T19, T16 * T13);
+			 }
+			 {
+			      E T1L, T4x, T1P, T4v;
+			      {
+				   E T1J, T1K, T1N, T1O;
+				   T1J = Ip[WS(rs, 11)];
+				   T1K = Im[WS(rs, 11)];
+				   T1L = T1J - T1K;
+				   T4x = T1J + T1K;
+				   T1N = Rp[WS(rs, 11)];
+				   T1O = Rm[WS(rs, 11)];
+				   T1P = T1N + T1O;
+				   T4v = T1N - T1O;
+			      }
+			      T1Q = FNMS(T1M, T1P, T1I * T1L);
+			      T5l = FNMS(T4w, T4v, T4u * T4x);
+			      T4y = FMA(T4u, T4v, T4w * T4x);
+			      T53 = FMA(T1I, T1P, T1M * T1L);
+			 }
+			 {
+			      E T1m, T4o, T1q, T4m;
+			      {
+				   E T1k, T1l, T1o, T1p;
+				   T1k = Ip[WS(rs, 7)];
+				   T1l = Im[WS(rs, 7)];
+				   T1m = T1k - T1l;
+				   T4o = T1k + T1l;
+				   T1o = Rp[WS(rs, 7)];
+				   T1p = Rm[WS(rs, 7)];
+				   T1q = T1o + T1p;
+				   T4m = T1o - T1p;
+			      }
+			      T1r = FNMS(T1n, T1q, T1j * T1m);
+			      T5i = FNMS(T4n, T4m, T4l * T4o);
+			      T4p = FMA(T4l, T4m, T4n * T4o);
+			      T50 = FMA(T1j, T1q, T1n * T1m);
+			 }
+			 {
+			      E T1y, T4s, T1E, T4r;
+			      {
+				   E T1w, T1x, T1C, T1D;
+				   T1w = Ip[WS(rs, 3)];
+				   T1x = Im[WS(rs, 3)];
+				   T1y = T1w - T1x;
+				   T4s = T1w + T1x;
+				   T1C = Rp[WS(rs, 3)];
+				   T1D = Rm[WS(rs, 3)];
+				   T1E = T1C + T1D;
+				   T4r = T1C - T1D;
+			      }
+			      T1F = FNMS(T1B, T1E, T1v * T1y);
+			      T5k = FNMS(T26, T4r, T23 * T4s);
+			      T4t = FMA(T23, T4r, T26 * T4s);
+			      T52 = FMA(T1v, T1E, T1B * T1y);
+			 }
+			 T1s = T1a + T1r;
+			 T1R = T1F + T1Q;
+			 T65 = T1s - T1R;
+			 T51 = T4Z + T50;
+			 T54 = T52 + T53;
+			 T64 = T51 - T54;
+			 {
+			      E T4q, T4z, T7E, T7F;
+			      T4q = T4k + T4p;
+			      T4z = T4t + T4y;
+			      T4A = T4q + T4z;
+			      T6g = T4z - T4q;
+			      T7E = T5h - T5i;
+			      T7F = T4y - T4t;
+			      T7G = T7E + T7F;
+			      T8U = T7E - T7F;
+			 }
+			 {
+			      E T5j, T5m, T79, T7a;
+			      T5j = T5h + T5i;
+			      T5m = T5k + T5l;
+			      T5n = T5j + T5m;
+			      T6d = T5j - T5m;
+			      T79 = T4Z - T50;
+			      T7a = T1F - T1Q;
+			      T7b = T79 + T7a;
+			      T8J = T79 - T7a;
+			 }
+			 {
+			      E T7x, T7y, T76, T77;
+			      T7x = T4p - T4k;
+			      T7y = T5k - T5l;
+			      T7z = T7x - T7y;
+			      T8R = T7x + T7y;
+			      T76 = T1a - T1r;
+			      T77 = T52 - T53;
+			      T78 = T76 - T77;
+			      T8I = T76 + T77;
+			 }
+		    }
+		    {
+			 E T1T, T5S, T5M, T5W, T5P, T5X, T3a, T5I, T4C, T58, T56, T5H, T5E, T5G, T4R;
+			 E T5R;
+			 {
+			      E TX, T1S, T5K, T5L;
+			      TX = Tv + TW;
+			      T1S = T1s + T1R;
+			      T1T = TX + T1S;
+			      T5S = TX - T1S;
+			      T5K = T5n - T5g;
+			      T5L = T4b - T4A;
+			      T5M = T5K + T5L;
+			      T5W = T5K - T5L;
+			 }
+			 {
+			      E T5N, T5O, T2A, T39;
+			      T5N = T3T + T3A;
+			      T5O = T5C - T5v;
+			      T5P = T5N - T5O;
+			      T5X = T5N + T5O;
+			      T2A = T2g + T2z;
+			      T39 = T2T + T38;
+			      T3a = T2A + T39;
+			      T5I = T39 - T2A;
+			 }
+			 {
+			      E T3U, T4B, T4Y, T55;
+			      T3U = T3A - T3T;
+			      T4B = T4b + T4A;
+			      T4C = T3U - T4B;
+			      T58 = T4B + T3U;
+			      T4Y = T4U + T4X;
+			      T55 = T51 + T54;
+			      T56 = T4Y + T55;
+			      T5H = T55 - T4Y;
+			 }
+			 {
+			      E T5o, T5D, T4J, T4Q;
+			      T5o = T5g + T5n;
+			      T5D = T5v + T5C;
+			      T5E = T5o - T5D;
+			      T5G = T5o + T5D;
+			      T4J = T4F + T4I;
+			      T4Q = T4M + T4P;
+			      T4R = T4J + T4Q;
+			      T5R = T4J - T4Q;
+			 }
+			 {
+			      E T3b, T5F, T57, T59;
+			      T3b = T1T + T3a;
+			      Ip[0] = KP500000000 * (T3b + T4C);
+			      Im[WS(rs, 15)] = KP500000000 * (T4C - T3b);
+			      T5F = T4R + T56;
+			      Rm[WS(rs, 15)] = KP500000000 * (T5F - T5G);
+			      Rp[0] = KP500000000 * (T5F + T5G);
+			      T57 = T4R - T56;
+			      Rm[WS(rs, 7)] = KP500000000 * (T57 - T58);
+			      Rp[WS(rs, 8)] = KP500000000 * (T57 + T58);
+			      T59 = T3a - T1T;
+			      Ip[WS(rs, 8)] = KP500000000 * (T59 + T5E);
+			      Im[WS(rs, 7)] = KP500000000 * (T5E - T59);
+			 }
+			 {
+			      E T5J, T5Q, T5Z, T60;
+			      T5J = KP500000000 * (T5H + T5I);
+			      T5Q = KP353553390 * (T5M + T5P);
+			      Ip[WS(rs, 4)] = T5J + T5Q;
+			      Im[WS(rs, 11)] = T5Q - T5J;
+			      T5Z = KP500000000 * (T5R + T5S);
+			      T60 = KP353553390 * (T5W + T5X);
+			      Rm[WS(rs, 11)] = T5Z - T60;
+			      Rp[WS(rs, 4)] = T5Z + T60;
+			 }
+			 {
+			      E T5T, T5U, T5V, T5Y;
+			      T5T = KP500000000 * (T5R - T5S);
+			      T5U = KP353553390 * (T5P - T5M);
+			      Rm[WS(rs, 3)] = T5T - T5U;
+			      Rp[WS(rs, 12)] = T5T + T5U;
+			      T5V = KP500000000 * (T5I - T5H);
+			      T5Y = KP353553390 * (T5W - T5X);
+			      Ip[WS(rs, 12)] = T5V + T5Y;
+			      Im[WS(rs, 3)] = T5Y - T5V;
+			 }
+		    }
+		    {
+			 E T67, T6Q, T6K, T6U, T6N, T6V, T6a, T6G, T6i, T6A, T6t, T6P, T6w, T6F, T6p;
+			 E T6B;
+			 {
+			      E T63, T66, T6I, T6J;
+			      T63 = T61 - T62;
+			      T66 = T64 + T65;
+			      T67 = KP353553390 * (T63 + T66);
+			      T6Q = KP353553390 * (T63 - T66);
+			      T6I = T6d - T6c;
+			      T6J = T6g - T6f;
+			      T6K = FMA(KP461939766, T6I, KP191341716 * T6J);
+			      T6U = FNMS(KP461939766, T6J, KP191341716 * T6I);
+			 }
+			 {
+			      E T6L, T6M, T68, T69;
+			      T6L = T6k - T6j;
+			      T6M = T6n - T6m;
+			      T6N = FNMS(KP461939766, T6M, KP191341716 * T6L);
+			      T6V = FMA(KP461939766, T6L, KP191341716 * T6M);
+			      T68 = T4P - T4M;
+			      T69 = T38 - T2T;
+			      T6a = KP500000000 * (T68 + T69);
+			      T6G = KP500000000 * (T69 - T68);
+			 }
+			 {
+			      E T6e, T6h, T6r, T6s;
+			      T6e = T6c + T6d;
+			      T6h = T6f + T6g;
+			      T6i = FMA(KP191341716, T6e, KP461939766 * T6h);
+			      T6A = FNMS(KP191341716, T6h, KP461939766 * T6e);
+			      T6r = T4F - T4I;
+			      T6s = T2g - T2z;
+			      T6t = KP500000000 * (T6r + T6s);
+			      T6P = KP500000000 * (T6r - T6s);
+			 }
+			 {
+			      E T6u, T6v, T6l, T6o;
+			      T6u = T62 + T61;
+			      T6v = T64 - T65;
+			      T6w = KP353553390 * (T6u + T6v);
+			      T6F = KP353553390 * (T6v - T6u);
+			      T6l = T6j + T6k;
+			      T6o = T6m + T6n;
+			      T6p = FNMS(KP191341716, T6o, KP461939766 * T6l);
+			      T6B = FMA(KP191341716, T6l, KP461939766 * T6o);
+			 }
+			 {
+			      E T6b, T6q, T6D, T6E;
+			      T6b = T67 + T6a;
+			      T6q = T6i + T6p;
+			      Ip[WS(rs, 2)] = T6b + T6q;
+			      Im[WS(rs, 13)] = T6q - T6b;
+			      T6D = T6t + T6w;
+			      T6E = T6A + T6B;
+			      Rm[WS(rs, 13)] = T6D - T6E;
+			      Rp[WS(rs, 2)] = T6D + T6E;
+			 }
+			 {
+			      E T6x, T6y, T6z, T6C;
+			      T6x = T6t - T6w;
+			      T6y = T6p - T6i;
+			      Rm[WS(rs, 5)] = T6x - T6y;
+			      Rp[WS(rs, 10)] = T6x + T6y;
+			      T6z = T6a - T67;
+			      T6C = T6A - T6B;
+			      Ip[WS(rs, 10)] = T6z + T6C;
+			      Im[WS(rs, 5)] = T6C - T6z;
+			 }
+			 {
+			      E T6H, T6O, T6X, T6Y;
+			      T6H = T6F + T6G;
+			      T6O = T6K + T6N;
+			      Ip[WS(rs, 6)] = T6H + T6O;
+			      Im[WS(rs, 9)] = T6O - T6H;
+			      T6X = T6P + T6Q;
+			      T6Y = T6U + T6V;
+			      Rm[WS(rs, 9)] = T6X - T6Y;
+			      Rp[WS(rs, 6)] = T6X + T6Y;
+			 }
+			 {
+			      E T6R, T6S, T6T, T6W;
+			      T6R = T6P - T6Q;
+			      T6S = T6N - T6K;
+			      Rm[WS(rs, 1)] = T6R - T6S;
+			      Rp[WS(rs, 14)] = T6R + T6S;
+			      T6T = T6G - T6F;
+			      T6W = T6U - T6V;
+			      Ip[WS(rs, 14)] = T6T + T6W;
+			      Im[WS(rs, 1)] = T6W - T6T;
+			 }
+		    }
+		    {
+			 E T7d, T8w, T7o, T8m, T8c, T8l, T89, T8v, T81, T8B, T8h, T8t, T7I, T8A, T8g;
+			 E T8q;
+			 {
+			      E T75, T7c, T85, T88;
+			      T75 = FNMS(KP191341716, T74, KP461939766 * T71);
+			      T7c = FMA(KP461939766, T78, KP191341716 * T7b);
+			      T7d = T75 + T7c;
+			      T8w = T75 - T7c;
+			      {
+				   E T7k, T7n, T8a, T8b;
+				   T7k = KP353553390 * (T7g + T7j);
+				   T7n = KP500000000 * (T7l - T7m);
+				   T7o = T7k + T7n;
+				   T8m = T7n - T7k;
+				   T8a = FMA(KP191341716, T71, KP461939766 * T74);
+				   T8b = FNMS(KP191341716, T78, KP461939766 * T7b);
+				   T8c = T8a + T8b;
+				   T8l = T8b - T8a;
+			      }
+			      T85 = KP500000000 * (T83 + T84);
+			      T88 = KP353553390 * (T86 + T87);
+			      T89 = T85 + T88;
+			      T8v = T85 - T88;
+			      {
+				   E T7T, T8r, T80, T8s, T7P, T7W;
+				   T7P = KP707106781 * (T7L + T7O);
+				   T7T = T7P + T7S;
+				   T8r = T7S - T7P;
+				   T7W = KP707106781 * (T7U + T7V);
+				   T80 = T7W + T7Z;
+				   T8s = T7Z - T7W;
+				   T81 = FNMS(KP097545161, T80, KP490392640 * T7T);
+				   T8B = FMA(KP415734806, T8r, KP277785116 * T8s);
+				   T8h = FMA(KP097545161, T7T, KP490392640 * T80);
+				   T8t = FNMS(KP415734806, T8s, KP277785116 * T8r);
+			      }
+			      {
+				   E T7A, T8o, T7H, T8p, T7w, T7D;
+				   T7w = KP707106781 * (T7s + T7v);
+				   T7A = T7w + T7z;
+				   T8o = T7z - T7w;
+				   T7D = KP707106781 * (T7B + T7C);
+				   T7H = T7D + T7G;
+				   T8p = T7G - T7D;
+				   T7I = FMA(KP490392640, T7A, KP097545161 * T7H);
+				   T8A = FNMS(KP415734806, T8o, KP277785116 * T8p);
+				   T8g = FNMS(KP097545161, T7A, KP490392640 * T7H);
+				   T8q = FMA(KP277785116, T8o, KP415734806 * T8p);
+			      }
+			 }
+			 {
+			      E T7p, T82, T8j, T8k;
+			      T7p = T7d + T7o;
+			      T82 = T7I + T81;
+			      Ip[WS(rs, 1)] = T7p + T82;
+			      Im[WS(rs, 14)] = T82 - T7p;
+			      T8j = T89 + T8c;
+			      T8k = T8g + T8h;
+			      Rm[WS(rs, 14)] = T8j - T8k;
+			      Rp[WS(rs, 1)] = T8j + T8k;
+			 }
+			 {
+			      E T8d, T8e, T8f, T8i;
+			      T8d = T89 - T8c;
+			      T8e = T81 - T7I;
+			      Rm[WS(rs, 6)] = T8d - T8e;
+			      Rp[WS(rs, 9)] = T8d + T8e;
+			      T8f = T7o - T7d;
+			      T8i = T8g - T8h;
+			      Ip[WS(rs, 9)] = T8f + T8i;
+			      Im[WS(rs, 6)] = T8i - T8f;
+			 }
+			 {
+			      E T8n, T8u, T8D, T8E;
+			      T8n = T8l + T8m;
+			      T8u = T8q + T8t;
+			      Ip[WS(rs, 5)] = T8n + T8u;
+			      Im[WS(rs, 10)] = T8u - T8n;
+			      T8D = T8v + T8w;
+			      T8E = T8A + T8B;
+			      Rm[WS(rs, 10)] = T8D - T8E;
+			      Rp[WS(rs, 5)] = T8D + T8E;
+			 }
+			 {
+			      E T8x, T8y, T8z, T8C;
+			      T8x = T8v - T8w;
+			      T8y = T8t - T8q;
+			      Rm[WS(rs, 2)] = T8x - T8y;
+			      Rp[WS(rs, 13)] = T8x + T8y;
+			      T8z = T8m - T8l;
+			      T8C = T8A - T8B;
+			      Ip[WS(rs, 13)] = T8z + T8C;
+			      Im[WS(rs, 2)] = T8C - T8z;
+			 }
+		    }
+		    {
+			 E T8L, T9u, T8O, T9k, T9a, T9j, T97, T9t, T93, T9z, T9f, T9r, T8W, T9y, T9e;
+			 E T9o;
+			 {
+			      E T8H, T8K, T95, T96;
+			      T8H = FNMS(KP461939766, T8G, KP191341716 * T8F);
+			      T8K = FMA(KP191341716, T8I, KP461939766 * T8J);
+			      T8L = T8H + T8K;
+			      T9u = T8H - T8K;
+			      {
+				   E T8M, T8N, T98, T99;
+				   T8M = KP353553390 * (T87 - T86);
+				   T8N = KP500000000 * (T7m + T7l);
+				   T8O = T8M + T8N;
+				   T9k = T8N - T8M;
+				   T98 = FMA(KP461939766, T8F, KP191341716 * T8G);
+				   T99 = FNMS(KP461939766, T8I, KP191341716 * T8J);
+				   T9a = T98 + T99;
+				   T9j = T99 - T98;
+			      }
+			      T95 = KP500000000 * (T83 - T84);
+			      T96 = KP353553390 * (T7g - T7j);
+			      T97 = T95 + T96;
+			      T9t = T95 - T96;
+			      {
+				   E T8Z, T9p, T92, T9q, T8X, T90;
+				   T8X = KP707106781 * (T7V - T7U);
+				   T8Z = T8X + T8Y;
+				   T9p = T8Y - T8X;
+				   T90 = KP707106781 * (T7L - T7O);
+				   T92 = T90 + T91;
+				   T9q = T91 - T90;
+				   T93 = FNMS(KP277785116, T92, KP415734806 * T8Z);
+				   T9z = FMA(KP490392640, T9p, KP097545161 * T9q);
+				   T9f = FMA(KP277785116, T8Z, KP415734806 * T92);
+				   T9r = FNMS(KP490392640, T9q, KP097545161 * T9p);
+			      }
+			      {
+				   E T8S, T9m, T8V, T9n, T8Q, T8T;
+				   T8Q = KP707106781 * (T7C - T7B);
+				   T8S = T8Q + T8R;
+				   T9m = T8R - T8Q;
+				   T8T = KP707106781 * (T7s - T7v);
+				   T8V = T8T + T8U;
+				   T9n = T8U - T8T;
+				   T8W = FMA(KP415734806, T8S, KP277785116 * T8V);
+				   T9y = FNMS(KP490392640, T9m, KP097545161 * T9n);
+				   T9e = FNMS(KP277785116, T8S, KP415734806 * T8V);
+				   T9o = FMA(KP097545161, T9m, KP490392640 * T9n);
+			      }
+			 }
+			 {
+			      E T8P, T94, T9h, T9i;
+			      T8P = T8L + T8O;
+			      T94 = T8W + T93;
+			      Ip[WS(rs, 3)] = T8P + T94;
+			      Im[WS(rs, 12)] = T94 - T8P;
+			      T9h = T97 + T9a;
+			      T9i = T9e + T9f;
+			      Rm[WS(rs, 12)] = T9h - T9i;
+			      Rp[WS(rs, 3)] = T9h + T9i;
+			 }
+			 {
+			      E T9b, T9c, T9d, T9g;
+			      T9b = T97 - T9a;
+			      T9c = T93 - T8W;
+			      Rm[WS(rs, 4)] = T9b - T9c;
+			      Rp[WS(rs, 11)] = T9b + T9c;
+			      T9d = T8O - T8L;
+			      T9g = T9e - T9f;
+			      Ip[WS(rs, 11)] = T9d + T9g;
+			      Im[WS(rs, 4)] = T9g - T9d;
+			 }
+			 {
+			      E T9l, T9s, T9B, T9C;
+			      T9l = T9j + T9k;
+			      T9s = T9o + T9r;
+			      Ip[WS(rs, 7)] = T9l + T9s;
+			      Im[WS(rs, 8)] = T9s - T9l;
+			      T9B = T9t + T9u;
+			      T9C = T9y + T9z;
+			      Rm[WS(rs, 8)] = T9B - T9C;
+			      Rp[WS(rs, 7)] = T9B + T9C;
+			 }
+			 {
+			      E T9v, T9w, T9x, T9A;
+			      T9v = T9t - T9u;
+			      T9w = T9r - T9o;
+			      Rm[0] = T9v - T9w;
+			      Rp[WS(rs, 15)] = T9v + T9w;
+			      T9x = T9k - T9j;
+			      T9A = T9y - T9z;
+			      Ip[WS(rs, 15)] = T9x + T9A;
+			      Im[0] = T9A - T9x;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cfdft2_32", twinstr, &GENUS, {440, 188, 112, 0} };
+
+void X(codelet_hc2cfdft2_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include hc2cf.h */
+
+/*
+ * This function contains 32 FP additions, 24 FP multiplications,
+ * (or, 24 additions, 16 multiplications, 8 fused multiply/add),
+ * 33 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T1, T5, T2, T4;
+	       T1 = W[0];
+	       T5 = W[3];
+	       T2 = W[2];
+	       T4 = W[1];
+	       {
+		    E Tc, T6, Tp, Tj, Tw, Tt, T9, TE, To, TC, Ta, Tr, Tf, Tl, Tm;
+		    {
+			 E Th, Tb, T3, Ti;
+			 Th = Ip[0];
+			 Tb = T1 * T5;
+			 T3 = T1 * T2;
+			 Ti = Im[0];
+			 Tl = Rm[0];
+			 Tc = FNMS(T4, T2, Tb);
+			 T6 = FMA(T4, T5, T3);
+			 Tp = Th + Ti;
+			 Tj = Th - Ti;
+			 Tm = Rp[0];
+		    }
+		    {
+			 E T7, T8, Td, Tn, Te;
+			 T7 = Ip[WS(rs, 1)];
+			 T8 = Im[WS(rs, 1)];
+			 Td = Rp[WS(rs, 1)];
+			 Tw = Tm + Tl;
+			 Tn = Tl - Tm;
+			 Tt = T7 + T8;
+			 T9 = T7 - T8;
+			 Te = Rm[WS(rs, 1)];
+			 TE = T4 * Tn;
+			 To = T1 * Tn;
+			 TC = T2 * Tt;
+			 Ta = T6 * T9;
+			 Tr = Td - Te;
+			 Tf = Td + Te;
+		    }
+		    {
+			 E Tq, Tk, TB, Ty, Tu, TI, TG, TF;
+			 Tq = FNMS(T4, Tp, To);
+			 TF = FMA(T1, Tp, TE);
+			 {
+			      E Tg, Tx, TD, Ts;
+			      Tg = FNMS(Tc, Tf, Ta);
+			      Tx = T6 * Tf;
+			      TD = FNMS(T5, Tr, TC);
+			      Ts = T2 * Tr;
+			      Tk = Tg + Tj;
+			      TB = Tj - Tg;
+			      Ty = FMA(Tc, T9, Tx);
+			      Tu = FMA(T5, Tt, Ts);
+			      TI = TD + TF;
+			      TG = TD - TF;
+			 }
+			 {
+			      E Tz, TH, Tv, TA;
+			      Tz = Tw - Ty;
+			      TH = Tw + Ty;
+			      Tv = Tq - Tu;
+			      TA = Tu + Tq;
+			      Rp[0] = KP500000000 * (TH + TI);
+			      Rm[WS(rs, 1)] = KP500000000 * (TH - TI);
+			      Rm[0] = KP500000000 * (Tz - TA);
+			      Im[WS(rs, 1)] = KP500000000 * (Tv - Tk);
+			      Ip[0] = KP500000000 * (Tk + Tv);
+			      Im[0] = KP500000000 * (TG - TB);
+			      Rp[WS(rs, 1)] = KP500000000 * (Tz + TA);
+			      Ip[WS(rs, 1)] = KP500000000 * (TB + TG);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, {24, 16, 8, 0} };
+
+void X(codelet_hc2cfdft2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include hc2cf.h */
+
+/*
+ * This function contains 32 FP additions, 24 FP multiplications,
+ * (or, 24 additions, 16 multiplications, 8 fused multiply/add),
+ * 24 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T1, T3, T2, T4, T5, T9;
+	       T1 = W[0];
+	       T3 = W[1];
+	       T2 = W[2];
+	       T4 = W[3];
+	       T5 = FMA(T1, T2, T3 * T4);
+	       T9 = FNMS(T3, T2, T1 * T4);
+	       {
+		    E Tg, Tr, Tm, Tx, Td, Tw, Tp, Ts;
+		    {
+			 E Te, Tf, Tl, Ti, Tj, Tk;
+			 Te = Ip[0];
+			 Tf = Im[0];
+			 Tl = Te + Tf;
+			 Ti = Rm[0];
+			 Tj = Rp[0];
+			 Tk = Ti - Tj;
+			 Tg = Te - Tf;
+			 Tr = Tj + Ti;
+			 Tm = FNMS(T3, Tl, T1 * Tk);
+			 Tx = FMA(T3, Tk, T1 * Tl);
+		    }
+		    {
+			 E T8, To, Tc, Tn;
+			 {
+			      E T6, T7, Ta, Tb;
+			      T6 = Ip[WS(rs, 1)];
+			      T7 = Im[WS(rs, 1)];
+			      T8 = T6 - T7;
+			      To = T6 + T7;
+			      Ta = Rp[WS(rs, 1)];
+			      Tb = Rm[WS(rs, 1)];
+			      Tc = Ta + Tb;
+			      Tn = Ta - Tb;
+			 }
+			 Td = FNMS(T9, Tc, T5 * T8);
+			 Tw = FNMS(T4, Tn, T2 * To);
+			 Tp = FMA(T2, Tn, T4 * To);
+			 Ts = FMA(T5, Tc, T9 * T8);
+		    }
+		    {
+			 E Th, Tq, Tz, TA;
+			 Th = Td + Tg;
+			 Tq = Tm - Tp;
+			 Ip[0] = KP500000000 * (Th + Tq);
+			 Im[WS(rs, 1)] = KP500000000 * (Tq - Th);
+			 Tz = Tr + Ts;
+			 TA = Tw + Tx;
+			 Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
+			 Rp[0] = KP500000000 * (Tz + TA);
+		    }
+		    {
+			 E Tt, Tu, Tv, Ty;
+			 Tt = Tr - Ts;
+			 Tu = Tp + Tm;
+			 Rm[0] = KP500000000 * (Tt - Tu);
+			 Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
+			 Tv = Tg - Td;
+			 Ty = Tw - Tx;
+			 Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
+			 Im[0] = KP500000000 * (Ty - Tv);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, {24, 16, 8, 0} };
+
+void X(codelet_hc2cfdft2_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include hc2cf.h */
+
+/*
+ * This function contains 90 FP additions, 66 FP multiplications,
+ * (or, 60 additions, 36 multiplications, 30 fused multiply/add),
+ * 68 stack variables, 2 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1G, T1F, T1C, T1D, T1N, T1B, T1R, T1L;
+	       {
+		    E T1, T2, Th, Tj, T4, T3, Ti, Tp, T5;
+		    T1 = W[0];
+		    T2 = W[2];
+		    Th = W[4];
+		    Tj = W[5];
+		    T4 = W[1];
+		    T3 = T1 * T2;
+		    Ti = T1 * Th;
+		    Tp = T1 * Tj;
+		    T5 = W[3];
+		    {
+			 E Tk, Tq, TI, T1a, T1u, TY, TF, TS, T1s, T1c, Tr, T1n, Tg, T16, Tn;
+			 E T13, T1f, Ts, To, T1o;
+			 {
+			      E T6, Tw, Tc, TB, TQ, TM, TC, TR, Tz, TD, TA;
+			      {
+				   E TX, TV, TT, TU;
+				   {
+					E TG, Tb, TH, TP, TL;
+					TG = Ip[0];
+					Tk = FMA(T4, Tj, Ti);
+					Tq = FNMS(T4, Th, Tp);
+					T6 = FMA(T4, T5, T3);
+					Tw = FNMS(T4, T5, T3);
+					Tb = T1 * T5;
+					TH = Im[0];
+					TT = Rm[0];
+					TP = T6 * Tj;
+					TL = T6 * Th;
+					Tc = FNMS(T4, T2, Tb);
+					TB = FMA(T4, T2, Tb);
+					TX = TG + TH;
+					TI = TG - TH;
+					TU = Rp[0];
+					TQ = FNMS(Tc, Th, TP);
+					TM = FMA(Tc, Tj, TL);
+				   }
+				   T1a = TU + TT;
+				   TV = TT - TU;
+				   {
+					E Tx, Ty, T1t, TW;
+					Tx = Ip[WS(rs, 2)];
+					Ty = Im[WS(rs, 2)];
+					T1t = T4 * TV;
+					TW = T1 * TV;
+					TC = Rp[WS(rs, 2)];
+					TR = Tx + Ty;
+					Tz = Tx - Ty;
+					T1u = FMA(T1, TX, T1t);
+					TY = FNMS(T4, TX, TW);
+					TD = Rm[WS(rs, 2)];
+				   }
+				   TA = Tw * Tz;
+			      }
+			      {
+				   E Td, T9, T12, Te, Ta, T1m;
+				   {
+					E T7, T8, TN, TE, TO, T1r, T1b;
+					T7 = Ip[WS(rs, 1)];
+					T8 = Im[WS(rs, 1)];
+					TN = TD - TC;
+					TE = TC + TD;
+					Td = Rp[WS(rs, 1)];
+					T9 = T7 - T8;
+					T12 = T7 + T8;
+					TO = TM * TN;
+					T1r = TQ * TN;
+					T1b = Tw * TE;
+					TF = FNMS(TB, TE, TA);
+					TS = FNMS(TQ, TR, TO);
+					T1s = FMA(TM, TR, T1r);
+					T1c = FMA(TB, Tz, T1b);
+					Te = Rm[WS(rs, 1)];
+				   }
+				   Ta = T6 * T9;
+				   T1m = T2 * T12;
+				   {
+					E Tl, T10, Tf, Tm, T11, T1e;
+					Tl = Ip[WS(rs, 3)];
+					T10 = Td - Te;
+					Tf = Td + Te;
+					Tm = Im[WS(rs, 3)];
+					Tr = Rp[WS(rs, 3)];
+					T11 = T2 * T10;
+					T1n = FNMS(T5, T10, T1m);
+					T1e = T6 * Tf;
+					Tg = FNMS(Tc, Tf, Ta);
+					T16 = Tl + Tm;
+					Tn = Tl - Tm;
+					T13 = FMA(T5, T12, T11);
+					T1f = FMA(Tc, T9, T1e);
+					Ts = Rm[WS(rs, 3)];
+				   }
+				   To = Tk * Tn;
+				   T1o = Th * T16;
+			      }
+			 }
+			 {
+			      E T1z, T1K, T1y, T1k, T1J, T1A, T1x, T1j;
+			      {
+				   E T1w, TK, T1l, T19, T1d, T1i;
+				   {
+					E TJ, T14, Tt, T1v, T1h;
+					T1z = TI - TF;
+					TJ = TF + TI;
+					T14 = Tr - Ts;
+					Tt = Tr + Ts;
+					T1v = T1s + T1u;
+					T1G = T1u - T1s;
+					{
+					     E TZ, T1q, Tv, T18, T15;
+					     T1F = TY - TS;
+					     TZ = TS + TY;
+					     T15 = Th * T14;
+					     {
+						  E T1p, T1g, Tu, T17;
+						  T1p = FNMS(Tj, T14, T1o);
+						  T1g = Tk * Tt;
+						  Tu = FNMS(Tq, Tt, To);
+						  T17 = FMA(Tj, T16, T15);
+						  T1C = T1p - T1n;
+						  T1q = T1n + T1p;
+						  T1h = FMA(Tq, Tn, T1g);
+						  T1K = Tg - Tu;
+						  Tv = Tg + Tu;
+						  T18 = T13 + T17;
+						  T1D = T13 - T17;
+					     }
+					     T1w = T1q - T1v;
+					     T1y = T1q + T1v;
+					     TK = Tv + TJ;
+					     T1l = TJ - Tv;
+					     T1k = T18 + TZ;
+					     T19 = TZ - T18;
+					}
+					T1J = T1a - T1c;
+					T1d = T1a + T1c;
+					T1i = T1f + T1h;
+					T1A = T1f - T1h;
+				   }
+				   Ip[0] = KP500000000 * (TK + T19);
+				   Im[WS(rs, 3)] = KP500000000 * (T19 - TK);
+				   Im[WS(rs, 1)] = KP500000000 * (T1w - T1l);
+				   T1x = T1d + T1i;
+				   T1j = T1d - T1i;
+				   Ip[WS(rs, 2)] = KP500000000 * (T1l + T1w);
+			      }
+			      Rm[WS(rs, 3)] = KP500000000 * (T1x - T1y);
+			      Rp[0] = KP500000000 * (T1x + T1y);
+			      Rp[WS(rs, 2)] = KP500000000 * (T1j + T1k);
+			      Rm[WS(rs, 1)] = KP500000000 * (T1j - T1k);
+			      T1N = T1A + T1z;
+			      T1B = T1z - T1A;
+			      T1R = T1J + T1K;
+			      T1L = T1J - T1K;
+			 }
+		    }
+	       }
+	       {
+		    E T1E, T1O, T1H, T1P;
+		    T1E = T1C + T1D;
+		    T1O = T1C - T1D;
+		    T1H = T1F - T1G;
+		    T1P = T1F + T1G;
+		    {
+			 E T1S, T1Q, T1I, T1M;
+			 T1S = T1O + T1P;
+			 T1Q = T1O - T1P;
+			 T1I = T1E + T1H;
+			 T1M = T1H - T1E;
+			 Im[0] = -(KP500000000 * (FNMS(KP707106781, T1Q, T1N)));
+			 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1Q, T1N));
+			 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1S, T1R));
+			 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1S, T1R));
+			 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
+			 Rm[0] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
+			 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1I, T1B)));
+			 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1I, T1B));
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, {60, 36, 30, 0} };
+
+void X(codelet_hc2cfdft2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include hc2cf.h */
+
+/*
+ * This function contains 90 FP additions, 56 FP multiplications,
+ * (or, 72 additions, 38 multiplications, 18 fused multiply/add),
+ * 51 stack variables, 2 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1, T4, T2, T5, Tu, Ty, T7, Td, Ti, Tj, Tk, TP, To, TN;
+	       {
+		    E T3, Tc, T6, Tb;
+		    T1 = W[0];
+		    T4 = W[1];
+		    T2 = W[2];
+		    T5 = W[3];
+		    T3 = T1 * T2;
+		    Tc = T4 * T2;
+		    T6 = T4 * T5;
+		    Tb = T1 * T5;
+		    Tu = T3 - T6;
+		    Ty = Tb + Tc;
+		    T7 = T3 + T6;
+		    Td = Tb - Tc;
+		    Ti = W[4];
+		    Tj = W[5];
+		    Tk = FMA(T1, Ti, T4 * Tj);
+		    TP = FNMS(Td, Ti, T7 * Tj);
+		    To = FNMS(T4, Ti, T1 * Tj);
+		    TN = FMA(T7, Ti, Td * Tj);
+	       }
+	       {
+		    E TF, T11, TC, T12, T1d, T1e, T1q, TM, TR, T1p, Th, Ts, T15, T14, T1a;
+		    E T1b, T1m, TV, TY, T1n;
+		    {
+			 E TD, TE, TL, TI, TJ, TK, Tx, TQ, TB, TO;
+			 TD = Ip[0];
+			 TE = Im[0];
+			 TL = TD + TE;
+			 TI = Rm[0];
+			 TJ = Rp[0];
+			 TK = TI - TJ;
+			 {
+			      E Tv, Tw, Tz, TA;
+			      Tv = Ip[WS(rs, 2)];
+			      Tw = Im[WS(rs, 2)];
+			      Tx = Tv - Tw;
+			      TQ = Tv + Tw;
+			      Tz = Rp[WS(rs, 2)];
+			      TA = Rm[WS(rs, 2)];
+			      TB = Tz + TA;
+			      TO = Tz - TA;
+			 }
+			 TF = TD - TE;
+			 T11 = TJ + TI;
+			 TC = FNMS(Ty, TB, Tu * Tx);
+			 T12 = FMA(Tu, TB, Ty * Tx);
+			 T1d = FNMS(TP, TO, TN * TQ);
+			 T1e = FMA(T4, TK, T1 * TL);
+			 T1q = T1e - T1d;
+			 TM = FNMS(T4, TL, T1 * TK);
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1p = TR + TM;
+		    }
+		    {
+			 E Ta, TU, Tg, TT, Tn, TX, Tr, TW;
+			 {
+			      E T8, T9, Te, Tf;
+			      T8 = Ip[WS(rs, 1)];
+			      T9 = Im[WS(rs, 1)];
+			      Ta = T8 - T9;
+			      TU = T8 + T9;
+			      Te = Rp[WS(rs, 1)];
+			      Tf = Rm[WS(rs, 1)];
+			      Tg = Te + Tf;
+			      TT = Te - Tf;
+			 }
+			 {
+			      E Tl, Tm, Tp, Tq;
+			      Tl = Ip[WS(rs, 3)];
+			      Tm = Im[WS(rs, 3)];
+			      Tn = Tl - Tm;
+			      TX = Tl + Tm;
+			      Tp = Rp[WS(rs, 3)];
+			      Tq = Rm[WS(rs, 3)];
+			      Tr = Tp + Tq;
+			      TW = Tp - Tq;
+			 }
+			 Th = FNMS(Td, Tg, T7 * Ta);
+			 Ts = FNMS(To, Tr, Tk * Tn);
+			 T15 = FMA(Tk, Tr, To * Tn);
+			 T14 = FMA(T7, Tg, Td * Ta);
+			 T1a = FNMS(T5, TT, T2 * TU);
+			 T1b = FNMS(Tj, TW, Ti * TX);
+			 T1m = T1b - T1a;
+			 TV = FMA(T2, TT, T5 * TU);
+			 TY = FMA(Ti, TW, Tj * TX);
+			 T1n = TV - TY;
+		    }
+		    {
+			 E T1l, T1x, T1A, T1C, T1s, T1w, T1v, T1B;
+			 {
+			      E T1j, T1k, T1y, T1z;
+			      T1j = TF - TC;
+			      T1k = T14 - T15;
+			      T1l = KP500000000 * (T1j - T1k);
+			      T1x = KP500000000 * (T1k + T1j);
+			      T1y = T1m - T1n;
+			      T1z = T1p + T1q;
+			      T1A = KP353553390 * (T1y - T1z);
+			      T1C = KP353553390 * (T1y + T1z);
+			 }
+			 {
+			      E T1o, T1r, T1t, T1u;
+			      T1o = T1m + T1n;
+			      T1r = T1p - T1q;
+			      T1s = KP353553390 * (T1o + T1r);
+			      T1w = KP353553390 * (T1r - T1o);
+			      T1t = T11 - T12;
+			      T1u = Th - Ts;
+			      T1v = KP500000000 * (T1t - T1u);
+			      T1B = KP500000000 * (T1t + T1u);
+			 }
+			 Ip[WS(rs, 1)] = T1l + T1s;
+			 Rp[WS(rs, 1)] = T1B + T1C;
+			 Im[WS(rs, 2)] = T1s - T1l;
+			 Rm[WS(rs, 2)] = T1B - T1C;
+			 Rm[0] = T1v - T1w;
+			 Im[0] = T1A - T1x;
+			 Rp[WS(rs, 3)] = T1v + T1w;
+			 Ip[WS(rs, 3)] = T1x + T1A;
+		    }
+		    {
+			 E TH, T19, T1g, T1i, T10, T18, T17, T1h;
+			 {
+			      E Tt, TG, T1c, T1f;
+			      Tt = Th + Ts;
+			      TG = TC + TF;
+			      TH = Tt + TG;
+			      T19 = TG - Tt;
+			      T1c = T1a + T1b;
+			      T1f = T1d + T1e;
+			      T1g = T1c - T1f;
+			      T1i = T1c + T1f;
+			 }
+			 {
+			      E TS, TZ, T13, T16;
+			      TS = TM - TR;
+			      TZ = TV + TY;
+			      T10 = TS - TZ;
+			      T18 = TZ + TS;
+			      T13 = T11 + T12;
+			      T16 = T14 + T15;
+			      T17 = T13 - T16;
+			      T1h = T13 + T16;
+			 }
+			 Ip[0] = KP500000000 * (TH + T10);
+			 Rp[0] = KP500000000 * (T1h + T1i);
+			 Im[WS(rs, 3)] = KP500000000 * (T10 - TH);
+			 Rm[WS(rs, 3)] = KP500000000 * (T1h - T1i);
+			 Rm[WS(rs, 1)] = KP500000000 * (T17 - T18);
+			 Im[WS(rs, 1)] = KP500000000 * (T1g - T19);
+			 Rp[WS(rs, 2)] = KP500000000 * (T17 + T18);
+			 Ip[WS(rs, 2)] = KP500000000 * (T19 + T1g);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, {72, 38, 18, 0} };
+
+void X(codelet_hc2cfdft2_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include hc2cf.h */
+
+/*
+ * This function contains 122 FP additions, 92 FP multiplications,
+ * (or, 68 additions, 38 multiplications, 54 fused multiply/add),
+ * 94 stack variables, 5 constants, and 40 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T1x, T1I, T1T, T22, T20;
+	       {
+		    E T3, T1u, T1S, T2f, Td, T1w, T14, T1p, T1j, T1q, T1N, T2e, T1z, To, T2i;
+		    E T1H, TQ, T1n, Ty, T1B;
+		    {
+			 E T1h, TW, Tc, T1b, T1g, T1f, T1Q, TV, T7, TS, T1J, TU, Ts, T19, T18;
+			 E T15, Tx, T17, T1O, T1A, Tt, TD, Ti, TE, Tn, TA, T1F, TC, T1y, Tj;
+			 E T11, T12, TJ, TZ, TO, TY, TG, T1L, T1e, T1, T2;
+			 T1 = Ip[0];
+			 T2 = Im[0];
+			 {
+			      E Ta, Tb, T1c, T1d;
+			      Ta = Rp[WS(rs, 2)];
+			      Tb = Rm[WS(rs, 2)];
+			      T1c = Rm[0];
+			      T1h = T1 + T2;
+			      T3 = T1 - T2;
+			      T1d = Rp[0];
+			      TW = Ta + Tb;
+			      Tc = Ta - Tb;
+			      T1b = W[0];
+			      T1u = T1d + T1c;
+			      T1e = T1c - T1d;
+			      T1g = W[1];
+			 }
+			 {
+			      E T16, Tp, TT, T5, T6, TB, Tf;
+			      T5 = Ip[WS(rs, 2)];
+			      T6 = Im[WS(rs, 2)];
+			      T1f = T1b * T1e;
+			      T1Q = T1g * T1e;
+			      TV = W[7];
+			      T7 = T5 + T6;
+			      TT = T5 - T6;
+			      TS = W[6];
+			      {
+				   E Tv, Tw, Tq, Tr;
+				   Tq = Rm[WS(rs, 3)];
+				   Tr = Rp[WS(rs, 3)];
+				   T1J = TV * TT;
+				   TU = TS * TT;
+				   Tv = Ip[WS(rs, 3)];
+				   Ts = Tq - Tr;
+				   T19 = Tr + Tq;
+				   Tw = Im[WS(rs, 3)];
+				   T18 = W[11];
+				   T15 = W[10];
+				   Tx = Tv + Tw;
+				   T16 = Tv - Tw;
+				   Tp = W[12];
+			      }
+			      {
+				   E Tg, Th, Tl, Tm;
+				   Tg = Ip[WS(rs, 1)];
+				   T17 = T15 * T16;
+				   T1O = T18 * T16;
+				   T1A = Tp * Tx;
+				   Tt = Tp * Ts;
+				   Th = Im[WS(rs, 1)];
+				   Tl = Rp[WS(rs, 1)];
+				   Tm = Rm[WS(rs, 1)];
+				   TD = W[5];
+				   Ti = Tg - Th;
+				   TE = Tg + Th;
+				   Tn = Tl + Tm;
+				   TB = Tm - Tl;
+				   TA = W[4];
+				   Tf = W[2];
+				   T1F = TD * TB;
+			      }
+			      {
+				   E TH, TI, TM, TN;
+				   TH = Ip[WS(rs, 4)];
+				   TC = TA * TB;
+				   T1y = Tf * Tn;
+				   Tj = Tf * Ti;
+				   TI = Im[WS(rs, 4)];
+				   TM = Rp[WS(rs, 4)];
+				   TN = Rm[WS(rs, 4)];
+				   T11 = W[17];
+				   T12 = TH + TI;
+				   TJ = TH - TI;
+				   TZ = TN - TM;
+				   TO = TM + TN;
+				   TY = W[16];
+				   TG = W[14];
+				   T1L = T11 * TZ;
+			      }
+			 }
+			 {
+			      E T10, T1D, TK, T4, T9, T1P, T1R, T8, T1v;
+			      T10 = TY * TZ;
+			      T1D = TG * TO;
+			      TK = TG * TJ;
+			      T4 = W[9];
+			      T9 = W[8];
+			      T1P = FMA(T15, T19, T1O);
+			      T1R = FMA(T1b, T1h, T1Q);
+			      T8 = T4 * T7;
+			      T1v = T9 * T7;
+			      {
+				   E TX, T13, T1a, T1i;
+				   TX = FNMS(TV, TW, TU);
+				   T1S = T1P - T1R;
+				   T2f = T1P + T1R;
+				   Td = FMA(T9, Tc, T8);
+				   T1w = FNMS(T4, Tc, T1v);
+				   T13 = FNMS(T11, T12, T10);
+				   T1a = FNMS(T18, T19, T17);
+				   T1i = FNMS(T1g, T1h, T1f);
+				   {
+					E T1K, T1M, TF, T1G, TL;
+					T1K = FMA(TS, TW, T1J);
+					T14 = TX + T13;
+					T1p = T13 - TX;
+					T1j = T1a + T1i;
+					T1q = T1i - T1a;
+					T1M = FMA(TY, T12, T1L);
+					TF = FNMS(TD, TE, TC);
+					T1G = FMA(TA, TE, T1F);
+					TL = W[15];
+					T1N = T1K - T1M;
+					T2e = T1K + T1M;
+					{
+					     E Tk, T1E, TP, Tu;
+					     Tk = W[3];
+					     T1E = FMA(TL, TJ, T1D);
+					     TP = FNMS(TL, TO, TK);
+					     Tu = W[13];
+					     T1z = FMA(Tk, Ti, T1y);
+					     To = FNMS(Tk, Tn, Tj);
+					     T2i = T1G + T1E;
+					     T1H = T1E - T1G;
+					     TQ = TF + TP;
+					     T1n = TF - TP;
+					     Ty = FNMS(Tu, Tx, Tt);
+					     T1B = FMA(Tu, Ts, T1A);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2p, T1t, T1m, T1C, T2o, T2m, T2k, T2w, T2y, T2n, T2d, T2l;
+			 {
+			      E T2g, Te, T2h, T2u, T1k, TR, T2v, Tz;
+			      T2p = T2e + T2f;
+			      T2g = T2e - T2f;
+			      Te = T3 - Td;
+			      T1t = Td + T3;
+			      Tz = To + Ty;
+			      T1m = Ty - To;
+			      T2h = T1z + T1B;
+			      T1C = T1z - T1B;
+			      T2u = T14 - T1j;
+			      T1k = T14 + T1j;
+			      TR = Tz + TQ;
+			      T2v = Tz - TQ;
+			      {
+				   E T2c, T2b, T2j, T1l;
+				   T2j = T2h - T2i;
+				   T2o = T2h + T2i;
+				   T2c = TR - T1k;
+				   T1l = TR + T1k;
+				   T2m = FMA(KP618033988, T2g, T2j);
+				   T2k = FNMS(KP618033988, T2j, T2g);
+				   T2w = FNMS(KP618033988, T2v, T2u);
+				   T2y = FMA(KP618033988, T2u, T2v);
+				   Ip[0] = KP500000000 * (Te + T1l);
+				   T2b = FNMS(KP250000000, T1l, Te);
+				   T2n = T1u + T1w;
+				   T1x = T1u - T1w;
+				   T2d = FNMS(KP559016994, T2c, T2b);
+				   T2l = FMA(KP559016994, T2c, T2b);
+			      }
+			 }
+			 {
+			      E T1o, T1Y, T28, T2a, T1Z, T1r, T2t, T2x;
+			      {
+				   E T26, T2s, T2q, T27, T2r;
+				   T1I = T1C + T1H;
+				   T26 = T1H - T1C;
+				   Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T2k, T2d)));
+				   Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T2k, T2d));
+				   Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T2m, T2l)));
+				   Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T2m, T2l));
+				   T2s = T2o - T2p;
+				   T2q = T2o + T2p;
+				   T27 = T1S - T1N;
+				   T1T = T1N + T1S;
+				   T1o = T1m + T1n;
+				   T1Y = T1n - T1m;
+				   Rp[0] = KP500000000 * (T2n + T2q);
+				   T2r = FNMS(KP250000000, T2q, T2n);
+				   T28 = FMA(KP618033988, T27, T26);
+				   T2a = FNMS(KP618033988, T26, T27);
+				   T1Z = T1q - T1p;
+				   T1r = T1p + T1q;
+				   T2t = FNMS(KP559016994, T2s, T2r);
+				   T2x = FMA(KP559016994, T2s, T2r);
+			      }
+			      {
+				   E T24, T23, T1s, T25, T29;
+				   T1s = T1o + T1r;
+				   T24 = T1r - T1o;
+				   Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T2w, T2t));
+				   Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T2w, T2t));
+				   Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2y, T2x));
+				   Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T2y, T2x));
+				   Im[WS(rs, 4)] = KP500000000 * (T1s - T1t);
+				   T23 = FMA(KP250000000, T1s, T1t);
+				   T25 = FMA(KP559016994, T24, T23);
+				   T29 = FNMS(KP559016994, T24, T23);
+				   T22 = FNMS(KP618033988, T1Y, T1Z);
+				   T20 = FMA(KP618033988, T1Z, T1Y);
+				   Im[0] = -(KP500000000 * (FNMS(KP951056516, T28, T25)));
+				   Ip[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T28, T25));
+				   Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP951056516, T2a, T29)));
+				   Ip[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2a, T29));
+			      }
+			 }
+		    }
+	       }
+	       {
+		    E T1U, T1W, T1V, T21, T1X;
+		    T1U = T1I + T1T;
+		    T1W = T1I - T1T;
+		    Rm[WS(rs, 4)] = KP500000000 * (T1x + T1U);
+		    T1V = FNMS(KP250000000, T1U, T1x);
+		    T21 = FNMS(KP559016994, T1W, T1V);
+		    T1X = FMA(KP559016994, T1W, T1V);
+		    Rm[0] = KP500000000 * (FNMS(KP951056516, T20, T1X));
+		    Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T20, T1X));
+		    Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T22, T21));
+		    Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T22, T21));
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, {68, 38, 54, 0} };
+
+void X(codelet_hc2cfdft_10) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include hc2cf.h */
+
+/*
+ * This function contains 122 FP additions, 68 FP multiplications,
+ * (or, 92 additions, 38 multiplications, 30 fused multiply/add),
+ * 62 stack variables, 5 constants, and 40 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DK(KP125000000, +0.125000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP279508497, +0.279508497187473712051146708591409529430077295);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E Tw, TL, TM, T1W, T1X, T27, T1Z, T20, T26, TX, T1a, T1b, T1d, T1e, T1f;
+	       E T1q, T1t, T1u, T1x, T1A, T1B, T1g, T1h, T1i, Td, T25, T1k, T1F;
+	       {
+		    E T3, T1D, T19, T1z, T7, Tb, TR, T1v, Tm, T1o, TK, T1s, Tv, T1p, T12;
+		    E T1y, TF, T1r, TW, T1w;
+		    {
+			 E T1, T2, T18, T14, T15, T16, T13, T17;
+			 T1 = Ip[0];
+			 T2 = Im[0];
+			 T18 = T1 + T2;
+			 T14 = Rm[0];
+			 T15 = Rp[0];
+			 T16 = T14 - T15;
+			 T3 = T1 - T2;
+			 T1D = T15 + T14;
+			 T13 = W[0];
+			 T17 = W[1];
+			 T19 = FNMS(T17, T18, T13 * T16);
+			 T1z = FMA(T17, T16, T13 * T18);
+		    }
+		    {
+			 E T5, T6, TO, T9, Ta, TQ, TN, TP;
+			 T5 = Ip[WS(rs, 2)];
+			 T6 = Im[WS(rs, 2)];
+			 TO = T5 - T6;
+			 T9 = Rp[WS(rs, 2)];
+			 Ta = Rm[WS(rs, 2)];
+			 TQ = T9 + Ta;
+			 T7 = T5 + T6;
+			 Tb = T9 - Ta;
+			 TN = W[6];
+			 TP = W[7];
+			 TR = FNMS(TP, TQ, TN * TO);
+			 T1v = FMA(TP, TO, TN * TQ);
+		    }
+		    {
+			 E Th, TJ, Tl, TH;
+			 {
+			      E Tf, Tg, Tj, Tk;
+			      Tf = Ip[WS(rs, 1)];
+			      Tg = Im[WS(rs, 1)];
+			      Th = Tf - Tg;
+			      TJ = Tf + Tg;
+			      Tj = Rp[WS(rs, 1)];
+			      Tk = Rm[WS(rs, 1)];
+			      Tl = Tj + Tk;
+			      TH = Tj - Tk;
+			 }
+			 {
+			      E Te, Ti, TG, TI;
+			      Te = W[2];
+			      Ti = W[3];
+			      Tm = FNMS(Ti, Tl, Te * Th);
+			      T1o = FMA(Te, Tl, Ti * Th);
+			      TG = W[4];
+			      TI = W[5];
+			      TK = FMA(TG, TH, TI * TJ);
+			      T1s = FNMS(TI, TH, TG * TJ);
+			 }
+		    }
+		    {
+			 E Tq, TZ, Tu, T11;
+			 {
+			      E To, Tp, Ts, Tt;
+			      To = Ip[WS(rs, 3)];
+			      Tp = Im[WS(rs, 3)];
+			      Tq = To + Tp;
+			      TZ = To - Tp;
+			      Ts = Rp[WS(rs, 3)];
+			      Tt = Rm[WS(rs, 3)];
+			      Tu = Ts - Tt;
+			      T11 = Ts + Tt;
+			 }
+			 {
+			      E Tn, Tr, TY, T10;
+			      Tn = W[13];
+			      Tr = W[12];
+			      Tv = FMA(Tn, Tq, Tr * Tu);
+			      T1p = FNMS(Tn, Tu, Tr * Tq);
+			      TY = W[10];
+			      T10 = W[11];
+			      T12 = FNMS(T10, T11, TY * TZ);
+			      T1y = FMA(T10, TZ, TY * T11);
+			 }
+		    }
+		    {
+			 E TA, TV, TE, TT;
+			 {
+			      E Ty, Tz, TC, TD;
+			      Ty = Ip[WS(rs, 4)];
+			      Tz = Im[WS(rs, 4)];
+			      TA = Ty - Tz;
+			      TV = Ty + Tz;
+			      TC = Rp[WS(rs, 4)];
+			      TD = Rm[WS(rs, 4)];
+			      TE = TC + TD;
+			      TT = TC - TD;
+			 }
+			 {
+			      E Tx, TB, TS, TU;
+			      Tx = W[14];
+			      TB = W[15];
+			      TF = FNMS(TB, TE, Tx * TA);
+			      T1r = FMA(Tx, TE, TB * TA);
+			      TS = W[16];
+			      TU = W[17];
+			      TW = FMA(TS, TT, TU * TV);
+			      T1w = FNMS(TU, TT, TS * TV);
+			 }
+		    }
+		    Tw = Tm - Tv;
+		    TL = TF - TK;
+		    TM = Tw + TL;
+		    T1W = T1v + T1w;
+		    T1X = T1y + T1z;
+		    T27 = T1W + T1X;
+		    T1Z = T1o + T1p;
+		    T20 = T1s + T1r;
+		    T26 = T1Z + T20;
+		    TX = TR - TW;
+		    T1a = T12 + T19;
+		    T1b = TX + T1a;
+		    T1d = T19 - T12;
+		    T1e = TR + TW;
+		    T1f = T1d - T1e;
+		    T1q = T1o - T1p;
+		    T1t = T1r - T1s;
+		    T1u = T1q + T1t;
+		    T1x = T1v - T1w;
+		    T1A = T1y - T1z;
+		    T1B = T1x + T1A;
+		    T1g = Tm + Tv;
+		    T1h = TK + TF;
+		    T1i = T1g + T1h;
+		    {
+			 E Tc, T1E, T4, T8;
+			 T4 = W[9];
+			 T8 = W[8];
+			 Tc = FMA(T4, T7, T8 * Tb);
+			 T1E = FNMS(T4, Tb, T8 * T7);
+			 Td = T3 - Tc;
+			 T25 = T1D + T1E;
+			 T1k = Tc + T3;
+			 T1F = T1D - T1E;
+		    }
+	       }
+	       {
+		    E T1U, T1c, T1T, T22, T24, T1Y, T21, T23, T1V;
+		    T1U = KP279508497 * (TM - T1b);
+		    T1c = TM + T1b;
+		    T1T = FNMS(KP125000000, T1c, KP500000000 * Td);
+		    T1Y = T1W - T1X;
+		    T21 = T1Z - T20;
+		    T22 = FNMS(KP293892626, T21, KP475528258 * T1Y);
+		    T24 = FMA(KP475528258, T21, KP293892626 * T1Y);
+		    Ip[0] = KP500000000 * (Td + T1c);
+		    T23 = T1U + T1T;
+		    Ip[WS(rs, 4)] = T23 + T24;
+		    Im[WS(rs, 3)] = T24 - T23;
+		    T1V = T1T - T1U;
+		    Ip[WS(rs, 2)] = T1V + T22;
+		    Im[WS(rs, 1)] = T22 - T1V;
+	       }
+	       {
+		    E T2a, T28, T29, T2e, T2g, T2c, T2d, T2f, T2b;
+		    T2a = KP279508497 * (T26 - T27);
+		    T28 = T26 + T27;
+		    T29 = FNMS(KP125000000, T28, KP500000000 * T25);
+		    T2c = TX - T1a;
+		    T2d = Tw - TL;
+		    T2e = FNMS(KP293892626, T2d, KP475528258 * T2c);
+		    T2g = FMA(KP475528258, T2d, KP293892626 * T2c);
+		    Rp[0] = KP500000000 * (T25 + T28);
+		    T2f = T2a + T29;
+		    Rp[WS(rs, 4)] = T2f - T2g;
+		    Rm[WS(rs, 3)] = T2g + T2f;
+		    T2b = T29 - T2a;
+		    Rp[WS(rs, 2)] = T2b - T2e;
+		    Rm[WS(rs, 1)] = T2e + T2b;
+	       }
+	       {
+		    E T1M, T1j, T1L, T1Q, T1S, T1O, T1P, T1R, T1N;
+		    T1M = KP279508497 * (T1i + T1f);
+		    T1j = T1f - T1i;
+		    T1L = FMA(KP500000000, T1k, KP125000000 * T1j);
+		    T1O = T1A - T1x;
+		    T1P = T1q - T1t;
+		    T1Q = FNMS(KP475528258, T1P, KP293892626 * T1O);
+		    T1S = FMA(KP293892626, T1P, KP475528258 * T1O);
+		    Im[WS(rs, 4)] = KP500000000 * (T1j - T1k);
+		    T1R = T1L - T1M;
+		    Ip[WS(rs, 3)] = T1R + T1S;
+		    Im[WS(rs, 2)] = T1S - T1R;
+		    T1N = T1L + T1M;
+		    Ip[WS(rs, 1)] = T1N + T1Q;
+		    Im[0] = T1Q - T1N;
+	       }
+	       {
+		    E T1C, T1G, T1H, T1n, T1J, T1l, T1m, T1K, T1I;
+		    T1C = KP279508497 * (T1u - T1B);
+		    T1G = T1u + T1B;
+		    T1H = FNMS(KP125000000, T1G, KP500000000 * T1F);
+		    T1l = T1g - T1h;
+		    T1m = T1e + T1d;
+		    T1n = FMA(KP475528258, T1l, KP293892626 * T1m);
+		    T1J = FNMS(KP293892626, T1l, KP475528258 * T1m);
+		    Rm[WS(rs, 4)] = KP500000000 * (T1F + T1G);
+		    T1K = T1H - T1C;
+		    Rp[WS(rs, 3)] = T1J + T1K;
+		    Rm[WS(rs, 2)] = T1K - T1J;
+		    T1I = T1C + T1H;
+		    Rp[WS(rs, 1)] = T1n + T1I;
+		    Rm[0] = T1I - T1n;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, {92, 38, 30, 0} };
+
+void X(codelet_hc2cfdft_10) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include hc2cf.h */
+
+/*
+ * This function contains 142 FP additions, 92 FP multiplications,
+ * (or, 96 additions, 46 multiplications, 46 fused multiply/add),
+ * 71 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E T2z, T2M;
+	       {
+		    E To, T1E, T2H, T1m, T1W, Tl, T1J, T2i, T2K, T1B, T2I, T2e, T19, T2E, T2C;
+		    E T27, T1M, Tz, T2B, T1f, T1O, TJ, TT, T1Q;
+		    {
+			 E T2b, T1s, T1A, T2d;
+			 {
+			      E T1u, T1z, T1v, T2c, T1i, Te, T1l, Tj, Tf, T1H, T4, T1o, T1, T1r, T9;
+			      E T1n, T5;
+			      {
+				   E T1x, T1y, T1t, Tm, Tn;
+				   Tm = Ip[0];
+				   Tn = Im[0];
+				   T1x = Rp[0];
+				   T1y = Rm[0];
+				   T1t = W[0];
+				   T1u = Tm + Tn;
+				   To = Tm - Tn;
+				   {
+					E Th, Ti, Tb, Tc, Td;
+					Tc = Ip[WS(rs, 4)];
+					T1z = T1x - T1y;
+					T1E = T1x + T1y;
+					Td = Im[WS(rs, 4)];
+					T1v = T1t * T1u;
+					Th = Rp[WS(rs, 4)];
+					T2c = T1t * T1z;
+					T1i = Tc + Td;
+					Te = Tc - Td;
+					Ti = Rm[WS(rs, 4)];
+					Tb = W[14];
+					{
+					     E T7, T8, T2, T3;
+					     T2 = Ip[WS(rs, 2)];
+					     T1l = Th - Ti;
+					     Tj = Th + Ti;
+					     Tf = Tb * Te;
+					     T3 = Im[WS(rs, 2)];
+					     T7 = Rp[WS(rs, 2)];
+					     T1H = Tb * Tj;
+					     T8 = Rm[WS(rs, 2)];
+					     T4 = T2 - T3;
+					     T1o = T2 + T3;
+					     T1 = W[6];
+					     T1r = T7 - T8;
+					     T9 = T7 + T8;
+					     T1n = W[8];
+					     T5 = T1 * T4;
+					}
+				   }
+			      }
+			      {
+				   E T1F, T2a, T1p, T1h, T1k;
+				   T1F = T1 * T9;
+				   T2a = T1n * T1r;
+				   T1p = T1n * T1o;
+				   T1h = W[16];
+				   T1k = W[17];
+				   {
+					E T1G, Ta, Tk, T1I, T1q, T1w;
+					{
+					     E T6, Tg, T2G, T1j;
+					     T6 = W[7];
+					     Tg = W[15];
+					     T2G = T1h * T1l;
+					     T1j = T1h * T1i;
+					     T1G = FMA(T6, T4, T1F);
+					     Ta = FNMS(T6, T9, T5);
+					     T2H = FMA(T1k, T1i, T2G);
+					     T1m = FNMS(T1k, T1l, T1j);
+					     Tk = FNMS(Tg, Tj, Tf);
+					     T1I = FMA(Tg, Te, T1H);
+					}
+					T1q = W[9];
+					T1w = W[1];
+					T1W = Ta - Tk;
+					Tl = Ta + Tk;
+					T1J = T1G + T1I;
+					T2i = T1I - T1G;
+					T2b = FMA(T1q, T1o, T2a);
+					T1s = FNMS(T1q, T1r, T1p);
+					T1A = FNMS(T1w, T1z, T1v);
+					T2d = FMA(T1w, T1u, T2c);
+				   }
+			      }
+			 }
+			 {
+			      E T11, Tt, T10, TX, Ty, TZ, T23, T1b, TN, TS, T1e, T1P, TO, T17, TD;
+			      E T16, T13, T14, TI, TA;
+			      {
+				   E Tw, Tx, Tr, Ts, TK;
+				   Tr = Ip[WS(rs, 3)];
+				   Ts = Im[WS(rs, 3)];
+				   T2K = T1s - T1A;
+				   T1B = T1s + T1A;
+				   T2I = T2b + T2d;
+				   T2e = T2b - T2d;
+				   Tw = Rp[WS(rs, 3)];
+				   T11 = Tr + Ts;
+				   Tt = Tr - Ts;
+				   Tx = Rm[WS(rs, 3)];
+				   T10 = W[12];
+				   TX = W[13];
+				   {
+					E TL, TY, TM, TQ, TR;
+					TL = Ip[WS(rs, 1)];
+					Ty = Tw + Tx;
+					TY = Tx - Tw;
+					TM = Im[WS(rs, 1)];
+					TQ = Rp[WS(rs, 1)];
+					TR = Rm[WS(rs, 1)];
+					TZ = TX * TY;
+					T23 = T10 * TY;
+					T1b = TL + TM;
+					TN = TL - TM;
+					TS = TQ + TR;
+					T1e = TQ - TR;
+				   }
+				   TK = W[2];
+				   {
+					E TG, TH, TB, TC;
+					TB = Ip[WS(rs, 5)];
+					TC = Im[WS(rs, 5)];
+					TG = Rp[WS(rs, 5)];
+					T1P = TK * TS;
+					TO = TK * TN;
+					T17 = TB + TC;
+					TD = TB - TC;
+					TH = Rm[WS(rs, 5)];
+					T16 = W[20];
+					T13 = W[21];
+					T14 = TH - TG;
+					TI = TG + TH;
+					TA = W[18];
+				   }
+			      }
+			      {
+				   E T12, T1N, TE, T18, T24, T26, T25, T15;
+				   T12 = FMA(T10, T11, TZ);
+				   T15 = T13 * T14;
+				   T25 = T16 * T14;
+				   T1N = TA * TI;
+				   TE = TA * TD;
+				   T18 = FMA(T16, T17, T15);
+				   T24 = FNMS(TX, T11, T23);
+				   T26 = FNMS(T13, T17, T25);
+				   {
+					E Tv, T1L, Tu, Tq;
+					Tq = W[10];
+					T19 = T12 + T18;
+					T2E = T18 - T12;
+					Tv = W[11];
+					T2C = T24 + T26;
+					T27 = T24 - T26;
+					T1L = Tq * Ty;
+					Tu = Tq * Tt;
+					{
+					     E T1d, T2A, T1c, T1a, TF, TP;
+					     T1a = W[4];
+					     T1d = W[5];
+					     T1M = FMA(Tv, Tt, T1L);
+					     Tz = FNMS(Tv, Ty, Tu);
+					     T2A = T1a * T1e;
+					     T1c = T1a * T1b;
+					     TF = W[19];
+					     TP = W[3];
+					     T2B = FMA(T1d, T1b, T2A);
+					     T1f = FNMS(T1d, T1e, T1c);
+					     T1O = FMA(TF, TD, T1N);
+					     TJ = FNMS(TF, TI, TE);
+					     TT = FNMS(TP, TS, TO);
+					     T1Q = FMA(TP, TN, T1P);
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2h, T2D, T1Z, T2l, T2J, T22, T2k, T29, T30, T1U, T1V, T1Y, T2Z, T1T;
+			 {
+			      E T2Y, TW, T2V, T1D, T1K, T1S;
+			      {
+				   E Tp, T2W, TU, T1R, T2X, T1g, TV, T1C;
+				   T2h = FNMS(KP500000000, Tl, To);
+				   Tp = Tl + To;
+				   T2W = T2C - T2B;
+				   T2D = FMA(KP500000000, T2C, T2B);
+				   T1Z = TJ - TT;
+				   TU = TJ + TT;
+				   T1R = T1O + T1Q;
+				   T2l = T1Q - T1O;
+				   T2J = FNMS(KP500000000, T2I, T2H);
+				   T2X = T2H + T2I;
+				   T1g = T19 + T1f;
+				   T22 = FNMS(KP500000000, T19, T1f);
+				   T2k = FNMS(KP500000000, TU, Tz);
+				   TV = Tz + TU;
+				   T1C = T1m + T1B;
+				   T29 = FNMS(KP500000000, T1B, T1m);
+				   T2Y = T2W - T2X;
+				   T30 = T2W + T2X;
+				   TW = Tp - TV;
+				   T2V = TV + Tp;
+				   T1U = T1g + T1C;
+				   T1D = T1g - T1C;
+				   T1V = FNMS(KP500000000, T1J, T1E);
+				   T1K = T1E + T1J;
+				   T1S = T1M + T1R;
+				   T1Y = FNMS(KP500000000, T1R, T1M);
+			      }
+			      Ip[WS(rs, 3)] = KP500000000 * (TW + T1D);
+			      Im[WS(rs, 2)] = KP500000000 * (T1D - TW);
+			      Im[WS(rs, 5)] = KP500000000 * (T2Y - T2V);
+			      T2Z = T1K - T1S;
+			      T1T = T1K + T1S;
+			      Ip[0] = KP500000000 * (T2V + T2Y);
+			 }
+			 {
+			      E T2v, T1X, T2Q, T2F, T2R, T2L, T2w, T20, T2t, T28, T2p, T2j;
+			      Rm[WS(rs, 2)] = KP500000000 * (T2Z + T30);
+			      Rp[WS(rs, 3)] = KP500000000 * (T2Z - T30);
+			      Rp[0] = KP500000000 * (T1T + T1U);
+			      Rm[WS(rs, 5)] = KP500000000 * (T1T - T1U);
+			      T2v = FMA(KP866025403, T1W, T1V);
+			      T1X = FNMS(KP866025403, T1W, T1V);
+			      T2Q = FMA(KP866025403, T2E, T2D);
+			      T2F = FNMS(KP866025403, T2E, T2D);
+			      T2R = FMA(KP866025403, T2K, T2J);
+			      T2L = FNMS(KP866025403, T2K, T2J);
+			      T2w = FMA(KP866025403, T1Z, T1Y);
+			      T20 = FNMS(KP866025403, T1Z, T1Y);
+			      T2t = FMA(KP866025403, T27, T22);
+			      T28 = FNMS(KP866025403, T27, T22);
+			      T2p = FMA(KP866025403, T2i, T2h);
+			      T2j = FNMS(KP866025403, T2i, T2h);
+			      {
+				   E T2T, T2q, T2s, T2U;
+				   {
+					E T21, T2f, T2S, T2n, T2P, T2m, T2o, T2g;
+					T2T = T1X - T20;
+					T21 = T1X + T20;
+					T2q = FMA(KP866025403, T2l, T2k);
+					T2m = FNMS(KP866025403, T2l, T2k);
+					T2s = FMA(KP866025403, T2e, T29);
+					T2f = FNMS(KP866025403, T2e, T29);
+					T2S = T2Q + T2R;
+					T2U = T2R - T2Q;
+					T2n = T2j - T2m;
+					T2P = T2m + T2j;
+					T2o = T2f - T28;
+					T2g = T28 + T2f;
+					Im[WS(rs, 3)] = KP500000000 * (T2S - T2P);
+					Ip[WS(rs, 2)] = KP500000000 * (T2P + T2S);
+					Rm[WS(rs, 3)] = KP500000000 * (T21 + T2g);
+					Rp[WS(rs, 2)] = KP500000000 * (T21 - T2g);
+					Ip[WS(rs, 5)] = KP500000000 * (T2n + T2o);
+					Im[0] = KP500000000 * (T2o - T2n);
+				   }
+				   {
+					E T2y, T2x, T2N, T2O, T2r, T2u;
+					T2z = T2q + T2p;
+					T2r = T2p - T2q;
+					T2u = T2s - T2t;
+					T2y = T2t + T2s;
+					T2x = T2v + T2w;
+					T2N = T2v - T2w;
+					Rp[WS(rs, 5)] = KP500000000 * (T2T + T2U);
+					Rm[0] = KP500000000 * (T2T - T2U);
+					Im[WS(rs, 4)] = KP500000000 * (T2u - T2r);
+					Ip[WS(rs, 1)] = KP500000000 * (T2r + T2u);
+					T2O = T2L - T2F;
+					T2M = T2F + T2L;
+					Rp[WS(rs, 1)] = KP500000000 * (T2N + T2O);
+					Rm[WS(rs, 4)] = KP500000000 * (T2N - T2O);
+					Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
+					Rm[WS(rs, 1)] = KP500000000 * (T2x - T2y);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[WS(rs, 1)] = -(KP500000000 * (T2z + T2M));
+	       Ip[WS(rs, 4)] = KP500000000 * (T2z - T2M);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, {96, 46, 46, 0} };
+
+void X(codelet_hc2cfdft_12) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include hc2cf.h */
+
+/*
+ * This function contains 142 FP additions, 76 FP multiplications,
+ * (or, 112 additions, 46 multiplications, 30 fused multiply/add),
+ * 52 stack variables, 3 constants, and 48 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP433012701, +0.433012701892219323381861585376468091735701313);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
+	       E Tm, T1t, T1d, T2j, Tj, T1Y, T1w, T1G, T1q, T2q, T1U, T2k, Tw, T1y, T17;
+	       E T2g, TP, T21, T1B, T1J, T12, T2u, T1P, T2h;
+	       {
+		    E Tk, Tl, T1k, T1m, T1n, T1o, T4, T1f, T8, T1h, Th, T1c, Td, T1a, T19;
+		    E T1b;
+		    {
+			 E T2, T3, T6, T7;
+			 Tk = Ip[0];
+			 Tl = Im[0];
+			 T1k = Tk + Tl;
+			 T1m = Rp[0];
+			 T1n = Rm[0];
+			 T1o = T1m - T1n;
+			 T2 = Ip[WS(rs, 2)];
+			 T3 = Im[WS(rs, 2)];
+			 T4 = T2 - T3;
+			 T1f = T2 + T3;
+			 T6 = Rp[WS(rs, 2)];
+			 T7 = Rm[WS(rs, 2)];
+			 T8 = T6 + T7;
+			 T1h = T6 - T7;
+			 {
+			      E Tf, Tg, Tb, Tc;
+			      Tf = Rp[WS(rs, 4)];
+			      Tg = Rm[WS(rs, 4)];
+			      Th = Tf + Tg;
+			      T1c = Tf - Tg;
+			      Tb = Ip[WS(rs, 4)];
+			      Tc = Im[WS(rs, 4)];
+			      Td = Tb - Tc;
+			      T1a = Tb + Tc;
+			 }
+		    }
+		    Tm = Tk - Tl;
+		    T1t = T1m + T1n;
+		    T19 = W[16];
+		    T1b = W[17];
+		    T1d = FNMS(T1b, T1c, T19 * T1a);
+		    T2j = FMA(T19, T1c, T1b * T1a);
+		    {
+			 E T9, T1u, Ti, T1v;
+			 {
+			      E T1, T5, Ta, Te;
+			      T1 = W[6];
+			      T5 = W[7];
+			      T9 = FNMS(T5, T8, T1 * T4);
+			      T1u = FMA(T1, T8, T5 * T4);
+			      Ta = W[14];
+			      Te = W[15];
+			      Ti = FNMS(Te, Th, Ta * Td);
+			      T1v = FMA(Ta, Th, Te * Td);
+			 }
+			 Tj = T9 + Ti;
+			 T1Y = KP433012701 * (T1v - T1u);
+			 T1w = T1u + T1v;
+			 T1G = KP433012701 * (T9 - Ti);
+		    }
+		    {
+			 E T1i, T1S, T1p, T1T;
+			 {
+			      E T1e, T1g, T1j, T1l;
+			      T1e = W[8];
+			      T1g = W[9];
+			      T1i = FNMS(T1g, T1h, T1e * T1f);
+			      T1S = FMA(T1e, T1h, T1g * T1f);
+			      T1j = W[0];
+			      T1l = W[1];
+			      T1p = FNMS(T1l, T1o, T1j * T1k);
+			      T1T = FMA(T1j, T1o, T1l * T1k);
+			 }
+			 T1q = T1i + T1p;
+			 T2q = KP433012701 * (T1i - T1p);
+			 T1U = KP433012701 * (T1S - T1T);
+			 T2k = T1S + T1T;
+		    }
+	       }
+	       {
+		    E Tr, TT, Tv, TV, TA, TY, TE, T10, TN, T14, TJ, T16;
+		    {
+			 E Tp, Tq, TC, TD;
+			 Tp = Ip[WS(rs, 3)];
+			 Tq = Im[WS(rs, 3)];
+			 Tr = Tp - Tq;
+			 TT = Tp + Tq;
+			 {
+			      E Tt, Tu, Ty, Tz;
+			      Tt = Rp[WS(rs, 3)];
+			      Tu = Rm[WS(rs, 3)];
+			      Tv = Tt + Tu;
+			      TV = Tt - Tu;
+			      Ty = Ip[WS(rs, 5)];
+			      Tz = Im[WS(rs, 5)];
+			      TA = Ty - Tz;
+			      TY = Ty + Tz;
+			 }
+			 TC = Rp[WS(rs, 5)];
+			 TD = Rm[WS(rs, 5)];
+			 TE = TC + TD;
+			 T10 = TC - TD;
+			 {
+			      E TL, TM, TH, TI;
+			      TL = Rp[WS(rs, 1)];
+			      TM = Rm[WS(rs, 1)];
+			      TN = TL + TM;
+			      T14 = TM - TL;
+			      TH = Ip[WS(rs, 1)];
+			      TI = Im[WS(rs, 1)];
+			      TJ = TH - TI;
+			      T16 = TH + TI;
+			 }
+		    }
+		    {
+			 E To, Ts, T13, T15;
+			 To = W[10];
+			 Ts = W[11];
+			 Tw = FNMS(Ts, Tv, To * Tr);
+			 T1y = FMA(To, Tv, Ts * Tr);
+			 T13 = W[5];
+			 T15 = W[4];
+			 T17 = FMA(T13, T14, T15 * T16);
+			 T2g = FNMS(T13, T16, T15 * T14);
+		    }
+		    {
+			 E TF, T1z, TO, T1A;
+			 {
+			      E Tx, TB, TG, TK;
+			      Tx = W[18];
+			      TB = W[19];
+			      TF = FNMS(TB, TE, Tx * TA);
+			      T1z = FMA(Tx, TE, TB * TA);
+			      TG = W[2];
+			      TK = W[3];
+			      TO = FNMS(TK, TN, TG * TJ);
+			      T1A = FMA(TG, TN, TK * TJ);
+			 }
+			 TP = TF + TO;
+			 T21 = KP433012701 * (T1A - T1z);
+			 T1B = T1z + T1A;
+			 T1J = KP433012701 * (TF - TO);
+		    }
+		    {
+			 E TW, T1O, T11, T1N;
+			 {
+			      E TS, TU, TX, TZ;
+			      TS = W[12];
+			      TU = W[13];
+			      TW = FNMS(TU, TV, TS * TT);
+			      T1O = FMA(TS, TV, TU * TT);
+			      TX = W[20];
+			      TZ = W[21];
+			      T11 = FNMS(TZ, T10, TX * TY);
+			      T1N = FMA(TX, T10, TZ * TY);
+			 }
+			 T12 = TW + T11;
+			 T2u = KP433012701 * (T11 - TW);
+			 T1P = KP433012701 * (T1N - T1O);
+			 T2h = T1O + T1N;
+		    }
+	       }
+	       {
+		    E TR, T2f, T2m, T2o, T1s, T1E, T1D, T2n;
+		    {
+			 E Tn, TQ, T2i, T2l;
+			 Tn = Tj + Tm;
+			 TQ = Tw + TP;
+			 TR = Tn - TQ;
+			 T2f = TQ + Tn;
+			 T2i = T2g - T2h;
+			 T2l = T2j + T2k;
+			 T2m = T2i - T2l;
+			 T2o = T2i + T2l;
+		    }
+		    {
+			 E T18, T1r, T1x, T1C;
+			 T18 = T12 + T17;
+			 T1r = T1d + T1q;
+			 T1s = T18 - T1r;
+			 T1E = T18 + T1r;
+			 T1x = T1t + T1w;
+			 T1C = T1y + T1B;
+			 T1D = T1x + T1C;
+			 T2n = T1x - T1C;
+		    }
+		    Ip[WS(rs, 3)] = KP500000000 * (TR + T1s);
+		    Rp[WS(rs, 3)] = KP500000000 * (T2n - T2o);
+		    Im[WS(rs, 2)] = KP500000000 * (T1s - TR);
+		    Rm[WS(rs, 2)] = KP500000000 * (T2n + T2o);
+		    Rm[WS(rs, 5)] = KP500000000 * (T1D - T1E);
+		    Im[WS(rs, 5)] = KP500000000 * (T2m - T2f);
+		    Rp[0] = KP500000000 * (T1D + T1E);
+		    Ip[0] = KP500000000 * (T2f + T2m);
+	       }
+	       {
+		    E T1H, T2b, T2s, T2B, T2v, T2A, T1K, T2c, T1Q, T29, T1Z, T25, T22, T26, T1V;
+		    E T28;
+		    {
+			 E T1F, T2r, T2t, T1I;
+			 T1F = FNMS(KP250000000, T1w, KP500000000 * T1t);
+			 T1H = T1F - T1G;
+			 T2b = T1F + T1G;
+			 T2r = FNMS(KP500000000, T2j, KP250000000 * T2k);
+			 T2s = T2q - T2r;
+			 T2B = T2q + T2r;
+			 T2t = FMA(KP250000000, T2h, KP500000000 * T2g);
+			 T2v = T2t - T2u;
+			 T2A = T2u + T2t;
+			 T1I = FNMS(KP250000000, T1B, KP500000000 * T1y);
+			 T1K = T1I - T1J;
+			 T2c = T1I + T1J;
+		    }
+		    {
+			 E T1M, T1X, T20, T1R;
+			 T1M = FNMS(KP250000000, T12, KP500000000 * T17);
+			 T1Q = T1M - T1P;
+			 T29 = T1P + T1M;
+			 T1X = FNMS(KP250000000, Tj, KP500000000 * Tm);
+			 T1Z = T1X - T1Y;
+			 T25 = T1Y + T1X;
+			 T20 = FNMS(KP250000000, TP, KP500000000 * Tw);
+			 T22 = T20 - T21;
+			 T26 = T21 + T20;
+			 T1R = FNMS(KP250000000, T1q, KP500000000 * T1d);
+			 T1V = T1R - T1U;
+			 T28 = T1R + T1U;
+		    }
+		    {
+			 E T1L, T1W, T2p, T2w;
+			 T1L = T1H + T1K;
+			 T1W = T1Q + T1V;
+			 Rp[WS(rs, 2)] = T1L - T1W;
+			 Rm[WS(rs, 3)] = T1L + T1W;
+			 T2p = T22 + T1Z;
+			 T2w = T2s - T2v;
+			 Ip[WS(rs, 2)] = T2p + T2w;
+			 Im[WS(rs, 3)] = T2w - T2p;
+		    }
+		    {
+			 E T23, T24, T2x, T2y;
+			 T23 = T1Z - T22;
+			 T24 = T1V - T1Q;
+			 Ip[WS(rs, 5)] = T23 + T24;
+			 Im[0] = T24 - T23;
+			 T2x = T1H - T1K;
+			 T2y = T2v + T2s;
+			 Rm[0] = T2x - T2y;
+			 Rp[WS(rs, 5)] = T2x + T2y;
+		    }
+		    {
+			 E T27, T2a, T2z, T2C;
+			 T27 = T25 - T26;
+			 T2a = T28 - T29;
+			 Ip[WS(rs, 1)] = T27 + T2a;
+			 Im[WS(rs, 4)] = T2a - T27;
+			 T2z = T2b - T2c;
+			 T2C = T2A - T2B;
+			 Rm[WS(rs, 4)] = T2z - T2C;
+			 Rp[WS(rs, 1)] = T2z + T2C;
+		    }
+		    {
+			 E T2d, T2e, T2D, T2E;
+			 T2d = T2b + T2c;
+			 T2e = T29 + T28;
+			 Rm[WS(rs, 1)] = T2d - T2e;
+			 Rp[WS(rs, 4)] = T2d + T2e;
+			 T2D = T26 + T25;
+			 T2E = T2A + T2B;
+			 Ip[WS(rs, 4)] = T2D + T2E;
+			 Im[WS(rs, 1)] = T2E - T2D;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, {112, 46, 30, 0} };
+
+void X(codelet_hc2cfdft_12) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,896 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include hc2cf.h */
+
+/*
+ * This function contains 206 FP additions, 132 FP multiplications,
+ * (or, 136 additions, 62 multiplications, 70 fused multiply/add),
+ * 96 stack variables, 4 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T4d, T4g;
+	       {
+		    E T1f, T2e, T3D, T1K, T2g, T1c, T3H, T2W, T2j, TR, T3E, T2R, T2l, T11, T3G;
+		    E T1v, T3p, T2s, Tl, T3o, T3w, T2G, T3z, T1Y, T23, T20, T2H, T21, T29, Tz;
+		    E T26, TE, TA, T2v, T2J, T27, Tv, T2u, TB, T22, T28;
+		    {
+			 E T1o, T1u, T2T, T2V;
+			 {
+			      E T1I, T1A, T16, T1C, T1H, T1G, T2U, T1z, T1b, T1x, T1w;
+			      {
+				   E T1d, T1e, T14, T15;
+				   T1d = Ip[0];
+				   T1e = Im[0];
+				   T14 = Ip[WS(rs, 4)];
+				   T15 = Im[WS(rs, 4)];
+				   {
+					E T1F, T1D, T1E, T19, T1a;
+					T1D = Rm[0];
+					T1I = T1d + T1e;
+					T1f = T1d - T1e;
+					T1E = Rp[0];
+					T1A = T14 + T15;
+					T16 = T14 - T15;
+					T1C = W[0];
+					T2e = T1E + T1D;
+					T1F = T1D - T1E;
+					T1H = W[1];
+					T19 = Rp[WS(rs, 4)];
+					T1a = Rm[WS(rs, 4)];
+					T1G = T1C * T1F;
+					T2U = T1H * T1F;
+					T1z = W[17];
+					T1b = T19 + T1a;
+					T1x = T1a - T19;
+					T1w = W[16];
+				   }
+			      }
+			      {
+				   E T2S, T1y, T13, T18;
+				   T2S = T1z * T1x;
+				   T1y = T1w * T1x;
+				   T13 = W[14];
+				   T18 = W[15];
+				   {
+					E T1J, T1B, T2f, T17;
+					T1J = FNMS(T1H, T1I, T1G);
+					T1B = FNMS(T1z, T1A, T1y);
+					T2f = T13 * T1b;
+					T17 = T13 * T16;
+					T2T = FMA(T1w, T1A, T2S);
+					T3D = T1J - T1B;
+					T1K = T1B + T1J;
+					T2g = FMA(T18, T16, T2f);
+					T1c = FNMS(T18, T1b, T17);
+					T2V = FMA(T1C, T1I, T2U);
+				   }
+			      }
+			 }
+			 {
+			      E T1n, TL, T1m, T1j, TQ, T1l, T2N, TV, T1t, T10, T1q, T1s, T1p, T1r, T2O;
+			      E T2Q;
+			      {
+				   E TO, TP, TJ, TK;
+				   TJ = Ip[WS(rs, 2)];
+				   TK = Im[WS(rs, 2)];
+				   TO = Rp[WS(rs, 2)];
+				   T3H = T2V - T2T;
+				   T2W = T2T + T2V;
+				   T1n = TJ + TK;
+				   TL = TJ - TK;
+				   TP = Rm[WS(rs, 2)];
+				   T1m = W[9];
+				   T1j = W[8];
+				   {
+					E TT, T1k, TU, TY, TZ;
+					TT = Ip[WS(rs, 6)];
+					TQ = TO + TP;
+					T1k = TP - TO;
+					TU = Im[WS(rs, 6)];
+					TY = Rp[WS(rs, 6)];
+					TZ = Rm[WS(rs, 6)];
+					T1l = T1j * T1k;
+					T2N = T1m * T1k;
+					TV = TT - TU;
+					T1t = TT + TU;
+					T10 = TY + TZ;
+					T1q = TZ - TY;
+					T1s = W[25];
+					T1p = W[24];
+				   }
+			      }
+			      {
+				   E TN, T2P, T2i, TM, TI;
+				   TI = W[6];
+				   TN = W[7];
+				   T2P = T1s * T1q;
+				   T1r = T1p * T1q;
+				   T2i = TI * TQ;
+				   TM = TI * TL;
+				   T2O = FMA(T1j, T1n, T2N);
+				   T2Q = FMA(T1p, T1t, T2P);
+				   T2j = FMA(TN, TL, T2i);
+				   TR = FNMS(TN, TQ, TM);
+			      }
+			      {
+				   E TX, T2k, TW, TS;
+				   TS = W[22];
+				   T3E = T2O - T2Q;
+				   T2R = T2O + T2Q;
+				   TX = W[23];
+				   T2k = TS * T10;
+				   TW = TS * TV;
+				   T1o = FNMS(T1m, T1n, T1l);
+				   T1u = FNMS(T1s, T1t, T1r);
+				   T2l = FMA(TX, TV, T2k);
+				   T11 = FNMS(TX, T10, TW);
+			      }
+			 }
+			 {
+			      E T1Q, T1N, T2C, T1O, T1W, Te, T1T, Tj, Tf, T2q, T2E, T1U, Ta, T2p, Tg;
+			      E T1P, T1V;
+			      {
+				   E T4, T9, T5, T2o, Tb, T1S, T1, T1M, T6;
+				   {
+					E T2, T3, T7, T8;
+					T2 = Ip[WS(rs, 1)];
+					T3G = T1o - T1u;
+					T1v = T1o + T1u;
+					T3 = Im[WS(rs, 1)];
+					T7 = Rp[WS(rs, 1)];
+					T8 = Rm[WS(rs, 1)];
+					T1 = W[2];
+					T1Q = T2 + T3;
+					T4 = T2 - T3;
+					T1N = T7 - T8;
+					T9 = T7 + T8;
+					T1M = W[4];
+					T5 = T1 * T4;
+				   }
+				   {
+					E Tc, Td, Th, Ti;
+					Tc = Ip[WS(rs, 5)];
+					T2o = T1 * T9;
+					T2C = T1M * T1Q;
+					T1O = T1M * T1N;
+					Td = Im[WS(rs, 5)];
+					Th = Rp[WS(rs, 5)];
+					Ti = Rm[WS(rs, 5)];
+					Tb = W[18];
+					T1W = Tc + Td;
+					Te = Tc - Td;
+					T1T = Th - Ti;
+					Tj = Th + Ti;
+					T1S = W[20];
+					Tf = Tb * Te;
+				   }
+				   T6 = W[3];
+				   T2q = Tb * Tj;
+				   T2E = T1S * T1W;
+				   T1U = T1S * T1T;
+				   Ta = FNMS(T6, T9, T5);
+				   T2p = FMA(T6, T4, T2o);
+				   Tg = W[19];
+				   T1P = W[5];
+				   T1V = W[21];
+			      }
+			      {
+				   E Tp, Tu, Tq, T2t, Tw, T25, Tm, T1Z, Tr;
+				   {
+					E Tn, To, Ts, Tt, T2r, Tk;
+					Tn = Ip[WS(rs, 7)];
+					T2r = FMA(Tg, Te, T2q);
+					Tk = FNMS(Tg, Tj, Tf);
+					{
+					     E T2D, T1R, T2F, T1X;
+					     T2D = FNMS(T1P, T1N, T2C);
+					     T1R = FMA(T1P, T1Q, T1O);
+					     T2F = FNMS(T1V, T1T, T2E);
+					     T1X = FMA(T1V, T1W, T1U);
+					     T3p = T2p - T2r;
+					     T2s = T2p + T2r;
+					     Tl = Ta + Tk;
+					     T3o = Ta - Tk;
+					     T3w = T2F - T2D;
+					     T2G = T2D + T2F;
+					     T3z = T1X - T1R;
+					     T1Y = T1R + T1X;
+					     To = Im[WS(rs, 7)];
+					}
+					Ts = Rp[WS(rs, 7)];
+					Tt = Rm[WS(rs, 7)];
+					Tm = W[26];
+					T23 = Tn + To;
+					Tp = Tn - To;
+					T20 = Ts - Tt;
+					Tu = Ts + Tt;
+					T1Z = W[28];
+					Tq = Tm * Tp;
+				   }
+				   {
+					E Tx, Ty, TC, TD;
+					Tx = Ip[WS(rs, 3)];
+					T2t = Tm * Tu;
+					T2H = T1Z * T23;
+					T21 = T1Z * T20;
+					Ty = Im[WS(rs, 3)];
+					TC = Rp[WS(rs, 3)];
+					TD = Rm[WS(rs, 3)];
+					Tw = W[10];
+					T29 = Tx + Ty;
+					Tz = Tx - Ty;
+					T26 = TC - TD;
+					TE = TC + TD;
+					T25 = W[12];
+					TA = Tw * Tz;
+				   }
+				   Tr = W[27];
+				   T2v = Tw * TE;
+				   T2J = T25 * T29;
+				   T27 = T25 * T26;
+				   Tv = FNMS(Tr, Tu, Tq);
+				   T2u = FMA(Tr, Tp, T2t);
+				   TB = W[11];
+				   T22 = W[29];
+				   T28 = W[13];
+			      }
+			 }
+		    }
+		    {
+			 E T3r, T3s, T3A, T3x, T3M, T3l, T3L, T3m, T3f, T3i;
+			 {
+			      E T3c, TH, T36, T3g, T3h, T39, T32, T1h, T2A, T2d, T2h, T31, T2y, T30, T2Y;
+			      E T2m, T2B, T1i;
+			      {
+				   E T2x, T2M, T1L, T2c, T2X, T12, T1g;
+				   {
+					E TG, T2b, T34, T2L, T2w, TF, T37, T38, T35;
+					T2w = FMA(TB, Tz, T2v);
+					TF = FNMS(TB, TE, TA);
+					{
+					     E T2I, T24, T2K, T2a;
+					     T2I = FNMS(T22, T20, T2H);
+					     T24 = FMA(T22, T23, T21);
+					     T2K = FNMS(T28, T26, T2J);
+					     T2a = FMA(T28, T29, T27);
+					     T3r = T2u - T2w;
+					     T2x = T2u + T2w;
+					     TG = Tv + TF;
+					     T3s = Tv - TF;
+					     T2L = T2I + T2K;
+					     T3A = T2I - T2K;
+					     T3x = T2a - T24;
+					     T2b = T24 + T2a;
+					}
+					T2M = T2G + T2L;
+					T34 = T2L - T2G;
+					T37 = T1K - T1v;
+					T1L = T1v + T1K;
+					T2c = T1Y + T2b;
+					T35 = T1Y - T2b;
+					T3c = Tl - TG;
+					TH = Tl + TG;
+					T38 = T2W - T2R;
+					T2X = T2R + T2W;
+					T36 = T34 + T35;
+					T3g = T34 - T35;
+					T3M = TR - T11;
+					T12 = TR + T11;
+					T3h = T37 + T38;
+					T39 = T37 - T38;
+					T1g = T1c + T1f;
+					T3l = T1f - T1c;
+				   }
+				   T32 = T1g - T12;
+				   T1h = T12 + T1g;
+				   T2A = T2c + T1L;
+				   T2d = T1L - T2c;
+				   T3L = T2e - T2g;
+				   T2h = T2e + T2g;
+				   T31 = T2x - T2s;
+				   T2y = T2s + T2x;
+				   T30 = T2M + T2X;
+				   T2Y = T2M - T2X;
+				   T2m = T2j + T2l;
+				   T3m = T2j - T2l;
+			      }
+			      T2B = T1h - TH;
+			      T1i = TH + T1h;
+			      {
+				   E T3e, T3d, T3j, T3k;
+				   {
+					E T33, T3b, T2z, T2Z, T3a, T2n;
+					T3f = T32 - T31;
+					T33 = T31 + T32;
+					T3b = T2h - T2m;
+					T2n = T2h + T2m;
+					Im[WS(rs, 7)] = KP500000000 * (T2d - T1i);
+					Ip[0] = KP500000000 * (T1i + T2d);
+					Im[WS(rs, 3)] = KP500000000 * (T2Y - T2B);
+					Ip[WS(rs, 4)] = KP500000000 * (T2B + T2Y);
+					T2z = T2n - T2y;
+					T2Z = T2n + T2y;
+					T3a = T36 + T39;
+					T3e = T39 - T36;
+					T3d = T3b - T3c;
+					T3j = T3b + T3c;
+					Rp[WS(rs, 4)] = KP500000000 * (T2z + T2A);
+					Rm[WS(rs, 3)] = KP500000000 * (T2z - T2A);
+					Rp[0] = KP500000000 * (T2Z + T30);
+					Rm[WS(rs, 7)] = KP500000000 * (T2Z - T30);
+					Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3a, T33)));
+					Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3a, T33));
+					T3k = T3g + T3h;
+					T3i = T3g - T3h;
+				   }
+				   Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3k, T3j));
+				   Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3k, T3j));
+				   Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3e, T3d));
+				   Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3e, T3d));
+			      }
+			 }
+			 {
+			      E T3Z, T3n, T3F, T3I, T4e, T44, T4f, T47, T4a, T3u, T3U, T3C, T49, T3N, T40;
+			      E T3Q;
+			      {
+				   E T3y, T3B, T3O, T3q, T3t, T3P;
+				   {
+					E T42, T43, T45, T46;
+					T3y = T3w + T3x;
+					T42 = T3w - T3x;
+					Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3i, T3f)));
+					Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3i, T3f));
+					T3Z = T3m + T3l;
+					T3n = T3l - T3m;
+					T43 = T3A - T3z;
+					T3B = T3z + T3A;
+					T3F = T3D - T3E;
+					T45 = T3E + T3D;
+					T46 = T3H - T3G;
+					T3I = T3G + T3H;
+					T3O = T3p + T3o;
+					T3q = T3o - T3p;
+					T4e = FNMS(KP414213562, T42, T43);
+					T44 = FMA(KP414213562, T43, T42);
+					T4f = FNMS(KP414213562, T45, T46);
+					T47 = FMA(KP414213562, T46, T45);
+					T3t = T3r + T3s;
+					T3P = T3r - T3s;
+				   }
+				   T4a = T3q - T3t;
+				   T3u = T3q + T3t;
+				   T3U = FNMS(KP414213562, T3y, T3B);
+				   T3C = FMA(KP414213562, T3B, T3y);
+				   T49 = T3L - T3M;
+				   T3N = T3L + T3M;
+				   T40 = T3P - T3O;
+				   T3Q = T3O + T3P;
+			      }
+			      {
+				   E T3T, T3v, T3X, T3R, T3J, T3V;
+				   T3T = FNMS(KP707106781, T3u, T3n);
+				   T3v = FMA(KP707106781, T3u, T3n);
+				   T3X = FMA(KP707106781, T3Q, T3N);
+				   T3R = FNMS(KP707106781, T3Q, T3N);
+				   T3J = FNMS(KP414213562, T3I, T3F);
+				   T3V = FMA(KP414213562, T3F, T3I);
+				   {
+					E T4c, T4b, T4h, T4i, T41, T48;
+					T4d = FMA(KP707106781, T40, T3Z);
+					T41 = FNMS(KP707106781, T40, T3Z);
+					T48 = T44 - T47;
+					T4c = T44 + T47;
+					{
+					     E T3Y, T3W, T3K, T3S;
+					     T3Y = T3U + T3V;
+					     T3W = T3U - T3V;
+					     T3K = T3C + T3J;
+					     T3S = T3J - T3C;
+					     Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3T)));
+					     Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3W, T3T));
+					     Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3Y, T3X));
+					     Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T3Y, T3X));
+					     Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3S, T3R));
+					     Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T3S, T3R));
+					     Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3K, T3v)));
+					     Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3K, T3v));
+					     Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T48, T41));
+					     Im[0] = -(KP500000000 * (FNMS(KP923879532, T48, T41)));
+					}
+					T4b = FMA(KP707106781, T4a, T49);
+					T4h = FNMS(KP707106781, T4a, T49);
+					T4i = T4e + T4f;
+					T4g = T4e - T4f;
+					Rm[0] = KP500000000 * (FMA(KP923879532, T4i, T4h));
+					Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4i, T4h));
+					Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4c, T4b));
+					Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4c, T4b));
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4g, T4d)));
+	       Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4g, T4d));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, {136, 62, 70, 0} };
+
+void X(codelet_hc2cfdft_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include hc2cf.h */
+
+/*
+ * This function contains 206 FP additions, 100 FP multiplications,
+ * (or, 168 additions, 62 multiplications, 38 fused multiply/add),
+ * 61 stack variables, 4 constants, and 64 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP461939766, +0.461939766255643378064091594698394143411208313);
+     DK(KP191341716, +0.191341716182544885864229992015199433380672281);
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T19, T3h, T21, T2Y, T1o, T3d, T2s, T39, TW, T3i, T24, T2Z, T1z, T3c, T2p;
+	       E T3a, Tj, T2S, T28, T2R, T1L, T36, T2i, T32, TC, T2V, T2b, T2U, T1W, T35;
+	       E T2l, T33;
+	       {
+		    E T10, T1m, T14, T1k, T18, T1h, T1f, T1Z;
+		    {
+			 E TY, TZ, T12, T13;
+			 TY = Ip[WS(rs, 4)];
+			 TZ = Im[WS(rs, 4)];
+			 T10 = TY - TZ;
+			 T1m = TY + TZ;
+			 T12 = Rp[WS(rs, 4)];
+			 T13 = Rm[WS(rs, 4)];
+			 T14 = T12 + T13;
+			 T1k = T12 - T13;
+		    }
+		    {
+			 E T16, T17, T1d, T1e;
+			 T16 = Ip[0];
+			 T17 = Im[0];
+			 T18 = T16 - T17;
+			 T1h = T16 + T17;
+			 T1d = Rm[0];
+			 T1e = Rp[0];
+			 T1f = T1d - T1e;
+			 T1Z = T1e + T1d;
+		    }
+		    {
+			 E T15, T20, TX, T11;
+			 TX = W[14];
+			 T11 = W[15];
+			 T15 = FNMS(T11, T14, TX * T10);
+			 T20 = FMA(TX, T14, T11 * T10);
+			 T19 = T15 + T18;
+			 T3h = T1Z - T20;
+			 T21 = T1Z + T20;
+			 T2Y = T18 - T15;
+		    }
+		    {
+			 E T1i, T2r, T1n, T2q;
+			 {
+			      E T1c, T1g, T1j, T1l;
+			      T1c = W[0];
+			      T1g = W[1];
+			      T1i = FNMS(T1g, T1h, T1c * T1f);
+			      T2r = FMA(T1g, T1f, T1c * T1h);
+			      T1j = W[16];
+			      T1l = W[17];
+			      T1n = FMA(T1j, T1k, T1l * T1m);
+			      T2q = FNMS(T1l, T1k, T1j * T1m);
+			 }
+			 T1o = T1i - T1n;
+			 T3d = T2r - T2q;
+			 T2s = T2q + T2r;
+			 T39 = T1n + T1i;
+		    }
+	       }
+	       {
+		    E TH, T1s, TL, T1q, TQ, T1x, TU, T1v;
+		    {
+			 E TF, TG, TJ, TK;
+			 TF = Ip[WS(rs, 2)];
+			 TG = Im[WS(rs, 2)];
+			 TH = TF - TG;
+			 T1s = TF + TG;
+			 TJ = Rp[WS(rs, 2)];
+			 TK = Rm[WS(rs, 2)];
+			 TL = TJ + TK;
+			 T1q = TJ - TK;
+		    }
+		    {
+			 E TO, TP, TS, TT;
+			 TO = Ip[WS(rs, 6)];
+			 TP = Im[WS(rs, 6)];
+			 TQ = TO - TP;
+			 T1x = TO + TP;
+			 TS = Rp[WS(rs, 6)];
+			 TT = Rm[WS(rs, 6)];
+			 TU = TS + TT;
+			 T1v = TS - TT;
+		    }
+		    {
+			 E TM, T22, TV, T23;
+			 {
+			      E TE, TI, TN, TR;
+			      TE = W[6];
+			      TI = W[7];
+			      TM = FNMS(TI, TL, TE * TH);
+			      T22 = FMA(TE, TL, TI * TH);
+			      TN = W[22];
+			      TR = W[23];
+			      TV = FNMS(TR, TU, TN * TQ);
+			      T23 = FMA(TN, TU, TR * TQ);
+			 }
+			 TW = TM + TV;
+			 T3i = TM - TV;
+			 T24 = T22 + T23;
+			 T2Z = T22 - T23;
+		    }
+		    {
+			 E T1t, T2n, T1y, T2o;
+			 {
+			      E T1p, T1r, T1u, T1w;
+			      T1p = W[8];
+			      T1r = W[9];
+			      T1t = FMA(T1p, T1q, T1r * T1s);
+			      T2n = FNMS(T1r, T1q, T1p * T1s);
+			      T1u = W[24];
+			      T1w = W[25];
+			      T1y = FMA(T1u, T1v, T1w * T1x);
+			      T2o = FNMS(T1w, T1v, T1u * T1x);
+			 }
+			 T1z = T1t + T1y;
+			 T3c = T1y - T1t;
+			 T2p = T2n + T2o;
+			 T3a = T2n - T2o;
+		    }
+	       }
+	       {
+		    E T4, T1E, T8, T1C, Td, T1J, Th, T1H;
+		    {
+			 E T2, T3, T6, T7;
+			 T2 = Ip[WS(rs, 1)];
+			 T3 = Im[WS(rs, 1)];
+			 T4 = T2 - T3;
+			 T1E = T2 + T3;
+			 T6 = Rp[WS(rs, 1)];
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = T6 + T7;
+			 T1C = T6 - T7;
+		    }
+		    {
+			 E Tb, Tc, Tf, Tg;
+			 Tb = Ip[WS(rs, 5)];
+			 Tc = Im[WS(rs, 5)];
+			 Td = Tb - Tc;
+			 T1J = Tb + Tc;
+			 Tf = Rp[WS(rs, 5)];
+			 Tg = Rm[WS(rs, 5)];
+			 Th = Tf + Tg;
+			 T1H = Tf - Tg;
+		    }
+		    {
+			 E T9, T26, Ti, T27;
+			 {
+			      E T1, T5, Ta, Te;
+			      T1 = W[2];
+			      T5 = W[3];
+			      T9 = FNMS(T5, T8, T1 * T4);
+			      T26 = FMA(T1, T8, T5 * T4);
+			      Ta = W[18];
+			      Te = W[19];
+			      Ti = FNMS(Te, Th, Ta * Td);
+			      T27 = FMA(Ta, Th, Te * Td);
+			 }
+			 Tj = T9 + Ti;
+			 T2S = T26 - T27;
+			 T28 = T26 + T27;
+			 T2R = T9 - Ti;
+		    }
+		    {
+			 E T1F, T2g, T1K, T2h;
+			 {
+			      E T1B, T1D, T1G, T1I;
+			      T1B = W[4];
+			      T1D = W[5];
+			      T1F = FMA(T1B, T1C, T1D * T1E);
+			      T2g = FNMS(T1D, T1C, T1B * T1E);
+			      T1G = W[20];
+			      T1I = W[21];
+			      T1K = FMA(T1G, T1H, T1I * T1J);
+			      T2h = FNMS(T1I, T1H, T1G * T1J);
+			 }
+			 T1L = T1F + T1K;
+			 T36 = T2g - T2h;
+			 T2i = T2g + T2h;
+			 T32 = T1K - T1F;
+		    }
+	       }
+	       {
+		    E Tn, T1P, Tr, T1N, Tw, T1U, TA, T1S;
+		    {
+			 E Tl, Tm, Tp, Tq;
+			 Tl = Ip[WS(rs, 7)];
+			 Tm = Im[WS(rs, 7)];
+			 Tn = Tl - Tm;
+			 T1P = Tl + Tm;
+			 Tp = Rp[WS(rs, 7)];
+			 Tq = Rm[WS(rs, 7)];
+			 Tr = Tp + Tq;
+			 T1N = Tp - Tq;
+		    }
+		    {
+			 E Tu, Tv, Ty, Tz;
+			 Tu = Ip[WS(rs, 3)];
+			 Tv = Im[WS(rs, 3)];
+			 Tw = Tu - Tv;
+			 T1U = Tu + Tv;
+			 Ty = Rp[WS(rs, 3)];
+			 Tz = Rm[WS(rs, 3)];
+			 TA = Ty + Tz;
+			 T1S = Ty - Tz;
+		    }
+		    {
+			 E Ts, T29, TB, T2a;
+			 {
+			      E Tk, To, Tt, Tx;
+			      Tk = W[26];
+			      To = W[27];
+			      Ts = FNMS(To, Tr, Tk * Tn);
+			      T29 = FMA(Tk, Tr, To * Tn);
+			      Tt = W[10];
+			      Tx = W[11];
+			      TB = FNMS(Tx, TA, Tt * Tw);
+			      T2a = FMA(Tt, TA, Tx * Tw);
+			 }
+			 TC = Ts + TB;
+			 T2V = Ts - TB;
+			 T2b = T29 + T2a;
+			 T2U = T29 - T2a;
+		    }
+		    {
+			 E T1Q, T2j, T1V, T2k;
+			 {
+			      E T1M, T1O, T1R, T1T;
+			      T1M = W[28];
+			      T1O = W[29];
+			      T1Q = FMA(T1M, T1N, T1O * T1P);
+			      T2j = FNMS(T1O, T1N, T1M * T1P);
+			      T1R = W[12];
+			      T1T = W[13];
+			      T1V = FMA(T1R, T1S, T1T * T1U);
+			      T2k = FNMS(T1T, T1S, T1R * T1U);
+			 }
+			 T1W = T1Q + T1V;
+			 T35 = T1V - T1Q;
+			 T2l = T2j + T2k;
+			 T33 = T2j - T2k;
+		    }
+	       }
+	       {
+		    E T1b, T2f, T2u, T2w, T1Y, T2e, T2d, T2v;
+		    {
+			 E TD, T1a, T2m, T2t;
+			 TD = Tj + TC;
+			 T1a = TW + T19;
+			 T1b = TD + T1a;
+			 T2f = T1a - TD;
+			 T2m = T2i + T2l;
+			 T2t = T2p + T2s;
+			 T2u = T2m - T2t;
+			 T2w = T2m + T2t;
+		    }
+		    {
+			 E T1A, T1X, T25, T2c;
+			 T1A = T1o - T1z;
+			 T1X = T1L + T1W;
+			 T1Y = T1A - T1X;
+			 T2e = T1X + T1A;
+			 T25 = T21 + T24;
+			 T2c = T28 + T2b;
+			 T2d = T25 - T2c;
+			 T2v = T25 + T2c;
+		    }
+		    Ip[0] = KP500000000 * (T1b + T1Y);
+		    Rp[0] = KP500000000 * (T2v + T2w);
+		    Im[WS(rs, 7)] = KP500000000 * (T1Y - T1b);
+		    Rm[WS(rs, 7)] = KP500000000 * (T2v - T2w);
+		    Rm[WS(rs, 3)] = KP500000000 * (T2d - T2e);
+		    Im[WS(rs, 3)] = KP500000000 * (T2u - T2f);
+		    Rp[WS(rs, 4)] = KP500000000 * (T2d + T2e);
+		    Ip[WS(rs, 4)] = KP500000000 * (T2f + T2u);
+	       }
+	       {
+		    E T2z, T2L, T2J, T2P, T2C, T2M, T2F, T2N;
+		    {
+			 E T2x, T2y, T2H, T2I;
+			 T2x = T2b - T28;
+			 T2y = T19 - TW;
+			 T2z = KP500000000 * (T2x + T2y);
+			 T2L = KP500000000 * (T2y - T2x);
+			 T2H = T21 - T24;
+			 T2I = Tj - TC;
+			 T2J = KP500000000 * (T2H - T2I);
+			 T2P = KP500000000 * (T2H + T2I);
+		    }
+		    {
+			 E T2A, T2B, T2D, T2E;
+			 T2A = T2l - T2i;
+			 T2B = T1L - T1W;
+			 T2C = T2A + T2B;
+			 T2M = T2A - T2B;
+			 T2D = T1z + T1o;
+			 T2E = T2s - T2p;
+			 T2F = T2D - T2E;
+			 T2N = T2D + T2E;
+		    }
+		    {
+			 E T2G, T2Q, T2K, T2O;
+			 T2G = KP353553390 * (T2C + T2F);
+			 Ip[WS(rs, 2)] = T2z + T2G;
+			 Im[WS(rs, 5)] = T2G - T2z;
+			 T2Q = KP353553390 * (T2M + T2N);
+			 Rm[WS(rs, 5)] = T2P - T2Q;
+			 Rp[WS(rs, 2)] = T2P + T2Q;
+			 T2K = KP353553390 * (T2F - T2C);
+			 Rm[WS(rs, 1)] = T2J - T2K;
+			 Rp[WS(rs, 6)] = T2J + T2K;
+			 T2O = KP353553390 * (T2M - T2N);
+			 Ip[WS(rs, 6)] = T2L + T2O;
+			 Im[WS(rs, 1)] = T2O - T2L;
+		    }
+	       }
+	       {
+		    E T30, T3w, T3F, T3j, T2X, T3G, T3D, T3L, T3m, T3v, T38, T3q, T3A, T3K, T3f;
+		    E T3r;
+		    {
+			 E T2T, T2W, T34, T37;
+			 T30 = KP500000000 * (T2Y - T2Z);
+			 T3w = KP500000000 * (T2Z + T2Y);
+			 T3F = KP500000000 * (T3h - T3i);
+			 T3j = KP500000000 * (T3h + T3i);
+			 T2T = T2R - T2S;
+			 T2W = T2U + T2V;
+			 T2X = KP353553390 * (T2T + T2W);
+			 T3G = KP353553390 * (T2T - T2W);
+			 {
+			      E T3B, T3C, T3k, T3l;
+			      T3B = T3a + T39;
+			      T3C = T3d - T3c;
+			      T3D = FNMS(KP461939766, T3C, KP191341716 * T3B);
+			      T3L = FMA(KP461939766, T3B, KP191341716 * T3C);
+			      T3k = T2S + T2R;
+			      T3l = T2U - T2V;
+			      T3m = KP353553390 * (T3k + T3l);
+			      T3v = KP353553390 * (T3l - T3k);
+			 }
+			 T34 = T32 + T33;
+			 T37 = T35 - T36;
+			 T38 = FMA(KP191341716, T34, KP461939766 * T37);
+			 T3q = FNMS(KP191341716, T37, KP461939766 * T34);
+			 {
+			      E T3y, T3z, T3b, T3e;
+			      T3y = T33 - T32;
+			      T3z = T36 + T35;
+			      T3A = FMA(KP461939766, T3y, KP191341716 * T3z);
+			      T3K = FNMS(KP461939766, T3z, KP191341716 * T3y);
+			      T3b = T39 - T3a;
+			      T3e = T3c + T3d;
+			      T3f = FNMS(KP191341716, T3e, KP461939766 * T3b);
+			      T3r = FMA(KP191341716, T3b, KP461939766 * T3e);
+			 }
+		    }
+		    {
+			 E T31, T3g, T3t, T3u;
+			 T31 = T2X + T30;
+			 T3g = T38 + T3f;
+			 Ip[WS(rs, 1)] = T31 + T3g;
+			 Im[WS(rs, 6)] = T3g - T31;
+			 T3t = T3j + T3m;
+			 T3u = T3q + T3r;
+			 Rm[WS(rs, 6)] = T3t - T3u;
+			 Rp[WS(rs, 1)] = T3t + T3u;
+		    }
+		    {
+			 E T3n, T3o, T3p, T3s;
+			 T3n = T3j - T3m;
+			 T3o = T3f - T38;
+			 Rm[WS(rs, 2)] = T3n - T3o;
+			 Rp[WS(rs, 5)] = T3n + T3o;
+			 T3p = T30 - T2X;
+			 T3s = T3q - T3r;
+			 Ip[WS(rs, 5)] = T3p + T3s;
+			 Im[WS(rs, 2)] = T3s - T3p;
+		    }
+		    {
+			 E T3x, T3E, T3N, T3O;
+			 T3x = T3v + T3w;
+			 T3E = T3A + T3D;
+			 Ip[WS(rs, 3)] = T3x + T3E;
+			 Im[WS(rs, 4)] = T3E - T3x;
+			 T3N = T3F + T3G;
+			 T3O = T3K + T3L;
+			 Rm[WS(rs, 4)] = T3N - T3O;
+			 Rp[WS(rs, 3)] = T3N + T3O;
+		    }
+		    {
+			 E T3H, T3I, T3J, T3M;
+			 T3H = T3F - T3G;
+			 T3I = T3D - T3A;
+			 Rm[0] = T3H - T3I;
+			 Rp[WS(rs, 7)] = T3H + T3I;
+			 T3J = T3w - T3v;
+			 T3M = T3K - T3L;
+			 Ip[WS(rs, 7)] = T3J + T3M;
+			 Im[0] = T3M - T3J;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, {168, 62, 38, 0} };
+
+void X(codelet_hc2cfdft_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include hc2cf.h */
+
+/*
+ * This function contains 10 FP additions, 8 FP multiplications,
+ * (or, 8 additions, 6 multiplications, 2 fused multiply/add),
+ * 12 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T9, Ta, T3, Tc, T7, T4;
+	       {
+		    E T1, T2, T5, T6;
+		    T1 = Ip[0];
+		    T2 = Im[0];
+		    T5 = Rm[0];
+		    T6 = Rp[0];
+		    T9 = W[1];
+		    Ta = T1 + T2;
+		    T3 = T1 - T2;
+		    Tc = T6 + T5;
+		    T7 = T5 - T6;
+		    T4 = W[0];
+	       }
+	       {
+		    E Td, T8, Te, Tb;
+		    Td = T9 * T7;
+		    T8 = T4 * T7;
+		    Te = FMA(T4, Ta, Td);
+		    Tb = FNMS(T9, Ta, T8);
+		    Rp[0] = KP500000000 * (Tc + Te);
+		    Rm[0] = KP500000000 * (Tc - Te);
+		    Im[0] = KP500000000 * (Tb - T3);
+		    Ip[0] = KP500000000 * (T3 + Tb);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, {8, 6, 2, 0} };
+
+void X(codelet_hc2cfdft_2) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include hc2cf.h */
+
+/*
+ * This function contains 10 FP additions, 8 FP multiplications,
+ * (or, 8 additions, 6 multiplications, 2 fused multiply/add),
+ * 10 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T3, T9, T7, Tb;
+	       {
+		    E T1, T2, T5, T6;
+		    T1 = Ip[0];
+		    T2 = Im[0];
+		    T3 = T1 - T2;
+		    T9 = T1 + T2;
+		    T5 = Rm[0];
+		    T6 = Rp[0];
+		    T7 = T5 - T6;
+		    Tb = T6 + T5;
+	       }
+	       {
+		    E Ta, Tc, T4, T8;
+		    T4 = W[0];
+		    T8 = W[1];
+		    Ta = FNMS(T8, T9, T4 * T7);
+		    Tc = FMA(T8, T7, T4 * T9);
+		    Ip[0] = KP500000000 * (T3 + Ta);
+		    Rp[0] = KP500000000 * (Tb + Tc);
+		    Im[0] = KP500000000 * (Ta - T3);
+		    Rm[0] = KP500000000 * (Tb - Tc);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, {8, 6, 2, 0} };
+
+void X(codelet_hc2cfdft_2) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1143 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:48 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include hc2cf.h */
+
+/*
+ * This function contains 286 FP additions, 188 FP multiplications,
+ * (or, 176 additions, 78 multiplications, 110 fused multiply/add),
+ * 174 stack variables, 5 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T4X, T5i, T5k, T5e, T5c, T5d, T5j, T5f;
+	       {
+		    E T2E, T4W, T3v, T4k, T2M, T3w, T4V, T4j, T2p, T2T, T5a, T5A, T3D, T3o, T4b;
+		    E T4B, T1Y, T2S, T5z, T57, T3h, T3C, T4A, T44, TH, T2P, T50, T5x, T3z, T32;
+		    E T3P, T4D, T3V, T3U, T5w, T53, T2Q, T1o, T3A, T39;
+		    {
+			 E T1V, T9, T2w, Tu, T1, T6, T1R, T1U, T1T, T2Y, T5, T40, T2F, T10, T2C;
+			 E TE, TX, T2m, T1y, T4g, TS, T33, TW, Tw, TB, T2y, T2B, TA, T3L, T2A;
+			 E T3t, T1q, T1v, T2i, T2l, T2k, T3d, T1u, T48, Tm, Tr, T2s, T2v, T2u, T3J;
+			 E Tq, T3r, T20, T1g, T23, T1l, T1h, T3S, T3k, T21, T2H, TL, T2K, TQ, TM;
+			 E T35, T4h, T2I, T2f, T2g, T1I, T1D, T2c, T46, T2e, T3b, T1E, T28, T16, T29;
+			 E T1b, T25, T3i, T27, T3Q, T17, T1O, T1P, Tj, T1M, Te, T1L, Tb, T3Y, TV;
+			 E T1d, T1Z;
+			 {
+			      E T1S, T4, T7, T8;
+			      T7 = Rp[WS(rs, 9)];
+			      T8 = Rm[WS(rs, 9)];
+			      {
+				   E Ts, Tt, T2, T3;
+				   Ts = Rp[WS(rs, 2)];
+				   Tt = Rm[WS(rs, 2)];
+				   T2 = Ip[WS(rs, 9)];
+				   T1V = T7 + T8;
+				   T9 = T7 - T8;
+				   T2w = Ts - Tt;
+				   Tu = Ts + Tt;
+				   T3 = Im[WS(rs, 9)];
+				   T1 = W[36];
+				   T6 = W[37];
+				   T1R = W[34];
+				   T1S = T2 - T3;
+				   T4 = T2 + T3;
+				   T1U = W[35];
+			      }
+			      {
+				   E TY, TZ, TC, TD;
+				   TY = Ip[0];
+				   T1T = T1R * T1S;
+				   T2Y = T6 * T4;
+				   T5 = T1 * T4;
+				   T40 = T1U * T1S;
+				   TZ = Im[0];
+				   TC = Rp[WS(rs, 7)];
+				   TD = Rm[WS(rs, 7)];
+				   {
+					E T1w, T1x, TT, TU;
+					T1w = Rp[WS(rs, 1)];
+					T2F = TY - TZ;
+					T10 = TY + TZ;
+					T2C = TC - TD;
+					TE = TC + TD;
+					T1x = Rm[WS(rs, 1)];
+					TT = Rm[0];
+					TU = Rp[0];
+					TX = W[0];
+					T2m = T1w + T1x;
+					T1y = T1w - T1x;
+					T4g = TU + TT;
+					TV = TT - TU;
+					TS = W[1];
+				   }
+			      }
+			 }
+			 {
+			      E T2j, T1t, T1r, T1s;
+			      {
+				   E Tx, Ty, T2z, Tz;
+				   Tx = Ip[WS(rs, 7)];
+				   Ty = Im[WS(rs, 7)];
+				   T33 = TX * TV;
+				   TW = TS * TV;
+				   Tw = W[26];
+				   T2z = Tx + Ty;
+				   Tz = Tx - Ty;
+				   TB = W[27];
+				   T2y = W[28];
+				   T2B = W[29];
+				   TA = Tw * Tz;
+				   T3L = TB * Tz;
+				   T2A = T2y * T2z;
+				   T3t = T2B * T2z;
+			      }
+			      T1r = Ip[WS(rs, 1)];
+			      T1s = Im[WS(rs, 1)];
+			      T1q = W[4];
+			      T1v = W[5];
+			      T2i = W[2];
+			      T2j = T1r - T1s;
+			      T1t = T1r + T1s;
+			      T2l = W[3];
+			      {
+				   E T2t, Tp, Tn, To;
+				   Tn = Ip[WS(rs, 2)];
+				   T2k = T2i * T2j;
+				   T3d = T1v * T1t;
+				   T1u = T1q * T1t;
+				   T48 = T2l * T2j;
+				   To = Im[WS(rs, 2)];
+				   Tm = W[6];
+				   Tr = W[7];
+				   T2s = W[8];
+				   T2t = Tn + To;
+				   Tp = Tn - To;
+				   T2v = W[9];
+				   {
+					E T1e, T1f, T1j, T1k;
+					T1e = Ip[WS(rs, 3)];
+					T2u = T2s * T2t;
+					T3J = Tr * Tp;
+					Tq = Tm * Tp;
+					T3r = T2v * T2t;
+					T1f = Im[WS(rs, 3)];
+					T1j = Rp[WS(rs, 3)];
+					T1k = Rm[WS(rs, 3)];
+					T1d = W[10];
+					T20 = T1e + T1f;
+					T1g = T1e - T1f;
+					T23 = T1j - T1k;
+					T1l = T1j + T1k;
+					T1Z = W[12];
+					T1h = T1d * T1g;
+				   }
+			      }
+			 }
+			 {
+			      E T2d, T1A, TI, T2G, T26, T13;
+			      {
+				   E TJ, TK, TO, TP;
+				   TJ = Ip[WS(rs, 5)];
+				   T3S = T1d * T1l;
+				   T3k = T1Z * T23;
+				   T21 = T1Z * T20;
+				   TK = Im[WS(rs, 5)];
+				   TO = Rp[WS(rs, 5)];
+				   TP = Rm[WS(rs, 5)];
+				   TI = W[20];
+				   T2H = TJ - TK;
+				   TL = TJ + TK;
+				   T2K = TO + TP;
+				   TQ = TO - TP;
+				   T2G = W[18];
+				   TM = TI * TL;
+			      }
+			      {
+				   E T1G, T1H, T1B, T1C;
+				   T1G = Rm[WS(rs, 6)];
+				   T35 = TI * TQ;
+				   T4h = T2G * T2K;
+				   T2I = T2G * T2H;
+				   T1H = Rp[WS(rs, 6)];
+				   T1B = Ip[WS(rs, 6)];
+				   T1C = Im[WS(rs, 6)];
+				   T2f = W[23];
+				   T2g = T1H + T1G;
+				   T1I = T1G - T1H;
+				   T2d = T1B - T1C;
+				   T1D = T1B + T1C;
+				   T2c = W[22];
+				   T1A = W[24];
+				   T46 = T2f * T2d;
+			      }
+			      {
+				   E T14, T15, T19, T1a;
+				   T14 = Ip[WS(rs, 8)];
+				   T2e = T2c * T2d;
+				   T3b = T1A * T1I;
+				   T1E = T1A * T1D;
+				   T15 = Im[WS(rs, 8)];
+				   T19 = Rp[WS(rs, 8)];
+				   T1a = Rm[WS(rs, 8)];
+				   T28 = W[32];
+				   T16 = T14 - T15;
+				   T29 = T14 + T15;
+				   T1b = T19 + T1a;
+				   T26 = T1a - T19;
+				   T25 = W[33];
+				   T13 = W[30];
+				   T3i = T28 * T26;
+			      }
+			      {
+				   E Th, Ti, Tc, Td;
+				   Th = Rm[WS(rs, 4)];
+				   T27 = T25 * T26;
+				   T3Q = T13 * T1b;
+				   T17 = T13 * T16;
+				   Ti = Rp[WS(rs, 4)];
+				   Tc = Ip[WS(rs, 4)];
+				   Td = Im[WS(rs, 4)];
+				   T1O = W[15];
+				   T1P = Ti + Th;
+				   Tj = Th - Ti;
+				   T1M = Tc - Td;
+				   Te = Tc + Td;
+				   T1L = W[14];
+				   Tb = W[16];
+				   T3Y = T1O * T1M;
+			      }
+			 }
+			 {
+			      E T1N, T2W, Tf, T2L, T4i;
+			      {
+				   E T2x, T2D, T3s, T3u, T2J;
+				   T2x = FNMS(T2v, T2w, T2u);
+				   T1N = T1L * T1M;
+				   T2W = Tb * Tj;
+				   Tf = Tb * Te;
+				   T2D = FNMS(T2B, T2C, T2A);
+				   T3s = FMA(T2s, T2w, T3r);
+				   T3u = FMA(T2y, T2C, T3t);
+				   T2J = W[19];
+				   T2E = T2x - T2D;
+				   T4W = T2x + T2D;
+				   T3v = T3s + T3u;
+				   T4k = T3u - T3s;
+				   T2L = FNMS(T2J, T2K, T2I);
+				   T4i = FMA(T2J, T2H, T4h);
+			      }
+			      {
+				   E T42, T43, T45, T4a, T3O, T3N;
+				   {
+					E T2a, T3j, T47, T3l, T24, T2o, T3n, T49, T22, T2h, T2n;
+					T2a = FMA(T28, T29, T27);
+					T3j = FNMS(T25, T29, T3i);
+					T2M = T2F - T2L;
+					T3w = T2L + T2F;
+					T4V = T4g + T4i;
+					T4j = T4g - T4i;
+					T22 = W[13];
+					T2h = FNMS(T2f, T2g, T2e);
+					T2n = FNMS(T2l, T2m, T2k);
+					T47 = FMA(T2c, T2g, T46);
+					T3l = FMA(T22, T20, T3k);
+					T24 = FNMS(T22, T23, T21);
+					T2o = T2h - T2n;
+					T3n = T2h + T2n;
+					T49 = FMA(T2i, T2m, T48);
+					{
+					     E T2b, T58, T3m, T59;
+					     T2b = T24 - T2a;
+					     T58 = T2a + T24;
+					     T3m = T3j - T3l;
+					     T45 = T3j + T3l;
+					     T4a = T47 - T49;
+					     T59 = T47 + T49;
+					     T2p = T2b - T2o;
+					     T2T = T2b + T2o;
+					     T5a = T58 + T59;
+					     T5A = T59 - T58;
+					     T3D = T3m + T3n;
+					     T3o = T3m - T3n;
+					}
+				   }
+				   {
+					E T1z, T3e, T1Q, T3c, T1J, T1W, T3Z, T41, T1F;
+					T1z = FNMS(T1v, T1y, T1u);
+					T3e = FMA(T1q, T1y, T3d);
+					T1F = W[25];
+					T4b = T45 + T4a;
+					T4B = T4a - T45;
+					T1Q = FNMS(T1O, T1P, T1N);
+					T3c = FNMS(T1F, T1D, T3b);
+					T1J = FMA(T1F, T1I, T1E);
+					T1W = FNMS(T1U, T1V, T1T);
+					T3Z = FMA(T1L, T1P, T3Y);
+					T41 = FMA(T1R, T1V, T40);
+					{
+					     E T56, T3g, T55, T1K, T1X, T3f;
+					     T56 = T1J + T1z;
+					     T1K = T1z - T1J;
+					     T3g = T1Q + T1W;
+					     T1X = T1Q - T1W;
+					     T55 = T3Z + T41;
+					     T42 = T3Z - T41;
+					     T1Y = T1K - T1X;
+					     T2S = T1X + T1K;
+					     T43 = T3c + T3e;
+					     T3f = T3c - T3e;
+					     T5z = T55 - T56;
+					     T57 = T55 + T56;
+					     T3h = T3f - T3g;
+					     T3C = T3g + T3f;
+					}
+				   }
+				   {
+					E Ta, T2Z, T3K, T2X, Tk, TG, T31, T3M, Tg, Tv, TF;
+					Ta = FNMS(T6, T9, T5);
+					T4A = T42 - T43;
+					T44 = T42 + T43;
+					T2Z = FMA(T1, T9, T2Y);
+					Tg = W[17];
+					Tv = FNMS(Tr, Tu, Tq);
+					TF = FNMS(TB, TE, TA);
+					T3K = FMA(Tm, Tu, T3J);
+					T2X = FNMS(Tg, Te, T2W);
+					Tk = FMA(Tg, Tj, Tf);
+					TG = Tv - TF;
+					T31 = Tv + TF;
+					T3M = FMA(Tw, TE, T3L);
+					{
+					     E Tl, T4Z, T30, T4Y;
+					     Tl = Ta - Tk;
+					     T4Z = Tk + Ta;
+					     T30 = T2X - T2Z;
+					     T3O = T2X + T2Z;
+					     T3N = T3K - T3M;
+					     T4Y = T3K + T3M;
+					     TH = Tl - TG;
+					     T2P = TG + Tl;
+					     T50 = T4Y + T4Z;
+					     T5x = T4Y - T4Z;
+					     T3z = T31 + T30;
+					     T32 = T30 - T31;
+					}
+				   }
+				   {
+					E T11, T34, T36, TR, T1i, T3R, T1c, TN, T18;
+					T11 = FMA(TX, T10, TW);
+					T34 = FNMS(TS, T10, T33);
+					TN = W[21];
+					T3P = T3N + T3O;
+					T4D = T3N - T3O;
+					T18 = W[31];
+					T36 = FMA(TN, TL, T35);
+					TR = FNMS(TN, TQ, TM);
+					T1i = W[11];
+					T3R = FMA(T18, T16, T3Q);
+					T1c = FNMS(T18, T1b, T17);
+					{
+					     E T52, T12, T3T, T1m;
+					     T52 = TR + T11;
+					     T12 = TR - T11;
+					     T3T = FMA(T1i, T1g, T3S);
+					     T1m = FNMS(T1i, T1l, T1h);
+					     {
+						  E T37, T51, T38, T1n;
+						  T3V = T36 + T34;
+						  T37 = T34 - T36;
+						  T51 = T3R + T3T;
+						  T3U = T3R - T3T;
+						  T38 = T1c + T1m;
+						  T1n = T1c - T1m;
+						  T5w = T51 - T52;
+						  T53 = T51 + T52;
+						  T2Q = T1n + T12;
+						  T1o = T12 - T1n;
+						  T3A = T38 + T37;
+						  T39 = T37 - T38;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T4l, T4m, T4n, T4w, T4u;
+			 {
+			      E T4L, T2O, T3W, T4K, T4I, T4G, T4S, T4U, T4J, T4z, T4H;
+			      {
+				   E T4C, T2N, T4R, T1p, T4E, T2q, T4Q;
+				   T4L = T4A + T4B;
+				   T4C = T4A - T4B;
+				   T2N = T2E + T2M;
+				   T2O = T2M - T2E;
+				   T4R = T1o - TH;
+				   T1p = TH + T1o;
+				   T4E = T3U - T3V;
+				   T3W = T3U + T3V;
+				   T2q = T1Y + T2p;
+				   T4Q = T2p - T1Y;
+				   {
+					E T4y, T4x, T4F, T2r;
+					T4F = T4D - T4E;
+					T4K = T4D + T4E;
+					T4y = T1p - T2q;
+					T2r = T1p + T2q;
+					T4I = FMA(KP618033988, T4C, T4F);
+					T4G = FNMS(KP618033988, T4F, T4C);
+					T4S = FNMS(KP618033988, T4R, T4Q);
+					T4U = FMA(KP618033988, T4Q, T4R);
+					Im[WS(rs, 4)] = KP500000000 * (T2r - T2N);
+					T4x = FMA(KP250000000, T2r, T2N);
+					T4J = T4j - T4k;
+					T4l = T4j + T4k;
+					T4z = FMA(KP559016994, T4y, T4x);
+					T4H = FNMS(KP559016994, T4y, T4x);
+				   }
+			      }
+			      {
+				   E T2R, T4s, T4d, T4f, T4t, T2U, T4P, T4T;
+				   {
+					E T3X, T4O, T4M, T4c, T4N;
+					T4m = T3P + T3W;
+					T3X = T3P - T3W;
+					Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T4G, T4z));
+					Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T4G, T4z));
+					Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T4I, T4H)));
+					Im[0] = -(KP500000000 * (FMA(KP951056516, T4I, T4H)));
+					T4O = T4K - T4L;
+					T4M = T4K + T4L;
+					T4c = T44 - T4b;
+					T4n = T44 + T4b;
+					T2R = T2P + T2Q;
+					T4s = T2P - T2Q;
+					Rm[WS(rs, 4)] = KP500000000 * (T4J + T4M);
+					T4N = FNMS(KP250000000, T4M, T4J);
+					T4d = FMA(KP618033988, T4c, T3X);
+					T4f = FNMS(KP618033988, T3X, T4c);
+					T4t = T2S - T2T;
+					T2U = T2S + T2T;
+					T4P = FNMS(KP559016994, T4O, T4N);
+					T4T = FMA(KP559016994, T4O, T4N);
+				   }
+				   {
+					E T3H, T3G, T2V, T3I, T4e;
+					T2V = T2R + T2U;
+					T3H = T2R - T2U;
+					Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T4S, T4P));
+					Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T4S, T4P));
+					Rm[0] = KP500000000 * (FNMS(KP951056516, T4U, T4T));
+					Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T4U, T4T));
+					Ip[WS(rs, 5)] = KP500000000 * (T2O + T2V);
+					T3G = FNMS(KP250000000, T2V, T2O);
+					T3I = FMA(KP559016994, T3H, T3G);
+					T4e = FNMS(KP559016994, T3H, T3G);
+					T4w = FNMS(KP618033988, T4s, T4t);
+					T4u = FMA(KP618033988, T4t, T4s);
+					Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4d, T3I));
+					Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4d, T3I));
+					Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4f, T4e)));
+					Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4f, T4e)));
+				   }
+			      }
+			 }
+			 {
+			      E T3y, T5O, T5Q, T5F, T5K, T5I;
+			      {
+				   E T5G, T5H, T3x, T4q, T5E, T5C, T3a, T5N, T4p, T5M, T3p, T5y, T5B, T4o;
+				   T5G = T5x + T5w;
+				   T5y = T5w - T5x;
+				   T5B = T5z - T5A;
+				   T5H = T5z + T5A;
+				   T3y = T3w - T3v;
+				   T3x = T3v + T3w;
+				   T4q = T4m - T4n;
+				   T4o = T4m + T4n;
+				   T5E = FMA(KP618033988, T5y, T5B);
+				   T5C = FNMS(KP618033988, T5B, T5y);
+				   T3a = T32 + T39;
+				   T5N = T39 - T32;
+				   Rp[WS(rs, 5)] = KP500000000 * (T4l + T4o);
+				   T4p = FNMS(KP250000000, T4o, T4l);
+				   T5M = T3o - T3h;
+				   T3p = T3h + T3o;
+				   {
+					E T5u, T5t, T4r, T4v, T3q, T5D, T5v;
+					T4r = FMA(KP559016994, T4q, T4p);
+					T4v = FNMS(KP559016994, T4q, T4p);
+					T5u = T3p - T3a;
+					T3q = T3a + T3p;
+					Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4u, T4r));
+					Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4u, T4r));
+					Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4w, T4v));
+					Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4w, T4v));
+					Im[WS(rs, 9)] = KP500000000 * (T3q - T3x);
+					T5t = FMA(KP250000000, T3q, T3x);
+					T5O = FNMS(KP618033988, T5N, T5M);
+					T5Q = FMA(KP618033988, T5M, T5N);
+					T5F = T4V - T4W;
+					T4X = T4V + T4W;
+					T5D = FNMS(KP559016994, T5u, T5t);
+					T5v = FMA(KP559016994, T5u, T5t);
+					Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5C, T5v)));
+					Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5C, T5v));
+					Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5E, T5D)));
+					Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5E, T5D));
+					T5K = T5G - T5H;
+					T5I = T5G + T5H;
+				   }
+			      }
+			      {
+				   E T54, T5b, T5s, T5q, T5g, T5h, T3F, T5m, T5o, T5p, T5J, T5l, T5r, T5n;
+				   T54 = T50 + T53;
+				   T5o = T50 - T53;
+				   T5p = T5a - T57;
+				   T5b = T57 + T5a;
+				   Rm[WS(rs, 9)] = KP500000000 * (T5F + T5I);
+				   T5J = FNMS(KP250000000, T5I, T5F);
+				   T5s = FMA(KP618033988, T5o, T5p);
+				   T5q = FNMS(KP618033988, T5p, T5o);
+				   {
+					E T5L, T5P, T3B, T3E;
+					T5L = FNMS(KP559016994, T5K, T5J);
+					T5P = FMA(KP559016994, T5K, T5J);
+					T3B = T3z + T3A;
+					T5g = T3z - T3A;
+					T5h = T3C - T3D;
+					T3E = T3C + T3D;
+					Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T5O, T5L));
+					Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T5O, T5L));
+					Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T5Q, T5P));
+					Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5Q, T5P));
+					T3F = T3B + T3E;
+					T5m = T3B - T3E;
+				   }
+				   Ip[0] = KP500000000 * (T3y + T3F);
+				   T5l = FNMS(KP250000000, T3F, T3y);
+				   T5i = FMA(KP618033988, T5h, T5g);
+				   T5k = FNMS(KP618033988, T5g, T5h);
+				   T5r = FNMS(KP559016994, T5m, T5l);
+				   T5n = FMA(KP559016994, T5m, T5l);
+				   Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5q, T5n)));
+				   Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5q, T5n));
+				   Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5s, T5r)));
+				   Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5s, T5r));
+				   T5e = T54 - T5b;
+				   T5c = T54 + T5b;
+			      }
+			 }
+		    }
+	       }
+	       Rp[0] = KP500000000 * (T4X + T5c);
+	       T5d = FNMS(KP250000000, T5c, T4X);
+	       T5j = FNMS(KP559016994, T5e, T5d);
+	       T5f = FMA(KP559016994, T5e, T5d);
+	       Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5i, T5f));
+	       Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5i, T5f));
+	       Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5k, T5j));
+	       Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5k, T5j));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {176, 78, 110, 0} };
+
+void X(codelet_hc2cfdft_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include hc2cf.h */
+
+/*
+ * This function contains 286 FP additions, 140 FP multiplications,
+ * (or, 224 additions, 78 multiplications, 62 fused multiply/add),
+ * 98 stack variables, 5 constants, and 80 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP125000000, +0.125000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP279508497, +0.279508497187473712051146708591409529430077295);
+     DK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
+	       E T12, T2w, T4o, T4V, T2H, T3a, T4y, T4Y, T1z, T2v, T25, T2y, T2s, T2z, T4v;
+	       E T4X, T4r, T4U, T3A, T3Z, T2X, T37, T3k, T41, T2M, T39, T3v, T3Y, T2S, T36;
+	       E T3p, T42, Td, T4G, T33, T3N, Tw, T4H, T32, T3O;
+	       {
+		    E T3, T3L, T1x, T2V, Th, Tl, TC, T3g, Tq, Tu, TH, T3h, T7, Tb, T1q;
+		    E T2U, TR, T2P, T1F, T3r, T23, T2K, T2f, T3y, T1k, T3m, T2q, T2E, T10, T2Q;
+		    E T1K, T3s, T1U, T2J, T2a, T3x, T1b, T3l, T2l, T2D;
+		    {
+			 E T1, T2, T1s, T1u, T1v, T1w, T1r, T1t;
+			 T1 = Ip[0];
+			 T2 = Im[0];
+			 T1s = T1 + T2;
+			 T1u = Rp[0];
+			 T1v = Rm[0];
+			 T1w = T1u - T1v;
+			 T3 = T1 - T2;
+			 T3L = T1u + T1v;
+			 T1r = W[0];
+			 T1t = W[1];
+			 T1x = FNMS(T1t, T1w, T1r * T1s);
+			 T2V = FMA(T1r, T1w, T1t * T1s);
+		    }
+		    {
+			 E Tf, Tg, Tz, Tj, Tk, TB, Ty, TA;
+			 Tf = Ip[WS(rs, 2)];
+			 Tg = Im[WS(rs, 2)];
+			 Tz = Tf - Tg;
+			 Tj = Rp[WS(rs, 2)];
+			 Tk = Rm[WS(rs, 2)];
+			 TB = Tj + Tk;
+			 Th = Tf + Tg;
+			 Tl = Tj - Tk;
+			 Ty = W[6];
+			 TA = W[7];
+			 TC = FNMS(TA, TB, Ty * Tz);
+			 T3g = FMA(TA, Tz, Ty * TB);
+		    }
+		    {
+			 E To, Tp, TE, Ts, Tt, TG, TD, TF;
+			 To = Ip[WS(rs, 7)];
+			 Tp = Im[WS(rs, 7)];
+			 TE = To - Tp;
+			 Ts = Rp[WS(rs, 7)];
+			 Tt = Rm[WS(rs, 7)];
+			 TG = Ts + Tt;
+			 Tq = To + Tp;
+			 Tu = Ts - Tt;
+			 TD = W[26];
+			 TF = W[27];
+			 TH = FNMS(TF, TG, TD * TE);
+			 T3h = FMA(TF, TE, TD * TG);
+		    }
+		    {
+			 E T5, T6, T1n, T9, Ta, T1p, T1m, T1o;
+			 T5 = Ip[WS(rs, 5)];
+			 T6 = Im[WS(rs, 5)];
+			 T1n = T5 + T6;
+			 T9 = Rp[WS(rs, 5)];
+			 Ta = Rm[WS(rs, 5)];
+			 T1p = T9 - Ta;
+			 T7 = T5 - T6;
+			 Tb = T9 + Ta;
+			 T1m = W[20];
+			 T1o = W[21];
+			 T1q = FNMS(T1o, T1p, T1m * T1n);
+			 T2U = FMA(T1m, T1p, T1o * T1n);
+		    }
+		    {
+			 E TM, T1C, TQ, T1E;
+			 {
+			      E TK, TL, TO, TP;
+			      TK = Ip[WS(rs, 4)];
+			      TL = Im[WS(rs, 4)];
+			      TM = TK + TL;
+			      T1C = TK - TL;
+			      TO = Rp[WS(rs, 4)];
+			      TP = Rm[WS(rs, 4)];
+			      TQ = TO - TP;
+			      T1E = TO + TP;
+			 }
+			 {
+			      E TJ, TN, T1B, T1D;
+			      TJ = W[16];
+			      TN = W[17];
+			      TR = FNMS(TN, TQ, TJ * TM);
+			      T2P = FMA(TN, TM, TJ * TQ);
+			      T1B = W[14];
+			      T1D = W[15];
+			      T1F = FNMS(T1D, T1E, T1B * T1C);
+			      T3r = FMA(T1D, T1C, T1B * T1E);
+			 }
+		    }
+		    {
+			 E T1Y, T2c, T22, T2e;
+			 {
+			      E T1W, T1X, T20, T21;
+			      T1W = Ip[WS(rs, 1)];
+			      T1X = Im[WS(rs, 1)];
+			      T1Y = T1W + T1X;
+			      T2c = T1W - T1X;
+			      T20 = Rp[WS(rs, 1)];
+			      T21 = Rm[WS(rs, 1)];
+			      T22 = T20 - T21;
+			      T2e = T20 + T21;
+			 }
+			 {
+			      E T1V, T1Z, T2b, T2d;
+			      T1V = W[4];
+			      T1Z = W[5];
+			      T23 = FNMS(T1Z, T22, T1V * T1Y);
+			      T2K = FMA(T1Z, T1Y, T1V * T22);
+			      T2b = W[2];
+			      T2d = W[3];
+			      T2f = FNMS(T2d, T2e, T2b * T2c);
+			      T3y = FMA(T2d, T2c, T2b * T2e);
+			 }
+		    }
+		    {
+			 E T1f, T2n, T1j, T2p;
+			 {
+			      E T1d, T1e, T1h, T1i;
+			      T1d = Ip[WS(rs, 3)];
+			      T1e = Im[WS(rs, 3)];
+			      T1f = T1d - T1e;
+			      T2n = T1d + T1e;
+			      T1h = Rp[WS(rs, 3)];
+			      T1i = Rm[WS(rs, 3)];
+			      T1j = T1h + T1i;
+			      T2p = T1h - T1i;
+			 }
+			 {
+			      E T1c, T1g, T2m, T2o;
+			      T1c = W[10];
+			      T1g = W[11];
+			      T1k = FNMS(T1g, T1j, T1c * T1f);
+			      T3m = FMA(T1c, T1j, T1g * T1f);
+			      T2m = W[12];
+			      T2o = W[13];
+			      T2q = FNMS(T2o, T2p, T2m * T2n);
+			      T2E = FMA(T2m, T2p, T2o * T2n);
+			 }
+		    }
+		    {
+			 E TV, T1H, TZ, T1J;
+			 {
+			      E TT, TU, TX, TY;
+			      TT = Ip[WS(rs, 9)];
+			      TU = Im[WS(rs, 9)];
+			      TV = TT + TU;
+			      T1H = TT - TU;
+			      TX = Rp[WS(rs, 9)];
+			      TY = Rm[WS(rs, 9)];
+			      TZ = TX - TY;
+			      T1J = TX + TY;
+			 }
+			 {
+			      E TS, TW, T1G, T1I;
+			      TS = W[36];
+			      TW = W[37];
+			      T10 = FNMS(TW, TZ, TS * TV);
+			      T2Q = FMA(TW, TV, TS * TZ);
+			      T1G = W[34];
+			      T1I = W[35];
+			      T1K = FNMS(T1I, T1J, T1G * T1H);
+			      T3s = FMA(T1I, T1H, T1G * T1J);
+			 }
+		    }
+		    {
+			 E T1P, T27, T1T, T29;
+			 {
+			      E T1N, T1O, T1R, T1S;
+			      T1N = Ip[WS(rs, 6)];
+			      T1O = Im[WS(rs, 6)];
+			      T1P = T1N + T1O;
+			      T27 = T1N - T1O;
+			      T1R = Rp[WS(rs, 6)];
+			      T1S = Rm[WS(rs, 6)];
+			      T1T = T1R - T1S;
+			      T29 = T1R + T1S;
+			 }
+			 {
+			      E T1M, T1Q, T26, T28;
+			      T1M = W[24];
+			      T1Q = W[25];
+			      T1U = FNMS(T1Q, T1T, T1M * T1P);
+			      T2J = FMA(T1Q, T1P, T1M * T1T);
+			      T26 = W[22];
+			      T28 = W[23];
+			      T2a = FNMS(T28, T29, T26 * T27);
+			      T3x = FMA(T28, T27, T26 * T29);
+			 }
+		    }
+		    {
+			 E T16, T2k, T1a, T2i;
+			 {
+			      E T14, T15, T18, T19;
+			      T14 = Ip[WS(rs, 8)];
+			      T15 = Im[WS(rs, 8)];
+			      T16 = T14 - T15;
+			      T2k = T14 + T15;
+			      T18 = Rp[WS(rs, 8)];
+			      T19 = Rm[WS(rs, 8)];
+			      T1a = T18 + T19;
+			      T2i = T19 - T18;
+			 }
+			 {
+			      E T13, T17, T2h, T2j;
+			      T13 = W[30];
+			      T17 = W[31];
+			      T1b = FNMS(T17, T1a, T13 * T16);
+			      T3l = FMA(T13, T1a, T17 * T16);
+			      T2h = W[33];
+			      T2j = W[32];
+			      T2l = FMA(T2h, T2i, T2j * T2k);
+			      T2D = FNMS(T2h, T2k, T2j * T2i);
+			 }
+		    }
+		    {
+			 E T2g, T2r, T3n, T3o;
+			 {
+			      E TI, T11, T4m, T4n;
+			      TI = TC - TH;
+			      T11 = TR - T10;
+			      T12 = TI - T11;
+			      T2w = TI + T11;
+			      T4m = T3g + T3h;
+			      T4n = TR + T10;
+			      T4o = T4m + T4n;
+			      T4V = T4m - T4n;
+			 }
+			 {
+			      E T2F, T2G, T4w, T4x;
+			      T2F = T2D - T2E;
+			      T2G = T2a + T2f;
+			      T2H = T2F - T2G;
+			      T3a = T2F + T2G;
+			      T4w = T2l + T2q;
+			      T4x = T3x + T3y;
+			      T4y = T4w + T4x;
+			      T4Y = T4x - T4w;
+			 }
+			 {
+			      E T1l, T1y, T1L, T24;
+			      T1l = T1b - T1k;
+			      T1y = T1q - T1x;
+			      T1z = T1l + T1y;
+			      T2v = T1y - T1l;
+			      T1L = T1F - T1K;
+			      T24 = T1U - T23;
+			      T25 = T1L - T24;
+			      T2y = T1L + T24;
+			 }
+			 T2g = T2a - T2f;
+			 T2r = T2l - T2q;
+			 T2s = T2g - T2r;
+			 T2z = T2r + T2g;
+			 {
+			      E T4t, T4u, T4p, T4q;
+			      T4t = T3r + T3s;
+			      T4u = T1U + T23;
+			      T4v = T4t + T4u;
+			      T4X = T4t - T4u;
+			      T4p = T3l + T3m;
+			      T4q = T1q + T1x;
+			      T4r = T4p + T4q;
+			      T4U = T4p - T4q;
+			 }
+			 {
+			      E T3w, T3z, T2T, T2W;
+			      T3w = T2D + T2E;
+			      T3z = T3x - T3y;
+			      T3A = T3w + T3z;
+			      T3Z = T3z - T3w;
+			      T2T = T1b + T1k;
+			      T2W = T2U + T2V;
+			      T2X = T2T + T2W;
+			      T37 = T2T - T2W;
+			 }
+			 {
+			      E T3i, T3j, T2I, T2L;
+			      T3i = T3g - T3h;
+			      T3j = T2Q - T2P;
+			      T3k = T3i + T3j;
+			      T41 = T3i - T3j;
+			      T2I = T1F + T1K;
+			      T2L = T2J + T2K;
+			      T2M = T2I + T2L;
+			      T39 = T2I - T2L;
+			 }
+			 {
+			      E T3t, T3u, T2O, T2R;
+			      T3t = T3r - T3s;
+			      T3u = T2K - T2J;
+			      T3v = T3t + T3u;
+			      T3Y = T3t - T3u;
+			      T2O = TC + TH;
+			      T2R = T2P + T2Q;
+			      T2S = T2O + T2R;
+			      T36 = T2O - T2R;
+			 }
+			 T3n = T3l - T3m;
+			 T3o = T2U - T2V;
+			 T3p = T3n + T3o;
+			 T42 = T3n - T3o;
+			 {
+			      E Tc, T3M, T4, T8;
+			      T4 = W[18];
+			      T8 = W[19];
+			      Tc = FNMS(T8, Tb, T4 * T7);
+			      T3M = FMA(T4, Tb, T8 * T7);
+			      Td = T3 - Tc;
+			      T4G = T3L + T3M;
+			      T33 = Tc + T3;
+			      T3N = T3L - T3M;
+			 }
+			 {
+			      E Tm, T30, Tv, T31;
+			      {
+				   E Te, Ti, Tn, Tr;
+				   Te = W[8];
+				   Ti = W[9];
+				   Tm = FNMS(Ti, Tl, Te * Th);
+				   T30 = FMA(Ti, Th, Te * Tl);
+				   Tn = W[28];
+				   Tr = W[29];
+				   Tv = FNMS(Tr, Tu, Tn * Tq);
+				   T31 = FMA(Tr, Tq, Tn * Tu);
+			      }
+			      Tw = Tm - Tv;
+			      T4H = Tm + Tv;
+			      T32 = T30 + T31;
+			      T3O = T31 - T30;
+			 }
+		    }
+	       }
+	       {
+		    E T3C, T3E, Tx, T2u, T3d, T3e, T3D, T3f;
+		    {
+			 E T3q, T3B, T1A, T2t;
+			 T3q = T3k - T3p;
+			 T3B = T3v - T3A;
+			 T3C = FMA(KP475528258, T3q, KP293892626 * T3B);
+			 T3E = FNMS(KP293892626, T3q, KP475528258 * T3B);
+			 Tx = Td - Tw;
+			 T1A = T12 + T1z;
+			 T2t = T25 + T2s;
+			 T2u = T1A + T2t;
+			 T3d = KP279508497 * (T1A - T2t);
+			 T3e = FNMS(KP125000000, T2u, KP500000000 * Tx);
+		    }
+		    Ip[WS(rs, 5)] = KP500000000 * (Tx + T2u);
+		    T3D = T3d - T3e;
+		    Im[WS(rs, 2)] = T3D - T3E;
+		    Im[WS(rs, 6)] = T3D + T3E;
+		    T3f = T3d + T3e;
+		    Ip[WS(rs, 1)] = T3f - T3C;
+		    Ip[WS(rs, 9)] = T3f + T3C;
+	       }
+	       {
+		    E T3H, T3T, T3P, T3Q, T3K, T3R, T3U, T3S;
+		    {
+			 E T3F, T3G, T3I, T3J;
+			 T3F = T12 - T1z;
+			 T3G = T25 - T2s;
+			 T3H = FMA(KP475528258, T3F, KP293892626 * T3G);
+			 T3T = FNMS(KP293892626, T3F, KP475528258 * T3G);
+			 T3P = T3N + T3O;
+			 T3I = T3k + T3p;
+			 T3J = T3v + T3A;
+			 T3Q = T3I + T3J;
+			 T3K = KP279508497 * (T3I - T3J);
+			 T3R = FNMS(KP125000000, T3Q, KP500000000 * T3P);
+		    }
+		    Rp[WS(rs, 5)] = KP500000000 * (T3P + T3Q);
+		    T3U = T3R - T3K;
+		    Rm[WS(rs, 6)] = T3T + T3U;
+		    Rm[WS(rs, 2)] = T3U - T3T;
+		    T3S = T3K + T3R;
+		    Rp[WS(rs, 1)] = T3H + T3S;
+		    Rp[WS(rs, 9)] = T3S - T3H;
+	       }
+	       {
+		    E T44, T46, T2C, T2B, T3V, T3W, T45, T3X;
+		    {
+			 E T40, T43, T2x, T2A;
+			 T40 = T3Y - T3Z;
+			 T43 = T41 - T42;
+			 T44 = FNMS(KP293892626, T43, KP475528258 * T40);
+			 T46 = FMA(KP475528258, T43, KP293892626 * T40);
+			 T2C = Tw + Td;
+			 T2x = T2v - T2w;
+			 T2A = T2y + T2z;
+			 T2B = T2x - T2A;
+			 T3V = FMA(KP500000000, T2C, KP125000000 * T2B);
+			 T3W = KP279508497 * (T2x + T2A);
+		    }
+		    Im[WS(rs, 4)] = KP500000000 * (T2B - T2C);
+		    T45 = T3W - T3V;
+		    Im[0] = T45 - T46;
+		    Im[WS(rs, 8)] = T45 + T46;
+		    T3X = T3V + T3W;
+		    Ip[WS(rs, 3)] = T3X - T44;
+		    Ip[WS(rs, 7)] = T3X + T44;
+	       }
+	       {
+		    E T49, T4h, T4a, T4d, T4e, T4f, T4i, T4g;
+		    {
+			 E T47, T48, T4b, T4c;
+			 T47 = T2y - T2z;
+			 T48 = T2w + T2v;
+			 T49 = FNMS(KP293892626, T48, KP475528258 * T47);
+			 T4h = FMA(KP475528258, T48, KP293892626 * T47);
+			 T4a = T3N - T3O;
+			 T4b = T41 + T42;
+			 T4c = T3Y + T3Z;
+			 T4d = T4b + T4c;
+			 T4e = FNMS(KP125000000, T4d, KP500000000 * T4a);
+			 T4f = KP279508497 * (T4b - T4c);
+		    }
+		    Rm[WS(rs, 4)] = KP500000000 * (T4a + T4d);
+		    T4i = T4f + T4e;
+		    Rm[WS(rs, 8)] = T4h + T4i;
+		    Rm[0] = T4i - T4h;
+		    T4g = T4e - T4f;
+		    Rp[WS(rs, 3)] = T49 + T4g;
+		    Rp[WS(rs, 7)] = T4g - T49;
+	       }
+	       {
+		    E T50, T52, T34, T2Z, T4R, T4S, T51, T4T;
+		    {
+			 E T4W, T4Z, T2N, T2Y;
+			 T4W = T4U - T4V;
+			 T4Z = T4X - T4Y;
+			 T50 = FNMS(KP293892626, T4Z, KP475528258 * T4W);
+			 T52 = FMA(KP293892626, T4W, KP475528258 * T4Z);
+			 T34 = T32 + T33;
+			 T2N = T2H - T2M;
+			 T2Y = T2S + T2X;
+			 T2Z = T2N - T2Y;
+			 T4R = FMA(KP500000000, T34, KP125000000 * T2Z);
+			 T4S = KP279508497 * (T2Y + T2N);
+		    }
+		    Im[WS(rs, 9)] = KP500000000 * (T2Z - T34);
+		    T51 = T4R - T4S;
+		    Ip[WS(rs, 2)] = T51 + T52;
+		    Im[WS(rs, 1)] = T52 - T51;
+		    T4T = T4R + T4S;
+		    Ip[WS(rs, 6)] = T4T + T50;
+		    Im[WS(rs, 5)] = T50 - T4T;
+	       }
+	       {
+		    E T5c, T5d, T53, T56, T57, T58, T5e, T59;
+		    {
+			 E T5a, T5b, T54, T55;
+			 T5a = T2M + T2H;
+			 T5b = T2S - T2X;
+			 T5c = FNMS(KP293892626, T5b, KP475528258 * T5a);
+			 T5d = FMA(KP475528258, T5b, KP293892626 * T5a);
+			 T53 = T4G - T4H;
+			 T54 = T4V + T4U;
+			 T55 = T4X + T4Y;
+			 T56 = T54 + T55;
+			 T57 = FNMS(KP125000000, T56, KP500000000 * T53);
+			 T58 = KP279508497 * (T54 - T55);
+		    }
+		    Rm[WS(rs, 9)] = KP500000000 * (T53 + T56);
+		    T5e = T58 + T57;
+		    Rp[WS(rs, 6)] = T5d + T5e;
+		    Rm[WS(rs, 5)] = T5e - T5d;
+		    T59 = T57 - T58;
+		    Rp[WS(rs, 2)] = T59 - T5c;
+		    Rm[WS(rs, 1)] = T5c + T59;
+	       }
+	       {
+		    E T4A, T4C, T35, T3c, T4j, T4k, T4B, T4l;
+		    {
+			 E T4s, T4z, T38, T3b;
+			 T4s = T4o - T4r;
+			 T4z = T4v - T4y;
+			 T4A = FNMS(KP475528258, T4z, KP293892626 * T4s);
+			 T4C = FMA(KP475528258, T4s, KP293892626 * T4z);
+			 T35 = T33 - T32;
+			 T38 = T36 + T37;
+			 T3b = T39 + T3a;
+			 T3c = T38 + T3b;
+			 T4j = FNMS(KP125000000, T3c, KP500000000 * T35);
+			 T4k = KP279508497 * (T38 - T3b);
+		    }
+		    Ip[0] = KP500000000 * (T35 + T3c);
+		    T4B = T4k + T4j;
+		    Ip[WS(rs, 4)] = T4B + T4C;
+		    Im[WS(rs, 3)] = T4C - T4B;
+		    T4l = T4j - T4k;
+		    Ip[WS(rs, 8)] = T4l + T4A;
+		    Im[WS(rs, 7)] = T4A - T4l;
+	       }
+	       {
+		    E T4O, T4P, T4I, T4J, T4F, T4K, T4Q, T4L;
+		    {
+			 E T4M, T4N, T4D, T4E;
+			 T4M = T36 - T37;
+			 T4N = T39 - T3a;
+			 T4O = FMA(KP475528258, T4M, KP293892626 * T4N);
+			 T4P = FNMS(KP293892626, T4M, KP475528258 * T4N);
+			 T4I = T4G + T4H;
+			 T4D = T4o + T4r;
+			 T4E = T4v + T4y;
+			 T4J = T4D + T4E;
+			 T4F = KP279508497 * (T4D - T4E);
+			 T4K = FNMS(KP125000000, T4J, KP500000000 * T4I);
+		    }
+		    Rp[0] = KP500000000 * (T4I + T4J);
+		    T4Q = T4K - T4F;
+		    Rp[WS(rs, 8)] = T4P + T4Q;
+		    Rm[WS(rs, 7)] = T4Q - T4P;
+		    T4L = T4F + T4K;
+		    Rp[WS(rs, 4)] = T4L - T4O;
+		    Rm[WS(rs, 3)] = T4O + T4L;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {224, 78, 62, 0} };
+
+void X(codelet_hc2cfdft_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1943 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hc2cfdft_32 -include hc2cf.h */
+
+/*
+ * This function contains 498 FP additions, 324 FP multiplications,
+ * (or, 300 additions, 126 multiplications, 198 fused multiply/add),
+ * 172 stack variables, 8 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T9X, Ta0;
+	       {
+		    E T3B, T89, T61, T8l, T2F, T7p, T8t, T4B, T7I, T5e, T7L, T1n, T7R, T5E, T82;
+		    E T4u, T3m, T8k, T5W, T8a, T2r, T8u, T4G, T7q, T59, T7K, T7H, T12, T5z, T81;
+		    E T7Q, T4h, T4Y, T7D, T7A, Tl, T5o, T3Q, T84, T7V, T2V, T4M, T7t, T7s, T1K;
+		    E T5L, T8e, T8n, T38, T7v, T4R, T7w, T25, T5Q, T8h, T8o, T3V, T3S, T5p, T3T;
+		    E T41, Tz, T3Y, TE, TA, T51, T5r, T3Z, Tv, T50, TB, T3U, T40;
+		    {
+			 E T49, T46, T5v, T47, T4f, TV, T4c, T10, TW, T57, T5x, T4d, TR, T56, TX;
+			 E T48, T4e;
+			 {
+			      E T4m, T4j, T5A, T4k, T4s, T1g, T4p, T1l, T1h, T5c, T5C, T4q, T1c, T5b, T1i;
+			      E T4l, T4r;
+			      {
+				   E T2E, T4y, T2B, T4A;
+				   {
+					E T3y, T3z, T3t, T5Z, T3x, T2v, T3r, T3q, T3n, T2A, T3o, T2s;
+					{
+					     E T2C, T2D, T3w, T3u, T3v;
+					     T2C = Ip[0];
+					     T2D = Im[0];
+					     T3u = Rm[0];
+					     T3v = Rp[0];
+					     T3y = W[1];
+					     T3z = T2C + T2D;
+					     T2E = T2C - T2D;
+					     T4y = T3v + T3u;
+					     T3w = T3u - T3v;
+					     T3t = W[0];
+					     {
+						  E T2y, T2z, T2t, T2u;
+						  T2t = Ip[WS(rs, 8)];
+						  T2u = Im[WS(rs, 8)];
+						  T5Z = T3y * T3w;
+						  T3x = T3t * T3w;
+						  T2y = Rp[WS(rs, 8)];
+						  T2v = T2t - T2u;
+						  T3r = T2t + T2u;
+						  T2z = Rm[WS(rs, 8)];
+						  T3q = W[33];
+						  T3n = W[32];
+						  T2A = T2y + T2z;
+						  T3o = T2z - T2y;
+						  T2s = W[30];
+					     }
+					}
+					{
+					     E T3A, T5X, T4z, T2w, T3s, T3p, T5Y, T60, T2x;
+					     T3A = FNMS(T3y, T3z, T3x);
+					     T3p = T3n * T3o;
+					     T5X = T3q * T3o;
+					     T4z = T2s * T2A;
+					     T2w = T2s * T2v;
+					     T3s = FNMS(T3q, T3r, T3p);
+					     T5Y = FMA(T3n, T3r, T5X);
+					     T60 = FMA(T3t, T3z, T5Z);
+					     T2x = W[31];
+					     T3B = T3s + T3A;
+					     T89 = T3A - T3s;
+					     T61 = T5Y + T60;
+					     T8l = T60 - T5Y;
+					     T2B = FNMS(T2x, T2A, T2w);
+					     T4A = FMA(T2x, T2v, T4z);
+					}
+				   }
+				   {
+					E T16, T1b, T17, T5a, T1d, T4o, T18;
+					{
+					     E T19, T1a, T13, T4i, T14, T15;
+					     T14 = Ip[WS(rs, 3)];
+					     T15 = Im[WS(rs, 3)];
+					     T2F = T2B + T2E;
+					     T7p = T2E - T2B;
+					     T8t = T4y - T4A;
+					     T4B = T4y + T4A;
+					     T4m = T14 + T15;
+					     T16 = T14 - T15;
+					     T19 = Rp[WS(rs, 3)];
+					     T1a = Rm[WS(rs, 3)];
+					     T13 = W[10];
+					     T4i = W[12];
+					     {
+						  E T1e, T1f, T1j, T1k;
+						  T1e = Ip[WS(rs, 11)];
+						  T4j = T19 - T1a;
+						  T1b = T19 + T1a;
+						  T17 = T13 * T16;
+						  T5A = T4i * T4m;
+						  T4k = T4i * T4j;
+						  T5a = T13 * T1b;
+						  T1f = Im[WS(rs, 11)];
+						  T1j = Rp[WS(rs, 11)];
+						  T1k = Rm[WS(rs, 11)];
+						  T1d = W[42];
+						  T4s = T1e + T1f;
+						  T1g = T1e - T1f;
+						  T4p = T1j - T1k;
+						  T1l = T1j + T1k;
+						  T4o = W[44];
+						  T1h = T1d * T1g;
+					     }
+					}
+					T18 = W[11];
+					T5c = T1d * T1l;
+					T5C = T4o * T4s;
+					T4q = T4o * T4p;
+					T1c = FNMS(T18, T1b, T17);
+					T5b = FMA(T18, T16, T5a);
+					T1i = W[43];
+					T4l = W[13];
+					T4r = W[45];
+				   }
+			      }
+			      {
+				   E T4D, T2g, T2q, T4F;
+				   {
+					E T3d, T3e, T2a, T2f, T3a, T5S, T3c, T4C, T2b, T3j, T2k, T3k, T2p, T3h, T3g;
+					E T2h, T5U, T3b, T27;
+					{
+					     E T28, T29, T2d, T2e, T5d, T1m;
+					     T28 = Ip[WS(rs, 4)];
+					     T5d = FMA(T1i, T1g, T5c);
+					     T1m = FNMS(T1i, T1l, T1h);
+					     {
+						  E T5B, T4n, T5D, T4t;
+						  T5B = FNMS(T4l, T4j, T5A);
+						  T4n = FMA(T4l, T4m, T4k);
+						  T5D = FNMS(T4r, T4p, T5C);
+						  T4t = FMA(T4r, T4s, T4q);
+						  T7I = T5b - T5d;
+						  T5e = T5b + T5d;
+						  T7L = T1c - T1m;
+						  T1n = T1c + T1m;
+						  T7R = T5D - T5B;
+						  T5E = T5B + T5D;
+						  T82 = T4t - T4n;
+						  T4u = T4n + T4t;
+						  T29 = Im[WS(rs, 4)];
+					     }
+					     T2d = Rp[WS(rs, 4)];
+					     T2e = Rm[WS(rs, 4)];
+					     T3d = W[17];
+					     T3e = T28 + T29;
+					     T2a = T28 - T29;
+					     T3b = T2e - T2d;
+					     T2f = T2d + T2e;
+					     T3a = W[16];
+					     T27 = W[14];
+					     T5S = T3d * T3b;
+					}
+					{
+					     E T2i, T2j, T2n, T2o;
+					     T2i = Ip[WS(rs, 12)];
+					     T3c = T3a * T3b;
+					     T4C = T27 * T2f;
+					     T2b = T27 * T2a;
+					     T2j = Im[WS(rs, 12)];
+					     T2n = Rp[WS(rs, 12)];
+					     T2o = Rm[WS(rs, 12)];
+					     T3j = W[49];
+					     T2k = T2i - T2j;
+					     T3k = T2i + T2j;
+					     T2p = T2n + T2o;
+					     T3h = T2o - T2n;
+					     T3g = W[48];
+					     T2h = W[46];
+					     T5U = T3j * T3h;
+					}
+					{
+					     E T3f, T3i, T4E, T2l;
+					     T3f = FNMS(T3d, T3e, T3c);
+					     T3i = T3g * T3h;
+					     T4E = T2h * T2p;
+					     T2l = T2h * T2k;
+					     {
+						  E T5T, T3l, T5V, T2c, T2m;
+						  T5T = FMA(T3a, T3e, T5S);
+						  T3l = FNMS(T3j, T3k, T3i);
+						  T5V = FMA(T3g, T3k, T5U);
+						  T2c = W[15];
+						  T2m = W[47];
+						  T3m = T3f + T3l;
+						  T8k = T3f - T3l;
+						  T5W = T5T + T5V;
+						  T8a = T5T - T5V;
+						  T4D = FMA(T2c, T2a, T4C);
+						  T2g = FNMS(T2c, T2f, T2b);
+						  T2q = FNMS(T2m, T2p, T2l);
+						  T4F = FMA(T2m, T2k, T4E);
+					     }
+					}
+				   }
+				   {
+					E TL, TQ, TM, T55, TS, T4b, TN;
+					{
+					     E TO, TP, TI, T45, TJ, TK;
+					     TJ = Ip[WS(rs, 15)];
+					     TK = Im[WS(rs, 15)];
+					     T2r = T2g + T2q;
+					     T8u = T2g - T2q;
+					     T4G = T4D + T4F;
+					     T7q = T4D - T4F;
+					     T49 = TJ + TK;
+					     TL = TJ - TK;
+					     TO = Rp[WS(rs, 15)];
+					     TP = Rm[WS(rs, 15)];
+					     TI = W[58];
+					     T45 = W[60];
+					     {
+						  E TT, TU, TY, TZ;
+						  TT = Ip[WS(rs, 7)];
+						  T46 = TO - TP;
+						  TQ = TO + TP;
+						  TM = TI * TL;
+						  T5v = T45 * T49;
+						  T47 = T45 * T46;
+						  T55 = TI * TQ;
+						  TU = Im[WS(rs, 7)];
+						  TY = Rp[WS(rs, 7)];
+						  TZ = Rm[WS(rs, 7)];
+						  TS = W[26];
+						  T4f = TT + TU;
+						  TV = TT - TU;
+						  T4c = TY - TZ;
+						  T10 = TY + TZ;
+						  T4b = W[28];
+						  TW = TS * TV;
+					     }
+					}
+					TN = W[59];
+					T57 = TS * T10;
+					T5x = T4b * T4f;
+					T4d = T4b * T4c;
+					TR = FNMS(TN, TQ, TM);
+					T56 = FMA(TN, TL, T55);
+					TX = W[27];
+					T48 = W[61];
+					T4e = W[29];
+				   }
+			      }
+			 }
+			 {
+			      E T8c, T8d, T8f, T8g;
+			      {
+				   E T3I, T3F, T5k, T3G, T3O, Te, T3L, Tj, Tf, T4W, T5m, T3M, Ta, T4V, Tg;
+				   E T3H, T3N;
+				   {
+					E T4, T9, T5, T4U, Tb, T3K, T1, T3E, T6;
+					{
+					     E T2, T3, T7, T8, T58, T11;
+					     T2 = Ip[WS(rs, 1)];
+					     T58 = FMA(TX, TV, T57);
+					     T11 = FNMS(TX, T10, TW);
+					     {
+						  E T5w, T4a, T5y, T4g;
+						  T5w = FNMS(T48, T46, T5v);
+						  T4a = FMA(T48, T49, T47);
+						  T5y = FNMS(T4e, T4c, T5x);
+						  T4g = FMA(T4e, T4f, T4d);
+						  T59 = T56 + T58;
+						  T7K = T56 - T58;
+						  T7H = TR - T11;
+						  T12 = TR + T11;
+						  T5z = T5w + T5y;
+						  T81 = T5w - T5y;
+						  T7Q = T4g - T4a;
+						  T4h = T4a + T4g;
+						  T3 = Im[WS(rs, 1)];
+					     }
+					     T7 = Rp[WS(rs, 1)];
+					     T8 = Rm[WS(rs, 1)];
+					     T1 = W[2];
+					     T3I = T2 + T3;
+					     T4 = T2 - T3;
+					     T3F = T7 - T8;
+					     T9 = T7 + T8;
+					     T3E = W[4];
+					     T5 = T1 * T4;
+					}
+					{
+					     E Tc, Td, Th, Ti;
+					     Tc = Ip[WS(rs, 9)];
+					     T4U = T1 * T9;
+					     T5k = T3E * T3I;
+					     T3G = T3E * T3F;
+					     Td = Im[WS(rs, 9)];
+					     Th = Rp[WS(rs, 9)];
+					     Ti = Rm[WS(rs, 9)];
+					     Tb = W[34];
+					     T3O = Tc + Td;
+					     Te = Tc - Td;
+					     T3L = Th - Ti;
+					     Tj = Th + Ti;
+					     T3K = W[36];
+					     Tf = Tb * Te;
+					}
+					T6 = W[3];
+					T4W = Tb * Tj;
+					T5m = T3K * T3O;
+					T3M = T3K * T3L;
+					Ta = FNMS(T6, T9, T5);
+					T4V = FMA(T6, T4, T4U);
+					Tg = W[35];
+					T3H = W[5];
+					T3N = W[37];
+				   }
+				   {
+					E T1t, T2N, T2M, T2J, T1y, T2L, T5H, T4I, T1u, T2S, T1D, T2T, T1I, T2Q, T2P;
+					E T1A, T5J;
+					{
+					     E T2K, T1q, T1w, T1x;
+					     {
+						  E T1r, T7U, T7T, T1s, T4X, Tk;
+						  T1r = Ip[WS(rs, 2)];
+						  T4X = FMA(Tg, Te, T4W);
+						  Tk = FNMS(Tg, Tj, Tf);
+						  {
+						       E T5l, T3J, T5n, T3P;
+						       T5l = FNMS(T3H, T3F, T5k);
+						       T3J = FMA(T3H, T3I, T3G);
+						       T5n = FNMS(T3N, T3L, T5m);
+						       T3P = FMA(T3N, T3O, T3M);
+						       T4Y = T4V + T4X;
+						       T7D = T4V - T4X;
+						       T7A = Ta - Tk;
+						       Tl = Ta + Tk;
+						       T7U = T5l - T5n;
+						       T5o = T5l + T5n;
+						       T7T = T3P - T3J;
+						       T3Q = T3J + T3P;
+						       T1s = Im[WS(rs, 2)];
+						  }
+						  T1w = Rp[WS(rs, 2)];
+						  T84 = T7U + T7T;
+						  T7V = T7T - T7U;
+						  T1t = T1r - T1s;
+						  T2N = T1r + T1s;
+						  T1x = Rm[WS(rs, 2)];
+					     }
+					     T2M = W[9];
+					     T2J = W[8];
+					     T1y = T1w + T1x;
+					     T2K = T1x - T1w;
+					     T1q = W[6];
+					     {
+						  E T1B, T1C, T1G, T1H;
+						  T1B = Ip[WS(rs, 10)];
+						  T2L = T2J * T2K;
+						  T5H = T2M * T2K;
+						  T4I = T1q * T1y;
+						  T1u = T1q * T1t;
+						  T1C = Im[WS(rs, 10)];
+						  T1G = Rp[WS(rs, 10)];
+						  T1H = Rm[WS(rs, 10)];
+						  T2S = W[41];
+						  T1D = T1B - T1C;
+						  T2T = T1B + T1C;
+						  T1I = T1G + T1H;
+						  T2Q = T1H - T1G;
+						  T2P = W[40];
+						  T1A = W[38];
+						  T5J = T2S * T2Q;
+					     }
+					}
+					{
+					     E T2R, T4K, T1E, T1z, T4J, T1F, T1v, T2O, T2U;
+					     T1v = W[7];
+					     T2R = T2P * T2Q;
+					     T4K = T1A * T1I;
+					     T1E = T1A * T1D;
+					     T1z = FNMS(T1v, T1y, T1u);
+					     T4J = FMA(T1v, T1t, T4I);
+					     T1F = W[39];
+					     T2O = FNMS(T2M, T2N, T2L);
+					     T2U = FNMS(T2S, T2T, T2R);
+					     {
+						  E T5I, T4L, T1J, T5K;
+						  T5I = FMA(T2J, T2N, T5H);
+						  T4L = FMA(T1F, T1D, T4K);
+						  T1J = FNMS(T1F, T1I, T1E);
+						  T8c = T2O - T2U;
+						  T2V = T2O + T2U;
+						  T5K = FMA(T2P, T2T, T5J);
+						  T4M = T4J + T4L;
+						  T7t = T4J - T4L;
+						  T7s = T1z - T1J;
+						  T1K = T1z + T1J;
+						  T8d = T5I - T5K;
+						  T5L = T5I + T5K;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T2Z, T30, T1O, T1T, T2W, T5M, T2Y, T4N, T1P, T35, T1Y, T36, T23, T33, T32;
+				   E T1V, T5O, T2X, T1L;
+				   {
+					E T1M, T1N, T1R, T1S;
+					T1M = Ip[WS(rs, 14)];
+					T8e = T8c - T8d;
+					T8n = T8c + T8d;
+					T1N = Im[WS(rs, 14)];
+					T1R = Rp[WS(rs, 14)];
+					T1S = Rm[WS(rs, 14)];
+					T2Z = W[57];
+					T30 = T1M + T1N;
+					T1O = T1M - T1N;
+					T2X = T1S - T1R;
+					T1T = T1R + T1S;
+					T2W = W[56];
+					T1L = W[54];
+					T5M = T2Z * T2X;
+				   }
+				   {
+					E T1W, T1X, T21, T22;
+					T1W = Ip[WS(rs, 6)];
+					T2Y = T2W * T2X;
+					T4N = T1L * T1T;
+					T1P = T1L * T1O;
+					T1X = Im[WS(rs, 6)];
+					T21 = Rp[WS(rs, 6)];
+					T22 = Rm[WS(rs, 6)];
+					T35 = W[25];
+					T1Y = T1W - T1X;
+					T36 = T1W + T1X;
+					T23 = T21 + T22;
+					T33 = T22 - T21;
+					T32 = W[24];
+					T1V = W[22];
+					T5O = T35 * T33;
+				   }
+				   {
+					E T34, T4P, T1Z, T1U, T4O, T20, T1Q, T31, T37;
+					T1Q = W[55];
+					T34 = T32 * T33;
+					T4P = T1V * T23;
+					T1Z = T1V * T1Y;
+					T1U = FNMS(T1Q, T1T, T1P);
+					T4O = FMA(T1Q, T1O, T4N);
+					T20 = W[23];
+					T31 = FNMS(T2Z, T30, T2Y);
+					T37 = FNMS(T35, T36, T34);
+					{
+					     E T5N, T4Q, T24, T5P;
+					     T5N = FMA(T2W, T30, T5M);
+					     T4Q = FMA(T20, T1Y, T4P);
+					     T24 = FNMS(T20, T23, T1Z);
+					     T8f = T31 - T37;
+					     T38 = T31 + T37;
+					     T5P = FMA(T32, T36, T5O);
+					     T7v = T4O - T4Q;
+					     T4R = T4O + T4Q;
+					     T7w = T1U - T24;
+					     T25 = T1U + T24;
+					     T8g = T5N - T5P;
+					     T5Q = T5N + T5P;
+					}
+				   }
+			      }
+			      {
+				   E Tp, Tu, Tq, T4Z, Tw, T3X, Tm, T3R, Tr;
+				   {
+					E Tn, To, Ts, Tt;
+					Tn = Ip[WS(rs, 5)];
+					T8h = T8f + T8g;
+					T8o = T8g - T8f;
+					To = Im[WS(rs, 5)];
+					Ts = Rp[WS(rs, 5)];
+					Tt = Rm[WS(rs, 5)];
+					Tm = W[18];
+					T3V = Tn + To;
+					Tp = Tn - To;
+					T3S = Ts - Tt;
+					Tu = Ts + Tt;
+					T3R = W[20];
+					Tq = Tm * Tp;
+				   }
+				   {
+					E Tx, Ty, TC, TD;
+					Tx = Ip[WS(rs, 13)];
+					T4Z = Tm * Tu;
+					T5p = T3R * T3V;
+					T3T = T3R * T3S;
+					Ty = Im[WS(rs, 13)];
+					TC = Rp[WS(rs, 13)];
+					TD = Rm[WS(rs, 13)];
+					Tw = W[50];
+					T41 = Tx + Ty;
+					Tz = Tx - Ty;
+					T3Y = TC - TD;
+					TE = TC + TD;
+					T3X = W[52];
+					TA = Tw * Tz;
+				   }
+				   Tr = W[19];
+				   T51 = Tw * TE;
+				   T5r = T3X * T41;
+				   T3Z = T3X * T3Y;
+				   Tv = FNMS(Tr, Tu, Tq);
+				   T50 = FMA(Tr, Tp, T4Z);
+				   TB = W[51];
+				   T3U = W[21];
+				   T40 = W[53];
+			      }
+			 }
+		    }
+		    {
+			 E T6y, T7B, T7E, T6u, T6S, T85, T7Y, T6s, T6v, T6x, T6R, T6r, T6F, T6D, T6C;
+			 E T6G, T6M, T6K, T6J, T6N, T6l, T6o, T7j, T7m;
+			 {
+			      E T6i, T1p, T68, T2H, T67, T5g, T6h, T4T, T4w, T5G, T6d, T3D, T6c, T6m, T63;
+			      E T6e;
+			      {
+				   E T5t, T43, T26, T2G, T54, T5f, T4H, T4S;
+				   {
+					E T1o, T53, T7W, T7X, TH, T52, TF, T5q;
+					T6y = T12 - T1n;
+					T1o = T12 + T1n;
+					T52 = FMA(TB, Tz, T51);
+					TF = FNMS(TB, TE, TA);
+					T5q = FNMS(T3U, T3S, T5p);
+					{
+					     E T3W, T5s, T42, TG;
+					     T3W = FMA(T3U, T3V, T3T);
+					     T5s = FNMS(T40, T3Y, T5r);
+					     T42 = FMA(T40, T41, T3Z);
+					     T7B = T50 - T52;
+					     T53 = T50 + T52;
+					     T7E = Tv - TF;
+					     TG = Tv + TF;
+					     T7W = T5s - T5q;
+					     T5t = T5q + T5s;
+					     T7X = T3W - T42;
+					     T43 = T3W + T42;
+					     TH = Tl + TG;
+					     T6u = Tl - TG;
+					}
+					T6S = T1K - T25;
+					T26 = T1K + T25;
+					T85 = T7W - T7X;
+					T7Y = T7W + T7X;
+					T6i = TH - T1o;
+					T1p = TH + T1o;
+					T2G = T2r + T2F;
+					T6s = T2F - T2r;
+					T6v = T4Y - T53;
+					T54 = T4Y + T53;
+					T5f = T59 + T5e;
+					T6x = T59 - T5e;
+				   }
+				   T6R = T4B - T4G;
+				   T4H = T4B + T4G;
+				   T68 = T2G - T26;
+				   T2H = T26 + T2G;
+				   T67 = T5f - T54;
+				   T5g = T54 + T5f;
+				   T4S = T4M + T4R;
+				   T6r = T4R - T4M;
+				   {
+					E T5u, T6b, T5F, T44, T4v;
+					T6F = T43 - T3Q;
+					T44 = T3Q + T43;
+					T4v = T4h + T4u;
+					T6D = T4u - T4h;
+					T6C = T5t - T5o;
+					T5u = T5o + T5t;
+					T6h = T4H - T4S;
+					T4T = T4H + T4S;
+					T6b = T44 - T4v;
+					T4w = T44 + T4v;
+					T6G = T5z - T5E;
+					T5F = T5z + T5E;
+					{
+					     E T5R, T62, T39, T3C, T6a;
+					     T6M = T2V - T38;
+					     T39 = T2V + T38;
+					     T3C = T3m + T3B;
+					     T6K = T3B - T3m;
+					     T6a = T5F - T5u;
+					     T5G = T5u + T5F;
+					     T6J = T5Q - T5L;
+					     T5R = T5L + T5Q;
+					     T6d = T3C - T39;
+					     T3D = T39 + T3C;
+					     T6N = T61 - T5W;
+					     T62 = T5W + T61;
+					     T6c = T6a + T6b;
+					     T6m = T6a - T6b;
+					     T63 = T5R + T62;
+					     T6e = T62 - T5R;
+					}
+				   }
+			      }
+			      {
+				   E T5j, T6n, T6f, T64;
+				   {
+					E T5i, T5h, T65, T66, T2I, T4x;
+					T5j = T2H - T1p;
+					T2I = T1p + T2H;
+					T4x = T3D - T4w;
+					T5i = T4w + T3D;
+					T6n = T6d + T6e;
+					T6f = T6d - T6e;
+					T5h = T4T - T5g;
+					T65 = T4T + T5g;
+					Im[WS(rs, 15)] = KP500000000 * (T4x - T2I);
+					Ip[0] = KP500000000 * (T2I + T4x);
+					T66 = T5G + T63;
+					T64 = T5G - T63;
+					Rp[0] = KP500000000 * (T65 + T66);
+					Rm[WS(rs, 15)] = KP500000000 * (T65 - T66);
+					Rp[WS(rs, 8)] = KP500000000 * (T5h + T5i);
+					Rm[WS(rs, 7)] = KP500000000 * (T5h - T5i);
+				   }
+				   {
+					E T6k, T6j, T6p, T6q, T69, T6g;
+					T6l = T68 - T67;
+					T69 = T67 + T68;
+					T6g = T6c + T6f;
+					T6k = T6f - T6c;
+					T6j = T6h - T6i;
+					T6p = T6h + T6i;
+					Im[WS(rs, 7)] = KP500000000 * (T64 - T5j);
+					Ip[WS(rs, 8)] = KP500000000 * (T5j + T64);
+					Im[WS(rs, 11)] = -(KP500000000 * (FNMS(KP707106781, T6g, T69)));
+					Ip[WS(rs, 4)] = KP500000000 * (FMA(KP707106781, T6g, T69));
+					T6q = T6m + T6n;
+					T6o = T6m - T6n;
+					Rp[WS(rs, 4)] = KP500000000 * (FMA(KP707106781, T6q, T6p));
+					Rm[WS(rs, 11)] = KP500000000 * (FNMS(KP707106781, T6q, T6p));
+					Rp[WS(rs, 12)] = KP500000000 * (FMA(KP707106781, T6k, T6j));
+					Rm[WS(rs, 3)] = KP500000000 * (FNMS(KP707106781, T6k, T6j));
+				   }
+			      }
+			 }
+			 {
+			      E T75, T6t, T7f, T6T, T76, T6W, T7g, T6A, T7b, T6L, T7a, T7k, T70, T6I, T6U;
+			      E T6w;
+			      Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP707106781, T6o, T6l)));
+			      Ip[WS(rs, 12)] = KP500000000 * (FMA(KP707106781, T6o, T6l));
+			      T75 = T6s - T6r;
+			      T6t = T6r + T6s;
+			      T7f = T6R - T6S;
+			      T6T = T6R + T6S;
+			      T6U = T6v + T6u;
+			      T6w = T6u - T6v;
+			      {
+				   E T78, T6E, T6V, T6z, T79, T6H;
+				   T6V = T6x - T6y;
+				   T6z = T6x + T6y;
+				   T78 = T6C - T6D;
+				   T6E = T6C + T6D;
+				   T76 = T6V - T6U;
+				   T6W = T6U + T6V;
+				   T7g = T6w - T6z;
+				   T6A = T6w + T6z;
+				   T79 = T6G - T6F;
+				   T6H = T6F + T6G;
+				   T7b = T6K - T6J;
+				   T6L = T6J + T6K;
+				   T7a = FMA(KP414213562, T79, T78);
+				   T7k = FNMS(KP414213562, T78, T79);
+				   T70 = FNMS(KP414213562, T6E, T6H);
+				   T6I = FMA(KP414213562, T6H, T6E);
+			      }
+			      {
+				   E T6Z, T6B, T73, T6X, T7c, T6O;
+				   T6Z = FNMS(KP707106781, T6A, T6t);
+				   T6B = FMA(KP707106781, T6A, T6t);
+				   T73 = FMA(KP707106781, T6W, T6T);
+				   T6X = FNMS(KP707106781, T6W, T6T);
+				   T7c = T6N - T6M;
+				   T6O = T6M + T6N;
+				   {
+					E T7i, T7h, T7n, T7o;
+					{
+					     E T77, T7l, T71, T6P, T7e, T7d;
+					     T7j = FMA(KP707106781, T76, T75);
+					     T77 = FNMS(KP707106781, T76, T75);
+					     T7d = FMA(KP414213562, T7c, T7b);
+					     T7l = FNMS(KP414213562, T7b, T7c);
+					     T71 = FMA(KP414213562, T6L, T6O);
+					     T6P = FNMS(KP414213562, T6O, T6L);
+					     T7e = T7a - T7d;
+					     T7i = T7a + T7d;
+					     T7h = FMA(KP707106781, T7g, T7f);
+					     T7n = FNMS(KP707106781, T7g, T7f);
+					     {
+						  E T72, T74, T6Y, T6Q;
+						  T72 = T70 - T71;
+						  T74 = T70 + T71;
+						  T6Y = T6P - T6I;
+						  T6Q = T6I + T6P;
+						  Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP923879532, T7e, T77)));
+						  Ip[WS(rs, 14)] = KP500000000 * (FMA(KP923879532, T7e, T77));
+						  Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP923879532, T72, T6Z)));
+						  Ip[WS(rs, 10)] = KP500000000 * (FMA(KP923879532, T72, T6Z));
+						  Rp[WS(rs, 2)] = KP500000000 * (FMA(KP923879532, T74, T73));
+						  Rm[WS(rs, 13)] = KP500000000 * (FNMS(KP923879532, T74, T73));
+						  Rp[WS(rs, 10)] = KP500000000 * (FMA(KP923879532, T6Y, T6X));
+						  Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP923879532, T6Y, T6X));
+						  Im[WS(rs, 13)] = -(KP500000000 * (FNMS(KP923879532, T6Q, T6B)));
+						  Ip[WS(rs, 2)] = KP500000000 * (FMA(KP923879532, T6Q, T6B));
+						  T7o = T7k + T7l;
+						  T7m = T7k - T7l;
+					     }
+					}
+					Rm[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T7o, T7n));
+					Rp[WS(rs, 14)] = KP500000000 * (FNMS(KP923879532, T7o, T7n));
+					Rp[WS(rs, 6)] = KP500000000 * (FMA(KP923879532, T7i, T7h));
+					Rm[WS(rs, 9)] = KP500000000 * (FNMS(KP923879532, T7i, T7h));
+				   }
+			      }
+			 }
+			 {
+			      E T9x, T9T, T8L, T7z, T97, T9J, T8V, T8z, T8M, T8C, T8W, T7O, T9O, T9Y, T9E;
+			      E T9t, T8Q, T90, T8G, T88, T8p, T8m, T9K, T9A, T9U, T9e, T8R, T8j, T9R, T9Z;
+			      E T9F, T9m;
+			      {
+				   E T9c, T9b, T99, T98, T7S, T86, T83, T9q, T9M, T9p, T9r, T7Z, T9z, T9a;
+				   {
+					E T95, T7r, T9v, T8v, T8w, T8x, T9w, T7y, T7u, T7x;
+					T95 = T7q + T7p;
+					T7r = T7p - T7q;
+					T9v = T8t - T8u;
+					T8v = T8t + T8u;
+					T8w = T7t + T7s;
+					T7u = T7s - T7t;
+					Im[WS(rs, 9)] = -(KP500000000 * (FNMS(KP923879532, T7m, T7j)));
+					Ip[WS(rs, 6)] = KP500000000 * (FMA(KP923879532, T7m, T7j));
+					T7x = T7v + T7w;
+					T8x = T7v - T7w;
+					T9w = T7u - T7x;
+					T7y = T7u + T7x;
+					{
+					     E T7J, T8A, T7G, T7M;
+					     {
+						  E T7C, T96, T8y, T7F;
+						  T9c = T7A + T7B;
+						  T7C = T7A - T7B;
+						  T9x = FMA(KP707106781, T9w, T9v);
+						  T9T = FNMS(KP707106781, T9w, T9v);
+						  T8L = FNMS(KP707106781, T7y, T7r);
+						  T7z = FMA(KP707106781, T7y, T7r);
+						  T96 = T8x - T8w;
+						  T8y = T8w + T8x;
+						  T7F = T7D + T7E;
+						  T9b = T7D - T7E;
+						  T99 = T7H + T7I;
+						  T7J = T7H - T7I;
+						  T97 = FMA(KP707106781, T96, T95);
+						  T9J = FNMS(KP707106781, T96, T95);
+						  T8V = FNMS(KP707106781, T8y, T8v);
+						  T8z = FMA(KP707106781, T8y, T8v);
+						  T8A = FMA(KP414213562, T7C, T7F);
+						  T7G = FNMS(KP414213562, T7F, T7C);
+						  T7M = T7K + T7L;
+						  T98 = T7K - T7L;
+					     }
+					     {
+						  E T9n, T9o, T8B, T7N;
+						  T7S = T7Q + T7R;
+						  T9n = T7R - T7Q;
+						  T9o = T85 - T84;
+						  T86 = T84 + T85;
+						  T83 = T81 + T82;
+						  T9q = T81 - T82;
+						  T8B = FNMS(KP414213562, T7J, T7M);
+						  T7N = FMA(KP414213562, T7M, T7J);
+						  T9M = FMA(KP707106781, T9o, T9n);
+						  T9p = FNMS(KP707106781, T9o, T9n);
+						  T8M = T8B - T8A;
+						  T8C = T8A + T8B;
+						  T8W = T7G - T7N;
+						  T7O = T7G + T7N;
+						  T9r = T7Y - T7V;
+						  T7Z = T7V + T7Y;
+					     }
+					}
+				   }
+				   {
+					E T8O, T80, T9N, T9s, T8P, T87;
+					T9N = FMA(KP707106781, T9r, T9q);
+					T9s = FNMS(KP707106781, T9r, T9q);
+					T8O = FNMS(KP707106781, T7Z, T7S);
+					T80 = FMA(KP707106781, T7Z, T7S);
+					T9O = FMA(KP198912367, T9N, T9M);
+					T9Y = FNMS(KP198912367, T9M, T9N);
+					T9E = FMA(KP668178637, T9p, T9s);
+					T9t = FNMS(KP668178637, T9s, T9p);
+					T8P = FNMS(KP707106781, T86, T83);
+					T87 = FMA(KP707106781, T86, T83);
+					T9z = FNMS(KP414213562, T98, T99);
+					T9a = FMA(KP414213562, T99, T98);
+					T8Q = FNMS(KP668178637, T8P, T8O);
+					T90 = FMA(KP668178637, T8O, T8P);
+					T8G = FNMS(KP198912367, T80, T87);
+					T88 = FMA(KP198912367, T87, T80);
+				   }
+				   {
+					E T8b, T9j, T9P, T9i, T9k, T8i, T9Q, T9l;
+					{
+					     E T9g, T9h, T9y, T9d;
+					     T8b = T89 - T8a;
+					     T9g = T8a + T89;
+					     T9h = T8n - T8o;
+					     T8p = T8n + T8o;
+					     T8m = T8k + T8l;
+					     T9j = T8l - T8k;
+					     T9y = FMA(KP414213562, T9b, T9c);
+					     T9d = FNMS(KP414213562, T9c, T9b);
+					     T9P = FMA(KP707106781, T9h, T9g);
+					     T9i = FNMS(KP707106781, T9h, T9g);
+					     T9K = T9y + T9z;
+					     T9A = T9y - T9z;
+					     T9U = T9d + T9a;
+					     T9e = T9a - T9d;
+					     T9k = T8h - T8e;
+					     T8i = T8e + T8h;
+					}
+					T9Q = FMA(KP707106781, T9k, T9j);
+					T9l = FNMS(KP707106781, T9k, T9j);
+					T8R = FNMS(KP707106781, T8i, T8b);
+					T8j = FMA(KP707106781, T8i, T8b);
+					T9R = FMA(KP198912367, T9Q, T9P);
+					T9Z = FNMS(KP198912367, T9P, T9Q);
+					T9F = FMA(KP668178637, T9i, T9l);
+					T9m = FNMS(KP668178637, T9l, T9i);
+				   }
+			      }
+			      {
+				   E T8Z, T92, T9D, T9G;
+				   {
+					E T8F, T7P, T8J, T8D, T8S, T8q;
+					T8F = FNMS(KP923879532, T7O, T7z);
+					T7P = FMA(KP923879532, T7O, T7z);
+					T8J = FMA(KP923879532, T8C, T8z);
+					T8D = FNMS(KP923879532, T8C, T8z);
+					T8S = FNMS(KP707106781, T8p, T8m);
+					T8q = FMA(KP707106781, T8p, T8m);
+					{
+					     E T8Y, T8X, T93, T94;
+					     {
+						  E T8N, T91, T8H, T8r, T8U, T8T;
+						  T8Z = FMA(KP923879532, T8M, T8L);
+						  T8N = FNMS(KP923879532, T8M, T8L);
+						  T8T = FMA(KP668178637, T8S, T8R);
+						  T91 = FNMS(KP668178637, T8R, T8S);
+						  T8H = FMA(KP198912367, T8j, T8q);
+						  T8r = FNMS(KP198912367, T8q, T8j);
+						  T8U = T8Q + T8T;
+						  T8Y = T8T - T8Q;
+						  T8X = FMA(KP923879532, T8W, T8V);
+						  T93 = FNMS(KP923879532, T8W, T8V);
+						  {
+						       E T8I, T8K, T8E, T8s;
+						       T8I = T8G - T8H;
+						       T8K = T8G + T8H;
+						       T8E = T8r - T88;
+						       T8s = T88 + T8r;
+						       Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP831469612, T8U, T8N)));
+						       Ip[WS(rs, 13)] = KP500000000 * (FNMS(KP831469612, T8U, T8N));
+						       Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP980785280, T8I, T8F)));
+						       Ip[WS(rs, 9)] = KP500000000 * (FMA(KP980785280, T8I, T8F));
+						       Rp[WS(rs, 1)] = KP500000000 * (FMA(KP980785280, T8K, T8J));
+						       Rm[WS(rs, 14)] = KP500000000 * (FNMS(KP980785280, T8K, T8J));
+						       Rp[WS(rs, 9)] = KP500000000 * (FMA(KP980785280, T8E, T8D));
+						       Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP980785280, T8E, T8D));
+						       Im[WS(rs, 14)] = -(KP500000000 * (FNMS(KP980785280, T8s, T7P)));
+						       Ip[WS(rs, 1)] = KP500000000 * (FMA(KP980785280, T8s, T7P));
+						       T94 = T90 + T91;
+						       T92 = T90 - T91;
+						  }
+					     }
+					     Rm[WS(rs, 2)] = KP500000000 * (FMA(KP831469612, T94, T93));
+					     Rp[WS(rs, 13)] = KP500000000 * (FNMS(KP831469612, T94, T93));
+					     Rp[WS(rs, 5)] = KP500000000 * (FMA(KP831469612, T8Y, T8X));
+					     Rm[WS(rs, 10)] = KP500000000 * (FNMS(KP831469612, T8Y, T8X));
+					}
+				   }
+				   {
+					E T9C, T9B, T9H, T9I, T9f, T9u;
+					T9D = FNMS(KP923879532, T9e, T97);
+					T9f = FMA(KP923879532, T9e, T97);
+					T9u = T9m - T9t;
+					T9C = T9t + T9m;
+					T9B = FNMS(KP923879532, T9A, T9x);
+					T9H = FMA(KP923879532, T9A, T9x);
+					Im[WS(rs, 10)] = -(KP500000000 * (FNMS(KP831469612, T92, T8Z)));
+					Ip[WS(rs, 5)] = KP500000000 * (FMA(KP831469612, T92, T8Z));
+					Im[WS(rs, 12)] = -(KP500000000 * (FNMS(KP831469612, T9u, T9f)));
+					Ip[WS(rs, 3)] = KP500000000 * (FMA(KP831469612, T9u, T9f));
+					T9I = T9E + T9F;
+					T9G = T9E - T9F;
+					Rp[WS(rs, 3)] = KP500000000 * (FMA(KP831469612, T9I, T9H));
+					Rm[WS(rs, 12)] = KP500000000 * (FNMS(KP831469612, T9I, T9H));
+					Rp[WS(rs, 11)] = KP500000000 * (FMA(KP831469612, T9C, T9B));
+					Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP831469612, T9C, T9B));
+				   }
+				   {
+					E T9W, T9V, Ta1, Ta2, T9L, T9S;
+					T9X = FNMS(KP923879532, T9K, T9J);
+					T9L = FMA(KP923879532, T9K, T9J);
+					T9S = T9O - T9R;
+					T9W = T9O + T9R;
+					T9V = FNMS(KP923879532, T9U, T9T);
+					Ta1 = FMA(KP923879532, T9U, T9T);
+					Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP831469612, T9G, T9D)));
+					Ip[WS(rs, 11)] = KP500000000 * (FMA(KP831469612, T9G, T9D));
+					Im[0] = -(KP500000000 * (FNMS(KP980785280, T9S, T9L)));
+					Ip[WS(rs, 15)] = KP500000000 * (FMA(KP980785280, T9S, T9L));
+					Ta2 = T9Y + T9Z;
+					Ta0 = T9Y - T9Z;
+					Rm[0] = KP500000000 * (FMA(KP980785280, Ta2, Ta1));
+					Rp[WS(rs, 15)] = KP500000000 * (FNMS(KP980785280, Ta2, Ta1));
+					Rp[WS(rs, 7)] = KP500000000 * (FMA(KP980785280, T9W, T9V));
+					Rm[WS(rs, 8)] = KP500000000 * (FNMS(KP980785280, T9W, T9V));
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP980785280, Ta0, T9X)));
+	       Ip[WS(rs, 7)] = KP500000000 * (FMA(KP980785280, Ta0, T9X));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cfdft_32", twinstr, &GENUS, {300, 126, 198, 0} };
+
+void X(codelet_hc2cfdft_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hc2cfdft_32 -include hc2cf.h */
+
+/*
+ * This function contains 498 FP additions, 228 FP multiplications,
+ * (or, 404 additions, 134 multiplications, 94 fused multiply/add),
+ * 106 stack variables, 9 constants, and 128 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP277785116, +0.277785116509801112371415406974266437187468595);
+     DK(KP415734806, +0.415734806151272618539394188808952878369280406);
+     DK(KP097545161, +0.097545161008064133924142434238511120463845809);
+     DK(KP490392640, +0.490392640201615224563091118067119518486966865);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP191341716, +0.191341716182544885864229992015199433380672281);
+     DK(KP461939766, +0.461939766255643378064091594698394143411208313);
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E T2S, T5K, T52, T5N, T7p, T8r, T7i, T8o, T2q, T7t, T45, T6L, T2d, T7u, T48;
+	       E T6M, T1A, T4c, T4f, T1T, T3f, T5M, T7e, T7l, T6J, T7x, T4V, T5J, T7b, T7k;
+	       E T6G, T7w, Tj, TC, T5r, T4k, T4n, T5s, T3D, T5C, T6V, T72, T4G, T5F, T6u;
+	       E T86, T6S, T71, T6r, T85, TW, T1f, T5v, T4r, T4u, T5u, T40, T5G, T76, T8k;
+	       E T4N, T5D, T6B, T89, T6Z, T8h, T6y, T88;
+	       {
+		    E T1Y, T22, T2L, T4W, T2p, T43, T2A, T50, T27, T2b, T2Q, T4X, T2h, T2l, T2F;
+		    E T4Z;
+		    {
+			 E T1W, T1X, T2K, T20, T21, T2I, T2H, T2J;
+			 T1W = Ip[WS(rs, 4)];
+			 T1X = Im[WS(rs, 4)];
+			 T2K = T1W + T1X;
+			 T20 = Rp[WS(rs, 4)];
+			 T21 = Rm[WS(rs, 4)];
+			 T2I = T20 - T21;
+			 T1Y = T1W - T1X;
+			 T22 = T20 + T21;
+			 T2H = W[16];
+			 T2J = W[17];
+			 T2L = FMA(T2H, T2I, T2J * T2K);
+			 T4W = FNMS(T2J, T2I, T2H * T2K);
+		    }
+		    {
+			 E T2n, T2o, T2z, T2v, T2w, T2x, T2u, T2y;
+			 T2n = Ip[0];
+			 T2o = Im[0];
+			 T2z = T2n + T2o;
+			 T2v = Rm[0];
+			 T2w = Rp[0];
+			 T2x = T2v - T2w;
+			 T2p = T2n - T2o;
+			 T43 = T2w + T2v;
+			 T2u = W[0];
+			 T2y = W[1];
+			 T2A = FNMS(T2y, T2z, T2u * T2x);
+			 T50 = FMA(T2y, T2x, T2u * T2z);
+		    }
+		    {
+			 E T25, T26, T2P, T29, T2a, T2N, T2M, T2O;
+			 T25 = Ip[WS(rs, 12)];
+			 T26 = Im[WS(rs, 12)];
+			 T2P = T25 + T26;
+			 T29 = Rp[WS(rs, 12)];
+			 T2a = Rm[WS(rs, 12)];
+			 T2N = T29 - T2a;
+			 T27 = T25 - T26;
+			 T2b = T29 + T2a;
+			 T2M = W[48];
+			 T2O = W[49];
+			 T2Q = FMA(T2M, T2N, T2O * T2P);
+			 T4X = FNMS(T2O, T2N, T2M * T2P);
+		    }
+		    {
+			 E T2f, T2g, T2E, T2j, T2k, T2C, T2B, T2D;
+			 T2f = Ip[WS(rs, 8)];
+			 T2g = Im[WS(rs, 8)];
+			 T2E = T2f + T2g;
+			 T2j = Rp[WS(rs, 8)];
+			 T2k = Rm[WS(rs, 8)];
+			 T2C = T2j - T2k;
+			 T2h = T2f - T2g;
+			 T2l = T2j + T2k;
+			 T2B = W[32];
+			 T2D = W[33];
+			 T2F = FMA(T2B, T2C, T2D * T2E);
+			 T4Z = FNMS(T2D, T2C, T2B * T2E);
+		    }
+		    {
+			 E T2G, T2R, T7g, T7h;
+			 T2G = T2A - T2F;
+			 T2R = T2L + T2Q;
+			 T2S = T2G - T2R;
+			 T5K = T2R + T2G;
+			 {
+			      E T4Y, T51, T7n, T7o;
+			      T4Y = T4W + T4X;
+			      T51 = T4Z + T50;
+			      T52 = T4Y + T51;
+			      T5N = T51 - T4Y;
+			      T7n = T2Q - T2L;
+			      T7o = T50 - T4Z;
+			      T7p = T7n + T7o;
+			      T8r = T7o - T7n;
+			 }
+			 T7g = T2F + T2A;
+			 T7h = T4W - T4X;
+			 T7i = T7g - T7h;
+			 T8o = T7h + T7g;
+			 {
+			      E T2m, T44, T2e, T2i;
+			      T2e = W[30];
+			      T2i = W[31];
+			      T2m = FNMS(T2i, T2l, T2e * T2h);
+			      T44 = FMA(T2e, T2l, T2i * T2h);
+			      T2q = T2m + T2p;
+			      T7t = T43 - T44;
+			      T45 = T43 + T44;
+			      T6L = T2p - T2m;
+			 }
+			 {
+			      E T23, T46, T2c, T47;
+			      {
+				   E T1V, T1Z, T24, T28;
+				   T1V = W[14];
+				   T1Z = W[15];
+				   T23 = FNMS(T1Z, T22, T1V * T1Y);
+				   T46 = FMA(T1V, T22, T1Z * T1Y);
+				   T24 = W[46];
+				   T28 = W[47];
+				   T2c = FNMS(T28, T2b, T24 * T27);
+				   T47 = FMA(T24, T2b, T28 * T27);
+			      }
+			      T2d = T23 + T2c;
+			      T7u = T23 - T2c;
+			      T48 = T46 + T47;
+			      T6M = T46 - T47;
+			 }
+		    }
+	       }
+	       {
+		    E T1q, T4a, T2X, T4P, T1S, T4e, T3d, T4T, T1z, T4b, T32, T4Q, T1J, T4d, T38;
+		    E T4S;
+		    {
+			 E T1l, T2W, T1p, T2U;
+			 {
+			      E T1j, T1k, T1n, T1o;
+			      T1j = Ip[WS(rs, 2)];
+			      T1k = Im[WS(rs, 2)];
+			      T1l = T1j - T1k;
+			      T2W = T1j + T1k;
+			      T1n = Rp[WS(rs, 2)];
+			      T1o = Rm[WS(rs, 2)];
+			      T1p = T1n + T1o;
+			      T2U = T1n - T1o;
+			 }
+			 {
+			      E T1i, T1m, T2T, T2V;
+			      T1i = W[6];
+			      T1m = W[7];
+			      T1q = FNMS(T1m, T1p, T1i * T1l);
+			      T4a = FMA(T1i, T1p, T1m * T1l);
+			      T2T = W[8];
+			      T2V = W[9];
+			      T2X = FMA(T2T, T2U, T2V * T2W);
+			      T4P = FNMS(T2V, T2U, T2T * T2W);
+			 }
+		    }
+		    {
+			 E T1N, T3c, T1R, T3a;
+			 {
+			      E T1L, T1M, T1P, T1Q;
+			      T1L = Ip[WS(rs, 6)];
+			      T1M = Im[WS(rs, 6)];
+			      T1N = T1L - T1M;
+			      T3c = T1L + T1M;
+			      T1P = Rp[WS(rs, 6)];
+			      T1Q = Rm[WS(rs, 6)];
+			      T1R = T1P + T1Q;
+			      T3a = T1P - T1Q;
+			 }
+			 {
+			      E T1K, T1O, T39, T3b;
+			      T1K = W[22];
+			      T1O = W[23];
+			      T1S = FNMS(T1O, T1R, T1K * T1N);
+			      T4e = FMA(T1K, T1R, T1O * T1N);
+			      T39 = W[24];
+			      T3b = W[25];
+			      T3d = FMA(T39, T3a, T3b * T3c);
+			      T4T = FNMS(T3b, T3a, T39 * T3c);
+			 }
+		    }
+		    {
+			 E T1u, T31, T1y, T2Z;
+			 {
+			      E T1s, T1t, T1w, T1x;
+			      T1s = Ip[WS(rs, 10)];
+			      T1t = Im[WS(rs, 10)];
+			      T1u = T1s - T1t;
+			      T31 = T1s + T1t;
+			      T1w = Rp[WS(rs, 10)];
+			      T1x = Rm[WS(rs, 10)];
+			      T1y = T1w + T1x;
+			      T2Z = T1w - T1x;
+			 }
+			 {
+			      E T1r, T1v, T2Y, T30;
+			      T1r = W[38];
+			      T1v = W[39];
+			      T1z = FNMS(T1v, T1y, T1r * T1u);
+			      T4b = FMA(T1r, T1y, T1v * T1u);
+			      T2Y = W[40];
+			      T30 = W[41];
+			      T32 = FMA(T2Y, T2Z, T30 * T31);
+			      T4Q = FNMS(T30, T2Z, T2Y * T31);
+			 }
+		    }
+		    {
+			 E T1E, T37, T1I, T35;
+			 {
+			      E T1C, T1D, T1G, T1H;
+			      T1C = Ip[WS(rs, 14)];
+			      T1D = Im[WS(rs, 14)];
+			      T1E = T1C - T1D;
+			      T37 = T1C + T1D;
+			      T1G = Rp[WS(rs, 14)];
+			      T1H = Rm[WS(rs, 14)];
+			      T1I = T1G + T1H;
+			      T35 = T1G - T1H;
+			 }
+			 {
+			      E T1B, T1F, T34, T36;
+			      T1B = W[54];
+			      T1F = W[55];
+			      T1J = FNMS(T1F, T1I, T1B * T1E);
+			      T4d = FMA(T1B, T1I, T1F * T1E);
+			      T34 = W[56];
+			      T36 = W[57];
+			      T38 = FMA(T34, T35, T36 * T37);
+			      T4S = FNMS(T36, T35, T34 * T37);
+			 }
+		    }
+		    {
+			 E T33, T3e, T4R, T4U;
+			 T1A = T1q + T1z;
+			 T4c = T4a + T4b;
+			 T4f = T4d + T4e;
+			 T1T = T1J + T1S;
+			 T33 = T2X + T32;
+			 T3e = T38 + T3d;
+			 T3f = T33 + T3e;
+			 T5M = T3e - T33;
+			 {
+			      E T7c, T7d, T6H, T6I;
+			      T7c = T4S - T4T;
+			      T7d = T3d - T38;
+			      T7e = T7c + T7d;
+			      T7l = T7c - T7d;
+			      T6H = T4d - T4e;
+			      T6I = T1J - T1S;
+			      T6J = T6H + T6I;
+			      T7x = T6H - T6I;
+			 }
+			 T4R = T4P + T4Q;
+			 T4U = T4S + T4T;
+			 T4V = T4R + T4U;
+			 T5J = T4U - T4R;
+			 {
+			      E T79, T7a, T6E, T6F;
+			      T79 = T32 - T2X;
+			      T7a = T4P - T4Q;
+			      T7b = T79 - T7a;
+			      T7k = T7a + T79;
+			      T6E = T1q - T1z;
+			      T6F = T4a - T4b;
+			      T6G = T6E - T6F;
+			      T7w = T6F + T6E;
+			 }
+		    }
+	       }
+	       {
+		    E T9, T4i, T3l, T4A, TB, T4m, T3B, T4E, Ti, T4j, T3q, T4B, Ts, T4l, T3w;
+		    E T4D;
+		    {
+			 E T4, T3k, T8, T3i;
+			 {
+			      E T2, T3, T6, T7;
+			      T2 = Ip[WS(rs, 1)];
+			      T3 = Im[WS(rs, 1)];
+			      T4 = T2 - T3;
+			      T3k = T2 + T3;
+			      T6 = Rp[WS(rs, 1)];
+			      T7 = Rm[WS(rs, 1)];
+			      T8 = T6 + T7;
+			      T3i = T6 - T7;
+			 }
+			 {
+			      E T1, T5, T3h, T3j;
+			      T1 = W[2];
+			      T5 = W[3];
+			      T9 = FNMS(T5, T8, T1 * T4);
+			      T4i = FMA(T1, T8, T5 * T4);
+			      T3h = W[4];
+			      T3j = W[5];
+			      T3l = FMA(T3h, T3i, T3j * T3k);
+			      T4A = FNMS(T3j, T3i, T3h * T3k);
+			 }
+		    }
+		    {
+			 E Tw, T3A, TA, T3y;
+			 {
+			      E Tu, Tv, Ty, Tz;
+			      Tu = Ip[WS(rs, 13)];
+			      Tv = Im[WS(rs, 13)];
+			      Tw = Tu - Tv;
+			      T3A = Tu + Tv;
+			      Ty = Rp[WS(rs, 13)];
+			      Tz = Rm[WS(rs, 13)];
+			      TA = Ty + Tz;
+			      T3y = Ty - Tz;
+			 }
+			 {
+			      E Tt, Tx, T3x, T3z;
+			      Tt = W[50];
+			      Tx = W[51];
+			      TB = FNMS(Tx, TA, Tt * Tw);
+			      T4m = FMA(Tt, TA, Tx * Tw);
+			      T3x = W[52];
+			      T3z = W[53];
+			      T3B = FMA(T3x, T3y, T3z * T3A);
+			      T4E = FNMS(T3z, T3y, T3x * T3A);
+			 }
+		    }
+		    {
+			 E Td, T3p, Th, T3n;
+			 {
+			      E Tb, Tc, Tf, Tg;
+			      Tb = Ip[WS(rs, 9)];
+			      Tc = Im[WS(rs, 9)];
+			      Td = Tb - Tc;
+			      T3p = Tb + Tc;
+			      Tf = Rp[WS(rs, 9)];
+			      Tg = Rm[WS(rs, 9)];
+			      Th = Tf + Tg;
+			      T3n = Tf - Tg;
+			 }
+			 {
+			      E Ta, Te, T3m, T3o;
+			      Ta = W[34];
+			      Te = W[35];
+			      Ti = FNMS(Te, Th, Ta * Td);
+			      T4j = FMA(Ta, Th, Te * Td);
+			      T3m = W[36];
+			      T3o = W[37];
+			      T3q = FMA(T3m, T3n, T3o * T3p);
+			      T4B = FNMS(T3o, T3n, T3m * T3p);
+			 }
+		    }
+		    {
+			 E Tn, T3v, Tr, T3t;
+			 {
+			      E Tl, Tm, Tp, Tq;
+			      Tl = Ip[WS(rs, 5)];
+			      Tm = Im[WS(rs, 5)];
+			      Tn = Tl - Tm;
+			      T3v = Tl + Tm;
+			      Tp = Rp[WS(rs, 5)];
+			      Tq = Rm[WS(rs, 5)];
+			      Tr = Tp + Tq;
+			      T3t = Tp - Tq;
+			 }
+			 {
+			      E Tk, To, T3s, T3u;
+			      Tk = W[18];
+			      To = W[19];
+			      Ts = FNMS(To, Tr, Tk * Tn);
+			      T4l = FMA(Tk, Tr, To * Tn);
+			      T3s = W[20];
+			      T3u = W[21];
+			      T3w = FMA(T3s, T3t, T3u * T3v);
+			      T4D = FNMS(T3u, T3t, T3s * T3v);
+			 }
+		    }
+		    Tj = T9 + Ti;
+		    TC = Ts + TB;
+		    T5r = Tj - TC;
+		    T4k = T4i + T4j;
+		    T4n = T4l + T4m;
+		    T5s = T4k - T4n;
+		    {
+			 E T3r, T3C, T6T, T6U;
+			 T3r = T3l + T3q;
+			 T3C = T3w + T3B;
+			 T3D = T3r + T3C;
+			 T5C = T3C - T3r;
+			 T6T = T4E - T4D;
+			 T6U = T3w - T3B;
+			 T6V = T6T + T6U;
+			 T72 = T6T - T6U;
+		    }
+		    {
+			 E T4C, T4F, T6s, T6t;
+			 T4C = T4A + T4B;
+			 T4F = T4D + T4E;
+			 T4G = T4C + T4F;
+			 T5F = T4F - T4C;
+			 T6s = T4i - T4j;
+			 T6t = Ts - TB;
+			 T6u = T6s + T6t;
+			 T86 = T6s - T6t;
+		    }
+		    {
+			 E T6Q, T6R, T6p, T6q;
+			 T6Q = T3q - T3l;
+			 T6R = T4A - T4B;
+			 T6S = T6Q - T6R;
+			 T71 = T6R + T6Q;
+			 T6p = T9 - Ti;
+			 T6q = T4l - T4m;
+			 T6r = T6p - T6q;
+			 T85 = T6p + T6q;
+		    }
+	       }
+	       {
+		    E TM, T4p, T3I, T4H, T1e, T4t, T3Y, T4L, TV, T4q, T3N, T4I, T15, T4s, T3T;
+		    E T4K;
+		    {
+			 E TH, T3H, TL, T3F;
+			 {
+			      E TF, TG, TJ, TK;
+			      TF = Ip[WS(rs, 15)];
+			      TG = Im[WS(rs, 15)];
+			      TH = TF - TG;
+			      T3H = TF + TG;
+			      TJ = Rp[WS(rs, 15)];
+			      TK = Rm[WS(rs, 15)];
+			      TL = TJ + TK;
+			      T3F = TJ - TK;
+			 }
+			 {
+			      E TE, TI, T3E, T3G;
+			      TE = W[58];
+			      TI = W[59];
+			      TM = FNMS(TI, TL, TE * TH);
+			      T4p = FMA(TE, TL, TI * TH);
+			      T3E = W[60];
+			      T3G = W[61];
+			      T3I = FMA(T3E, T3F, T3G * T3H);
+			      T4H = FNMS(T3G, T3F, T3E * T3H);
+			 }
+		    }
+		    {
+			 E T19, T3X, T1d, T3V;
+			 {
+			      E T17, T18, T1b, T1c;
+			      T17 = Ip[WS(rs, 11)];
+			      T18 = Im[WS(rs, 11)];
+			      T19 = T17 - T18;
+			      T3X = T17 + T18;
+			      T1b = Rp[WS(rs, 11)];
+			      T1c = Rm[WS(rs, 11)];
+			      T1d = T1b + T1c;
+			      T3V = T1b - T1c;
+			 }
+			 {
+			      E T16, T1a, T3U, T3W;
+			      T16 = W[42];
+			      T1a = W[43];
+			      T1e = FNMS(T1a, T1d, T16 * T19);
+			      T4t = FMA(T16, T1d, T1a * T19);
+			      T3U = W[44];
+			      T3W = W[45];
+			      T3Y = FMA(T3U, T3V, T3W * T3X);
+			      T4L = FNMS(T3W, T3V, T3U * T3X);
+			 }
+		    }
+		    {
+			 E TQ, T3M, TU, T3K;
+			 {
+			      E TO, TP, TS, TT;
+			      TO = Ip[WS(rs, 7)];
+			      TP = Im[WS(rs, 7)];
+			      TQ = TO - TP;
+			      T3M = TO + TP;
+			      TS = Rp[WS(rs, 7)];
+			      TT = Rm[WS(rs, 7)];
+			      TU = TS + TT;
+			      T3K = TS - TT;
+			 }
+			 {
+			      E TN, TR, T3J, T3L;
+			      TN = W[26];
+			      TR = W[27];
+			      TV = FNMS(TR, TU, TN * TQ);
+			      T4q = FMA(TN, TU, TR * TQ);
+			      T3J = W[28];
+			      T3L = W[29];
+			      T3N = FMA(T3J, T3K, T3L * T3M);
+			      T4I = FNMS(T3L, T3K, T3J * T3M);
+			 }
+		    }
+		    {
+			 E T10, T3S, T14, T3Q;
+			 {
+			      E TY, TZ, T12, T13;
+			      TY = Ip[WS(rs, 3)];
+			      TZ = Im[WS(rs, 3)];
+			      T10 = TY - TZ;
+			      T3S = TY + TZ;
+			      T12 = Rp[WS(rs, 3)];
+			      T13 = Rm[WS(rs, 3)];
+			      T14 = T12 + T13;
+			      T3Q = T12 - T13;
+			 }
+			 {
+			      E TX, T11, T3P, T3R;
+			      TX = W[10];
+			      T11 = W[11];
+			      T15 = FNMS(T11, T14, TX * T10);
+			      T4s = FMA(TX, T14, T11 * T10);
+			      T3P = W[12];
+			      T3R = W[13];
+			      T3T = FMA(T3P, T3Q, T3R * T3S);
+			      T4K = FNMS(T3R, T3Q, T3P * T3S);
+			 }
+		    }
+		    TW = TM + TV;
+		    T1f = T15 + T1e;
+		    T5v = TW - T1f;
+		    T4r = T4p + T4q;
+		    T4u = T4s + T4t;
+		    T5u = T4r - T4u;
+		    {
+			 E T3O, T3Z, T74, T75;
+			 T3O = T3I + T3N;
+			 T3Z = T3T + T3Y;
+			 T40 = T3O + T3Z;
+			 T5G = T3Z - T3O;
+			 T74 = T4H - T4I;
+			 T75 = T3Y - T3T;
+			 T76 = T74 + T75;
+			 T8k = T74 - T75;
+		    }
+		    {
+			 E T4J, T4M, T6z, T6A;
+			 T4J = T4H + T4I;
+			 T4M = T4K + T4L;
+			 T4N = T4J + T4M;
+			 T5D = T4J - T4M;
+			 T6z = T4p - T4q;
+			 T6A = T15 - T1e;
+			 T6B = T6z + T6A;
+			 T89 = T6z - T6A;
+		    }
+		    {
+			 E T6X, T6Y, T6w, T6x;
+			 T6X = T3N - T3I;
+			 T6Y = T4K - T4L;
+			 T6Z = T6X - T6Y;
+			 T8h = T6X + T6Y;
+			 T6w = TM - TV;
+			 T6x = T4s - T4t;
+			 T6y = T6w - T6x;
+			 T88 = T6w + T6x;
+		    }
+	       }
+	       {
+		    E T1h, T5i, T5c, T5m, T5f, T5n, T2s, T58, T42, T4y, T4w, T57, T54, T56, T4h;
+		    E T5h;
+		    {
+			 E TD, T1g, T5a, T5b;
+			 TD = Tj + TC;
+			 T1g = TW + T1f;
+			 T1h = TD + T1g;
+			 T5i = TD - T1g;
+			 T5a = T4N - T4G;
+			 T5b = T3D - T40;
+			 T5c = T5a + T5b;
+			 T5m = T5a - T5b;
+		    }
+		    {
+			 E T5d, T5e, T1U, T2r;
+			 T5d = T3f + T2S;
+			 T5e = T52 - T4V;
+			 T5f = T5d - T5e;
+			 T5n = T5d + T5e;
+			 T1U = T1A + T1T;
+			 T2r = T2d + T2q;
+			 T2s = T1U + T2r;
+			 T58 = T2r - T1U;
+		    }
+		    {
+			 E T3g, T41, T4o, T4v;
+			 T3g = T2S - T3f;
+			 T41 = T3D + T40;
+			 T42 = T3g - T41;
+			 T4y = T41 + T3g;
+			 T4o = T4k + T4n;
+			 T4v = T4r + T4u;
+			 T4w = T4o + T4v;
+			 T57 = T4v - T4o;
+		    }
+		    {
+			 E T4O, T53, T49, T4g;
+			 T4O = T4G + T4N;
+			 T53 = T4V + T52;
+			 T54 = T4O - T53;
+			 T56 = T4O + T53;
+			 T49 = T45 + T48;
+			 T4g = T4c + T4f;
+			 T4h = T49 + T4g;
+			 T5h = T49 - T4g;
+		    }
+		    {
+			 E T2t, T55, T4x, T4z;
+			 T2t = T1h + T2s;
+			 Ip[0] = KP500000000 * (T2t + T42);
+			 Im[WS(rs, 15)] = KP500000000 * (T42 - T2t);
+			 T55 = T4h + T4w;
+			 Rm[WS(rs, 15)] = KP500000000 * (T55 - T56);
+			 Rp[0] = KP500000000 * (T55 + T56);
+			 T4x = T4h - T4w;
+			 Rm[WS(rs, 7)] = KP500000000 * (T4x - T4y);
+			 Rp[WS(rs, 8)] = KP500000000 * (T4x + T4y);
+			 T4z = T2s - T1h;
+			 Ip[WS(rs, 8)] = KP500000000 * (T4z + T54);
+			 Im[WS(rs, 7)] = KP500000000 * (T54 - T4z);
+		    }
+		    {
+			 E T59, T5g, T5p, T5q;
+			 T59 = KP500000000 * (T57 + T58);
+			 T5g = KP353553390 * (T5c + T5f);
+			 Ip[WS(rs, 4)] = T59 + T5g;
+			 Im[WS(rs, 11)] = T5g - T59;
+			 T5p = KP500000000 * (T5h + T5i);
+			 T5q = KP353553390 * (T5m + T5n);
+			 Rm[WS(rs, 11)] = T5p - T5q;
+			 Rp[WS(rs, 4)] = T5p + T5q;
+		    }
+		    {
+			 E T5j, T5k, T5l, T5o;
+			 T5j = KP500000000 * (T5h - T5i);
+			 T5k = KP353553390 * (T5f - T5c);
+			 Rm[WS(rs, 3)] = T5j - T5k;
+			 Rp[WS(rs, 12)] = T5j + T5k;
+			 T5l = KP500000000 * (T58 - T57);
+			 T5o = KP353553390 * (T5m - T5n);
+			 Ip[WS(rs, 12)] = T5l + T5o;
+			 Im[WS(rs, 3)] = T5o - T5l;
+		    }
+	       }
+	       {
+		    E T5x, T6g, T6a, T6k, T6d, T6l, T5A, T66, T5I, T60, T5T, T6f, T5W, T65, T5P;
+		    E T61;
+		    {
+			 E T5t, T5w, T68, T69;
+			 T5t = T5r - T5s;
+			 T5w = T5u + T5v;
+			 T5x = KP353553390 * (T5t + T5w);
+			 T6g = KP353553390 * (T5t - T5w);
+			 T68 = T5D - T5C;
+			 T69 = T5G - T5F;
+			 T6a = FMA(KP461939766, T68, KP191341716 * T69);
+			 T6k = FNMS(KP461939766, T69, KP191341716 * T68);
+		    }
+		    {
+			 E T6b, T6c, T5y, T5z;
+			 T6b = T5K - T5J;
+			 T6c = T5N - T5M;
+			 T6d = FNMS(KP461939766, T6c, KP191341716 * T6b);
+			 T6l = FMA(KP461939766, T6b, KP191341716 * T6c);
+			 T5y = T4f - T4c;
+			 T5z = T2q - T2d;
+			 T5A = KP500000000 * (T5y + T5z);
+			 T66 = KP500000000 * (T5z - T5y);
+		    }
+		    {
+			 E T5E, T5H, T5R, T5S;
+			 T5E = T5C + T5D;
+			 T5H = T5F + T5G;
+			 T5I = FMA(KP191341716, T5E, KP461939766 * T5H);
+			 T60 = FNMS(KP191341716, T5H, KP461939766 * T5E);
+			 T5R = T45 - T48;
+			 T5S = T1A - T1T;
+			 T5T = KP500000000 * (T5R + T5S);
+			 T6f = KP500000000 * (T5R - T5S);
+		    }
+		    {
+			 E T5U, T5V, T5L, T5O;
+			 T5U = T5s + T5r;
+			 T5V = T5u - T5v;
+			 T5W = KP353553390 * (T5U + T5V);
+			 T65 = KP353553390 * (T5V - T5U);
+			 T5L = T5J + T5K;
+			 T5O = T5M + T5N;
+			 T5P = FNMS(KP191341716, T5O, KP461939766 * T5L);
+			 T61 = FMA(KP191341716, T5L, KP461939766 * T5O);
+		    }
+		    {
+			 E T5B, T5Q, T63, T64;
+			 T5B = T5x + T5A;
+			 T5Q = T5I + T5P;
+			 Ip[WS(rs, 2)] = T5B + T5Q;
+			 Im[WS(rs, 13)] = T5Q - T5B;
+			 T63 = T5T + T5W;
+			 T64 = T60 + T61;
+			 Rm[WS(rs, 13)] = T63 - T64;
+			 Rp[WS(rs, 2)] = T63 + T64;
+		    }
+		    {
+			 E T5X, T5Y, T5Z, T62;
+			 T5X = T5T - T5W;
+			 T5Y = T5P - T5I;
+			 Rm[WS(rs, 5)] = T5X - T5Y;
+			 Rp[WS(rs, 10)] = T5X + T5Y;
+			 T5Z = T5A - T5x;
+			 T62 = T60 - T61;
+			 Ip[WS(rs, 10)] = T5Z + T62;
+			 Im[WS(rs, 5)] = T62 - T5Z;
+		    }
+		    {
+			 E T67, T6e, T6n, T6o;
+			 T67 = T65 + T66;
+			 T6e = T6a + T6d;
+			 Ip[WS(rs, 6)] = T67 + T6e;
+			 Im[WS(rs, 9)] = T6e - T67;
+			 T6n = T6f + T6g;
+			 T6o = T6k + T6l;
+			 Rm[WS(rs, 9)] = T6n - T6o;
+			 Rp[WS(rs, 6)] = T6n + T6o;
+		    }
+		    {
+			 E T6h, T6i, T6j, T6m;
+			 T6h = T6f - T6g;
+			 T6i = T6d - T6a;
+			 Rm[WS(rs, 1)] = T6h - T6i;
+			 Rp[WS(rs, 14)] = T6h + T6i;
+			 T6j = T66 - T65;
+			 T6m = T6k - T6l;
+			 Ip[WS(rs, 14)] = T6j + T6m;
+			 Im[WS(rs, 1)] = T6m - T6j;
+		    }
+	       }
+	       {
+		    E T6D, T7W, T6O, T7M, T7C, T7L, T7z, T7V, T7r, T81, T7H, T7T, T78, T80, T7G;
+		    E T7Q;
+		    {
+			 E T6v, T6C, T7v, T7y;
+			 T6v = FNMS(KP191341716, T6u, KP461939766 * T6r);
+			 T6C = FMA(KP461939766, T6y, KP191341716 * T6B);
+			 T6D = T6v + T6C;
+			 T7W = T6v - T6C;
+			 {
+			      E T6K, T6N, T7A, T7B;
+			      T6K = KP353553390 * (T6G + T6J);
+			      T6N = KP500000000 * (T6L - T6M);
+			      T6O = T6K + T6N;
+			      T7M = T6N - T6K;
+			      T7A = FMA(KP191341716, T6r, KP461939766 * T6u);
+			      T7B = FNMS(KP191341716, T6y, KP461939766 * T6B);
+			      T7C = T7A + T7B;
+			      T7L = T7B - T7A;
+			 }
+			 T7v = KP500000000 * (T7t + T7u);
+			 T7y = KP353553390 * (T7w + T7x);
+			 T7z = T7v + T7y;
+			 T7V = T7v - T7y;
+			 {
+			      E T7j, T7R, T7q, T7S, T7f, T7m;
+			      T7f = KP707106781 * (T7b + T7e);
+			      T7j = T7f + T7i;
+			      T7R = T7i - T7f;
+			      T7m = KP707106781 * (T7k + T7l);
+			      T7q = T7m + T7p;
+			      T7S = T7p - T7m;
+			      T7r = FNMS(KP097545161, T7q, KP490392640 * T7j);
+			      T81 = FMA(KP415734806, T7R, KP277785116 * T7S);
+			      T7H = FMA(KP097545161, T7j, KP490392640 * T7q);
+			      T7T = FNMS(KP415734806, T7S, KP277785116 * T7R);
+			 }
+			 {
+			      E T70, T7O, T77, T7P, T6W, T73;
+			      T6W = KP707106781 * (T6S + T6V);
+			      T70 = T6W + T6Z;
+			      T7O = T6Z - T6W;
+			      T73 = KP707106781 * (T71 + T72);
+			      T77 = T73 + T76;
+			      T7P = T76 - T73;
+			      T78 = FMA(KP490392640, T70, KP097545161 * T77);
+			      T80 = FNMS(KP415734806, T7O, KP277785116 * T7P);
+			      T7G = FNMS(KP097545161, T70, KP490392640 * T77);
+			      T7Q = FMA(KP277785116, T7O, KP415734806 * T7P);
+			 }
+		    }
+		    {
+			 E T6P, T7s, T7J, T7K;
+			 T6P = T6D + T6O;
+			 T7s = T78 + T7r;
+			 Ip[WS(rs, 1)] = T6P + T7s;
+			 Im[WS(rs, 14)] = T7s - T6P;
+			 T7J = T7z + T7C;
+			 T7K = T7G + T7H;
+			 Rm[WS(rs, 14)] = T7J - T7K;
+			 Rp[WS(rs, 1)] = T7J + T7K;
+		    }
+		    {
+			 E T7D, T7E, T7F, T7I;
+			 T7D = T7z - T7C;
+			 T7E = T7r - T78;
+			 Rm[WS(rs, 6)] = T7D - T7E;
+			 Rp[WS(rs, 9)] = T7D + T7E;
+			 T7F = T6O - T6D;
+			 T7I = T7G - T7H;
+			 Ip[WS(rs, 9)] = T7F + T7I;
+			 Im[WS(rs, 6)] = T7I - T7F;
+		    }
+		    {
+			 E T7N, T7U, T83, T84;
+			 T7N = T7L + T7M;
+			 T7U = T7Q + T7T;
+			 Ip[WS(rs, 5)] = T7N + T7U;
+			 Im[WS(rs, 10)] = T7U - T7N;
+			 T83 = T7V + T7W;
+			 T84 = T80 + T81;
+			 Rm[WS(rs, 10)] = T83 - T84;
+			 Rp[WS(rs, 5)] = T83 + T84;
+		    }
+		    {
+			 E T7X, T7Y, T7Z, T82;
+			 T7X = T7V - T7W;
+			 T7Y = T7T - T7Q;
+			 Rm[WS(rs, 2)] = T7X - T7Y;
+			 Rp[WS(rs, 13)] = T7X + T7Y;
+			 T7Z = T7M - T7L;
+			 T82 = T80 - T81;
+			 Ip[WS(rs, 13)] = T7Z + T82;
+			 Im[WS(rs, 2)] = T82 - T7Z;
+		    }
+	       }
+	       {
+		    E T8b, T8U, T8e, T8K, T8A, T8J, T8x, T8T, T8t, T8Z, T8F, T8R, T8m, T8Y, T8E;
+		    E T8O;
+		    {
+			 E T87, T8a, T8v, T8w;
+			 T87 = FNMS(KP461939766, T86, KP191341716 * T85);
+			 T8a = FMA(KP191341716, T88, KP461939766 * T89);
+			 T8b = T87 + T8a;
+			 T8U = T87 - T8a;
+			 {
+			      E T8c, T8d, T8y, T8z;
+			      T8c = KP353553390 * (T7x - T7w);
+			      T8d = KP500000000 * (T6M + T6L);
+			      T8e = T8c + T8d;
+			      T8K = T8d - T8c;
+			      T8y = FMA(KP461939766, T85, KP191341716 * T86);
+			      T8z = FNMS(KP461939766, T88, KP191341716 * T89);
+			      T8A = T8y + T8z;
+			      T8J = T8z - T8y;
+			 }
+			 T8v = KP500000000 * (T7t - T7u);
+			 T8w = KP353553390 * (T6G - T6J);
+			 T8x = T8v + T8w;
+			 T8T = T8v - T8w;
+			 {
+			      E T8p, T8P, T8s, T8Q, T8n, T8q;
+			      T8n = KP707106781 * (T7l - T7k);
+			      T8p = T8n + T8o;
+			      T8P = T8o - T8n;
+			      T8q = KP707106781 * (T7b - T7e);
+			      T8s = T8q + T8r;
+			      T8Q = T8r - T8q;
+			      T8t = FNMS(KP277785116, T8s, KP415734806 * T8p);
+			      T8Z = FMA(KP490392640, T8P, KP097545161 * T8Q);
+			      T8F = FMA(KP277785116, T8p, KP415734806 * T8s);
+			      T8R = FNMS(KP490392640, T8Q, KP097545161 * T8P);
+			 }
+			 {
+			      E T8i, T8M, T8l, T8N, T8g, T8j;
+			      T8g = KP707106781 * (T72 - T71);
+			      T8i = T8g + T8h;
+			      T8M = T8h - T8g;
+			      T8j = KP707106781 * (T6S - T6V);
+			      T8l = T8j + T8k;
+			      T8N = T8k - T8j;
+			      T8m = FMA(KP415734806, T8i, KP277785116 * T8l);
+			      T8Y = FNMS(KP490392640, T8M, KP097545161 * T8N);
+			      T8E = FNMS(KP277785116, T8i, KP415734806 * T8l);
+			      T8O = FMA(KP097545161, T8M, KP490392640 * T8N);
+			 }
+		    }
+		    {
+			 E T8f, T8u, T8H, T8I;
+			 T8f = T8b + T8e;
+			 T8u = T8m + T8t;
+			 Ip[WS(rs, 3)] = T8f + T8u;
+			 Im[WS(rs, 12)] = T8u - T8f;
+			 T8H = T8x + T8A;
+			 T8I = T8E + T8F;
+			 Rm[WS(rs, 12)] = T8H - T8I;
+			 Rp[WS(rs, 3)] = T8H + T8I;
+		    }
+		    {
+			 E T8B, T8C, T8D, T8G;
+			 T8B = T8x - T8A;
+			 T8C = T8t - T8m;
+			 Rm[WS(rs, 4)] = T8B - T8C;
+			 Rp[WS(rs, 11)] = T8B + T8C;
+			 T8D = T8e - T8b;
+			 T8G = T8E - T8F;
+			 Ip[WS(rs, 11)] = T8D + T8G;
+			 Im[WS(rs, 4)] = T8G - T8D;
+		    }
+		    {
+			 E T8L, T8S, T91, T92;
+			 T8L = T8J + T8K;
+			 T8S = T8O + T8R;
+			 Ip[WS(rs, 7)] = T8L + T8S;
+			 Im[WS(rs, 8)] = T8S - T8L;
+			 T91 = T8T + T8U;
+			 T92 = T8Y + T8Z;
+			 Rm[WS(rs, 8)] = T91 - T92;
+			 Rp[WS(rs, 7)] = T91 + T92;
+		    }
+		    {
+			 E T8V, T8W, T8X, T90;
+			 T8V = T8T - T8U;
+			 T8W = T8R - T8O;
+			 Rm[0] = T8V - T8W;
+			 Rp[WS(rs, 15)] = T8V + T8W;
+			 T8X = T8K - T8J;
+			 T90 = T8Y - T8Z;
+			 Ip[WS(rs, 15)] = T8X + T90;
+			 Im[0] = T90 - T8X;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 32, "hc2cfdft_32", twinstr, &GENUS, {404, 134, 94, 0} };
+
+void X(codelet_hc2cfdft_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include hc2cf.h */
+
+/*
+ * This function contains 30 FP additions, 20 FP multiplications,
+ * (or, 24 additions, 14 multiplications, 6 fused multiply/add),
+ * 32 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Td, Tu, Tr, T4, Tm, To, T9, T5, TA, Tp, Tv, TD, T6, Tq;
+	       {
+		    E Tk, Tl, Tf, TC, Tj, T7, T8, T1, Tn, Tb, Tc;
+		    Tb = Ip[0];
+		    Tc = Im[0];
+		    {
+			 E Ti, Tg, Th, T2, T3;
+			 Tg = Rm[0];
+			 Th = Rp[0];
+			 Tk = W[1];
+			 Tl = Tb + Tc;
+			 Td = Tb - Tc;
+			 Tu = Th + Tg;
+			 Ti = Tg - Th;
+			 Tf = W[0];
+			 T2 = Ip[WS(rs, 1)];
+			 T3 = Im[WS(rs, 1)];
+			 TC = Tk * Ti;
+			 Tj = Tf * Ti;
+			 T7 = Rp[WS(rs, 1)];
+			 Tr = T2 + T3;
+			 T4 = T2 - T3;
+			 T8 = Rm[WS(rs, 1)];
+			 T1 = W[2];
+			 Tn = W[4];
+		    }
+		    Tm = FNMS(Tk, Tl, Tj);
+		    To = T7 - T8;
+		    T9 = T7 + T8;
+		    T5 = T1 * T4;
+		    TA = Tn * Tr;
+		    Tp = Tn * To;
+		    Tv = T1 * T9;
+		    TD = FMA(Tf, Tl, TC);
+		    T6 = W[3];
+		    Tq = W[5];
+	       }
+	       {
+		    E Tw, Ta, TB, Ts;
+		    Tw = FMA(T6, T4, Tv);
+		    Ta = FNMS(T6, T9, T5);
+		    TB = FNMS(Tq, To, TA);
+		    Ts = FMA(Tq, Tr, Tp);
+		    {
+			 E TF, Tx, Te, Tz;
+			 TF = Tu + Tw;
+			 Tx = Tu - Tw;
+			 Te = Ta + Td;
+			 Tz = Td - Ta;
+			 {
+			      E TG, TE, Tt, Ty;
+			      TG = TB + TD;
+			      TE = TB - TD;
+			      Tt = Tm - Ts;
+			      Ty = Ts + Tm;
+			      Im[0] = KP500000000 * (TE - Tz);
+			      Ip[WS(rs, 1)] = KP500000000 * (Tz + TE);
+			      Rp[0] = KP500000000 * (TF + TG);
+			      Rm[WS(rs, 1)] = KP500000000 * (TF - TG);
+			      Rp[WS(rs, 1)] = KP500000000 * (Tx + Ty);
+			      Rm[0] = KP500000000 * (Tx - Ty);
+			      Im[WS(rs, 1)] = KP500000000 * (Tt - Te);
+			      Ip[0] = KP500000000 * (Te + Tt);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, {24, 14, 6, 0} };
+
+void X(codelet_hc2cfdft_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include hc2cf.h */
+
+/*
+ * This function contains 30 FP additions, 20 FP multiplications,
+ * (or, 24 additions, 14 multiplications, 6 fused multiply/add),
+ * 18 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E Tc, Tr, Tk, Tx, T9, Ts, Tp, Tw;
+	       {
+		    E Ta, Tb, Tj, Tf, Tg, Th, Te, Ti;
+		    Ta = Ip[0];
+		    Tb = Im[0];
+		    Tj = Ta + Tb;
+		    Tf = Rm[0];
+		    Tg = Rp[0];
+		    Th = Tf - Tg;
+		    Tc = Ta - Tb;
+		    Tr = Tg + Tf;
+		    Te = W[0];
+		    Ti = W[1];
+		    Tk = FNMS(Ti, Tj, Te * Th);
+		    Tx = FMA(Ti, Th, Te * Tj);
+	       }
+	       {
+		    E T4, To, T8, Tm;
+		    {
+			 E T2, T3, T6, T7;
+			 T2 = Ip[WS(rs, 1)];
+			 T3 = Im[WS(rs, 1)];
+			 T4 = T2 - T3;
+			 To = T2 + T3;
+			 T6 = Rp[WS(rs, 1)];
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = T6 + T7;
+			 Tm = T6 - T7;
+		    }
+		    {
+			 E T1, T5, Tl, Tn;
+			 T1 = W[2];
+			 T5 = W[3];
+			 T9 = FNMS(T5, T8, T1 * T4);
+			 Ts = FMA(T1, T8, T5 * T4);
+			 Tl = W[4];
+			 Tn = W[5];
+			 Tp = FMA(Tl, Tm, Tn * To);
+			 Tw = FNMS(Tn, Tm, Tl * To);
+		    }
+	       }
+	       {
+		    E Td, Tq, Tz, TA;
+		    Td = T9 + Tc;
+		    Tq = Tk - Tp;
+		    Ip[0] = KP500000000 * (Td + Tq);
+		    Im[WS(rs, 1)] = KP500000000 * (Tq - Td);
+		    Tz = Tr + Ts;
+		    TA = Tw + Tx;
+		    Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
+		    Rp[0] = KP500000000 * (Tz + TA);
+	       }
+	       {
+		    E Tt, Tu, Tv, Ty;
+		    Tt = Tr - Ts;
+		    Tu = Tp + Tk;
+		    Rm[0] = KP500000000 * (Tt - Tu);
+		    Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
+		    Tv = Tc - T9;
+		    Ty = Tw - Tx;
+		    Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
+		    Im[0] = KP500000000 * (Ty - Tv);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, {24, 14, 6, 0} };
+
+void X(codelet_hc2cfdft_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include hc2cf.h */
+
+/*
+ * This function contains 58 FP additions, 44 FP multiplications,
+ * (or, 36 additions, 22 multiplications, 22 fused multiply/add),
+ * 42 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E TP, TT, TN, TM, TY, T13;
+	       {
+		    E T3, TQ, TJ, T12, Tu, TB, TX, T10, Tj, Tf, Ti, Td, Th, TU, TS;
+		    {
+			 E TC, TI, TF, TH, TA, Tw, TZ;
+			 {
+			      E T1, T2, TD, TE;
+			      T1 = Ip[0];
+			      T2 = Im[0];
+			      TD = Rm[0];
+			      TE = Rp[0];
+			      TC = W[0];
+			      T3 = T1 - T2;
+			      TI = T1 + T2;
+			      TQ = TE + TD;
+			      TF = TD - TE;
+			      TH = W[1];
+			 }
+			 {
+			      E Tr, To, Ts, Tl, Tq;
+			      {
+				   E Tm, Tn, TG, T11;
+				   Tm = Rm[WS(rs, 2)];
+				   Tn = Rp[WS(rs, 2)];
+				   TG = TC * TF;
+				   T11 = TH * TF;
+				   Tr = Ip[WS(rs, 2)];
+				   TA = Tn + Tm;
+				   To = Tm - Tn;
+				   TJ = FNMS(TH, TI, TG);
+				   T12 = FMA(TC, TI, T11);
+				   Ts = Im[WS(rs, 2)];
+			      }
+			      Tl = W[8];
+			      Tq = W[9];
+			      {
+				   E Tz, Ty, TW, Tx, Tt, Tp;
+				   Tw = W[6];
+				   Tx = Tr - Ts;
+				   Tt = Tr + Ts;
+				   Tp = Tl * To;
+				   Tz = W[7];
+				   Ty = Tw * Tx;
+				   TW = Tl * Tt;
+				   Tu = FNMS(Tq, Tt, Tp);
+				   TZ = Tz * Tx;
+				   TB = FNMS(Tz, TA, Ty);
+				   TX = FMA(Tq, To, TW);
+			      }
+			 }
+			 {
+			      E T5, T6, Ta, Tb;
+			      T5 = Ip[WS(rs, 1)];
+			      T10 = FMA(Tw, TA, TZ);
+			      T6 = Im[WS(rs, 1)];
+			      Ta = Rp[WS(rs, 1)];
+			      Tb = Rm[WS(rs, 1)];
+			      {
+				   E T4, Tg, T7, Tc, T9, T8, TR;
+				   T4 = W[5];
+				   Tg = T5 - T6;
+				   T7 = T5 + T6;
+				   Tj = Ta + Tb;
+				   Tc = Ta - Tb;
+				   T9 = W[4];
+				   T8 = T4 * T7;
+				   Tf = W[2];
+				   Ti = W[3];
+				   TR = T9 * T7;
+				   Td = FMA(T9, Tc, T8);
+				   Th = Tf * Tg;
+				   TU = Ti * Tg;
+				   TS = FNMS(T4, Tc, TR);
+			      }
+			 }
+		    }
+		    {
+			 E Te, T1d, TK, Tv, T1a, T1b, Tk, TV;
+			 TP = Td + T3;
+			 Te = T3 - Td;
+			 Tk = FNMS(Ti, Tj, Th);
+			 TV = FMA(Tf, Tj, TU);
+			 T1d = TQ + TS;
+			 TT = TQ - TS;
+			 TN = TJ - TB;
+			 TK = TB + TJ;
+			 Tv = Tk + Tu;
+			 TM = Tu - Tk;
+			 TY = TV - TX;
+			 T1a = TV + TX;
+			 T1b = T10 + T12;
+			 T13 = T10 - T12;
+			 {
+			      E T1g, TL, T1e, T1c, T19, T1f;
+			      T1g = Tv - TK;
+			      TL = Tv + TK;
+			      T1e = T1a + T1b;
+			      T1c = T1a - T1b;
+			      T19 = FNMS(KP500000000, TL, Te);
+			      Ip[0] = KP500000000 * (Te + TL);
+			      T1f = FNMS(KP500000000, T1e, T1d);
+			      Rp[0] = KP500000000 * (T1d + T1e);
+			      Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP866025403, T1c, T19)));
+			      Ip[WS(rs, 2)] = KP500000000 * (FMA(KP866025403, T1c, T19));
+			      Rm[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T1g, T1f));
+			      Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP866025403, T1g, T1f));
+			 }
+		    }
+	       }
+	       {
+		    E TO, T16, T14, T18, T17, T15;
+		    TO = TM + TN;
+		    T16 = TN - TM;
+		    T14 = TY + T13;
+		    T18 = T13 - TY;
+		    T17 = FMA(KP500000000, TO, TP);
+		    Im[WS(rs, 2)] = KP500000000 * (TO - TP);
+		    T15 = FNMS(KP500000000, T14, TT);
+		    Rm[WS(rs, 2)] = KP500000000 * (TT + T14);
+		    Im[0] = -(KP500000000 * (FNMS(KP866025403, T18, T17)));
+		    Ip[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T18, T17));
+		    Rm[0] = KP500000000 * (FNMS(KP866025403, T16, T15));
+		    Rp[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T16, T15));
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, {36, 22, 22, 0} };
+
+void X(codelet_hc2cfdft_6) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include hc2cf.h */
+
+/*
+ * This function contains 58 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 22 multiplications, 14 fused multiply/add),
+ * 40 stack variables, 3 constants, and 24 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP433012701, +0.433012701892219323381861585376468091735701313);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T3, TM, Tc, TN, Ts, T10, TI, TR, TF, T11, TH, TU;
+	       {
+		    E T1, T2, TD, Tz, TA, TB, T7, Tf, Tb, Th, Tq, Tw, Tm, Tu, T4;
+		    E T8;
+		    {
+			 E T5, T6, T9, Ta;
+			 T1 = Ip[0];
+			 T2 = Im[0];
+			 TD = T1 + T2;
+			 Tz = Rm[0];
+			 TA = Rp[0];
+			 TB = Tz - TA;
+			 T5 = Ip[WS(rs, 1)];
+			 T6 = Im[WS(rs, 1)];
+			 T7 = T5 + T6;
+			 Tf = T5 - T6;
+			 T9 = Rp[WS(rs, 1)];
+			 Ta = Rm[WS(rs, 1)];
+			 Tb = T9 - Ta;
+			 Th = T9 + Ta;
+			 {
+			      E To, Tp, Tk, Tl;
+			      To = Rp[WS(rs, 2)];
+			      Tp = Rm[WS(rs, 2)];
+			      Tq = To - Tp;
+			      Tw = To + Tp;
+			      Tk = Ip[WS(rs, 2)];
+			      Tl = Im[WS(rs, 2)];
+			      Tm = Tk + Tl;
+			      Tu = Tk - Tl;
+			 }
+		    }
+		    T3 = T1 - T2;
+		    TM = TA + Tz;
+		    T4 = W[5];
+		    T8 = W[4];
+		    Tc = FMA(T4, T7, T8 * Tb);
+		    TN = FNMS(T4, Tb, T8 * T7);
+		    {
+			 E Ti, TP, Tr, TQ;
+			 {
+			      E Te, Tg, Tj, Tn;
+			      Te = W[2];
+			      Tg = W[3];
+			      Ti = FNMS(Tg, Th, Te * Tf);
+			      TP = FMA(Tg, Tf, Te * Th);
+			      Tj = W[9];
+			      Tn = W[8];
+			      Tr = FMA(Tj, Tm, Tn * Tq);
+			      TQ = FNMS(Tj, Tq, Tn * Tm);
+			 }
+			 Ts = Ti - Tr;
+			 T10 = TP + TQ;
+			 TI = Ti + Tr;
+			 TR = TP - TQ;
+		    }
+		    {
+			 E Tx, TS, TE, TT;
+			 {
+			      E Tt, Tv, Ty, TC;
+			      Tt = W[6];
+			      Tv = W[7];
+			      Tx = FNMS(Tv, Tw, Tt * Tu);
+			      TS = FMA(Tv, Tu, Tt * Tw);
+			      Ty = W[0];
+			      TC = W[1];
+			      TE = FNMS(TC, TD, Ty * TB);
+			      TT = FMA(TC, TB, Ty * TD);
+			 }
+			 TF = Tx + TE;
+			 T11 = TS + TT;
+			 TH = TE - Tx;
+			 TU = TS - TT;
+		    }
+	       }
+	       {
+		    E T12, Td, TG, TZ;
+		    T12 = KP433012701 * (T10 - T11);
+		    Td = T3 - Tc;
+		    TG = Ts + TF;
+		    TZ = FNMS(KP250000000, TG, KP500000000 * Td);
+		    Ip[0] = KP500000000 * (Td + TG);
+		    Im[WS(rs, 1)] = T12 - TZ;
+		    Ip[WS(rs, 2)] = TZ + T12;
+	       }
+	       {
+		    E T16, T13, T14, T15;
+		    T16 = KP433012701 * (Ts - TF);
+		    T13 = TM + TN;
+		    T14 = T10 + T11;
+		    T15 = FNMS(KP250000000, T14, KP500000000 * T13);
+		    Rp[WS(rs, 2)] = T15 - T16;
+		    Rp[0] = KP500000000 * (T13 + T14);
+		    Rm[WS(rs, 1)] = T16 + T15;
+	       }
+	       {
+		    E TY, TJ, TK, TX;
+		    TY = KP433012701 * (TU - TR);
+		    TJ = TH - TI;
+		    TK = Tc + T3;
+		    TX = FMA(KP500000000, TK, KP250000000 * TJ);
+		    Im[WS(rs, 2)] = KP500000000 * (TJ - TK);
+		    Im[0] = TY - TX;
+		    Ip[WS(rs, 1)] = TX + TY;
+	       }
+	       {
+		    E TL, TO, TV, TW;
+		    TL = KP433012701 * (TI + TH);
+		    TO = TM - TN;
+		    TV = TR + TU;
+		    TW = FNMS(KP250000000, TV, KP500000000 * TO);
+		    Rp[WS(rs, 1)] = TL + TW;
+		    Rm[WS(rs, 2)] = KP500000000 * (TO + TV);
+		    Rm[0] = TW - TL;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, {44, 22, 14, 0} };
+
+void X(codelet_hc2cfdft_6) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hc2cfdft_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:44 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include hc2cf.h */
+
+/*
+ * This function contains 82 FP additions, 52 FP multiplications,
+ * (or, 60 additions, 30 multiplications, 22 fused multiply/add),
+ * 55 stack variables, 2 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T1A, T1w, T1z, T1x, T1H, T1v, T1L, T1F;
+	       {
+		    E Ty, T14, TO, T1o, Tv, TG, T16, T1m, Ta, T19, T1h, TV, T10, TX, TZ;
+		    E Tk, T1i, TY, T1b, TF, TB, T1l;
+		    {
+			 E TH, TN, TK, TM;
+			 {
+			      E Tw, Tx, TI, TJ;
+			      Tw = Ip[0];
+			      Tx = Im[0];
+			      TI = Rm[0];
+			      TJ = Rp[0];
+			      TH = W[0];
+			      Ty = Tw - Tx;
+			      TN = Tw + Tx;
+			      T14 = TJ + TI;
+			      TK = TI - TJ;
+			      TM = W[1];
+			 }
+			 {
+			      E Ts, Tp, Tt, Tm, Tr;
+			      {
+				   E Tn, To, TL, T1n;
+				   Tn = Ip[WS(rs, 2)];
+				   To = Im[WS(rs, 2)];
+				   TL = TH * TK;
+				   T1n = TM * TK;
+				   Ts = Rp[WS(rs, 2)];
+				   TF = Tn + To;
+				   Tp = Tn - To;
+				   TO = FNMS(TM, TN, TL);
+				   T1o = FMA(TH, TN, T1n);
+				   Tt = Rm[WS(rs, 2)];
+			      }
+			      Tm = W[6];
+			      Tr = W[7];
+			      {
+				   E TE, TD, T15, TC, Tu, Tq;
+				   TB = W[8];
+				   TC = Tt - Ts;
+				   Tu = Ts + Tt;
+				   Tq = Tm * Tp;
+				   TE = W[9];
+				   TD = TB * TC;
+				   T15 = Tm * Tu;
+				   Tv = FNMS(Tr, Tu, Tq);
+				   T1l = TE * TC;
+				   TG = FNMS(TE, TF, TD);
+				   T16 = FMA(Tr, Tp, T15);
+			      }
+			 }
+		    }
+		    {
+			 E TU, TR, TT, T1g, TS;
+			 {
+			      E T2, T3, T7, T8;
+			      T2 = Ip[WS(rs, 1)];
+			      T1m = FMA(TB, TF, T1l);
+			      T3 = Im[WS(rs, 1)];
+			      T7 = Rp[WS(rs, 1)];
+			      T8 = Rm[WS(rs, 1)];
+			      {
+				   E T1, T4, T9, T6, T5, TQ, T18;
+				   T1 = W[2];
+				   TU = T2 + T3;
+				   T4 = T2 - T3;
+				   TR = T7 - T8;
+				   T9 = T7 + T8;
+				   T6 = W[3];
+				   T5 = T1 * T4;
+				   TQ = W[4];
+				   T18 = T1 * T9;
+				   TT = W[5];
+				   Ta = FNMS(T6, T9, T5);
+				   T1g = TQ * TU;
+				   TS = TQ * TR;
+				   T19 = FMA(T6, T4, T18);
+			      }
+			 }
+			 {
+			      E Tc, Td, Th, Ti;
+			      Tc = Ip[WS(rs, 3)];
+			      T1h = FNMS(TT, TR, T1g);
+			      TV = FMA(TT, TU, TS);
+			      Td = Im[WS(rs, 3)];
+			      Th = Rp[WS(rs, 3)];
+			      Ti = Rm[WS(rs, 3)];
+			      {
+				   E Tb, Te, Tj, Tg, Tf, TW, T1a;
+				   Tb = W[10];
+				   T10 = Tc + Td;
+				   Te = Tc - Td;
+				   TX = Th - Ti;
+				   Tj = Th + Ti;
+				   Tg = W[11];
+				   Tf = Tb * Te;
+				   TW = W[12];
+				   T1a = Tb * Tj;
+				   TZ = W[13];
+				   Tk = FNMS(Tg, Tj, Tf);
+				   T1i = TW * T10;
+				   TY = TW * TX;
+				   T1b = FMA(Tg, Te, T1a);
+			      }
+			 }
+		    }
+		    {
+			 E T1E, T1t, TA, T1s, T1D, T1u, T1e, T13, T1r, T1d;
+			 {
+			      E TP, T1f, T1q, T12, T17, T1c;
+			      {
+				   E Tl, T11, Tz, T1p, T1k, T1j;
+				   T1E = Ta - Tk;
+				   Tl = Ta + Tk;
+				   T1j = FNMS(TZ, TX, T1i);
+				   T11 = FMA(TZ, T10, TY);
+				   Tz = Tv + Ty;
+				   T1t = Ty - Tv;
+				   T1A = T1o - T1m;
+				   T1p = T1m + T1o;
+				   T1k = T1h + T1j;
+				   T1w = T1j - T1h;
+				   T1z = TO - TG;
+				   TP = TG + TO;
+				   T1f = Tz - Tl;
+				   TA = Tl + Tz;
+				   T1s = T1k + T1p;
+				   T1q = T1k - T1p;
+				   T12 = TV + T11;
+				   T1x = TV - T11;
+				   T1D = T14 - T16;
+				   T17 = T14 + T16;
+				   T1c = T19 + T1b;
+				   T1u = T19 - T1b;
+			      }
+			      Im[WS(rs, 1)] = KP500000000 * (T1q - T1f);
+			      T1e = T12 + TP;
+			      T13 = TP - T12;
+			      T1r = T17 + T1c;
+			      T1d = T17 - T1c;
+			      Ip[WS(rs, 2)] = KP500000000 * (T1f + T1q);
+			 }
+			 Im[WS(rs, 3)] = KP500000000 * (T13 - TA);
+			 Ip[0] = KP500000000 * (TA + T13);
+			 Rm[WS(rs, 3)] = KP500000000 * (T1r - T1s);
+			 Rp[0] = KP500000000 * (T1r + T1s);
+			 Rp[WS(rs, 2)] = KP500000000 * (T1d + T1e);
+			 Rm[WS(rs, 1)] = KP500000000 * (T1d - T1e);
+			 T1H = T1u + T1t;
+			 T1v = T1t - T1u;
+			 T1L = T1D + T1E;
+			 T1F = T1D - T1E;
+		    }
+	       }
+	       {
+		    E T1y, T1I, T1B, T1J;
+		    T1y = T1w + T1x;
+		    T1I = T1w - T1x;
+		    T1B = T1z - T1A;
+		    T1J = T1z + T1A;
+		    {
+			 E T1M, T1K, T1C, T1G;
+			 T1M = T1I + T1J;
+			 T1K = T1I - T1J;
+			 T1C = T1y + T1B;
+			 T1G = T1B - T1y;
+			 Im[0] = -(KP500000000 * (FNMS(KP707106781, T1K, T1H)));
+			 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1K, T1H));
+			 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
+			 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
+			 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1G, T1F));
+			 Rm[0] = KP500000000 * (FNMS(KP707106781, T1G, T1F));
+			 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1C, T1v)));
+			 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1C, T1v));
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {60, 30, 22, 0} };
+
+void X(codelet_hc2cfdft_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include hc2cf.h */
+
+/*
+ * This function contains 82 FP additions, 44 FP multiplications,
+ * (or, 68 additions, 30 multiplications, 14 fused multiply/add),
+ * 39 stack variables, 2 constants, and 32 memory accesses
+ */
+#include "hc2cf.h"
+
+static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E Tv, TX, Ts, TY, TE, T1a, TJ, T19, T1l, T1m, T9, T10, Ti, T11, TP;
+	       E T16, TU, T17, T1i, T1j;
+	       {
+		    E Tt, Tu, TD, Tz, TA, TB, Tn, TI, Tr, TG, Tk, To;
+		    Tt = Ip[0];
+		    Tu = Im[0];
+		    TD = Tt + Tu;
+		    Tz = Rm[0];
+		    TA = Rp[0];
+		    TB = Tz - TA;
+		    {
+			 E Tl, Tm, Tp, Tq;
+			 Tl = Ip[WS(rs, 2)];
+			 Tm = Im[WS(rs, 2)];
+			 Tn = Tl - Tm;
+			 TI = Tl + Tm;
+			 Tp = Rp[WS(rs, 2)];
+			 Tq = Rm[WS(rs, 2)];
+			 Tr = Tp + Tq;
+			 TG = Tp - Tq;
+		    }
+		    Tv = Tt - Tu;
+		    TX = TA + Tz;
+		    Tk = W[6];
+		    To = W[7];
+		    Ts = FNMS(To, Tr, Tk * Tn);
+		    TY = FMA(Tk, Tr, To * Tn);
+		    {
+			 E Ty, TC, TF, TH;
+			 Ty = W[0];
+			 TC = W[1];
+			 TE = FNMS(TC, TD, Ty * TB);
+			 T1a = FMA(TC, TB, Ty * TD);
+			 TF = W[8];
+			 TH = W[9];
+			 TJ = FMA(TF, TG, TH * TI);
+			 T19 = FNMS(TH, TG, TF * TI);
+		    }
+		    T1l = TJ + TE;
+		    T1m = T1a - T19;
+	       }
+	       {
+		    E T4, TO, T8, TM, Td, TT, Th, TR;
+		    {
+			 E T2, T3, T6, T7;
+			 T2 = Ip[WS(rs, 1)];
+			 T3 = Im[WS(rs, 1)];
+			 T4 = T2 - T3;
+			 TO = T2 + T3;
+			 T6 = Rp[WS(rs, 1)];
+			 T7 = Rm[WS(rs, 1)];
+			 T8 = T6 + T7;
+			 TM = T6 - T7;
+		    }
+		    {
+			 E Tb, Tc, Tf, Tg;
+			 Tb = Ip[WS(rs, 3)];
+			 Tc = Im[WS(rs, 3)];
+			 Td = Tb - Tc;
+			 TT = Tb + Tc;
+			 Tf = Rp[WS(rs, 3)];
+			 Tg = Rm[WS(rs, 3)];
+			 Th = Tf + Tg;
+			 TR = Tf - Tg;
+		    }
+		    {
+			 E T1, T5, Ta, Te;
+			 T1 = W[2];
+			 T5 = W[3];
+			 T9 = FNMS(T5, T8, T1 * T4);
+			 T10 = FMA(T1, T8, T5 * T4);
+			 Ta = W[10];
+			 Te = W[11];
+			 Ti = FNMS(Te, Th, Ta * Td);
+			 T11 = FMA(Ta, Th, Te * Td);
+			 {
+			      E TL, TN, TQ, TS;
+			      TL = W[4];
+			      TN = W[5];
+			      TP = FMA(TL, TM, TN * TO);
+			      T16 = FNMS(TN, TM, TL * TO);
+			      TQ = W[12];
+			      TS = W[13];
+			      TU = FMA(TQ, TR, TS * TT);
+			      T17 = FNMS(TS, TR, TQ * TT);
+			 }
+			 T1i = T17 - T16;
+			 T1j = TP - TU;
+		    }
+	       }
+	       {
+		    E T1h, T1t, T1w, T1y, T1o, T1s, T1r, T1x;
+		    {
+			 E T1f, T1g, T1u, T1v;
+			 T1f = Tv - Ts;
+			 T1g = T10 - T11;
+			 T1h = KP500000000 * (T1f - T1g);
+			 T1t = KP500000000 * (T1g + T1f);
+			 T1u = T1i - T1j;
+			 T1v = T1l + T1m;
+			 T1w = KP353553390 * (T1u - T1v);
+			 T1y = KP353553390 * (T1u + T1v);
+		    }
+		    {
+			 E T1k, T1n, T1p, T1q;
+			 T1k = T1i + T1j;
+			 T1n = T1l - T1m;
+			 T1o = KP353553390 * (T1k + T1n);
+			 T1s = KP353553390 * (T1n - T1k);
+			 T1p = TX - TY;
+			 T1q = T9 - Ti;
+			 T1r = KP500000000 * (T1p - T1q);
+			 T1x = KP500000000 * (T1p + T1q);
+		    }
+		    Ip[WS(rs, 1)] = T1h + T1o;
+		    Rp[WS(rs, 1)] = T1x + T1y;
+		    Im[WS(rs, 2)] = T1o - T1h;
+		    Rm[WS(rs, 2)] = T1x - T1y;
+		    Rm[0] = T1r - T1s;
+		    Im[0] = T1w - T1t;
+		    Rp[WS(rs, 3)] = T1r + T1s;
+		    Ip[WS(rs, 3)] = T1t + T1w;
+	       }
+	       {
+		    E Tx, T15, T1c, T1e, TW, T14, T13, T1d;
+		    {
+			 E Tj, Tw, T18, T1b;
+			 Tj = T9 + Ti;
+			 Tw = Ts + Tv;
+			 Tx = Tj + Tw;
+			 T15 = Tw - Tj;
+			 T18 = T16 + T17;
+			 T1b = T19 + T1a;
+			 T1c = T18 - T1b;
+			 T1e = T18 + T1b;
+		    }
+		    {
+			 E TK, TV, TZ, T12;
+			 TK = TE - TJ;
+			 TV = TP + TU;
+			 TW = TK - TV;
+			 T14 = TV + TK;
+			 TZ = TX + TY;
+			 T12 = T10 + T11;
+			 T13 = TZ - T12;
+			 T1d = TZ + T12;
+		    }
+		    Ip[0] = KP500000000 * (Tx + TW);
+		    Rp[0] = KP500000000 * (T1d + T1e);
+		    Im[WS(rs, 3)] = KP500000000 * (TW - Tx);
+		    Rm[WS(rs, 3)] = KP500000000 * (T1d - T1e);
+		    Rm[WS(rs, 1)] = KP500000000 * (T13 - T14);
+		    Im[WS(rs, 1)] = KP500000000 * (T1c - T15);
+		    Rp[WS(rs, 2)] = KP500000000 * (T13 + T14);
+		    Ip[WS(rs, 2)] = KP500000000 * (T15 + T1c);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {68, 30, 14, 0} };
+
+void X(codelet_hc2cfdft_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:02 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include hf.h */
+
+/*
+ * This function contains 196 FP additions, 134 FP multiplications,
+ * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
+ * 106 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T35, T32;
+	       {
+		    E T2, Tf, TM, TO, T3, Tg, TN, TS, T4, Tp, T6, T5, Th;
+		    T2 = W[0];
+		    Tf = W[2];
+		    TM = W[6];
+		    TO = W[7];
+		    T3 = W[4];
+		    Tg = T2 * Tf;
+		    TN = T2 * TM;
+		    TS = T2 * TO;
+		    T4 = T2 * T3;
+		    Tp = Tf * T3;
+		    T6 = W[5];
+		    T5 = W[1];
+		    Th = W[3];
+		    {
+			 E TZ, Te, T1U, T3A, T3M, T2w, T1G, T2I, T3h, T1R, T2D, T2B, T3i, Tx, T3L;
+			 E T1Z, T3w, TL, T21, T26, T38, T1d, T2h, T2s, T3c, T1s, T2t, T2m, T3d, TX;
+			 E T10, TV, T2a, TY, T2b;
+			 {
+			      E TF, TP, TT, Tq, TW, Tz, Tu, TI, TC, T1m, T1f, T1p, T1j, Tr, Ts;
+			      E Tv, To, T1W;
+			      {
+				   E Ti, Tm, T1L, T1O, T1D, T1A, T1x, T2G, T1F, T2F;
+				   {
+					E T1, T7, Tb, T3z, T8, T1z, T9, Tc;
+					{
+					     E T1i, T1e, T1C, T1y, Tt, Ta, Tl;
+					     T1 = cr[0];
+					     Tt = Tf * T6;
+					     Ta = T2 * T6;
+					     T7 = FMA(T5, T6, T4);
+					     TF = FNMS(T5, T6, T4);
+					     TP = FMA(T5, TO, TN);
+					     TT = FNMS(T5, TM, TS);
+					     Tq = FNMS(Th, T6, Tp);
+					     TW = FMA(Th, T6, Tp);
+					     Tz = FMA(T5, Th, Tg);
+					     Ti = FNMS(T5, Th, Tg);
+					     Tl = T2 * Th;
+					     Tu = FMA(Th, T3, Tt);
+					     TZ = FNMS(Th, T3, Tt);
+					     TI = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     T1i = Ti * T6;
+					     T1e = Ti * T3;
+					     T1C = Tz * T6;
+					     T1y = Tz * T3;
+					     Tm = FMA(T5, Tf, Tl);
+					     TC = FNMS(T5, Tf, Tl);
+					     T3z = ci[0];
+					     T8 = cr[WS(rs, 8)];
+					     T1m = FNMS(Tm, T6, T1e);
+					     T1f = FMA(Tm, T6, T1e);
+					     T1p = FMA(Tm, T3, T1i);
+					     T1j = FNMS(Tm, T3, T1i);
+					     T1L = FNMS(TC, T6, T1y);
+					     T1z = FMA(TC, T6, T1y);
+					     T1O = FMA(TC, T3, T1C);
+					     T1D = FNMS(TC, T3, T1C);
+					     T9 = T7 * T8;
+					     Tc = ci[WS(rs, 8)];
+					}
+					{
+					     E T1u, T1w, T1v, T2E, T3y, T1B, T1E, Td, T3x;
+					     T1u = cr[WS(rs, 15)];
+					     T1w = ci[WS(rs, 15)];
+					     T1A = cr[WS(rs, 7)];
+					     Td = FMA(Tb, Tc, T9);
+					     T3x = T7 * Tc;
+					     T1v = TM * T1u;
+					     T2E = TM * T1w;
+					     Te = T1 + Td;
+					     T1U = T1 - Td;
+					     T3y = FNMS(Tb, T8, T3x);
+					     T1B = T1z * T1A;
+					     T1E = ci[WS(rs, 7)];
+					     T1x = FMA(TO, T1w, T1v);
+					     T3A = T3y + T3z;
+					     T3M = T3z - T3y;
+					     T2G = T1z * T1E;
+					     T1F = FMA(T1D, T1E, T1B);
+					     T2F = FNMS(TO, T1u, T2E);
+					}
+				   }
+				   {
+					E T1H, T1I, T1J, T1M, T1P, T2H;
+					T1H = cr[WS(rs, 3)];
+					T2H = FNMS(T1D, T1A, T2G);
+					T2w = T1x - T1F;
+					T1G = T1x + T1F;
+					T1I = Tf * T1H;
+					T2I = T2F - T2H;
+					T3h = T2F + T2H;
+					T1J = ci[WS(rs, 3)];
+					T1M = cr[WS(rs, 11)];
+					T1P = ci[WS(rs, 11)];
+					{
+					     E Tj, Tk, Tn, T1V;
+					     {
+						  E T1K, T2y, T1Q, T2A, T2x, T1N, T2z;
+						  Tj = cr[WS(rs, 4)];
+						  T1K = FMA(Th, T1J, T1I);
+						  T2x = Tf * T1J;
+						  T1N = T1L * T1M;
+						  T2z = T1L * T1P;
+						  Tk = Ti * Tj;
+						  T2y = FNMS(Th, T1H, T2x);
+						  T1Q = FMA(T1O, T1P, T1N);
+						  T2A = FNMS(T1O, T1M, T2z);
+						  Tn = ci[WS(rs, 4)];
+						  Tr = cr[WS(rs, 12)];
+						  T1R = T1K + T1Q;
+						  T2D = T1Q - T1K;
+						  T2B = T2y - T2A;
+						  T3i = T2y + T2A;
+						  T1V = Ti * Tn;
+						  Ts = Tq * Tr;
+						  Tv = ci[WS(rs, 12)];
+					     }
+					     To = FMA(Tm, Tn, Tk);
+					     T1W = FNMS(Tm, Tj, T1V);
+					}
+				   }
+			      }
+			      {
+				   E T19, T1b, T18, T2p, T1a, T2q;
+				   {
+					E TE, T23, TK, T25;
+					{
+					     E TA, TD, TB, T22, TG, TJ, TH, T24, T1Y, Tw, T1X;
+					     TA = cr[WS(rs, 2)];
+					     Tw = FMA(Tu, Tv, Ts);
+					     T1X = Tq * Tv;
+					     TD = ci[WS(rs, 2)];
+					     TB = Tz * TA;
+					     Tx = To + Tw;
+					     T3L = To - Tw;
+					     T1Y = FNMS(Tu, Tr, T1X);
+					     T22 = Tz * TD;
+					     TG = cr[WS(rs, 10)];
+					     TJ = ci[WS(rs, 10)];
+					     T1Z = T1W - T1Y;
+					     T3w = T1W + T1Y;
+					     TH = TF * TG;
+					     T24 = TF * TJ;
+					     TE = FMA(TC, TD, TB);
+					     T23 = FNMS(TC, TA, T22);
+					     TK = FMA(TI, TJ, TH);
+					     T25 = FNMS(TI, TG, T24);
+					}
+					{
+					     E T15, T17, T16, T2o;
+					     T15 = cr[WS(rs, 1)];
+					     T17 = ci[WS(rs, 1)];
+					     TL = TE + TK;
+					     T21 = TE - TK;
+					     T26 = T23 - T25;
+					     T38 = T23 + T25;
+					     T16 = T2 * T15;
+					     T2o = T2 * T17;
+					     T19 = cr[WS(rs, 9)];
+					     T1b = ci[WS(rs, 9)];
+					     T18 = FMA(T5, T17, T16);
+					     T2p = FNMS(T5, T15, T2o);
+					     T1a = T3 * T19;
+					     T2q = T3 * T1b;
+					}
+				   }
+				   {
+					E T1n, T1q, T1l, T2j, T1o, T2k;
+					{
+					     E T1g, T1k, T1h, T2i, T1c, T2r;
+					     T1g = cr[WS(rs, 5)];
+					     T1k = ci[WS(rs, 5)];
+					     T1c = FMA(T6, T1b, T1a);
+					     T2r = FNMS(T6, T19, T2q);
+					     T1h = T1f * T1g;
+					     T2i = T1f * T1k;
+					     T1d = T18 + T1c;
+					     T2h = T18 - T1c;
+					     T2s = T2p - T2r;
+					     T3c = T2p + T2r;
+					     T1n = cr[WS(rs, 13)];
+					     T1q = ci[WS(rs, 13)];
+					     T1l = FMA(T1j, T1k, T1h);
+					     T2j = FNMS(T1j, T1g, T2i);
+					     T1o = T1m * T1n;
+					     T2k = T1m * T1q;
+					}
+					{
+					     E TQ, TU, TR, T29, T1r, T2l;
+					     TQ = cr[WS(rs, 14)];
+					     TU = ci[WS(rs, 14)];
+					     T1r = FMA(T1p, T1q, T1o);
+					     T2l = FNMS(T1p, T1n, T2k);
+					     TR = TP * TQ;
+					     T29 = TP * TU;
+					     T1s = T1l + T1r;
+					     T2t = T1l - T1r;
+					     T2m = T2j - T2l;
+					     T3d = T2j + T2l;
+					     TX = cr[WS(rs, 6)];
+					     T10 = ci[WS(rs, 6)];
+					     TV = FMA(TT, TU, TR);
+					     T2a = FNMS(TT, TQ, T29);
+					     TY = TW * TX;
+					     T2b = TW * T10;
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T36, T3G, T3b, T3g, T28, T2d, T3F, T39, T3j, T3q, T3C, T3e, T3u, T3t;
+			      {
+				   E T3D, T1T, T3r, T14, T3E, T3s;
+				   {
+					E Ty, T3B, T11, T2c, T13, T3v;
+					T36 = Te - Tx;
+					Ty = Te + Tx;
+					T3B = T3w + T3A;
+					T3G = T3A - T3w;
+					T11 = FMA(TZ, T10, TY);
+					T2c = FNMS(TZ, TX, T2b);
+					{
+					     E T1t, T1S, T12, T37;
+					     T3b = T1d - T1s;
+					     T1t = T1d + T1s;
+					     T1S = T1G + T1R;
+					     T3g = T1G - T1R;
+					     T12 = TV + T11;
+					     T28 = TV - T11;
+					     T2d = T2a - T2c;
+					     T37 = T2a + T2c;
+					     T3D = T1S - T1t;
+					     T1T = T1t + T1S;
+					     T13 = TL + T12;
+					     T3F = TL - T12;
+					     T39 = T37 - T38;
+					     T3v = T38 + T37;
+					}
+					T3j = T3h - T3i;
+					T3r = T3h + T3i;
+					T3q = Ty - T13;
+					T14 = Ty + T13;
+					T3E = T3B - T3v;
+					T3C = T3v + T3B;
+					T3s = T3c + T3d;
+					T3e = T3c - T3d;
+				   }
+				   ci[WS(rs, 7)] = T14 - T1T;
+				   cr[WS(rs, 12)] = T3D - T3E;
+				   ci[WS(rs, 11)] = T3D + T3E;
+				   T3u = T3s + T3r;
+				   T3t = T3r - T3s;
+				   cr[0] = T14 + T1T;
+			      }
+			      {
+				   E T3m, T3a, T3J, T3H;
+				   ci[WS(rs, 15)] = T3u + T3C;
+				   cr[WS(rs, 8)] = T3u - T3C;
+				   ci[WS(rs, 3)] = T3q + T3t;
+				   cr[WS(rs, 4)] = T3q - T3t;
+				   T3m = T36 + T39;
+				   T3a = T36 - T39;
+				   T3J = T3G - T3F;
+				   T3H = T3F + T3G;
+				   {
+					E T2Q, T20, T3N, T3T, T2C, T2J, T3U, T2f, T33, T30, T2V, T2W, T3O, T2T, T2N;
+					E T2v;
+					{
+					     E T2R, T27, T2e, T2S;
+					     {
+						  E T3n, T3f, T3o, T3k;
+						  T2Q = T1U + T1Z;
+						  T20 = T1U - T1Z;
+						  T3n = T3b - T3e;
+						  T3f = T3b + T3e;
+						  T3o = T3g + T3j;
+						  T3k = T3g - T3j;
+						  T3N = T3L + T3M;
+						  T3T = T3M - T3L;
+						  {
+						       E T3p, T3K, T3I, T3l;
+						       T3p = T3n + T3o;
+						       T3K = T3o - T3n;
+						       T3I = T3k - T3f;
+						       T3l = T3f + T3k;
+						       ci[WS(rs, 1)] = FMA(KP707106781, T3p, T3m);
+						       cr[WS(rs, 6)] = FNMS(KP707106781, T3p, T3m);
+						       ci[WS(rs, 13)] = FMA(KP707106781, T3K, T3J);
+						       cr[WS(rs, 10)] = FMS(KP707106781, T3K, T3J);
+						       ci[WS(rs, 9)] = FMA(KP707106781, T3I, T3H);
+						       cr[WS(rs, 14)] = FMS(KP707106781, T3I, T3H);
+						       cr[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
+						       ci[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
+						       T2R = T21 + T26;
+						       T27 = T21 - T26;
+						       T2e = T28 + T2d;
+						       T2S = T28 - T2d;
+						  }
+					     }
+					     {
+						  E T2Y, T2Z, T2n, T2u;
+						  T2C = T2w - T2B;
+						  T2Y = T2w + T2B;
+						  T2Z = T2I + T2D;
+						  T2J = T2D - T2I;
+						  T3U = T2e - T27;
+						  T2f = T27 + T2e;
+						  T33 = FMA(KP414213562, T2Y, T2Z);
+						  T30 = FNMS(KP414213562, T2Z, T2Y);
+						  T2V = T2h + T2m;
+						  T2n = T2h - T2m;
+						  T2u = T2s + T2t;
+						  T2W = T2s - T2t;
+						  T3O = T2R - T2S;
+						  T2T = T2R + T2S;
+						  T2N = FMA(KP414213562, T2n, T2u);
+						  T2v = FNMS(KP414213562, T2u, T2n);
+					     }
+					}
+					{
+					     E T2M, T3S, T31, T2P, T3Q, T3R, T3P, T2U;
+					     {
+						  E T2g, T2X, T2O, T2K, T3V, T3X, T3W, T34, T2L, T3Y;
+						  T2M = FNMS(KP707106781, T2f, T20);
+						  T2g = FMA(KP707106781, T2f, T20);
+						  T34 = FNMS(KP414213562, T2V, T2W);
+						  T2X = FMA(KP414213562, T2W, T2V);
+						  T2O = FMA(KP414213562, T2C, T2J);
+						  T2K = FNMS(KP414213562, T2J, T2C);
+						  T3V = FMA(KP707106781, T3U, T3T);
+						  T3X = FNMS(KP707106781, T3U, T3T);
+						  T35 = T33 - T34;
+						  T3W = T34 + T33;
+						  T3S = T2K - T2v;
+						  T2L = T2v + T2K;
+						  T3Y = T30 - T2X;
+						  T31 = T2X + T30;
+						  ci[WS(rs, 14)] = FMA(KP923879532, T3W, T3V);
+						  cr[WS(rs, 9)] = FMS(KP923879532, T3W, T3V);
+						  ci[0] = FMA(KP923879532, T2L, T2g);
+						  cr[WS(rs, 7)] = FNMS(KP923879532, T2L, T2g);
+						  cr[WS(rs, 13)] = FMS(KP923879532, T3Y, T3X);
+						  ci[WS(rs, 10)] = FMA(KP923879532, T3Y, T3X);
+						  T2P = T2N + T2O;
+						  T3Q = T2O - T2N;
+					     }
+					     T32 = FNMS(KP707106781, T2T, T2Q);
+					     T2U = FMA(KP707106781, T2T, T2Q);
+					     T3R = FNMS(KP707106781, T3O, T3N);
+					     T3P = FMA(KP707106781, T3O, T3N);
+					     cr[WS(rs, 3)] = FMA(KP923879532, T2P, T2M);
+					     ci[WS(rs, 4)] = FNMS(KP923879532, T2P, T2M);
+					     cr[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
+					     ci[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
+					     ci[WS(rs, 8)] = FMA(KP923879532, T3Q, T3P);
+					     cr[WS(rs, 15)] = FMS(KP923879532, T3Q, T3P);
+					     ci[WS(rs, 12)] = FMA(KP923879532, T3S, T3R);
+					     cr[WS(rs, 11)] = FMS(KP923879532, T3S, T3R);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 2)] = FMA(KP923879532, T35, T32);
+	       cr[WS(rs, 5)] = FNMS(KP923879532, T35, T32);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {104, 42, 92, 0} };
+
+void X(codelet_hf2_16) (planner *p) {
+     X(khc2hc_register) (p, hf2_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include hf.h */
+
+/*
+ * This function contains 196 FP additions, 108 FP multiplications,
+ * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
+ * 82 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
+	       E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
+	       {
+		    E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 TE = Tm - Tn;
+			 TC = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 Tv = Tg * T6;
+			 Ta = T2 * T6;
+			 Ts = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 Tw = Ti * T3;
+			 Tb = T5 * T3;
+			 Tr = Tg * T3;
+		    }
+		    T8 = T4 + T7;
+		    TW = Tv - Tw;
+		    TJ = Ta + Tb;
+		    Tt = Tr - Ts;
+		    TU = Tr + Ts;
+		    Tc = Ta - Tb;
+		    Tx = Tv + Tw;
+		    TH = T4 - T7;
+		    TN = W[6];
+		    TO = W[7];
+		    TP = FMA(T2, TN, T5 * TO);
+		    TR = FNMS(T5, TN, T2 * TO);
+		    {
+			 E T1d, T1e, T19, T1a;
+			 T1d = Tk * T6;
+			 T1e = To * T3;
+			 T1f = T1d - T1e;
+			 T1k = T1d + T1e;
+			 T19 = Tk * T3;
+			 T1a = To * T6;
+			 T1b = T19 + T1a;
+			 T1i = T19 - T1a;
+		    }
+		    {
+			 E T1w, T1x, T1s, T1t;
+			 T1w = TC * T6;
+			 T1x = TE * T3;
+			 T1y = T1w - T1x;
+			 T1H = T1w + T1x;
+			 T1s = TC * T3;
+			 T1t = TE * T6;
+			 T1u = T1s + T1t;
+			 T1F = T1s - T1t;
+		    }
+	       }
+	       {
+		    E Tf, T3s, T1N, T3e, TA, T3r, T1Q, T3b, TM, T2N, T1W, T2w, TZ, T2M, T21;
+		    E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2E, T2o, T2D, T18, T1n, T2Q, T2R;
+		    E T2S, T2T, T28, T2B, T2d, T2A;
+		    {
+			 E T1, T3d, Te, T3c, T9, Td;
+			 T1 = cr[0];
+			 T3d = ci[0];
+			 T9 = cr[WS(rs, 8)];
+			 Td = ci[WS(rs, 8)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T3c = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T3s = T3d - T3c;
+			 T1N = T1 - Te;
+			 T3e = T3c + T3d;
+		    }
+		    {
+			 E Tq, T1O, Tz, T1P;
+			 {
+			      E Tl, Tp, Tu, Ty;
+			      Tl = cr[WS(rs, 4)];
+			      Tp = ci[WS(rs, 4)];
+			      Tq = FMA(Tk, Tl, To * Tp);
+			      T1O = FNMS(To, Tl, Tk * Tp);
+			      Tu = cr[WS(rs, 12)];
+			      Ty = ci[WS(rs, 12)];
+			      Tz = FMA(Tt, Tu, Tx * Ty);
+			      T1P = FNMS(Tx, Tu, Tt * Ty);
+			 }
+			 TA = Tq + Tz;
+			 T3r = Tq - Tz;
+			 T1Q = T1O - T1P;
+			 T3b = T1O + T1P;
+		    }
+		    {
+			 E TG, T1T, TL, T1U, T1S, T1V;
+			 {
+			      E TD, TF, TI, TK;
+			      TD = cr[WS(rs, 2)];
+			      TF = ci[WS(rs, 2)];
+			      TG = FMA(TC, TD, TE * TF);
+			      T1T = FNMS(TE, TD, TC * TF);
+			      TI = cr[WS(rs, 10)];
+			      TK = ci[WS(rs, 10)];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T1U = FNMS(TJ, TI, TH * TK);
+			 }
+			 TM = TG + TL;
+			 T2N = T1T + T1U;
+			 T1S = TG - TL;
+			 T1V = T1T - T1U;
+			 T1W = T1S - T1V;
+			 T2w = T1S + T1V;
+		    }
+		    {
+			 E TT, T1Y, TY, T1Z, T1X, T20;
+			 {
+			      E TQ, TS, TV, TX;
+			      TQ = cr[WS(rs, 14)];
+			      TS = ci[WS(rs, 14)];
+			      TT = FMA(TP, TQ, TR * TS);
+			      T1Y = FNMS(TR, TQ, TP * TS);
+			      TV = cr[WS(rs, 6)];
+			      TX = ci[WS(rs, 6)];
+			      TY = FMA(TU, TV, TW * TX);
+			      T1Z = FNMS(TW, TV, TU * TX);
+			 }
+			 TZ = TT + TY;
+			 T2M = T1Y + T1Z;
+			 T1X = TT - TY;
+			 T20 = T1Y - T1Z;
+			 T21 = T1X + T20;
+			 T2x = T1X - T20;
+		    }
+		    {
+			 E T1r, T2f, T1J, T2m, T1A, T2g, T1E, T2l;
+			 {
+			      E T1p, T1q, T1G, T1I;
+			      T1p = cr[WS(rs, 15)];
+			      T1q = ci[WS(rs, 15)];
+			      T1r = FMA(TN, T1p, TO * T1q);
+			      T2f = FNMS(TO, T1p, TN * T1q);
+			      T1G = cr[WS(rs, 11)];
+			      T1I = ci[WS(rs, 11)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T2m = FNMS(T1H, T1G, T1F * T1I);
+			 }
+			 {
+			      E T1v, T1z, T1C, T1D;
+			      T1v = cr[WS(rs, 7)];
+			      T1z = ci[WS(rs, 7)];
+			      T1A = FMA(T1u, T1v, T1y * T1z);
+			      T2g = FNMS(T1y, T1v, T1u * T1z);
+			      T1C = cr[WS(rs, 3)];
+			      T1D = ci[WS(rs, 3)];
+			      T1E = FMA(Tg, T1C, Ti * T1D);
+			      T2l = FNMS(Ti, T1C, Tg * T1D);
+			 }
+			 T1B = T1r + T1A;
+			 T1K = T1E + T1J;
+			 T2V = T1B - T1K;
+			 T2W = T2f + T2g;
+			 T2X = T2l + T2m;
+			 T2Y = T2W - T2X;
+			 {
+			      E T2h, T2i, T2k, T2n;
+			      T2h = T2f - T2g;
+			      T2i = T1E - T1J;
+			      T2j = T2h + T2i;
+			      T2E = T2h - T2i;
+			      T2k = T1r - T1A;
+			      T2n = T2l - T2m;
+			      T2o = T2k - T2n;
+			      T2D = T2k + T2n;
+			 }
+		    }
+		    {
+			 E T14, T29, T1m, T26, T17, T2a, T1h, T25;
+			 {
+			      E T12, T13, T1j, T1l;
+			      T12 = cr[WS(rs, 1)];
+			      T13 = ci[WS(rs, 1)];
+			      T14 = FMA(T2, T12, T5 * T13);
+			      T29 = FNMS(T5, T12, T2 * T13);
+			      T1j = cr[WS(rs, 13)];
+			      T1l = ci[WS(rs, 13)];
+			      T1m = FMA(T1i, T1j, T1k * T1l);
+			      T26 = FNMS(T1k, T1j, T1i * T1l);
+			 }
+			 {
+			      E T15, T16, T1c, T1g;
+			      T15 = cr[WS(rs, 9)];
+			      T16 = ci[WS(rs, 9)];
+			      T17 = FMA(T3, T15, T6 * T16);
+			      T2a = FNMS(T6, T15, T3 * T16);
+			      T1c = cr[WS(rs, 5)];
+			      T1g = ci[WS(rs, 5)];
+			      T1h = FMA(T1b, T1c, T1f * T1g);
+			      T25 = FNMS(T1f, T1c, T1b * T1g);
+			 }
+			 T18 = T14 + T17;
+			 T1n = T1h + T1m;
+			 T2Q = T18 - T1n;
+			 T2R = T29 + T2a;
+			 T2S = T25 + T26;
+			 T2T = T2R - T2S;
+			 {
+			      E T24, T27, T2b, T2c;
+			      T24 = T14 - T17;
+			      T27 = T25 - T26;
+			      T28 = T24 - T27;
+			      T2B = T24 + T27;
+			      T2b = T29 - T2a;
+			      T2c = T1h - T1m;
+			      T2d = T2b + T2c;
+			      T2A = T2b - T2c;
+			 }
+		    }
+		    {
+			 E T23, T2r, T3u, T3w, T2q, T3v, T2u, T3p;
+			 {
+			      E T1R, T22, T3q, T3t;
+			      T1R = T1N - T1Q;
+			      T22 = KP707106781 * (T1W + T21);
+			      T23 = T1R + T22;
+			      T2r = T1R - T22;
+			      T3q = KP707106781 * (T2w - T2x);
+			      T3t = T3r + T3s;
+			      T3u = T3q + T3t;
+			      T3w = T3t - T3q;
+			 }
+			 {
+			      E T2e, T2p, T2s, T2t;
+			      T2e = FNMS(KP382683432, T2d, KP923879532 * T28);
+			      T2p = FMA(KP382683432, T2j, KP923879532 * T2o);
+			      T2q = T2e + T2p;
+			      T3v = T2p - T2e;
+			      T2s = FMA(KP923879532, T2d, KP382683432 * T28);
+			      T2t = FNMS(KP923879532, T2j, KP382683432 * T2o);
+			      T2u = T2s + T2t;
+			      T3p = T2t - T2s;
+			 }
+			 cr[WS(rs, 7)] = T23 - T2q;
+			 cr[WS(rs, 11)] = T3v - T3w;
+			 ci[WS(rs, 12)] = T3v + T3w;
+			 ci[0] = T23 + T2q;
+			 ci[WS(rs, 4)] = T2r - T2u;
+			 cr[WS(rs, 15)] = T3p - T3u;
+			 ci[WS(rs, 8)] = T3p + T3u;
+			 cr[WS(rs, 3)] = T2r + T2u;
+		    }
+		    {
+			 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
+			 {
+			      E TB, T10, T3a, T3f;
+			      TB = Tf + TA;
+			      T10 = TM + TZ;
+			      T11 = TB + T10;
+			      T35 = TB - T10;
+			      T3a = T2N + T2M;
+			      T3f = T3b + T3e;
+			      T3g = T3a + T3f;
+			      T3i = T3f - T3a;
+			 }
+			 {
+			      E T1o, T1L, T36, T37;
+			      T1o = T18 + T1n;
+			      T1L = T1B + T1K;
+			      T1M = T1o + T1L;
+			      T3h = T1L - T1o;
+			      T36 = T2W + T2X;
+			      T37 = T2R + T2S;
+			      T38 = T36 - T37;
+			      T39 = T37 + T36;
+			 }
+			 ci[WS(rs, 7)] = T11 - T1M;
+			 cr[WS(rs, 12)] = T3h - T3i;
+			 ci[WS(rs, 11)] = T3h + T3i;
+			 cr[0] = T11 + T1M;
+			 cr[WS(rs, 4)] = T35 - T38;
+			 cr[WS(rs, 8)] = T39 - T3g;
+			 ci[WS(rs, 15)] = T39 + T3g;
+			 ci[WS(rs, 3)] = T35 + T38;
+		    }
+		    {
+			 E T2z, T2H, T3A, T3C, T2G, T3B, T2K, T3x;
+			 {
+			      E T2v, T2y, T3y, T3z;
+			      T2v = T1N + T1Q;
+			      T2y = KP707106781 * (T2w + T2x);
+			      T2z = T2v + T2y;
+			      T2H = T2v - T2y;
+			      T3y = KP707106781 * (T21 - T1W);
+			      T3z = T3s - T3r;
+			      T3A = T3y + T3z;
+			      T3C = T3z - T3y;
+			 }
+			 {
+			      E T2C, T2F, T2I, T2J;
+			      T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
+			      T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
+			      T2G = T2C + T2F;
+			      T3B = T2F - T2C;
+			      T2I = FNMS(KP923879532, T2A, KP382683432 * T2B);
+			      T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
+			      T2K = T2I + T2J;
+			      T3x = T2J - T2I;
+			 }
+			 ci[WS(rs, 6)] = T2z - T2G;
+			 cr[WS(rs, 13)] = T3B - T3C;
+			 ci[WS(rs, 10)] = T3B + T3C;
+			 cr[WS(rs, 1)] = T2z + T2G;
+			 cr[WS(rs, 5)] = T2H - T2K;
+			 cr[WS(rs, 9)] = T3x - T3A;
+			 ci[WS(rs, 14)] = T3x + T3A;
+			 ci[WS(rs, 2)] = T2H + T2K;
+		    }
+		    {
+			 E T2P, T31, T3m, T3o, T30, T3j, T34, T3n;
+			 {
+			      E T2L, T2O, T3k, T3l;
+			      T2L = Tf - TA;
+			      T2O = T2M - T2N;
+			      T2P = T2L - T2O;
+			      T31 = T2L + T2O;
+			      T3k = TM - TZ;
+			      T3l = T3e - T3b;
+			      T3m = T3k + T3l;
+			      T3o = T3l - T3k;
+			 }
+			 {
+			      E T2U, T2Z, T32, T33;
+			      T2U = T2Q + T2T;
+			      T2Z = T2V - T2Y;
+			      T30 = KP707106781 * (T2U + T2Z);
+			      T3j = KP707106781 * (T2Z - T2U);
+			      T32 = T2Q - T2T;
+			      T33 = T2V + T2Y;
+			      T34 = KP707106781 * (T32 + T33);
+			      T3n = KP707106781 * (T33 - T32);
+			 }
+			 ci[WS(rs, 5)] = T2P - T30;
+			 cr[WS(rs, 10)] = T3n - T3o;
+			 ci[WS(rs, 13)] = T3n + T3o;
+			 cr[WS(rs, 2)] = T2P + T30;
+			 cr[WS(rs, 6)] = T31 - T34;
+			 cr[WS(rs, 14)] = T3j - T3m;
+			 ci[WS(rs, 9)] = T3j + T3m;
+			 ci[WS(rs, 1)] = T31 + T34;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {156, 68, 40, 0} };
+
+void X(codelet_hf2_16) (planner *p) {
+     X(khc2hc_register) (p, hf2_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1062 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hf2_20 -include hf.h */
+
+/*
+ * This function contains 276 FP additions, 198 FP multiplications,
+ * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
+ * 146 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T5o, T5u, T5w, T5q, T5n, T5p, T5v, T5r;
+	       {
+		    E T2, Th, Tf, T6, T5, Tl, T1p, T1n, Ti, T3, Tt, Tv, T24, T1f, T1D;
+		    E Tb, T1P, Tm, T21, T1b, T7, T1A, Tw, T1H, T13, TA, T1L, T17, T1S, Tq;
+		    E T1o, T2g, T1t, T2c, TO, TK;
+		    {
+			 E T1e, Ta, Tk, Tg;
+			 T2 = W[0];
+			 Th = W[3];
+			 Tf = W[2];
+			 T6 = W[5];
+			 T5 = W[1];
+			 Tk = T2 * Th;
+			 Tg = T2 * Tf;
+			 T1e = Tf * T6;
+			 Ta = T2 * T6;
+			 Tl = FMA(T5, Tf, Tk);
+			 T1p = FNMS(T5, Tf, Tk);
+			 T1n = FMA(T5, Th, Tg);
+			 Ti = FNMS(T5, Th, Tg);
+			 T3 = W[4];
+			 Tt = W[6];
+			 Tv = W[7];
+			 {
+			      E Tp, Tj, TN, TJ;
+			      Tp = Ti * T6;
+			      T24 = FMA(Th, T3, T1e);
+			      T1f = FNMS(Th, T3, T1e);
+			      T1D = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      Tj = Ti * T3;
+			      {
+				   E T1a, T4, Tu, T1G;
+				   T1a = Tf * T3;
+				   T4 = T2 * T3;
+				   Tu = Ti * Tt;
+				   T1G = T2 * Tt;
+				   {
+					E T12, Tz, T1K, T16;
+					T12 = Tf * Tt;
+					Tz = Ti * Tv;
+					T1K = T2 * Tv;
+					T16 = Tf * Tv;
+					T1P = FNMS(Tl, T6, Tj);
+					Tm = FMA(Tl, T6, Tj);
+					T21 = FNMS(Th, T6, T1a);
+					T1b = FMA(Th, T6, T1a);
+					T7 = FNMS(T5, T6, T4);
+					T1A = FMA(T5, T6, T4);
+					Tw = FMA(Tl, Tv, Tu);
+					T1H = FMA(T5, Tv, T1G);
+					T13 = FMA(Th, Tv, T12);
+					TA = FNMS(Tl, Tt, Tz);
+					T1L = FNMS(T5, Tt, T1K);
+					T17 = FNMS(Th, Tt, T16);
+					T1S = FMA(Tl, T3, Tp);
+					Tq = FNMS(Tl, T3, Tp);
+				   }
+			      }
+			      T1o = T1n * T3;
+			      T2g = T1n * Tv;
+			      TN = Tm * Tv;
+			      TJ = Tm * Tt;
+			      T1t = T1n * T6;
+			      T2c = T1n * Tt;
+			      TO = FNMS(Tq, Tt, TN);
+			      TK = FMA(Tq, Tv, TJ);
+			 }
+		    }
+		    {
+			 E Te, T2C, T4K, T57, T58, TD, T2H, T4L, T3u, T3Z, T11, T2v, T2P, T3P, T4n;
+			 E T4v, T3C, T43, T2r, T2z, T3b, T3T, T4d, T4z, T3J, T42, T20, T2y, T34, T3S;
+			 E T4g, T4y, T1c, T19, T1d, T3j, T1w, T2U, T1g, T1j, T1l;
+			 {
+			      E T2d, T2h, T2k, T1q, T1u, T2n, TL, TI, TM, T3q, TZ, T2N, TP, TS, TU;
+			      {
+				   E T1, T4J, T8, T9, Tc;
+				   T1 = cr[0];
+				   T4J = ci[0];
+				   T8 = cr[WS(rs, 10)];
+				   T2d = FMA(T1p, Tv, T2c);
+				   T2h = FNMS(T1p, Tt, T2g);
+				   T2k = FMA(T1p, T6, T1o);
+				   T1q = FNMS(T1p, T6, T1o);
+				   T1u = FMA(T1p, T3, T1t);
+				   T2n = FNMS(T1p, T3, T1t);
+				   T9 = T7 * T8;
+				   Tc = ci[WS(rs, 10)];
+				   {
+					E Tx, Ts, T2F, TC, T2E;
+					{
+					     E Tn, Tr, To, T2D, T4I, Ty, TB, Td, T4H;
+					     Tn = cr[WS(rs, 5)];
+					     Tr = ci[WS(rs, 5)];
+					     Tx = cr[WS(rs, 15)];
+					     Td = FMA(Tb, Tc, T9);
+					     T4H = T7 * Tc;
+					     To = Tm * Tn;
+					     T2D = Tm * Tr;
+					     Te = T1 + Td;
+					     T2C = T1 - Td;
+					     T4I = FNMS(Tb, T8, T4H);
+					     Ty = Tw * Tx;
+					     TB = ci[WS(rs, 15)];
+					     Ts = FMA(Tq, Tr, To);
+					     T4K = T4I + T4J;
+					     T57 = T4J - T4I;
+					     T2F = Tw * TB;
+					     TC = FMA(TA, TB, Ty);
+					     T2E = FNMS(Tq, Tn, T2D);
+					}
+					{
+					     E TF, TG, TH, TW, TY, T2G, T3p, TX, T2M;
+					     TF = cr[WS(rs, 4)];
+					     T2G = FNMS(TA, Tx, T2F);
+					     T58 = Ts - TC;
+					     TD = Ts + TC;
+					     TG = Ti * TF;
+					     T2H = T2E - T2G;
+					     T4L = T2E + T2G;
+					     TH = ci[WS(rs, 4)];
+					     TW = cr[WS(rs, 19)];
+					     TY = ci[WS(rs, 19)];
+					     TL = cr[WS(rs, 14)];
+					     TI = FMA(Tl, TH, TG);
+					     T3p = Ti * TH;
+					     TX = Tt * TW;
+					     T2M = Tt * TY;
+					     TM = TK * TL;
+					     T3q = FNMS(Tl, TF, T3p);
+					     TZ = FMA(Tv, TY, TX);
+					     T2N = FNMS(Tv, TW, T2M);
+					     TP = ci[WS(rs, 14)];
+					     TS = cr[WS(rs, 9)];
+					     TU = ci[WS(rs, 9)];
+					}
+				   }
+			      }
+			      {
+				   E T27, T26, T28, T3y, T2p, T39, T29, T2e, T2i;
+				   {
+					E T22, T23, T25, T2l, T2o, T3x, T2m, T38;
+					{
+					     E TR, T2J, T3s, TV, T2L, T4m, T3t;
+					     T22 = cr[WS(rs, 12)];
+					     {
+						  E TQ, T3r, TT, T2K;
+						  TQ = FMA(TO, TP, TM);
+						  T3r = TK * TP;
+						  TT = T3 * TS;
+						  T2K = T3 * TU;
+						  TR = TI + TQ;
+						  T2J = TI - TQ;
+						  T3s = FNMS(TO, TL, T3r);
+						  TV = FMA(T6, TU, TT);
+						  T2L = FNMS(T6, TS, T2K);
+						  T23 = T21 * T22;
+					     }
+					     T4m = T3q + T3s;
+					     T3t = T3q - T3s;
+					     {
+						  E T10, T3o, T4l, T2O;
+						  T10 = TV + TZ;
+						  T3o = TZ - TV;
+						  T4l = T2L + T2N;
+						  T2O = T2L - T2N;
+						  T3u = T3o - T3t;
+						  T3Z = T3t + T3o;
+						  T11 = TR - T10;
+						  T2v = TR + T10;
+						  T2P = T2J - T2O;
+						  T3P = T2J + T2O;
+						  T4n = T4l - T4m;
+						  T4v = T4m + T4l;
+						  T25 = ci[WS(rs, 12)];
+					     }
+					}
+					T2l = cr[WS(rs, 7)];
+					T2o = ci[WS(rs, 7)];
+					T27 = cr[WS(rs, 2)];
+					T26 = FMA(T24, T25, T23);
+					T3x = T21 * T25;
+					T2m = T2k * T2l;
+					T38 = T2k * T2o;
+					T28 = T1n * T27;
+					T3y = FNMS(T24, T22, T3x);
+					T2p = FMA(T2n, T2o, T2m);
+					T39 = FNMS(T2n, T2l, T38);
+					T29 = ci[WS(rs, 2)];
+					T2e = cr[WS(rs, 17)];
+					T2i = ci[WS(rs, 17)];
+				   }
+				   {
+					E T1I, T1F, T1J, T3F, T1Y, T32, T1M, T1Q, T1T;
+					{
+					     E T1B, T1C, T1E, T1V, T1X, T3E, T1W, T31;
+					     {
+						  E T2b, T35, T3A, T2j, T37, T4c, T3B;
+						  T1B = cr[WS(rs, 8)];
+						  {
+						       E T2a, T3z, T2f, T36;
+						       T2a = FMA(T1p, T29, T28);
+						       T3z = T1n * T29;
+						       T2f = T2d * T2e;
+						       T36 = T2d * T2i;
+						       T2b = T26 + T2a;
+						       T35 = T26 - T2a;
+						       T3A = FNMS(T1p, T27, T3z);
+						       T2j = FMA(T2h, T2i, T2f);
+						       T37 = FNMS(T2h, T2e, T36);
+						       T1C = T1A * T1B;
+						  }
+						  T4c = T3y + T3A;
+						  T3B = T3y - T3A;
+						  {
+						       E T2q, T3w, T4b, T3a;
+						       T2q = T2j + T2p;
+						       T3w = T2p - T2j;
+						       T4b = T37 + T39;
+						       T3a = T37 - T39;
+						       T3C = T3w - T3B;
+						       T43 = T3B + T3w;
+						       T2r = T2b - T2q;
+						       T2z = T2b + T2q;
+						       T3b = T35 - T3a;
+						       T3T = T35 + T3a;
+						       T4d = T4b - T4c;
+						       T4z = T4c + T4b;
+						       T1E = ci[WS(rs, 8)];
+						  }
+					     }
+					     T1V = cr[WS(rs, 3)];
+					     T1X = ci[WS(rs, 3)];
+					     T1I = cr[WS(rs, 18)];
+					     T1F = FMA(T1D, T1E, T1C);
+					     T3E = T1A * T1E;
+					     T1W = Tf * T1V;
+					     T31 = Tf * T1X;
+					     T1J = T1H * T1I;
+					     T3F = FNMS(T1D, T1B, T3E);
+					     T1Y = FMA(Th, T1X, T1W);
+					     T32 = FNMS(Th, T1V, T31);
+					     T1M = ci[WS(rs, 18)];
+					     T1Q = cr[WS(rs, 13)];
+					     T1T = ci[WS(rs, 13)];
+					}
+					{
+					     E T14, T15, T18, T1r, T1v, T3i, T1s, T2T;
+					     {
+						  E T1O, T2Y, T3H, T1U, T30, T4f, T3I;
+						  T14 = cr[WS(rs, 16)];
+						  {
+						       E T1N, T3G, T1R, T2Z;
+						       T1N = FMA(T1L, T1M, T1J);
+						       T3G = T1H * T1M;
+						       T1R = T1P * T1Q;
+						       T2Z = T1P * T1T;
+						       T1O = T1F + T1N;
+						       T2Y = T1F - T1N;
+						       T3H = FNMS(T1L, T1I, T3G);
+						       T1U = FMA(T1S, T1T, T1R);
+						       T30 = FNMS(T1S, T1Q, T2Z);
+						       T15 = T13 * T14;
+						  }
+						  T4f = T3F + T3H;
+						  T3I = T3F - T3H;
+						  {
+						       E T1Z, T3D, T4e, T33;
+						       T1Z = T1U + T1Y;
+						       T3D = T1Y - T1U;
+						       T4e = T30 + T32;
+						       T33 = T30 - T32;
+						       T3J = T3D - T3I;
+						       T42 = T3I + T3D;
+						       T20 = T1O - T1Z;
+						       T2y = T1O + T1Z;
+						       T34 = T2Y - T33;
+						       T3S = T2Y + T33;
+						       T4g = T4e - T4f;
+						       T4y = T4f + T4e;
+						       T18 = ci[WS(rs, 16)];
+						  }
+					     }
+					     T1r = cr[WS(rs, 11)];
+					     T1v = ci[WS(rs, 11)];
+					     T1c = cr[WS(rs, 6)];
+					     T19 = FMA(T17, T18, T15);
+					     T3i = T13 * T18;
+					     T1s = T1q * T1r;
+					     T2T = T1q * T1v;
+					     T1d = T1b * T1c;
+					     T3j = FNMS(T17, T14, T3i);
+					     T1w = FMA(T1u, T1v, T1s);
+					     T2U = FNMS(T1u, T1r, T2T);
+					     T1g = ci[WS(rs, 6)];
+					     T1j = cr[WS(rs, 1)];
+					     T1l = ci[WS(rs, 1)];
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T4F, T4Q, T4R, T5a, T4E, T5b, T2I, T5h, T5g, T4W, T4X, T53, T52, T5l, T5m;
+			      E T5s, T2X, T3N, T3L, T3c, T5t;
+			      {
+				   E T2u, T3n, T2w, T2W, T4w, T4r, T4p, T45, T47, T3O, T3R, T4a, T4q, T3U;
+				   {
+					E T4h, TE, T40, T3Q, T4k, T1z, T2s, T49, T48;
+					{
+					     E T1i, T2Q, T3l, T1m, T2S, T4j, T3m;
+					     T4h = T4d - T4g;
+					     T4F = T4g + T4d;
+					     {
+						  E T1h, T3k, T1k, T2R;
+						  T1h = FMA(T1f, T1g, T1d);
+						  T3k = T1b * T1g;
+						  T1k = T2 * T1j;
+						  T2R = T2 * T1l;
+						  T1i = T19 + T1h;
+						  T2Q = T19 - T1h;
+						  T3l = FNMS(T1f, T1c, T3k);
+						  T1m = FMA(T5, T1l, T1k);
+						  T2S = FNMS(T5, T1j, T2R);
+					     }
+					     TE = Te - TD;
+					     T2u = Te + TD;
+					     T4j = T3j + T3l;
+					     T3m = T3j - T3l;
+					     {
+						  E T1x, T3h, T4i, T2V, T1y;
+						  T1x = T1m + T1w;
+						  T3h = T1w - T1m;
+						  T4i = T2S + T2U;
+						  T2V = T2S - T2U;
+						  T3n = T3h - T3m;
+						  T40 = T3m + T3h;
+						  T1y = T1i - T1x;
+						  T2w = T1i + T1x;
+						  T2W = T2Q - T2V;
+						  T3Q = T2Q + T2V;
+						  T4k = T4i - T4j;
+						  T4w = T4j + T4i;
+						  T4Q = T1y - T11;
+						  T1z = T11 + T1y;
+						  T2s = T20 + T2r;
+						  T4R = T20 - T2r;
+					     }
+					}
+					{
+					     E T41, T4o, T44, T2t;
+					     T5a = T3Z + T40;
+					     T41 = T3Z - T40;
+					     T4o = T4k - T4n;
+					     T4E = T4n + T4k;
+					     T5b = T42 + T43;
+					     T44 = T42 - T43;
+					     T49 = T1z - T2s;
+					     T2t = T1z + T2s;
+					     T4r = FMA(KP618033988, T4h, T4o);
+					     T4p = FNMS(KP618033988, T4o, T4h);
+					     T45 = FMA(KP618033988, T44, T41);
+					     T47 = FNMS(KP618033988, T41, T44);
+					     ci[WS(rs, 9)] = TE + T2t;
+					     T48 = FNMS(KP250000000, T2t, TE);
+					}
+					T3O = T2C + T2H;
+					T2I = T2C - T2H;
+					T5h = T3P - T3Q;
+					T3R = T3P + T3Q;
+					T4a = FNMS(KP559016994, T49, T48);
+					T4q = FMA(KP559016994, T49, T48);
+					T3U = T3S + T3T;
+					T5g = T3S - T3T;
+				   }
+				   {
+					E T2x, T4B, T4D, T2A, T3Y, T46;
+					{
+					     E T4x, T3X, T3V, T4A, T3W;
+					     T4W = T4v + T4w;
+					     T4x = T4v - T4w;
+					     ci[WS(rs, 1)] = FMA(KP951056516, T4p, T4a);
+					     cr[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a);
+					     cr[WS(rs, 6)] = FMA(KP951056516, T4r, T4q);
+					     ci[WS(rs, 5)] = FNMS(KP951056516, T4r, T4q);
+					     T3X = T3R - T3U;
+					     T3V = T3R + T3U;
+					     T4A = T4y - T4z;
+					     T4X = T4y + T4z;
+					     T2x = T2v + T2w;
+					     T53 = T2v - T2w;
+					     cr[WS(rs, 5)] = T3O + T3V;
+					     T3W = FNMS(KP250000000, T3V, T3O);
+					     T4B = FMA(KP618033988, T4A, T4x);
+					     T4D = FNMS(KP618033988, T4x, T4A);
+					     T52 = T2z - T2y;
+					     T2A = T2y + T2z;
+					     T3Y = FMA(KP559016994, T3X, T3W);
+					     T46 = FNMS(KP559016994, T3X, T3W);
+					}
+					{
+					     E T3v, T4t, T4s, T3K, T2B, T4u, T4C;
+					     T3v = T3n - T3u;
+					     T5l = T3u + T3n;
+					     T2B = T2x + T2A;
+					     T4t = T2x - T2A;
+					     cr[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
+					     cr[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
+					     ci[WS(rs, 6)] = FMA(KP951056516, T47, T46);
+					     ci[WS(rs, 2)] = FNMS(KP951056516, T47, T46);
+					     cr[0] = T2u + T2B;
+					     T4s = FNMS(KP250000000, T2B, T2u);
+					     T5m = T3J + T3C;
+					     T3K = T3C - T3J;
+					     T5s = T2P - T2W;
+					     T2X = T2P + T2W;
+					     T4u = FMA(KP559016994, T4t, T4s);
+					     T4C = FNMS(KP559016994, T4t, T4s);
+					     T3N = FNMS(KP618033988, T3v, T3K);
+					     T3L = FMA(KP618033988, T3K, T3v);
+					     ci[WS(rs, 3)] = FMA(KP951056516, T4B, T4u);
+					     cr[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u);
+					     cr[WS(rs, 8)] = FMA(KP951056516, T4D, T4C);
+					     ci[WS(rs, 7)] = FNMS(KP951056516, T4D, T4C);
+					     T3c = T34 + T3b;
+					     T5t = T34 - T3b;
+					}
+				   }
+			      }
+			      {
+				   E T4V, T5i, T5k, T59, T5e, T5c;
+				   {
+					E T4M, T3f, T4U, T4S, T3e, T3d;
+					T4V = T4L + T4K;
+					T4M = T4K - T4L;
+					T3f = T2X - T3c;
+					T3d = T2X + T3c;
+					T4U = FMA(KP618033988, T4Q, T4R);
+					T4S = FNMS(KP618033988, T4R, T4Q);
+					ci[WS(rs, 4)] = T2I + T3d;
+					T3e = FNMS(KP250000000, T3d, T2I);
+					{
+					     E T4O, T4N, T3g, T3M, T4G, T4T, T4P;
+					     T3g = FMA(KP559016994, T3f, T3e);
+					     T3M = FNMS(KP559016994, T3f, T3e);
+					     T4O = T4F - T4E;
+					     T4G = T4E + T4F;
+					     ci[WS(rs, 8)] = FMA(KP951056516, T3L, T3g);
+					     ci[0] = FNMS(KP951056516, T3L, T3g);
+					     cr[WS(rs, 7)] = FNMS(KP951056516, T3N, T3M);
+					     cr[WS(rs, 3)] = FMA(KP951056516, T3N, T3M);
+					     cr[WS(rs, 10)] = T4G - T4M;
+					     T4N = FMA(KP250000000, T4G, T4M);
+					     T5i = FNMS(KP618033988, T5h, T5g);
+					     T5k = FMA(KP618033988, T5g, T5h);
+					     T59 = T57 - T58;
+					     T5o = T58 + T57;
+					     T4T = FNMS(KP559016994, T4O, T4N);
+					     T4P = FMA(KP559016994, T4O, T4N);
+					     ci[WS(rs, 13)] = FMA(KP951056516, T4S, T4P);
+					     cr[WS(rs, 14)] = FMS(KP951056516, T4S, T4P);
+					     ci[WS(rs, 17)] = FMA(KP951056516, T4U, T4T);
+					     cr[WS(rs, 18)] = FMS(KP951056516, T4U, T4T);
+					     T5e = T5a - T5b;
+					     T5c = T5a + T5b;
+					}
+				   }
+				   {
+					E T56, T54, T4Y, T50, T5d, T5f, T5j, T4Z, T55, T51;
+					ci[WS(rs, 14)] = T5c + T59;
+					T5d = FNMS(KP250000000, T5c, T59);
+					T56 = FNMS(KP618033988, T52, T53);
+					T54 = FMA(KP618033988, T53, T52);
+					T5f = FNMS(KP559016994, T5e, T5d);
+					T5j = FMA(KP559016994, T5e, T5d);
+					cr[WS(rs, 17)] = -(FMA(KP951056516, T5i, T5f));
+					cr[WS(rs, 13)] = FMS(KP951056516, T5i, T5f);
+					ci[WS(rs, 18)] = FNMS(KP951056516, T5k, T5j);
+					ci[WS(rs, 10)] = FMA(KP951056516, T5k, T5j);
+					T4Y = T4W + T4X;
+					T50 = T4W - T4X;
+					ci[WS(rs, 19)] = T4Y + T4V;
+					T4Z = FNMS(KP250000000, T4Y, T4V);
+					T5u = FMA(KP618033988, T5t, T5s);
+					T5w = FNMS(KP618033988, T5s, T5t);
+					T55 = FMA(KP559016994, T50, T4Z);
+					T51 = FNMS(KP559016994, T50, T4Z);
+					ci[WS(rs, 11)] = FMA(KP951056516, T54, T51);
+					cr[WS(rs, 12)] = FMS(KP951056516, T54, T51);
+					ci[WS(rs, 15)] = FMA(KP951056516, T56, T55);
+					cr[WS(rs, 16)] = FMS(KP951056516, T56, T55);
+					T5q = T5l - T5m;
+					T5n = T5l + T5m;
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 15)] = T5n - T5o;
+	       T5p = FMA(KP250000000, T5n, T5o);
+	       T5v = FMA(KP559016994, T5q, T5p);
+	       T5r = FNMS(KP559016994, T5q, T5p);
+	       cr[WS(rs, 19)] = -(FMA(KP951056516, T5u, T5r));
+	       cr[WS(rs, 11)] = FMS(KP951056516, T5u, T5r);
+	       ci[WS(rs, 16)] = FNMS(KP951056516, T5w, T5v);
+	       ci[WS(rs, 12)] = FMA(KP951056516, T5w, T5v);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hf2_20", twinstr, &GENUS, {136, 58, 140, 0} };
+
+void X(codelet_hf2_20) (planner *p) {
+     X(khc2hc_register) (p, hf2_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hf2_20 -include hf.h */
+
+/*
+ * This function contains 276 FP additions, 164 FP multiplications,
+ * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
+ * 123 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
+	       E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
+	       E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
+	       {
+		    E T7, T16, Ta, T13, T4, T17, Tb, T12;
+		    {
+			 E Th, Tn, Tj, Tm;
+			 T2 = W[0];
+			 T5 = W[1];
+			 Tg = W[2];
+			 Ti = W[3];
+			 Th = T2 * Tg;
+			 Tn = T5 * Tg;
+			 Tj = T5 * Ti;
+			 Tm = T2 * Ti;
+			 Tk = Th - Tj;
+			 To = Tm + Tn;
+			 T1h = Tm - Tn;
+			 T1f = Th + Tj;
+			 T6 = W[5];
+			 T7 = T5 * T6;
+			 T16 = Tg * T6;
+			 Ta = T2 * T6;
+			 T13 = Ti * T6;
+			 T3 = W[4];
+			 T4 = T2 * T3;
+			 T17 = Ti * T3;
+			 Tb = T5 * T3;
+			 T12 = Tg * T3;
+		    }
+		    T8 = T4 - T7;
+		    T14 = T12 + T13;
+		    T1Q = T16 + T17;
+		    Tc = Ta + Tb;
+		    T1O = T12 - T13;
+		    T1v = Ta - Tb;
+		    T18 = T16 - T17;
+		    T1t = T4 + T7;
+		    {
+			 E T1l, T1m, T1g, T1i;
+			 T1l = T1f * T6;
+			 T1m = T1h * T3;
+			 T1n = T1l + T1m;
+			 T24 = T1l - T1m;
+			 T1g = T1f * T3;
+			 T1i = T1h * T6;
+			 T1j = T1g - T1i;
+			 T22 = T1g + T1i;
+			 {
+			      E Tl, Tp, Ts, Tt;
+			      Tl = Tk * T3;
+			      Tp = To * T6;
+			      Tq = Tl + Tp;
+			      Ts = Tk * T6;
+			      Tt = To * T3;
+			      Tu = Ts - Tt;
+			      T1E = Tl - Tp;
+			      T1G = Ts + Tt;
+			      Tx = W[6];
+			      Ty = W[7];
+			      Tz = FMA(Tk, Tx, To * Ty);
+			      TJ = FMA(Tq, Tx, Tu * Ty);
+			      T1Z = FNMS(T1h, Tx, T1f * Ty);
+			      TB = FNMS(To, Tx, Tk * Ty);
+			      T1X = FMA(T1f, Tx, T1h * Ty);
+			      T1A = FNMS(T5, Tx, T2 * Ty);
+			      TZ = FNMS(Ti, Tx, Tg * Ty);
+			      TL = FNMS(Tu, Tx, Tq * Ty);
+			      T1y = FMA(T2, Tx, T5 * Ty);
+			      TX = FMA(Tg, Tx, Ti * Ty);
+			 }
+		    }
+	       }
+	       {
+		    E TF, T2b, T4D, T4M, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T43, T3X;
+		    E T3Y, T4o, T2f, T2g, T2h, T2y, T2D, T2E, T3g, T3h, T4z, T3n, T3o, T3p, T33;
+		    E T38, T4K, TW, T1r, T1s, T3J, T3M, T44, T3U, T3V, T4n, T2c, T2d, T2e, T2n;
+		    E T2s, T2t, T3d, T3e, T4y, T3k, T3l, T3m, T2S, T2X, T4J;
+		    {
+			 E T1, T47, Te, T46, Tw, T2H, TD, T2I, T9, Td;
+			 T1 = cr[0];
+			 T47 = ci[0];
+			 T9 = cr[WS(rs, 10)];
+			 Td = ci[WS(rs, 10)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T46 = FNMS(Tc, T9, T8 * Td);
+			 {
+			      E Tr, Tv, TA, TC;
+			      Tr = cr[WS(rs, 5)];
+			      Tv = ci[WS(rs, 5)];
+			      Tw = FMA(Tq, Tr, Tu * Tv);
+			      T2H = FNMS(Tu, Tr, Tq * Tv);
+			      TA = cr[WS(rs, 15)];
+			      TC = ci[WS(rs, 15)];
+			      TD = FMA(Tz, TA, TB * TC);
+			      T2I = FNMS(TB, TA, Tz * TC);
+			 }
+			 {
+			      E Tf, TE, T4B, T4C;
+			      Tf = T1 + Te;
+			      TE = Tw + TD;
+			      TF = Tf - TE;
+			      T2b = Tf + TE;
+			      T4B = T47 - T46;
+			      T4C = Tw - TD;
+			      T4D = T4B - T4C;
+			      T4M = T4C + T4B;
+			 }
+			 {
+			      E T2G, T2J, T48, T49;
+			      T2G = T1 - Te;
+			      T2J = T2H - T2I;
+			      T2K = T2G - T2J;
+			      T3r = T2G + T2J;
+			      T48 = T46 + T47;
+			      T49 = T2H + T2I;
+			      T4a = T48 - T49;
+			      T4m = T49 + T48;
+			 }
+		    }
+		    {
+			 E T1D, T3A, T2u, T31, T27, T3D, T2C, T37, T1M, T3B, T2x, T32, T1W, T3E, T2z;
+			 E T36;
+			 {
+			      E T1x, T2Z, T1C, T30;
+			      {
+				   E T1u, T1w, T1z, T1B;
+				   T1u = cr[WS(rs, 8)];
+				   T1w = ci[WS(rs, 8)];
+				   T1x = FMA(T1t, T1u, T1v * T1w);
+				   T2Z = FNMS(T1v, T1u, T1t * T1w);
+				   T1z = cr[WS(rs, 18)];
+				   T1B = ci[WS(rs, 18)];
+				   T1C = FMA(T1y, T1z, T1A * T1B);
+				   T30 = FNMS(T1A, T1z, T1y * T1B);
+			      }
+			      T1D = T1x + T1C;
+			      T3A = T2Z + T30;
+			      T2u = T1x - T1C;
+			      T31 = T2Z - T30;
+			 }
+			 {
+			      E T21, T2A, T26, T2B;
+			      {
+				   E T1Y, T20, T23, T25;
+				   T1Y = cr[WS(rs, 17)];
+				   T20 = ci[WS(rs, 17)];
+				   T21 = FMA(T1X, T1Y, T1Z * T20);
+				   T2A = FNMS(T1Z, T1Y, T1X * T20);
+				   T23 = cr[WS(rs, 7)];
+				   T25 = ci[WS(rs, 7)];
+				   T26 = FMA(T22, T23, T24 * T25);
+				   T2B = FNMS(T24, T23, T22 * T25);
+			      }
+			      T27 = T21 + T26;
+			      T3D = T2A + T2B;
+			      T2C = T2A - T2B;
+			      T37 = T21 - T26;
+			 }
+			 {
+			      E T1I, T2v, T1L, T2w;
+			      {
+				   E T1F, T1H, T1J, T1K;
+				   T1F = cr[WS(rs, 13)];
+				   T1H = ci[WS(rs, 13)];
+				   T1I = FMA(T1E, T1F, T1G * T1H);
+				   T2v = FNMS(T1G, T1F, T1E * T1H);
+				   T1J = cr[WS(rs, 3)];
+				   T1K = ci[WS(rs, 3)];
+				   T1L = FMA(Tg, T1J, Ti * T1K);
+				   T2w = FNMS(Ti, T1J, Tg * T1K);
+			      }
+			      T1M = T1I + T1L;
+			      T3B = T2v + T2w;
+			      T2x = T2v - T2w;
+			      T32 = T1I - T1L;
+			 }
+			 {
+			      E T1S, T34, T1V, T35;
+			      {
+				   E T1P, T1R, T1T, T1U;
+				   T1P = cr[WS(rs, 12)];
+				   T1R = ci[WS(rs, 12)];
+				   T1S = FMA(T1O, T1P, T1Q * T1R);
+				   T34 = FNMS(T1Q, T1P, T1O * T1R);
+				   T1T = cr[WS(rs, 2)];
+				   T1U = ci[WS(rs, 2)];
+				   T1V = FMA(T1f, T1T, T1h * T1U);
+				   T35 = FNMS(T1h, T1T, T1f * T1U);
+			      }
+			      T1W = T1S + T1V;
+			      T3E = T34 + T35;
+			      T2z = T1S - T1V;
+			      T36 = T34 - T35;
+			 }
+			 T1N = T1D - T1M;
+			 T28 = T1W - T27;
+			 T29 = T1N + T28;
+			 T3C = T3A - T3B;
+			 T3F = T3D - T3E;
+			 T43 = T3F - T3C;
+			 T3X = T3A + T3B;
+			 T3Y = T3E + T3D;
+			 T4o = T3X + T3Y;
+			 T2f = T1D + T1M;
+			 T2g = T1W + T27;
+			 T2h = T2f + T2g;
+			 T2y = T2u - T2x;
+			 T2D = T2z - T2C;
+			 T2E = T2y + T2D;
+			 T3g = T31 - T32;
+			 T3h = T36 - T37;
+			 T4z = T3g + T3h;
+			 T3n = T2u + T2x;
+			 T3o = T2z + T2C;
+			 T3p = T3n + T3o;
+			 T33 = T31 + T32;
+			 T38 = T36 + T37;
+			 T4K = T33 + T38;
+		    }
+		    {
+			 E TO, T3H, T2j, T2Q, T1q, T3L, T2r, T2T, TV, T3I, T2m, T2R, T1b, T3K, T2o;
+			 E T2W;
+			 {
+			      E TI, T2O, TN, T2P;
+			      {
+				   E TG, TH, TK, TM;
+				   TG = cr[WS(rs, 4)];
+				   TH = ci[WS(rs, 4)];
+				   TI = FMA(Tk, TG, To * TH);
+				   T2O = FNMS(To, TG, Tk * TH);
+				   TK = cr[WS(rs, 14)];
+				   TM = ci[WS(rs, 14)];
+				   TN = FMA(TJ, TK, TL * TM);
+				   T2P = FNMS(TL, TK, TJ * TM);
+			      }
+			      TO = TI + TN;
+			      T3H = T2O + T2P;
+			      T2j = TI - TN;
+			      T2Q = T2O - T2P;
+			 }
+			 {
+			      E T1e, T2p, T1p, T2q;
+			      {
+				   E T1c, T1d, T1k, T1o;
+				   T1c = cr[WS(rs, 1)];
+				   T1d = ci[WS(rs, 1)];
+				   T1e = FMA(T2, T1c, T5 * T1d);
+				   T2p = FNMS(T5, T1c, T2 * T1d);
+				   T1k = cr[WS(rs, 11)];
+				   T1o = ci[WS(rs, 11)];
+				   T1p = FMA(T1j, T1k, T1n * T1o);
+				   T2q = FNMS(T1n, T1k, T1j * T1o);
+			      }
+			      T1q = T1e + T1p;
+			      T3L = T2p + T2q;
+			      T2r = T2p - T2q;
+			      T2T = T1p - T1e;
+			 }
+			 {
+			      E TR, T2k, TU, T2l;
+			      {
+				   E TP, TQ, TS, TT;
+				   TP = cr[WS(rs, 9)];
+				   TQ = ci[WS(rs, 9)];
+				   TR = FMA(T3, TP, T6 * TQ);
+				   T2k = FNMS(T6, TP, T3 * TQ);
+				   TS = cr[WS(rs, 19)];
+				   TT = ci[WS(rs, 19)];
+				   TU = FMA(Tx, TS, Ty * TT);
+				   T2l = FNMS(Ty, TS, Tx * TT);
+			      }
+			      TV = TR + TU;
+			      T3I = T2k + T2l;
+			      T2m = T2k - T2l;
+			      T2R = TR - TU;
+			 }
+			 {
+			      E T11, T2U, T1a, T2V;
+			      {
+				   E TY, T10, T15, T19;
+				   TY = cr[WS(rs, 16)];
+				   T10 = ci[WS(rs, 16)];
+				   T11 = FMA(TX, TY, TZ * T10);
+				   T2U = FNMS(TZ, TY, TX * T10);
+				   T15 = cr[WS(rs, 6)];
+				   T19 = ci[WS(rs, 6)];
+				   T1a = FMA(T14, T15, T18 * T19);
+				   T2V = FNMS(T18, T15, T14 * T19);
+			      }
+			      T1b = T11 + T1a;
+			      T3K = T2U + T2V;
+			      T2o = T11 - T1a;
+			      T2W = T2U - T2V;
+			 }
+			 TW = TO - TV;
+			 T1r = T1b - T1q;
+			 T1s = TW + T1r;
+			 T3J = T3H - T3I;
+			 T3M = T3K - T3L;
+			 T44 = T3J + T3M;
+			 T3U = T3H + T3I;
+			 T3V = T3K + T3L;
+			 T4n = T3U + T3V;
+			 T2c = TO + TV;
+			 T2d = T1b + T1q;
+			 T2e = T2c + T2d;
+			 T2n = T2j - T2m;
+			 T2s = T2o - T2r;
+			 T2t = T2n + T2s;
+			 T3d = T2Q - T2R;
+			 T3e = T2W + T2T;
+			 T4y = T3d + T3e;
+			 T3k = T2j + T2m;
+			 T3l = T2o + T2r;
+			 T3m = T3k + T3l;
+			 T2S = T2Q + T2R;
+			 T2X = T2T - T2W;
+			 T4J = T2X - T2S;
+		    }
+		    {
+			 E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z;
+			 T3y = KP559016994 * (T1s - T29);
+			 T2a = T1s + T29;
+			 T3x = FNMS(KP250000000, T2a, TF);
+			 T3G = T3C + T3F;
+			 T3N = T3J - T3M;
+			 T3O = FNMS(KP587785252, T3N, KP951056516 * T3G);
+			 T3Q = FMA(KP951056516, T3N, KP587785252 * T3G);
+			 ci[WS(rs, 9)] = TF + T2a;
+			 T3P = T3y + T3x;
+			 ci[WS(rs, 5)] = T3P - T3Q;
+			 cr[WS(rs, 6)] = T3P + T3Q;
+			 T3z = T3x - T3y;
+			 cr[WS(rs, 2)] = T3z - T3O;
+			 ci[WS(rs, 1)] = T3z + T3O;
+		    }
+		    {
+			 E T3q, T3s, T3t, T3j, T3w, T3f, T3i, T3v, T3u;
+			 T3q = KP559016994 * (T3m - T3p);
+			 T3s = T3m + T3p;
+			 T3t = FNMS(KP250000000, T3s, T3r);
+			 T3f = T3d - T3e;
+			 T3i = T3g - T3h;
+			 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
+			 T3w = FNMS(KP587785252, T3f, KP951056516 * T3i);
+			 cr[WS(rs, 5)] = T3r + T3s;
+			 T3v = T3t - T3q;
+			 ci[WS(rs, 2)] = T3v - T3w;
+			 ci[WS(rs, 6)] = T3w + T3v;
+			 T3u = T3q + T3t;
+			 cr[WS(rs, 1)] = T3j + T3u;
+			 cr[WS(rs, 9)] = T3u - T3j;
+		    }
+		    {
+			 E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T;
+			 T3R = KP559016994 * (T2e - T2h);
+			 T2i = T2e + T2h;
+			 T3S = FNMS(KP250000000, T2i, T2b);
+			 T3W = T3U - T3V;
+			 T3Z = T3X - T3Y;
+			 T40 = FMA(KP951056516, T3W, KP587785252 * T3Z);
+			 T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z);
+			 cr[0] = T2b + T2i;
+			 T41 = T3S - T3R;
+			 ci[WS(rs, 7)] = T41 - T42;
+			 cr[WS(rs, 8)] = T41 + T42;
+			 T3T = T3R + T3S;
+			 cr[WS(rs, 4)] = T3T - T40;
+			 ci[WS(rs, 3)] = T3T + T40;
+		    }
+		    {
+			 E T2F, T2L, T2M, T3a, T3b, T2Y, T39, T3c, T2N;
+			 T2F = KP559016994 * (T2t - T2E);
+			 T2L = T2t + T2E;
+			 T2M = FNMS(KP250000000, T2L, T2K);
+			 T2Y = T2S + T2X;
+			 T39 = T33 - T38;
+			 T3a = FMA(KP951056516, T2Y, KP587785252 * T39);
+			 T3b = FNMS(KP587785252, T2Y, KP951056516 * T39);
+			 ci[WS(rs, 4)] = T2K + T2L;
+			 T3c = T2M - T2F;
+			 cr[WS(rs, 3)] = T3b + T3c;
+			 cr[WS(rs, 7)] = T3c - T3b;
+			 T2N = T2F + T2M;
+			 ci[0] = T2N - T3a;
+			 ci[WS(rs, 8)] = T3a + T2N;
+		    }
+		    {
+			 E T4e, T45, T4f, T4d, T4h, T4b, T4c, T4i, T4g;
+			 T4e = KP559016994 * (T44 + T43);
+			 T45 = T43 - T44;
+			 T4f = FMA(KP250000000, T45, T4a);
+			 T4b = T1r - TW;
+			 T4c = T1N - T28;
+			 T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
+			 T4h = FMA(KP587785252, T4b, KP951056516 * T4c);
+			 cr[WS(rs, 10)] = T45 - T4a;
+			 T4i = T4f - T4e;
+			 cr[WS(rs, 18)] = T4h - T4i;
+			 ci[WS(rs, 17)] = T4h + T4i;
+			 T4g = T4e + T4f;
+			 cr[WS(rs, 14)] = T4d - T4g;
+			 ci[WS(rs, 13)] = T4d + T4g;
+		    }
+		    {
+			 E T4A, T4E, T4F, T4x, T4H, T4v, T4w, T4I, T4G;
+			 T4A = KP559016994 * (T4y - T4z);
+			 T4E = T4y + T4z;
+			 T4F = FNMS(KP250000000, T4E, T4D);
+			 T4v = T3n - T3o;
+			 T4w = T3k - T3l;
+			 T4x = FNMS(KP587785252, T4w, KP951056516 * T4v);
+			 T4H = FMA(KP951056516, T4w, KP587785252 * T4v);
+			 ci[WS(rs, 14)] = T4E + T4D;
+			 T4I = T4A + T4F;
+			 ci[WS(rs, 10)] = T4H + T4I;
+			 ci[WS(rs, 18)] = T4I - T4H;
+			 T4G = T4A - T4F;
+			 cr[WS(rs, 13)] = T4x + T4G;
+			 cr[WS(rs, 17)] = T4G - T4x;
+		    }
+		    {
+			 E T4r, T4p, T4q, T4l, T4t, T4j, T4k, T4u, T4s;
+			 T4r = KP559016994 * (T4n - T4o);
+			 T4p = T4n + T4o;
+			 T4q = FNMS(KP250000000, T4p, T4m);
+			 T4j = T2c - T2d;
+			 T4k = T2f - T2g;
+			 T4l = FNMS(KP951056516, T4k, KP587785252 * T4j);
+			 T4t = FMA(KP951056516, T4j, KP587785252 * T4k);
+			 ci[WS(rs, 19)] = T4p + T4m;
+			 T4u = T4r + T4q;
+			 cr[WS(rs, 16)] = T4t - T4u;
+			 ci[WS(rs, 15)] = T4t + T4u;
+			 T4s = T4q - T4r;
+			 cr[WS(rs, 12)] = T4l - T4s;
+			 ci[WS(rs, 11)] = T4l + T4s;
+		    }
+		    {
+			 E T4Q, T4L, T4R, T4P, T4T, T4N, T4O, T4U, T4S;
+			 T4Q = KP559016994 * (T4J + T4K);
+			 T4L = T4J - T4K;
+			 T4R = FMA(KP250000000, T4L, T4M);
+			 T4N = T2n - T2s;
+			 T4O = T2y - T2D;
+			 T4P = FMA(KP951056516, T4N, KP587785252 * T4O);
+			 T4T = FNMS(KP587785252, T4N, KP951056516 * T4O);
+			 cr[WS(rs, 15)] = T4L - T4M;
+			 T4U = T4Q + T4R;
+			 ci[WS(rs, 12)] = T4T + T4U;
+			 ci[WS(rs, 16)] = T4U - T4T;
+			 T4S = T4Q - T4R;
+			 cr[WS(rs, 11)] = T4P + T4S;
+			 cr[WS(rs, 19)] = T4S - T4P;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 19},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hf2_20", twinstr, &GENUS, {204, 92, 72, 0} };
+
+void X(codelet_hf2_20) (planner *p) {
+     X(khc2hc_register) (p, hf2_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1625 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:09 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 25 -dit -name hf2_25 -include hf.h */
+
+/*
+ * This function contains 440 FP additions, 434 FP multiplications,
+ * (or, 84 additions, 78 multiplications, 356 fused multiply/add),
+ * 215 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T7M, T6S, T6Q, T7S, T7Q, T7L, T6R, T6J, T7N, T7R;
+	       {
+		    E T2, T8, T3, T6, Tk, Tv, TS, T4, Ta, TD, T2L, T10, Tm, T5, Tc;
+		    T2 = W[0];
+		    T8 = W[4];
+		    T3 = W[2];
+		    T6 = W[3];
+		    Tk = W[6];
+		    Tv = T2 * T8;
+		    TS = T3 * T8;
+		    T4 = T2 * T3;
+		    Ta = T2 * T6;
+		    TD = T8 * Tk;
+		    T2L = T2 * Tk;
+		    T10 = T3 * Tk;
+		    Tm = W[7];
+		    T5 = W[1];
+		    Tc = W[5];
+		    {
+			 E T7u, T7U, T4s, T6a, T4g, TN, T4f, T7q, T8j, T7p, T4G, T6k, T3a, T4z, T6n;
+			 E T6m, T4w, T4a, T4D, T6j, T6C, T54, T6z, T5b, T1v, T3t, T6y, T58, T6B, T51;
+			 E T6v, T5j, T6s, T5q, T21, T3H, T6r, T5n, T6u, T5g, T26, T3K, T4N, T2A, T3U;
+			 E T4U, T2c, T3M, T2k, T3O;
+			 {
+			      E T11, T1b, Tb, T19, T7, T2m, TT, T15, T2Q, TX, T2p, T1g, T2a, T2e, T2i;
+			      E T27, T1c, T1O, T1K, T1q, T1m, T2x, T2t, T1W, T1S, T2G, T3Y, T2N, T4F, T38;
+			      E T48, T4y, T2K, T40, T2S, T41;
+			      {
+				   E T2M, T1j, T1l, T2X, T2U, T35, T31, T7l, T7n, T7m, T2O, T2R;
+				   {
+					E T1, Tj, T4j, TK, T4q, TC, T4o, Tt, T4l;
+					{
+					     E TE, Tw, TI, TA, Th, Tr, Tn, Td, Te, Ti, T14, T2P, TH, Tx, TB;
+					     T1 = cr[0];
+					     T11 = FMA(T6, Tm, T10);
+					     T14 = T3 * Tm;
+					     T2P = T2 * Tm;
+					     TH = T8 * Tm;
+					     T2M = FMA(T5, Tm, T2L);
+					     T1b = FNMS(T5, T3, Ta);
+					     Tb = FMA(T5, T3, Ta);
+					     T19 = FMA(T5, T6, T4);
+					     T7 = FNMS(T5, T6, T4);
+					     T2m = FNMS(T6, Tc, TS);
+					     TT = FMA(T6, Tc, TS);
+					     TE = FMA(Tc, Tm, TD);
+					     T1j = FMA(T5, Tc, Tv);
+					     Tw = FNMS(T5, Tc, Tv);
+					     {
+						  E TW, Tz, T1f, T2d;
+						  TW = T3 * Tc;
+						  Tz = T2 * Tc;
+						  T15 = FNMS(T6, Tk, T14);
+						  T2Q = FNMS(T5, Tk, T2P);
+						  TI = FNMS(Tc, Tk, TH);
+						  T1f = T19 * Tc;
+						  T2d = T19 * Tk;
+						  {
+						       E T2h, T1a, Tg, Tq;
+						       T2h = T19 * Tm;
+						       T1a = T19 * T8;
+						       Tg = T7 * Tc;
+						       Tq = T7 * Tm;
+						       {
+							    E Tl, T9, T1p, T1k;
+							    Tl = T7 * Tk;
+							    T9 = T7 * T8;
+							    T1p = T1j * Tm;
+							    T1k = T1j * Tk;
+							    {
+								 E T34, T30, T1N, T1J;
+								 T34 = TT * Tm;
+								 T30 = TT * Tk;
+								 T1N = Tw * Tm;
+								 T1J = Tw * Tk;
+								 TX = FNMS(T6, T8, TW);
+								 T2p = FMA(T6, T8, TW);
+								 TA = FMA(T5, T8, Tz);
+								 T1l = FNMS(T5, T8, Tz);
+								 T1g = FMA(T1b, T8, T1f);
+								 T2a = FNMS(T1b, T8, T1f);
+								 T2e = FMA(T1b, Tm, T2d);
+								 T2i = FNMS(T1b, Tk, T2h);
+								 T27 = FMA(T1b, Tc, T1a);
+								 T1c = FNMS(T1b, Tc, T1a);
+								 T2X = FMA(Tb, T8, Tg);
+								 Th = FNMS(Tb, T8, Tg);
+								 Tr = FNMS(Tb, Tk, Tq);
+								 Tn = FMA(Tb, Tm, Tl);
+								 Td = FMA(Tb, Tc, T9);
+								 T2U = FNMS(Tb, Tc, T9);
+								 T35 = FNMS(TX, Tk, T34);
+								 T31 = FMA(TX, Tm, T30);
+								 T1O = FNMS(TA, Tk, T1N);
+								 T1K = FMA(TA, Tm, T1J);
+								 T1q = FNMS(T1l, Tk, T1p);
+								 T1m = FMA(T1l, Tm, T1k);
+								 {
+								      E T2w, T2s, T1V, T1R;
+								      T2w = T27 * Tm;
+								      T2s = T27 * Tk;
+								      T1V = Td * Tm;
+								      T1R = Td * Tk;
+								      T2x = FNMS(T2a, Tk, T2w);
+								      T2t = FMA(T2a, Tm, T2s);
+								      T1W = FNMS(Th, Tk, T1V);
+								      T1S = FMA(Th, Tm, T1R);
+								      T7l = ci[0];
+								      Te = cr[WS(rs, 5)];
+								      Ti = ci[WS(rs, 5)];
+								 }
+							    }
+						       }
+						  }
+					     }
+					     {
+						  E TF, TJ, Tf, T4i, TG, T4p;
+						  TF = cr[WS(rs, 15)];
+						  TJ = ci[WS(rs, 15)];
+						  Tf = Td * Te;
+						  T4i = Td * Ti;
+						  TG = TE * TF;
+						  T4p = TE * TJ;
+						  Tj = FMA(Th, Ti, Tf);
+						  T4j = FNMS(Th, Te, T4i);
+						  TK = FMA(TI, TJ, TG);
+						  T4q = FNMS(TI, TF, T4p);
+					     }
+					     Tx = cr[WS(rs, 10)];
+					     TB = ci[WS(rs, 10)];
+					     {
+						  E To, Ts, Ty, T4n, Tp, T4k;
+						  To = cr[WS(rs, 20)];
+						  Ts = ci[WS(rs, 20)];
+						  Ty = Tw * Tx;
+						  T4n = Tw * TB;
+						  Tp = Tn * To;
+						  T4k = Tn * Ts;
+						  TC = FMA(TA, TB, Ty);
+						  T4o = FNMS(TA, Tx, T4n);
+						  Tt = FMA(Tr, Ts, Tp);
+						  T4l = FNMS(Tr, To, T4k);
+					     }
+					}
+					{
+					     E TL, T7s, T4r, Tu, T7t, T4m, TM;
+					     TL = TC + TK;
+					     T7s = TC - TK;
+					     T4r = T4o - T4q;
+					     T7n = T4o + T4q;
+					     Tu = Tj + Tt;
+					     T7t = Tj - Tt;
+					     T4m = T4j - T4l;
+					     T7m = T4j + T4l;
+					     T7u = FNMS(KP618033988, T7t, T7s);
+					     T7U = FMA(KP618033988, T7s, T7t);
+					     T4s = FMA(KP618033988, T4r, T4m);
+					     T6a = FNMS(KP618033988, T4m, T4r);
+					     T4g = Tu - TL;
+					     TM = Tu + TL;
+					     TN = T1 + TM;
+					     T4f = FNMS(KP250000000, TM, T1);
+					}
+				   }
+				   {
+					E T2D, T2F, T7o, T2E, T3X;
+					T2D = cr[WS(rs, 3)];
+					T2F = ci[WS(rs, 3)];
+					T7q = T7m - T7n;
+					T7o = T7m + T7n;
+					T2E = T3 * T2D;
+					T3X = T3 * T2F;
+					{
+					     E T2V, T2W, T2Y, T32, T36;
+					     T2V = cr[WS(rs, 13)];
+					     T8j = T7o + T7l;
+					     T7p = FNMS(KP250000000, T7o, T7l);
+					     T2G = FMA(T6, T2F, T2E);
+					     T3Y = FNMS(T6, T2D, T3X);
+					     T2W = T2U * T2V;
+					     T2Y = ci[WS(rs, 13)];
+					     T32 = cr[WS(rs, 18)];
+					     T36 = ci[WS(rs, 18)];
+					     {
+						  E T2H, T2I, T2J, T3Z;
+						  {
+						       E T2Z, T45, T37, T47, T44, T33, T46;
+						       T2H = cr[WS(rs, 8)];
+						       T2Z = FMA(T2X, T2Y, T2W);
+						       T44 = T2U * T2Y;
+						       T33 = T31 * T32;
+						       T46 = T31 * T36;
+						       T2I = T1j * T2H;
+						       T45 = FNMS(T2X, T2V, T44);
+						       T37 = FMA(T35, T36, T33);
+						       T47 = FNMS(T35, T32, T46);
+						       T2J = ci[WS(rs, 8)];
+						       T2N = cr[WS(rs, 23)];
+						       T4F = T2Z - T37;
+						       T38 = T2Z + T37;
+						       T48 = T45 + T47;
+						       T4y = T47 - T45;
+						       T3Z = T1j * T2J;
+						       T2O = T2M * T2N;
+						       T2R = ci[WS(rs, 23)];
+						  }
+						  T2K = FMA(T1l, T2J, T2I);
+						  T40 = FNMS(T1l, T2H, T3Z);
+					     }
+					}
+				   }
+				   T2S = FMA(T2Q, T2R, T2O);
+				   T41 = T2M * T2R;
+			      }
+			      {
+				   E TR, T3h, T1t, T53, T3r, T5a, TZ, T3j, T17, T3l;
+				   {
+					E T12, T16, T13, T3k;
+					{
+					     E TO, TP, T4C, T4B, TQ;
+					     {
+						  E T2T, T4E, T42, T4v, T39;
+						  TO = cr[WS(rs, 1)];
+						  T2T = T2K + T2S;
+						  T4E = T2K - T2S;
+						  T42 = FNMS(T2Q, T2N, T41);
+						  TP = T2 * TO;
+						  T4G = FMA(KP618033988, T4F, T4E);
+						  T6k = FNMS(KP618033988, T4E, T4F);
+						  T4v = T38 - T2T;
+						  T39 = T2T + T38;
+						  {
+						       E T43, T4x, T4u, T49;
+						       T43 = T40 + T42;
+						       T4x = T42 - T40;
+						       T4u = FNMS(KP250000000, T39, T2G);
+						       T3a = T2G + T39;
+						       T4z = FMA(KP618033988, T4y, T4x);
+						       T6n = FNMS(KP618033988, T4x, T4y);
+						       T4C = T48 - T43;
+						       T49 = T43 + T48;
+						       T6m = FMA(KP559016994, T4v, T4u);
+						       T4w = FNMS(KP559016994, T4v, T4u);
+						       T4B = FNMS(KP250000000, T49, T3Y);
+						       T4a = T3Y + T49;
+						       TQ = ci[WS(rs, 1)];
+						  }
+					     }
+					     {
+						  E T1n, T1r, T1i, T1o, T3o, T3p;
+						  {
+						       E T1d, T1h, T1e, T3n, T3g;
+						       T1d = cr[WS(rs, 11)];
+						       T1h = ci[WS(rs, 11)];
+						       T4D = FNMS(KP559016994, T4C, T4B);
+						       T6j = FMA(KP559016994, T4C, T4B);
+						       TR = FMA(T5, TQ, TP);
+						       T3g = T2 * TQ;
+						       T1e = T1c * T1d;
+						       T3n = T1c * T1h;
+						       T1n = cr[WS(rs, 16)];
+						       T3h = FNMS(T5, TO, T3g);
+						       T1r = ci[WS(rs, 16)];
+						       T1i = FMA(T1g, T1h, T1e);
+						       T1o = T1m * T1n;
+						       T3o = FNMS(T1g, T1d, T3n);
+						       T3p = T1m * T1r;
+						  }
+						  {
+						       E TU, TY, TV, T3i, T3q, T1s;
+						       TU = cr[WS(rs, 6)];
+						       T1s = FMA(T1q, T1r, T1o);
+						       TY = ci[WS(rs, 6)];
+						       T3q = FNMS(T1q, T1n, T3p);
+						       TV = TT * TU;
+						       T1t = T1i + T1s;
+						       T53 = T1s - T1i;
+						       T3i = TT * TY;
+						       T3r = T3o + T3q;
+						       T5a = T3q - T3o;
+						       T12 = cr[WS(rs, 21)];
+						       T16 = ci[WS(rs, 21)];
+						       TZ = FMA(TX, TY, TV);
+						       T3j = FNMS(TX, TU, T3i);
+						       T13 = T11 * T12;
+						       T3k = T11 * T16;
+						  }
+					     }
+					}
+					T17 = FMA(T15, T16, T13);
+					T3l = FNMS(T15, T12, T3k);
+				   }
+				   {
+					E T1z, T3v, T5i, T1Z, T3F, T5p, T1D, T3x, T1H, T3z;
+					{
+					     E T1E, T1G, T1F, T3y;
+					     {
+						  E T1w, T1y, T1x, T57, T50, T56, T4Z, T3u, T18, T52;
+						  T1w = cr[WS(rs, 4)];
+						  T1y = ci[WS(rs, 4)];
+						  T18 = TZ + T17;
+						  T52 = T17 - TZ;
+						  {
+						       E T3m, T59, T1u, T3s;
+						       T3m = T3j + T3l;
+						       T59 = T3j - T3l;
+						       T1x = T7 * T1w;
+						       T6C = FNMS(KP618033988, T52, T53);
+						       T54 = FMA(KP618033988, T53, T52);
+						       T1u = T18 + T1t;
+						       T57 = T18 - T1t;
+						       T6z = FMA(KP618033988, T59, T5a);
+						       T5b = FNMS(KP618033988, T5a, T59);
+						       T3s = T3m + T3r;
+						       T50 = T3m - T3r;
+						       T1v = TR + T1u;
+						       T56 = FNMS(KP250000000, T1u, TR);
+						       T3t = T3h + T3s;
+						       T4Z = FNMS(KP250000000, T3s, T3h);
+						       T3u = T7 * T1y;
+						  }
+						  T6y = FNMS(KP559016994, T57, T56);
+						  T58 = FMA(KP559016994, T57, T56);
+						  T6B = FNMS(KP559016994, T50, T4Z);
+						  T51 = FMA(KP559016994, T50, T4Z);
+						  T1z = FMA(Tb, T1y, T1x);
+						  T3v = FNMS(Tb, T1w, T3u);
+					     }
+					     {
+						  E T1Q, T3C, T1Y, T3E;
+						  {
+						       E T1L, T1P, T1T, T1X, T1M, T3B, T1U, T3D;
+						       T1L = cr[WS(rs, 14)];
+						       T1P = ci[WS(rs, 14)];
+						       T1T = cr[WS(rs, 19)];
+						       T1X = ci[WS(rs, 19)];
+						       T1M = T1K * T1L;
+						       T3B = T1K * T1P;
+						       T1U = T1S * T1T;
+						       T3D = T1S * T1X;
+						       T1Q = FMA(T1O, T1P, T1M);
+						       T3C = FNMS(T1O, T1L, T3B);
+						       T1Y = FMA(T1W, T1X, T1U);
+						       T3E = FNMS(T1W, T1T, T3D);
+						  }
+						  {
+						       E T1A, T1C, T1B, T3w;
+						       T1A = cr[WS(rs, 9)];
+						       T1C = ci[WS(rs, 9)];
+						       T5i = T1Y - T1Q;
+						       T1Z = T1Q + T1Y;
+						       T3F = T3C + T3E;
+						       T5p = T3E - T3C;
+						       T1B = T8 * T1A;
+						       T3w = T8 * T1C;
+						       T1E = cr[WS(rs, 24)];
+						       T1G = ci[WS(rs, 24)];
+						       T1D = FMA(Tc, T1C, T1B);
+						       T3x = FNMS(Tc, T1A, T3w);
+						       T1F = Tk * T1E;
+						       T3y = Tk * T1G;
+						  }
+					     }
+					     T1H = FMA(Tm, T1G, T1F);
+					     T3z = FNMS(Tm, T1E, T3y);
+					}
+					{
+					     E T2f, T2j, T2g, T3N;
+					     {
+						  E T23, T25, T24, T5m, T5f, T5l, T5e, T3J, T1I, T5h;
+						  T23 = cr[WS(rs, 2)];
+						  T25 = ci[WS(rs, 2)];
+						  T1I = T1D + T1H;
+						  T5h = T1H - T1D;
+						  {
+						       E T3A, T5o, T20, T3G;
+						       T3A = T3x + T3z;
+						       T5o = T3z - T3x;
+						       T24 = T19 * T23;
+						       T6v = FNMS(KP618033988, T5h, T5i);
+						       T5j = FMA(KP618033988, T5i, T5h);
+						       T20 = T1I + T1Z;
+						       T5m = T1I - T1Z;
+						       T6s = FNMS(KP618033988, T5o, T5p);
+						       T5q = FMA(KP618033988, T5p, T5o);
+						       T3G = T3A + T3F;
+						       T5f = T3F - T3A;
+						       T21 = T1z + T20;
+						       T5l = FNMS(KP250000000, T20, T1z);
+						       T3H = T3v + T3G;
+						       T5e = FNMS(KP250000000, T3G, T3v);
+						       T3J = T19 * T25;
+						  }
+						  T6r = FNMS(KP559016994, T5m, T5l);
+						  T5n = FMA(KP559016994, T5m, T5l);
+						  T6u = FMA(KP559016994, T5f, T5e);
+						  T5g = FNMS(KP559016994, T5f, T5e);
+						  T26 = FMA(T1b, T25, T24);
+						  T3K = FNMS(T1b, T23, T3J);
+					     }
+					     {
+						  E T2r, T3R, T2z, T3T;
+						  {
+						       E T2n, T2q, T2u, T2y, T2o, T3Q, T2v, T3S;
+						       T2n = cr[WS(rs, 12)];
+						       T2q = ci[WS(rs, 12)];
+						       T2u = cr[WS(rs, 17)];
+						       T2y = ci[WS(rs, 17)];
+						       T2o = T2m * T2n;
+						       T3Q = T2m * T2q;
+						       T2v = T2t * T2u;
+						       T3S = T2t * T2y;
+						       T2r = FMA(T2p, T2q, T2o);
+						       T3R = FNMS(T2p, T2n, T3Q);
+						       T2z = FMA(T2x, T2y, T2v);
+						       T3T = FNMS(T2x, T2u, T3S);
+						  }
+						  {
+						       E T28, T2b, T29, T3L;
+						       T28 = cr[WS(rs, 7)];
+						       T2b = ci[WS(rs, 7)];
+						       T4N = T2z - T2r;
+						       T2A = T2r + T2z;
+						       T3U = T3R + T3T;
+						       T4U = T3R - T3T;
+						       T29 = T27 * T28;
+						       T3L = T27 * T2b;
+						       T2f = cr[WS(rs, 22)];
+						       T2j = ci[WS(rs, 22)];
+						       T2c = FMA(T2a, T2b, T29);
+						       T3M = FNMS(T2a, T28, T3L);
+						       T2g = T2e * T2f;
+						       T3N = T2e * T2j;
+						  }
+					     }
+					     T2k = FMA(T2i, T2j, T2g);
+					     T3O = FNMS(T2i, T2f, T3N);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T8k, T6d, T6g, T8r, T6f, T8l, T6c, T8q, T69, T7r, T5Y, T8g, T8i, T66, T68;
+			      E T5X, T8d, T8h;
+			      {
+				   E T4O, T4V, T22, T4S, T4L, T3b, T4e, T4c, T3I;
+				   T8k = T3t + T3H;
+				   T3I = T3t - T3H;
+				   {
+					E T2l, T4M, T3P, T4T;
+					T2l = T2c + T2k;
+					T4M = T2k - T2c;
+					T3P = T3M + T3O;
+					T4T = T3O - T3M;
+					T4O = FMA(KP618033988, T4N, T4M);
+					T6d = FNMS(KP618033988, T4M, T4N);
+					{
+					     E T4R, T2B, T4K, T3V;
+					     T4R = T2A - T2l;
+					     T2B = T2l + T2A;
+					     T4V = FNMS(KP618033988, T4U, T4T);
+					     T6g = FMA(KP618033988, T4T, T4U);
+					     T4K = T3U - T3P;
+					     T3V = T3P + T3U;
+					     {
+						  E T4Q, T2C, T4J, T3W, T4b;
+						  T4Q = FNMS(KP250000000, T2B, T26);
+						  T2C = T26 + T2B;
+						  T4J = FNMS(KP250000000, T3V, T3K);
+						  T3W = T3K + T3V;
+						  T8r = T21 - T1v;
+						  T22 = T1v + T21;
+						  T4S = FNMS(KP559016994, T4R, T4Q);
+						  T6f = FMA(KP559016994, T4R, T4Q);
+						  T4b = T3W - T4a;
+						  T8l = T3W + T4a;
+						  T6c = FMA(KP559016994, T4K, T4J);
+						  T4L = FNMS(KP559016994, T4K, T4J);
+						  T8q = T2C - T3a;
+						  T3b = T2C + T3a;
+						  T4e = FNMS(KP618033988, T3I, T4b);
+						  T4c = FMA(KP618033988, T4b, T3I);
+					     }
+					}
+				   }
+				   {
+					E T5H, T4t, T7V, T87, T5Q, T5P, T5D, T8e, T5A, T8f, T5K, T60, T8c, T8a, T5u;
+					E T5w, T5U, T64, T5N, T61;
+					{
+					     E T3e, T3d, T4h, T3c, T7T;
+					     T4h = FMA(KP559016994, T4g, T4f);
+					     T69 = FNMS(KP559016994, T4g, T4f);
+					     T3c = T22 + T3b;
+					     T3e = T22 - T3b;
+					     T7r = FNMS(KP559016994, T7q, T7p);
+					     T7T = FMA(KP559016994, T7q, T7p);
+					     T5H = FMA(KP951056516, T4s, T4h);
+					     T4t = FNMS(KP951056516, T4s, T4h);
+					     cr[0] = TN + T3c;
+					     T3d = FNMS(KP250000000, T3c, TN);
+					     T7V = FNMS(KP951056516, T7U, T7T);
+					     T87 = FMA(KP951056516, T7U, T7T);
+					     {
+						  E T5S, T5T, T5L, T4I, T5B, T5M, T55, T5J, T5s, T5z, T4X, T5C, T5I, T5c;
+						  {
+						       E T5k, T5r, T4P, T4W;
+						       {
+							    E T4A, T4d, T3f, T4H;
+							    T4A = FMA(KP951056516, T4z, T4w);
+							    T5S = FNMS(KP951056516, T4z, T4w);
+							    T4d = FNMS(KP559016994, T3e, T3d);
+							    T3f = FMA(KP559016994, T3e, T3d);
+							    T5T = FNMS(KP951056516, T4G, T4D);
+							    T4H = FMA(KP951056516, T4G, T4D);
+							    T5k = FNMS(KP951056516, T5j, T5g);
+							    T5L = FMA(KP951056516, T5j, T5g);
+							    cr[WS(rs, 5)] = FMA(KP951056516, T4c, T3f);
+							    ci[WS(rs, 4)] = FNMS(KP951056516, T4c, T3f);
+							    ci[WS(rs, 9)] = FMA(KP951056516, T4e, T4d);
+							    cr[WS(rs, 10)] = FNMS(KP951056516, T4e, T4d);
+							    T4I = FNMS(KP126329378, T4H, T4A);
+							    T5B = FMA(KP126329378, T4A, T4H);
+							    T5M = FNMS(KP951056516, T5q, T5n);
+							    T5r = FMA(KP951056516, T5q, T5n);
+						       }
+						       T4P = FNMS(KP951056516, T4O, T4L);
+						       T5Q = FMA(KP951056516, T4O, T4L);
+						       T5P = FNMS(KP951056516, T4V, T4S);
+						       T4W = FMA(KP951056516, T4V, T4S);
+						       T55 = FNMS(KP951056516, T54, T51);
+						       T5J = FMA(KP951056516, T54, T51);
+						       T5s = FMA(KP827271945, T5r, T5k);
+						       T5z = FNMS(KP827271945, T5k, T5r);
+						       T4X = FNMS(KP470564281, T4W, T4P);
+						       T5C = FMA(KP470564281, T4P, T4W);
+						       T5I = FMA(KP951056516, T5b, T58);
+						       T5c = FNMS(KP951056516, T5b, T58);
+						  }
+						  {
+						       E T88, T4Y, T5d, T5y, T89, T5t;
+						       T5D = FNMS(KP912018591, T5C, T5B);
+						       T88 = FMA(KP912018591, T5C, T5B);
+						       T8e = FMA(KP912018591, T4X, T4I);
+						       T4Y = FNMS(KP912018591, T4X, T4I);
+						       T5d = FMA(KP634619297, T5c, T55);
+						       T5y = FNMS(KP634619297, T55, T5c);
+						       T5A = FMA(KP912575812, T5z, T5y);
+						       T89 = FNMS(KP912575812, T5z, T5y);
+						       T8f = FMA(KP912575812, T5s, T5d);
+						       T5t = FNMS(KP912575812, T5s, T5d);
+						       T5K = FMA(KP256756360, T5J, T5I);
+						       T60 = FNMS(KP256756360, T5I, T5J);
+						       T8c = FNMS(KP851038619, T89, T88);
+						       T8a = FMA(KP851038619, T89, T88);
+						       T5u = FNMS(KP851038619, T5t, T4Y);
+						       T5w = FMA(KP851038619, T5t, T4Y);
+						  }
+						  T5U = FMA(KP939062505, T5T, T5S);
+						  T64 = FNMS(KP939062505, T5S, T5T);
+						  T5N = FMA(KP634619297, T5M, T5L);
+						  T61 = FNMS(KP634619297, T5L, T5M);
+					     }
+					}
+					{
+					     E T62, T7W, T83, T5O, T5R, T63;
+					     cr[WS(rs, 4)] = FNMS(KP992114701, T5u, T4t);
+					     T62 = FMA(KP871714437, T61, T60);
+					     T7W = FNMS(KP871714437, T61, T60);
+					     T83 = FNMS(KP871714437, T5N, T5K);
+					     T5O = FMA(KP871714437, T5N, T5K);
+					     T5R = FMA(KP549754652, T5Q, T5P);
+					     T63 = FNMS(KP549754652, T5P, T5Q);
+					     ci[WS(rs, 20)] = FNMS(KP992114701, T8a, T87);
+					     {
+						  E T65, T5W, T84, T86, T81, T85, T8b;
+						  {
+						       E T5E, T5G, T82, T80, T7Y, T5v, T7X, T5V, T5F, T5x, T7Z;
+						       T5E = FNMS(KP726211448, T5D, T5A);
+						       T5G = FMA(KP525970792, T5A, T5D);
+						       T65 = FNMS(KP831864738, T64, T63);
+						       T7X = FMA(KP831864738, T64, T63);
+						       T82 = FNMS(KP831864738, T5U, T5R);
+						       T5V = FMA(KP831864738, T5U, T5R);
+						       T80 = FNMS(KP904730450, T7X, T7W);
+						       T7Y = FMA(KP904730450, T7X, T7W);
+						       T5Y = FNMS(KP904730450, T5V, T5O);
+						       T5W = FMA(KP904730450, T5V, T5O);
+						       T5v = FMA(KP248028675, T5u, T4t);
+						       ci[WS(rs, 23)] = FMA(KP968583161, T7Y, T7V);
+						       cr[WS(rs, 1)] = FMA(KP968583161, T5W, T5H);
+						       T84 = FNMS(KP683113946, T83, T82);
+						       T86 = FMA(KP559154169, T82, T83);
+						       T5F = FNMS(KP554608978, T5w, T5v);
+						       T5x = FMA(KP554608978, T5w, T5v);
+						       T7Z = FNMS(KP242145790, T7Y, T7V);
+						       ci[WS(rs, 10)] = FNMS(KP943557151, T5G, T5F);
+						       ci[WS(rs, 5)] = FMA(KP943557151, T5G, T5F);
+						       ci[0] = FMA(KP803003575, T5E, T5x);
+						       cr[WS(rs, 9)] = FNMS(KP803003575, T5E, T5x);
+						       T81 = FNMS(KP541454447, T80, T7Z);
+						       T85 = FMA(KP541454447, T80, T7Z);
+						  }
+						  T8g = FNMS(KP525970792, T8f, T8e);
+						  T8i = FMA(KP726211448, T8e, T8f);
+						  ci[WS(rs, 13)] = FMA(KP833417178, T84, T81);
+						  cr[WS(rs, 16)] = FMS(KP833417178, T84, T81);
+						  cr[WS(rs, 21)] = -(FMA(KP921177326, T86, T85));
+						  ci[WS(rs, 18)] = FNMS(KP921177326, T86, T85);
+						  T8b = FMA(KP248028675, T8a, T87);
+						  T66 = FMA(KP559154169, T65, T62);
+						  T68 = FNMS(KP683113946, T62, T65);
+						  T5X = FNMS(KP242145790, T5W, T5H);
+						  T8d = FNMS(KP554608978, T8c, T8b);
+						  T8h = FMA(KP554608978, T8c, T8b);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T8s, T8u, T5Z, T67;
+				   cr[WS(rs, 24)] = -(FMA(KP803003575, T8i, T8h));
+				   ci[WS(rs, 15)] = FNMS(KP803003575, T8i, T8h);
+				   cr[WS(rs, 19)] = FMS(KP943557151, T8g, T8d);
+				   cr[WS(rs, 14)] = -(FMA(KP943557151, T8g, T8d));
+				   T5Z = FMA(KP541454447, T5Y, T5X);
+				   T67 = FNMS(KP541454447, T5Y, T5X);
+				   cr[WS(rs, 11)] = FNMS(KP833417178, T68, T67);
+				   ci[WS(rs, 8)] = FMA(KP833417178, T68, T67);
+				   cr[WS(rs, 6)] = FMA(KP921177326, T66, T5Z);
+				   ci[WS(rs, 3)] = FNMS(KP921177326, T66, T5Z);
+				   T8s = FMA(KP618033988, T8r, T8q);
+				   T8u = FNMS(KP618033988, T8q, T8r);
+				   {
+					E T6X, T6T, T6b, T7H, T7v, T6Y, T72, T71, T6P, T7O, T6M, T7P, T7K, T6G, T6I;
+					E T6W, T7f, T7d, T76;
+					{
+					     E T74, T75, T6i, T6N, T6L, T6E, T6U, T6l, T6o, T6V, T6t, T6w;
+					     {
+						  E T6e, T8o, T8n, T6h, T8m;
+						  T6X = FNMS(KP951056516, T6d, T6c);
+						  T6e = FMA(KP951056516, T6d, T6c);
+						  T8o = T8k - T8l;
+						  T8m = T8k + T8l;
+						  T6T = FNMS(KP951056516, T6a, T69);
+						  T6b = FMA(KP951056516, T6a, T69);
+						  T7H = FNMS(KP951056516, T7u, T7r);
+						  T7v = FMA(KP951056516, T7u, T7r);
+						  ci[WS(rs, 24)] = T8m + T8j;
+						  T8n = FNMS(KP250000000, T8m, T8j);
+						  T6h = FMA(KP951056516, T6g, T6f);
+						  T6Y = FNMS(KP951056516, T6g, T6f);
+						  {
+						       E T6A, T6D, T8t, T8p;
+						       T74 = FMA(KP951056516, T6z, T6y);
+						       T6A = FNMS(KP951056516, T6z, T6y);
+						       T6D = FMA(KP951056516, T6C, T6B);
+						       T75 = FNMS(KP951056516, T6C, T6B);
+						       T8t = FMA(KP559016994, T8o, T8n);
+						       T8p = FNMS(KP559016994, T8o, T8n);
+						       T6i = FMA(KP062914667, T6h, T6e);
+						       T6N = FNMS(KP062914667, T6e, T6h);
+						       ci[WS(rs, 14)] = FMA(KP951056516, T8s, T8p);
+						       cr[WS(rs, 15)] = FMS(KP951056516, T8s, T8p);
+						       ci[WS(rs, 19)] = FMA(KP951056516, T8u, T8t);
+						       cr[WS(rs, 20)] = FMS(KP951056516, T8u, T8t);
+						       T6L = FNMS(KP939062505, T6A, T6D);
+						       T6E = FMA(KP939062505, T6D, T6A);
+						  }
+					     }
+					     T6U = FMA(KP951056516, T6k, T6j);
+					     T6l = FNMS(KP951056516, T6k, T6j);
+					     T6o = FNMS(KP951056516, T6n, T6m);
+					     T6V = FMA(KP951056516, T6n, T6m);
+					     T72 = FMA(KP951056516, T6s, T6r);
+					     T6t = FNMS(KP951056516, T6s, T6r);
+					     T6w = FMA(KP951056516, T6v, T6u);
+					     T71 = FNMS(KP951056516, T6v, T6u);
+					     {
+						  E T6q, T6F, T6O, T6p;
+						  T6O = FMA(KP827271945, T6l, T6o);
+						  T6p = FNMS(KP827271945, T6o, T6l);
+						  {
+						       E T6K, T6x, T7I, T7J;
+						       T6K = FMA(KP126329378, T6t, T6w);
+						       T6x = FNMS(KP126329378, T6w, T6t);
+						       T7I = FMA(KP772036680, T6O, T6N);
+						       T6P = FNMS(KP772036680, T6O, T6N);
+						       T6q = FMA(KP772036680, T6p, T6i);
+						       T7O = FNMS(KP772036680, T6p, T6i);
+						       T7J = FNMS(KP734762448, T6L, T6K);
+						       T6M = FMA(KP734762448, T6L, T6K);
+						       T6F = FNMS(KP734762448, T6E, T6x);
+						       T7P = FMA(KP734762448, T6E, T6x);
+						       T7K = FMA(KP994076283, T7J, T7I);
+						       T7M = FNMS(KP994076283, T7J, T7I);
+						  }
+						  T6G = FNMS(KP994076283, T6F, T6q);
+						  T6I = FMA(KP994076283, T6F, T6q);
+					     }
+					     T6W = FMA(KP062914667, T6V, T6U);
+					     T7f = FNMS(KP062914667, T6U, T6V);
+					     T7d = FNMS(KP549754652, T74, T75);
+					     T76 = FMA(KP549754652, T75, T74);
+					}
+					{
+					     E T7h, T7C, T7e, T7D, T7y, T7A, T78, T7a;
+					     {
+						  E T70, T77, T7g, T6Z;
+						  cr[WS(rs, 3)] = FMA(KP998026728, T6G, T6b);
+						  T7g = FNMS(KP634619297, T6X, T6Y);
+						  T6Z = FMA(KP634619297, T6Y, T6X);
+						  {
+						       E T7c, T73, T7w, T7x;
+						       T7c = FMA(KP470564281, T71, T72);
+						       T73 = FNMS(KP470564281, T72, T71);
+						       T7w = FMA(KP845997307, T7g, T7f);
+						       T7h = FNMS(KP845997307, T7g, T7f);
+						       T70 = FMA(KP845997307, T6Z, T6W);
+						       T7C = FNMS(KP845997307, T6Z, T6W);
+						       T7x = FNMS(KP968479752, T7d, T7c);
+						       T7e = FMA(KP968479752, T7d, T7c);
+						       T77 = FMA(KP968479752, T76, T73);
+						       T7D = FNMS(KP968479752, T76, T73);
+						       T7y = FMA(KP906616052, T7x, T7w);
+						       T7A = FNMS(KP906616052, T7x, T7w);
+						  }
+						  ci[WS(rs, 21)] = FNMS(KP998026728, T7K, T7H);
+						  T78 = FMA(KP906616052, T77, T70);
+						  T7a = FNMS(KP906616052, T77, T70);
+					     }
+					     {
+						  E T7G, T7E, T7k, T7i, T79, T7F, T7B, T7z, T6H, T7j, T7b;
+						  T7G = FMA(KP681693190, T7C, T7D);
+						  T7E = FNMS(KP560319534, T7D, T7C);
+						  ci[WS(rs, 22)] = FNMS(KP998026728, T7y, T7v);
+						  cr[WS(rs, 2)] = FMA(KP998026728, T78, T6T);
+						  T7z = FMA(KP249506682, T7y, T7v);
+						  T7k = FNMS(KP560319534, T7e, T7h);
+						  T7i = FMA(KP681693190, T7h, T7e);
+						  T79 = FNMS(KP249506682, T78, T6T);
+						  T7F = FMA(KP557913902, T7A, T7z);
+						  T7B = FNMS(KP557913902, T7A, T7z);
+						  T6S = FMA(KP614372930, T6M, T6P);
+						  T6Q = FNMS(KP621716863, T6P, T6M);
+						  cr[WS(rs, 22)] = FMS(KP860541664, T7G, T7F);
+						  ci[WS(rs, 17)] = FMA(KP860541664, T7G, T7F);
+						  ci[WS(rs, 12)] = FNMS(KP949179823, T7E, T7B);
+						  cr[WS(rs, 17)] = -(FMA(KP949179823, T7E, T7B));
+						  T7j = FMA(KP557913902, T7a, T79);
+						  T7b = FNMS(KP557913902, T7a, T79);
+						  T6H = FNMS(KP249506682, T6G, T6b);
+						  ci[WS(rs, 7)] = FMA(KP949179823, T7k, T7j);
+						  cr[WS(rs, 12)] = FNMS(KP949179823, T7k, T7j);
+						  cr[WS(rs, 7)] = FMA(KP860541664, T7i, T7b);
+						  ci[WS(rs, 2)] = FNMS(KP860541664, T7i, T7b);
+						  T7S = FMA(KP621716863, T7O, T7P);
+						  T7Q = FNMS(KP614372930, T7P, T7O);
+						  T7L = FMA(KP249506682, T7K, T7H);
+						  T6R = FMA(KP557913902, T6I, T6H);
+						  T6J = FNMS(KP557913902, T6I, T6H);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 6)] = FNMS(KP949179823, T6S, T6R);
+	       ci[WS(rs, 11)] = FMA(KP949179823, T6S, T6R);
+	       cr[WS(rs, 8)] = FMA(KP943557151, T6Q, T6J);
+	       ci[WS(rs, 1)] = FNMS(KP943557151, T6Q, T6J);
+	       T7N = FNMS(KP557913902, T7M, T7L);
+	       T7R = FMA(KP557913902, T7M, T7L);
+	       cr[WS(rs, 23)] = -(FMA(KP943557151, T7S, T7R));
+	       ci[WS(rs, 16)] = FNMS(KP943557151, T7S, T7R);
+	       cr[WS(rs, 18)] = FMS(KP949179823, T7Q, T7N);
+	       cr[WS(rs, 13)] = -(FMA(KP949179823, T7Q, T7N));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hf2_25", twinstr, &GENUS, {84, 78, 356, 0} };
+
+void X(codelet_hf2_25) (planner *p) {
+     X(khc2hc_register) (p, hf2_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 25 -dit -name hf2_25 -include hf.h */
+
+/*
+ * This function contains 440 FP additions, 340 FP multiplications,
+ * (or, 280 additions, 180 multiplications, 160 fused multiply/add),
+ * 149 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T2, T5, T3, T6, T8, Td, T16, T14, Te, T9, T21, T23, Tx, TR, T1g;
+	       E TB, T1f, TV, T1Q, Tg, T1S, Tk, T18, T2s, T1c, T2q, Tn, To, Tp, Tr;
+	       E T28, T2x, TY, T2k, T2m, T2v, TG, TE, T10, T1h, T1E, T26, T1B, T1G, T1V;
+	       E T1X, T1z, T1j;
+	       {
+		    E Tw, TT, Tz, TQ, Tv, TU, TA, TP;
+		    {
+			 E T4, Tc, T7, Tb;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 T4 = T2 * T3;
+			 Tc = T5 * T3;
+			 T7 = T5 * T6;
+			 Tb = T2 * T6;
+			 T8 = T4 - T7;
+			 Td = Tb + Tc;
+			 T16 = Tb - Tc;
+			 T14 = T4 + T7;
+			 Te = W[5];
+			 Tw = T5 * Te;
+			 TT = T3 * Te;
+			 Tz = T2 * Te;
+			 TQ = T6 * Te;
+			 T9 = W[4];
+			 Tv = T2 * T9;
+			 TU = T6 * T9;
+			 TA = T5 * T9;
+			 TP = T3 * T9;
+		    }
+		    T21 = TP - TQ;
+		    T23 = TT + TU;
+		    {
+			 E T15, T17, Ta, Tf, T1a, T1b, Ti, Tj;
+			 Tx = Tv - Tw;
+			 TR = TP + TQ;
+			 T1g = Tz - TA;
+			 TB = Tz + TA;
+			 T1f = Tv + Tw;
+			 TV = TT - TU;
+			 T15 = T14 * T9;
+			 T17 = T16 * Te;
+			 T1Q = T15 + T17;
+			 Ta = T8 * T9;
+			 Tf = Td * Te;
+			 Tg = Ta + Tf;
+			 T1a = T14 * Te;
+			 T1b = T16 * T9;
+			 T1S = T1a - T1b;
+			 Ti = T8 * Te;
+			 Tj = Td * T9;
+			 Tk = Ti - Tj;
+			 T18 = T15 - T17;
+			 T2s = Ti + Tj;
+			 T1c = T1a + T1b;
+			 T2q = Ta - Tf;
+			 Tn = W[6];
+			 To = W[7];
+			 Tp = FMA(T8, Tn, Td * To);
+			 Tr = FNMS(Td, Tn, T8 * To);
+			 T28 = FNMS(T1S, Tn, T1Q * To);
+			 T2x = FNMS(TV, Tn, TR * To);
+			 TY = FMA(T3, Tn, T6 * To);
+			 T2k = FMA(T2, Tn, T5 * To);
+			 T2m = FNMS(T5, Tn, T2 * To);
+			 T2v = FMA(TR, Tn, TV * To);
+			 TG = FNMS(Te, Tn, T9 * To);
+			 TE = FMA(T9, Tn, Te * To);
+			 T10 = FNMS(T6, Tn, T3 * To);
+			 T1h = FMA(T1f, Tn, T1g * To);
+			 T1E = FMA(Tg, Tn, Tk * To);
+			 T26 = FMA(T1Q, Tn, T1S * To);
+			 T1B = FNMS(TB, Tn, Tx * To);
+			 T1G = FNMS(Tk, Tn, Tg * To);
+			 T1V = FMA(T14, Tn, T16 * To);
+			 T1X = FNMS(T16, Tn, T14 * To);
+			 T1z = FMA(Tx, Tn, TB * To);
+			 T1j = FNMS(T1g, Tn, T1f * To);
+		    }
+	       }
+	       {
+		    E T1, T6v, T2F, T6A, TK, T2G, T6y, T6z, T6u, T71, T2O, T52, T2C, T6k, T4c;
+		    E T5X, T4L, T5s, T4j, T5W, T4K, T5v, T1o, T6g, T30, T5M, T4A, T56, T3b, T5N;
+		    E T4B, T59, T1L, T6h, T3r, T5P, T4E, T5d, T3y, T5Q, T4D, T5g, T2d, T6j, T3P;
+		    E T5U, T4I, T5o, T3W, T5T, T4H, T5l;
+		    {
+			 E Tm, T2I, Tt, T2J, Tu, T6w, TD, T2L, TI, T2M, TJ, T6x;
+			 T1 = cr[0];
+			 T6v = ci[0];
+			 {
+			      E Th, Tl, Tq, Ts;
+			      Th = cr[WS(rs, 5)];
+			      Tl = ci[WS(rs, 5)];
+			      Tm = FMA(Tg, Th, Tk * Tl);
+			      T2I = FNMS(Tk, Th, Tg * Tl);
+			      Tq = cr[WS(rs, 20)];
+			      Ts = ci[WS(rs, 20)];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T2J = FNMS(Tr, Tq, Tp * Ts);
+			 }
+			 Tu = Tm + Tt;
+			 T6w = T2I + T2J;
+			 {
+			      E Ty, TC, TF, TH;
+			      Ty = cr[WS(rs, 10)];
+			      TC = ci[WS(rs, 10)];
+			      TD = FMA(Tx, Ty, TB * TC);
+			      T2L = FNMS(TB, Ty, Tx * TC);
+			      TF = cr[WS(rs, 15)];
+			      TH = ci[WS(rs, 15)];
+			      TI = FMA(TE, TF, TG * TH);
+			      T2M = FNMS(TG, TF, TE * TH);
+			 }
+			 TJ = TD + TI;
+			 T6x = T2L + T2M;
+			 T2F = KP559016994 * (Tu - TJ);
+			 T6A = KP559016994 * (T6w - T6x);
+			 TK = Tu + TJ;
+			 T2G = FNMS(KP250000000, TK, T1);
+			 T6y = T6w + T6x;
+			 T6z = FNMS(KP250000000, T6y, T6v);
+			 {
+			      E T6s, T6t, T2K, T2N;
+			      T6s = TD - TI;
+			      T6t = Tm - Tt;
+			      T6u = FNMS(KP587785252, T6t, KP951056516 * T6s);
+			      T71 = FMA(KP951056516, T6t, KP587785252 * T6s);
+			      T2K = T2I - T2J;
+			      T2N = T2L - T2M;
+			      T2O = FMA(KP951056516, T2K, KP587785252 * T2N);
+			      T52 = FNMS(KP587785252, T2K, KP951056516 * T2N);
+			 }
+		    }
+		    {
+			 E T2g, T48, T3Y, T3Z, T4h, T4g, T43, T46, T49, T2p, T2A, T2B, T2e, T2f;
+			 T2e = cr[WS(rs, 3)];
+			 T2f = ci[WS(rs, 3)];
+			 T2g = FMA(T3, T2e, T6 * T2f);
+			 T48 = FNMS(T6, T2e, T3 * T2f);
+			 {
+			      E T2j, T41, T2z, T45, T2o, T42, T2u, T44;
+			      {
+				   E T2h, T2i, T2w, T2y;
+				   T2h = cr[WS(rs, 8)];
+				   T2i = ci[WS(rs, 8)];
+				   T2j = FMA(T1f, T2h, T1g * T2i);
+				   T41 = FNMS(T1g, T2h, T1f * T2i);
+				   T2w = cr[WS(rs, 18)];
+				   T2y = ci[WS(rs, 18)];
+				   T2z = FMA(T2v, T2w, T2x * T2y);
+				   T45 = FNMS(T2x, T2w, T2v * T2y);
+			      }
+			      {
+				   E T2l, T2n, T2r, T2t;
+				   T2l = cr[WS(rs, 23)];
+				   T2n = ci[WS(rs, 23)];
+				   T2o = FMA(T2k, T2l, T2m * T2n);
+				   T42 = FNMS(T2m, T2l, T2k * T2n);
+				   T2r = cr[WS(rs, 13)];
+				   T2t = ci[WS(rs, 13)];
+				   T2u = FMA(T2q, T2r, T2s * T2t);
+				   T44 = FNMS(T2s, T2r, T2q * T2t);
+			      }
+			      T3Y = T2j - T2o;
+			      T3Z = T2u - T2z;
+			      T4h = T44 - T45;
+			      T4g = T41 - T42;
+			      T43 = T41 + T42;
+			      T46 = T44 + T45;
+			      T49 = T43 + T46;
+			      T2p = T2j + T2o;
+			      T2A = T2u + T2z;
+			      T2B = T2p + T2A;
+			 }
+			 T2C = T2g + T2B;
+			 T6k = T48 + T49;
+			 {
+			      E T40, T5r, T4b, T5q, T47, T4a;
+			      T40 = FMA(KP951056516, T3Y, KP587785252 * T3Z);
+			      T5r = FNMS(KP587785252, T3Y, KP951056516 * T3Z);
+			      T47 = KP559016994 * (T43 - T46);
+			      T4a = FNMS(KP250000000, T49, T48);
+			      T4b = T47 + T4a;
+			      T5q = T4a - T47;
+			      T4c = T40 + T4b;
+			      T5X = T5r + T5q;
+			      T4L = T4b - T40;
+			      T5s = T5q - T5r;
+			 }
+			 {
+			      E T4i, T5u, T4f, T5t, T4d, T4e;
+			      T4i = FMA(KP951056516, T4g, KP587785252 * T4h);
+			      T5u = FNMS(KP587785252, T4g, KP951056516 * T4h);
+			      T4d = KP559016994 * (T2p - T2A);
+			      T4e = FNMS(KP250000000, T2B, T2g);
+			      T4f = T4d + T4e;
+			      T5t = T4e - T4d;
+			      T4j = T4f - T4i;
+			      T5W = T5t - T5u;
+			      T4K = T4f + T4i;
+			      T5v = T5t + T5u;
+			 }
+		    }
+		    {
+			 E TO, T37, T2V, T2Y, T32, T31, T34, T35, T38, T13, T1m, T1n, TM, TN;
+			 TM = cr[WS(rs, 1)];
+			 TN = ci[WS(rs, 1)];
+			 TO = FMA(T2, TM, T5 * TN);
+			 T37 = FNMS(T5, TM, T2 * TN);
+			 {
+			      E TX, T2T, T1l, T2X, T12, T2U, T1e, T2W;
+			      {
+				   E TS, TW, T1i, T1k;
+				   TS = cr[WS(rs, 6)];
+				   TW = ci[WS(rs, 6)];
+				   TX = FMA(TR, TS, TV * TW);
+				   T2T = FNMS(TV, TS, TR * TW);
+				   T1i = cr[WS(rs, 16)];
+				   T1k = ci[WS(rs, 16)];
+				   T1l = FMA(T1h, T1i, T1j * T1k);
+				   T2X = FNMS(T1j, T1i, T1h * T1k);
+			      }
+			      {
+				   E TZ, T11, T19, T1d;
+				   TZ = cr[WS(rs, 21)];
+				   T11 = ci[WS(rs, 21)];
+				   T12 = FMA(TY, TZ, T10 * T11);
+				   T2U = FNMS(T10, TZ, TY * T11);
+				   T19 = cr[WS(rs, 11)];
+				   T1d = ci[WS(rs, 11)];
+				   T1e = FMA(T18, T19, T1c * T1d);
+				   T2W = FNMS(T1c, T19, T18 * T1d);
+			      }
+			      T2V = T2T - T2U;
+			      T2Y = T2W - T2X;
+			      T32 = T1e - T1l;
+			      T31 = TX - T12;
+			      T34 = T2T + T2U;
+			      T35 = T2W + T2X;
+			      T38 = T34 + T35;
+			      T13 = TX + T12;
+			      T1m = T1e + T1l;
+			      T1n = T13 + T1m;
+			 }
+			 T1o = TO + T1n;
+			 T6g = T37 + T38;
+			 {
+			      E T2Z, T55, T2S, T54, T2Q, T2R;
+			      T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
+			      T55 = FNMS(KP587785252, T2V, KP951056516 * T2Y);
+			      T2Q = KP559016994 * (T13 - T1m);
+			      T2R = FNMS(KP250000000, T1n, TO);
+			      T2S = T2Q + T2R;
+			      T54 = T2R - T2Q;
+			      T30 = T2S - T2Z;
+			      T5M = T54 - T55;
+			      T4A = T2S + T2Z;
+			      T56 = T54 + T55;
+			 }
+			 {
+			      E T33, T58, T3a, T57, T36, T39;
+			      T33 = FMA(KP951056516, T31, KP587785252 * T32);
+			      T58 = FNMS(KP587785252, T31, KP951056516 * T32);
+			      T36 = KP559016994 * (T34 - T35);
+			      T39 = FNMS(KP250000000, T38, T37);
+			      T3a = T36 + T39;
+			      T57 = T39 - T36;
+			      T3b = T33 + T3a;
+			      T5N = T58 + T57;
+			      T4B = T3a - T33;
+			      T59 = T57 - T58;
+			 }
+		    }
+		    {
+			 E T1r, T3n, T3d, T3e, T3w, T3v, T3i, T3l, T3o, T1y, T1J, T1K, T1p, T1q;
+			 T1p = cr[WS(rs, 4)];
+			 T1q = ci[WS(rs, 4)];
+			 T1r = FMA(T8, T1p, Td * T1q);
+			 T3n = FNMS(Td, T1p, T8 * T1q);
+			 {
+			      E T1u, T3g, T1I, T3k, T1x, T3h, T1D, T3j;
+			      {
+				   E T1s, T1t, T1F, T1H;
+				   T1s = cr[WS(rs, 9)];
+				   T1t = ci[WS(rs, 9)];
+				   T1u = FMA(T9, T1s, Te * T1t);
+				   T3g = FNMS(Te, T1s, T9 * T1t);
+				   T1F = cr[WS(rs, 19)];
+				   T1H = ci[WS(rs, 19)];
+				   T1I = FMA(T1E, T1F, T1G * T1H);
+				   T3k = FNMS(T1G, T1F, T1E * T1H);
+			      }
+			      {
+				   E T1v, T1w, T1A, T1C;
+				   T1v = cr[WS(rs, 24)];
+				   T1w = ci[WS(rs, 24)];
+				   T1x = FMA(Tn, T1v, To * T1w);
+				   T3h = FNMS(To, T1v, Tn * T1w);
+				   T1A = cr[WS(rs, 14)];
+				   T1C = ci[WS(rs, 14)];
+				   T1D = FMA(T1z, T1A, T1B * T1C);
+				   T3j = FNMS(T1B, T1A, T1z * T1C);
+			      }
+			      T3d = T1x - T1u;
+			      T3e = T1D - T1I;
+			      T3w = T3j - T3k;
+			      T3v = T3g - T3h;
+			      T3i = T3g + T3h;
+			      T3l = T3j + T3k;
+			      T3o = T3i + T3l;
+			      T1y = T1u + T1x;
+			      T1J = T1D + T1I;
+			      T1K = T1y + T1J;
+			 }
+			 T1L = T1r + T1K;
+			 T6h = T3n + T3o;
+			 {
+			      E T3f, T5c, T3q, T5b, T3m, T3p;
+			      T3f = FNMS(KP587785252, T3e, KP951056516 * T3d);
+			      T5c = FMA(KP587785252, T3d, KP951056516 * T3e);
+			      T3m = KP559016994 * (T3i - T3l);
+			      T3p = FNMS(KP250000000, T3o, T3n);
+			      T3q = T3m + T3p;
+			      T5b = T3p - T3m;
+			      T3r = T3f - T3q;
+			      T5P = T5c + T5b;
+			      T4E = T3f + T3q;
+			      T5d = T5b - T5c;
+			 }
+			 {
+			      E T3x, T5f, T3u, T5e, T3s, T3t;
+			      T3x = FMA(KP951056516, T3v, KP587785252 * T3w);
+			      T5f = FNMS(KP587785252, T3v, KP951056516 * T3w);
+			      T3s = KP559016994 * (T1y - T1J);
+			      T3t = FNMS(KP250000000, T1K, T1r);
+			      T3u = T3s + T3t;
+			      T5e = T3t - T3s;
+			      T3y = T3u - T3x;
+			      T5Q = T5e - T5f;
+			      T4D = T3u + T3x;
+			      T5g = T5e + T5f;
+			 }
+		    }
+		    {
+			 E T1P, T3L, T3B, T3C, T3U, T3T, T3G, T3J, T3M, T20, T2b, T2c, T1N, T1O;
+			 T1N = cr[WS(rs, 2)];
+			 T1O = ci[WS(rs, 2)];
+			 T1P = FMA(T14, T1N, T16 * T1O);
+			 T3L = FNMS(T16, T1N, T14 * T1O);
+			 {
+			      E T1U, T3E, T2a, T3I, T1Z, T3F, T25, T3H;
+			      {
+				   E T1R, T1T, T27, T29;
+				   T1R = cr[WS(rs, 7)];
+				   T1T = ci[WS(rs, 7)];
+				   T1U = FMA(T1Q, T1R, T1S * T1T);
+				   T3E = FNMS(T1S, T1R, T1Q * T1T);
+				   T27 = cr[WS(rs, 17)];
+				   T29 = ci[WS(rs, 17)];
+				   T2a = FMA(T26, T27, T28 * T29);
+				   T3I = FNMS(T28, T27, T26 * T29);
+			      }
+			      {
+				   E T1W, T1Y, T22, T24;
+				   T1W = cr[WS(rs, 22)];
+				   T1Y = ci[WS(rs, 22)];
+				   T1Z = FMA(T1V, T1W, T1X * T1Y);
+				   T3F = FNMS(T1X, T1W, T1V * T1Y);
+				   T22 = cr[WS(rs, 12)];
+				   T24 = ci[WS(rs, 12)];
+				   T25 = FMA(T21, T22, T23 * T24);
+				   T3H = FNMS(T23, T22, T21 * T24);
+			      }
+			      T3B = T1U - T1Z;
+			      T3C = T25 - T2a;
+			      T3U = T3H - T3I;
+			      T3T = T3E - T3F;
+			      T3G = T3E + T3F;
+			      T3J = T3H + T3I;
+			      T3M = T3G + T3J;
+			      T20 = T1U + T1Z;
+			      T2b = T25 + T2a;
+			      T2c = T20 + T2b;
+			 }
+			 T2d = T1P + T2c;
+			 T6j = T3L + T3M;
+			 {
+			      E T3D, T5n, T3O, T5m, T3K, T3N;
+			      T3D = FMA(KP951056516, T3B, KP587785252 * T3C);
+			      T5n = FNMS(KP587785252, T3B, KP951056516 * T3C);
+			      T3K = KP559016994 * (T3G - T3J);
+			      T3N = FNMS(KP250000000, T3M, T3L);
+			      T3O = T3K + T3N;
+			      T5m = T3N - T3K;
+			      T3P = T3D + T3O;
+			      T5U = T5n + T5m;
+			      T4I = T3O - T3D;
+			      T5o = T5m - T5n;
+			 }
+			 {
+			      E T3V, T5k, T3S, T5j, T3Q, T3R;
+			      T3V = FMA(KP951056516, T3T, KP587785252 * T3U);
+			      T5k = FNMS(KP587785252, T3T, KP951056516 * T3U);
+			      T3Q = KP559016994 * (T20 - T2b);
+			      T3R = FNMS(KP250000000, T2c, T1P);
+			      T3S = T3Q + T3R;
+			      T5j = T3R - T3Q;
+			      T3W = T3S - T3V;
+			      T5T = T5j - T5k;
+			      T4H = T3S + T3V;
+			      T5l = T5j + T5k;
+			 }
+		    }
+		    {
+			 E T6m, T6o, TL, T2E, T6d, T6e, T6n, T6f;
+			 {
+			      E T6i, T6l, T1M, T2D;
+			      T6i = T6g - T6h;
+			      T6l = T6j - T6k;
+			      T6m = FMA(KP951056516, T6i, KP587785252 * T6l);
+			      T6o = FNMS(KP587785252, T6i, KP951056516 * T6l);
+			      TL = T1 + TK;
+			      T1M = T1o + T1L;
+			      T2D = T2d + T2C;
+			      T2E = T1M + T2D;
+			      T6d = KP559016994 * (T1M - T2D);
+			      T6e = FNMS(KP250000000, T2E, TL);
+			 }
+			 cr[0] = TL + T2E;
+			 T6n = T6e - T6d;
+			 cr[WS(rs, 10)] = T6n - T6o;
+			 ci[WS(rs, 9)] = T6n + T6o;
+			 T6f = T6d + T6e;
+			 ci[WS(rs, 4)] = T6f - T6m;
+			 cr[WS(rs, 5)] = T6f + T6m;
+		    }
+		    {
+			 E T2P, T4z, T72, T7e, T4m, T7j, T4n, T7i, T4U, T77, T4X, T75, T4O, T6Y, T4P;
+			 E T6X, T4s, T7f, T4v, T7d, T2H, T70;
+			 T2H = T2F + T2G;
+			 T2P = T2H - T2O;
+			 T4z = T2H + T2O;
+			 T70 = T6A + T6z;
+			 T72 = T70 - T71;
+			 T7e = T71 + T70;
+			 {
+			      E T3c, T3z, T3A, T3X, T4k, T4l;
+			      T3c = FMA(KP535826794, T30, KP844327925 * T3b);
+			      T3z = FNMS(KP637423989, T3y, KP770513242 * T3r);
+			      T3A = T3c + T3z;
+			      T3X = FNMS(KP425779291, T3W, KP904827052 * T3P);
+			      T4k = FNMS(KP992114701, T4j, KP125333233 * T4c);
+			      T4l = T3X + T4k;
+			      T4m = T3A + T4l;
+			      T7j = T3X - T4k;
+			      T4n = KP559016994 * (T3A - T4l);
+			      T7i = T3z - T3c;
+			 }
+			 {
+			      E T4S, T4T, T73, T4V, T4W, T74;
+			      T4S = FNMS(KP248689887, T4A, KP968583161 * T4B);
+			      T4T = FNMS(KP844327925, T4D, KP535826794 * T4E);
+			      T73 = T4S + T4T;
+			      T4V = FNMS(KP481753674, T4H, KP876306680 * T4I);
+			      T4W = FNMS(KP684547105, T4K, KP728968627 * T4L);
+			      T74 = T4V + T4W;
+			      T4U = T4S - T4T;
+			      T77 = KP559016994 * (T73 - T74);
+			      T4X = T4V - T4W;
+			      T75 = T73 + T74;
+			 }
+			 {
+			      E T4C, T4F, T4G, T4J, T4M, T4N;
+			      T4C = FMA(KP968583161, T4A, KP248689887 * T4B);
+			      T4F = FMA(KP535826794, T4D, KP844327925 * T4E);
+			      T4G = T4C + T4F;
+			      T4J = FMA(KP876306680, T4H, KP481753674 * T4I);
+			      T4M = FMA(KP728968627, T4K, KP684547105 * T4L);
+			      T4N = T4J + T4M;
+			      T4O = T4G + T4N;
+			      T6Y = T4J - T4M;
+			      T4P = KP559016994 * (T4G - T4N);
+			      T6X = T4F - T4C;
+			 }
+			 {
+			      E T4q, T4r, T7b, T4t, T4u, T7c;
+			      T4q = FNMS(KP844327925, T30, KP535826794 * T3b);
+			      T4r = FMA(KP770513242, T3y, KP637423989 * T3r);
+			      T7b = T4q + T4r;
+			      T4t = FMA(KP125333233, T4j, KP992114701 * T4c);
+			      T4u = FMA(KP904827052, T3W, KP425779291 * T3P);
+			      T7c = T4u + T4t;
+			      T4s = T4q - T4r;
+			      T7f = T7b - T7c;
+			      T4v = T4t - T4u;
+			      T7d = KP559016994 * (T7b + T7c);
+			 }
+			 cr[WS(rs, 4)] = T2P + T4m;
+			 ci[WS(rs, 23)] = T75 + T72;
+			 ci[WS(rs, 20)] = T7f + T7e;
+			 cr[WS(rs, 1)] = T4z + T4O;
+			 {
+			      E T4w, T4y, T4p, T4x, T4o;
+			      T4w = FMA(KP951056516, T4s, KP587785252 * T4v);
+			      T4y = FNMS(KP587785252, T4s, KP951056516 * T4v);
+			      T4o = FNMS(KP250000000, T4m, T2P);
+			      T4p = T4n + T4o;
+			      T4x = T4o - T4n;
+			      ci[0] = T4p - T4w;
+			      ci[WS(rs, 5)] = T4x + T4y;
+			      cr[WS(rs, 9)] = T4p + T4w;
+			      ci[WS(rs, 10)] = T4x - T4y;
+			 }
+			 {
+			      E T6Z, T79, T78, T7a, T76;
+			      T6Z = FMA(KP587785252, T6X, KP951056516 * T6Y);
+			      T79 = FNMS(KP587785252, T6Y, KP951056516 * T6X);
+			      T76 = FNMS(KP250000000, T75, T72);
+			      T78 = T76 - T77;
+			      T7a = T77 + T76;
+			      cr[WS(rs, 16)] = T6Z - T78;
+			      ci[WS(rs, 18)] = T79 + T7a;
+			      ci[WS(rs, 13)] = T6Z + T78;
+			      cr[WS(rs, 21)] = T79 - T7a;
+			 }
+			 {
+			      E T7k, T7l, T7h, T7m, T7g;
+			      T7k = FMA(KP587785252, T7i, KP951056516 * T7j);
+			      T7l = FNMS(KP587785252, T7j, KP951056516 * T7i);
+			      T7g = FNMS(KP250000000, T7f, T7e);
+			      T7h = T7d - T7g;
+			      T7m = T7d + T7g;
+			      cr[WS(rs, 14)] = T7h - T7k;
+			      ci[WS(rs, 15)] = T7l + T7m;
+			      cr[WS(rs, 19)] = T7k + T7h;
+			      cr[WS(rs, 24)] = T7l - T7m;
+			 }
+			 {
+			      E T4Y, T50, T4R, T4Z, T4Q;
+			      T4Y = FMA(KP951056516, T4U, KP587785252 * T4X);
+			      T50 = FNMS(KP587785252, T4U, KP951056516 * T4X);
+			      T4Q = FNMS(KP250000000, T4O, T4z);
+			      T4R = T4P + T4Q;
+			      T4Z = T4Q - T4P;
+			      ci[WS(rs, 3)] = T4R - T4Y;
+			      ci[WS(rs, 8)] = T4Z + T50;
+			      cr[WS(rs, 6)] = T4R + T4Y;
+			      cr[WS(rs, 11)] = T4Z - T50;
+			 }
+		    }
+		    {
+			 E T7p, T7x, T7q, T7t, T7u, T7v, T7y, T7w;
+			 {
+			      E T7n, T7o, T7r, T7s;
+			      T7n = T1L - T1o;
+			      T7o = T2d - T2C;
+			      T7p = FMA(KP587785252, T7n, KP951056516 * T7o);
+			      T7x = FNMS(KP587785252, T7o, KP951056516 * T7n);
+			      T7q = T6y + T6v;
+			      T7r = T6g + T6h;
+			      T7s = T6j + T6k;
+			      T7t = T7r + T7s;
+			      T7u = FNMS(KP250000000, T7t, T7q);
+			      T7v = KP559016994 * (T7r - T7s);
+			 }
+			 ci[WS(rs, 24)] = T7t + T7q;
+			 T7y = T7v + T7u;
+			 cr[WS(rs, 20)] = T7x - T7y;
+			 ci[WS(rs, 19)] = T7x + T7y;
+			 T7w = T7u - T7v;
+			 cr[WS(rs, 15)] = T7p - T7w;
+			 ci[WS(rs, 14)] = T7p + T7w;
+		    }
+		    {
+			 E T53, T5L, T6C, T6O, T5y, T6T, T5z, T6S, T66, T6H, T69, T6F, T60, T6q, T61;
+			 E T6p, T5E, T6P, T5H, T6N, T51, T6B;
+			 T51 = T2G - T2F;
+			 T53 = T51 + T52;
+			 T5L = T51 - T52;
+			 T6B = T6z - T6A;
+			 T6C = T6u + T6B;
+			 T6O = T6B - T6u;
+			 {
+			      E T5a, T5h, T5i, T5p, T5w, T5x;
+			      T5a = FMA(KP728968627, T56, KP684547105 * T59);
+			      T5h = FNMS(KP992114701, T5g, KP125333233 * T5d);
+			      T5i = T5a + T5h;
+			      T5p = FMA(KP062790519, T5l, KP998026728 * T5o);
+			      T5w = FNMS(KP637423989, T5v, KP770513242 * T5s);
+			      T5x = T5p + T5w;
+			      T5y = T5i + T5x;
+			      T6T = T5p - T5w;
+			      T5z = KP559016994 * (T5i - T5x);
+			      T6S = T5h - T5a;
+			 }
+			 {
+			      E T64, T65, T6D, T67, T68, T6E;
+			      T64 = FNMS(KP481753674, T5M, KP876306680 * T5N);
+			      T65 = FMA(KP904827052, T5Q, KP425779291 * T5P);
+			      T6D = T64 - T65;
+			      T67 = FNMS(KP844327925, T5T, KP535826794 * T5U);
+			      T68 = FNMS(KP998026728, T5W, KP062790519 * T5X);
+			      T6E = T67 + T68;
+			      T66 = T64 + T65;
+			      T6H = KP559016994 * (T6D - T6E);
+			      T69 = T67 - T68;
+			      T6F = T6D + T6E;
+			 }
+			 {
+			      E T5O, T5R, T5S, T5V, T5Y, T5Z;
+			      T5O = FMA(KP876306680, T5M, KP481753674 * T5N);
+			      T5R = FNMS(KP425779291, T5Q, KP904827052 * T5P);
+			      T5S = T5O + T5R;
+			      T5V = FMA(KP535826794, T5T, KP844327925 * T5U);
+			      T5Y = FMA(KP062790519, T5W, KP998026728 * T5X);
+			      T5Z = T5V + T5Y;
+			      T60 = T5S + T5Z;
+			      T6q = T5V - T5Y;
+			      T61 = KP559016994 * (T5S - T5Z);
+			      T6p = T5R - T5O;
+			 }
+			 {
+			      E T5C, T5D, T6L, T5F, T5G, T6M;
+			      T5C = FNMS(KP684547105, T56, KP728968627 * T59);
+			      T5D = FMA(KP125333233, T5g, KP992114701 * T5d);
+			      T6L = T5C - T5D;
+			      T5F = FNMS(KP998026728, T5l, KP062790519 * T5o);
+			      T5G = FMA(KP770513242, T5v, KP637423989 * T5s);
+			      T6M = T5F - T5G;
+			      T5E = T5C + T5D;
+			      T6P = T6L + T6M;
+			      T5H = T5F + T5G;
+			      T6N = KP559016994 * (T6L - T6M);
+			 }
+			 cr[WS(rs, 3)] = T53 + T5y;
+			 ci[WS(rs, 22)] = T6F + T6C;
+			 ci[WS(rs, 21)] = T6P + T6O;
+			 cr[WS(rs, 2)] = T5L + T60;
+			 {
+			      E T6r, T6J, T6I, T6K, T6G;
+			      T6r = FMA(KP587785252, T6p, KP951056516 * T6q);
+			      T6J = FNMS(KP587785252, T6q, KP951056516 * T6p);
+			      T6G = FNMS(KP250000000, T6F, T6C);
+			      T6I = T6G - T6H;
+			      T6K = T6H + T6G;
+			      cr[WS(rs, 17)] = T6r - T6I;
+			      ci[WS(rs, 17)] = T6J + T6K;
+			      ci[WS(rs, 12)] = T6r + T6I;
+			      cr[WS(rs, 22)] = T6J - T6K;
+			 }
+			 {
+			      E T6a, T6c, T63, T6b, T62;
+			      T6a = FMA(KP951056516, T66, KP587785252 * T69);
+			      T6c = FNMS(KP587785252, T66, KP951056516 * T69);
+			      T62 = FNMS(KP250000000, T60, T5L);
+			      T63 = T61 + T62;
+			      T6b = T62 - T61;
+			      ci[WS(rs, 2)] = T63 - T6a;
+			      ci[WS(rs, 7)] = T6b + T6c;
+			      cr[WS(rs, 7)] = T63 + T6a;
+			      cr[WS(rs, 12)] = T6b - T6c;
+			 }
+			 {
+			      E T5I, T5K, T5B, T5J, T5A;
+			      T5I = FMA(KP951056516, T5E, KP587785252 * T5H);
+			      T5K = FNMS(KP587785252, T5E, KP951056516 * T5H);
+			      T5A = FNMS(KP250000000, T5y, T53);
+			      T5B = T5z + T5A;
+			      T5J = T5A - T5z;
+			      ci[WS(rs, 1)] = T5B - T5I;
+			      ci[WS(rs, 6)] = T5J + T5K;
+			      cr[WS(rs, 8)] = T5B + T5I;
+			      ci[WS(rs, 11)] = T5J - T5K;
+			 }
+			 {
+			      E T6U, T6V, T6R, T6W, T6Q;
+			      T6U = FMA(KP587785252, T6S, KP951056516 * T6T);
+			      T6V = FNMS(KP587785252, T6T, KP951056516 * T6S);
+			      T6Q = FNMS(KP250000000, T6P, T6O);
+			      T6R = T6N - T6Q;
+			      T6W = T6N + T6Q;
+			      cr[WS(rs, 13)] = T6R - T6U;
+			      ci[WS(rs, 16)] = T6V + T6W;
+			      cr[WS(rs, 18)] = T6U + T6R;
+			      cr[WS(rs, 23)] = T6V - T6W;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 24},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hf2_25", twinstr, &GENUS, {280, 180, 160, 0} };
+
+void X(codelet_hf2_25) (planner *p) {
+     X(khc2hc_register) (p, hf2_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1842 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:04 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hf2_32 -include hf.h */
+
+/*
+ * This function contains 488 FP additions, 350 FP multiplications,
+ * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
+ * 181 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T7d, T7a;
+	       {
+		    E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc;
+		    T2 = W[0];
+		    T8 = W[4];
+		    T3 = W[2];
+		    T6 = W[3];
+		    Te = W[6];
+		    Tr = T2 * T8;
+		    T18 = T3 * T8;
+		    T4 = T2 * T3;
+		    Ta = T2 * T6;
+		    Tz = T3 * Te;
+		    T1n = T8 * Te;
+		    T10 = T2 * Te;
+		    Ti = W[7];
+		    T5 = W[1];
+		    Tc = W[5];
+		    {
+			 E T34, T31, T2X, T2T, Tq, T46, T8H, T98, TH, T97, T4b, T8D, TZ, T7g, T4j;
+			 E T6t, T1g, T7f, T4q, T6u, T4z, T6y, T1J, T7j, T7m, T8e, T6x, T4G, T2k, T7o;
+			 E T7r, T8d, T6B, T4O, T6A, T4V, T6P, T61, T7G, T3G, T6M, T5E, T8n, T7N, T6I;
+			 E T5s, T7v, T2N, T6F, T55, T8i, T7C, T5L, T63, T43, T7O, T5S, T62, T7J, T8o;
+			 E T2U, T2R, T2V, T58, T3a, T5h, T2Y, T32, T35;
+			 {
+			      E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J;
+			      E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c;
+			      {
+				   E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW;
+				   E TS, Ty, T48, TG, T4a;
+				   {
+					E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14;
+					T1 = cr[0];
+					TA = FMA(T6, Ti, Tz);
+					T1K = FNMS(T6, Ti, Tz);
+					T14 = T2 * Ti;
+					{
+					     E T1r, TD, T1c, Tv;
+					     T1r = T8 * Ti;
+					     TD = T3 * Ti;
+					     T11 = FNMS(T5, Ti, T10);
+					     T1C = FMA(T5, Ti, T10);
+					     TM = FMA(T5, T3, Ta);
+					     Tb = FNMS(T5, T3, Ta);
+					     TJ = FNMS(T5, T6, T4);
+					     T7 = FMA(T5, T6, T4);
+					     T1o = FMA(Tc, Ti, T1n);
+					     T23 = FMA(T6, Tc, T18);
+					     T19 = FNMS(T6, Tc, T18);
+					     T1w = FNMS(T5, Tc, Tr);
+					     Ts = FMA(T5, Tc, Tr);
+					     T1c = T3 * Tc;
+					     Tv = T2 * Tc;
+					     T1F = FNMS(T5, Te, T14);
+					     T15 = FMA(T5, Te, T14);
+					     T1s = FNMS(Tc, Te, T1r);
+					     T1N = FMA(T6, Te, TD);
+					     TE = FNMS(T6, Te, TD);
+					     {
+						  E T1T, T3i, T3e, T1Q;
+						  T1T = TJ * Tc;
+						  T3i = TJ * Ti;
+						  T3e = TJ * Te;
+						  T1Q = TJ * T8;
+						  {
+						       E Tg, T2I, T2E, T9;
+						       Tg = T7 * Tc;
+						       T2I = T7 * Ti;
+						       T2E = T7 * Te;
+						       T9 = T7 * T8;
+						       {
+							    E T3q, T3m, T2v, T2r;
+							    T3q = T19 * Ti;
+							    T3m = T19 * Te;
+							    T2v = T1w * Ti;
+							    T2r = T1w * Te;
+							    {
+								 E T2W, T2S, T3P, T3L;
+								 T2W = T23 * Ti;
+								 T2S = T23 * Te;
+								 T3P = Ts * Ti;
+								 T3L = Ts * Te;
+								 T26 = FNMS(T6, T8, T1c);
+								 T1d = FMA(T6, T8, T1c);
+								 T1z = FMA(T5, T8, Tv);
+								 Tw = FNMS(T5, T8, Tv);
+								 T2b = FNMS(TM, T8, T1T);
+								 T1U = FMA(TM, T8, T1T);
+								 T3C = FNMS(TM, Te, T3i);
+								 T3j = FMA(TM, Te, T3i);
+								 T3z = FMA(TM, Ti, T3e);
+								 T3f = FNMS(TM, Ti, T3e);
+								 T1R = FNMS(TM, Tc, T1Q);
+								 T29 = FMA(TM, Tc, T1Q);
+								 TR = FNMS(Tb, T8, Tg);
+								 Th = FMA(Tb, T8, Tg);
+								 T34 = FMA(Tb, Te, T2I);
+								 T2J = FNMS(Tb, Te, T2I);
+								 T31 = FNMS(Tb, Ti, T2E);
+								 T2F = FMA(Tb, Ti, T2E);
+								 Td = FNMS(Tb, Tc, T9);
+								 TP = FMA(Tb, Tc, T9);
+								 T2X = FNMS(T26, Te, T2W);
+								 T2T = FMA(T26, Ti, T2S);
+								 T3r = FNMS(T1d, Te, T3q);
+								 T3n = FMA(T1d, Ti, T3m);
+								 T2w = FNMS(T1z, Te, T2v);
+								 T2s = FMA(T1z, Ti, T2r);
+								 T3Q = FNMS(Tw, Te, T3P);
+								 T3M = FMA(Tw, Ti, T3L);
+								 {
+								      E T1Y, T1S, T2f, T2a;
+								      T1Y = T1R * Ti;
+								      T1S = T1R * Te;
+								      T2f = T29 * Ti;
+								      T2a = T29 * Te;
+								      {
+									   E Tm, Tf, TV, TQ;
+									   Tm = Td * Ti;
+									   Tf = Td * Te;
+									   TV = TP * Ti;
+									   TQ = TP * Te;
+									   T1Z = FNMS(T1U, Te, T1Y);
+									   T1V = FMA(T1U, Ti, T1S);
+									   T2g = FNMS(T2b, Te, T2f);
+									   T2c = FMA(T2b, Ti, T2a);
+									   Tn = FNMS(Th, Te, Tm);
+									   Tj = FMA(Th, Ti, Tf);
+									   TW = FNMS(TR, Te, TV);
+									   TS = FMA(TR, Ti, TQ);
+									   T8G = ci[0];
+								      }
+								 }
+							    }
+						       }
+						  }
+					     }
+					}
+					Tk = cr[WS(rs, 16)];
+					To = ci[WS(rs, 16)];
+					{
+					     E Tt, Tx, Tu, T47, TB, TF, TC, T49;
+					     {
+						  E Tl, T8E, Tp, T8F;
+						  Tt = cr[WS(rs, 8)];
+						  Tx = ci[WS(rs, 8)];
+						  Tl = Tj * Tk;
+						  T8E = Tj * To;
+						  Tu = Ts * Tt;
+						  T47 = Ts * Tx;
+						  Tp = FMA(Tn, To, Tl);
+						  T8F = FNMS(Tn, Tk, T8E);
+						  TB = cr[WS(rs, 24)];
+						  TF = ci[WS(rs, 24)];
+						  Tq = T1 + Tp;
+						  T46 = T1 - Tp;
+						  T8H = T8F + T8G;
+						  T98 = T8G - T8F;
+						  TC = TA * TB;
+						  T49 = TA * TF;
+					     }
+					     Ty = FMA(Tw, Tx, Tu);
+					     T48 = FNMS(Tw, Tt, T47);
+					     TG = FMA(TE, TF, TC);
+					     T4a = FNMS(TE, TB, T49);
+					}
+				   }
+				   {
+					E TT, TX, TO, T4f, TU, T4g;
+					{
+					     E TK, TN, TL, T4e;
+					     TK = cr[WS(rs, 4)];
+					     TN = ci[WS(rs, 4)];
+					     TH = Ty + TG;
+					     T97 = Ty - TG;
+					     T4b = T48 - T4a;
+					     T8D = T48 + T4a;
+					     TL = TJ * TK;
+					     T4e = TJ * TN;
+					     TT = cr[WS(rs, 20)];
+					     TX = ci[WS(rs, 20)];
+					     TO = FMA(TM, TN, TL);
+					     T4f = FNMS(TM, TK, T4e);
+					     TU = TS * TT;
+					     T4g = TS * TX;
+					}
+					{
+					     E T17, T4m, T1a, T1e, T4d, T4i;
+					     {
+						  E T12, T16, TY, T4h, T13, T4l;
+						  T12 = cr[WS(rs, 28)];
+						  T16 = ci[WS(rs, 28)];
+						  TY = FMA(TW, TX, TU);
+						  T4h = FNMS(TW, TT, T4g);
+						  T13 = T11 * T12;
+						  T4l = T11 * T16;
+						  TZ = TO + TY;
+						  T4d = TO - TY;
+						  T7g = T4f + T4h;
+						  T4i = T4f - T4h;
+						  T17 = FMA(T15, T16, T13);
+						  T4m = FNMS(T15, T12, T4l);
+					     }
+					     T4j = T4d - T4i;
+					     T6t = T4d + T4i;
+					     T1a = cr[WS(rs, 12)];
+					     T1e = ci[WS(rs, 12)];
+					     {
+						  E T1m, T4u, T1H, T4E, T1x, T1A, T1u, T4w, T1y, T4B;
+						  {
+						       E T1D, T1G, T1E, T4D;
+						       {
+							    E T1f, T4o, T4k, T4p;
+							    {
+								 E T1j, T1l, T1b, T4n, T1k, T4t;
+								 T1j = cr[WS(rs, 2)];
+								 T1l = ci[WS(rs, 2)];
+								 T1b = T19 * T1a;
+								 T4n = T19 * T1e;
+								 T1k = T7 * T1j;
+								 T4t = T7 * T1l;
+								 T1f = FMA(T1d, T1e, T1b);
+								 T4o = FNMS(T1d, T1a, T4n);
+								 T1m = FMA(Tb, T1l, T1k);
+								 T4u = FNMS(Tb, T1j, T4t);
+							    }
+							    T1g = T17 + T1f;
+							    T4k = T17 - T1f;
+							    T7f = T4m + T4o;
+							    T4p = T4m - T4o;
+							    T1D = cr[WS(rs, 26)];
+							    T1G = ci[WS(rs, 26)];
+							    T4q = T4k + T4p;
+							    T6u = T4k - T4p;
+							    T1E = T1C * T1D;
+							    T4D = T1C * T1G;
+						       }
+						       {
+							    E T1p, T1t, T1q, T4v;
+							    T1p = cr[WS(rs, 18)];
+							    T1t = ci[WS(rs, 18)];
+							    T1H = FMA(T1F, T1G, T1E);
+							    T4E = FNMS(T1F, T1D, T4D);
+							    T1q = T1o * T1p;
+							    T4v = T1o * T1t;
+							    T1x = cr[WS(rs, 10)];
+							    T1A = ci[WS(rs, 10)];
+							    T1u = FMA(T1s, T1t, T1q);
+							    T4w = FNMS(T1s, T1p, T4v);
+							    T1y = T1w * T1x;
+							    T4B = T1w * T1A;
+						       }
+						  }
+						  {
+						       E T4A, T1v, T7k, T4x, T1B, T4C;
+						       T4A = T1m - T1u;
+						       T1v = T1m + T1u;
+						       T7k = T4u + T4w;
+						       T4x = T4u - T4w;
+						       T1B = FMA(T1z, T1A, T1y);
+						       T4C = FNMS(T1z, T1x, T4B);
+						       {
+							    E T1I, T4y, T4F, T7l;
+							    T1I = T1B + T1H;
+							    T4y = T1B - T1H;
+							    T4F = T4C - T4E;
+							    T7l = T4C + T4E;
+							    T4z = T4x + T4y;
+							    T6y = T4x - T4y;
+							    T1J = T1v + T1I;
+							    T7j = T1v - T1I;
+							    T7m = T7k - T7l;
+							    T8e = T7k + T7l;
+							    T6x = T4A + T4F;
+							    T4G = T4A - T4F;
+						       }
+						  }
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T5C, T3u, T5y, T7L, T60, T5V, T3F, T5A, T4P, T4U;
+				   {
+					E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R;
+					{
+					     E T1L, T1O, T1W, T20;
+					     T1L = cr[WS(rs, 30)];
+					     T1O = ci[WS(rs, 30)];
+					     {
+						  E T2d, T2h, T1M, T4I, T2e, T4S;
+						  T2d = cr[WS(rs, 22)];
+						  T2h = ci[WS(rs, 22)];
+						  T1M = T1K * T1L;
+						  T4I = T1K * T1O;
+						  T2e = T2c * T2d;
+						  T4S = T2c * T2h;
+						  T1P = FMA(T1N, T1O, T1M);
+						  T4J = FNMS(T1N, T1L, T4I);
+						  T2i = FMA(T2g, T2h, T2e);
+						  T4T = FNMS(T2g, T2d, T4S);
+					     }
+					     T1W = cr[WS(rs, 14)];
+					     T20 = ci[WS(rs, 14)];
+					     {
+						  E T24, T27, T1X, T4K, T25, T4Q;
+						  T24 = cr[WS(rs, 6)];
+						  T27 = ci[WS(rs, 6)];
+						  T1X = T1V * T1W;
+						  T4K = T1V * T20;
+						  T25 = T23 * T24;
+						  T4Q = T23 * T27;
+						  T21 = FMA(T1Z, T20, T1X);
+						  T4L = FNMS(T1Z, T1W, T4K);
+						  T28 = FMA(T26, T27, T25);
+						  T4R = FNMS(T26, T24, T4Q);
+					     }
+					}
+					{
+					     E T22, T7p, T4M, T4N, T2j, T7q;
+					     T4P = T1P - T21;
+					     T22 = T1P + T21;
+					     T7p = T4J + T4L;
+					     T4M = T4J - T4L;
+					     T4N = T28 - T2i;
+					     T2j = T28 + T2i;
+					     T7q = T4R + T4T;
+					     T4U = T4R - T4T;
+					     T2k = T22 + T2j;
+					     T7o = T22 - T2j;
+					     T7r = T7p - T7q;
+					     T8d = T7p + T7q;
+					     T6B = T4M - T4N;
+					     T4O = T4M + T4N;
+					}
+				   }
+				   {
+					E T3l, T5X, T3E, T3v, T3t, T3w, T3x, T5Z, T3A, T3B, T3D, T3y, T5z;
+					{
+					     E T3g, T3k, T3h, T5W;
+					     T3g = cr[WS(rs, 31)];
+					     T3k = ci[WS(rs, 31)];
+					     T3A = cr[WS(rs, 23)];
+					     T6A = T4P + T4U;
+					     T4V = T4P - T4U;
+					     T3h = T3f * T3g;
+					     T5W = T3f * T3k;
+					     T3B = T3z * T3A;
+					     T3D = ci[WS(rs, 23)];
+					     T3l = FMA(T3j, T3k, T3h);
+					     T5X = FNMS(T3j, T3g, T5W);
+					}
+					{
+					     E T3o, T5B, T3s, T3p, T5Y;
+					     T3o = cr[WS(rs, 15)];
+					     T3E = FMA(T3C, T3D, T3B);
+					     T5B = T3z * T3D;
+					     T3s = ci[WS(rs, 15)];
+					     T3p = T3n * T3o;
+					     T3v = cr[WS(rs, 7)];
+					     T5C = FNMS(T3C, T3A, T5B);
+					     T5Y = T3n * T3s;
+					     T3t = FMA(T3r, T3s, T3p);
+					     T3w = TP * T3v;
+					     T3x = ci[WS(rs, 7)];
+					     T5Z = FNMS(T3r, T3o, T5Y);
+					}
+					T3u = T3l + T3t;
+					T5y = T3l - T3t;
+					T3y = FMA(TR, T3x, T3w);
+					T5z = TP * T3x;
+					T7L = T5X + T5Z;
+					T60 = T5X - T5Z;
+					T5V = T3E - T3y;
+					T3F = T3y + T3E;
+					T5A = FNMS(TR, T3v, T5z);
+				   }
+				   {
+					E T2L, T53, T4Z, T2z, T7A, T5q, T2D, T51;
+					{
+					     E T2q, T5n, T2y, T2A, T2C, T5p, T2B, T50;
+					     {
+						  E T2G, T2K, T2n, T5m, T2t, T5o;
+						  {
+						       E T2o, T2p, T5D, T7M;
+						       T2n = cr[WS(rs, 1)];
+						       T6P = T60 + T5V;
+						       T61 = T5V - T60;
+						       T7G = T3u - T3F;
+						       T3G = T3u + T3F;
+						       T5D = T5A - T5C;
+						       T7M = T5A + T5C;
+						       T2o = T2 * T2n;
+						       T2p = ci[WS(rs, 1)];
+						       T6M = T5y + T5D;
+						       T5E = T5y - T5D;
+						       T8n = T7L + T7M;
+						       T7N = T7L - T7M;
+						       T5m = T2 * T2p;
+						       T2q = FMA(T5, T2p, T2o);
+						  }
+						  T2G = cr[WS(rs, 25)];
+						  T2K = ci[WS(rs, 25)];
+						  T5n = FNMS(T5, T2n, T5m);
+						  {
+						       E T2x, T2u, T2H, T52;
+						       T2t = cr[WS(rs, 17)];
+						       T2H = T2F * T2G;
+						       T52 = T2F * T2K;
+						       T2x = ci[WS(rs, 17)];
+						       T2u = T2s * T2t;
+						       T2L = FMA(T2J, T2K, T2H);
+						       T53 = FNMS(T2J, T2G, T52);
+						       T5o = T2s * T2x;
+						       T2y = FMA(T2w, T2x, T2u);
+						  }
+						  T2A = cr[WS(rs, 9)];
+						  T2C = ci[WS(rs, 9)];
+						  T5p = FNMS(T2w, T2t, T5o);
+					     }
+					     T4Z = T2q - T2y;
+					     T2z = T2q + T2y;
+					     T2B = T8 * T2A;
+					     T50 = T8 * T2C;
+					     T7A = T5n + T5p;
+					     T5q = T5n - T5p;
+					     T2D = FMA(Tc, T2C, T2B);
+					     T51 = FNMS(Tc, T2A, T50);
+					}
+					{
+					     E T3N, T3K, T3O, T5H, T41, T5Q, T3R, T3U, T3W;
+					     {
+						  E T3H, T3I, T3J, T3Y, T40, T5G, T3Z, T5P;
+						  T3H = cr[WS(rs, 3)];
+						  {
+						       E T5r, T2M, T54, T7B;
+						       T5r = T2D - T2L;
+						       T2M = T2D + T2L;
+						       T54 = T51 - T53;
+						       T7B = T51 + T53;
+						       T6I = T5q - T5r;
+						       T5s = T5q + T5r;
+						       T7v = T2z - T2M;
+						       T2N = T2z + T2M;
+						       T6F = T4Z + T54;
+						       T55 = T4Z - T54;
+						       T8i = T7A + T7B;
+						       T7C = T7A - T7B;
+						       T3I = T3 * T3H;
+						  }
+						  T3J = ci[WS(rs, 3)];
+						  T3Y = cr[WS(rs, 11)];
+						  T40 = ci[WS(rs, 11)];
+						  T3N = cr[WS(rs, 19)];
+						  T3K = FMA(T6, T3J, T3I);
+						  T5G = T3 * T3J;
+						  T3Z = Td * T3Y;
+						  T5P = Td * T40;
+						  T3O = T3M * T3N;
+						  T5H = FNMS(T6, T3H, T5G);
+						  T41 = FMA(Th, T40, T3Z);
+						  T5Q = FNMS(Th, T3Y, T5P);
+						  T3R = ci[WS(rs, 19)];
+						  T3U = cr[WS(rs, 27)];
+						  T3W = ci[WS(rs, 27)];
+					     }
+					     {
+						  E T2O, T2P, T2Q, T37, T39, T57, T38, T5g;
+						  {
+						       E T3T, T5F, T5J, T3X, T5O, T7I, T5K;
+						       T2O = cr[WS(rs, 5)];
+						       {
+							    E T3S, T5I, T3V, T5N;
+							    T3S = FMA(T3Q, T3R, T3O);
+							    T5I = T3M * T3R;
+							    T3V = Te * T3U;
+							    T5N = Te * T3W;
+							    T3T = T3K + T3S;
+							    T5F = T3K - T3S;
+							    T5J = FNMS(T3Q, T3N, T5I);
+							    T3X = FMA(Ti, T3W, T3V);
+							    T5O = FNMS(Ti, T3U, T5N);
+							    T2P = T29 * T2O;
+						       }
+						       T7I = T5H + T5J;
+						       T5K = T5H - T5J;
+						       {
+							    E T42, T5M, T7H, T5R;
+							    T42 = T3X + T41;
+							    T5M = T3X - T41;
+							    T7H = T5O + T5Q;
+							    T5R = T5O - T5Q;
+							    T5L = T5F - T5K;
+							    T63 = T5F + T5K;
+							    T43 = T3T + T42;
+							    T7O = T42 - T3T;
+							    T5S = T5M + T5R;
+							    T62 = T5M - T5R;
+							    T7J = T7H - T7I;
+							    T8o = T7I + T7H;
+							    T2Q = ci[WS(rs, 5)];
+						       }
+						  }
+						  T37 = cr[WS(rs, 13)];
+						  T39 = ci[WS(rs, 13)];
+						  T2U = cr[WS(rs, 21)];
+						  T2R = FMA(T2b, T2Q, T2P);
+						  T57 = T29 * T2Q;
+						  T38 = T1R * T37;
+						  T5g = T1R * T39;
+						  T2V = T2T * T2U;
+						  T58 = FNMS(T2b, T2O, T57);
+						  T3a = FMA(T1U, T39, T38);
+						  T5h = FNMS(T1U, T37, T5g);
+						  T2Y = ci[WS(rs, 21)];
+						  T32 = cr[WS(rs, 29)];
+						  T35 = ci[WS(rs, 29)];
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7e, T8T, T7D, T7y, T7h, T8U, T6s, T9o, T9n, T6v, T6Q, T6N, T6J, T6G, T6o;
+			      E T6r;
+			      {
+				   E T5c, T5t, T5j, T5u, T8s, T8v;
+				   {
+					E T8c, T1i, T8A, T8z, T8O, T8J, T8N, T2l, T8L, T45, T8t, T8l, T8u, T8q, T3c;
+					E T8p, T8k, T8w, T2m;
+					{
+					     E T8x, T8y, T8j, T8C, T8I;
+					     {
+						  E TI, T30, T56, T5a, T36, T5f, T1h, T7x, T5b;
+						  TI = Tq + TH;
+						  T7e = Tq - TH;
+						  {
+						       E T2Z, T59, T33, T5e;
+						       T2Z = FMA(T2X, T2Y, T2V);
+						       T59 = T2T * T2Y;
+						       T33 = T31 * T32;
+						       T5e = T31 * T35;
+						       T30 = T2R + T2Z;
+						       T56 = T2R - T2Z;
+						       T5a = FNMS(T2X, T2U, T59);
+						       T36 = FMA(T34, T35, T33);
+						       T5f = FNMS(T34, T32, T5e);
+						       T1h = TZ + T1g;
+						       T8T = TZ - T1g;
+						  }
+						  T7x = T58 + T5a;
+						  T5b = T58 - T5a;
+						  {
+						       E T3b, T5d, T7w, T5i;
+						       T3b = T36 + T3a;
+						       T5d = T36 - T3a;
+						       T7w = T5f + T5h;
+						       T5i = T5f - T5h;
+						       T5c = T56 - T5b;
+						       T5t = T56 + T5b;
+						       T3c = T30 + T3b;
+						       T7D = T30 - T3b;
+						       T5j = T5d + T5i;
+						       T5u = T5i - T5d;
+						       T7y = T7w - T7x;
+						       T8j = T7x + T7w;
+						       T8c = TI - T1h;
+						       T1i = TI + T1h;
+						  }
+					     }
+					     T8p = T8n - T8o;
+					     T8x = T8n + T8o;
+					     T8y = T8i + T8j;
+					     T8k = T8i - T8j;
+					     T7h = T7f - T7g;
+					     T8C = T7g + T7f;
+					     T8I = T8D + T8H;
+					     T8U = T8H - T8D;
+					     T8A = T8y + T8x;
+					     T8z = T8x - T8y;
+					     T8O = T8I - T8C;
+					     T8J = T8C + T8I;
+					}
+					{
+					     E T8h, T8m, T3d, T44;
+					     T8h = T2N - T3c;
+					     T3d = T2N + T3c;
+					     T44 = T3G + T43;
+					     T8m = T3G - T43;
+					     T8N = T1J - T2k;
+					     T2l = T1J + T2k;
+					     T8L = T44 - T3d;
+					     T45 = T3d + T44;
+					     T8t = T8h - T8k;
+					     T8l = T8h + T8k;
+					     T8u = T8m + T8p;
+					     T8q = T8m - T8p;
+					}
+					T8w = T1i - T2l;
+					T2m = T1i + T2l;
+					{
+					     E T8Q, T8R, T8P, T8S;
+					     {
+						  E T8r, T8M, T8K, T8g, T8B, T8f;
+						  T8Q = T8q - T8l;
+						  T8r = T8l + T8q;
+						  T8B = T8e + T8d;
+						  T8f = T8d - T8e;
+						  cr[0] = T2m + T45;
+						  ci[WS(rs, 15)] = T2m - T45;
+						  ci[WS(rs, 7)] = T8w + T8z;
+						  cr[WS(rs, 8)] = T8w - T8z;
+						  T8M = T8J - T8B;
+						  T8K = T8B + T8J;
+						  T8g = T8c - T8f;
+						  T8s = T8c + T8f;
+						  T8R = T8O - T8N;
+						  T8P = T8N + T8O;
+						  ci[WS(rs, 23)] = T8L + T8M;
+						  cr[WS(rs, 24)] = T8L - T8M;
+						  ci[WS(rs, 31)] = T8A + T8K;
+						  cr[WS(rs, 16)] = T8A - T8K;
+						  cr[WS(rs, 4)] = FMA(KP707106781, T8r, T8g);
+						  ci[WS(rs, 11)] = FNMS(KP707106781, T8r, T8g);
+					     }
+					     T8S = T8u - T8t;
+					     T8v = T8t + T8u;
+					     ci[WS(rs, 19)] = FMA(KP707106781, T8Q, T8P);
+					     cr[WS(rs, 28)] = FMS(KP707106781, T8Q, T8P);
+					     ci[WS(rs, 27)] = FMA(KP707106781, T8S, T8R);
+					     cr[WS(rs, 20)] = FMS(KP707106781, T8S, T8R);
+					}
+				   }
+				   {
+					E T6c, T4s, T9c, T4X, T9h, T9b, T9i, T6f, T5l, T6h, T6m, T6q, T6a, T66, T5v;
+					{
+					     E T6d, T4H, T4W, T6e, T99, T9a, T4c, T4r, T5T, T64;
+					     T6s = T46 + T4b;
+					     T4c = T46 - T4b;
+					     T4r = T4j + T4q;
+					     T9o = T4q - T4j;
+					     T6d = FNMS(KP414213562, T4z, T4G);
+					     T4H = FMA(KP414213562, T4G, T4z);
+					     ci[WS(rs, 3)] = FMA(KP707106781, T8v, T8s);
+					     cr[WS(rs, 12)] = FNMS(KP707106781, T8v, T8s);
+					     T6c = FMA(KP707106781, T4r, T4c);
+					     T4s = FNMS(KP707106781, T4r, T4c);
+					     T4W = FNMS(KP414213562, T4V, T4O);
+					     T6e = FMA(KP414213562, T4O, T4V);
+					     T9n = T98 - T97;
+					     T99 = T97 + T98;
+					     T9a = T6t - T6u;
+					     T6v = T6t + T6u;
+					     T9c = T4H + T4W;
+					     T4X = T4H - T4W;
+					     T9h = FNMS(KP707106781, T9a, T99);
+					     T9b = FMA(KP707106781, T9a, T99);
+					     T6Q = T5S - T5L;
+					     T5T = T5L + T5S;
+					     T64 = T62 - T63;
+					     T6N = T63 + T62;
+					     {
+						  E T6k, T5U, T6l, T65, T5k;
+						  T6J = T5j - T5c;
+						  T5k = T5c + T5j;
+						  T9i = T6e - T6d;
+						  T6f = T6d + T6e;
+						  T6k = FMA(KP707106781, T5T, T5E);
+						  T5U = FNMS(KP707106781, T5T, T5E);
+						  T6l = FMA(KP707106781, T64, T61);
+						  T65 = FNMS(KP707106781, T64, T61);
+						  T5l = FNMS(KP707106781, T5k, T55);
+						  T6h = FMA(KP707106781, T5k, T55);
+						  T6m = FNMS(KP198912367, T6l, T6k);
+						  T6q = FMA(KP198912367, T6k, T6l);
+						  T6a = FNMS(KP668178637, T5U, T65);
+						  T66 = FMA(KP668178637, T65, T5U);
+						  T5v = T5t + T5u;
+						  T6G = T5t - T5u;
+					     }
+					}
+					{
+					     E T68, T4Y, T9j, T9l, T6i, T5w;
+					     T68 = FNMS(KP923879532, T4X, T4s);
+					     T4Y = FMA(KP923879532, T4X, T4s);
+					     T9j = FMA(KP923879532, T9i, T9h);
+					     T9l = FNMS(KP923879532, T9i, T9h);
+					     T6i = FMA(KP707106781, T5v, T5s);
+					     T5w = FNMS(KP707106781, T5v, T5s);
+					     {
+						  E T9g, T9f, T9d, T9e;
+						  {
+						       E T6g, T6p, T69, T5x, T6n, T6j;
+						       T6o = FNMS(KP923879532, T6f, T6c);
+						       T6g = FMA(KP923879532, T6f, T6c);
+						       T6j = FNMS(KP198912367, T6i, T6h);
+						       T6p = FMA(KP198912367, T6h, T6i);
+						       T69 = FNMS(KP668178637, T5l, T5w);
+						       T5x = FMA(KP668178637, T5w, T5l);
+						       T6n = T6j + T6m;
+						       T9g = T6m - T6j;
+						       T9f = FNMS(KP923879532, T9c, T9b);
+						       T9d = FMA(KP923879532, T9c, T9b);
+						       {
+							    E T6b, T9k, T9m, T67;
+							    T6b = T69 + T6a;
+							    T9k = T69 - T6a;
+							    T9m = T66 - T5x;
+							    T67 = T5x + T66;
+							    ci[0] = FMA(KP980785280, T6n, T6g);
+							    cr[WS(rs, 15)] = FNMS(KP980785280, T6n, T6g);
+							    ci[WS(rs, 4)] = FNMS(KP831469612, T6b, T68);
+							    cr[WS(rs, 11)] = FMA(KP831469612, T6b, T68);
+							    ci[WS(rs, 28)] = FMA(KP831469612, T9k, T9j);
+							    cr[WS(rs, 19)] = FMS(KP831469612, T9k, T9j);
+							    ci[WS(rs, 20)] = FMA(KP831469612, T9m, T9l);
+							    cr[WS(rs, 27)] = FMS(KP831469612, T9m, T9l);
+							    cr[WS(rs, 3)] = FMA(KP831469612, T67, T4Y);
+							    ci[WS(rs, 12)] = FNMS(KP831469612, T67, T4Y);
+							    T9e = T6q - T6p;
+							    T6r = T6p + T6q;
+						       }
+						  }
+						  ci[WS(rs, 16)] = FMA(KP980785280, T9e, T9d);
+						  cr[WS(rs, 31)] = FMS(KP980785280, T9e, T9d);
+						  ci[WS(rs, 24)] = FMA(KP980785280, T9g, T9f);
+						  cr[WS(rs, 23)] = FMS(KP980785280, T9g, T9f);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T88, T90, T8Z, T8b;
+				   {
+					E T7K, T7W, T7i, T7P, T8a, T86, T91, T8V, T8W, T7t, T7U, T7F, T92, T7Z, T89;
+					E T83;
+					{
+					     E T7X, T7n, T7s, T7Y, T84, T85;
+					     T7K = T7G - T7J;
+					     T84 = T7G + T7J;
+					     cr[WS(rs, 7)] = FMA(KP980785280, T6r, T6o);
+					     ci[WS(rs, 8)] = FNMS(KP980785280, T6r, T6o);
+					     T7W = T7e + T7h;
+					     T7i = T7e - T7h;
+					     T85 = T7O - T7N;
+					     T7P = T7N + T7O;
+					     T7X = T7j - T7m;
+					     T7n = T7j + T7m;
+					     T8a = FMA(KP414213562, T84, T85);
+					     T86 = FNMS(KP414213562, T85, T84);
+					     T91 = T8U - T8T;
+					     T8V = T8T + T8U;
+					     T7s = T7o - T7r;
+					     T7Y = T7o + T7r;
+					     {
+						  E T81, T82, T7z, T7E;
+						  T81 = T7v + T7y;
+						  T7z = T7v - T7y;
+						  T7E = T7C - T7D;
+						  T82 = T7C + T7D;
+						  T8W = T7n - T7s;
+						  T7t = T7n + T7s;
+						  T7U = FNMS(KP414213562, T7z, T7E);
+						  T7F = FMA(KP414213562, T7E, T7z);
+						  T92 = T7Y - T7X;
+						  T7Z = T7X + T7Y;
+						  T89 = FMA(KP414213562, T81, T82);
+						  T83 = FNMS(KP414213562, T82, T81);
+					     }
+					}
+					{
+					     E T7S, T7u, T93, T95, T7T, T7Q;
+					     T7S = FNMS(KP707106781, T7t, T7i);
+					     T7u = FMA(KP707106781, T7t, T7i);
+					     T93 = FMA(KP707106781, T92, T91);
+					     T95 = FNMS(KP707106781, T92, T91);
+					     T7T = FMA(KP414213562, T7K, T7P);
+					     T7Q = FNMS(KP414213562, T7P, T7K);
+					     {
+						  E T80, T87, T8X, T8Y;
+						  T88 = FNMS(KP707106781, T7Z, T7W);
+						  T80 = FMA(KP707106781, T7Z, T7W);
+						  {
+						       E T7V, T94, T96, T7R;
+						       T7V = T7T - T7U;
+						       T94 = T7U + T7T;
+						       T96 = T7Q - T7F;
+						       T7R = T7F + T7Q;
+						       ci[WS(rs, 5)] = FMA(KP923879532, T7V, T7S);
+						       cr[WS(rs, 10)] = FNMS(KP923879532, T7V, T7S);
+						       ci[WS(rs, 29)] = FMA(KP923879532, T94, T93);
+						       cr[WS(rs, 18)] = FMS(KP923879532, T94, T93);
+						       ci[WS(rs, 21)] = FMA(KP923879532, T96, T95);
+						       cr[WS(rs, 26)] = FMS(KP923879532, T96, T95);
+						       cr[WS(rs, 2)] = FMA(KP923879532, T7R, T7u);
+						       ci[WS(rs, 13)] = FNMS(KP923879532, T7R, T7u);
+						       T87 = T83 + T86;
+						       T90 = T86 - T83;
+						  }
+						  T8Z = FNMS(KP707106781, T8W, T8V);
+						  T8X = FMA(KP707106781, T8W, T8V);
+						  T8Y = T8a - T89;
+						  T8b = T89 + T8a;
+						  ci[WS(rs, 1)] = FMA(KP923879532, T87, T80);
+						  cr[WS(rs, 14)] = FNMS(KP923879532, T87, T80);
+						  ci[WS(rs, 17)] = FMA(KP923879532, T8Y, T8X);
+						  cr[WS(rs, 30)] = FMS(KP923879532, T8Y, T8X);
+					     }
+					}
+				   }
+				   {
+					E T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T6O, T76;
+					{
+					     E T70, T6Z, T6z, T6C;
+					     ci[WS(rs, 25)] = FMA(KP923879532, T90, T8Z);
+					     cr[WS(rs, 22)] = FMS(KP923879532, T90, T8Z);
+					     cr[WS(rs, 6)] = FMA(KP923879532, T8b, T88);
+					     ci[WS(rs, 9)] = FNMS(KP923879532, T8b, T88);
+					     T70 = FNMS(KP414213562, T6x, T6y);
+					     T6z = FMA(KP414213562, T6y, T6x);
+					     T6C = FNMS(KP414213562, T6B, T6A);
+					     T6Z = FMA(KP414213562, T6A, T6B);
+					     T6Y = FNMS(KP707106781, T6v, T6s);
+					     T6w = FMA(KP707106781, T6v, T6s);
+					     T9w = T6z - T6C;
+					     T6D = T6z + T6C;
+					     T9v = FNMS(KP707106781, T9o, T9n);
+					     T9p = FMA(KP707106781, T9o, T9n);
+					     T9q = T70 + T6Z;
+					     T71 = T6Z - T70;
+					     T6O = FMA(KP707106781, T6N, T6M);
+					     T76 = FNMS(KP707106781, T6N, T6M);
+					}
+					{
+					     E T6U, T9u, T79, T6X, T9s, T9t, T9r, T72;
+					     {
+						  E T6E, T78, T6V, T6S, T75, T6W, T6L, T9x, T9z, T9y, T6T, T9A;
+						  {
+						       E T7c, T7b, T77, T6R;
+						       T6U = FNMS(KP923879532, T6D, T6w);
+						       T6E = FMA(KP923879532, T6D, T6w);
+						       T77 = FNMS(KP707106781, T6Q, T6P);
+						       T6R = FMA(KP707106781, T6Q, T6P);
+						       {
+							    E T73, T6H, T74, T6K;
+							    T73 = FNMS(KP707106781, T6G, T6F);
+							    T6H = FMA(KP707106781, T6G, T6F);
+							    T74 = FNMS(KP707106781, T6J, T6I);
+							    T6K = FMA(KP707106781, T6J, T6I);
+							    T78 = FMA(KP668178637, T77, T76);
+							    T7c = FNMS(KP668178637, T76, T77);
+							    T6V = FMA(KP198912367, T6O, T6R);
+							    T6S = FNMS(KP198912367, T6R, T6O);
+							    T75 = FNMS(KP668178637, T74, T73);
+							    T7b = FMA(KP668178637, T73, T74);
+							    T6W = FNMS(KP198912367, T6H, T6K);
+							    T6L = FMA(KP198912367, T6K, T6H);
+						       }
+						       T9x = FMA(KP923879532, T9w, T9v);
+						       T9z = FNMS(KP923879532, T9w, T9v);
+						       T7d = T7b - T7c;
+						       T9y = T7b + T7c;
+						  }
+						  T9u = T6S - T6L;
+						  T6T = T6L + T6S;
+						  T9A = T78 - T75;
+						  T79 = T75 + T78;
+						  ci[WS(rs, 18)] = FNMS(KP831469612, T9y, T9x);
+						  cr[WS(rs, 29)] = -(FMA(KP831469612, T9y, T9x));
+						  cr[WS(rs, 1)] = FMA(KP980785280, T6T, T6E);
+						  ci[WS(rs, 14)] = FNMS(KP980785280, T6T, T6E);
+						  cr[WS(rs, 21)] = FMS(KP831469612, T9A, T9z);
+						  ci[WS(rs, 26)] = FMA(KP831469612, T9A, T9z);
+						  T6X = T6V - T6W;
+						  T9s = T6W + T6V;
+					     }
+					     T7a = FNMS(KP923879532, T71, T6Y);
+					     T72 = FMA(KP923879532, T71, T6Y);
+					     T9t = FNMS(KP923879532, T9q, T9p);
+					     T9r = FMA(KP923879532, T9q, T9p);
+					     ci[WS(rs, 6)] = FMA(KP980785280, T6X, T6U);
+					     cr[WS(rs, 9)] = FNMS(KP980785280, T6X, T6U);
+					     ci[WS(rs, 2)] = FMA(KP831469612, T79, T72);
+					     cr[WS(rs, 13)] = FNMS(KP831469612, T79, T72);
+					     ci[WS(rs, 30)] = FMA(KP980785280, T9s, T9r);
+					     cr[WS(rs, 17)] = FMS(KP980785280, T9s, T9r);
+					     ci[WS(rs, 22)] = FMA(KP980785280, T9u, T9t);
+					     cr[WS(rs, 25)] = FMS(KP980785280, T9u, T9t);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 5)] = FMA(KP831469612, T7d, T7a);
+	       ci[WS(rs, 10)] = FNMS(KP831469612, T7d, T7a);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hf2_32", twinstr, &GENUS, {236, 98, 252, 0} };
+
+void X(codelet_hf2_32) (planner *p) {
+     X(khc2hc_register) (p, hf2_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hf2_32 -include hf.h */
+
+/*
+ * This function contains 488 FP additions, 280 FP multiplications,
+ * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
+ * 158 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
+	       E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
+	       E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
+	       E T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
+	       E T1S, T23;
+	       {
+		    E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
+		    E T10;
+		    {
+			 E T4, Tc, T7, Tb;
+			 T2 = W[0];
+			 T5 = W[1];
+			 T3 = W[2];
+			 T6 = W[3];
+			 T4 = T2 * T3;
+			 Tc = T5 * T3;
+			 T7 = T5 * T6;
+			 Tb = T2 * T6;
+			 T8 = T4 + T7;
+			 TM = T4 - T7;
+			 TO = Tb + Tc;
+			 Td = Tb - Tc;
+			 T9 = W[4];
+			 Ts = T2 * T9;
+			 T1d = T6 * T9;
+			 Tx = T5 * T9;
+			 T18 = T3 * T9;
+			 Te = W[5];
+			 Tt = T5 * Te;
+			 T1c = T3 * Te;
+			 Tw = T2 * Te;
+			 T19 = T6 * Te;
+			 Th = W[6];
+			 TB = T3 * Th;
+			 T14 = T5 * Th;
+			 TG = T6 * Th;
+			 TZ = T2 * Th;
+			 Tl = W[7];
+			 TC = T6 * Tl;
+			 T13 = T2 * Tl;
+			 TF = T3 * Tl;
+			 T10 = T5 * Tl;
+		    }
+		    TD = TB + TC;
+		    TH = TF - TG;
+		    T1y = TZ + T10;
+		    T1H = TF + TG;
+		    T15 = T13 + T14;
+		    T1A = T13 - T14;
+		    T11 = TZ - T10;
+		    T1F = TB - TC;
+		    T1n = FMA(T9, Th, Te * Tl);
+		    T1p = FNMS(Te, Th, T9 * Tl);
+		    {
+			 E T2o, T2p, T2s, T2t;
+			 T2o = T8 * Th;
+			 T2p = Td * Tl;
+			 T2q = T2o + T2p;
+			 T2I = T2o - T2p;
+			 T2s = T8 * Tl;
+			 T2t = Td * Th;
+			 T2u = T2s - T2t;
+			 T2K = T2s + T2t;
+		    }
+		    {
+			 E T2T, T2U, T2X, T2Y;
+			 T2T = TM * Th;
+			 T2U = TO * Tl;
+			 T2V = T2T - T2U;
+			 T3b = T2T + T2U;
+			 T2X = TM * Tl;
+			 T2Y = TO * Th;
+			 T2Z = T2X + T2Y;
+			 T3d = T2X - T2Y;
+			 Tu = Ts + Tt;
+			 Ty = Tw - Tx;
+			 T3l = FMA(Tu, Th, Ty * Tl);
+			 T3n = FNMS(Ty, Th, Tu * Tl);
+		    }
+		    T1t = Ts - Tt;
+		    T1v = Tw + Tx;
+		    T2f = FMA(T1t, Th, T1v * Tl);
+		    T2h = FNMS(T1v, Th, T1t * Tl);
+		    T1a = T18 - T19;
+		    T1e = T1c + T1d;
+		    T32 = FMA(T1a, Th, T1e * Tl);
+		    T34 = FNMS(T1e, Th, T1a * Tl);
+		    T1W = T18 + T19;
+		    T1Y = T1c - T1d;
+		    T2C = FMA(T1W, Th, T1Y * Tl);
+		    T2E = FNMS(T1Y, Th, T1W * Tl);
+		    {
+			 E Ta, Tf, Ti, Tj;
+			 Ta = T8 * T9;
+			 Tf = Td * Te;
+			 Tg = Ta - Tf;
+			 TR = Ta + Tf;
+			 Ti = T8 * Te;
+			 Tj = Td * T9;
+			 Tk = Ti + Tj;
+			 TS = Ti - Tj;
+		    }
+		    Tm = FMA(Tg, Th, Tk * Tl);
+		    TV = FNMS(TS, Th, TR * Tl);
+		    To = FNMS(Tk, Th, Tg * Tl);
+		    TT = FMA(TR, Th, TS * Tl);
+		    {
+			 E T1K, T1L, T1N, T1O;
+			 T1K = TM * T9;
+			 T1L = TO * Te;
+			 T1M = T1K - T1L;
+			 T21 = T1K + T1L;
+			 T1N = TM * Te;
+			 T1O = TO * T9;
+			 T1P = T1N + T1O;
+			 T22 = T1N - T1O;
+		    }
+		    T1Q = FMA(T1M, Th, T1P * Tl);
+		    T25 = FNMS(T22, Th, T21 * Tl);
+		    T1S = FNMS(T1P, Th, T1M * Tl);
+		    T23 = FMA(T21, Th, T22 * Tl);
+	       }
+	       {
+		    E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5G, T4B;
+		    E T5J, T3h, T6H, T6O, T7o, T4L, T5Q, T52, T5N, T1i, T7V, T6i, T7D, T3K, T5u;
+		    E T3P, T5v, T1E, T6k, T6n, T7f, T3W, T5z, T41, T5y, T29, T6p, T6s, T7e, T47;
+		    E T5C, T4c, T5B, T2R, T6z, T6E, T7k, T4v, T5K, T4E, T5H, T3y, T6P, T6K, T7p;
+		    E T4W, T5O, T55, T5R;
+		    {
+			 E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
+			 T1 = cr[0];
+			 T7G = ci[0];
+			 Tn = cr[WS(rs, 16)];
+			 Tp = ci[WS(rs, 16)];
+			 Tq = FMA(Tm, Tn, To * Tp);
+			 T7F = FNMS(To, Tn, Tm * Tp);
+			 {
+			      E Tv, Tz, TE, TI;
+			      Tv = cr[WS(rs, 8)];
+			      Tz = ci[WS(rs, 8)];
+			      TA = FMA(Tu, Tv, Ty * Tz);
+			      T3C = FNMS(Ty, Tv, Tu * Tz);
+			      TE = cr[WS(rs, 24)];
+			      TI = ci[WS(rs, 24)];
+			      TJ = FMA(TD, TE, TH * TI);
+			      T3D = FNMS(TH, TE, TD * TI);
+			 }
+			 {
+			      E Tr, TK, T8a, T8b;
+			      Tr = T1 + Tq;
+			      TK = TA + TJ;
+			      TL = Tr + TK;
+			      T6f = Tr - TK;
+			      T8a = TA - TJ;
+			      T8b = T7G - T7F;
+			      T8c = T8a + T8b;
+			      T8q = T8b - T8a;
+			 }
+			 {
+			      E T3B, T3E, T7E, T7H;
+			      T3B = T1 - Tq;
+			      T3E = T3C - T3D;
+			      T3F = T3B + T3E;
+			      T5t = T3B - T3E;
+			      T7E = T3C + T3D;
+			      T7H = T7F + T7G;
+			      T7I = T7E + T7H;
+			      T7W = T7H - T7E;
+			 }
+		    }
+		    {
+			 E T2e, T4x, T2w, T4i, T2j, T4y, T2n, T4h;
+			 {
+			      E T2c, T2d, T2r, T2v;
+			      T2c = cr[WS(rs, 1)];
+			      T2d = ci[WS(rs, 1)];
+			      T2e = FMA(T2, T2c, T5 * T2d);
+			      T4x = FNMS(T5, T2c, T2 * T2d);
+			      T2r = cr[WS(rs, 25)];
+			      T2v = ci[WS(rs, 25)];
+			      T2w = FMA(T2q, T2r, T2u * T2v);
+			      T4i = FNMS(T2u, T2r, T2q * T2v);
+			 }
+			 {
+			      E T2g, T2i, T2l, T2m;
+			      T2g = cr[WS(rs, 17)];
+			      T2i = ci[WS(rs, 17)];
+			      T2j = FMA(T2f, T2g, T2h * T2i);
+			      T4y = FNMS(T2h, T2g, T2f * T2i);
+			      T2l = cr[WS(rs, 9)];
+			      T2m = ci[WS(rs, 9)];
+			      T2n = FMA(T9, T2l, Te * T2m);
+			      T4h = FNMS(Te, T2l, T9 * T2m);
+			 }
+			 {
+			      E T2k, T2x, T6w, T6x;
+			      T2k = T2e + T2j;
+			      T2x = T2n + T2w;
+			      T2y = T2k + T2x;
+			      T6B = T2k - T2x;
+			      T6w = T4x + T4y;
+			      T6x = T4h + T4i;
+			      T6y = T6w - T6x;
+			      T7j = T6w + T6x;
+			 }
+			 {
+			      E T4g, T4j, T4z, T4A;
+			      T4g = T2e - T2j;
+			      T4j = T4h - T4i;
+			      T4k = T4g + T4j;
+			      T5G = T4g - T4j;
+			      T4z = T4x - T4y;
+			      T4A = T2n - T2w;
+			      T4B = T4z - T4A;
+			      T5J = T4z + T4A;
+			 }
+		    }
+		    {
+			 E T31, T4H, T3f, T50, T36, T4I, T3a, T4Z;
+			 {
+			      E T2W, T30, T3c, T3e;
+			      T2W = cr[WS(rs, 31)];
+			      T30 = ci[WS(rs, 31)];
+			      T31 = FMA(T2V, T2W, T2Z * T30);
+			      T4H = FNMS(T2Z, T2W, T2V * T30);
+			      T3c = cr[WS(rs, 23)];
+			      T3e = ci[WS(rs, 23)];
+			      T3f = FMA(T3b, T3c, T3d * T3e);
+			      T50 = FNMS(T3d, T3c, T3b * T3e);
+			 }
+			 {
+			      E T33, T35, T38, T39;
+			      T33 = cr[WS(rs, 15)];
+			      T35 = ci[WS(rs, 15)];
+			      T36 = FMA(T32, T33, T34 * T35);
+			      T4I = FNMS(T34, T33, T32 * T35);
+			      T38 = cr[WS(rs, 7)];
+			      T39 = ci[WS(rs, 7)];
+			      T3a = FMA(TR, T38, TS * T39);
+			      T4Z = FNMS(TS, T38, TR * T39);
+			 }
+			 {
+			      E T37, T3g, T6M, T6N;
+			      T37 = T31 + T36;
+			      T3g = T3a + T3f;
+			      T3h = T37 + T3g;
+			      T6H = T37 - T3g;
+			      T6M = T4H + T4I;
+			      T6N = T4Z + T50;
+			      T6O = T6M - T6N;
+			      T7o = T6M + T6N;
+			 }
+			 {
+			      E T4J, T4K, T4Y, T51;
+			      T4J = T4H - T4I;
+			      T4K = T3a - T3f;
+			      T4L = T4J - T4K;
+			      T5Q = T4J + T4K;
+			      T4Y = T31 - T36;
+			      T51 = T4Z - T50;
+			      T52 = T4Y + T51;
+			      T5N = T4Y - T51;
+			 }
+		    }
+		    {
+			 E TQ, T3H, T1g, T3N, TX, T3I, T17, T3M;
+			 {
+			      E TN, TP, T1b, T1f;
+			      TN = cr[WS(rs, 4)];
+			      TP = ci[WS(rs, 4)];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T3H = FNMS(TO, TN, TM * TP);
+			      T1b = cr[WS(rs, 12)];
+			      T1f = ci[WS(rs, 12)];
+			      T1g = FMA(T1a, T1b, T1e * T1f);
+			      T3N = FNMS(T1e, T1b, T1a * T1f);
+			 }
+			 {
+			      E TU, TW, T12, T16;
+			      TU = cr[WS(rs, 20)];
+			      TW = ci[WS(rs, 20)];
+			      TX = FMA(TT, TU, TV * TW);
+			      T3I = FNMS(TV, TU, TT * TW);
+			      T12 = cr[WS(rs, 28)];
+			      T16 = ci[WS(rs, 28)];
+			      T17 = FMA(T11, T12, T15 * T16);
+			      T3M = FNMS(T15, T12, T11 * T16);
+			 }
+			 {
+			      E TY, T1h, T6g, T6h;
+			      TY = TQ + TX;
+			      T1h = T17 + T1g;
+			      T1i = TY + T1h;
+			      T7V = TY - T1h;
+			      T6g = T3M + T3N;
+			      T6h = T3H + T3I;
+			      T6i = T6g - T6h;
+			      T7D = T6h + T6g;
+			 }
+			 {
+			      E T3G, T3J, T3L, T3O;
+			      T3G = TQ - TX;
+			      T3J = T3H - T3I;
+			      T3K = T3G + T3J;
+			      T5u = T3G - T3J;
+			      T3L = T17 - T1g;
+			      T3O = T3M - T3N;
+			      T3P = T3L - T3O;
+			      T5v = T3L + T3O;
+			 }
+		    }
+		    {
+			 E T1m, T3X, T1C, T3U, T1r, T3Y, T1x, T3T;
+			 {
+			      E T1k, T1l, T1z, T1B;
+			      T1k = cr[WS(rs, 2)];
+			      T1l = ci[WS(rs, 2)];
+			      T1m = FMA(T8, T1k, Td * T1l);
+			      T3X = FNMS(Td, T1k, T8 * T1l);
+			      T1z = cr[WS(rs, 26)];
+			      T1B = ci[WS(rs, 26)];
+			      T1C = FMA(T1y, T1z, T1A * T1B);
+			      T3U = FNMS(T1A, T1z, T1y * T1B);
+			 }
+			 {
+			      E T1o, T1q, T1u, T1w;
+			      T1o = cr[WS(rs, 18)];
+			      T1q = ci[WS(rs, 18)];
+			      T1r = FMA(T1n, T1o, T1p * T1q);
+			      T3Y = FNMS(T1p, T1o, T1n * T1q);
+			      T1u = cr[WS(rs, 10)];
+			      T1w = ci[WS(rs, 10)];
+			      T1x = FMA(T1t, T1u, T1v * T1w);
+			      T3T = FNMS(T1v, T1u, T1t * T1w);
+			 }
+			 {
+			      E T1s, T1D, T6l, T6m;
+			      T1s = T1m + T1r;
+			      T1D = T1x + T1C;
+			      T1E = T1s + T1D;
+			      T6k = T1s - T1D;
+			      T6l = T3X + T3Y;
+			      T6m = T3T + T3U;
+			      T6n = T6l - T6m;
+			      T7f = T6l + T6m;
+			 }
+			 {
+			      E T3S, T3V, T3Z, T40;
+			      T3S = T1m - T1r;
+			      T3V = T3T - T3U;
+			      T3W = T3S + T3V;
+			      T5z = T3S - T3V;
+			      T3Z = T3X - T3Y;
+			      T40 = T1x - T1C;
+			      T41 = T3Z - T40;
+			      T5y = T3Z + T40;
+			 }
+		    }
+		    {
+			 E T1J, T43, T27, T4a, T1U, T44, T20, T49;
+			 {
+			      E T1G, T1I, T24, T26;
+			      T1G = cr[WS(rs, 30)];
+			      T1I = ci[WS(rs, 30)];
+			      T1J = FMA(T1F, T1G, T1H * T1I);
+			      T43 = FNMS(T1H, T1G, T1F * T1I);
+			      T24 = cr[WS(rs, 22)];
+			      T26 = ci[WS(rs, 22)];
+			      T27 = FMA(T23, T24, T25 * T26);
+			      T4a = FNMS(T25, T24, T23 * T26);
+			 }
+			 {
+			      E T1R, T1T, T1X, T1Z;
+			      T1R = cr[WS(rs, 14)];
+			      T1T = ci[WS(rs, 14)];
+			      T1U = FMA(T1Q, T1R, T1S * T1T);
+			      T44 = FNMS(T1S, T1R, T1Q * T1T);
+			      T1X = cr[WS(rs, 6)];
+			      T1Z = ci[WS(rs, 6)];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T49 = FNMS(T1Y, T1X, T1W * T1Z);
+			 }
+			 {
+			      E T1V, T28, T6q, T6r;
+			      T1V = T1J + T1U;
+			      T28 = T20 + T27;
+			      T29 = T1V + T28;
+			      T6p = T1V - T28;
+			      T6q = T43 + T44;
+			      T6r = T49 + T4a;
+			      T6s = T6q - T6r;
+			      T7e = T6q + T6r;
+			 }
+			 {
+			      E T45, T46, T48, T4b;
+			      T45 = T43 - T44;
+			      T46 = T20 - T27;
+			      T47 = T45 - T46;
+			      T5C = T45 + T46;
+			      T48 = T1J - T1U;
+			      T4b = T49 - T4a;
+			      T4c = T48 + T4b;
+			      T5B = T48 - T4b;
+			 }
+		    }
+		    {
+			 E T2B, T4m, T2G, T4n, T4l, T4o, T2M, T4q, T2P, T4r, T4s, T4t;
+			 {
+			      E T2z, T2A, T2D, T2F;
+			      T2z = cr[WS(rs, 5)];
+			      T2A = ci[WS(rs, 5)];
+			      T2B = FMA(T21, T2z, T22 * T2A);
+			      T4m = FNMS(T22, T2z, T21 * T2A);
+			      T2D = cr[WS(rs, 21)];
+			      T2F = ci[WS(rs, 21)];
+			      T2G = FMA(T2C, T2D, T2E * T2F);
+			      T4n = FNMS(T2E, T2D, T2C * T2F);
+			 }
+			 T4l = T2B - T2G;
+			 T4o = T4m - T4n;
+			 {
+			      E T2J, T2L, T2N, T2O;
+			      T2J = cr[WS(rs, 29)];
+			      T2L = ci[WS(rs, 29)];
+			      T2M = FMA(T2I, T2J, T2K * T2L);
+			      T4q = FNMS(T2K, T2J, T2I * T2L);
+			      T2N = cr[WS(rs, 13)];
+			      T2O = ci[WS(rs, 13)];
+			      T2P = FMA(T1M, T2N, T1P * T2O);
+			      T4r = FNMS(T1P, T2N, T1M * T2O);
+			 }
+			 T4s = T4q - T4r;
+			 T4t = T2M - T2P;
+			 {
+			      E T2H, T2Q, T6C, T6D;
+			      T2H = T2B + T2G;
+			      T2Q = T2M + T2P;
+			      T2R = T2H + T2Q;
+			      T6z = T2H - T2Q;
+			      T6C = T4q + T4r;
+			      T6D = T4m + T4n;
+			      T6E = T6C - T6D;
+			      T7k = T6D + T6C;
+			 }
+			 {
+			      E T4p, T4u, T4C, T4D;
+			      T4p = T4l + T4o;
+			      T4u = T4s - T4t;
+			      T4v = KP707106781 * (T4p - T4u);
+			      T5K = KP707106781 * (T4p + T4u);
+			      T4C = T4t + T4s;
+			      T4D = T4l - T4o;
+			      T4E = KP707106781 * (T4C - T4D);
+			      T5H = KP707106781 * (T4D + T4C);
+			 }
+		    }
+		    {
+			 E T3k, T4S, T3p, T4T, T4R, T4U, T3t, T4N, T3w, T4O, T4M, T4P;
+			 {
+			      E T3i, T3j, T3m, T3o;
+			      T3i = cr[WS(rs, 3)];
+			      T3j = ci[WS(rs, 3)];
+			      T3k = FMA(T3, T3i, T6 * T3j);
+			      T4S = FNMS(T6, T3i, T3 * T3j);
+			      T3m = cr[WS(rs, 19)];
+			      T3o = ci[WS(rs, 19)];
+			      T3p = FMA(T3l, T3m, T3n * T3o);
+			      T4T = FNMS(T3n, T3m, T3l * T3o);
+			 }
+			 T4R = T3k - T3p;
+			 T4U = T4S - T4T;
+			 {
+			      E T3r, T3s, T3u, T3v;
+			      T3r = cr[WS(rs, 27)];
+			      T3s = ci[WS(rs, 27)];
+			      T3t = FMA(Th, T3r, Tl * T3s);
+			      T4N = FNMS(Tl, T3r, Th * T3s);
+			      T3u = cr[WS(rs, 11)];
+			      T3v = ci[WS(rs, 11)];
+			      T3w = FMA(Tg, T3u, Tk * T3v);
+			      T4O = FNMS(Tk, T3u, Tg * T3v);
+			 }
+			 T4M = T3t - T3w;
+			 T4P = T4N - T4O;
+			 {
+			      E T3q, T3x, T6I, T6J;
+			      T3q = T3k + T3p;
+			      T3x = T3t + T3w;
+			      T3y = T3q + T3x;
+			      T6P = T3q - T3x;
+			      T6I = T4N + T4O;
+			      T6J = T4S + T4T;
+			      T6K = T6I - T6J;
+			      T7p = T6J + T6I;
+			 }
+			 {
+			      E T4Q, T4V, T53, T54;
+			      T4Q = T4M + T4P;
+			      T4V = T4R - T4U;
+			      T4W = KP707106781 * (T4Q - T4V);
+			      T5O = KP707106781 * (T4V + T4Q);
+			      T53 = T4R + T4U;
+			      T54 = T4P - T4M;
+			      T55 = KP707106781 * (T53 - T54);
+			      T5R = KP707106781 * (T53 + T54);
+			 }
+		    }
+		    {
+			 E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
+			 {
+			      E T1j, T2a, T7C, T7J;
+			      T1j = TL + T1i;
+			      T2a = T1E + T29;
+			      T2b = T1j + T2a;
+			      T7x = T1j - T2a;
+			      T7C = T7f + T7e;
+			      T7J = T7D + T7I;
+			      T7K = T7C + T7J;
+			      T7M = T7J - T7C;
+			 }
+			 {
+			      E T2S, T3z, T7y, T7z;
+			      T2S = T2y + T2R;
+			      T3z = T3h + T3y;
+			      T3A = T2S + T3z;
+			      T7L = T3z - T2S;
+			      T7y = T7o + T7p;
+			      T7z = T7j + T7k;
+			      T7A = T7y - T7z;
+			      T7B = T7z + T7y;
+			 }
+			 ci[WS(rs, 15)] = T2b - T3A;
+			 cr[WS(rs, 24)] = T7L - T7M;
+			 ci[WS(rs, 23)] = T7L + T7M;
+			 cr[0] = T2b + T3A;
+			 cr[WS(rs, 8)] = T7x - T7A;
+			 cr[WS(rs, 16)] = T7B - T7K;
+			 ci[WS(rs, 31)] = T7B + T7K;
+			 ci[WS(rs, 7)] = T7x + T7A;
+		    }
+		    {
+			 E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
+			 E T5X, T5w, T89;
+			 T5w = KP707106781 * (T5u + T5v);
+			 T5x = T5t - T5w;
+			 T5Z = T5t + T5w;
+			 T89 = KP707106781 * (T3K - T3P);
+			 T8d = T89 + T8c;
+			 T8j = T8c - T89;
+			 {
+			      E T5A, T5D, T67, T68;
+			      T5A = FMA(KP923879532, T5y, KP382683432 * T5z);
+			      T5D = FNMS(KP923879532, T5C, KP382683432 * T5B);
+			      T5E = T5A + T5D;
+			      T88 = T5A - T5D;
+			      T67 = T5N + T5O;
+			      T68 = T5Q + T5R;
+			      T69 = FNMS(KP980785280, T68, KP195090322 * T67);
+			      T6d = FMA(KP980785280, T67, KP195090322 * T68);
+			 }
+			 {
+			      E T5I, T5L, T60, T61;
+			      T5I = T5G - T5H;
+			      T5L = T5J - T5K;
+			      T5M = FMA(KP831469612, T5I, KP555570233 * T5L);
+			      T5W = FNMS(KP831469612, T5L, KP555570233 * T5I);
+			      T60 = FNMS(KP382683432, T5y, KP923879532 * T5z);
+			      T61 = FMA(KP382683432, T5C, KP923879532 * T5B);
+			      T62 = T60 + T61;
+			      T8i = T61 - T60;
+			 }
+			 {
+			      E T64, T65, T5P, T5S;
+			      T64 = T5G + T5H;
+			      T65 = T5J + T5K;
+			      T66 = FMA(KP195090322, T64, KP980785280 * T65);
+			      T6c = FNMS(KP195090322, T65, KP980785280 * T64);
+			      T5P = T5N - T5O;
+			      T5S = T5Q - T5R;
+			      T5T = FNMS(KP555570233, T5S, KP831469612 * T5P);
+			      T5X = FMA(KP555570233, T5P, KP831469612 * T5S);
+			 }
+			 {
+			      E T5F, T5U, T8h, T8k;
+			      T5F = T5x + T5E;
+			      T5U = T5M + T5T;
+			      ci[WS(rs, 12)] = T5F - T5U;
+			      cr[WS(rs, 3)] = T5F + T5U;
+			      T8h = T5X - T5W;
+			      T8k = T8i + T8j;
+			      cr[WS(rs, 19)] = T8h - T8k;
+			      ci[WS(rs, 28)] = T8h + T8k;
+			 }
+			 {
+			      E T8l, T8m, T5V, T5Y;
+			      T8l = T5T - T5M;
+			      T8m = T8j - T8i;
+			      cr[WS(rs, 27)] = T8l - T8m;
+			      ci[WS(rs, 20)] = T8l + T8m;
+			      T5V = T5x - T5E;
+			      T5Y = T5W + T5X;
+			      cr[WS(rs, 11)] = T5V - T5Y;
+			      ci[WS(rs, 4)] = T5V + T5Y;
+			 }
+			 {
+			      E T63, T6a, T87, T8e;
+			      T63 = T5Z - T62;
+			      T6a = T66 + T69;
+			      ci[WS(rs, 8)] = T63 - T6a;
+			      cr[WS(rs, 7)] = T63 + T6a;
+			      T87 = T69 - T66;
+			      T8e = T88 + T8d;
+			      cr[WS(rs, 31)] = T87 - T8e;
+			      ci[WS(rs, 16)] = T87 + T8e;
+			 }
+			 {
+			      E T8f, T8g, T6b, T6e;
+			      T8f = T6d - T6c;
+			      T8g = T8d - T88;
+			      cr[WS(rs, 23)] = T8f - T8g;
+			      ci[WS(rs, 24)] = T8f + T8g;
+			      T6b = T5Z + T62;
+			      T6e = T6c + T6d;
+			      cr[WS(rs, 15)] = T6b - T6e;
+			      ci[0] = T6b + T6e;
+			 }
+		    }
+		    {
+			 E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
+			 {
+			      E T7d, T7g, T7O, T7P;
+			      T7d = TL - T1i;
+			      T7g = T7e - T7f;
+			      T7h = T7d - T7g;
+			      T7t = T7d + T7g;
+			      T7O = T1E - T29;
+			      T7P = T7I - T7D;
+			      T7Q = T7O + T7P;
+			      T7S = T7P - T7O;
+			 }
+			 {
+			      E T7i, T7l, T7n, T7q;
+			      T7i = T2y - T2R;
+			      T7l = T7j - T7k;
+			      T7m = T7i + T7l;
+			      T7u = T7i - T7l;
+			      T7n = T3h - T3y;
+			      T7q = T7o - T7p;
+			      T7r = T7n - T7q;
+			      T7v = T7n + T7q;
+			 }
+			 {
+			      E T7s, T7R, T7w, T7N;
+			      T7s = KP707106781 * (T7m + T7r);
+			      ci[WS(rs, 11)] = T7h - T7s;
+			      cr[WS(rs, 4)] = T7h + T7s;
+			      T7R = KP707106781 * (T7v - T7u);
+			      cr[WS(rs, 20)] = T7R - T7S;
+			      ci[WS(rs, 27)] = T7R + T7S;
+			      T7w = KP707106781 * (T7u + T7v);
+			      cr[WS(rs, 12)] = T7t - T7w;
+			      ci[WS(rs, 3)] = T7t + T7w;
+			      T7N = KP707106781 * (T7r - T7m);
+			      cr[WS(rs, 28)] = T7N - T7Q;
+			      ci[WS(rs, 19)] = T7N + T7Q;
+			 }
+		    }
+		    {
+			 E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
+			 E T6V;
+			 {
+			      E T6o, T6t, T6A, T6F;
+			      T6j = T6f - T6i;
+			      T7X = T7V + T7W;
+			      T83 = T7W - T7V;
+			      T6X = T6f + T6i;
+			      T6o = T6k + T6n;
+			      T6t = T6p - T6s;
+			      T6u = KP707106781 * (T6o + T6t);
+			      T7U = KP707106781 * (T6o - T6t);
+			      {
+				   E T75, T76, T6Y, T6Z;
+				   T75 = T6O + T6P;
+				   T76 = T6H + T6K;
+				   T77 = FMA(KP382683432, T75, KP923879532 * T76);
+				   T7b = FNMS(KP923879532, T75, KP382683432 * T76);
+				   T6Y = T6k - T6n;
+				   T6Z = T6p + T6s;
+				   T70 = KP707106781 * (T6Y + T6Z);
+				   T82 = KP707106781 * (T6Z - T6Y);
+			      }
+			      T6A = T6y - T6z;
+			      T6F = T6B - T6E;
+			      T6G = FMA(KP382683432, T6A, KP923879532 * T6F);
+			      T6U = FNMS(KP923879532, T6A, KP382683432 * T6F);
+			      {
+				   E T72, T73, T6L, T6Q;
+				   T72 = T6B + T6E;
+				   T73 = T6y + T6z;
+				   T74 = FNMS(KP382683432, T73, KP923879532 * T72);
+				   T7a = FMA(KP923879532, T73, KP382683432 * T72);
+				   T6L = T6H - T6K;
+				   T6Q = T6O - T6P;
+				   T6R = FNMS(KP382683432, T6Q, KP923879532 * T6L);
+				   T6V = FMA(KP923879532, T6Q, KP382683432 * T6L);
+			      }
+			 }
+			 {
+			      E T6v, T6S, T81, T84;
+			      T6v = T6j + T6u;
+			      T6S = T6G + T6R;
+			      ci[WS(rs, 13)] = T6v - T6S;
+			      cr[WS(rs, 2)] = T6v + T6S;
+			      T81 = T6V - T6U;
+			      T84 = T82 + T83;
+			      cr[WS(rs, 18)] = T81 - T84;
+			      ci[WS(rs, 29)] = T81 + T84;
+			 }
+			 {
+			      E T85, T86, T6T, T6W;
+			      T85 = T6R - T6G;
+			      T86 = T83 - T82;
+			      cr[WS(rs, 26)] = T85 - T86;
+			      ci[WS(rs, 21)] = T85 + T86;
+			      T6T = T6j - T6u;
+			      T6W = T6U + T6V;
+			      cr[WS(rs, 10)] = T6T - T6W;
+			      ci[WS(rs, 5)] = T6T + T6W;
+			 }
+			 {
+			      E T71, T78, T7T, T7Y;
+			      T71 = T6X + T70;
+			      T78 = T74 + T77;
+			      cr[WS(rs, 14)] = T71 - T78;
+			      ci[WS(rs, 1)] = T71 + T78;
+			      T7T = T7b - T7a;
+			      T7Y = T7U + T7X;
+			      cr[WS(rs, 30)] = T7T - T7Y;
+			      ci[WS(rs, 17)] = T7T + T7Y;
+			 }
+			 {
+			      E T7Z, T80, T79, T7c;
+			      T7Z = T77 - T74;
+			      T80 = T7X - T7U;
+			      cr[WS(rs, 22)] = T7Z - T80;
+			      ci[WS(rs, 25)] = T7Z + T80;
+			      T79 = T6X - T70;
+			      T7c = T7a + T7b;
+			      ci[WS(rs, 9)] = T79 - T7c;
+			      cr[WS(rs, 6)] = T79 + T7c;
+			 }
+		    }
+		    {
+			 E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
+			 E T5b, T3Q, T8p;
+			 T3Q = KP707106781 * (T3K + T3P);
+			 T3R = T3F - T3Q;
+			 T5d = T3F + T3Q;
+			 T8p = KP707106781 * (T5v - T5u);
+			 T8r = T8p + T8q;
+			 T8x = T8q - T8p;
+			 {
+			      E T42, T4d, T5l, T5m;
+			      T42 = FNMS(KP923879532, T41, KP382683432 * T3W);
+			      T4d = FMA(KP923879532, T47, KP382683432 * T4c);
+			      T4e = T42 + T4d;
+			      T8o = T4d - T42;
+			      T5l = T52 + T55;
+			      T5m = T4L + T4W;
+			      T5n = FNMS(KP195090322, T5m, KP980785280 * T5l);
+			      T5r = FMA(KP980785280, T5m, KP195090322 * T5l);
+			 }
+			 {
+			      E T4w, T4F, T5e, T5f;
+			      T4w = T4k - T4v;
+			      T4F = T4B - T4E;
+			      T4G = FNMS(KP555570233, T4F, KP831469612 * T4w);
+			      T5a = FMA(KP831469612, T4F, KP555570233 * T4w);
+			      T5e = FMA(KP382683432, T41, KP923879532 * T3W);
+			      T5f = FNMS(KP382683432, T47, KP923879532 * T4c);
+			      T5g = T5e + T5f;
+			      T8w = T5e - T5f;
+			 }
+			 {
+			      E T5i, T5j, T4X, T56;
+			      T5i = T4B + T4E;
+			      T5j = T4k + T4v;
+			      T5k = FMA(KP195090322, T5i, KP980785280 * T5j);
+			      T5q = FNMS(KP980785280, T5i, KP195090322 * T5j);
+			      T4X = T4L - T4W;
+			      T56 = T52 - T55;
+			      T57 = FMA(KP555570233, T4X, KP831469612 * T56);
+			      T5b = FNMS(KP831469612, T4X, KP555570233 * T56);
+			 }
+			 {
+			      E T4f, T58, T8v, T8y;
+			      T4f = T3R + T4e;
+			      T58 = T4G + T57;
+			      cr[WS(rs, 13)] = T4f - T58;
+			      ci[WS(rs, 2)] = T4f + T58;
+			      T8v = T5b - T5a;
+			      T8y = T8w + T8x;
+			      cr[WS(rs, 29)] = T8v - T8y;
+			      ci[WS(rs, 18)] = T8v + T8y;
+			 }
+			 {
+			      E T8z, T8A, T59, T5c;
+			      T8z = T57 - T4G;
+			      T8A = T8x - T8w;
+			      cr[WS(rs, 21)] = T8z - T8A;
+			      ci[WS(rs, 26)] = T8z + T8A;
+			      T59 = T3R - T4e;
+			      T5c = T5a + T5b;
+			      ci[WS(rs, 10)] = T59 - T5c;
+			      cr[WS(rs, 5)] = T59 + T5c;
+			 }
+			 {
+			      E T5h, T5o, T8n, T8s;
+			      T5h = T5d + T5g;
+			      T5o = T5k + T5n;
+			      ci[WS(rs, 14)] = T5h - T5o;
+			      cr[WS(rs, 1)] = T5h + T5o;
+			      T8n = T5r - T5q;
+			      T8s = T8o + T8r;
+			      cr[WS(rs, 17)] = T8n - T8s;
+			      ci[WS(rs, 30)] = T8n + T8s;
+			 }
+			 {
+			      E T8t, T8u, T5p, T5s;
+			      T8t = T5n - T5k;
+			      T8u = T8r - T8o;
+			      cr[WS(rs, 25)] = T8t - T8u;
+			      ci[WS(rs, 22)] = T8t + T8u;
+			      T5p = T5d - T5g;
+			      T5s = T5q + T5r;
+			      cr[WS(rs, 9)] = T5p - T5s;
+			      ci[WS(rs, 6)] = T5p + T5s;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 9},
+     {TW_CEXP, 1, 27},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hf2_32", twinstr, &GENUS, {376, 168, 112, 0} };
+
+void X(codelet_hf2_32) (planner *p) {
+     X(khc2hc_register) (p, hf2_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:02 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include hf.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 33 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E Ti, Tq, To, Te, TA, Ty, Tm, Ts;
+	       {
+		    E T2, T6, T3, T5;
+		    T2 = W[0];
+		    T6 = W[3];
+		    T3 = W[2];
+		    T5 = W[1];
+		    {
+			 E T1, Tx, Td, Tw, Tj, Tl, Ta, T4, Tk, Tr;
+			 T1 = cr[0];
+			 Ta = T2 * T6;
+			 T4 = T2 * T3;
+			 Tx = ci[0];
+			 {
+			      E T8, Tb, T7, Tc;
+			      T8 = cr[WS(rs, 2)];
+			      Tb = FNMS(T5, T3, Ta);
+			      T7 = FMA(T5, T6, T4);
+			      Tc = ci[WS(rs, 2)];
+			      {
+				   E Tf, Th, T9, Tv, Tg, Tp;
+				   Tf = cr[WS(rs, 1)];
+				   Th = ci[WS(rs, 1)];
+				   T9 = T7 * T8;
+				   Tv = T7 * Tc;
+				   Tg = T2 * Tf;
+				   Tp = T2 * Th;
+				   Td = FMA(Tb, Tc, T9);
+				   Tw = FNMS(Tb, T8, Tv);
+				   Ti = FMA(T5, Th, Tg);
+				   Tq = FNMS(T5, Tf, Tp);
+			      }
+			      Tj = cr[WS(rs, 3)];
+			      Tl = ci[WS(rs, 3)];
+			 }
+			 To = T1 - Td;
+			 Te = T1 + Td;
+			 Tk = T3 * Tj;
+			 Tr = T3 * Tl;
+			 TA = Tx - Tw;
+			 Ty = Tw + Tx;
+			 Tm = FMA(T6, Tl, Tk);
+			 Ts = FNMS(T6, Tj, Tr);
+		    }
+	       }
+	       {
+		    E Tn, Tz, Tt, Tu;
+		    Tn = Ti + Tm;
+		    Tz = Tm - Ti;
+		    Tt = Tq - Ts;
+		    Tu = Tq + Ts;
+		    ci[WS(rs, 2)] = Tz + TA;
+		    cr[WS(rs, 3)] = Tz - TA;
+		    cr[0] = Te + Tn;
+		    ci[WS(rs, 1)] = Te - Tn;
+		    ci[WS(rs, 3)] = Tu + Ty;
+		    cr[WS(rs, 2)] = Tu - Ty;
+		    cr[WS(rs, 1)] = To + Tt;
+		    ci[0] = To - Tt;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hf2_4) (planner *p) {
+     X(khc2hc_register) (p, hf2_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include hf.h */
+
+/*
+ * This function contains 24 FP additions, 16 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
+ * 21 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T2, T4, T3, T5, T6, T8;
+	       T2 = W[0];
+	       T4 = W[1];
+	       T3 = W[2];
+	       T5 = W[3];
+	       T6 = FMA(T2, T3, T4 * T5);
+	       T8 = FNMS(T4, T3, T2 * T5);
+	       {
+		    E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
+		    T1 = cr[0];
+		    Tp = ci[0];
+		    T7 = cr[WS(rs, 2)];
+		    T9 = ci[WS(rs, 2)];
+		    Ta = FMA(T6, T7, T8 * T9);
+		    To = FNMS(T8, T7, T6 * T9);
+		    {
+			 E Tc, Td, Tf, Tg;
+			 Tc = cr[WS(rs, 1)];
+			 Td = ci[WS(rs, 1)];
+			 Te = FMA(T2, Tc, T4 * Td);
+			 Tk = FNMS(T4, Tc, T2 * Td);
+			 Tf = cr[WS(rs, 3)];
+			 Tg = ci[WS(rs, 3)];
+			 Th = FMA(T3, Tf, T5 * Tg);
+			 Tl = FNMS(T5, Tf, T3 * Tg);
+		    }
+		    {
+			 E Tb, Ti, Tj, Tm;
+			 Tb = T1 + Ta;
+			 Ti = Te + Th;
+			 ci[WS(rs, 1)] = Tb - Ti;
+			 cr[0] = Tb + Ti;
+			 Tj = T1 - Ta;
+			 Tm = Tk - Tl;
+			 ci[0] = Tj - Tm;
+			 cr[WS(rs, 1)] = Tj + Tm;
+		    }
+		    {
+			 E Tn, Tq, Tr, Ts;
+			 Tn = Tk + Tl;
+			 Tq = To + Tp;
+			 cr[WS(rs, 2)] = Tn - Tq;
+			 ci[WS(rs, 3)] = Tn + Tq;
+			 Tr = Th - Te;
+			 Ts = Tp - To;
+			 cr[WS(rs, 3)] = Tr - Ts;
+			 ci[WS(rs, 2)] = Tr + Ts;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, {16, 8, 8, 0} };
+
+void X(codelet_hf2_4) (planner *p) {
+     X(khc2hc_register) (p, hf2_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:07 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include hf.h */
+
+/*
+ * This function contains 44 FP additions, 40 FP multiplications,
+ * (or, 14 additions, 10 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E Ta, T1, TL, Tp, TT, Ti, TM, TC, To, TE, Ts, TF, T2, T8, T5;
+	       E TS, Tt, TG;
+	       T2 = W[0];
+	       Ta = W[3];
+	       T8 = W[2];
+	       T5 = W[1];
+	       {
+		    E Tq, Tr, Te, T9;
+		    T1 = cr[0];
+		    Te = T2 * Ta;
+		    T9 = T2 * T8;
+		    TL = ci[0];
+		    {
+			 E T3, Tf, Tm, Tj, Tb, T4, T6, Tc, Tg;
+			 T3 = cr[WS(rs, 1)];
+			 Tf = FMA(T5, T8, Te);
+			 Tm = FNMS(T5, T8, Te);
+			 Tj = FMA(T5, Ta, T9);
+			 Tb = FNMS(T5, Ta, T9);
+			 T4 = T2 * T3;
+			 T6 = ci[WS(rs, 1)];
+			 Tc = cr[WS(rs, 4)];
+			 Tg = ci[WS(rs, 4)];
+			 {
+			      E Tk, Tl, Tn, TD;
+			      {
+				   E T7, Tz, Th, TB, Ty, Td, TA;
+				   Tk = cr[WS(rs, 2)];
+				   T7 = FMA(T5, T6, T4);
+				   Ty = T2 * T6;
+				   Td = Tb * Tc;
+				   TA = Tb * Tg;
+				   Tl = Tj * Tk;
+				   Tz = FNMS(T5, T3, Ty);
+				   Th = FMA(Tf, Tg, Td);
+				   TB = FNMS(Tf, Tc, TA);
+				   Tn = ci[WS(rs, 2)];
+				   Tp = cr[WS(rs, 3)];
+				   TT = Th - T7;
+				   Ti = T7 + Th;
+				   TM = Tz + TB;
+				   TC = Tz - TB;
+				   TD = Tj * Tn;
+				   Tq = T8 * Tp;
+				   Tr = ci[WS(rs, 3)];
+			      }
+			      To = FMA(Tm, Tn, Tl);
+			      TE = FNMS(Tm, Tk, TD);
+			 }
+		    }
+		    Ts = FMA(Ta, Tr, Tq);
+		    TF = T8 * Tr;
+	       }
+	       TS = To - Ts;
+	       Tt = To + Ts;
+	       TG = FNMS(Ta, Tp, TF);
+	       {
+		    E TU, TW, TV, TR, Tw, Tu;
+		    TU = FMA(KP618033988, TT, TS);
+		    TW = FNMS(KP618033988, TS, TT);
+		    Tw = Ti - Tt;
+		    Tu = Ti + Tt;
+		    {
+			 E TN, TH, Tv, TI, TK;
+			 TN = TE + TG;
+			 TH = TE - TG;
+			 cr[0] = T1 + Tu;
+			 Tv = FNMS(KP250000000, Tu, T1);
+			 TI = FMA(KP618033988, TH, TC);
+			 TK = FNMS(KP618033988, TC, TH);
+			 {
+			      E TQ, TO, Tx, TJ, TP;
+			      TQ = TM - TN;
+			      TO = TM + TN;
+			      Tx = FMA(KP559016994, Tw, Tv);
+			      TJ = FNMS(KP559016994, Tw, Tv);
+			      ci[WS(rs, 4)] = TO + TL;
+			      TP = FNMS(KP250000000, TO, TL);
+			      ci[WS(rs, 1)] = FMA(KP951056516, TK, TJ);
+			      cr[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
+			      cr[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
+			      ci[0] = FNMS(KP951056516, TI, Tx);
+			      TV = FMA(KP559016994, TQ, TP);
+			      TR = FNMS(KP559016994, TQ, TP);
+			 }
+		    }
+		    ci[WS(rs, 2)] = FMA(KP951056516, TU, TR);
+		    cr[WS(rs, 3)] = FMS(KP951056516, TU, TR);
+		    ci[WS(rs, 3)] = FMA(KP951056516, TW, TV);
+		    cr[WS(rs, 4)] = FMS(KP951056516, TW, TV);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hf2_5", twinstr, &GENUS, {14, 10, 30, 0} };
+
+void X(codelet_hf2_5) (planner *p) {
+     X(khc2hc_register) (p, hf2_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include hf.h */
+
+/*
+ * This function contains 44 FP additions, 32 FP multiplications,
+ * (or, 30 additions, 18 multiplications, 14 fused multiply/add),
+ * 37 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
+	       {
+		    E T8, Te, Ta, Td;
+		    T2 = W[0];
+		    T4 = W[1];
+		    T7 = W[2];
+		    T9 = W[3];
+		    T8 = T2 * T7;
+		    Te = T4 * T7;
+		    Ta = T4 * T9;
+		    Td = T2 * T9;
+		    Tb = T8 - Ta;
+		    Tl = Td - Te;
+		    Tf = Td + Te;
+		    Tj = T8 + Ta;
+	       }
+	       {
+		    E T1, TI, Ty, TB, TG, TF, TJ, TK, TL, Ti, Tr, Ts;
+		    T1 = cr[0];
+		    TI = ci[0];
+		    {
+			 E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
+			 {
+			      E T3, T5, To, Tp;
+			      T3 = cr[WS(rs, 1)];
+			      T5 = ci[WS(rs, 1)];
+			      T6 = FMA(T2, T3, T4 * T5);
+			      Tw = FNMS(T4, T3, T2 * T5);
+			      To = cr[WS(rs, 3)];
+			      Tp = ci[WS(rs, 3)];
+			      Tq = FMA(T7, To, T9 * Tp);
+			      TA = FNMS(T9, To, T7 * Tp);
+			 }
+			 {
+			      E Tc, Tg, Tk, Tm;
+			      Tc = cr[WS(rs, 4)];
+			      Tg = ci[WS(rs, 4)];
+			      Th = FMA(Tb, Tc, Tf * Tg);
+			      Tx = FNMS(Tf, Tc, Tb * Tg);
+			      Tk = cr[WS(rs, 2)];
+			      Tm = ci[WS(rs, 2)];
+			      Tn = FMA(Tj, Tk, Tl * Tm);
+			      Tz = FNMS(Tl, Tk, Tj * Tm);
+			 }
+			 Ty = Tw - Tx;
+			 TB = Tz - TA;
+			 TG = Tn - Tq;
+			 TF = Th - T6;
+			 TJ = Tw + Tx;
+			 TK = Tz + TA;
+			 TL = TJ + TK;
+			 Ti = T6 + Th;
+			 Tr = Tn + Tq;
+			 Ts = Ti + Tr;
+		    }
+		    cr[0] = T1 + Ts;
+		    {
+			 E TC, TE, Tv, TD, Tt, Tu;
+			 TC = FMA(KP951056516, Ty, KP587785252 * TB);
+			 TE = FNMS(KP587785252, Ty, KP951056516 * TB);
+			 Tt = KP559016994 * (Ti - Tr);
+			 Tu = FNMS(KP250000000, Ts, T1);
+			 Tv = Tt + Tu;
+			 TD = Tu - Tt;
+			 ci[0] = Tv - TC;
+			 ci[WS(rs, 1)] = TD + TE;
+			 cr[WS(rs, 1)] = Tv + TC;
+			 cr[WS(rs, 2)] = TD - TE;
+		    }
+		    ci[WS(rs, 4)] = TL + TI;
+		    {
+			 E TH, TP, TO, TQ, TM, TN;
+			 TH = FMA(KP587785252, TF, KP951056516 * TG);
+			 TP = FNMS(KP587785252, TG, KP951056516 * TF);
+			 TM = FNMS(KP250000000, TL, TI);
+			 TN = KP559016994 * (TJ - TK);
+			 TO = TM - TN;
+			 TQ = TN + TM;
+			 cr[WS(rs, 3)] = TH - TO;
+			 ci[WS(rs, 3)] = TP + TQ;
+			 ci[WS(rs, 2)] = TH + TO;
+			 cr[WS(rs, 4)] = TP - TQ;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hf2_5", twinstr, &GENUS, {30, 18, 14, 0} };
+
+void X(codelet_hf2_5) (planner *p) {
+     X(khc2hc_register) (p, hf2_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf2_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf2_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:02 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */
+
+/*
+ * This function contains 74 FP additions, 50 FP multiplications,
+ * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
+ * 64 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E TS, T1l, TJ, T1m, T1k, Tw, T1w, T1u;
+	       {
+		    E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
+		    T2 = W[0];
+		    T3 = W[2];
+		    Tl = W[4];
+		    Tn = W[5];
+		    T5 = W[1];
+		    T4 = T2 * T3;
+		    Tm = T2 * Tl;
+		    Tr = T2 * Tn;
+		    T6 = W[3];
+		    {
+			 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
+			 E TE, T14;
+			 {
+			      E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
+			      E Tj;
+			      T1 = cr[0];
+			      To = FMA(T5, Tn, Tm);
+			      Ts = FNMS(T5, Tl, Tr);
+			      Tf = FMA(T5, T6, T4);
+			      T7 = FNMS(T5, T6, T4);
+			      Ta = T2 * T6;
+			      T1s = ci[0];
+			      T8 = cr[WS(rs, 4)];
+			      TF = Tf * Tn;
+			      TB = Tf * Tl;
+			      Ti = FNMS(T5, T3, Ta);
+			      Tb = FMA(T5, T3, Ta);
+			      T9 = T7 * T8;
+			      Tc = ci[WS(rs, 4)];
+			      TG = FNMS(Ti, Tl, TF);
+			      TC = FMA(Ti, Tn, TB);
+			      {
+				   E Tp, T1q, Tt, Tq, TX;
+				   Tp = cr[WS(rs, 6)];
+				   Td = FMA(Tb, Tc, T9);
+				   T1q = T7 * Tc;
+				   Tt = ci[WS(rs, 6)];
+				   Tq = To * Tp;
+				   Tg = cr[WS(rs, 2)];
+				   T1r = FNMS(Tb, T8, T1q);
+				   TX = To * Tt;
+				   Tu = FMA(Ts, Tt, Tq);
+				   Th = Tf * Tg;
+				   Tj = ci[WS(rs, 2)];
+				   TY = FNMS(Ts, Tp, TX);
+			      }
+			      {
+				   E TO, TQ, TN, TP, T1a, T1b;
+				   {
+					E TK, TM, TL, T19, TV;
+					TK = cr[WS(rs, 7)];
+					TM = ci[WS(rs, 7)];
+					Tk = FMA(Ti, Tj, Th);
+					TV = Tf * Tj;
+					TL = Tl * TK;
+					T19 = Tl * TM;
+					TO = cr[WS(rs, 3)];
+					TW = FNMS(Ti, Tg, TV);
+					TQ = ci[WS(rs, 3)];
+					TN = FMA(Tn, TM, TL);
+					TP = T3 * TO;
+					T1a = FNMS(Tn, TK, T19);
+					T1b = T3 * TQ;
+				   }
+				   {
+					E Tx, Tz, Ty, T12, T1c, TR;
+					Tx = cr[WS(rs, 1)];
+					TR = FMA(T6, TQ, TP);
+					Tz = ci[WS(rs, 1)];
+					T1c = FNMS(T6, TO, T1b);
+					Ty = T2 * Tx;
+					T18 = TN - TR;
+					TS = TN + TR;
+					T12 = T2 * Tz;
+					T1d = T1a - T1c;
+					T1l = T1a + T1c;
+					TD = cr[WS(rs, 5)];
+					TH = ci[WS(rs, 5)];
+					TA = FMA(T5, Tz, Ty);
+					T13 = FNMS(T5, Tx, T12);
+					TE = TC * TD;
+					T14 = TC * TH;
+				   }
+			      }
+			 }
+			 {
+			      E Te, T1p, Tv, T1t;
+			      {
+				   E T1g, T10, T1z, T1B, T1C, T1j, T1A, T1f;
+				   {
+					E T1x, T11, T16, T1y;
+					{
+					     E TU, TZ, TI, T15;
+					     Te = T1 + Td;
+					     TU = T1 - Td;
+					     TZ = TW - TY;
+					     T1p = TW + TY;
+					     TI = FMA(TG, TH, TE);
+					     T15 = FNMS(TG, TD, T14);
+					     Tv = Tk + Tu;
+					     T1x = Tk - Tu;
+					     T1g = TU - TZ;
+					     T10 = TU + TZ;
+					     T11 = TA - TI;
+					     TJ = TA + TI;
+					     T1m = T13 + T15;
+					     T16 = T13 - T15;
+					     T1y = T1s - T1r;
+					     T1t = T1r + T1s;
+					}
+					{
+					     E T1i, T1e, T17, T1h;
+					     T1i = T18 + T1d;
+					     T1e = T18 - T1d;
+					     T17 = T11 + T16;
+					     T1h = T11 - T16;
+					     T1z = T1x + T1y;
+					     T1B = T1y - T1x;
+					     T1C = T1i - T1h;
+					     T1j = T1h + T1i;
+					     T1A = T1e - T17;
+					     T1f = T17 + T1e;
+					}
+				   }
+				   cr[WS(rs, 3)] = FNMS(KP707106781, T1j, T1g);
+				   cr[WS(rs, 7)] = FMS(KP707106781, T1A, T1z);
+				   cr[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
+				   ci[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
+				   ci[WS(rs, 6)] = FMA(KP707106781, T1C, T1B);
+				   cr[WS(rs, 5)] = FMS(KP707106781, T1C, T1B);
+				   ci[WS(rs, 4)] = FMA(KP707106781, T1A, T1z);
+				   ci[0] = FMA(KP707106781, T1j, T1g);
+			      }
+			      T1k = Te - Tv;
+			      Tw = Te + Tv;
+			      T1w = T1t - T1p;
+			      T1u = T1p + T1t;
+			 }
+		    }
+	       }
+	       {
+		    E TT, T1v, T1n, T1o;
+		    TT = TJ + TS;
+		    T1v = TS - TJ;
+		    T1n = T1l - T1m;
+		    T1o = T1m + T1l;
+		    ci[WS(rs, 5)] = T1v + T1w;
+		    cr[WS(rs, 6)] = T1v - T1w;
+		    cr[0] = Tw + TT;
+		    ci[WS(rs, 3)] = Tw - TT;
+		    ci[WS(rs, 7)] = T1o + T1u;
+		    cr[WS(rs, 4)] = T1o - T1u;
+		    ci[WS(rs, 1)] = T1k + T1n;
+		    cr[WS(rs, 2)] = T1k - T1n;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {44, 20, 30, 0} };
+
+void X(codelet_hf2_8) (planner *p) {
+     X(khc2hc_register) (p, hf2_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */
+
+/*
+ * This function contains 74 FP additions, 44 FP multiplications,
+ * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
+ * 42 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hf.h"
+
+static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
+	       {
+		    E T4, Tb, T7, Ta;
+		    T2 = W[0];
+		    T5 = W[1];
+		    T3 = W[2];
+		    T6 = W[3];
+		    T4 = T2 * T3;
+		    Tb = T5 * T3;
+		    T7 = T5 * T6;
+		    Ta = T2 * T6;
+		    T8 = T4 - T7;
+		    Tc = Ta + Tb;
+		    Tg = T4 + T7;
+		    Ti = Ta - Tb;
+		    Tl = W[4];
+		    Tm = W[5];
+		    Tn = FMA(T2, Tl, T5 * Tm);
+		    Tz = FNMS(Ti, Tl, Tg * Tm);
+		    Tp = FNMS(T5, Tl, T2 * Tm);
+		    Tx = FMA(Tg, Tl, Ti * Tm);
+	       }
+	       {
+		    E Tf, T1j, TL, T1d, TJ, T16, TV, TY, Ts, T1i, TO, T1a, TC, T17, TQ;
+		    E TT;
+		    {
+			 E T1, T1c, Te, T1b, T9, Td;
+			 T1 = cr[0];
+			 T1c = ci[0];
+			 T9 = cr[WS(rs, 4)];
+			 Td = ci[WS(rs, 4)];
+			 Te = FMA(T8, T9, Tc * Td);
+			 T1b = FNMS(Tc, T9, T8 * Td);
+			 Tf = T1 + Te;
+			 T1j = T1c - T1b;
+			 TL = T1 - Te;
+			 T1d = T1b + T1c;
+		    }
+		    {
+			 E TF, TW, TI, TX;
+			 {
+			      E TD, TE, TG, TH;
+			      TD = cr[WS(rs, 7)];
+			      TE = ci[WS(rs, 7)];
+			      TF = FMA(Tl, TD, Tm * TE);
+			      TW = FNMS(Tm, TD, Tl * TE);
+			      TG = cr[WS(rs, 3)];
+			      TH = ci[WS(rs, 3)];
+			      TI = FMA(T3, TG, T6 * TH);
+			      TX = FNMS(T6, TG, T3 * TH);
+			 }
+			 TJ = TF + TI;
+			 T16 = TW + TX;
+			 TV = TF - TI;
+			 TY = TW - TX;
+		    }
+		    {
+			 E Tk, TM, Tr, TN;
+			 {
+			      E Th, Tj, To, Tq;
+			      Th = cr[WS(rs, 2)];
+			      Tj = ci[WS(rs, 2)];
+			      Tk = FMA(Tg, Th, Ti * Tj);
+			      TM = FNMS(Ti, Th, Tg * Tj);
+			      To = cr[WS(rs, 6)];
+			      Tq = ci[WS(rs, 6)];
+			      Tr = FMA(Tn, To, Tp * Tq);
+			      TN = FNMS(Tp, To, Tn * Tq);
+			 }
+			 Ts = Tk + Tr;
+			 T1i = Tk - Tr;
+			 TO = TM - TN;
+			 T1a = TM + TN;
+		    }
+		    {
+			 E Tw, TR, TB, TS;
+			 {
+			      E Tu, Tv, Ty, TA;
+			      Tu = cr[WS(rs, 1)];
+			      Tv = ci[WS(rs, 1)];
+			      Tw = FMA(T2, Tu, T5 * Tv);
+			      TR = FNMS(T5, Tu, T2 * Tv);
+			      Ty = cr[WS(rs, 5)];
+			      TA = ci[WS(rs, 5)];
+			      TB = FMA(Tx, Ty, Tz * TA);
+			      TS = FNMS(Tz, Ty, Tx * TA);
+			 }
+			 TC = Tw + TB;
+			 T17 = TR + TS;
+			 TQ = Tw - TB;
+			 TT = TR - TS;
+		    }
+		    {
+			 E Tt, TK, T1f, T1g;
+			 Tt = Tf + Ts;
+			 TK = TC + TJ;
+			 ci[WS(rs, 3)] = Tt - TK;
+			 cr[0] = Tt + TK;
+			 T1f = TJ - TC;
+			 T1g = T1d - T1a;
+			 cr[WS(rs, 6)] = T1f - T1g;
+			 ci[WS(rs, 5)] = T1f + T1g;
+			 {
+			      E T11, T1m, T14, T1l, T12, T13;
+			      T11 = TL - TO;
+			      T1m = T1j - T1i;
+			      T12 = TQ - TT;
+			      T13 = TV + TY;
+			      T14 = KP707106781 * (T12 + T13);
+			      T1l = KP707106781 * (T13 - T12);
+			      cr[WS(rs, 3)] = T11 - T14;
+			      ci[WS(rs, 6)] = T1l + T1m;
+			      ci[0] = T11 + T14;
+			      cr[WS(rs, 5)] = T1l - T1m;
+			 }
+		    }
+		    {
+			 E T19, T1e, T15, T18;
+			 T19 = T17 + T16;
+			 T1e = T1a + T1d;
+			 cr[WS(rs, 4)] = T19 - T1e;
+			 ci[WS(rs, 7)] = T19 + T1e;
+			 T15 = Tf - Ts;
+			 T18 = T16 - T17;
+			 cr[WS(rs, 2)] = T15 - T18;
+			 ci[WS(rs, 1)] = T15 + T18;
+			 {
+			      E TP, T1k, T10, T1h, TU, TZ;
+			      TP = TL + TO;
+			      T1k = T1i + T1j;
+			      TU = TQ + TT;
+			      TZ = TV - TY;
+			      T10 = KP707106781 * (TU + TZ);
+			      T1h = KP707106781 * (TZ - TU);
+			      ci[WS(rs, 2)] = TP - T10;
+			      ci[WS(rs, 4)] = T1h + T1k;
+			      cr[WS(rs, 1)] = TP + T10;
+			      cr[WS(rs, 7)] = T1h - T1k;
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_CEXP, 1, 1},
+     {TW_CEXP, 1, 3},
+     {TW_CEXP, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {56, 26, 18, 0} };
+
+void X(codelet_hf2_8) (planner *p) {
+     X(khc2hc_register) (p, hf2_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include hf.h */
+
+/*
+ * This function contains 102 FP additions, 72 FP multiplications,
+ * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
+ * 72 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hf.h"
+
+static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T29, T2d, T2c, T2e;
+	       {
+		    E T23, T1U, T8, T12, T1y, T1P, T25, T1H, T2b, T18, T10, T1Y, T1I, Tl, T13;
+		    E T1J, Ty, T14, T1n, T1O, T24, T1K;
+		    {
+			 E T1, T1R, T3, T6, T2, T5;
+			 T1 = cr[0];
+			 T1R = ci[0];
+			 T3 = cr[WS(rs, 5)];
+			 T6 = ci[WS(rs, 5)];
+			 T2 = W[8];
+			 T5 = W[9];
+			 {
+			      E T1p, TY, T1x, T1F, TM, T16, T1r, TS;
+			      {
+				   E TF, T1w, TO, TR, T1u, TL, TN, TQ, T1q, TP;
+				   {
+					E TU, TX, TT, TW;
+					{
+					     E TB, TE, T1S, T4, TA, TD;
+					     TB = cr[WS(rs, 4)];
+					     TE = ci[WS(rs, 4)];
+					     T1S = T2 * T6;
+					     T4 = T2 * T3;
+					     TA = W[6];
+					     TD = W[7];
+					     {
+						  E T1T, T7, T1v, TC;
+						  T1T = FNMS(T5, T3, T1S);
+						  T7 = FMA(T5, T6, T4);
+						  T1v = TA * TE;
+						  TC = TA * TB;
+						  T23 = T1T + T1R;
+						  T1U = T1R - T1T;
+						  T8 = T1 - T7;
+						  T12 = T1 + T7;
+						  TF = FMA(TD, TE, TC);
+						  T1w = FNMS(TD, TB, T1v);
+					     }
+					}
+					TU = cr[WS(rs, 1)];
+					TX = ci[WS(rs, 1)];
+					TT = W[0];
+					TW = W[1];
+					{
+					     E TH, TK, TJ, T1t, TI, T1o, TV, TG;
+					     TH = cr[WS(rs, 9)];
+					     TK = ci[WS(rs, 9)];
+					     T1o = TT * TX;
+					     TV = TT * TU;
+					     TG = W[16];
+					     TJ = W[17];
+					     T1p = FNMS(TW, TU, T1o);
+					     TY = FMA(TW, TX, TV);
+					     T1t = TG * TK;
+					     TI = TG * TH;
+					     TO = cr[WS(rs, 6)];
+					     TR = ci[WS(rs, 6)];
+					     T1u = FNMS(TJ, TH, T1t);
+					     TL = FMA(TJ, TK, TI);
+					     TN = W[10];
+					     TQ = W[11];
+					}
+				   }
+				   T1x = T1u - T1w;
+				   T1F = T1w + T1u;
+				   TM = TF - TL;
+				   T16 = TF + TL;
+				   T1q = TN * TR;
+				   TP = TN * TO;
+				   T1r = FNMS(TQ, TO, T1q);
+				   TS = FMA(TQ, TR, TP);
+			      }
+			      {
+				   E T1l, Te, T1e, Tx, Tn, Tq, Tp, T1j, Tk, T1f, To;
+				   {
+					E Tt, Tw, Tv, T1d, Tu;
+					{
+					     E Ta, Td, T9, Tc, T1k, Tb, Ts;
+					     Ta = cr[WS(rs, 2)];
+					     Td = ci[WS(rs, 2)];
+					     {
+						  E T1G, T1s, TZ, T17;
+						  T1G = T1r + T1p;
+						  T1s = T1p - T1r;
+						  TZ = TS - TY;
+						  T17 = TS + TY;
+						  T1y = T1s - T1x;
+						  T1P = T1x + T1s;
+						  T25 = T1F + T1G;
+						  T1H = T1F - T1G;
+						  T2b = T16 - T17;
+						  T18 = T16 + T17;
+						  T10 = TM + TZ;
+						  T1Y = TZ - TM;
+						  T9 = W[2];
+					     }
+					     Tc = W[3];
+					     Tt = cr[WS(rs, 3)];
+					     Tw = ci[WS(rs, 3)];
+					     T1k = T9 * Td;
+					     Tb = T9 * Ta;
+					     Ts = W[4];
+					     Tv = W[5];
+					     T1l = FNMS(Tc, Ta, T1k);
+					     Te = FMA(Tc, Td, Tb);
+					     T1d = Ts * Tw;
+					     Tu = Ts * Tt;
+					}
+					{
+					     E Tg, Tj, Tf, Ti, T1i, Th, Tm;
+					     Tg = cr[WS(rs, 7)];
+					     Tj = ci[WS(rs, 7)];
+					     T1e = FNMS(Tv, Tt, T1d);
+					     Tx = FMA(Tv, Tw, Tu);
+					     Tf = W[12];
+					     Ti = W[13];
+					     Tn = cr[WS(rs, 8)];
+					     Tq = ci[WS(rs, 8)];
+					     T1i = Tf * Tj;
+					     Th = Tf * Tg;
+					     Tm = W[14];
+					     Tp = W[15];
+					     T1j = FNMS(Ti, Tg, T1i);
+					     Tk = FMA(Ti, Tj, Th);
+					     T1f = Tm * Tq;
+					     To = Tm * Tn;
+					}
+				   }
+				   {
+					E T1m, T1g, Tr, T1h;
+					T1m = T1j - T1l;
+					T1I = T1l + T1j;
+					Tl = Te - Tk;
+					T13 = Te + Tk;
+					T1g = FNMS(Tp, Tn, T1f);
+					Tr = FMA(Tp, Tq, To);
+					T1J = T1g + T1e;
+					T1h = T1e - T1g;
+					Ty = Tr - Tx;
+					T14 = Tr + Tx;
+					T1n = T1h - T1m;
+					T1O = T1m + T1h;
+				   }
+			      }
+			 }
+		    }
+		    T24 = T1I + T1J;
+		    T1K = T1I - T1J;
+		    {
+			 E T2a, T15, Tz, T1Z;
+			 T2a = T13 - T14;
+			 T15 = T13 + T14;
+			 Tz = Tl + Ty;
+			 T1Z = Ty - Tl;
+			 {
+			      E T1L, T1N, T1E, T1M;
+			      {
+				   E T19, T1D, T1C, T11, T1b;
+				   T19 = T15 + T18;
+				   T1D = T15 - T18;
+				   T11 = Tz + T10;
+				   T1b = Tz - T10;
+				   {
+					E T1B, T1z, T1a, T1A, T1c;
+					T1B = FNMS(KP618033988, T1n, T1y);
+					T1z = FMA(KP618033988, T1y, T1n);
+					ci[WS(rs, 4)] = T8 + T11;
+					T1a = FNMS(KP250000000, T11, T8);
+					T1A = FNMS(KP559016994, T1b, T1a);
+					T1c = FMA(KP559016994, T1b, T1a);
+					T1C = FNMS(KP250000000, T19, T12);
+					T1L = FNMS(KP618033988, T1K, T1H);
+					T1N = FMA(KP618033988, T1H, T1K);
+					cr[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
+					ci[0] = FNMS(KP951056516, T1z, T1c);
+					cr[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
+					ci[WS(rs, 2)] = FNMS(KP951056516, T1B, T1A);
+				   }
+				   cr[0] = T12 + T19;
+				   T1E = FNMS(KP559016994, T1D, T1C);
+				   T1M = FMA(KP559016994, T1D, T1C);
+			      }
+			      {
+				   E T1X, T21, T20, T22, T1Q, T1W, T1V, T26, T28, T27;
+				   T1Q = T1O + T1P;
+				   T1W = T1P - T1O;
+				   ci[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
+				   cr[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
+				   ci[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
+				   cr[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
+				   T1V = FMA(KP250000000, T1Q, T1U);
+				   cr[WS(rs, 5)] = T1Q - T1U;
+				   T1X = FNMS(KP559016994, T1W, T1V);
+				   T21 = FMA(KP559016994, T1W, T1V);
+				   T20 = FNMS(KP618033988, T1Z, T1Y);
+				   T22 = FMA(KP618033988, T1Y, T1Z);
+				   T26 = T24 + T25;
+				   T28 = T24 - T25;
+				   ci[WS(rs, 8)] = FMA(KP951056516, T22, T21);
+				   cr[WS(rs, 9)] = FMS(KP951056516, T22, T21);
+				   ci[WS(rs, 6)] = FMA(KP951056516, T20, T1X);
+				   cr[WS(rs, 7)] = FMS(KP951056516, T20, T1X);
+				   T27 = FNMS(KP250000000, T26, T23);
+				   ci[WS(rs, 9)] = T26 + T23;
+				   T29 = FMA(KP559016994, T28, T27);
+				   T2d = FNMS(KP559016994, T28, T27);
+				   T2c = FMA(KP618033988, T2b, T2a);
+				   T2e = FNMS(KP618033988, T2a, T2b);
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
+	       cr[WS(rs, 8)] = FMS(KP951056516, T2e, T2d);
+	       ci[WS(rs, 5)] = FMA(KP951056516, T2c, T29);
+	       cr[WS(rs, 6)] = FMS(KP951056516, T2c, T29);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 10, "hf_10", twinstr, &GENUS, {48, 18, 54, 0} };
+
+void X(codelet_hf_10) (planner *p) {
+     X(khc2hc_register) (p, hf_10, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include hf.h */
+
+/*
+ * This function contains 102 FP additions, 60 FP multiplications,
+ * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
+ * 45 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hf.h"
+
+static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
+	       E T7, T1R, TT, T1C, TF, TQ, TR, T1o, T1p, T1P, TX, TY, TZ, T1d, T1g;
+	       E T1x, Ti, Tt, Tu, T1r, T1s, T1O, TU, TV, TW, T16, T19, T1y;
+	       {
+		    E T1, T1A, T6, T1B;
+		    T1 = cr[0];
+		    T1A = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 5)];
+			 T5 = ci[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T1B = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    T1R = T1B + T1A;
+		    TT = T1 + T6;
+		    T1C = T1A - T1B;
+	       }
+	       {
+		    E Tz, T1b, TP, T1e, TE, T1c, TK, T1f;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 4)];
+			 Ty = ci[WS(rs, 4)];
+			 Tv = W[6];
+			 Tx = W[7];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T1b = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TM, TO, TL, TN;
+			 TM = cr[WS(rs, 1)];
+			 TO = ci[WS(rs, 1)];
+			 TL = W[0];
+			 TN = W[1];
+			 TP = FMA(TL, TM, TN * TO);
+			 T1e = FNMS(TN, TM, TL * TO);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 9)];
+			 TD = ci[WS(rs, 9)];
+			 TA = W[16];
+			 TC = W[17];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1c = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = cr[WS(rs, 6)];
+			 TJ = ci[WS(rs, 6)];
+			 TG = W[10];
+			 TI = W[11];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T1f = FNMS(TI, TH, TG * TJ);
+		    }
+		    TF = Tz - TE;
+		    TQ = TK - TP;
+		    TR = TF + TQ;
+		    T1o = T1b + T1c;
+		    T1p = T1f + T1e;
+		    T1P = T1o + T1p;
+		    TX = Tz + TE;
+		    TY = TK + TP;
+		    TZ = TX + TY;
+		    T1d = T1b - T1c;
+		    T1g = T1e - T1f;
+		    T1x = T1g - T1d;
+	       }
+	       {
+		    E Tc, T14, Ts, T18, Th, T15, Tn, T17;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 2)];
+			 Tb = ci[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T14 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 3)];
+			 Tr = ci[WS(rs, 3)];
+			 To = W[4];
+			 Tq = W[5];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T18 = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 7)];
+			 Tg = ci[WS(rs, 7)];
+			 Td = W[12];
+			 Tf = W[13];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T15 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = cr[WS(rs, 8)];
+			 Tm = ci[WS(rs, 8)];
+			 Tj = W[14];
+			 Tl = W[15];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T17 = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    Ti = Tc - Th;
+		    Tt = Tn - Ts;
+		    Tu = Ti + Tt;
+		    T1r = T14 + T15;
+		    T1s = T17 + T18;
+		    T1O = T1r + T1s;
+		    TU = Tc + Th;
+		    TV = Tn + Ts;
+		    TW = TU + TV;
+		    T16 = T14 - T15;
+		    T19 = T17 - T18;
+		    T1y = T16 + T19;
+	       }
+	       {
+		    E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
+		    T11 = KP559016994 * (Tu - TR);
+		    TS = Tu + TR;
+		    T12 = FNMS(KP250000000, TS, T7);
+		    T1a = T16 - T19;
+		    T1h = T1d + T1g;
+		    T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
+		    T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
+		    ci[WS(rs, 4)] = T7 + TS;
+		    T1j = T12 - T11;
+		    ci[WS(rs, 2)] = T1j - T1k;
+		    cr[WS(rs, 3)] = T1j + T1k;
+		    T13 = T11 + T12;
+		    ci[0] = T13 - T1i;
+		    cr[WS(rs, 1)] = T13 + T1i;
+	       }
+	       {
+		    E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
+		    T1m = KP559016994 * (TW - TZ);
+		    T10 = TW + TZ;
+		    T1l = FNMS(KP250000000, T10, TT);
+		    T1q = T1o - T1p;
+		    T1t = T1r - T1s;
+		    T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
+		    T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
+		    cr[0] = TT + T10;
+		    T1v = T1m + T1l;
+		    cr[WS(rs, 4)] = T1v - T1w;
+		    ci[WS(rs, 3)] = T1v + T1w;
+		    T1n = T1l - T1m;
+		    cr[WS(rs, 2)] = T1n - T1u;
+		    ci[WS(rs, 1)] = T1n + T1u;
+	       }
+	       {
+		    E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
+		    T1H = KP559016994 * (T1y + T1x);
+		    T1z = T1x - T1y;
+		    T1G = FMA(KP250000000, T1z, T1C);
+		    T1D = Ti - Tt;
+		    T1E = TQ - TF;
+		    T1F = FMA(KP587785252, T1D, KP951056516 * T1E);
+		    T1J = FNMS(KP951056516, T1D, KP587785252 * T1E);
+		    cr[WS(rs, 5)] = T1z - T1C;
+		    T1K = T1H + T1G;
+		    cr[WS(rs, 9)] = T1J - T1K;
+		    ci[WS(rs, 8)] = T1J + T1K;
+		    T1I = T1G - T1H;
+		    cr[WS(rs, 7)] = T1F - T1I;
+		    ci[WS(rs, 6)] = T1F + T1I;
+	       }
+	       {
+		    E T1Q, T1S, T1T, T1N, T1V, T1L, T1M, T1W, T1U;
+		    T1Q = KP559016994 * (T1O - T1P);
+		    T1S = T1O + T1P;
+		    T1T = FNMS(KP250000000, T1S, T1R);
+		    T1L = TU - TV;
+		    T1M = TX - TY;
+		    T1N = FMA(KP951056516, T1L, KP587785252 * T1M);
+		    T1V = FNMS(KP587785252, T1L, KP951056516 * T1M);
+		    ci[WS(rs, 9)] = T1S + T1R;
+		    T1W = T1T - T1Q;
+		    cr[WS(rs, 8)] = T1V - T1W;
+		    ci[WS(rs, 7)] = T1V + T1W;
+		    T1U = T1Q + T1T;
+		    cr[WS(rs, 6)] = T1N - T1U;
+		    ci[WS(rs, 5)] = T1N + T1U;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 10},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 10, "hf_10", twinstr, &GENUS, {72, 30, 30, 0} };
+
+void X(codelet_hf_10) (planner *p) {
+     X(khc2hc_register) (p, hf_10, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include hf.h */
+
+/*
+ * This function contains 118 FP additions, 68 FP multiplications,
+ * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
+ * 84 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hf.h"
+
+static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T2u, T2n;
+	       {
+		    E T1, T2i, T2e, Tl, T1Y, T10, T1S, TG, T2f, T1s, T2s, Ty, T1Z, T1H, T21;
+		    E T1d, TI, TL, T2h, T1l, T2p, Te, TJ, T1w, TO, TR, TN, TK, TQ;
+		    {
+			 E TW, TZ, TY, T1X, TX;
+			 T1 = cr[0];
+			 T2i = ci[0];
+			 {
+			      E Th, Tk, Tg, Tj, T2d, Ti, TV;
+			      Th = cr[WS(rs, 6)];
+			      Tk = ci[WS(rs, 6)];
+			      Tg = W[10];
+			      Tj = W[11];
+			      TW = cr[WS(rs, 9)];
+			      TZ = ci[WS(rs, 9)];
+			      T2d = Tg * Tk;
+			      Ti = Tg * Th;
+			      TV = W[16];
+			      TY = W[17];
+			      T2e = FNMS(Tj, Th, T2d);
+			      Tl = FMA(Tj, Tk, Ti);
+			      T1X = TV * TZ;
+			      TX = TV * TW;
+			 }
+			 {
+			      E Tn, Tq, Tt, T1o, To, Tw, Ts, Tp, Tv;
+			      {
+				   E TC, TF, TB, TE, T1R, TD, Tm;
+				   TC = cr[WS(rs, 3)];
+				   TF = ci[WS(rs, 3)];
+				   T1Y = FNMS(TY, TW, T1X);
+				   T10 = FMA(TY, TZ, TX);
+				   TB = W[4];
+				   TE = W[5];
+				   Tn = cr[WS(rs, 10)];
+				   Tq = ci[WS(rs, 10)];
+				   T1R = TB * TF;
+				   TD = TB * TC;
+				   Tm = W[18];
+				   Tt = cr[WS(rs, 2)];
+				   T1S = FNMS(TE, TC, T1R);
+				   TG = FMA(TE, TF, TD);
+				   T1o = Tm * Tq;
+				   To = Tm * Tn;
+				   Tw = ci[WS(rs, 2)];
+				   Ts = W[2];
+				   Tp = W[19];
+				   Tv = W[3];
+			      }
+			      {
+				   E T12, T15, T13, T1D, T18, T1b, T17, T14, T1a;
+				   {
+					E T1p, Tr, T1r, Tx, T1q, Tu, T11;
+					T12 = cr[WS(rs, 1)];
+					T1q = Ts * Tw;
+					Tu = Ts * Tt;
+					T1p = FNMS(Tp, Tn, T1o);
+					Tr = FMA(Tp, Tq, To);
+					T1r = FNMS(Tv, Tt, T1q);
+					Tx = FMA(Tv, Tw, Tu);
+					T15 = ci[WS(rs, 1)];
+					T11 = W[0];
+					T2f = T1p + T1r;
+					T1s = T1p - T1r;
+					T2s = Tx - Tr;
+					Ty = Tr + Tx;
+					T13 = T11 * T12;
+					T1D = T11 * T15;
+				   }
+				   T18 = cr[WS(rs, 5)];
+				   T1b = ci[WS(rs, 5)];
+				   T17 = W[8];
+				   T14 = W[1];
+				   T1a = W[9];
+				   {
+					E T3, T6, T4, T1h, T9, Tc, T8, T5, Tb;
+					{
+					     E T1E, T16, T1G, T1c, T1F, T19, T2;
+					     T3 = cr[WS(rs, 4)];
+					     T1F = T17 * T1b;
+					     T19 = T17 * T18;
+					     T1E = FNMS(T14, T12, T1D);
+					     T16 = FMA(T14, T15, T13);
+					     T1G = FNMS(T1a, T18, T1F);
+					     T1c = FMA(T1a, T1b, T19);
+					     T6 = ci[WS(rs, 4)];
+					     T2 = W[6];
+					     T1Z = T1E + T1G;
+					     T1H = T1E - T1G;
+					     T21 = T1c - T16;
+					     T1d = T16 + T1c;
+					     T4 = T2 * T3;
+					     T1h = T2 * T6;
+					}
+					T9 = cr[WS(rs, 8)];
+					Tc = ci[WS(rs, 8)];
+					T8 = W[14];
+					T5 = W[7];
+					Tb = W[15];
+					{
+					     E T1i, T7, T1k, Td, T1j, Ta, TH;
+					     TI = cr[WS(rs, 7)];
+					     T1j = T8 * Tc;
+					     Ta = T8 * T9;
+					     T1i = FNMS(T5, T3, T1h);
+					     T7 = FMA(T5, T6, T4);
+					     T1k = FNMS(Tb, T9, T1j);
+					     Td = FMA(Tb, Tc, Ta);
+					     TL = ci[WS(rs, 7)];
+					     TH = W[12];
+					     T2h = T1i + T1k;
+					     T1l = T1i - T1k;
+					     T2p = Td - T7;
+					     Te = T7 + Td;
+					     TJ = TH * TI;
+					     T1w = TH * TL;
+					}
+					TO = cr[WS(rs, 11)];
+					TR = ci[WS(rs, 11)];
+					TN = W[20];
+					TK = W[13];
+					TQ = W[21];
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T1g, T1n, T2r, T1A, T1V, T28, TA, T2o, T1v, T1C, T1U, T29, T2m, T2k, T2l;
+			 E T1f, T2a, T20;
+			 {
+			      E T2g, T1T, TT, T2j, TU, T1e;
+			      {
+				   E Tf, T1x, TM, T1z, TS, Tz, T1y, TP;
+				   T1g = FNMS(KP500000000, Te, T1);
+				   Tf = T1 + Te;
+				   T1y = TN * TR;
+				   TP = TN * TO;
+				   T1x = FNMS(TK, TI, T1w);
+				   TM = FMA(TK, TL, TJ);
+				   T1z = FNMS(TQ, TO, T1y);
+				   TS = FMA(TQ, TR, TP);
+				   Tz = Tl + Ty;
+				   T1n = FNMS(KP500000000, Ty, Tl);
+				   T2r = FNMS(KP500000000, T2f, T2e);
+				   T2g = T2e + T2f;
+				   T1T = T1x + T1z;
+				   T1A = T1x - T1z;
+				   T1V = TS - TM;
+				   TT = TM + TS;
+				   T28 = Tf - Tz;
+				   TA = Tf + Tz;
+				   T2j = T2h + T2i;
+				   T2o = FNMS(KP500000000, T2h, T2i);
+			      }
+			      T1v = FNMS(KP500000000, TT, TG);
+			      TU = TG + TT;
+			      T1e = T10 + T1d;
+			      T1C = FNMS(KP500000000, T1d, T10);
+			      T1U = FNMS(KP500000000, T1T, T1S);
+			      T29 = T1S + T1T;
+			      T2m = T2j - T2g;
+			      T2k = T2g + T2j;
+			      T2l = TU - T1e;
+			      T1f = TU + T1e;
+			      T2a = T1Y + T1Z;
+			      T20 = FNMS(KP500000000, T1Z, T1Y);
+			 }
+			 {
+			      E T1m, T1K, T2y, T2q, T2z, T2t, T1L, T1t, T1B, T1N, T2c, T2b;
+			      ci[WS(rs, 8)] = T2l + T2m;
+			      cr[WS(rs, 9)] = T2l - T2m;
+			      cr[0] = TA + T1f;
+			      ci[WS(rs, 5)] = TA - T1f;
+			      T2c = T29 + T2a;
+			      T2b = T29 - T2a;
+			      T1m = FNMS(KP866025403, T1l, T1g);
+			      T1K = FMA(KP866025403, T1l, T1g);
+			      ci[WS(rs, 11)] = T2c + T2k;
+			      cr[WS(rs, 6)] = T2c - T2k;
+			      ci[WS(rs, 2)] = T28 + T2b;
+			      cr[WS(rs, 3)] = T28 - T2b;
+			      T2y = FMA(KP866025403, T2p, T2o);
+			      T2q = FNMS(KP866025403, T2p, T2o);
+			      T2z = FMA(KP866025403, T2s, T2r);
+			      T2t = FNMS(KP866025403, T2s, T2r);
+			      T1L = FMA(KP866025403, T1s, T1n);
+			      T1t = FNMS(KP866025403, T1s, T1n);
+			      T1B = FNMS(KP866025403, T1A, T1v);
+			      T1N = FMA(KP866025403, T1A, T1v);
+			      {
+				   E T1Q, T23, T27, T2A, T1P, T2x, T24, T1M;
+				   {
+					E T1u, T25, T26, T1O, T1I, T2w, T2v, T1W, T22, T2B, T1J, T2C;
+					T1Q = T1m - T1t;
+					T1u = T1m + T1t;
+					T25 = FMA(KP866025403, T1V, T1U);
+					T1W = FNMS(KP866025403, T1V, T1U);
+					T26 = FMA(KP866025403, T21, T20);
+					T22 = FNMS(KP866025403, T21, T20);
+					T1O = FMA(KP866025403, T1H, T1C);
+					T1I = FNMS(KP866025403, T1H, T1C);
+					T2w = T2t + T2q;
+					T2u = T2q - T2t;
+					T23 = T1W - T22;
+					T2v = T1W + T22;
+					T2B = T25 + T26;
+					T27 = T25 - T26;
+					T2n = T1I - T1B;
+					T1J = T1B + T1I;
+					T2C = T2z + T2y;
+					T2A = T2y - T2z;
+					ci[WS(rs, 9)] = T2w - T2v;
+					cr[WS(rs, 8)] = -(T2v + T2w);
+					ci[WS(rs, 3)] = T1u + T1J;
+					cr[WS(rs, 2)] = T1u - T1J;
+					cr[WS(rs, 10)] = T2B - T2C;
+					ci[WS(rs, 7)] = T2B + T2C;
+					T1P = T1N + T1O;
+					T2x = T1O - T1N;
+				   }
+				   T24 = T1K - T1L;
+				   T1M = T1K + T1L;
+				   ci[WS(rs, 10)] = T2x + T2A;
+				   cr[WS(rs, 7)] = T2x - T2A;
+				   cr[WS(rs, 4)] = T1M + T1P;
+				   ci[WS(rs, 1)] = T1M - T1P;
+				   cr[WS(rs, 1)] = T24 + T27;
+				   ci[WS(rs, 4)] = T24 - T27;
+				   cr[WS(rs, 5)] = T1Q + T23;
+				   ci[0] = T1Q - T23;
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 6)] = T2n + T2u;
+	       cr[WS(rs, 11)] = T2n - T2u;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, {72, 22, 46, 0} };
+
+void X(codelet_hf_12) (planner *p) {
+     X(khc2hc_register) (p, hf_12, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include hf.h */
+
+/*
+ * This function contains 118 FP additions, 60 FP multiplications,
+ * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
+ * 47 stack variables, 2 constants, and 48 memory accesses
+ */
+#include "hf.h"
+
+static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
+	       E T1, T1W, T18, T23, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
+	       E T1G, Ti, T1S, T1d, T26, Tt, T1a, T1T, T25, TA, T1y, T1j, T1B, TL, T1g;
+	       E T1z, T1A;
+	       {
+		    E T6, T16, Tb, T17;
+		    T1 = cr[0];
+		    T1W = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 4)];
+			 T5 = ci[WS(rs, 4)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T16 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 8)];
+			 Ta = ci[WS(rs, 8)];
+			 T7 = W[14];
+			 T9 = W[15];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T17 = FNMS(T9, T8, T7 * Ta);
+		    }
+		    T18 = KP866025403 * (T16 - T17);
+		    T23 = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    T15 = FNMS(KP500000000, Tc, T1);
+		    T1V = T16 + T17;
+		    T22 = FNMS(KP500000000, T1V, T1W);
+	       }
+	       {
+		    E T11, T1n, TW, T1m;
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = cr[WS(rs, 9)];
+			 TQ = ci[WS(rs, 9)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1E = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TY, T10, TX, TZ;
+			 TY = cr[WS(rs, 5)];
+			 T10 = ci[WS(rs, 5)];
+			 TX = W[8];
+			 TZ = W[9];
+			 T11 = FMA(TX, TY, TZ * T10);
+			 T1n = FNMS(TZ, TY, TX * T10);
+		    }
+		    {
+			 E TT, TV, TS, TU;
+			 TT = cr[WS(rs, 1)];
+			 TV = ci[WS(rs, 1)];
+			 TS = W[0];
+			 TU = W[1];
+			 TW = FMA(TS, TT, TU * TV);
+			 T1m = FNMS(TU, TT, TS * TV);
+		    }
+		    T1o = KP866025403 * (T1m - T1n);
+		    T1D = KP866025403 * (T11 - TW);
+		    T12 = TW + T11;
+		    T1l = FNMS(KP500000000, T12, TR);
+		    T1F = T1m + T1n;
+		    T1G = FNMS(KP500000000, T1F, T1E);
+	       }
+	       {
+		    E Ts, T1c, Tn, T1b;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = cr[WS(rs, 6)];
+			 Th = ci[WS(rs, 6)];
+			 Te = W[10];
+			 Tg = W[11];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T1S = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 2)];
+			 Tr = ci[WS(rs, 2)];
+			 To = W[2];
+			 Tq = W[3];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1c = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = cr[WS(rs, 10)];
+			 Tm = ci[WS(rs, 10)];
+			 Tj = W[18];
+			 Tl = W[19];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1b = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    T1d = KP866025403 * (T1b - T1c);
+		    T26 = KP866025403 * (Ts - Tn);
+		    Tt = Tn + Ts;
+		    T1a = FNMS(KP500000000, Tt, Ti);
+		    T1T = T1b + T1c;
+		    T25 = FNMS(KP500000000, T1T, T1S);
+	       }
+	       {
+		    E TK, T1i, TF, T1h;
+		    {
+			 E Tx, Tz, Tw, Ty;
+			 Tx = cr[WS(rs, 3)];
+			 Tz = ci[WS(rs, 3)];
+			 Tw = W[4];
+			 Ty = W[5];
+			 TA = FMA(Tw, Tx, Ty * Tz);
+			 T1y = FNMS(Ty, Tx, Tw * Tz);
+		    }
+		    {
+			 E TH, TJ, TG, TI;
+			 TH = cr[WS(rs, 11)];
+			 TJ = ci[WS(rs, 11)];
+			 TG = W[20];
+			 TI = W[21];
+			 TK = FMA(TG, TH, TI * TJ);
+			 T1i = FNMS(TI, TH, TG * TJ);
+		    }
+		    {
+			 E TC, TE, TB, TD;
+			 TC = cr[WS(rs, 7)];
+			 TE = ci[WS(rs, 7)];
+			 TB = W[12];
+			 TD = W[13];
+			 TF = FMA(TB, TC, TD * TE);
+			 T1h = FNMS(TD, TC, TB * TE);
+		    }
+		    T1j = KP866025403 * (T1h - T1i);
+		    T1B = KP866025403 * (TK - TF);
+		    TL = TF + TK;
+		    T1g = FNMS(KP500000000, TL, TA);
+		    T1z = T1h + T1i;
+		    T1A = FNMS(KP500000000, T1z, T1y);
+	       }
+	       {
+		    E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
+		    {
+			 E Td, Tu, T1U, T1X;
+			 Td = T1 + Tc;
+			 Tu = Ti + Tt;
+			 Tv = Td + Tu;
+			 T1N = Td - Tu;
+			 T1U = T1S + T1T;
+			 T1X = T1V + T1W;
+			 T1Y = T1U + T1X;
+			 T20 = T1X - T1U;
+		    }
+		    {
+			 E TM, T13, T1O, T1P;
+			 TM = TA + TL;
+			 T13 = TR + T12;
+			 T14 = TM + T13;
+			 T1Z = TM - T13;
+			 T1O = T1y + T1z;
+			 T1P = T1E + T1F;
+			 T1Q = T1O - T1P;
+			 T1R = T1O + T1P;
+		    }
+		    ci[WS(rs, 5)] = Tv - T14;
+		    cr[WS(rs, 9)] = T1Z - T20;
+		    ci[WS(rs, 8)] = T1Z + T20;
+		    cr[0] = Tv + T14;
+		    cr[WS(rs, 3)] = T1N - T1Q;
+		    cr[WS(rs, 6)] = T1R - T1Y;
+		    ci[WS(rs, 11)] = T1R + T1Y;
+		    ci[WS(rs, 2)] = T1N + T1Q;
+	       }
+	       {
+		    E T1f, T1x, T28, T2a, T1q, T21, T1I, T29;
+		    {
+			 E T19, T1e, T24, T27;
+			 T19 = T15 - T18;
+			 T1e = T1a - T1d;
+			 T1f = T19 + T1e;
+			 T1x = T19 - T1e;
+			 T24 = T22 - T23;
+			 T27 = T25 - T26;
+			 T28 = T24 - T27;
+			 T2a = T27 + T24;
+		    }
+		    {
+			 E T1k, T1p, T1C, T1H;
+			 T1k = T1g - T1j;
+			 T1p = T1l - T1o;
+			 T1q = T1k + T1p;
+			 T21 = T1p - T1k;
+			 T1C = T1A - T1B;
+			 T1H = T1D - T1G;
+			 T1I = T1C + T1H;
+			 T29 = T1H - T1C;
+		    }
+		    cr[WS(rs, 2)] = T1f - T1q;
+		    cr[WS(rs, 8)] = T29 - T2a;
+		    ci[WS(rs, 9)] = T29 + T2a;
+		    ci[WS(rs, 3)] = T1f + T1q;
+		    ci[0] = T1x - T1I;
+		    cr[WS(rs, 11)] = T21 - T28;
+		    ci[WS(rs, 6)] = T21 + T28;
+		    cr[WS(rs, 5)] = T1x + T1I;
+	       }
+	       {
+		    E T1t, T1J, T2e, T2g, T1w, T2b, T1M, T2f;
+		    {
+			 E T1r, T1s, T2c, T2d;
+			 T1r = T15 + T18;
+			 T1s = T1a + T1d;
+			 T1t = T1r + T1s;
+			 T1J = T1r - T1s;
+			 T2c = T23 + T22;
+			 T2d = T26 + T25;
+			 T2e = T2c - T2d;
+			 T2g = T2d + T2c;
+		    }
+		    {
+			 E T1u, T1v, T1K, T1L;
+			 T1u = T1g + T1j;
+			 T1v = T1l + T1o;
+			 T1w = T1u + T1v;
+			 T2b = T1v - T1u;
+			 T1K = T1B + T1A;
+			 T1L = T1D + T1G;
+			 T1M = T1K - T1L;
+			 T2f = T1K + T1L;
+		    }
+		    ci[WS(rs, 1)] = T1t - T1w;
+		    cr[WS(rs, 1)] = T1J + T1M;
+		    cr[WS(rs, 4)] = T1t + T1w;
+		    ci[WS(rs, 4)] = T1J - T1M;
+		    cr[WS(rs, 7)] = T2b - T2e;
+		    ci[WS(rs, 7)] = T2f + T2g;
+		    ci[WS(rs, 10)] = T2b + T2e;
+		    cr[WS(rs, 10)] = T2f - T2g;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 12},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, {88, 30, 30, 0} };
+
+void X(codelet_hf_12) (planner *p) {
+     X(khc2hc_register) (p, hf_12, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:52 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include hf.h */
+
+/*
+ * This function contains 184 FP additions, 140 FP multiplications,
+ * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
+ * 97 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "hf.h"
+
+static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T3v, T3E, T3G, T3A, T3y, T3z, T3F, T3B;
+	       {
+		    E T1G, T3l, T3H, T3k, T1B, Tf, T37, T1y, T2Y, T2M, T2a, T2i, T39, Tz, T2U;
+		    E T2t, T1O, T2e, T3a, TT, T10, T2V, T2z, T1V, T2f, T2C, T12, T15, T14, T21;
+		    E T1c, T1Y, T13;
+		    {
+			 E T2I, T1k, T1m, T1p, T1o, T28, T1w, T25, T1n;
+			 {
+			      E T1, T3i, T9, Tc, Tb, T1D, T7, T1E, Ta, T1j, T1i, T1h;
+			      T1 = cr[0];
+			      T3i = ci[0];
+			      {
+				   E T3, T6, T2, T5, T1C, T4, T8;
+				   T3 = cr[WS(rs, 5)];
+				   T6 = ci[WS(rs, 5)];
+				   T2 = W[8];
+				   T5 = W[9];
+				   T9 = cr[WS(rs, 10)];
+				   Tc = ci[WS(rs, 10)];
+				   T1C = T2 * T6;
+				   T4 = T2 * T3;
+				   T8 = W[18];
+				   Tb = W[19];
+				   T1D = FNMS(T5, T3, T1C);
+				   T7 = FMA(T5, T6, T4);
+				   T1E = T8 * Tc;
+				   Ta = T8 * T9;
+			      }
+			      {
+				   E T1g, T1F, Td, T1f, T3j, Te, T2H;
+				   T1g = cr[WS(rs, 9)];
+				   T1j = ci[WS(rs, 9)];
+				   T1F = FNMS(Tb, T9, T1E);
+				   Td = FMA(Tb, Tc, Ta);
+				   T1f = W[16];
+				   T1i = W[17];
+				   T1G = T1D - T1F;
+				   T3j = T1D + T1F;
+				   T3l = Td - T7;
+				   Te = T7 + Td;
+				   T2H = T1f * T1j;
+				   T1h = T1f * T1g;
+				   T3H = T3j + T3i;
+				   T3k = FNMS(KP500000000, T3j, T3i);
+				   T1B = FNMS(KP500000000, Te, T1);
+				   Tf = T1 + Te;
+				   T2I = FNMS(T1i, T1g, T2H);
+			      }
+			      T1k = FMA(T1i, T1j, T1h);
+			      {
+				   E T1s, T1v, T1r, T1u, T27, T1t, T1l;
+				   T1s = cr[WS(rs, 4)];
+				   T1v = ci[WS(rs, 4)];
+				   T1r = W[6];
+				   T1u = W[7];
+				   T1m = cr[WS(rs, 14)];
+				   T1p = ci[WS(rs, 14)];
+				   T27 = T1r * T1v;
+				   T1t = T1r * T1s;
+				   T1l = W[26];
+				   T1o = W[27];
+				   T28 = FNMS(T1u, T1s, T27);
+				   T1w = FMA(T1u, T1v, T1t);
+				   T25 = T1l * T1p;
+				   T1n = T1l * T1m;
+			      }
+			 }
+			 {
+			      E Tl, T2p, Tn, Tq, Tp, T1M, Tx, T1J, To;
+			      {
+				   E Th, Tk, T26, T1q, Tg, Tj;
+				   Th = cr[WS(rs, 3)];
+				   Tk = ci[WS(rs, 3)];
+				   T26 = FNMS(T1o, T1m, T25);
+				   T1q = FMA(T1o, T1p, T1n);
+				   Tg = W[4];
+				   Tj = W[5];
+				   {
+					E T29, T2J, T1x, T2L;
+					T29 = T26 - T28;
+					T2J = T26 + T28;
+					T1x = T1q + T1w;
+					T2L = T1q - T1w;
+					{
+					     E T2o, Ti, T2K, T24;
+					     T2o = Tg * Tk;
+					     Ti = Tg * Th;
+					     T2K = FNMS(KP500000000, T2J, T2I);
+					     T37 = T2I + T2J;
+					     T24 = FNMS(KP500000000, T1x, T1k);
+					     T1y = T1k + T1x;
+					     Tl = FMA(Tj, Tk, Ti);
+					     T2Y = FMA(KP866025403, T2L, T2K);
+					     T2M = FNMS(KP866025403, T2L, T2K);
+					     T2a = FNMS(KP866025403, T29, T24);
+					     T2i = FMA(KP866025403, T29, T24);
+					     T2p = FNMS(Tj, Th, T2o);
+					}
+				   }
+			      }
+			      {
+				   E Tt, Tw, Ts, Tv, T1L, Tu, Tm;
+				   Tt = cr[WS(rs, 13)];
+				   Tw = ci[WS(rs, 13)];
+				   Ts = W[24];
+				   Tv = W[25];
+				   Tn = cr[WS(rs, 8)];
+				   Tq = ci[WS(rs, 8)];
+				   T1L = Ts * Tw;
+				   Tu = Ts * Tt;
+				   Tm = W[14];
+				   Tp = W[15];
+				   T1M = FNMS(Tv, Tt, T1L);
+				   Tx = FMA(Tv, Tw, Tu);
+				   T1J = Tm * Tq;
+				   To = Tm * Tn;
+			      }
+			      {
+				   E TF, T2v, TH, TK, TJ, T1T, TR, T1Q, TI;
+				   {
+					E TB, TE, T1K, Tr, TA, TD;
+					TB = cr[WS(rs, 12)];
+					TE = ci[WS(rs, 12)];
+					T1K = FNMS(Tp, Tn, T1J);
+					Tr = FMA(Tp, Tq, To);
+					TA = W[22];
+					TD = W[23];
+					{
+					     E T1N, T2q, Ty, T2s;
+					     T1N = T1K - T1M;
+					     T2q = T1K + T1M;
+					     Ty = Tr + Tx;
+					     T2s = Tr - Tx;
+					     {
+						  E T2u, TC, T2r, T1I;
+						  T2u = TA * TE;
+						  TC = TA * TB;
+						  T2r = FNMS(KP500000000, T2q, T2p);
+						  T39 = T2p + T2q;
+						  T1I = FNMS(KP500000000, Ty, Tl);
+						  Tz = Tl + Ty;
+						  TF = FMA(TD, TE, TC);
+						  T2U = FMA(KP866025403, T2s, T2r);
+						  T2t = FNMS(KP866025403, T2s, T2r);
+						  T1O = FNMS(KP866025403, T1N, T1I);
+						  T2e = FMA(KP866025403, T1N, T1I);
+						  T2v = FNMS(TD, TB, T2u);
+					     }
+					}
+				   }
+				   {
+					E TN, TQ, TM, TP, T1S, TO, TG;
+					TN = cr[WS(rs, 7)];
+					TQ = ci[WS(rs, 7)];
+					TM = W[12];
+					TP = W[13];
+					TH = cr[WS(rs, 2)];
+					TK = ci[WS(rs, 2)];
+					T1S = TM * TQ;
+					TO = TM * TN;
+					TG = W[2];
+					TJ = W[3];
+					T1T = FNMS(TP, TN, T1S);
+					TR = FMA(TP, TQ, TO);
+					T1Q = TG * TK;
+					TI = TG * TH;
+				   }
+				   {
+					E TW, TZ, T1R, TL, TV, TY;
+					TW = cr[WS(rs, 6)];
+					TZ = ci[WS(rs, 6)];
+					T1R = FNMS(TJ, TH, T1Q);
+					TL = FMA(TJ, TK, TI);
+					TV = W[10];
+					TY = W[11];
+					{
+					     E T1U, T2w, TS, T2y;
+					     T1U = T1R - T1T;
+					     T2w = T1R + T1T;
+					     TS = TL + TR;
+					     T2y = TL - TR;
+					     {
+						  E T2B, TX, T2x, T1P;
+						  T2B = TV * TZ;
+						  TX = TV * TW;
+						  T2x = FNMS(KP500000000, T2w, T2v);
+						  T3a = T2v + T2w;
+						  T1P = FNMS(KP500000000, TS, TF);
+						  TT = TF + TS;
+						  T10 = FMA(TY, TZ, TX);
+						  T2V = FMA(KP866025403, T2y, T2x);
+						  T2z = FNMS(KP866025403, T2y, T2x);
+						  T1V = FNMS(KP866025403, T1U, T1P);
+						  T2f = FMA(KP866025403, T1U, T1P);
+						  T2C = FNMS(TY, TW, T2B);
+					     }
+					}
+				   }
+				   {
+					E T18, T1b, T17, T1a, T20, T19, T11;
+					T18 = cr[WS(rs, 1)];
+					T1b = ci[WS(rs, 1)];
+					T17 = W[0];
+					T1a = W[1];
+					T12 = cr[WS(rs, 11)];
+					T15 = ci[WS(rs, 11)];
+					T20 = T17 * T1b;
+					T19 = T17 * T18;
+					T11 = W[20];
+					T14 = W[21];
+					T21 = FNMS(T1a, T18, T20);
+					T1c = FMA(T1a, T1b, T19);
+					T1Y = T11 * T15;
+					T13 = T11 * T12;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3I, T3O, T3w, T2d, T3J, T3P, T3x, T3C, T3D, T3f, T3g, T2Q, T2O, T3r, T3q;
+			 E T2k, T2m;
+			 {
+			      E T3b, T1Z, T16, TU;
+			      T3I = T39 + T3a;
+			      T3b = T39 - T3a;
+			      T1Z = FNMS(T14, T12, T1Y);
+			      T16 = FMA(T14, T15, T13);
+			      T3O = TT - Tz;
+			      TU = Tz + TT;
+			      {
+				   E T1H, T2G, T2h, T3e, T3c, T34, T1W, T32, T30, T33, T2b, T2S, T2R;
+				   {
+					E T2W, T22, T1d, T2F, T2E, T36, T2D;
+					T2W = T2U - T2V;
+					T3w = T2U + T2V;
+					T22 = T1Z - T21;
+					T2D = T1Z + T21;
+					T1d = T16 + T1c;
+					T2F = T16 - T1c;
+					T2E = FNMS(KP500000000, T2D, T2C);
+					T36 = T2C + T2D;
+					T2d = FMA(KP866025403, T1G, T1B);
+					T1H = FNMS(KP866025403, T1G, T1B);
+					{
+					     E T1e, T1X, T38, T2X;
+					     T1e = T10 + T1d;
+					     T1X = FNMS(KP500000000, T1d, T10);
+					     T38 = T36 - T37;
+					     T3J = T36 + T37;
+					     T2G = FNMS(KP866025403, T2F, T2E);
+					     T2X = FMA(KP866025403, T2F, T2E);
+					     {
+						  E T1z, T23, T2Z, T1A;
+						  T3P = T1y - T1e;
+						  T1z = T1e + T1y;
+						  T23 = FNMS(KP866025403, T22, T1X);
+						  T2h = FMA(KP866025403, T22, T1X);
+						  T3e = FMA(KP618033988, T38, T3b);
+						  T3c = FNMS(KP618033988, T3b, T38);
+						  T2Z = T2X - T2Y;
+						  T3x = T2X + T2Y;
+						  T1A = TU + T1z;
+						  T34 = TU - T1z;
+						  T3C = T1O - T1V;
+						  T1W = T1O + T1V;
+						  T32 = FNMS(KP618033988, T2W, T2Z);
+						  T30 = FMA(KP618033988, T2Z, T2W);
+						  cr[0] = Tf + T1A;
+						  T33 = FNMS(KP250000000, T1A, Tf);
+						  T2b = T23 + T2a;
+						  T3D = T23 - T2a;
+					     }
+					}
+				   }
+				   {
+					E T2A, T2N, T3d, T35, T2c;
+					T3f = T2t + T2z;
+					T2A = T2t - T2z;
+					T2N = T2G - T2M;
+					T3g = T2G + T2M;
+					T3d = FMA(KP559016994, T34, T33);
+					T35 = FNMS(KP559016994, T34, T33);
+					T2c = T1W + T2b;
+					T2S = T1W - T2b;
+					cr[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
+					ci[WS(rs, 2)] = FNMS(KP951056516, T3c, T35);
+					cr[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
+					ci[WS(rs, 5)] = FNMS(KP951056516, T3e, T3d);
+					cr[WS(rs, 5)] = T1H + T2c;
+					T2R = FNMS(KP250000000, T2c, T1H);
+					T2Q = FNMS(KP618033988, T2A, T2N);
+					T2O = FMA(KP618033988, T2N, T2A);
+				   }
+				   {
+					E T2T, T31, T2g, T2j;
+					T2T = FMA(KP559016994, T2S, T2R);
+					T31 = FNMS(KP559016994, T2S, T2R);
+					T2g = T2e + T2f;
+					T3r = T2e - T2f;
+					T3q = T2h - T2i;
+					T2j = T2h + T2i;
+					ci[WS(rs, 3)] = FMA(KP951056516, T30, T2T);
+					ci[0] = FNMS(KP951056516, T30, T2T);
+					ci[WS(rs, 6)] = FMA(KP951056516, T32, T31);
+					cr[WS(rs, 2)] = FNMS(KP951056516, T32, T31);
+					T2k = T2g + T2j;
+					T2m = T2g - T2j;
+				   }
+			      }
+			 }
+			 {
+			      E T3m, T3s, T3u, T3o, T3h, T2l, T2n, T2P;
+			      ci[WS(rs, 4)] = T2d + T2k;
+			      T2l = FNMS(KP250000000, T2k, T2d);
+			      T3m = FMA(KP866025403, T3l, T3k);
+			      T3v = FNMS(KP866025403, T3l, T3k);
+			      T3s = FNMS(KP618033988, T3r, T3q);
+			      T3u = FMA(KP618033988, T3q, T3r);
+			      T2n = FMA(KP559016994, T2m, T2l);
+			      T2P = FNMS(KP559016994, T2m, T2l);
+			      ci[WS(rs, 1)] = FMA(KP951056516, T2Q, T2P);
+			      cr[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
+			      cr[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
+			      cr[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
+			      T3o = T3f - T3g;
+			      T3h = T3f + T3g;
+			      {
+				   E T3S, T3Q, T3K, T3M, T3n, T3p, T3t, T3L, T3R, T3N;
+				   cr[WS(rs, 10)] = -(T3h + T3m);
+				   T3n = FNMS(KP250000000, T3h, T3m);
+				   T3S = FNMS(KP618033988, T3O, T3P);
+				   T3Q = FMA(KP618033988, T3P, T3O);
+				   T3p = FNMS(KP559016994, T3o, T3n);
+				   T3t = FMA(KP559016994, T3o, T3n);
+				   ci[WS(rs, 7)] = FMA(KP951056516, T3s, T3p);
+				   cr[WS(rs, 13)] = FMS(KP951056516, T3s, T3p);
+				   ci[WS(rs, 13)] = FNMS(KP951056516, T3u, T3t);
+				   ci[WS(rs, 10)] = FMA(KP951056516, T3u, T3t);
+				   T3K = T3I + T3J;
+				   T3M = T3I - T3J;
+				   ci[WS(rs, 14)] = T3K + T3H;
+				   T3L = FNMS(KP250000000, T3K, T3H);
+				   T3E = FMA(KP618033988, T3D, T3C);
+				   T3G = FNMS(KP618033988, T3C, T3D);
+				   T3R = FNMS(KP559016994, T3M, T3L);
+				   T3N = FMA(KP559016994, T3M, T3L);
+				   ci[WS(rs, 8)] = FMA(KP951056516, T3Q, T3N);
+				   cr[WS(rs, 9)] = FMS(KP951056516, T3Q, T3N);
+				   ci[WS(rs, 11)] = FMA(KP951056516, T3S, T3R);
+				   cr[WS(rs, 12)] = FMS(KP951056516, T3S, T3R);
+				   T3A = T3x - T3w;
+				   T3y = T3w + T3x;
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 9)] = T3y + T3v;
+	       T3z = FNMS(KP250000000, T3y, T3v);
+	       T3F = FMA(KP559016994, T3A, T3z);
+	       T3B = FNMS(KP559016994, T3A, T3z);
+	       cr[WS(rs, 14)] = -(FMA(KP951056516, T3E, T3B));
+	       cr[WS(rs, 11)] = FMS(KP951056516, T3E, T3B);
+	       ci[WS(rs, 12)] = FMA(KP951056516, T3G, T3F);
+	       cr[WS(rs, 8)] = FMS(KP951056516, T3G, T3F);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {72, 28, 112, 0} };
+
+void X(codelet_hf_15) (planner *p) {
+     X(khc2hc_register) (p, hf_15, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include hf.h */
+
+/*
+ * This function contains 184 FP additions, 112 FP multiplications,
+ * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
+ * 65 stack variables, 6 constants, and 60 memory accesses
+ */
+#include "hf.h"
+
+static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
+	       E T1q, T2Q, Td, T1n, T2T, T3l, T13, T1k, T1l, T2E, T2F, T3j, T1H, T1T, T2k;
+	       E T2w, T2f, T2v, T1M, T1U, Tu, TL, TM, T2H, T2I, T3i, T1w, T1Q, T29, T2t;
+	       E T24, T2s, T1B, T1R;
+	       {
+		    E T1, T2R, T6, T1o, Tb, T1p, Tc, T2S;
+		    T1 = cr[0];
+		    T2R = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 5)];
+			 T5 = ci[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T1o = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 10)];
+			 Ta = ci[WS(rs, 10)];
+			 T7 = W[18];
+			 T9 = W[19];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T1p = FNMS(T9, T8, T7 * Ta);
+		    }
+		    T1q = KP866025403 * (T1o - T1p);
+		    T2Q = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    Td = T1 + Tc;
+		    T1n = FNMS(KP500000000, Tc, T1);
+		    T2S = T1o + T1p;
+		    T2T = FNMS(KP500000000, T2S, T2R);
+		    T3l = T2S + T2R;
+	       }
+	       {
+		    E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
+		    E T2i;
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = cr[WS(rs, 6)];
+			 TQ = ci[WS(rs, 6)];
+			 TN = W[10];
+			 TP = W[11];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T2c = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E T15, T17, T14, T16;
+			 T15 = cr[WS(rs, 9)];
+			 T17 = ci[WS(rs, 9)];
+			 T14 = W[16];
+			 T16 = W[17];
+			 T18 = FMA(T14, T15, T16 * T17);
+			 T2h = FNMS(T16, T15, T14 * T17);
+		    }
+		    {
+			 E TT, TV, TS, TU;
+			 TT = cr[WS(rs, 11)];
+			 TV = ci[WS(rs, 11)];
+			 TS = W[20];
+			 TU = W[21];
+			 TW = FMA(TS, TT, TU * TV);
+			 T1E = FNMS(TU, TT, TS * TV);
+		    }
+		    {
+			 E TY, T10, TX, TZ;
+			 TY = cr[WS(rs, 1)];
+			 T10 = ci[WS(rs, 1)];
+			 TX = W[0];
+			 TZ = W[1];
+			 T11 = FMA(TX, TY, TZ * T10);
+			 T1F = FNMS(TZ, TY, TX * T10);
+		    }
+		    T12 = TW + T11;
+		    T2d = T1E + T1F;
+		    {
+			 E T1a, T1c, T19, T1b;
+			 T1a = cr[WS(rs, 14)];
+			 T1c = ci[WS(rs, 14)];
+			 T19 = W[26];
+			 T1b = W[27];
+			 T1d = FMA(T19, T1a, T1b * T1c);
+			 T1J = FNMS(T1b, T1a, T19 * T1c);
+		    }
+		    {
+			 E T1f, T1h, T1e, T1g;
+			 T1f = cr[WS(rs, 4)];
+			 T1h = ci[WS(rs, 4)];
+			 T1e = W[6];
+			 T1g = W[7];
+			 T1i = FMA(T1e, T1f, T1g * T1h);
+			 T1K = FNMS(T1g, T1f, T1e * T1h);
+		    }
+		    T1j = T1d + T1i;
+		    T2i = T1J + T1K;
+		    {
+			 E T1D, T1G, T2g, T2j;
+			 T13 = TR + T12;
+			 T1k = T18 + T1j;
+			 T1l = T13 + T1k;
+			 T2E = T2c + T2d;
+			 T2F = T2h + T2i;
+			 T3j = T2E + T2F;
+			 T1D = FNMS(KP500000000, T12, TR);
+			 T1G = KP866025403 * (T1E - T1F);
+			 T1H = T1D - T1G;
+			 T1T = T1D + T1G;
+			 T2g = KP866025403 * (T1d - T1i);
+			 T2j = FNMS(KP500000000, T2i, T2h);
+			 T2k = T2g - T2j;
+			 T2w = T2g + T2j;
+			 {
+			      E T2b, T2e, T1I, T1L;
+			      T2b = KP866025403 * (T11 - TW);
+			      T2e = FNMS(KP500000000, T2d, T2c);
+			      T2f = T2b + T2e;
+			      T2v = T2e - T2b;
+			      T1I = FNMS(KP500000000, T1j, T18);
+			      T1L = KP866025403 * (T1J - T1K);
+			      T1M = T1I - T1L;
+			      T1U = T1I + T1L;
+			 }
+		    }
+	       }
+	       {
+		    E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
+		    E T27;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = cr[WS(rs, 3)];
+			 Th = ci[WS(rs, 3)];
+			 Te = W[4];
+			 Tg = W[5];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 T21 = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 12)];
+			 Ty = ci[WS(rs, 12)];
+			 Tv = W[22];
+			 Tx = W[23];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T26 = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = cr[WS(rs, 8)];
+			 Tm = ci[WS(rs, 8)];
+			 Tj = W[14];
+			 Tl = W[15];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 T1t = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 13)];
+			 Tr = ci[WS(rs, 13)];
+			 To = W[24];
+			 Tq = W[25];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 T1u = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn + Ts;
+		    T22 = T1t + T1u;
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 2)];
+			 TD = ci[WS(rs, 2)];
+			 TA = W[2];
+			 TC = W[3];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1y = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TG, TI, TF, TH;
+			 TG = cr[WS(rs, 7)];
+			 TI = ci[WS(rs, 7)];
+			 TF = W[12];
+			 TH = W[13];
+			 TJ = FMA(TF, TG, TH * TI);
+			 T1z = FNMS(TH, TG, TF * TI);
+		    }
+		    TK = TE + TJ;
+		    T27 = T1y + T1z;
+		    {
+			 E T1s, T1v, T25, T28;
+			 Tu = Ti + Tt;
+			 TL = Tz + TK;
+			 TM = Tu + TL;
+			 T2H = T21 + T22;
+			 T2I = T26 + T27;
+			 T3i = T2H + T2I;
+			 T1s = FNMS(KP500000000, Tt, Ti);
+			 T1v = KP866025403 * (T1t - T1u);
+			 T1w = T1s - T1v;
+			 T1Q = T1s + T1v;
+			 T25 = KP866025403 * (TJ - TE);
+			 T28 = FNMS(KP500000000, T27, T26);
+			 T29 = T25 + T28;
+			 T2t = T28 - T25;
+			 {
+			      E T20, T23, T1x, T1A;
+			      T20 = KP866025403 * (Ts - Tn);
+			      T23 = FNMS(KP500000000, T22, T21);
+			      T24 = T20 + T23;
+			      T2s = T23 - T20;
+			      T1x = FNMS(KP500000000, TK, Tz);
+			      T1A = KP866025403 * (T1y - T1z);
+			      T1B = T1x - T1A;
+			      T1R = T1x + T1A;
+			 }
+		    }
+	       }
+	       {
+		    E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
+		    T2C = KP559016994 * (TM - T1l);
+		    T1m = TM + T1l;
+		    T2B = FNMS(KP250000000, T1m, Td);
+		    T2G = T2E - T2F;
+		    T2J = T2H - T2I;
+		    T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
+		    T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
+		    cr[0] = Td + T1m;
+		    T2L = T2C + T2B;
+		    ci[WS(rs, 5)] = T2L - T2M;
+		    cr[WS(rs, 6)] = T2L + T2M;
+		    T2D = T2B - T2C;
+		    ci[WS(rs, 2)] = T2D - T2K;
+		    cr[WS(rs, 3)] = T2D + T2K;
+	       }
+	       {
+		    E T3k, T3m, T3n, T3h, T3p, T3f, T3g, T3q, T3o;
+		    T3k = KP559016994 * (T3i - T3j);
+		    T3m = T3i + T3j;
+		    T3n = FNMS(KP250000000, T3m, T3l);
+		    T3f = T1k - T13;
+		    T3g = Tu - TL;
+		    T3h = FNMS(KP951056516, T3g, KP587785252 * T3f);
+		    T3p = FMA(KP587785252, T3g, KP951056516 * T3f);
+		    ci[WS(rs, 14)] = T3m + T3l;
+		    T3q = T3n - T3k;
+		    cr[WS(rs, 12)] = T3p - T3q;
+		    ci[WS(rs, 11)] = T3p + T3q;
+		    T3o = T3k + T3n;
+		    cr[WS(rs, 9)] = T3h - T3o;
+		    ci[WS(rs, 8)] = T3h + T3o;
+	       }
+	       {
+		    E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
+		    {
+			 E T2u, T2x, T1C, T1N;
+			 T2u = T2s - T2t;
+			 T2x = T2v - T2w;
+			 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
+			 T2A = FNMS(KP587785252, T2u, KP951056516 * T2x);
+			 T1r = T1n - T1q;
+			 T1C = T1w + T1B;
+			 T1N = T1H + T1M;
+			 T1O = T1C + T1N;
+			 T2p = KP559016994 * (T1C - T1N);
+			 T2q = FNMS(KP250000000, T1O, T1r);
+		    }
+		    cr[WS(rs, 5)] = T1r + T1O;
+		    T2z = T2q - T2p;
+		    cr[WS(rs, 2)] = T2z - T2A;
+		    ci[WS(rs, 6)] = T2z + T2A;
+		    T2r = T2p + T2q;
+		    ci[0] = T2r - T2y;
+		    ci[WS(rs, 3)] = T2r + T2y;
+	       }
+	       {
+		    E T35, T3d, T39, T3a, T38, T3b, T3e, T3c;
+		    {
+			 E T33, T34, T36, T37;
+			 T33 = T1w - T1B;
+			 T34 = T1H - T1M;
+			 T35 = FMA(KP951056516, T33, KP587785252 * T34);
+			 T3d = FNMS(KP587785252, T33, KP951056516 * T34);
+			 T39 = T2T - T2Q;
+			 T36 = T2v + T2w;
+			 T37 = T2s + T2t;
+			 T3a = T37 + T36;
+			 T38 = KP559016994 * (T36 - T37);
+			 T3b = FNMS(KP250000000, T3a, T39);
+		    }
+		    ci[WS(rs, 9)] = T3a + T39;
+		    T3e = T38 + T3b;
+		    cr[WS(rs, 8)] = T3d - T3e;
+		    ci[WS(rs, 12)] = T3d + T3e;
+		    T3c = T38 - T3b;
+		    cr[WS(rs, 11)] = T35 + T3c;
+		    cr[WS(rs, 14)] = T3c - T35;
+	       }
+	       {
+		    E T2X, T31, T2U, T2P, T2Y, T2Z, T32, T30;
+		    {
+			 E T2V, T2W, T2N, T2O;
+			 T2V = T1T - T1U;
+			 T2W = T1Q - T1R;
+			 T2X = FNMS(KP587785252, T2W, KP951056516 * T2V);
+			 T31 = FMA(KP951056516, T2W, KP587785252 * T2V);
+			 T2U = T2Q + T2T;
+			 T2N = T2k - T2f;
+			 T2O = T24 + T29;
+			 T2P = T2N - T2O;
+			 T2Y = FMA(KP250000000, T2P, T2U);
+			 T2Z = KP559016994 * (T2O + T2N);
+		    }
+		    cr[WS(rs, 10)] = T2P - T2U;
+		    T32 = T2Z + T2Y;
+		    ci[WS(rs, 10)] = T31 + T32;
+		    ci[WS(rs, 13)] = T32 - T31;
+		    T30 = T2Y - T2Z;
+		    cr[WS(rs, 13)] = T2X - T30;
+		    ci[WS(rs, 7)] = T2X + T30;
+	       }
+	       {
+		    E T2m, T2o, T1P, T1W, T1X, T1Y, T1Z, T2n;
+		    {
+			 E T2a, T2l, T1S, T1V;
+			 T2a = T24 - T29;
+			 T2l = T2f + T2k;
+			 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
+			 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
+			 T1P = T1n + T1q;
+			 T1S = T1Q + T1R;
+			 T1V = T1T + T1U;
+			 T1W = T1S + T1V;
+			 T1X = KP559016994 * (T1S - T1V);
+			 T1Y = FNMS(KP250000000, T1W, T1P);
+		    }
+		    ci[WS(rs, 4)] = T1P + T1W;
+		    T1Z = T1X + T1Y;
+		    cr[WS(rs, 4)] = T1Z - T2m;
+		    cr[WS(rs, 1)] = T1Z + T2m;
+		    T2n = T1Y - T1X;
+		    cr[WS(rs, 7)] = T2n - T2o;
+		    ci[WS(rs, 1)] = T2n + T2o;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 15},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {128, 56, 56, 0} };
+
+void X(codelet_hf_15) (planner *p) {
+     X(khc2hc_register) (p, hf_15, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:54 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include hf.h */
+
+/*
+ * This function contains 174 FP additions, 100 FP multiplications,
+ * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
+ * 95 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hf.h"
+
+static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T2T, T2Q;
+	       {
+		    E T3A, T3o, T8, T1I, T2w, T35, T2k, T1s, T2p, T36, T2r, T1F, T3k, T1N, T3z;
+		    E Tl, T1U, T2W, T1P, Tz, T2g, T30, T25, T11, TB, TE, T2a, T31, T2h, T1e;
+		    E TC, T1X, TH, TK, TG, TD, TJ;
+		    {
+			 E Ta, Td, Tb, T1J, Tg, Tj, Tf, Tc, Ti;
+			 {
+			      E T1h, T1k, T1n, T2s, T1i, T1q, T1m, T1j, T1p;
+			      {
+				   E T1, T3n, T3, T6, T2, T5;
+				   T1 = cr[0];
+				   T3n = ci[0];
+				   T3 = cr[WS(rs, 8)];
+				   T6 = ci[WS(rs, 8)];
+				   T2 = W[14];
+				   T5 = W[15];
+				   {
+					E T3l, T4, T1g, T3m, T7;
+					T1h = cr[WS(rs, 15)];
+					T1k = ci[WS(rs, 15)];
+					T3l = T2 * T6;
+					T4 = T2 * T3;
+					T1g = W[28];
+					T1n = cr[WS(rs, 7)];
+					T3m = FNMS(T5, T3, T3l);
+					T7 = FMA(T5, T6, T4);
+					T2s = T1g * T1k;
+					T1i = T1g * T1h;
+					T3A = T3n - T3m;
+					T3o = T3m + T3n;
+					T8 = T1 + T7;
+					T1I = T1 - T7;
+					T1q = ci[WS(rs, 7)];
+					T1m = W[12];
+				   }
+				   T1j = W[29];
+				   T1p = W[13];
+			      }
+			      {
+				   E T1u, T1x, T1v, T2l, T1A, T1D, T1z, T1w, T1C;
+				   {
+					E T2t, T1l, T2v, T1r, T2u, T1o, T1t;
+					T1u = cr[WS(rs, 3)];
+					T2u = T1m * T1q;
+					T1o = T1m * T1n;
+					T2t = FNMS(T1j, T1h, T2s);
+					T1l = FMA(T1j, T1k, T1i);
+					T2v = FNMS(T1p, T1n, T2u);
+					T1r = FMA(T1p, T1q, T1o);
+					T1x = ci[WS(rs, 3)];
+					T1t = W[4];
+					T2w = T2t - T2v;
+					T35 = T2t + T2v;
+					T2k = T1l - T1r;
+					T1s = T1l + T1r;
+					T1v = T1t * T1u;
+					T2l = T1t * T1x;
+				   }
+				   T1A = cr[WS(rs, 11)];
+				   T1D = ci[WS(rs, 11)];
+				   T1z = W[20];
+				   T1w = W[5];
+				   T1C = W[21];
+				   {
+					E T2m, T1y, T2o, T1E, T2n, T1B, T9;
+					Ta = cr[WS(rs, 4)];
+					T2n = T1z * T1D;
+					T1B = T1z * T1A;
+					T2m = FNMS(T1w, T1u, T2l);
+					T1y = FMA(T1w, T1x, T1v);
+					T2o = FNMS(T1C, T1A, T2n);
+					T1E = FMA(T1C, T1D, T1B);
+					Td = ci[WS(rs, 4)];
+					T9 = W[6];
+					T2p = T2m - T2o;
+					T36 = T2m + T2o;
+					T2r = T1E - T1y;
+					T1F = T1y + T1E;
+					Tb = T9 * Ta;
+					T1J = T9 * Td;
+				   }
+				   Tg = cr[WS(rs, 12)];
+				   Tj = ci[WS(rs, 12)];
+				   Tf = W[22];
+				   Tc = W[7];
+				   Ti = W[23];
+			      }
+			 }
+			 {
+			      E TQ, TT, TR, T2c, TW, TZ, TV, TS, TY;
+			      {
+				   E To, Tr, Tp, T1Q, Tu, Tx, Tt, Tq, Tw;
+				   {
+					E T1K, Te, T1M, Tk, T1L, Th, Tn;
+					To = cr[WS(rs, 2)];
+					T1L = Tf * Tj;
+					Th = Tf * Tg;
+					T1K = FNMS(Tc, Ta, T1J);
+					Te = FMA(Tc, Td, Tb);
+					T1M = FNMS(Ti, Tg, T1L);
+					Tk = FMA(Ti, Tj, Th);
+					Tr = ci[WS(rs, 2)];
+					Tn = W[2];
+					T3k = T1K + T1M;
+					T1N = T1K - T1M;
+					T3z = Te - Tk;
+					Tl = Te + Tk;
+					Tp = Tn * To;
+					T1Q = Tn * Tr;
+				   }
+				   Tu = cr[WS(rs, 10)];
+				   Tx = ci[WS(rs, 10)];
+				   Tt = W[18];
+				   Tq = W[3];
+				   Tw = W[19];
+				   {
+					E T1R, Ts, T1T, Ty, T1S, Tv, TP;
+					TQ = cr[WS(rs, 1)];
+					T1S = Tt * Tx;
+					Tv = Tt * Tu;
+					T1R = FNMS(Tq, To, T1Q);
+					Ts = FMA(Tq, Tr, Tp);
+					T1T = FNMS(Tw, Tu, T1S);
+					Ty = FMA(Tw, Tx, Tv);
+					TT = ci[WS(rs, 1)];
+					TP = W[0];
+					T1U = T1R - T1T;
+					T2W = T1R + T1T;
+					T1P = Ts - Ty;
+					Tz = Ts + Ty;
+					TR = TP * TQ;
+					T2c = TP * TT;
+				   }
+				   TW = cr[WS(rs, 9)];
+				   TZ = ci[WS(rs, 9)];
+				   TV = W[16];
+				   TS = W[1];
+				   TY = W[17];
+			      }
+			      {
+				   E T13, T16, T14, T26, T19, T1c, T18, T15, T1b;
+				   {
+					E T2d, TU, T2f, T10, T2e, TX, T12;
+					T13 = cr[WS(rs, 5)];
+					T2e = TV * TZ;
+					TX = TV * TW;
+					T2d = FNMS(TS, TQ, T2c);
+					TU = FMA(TS, TT, TR);
+					T2f = FNMS(TY, TW, T2e);
+					T10 = FMA(TY, TZ, TX);
+					T16 = ci[WS(rs, 5)];
+					T12 = W[8];
+					T2g = T2d - T2f;
+					T30 = T2d + T2f;
+					T25 = TU - T10;
+					T11 = TU + T10;
+					T14 = T12 * T13;
+					T26 = T12 * T16;
+				   }
+				   T19 = cr[WS(rs, 13)];
+				   T1c = ci[WS(rs, 13)];
+				   T18 = W[24];
+				   T15 = W[9];
+				   T1b = W[25];
+				   {
+					E T27, T17, T29, T1d, T28, T1a, TA;
+					TB = cr[WS(rs, 14)];
+					T28 = T18 * T1c;
+					T1a = T18 * T19;
+					T27 = FNMS(T15, T13, T26);
+					T17 = FMA(T15, T16, T14);
+					T29 = FNMS(T1b, T19, T28);
+					T1d = FMA(T1b, T1c, T1a);
+					TE = ci[WS(rs, 14)];
+					TA = W[26];
+					T2a = T27 - T29;
+					T31 = T27 + T29;
+					T2h = T17 - T1d;
+					T1e = T17 + T1d;
+					TC = TA * TB;
+					T1X = TA * TE;
+				   }
+				   TH = cr[WS(rs, 6)];
+				   TK = ci[WS(rs, 6)];
+				   TG = W[10];
+				   TD = W[27];
+				   TJ = W[11];
+			      }
+			 }
+		    }
+		    {
+			 E T2U, T3u, T2Z, T21, T1W, T34, T2X, T37, T3t, T3q, T3e, T32, T3i, T3h;
+			 {
+			      E T3f, T3r, T1H, T3s, TO, T3g;
+			      {
+				   E Tm, T1Y, TF, T20, TL, T3p, T1Z, TI;
+				   T2U = T8 - Tl;
+				   Tm = T8 + Tl;
+				   T1Z = TG * TK;
+				   TI = TG * TH;
+				   T1Y = FNMS(TD, TB, T1X);
+				   TF = FMA(TD, TE, TC);
+				   T20 = FNMS(TJ, TH, T1Z);
+				   TL = FMA(TJ, TK, TI);
+				   T3p = T3k + T3o;
+				   T3u = T3o - T3k;
+				   {
+					E T1f, TM, T1G, T3j, T2V, TN;
+					T2Z = T11 - T1e;
+					T1f = T11 + T1e;
+					T21 = T1Y - T20;
+					T2V = T1Y + T20;
+					T1W = TF - TL;
+					TM = TF + TL;
+					T1G = T1s + T1F;
+					T34 = T1s - T1F;
+					T2X = T2V - T2W;
+					T3j = T2W + T2V;
+					T3f = T35 + T36;
+					T37 = T35 - T36;
+					T3t = Tz - TM;
+					TN = Tz + TM;
+					T3r = T1G - T1f;
+					T1H = T1f + T1G;
+					T3s = T3p - T3j;
+					T3q = T3j + T3p;
+					T3e = Tm - TN;
+					TO = Tm + TN;
+					T3g = T30 + T31;
+					T32 = T30 - T31;
+				   }
+			      }
+			      cr[WS(rs, 12)] = T3r - T3s;
+			      ci[WS(rs, 11)] = T3r + T3s;
+			      ci[WS(rs, 7)] = TO - T1H;
+			      T3i = T3g + T3f;
+			      T3h = T3f - T3g;
+			      cr[0] = TO + T1H;
+			 }
+			 {
+			      E T3a, T2Y, T3x, T3v;
+			      ci[WS(rs, 15)] = T3i + T3q;
+			      cr[WS(rs, 8)] = T3i - T3q;
+			      ci[WS(rs, 3)] = T3e + T3h;
+			      cr[WS(rs, 4)] = T3e - T3h;
+			      T3a = T2U + T2X;
+			      T2Y = T2U - T2X;
+			      T3x = T3u - T3t;
+			      T3v = T3t + T3u;
+			      {
+				   E T2E, T1O, T3B, T3H, T2q, T2x, T3I, T23, T2R, T2O, T2J, T2K, T3C, T2H, T2B;
+				   E T2j;
+				   {
+					E T2F, T1V, T22, T2G;
+					{
+					     E T3b, T33, T3c, T38;
+					     T2E = T1I + T1N;
+					     T1O = T1I - T1N;
+					     T3b = T2Z - T32;
+					     T33 = T2Z + T32;
+					     T3c = T34 + T37;
+					     T38 = T34 - T37;
+					     T3B = T3z + T3A;
+					     T3H = T3A - T3z;
+					     {
+						  E T3d, T3y, T3w, T39;
+						  T3d = T3b + T3c;
+						  T3y = T3c - T3b;
+						  T3w = T38 - T33;
+						  T39 = T33 + T38;
+						  ci[WS(rs, 1)] = FMA(KP707106781, T3d, T3a);
+						  cr[WS(rs, 6)] = FNMS(KP707106781, T3d, T3a);
+						  ci[WS(rs, 13)] = FMA(KP707106781, T3y, T3x);
+						  cr[WS(rs, 10)] = FMS(KP707106781, T3y, T3x);
+						  ci[WS(rs, 9)] = FMA(KP707106781, T3w, T3v);
+						  cr[WS(rs, 14)] = FMS(KP707106781, T3w, T3v);
+						  cr[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
+						  ci[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
+						  T2F = T1P + T1U;
+						  T1V = T1P - T1U;
+						  T22 = T1W + T21;
+						  T2G = T1W - T21;
+					     }
+					}
+					{
+					     E T2M, T2N, T2b, T2i;
+					     T2q = T2k - T2p;
+					     T2M = T2k + T2p;
+					     T2N = T2w + T2r;
+					     T2x = T2r - T2w;
+					     T3I = T22 - T1V;
+					     T23 = T1V + T22;
+					     T2R = FMA(KP414213562, T2M, T2N);
+					     T2O = FNMS(KP414213562, T2N, T2M);
+					     T2J = T25 + T2a;
+					     T2b = T25 - T2a;
+					     T2i = T2g + T2h;
+					     T2K = T2g - T2h;
+					     T3C = T2F - T2G;
+					     T2H = T2F + T2G;
+					     T2B = FMA(KP414213562, T2b, T2i);
+					     T2j = FNMS(KP414213562, T2i, T2b);
+					}
+				   }
+				   {
+					E T2A, T3G, T2P, T2D, T3E, T3F, T3D, T2I;
+					{
+					     E T24, T2L, T2C, T2y, T3J, T3L, T3K, T2S, T2z, T3M;
+					     T2A = FNMS(KP707106781, T23, T1O);
+					     T24 = FMA(KP707106781, T23, T1O);
+					     T2S = FNMS(KP414213562, T2J, T2K);
+					     T2L = FMA(KP414213562, T2K, T2J);
+					     T2C = FMA(KP414213562, T2q, T2x);
+					     T2y = FNMS(KP414213562, T2x, T2q);
+					     T3J = FMA(KP707106781, T3I, T3H);
+					     T3L = FNMS(KP707106781, T3I, T3H);
+					     T2T = T2R - T2S;
+					     T3K = T2S + T2R;
+					     T3G = T2y - T2j;
+					     T2z = T2j + T2y;
+					     T3M = T2O - T2L;
+					     T2P = T2L + T2O;
+					     ci[WS(rs, 14)] = FMA(KP923879532, T3K, T3J);
+					     cr[WS(rs, 9)] = FMS(KP923879532, T3K, T3J);
+					     ci[0] = FMA(KP923879532, T2z, T24);
+					     cr[WS(rs, 7)] = FNMS(KP923879532, T2z, T24);
+					     cr[WS(rs, 13)] = FMS(KP923879532, T3M, T3L);
+					     ci[WS(rs, 10)] = FMA(KP923879532, T3M, T3L);
+					     T2D = T2B + T2C;
+					     T3E = T2C - T2B;
+					}
+					T2Q = FNMS(KP707106781, T2H, T2E);
+					T2I = FMA(KP707106781, T2H, T2E);
+					T3F = FNMS(KP707106781, T3C, T3B);
+					T3D = FMA(KP707106781, T3C, T3B);
+					cr[WS(rs, 3)] = FMA(KP923879532, T2D, T2A);
+					ci[WS(rs, 4)] = FNMS(KP923879532, T2D, T2A);
+					cr[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
+					ci[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
+					ci[WS(rs, 8)] = FMA(KP923879532, T3E, T3D);
+					cr[WS(rs, 15)] = FMS(KP923879532, T3E, T3D);
+					ci[WS(rs, 12)] = FMA(KP923879532, T3G, T3F);
+					cr[WS(rs, 11)] = FMS(KP923879532, T3G, T3F);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 2)] = FMA(KP923879532, T2T, T2Q);
+	       cr[WS(rs, 5)] = FNMS(KP923879532, T2T, T2Q);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, {104, 30, 70, 0} };
+
+void X(codelet_hf_16) (planner *p) {
+     X(khc2hc_register) (p, hf_16, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include hf.h */
+
+/*
+ * This function contains 174 FP additions, 84 FP multiplications,
+ * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
+ * 52 stack variables, 3 constants, and 64 memory accesses
+ */
+#include "hf.h"
+
+static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
+	       E T7, T38, T1t, T2U, Ti, T37, T1w, T2R, Tu, T2t, T1C, T2c, TF, T2s, T1H;
+	       E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2k, T24, T2j, TS, T13, T2w, T2x;
+	       E T2y, T2z, T1O, T2h, T1T, T2g;
+	       {
+		    E T1, T2T, T6, T2S;
+		    T1 = cr[0];
+		    T2T = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 8)];
+			 T5 = ci[WS(rs, 8)];
+			 T2 = W[14];
+			 T4 = W[15];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T2S = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T38 = T2T - T2S;
+		    T1t = T1 - T6;
+		    T2U = T2S + T2T;
+	       }
+	       {
+		    E Tc, T1u, Th, T1v;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 4)];
+			 Tb = ci[WS(rs, 4)];
+			 T8 = W[6];
+			 Ta = W[7];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T1u = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 12)];
+			 Tg = ci[WS(rs, 12)];
+			 Td = W[22];
+			 Tf = W[23];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T1v = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T37 = Tc - Th;
+		    T1w = T1u - T1v;
+		    T2R = T1u + T1v;
+	       }
+	       {
+		    E To, T1z, Tt, T1A, T1y, T1B;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = cr[WS(rs, 2)];
+			 Tn = ci[WS(rs, 2)];
+			 Tk = W[2];
+			 Tm = W[3];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T1z = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = cr[WS(rs, 10)];
+			 Ts = ci[WS(rs, 10)];
+			 Tp = W[18];
+			 Tr = W[19];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T1A = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T2t = T1z + T1A;
+		    T1y = To - Tt;
+		    T1B = T1z - T1A;
+		    T1C = T1y - T1B;
+		    T2c = T1y + T1B;
+	       }
+	       {
+		    E Tz, T1E, TE, T1F, T1D, T1G;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 14)];
+			 Ty = ci[WS(rs, 14)];
+			 Tv = W[26];
+			 Tx = W[27];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T1E = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 6)];
+			 TD = ci[WS(rs, 6)];
+			 TA = W[10];
+			 TC = W[11];
+			 TE = FMA(TA, TB, TC * TD);
+			 T1F = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T2s = T1E + T1F;
+		    T1D = Tz - TE;
+		    T1G = T1E - T1F;
+		    T1H = T1D + T1G;
+		    T2d = T1D - T1G;
+	       }
+	       {
+		    E T19, T1V, T1p, T22, T1e, T1W, T1k, T21;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = cr[WS(rs, 15)];
+			 T18 = ci[WS(rs, 15)];
+			 T15 = W[28];
+			 T17 = W[29];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T1V = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = cr[WS(rs, 11)];
+			 T1o = ci[WS(rs, 11)];
+			 T1l = W[20];
+			 T1n = W[21];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T22 = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = cr[WS(rs, 7)];
+			 T1d = ci[WS(rs, 7)];
+			 T1a = W[12];
+			 T1c = W[13];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T1W = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = cr[WS(rs, 3)];
+			 T1j = ci[WS(rs, 3)];
+			 T1g = W[4];
+			 T1i = W[5];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T21 = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    T1f = T19 + T1e;
+		    T1q = T1k + T1p;
+		    T2B = T1f - T1q;
+		    T2C = T1V + T1W;
+		    T2D = T21 + T22;
+		    T2E = T2C - T2D;
+		    {
+			 E T1X, T1Y, T20, T23;
+			 T1X = T1V - T1W;
+			 T1Y = T1k - T1p;
+			 T1Z = T1X + T1Y;
+			 T2k = T1X - T1Y;
+			 T20 = T19 - T1e;
+			 T23 = T21 - T22;
+			 T24 = T20 - T23;
+			 T2j = T20 + T23;
+		    }
+	       }
+	       {
+		    E TM, T1P, T12, T1M, TR, T1Q, TX, T1L;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = cr[WS(rs, 1)];
+			 TL = ci[WS(rs, 1)];
+			 TI = W[0];
+			 TK = W[1];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T1P = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = cr[WS(rs, 13)];
+			 T11 = ci[WS(rs, 13)];
+			 TY = W[24];
+			 T10 = W[25];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T1M = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = cr[WS(rs, 9)];
+			 TQ = ci[WS(rs, 9)];
+			 TN = W[16];
+			 TP = W[17];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T1Q = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = cr[WS(rs, 5)];
+			 TW = ci[WS(rs, 5)];
+			 TT = W[8];
+			 TV = W[9];
+			 TX = FMA(TT, TU, TV * TW);
+			 T1L = FNMS(TV, TU, TT * TW);
+		    }
+		    TS = TM + TR;
+		    T13 = TX + T12;
+		    T2w = TS - T13;
+		    T2x = T1P + T1Q;
+		    T2y = T1L + T1M;
+		    T2z = T2x - T2y;
+		    {
+			 E T1K, T1N, T1R, T1S;
+			 T1K = TM - TR;
+			 T1N = T1L - T1M;
+			 T1O = T1K - T1N;
+			 T2h = T1K + T1N;
+			 T1R = T1P - T1Q;
+			 T1S = TX - T12;
+			 T1T = T1R + T1S;
+			 T2g = T1R - T1S;
+		    }
+	       }
+	       {
+		    E T1J, T27, T3a, T3c, T26, T3b, T2a, T35;
+		    {
+			 E T1x, T1I, T36, T39;
+			 T1x = T1t - T1w;
+			 T1I = KP707106781 * (T1C + T1H);
+			 T1J = T1x + T1I;
+			 T27 = T1x - T1I;
+			 T36 = KP707106781 * (T2c - T2d);
+			 T39 = T37 + T38;
+			 T3a = T36 + T39;
+			 T3c = T39 - T36;
+		    }
+		    {
+			 E T1U, T25, T28, T29;
+			 T1U = FNMS(KP382683432, T1T, KP923879532 * T1O);
+			 T25 = FMA(KP382683432, T1Z, KP923879532 * T24);
+			 T26 = T1U + T25;
+			 T3b = T25 - T1U;
+			 T28 = FMA(KP923879532, T1T, KP382683432 * T1O);
+			 T29 = FNMS(KP923879532, T1Z, KP382683432 * T24);
+			 T2a = T28 + T29;
+			 T35 = T29 - T28;
+		    }
+		    cr[WS(rs, 7)] = T1J - T26;
+		    cr[WS(rs, 11)] = T3b - T3c;
+		    ci[WS(rs, 12)] = T3b + T3c;
+		    ci[0] = T1J + T26;
+		    ci[WS(rs, 4)] = T27 - T2a;
+		    cr[WS(rs, 15)] = T35 - T3a;
+		    ci[WS(rs, 8)] = T35 + T3a;
+		    cr[WS(rs, 3)] = T27 + T2a;
+	       }
+	       {
+		    E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
+		    {
+			 E Tj, TG, T2Q, T2V;
+			 Tj = T7 + Ti;
+			 TG = Tu + TF;
+			 TH = Tj + TG;
+			 T2L = Tj - TG;
+			 T2Q = T2t + T2s;
+			 T2V = T2R + T2U;
+			 T2W = T2Q + T2V;
+			 T2Y = T2V - T2Q;
+		    }
+		    {
+			 E T14, T1r, T2M, T2N;
+			 T14 = TS + T13;
+			 T1r = T1f + T1q;
+			 T1s = T14 + T1r;
+			 T2X = T1r - T14;
+			 T2M = T2C + T2D;
+			 T2N = T2x + T2y;
+			 T2O = T2M - T2N;
+			 T2P = T2N + T2M;
+		    }
+		    ci[WS(rs, 7)] = TH - T1s;
+		    cr[WS(rs, 12)] = T2X - T2Y;
+		    ci[WS(rs, 11)] = T2X + T2Y;
+		    cr[0] = TH + T1s;
+		    cr[WS(rs, 4)] = T2L - T2O;
+		    cr[WS(rs, 8)] = T2P - T2W;
+		    ci[WS(rs, 15)] = T2P + T2W;
+		    ci[WS(rs, 3)] = T2L + T2O;
+	       }
+	       {
+		    E T2f, T2n, T3g, T3i, T2m, T3h, T2q, T3d;
+		    {
+			 E T2b, T2e, T3e, T3f;
+			 T2b = T1t + T1w;
+			 T2e = KP707106781 * (T2c + T2d);
+			 T2f = T2b + T2e;
+			 T2n = T2b - T2e;
+			 T3e = KP707106781 * (T1H - T1C);
+			 T3f = T38 - T37;
+			 T3g = T3e + T3f;
+			 T3i = T3f - T3e;
+		    }
+		    {
+			 E T2i, T2l, T2o, T2p;
+			 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
+			 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
+			 T2m = T2i + T2l;
+			 T3h = T2l - T2i;
+			 T2o = FNMS(KP923879532, T2g, KP382683432 * T2h);
+			 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
+			 T2q = T2o + T2p;
+			 T3d = T2p - T2o;
+		    }
+		    ci[WS(rs, 6)] = T2f - T2m;
+		    cr[WS(rs, 13)] = T3h - T3i;
+		    ci[WS(rs, 10)] = T3h + T3i;
+		    cr[WS(rs, 1)] = T2f + T2m;
+		    cr[WS(rs, 5)] = T2n - T2q;
+		    cr[WS(rs, 9)] = T3d - T3g;
+		    ci[WS(rs, 14)] = T3d + T3g;
+		    ci[WS(rs, 2)] = T2n + T2q;
+	       }
+	       {
+		    E T2v, T2H, T32, T34, T2G, T2Z, T2K, T33;
+		    {
+			 E T2r, T2u, T30, T31;
+			 T2r = T7 - Ti;
+			 T2u = T2s - T2t;
+			 T2v = T2r - T2u;
+			 T2H = T2r + T2u;
+			 T30 = Tu - TF;
+			 T31 = T2U - T2R;
+			 T32 = T30 + T31;
+			 T34 = T31 - T30;
+		    }
+		    {
+			 E T2A, T2F, T2I, T2J;
+			 T2A = T2w + T2z;
+			 T2F = T2B - T2E;
+			 T2G = KP707106781 * (T2A + T2F);
+			 T2Z = KP707106781 * (T2F - T2A);
+			 T2I = T2w - T2z;
+			 T2J = T2B + T2E;
+			 T2K = KP707106781 * (T2I + T2J);
+			 T33 = KP707106781 * (T2J - T2I);
+		    }
+		    ci[WS(rs, 5)] = T2v - T2G;
+		    cr[WS(rs, 10)] = T33 - T34;
+		    ci[WS(rs, 13)] = T33 + T34;
+		    cr[WS(rs, 2)] = T2v + T2G;
+		    cr[WS(rs, 6)] = T2H - T2K;
+		    cr[WS(rs, 14)] = T2Z - T32;
+		    ci[WS(rs, 9)] = T2Z + T32;
+		    ci[WS(rs, 1)] = T2H + T2K;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 16},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, {136, 46, 38, 0} };
+
+void X(codelet_hf_16) (planner *p) {
+     X(khc2hc_register) (p, hf_16, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:49 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include hf.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 11 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hf.h"
+
+static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T1, Ta, T3, T6, T2, T5;
+	       T1 = cr[0];
+	       Ta = ci[0];
+	       T3 = cr[WS(rs, 1)];
+	       T6 = ci[WS(rs, 1)];
+	       T2 = W[0];
+	       T5 = W[1];
+	       {
+		    E T8, T4, T9, T7;
+		    T8 = T2 * T6;
+		    T4 = T2 * T3;
+		    T9 = FNMS(T5, T3, T8);
+		    T7 = FMA(T5, T6, T4);
+		    ci[WS(rs, 1)] = T9 + Ta;
+		    cr[WS(rs, 1)] = T9 - Ta;
+		    cr[0] = T1 + T7;
+		    ci[0] = T1 - T7;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hf_2) (planner *p) {
+     X(khc2hc_register) (p, hf_2, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include hf.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 4 additions, 2 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hf.h"
+
+static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
+	       E T1, T8, T6, T7;
+	       T1 = cr[0];
+	       T8 = ci[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = cr[WS(rs, 1)];
+		    T5 = ci[WS(rs, 1)];
+		    T2 = W[0];
+		    T4 = W[1];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    T7 = FNMS(T4, T3, T2 * T5);
+	       }
+	       ci[0] = T1 - T6;
+	       cr[0] = T1 + T6;
+	       cr[WS(rs, 1)] = T7 - T8;
+	       ci[WS(rs, 1)] = T7 + T8;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 2},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, {4, 2, 2, 0} };
+
+void X(codelet_hf_2) (planner *p) {
+     X(khc2hc_register) (p, hf_2, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:00 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include hf.h */
+
+/*
+ * This function contains 246 FP additions, 148 FP multiplications,
+ * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
+ * 100 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hf.h"
+
+static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E T54, T5a, T5c, T56, T53, T55, T5b, T57;
+	       {
+		    E T4N, T4q, T8, T2i, T4r, T2n, T4O, Tl, T2v, T3v, T43, T4b, TN, T2b, T3F;
+		    E T3a, T2R, T3z, T3T, T4f, T27, T2f, T3J, T3i, T2K, T3y, T3W, T4e, T1G, T2e;
+		    E T3I, T3p, T2C, T3w, T40, T4c, T1e, T2c, T3G, T33;
+		    {
+			 E T1, T4p, T3, T6, T2, T5;
+			 T1 = cr[0];
+			 T4p = ci[0];
+			 T3 = cr[WS(rs, 10)];
+			 T6 = ci[WS(rs, 10)];
+			 T2 = W[18];
+			 T5 = W[19];
+			 {
+			      E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T4n, T4, T9, T4o, T7;
+				   Ta = cr[WS(rs, 5)];
+				   Td = ci[WS(rs, 5)];
+				   T4n = T2 * T6;
+				   T4 = T2 * T3;
+				   T9 = W[8];
+				   Tg = cr[WS(rs, 15)];
+				   T4o = FNMS(T5, T3, T4n);
+				   T7 = FMA(T5, T6, T4);
+				   T2j = T9 * Td;
+				   Tb = T9 * Ta;
+				   T4N = T4p - T4o;
+				   T4q = T4o + T4p;
+				   T8 = T1 + T7;
+				   T2i = T1 - T7;
+				   Tj = ci[WS(rs, 15)];
+				   Tf = W[28];
+			      }
+			      Tc = W[9];
+			      Ti = W[29];
+			      {
+				   E T36, Ts, T2t, TL, TB, TE, TD, T38, Ty, T2q, TC;
+				   {
+					E TH, TK, TJ, T2s, TI;
+					{
+					     E To, Tr, Tp, T35, Tq, TG;
+					     {
+						  E T2k, Te, T2m, Tk, T2l, Th, Tn;
+						  To = cr[WS(rs, 4)];
+						  T2l = Tf * Tj;
+						  Th = Tf * Tg;
+						  T2k = FNMS(Tc, Ta, T2j);
+						  Te = FMA(Tc, Td, Tb);
+						  T2m = FNMS(Ti, Tg, T2l);
+						  Tk = FMA(Ti, Tj, Th);
+						  Tr = ci[WS(rs, 4)];
+						  Tn = W[6];
+						  T4r = T2k + T2m;
+						  T2n = T2k - T2m;
+						  T4O = Te - Tk;
+						  Tl = Te + Tk;
+						  Tp = Tn * To;
+						  T35 = Tn * Tr;
+					     }
+					     Tq = W[7];
+					     TH = cr[WS(rs, 19)];
+					     TK = ci[WS(rs, 19)];
+					     TG = W[36];
+					     T36 = FNMS(Tq, To, T35);
+					     Ts = FMA(Tq, Tr, Tp);
+					     TJ = W[37];
+					     T2s = TG * TK;
+					     TI = TG * TH;
+					}
+					{
+					     E Tu, Tx, Tt, Tw, T37, Tv, TA;
+					     Tu = cr[WS(rs, 14)];
+					     Tx = ci[WS(rs, 14)];
+					     T2t = FNMS(TJ, TH, T2s);
+					     TL = FMA(TJ, TK, TI);
+					     Tt = W[26];
+					     Tw = W[27];
+					     TB = cr[WS(rs, 9)];
+					     TE = ci[WS(rs, 9)];
+					     T37 = Tt * Tx;
+					     Tv = Tt * Tu;
+					     TA = W[16];
+					     TD = W[17];
+					     T38 = FNMS(Tw, Tu, T37);
+					     Ty = FMA(Tw, Tx, Tv);
+					     T2q = TA * TE;
+					     TC = TA * TB;
+					}
+				   }
+				   {
+					E T39, T42, Tz, T2p, T2r, TF;
+					T39 = T36 - T38;
+					T42 = T36 + T38;
+					Tz = Ts + Ty;
+					T2p = Ts - Ty;
+					T2r = FNMS(TD, TB, T2q);
+					TF = FMA(TD, TE, TC);
+					{
+					     E T41, T2u, TM, T34;
+					     T41 = T2r + T2t;
+					     T2u = T2r - T2t;
+					     TM = TF + TL;
+					     T34 = TL - TF;
+					     T2v = T2p - T2u;
+					     T3v = T2p + T2u;
+					     T43 = T41 - T42;
+					     T4b = T42 + T41;
+					     TN = Tz - TM;
+					     T2b = Tz + TM;
+					     T3F = T39 + T34;
+					     T3a = T34 - T39;
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T3e, T1M, T2P, T25, T1V, T1Y, T1X, T3g, T1S, T2M, T1W;
+			 {
+			      E T21, T24, T23, T2O, T22;
+			      {
+				   E T1I, T1L, T1H, T1K, T3d, T1J, T20;
+				   T1I = cr[WS(rs, 12)];
+				   T1L = ci[WS(rs, 12)];
+				   T1H = W[22];
+				   T1K = W[23];
+				   T21 = cr[WS(rs, 7)];
+				   T24 = ci[WS(rs, 7)];
+				   T3d = T1H * T1L;
+				   T1J = T1H * T1I;
+				   T20 = W[12];
+				   T23 = W[13];
+				   T3e = FNMS(T1K, T1I, T3d);
+				   T1M = FMA(T1K, T1L, T1J);
+				   T2O = T20 * T24;
+				   T22 = T20 * T21;
+			      }
+			      {
+				   E T1O, T1R, T1N, T1Q, T3f, T1P, T1U;
+				   T1O = cr[WS(rs, 2)];
+				   T1R = ci[WS(rs, 2)];
+				   T2P = FNMS(T23, T21, T2O);
+				   T25 = FMA(T23, T24, T22);
+				   T1N = W[2];
+				   T1Q = W[3];
+				   T1V = cr[WS(rs, 17)];
+				   T1Y = ci[WS(rs, 17)];
+				   T3f = T1N * T1R;
+				   T1P = T1N * T1O;
+				   T1U = W[32];
+				   T1X = W[33];
+				   T3g = FNMS(T1Q, T1O, T3f);
+				   T1S = FMA(T1Q, T1R, T1P);
+				   T2M = T1U * T1Y;
+				   T1W = T1U * T1V;
+			      }
+			 }
+			 {
+			      E T3h, T3S, T1T, T2L, T2N, T1Z;
+			      T3h = T3e - T3g;
+			      T3S = T3e + T3g;
+			      T1T = T1M + T1S;
+			      T2L = T1M - T1S;
+			      T2N = FNMS(T1X, T1V, T2M);
+			      T1Z = FMA(T1X, T1Y, T1W);
+			      {
+				   E T3R, T2Q, T26, T3c;
+				   T3R = T2N + T2P;
+				   T2Q = T2N - T2P;
+				   T26 = T1Z + T25;
+				   T3c = T25 - T1Z;
+				   T2R = T2L - T2Q;
+				   T3z = T2L + T2Q;
+				   T3T = T3R - T3S;
+				   T4f = T3S + T3R;
+				   T27 = T1T - T26;
+				   T2f = T1T + T26;
+				   T3J = T3h + T3c;
+				   T3i = T3c - T3h;
+			      }
+			 }
+		    }
+		    {
+			 E T3l, T1l, T2I, T1E, T1u, T1x, T1w, T3n, T1r, T2F, T1v;
+			 {
+			      E T1A, T1D, T1C, T2H, T1B;
+			      {
+				   E T1h, T1k, T1g, T1j, T3k, T1i, T1z;
+				   T1h = cr[WS(rs, 8)];
+				   T1k = ci[WS(rs, 8)];
+				   T1g = W[14];
+				   T1j = W[15];
+				   T1A = cr[WS(rs, 3)];
+				   T1D = ci[WS(rs, 3)];
+				   T3k = T1g * T1k;
+				   T1i = T1g * T1h;
+				   T1z = W[4];
+				   T1C = W[5];
+				   T3l = FNMS(T1j, T1h, T3k);
+				   T1l = FMA(T1j, T1k, T1i);
+				   T2H = T1z * T1D;
+				   T1B = T1z * T1A;
+			      }
+			      {
+				   E T1n, T1q, T1m, T1p, T3m, T1o, T1t;
+				   T1n = cr[WS(rs, 18)];
+				   T1q = ci[WS(rs, 18)];
+				   T2I = FNMS(T1C, T1A, T2H);
+				   T1E = FMA(T1C, T1D, T1B);
+				   T1m = W[34];
+				   T1p = W[35];
+				   T1u = cr[WS(rs, 13)];
+				   T1x = ci[WS(rs, 13)];
+				   T3m = T1m * T1q;
+				   T1o = T1m * T1n;
+				   T1t = W[24];
+				   T1w = W[25];
+				   T3n = FNMS(T1p, T1n, T3m);
+				   T1r = FMA(T1p, T1q, T1o);
+				   T2F = T1t * T1x;
+				   T1v = T1t * T1u;
+			      }
+			 }
+			 {
+			      E T3o, T3V, T1s, T2E, T2G, T1y;
+			      T3o = T3l - T3n;
+			      T3V = T3l + T3n;
+			      T1s = T1l + T1r;
+			      T2E = T1l - T1r;
+			      T2G = FNMS(T1w, T1u, T2F);
+			      T1y = FMA(T1w, T1x, T1v);
+			      {
+				   E T3U, T2J, T1F, T3j;
+				   T3U = T2G + T2I;
+				   T2J = T2G - T2I;
+				   T1F = T1y + T1E;
+				   T3j = T1E - T1y;
+				   T2K = T2E - T2J;
+				   T3y = T2E + T2J;
+				   T3W = T3U - T3V;
+				   T4e = T3V + T3U;
+				   T1G = T1s - T1F;
+				   T2e = T1s + T1F;
+				   T3I = T3o + T3j;
+				   T3p = T3j - T3o;
+			      }
+			 }
+		    }
+		    {
+			 E T2Z, TT, T2A, T1c, T12, T15, T14, T31, TZ, T2x, T13;
+			 {
+			      E T18, T1b, T1a, T2z, T19;
+			      {
+				   E TP, TS, TO, TR, T2Y, TQ, T17;
+				   TP = cr[WS(rs, 16)];
+				   TS = ci[WS(rs, 16)];
+				   TO = W[30];
+				   TR = W[31];
+				   T18 = cr[WS(rs, 11)];
+				   T1b = ci[WS(rs, 11)];
+				   T2Y = TO * TS;
+				   TQ = TO * TP;
+				   T17 = W[20];
+				   T1a = W[21];
+				   T2Z = FNMS(TR, TP, T2Y);
+				   TT = FMA(TR, TS, TQ);
+				   T2z = T17 * T1b;
+				   T19 = T17 * T18;
+			      }
+			      {
+				   E TV, TY, TU, TX, T30, TW, T11;
+				   TV = cr[WS(rs, 6)];
+				   TY = ci[WS(rs, 6)];
+				   T2A = FNMS(T1a, T18, T2z);
+				   T1c = FMA(T1a, T1b, T19);
+				   TU = W[10];
+				   TX = W[11];
+				   T12 = cr[WS(rs, 1)];
+				   T15 = ci[WS(rs, 1)];
+				   T30 = TU * TY;
+				   TW = TU * TV;
+				   T11 = W[0];
+				   T14 = W[1];
+				   T31 = FNMS(TX, TV, T30);
+				   TZ = FMA(TX, TY, TW);
+				   T2x = T11 * T15;
+				   T13 = T11 * T12;
+			      }
+			 }
+			 {
+			      E T32, T3Z, T10, T2w, T2y, T16;
+			      T32 = T2Z - T31;
+			      T3Z = T2Z + T31;
+			      T10 = TT + TZ;
+			      T2w = TT - TZ;
+			      T2y = FNMS(T14, T12, T2x);
+			      T16 = FMA(T14, T15, T13);
+			      {
+				   E T3Y, T2B, T1d, T2X;
+				   T3Y = T2y + T2A;
+				   T2B = T2y - T2A;
+				   T1d = T16 + T1c;
+				   T2X = T1c - T16;
+				   T2C = T2w - T2B;
+				   T3w = T2w + T2B;
+				   T40 = T3Y - T3Z;
+				   T4c = T3Z + T3Y;
+				   T1e = T10 - T1d;
+				   T2c = T10 + T1d;
+				   T3G = T32 + T2X;
+				   T33 = T2X - T32;
+			      }
+			 }
+		    }
+		    {
+			 E T4l, T4k, T4w, T4x, T4Q, T4R, T2o, T4X, T4W, T4C, T4D, T4J, T4h, T4j, T4I;
+			 E T51, T52, T49, T3r, T3t, T58, T2D, T48, T2S, T59;
+			 {
+			      E T2a, T47, T45, T3u, T3x, T3N, T3L, T3A, T46, T3Q;
+			      {
+				   E Tm, T1f, T28, T3X, T44;
+				   T4l = T3W + T3T;
+				   T3X = T3T - T3W;
+				   T44 = T40 - T43;
+				   T4k = T43 + T40;
+				   T2a = T8 + Tl;
+				   Tm = T8 - Tl;
+				   T1f = TN + T1e;
+				   T4w = T1e - TN;
+				   T4x = T1G - T27;
+				   T28 = T1G + T27;
+				   T47 = FMA(KP618033988, T3X, T44);
+				   T45 = FNMS(KP618033988, T44, T3X);
+				   {
+					E T3H, T29, T3P, T3K, T3O;
+					T3H = T3F - T3G;
+					T4Q = T3F + T3G;
+					T29 = T1f + T28;
+					T3P = T1f - T28;
+					T4R = T3I + T3J;
+					T3K = T3I - T3J;
+					T3u = T2i + T2n;
+					T2o = T2i - T2n;
+					T4X = T3v - T3w;
+					T3x = T3v + T3w;
+					ci[WS(rs, 9)] = Tm + T29;
+					T3O = FNMS(KP250000000, T29, Tm);
+					T3N = FNMS(KP618033988, T3H, T3K);
+					T3L = FMA(KP618033988, T3K, T3H);
+					T3A = T3y + T3z;
+					T4W = T3y - T3z;
+					T46 = FMA(KP559016994, T3P, T3O);
+					T3Q = FNMS(KP559016994, T3P, T3O);
+				   }
+			      }
+			      {
+				   E T2d, T2g, T3b, T3q, T2h;
+				   {
+					E T4d, T3D, T3C, T4g, T3B, T3M, T3E;
+					T4C = T4b + T4c;
+					T4d = T4b - T4c;
+					T3D = T3x - T3A;
+					T3B = T3x + T3A;
+					ci[WS(rs, 1)] = FMA(KP951056516, T45, T3Q);
+					cr[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
+					cr[WS(rs, 6)] = FMA(KP951056516, T47, T46);
+					ci[WS(rs, 5)] = FNMS(KP951056516, T47, T46);
+					cr[WS(rs, 5)] = T3u + T3B;
+					T3C = FNMS(KP250000000, T3B, T3u);
+					T4g = T4e - T4f;
+					T4D = T4e + T4f;
+					T2d = T2b + T2c;
+					T4J = T2b - T2c;
+					T3M = FNMS(KP559016994, T3D, T3C);
+					T3E = FMA(KP559016994, T3D, T3C);
+					T4h = FMA(KP618033988, T4g, T4d);
+					T4j = FNMS(KP618033988, T4d, T4g);
+					cr[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
+					cr[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
+					ci[WS(rs, 6)] = FMA(KP951056516, T3N, T3M);
+					ci[WS(rs, 2)] = FNMS(KP951056516, T3N, T3M);
+					T4I = T2f - T2e;
+					T2g = T2e + T2f;
+				   }
+				   T3b = T33 - T3a;
+				   T51 = T3a + T33;
+				   T52 = T3p + T3i;
+				   T3q = T3i - T3p;
+				   T2h = T2d + T2g;
+				   T49 = T2d - T2g;
+				   T3r = FMA(KP618033988, T3q, T3b);
+				   T3t = FNMS(KP618033988, T3b, T3q);
+				   T58 = T2v - T2C;
+				   T2D = T2v + T2C;
+				   cr[0] = T2a + T2h;
+				   T48 = FNMS(KP250000000, T2h, T2a);
+				   T2S = T2K + T2R;
+				   T59 = T2K - T2R;
+			      }
+			 }
+			 {
+			      E T4B, T4P, T4Y, T50, T4U, T4S;
+			      {
+				   E T4A, T4y, T4s, T4m, T4u, T4t, T4z, T4v;
+				   {
+					E T2V, T2U, T4i, T4a, T2T, T2W, T3s;
+					T4i = FNMS(KP559016994, T49, T48);
+					T4a = FMA(KP559016994, T49, T48);
+					T2T = T2D + T2S;
+					T2V = T2D - T2S;
+					ci[WS(rs, 3)] = FMA(KP951056516, T4h, T4a);
+					cr[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
+					cr[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
+					ci[WS(rs, 7)] = FNMS(KP951056516, T4j, T4i);
+					ci[WS(rs, 4)] = T2o + T2T;
+					T2U = FNMS(KP250000000, T2T, T2o);
+					T4A = FMA(KP618033988, T4w, T4x);
+					T4y = FNMS(KP618033988, T4x, T4w);
+					T4B = T4r + T4q;
+					T4s = T4q - T4r;
+					T2W = FMA(KP559016994, T2V, T2U);
+					T3s = FNMS(KP559016994, T2V, T2U);
+					ci[WS(rs, 8)] = FMA(KP951056516, T3r, T2W);
+					ci[0] = FNMS(KP951056516, T3r, T2W);
+					cr[WS(rs, 7)] = FNMS(KP951056516, T3t, T3s);
+					cr[WS(rs, 3)] = FMA(KP951056516, T3t, T3s);
+					T4m = T4k + T4l;
+					T4u = T4l - T4k;
+				   }
+				   cr[WS(rs, 10)] = T4m - T4s;
+				   T4t = FMA(KP250000000, T4m, T4s);
+				   T4P = T4N - T4O;
+				   T54 = T4O + T4N;
+				   T4Y = FNMS(KP618033988, T4X, T4W);
+				   T50 = FMA(KP618033988, T4W, T4X);
+				   T4z = FNMS(KP559016994, T4u, T4t);
+				   T4v = FMA(KP559016994, T4u, T4t);
+				   ci[WS(rs, 13)] = FMA(KP951056516, T4y, T4v);
+				   cr[WS(rs, 14)] = FMS(KP951056516, T4y, T4v);
+				   ci[WS(rs, 17)] = FMA(KP951056516, T4A, T4z);
+				   cr[WS(rs, 18)] = FMS(KP951056516, T4A, T4z);
+				   T4U = T4Q - T4R;
+				   T4S = T4Q + T4R;
+			      }
+			      {
+				   E T4M, T4K, T4E, T4G, T4T, T4V, T4Z, T4F, T4L, T4H;
+				   ci[WS(rs, 14)] = T4S + T4P;
+				   T4T = FNMS(KP250000000, T4S, T4P);
+				   T4M = FNMS(KP618033988, T4I, T4J);
+				   T4K = FMA(KP618033988, T4J, T4I);
+				   T4V = FNMS(KP559016994, T4U, T4T);
+				   T4Z = FMA(KP559016994, T4U, T4T);
+				   cr[WS(rs, 17)] = -(FMA(KP951056516, T4Y, T4V));
+				   cr[WS(rs, 13)] = FMS(KP951056516, T4Y, T4V);
+				   ci[WS(rs, 18)] = FNMS(KP951056516, T50, T4Z);
+				   ci[WS(rs, 10)] = FMA(KP951056516, T50, T4Z);
+				   T4E = T4C + T4D;
+				   T4G = T4C - T4D;
+				   ci[WS(rs, 19)] = T4E + T4B;
+				   T4F = FNMS(KP250000000, T4E, T4B);
+				   T5a = FMA(KP618033988, T59, T58);
+				   T5c = FNMS(KP618033988, T58, T59);
+				   T4L = FMA(KP559016994, T4G, T4F);
+				   T4H = FNMS(KP559016994, T4G, T4F);
+				   ci[WS(rs, 11)] = FMA(KP951056516, T4K, T4H);
+				   cr[WS(rs, 12)] = FMS(KP951056516, T4K, T4H);
+				   ci[WS(rs, 15)] = FMA(KP951056516, T4M, T4L);
+				   cr[WS(rs, 16)] = FMS(KP951056516, T4M, T4L);
+				   T56 = T51 - T52;
+				   T53 = T51 + T52;
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 15)] = T53 - T54;
+	       T55 = FMA(KP250000000, T53, T54);
+	       T5b = FMA(KP559016994, T56, T55);
+	       T57 = FNMS(KP559016994, T56, T55);
+	       cr[WS(rs, 19)] = -(FMA(KP951056516, T5a, T57));
+	       cr[WS(rs, 11)] = FMS(KP951056516, T5a, T57);
+	       ci[WS(rs, 16)] = FNMS(KP951056516, T5c, T5b);
+	       ci[WS(rs, 12)] = FMA(KP951056516, T5c, T5b);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {136, 38, 110, 0} };
+
+void X(codelet_hf_20) (planner *p) {
+     X(khc2hc_register) (p, hf_20, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include hf.h */
+
+/*
+ * This function contains 246 FP additions, 124 FP multiplications,
+ * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
+ * 85 stack variables, 4 constants, and 80 memory accesses
+ */
+#include "hf.h"
+
+static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
+	       E Tj, T1R, T4j, T4s, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T3J, T3D;
+	       E T3E, T44, T1V, T1W, T1X, T2e, T2j, T2k, T2W, T2X, T4f, T33, T34, T35, T2J;
+	       E T2O, T4q, TG, T13, T14, T3p, T3s, T3K, T3A, T3B, T43, T1S, T1T, T1U, T23;
+	       E T28, T29, T2T, T2U, T4e, T30, T31, T32, T2y, T2D, T4p;
+	       {
+		    E T1, T3N, T6, T3M, Tc, T2n, Th, T2o;
+		    T1 = cr[0];
+		    T3N = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 10)];
+			 T5 = ci[WS(rs, 10)];
+			 T2 = W[18];
+			 T4 = W[19];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T3M = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 5)];
+			 Tb = ci[WS(rs, 5)];
+			 T8 = W[8];
+			 Ta = W[9];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T2n = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 15)];
+			 Tg = ci[WS(rs, 15)];
+			 Td = W[28];
+			 Tf = W[29];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T2o = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T4h, T4i;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 - Ti;
+			 T1R = T7 + Ti;
+			 T4h = T3N - T3M;
+			 T4i = Tc - Th;
+			 T4j = T4h - T4i;
+			 T4s = T4i + T4h;
+		    }
+		    {
+			 E T2m, T2p, T3O, T3P;
+			 T2m = T1 - T6;
+			 T2p = T2n - T2o;
+			 T2q = T2m - T2p;
+			 T37 = T2m + T2p;
+			 T3O = T3M + T3N;
+			 T3P = T2n + T2o;
+			 T3Q = T3O - T3P;
+			 T42 = T3P + T3O;
+		    }
+	       }
+	       {
+		    E T1f, T3g, T2a, T2H, T1N, T3j, T2i, T2N, T1q, T3h, T2d, T2I, T1C, T3k, T2f;
+		    E T2M;
+		    {
+			 E T19, T2F, T1e, T2G;
+			 {
+			      E T16, T18, T15, T17;
+			      T16 = cr[WS(rs, 8)];
+			      T18 = ci[WS(rs, 8)];
+			      T15 = W[14];
+			      T17 = W[15];
+			      T19 = FMA(T15, T16, T17 * T18);
+			      T2F = FNMS(T17, T16, T15 * T18);
+			 }
+			 {
+			      E T1b, T1d, T1a, T1c;
+			      T1b = cr[WS(rs, 18)];
+			      T1d = ci[WS(rs, 18)];
+			      T1a = W[34];
+			      T1c = W[35];
+			      T1e = FMA(T1a, T1b, T1c * T1d);
+			      T2G = FNMS(T1c, T1b, T1a * T1d);
+			 }
+			 T1f = T19 + T1e;
+			 T3g = T2F + T2G;
+			 T2a = T19 - T1e;
+			 T2H = T2F - T2G;
+		    }
+		    {
+			 E T1H, T2g, T1M, T2h;
+			 {
+			      E T1E, T1G, T1D, T1F;
+			      T1E = cr[WS(rs, 17)];
+			      T1G = ci[WS(rs, 17)];
+			      T1D = W[32];
+			      T1F = W[33];
+			      T1H = FMA(T1D, T1E, T1F * T1G);
+			      T2g = FNMS(T1F, T1E, T1D * T1G);
+			 }
+			 {
+			      E T1J, T1L, T1I, T1K;
+			      T1J = cr[WS(rs, 7)];
+			      T1L = ci[WS(rs, 7)];
+			      T1I = W[12];
+			      T1K = W[13];
+			      T1M = FMA(T1I, T1J, T1K * T1L);
+			      T2h = FNMS(T1K, T1J, T1I * T1L);
+			 }
+			 T1N = T1H + T1M;
+			 T3j = T2g + T2h;
+			 T2i = T2g - T2h;
+			 T2N = T1H - T1M;
+		    }
+		    {
+			 E T1k, T2b, T1p, T2c;
+			 {
+			      E T1h, T1j, T1g, T1i;
+			      T1h = cr[WS(rs, 13)];
+			      T1j = ci[WS(rs, 13)];
+			      T1g = W[24];
+			      T1i = W[25];
+			      T1k = FMA(T1g, T1h, T1i * T1j);
+			      T2b = FNMS(T1i, T1h, T1g * T1j);
+			 }
+			 {
+			      E T1m, T1o, T1l, T1n;
+			      T1m = cr[WS(rs, 3)];
+			      T1o = ci[WS(rs, 3)];
+			      T1l = W[4];
+			      T1n = W[5];
+			      T1p = FMA(T1l, T1m, T1n * T1o);
+			      T2c = FNMS(T1n, T1m, T1l * T1o);
+			 }
+			 T1q = T1k + T1p;
+			 T3h = T2b + T2c;
+			 T2d = T2b - T2c;
+			 T2I = T1k - T1p;
+		    }
+		    {
+			 E T1w, T2K, T1B, T2L;
+			 {
+			      E T1t, T1v, T1s, T1u;
+			      T1t = cr[WS(rs, 12)];
+			      T1v = ci[WS(rs, 12)];
+			      T1s = W[22];
+			      T1u = W[23];
+			      T1w = FMA(T1s, T1t, T1u * T1v);
+			      T2K = FNMS(T1u, T1t, T1s * T1v);
+			 }
+			 {
+			      E T1y, T1A, T1x, T1z;
+			      T1y = cr[WS(rs, 2)];
+			      T1A = ci[WS(rs, 2)];
+			      T1x = W[2];
+			      T1z = W[3];
+			      T1B = FMA(T1x, T1y, T1z * T1A);
+			      T2L = FNMS(T1z, T1y, T1x * T1A);
+			 }
+			 T1C = T1w + T1B;
+			 T3k = T2K + T2L;
+			 T2f = T1w - T1B;
+			 T2M = T2K - T2L;
+		    }
+		    T1r = T1f - T1q;
+		    T1O = T1C - T1N;
+		    T1P = T1r + T1O;
+		    T3i = T3g - T3h;
+		    T3l = T3j - T3k;
+		    T3J = T3l - T3i;
+		    T3D = T3g + T3h;
+		    T3E = T3k + T3j;
+		    T44 = T3D + T3E;
+		    T1V = T1f + T1q;
+		    T1W = T1C + T1N;
+		    T1X = T1V + T1W;
+		    T2e = T2a - T2d;
+		    T2j = T2f - T2i;
+		    T2k = T2e + T2j;
+		    T2W = T2H - T2I;
+		    T2X = T2M - T2N;
+		    T4f = T2W + T2X;
+		    T33 = T2a + T2d;
+		    T34 = T2f + T2i;
+		    T35 = T33 + T34;
+		    T2J = T2H + T2I;
+		    T2O = T2M + T2N;
+		    T4q = T2J + T2O;
+	       }
+	       {
+		    E Tu, T3n, T1Z, T2w, T12, T3r, T27, T2z, TF, T3o, T22, T2x, TR, T3q, T24;
+		    E T2C;
+		    {
+			 E To, T2u, Tt, T2v;
+			 {
+			      E Tl, Tn, Tk, Tm;
+			      Tl = cr[WS(rs, 4)];
+			      Tn = ci[WS(rs, 4)];
+			      Tk = W[6];
+			      Tm = W[7];
+			      To = FMA(Tk, Tl, Tm * Tn);
+			      T2u = FNMS(Tm, Tl, Tk * Tn);
+			 }
+			 {
+			      E Tq, Ts, Tp, Tr;
+			      Tq = cr[WS(rs, 14)];
+			      Ts = ci[WS(rs, 14)];
+			      Tp = W[26];
+			      Tr = W[27];
+			      Tt = FMA(Tp, Tq, Tr * Ts);
+			      T2v = FNMS(Tr, Tq, Tp * Ts);
+			 }
+			 Tu = To + Tt;
+			 T3n = T2u + T2v;
+			 T1Z = To - Tt;
+			 T2w = T2u - T2v;
+		    }
+		    {
+			 E TW, T25, T11, T26;
+			 {
+			      E TT, TV, TS, TU;
+			      TT = cr[WS(rs, 1)];
+			      TV = ci[WS(rs, 1)];
+			      TS = W[0];
+			      TU = W[1];
+			      TW = FMA(TS, TT, TU * TV);
+			      T25 = FNMS(TU, TT, TS * TV);
+			 }
+			 {
+			      E TY, T10, TX, TZ;
+			      TY = cr[WS(rs, 11)];
+			      T10 = ci[WS(rs, 11)];
+			      TX = W[20];
+			      TZ = W[21];
+			      T11 = FMA(TX, TY, TZ * T10);
+			      T26 = FNMS(TZ, TY, TX * T10);
+			 }
+			 T12 = TW + T11;
+			 T3r = T25 + T26;
+			 T27 = T25 - T26;
+			 T2z = T11 - TW;
+		    }
+		    {
+			 E Tz, T20, TE, T21;
+			 {
+			      E Tw, Ty, Tv, Tx;
+			      Tw = cr[WS(rs, 9)];
+			      Ty = ci[WS(rs, 9)];
+			      Tv = W[16];
+			      Tx = W[17];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T20 = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 {
+			      E TB, TD, TA, TC;
+			      TB = cr[WS(rs, 19)];
+			      TD = ci[WS(rs, 19)];
+			      TA = W[36];
+			      TC = W[37];
+			      TE = FMA(TA, TB, TC * TD);
+			      T21 = FNMS(TC, TB, TA * TD);
+			 }
+			 TF = Tz + TE;
+			 T3o = T20 + T21;
+			 T22 = T20 - T21;
+			 T2x = Tz - TE;
+		    }
+		    {
+			 E TL, T2A, TQ, T2B;
+			 {
+			      E TI, TK, TH, TJ;
+			      TI = cr[WS(rs, 16)];
+			      TK = ci[WS(rs, 16)];
+			      TH = W[30];
+			      TJ = W[31];
+			      TL = FMA(TH, TI, TJ * TK);
+			      T2A = FNMS(TJ, TI, TH * TK);
+			 }
+			 {
+			      E TN, TP, TM, TO;
+			      TN = cr[WS(rs, 6)];
+			      TP = ci[WS(rs, 6)];
+			      TM = W[10];
+			      TO = W[11];
+			      TQ = FMA(TM, TN, TO * TP);
+			      T2B = FNMS(TO, TN, TM * TP);
+			 }
+			 TR = TL + TQ;
+			 T3q = T2A + T2B;
+			 T24 = TL - TQ;
+			 T2C = T2A - T2B;
+		    }
+		    TG = Tu - TF;
+		    T13 = TR - T12;
+		    T14 = TG + T13;
+		    T3p = T3n - T3o;
+		    T3s = T3q - T3r;
+		    T3K = T3p + T3s;
+		    T3A = T3n + T3o;
+		    T3B = T3q + T3r;
+		    T43 = T3A + T3B;
+		    T1S = Tu + TF;
+		    T1T = TR + T12;
+		    T1U = T1S + T1T;
+		    T23 = T1Z - T22;
+		    T28 = T24 - T27;
+		    T29 = T23 + T28;
+		    T2T = T2w - T2x;
+		    T2U = T2C + T2z;
+		    T4e = T2T + T2U;
+		    T30 = T1Z + T22;
+		    T31 = T24 + T27;
+		    T32 = T30 + T31;
+		    T2y = T2w + T2x;
+		    T2D = T2z - T2C;
+		    T4p = T2D - T2y;
+	       }
+	       {
+		    E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
+		    T3e = KP559016994 * (T14 - T1P);
+		    T1Q = T14 + T1P;
+		    T3d = FNMS(KP250000000, T1Q, Tj);
+		    T3m = T3i + T3l;
+		    T3t = T3p - T3s;
+		    T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
+		    T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
+		    ci[WS(rs, 9)] = Tj + T1Q;
+		    T3v = T3e + T3d;
+		    ci[WS(rs, 5)] = T3v - T3w;
+		    cr[WS(rs, 6)] = T3v + T3w;
+		    T3f = T3d - T3e;
+		    cr[WS(rs, 2)] = T3f - T3u;
+		    ci[WS(rs, 1)] = T3f + T3u;
+	       }
+	       {
+		    E T36, T38, T39, T2Z, T3c, T2V, T2Y, T3b, T3a;
+		    T36 = KP559016994 * (T32 - T35);
+		    T38 = T32 + T35;
+		    T39 = FNMS(KP250000000, T38, T37);
+		    T2V = T2T - T2U;
+		    T2Y = T2W - T2X;
+		    T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
+		    T3c = FNMS(KP587785252, T2V, KP951056516 * T2Y);
+		    cr[WS(rs, 5)] = T37 + T38;
+		    T3b = T39 - T36;
+		    ci[WS(rs, 2)] = T3b - T3c;
+		    ci[WS(rs, 6)] = T3c + T3b;
+		    T3a = T36 + T39;
+		    cr[WS(rs, 1)] = T2Z + T3a;
+		    cr[WS(rs, 9)] = T3a - T2Z;
+	       }
+	       {
+		    E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
+		    T3x = KP559016994 * (T1U - T1X);
+		    T1Y = T1U + T1X;
+		    T3y = FNMS(KP250000000, T1Y, T1R);
+		    T3C = T3A - T3B;
+		    T3F = T3D - T3E;
+		    T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
+		    T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
+		    cr[0] = T1R + T1Y;
+		    T3H = T3y - T3x;
+		    ci[WS(rs, 7)] = T3H - T3I;
+		    cr[WS(rs, 8)] = T3H + T3I;
+		    T3z = T3x + T3y;
+		    cr[WS(rs, 4)] = T3z - T3G;
+		    ci[WS(rs, 3)] = T3z + T3G;
+	       }
+	       {
+		    E T2l, T2r, T2s, T2Q, T2R, T2E, T2P, T2S, T2t;
+		    T2l = KP559016994 * (T29 - T2k);
+		    T2r = T29 + T2k;
+		    T2s = FNMS(KP250000000, T2r, T2q);
+		    T2E = T2y + T2D;
+		    T2P = T2J - T2O;
+		    T2Q = FMA(KP951056516, T2E, KP587785252 * T2P);
+		    T2R = FNMS(KP587785252, T2E, KP951056516 * T2P);
+		    ci[WS(rs, 4)] = T2q + T2r;
+		    T2S = T2s - T2l;
+		    cr[WS(rs, 3)] = T2R + T2S;
+		    cr[WS(rs, 7)] = T2S - T2R;
+		    T2t = T2l + T2s;
+		    ci[0] = T2t - T2Q;
+		    ci[WS(rs, 8)] = T2Q + T2t;
+	       }
+	       {
+		    E T3U, T3L, T3V, T3T, T3X, T3R, T3S, T3Y, T3W;
+		    T3U = KP559016994 * (T3K + T3J);
+		    T3L = T3J - T3K;
+		    T3V = FMA(KP250000000, T3L, T3Q);
+		    T3R = T13 - TG;
+		    T3S = T1r - T1O;
+		    T3T = FNMS(KP587785252, T3S, KP951056516 * T3R);
+		    T3X = FMA(KP587785252, T3R, KP951056516 * T3S);
+		    cr[WS(rs, 10)] = T3L - T3Q;
+		    T3Y = T3V - T3U;
+		    cr[WS(rs, 18)] = T3X - T3Y;
+		    ci[WS(rs, 17)] = T3X + T3Y;
+		    T3W = T3U + T3V;
+		    cr[WS(rs, 14)] = T3T - T3W;
+		    ci[WS(rs, 13)] = T3T + T3W;
+	       }
+	       {
+		    E T4g, T4k, T4l, T4d, T4n, T4b, T4c, T4o, T4m;
+		    T4g = KP559016994 * (T4e - T4f);
+		    T4k = T4e + T4f;
+		    T4l = FNMS(KP250000000, T4k, T4j);
+		    T4b = T33 - T34;
+		    T4c = T30 - T31;
+		    T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
+		    T4n = FMA(KP951056516, T4c, KP587785252 * T4b);
+		    ci[WS(rs, 14)] = T4k + T4j;
+		    T4o = T4g + T4l;
+		    ci[WS(rs, 10)] = T4n + T4o;
+		    ci[WS(rs, 18)] = T4o - T4n;
+		    T4m = T4g - T4l;
+		    cr[WS(rs, 13)] = T4d + T4m;
+		    cr[WS(rs, 17)] = T4m - T4d;
+	       }
+	       {
+		    E T47, T45, T46, T41, T49, T3Z, T40, T4a, T48;
+		    T47 = KP559016994 * (T43 - T44);
+		    T45 = T43 + T44;
+		    T46 = FNMS(KP250000000, T45, T42);
+		    T3Z = T1S - T1T;
+		    T40 = T1V - T1W;
+		    T41 = FNMS(KP951056516, T40, KP587785252 * T3Z);
+		    T49 = FMA(KP951056516, T3Z, KP587785252 * T40);
+		    ci[WS(rs, 19)] = T45 + T42;
+		    T4a = T47 + T46;
+		    cr[WS(rs, 16)] = T49 - T4a;
+		    ci[WS(rs, 15)] = T49 + T4a;
+		    T48 = T46 - T47;
+		    cr[WS(rs, 12)] = T41 - T48;
+		    ci[WS(rs, 11)] = T41 + T48;
+	       }
+	       {
+		    E T4w, T4r, T4x, T4v, T4z, T4t, T4u, T4A, T4y;
+		    T4w = KP559016994 * (T4p + T4q);
+		    T4r = T4p - T4q;
+		    T4x = FMA(KP250000000, T4r, T4s);
+		    T4t = T23 - T28;
+		    T4u = T2e - T2j;
+		    T4v = FMA(KP951056516, T4t, KP587785252 * T4u);
+		    T4z = FNMS(KP587785252, T4t, KP951056516 * T4u);
+		    cr[WS(rs, 15)] = T4r - T4s;
+		    T4A = T4w + T4x;
+		    ci[WS(rs, 12)] = T4z + T4A;
+		    ci[WS(rs, 16)] = T4A - T4z;
+		    T4y = T4w - T4x;
+		    cr[WS(rs, 11)] = T4v + T4y;
+		    cr[WS(rs, 19)] = T4y - T4v;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 20},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {184, 62, 62, 0} };
+
+void X(codelet_hf_20) (planner *p) {
+     X(khc2hc_register) (p, hf_20, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1573 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:01 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 25 -dit -name hf_25 -include hf.h */
+
+/*
+ * This function contains 400 FP additions, 364 FP multiplications,
+ * (or, 84 additions, 48 multiplications, 316 fused multiply/add),
+ * 178 stack variables, 47 constants, and 100 memory accesses
+ */
+#include "hf.h"
+
+static void hf_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP621716863, +0.621716863012209892444754556304102309693593202);
+     DK(KP614372930, +0.614372930789563808870829930444362096004872855);
+     DK(KP557913902, +0.557913902031834264187699648465567037992437152);
+     DK(KP249506682, +0.249506682107067890488084201715862638334226305);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP968479752, +0.968479752739016373193524836781420152702090879);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP994076283, +0.994076283785401014123185814696322018529298887);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP062914667, +0.062914667253649757225485955897349402364686947);
+     DK(KP833417178, +0.833417178328688677408962550243238843138996060);
+     DK(KP921177326, +0.921177326965143320250447435415066029359282231);
+     DK(KP541454447, +0.541454447536312777046285590082819509052033189);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP943557151, +0.943557151597354104399655195398983005179443399);
+     DK(KP554608978, +0.554608978404018097464974850792216217022558774);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP525970792, +0.525970792408939708442463226536226366643874659);
+     DK(KP726211448, +0.726211448929902658173535992263577167607493062);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP871714437, +0.871714437527667770979999223229522602943903653);
+     DK(KP549754652, +0.549754652192770074288023275540779861653779767);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP939062505, +0.939062505817492352556001843133229685779824606);
+     DK(KP256756360, +0.256756360367726783319498520922669048172391148);
+     DK(KP851038619, +0.851038619207379630836264138867114231259902550);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP912018591, +0.912018591466481957908415381764119056233607330);
+     DK(KP634619297, +0.634619297544148100711287640319130485732531031);
+     DK(KP470564281, +0.470564281212251493087595091036643380879947982);
+     DK(KP827271945, +0.827271945972475634034355757144307982555673741);
+     DK(KP126329378, +0.126329378446108174786050455341811215027378105);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 48); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T7i, T6o, T6m, T7o, T7m, T7h, T6n, T6f, T7j, T7n;
+	       {
+		    E T6W, T5G, T3Y, T3M, T7q, T70, T6V, T7P, Tt, T3L, T5T, T45, T5Q, T4c, T3G;
+		    E T2G, T5P, T49, T5S, T42, T65, T4H, T68, T4A, T2Z, T11, T67, T4x, T64, T4E;
+		    E T5Y, T4W, T61, T4P, T3d, T1z, T60, T4M, T5X, T4T, T3g, T1G, T3q, T4q, T4j;
+		    E T26, T3i, T1M, T3k, T1S;
+		    {
+			 E T3u, T2e, T3E, T44, T4b, T2E, T3w, T2k, T3y, T2q;
+			 {
+			      E T1, T6R, T3P, T7, T3W, Tq, T9, Tc, Tb, T3U, Tk, T3Q, Ta;
+			      {
+				   E T3, T6, T2, T5;
+				   T1 = cr[0];
+				   T6R = ci[0];
+				   T3 = cr[WS(rs, 5)];
+				   T6 = ci[WS(rs, 5)];
+				   T2 = W[8];
+				   T5 = W[9];
+				   {
+					E Tm, Tp, To, T3V, Tn, T3O, T4, Tl;
+					Tm = cr[WS(rs, 15)];
+					Tp = ci[WS(rs, 15)];
+					T3O = T2 * T6;
+					T4 = T2 * T3;
+					Tl = W[28];
+					To = W[29];
+					T3P = FNMS(T5, T3, T3O);
+					T7 = FMA(T5, T6, T4);
+					T3V = Tl * Tp;
+					Tn = Tl * Tm;
+					{
+					     E Tg, Tj, Tf, Ti, T3T, Th, T8;
+					     Tg = cr[WS(rs, 10)];
+					     Tj = ci[WS(rs, 10)];
+					     T3W = FNMS(To, Tm, T3V);
+					     Tq = FMA(To, Tp, Tn);
+					     Tf = W[18];
+					     Ti = W[19];
+					     T9 = cr[WS(rs, 20)];
+					     Tc = ci[WS(rs, 20)];
+					     T3T = Tf * Tj;
+					     Th = Tf * Tg;
+					     T8 = W[38];
+					     Tb = W[39];
+					     T3U = FNMS(Ti, Tg, T3T);
+					     Tk = FMA(Ti, Tj, Th);
+					     T3Q = T8 * Tc;
+					     Ta = T8 * T9;
+					}
+				   }
+			      }
+			      {
+				   E T6T, T3X, T6Y, Tr, T3R, Td;
+				   T6T = T3U + T3W;
+				   T3X = T3U - T3W;
+				   T6Y = Tk - Tq;
+				   Tr = Tk + Tq;
+				   T3R = FNMS(Tb, T9, T3Q);
+				   Td = FMA(Tb, Tc, Ta);
+				   {
+					E T3S, T6Z, Te, T6U, T6S, Ts;
+					T3S = T3P - T3R;
+					T6S = T3P + T3R;
+					T6Z = T7 - Td;
+					Te = T7 + Td;
+					T6W = T6S - T6T;
+					T6U = T6S + T6T;
+					T5G = FNMS(KP618033988, T3S, T3X);
+					T3Y = FMA(KP618033988, T3X, T3S);
+					T3M = Te - Tr;
+					Ts = Te + Tr;
+					T7q = FMA(KP618033988, T6Y, T6Z);
+					T70 = FNMS(KP618033988, T6Z, T6Y);
+					T6V = FNMS(KP250000000, T6U, T6R);
+					T7P = T6U + T6R;
+					Tt = T1 + Ts;
+					T3L = FNMS(KP250000000, Ts, T1);
+				   }
+			      }
+			 }
+			 {
+			      E T2g, T2j, T2m, T3v, T2h, T2p, T2l, T2i, T2o, T3x, T2n;
+			      {
+				   E T2a, T2d, T29, T2c;
+				   T2a = cr[WS(rs, 3)];
+				   T2d = ci[WS(rs, 3)];
+				   T29 = W[4];
+				   T2c = W[5];
+				   {
+					E T2t, T2w, T2z, T3A, T2u, T2C, T2y, T2v, T2B, T3t, T2b, T2s, T2f;
+					T2t = cr[WS(rs, 13)];
+					T2w = ci[WS(rs, 13)];
+					T3t = T29 * T2d;
+					T2b = T29 * T2a;
+					T2s = W[24];
+					T2z = cr[WS(rs, 18)];
+					T3u = FNMS(T2c, T2a, T3t);
+					T2e = FMA(T2c, T2d, T2b);
+					T3A = T2s * T2w;
+					T2u = T2s * T2t;
+					T2C = ci[WS(rs, 18)];
+					T2y = W[34];
+					T2v = W[25];
+					T2B = W[35];
+					{
+					     E T3B, T2x, T3D, T2D, T3C, T2A;
+					     T2g = cr[WS(rs, 8)];
+					     T3C = T2y * T2C;
+					     T2A = T2y * T2z;
+					     T3B = FNMS(T2v, T2t, T3A);
+					     T2x = FMA(T2v, T2w, T2u);
+					     T3D = FNMS(T2B, T2z, T3C);
+					     T2D = FMA(T2B, T2C, T2A);
+					     T2j = ci[WS(rs, 8)];
+					     T2f = W[14];
+					     T3E = T3B + T3D;
+					     T44 = T3D - T3B;
+					     T4b = T2x - T2D;
+					     T2E = T2x + T2D;
+					}
+					T2m = cr[WS(rs, 23)];
+					T3v = T2f * T2j;
+					T2h = T2f * T2g;
+					T2p = ci[WS(rs, 23)];
+					T2l = W[44];
+					T2i = W[15];
+					T2o = W[45];
+				   }
+			      }
+			      T3x = T2l * T2p;
+			      T2n = T2l * T2m;
+			      T3w = FNMS(T2i, T2g, T3v);
+			      T2k = FMA(T2i, T2j, T2h);
+			      T3y = FNMS(T2o, T2m, T3x);
+			      T2q = FMA(T2o, T2p, T2n);
+			 }
+			 {
+			      E T2N, Tz, T2X, T4G, T4z, TZ, T2P, TF, T2R, TL;
+			      {
+				   E TB, TE, TH, T2O, TC, TK, TG, TD, TJ, T2Q, TI;
+				   {
+					E Tv, Ty, Tu, Tx;
+					{
+					     E T48, T41, T47, T40, T43, T3z;
+					     Tv = cr[WS(rs, 1)];
+					     T43 = T3y - T3w;
+					     T3z = T3w + T3y;
+					     {
+						  E T4a, T2r, T3F, T2F;
+						  T4a = T2k - T2q;
+						  T2r = T2k + T2q;
+						  T5T = FNMS(KP618033988, T43, T44);
+						  T45 = FMA(KP618033988, T44, T43);
+						  T3F = T3z + T3E;
+						  T48 = T3E - T3z;
+						  T5Q = FNMS(KP618033988, T4a, T4b);
+						  T4c = FMA(KP618033988, T4b, T4a);
+						  T2F = T2r + T2E;
+						  T41 = T2E - T2r;
+						  T3G = T3u + T3F;
+						  T47 = FNMS(KP250000000, T3F, T3u);
+						  T2G = T2e + T2F;
+						  T40 = FNMS(KP250000000, T2F, T2e);
+						  Ty = ci[WS(rs, 1)];
+					     }
+					     T5P = FMA(KP559016994, T48, T47);
+					     T49 = FNMS(KP559016994, T48, T47);
+					     T5S = FMA(KP559016994, T41, T40);
+					     T42 = FNMS(KP559016994, T41, T40);
+					     Tu = W[0];
+					}
+					Tx = W[1];
+					{
+					     E TO, TR, TU, T2T, TP, TX, TT, TQ, TW, T2M, Tw, TN, TA;
+					     TO = cr[WS(rs, 11)];
+					     TR = ci[WS(rs, 11)];
+					     T2M = Tu * Ty;
+					     Tw = Tu * Tv;
+					     TN = W[20];
+					     TU = cr[WS(rs, 16)];
+					     T2N = FNMS(Tx, Tv, T2M);
+					     Tz = FMA(Tx, Ty, Tw);
+					     T2T = TN * TR;
+					     TP = TN * TO;
+					     TX = ci[WS(rs, 16)];
+					     TT = W[30];
+					     TQ = W[21];
+					     TW = W[31];
+					     {
+						  E T2U, TS, T2W, TY, T2V, TV;
+						  TB = cr[WS(rs, 6)];
+						  T2V = TT * TX;
+						  TV = TT * TU;
+						  T2U = FNMS(TQ, TO, T2T);
+						  TS = FMA(TQ, TR, TP);
+						  T2W = FNMS(TW, TU, T2V);
+						  TY = FMA(TW, TX, TV);
+						  TE = ci[WS(rs, 6)];
+						  TA = W[10];
+						  T2X = T2U + T2W;
+						  T4G = T2W - T2U;
+						  T4z = TY - TS;
+						  TZ = TS + TY;
+					     }
+					     TH = cr[WS(rs, 21)];
+					     T2O = TA * TE;
+					     TC = TA * TB;
+					     TK = ci[WS(rs, 21)];
+					     TG = W[40];
+					     TD = W[11];
+					     TJ = W[41];
+					}
+				   }
+				   T2Q = TG * TK;
+				   TI = TG * TH;
+				   T2P = FNMS(TD, TB, T2O);
+				   TF = FMA(TD, TE, TC);
+				   T2R = FNMS(TJ, TH, T2Q);
+				   TL = FMA(TJ, TK, TI);
+			      }
+			      {
+				   E T31, T17, T3b, T4V, T4O, T1x, T33, T1d, T35, T1j;
+				   {
+					E T19, T1c, T1f, T32, T1a, T1i, T1e, T1b, T1h, T34, T1g;
+					{
+					     E T13, T16, T12, T15;
+					     {
+						  E T4w, T4D, T4v, T4C, T4F, T2S;
+						  T13 = cr[WS(rs, 4)];
+						  T4F = T2P - T2R;
+						  T2S = T2P + T2R;
+						  {
+						       E T4y, TM, T2Y, T10;
+						       T4y = TL - TF;
+						       TM = TF + TL;
+						       T65 = FMA(KP618033988, T4F, T4G);
+						       T4H = FNMS(KP618033988, T4G, T4F);
+						       T2Y = T2S + T2X;
+						       T4w = T2S - T2X;
+						       T68 = FNMS(KP618033988, T4y, T4z);
+						       T4A = FMA(KP618033988, T4z, T4y);
+						       T10 = TM + TZ;
+						       T4D = TM - TZ;
+						       T2Z = T2N + T2Y;
+						       T4v = FNMS(KP250000000, T2Y, T2N);
+						       T11 = Tz + T10;
+						       T4C = FNMS(KP250000000, T10, Tz);
+						       T16 = ci[WS(rs, 4)];
+						  }
+						  T67 = FNMS(KP559016994, T4w, T4v);
+						  T4x = FMA(KP559016994, T4w, T4v);
+						  T64 = FNMS(KP559016994, T4D, T4C);
+						  T4E = FMA(KP559016994, T4D, T4C);
+						  T12 = W[6];
+					     }
+					     T15 = W[7];
+					     {
+						  E T1m, T1p, T1s, T37, T1n, T1v, T1r, T1o, T1u, T30, T14, T1l, T18;
+						  T1m = cr[WS(rs, 14)];
+						  T1p = ci[WS(rs, 14)];
+						  T30 = T12 * T16;
+						  T14 = T12 * T13;
+						  T1l = W[26];
+						  T1s = cr[WS(rs, 19)];
+						  T31 = FNMS(T15, T13, T30);
+						  T17 = FMA(T15, T16, T14);
+						  T37 = T1l * T1p;
+						  T1n = T1l * T1m;
+						  T1v = ci[WS(rs, 19)];
+						  T1r = W[36];
+						  T1o = W[27];
+						  T1u = W[37];
+						  {
+						       E T38, T1q, T3a, T1w, T39, T1t;
+						       T19 = cr[WS(rs, 9)];
+						       T39 = T1r * T1v;
+						       T1t = T1r * T1s;
+						       T38 = FNMS(T1o, T1m, T37);
+						       T1q = FMA(T1o, T1p, T1n);
+						       T3a = FNMS(T1u, T1s, T39);
+						       T1w = FMA(T1u, T1v, T1t);
+						       T1c = ci[WS(rs, 9)];
+						       T18 = W[16];
+						       T3b = T38 + T3a;
+						       T4V = T3a - T38;
+						       T4O = T1w - T1q;
+						       T1x = T1q + T1w;
+						  }
+						  T1f = cr[WS(rs, 24)];
+						  T32 = T18 * T1c;
+						  T1a = T18 * T19;
+						  T1i = ci[WS(rs, 24)];
+						  T1e = W[46];
+						  T1b = W[17];
+						  T1h = W[47];
+					     }
+					}
+					T34 = T1e * T1i;
+					T1g = T1e * T1f;
+					T33 = FNMS(T1b, T19, T32);
+					T1d = FMA(T1b, T1c, T1a);
+					T35 = FNMS(T1h, T1f, T34);
+					T1j = FMA(T1h, T1i, T1g);
+				   }
+				   {
+					E T1I, T1L, T1O, T3h, T1J, T1R, T1N, T1K, T1Q, T3j, T1P;
+					{
+					     E T1C, T1F, T1B, T1E;
+					     {
+						  E T4L, T4S, T4K, T4R, T4U, T36;
+						  T1C = cr[WS(rs, 2)];
+						  T4U = T35 - T33;
+						  T36 = T33 + T35;
+						  {
+						       E T4N, T1k, T3c, T1y;
+						       T4N = T1j - T1d;
+						       T1k = T1d + T1j;
+						       T5Y = FNMS(KP618033988, T4U, T4V);
+						       T4W = FMA(KP618033988, T4V, T4U);
+						       T3c = T36 + T3b;
+						       T4L = T3b - T36;
+						       T61 = FNMS(KP618033988, T4N, T4O);
+						       T4P = FMA(KP618033988, T4O, T4N);
+						       T1y = T1k + T1x;
+						       T4S = T1k - T1x;
+						       T3d = T31 + T3c;
+						       T4K = FNMS(KP250000000, T3c, T31);
+						       T1z = T17 + T1y;
+						       T4R = FNMS(KP250000000, T1y, T17);
+						       T1F = ci[WS(rs, 2)];
+						  }
+						  T60 = FMA(KP559016994, T4L, T4K);
+						  T4M = FNMS(KP559016994, T4L, T4K);
+						  T5X = FNMS(KP559016994, T4S, T4R);
+						  T4T = FMA(KP559016994, T4S, T4R);
+						  T1B = W[2];
+					     }
+					     T1E = W[3];
+					     {
+						  E T1V, T1Y, T21, T3m, T1W, T24, T20, T1X, T23, T3f, T1D, T1U, T1H;
+						  T1V = cr[WS(rs, 12)];
+						  T1Y = ci[WS(rs, 12)];
+						  T3f = T1B * T1F;
+						  T1D = T1B * T1C;
+						  T1U = W[22];
+						  T21 = cr[WS(rs, 17)];
+						  T3g = FNMS(T1E, T1C, T3f);
+						  T1G = FMA(T1E, T1F, T1D);
+						  T3m = T1U * T1Y;
+						  T1W = T1U * T1V;
+						  T24 = ci[WS(rs, 17)];
+						  T20 = W[32];
+						  T1X = W[23];
+						  T23 = W[33];
+						  {
+						       E T3n, T1Z, T3p, T25, T3o, T22;
+						       T1I = cr[WS(rs, 7)];
+						       T3o = T20 * T24;
+						       T22 = T20 * T21;
+						       T3n = FNMS(T1X, T1V, T3m);
+						       T1Z = FMA(T1X, T1Y, T1W);
+						       T3p = FNMS(T23, T21, T3o);
+						       T25 = FMA(T23, T24, T22);
+						       T1L = ci[WS(rs, 7)];
+						       T1H = W[12];
+						       T3q = T3n + T3p;
+						       T4q = T3n - T3p;
+						       T4j = T25 - T1Z;
+						       T26 = T1Z + T25;
+						  }
+						  T1O = cr[WS(rs, 22)];
+						  T3h = T1H * T1L;
+						  T1J = T1H * T1I;
+						  T1R = ci[WS(rs, 22)];
+						  T1N = W[42];
+						  T1K = W[13];
+						  T1Q = W[43];
+					     }
+					}
+					T3j = T1N * T1R;
+					T1P = T1N * T1O;
+					T3i = FNMS(T1K, T1I, T3h);
+					T1M = FMA(T1K, T1L, T1J);
+					T3k = FNMS(T1Q, T1O, T3j);
+					T1S = FMA(T1Q, T1R, T1P);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T7Q, T5M, T5J, T7R, T5I, T5L, T7X, T7W, T5F, T6X, T5u, T7M, T7O, T5C, T5E;
+			 E T5t, T7J, T7N;
+			 {
+			      E T4r, T4k, T4h, T4o, T3K, T3I, T1A, T2H, T28;
+			      {
+				   E T3e, T4g, T4n, T4f, T4m, T3H, T4p, T3l;
+				   T7Q = T2Z + T3d;
+				   T3e = T2Z - T3d;
+				   T4p = T3k - T3i;
+				   T3l = T3i + T3k;
+				   {
+					E T4i, T1T, T3r, T27, T3s;
+					T4i = T1S - T1M;
+					T1T = T1M + T1S;
+					T5M = FMA(KP618033988, T4p, T4q);
+					T4r = FNMS(KP618033988, T4q, T4p);
+					T3r = T3l + T3q;
+					T4g = T3q - T3l;
+					T5J = FNMS(KP618033988, T4i, T4j);
+					T4k = FMA(KP618033988, T4j, T4i);
+					T27 = T1T + T26;
+					T4n = T26 - T1T;
+					T3s = T3g + T3r;
+					T4f = FNMS(KP250000000, T3r, T3g);
+					T28 = T1G + T27;
+					T4m = FNMS(KP250000000, T27, T1G);
+					T3H = T3s - T3G;
+					T7R = T3s + T3G;
+				   }
+				   T5I = FMA(KP559016994, T4g, T4f);
+				   T4h = FNMS(KP559016994, T4g, T4f);
+				   T5L = FMA(KP559016994, T4n, T4m);
+				   T4o = FNMS(KP559016994, T4n, T4m);
+				   T3K = FNMS(KP618033988, T3e, T3H);
+				   T3I = FMA(KP618033988, T3H, T3e);
+			      }
+			      T1A = T11 + T1z;
+			      T7X = T1z - T11;
+			      T7W = T28 - T2G;
+			      T2H = T28 + T2G;
+			      {
+				   E T3Z, T5d, T7r, T7D, T5h, T5i, T5m, T5l, T59, T7K, T56, T7L, T7I, T7G, T52;
+				   E T50, T5w, T5g, T5q, T5A, T3N, T7p;
+				   T3N = FMA(KP559016994, T3M, T3L);
+				   T5F = FNMS(KP559016994, T3M, T3L);
+				   T6X = FNMS(KP559016994, T6W, T6V);
+				   T7p = FMA(KP559016994, T6W, T6V);
+				   {
+					E T5o, T5p, T57, T4e, T4Y, T55, T4l, T4s, T4B, T5f, T5e, T4I;
+					{
+					     E T46, T2K, T2J, T4d, T2I;
+					     T46 = FMA(KP951056516, T45, T42);
+					     T5o = FNMS(KP951056516, T45, T42);
+					     T2I = T1A + T2H;
+					     T2K = T1A - T2H;
+					     T3Z = FNMS(KP951056516, T3Y, T3N);
+					     T5d = FMA(KP951056516, T3Y, T3N);
+					     T7r = FNMS(KP951056516, T7q, T7p);
+					     T7D = FMA(KP951056516, T7q, T7p);
+					     cr[0] = Tt + T2I;
+					     T2J = FNMS(KP250000000, T2I, Tt);
+					     T5p = FNMS(KP951056516, T4c, T49);
+					     T4d = FMA(KP951056516, T4c, T49);
+					     {
+						  E T4Q, T4X, T2L, T3J;
+						  T4Q = FNMS(KP951056516, T4P, T4M);
+						  T5h = FMA(KP951056516, T4P, T4M);
+						  T5i = FNMS(KP951056516, T4W, T4T);
+						  T4X = FMA(KP951056516, T4W, T4T);
+						  T2L = FMA(KP559016994, T2K, T2J);
+						  T3J = FNMS(KP559016994, T2K, T2J);
+						  T57 = FMA(KP126329378, T46, T4d);
+						  T4e = FNMS(KP126329378, T4d, T46);
+						  cr[WS(rs, 5)] = FMA(KP951056516, T3I, T2L);
+						  ci[WS(rs, 4)] = FNMS(KP951056516, T3I, T2L);
+						  ci[WS(rs, 9)] = FMA(KP951056516, T3K, T3J);
+						  cr[WS(rs, 10)] = FNMS(KP951056516, T3K, T3J);
+						  T4Y = FMA(KP827271945, T4X, T4Q);
+						  T55 = FNMS(KP827271945, T4Q, T4X);
+					     }
+					}
+					T4l = FNMS(KP951056516, T4k, T4h);
+					T5m = FMA(KP951056516, T4k, T4h);
+					T5l = FNMS(KP951056516, T4r, T4o);
+					T4s = FMA(KP951056516, T4r, T4o);
+					T4B = FNMS(KP951056516, T4A, T4x);
+					T5f = FMA(KP951056516, T4A, T4x);
+					T5e = FMA(KP951056516, T4H, T4E);
+					T4I = FNMS(KP951056516, T4H, T4E);
+					{
+					     E T4u, T4Z, T4t, T58;
+					     T4t = FNMS(KP470564281, T4s, T4l);
+					     T58 = FMA(KP470564281, T4l, T4s);
+					     {
+						  E T4J, T54, T7E, T7F;
+						  T4J = FMA(KP634619297, T4I, T4B);
+						  T54 = FNMS(KP634619297, T4B, T4I);
+						  T59 = FNMS(KP912018591, T58, T57);
+						  T7E = FMA(KP912018591, T58, T57);
+						  T7K = FMA(KP912018591, T4t, T4e);
+						  T4u = FNMS(KP912018591, T4t, T4e);
+						  T56 = FMA(KP912575812, T55, T54);
+						  T7F = FNMS(KP912575812, T55, T54);
+						  T7L = FMA(KP912575812, T4Y, T4J);
+						  T4Z = FNMS(KP912575812, T4Y, T4J);
+						  T7I = FNMS(KP851038619, T7F, T7E);
+						  T7G = FMA(KP851038619, T7F, T7E);
+					     }
+					     T52 = FMA(KP851038619, T4Z, T4u);
+					     T50 = FNMS(KP851038619, T4Z, T4u);
+					}
+					T5w = FNMS(KP256756360, T5e, T5f);
+					T5g = FMA(KP256756360, T5f, T5e);
+					T5q = FMA(KP939062505, T5p, T5o);
+					T5A = FNMS(KP939062505, T5o, T5p);
+				   }
+				   {
+					E T5y, T7z, T5B, T7y, T7w, T7u, T5s;
+					{
+					     E T5k, T5r, T5j, T5x;
+					     cr[WS(rs, 4)] = FNMS(KP992114701, T50, T3Z);
+					     T5j = FMA(KP634619297, T5i, T5h);
+					     T5x = FNMS(KP634619297, T5h, T5i);
+					     {
+						  E T5n, T5z, T7s, T7t;
+						  T5n = FMA(KP549754652, T5m, T5l);
+						  T5z = FNMS(KP549754652, T5l, T5m);
+						  T5y = FMA(KP871714437, T5x, T5w);
+						  T7s = FNMS(KP871714437, T5x, T5w);
+						  T7z = FNMS(KP871714437, T5j, T5g);
+						  T5k = FMA(KP871714437, T5j, T5g);
+						  T5B = FNMS(KP831864738, T5A, T5z);
+						  T7t = FMA(KP831864738, T5A, T5z);
+						  T7y = FNMS(KP831864738, T5q, T5n);
+						  T5r = FMA(KP831864738, T5q, T5n);
+						  T7w = FNMS(KP904730450, T7t, T7s);
+						  T7u = FMA(KP904730450, T7t, T7s);
+					     }
+					     ci[WS(rs, 20)] = FNMS(KP992114701, T7G, T7D);
+					     T5u = FNMS(KP904730450, T5r, T5k);
+					     T5s = FMA(KP904730450, T5r, T5k);
+					}
+					{
+					     E T5a, T5c, T7A, T7C, T7v, T53, T5b, T51, T7H, T7x, T7B;
+					     T5a = FNMS(KP726211448, T59, T56);
+					     T5c = FMA(KP525970792, T56, T59);
+					     ci[WS(rs, 23)] = FMA(KP968583161, T7u, T7r);
+					     cr[WS(rs, 1)] = FMA(KP968583161, T5s, T5d);
+					     T51 = FMA(KP248028675, T50, T3Z);
+					     T7A = FNMS(KP683113946, T7z, T7y);
+					     T7C = FMA(KP559154169, T7y, T7z);
+					     T7v = FNMS(KP242145790, T7u, T7r);
+					     T53 = FMA(KP554608978, T52, T51);
+					     T5b = FNMS(KP554608978, T52, T51);
+					     T7M = FNMS(KP525970792, T7L, T7K);
+					     T7O = FMA(KP726211448, T7K, T7L);
+					     ci[WS(rs, 10)] = FNMS(KP943557151, T5c, T5b);
+					     ci[WS(rs, 5)] = FMA(KP943557151, T5c, T5b);
+					     ci[0] = FMA(KP803003575, T5a, T53);
+					     cr[WS(rs, 9)] = FNMS(KP803003575, T5a, T53);
+					     T7x = FNMS(KP541454447, T7w, T7v);
+					     T7B = FMA(KP541454447, T7w, T7v);
+					     T7H = FMA(KP248028675, T7G, T7D);
+					     cr[WS(rs, 21)] = -(FMA(KP921177326, T7C, T7B));
+					     ci[WS(rs, 18)] = FNMS(KP921177326, T7C, T7B);
+					     ci[WS(rs, 13)] = FMA(KP833417178, T7A, T7x);
+					     cr[WS(rs, 16)] = FMS(KP833417178, T7A, T7x);
+					     T5C = FMA(KP559154169, T5B, T5y);
+					     T5E = FNMS(KP683113946, T5y, T5B);
+					     T5t = FNMS(KP242145790, T5s, T5d);
+					     T7J = FNMS(KP554608978, T7I, T7H);
+					     T7N = FMA(KP554608978, T7I, T7H);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7Y, T80, T5v, T5D;
+			      cr[WS(rs, 24)] = -(FMA(KP803003575, T7O, T7N));
+			      ci[WS(rs, 15)] = FNMS(KP803003575, T7O, T7N);
+			      cr[WS(rs, 19)] = FMS(KP943557151, T7M, T7J);
+			      cr[WS(rs, 14)] = -(FMA(KP943557151, T7M, T7J));
+			      T5v = FMA(KP541454447, T5u, T5t);
+			      T5D = FNMS(KP541454447, T5u, T5t);
+			      cr[WS(rs, 11)] = FNMS(KP833417178, T5E, T5D);
+			      ci[WS(rs, 8)] = FMA(KP833417178, T5E, T5D);
+			      cr[WS(rs, 6)] = FMA(KP921177326, T5C, T5v);
+			      ci[WS(rs, 3)] = FNMS(KP921177326, T5C, T5v);
+			      T7Y = FMA(KP618033988, T7X, T7W);
+			      T80 = FNMS(KP618033988, T7W, T7X);
+			      {
+				   E T6t, T6p, T5H, T7d, T71, T6u, T6y, T6x, T6l, T7k, T6i, T7l, T7g, T6c, T6e;
+				   E T6s, T6L, T6J, T6C;
+				   {
+					E T6A, T6B, T5O, T6j, T6h, T6a, T6q, T5R, T5U, T6r, T5Z, T62;
+					{
+					     E T5K, T7U, T7T, T5N, T7S;
+					     T6t = FNMS(KP951056516, T5J, T5I);
+					     T5K = FMA(KP951056516, T5J, T5I);
+					     T7U = T7Q - T7R;
+					     T7S = T7Q + T7R;
+					     T6p = FNMS(KP951056516, T5G, T5F);
+					     T5H = FMA(KP951056516, T5G, T5F);
+					     T7d = FNMS(KP951056516, T70, T6X);
+					     T71 = FMA(KP951056516, T70, T6X);
+					     ci[WS(rs, 24)] = T7S + T7P;
+					     T7T = FNMS(KP250000000, T7S, T7P);
+					     T5N = FMA(KP951056516, T5M, T5L);
+					     T6u = FNMS(KP951056516, T5M, T5L);
+					     {
+						  E T66, T69, T7Z, T7V;
+						  T6A = FMA(KP951056516, T65, T64);
+						  T66 = FNMS(KP951056516, T65, T64);
+						  T69 = FMA(KP951056516, T68, T67);
+						  T6B = FNMS(KP951056516, T68, T67);
+						  T7Z = FMA(KP559016994, T7U, T7T);
+						  T7V = FNMS(KP559016994, T7U, T7T);
+						  T5O = FMA(KP062914667, T5N, T5K);
+						  T6j = FNMS(KP062914667, T5K, T5N);
+						  ci[WS(rs, 14)] = FMA(KP951056516, T7Y, T7V);
+						  cr[WS(rs, 15)] = FMS(KP951056516, T7Y, T7V);
+						  ci[WS(rs, 19)] = FMA(KP951056516, T80, T7Z);
+						  cr[WS(rs, 20)] = FMS(KP951056516, T80, T7Z);
+						  T6h = FNMS(KP939062505, T66, T69);
+						  T6a = FMA(KP939062505, T69, T66);
+					     }
+					}
+					T6q = FMA(KP951056516, T5Q, T5P);
+					T5R = FNMS(KP951056516, T5Q, T5P);
+					T5U = FNMS(KP951056516, T5T, T5S);
+					T6r = FMA(KP951056516, T5T, T5S);
+					T6y = FMA(KP951056516, T5Y, T5X);
+					T5Z = FNMS(KP951056516, T5Y, T5X);
+					T62 = FMA(KP951056516, T61, T60);
+					T6x = FNMS(KP951056516, T61, T60);
+					{
+					     E T5W, T6b, T6k, T5V;
+					     T6k = FMA(KP827271945, T5R, T5U);
+					     T5V = FNMS(KP827271945, T5U, T5R);
+					     {
+						  E T6g, T63, T7e, T7f;
+						  T6g = FMA(KP126329378, T5Z, T62);
+						  T63 = FNMS(KP126329378, T62, T5Z);
+						  T7e = FMA(KP772036680, T6k, T6j);
+						  T6l = FNMS(KP772036680, T6k, T6j);
+						  T5W = FMA(KP772036680, T5V, T5O);
+						  T7k = FNMS(KP772036680, T5V, T5O);
+						  T7f = FNMS(KP734762448, T6h, T6g);
+						  T6i = FMA(KP734762448, T6h, T6g);
+						  T6b = FNMS(KP734762448, T6a, T63);
+						  T7l = FMA(KP734762448, T6a, T63);
+						  T7g = FMA(KP994076283, T7f, T7e);
+						  T7i = FNMS(KP994076283, T7f, T7e);
+					     }
+					     T6c = FNMS(KP994076283, T6b, T5W);
+					     T6e = FMA(KP994076283, T6b, T5W);
+					}
+					T6s = FMA(KP062914667, T6r, T6q);
+					T6L = FNMS(KP062914667, T6q, T6r);
+					T6J = FNMS(KP549754652, T6A, T6B);
+					T6C = FMA(KP549754652, T6B, T6A);
+				   }
+				   {
+					E T6N, T78, T6K, T79, T74, T76, T6E, T6G;
+					{
+					     E T6w, T6D, T6M, T6v;
+					     cr[WS(rs, 3)] = FMA(KP998026728, T6c, T5H);
+					     T6M = FNMS(KP634619297, T6t, T6u);
+					     T6v = FMA(KP634619297, T6u, T6t);
+					     {
+						  E T6I, T6z, T72, T73;
+						  T6I = FMA(KP470564281, T6x, T6y);
+						  T6z = FNMS(KP470564281, T6y, T6x);
+						  T72 = FMA(KP845997307, T6M, T6L);
+						  T6N = FNMS(KP845997307, T6M, T6L);
+						  T6w = FMA(KP845997307, T6v, T6s);
+						  T78 = FNMS(KP845997307, T6v, T6s);
+						  T73 = FNMS(KP968479752, T6J, T6I);
+						  T6K = FMA(KP968479752, T6J, T6I);
+						  T6D = FMA(KP968479752, T6C, T6z);
+						  T79 = FNMS(KP968479752, T6C, T6z);
+						  T74 = FMA(KP906616052, T73, T72);
+						  T76 = FNMS(KP906616052, T73, T72);
+					     }
+					     ci[WS(rs, 21)] = FNMS(KP998026728, T7g, T7d);
+					     T6E = FMA(KP906616052, T6D, T6w);
+					     T6G = FNMS(KP906616052, T6D, T6w);
+					}
+					{
+					     E T7c, T7a, T6Q, T6O, T6F, T7b, T77, T75, T6d, T6P, T6H;
+					     T7c = FMA(KP681693190, T78, T79);
+					     T7a = FNMS(KP560319534, T79, T78);
+					     ci[WS(rs, 22)] = FNMS(KP998026728, T74, T71);
+					     cr[WS(rs, 2)] = FMA(KP998026728, T6E, T6p);
+					     T75 = FMA(KP249506682, T74, T71);
+					     T6Q = FNMS(KP560319534, T6K, T6N);
+					     T6O = FMA(KP681693190, T6N, T6K);
+					     T6F = FNMS(KP249506682, T6E, T6p);
+					     T7b = FMA(KP557913902, T76, T75);
+					     T77 = FNMS(KP557913902, T76, T75);
+					     T6o = FMA(KP614372930, T6i, T6l);
+					     T6m = FNMS(KP621716863, T6l, T6i);
+					     cr[WS(rs, 22)] = FMS(KP860541664, T7c, T7b);
+					     ci[WS(rs, 17)] = FMA(KP860541664, T7c, T7b);
+					     ci[WS(rs, 12)] = FNMS(KP949179823, T7a, T77);
+					     cr[WS(rs, 17)] = -(FMA(KP949179823, T7a, T77));
+					     T6P = FMA(KP557913902, T6G, T6F);
+					     T6H = FNMS(KP557913902, T6G, T6F);
+					     T6d = FNMS(KP249506682, T6c, T5H);
+					     ci[WS(rs, 7)] = FMA(KP949179823, T6Q, T6P);
+					     cr[WS(rs, 12)] = FNMS(KP949179823, T6Q, T6P);
+					     cr[WS(rs, 7)] = FMA(KP860541664, T6O, T6H);
+					     ci[WS(rs, 2)] = FNMS(KP860541664, T6O, T6H);
+					     T7o = FMA(KP621716863, T7k, T7l);
+					     T7m = FNMS(KP614372930, T7l, T7k);
+					     T7h = FMA(KP249506682, T7g, T7d);
+					     T6n = FMA(KP557913902, T6e, T6d);
+					     T6f = FNMS(KP557913902, T6e, T6d);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 6)] = FNMS(KP949179823, T6o, T6n);
+	       ci[WS(rs, 11)] = FMA(KP949179823, T6o, T6n);
+	       cr[WS(rs, 8)] = FMA(KP943557151, T6m, T6f);
+	       ci[WS(rs, 1)] = FNMS(KP943557151, T6m, T6f);
+	       T7j = FNMS(KP557913902, T7i, T7h);
+	       T7n = FMA(KP557913902, T7i, T7h);
+	       cr[WS(rs, 23)] = -(FMA(KP943557151, T7o, T7n));
+	       ci[WS(rs, 16)] = FNMS(KP943557151, T7o, T7n);
+	       cr[WS(rs, 18)] = FMS(KP949179823, T7m, T7j);
+	       cr[WS(rs, 13)] = -(FMA(KP949179823, T7m, T7j));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hf_25", twinstr, &GENUS, {84, 48, 316, 0} };
+
+void X(codelet_hf_25) (planner *p) {
+     X(khc2hc_register) (p, hf_25, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 25 -dit -name hf_25 -include hf.h */
+
+/*
+ * This function contains 400 FP additions, 280 FP multiplications,
+ * (or, 260 additions, 140 multiplications, 140 fused multiply/add),
+ * 101 stack variables, 20 constants, and 100 memory accesses
+ */
+#include "hf.h"
+
+static void hf_25(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 48); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) {
+	       E T1, T6b, T2l, T6g, To, T2m, T6e, T6f, T6a, T6H, T2u, T4I, T2i, T60, T3S;
+	       E T5D, T4r, T58, T3Z, T5C, T4q, T5b, TS, T5W, T2G, T5s, T4g, T4M, T2R, T5t;
+	       E T4h, T4P, T1l, T5X, T37, T5v, T4k, T4T, T3e, T5w, T4j, T4W, T1P, T5Z, T3v;
+	       E T5A, T4o, T54, T3C, T5z, T4n, T51;
+	       {
+		    E T6, T2o, Tb, T2p, Tc, T6c, Th, T2r, Tm, T2s, Tn, T6d;
+		    T1 = cr[0];
+		    T6b = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 5)];
+			 T5 = ci[WS(rs, 5)];
+			 T2 = W[8];
+			 T4 = W[9];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T2o = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 20)];
+			 Ta = ci[WS(rs, 20)];
+			 T7 = W[38];
+			 T9 = W[39];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 T2p = FNMS(T9, T8, T7 * Ta);
+		    }
+		    Tc = T6 + Tb;
+		    T6c = T2o + T2p;
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 10)];
+			 Tg = ci[WS(rs, 10)];
+			 Td = W[18];
+			 Tf = W[19];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T2r = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = cr[WS(rs, 15)];
+			 Tl = ci[WS(rs, 15)];
+			 Ti = W[28];
+			 Tk = W[29];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 T2s = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    Tn = Th + Tm;
+		    T6d = T2r + T2s;
+		    T2l = KP559016994 * (Tc - Tn);
+		    T6g = KP559016994 * (T6c - T6d);
+		    To = Tc + Tn;
+		    T2m = FNMS(KP250000000, To, T1);
+		    T6e = T6c + T6d;
+		    T6f = FNMS(KP250000000, T6e, T6b);
+		    {
+			 E T68, T69, T2q, T2t;
+			 T68 = Th - Tm;
+			 T69 = T6 - Tb;
+			 T6a = FNMS(KP587785252, T69, KP951056516 * T68);
+			 T6H = FMA(KP951056516, T69, KP587785252 * T68);
+			 T2q = T2o - T2p;
+			 T2t = T2r - T2s;
+			 T2u = FMA(KP951056516, T2q, KP587785252 * T2t);
+			 T4I = FNMS(KP587785252, T2q, KP951056516 * T2t);
+		    }
+	       }
+	       {
+		    E T1U, T3O, T3E, T3F, T3X, T3W, T3J, T3M, T3P, T25, T2g, T2h;
+		    {
+			 E T1R, T1T, T1Q, T1S;
+			 T1R = cr[WS(rs, 3)];
+			 T1T = ci[WS(rs, 3)];
+			 T1Q = W[4];
+			 T1S = W[5];
+			 T1U = FMA(T1Q, T1R, T1S * T1T);
+			 T3O = FNMS(T1S, T1R, T1Q * T1T);
+		    }
+		    {
+			 E T1Z, T3H, T2f, T3L, T24, T3I, T2a, T3K;
+			 {
+			      E T1W, T1Y, T1V, T1X;
+			      T1W = cr[WS(rs, 8)];
+			      T1Y = ci[WS(rs, 8)];
+			      T1V = W[14];
+			      T1X = W[15];
+			      T1Z = FMA(T1V, T1W, T1X * T1Y);
+			      T3H = FNMS(T1X, T1W, T1V * T1Y);
+			 }
+			 {
+			      E T2c, T2e, T2b, T2d;
+			      T2c = cr[WS(rs, 18)];
+			      T2e = ci[WS(rs, 18)];
+			      T2b = W[34];
+			      T2d = W[35];
+			      T2f = FMA(T2b, T2c, T2d * T2e);
+			      T3L = FNMS(T2d, T2c, T2b * T2e);
+			 }
+			 {
+			      E T21, T23, T20, T22;
+			      T21 = cr[WS(rs, 23)];
+			      T23 = ci[WS(rs, 23)];
+			      T20 = W[44];
+			      T22 = W[45];
+			      T24 = FMA(T20, T21, T22 * T23);
+			      T3I = FNMS(T22, T21, T20 * T23);
+			 }
+			 {
+			      E T27, T29, T26, T28;
+			      T27 = cr[WS(rs, 13)];
+			      T29 = ci[WS(rs, 13)];
+			      T26 = W[24];
+			      T28 = W[25];
+			      T2a = FMA(T26, T27, T28 * T29);
+			      T3K = FNMS(T28, T27, T26 * T29);
+			 }
+			 T3E = T1Z - T24;
+			 T3F = T2a - T2f;
+			 T3X = T3K - T3L;
+			 T3W = T3H - T3I;
+			 T3J = T3H + T3I;
+			 T3M = T3K + T3L;
+			 T3P = T3J + T3M;
+			 T25 = T1Z + T24;
+			 T2g = T2a + T2f;
+			 T2h = T25 + T2g;
+		    }
+		    T2i = T1U + T2h;
+		    T60 = T3O + T3P;
+		    {
+			 E T3G, T57, T3R, T56, T3N, T3Q;
+			 T3G = FMA(KP951056516, T3E, KP587785252 * T3F);
+			 T57 = FNMS(KP587785252, T3E, KP951056516 * T3F);
+			 T3N = KP559016994 * (T3J - T3M);
+			 T3Q = FNMS(KP250000000, T3P, T3O);
+			 T3R = T3N + T3Q;
+			 T56 = T3Q - T3N;
+			 T3S = T3G + T3R;
+			 T5D = T57 + T56;
+			 T4r = T3R - T3G;
+			 T58 = T56 - T57;
+		    }
+		    {
+			 E T3Y, T5a, T3V, T59, T3T, T3U;
+			 T3Y = FMA(KP951056516, T3W, KP587785252 * T3X);
+			 T5a = FNMS(KP587785252, T3W, KP951056516 * T3X);
+			 T3T = KP559016994 * (T25 - T2g);
+			 T3U = FNMS(KP250000000, T2h, T1U);
+			 T3V = T3T + T3U;
+			 T59 = T3U - T3T;
+			 T3Z = T3V - T3Y;
+			 T5C = T59 - T5a;
+			 T4q = T3V + T3Y;
+			 T5b = T59 + T5a;
+		    }
+	       }
+	       {
+		    E Tu, T2N, T2B, T2E, T2I, T2H, T2K, T2L, T2O, TF, TQ, TR;
+		    {
+			 E Tr, Tt, Tq, Ts;
+			 Tr = cr[WS(rs, 1)];
+			 Tt = ci[WS(rs, 1)];
+			 Tq = W[0];
+			 Ts = W[1];
+			 Tu = FMA(Tq, Tr, Ts * Tt);
+			 T2N = FNMS(Ts, Tr, Tq * Tt);
+		    }
+		    {
+			 E Tz, T2z, TP, T2D, TE, T2A, TK, T2C;
+			 {
+			      E Tw, Ty, Tv, Tx;
+			      Tw = cr[WS(rs, 6)];
+			      Ty = ci[WS(rs, 6)];
+			      Tv = W[10];
+			      Tx = W[11];
+			      Tz = FMA(Tv, Tw, Tx * Ty);
+			      T2z = FNMS(Tx, Tw, Tv * Ty);
+			 }
+			 {
+			      E TM, TO, TL, TN;
+			      TM = cr[WS(rs, 16)];
+			      TO = ci[WS(rs, 16)];
+			      TL = W[30];
+			      TN = W[31];
+			      TP = FMA(TL, TM, TN * TO);
+			      T2D = FNMS(TN, TM, TL * TO);
+			 }
+			 {
+			      E TB, TD, TA, TC;
+			      TB = cr[WS(rs, 21)];
+			      TD = ci[WS(rs, 21)];
+			      TA = W[40];
+			      TC = W[41];
+			      TE = FMA(TA, TB, TC * TD);
+			      T2A = FNMS(TC, TB, TA * TD);
+			 }
+			 {
+			      E TH, TJ, TG, TI;
+			      TH = cr[WS(rs, 11)];
+			      TJ = ci[WS(rs, 11)];
+			      TG = W[20];
+			      TI = W[21];
+			      TK = FMA(TG, TH, TI * TJ);
+			      T2C = FNMS(TI, TH, TG * TJ);
+			 }
+			 T2B = T2z - T2A;
+			 T2E = T2C - T2D;
+			 T2I = TK - TP;
+			 T2H = Tz - TE;
+			 T2K = T2z + T2A;
+			 T2L = T2C + T2D;
+			 T2O = T2K + T2L;
+			 TF = Tz + TE;
+			 TQ = TK + TP;
+			 TR = TF + TQ;
+		    }
+		    TS = Tu + TR;
+		    T5W = T2N + T2O;
+		    {
+			 E T2F, T4L, T2y, T4K, T2w, T2x;
+			 T2F = FMA(KP951056516, T2B, KP587785252 * T2E);
+			 T4L = FNMS(KP587785252, T2B, KP951056516 * T2E);
+			 T2w = KP559016994 * (TF - TQ);
+			 T2x = FNMS(KP250000000, TR, Tu);
+			 T2y = T2w + T2x;
+			 T4K = T2x - T2w;
+			 T2G = T2y - T2F;
+			 T5s = T4K - T4L;
+			 T4g = T2y + T2F;
+			 T4M = T4K + T4L;
+		    }
+		    {
+			 E T2J, T4O, T2Q, T4N, T2M, T2P;
+			 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
+			 T4O = FNMS(KP587785252, T2H, KP951056516 * T2I);
+			 T2M = KP559016994 * (T2K - T2L);
+			 T2P = FNMS(KP250000000, T2O, T2N);
+			 T2Q = T2M + T2P;
+			 T4N = T2P - T2M;
+			 T2R = T2J + T2Q;
+			 T5t = T4O + T4N;
+			 T4h = T2Q - T2J;
+			 T4P = T4N - T4O;
+		    }
+	       }
+	       {
+		    E TX, T33, T2T, T2U, T3c, T3b, T2Y, T31, T34, T18, T1j, T1k;
+		    {
+			 E TU, TW, TT, TV;
+			 TU = cr[WS(rs, 4)];
+			 TW = ci[WS(rs, 4)];
+			 TT = W[6];
+			 TV = W[7];
+			 TX = FMA(TT, TU, TV * TW);
+			 T33 = FNMS(TV, TU, TT * TW);
+		    }
+		    {
+			 E T12, T2W, T1i, T30, T17, T2X, T1d, T2Z;
+			 {
+			      E TZ, T11, TY, T10;
+			      TZ = cr[WS(rs, 9)];
+			      T11 = ci[WS(rs, 9)];
+			      TY = W[16];
+			      T10 = W[17];
+			      T12 = FMA(TY, TZ, T10 * T11);
+			      T2W = FNMS(T10, TZ, TY * T11);
+			 }
+			 {
+			      E T1f, T1h, T1e, T1g;
+			      T1f = cr[WS(rs, 19)];
+			      T1h = ci[WS(rs, 19)];
+			      T1e = W[36];
+			      T1g = W[37];
+			      T1i = FMA(T1e, T1f, T1g * T1h);
+			      T30 = FNMS(T1g, T1f, T1e * T1h);
+			 }
+			 {
+			      E T14, T16, T13, T15;
+			      T14 = cr[WS(rs, 24)];
+			      T16 = ci[WS(rs, 24)];
+			      T13 = W[46];
+			      T15 = W[47];
+			      T17 = FMA(T13, T14, T15 * T16);
+			      T2X = FNMS(T15, T14, T13 * T16);
+			 }
+			 {
+			      E T1a, T1c, T19, T1b;
+			      T1a = cr[WS(rs, 14)];
+			      T1c = ci[WS(rs, 14)];
+			      T19 = W[26];
+			      T1b = W[27];
+			      T1d = FMA(T19, T1a, T1b * T1c);
+			      T2Z = FNMS(T1b, T1a, T19 * T1c);
+			 }
+			 T2T = T17 - T12;
+			 T2U = T1d - T1i;
+			 T3c = T2Z - T30;
+			 T3b = T2W - T2X;
+			 T2Y = T2W + T2X;
+			 T31 = T2Z + T30;
+			 T34 = T2Y + T31;
+			 T18 = T12 + T17;
+			 T1j = T1d + T1i;
+			 T1k = T18 + T1j;
+		    }
+		    T1l = TX + T1k;
+		    T5X = T33 + T34;
+		    {
+			 E T2V, T4S, T36, T4R, T32, T35;
+			 T2V = FNMS(KP587785252, T2U, KP951056516 * T2T);
+			 T4S = FMA(KP587785252, T2T, KP951056516 * T2U);
+			 T32 = KP559016994 * (T2Y - T31);
+			 T35 = FNMS(KP250000000, T34, T33);
+			 T36 = T32 + T35;
+			 T4R = T35 - T32;
+			 T37 = T2V - T36;
+			 T5v = T4S + T4R;
+			 T4k = T2V + T36;
+			 T4T = T4R - T4S;
+		    }
+		    {
+			 E T3d, T4V, T3a, T4U, T38, T39;
+			 T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
+			 T4V = FNMS(KP587785252, T3b, KP951056516 * T3c);
+			 T38 = KP559016994 * (T18 - T1j);
+			 T39 = FNMS(KP250000000, T1k, TX);
+			 T3a = T38 + T39;
+			 T4U = T39 - T38;
+			 T3e = T3a - T3d;
+			 T5w = T4U - T4V;
+			 T4j = T3a + T3d;
+			 T4W = T4U + T4V;
+		    }
+	       }
+	       {
+		    E T1r, T3r, T3h, T3i, T3A, T3z, T3m, T3p, T3s, T1C, T1N, T1O;
+		    {
+			 E T1o, T1q, T1n, T1p;
+			 T1o = cr[WS(rs, 2)];
+			 T1q = ci[WS(rs, 2)];
+			 T1n = W[2];
+			 T1p = W[3];
+			 T1r = FMA(T1n, T1o, T1p * T1q);
+			 T3r = FNMS(T1p, T1o, T1n * T1q);
+		    }
+		    {
+			 E T1w, T3k, T1M, T3o, T1B, T3l, T1H, T3n;
+			 {
+			      E T1t, T1v, T1s, T1u;
+			      T1t = cr[WS(rs, 7)];
+			      T1v = ci[WS(rs, 7)];
+			      T1s = W[12];
+			      T1u = W[13];
+			      T1w = FMA(T1s, T1t, T1u * T1v);
+			      T3k = FNMS(T1u, T1t, T1s * T1v);
+			 }
+			 {
+			      E T1J, T1L, T1I, T1K;
+			      T1J = cr[WS(rs, 17)];
+			      T1L = ci[WS(rs, 17)];
+			      T1I = W[32];
+			      T1K = W[33];
+			      T1M = FMA(T1I, T1J, T1K * T1L);
+			      T3o = FNMS(T1K, T1J, T1I * T1L);
+			 }
+			 {
+			      E T1y, T1A, T1x, T1z;
+			      T1y = cr[WS(rs, 22)];
+			      T1A = ci[WS(rs, 22)];
+			      T1x = W[42];
+			      T1z = W[43];
+			      T1B = FMA(T1x, T1y, T1z * T1A);
+			      T3l = FNMS(T1z, T1y, T1x * T1A);
+			 }
+			 {
+			      E T1E, T1G, T1D, T1F;
+			      T1E = cr[WS(rs, 12)];
+			      T1G = ci[WS(rs, 12)];
+			      T1D = W[22];
+			      T1F = W[23];
+			      T1H = FMA(T1D, T1E, T1F * T1G);
+			      T3n = FNMS(T1F, T1E, T1D * T1G);
+			 }
+			 T3h = T1w - T1B;
+			 T3i = T1H - T1M;
+			 T3A = T3n - T3o;
+			 T3z = T3k - T3l;
+			 T3m = T3k + T3l;
+			 T3p = T3n + T3o;
+			 T3s = T3m + T3p;
+			 T1C = T1w + T1B;
+			 T1N = T1H + T1M;
+			 T1O = T1C + T1N;
+		    }
+		    T1P = T1r + T1O;
+		    T5Z = T3r + T3s;
+		    {
+			 E T3j, T53, T3u, T52, T3q, T3t;
+			 T3j = FMA(KP951056516, T3h, KP587785252 * T3i);
+			 T53 = FNMS(KP587785252, T3h, KP951056516 * T3i);
+			 T3q = KP559016994 * (T3m - T3p);
+			 T3t = FNMS(KP250000000, T3s, T3r);
+			 T3u = T3q + T3t;
+			 T52 = T3t - T3q;
+			 T3v = T3j + T3u;
+			 T5A = T53 + T52;
+			 T4o = T3u - T3j;
+			 T54 = T52 - T53;
+		    }
+		    {
+			 E T3B, T50, T3y, T4Z, T3w, T3x;
+			 T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
+			 T50 = FNMS(KP587785252, T3z, KP951056516 * T3A);
+			 T3w = KP559016994 * (T1C - T1N);
+			 T3x = FNMS(KP250000000, T1O, T1r);
+			 T3y = T3w + T3x;
+			 T4Z = T3x - T3w;
+			 T3C = T3y - T3B;
+			 T5z = T4Z - T50;
+			 T4n = T3y + T3B;
+			 T51 = T4Z + T50;
+		    }
+	       }
+	       {
+		    E T62, T64, Tp, T2k, T5T, T5U, T63, T5V;
+		    {
+			 E T5Y, T61, T1m, T2j;
+			 T5Y = T5W - T5X;
+			 T61 = T5Z - T60;
+			 T62 = FMA(KP951056516, T5Y, KP587785252 * T61);
+			 T64 = FNMS(KP587785252, T5Y, KP951056516 * T61);
+			 Tp = T1 + To;
+			 T1m = TS + T1l;
+			 T2j = T1P + T2i;
+			 T2k = T1m + T2j;
+			 T5T = KP559016994 * (T1m - T2j);
+			 T5U = FNMS(KP250000000, T2k, Tp);
+		    }
+		    cr[0] = Tp + T2k;
+		    T63 = T5U - T5T;
+		    cr[WS(rs, 10)] = T63 - T64;
+		    ci[WS(rs, 9)] = T63 + T64;
+		    T5V = T5T + T5U;
+		    ci[WS(rs, 4)] = T5V - T62;
+		    cr[WS(rs, 5)] = T5V + T62;
+	       }
+	       {
+		    E T2v, T4f, T6I, T6U, T42, T6Z, T43, T6Y, T4A, T6N, T4D, T6L, T4u, T6E, T4v;
+		    E T6D, T48, T6V, T4b, T6T, T2n, T6G;
+		    T2n = T2l + T2m;
+		    T2v = T2n - T2u;
+		    T4f = T2n + T2u;
+		    T6G = T6g + T6f;
+		    T6I = T6G - T6H;
+		    T6U = T6H + T6G;
+		    {
+			 E T2S, T3f, T3g, T3D, T40, T41;
+			 T2S = FMA(KP535826794, T2G, KP844327925 * T2R);
+			 T3f = FNMS(KP637423989, T3e, KP770513242 * T37);
+			 T3g = T2S + T3f;
+			 T3D = FNMS(KP425779291, T3C, KP904827052 * T3v);
+			 T40 = FNMS(KP992114701, T3Z, KP125333233 * T3S);
+			 T41 = T3D + T40;
+			 T42 = T3g + T41;
+			 T6Z = T3D - T40;
+			 T43 = KP559016994 * (T3g - T41);
+			 T6Y = T3f - T2S;
+		    }
+		    {
+			 E T4y, T4z, T6J, T4B, T4C, T6K;
+			 T4y = FNMS(KP248689887, T4g, KP968583161 * T4h);
+			 T4z = FNMS(KP844327925, T4j, KP535826794 * T4k);
+			 T6J = T4y + T4z;
+			 T4B = FNMS(KP481753674, T4n, KP876306680 * T4o);
+			 T4C = FNMS(KP684547105, T4q, KP728968627 * T4r);
+			 T6K = T4B + T4C;
+			 T4A = T4y - T4z;
+			 T6N = KP559016994 * (T6J - T6K);
+			 T4D = T4B - T4C;
+			 T6L = T6J + T6K;
+		    }
+		    {
+			 E T4i, T4l, T4m, T4p, T4s, T4t;
+			 T4i = FMA(KP968583161, T4g, KP248689887 * T4h);
+			 T4l = FMA(KP535826794, T4j, KP844327925 * T4k);
+			 T4m = T4i + T4l;
+			 T4p = FMA(KP876306680, T4n, KP481753674 * T4o);
+			 T4s = FMA(KP728968627, T4q, KP684547105 * T4r);
+			 T4t = T4p + T4s;
+			 T4u = T4m + T4t;
+			 T6E = T4p - T4s;
+			 T4v = KP559016994 * (T4m - T4t);
+			 T6D = T4l - T4i;
+		    }
+		    {
+			 E T46, T47, T6R, T49, T4a, T6S;
+			 T46 = FNMS(KP844327925, T2G, KP535826794 * T2R);
+			 T47 = FMA(KP770513242, T3e, KP637423989 * T37);
+			 T6R = T46 + T47;
+			 T49 = FMA(KP125333233, T3Z, KP992114701 * T3S);
+			 T4a = FMA(KP904827052, T3C, KP425779291 * T3v);
+			 T6S = T4a + T49;
+			 T48 = T46 - T47;
+			 T6V = T6R - T6S;
+			 T4b = T49 - T4a;
+			 T6T = KP559016994 * (T6R + T6S);
+		    }
+		    cr[WS(rs, 4)] = T2v + T42;
+		    ci[WS(rs, 23)] = T6L + T6I;
+		    ci[WS(rs, 20)] = T6V + T6U;
+		    cr[WS(rs, 1)] = T4f + T4u;
+		    {
+			 E T4c, T4e, T45, T4d, T44;
+			 T4c = FMA(KP951056516, T48, KP587785252 * T4b);
+			 T4e = FNMS(KP587785252, T48, KP951056516 * T4b);
+			 T44 = FNMS(KP250000000, T42, T2v);
+			 T45 = T43 + T44;
+			 T4d = T44 - T43;
+			 ci[0] = T45 - T4c;
+			 ci[WS(rs, 5)] = T4d + T4e;
+			 cr[WS(rs, 9)] = T45 + T4c;
+			 ci[WS(rs, 10)] = T4d - T4e;
+		    }
+		    {
+			 E T6F, T6P, T6O, T6Q, T6M;
+			 T6F = FMA(KP587785252, T6D, KP951056516 * T6E);
+			 T6P = FNMS(KP587785252, T6E, KP951056516 * T6D);
+			 T6M = FNMS(KP250000000, T6L, T6I);
+			 T6O = T6M - T6N;
+			 T6Q = T6N + T6M;
+			 cr[WS(rs, 16)] = T6F - T6O;
+			 ci[WS(rs, 18)] = T6P + T6Q;
+			 ci[WS(rs, 13)] = T6F + T6O;
+			 cr[WS(rs, 21)] = T6P - T6Q;
+		    }
+		    {
+			 E T70, T71, T6X, T72, T6W;
+			 T70 = FMA(KP587785252, T6Y, KP951056516 * T6Z);
+			 T71 = FNMS(KP587785252, T6Z, KP951056516 * T6Y);
+			 T6W = FNMS(KP250000000, T6V, T6U);
+			 T6X = T6T - T6W;
+			 T72 = T6T + T6W;
+			 cr[WS(rs, 14)] = T6X - T70;
+			 ci[WS(rs, 15)] = T71 + T72;
+			 cr[WS(rs, 19)] = T70 + T6X;
+			 cr[WS(rs, 24)] = T71 - T72;
+		    }
+		    {
+			 E T4E, T4G, T4x, T4F, T4w;
+			 T4E = FMA(KP951056516, T4A, KP587785252 * T4D);
+			 T4G = FNMS(KP587785252, T4A, KP951056516 * T4D);
+			 T4w = FNMS(KP250000000, T4u, T4f);
+			 T4x = T4v + T4w;
+			 T4F = T4w - T4v;
+			 ci[WS(rs, 3)] = T4x - T4E;
+			 ci[WS(rs, 8)] = T4F + T4G;
+			 cr[WS(rs, 6)] = T4x + T4E;
+			 cr[WS(rs, 11)] = T4F - T4G;
+		    }
+	       }
+	       {
+		    E T75, T7d, T76, T79, T7a, T7b, T7e, T7c;
+		    {
+			 E T73, T74, T77, T78;
+			 T73 = T1l - TS;
+			 T74 = T1P - T2i;
+			 T75 = FMA(KP587785252, T73, KP951056516 * T74);
+			 T7d = FNMS(KP587785252, T74, KP951056516 * T73);
+			 T76 = T6e + T6b;
+			 T77 = T5W + T5X;
+			 T78 = T5Z + T60;
+			 T79 = T77 + T78;
+			 T7a = FNMS(KP250000000, T79, T76);
+			 T7b = KP559016994 * (T77 - T78);
+		    }
+		    ci[WS(rs, 24)] = T79 + T76;
+		    T7e = T7b + T7a;
+		    cr[WS(rs, 20)] = T7d - T7e;
+		    ci[WS(rs, 19)] = T7d + T7e;
+		    T7c = T7a - T7b;
+		    cr[WS(rs, 15)] = T75 - T7c;
+		    ci[WS(rs, 14)] = T75 + T7c;
+	       }
+	       {
+		    E T4J, T5r, T6i, T6u, T5e, T6z, T5f, T6y, T5M, T6n, T5P, T6l, T5G, T66, T5H;
+		    E T65, T5k, T6v, T5n, T6t, T4H, T6h;
+		    T4H = T2m - T2l;
+		    T4J = T4H + T4I;
+		    T5r = T4H - T4I;
+		    T6h = T6f - T6g;
+		    T6i = T6a + T6h;
+		    T6u = T6h - T6a;
+		    {
+			 E T4Q, T4X, T4Y, T55, T5c, T5d;
+			 T4Q = FMA(KP728968627, T4M, KP684547105 * T4P);
+			 T4X = FNMS(KP992114701, T4W, KP125333233 * T4T);
+			 T4Y = T4Q + T4X;
+			 T55 = FMA(KP062790519, T51, KP998026728 * T54);
+			 T5c = FNMS(KP637423989, T5b, KP770513242 * T58);
+			 T5d = T55 + T5c;
+			 T5e = T4Y + T5d;
+			 T6z = T55 - T5c;
+			 T5f = KP559016994 * (T4Y - T5d);
+			 T6y = T4X - T4Q;
+		    }
+		    {
+			 E T5K, T5L, T6j, T5N, T5O, T6k;
+			 T5K = FNMS(KP481753674, T5s, KP876306680 * T5t);
+			 T5L = FMA(KP904827052, T5w, KP425779291 * T5v);
+			 T6j = T5K - T5L;
+			 T5N = FNMS(KP844327925, T5z, KP535826794 * T5A);
+			 T5O = FNMS(KP998026728, T5C, KP062790519 * T5D);
+			 T6k = T5N + T5O;
+			 T5M = T5K + T5L;
+			 T6n = KP559016994 * (T6j - T6k);
+			 T5P = T5N - T5O;
+			 T6l = T6j + T6k;
+		    }
+		    {
+			 E T5u, T5x, T5y, T5B, T5E, T5F;
+			 T5u = FMA(KP876306680, T5s, KP481753674 * T5t);
+			 T5x = FNMS(KP425779291, T5w, KP904827052 * T5v);
+			 T5y = T5u + T5x;
+			 T5B = FMA(KP535826794, T5z, KP844327925 * T5A);
+			 T5E = FMA(KP062790519, T5C, KP998026728 * T5D);
+			 T5F = T5B + T5E;
+			 T5G = T5y + T5F;
+			 T66 = T5B - T5E;
+			 T5H = KP559016994 * (T5y - T5F);
+			 T65 = T5x - T5u;
+		    }
+		    {
+			 E T5i, T5j, T6r, T5l, T5m, T6s;
+			 T5i = FNMS(KP684547105, T4M, KP728968627 * T4P);
+			 T5j = FMA(KP125333233, T4W, KP992114701 * T4T);
+			 T6r = T5i - T5j;
+			 T5l = FNMS(KP998026728, T51, KP062790519 * T54);
+			 T5m = FMA(KP770513242, T5b, KP637423989 * T58);
+			 T6s = T5l - T5m;
+			 T5k = T5i + T5j;
+			 T6v = T6r + T6s;
+			 T5n = T5l + T5m;
+			 T6t = KP559016994 * (T6r - T6s);
+		    }
+		    cr[WS(rs, 3)] = T4J + T5e;
+		    ci[WS(rs, 22)] = T6l + T6i;
+		    ci[WS(rs, 21)] = T6v + T6u;
+		    cr[WS(rs, 2)] = T5r + T5G;
+		    {
+			 E T67, T6p, T6o, T6q, T6m;
+			 T67 = FMA(KP587785252, T65, KP951056516 * T66);
+			 T6p = FNMS(KP587785252, T66, KP951056516 * T65);
+			 T6m = FNMS(KP250000000, T6l, T6i);
+			 T6o = T6m - T6n;
+			 T6q = T6n + T6m;
+			 cr[WS(rs, 17)] = T67 - T6o;
+			 ci[WS(rs, 17)] = T6p + T6q;
+			 ci[WS(rs, 12)] = T67 + T6o;
+			 cr[WS(rs, 22)] = T6p - T6q;
+		    }
+		    {
+			 E T5Q, T5S, T5J, T5R, T5I;
+			 T5Q = FMA(KP951056516, T5M, KP587785252 * T5P);
+			 T5S = FNMS(KP587785252, T5M, KP951056516 * T5P);
+			 T5I = FNMS(KP250000000, T5G, T5r);
+			 T5J = T5H + T5I;
+			 T5R = T5I - T5H;
+			 ci[WS(rs, 2)] = T5J - T5Q;
+			 ci[WS(rs, 7)] = T5R + T5S;
+			 cr[WS(rs, 7)] = T5J + T5Q;
+			 cr[WS(rs, 12)] = T5R - T5S;
+		    }
+		    {
+			 E T5o, T5q, T5h, T5p, T5g;
+			 T5o = FMA(KP951056516, T5k, KP587785252 * T5n);
+			 T5q = FNMS(KP587785252, T5k, KP951056516 * T5n);
+			 T5g = FNMS(KP250000000, T5e, T4J);
+			 T5h = T5f + T5g;
+			 T5p = T5g - T5f;
+			 ci[WS(rs, 1)] = T5h - T5o;
+			 ci[WS(rs, 6)] = T5p + T5q;
+			 cr[WS(rs, 8)] = T5h + T5o;
+			 ci[WS(rs, 11)] = T5p - T5q;
+		    }
+		    {
+			 E T6A, T6B, T6x, T6C, T6w;
+			 T6A = FMA(KP587785252, T6y, KP951056516 * T6z);
+			 T6B = FNMS(KP587785252, T6z, KP951056516 * T6y);
+			 T6w = FNMS(KP250000000, T6v, T6u);
+			 T6x = T6t - T6w;
+			 T6C = T6t + T6w;
+			 cr[WS(rs, 13)] = T6x - T6A;
+			 ci[WS(rs, 16)] = T6B + T6C;
+			 cr[WS(rs, 18)] = T6A + T6x;
+			 cr[WS(rs, 23)] = T6B - T6C;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 25},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 25, "hf_25", twinstr, &GENUS, {260, 140, 140, 0} };
+
+void X(codelet_hf_25) (planner *p) {
+     X(khc2hc_register) (p, hf_25, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:49 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include hf.h */
+
+/*
+ * This function contains 16 FP additions, 14 FP multiplications,
+ * (or, 6 additions, 4 multiplications, 10 fused multiply/add),
+ * 21 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hf.h"
+
+static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E T1, Tl, T9, Tc, Tb, Th, T7, Ti, Ta, Tj, Td;
+	       T1 = cr[0];
+	       Tl = ci[0];
+	       {
+		    E T3, T6, T2, T5, Tg, T4, T8;
+		    T3 = cr[WS(rs, 1)];
+		    T6 = ci[WS(rs, 1)];
+		    T2 = W[0];
+		    T5 = W[1];
+		    T9 = cr[WS(rs, 2)];
+		    Tc = ci[WS(rs, 2)];
+		    Tg = T2 * T6;
+		    T4 = T2 * T3;
+		    T8 = W[2];
+		    Tb = W[3];
+		    Th = FNMS(T5, T3, Tg);
+		    T7 = FMA(T5, T6, T4);
+		    Ti = T8 * Tc;
+		    Ta = T8 * T9;
+	       }
+	       Tj = FNMS(Tb, T9, Ti);
+	       Td = FMA(Tb, Tc, Ta);
+	       {
+		    E Tk, Te, To, Tn, Tm, Tf;
+		    Tk = Th - Tj;
+		    Tm = Th + Tj;
+		    Te = T7 + Td;
+		    To = Td - T7;
+		    ci[WS(rs, 2)] = Tm + Tl;
+		    Tn = FNMS(KP500000000, Tm, Tl);
+		    cr[0] = T1 + Te;
+		    Tf = FNMS(KP500000000, Te, T1);
+		    ci[WS(rs, 1)] = FMA(KP866025403, To, Tn);
+		    cr[WS(rs, 2)] = FMS(KP866025403, To, Tn);
+		    cr[WS(rs, 1)] = FMA(KP866025403, Tk, Tf);
+		    ci[0] = FNMS(KP866025403, Tk, Tf);
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, {6, 4, 10, 0} };
+
+void X(codelet_hf_3) (planner *p) {
+     X(khc2hc_register) (p, hf_3, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include hf.h */
+
+/*
+ * This function contains 16 FP additions, 12 FP multiplications,
+ * (or, 10 additions, 6 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hf.h"
+
+static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
+	       E T1, Ti, T6, Te, Tb, Tf, Tc, Tj;
+	       T1 = cr[0];
+	       Ti = ci[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = cr[WS(rs, 1)];
+		    T5 = ci[WS(rs, 1)];
+		    T2 = W[0];
+		    T4 = W[1];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    Te = FNMS(T4, T3, T2 * T5);
+	       }
+	       {
+		    E T8, Ta, T7, T9;
+		    T8 = cr[WS(rs, 2)];
+		    Ta = ci[WS(rs, 2)];
+		    T7 = W[2];
+		    T9 = W[3];
+		    Tb = FMA(T7, T8, T9 * Ta);
+		    Tf = FNMS(T9, T8, T7 * Ta);
+	       }
+	       Tc = T6 + Tb;
+	       Tj = Te + Tf;
+	       {
+		    E Td, Tg, Th, Tk;
+		    cr[0] = T1 + Tc;
+		    Td = FNMS(KP500000000, Tc, T1);
+		    Tg = KP866025403 * (Te - Tf);
+		    ci[0] = Td - Tg;
+		    cr[WS(rs, 1)] = Td + Tg;
+		    ci[WS(rs, 2)] = Tj + Ti;
+		    Th = KP866025403 * (Tb - T6);
+		    Tk = FNMS(KP500000000, Tj, Ti);
+		    cr[WS(rs, 2)] = Th - Tk;
+		    ci[WS(rs, 1)] = Th + Tk;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 3},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, {10, 6, 6, 0} };
+
+void X(codelet_hf_3) (planner *p) {
+     X(khc2hc_register) (p, hf_3, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1769 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:54 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hf_32 -include hf.h */
+
+/*
+ * This function contains 434 FP additions, 260 FP multiplications,
+ * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
+ * 135 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hf.h"
+
+static void hf_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E T6D, T6A;
+	       {
+		    E T8y, T87, T8, T3w, T83, T3B, T8x, Tl, T6G, Tz, T3J, T5T, T6F, TM, T3Q;
+		    E T5U, T46, T5X, T7E, T6M, T5Y, T3Z, T6J, T1f, T7D, T6R, T61, T4e, T6O, T1G;
+		    E T60, T4l, T54, T6c, T7d, T7N, T32, T76, T6f, T5r, T4v, T65, T72, T7I, T29;
+		    E T6V, T68, T4S, T5t, T5b, T7O, T79, T7e, T3t, T5s, T5i, T4H, T2y, T4B, T6X;
+		    E T2m, T4w, T4F, T2s;
+		    {
+			 E T44, T1d, T3X, T6K, T11, T40, T42, T17, T5h, T5c;
+			 {
+			      E Ta, Td, Tg, T3x, Tb, Tj, Tf, Tc, Ti;
+			      {
+				   E T1, T86, T3, T6, T2, T5;
+				   T1 = cr[0];
+				   T86 = ci[0];
+				   T3 = cr[WS(rs, 16)];
+				   T6 = ci[WS(rs, 16)];
+				   T2 = W[30];
+				   T5 = W[31];
+				   {
+					E T84, T4, T9, T85, T7;
+					Ta = cr[WS(rs, 8)];
+					Td = ci[WS(rs, 8)];
+					T84 = T2 * T6;
+					T4 = T2 * T3;
+					T9 = W[14];
+					Tg = cr[WS(rs, 24)];
+					T85 = FNMS(T5, T3, T84);
+					T7 = FMA(T5, T6, T4);
+					T3x = T9 * Td;
+					Tb = T9 * Ta;
+					T8y = T86 - T85;
+					T87 = T85 + T86;
+					T8 = T1 + T7;
+					T3w = T1 - T7;
+					Tj = ci[WS(rs, 24)];
+					Tf = W[46];
+				   }
+				   Tc = W[15];
+				   Ti = W[47];
+			      }
+			      {
+				   E Tu, Tx, T3F, Ts, Tw, T3G, Tv;
+				   {
+					E To, Tr, Tp, T3E, Tq, Tt;
+					{
+					     E T3y, Te, T3A, Tk, T3z, Th, Tn;
+					     To = cr[WS(rs, 4)];
+					     T3z = Tf * Tj;
+					     Th = Tf * Tg;
+					     T3y = FNMS(Tc, Ta, T3x);
+					     Te = FMA(Tc, Td, Tb);
+					     T3A = FNMS(Ti, Tg, T3z);
+					     Tk = FMA(Ti, Tj, Th);
+					     Tr = ci[WS(rs, 4)];
+					     Tn = W[6];
+					     T83 = T3y + T3A;
+					     T3B = T3y - T3A;
+					     T8x = Te - Tk;
+					     Tl = Te + Tk;
+					     Tp = Tn * To;
+					     T3E = Tn * Tr;
+					}
+					Tq = W[7];
+					Tu = cr[WS(rs, 20)];
+					Tx = ci[WS(rs, 20)];
+					Tt = W[38];
+					T3F = FNMS(Tq, To, T3E);
+					Ts = FMA(Tq, Tr, Tp);
+					Tw = W[39];
+					T3G = Tt * Tx;
+					Tv = Tt * Tu;
+				   }
+				   {
+					E T3M, TF, TH, TK, TG, TJ, TE, TD, TC;
+					{
+					     E TB, T3H, Ty, TA, T3I, T3D, T3L;
+					     TB = cr[WS(rs, 28)];
+					     TE = ci[WS(rs, 28)];
+					     T3H = FNMS(Tw, Tu, T3G);
+					     Ty = FMA(Tw, Tx, Tv);
+					     TA = W[54];
+					     TD = W[55];
+					     T6G = T3F + T3H;
+					     T3I = T3F - T3H;
+					     Tz = Ts + Ty;
+					     T3D = Ts - Ty;
+					     T3L = TA * TE;
+					     TC = TA * TB;
+					     T3J = T3D - T3I;
+					     T5T = T3D + T3I;
+					     T3M = FNMS(TD, TB, T3L);
+					}
+					TF = FMA(TD, TE, TC);
+					TH = cr[WS(rs, 12)];
+					TK = ci[WS(rs, 12)];
+					TG = W[22];
+					TJ = W[23];
+					{
+					     E TU, T3U, T13, T16, T3W, T10, T12, T15, T41, T14;
+					     {
+						  E T19, T1c, T18, T1b, T3P, T3K;
+						  {
+						       E TQ, TT, T3N, TI, TP, TS;
+						       TQ = cr[WS(rs, 2)];
+						       TT = ci[WS(rs, 2)];
+						       T3N = TG * TK;
+						       TI = TG * TH;
+						       TP = W[2];
+						       TS = W[3];
+						       {
+							    E T3O, TL, T3T, TR;
+							    T3O = FNMS(TJ, TH, T3N);
+							    TL = FMA(TJ, TK, TI);
+							    T3T = TP * TT;
+							    TR = TP * TQ;
+							    T6F = T3M + T3O;
+							    T3P = T3M - T3O;
+							    TM = TF + TL;
+							    T3K = TF - TL;
+							    TU = FMA(TS, TT, TR);
+							    T3U = FNMS(TS, TQ, T3T);
+						       }
+						  }
+						  T3Q = T3K + T3P;
+						  T5U = T3K - T3P;
+						  T19 = cr[WS(rs, 26)];
+						  T1c = ci[WS(rs, 26)];
+						  T18 = W[50];
+						  T1b = W[51];
+						  {
+						       E TW, TZ, TY, T3V, TX, T43, T1a, TV;
+						       TW = cr[WS(rs, 18)];
+						       TZ = ci[WS(rs, 18)];
+						       T43 = T18 * T1c;
+						       T1a = T18 * T19;
+						       TV = W[34];
+						       TY = W[35];
+						       T44 = FNMS(T1b, T19, T43);
+						       T1d = FMA(T1b, T1c, T1a);
+						       T3V = TV * TZ;
+						       TX = TV * TW;
+						       T13 = cr[WS(rs, 10)];
+						       T16 = ci[WS(rs, 10)];
+						       T3W = FNMS(TY, TW, T3V);
+						       T10 = FMA(TY, TZ, TX);
+						       T12 = W[18];
+						       T15 = W[19];
+						  }
+					     }
+					     T3X = T3U - T3W;
+					     T6K = T3U + T3W;
+					     T11 = TU + T10;
+					     T40 = TU - T10;
+					     T41 = T12 * T16;
+					     T14 = T12 * T13;
+					     T42 = FNMS(T15, T13, T41);
+					     T17 = FMA(T15, T16, T14);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T49, T1l, T4j, T1E, T1u, T1x, T1w, T4b, T1r, T4g, T1v;
+			      {
+				   E T1A, T1D, T1C, T4i, T1B;
+				   {
+					E T1h, T1k, T1g, T1j, T48, T1i, T1z;
+					T1h = cr[WS(rs, 30)];
+					T1k = ci[WS(rs, 30)];
+					{
+					     E T6L, T45, T1e, T3Y;
+					     T6L = T42 + T44;
+					     T45 = T42 - T44;
+					     T1e = T17 + T1d;
+					     T3Y = T17 - T1d;
+					     T46 = T40 - T45;
+					     T5X = T40 + T45;
+					     T7E = T6K + T6L;
+					     T6M = T6K - T6L;
+					     T5Y = T3X - T3Y;
+					     T3Z = T3X + T3Y;
+					     T6J = T11 - T1e;
+					     T1f = T11 + T1e;
+					     T1g = W[58];
+					}
+					T1j = W[59];
+					T1A = cr[WS(rs, 22)];
+					T1D = ci[WS(rs, 22)];
+					T48 = T1g * T1k;
+					T1i = T1g * T1h;
+					T1z = W[42];
+					T1C = W[43];
+					T49 = FNMS(T1j, T1h, T48);
+					T1l = FMA(T1j, T1k, T1i);
+					T4i = T1z * T1D;
+					T1B = T1z * T1A;
+				   }
+				   {
+					E T1n, T1q, T1m, T1p, T4a, T1o, T1t;
+					T1n = cr[WS(rs, 14)];
+					T1q = ci[WS(rs, 14)];
+					T4j = FNMS(T1C, T1A, T4i);
+					T1E = FMA(T1C, T1D, T1B);
+					T1m = W[26];
+					T1p = W[27];
+					T1u = cr[WS(rs, 6)];
+					T1x = ci[WS(rs, 6)];
+					T4a = T1m * T1q;
+					T1o = T1m * T1n;
+					T1t = W[10];
+					T1w = W[11];
+					T4b = FNMS(T1p, T1n, T4a);
+					T1r = FMA(T1p, T1q, T1o);
+					T4g = T1t * T1x;
+					T1v = T1t * T1u;
+				   }
+			      }
+			      {
+				   E T4c, T6P, T1s, T4f, T4h, T1y;
+				   T4c = T49 - T4b;
+				   T6P = T49 + T4b;
+				   T1s = T1l + T1r;
+				   T4f = T1l - T1r;
+				   T4h = FNMS(T1w, T1u, T4g);
+				   T1y = FMA(T1w, T1x, T1v);
+				   {
+					E T4k, T6Q, T4d, T1F;
+					T4k = T4h - T4j;
+					T6Q = T4h + T4j;
+					T4d = T1y - T1E;
+					T1F = T1y + T1E;
+					T7D = T6P + T6Q;
+					T6R = T6P - T6Q;
+					T61 = T4c - T4d;
+					T4e = T4c + T4d;
+					T6O = T1s - T1F;
+					T1G = T1s + T1F;
+					T60 = T4f + T4k;
+					T4l = T4f - T4k;
+				   }
+			      }
+			 }
+			 {
+			      E T5n, T2H, T52, T30, T2Q, T2T, T2S, T5p, T2N, T4Z, T2R;
+			      {
+				   E T2W, T2Z, T2Y, T51, T2X;
+				   {
+					E T2D, T2G, T2C, T2F, T5m, T2E, T2V;
+					T2D = cr[WS(rs, 31)];
+					T2G = ci[WS(rs, 31)];
+					T2C = W[60];
+					T2F = W[61];
+					T2W = cr[WS(rs, 23)];
+					T2Z = ci[WS(rs, 23)];
+					T5m = T2C * T2G;
+					T2E = T2C * T2D;
+					T2V = W[44];
+					T2Y = W[45];
+					T5n = FNMS(T2F, T2D, T5m);
+					T2H = FMA(T2F, T2G, T2E);
+					T51 = T2V * T2Z;
+					T2X = T2V * T2W;
+				   }
+				   {
+					E T2J, T2M, T2I, T2L, T5o, T2K, T2P;
+					T2J = cr[WS(rs, 15)];
+					T2M = ci[WS(rs, 15)];
+					T52 = FNMS(T2Y, T2W, T51);
+					T30 = FMA(T2Y, T2Z, T2X);
+					T2I = W[28];
+					T2L = W[29];
+					T2Q = cr[WS(rs, 7)];
+					T2T = ci[WS(rs, 7)];
+					T5o = T2I * T2M;
+					T2K = T2I * T2J;
+					T2P = W[12];
+					T2S = W[13];
+					T5p = FNMS(T2L, T2J, T5o);
+					T2N = FMA(T2L, T2M, T2K);
+					T4Z = T2P * T2T;
+					T2R = T2P * T2Q;
+				   }
+			      }
+			      {
+				   E T5q, T7b, T2O, T4Y, T50, T2U;
+				   T5q = T5n - T5p;
+				   T7b = T5n + T5p;
+				   T2O = T2H + T2N;
+				   T4Y = T2H - T2N;
+				   T50 = FNMS(T2S, T2Q, T4Z);
+				   T2U = FMA(T2S, T2T, T2R);
+				   {
+					E T7c, T53, T31, T5l;
+					T7c = T50 + T52;
+					T53 = T50 - T52;
+					T31 = T2U + T30;
+					T5l = T30 - T2U;
+					T54 = T4Y - T53;
+					T6c = T4Y + T53;
+					T7d = T7b - T7c;
+					T7N = T7b + T7c;
+					T32 = T2O + T31;
+					T76 = T2O - T31;
+					T6f = T5q + T5l;
+					T5r = T5l - T5q;
+				   }
+			      }
+			 }
+			 {
+			      E T4N, T1O, T4t, T27, T1X, T20, T1Z, T4P, T1U, T4q, T1Y;
+			      {
+				   E T23, T26, T25, T4s, T24;
+				   {
+					E T1K, T1N, T1J, T1M, T4M, T1L, T22;
+					T1K = cr[WS(rs, 1)];
+					T1N = ci[WS(rs, 1)];
+					T1J = W[0];
+					T1M = W[1];
+					T23 = cr[WS(rs, 25)];
+					T26 = ci[WS(rs, 25)];
+					T4M = T1J * T1N;
+					T1L = T1J * T1K;
+					T22 = W[48];
+					T25 = W[49];
+					T4N = FNMS(T1M, T1K, T4M);
+					T1O = FMA(T1M, T1N, T1L);
+					T4s = T22 * T26;
+					T24 = T22 * T23;
+				   }
+				   {
+					E T1Q, T1T, T1P, T1S, T4O, T1R, T1W;
+					T1Q = cr[WS(rs, 17)];
+					T1T = ci[WS(rs, 17)];
+					T4t = FNMS(T25, T23, T4s);
+					T27 = FMA(T25, T26, T24);
+					T1P = W[32];
+					T1S = W[33];
+					T1X = cr[WS(rs, 9)];
+					T20 = ci[WS(rs, 9)];
+					T4O = T1P * T1T;
+					T1R = T1P * T1Q;
+					T1W = W[16];
+					T1Z = W[17];
+					T4P = FNMS(T1S, T1Q, T4O);
+					T1U = FMA(T1S, T1T, T1R);
+					T4q = T1W * T20;
+					T1Y = T1W * T1X;
+				   }
+			      }
+			      {
+				   E T4Q, T70, T1V, T4p, T4r, T21;
+				   T4Q = T4N - T4P;
+				   T70 = T4N + T4P;
+				   T1V = T1O + T1U;
+				   T4p = T1O - T1U;
+				   T4r = FNMS(T1Z, T1X, T4q);
+				   T21 = FMA(T1Z, T20, T1Y);
+				   {
+					E T71, T4u, T4R, T28;
+					T71 = T4r + T4t;
+					T4u = T4r - T4t;
+					T4R = T21 - T27;
+					T28 = T21 + T27;
+					T4v = T4p - T4u;
+					T65 = T4p + T4u;
+					T72 = T70 - T71;
+					T7I = T70 + T71;
+					T29 = T1V + T28;
+					T6V = T1V - T28;
+					T68 = T4Q - T4R;
+					T4S = T4Q + T4R;
+				   }
+			      }
+			 }
+			 {
+			      E T57, T38, T5g, T3r, T3h, T3k, T3j, T59, T3e, T5d, T3i;
+			      {
+				   E T3n, T3q, T3p, T5f, T3o;
+				   {
+					E T34, T37, T33, T36, T56, T35, T3m;
+					T34 = cr[WS(rs, 3)];
+					T37 = ci[WS(rs, 3)];
+					T33 = W[4];
+					T36 = W[5];
+					T3n = cr[WS(rs, 11)];
+					T3q = ci[WS(rs, 11)];
+					T56 = T33 * T37;
+					T35 = T33 * T34;
+					T3m = W[20];
+					T3p = W[21];
+					T57 = FNMS(T36, T34, T56);
+					T38 = FMA(T36, T37, T35);
+					T5f = T3m * T3q;
+					T3o = T3m * T3n;
+				   }
+				   {
+					E T3a, T3d, T39, T3c, T58, T3b, T3g;
+					T3a = cr[WS(rs, 19)];
+					T3d = ci[WS(rs, 19)];
+					T5g = FNMS(T3p, T3n, T5f);
+					T3r = FMA(T3p, T3q, T3o);
+					T39 = W[36];
+					T3c = W[37];
+					T3h = cr[WS(rs, 27)];
+					T3k = ci[WS(rs, 27)];
+					T58 = T39 * T3d;
+					T3b = T39 * T3a;
+					T3g = W[52];
+					T3j = W[53];
+					T59 = FNMS(T3c, T3a, T58);
+					T3e = FMA(T3c, T3d, T3b);
+					T5d = T3g * T3k;
+					T3i = T3g * T3h;
+				   }
+			      }
+			      {
+				   E T5a, T78, T3f, T55, T5e, T3l, T77, T3s;
+				   T5a = T57 - T59;
+				   T78 = T57 + T59;
+				   T3f = T38 + T3e;
+				   T55 = T38 - T3e;
+				   T5e = FNMS(T3j, T3h, T5d);
+				   T3l = FMA(T3j, T3k, T3i);
+				   T5h = T5e - T5g;
+				   T77 = T5e + T5g;
+				   T3s = T3l + T3r;
+				   T5c = T3l - T3r;
+				   T5t = T55 + T5a;
+				   T5b = T55 - T5a;
+				   T7O = T78 + T77;
+				   T79 = T77 - T78;
+				   T7e = T3s - T3f;
+				   T3t = T3f + T3s;
+			      }
+			 }
+			 {
+			      E T4y, T2f, T2o, T2r, T4A, T2l, T2n, T2q, T4E, T2p;
+			      {
+				   E T2u, T2x, T2t, T2w;
+				   {
+					E T2b, T2e, T2d, T4x, T2c, T2a;
+					T2b = cr[WS(rs, 5)];
+					T2e = ci[WS(rs, 5)];
+					T2a = W[8];
+					T5s = T5c - T5h;
+					T5i = T5c + T5h;
+					T2d = W[9];
+					T4x = T2a * T2e;
+					T2c = T2a * T2b;
+					T2u = cr[WS(rs, 13)];
+					T2x = ci[WS(rs, 13)];
+					T4y = FNMS(T2d, T2b, T4x);
+					T2f = FMA(T2d, T2e, T2c);
+					T2t = W[24];
+					T2w = W[25];
+				   }
+				   {
+					E T2h, T2k, T2j, T4z, T2i, T4G, T2v, T2g;
+					T2h = cr[WS(rs, 21)];
+					T2k = ci[WS(rs, 21)];
+					T4G = T2t * T2x;
+					T2v = T2t * T2u;
+					T2g = W[40];
+					T2j = W[41];
+					T4H = FNMS(T2w, T2u, T4G);
+					T2y = FMA(T2w, T2x, T2v);
+					T4z = T2g * T2k;
+					T2i = T2g * T2h;
+					T2o = cr[WS(rs, 29)];
+					T2r = ci[WS(rs, 29)];
+					T4A = FNMS(T2j, T2h, T4z);
+					T2l = FMA(T2j, T2k, T2i);
+					T2n = W[56];
+					T2q = W[57];
+				   }
+			      }
+			      T4B = T4y - T4A;
+			      T6X = T4y + T4A;
+			      T2m = T2f + T2l;
+			      T4w = T2f - T2l;
+			      T4E = T2n * T2r;
+			      T2p = T2n * T2o;
+			      T4F = FNMS(T2q, T2o, T4E);
+			      T2s = FMA(T2q, T2r, T2p);
+			 }
+		    }
+		    {
+			 E T6E, T8j, T6Y, T73, T6H, T8k, T5S, T8O, T8N, T5V, T6g, T6d, T69, T66, T5O;
+			 E T5R;
+			 {
+			      E T4T, T4C, T4J, T4U, T7S, T7V;
+			      {
+				   E T7C, TO, T80, T7Z, T8e, T89, T8d, T1H, T8b, T3v, T7T, T7L, T7U, T7Q, T2A;
+				   E T7P, T7K, T7W, T1I;
+				   {
+					E T7X, T7Y, T7J, T82, T88;
+					{
+					     E Tm, T4I, T6W, T4D, T2z, TN;
+					     T6E = T8 - Tl;
+					     Tm = T8 + Tl;
+					     T4T = T4w + T4B;
+					     T4C = T4w - T4B;
+					     T4I = T4F - T4H;
+					     T6W = T4F + T4H;
+					     T4D = T2s - T2y;
+					     T2z = T2s + T2y;
+					     TN = Tz + TM;
+					     T8j = Tz - TM;
+					     T6Y = T6W - T6X;
+					     T7J = T6X + T6W;
+					     T4J = T4D + T4I;
+					     T4U = T4I - T4D;
+					     T2A = T2m + T2z;
+					     T73 = T2m - T2z;
+					     T7C = Tm - TN;
+					     TO = Tm + TN;
+					}
+					T7P = T7N - T7O;
+					T7X = T7N + T7O;
+					T7Y = T7I + T7J;
+					T7K = T7I - T7J;
+					T6H = T6F - T6G;
+					T82 = T6G + T6F;
+					T88 = T83 + T87;
+					T8k = T87 - T83;
+					T80 = T7Y + T7X;
+					T7Z = T7X - T7Y;
+					T8e = T88 - T82;
+					T89 = T82 + T88;
+				   }
+				   {
+					E T7H, T7M, T2B, T3u;
+					T7H = T29 - T2A;
+					T2B = T29 + T2A;
+					T3u = T32 + T3t;
+					T7M = T32 - T3t;
+					T8d = T1f - T1G;
+					T1H = T1f + T1G;
+					T8b = T3u - T2B;
+					T3v = T2B + T3u;
+					T7T = T7H - T7K;
+					T7L = T7H + T7K;
+					T7U = T7M + T7P;
+					T7Q = T7M - T7P;
+				   }
+				   T7W = TO - T1H;
+				   T1I = TO + T1H;
+				   {
+					E T8g, T8h, T8f, T8i;
+					{
+					     E T7R, T8c, T8a, T7G, T81, T7F;
+					     T8g = T7Q - T7L;
+					     T7R = T7L + T7Q;
+					     T81 = T7E + T7D;
+					     T7F = T7D - T7E;
+					     cr[0] = T1I + T3v;
+					     ci[WS(rs, 15)] = T1I - T3v;
+					     ci[WS(rs, 7)] = T7W + T7Z;
+					     cr[WS(rs, 8)] = T7W - T7Z;
+					     T8c = T89 - T81;
+					     T8a = T81 + T89;
+					     T7G = T7C - T7F;
+					     T7S = T7C + T7F;
+					     T8h = T8e - T8d;
+					     T8f = T8d + T8e;
+					     ci[WS(rs, 23)] = T8b + T8c;
+					     cr[WS(rs, 24)] = T8b - T8c;
+					     ci[WS(rs, 31)] = T80 + T8a;
+					     cr[WS(rs, 16)] = T80 - T8a;
+					     cr[WS(rs, 4)] = FMA(KP707106781, T7R, T7G);
+					     ci[WS(rs, 11)] = FNMS(KP707106781, T7R, T7G);
+					}
+					T8i = T7U - T7T;
+					T7V = T7T + T7U;
+					ci[WS(rs, 19)] = FMA(KP707106781, T8g, T8f);
+					cr[WS(rs, 28)] = FMS(KP707106781, T8g, T8f);
+					ci[WS(rs, 27)] = FMA(KP707106781, T8i, T8h);
+					cr[WS(rs, 20)] = FMS(KP707106781, T8i, T8h);
+				   }
+			      }
+			      {
+				   E T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T4L, T5H, T5M, T5Q, T5A, T5w, T4V;
+				   {
+					E T5D, T47, T4m, T5E, T8z, T8A, T3C, T3R, T5j, T5u;
+					T5S = T3w + T3B;
+					T3C = T3w - T3B;
+					T3R = T3J + T3Q;
+					T8O = T3Q - T3J;
+					T5D = FNMS(KP414213562, T3Z, T46);
+					T47 = FMA(KP414213562, T46, T3Z);
+					ci[WS(rs, 3)] = FMA(KP707106781, T7V, T7S);
+					cr[WS(rs, 12)] = FNMS(KP707106781, T7V, T7S);
+					T5C = FMA(KP707106781, T3R, T3C);
+					T3S = FNMS(KP707106781, T3R, T3C);
+					T4m = FNMS(KP414213562, T4l, T4e);
+					T5E = FMA(KP414213562, T4e, T4l);
+					T8N = T8y - T8x;
+					T8z = T8x + T8y;
+					T8A = T5T - T5U;
+					T5V = T5T + T5U;
+					T8C = T47 + T4m;
+					T4n = T47 - T4m;
+					T8H = FNMS(KP707106781, T8A, T8z);
+					T8B = FMA(KP707106781, T8A, T8z);
+					T6g = T5i - T5b;
+					T5j = T5b + T5i;
+					T5u = T5s - T5t;
+					T6d = T5t + T5s;
+					{
+					     E T5K, T5k, T5L, T5v, T4K;
+					     T69 = T4J - T4C;
+					     T4K = T4C + T4J;
+					     T8I = T5E - T5D;
+					     T5F = T5D + T5E;
+					     T5K = FMA(KP707106781, T5j, T54);
+					     T5k = FNMS(KP707106781, T5j, T54);
+					     T5L = FMA(KP707106781, T5u, T5r);
+					     T5v = FNMS(KP707106781, T5u, T5r);
+					     T4L = FNMS(KP707106781, T4K, T4v);
+					     T5H = FMA(KP707106781, T4K, T4v);
+					     T5M = FNMS(KP198912367, T5L, T5K);
+					     T5Q = FMA(KP198912367, T5K, T5L);
+					     T5A = FNMS(KP668178637, T5k, T5v);
+					     T5w = FMA(KP668178637, T5v, T5k);
+					     T4V = T4T + T4U;
+					     T66 = T4T - T4U;
+					}
+				   }
+				   {
+					E T5y, T4o, T8J, T8L, T5I, T4W;
+					T5y = FNMS(KP923879532, T4n, T3S);
+					T4o = FMA(KP923879532, T4n, T3S);
+					T8J = FMA(KP923879532, T8I, T8H);
+					T8L = FNMS(KP923879532, T8I, T8H);
+					T5I = FMA(KP707106781, T4V, T4S);
+					T4W = FNMS(KP707106781, T4V, T4S);
+					{
+					     E T8G, T8F, T8D, T8E;
+					     {
+						  E T5G, T5P, T5z, T4X, T5N, T5J;
+						  T5O = FNMS(KP923879532, T5F, T5C);
+						  T5G = FMA(KP923879532, T5F, T5C);
+						  T5J = FNMS(KP198912367, T5I, T5H);
+						  T5P = FMA(KP198912367, T5H, T5I);
+						  T5z = FNMS(KP668178637, T4L, T4W);
+						  T4X = FMA(KP668178637, T4W, T4L);
+						  T5N = T5J + T5M;
+						  T8G = T5M - T5J;
+						  T8F = FNMS(KP923879532, T8C, T8B);
+						  T8D = FMA(KP923879532, T8C, T8B);
+						  {
+						       E T5B, T8K, T8M, T5x;
+						       T5B = T5z + T5A;
+						       T8K = T5z - T5A;
+						       T8M = T5w - T4X;
+						       T5x = T4X + T5w;
+						       ci[0] = FMA(KP980785280, T5N, T5G);
+						       cr[WS(rs, 15)] = FNMS(KP980785280, T5N, T5G);
+						       ci[WS(rs, 4)] = FNMS(KP831469612, T5B, T5y);
+						       cr[WS(rs, 11)] = FMA(KP831469612, T5B, T5y);
+						       ci[WS(rs, 28)] = FMA(KP831469612, T8K, T8J);
+						       cr[WS(rs, 19)] = FMS(KP831469612, T8K, T8J);
+						       ci[WS(rs, 20)] = FMA(KP831469612, T8M, T8L);
+						       cr[WS(rs, 27)] = FMS(KP831469612, T8M, T8L);
+						       cr[WS(rs, 3)] = FMA(KP831469612, T5x, T4o);
+						       ci[WS(rs, 12)] = FNMS(KP831469612, T5x, T4o);
+						       T8E = T5Q - T5P;
+						       T5R = T5P + T5Q;
+						  }
+					     }
+					     ci[WS(rs, 16)] = FMA(KP980785280, T8E, T8D);
+					     cr[WS(rs, 31)] = FMS(KP980785280, T8E, T8D);
+					     ci[WS(rs, 24)] = FMA(KP980785280, T8G, T8F);
+					     cr[WS(rs, 23)] = FMS(KP980785280, T8G, T8F);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T7y, T8q, T8p, T7B;
+			      {
+				   E T7a, T7m, T6I, T7f, T7A, T7w, T8r, T8l, T8m, T6T, T7k, T75, T8s, T7p, T7z;
+				   E T7t;
+				   {
+					E T7n, T6N, T6S, T7o, T7u, T7v;
+					T7a = T76 - T79;
+					T7u = T76 + T79;
+					cr[WS(rs, 7)] = FMA(KP980785280, T5R, T5O);
+					ci[WS(rs, 8)] = FNMS(KP980785280, T5R, T5O);
+					T7m = T6E + T6H;
+					T6I = T6E - T6H;
+					T7v = T7e - T7d;
+					T7f = T7d + T7e;
+					T7n = T6J - T6M;
+					T6N = T6J + T6M;
+					T7A = FMA(KP414213562, T7u, T7v);
+					T7w = FNMS(KP414213562, T7v, T7u);
+					T8r = T8k - T8j;
+					T8l = T8j + T8k;
+					T6S = T6O - T6R;
+					T7o = T6O + T6R;
+					{
+					     E T7r, T7s, T6Z, T74;
+					     T7r = T6V + T6Y;
+					     T6Z = T6V - T6Y;
+					     T74 = T72 - T73;
+					     T7s = T72 + T73;
+					     T8m = T6N - T6S;
+					     T6T = T6N + T6S;
+					     T7k = FNMS(KP414213562, T6Z, T74);
+					     T75 = FMA(KP414213562, T74, T6Z);
+					     T8s = T7o - T7n;
+					     T7p = T7n + T7o;
+					     T7z = FMA(KP414213562, T7r, T7s);
+					     T7t = FNMS(KP414213562, T7s, T7r);
+					}
+				   }
+				   {
+					E T7i, T6U, T8t, T8v, T7j, T7g;
+					T7i = FNMS(KP707106781, T6T, T6I);
+					T6U = FMA(KP707106781, T6T, T6I);
+					T8t = FMA(KP707106781, T8s, T8r);
+					T8v = FNMS(KP707106781, T8s, T8r);
+					T7j = FMA(KP414213562, T7a, T7f);
+					T7g = FNMS(KP414213562, T7f, T7a);
+					{
+					     E T7q, T7x, T8n, T8o;
+					     T7y = FNMS(KP707106781, T7p, T7m);
+					     T7q = FMA(KP707106781, T7p, T7m);
+					     {
+						  E T7l, T8u, T8w, T7h;
+						  T7l = T7j - T7k;
+						  T8u = T7k + T7j;
+						  T8w = T7g - T75;
+						  T7h = T75 + T7g;
+						  ci[WS(rs, 5)] = FMA(KP923879532, T7l, T7i);
+						  cr[WS(rs, 10)] = FNMS(KP923879532, T7l, T7i);
+						  ci[WS(rs, 29)] = FMA(KP923879532, T8u, T8t);
+						  cr[WS(rs, 18)] = FMS(KP923879532, T8u, T8t);
+						  ci[WS(rs, 21)] = FMA(KP923879532, T8w, T8v);
+						  cr[WS(rs, 26)] = FMS(KP923879532, T8w, T8v);
+						  cr[WS(rs, 2)] = FMA(KP923879532, T7h, T6U);
+						  ci[WS(rs, 13)] = FNMS(KP923879532, T7h, T6U);
+						  T7x = T7t + T7w;
+						  T8q = T7w - T7t;
+					     }
+					     T8p = FNMS(KP707106781, T8m, T8l);
+					     T8n = FMA(KP707106781, T8m, T8l);
+					     T8o = T7A - T7z;
+					     T7B = T7z + T7A;
+					     ci[WS(rs, 1)] = FMA(KP923879532, T7x, T7q);
+					     cr[WS(rs, 14)] = FNMS(KP923879532, T7x, T7q);
+					     ci[WS(rs, 17)] = FMA(KP923879532, T8o, T8n);
+					     cr[WS(rs, 30)] = FMS(KP923879532, T8o, T8n);
+					}
+				   }
+			      }
+			      {
+				   E T6o, T5W, T8W, T63, T8V, T8P, T8Q, T6r, T6e, T6w;
+				   {
+					E T6q, T6p, T5Z, T62;
+					ci[WS(rs, 25)] = FMA(KP923879532, T8q, T8p);
+					cr[WS(rs, 22)] = FMS(KP923879532, T8q, T8p);
+					cr[WS(rs, 6)] = FMA(KP923879532, T7B, T7y);
+					ci[WS(rs, 9)] = FNMS(KP923879532, T7B, T7y);
+					T6q = FNMS(KP414213562, T5X, T5Y);
+					T5Z = FMA(KP414213562, T5Y, T5X);
+					T62 = FNMS(KP414213562, T61, T60);
+					T6p = FMA(KP414213562, T60, T61);
+					T6o = FNMS(KP707106781, T5V, T5S);
+					T5W = FMA(KP707106781, T5V, T5S);
+					T8W = T5Z - T62;
+					T63 = T5Z + T62;
+					T8V = FNMS(KP707106781, T8O, T8N);
+					T8P = FMA(KP707106781, T8O, T8N);
+					T8Q = T6q + T6p;
+					T6r = T6p - T6q;
+					T6e = FMA(KP707106781, T6d, T6c);
+					T6w = FNMS(KP707106781, T6d, T6c);
+				   }
+				   {
+					E T6k, T8U, T6z, T6n, T8S, T8T, T8R, T6s;
+					{
+					     E T64, T6y, T6l, T6i, T6v, T6m, T6b, T8X, T8Z, T8Y, T6j, T90;
+					     {
+						  E T6C, T6B, T6x, T6h;
+						  T6k = FNMS(KP923879532, T63, T5W);
+						  T64 = FMA(KP923879532, T63, T5W);
+						  T6x = FNMS(KP707106781, T6g, T6f);
+						  T6h = FMA(KP707106781, T6g, T6f);
+						  {
+						       E T6t, T67, T6u, T6a;
+						       T6t = FNMS(KP707106781, T66, T65);
+						       T67 = FMA(KP707106781, T66, T65);
+						       T6u = FNMS(KP707106781, T69, T68);
+						       T6a = FMA(KP707106781, T69, T68);
+						       T6y = FMA(KP668178637, T6x, T6w);
+						       T6C = FNMS(KP668178637, T6w, T6x);
+						       T6l = FMA(KP198912367, T6e, T6h);
+						       T6i = FNMS(KP198912367, T6h, T6e);
+						       T6v = FNMS(KP668178637, T6u, T6t);
+						       T6B = FMA(KP668178637, T6t, T6u);
+						       T6m = FNMS(KP198912367, T67, T6a);
+						       T6b = FMA(KP198912367, T6a, T67);
+						  }
+						  T8X = FMA(KP923879532, T8W, T8V);
+						  T8Z = FNMS(KP923879532, T8W, T8V);
+						  T6D = T6B - T6C;
+						  T8Y = T6B + T6C;
+					     }
+					     T8U = T6i - T6b;
+					     T6j = T6b + T6i;
+					     T90 = T6y - T6v;
+					     T6z = T6v + T6y;
+					     ci[WS(rs, 18)] = FNMS(KP831469612, T8Y, T8X);
+					     cr[WS(rs, 29)] = -(FMA(KP831469612, T8Y, T8X));
+					     cr[WS(rs, 1)] = FMA(KP980785280, T6j, T64);
+					     ci[WS(rs, 14)] = FNMS(KP980785280, T6j, T64);
+					     cr[WS(rs, 21)] = FMS(KP831469612, T90, T8Z);
+					     ci[WS(rs, 26)] = FMA(KP831469612, T90, T8Z);
+					     T6n = T6l - T6m;
+					     T8S = T6m + T6l;
+					}
+					T6A = FNMS(KP923879532, T6r, T6o);
+					T6s = FMA(KP923879532, T6r, T6o);
+					T8T = FNMS(KP923879532, T8Q, T8P);
+					T8R = FMA(KP923879532, T8Q, T8P);
+					ci[WS(rs, 6)] = FMA(KP980785280, T6n, T6k);
+					cr[WS(rs, 9)] = FNMS(KP980785280, T6n, T6k);
+					ci[WS(rs, 2)] = FMA(KP831469612, T6z, T6s);
+					cr[WS(rs, 13)] = FNMS(KP831469612, T6z, T6s);
+					ci[WS(rs, 30)] = FMA(KP980785280, T8S, T8R);
+					cr[WS(rs, 17)] = FMS(KP980785280, T8S, T8R);
+					ci[WS(rs, 22)] = FMA(KP980785280, T8U, T8T);
+					cr[WS(rs, 25)] = FMS(KP980785280, T8U, T8T);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 5)] = FMA(KP831469612, T6D, T6A);
+	       ci[WS(rs, 10)] = FNMS(KP831469612, T6D, T6A);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hf_32", twinstr, &GENUS, {236, 62, 198, 0} };
+
+void X(codelet_hf_32) (planner *p) {
+     X(khc2hc_register) (p, hf_32, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hf_32 -include hf.h */
+
+/*
+ * This function contains 434 FP additions, 208 FP multiplications,
+ * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
+ * 96 stack variables, 7 constants, and 128 memory accesses
+ */
+#include "hf.h"
+
+static void hf_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
+	       E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T56, T41;
+	       E T59, T2B, T67, T6e, T6O, T4b, T5g, T4s, T5d, TG, T7l, T5I, T73, T3a, T4U;
+	       E T3f, T4V, T14, T5K, T5N, T6F, T3m, T4Z, T3r, T4Y, T1r, T5P, T5S, T6E, T3x;
+	       E T52, T3C, T51, T2d, T5Z, T64, T6K, T3V, T5a, T44, T57, T2Y, T6f, T6a, T6P;
+	       E T4m, T5e, T4v, T5h;
+	       {
+		    E T1, T76, T6, T75, Tc, T32, Th, T33;
+		    T1 = cr[0];
+		    T76 = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 16)];
+			 T5 = ci[WS(rs, 16)];
+			 T2 = W[30];
+			 T4 = W[31];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T75 = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 8)];
+			 Tb = ci[WS(rs, 8)];
+			 T8 = W[14];
+			 Ta = W[15];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T32 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 24)];
+			 Tg = ci[WS(rs, 24)];
+			 Td = W[46];
+			 Tf = W[47];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T33 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, T7A, T7B;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 + Ti;
+			 T5F = T7 - Ti;
+			 T7A = Tc - Th;
+			 T7B = T76 - T75;
+			 T7C = T7A + T7B;
+			 T7Q = T7B - T7A;
+		    }
+		    {
+			 E T31, T34, T74, T77;
+			 T31 = T1 - T6;
+			 T34 = T32 - T33;
+			 T35 = T31 + T34;
+			 T4T = T31 - T34;
+			 T74 = T32 + T33;
+			 T77 = T75 + T76;
+			 T78 = T74 + T77;
+			 T7m = T77 - T74;
+		    }
+	       }
+	       {
+		    E T1y, T3X, T1O, T3I, T1D, T3Y, T1J, T3H;
+		    {
+			 E T1v, T1x, T1u, T1w;
+			 T1v = cr[WS(rs, 1)];
+			 T1x = ci[WS(rs, 1)];
+			 T1u = W[0];
+			 T1w = W[1];
+			 T1y = FMA(T1u, T1v, T1w * T1x);
+			 T3X = FNMS(T1w, T1v, T1u * T1x);
+		    }
+		    {
+			 E T1L, T1N, T1K, T1M;
+			 T1L = cr[WS(rs, 25)];
+			 T1N = ci[WS(rs, 25)];
+			 T1K = W[48];
+			 T1M = W[49];
+			 T1O = FMA(T1K, T1L, T1M * T1N);
+			 T3I = FNMS(T1M, T1L, T1K * T1N);
+		    }
+		    {
+			 E T1A, T1C, T1z, T1B;
+			 T1A = cr[WS(rs, 17)];
+			 T1C = ci[WS(rs, 17)];
+			 T1z = W[32];
+			 T1B = W[33];
+			 T1D = FMA(T1z, T1A, T1B * T1C);
+			 T3Y = FNMS(T1B, T1A, T1z * T1C);
+		    }
+		    {
+			 E T1G, T1I, T1F, T1H;
+			 T1G = cr[WS(rs, 9)];
+			 T1I = ci[WS(rs, 9)];
+			 T1F = W[16];
+			 T1H = W[17];
+			 T1J = FMA(T1F, T1G, T1H * T1I);
+			 T3H = FNMS(T1H, T1G, T1F * T1I);
+		    }
+		    {
+			 E T1E, T1P, T5W, T5X;
+			 T1E = T1y + T1D;
+			 T1P = T1J + T1O;
+			 T1Q = T1E + T1P;
+			 T61 = T1E - T1P;
+			 T5W = T3X + T3Y;
+			 T5X = T3H + T3I;
+			 T5Y = T5W - T5X;
+			 T6J = T5W + T5X;
+		    }
+		    {
+			 E T3G, T3J, T3Z, T40;
+			 T3G = T1y - T1D;
+			 T3J = T3H - T3I;
+			 T3K = T3G + T3J;
+			 T56 = T3G - T3J;
+			 T3Z = T3X - T3Y;
+			 T40 = T1J - T1O;
+			 T41 = T3Z - T40;
+			 T59 = T3Z + T40;
+		    }
+	       }
+	       {
+		    E T2j, T47, T2z, T4q, T2o, T48, T2u, T4p;
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = cr[WS(rs, 31)];
+			 T2i = ci[WS(rs, 31)];
+			 T2f = W[60];
+			 T2h = W[61];
+			 T2j = FMA(T2f, T2g, T2h * T2i);
+			 T47 = FNMS(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2w, T2y, T2v, T2x;
+			 T2w = cr[WS(rs, 23)];
+			 T2y = ci[WS(rs, 23)];
+			 T2v = W[44];
+			 T2x = W[45];
+			 T2z = FMA(T2v, T2w, T2x * T2y);
+			 T4q = FNMS(T2x, T2w, T2v * T2y);
+		    }
+		    {
+			 E T2l, T2n, T2k, T2m;
+			 T2l = cr[WS(rs, 15)];
+			 T2n = ci[WS(rs, 15)];
+			 T2k = W[28];
+			 T2m = W[29];
+			 T2o = FMA(T2k, T2l, T2m * T2n);
+			 T48 = FNMS(T2m, T2l, T2k * T2n);
+		    }
+		    {
+			 E T2r, T2t, T2q, T2s;
+			 T2r = cr[WS(rs, 7)];
+			 T2t = ci[WS(rs, 7)];
+			 T2q = W[12];
+			 T2s = W[13];
+			 T2u = FMA(T2q, T2r, T2s * T2t);
+			 T4p = FNMS(T2s, T2r, T2q * T2t);
+		    }
+		    {
+			 E T2p, T2A, T6c, T6d;
+			 T2p = T2j + T2o;
+			 T2A = T2u + T2z;
+			 T2B = T2p + T2A;
+			 T67 = T2p - T2A;
+			 T6c = T47 + T48;
+			 T6d = T4p + T4q;
+			 T6e = T6c - T6d;
+			 T6O = T6c + T6d;
+		    }
+		    {
+			 E T49, T4a, T4o, T4r;
+			 T49 = T47 - T48;
+			 T4a = T2u - T2z;
+			 T4b = T49 - T4a;
+			 T5g = T49 + T4a;
+			 T4o = T2j - T2o;
+			 T4r = T4p - T4q;
+			 T4s = T4o + T4r;
+			 T5d = T4o - T4r;
+		    }
+	       }
+	       {
+		    E To, T37, TE, T3d, Tt, T38, Tz, T3c;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = cr[WS(rs, 4)];
+			 Tn = ci[WS(rs, 4)];
+			 Tk = W[6];
+			 Tm = W[7];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T37 = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 12)];
+			 TD = ci[WS(rs, 12)];
+			 TA = W[22];
+			 TC = W[23];
+			 TE = FMA(TA, TB, TC * TD);
+			 T3d = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = cr[WS(rs, 20)];
+			 Ts = ci[WS(rs, 20)];
+			 Tp = W[38];
+			 Tr = W[39];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T38 = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 28)];
+			 Ty = ci[WS(rs, 28)];
+			 Tv = W[54];
+			 Tx = W[55];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T3c = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E Tu, TF, T5G, T5H;
+			 Tu = To + Tt;
+			 TF = Tz + TE;
+			 TG = Tu + TF;
+			 T7l = Tu - TF;
+			 T5G = T3c + T3d;
+			 T5H = T37 + T38;
+			 T5I = T5G - T5H;
+			 T73 = T5H + T5G;
+		    }
+		    {
+			 E T36, T39, T3b, T3e;
+			 T36 = To - Tt;
+			 T39 = T37 - T38;
+			 T3a = T36 + T39;
+			 T4U = T36 - T39;
+			 T3b = Tz - TE;
+			 T3e = T3c - T3d;
+			 T3f = T3b - T3e;
+			 T4V = T3b + T3e;
+		    }
+	       }
+	       {
+		    E TM, T3n, T12, T3k, TR, T3o, TX, T3j;
+		    {
+			 E TJ, TL, TI, TK;
+			 TJ = cr[WS(rs, 2)];
+			 TL = ci[WS(rs, 2)];
+			 TI = W[2];
+			 TK = W[3];
+			 TM = FMA(TI, TJ, TK * TL);
+			 T3n = FNMS(TK, TJ, TI * TL);
+		    }
+		    {
+			 E TZ, T11, TY, T10;
+			 TZ = cr[WS(rs, 26)];
+			 T11 = ci[WS(rs, 26)];
+			 TY = W[50];
+			 T10 = W[51];
+			 T12 = FMA(TY, TZ, T10 * T11);
+			 T3k = FNMS(T10, TZ, TY * T11);
+		    }
+		    {
+			 E TO, TQ, TN, TP;
+			 TO = cr[WS(rs, 18)];
+			 TQ = ci[WS(rs, 18)];
+			 TN = W[34];
+			 TP = W[35];
+			 TR = FMA(TN, TO, TP * TQ);
+			 T3o = FNMS(TP, TO, TN * TQ);
+		    }
+		    {
+			 E TU, TW, TT, TV;
+			 TU = cr[WS(rs, 10)];
+			 TW = ci[WS(rs, 10)];
+			 TT = W[18];
+			 TV = W[19];
+			 TX = FMA(TT, TU, TV * TW);
+			 T3j = FNMS(TV, TU, TT * TW);
+		    }
+		    {
+			 E TS, T13, T5L, T5M;
+			 TS = TM + TR;
+			 T13 = TX + T12;
+			 T14 = TS + T13;
+			 T5K = TS - T13;
+			 T5L = T3n + T3o;
+			 T5M = T3j + T3k;
+			 T5N = T5L - T5M;
+			 T6F = T5L + T5M;
+		    }
+		    {
+			 E T3i, T3l, T3p, T3q;
+			 T3i = TM - TR;
+			 T3l = T3j - T3k;
+			 T3m = T3i + T3l;
+			 T4Z = T3i - T3l;
+			 T3p = T3n - T3o;
+			 T3q = TX - T12;
+			 T3r = T3p - T3q;
+			 T4Y = T3p + T3q;
+		    }
+	       }
+	       {
+		    E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
+		    {
+			 E T16, T18, T15, T17;
+			 T16 = cr[WS(rs, 30)];
+			 T18 = ci[WS(rs, 30)];
+			 T15 = W[58];
+			 T17 = W[59];
+			 T19 = FMA(T15, T16, T17 * T18);
+			 T3t = FNMS(T17, T16, T15 * T18);
+		    }
+		    {
+			 E T1m, T1o, T1l, T1n;
+			 T1m = cr[WS(rs, 22)];
+			 T1o = ci[WS(rs, 22)];
+			 T1l = W[42];
+			 T1n = W[43];
+			 T1p = FMA(T1l, T1m, T1n * T1o);
+			 T3A = FNMS(T1n, T1m, T1l * T1o);
+		    }
+		    {
+			 E T1b, T1d, T1a, T1c;
+			 T1b = cr[WS(rs, 14)];
+			 T1d = ci[WS(rs, 14)];
+			 T1a = W[26];
+			 T1c = W[27];
+			 T1e = FMA(T1a, T1b, T1c * T1d);
+			 T3u = FNMS(T1c, T1b, T1a * T1d);
+		    }
+		    {
+			 E T1h, T1j, T1g, T1i;
+			 T1h = cr[WS(rs, 6)];
+			 T1j = ci[WS(rs, 6)];
+			 T1g = W[10];
+			 T1i = W[11];
+			 T1k = FMA(T1g, T1h, T1i * T1j);
+			 T3z = FNMS(T1i, T1h, T1g * T1j);
+		    }
+		    {
+			 E T1f, T1q, T5Q, T5R;
+			 T1f = T19 + T1e;
+			 T1q = T1k + T1p;
+			 T1r = T1f + T1q;
+			 T5P = T1f - T1q;
+			 T5Q = T3t + T3u;
+			 T5R = T3z + T3A;
+			 T5S = T5Q - T5R;
+			 T6E = T5Q + T5R;
+		    }
+		    {
+			 E T3v, T3w, T3y, T3B;
+			 T3v = T3t - T3u;
+			 T3w = T1k - T1p;
+			 T3x = T3v - T3w;
+			 T52 = T3v + T3w;
+			 T3y = T19 - T1e;
+			 T3B = T3z - T3A;
+			 T3C = T3y + T3B;
+			 T51 = T3y - T3B;
+		    }
+	       }
+	       {
+		    E T1V, T3M, T20, T3N, T3L, T3O, T26, T3Q, T2b, T3R, T3S, T3T;
+		    {
+			 E T1S, T1U, T1R, T1T;
+			 T1S = cr[WS(rs, 5)];
+			 T1U = ci[WS(rs, 5)];
+			 T1R = W[8];
+			 T1T = W[9];
+			 T1V = FMA(T1R, T1S, T1T * T1U);
+			 T3M = FNMS(T1T, T1S, T1R * T1U);
+		    }
+		    {
+			 E T1X, T1Z, T1W, T1Y;
+			 T1X = cr[WS(rs, 21)];
+			 T1Z = ci[WS(rs, 21)];
+			 T1W = W[40];
+			 T1Y = W[41];
+			 T20 = FMA(T1W, T1X, T1Y * T1Z);
+			 T3N = FNMS(T1Y, T1X, T1W * T1Z);
+		    }
+		    T3L = T1V - T20;
+		    T3O = T3M - T3N;
+		    {
+			 E T23, T25, T22, T24;
+			 T23 = cr[WS(rs, 29)];
+			 T25 = ci[WS(rs, 29)];
+			 T22 = W[56];
+			 T24 = W[57];
+			 T26 = FMA(T22, T23, T24 * T25);
+			 T3Q = FNMS(T24, T23, T22 * T25);
+		    }
+		    {
+			 E T28, T2a, T27, T29;
+			 T28 = cr[WS(rs, 13)];
+			 T2a = ci[WS(rs, 13)];
+			 T27 = W[24];
+			 T29 = W[25];
+			 T2b = FMA(T27, T28, T29 * T2a);
+			 T3R = FNMS(T29, T28, T27 * T2a);
+		    }
+		    T3S = T3Q - T3R;
+		    T3T = T26 - T2b;
+		    {
+			 E T21, T2c, T62, T63;
+			 T21 = T1V + T20;
+			 T2c = T26 + T2b;
+			 T2d = T21 + T2c;
+			 T5Z = T21 - T2c;
+			 T62 = T3Q + T3R;
+			 T63 = T3M + T3N;
+			 T64 = T62 - T63;
+			 T6K = T63 + T62;
+		    }
+		    {
+			 E T3P, T3U, T42, T43;
+			 T3P = T3L + T3O;
+			 T3U = T3S - T3T;
+			 T3V = KP707106781 * (T3P - T3U);
+			 T5a = KP707106781 * (T3P + T3U);
+			 T42 = T3T + T3S;
+			 T43 = T3L - T3O;
+			 T44 = KP707106781 * (T42 - T43);
+			 T57 = KP707106781 * (T43 + T42);
+		    }
+	       }
+	       {
+		    E T2G, T4i, T2L, T4j, T4h, T4k, T2R, T4d, T2W, T4e, T4c, T4f;
+		    {
+			 E T2D, T2F, T2C, T2E;
+			 T2D = cr[WS(rs, 3)];
+			 T2F = ci[WS(rs, 3)];
+			 T2C = W[4];
+			 T2E = W[5];
+			 T2G = FMA(T2C, T2D, T2E * T2F);
+			 T4i = FNMS(T2E, T2D, T2C * T2F);
+		    }
+		    {
+			 E T2I, T2K, T2H, T2J;
+			 T2I = cr[WS(rs, 19)];
+			 T2K = ci[WS(rs, 19)];
+			 T2H = W[36];
+			 T2J = W[37];
+			 T2L = FMA(T2H, T2I, T2J * T2K);
+			 T4j = FNMS(T2J, T2I, T2H * T2K);
+		    }
+		    T4h = T2G - T2L;
+		    T4k = T4i - T4j;
+		    {
+			 E T2O, T2Q, T2N, T2P;
+			 T2O = cr[WS(rs, 27)];
+			 T2Q = ci[WS(rs, 27)];
+			 T2N = W[52];
+			 T2P = W[53];
+			 T2R = FMA(T2N, T2O, T2P * T2Q);
+			 T4d = FNMS(T2P, T2O, T2N * T2Q);
+		    }
+		    {
+			 E T2T, T2V, T2S, T2U;
+			 T2T = cr[WS(rs, 11)];
+			 T2V = ci[WS(rs, 11)];
+			 T2S = W[20];
+			 T2U = W[21];
+			 T2W = FMA(T2S, T2T, T2U * T2V);
+			 T4e = FNMS(T2U, T2T, T2S * T2V);
+		    }
+		    T4c = T2R - T2W;
+		    T4f = T4d - T4e;
+		    {
+			 E T2M, T2X, T68, T69;
+			 T2M = T2G + T2L;
+			 T2X = T2R + T2W;
+			 T2Y = T2M + T2X;
+			 T6f = T2M - T2X;
+			 T68 = T4d + T4e;
+			 T69 = T4i + T4j;
+			 T6a = T68 - T69;
+			 T6P = T69 + T68;
+		    }
+		    {
+			 E T4g, T4l, T4t, T4u;
+			 T4g = T4c + T4f;
+			 T4l = T4h - T4k;
+			 T4m = KP707106781 * (T4g - T4l);
+			 T5e = KP707106781 * (T4l + T4g);
+			 T4t = T4h + T4k;
+			 T4u = T4f - T4c;
+			 T4v = KP707106781 * (T4t - T4u);
+			 T5h = KP707106781 * (T4t + T4u);
+		    }
+	       }
+	       {
+		    E T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
+		    {
+			 E TH, T1s, T72, T79;
+			 TH = Tj + TG;
+			 T1s = T14 + T1r;
+			 T1t = TH + T1s;
+			 T6X = TH - T1s;
+			 T72 = T6F + T6E;
+			 T79 = T73 + T78;
+			 T7a = T72 + T79;
+			 T7c = T79 - T72;
+		    }
+		    {
+			 E T2e, T2Z, T6Y, T6Z;
+			 T2e = T1Q + T2d;
+			 T2Z = T2B + T2Y;
+			 T30 = T2e + T2Z;
+			 T7b = T2Z - T2e;
+			 T6Y = T6O + T6P;
+			 T6Z = T6J + T6K;
+			 T70 = T6Y - T6Z;
+			 T71 = T6Z + T6Y;
+		    }
+		    ci[WS(rs, 15)] = T1t - T30;
+		    cr[WS(rs, 24)] = T7b - T7c;
+		    ci[WS(rs, 23)] = T7b + T7c;
+		    cr[0] = T1t + T30;
+		    cr[WS(rs, 8)] = T6X - T70;
+		    cr[WS(rs, 16)] = T71 - T7a;
+		    ci[WS(rs, 31)] = T71 + T7a;
+		    ci[WS(rs, 7)] = T6X + T70;
+	       }
+	       {
+		    E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
+		    E T5n, T4W, T7z;
+		    T4W = KP707106781 * (T4U + T4V);
+		    T4X = T4T - T4W;
+		    T5p = T4T + T4W;
+		    T7z = KP707106781 * (T3a - T3f);
+		    T7D = T7z + T7C;
+		    T7J = T7C - T7z;
+		    {
+			 E T50, T53, T5x, T5y;
+			 T50 = FMA(KP923879532, T4Y, KP382683432 * T4Z);
+			 T53 = FNMS(KP923879532, T52, KP382683432 * T51);
+			 T54 = T50 + T53;
+			 T7y = T50 - T53;
+			 T5x = T5d + T5e;
+			 T5y = T5g + T5h;
+			 T5z = FNMS(KP980785280, T5y, KP195090322 * T5x);
+			 T5D = FMA(KP980785280, T5x, KP195090322 * T5y);
+		    }
+		    {
+			 E T58, T5b, T5q, T5r;
+			 T58 = T56 - T57;
+			 T5b = T59 - T5a;
+			 T5c = FMA(KP831469612, T58, KP555570233 * T5b);
+			 T5m = FNMS(KP831469612, T5b, KP555570233 * T58);
+			 T5q = FNMS(KP382683432, T4Y, KP923879532 * T4Z);
+			 T5r = FMA(KP382683432, T52, KP923879532 * T51);
+			 T5s = T5q + T5r;
+			 T7I = T5r - T5q;
+		    }
+		    {
+			 E T5u, T5v, T5f, T5i;
+			 T5u = T56 + T57;
+			 T5v = T59 + T5a;
+			 T5w = FMA(KP195090322, T5u, KP980785280 * T5v);
+			 T5C = FNMS(KP195090322, T5v, KP980785280 * T5u);
+			 T5f = T5d - T5e;
+			 T5i = T5g - T5h;
+			 T5j = FNMS(KP555570233, T5i, KP831469612 * T5f);
+			 T5n = FMA(KP555570233, T5f, KP831469612 * T5i);
+		    }
+		    {
+			 E T55, T5k, T7H, T7K;
+			 T55 = T4X + T54;
+			 T5k = T5c + T5j;
+			 ci[WS(rs, 12)] = T55 - T5k;
+			 cr[WS(rs, 3)] = T55 + T5k;
+			 T7H = T5n - T5m;
+			 T7K = T7I + T7J;
+			 cr[WS(rs, 19)] = T7H - T7K;
+			 ci[WS(rs, 28)] = T7H + T7K;
+		    }
+		    {
+			 E T7L, T7M, T5l, T5o;
+			 T7L = T5j - T5c;
+			 T7M = T7J - T7I;
+			 cr[WS(rs, 27)] = T7L - T7M;
+			 ci[WS(rs, 20)] = T7L + T7M;
+			 T5l = T4X - T54;
+			 T5o = T5m + T5n;
+			 cr[WS(rs, 11)] = T5l - T5o;
+			 ci[WS(rs, 4)] = T5l + T5o;
+		    }
+		    {
+			 E T5t, T5A, T7x, T7E;
+			 T5t = T5p - T5s;
+			 T5A = T5w + T5z;
+			 ci[WS(rs, 8)] = T5t - T5A;
+			 cr[WS(rs, 7)] = T5t + T5A;
+			 T7x = T5z - T5w;
+			 T7E = T7y + T7D;
+			 cr[WS(rs, 31)] = T7x - T7E;
+			 ci[WS(rs, 16)] = T7x + T7E;
+		    }
+		    {
+			 E T7F, T7G, T5B, T5E;
+			 T7F = T5D - T5C;
+			 T7G = T7D - T7y;
+			 cr[WS(rs, 23)] = T7F - T7G;
+			 ci[WS(rs, 24)] = T7F + T7G;
+			 T5B = T5p + T5s;
+			 T5E = T5C + T5D;
+			 cr[WS(rs, 15)] = T5B - T5E;
+			 ci[0] = T5B + T5E;
+		    }
+	       }
+	       {
+		    E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
+		    {
+			 E T6D, T6G, T7e, T7f;
+			 T6D = Tj - TG;
+			 T6G = T6E - T6F;
+			 T6H = T6D - T6G;
+			 T6T = T6D + T6G;
+			 T7e = T14 - T1r;
+			 T7f = T78 - T73;
+			 T7g = T7e + T7f;
+			 T7i = T7f - T7e;
+		    }
+		    {
+			 E T6I, T6L, T6N, T6Q;
+			 T6I = T1Q - T2d;
+			 T6L = T6J - T6K;
+			 T6M = T6I + T6L;
+			 T6U = T6I - T6L;
+			 T6N = T2B - T2Y;
+			 T6Q = T6O - T6P;
+			 T6R = T6N - T6Q;
+			 T6V = T6N + T6Q;
+		    }
+		    {
+			 E T6S, T7h, T6W, T7d;
+			 T6S = KP707106781 * (T6M + T6R);
+			 ci[WS(rs, 11)] = T6H - T6S;
+			 cr[WS(rs, 4)] = T6H + T6S;
+			 T7h = KP707106781 * (T6V - T6U);
+			 cr[WS(rs, 20)] = T7h - T7i;
+			 ci[WS(rs, 27)] = T7h + T7i;
+			 T6W = KP707106781 * (T6U + T6V);
+			 cr[WS(rs, 12)] = T6T - T6W;
+			 ci[WS(rs, 3)] = T6T + T6W;
+			 T7d = KP707106781 * (T6R - T6M);
+			 cr[WS(rs, 28)] = T7d - T7g;
+			 ci[WS(rs, 19)] = T7d + T7g;
+		    }
+	       }
+	       {
+		    E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
+		    E T6l;
+		    {
+			 E T5O, T5T, T60, T65;
+			 T5J = T5F - T5I;
+			 T7n = T7l + T7m;
+			 T7t = T7m - T7l;
+			 T6n = T5F + T5I;
+			 T5O = T5K + T5N;
+			 T5T = T5P - T5S;
+			 T5U = KP707106781 * (T5O + T5T);
+			 T7k = KP707106781 * (T5O - T5T);
+			 {
+			      E T6v, T6w, T6o, T6p;
+			      T6v = T6e + T6f;
+			      T6w = T67 + T6a;
+			      T6x = FMA(KP382683432, T6v, KP923879532 * T6w);
+			      T6B = FNMS(KP923879532, T6v, KP382683432 * T6w);
+			      T6o = T5K - T5N;
+			      T6p = T5P + T5S;
+			      T6q = KP707106781 * (T6o + T6p);
+			      T7s = KP707106781 * (T6p - T6o);
+			 }
+			 T60 = T5Y - T5Z;
+			 T65 = T61 - T64;
+			 T66 = FMA(KP382683432, T60, KP923879532 * T65);
+			 T6k = FNMS(KP923879532, T60, KP382683432 * T65);
+			 {
+			      E T6s, T6t, T6b, T6g;
+			      T6s = T61 + T64;
+			      T6t = T5Y + T5Z;
+			      T6u = FNMS(KP382683432, T6t, KP923879532 * T6s);
+			      T6A = FMA(KP923879532, T6t, KP382683432 * T6s);
+			      T6b = T67 - T6a;
+			      T6g = T6e - T6f;
+			      T6h = FNMS(KP382683432, T6g, KP923879532 * T6b);
+			      T6l = FMA(KP923879532, T6g, KP382683432 * T6b);
+			 }
+		    }
+		    {
+			 E T5V, T6i, T7r, T7u;
+			 T5V = T5J + T5U;
+			 T6i = T66 + T6h;
+			 ci[WS(rs, 13)] = T5V - T6i;
+			 cr[WS(rs, 2)] = T5V + T6i;
+			 T7r = T6l - T6k;
+			 T7u = T7s + T7t;
+			 cr[WS(rs, 18)] = T7r - T7u;
+			 ci[WS(rs, 29)] = T7r + T7u;
+		    }
+		    {
+			 E T7v, T7w, T6j, T6m;
+			 T7v = T6h - T66;
+			 T7w = T7t - T7s;
+			 cr[WS(rs, 26)] = T7v - T7w;
+			 ci[WS(rs, 21)] = T7v + T7w;
+			 T6j = T5J - T5U;
+			 T6m = T6k + T6l;
+			 cr[WS(rs, 10)] = T6j - T6m;
+			 ci[WS(rs, 5)] = T6j + T6m;
+		    }
+		    {
+			 E T6r, T6y, T7j, T7o;
+			 T6r = T6n + T6q;
+			 T6y = T6u + T6x;
+			 cr[WS(rs, 14)] = T6r - T6y;
+			 ci[WS(rs, 1)] = T6r + T6y;
+			 T7j = T6B - T6A;
+			 T7o = T7k + T7n;
+			 cr[WS(rs, 30)] = T7j - T7o;
+			 ci[WS(rs, 17)] = T7j + T7o;
+		    }
+		    {
+			 E T7p, T7q, T6z, T6C;
+			 T7p = T6x - T6u;
+			 T7q = T7n - T7k;
+			 cr[WS(rs, 22)] = T7p - T7q;
+			 ci[WS(rs, 25)] = T7p + T7q;
+			 T6z = T6n - T6q;
+			 T6C = T6A + T6B;
+			 ci[WS(rs, 9)] = T6z - T6C;
+			 cr[WS(rs, 6)] = T6z + T6C;
+		    }
+	       }
+	       {
+		    E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
+		    E T4B, T3g, T7P;
+		    T3g = KP707106781 * (T3a + T3f);
+		    T3h = T35 - T3g;
+		    T4D = T35 + T3g;
+		    T7P = KP707106781 * (T4V - T4U);
+		    T7R = T7P + T7Q;
+		    T7X = T7Q - T7P;
+		    {
+			 E T3s, T3D, T4L, T4M;
+			 T3s = FNMS(KP923879532, T3r, KP382683432 * T3m);
+			 T3D = FMA(KP923879532, T3x, KP382683432 * T3C);
+			 T3E = T3s + T3D;
+			 T7O = T3D - T3s;
+			 T4L = T4s + T4v;
+			 T4M = T4b + T4m;
+			 T4N = FNMS(KP195090322, T4M, KP980785280 * T4L);
+			 T4R = FMA(KP980785280, T4M, KP195090322 * T4L);
+		    }
+		    {
+			 E T3W, T45, T4E, T4F;
+			 T3W = T3K - T3V;
+			 T45 = T41 - T44;
+			 T46 = FNMS(KP555570233, T45, KP831469612 * T3W);
+			 T4A = FMA(KP831469612, T45, KP555570233 * T3W);
+			 T4E = FMA(KP382683432, T3r, KP923879532 * T3m);
+			 T4F = FNMS(KP382683432, T3x, KP923879532 * T3C);
+			 T4G = T4E + T4F;
+			 T7W = T4E - T4F;
+		    }
+		    {
+			 E T4I, T4J, T4n, T4w;
+			 T4I = T41 + T44;
+			 T4J = T3K + T3V;
+			 T4K = FMA(KP195090322, T4I, KP980785280 * T4J);
+			 T4Q = FNMS(KP980785280, T4I, KP195090322 * T4J);
+			 T4n = T4b - T4m;
+			 T4w = T4s - T4v;
+			 T4x = FMA(KP555570233, T4n, KP831469612 * T4w);
+			 T4B = FNMS(KP831469612, T4n, KP555570233 * T4w);
+		    }
+		    {
+			 E T3F, T4y, T7V, T7Y;
+			 T3F = T3h + T3E;
+			 T4y = T46 + T4x;
+			 cr[WS(rs, 13)] = T3F - T4y;
+			 ci[WS(rs, 2)] = T3F + T4y;
+			 T7V = T4B - T4A;
+			 T7Y = T7W + T7X;
+			 cr[WS(rs, 29)] = T7V - T7Y;
+			 ci[WS(rs, 18)] = T7V + T7Y;
+		    }
+		    {
+			 E T7Z, T80, T4z, T4C;
+			 T7Z = T4x - T46;
+			 T80 = T7X - T7W;
+			 cr[WS(rs, 21)] = T7Z - T80;
+			 ci[WS(rs, 26)] = T7Z + T80;
+			 T4z = T3h - T3E;
+			 T4C = T4A + T4B;
+			 ci[WS(rs, 10)] = T4z - T4C;
+			 cr[WS(rs, 5)] = T4z + T4C;
+		    }
+		    {
+			 E T4H, T4O, T7N, T7S;
+			 T4H = T4D + T4G;
+			 T4O = T4K + T4N;
+			 ci[WS(rs, 14)] = T4H - T4O;
+			 cr[WS(rs, 1)] = T4H + T4O;
+			 T7N = T4R - T4Q;
+			 T7S = T7O + T7R;
+			 cr[WS(rs, 17)] = T7N - T7S;
+			 ci[WS(rs, 30)] = T7N + T7S;
+		    }
+		    {
+			 E T7T, T7U, T4P, T4S;
+			 T7T = T4N - T4K;
+			 T7U = T7R - T7O;
+			 cr[WS(rs, 25)] = T7T - T7U;
+			 ci[WS(rs, 22)] = T7T + T7U;
+			 T4P = T4D - T4G;
+			 T4S = T4Q + T4R;
+			 cr[WS(rs, 9)] = T4P - T4S;
+			 ci[WS(rs, 6)] = T4P + T4S;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 32},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 32, "hf_32", twinstr, &GENUS, {340, 114, 94, 0} };
+
+void X(codelet_hf_32) (planner *p) {
+     X(khc2hc_register) (p, hf_32, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include hf.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 31 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hf.h"
+
+static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E To, Te, Tm, T8, Ty, Tw, Tq, Tk;
+	       {
+		    E T1, Tv, Tu, T7, Tg, Tj, Tf, Ti, Tp, Th;
+		    T1 = cr[0];
+		    Tv = ci[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = cr[WS(rs, 2)];
+			 T6 = ci[WS(rs, 2)];
+			 T2 = W[2];
+			 T5 = W[3];
+			 {
+			      E Ta, Td, Tc, Tn, Tb, Tt, T4, T9;
+			      Ta = cr[WS(rs, 1)];
+			      Td = ci[WS(rs, 1)];
+			      Tt = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[0];
+			      Tc = W[1];
+			      Tu = FNMS(T5, T3, Tt);
+			      T7 = FMA(T5, T6, T4);
+			      Tn = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tg = cr[WS(rs, 3)];
+			      Tj = ci[WS(rs, 3)];
+			      To = FNMS(Tc, Ta, Tn);
+			      Te = FMA(Tc, Td, Tb);
+			      Tf = W[4];
+			      Ti = W[5];
+			 }
+		    }
+		    Tm = T1 - T7;
+		    T8 = T1 + T7;
+		    Tp = Tf * Tj;
+		    Th = Tf * Tg;
+		    Ty = Tv - Tu;
+		    Tw = Tu + Tv;
+		    Tq = FNMS(Ti, Tg, Tp);
+		    Tk = FMA(Ti, Tj, Th);
+	       }
+	       {
+		    E Tr, Ts, Tl, Tx;
+		    Tr = To - Tq;
+		    Ts = To + Tq;
+		    Tl = Te + Tk;
+		    Tx = Tk - Te;
+		    ci[WS(rs, 3)] = Ts + Tw;
+		    cr[WS(rs, 2)] = Ts - Tw;
+		    cr[WS(rs, 1)] = Tm + Tr;
+		    ci[0] = Tm - Tr;
+		    ci[WS(rs, 2)] = Tx + Ty;
+		    cr[WS(rs, 3)] = Tx - Ty;
+		    cr[0] = T8 + Tl;
+		    ci[WS(rs, 1)] = T8 - Tl;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hf_4) (planner *p) {
+     X(khc2hc_register) (p, hf_4, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include hf.h */
+
+/*
+ * This function contains 22 FP additions, 12 FP multiplications,
+ * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
+ * 13 stack variables, 0 constants, and 16 memory accesses
+ */
+#include "hf.h"
+
+static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
+	       E T1, Tp, T6, To, Tc, Tk, Th, Tl;
+	       T1 = cr[0];
+	       Tp = ci[0];
+	       {
+		    E T3, T5, T2, T4;
+		    T3 = cr[WS(rs, 2)];
+		    T5 = ci[WS(rs, 2)];
+		    T2 = W[2];
+		    T4 = W[3];
+		    T6 = FMA(T2, T3, T4 * T5);
+		    To = FNMS(T4, T3, T2 * T5);
+	       }
+	       {
+		    E T9, Tb, T8, Ta;
+		    T9 = cr[WS(rs, 1)];
+		    Tb = ci[WS(rs, 1)];
+		    T8 = W[0];
+		    Ta = W[1];
+		    Tc = FMA(T8, T9, Ta * Tb);
+		    Tk = FNMS(Ta, T9, T8 * Tb);
+	       }
+	       {
+		    E Te, Tg, Td, Tf;
+		    Te = cr[WS(rs, 3)];
+		    Tg = ci[WS(rs, 3)];
+		    Td = W[4];
+		    Tf = W[5];
+		    Th = FMA(Td, Te, Tf * Tg);
+		    Tl = FNMS(Tf, Te, Td * Tg);
+	       }
+	       {
+		    E T7, Ti, Tj, Tm;
+		    T7 = T1 + T6;
+		    Ti = Tc + Th;
+		    ci[WS(rs, 1)] = T7 - Ti;
+		    cr[0] = T7 + Ti;
+		    Tj = T1 - T6;
+		    Tm = Tk - Tl;
+		    ci[0] = Tj - Tm;
+		    cr[WS(rs, 1)] = Tj + Tm;
+	       }
+	       {
+		    E Tn, Tq, Tr, Ts;
+		    Tn = Tk + Tl;
+		    Tq = To + Tp;
+		    cr[WS(rs, 2)] = Tn - Tq;
+		    ci[WS(rs, 3)] = Tn + Tq;
+		    Tr = Th - Tc;
+		    Ts = Tp - To;
+		    cr[WS(rs, 3)] = Tr - Ts;
+		    ci[WS(rs, 2)] = Tr + Ts;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 4},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, {16, 6, 6, 0} };
+
+void X(codelet_hf_4) (planner *p) {
+     X(khc2hc_register) (p, hf_4, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include hf.h */
+
+/*
+ * This function contains 40 FP additions, 34 FP multiplications,
+ * (or, 14 additions, 8 multiplications, 26 fused multiply/add),
+ * 43 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hf.h"
+
+static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T1, TJ, TK, TA, TR, Te, TC, Tk, TE, Tq;
+	       {
+		    E Tg, Tj, Tm, TB, Th, Tp, Tl, Ti, To, TD, Tn;
+		    T1 = cr[0];
+		    TJ = ci[0];
+		    {
+			 E T9, Tc, Ty, Ta, Tb, Tx, T7, Tf, Tz, Td;
+			 {
+			      E T3, T6, T8, Tw, T4, T2, T5;
+			      T3 = cr[WS(rs, 1)];
+			      T6 = ci[WS(rs, 1)];
+			      T2 = W[0];
+			      T9 = cr[WS(rs, 4)];
+			      Tc = ci[WS(rs, 4)];
+			      T8 = W[6];
+			      Tw = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[1];
+			      Ty = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[7];
+			      Tx = FNMS(T5, T3, Tw);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 Tg = cr[WS(rs, 2)];
+			 Tz = FNMS(Tb, T9, Ty);
+			 Td = FMA(Tb, Tc, Ta);
+			 Tj = ci[WS(rs, 2)];
+			 Tf = W[2];
+			 TK = Tx + Tz;
+			 TA = Tx - Tz;
+			 TR = Td - T7;
+			 Te = T7 + Td;
+			 Tm = cr[WS(rs, 3)];
+			 TB = Tf * Tj;
+			 Th = Tf * Tg;
+			 Tp = ci[WS(rs, 3)];
+			 Tl = W[4];
+			 Ti = W[3];
+			 To = W[5];
+		    }
+		    TD = Tl * Tp;
+		    Tn = Tl * Tm;
+		    TC = FNMS(Ti, Tg, TB);
+		    Tk = FMA(Ti, Tj, Th);
+		    TE = FNMS(To, Tm, TD);
+		    Tq = FMA(To, Tp, Tn);
+	       }
+	       {
+		    E TG, TI, TO, TS, TU, Tu, TN, Tt, TL, TF;
+		    TL = TC + TE;
+		    TF = TC - TE;
+		    {
+			 E Tr, TQ, TM, Ts;
+			 Tr = Tk + Tq;
+			 TQ = Tk - Tq;
+			 TG = FMA(KP618033988, TF, TA);
+			 TI = FNMS(KP618033988, TA, TF);
+			 TO = TK - TL;
+			 TM = TK + TL;
+			 TS = FMA(KP618033988, TR, TQ);
+			 TU = FNMS(KP618033988, TQ, TR);
+			 Tu = Te - Tr;
+			 Ts = Te + Tr;
+			 ci[WS(rs, 4)] = TM + TJ;
+			 TN = FNMS(KP250000000, TM, TJ);
+			 cr[0] = T1 + Ts;
+			 Tt = FNMS(KP250000000, Ts, T1);
+		    }
+		    {
+			 E TT, TP, Tv, TH;
+			 TT = FMA(KP559016994, TO, TN);
+			 TP = FNMS(KP559016994, TO, TN);
+			 Tv = FMA(KP559016994, Tu, Tt);
+			 TH = FNMS(KP559016994, Tu, Tt);
+			 ci[WS(rs, 2)] = FMA(KP951056516, TS, TP);
+			 cr[WS(rs, 3)] = FMS(KP951056516, TS, TP);
+			 ci[WS(rs, 3)] = FMA(KP951056516, TU, TT);
+			 cr[WS(rs, 4)] = FMS(KP951056516, TU, TT);
+			 ci[WS(rs, 1)] = FMA(KP951056516, TI, TH);
+			 cr[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
+			 cr[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
+			 ci[0] = FNMS(KP951056516, TG, Tv);
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hf_5", twinstr, &GENUS, {14, 8, 26, 0} };
+
+void X(codelet_hf_5) (planner *p) {
+     X(khc2hc_register) (p, hf_5, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include hf.h */
+
+/*
+ * This function contains 40 FP additions, 28 FP multiplications,
+ * (or, 26 additions, 14 multiplications, 14 fused multiply/add),
+ * 29 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hf.h"
+
+static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
+	       E T1, TE, Tu, Tx, TC, TB, TF, TG, TH, Tc, Tn, To;
+	       T1 = cr[0];
+	       TE = ci[0];
+	       {
+		    E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 1)];
+			 T5 = ci[WS(rs, 1)];
+			 T2 = W[0];
+			 T4 = W[1];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 Ts = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = cr[WS(rs, 3)];
+			 Tl = ci[WS(rs, 3)];
+			 Ti = W[4];
+			 Tk = W[5];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 Tw = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 4)];
+			 Ta = ci[WS(rs, 4)];
+			 T7 = W[6];
+			 T9 = W[7];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 Tt = FNMS(T9, T8, T7 * Ta);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 2)];
+			 Tg = ci[WS(rs, 2)];
+			 Td = W[2];
+			 Tf = W[3];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 Tv = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Tu = Ts - Tt;
+		    Tx = Tv - Tw;
+		    TC = Th - Tm;
+		    TB = Tb - T6;
+		    TF = Ts + Tt;
+		    TG = Tv + Tw;
+		    TH = TF + TG;
+		    Tc = T6 + Tb;
+		    Tn = Th + Tm;
+		    To = Tc + Tn;
+	       }
+	       cr[0] = T1 + To;
+	       {
+		    E Ty, TA, Tr, Tz, Tp, Tq;
+		    Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
+		    TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
+		    Tp = KP559016994 * (Tc - Tn);
+		    Tq = FNMS(KP250000000, To, T1);
+		    Tr = Tp + Tq;
+		    Tz = Tq - Tp;
+		    ci[0] = Tr - Ty;
+		    ci[WS(rs, 1)] = Tz + TA;
+		    cr[WS(rs, 1)] = Tr + Ty;
+		    cr[WS(rs, 2)] = Tz - TA;
+	       }
+	       ci[WS(rs, 4)] = TH + TE;
+	       {
+		    E TD, TL, TK, TM, TI, TJ;
+		    TD = FMA(KP587785252, TB, KP951056516 * TC);
+		    TL = FNMS(KP587785252, TC, KP951056516 * TB);
+		    TI = FNMS(KP250000000, TH, TE);
+		    TJ = KP559016994 * (TF - TG);
+		    TK = TI - TJ;
+		    TM = TJ + TI;
+		    cr[WS(rs, 3)] = TD - TK;
+		    ci[WS(rs, 3)] = TL + TM;
+		    ci[WS(rs, 2)] = TD + TK;
+		    cr[WS(rs, 4)] = TL - TM;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 5},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 5, "hf_5", twinstr, &GENUS, {26, 14, 14, 0} };
+
+void X(codelet_hf_5) (planner *p) {
+     X(khc2hc_register) (p, hf_5, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include hf.h */
+
+/*
+ * This function contains 46 FP additions, 32 FP multiplications,
+ * (or, 24 additions, 10 multiplications, 22 fused multiply/add),
+ * 47 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hf.h"
+
+static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E T11, T12, T14, T13;
+	       {
+		    E T1, TV, TX, T7, Tn, Tq, TO, TR, TB, Tl, To, TH, Tt, Tw, Ts;
+		    E Tp, Tv;
+		    T1 = cr[0];
+		    TV = ci[0];
+		    {
+			 E T3, T6, T2, T5;
+			 T3 = cr[WS(rs, 3)];
+			 T6 = ci[WS(rs, 3)];
+			 T2 = W[4];
+			 T5 = W[5];
+			 {
+			      E Ta, Td, Tg, TM, Tb, Tj, Tf, Tc, Ti, TW, T4, T9;
+			      Ta = cr[WS(rs, 2)];
+			      Td = ci[WS(rs, 2)];
+			      TW = T2 * T6;
+			      T4 = T2 * T3;
+			      T9 = W[2];
+			      Tg = cr[WS(rs, 5)];
+			      TX = FNMS(T5, T3, TW);
+			      T7 = FMA(T5, T6, T4);
+			      TM = T9 * Td;
+			      Tb = T9 * Ta;
+			      Tj = ci[WS(rs, 5)];
+			      Tf = W[8];
+			      Tc = W[3];
+			      Ti = W[9];
+			      {
+				   E TN, Te, TL, Tk, TK, Th, Tm;
+				   Tn = cr[WS(rs, 4)];
+				   TK = Tf * Tj;
+				   Th = Tf * Tg;
+				   TN = FNMS(Tc, Ta, TM);
+				   Te = FMA(Tc, Td, Tb);
+				   TL = FNMS(Ti, Tg, TK);
+				   Tk = FMA(Ti, Tj, Th);
+				   Tq = ci[WS(rs, 4)];
+				   Tm = W[6];
+				   TO = TL - TN;
+				   TR = TN + TL;
+				   TB = Te + Tk;
+				   Tl = Te - Tk;
+				   To = Tm * Tn;
+				   TH = Tm * Tq;
+			      }
+			      Tt = cr[WS(rs, 1)];
+			      Tw = ci[WS(rs, 1)];
+			      Ts = W[0];
+			      Tp = W[7];
+			      Tv = W[1];
+			 }
+		    }
+		    {
+			 E TA, T8, TI, Tr, TG, Tx, TF, Tu;
+			 TA = T1 + T7;
+			 T8 = T1 - T7;
+			 TF = Ts * Tw;
+			 Tu = Ts * Tt;
+			 TI = FNMS(Tp, Tn, TH);
+			 Tr = FMA(Tp, Tq, To);
+			 TG = FNMS(Tv, Tt, TF);
+			 Tx = FMA(Tv, Tw, Tu);
+			 {
+			      E TY, TU, TP, TT, TD, T10, Tz, TZ, TQ, TE;
+			      T11 = TX + TV;
+			      TY = TV - TX;
+			      {
+				   E TJ, TS, TC, Ty;
+				   TJ = TG - TI;
+				   TS = TI + TG;
+				   TC = Tr + Tx;
+				   Ty = Tr - Tx;
+				   TU = TO + TJ;
+				   TP = TJ - TO;
+				   TT = TR - TS;
+				   T12 = TR + TS;
+				   T14 = TB - TC;
+				   TD = TB + TC;
+				   T10 = Ty - Tl;
+				   Tz = Tl + Ty;
+				   TZ = FMA(KP500000000, TU, TY);
+			      }
+			      cr[0] = TA + TD;
+			      TQ = FNMS(KP500000000, TD, TA);
+			      ci[WS(rs, 2)] = T8 + Tz;
+			      TE = FNMS(KP500000000, Tz, T8);
+			      cr[WS(rs, 3)] = TU - TY;
+			      cr[WS(rs, 2)] = FNMS(KP866025403, TT, TQ);
+			      ci[WS(rs, 1)] = FMA(KP866025403, TT, TQ);
+			      ci[0] = FNMS(KP866025403, TP, TE);
+			      cr[WS(rs, 1)] = FMA(KP866025403, TP, TE);
+			      ci[WS(rs, 4)] = FMA(KP866025403, T10, TZ);
+			      cr[WS(rs, 5)] = FMS(KP866025403, T10, TZ);
+			 }
+		    }
+	       }
+	       ci[WS(rs, 5)] = T12 + T11;
+	       T13 = FNMS(KP500000000, T12, T11);
+	       ci[WS(rs, 3)] = FMA(KP866025403, T14, T13);
+	       cr[WS(rs, 4)] = FMS(KP866025403, T14, T13);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 6, "hf_6", twinstr, &GENUS, {24, 10, 22, 0} };
+
+void X(codelet_hf_6) (planner *p) {
+     X(khc2hc_register) (p, hf_6, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include hf.h */
+
+/*
+ * This function contains 46 FP additions, 28 FP multiplications,
+ * (or, 32 additions, 14 multiplications, 14 fused multiply/add),
+ * 23 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hf.h"
+
+static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
+	       E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
+	       {
+		    E T1, TM, T6, TN;
+		    T1 = cr[0];
+		    TM = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 3)];
+			 T5 = ci[WS(rs, 3)];
+			 T2 = W[4];
+			 T4 = W[5];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TN = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 - T6;
+		    TS = TN + TM;
+		    Tv = T1 + T6;
+		    TO = TM - TN;
+	       }
+	       {
+		    E Tn, TE, Ts, TD;
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = cr[WS(rs, 4)];
+			 Tm = ci[WS(rs, 4)];
+			 Tj = W[6];
+			 Tl = W[7];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 TE = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 1)];
+			 Tr = ci[WS(rs, 1)];
+			 To = W[0];
+			 Tq = W[1];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TD = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn - Ts;
+		    TJ = TE + TD;
+		    Tx = Tn + Ts;
+		    TF = TD - TE;
+	       }
+	       {
+		    E Tc, TA, Th, TB;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 2)];
+			 Tb = ci[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TA = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 5)];
+			 Tg = ci[WS(rs, 5)];
+			 Td = W[8];
+			 Tf = W[9];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TB = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc - Th;
+		    TI = TA + TB;
+		    Tw = Tc + Th;
+		    TC = TA - TB;
+	       }
+	       {
+		    E TG, Tu, Tz, TK, Ty, TH;
+		    TG = KP866025403 * (TC + TF);
+		    Tu = Ti + Tt;
+		    Tz = FNMS(KP500000000, Tu, T7);
+		    ci[WS(rs, 2)] = T7 + Tu;
+		    cr[WS(rs, 1)] = Tz + TG;
+		    ci[0] = Tz - TG;
+		    TK = KP866025403 * (TI - TJ);
+		    Ty = Tw + Tx;
+		    TH = FNMS(KP500000000, Ty, Tv);
+		    cr[0] = Tv + Ty;
+		    ci[WS(rs, 1)] = TH + TK;
+		    cr[WS(rs, 2)] = TH - TK;
+	       }
+	       {
+		    E TP, TL, TQ, TR, TT, TU;
+		    TP = KP866025403 * (Tt - Ti);
+		    TL = TF - TC;
+		    TQ = FMA(KP500000000, TL, TO);
+		    cr[WS(rs, 3)] = TL - TO;
+		    ci[WS(rs, 4)] = TP + TQ;
+		    cr[WS(rs, 5)] = TP - TQ;
+		    TR = KP866025403 * (Tw - Tx);
+		    TT = TI + TJ;
+		    TU = FNMS(KP500000000, TT, TS);
+		    cr[WS(rs, 4)] = TR - TU;
+		    ci[WS(rs, 5)] = TT + TS;
+		    ci[WS(rs, 3)] = TR + TU;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 6},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 6, "hf_6", twinstr, &GENUS, {32, 14, 14, 0} };
+
+void X(codelet_hf_6) (planner *p) {
+     X(khc2hc_register) (p, hf_6, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3948 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:55 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -dit -name hf_64 -include hf.h */
+
+/*
+ * This function contains 1038 FP additions, 644 FP multiplications,
+ * (or, 520 additions, 126 multiplications, 518 fused multiply/add),
+ * 246 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "hf.h"
+
+static void hf_64(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 126); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tku, Tky, Tkt, Tkx;
+	       {
+		    E TiV, Tjm, T7e, TcA, TjR, Tkl, Tm, TeM, T7Q, TcI, TeZ, Thr, T1G, TeW, TcJ;
+		    E T7X, T87, TcN, Tf5, Thw, T29, Tf8, TcQ, T8u, Taq, Tdm, Tgc, ThX, T5K, TfS;
+		    E Tdx, Tbj, TcB, T7l, TiP, TeP, Tjl, TN, TcC, T7s, T7B, TcF, TeU, Ths, T7I;
+		    E TcG, T1f, TeR, T8G, TcU, Tfg, ThB, T32, Tfj, TcX, T93, T9h, Td3, TfK, ThM;
+		    E T3X, Tfr, Tde, Taa, Thx, Tfb, Tf6, T2A, T8x, TcO, T8m, TcR, Tfm, ThC, T3t;
+		    E Tfh, T96, TcV, T8V, TcY, ThN, Tfu, TfL, T4o, Tad, Td4, T9w, Tdf, TfV, ThY;
+		    E T6b, Tg9, Tbm, Tdn, TaF, Tdy, ThJ, T4Q, TfN, TfA, Taf, T9M, Td8, Tdh, ThI;
+		    E T5h, TfO, TfF, Tag, Ta1, Tdb, Tdi, ThU, T6D, Tgf, Tg1, Tbo, TaV, Tdr, TdA;
+		    E Tb2, Tds, Tg5, ThT, Tg2, T74, Tdt, Tb9;
+		    {
+			 E T7a, Te, T78, T8, TjQ, TiU, T7c, Tk;
+			 {
+			      E T1, TiT, TiS, T7, Tg, Tj, Tf, Ti, T7b, Th;
+			      T1 = cr[0];
+			      TiT = ci[0];
+			      {
+				   E T3, T6, T2, T5;
+				   T3 = cr[WS(rs, 32)];
+				   T6 = ci[WS(rs, 32)];
+				   T2 = W[62];
+				   T5 = W[63];
+				   {
+					E Ta, Td, Tc, T79, Tb, TiR, T4, T9;
+					Ta = cr[WS(rs, 16)];
+					Td = ci[WS(rs, 16)];
+					TiR = T2 * T6;
+					T4 = T2 * T3;
+					T9 = W[30];
+					Tc = W[31];
+					TiS = FNMS(T5, T3, TiR);
+					T7 = FMA(T5, T6, T4);
+					T79 = T9 * Td;
+					Tb = T9 * Ta;
+					Tg = cr[WS(rs, 48)];
+					Tj = ci[WS(rs, 48)];
+					T7a = FNMS(Tc, Ta, T79);
+					Te = FMA(Tc, Td, Tb);
+					Tf = W[94];
+					Ti = W[95];
+				   }
+			      }
+			      T78 = T1 - T7;
+			      T8 = T1 + T7;
+			      TjQ = TiT - TiS;
+			      TiU = TiS + TiT;
+			      T7b = Tf * Tj;
+			      Th = Tf * Tg;
+			      T7c = FNMS(Ti, Tg, T7b);
+			      Tk = FMA(Ti, Tj, Th);
+			 }
+			 {
+			      E T7S, T1l, T7O, T1E, T1u, T1x, T1w, T7U, T1r, T7L, T1v;
+			      {
+				   E T1A, T1D, T1C, T7N, T1B;
+				   {
+					E T1h, T1k, T1g, T1j, T7R, T1i, T1z;
+					T1h = cr[WS(rs, 60)];
+					T1k = ci[WS(rs, 60)];
+					{
+					     E T7d, TiQ, Tl, TjP;
+					     T7d = T7a - T7c;
+					     TiQ = T7a + T7c;
+					     Tl = Te + Tk;
+					     TjP = Te - Tk;
+					     TiV = TiQ + TiU;
+					     Tjm = TiU - TiQ;
+					     T7e = T78 - T7d;
+					     TcA = T78 + T7d;
+					     TjR = TjP + TjQ;
+					     Tkl = TjQ - TjP;
+					     Tm = T8 + Tl;
+					     TeM = T8 - Tl;
+					     T1g = W[118];
+					}
+					T1j = W[119];
+					T1A = cr[WS(rs, 44)];
+					T1D = ci[WS(rs, 44)];
+					T7R = T1g * T1k;
+					T1i = T1g * T1h;
+					T1z = W[86];
+					T1C = W[87];
+					T7S = FNMS(T1j, T1h, T7R);
+					T1l = FMA(T1j, T1k, T1i);
+					T7N = T1z * T1D;
+					T1B = T1z * T1A;
+				   }
+				   {
+					E T1n, T1q, T1m, T1p, T7T, T1o, T1t;
+					T1n = cr[WS(rs, 28)];
+					T1q = ci[WS(rs, 28)];
+					T7O = FNMS(T1C, T1A, T7N);
+					T1E = FMA(T1C, T1D, T1B);
+					T1m = W[54];
+					T1p = W[55];
+					T1u = cr[WS(rs, 12)];
+					T1x = ci[WS(rs, 12)];
+					T7T = T1m * T1q;
+					T1o = T1m * T1n;
+					T1t = W[22];
+					T1w = W[23];
+					T7U = FNMS(T1p, T1n, T7T);
+					T1r = FMA(T1p, T1q, T1o);
+					T7L = T1t * T1x;
+					T1v = T1t * T1u;
+				   }
+			      }
+			      {
+				   E T7V, TeX, T1s, T7K, T7M, T1y;
+				   T7V = T7S - T7U;
+				   TeX = T7S + T7U;
+				   T1s = T1l + T1r;
+				   T7K = T1l - T1r;
+				   T7M = FNMS(T1w, T1u, T7L);
+				   T1y = FMA(T1w, T1x, T1v);
+				   {
+					E TeY, T7P, T7W, T1F;
+					TeY = T7M + T7O;
+					T7P = T7M - T7O;
+					T7W = T1y - T1E;
+					T1F = T1y + T1E;
+					T7Q = T7K - T7P;
+					TcI = T7K + T7P;
+					TeZ = TeX - TeY;
+					Thr = TeX + TeY;
+					T1G = T1s + T1F;
+					TeW = T1s - T1F;
+					TcJ = T7V - T7W;
+					T7X = T7V + T7W;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T8p, T1O, T85, T27, T1X, T20, T1Z, T8r, T1U, T82, T1Y;
+			 {
+			      E T23, T26, T25, T84, T24;
+			      {
+				   E T1K, T1N, T1J, T1M, T8o, T1L, T22;
+				   T1K = cr[WS(rs, 2)];
+				   T1N = ci[WS(rs, 2)];
+				   T1J = W[2];
+				   T1M = W[3];
+				   T23 = cr[WS(rs, 50)];
+				   T26 = ci[WS(rs, 50)];
+				   T8o = T1J * T1N;
+				   T1L = T1J * T1K;
+				   T22 = W[98];
+				   T25 = W[99];
+				   T8p = FNMS(T1M, T1K, T8o);
+				   T1O = FMA(T1M, T1N, T1L);
+				   T84 = T22 * T26;
+				   T24 = T22 * T23;
+			      }
+			      {
+				   E T1Q, T1T, T1P, T1S, T8q, T1R, T1W;
+				   T1Q = cr[WS(rs, 34)];
+				   T1T = ci[WS(rs, 34)];
+				   T85 = FNMS(T25, T23, T84);
+				   T27 = FMA(T25, T26, T24);
+				   T1P = W[66];
+				   T1S = W[67];
+				   T1X = cr[WS(rs, 18)];
+				   T20 = ci[WS(rs, 18)];
+				   T8q = T1P * T1T;
+				   T1R = T1P * T1Q;
+				   T1W = W[34];
+				   T1Z = W[35];
+				   T8r = FNMS(T1S, T1Q, T8q);
+				   T1U = FMA(T1S, T1T, T1R);
+				   T82 = T1W * T20;
+				   T1Y = T1W * T1X;
+			      }
+			 }
+			 {
+			      E T8s, Tf3, T1V, T81, T83, T21;
+			      T8s = T8p - T8r;
+			      Tf3 = T8p + T8r;
+			      T1V = T1O + T1U;
+			      T81 = T1O - T1U;
+			      T83 = FNMS(T1Z, T1X, T82);
+			      T21 = FMA(T1Z, T20, T1Y);
+			      {
+				   E Tf4, T86, T8t, T28;
+				   Tf4 = T83 + T85;
+				   T86 = T83 - T85;
+				   T8t = T21 - T27;
+				   T28 = T21 + T27;
+				   T87 = T81 - T86;
+				   TcN = T81 + T86;
+				   Tf5 = Tf3 - Tf4;
+				   Thw = Tf3 + Tf4;
+				   T29 = T1V + T28;
+				   Tf8 = T1V - T28;
+				   TcQ = T8s - T8t;
+				   T8u = T8s + T8t;
+			      }
+			 }
+		    }
+		    {
+			 E Tbf, T5p, Tao, T5I, T5y, T5B, T5A, Tbh, T5v, Tal, T5z;
+			 {
+			      E T5E, T5H, T5G, Tan, T5F;
+			      {
+				   E T5l, T5o, T5k, T5n, Tbe, T5m, T5D;
+				   T5l = cr[WS(rs, 63)];
+				   T5o = ci[WS(rs, 63)];
+				   T5k = W[124];
+				   T5n = W[125];
+				   T5E = cr[WS(rs, 47)];
+				   T5H = ci[WS(rs, 47)];
+				   Tbe = T5k * T5o;
+				   T5m = T5k * T5l;
+				   T5D = W[92];
+				   T5G = W[93];
+				   Tbf = FNMS(T5n, T5l, Tbe);
+				   T5p = FMA(T5n, T5o, T5m);
+				   Tan = T5D * T5H;
+				   T5F = T5D * T5E;
+			      }
+			      {
+				   E T5r, T5u, T5q, T5t, Tbg, T5s, T5x;
+				   T5r = cr[WS(rs, 31)];
+				   T5u = ci[WS(rs, 31)];
+				   Tao = FNMS(T5G, T5E, Tan);
+				   T5I = FMA(T5G, T5H, T5F);
+				   T5q = W[60];
+				   T5t = W[61];
+				   T5y = cr[WS(rs, 15)];
+				   T5B = ci[WS(rs, 15)];
+				   Tbg = T5q * T5u;
+				   T5s = T5q * T5r;
+				   T5x = W[28];
+				   T5A = W[29];
+				   Tbh = FNMS(T5t, T5r, Tbg);
+				   T5v = FMA(T5t, T5u, T5s);
+				   Tal = T5x * T5B;
+				   T5z = T5x * T5y;
+			      }
+			 }
+			 {
+			      E Tbi, Tga, T5w, Tak, Tam, T5C;
+			      Tbi = Tbf - Tbh;
+			      Tga = Tbf + Tbh;
+			      T5w = T5p + T5v;
+			      Tak = T5p - T5v;
+			      Tam = FNMS(T5A, T5y, Tal);
+			      T5C = FMA(T5A, T5B, T5z);
+			      {
+				   E Tgb, Tap, T5J, Tbd;
+				   Tgb = Tam + Tao;
+				   Tap = Tam - Tao;
+				   T5J = T5C + T5I;
+				   Tbd = T5I - T5C;
+				   Taq = Tak - Tap;
+				   Tdm = Tak + Tap;
+				   Tgc = Tga - Tgb;
+				   ThX = Tga + Tgb;
+				   T5K = T5w + T5J;
+				   TfS = T5w - T5J;
+				   Tdx = Tbi + Tbd;
+				   Tbj = Tbd - Tbi;
+			      }
+			 }
+		    }
+		    {
+			 E T7z, T1d, T7G, TeS, T11, T7v, T7x, T17, T7r, T7m;
+			 {
+			      E T7h, Ts, T7q, TL, TB, TE, TD, T7j, Ty, T7n, TC;
+			      {
+				   E TH, TK, TJ, T7p, TI;
+				   {
+					E To, Tr, Tn, Tq, T7g, Tp, TG;
+					To = cr[WS(rs, 8)];
+					Tr = ci[WS(rs, 8)];
+					Tn = W[14];
+					Tq = W[15];
+					TH = cr[WS(rs, 24)];
+					TK = ci[WS(rs, 24)];
+					T7g = Tn * Tr;
+					Tp = Tn * To;
+					TG = W[46];
+					TJ = W[47];
+					T7h = FNMS(Tq, To, T7g);
+					Ts = FMA(Tq, Tr, Tp);
+					T7p = TG * TK;
+					TI = TG * TH;
+				   }
+				   {
+					E Tu, Tx, Tt, Tw, T7i, Tv, TA;
+					Tu = cr[WS(rs, 40)];
+					Tx = ci[WS(rs, 40)];
+					T7q = FNMS(TJ, TH, T7p);
+					TL = FMA(TJ, TK, TI);
+					Tt = W[78];
+					Tw = W[79];
+					TB = cr[WS(rs, 56)];
+					TE = ci[WS(rs, 56)];
+					T7i = Tt * Tx;
+					Tv = Tt * Tu;
+					TA = W[110];
+					TD = W[111];
+					T7j = FNMS(Tw, Tu, T7i);
+					Ty = FMA(Tw, Tx, Tv);
+					T7n = TA * TE;
+					TC = TA * TB;
+				   }
+			      }
+			      {
+				   E T7k, TeO, Tz, T7f, T7o, TF, TeN, TM;
+				   T7k = T7h - T7j;
+				   TeO = T7h + T7j;
+				   Tz = Ts + Ty;
+				   T7f = Ts - Ty;
+				   T7o = FNMS(TD, TB, T7n);
+				   TF = FMA(TD, TE, TC);
+				   T7r = T7o - T7q;
+				   TeN = T7o + T7q;
+				   TM = TF + TL;
+				   T7m = TF - TL;
+				   TcB = T7f + T7k;
+				   T7l = T7f - T7k;
+				   TiP = TeO + TeN;
+				   TeP = TeN - TeO;
+				   Tjl = Tz - TM;
+				   TN = Tz + TM;
+			      }
+			 }
+			 {
+			      E T7D, TU, T13, T16, T7F, T10, T12, T15, T7w, T14;
+			      {
+				   E T19, T1c, T18, T1b;
+				   {
+					E TQ, TT, TS, T7C, TR, TP;
+					TQ = cr[WS(rs, 4)];
+					TT = ci[WS(rs, 4)];
+					TP = W[6];
+					TcC = T7m - T7r;
+					T7s = T7m + T7r;
+					TS = W[7];
+					T7C = TP * TT;
+					TR = TP * TQ;
+					T19 = cr[WS(rs, 52)];
+					T1c = ci[WS(rs, 52)];
+					T7D = FNMS(TS, TQ, T7C);
+					TU = FMA(TS, TT, TR);
+					T18 = W[102];
+					T1b = W[103];
+				   }
+				   {
+					E TW, TZ, TY, T7E, TX, T7y, T1a, TV;
+					TW = cr[WS(rs, 36)];
+					TZ = ci[WS(rs, 36)];
+					T7y = T18 * T1c;
+					T1a = T18 * T19;
+					TV = W[70];
+					TY = W[71];
+					T7z = FNMS(T1b, T19, T7y);
+					T1d = FMA(T1b, T1c, T1a);
+					T7E = TV * TZ;
+					TX = TV * TW;
+					T13 = cr[WS(rs, 20)];
+					T16 = ci[WS(rs, 20)];
+					T7F = FNMS(TY, TW, T7E);
+					T10 = FMA(TY, TZ, TX);
+					T12 = W[38];
+					T15 = W[39];
+				   }
+			      }
+			      T7G = T7D - T7F;
+			      TeS = T7D + T7F;
+			      T11 = TU + T10;
+			      T7v = TU - T10;
+			      T7w = T12 * T16;
+			      T14 = T12 * T13;
+			      T7x = FNMS(T15, T13, T7w);
+			      T17 = FMA(T15, T16, T14);
+			 }
+			 {
+			      E T8Y, T2H, T8E, T30, T2Q, T2T, T2S, T90, T2N, T8B, T2R;
+			      {
+				   E T2W, T2Z, T2Y, T8D, T2X;
+				   {
+					E T2D, T2G, T2C, T2F, T8X, T2E, T2V;
+					T2D = cr[WS(rs, 62)];
+					T2G = ci[WS(rs, 62)];
+					{
+					     E TeT, T7A, T1e, T7H;
+					     TeT = T7x + T7z;
+					     T7A = T7x - T7z;
+					     T1e = T17 + T1d;
+					     T7H = T17 - T1d;
+					     T7B = T7v - T7A;
+					     TcF = T7v + T7A;
+					     TeU = TeS - TeT;
+					     Ths = TeS + TeT;
+					     T7I = T7G + T7H;
+					     TcG = T7G - T7H;
+					     T1f = T11 + T1e;
+					     TeR = T11 - T1e;
+					     T2C = W[122];
+					}
+					T2F = W[123];
+					T2W = cr[WS(rs, 46)];
+					T2Z = ci[WS(rs, 46)];
+					T8X = T2C * T2G;
+					T2E = T2C * T2D;
+					T2V = W[90];
+					T2Y = W[91];
+					T8Y = FNMS(T2F, T2D, T8X);
+					T2H = FMA(T2F, T2G, T2E);
+					T8D = T2V * T2Z;
+					T2X = T2V * T2W;
+				   }
+				   {
+					E T2J, T2M, T2I, T2L, T8Z, T2K, T2P;
+					T2J = cr[WS(rs, 30)];
+					T2M = ci[WS(rs, 30)];
+					T8E = FNMS(T2Y, T2W, T8D);
+					T30 = FMA(T2Y, T2Z, T2X);
+					T2I = W[58];
+					T2L = W[59];
+					T2Q = cr[WS(rs, 14)];
+					T2T = ci[WS(rs, 14)];
+					T8Z = T2I * T2M;
+					T2K = T2I * T2J;
+					T2P = W[26];
+					T2S = W[27];
+					T90 = FNMS(T2L, T2J, T8Z);
+					T2N = FMA(T2L, T2M, T2K);
+					T8B = T2P * T2T;
+					T2R = T2P * T2Q;
+				   }
+			      }
+			      {
+				   E T91, Tfe, T2O, T8A, T8C, T2U;
+				   T91 = T8Y - T90;
+				   Tfe = T8Y + T90;
+				   T2O = T2H + T2N;
+				   T8A = T2H - T2N;
+				   T8C = FNMS(T2S, T2Q, T8B);
+				   T2U = FMA(T2S, T2T, T2R);
+				   {
+					E Tff, T8F, T92, T31;
+					Tff = T8C + T8E;
+					T8F = T8C - T8E;
+					T92 = T2U - T30;
+					T31 = T2U + T30;
+					T8G = T8A - T8F;
+					TcU = T8A + T8F;
+					Tfg = Tfe - Tff;
+					ThB = Tfe + Tff;
+					T32 = T2O + T31;
+					Tfj = T2O - T31;
+					TcX = T91 - T92;
+					T93 = T91 + T92;
+				   }
+			      }
+			 }
+			 {
+			      E Ta5, T3C, T9f, T3V, T3L, T3O, T3N, Ta7, T3I, T9c, T3M;
+			      {
+				   E T3R, T3U, T3T, T9e, T3S;
+				   {
+					E T3y, T3B, T3x, T3A, Ta4, T3z, T3Q;
+					T3y = cr[WS(rs, 1)];
+					T3B = ci[WS(rs, 1)];
+					T3x = W[0];
+					T3A = W[1];
+					T3R = cr[WS(rs, 49)];
+					T3U = ci[WS(rs, 49)];
+					Ta4 = T3x * T3B;
+					T3z = T3x * T3y;
+					T3Q = W[96];
+					T3T = W[97];
+					Ta5 = FNMS(T3A, T3y, Ta4);
+					T3C = FMA(T3A, T3B, T3z);
+					T9e = T3Q * T3U;
+					T3S = T3Q * T3R;
+				   }
+				   {
+					E T3E, T3H, T3D, T3G, Ta6, T3F, T3K;
+					T3E = cr[WS(rs, 33)];
+					T3H = ci[WS(rs, 33)];
+					T9f = FNMS(T3T, T3R, T9e);
+					T3V = FMA(T3T, T3U, T3S);
+					T3D = W[64];
+					T3G = W[65];
+					T3L = cr[WS(rs, 17)];
+					T3O = ci[WS(rs, 17)];
+					Ta6 = T3D * T3H;
+					T3F = T3D * T3E;
+					T3K = W[32];
+					T3N = W[33];
+					Ta7 = FNMS(T3G, T3E, Ta6);
+					T3I = FMA(T3G, T3H, T3F);
+					T9c = T3K * T3O;
+					T3M = T3K * T3L;
+				   }
+			      }
+			      {
+				   E Ta8, TfI, T3J, T9b, T9d, T3P;
+				   Ta8 = Ta5 - Ta7;
+				   TfI = Ta5 + Ta7;
+				   T3J = T3C + T3I;
+				   T9b = T3C - T3I;
+				   T9d = FNMS(T3N, T3L, T9c);
+				   T3P = FMA(T3N, T3O, T3M);
+				   {
+					E TfJ, T9g, Ta9, T3W;
+					TfJ = T9d + T9f;
+					T9g = T9d - T9f;
+					Ta9 = T3P - T3V;
+					T3W = T3P + T3V;
+					T9h = T9b - T9g;
+					Td3 = T9b + T9g;
+					TfK = TfI - TfJ;
+					ThM = TfI + TfJ;
+					T3X = T3J + T3W;
+					Tfr = T3J - T3W;
+					Tde = Ta8 - Ta9;
+					Taa = Ta8 + Ta9;
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E TaC, T69, Taw, TfU, T5X, Tar, TaA, T63;
+			 {
+			      E T8S, T3r, T8M, Tfl, T3f, T8H, T8Q, T3l;
+			      {
+				   E T8k, T8f, T8v, T8e;
+				   {
+					E T8a, T2f, T8j, T2y, T2o, T2r, T2q, T8c, T2l, T8g, T2p;
+					{
+					     E T2u, T2x, T2w, T8i, T2v;
+					     {
+						  E T2b, T2e, T2a, T2d, T89, T2c, T2t;
+						  T2b = cr[WS(rs, 10)];
+						  T2e = ci[WS(rs, 10)];
+						  T2a = W[18];
+						  T2d = W[19];
+						  T2u = cr[WS(rs, 26)];
+						  T2x = ci[WS(rs, 26)];
+						  T89 = T2a * T2e;
+						  T2c = T2a * T2b;
+						  T2t = W[50];
+						  T2w = W[51];
+						  T8a = FNMS(T2d, T2b, T89);
+						  T2f = FMA(T2d, T2e, T2c);
+						  T8i = T2t * T2x;
+						  T2v = T2t * T2u;
+					     }
+					     {
+						  E T2h, T2k, T2g, T2j, T8b, T2i, T2n;
+						  T2h = cr[WS(rs, 42)];
+						  T2k = ci[WS(rs, 42)];
+						  T8j = FNMS(T2w, T2u, T8i);
+						  T2y = FMA(T2w, T2x, T2v);
+						  T2g = W[82];
+						  T2j = W[83];
+						  T2o = cr[WS(rs, 58)];
+						  T2r = ci[WS(rs, 58)];
+						  T8b = T2g * T2k;
+						  T2i = T2g * T2h;
+						  T2n = W[114];
+						  T2q = W[115];
+						  T8c = FNMS(T2j, T2h, T8b);
+						  T2l = FMA(T2j, T2k, T2i);
+						  T8g = T2n * T2r;
+						  T2p = T2n * T2o;
+					     }
+					}
+					{
+					     E T8d, Tfa, T2m, T88, T8h, T2s, Tf9, T2z;
+					     T8d = T8a - T8c;
+					     Tfa = T8a + T8c;
+					     T2m = T2f + T2l;
+					     T88 = T2f - T2l;
+					     T8h = FNMS(T2q, T2o, T8g);
+					     T2s = FMA(T2q, T2r, T2p);
+					     T8k = T8h - T8j;
+					     Tf9 = T8h + T8j;
+					     T2z = T2s + T2y;
+					     T8f = T2s - T2y;
+					     T8v = T88 + T8d;
+					     T8e = T88 - T8d;
+					     Thx = Tfa + Tf9;
+					     Tfb = Tf9 - Tfa;
+					     Tf6 = T2m - T2z;
+					     T2A = T2m + T2z;
+					}
+				   }
+				   {
+					E T38, T8J, T3h, T3k, T8L, T3e, T3g, T3j, T8P, T3i;
+					{
+					     E T3n, T3q, T3m, T3p;
+					     {
+						  E T34, T37, T33, T8w, T8l, T36, T8I, T35;
+						  T34 = cr[WS(rs, 6)];
+						  T37 = ci[WS(rs, 6)];
+						  T33 = W[10];
+						  T8w = T8k - T8f;
+						  T8l = T8f + T8k;
+						  T36 = W[11];
+						  T8I = T33 * T37;
+						  T35 = T33 * T34;
+						  T8x = T8v + T8w;
+						  TcO = T8v - T8w;
+						  T8m = T8e + T8l;
+						  TcR = T8l - T8e;
+						  T38 = FMA(T36, T37, T35);
+						  T8J = FNMS(T36, T34, T8I);
+					     }
+					     T3n = cr[WS(rs, 22)];
+					     T3q = ci[WS(rs, 22)];
+					     T3m = W[42];
+					     T3p = W[43];
+					     {
+						  E T3a, T3d, T3c, T8K, T3b, T8R, T3o, T39;
+						  T3a = cr[WS(rs, 38)];
+						  T3d = ci[WS(rs, 38)];
+						  T8R = T3m * T3q;
+						  T3o = T3m * T3n;
+						  T39 = W[74];
+						  T3c = W[75];
+						  T8S = FNMS(T3p, T3n, T8R);
+						  T3r = FMA(T3p, T3q, T3o);
+						  T8K = T39 * T3d;
+						  T3b = T39 * T3a;
+						  T3h = cr[WS(rs, 54)];
+						  T3k = ci[WS(rs, 54)];
+						  T8L = FNMS(T3c, T3a, T8K);
+						  T3e = FMA(T3c, T3d, T3b);
+						  T3g = W[106];
+						  T3j = W[107];
+					     }
+					}
+					T8M = T8J - T8L;
+					Tfl = T8J + T8L;
+					T3f = T38 + T3e;
+					T8H = T38 - T3e;
+					T8P = T3g * T3k;
+					T3i = T3g * T3h;
+					T8Q = FNMS(T3j, T3h, T8P);
+					T3l = FMA(T3j, T3k, T3i);
+				   }
+			      }
+			      {
+				   E T9u, T9p, Tab, T9o;
+				   {
+					E T9k, T43, T9t, T4m, T4c, T4f, T4e, T9m, T49, T9q, T4d;
+					{
+					     E T4i, T4l, T4k, T9s, T4j;
+					     {
+						  E T3Z, T42, T3Y, T41, T9j, T40, T4h;
+						  {
+						       E T94, T8N, T8T, Tfk, T8O, T3s, T8U, T95;
+						       T3Z = cr[WS(rs, 9)];
+						       T94 = T8H + T8M;
+						       T8N = T8H - T8M;
+						       T8T = T8Q - T8S;
+						       Tfk = T8Q + T8S;
+						       T8O = T3l - T3r;
+						       T3s = T3l + T3r;
+						       T42 = ci[WS(rs, 9)];
+						       Tfm = Tfk - Tfl;
+						       ThC = Tfl + Tfk;
+						       T8U = T8O + T8T;
+						       T95 = T8T - T8O;
+						       T3t = T3f + T3s;
+						       Tfh = T3f - T3s;
+						       T96 = T94 + T95;
+						       TcV = T94 - T95;
+						       T8V = T8N + T8U;
+						       TcY = T8U - T8N;
+						       T3Y = W[16];
+						  }
+						  T41 = W[17];
+						  T4i = cr[WS(rs, 25)];
+						  T4l = ci[WS(rs, 25)];
+						  T9j = T3Y * T42;
+						  T40 = T3Y * T3Z;
+						  T4h = W[48];
+						  T4k = W[49];
+						  T9k = FNMS(T41, T3Z, T9j);
+						  T43 = FMA(T41, T42, T40);
+						  T9s = T4h * T4l;
+						  T4j = T4h * T4i;
+					     }
+					     {
+						  E T45, T48, T44, T47, T9l, T46, T4b;
+						  T45 = cr[WS(rs, 41)];
+						  T48 = ci[WS(rs, 41)];
+						  T9t = FNMS(T4k, T4i, T9s);
+						  T4m = FMA(T4k, T4l, T4j);
+						  T44 = W[80];
+						  T47 = W[81];
+						  T4c = cr[WS(rs, 57)];
+						  T4f = ci[WS(rs, 57)];
+						  T9l = T44 * T48;
+						  T46 = T44 * T45;
+						  T4b = W[112];
+						  T4e = W[113];
+						  T9m = FNMS(T47, T45, T9l);
+						  T49 = FMA(T47, T48, T46);
+						  T9q = T4b * T4f;
+						  T4d = T4b * T4c;
+					     }
+					}
+					{
+					     E T9n, Tft, T4a, T9i, T9r, T4g, Tfs, T4n;
+					     T9n = T9k - T9m;
+					     Tft = T9k + T9m;
+					     T4a = T43 + T49;
+					     T9i = T43 - T49;
+					     T9r = FNMS(T4e, T4c, T9q);
+					     T4g = FMA(T4e, T4f, T4d);
+					     T9u = T9r - T9t;
+					     Tfs = T9r + T9t;
+					     T4n = T4g + T4m;
+					     T9p = T4g - T4m;
+					     Tab = T9i + T9n;
+					     T9o = T9i - T9n;
+					     ThN = Tft + Tfs;
+					     Tfu = Tfs - Tft;
+					     TfL = T4a - T4n;
+					     T4o = T4a + T4n;
+					}
+				   }
+				   {
+					E T5Q, Tat, T5Z, T62, Tav, T5W, T5Y, T61, Taz, T60;
+					{
+					     E T65, T68, T64, T67;
+					     {
+						  E T5M, T5P, T5L, Tac, T9v, T5O, Tas, T5N;
+						  T5M = cr[WS(rs, 7)];
+						  T5P = ci[WS(rs, 7)];
+						  T5L = W[12];
+						  Tac = T9u - T9p;
+						  T9v = T9p + T9u;
+						  T5O = W[13];
+						  Tas = T5L * T5P;
+						  T5N = T5L * T5M;
+						  Tad = Tab + Tac;
+						  Td4 = Tab - Tac;
+						  T9w = T9o + T9v;
+						  Tdf = T9v - T9o;
+						  T5Q = FMA(T5O, T5P, T5N);
+						  Tat = FNMS(T5O, T5M, Tas);
+					     }
+					     T65 = cr[WS(rs, 23)];
+					     T68 = ci[WS(rs, 23)];
+					     T64 = W[44];
+					     T67 = W[45];
+					     {
+						  E T5S, T5V, T5U, Tau, T5T, TaB, T66, T5R;
+						  T5S = cr[WS(rs, 39)];
+						  T5V = ci[WS(rs, 39)];
+						  TaB = T64 * T68;
+						  T66 = T64 * T65;
+						  T5R = W[76];
+						  T5U = W[77];
+						  TaC = FNMS(T67, T65, TaB);
+						  T69 = FMA(T67, T68, T66);
+						  Tau = T5R * T5V;
+						  T5T = T5R * T5S;
+						  T5Z = cr[WS(rs, 55)];
+						  T62 = ci[WS(rs, 55)];
+						  Tav = FNMS(T5U, T5S, Tau);
+						  T5W = FMA(T5U, T5V, T5T);
+						  T5Y = W[108];
+						  T61 = W[109];
+					     }
+					}
+					Taw = Tat - Tav;
+					TfU = Tat + Tav;
+					T5X = T5Q + T5W;
+					Tar = T5Q - T5W;
+					Taz = T5Y * T62;
+					T60 = T5Y * T5Z;
+					TaA = FNMS(T61, T5Z, Taz);
+					T63 = FMA(T61, T62, T60);
+				   }
+			      }
+			 }
+			 {
+			      E T9T, Td9, TfE, TfB, Tda, Ta0;
+			      {
+				   E T9E, Td6, Tfz, Tfw, Td7, T9L;
+				   {
+					E T9G, T4v, T9C, T4O, T4E, T4H, T4G, T9I, T4B, T9z, T4F;
+					{
+					     E T4K, T4N, T4M, T9B, T4L;
+					     {
+						  E T4r, T4u, T4q, T4t, T9F, T4s, T4J;
+						  {
+						       E Tbl, Tax, TaD, TfT, Tay, T6a, TaE, Tbk;
+						       T4r = cr[WS(rs, 5)];
+						       Tbl = Tar + Taw;
+						       Tax = Tar - Taw;
+						       TaD = TaA - TaC;
+						       TfT = TaA + TaC;
+						       Tay = T63 - T69;
+						       T6a = T63 + T69;
+						       T4u = ci[WS(rs, 5)];
+						       TfV = TfT - TfU;
+						       ThY = TfU + TfT;
+						       TaE = Tay + TaD;
+						       Tbk = Tay - TaD;
+						       T6b = T5X + T6a;
+						       Tg9 = T6a - T5X;
+						       Tbm = Tbk - Tbl;
+						       Tdn = Tbl + Tbk;
+						       TaF = Tax + TaE;
+						       Tdy = TaE - Tax;
+						       T4q = W[8];
+						  }
+						  T4t = W[9];
+						  T4K = cr[WS(rs, 53)];
+						  T4N = ci[WS(rs, 53)];
+						  T9F = T4q * T4u;
+						  T4s = T4q * T4r;
+						  T4J = W[104];
+						  T4M = W[105];
+						  T9G = FNMS(T4t, T4r, T9F);
+						  T4v = FMA(T4t, T4u, T4s);
+						  T9B = T4J * T4N;
+						  T4L = T4J * T4K;
+					     }
+					     {
+						  E T4x, T4A, T4w, T4z, T9H, T4y, T4D;
+						  T4x = cr[WS(rs, 37)];
+						  T4A = ci[WS(rs, 37)];
+						  T9C = FNMS(T4M, T4K, T9B);
+						  T4O = FMA(T4M, T4N, T4L);
+						  T4w = W[72];
+						  T4z = W[73];
+						  T4E = cr[WS(rs, 21)];
+						  T4H = ci[WS(rs, 21)];
+						  T9H = T4w * T4A;
+						  T4y = T4w * T4x;
+						  T4D = W[40];
+						  T4G = W[41];
+						  T9I = FNMS(T4z, T4x, T9H);
+						  T4B = FMA(T4z, T4A, T4y);
+						  T9z = T4D * T4H;
+						  T4F = T4D * T4E;
+					     }
+					}
+					{
+					     E T9J, Tfx, T4C, T9y, T9A, T4I;
+					     T9J = T9G - T9I;
+					     Tfx = T9G + T9I;
+					     T4C = T4v + T4B;
+					     T9y = T4v - T4B;
+					     T9A = FNMS(T4G, T4E, T9z);
+					     T4I = FMA(T4G, T4H, T4F);
+					     {
+						  E Tfy, T9D, T9K, T4P;
+						  Tfy = T9A + T9C;
+						  T9D = T9A - T9C;
+						  T9K = T4I - T4O;
+						  T4P = T4I + T4O;
+						  T9E = T9y - T9D;
+						  Td6 = T9y + T9D;
+						  Tfz = Tfx - Tfy;
+						  ThJ = Tfx + Tfy;
+						  Tfw = T4C - T4P;
+						  T4Q = T4C + T4P;
+						  Td7 = T9J - T9K;
+						  T9L = T9J + T9K;
+					     }
+					}
+				   }
+				   {
+					E T9V, T4W, T9R, T5f, T55, T58, T57, T9X, T52, T9O, T56;
+					{
+					     E T5b, T5e, T5d, T9Q, T5c;
+					     {
+						  E T4S, T4V, T4R, T4U, T9U, T4T, T5a;
+						  T4S = cr[WS(rs, 61)];
+						  TfN = Tfw + Tfz;
+						  TfA = Tfw - Tfz;
+						  Taf = FMA(KP414213562, T9E, T9L);
+						  T9M = FNMS(KP414213562, T9L, T9E);
+						  Td8 = FMA(KP414213562, Td7, Td6);
+						  Tdh = FNMS(KP414213562, Td6, Td7);
+						  T4V = ci[WS(rs, 61)];
+						  T4R = W[120];
+						  T4U = W[121];
+						  T5b = cr[WS(rs, 45)];
+						  T5e = ci[WS(rs, 45)];
+						  T9U = T4R * T4V;
+						  T4T = T4R * T4S;
+						  T5a = W[88];
+						  T5d = W[89];
+						  T9V = FNMS(T4U, T4S, T9U);
+						  T4W = FMA(T4U, T4V, T4T);
+						  T9Q = T5a * T5e;
+						  T5c = T5a * T5b;
+					     }
+					     {
+						  E T4Y, T51, T4X, T50, T9W, T4Z, T54;
+						  T4Y = cr[WS(rs, 29)];
+						  T51 = ci[WS(rs, 29)];
+						  T9R = FNMS(T5d, T5b, T9Q);
+						  T5f = FMA(T5d, T5e, T5c);
+						  T4X = W[56];
+						  T50 = W[57];
+						  T55 = cr[WS(rs, 13)];
+						  T58 = ci[WS(rs, 13)];
+						  T9W = T4X * T51;
+						  T4Z = T4X * T4Y;
+						  T54 = W[24];
+						  T57 = W[25];
+						  T9X = FNMS(T50, T4Y, T9W);
+						  T52 = FMA(T50, T51, T4Z);
+						  T9O = T54 * T58;
+						  T56 = T54 * T55;
+					     }
+					}
+					{
+					     E T9Y, TfC, T53, T9N, T9P, T59;
+					     T9Y = T9V - T9X;
+					     TfC = T9V + T9X;
+					     T53 = T4W + T52;
+					     T9N = T4W - T52;
+					     T9P = FNMS(T57, T55, T9O);
+					     T59 = FMA(T57, T58, T56);
+					     {
+						  E TfD, T9S, T9Z, T5g;
+						  TfD = T9P + T9R;
+						  T9S = T9P - T9R;
+						  T9Z = T59 - T5f;
+						  T5g = T59 + T5f;
+						  T9T = T9N - T9S;
+						  Td9 = T9N + T9S;
+						  TfE = TfC - TfD;
+						  ThI = TfC + TfD;
+						  TfB = T53 - T5g;
+						  T5h = T53 + T5g;
+						  Tda = T9Y - T9Z;
+						  Ta0 = T9Y + T9Z;
+					     }
+					}
+				   }
+			      }
+			      {
+				   E TaN, Tdp, Tg0, TfX, Tdq, TaU;
+				   {
+					E TaQ, T6i, TaL, T6B, T6r, T6u, T6t, TaS, T6o, TaI, T6s;
+					{
+					     E T6x, T6A, T6z, TaK, T6y;
+					     {
+						  E T6e, T6h, T6d, T6g, TaP, T6f, T6w;
+						  T6e = cr[WS(rs, 3)];
+						  TfO = TfE - TfB;
+						  TfF = TfB + TfE;
+						  Tag = FNMS(KP414213562, T9T, Ta0);
+						  Ta1 = FMA(KP414213562, Ta0, T9T);
+						  Tdb = FNMS(KP414213562, Tda, Td9);
+						  Tdi = FMA(KP414213562, Td9, Tda);
+						  T6h = ci[WS(rs, 3)];
+						  T6d = W[4];
+						  T6g = W[5];
+						  T6x = cr[WS(rs, 51)];
+						  T6A = ci[WS(rs, 51)];
+						  TaP = T6d * T6h;
+						  T6f = T6d * T6e;
+						  T6w = W[100];
+						  T6z = W[101];
+						  TaQ = FNMS(T6g, T6e, TaP);
+						  T6i = FMA(T6g, T6h, T6f);
+						  TaK = T6w * T6A;
+						  T6y = T6w * T6x;
+					     }
+					     {
+						  E T6k, T6n, T6j, T6m, TaR, T6l, T6q;
+						  T6k = cr[WS(rs, 35)];
+						  T6n = ci[WS(rs, 35)];
+						  TaL = FNMS(T6z, T6x, TaK);
+						  T6B = FMA(T6z, T6A, T6y);
+						  T6j = W[68];
+						  T6m = W[69];
+						  T6r = cr[WS(rs, 19)];
+						  T6u = ci[WS(rs, 19)];
+						  TaR = T6j * T6n;
+						  T6l = T6j * T6k;
+						  T6q = W[36];
+						  T6t = W[37];
+						  TaS = FNMS(T6m, T6k, TaR);
+						  T6o = FMA(T6m, T6n, T6l);
+						  TaI = T6q * T6u;
+						  T6s = T6q * T6r;
+					     }
+					}
+					{
+					     E TaT, TfY, T6p, TaH, TaJ, T6v;
+					     TaT = TaQ - TaS;
+					     TfY = TaQ + TaS;
+					     T6p = T6i + T6o;
+					     TaH = T6i - T6o;
+					     TaJ = FNMS(T6t, T6r, TaI);
+					     T6v = FMA(T6t, T6u, T6s);
+					     {
+						  E TfZ, TaM, T6C, TaO;
+						  TfZ = TaJ + TaL;
+						  TaM = TaJ - TaL;
+						  T6C = T6v + T6B;
+						  TaO = T6B - T6v;
+						  TaN = TaH - TaM;
+						  Tdp = TaH + TaM;
+						  Tg0 = TfY - TfZ;
+						  ThU = TfY + TfZ;
+						  TfX = T6p - T6C;
+						  T6D = T6p + T6C;
+						  Tdq = TaT + TaO;
+						  TaU = TaO - TaT;
+					     }
+					}
+				   }
+				   {
+					E Tb5, T6J, Tb0, T72, T6S, T6V, T6U, Tb7, T6P, TaX, T6T;
+					{
+					     E T6Y, T71, T70, TaZ, T6Z;
+					     {
+						  E T6F, T6I, T6E, T6H, Tb4, T6G, T6X;
+						  T6F = cr[WS(rs, 59)];
+						  Tgf = TfX + Tg0;
+						  Tg1 = TfX - Tg0;
+						  Tbo = FNMS(KP414213562, TaN, TaU);
+						  TaV = FMA(KP414213562, TaU, TaN);
+						  Tdr = FMA(KP414213562, Tdq, Tdp);
+						  TdA = FNMS(KP414213562, Tdp, Tdq);
+						  T6I = ci[WS(rs, 59)];
+						  T6E = W[116];
+						  T6H = W[117];
+						  T6Y = cr[WS(rs, 43)];
+						  T71 = ci[WS(rs, 43)];
+						  Tb4 = T6E * T6I;
+						  T6G = T6E * T6F;
+						  T6X = W[84];
+						  T70 = W[85];
+						  Tb5 = FNMS(T6H, T6F, Tb4);
+						  T6J = FMA(T6H, T6I, T6G);
+						  TaZ = T6X * T71;
+						  T6Z = T6X * T6Y;
+					     }
+					     {
+						  E T6L, T6O, T6K, T6N, Tb6, T6M, T6R;
+						  T6L = cr[WS(rs, 27)];
+						  T6O = ci[WS(rs, 27)];
+						  Tb0 = FNMS(T70, T6Y, TaZ);
+						  T72 = FMA(T70, T71, T6Z);
+						  T6K = W[52];
+						  T6N = W[53];
+						  T6S = cr[WS(rs, 11)];
+						  T6V = ci[WS(rs, 11)];
+						  Tb6 = T6K * T6O;
+						  T6M = T6K * T6L;
+						  T6R = W[20];
+						  T6U = W[21];
+						  Tb7 = FNMS(T6N, T6L, Tb6);
+						  T6P = FMA(T6N, T6O, T6M);
+						  TaX = T6R * T6V;
+						  T6T = T6R * T6S;
+					     }
+					}
+					{
+					     E Tb8, Tg3, T6Q, TaW, TaY, T6W;
+					     Tb8 = Tb5 - Tb7;
+					     Tg3 = Tb5 + Tb7;
+					     T6Q = T6J + T6P;
+					     TaW = T6J - T6P;
+					     TaY = FNMS(T6U, T6S, TaX);
+					     T6W = FMA(T6U, T6V, T6T);
+					     {
+						  E Tg4, Tb1, T73, Tb3;
+						  Tg4 = TaY + Tb0;
+						  Tb1 = TaY - Tb0;
+						  T73 = T6W + T72;
+						  Tb3 = T72 - T6W;
+						  Tb2 = TaW - Tb1;
+						  Tds = TaW + Tb1;
+						  Tg5 = Tg3 - Tg4;
+						  ThT = Tg3 + Tg4;
+						  Tg2 = T6Q - T73;
+						  T74 = T6Q + T73;
+						  Tdt = Tb8 + Tb3;
+						  Tb9 = Tb3 - Tb8;
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E Thq, Tge, Tg6, Tdu, TdB, Tj7, Thv, ThA, Tht, Tj8, ThD, Thy, ThS, Ti0, ThZ;
+			 E ThV, ThH, ThP, ThO, ThK, Tkm, TcD, Tk0, Tk4, TjZ, Tk3, Tik, Tin;
+			 {
+			      E Tbp, Tba, TiI, TiL;
+			      {
+				   E Tio, T1I, Tj1, T3v, Tj2, TiX, TiN, Tir, T76, TiJ, TiC, TiG, T5j, Tit, Tiw;
+				   E TiK;
+				   {
+					E TiO, TiW, Tip, Tiq;
+					{
+					     E TO, T1H, T2B, T3u;
+					     Thq = Tm - TN;
+					     TO = Tm + TN;
+					     Tge = Tg2 - Tg5;
+					     Tg6 = Tg2 + Tg5;
+					     Tbp = FMA(KP414213562, Tb2, Tb9);
+					     Tba = FNMS(KP414213562, Tb9, Tb2);
+					     Tdu = FNMS(KP414213562, Tdt, Tds);
+					     TdB = FMA(KP414213562, Tds, Tdt);
+					     T1H = T1f + T1G;
+					     Tj7 = T1f - T1G;
+					     Thv = T29 - T2A;
+					     T2B = T29 + T2A;
+					     T3u = T32 + T3t;
+					     ThA = T32 - T3t;
+					     Tht = Thr - Ths;
+					     TiO = Ths + Thr;
+					     Tio = TO - T1H;
+					     T1I = TO + T1H;
+					     Tj1 = T2B - T3u;
+					     T3v = T2B + T3u;
+					     TiW = TiP + TiV;
+					     Tj8 = TiV - TiP;
+					}
+					ThD = ThB - ThC;
+					Tip = ThB + ThC;
+					Tiq = Thw + Thx;
+					Thy = Thw - Thx;
+					{
+					     E T6c, T75, Tiz, TiA;
+					     ThS = T5K - T6b;
+					     T6c = T5K + T6b;
+					     Tj2 = TiW - TiO;
+					     TiX = TiO + TiW;
+					     TiN = Tiq + Tip;
+					     Tir = Tip - Tiq;
+					     T75 = T6D + T74;
+					     Ti0 = T74 - T6D;
+					     ThZ = ThX - ThY;
+					     Tiz = ThX + ThY;
+					     TiA = ThU + ThT;
+					     ThV = ThT - ThU;
+					     {
+						  E T4p, Tiy, TiB, T5i, Tiu, Tiv;
+						  ThH = T3X - T4o;
+						  T4p = T3X + T4o;
+						  T76 = T6c + T75;
+						  Tiy = T6c - T75;
+						  TiJ = Tiz + TiA;
+						  TiB = Tiz - TiA;
+						  T5i = T4Q + T5h;
+						  ThP = T4Q - T5h;
+						  ThO = ThM - ThN;
+						  Tiu = ThM + ThN;
+						  Tiv = ThJ + ThI;
+						  ThK = ThI - ThJ;
+						  TiC = Tiy - TiB;
+						  TiG = Tiy + TiB;
+						  T5j = T4p + T5i;
+						  Tit = T4p - T5i;
+						  Tiw = Tiu - Tiv;
+						  TiK = Tiu + Tiv;
+					     }
+					}
+				   }
+				   {
+					E TiZ, TiD, TiH, TiE, Tis, TiM, TiY, Tj0;
+					{
+					     E T3w, TiF, Tix, T77, Tj5, Tj3, Tj6, Tj4;
+					     TiI = T1I - T3v;
+					     T3w = T1I + T3v;
+					     TiF = Tit - Tiw;
+					     Tix = Tit + Tiw;
+					     T77 = T5j + T76;
+					     TiZ = T76 - T5j;
+					     Tj5 = Tj2 - Tj1;
+					     Tj3 = Tj1 + Tj2;
+					     TiD = Tix + TiC;
+					     Tj4 = TiC - Tix;
+					     cr[0] = T3w + T77;
+					     ci[WS(rs, 31)] = T3w - T77;
+					     Tj6 = TiG - TiF;
+					     TiH = TiF + TiG;
+					     ci[WS(rs, 39)] = FMA(KP707106781, Tj4, Tj3);
+					     cr[WS(rs, 56)] = FMS(KP707106781, Tj4, Tj3);
+					     TiE = Tio + Tir;
+					     Tis = Tio - Tir;
+					     ci[WS(rs, 55)] = FMA(KP707106781, Tj6, Tj5);
+					     cr[WS(rs, 40)] = FMS(KP707106781, Tj6, Tj5);
+					}
+					TiL = TiJ - TiK;
+					TiM = TiK + TiJ;
+					cr[WS(rs, 8)] = FMA(KP707106781, TiD, Tis);
+					ci[WS(rs, 23)] = FNMS(KP707106781, TiD, Tis);
+					ci[WS(rs, 7)] = FMA(KP707106781, TiH, TiE);
+					cr[WS(rs, 24)] = FNMS(KP707106781, TiH, TiE);
+					TiY = TiN + TiX;
+					Tj0 = TiX - TiN;
+					ci[WS(rs, 63)] = TiM + TiY;
+					cr[WS(rs, 32)] = TiM - TiY;
+					ci[WS(rs, 47)] = TiZ + Tj0;
+					cr[WS(rs, 48)] = TiZ - Tj0;
+				   }
+			      }
+			      {
+				   E TjW, TbB, Tk2, T99, TbF, TbL, Tbv, Taj, Tcu, Tcy, Tci, Tce, Tcr, Tcx, Tch;
+				   E Tc7, Tcn, Tkg, Tka, TbZ, TbP, T7J, TbO, T7u, Tk7, TjT, TbI, TbM, Tbw, Tbs;
+				   E T7Y, TbQ;
+				   {
+					E TbX, TbW, TbU, TbT, Tc1, Tc5, Tc4, Tc2, TaG, Tbq, Tbn, Tcb, Tcs, Tca, Tcc;
+					E Tbb, Tcm, TbV;
+					{
+					     E T8W, Tbz, T8z, T97, T8n, T8y;
+					     TbX = FNMS(KP707106781, T8m, T87);
+					     T8n = FMA(KP707106781, T8m, T87);
+					     T8y = FMA(KP707106781, T8x, T8u);
+					     TbW = FNMS(KP707106781, T8x, T8u);
+					     TbU = FNMS(KP707106781, T8V, T8G);
+					     T8W = FMA(KP707106781, T8V, T8G);
+					     ci[WS(rs, 15)] = TiI + TiL;
+					     cr[WS(rs, 16)] = TiI - TiL;
+					     Tbz = FMA(KP198912367, T8n, T8y);
+					     T8z = FNMS(KP198912367, T8y, T8n);
+					     T97 = FMA(KP707106781, T96, T93);
+					     TbT = FNMS(KP707106781, T96, T93);
+					     {
+						  E Tae, TbD, Ta3, Tah;
+						  {
+						       E T9x, Ta2, TbA, T98;
+						       Tc1 = FNMS(KP707106781, T9w, T9h);
+						       T9x = FMA(KP707106781, T9w, T9h);
+						       Ta2 = T9M + Ta1;
+						       Tc5 = Ta1 - T9M;
+						       Tc4 = FNMS(KP707106781, Tad, Taa);
+						       Tae = FMA(KP707106781, Tad, Taa);
+						       TbA = FNMS(KP198912367, T8W, T97);
+						       T98 = FMA(KP198912367, T97, T8W);
+						       TbD = FNMS(KP923879532, Ta2, T9x);
+						       Ta3 = FMA(KP923879532, Ta2, T9x);
+						       TjW = Tbz + TbA;
+						       TbB = Tbz - TbA;
+						       Tk2 = T98 - T8z;
+						       T99 = T8z + T98;
+						       Tah = Taf + Tag;
+						       Tc2 = Taf - Tag;
+						  }
+						  {
+						       E Tc8, Tc9, TbE, Tai;
+						       TaG = FMA(KP707106781, TaF, Taq);
+						       Tc8 = FNMS(KP707106781, TaF, Taq);
+						       Tc9 = Tbp - Tbo;
+						       Tbq = Tbo + Tbp;
+						       Tbn = FMA(KP707106781, Tbm, Tbj);
+						       Tcb = FNMS(KP707106781, Tbm, Tbj);
+						       TbE = FNMS(KP923879532, Tah, Tae);
+						       Tai = FMA(KP923879532, Tah, Tae);
+						       Tcs = FMA(KP923879532, Tc9, Tc8);
+						       Tca = FNMS(KP923879532, Tc9, Tc8);
+						       TbF = FMA(KP820678790, TbE, TbD);
+						       TbL = FNMS(KP820678790, TbD, TbE);
+						       Tbv = FMA(KP098491403, Ta3, Tai);
+						       Taj = FNMS(KP098491403, Tai, Ta3);
+						       Tcc = Tba - TaV;
+						       Tbb = TaV + Tba;
+						  }
+					     }
+					}
+					{
+					     E Tcp, Tc3, Tct, Tcd, Tcq, Tc6;
+					     Tct = FNMS(KP923879532, Tcc, Tcb);
+					     Tcd = FMA(KP923879532, Tcc, Tcb);
+					     Tcp = FMA(KP923879532, Tc2, Tc1);
+					     Tc3 = FNMS(KP923879532, Tc2, Tc1);
+					     Tcu = FMA(KP303346683, Tct, Tcs);
+					     Tcy = FNMS(KP303346683, Tcs, Tct);
+					     Tci = FMA(KP534511135, Tca, Tcd);
+					     Tce = FNMS(KP534511135, Tcd, Tca);
+					     Tcq = FMA(KP923879532, Tc5, Tc4);
+					     Tc6 = FNMS(KP923879532, Tc5, Tc4);
+					     Tcm = FNMS(KP668178637, TbT, TbU);
+					     TbV = FMA(KP668178637, TbU, TbT);
+					     Tcr = FMA(KP303346683, Tcq, Tcp);
+					     Tcx = FNMS(KP303346683, Tcp, Tcq);
+					     Tch = FMA(KP534511135, Tc3, Tc6);
+					     Tc7 = FNMS(KP534511135, Tc6, Tc3);
+					}
+					{
+					     E TbG, Tbc, Tcl, TbY;
+					     Tcl = FMA(KP668178637, TbW, TbX);
+					     TbY = FNMS(KP668178637, TbX, TbW);
+					     TbG = FNMS(KP923879532, Tbb, TaG);
+					     Tbc = FMA(KP923879532, Tbb, TaG);
+					     Tcn = Tcl + Tcm;
+					     Tkg = Tcl - Tcm;
+					     Tka = TbY + TbV;
+					     TbZ = TbV - TbY;
+					     {
+						  E T7t, TjS, TbH, Tbr;
+						  Tkm = T7s - T7l;
+						  T7t = T7l + T7s;
+						  TjS = TcB - TcC;
+						  TcD = TcB + TcC;
+						  TbP = FMA(KP414213562, T7B, T7I);
+						  T7J = FNMS(KP414213562, T7I, T7B);
+						  TbH = FNMS(KP923879532, Tbq, Tbn);
+						  Tbr = FMA(KP923879532, Tbq, Tbn);
+						  TbO = FNMS(KP707106781, T7t, T7e);
+						  T7u = FMA(KP707106781, T7t, T7e);
+						  Tk7 = FNMS(KP707106781, TjS, TjR);
+						  TjT = FMA(KP707106781, TjS, TjR);
+						  TbI = FMA(KP820678790, TbH, TbG);
+						  TbM = FNMS(KP820678790, TbG, TbH);
+						  Tbw = FMA(KP098491403, Tbc, Tbr);
+						  Tbs = FNMS(KP098491403, Tbr, Tbc);
+						  T7Y = FMA(KP414213562, T7X, T7Q);
+						  TbQ = FNMS(KP414213562, T7Q, T7X);
+					     }
+					}
+				   }
+				   {
+					E Tk1, TjV, Tck, TbS, Tkd, Tcz, Tkh, Tcf, TjY, Tk6, Tke, Tcv, Tki, Tcj;
+					{
+					     E Tbu, TbC, Tkb, Tkc, Tkj, Tkk, Tbx, TbJ;
+					     {
+						  E Tbt, Tkf, Tk9, T9a, TbK, TbN, Tby;
+						  Tk0 = Tbs - Taj;
+						  Tbt = Taj + Tbs;
+						  {
+						       E Tk8, T7Z, TjU, TbR, T80;
+						       Tk8 = T7Y - T7J;
+						       T7Z = T7J + T7Y;
+						       TjU = TbP + TbQ;
+						       TbR = TbP - TbQ;
+						       Tkf = FNMS(KP923879532, Tk8, Tk7);
+						       Tk9 = FMA(KP923879532, Tk8, Tk7);
+						       Tby = FNMS(KP923879532, T7Z, T7u);
+						       T80 = FMA(KP923879532, T7Z, T7u);
+						       Tk1 = FNMS(KP923879532, TjU, TjT);
+						       TjV = FMA(KP923879532, TjU, TjT);
+						       Tck = FMA(KP923879532, TbR, TbO);
+						       TbS = FNMS(KP923879532, TbR, TbO);
+						       T9a = FMA(KP980785280, T99, T80);
+						       Tbu = FNMS(KP980785280, T99, T80);
+						  }
+						  TbC = FMA(KP980785280, TbB, Tby);
+						  TbK = FNMS(KP980785280, TbB, Tby);
+						  TbN = TbL + TbM;
+						  Tk4 = TbL - TbM;
+						  Tkd = FNMS(KP831469612, Tka, Tk9);
+						  Tkb = FMA(KP831469612, Tka, Tk9);
+						  ci[0] = FMA(KP995184726, Tbt, T9a);
+						  cr[WS(rs, 31)] = FNMS(KP995184726, Tbt, T9a);
+						  ci[WS(rs, 8)] = FNMS(KP773010453, TbN, TbK);
+						  cr[WS(rs, 23)] = FMA(KP773010453, TbN, TbK);
+						  Tkc = Tcx - Tcy;
+						  Tcz = Tcx + Tcy;
+						  Tkh = FMA(KP831469612, Tkg, Tkf);
+						  Tkj = FNMS(KP831469612, Tkg, Tkf);
+						  Tkk = Tce - Tc7;
+						  Tcf = Tc7 + Tce;
+					     }
+					     ci[WS(rs, 60)] = FMA(KP956940335, Tkc, Tkb);
+					     cr[WS(rs, 35)] = FMS(KP956940335, Tkc, Tkb);
+					     ci[WS(rs, 52)] = FMA(KP881921264, Tkk, Tkj);
+					     cr[WS(rs, 43)] = FMS(KP881921264, Tkk, Tkj);
+					     Tbx = Tbv + Tbw;
+					     TjY = Tbw - Tbv;
+					     TbJ = TbF + TbI;
+					     Tk6 = TbI - TbF;
+					     cr[WS(rs, 15)] = FMA(KP995184726, Tbx, Tbu);
+					     ci[WS(rs, 16)] = FNMS(KP995184726, Tbx, Tbu);
+					     cr[WS(rs, 7)] = FMA(KP773010453, TbJ, TbC);
+					     ci[WS(rs, 24)] = FNMS(KP773010453, TbJ, TbC);
+					     Tke = Tcu - Tcr;
+					     Tcv = Tcr + Tcu;
+					     Tki = Tci - Tch;
+					     Tcj = Tch + Tci;
+					}
+					{
+					     E Tcg, Tco, TjX, Tk5, Tc0, Tcw;
+					     Tcg = FNMS(KP831469612, TbZ, TbS);
+					     Tc0 = FMA(KP831469612, TbZ, TbS);
+					     ci[WS(rs, 44)] = FMA(KP956940335, Tke, Tkd);
+					     cr[WS(rs, 51)] = FMS(KP956940335, Tke, Tkd);
+					     ci[WS(rs, 36)] = FMA(KP881921264, Tki, Tkh);
+					     cr[WS(rs, 59)] = FMS(KP881921264, Tki, Tkh);
+					     Tco = FMA(KP831469612, Tcn, Tck);
+					     Tcw = FNMS(KP831469612, Tcn, Tck);
+					     TjZ = FNMS(KP980785280, TjW, TjV);
+					     TjX = FMA(KP980785280, TjW, TjV);
+					     ci[WS(rs, 4)] = FMA(KP881921264, Tcf, Tc0);
+					     cr[WS(rs, 27)] = FNMS(KP881921264, Tcf, Tc0);
+					     ci[WS(rs, 12)] = FNMS(KP956940335, Tcz, Tcw);
+					     cr[WS(rs, 19)] = FMA(KP956940335, Tcz, Tcw);
+					     Tk3 = FMA(KP980785280, Tk2, Tk1);
+					     Tk5 = FNMS(KP980785280, Tk2, Tk1);
+					     ci[WS(rs, 32)] = FMA(KP995184726, TjY, TjX);
+					     cr[WS(rs, 63)] = FMS(KP995184726, TjY, TjX);
+					     ci[WS(rs, 40)] = FMA(KP773010453, Tk6, Tk5);
+					     cr[WS(rs, 55)] = FMS(KP773010453, Tk6, Tk5);
+					     cr[WS(rs, 11)] = FMA(KP881921264, Tcj, Tcg);
+					     ci[WS(rs, 20)] = FNMS(KP881921264, Tcj, Tcg);
+					     cr[WS(rs, 3)] = FMA(KP956940335, Tcv, Tco);
+					     ci[WS(rs, 28)] = FNMS(KP956940335, Tcv, Tco);
+					}
+				   }
+			      }
+			 }
+			 {
+			      E Ti8, Thu, Tjf, Tj9, Tib, Tjg, Tja, ThF, Tig, ThW, Tif, Til, Ti6, ThR;
+			      ci[WS(rs, 48)] = FMA(KP995184726, Tk0, TjZ);
+			      cr[WS(rs, 47)] = FMS(KP995184726, Tk0, TjZ);
+			      ci[WS(rs, 56)] = FMA(KP773010453, Tk4, Tk3);
+			      cr[WS(rs, 39)] = FMS(KP773010453, Tk4, Tk3);
+			      Ti8 = Thq + Tht;
+			      Thu = Thq - Tht;
+			      Tjf = Tj8 - Tj7;
+			      Tj9 = Tj7 + Tj8;
+			      {
+				   E Tid, ThL, Tie, ThQ;
+				   {
+					E Ti9, Thz, Tia, ThE;
+					Ti9 = Thv - Thy;
+					Thz = Thv + Thy;
+					Tia = ThA + ThD;
+					ThE = ThA - ThD;
+					Tib = Ti9 + Tia;
+					Tjg = Tia - Ti9;
+					Tja = Thz - ThE;
+					ThF = Thz + ThE;
+					Tid = ThH + ThK;
+					ThL = ThH - ThK;
+				   }
+				   Tie = ThO + ThP;
+				   ThQ = ThO - ThP;
+				   Tig = ThS + ThV;
+				   ThW = ThS - ThV;
+				   Tif = FNMS(KP414213562, Tie, Tid);
+				   Til = FMA(KP414213562, Tid, Tie);
+				   Ti6 = FNMS(KP414213562, ThL, ThQ);
+				   ThR = FMA(KP414213562, ThQ, ThL);
+			      }
+			      {
+				   E Ti4, ThG, Tjh, Tjj, Tih, Ti1;
+				   Ti4 = FNMS(KP707106781, ThF, Thu);
+				   ThG = FMA(KP707106781, ThF, Thu);
+				   Tjh = FMA(KP707106781, Tjg, Tjf);
+				   Tjj = FNMS(KP707106781, Tjg, Tjf);
+				   Tih = Ti0 - ThZ;
+				   Ti1 = ThZ + Ti0;
+				   {
+					E Tje, Tjd, Tjb, Tjc;
+					{
+					     E Tic, Tim, Ti5, Ti2, Tij, Tii;
+					     Tik = FNMS(KP707106781, Tib, Ti8);
+					     Tic = FMA(KP707106781, Tib, Ti8);
+					     Tii = FNMS(KP414213562, Tih, Tig);
+					     Tim = FMA(KP414213562, Tig, Tih);
+					     Ti5 = FMA(KP414213562, ThW, Ti1);
+					     Ti2 = FNMS(KP414213562, Ti1, ThW);
+					     Tij = Tif + Tii;
+					     Tje = Tii - Tif;
+					     Tjd = FNMS(KP707106781, Tja, Tj9);
+					     Tjb = FMA(KP707106781, Tja, Tj9);
+					     {
+						  E Ti7, Tji, Tjk, Ti3;
+						  Ti7 = Ti5 - Ti6;
+						  Tji = Ti6 + Ti5;
+						  Tjk = Ti2 - ThR;
+						  Ti3 = ThR + Ti2;
+						  ci[WS(rs, 3)] = FMA(KP923879532, Tij, Tic);
+						  cr[WS(rs, 28)] = FNMS(KP923879532, Tij, Tic);
+						  ci[WS(rs, 11)] = FMA(KP923879532, Ti7, Ti4);
+						  cr[WS(rs, 20)] = FNMS(KP923879532, Ti7, Ti4);
+						  ci[WS(rs, 59)] = FMA(KP923879532, Tji, Tjh);
+						  cr[WS(rs, 36)] = FMS(KP923879532, Tji, Tjh);
+						  ci[WS(rs, 43)] = FMA(KP923879532, Tjk, Tjj);
+						  cr[WS(rs, 52)] = FMS(KP923879532, Tjk, Tjj);
+						  cr[WS(rs, 4)] = FMA(KP923879532, Ti3, ThG);
+						  ci[WS(rs, 27)] = FNMS(KP923879532, Ti3, ThG);
+						  Tjc = Tim - Til;
+						  Tin = Til + Tim;
+					     }
+					}
+					ci[WS(rs, 35)] = FMA(KP923879532, Tjc, Tjb);
+					cr[WS(rs, 60)] = FMS(KP923879532, Tjc, Tjb);
+					ci[WS(rs, 51)] = FMA(KP923879532, Tje, Tjd);
+					cr[WS(rs, 44)] = FMS(KP923879532, Tje, Tjd);
+				   }
+			      }
+			 }
+			 {
+			      E Tjy, Tju, Tjt, Tjx;
+			      {
+				   E TjD, TjJ, Tgo, Tf2, Tjp, Tjv, Tha, TgI, Tgg, Tgd, Tgr, Tjw, Tjq, Tfp, Thk;
+				   E Tho, Th7, Th4, Tgv, TgB, Tgl, TfR, TjE, Thd, TjK, TgP, Tgw, Tg8, Thh, Thn;
+				   E Th8, TgX;
+				   {
+					E TgK, TgJ, TgN, TgM, TfW, Th1, Thi, Th0, Th2, Tg7;
+					{
+					     E TgE, TeQ, TjB, Tjn, TgF, TgG, TjC, Tf1, TeV, Tf0;
+					     TgE = TeM - TeP;
+					     TeQ = TeM + TeP;
+					     TjB = Tjm - Tjl;
+					     Tjn = Tjl + Tjm;
+					     TgF = TeR + TeU;
+					     TeV = TeR - TeU;
+					     cr[WS(rs, 12)] = FMA(KP923879532, Tin, Tik);
+					     ci[WS(rs, 19)] = FNMS(KP923879532, Tin, Tik);
+					     Tf0 = TeW + TeZ;
+					     TgG = TeW - TeZ;
+					     TjC = Tf0 - TeV;
+					     Tf1 = TeV + Tf0;
+					     {
+						  E Tfi, Tgp, Tfd, Tfn;
+						  {
+						       E Tf7, Tjo, TgH, Tfc;
+						       TgK = Tf5 - Tf6;
+						       Tf7 = Tf5 + Tf6;
+						       TjD = FMA(KP707106781, TjC, TjB);
+						       TjJ = FNMS(KP707106781, TjC, TjB);
+						       Tgo = FMA(KP707106781, Tf1, TeQ);
+						       Tf2 = FNMS(KP707106781, Tf1, TeQ);
+						       Tjo = TgF - TgG;
+						       TgH = TgF + TgG;
+						       Tfc = Tf8 + Tfb;
+						       TgJ = Tf8 - Tfb;
+						       TgN = Tfg - Tfh;
+						       Tfi = Tfg + Tfh;
+						       Tjp = FMA(KP707106781, Tjo, Tjn);
+						       Tjv = FNMS(KP707106781, Tjo, Tjn);
+						       Tha = FNMS(KP707106781, TgH, TgE);
+						       TgI = FMA(KP707106781, TgH, TgE);
+						       Tgp = FNMS(KP414213562, Tf7, Tfc);
+						       Tfd = FMA(KP414213562, Tfc, Tf7);
+						       Tfn = Tfj + Tfm;
+						       TgM = Tfj - Tfm;
+						  }
+						  {
+						       E TgY, TgZ, Tgq, Tfo;
+						       TfW = TfS + TfV;
+						       TgY = TfS - TfV;
+						       TgZ = Tgf + Tge;
+						       Tgg = Tge - Tgf;
+						       Tgd = Tg9 - Tgc;
+						       Th1 = Tgc + Tg9;
+						       Tgq = FMA(KP414213562, Tfi, Tfn);
+						       Tfo = FNMS(KP414213562, Tfn, Tfi);
+						       Thi = FNMS(KP707106781, TgZ, TgY);
+						       Th0 = FMA(KP707106781, TgZ, TgY);
+						       Tgr = Tgp + Tgq;
+						       Tjw = Tgq - Tgp;
+						       Tjq = Tfd + Tfo;
+						       Tfp = Tfd - Tfo;
+						       Th2 = Tg6 - Tg1;
+						       Tg7 = Tg1 + Tg6;
+						  }
+					     }
+					}
+					{
+					     E TgR, TgV, TgU, TgS, Thc, TgL;
+					     {
+						  E TfM, Tgt, TfH, TfP, Tgu, TfQ;
+						  {
+						       E Tfv, TfG, Thj, Th3;
+						       TgR = Tfr - Tfu;
+						       Tfv = Tfr + Tfu;
+						       TfG = TfA + TfF;
+						       TgV = TfF - TfA;
+						       TgU = TfK - TfL;
+						       TfM = TfK + TfL;
+						       Thj = FNMS(KP707106781, Th2, Th1);
+						       Th3 = FMA(KP707106781, Th2, Th1);
+						       Tgt = FMA(KP707106781, TfG, Tfv);
+						       TfH = FNMS(KP707106781, TfG, Tfv);
+						       Thk = FMA(KP668178637, Thj, Thi);
+						       Tho = FNMS(KP668178637, Thi, Thj);
+						       Th7 = FMA(KP198912367, Th0, Th3);
+						       Th4 = FNMS(KP198912367, Th3, Th0);
+						       TfP = TfN + TfO;
+						       TgS = TfN - TfO;
+						  }
+						  Tgu = FMA(KP707106781, TfP, TfM);
+						  TfQ = FNMS(KP707106781, TfP, TfM);
+						  Thc = FNMS(KP414213562, TgJ, TgK);
+						  TgL = FMA(KP414213562, TgK, TgJ);
+						  Tgv = FNMS(KP198912367, Tgu, Tgt);
+						  TgB = FMA(KP198912367, Tgt, Tgu);
+						  Tgl = FNMS(KP668178637, TfH, TfQ);
+						  TfR = FMA(KP668178637, TfQ, TfH);
+					     }
+					     {
+						  E Thf, TgT, Thb, TgO, Thg, TgW;
+						  Thb = FMA(KP414213562, TgM, TgN);
+						  TgO = FNMS(KP414213562, TgN, TgM);
+						  Thf = FNMS(KP707106781, TgS, TgR);
+						  TgT = FMA(KP707106781, TgS, TgR);
+						  TjE = Thc + Thb;
+						  Thd = Thb - Thc;
+						  TjK = TgL - TgO;
+						  TgP = TgL + TgO;
+						  Thg = FNMS(KP707106781, TgV, TgU);
+						  TgW = FMA(KP707106781, TgV, TgU);
+						  Tgw = FMA(KP707106781, Tg7, TfW);
+						  Tg8 = FNMS(KP707106781, Tg7, TfW);
+						  Thh = FNMS(KP668178637, Thg, Thf);
+						  Thn = FMA(KP668178637, Thf, Thg);
+						  Th8 = FNMS(KP198912367, TgT, TgW);
+						  TgX = FMA(KP198912367, TgW, TgT);
+					     }
+					}
+				   }
+				   {
+					E TjH, Th9, TjL, Tjs, TjA, Thl, TjI, Th5, TjM, Thp;
+					{
+					     E Tgk, Tfq, TgA, Tgs, TjN, Tgy, Tgm, TgD, Tgj, TjO, Tgn, Tgz;
+					     Tgk = FNMS(KP923879532, Tfp, Tf2);
+					     Tfq = FMA(KP923879532, Tfp, Tf2);
+					     TgA = FNMS(KP923879532, Tgr, Tgo);
+					     Tgs = FMA(KP923879532, Tgr, Tgo);
+					     {
+						  E TjF, Tgx, Tgh, TjG, TgC, Tgi;
+						  TjH = FNMS(KP923879532, TjE, TjD);
+						  TjF = FMA(KP923879532, TjE, TjD);
+						  Tgx = FMA(KP707106781, Tgg, Tgd);
+						  Tgh = FNMS(KP707106781, Tgg, Tgd);
+						  TjG = Th8 + Th7;
+						  Th9 = Th7 - Th8;
+						  TjL = FMA(KP923879532, TjK, TjJ);
+						  TjN = FNMS(KP923879532, TjK, TjJ);
+						  Tgy = FNMS(KP198912367, Tgx, Tgw);
+						  TgC = FMA(KP198912367, Tgw, Tgx);
+						  Tgm = FNMS(KP668178637, Tg8, Tgh);
+						  Tgi = FMA(KP668178637, Tgh, Tg8);
+						  ci[WS(rs, 61)] = FMA(KP980785280, TjG, TjF);
+						  cr[WS(rs, 34)] = FMS(KP980785280, TjG, TjF);
+						  TgD = TgB + TgC;
+						  Tjs = TgC - TgB;
+						  TjA = Tgi - TfR;
+						  Tgj = TfR + Tgi;
+						  TjO = Thk - Thh;
+						  Thl = Thh + Thk;
+					     }
+					     cr[WS(rs, 14)] = FMA(KP980785280, TgD, TgA);
+					     ci[WS(rs, 17)] = FNMS(KP980785280, TgD, TgA);
+					     cr[WS(rs, 6)] = FMA(KP831469612, Tgj, Tfq);
+					     ci[WS(rs, 25)] = FNMS(KP831469612, Tgj, Tfq);
+					     ci[WS(rs, 53)] = FMA(KP831469612, TjO, TjN);
+					     cr[WS(rs, 42)] = FMS(KP831469612, TjO, TjN);
+					     Tgn = Tgl + Tgm;
+					     Tjy = Tgl - Tgm;
+					     Tgz = Tgv + Tgy;
+					     Tju = Tgy - Tgv;
+					     ci[WS(rs, 9)] = FNMS(KP831469612, Tgn, Tgk);
+					     cr[WS(rs, 22)] = FMA(KP831469612, Tgn, Tgk);
+					     ci[WS(rs, 1)] = FMA(KP980785280, Tgz, Tgs);
+					     cr[WS(rs, 30)] = FNMS(KP980785280, Tgz, Tgs);
+					     TjI = Th4 - TgX;
+					     Th5 = TgX + Th4;
+					     TjM = Thn + Tho;
+					     Thp = Thn - Tho;
+					}
+					{
+					     E Th6, The, Tjr, Tjz, TgQ, Thm;
+					     Th6 = FNMS(KP923879532, TgP, TgI);
+					     TgQ = FMA(KP923879532, TgP, TgI);
+					     ci[WS(rs, 45)] = FMA(KP980785280, TjI, TjH);
+					     cr[WS(rs, 50)] = FMS(KP980785280, TjI, TjH);
+					     ci[WS(rs, 37)] = FNMS(KP831469612, TjM, TjL);
+					     cr[WS(rs, 58)] = -(FMA(KP831469612, TjM, TjL));
+					     The = FMA(KP923879532, Thd, Tha);
+					     Thm = FNMS(KP923879532, Thd, Tha);
+					     Tjt = FNMS(KP923879532, Tjq, Tjp);
+					     Tjr = FMA(KP923879532, Tjq, Tjp);
+					     cr[WS(rs, 2)] = FMA(KP980785280, Th5, TgQ);
+					     ci[WS(rs, 29)] = FNMS(KP980785280, Th5, TgQ);
+					     cr[WS(rs, 10)] = FMA(KP831469612, Thp, Thm);
+					     ci[WS(rs, 21)] = FNMS(KP831469612, Thp, Thm);
+					     Tjx = FMA(KP923879532, Tjw, Tjv);
+					     Tjz = FNMS(KP923879532, Tjw, Tjv);
+					     ci[WS(rs, 33)] = FMA(KP980785280, Tjs, Tjr);
+					     cr[WS(rs, 62)] = FMS(KP980785280, Tjs, Tjr);
+					     ci[WS(rs, 41)] = FMA(KP831469612, TjA, Tjz);
+					     cr[WS(rs, 54)] = FMS(KP831469612, TjA, Tjz);
+					     ci[WS(rs, 13)] = FMA(KP980785280, Th9, Th6);
+					     cr[WS(rs, 18)] = FNMS(KP980785280, Th9, Th6);
+					     ci[WS(rs, 5)] = FMA(KP831469612, Thl, The);
+					     cr[WS(rs, 26)] = FNMS(KP831469612, Thl, The);
+					}
+				   }
+			      }
+			      {
+				   E Tkq, TdN, Tkw, Td1, TdR, TdX, TdI, Tdl, TeG, TeK, Tet, Teq, TeD, TeJ, Teu;
+				   E Tej, Tez, TkK, TkE, Teb, Te2, TcH, Te0, TcE, TkB, Tkn, TdU, TdY, TdH, TdE;
+				   E TcK, Te1;
+				   {
+					E Te6, Te5, Te9, Te8, Ted, Teh, Teg, Tee, Tdo, TdC, Tdz, Ten, TeE, Tem, Teo;
+					E Tdv, Tex, Te7;
+					{
+					     E TcP, TcS, TcW, TcZ;
+					     Te6 = FNMS(KP707106781, TcO, TcN);
+					     TcP = FMA(KP707106781, TcO, TcN);
+					     ci[WS(rs, 49)] = FMA(KP980785280, Tju, Tjt);
+					     cr[WS(rs, 46)] = FMS(KP980785280, Tju, Tjt);
+					     ci[WS(rs, 57)] = FMA(KP831469612, Tjy, Tjx);
+					     cr[WS(rs, 38)] = FMS(KP831469612, Tjy, Tjx);
+					     TcS = FMA(KP707106781, TcR, TcQ);
+					     Te5 = FNMS(KP707106781, TcR, TcQ);
+					     Te9 = FNMS(KP707106781, TcV, TcU);
+					     TcW = FMA(KP707106781, TcV, TcU);
+					     TcZ = FMA(KP707106781, TcY, TcX);
+					     Te8 = FNMS(KP707106781, TcY, TcX);
+					     {
+						  E Tdg, TdP, Tdd, Tdj;
+						  {
+						       E Td5, TdM, TcT, TdL, Td0, Tdc;
+						       Ted = FNMS(KP707106781, Td4, Td3);
+						       Td5 = FMA(KP707106781, Td4, Td3);
+						       TdM = FNMS(KP198912367, TcP, TcS);
+						       TcT = FMA(KP198912367, TcS, TcP);
+						       TdL = FMA(KP198912367, TcW, TcZ);
+						       Td0 = FNMS(KP198912367, TcZ, TcW);
+						       Tdc = Td8 + Tdb;
+						       Teh = Td8 - Tdb;
+						       Teg = FNMS(KP707106781, Tdf, Tde);
+						       Tdg = FMA(KP707106781, Tdf, Tde);
+						       Tkq = TdM + TdL;
+						       TdN = TdL - TdM;
+						       Tkw = TcT - Td0;
+						       Td1 = TcT + Td0;
+						       TdP = FNMS(KP923879532, Tdc, Td5);
+						       Tdd = FMA(KP923879532, Tdc, Td5);
+						       Tdj = Tdh + Tdi;
+						       Tee = Tdi - Tdh;
+						  }
+						  {
+						       E Tek, Tel, TdQ, Tdk;
+						       Tdo = FMA(KP707106781, Tdn, Tdm);
+						       Tek = FNMS(KP707106781, Tdn, Tdm);
+						       Tel = TdB - TdA;
+						       TdC = TdA + TdB;
+						       Tdz = FMA(KP707106781, Tdy, Tdx);
+						       Ten = FNMS(KP707106781, Tdy, Tdx);
+						       TdQ = FNMS(KP923879532, Tdj, Tdg);
+						       Tdk = FMA(KP923879532, Tdj, Tdg);
+						       TeE = FMA(KP923879532, Tel, Tek);
+						       Tem = FNMS(KP923879532, Tel, Tek);
+						       TdR = FNMS(KP820678790, TdQ, TdP);
+						       TdX = FMA(KP820678790, TdP, TdQ);
+						       TdI = FNMS(KP098491403, Tdd, Tdk);
+						       Tdl = FMA(KP098491403, Tdk, Tdd);
+						       Teo = Tdu - Tdr;
+						       Tdv = Tdr + Tdu;
+						  }
+					     }
+					}
+					{
+					     E TeB, Tef, TeF, Tep, TeC, Tei;
+					     TeF = FNMS(KP923879532, Teo, Ten);
+					     Tep = FMA(KP923879532, Teo, Ten);
+					     TeB = FMA(KP923879532, Tee, Ted);
+					     Tef = FNMS(KP923879532, Tee, Ted);
+					     TeG = FMA(KP303346683, TeF, TeE);
+					     TeK = FNMS(KP303346683, TeE, TeF);
+					     Tet = FMA(KP534511135, Tem, Tep);
+					     Teq = FNMS(KP534511135, Tep, Tem);
+					     TeC = FMA(KP923879532, Teh, Teg);
+					     Tei = FNMS(KP923879532, Teh, Teg);
+					     Tex = FNMS(KP668178637, Te5, Te6);
+					     Te7 = FMA(KP668178637, Te6, Te5);
+					     TeD = FNMS(KP303346683, TeC, TeB);
+					     TeJ = FMA(KP303346683, TeB, TeC);
+					     Teu = FNMS(KP534511135, Tef, Tei);
+					     Tej = FMA(KP534511135, Tei, Tef);
+					}
+					{
+					     E TdS, Tdw, Tey, Tea, TdT, TdD;
+					     Tey = FMA(KP668178637, Te8, Te9);
+					     Tea = FNMS(KP668178637, Te9, Te8);
+					     TdS = FNMS(KP923879532, Tdv, Tdo);
+					     Tdw = FMA(KP923879532, Tdv, Tdo);
+					     Tez = Tex + Tey;
+					     TkK = Tey - Tex;
+					     TkE = Te7 + Tea;
+					     Teb = Te7 - Tea;
+					     Te2 = FNMS(KP414213562, TcF, TcG);
+					     TcH = FMA(KP414213562, TcG, TcF);
+					     TdT = FNMS(KP923879532, TdC, Tdz);
+					     TdD = FMA(KP923879532, TdC, Tdz);
+					     Te0 = FNMS(KP707106781, TcD, TcA);
+					     TcE = FMA(KP707106781, TcD, TcA);
+					     TkB = FNMS(KP707106781, Tkm, Tkl);
+					     Tkn = FMA(KP707106781, Tkm, Tkl);
+					     TdU = FMA(KP820678790, TdT, TdS);
+					     TdY = FNMS(KP820678790, TdS, TdT);
+					     TdH = FMA(KP098491403, Tdw, TdD);
+					     TdE = FNMS(KP098491403, TdD, Tdw);
+					     TcK = FNMS(KP414213562, TcJ, TcI);
+					     Te1 = FMA(KP414213562, TcI, TcJ);
+					}
+				   }
+				   {
+					E Tkv, Tkp, Tew, Te4, TkH, TeL, TkL, Ter, Tks, TkA, TkI, TeH, TkM, Tev;
+					{
+					     E TdG, TdO, TkF, TkG, TkN, TkO, TdJ, TdV;
+					     {
+						  E TdF, TkJ, TkD, Td2, TdW, TdZ, TdK;
+						  Tku = TdE - Tdl;
+						  TdF = Tdl + TdE;
+						  {
+						       E TkC, TcL, Tko, Te3, TcM;
+						       TkC = TcH - TcK;
+						       TcL = TcH + TcK;
+						       Tko = Te2 + Te1;
+						       Te3 = Te1 - Te2;
+						       TkJ = FNMS(KP923879532, TkC, TkB);
+						       TkD = FMA(KP923879532, TkC, TkB);
+						       TdK = FNMS(KP923879532, TcL, TcE);
+						       TcM = FMA(KP923879532, TcL, TcE);
+						       Tkv = FNMS(KP923879532, Tko, Tkn);
+						       Tkp = FMA(KP923879532, Tko, Tkn);
+						       Tew = FMA(KP923879532, Te3, Te0);
+						       Te4 = FNMS(KP923879532, Te3, Te0);
+						       Td2 = FMA(KP980785280, Td1, TcM);
+						       TdG = FNMS(KP980785280, Td1, TcM);
+						  }
+						  TdO = FMA(KP980785280, TdN, TdK);
+						  TdW = FNMS(KP980785280, TdN, TdK);
+						  TdZ = TdX - TdY;
+						  Tky = TdX + TdY;
+						  TkH = FNMS(KP831469612, TkE, TkD);
+						  TkF = FMA(KP831469612, TkE, TkD);
+						  cr[WS(rs, 1)] = FMA(KP995184726, TdF, Td2);
+						  ci[WS(rs, 30)] = FNMS(KP995184726, TdF, Td2);
+						  cr[WS(rs, 9)] = FMA(KP773010453, TdZ, TdW);
+						  ci[WS(rs, 22)] = FNMS(KP773010453, TdZ, TdW);
+						  TkG = TeJ + TeK;
+						  TeL = TeJ - TeK;
+						  TkL = FMA(KP831469612, TkK, TkJ);
+						  TkN = FNMS(KP831469612, TkK, TkJ);
+						  TkO = Teq - Tej;
+						  Ter = Tej + Teq;
+					     }
+					     ci[WS(rs, 34)] = FNMS(KP956940335, TkG, TkF);
+					     cr[WS(rs, 61)] = -(FMA(KP956940335, TkG, TkF));
+					     ci[WS(rs, 42)] = FMA(KP881921264, TkO, TkN);
+					     cr[WS(rs, 53)] = FMS(KP881921264, TkO, TkN);
+					     TdJ = TdH - TdI;
+					     Tks = TdI + TdH;
+					     TdV = TdR + TdU;
+					     TkA = TdU - TdR;
+					     ci[WS(rs, 14)] = FMA(KP995184726, TdJ, TdG);
+					     cr[WS(rs, 17)] = FNMS(KP995184726, TdJ, TdG);
+					     ci[WS(rs, 6)] = FMA(KP773010453, TdV, TdO);
+					     cr[WS(rs, 25)] = FNMS(KP773010453, TdV, TdO);
+					     TkI = TeG - TeD;
+					     TeH = TeD + TeG;
+					     TkM = Teu + Tet;
+					     Tev = Tet - Teu;
+					}
+					{
+					     E Tes, TeA, Tkr, Tkz, Tec, TeI;
+					     Tes = FNMS(KP831469612, Teb, Te4);
+					     Tec = FMA(KP831469612, Teb, Te4);
+					     ci[WS(rs, 50)] = FMA(KP956940335, TkI, TkH);
+					     cr[WS(rs, 45)] = FMS(KP956940335, TkI, TkH);
+					     ci[WS(rs, 58)] = FMA(KP881921264, TkM, TkL);
+					     cr[WS(rs, 37)] = FMS(KP881921264, TkM, TkL);
+					     TeA = FMA(KP831469612, Tez, Tew);
+					     TeI = FNMS(KP831469612, Tez, Tew);
+					     Tkt = FNMS(KP980785280, Tkq, Tkp);
+					     Tkr = FMA(KP980785280, Tkq, Tkp);
+					     cr[WS(rs, 5)] = FMA(KP881921264, Ter, Tec);
+					     ci[WS(rs, 26)] = FNMS(KP881921264, Ter, Tec);
+					     cr[WS(rs, 13)] = FMA(KP956940335, TeL, TeI);
+					     ci[WS(rs, 18)] = FNMS(KP956940335, TeL, TeI);
+					     Tkx = FMA(KP980785280, Tkw, Tkv);
+					     Tkz = FNMS(KP980785280, Tkw, Tkv);
+					     ci[WS(rs, 62)] = FMA(KP995184726, Tks, Tkr);
+					     cr[WS(rs, 33)] = FMS(KP995184726, Tks, Tkr);
+					     ci[WS(rs, 54)] = FMA(KP773010453, TkA, Tkz);
+					     cr[WS(rs, 41)] = FMS(KP773010453, TkA, Tkz);
+					     ci[WS(rs, 10)] = FMA(KP881921264, Tev, Tes);
+					     cr[WS(rs, 21)] = FNMS(KP881921264, Tev, Tes);
+					     ci[WS(rs, 2)] = FMA(KP956940335, TeH, TeA);
+					     cr[WS(rs, 29)] = FNMS(KP956940335, TeH, TeA);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       ci[WS(rs, 46)] = FMA(KP995184726, Tku, Tkt);
+	       cr[WS(rs, 49)] = FMS(KP995184726, Tku, Tkt);
+	       ci[WS(rs, 38)] = FNMS(KP773010453, Tky, Tkx);
+	       cr[WS(rs, 57)] = -(FMA(KP773010453, Tky, Tkx));
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 64, "hf_64", twinstr, &GENUS, {520, 126, 518, 0} };
+
+void X(codelet_hf_64) (planner *p) {
+     X(khc2hc_register) (p, hf_64, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 64 -dit -name hf_64 -include hf.h */
+
+/*
+ * This function contains 1038 FP additions, 500 FP multiplications,
+ * (or, 808 additions, 270 multiplications, 230 fused multiply/add),
+ * 176 stack variables, 15 constants, and 256 memory accesses
+ */
+#include "hf.h"
+
+static void hf_64(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 126); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) {
+	       E Tj, TcL, ThT, Tin, T6b, Taz, TgT, Thn, TG, Thm, TcO, TgO, T6m, Tim, TaC;
+	       E ThQ, T14, Tfr, T6y, T9O, TaG, Tc0, TcU, TeE, T1r, Tfq, T6J, T9P, TaJ, Tc1;
+	       E TcZ, TeF, T1Q, T2d, Tfu, Tfv, Tfw, Tfx, T6Q, TaM, Tdb, TeI, T71, TaQ, T7a;
+	       E TaN, Td6, TeJ, T77, TaP, T2B, T2Y, Tfz, TfA, TfB, TfC, T7h, TaW, Tdm, TeL;
+	       E T7s, TaU, T7B, TaX, Tdh, TeM, T7y, TaT, T5j, TfR, Tec, TeX, TfY, Tgy, T8D;
+	       E Tbl, T8O, Tbx, T9l, Tbm, TdV, Tf0, T9i, Tbw, T3M, TfL, TdL, TeT, TfI, Tgt;
+	       E T7K, Tbd, T7V, Tb3, T8s, Tbe, Tdu, TeQ, T8p, Tb2, T4x, TfJ, TdE, TdM, TfO;
+	       E Tgu, T87, T8u, T8i, T8v, Tba, Tbh, Tdz, TdN, Tb7, Tbg, T64, TfZ, Te5, Ted;
+	       E TfU, Tgz, T90, T9n, T9b, T9o, Tbt, TbA, Te0, Tee, Tbq, Tbz;
+	       {
+		    E T1, TgR, T6, TgQ, Tc, T68, Th, T69;
+		    T1 = cr[0];
+		    TgR = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 32)];
+			 T5 = ci[WS(rs, 32)];
+			 T2 = W[62];
+			 T4 = W[63];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TgQ = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 16)];
+			 Tb = ci[WS(rs, 16)];
+			 T8 = W[30];
+			 Ta = W[31];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 T68 = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 48)];
+			 Tg = ci[WS(rs, 48)];
+			 Td = W[94];
+			 Tf = W[95];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 T69 = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E T7, Ti, ThR, ThS;
+			 T7 = T1 + T6;
+			 Ti = Tc + Th;
+			 Tj = T7 + Ti;
+			 TcL = T7 - Ti;
+			 ThR = Tc - Th;
+			 ThS = TgR - TgQ;
+			 ThT = ThR + ThS;
+			 Tin = ThS - ThR;
+		    }
+		    {
+			 E T67, T6a, TgP, TgS;
+			 T67 = T1 - T6;
+			 T6a = T68 - T69;
+			 T6b = T67 - T6a;
+			 Taz = T67 + T6a;
+			 TgP = T68 + T69;
+			 TgS = TgQ + TgR;
+			 TgT = TgP + TgS;
+			 Thn = TgS - TgP;
+		    }
+	       }
+	       {
+		    E To, T6d, Tt, T6e, T6c, T6f, Tz, T6i, TE, T6j, T6h, T6k;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = cr[WS(rs, 8)];
+			 Tn = ci[WS(rs, 8)];
+			 Tk = W[14];
+			 Tm = W[15];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 T6d = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = cr[WS(rs, 40)];
+			 Ts = ci[WS(rs, 40)];
+			 Tp = W[78];
+			 Tr = W[79];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 T6e = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    T6c = To - Tt;
+		    T6f = T6d - T6e;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 56)];
+			 Ty = ci[WS(rs, 56)];
+			 Tv = W[110];
+			 Tx = W[111];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T6i = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 24)];
+			 TD = ci[WS(rs, 24)];
+			 TA = W[46];
+			 TC = W[47];
+			 TE = FMA(TA, TB, TC * TD);
+			 T6j = FNMS(TC, TB, TA * TD);
+		    }
+		    T6h = Tz - TE;
+		    T6k = T6i - T6j;
+		    {
+			 E Tu, TF, TcM, TcN;
+			 Tu = To + Tt;
+			 TF = Tz + TE;
+			 TG = Tu + TF;
+			 Thm = Tu - TF;
+			 TcM = T6i + T6j;
+			 TcN = T6d + T6e;
+			 TcO = TcM - TcN;
+			 TgO = TcN + TcM;
+		    }
+		    {
+			 E T6g, T6l, TaA, TaB;
+			 T6g = T6c - T6f;
+			 T6l = T6h + T6k;
+			 T6m = KP707106781 * (T6g + T6l);
+			 Tim = KP707106781 * (T6l - T6g);
+			 TaA = T6c + T6f;
+			 TaB = T6h - T6k;
+			 TaC = KP707106781 * (TaA + TaB);
+			 ThQ = KP707106781 * (TaA - TaB);
+		    }
+	       }
+	       {
+		    E TS, TcR, T6o, T6v, T13, TcS, T6r, T6w, T6s, T6x;
+		    {
+			 E TM, T6t, TR, T6u;
+			 {
+			      E TJ, TL, TI, TK;
+			      TJ = cr[WS(rs, 4)];
+			      TL = ci[WS(rs, 4)];
+			      TI = W[6];
+			      TK = W[7];
+			      TM = FMA(TI, TJ, TK * TL);
+			      T6t = FNMS(TK, TJ, TI * TL);
+			 }
+			 {
+			      E TO, TQ, TN, TP;
+			      TO = cr[WS(rs, 36)];
+			      TQ = ci[WS(rs, 36)];
+			      TN = W[70];
+			      TP = W[71];
+			      TR = FMA(TN, TO, TP * TQ);
+			      T6u = FNMS(TP, TO, TN * TQ);
+			 }
+			 TS = TM + TR;
+			 TcR = T6t + T6u;
+			 T6o = TM - TR;
+			 T6v = T6t - T6u;
+		    }
+		    {
+			 E TX, T6p, T12, T6q;
+			 {
+			      E TU, TW, TT, TV;
+			      TU = cr[WS(rs, 20)];
+			      TW = ci[WS(rs, 20)];
+			      TT = W[38];
+			      TV = W[39];
+			      TX = FMA(TT, TU, TV * TW);
+			      T6p = FNMS(TV, TU, TT * TW);
+			 }
+			 {
+			      E TZ, T11, TY, T10;
+			      TZ = cr[WS(rs, 52)];
+			      T11 = ci[WS(rs, 52)];
+			      TY = W[102];
+			      T10 = W[103];
+			      T12 = FMA(TY, TZ, T10 * T11);
+			      T6q = FNMS(T10, TZ, TY * T11);
+			 }
+			 T13 = TX + T12;
+			 TcS = T6p + T6q;
+			 T6r = T6p - T6q;
+			 T6w = TX - T12;
+		    }
+		    T14 = TS + T13;
+		    Tfr = TcR + TcS;
+		    T6s = T6o - T6r;
+		    T6x = T6v + T6w;
+		    T6y = FNMS(KP382683432, T6x, KP923879532 * T6s);
+		    T9O = FMA(KP923879532, T6x, KP382683432 * T6s);
+		    {
+			 E TaE, TaF, TcQ, TcT;
+			 TaE = T6v - T6w;
+			 TaF = T6o + T6r;
+			 TaG = FMA(KP382683432, TaE, KP923879532 * TaF);
+			 Tc0 = FNMS(KP923879532, TaE, KP382683432 * TaF);
+			 TcQ = TS - T13;
+			 TcT = TcR - TcS;
+			 TcU = TcQ + TcT;
+			 TeE = TcQ - TcT;
+		    }
+	       }
+	       {
+		    E T1f, TcW, T6B, T6E, T1q, TcX, T6C, T6H, T6D, T6I;
+		    {
+			 E T19, T6z, T1e, T6A;
+			 {
+			      E T16, T18, T15, T17;
+			      T16 = cr[WS(rs, 60)];
+			      T18 = ci[WS(rs, 60)];
+			      T15 = W[118];
+			      T17 = W[119];
+			      T19 = FMA(T15, T16, T17 * T18);
+			      T6z = FNMS(T17, T16, T15 * T18);
+			 }
+			 {
+			      E T1b, T1d, T1a, T1c;
+			      T1b = cr[WS(rs, 28)];
+			      T1d = ci[WS(rs, 28)];
+			      T1a = W[54];
+			      T1c = W[55];
+			      T1e = FMA(T1a, T1b, T1c * T1d);
+			      T6A = FNMS(T1c, T1b, T1a * T1d);
+			 }
+			 T1f = T19 + T1e;
+			 TcW = T6z + T6A;
+			 T6B = T6z - T6A;
+			 T6E = T19 - T1e;
+		    }
+		    {
+			 E T1k, T6F, T1p, T6G;
+			 {
+			      E T1h, T1j, T1g, T1i;
+			      T1h = cr[WS(rs, 12)];
+			      T1j = ci[WS(rs, 12)];
+			      T1g = W[22];
+			      T1i = W[23];
+			      T1k = FMA(T1g, T1h, T1i * T1j);
+			      T6F = FNMS(T1i, T1h, T1g * T1j);
+			 }
+			 {
+			      E T1m, T1o, T1l, T1n;
+			      T1m = cr[WS(rs, 44)];
+			      T1o = ci[WS(rs, 44)];
+			      T1l = W[86];
+			      T1n = W[87];
+			      T1p = FMA(T1l, T1m, T1n * T1o);
+			      T6G = FNMS(T1n, T1m, T1l * T1o);
+			 }
+			 T1q = T1k + T1p;
+			 TcX = T6F + T6G;
+			 T6C = T1k - T1p;
+			 T6H = T6F - T6G;
+		    }
+		    T1r = T1f + T1q;
+		    Tfq = TcW + TcX;
+		    T6D = T6B + T6C;
+		    T6I = T6E - T6H;
+		    T6J = FMA(KP382683432, T6D, KP923879532 * T6I);
+		    T9P = FNMS(KP923879532, T6D, KP382683432 * T6I);
+		    {
+			 E TaH, TaI, TcV, TcY;
+			 TaH = T6E + T6H;
+			 TaI = T6B - T6C;
+			 TaJ = FNMS(KP382683432, TaI, KP923879532 * TaH);
+			 Tc1 = FMA(KP923879532, TaI, KP382683432 * TaH);
+			 TcV = T1f - T1q;
+			 TcY = TcW - TcX;
+			 TcZ = TcV - TcY;
+			 TeF = TcV + TcY;
+		    }
+	       }
+	       {
+		    E T1y, T73, T1D, T74, T1E, Td7, T1J, T6N, T1O, T6O, T1P, Td8, T21, Td4, T6R;
+		    E T6U, T2c, Td3, T6W, T6Z;
+		    {
+			 E T1v, T1x, T1u, T1w;
+			 T1v = cr[WS(rs, 2)];
+			 T1x = ci[WS(rs, 2)];
+			 T1u = W[2];
+			 T1w = W[3];
+			 T1y = FMA(T1u, T1v, T1w * T1x);
+			 T73 = FNMS(T1w, T1v, T1u * T1x);
+		    }
+		    {
+			 E T1A, T1C, T1z, T1B;
+			 T1A = cr[WS(rs, 34)];
+			 T1C = ci[WS(rs, 34)];
+			 T1z = W[66];
+			 T1B = W[67];
+			 T1D = FMA(T1z, T1A, T1B * T1C);
+			 T74 = FNMS(T1B, T1A, T1z * T1C);
+		    }
+		    T1E = T1y + T1D;
+		    Td7 = T73 + T74;
+		    {
+			 E T1G, T1I, T1F, T1H;
+			 T1G = cr[WS(rs, 18)];
+			 T1I = ci[WS(rs, 18)];
+			 T1F = W[34];
+			 T1H = W[35];
+			 T1J = FMA(T1F, T1G, T1H * T1I);
+			 T6N = FNMS(T1H, T1G, T1F * T1I);
+		    }
+		    {
+			 E T1L, T1N, T1K, T1M;
+			 T1L = cr[WS(rs, 50)];
+			 T1N = ci[WS(rs, 50)];
+			 T1K = W[98];
+			 T1M = W[99];
+			 T1O = FMA(T1K, T1L, T1M * T1N);
+			 T6O = FNMS(T1M, T1L, T1K * T1N);
+		    }
+		    T1P = T1J + T1O;
+		    Td8 = T6N + T6O;
+		    {
+			 E T1V, T6S, T20, T6T;
+			 {
+			      E T1S, T1U, T1R, T1T;
+			      T1S = cr[WS(rs, 10)];
+			      T1U = ci[WS(rs, 10)];
+			      T1R = W[18];
+			      T1T = W[19];
+			      T1V = FMA(T1R, T1S, T1T * T1U);
+			      T6S = FNMS(T1T, T1S, T1R * T1U);
+			 }
+			 {
+			      E T1X, T1Z, T1W, T1Y;
+			      T1X = cr[WS(rs, 42)];
+			      T1Z = ci[WS(rs, 42)];
+			      T1W = W[82];
+			      T1Y = W[83];
+			      T20 = FMA(T1W, T1X, T1Y * T1Z);
+			      T6T = FNMS(T1Y, T1X, T1W * T1Z);
+			 }
+			 T21 = T1V + T20;
+			 Td4 = T6S + T6T;
+			 T6R = T1V - T20;
+			 T6U = T6S - T6T;
+		    }
+		    {
+			 E T26, T6X, T2b, T6Y;
+			 {
+			      E T23, T25, T22, T24;
+			      T23 = cr[WS(rs, 58)];
+			      T25 = ci[WS(rs, 58)];
+			      T22 = W[114];
+			      T24 = W[115];
+			      T26 = FMA(T22, T23, T24 * T25);
+			      T6X = FNMS(T24, T23, T22 * T25);
+			 }
+			 {
+			      E T28, T2a, T27, T29;
+			      T28 = cr[WS(rs, 26)];
+			      T2a = ci[WS(rs, 26)];
+			      T27 = W[50];
+			      T29 = W[51];
+			      T2b = FMA(T27, T28, T29 * T2a);
+			      T6Y = FNMS(T29, T28, T27 * T2a);
+			 }
+			 T2c = T26 + T2b;
+			 Td3 = T6X + T6Y;
+			 T6W = T26 - T2b;
+			 T6Z = T6X - T6Y;
+		    }
+		    T1Q = T1E + T1P;
+		    T2d = T21 + T2c;
+		    Tfu = T1Q - T2d;
+		    Tfv = Td7 + Td8;
+		    Tfw = Td4 + Td3;
+		    Tfx = Tfv - Tfw;
+		    {
+			 E T6M, T6P, Td9, Tda;
+			 T6M = T1y - T1D;
+			 T6P = T6N - T6O;
+			 T6Q = T6M - T6P;
+			 TaM = T6M + T6P;
+			 Td9 = Td7 - Td8;
+			 Tda = T21 - T2c;
+			 Tdb = Td9 - Tda;
+			 TeI = Td9 + Tda;
+		    }
+		    {
+			 E T6V, T70, T78, T79;
+			 T6V = T6R - T6U;
+			 T70 = T6W + T6Z;
+			 T71 = KP707106781 * (T6V + T70);
+			 TaQ = KP707106781 * (T70 - T6V);
+			 T78 = T6R + T6U;
+			 T79 = T6Z - T6W;
+			 T7a = KP707106781 * (T78 + T79);
+			 TaN = KP707106781 * (T78 - T79);
+		    }
+		    {
+			 E Td2, Td5, T75, T76;
+			 Td2 = T1E - T1P;
+			 Td5 = Td3 - Td4;
+			 Td6 = Td2 - Td5;
+			 TeJ = Td2 + Td5;
+			 T75 = T73 - T74;
+			 T76 = T1J - T1O;
+			 T77 = T75 + T76;
+			 TaP = T75 - T76;
+		    }
+	       }
+	       {
+		    E T2j, T7u, T2o, T7v, T2p, Tdd, T2u, T7e, T2z, T7f, T2A, Tde, T2M, Tdk, T7i;
+		    E T7l, T2X, Tdj, T7n, T7q;
+		    {
+			 E T2g, T2i, T2f, T2h;
+			 T2g = cr[WS(rs, 62)];
+			 T2i = ci[WS(rs, 62)];
+			 T2f = W[122];
+			 T2h = W[123];
+			 T2j = FMA(T2f, T2g, T2h * T2i);
+			 T7u = FNMS(T2h, T2g, T2f * T2i);
+		    }
+		    {
+			 E T2l, T2n, T2k, T2m;
+			 T2l = cr[WS(rs, 30)];
+			 T2n = ci[WS(rs, 30)];
+			 T2k = W[58];
+			 T2m = W[59];
+			 T2o = FMA(T2k, T2l, T2m * T2n);
+			 T7v = FNMS(T2m, T2l, T2k * T2n);
+		    }
+		    T2p = T2j + T2o;
+		    Tdd = T7u + T7v;
+		    {
+			 E T2r, T2t, T2q, T2s;
+			 T2r = cr[WS(rs, 14)];
+			 T2t = ci[WS(rs, 14)];
+			 T2q = W[26];
+			 T2s = W[27];
+			 T2u = FMA(T2q, T2r, T2s * T2t);
+			 T7e = FNMS(T2s, T2r, T2q * T2t);
+		    }
+		    {
+			 E T2w, T2y, T2v, T2x;
+			 T2w = cr[WS(rs, 46)];
+			 T2y = ci[WS(rs, 46)];
+			 T2v = W[90];
+			 T2x = W[91];
+			 T2z = FMA(T2v, T2w, T2x * T2y);
+			 T7f = FNMS(T2x, T2w, T2v * T2y);
+		    }
+		    T2A = T2u + T2z;
+		    Tde = T7e + T7f;
+		    {
+			 E T2G, T7j, T2L, T7k;
+			 {
+			      E T2D, T2F, T2C, T2E;
+			      T2D = cr[WS(rs, 6)];
+			      T2F = ci[WS(rs, 6)];
+			      T2C = W[10];
+			      T2E = W[11];
+			      T2G = FMA(T2C, T2D, T2E * T2F);
+			      T7j = FNMS(T2E, T2D, T2C * T2F);
+			 }
+			 {
+			      E T2I, T2K, T2H, T2J;
+			      T2I = cr[WS(rs, 38)];
+			      T2K = ci[WS(rs, 38)];
+			      T2H = W[74];
+			      T2J = W[75];
+			      T2L = FMA(T2H, T2I, T2J * T2K);
+			      T7k = FNMS(T2J, T2I, T2H * T2K);
+			 }
+			 T2M = T2G + T2L;
+			 Tdk = T7j + T7k;
+			 T7i = T2G - T2L;
+			 T7l = T7j - T7k;
+		    }
+		    {
+			 E T2R, T7o, T2W, T7p;
+			 {
+			      E T2O, T2Q, T2N, T2P;
+			      T2O = cr[WS(rs, 54)];
+			      T2Q = ci[WS(rs, 54)];
+			      T2N = W[106];
+			      T2P = W[107];
+			      T2R = FMA(T2N, T2O, T2P * T2Q);
+			      T7o = FNMS(T2P, T2O, T2N * T2Q);
+			 }
+			 {
+			      E T2T, T2V, T2S, T2U;
+			      T2T = cr[WS(rs, 22)];
+			      T2V = ci[WS(rs, 22)];
+			      T2S = W[42];
+			      T2U = W[43];
+			      T2W = FMA(T2S, T2T, T2U * T2V);
+			      T7p = FNMS(T2U, T2T, T2S * T2V);
+			 }
+			 T2X = T2R + T2W;
+			 Tdj = T7o + T7p;
+			 T7n = T2R - T2W;
+			 T7q = T7o - T7p;
+		    }
+		    T2B = T2p + T2A;
+		    T2Y = T2M + T2X;
+		    Tfz = T2B - T2Y;
+		    TfA = Tdd + Tde;
+		    TfB = Tdk + Tdj;
+		    TfC = TfA - TfB;
+		    {
+			 E T7d, T7g, Tdi, Tdl;
+			 T7d = T2j - T2o;
+			 T7g = T7e - T7f;
+			 T7h = T7d - T7g;
+			 TaW = T7d + T7g;
+			 Tdi = T2p - T2A;
+			 Tdl = Tdj - Tdk;
+			 Tdm = Tdi - Tdl;
+			 TeL = Tdi + Tdl;
+		    }
+		    {
+			 E T7m, T7r, T7z, T7A;
+			 T7m = T7i - T7l;
+			 T7r = T7n + T7q;
+			 T7s = KP707106781 * (T7m + T7r);
+			 TaU = KP707106781 * (T7r - T7m);
+			 T7z = T7i + T7l;
+			 T7A = T7q - T7n;
+			 T7B = KP707106781 * (T7z + T7A);
+			 TaX = KP707106781 * (T7z - T7A);
+		    }
+		    {
+			 E Tdf, Tdg, T7w, T7x;
+			 Tdf = Tdd - Tde;
+			 Tdg = T2M - T2X;
+			 Tdh = Tdf - Tdg;
+			 TeM = Tdf + Tdg;
+			 T7w = T7u - T7v;
+			 T7x = T2u - T2z;
+			 T7y = T7w + T7x;
+			 TaT = T7w - T7x;
+		    }
+	       }
+	       {
+		    E T4D, T9e, T4I, T9f, T4J, TdR, T4O, T8A, T4T, T8B, T4U, TdS, T56, Tea, T8E;
+		    E T8H, T5h, Te9, T8J, T8M;
+		    {
+			 E T4A, T4C, T4z, T4B;
+			 T4A = cr[WS(rs, 63)];
+			 T4C = ci[WS(rs, 63)];
+			 T4z = W[124];
+			 T4B = W[125];
+			 T4D = FMA(T4z, T4A, T4B * T4C);
+			 T9e = FNMS(T4B, T4A, T4z * T4C);
+		    }
+		    {
+			 E T4F, T4H, T4E, T4G;
+			 T4F = cr[WS(rs, 31)];
+			 T4H = ci[WS(rs, 31)];
+			 T4E = W[60];
+			 T4G = W[61];
+			 T4I = FMA(T4E, T4F, T4G * T4H);
+			 T9f = FNMS(T4G, T4F, T4E * T4H);
+		    }
+		    T4J = T4D + T4I;
+		    TdR = T9e + T9f;
+		    {
+			 E T4L, T4N, T4K, T4M;
+			 T4L = cr[WS(rs, 15)];
+			 T4N = ci[WS(rs, 15)];
+			 T4K = W[28];
+			 T4M = W[29];
+			 T4O = FMA(T4K, T4L, T4M * T4N);
+			 T8A = FNMS(T4M, T4L, T4K * T4N);
+		    }
+		    {
+			 E T4Q, T4S, T4P, T4R;
+			 T4Q = cr[WS(rs, 47)];
+			 T4S = ci[WS(rs, 47)];
+			 T4P = W[92];
+			 T4R = W[93];
+			 T4T = FMA(T4P, T4Q, T4R * T4S);
+			 T8B = FNMS(T4R, T4Q, T4P * T4S);
+		    }
+		    T4U = T4O + T4T;
+		    TdS = T8A + T8B;
+		    {
+			 E T50, T8F, T55, T8G;
+			 {
+			      E T4X, T4Z, T4W, T4Y;
+			      T4X = cr[WS(rs, 7)];
+			      T4Z = ci[WS(rs, 7)];
+			      T4W = W[12];
+			      T4Y = W[13];
+			      T50 = FMA(T4W, T4X, T4Y * T4Z);
+			      T8F = FNMS(T4Y, T4X, T4W * T4Z);
+			 }
+			 {
+			      E T52, T54, T51, T53;
+			      T52 = cr[WS(rs, 39)];
+			      T54 = ci[WS(rs, 39)];
+			      T51 = W[76];
+			      T53 = W[77];
+			      T55 = FMA(T51, T52, T53 * T54);
+			      T8G = FNMS(T53, T52, T51 * T54);
+			 }
+			 T56 = T50 + T55;
+			 Tea = T8F + T8G;
+			 T8E = T50 - T55;
+			 T8H = T8F - T8G;
+		    }
+		    {
+			 E T5b, T8K, T5g, T8L;
+			 {
+			      E T58, T5a, T57, T59;
+			      T58 = cr[WS(rs, 55)];
+			      T5a = ci[WS(rs, 55)];
+			      T57 = W[108];
+			      T59 = W[109];
+			      T5b = FMA(T57, T58, T59 * T5a);
+			      T8K = FNMS(T59, T58, T57 * T5a);
+			 }
+			 {
+			      E T5d, T5f, T5c, T5e;
+			      T5d = cr[WS(rs, 23)];
+			      T5f = ci[WS(rs, 23)];
+			      T5c = W[44];
+			      T5e = W[45];
+			      T5g = FMA(T5c, T5d, T5e * T5f);
+			      T8L = FNMS(T5e, T5d, T5c * T5f);
+			 }
+			 T5h = T5b + T5g;
+			 Te9 = T8K + T8L;
+			 T8J = T5b - T5g;
+			 T8M = T8K - T8L;
+		    }
+		    {
+			 E T4V, T5i, Te8, Teb;
+			 T4V = T4J + T4U;
+			 T5i = T56 + T5h;
+			 T5j = T4V + T5i;
+			 TfR = T4V - T5i;
+			 Te8 = T4J - T4U;
+			 Teb = Te9 - Tea;
+			 Tec = Te8 - Teb;
+			 TeX = Te8 + Teb;
+		    }
+		    {
+			 E TfW, TfX, T8z, T8C;
+			 TfW = TdR + TdS;
+			 TfX = Tea + Te9;
+			 TfY = TfW - TfX;
+			 Tgy = TfW + TfX;
+			 T8z = T4D - T4I;
+			 T8C = T8A - T8B;
+			 T8D = T8z - T8C;
+			 Tbl = T8z + T8C;
+		    }
+		    {
+			 E T8I, T8N, T9j, T9k;
+			 T8I = T8E - T8H;
+			 T8N = T8J + T8M;
+			 T8O = KP707106781 * (T8I + T8N);
+			 Tbx = KP707106781 * (T8N - T8I);
+			 T9j = T8E + T8H;
+			 T9k = T8M - T8J;
+			 T9l = KP707106781 * (T9j + T9k);
+			 Tbm = KP707106781 * (T9j - T9k);
+		    }
+		    {
+			 E TdT, TdU, T9g, T9h;
+			 TdT = TdR - TdS;
+			 TdU = T56 - T5h;
+			 TdV = TdT - TdU;
+			 Tf0 = TdT + TdU;
+			 T9g = T9e - T9f;
+			 T9h = T4O - T4T;
+			 T9i = T9g + T9h;
+			 Tbw = T9g - T9h;
+		    }
+	       }
+	       {
+		    E T36, T7G, T3b, T7H, T3c, TdH, T3h, T8m, T3m, T8n, T3n, TdI, T3z, Tds, T7L;
+		    E T7O, T3K, Tdr, T7S, T7T;
+		    {
+			 E T33, T35, T32, T34;
+			 T33 = cr[WS(rs, 1)];
+			 T35 = ci[WS(rs, 1)];
+			 T32 = W[0];
+			 T34 = W[1];
+			 T36 = FMA(T32, T33, T34 * T35);
+			 T7G = FNMS(T34, T33, T32 * T35);
+		    }
+		    {
+			 E T38, T3a, T37, T39;
+			 T38 = cr[WS(rs, 33)];
+			 T3a = ci[WS(rs, 33)];
+			 T37 = W[64];
+			 T39 = W[65];
+			 T3b = FMA(T37, T38, T39 * T3a);
+			 T7H = FNMS(T39, T38, T37 * T3a);
+		    }
+		    T3c = T36 + T3b;
+		    TdH = T7G + T7H;
+		    {
+			 E T3e, T3g, T3d, T3f;
+			 T3e = cr[WS(rs, 17)];
+			 T3g = ci[WS(rs, 17)];
+			 T3d = W[32];
+			 T3f = W[33];
+			 T3h = FMA(T3d, T3e, T3f * T3g);
+			 T8m = FNMS(T3f, T3e, T3d * T3g);
+		    }
+		    {
+			 E T3j, T3l, T3i, T3k;
+			 T3j = cr[WS(rs, 49)];
+			 T3l = ci[WS(rs, 49)];
+			 T3i = W[96];
+			 T3k = W[97];
+			 T3m = FMA(T3i, T3j, T3k * T3l);
+			 T8n = FNMS(T3k, T3j, T3i * T3l);
+		    }
+		    T3n = T3h + T3m;
+		    TdI = T8m + T8n;
+		    {
+			 E T3t, T7M, T3y, T7N;
+			 {
+			      E T3q, T3s, T3p, T3r;
+			      T3q = cr[WS(rs, 9)];
+			      T3s = ci[WS(rs, 9)];
+			      T3p = W[16];
+			      T3r = W[17];
+			      T3t = FMA(T3p, T3q, T3r * T3s);
+			      T7M = FNMS(T3r, T3q, T3p * T3s);
+			 }
+			 {
+			      E T3v, T3x, T3u, T3w;
+			      T3v = cr[WS(rs, 41)];
+			      T3x = ci[WS(rs, 41)];
+			      T3u = W[80];
+			      T3w = W[81];
+			      T3y = FMA(T3u, T3v, T3w * T3x);
+			      T7N = FNMS(T3w, T3v, T3u * T3x);
+			 }
+			 T3z = T3t + T3y;
+			 Tds = T7M + T7N;
+			 T7L = T3t - T3y;
+			 T7O = T7M - T7N;
+		    }
+		    {
+			 E T3E, T7Q, T3J, T7R;
+			 {
+			      E T3B, T3D, T3A, T3C;
+			      T3B = cr[WS(rs, 57)];
+			      T3D = ci[WS(rs, 57)];
+			      T3A = W[112];
+			      T3C = W[113];
+			      T3E = FMA(T3A, T3B, T3C * T3D);
+			      T7Q = FNMS(T3C, T3B, T3A * T3D);
+			 }
+			 {
+			      E T3G, T3I, T3F, T3H;
+			      T3G = cr[WS(rs, 25)];
+			      T3I = ci[WS(rs, 25)];
+			      T3F = W[48];
+			      T3H = W[49];
+			      T3J = FMA(T3F, T3G, T3H * T3I);
+			      T7R = FNMS(T3H, T3G, T3F * T3I);
+			 }
+			 T3K = T3E + T3J;
+			 Tdr = T7Q + T7R;
+			 T7S = T7Q - T7R;
+			 T7T = T3E - T3J;
+		    }
+		    {
+			 E T3o, T3L, TdJ, TdK;
+			 T3o = T3c + T3n;
+			 T3L = T3z + T3K;
+			 T3M = T3o + T3L;
+			 TfL = T3o - T3L;
+			 TdJ = TdH - TdI;
+			 TdK = T3z - T3K;
+			 TdL = TdJ - TdK;
+			 TeT = TdJ + TdK;
+		    }
+		    {
+			 E TfG, TfH, T7I, T7J;
+			 TfG = TdH + TdI;
+			 TfH = Tds + Tdr;
+			 TfI = TfG - TfH;
+			 Tgt = TfG + TfH;
+			 T7I = T7G - T7H;
+			 T7J = T3h - T3m;
+			 T7K = T7I + T7J;
+			 Tbd = T7I - T7J;
+		    }
+		    {
+			 E T7P, T7U, T8q, T8r;
+			 T7P = T7L + T7O;
+			 T7U = T7S - T7T;
+			 T7V = KP707106781 * (T7P + T7U);
+			 Tb3 = KP707106781 * (T7P - T7U);
+			 T8q = T7L - T7O;
+			 T8r = T7T + T7S;
+			 T8s = KP707106781 * (T8q + T8r);
+			 Tbe = KP707106781 * (T8r - T8q);
+		    }
+		    {
+			 E Tdq, Tdt, T8l, T8o;
+			 Tdq = T3c - T3n;
+			 Tdt = Tdr - Tds;
+			 Tdu = Tdq - Tdt;
+			 TeQ = Tdq + Tdt;
+			 T8l = T36 - T3b;
+			 T8o = T8m - T8n;
+			 T8p = T8l - T8o;
+			 Tb2 = T8l + T8o;
+		    }
+	       }
+	       {
+		    E T3X, Tdw, T7Z, T82, T4v, TdB, T8b, T8g, T48, Tdx, T80, T85, T4k, TdA, T8a;
+		    E T8d;
+		    {
+			 E T3R, T7X, T3W, T7Y;
+			 {
+			      E T3O, T3Q, T3N, T3P;
+			      T3O = cr[WS(rs, 5)];
+			      T3Q = ci[WS(rs, 5)];
+			      T3N = W[8];
+			      T3P = W[9];
+			      T3R = FMA(T3N, T3O, T3P * T3Q);
+			      T7X = FNMS(T3P, T3O, T3N * T3Q);
+			 }
+			 {
+			      E T3T, T3V, T3S, T3U;
+			      T3T = cr[WS(rs, 37)];
+			      T3V = ci[WS(rs, 37)];
+			      T3S = W[72];
+			      T3U = W[73];
+			      T3W = FMA(T3S, T3T, T3U * T3V);
+			      T7Y = FNMS(T3U, T3T, T3S * T3V);
+			 }
+			 T3X = T3R + T3W;
+			 Tdw = T7X + T7Y;
+			 T7Z = T7X - T7Y;
+			 T82 = T3R - T3W;
+		    }
+		    {
+			 E T4p, T8e, T4u, T8f;
+			 {
+			      E T4m, T4o, T4l, T4n;
+			      T4m = cr[WS(rs, 13)];
+			      T4o = ci[WS(rs, 13)];
+			      T4l = W[24];
+			      T4n = W[25];
+			      T4p = FMA(T4l, T4m, T4n * T4o);
+			      T8e = FNMS(T4n, T4m, T4l * T4o);
+			 }
+			 {
+			      E T4r, T4t, T4q, T4s;
+			      T4r = cr[WS(rs, 45)];
+			      T4t = ci[WS(rs, 45)];
+			      T4q = W[88];
+			      T4s = W[89];
+			      T4u = FMA(T4q, T4r, T4s * T4t);
+			      T8f = FNMS(T4s, T4r, T4q * T4t);
+			 }
+			 T4v = T4p + T4u;
+			 TdB = T8e + T8f;
+			 T8b = T4p - T4u;
+			 T8g = T8e - T8f;
+		    }
+		    {
+			 E T42, T83, T47, T84;
+			 {
+			      E T3Z, T41, T3Y, T40;
+			      T3Z = cr[WS(rs, 21)];
+			      T41 = ci[WS(rs, 21)];
+			      T3Y = W[40];
+			      T40 = W[41];
+			      T42 = FMA(T3Y, T3Z, T40 * T41);
+			      T83 = FNMS(T40, T3Z, T3Y * T41);
+			 }
+			 {
+			      E T44, T46, T43, T45;
+			      T44 = cr[WS(rs, 53)];
+			      T46 = ci[WS(rs, 53)];
+			      T43 = W[104];
+			      T45 = W[105];
+			      T47 = FMA(T43, T44, T45 * T46);
+			      T84 = FNMS(T45, T44, T43 * T46);
+			 }
+			 T48 = T42 + T47;
+			 Tdx = T83 + T84;
+			 T80 = T42 - T47;
+			 T85 = T83 - T84;
+		    }
+		    {
+			 E T4e, T88, T4j, T89;
+			 {
+			      E T4b, T4d, T4a, T4c;
+			      T4b = cr[WS(rs, 61)];
+			      T4d = ci[WS(rs, 61)];
+			      T4a = W[120];
+			      T4c = W[121];
+			      T4e = FMA(T4a, T4b, T4c * T4d);
+			      T88 = FNMS(T4c, T4b, T4a * T4d);
+			 }
+			 {
+			      E T4g, T4i, T4f, T4h;
+			      T4g = cr[WS(rs, 29)];
+			      T4i = ci[WS(rs, 29)];
+			      T4f = W[56];
+			      T4h = W[57];
+			      T4j = FMA(T4f, T4g, T4h * T4i);
+			      T89 = FNMS(T4h, T4g, T4f * T4i);
+			 }
+			 T4k = T4e + T4j;
+			 TdA = T88 + T89;
+			 T8a = T88 - T89;
+			 T8d = T4e - T4j;
+		    }
+		    {
+			 E T49, T4w, TdC, TdD;
+			 T49 = T3X + T48;
+			 T4w = T4k + T4v;
+			 T4x = T49 + T4w;
+			 TfJ = T49 - T4w;
+			 TdC = TdA - TdB;
+			 TdD = T4k - T4v;
+			 TdE = TdC - TdD;
+			 TdM = TdD + TdC;
+		    }
+		    {
+			 E TfM, TfN, T81, T86;
+			 TfM = TdA + TdB;
+			 TfN = Tdw + Tdx;
+			 TfO = TfM - TfN;
+			 Tgu = TfN + TfM;
+			 T81 = T7Z + T80;
+			 T86 = T82 - T85;
+			 T87 = FMA(KP923879532, T81, KP382683432 * T86);
+			 T8u = FNMS(KP382683432, T81, KP923879532 * T86);
+		    }
+		    {
+			 E T8c, T8h, Tb8, Tb9;
+			 T8c = T8a + T8b;
+			 T8h = T8d - T8g;
+			 T8i = FNMS(KP382683432, T8h, KP923879532 * T8c);
+			 T8v = FMA(KP382683432, T8c, KP923879532 * T8h);
+			 Tb8 = T8d + T8g;
+			 Tb9 = T8a - T8b;
+			 Tba = FNMS(KP382683432, Tb9, KP923879532 * Tb8);
+			 Tbh = FMA(KP923879532, Tb9, KP382683432 * Tb8);
+		    }
+		    {
+			 E Tdv, Tdy, Tb5, Tb6;
+			 Tdv = T3X - T48;
+			 Tdy = Tdw - Tdx;
+			 Tdz = Tdv + Tdy;
+			 TdN = Tdv - Tdy;
+			 Tb5 = T7Z - T80;
+			 Tb6 = T82 + T85;
+			 Tb7 = FMA(KP382683432, Tb5, KP923879532 * Tb6);
+			 Tbg = FNMS(KP382683432, Tb6, KP923879532 * Tb5);
+		    }
+	       }
+	       {
+		    E T5u, Te2, T8Q, T8X, T62, TdY, T94, T99, T5F, Te3, T8T, T8Y, T5R, TdX, T93;
+		    E T96;
+		    {
+			 E T5o, T8V, T5t, T8W;
+			 {
+			      E T5l, T5n, T5k, T5m;
+			      T5l = cr[WS(rs, 3)];
+			      T5n = ci[WS(rs, 3)];
+			      T5k = W[4];
+			      T5m = W[5];
+			      T5o = FMA(T5k, T5l, T5m * T5n);
+			      T8V = FNMS(T5m, T5l, T5k * T5n);
+			 }
+			 {
+			      E T5q, T5s, T5p, T5r;
+			      T5q = cr[WS(rs, 35)];
+			      T5s = ci[WS(rs, 35)];
+			      T5p = W[68];
+			      T5r = W[69];
+			      T5t = FMA(T5p, T5q, T5r * T5s);
+			      T8W = FNMS(T5r, T5q, T5p * T5s);
+			 }
+			 T5u = T5o + T5t;
+			 Te2 = T8V + T8W;
+			 T8Q = T5o - T5t;
+			 T8X = T8V - T8W;
+		    }
+		    {
+			 E T5W, T97, T61, T98;
+			 {
+			      E T5T, T5V, T5S, T5U;
+			      T5T = cr[WS(rs, 11)];
+			      T5V = ci[WS(rs, 11)];
+			      T5S = W[20];
+			      T5U = W[21];
+			      T5W = FMA(T5S, T5T, T5U * T5V);
+			      T97 = FNMS(T5U, T5T, T5S * T5V);
+			 }
+			 {
+			      E T5Y, T60, T5X, T5Z;
+			      T5Y = cr[WS(rs, 43)];
+			      T60 = ci[WS(rs, 43)];
+			      T5X = W[84];
+			      T5Z = W[85];
+			      T61 = FMA(T5X, T5Y, T5Z * T60);
+			      T98 = FNMS(T5Z, T5Y, T5X * T60);
+			 }
+			 T62 = T5W + T61;
+			 TdY = T97 + T98;
+			 T94 = T5W - T61;
+			 T99 = T97 - T98;
+		    }
+		    {
+			 E T5z, T8R, T5E, T8S;
+			 {
+			      E T5w, T5y, T5v, T5x;
+			      T5w = cr[WS(rs, 19)];
+			      T5y = ci[WS(rs, 19)];
+			      T5v = W[36];
+			      T5x = W[37];
+			      T5z = FMA(T5v, T5w, T5x * T5y);
+			      T8R = FNMS(T5x, T5w, T5v * T5y);
+			 }
+			 {
+			      E T5B, T5D, T5A, T5C;
+			      T5B = cr[WS(rs, 51)];
+			      T5D = ci[WS(rs, 51)];
+			      T5A = W[100];
+			      T5C = W[101];
+			      T5E = FMA(T5A, T5B, T5C * T5D);
+			      T8S = FNMS(T5C, T5B, T5A * T5D);
+			 }
+			 T5F = T5z + T5E;
+			 Te3 = T8R + T8S;
+			 T8T = T8R - T8S;
+			 T8Y = T5z - T5E;
+		    }
+		    {
+			 E T5L, T91, T5Q, T92;
+			 {
+			      E T5I, T5K, T5H, T5J;
+			      T5I = cr[WS(rs, 59)];
+			      T5K = ci[WS(rs, 59)];
+			      T5H = W[116];
+			      T5J = W[117];
+			      T5L = FMA(T5H, T5I, T5J * T5K);
+			      T91 = FNMS(T5J, T5I, T5H * T5K);
+			 }
+			 {
+			      E T5N, T5P, T5M, T5O;
+			      T5N = cr[WS(rs, 27)];
+			      T5P = ci[WS(rs, 27)];
+			      T5M = W[52];
+			      T5O = W[53];
+			      T5Q = FMA(T5M, T5N, T5O * T5P);
+			      T92 = FNMS(T5O, T5N, T5M * T5P);
+			 }
+			 T5R = T5L + T5Q;
+			 TdX = T91 + T92;
+			 T93 = T91 - T92;
+			 T96 = T5L - T5Q;
+		    }
+		    {
+			 E T5G, T63, Te1, Te4;
+			 T5G = T5u + T5F;
+			 T63 = T5R + T62;
+			 T64 = T5G + T63;
+			 TfZ = T5G - T63;
+			 Te1 = T5u - T5F;
+			 Te4 = Te2 - Te3;
+			 Te5 = Te1 - Te4;
+			 Ted = Te1 + Te4;
+		    }
+		    {
+			 E TfS, TfT, T8U, T8Z;
+			 TfS = TdX + TdY;
+			 TfT = Te2 + Te3;
+			 TfU = TfS - TfT;
+			 Tgz = TfT + TfS;
+			 T8U = T8Q - T8T;
+			 T8Z = T8X + T8Y;
+			 T90 = FNMS(KP382683432, T8Z, KP923879532 * T8U);
+			 T9n = FMA(KP923879532, T8Z, KP382683432 * T8U);
+		    }
+		    {
+			 E T95, T9a, Tbr, Tbs;
+			 T95 = T93 + T94;
+			 T9a = T96 - T99;
+			 T9b = FMA(KP382683432, T95, KP923879532 * T9a);
+			 T9o = FNMS(KP382683432, T9a, KP923879532 * T95);
+			 Tbr = T96 + T99;
+			 Tbs = T93 - T94;
+			 Tbt = FNMS(KP382683432, Tbs, KP923879532 * Tbr);
+			 TbA = FMA(KP923879532, Tbs, KP382683432 * Tbr);
+		    }
+		    {
+			 E TdW, TdZ, Tbo, Tbp;
+			 TdW = T5R - T62;
+			 TdZ = TdX - TdY;
+			 Te0 = TdW + TdZ;
+			 Tee = TdZ - TdW;
+			 Tbo = T8X - T8Y;
+			 Tbp = T8Q + T8T;
+			 Tbq = FMA(KP382683432, Tbo, KP923879532 * Tbp);
+			 Tbz = FNMS(KP382683432, Tbp, KP923879532 * Tbo);
+		    }
+	       }
+	       {
+		    E T1t, Tgn, TgK, TgL, TgV, Th1, T30, Th0, T66, TgX, Tgw, TgE, TgB, TgF, Tgq;
+		    E TgM;
+		    {
+			 E TH, T1s, TgI, TgJ;
+			 TH = Tj + TG;
+			 T1s = T14 + T1r;
+			 T1t = TH + T1s;
+			 Tgn = TH - T1s;
+			 TgI = Tgy + Tgz;
+			 TgJ = Tgt + Tgu;
+			 TgK = TgI - TgJ;
+			 TgL = TgJ + TgI;
+		    }
+		    {
+			 E TgN, TgU, T2e, T2Z;
+			 TgN = Tfr + Tfq;
+			 TgU = TgO + TgT;
+			 TgV = TgN + TgU;
+			 Th1 = TgU - TgN;
+			 T2e = T1Q + T2d;
+			 T2Z = T2B + T2Y;
+			 T30 = T2e + T2Z;
+			 Th0 = T2e - T2Z;
+		    }
+		    {
+			 E T4y, T65, Tgs, Tgv;
+			 T4y = T3M + T4x;
+			 T65 = T5j + T64;
+			 T66 = T4y + T65;
+			 TgX = T65 - T4y;
+			 Tgs = T3M - T4x;
+			 Tgv = Tgt - Tgu;
+			 Tgw = Tgs + Tgv;
+			 TgE = Tgs - Tgv;
+		    }
+		    {
+			 E Tgx, TgA, Tgo, Tgp;
+			 Tgx = T5j - T64;
+			 TgA = Tgy - Tgz;
+			 TgB = Tgx - TgA;
+			 TgF = Tgx + TgA;
+			 Tgo = TfA + TfB;
+			 Tgp = Tfv + Tfw;
+			 Tgq = Tgo - Tgp;
+			 TgM = Tgp + Tgo;
+		    }
+		    {
+			 E T31, TgW, TgY, TgH;
+			 T31 = T1t + T30;
+			 ci[WS(rs, 31)] = T31 - T66;
+			 cr[0] = T31 + T66;
+			 TgW = TgM + TgV;
+			 cr[WS(rs, 32)] = TgL - TgW;
+			 ci[WS(rs, 63)] = TgL + TgW;
+			 TgY = TgV - TgM;
+			 cr[WS(rs, 48)] = TgX - TgY;
+			 ci[WS(rs, 47)] = TgX + TgY;
+			 TgH = T1t - T30;
+			 cr[WS(rs, 16)] = TgH - TgK;
+			 ci[WS(rs, 15)] = TgH + TgK;
+		    }
+		    {
+			 E Tgr, TgC, TgZ, Th2;
+			 Tgr = Tgn - Tgq;
+			 TgC = KP707106781 * (Tgw + TgB);
+			 ci[WS(rs, 23)] = Tgr - TgC;
+			 cr[WS(rs, 8)] = Tgr + TgC;
+			 TgZ = KP707106781 * (TgB - Tgw);
+			 Th2 = Th0 + Th1;
+			 cr[WS(rs, 56)] = TgZ - Th2;
+			 ci[WS(rs, 39)] = TgZ + Th2;
+		    }
+		    {
+			 E Th3, Th4, TgD, TgG;
+			 Th3 = KP707106781 * (TgF - TgE);
+			 Th4 = Th1 - Th0;
+			 cr[WS(rs, 40)] = Th3 - Th4;
+			 ci[WS(rs, 55)] = Th3 + Th4;
+			 TgD = Tgn + Tgq;
+			 TgG = KP707106781 * (TgE + TgF);
+			 cr[WS(rs, 24)] = TgD - TgG;
+			 ci[WS(rs, 7)] = TgD + TgG;
+		    }
+	       }
+	       {
+		    E T6L, T9x, ThV, Ti1, T7E, Ti0, T9A, ThO, T8y, T9K, T9u, T9E, T9r, T9L, T9v;
+		    E T9H;
+		    {
+			 E T6n, T6K, ThP, ThU;
+			 T6n = T6b + T6m;
+			 T6K = T6y + T6J;
+			 T6L = T6n - T6K;
+			 T9x = T6n + T6K;
+			 ThP = T9O - T9P;
+			 ThU = ThQ + ThT;
+			 ThV = ThP + ThU;
+			 Ti1 = ThU - ThP;
+		    }
+		    {
+			 E T7c, T9y, T7D, T9z;
+			 {
+			      E T72, T7b, T7t, T7C;
+			      T72 = T6Q + T71;
+			      T7b = T77 + T7a;
+			      T7c = FMA(KP195090322, T72, KP980785280 * T7b);
+			      T9y = FNMS(KP195090322, T7b, KP980785280 * T72);
+			      T7t = T7h + T7s;
+			      T7C = T7y + T7B;
+			      T7D = FNMS(KP980785280, T7C, KP195090322 * T7t);
+			      T9z = FMA(KP980785280, T7t, KP195090322 * T7C);
+			 }
+			 T7E = T7c + T7D;
+			 Ti0 = T9z - T9y;
+			 T9A = T9y + T9z;
+			 ThO = T7c - T7D;
+		    }
+		    {
+			 E T8k, T9D, T8x, T9C;
+			 {
+			      E T7W, T8j, T8t, T8w;
+			      T7W = T7K + T7V;
+			      T8j = T87 + T8i;
+			      T8k = T7W - T8j;
+			      T9D = T7W + T8j;
+			      T8t = T8p + T8s;
+			      T8w = T8u + T8v;
+			      T8x = T8t - T8w;
+			      T9C = T8t + T8w;
+			 }
+			 T8y = FMA(KP634393284, T8k, KP773010453 * T8x);
+			 T9K = FMA(KP995184726, T9D, KP098017140 * T9C);
+			 T9u = FNMS(KP773010453, T8k, KP634393284 * T8x);
+			 T9E = FNMS(KP098017140, T9D, KP995184726 * T9C);
+		    }
+		    {
+			 E T9d, T9G, T9q, T9F;
+			 {
+			      E T8P, T9c, T9m, T9p;
+			      T8P = T8D + T8O;
+			      T9c = T90 + T9b;
+			      T9d = T8P - T9c;
+			      T9G = T8P + T9c;
+			      T9m = T9i + T9l;
+			      T9p = T9n + T9o;
+			      T9q = T9m - T9p;
+			      T9F = T9m + T9p;
+			 }
+			 T9r = FNMS(KP634393284, T9q, KP773010453 * T9d);
+			 T9L = FNMS(KP995184726, T9F, KP098017140 * T9G);
+			 T9v = FMA(KP773010453, T9q, KP634393284 * T9d);
+			 T9H = FMA(KP098017140, T9F, KP995184726 * T9G);
+		    }
+		    {
+			 E T7F, T9s, ThZ, Ti2;
+			 T7F = T6L + T7E;
+			 T9s = T8y + T9r;
+			 ci[WS(rs, 24)] = T7F - T9s;
+			 cr[WS(rs, 7)] = T7F + T9s;
+			 ThZ = T9v - T9u;
+			 Ti2 = Ti0 + Ti1;
+			 cr[WS(rs, 39)] = ThZ - Ti2;
+			 ci[WS(rs, 56)] = ThZ + Ti2;
+		    }
+		    {
+			 E Ti3, Ti4, T9t, T9w;
+			 Ti3 = T9r - T8y;
+			 Ti4 = Ti1 - Ti0;
+			 cr[WS(rs, 55)] = Ti3 - Ti4;
+			 ci[WS(rs, 40)] = Ti3 + Ti4;
+			 T9t = T6L - T7E;
+			 T9w = T9u + T9v;
+			 cr[WS(rs, 23)] = T9t - T9w;
+			 ci[WS(rs, 8)] = T9t + T9w;
+		    }
+		    {
+			 E T9B, T9I, ThN, ThW;
+			 T9B = T9x + T9A;
+			 T9I = T9E + T9H;
+			 cr[WS(rs, 31)] = T9B - T9I;
+			 ci[0] = T9B + T9I;
+			 ThN = T9L - T9K;
+			 ThW = ThO + ThV;
+			 cr[WS(rs, 63)] = ThN - ThW;
+			 ci[WS(rs, 32)] = ThN + ThW;
+		    }
+		    {
+			 E ThX, ThY, T9J, T9M;
+			 ThX = T9H - T9E;
+			 ThY = ThV - ThO;
+			 cr[WS(rs, 47)] = ThX - ThY;
+			 ci[WS(rs, 48)] = ThX + ThY;
+			 T9J = T9x - T9A;
+			 T9M = T9K + T9L;
+			 ci[WS(rs, 16)] = T9J - T9M;
+			 cr[WS(rs, 15)] = T9J + T9M;
+		    }
+	       }
+	       {
+		    E Tft, Tg7, Tgh, Tgl, Th9, Thf, TfE, Th6, TfQ, Tg4, Tga, The, Tge, Tgk, Tg1;
+		    E Tg5;
+		    {
+			 E Tfp, Tfs, Tgf, Tgg;
+			 Tfp = Tj - TG;
+			 Tfs = Tfq - Tfr;
+			 Tft = Tfp - Tfs;
+			 Tg7 = Tfp + Tfs;
+			 Tgf = TfY + TfZ;
+			 Tgg = TfR + TfU;
+			 Tgh = FMA(KP382683432, Tgf, KP923879532 * Tgg);
+			 Tgl = FNMS(KP923879532, Tgf, KP382683432 * Tgg);
+		    }
+		    {
+			 E Th7, Th8, Tfy, TfD;
+			 Th7 = T14 - T1r;
+			 Th8 = TgT - TgO;
+			 Th9 = Th7 + Th8;
+			 Thf = Th8 - Th7;
+			 Tfy = Tfu + Tfx;
+			 TfD = Tfz - TfC;
+			 TfE = KP707106781 * (Tfy + TfD);
+			 Th6 = KP707106781 * (Tfy - TfD);
+		    }
+		    {
+			 E TfK, TfP, Tg8, Tg9;
+			 TfK = TfI - TfJ;
+			 TfP = TfL - TfO;
+			 TfQ = FMA(KP382683432, TfK, KP923879532 * TfP);
+			 Tg4 = FNMS(KP923879532, TfK, KP382683432 * TfP);
+			 Tg8 = Tfu - Tfx;
+			 Tg9 = Tfz + TfC;
+			 Tga = KP707106781 * (Tg8 + Tg9);
+			 The = KP707106781 * (Tg9 - Tg8);
+		    }
+		    {
+			 E Tgc, Tgd, TfV, Tg0;
+			 Tgc = TfL + TfO;
+			 Tgd = TfI + TfJ;
+			 Tge = FNMS(KP382683432, Tgd, KP923879532 * Tgc);
+			 Tgk = FMA(KP923879532, Tgd, KP382683432 * Tgc);
+			 TfV = TfR - TfU;
+			 Tg0 = TfY - TfZ;
+			 Tg1 = FNMS(KP382683432, Tg0, KP923879532 * TfV);
+			 Tg5 = FMA(KP923879532, Tg0, KP382683432 * TfV);
+		    }
+		    {
+			 E TfF, Tg2, Thd, Thg;
+			 TfF = Tft + TfE;
+			 Tg2 = TfQ + Tg1;
+			 ci[WS(rs, 27)] = TfF - Tg2;
+			 cr[WS(rs, 4)] = TfF + Tg2;
+			 Thd = Tg5 - Tg4;
+			 Thg = The + Thf;
+			 cr[WS(rs, 36)] = Thd - Thg;
+			 ci[WS(rs, 59)] = Thd + Thg;
+		    }
+		    {
+			 E Thh, Thi, Tg3, Tg6;
+			 Thh = Tg1 - TfQ;
+			 Thi = Thf - The;
+			 cr[WS(rs, 52)] = Thh - Thi;
+			 ci[WS(rs, 43)] = Thh + Thi;
+			 Tg3 = Tft - TfE;
+			 Tg6 = Tg4 + Tg5;
+			 cr[WS(rs, 20)] = Tg3 - Tg6;
+			 ci[WS(rs, 11)] = Tg3 + Tg6;
+		    }
+		    {
+			 E Tgb, Tgi, Th5, Tha;
+			 Tgb = Tg7 + Tga;
+			 Tgi = Tge + Tgh;
+			 cr[WS(rs, 28)] = Tgb - Tgi;
+			 ci[WS(rs, 3)] = Tgb + Tgi;
+			 Th5 = Tgl - Tgk;
+			 Tha = Th6 + Th9;
+			 cr[WS(rs, 60)] = Th5 - Tha;
+			 ci[WS(rs, 35)] = Th5 + Tha;
+		    }
+		    {
+			 E Thb, Thc, Tgj, Tgm;
+			 Thb = Tgh - Tge;
+			 Thc = Th9 - Th6;
+			 cr[WS(rs, 44)] = Thb - Thc;
+			 ci[WS(rs, 51)] = Thb + Thc;
+			 Tgj = Tg7 - Tga;
+			 Tgm = Tgk + Tgl;
+			 ci[WS(rs, 19)] = Tgj - Tgm;
+			 cr[WS(rs, 12)] = Tgj + Tgm;
+		    }
+	       }
+	       {
+		    E TeH, Tf9, TeO, Thk, Thp, Thv, Tfc, Thu, Tf3, Tfn, Tf7, Tfj, TeW, Tfm, Tf6;
+		    E Tfg;
+		    {
+			 E TeD, TeG, Tfa, Tfb;
+			 TeD = TcL + TcO;
+			 TeG = KP707106781 * (TeE + TeF);
+			 TeH = TeD - TeG;
+			 Tf9 = TeD + TeG;
+			 {
+			      E TeK, TeN, Thl, Tho;
+			      TeK = FMA(KP923879532, TeI, KP382683432 * TeJ);
+			      TeN = FNMS(KP923879532, TeM, KP382683432 * TeL);
+			      TeO = TeK + TeN;
+			      Thk = TeK - TeN;
+			      Thl = KP707106781 * (TcU - TcZ);
+			      Tho = Thm + Thn;
+			      Thp = Thl + Tho;
+			      Thv = Tho - Thl;
+			 }
+			 Tfa = FNMS(KP382683432, TeI, KP923879532 * TeJ);
+			 Tfb = FMA(KP382683432, TeM, KP923879532 * TeL);
+			 Tfc = Tfa + Tfb;
+			 Thu = Tfb - Tfa;
+			 {
+			      E TeZ, Tfh, Tf2, Tfi, TeY, Tf1;
+			      TeY = KP707106781 * (Te5 + Te0);
+			      TeZ = TeX - TeY;
+			      Tfh = TeX + TeY;
+			      Tf1 = KP707106781 * (Ted + Tee);
+			      Tf2 = Tf0 - Tf1;
+			      Tfi = Tf0 + Tf1;
+			      Tf3 = FNMS(KP555570233, Tf2, KP831469612 * TeZ);
+			      Tfn = FMA(KP980785280, Tfh, KP195090322 * Tfi);
+			      Tf7 = FMA(KP555570233, TeZ, KP831469612 * Tf2);
+			      Tfj = FNMS(KP980785280, Tfi, KP195090322 * Tfh);
+			 }
+			 {
+			      E TeS, Tfe, TeV, Tff, TeR, TeU;
+			      TeR = KP707106781 * (TdN + TdM);
+			      TeS = TeQ - TeR;
+			      Tfe = TeQ + TeR;
+			      TeU = KP707106781 * (Tdz + TdE);
+			      TeV = TeT - TeU;
+			      Tff = TeT + TeU;
+			      TeW = FMA(KP831469612, TeS, KP555570233 * TeV);
+			      Tfm = FNMS(KP195090322, Tff, KP980785280 * Tfe);
+			      Tf6 = FNMS(KP831469612, TeV, KP555570233 * TeS);
+			      Tfg = FMA(KP195090322, Tfe, KP980785280 * Tff);
+			 }
+		    }
+		    {
+			 E TeP, Tf4, Tht, Thw;
+			 TeP = TeH + TeO;
+			 Tf4 = TeW + Tf3;
+			 ci[WS(rs, 25)] = TeP - Tf4;
+			 cr[WS(rs, 6)] = TeP + Tf4;
+			 Tht = Tf7 - Tf6;
+			 Thw = Thu + Thv;
+			 cr[WS(rs, 38)] = Tht - Thw;
+			 ci[WS(rs, 57)] = Tht + Thw;
+		    }
+		    {
+			 E Thx, Thy, Tf5, Tf8;
+			 Thx = Tf3 - TeW;
+			 Thy = Thv - Thu;
+			 cr[WS(rs, 54)] = Thx - Thy;
+			 ci[WS(rs, 41)] = Thx + Thy;
+			 Tf5 = TeH - TeO;
+			 Tf8 = Tf6 + Tf7;
+			 cr[WS(rs, 22)] = Tf5 - Tf8;
+			 ci[WS(rs, 9)] = Tf5 + Tf8;
+		    }
+		    {
+			 E Tfd, Tfk, Thj, Thq;
+			 Tfd = Tf9 - Tfc;
+			 Tfk = Tfg + Tfj;
+			 ci[WS(rs, 17)] = Tfd - Tfk;
+			 cr[WS(rs, 14)] = Tfd + Tfk;
+			 Thj = Tfj - Tfg;
+			 Thq = Thk + Thp;
+			 cr[WS(rs, 62)] = Thj - Thq;
+			 ci[WS(rs, 33)] = Thj + Thq;
+		    }
+		    {
+			 E Thr, Ths, Tfl, Tfo;
+			 Thr = Tfn - Tfm;
+			 Ths = Thp - Thk;
+			 cr[WS(rs, 46)] = Thr - Ths;
+			 ci[WS(rs, 49)] = Thr + Ths;
+			 Tfl = Tf9 + Tfc;
+			 Tfo = Tfm + Tfn;
+			 cr[WS(rs, 30)] = Tfl - Tfo;
+			 ci[WS(rs, 1)] = Tfl + Tfo;
+		    }
+	       }
+	       {
+		    E Td1, Ten, Tdo, ThA, ThD, ThJ, Teq, ThI, Teh, TeB, Tel, Tex, TdQ, TeA, Tek;
+		    E Teu;
+		    {
+			 E TcP, Td0, Teo, Tep;
+			 TcP = TcL - TcO;
+			 Td0 = KP707106781 * (TcU + TcZ);
+			 Td1 = TcP - Td0;
+			 Ten = TcP + Td0;
+			 {
+			      E Tdc, Tdn, ThB, ThC;
+			      Tdc = FNMS(KP923879532, Tdb, KP382683432 * Td6);
+			      Tdn = FMA(KP923879532, Tdh, KP382683432 * Tdm);
+			      Tdo = Tdc + Tdn;
+			      ThA = Tdn - Tdc;
+			      ThB = KP707106781 * (TeF - TeE);
+			      ThC = Thn - Thm;
+			      ThD = ThB + ThC;
+			      ThJ = ThC - ThB;
+			 }
+			 Teo = FMA(KP382683432, Tdb, KP923879532 * Td6);
+			 Tep = FNMS(KP382683432, Tdh, KP923879532 * Tdm);
+			 Teq = Teo + Tep;
+			 ThI = Teo - Tep;
+			 {
+			      E Te7, Tew, Teg, Tev, Te6, Tef;
+			      Te6 = KP707106781 * (Te0 - Te5);
+			      Te7 = TdV - Te6;
+			      Tew = TdV + Te6;
+			      Tef = KP707106781 * (Ted - Tee);
+			      Teg = Tec - Tef;
+			      Tev = Tec + Tef;
+			      Teh = FMA(KP555570233, Te7, KP831469612 * Teg);
+			      TeB = FMA(KP980785280, Tew, KP195090322 * Tev);
+			      Tel = FNMS(KP831469612, Te7, KP555570233 * Teg);
+			      Tex = FNMS(KP195090322, Tew, KP980785280 * Tev);
+			 }
+			 {
+			      E TdG, Tet, TdP, Tes, TdF, TdO;
+			      TdF = KP707106781 * (Tdz - TdE);
+			      TdG = Tdu - TdF;
+			      Tet = Tdu + TdF;
+			      TdO = KP707106781 * (TdM - TdN);
+			      TdP = TdL - TdO;
+			      Tes = TdL + TdO;
+			      TdQ = FNMS(KP555570233, TdP, KP831469612 * TdG);
+			      TeA = FNMS(KP980785280, Tes, KP195090322 * Tet);
+			      Tek = FMA(KP831469612, TdP, KP555570233 * TdG);
+			      Teu = FMA(KP195090322, Tes, KP980785280 * Tet);
+			 }
+		    }
+		    {
+			 E Tdp, Tei, ThH, ThK;
+			 Tdp = Td1 + Tdo;
+			 Tei = TdQ + Teh;
+			 cr[WS(rs, 26)] = Tdp - Tei;
+			 ci[WS(rs, 5)] = Tdp + Tei;
+			 ThH = Tel - Tek;
+			 ThK = ThI + ThJ;
+			 cr[WS(rs, 58)] = ThH - ThK;
+			 ci[WS(rs, 37)] = ThH + ThK;
+		    }
+		    {
+			 E ThL, ThM, Tej, Tem;
+			 ThL = Teh - TdQ;
+			 ThM = ThJ - ThI;
+			 cr[WS(rs, 42)] = ThL - ThM;
+			 ci[WS(rs, 53)] = ThL + ThM;
+			 Tej = Td1 - Tdo;
+			 Tem = Tek + Tel;
+			 ci[WS(rs, 21)] = Tej - Tem;
+			 cr[WS(rs, 10)] = Tej + Tem;
+		    }
+		    {
+			 E Ter, Tey, Thz, ThE;
+			 Ter = Ten + Teq;
+			 Tey = Teu + Tex;
+			 ci[WS(rs, 29)] = Ter - Tey;
+			 cr[WS(rs, 2)] = Ter + Tey;
+			 Thz = TeB - TeA;
+			 ThE = ThA + ThD;
+			 cr[WS(rs, 34)] = Thz - ThE;
+			 ci[WS(rs, 61)] = Thz + ThE;
+		    }
+		    {
+			 E ThF, ThG, Tez, TeC;
+			 ThF = Tex - Teu;
+			 ThG = ThD - ThA;
+			 cr[WS(rs, 50)] = ThF - ThG;
+			 ci[WS(rs, 45)] = ThF + ThG;
+			 Tez = Ten - Teq;
+			 TeC = TeA + TeB;
+			 cr[WS(rs, 18)] = Tez - TeC;
+			 ci[WS(rs, 13)] = Tez + TeC;
+		    }
+	       }
+	       {
+		    E Tc3, Tcv, TiD, TiJ, Tca, TiI, Tcy, TiA, Tci, TcI, Tcs, TcC, Tcp, TcJ, Tct;
+		    E TcF;
+		    {
+			 E TbZ, Tc2, TiB, TiC;
+			 TbZ = Taz - TaC;
+			 Tc2 = Tc0 + Tc1;
+			 Tc3 = TbZ - Tc2;
+			 Tcv = TbZ + Tc2;
+			 TiB = TaG - TaJ;
+			 TiC = Tin - Tim;
+			 TiD = TiB + TiC;
+			 TiJ = TiC - TiB;
+		    }
+		    {
+			 E Tc6, Tcw, Tc9, Tcx;
+			 {
+			      E Tc4, Tc5, Tc7, Tc8;
+			      Tc4 = TaP - TaQ;
+			      Tc5 = TaM - TaN;
+			      Tc6 = FMA(KP831469612, Tc4, KP555570233 * Tc5);
+			      Tcw = FNMS(KP555570233, Tc4, KP831469612 * Tc5);
+			      Tc7 = TaW - TaX;
+			      Tc8 = TaT - TaU;
+			      Tc9 = FNMS(KP831469612, Tc8, KP555570233 * Tc7);
+			      Tcx = FMA(KP555570233, Tc8, KP831469612 * Tc7);
+			 }
+			 Tca = Tc6 + Tc9;
+			 TiI = Tcx - Tcw;
+			 Tcy = Tcw + Tcx;
+			 TiA = Tc6 - Tc9;
+		    }
+		    {
+			 E Tce, TcB, Tch, TcA;
+			 {
+			      E Tcc, Tcd, Tcf, Tcg;
+			      Tcc = Tbd - Tbe;
+			      Tcd = Tb7 - Tba;
+			      Tce = Tcc - Tcd;
+			      TcB = Tcc + Tcd;
+			      Tcf = Tb2 - Tb3;
+			      Tcg = Tbh - Tbg;
+			      Tch = Tcf - Tcg;
+			      TcA = Tcf + Tcg;
+			 }
+			 Tci = FMA(KP471396736, Tce, KP881921264 * Tch);
+			 TcI = FMA(KP956940335, TcB, KP290284677 * TcA);
+			 Tcs = FNMS(KP881921264, Tce, KP471396736 * Tch);
+			 TcC = FNMS(KP290284677, TcB, KP956940335 * TcA);
+		    }
+		    {
+			 E Tcl, TcE, Tco, TcD;
+			 {
+			      E Tcj, Tck, Tcm, Tcn;
+			      Tcj = Tbl - Tbm;
+			      Tck = TbA - Tbz;
+			      Tcl = Tcj - Tck;
+			      TcE = Tcj + Tck;
+			      Tcm = Tbw - Tbx;
+			      Tcn = Tbq - Tbt;
+			      Tco = Tcm - Tcn;
+			      TcD = Tcm + Tcn;
+			 }
+			 Tcp = FNMS(KP471396736, Tco, KP881921264 * Tcl);
+			 TcJ = FNMS(KP956940335, TcD, KP290284677 * TcE);
+			 Tct = FMA(KP881921264, Tco, KP471396736 * Tcl);
+			 TcF = FMA(KP290284677, TcD, KP956940335 * TcE);
+		    }
+		    {
+			 E Tcb, Tcq, TiH, TiK;
+			 Tcb = Tc3 + Tca;
+			 Tcq = Tci + Tcp;
+			 ci[WS(rs, 26)] = Tcb - Tcq;
+			 cr[WS(rs, 5)] = Tcb + Tcq;
+			 TiH = Tct - Tcs;
+			 TiK = TiI + TiJ;
+			 cr[WS(rs, 37)] = TiH - TiK;
+			 ci[WS(rs, 58)] = TiH + TiK;
+		    }
+		    {
+			 E TiL, TiM, Tcr, Tcu;
+			 TiL = Tcp - Tci;
+			 TiM = TiJ - TiI;
+			 cr[WS(rs, 53)] = TiL - TiM;
+			 ci[WS(rs, 42)] = TiL + TiM;
+			 Tcr = Tc3 - Tca;
+			 Tcu = Tcs + Tct;
+			 cr[WS(rs, 21)] = Tcr - Tcu;
+			 ci[WS(rs, 10)] = Tcr + Tcu;
+		    }
+		    {
+			 E Tcz, TcG, Tiz, TiE;
+			 Tcz = Tcv + Tcy;
+			 TcG = TcC + TcF;
+			 cr[WS(rs, 29)] = Tcz - TcG;
+			 ci[WS(rs, 2)] = Tcz + TcG;
+			 Tiz = TcJ - TcI;
+			 TiE = TiA + TiD;
+			 cr[WS(rs, 61)] = Tiz - TiE;
+			 ci[WS(rs, 34)] = Tiz + TiE;
+		    }
+		    {
+			 E TiF, TiG, TcH, TcK;
+			 TiF = TcF - TcC;
+			 TiG = TiD - TiA;
+			 cr[WS(rs, 45)] = TiF - TiG;
+			 ci[WS(rs, 50)] = TiF + TiG;
+			 TcH = Tcv - Tcy;
+			 TcK = TcI + TcJ;
+			 ci[WS(rs, 18)] = TcH - TcK;
+			 cr[WS(rs, 13)] = TcH + TcK;
+		    }
+	       }
+	       {
+		    E TaL, TbJ, Tip, Tiv, Tb0, Tiu, TbM, Tik, Tbk, TbW, TbG, TbQ, TbD, TbX, TbH;
+		    E TbT;
+		    {
+			 E TaD, TaK, Til, Tio;
+			 TaD = Taz + TaC;
+			 TaK = TaG + TaJ;
+			 TaL = TaD - TaK;
+			 TbJ = TaD + TaK;
+			 Til = Tc1 - Tc0;
+			 Tio = Tim + Tin;
+			 Tip = Til + Tio;
+			 Tiv = Tio - Til;
+		    }
+		    {
+			 E TaS, TbK, TaZ, TbL;
+			 {
+			      E TaO, TaR, TaV, TaY;
+			      TaO = TaM + TaN;
+			      TaR = TaP + TaQ;
+			      TaS = FNMS(KP980785280, TaR, KP195090322 * TaO);
+			      TbK = FMA(KP195090322, TaR, KP980785280 * TaO);
+			      TaV = TaT + TaU;
+			      TaY = TaW + TaX;
+			      TaZ = FMA(KP980785280, TaV, KP195090322 * TaY);
+			      TbL = FNMS(KP195090322, TaV, KP980785280 * TaY);
+			 }
+			 Tb0 = TaS + TaZ;
+			 Tiu = TbK - TbL;
+			 TbM = TbK + TbL;
+			 Tik = TaZ - TaS;
+		    }
+		    {
+			 E Tbc, TbO, Tbj, TbP;
+			 {
+			      E Tb4, Tbb, Tbf, Tbi;
+			      Tb4 = Tb2 + Tb3;
+			      Tbb = Tb7 + Tba;
+			      Tbc = Tb4 - Tbb;
+			      TbO = Tb4 + Tbb;
+			      Tbf = Tbd + Tbe;
+			      Tbi = Tbg + Tbh;
+			      Tbj = Tbf - Tbi;
+			      TbP = Tbf + Tbi;
+			 }
+			 Tbk = FMA(KP634393284, Tbc, KP773010453 * Tbj);
+			 TbW = FNMS(KP995184726, TbP, KP098017140 * TbO);
+			 TbG = FNMS(KP634393284, Tbj, KP773010453 * Tbc);
+			 TbQ = FMA(KP995184726, TbO, KP098017140 * TbP);
+		    }
+		    {
+			 E Tbv, TbR, TbC, TbS;
+			 {
+			      E Tbn, Tbu, Tby, TbB;
+			      Tbn = Tbl + Tbm;
+			      Tbu = Tbq + Tbt;
+			      Tbv = Tbn - Tbu;
+			      TbR = Tbn + Tbu;
+			      Tby = Tbw + Tbx;
+			      TbB = Tbz + TbA;
+			      TbC = Tby - TbB;
+			      TbS = Tby + TbB;
+			 }
+			 TbD = FNMS(KP773010453, TbC, KP634393284 * Tbv);
+			 TbX = FMA(KP098017140, TbR, KP995184726 * TbS);
+			 TbH = FMA(KP773010453, Tbv, KP634393284 * TbC);
+			 TbT = FNMS(KP098017140, TbS, KP995184726 * TbR);
+		    }
+		    {
+			 E Tb1, TbE, Tit, Tiw;
+			 Tb1 = TaL - Tb0;
+			 TbE = Tbk + TbD;
+			 ci[WS(rs, 22)] = Tb1 - TbE;
+			 cr[WS(rs, 9)] = Tb1 + TbE;
+			 Tit = TbD - Tbk;
+			 Tiw = Tiu + Tiv;
+			 cr[WS(rs, 57)] = Tit - Tiw;
+			 ci[WS(rs, 38)] = Tit + Tiw;
+		    }
+		    {
+			 E Tix, Tiy, TbF, TbI;
+			 Tix = TbH - TbG;
+			 Tiy = Tiv - Tiu;
+			 cr[WS(rs, 41)] = Tix - Tiy;
+			 ci[WS(rs, 54)] = Tix + Tiy;
+			 TbF = TaL + Tb0;
+			 TbI = TbG + TbH;
+			 cr[WS(rs, 25)] = TbF - TbI;
+			 ci[WS(rs, 6)] = TbF + TbI;
+		    }
+		    {
+			 E TbN, TbU, Tij, Tiq;
+			 TbN = TbJ + TbM;
+			 TbU = TbQ + TbT;
+			 ci[WS(rs, 30)] = TbN - TbU;
+			 cr[WS(rs, 1)] = TbN + TbU;
+			 Tij = TbX - TbW;
+			 Tiq = Tik + Tip;
+			 cr[WS(rs, 33)] = Tij - Tiq;
+			 ci[WS(rs, 62)] = Tij + Tiq;
+		    }
+		    {
+			 E Tir, Tis, TbV, TbY;
+			 Tir = TbT - TbQ;
+			 Tis = Tip - Tik;
+			 cr[WS(rs, 49)] = Tir - Tis;
+			 ci[WS(rs, 46)] = Tir + Tis;
+			 TbV = TbJ - TbM;
+			 TbY = TbW + TbX;
+			 cr[WS(rs, 17)] = TbV - TbY;
+			 ci[WS(rs, 14)] = TbV + TbY;
+		    }
+	       }
+	       {
+		    E T9R, Taj, Ti9, Tif, T9Y, Tie, Tam, Ti6, Ta6, Taw, Tag, Taq, Tad, Tax, Tah;
+		    E Tat;
+		    {
+			 E T9N, T9Q, Ti7, Ti8;
+			 T9N = T6b - T6m;
+			 T9Q = T9O + T9P;
+			 T9R = T9N - T9Q;
+			 Taj = T9N + T9Q;
+			 Ti7 = T6J - T6y;
+			 Ti8 = ThT - ThQ;
+			 Ti9 = Ti7 + Ti8;
+			 Tif = Ti8 - Ti7;
+		    }
+		    {
+			 E T9U, Tak, T9X, Tal;
+			 {
+			      E T9S, T9T, T9V, T9W;
+			      T9S = T6Q - T71;
+			      T9T = T77 - T7a;
+			      T9U = FNMS(KP831469612, T9T, KP555570233 * T9S);
+			      Tak = FMA(KP831469612, T9S, KP555570233 * T9T);
+			      T9V = T7h - T7s;
+			      T9W = T7y - T7B;
+			      T9X = FMA(KP555570233, T9V, KP831469612 * T9W);
+			      Tal = FNMS(KP555570233, T9W, KP831469612 * T9V);
+			 }
+			 T9Y = T9U + T9X;
+			 Tie = Tak - Tal;
+			 Tam = Tak + Tal;
+			 Ti6 = T9X - T9U;
+		    }
+		    {
+			 E Ta2, Tao, Ta5, Tap;
+			 {
+			      E Ta0, Ta1, Ta3, Ta4;
+			      Ta0 = T8p - T8s;
+			      Ta1 = T87 - T8i;
+			      Ta2 = Ta0 - Ta1;
+			      Tao = Ta0 + Ta1;
+			      Ta3 = T7K - T7V;
+			      Ta4 = T8v - T8u;
+			      Ta5 = Ta3 - Ta4;
+			      Tap = Ta3 + Ta4;
+			 }
+			 Ta6 = FMA(KP471396736, Ta2, KP881921264 * Ta5);
+			 Taw = FNMS(KP956940335, Tap, KP290284677 * Tao);
+			 Tag = FNMS(KP471396736, Ta5, KP881921264 * Ta2);
+			 Taq = FMA(KP956940335, Tao, KP290284677 * Tap);
+		    }
+		    {
+			 E Ta9, Tar, Tac, Tas;
+			 {
+			      E Ta7, Ta8, Taa, Tab;
+			      Ta7 = T8D - T8O;
+			      Ta8 = T9n - T9o;
+			      Ta9 = Ta7 - Ta8;
+			      Tar = Ta7 + Ta8;
+			      Taa = T9i - T9l;
+			      Tab = T9b - T90;
+			      Tac = Taa - Tab;
+			      Tas = Taa + Tab;
+			 }
+			 Tad = FNMS(KP881921264, Tac, KP471396736 * Ta9);
+			 Tax = FMA(KP290284677, Tar, KP956940335 * Tas);
+			 Tah = FMA(KP881921264, Ta9, KP471396736 * Tac);
+			 Tat = FNMS(KP290284677, Tas, KP956940335 * Tar);
+		    }
+		    {
+			 E T9Z, Tae, Tid, Tig;
+			 T9Z = T9R - T9Y;
+			 Tae = Ta6 + Tad;
+			 ci[WS(rs, 20)] = T9Z - Tae;
+			 cr[WS(rs, 11)] = T9Z + Tae;
+			 Tid = Tad - Ta6;
+			 Tig = Tie + Tif;
+			 cr[WS(rs, 59)] = Tid - Tig;
+			 ci[WS(rs, 36)] = Tid + Tig;
+		    }
+		    {
+			 E Tih, Tii, Taf, Tai;
+			 Tih = Tah - Tag;
+			 Tii = Tif - Tie;
+			 cr[WS(rs, 43)] = Tih - Tii;
+			 ci[WS(rs, 52)] = Tih + Tii;
+			 Taf = T9R + T9Y;
+			 Tai = Tag + Tah;
+			 cr[WS(rs, 27)] = Taf - Tai;
+			 ci[WS(rs, 4)] = Taf + Tai;
+		    }
+		    {
+			 E Tan, Tau, Ti5, Tia;
+			 Tan = Taj + Tam;
+			 Tau = Taq + Tat;
+			 ci[WS(rs, 28)] = Tan - Tau;
+			 cr[WS(rs, 3)] = Tan + Tau;
+			 Ti5 = Tax - Taw;
+			 Tia = Ti6 + Ti9;
+			 cr[WS(rs, 35)] = Ti5 - Tia;
+			 ci[WS(rs, 60)] = Ti5 + Tia;
+		    }
+		    {
+			 E Tib, Tic, Tav, Tay;
+			 Tib = Tat - Taq;
+			 Tic = Ti9 - Ti6;
+			 cr[WS(rs, 51)] = Tib - Tic;
+			 ci[WS(rs, 44)] = Tib + Tic;
+			 Tav = Taj - Tam;
+			 Tay = Taw + Tax;
+			 cr[WS(rs, 19)] = Tav - Tay;
+			 ci[WS(rs, 12)] = Tav + Tay;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 64},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 64, "hf_64", twinstr, &GENUS, {808, 270, 230, 0} };
+
+void X(codelet_hf_64) (planner *p) {
+     X(khc2hc_register) (p, hf_64, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:50 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include hf.h */
+
+/*
+ * This function contains 72 FP additions, 66 FP multiplications,
+ * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
+ * 62 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "hf.h"
+
+static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1, TR, T18, T10, T12, T16, T11, T13;
+	       {
+		    E T19, T1a, T1i, Te, Tt, Tw, T1b, TM, T1h, Tr, Tu, TS, Tz, TC, Ty;
+		    E Tv, TB;
+		    T1 = cr[0];
+		    T19 = ci[0];
+		    {
+			 E T9, Tc, TP, Ta, Tb, TO, T7;
+			 {
+			      E T3, T6, T8, TN, T4, T2, T5;
+			      T3 = cr[WS(rs, 1)];
+			      T6 = ci[WS(rs, 1)];
+			      T2 = W[0];
+			      T9 = cr[WS(rs, 6)];
+			      Tc = ci[WS(rs, 6)];
+			      T8 = W[10];
+			      TN = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[1];
+			      TP = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[11];
+			      TO = FNMS(T5, T3, TN);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 {
+			      E Tg, Tj, Th, TI, Tm, Tp, Tl, Ti, To, TQ, Td, Tf;
+			      Tg = cr[WS(rs, 2)];
+			      TQ = FNMS(Tb, T9, TP);
+			      Td = FMA(Tb, Tc, Ta);
+			      Tj = ci[WS(rs, 2)];
+			      Tf = W[2];
+			      T1a = TO + TQ;
+			      TR = TO - TQ;
+			      T1i = Td - T7;
+			      Te = T7 + Td;
+			      Th = Tf * Tg;
+			      TI = Tf * Tj;
+			      Tm = cr[WS(rs, 5)];
+			      Tp = ci[WS(rs, 5)];
+			      Tl = W[8];
+			      Ti = W[3];
+			      To = W[9];
+			      {
+				   E TJ, Tk, TL, Tq, TK, Tn, Ts;
+				   Tt = cr[WS(rs, 3)];
+				   TK = Tl * Tp;
+				   Tn = Tl * Tm;
+				   TJ = FNMS(Ti, Tg, TI);
+				   Tk = FMA(Ti, Tj, Th);
+				   TL = FNMS(To, Tm, TK);
+				   Tq = FMA(To, Tp, Tn);
+				   Tw = ci[WS(rs, 3)];
+				   Ts = W[4];
+				   T1b = TJ + TL;
+				   TM = TJ - TL;
+				   T1h = Tq - Tk;
+				   Tr = Tk + Tq;
+				   Tu = Ts * Tt;
+				   TS = Ts * Tw;
+			      }
+			      Tz = cr[WS(rs, 4)];
+			      TC = ci[WS(rs, 4)];
+			      Ty = W[6];
+			      Tv = W[5];
+			      TB = W[7];
+			 }
+		    }
+		    {
+			 E TF, TT, Tx, TV, TD, T1q, TU, TA;
+			 TF = FNMS(KP356895867, Tr, Te);
+			 TU = Ty * TC;
+			 TA = Ty * Tz;
+			 TT = FNMS(Tv, Tt, TS);
+			 Tx = FMA(Tv, Tw, Tu);
+			 TV = FNMS(TB, Tz, TU);
+			 TD = FMA(TB, TC, TA);
+			 T1q = FNMS(KP356895867, T1b, T1a);
+			 {
+			      E TW, TE, T1k, T1f;
+			      {
+				   E T1e, T1s, TY, T1p, T1u, TH, T1n, T1j, T1c, T1g;
+				   T1j = FNMS(KP554958132, T1i, T1h);
+				   T1c = TT + TV;
+				   TW = TT - TV;
+				   T1g = TD - Tx;
+				   TE = Tx + TD;
+				   {
+					E T1d, T1l, T1r, TX;
+					T1d = FNMS(KP356895867, T1c, T1b);
+					T1l = FNMS(KP356895867, T1a, T1c);
+					T1r = FNMS(KP692021471, T1q, T1c);
+					ci[WS(rs, 6)] = T1a + T1b + T1c + T19;
+					TX = FMA(KP554958132, TW, TR);
+					{
+					     E T1o, T1t, TG, T1m;
+					     T1o = FMA(KP554958132, T1h, T1g);
+					     T1t = FMA(KP554958132, T1g, T1i);
+					     TG = FNMS(KP692021471, TF, TE);
+					     cr[0] = T1 + Te + Tr + TE;
+					     T1e = FNMS(KP692021471, T1d, T1a);
+					     T1m = FNMS(KP692021471, T1l, T1b);
+					     T1s = FNMS(KP900968867, T1r, T19);
+					     TY = FMA(KP801937735, TX, TM);
+					     T1p = FNMS(KP801937735, T1o, T1i);
+					     T1u = FMA(KP801937735, T1t, T1h);
+					     TH = FNMS(KP900968867, TG, T1);
+					     T1n = FNMS(KP900968867, T1m, T19);
+					     T1k = FNMS(KP801937735, T1j, T1g);
+					}
+				   }
+				   ci[WS(rs, 5)] = FMA(KP974927912, T1u, T1s);
+				   cr[WS(rs, 6)] = FMS(KP974927912, T1u, T1s);
+				   cr[WS(rs, 1)] = FMA(KP974927912, TY, TH);
+				   ci[0] = FNMS(KP974927912, TY, TH);
+				   ci[WS(rs, 4)] = FMA(KP974927912, T1p, T1n);
+				   cr[WS(rs, 5)] = FMS(KP974927912, T1p, T1n);
+				   T1f = FNMS(KP900968867, T1e, T19);
+			      }
+			      {
+				   E T14, T17, T15, TZ;
+				   T14 = FNMS(KP356895867, TE, Tr);
+				   T17 = FNMS(KP554958132, TR, TM);
+				   TZ = FNMS(KP356895867, Te, TE);
+				   ci[WS(rs, 3)] = FMA(KP974927912, T1k, T1f);
+				   cr[WS(rs, 4)] = FMS(KP974927912, T1k, T1f);
+				   T15 = FNMS(KP692021471, T14, Te);
+				   T18 = FNMS(KP801937735, T17, TW);
+				   T10 = FNMS(KP692021471, TZ, Tr);
+				   T12 = FMA(KP554958132, TM, TW);
+				   T16 = FNMS(KP900968867, T15, T1);
+			      }
+			 }
+		    }
+	       }
+	       T11 = FNMS(KP900968867, T10, T1);
+	       T13 = FNMS(KP801937735, T12, TR);
+	       cr[WS(rs, 3)] = FMA(KP974927912, T18, T16);
+	       ci[WS(rs, 2)] = FNMS(KP974927912, T18, T16);
+	       cr[WS(rs, 2)] = FMA(KP974927912, T13, T11);
+	       ci[WS(rs, 1)] = FNMS(KP974927912, T13, T11);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {18, 12, 54, 0} };
+
+void X(codelet_hf_7) (planner *p) {
+     X(khc2hc_register) (p, hf_7, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include hf.h */
+
+/*
+ * This function contains 72 FP additions, 60 FP multiplications,
+ * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
+ * 29 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "hf.h"
+
+static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
+	       E T1, TT, Tc, TV, TC, TO, Tn, TS, TI, TP, Ty, TU, TF, TQ;
+	       T1 = cr[0];
+	       TT = ci[0];
+	       {
+		    E T6, TA, Tb, TB;
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 1)];
+			 T5 = ci[WS(rs, 1)];
+			 T2 = W[0];
+			 T4 = W[1];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TA = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 6)];
+			 Ta = ci[WS(rs, 6)];
+			 T7 = W[10];
+			 T9 = W[11];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 TB = FNMS(T9, T8, T7 * Ta);
+		    }
+		    Tc = T6 + Tb;
+		    TV = TA + TB;
+		    TC = TA - TB;
+		    TO = Tb - T6;
+	       }
+	       {
+		    E Th, TG, Tm, TH;
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 2)];
+			 Tg = ci[WS(rs, 2)];
+			 Td = W[2];
+			 Tf = W[3];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TG = FNMS(Tf, Te, Td * Tg);
+		    }
+		    {
+			 E Tj, Tl, Ti, Tk;
+			 Tj = cr[WS(rs, 5)];
+			 Tl = ci[WS(rs, 5)];
+			 Ti = W[8];
+			 Tk = W[9];
+			 Tm = FMA(Ti, Tj, Tk * Tl);
+			 TH = FNMS(Tk, Tj, Ti * Tl);
+		    }
+		    Tn = Th + Tm;
+		    TS = TG + TH;
+		    TI = TG - TH;
+		    TP = Th - Tm;
+	       }
+	       {
+		    E Ts, TD, Tx, TE;
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 3)];
+			 Tr = ci[WS(rs, 3)];
+			 To = W[4];
+			 Tq = W[5];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TD = FNMS(Tq, Tp, To * Tr);
+		    }
+		    {
+			 E Tu, Tw, Tt, Tv;
+			 Tu = cr[WS(rs, 4)];
+			 Tw = ci[WS(rs, 4)];
+			 Tt = W[6];
+			 Tv = W[7];
+			 Tx = FMA(Tt, Tu, Tv * Tw);
+			 TE = FNMS(Tv, Tu, Tt * Tw);
+		    }
+		    Ty = Ts + Tx;
+		    TU = TD + TE;
+		    TF = TD - TE;
+		    TQ = Tx - Ts;
+	       }
+	       {
+		    E TL, TK, TZ, T10;
+		    cr[0] = T1 + Tc + Tn + Ty;
+		    TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
+		    TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
+		    ci[0] = TK - TL;
+		    cr[WS(rs, 1)] = TK + TL;
+		    ci[WS(rs, 6)] = TV + TS + TU + TT;
+		    TZ = FMA(KP781831482, TO, KP433883739 * TQ) - (KP974927912 * TP);
+		    T10 = FMA(KP623489801, TV, TT) + FNMA(KP900968867, TU, KP222520933 * TS);
+		    cr[WS(rs, 6)] = TZ - T10;
+		    ci[WS(rs, 5)] = TZ + T10;
+	       }
+	       {
+		    E TX, TY, TR, TW;
+		    TX = FMA(KP974927912, TO, KP433883739 * TP) - (KP781831482 * TQ);
+		    TY = FMA(KP623489801, TU, TT) + FNMA(KP900968867, TS, KP222520933 * TV);
+		    cr[WS(rs, 5)] = TX - TY;
+		    ci[WS(rs, 4)] = TX + TY;
+		    TR = FMA(KP433883739, TO, KP781831482 * TP) + (KP974927912 * TQ);
+		    TW = FMA(KP623489801, TS, TT) + FNMA(KP222520933, TU, KP900968867 * TV);
+		    cr[WS(rs, 4)] = TR - TW;
+		    ci[WS(rs, 3)] = TR + TW;
+	       }
+	       {
+		    E TN, TM, TJ, Tz;
+		    TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
+		    TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
+		    ci[WS(rs, 2)] = TM - TN;
+		    cr[WS(rs, 3)] = TM + TN;
+		    TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
+		    Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
+		    ci[WS(rs, 1)] = Tz - TJ;
+		    cr[WS(rs, 2)] = Tz + TJ;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 7},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {36, 24, 36, 0} };
+
+void X(codelet_hf_7) (planner *p) {
+     X(khc2hc_register) (p, hf_7, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:51 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include hf.h */
+
+/*
+ * This function contains 66 FP additions, 36 FP multiplications,
+ * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
+ * 61 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hf.h"
+
+static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T1f, T1g, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
+	       {
+		    E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
+		    E Tp, Tx, Tt, Tq, Tw;
+		    {
+			 E T3, T6, T2, T5;
+			 T1 = cr[0];
+			 T1m = ci[0];
+			 T3 = cr[WS(rs, 4)];
+			 T6 = ci[WS(rs, 4)];
+			 T2 = W[6];
+			 T5 = W[7];
+			 {
+			      E Ta, Td, T9, Tc;
+			      {
+				   E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
+				   Tg = cr[WS(rs, 6)];
+				   Tj = ci[WS(rs, 6)];
+				   T1k = T2 * T6;
+				   T4 = T2 * T3;
+				   Tf = W[10];
+				   Ti = W[11];
+				   T1l = FNMS(T5, T3, T1k);
+				   T7 = FMA(T5, T6, T4);
+				   TR = Tf * Tj;
+				   Th = Tf * Tg;
+				   Ta = cr[WS(rs, 2)];
+				   Td = ci[WS(rs, 2)];
+				   TS = FNMS(Ti, Tg, TR);
+				   Tk = FMA(Ti, Tj, Th);
+				   T9 = W[2];
+				   Tc = W[3];
+			      }
+			      {
+				   E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
+				   TB = cr[WS(rs, 7)];
+				   TE = ci[WS(rs, 7)];
+				   TP = T9 * Td;
+				   Tb = T9 * Ta;
+				   TA = W[12];
+				   TH = cr[WS(rs, 3)];
+				   TQ = FNMS(Tc, Ta, TP);
+				   Te = FMA(Tc, Td, Tb);
+				   T13 = TA * TE;
+				   TC = TA * TB;
+				   TK = ci[WS(rs, 3)];
+				   TG = W[4];
+				   TD = W[13];
+				   TJ = W[5];
+				   {
+					E T14, TF, T16, TL, T15, TI;
+					To = cr[WS(rs, 1)];
+					T15 = TG * TK;
+					TI = TG * TH;
+					T14 = FNMS(TD, TB, T13);
+					TF = FMA(TD, TE, TC);
+					T16 = FNMS(TJ, TH, T15);
+					TL = FMA(TJ, TK, TI);
+					Tr = ci[WS(rs, 1)];
+					Tn = W[0];
+					T17 = T14 - T16;
+					T1f = T14 + T16;
+					TM = TF + TL;
+					T12 = TF - TL;
+				   }
+				   Tu = cr[WS(rs, 5)];
+				   TW = Tn * Tr;
+				   Tp = Tn * To;
+				   Tx = ci[WS(rs, 5)];
+				   Tt = W[8];
+				   Tq = W[1];
+				   Tw = W[9];
+			      }
+			 }
+		    }
+		    {
+			 E T8, T1j, Tl, Tz, T1a, TU, T1n, T1b, T1c, T1v, T1t, T1u, T19, T1w, T1d;
+			 {
+			      E T1r, T10, TV, T1s, T11, T18;
+			      {
+				   E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
+				   T8 = T1 + T7;
+				   TO = T1 - T7;
+				   TY = Tt * Tx;
+				   Tv = Tt * Tu;
+				   TX = FNMS(Tq, To, TW);
+				   Ts = FMA(Tq, Tr, Tp);
+				   TZ = FNMS(Tw, Tu, TY);
+				   Ty = FMA(Tw, Tx, Tv);
+				   TT = TQ - TS;
+				   T1j = TQ + TS;
+				   Tl = Te + Tk;
+				   T1r = Te - Tk;
+				   T10 = TX - TZ;
+				   T1g = TX + TZ;
+				   Tz = Ts + Ty;
+				   TV = Ts - Ty;
+				   T1a = TO - TT;
+				   TU = TO + TT;
+				   T1s = T1m - T1l;
+				   T1n = T1l + T1m;
+			      }
+			      T1b = TV - T10;
+			      T11 = TV + T10;
+			      T18 = T12 - T17;
+			      T1c = T12 + T17;
+			      T1v = T1s - T1r;
+			      T1t = T1r + T1s;
+			      T1u = T18 - T11;
+			      T19 = T11 + T18;
+			 }
+			 ci[WS(rs, 4)] = FMA(KP707106781, T1u, T1t);
+			 cr[WS(rs, 7)] = FMS(KP707106781, T1u, T1t);
+			 cr[WS(rs, 1)] = FMA(KP707106781, T19, TU);
+			 ci[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
+			 T1w = T1c - T1b;
+			 T1d = T1b + T1c;
+			 ci[WS(rs, 6)] = FMA(KP707106781, T1w, T1v);
+			 cr[WS(rs, 5)] = FMS(KP707106781, T1w, T1v);
+			 ci[0] = FMA(KP707106781, T1d, T1a);
+			 cr[WS(rs, 3)] = FNMS(KP707106781, T1d, T1a);
+			 T1e = T8 - Tl;
+			 Tm = T8 + Tl;
+			 T1q = T1n - T1j;
+			 T1o = T1j + T1n;
+			 T1p = TM - Tz;
+			 TN = Tz + TM;
+		    }
+	       }
+	       ci[WS(rs, 5)] = T1p + T1q;
+	       cr[WS(rs, 6)] = T1p - T1q;
+	       cr[0] = Tm + TN;
+	       ci[WS(rs, 3)] = Tm - TN;
+	       T1h = T1f - T1g;
+	       T1i = T1g + T1f;
+	       ci[WS(rs, 7)] = T1i + T1o;
+	       cr[WS(rs, 4)] = T1i - T1o;
+	       ci[WS(rs, 1)] = T1e + T1h;
+	       cr[WS(rs, 2)] = T1e - T1h;
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {44, 14, 22, 0} };
+
+void X(codelet_hf_8) (planner *p) {
+     X(khc2hc_register) (p, hf_8, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include hf.h */
+
+/*
+ * This function contains 66 FP additions, 32 FP multiplications,
+ * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
+ * 28 stack variables, 1 constants, and 32 memory accesses
+ */
+#include "hf.h"
+
+static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
+	       E T7, T1f, TH, T19, TF, T12, TR, TU, Ti, T1e, TK, T16, Tu, T13, TM;
+	       E TP;
+	       {
+		    E T1, T18, T6, T17;
+		    T1 = cr[0];
+		    T18 = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 4)];
+			 T5 = ci[WS(rs, 4)];
+			 T2 = W[6];
+			 T4 = W[7];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 T17 = FNMS(T4, T3, T2 * T5);
+		    }
+		    T7 = T1 + T6;
+		    T1f = T18 - T17;
+		    TH = T1 - T6;
+		    T19 = T17 + T18;
+	       }
+	       {
+		    E Tz, TS, TE, TT;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 7)];
+			 Ty = ci[WS(rs, 7)];
+			 Tv = W[12];
+			 Tx = W[13];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 TS = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 3)];
+			 TD = ci[WS(rs, 3)];
+			 TA = W[4];
+			 TC = W[5];
+			 TE = FMA(TA, TB, TC * TD);
+			 TT = FNMS(TC, TB, TA * TD);
+		    }
+		    TF = Tz + TE;
+		    T12 = TS + TT;
+		    TR = Tz - TE;
+		    TU = TS - TT;
+	       }
+	       {
+		    E Tc, TI, Th, TJ;
+		    {
+			 E T9, Tb, T8, Ta;
+			 T9 = cr[WS(rs, 2)];
+			 Tb = ci[WS(rs, 2)];
+			 T8 = W[2];
+			 Ta = W[3];
+			 Tc = FMA(T8, T9, Ta * Tb);
+			 TI = FNMS(Ta, T9, T8 * Tb);
+		    }
+		    {
+			 E Te, Tg, Td, Tf;
+			 Te = cr[WS(rs, 6)];
+			 Tg = ci[WS(rs, 6)];
+			 Td = W[10];
+			 Tf = W[11];
+			 Th = FMA(Td, Te, Tf * Tg);
+			 TJ = FNMS(Tf, Te, Td * Tg);
+		    }
+		    Ti = Tc + Th;
+		    T1e = Tc - Th;
+		    TK = TI - TJ;
+		    T16 = TI + TJ;
+	       }
+	       {
+		    E To, TN, Tt, TO;
+		    {
+			 E Tl, Tn, Tk, Tm;
+			 Tl = cr[WS(rs, 1)];
+			 Tn = ci[WS(rs, 1)];
+			 Tk = W[0];
+			 Tm = W[1];
+			 To = FMA(Tk, Tl, Tm * Tn);
+			 TN = FNMS(Tm, Tl, Tk * Tn);
+		    }
+		    {
+			 E Tq, Ts, Tp, Tr;
+			 Tq = cr[WS(rs, 5)];
+			 Ts = ci[WS(rs, 5)];
+			 Tp = W[8];
+			 Tr = W[9];
+			 Tt = FMA(Tp, Tq, Tr * Ts);
+			 TO = FNMS(Tr, Tq, Tp * Ts);
+		    }
+		    Tu = To + Tt;
+		    T13 = TN + TO;
+		    TM = To - Tt;
+		    TP = TN - TO;
+	       }
+	       {
+		    E Tj, TG, T1b, T1c;
+		    Tj = T7 + Ti;
+		    TG = Tu + TF;
+		    ci[WS(rs, 3)] = Tj - TG;
+		    cr[0] = Tj + TG;
+		    T1b = TF - Tu;
+		    T1c = T19 - T16;
+		    cr[WS(rs, 6)] = T1b - T1c;
+		    ci[WS(rs, 5)] = T1b + T1c;
+		    {
+			 E TX, T1i, T10, T1h, TY, TZ;
+			 TX = TH - TK;
+			 T1i = T1f - T1e;
+			 TY = TM - TP;
+			 TZ = TR + TU;
+			 T10 = KP707106781 * (TY + TZ);
+			 T1h = KP707106781 * (TZ - TY);
+			 cr[WS(rs, 3)] = TX - T10;
+			 ci[WS(rs, 6)] = T1h + T1i;
+			 ci[0] = TX + T10;
+			 cr[WS(rs, 5)] = T1h - T1i;
+		    }
+	       }
+	       {
+		    E T15, T1a, T11, T14;
+		    T15 = T13 + T12;
+		    T1a = T16 + T19;
+		    cr[WS(rs, 4)] = T15 - T1a;
+		    ci[WS(rs, 7)] = T15 + T1a;
+		    T11 = T7 - Ti;
+		    T14 = T12 - T13;
+		    cr[WS(rs, 2)] = T11 - T14;
+		    ci[WS(rs, 1)] = T11 + T14;
+		    {
+			 E TL, T1g, TW, T1d, TQ, TV;
+			 TL = TH + TK;
+			 T1g = T1e + T1f;
+			 TQ = TM + TP;
+			 TV = TR - TU;
+			 TW = KP707106781 * (TQ + TV);
+			 T1d = KP707106781 * (TV - TQ);
+			 ci[WS(rs, 2)] = TL - TW;
+			 ci[WS(rs, 4)] = T1d + T1g;
+			 cr[WS(rs, 1)] = TL + TW;
+			 cr[WS(rs, 7)] = T1d - T1g;
+		    }
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 8},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {52, 18, 14, 0} };
+
+void X(codelet_hf_8) (planner *p) {
+     X(khc2hc_register) (p, hf_8, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/hf_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/hf_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:51 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include hf.h */
+
+/*
+ * This function contains 96 FP additions, 88 FP multiplications,
+ * (or, 24 additions, 16 multiplications, 72 fused multiply/add),
+ * 69 stack variables, 10 constants, and 36 memory accesses
+ */
+#include "hf.h"
+
+static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP777861913, +0.777861913430206160028177977318626690410586096);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
+     DK(KP492403876, +0.492403876506104029683371512294761506835321626);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP954188894, +0.954188894138671133499268364187245676532219158);
+     DK(KP363970234, +0.363970234266202361351047882776834043890471784);
+     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T20, T1Z;
+	       {
+		    E T1, T1P, T1Q, T10, T1S, Te, TB, T1d, T1a, T19, T1M, TE, T1c, Tz, T1n;
+		    E TC, TH, TK, T1k, TR, TG, TJ, TD;
+		    T1 = cr[0];
+		    T1P = ci[0];
+		    {
+			 E T9, Tc, TY, Ta, Tb, TX, T7;
+			 {
+			      E T3, T6, T8, TW, T4, T2, T5;
+			      T3 = cr[WS(rs, 3)];
+			      T6 = ci[WS(rs, 3)];
+			      T2 = W[4];
+			      T9 = cr[WS(rs, 6)];
+			      Tc = ci[WS(rs, 6)];
+			      T8 = W[10];
+			      TW = T2 * T6;
+			      T4 = T2 * T3;
+			      T5 = W[5];
+			      TY = T8 * Tc;
+			      Ta = T8 * T9;
+			      Tb = W[11];
+			      TX = FNMS(T5, T3, TW);
+			      T7 = FMA(T5, T6, T4);
+			 }
+			 {
+			      E Th, Tk, Ti, T12, Tn, Tq, Tp, T17, Tx, T14, To, Tj, TZ, Td, Tg;
+			      E TA, Tl, Ty;
+			      Th = cr[WS(rs, 1)];
+			      TZ = FNMS(Tb, T9, TY);
+			      Td = FMA(Tb, Tc, Ta);
+			      Tk = ci[WS(rs, 1)];
+			      Tg = W[0];
+			      T1Q = TX + TZ;
+			      T10 = TX - TZ;
+			      T1S = Td - T7;
+			      Te = T7 + Td;
+			      Ti = Tg * Th;
+			      T12 = Tg * Tk;
+			      {
+				   E Tt, Tw, Ts, Tv, T16, Tu, Tm;
+				   Tt = cr[WS(rs, 7)];
+				   Tw = ci[WS(rs, 7)];
+				   Ts = W[12];
+				   Tv = W[13];
+				   Tn = cr[WS(rs, 4)];
+				   Tq = ci[WS(rs, 4)];
+				   T16 = Ts * Tw;
+				   Tu = Ts * Tt;
+				   Tm = W[6];
+				   Tp = W[7];
+				   T17 = FNMS(Tv, Tt, T16);
+				   Tx = FMA(Tv, Tw, Tu);
+				   T14 = Tm * Tq;
+				   To = Tm * Tn;
+			      }
+			      Tj = W[1];
+			      TB = cr[WS(rs, 2)];
+			      {
+				   E T15, Tr, T13, T18;
+				   T15 = FNMS(Tp, Tn, T14);
+				   Tr = FMA(Tp, Tq, To);
+				   T13 = FNMS(Tj, Th, T12);
+				   Tl = FMA(Tj, Tk, Ti);
+				   T18 = T15 + T17;
+				   T1d = T15 - T17;
+				   Ty = Tr + Tx;
+				   T1a = Tr - Tx;
+				   T19 = FNMS(KP500000000, T18, T13);
+				   T1M = T13 + T18;
+				   TE = ci[WS(rs, 2)];
+			      }
+			      T1c = FNMS(KP500000000, Ty, Tl);
+			      Tz = Tl + Ty;
+			      TA = W[2];
+			      {
+				   E TN, TQ, TP, T1j, TO, TM;
+				   TN = cr[WS(rs, 8)];
+				   TQ = ci[WS(rs, 8)];
+				   TM = W[14];
+				   T1n = TA * TE;
+				   TC = TA * TB;
+				   TP = W[15];
+				   T1j = TM * TQ;
+				   TO = TM * TN;
+				   TH = cr[WS(rs, 5)];
+				   TK = ci[WS(rs, 5)];
+				   T1k = FNMS(TP, TN, T1j);
+				   TR = FMA(TP, TQ, TO);
+				   TG = W[8];
+				   TJ = W[9];
+			      }
+			      TD = W[3];
+			 }
+		    }
+		    {
+			 E TV, Tf, T21, T1R, T1l, T1r, T1q, T1N, TT, T1g;
+			 {
+			      E T1o, TF, T1i, TL, T1h, TI, TS, T1p;
+			      TV = FNMS(KP500000000, Te, T1);
+			      Tf = T1 + Te;
+			      T1h = TG * TK;
+			      TI = TG * TH;
+			      T1o = FNMS(TD, TB, T1n);
+			      TF = FMA(TD, TE, TC);
+			      T1i = FNMS(TJ, TH, T1h);
+			      TL = FMA(TJ, TK, TI);
+			      T21 = T1Q + T1P;
+			      T1R = FNMS(KP500000000, T1Q, T1P);
+			      T1p = T1i + T1k;
+			      T1l = T1i - T1k;
+			      TS = TL + TR;
+			      T1r = TR - TL;
+			      T1q = FNMS(KP500000000, T1p, T1o);
+			      T1N = T1o + T1p;
+			      TT = TF + TS;
+			      T1g = FNMS(KP500000000, TS, TF);
+			 }
+			 {
+			      E T11, T1z, T1E, T1D, T1X, T1T, T1I, T1C, T1Y, T1y, T1u, T24, TU;
+			      T24 = TT - Tz;
+			      TU = Tz + TT;
+			      {
+				   E T22, T1O, T1L, T23;
+				   T22 = T1M + T1N;
+				   T1O = T1M - T1N;
+				   T11 = FNMS(KP866025403, T10, TV);
+				   T1z = FMA(KP866025403, T10, TV);
+				   T1L = FNMS(KP500000000, TU, Tf);
+				   cr[0] = Tf + TU;
+				   T23 = FNMS(KP500000000, T22, T21);
+				   ci[WS(rs, 8)] = T22 + T21;
+				   cr[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
+				   ci[WS(rs, 2)] = FNMS(KP866025403, T1O, T1L);
+				   ci[WS(rs, 5)] = FMA(KP866025403, T24, T23);
+				   cr[WS(rs, 6)] = FMS(KP866025403, T24, T23);
+			      }
+			      {
+				   E T1B, T1m, T1w, T1f, T1s, T1A, T1b, T1e, T1x, T1t;
+				   T1E = FNMS(KP866025403, T1a, T19);
+				   T1b = FMA(KP866025403, T1a, T19);
+				   T1e = FNMS(KP866025403, T1d, T1c);
+				   T1D = FMA(KP866025403, T1d, T1c);
+				   T1B = FMA(KP866025403, T1l, T1g);
+				   T1m = FNMS(KP866025403, T1l, T1g);
+				   T1X = FNMS(KP866025403, T1S, T1R);
+				   T1T = FMA(KP866025403, T1S, T1R);
+				   T1w = FNMS(KP176326980, T1b, T1e);
+				   T1f = FMA(KP176326980, T1e, T1b);
+				   T1s = FNMS(KP866025403, T1r, T1q);
+				   T1A = FMA(KP866025403, T1r, T1q);
+				   T1x = FMA(KP363970234, T1m, T1s);
+				   T1t = FNMS(KP363970234, T1s, T1m);
+				   T1I = FNMS(KP176326980, T1A, T1B);
+				   T1C = FMA(KP176326980, T1B, T1A);
+				   T1Y = FMA(KP954188894, T1x, T1w);
+				   T1y = FNMS(KP954188894, T1x, T1w);
+				   T20 = FMA(KP954188894, T1t, T1f);
+				   T1u = FNMS(KP954188894, T1t, T1f);
+			      }
+			      {
+				   E T1F, T1J, T1v, T1U, T1K;
+				   ci[WS(rs, 6)] = FNMS(KP984807753, T1Y, T1X);
+				   T1v = FNMS(KP492403876, T1u, T11);
+				   cr[WS(rs, 2)] = FMA(KP984807753, T1u, T11);
+				   T1F = FMA(KP839099631, T1E, T1D);
+				   T1J = FNMS(KP839099631, T1D, T1E);
+				   ci[WS(rs, 3)] = FNMS(KP852868531, T1y, T1v);
+				   ci[0] = FMA(KP852868531, T1y, T1v);
+				   T1U = FNMS(KP777861913, T1J, T1I);
+				   T1K = FMA(KP777861913, T1J, T1I);
+				   {
+					E T1G, T1W, T1V, T1H;
+					T1G = FMA(KP777861913, T1F, T1C);
+					T1W = FNMS(KP777861913, T1F, T1C);
+					T1Z = FMA(KP492403876, T1Y, T1X);
+					T1V = FMA(KP492403876, T1U, T1T);
+					ci[WS(rs, 7)] = FNMS(KP984807753, T1U, T1T);
+					T1H = FNMS(KP492403876, T1G, T1z);
+					cr[WS(rs, 1)] = FMA(KP984807753, T1G, T1z);
+					ci[WS(rs, 4)] = FMA(KP852868531, T1W, T1V);
+					cr[WS(rs, 7)] = FMS(KP852868531, T1W, T1V);
+					cr[WS(rs, 4)] = FMA(KP852868531, T1K, T1H);
+					ci[WS(rs, 1)] = FNMS(KP852868531, T1K, T1H);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       cr[WS(rs, 8)] = -(FMA(KP852868531, T20, T1Z));
+	       cr[WS(rs, 5)] = FMS(KP852868531, T20, T1Z);
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, {24, 16, 72, 0} };
+
+void X(codelet_hf_9) (planner *p) {
+     X(khc2hc_register) (p, hf_9, &desc);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include hf.h */
+
+/*
+ * This function contains 96 FP additions, 72 FP multiplications,
+ * (or, 60 additions, 36 multiplications, 36 fused multiply/add),
+ * 41 stack variables, 8 constants, and 36 memory accesses
+ */
+#include "hf.h"
+
+static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
+	       E T1, T1B, TQ, T1A, Tc, TN, T1C, T1D, TL, T1x, T19, T1o, T1c, T1n, Tu;
+	       E T1w, TW, T1k, T11, T1l;
+	       {
+		    E T6, TO, Tb, TP;
+		    T1 = cr[0];
+		    T1B = ci[0];
+		    {
+			 E T3, T5, T2, T4;
+			 T3 = cr[WS(rs, 3)];
+			 T5 = ci[WS(rs, 3)];
+			 T2 = W[4];
+			 T4 = W[5];
+			 T6 = FMA(T2, T3, T4 * T5);
+			 TO = FNMS(T4, T3, T2 * T5);
+		    }
+		    {
+			 E T8, Ta, T7, T9;
+			 T8 = cr[WS(rs, 6)];
+			 Ta = ci[WS(rs, 6)];
+			 T7 = W[10];
+			 T9 = W[11];
+			 Tb = FMA(T7, T8, T9 * Ta);
+			 TP = FNMS(T9, T8, T7 * Ta);
+		    }
+		    TQ = KP866025403 * (TO - TP);
+		    T1A = KP866025403 * (Tb - T6);
+		    Tc = T6 + Tb;
+		    TN = FNMS(KP500000000, Tc, T1);
+		    T1C = TO + TP;
+		    T1D = FNMS(KP500000000, T1C, T1B);
+	       }
+	       {
+		    E Tz, T13, TE, T14, TJ, T15, TK, T16;
+		    {
+			 E Tw, Ty, Tv, Tx;
+			 Tw = cr[WS(rs, 2)];
+			 Ty = ci[WS(rs, 2)];
+			 Tv = W[2];
+			 Tx = W[3];
+			 Tz = FMA(Tv, Tw, Tx * Ty);
+			 T13 = FNMS(Tx, Tw, Tv * Ty);
+		    }
+		    {
+			 E TB, TD, TA, TC;
+			 TB = cr[WS(rs, 5)];
+			 TD = ci[WS(rs, 5)];
+			 TA = W[8];
+			 TC = W[9];
+			 TE = FMA(TA, TB, TC * TD);
+			 T14 = FNMS(TC, TB, TA * TD);
+		    }
+		    {
+			 E TG, TI, TF, TH;
+			 TG = cr[WS(rs, 8)];
+			 TI = ci[WS(rs, 8)];
+			 TF = W[14];
+			 TH = W[15];
+			 TJ = FMA(TF, TG, TH * TI);
+			 T15 = FNMS(TH, TG, TF * TI);
+		    }
+		    TK = TE + TJ;
+		    T16 = T14 + T15;
+		    TL = Tz + TK;
+		    T1x = T13 + T16;
+		    {
+			 E T17, T18, T1a, T1b;
+			 T17 = FNMS(KP500000000, T16, T13);
+			 T18 = KP866025403 * (TJ - TE);
+			 T19 = T17 - T18;
+			 T1o = T18 + T17;
+			 T1a = FNMS(KP500000000, TK, Tz);
+			 T1b = KP866025403 * (T14 - T15);
+			 T1c = T1a - T1b;
+			 T1n = T1a + T1b;
+		    }
+	       }
+	       {
+		    E Ti, TX, Tn, TT, Ts, TU, Tt, TY;
+		    {
+			 E Tf, Th, Te, Tg;
+			 Tf = cr[WS(rs, 1)];
+			 Th = ci[WS(rs, 1)];
+			 Te = W[0];
+			 Tg = W[1];
+			 Ti = FMA(Te, Tf, Tg * Th);
+			 TX = FNMS(Tg, Tf, Te * Th);
+		    }
+		    {
+			 E Tk, Tm, Tj, Tl;
+			 Tk = cr[WS(rs, 4)];
+			 Tm = ci[WS(rs, 4)];
+			 Tj = W[6];
+			 Tl = W[7];
+			 Tn = FMA(Tj, Tk, Tl * Tm);
+			 TT = FNMS(Tl, Tk, Tj * Tm);
+		    }
+		    {
+			 E Tp, Tr, To, Tq;
+			 Tp = cr[WS(rs, 7)];
+			 Tr = ci[WS(rs, 7)];
+			 To = W[12];
+			 Tq = W[13];
+			 Ts = FMA(To, Tp, Tq * Tr);
+			 TU = FNMS(Tq, Tp, To * Tr);
+		    }
+		    Tt = Tn + Ts;
+		    TY = TT + TU;
+		    Tu = Ti + Tt;
+		    T1w = TX + TY;
+		    {
+			 E TS, TV, TZ, T10;
+			 TS = FNMS(KP500000000, Tt, Ti);
+			 TV = KP866025403 * (TT - TU);
+			 TW = TS - TV;
+			 T1k = TS + TV;
+			 TZ = FNMS(KP500000000, TY, TX);
+			 T10 = KP866025403 * (Ts - Tn);
+			 T11 = TZ - T10;
+			 T1l = T10 + TZ;
+		    }
+	       }
+	       {
+		    E T1y, Td, TM, T1v;
+		    T1y = KP866025403 * (T1w - T1x);
+		    Td = T1 + Tc;
+		    TM = Tu + TL;
+		    T1v = FNMS(KP500000000, TM, Td);
+		    cr[0] = Td + TM;
+		    cr[WS(rs, 3)] = T1v + T1y;
+		    ci[WS(rs, 2)] = T1v - T1y;
+	       }
+	       {
+		    E TR, T1I, T1e, T1K, T1i, T1H, T1f, T1J;
+		    TR = TN - TQ;
+		    T1I = T1D - T1A;
+		    {
+			 E T12, T1d, T1g, T1h;
+			 T12 = FMA(KP173648177, TW, KP984807753 * T11);
+			 T1d = FNMS(KP939692620, T1c, KP342020143 * T19);
+			 T1e = T12 + T1d;
+			 T1K = KP866025403 * (T1d - T12);
+			 T1g = FNMS(KP984807753, TW, KP173648177 * T11);
+			 T1h = FMA(KP342020143, T1c, KP939692620 * T19);
+			 T1i = KP866025403 * (T1g + T1h);
+			 T1H = T1g - T1h;
+		    }
+		    cr[WS(rs, 2)] = TR + T1e;
+		    ci[WS(rs, 6)] = T1H + T1I;
+		    T1f = FNMS(KP500000000, T1e, TR);
+		    ci[0] = T1f - T1i;
+		    ci[WS(rs, 3)] = T1f + T1i;
+		    T1J = FMS(KP500000000, T1H, T1I);
+		    cr[WS(rs, 5)] = T1J - T1K;
+		    cr[WS(rs, 8)] = T1K + T1J;
+	       }
+	       {
+		    E T1L, T1M, T1N, T1O;
+		    T1L = KP866025403 * (TL - Tu);
+		    T1M = T1C + T1B;
+		    T1N = T1w + T1x;
+		    T1O = FNMS(KP500000000, T1N, T1M);
+		    cr[WS(rs, 6)] = T1L - T1O;
+		    ci[WS(rs, 8)] = T1N + T1M;
+		    ci[WS(rs, 5)] = T1L + T1O;
+	       }
+	       {
+		    E T1j, T1E, T1q, T1z, T1u, T1F, T1r, T1G;
+		    T1j = TN + TQ;
+		    T1E = T1A + T1D;
+		    {
+			 E T1m, T1p, T1s, T1t;
+			 T1m = FMA(KP766044443, T1k, KP642787609 * T1l);
+			 T1p = FMA(KP173648177, T1n, KP984807753 * T1o);
+			 T1q = T1m + T1p;
+			 T1z = KP866025403 * (T1p - T1m);
+			 T1s = FNMS(KP642787609, T1k, KP766044443 * T1l);
+			 T1t = FNMS(KP984807753, T1n, KP173648177 * T1o);
+			 T1u = KP866025403 * (T1s - T1t);
+			 T1F = T1s + T1t;
+		    }
+		    cr[WS(rs, 1)] = T1j + T1q;
+		    T1r = FNMS(KP500000000, T1q, T1j);
+		    ci[WS(rs, 1)] = T1r - T1u;
+		    cr[WS(rs, 4)] = T1r + T1u;
+		    ci[WS(rs, 7)] = T1F + T1E;
+		    T1G = FNMS(KP500000000, T1F, T1E);
+		    cr[WS(rs, 7)] = T1z - T1G;
+		    ci[WS(rs, 4)] = T1z + T1G;
+	       }
+	  }
+     }
+}
+
+static const tw_instr twinstr[] = {
+     {TW_FULL, 1, 9},
+     {TW_NEXT, 1, 0}
+};
+
+static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, {60, 36, 36, 0} };
+
+void X(codelet_hf_9) (planner *p) {
+     X(khc2hc_register) (p, hf_9, &desc);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 32 FP additions, 18 FP multiplications,
+ * (or, 14 additions, 0 multiplications, 18 fused multiply/add),
+ * 37 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E Tq, Ti, Tk, Tu, Tw, Tp, Tb, Tj, Tr, Tv;
+	       {
+		    E T1, To, Ts, Tt, T8, Ta, Te, Tm, Tl, Th, Tn, T9;
+		    T1 = R0[0];
+		    To = R1[WS(rs, 2)];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = R0[WS(rs, 2)];
+			 T3 = R0[WS(rs, 3)];
+			 T5 = R0[WS(rs, 4)];
+			 T6 = R0[WS(rs, 1)];
+			 {
+			      E Tc, T4, T7, Td, Tf, Tg;
+			      Tc = R1[0];
+			      Ts = T2 + T3;
+			      T4 = T2 - T3;
+			      Tt = T5 + T6;
+			      T7 = T5 - T6;
+			      Td = R1[WS(rs, 4)];
+			      Tf = R1[WS(rs, 1)];
+			      Tg = R1[WS(rs, 3)];
+			      T8 = T4 + T7;
+			      Ta = T4 - T7;
+			      Te = Tc - Td;
+			      Tm = Tc + Td;
+			      Tl = Tf + Tg;
+			      Th = Tf - Tg;
+			 }
+		    }
+		    Cr[WS(csr, 2)] = T1 + T8;
+		    Tn = Tl - Tm;
+		    Tq = Tm + Tl;
+		    Ti = FMA(KP618033988, Th, Te);
+		    Tk = FNMS(KP618033988, Te, Th);
+		    Ci[WS(csi, 2)] = Tn - To;
+		    T9 = FNMS(KP250000000, T8, T1);
+		    Tu = FMA(KP618033988, Tt, Ts);
+		    Tw = FNMS(KP618033988, Ts, Tt);
+		    Tp = FMA(KP250000000, Tn, To);
+		    Tb = FMA(KP559016994, Ta, T9);
+		    Tj = FNMS(KP559016994, Ta, T9);
+	       }
+	       Tr = FMA(KP559016994, Tq, Tp);
+	       Tv = FNMS(KP559016994, Tq, Tp);
+	       Cr[WS(csr, 1)] = FNMS(KP951056516, Tk, Tj);
+	       Cr[WS(csr, 3)] = FMA(KP951056516, Tk, Tj);
+	       Cr[0] = FMA(KP951056516, Ti, Tb);
+	       Cr[WS(csr, 4)] = FNMS(KP951056516, Ti, Tb);
+	       Ci[WS(csi, 1)] = FNMS(KP951056516, Tw, Tv);
+	       Ci[WS(csi, 3)] = FMA(KP951056516, Tw, Tv);
+	       Ci[WS(csi, 4)] = FMS(KP951056516, Tu, Tr);
+	       Ci[0] = -(FMA(KP951056516, Tu, Tr));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cfII_10", {14, 0, 18, 0}, &GENUS };
+
+void X(codelet_r2cfII_10) (planner *p) {
+     X(kr2c_register) (p, r2cfII_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 32 FP additions, 12 FP multiplications,
+ * (or, 26 additions, 6 multiplications, 6 fused multiply/add),
+ * 21 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E T1, To, T8, Tq, T9, Tp, Te, Ts, Th, Tn;
+	       T1 = R0[0];
+	       To = R1[WS(rs, 2)];
+	       {
+		    E T2, T3, T4, T5, T6, T7;
+		    T2 = R0[WS(rs, 2)];
+		    T3 = R0[WS(rs, 3)];
+		    T4 = T2 - T3;
+		    T5 = R0[WS(rs, 4)];
+		    T6 = R0[WS(rs, 1)];
+		    T7 = T5 - T6;
+		    T8 = T4 + T7;
+		    Tq = T5 + T6;
+		    T9 = KP559016994 * (T4 - T7);
+		    Tp = T2 + T3;
+	       }
+	       {
+		    E Tc, Td, Tm, Tf, Tg, Tl;
+		    Tc = R1[0];
+		    Td = R1[WS(rs, 4)];
+		    Tm = Tc + Td;
+		    Tf = R1[WS(rs, 1)];
+		    Tg = R1[WS(rs, 3)];
+		    Tl = Tf + Tg;
+		    Te = Tc - Td;
+		    Ts = KP559016994 * (Tm + Tl);
+		    Th = Tf - Tg;
+		    Tn = Tl - Tm;
+	       }
+	       Cr[WS(csr, 2)] = T1 + T8;
+	       Ci[WS(csi, 2)] = Tn - To;
+	       {
+		    E Ti, Tk, Tb, Tj, Ta;
+		    Ti = FMA(KP951056516, Te, KP587785252 * Th);
+		    Tk = FNMS(KP587785252, Te, KP951056516 * Th);
+		    Ta = FNMS(KP250000000, T8, T1);
+		    Tb = T9 + Ta;
+		    Tj = Ta - T9;
+		    Cr[WS(csr, 4)] = Tb - Ti;
+		    Cr[WS(csr, 3)] = Tj + Tk;
+		    Cr[0] = Tb + Ti;
+		    Cr[WS(csr, 1)] = Tj - Tk;
+	       }
+	       {
+		    E Tr, Tw, Tu, Tv, Tt;
+		    Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
+		    Tw = FNMS(KP587785252, Tp, KP951056516 * Tq);
+		    Tt = FMA(KP250000000, Tn, To);
+		    Tu = Ts + Tt;
+		    Tv = Tt - Ts;
+		    Ci[0] = -(Tr + Tu);
+		    Ci[WS(csi, 3)] = Tw + Tv;
+		    Ci[WS(csi, 4)] = Tr - Tu;
+		    Ci[WS(csi, 1)] = Tv - Tw;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cfII_10", {26, 6, 6, 0}, &GENUS };
+
+void X(codelet_r2cfII_10) (planner *p) {
+     X(kr2c_register) (p, r2cfII_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 45 FP additions, 24 FP multiplications,
+ * (or, 21 additions, 0 multiplications, 24 fused multiply/add),
+ * 37 stack variables, 3 constants, and 24 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E TD, TB, Tp, T9, Tq, Tr, TE, To, Ts, TC;
+	       {
+		    E T8, T1, Tv, Tm, TF, Tz, Tl, Ta, Tb, Tt, TA, T4, Tc;
+		    {
+			 E Tx, Th, Ti, Tj, Ty, T6, T7, T2, T3, Tk;
+			 Tx = R0[WS(rs, 3)];
+			 T6 = R0[WS(rs, 5)];
+			 T7 = R0[WS(rs, 1)];
+			 Th = R1[WS(rs, 4)];
+			 Ti = R1[WS(rs, 2)];
+			 Tj = R1[0];
+			 Ty = T6 + T7;
+			 T8 = T6 - T7;
+			 T1 = R0[0];
+			 Tv = Ti - Tj - Th;
+			 Tk = Ti - Tj;
+			 Tm = Ti + Tj;
+			 TF = Tx - Ty;
+			 Tz = FMA(KP500000000, Ty, Tx);
+			 T2 = R0[WS(rs, 2)];
+			 T3 = R0[WS(rs, 4)];
+			 Tl = FMA(KP500000000, Tk, Th);
+			 Ta = R1[WS(rs, 1)];
+			 Tb = R1[WS(rs, 3)];
+			 Tt = T1 + T3 - T2;
+			 TA = T3 + T2;
+			 T4 = T2 - T3;
+			 Tc = R1[WS(rs, 5)];
+		    }
+		    {
+			 E Tn, Tg, T5, Tu;
+			 TD = FNMS(KP866025403, TA, Tz);
+			 TB = FMA(KP866025403, TA, Tz);
+			 T5 = FMA(KP500000000, T4, T1);
+			 Tu = Ta + Tc - Tb;
+			 {
+			      E Td, Tf, TG, Tw, Te;
+			      Td = Tb - Tc;
+			      Tf = Tc + Tb;
+			      Tp = FMA(KP866025403, T8, T5);
+			      T9 = FNMS(KP866025403, T8, T5);
+			      TG = Tv - Tu;
+			      Tw = Tu + Tv;
+			      Te = FMA(KP500000000, Td, Ta);
+			      Tq = FMA(KP866025403, Tm, Tl);
+			      Tn = FNMS(KP866025403, Tm, Tl);
+			      Ci[WS(csi, 1)] = FMA(KP707106781, TG, TF);
+			      Ci[WS(csi, 4)] = FMS(KP707106781, TG, TF);
+			      Cr[WS(csr, 4)] = FMA(KP707106781, Tw, Tt);
+			      Cr[WS(csr, 1)] = FNMS(KP707106781, Tw, Tt);
+			      Tg = FNMS(KP866025403, Tf, Te);
+			      Tr = FMA(KP866025403, Tf, Te);
+			 }
+			 TE = Tg + Tn;
+			 To = Tg - Tn;
+		    }
+	       }
+	       Ci[WS(csi, 2)] = FMS(KP707106781, TE, TD);
+	       Ci[WS(csi, 3)] = FMA(KP707106781, TE, TD);
+	       Cr[0] = FMA(KP707106781, To, T9);
+	       Cr[WS(csr, 5)] = FNMS(KP707106781, To, T9);
+	       Ts = Tq - Tr;
+	       TC = Tr + Tq;
+	       Ci[0] = -(FMA(KP707106781, TC, TB));
+	       Ci[WS(csi, 5)] = FNMS(KP707106781, TC, TB);
+	       Cr[WS(csr, 2)] = FMA(KP707106781, Ts, Tp);
+	       Cr[WS(csr, 3)] = FNMS(KP707106781, Ts, Tp);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cfII_12", {21, 0, 24, 0}, &GENUS };
+
+void X(codelet_r2cfII_12) (planner *p) {
+     X(kr2c_register) (p, r2cfII_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 43 FP additions, 12 FP multiplications,
+ * (or, 39 additions, 8 multiplications, 4 fused multiply/add),
+ * 28 stack variables, 5 constants, and 24 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP612372435, +0.612372435695794524549321018676472847991486870);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E Tx, Tg, T4, Tz, Ty, Tj, TA, T9, Tm, Tl, Te, Tp, To, Tf, TE;
+	       E TF;
+	       {
+		    E T1, T3, T2, Th, Ti;
+		    T1 = R0[0];
+		    T3 = R0[WS(rs, 2)];
+		    T2 = R0[WS(rs, 4)];
+		    Tx = KP866025403 * (T2 + T3);
+		    Tg = FMA(KP500000000, T3 - T2, T1);
+		    T4 = T1 + T2 - T3;
+		    Tz = R0[WS(rs, 3)];
+		    Th = R0[WS(rs, 5)];
+		    Ti = R0[WS(rs, 1)];
+		    Ty = Th + Ti;
+		    Tj = KP866025403 * (Th - Ti);
+		    TA = FMA(KP500000000, Ty, Tz);
+	       }
+	       {
+		    E T5, T6, T7, T8;
+		    T5 = R1[WS(rs, 1)];
+		    T6 = R1[WS(rs, 5)];
+		    T7 = R1[WS(rs, 3)];
+		    T8 = T6 - T7;
+		    T9 = T5 + T8;
+		    Tm = KP612372435 * (T6 + T7);
+		    Tl = FNMS(KP353553390, T8, KP707106781 * T5);
+	       }
+	       {
+		    E Td, Ta, Tb, Tc;
+		    Td = R1[WS(rs, 4)];
+		    Ta = R1[WS(rs, 2)];
+		    Tb = R1[0];
+		    Tc = Ta - Tb;
+		    Te = Tc - Td;
+		    Tp = FMA(KP353553390, Tc, KP707106781 * Td);
+		    To = KP612372435 * (Ta + Tb);
+	       }
+	       Tf = KP707106781 * (T9 + Te);
+	       Cr[WS(csr, 1)] = T4 - Tf;
+	       Cr[WS(csr, 4)] = T4 + Tf;
+	       TE = KP707106781 * (Te - T9);
+	       TF = Tz - Ty;
+	       Ci[WS(csi, 4)] = TE - TF;
+	       Ci[WS(csi, 1)] = TE + TF;
+	       {
+		    E Tk, TB, Tr, Tw, Tn, Tq;
+		    Tk = Tg - Tj;
+		    TB = Tx - TA;
+		    Tn = Tl - Tm;
+		    Tq = To - Tp;
+		    Tr = Tn + Tq;
+		    Tw = Tn - Tq;
+		    Cr[WS(csr, 5)] = Tk - Tr;
+		    Ci[WS(csi, 2)] = Tw + TB;
+		    Cr[0] = Tk + Tr;
+		    Ci[WS(csi, 3)] = Tw - TB;
+	       }
+	       {
+		    E Ts, TD, Tv, TC, Tt, Tu;
+		    Ts = Tg + Tj;
+		    TD = Tx + TA;
+		    Tt = To + Tp;
+		    Tu = Tm + Tl;
+		    Tv = Tt - Tu;
+		    TC = Tu + Tt;
+		    Cr[WS(csr, 3)] = Ts - Tv;
+		    Ci[WS(csi, 5)] = TD - TC;
+		    Cr[WS(csr, 2)] = Ts + Tv;
+		    Ci[0] = -(TC + TD);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cfII_12", {39, 8, 4, 0}, &GENUS };
+
+void X(codelet_r2cfII_12) (planner *p) {
+     X(kr2c_register) (p, r2cfII_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 72 FP additions, 41 FP multiplications,
+ * (or, 38 additions, 7 multiplications, 34 fused multiply/add),
+ * 57 stack variables, 12 constants, and 30 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DK(KP552786404, +0.552786404500042060718165266253744752911876328);
+     DK(KP447213595, +0.447213595499957939281834733746255247088123672);
+     DK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E T9, TQ, TV, TW, Tw, TJ;
+	       {
+		    E Ta, Tl, Tg, T8, T7, TF, TX, TT, Tm, Th, TM, TZ, Tr, Tn, Tj;
+		    E Tz, To, TN, TH, Tp, TO;
+		    Ta = R0[WS(rs, 5)];
+		    Tl = R1[WS(rs, 2)];
+		    {
+			 E T1, T2, T5, T3, T4;
+			 T1 = R0[0];
+			 T2 = R0[WS(rs, 3)];
+			 T5 = R1[WS(rs, 4)];
+			 T3 = R0[WS(rs, 6)];
+			 T4 = R1[WS(rs, 1)];
+			 {
+			      E Tb, TL, Te, TK, TR, Tf, Ti, Ty;
+			      Tb = R1[0];
+			      TR = T2 + T5;
+			      Tg = R0[WS(rs, 2)];
+			      {
+				   E T6, TS, Tc, Td;
+				   T6 = T2 + T3 - T4 - T5;
+				   T8 = (T3 + T5 - T2) - T4;
+				   TS = T3 + T4;
+				   Tc = R1[WS(rs, 3)];
+				   Td = R1[WS(rs, 6)];
+				   T7 = FNMS(KP250000000, T6, T1);
+				   TF = T1 + T6;
+				   TX = FNMS(KP618033988, TR, TS);
+				   TT = FMA(KP618033988, TS, TR);
+				   TL = Tc - Td;
+				   Te = Tc + Td;
+			      }
+			      TK = Tg + Tb;
+			      Tm = R0[WS(rs, 7)];
+			      Tf = Tb - Te;
+			      Th = Tb + Te;
+			      TM = FMA(KP618033988, TL, TK);
+			      TZ = FNMS(KP618033988, TK, TL);
+			      Ti = FMA(KP809016994, Th, Tg);
+			      Ty = FMA(KP447213595, Th, Tf);
+			      Tr = R1[WS(rs, 5)];
+			      Tn = R0[WS(rs, 1)];
+			      Tj = FNMS(KP552786404, Ti, Tf);
+			      Tz = FNMS(KP690983005, Ty, Tg);
+			      To = R0[WS(rs, 4)];
+			      TN = Tr + Tm;
+			 }
+		    }
+		    TH = Ta + Tg - Th;
+		    Tp = Tn + To;
+		    TO = To - Tn;
+		    {
+			 E Tx, TA, TP, T14, T11, Tu, TD;
+			 {
+			      E T10, TI, TC, TY;
+			      T9 = FNMS(KP559016994, T8, T7);
+			      Tx = FMA(KP559016994, T8, T7);
+			      TA = FNMS(KP809016994, Tz, Ta);
+			      TP = FMA(KP618033988, TO, TN);
+			      TY = FNMS(KP618033988, TN, TO);
+			      {
+				   E Tq, Ts, TG, Tt, TB;
+				   Tq = Tm - Tp;
+				   Ts = Tm + Tp;
+				   T14 = TZ - TY;
+				   T10 = TY + TZ;
+				   TG = Ts - Tr - Tl;
+				   Tt = FMA(KP809016994, Ts, Tr);
+				   TB = FMA(KP447213595, Ts, Tq);
+				   T11 = FMA(KP500000000, T10, TX);
+				   Ci[WS(csi, 2)] = KP866025403 * (TH - TG);
+				   TI = TG + TH;
+				   Tu = FNMS(KP552786404, Tt, Tq);
+				   TC = FNMS(KP690983005, TB, Tr);
+			      }
+			      Ci[WS(csi, 1)] = KP951056516 * (T10 - TX);
+			      Cr[WS(csr, 7)] = TF + TI;
+			      Cr[WS(csr, 2)] = FNMS(KP500000000, TI, TF);
+			      TD = FNMS(KP809016994, TC, Tl);
+			 }
+			 {
+			      E TU, Tk, T13, Tv, T12, TE;
+			      TQ = TM - TP;
+			      TU = TP + TM;
+			      T12 = TD + TA;
+			      TE = TA - TD;
+			      Tk = FNMS(KP559016994, Tj, Ta);
+			      TV = FMA(KP500000000, TU, TT);
+			      Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP910592997, T12, T11)));
+			      Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP910592997, T12, T11));
+			      T13 = FNMS(KP500000000, TE, Tx);
+			      Cr[WS(csr, 1)] = Tx + TE;
+			      Tv = FNMS(KP559016994, Tu, Tl);
+			      Ci[WS(csi, 4)] = KP951056516 * (TT - TU);
+			      Cr[WS(csr, 6)] = FMA(KP823639103, T14, T13);
+			      Cr[WS(csr, 3)] = FNMS(KP823639103, T14, T13);
+			      TW = Tv + Tk;
+			      Tw = Tk - Tv;
+			 }
+		    }
+	       }
+	       Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP910592997, TW, TV)));
+	       Ci[0] = -(KP951056516 * (FMA(KP910592997, TW, TV)));
+	       TJ = FNMS(KP500000000, Tw, T9);
+	       Cr[WS(csr, 4)] = T9 + Tw;
+	       Cr[0] = FMA(KP823639103, TQ, TJ);
+	       Cr[WS(csr, 5)] = FNMS(KP823639103, TQ, TJ);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cfII_15", {38, 7, 34, 0}, &GENUS };
+
+void X(codelet_r2cfII_15) (planner *p) {
+     X(kr2c_register) (p, r2cfII_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 72 FP additions, 33 FP multiplications,
+ * (or, 54 additions, 15 multiplications, 18 fused multiply/add),
+ * 37 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E T1, T2, Tx, TR, TE, T7, TD, Th, Tm, Tr, TQ, TA, TB, Tf, Te;
+	       E Tu, TS, Td, TH, TO;
+	       T1 = R0[WS(rs, 5)];
+	       {
+		    E T3, Tv, T6, Tw, T4, T5;
+		    T2 = R0[WS(rs, 2)];
+		    T3 = R1[0];
+		    Tv = T2 + T3;
+		    T4 = R1[WS(rs, 3)];
+		    T5 = R1[WS(rs, 6)];
+		    T6 = T4 + T5;
+		    Tw = T4 - T5;
+		    Tx = FMA(KP951056516, Tv, KP587785252 * Tw);
+		    TR = FNMS(KP587785252, Tv, KP951056516 * Tw);
+		    TE = KP559016994 * (T3 - T6);
+		    T7 = T3 + T6;
+		    TD = KP250000000 * T7;
+	       }
+	       {
+		    E Ti, Tl, Tj, Tk, Tp, Tq;
+		    Th = R0[0];
+		    Ti = R1[WS(rs, 4)];
+		    Tl = R0[WS(rs, 6)];
+		    Tj = R1[WS(rs, 1)];
+		    Tk = R0[WS(rs, 3)];
+		    Tp = Tk + Ti;
+		    Tq = Tl + Tj;
+		    Tm = Ti + Tj - (Tk + Tl);
+		    Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
+		    TQ = FNMS(KP951056516, Tq, KP587785252 * Tp);
+		    TA = FMA(KP250000000, Tm, Th);
+		    TB = KP559016994 * (Tl + Ti - (Tk + Tj));
+	       }
+	       {
+		    E T9, Tt, Tc, Ts, Ta, Tb, TG;
+		    Tf = R1[WS(rs, 2)];
+		    T9 = R0[WS(rs, 7)];
+		    Te = R1[WS(rs, 5)];
+		    Tt = T9 + Te;
+		    Ta = R0[WS(rs, 1)];
+		    Tb = R0[WS(rs, 4)];
+		    Tc = Ta + Tb;
+		    Ts = Ta - Tb;
+		    Tu = FNMS(KP951056516, Tt, KP587785252 * Ts);
+		    TS = FMA(KP951056516, Ts, KP587785252 * Tt);
+		    Td = T9 + Tc;
+		    TG = KP559016994 * (T9 - Tc);
+		    TH = FNMS(KP309016994, Te, TG) + FNMA(KP250000000, Td, Tf);
+		    TO = FMS(KP809016994, Te, Tf) + FNMA(KP250000000, Td, TG);
+	       }
+	       {
+		    E Tn, T8, Tg, To;
+		    Tn = Th - Tm;
+		    T8 = T1 + T2 - T7;
+		    Tg = Td - Te - Tf;
+		    To = T8 + Tg;
+		    Ci[WS(csi, 2)] = KP866025403 * (T8 - Tg);
+		    Cr[WS(csr, 2)] = FNMS(KP500000000, To, Tn);
+		    Cr[WS(csr, 7)] = Tn + To;
+	       }
+	       {
+		    E TM, TX, TT, TV, TP, TU, TN, TW;
+		    TM = TB + TA;
+		    TX = KP866025403 * (TR + TS);
+		    TT = TR - TS;
+		    TV = FMS(KP500000000, TT, TQ);
+		    TN = T1 + TE + FNMS(KP809016994, T2, TD);
+		    TP = TN + TO;
+		    TU = KP866025403 * (TO - TN);
+		    Cr[WS(csr, 1)] = TM + TP;
+		    Ci[WS(csi, 1)] = TQ + TT;
+		    Ci[WS(csi, 6)] = TU - TV;
+		    Ci[WS(csi, 3)] = TU + TV;
+		    TW = FNMS(KP500000000, TP, TM);
+		    Cr[WS(csr, 3)] = TW - TX;
+		    Cr[WS(csr, 6)] = TW + TX;
+	       }
+	       {
+		    E Tz, TC, Ty, TK, TI, TL, TF, TJ;
+		    Tz = KP866025403 * (Tx + Tu);
+		    TC = TA - TB;
+		    Ty = Tu - Tx;
+		    TK = FMS(KP500000000, Ty, Tr);
+		    TF = FMA(KP309016994, T2, T1) + TD - TE;
+		    TI = TF + TH;
+		    TL = KP866025403 * (TH - TF);
+		    Ci[WS(csi, 4)] = Tr + Ty;
+		    Cr[WS(csr, 4)] = TC + TI;
+		    Ci[WS(csi, 5)] = TK - TL;
+		    Ci[0] = TK + TL;
+		    TJ = FNMS(KP500000000, TI, TC);
+		    Cr[0] = Tz + TJ;
+		    Cr[WS(csr, 5)] = TJ - Tz;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cfII_15", {54, 15, 18, 0}, &GENUS };
+
+void X(codelet_r2cfII_15) (planner *p) {
+     X(kr2c_register) (p, r2cfII_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:15 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 66 FP additions, 48 FP multiplications,
+ * (or, 18 additions, 0 multiplications, 48 fused multiply/add),
+ * 54 stack variables, 7 constants, and 32 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E TN, TF, TX, TV, TO, TP, TY, TM, TQ, TW;
+	       {
+		    E TT, TZ, TB, T5, Tu, TK, TJ, Tr, T9, TC, T8, Tl, TH, TG, Ti;
+		    E Ta;
+		    {
+			 E T1, TR, Tn, Ts, To, TS, T4, Tp, T2, T3;
+			 T1 = R0[0];
+			 TR = R0[WS(rs, 4)];
+			 T2 = R0[WS(rs, 2)];
+			 T3 = R0[WS(rs, 6)];
+			 Tn = R1[WS(rs, 7)];
+			 Ts = R1[WS(rs, 3)];
+			 To = R1[WS(rs, 1)];
+			 TS = T2 + T3;
+			 T4 = T2 - T3;
+			 Tp = R1[WS(rs, 5)];
+			 {
+			      E Te, Tj, Tf, Tg, Tt, Tq;
+			      Te = R1[0];
+			      TT = FMA(KP707106781, TS, TR);
+			      TZ = FNMS(KP707106781, TS, TR);
+			      TB = FMA(KP707106781, T4, T1);
+			      T5 = FNMS(KP707106781, T4, T1);
+			      Tt = To + Tp;
+			      Tq = To - Tp;
+			      Tj = R1[WS(rs, 4)];
+			      Tf = R1[WS(rs, 2)];
+			      Tu = FNMS(KP707106781, Tt, Ts);
+			      TK = FMA(KP707106781, Tt, Ts);
+			      TJ = FMS(KP707106781, Tq, Tn);
+			      Tr = FMA(KP707106781, Tq, Tn);
+			      Tg = R1[WS(rs, 6)];
+			      {
+				   E T6, T7, Tk, Th;
+				   T6 = R0[WS(rs, 5)];
+				   T7 = R0[WS(rs, 1)];
+				   T9 = R0[WS(rs, 3)];
+				   Tk = Tf + Tg;
+				   Th = Tf - Tg;
+				   TC = FNMS(KP414213562, T6, T7);
+				   T8 = FMA(KP414213562, T7, T6);
+				   Tl = FNMS(KP707106781, Tk, Tj);
+				   TH = FMA(KP707106781, Tk, Tj);
+				   TG = FMA(KP707106781, Th, Te);
+				   Ti = FNMS(KP707106781, Th, Te);
+				   Ta = R0[WS(rs, 7)];
+			      }
+			 }
+		    }
+		    {
+			 E TE, TU, Ty, Tv, TI, TL;
+			 Ty = FNMS(KP668178637, Tr, Tu);
+			 Tv = FMA(KP668178637, Tu, Tr);
+			 {
+			      E Tw, T14, T12, TA, T11, T13, Tx, Td;
+			      {
+				   E Tz, Tm, TD, Tb, T10, Tc;
+				   Tz = FNMS(KP668178637, Ti, Tl);
+				   Tm = FMA(KP668178637, Tl, Ti);
+				   TD = FMS(KP414213562, T9, Ta);
+				   Tb = FMA(KP414213562, Ta, T9);
+				   Tw = Tm - Tv;
+				   T14 = Tm + Tv;
+				   T10 = TD - TC;
+				   TE = TC + TD;
+				   Tc = T8 - Tb;
+				   TU = T8 + Tb;
+				   T12 = Tz + Ty;
+				   TA = Ty - Tz;
+				   T11 = FMA(KP923879532, T10, TZ);
+				   T13 = FNMS(KP923879532, T10, TZ);
+				   Tx = FNMS(KP923879532, Tc, T5);
+				   Td = FMA(KP923879532, Tc, T5);
+			      }
+			      Ci[WS(csi, 2)] = -(FMA(KP831469612, T14, T13));
+			      Ci[WS(csi, 5)] = FNMS(KP831469612, T14, T13);
+			      Cr[WS(csr, 1)] = FMA(KP831469612, Tw, Td);
+			      Cr[WS(csr, 6)] = FNMS(KP831469612, Tw, Td);
+			      Cr[WS(csr, 5)] = FNMS(KP831469612, TA, Tx);
+			      Ci[WS(csi, 1)] = FMA(KP831469612, T12, T11);
+			      Cr[WS(csr, 2)] = FMA(KP831469612, TA, Tx);
+			      Ci[WS(csi, 6)] = FMS(KP831469612, T12, T11);
+			 }
+			 TN = FNMS(KP923879532, TE, TB);
+			 TF = FMA(KP923879532, TE, TB);
+			 TX = FNMS(KP923879532, TU, TT);
+			 TV = FMA(KP923879532, TU, TT);
+			 TO = FMA(KP198912367, TG, TH);
+			 TI = FNMS(KP198912367, TH, TG);
+			 TL = FMA(KP198912367, TK, TJ);
+			 TP = FNMS(KP198912367, TJ, TK);
+			 TY = TL - TI;
+			 TM = TI + TL;
+		    }
+	       }
+	       Ci[WS(csi, 4)] = FMS(KP980785280, TY, TX);
+	       Ci[WS(csi, 3)] = FMA(KP980785280, TY, TX);
+	       Cr[0] = FMA(KP980785280, TM, TF);
+	       Cr[WS(csr, 7)] = FNMS(KP980785280, TM, TF);
+	       TQ = TO - TP;
+	       TW = TO + TP;
+	       Ci[0] = -(FMA(KP980785280, TW, TV));
+	       Ci[WS(csi, 7)] = FNMS(KP980785280, TW, TV);
+	       Cr[WS(csr, 3)] = FMA(KP980785280, TQ, TN);
+	       Cr[WS(csr, 4)] = FNMS(KP980785280, TQ, TN);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cfII_16", {18, 0, 48, 0}, &GENUS };
+
+void X(codelet_r2cfII_16) (planner *p) {
+     X(kr2c_register) (p, r2cfII_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 66 FP additions, 30 FP multiplications,
+ * (or, 54 additions, 18 multiplications, 12 fused multiply/add),
+ * 32 stack variables, 7 constants, and 32 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E T5, T11, TB, TV, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
+	       E TS;
+	       {
+		    E T1, TU, T4, TT, T2, T3;
+		    T1 = R0[0];
+		    TU = R0[WS(rs, 4)];
+		    T2 = R0[WS(rs, 2)];
+		    T3 = R0[WS(rs, 6)];
+		    T4 = KP707106781 * (T2 - T3);
+		    TT = KP707106781 * (T2 + T3);
+		    T5 = T1 + T4;
+		    T11 = TU - TT;
+		    TB = T1 - T4;
+		    TV = TT + TU;
+	       }
+	       {
+		    E Tq, Tt, Tp, Ts, Tn, To;
+		    Tq = R1[WS(rs, 7)];
+		    Tt = R1[WS(rs, 3)];
+		    Tn = R1[WS(rs, 1)];
+		    To = R1[WS(rs, 5)];
+		    Tp = KP707106781 * (Tn - To);
+		    Ts = KP707106781 * (Tn + To);
+		    Tr = Tp - Tq;
+		    TK = Tt - Ts;
+		    Tu = Ts + Tt;
+		    TJ = Tp + Tq;
+	       }
+	       {
+		    E Te, Tk, Th, Tj, Tf, Tg;
+		    Te = R1[0];
+		    Tk = R1[WS(rs, 4)];
+		    Tf = R1[WS(rs, 2)];
+		    Tg = R1[WS(rs, 6)];
+		    Th = KP707106781 * (Tf - Tg);
+		    Tj = KP707106781 * (Tf + Tg);
+		    Ti = Te + Th;
+		    TH = Tk - Tj;
+		    Tl = Tj + Tk;
+		    TG = Te - Th;
+	       }
+	       {
+		    E T8, TC, Tb, TD;
+		    {
+			 E T6, T7, T9, Ta;
+			 T6 = R0[WS(rs, 1)];
+			 T7 = R0[WS(rs, 5)];
+			 T8 = FNMS(KP382683432, T7, KP923879532 * T6);
+			 TC = FMA(KP382683432, T6, KP923879532 * T7);
+			 T9 = R0[WS(rs, 3)];
+			 Ta = R0[WS(rs, 7)];
+			 Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
+			 TD = FMA(KP923879532, T9, KP382683432 * Ta);
+		    }
+		    Tc = T8 + Tb;
+		    T10 = Tb - T8;
+		    TE = TC - TD;
+		    TS = TC + TD;
+	       }
+	       {
+		    E Td, TW, Tw, TR, Tm, Tv;
+		    Td = T5 - Tc;
+		    TW = TS + TV;
+		    Tm = FMA(KP195090322, Ti, KP980785280 * Tl);
+		    Tv = FNMS(KP980785280, Tu, KP195090322 * Tr);
+		    Tw = Tm + Tv;
+		    TR = Tv - Tm;
+		    Cr[WS(csr, 4)] = Td - Tw;
+		    Ci[WS(csi, 7)] = TR + TW;
+		    Cr[WS(csr, 3)] = Td + Tw;
+		    Ci[0] = TR - TW;
+	       }
+	       {
+		    E Tx, TY, TA, TX, Ty, Tz;
+		    Tx = T5 + Tc;
+		    TY = TV - TS;
+		    Ty = FNMS(KP195090322, Tl, KP980785280 * Ti);
+		    Tz = FMA(KP980785280, Tr, KP195090322 * Tu);
+		    TA = Ty + Tz;
+		    TX = Tz - Ty;
+		    Cr[WS(csr, 7)] = Tx - TA;
+		    Ci[WS(csi, 3)] = TX + TY;
+		    Cr[0] = Tx + TA;
+		    Ci[WS(csi, 4)] = TX - TY;
+	       }
+	       {
+		    E TF, T12, TM, TZ, TI, TL;
+		    TF = TB + TE;
+		    T12 = T10 - T11;
+		    TI = FMA(KP831469612, TG, KP555570233 * TH);
+		    TL = FMA(KP831469612, TJ, KP555570233 * TK);
+		    TM = TI - TL;
+		    TZ = TI + TL;
+		    Cr[WS(csr, 6)] = TF - TM;
+		    Ci[WS(csi, 2)] = T12 - TZ;
+		    Cr[WS(csr, 1)] = TF + TM;
+		    Ci[WS(csi, 5)] = -(TZ + T12);
+	       }
+	       {
+		    E TN, T14, TQ, T13, TO, TP;
+		    TN = TB - TE;
+		    T14 = T10 + T11;
+		    TO = FNMS(KP555570233, TJ, KP831469612 * TK);
+		    TP = FNMS(KP555570233, TG, KP831469612 * TH);
+		    TQ = TO - TP;
+		    T13 = TP + TO;
+		    Cr[WS(csr, 5)] = TN - TQ;
+		    Ci[WS(csi, 1)] = T13 + T14;
+		    Cr[WS(csr, 2)] = TN + TQ;
+		    Ci[WS(csi, 6)] = T13 - T14;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cfII_16", {54, 18, 12, 0}, &GENUS };
+
+void X(codelet_r2cfII_16) (planner *p) {
+     X(kr2c_register) (p, r2cfII_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 0 FP additions, 0 FP multiplications,
+ * (or, 0 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       Cr[0] = T1;
+	       Ci[0] = -T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cfII_2", {0, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cfII_2) (planner *p) {
+     X(kr2c_register) (p, r2cfII_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 0 FP additions, 0 FP multiplications,
+ * (or, 0 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       Cr[0] = T1;
+	       Ci[0] = -T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cfII_2", {0, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cfII_2) (planner *p) {
+     X(kr2c_register) (p, r2cfII_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:23 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 102 FP additions, 63 FP multiplications,
+ * (or, 39 additions, 0 multiplications, 63 fused multiply/add),
+ * 67 stack variables, 10 constants, and 40 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DK(KP552786404, +0.552786404500042060718165266253744752911876328);
+     DK(KP447213595, +0.447213595499957939281834733746255247088123672);
+     DK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP381966011, +0.381966011250105151795413165634361882279690820);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E Tv, TK, TN, Th, T1l, T1n, Ts, TH;
+	       {
+		    E Ti, T1d, T1f, T1e, T1g, T1p, TS, Tg, To, T8, T7, T19, T1r, T1k, Tx;
+		    E Tp, TX, Ty, TF, Tr, TV, Tz, TA, TI;
+		    {
+			 E Ta, Tb, Td, Te;
+			 Ti = R1[WS(rs, 2)];
+			 T1d = R0[WS(rs, 5)];
+			 Ta = R0[WS(rs, 9)];
+			 Tb = R0[WS(rs, 1)];
+			 Td = R0[WS(rs, 3)];
+			 Te = R0[WS(rs, 7)];
+			 {
+			      E T1, T2, T5, T3, T4, T1i, Tc, Tf;
+			      T1 = R0[0];
+			      T1f = Ta + Tb;
+			      Tc = Ta - Tb;
+			      T1e = Td + Te;
+			      Tf = Td - Te;
+			      T2 = R0[WS(rs, 4)];
+			      T5 = R0[WS(rs, 6)];
+			      T1g = FMA(KP381966011, T1f, T1e);
+			      T1p = FMA(KP381966011, T1e, T1f);
+			      TS = FMA(KP618033988, Tc, Tf);
+			      Tg = FNMS(KP618033988, Tf, Tc);
+			      T3 = R0[WS(rs, 8)];
+			      T4 = R0[WS(rs, 2)];
+			      T1i = T2 + T5;
+			      {
+				   E Tj, Tu, Tm, Tt, Tn, Tq, TU;
+				   Tj = R1[WS(rs, 8)];
+				   To = R1[WS(rs, 6)];
+				   {
+					E T6, T1j, Tk, Tl;
+					T6 = T2 + T3 - T4 - T5;
+					T8 = (T3 + T5 - T2) - T4;
+					T1j = T3 + T4;
+					Tk = R1[0];
+					Tl = R1[WS(rs, 4)];
+					T7 = FNMS(KP250000000, T6, T1);
+					T19 = T1 + T6;
+					T1r = FNMS(KP618033988, T1i, T1j);
+					T1k = FMA(KP618033988, T1j, T1i);
+					Tu = Tk - Tl;
+					Tm = Tk + Tl;
+				   }
+				   Tt = To + Tj;
+				   Tx = R1[WS(rs, 7)];
+				   Tn = Tj - Tm;
+				   Tp = Tj + Tm;
+				   Tv = FNMS(KP618033988, Tu, Tt);
+				   TX = FMA(KP618033988, Tt, Tu);
+				   Tq = FMA(KP809016994, Tp, To);
+				   TU = FMA(KP447213595, Tp, Tn);
+				   Ty = R1[WS(rs, 1)];
+				   TF = R1[WS(rs, 3)];
+				   Tr = FNMS(KP552786404, Tq, Tn);
+				   TV = FNMS(KP690983005, TU, To);
+				   Tz = R1[WS(rs, 5)];
+				   TA = R1[WS(rs, 9)];
+				   TI = TF + Ty;
+			      }
+			 }
+		    }
+		    {
+			 E T1w, TJ, TB, T1a;
+			 T1w = T1f + T1d - T1e;
+			 TJ = Tz - TA;
+			 TB = Tz + TA;
+			 T1a = Ti + To - Tp;
+			 {
+			      E T9, T12, TT, T15, TG, TD, T1s, T1u, TW, T11, T10, T1h;
+			      {
+				   E TE, TC, TR, T1b;
+				   T9 = FNMS(KP559016994, T8, T7);
+				   TR = FMA(KP559016994, T8, T7);
+				   TK = FMA(KP618033988, TJ, TI);
+				   T12 = FNMS(KP618033988, TI, TJ);
+				   TE = Ty - TB;
+				   TC = Ty + TB;
+				   TT = FMA(KP951056516, TS, TR);
+				   T15 = FNMS(KP951056516, TS, TR);
+				   TG = FNMS(KP552786404, TF, TE);
+				   T1b = TC - TF - Tx;
+				   {
+					E TZ, T1q, T1c, T1x;
+					TZ = FMA(KP447213595, TC, TE);
+					TD = FMA(KP250000000, TC, Tx);
+					T1q = FNMS(KP809016994, T1p, T1d);
+					T1c = T1a + T1b;
+					T1x = T1a - T1b;
+					T10 = FNMS(KP690983005, TZ, TF);
+					T1s = FNMS(KP951056516, T1r, T1q);
+					T1u = FMA(KP951056516, T1r, T1q);
+					Ci[WS(csi, 7)] = FMA(KP707106781, T1x, T1w);
+					Ci[WS(csi, 2)] = FMS(KP707106781, T1x, T1w);
+					Cr[WS(csr, 7)] = FMA(KP707106781, T1c, T19);
+					Cr[WS(csr, 2)] = FNMS(KP707106781, T1c, T19);
+				   }
+			      }
+			      TW = FNMS(KP809016994, TV, Ti);
+			      T11 = FNMS(KP809016994, T10, Tx);
+			      T1h = FMA(KP809016994, T1g, T1d);
+			      {
+				   E T17, TY, T16, T13;
+				   T17 = FNMS(KP951056516, TX, TW);
+				   TY = FMA(KP951056516, TX, TW);
+				   T16 = FMA(KP951056516, T12, T11);
+				   T13 = FNMS(KP951056516, T12, T11);
+				   TN = FMA(KP951056516, Tg, T9);
+				   Th = FNMS(KP951056516, Tg, T9);
+				   {
+					E T18, T1v, T1t, T14;
+					T18 = T16 - T17;
+					T1v = T17 + T16;
+					T1t = TY + T13;
+					T14 = TY - T13;
+					Cr[WS(csr, 1)] = FMA(KP707106781, T18, T15);
+					Cr[WS(csr, 8)] = FNMS(KP707106781, T18, T15);
+					Ci[WS(csi, 3)] = FMA(KP707106781, T1v, T1u);
+					Ci[WS(csi, 6)] = FMS(KP707106781, T1v, T1u);
+					Ci[WS(csi, 1)] = FNMS(KP707106781, T1t, T1s);
+					Ci[WS(csi, 8)] = -(FMA(KP707106781, T1t, T1s));
+					Cr[WS(csr, 3)] = FMA(KP707106781, T14, TT);
+					Cr[WS(csr, 6)] = FNMS(KP707106781, T14, TT);
+					T1l = FMA(KP951056516, T1k, T1h);
+					T1n = FNMS(KP951056516, T1k, T1h);
+				   }
+			      }
+			      Ts = FNMS(KP559016994, Tr, Ti);
+			      TH = FNMS(KP559016994, TG, TD);
+			 }
+		    }
+	       }
+	       {
+		    E TO, Tw, TP, TL;
+		    TO = FMA(KP951056516, Tv, Ts);
+		    Tw = FNMS(KP951056516, Tv, Ts);
+		    TP = FMA(KP951056516, TK, TH);
+		    TL = FNMS(KP951056516, TK, TH);
+		    {
+			 E TQ, T1m, T1o, TM;
+			 TQ = TO - TP;
+			 T1m = TO + TP;
+			 T1o = Tw + TL;
+			 TM = Tw - TL;
+			 Cr[WS(csr, 4)] = FMA(KP707106781, TQ, TN);
+			 Cr[WS(csr, 5)] = FNMS(KP707106781, TQ, TN);
+			 Ci[WS(csi, 9)] = FNMS(KP707106781, T1m, T1l);
+			 Ci[0] = -(FMA(KP707106781, T1m, T1l));
+			 Ci[WS(csi, 5)] = FNMS(KP707106781, T1o, T1n);
+			 Ci[WS(csi, 4)] = -(FMA(KP707106781, T1o, T1n));
+			 Cr[0] = FMA(KP707106781, TM, Th);
+			 Cr[WS(csr, 9)] = FNMS(KP707106781, TM, Th);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cfII_20", {39, 0, 63, 0}, &GENUS };
+
+void X(codelet_r2cfII_20) (planner *p) {
+     X(kr2c_register) (p, r2cfII_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 102 FP additions, 34 FP multiplications,
+ * (or, 86 additions, 18 multiplications, 16 fused multiply/add),
+ * 60 stack variables, 13 constants, and 40 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP572061402, +0.572061402817684297600072783580302076536153377);
+     DK(KP218508012, +0.218508012224410535399650602527877556893735408);
+     DK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP176776695, +0.176776695296636881100211090526212259821208984);
+     DK(KP395284707, +0.395284707521047416499861693054089816714944392);
+     DK(KP672498511, +0.672498511963957326960058968885748755876783111);
+     DK(KP415626937, +0.415626937777453428589967464113135184222253485);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E T8, TD, Tm, TN, T9, TC, TY, TE, Te, TF, Tl, TK, T12, TL, Tk;
+	       E TM, T1, T6, Tq, T1l, T1c, Tp, T1f, T1e, T1d, Ty, TW, T1g, T1m, Tx;
+	       E Tu;
+	       T8 = R1[WS(rs, 2)];
+	       TD = KP707106781 * T8;
+	       Tm = R1[WS(rs, 7)];
+	       TN = KP707106781 * Tm;
+	       {
+		    E Ta, TA, Td, TB, Tb, Tc;
+		    T9 = R1[WS(rs, 6)];
+		    Ta = R1[WS(rs, 8)];
+		    TA = T9 + Ta;
+		    Tb = R1[0];
+		    Tc = R1[WS(rs, 4)];
+		    Td = Tb + Tc;
+		    TB = Tb - Tc;
+		    TC = FMA(KP415626937, TA, KP672498511 * TB);
+		    TY = FNMS(KP415626937, TB, KP672498511 * TA);
+		    TE = KP395284707 * (Ta - Td);
+		    Te = Ta + Td;
+		    TF = KP176776695 * Te;
+	       }
+	       {
+		    E Tg, TJ, Tj, TI, Th, Ti;
+		    Tg = R1[WS(rs, 1)];
+		    Tl = R1[WS(rs, 3)];
+		    TJ = Tg + Tl;
+		    Th = R1[WS(rs, 5)];
+		    Ti = R1[WS(rs, 9)];
+		    Tj = Th + Ti;
+		    TI = Th - Ti;
+		    TK = FNMS(KP415626937, TJ, KP672498511 * TI);
+		    T12 = FMA(KP415626937, TI, KP672498511 * TJ);
+		    TL = KP395284707 * (Tg - Tj);
+		    Tk = Tg + Tj;
+		    TM = KP176776695 * Tk;
+	       }
+	       {
+		    E T2, T5, T3, T4, T1a, T1b;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 6)];
+		    T5 = R0[WS(rs, 8)];
+		    T3 = R0[WS(rs, 2)];
+		    T4 = R0[WS(rs, 4)];
+		    T1a = T4 + T2;
+		    T1b = T5 + T3;
+		    T6 = T2 + T3 - (T4 + T5);
+		    Tq = FMA(KP250000000, T6, T1);
+		    T1l = FNMS(KP951056516, T1b, KP587785252 * T1a);
+		    T1c = FMA(KP951056516, T1a, KP587785252 * T1b);
+		    Tp = KP559016994 * (T5 + T2 - (T4 + T3));
+	       }
+	       T1f = R0[WS(rs, 5)];
+	       {
+		    E Tv, Tw, Ts, Tt;
+		    Tv = R0[WS(rs, 9)];
+		    Tw = R0[WS(rs, 1)];
+		    Tx = Tv - Tw;
+		    T1e = Tv + Tw;
+		    Ts = R0[WS(rs, 3)];
+		    Tt = R0[WS(rs, 7)];
+		    Tu = Ts - Tt;
+		    T1d = Ts + Tt;
+	       }
+	       Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
+	       TW = FNMS(KP951056516, Tx, KP587785252 * Tu);
+	       T1g = FMA(KP809016994, T1d, KP309016994 * T1e) + T1f;
+	       T1m = FNMS(KP809016994, T1e, T1f) - (KP309016994 * T1d);
+	       {
+		    E T7, T1r, To, T1q, Tf, Tn;
+		    T7 = T1 - T6;
+		    T1r = T1e + T1f - T1d;
+		    Tf = T8 + (T9 - Te);
+		    Tn = (Tk - Tl) - Tm;
+		    To = KP707106781 * (Tf + Tn);
+		    T1q = KP707106781 * (Tf - Tn);
+		    Cr[WS(csr, 2)] = T7 - To;
+		    Ci[WS(csi, 2)] = T1q - T1r;
+		    Cr[WS(csr, 7)] = T7 + To;
+		    Ci[WS(csi, 7)] = T1q + T1r;
+	       }
+	       {
+		    E T1h, T1j, TX, T15, T10, T16, T13, T17, TV, TZ, T11;
+		    T1h = T1c - T1g;
+		    T1j = T1c + T1g;
+		    TV = Tq - Tp;
+		    TX = TV - TW;
+		    T15 = TV + TW;
+		    TZ = FMA(KP218508012, T9, TD) + TF - TE;
+		    T10 = TY + TZ;
+		    T16 = TZ - TY;
+		    T11 = FNMS(KP218508012, Tl, TL) - (TM + TN);
+		    T13 = T11 - T12;
+		    T17 = T11 + T12;
+		    {
+			 E T14, T19, T18, T1i;
+			 T14 = T10 + T13;
+			 Cr[WS(csr, 5)] = TX - T14;
+			 Cr[WS(csr, 4)] = TX + T14;
+			 T19 = T17 - T16;
+			 Ci[WS(csi, 5)] = T19 - T1h;
+			 Ci[WS(csi, 4)] = T19 + T1h;
+			 T18 = T16 + T17;
+			 Cr[WS(csr, 9)] = T15 - T18;
+			 Cr[0] = T15 + T18;
+			 T1i = T13 - T10;
+			 Ci[0] = T1i - T1j;
+			 Ci[WS(csi, 9)] = T1i + T1j;
+		    }
+	       }
+	       {
+		    E T1n, T1p, Tz, TR, TH, TS, TP, TT, Tr, TG, TO;
+		    T1n = T1l + T1m;
+		    T1p = T1m - T1l;
+		    Tr = Tp + Tq;
+		    Tz = Tr + Ty;
+		    TR = Tr - Ty;
+		    TG = TD + TE + FNMS(KP572061402, T9, TF);
+		    TH = TC + TG;
+		    TS = TC - TG;
+		    TO = TL + TM + FNMS(KP572061402, Tl, TN);
+		    TP = TK - TO;
+		    TT = TK + TO;
+		    {
+			 E TQ, T1o, TU, T1k;
+			 TQ = TH + TP;
+			 Cr[WS(csr, 6)] = Tz - TQ;
+			 Cr[WS(csr, 3)] = Tz + TQ;
+			 T1o = TT - TS;
+			 Ci[WS(csi, 6)] = T1o - T1p;
+			 Ci[WS(csi, 3)] = T1o + T1p;
+			 TU = TS + TT;
+			 Cr[WS(csr, 8)] = TR - TU;
+			 Cr[WS(csr, 1)] = TR + TU;
+			 T1k = TP - TH;
+			 Ci[WS(csi, 8)] = T1k - T1n;
+			 Ci[WS(csi, 1)] = T1k + T1n;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cfII_20", {86, 18, 16, 0}, &GENUS };
+
+void X(codelet_r2cfII_20) (planner *p) {
+     X(kr2c_register) (p, r2cfII_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:24 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 212 FP additions, 177 FP multiplications,
+ * (or, 47 additions, 12 multiplications, 165 fused multiply/add),
+ * 163 stack variables, 67 constants, and 50 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DK(KP690668130, +0.690668130712929053565177988380887884042527623);
+     DK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DK(KP584303379, +0.584303379262766050358567120694562180043261496);
+     DK(KP653711795, +0.653711795629256296299985401753308353544378892);
+     DK(KP591287873, +0.591287873858343558732323717242372865934480959);
+     DK(KP645989928, +0.645989928319777763844272876603899665178054552);
+     DK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP945422727, +0.945422727388575946270360266328811958657216298);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP999754674, +0.999754674276473633366203429228112409535557487);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP763583905, +0.763583905359130246362948588764067237776594106);
+     DK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DK(KP248028675, +0.248028675328619457762448260696444630363259177);
+     DK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E T2R, T2T, T2D, T2C, T2H, T2G, T2B, T2P, T2S;
+	       {
+		    E T2A, TJ, T1K, T3l, T2z, TB, T2d, T2l, T1N, T21, T15, T1g, T1s, T1D, T9;
+		    E T25, T1X, T2o, T2g, T1z, T1u, T1j, TQ, Ti, T1a, T2f, T2p, T1U, T24, TX;
+		    E T1k, T1v, T1A, T19, Ts, T18, T1P;
+		    {
+			 E Tt, Tw, TZ, Tx, Ty;
+			 {
+			      E T2v, TG, TH, TD, TE, TI, T2x;
+			      T2v = R0[0];
+			      TG = R0[WS(rs, 10)];
+			      TH = R1[WS(rs, 2)];
+			      TD = R0[WS(rs, 5)];
+			      TE = R1[WS(rs, 7)];
+			      Tt = R0[WS(rs, 2)];
+			      TI = TG + TH;
+			      T2x = TG - TH;
+			      {
+				   E TF, T2w, Tu, Tv, T2y;
+				   TF = TD + TE;
+				   T2w = TD - TE;
+				   Tu = R0[WS(rs, 7)];
+				   Tv = R1[WS(rs, 9)];
+				   T2A = T2w - T2x;
+				   T2y = T2w + T2x;
+				   TJ = FMA(KP618033988, TI, TF);
+				   T1K = FNMS(KP618033988, TF, TI);
+				   T3l = T2v + T2y;
+				   T2z = FNMS(KP250000000, T2y, T2v);
+				   Tw = Tu - Tv;
+				   TZ = Tu + Tv;
+				   Tx = R0[WS(rs, 12)];
+				   Ty = R1[WS(rs, 4)];
+			      }
+			 }
+			 {
+			      E TO, TN, TM, T1V;
+			      {
+				   E T1, T1M, T11, T13, T4, TK, T12, TL, T7, T5, TA, T6, T14, T1L, T8;
+				   T1 = R0[WS(rs, 1)];
+				   {
+					E T2, T10, Tz, T3;
+					T2 = R0[WS(rs, 6)];
+					T10 = Tx + Ty;
+					Tz = Tx - Ty;
+					T3 = R1[WS(rs, 8)];
+					T5 = R0[WS(rs, 11)];
+					T1M = FNMS(KP618033988, TZ, T10);
+					T11 = FMA(KP618033988, T10, TZ);
+					T13 = Tz - Tw;
+					TA = Tw + Tz;
+					T4 = T2 - T3;
+					TK = T2 + T3;
+					T6 = R1[WS(rs, 3)];
+				   }
+				   TB = Tt + TA;
+				   T12 = FNMS(KP250000000, TA, Tt);
+				   TL = T5 + T6;
+				   T7 = T5 - T6;
+				   T14 = FNMS(KP559016994, T13, T12);
+				   T1L = FMA(KP559016994, T13, T12);
+				   T8 = T4 + T7;
+				   TO = T4 - T7;
+				   T2d = FNMS(KP603558818, T1M, T1L);
+				   T2l = FMA(KP667278218, T1L, T1M);
+				   T1N = FMA(KP059835404, T1M, T1L);
+				   T21 = FNMS(KP066152395, T1L, T1M);
+				   T15 = FMA(KP578046249, T14, T11);
+				   T1g = FNMS(KP522847744, T11, T14);
+				   T1s = FMA(KP447533225, T11, T14);
+				   T1D = FNMS(KP494780565, T14, T11);
+				   TN = FNMS(KP250000000, T8, T1);
+				   T9 = T1 + T8;
+				   TM = FMA(KP618033988, TL, TK);
+				   T1V = FNMS(KP618033988, TK, TL);
+			      }
+			      {
+				   E Th, Td, TU, Tc, Te;
+				   Th = R0[WS(rs, 4)];
+				   {
+					E Ta, Tb, T1W, TP;
+					Ta = R0[WS(rs, 9)];
+					Tb = R1[WS(rs, 11)];
+					T1W = FNMS(KP559016994, TO, TN);
+					TP = FMA(KP559016994, TO, TN);
+					Td = R1[WS(rs, 6)];
+					TU = Ta + Tb;
+					Tc = Ta - Tb;
+					T25 = FNMS(KP893101515, T1V, T1W);
+					T1X = FMA(KP987388751, T1W, T1V);
+					T2o = FMA(KP522847744, T1V, T1W);
+					T2g = FNMS(KP578046249, T1W, T1V);
+					T1z = FMA(KP667278218, TP, TM);
+					T1u = FNMS(KP603558818, TM, TP);
+					T1j = FNMS(KP244189809, TM, TP);
+					TQ = FMA(KP269969613, TP, TM);
+					Te = R1[WS(rs, 1)];
+				   }
+				   {
+					E Tk, T1S, TW, TS, Tn, T16, TR, T17, Tq, To, Tg, Tp, TT, T1T, Tr;
+					Tk = R0[WS(rs, 3)];
+					{
+					     E Tl, TV, Tf, Tm;
+					     Tl = R0[WS(rs, 8)];
+					     TV = Te - Td;
+					     Tf = Td + Te;
+					     Tm = R1[WS(rs, 10)];
+					     To = R1[0];
+					     T1S = FMA(KP618033988, TU, TV);
+					     TW = FNMS(KP618033988, TV, TU);
+					     TS = Tc + Tf;
+					     Tg = Tc - Tf;
+					     Tn = Tl - Tm;
+					     T16 = Tl + Tm;
+					     Tp = R1[WS(rs, 5)];
+					}
+					Ti = Tg + Th;
+					TR = FNMS(KP250000000, Tg, Th);
+					T17 = Tp - To;
+					Tq = To + Tp;
+					TT = FMA(KP559016994, TS, TR);
+					T1T = FNMS(KP559016994, TS, TR);
+					Tr = Tn - Tq;
+					T1a = Tn + Tq;
+					T2f = FNMS(KP447533225, T1S, T1T);
+					T2p = FMA(KP494780565, T1T, T1S);
+					T1U = FMA(KP132830569, T1T, T1S);
+					T24 = FNMS(KP120146378, T1S, T1T);
+					TX = FMA(KP603558818, TW, TT);
+					T1k = FNMS(KP667278218, TT, TW);
+					T1v = FNMS(KP786782374, TW, TT);
+					T1A = FMA(KP869845200, TT, TW);
+					T19 = FNMS(KP250000000, Tr, Tk);
+					Ts = Tk + Tr;
+					T18 = FMA(KP618033988, T17, T16);
+					T1P = FNMS(KP618033988, T16, T17);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T22, T1Q, T1h, T1c, T2O, T2N, T2m, T3a, T3b, T2q, T1y, T3f, T2e, T2h, T3e;
+			 E T1H, T1J;
+			 {
+			      E T3m, T3n, T2k, T2c, T1C, T1r;
+			      {
+				   E Tj, TC, T1O, T1b;
+				   T3m = T9 + Ti;
+				   Tj = T9 - Ti;
+				   TC = Ts - TB;
+				   T3n = TB + Ts;
+				   T1O = FNMS(KP559016994, T1a, T19);
+				   T1b = FMA(KP559016994, T1a, T19);
+				   Ci[WS(csi, 7)] = KP951056516 * (FMA(KP618033988, Tj, TC));
+				   Ci[WS(csi, 2)] = -(KP951056516 * (FNMS(KP618033988, TC, Tj)));
+				   T22 = FMA(KP869845200, T1O, T1P);
+				   T1Q = FNMS(KP786782374, T1P, T1O);
+				   T2k = FMA(KP066152395, T1O, T1P);
+				   T2c = FNMS(KP059835404, T1P, T1O);
+				   T1C = FNMS(KP120146378, T18, T1b);
+				   T1r = FMA(KP132830569, T1b, T18);
+				   T1h = FNMS(KP893101515, T18, T1b);
+				   T1c = FMA(KP987388751, T1b, T18);
+			      }
+			      {
+				   E T1B, T1E, T1t, T3o, T3q, T1w, T3p;
+				   T1B = FMA(KP912575812, T1A, T1z);
+				   T2O = FNMS(KP912575812, T1A, T1z);
+				   T2N = FNMS(KP867381224, T1D, T1C);
+				   T1E = FMA(KP867381224, T1D, T1C);
+				   T1t = FMA(KP958953096, T1s, T1r);
+				   T2R = FNMS(KP958953096, T1s, T1r);
+				   T3o = T3m + T3n;
+				   T3q = T3m - T3n;
+				   T2T = FMA(KP912575812, T1v, T1u);
+				   T1w = FNMS(KP912575812, T1v, T1u);
+				   T2m = FNMS(KP845997307, T2l, T2k);
+				   T3a = FMA(KP845997307, T2l, T2k);
+				   T3b = FNMS(KP982009705, T2p, T2o);
+				   T2q = FMA(KP982009705, T2p, T2o);
+				   T3p = FNMS(KP250000000, T3o, T3l);
+				   Cr[WS(csr, 12)] = T3o + T3l;
+				   {
+					E T1x, T1F, T1G, T1I;
+					T1x = FMA(KP894834959, T1w, T1t);
+					T1F = FNMS(KP894834959, T1w, T1t);
+					Cr[WS(csr, 7)] = FNMS(KP559016994, T3q, T3p);
+					Cr[WS(csr, 2)] = FMA(KP559016994, T3q, T3p);
+					T1y = FMA(KP248028675, T1x, TJ);
+					T1G = FNMS(KP904508497, T1F, T1E);
+					T1I = FNMS(KP894834959, T1B, T1F);
+					T3f = FNMS(KP845997307, T2d, T2c);
+					T2e = FMA(KP845997307, T2d, T2c);
+					T2h = FNMS(KP921078979, T2g, T2f);
+					T3e = FMA(KP921078979, T2g, T2f);
+					T1H = FMA(KP763583905, T1G, T1B);
+					T1J = FMA(KP559016994, T1I, T1E);
+				   }
+			      }
+			 }
+			 {
+			      E T1i, T1l, T23, T30, T2Z, T26, T1R, T33, T1f, T1n, T1p, T34, T1Y, T3d, T3k;
+			      E T3i;
+			      {
+				   E T2j, TY, T2s, T2u, T1d, T1m, T1e;
+				   T2D = FMA(KP831864738, T1h, T1g);
+				   T1i = FNMS(KP831864738, T1h, T1g);
+				   {
+					E T2i, T2n, T2r, T2t;
+					T2i = FMA(KP906616052, T2h, T2e);
+					T2n = FNMS(KP906616052, T2h, T2e);
+					Ci[WS(csi, 4)] = KP951056516 * (FNMS(KP803003575, T1H, T1y));
+					Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP992114701, T1J, T1y));
+					T2j = FMA(KP262346850, T2i, T1K);
+					T2r = FNMS(KP923225144, T2q, T2n);
+					T2t = T2m + T2n;
+					T2C = FNMS(KP829049696, T1k, T1j);
+					T1l = FMA(KP829049696, T1k, T1j);
+					TY = FMA(KP916574801, TX, TQ);
+					T2H = FNMS(KP916574801, TX, TQ);
+					T2s = FNMS(KP618033988, T2r, T2m);
+					T2u = FNMS(KP669429328, T2t, T2q);
+					T2G = FNMS(KP831864738, T1c, T15);
+					T1d = FMA(KP831864738, T1c, T15);
+				   }
+				   T23 = FNMS(KP772036680, T22, T21);
+				   T30 = FMA(KP772036680, T22, T21);
+				   Ci[WS(csi, 8)] = KP951056516 * (FMA(KP949179823, T2s, T2j));
+				   Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP876306680, T2u, T2j));
+				   T1m = FNMS(KP904730450, T1d, TY);
+				   T1e = FMA(KP904730450, T1d, TY);
+				   T2Z = FNMS(KP734762448, T25, T24);
+				   T26 = FMA(KP734762448, T25, T24);
+				   T1R = FMA(KP772036680, T1Q, T1N);
+				   T33 = FNMS(KP772036680, T1Q, T1N);
+				   T1f = FNMS(KP242145790, T1e, TJ);
+				   Ci[0] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
+				   T1n = FNMS(KP904508497, T1m, T1l);
+				   T1p = FNMS(KP999754674, T1m, T1i);
+				   T34 = FNMS(KP734762448, T1X, T1U);
+				   T1Y = FMA(KP734762448, T1X, T1U);
+			      }
+			      {
+				   E T2Y, T31, T38, T36, T3c, T3g;
+				   {
+					E T20, T28, T2a, T29, T2b, T35;
+					T2Y = FNMS(KP559016994, T2A, T2z);
+					T2B = FMA(KP559016994, T2A, T2z);
+					{
+					     E T1o, T1q, T27, T1Z;
+					     T1o = FNMS(KP683113946, T1n, T1i);
+					     T1q = FMA(KP559154169, T1p, T1l);
+					     T27 = FNMS(KP945422727, T1Y, T1R);
+					     T1Z = FMA(KP945422727, T1Y, T1R);
+					     Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP876306680, T1o, T1f)));
+					     Ci[WS(csi, 10)] = -(KP951056516 * (FNMS(KP968583161, T1q, T1f)));
+					     T20 = FNMS(KP262346850, T1Z, T1K);
+					     Ci[WS(csi, 1)] = -(KP998026728 * (FMA(KP952936919, T1K, T1Z)));
+					     T28 = FMA(KP956723877, T27, T26);
+					     T2a = T27 - T23;
+					}
+					T29 = FMA(KP645989928, T28, T23);
+					T2b = FMA(KP591287873, T2a, T26);
+					Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP949179823, T29, T20)));
+					Ci[WS(csi, 11)] = -(KP951056516 * (FNMS(KP992114701, T2b, T20)));
+					T31 = FMA(KP956723877, T30, T2Z);
+					T35 = FNMS(KP956723877, T30, T2Z);
+					T38 = FMA(KP618033988, T35, T34);
+					T36 = T34 + T35;
+				   }
+				   Cr[WS(csr, 1)] = FNMS(KP992114701, T31, T2Y);
+				   T3c = FMA(KP923225144, T3b, T3a);
+				   T3g = FNMS(KP923225144, T3b, T3a);
+				   {
+					E T32, T37, T3h, T3j, T39;
+					T32 = FMA(KP248028675, T31, T2Y);
+					T39 = FNMS(KP653711795, T33, T38);
+					T37 = FMA(KP584303379, T36, T33);
+					T3h = FNMS(KP904508497, T3g, T3f);
+					T3j = FNMS(KP997675361, T3g, T3e);
+					Cr[WS(csr, 11)] = FNMS(KP897376177, T39, T32);
+					Cr[WS(csr, 6)] = FMA(KP949179823, T37, T32);
+					T3d = FNMS(KP237294955, T3c, T2Y);
+					T3k = FNMS(KP560319534, T3j, T3f);
+					T3i = FMA(KP681693190, T3h, T3e);
+				   }
+			      }
+			      Cr[WS(csr, 8)] = FMA(KP949179823, T3k, T3d);
+			      Cr[WS(csr, 3)] = FMA(KP860541664, T3i, T3d);
+			      T2P = FNMS(KP809385824, T2O, T2N);
+			      T2S = FMA(KP809385824, T2O, T2N);
+			 }
+		    }
+	       }
+	       {
+		    E T2F, T2K, T2M, T2Q;
+		    T2Q = FMA(KP248028675, T2P, T2B);
+		    {
+			 E T2U, T2W, T2E, T2I;
+			 T2U = FNMS(KP894834959, T2T, T2S);
+			 T2W = T2R + T2S;
+			 T2E = FMA(KP904730450, T2D, T2C);
+			 T2I = FNMS(KP904730450, T2D, T2C);
+			 {
+			      E T2V, T2X, T2J, T2L;
+			      T2V = FNMS(KP618033988, T2U, T2R);
+			      T2X = FNMS(KP690668130, T2W, T2T);
+			      T2F = FNMS(KP242145790, T2E, T2B);
+			      Cr[0] = FMA(KP968583161, T2E, T2B);
+			      T2J = T2H + T2I;
+			      T2L = FMA(KP904730450, T2G, T2I);
+			      Cr[WS(csr, 9)] = FMA(KP897376177, T2V, T2Q);
+			      Cr[WS(csr, 4)] = FNMS(KP803003575, T2X, T2Q);
+			      T2K = FNMS(KP683113946, T2J, T2G);
+			      T2M = FMA(KP618033988, T2L, T2H);
+			 }
+		    }
+		    Cr[WS(csr, 5)] = FMA(KP792626838, T2K, T2F);
+		    Cr[WS(csr, 10)] = FMA(KP876091699, T2M, T2F);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cfII_25", {47, 12, 165, 0}, &GENUS };
+
+void X(codelet_r2cfII_25) (planner *p) {
+     X(kr2c_register) (p, r2cfII_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 213 FP additions, 148 FP multiplications,
+ * (or, 126 additions, 61 multiplications, 87 fused multiply/add),
+ * 94 stack variables, 38 constants, and 50 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E TE, TR, T2i, T1z, TL, TS, TB, T2d, T1l, T1i, T2c, T9, T23, TZ, TW;
+	       E T22, Ti, T26, T16, T13, T25, Ts, T2a, T1e, T1b, T29, TP, TQ;
+	       {
+		    E TK, T1y, TH, T1x;
+		    TE = R0[0];
+		    {
+			 E TI, TJ, TF, TG;
+			 TI = R0[WS(rs, 10)];
+			 TJ = R1[WS(rs, 2)];
+			 TK = TI - TJ;
+			 T1y = TI + TJ;
+			 TF = R0[WS(rs, 5)];
+			 TG = R1[WS(rs, 7)];
+			 TH = TF - TG;
+			 T1x = TF + TG;
+		    }
+		    TR = KP559016994 * (TH - TK);
+		    T2i = FNMS(KP587785252, T1x, KP951056516 * T1y);
+		    T1z = FMA(KP951056516, T1x, KP587785252 * T1y);
+		    TL = TH + TK;
+		    TS = FNMS(KP250000000, TL, TE);
+	       }
+	       {
+		    E Tt, Tw, Tz, TA, T1k, T1j, T1g, T1h;
+		    Tt = R0[WS(rs, 3)];
+		    {
+			 E Tu, Tv, Tx, Ty;
+			 Tu = R0[WS(rs, 8)];
+			 Tv = R1[WS(rs, 10)];
+			 Tw = Tu - Tv;
+			 Tx = R1[0];
+			 Ty = R1[WS(rs, 5)];
+			 Tz = Tx + Ty;
+			 TA = Tw - Tz;
+			 T1k = Ty - Tx;
+			 T1j = Tu + Tv;
+		    }
+		    TB = Tt + TA;
+		    T2d = FNMS(KP293892626, T1j, KP475528258 * T1k);
+		    T1l = FMA(KP475528258, T1j, KP293892626 * T1k);
+		    T1g = FNMS(KP250000000, TA, Tt);
+		    T1h = KP559016994 * (Tw + Tz);
+		    T1i = T1g + T1h;
+		    T2c = T1g - T1h;
+	       }
+	       {
+		    E T1, T4, T7, T8, TY, TX, TU, TV;
+		    T1 = R0[WS(rs, 1)];
+		    {
+			 E T2, T3, T5, T6;
+			 T2 = R0[WS(rs, 6)];
+			 T3 = R1[WS(rs, 8)];
+			 T4 = T2 - T3;
+			 T5 = R0[WS(rs, 11)];
+			 T6 = R1[WS(rs, 3)];
+			 T7 = T5 - T6;
+			 T8 = T4 + T7;
+			 TY = T5 + T6;
+			 TX = T2 + T3;
+		    }
+		    T9 = T1 + T8;
+		    T23 = FNMS(KP293892626, TX, KP475528258 * TY);
+		    TZ = FMA(KP475528258, TX, KP293892626 * TY);
+		    TU = KP559016994 * (T4 - T7);
+		    TV = FNMS(KP250000000, T8, T1);
+		    TW = TU + TV;
+		    T22 = TV - TU;
+	       }
+	       {
+		    E Ta, Td, Tg, Th, T15, T14, T11, T12;
+		    Ta = R0[WS(rs, 4)];
+		    {
+			 E Tb, Tc, Te, Tf;
+			 Tb = R0[WS(rs, 9)];
+			 Tc = R1[WS(rs, 11)];
+			 Td = Tb - Tc;
+			 Te = R1[WS(rs, 1)];
+			 Tf = R1[WS(rs, 6)];
+			 Tg = Te + Tf;
+			 Th = Td - Tg;
+			 T15 = Tf - Te;
+			 T14 = Tb + Tc;
+		    }
+		    Ti = Ta + Th;
+		    T26 = FNMS(KP293892626, T14, KP475528258 * T15);
+		    T16 = FMA(KP475528258, T14, KP293892626 * T15);
+		    T11 = FNMS(KP250000000, Th, Ta);
+		    T12 = KP559016994 * (Td + Tg);
+		    T13 = T11 + T12;
+		    T25 = T11 - T12;
+	       }
+	       {
+		    E Tk, Tn, Tq, Tr, T1d, T1c, T19, T1a;
+		    Tk = R0[WS(rs, 2)];
+		    {
+			 E Tl, Tm, To, Tp;
+			 Tl = R0[WS(rs, 7)];
+			 Tm = R1[WS(rs, 9)];
+			 Tn = Tl - Tm;
+			 To = R0[WS(rs, 12)];
+			 Tp = R1[WS(rs, 4)];
+			 Tq = To - Tp;
+			 Tr = Tn + Tq;
+			 T1d = To + Tp;
+			 T1c = Tl + Tm;
+		    }
+		    Ts = Tk + Tr;
+		    T2a = FNMS(KP293892626, T1c, KP475528258 * T1d);
+		    T1e = FMA(KP475528258, T1c, KP293892626 * T1d);
+		    T19 = KP559016994 * (Tn - Tq);
+		    T1a = FNMS(KP250000000, Tr, Tk);
+		    T1b = T19 + T1a;
+		    T29 = T1a - T19;
+	       }
+	       TP = TB - Ts;
+	       TQ = T9 - Ti;
+	       Ci[WS(csi, 2)] = FNMS(KP951056516, TQ, KP587785252 * TP);
+	       Ci[WS(csi, 7)] = FMA(KP587785252, TQ, KP951056516 * TP);
+	       {
+		    E TM, TD, TN, Tj, TC, TO;
+		    TM = TE + TL;
+		    Tj = T9 + Ti;
+		    TC = Ts + TB;
+		    TD = KP559016994 * (Tj - TC);
+		    TN = Tj + TC;
+		    Cr[WS(csr, 12)] = TM + TN;
+		    TO = FNMS(KP250000000, TN, TM);
+		    Cr[WS(csr, 2)] = TD + TO;
+		    Cr[WS(csr, 7)] = TO - TD;
+	       }
+	       {
+		    E TT, T1J, T1Y, T1U, T1X, T1P, T1V, T1M, T1W, T1A, T1B, T1r, T1C, T1v, T18;
+		    E T1n, T1o, T1G, T1D;
+		    TT = TR + TS;
+		    {
+			 E T1H, T1I, T1S, T1T;
+			 T1H = FNMS(KP844327925, TW, KP1_071653589 * TZ);
+			 T1I = FNMS(KP1_274847979, T16, KP770513242 * T13);
+			 T1J = T1H - T1I;
+			 T1Y = T1H + T1I;
+			 T1S = FMA(KP125333233, T1i, KP1_984229402 * T1l);
+			 T1T = FMA(KP904827052, T1b, KP851558583 * T1e);
+			 T1U = T1S - T1T;
+			 T1X = T1T + T1S;
+		    }
+		    {
+			 E T1N, T1O, T1K, T1L;
+			 T1N = FMA(KP535826794, TW, KP1_688655851 * TZ);
+			 T1O = FMA(KP637423989, T13, KP1_541026485 * T16);
+			 T1P = T1N - T1O;
+			 T1V = T1N + T1O;
+			 T1K = FNMS(KP1_809654104, T1e, KP425779291 * T1b);
+			 T1L = FNMS(KP992114701, T1i, KP250666467 * T1l);
+			 T1M = T1K - T1L;
+			 T1W = T1K + T1L;
+		    }
+		    {
+			 E T1p, T1q, T1t, T1u;
+			 T1p = FMA(KP844327925, T13, KP1_071653589 * T16);
+			 T1q = FMA(KP248689887, TW, KP1_937166322 * TZ);
+			 T1A = T1q + T1p;
+			 T1t = FMA(KP481753674, T1b, KP1_752613360 * T1e);
+			 T1u = FMA(KP684547105, T1i, KP1_457937254 * T1l);
+			 T1B = T1t + T1u;
+			 T1r = T1p - T1q;
+			 T1C = T1A + T1B;
+			 T1v = T1t - T1u;
+		    }
+		    {
+			 E T10, T17, T1f, T1m;
+			 T10 = FNMS(KP497379774, TZ, KP968583161 * TW);
+			 T17 = FNMS(KP1_688655851, T16, KP535826794 * T13);
+			 T18 = T10 + T17;
+			 T1f = FNMS(KP963507348, T1e, KP876306680 * T1b);
+			 T1m = FNMS(KP1_369094211, T1l, KP728968627 * T1i);
+			 T1n = T1f + T1m;
+			 T1o = T18 + T1n;
+			 T1G = T10 - T17;
+			 T1D = T1f - T1m;
+		    }
+		    {
+			 E T1R, T1Q, T20, T1Z;
+			 Cr[0] = TT + T1o;
+			 Ci[0] = -(T1z + T1C);
+			 T1R = KP559016994 * (T1P + T1M);
+			 T1Q = FMA(KP250000000, T1M - T1P, TT);
+			 Cr[WS(csr, 4)] = FMA(KP951056516, T1J, T1Q) + FMA(KP587785252, T1U, T1R);
+			 Cr[WS(csr, 9)] = FMA(KP951056516, T1U, T1Q) + FNMA(KP587785252, T1J, T1R);
+			 T20 = KP559016994 * (T1Y + T1X);
+			 T1Z = FMA(KP250000000, T1X - T1Y, T1z);
+			 Ci[WS(csi, 9)] = FMA(KP587785252, T1V, KP951056516 * T1W) + T1Z - T20;
+			 Ci[WS(csi, 4)] = FMA(KP587785252, T1W, T1Z) + FNMS(KP951056516, T1V, T20);
+			 {
+			      E T1E, T1F, T1s, T1w;
+			      T1E = FMS(KP250000000, T1C, T1z);
+			      T1F = KP559016994 * (T1B - T1A);
+			      Ci[WS(csi, 5)] = FMA(KP951056516, T1D, T1E) + FNMA(KP587785252, T1G, T1F);
+			      Ci[WS(csi, 10)] = FMA(KP951056516, T1G, KP587785252 * T1D) + T1E + T1F;
+			      T1s = FNMS(KP250000000, T1o, TT);
+			      T1w = KP559016994 * (T18 - T1n);
+			      Cr[WS(csr, 5)] = FMA(KP587785252, T1r, T1s) + FMS(KP951056516, T1v, T1w);
+			      Cr[WS(csr, 10)] = T1w + FMA(KP587785252, T1v, T1s) - (KP951056516 * T1r);
+			 }
+		    }
+	       }
+	       {
+		    E T21, T2z, T2L, T2K, T2M, T2F, T2P, T2C, T2Q, T2l, T2o, T2p, T2w, T2u, T28;
+		    E T2f, T2g, T2s, T2h;
+		    T21 = TS - TR;
+		    {
+			 E T2x, T2y, T2I, T2J;
+			 T2x = FNMS(KP844327925, T29, KP1_071653589 * T2a);
+			 T2y = FNMS(KP125581039, T2d, KP998026728 * T2c);
+			 T2z = T2x + T2y;
+			 T2L = T2y - T2x;
+			 T2I = FNMS(KP481753674, T22, KP1_752613360 * T23);
+			 T2J = FMA(KP904827052, T25, KP851558583 * T26);
+			 T2K = T2I + T2J;
+			 T2M = T2I - T2J;
+		    }
+		    {
+			 E T2D, T2E, T2A, T2B;
+			 T2D = FMA(KP535826794, T29, KP1_688655851 * T2a);
+			 T2E = FMA(KP062790519, T2c, KP1_996053456 * T2d);
+			 T2F = T2D + T2E;
+			 T2P = T2E - T2D;
+			 T2A = FMA(KP876306680, T22, KP963507348 * T23);
+			 T2B = FNMS(KP425779291, T25, KP1_809654104 * T26);
+			 T2C = T2A + T2B;
+			 T2Q = T2A - T2B;
+		    }
+		    {
+			 E T2j, T2k, T2m, T2n;
+			 T2j = FNMS(KP125333233, T25, KP1_984229402 * T26);
+			 T2k = FMA(KP684547105, T22, KP1_457937254 * T23);
+			 T2l = T2j - T2k;
+			 T2m = FNMS(KP770513242, T2c, KP1_274847979 * T2d);
+			 T2n = FMA(KP998026728, T29, KP125581039 * T2a);
+			 T2o = T2m - T2n;
+			 T2p = T2l + T2o;
+			 T2w = T2k + T2j;
+			 T2u = T2n + T2m;
+		    }
+		    {
+			 E T24, T27, T2b, T2e;
+			 T24 = FNMS(KP1_369094211, T23, KP728968627 * T22);
+			 T27 = FMA(KP992114701, T25, KP250666467 * T26);
+			 T28 = T24 - T27;
+			 T2b = FNMS(KP1_996053456, T2a, KP062790519 * T29);
+			 T2e = FMA(KP637423989, T2c, KP1_541026485 * T2d);
+			 T2f = T2b - T2e;
+			 T2g = T28 + T2f;
+			 T2s = T24 + T27;
+			 T2h = T2b + T2e;
+		    }
+		    {
+			 E T2H, T2G, T2O, T2N;
+			 Cr[WS(csr, 1)] = T21 + T2g;
+			 Ci[WS(csi, 1)] = T2p - T2i;
+			 T2H = KP559016994 * (T2C - T2F);
+			 T2G = FNMS(KP250000000, T2C + T2F, T21);
+			 Cr[WS(csr, 8)] = FMA(KP951056516, T2z, T2G) + FNMA(KP587785252, T2K, T2H);
+			 Cr[WS(csr, 3)] = FMA(KP951056516, T2K, KP587785252 * T2z) + T2G + T2H;
+			 T2O = KP559016994 * (T2M + T2L);
+			 T2N = FMA(KP250000000, T2L - T2M, T2i);
+			 Ci[WS(csi, 3)] = T2N + FMA(KP587785252, T2P, T2O) - (KP951056516 * T2Q);
+			 Ci[WS(csi, 8)] = FMA(KP587785252, T2Q, T2N) + FMS(KP951056516, T2P, T2O);
+			 {
+			      E T2t, T2v, T2q, T2r;
+			      T2t = FNMS(KP250000000, T2g, T21);
+			      T2v = KP559016994 * (T28 - T2f);
+			      Cr[WS(csr, 6)] = FMA(KP951056516, T2u, T2t) + FNMA(KP587785252, T2w, T2v);
+			      Cr[WS(csr, 11)] = FMA(KP951056516, T2w, T2v) + FMA(KP587785252, T2u, T2t);
+			      T2q = KP250000000 * T2p;
+			      T2r = KP559016994 * (T2l - T2o);
+			      Ci[WS(csi, 6)] = FMS(KP951056516, T2h, T2i + T2q) + FNMA(KP587785252, T2s, T2r);
+			      Ci[WS(csi, 11)] = FMA(KP951056516, T2s, KP587785252 * T2h) + T2r - (T2i + T2q);
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cfII_25", {126, 61, 87, 0}, &GENUS };
+
+void X(codelet_r2cfII_25) (planner *p) {
+     X(kr2c_register) (p, r2cfII_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T3, T1, T2, T4;
+	       T3 = R0[0];
+	       T1 = R1[0];
+	       T2 = R0[WS(rs, 1)];
+	       Ci[0] = -(KP866025403 * (T1 + T2));
+	       T4 = T2 - T1;
+	       Cr[WS(csr, 1)] = T3 + T4;
+	       Cr[0] = FNMS(KP500000000, T4, T3);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cfII_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cfII_3) (planner *p) {
+     X(kr2c_register) (p, r2cfII_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T1, T2, T3, T4;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       T3 = R0[WS(rs, 1)];
+	       T4 = T2 - T3;
+	       Cr[WS(csr, 1)] = T1 - T4;
+	       Ci[0] = -(KP866025403 * (T2 + T3));
+	       Cr[0] = FMA(KP500000000, T4, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cfII_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cfII_3) (planner *p) {
+     X(kr2c_register) (p, r2cfII_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:16 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 174 FP additions, 128 FP multiplications,
+ * (or, 46 additions, 0 multiplications, 128 fused multiply/add),
+ * 96 stack variables, 15 constants, and 64 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T23, T1S, T21, T1L, T2z, T2x, T1Z, T22;
+	       {
+		    E T2n, T2B, T1z, T5, T1C, T2C, T2o, Tc, T27, T1J, T1l, Tm, T26, T1G, T1k;
+		    E Tv, T1s, T1c, T2e, T1Y, T1r, T15, T2d, T1V, TP, TF, T1M, TC, T1P, TN;
+		    E TO, TI;
+		    {
+			 E T1A, T8, Te, Tj, Tf, T1B, Tb, Tg;
+			 {
+			      E T1, T2l, T2, T3, T9, Ta;
+			      T1 = R0[0];
+			      T2l = R0[WS(rs, 8)];
+			      T2 = R0[WS(rs, 4)];
+			      T3 = R0[WS(rs, 12)];
+			      {
+				   E T6, T7, T2m, T4;
+				   T6 = R0[WS(rs, 10)];
+				   T7 = R0[WS(rs, 2)];
+				   T9 = R0[WS(rs, 6)];
+				   T2m = T2 + T3;
+				   T4 = T2 - T3;
+				   T1A = FNMS(KP414213562, T6, T7);
+				   T8 = FMA(KP414213562, T7, T6);
+				   T2n = FMA(KP707106781, T2m, T2l);
+				   T2B = FNMS(KP707106781, T2m, T2l);
+				   T1z = FMA(KP707106781, T4, T1);
+				   T5 = FNMS(KP707106781, T4, T1);
+				   Ta = R0[WS(rs, 14)];
+			      }
+			      Te = R0[WS(rs, 7)];
+			      Tj = R0[WS(rs, 15)];
+			      Tf = R0[WS(rs, 3)];
+			      T1B = FMS(KP414213562, T9, Ta);
+			      Tb = FMA(KP414213562, Ta, T9);
+			      Tg = R0[WS(rs, 11)];
+			 }
+			 {
+			      E Tn, Ts, To, T1I, Tl, T1H, Ti, Tp, Tk, Th, T1T, T1U;
+			      Tn = R0[WS(rs, 9)];
+			      T1C = T1A + T1B;
+			      T2C = T1B - T1A;
+			      T2o = T8 + Tb;
+			      Tc = T8 - Tb;
+			      Tk = Tg - Tf;
+			      Th = Tf + Tg;
+			      Ts = R0[WS(rs, 1)];
+			      To = R0[WS(rs, 5)];
+			      T1I = FMA(KP707106781, Tk, Tj);
+			      Tl = FNMS(KP707106781, Tk, Tj);
+			      T1H = FMA(KP707106781, Th, Te);
+			      Ti = FNMS(KP707106781, Th, Te);
+			      Tp = R0[WS(rs, 13)];
+			      {
+				   E TT, T16, TY, T17, TW, TZ, T11, T12, Tt, Tq;
+				   TT = R1[WS(rs, 15)];
+				   T27 = FNMS(KP198912367, T1H, T1I);
+				   T1J = FMA(KP198912367, T1I, T1H);
+				   T1l = FMA(KP668178637, Ti, Tl);
+				   Tm = FNMS(KP668178637, Tl, Ti);
+				   Tt = To - Tp;
+				   Tq = To + Tp;
+				   T16 = R1[WS(rs, 7)];
+				   {
+					E TU, T1F, Tu, T1E, Tr, TV;
+					TU = R1[WS(rs, 3)];
+					T1F = FMA(KP707106781, Tt, Ts);
+					Tu = FNMS(KP707106781, Tt, Ts);
+					T1E = FMA(KP707106781, Tq, Tn);
+					Tr = FNMS(KP707106781, Tq, Tn);
+					TV = R1[WS(rs, 11)];
+					TY = R1[WS(rs, 9)];
+					T26 = FNMS(KP198912367, T1E, T1F);
+					T1G = FMA(KP198912367, T1F, T1E);
+					T1k = FMA(KP668178637, Tr, Tu);
+					Tv = FNMS(KP668178637, Tu, Tr);
+					T17 = TU + TV;
+					TW = TU - TV;
+					TZ = R1[WS(rs, 1)];
+					T11 = R1[WS(rs, 5)];
+					T12 = R1[WS(rs, 13)];
+				   }
+				   {
+					E TX, T1a, T10, T19, T13, T1W, T18, T1b, T14, T1X;
+					T1T = FMS(KP707106781, TW, TT);
+					TX = FMA(KP707106781, TW, TT);
+					T1a = FNMS(KP414213562, TY, TZ);
+					T10 = FMA(KP414213562, TZ, TY);
+					T19 = FMS(KP414213562, T11, T12);
+					T13 = FMA(KP414213562, T12, T11);
+					T1W = FMA(KP707106781, T17, T16);
+					T18 = FNMS(KP707106781, T17, T16);
+					T1b = T19 - T1a;
+					T1U = T1a + T19;
+					T14 = T10 - T13;
+					T1X = T10 + T13;
+					T1s = FMA(KP923879532, T1b, T18);
+					T1c = FNMS(KP923879532, T1b, T18);
+					T2e = FMA(KP923879532, T1X, T1W);
+					T1Y = FNMS(KP923879532, T1X, T1W);
+					T1r = FNMS(KP923879532, T14, TX);
+					T15 = FMA(KP923879532, T14, TX);
+				   }
+			      }
+			      {
+				   E Ty, TL, TG, TM, TB, TH;
+				   Ty = R1[0];
+				   TL = R1[WS(rs, 8)];
+				   {
+					E Tz, TA, TD, TE;
+					Tz = R1[WS(rs, 4)];
+					T2d = FMA(KP923879532, T1U, T1T);
+					T1V = FNMS(KP923879532, T1U, T1T);
+					TA = R1[WS(rs, 12)];
+					TD = R1[WS(rs, 10)];
+					TE = R1[WS(rs, 2)];
+					TG = R1[WS(rs, 6)];
+					TM = Tz + TA;
+					TB = Tz - TA;
+					TP = FNMS(KP414213562, TD, TE);
+					TF = FMA(KP414213562, TE, TD);
+					TH = R1[WS(rs, 14)];
+				   }
+				   T1M = FMA(KP707106781, TB, Ty);
+				   TC = FNMS(KP707106781, TB, Ty);
+				   T1P = FMA(KP707106781, TM, TL);
+				   TN = FNMS(KP707106781, TM, TL);
+				   TO = FMS(KP414213562, TG, TH);
+				   TI = FMA(KP414213562, TH, TG);
+			      }
+			 }
+		    }
+		    {
+			 E T1j, T1O, T1p, T1R, T1o, T2E, T2D, T1m, T1D, T2w, T2v, T1K, T2i, T2c, T2h;
+			 E T29, T2t, T2r, T2f, T2j;
+			 {
+			      E T2a, T2b, T1g, TS, T1f, Tx, T2N, T2L, T1d, T1h;
+			      {
+				   E Td, TR, TK, Tw, T2J, T2K;
+				   T1j = FMA(KP923879532, Tc, T5);
+				   Td = FNMS(KP923879532, Tc, T5);
+				   {
+					E T1N, TQ, T1Q, TJ;
+					T1N = TP + TO;
+					TQ = TO - TP;
+					T1Q = TF + TI;
+					TJ = TF - TI;
+					T2a = FMA(KP923879532, T1N, T1M);
+					T1O = FNMS(KP923879532, T1N, T1M);
+					T1p = FMA(KP923879532, TQ, TN);
+					TR = FNMS(KP923879532, TQ, TN);
+					T2b = FMA(KP923879532, T1Q, T1P);
+					T1R = FNMS(KP923879532, T1Q, T1P);
+					T1o = FMA(KP923879532, TJ, TC);
+					TK = FNMS(KP923879532, TJ, TC);
+					Tw = Tm - Tv;
+					T2E = Tv + Tm;
+				   }
+				   T2D = FMA(KP923879532, T2C, T2B);
+				   T2J = FNMS(KP923879532, T2C, T2B);
+				   T2K = T1k + T1l;
+				   T1m = T1k - T1l;
+				   T1g = FMA(KP534511135, TK, TR);
+				   TS = FNMS(KP534511135, TR, TK);
+				   T1f = FNMS(KP831469612, Tw, Td);
+				   Tx = FMA(KP831469612, Tw, Td);
+				   T2N = FNMS(KP831469612, T2K, T2J);
+				   T2L = FMA(KP831469612, T2K, T2J);
+				   T1d = FNMS(KP534511135, T1c, T15);
+				   T1h = FMA(KP534511135, T15, T1c);
+			      }
+			      {
+				   E T25, T28, T2p, T2q;
+				   T1D = FNMS(KP923879532, T1C, T1z);
+				   T25 = FMA(KP923879532, T1C, T1z);
+				   {
+					E T2O, T1e, T2M, T1i;
+					T2O = TS + T1d;
+					T1e = TS - T1d;
+					T2M = T1g + T1h;
+					T1i = T1g - T1h;
+					Ci[WS(csi, 5)] = FNMS(KP881921264, T2O, T2N);
+					Ci[WS(csi, 10)] = -(FMA(KP881921264, T2O, T2N));
+					Cr[WS(csr, 2)] = FMA(KP881921264, T1e, Tx);
+					Cr[WS(csr, 13)] = FNMS(KP881921264, T1e, Tx);
+					Ci[WS(csi, 2)] = -(FMA(KP881921264, T2M, T2L));
+					Ci[WS(csi, 13)] = FNMS(KP881921264, T2M, T2L);
+					Cr[WS(csr, 5)] = FMA(KP881921264, T1i, T1f);
+					Cr[WS(csr, 10)] = FNMS(KP881921264, T1i, T1f);
+					T28 = T26 - T27;
+					T2w = T26 + T27;
+				   }
+				   T2v = FNMS(KP923879532, T2o, T2n);
+				   T2p = FMA(KP923879532, T2o, T2n);
+				   T2q = T1G + T1J;
+				   T1K = T1G - T1J;
+				   T2i = FMA(KP098491403, T2a, T2b);
+				   T2c = FNMS(KP098491403, T2b, T2a);
+				   T2h = FNMS(KP980785280, T28, T25);
+				   T29 = FMA(KP980785280, T28, T25);
+				   T2t = FNMS(KP980785280, T2q, T2p);
+				   T2r = FMA(KP980785280, T2q, T2p);
+				   T2f = FMA(KP098491403, T2e, T2d);
+				   T2j = FNMS(KP098491403, T2d, T2e);
+			      }
+			 }
+			 {
+			      E T1x, T1q, T1v, T1n, T2H, T2F, T1t, T1w;
+			      {
+				   E T2u, T2g, T2s, T2k;
+				   T2u = T2f - T2c;
+				   T2g = T2c + T2f;
+				   T2s = T2i + T2j;
+				   T2k = T2i - T2j;
+				   Ci[WS(csi, 7)] = FMA(KP995184726, T2u, T2t);
+				   Ci[WS(csi, 8)] = FMS(KP995184726, T2u, T2t);
+				   Cr[0] = FMA(KP995184726, T2g, T29);
+				   Cr[WS(csr, 15)] = FNMS(KP995184726, T2g, T29);
+				   Ci[0] = -(FMA(KP995184726, T2s, T2r));
+				   Ci[WS(csi, 15)] = FNMS(KP995184726, T2s, T2r);
+				   Cr[WS(csr, 7)] = FMA(KP995184726, T2k, T2h);
+				   Cr[WS(csr, 8)] = FNMS(KP995184726, T2k, T2h);
+			      }
+			      T1x = FNMS(KP303346683, T1o, T1p);
+			      T1q = FMA(KP303346683, T1p, T1o);
+			      T1v = FNMS(KP831469612, T1m, T1j);
+			      T1n = FMA(KP831469612, T1m, T1j);
+			      T2H = FNMS(KP831469612, T2E, T2D);
+			      T2F = FMA(KP831469612, T2E, T2D);
+			      T1t = FMA(KP303346683, T1s, T1r);
+			      T1w = FNMS(KP303346683, T1r, T1s);
+			      {
+				   E T2I, T1u, T2G, T1y;
+				   T2I = T1q + T1t;
+				   T1u = T1q - T1t;
+				   T2G = T1x + T1w;
+				   T1y = T1w - T1x;
+				   Ci[WS(csi, 6)] = -(FMA(KP956940335, T2I, T2H));
+				   Ci[WS(csi, 9)] = FNMS(KP956940335, T2I, T2H);
+				   Cr[WS(csr, 1)] = FMA(KP956940335, T1u, T1n);
+				   Cr[WS(csr, 14)] = FNMS(KP956940335, T1u, T1n);
+				   Ci[WS(csi, 1)] = FMA(KP956940335, T2G, T2F);
+				   Ci[WS(csi, 14)] = FMS(KP956940335, T2G, T2F);
+				   Cr[WS(csr, 6)] = FMA(KP956940335, T1y, T1v);
+				   Cr[WS(csr, 9)] = FNMS(KP956940335, T1y, T1v);
+			      }
+			      T23 = FNMS(KP820678790, T1O, T1R);
+			      T1S = FMA(KP820678790, T1R, T1O);
+			      T21 = FNMS(KP980785280, T1K, T1D);
+			      T1L = FMA(KP980785280, T1K, T1D);
+			      T2z = FMA(KP980785280, T2w, T2v);
+			      T2x = FNMS(KP980785280, T2w, T2v);
+			      T1Z = FNMS(KP820678790, T1Y, T1V);
+			      T22 = FMA(KP820678790, T1V, T1Y);
+			 }
+		    }
+	       }
+	       {
+		    E T20, T2A, T24, T2y;
+		    T20 = T1S + T1Z;
+		    T2A = T1Z - T1S;
+		    T24 = T22 - T23;
+		    T2y = T23 + T22;
+		    Ci[WS(csi, 4)] = FMS(KP773010453, T2A, T2z);
+		    Ci[WS(csi, 11)] = FMA(KP773010453, T2A, T2z);
+		    Cr[WS(csr, 3)] = FMA(KP773010453, T20, T1L);
+		    Cr[WS(csr, 12)] = FNMS(KP773010453, T20, T1L);
+		    Ci[WS(csi, 3)] = FMA(KP773010453, T2y, T2x);
+		    Ci[WS(csi, 12)] = FMS(KP773010453, T2y, T2x);
+		    Cr[WS(csr, 4)] = FMA(KP773010453, T24, T21);
+		    Cr[WS(csr, 11)] = FNMS(KP773010453, T24, T21);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cfII_32", {46, 0, 128, 0}, &GENUS };
+
+void X(codelet_r2cfII_32) (planner *p) {
+     X(kr2c_register) (p, r2cfII_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 174 FP additions, 82 FP multiplications,
+ * (or, 138 additions, 46 multiplications, 36 fused multiply/add),
+ * 62 stack variables, 15 constants, and 64 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T5, T2D, T1z, T2q, Tc, T2C, T1C, T2n, Tm, T1k, T1J, T26, Tv, T1l, T1G;
+	       E T27, T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d, TK, T1o, T1R, T2b, TR, T1p;
+	       E T1O, T2a;
+	       {
+		    E T1, T2p, T4, T2o, T2, T3;
+		    T1 = R0[0];
+		    T2p = R0[WS(rs, 8)];
+		    T2 = R0[WS(rs, 4)];
+		    T3 = R0[WS(rs, 12)];
+		    T4 = KP707106781 * (T2 - T3);
+		    T2o = KP707106781 * (T2 + T3);
+		    T5 = T1 + T4;
+		    T2D = T2p - T2o;
+		    T1z = T1 - T4;
+		    T2q = T2o + T2p;
+	       }
+	       {
+		    E T8, T1A, Tb, T1B;
+		    {
+			 E T6, T7, T9, Ta;
+			 T6 = R0[WS(rs, 2)];
+			 T7 = R0[WS(rs, 10)];
+			 T8 = FNMS(KP382683432, T7, KP923879532 * T6);
+			 T1A = FMA(KP382683432, T6, KP923879532 * T7);
+			 T9 = R0[WS(rs, 6)];
+			 Ta = R0[WS(rs, 14)];
+			 Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
+			 T1B = FMA(KP923879532, T9, KP382683432 * Ta);
+		    }
+		    Tc = T8 + Tb;
+		    T2C = Tb - T8;
+		    T1C = T1A - T1B;
+		    T2n = T1A + T1B;
+	       }
+	       {
+		    E Te, Tk, Th, Tj, Tf, Tg;
+		    Te = R0[WS(rs, 1)];
+		    Tk = R0[WS(rs, 9)];
+		    Tf = R0[WS(rs, 5)];
+		    Tg = R0[WS(rs, 13)];
+		    Th = KP707106781 * (Tf - Tg);
+		    Tj = KP707106781 * (Tf + Tg);
+		    {
+			 E Ti, Tl, T1H, T1I;
+			 Ti = Te + Th;
+			 Tl = Tj + Tk;
+			 Tm = FNMS(KP195090322, Tl, KP980785280 * Ti);
+			 T1k = FMA(KP195090322, Ti, KP980785280 * Tl);
+			 T1H = Tk - Tj;
+			 T1I = Te - Th;
+			 T1J = FNMS(KP555570233, T1I, KP831469612 * T1H);
+			 T26 = FMA(KP831469612, T1I, KP555570233 * T1H);
+		    }
+	       }
+	       {
+		    E Tq, Tt, Tp, Ts, Tn, To;
+		    Tq = R0[WS(rs, 15)];
+		    Tt = R0[WS(rs, 7)];
+		    Tn = R0[WS(rs, 3)];
+		    To = R0[WS(rs, 11)];
+		    Tp = KP707106781 * (Tn - To);
+		    Ts = KP707106781 * (Tn + To);
+		    {
+			 E Tr, Tu, T1E, T1F;
+			 Tr = Tp - Tq;
+			 Tu = Ts + Tt;
+			 Tv = FMA(KP980785280, Tr, KP195090322 * Tu);
+			 T1l = FNMS(KP980785280, Tu, KP195090322 * Tr);
+			 T1E = Tt - Ts;
+			 T1F = Tp + Tq;
+			 T1G = FNMS(KP555570233, T1F, KP831469612 * T1E);
+			 T27 = FMA(KP831469612, T1F, KP555570233 * T1E);
+		    }
+	       }
+	       {
+		    E TW, T1a, TV, T19, T10, T16, T13, T17, TT, TU;
+		    TW = R1[WS(rs, 15)];
+		    T1a = R1[WS(rs, 7)];
+		    TT = R1[WS(rs, 3)];
+		    TU = R1[WS(rs, 11)];
+		    TV = KP707106781 * (TT - TU);
+		    T19 = KP707106781 * (TT + TU);
+		    {
+			 E TY, TZ, T11, T12;
+			 TY = R1[WS(rs, 1)];
+			 TZ = R1[WS(rs, 9)];
+			 T10 = FNMS(KP382683432, TZ, KP923879532 * TY);
+			 T16 = FMA(KP382683432, TY, KP923879532 * TZ);
+			 T11 = R1[WS(rs, 5)];
+			 T12 = R1[WS(rs, 13)];
+			 T13 = FNMS(KP923879532, T12, KP382683432 * T11);
+			 T17 = FMA(KP923879532, T11, KP382683432 * T12);
+		    }
+		    {
+			 E TX, T14, T1W, T1X;
+			 TX = TV - TW;
+			 T14 = T10 + T13;
+			 T15 = TX + T14;
+			 T1r = TX - T14;
+			 T1W = T13 - T10;
+			 T1X = T1a - T19;
+			 T1Y = T1W - T1X;
+			 T2e = T1W + T1X;
+		    }
+		    {
+			 E T18, T1b, T1T, T1U;
+			 T18 = T16 + T17;
+			 T1b = T19 + T1a;
+			 T1c = T18 + T1b;
+			 T1s = T1b - T18;
+			 T1T = TV + TW;
+			 T1U = T16 - T17;
+			 T1V = T1T + T1U;
+			 T2d = T1U - T1T;
+		    }
+	       }
+	       {
+		    E Ty, TP, TB, TO, TF, TL, TI, TM, Tz, TA;
+		    Ty = R1[0];
+		    TP = R1[WS(rs, 8)];
+		    Tz = R1[WS(rs, 4)];
+		    TA = R1[WS(rs, 12)];
+		    TB = KP707106781 * (Tz - TA);
+		    TO = KP707106781 * (Tz + TA);
+		    {
+			 E TD, TE, TG, TH;
+			 TD = R1[WS(rs, 2)];
+			 TE = R1[WS(rs, 10)];
+			 TF = FNMS(KP382683432, TE, KP923879532 * TD);
+			 TL = FMA(KP382683432, TD, KP923879532 * TE);
+			 TG = R1[WS(rs, 6)];
+			 TH = R1[WS(rs, 14)];
+			 TI = FNMS(KP923879532, TH, KP382683432 * TG);
+			 TM = FMA(KP923879532, TG, KP382683432 * TH);
+		    }
+		    {
+			 E TC, TJ, T1P, T1Q;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T1o = TC - TJ;
+			 T1P = TI - TF;
+			 T1Q = TP - TO;
+			 T1R = T1P - T1Q;
+			 T2b = T1P + T1Q;
+		    }
+		    {
+			 E TN, TQ, T1M, T1N;
+			 TN = TL + TM;
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+			 T1p = TQ - TN;
+			 T1M = Ty - TB;
+			 T1N = TL - TM;
+			 T1O = T1M - T1N;
+			 T2a = T1M + T1N;
+		    }
+	       }
+	       {
+		    E Tx, T1f, T2s, T2u, T1e, T2l, T1i, T2t;
+		    {
+			 E Td, Tw, T2m, T2r;
+			 Td = T5 + Tc;
+			 Tw = Tm + Tv;
+			 Tx = Td - Tw;
+			 T1f = Td + Tw;
+			 T2m = T1l - T1k;
+			 T2r = T2n + T2q;
+			 T2s = T2m - T2r;
+			 T2u = T2m + T2r;
+		    }
+		    {
+			 E TS, T1d, T1g, T1h;
+			 TS = FMA(KP098017140, TK, KP995184726 * TR);
+			 T1d = FNMS(KP995184726, T1c, KP098017140 * T15);
+			 T1e = TS + T1d;
+			 T2l = T1d - TS;
+			 T1g = FNMS(KP098017140, TR, KP995184726 * TK);
+			 T1h = FMA(KP995184726, T15, KP098017140 * T1c);
+			 T1i = T1g + T1h;
+			 T2t = T1h - T1g;
+		    }
+		    Cr[WS(csr, 8)] = Tx - T1e;
+		    Ci[WS(csi, 8)] = T2t - T2u;
+		    Cr[WS(csr, 7)] = Tx + T1e;
+		    Ci[WS(csi, 7)] = T2t + T2u;
+		    Cr[WS(csr, 15)] = T1f - T1i;
+		    Ci[WS(csi, 15)] = T2l - T2s;
+		    Cr[0] = T1f + T1i;
+		    Ci[0] = T2l + T2s;
+	       }
+	       {
+		    E T29, T2h, T2M, T2O, T2g, T2J, T2k, T2N;
+		    {
+			 E T25, T28, T2K, T2L;
+			 T25 = T1z + T1C;
+			 T28 = T26 - T27;
+			 T29 = T25 + T28;
+			 T2h = T25 - T28;
+			 T2K = T1J + T1G;
+			 T2L = T2C + T2D;
+			 T2M = T2K - T2L;
+			 T2O = T2K + T2L;
+		    }
+		    {
+			 E T2c, T2f, T2i, T2j;
+			 T2c = FMA(KP956940335, T2a, KP290284677 * T2b);
+			 T2f = FNMS(KP290284677, T2e, KP956940335 * T2d);
+			 T2g = T2c + T2f;
+			 T2J = T2f - T2c;
+			 T2i = FMA(KP290284677, T2d, KP956940335 * T2e);
+			 T2j = FNMS(KP290284677, T2a, KP956940335 * T2b);
+			 T2k = T2i - T2j;
+			 T2N = T2j + T2i;
+		    }
+		    Cr[WS(csr, 14)] = T29 - T2g;
+		    Ci[WS(csi, 14)] = T2N - T2O;
+		    Cr[WS(csr, 1)] = T29 + T2g;
+		    Ci[WS(csi, 1)] = T2N + T2O;
+		    Cr[WS(csr, 9)] = T2h - T2k;
+		    Ci[WS(csi, 9)] = T2J - T2M;
+		    Cr[WS(csr, 6)] = T2h + T2k;
+		    Ci[WS(csi, 6)] = T2J + T2M;
+	       }
+	       {
+		    E T1n, T1v, T2y, T2A, T1u, T2v, T1y, T2z;
+		    {
+			 E T1j, T1m, T2w, T2x;
+			 T1j = T5 - Tc;
+			 T1m = T1k + T1l;
+			 T1n = T1j + T1m;
+			 T1v = T1j - T1m;
+			 T2w = Tv - Tm;
+			 T2x = T2q - T2n;
+			 T2y = T2w - T2x;
+			 T2A = T2w + T2x;
+		    }
+		    {
+			 E T1q, T1t, T1w, T1x;
+			 T1q = FMA(KP773010453, T1o, KP634393284 * T1p);
+			 T1t = FNMS(KP634393284, T1s, KP773010453 * T1r);
+			 T1u = T1q + T1t;
+			 T2v = T1t - T1q;
+			 T1w = FMA(KP634393284, T1r, KP773010453 * T1s);
+			 T1x = FNMS(KP634393284, T1o, KP773010453 * T1p);
+			 T1y = T1w - T1x;
+			 T2z = T1x + T1w;
+		    }
+		    Cr[WS(csr, 12)] = T1n - T1u;
+		    Ci[WS(csi, 12)] = T2z - T2A;
+		    Cr[WS(csr, 3)] = T1n + T1u;
+		    Ci[WS(csi, 3)] = T2z + T2A;
+		    Cr[WS(csr, 11)] = T1v - T1y;
+		    Ci[WS(csi, 11)] = T2v - T2y;
+		    Cr[WS(csr, 4)] = T1v + T1y;
+		    Ci[WS(csi, 4)] = T2v + T2y;
+	       }
+	       {
+		    E T1L, T21, T2G, T2I, T20, T2H, T24, T2B;
+		    {
+			 E T1D, T1K, T2E, T2F;
+			 T1D = T1z - T1C;
+			 T1K = T1G - T1J;
+			 T1L = T1D + T1K;
+			 T21 = T1D - T1K;
+			 T2E = T2C - T2D;
+			 T2F = T26 + T27;
+			 T2G = T2E - T2F;
+			 T2I = T2F + T2E;
+		    }
+		    {
+			 E T1S, T1Z, T22, T23;
+			 T1S = FMA(KP881921264, T1O, KP471396736 * T1R);
+			 T1Z = FMA(KP881921264, T1V, KP471396736 * T1Y);
+			 T20 = T1S - T1Z;
+			 T2H = T1S + T1Z;
+			 T22 = FNMS(KP471396736, T1V, KP881921264 * T1Y);
+			 T23 = FNMS(KP471396736, T1O, KP881921264 * T1R);
+			 T24 = T22 - T23;
+			 T2B = T23 + T22;
+		    }
+		    Cr[WS(csr, 13)] = T1L - T20;
+		    Ci[WS(csi, 13)] = T2B - T2G;
+		    Cr[WS(csr, 2)] = T1L + T20;
+		    Ci[WS(csi, 2)] = T2B + T2G;
+		    Cr[WS(csr, 10)] = T21 - T24;
+		    Ci[WS(csi, 10)] = T2I - T2H;
+		    Cr[WS(csr, 5)] = T21 + T24;
+		    Ci[WS(csi, 5)] = -(T2H + T2I);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cfII_32", {138, 46, 36, 0}, &GENUS };
+
+void X(codelet_r2cfII_32) (planner *p) {
+     X(kr2c_register) (p, r2cfII_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 6 FP additions, 4 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 4 fused multiply/add),
+ * 8 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T5, T2, T3, T4, T6;
+	       T1 = R0[0];
+	       T5 = R0[WS(rs, 1)];
+	       T2 = R1[0];
+	       T3 = R1[WS(rs, 1)];
+	       T4 = T2 - T3;
+	       T6 = T2 + T3;
+	       Ci[0] = -(FMA(KP707106781, T6, T5));
+	       Ci[WS(csi, 1)] = FNMS(KP707106781, T6, T5);
+	       Cr[0] = FMA(KP707106781, T4, T1);
+	       Cr[WS(csr, 1)] = FNMS(KP707106781, T4, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cfII_4", {2, 0, 4, 0}, &GENUS };
+
+void X(codelet_r2cfII_4) (planner *p) {
+     X(kr2c_register) (p, r2cfII_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 6 FP additions, 2 FP multiplications,
+ * (or, 6 additions, 2 multiplications, 0 fused multiply/add),
+ * 8 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T6, T4, T5, T2, T3;
+	       T1 = R0[0];
+	       T6 = R0[WS(rs, 1)];
+	       T2 = R1[0];
+	       T3 = R1[WS(rs, 1)];
+	       T4 = KP707106781 * (T2 - T3);
+	       T5 = KP707106781 * (T2 + T3);
+	       Cr[WS(csr, 1)] = T1 - T4;
+	       Ci[WS(csi, 1)] = T6 - T5;
+	       Cr[0] = T1 + T4;
+	       Ci[0] = -(T5 + T6);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cfII_4", {6, 2, 0, 0}, &GENUS };
+
+void X(codelet_r2cfII_4) (planner *p) {
+     X(kr2c_register) (p, r2cfII_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 12 FP additions, 7 FP multiplications,
+ * (or, 7 additions, 2 multiplications, 5 fused multiply/add),
+ * 17 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E T1, T2, T3, T5, T6;
+	       T1 = R0[0];
+	       T2 = R0[WS(rs, 1)];
+	       T3 = R1[WS(rs, 1)];
+	       T5 = R0[WS(rs, 2)];
+	       T6 = R1[0];
+	       {
+		    E Tb, T4, Tc, T7, Ta, T8, T9;
+		    Tb = T2 + T3;
+		    T4 = T2 - T3;
+		    Tc = T5 + T6;
+		    T7 = T5 - T6;
+		    Ci[0] = -(KP951056516 * (FMA(KP618033988, Tc, Tb)));
+		    Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, Tb, Tc)));
+		    Ta = T4 - T7;
+		    T8 = T4 + T7;
+		    T9 = FNMS(KP250000000, T8, T1);
+		    Cr[WS(csr, 2)] = T1 + T8;
+		    Cr[WS(csr, 1)] = FNMS(KP559016994, Ta, T9);
+		    Cr[0] = FMA(KP559016994, Ta, T9);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cfII_5", {7, 2, 5, 0}, &GENUS };
+
+void X(codelet_r2cfII_5) (planner *p) {
+     X(kr2c_register) (p, r2cfII_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 12 FP additions, 6 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 3 fused multiply/add),
+ * 17 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E T8, T3, T6, T9, Tc, Tb, T7, Ta;
+	       T8 = R0[0];
+	       {
+		    E T1, T2, T4, T5;
+		    T1 = R0[WS(rs, 1)];
+		    T2 = R1[WS(rs, 1)];
+		    T3 = T1 - T2;
+		    T4 = R0[WS(rs, 2)];
+		    T5 = R1[0];
+		    T6 = T4 - T5;
+		    T9 = T3 + T6;
+		    Tc = T4 + T5;
+		    Tb = T1 + T2;
+	       }
+	       Cr[WS(csr, 2)] = T8 + T9;
+	       Ci[WS(csi, 1)] = FNMS(KP951056516, Tc, KP587785252 * Tb);
+	       Ci[0] = -(FMA(KP951056516, Tb, KP587785252 * Tc));
+	       T7 = KP559016994 * (T3 - T6);
+	       Ta = FNMS(KP250000000, T9, T8);
+	       Cr[0] = T7 + Ta;
+	       Cr[WS(csr, 1)] = Ta - T7;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cfII_5", {9, 3, 3, 0}, &GENUS };
+
+void X(codelet_r2cfII_5) (planner *p) {
+     X(kr2c_register) (p, r2cfII_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 13 FP additions, 6 FP multiplications,
+ * (or, 7 additions, 0 multiplications, 6 fused multiply/add),
+ * 15 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T1, T9, T2, T3, T6, T7;
+	       T1 = R0[0];
+	       T9 = R1[WS(rs, 1)];
+	       T2 = R0[WS(rs, 2)];
+	       T3 = R0[WS(rs, 1)];
+	       T6 = R1[WS(rs, 2)];
+	       T7 = R1[0];
+	       {
+		    E Tc, T4, Ta, T8, T5, Tb;
+		    Cr[WS(csr, 1)] = T1 + T2 - T3;
+		    Tc = T2 + T3;
+		    T4 = T3 - T2;
+		    Ta = T6 + T7;
+		    T8 = T6 - T7;
+		    T5 = FMA(KP500000000, T4, T1);
+		    Tb = FMA(KP500000000, Ta, T9);
+		    Ci[WS(csi, 1)] = T9 - Ta;
+		    Cr[WS(csr, 2)] = FMA(KP866025403, T8, T5);
+		    Cr[0] = FNMS(KP866025403, T8, T5);
+		    Ci[WS(csi, 2)] = FMS(KP866025403, Tc, Tb);
+		    Ci[0] = -(FMA(KP866025403, Tc, Tb));
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cfII_6", {7, 0, 6, 0}, &GENUS };
+
+void X(codelet_r2cfII_6) (planner *p) {
+     X(kr2c_register) (p, r2cfII_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 13 FP additions, 4 FP multiplications,
+ * (or, 11 additions, 2 multiplications, 2 fused multiply/add),
+ * 14 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E Ta, T7, T9, T1, T3, T2, T8, T4, T5, T6, Tb;
+	       Ta = R1[WS(rs, 1)];
+	       T5 = R1[WS(rs, 2)];
+	       T6 = R1[0];
+	       T7 = KP866025403 * (T5 - T6);
+	       T9 = T5 + T6;
+	       T1 = R0[0];
+	       T3 = R0[WS(rs, 1)];
+	       T2 = R0[WS(rs, 2)];
+	       T8 = KP866025403 * (T2 + T3);
+	       T4 = FMA(KP500000000, T3 - T2, T1);
+	       Cr[0] = T4 - T7;
+	       Cr[WS(csr, 2)] = T4 + T7;
+	       Ci[WS(csi, 1)] = Ta - T9;
+	       Cr[WS(csr, 1)] = T1 + T2 - T3;
+	       Tb = FMA(KP500000000, T9, Ta);
+	       Ci[0] = -(T8 + Tb);
+	       Ci[WS(csi, 2)] = T8 - Tb;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cfII_6", {11, 2, 2, 0}, &GENUS };
+
+void X(codelet_r2cfII_6) (planner *p) {
+     X(kr2c_register) (p, r2cfII_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1535 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:17 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -name r2cfII_64 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 434 FP additions, 320 FP multiplications,
+ * (or, 114 additions, 0 multiplications, 320 fused multiply/add),
+ * 158 stack variables, 31 constants, and 128 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T5b, T6q, T6p, T5e;
+	       {
+		    E T5h, T3Z, T35, Tm, T5g, T3W, T34, Tv, T5f, T3T, T6N, T6z, T6j, T65, T33;
+		    E Td, T5z, T4D, T3q, T2C, T5C, T4O, T3n, T2b, T5k, T4b, T3c, TR, T5l, T4e;
+		    E T3b, TK, T5n, T44, T39, T1c, T5o, T47, T38, T15, T5s, T4k, T3j, T1T, T5v;
+		    E T4v, T3g, T1s, T1t, T1y, T5D, T4K, T5A, T4R, T3o, T2F, T3r, T2u, T1C, T1H;
+		    E T1D, T1z, T1w, T1E;
+		    {
+			 E T2A, T26, T4B, T23, T4M, T2y, T2z, T29;
+			 {
+			      E Te, Tj, Tn, Ts, To, Tk, Th, Tp, Tf, Tg;
+			      Te = R0[WS(rs, 14)];
+			      Tj = R0[WS(rs, 30)];
+			      Tf = R0[WS(rs, 6)];
+			      Tg = R0[WS(rs, 22)];
+			      Tn = R0[WS(rs, 18)];
+			      Ts = R0[WS(rs, 2)];
+			      To = R0[WS(rs, 10)];
+			      Tk = Tg - Tf;
+			      Th = Tf + Tg;
+			      Tp = R0[WS(rs, 26)];
+			      {
+				   E T3Q, T8, T3P, T5, T6x, T63, T3R, Tb;
+				   {
+					E T1, T61, T9, T62, T4, Ta;
+					{
+					     E T3V, Tu, T3U, Tr, T3Y, Tl;
+					     T1 = R0[0];
+					     T3Y = FMA(KP707106781, Tk, Tj);
+					     Tl = FNMS(KP707106781, Tk, Tj);
+					     {
+						  E T3X, Ti, Tt, Tq;
+						  T3X = FMA(KP707106781, Th, Te);
+						  Ti = FNMS(KP707106781, Th, Te);
+						  Tt = To - Tp;
+						  Tq = To + Tp;
+						  T5h = FNMS(KP198912367, T3X, T3Y);
+						  T3Z = FMA(KP198912367, T3Y, T3X);
+						  T35 = FMA(KP668178637, Ti, Tl);
+						  Tm = FNMS(KP668178637, Tl, Ti);
+						  T3V = FMA(KP707106781, Tt, Ts);
+						  Tu = FNMS(KP707106781, Tt, Ts);
+						  T3U = FMA(KP707106781, Tq, Tn);
+						  Tr = FNMS(KP707106781, Tq, Tn);
+						  T61 = R0[WS(rs, 16)];
+					     }
+					     {
+						  E T2, T3, T6, T7;
+						  T2 = R0[WS(rs, 8)];
+						  T5g = FNMS(KP198912367, T3U, T3V);
+						  T3W = FMA(KP198912367, T3V, T3U);
+						  T34 = FMA(KP668178637, Tr, Tu);
+						  Tv = FNMS(KP668178637, Tu, Tr);
+						  T3 = R0[WS(rs, 24)];
+						  T6 = R0[WS(rs, 20)];
+						  T7 = R0[WS(rs, 4)];
+						  T9 = R0[WS(rs, 12)];
+						  T62 = T2 + T3;
+						  T4 = T2 - T3;
+						  T3Q = FNMS(KP414213562, T6, T7);
+						  T8 = FMA(KP414213562, T7, T6);
+						  Ta = R0[WS(rs, 28)];
+					     }
+					}
+					T3P = FMA(KP707106781, T4, T1);
+					T5 = FNMS(KP707106781, T4, T1);
+					T6x = FNMS(KP707106781, T62, T61);
+					T63 = FMA(KP707106781, T62, T61);
+					T3R = FMS(KP414213562, T9, Ta);
+					Tb = FMA(KP414213562, Ta, T9);
+				   }
+				   {
+					E T1Z, T2w, T27, T2x, T22, T28;
+					T1Z = R1[WS(rs, 31)];
+					{
+					     E T3S, T6y, T64, Tc;
+					     T3S = T3Q + T3R;
+					     T6y = T3R - T3Q;
+					     T64 = T8 + Tb;
+					     Tc = T8 - Tb;
+					     T5f = FMA(KP923879532, T3S, T3P);
+					     T3T = FNMS(KP923879532, T3S, T3P);
+					     T6N = FNMS(KP923879532, T6y, T6x);
+					     T6z = FMA(KP923879532, T6y, T6x);
+					     T6j = FNMS(KP923879532, T64, T63);
+					     T65 = FMA(KP923879532, T64, T63);
+					     T33 = FMA(KP923879532, Tc, T5);
+					     Td = FNMS(KP923879532, Tc, T5);
+					     T2w = R1[WS(rs, 15)];
+					}
+					{
+					     E T20, T21, T24, T25;
+					     T20 = R1[WS(rs, 7)];
+					     T21 = R1[WS(rs, 23)];
+					     T24 = R1[WS(rs, 19)];
+					     T25 = R1[WS(rs, 3)];
+					     T27 = R1[WS(rs, 11)];
+					     T2x = T20 + T21;
+					     T22 = T20 - T21;
+					     T2A = FNMS(KP414213562, T24, T25);
+					     T26 = FMA(KP414213562, T25, T24);
+					     T28 = R1[WS(rs, 27)];
+					}
+					T4B = FMS(KP707106781, T22, T1Z);
+					T23 = FMA(KP707106781, T22, T1Z);
+					T4M = FMA(KP707106781, T2x, T2w);
+					T2y = FNMS(KP707106781, T2x, T2w);
+					T2z = FMS(KP414213562, T27, T28);
+					T29 = FMA(KP414213562, T28, T27);
+				   }
+			      }
+			 }
+			 {
+			      E T1a, T10, T42, TX, T45, T18, T19, T13;
+			      {
+				   E TP, TF, T49, TC, T4c, TN, TO, TI;
+				   {
+					E Ty, TL, TG, TM, TB, TH;
+					Ty = R0[WS(rs, 17)];
+					{
+					     E T4C, T2B, T4N, T2a;
+					     T4C = T2A + T2z;
+					     T2B = T2z - T2A;
+					     T4N = T26 + T29;
+					     T2a = T26 - T29;
+					     T5z = FMA(KP923879532, T4C, T4B);
+					     T4D = FNMS(KP923879532, T4C, T4B);
+					     T3q = FMA(KP923879532, T2B, T2y);
+					     T2C = FNMS(KP923879532, T2B, T2y);
+					     T5C = FMA(KP923879532, T4N, T4M);
+					     T4O = FNMS(KP923879532, T4N, T4M);
+					     T3n = FNMS(KP923879532, T2a, T23);
+					     T2b = FMA(KP923879532, T2a, T23);
+					     TL = R0[WS(rs, 1)];
+					}
+					{
+					     E Tz, TA, TD, TE;
+					     Tz = R0[WS(rs, 9)];
+					     TA = R0[WS(rs, 25)];
+					     TD = R0[WS(rs, 29)];
+					     TE = R0[WS(rs, 13)];
+					     TG = R0[WS(rs, 5)];
+					     TM = Tz - TA;
+					     TB = Tz + TA;
+					     TP = FMA(KP414213562, TD, TE);
+					     TF = FMS(KP414213562, TE, TD);
+					     TH = R0[WS(rs, 21)];
+					}
+					T49 = FMA(KP707106781, TB, Ty);
+					TC = FNMS(KP707106781, TB, Ty);
+					T4c = FMA(KP707106781, TM, TL);
+					TN = FNMS(KP707106781, TM, TL);
+					TO = FMA(KP414213562, TG, TH);
+					TI = FNMS(KP414213562, TH, TG);
+				   }
+				   {
+					E TT, T16, T11, T17, TW, T12;
+					TT = R0[WS(rs, 15)];
+					{
+					     E T4a, TQ, T4d, TJ;
+					     T4a = TO + TP;
+					     TQ = TO - TP;
+					     T4d = TI + TF;
+					     TJ = TF - TI;
+					     T5k = FMA(KP923879532, T4a, T49);
+					     T4b = FNMS(KP923879532, T4a, T49);
+					     T3c = FMA(KP923879532, TQ, TN);
+					     TR = FNMS(KP923879532, TQ, TN);
+					     T5l = FMA(KP923879532, T4d, T4c);
+					     T4e = FNMS(KP923879532, T4d, T4c);
+					     T3b = FMA(KP923879532, TJ, TC);
+					     TK = FNMS(KP923879532, TJ, TC);
+					     T16 = R0[WS(rs, 31)];
+					}
+					{
+					     E TU, TV, TY, TZ;
+					     TU = R0[WS(rs, 7)];
+					     TV = R0[WS(rs, 23)];
+					     TY = R0[WS(rs, 3)];
+					     TZ = R0[WS(rs, 19)];
+					     T11 = R0[WS(rs, 27)];
+					     T17 = TV - TU;
+					     TW = TU + TV;
+					     T1a = FMA(KP414213562, TY, TZ);
+					     T10 = FMS(KP414213562, TZ, TY);
+					     T12 = R0[WS(rs, 11)];
+					}
+					T42 = FMA(KP707106781, TW, TT);
+					TX = FNMS(KP707106781, TW, TT);
+					T45 = FMA(KP707106781, T17, T16);
+					T18 = FNMS(KP707106781, T17, T16);
+					T19 = FMA(KP414213562, T11, T12);
+					T13 = FNMS(KP414213562, T12, T11);
+				   }
+			      }
+			      {
+				   E T1R, T1n, T4i, T1k, T4t, T1P, T1Q, T1q;
+				   {
+					E T1g, T1N, T1o, T1O, T1j, T1p;
+					T1g = R1[0];
+					{
+					     E T43, T1b, T46, T14;
+					     T43 = T1a + T19;
+					     T1b = T19 - T1a;
+					     T46 = T10 + T13;
+					     T14 = T10 - T13;
+					     T5n = FMA(KP923879532, T43, T42);
+					     T44 = FNMS(KP923879532, T43, T42);
+					     T39 = FMA(KP923879532, T1b, T18);
+					     T1c = FNMS(KP923879532, T1b, T18);
+					     T5o = FMA(KP923879532, T46, T45);
+					     T47 = FNMS(KP923879532, T46, T45);
+					     T38 = FMA(KP923879532, T14, TX);
+					     T15 = FNMS(KP923879532, T14, TX);
+					     T1N = R1[WS(rs, 16)];
+					}
+					{
+					     E T1h, T1i, T1l, T1m;
+					     T1h = R1[WS(rs, 8)];
+					     T1i = R1[WS(rs, 24)];
+					     T1l = R1[WS(rs, 20)];
+					     T1m = R1[WS(rs, 4)];
+					     T1o = R1[WS(rs, 12)];
+					     T1O = T1h + T1i;
+					     T1j = T1h - T1i;
+					     T1R = FNMS(KP414213562, T1l, T1m);
+					     T1n = FMA(KP414213562, T1m, T1l);
+					     T1p = R1[WS(rs, 28)];
+					}
+					T4i = FMA(KP707106781, T1j, T1g);
+					T1k = FNMS(KP707106781, T1j, T1g);
+					T4t = FMA(KP707106781, T1O, T1N);
+					T1P = FNMS(KP707106781, T1O, T1N);
+					T1Q = FMS(KP414213562, T1o, T1p);
+					T1q = FMA(KP414213562, T1p, T1o);
+				   }
+				   {
+					E T2c, T2h, T2l, T2q, T2m, T2i, T2f, T2n, T2d, T2e;
+					T2c = R1[WS(rs, 13)];
+					{
+					     E T4j, T1S, T4u, T1r;
+					     T4j = T1R + T1Q;
+					     T1S = T1Q - T1R;
+					     T4u = T1n + T1q;
+					     T1r = T1n - T1q;
+					     T5s = FMA(KP923879532, T4j, T4i);
+					     T4k = FNMS(KP923879532, T4j, T4i);
+					     T3j = FMA(KP923879532, T1S, T1P);
+					     T1T = FNMS(KP923879532, T1S, T1P);
+					     T5v = FMA(KP923879532, T4u, T4t);
+					     T4v = FNMS(KP923879532, T4u, T4t);
+					     T3g = FMA(KP923879532, T1r, T1k);
+					     T1s = FNMS(KP923879532, T1r, T1k);
+					     T2h = R1[WS(rs, 29)];
+					     T2d = R1[WS(rs, 5)];
+					     T2e = R1[WS(rs, 21)];
+					}
+					T2l = R1[WS(rs, 17)];
+					T2q = R1[WS(rs, 1)];
+					T2m = R1[WS(rs, 9)];
+					T2i = T2d - T2e;
+					T2f = T2d + T2e;
+					T2n = R1[WS(rs, 25)];
+					{
+					     E T1u, T1v, T2j, T4I;
+					     T1t = R1[WS(rs, 14)];
+					     T2j = FMA(KP707106781, T2i, T2h);
+					     T4I = FMS(KP707106781, T2i, T2h);
+					     {
+						  E T4H, T2g, T2r, T2o;
+						  T4H = FMA(KP707106781, T2f, T2c);
+						  T2g = FNMS(KP707106781, T2f, T2c);
+						  T2r = T2m - T2n;
+						  T2o = T2m + T2n;
+						  {
+						       E T4J, T4P, T2E, T2k;
+						       T4J = FNMS(KP198912367, T4I, T4H);
+						       T4P = FMA(KP198912367, T4H, T4I);
+						       T2E = FMA(KP668178637, T2g, T2j);
+						       T2k = FNMS(KP668178637, T2j, T2g);
+						       {
+							    E T2s, T4F, T4E, T2p;
+							    T2s = FNMS(KP707106781, T2r, T2q);
+							    T4F = FMA(KP707106781, T2r, T2q);
+							    T4E = FMA(KP707106781, T2o, T2l);
+							    T2p = FNMS(KP707106781, T2o, T2l);
+							    T1y = R1[WS(rs, 30)];
+							    T1u = R1[WS(rs, 6)];
+							    {
+								 E T4G, T4Q, T2D, T2t;
+								 T4G = FMA(KP198912367, T4F, T4E);
+								 T4Q = FNMS(KP198912367, T4E, T4F);
+								 T2D = FMA(KP668178637, T2p, T2s);
+								 T2t = FNMS(KP668178637, T2s, T2p);
+								 T5D = T4G + T4J;
+								 T4K = T4G - T4J;
+								 T5A = T4Q + T4P;
+								 T4R = T4P - T4Q;
+								 T3o = T2D - T2E;
+								 T2F = T2D + T2E;
+								 T3r = T2t + T2k;
+								 T2u = T2k - T2t;
+								 T1v = R1[WS(rs, 22)];
+							    }
+						       }
+						  }
+					     }
+					     T1C = R1[WS(rs, 18)];
+					     T1H = R1[WS(rs, 2)];
+					     T1D = R1[WS(rs, 10)];
+					     T1z = T1u - T1v;
+					     T1w = T1u + T1v;
+					     T1E = R1[WS(rs, 26)];
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T6A, T4r, T4y, T3h, T3k, T36, T6k, T40, T5X, T6c, T6b, T60;
+			 {
+			      E T5w, T5t, T2Z, T6U, T6T, T32;
+			      {
+				   E Tx, T2N, T2v, T6V, T6P, T6Q, T1e, T2G, T31, T2X, T2L, T1Y, T6W, T2Q, T30;
+				   E T2U;
+				   {
+					E T1W, T1L, T2O, T2P, T2V, T2W, T6O, TS, T1d;
+					{
+					     E T4q, T4w, T1V, T1B, T1J, T4m, T4l, T1G, Tw, T1A, T4p;
+					     T6A = Tv + Tm;
+					     Tw = Tm - Tv;
+					     T1A = FMA(KP707106781, T1z, T1y);
+					     T4p = FMS(KP707106781, T1z, T1y);
+					     {
+						  E T4o, T1x, T1I, T1F;
+						  T4o = FMA(KP707106781, T1w, T1t);
+						  T1x = FNMS(KP707106781, T1w, T1t);
+						  T1I = T1D - T1E;
+						  T1F = T1D + T1E;
+						  T4q = FNMS(KP198912367, T4p, T4o);
+						  T4w = FMA(KP198912367, T4o, T4p);
+						  T1V = FMA(KP668178637, T1x, T1A);
+						  T1B = FNMS(KP668178637, T1A, T1x);
+						  T1J = FNMS(KP707106781, T1I, T1H);
+						  T4m = FMA(KP707106781, T1I, T1H);
+						  T4l = FMA(KP707106781, T1F, T1C);
+						  T1G = FNMS(KP707106781, T1F, T1C);
+						  Tx = FNMS(KP831469612, Tw, Td);
+						  T2N = FMA(KP831469612, Tw, Td);
+					     }
+					     {
+						  E T4n, T4x, T1U, T1K;
+						  T4n = FMA(KP198912367, T4m, T4l);
+						  T4x = FNMS(KP198912367, T4l, T4m);
+						  T1U = FMA(KP668178637, T1G, T1J);
+						  T1K = FNMS(KP668178637, T1J, T1G);
+						  T5w = T4n + T4q;
+						  T4r = T4n - T4q;
+						  T5t = T4x + T4w;
+						  T4y = T4w - T4x;
+						  T3h = T1U - T1V;
+						  T1W = T1U + T1V;
+						  T3k = T1K + T1B;
+						  T1L = T1B - T1K;
+						  T6O = T34 + T35;
+						  T36 = T34 - T35;
+					     }
+					}
+					T2O = FNMS(KP534511135, TK, TR);
+					TS = FMA(KP534511135, TR, TK);
+					T1d = FMA(KP534511135, T1c, T15);
+					T2P = FNMS(KP534511135, T15, T1c);
+					T2v = FMA(KP831469612, T2u, T2b);
+					T2V = FNMS(KP831469612, T2u, T2b);
+					T6V = FNMS(KP831469612, T6O, T6N);
+					T6P = FMA(KP831469612, T6O, T6N);
+					T6Q = TS + T1d;
+					T1e = TS - T1d;
+					T2W = FMA(KP831469612, T2F, T2C);
+					T2G = FNMS(KP831469612, T2F, T2C);
+					{
+					     E T2S, T2T, T1M, T1X;
+					     T2S = FMA(KP831469612, T1L, T1s);
+					     T1M = FNMS(KP831469612, T1L, T1s);
+					     T1X = FNMS(KP831469612, T1W, T1T);
+					     T2T = FMA(KP831469612, T1W, T1T);
+					     T31 = FMA(KP250486960, T2V, T2W);
+					     T2X = FNMS(KP250486960, T2W, T2V);
+					     T2L = FNMS(KP599376933, T1M, T1X);
+					     T1Y = FMA(KP599376933, T1X, T1M);
+					     T6W = T2O + T2P;
+					     T2Q = T2O - T2P;
+					     T30 = FMA(KP250486960, T2S, T2T);
+					     T2U = FNMS(KP250486960, T2T, T2S);
+					}
+				   }
+				   {
+					E T2J, T1f, T6X, T6Z, T2K, T2H;
+					T2J = FNMS(KP881921264, T1e, Tx);
+					T1f = FMA(KP881921264, T1e, Tx);
+					T6X = FNMS(KP881921264, T6W, T6V);
+					T6Z = FMA(KP881921264, T6W, T6V);
+					T2K = FNMS(KP599376933, T2v, T2G);
+					T2H = FMA(KP599376933, T2G, T2v);
+					{
+					     E T2R, T2Y, T6R, T6S;
+					     T2Z = FNMS(KP881921264, T2Q, T2N);
+					     T2R = FMA(KP881921264, T2Q, T2N);
+					     {
+						  E T2M, T6Y, T70, T2I;
+						  T2M = T2K - T2L;
+						  T6Y = T2L + T2K;
+						  T70 = T1Y + T2H;
+						  T2I = T1Y - T2H;
+						  Cr[WS(csr, 10)] = FMA(KP857728610, T2M, T2J);
+						  Cr[WS(csr, 21)] = FNMS(KP857728610, T2M, T2J);
+						  Ci[WS(csi, 5)] = FMA(KP857728610, T6Y, T6X);
+						  Ci[WS(csi, 26)] = FMS(KP857728610, T6Y, T6X);
+						  Ci[WS(csi, 21)] = FNMS(KP857728610, T70, T6Z);
+						  Ci[WS(csi, 10)] = -(FMA(KP857728610, T70, T6Z));
+						  Cr[WS(csr, 5)] = FMA(KP857728610, T2I, T1f);
+						  Cr[WS(csr, 26)] = FNMS(KP857728610, T2I, T1f);
+						  T2Y = T2U - T2X;
+						  T6U = T2U + T2X;
+					     }
+					     T6T = FNMS(KP881921264, T6Q, T6P);
+					     T6R = FMA(KP881921264, T6Q, T6P);
+					     T6S = T30 + T31;
+					     T32 = T30 - T31;
+					     Cr[WS(csr, 2)] = FMA(KP970031253, T2Y, T2R);
+					     Cr[WS(csr, 29)] = FNMS(KP970031253, T2Y, T2R);
+					     Ci[WS(csi, 29)] = FNMS(KP970031253, T6S, T6R);
+					     Ci[WS(csi, 2)] = -(FMA(KP970031253, T6S, T6R));
+					}
+				   }
+			      }
+			      {
+				   E T5j, T5L, T5B, T6d, T67, T68, T5q, T5E, T5Z, T5V, T5J, T5y, T6e, T5O, T5Y;
+				   E T5S;
+				   {
+					E T5M, T5N, T5T, T5U;
+					{
+					     E T66, T5i, T5m, T5p;
+					     T6k = T5g + T5h;
+					     T5i = T5g - T5h;
+					     Cr[WS(csr, 13)] = FMA(KP970031253, T32, T2Z);
+					     Cr[WS(csr, 18)] = FNMS(KP970031253, T32, T2Z);
+					     Ci[WS(csi, 13)] = FNMS(KP970031253, T6U, T6T);
+					     Ci[WS(csi, 18)] = -(FMA(KP970031253, T6U, T6T));
+					     T5j = FNMS(KP980785280, T5i, T5f);
+					     T5L = FMA(KP980785280, T5i, T5f);
+					     T66 = T3W + T3Z;
+					     T40 = T3W - T3Z;
+					     T5M = FNMS(KP098491403, T5k, T5l);
+					     T5m = FMA(KP098491403, T5l, T5k);
+					     T5p = FMA(KP098491403, T5o, T5n);
+					     T5N = FNMS(KP098491403, T5n, T5o);
+					     T5B = FNMS(KP980785280, T5A, T5z);
+					     T5T = FMA(KP980785280, T5A, T5z);
+					     T6d = FNMS(KP980785280, T66, T65);
+					     T67 = FMA(KP980785280, T66, T65);
+					     T68 = T5m + T5p;
+					     T5q = T5m - T5p;
+					     T5U = FMA(KP980785280, T5D, T5C);
+					     T5E = FNMS(KP980785280, T5D, T5C);
+					}
+					{
+					     E T5Q, T5R, T5u, T5x;
+					     T5Q = FMA(KP980785280, T5t, T5s);
+					     T5u = FNMS(KP980785280, T5t, T5s);
+					     T5x = FNMS(KP980785280, T5w, T5v);
+					     T5R = FMA(KP980785280, T5w, T5v);
+					     T5Z = FNMS(KP049126849, T5T, T5U);
+					     T5V = FMA(KP049126849, T5U, T5T);
+					     T5J = FNMS(KP906347169, T5u, T5x);
+					     T5y = FMA(KP906347169, T5x, T5u);
+					     T6e = T5M + T5N;
+					     T5O = T5M - T5N;
+					     T5Y = FMA(KP049126849, T5Q, T5R);
+					     T5S = FNMS(KP049126849, T5R, T5Q);
+					}
+				   }
+				   {
+					E T5H, T5r, T6f, T6h, T5I, T5F;
+					T5H = FNMS(KP995184726, T5q, T5j);
+					T5r = FMA(KP995184726, T5q, T5j);
+					T6f = FNMS(KP995184726, T6e, T6d);
+					T6h = FMA(KP995184726, T6e, T6d);
+					T5I = FMA(KP906347169, T5B, T5E);
+					T5F = FNMS(KP906347169, T5E, T5B);
+					{
+					     E T5P, T5W, T69, T6a;
+					     T5X = FNMS(KP995184726, T5O, T5L);
+					     T5P = FMA(KP995184726, T5O, T5L);
+					     {
+						  E T5K, T6g, T6i, T5G;
+						  T5K = T5I - T5J;
+						  T6g = T5J + T5I;
+						  T6i = T5F - T5y;
+						  T5G = T5y + T5F;
+						  Cr[WS(csr, 8)] = FMA(KP740951125, T5K, T5H);
+						  Cr[WS(csr, 23)] = FNMS(KP740951125, T5K, T5H);
+						  Ci[WS(csi, 7)] = FMA(KP740951125, T6g, T6f);
+						  Ci[WS(csi, 24)] = FMS(KP740951125, T6g, T6f);
+						  Ci[WS(csi, 23)] = FMA(KP740951125, T6i, T6h);
+						  Ci[WS(csi, 8)] = FMS(KP740951125, T6i, T6h);
+						  Cr[WS(csr, 7)] = FMA(KP740951125, T5G, T5r);
+						  Cr[WS(csr, 24)] = FNMS(KP740951125, T5G, T5r);
+						  T5W = T5S + T5V;
+						  T6c = T5V - T5S;
+					     }
+					     T6b = FNMS(KP995184726, T68, T67);
+					     T69 = FMA(KP995184726, T68, T67);
+					     T6a = T5Y + T5Z;
+					     T60 = T5Y - T5Z;
+					     Cr[0] = FMA(KP998795456, T5W, T5P);
+					     Cr[WS(csr, 31)] = FNMS(KP998795456, T5W, T5P);
+					     Ci[WS(csi, 31)] = FNMS(KP998795456, T6a, T69);
+					     Ci[0] = -(FMA(KP998795456, T6a, T69));
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T3L, T6G, T6F, T3O;
+			      {
+				   E T37, T3z, T3p, T6H, T6B, T6C, T3e, T3s, T3M, T3J, T3w, T3m, T6I, T3C, T3N;
+				   E T3G;
+				   {
+					E T3B, T3A, T3H, T3I, T3a, T3d;
+					Cr[WS(csr, 15)] = FMA(KP998795456, T60, T5X);
+					Cr[WS(csr, 16)] = FNMS(KP998795456, T60, T5X);
+					Ci[WS(csi, 15)] = FMA(KP998795456, T6c, T6b);
+					Ci[WS(csi, 16)] = FMS(KP998795456, T6c, T6b);
+					T37 = FNMS(KP831469612, T36, T33);
+					T3z = FMA(KP831469612, T36, T33);
+					T3B = FMA(KP303346683, T38, T39);
+					T3a = FNMS(KP303346683, T39, T38);
+					T3d = FNMS(KP303346683, T3c, T3b);
+					T3A = FMA(KP303346683, T3b, T3c);
+					T3p = FMA(KP831469612, T3o, T3n);
+					T3H = FNMS(KP831469612, T3o, T3n);
+					T6H = FNMS(KP831469612, T6A, T6z);
+					T6B = FMA(KP831469612, T6A, T6z);
+					T6C = T3d + T3a;
+					T3e = T3a - T3d;
+					T3I = FMA(KP831469612, T3r, T3q);
+					T3s = FNMS(KP831469612, T3r, T3q);
+					{
+					     E T3E, T3F, T3i, T3l;
+					     T3E = FMA(KP831469612, T3h, T3g);
+					     T3i = FNMS(KP831469612, T3h, T3g);
+					     T3l = FNMS(KP831469612, T3k, T3j);
+					     T3F = FMA(KP831469612, T3k, T3j);
+					     T3M = FNMS(KP148335987, T3H, T3I);
+					     T3J = FMA(KP148335987, T3I, T3H);
+					     T3w = FMA(KP741650546, T3i, T3l);
+					     T3m = FNMS(KP741650546, T3l, T3i);
+					     T6I = T3A + T3B;
+					     T3C = T3A - T3B;
+					     T3N = FNMS(KP148335987, T3E, T3F);
+					     T3G = FMA(KP148335987, T3F, T3E);
+					}
+				   }
+				   {
+					E T3v, T3f, T6J, T6L, T3x, T3t;
+					T3v = FNMS(KP956940335, T3e, T37);
+					T3f = FMA(KP956940335, T3e, T37);
+					T6J = FMA(KP956940335, T6I, T6H);
+					T6L = FNMS(KP956940335, T6I, T6H);
+					T3x = FMA(KP741650546, T3p, T3s);
+					T3t = FNMS(KP741650546, T3s, T3p);
+					{
+					     E T3D, T3K, T6D, T6E;
+					     T3L = FNMS(KP956940335, T3C, T3z);
+					     T3D = FMA(KP956940335, T3C, T3z);
+					     {
+						  E T3y, T6K, T6M, T3u;
+						  T3y = T3w - T3x;
+						  T6K = T3w + T3x;
+						  T6M = T3m + T3t;
+						  T3u = T3m - T3t;
+						  Cr[WS(csr, 9)] = FMA(KP803207531, T3y, T3v);
+						  Cr[WS(csr, 22)] = FNMS(KP803207531, T3y, T3v);
+						  Ci[WS(csi, 25)] = FNMS(KP803207531, T6K, T6J);
+						  Ci[WS(csi, 6)] = -(FMA(KP803207531, T6K, T6J));
+						  Ci[WS(csi, 9)] = FNMS(KP803207531, T6M, T6L);
+						  Ci[WS(csi, 22)] = -(FMA(KP803207531, T6M, T6L));
+						  Cr[WS(csr, 6)] = FMA(KP803207531, T3u, T3f);
+						  Cr[WS(csr, 25)] = FNMS(KP803207531, T3u, T3f);
+						  T3K = T3G - T3J;
+						  T6G = T3G + T3J;
+					     }
+					     T6F = FNMS(KP956940335, T6C, T6B);
+					     T6D = FMA(KP956940335, T6C, T6B);
+					     T6E = T3N + T3M;
+					     T3O = T3M - T3N;
+					     Cr[WS(csr, 1)] = FMA(KP989176509, T3K, T3D);
+					     Cr[WS(csr, 30)] = FNMS(KP989176509, T3K, T3D);
+					     Ci[WS(csi, 1)] = FMA(KP989176509, T6E, T6D);
+					     Ci[WS(csi, 30)] = FMS(KP989176509, T6E, T6D);
+					}
+				   }
+			      }
+			      {
+				   E T41, T4Z, T4L, T6r, T6l, T6m, T4g, T4S, T5c, T59, T4W, T4A, T6s, T52, T5d;
+				   E T56;
+				   {
+					E T51, T50, T57, T58, T48, T4f;
+					Cr[WS(csr, 14)] = FMA(KP989176509, T3O, T3L);
+					Cr[WS(csr, 17)] = FNMS(KP989176509, T3O, T3L);
+					Ci[WS(csi, 17)] = FNMS(KP989176509, T6G, T6F);
+					Ci[WS(csi, 14)] = -(FMA(KP989176509, T6G, T6F));
+					T41 = FNMS(KP980785280, T40, T3T);
+					T4Z = FMA(KP980785280, T40, T3T);
+					T51 = FMA(KP820678790, T44, T47);
+					T48 = FNMS(KP820678790, T47, T44);
+					T4f = FNMS(KP820678790, T4e, T4b);
+					T50 = FMA(KP820678790, T4b, T4e);
+					T4L = FNMS(KP980785280, T4K, T4D);
+					T57 = FMA(KP980785280, T4K, T4D);
+					T6r = FMA(KP980785280, T6k, T6j);
+					T6l = FNMS(KP980785280, T6k, T6j);
+					T6m = T4f + T48;
+					T4g = T48 - T4f;
+					T58 = FMA(KP980785280, T4R, T4O);
+					T4S = FNMS(KP980785280, T4R, T4O);
+					{
+					     E T54, T55, T4s, T4z;
+					     T54 = FMA(KP980785280, T4r, T4k);
+					     T4s = FNMS(KP980785280, T4r, T4k);
+					     T4z = FNMS(KP980785280, T4y, T4v);
+					     T55 = FMA(KP980785280, T4y, T4v);
+					     T5c = FMA(KP357805721, T57, T58);
+					     T59 = FNMS(KP357805721, T58, T57);
+					     T4W = FMA(KP472964775, T4s, T4z);
+					     T4A = FNMS(KP472964775, T4z, T4s);
+					     T6s = T50 + T51;
+					     T52 = T50 - T51;
+					     T5d = FNMS(KP357805721, T54, T55);
+					     T56 = FMA(KP357805721, T55, T54);
+					}
+				   }
+				   {
+					E T4V, T4h, T6t, T6v, T4X, T4T;
+					T4V = FNMS(KP773010453, T4g, T41);
+					T4h = FMA(KP773010453, T4g, T41);
+					T6t = FMA(KP773010453, T6s, T6r);
+					T6v = FNMS(KP773010453, T6s, T6r);
+					T4X = FNMS(KP472964775, T4L, T4S);
+					T4T = FMA(KP472964775, T4S, T4L);
+					{
+					     E T53, T5a, T6n, T6o;
+					     T5b = FNMS(KP773010453, T52, T4Z);
+					     T53 = FMA(KP773010453, T52, T4Z);
+					     {
+						  E T4Y, T6u, T6w, T4U;
+						  T4Y = T4W - T4X;
+						  T6u = T4W + T4X;
+						  T6w = T4T - T4A;
+						  T4U = T4A + T4T;
+						  Cr[WS(csr, 11)] = FMA(KP903989293, T4Y, T4V);
+						  Cr[WS(csr, 20)] = FNMS(KP903989293, T4Y, T4V);
+						  Ci[WS(csi, 27)] = FNMS(KP903989293, T6u, T6t);
+						  Ci[WS(csi, 4)] = -(FMA(KP903989293, T6u, T6t));
+						  Ci[WS(csi, 11)] = FMA(KP903989293, T6w, T6v);
+						  Ci[WS(csi, 20)] = FMS(KP903989293, T6w, T6v);
+						  Cr[WS(csr, 4)] = FMA(KP903989293, T4U, T4h);
+						  Cr[WS(csr, 27)] = FNMS(KP903989293, T4U, T4h);
+						  T5a = T56 + T59;
+						  T6q = T59 - T56;
+					     }
+					     T6p = FNMS(KP773010453, T6m, T6l);
+					     T6n = FMA(KP773010453, T6m, T6l);
+					     T6o = T5d + T5c;
+					     T5e = T5c - T5d;
+					     Cr[WS(csr, 3)] = FMA(KP941544065, T5a, T53);
+					     Cr[WS(csr, 28)] = FNMS(KP941544065, T5a, T53);
+					     Ci[WS(csi, 3)] = FMA(KP941544065, T6o, T6n);
+					     Ci[WS(csi, 28)] = FMS(KP941544065, T6o, T6n);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Cr[WS(csr, 12)] = FMA(KP941544065, T5e, T5b);
+	       Cr[WS(csr, 19)] = FNMS(KP941544065, T5e, T5b);
+	       Ci[WS(csi, 19)] = FMA(KP941544065, T6q, T6p);
+	       Ci[WS(csi, 12)] = FMS(KP941544065, T6q, T6p);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cfII_64", {114, 0, 320, 0}, &GENUS };
+
+void X(codelet_r2cfII_64) (planner *p) {
+     X(kr2c_register) (p, r2cfII_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 64 -name r2cfII_64 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 434 FP additions, 206 FP multiplications,
+ * (or, 342 additions, 114 multiplications, 92 fused multiply/add),
+ * 118 stack variables, 31 constants, and 128 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP242980179, +0.242980179903263889948274162077471118320990783);
+     DK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DK(KP514102744, +0.514102744193221726593693838968815772608049120);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP427555093, +0.427555093430282094320966856888798534304578629);
+     DK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DK(KP336889853, +0.336889853392220050689253212619147570477766780);
+     DK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP595699304, +0.595699304492433343467036528829969889511926338);
+     DK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DK(KP146730474, +0.146730474455361751658850129646717819706215317);
+     DK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP049067674, +0.049067674327418014254954976942682658314745363);
+     DK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DK(KP671558954, +0.671558954847018400625376850427421803228750632);
+     DK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E Tm, T34, T3Z, T5g, Tv, T35, T3W, T5h, Td, T33, T6B, T6Q, T3T, T5f, T68;
+	       E T6m, T2b, T3n, T4O, T5D, T2F, T3r, T4K, T5z, TK, T3c, T47, T5n, TR, T3b;
+	       E T44, T5o, T15, T38, T4e, T5l, T1c, T39, T4b, T5k, T1s, T3g, T4v, T5w, T1W;
+	       E T3k, T4k, T5s, T2u, T3q, T4R, T5A, T2y, T3o, T4H, T5C, T1L, T3j, T4y, T5t;
+	       E T1P, T3h, T4r, T5v;
+	       {
+		    E Te, Tk, Th, Tj, Tf, Tg;
+		    Te = R0[WS(rs, 2)];
+		    Tk = R0[WS(rs, 18)];
+		    Tf = R0[WS(rs, 10)];
+		    Tg = R0[WS(rs, 26)];
+		    Th = KP707106781 * (Tf - Tg);
+		    Tj = KP707106781 * (Tf + Tg);
+		    {
+			 E Ti, Tl, T3X, T3Y;
+			 Ti = Te + Th;
+			 Tl = Tj + Tk;
+			 Tm = FNMS(KP195090322, Tl, KP980785280 * Ti);
+			 T34 = FMA(KP195090322, Ti, KP980785280 * Tl);
+			 T3X = Tk - Tj;
+			 T3Y = Te - Th;
+			 T3Z = FNMS(KP555570233, T3Y, KP831469612 * T3X);
+			 T5g = FMA(KP831469612, T3Y, KP555570233 * T3X);
+		    }
+	       }
+	       {
+		    E Tq, Tt, Tp, Ts, Tn, To;
+		    Tq = R0[WS(rs, 30)];
+		    Tt = R0[WS(rs, 14)];
+		    Tn = R0[WS(rs, 6)];
+		    To = R0[WS(rs, 22)];
+		    Tp = KP707106781 * (Tn - To);
+		    Ts = KP707106781 * (Tn + To);
+		    {
+			 E Tr, Tu, T3U, T3V;
+			 Tr = Tp - Tq;
+			 Tu = Ts + Tt;
+			 Tv = FMA(KP980785280, Tr, KP195090322 * Tu);
+			 T35 = FNMS(KP980785280, Tu, KP195090322 * Tr);
+			 T3U = Tt - Ts;
+			 T3V = Tp + Tq;
+			 T3W = FNMS(KP555570233, T3V, KP831469612 * T3U);
+			 T5h = FMA(KP831469612, T3V, KP555570233 * T3U);
+		    }
+	       }
+	       {
+		    E T1, T66, T4, T65, T8, T3Q, Tb, T3R, T2, T3;
+		    T1 = R0[0];
+		    T66 = R0[WS(rs, 16)];
+		    T2 = R0[WS(rs, 8)];
+		    T3 = R0[WS(rs, 24)];
+		    T4 = KP707106781 * (T2 - T3);
+		    T65 = KP707106781 * (T2 + T3);
+		    {
+			 E T6, T7, T9, Ta;
+			 T6 = R0[WS(rs, 4)];
+			 T7 = R0[WS(rs, 20)];
+			 T8 = FNMS(KP382683432, T7, KP923879532 * T6);
+			 T3Q = FMA(KP382683432, T6, KP923879532 * T7);
+			 T9 = R0[WS(rs, 12)];
+			 Ta = R0[WS(rs, 28)];
+			 Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
+			 T3R = FMA(KP923879532, T9, KP382683432 * Ta);
+		    }
+		    {
+			 E T5, Tc, T6z, T6A;
+			 T5 = T1 + T4;
+			 Tc = T8 + Tb;
+			 Td = T5 + Tc;
+			 T33 = T5 - Tc;
+			 T6z = Tb - T8;
+			 T6A = T66 - T65;
+			 T6B = T6z - T6A;
+			 T6Q = T6z + T6A;
+		    }
+		    {
+			 E T3P, T3S, T64, T67;
+			 T3P = T1 - T4;
+			 T3S = T3Q - T3R;
+			 T3T = T3P - T3S;
+			 T5f = T3P + T3S;
+			 T64 = T3Q + T3R;
+			 T67 = T65 + T66;
+			 T68 = T64 + T67;
+			 T6m = T67 - T64;
+		    }
+	       }
+	       {
+		    E T22, T2D, T21, T2C, T26, T2z, T29, T2A, T1Z, T20;
+		    T22 = R1[WS(rs, 31)];
+		    T2D = R1[WS(rs, 15)];
+		    T1Z = R1[WS(rs, 7)];
+		    T20 = R1[WS(rs, 23)];
+		    T21 = KP707106781 * (T1Z - T20);
+		    T2C = KP707106781 * (T1Z + T20);
+		    {
+			 E T24, T25, T27, T28;
+			 T24 = R1[WS(rs, 3)];
+			 T25 = R1[WS(rs, 19)];
+			 T26 = FNMS(KP382683432, T25, KP923879532 * T24);
+			 T2z = FMA(KP382683432, T24, KP923879532 * T25);
+			 T27 = R1[WS(rs, 11)];
+			 T28 = R1[WS(rs, 27)];
+			 T29 = FNMS(KP923879532, T28, KP382683432 * T27);
+			 T2A = FMA(KP923879532, T27, KP382683432 * T28);
+		    }
+		    {
+			 E T23, T2a, T4M, T4N;
+			 T23 = T21 - T22;
+			 T2a = T26 + T29;
+			 T2b = T23 + T2a;
+			 T3n = T23 - T2a;
+			 T4M = T29 - T26;
+			 T4N = T2D - T2C;
+			 T4O = T4M - T4N;
+			 T5D = T4M + T4N;
+		    }
+		    {
+			 E T2B, T2E, T4I, T4J;
+			 T2B = T2z + T2A;
+			 T2E = T2C + T2D;
+			 T2F = T2B + T2E;
+			 T3r = T2E - T2B;
+			 T4I = T21 + T22;
+			 T4J = T2z - T2A;
+			 T4K = T4I + T4J;
+			 T5z = T4J - T4I;
+		    }
+	       }
+	       {
+		    E Ty, TP, TB, TO, TF, TL, TI, TM, Tz, TA;
+		    Ty = R0[WS(rs, 1)];
+		    TP = R0[WS(rs, 17)];
+		    Tz = R0[WS(rs, 9)];
+		    TA = R0[WS(rs, 25)];
+		    TB = KP707106781 * (Tz - TA);
+		    TO = KP707106781 * (Tz + TA);
+		    {
+			 E TD, TE, TG, TH;
+			 TD = R0[WS(rs, 5)];
+			 TE = R0[WS(rs, 21)];
+			 TF = FNMS(KP382683432, TE, KP923879532 * TD);
+			 TL = FMA(KP382683432, TD, KP923879532 * TE);
+			 TG = R0[WS(rs, 13)];
+			 TH = R0[WS(rs, 29)];
+			 TI = FNMS(KP923879532, TH, KP382683432 * TG);
+			 TM = FMA(KP923879532, TG, KP382683432 * TH);
+		    }
+		    {
+			 E TC, TJ, T45, T46;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T3c = TC - TJ;
+			 T45 = TI - TF;
+			 T46 = TP - TO;
+			 T47 = T45 - T46;
+			 T5n = T45 + T46;
+		    }
+		    {
+			 E TN, TQ, T42, T43;
+			 TN = TL + TM;
+			 TQ = TO + TP;
+			 TR = TN + TQ;
+			 T3b = TQ - TN;
+			 T42 = Ty - TB;
+			 T43 = TL - TM;
+			 T44 = T42 - T43;
+			 T5o = T42 + T43;
+		    }
+	       }
+	       {
+		    E TW, T1a, TV, T19, T10, T16, T13, T17, TT, TU;
+		    TW = R0[WS(rs, 31)];
+		    T1a = R0[WS(rs, 15)];
+		    TT = R0[WS(rs, 7)];
+		    TU = R0[WS(rs, 23)];
+		    TV = KP707106781 * (TT - TU);
+		    T19 = KP707106781 * (TT + TU);
+		    {
+			 E TY, TZ, T11, T12;
+			 TY = R0[WS(rs, 3)];
+			 TZ = R0[WS(rs, 19)];
+			 T10 = FNMS(KP382683432, TZ, KP923879532 * TY);
+			 T16 = FMA(KP382683432, TY, KP923879532 * TZ);
+			 T11 = R0[WS(rs, 11)];
+			 T12 = R0[WS(rs, 27)];
+			 T13 = FNMS(KP923879532, T12, KP382683432 * T11);
+			 T17 = FMA(KP923879532, T11, KP382683432 * T12);
+		    }
+		    {
+			 E TX, T14, T4c, T4d;
+			 TX = TV - TW;
+			 T14 = T10 + T13;
+			 T15 = TX + T14;
+			 T38 = TX - T14;
+			 T4c = T13 - T10;
+			 T4d = T1a - T19;
+			 T4e = T4c - T4d;
+			 T5l = T4c + T4d;
+		    }
+		    {
+			 E T18, T1b, T49, T4a;
+			 T18 = T16 + T17;
+			 T1b = T19 + T1a;
+			 T1c = T18 + T1b;
+			 T39 = T1b - T18;
+			 T49 = TV + TW;
+			 T4a = T16 - T17;
+			 T4b = T49 + T4a;
+			 T5k = T4a - T49;
+		    }
+	       }
+	       {
+		    E T1g, T1U, T1j, T1T, T1n, T1Q, T1q, T1R, T1h, T1i;
+		    T1g = R1[0];
+		    T1U = R1[WS(rs, 16)];
+		    T1h = R1[WS(rs, 8)];
+		    T1i = R1[WS(rs, 24)];
+		    T1j = KP707106781 * (T1h - T1i);
+		    T1T = KP707106781 * (T1h + T1i);
+		    {
+			 E T1l, T1m, T1o, T1p;
+			 T1l = R1[WS(rs, 4)];
+			 T1m = R1[WS(rs, 20)];
+			 T1n = FNMS(KP382683432, T1m, KP923879532 * T1l);
+			 T1Q = FMA(KP382683432, T1l, KP923879532 * T1m);
+			 T1o = R1[WS(rs, 12)];
+			 T1p = R1[WS(rs, 28)];
+			 T1q = FNMS(KP923879532, T1p, KP382683432 * T1o);
+			 T1R = FMA(KP923879532, T1o, KP382683432 * T1p);
+		    }
+		    {
+			 E T1k, T1r, T4t, T4u;
+			 T1k = T1g + T1j;
+			 T1r = T1n + T1q;
+			 T1s = T1k + T1r;
+			 T3g = T1k - T1r;
+			 T4t = T1q - T1n;
+			 T4u = T1U - T1T;
+			 T4v = T4t - T4u;
+			 T5w = T4t + T4u;
+		    }
+		    {
+			 E T1S, T1V, T4i, T4j;
+			 T1S = T1Q + T1R;
+			 T1V = T1T + T1U;
+			 T1W = T1S + T1V;
+			 T3k = T1V - T1S;
+			 T4i = T1g - T1j;
+			 T4j = T1Q - T1R;
+			 T4k = T4i - T4j;
+			 T5s = T4i + T4j;
+		    }
+	       }
+	       {
+		    E T2g, T4F, T2j, T4E, T2p, T4C, T2s, T4B;
+		    {
+			 E T2c, T2i, T2f, T2h, T2d, T2e;
+			 T2c = R1[WS(rs, 1)];
+			 T2i = R1[WS(rs, 17)];
+			 T2d = R1[WS(rs, 9)];
+			 T2e = R1[WS(rs, 25)];
+			 T2f = KP707106781 * (T2d - T2e);
+			 T2h = KP707106781 * (T2d + T2e);
+			 T2g = T2c + T2f;
+			 T4F = T2c - T2f;
+			 T2j = T2h + T2i;
+			 T4E = T2i - T2h;
+		    }
+		    {
+			 E T2o, T2r, T2n, T2q, T2l, T2m;
+			 T2o = R1[WS(rs, 29)];
+			 T2r = R1[WS(rs, 13)];
+			 T2l = R1[WS(rs, 5)];
+			 T2m = R1[WS(rs, 21)];
+			 T2n = KP707106781 * (T2l - T2m);
+			 T2q = KP707106781 * (T2l + T2m);
+			 T2p = T2n - T2o;
+			 T4C = T2n + T2o;
+			 T2s = T2q + T2r;
+			 T4B = T2r - T2q;
+		    }
+		    {
+			 E T2k, T2t, T4P, T4Q;
+			 T2k = FNMS(KP195090322, T2j, KP980785280 * T2g);
+			 T2t = FMA(KP980785280, T2p, KP195090322 * T2s);
+			 T2u = T2k + T2t;
+			 T3q = T2t - T2k;
+			 T4P = FMA(KP831469612, T4F, KP555570233 * T4E);
+			 T4Q = FMA(KP831469612, T4C, KP555570233 * T4B);
+			 T4R = T4P + T4Q;
+			 T5A = T4P - T4Q;
+		    }
+		    {
+			 E T2w, T2x, T4D, T4G;
+			 T2w = FNMS(KP980785280, T2s, KP195090322 * T2p);
+			 T2x = FMA(KP195090322, T2g, KP980785280 * T2j);
+			 T2y = T2w - T2x;
+			 T3o = T2x + T2w;
+			 T4D = FNMS(KP555570233, T4C, KP831469612 * T4B);
+			 T4G = FNMS(KP555570233, T4F, KP831469612 * T4E);
+			 T4H = T4D - T4G;
+			 T5C = T4G + T4D;
+		    }
+	       }
+	       {
+		    E T1x, T4p, T1A, T4o, T1G, T4m, T1J, T4l;
+		    {
+			 E T1t, T1z, T1w, T1y, T1u, T1v;
+			 T1t = R1[WS(rs, 2)];
+			 T1z = R1[WS(rs, 18)];
+			 T1u = R1[WS(rs, 10)];
+			 T1v = R1[WS(rs, 26)];
+			 T1w = KP707106781 * (T1u - T1v);
+			 T1y = KP707106781 * (T1u + T1v);
+			 T1x = T1t + T1w;
+			 T4p = T1t - T1w;
+			 T1A = T1y + T1z;
+			 T4o = T1z - T1y;
+		    }
+		    {
+			 E T1F, T1I, T1E, T1H, T1C, T1D;
+			 T1F = R1[WS(rs, 30)];
+			 T1I = R1[WS(rs, 14)];
+			 T1C = R1[WS(rs, 6)];
+			 T1D = R1[WS(rs, 22)];
+			 T1E = KP707106781 * (T1C - T1D);
+			 T1H = KP707106781 * (T1C + T1D);
+			 T1G = T1E - T1F;
+			 T4m = T1E + T1F;
+			 T1J = T1H + T1I;
+			 T4l = T1I - T1H;
+		    }
+		    {
+			 E T1B, T1K, T4w, T4x;
+			 T1B = FNMS(KP195090322, T1A, KP980785280 * T1x);
+			 T1K = FMA(KP980785280, T1G, KP195090322 * T1J);
+			 T1L = T1B + T1K;
+			 T3j = T1K - T1B;
+			 T4w = FMA(KP831469612, T4p, KP555570233 * T4o);
+			 T4x = FMA(KP831469612, T4m, KP555570233 * T4l);
+			 T4y = T4w + T4x;
+			 T5t = T4w - T4x;
+		    }
+		    {
+			 E T1N, T1O, T4n, T4q;
+			 T1N = FNMS(KP980785280, T1J, KP195090322 * T1G);
+			 T1O = FMA(KP195090322, T1x, KP980785280 * T1A);
+			 T1P = T1N - T1O;
+			 T3h = T1O + T1N;
+			 T4n = FNMS(KP555570233, T4m, KP831469612 * T4l);
+			 T4q = FNMS(KP555570233, T4p, KP831469612 * T4o);
+			 T4r = T4n - T4q;
+			 T5v = T4q + T4n;
+		    }
+	       }
+	       {
+		    E Tx, T2N, T69, T6f, T1e, T6e, T2X, T30, T1Y, T2L, T2Q, T62, T2U, T31, T2H;
+		    E T2K, Tw, T63;
+		    Tw = Tm + Tv;
+		    Tx = Td + Tw;
+		    T2N = Td - Tw;
+		    T63 = T35 - T34;
+		    T69 = T63 - T68;
+		    T6f = T63 + T68;
+		    {
+			 E TS, T1d, T2V, T2W;
+			 TS = FNMS(KP098017140, TR, KP995184726 * TK);
+			 T1d = FMA(KP995184726, T15, KP098017140 * T1c);
+			 T1e = TS + T1d;
+			 T6e = T1d - TS;
+			 T2V = T2b - T2u;
+			 T2W = T2y + T2F;
+			 T2X = FNMS(KP671558954, T2W, KP740951125 * T2V);
+			 T30 = FMA(KP671558954, T2V, KP740951125 * T2W);
+		    }
+		    {
+			 E T1M, T1X, T2O, T2P;
+			 T1M = T1s + T1L;
+			 T1X = T1P - T1W;
+			 T1Y = FMA(KP998795456, T1M, KP049067674 * T1X);
+			 T2L = FNMS(KP049067674, T1M, KP998795456 * T1X);
+			 T2O = FMA(KP098017140, TK, KP995184726 * TR);
+			 T2P = FNMS(KP995184726, T1c, KP098017140 * T15);
+			 T2Q = T2O + T2P;
+			 T62 = T2P - T2O;
+		    }
+		    {
+			 E T2S, T2T, T2v, T2G;
+			 T2S = T1s - T1L;
+			 T2T = T1P + T1W;
+			 T2U = FMA(KP740951125, T2S, KP671558954 * T2T);
+			 T31 = FNMS(KP671558954, T2S, KP740951125 * T2T);
+			 T2v = T2b + T2u;
+			 T2G = T2y - T2F;
+			 T2H = FNMS(KP049067674, T2G, KP998795456 * T2v);
+			 T2K = FMA(KP049067674, T2v, KP998795456 * T2G);
+		    }
+		    {
+			 E T1f, T2I, T6b, T6c;
+			 T1f = Tx + T1e;
+			 T2I = T1Y + T2H;
+			 Cr[WS(csr, 31)] = T1f - T2I;
+			 Cr[0] = T1f + T2I;
+			 T6b = T2L + T2K;
+			 T6c = T62 + T69;
+			 Ci[WS(csi, 31)] = T6b - T6c;
+			 Ci[0] = T6b + T6c;
+		    }
+		    {
+			 E T2J, T2M, T61, T6a;
+			 T2J = Tx - T1e;
+			 T2M = T2K - T2L;
+			 Cr[WS(csr, 16)] = T2J - T2M;
+			 Cr[WS(csr, 15)] = T2J + T2M;
+			 T61 = T2H - T1Y;
+			 T6a = T62 - T69;
+			 Ci[WS(csi, 16)] = T61 - T6a;
+			 Ci[WS(csi, 15)] = T61 + T6a;
+		    }
+		    {
+			 E T2R, T2Y, T6h, T6i;
+			 T2R = T2N + T2Q;
+			 T2Y = T2U + T2X;
+			 Cr[WS(csr, 24)] = T2R - T2Y;
+			 Cr[WS(csr, 7)] = T2R + T2Y;
+			 T6h = T31 + T30;
+			 T6i = T6e + T6f;
+			 Ci[WS(csi, 24)] = T6h - T6i;
+			 Ci[WS(csi, 7)] = T6h + T6i;
+		    }
+		    {
+			 E T2Z, T32, T6d, T6g;
+			 T2Z = T2N - T2Q;
+			 T32 = T30 - T31;
+			 Cr[WS(csr, 23)] = T2Z - T32;
+			 Cr[WS(csr, 8)] = T2Z + T32;
+			 T6d = T2X - T2U;
+			 T6g = T6e - T6f;
+			 Ci[WS(csi, 23)] = T6d - T6g;
+			 Ci[WS(csi, 8)] = T6d + T6g;
+		    }
+	       }
+	       {
+		    E T5j, T5L, T6R, T6X, T5q, T6W, T5V, T5Y, T5y, T5J, T5O, T6O, T5S, T5Z, T5F;
+		    E T5I, T5i, T6P;
+		    T5i = T5g - T5h;
+		    T5j = T5f - T5i;
+		    T5L = T5f + T5i;
+		    T6P = T3Z + T3W;
+		    T6R = T6P - T6Q;
+		    T6X = T6P + T6Q;
+		    {
+			 E T5m, T5p, T5T, T5U;
+			 T5m = FMA(KP290284677, T5k, KP956940335 * T5l);
+			 T5p = FNMS(KP290284677, T5o, KP956940335 * T5n);
+			 T5q = T5m - T5p;
+			 T6W = T5p + T5m;
+			 T5T = T5z + T5A;
+			 T5U = T5C + T5D;
+			 T5V = FNMS(KP146730474, T5U, KP989176509 * T5T);
+			 T5Y = FMA(KP146730474, T5T, KP989176509 * T5U);
+		    }
+		    {
+			 E T5u, T5x, T5M, T5N;
+			 T5u = T5s - T5t;
+			 T5x = T5v - T5w;
+			 T5y = FMA(KP803207531, T5u, KP595699304 * T5x);
+			 T5J = FNMS(KP595699304, T5u, KP803207531 * T5x);
+			 T5M = FMA(KP956940335, T5o, KP290284677 * T5n);
+			 T5N = FNMS(KP290284677, T5l, KP956940335 * T5k);
+			 T5O = T5M + T5N;
+			 T6O = T5N - T5M;
+		    }
+		    {
+			 E T5Q, T5R, T5B, T5E;
+			 T5Q = T5s + T5t;
+			 T5R = T5v + T5w;
+			 T5S = FMA(KP989176509, T5Q, KP146730474 * T5R);
+			 T5Z = FNMS(KP146730474, T5Q, KP989176509 * T5R);
+			 T5B = T5z - T5A;
+			 T5E = T5C - T5D;
+			 T5F = FNMS(KP595699304, T5E, KP803207531 * T5B);
+			 T5I = FMA(KP595699304, T5B, KP803207531 * T5E);
+		    }
+		    {
+			 E T5r, T5G, T6T, T6U;
+			 T5r = T5j + T5q;
+			 T5G = T5y + T5F;
+			 Cr[WS(csr, 25)] = T5r - T5G;
+			 Cr[WS(csr, 6)] = T5r + T5G;
+			 T6T = T5J + T5I;
+			 T6U = T6O + T6R;
+			 Ci[WS(csi, 25)] = T6T - T6U;
+			 Ci[WS(csi, 6)] = T6T + T6U;
+		    }
+		    {
+			 E T5H, T5K, T6N, T6S;
+			 T5H = T5j - T5q;
+			 T5K = T5I - T5J;
+			 Cr[WS(csr, 22)] = T5H - T5K;
+			 Cr[WS(csr, 9)] = T5H + T5K;
+			 T6N = T5F - T5y;
+			 T6S = T6O - T6R;
+			 Ci[WS(csi, 22)] = T6N - T6S;
+			 Ci[WS(csi, 9)] = T6N + T6S;
+		    }
+		    {
+			 E T5P, T5W, T6Z, T70;
+			 T5P = T5L + T5O;
+			 T5W = T5S + T5V;
+			 Cr[WS(csr, 30)] = T5P - T5W;
+			 Cr[WS(csr, 1)] = T5P + T5W;
+			 T6Z = T5Z + T5Y;
+			 T70 = T6W + T6X;
+			 Ci[WS(csi, 30)] = T6Z - T70;
+			 Ci[WS(csi, 1)] = T6Z + T70;
+		    }
+		    {
+			 E T5X, T60, T6V, T6Y;
+			 T5X = T5L - T5O;
+			 T60 = T5Y - T5Z;
+			 Cr[WS(csr, 17)] = T5X - T60;
+			 Cr[WS(csr, 14)] = T5X + T60;
+			 T6V = T5V - T5S;
+			 T6Y = T6W - T6X;
+			 Ci[WS(csi, 17)] = T6V - T6Y;
+			 Ci[WS(csi, 14)] = T6V + T6Y;
+		    }
+	       }
+	       {
+		    E T37, T3z, T6n, T6t, T3e, T6s, T3J, T3M, T3m, T3x, T3C, T6k, T3G, T3N, T3t;
+		    E T3w, T36, T6l;
+		    T36 = T34 + T35;
+		    T37 = T33 - T36;
+		    T3z = T33 + T36;
+		    T6l = Tv - Tm;
+		    T6n = T6l - T6m;
+		    T6t = T6l + T6m;
+		    {
+			 E T3a, T3d, T3H, T3I;
+			 T3a = FMA(KP634393284, T38, KP773010453 * T39);
+			 T3d = FNMS(KP634393284, T3c, KP773010453 * T3b);
+			 T3e = T3a - T3d;
+			 T6s = T3d + T3a;
+			 T3H = T3n + T3o;
+			 T3I = T3q + T3r;
+			 T3J = FNMS(KP336889853, T3I, KP941544065 * T3H);
+			 T3M = FMA(KP336889853, T3H, KP941544065 * T3I);
+		    }
+		    {
+			 E T3i, T3l, T3A, T3B;
+			 T3i = T3g - T3h;
+			 T3l = T3j - T3k;
+			 T3m = FMA(KP903989293, T3i, KP427555093 * T3l);
+			 T3x = FNMS(KP427555093, T3i, KP903989293 * T3l);
+			 T3A = FMA(KP773010453, T3c, KP634393284 * T3b);
+			 T3B = FNMS(KP634393284, T39, KP773010453 * T38);
+			 T3C = T3A + T3B;
+			 T6k = T3B - T3A;
+		    }
+		    {
+			 E T3E, T3F, T3p, T3s;
+			 T3E = T3g + T3h;
+			 T3F = T3j + T3k;
+			 T3G = FMA(KP941544065, T3E, KP336889853 * T3F);
+			 T3N = FNMS(KP336889853, T3E, KP941544065 * T3F);
+			 T3p = T3n - T3o;
+			 T3s = T3q - T3r;
+			 T3t = FNMS(KP427555093, T3s, KP903989293 * T3p);
+			 T3w = FMA(KP427555093, T3p, KP903989293 * T3s);
+		    }
+		    {
+			 E T3f, T3u, T6p, T6q;
+			 T3f = T37 + T3e;
+			 T3u = T3m + T3t;
+			 Cr[WS(csr, 27)] = T3f - T3u;
+			 Cr[WS(csr, 4)] = T3f + T3u;
+			 T6p = T3x + T3w;
+			 T6q = T6k + T6n;
+			 Ci[WS(csi, 27)] = T6p - T6q;
+			 Ci[WS(csi, 4)] = T6p + T6q;
+		    }
+		    {
+			 E T3v, T3y, T6j, T6o;
+			 T3v = T37 - T3e;
+			 T3y = T3w - T3x;
+			 Cr[WS(csr, 20)] = T3v - T3y;
+			 Cr[WS(csr, 11)] = T3v + T3y;
+			 T6j = T3t - T3m;
+			 T6o = T6k - T6n;
+			 Ci[WS(csi, 20)] = T6j - T6o;
+			 Ci[WS(csi, 11)] = T6j + T6o;
+		    }
+		    {
+			 E T3D, T3K, T6v, T6w;
+			 T3D = T3z + T3C;
+			 T3K = T3G + T3J;
+			 Cr[WS(csr, 28)] = T3D - T3K;
+			 Cr[WS(csr, 3)] = T3D + T3K;
+			 T6v = T3N + T3M;
+			 T6w = T6s + T6t;
+			 Ci[WS(csi, 28)] = T6v - T6w;
+			 Ci[WS(csi, 3)] = T6v + T6w;
+		    }
+		    {
+			 E T3L, T3O, T6r, T6u;
+			 T3L = T3z - T3C;
+			 T3O = T3M - T3N;
+			 Cr[WS(csr, 19)] = T3L - T3O;
+			 Cr[WS(csr, 12)] = T3L + T3O;
+			 T6r = T3J - T3G;
+			 T6u = T6s - T6t;
+			 Ci[WS(csi, 19)] = T6r - T6u;
+			 Ci[WS(csi, 12)] = T6r + T6u;
+		    }
+	       }
+	       {
+		    E T41, T4Z, T6D, T6J, T4g, T6I, T59, T5d, T4A, T4X, T52, T6y, T56, T5c, T4T;
+		    E T4W, T40, T6C;
+		    T40 = T3W - T3Z;
+		    T41 = T3T + T40;
+		    T4Z = T3T - T40;
+		    T6C = T5g + T5h;
+		    T6D = T6B - T6C;
+		    T6J = T6C + T6B;
+		    {
+			 E T48, T4f, T57, T58;
+			 T48 = FMA(KP881921264, T44, KP471396736 * T47);
+			 T4f = FMA(KP881921264, T4b, KP471396736 * T4e);
+			 T4g = T48 - T4f;
+			 T6I = T48 + T4f;
+			 T57 = T4K + T4H;
+			 T58 = T4R + T4O;
+			 T59 = FMA(KP514102744, T57, KP857728610 * T58);
+			 T5d = FNMS(KP857728610, T57, KP514102744 * T58);
+		    }
+		    {
+			 E T4s, T4z, T50, T51;
+			 T4s = T4k + T4r;
+			 T4z = T4v - T4y;
+			 T4A = FMA(KP970031253, T4s, KP242980179 * T4z);
+			 T4X = FNMS(KP242980179, T4s, KP970031253 * T4z);
+			 T50 = FNMS(KP471396736, T4b, KP881921264 * T4e);
+			 T51 = FNMS(KP471396736, T44, KP881921264 * T47);
+			 T52 = T50 - T51;
+			 T6y = T51 + T50;
+		    }
+		    {
+			 E T54, T55, T4L, T4S;
+			 T54 = T4k - T4r;
+			 T55 = T4y + T4v;
+			 T56 = FMA(KP514102744, T54, KP857728610 * T55);
+			 T5c = FNMS(KP514102744, T55, KP857728610 * T54);
+			 T4L = T4H - T4K;
+			 T4S = T4O - T4R;
+			 T4T = FNMS(KP242980179, T4S, KP970031253 * T4L);
+			 T4W = FMA(KP242980179, T4L, KP970031253 * T4S);
+		    }
+		    {
+			 E T4h, T4U, T6F, T6G;
+			 T4h = T41 + T4g;
+			 T4U = T4A + T4T;
+			 Cr[WS(csr, 29)] = T4h - T4U;
+			 Cr[WS(csr, 2)] = T4h + T4U;
+			 T6F = T4X + T4W;
+			 T6G = T6y + T6D;
+			 Ci[WS(csi, 29)] = T6F - T6G;
+			 Ci[WS(csi, 2)] = T6F + T6G;
+		    }
+		    {
+			 E T4V, T4Y, T6x, T6E;
+			 T4V = T41 - T4g;
+			 T4Y = T4W - T4X;
+			 Cr[WS(csr, 18)] = T4V - T4Y;
+			 Cr[WS(csr, 13)] = T4V + T4Y;
+			 T6x = T4T - T4A;
+			 T6E = T6y - T6D;
+			 Ci[WS(csi, 18)] = T6x - T6E;
+			 Ci[WS(csi, 13)] = T6x + T6E;
+		    }
+		    {
+			 E T53, T5a, T6L, T6M;
+			 T53 = T4Z - T52;
+			 T5a = T56 - T59;
+			 Cr[WS(csr, 21)] = T53 - T5a;
+			 Cr[WS(csr, 10)] = T53 + T5a;
+			 T6L = T5d - T5c;
+			 T6M = T6J - T6I;
+			 Ci[WS(csi, 21)] = T6L - T6M;
+			 Ci[WS(csi, 10)] = T6L + T6M;
+		    }
+		    {
+			 E T5b, T5e, T6H, T6K;
+			 T5b = T4Z + T52;
+			 T5e = T5c + T5d;
+			 Cr[WS(csr, 26)] = T5b - T5e;
+			 Cr[WS(csr, 5)] = T5b + T5e;
+			 T6H = T56 + T59;
+			 T6K = T6I + T6J;
+			 Ci[WS(csi, 5)] = -(T6H + T6K);
+			 Ci[WS(csi, 26)] = T6K - T6H;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cfII_64", {342, 114, 92, 0}, &GENUS };
+
+void X(codelet_r2cfII_64) (planner *p) {
+     X(kr2c_register) (p, r2cfII_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:13 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 24 FP additions, 18 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 15 fused multiply/add),
+ * 25 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E Td, Tk;
+	       {
+		    E T4, T3, Te, T5, T9, Tf, T6, Tg, Tj;
+		    Td = R0[0];
+		    {
+			 E T1, T2, T7, T8;
+			 T1 = R0[WS(rs, 1)];
+			 T2 = R1[WS(rs, 2)];
+			 T7 = R1[WS(rs, 1)];
+			 T8 = R0[WS(rs, 2)];
+			 T4 = R1[0];
+			 T3 = T1 + T2;
+			 Te = T1 - T2;
+			 T5 = R0[WS(rs, 3)];
+			 T9 = T7 + T8;
+			 Tf = T8 - T7;
+		    }
+		    T6 = T4 + T5;
+		    Tg = T5 - T4;
+		    Tj = FNMS(KP356895867, Tf, Te);
+		    {
+			 E Ta, Th, Tl, Tb, Ti, Tm, Tc;
+			 Tb = FNMS(KP554958132, T3, T9);
+			 Ta = FMA(KP554958132, T9, T6);
+			 Th = FNMS(KP356895867, Tg, Tf);
+			 Tl = FNMS(KP356895867, Te, Tg);
+			 Ci[WS(csi, 1)] = -(KP974927912 * (FNMS(KP801937735, Tb, T6)));
+			 Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, Ta, T3));
+			 Ti = FNMS(KP692021471, Th, Te);
+			 Tm = FNMS(KP692021471, Tl, Tf);
+			 Cr[WS(csr, 3)] = Te + Tg + Tf + Td;
+			 Tc = FMA(KP554958132, T6, T3);
+			 Cr[WS(csr, 1)] = FNMS(KP900968867, Ti, Td);
+			 Cr[WS(csr, 2)] = FNMS(KP900968867, Tm, Td);
+			 Tk = FNMS(KP692021471, Tj, Tg);
+			 Ci[0] = -(KP974927912 * (FMA(KP801937735, Tc, T9)));
+		    }
+	       }
+	       Cr[0] = FNMS(KP900968867, Tk, Td);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cfII_7", {9, 3, 15, 0}, &GENUS };
+
+void X(codelet_r2cfII_7) (planner *p) {
+     X(kr2c_register) (p, r2cfII_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 24 FP additions, 18 FP multiplications,
+ * (or, 12 additions, 6 multiplications, 12 fused multiply/add),
+ * 20 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E T1, Ta, Td, T4, Tb, T7, Tc, T8, T9;
+	       T1 = R0[0];
+	       T8 = R1[0];
+	       T9 = R0[WS(rs, 3)];
+	       Ta = T8 - T9;
+	       Td = T8 + T9;
+	       {
+		    E T2, T3, T5, T6;
+		    T2 = R0[WS(rs, 1)];
+		    T3 = R1[WS(rs, 2)];
+		    T4 = T2 - T3;
+		    Tb = T2 + T3;
+		    T5 = R1[WS(rs, 1)];
+		    T6 = R0[WS(rs, 2)];
+		    T7 = T5 - T6;
+		    Tc = T5 + T6;
+	       }
+	       Ci[0] = -(FMA(KP781831482, Tb, KP974927912 * Tc) + (KP433883739 * Td));
+	       Ci[WS(csi, 1)] = FNMS(KP974927912, Td, KP781831482 * Tc) - (KP433883739 * Tb);
+	       Cr[0] = FMA(KP623489801, T4, T1) + FMA(KP222520933, T7, KP900968867 * Ta);
+	       Ci[WS(csi, 2)] = FNMS(KP781831482, Td, KP974927912 * Tb) - (KP433883739 * Tc);
+	       Cr[WS(csr, 2)] = FMA(KP900968867, T7, T1) + FNMA(KP623489801, Ta, KP222520933 * T4);
+	       Cr[WS(csr, 1)] = FMA(KP222520933, Ta, T1) + FNMA(KP623489801, T7, KP900968867 * T4);
+	       Cr[WS(csr, 3)] = T1 + T4 - (T7 + Ta);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cfII_7", {12, 6, 12, 0}, &GENUS };
+
+void X(codelet_r2cfII_7) (planner *p) {
+     X(kr2c_register) (p, r2cfII_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:14 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 22 FP additions, 16 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 16 fused multiply/add),
+ * 22 stack variables, 3 constants, and 16 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E Te, T8, Td, T5, Tj, Tl, Tf, Tb;
+	       {
+		    E T1, Th, T9, Ti, T4, Ta;
+		    T1 = R0[0];
+		    Th = R0[WS(rs, 2)];
+		    {
+			 E T2, T3, T6, T7;
+			 T2 = R0[WS(rs, 1)];
+			 T3 = R0[WS(rs, 3)];
+			 T6 = R1[0];
+			 T7 = R1[WS(rs, 2)];
+			 T9 = R1[WS(rs, 3)];
+			 Ti = T2 + T3;
+			 T4 = T2 - T3;
+			 Te = FMA(KP414213562, T6, T7);
+			 T8 = FNMS(KP414213562, T7, T6);
+			 Ta = R1[WS(rs, 1)];
+		    }
+		    Td = FNMS(KP707106781, T4, T1);
+		    T5 = FMA(KP707106781, T4, T1);
+		    Tj = FMA(KP707106781, Ti, Th);
+		    Tl = FNMS(KP707106781, Ti, Th);
+		    Tf = FMA(KP414213562, T9, Ta);
+		    Tb = FMS(KP414213562, Ta, T9);
+	       }
+	       {
+		    E Tk, Tg, Tc, Tm;
+		    Tk = Te + Tf;
+		    Tg = Te - Tf;
+		    Tc = T8 + Tb;
+		    Tm = Tb - T8;
+		    Cr[WS(csr, 1)] = FMA(KP923879532, Tg, Td);
+		    Cr[WS(csr, 2)] = FNMS(KP923879532, Tg, Td);
+		    Ci[WS(csi, 3)] = FNMS(KP923879532, Tk, Tj);
+		    Ci[0] = -(FMA(KP923879532, Tk, Tj));
+		    Ci[WS(csi, 1)] = FMA(KP923879532, Tm, Tl);
+		    Ci[WS(csi, 2)] = FMS(KP923879532, Tm, Tl);
+		    Cr[0] = FMA(KP923879532, Tc, T5);
+		    Cr[WS(csr, 3)] = FNMS(KP923879532, Tc, T5);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cfII_8", {6, 0, 16, 0}, &GENUS };
+
+void X(codelet_r2cfII_8) (planner *p) {
+     X(kr2c_register) (p, r2cfII_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 22 FP additions, 10 FP multiplications,
+ * (or, 18 additions, 6 multiplications, 4 fused multiply/add),
+ * 18 stack variables, 3 constants, and 16 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T1, Tj, T4, Ti, T8, Te, Tb, Tf, T2, T3;
+	       T1 = R0[0];
+	       Tj = R0[WS(rs, 2)];
+	       T2 = R0[WS(rs, 1)];
+	       T3 = R0[WS(rs, 3)];
+	       T4 = KP707106781 * (T2 - T3);
+	       Ti = KP707106781 * (T2 + T3);
+	       {
+		    E T6, T7, T9, Ta;
+		    T6 = R1[0];
+		    T7 = R1[WS(rs, 2)];
+		    T8 = FNMS(KP382683432, T7, KP923879532 * T6);
+		    Te = FMA(KP382683432, T6, KP923879532 * T7);
+		    T9 = R1[WS(rs, 1)];
+		    Ta = R1[WS(rs, 3)];
+		    Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
+		    Tf = FMA(KP923879532, T9, KP382683432 * Ta);
+	       }
+	       {
+		    E T5, Tc, Th, Tk;
+		    T5 = T1 + T4;
+		    Tc = T8 + Tb;
+		    Cr[WS(csr, 3)] = T5 - Tc;
+		    Cr[0] = T5 + Tc;
+		    Th = Te + Tf;
+		    Tk = Ti + Tj;
+		    Ci[0] = -(Th + Tk);
+		    Ci[WS(csi, 3)] = Tk - Th;
+	       }
+	       {
+		    E Td, Tg, Tl, Tm;
+		    Td = T1 - T4;
+		    Tg = Te - Tf;
+		    Cr[WS(csr, 2)] = Td - Tg;
+		    Cr[WS(csr, 1)] = Td + Tg;
+		    Tl = Tb - T8;
+		    Tm = Tj - Ti;
+		    Ci[WS(csi, 2)] = Tl - Tm;
+		    Ci[WS(csi, 1)] = Tl + Tm;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cfII_8", {18, 6, 4, 0}, &GENUS };
+
+void X(codelet_r2cfII_8) (planner *p) {
+     X(kr2c_register) (p, r2cfII_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cfII_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:40:14 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 42 FP additions, 34 FP multiplications,
+ * (or, 12 additions, 4 multiplications, 30 fused multiply/add),
+ * 46 stack variables, 17 constants, and 18 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DK(KP826351822, +0.826351822333069651148283373230685203999624323);
+     DK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP420276625, +0.420276625461206169731530603237061658838781920);
+     DK(KP315207469, +0.315207469095904627298647952427796244129086440);
+     DK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E To, T5, Tp, Ta, Ti, Tm, TB, Tq, Tt, Tf, Th;
+	       {
+		    E T1, T6, T4, Tb, Tk, T9, Tc, Td, Tl, Te;
+		    {
+			 E T2, T3, T7, T8;
+			 T1 = R0[0];
+			 T2 = R0[WS(rs, 3)];
+			 T3 = R1[WS(rs, 1)];
+			 T6 = R0[WS(rs, 1)];
+			 T7 = R0[WS(rs, 4)];
+			 T8 = R1[WS(rs, 2)];
+			 T4 = T2 - T3;
+			 To = T2 + T3;
+			 Tb = R0[WS(rs, 2)];
+			 Tk = T7 + T8;
+			 T9 = T7 - T8;
+			 Tc = R1[0];
+			 Td = R1[WS(rs, 3)];
+		    }
+		    T5 = T1 + T4;
+		    Tp = FNMS(KP500000000, T4, T1);
+		    Ta = T6 + T9;
+		    Tl = FNMS(KP500000000, T9, T6);
+		    Te = Tc + Td;
+		    Ti = Tc - Td;
+		    Tm = FMA(KP968908795, Tl, Tk);
+		    TB = FNMS(KP726681596, Tk, Tl);
+		    Tq = FNMS(KP152703644, Tk, Tl);
+		    Tt = FMA(KP203604859, Tl, Tk);
+		    Tf = Tb - Te;
+		    Th = FMA(KP500000000, Te, Tb);
+	       }
+	       {
+		    E Ts, Tr, TA, Tj, Tg;
+		    Ts = FMA(KP315207469, Ti, Th);
+		    Tr = FNMS(KP420276625, Th, Ti);
+		    TA = FMA(KP203604859, Th, Ti);
+		    Tj = FNMS(KP152703644, Ti, Th);
+		    Tg = Ta + Tf;
+		    Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
+		    {
+			 E Tu, Tx, TF, TC;
+			 Tu = FNMS(KP907603734, Tt, Ts);
+			 Tx = FNMS(KP826351822, Tr, Tq);
+			 TF = FMA(KP898197570, TB, TA);
+			 TC = FNMS(KP898197570, TB, TA);
+			 {
+			      E TE, Tn, Tv, Ty;
+			      TE = FNMS(KP673648177, Tm, Tj);
+			      Tn = FMA(KP673648177, Tm, Tj);
+			      Cr[WS(csr, 4)] = T5 + Tg;
+			      Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
+			      Tv = FNMS(KP666666666, Tu, Tr);
+			      Ty = FNMS(KP666666666, Tx, Tt);
+			      Cr[0] = FMA(KP852868531, TF, Tp);
+			      {
+				   E TG, TD, Tw, Tz;
+				   TG = FMA(KP500000000, TF, TE);
+				   Ci[0] = -(KP984807753 * (FMA(KP879385241, To, Tn)));
+				   TD = FNMS(KP666666666, Tn, TC);
+				   Tw = FMA(KP826351822, Tv, Tq);
+				   Tz = FMA(KP907603734, Ty, Ts);
+				   Cr[WS(csr, 3)] = FNMS(KP852868531, TG, Tp);
+				   Ci[WS(csi, 3)] = -(KP866025403 * (FMA(KP852868531, TD, To)));
+				   Cr[WS(csr, 2)] = FNMS(KP852868531, Tw, Tp);
+				   Ci[WS(csi, 2)] = KP866025403 * (FNMS(KP939692620, Tz, To));
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cfII_9", {12, 4, 30, 0}, &GENUS };
+
+void X(codelet_r2cfII_9) (planner *p) {
+     X(kr2c_register) (p, r2cfII_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include r2cfII.h */
+
+/*
+ * This function contains 42 FP additions, 30 FP multiplications,
+ * (or, 25 additions, 13 multiplications, 17 fused multiply/add),
+ * 39 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "r2cfII.h"
+
+static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T1, T4, To, Ta, Tl, Tk, Tf, Ti, Th, T2, T3, T5, Tg;
+	       T1 = R0[0];
+	       T2 = R1[WS(rs, 1)];
+	       T3 = R0[WS(rs, 3)];
+	       T4 = T2 - T3;
+	       To = T2 + T3;
+	       {
+		    E T6, T7, T8, T9;
+		    T6 = R0[WS(rs, 1)];
+		    T7 = R1[WS(rs, 2)];
+		    T8 = R0[WS(rs, 4)];
+		    T9 = T7 - T8;
+		    Ta = T6 - T9;
+		    Tl = T7 + T8;
+		    Tk = FMA(KP500000000, T9, T6);
+	       }
+	       {
+		    E Tb, Tc, Td, Te;
+		    Tb = R0[WS(rs, 2)];
+		    Tc = R1[0];
+		    Td = R1[WS(rs, 3)];
+		    Te = Tc + Td;
+		    Tf = Tb - Te;
+		    Ti = FMA(KP500000000, Te, Tb);
+		    Th = Tc - Td;
+	       }
+	       Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
+	       T5 = T1 - T4;
+	       Tg = Ta + Tf;
+	       Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
+	       Cr[WS(csr, 4)] = T5 + Tg;
+	       {
+		    E Tr, Tt, Tw, Tv, Tu, Tp, Tq, Ts, Tj, Tm, Tn;
+		    Tr = FMA(KP500000000, T4, T1);
+		    Tt = FMA(KP296198132, Th, KP939692620 * Ti);
+		    Tw = FNMS(KP813797681, Th, KP342020143 * Ti);
+		    Tv = FNMS(KP984807753, Tk, KP150383733 * Tl);
+		    Tu = FMA(KP173648177, Tk, KP852868531 * Tl);
+		    Tp = FNMS(KP556670399, Tl, KP766044443 * Tk);
+		    Tq = FMA(KP852868531, Th, KP173648177 * Ti);
+		    Ts = Tp + Tq;
+		    Tj = FNMS(KP984807753, Ti, KP150383733 * Th);
+		    Tm = FMA(KP642787609, Tk, KP663413948 * Tl);
+		    Tn = Tj - Tm;
+		    Ci[0] = FNMS(KP866025403, To, Tn);
+		    Cr[0] = Tr + Ts;
+		    Ci[WS(csi, 3)] = FNMS(KP500000000, Tn, KP866025403 * ((Tp - Tq) - To));
+		    Cr[WS(csr, 3)] = FMA(KP866025403, Tm + Tj, Tr) - (KP500000000 * Ts);
+		    Ci[WS(csi, 2)] = FMA(KP866025403, To - (Tu + Tt), KP500000000 * (Tw - Tv));
+		    Cr[WS(csr, 2)] = FMA(KP500000000, Tt - Tu, Tr) + (KP866025403 * (Tv + Tw));
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cfII_9", {25, 13, 17, 0}, &GENUS };
+
+void X(codelet_r2cfII_9) (planner *p) {
+     X(kr2c_register) (p, r2cfII_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include r2cf.h */
+
+/*
+ * This function contains 34 FP additions, 14 FP multiplications,
+ * (or, 24 additions, 4 multiplications, 10 fused multiply/add),
+ * 29 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E Tt, T3, T7, Tq, T6, Tv, Tp, Tm, Th, T8, T1, T2, T9, Tr;
+	       T1 = R0[0];
+	       T2 = R1[WS(rs, 2)];
+	       {
+		    E Te, Tn, Td, Tf, Tb, Tc;
+		    Tb = R0[WS(rs, 2)];
+		    Tc = R1[WS(rs, 4)];
+		    Te = R0[WS(rs, 3)];
+		    Tt = T1 + T2;
+		    T3 = T1 - T2;
+		    Tn = Tb + Tc;
+		    Td = Tb - Tc;
+		    Tf = R1[0];
+		    {
+			 E T4, T5, To, Tg;
+			 T4 = R0[WS(rs, 1)];
+			 T5 = R1[WS(rs, 3)];
+			 T7 = R0[WS(rs, 4)];
+			 To = Te + Tf;
+			 Tg = Te - Tf;
+			 Tq = T4 + T5;
+			 T6 = T4 - T5;
+			 Tv = Tn + To;
+			 Tp = Tn - To;
+			 Tm = Tg - Td;
+			 Th = Td + Tg;
+			 T8 = R1[WS(rs, 1)];
+		    }
+	       }
+	       T9 = T7 - T8;
+	       Tr = T7 + T8;
+	       {
+		    E Ty, Tk, Tx, Tj, Tu, Ts;
+		    Tu = Tq + Tr;
+		    Ts = Tq - Tr;
+		    {
+			 E Ta, Tl, Tw, Ti;
+			 Ta = T6 + T9;
+			 Tl = T6 - T9;
+			 Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, Tp, Ts));
+			 Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP618033988, Ts, Tp));
+			 Ty = Tu - Tv;
+			 Tw = Tu + Tv;
+			 Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, Tl, Tm));
+			 Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, Tm, Tl)));
+			 Tk = Ta - Th;
+			 Ti = Ta + Th;
+			 Cr[0] = Tt + Tw;
+			 Tx = FNMS(KP250000000, Tw, Tt);
+			 Cr[WS(csr, 5)] = T3 + Ti;
+			 Tj = FNMS(KP250000000, Ti, T3);
+		    }
+		    Cr[WS(csr, 4)] = FMA(KP559016994, Ty, Tx);
+		    Cr[WS(csr, 2)] = FNMS(KP559016994, Ty, Tx);
+		    Cr[WS(csr, 3)] = FNMS(KP559016994, Tk, Tj);
+		    Cr[WS(csr, 1)] = FMA(KP559016994, Tk, Tj);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cf_10", {24, 4, 10, 0}, &GENUS };
+
+void X(codelet_r2cf_10) (planner *p) {
+     X(kr2c_register) (p, r2cf_10, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include r2cf.h */
+
+/*
+ * This function contains 34 FP additions, 12 FP multiplications,
+ * (or, 28 additions, 6 multiplications, 6 fused multiply/add),
+ * 26 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
+	       E Ti, Tt, Ta, Tn, Td, To, Te, Tv, T3, Tq, T6, Tr, T7, Tu, Tg;
+	       E Th;
+	       Tg = R0[0];
+	       Th = R1[WS(rs, 2)];
+	       Ti = Tg - Th;
+	       Tt = Tg + Th;
+	       {
+		    E T8, T9, Tb, Tc;
+		    T8 = R0[WS(rs, 2)];
+		    T9 = R1[WS(rs, 4)];
+		    Ta = T8 - T9;
+		    Tn = T8 + T9;
+		    Tb = R0[WS(rs, 3)];
+		    Tc = R1[0];
+		    Td = Tb - Tc;
+		    To = Tb + Tc;
+	       }
+	       Te = Ta + Td;
+	       Tv = Tn + To;
+	       {
+		    E T1, T2, T4, T5;
+		    T1 = R0[WS(rs, 1)];
+		    T2 = R1[WS(rs, 3)];
+		    T3 = T1 - T2;
+		    Tq = T1 + T2;
+		    T4 = R0[WS(rs, 4)];
+		    T5 = R1[WS(rs, 1)];
+		    T6 = T4 - T5;
+		    Tr = T4 + T5;
+	       }
+	       T7 = T3 + T6;
+	       Tu = Tq + Tr;
+	       {
+		    E Tl, Tm, Tf, Tj, Tk;
+		    Tl = Td - Ta;
+		    Tm = T3 - T6;
+		    Ci[WS(csi, 1)] = FNMS(KP951056516, Tm, KP587785252 * Tl);
+		    Ci[WS(csi, 3)] = FMA(KP587785252, Tm, KP951056516 * Tl);
+		    Tf = KP559016994 * (T7 - Te);
+		    Tj = T7 + Te;
+		    Tk = FNMS(KP250000000, Tj, Ti);
+		    Cr[WS(csr, 1)] = Tf + Tk;
+		    Cr[WS(csr, 5)] = Ti + Tj;
+		    Cr[WS(csr, 3)] = Tk - Tf;
+	       }
+	       {
+		    E Tp, Ts, Ty, Tw, Tx;
+		    Tp = Tn - To;
+		    Ts = Tq - Tr;
+		    Ci[WS(csi, 2)] = FNMS(KP587785252, Ts, KP951056516 * Tp);
+		    Ci[WS(csi, 4)] = FMA(KP951056516, Ts, KP587785252 * Tp);
+		    Ty = KP559016994 * (Tu - Tv);
+		    Tw = Tu + Tv;
+		    Tx = FNMS(KP250000000, Tw, Tt);
+		    Cr[WS(csr, 2)] = Tx - Ty;
+		    Cr[0] = Tt + Tw;
+		    Cr[WS(csr, 4)] = Ty + Tx;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 10, "r2cf_10", {28, 6, 6, 0}, &GENUS };
+
+void X(codelet_r2cf_10) (planner *p) {
+     X(kr2c_register) (p, r2cf_10, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_11.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_11.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include r2cf.h */
+
+/*
+ * This function contains 60 FP additions, 50 FP multiplications,
+ * (or, 15 additions, 5 multiplications, 45 fused multiply/add),
+ * 51 stack variables, 10 constants, and 22 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DK(KP876768831, +0.876768831002589333891339807079336796764054852);
+     DK(KP918985947, +0.918985947228994779780736114132655398124909697);
+     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DK(KP778434453, +0.778434453334651800608337670740821884709317477);
+     DK(KP830830026, +0.830830026003772851058548298459246407048009821);
+     DK(KP715370323, +0.715370323453429719112414662767260662417897278);
+     DK(KP634356270, +0.634356270682424498893150776899916060542806975);
+     DK(KP342584725, +0.342584725681637509502641509861112333758894680);
+     DK(KP521108558, +0.521108558113202722944698153526659300680427422);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
+	       E T1, Tg, TF, TB, TI, TL, Tz, TA;
+	       {
+		    E T4, TC, TE, T7, TD, Ta, TS, TG, TJ, Td, TP, TM, Ty, Tq, Th;
+		    E Tt, Tl;
+		    T1 = R0[0];
+		    {
+			 E Tb, Tc, Tx, Tp;
+			 {
+			      E T2, T3, Te, Tf;
+			      T2 = R1[0];
+			      T3 = R0[WS(rs, 5)];
+			      Te = R1[WS(rs, 2)];
+			      Tf = R0[WS(rs, 3)];
+			      {
+				   E T5, T6, T8, T9;
+				   T5 = R0[WS(rs, 1)];
+				   T4 = T2 + T3;
+				   TC = T3 - T2;
+				   Tg = Te + Tf;
+				   TE = Tf - Te;
+				   T6 = R1[WS(rs, 4)];
+				   T8 = R1[WS(rs, 1)];
+				   T9 = R0[WS(rs, 4)];
+				   Tb = R0[WS(rs, 2)];
+				   T7 = T5 + T6;
+				   TD = T5 - T6;
+				   Ta = T8 + T9;
+				   TF = T9 - T8;
+				   Tc = R1[WS(rs, 3)];
+			      }
+			 }
+			 TS = FMA(KP521108558, TC, TD);
+			 TG = FMA(KP521108558, TF, TE);
+			 TJ = FMA(KP521108558, TE, TC);
+			 Td = Tb + Tc;
+			 TB = Tb - Tc;
+			 Tx = FNMS(KP342584725, Ta, T7);
+			 Tp = FNMS(KP342584725, T4, Ta);
+			 TP = FNMS(KP521108558, TB, TF);
+			 TM = FNMS(KP521108558, TD, TB);
+			 Ty = FNMS(KP634356270, Tx, Td);
+			 Tq = FNMS(KP634356270, Tp, Tg);
+			 Th = FNMS(KP342584725, Tg, Td);
+			 Tt = FNMS(KP342584725, Td, T4);
+			 Tl = FNMS(KP342584725, T7, Tg);
+		    }
+		    {
+			 E Tu, Ts, TN, Tv;
+			 {
+			      E Tm, TU, Tj, Ti, TT;
+			      TT = FMA(KP715370323, TS, TF);
+			      Ti = FNMS(KP634356270, Th, Ta);
+			      Tu = FNMS(KP634356270, Tt, T7);
+			      Tm = FNMS(KP634356270, Tl, T4);
+			      TU = FMA(KP830830026, TT, TB);
+			      Tj = FNMS(KP778434453, Ti, T7);
+			      {
+				   E Tk, TR, To, Tn, TQ, Tr;
+				   TQ = FMA(KP715370323, TP, TC);
+				   Tn = FNMS(KP778434453, Tm, Ta);
+				   Ci[WS(csi, 5)] = KP989821441 * (FMA(KP918985947, TU, TE));
+				   Tk = FNMS(KP876768831, Tj, T4);
+				   TR = FNMS(KP830830026, TQ, TE);
+				   To = FNMS(KP876768831, Tn, Td);
+				   Tr = FNMS(KP778434453, Tq, Td);
+				   Cr[WS(csr, 5)] = FNMS(KP959492973, Tk, T1);
+				   Ci[WS(csi, 4)] = KP989821441 * (FNMS(KP918985947, TR, TD));
+				   Cr[WS(csr, 4)] = FNMS(KP959492973, To, T1);
+				   Ts = FNMS(KP876768831, Tr, T7);
+			      }
+			 }
+			 TN = FNMS(KP715370323, TM, TE);
+			 Tv = FNMS(KP778434453, Tu, Tg);
+			 Cr[0] = T1 + T4 + T7 + Ta + Td + Tg;
+			 Cr[WS(csr, 3)] = FNMS(KP959492973, Ts, T1);
+			 {
+			      E TO, Tw, TH, TK;
+			      TO = FNMS(KP830830026, TN, TF);
+			      Tw = FNMS(KP876768831, Tv, Ta);
+			      TH = FMA(KP715370323, TG, TD);
+			      TK = FNMS(KP715370323, TJ, TB);
+			      Ci[WS(csi, 3)] = KP989821441 * (FNMS(KP918985947, TO, TC));
+			      Cr[WS(csr, 2)] = FNMS(KP959492973, Tw, T1);
+			      TI = FNMS(KP830830026, TH, TC);
+			      TL = FMA(KP830830026, TK, TD);
+			      Tz = FNMS(KP778434453, Ty, T4);
+			 }
+		    }
+	       }
+	       Ci[WS(csi, 2)] = KP989821441 * (FMA(KP918985947, TI, TB));
+	       Ci[WS(csi, 1)] = KP989821441 * (FNMS(KP918985947, TL, TF));
+	       TA = FNMS(KP876768831, Tz, Tg);
+	       Cr[WS(csr, 1)] = FNMS(KP959492973, TA, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 11, "r2cf_11", {15, 5, 45, 0}, &GENUS };
+
+void X(codelet_r2cf_11) (planner *p) {
+     X(kr2c_register) (p, r2cf_11, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include r2cf.h */
+
+/*
+ * This function contains 60 FP additions, 50 FP multiplications,
+ * (or, 20 additions, 10 multiplications, 40 fused multiply/add),
+ * 28 stack variables, 10 constants, and 22 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP654860733, +0.654860733945285064056925072466293553183791199);
+     DK(KP142314838, +0.142314838273285140443792668616369668791051361);
+     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
+     DK(KP415415013, +0.415415013001886425529274149229623203524004910);
+     DK(KP841253532, +0.841253532831181168861811648919367717513292498);
+     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
+     DK(KP909631995, +0.909631995354518371411715383079028460060241051);
+     DK(KP281732556, +0.281732556841429697711417915346616899035777899);
+     DK(KP540640817, +0.540640817455597582107635954318691695431770608);
+     DK(KP755749574, +0.755749574354258283774035843972344420179717445);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
+	       E T1, T4, Tl, Tg, Th, Td, Ti, Ta, Tk, T7, Tj, Tb, Tc;
+	       T1 = R0[0];
+	       {
+		    E T2, T3, Te, Tf;
+		    T2 = R0[WS(rs, 1)];
+		    T3 = R1[WS(rs, 4)];
+		    T4 = T2 + T3;
+		    Tl = T3 - T2;
+		    Te = R1[0];
+		    Tf = R0[WS(rs, 5)];
+		    Tg = Te + Tf;
+		    Th = Tf - Te;
+	       }
+	       Tb = R1[WS(rs, 1)];
+	       Tc = R0[WS(rs, 4)];
+	       Td = Tb + Tc;
+	       Ti = Tc - Tb;
+	       {
+		    E T8, T9, T5, T6;
+		    T8 = R1[WS(rs, 2)];
+		    T9 = R0[WS(rs, 3)];
+		    Ta = T8 + T9;
+		    Tk = T9 - T8;
+		    T5 = R0[WS(rs, 2)];
+		    T6 = R1[WS(rs, 3)];
+		    T7 = T5 + T6;
+		    Tj = T6 - T5;
+	       }
+	       Ci[WS(csi, 4)] = FMA(KP755749574, Th, KP540640817 * Ti) + FNMS(KP909631995, Tk, KP281732556 * Tj) - (KP989821441 * Tl);
+	       Cr[WS(csr, 4)] = FMA(KP841253532, Td, T1) + FNMS(KP959492973, T7, KP415415013 * Ta) + FNMA(KP142314838, T4, KP654860733 * Tg);
+	       Ci[WS(csi, 2)] = FMA(KP909631995, Th, KP755749574 * Tl) + FNMA(KP540640817, Tk, KP989821441 * Tj) - (KP281732556 * Ti);
+	       Ci[WS(csi, 5)] = FMA(KP281732556, Th, KP755749574 * Ti) + FNMS(KP909631995, Tj, KP989821441 * Tk) - (KP540640817 * Tl);
+	       Ci[WS(csi, 1)] = FMA(KP540640817, Th, KP909631995 * Tl) + FMA(KP989821441, Ti, KP755749574 * Tj) + (KP281732556 * Tk);
+	       Ci[WS(csi, 3)] = FMA(KP989821441, Th, KP540640817 * Tj) + FNMS(KP909631995, Ti, KP755749574 * Tk) - (KP281732556 * Tl);
+	       Cr[WS(csr, 3)] = FMA(KP415415013, Td, T1) + FNMS(KP654860733, Ta, KP841253532 * T7) + FNMA(KP959492973, T4, KP142314838 * Tg);
+	       Cr[WS(csr, 1)] = FMA(KP841253532, Tg, T1) + FNMS(KP959492973, Ta, KP415415013 * T4) + FNMA(KP654860733, T7, KP142314838 * Td);
+	       Cr[0] = T1 + Tg + T4 + Td + T7 + Ta;
+	       Cr[WS(csr, 2)] = FMA(KP415415013, Tg, T1) + FNMS(KP142314838, T7, KP841253532 * Ta) + FNMA(KP959492973, Td, KP654860733 * T4);
+	       Cr[WS(csr, 5)] = FMA(KP841253532, T4, T1) + FNMS(KP142314838, Ta, KP415415013 * T7) + FNMA(KP654860733, Td, KP959492973 * Tg);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 11, "r2cf_11", {20, 10, 40, 0}, &GENUS };
+
+void X(codelet_r2cf_11) (planner *p) {
+     X(kr2c_register) (p, r2cf_11, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include r2cf.h */
+
+/*
+ * This function contains 38 FP additions, 10 FP multiplications,
+ * (or, 30 additions, 2 multiplications, 8 fused multiply/add),
+ * 31 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E Tm, T6, Ty, Tp, T5, Tk, Tt, Tb, Tc, Td, T9, Tn;
+	       {
+		    E T1, Tg, Th, Ti, T4, T2, T3, T7, T8, Tj;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 2)];
+		    T3 = R0[WS(rs, 4)];
+		    Tg = R1[WS(rs, 1)];
+		    Th = R1[WS(rs, 3)];
+		    Ti = R1[WS(rs, 5)];
+		    T4 = T2 + T3;
+		    Tm = T3 - T2;
+		    T6 = R0[WS(rs, 3)];
+		    Ty = Ti - Th;
+		    Tj = Th + Ti;
+		    Tp = FNMS(KP500000000, T4, T1);
+		    T5 = T1 + T4;
+		    T7 = R0[WS(rs, 5)];
+		    Tk = FNMS(KP500000000, Tj, Tg);
+		    Tt = Tg + Tj;
+		    T8 = R0[WS(rs, 1)];
+		    Tb = R1[WS(rs, 4)];
+		    Tc = R1[0];
+		    Td = R1[WS(rs, 2)];
+		    T9 = T7 + T8;
+		    Tn = T8 - T7;
+	       }
+	       {
+		    E Te, Tz, To, TC;
+		    Te = Tc + Td;
+		    Tz = Td - Tc;
+		    To = Tm - Tn;
+		    TC = Tm + Tn;
+		    {
+			 E Ta, Tq, TA, TB;
+			 Ta = T6 + T9;
+			 Tq = FNMS(KP500000000, T9, T6);
+			 TA = Ty - Tz;
+			 TB = Ty + Tz;
+			 {
+			      E Tf, Tu, Tx, Tr;
+			      Tf = FNMS(KP500000000, Te, Tb);
+			      Tu = Tb + Te;
+			      Tx = Tp - Tq;
+			      Tr = Tp + Tq;
+			      {
+				   E Tv, Tw, Tl, Ts;
+				   Tv = T5 + Ta;
+				   Cr[WS(csr, 3)] = T5 - Ta;
+				   Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
+				   Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
+				   Tw = Tt + Tu;
+				   Ci[WS(csi, 3)] = Tt - Tu;
+				   Tl = Tf - Tk;
+				   Ts = Tk + Tf;
+				   Cr[WS(csr, 1)] = FMA(KP866025403, TA, Tx);
+				   Cr[WS(csr, 5)] = FNMS(KP866025403, TA, Tx);
+				   Cr[0] = Tv + Tw;
+				   Cr[WS(csr, 6)] = Tv - Tw;
+				   Cr[WS(csr, 4)] = Tr + Ts;
+				   Cr[WS(csr, 2)] = Tr - Ts;
+				   Ci[WS(csi, 5)] = FNMS(KP866025403, To, Tl);
+				   Ci[WS(csi, 1)] = FMA(KP866025403, To, Tl);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cf_12", {30, 2, 8, 0}, &GENUS };
+
+void X(codelet_r2cf_12) (planner *p) {
+     X(kr2c_register) (p, r2cf_12, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include r2cf.h */
+
+/*
+ * This function contains 38 FP additions, 8 FP multiplications,
+ * (or, 34 additions, 4 multiplications, 4 fused multiply/add),
+ * 21 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
+	       E T5, Tp, Tb, Tn, Ty, Tt, Ta, Tq, Tc, Ti, Tz, Tu, Td, To;
+	       {
+		    E T1, T2, T3, T4;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 2)];
+		    T3 = R0[WS(rs, 4)];
+		    T4 = T2 + T3;
+		    T5 = T1 + T4;
+		    Tp = FNMS(KP500000000, T4, T1);
+		    Tb = T3 - T2;
+	       }
+	       {
+		    E Tj, Tk, Tl, Tm;
+		    Tj = R1[WS(rs, 1)];
+		    Tk = R1[WS(rs, 3)];
+		    Tl = R1[WS(rs, 5)];
+		    Tm = Tk + Tl;
+		    Tn = FNMS(KP500000000, Tm, Tj);
+		    Ty = Tl - Tk;
+		    Tt = Tj + Tm;
+	       }
+	       {
+		    E T6, T7, T8, T9;
+		    T6 = R0[WS(rs, 3)];
+		    T7 = R0[WS(rs, 5)];
+		    T8 = R0[WS(rs, 1)];
+		    T9 = T7 + T8;
+		    Ta = T6 + T9;
+		    Tq = FNMS(KP500000000, T9, T6);
+		    Tc = T8 - T7;
+	       }
+	       {
+		    E Te, Tf, Tg, Th;
+		    Te = R1[WS(rs, 4)];
+		    Tf = R1[0];
+		    Tg = R1[WS(rs, 2)];
+		    Th = Tf + Tg;
+		    Ti = FNMS(KP500000000, Th, Te);
+		    Tz = Tg - Tf;
+		    Tu = Te + Th;
+	       }
+	       Cr[WS(csr, 3)] = T5 - Ta;
+	       Ci[WS(csi, 3)] = Tt - Tu;
+	       Td = KP866025403 * (Tb - Tc);
+	       To = Ti - Tn;
+	       Ci[WS(csi, 1)] = Td + To;
+	       Ci[WS(csi, 5)] = To - Td;
+	       {
+		    E Tx, TA, Tv, Tw;
+		    Tx = Tp - Tq;
+		    TA = KP866025403 * (Ty - Tz);
+		    Cr[WS(csr, 5)] = Tx - TA;
+		    Cr[WS(csr, 1)] = Tx + TA;
+		    Tv = T5 + Ta;
+		    Tw = Tt + Tu;
+		    Cr[WS(csr, 6)] = Tv - Tw;
+		    Cr[0] = Tv + Tw;
+	       }
+	       {
+		    E Tr, Ts, TB, TC;
+		    Tr = Tp + Tq;
+		    Ts = Tn + Ti;
+		    Cr[WS(csr, 2)] = Tr - Ts;
+		    Cr[WS(csr, 4)] = Tr + Ts;
+		    TB = Ty + Tz;
+		    TC = Tb + Tc;
+		    Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
+		    Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 12, "r2cf_12", {34, 4, 4, 0}, &GENUS };
+
+void X(codelet_r2cf_12) (planner *p) {
+     X(kr2c_register) (p, r2cf_12, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_128.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_128.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3180 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:47 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 128 -name r2cf_128 -include r2cf.h */
+
+/*
+ * This function contains 956 FP additions, 516 FP multiplications,
+ * (or, 440 additions, 0 multiplications, 516 fused multiply/add),
+ * 229 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_128(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DK(KP148335987, +0.148335987538347428753676511486911367000625355);
+     DK(KP741650546, +0.741650546272035369581266691172079863842265220);
+     DK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DK(KP049126849, +0.049126849769467254105343321271313617079695752);
+     DK(KP906347169, +0.906347169019147157946142717268914412664134293);
+     DK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DK(KP599376933, +0.599376933681923766271389869014404232837890546);
+     DK(KP250486960, +0.250486960191305461595702160124721208578685568);
+     DK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DK(KP472964775, +0.472964775891319928124438237972992463904131113);
+     DK(KP357805721, +0.357805721314524104672487743774474392487532769);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(512, rs), MAKE_VOLATILE_STRIDE(512, csr), MAKE_VOLATILE_STRIDE(512, csi)) {
+	       E T95, T96;
+	       {
+		    E TcD, TdR, T5P, T8v, T27, T7r, Tf, Ta5, T7s, T5S, T8w, T2e, TdS, TcG, Tbn;
+		    E Tu, TcK, TdU, TK, Ta6, T7w, T8y, T2o, T5U, TcN, TdV, TZ, Ta7, T7z, T8z;
+		    E T2x, T5V, T1g, Taa, Tab, T1v, Tew, TcX, Tex, TcU, T6A, T2M, T9b, T7E, T9a;
+		    E T7H, T6z, T2T, TeO, TdK, TeL, Tdz, T9p, T8d, T6O, T5G, T6L, T4X, Tc3, TaV;
+		    E Tc4, Tbi, T9s, T8o, TeH, Tdp, TeE, Tde, T9i, T7U, T6H, T4r, T6E, T3I, TbW;
+		    E Tao, TbX, TaL, T9l, T85, T1L, Tad, Tae, T20, Tez, Td6, TeA, Td3, T6x, T37;
+		    E T9e, T7L, T9d, T7O, T6w, T3e, TbZ, T3Z, T4s, Tc0, TeF, Tds, T4t, T4g, T87;
+		    E T80, TeI, Tdl, T86, T7X, TaM, TaD, Tb2, Tc6, T8e, T8f, T5e, T5H, Tb9, Tc7;
+		    E TeM, TdN, T5I, T5v, T8q, T8j, TeP, TdG;
+		    {
+			 E T7G, T2S, T2P, T7F;
+			 {
+			      E T28, Ti, Tn, T2c, Ts, T29, Tl, To;
+			      {
+				   E T4, T23, T3, T25, Td, T5, T8, T9;
+				   {
+					E T1, T2, Tb, Tc;
+					T1 = R0[0];
+					T2 = R0[WS(rs, 32)];
+					Tb = R0[WS(rs, 56)];
+					Tc = R0[WS(rs, 24)];
+					T4 = R0[WS(rs, 16)];
+					T23 = T1 - T2;
+					T3 = T1 + T2;
+					T25 = Tb - Tc;
+					Td = Tb + Tc;
+					T5 = R0[WS(rs, 48)];
+					T8 = R0[WS(rs, 8)];
+					T9 = R0[WS(rs, 40)];
+				   }
+				   {
+					E Tq, Tr, Tj, Tk;
+					{
+					     E Tg, T5N, T6, T24, Ta, Th;
+					     Tg = R0[WS(rs, 4)];
+					     T5N = T4 - T5;
+					     T6 = T4 + T5;
+					     T24 = T8 - T9;
+					     Ta = T8 + T9;
+					     Th = R0[WS(rs, 36)];
+					     {
+						  E T7, T26, T5O, Te;
+						  TcD = T3 - T6;
+						  T7 = T3 + T6;
+						  T26 = T24 + T25;
+						  T5O = T25 - T24;
+						  TdR = Td - Ta;
+						  Te = Ta + Td;
+						  T5P = FNMS(KP707106781, T5O, T5N);
+						  T8v = FMA(KP707106781, T5O, T5N);
+						  T27 = FMA(KP707106781, T26, T23);
+						  T7r = FNMS(KP707106781, T26, T23);
+						  Tf = T7 + Te;
+						  Ta5 = T7 - Te;
+						  T28 = Tg - Th;
+						  Ti = Tg + Th;
+					     }
+					}
+					Tq = R0[WS(rs, 12)];
+					Tr = R0[WS(rs, 44)];
+					Tj = R0[WS(rs, 20)];
+					Tk = R0[WS(rs, 52)];
+					Tn = R0[WS(rs, 60)];
+					T2c = Tq - Tr;
+					Ts = Tq + Tr;
+					T29 = Tj - Tk;
+					Tl = Tj + Tk;
+					To = R0[WS(rs, 28)];
+				   }
+			      }
+			      {
+				   E T2g, T2l, T2h, TF, TcI, TC, T2i, TI;
+				   {
+					E Ty, TG, TB, TH;
+					{
+					     E Tw, T5Q, T2a, TcE, Tm, T2b, Tp, Tx;
+					     Tw = R0[WS(rs, 2)];
+					     T5Q = FMA(KP414213562, T28, T29);
+					     T2a = FNMS(KP414213562, T29, T28);
+					     TcE = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T2b = Tn - To;
+					     Tp = Tn + To;
+					     Tx = R0[WS(rs, 34)];
+					     {
+						  E Tz, TA, TD, TE;
+						  Tz = R0[WS(rs, 18)];
+						  {
+						       E T5R, T2d, TcF, Tt;
+						       T5R = FNMS(KP414213562, T2b, T2c);
+						       T2d = FMA(KP414213562, T2c, T2b);
+						       TcF = Tp - Ts;
+						       Tt = Tp + Ts;
+						       T2g = Tw - Tx;
+						       Ty = Tw + Tx;
+						       T7s = T5Q - T5R;
+						       T5S = T5Q + T5R;
+						       T8w = T2d - T2a;
+						       T2e = T2a + T2d;
+						       TdS = TcF - TcE;
+						       TcG = TcE + TcF;
+						       Tbn = Tt - Tm;
+						       Tu = Tm + Tt;
+						       TA = R0[WS(rs, 50)];
+						  }
+						  TD = R0[WS(rs, 10)];
+						  TE = R0[WS(rs, 42)];
+						  TG = R0[WS(rs, 58)];
+						  T2l = Tz - TA;
+						  TB = Tz + TA;
+						  T2h = TD - TE;
+						  TF = TD + TE;
+						  TH = R0[WS(rs, 26)];
+					     }
+					}
+					TcI = Ty - TB;
+					TC = Ty + TB;
+					T2i = TG - TH;
+					TI = TG + TH;
+				   }
+				   {
+					E T2p, T2u, T2q, TU, TcL, TR, T2r, TX;
+					{
+					     E TN, TV, TQ, TW;
+					     {
+						  E T2k, T7u, T2n, T7v, TL, TM;
+						  TL = R0[WS(rs, 62)];
+						  TM = R0[WS(rs, 30)];
+						  {
+						       E TJ, TcJ, T2m, T2j;
+						       TJ = TF + TI;
+						       TcJ = TI - TF;
+						       T2m = T2h - T2i;
+						       T2j = T2h + T2i;
+						       TcK = FMA(KP414213562, TcJ, TcI);
+						       TdU = FNMS(KP414213562, TcI, TcJ);
+						       TK = TC + TJ;
+						       Ta6 = TC - TJ;
+						       T2k = FMA(KP707106781, T2j, T2g);
+						       T7u = FNMS(KP707106781, T2j, T2g);
+						       T2n = FMA(KP707106781, T2m, T2l);
+						       T7v = FNMS(KP707106781, T2m, T2l);
+						       T2p = TL - TM;
+						       TN = TL + TM;
+						  }
+						  T7w = FMA(KP668178637, T7v, T7u);
+						  T8y = FNMS(KP668178637, T7u, T7v);
+						  T2o = FNMS(KP198912367, T2n, T2k);
+						  T5U = FMA(KP198912367, T2k, T2n);
+						  {
+						       E TO, TP, TS, TT;
+						       TO = R0[WS(rs, 14)];
+						       TP = R0[WS(rs, 46)];
+						       TS = R0[WS(rs, 6)];
+						       TT = R0[WS(rs, 38)];
+						       TV = R0[WS(rs, 54)];
+						       T2u = TO - TP;
+						       TQ = TO + TP;
+						       T2q = TS - TT;
+						       TU = TS + TT;
+						       TW = R0[WS(rs, 22)];
+						  }
+					     }
+					     TcL = TN - TQ;
+					     TR = TN + TQ;
+					     T2r = TV - TW;
+					     TX = TV + TW;
+					}
+					{
+					     E T2A, T14, T2N, T17, T1b, T1e, T2D, T2O, T1r, T2I, T1q, T2Q, T2H, TcR, T1n;
+					     E T1s, T15, T16;
+					     {
+						  E T2t, T7x, T2w, T7y, T12, T13;
+						  T12 = R0[WS(rs, 1)];
+						  T13 = R0[WS(rs, 33)];
+						  {
+						       E TY, TcM, T2v, T2s;
+						       TY = TU + TX;
+						       TcM = TX - TU;
+						       T2v = T2q - T2r;
+						       T2s = T2q + T2r;
+						       TcN = FNMS(KP414213562, TcM, TcL);
+						       TdV = FMA(KP414213562, TcL, TcM);
+						       TZ = TR + TY;
+						       Ta7 = TR - TY;
+						       T2t = FMA(KP707106781, T2s, T2p);
+						       T7x = FNMS(KP707106781, T2s, T2p);
+						       T2w = FMA(KP707106781, T2v, T2u);
+						       T7y = FNMS(KP707106781, T2v, T2u);
+						       T2A = T12 - T13;
+						       T14 = T12 + T13;
+						  }
+						  T7z = FNMS(KP668178637, T7y, T7x);
+						  T8z = FMA(KP668178637, T7x, T7y);
+						  T2x = FMA(KP198912367, T2w, T2t);
+						  T5V = FNMS(KP198912367, T2t, T2w);
+						  T15 = R0[WS(rs, 17)];
+						  T16 = R0[WS(rs, 49)];
+					     }
+					     {
+						  E T1c, T2B, T1d, T19, T1a;
+						  T19 = R0[WS(rs, 9)];
+						  T1a = R0[WS(rs, 41)];
+						  T1c = R0[WS(rs, 57)];
+						  T2N = T15 - T16;
+						  T17 = T15 + T16;
+						  T2B = T19 - T1a;
+						  T1b = T19 + T1a;
+						  T1d = R0[WS(rs, 25)];
+						  {
+						       E T1k, T2F, T1j, T1l, T1h, T1i, T2C;
+						       T1h = R0[WS(rs, 5)];
+						       T1i = R0[WS(rs, 37)];
+						       T2C = T1c - T1d;
+						       T1e = T1c + T1d;
+						       T1k = R0[WS(rs, 21)];
+						       T2F = T1h - T1i;
+						       T1j = T1h + T1i;
+						       T2D = T2B + T2C;
+						       T2O = T2B - T2C;
+						       T1l = R0[WS(rs, 53)];
+						       {
+							    E T1o, T1p, T2G, T1m;
+							    T1o = R0[WS(rs, 61)];
+							    T1p = R0[WS(rs, 29)];
+							    T1r = R0[WS(rs, 13)];
+							    T2G = T1k - T1l;
+							    T1m = T1k + T1l;
+							    T2I = T1o - T1p;
+							    T1q = T1o + T1p;
+							    T2Q = FMA(KP414213562, T2F, T2G);
+							    T2H = FNMS(KP414213562, T2G, T2F);
+							    TcR = T1j - T1m;
+							    T1n = T1j + T1m;
+							    T1s = R0[WS(rs, 45)];
+						       }
+						  }
+					     }
+					     {
+						  E TcQ, TcV, T2K, T2R, T1u, TcT, TcW, TcS;
+						  {
+						       E T18, T1f, T1t, T2J;
+						       T18 = T14 + T17;
+						       TcQ = T14 - T17;
+						       TcV = T1e - T1b;
+						       T1f = T1b + T1e;
+						       T1t = T1r + T1s;
+						       T2J = T1r - T1s;
+						       T1g = T18 + T1f;
+						       Taa = T18 - T1f;
+						       T2K = FMA(KP414213562, T2J, T2I);
+						       T2R = FNMS(KP414213562, T2I, T2J);
+						       T1u = T1q + T1t;
+						       TcS = T1q - T1t;
+						  }
+						  TcT = TcR + TcS;
+						  TcW = TcS - TcR;
+						  {
+						       E T7C, T2E, T2L, T7D;
+						       T7C = FNMS(KP707106781, T2D, T2A);
+						       T2E = FMA(KP707106781, T2D, T2A);
+						       Tab = T1u - T1n;
+						       T1v = T1n + T1u;
+						       Tew = FNMS(KP707106781, TcW, TcV);
+						       TcX = FMA(KP707106781, TcW, TcV);
+						       Tex = FNMS(KP707106781, TcT, TcQ);
+						       TcU = FMA(KP707106781, TcT, TcQ);
+						       T2L = T2H + T2K;
+						       T7G = T2K - T2H;
+						       T7D = T2Q - T2R;
+						       T2S = T2Q + T2R;
+						       T2P = FMA(KP707106781, T2O, T2N);
+						       T7F = FNMS(KP707106781, T2O, T2N);
+						       T6A = FNMS(KP923879532, T2L, T2E);
+						       T2M = FMA(KP923879532, T2L, T2E);
+						       T9b = FNMS(KP923879532, T7D, T7C);
+						       T7E = FMA(KP923879532, T7D, T7C);
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T83, T84, T8m, T8n;
+			      {
+				   E TaP, T4z, TaQ, T5A, TaS, TaT, T4G, T5B, T4O, T5D, Tbh, Tdw, T4R, Tbc, T4S;
+				   E T4T;
+				   {
+					E T4x, T4y, T5y, T5z;
+					T4x = R1[WS(rs, 63)];
+					T9a = FNMS(KP923879532, T7G, T7F);
+					T7H = FMA(KP923879532, T7G, T7F);
+					T6z = FNMS(KP923879532, T2S, T2P);
+					T2T = FMA(KP923879532, T2S, T2P);
+					T4y = R1[WS(rs, 31)];
+					T5y = R1[WS(rs, 47)];
+					T5z = R1[WS(rs, 15)];
+					{
+					     E T4A, T4B, T4D, T4E;
+					     T4A = R1[WS(rs, 7)];
+					     TaP = T4x + T4y;
+					     T4z = T4x - T4y;
+					     TaQ = T5z + T5y;
+					     T5A = T5y - T5z;
+					     T4B = R1[WS(rs, 39)];
+					     T4D = R1[WS(rs, 55)];
+					     T4E = R1[WS(rs, 23)];
+					     {
+						  E T4K, Tbf, Tbg, T4N, T4P, T4Q;
+						  {
+						       E T4I, T4C, T4F, T4J, T4L, T4M;
+						       T4I = R1[WS(rs, 3)];
+						       TaS = T4A + T4B;
+						       T4C = T4A - T4B;
+						       TaT = T4D + T4E;
+						       T4F = T4D - T4E;
+						       T4J = R1[WS(rs, 35)];
+						       T4L = R1[WS(rs, 51)];
+						       T4M = R1[WS(rs, 19)];
+						       T4G = T4C + T4F;
+						       T5B = T4F - T4C;
+						       T4K = T4I - T4J;
+						       Tbf = T4I + T4J;
+						       Tbg = T4M + T4L;
+						       T4N = T4L - T4M;
+						  }
+						  T4P = R1[WS(rs, 59)];
+						  T4Q = R1[WS(rs, 27)];
+						  T4O = FMA(KP414213562, T4N, T4K);
+						  T5D = FNMS(KP414213562, T4K, T4N);
+						  Tbh = Tbf + Tbg;
+						  Tdw = Tbf - Tbg;
+						  T4R = T4P - T4Q;
+						  Tbc = T4P + T4Q;
+						  T4S = R1[WS(rs, 43)];
+						  T4T = R1[WS(rs, 11)];
+					     }
+					}
+				   }
+				   {
+					E T4H, T8b, TaR, Tdv, TdI, TaU, T4U, Tbd, T5C;
+					T4H = FMA(KP707106781, T4G, T4z);
+					T8b = FNMS(KP707106781, T4G, T4z);
+					TaR = TaP + TaQ;
+					Tdv = TaP - TaQ;
+					TdI = TaT - TaS;
+					TaU = TaS + TaT;
+					T4U = T4S - T4T;
+					Tbd = T4T + T4S;
+					T8m = FNMS(KP707106781, T5B, T5A);
+					T5C = FMA(KP707106781, T5B, T5A);
+					{
+					     E Tbe, Tdx, T5E, T4V;
+					     Tbe = Tbc + Tbd;
+					     Tdx = Tbc - Tbd;
+					     T5E = FMA(KP414213562, T4R, T4U);
+					     T4V = FNMS(KP414213562, T4U, T4R);
+					     {
+						  E Tdy, TdJ, T5F, T8c, T4W;
+						  Tdy = Tdw + Tdx;
+						  TdJ = Tdx - Tdw;
+						  T5F = T5D + T5E;
+						  T8c = T5E - T5D;
+						  T8n = T4V - T4O;
+						  T4W = T4O + T4V;
+						  TeO = FNMS(KP707106781, TdJ, TdI);
+						  TdK = FMA(KP707106781, TdJ, TdI);
+						  TeL = FNMS(KP707106781, Tdy, Tdv);
+						  Tdz = FMA(KP707106781, Tdy, Tdv);
+						  T9p = FNMS(KP923879532, T8c, T8b);
+						  T8d = FMA(KP923879532, T8c, T8b);
+						  T6O = FNMS(KP923879532, T5F, T5C);
+						  T5G = FMA(KP923879532, T5F, T5C);
+						  T6L = FNMS(KP923879532, T4W, T4H);
+						  T4X = FMA(KP923879532, T4W, T4H);
+					     }
+					     Tc3 = TaR + TaU;
+					     TaV = TaR - TaU;
+					     Tc4 = Tbh + Tbe;
+					     Tbi = Tbe - Tbh;
+					}
+				   }
+			      }
+			      {
+				   E Tai, T3k, Taj, T4l, Tal, Tam, T4m, T3r, T3D, TaF, T3C, Tdb, TaK, T3z, T4o;
+				   E T3E;
+				   {
+					E T4j, T4k, T3i, T3j;
+					T3i = R1[0];
+					T3j = R1[WS(rs, 32)];
+					T4j = R1[WS(rs, 16)];
+					T9s = FMA(KP923879532, T8n, T8m);
+					T8o = FNMS(KP923879532, T8n, T8m);
+					Tai = T3i + T3j;
+					T3k = T3i - T3j;
+					T4k = R1[WS(rs, 48)];
+					{
+					     E T3o, T3n, T3p, T3l, T3m;
+					     T3l = R1[WS(rs, 8)];
+					     T3m = R1[WS(rs, 40)];
+					     T3o = R1[WS(rs, 56)];
+					     Taj = T4j + T4k;
+					     T4l = T4j - T4k;
+					     T3n = T3l - T3m;
+					     Tal = T3l + T3m;
+					     T3p = R1[WS(rs, 24)];
+					     {
+						  E T3w, TaI, T3v, T3x, T3t, T3u, T3q;
+						  T3t = R1[WS(rs, 4)];
+						  T3u = R1[WS(rs, 36)];
+						  T3q = T3o - T3p;
+						  Tam = T3o + T3p;
+						  T3w = R1[WS(rs, 20)];
+						  TaI = T3t + T3u;
+						  T3v = T3t - T3u;
+						  T4m = T3n - T3q;
+						  T3r = T3n + T3q;
+						  T3x = R1[WS(rs, 52)];
+						  {
+						       E T3A, T3B, TaJ, T3y;
+						       T3A = R1[WS(rs, 60)];
+						       T3B = R1[WS(rs, 28)];
+						       T3D = R1[WS(rs, 12)];
+						       TaJ = T3w + T3x;
+						       T3y = T3w - T3x;
+						       TaF = T3A + T3B;
+						       T3C = T3A - T3B;
+						       Tdb = TaI - TaJ;
+						       TaK = TaI + TaJ;
+						       T3z = FNMS(KP414213562, T3y, T3v);
+						       T4o = FMA(KP414213562, T3v, T3y);
+						       T3E = R1[WS(rs, 44)];
+						  }
+					     }
+					}
+				   }
+				   {
+					E T3s, T7S, Tak, Tda, Tdn, Tan, T3F, TaG, T4n;
+					T3s = FMA(KP707106781, T3r, T3k);
+					T7S = FNMS(KP707106781, T3r, T3k);
+					Tak = Tai + Taj;
+					Tda = Tai - Taj;
+					Tdn = Tam - Tal;
+					Tan = Tal + Tam;
+					T3F = T3D - T3E;
+					TaG = T3D + T3E;
+					T83 = FNMS(KP707106781, T4m, T4l);
+					T4n = FMA(KP707106781, T4m, T4l);
+					{
+					     E TaH, Tdc, T4p, T3G;
+					     TaH = TaF + TaG;
+					     Tdc = TaF - TaG;
+					     T4p = FNMS(KP414213562, T3C, T3F);
+					     T3G = FMA(KP414213562, T3F, T3C);
+					     {
+						  E Tdd, Tdo, T4q, T7T, T3H;
+						  Tdd = Tdb + Tdc;
+						  Tdo = Tdc - Tdb;
+						  T4q = T4o + T4p;
+						  T7T = T4o - T4p;
+						  T84 = T3G - T3z;
+						  T3H = T3z + T3G;
+						  TeH = FNMS(KP707106781, Tdo, Tdn);
+						  Tdp = FMA(KP707106781, Tdo, Tdn);
+						  TeE = FNMS(KP707106781, Tdd, Tda);
+						  Tde = FMA(KP707106781, Tdd, Tda);
+						  T9i = FNMS(KP923879532, T7T, T7S);
+						  T7U = FMA(KP923879532, T7T, T7S);
+						  T6H = FNMS(KP923879532, T4q, T4n);
+						  T4r = FMA(KP923879532, T4q, T4n);
+						  T6E = FNMS(KP923879532, T3H, T3s);
+						  T3I = FMA(KP923879532, T3H, T3s);
+					     }
+					     TbW = Tak + Tan;
+					     Tao = Tak - Tan;
+					     TbX = TaK + TaH;
+					     TaL = TaH - TaK;
+					}
+				   }
+			      }
+			      {
+				   E T7N, T3d, T3a, T7M;
+				   {
+					E T2V, T1z, T38, T1C, T1G, T1J, T2Y, T39, T1W, T33, T1V, T3b, T32, Td0, T1S;
+					E T1X;
+					{
+					     E T1A, T1B, T1x, T1y;
+					     T1x = R0[WS(rs, 63)];
+					     T1y = R0[WS(rs, 31)];
+					     T1A = R0[WS(rs, 15)];
+					     T9l = FNMS(KP923879532, T84, T83);
+					     T85 = FMA(KP923879532, T84, T83);
+					     T2V = T1x - T1y;
+					     T1z = T1x + T1y;
+					     T1B = R0[WS(rs, 47)];
+					     {
+						  E T1H, T2W, T1I, T1E, T1F;
+						  T1E = R0[WS(rs, 7)];
+						  T1F = R0[WS(rs, 39)];
+						  T1H = R0[WS(rs, 55)];
+						  T38 = T1A - T1B;
+						  T1C = T1A + T1B;
+						  T2W = T1E - T1F;
+						  T1G = T1E + T1F;
+						  T1I = R0[WS(rs, 23)];
+						  {
+						       E T1P, T30, T1O, T1Q, T1M, T1N, T2X;
+						       T1M = R0[WS(rs, 3)];
+						       T1N = R0[WS(rs, 35)];
+						       T2X = T1H - T1I;
+						       T1J = T1H + T1I;
+						       T1P = R0[WS(rs, 19)];
+						       T30 = T1M - T1N;
+						       T1O = T1M + T1N;
+						       T2Y = T2W + T2X;
+						       T39 = T2W - T2X;
+						       T1Q = R0[WS(rs, 51)];
+						       {
+							    E T1T, T1U, T31, T1R;
+							    T1T = R0[WS(rs, 59)];
+							    T1U = R0[WS(rs, 27)];
+							    T1W = R0[WS(rs, 11)];
+							    T31 = T1P - T1Q;
+							    T1R = T1P + T1Q;
+							    T33 = T1T - T1U;
+							    T1V = T1T + T1U;
+							    T3b = FMA(KP414213562, T30, T31);
+							    T32 = FNMS(KP414213562, T31, T30);
+							    Td0 = T1O - T1R;
+							    T1S = T1O + T1R;
+							    T1X = R0[WS(rs, 43)];
+						       }
+						  }
+					     }
+					}
+					{
+					     E TcZ, Td4, T35, T3c, T1Z, Td2, Td5, Td1;
+					     {
+						  E T1D, T1K, T1Y, T34;
+						  T1D = T1z + T1C;
+						  TcZ = T1z - T1C;
+						  Td4 = T1J - T1G;
+						  T1K = T1G + T1J;
+						  T1Y = T1W + T1X;
+						  T34 = T1W - T1X;
+						  T1L = T1D + T1K;
+						  Tad = T1D - T1K;
+						  T35 = FMA(KP414213562, T34, T33);
+						  T3c = FNMS(KP414213562, T33, T34);
+						  T1Z = T1V + T1Y;
+						  Td1 = T1V - T1Y;
+					     }
+					     Td2 = Td0 + Td1;
+					     Td5 = Td1 - Td0;
+					     {
+						  E T7J, T2Z, T36, T7K;
+						  T7J = FNMS(KP707106781, T2Y, T2V);
+						  T2Z = FMA(KP707106781, T2Y, T2V);
+						  Tae = T1Z - T1S;
+						  T20 = T1S + T1Z;
+						  Tez = FNMS(KP707106781, Td5, Td4);
+						  Td6 = FMA(KP707106781, Td5, Td4);
+						  TeA = FNMS(KP707106781, Td2, TcZ);
+						  Td3 = FMA(KP707106781, Td2, TcZ);
+						  T36 = T32 + T35;
+						  T7N = T35 - T32;
+						  T7K = T3b - T3c;
+						  T3d = T3b + T3c;
+						  T3a = FMA(KP707106781, T39, T38);
+						  T7M = FNMS(KP707106781, T39, T38);
+						  T6x = FNMS(KP923879532, T36, T2Z);
+						  T37 = FMA(KP923879532, T36, T2Z);
+						  T9e = FNMS(KP923879532, T7K, T7J);
+						  T7L = FMA(KP923879532, T7K, T7J);
+					     }
+					}
+				   }
+				   {
+					E Tav, T7V, T7W, TaC;
+					{
+					     E T3L, T3W, Tdf, Tar, T42, T4d, Tay, Tdi, T46, Tau, Tdg, T3X, T3S, Taz, T45;
+					     E T47, Taw, Tax;
+					     {
+						  E T3J, T3K, T3U, T3V;
+						  T3J = R1[WS(rs, 2)];
+						  T9d = FNMS(KP923879532, T7N, T7M);
+						  T7O = FMA(KP923879532, T7N, T7M);
+						  T6w = FNMS(KP923879532, T3d, T3a);
+						  T3e = FMA(KP923879532, T3d, T3a);
+						  T3K = R1[WS(rs, 34)];
+						  T3U = R1[WS(rs, 18)];
+						  T3V = R1[WS(rs, 50)];
+						  {
+						       E T40, Tap, Taq, T41, T4b, T4c;
+						       T40 = R1[WS(rs, 62)];
+						       T3L = T3J - T3K;
+						       Tap = T3J + T3K;
+						       T3W = T3U - T3V;
+						       Taq = T3U + T3V;
+						       T41 = R1[WS(rs, 30)];
+						       T4b = R1[WS(rs, 14)];
+						       T4c = R1[WS(rs, 46)];
+						       Tdf = Tap - Taq;
+						       Tar = Tap + Taq;
+						       T42 = T40 - T41;
+						       Taw = T40 + T41;
+						       Tax = T4b + T4c;
+						       T4d = T4b - T4c;
+						  }
+					     }
+					     {
+						  E T3M, T3N, T3P, T3Q;
+						  T3M = R1[WS(rs, 10)];
+						  Tay = Taw + Tax;
+						  Tdi = Taw - Tax;
+						  T3N = R1[WS(rs, 42)];
+						  T3P = R1[WS(rs, 58)];
+						  T3Q = R1[WS(rs, 26)];
+						  {
+						       E T43, Tas, T3O, Tat, T3R, T44;
+						       T43 = R1[WS(rs, 6)];
+						       Tas = T3M + T3N;
+						       T3O = T3M - T3N;
+						       Tat = T3P + T3Q;
+						       T3R = T3P - T3Q;
+						       T44 = R1[WS(rs, 38)];
+						       T46 = R1[WS(rs, 54)];
+						       Tau = Tas + Tat;
+						       Tdg = Tat - Tas;
+						       T3X = T3O - T3R;
+						       T3S = T3O + T3R;
+						       Taz = T43 + T44;
+						       T45 = T43 - T44;
+						       T47 = R1[WS(rs, 22)];
+						  }
+					     }
+					     {
+						  E Tdq, Tdh, T49, T4e, Tdr, Tdk;
+						  Tav = Tar - Tau;
+						  TbZ = Tar + Tau;
+						  {
+						       E T3T, T3Y, TaA, T48, Tdj, TaB;
+						       T3T = FMA(KP707106781, T3S, T3L);
+						       T7V = FNMS(KP707106781, T3S, T3L);
+						       T7W = FNMS(KP707106781, T3X, T3W);
+						       T3Y = FMA(KP707106781, T3X, T3W);
+						       TaA = T46 + T47;
+						       T48 = T46 - T47;
+						       Tdq = FNMS(KP414213562, Tdf, Tdg);
+						       Tdh = FMA(KP414213562, Tdg, Tdf);
+						       T3Z = FNMS(KP198912367, T3Y, T3T);
+						       T4s = FMA(KP198912367, T3T, T3Y);
+						       Tdj = TaA - Taz;
+						       TaB = Taz + TaA;
+						       T49 = T45 + T48;
+						       T4e = T45 - T48;
+						       TaC = Tay - TaB;
+						       Tc0 = Tay + TaB;
+						       Tdr = FMA(KP414213562, Tdi, Tdj);
+						       Tdk = FNMS(KP414213562, Tdj, Tdi);
+						  }
+						  {
+						       E T7Z, T7Y, T4f, T4a;
+						       T7Z = FNMS(KP707106781, T4e, T4d);
+						       T4f = FMA(KP707106781, T4e, T4d);
+						       T4a = FMA(KP707106781, T49, T42);
+						       T7Y = FNMS(KP707106781, T49, T42);
+						       TeF = Tdr - Tdq;
+						       Tds = Tdq + Tdr;
+						       T4t = FNMS(KP198912367, T4a, T4f);
+						       T4g = FMA(KP198912367, T4f, T4a);
+						       T87 = FMA(KP668178637, T7Y, T7Z);
+						       T80 = FNMS(KP668178637, T7Z, T7Y);
+						       TeI = Tdh - Tdk;
+						       Tdl = Tdh + Tdk;
+						  }
+					     }
+					}
+					{
+					     E T50, T5b, TdA, TaY, T5h, T5s, Tb5, TdD, T5l, Tb1, TdB, T5c, T57, Tb6, T5k;
+					     E T5m, Tb3, Tb4;
+					     {
+						  E T4Y, T4Z, T59, T5a;
+						  T4Y = R1[WS(rs, 1)];
+						  T86 = FNMS(KP668178637, T7V, T7W);
+						  T7X = FMA(KP668178637, T7W, T7V);
+						  TaM = TaC - Tav;
+						  TaD = Tav + TaC;
+						  T4Z = R1[WS(rs, 33)];
+						  T59 = R1[WS(rs, 49)];
+						  T5a = R1[WS(rs, 17)];
+						  {
+						       E T5f, TaW, TaX, T5g, T5q, T5r;
+						       T5f = R1[WS(rs, 61)];
+						       T50 = T4Y - T4Z;
+						       TaW = T4Y + T4Z;
+						       T5b = T59 - T5a;
+						       TaX = T5a + T59;
+						       T5g = R1[WS(rs, 29)];
+						       T5q = R1[WS(rs, 45)];
+						       T5r = R1[WS(rs, 13)];
+						       TdA = TaW - TaX;
+						       TaY = TaW + TaX;
+						       T5h = T5f - T5g;
+						       Tb3 = T5f + T5g;
+						       Tb4 = T5r + T5q;
+						       T5s = T5q - T5r;
+						  }
+					     }
+					     {
+						  E T51, T52, T54, T55;
+						  T51 = R1[WS(rs, 9)];
+						  Tb5 = Tb3 + Tb4;
+						  TdD = Tb3 - Tb4;
+						  T52 = R1[WS(rs, 41)];
+						  T54 = R1[WS(rs, 57)];
+						  T55 = R1[WS(rs, 25)];
+						  {
+						       E T5i, TaZ, T53, Tb0, T56, T5j;
+						       T5i = R1[WS(rs, 5)];
+						       TaZ = T51 + T52;
+						       T53 = T51 - T52;
+						       Tb0 = T54 + T55;
+						       T56 = T54 - T55;
+						       T5j = R1[WS(rs, 37)];
+						       T5l = R1[WS(rs, 53)];
+						       Tb1 = TaZ + Tb0;
+						       TdB = Tb0 - TaZ;
+						       T5c = T56 - T53;
+						       T57 = T53 + T56;
+						       Tb6 = T5i + T5j;
+						       T5k = T5i - T5j;
+						       T5m = R1[WS(rs, 21)];
+						  }
+					     }
+					     {
+						  E TdL, TdC, T5o, T5t, TdM, TdF;
+						  Tb2 = TaY - Tb1;
+						  Tc6 = TaY + Tb1;
+						  {
+						       E T58, T5d, Tb7, T5n, TdE, Tb8;
+						       T58 = FMA(KP707106781, T57, T50);
+						       T8e = FNMS(KP707106781, T57, T50);
+						       T8f = FNMS(KP707106781, T5c, T5b);
+						       T5d = FMA(KP707106781, T5c, T5b);
+						       Tb7 = T5l + T5m;
+						       T5n = T5l - T5m;
+						       TdL = FNMS(KP414213562, TdA, TdB);
+						       TdC = FMA(KP414213562, TdB, TdA);
+						       T5e = FMA(KP198912367, T5d, T58);
+						       T5H = FNMS(KP198912367, T58, T5d);
+						       TdE = Tb7 - Tb6;
+						       Tb8 = Tb6 + Tb7;
+						       T5o = T5k + T5n;
+						       T5t = T5n - T5k;
+						       Tb9 = Tb5 - Tb8;
+						       Tc7 = Tb5 + Tb8;
+						       TdM = FMA(KP414213562, TdD, TdE);
+						       TdF = FNMS(KP414213562, TdE, TdD);
+						  }
+						  {
+						       E T8i, T8h, T5u, T5p;
+						       T8i = FNMS(KP707106781, T5t, T5s);
+						       T5u = FMA(KP707106781, T5t, T5s);
+						       T5p = FMA(KP707106781, T5o, T5h);
+						       T8h = FNMS(KP707106781, T5o, T5h);
+						       TeM = TdM - TdL;
+						       TdN = TdL + TdM;
+						       T5I = FMA(KP198912367, T5p, T5u);
+						       T5v = FNMS(KP198912367, T5u, T5p);
+						       T8q = FNMS(KP668178637, T8h, T8i);
+						       T8j = FMA(KP668178637, T8i, T8h);
+						       TeP = TdF - TdC;
+						       TdG = TdC + TdF;
+						  }
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T8p, T8g, TcH, TdW, TdT, TcO, Tfp, Tfk, Tfj, Tfq;
+			 {
+			      E Tbj, Tba, Tcy, Tco, TcB, Tcl, Tcx, Tcv, Tcz, Tcr;
+			      {
+				   E Tch, Tct, Tcp, Tcq, Tci, T1w, TbV, T11, Tcf, Tc9, T21, Tcj, Tcm, TbY, Tc1;
+				   E Tcn, Tcu, Tck;
+				   {
+					E Tv, T10, Tc5, Tc8;
+					Tch = Tf - Tu;
+					Tv = Tf + Tu;
+					T8p = FMA(KP668178637, T8e, T8f);
+					T8g = FNMS(KP668178637, T8f, T8e);
+					Tbj = Tb9 - Tb2;
+					Tba = Tb2 + Tb9;
+					T10 = TK + TZ;
+					Tct = TZ - TK;
+					Tcp = Tc3 - Tc4;
+					Tc5 = Tc3 + Tc4;
+					Tc8 = Tc6 + Tc7;
+					Tcq = Tc7 - Tc6;
+					Tci = T1g - T1v;
+					T1w = T1g + T1v;
+					TbV = Tv - T10;
+					T11 = Tv + T10;
+					Tcf = Tc5 + Tc8;
+					Tc9 = Tc5 - Tc8;
+					T21 = T1L + T20;
+					Tcj = T1L - T20;
+					Tcm = TbW - TbX;
+					TbY = TbW + TbX;
+					Tc1 = TbZ + Tc0;
+					Tcn = Tc0 - TbZ;
+				   }
+				   {
+					E Tcb, T22, Tce, Tc2;
+					Tcb = T21 - T1w;
+					T22 = T1w + T21;
+					Tce = TbY + Tc1;
+					Tc2 = TbY - Tc1;
+					{
+					     E Tcd, Tcg, Tca, Tcc;
+					     Tcd = T11 + T22;
+					     Cr[WS(csr, 32)] = T11 - T22;
+					     Tcg = Tce + Tcf;
+					     Ci[WS(csi, 32)] = Tcf - Tce;
+					     Tca = Tc2 + Tc9;
+					     Tcc = Tc9 - Tc2;
+					     Cr[0] = Tcd + Tcg;
+					     Cr[WS(csr, 64)] = Tcd - Tcg;
+					     Ci[WS(csi, 48)] = FMS(KP707106781, Tcc, Tcb);
+					     Ci[WS(csi, 16)] = FMA(KP707106781, Tcc, Tcb);
+					     Cr[WS(csr, 16)] = FMA(KP707106781, Tca, TbV);
+					     Cr[WS(csr, 48)] = FNMS(KP707106781, Tca, TbV);
+					     Tcu = Tcj - Tci;
+					     Tck = Tci + Tcj;
+					     Tcy = FNMS(KP414213562, Tcm, Tcn);
+					     Tco = FMA(KP414213562, Tcn, Tcm);
+					}
+				   }
+				   TcB = FNMS(KP707106781, Tck, Tch);
+				   Tcl = FMA(KP707106781, Tck, Tch);
+				   Tcx = FMA(KP707106781, Tcu, Tct);
+				   Tcv = FNMS(KP707106781, Tcu, Tct);
+				   Tcz = FMA(KP414213562, Tcp, Tcq);
+				   Tcr = FNMS(KP414213562, Tcq, Tcp);
+			      }
+			      {
+				   E TbT, TbO, TbN, TbU;
+				   {
+					E Ta9, TbB, Tbb, TbL, Tbp, TbM, Tag, Tbk, TbR, TbJ, Tbw, TaO, TbC, Tbs, TbQ;
+					E TbG;
+					{
+					     E Tbq, Tbr, TbH, TbI;
+					     {
+						  E Tbo, Ta8, Tac, Taf;
+						  Tbo = Ta7 - Ta6;
+						  Ta8 = Ta6 + Ta7;
+						  {
+						       E TcC, TcA, Tcw, Tcs;
+						       TcC = Tcz - Tcy;
+						       TcA = Tcy + Tcz;
+						       Tcw = Tcr - Tco;
+						       Tcs = Tco + Tcr;
+						       Cr[WS(csr, 24)] = FMA(KP923879532, TcC, TcB);
+						       Cr[WS(csr, 40)] = FNMS(KP923879532, TcC, TcB);
+						       Ci[WS(csi, 56)] = FMS(KP923879532, TcA, Tcx);
+						       Ci[WS(csi, 8)] = FMA(KP923879532, TcA, Tcx);
+						       Ci[WS(csi, 40)] = FMA(KP923879532, Tcw, Tcv);
+						       Ci[WS(csi, 24)] = FMS(KP923879532, Tcw, Tcv);
+						       Cr[WS(csr, 8)] = FMA(KP923879532, Tcs, Tcl);
+						       Cr[WS(csr, 56)] = FNMS(KP923879532, Tcs, Tcl);
+						       Ta9 = FMA(KP707106781, Ta8, Ta5);
+						       TbB = FNMS(KP707106781, Ta8, Ta5);
+						  }
+						  Tbq = FNMS(KP414213562, Taa, Tab);
+						  Tac = FMA(KP414213562, Tab, Taa);
+						  Taf = FNMS(KP414213562, Tae, Tad);
+						  Tbr = FMA(KP414213562, Tad, Tae);
+						  Tbb = FMA(KP707106781, Tba, TaV);
+						  TbH = FNMS(KP707106781, Tba, TaV);
+						  TbL = FNMS(KP707106781, Tbo, Tbn);
+						  Tbp = FMA(KP707106781, Tbo, Tbn);
+						  TbM = Taf - Tac;
+						  Tag = Tac + Taf;
+						  TbI = FNMS(KP707106781, Tbj, Tbi);
+						  Tbk = FMA(KP707106781, Tbj, Tbi);
+					     }
+					     {
+						  E TbE, TbF, TaE, TaN;
+						  TbE = FNMS(KP707106781, TaD, Tao);
+						  TaE = FMA(KP707106781, TaD, Tao);
+						  TaN = FMA(KP707106781, TaM, TaL);
+						  TbF = FNMS(KP707106781, TaM, TaL);
+						  TbR = FNMS(KP668178637, TbH, TbI);
+						  TbJ = FMA(KP668178637, TbI, TbH);
+						  Tbw = FNMS(KP198912367, TaE, TaN);
+						  TaO = FMA(KP198912367, TaN, TaE);
+						  TbC = Tbr - Tbq;
+						  Tbs = Tbq + Tbr;
+						  TbQ = FMA(KP668178637, TbE, TbF);
+						  TbG = FNMS(KP668178637, TbF, TbE);
+					     }
+					}
+					{
+					     E Tbz, Tah, Tbv, Tbt, Tbx, Tbl;
+					     Tbz = FNMS(KP923879532, Tag, Ta9);
+					     Tah = FMA(KP923879532, Tag, Ta9);
+					     Tbv = FMA(KP923879532, Tbs, Tbp);
+					     Tbt = FNMS(KP923879532, Tbs, Tbp);
+					     Tbx = FMA(KP198912367, Tbb, Tbk);
+					     Tbl = FNMS(KP198912367, Tbk, Tbb);
+					     {
+						  E TbD, TbK, TbP, TbS;
+						  TbT = FNMS(KP923879532, TbC, TbB);
+						  TbD = FMA(KP923879532, TbC, TbB);
+						  {
+						       E TbA, Tby, Tbu, Tbm;
+						       TbA = Tbx - Tbw;
+						       Tby = Tbw + Tbx;
+						       Tbu = Tbl - TaO;
+						       Tbm = TaO + Tbl;
+						       Cr[WS(csr, 28)] = FMA(KP980785280, TbA, Tbz);
+						       Cr[WS(csr, 36)] = FNMS(KP980785280, TbA, Tbz);
+						       Ci[WS(csi, 60)] = FMS(KP980785280, Tby, Tbv);
+						       Ci[WS(csi, 4)] = FMA(KP980785280, Tby, Tbv);
+						       Ci[WS(csi, 36)] = FMA(KP980785280, Tbu, Tbt);
+						       Ci[WS(csi, 28)] = FMS(KP980785280, Tbu, Tbt);
+						       Cr[WS(csr, 4)] = FMA(KP980785280, Tbm, Tah);
+						       Cr[WS(csr, 60)] = FNMS(KP980785280, Tbm, Tah);
+						       TbK = TbG + TbJ;
+						       TbO = TbJ - TbG;
+						  }
+						  TbN = FMA(KP923879532, TbM, TbL);
+						  TbP = FNMS(KP923879532, TbM, TbL);
+						  TbS = TbQ + TbR;
+						  TbU = TbQ - TbR;
+						  Cr[WS(csr, 12)] = FMA(KP831469612, TbK, TbD);
+						  Cr[WS(csr, 52)] = FNMS(KP831469612, TbK, TbD);
+						  Ci[WS(csi, 52)] = FNMS(KP831469612, TbS, TbP);
+						  Ci[WS(csi, 12)] = -(FMA(KP831469612, TbS, TbP));
+					     }
+					}
+				   }
+				   {
+					E TeN, Tf7, Tev, Tfm, Tfc, TeQ, TeX, TeW, Tfn, Tff, Tfi, TeC, Tf2, TeK, Tfh;
+					E TeV, Tf8;
+					{
+					     E TeG, TeJ, Tfd, Tfe, Tey, TeB, TeT, TeU;
+					     {
+						  E Tet, Teu, Tfa, Tfb;
+						  TcH = FMA(KP707106781, TcG, TcD);
+						  Tet = FNMS(KP707106781, TcG, TcD);
+						  Ci[WS(csi, 44)] = FMS(KP831469612, TbO, TbN);
+						  Ci[WS(csi, 20)] = FMA(KP831469612, TbO, TbN);
+						  Cr[WS(csr, 20)] = FMA(KP831469612, TbU, TbT);
+						  Cr[WS(csr, 44)] = FNMS(KP831469612, TbU, TbT);
+						  Teu = TdV - TdU;
+						  TdW = TdU + TdV;
+						  TeG = FNMS(KP923879532, TeF, TeE);
+						  Tfa = FMA(KP923879532, TeF, TeE);
+						  Tfb = FMA(KP923879532, TeI, TeH);
+						  TeJ = FNMS(KP923879532, TeI, TeH);
+						  TeN = FNMS(KP923879532, TeM, TeL);
+						  Tfd = FMA(KP923879532, TeM, TeL);
+						  Tf7 = FMA(KP923879532, Teu, Tet);
+						  Tev = FNMS(KP923879532, Teu, Tet);
+						  Tfm = FMA(KP303346683, Tfa, Tfb);
+						  Tfc = FNMS(KP303346683, Tfb, Tfa);
+						  Tfe = FNMS(KP923879532, TeP, TeO);
+						  TeQ = FMA(KP923879532, TeP, TeO);
+						  TeX = FNMS(KP668178637, Tew, Tex);
+						  Tey = FMA(KP668178637, Tex, Tew);
+						  TeB = FNMS(KP668178637, TeA, Tez);
+						  TeW = FMA(KP668178637, Tez, TeA);
+					     }
+					     Tfn = FNMS(KP303346683, Tfd, Tfe);
+					     Tff = FMA(KP303346683, Tfe, Tfd);
+					     Tfi = Tey + TeB;
+					     TeC = Tey - TeB;
+					     TdT = FMA(KP707106781, TdS, TdR);
+					     TeT = FNMS(KP707106781, TdS, TdR);
+					     TeU = TcN - TcK;
+					     TcO = TcK + TcN;
+					     Tf2 = FNMS(KP534511135, TeG, TeJ);
+					     TeK = FMA(KP534511135, TeJ, TeG);
+					     Tfh = FNMS(KP923879532, TeU, TeT);
+					     TeV = FMA(KP923879532, TeU, TeT);
+					}
+					{
+					     E Tf5, TeD, TeY, Tf3, TeR;
+					     Tf5 = FNMS(KP831469612, TeC, Tev);
+					     TeD = FMA(KP831469612, TeC, Tev);
+					     Tf8 = TeX + TeW;
+					     TeY = TeW - TeX;
+					     Tf3 = FMA(KP534511135, TeN, TeQ);
+					     TeR = FNMS(KP534511135, TeQ, TeN);
+					     {
+						  E Tf1, TeZ, Tf6, Tf4, Tf0, TeS;
+						  Tf1 = FMA(KP831469612, TeY, TeV);
+						  TeZ = FNMS(KP831469612, TeY, TeV);
+						  Tf6 = Tf3 - Tf2;
+						  Tf4 = Tf2 + Tf3;
+						  Tf0 = TeR - TeK;
+						  TeS = TeK + TeR;
+						  Ci[WS(csi, 54)] = FMS(KP881921264, Tf4, Tf1);
+						  Ci[WS(csi, 10)] = FMA(KP881921264, Tf4, Tf1);
+						  Ci[WS(csi, 42)] = FMA(KP881921264, Tf0, TeZ);
+						  Ci[WS(csi, 22)] = FMS(KP881921264, Tf0, TeZ);
+						  Cr[WS(csr, 10)] = FMA(KP881921264, TeS, TeD);
+						  Cr[WS(csr, 54)] = FNMS(KP881921264, TeS, TeD);
+						  Cr[WS(csr, 42)] = FNMS(KP881921264, Tf6, Tf5);
+						  Cr[WS(csr, 22)] = FMA(KP881921264, Tf6, Tf5);
+					     }
+					}
+					{
+					     E Tf9, Tfg, Tfl, Tfo;
+					     Tfp = FNMS(KP831469612, Tf8, Tf7);
+					     Tf9 = FMA(KP831469612, Tf8, Tf7);
+					     Tfg = Tfc + Tff;
+					     Tfk = Tff - Tfc;
+					     Tfj = FNMS(KP831469612, Tfi, Tfh);
+					     Tfl = FMA(KP831469612, Tfi, Tfh);
+					     Tfo = Tfm + Tfn;
+					     Tfq = Tfm - Tfn;
+					     Cr[WS(csr, 6)] = FMA(KP956940335, Tfg, Tf9);
+					     Cr[WS(csr, 58)] = FNMS(KP956940335, Tfg, Tf9);
+					     Ci[WS(csi, 58)] = FNMS(KP956940335, Tfo, Tfl);
+					     Ci[WS(csi, 6)] = -(FMA(KP956940335, Tfo, Tfl));
+					}
+				   }
+			      }
+			 }
+			 {
+			      E T2f, T5W, T5T, T2y, T5J, T5w, T4u, T4h, T7p, T7q;
+			      {
+				   E Ter, Tem, Tel, Tes;
+				   {
+					E TdH, Te9, TcP, Teo, Tee, TdO, TdY, TdZ, Tep, Teh, Tek, Td8, Te4, Tdu, Tej;
+					E TdX, Tea;
+					{
+					     E Tdm, Tdt, Tef, Teg, TcY, Td7, Tec, Ted;
+					     Ci[WS(csi, 38)] = FMS(KP956940335, Tfk, Tfj);
+					     Ci[WS(csi, 26)] = FMA(KP956940335, Tfk, Tfj);
+					     Cr[WS(csr, 26)] = FMA(KP956940335, Tfq, Tfp);
+					     Cr[WS(csr, 38)] = FNMS(KP956940335, Tfq, Tfp);
+					     Tdm = FMA(KP923879532, Tdl, Tde);
+					     Tec = FNMS(KP923879532, Tdl, Tde);
+					     Ted = FNMS(KP923879532, Tds, Tdp);
+					     Tdt = FMA(KP923879532, Tds, Tdp);
+					     TdH = FMA(KP923879532, TdG, Tdz);
+					     Tef = FNMS(KP923879532, TdG, Tdz);
+					     Te9 = FNMS(KP923879532, TcO, TcH);
+					     TcP = FMA(KP923879532, TcO, TcH);
+					     Teo = FMA(KP820678790, Tec, Ted);
+					     Tee = FNMS(KP820678790, Ted, Tec);
+					     Teg = FNMS(KP923879532, TdN, TdK);
+					     TdO = FMA(KP923879532, TdN, TdK);
+					     TdY = FNMS(KP198912367, TcU, TcX);
+					     TcY = FMA(KP198912367, TcX, TcU);
+					     Td7 = FNMS(KP198912367, Td6, Td3);
+					     TdZ = FMA(KP198912367, Td3, Td6);
+					     Tep = FNMS(KP820678790, Tef, Teg);
+					     Teh = FMA(KP820678790, Teg, Tef);
+					     Tek = Td7 - TcY;
+					     Td8 = TcY + Td7;
+					     Te4 = FNMS(KP098491403, Tdm, Tdt);
+					     Tdu = FMA(KP098491403, Tdt, Tdm);
+					     Tej = FNMS(KP923879532, TdW, TdT);
+					     TdX = FMA(KP923879532, TdW, TdT);
+					}
+					{
+					     E Te7, Td9, Te0, Te5, TdP;
+					     Te7 = FNMS(KP980785280, Td8, TcP);
+					     Td9 = FMA(KP980785280, Td8, TcP);
+					     Tea = TdZ - TdY;
+					     Te0 = TdY + TdZ;
+					     Te5 = FMA(KP098491403, TdH, TdO);
+					     TdP = FNMS(KP098491403, TdO, TdH);
+					     {
+						  E Te3, Te1, Te8, Te6, Te2, TdQ;
+						  Te3 = FMA(KP980785280, Te0, TdX);
+						  Te1 = FNMS(KP980785280, Te0, TdX);
+						  Te8 = Te5 - Te4;
+						  Te6 = Te4 + Te5;
+						  Te2 = TdP - Tdu;
+						  TdQ = Tdu + TdP;
+						  Ci[WS(csi, 62)] = FMS(KP995184726, Te6, Te3);
+						  Ci[WS(csi, 2)] = FMA(KP995184726, Te6, Te3);
+						  Ci[WS(csi, 34)] = FMA(KP995184726, Te2, Te1);
+						  Ci[WS(csi, 30)] = FMS(KP995184726, Te2, Te1);
+						  Cr[WS(csr, 2)] = FMA(KP995184726, TdQ, Td9);
+						  Cr[WS(csr, 62)] = FNMS(KP995184726, TdQ, Td9);
+						  Cr[WS(csr, 34)] = FNMS(KP995184726, Te8, Te7);
+						  Cr[WS(csr, 30)] = FMA(KP995184726, Te8, Te7);
+					     }
+					}
+					{
+					     E Teb, Tei, Ten, Teq;
+					     Ter = FNMS(KP980785280, Tea, Te9);
+					     Teb = FMA(KP980785280, Tea, Te9);
+					     Tei = Tee + Teh;
+					     Tem = Teh - Tee;
+					     Tel = FMA(KP980785280, Tek, Tej);
+					     Ten = FNMS(KP980785280, Tek, Tej);
+					     Teq = Teo + Tep;
+					     Tes = Teo - Tep;
+					     Cr[WS(csr, 14)] = FMA(KP773010453, Tei, Teb);
+					     Cr[WS(csr, 50)] = FNMS(KP773010453, Tei, Teb);
+					     Ci[WS(csi, 50)] = FNMS(KP773010453, Teq, Ten);
+					     Ci[WS(csi, 14)] = -(FMA(KP773010453, Teq, Ten));
+					}
+				   }
+				   {
+					E T77, T6v, T7i, T6C, T78, T6Y, T7h, T6V, T6N, T7d, T6P, T6F, T6I;
+					{
+					     E T6W, T6X, T6T, T6U, T6M;
+					     {
+						  E T6t, T6u, T6y, T6B;
+						  T2f = FMA(KP923879532, T2e, T27);
+						  T6t = FNMS(KP923879532, T2e, T27);
+						  Ci[WS(csi, 46)] = FMS(KP773010453, Tem, Tel);
+						  Ci[WS(csi, 18)] = FMA(KP773010453, Tem, Tel);
+						  Cr[WS(csr, 18)] = FMA(KP773010453, Tes, Ter);
+						  Cr[WS(csr, 46)] = FNMS(KP773010453, Tes, Ter);
+						  T6u = T5U - T5V;
+						  T5W = T5U + T5V;
+						  T6W = FNMS(KP820678790, T6w, T6x);
+						  T6y = FMA(KP820678790, T6x, T6w);
+						  T6B = FNMS(KP820678790, T6A, T6z);
+						  T6X = FMA(KP820678790, T6z, T6A);
+						  T77 = FMA(KP980785280, T6u, T6t);
+						  T6v = FNMS(KP980785280, T6u, T6t);
+						  T7i = T6B + T6y;
+						  T6C = T6y - T6B;
+					     }
+					     T5T = FMA(KP923879532, T5S, T5P);
+					     T6T = FNMS(KP923879532, T5S, T5P);
+					     T6U = T2x - T2o;
+					     T2y = T2o + T2x;
+					     T5J = T5H + T5I;
+					     T6M = T5I - T5H;
+					     T78 = T6X + T6W;
+					     T6Y = T6W - T6X;
+					     T7h = FMA(KP980785280, T6U, T6T);
+					     T6V = FNMS(KP980785280, T6U, T6T);
+					     T6N = FNMS(KP980785280, T6M, T6L);
+					     T7d = FMA(KP980785280, T6M, T6L);
+					     T6P = T5v - T5e;
+					     T5w = T5e + T5v;
+					     T4u = T4s + T4t;
+					     T6F = T4s - T4t;
+					     T6I = T4g - T3Z;
+					     T4h = T3Z + T4g;
+					}
+					{
+					     E T75, T7f, T7n, T7c, T7m, T76;
+					     {
+						  E T6D, T72, T6R, T73, T6K, T71, T6Z, T7e, T6Q, T74, T70, T6S;
+						  T75 = FNMS(KP773010453, T6C, T6v);
+						  T6D = FMA(KP773010453, T6C, T6v);
+						  T7e = FNMS(KP980785280, T6P, T6O);
+						  T6Q = FMA(KP980785280, T6P, T6O);
+						  {
+						       E T7a, T6G, T7b, T6J;
+						       T7a = FMA(KP980785280, T6F, T6E);
+						       T6G = FNMS(KP980785280, T6F, T6E);
+						       T7b = FMA(KP980785280, T6I, T6H);
+						       T6J = FNMS(KP980785280, T6I, T6H);
+						       T7f = FMA(KP357805721, T7e, T7d);
+						       T7n = FNMS(KP357805721, T7d, T7e);
+						       T72 = FMA(KP472964775, T6N, T6Q);
+						       T6R = FNMS(KP472964775, T6Q, T6N);
+						       T7c = FMA(KP357805721, T7b, T7a);
+						       T7m = FNMS(KP357805721, T7a, T7b);
+						       T73 = FMA(KP472964775, T6G, T6J);
+						       T6K = FNMS(KP472964775, T6J, T6G);
+						  }
+						  T71 = FNMS(KP773010453, T6Y, T6V);
+						  T6Z = FMA(KP773010453, T6Y, T6V);
+						  T74 = T72 - T73;
+						  T76 = T73 + T72;
+						  T70 = T6R - T6K;
+						  T6S = T6K + T6R;
+						  Ci[WS(csi, 55)] = FMA(KP903989293, T74, T71);
+						  Ci[WS(csi, 9)] = FMS(KP903989293, T74, T71);
+						  Cr[WS(csr, 9)] = FMA(KP903989293, T6S, T6D);
+						  Cr[WS(csr, 55)] = FNMS(KP903989293, T6S, T6D);
+						  Ci[WS(csi, 41)] = FMS(KP903989293, T70, T6Z);
+						  Ci[WS(csi, 23)] = FMA(KP903989293, T70, T6Z);
+					     }
+					     {
+						  E T7k, T7j, T7l, T7o, T79, T7g;
+						  T7p = FNMS(KP773010453, T78, T77);
+						  T79 = FMA(KP773010453, T78, T77);
+						  T7g = T7c + T7f;
+						  T7k = T7f - T7c;
+						  T7j = FNMS(KP773010453, T7i, T7h);
+						  T7l = FMA(KP773010453, T7i, T7h);
+						  Cr[WS(csr, 23)] = FMA(KP903989293, T76, T75);
+						  Cr[WS(csr, 41)] = FNMS(KP903989293, T76, T75);
+						  Cr[WS(csr, 7)] = FMA(KP941544065, T7g, T79);
+						  Cr[WS(csr, 57)] = FNMS(KP941544065, T7g, T79);
+						  T7o = T7m - T7n;
+						  T7q = T7m + T7n;
+						  Ci[WS(csi, 57)] = FMS(KP941544065, T7o, T7l);
+						  Ci[WS(csi, 7)] = FMA(KP941544065, T7o, T7l);
+						  Ci[WS(csi, 39)] = FMA(KP941544065, T7k, T7j);
+						  Ci[WS(csi, 25)] = FMS(KP941544065, T7k, T7j);
+					     }
+					}
+				   }
+			      }
+			      {
+				   E T7t, T8A, T8x, T7A, T8r, T8k, T88, T81, Ta3, Ta4, T6r, T6s;
+				   {
+					E T9L, T99, T9W, T9g, T9M, T9C, T9V, T9z, T9k, T9O, T9T, Ta0, T9H, T9v, T9m;
+					{
+					     E T9B, T9c, T9f, T9A, T97, T98;
+					     T7t = FMA(KP923879532, T7s, T7r);
+					     T97 = FNMS(KP923879532, T7s, T7r);
+					     T98 = T8z - T8y;
+					     T8A = T8y + T8z;
+					     T9B = FNMS(KP534511135, T9a, T9b);
+					     T9c = FMA(KP534511135, T9b, T9a);
+					     Cr[WS(csr, 25)] = FNMS(KP941544065, T7q, T7p);
+					     Cr[WS(csr, 39)] = FMA(KP941544065, T7q, T7p);
+					     T9L = FMA(KP831469612, T98, T97);
+					     T99 = FNMS(KP831469612, T98, T97);
+					     T9f = FNMS(KP534511135, T9e, T9d);
+					     T9A = FMA(KP534511135, T9d, T9e);
+					     {
+						  E T9x, T9y, T9q, T9t;
+						  T8x = FMA(KP923879532, T8w, T8v);
+						  T9x = FNMS(KP923879532, T8w, T8v);
+						  T9W = T9c + T9f;
+						  T9g = T9c - T9f;
+						  T9M = T9B + T9A;
+						  T9C = T9A - T9B;
+						  T9y = T7z - T7w;
+						  T7A = T7w + T7z;
+						  T8r = T8p + T8q;
+						  T9q = T8p - T8q;
+						  T9t = T8j - T8g;
+						  T8k = T8g + T8j;
+						  {
+						       E T9R, T9r, T9S, T9u, T9j;
+						       T88 = T86 + T87;
+						       T9j = T87 - T86;
+						       T9V = FNMS(KP831469612, T9y, T9x);
+						       T9z = FMA(KP831469612, T9y, T9x);
+						       T9R = FMA(KP831469612, T9q, T9p);
+						       T9r = FNMS(KP831469612, T9q, T9p);
+						       T9S = FMA(KP831469612, T9t, T9s);
+						       T9u = FNMS(KP831469612, T9t, T9s);
+						       T9k = FNMS(KP831469612, T9j, T9i);
+						       T9O = FMA(KP831469612, T9j, T9i);
+						       T9T = FNMS(KP250486960, T9S, T9R);
+						       Ta0 = FMA(KP250486960, T9R, T9S);
+						       T9H = FNMS(KP599376933, T9r, T9u);
+						       T9v = FMA(KP599376933, T9u, T9r);
+						       T9m = T7X - T80;
+						       T81 = T7X + T80;
+						  }
+					     }
+					}
+					{
+					     E T9J, T9h, T9F, T9D, T9P, T9n;
+					     T9J = FNMS(KP881921264, T9g, T99);
+					     T9h = FMA(KP881921264, T9g, T99);
+					     T9F = FMA(KP881921264, T9C, T9z);
+					     T9D = FNMS(KP881921264, T9C, T9z);
+					     T9P = FMA(KP831469612, T9m, T9l);
+					     T9n = FNMS(KP831469612, T9m, T9l);
+					     {
+						  E T9Y, T9X, T9Z, Ta2;
+						  {
+						       E T9N, Ta1, T9G, T9o, T9U, T9Q;
+						       Ta3 = FNMS(KP881921264, T9M, T9L);
+						       T9N = FMA(KP881921264, T9M, T9L);
+						       T9Q = FNMS(KP250486960, T9P, T9O);
+						       Ta1 = FMA(KP250486960, T9O, T9P);
+						       T9G = FNMS(KP599376933, T9k, T9n);
+						       T9o = FMA(KP599376933, T9n, T9k);
+						       T9U = T9Q + T9T;
+						       T9Y = T9T - T9Q;
+						       T9X = FNMS(KP881921264, T9W, T9V);
+						       T9Z = FMA(KP881921264, T9W, T9V);
+						       {
+							    E T9K, T9I, T9E, T9w;
+							    T9K = T9G + T9H;
+							    T9I = T9G - T9H;
+							    T9E = T9v - T9o;
+							    T9w = T9o + T9v;
+							    Cr[WS(csr, 5)] = FMA(KP970031253, T9U, T9N);
+							    Cr[WS(csr, 59)] = FNMS(KP970031253, T9U, T9N);
+							    Cr[WS(csr, 21)] = FNMS(KP857728610, T9K, T9J);
+							    Cr[WS(csr, 43)] = FMA(KP857728610, T9K, T9J);
+							    Ci[WS(csi, 53)] = FMS(KP857728610, T9I, T9F);
+							    Ci[WS(csi, 11)] = FMA(KP857728610, T9I, T9F);
+							    Ci[WS(csi, 43)] = FMA(KP857728610, T9E, T9D);
+							    Ci[WS(csi, 21)] = FMS(KP857728610, T9E, T9D);
+							    Cr[WS(csr, 11)] = FMA(KP857728610, T9w, T9h);
+							    Cr[WS(csr, 53)] = FNMS(KP857728610, T9w, T9h);
+							    Ta2 = Ta0 - Ta1;
+							    Ta4 = Ta1 + Ta0;
+						       }
+						  }
+						  Ci[WS(csi, 59)] = FMA(KP970031253, Ta2, T9Z);
+						  Ci[WS(csi, 5)] = FMS(KP970031253, Ta2, T9Z);
+						  Ci[WS(csi, 37)] = FMS(KP970031253, T9Y, T9X);
+						  Ci[WS(csi, 27)] = FMA(KP970031253, T9Y, T9X);
+					     }
+					}
+				   }
+				   {
+					E T69, T2z, T6k, T3g, T6a, T60, T6j, T5X, T4i, T6c, T6h, T6p, T64, T5L;
+					{
+					     E T5Y, T2U, T3f, T5Z;
+					     T5Y = FMA(KP098491403, T2M, T2T);
+					     T2U = FNMS(KP098491403, T2T, T2M);
+					     Cr[WS(csr, 27)] = FMA(KP970031253, Ta4, Ta3);
+					     Cr[WS(csr, 37)] = FNMS(KP970031253, Ta4, Ta3);
+					     T69 = FNMS(KP980785280, T2y, T2f);
+					     T2z = FMA(KP980785280, T2y, T2f);
+					     T3f = FMA(KP098491403, T3e, T37);
+					     T5Z = FNMS(KP098491403, T37, T3e);
+					     T6k = T3f - T2U;
+					     T3g = T2U + T3f;
+					     T6a = T5Y - T5Z;
+					     T60 = T5Y + T5Z;
+					     {
+						  E T6f, T5x, T6g, T5K;
+						  T6j = FNMS(KP980785280, T5W, T5T);
+						  T5X = FMA(KP980785280, T5W, T5T);
+						  T6f = FNMS(KP980785280, T5w, T4X);
+						  T5x = FMA(KP980785280, T5w, T4X);
+						  T6g = FNMS(KP980785280, T5J, T5G);
+						  T5K = FMA(KP980785280, T5J, T5G);
+						  T4i = FMA(KP980785280, T4h, T3I);
+						  T6c = FNMS(KP980785280, T4h, T3I);
+						  T6h = FMA(KP906347169, T6g, T6f);
+						  T6p = FNMS(KP906347169, T6f, T6g);
+						  T64 = FMA(KP049126849, T5x, T5K);
+						  T5L = FNMS(KP049126849, T5K, T5x);
+					     }
+					}
+					{
+					     E T67, T3h, T63, T61, T6d, T4v;
+					     T67 = FNMS(KP995184726, T3g, T2z);
+					     T3h = FMA(KP995184726, T3g, T2z);
+					     T63 = FMA(KP995184726, T60, T5X);
+					     T61 = FNMS(KP995184726, T60, T5X);
+					     T6d = FNMS(KP980785280, T4u, T4r);
+					     T4v = FMA(KP980785280, T4u, T4r);
+					     {
+						  E T6m, T6l, T6n, T6q;
+						  {
+						       E T6b, T6o, T65, T4w, T6i, T6e;
+						       T6r = FNMS(KP995184726, T6a, T69);
+						       T6b = FMA(KP995184726, T6a, T69);
+						       T6e = FMA(KP906347169, T6d, T6c);
+						       T6o = FNMS(KP906347169, T6c, T6d);
+						       T65 = FMA(KP049126849, T4i, T4v);
+						       T4w = FNMS(KP049126849, T4v, T4i);
+						       T6i = T6e + T6h;
+						       T6m = T6h - T6e;
+						       T6l = FNMS(KP995184726, T6k, T6j);
+						       T6n = FMA(KP995184726, T6k, T6j);
+						       {
+							    E T68, T66, T62, T5M;
+							    T68 = T65 + T64;
+							    T66 = T64 - T65;
+							    T62 = T5L - T4w;
+							    T5M = T4w + T5L;
+							    Cr[WS(csr, 15)] = FMA(KP740951125, T6i, T6b);
+							    Cr[WS(csr, 49)] = FNMS(KP740951125, T6i, T6b);
+							    Cr[WS(csr, 31)] = FMA(KP998795456, T68, T67);
+							    Cr[WS(csr, 33)] = FNMS(KP998795456, T68, T67);
+							    Ci[WS(csi, 63)] = FMA(KP998795456, T66, T63);
+							    Ci[WS(csi, 1)] = FMS(KP998795456, T66, T63);
+							    Ci[WS(csi, 33)] = FMS(KP998795456, T62, T61);
+							    Ci[WS(csi, 31)] = FMA(KP998795456, T62, T61);
+							    Cr[WS(csr, 1)] = FMA(KP998795456, T5M, T3h);
+							    Cr[WS(csr, 63)] = FNMS(KP998795456, T5M, T3h);
+							    T6q = T6o - T6p;
+							    T6s = T6o + T6p;
+						       }
+						  }
+						  Ci[WS(csi, 49)] = FMS(KP740951125, T6q, T6n);
+						  Ci[WS(csi, 15)] = FMA(KP740951125, T6q, T6n);
+						  Ci[WS(csi, 47)] = FMA(KP740951125, T6m, T6l);
+						  Ci[WS(csi, 17)] = FMS(KP740951125, T6m, T6l);
+					     }
+					}
+				   }
+				   {
+					E T8N, T7B, T8Y, T7Q, T8O, T8E, T8X, T8B, T82, T8Q, T8V, T92, T8J, T8t;
+					{
+					     E T8C, T7I, T7P, T8D;
+					     T8C = FNMS(KP303346683, T7E, T7H);
+					     T7I = FMA(KP303346683, T7H, T7E);
+					     Cr[WS(csr, 17)] = FNMS(KP740951125, T6s, T6r);
+					     Cr[WS(csr, 47)] = FMA(KP740951125, T6s, T6r);
+					     T8N = FNMS(KP831469612, T7A, T7t);
+					     T7B = FMA(KP831469612, T7A, T7t);
+					     T7P = FNMS(KP303346683, T7O, T7L);
+					     T8D = FMA(KP303346683, T7L, T7O);
+					     T8Y = T7P - T7I;
+					     T7Q = T7I + T7P;
+					     T8O = T8D - T8C;
+					     T8E = T8C + T8D;
+					     {
+						  E T8T, T8l, T8U, T8s;
+						  T8X = FNMS(KP831469612, T8A, T8x);
+						  T8B = FMA(KP831469612, T8A, T8x);
+						  T8T = FNMS(KP831469612, T8k, T8d);
+						  T8l = FMA(KP831469612, T8k, T8d);
+						  T8U = FNMS(KP831469612, T8r, T8o);
+						  T8s = FMA(KP831469612, T8r, T8o);
+						  T82 = FMA(KP831469612, T81, T7U);
+						  T8Q = FNMS(KP831469612, T81, T7U);
+						  T8V = FNMS(KP741650546, T8U, T8T);
+						  T92 = FMA(KP741650546, T8T, T8U);
+						  T8J = FNMS(KP148335987, T8l, T8s);
+						  T8t = FMA(KP148335987, T8s, T8l);
+					     }
+					}
+					{
+					     E T8L, T7R, T8H, T8F, T8R, T89;
+					     T8L = FNMS(KP956940335, T7Q, T7B);
+					     T7R = FMA(KP956940335, T7Q, T7B);
+					     T8H = FMA(KP956940335, T8E, T8B);
+					     T8F = FNMS(KP956940335, T8E, T8B);
+					     T8R = FNMS(KP831469612, T88, T85);
+					     T89 = FMA(KP831469612, T88, T85);
+					     {
+						  E T90, T8Z, T91, T94;
+						  {
+						       E T8P, T93, T8I, T8a, T8W, T8S;
+						       T95 = FNMS(KP956940335, T8O, T8N);
+						       T8P = FMA(KP956940335, T8O, T8N);
+						       T8S = FNMS(KP741650546, T8R, T8Q);
+						       T93 = FMA(KP741650546, T8Q, T8R);
+						       T8I = FNMS(KP148335987, T82, T89);
+						       T8a = FMA(KP148335987, T89, T82);
+						       T8W = T8S + T8V;
+						       T90 = T8V - T8S;
+						       T8Z = FMA(KP956940335, T8Y, T8X);
+						       T91 = FNMS(KP956940335, T8Y, T8X);
+						       {
+							    E T8M, T8K, T8G, T8u;
+							    T8M = T8I + T8J;
+							    T8K = T8I - T8J;
+							    T8G = T8t - T8a;
+							    T8u = T8a + T8t;
+							    Cr[WS(csr, 13)] = FMA(KP803207531, T8W, T8P);
+							    Cr[WS(csr, 51)] = FNMS(KP803207531, T8W, T8P);
+							    Cr[WS(csr, 29)] = FNMS(KP989176509, T8M, T8L);
+							    Cr[WS(csr, 35)] = FMA(KP989176509, T8M, T8L);
+							    Ci[WS(csi, 61)] = FMS(KP989176509, T8K, T8H);
+							    Ci[WS(csi, 3)] = FMA(KP989176509, T8K, T8H);
+							    Ci[WS(csi, 35)] = FMA(KP989176509, T8G, T8F);
+							    Ci[WS(csi, 29)] = FMS(KP989176509, T8G, T8F);
+							    Cr[WS(csr, 3)] = FMA(KP989176509, T8u, T7R);
+							    Cr[WS(csr, 61)] = FNMS(KP989176509, T8u, T7R);
+							    T94 = T92 - T93;
+							    T96 = T93 + T92;
+						       }
+						  }
+						  Ci[WS(csi, 51)] = FMA(KP803207531, T94, T91);
+						  Ci[WS(csi, 13)] = FMS(KP803207531, T94, T91);
+						  Ci[WS(csi, 45)] = FMS(KP803207531, T90, T8Z);
+						  Ci[WS(csi, 19)] = FMA(KP803207531, T90, T8Z);
+					     }
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Cr[WS(csr, 19)] = FMA(KP803207531, T96, T95);
+	       Cr[WS(csr, 45)] = FNMS(KP803207531, T96, T95);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 128, "r2cf_128", {440, 0, 516, 0}, &GENUS };
+
+void X(codelet_r2cf_128) (planner *p) {
+     X(kr2c_register) (p, r2cf_128, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 128 -name r2cf_128 -include r2cf.h */
+
+/*
+ * This function contains 956 FP additions, 330 FP multiplications,
+ * (or, 812 additions, 186 multiplications, 144 fused multiply/add),
+ * 186 stack variables, 31 constants, and 256 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_128(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP803207531, +0.803207531480644909806676512963141923879569427);
+     DK(KP595699304, +0.595699304492433343467036528829969889511926338);
+     DK(KP146730474, +0.146730474455361751658850129646717819706215317);
+     DK(KP989176509, +0.989176509964780973451673738016243063983689533);
+     DK(KP740951125, +0.740951125354959091175616897495162729728955309);
+     DK(KP671558954, +0.671558954847018400625376850427421803228750632);
+     DK(KP049067674, +0.049067674327418014254954976942682658314745363);
+     DK(KP998795456, +0.998795456205172392714771604759100694443203615);
+     DK(KP242980179, +0.242980179903263889948274162077471118320990783);
+     DK(KP970031253, +0.970031253194543992603984207286100251456865962);
+     DK(KP514102744, +0.514102744193221726593693838968815772608049120);
+     DK(KP857728610, +0.857728610000272069902269984284770137042490799);
+     DK(KP336889853, +0.336889853392220050689253212619147570477766780);
+     DK(KP941544065, +0.941544065183020778412509402599502357185589796);
+     DK(KP427555093, +0.427555093430282094320966856888798534304578629);
+     DK(KP903989293, +0.903989293123443331586200297230537048710132025);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(512, rs), MAKE_VOLATILE_STRIDE(512, csr), MAKE_VOLATILE_STRIDE(512, csi)) {
+	       E TcD, TdU, T27, T7r, T5S, T8y, Tf, Ta5, Tu, Tbq, TcG, TdV, T2e, T8z, T5V;
+	       E T7s, TK, Ta6, TcK, TdX, T2o, T5X, T7w, T8B, TZ, Ta7, TcN, TdY, T2x, T5Y;
+	       E T7z, T8C, T1g, Taa, TcU, TeA, TcX, Tez, T1v, Tab, T2M, T6z, T7E, T9e, T7H;
+	       E T9d, T2T, T6A, T4X, T6L, Tdz, TeL, TdK, TeP, T5G, T6P, T8d, T9p, TaV, Tc3;
+	       E Tbi, Tc4, T8o, T9t, T3I, T6H, Tde, TeH, Tdp, TeF, T4r, T6F, T7U, T9l, Tao;
+	       E TbW, TaL, TbX, T85, T9j, T1L, Tad, Td3, Tew, Td6, Tex, T20, Tae, T37, T6x;
+	       E T7L, T9a, T7O, T9b, T3e, T6w, TbZ, Tc0, T3Z, T4s, Tds, TeI, T4g, T4t, T80;
+	       E T87, Tdl, TeE, T7X, T86, TaD, TaM, Tc6, Tc7, T5e, T5H, TdN, TeM, T5v, T5I;
+	       E T8j, T8q, TdG, TeO, T8g, T8p, Tba, Tbj;
+	       {
+		    E T3, T23, Td, T25, T6, T5R, Ta, T24;
+		    {
+			 E T1, T2, Tb, Tc;
+			 T1 = R0[0];
+			 T2 = R0[WS(rs, 32)];
+			 T3 = T1 + T2;
+			 T23 = T1 - T2;
+			 Tb = R0[WS(rs, 56)];
+			 Tc = R0[WS(rs, 24)];
+			 Td = Tb + Tc;
+			 T25 = Tb - Tc;
+		    }
+		    {
+			 E T4, T5, T8, T9;
+			 T4 = R0[WS(rs, 16)];
+			 T5 = R0[WS(rs, 48)];
+			 T6 = T4 + T5;
+			 T5R = T4 - T5;
+			 T8 = R0[WS(rs, 8)];
+			 T9 = R0[WS(rs, 40)];
+			 Ta = T8 + T9;
+			 T24 = T8 - T9;
+		    }
+		    TcD = T3 - T6;
+		    TdU = Td - Ta;
+		    {
+			 E T26, T5Q, T7, Te;
+			 T26 = KP707106781 * (T24 + T25);
+			 T27 = T23 + T26;
+			 T7r = T23 - T26;
+			 T5Q = KP707106781 * (T25 - T24);
+			 T5S = T5Q - T5R;
+			 T8y = T5R + T5Q;
+			 T7 = T3 + T6;
+			 Te = Ta + Td;
+			 Tf = T7 + Te;
+			 Ta5 = T7 - Te;
+		    }
+	       }
+	       {
+		    E Ti, T28, Ts, T2c, Tl, T29, Tp, T2b;
+		    {
+			 E Tg, Th, Tq, Tr;
+			 Tg = R0[WS(rs, 4)];
+			 Th = R0[WS(rs, 36)];
+			 Ti = Tg + Th;
+			 T28 = Tg - Th;
+			 Tq = R0[WS(rs, 12)];
+			 Tr = R0[WS(rs, 44)];
+			 Ts = Tq + Tr;
+			 T2c = Tq - Tr;
+		    }
+		    {
+			 E Tj, Tk, Tn, To;
+			 Tj = R0[WS(rs, 20)];
+			 Tk = R0[WS(rs, 52)];
+			 Tl = Tj + Tk;
+			 T29 = Tj - Tk;
+			 Tn = R0[WS(rs, 60)];
+			 To = R0[WS(rs, 28)];
+			 Tp = Tn + To;
+			 T2b = Tn - To;
+		    }
+		    {
+			 E Tm, Tt, TcE, TcF;
+			 Tm = Ti + Tl;
+			 Tt = Tp + Ts;
+			 Tu = Tm + Tt;
+			 Tbq = Tt - Tm;
+			 TcE = Ti - Tl;
+			 TcF = Tp - Ts;
+			 TcG = KP707106781 * (TcE + TcF);
+			 TdV = KP707106781 * (TcF - TcE);
+		    }
+		    {
+			 E T2a, T2d, T5T, T5U;
+			 T2a = FNMS(KP382683432, T29, KP923879532 * T28);
+			 T2d = FMA(KP923879532, T2b, KP382683432 * T2c);
+			 T2e = T2a + T2d;
+			 T8z = T2d - T2a;
+			 T5T = FNMS(KP923879532, T2c, KP382683432 * T2b);
+			 T5U = FMA(KP382683432, T28, KP923879532 * T29);
+			 T5V = T5T - T5U;
+			 T7s = T5U + T5T;
+		    }
+	       }
+	       {
+		    E Ty, T2g, TB, T2m, TF, T2l, TI, T2j;
+		    {
+			 E Tw, Tx, Tz, TA;
+			 Tw = R0[WS(rs, 2)];
+			 Tx = R0[WS(rs, 34)];
+			 Ty = Tw + Tx;
+			 T2g = Tw - Tx;
+			 Tz = R0[WS(rs, 18)];
+			 TA = R0[WS(rs, 50)];
+			 TB = Tz + TA;
+			 T2m = Tz - TA;
+			 {
+			      E TD, TE, T2h, TG, TH, T2i;
+			      TD = R0[WS(rs, 10)];
+			      TE = R0[WS(rs, 42)];
+			      T2h = TD - TE;
+			      TG = R0[WS(rs, 58)];
+			      TH = R0[WS(rs, 26)];
+			      T2i = TG - TH;
+			      TF = TD + TE;
+			      T2l = KP707106781 * (T2i - T2h);
+			      TI = TG + TH;
+			      T2j = KP707106781 * (T2h + T2i);
+			 }
+		    }
+		    {
+			 E TC, TJ, TcI, TcJ;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 Ta6 = TC - TJ;
+			 TcI = Ty - TB;
+			 TcJ = TI - TF;
+			 TcK = FMA(KP923879532, TcI, KP382683432 * TcJ);
+			 TdX = FNMS(KP382683432, TcI, KP923879532 * TcJ);
+		    }
+		    {
+			 E T2k, T2n, T7u, T7v;
+			 T2k = T2g + T2j;
+			 T2n = T2l - T2m;
+			 T2o = FMA(KP980785280, T2k, KP195090322 * T2n);
+			 T5X = FNMS(KP195090322, T2k, KP980785280 * T2n);
+			 T7u = T2g - T2j;
+			 T7v = T2m + T2l;
+			 T7w = FMA(KP831469612, T7u, KP555570233 * T7v);
+			 T8B = FNMS(KP555570233, T7u, KP831469612 * T7v);
+		    }
+	       }
+	       {
+		    E TN, T2p, TQ, T2v, TU, T2u, TX, T2s;
+		    {
+			 E TL, TM, TO, TP;
+			 TL = R0[WS(rs, 62)];
+			 TM = R0[WS(rs, 30)];
+			 TN = TL + TM;
+			 T2p = TL - TM;
+			 TO = R0[WS(rs, 14)];
+			 TP = R0[WS(rs, 46)];
+			 TQ = TO + TP;
+			 T2v = TO - TP;
+			 {
+			      E TS, TT, T2q, TV, TW, T2r;
+			      TS = R0[WS(rs, 6)];
+			      TT = R0[WS(rs, 38)];
+			      T2q = TS - TT;
+			      TV = R0[WS(rs, 54)];
+			      TW = R0[WS(rs, 22)];
+			      T2r = TV - TW;
+			      TU = TS + TT;
+			      T2u = KP707106781 * (T2r - T2q);
+			      TX = TV + TW;
+			      T2s = KP707106781 * (T2q + T2r);
+			 }
+		    }
+		    {
+			 E TR, TY, TcL, TcM;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 Ta7 = TR - TY;
+			 TcL = TN - TQ;
+			 TcM = TX - TU;
+			 TcN = FNMS(KP382683432, TcM, KP923879532 * TcL);
+			 TdY = FMA(KP382683432, TcL, KP923879532 * TcM);
+		    }
+		    {
+			 E T2t, T2w, T7x, T7y;
+			 T2t = T2p + T2s;
+			 T2w = T2u - T2v;
+			 T2x = FNMS(KP195090322, T2w, KP980785280 * T2t);
+			 T5Y = FMA(KP195090322, T2t, KP980785280 * T2w);
+			 T7x = T2p - T2s;
+			 T7y = T2v + T2u;
+			 T7z = FNMS(KP555570233, T7y, KP831469612 * T7x);
+			 T8C = FMA(KP555570233, T7x, KP831469612 * T7y);
+		    }
+	       }
+	       {
+		    E T14, T2N, T17, T2D, T1b, T2O, T1e, T2C, T1j, T1m, T2K, TcR, T2Q, T1q, T1t;
+		    E T2H, TcS, T2R;
+		    {
+			 E T12, T13, T15, T16;
+			 T12 = R0[WS(rs, 1)];
+			 T13 = R0[WS(rs, 33)];
+			 T14 = T12 + T13;
+			 T2N = T12 - T13;
+			 T15 = R0[WS(rs, 17)];
+			 T16 = R0[WS(rs, 49)];
+			 T17 = T15 + T16;
+			 T2D = T15 - T16;
+		    }
+		    {
+			 E T19, T1a, T2B, T1c, T1d, T2A;
+			 T19 = R0[WS(rs, 9)];
+			 T1a = R0[WS(rs, 41)];
+			 T2B = T19 - T1a;
+			 T1c = R0[WS(rs, 57)];
+			 T1d = R0[WS(rs, 25)];
+			 T2A = T1c - T1d;
+			 T1b = T19 + T1a;
+			 T2O = KP707106781 * (T2B + T2A);
+			 T1e = T1c + T1d;
+			 T2C = KP707106781 * (T2A - T2B);
+		    }
+		    {
+			 E T2I, T2J, T2F, T2G;
+			 {
+			      E T1h, T1i, T1k, T1l;
+			      T1h = R0[WS(rs, 5)];
+			      T1i = R0[WS(rs, 37)];
+			      T1j = T1h + T1i;
+			      T2I = T1h - T1i;
+			      T1k = R0[WS(rs, 21)];
+			      T1l = R0[WS(rs, 53)];
+			      T1m = T1k + T1l;
+			      T2J = T1k - T1l;
+			 }
+			 T2K = FMA(KP382683432, T2I, KP923879532 * T2J);
+			 TcR = T1j - T1m;
+			 T2Q = FNMS(KP382683432, T2J, KP923879532 * T2I);
+			 {
+			      E T1o, T1p, T1r, T1s;
+			      T1o = R0[WS(rs, 61)];
+			      T1p = R0[WS(rs, 29)];
+			      T1q = T1o + T1p;
+			      T2F = T1o - T1p;
+			      T1r = R0[WS(rs, 13)];
+			      T1s = R0[WS(rs, 45)];
+			      T1t = T1r + T1s;
+			      T2G = T1r - T1s;
+			 }
+			 T2H = FNMS(KP923879532, T2G, KP382683432 * T2F);
+			 TcS = T1q - T1t;
+			 T2R = FMA(KP923879532, T2F, KP382683432 * T2G);
+		    }
+		    {
+			 E T18, T1f, TcQ, TcT;
+			 T18 = T14 + T17;
+			 T1f = T1b + T1e;
+			 T1g = T18 + T1f;
+			 Taa = T18 - T1f;
+			 TcQ = T14 - T17;
+			 TcT = KP707106781 * (TcR + TcS);
+			 TcU = TcQ + TcT;
+			 TeA = TcQ - TcT;
+		    }
+		    {
+			 E TcV, TcW, T1n, T1u;
+			 TcV = T1e - T1b;
+			 TcW = KP707106781 * (TcS - TcR);
+			 TcX = TcV + TcW;
+			 Tez = TcW - TcV;
+			 T1n = T1j + T1m;
+			 T1u = T1q + T1t;
+			 T1v = T1n + T1u;
+			 Tab = T1u - T1n;
+		    }
+		    {
+			 E T2E, T2L, T7C, T7D;
+			 T2E = T2C - T2D;
+			 T2L = T2H - T2K;
+			 T2M = T2E + T2L;
+			 T6z = T2L - T2E;
+			 T7C = T2N - T2O;
+			 T7D = T2K + T2H;
+			 T7E = T7C + T7D;
+			 T9e = T7C - T7D;
+		    }
+		    {
+			 E T7F, T7G, T2P, T2S;
+			 T7F = T2D + T2C;
+			 T7G = T2R - T2Q;
+			 T7H = T7F + T7G;
+			 T9d = T7G - T7F;
+			 T2P = T2N + T2O;
+			 T2S = T2Q + T2R;
+			 T2T = T2P + T2S;
+			 T6A = T2P - T2S;
+		    }
+	       }
+	       {
+		    E T4z, TaP, T5B, TaQ, T4G, TaT, T5y, TaS, Tbf, Tbg, T4O, Tdw, T5E, Tbc, Tbd;
+		    E T4V, Tdx, T5D;
+		    {
+			 E T4x, T4y, T5z, T5A;
+			 T4x = R1[WS(rs, 63)];
+			 T4y = R1[WS(rs, 31)];
+			 T4z = T4x - T4y;
+			 TaP = T4x + T4y;
+			 T5z = R1[WS(rs, 15)];
+			 T5A = R1[WS(rs, 47)];
+			 T5B = T5z - T5A;
+			 TaQ = T5z + T5A;
+		    }
+		    {
+			 E T4A, T4B, T4C, T4D, T4E, T4F;
+			 T4A = R1[WS(rs, 7)];
+			 T4B = R1[WS(rs, 39)];
+			 T4C = T4A - T4B;
+			 T4D = R1[WS(rs, 55)];
+			 T4E = R1[WS(rs, 23)];
+			 T4F = T4D - T4E;
+			 T4G = KP707106781 * (T4C + T4F);
+			 TaT = T4D + T4E;
+			 T5y = KP707106781 * (T4F - T4C);
+			 TaS = T4A + T4B;
+		    }
+		    {
+			 E T4K, T4N, T4R, T4U;
+			 {
+			      E T4I, T4J, T4L, T4M;
+			      T4I = R1[WS(rs, 3)];
+			      T4J = R1[WS(rs, 35)];
+			      T4K = T4I - T4J;
+			      Tbf = T4I + T4J;
+			      T4L = R1[WS(rs, 19)];
+			      T4M = R1[WS(rs, 51)];
+			      T4N = T4L - T4M;
+			      Tbg = T4L + T4M;
+			 }
+			 T4O = FNMS(KP382683432, T4N, KP923879532 * T4K);
+			 Tdw = Tbf - Tbg;
+			 T5E = FMA(KP382683432, T4K, KP923879532 * T4N);
+			 {
+			      E T4P, T4Q, T4S, T4T;
+			      T4P = R1[WS(rs, 59)];
+			      T4Q = R1[WS(rs, 27)];
+			      T4R = T4P - T4Q;
+			      Tbc = T4P + T4Q;
+			      T4S = R1[WS(rs, 11)];
+			      T4T = R1[WS(rs, 43)];
+			      T4U = T4S - T4T;
+			      Tbd = T4S + T4T;
+			 }
+			 T4V = FMA(KP923879532, T4R, KP382683432 * T4U);
+			 Tdx = Tbc - Tbd;
+			 T5D = FNMS(KP923879532, T4U, KP382683432 * T4R);
+		    }
+		    {
+			 E T4H, T4W, Tdv, Tdy;
+			 T4H = T4z + T4G;
+			 T4W = T4O + T4V;
+			 T4X = T4H + T4W;
+			 T6L = T4H - T4W;
+			 Tdv = TaP - TaQ;
+			 Tdy = KP707106781 * (Tdw + Tdx);
+			 Tdz = Tdv + Tdy;
+			 TeL = Tdv - Tdy;
+		    }
+		    {
+			 E TdI, TdJ, T5C, T5F;
+			 TdI = TaT - TaS;
+			 TdJ = KP707106781 * (Tdx - Tdw);
+			 TdK = TdI + TdJ;
+			 TeP = TdJ - TdI;
+			 T5C = T5y - T5B;
+			 T5F = T5D - T5E;
+			 T5G = T5C + T5F;
+			 T6P = T5F - T5C;
+		    }
+		    {
+			 E T8b, T8c, TaR, TaU;
+			 T8b = T4z - T4G;
+			 T8c = T5E + T5D;
+			 T8d = T8b + T8c;
+			 T9p = T8b - T8c;
+			 TaR = TaP + TaQ;
+			 TaU = TaS + TaT;
+			 TaV = TaR - TaU;
+			 Tc3 = TaR + TaU;
+		    }
+		    {
+			 E Tbe, Tbh, T8m, T8n;
+			 Tbe = Tbc + Tbd;
+			 Tbh = Tbf + Tbg;
+			 Tbi = Tbe - Tbh;
+			 Tc4 = Tbh + Tbe;
+			 T8m = T5B + T5y;
+			 T8n = T4V - T4O;
+			 T8o = T8m + T8n;
+			 T9t = T8n - T8m;
+		    }
+	       }
+	       {
+		    E T3k, Tai, T4m, Taj, T3r, Tam, T4j, Tal, TaI, TaJ, T3z, Tdb, T4p, TaF, TaG;
+		    E T3G, Tdc, T4o;
+		    {
+			 E T3i, T3j, T4k, T4l;
+			 T3i = R1[0];
+			 T3j = R1[WS(rs, 32)];
+			 T3k = T3i - T3j;
+			 Tai = T3i + T3j;
+			 T4k = R1[WS(rs, 16)];
+			 T4l = R1[WS(rs, 48)];
+			 T4m = T4k - T4l;
+			 Taj = T4k + T4l;
+		    }
+		    {
+			 E T3l, T3m, T3n, T3o, T3p, T3q;
+			 T3l = R1[WS(rs, 8)];
+			 T3m = R1[WS(rs, 40)];
+			 T3n = T3l - T3m;
+			 T3o = R1[WS(rs, 56)];
+			 T3p = R1[WS(rs, 24)];
+			 T3q = T3o - T3p;
+			 T3r = KP707106781 * (T3n + T3q);
+			 Tam = T3o + T3p;
+			 T4j = KP707106781 * (T3q - T3n);
+			 Tal = T3l + T3m;
+		    }
+		    {
+			 E T3v, T3y, T3C, T3F;
+			 {
+			      E T3t, T3u, T3w, T3x;
+			      T3t = R1[WS(rs, 4)];
+			      T3u = R1[WS(rs, 36)];
+			      T3v = T3t - T3u;
+			      TaI = T3t + T3u;
+			      T3w = R1[WS(rs, 20)];
+			      T3x = R1[WS(rs, 52)];
+			      T3y = T3w - T3x;
+			      TaJ = T3w + T3x;
+			 }
+			 T3z = FNMS(KP382683432, T3y, KP923879532 * T3v);
+			 Tdb = TaI - TaJ;
+			 T4p = FMA(KP382683432, T3v, KP923879532 * T3y);
+			 {
+			      E T3A, T3B, T3D, T3E;
+			      T3A = R1[WS(rs, 60)];
+			      T3B = R1[WS(rs, 28)];
+			      T3C = T3A - T3B;
+			      TaF = T3A + T3B;
+			      T3D = R1[WS(rs, 12)];
+			      T3E = R1[WS(rs, 44)];
+			      T3F = T3D - T3E;
+			      TaG = T3D + T3E;
+			 }
+			 T3G = FMA(KP923879532, T3C, KP382683432 * T3F);
+			 Tdc = TaF - TaG;
+			 T4o = FNMS(KP923879532, T3F, KP382683432 * T3C);
+		    }
+		    {
+			 E T3s, T3H, Tda, Tdd;
+			 T3s = T3k + T3r;
+			 T3H = T3z + T3G;
+			 T3I = T3s + T3H;
+			 T6H = T3s - T3H;
+			 Tda = Tai - Taj;
+			 Tdd = KP707106781 * (Tdb + Tdc);
+			 Tde = Tda + Tdd;
+			 TeH = Tda - Tdd;
+		    }
+		    {
+			 E Tdn, Tdo, T4n, T4q;
+			 Tdn = Tam - Tal;
+			 Tdo = KP707106781 * (Tdc - Tdb);
+			 Tdp = Tdn + Tdo;
+			 TeF = Tdo - Tdn;
+			 T4n = T4j - T4m;
+			 T4q = T4o - T4p;
+			 T4r = T4n + T4q;
+			 T6F = T4q - T4n;
+		    }
+		    {
+			 E T7S, T7T, Tak, Tan;
+			 T7S = T3k - T3r;
+			 T7T = T4p + T4o;
+			 T7U = T7S + T7T;
+			 T9l = T7S - T7T;
+			 Tak = Tai + Taj;
+			 Tan = Tal + Tam;
+			 Tao = Tak - Tan;
+			 TbW = Tak + Tan;
+		    }
+		    {
+			 E TaH, TaK, T83, T84;
+			 TaH = TaF + TaG;
+			 TaK = TaI + TaJ;
+			 TaL = TaH - TaK;
+			 TbX = TaK + TaH;
+			 T83 = T4m + T4j;
+			 T84 = T3G - T3z;
+			 T85 = T83 + T84;
+			 T9j = T84 - T83;
+		    }
+	       }
+	       {
+		    E T1z, T2V, T1C, T39, T1G, T38, T1J, T2Y, T1O, T1R, T32, Td0, T3c, T1V, T1Y;
+		    E T35, Td1, T3b;
+		    {
+			 E T1x, T1y, T1A, T1B;
+			 T1x = R0[WS(rs, 63)];
+			 T1y = R0[WS(rs, 31)];
+			 T1z = T1x + T1y;
+			 T2V = T1x - T1y;
+			 T1A = R0[WS(rs, 15)];
+			 T1B = R0[WS(rs, 47)];
+			 T1C = T1A + T1B;
+			 T39 = T1A - T1B;
+		    }
+		    {
+			 E T1E, T1F, T2W, T1H, T1I, T2X;
+			 T1E = R0[WS(rs, 7)];
+			 T1F = R0[WS(rs, 39)];
+			 T2W = T1E - T1F;
+			 T1H = R0[WS(rs, 55)];
+			 T1I = R0[WS(rs, 23)];
+			 T2X = T1H - T1I;
+			 T1G = T1E + T1F;
+			 T38 = KP707106781 * (T2X - T2W);
+			 T1J = T1H + T1I;
+			 T2Y = KP707106781 * (T2W + T2X);
+		    }
+		    {
+			 E T30, T31, T33, T34;
+			 {
+			      E T1M, T1N, T1P, T1Q;
+			      T1M = R0[WS(rs, 3)];
+			      T1N = R0[WS(rs, 35)];
+			      T1O = T1M + T1N;
+			      T30 = T1M - T1N;
+			      T1P = R0[WS(rs, 19)];
+			      T1Q = R0[WS(rs, 51)];
+			      T1R = T1P + T1Q;
+			      T31 = T1P - T1Q;
+			 }
+			 T32 = FNMS(KP382683432, T31, KP923879532 * T30);
+			 Td0 = T1O - T1R;
+			 T3c = FMA(KP382683432, T30, KP923879532 * T31);
+			 {
+			      E T1T, T1U, T1W, T1X;
+			      T1T = R0[WS(rs, 59)];
+			      T1U = R0[WS(rs, 27)];
+			      T1V = T1T + T1U;
+			      T33 = T1T - T1U;
+			      T1W = R0[WS(rs, 11)];
+			      T1X = R0[WS(rs, 43)];
+			      T1Y = T1W + T1X;
+			      T34 = T1W - T1X;
+			 }
+			 T35 = FMA(KP923879532, T33, KP382683432 * T34);
+			 Td1 = T1V - T1Y;
+			 T3b = FNMS(KP923879532, T34, KP382683432 * T33);
+		    }
+		    {
+			 E T1D, T1K, TcZ, Td2;
+			 T1D = T1z + T1C;
+			 T1K = T1G + T1J;
+			 T1L = T1D + T1K;
+			 Tad = T1D - T1K;
+			 TcZ = T1z - T1C;
+			 Td2 = KP707106781 * (Td0 + Td1);
+			 Td3 = TcZ + Td2;
+			 Tew = TcZ - Td2;
+		    }
+		    {
+			 E Td4, Td5, T1S, T1Z;
+			 Td4 = T1J - T1G;
+			 Td5 = KP707106781 * (Td1 - Td0);
+			 Td6 = Td4 + Td5;
+			 Tex = Td5 - Td4;
+			 T1S = T1O + T1R;
+			 T1Z = T1V + T1Y;
+			 T20 = T1S + T1Z;
+			 Tae = T1Z - T1S;
+		    }
+		    {
+			 E T2Z, T36, T7J, T7K;
+			 T2Z = T2V + T2Y;
+			 T36 = T32 + T35;
+			 T37 = T2Z + T36;
+			 T6x = T2Z - T36;
+			 T7J = T2V - T2Y;
+			 T7K = T3c + T3b;
+			 T7L = T7J + T7K;
+			 T9a = T7J - T7K;
+		    }
+		    {
+			 E T7M, T7N, T3a, T3d;
+			 T7M = T39 + T38;
+			 T7N = T35 - T32;
+			 T7O = T7M + T7N;
+			 T9b = T7N - T7M;
+			 T3a = T38 - T39;
+			 T3d = T3b - T3c;
+			 T3e = T3a + T3d;
+			 T6w = T3d - T3a;
+		    }
+	       }
+	       {
+		    E T3L, Tdf, T3X, Tar, T42, Tdi, T4e, Tay, T3S, Tdg, T3U, Tau, T49, Tdj, T4b;
+		    E TaB, Tdh, Tdk;
+		    {
+			 E T3J, T3K, Tap, T3V, T3W, Taq;
+			 T3J = R1[WS(rs, 2)];
+			 T3K = R1[WS(rs, 34)];
+			 Tap = T3J + T3K;
+			 T3V = R1[WS(rs, 18)];
+			 T3W = R1[WS(rs, 50)];
+			 Taq = T3V + T3W;
+			 T3L = T3J - T3K;
+			 Tdf = Tap - Taq;
+			 T3X = T3V - T3W;
+			 Tar = Tap + Taq;
+		    }
+		    {
+			 E T40, T41, Taw, T4c, T4d, Tax;
+			 T40 = R1[WS(rs, 62)];
+			 T41 = R1[WS(rs, 30)];
+			 Taw = T40 + T41;
+			 T4c = R1[WS(rs, 14)];
+			 T4d = R1[WS(rs, 46)];
+			 Tax = T4c + T4d;
+			 T42 = T40 - T41;
+			 Tdi = Taw - Tax;
+			 T4e = T4c - T4d;
+			 Tay = Taw + Tax;
+		    }
+		    {
+			 E T3O, Tas, T3R, Tat;
+			 {
+			      E T3M, T3N, T3P, T3Q;
+			      T3M = R1[WS(rs, 10)];
+			      T3N = R1[WS(rs, 42)];
+			      T3O = T3M - T3N;
+			      Tas = T3M + T3N;
+			      T3P = R1[WS(rs, 58)];
+			      T3Q = R1[WS(rs, 26)];
+			      T3R = T3P - T3Q;
+			      Tat = T3P + T3Q;
+			 }
+			 T3S = KP707106781 * (T3O + T3R);
+			 Tdg = Tat - Tas;
+			 T3U = KP707106781 * (T3R - T3O);
+			 Tau = Tas + Tat;
+		    }
+		    {
+			 E T45, Taz, T48, TaA;
+			 {
+			      E T43, T44, T46, T47;
+			      T43 = R1[WS(rs, 6)];
+			      T44 = R1[WS(rs, 38)];
+			      T45 = T43 - T44;
+			      Taz = T43 + T44;
+			      T46 = R1[WS(rs, 54)];
+			      T47 = R1[WS(rs, 22)];
+			      T48 = T46 - T47;
+			      TaA = T46 + T47;
+			 }
+			 T49 = KP707106781 * (T45 + T48);
+			 Tdj = TaA - Taz;
+			 T4b = KP707106781 * (T48 - T45);
+			 TaB = Taz + TaA;
+		    }
+		    TbZ = Tar + Tau;
+		    Tc0 = Tay + TaB;
+		    {
+			 E T3T, T3Y, Tdq, Tdr;
+			 T3T = T3L + T3S;
+			 T3Y = T3U - T3X;
+			 T3Z = FMA(KP980785280, T3T, KP195090322 * T3Y);
+			 T4s = FNMS(KP195090322, T3T, KP980785280 * T3Y);
+			 Tdq = FNMS(KP382683432, Tdf, KP923879532 * Tdg);
+			 Tdr = FMA(KP382683432, Tdi, KP923879532 * Tdj);
+			 Tds = Tdq + Tdr;
+			 TeI = Tdr - Tdq;
+		    }
+		    {
+			 E T4a, T4f, T7Y, T7Z;
+			 T4a = T42 + T49;
+			 T4f = T4b - T4e;
+			 T4g = FNMS(KP195090322, T4f, KP980785280 * T4a);
+			 T4t = FMA(KP195090322, T4a, KP980785280 * T4f);
+			 T7Y = T42 - T49;
+			 T7Z = T4e + T4b;
+			 T80 = FNMS(KP555570233, T7Z, KP831469612 * T7Y);
+			 T87 = FMA(KP555570233, T7Y, KP831469612 * T7Z);
+		    }
+		    Tdh = FMA(KP923879532, Tdf, KP382683432 * Tdg);
+		    Tdk = FNMS(KP382683432, Tdj, KP923879532 * Tdi);
+		    Tdl = Tdh + Tdk;
+		    TeE = Tdk - Tdh;
+		    {
+			 E T7V, T7W, Tav, TaC;
+			 T7V = T3L - T3S;
+			 T7W = T3X + T3U;
+			 T7X = FMA(KP831469612, T7V, KP555570233 * T7W);
+			 T86 = FNMS(KP555570233, T7V, KP831469612 * T7W);
+			 Tav = Tar - Tau;
+			 TaC = Tay - TaB;
+			 TaD = KP707106781 * (Tav + TaC);
+			 TaM = KP707106781 * (TaC - Tav);
+		    }
+	       }
+	       {
+		    E T50, TdA, T5c, TaY, T5h, TdD, T5t, Tb5, T57, TdB, T59, Tb1, T5o, TdE, T5q;
+		    E Tb8, TdC, TdF;
+		    {
+			 E T4Y, T4Z, TaW, T5a, T5b, TaX;
+			 T4Y = R1[WS(rs, 1)];
+			 T4Z = R1[WS(rs, 33)];
+			 TaW = T4Y + T4Z;
+			 T5a = R1[WS(rs, 17)];
+			 T5b = R1[WS(rs, 49)];
+			 TaX = T5a + T5b;
+			 T50 = T4Y - T4Z;
+			 TdA = TaW - TaX;
+			 T5c = T5a - T5b;
+			 TaY = TaW + TaX;
+		    }
+		    {
+			 E T5f, T5g, Tb3, T5r, T5s, Tb4;
+			 T5f = R1[WS(rs, 61)];
+			 T5g = R1[WS(rs, 29)];
+			 Tb3 = T5f + T5g;
+			 T5r = R1[WS(rs, 13)];
+			 T5s = R1[WS(rs, 45)];
+			 Tb4 = T5r + T5s;
+			 T5h = T5f - T5g;
+			 TdD = Tb3 - Tb4;
+			 T5t = T5r - T5s;
+			 Tb5 = Tb3 + Tb4;
+		    }
+		    {
+			 E T53, TaZ, T56, Tb0;
+			 {
+			      E T51, T52, T54, T55;
+			      T51 = R1[WS(rs, 9)];
+			      T52 = R1[WS(rs, 41)];
+			      T53 = T51 - T52;
+			      TaZ = T51 + T52;
+			      T54 = R1[WS(rs, 57)];
+			      T55 = R1[WS(rs, 25)];
+			      T56 = T54 - T55;
+			      Tb0 = T54 + T55;
+			 }
+			 T57 = KP707106781 * (T53 + T56);
+			 TdB = Tb0 - TaZ;
+			 T59 = KP707106781 * (T56 - T53);
+			 Tb1 = TaZ + Tb0;
+		    }
+		    {
+			 E T5k, Tb6, T5n, Tb7;
+			 {
+			      E T5i, T5j, T5l, T5m;
+			      T5i = R1[WS(rs, 5)];
+			      T5j = R1[WS(rs, 37)];
+			      T5k = T5i - T5j;
+			      Tb6 = T5i + T5j;
+			      T5l = R1[WS(rs, 53)];
+			      T5m = R1[WS(rs, 21)];
+			      T5n = T5l - T5m;
+			      Tb7 = T5l + T5m;
+			 }
+			 T5o = KP707106781 * (T5k + T5n);
+			 TdE = Tb7 - Tb6;
+			 T5q = KP707106781 * (T5n - T5k);
+			 Tb8 = Tb6 + Tb7;
+		    }
+		    Tc6 = TaY + Tb1;
+		    Tc7 = Tb5 + Tb8;
+		    {
+			 E T58, T5d, TdL, TdM;
+			 T58 = T50 + T57;
+			 T5d = T59 - T5c;
+			 T5e = FMA(KP980785280, T58, KP195090322 * T5d);
+			 T5H = FNMS(KP195090322, T58, KP980785280 * T5d);
+			 TdL = FNMS(KP382683432, TdA, KP923879532 * TdB);
+			 TdM = FMA(KP382683432, TdD, KP923879532 * TdE);
+			 TdN = TdL + TdM;
+			 TeM = TdM - TdL;
+		    }
+		    {
+			 E T5p, T5u, T8h, T8i;
+			 T5p = T5h + T5o;
+			 T5u = T5q - T5t;
+			 T5v = FNMS(KP195090322, T5u, KP980785280 * T5p);
+			 T5I = FMA(KP195090322, T5p, KP980785280 * T5u);
+			 T8h = T5h - T5o;
+			 T8i = T5t + T5q;
+			 T8j = FNMS(KP555570233, T8i, KP831469612 * T8h);
+			 T8q = FMA(KP555570233, T8h, KP831469612 * T8i);
+		    }
+		    TdC = FMA(KP923879532, TdA, KP382683432 * TdB);
+		    TdF = FNMS(KP382683432, TdE, KP923879532 * TdD);
+		    TdG = TdC + TdF;
+		    TeO = TdF - TdC;
+		    {
+			 E T8e, T8f, Tb2, Tb9;
+			 T8e = T50 - T57;
+			 T8f = T5c + T59;
+			 T8g = FMA(KP831469612, T8e, KP555570233 * T8f);
+			 T8p = FNMS(KP555570233, T8e, KP831469612 * T8f);
+			 Tb2 = TaY - Tb1;
+			 Tb9 = Tb5 - Tb8;
+			 Tba = KP707106781 * (Tb2 + Tb9);
+			 Tbj = KP707106781 * (Tb9 - Tb2);
+		    }
+	       }
+	       {
+		    E T11, TbV, Tc9, Tcf, T22, Tcb, Tc2, Tce;
+		    {
+			 E Tv, T10, Tc5, Tc8;
+			 Tv = Tf + Tu;
+			 T10 = TK + TZ;
+			 T11 = Tv + T10;
+			 TbV = Tv - T10;
+			 Tc5 = Tc3 + Tc4;
+			 Tc8 = Tc6 + Tc7;
+			 Tc9 = Tc5 - Tc8;
+			 Tcf = Tc5 + Tc8;
+		    }
+		    {
+			 E T1w, T21, TbY, Tc1;
+			 T1w = T1g + T1v;
+			 T21 = T1L + T20;
+			 T22 = T1w + T21;
+			 Tcb = T21 - T1w;
+			 TbY = TbW + TbX;
+			 Tc1 = TbZ + Tc0;
+			 Tc2 = TbY - Tc1;
+			 Tce = TbY + Tc1;
+		    }
+		    Cr[WS(csr, 32)] = T11 - T22;
+		    Ci[WS(csi, 32)] = Tcf - Tce;
+		    {
+			 E Tca, Tcc, Tcd, Tcg;
+			 Tca = KP707106781 * (Tc2 + Tc9);
+			 Cr[WS(csr, 48)] = TbV - Tca;
+			 Cr[WS(csr, 16)] = TbV + Tca;
+			 Tcc = KP707106781 * (Tc9 - Tc2);
+			 Ci[WS(csi, 16)] = Tcb + Tcc;
+			 Ci[WS(csi, 48)] = Tcc - Tcb;
+			 Tcd = T11 + T22;
+			 Tcg = Tce + Tcf;
+			 Cr[WS(csr, 64)] = Tcd - Tcg;
+			 Cr[0] = Tcd + Tcg;
+		    }
+	       }
+	       {
+		    E Tch, Tcu, Tck, Tct, Tco, Tcy, Tcr, Tcz, Tci, Tcj;
+		    Tch = Tf - Tu;
+		    Tcu = TZ - TK;
+		    Tci = T1g - T1v;
+		    Tcj = T1L - T20;
+		    Tck = KP707106781 * (Tci + Tcj);
+		    Tct = KP707106781 * (Tcj - Tci);
+		    {
+			 E Tcm, Tcn, Tcp, Tcq;
+			 Tcm = TbW - TbX;
+			 Tcn = Tc0 - TbZ;
+			 Tco = FMA(KP923879532, Tcm, KP382683432 * Tcn);
+			 Tcy = FNMS(KP382683432, Tcm, KP923879532 * Tcn);
+			 Tcp = Tc3 - Tc4;
+			 Tcq = Tc7 - Tc6;
+			 Tcr = FNMS(KP382683432, Tcq, KP923879532 * Tcp);
+			 Tcz = FMA(KP382683432, Tcp, KP923879532 * Tcq);
+		    }
+		    {
+			 E Tcl, Tcs, Tcx, TcA;
+			 Tcl = Tch + Tck;
+			 Tcs = Tco + Tcr;
+			 Cr[WS(csr, 56)] = Tcl - Tcs;
+			 Cr[WS(csr, 8)] = Tcl + Tcs;
+			 Tcx = Tcu + Tct;
+			 TcA = Tcy + Tcz;
+			 Ci[WS(csi, 8)] = Tcx + TcA;
+			 Ci[WS(csi, 56)] = TcA - Tcx;
+		    }
+		    {
+			 E Tcv, Tcw, TcB, TcC;
+			 Tcv = Tct - Tcu;
+			 Tcw = Tcr - Tco;
+			 Ci[WS(csi, 24)] = Tcv + Tcw;
+			 Ci[WS(csi, 40)] = Tcw - Tcv;
+			 TcB = Tch - Tck;
+			 TcC = Tcz - Tcy;
+			 Cr[WS(csr, 40)] = TcB - TcC;
+			 Cr[WS(csr, 24)] = TcB + TcC;
+		    }
+	       }
+	       {
+		    E Ta9, TbB, Tbs, TbM, Tag, TbL, TbJ, TbR, TaO, Tbw, Tbp, TbC, TbG, TbQ, Tbl;
+		    E Tbx, Ta8, Tbr;
+		    Ta8 = KP707106781 * (Ta6 + Ta7);
+		    Ta9 = Ta5 + Ta8;
+		    TbB = Ta5 - Ta8;
+		    Tbr = KP707106781 * (Ta7 - Ta6);
+		    Tbs = Tbq + Tbr;
+		    TbM = Tbr - Tbq;
+		    {
+			 E Tac, Taf, TbH, TbI;
+			 Tac = FMA(KP923879532, Taa, KP382683432 * Tab);
+			 Taf = FNMS(KP382683432, Tae, KP923879532 * Tad);
+			 Tag = Tac + Taf;
+			 TbL = Taf - Tac;
+			 TbH = TaV - Tba;
+			 TbI = Tbj - Tbi;
+			 TbJ = FNMS(KP555570233, TbI, KP831469612 * TbH);
+			 TbR = FMA(KP555570233, TbH, KP831469612 * TbI);
+		    }
+		    {
+			 E TaE, TaN, Tbn, Tbo;
+			 TaE = Tao + TaD;
+			 TaN = TaL + TaM;
+			 TaO = FMA(KP980785280, TaE, KP195090322 * TaN);
+			 Tbw = FNMS(KP195090322, TaE, KP980785280 * TaN);
+			 Tbn = FNMS(KP382683432, Taa, KP923879532 * Tab);
+			 Tbo = FMA(KP382683432, Tad, KP923879532 * Tae);
+			 Tbp = Tbn + Tbo;
+			 TbC = Tbo - Tbn;
+		    }
+		    {
+			 E TbE, TbF, Tbb, Tbk;
+			 TbE = Tao - TaD;
+			 TbF = TaM - TaL;
+			 TbG = FMA(KP831469612, TbE, KP555570233 * TbF);
+			 TbQ = FNMS(KP555570233, TbE, KP831469612 * TbF);
+			 Tbb = TaV + Tba;
+			 Tbk = Tbi + Tbj;
+			 Tbl = FNMS(KP195090322, Tbk, KP980785280 * Tbb);
+			 Tbx = FMA(KP195090322, Tbb, KP980785280 * Tbk);
+		    }
+		    {
+			 E Tah, Tbm, Tbv, Tby;
+			 Tah = Ta9 + Tag;
+			 Tbm = TaO + Tbl;
+			 Cr[WS(csr, 60)] = Tah - Tbm;
+			 Cr[WS(csr, 4)] = Tah + Tbm;
+			 Tbv = Tbs + Tbp;
+			 Tby = Tbw + Tbx;
+			 Ci[WS(csi, 4)] = Tbv + Tby;
+			 Ci[WS(csi, 60)] = Tby - Tbv;
+		    }
+		    {
+			 E Tbt, Tbu, Tbz, TbA;
+			 Tbt = Tbp - Tbs;
+			 Tbu = Tbl - TaO;
+			 Ci[WS(csi, 28)] = Tbt + Tbu;
+			 Ci[WS(csi, 36)] = Tbu - Tbt;
+			 Tbz = Ta9 - Tag;
+			 TbA = Tbx - Tbw;
+			 Cr[WS(csr, 36)] = Tbz - TbA;
+			 Cr[WS(csr, 28)] = Tbz + TbA;
+		    }
+		    {
+			 E TbD, TbK, TbP, TbS;
+			 TbD = TbB + TbC;
+			 TbK = TbG + TbJ;
+			 Cr[WS(csr, 52)] = TbD - TbK;
+			 Cr[WS(csr, 12)] = TbD + TbK;
+			 TbP = TbM + TbL;
+			 TbS = TbQ + TbR;
+			 Ci[WS(csi, 12)] = TbP + TbS;
+			 Ci[WS(csi, 52)] = TbS - TbP;
+		    }
+		    {
+			 E TbN, TbO, TbT, TbU;
+			 TbN = TbL - TbM;
+			 TbO = TbJ - TbG;
+			 Ci[WS(csi, 20)] = TbN + TbO;
+			 Ci[WS(csi, 44)] = TbO - TbN;
+			 TbT = TbB - TbC;
+			 TbU = TbR - TbQ;
+			 Cr[WS(csr, 44)] = TbT - TbU;
+			 Cr[WS(csr, 20)] = TbT + TbU;
+		    }
+	       }
+	       {
+		    E Tev, Tf7, Tfc, Tfm, Tff, Tfn, TeC, Tfh, TeK, Tf2, TeV, Tf8, TeY, Tfi, TeR;
+		    E Tf3;
+		    {
+			 E Tet, Teu, Tfa, Tfb;
+			 Tet = TcD - TcG;
+			 Teu = TdY - TdX;
+			 Tev = Tet - Teu;
+			 Tf7 = Tet + Teu;
+			 Tfa = TeF + TeE;
+			 Tfb = TeH + TeI;
+			 Tfc = FMA(KP290284677, Tfa, KP956940335 * Tfb);
+			 Tfm = FNMS(KP290284677, Tfb, KP956940335 * Tfa);
+		    }
+		    {
+			 E Tfd, Tfe, Tey, TeB;
+			 Tfd = TeL + TeM;
+			 Tfe = TeP + TeO;
+			 Tff = FNMS(KP290284677, Tfe, KP956940335 * Tfd);
+			 Tfn = FMA(KP956940335, Tfe, KP290284677 * Tfd);
+			 Tey = FMA(KP555570233, Tew, KP831469612 * Tex);
+			 TeB = FNMS(KP555570233, TeA, KP831469612 * Tez);
+			 TeC = Tey - TeB;
+			 Tfh = TeB + Tey;
+		    }
+		    {
+			 E TeG, TeJ, TeT, TeU;
+			 TeG = TeE - TeF;
+			 TeJ = TeH - TeI;
+			 TeK = FMA(KP471396736, TeG, KP881921264 * TeJ);
+			 Tf2 = FNMS(KP471396736, TeJ, KP881921264 * TeG);
+			 TeT = FNMS(KP555570233, Tex, KP831469612 * Tew);
+			 TeU = FMA(KP831469612, TeA, KP555570233 * Tez);
+			 TeV = TeT - TeU;
+			 Tf8 = TeU + TeT;
+		    }
+		    {
+			 E TeW, TeX, TeN, TeQ;
+			 TeW = TcN - TcK;
+			 TeX = TdV - TdU;
+			 TeY = TeW - TeX;
+			 Tfi = TeX + TeW;
+			 TeN = TeL - TeM;
+			 TeQ = TeO - TeP;
+			 TeR = FNMS(KP471396736, TeQ, KP881921264 * TeN);
+			 Tf3 = FMA(KP881921264, TeQ, KP471396736 * TeN);
+		    }
+		    {
+			 E TeD, TeS, Tf1, Tf4;
+			 TeD = Tev + TeC;
+			 TeS = TeK + TeR;
+			 Cr[WS(csr, 54)] = TeD - TeS;
+			 Cr[WS(csr, 10)] = TeD + TeS;
+			 Tf1 = TeY + TeV;
+			 Tf4 = Tf2 + Tf3;
+			 Ci[WS(csi, 10)] = Tf1 + Tf4;
+			 Ci[WS(csi, 54)] = Tf4 - Tf1;
+		    }
+		    {
+			 E TeZ, Tf0, Tf5, Tf6;
+			 TeZ = TeV - TeY;
+			 Tf0 = TeR - TeK;
+			 Ci[WS(csi, 22)] = TeZ + Tf0;
+			 Ci[WS(csi, 42)] = Tf0 - TeZ;
+			 Tf5 = Tev - TeC;
+			 Tf6 = Tf3 - Tf2;
+			 Cr[WS(csr, 42)] = Tf5 - Tf6;
+			 Cr[WS(csr, 22)] = Tf5 + Tf6;
+		    }
+		    {
+			 E Tf9, Tfg, Tfl, Tfo;
+			 Tf9 = Tf7 + Tf8;
+			 Tfg = Tfc + Tff;
+			 Cr[WS(csr, 58)] = Tf9 - Tfg;
+			 Cr[WS(csr, 6)] = Tf9 + Tfg;
+			 Tfl = Tfi + Tfh;
+			 Tfo = Tfm + Tfn;
+			 Ci[WS(csi, 6)] = Tfl + Tfo;
+			 Ci[WS(csi, 58)] = Tfo - Tfl;
+		    }
+		    {
+			 E Tfj, Tfk, Tfp, Tfq;
+			 Tfj = Tfh - Tfi;
+			 Tfk = Tff - Tfc;
+			 Ci[WS(csi, 26)] = Tfj + Tfk;
+			 Ci[WS(csi, 38)] = Tfk - Tfj;
+			 Tfp = Tf7 - Tf8;
+			 Tfq = Tfn - Tfm;
+			 Cr[WS(csr, 38)] = Tfp - Tfq;
+			 Cr[WS(csr, 26)] = Tfp + Tfq;
+		    }
+	       }
+	       {
+		    E TcP, Te9, Tee, Teo, Teh, Tep, Td8, Tej, Tdu, Te4, TdT, Tea, Te0, Tek, TdP;
+		    E Te5;
+		    {
+			 E TcH, TcO, Tec, Ted;
+			 TcH = TcD + TcG;
+			 TcO = TcK + TcN;
+			 TcP = TcH + TcO;
+			 Te9 = TcH - TcO;
+			 Tec = Tde - Tdl;
+			 Ted = Tds - Tdp;
+			 Tee = FMA(KP773010453, Tec, KP634393284 * Ted);
+			 Teo = FNMS(KP634393284, Tec, KP773010453 * Ted);
+		    }
+		    {
+			 E Tef, Teg, TcY, Td7;
+			 Tef = Tdz - TdG;
+			 Teg = TdN - TdK;
+			 Teh = FNMS(KP634393284, Teg, KP773010453 * Tef);
+			 Tep = FMA(KP634393284, Tef, KP773010453 * Teg);
+			 TcY = FMA(KP980785280, TcU, KP195090322 * TcX);
+			 Td7 = FNMS(KP195090322, Td6, KP980785280 * Td3);
+			 Td8 = TcY + Td7;
+			 Tej = Td7 - TcY;
+		    }
+		    {
+			 E Tdm, Tdt, TdR, TdS;
+			 Tdm = Tde + Tdl;
+			 Tdt = Tdp + Tds;
+			 Tdu = FMA(KP995184726, Tdm, KP098017140 * Tdt);
+			 Te4 = FNMS(KP098017140, Tdm, KP995184726 * Tdt);
+			 TdR = FNMS(KP195090322, TcU, KP980785280 * TcX);
+			 TdS = FMA(KP195090322, Td3, KP980785280 * Td6);
+			 TdT = TdR + TdS;
+			 Tea = TdS - TdR;
+		    }
+		    {
+			 E TdW, TdZ, TdH, TdO;
+			 TdW = TdU + TdV;
+			 TdZ = TdX + TdY;
+			 Te0 = TdW + TdZ;
+			 Tek = TdZ - TdW;
+			 TdH = Tdz + TdG;
+			 TdO = TdK + TdN;
+			 TdP = FNMS(KP098017140, TdO, KP995184726 * TdH);
+			 Te5 = FMA(KP098017140, TdH, KP995184726 * TdO);
+		    }
+		    {
+			 E Td9, TdQ, Te3, Te6;
+			 Td9 = TcP + Td8;
+			 TdQ = Tdu + TdP;
+			 Cr[WS(csr, 62)] = Td9 - TdQ;
+			 Cr[WS(csr, 2)] = Td9 + TdQ;
+			 Te3 = Te0 + TdT;
+			 Te6 = Te4 + Te5;
+			 Ci[WS(csi, 2)] = Te3 + Te6;
+			 Ci[WS(csi, 62)] = Te6 - Te3;
+		    }
+		    {
+			 E Te1, Te2, Te7, Te8;
+			 Te1 = TdT - Te0;
+			 Te2 = TdP - Tdu;
+			 Ci[WS(csi, 30)] = Te1 + Te2;
+			 Ci[WS(csi, 34)] = Te2 - Te1;
+			 Te7 = TcP - Td8;
+			 Te8 = Te5 - Te4;
+			 Cr[WS(csr, 34)] = Te7 - Te8;
+			 Cr[WS(csr, 30)] = Te7 + Te8;
+		    }
+		    {
+			 E Teb, Tei, Ten, Teq;
+			 Teb = Te9 + Tea;
+			 Tei = Tee + Teh;
+			 Cr[WS(csr, 50)] = Teb - Tei;
+			 Cr[WS(csr, 14)] = Teb + Tei;
+			 Ten = Tek + Tej;
+			 Teq = Teo + Tep;
+			 Ci[WS(csi, 14)] = Ten + Teq;
+			 Ci[WS(csi, 50)] = Teq - Ten;
+		    }
+		    {
+			 E Tel, Tem, Ter, Tes;
+			 Tel = Tej - Tek;
+			 Tem = Teh - Tee;
+			 Ci[WS(csi, 18)] = Tel + Tem;
+			 Ci[WS(csi, 46)] = Tem - Tel;
+			 Ter = Te9 - Tea;
+			 Tes = Tep - Teo;
+			 Cr[WS(csr, 46)] = Ter - Tes;
+			 Cr[WS(csr, 18)] = Ter + Tes;
+		    }
+	       }
+	       {
+		    E T6v, T77, T6C, T7h, T6Y, T7i, T6V, T78, T6R, T7n, T73, T7f, T6K, T7m, T72;
+		    E T7c;
+		    {
+			 E T6t, T6u, T6T, T6U;
+			 T6t = T27 - T2e;
+			 T6u = T5Y - T5X;
+			 T6v = T6t - T6u;
+			 T77 = T6t + T6u;
+			 {
+			      E T6y, T6B, T6W, T6X;
+			      T6y = FMA(KP773010453, T6w, KP634393284 * T6x);
+			      T6B = FNMS(KP634393284, T6A, KP773010453 * T6z);
+			      T6C = T6y - T6B;
+			      T7h = T6B + T6y;
+			      T6W = T2x - T2o;
+			      T6X = T5V - T5S;
+			      T6Y = T6W - T6X;
+			      T7i = T6X + T6W;
+			 }
+			 T6T = FNMS(KP634393284, T6w, KP773010453 * T6x);
+			 T6U = FMA(KP634393284, T6z, KP773010453 * T6A);
+			 T6V = T6T - T6U;
+			 T78 = T6U + T6T;
+			 {
+			      E T6N, T7d, T6Q, T7e, T6M, T6O;
+			      T6M = T5I - T5H;
+			      T6N = T6L - T6M;
+			      T7d = T6L + T6M;
+			      T6O = T5v - T5e;
+			      T6Q = T6O - T6P;
+			      T7e = T6P + T6O;
+			      T6R = FNMS(KP427555093, T6Q, KP903989293 * T6N);
+			      T7n = FMA(KP941544065, T7e, KP336889853 * T7d);
+			      T73 = FMA(KP903989293, T6Q, KP427555093 * T6N);
+			      T7f = FNMS(KP336889853, T7e, KP941544065 * T7d);
+			 }
+			 {
+			      E T6G, T7a, T6J, T7b, T6E, T6I;
+			      T6E = T4g - T3Z;
+			      T6G = T6E - T6F;
+			      T7a = T6F + T6E;
+			      T6I = T4t - T4s;
+			      T6J = T6H - T6I;
+			      T7b = T6H + T6I;
+			      T6K = FMA(KP427555093, T6G, KP903989293 * T6J);
+			      T7m = FNMS(KP336889853, T7b, KP941544065 * T7a);
+			      T72 = FNMS(KP427555093, T6J, KP903989293 * T6G);
+			      T7c = FMA(KP336889853, T7a, KP941544065 * T7b);
+			 }
+		    }
+		    {
+			 E T6D, T6S, T71, T74;
+			 T6D = T6v + T6C;
+			 T6S = T6K + T6R;
+			 Cr[WS(csr, 55)] = T6D - T6S;
+			 Cr[WS(csr, 9)] = T6D + T6S;
+			 T71 = T6Y + T6V;
+			 T74 = T72 + T73;
+			 Ci[WS(csi, 9)] = T71 + T74;
+			 Ci[WS(csi, 55)] = T74 - T71;
+		    }
+		    {
+			 E T6Z, T70, T75, T76;
+			 T6Z = T6V - T6Y;
+			 T70 = T6R - T6K;
+			 Ci[WS(csi, 23)] = T6Z + T70;
+			 Ci[WS(csi, 41)] = T70 - T6Z;
+			 T75 = T6v - T6C;
+			 T76 = T73 - T72;
+			 Cr[WS(csr, 41)] = T75 - T76;
+			 Cr[WS(csr, 23)] = T75 + T76;
+		    }
+		    {
+			 E T79, T7g, T7l, T7o;
+			 T79 = T77 + T78;
+			 T7g = T7c + T7f;
+			 Cr[WS(csr, 57)] = T79 - T7g;
+			 Cr[WS(csr, 7)] = T79 + T7g;
+			 T7l = T7i + T7h;
+			 T7o = T7m + T7n;
+			 Ci[WS(csi, 7)] = T7l + T7o;
+			 Ci[WS(csi, 57)] = T7o - T7l;
+		    }
+		    {
+			 E T7j, T7k, T7p, T7q;
+			 T7j = T7h - T7i;
+			 T7k = T7f - T7c;
+			 Ci[WS(csi, 25)] = T7j + T7k;
+			 Ci[WS(csi, 39)] = T7k - T7j;
+			 T7p = T77 - T78;
+			 T7q = T7n - T7m;
+			 Cr[WS(csr, 39)] = T7p - T7q;
+			 Cr[WS(csr, 25)] = T7p + T7q;
+		    }
+	       }
+	       {
+		    E T99, T9L, T9g, T9V, T9C, T9W, T9z, T9M, T9v, Ta1, T9H, T9T, T9o, Ta0, T9G;
+		    E T9Q;
+		    {
+			 E T97, T98, T9x, T9y;
+			 T97 = T7r - T7s;
+			 T98 = T8C - T8B;
+			 T99 = T97 - T98;
+			 T9L = T97 + T98;
+			 {
+			      E T9c, T9f, T9A, T9B;
+			      T9c = FMA(KP471396736, T9a, KP881921264 * T9b);
+			      T9f = FNMS(KP471396736, T9e, KP881921264 * T9d);
+			      T9g = T9c - T9f;
+			      T9V = T9f + T9c;
+			      T9A = T7z - T7w;
+			      T9B = T8z - T8y;
+			      T9C = T9A - T9B;
+			      T9W = T9B + T9A;
+			 }
+			 T9x = FNMS(KP471396736, T9b, KP881921264 * T9a);
+			 T9y = FMA(KP881921264, T9e, KP471396736 * T9d);
+			 T9z = T9x - T9y;
+			 T9M = T9y + T9x;
+			 {
+			      E T9r, T9R, T9u, T9S, T9q, T9s;
+			      T9q = T8q - T8p;
+			      T9r = T9p - T9q;
+			      T9R = T9p + T9q;
+			      T9s = T8j - T8g;
+			      T9u = T9s - T9t;
+			      T9S = T9t + T9s;
+			      T9v = FNMS(KP514102744, T9u, KP857728610 * T9r);
+			      Ta1 = FMA(KP970031253, T9S, KP242980179 * T9R);
+			      T9H = FMA(KP857728610, T9u, KP514102744 * T9r);
+			      T9T = FNMS(KP242980179, T9S, KP970031253 * T9R);
+			 }
+			 {
+			      E T9k, T9O, T9n, T9P, T9i, T9m;
+			      T9i = T80 - T7X;
+			      T9k = T9i - T9j;
+			      T9O = T9j + T9i;
+			      T9m = T87 - T86;
+			      T9n = T9l - T9m;
+			      T9P = T9l + T9m;
+			      T9o = FMA(KP514102744, T9k, KP857728610 * T9n);
+			      Ta0 = FNMS(KP242980179, T9P, KP970031253 * T9O);
+			      T9G = FNMS(KP514102744, T9n, KP857728610 * T9k);
+			      T9Q = FMA(KP242980179, T9O, KP970031253 * T9P);
+			 }
+		    }
+		    {
+			 E T9h, T9w, T9F, T9I;
+			 T9h = T99 + T9g;
+			 T9w = T9o + T9v;
+			 Cr[WS(csr, 53)] = T9h - T9w;
+			 Cr[WS(csr, 11)] = T9h + T9w;
+			 T9F = T9C + T9z;
+			 T9I = T9G + T9H;
+			 Ci[WS(csi, 11)] = T9F + T9I;
+			 Ci[WS(csi, 53)] = T9I - T9F;
+		    }
+		    {
+			 E T9D, T9E, T9J, T9K;
+			 T9D = T9z - T9C;
+			 T9E = T9v - T9o;
+			 Ci[WS(csi, 21)] = T9D + T9E;
+			 Ci[WS(csi, 43)] = T9E - T9D;
+			 T9J = T99 - T9g;
+			 T9K = T9H - T9G;
+			 Cr[WS(csr, 43)] = T9J - T9K;
+			 Cr[WS(csr, 21)] = T9J + T9K;
+		    }
+		    {
+			 E T9N, T9U, T9Z, Ta2;
+			 T9N = T9L + T9M;
+			 T9U = T9Q + T9T;
+			 Cr[WS(csr, 59)] = T9N - T9U;
+			 Cr[WS(csr, 5)] = T9N + T9U;
+			 T9Z = T9W + T9V;
+			 Ta2 = Ta0 + Ta1;
+			 Ci[WS(csi, 5)] = T9Z + Ta2;
+			 Ci[WS(csi, 59)] = Ta2 - T9Z;
+		    }
+		    {
+			 E T9X, T9Y, Ta3, Ta4;
+			 T9X = T9V - T9W;
+			 T9Y = T9T - T9Q;
+			 Ci[WS(csi, 27)] = T9X + T9Y;
+			 Ci[WS(csi, 37)] = T9Y - T9X;
+			 Ta3 = T9L - T9M;
+			 Ta4 = Ta1 - Ta0;
+			 Cr[WS(csr, 37)] = Ta3 - Ta4;
+			 Cr[WS(csr, 27)] = Ta3 + Ta4;
+		    }
+	       }
+	       {
+		    E T2z, T69, T3g, T6j, T60, T6k, T5P, T6a, T5L, T6p, T65, T6h, T4w, T6o, T64;
+		    E T6e;
+		    {
+			 E T2f, T2y, T5N, T5O;
+			 T2f = T27 + T2e;
+			 T2y = T2o + T2x;
+			 T2z = T2f + T2y;
+			 T69 = T2f - T2y;
+			 {
+			      E T2U, T3f, T5W, T5Z;
+			      T2U = FMA(KP098017140, T2M, KP995184726 * T2T);
+			      T3f = FNMS(KP098017140, T3e, KP995184726 * T37);
+			      T3g = T2U + T3f;
+			      T6j = T3f - T2U;
+			      T5W = T5S + T5V;
+			      T5Z = T5X + T5Y;
+			      T60 = T5W + T5Z;
+			      T6k = T5Z - T5W;
+			 }
+			 T5N = FNMS(KP098017140, T2T, KP995184726 * T2M);
+			 T5O = FMA(KP995184726, T3e, KP098017140 * T37);
+			 T5P = T5N + T5O;
+			 T6a = T5O - T5N;
+			 {
+			      E T5x, T6f, T5K, T6g, T5w, T5J;
+			      T5w = T5e + T5v;
+			      T5x = T4X + T5w;
+			      T6f = T4X - T5w;
+			      T5J = T5H + T5I;
+			      T5K = T5G + T5J;
+			      T6g = T5J - T5G;
+			      T5L = FNMS(KP049067674, T5K, KP998795456 * T5x);
+			      T6p = FMA(KP671558954, T6f, KP740951125 * T6g);
+			      T65 = FMA(KP049067674, T5x, KP998795456 * T5K);
+			      T6h = FNMS(KP671558954, T6g, KP740951125 * T6f);
+			 }
+			 {
+			      E T4i, T6c, T4v, T6d, T4h, T4u;
+			      T4h = T3Z + T4g;
+			      T4i = T3I + T4h;
+			      T6c = T3I - T4h;
+			      T4u = T4s + T4t;
+			      T4v = T4r + T4u;
+			      T6d = T4u - T4r;
+			      T4w = FMA(KP998795456, T4i, KP049067674 * T4v);
+			      T6o = FNMS(KP671558954, T6c, KP740951125 * T6d);
+			      T64 = FNMS(KP049067674, T4i, KP998795456 * T4v);
+			      T6e = FMA(KP740951125, T6c, KP671558954 * T6d);
+			 }
+		    }
+		    {
+			 E T3h, T5M, T63, T66;
+			 T3h = T2z + T3g;
+			 T5M = T4w + T5L;
+			 Cr[WS(csr, 63)] = T3h - T5M;
+			 Cr[WS(csr, 1)] = T3h + T5M;
+			 T63 = T60 + T5P;
+			 T66 = T64 + T65;
+			 Ci[WS(csi, 1)] = T63 + T66;
+			 Ci[WS(csi, 63)] = T66 - T63;
+		    }
+		    {
+			 E T61, T62, T67, T68;
+			 T61 = T5P - T60;
+			 T62 = T5L - T4w;
+			 Ci[WS(csi, 31)] = T61 + T62;
+			 Ci[WS(csi, 33)] = T62 - T61;
+			 T67 = T2z - T3g;
+			 T68 = T65 - T64;
+			 Cr[WS(csr, 33)] = T67 - T68;
+			 Cr[WS(csr, 31)] = T67 + T68;
+		    }
+		    {
+			 E T6b, T6i, T6n, T6q;
+			 T6b = T69 + T6a;
+			 T6i = T6e + T6h;
+			 Cr[WS(csr, 49)] = T6b - T6i;
+			 Cr[WS(csr, 15)] = T6b + T6i;
+			 T6n = T6k + T6j;
+			 T6q = T6o + T6p;
+			 Ci[WS(csi, 15)] = T6n + T6q;
+			 Ci[WS(csi, 49)] = T6q - T6n;
+		    }
+		    {
+			 E T6l, T6m, T6r, T6s;
+			 T6l = T6j - T6k;
+			 T6m = T6h - T6e;
+			 Ci[WS(csi, 17)] = T6l + T6m;
+			 Ci[WS(csi, 47)] = T6m - T6l;
+			 T6r = T69 - T6a;
+			 T6s = T6p - T6o;
+			 Cr[WS(csr, 47)] = T6r - T6s;
+			 Cr[WS(csr, 17)] = T6r + T6s;
+		    }
+	       }
+	       {
+		    E T7B, T8N, T7Q, T8X, T8E, T8Y, T8x, T8O, T8t, T93, T8J, T8V, T8a, T92, T8I;
+		    E T8S;
+		    {
+			 E T7t, T7A, T8v, T8w;
+			 T7t = T7r + T7s;
+			 T7A = T7w + T7z;
+			 T7B = T7t + T7A;
+			 T8N = T7t - T7A;
+			 {
+			      E T7I, T7P, T8A, T8D;
+			      T7I = FMA(KP956940335, T7E, KP290284677 * T7H);
+			      T7P = FNMS(KP290284677, T7O, KP956940335 * T7L);
+			      T7Q = T7I + T7P;
+			      T8X = T7P - T7I;
+			      T8A = T8y + T8z;
+			      T8D = T8B + T8C;
+			      T8E = T8A + T8D;
+			      T8Y = T8D - T8A;
+			 }
+			 T8v = FNMS(KP290284677, T7E, KP956940335 * T7H);
+			 T8w = FMA(KP290284677, T7L, KP956940335 * T7O);
+			 T8x = T8v + T8w;
+			 T8O = T8w - T8v;
+			 {
+			      E T8l, T8T, T8s, T8U, T8k, T8r;
+			      T8k = T8g + T8j;
+			      T8l = T8d + T8k;
+			      T8T = T8d - T8k;
+			      T8r = T8p + T8q;
+			      T8s = T8o + T8r;
+			      T8U = T8r - T8o;
+			      T8t = FNMS(KP146730474, T8s, KP989176509 * T8l);
+			      T93 = FMA(KP595699304, T8T, KP803207531 * T8U);
+			      T8J = FMA(KP146730474, T8l, KP989176509 * T8s);
+			      T8V = FNMS(KP595699304, T8U, KP803207531 * T8T);
+			 }
+			 {
+			      E T82, T8Q, T89, T8R, T81, T88;
+			      T81 = T7X + T80;
+			      T82 = T7U + T81;
+			      T8Q = T7U - T81;
+			      T88 = T86 + T87;
+			      T89 = T85 + T88;
+			      T8R = T88 - T85;
+			      T8a = FMA(KP989176509, T82, KP146730474 * T89);
+			      T92 = FNMS(KP595699304, T8Q, KP803207531 * T8R);
+			      T8I = FNMS(KP146730474, T82, KP989176509 * T89);
+			      T8S = FMA(KP803207531, T8Q, KP595699304 * T8R);
+			 }
+		    }
+		    {
+			 E T7R, T8u, T8H, T8K;
+			 T7R = T7B + T7Q;
+			 T8u = T8a + T8t;
+			 Cr[WS(csr, 61)] = T7R - T8u;
+			 Cr[WS(csr, 3)] = T7R + T8u;
+			 T8H = T8E + T8x;
+			 T8K = T8I + T8J;
+			 Ci[WS(csi, 3)] = T8H + T8K;
+			 Ci[WS(csi, 61)] = T8K - T8H;
+		    }
+		    {
+			 E T8F, T8G, T8L, T8M;
+			 T8F = T8x - T8E;
+			 T8G = T8t - T8a;
+			 Ci[WS(csi, 29)] = T8F + T8G;
+			 Ci[WS(csi, 35)] = T8G - T8F;
+			 T8L = T7B - T7Q;
+			 T8M = T8J - T8I;
+			 Cr[WS(csr, 35)] = T8L - T8M;
+			 Cr[WS(csr, 29)] = T8L + T8M;
+		    }
+		    {
+			 E T8P, T8W, T91, T94;
+			 T8P = T8N + T8O;
+			 T8W = T8S + T8V;
+			 Cr[WS(csr, 51)] = T8P - T8W;
+			 Cr[WS(csr, 13)] = T8P + T8W;
+			 T91 = T8Y + T8X;
+			 T94 = T92 + T93;
+			 Ci[WS(csi, 13)] = T91 + T94;
+			 Ci[WS(csi, 51)] = T94 - T91;
+		    }
+		    {
+			 E T8Z, T90, T95, T96;
+			 T8Z = T8X - T8Y;
+			 T90 = T8V - T8S;
+			 Ci[WS(csi, 19)] = T8Z + T90;
+			 Ci[WS(csi, 45)] = T90 - T8Z;
+			 T95 = T8N - T8O;
+			 T96 = T93 - T92;
+			 Cr[WS(csr, 45)] = T95 - T96;
+			 Cr[WS(csr, 19)] = T95 + T96;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 128, "r2cf_128", {812, 186, 144, 0}, &GENUS };
+
+void X(codelet_r2cf_128) (planner *p) {
+     X(kr2c_register) (p, r2cf_128, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_13.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_13.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include r2cf.h */
+
+/*
+ * This function contains 76 FP additions, 51 FP multiplications,
+ * (or, 31 additions, 6 multiplications, 45 fused multiply/add),
+ * 68 stack variables, 23 constants, and 26 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP516520780, +0.516520780623489722840901288569017135705033622);
+     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DK(KP581704778, +0.581704778510515730456870384989698884939833902);
+     DK(KP859542535, +0.859542535098774820163672132761689612766401925);
+     DK(KP769338817, +0.769338817572980603471413688209101117038278899);
+     DK(KP686558370, +0.686558370781754340655719594850823015421401653);
+     DK(KP514918778, +0.514918778086315755491789696138117261566051239);
+     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP904176221, +0.904176221990848204433795481776887926501523162);
+     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DK(KP957805992, +0.957805992594665126462521754605754580515587217);
+     DK(KP600477271, +0.600477271932665282925769253334763009352012849);
+     DK(KP522026385, +0.522026385161275033714027226654165028300441940);
+     DK(KP301479260, +0.301479260047709873958013540496673347309208464);
+     DK(KP226109445, +0.226109445035782405468510155372505010481906348);
+     DK(KP853480001, +0.853480001859823990758994934970528322872359049);
+     DK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DK(KP612264650, +0.612264650376756543746494474777125408779395514);
+     DK(KP038632954, +0.038632954644348171955506895830342264440241080);
+     DK(KP302775637, +0.302775637731994646559610633735247973125648287);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
+	       E T15, T1a, T11, T17, T14, T1b;
+	       {
+		    E TN, TD, TV, TA, Tb, TZ, T12, TS, Tx, Tu, Ti, TU;
+		    TN = R0[0];
+		    {
+			 E T3, TP, Th, TB, Tp, Te, Tm, TC, Tr, T6, T9, Ts;
+			 {
+			      E Tn, Tf, Tg, T1, T2;
+			      T1 = R0[WS(rs, 4)];
+			      T2 = R1[WS(rs, 2)];
+			      Tn = R0[WS(rs, 6)];
+			      Tf = R0[WS(rs, 5)];
+			      Tg = R0[WS(rs, 2)];
+			      T3 = T1 - T2;
+			      TP = T1 + T2;
+			      {
+				   E Tk, To, Tc, Td;
+				   Tk = R1[0];
+				   Th = Tf - Tg;
+				   To = Tf + Tg;
+				   Tc = R1[WS(rs, 4)];
+				   Td = R1[WS(rs, 1)];
+				   {
+					E T4, Tl, T5, T7, T8;
+					T4 = R1[WS(rs, 5)];
+					TB = Tn + To;
+					Tp = FMS(KP500000000, To, Tn);
+					Tl = Td + Tc;
+					Te = Tc - Td;
+					T5 = R0[WS(rs, 3)];
+					T7 = R1[WS(rs, 3)];
+					T8 = R0[WS(rs, 1)];
+					Tm = FNMS(KP500000000, Tl, Tk);
+					TC = Tk + Tl;
+					Tr = T4 + T5;
+					T6 = T4 - T5;
+					T9 = T7 - T8;
+					Ts = T7 + T8;
+				   }
+			      }
+			 }
+			 {
+			      E TO, Ta, Tt, TQ;
+			      TD = TB - TC;
+			      TO = TC + TB;
+			      Ta = T6 + T9;
+			      TV = T6 - T9;
+			      Tt = Tr - Ts;
+			      TQ = Tr + Ts;
+			      {
+				   E TX, Tq, TR, TY;
+				   TX = Tm - Tp;
+				   Tq = Tm + Tp;
+				   TA = T3 + Ta;
+				   Tb = FNMS(KP500000000, Ta, T3);
+				   TR = TP + TQ;
+				   TY = FNMS(KP500000000, TQ, TP);
+				   TZ = TX + TY;
+				   T12 = TX - TY;
+				   T15 = TO - TR;
+				   TS = TO + TR;
+				   Tx = FNMS(KP866025403, Tt, Tq);
+				   Tu = FMA(KP866025403, Tt, Tq);
+				   Ti = Te + Th;
+				   TU = Th - Te;
+			      }
+			 }
+		    }
+		    Cr[0] = TN + TS;
+		    {
+			 E Tw, Tj, T13, TW;
+			 Tw = FNMS(KP866025403, Ti, Tb);
+			 Tj = FMA(KP866025403, Ti, Tb);
+			 T13 = TU - TV;
+			 TW = TU + TV;
+			 {
+			      E TE, TI, Tv, TF, TG, Ty;
+			      TE = FMA(KP302775637, TD, TA);
+			      TI = FNMS(KP302775637, TA, TD);
+			      Tv = FMA(KP038632954, Tu, Tj);
+			      TF = FNMS(KP038632954, Tj, Tu);
+			      TG = FNMS(KP612264650, Tw, Tx);
+			      Ty = FMA(KP612264650, Tx, Tw);
+			      {
+				   E TT, Tz, TK, TH, TM, T10, TL, TJ;
+				   TT = FNMS(KP083333333, TS, TN);
+				   Tz = FNMS(KP853480001, Ty, Tv);
+				   TK = FMA(KP853480001, Ty, Tv);
+				   TH = FNMS(KP853480001, TG, TF);
+				   TM = FMA(KP853480001, TG, TF);
+				   T1a = FNMS(KP226109445, TW, TZ);
+				   T10 = FMA(KP301479260, TZ, TW);
+				   TL = FNMS(KP522026385, Tz, TE);
+				   Ci[WS(csi, 1)] = KP600477271 * (FMA(KP957805992, TE, Tz));
+				   TJ = FMA(KP522026385, TH, TI);
+				   Ci[WS(csi, 5)] = -(KP600477271 * (FNMS(KP957805992, TI, TH)));
+				   Ci[WS(csi, 4)] = -(KP575140729 * (FMA(KP904176221, TM, TL)));
+				   Ci[WS(csi, 3)] = KP575140729 * (FNMS(KP904176221, TM, TL));
+				   Ci[WS(csi, 6)] = KP575140729 * (FMA(KP904176221, TK, TJ));
+				   Ci[WS(csi, 2)] = KP575140729 * (FNMS(KP904176221, TK, TJ));
+				   T11 = FMA(KP503537032, T10, TT);
+				   T17 = FNMS(KP251768516, T10, TT);
+			      }
+			      T14 = FNMS(KP514918778, T13, T12);
+			      T1b = FMA(KP686558370, T12, T13);
+			 }
+		    }
+	       }
+	       {
+		    E T1e, T1c, T18, T16, T1d, T19;
+		    T1e = FMA(KP769338817, T1b, T1a);
+		    T1c = FNMS(KP769338817, T1b, T1a);
+		    T18 = FNMS(KP859542535, T14, T15);
+		    T16 = FMA(KP581704778, T15, T14);
+		    T1d = FNMS(KP300462606, T18, T17);
+		    T19 = FMA(KP300462606, T18, T17);
+		    Cr[WS(csr, 1)] = FMA(KP516520780, T16, T11);
+		    Cr[WS(csr, 5)] = FNMS(KP516520780, T16, T11);
+		    Cr[WS(csr, 2)] = FMA(KP503537032, T1e, T1d);
+		    Cr[WS(csr, 6)] = FNMS(KP503537032, T1e, T1d);
+		    Cr[WS(csr, 3)] = FMA(KP503537032, T1c, T19);
+		    Cr[WS(csr, 4)] = FNMS(KP503537032, T1c, T19);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 13, "r2cf_13", {31, 6, 45, 0}, &GENUS };
+
+void X(codelet_r2cf_13) (planner *p) {
+     X(kr2c_register) (p, r2cf_13, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include r2cf.h */
+
+/*
+ * This function contains 76 FP additions, 34 FP multiplications,
+ * (or, 57 additions, 15 multiplications, 19 fused multiply/add),
+ * 55 stack variables, 20 constants, and 26 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP083333333, +0.083333333333333333333333333333333333333333333);
+     DK(KP075902986, +0.075902986037193865983102897245103540356428373);
+     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
+     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
+     DK(KP113854479, +0.113854479055790798974654345867655310534642560);
+     DK(KP265966249, +0.265966249214837287587521063842185948798330267);
+     DK(KP387390585, +0.387390585467617292130675966426762851778775217);
+     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
+     DK(KP132983124, +0.132983124607418643793760531921092974399165133);
+     DK(KP258260390, +0.258260390311744861420450644284508567852516811);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
+     DK(KP300238635, +0.300238635966332641462884626667381504676006424);
+     DK(KP011599105, +0.011599105605768290721655456654083252189827041);
+     DK(KP156891391, +0.156891391051584611046832726756003269660212636);
+     DK(KP256247671, +0.256247671582936600958684654061725059144125175);
+     DK(KP174138601, +0.174138601152135905005660794929264742616964676);
+     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
+	       E T13, Tb, Tm, TW, TX, T14, TU, T10, Tz, TB, Tu, TC, TR, T11;
+	       T13 = R0[0];
+	       {
+		    E Te, TO, Ta, Tv, To, T5, Tw, Tp, Th, Tr, Tk, Ts, Tl, TP, Tc;
+		    E Td;
+		    Tc = R0[WS(rs, 4)];
+		    Td = R1[WS(rs, 2)];
+		    Te = Tc - Td;
+		    TO = Tc + Td;
+		    {
+			 E T6, T7, T8, T9;
+			 T6 = R1[0];
+			 T7 = R1[WS(rs, 1)];
+			 T8 = R1[WS(rs, 4)];
+			 T9 = T7 + T8;
+			 Ta = T6 + T9;
+			 Tv = T7 - T8;
+			 To = FNMS(KP500000000, T9, T6);
+		    }
+		    {
+			 E T1, T2, T3, T4;
+			 T1 = R0[WS(rs, 6)];
+			 T2 = R0[WS(rs, 5)];
+			 T3 = R0[WS(rs, 2)];
+			 T4 = T2 + T3;
+			 T5 = T1 + T4;
+			 Tw = T2 - T3;
+			 Tp = FNMS(KP500000000, T4, T1);
+		    }
+		    {
+			 E Tf, Tg, Ti, Tj;
+			 Tf = R1[WS(rs, 5)];
+			 Tg = R0[WS(rs, 3)];
+			 Th = Tf - Tg;
+			 Tr = Tf + Tg;
+			 Ti = R1[WS(rs, 3)];
+			 Tj = R0[WS(rs, 1)];
+			 Tk = Ti - Tj;
+			 Ts = Ti + Tj;
+		    }
+		    Tl = Th + Tk;
+		    TP = Tr + Ts;
+		    Tb = T5 - Ta;
+		    Tm = Te + Tl;
+		    TW = Ta + T5;
+		    TX = TO + TP;
+		    T14 = TW + TX;
+		    {
+			 E TS, TT, Tx, Ty;
+			 TS = Tv + Tw;
+			 TT = Th - Tk;
+			 TU = TS - TT;
+			 T10 = TS + TT;
+			 Tx = KP866025403 * (Tv - Tw);
+			 Ty = FNMS(KP500000000, Tl, Te);
+			 Tz = Tx + Ty;
+			 TB = Ty - Tx;
+		    }
+		    {
+			 E Tq, Tt, TN, TQ;
+			 Tq = To - Tp;
+			 Tt = KP866025403 * (Tr - Ts);
+			 Tu = Tq - Tt;
+			 TC = Tq + Tt;
+			 TN = To + Tp;
+			 TQ = FNMS(KP500000000, TP, TO);
+			 TR = TN - TQ;
+			 T11 = TN + TQ;
+		    }
+	       }
+	       Cr[0] = T13 + T14;
+	       {
+		    E Tn, TG, TE, TF, TJ, TM, TK, TL;
+		    Tn = FNMS(KP174138601, Tm, KP575140729 * Tb);
+		    TG = FMA(KP174138601, Tb, KP575140729 * Tm);
+		    {
+			 E TA, TD, TH, TI;
+			 TA = FNMS(KP156891391, Tz, KP256247671 * Tu);
+			 TD = FNMS(KP300238635, TC, KP011599105 * TB);
+			 TE = TA + TD;
+			 TF = KP1_732050807 * (TD - TA);
+			 TH = FMA(KP300238635, TB, KP011599105 * TC);
+			 TI = FMA(KP256247671, Tz, KP156891391 * Tu);
+			 TJ = TH - TI;
+			 TM = KP1_732050807 * (TI + TH);
+		    }
+		    Ci[WS(csi, 5)] = FMA(KP2_000000000, TE, Tn);
+		    Ci[WS(csi, 1)] = FMA(KP2_000000000, TJ, TG);
+		    TK = TG - TJ;
+		    Ci[WS(csi, 4)] = TF - TK;
+		    Ci[WS(csi, 3)] = TF + TK;
+		    TL = Tn - TE;
+		    Ci[WS(csi, 2)] = TL - TM;
+		    Ci[WS(csi, 6)] = TL + TM;
+	       }
+	       {
+		    E TZ, T1b, T19, T1e, T16, T1a, TV, TY, T1c, T1d;
+		    TV = FNMS(KP132983124, TU, KP258260390 * TR);
+		    TY = KP300462606 * (TW - TX);
+		    TZ = FMA(KP2_000000000, TV, TY);
+		    T1b = TY - TV;
+		    {
+			 E T17, T18, T12, T15;
+			 T17 = FMA(KP387390585, TU, KP265966249 * TR);
+			 T18 = FNMS(KP503537032, T11, KP113854479 * T10);
+			 T19 = T17 - T18;
+			 T1e = T17 + T18;
+			 T12 = FMA(KP251768516, T10, KP075902986 * T11);
+			 T15 = FNMS(KP083333333, T14, T13);
+			 T16 = FMA(KP2_000000000, T12, T15);
+			 T1a = T15 - T12;
+		    }
+		    Cr[WS(csr, 1)] = TZ + T16;
+		    Cr[WS(csr, 5)] = T16 - TZ;
+		    T1c = T1a - T1b;
+		    Cr[WS(csr, 2)] = T19 + T1c;
+		    Cr[WS(csr, 6)] = T1c - T19;
+		    T1d = T1b + T1a;
+		    Cr[WS(csr, 3)] = T1d - T1e;
+		    Cr[WS(csr, 4)] = T1e + T1d;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 13, "r2cf_13", {57, 15, 19, 0}, &GENUS };
+
+void X(codelet_r2cf_13) (planner *p) {
+     X(kr2c_register) (p, r2cf_13, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_14.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_14.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include r2cf.h */
+
+/*
+ * This function contains 62 FP additions, 36 FP multiplications,
+ * (or, 32 additions, 6 multiplications, 30 fused multiply/add),
+ * 45 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
+	       E TN, T3, TG, TQ, Tx, To, TH, Td, TD, TO, Tw, Ta, TL, Ty, TT;
+	       E TI, Tg, Tr, Te, Tf, TP, TJ;
+	       {
+		    E Tl, TE, Tk, Tm;
+		    {
+			 E T1, T2, Ti, Tj;
+			 T1 = R0[0];
+			 T2 = R1[WS(rs, 3)];
+			 Ti = R0[WS(rs, 3)];
+			 Tj = R1[WS(rs, 6)];
+			 Tl = R0[WS(rs, 4)];
+			 TN = T1 + T2;
+			 T3 = T1 - T2;
+			 TE = Ti + Tj;
+			 Tk = Ti - Tj;
+			 Tm = R1[0];
+		    }
+		    {
+			 E T7, TC, T6, T8;
+			 {
+			      E T4, T5, TF, Tn;
+			      T4 = R0[WS(rs, 1)];
+			      T5 = R1[WS(rs, 4)];
+			      T7 = R0[WS(rs, 6)];
+			      TF = Tl + Tm;
+			      Tn = Tl - Tm;
+			      TC = T4 + T5;
+			      T6 = T4 - T5;
+			      TG = TE - TF;
+			      TQ = TE + TF;
+			      Tx = Tn - Tk;
+			      To = Tk + Tn;
+			      T8 = R1[WS(rs, 2)];
+			 }
+			 {
+			      E Tb, Tc, TB, T9;
+			      Tb = R0[WS(rs, 2)];
+			      Tc = R1[WS(rs, 5)];
+			      Te = R0[WS(rs, 5)];
+			      TB = T7 + T8;
+			      T9 = T7 - T8;
+			      TH = Tb + Tc;
+			      Td = Tb - Tc;
+			      TD = TB - TC;
+			      TO = TC + TB;
+			      Tw = T6 - T9;
+			      Ta = T6 + T9;
+			      Tf = R1[WS(rs, 1)];
+			 }
+		    }
+	       }
+	       TL = FNMS(KP554958132, TG, TD);
+	       Ty = FNMS(KP554958132, Tx, Tw);
+	       TT = FNMS(KP356895867, TO, TQ);
+	       TI = Te + Tf;
+	       Tg = Te - Tf;
+	       Tr = FNMS(KP356895867, Ta, To);
+	       TP = TH + TI;
+	       TJ = TH - TI;
+	       {
+		    E Th, Tv, TK, TM;
+		    Th = Td + Tg;
+		    Tv = Tg - Td;
+		    TK = FMA(KP554958132, TJ, TG);
+		    TM = FMA(KP554958132, TD, TJ);
+		    Ci[WS(csi, 6)] = KP974927912 * (FNMS(KP801937735, TL, TJ));
+		    {
+			 E TR, TV, TU, Tz;
+			 TR = FNMS(KP356895867, TQ, TP);
+			 TV = FNMS(KP356895867, TP, TO);
+			 TU = FNMS(KP692021471, TT, TP);
+			 Cr[0] = TN + TO + TP + TQ;
+			 Tz = FMA(KP554958132, Tv, Tx);
+			 Ci[WS(csi, 1)] = KP974927912 * (FNMS(KP801937735, Ty, Tv));
+			 {
+			      E TA, Ts, Tt, Tp;
+			      TA = FMA(KP554958132, Tw, Tv);
+			      Ts = FNMS(KP692021471, Tr, Th);
+			      Tt = FNMS(KP356895867, Th, Ta);
+			      Tp = FNMS(KP356895867, To, Th);
+			      Cr[WS(csr, 7)] = T3 + Ta + Th + To;
+			      Ci[WS(csi, 2)] = KP974927912 * (FMA(KP801937735, TK, TD));
+			      Ci[WS(csi, 4)] = KP974927912 * (FNMS(KP801937735, TM, TG));
+			      {
+				   E TS, TW, Tu, Tq;
+				   TS = FNMS(KP692021471, TR, TO);
+				   TW = FNMS(KP692021471, TV, TQ);
+				   Cr[WS(csr, 2)] = FNMS(KP900968867, TU, TN);
+				   Ci[WS(csi, 5)] = KP974927912 * (FMA(KP801937735, Tz, Tw));
+				   Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, TA, Tx));
+				   Cr[WS(csr, 5)] = FNMS(KP900968867, Ts, T3);
+				   Tu = FNMS(KP692021471, Tt, To);
+				   Tq = FNMS(KP692021471, Tp, Ta);
+				   Cr[WS(csr, 4)] = FNMS(KP900968867, TS, TN);
+				   Cr[WS(csr, 6)] = FNMS(KP900968867, TW, TN);
+				   Cr[WS(csr, 1)] = FNMS(KP900968867, Tu, T3);
+				   Cr[WS(csr, 3)] = FNMS(KP900968867, Tq, T3);
+			      }
+			 }
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 14, "r2cf_14", {32, 6, 30, 0}, &GENUS };
+
+void X(codelet_r2cf_14) (planner *p) {
+     X(kr2c_register) (p, r2cf_14, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include r2cf.h */
+
+/*
+ * This function contains 62 FP additions, 36 FP multiplications,
+ * (or, 38 additions, 12 multiplications, 24 fused multiply/add),
+ * 29 stack variables, 6 constants, and 28 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
+	       E T3, TB, T6, Tv, Tn, Ts, Tk, Tt, Td, Ty, T9, Tw, Tg, Tz, T1;
+	       E T2;
+	       T1 = R0[0];
+	       T2 = R1[WS(rs, 3)];
+	       T3 = T1 - T2;
+	       TB = T1 + T2;
+	       {
+		    E T4, T5, Tl, Tm;
+		    T4 = R0[WS(rs, 2)];
+		    T5 = R1[WS(rs, 5)];
+		    T6 = T4 - T5;
+		    Tv = T4 + T5;
+		    Tl = R0[WS(rs, 6)];
+		    Tm = R1[WS(rs, 2)];
+		    Tn = Tl - Tm;
+		    Ts = Tl + Tm;
+	       }
+	       {
+		    E Ti, Tj, Tb, Tc;
+		    Ti = R0[WS(rs, 1)];
+		    Tj = R1[WS(rs, 4)];
+		    Tk = Ti - Tj;
+		    Tt = Ti + Tj;
+		    Tb = R0[WS(rs, 3)];
+		    Tc = R1[WS(rs, 6)];
+		    Td = Tb - Tc;
+		    Ty = Tb + Tc;
+	       }
+	       {
+		    E T7, T8, Te, Tf;
+		    T7 = R0[WS(rs, 5)];
+		    T8 = R1[WS(rs, 1)];
+		    T9 = T7 - T8;
+		    Tw = T7 + T8;
+		    Te = R0[WS(rs, 4)];
+		    Tf = R1[0];
+		    Tg = Te - Tf;
+		    Tz = Te + Tf;
+	       }
+	       {
+		    E Tp, Tr, Tq, Ta, To, Th;
+		    Tp = Tn - Tk;
+		    Tr = Tg - Td;
+		    Tq = T9 - T6;
+		    Ci[WS(csi, 1)] = FMA(KP781831482, Tp, KP974927912 * Tq) + (KP433883739 * Tr);
+		    Ci[WS(csi, 5)] = FMA(KP433883739, Tq, KP781831482 * Tr) - (KP974927912 * Tp);
+		    Ci[WS(csi, 3)] = FMA(KP433883739, Tp, KP974927912 * Tr) - (KP781831482 * Tq);
+		    Ta = T6 + T9;
+		    To = Tk + Tn;
+		    Th = Td + Tg;
+		    Cr[WS(csr, 3)] = FMA(KP623489801, Ta, T3) + FNMA(KP222520933, Th, KP900968867 * To);
+		    Cr[WS(csr, 7)] = T3 + To + Ta + Th;
+		    Cr[WS(csr, 1)] = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
+		    Cr[WS(csr, 5)] = FMA(KP623489801, Th, T3) + FNMA(KP900968867, Ta, KP222520933 * To);
+	       }
+	       {
+		    E Tu, TA, Tx, TC, TE, TD;
+		    Tu = Ts - Tt;
+		    TA = Ty - Tz;
+		    Tx = Tv - Tw;
+		    Ci[WS(csi, 2)] = FMA(KP974927912, Tu, KP433883739 * Tx) + (KP781831482 * TA);
+		    Ci[WS(csi, 6)] = FMA(KP974927912, Tx, KP433883739 * TA) - (KP781831482 * Tu);
+		    Ci[WS(csi, 4)] = FNMS(KP781831482, Tx, KP974927912 * TA) - (KP433883739 * Tu);
+		    TC = Tt + Ts;
+		    TE = Tv + Tw;
+		    TD = Ty + Tz;
+		    Cr[WS(csr, 6)] = FMA(KP623489801, TC, TB) + FNMA(KP900968867, TD, KP222520933 * TE);
+		    Cr[WS(csr, 2)] = FMA(KP623489801, TD, TB) + FNMA(KP900968867, TE, KP222520933 * TC);
+		    Cr[WS(csr, 4)] = FMA(KP623489801, TE, TB) + FNMA(KP222520933, TD, KP900968867 * TC);
+		    Cr[0] = TB + TC + TE + TD;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 14, "r2cf_14", {38, 12, 24, 0}, &GENUS };
+
+void X(codelet_r2cf_14) (planner *p) {
+     X(kr2c_register) (p, r2cf_14, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_15.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_15.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:46 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include r2cf.h */
+
+/*
+ * This function contains 64 FP additions, 35 FP multiplications,
+ * (or, 36 additions, 7 multiplications, 28 fused multiply/add),
+ * 50 stack variables, 8 constants, and 30 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP910592997, +0.910592997310029334643087372129977886038870291);
+     DK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E Tw, Tz, Tp, Ty;
+	       {
+		    E Ti, TF, TR, TN, TX, T11, TM, TS, Tl, TH, Tf, To, TT, TD, Tg;
+		    E Th;
+		    TD = R0[0];
+		    Tg = R0[WS(rs, 5)];
+		    Th = R1[WS(rs, 2)];
+		    {
+			 E Tj, Tq, Tt, Tm, T3, Tk, T4, Ta, Tr, Td, Tu, T5, TE;
+			 Tj = R1[WS(rs, 1)];
+			 Tq = R0[WS(rs, 3)];
+			 Tt = R1[WS(rs, 4)];
+			 TE = Th + Tg;
+			 Ti = Tg - Th;
+			 Tm = R0[WS(rs, 6)];
+			 {
+			      E T8, T9, T1, T2, Tb, Tc;
+			      T1 = R0[WS(rs, 4)];
+			      T2 = R1[WS(rs, 6)];
+			      TF = FNMS(KP500000000, TE, TD);
+			      TR = TD + TE;
+			      T8 = R1[WS(rs, 5)];
+			      T3 = T1 - T2;
+			      Tk = T1 + T2;
+			      T9 = R1[0];
+			      Tb = R0[WS(rs, 7)];
+			      Tc = R0[WS(rs, 2)];
+			      T4 = R0[WS(rs, 1)];
+			      Ta = T8 - T9;
+			      Tr = T8 + T9;
+			      Td = Tb - Tc;
+			      Tu = Tb + Tc;
+			      T5 = R1[WS(rs, 3)];
+			 }
+			 {
+			      E Ts, Tv, Te, Tn, T7, T6, TV, TW;
+			      TV = Tq + Tr;
+			      Ts = FNMS(KP500000000, Tr, Tq);
+			      Tv = FNMS(KP500000000, Tu, Tt);
+			      TW = Tt + Tu;
+			      Te = Ta + Td;
+			      TN = Td - Ta;
+			      Tn = T4 + T5;
+			      T6 = T4 - T5;
+			      TX = TV + TW;
+			      T11 = TW - TV;
+			      TM = T6 - T3;
+			      T7 = T3 + T6;
+			      TS = Tj + Tk;
+			      Tl = FNMS(KP500000000, Tk, Tj);
+			      TH = Ts + Tv;
+			      Tw = Ts - Tv;
+			      Tz = Te - T7;
+			      Tf = T7 + Te;
+			      To = FNMS(KP500000000, Tn, Tm);
+			      TT = Tm + Tn;
+			 }
+		    }
+		    {
+			 E TO, TQ, TU, T12, TK, TI, TG;
+			 Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
+			 TG = Tl + To;
+			 Tp = Tl - To;
+			 TO = FMA(KP618033988, TN, TM);
+			 TQ = FNMS(KP618033988, TM, TN);
+			 TU = TS + TT;
+			 T12 = TS - TT;
+			 TK = TG - TH;
+			 TI = TG + TH;
+			 {
+			      E T10, TY, TL, TP, TJ, TZ;
+			      T10 = TU - TX;
+			      TY = TU + TX;
+			      Cr[WS(csr, 5)] = TF + TI;
+			      TJ = FNMS(KP250000000, TI, TF);
+			      Ci[WS(csi, 6)] = -(KP951056516 * (FNMS(KP618033988, T11, T12)));
+			      Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, T12, T11));
+			      TL = FMA(KP559016994, TK, TJ);
+			      TP = FNMS(KP559016994, TK, TJ);
+			      Cr[0] = TR + TY;
+			      TZ = FNMS(KP250000000, TY, TR);
+			      Cr[WS(csr, 4)] = FNMS(KP823639103, TO, TL);
+			      Cr[WS(csr, 1)] = FMA(KP823639103, TO, TL);
+			      Cr[WS(csr, 7)] = FNMS(KP823639103, TQ, TP);
+			      Cr[WS(csr, 2)] = FMA(KP823639103, TQ, TP);
+			      Cr[WS(csr, 6)] = FMA(KP559016994, T10, TZ);
+			      Cr[WS(csr, 3)] = FNMS(KP559016994, T10, TZ);
+			      Ty = FMA(KP250000000, Tf, Ti);
+			 }
+		    }
+	       }
+	       {
+		    E TB, Tx, TC, TA;
+		    TB = FNMS(KP618033988, Tp, Tw);
+		    Tx = FMA(KP618033988, Tw, Tp);
+		    TC = FNMS(KP559016994, Tz, Ty);
+		    TA = FMA(KP559016994, Tz, Ty);
+		    Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP910592997, TC, TB));
+		    Ci[WS(csi, 7)] = KP951056516 * (FMA(KP910592997, TC, TB));
+		    Ci[WS(csi, 4)] = KP951056516 * (FMA(KP910592997, TA, Tx));
+		    Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP910592997, TA, Tx)));
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cf_15", {36, 7, 28, 0}, &GENUS };
+
+void X(codelet_r2cf_15) (planner *p) {
+     X(kr2c_register) (p, r2cf_15, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include r2cf.h */
+
+/*
+ * This function contains 64 FP additions, 25 FP multiplications,
+ * (or, 50 additions, 11 multiplications, 14 fused multiply/add),
+ * 47 stack variables, 10 constants, and 30 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP484122918, +0.484122918275927110647408174972799951354115213);
+     DK(KP216506350, +0.216506350946109661690930792688234045867850657);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP509036960, +0.509036960455127183450980863393907648510733164);
+     DK(KP823639103, +0.823639103546331925877420039278190003029660514);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
+	       E Ti, TR, TL, TD, TE, T7, Te, Tf, TV, TW, TX, Tv, Ty, TH, To;
+	       E Tr, TG, TS, TT, TU;
+	       {
+		    E TJ, Tg, Th, TK;
+		    TJ = R0[0];
+		    Tg = R0[WS(rs, 5)];
+		    Th = R1[WS(rs, 2)];
+		    TK = Th + Tg;
+		    Ti = Tg - Th;
+		    TR = TJ + TK;
+		    TL = FNMS(KP500000000, TK, TJ);
+	       }
+	       {
+		    E Tm, Tt, Tw, Tp, T3, Tx, Ta, Tn, Td, Tq, T6, Tu;
+		    Tm = R1[WS(rs, 1)];
+		    Tt = R0[WS(rs, 3)];
+		    Tw = R1[WS(rs, 4)];
+		    Tp = R0[WS(rs, 6)];
+		    {
+			 E T1, T2, T8, T9;
+			 T1 = R0[WS(rs, 7)];
+			 T2 = R0[WS(rs, 2)];
+			 T3 = T1 - T2;
+			 Tx = T1 + T2;
+			 T8 = R1[WS(rs, 6)];
+			 T9 = R0[WS(rs, 4)];
+			 Ta = T8 - T9;
+			 Tn = T9 + T8;
+		    }
+		    {
+			 E Tb, Tc, T4, T5;
+			 Tb = R1[WS(rs, 3)];
+			 Tc = R0[WS(rs, 1)];
+			 Td = Tb - Tc;
+			 Tq = Tc + Tb;
+			 T4 = R1[0];
+			 T5 = R1[WS(rs, 5)];
+			 T6 = T4 - T5;
+			 Tu = T5 + T4;
+		    }
+		    TD = Ta - Td;
+		    TE = T6 + T3;
+		    T7 = T3 - T6;
+		    Te = Ta + Td;
+		    Tf = T7 - Te;
+		    TV = Tt + Tu;
+		    TW = Tw + Tx;
+		    TX = TV + TW;
+		    Tv = FNMS(KP500000000, Tu, Tt);
+		    Ty = FNMS(KP500000000, Tx, Tw);
+		    TH = Tv + Ty;
+		    To = FNMS(KP500000000, Tn, Tm);
+		    Tr = FNMS(KP500000000, Tq, Tp);
+		    TG = To + Tr;
+		    TS = Tm + Tn;
+		    TT = Tp + Tq;
+		    TU = TS + TT;
+	       }
+	       Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
+	       {
+		    E TF, TP, TI, TM, TN, TQ, TO;
+		    TF = FMA(KP823639103, TD, KP509036960 * TE);
+		    TP = FNMS(KP509036960, TD, KP823639103 * TE);
+		    TI = KP559016994 * (TG - TH);
+		    TM = TG + TH;
+		    TN = FNMS(KP250000000, TM, TL);
+		    Cr[WS(csr, 5)] = TL + TM;
+		    TQ = TN - TI;
+		    Cr[WS(csr, 2)] = TP + TQ;
+		    Cr[WS(csr, 7)] = TQ - TP;
+		    TO = TI + TN;
+		    Cr[WS(csr, 1)] = TF + TO;
+		    Cr[WS(csr, 4)] = TO - TF;
+	       }
+	       {
+		    E T11, T12, T10, TY, TZ;
+		    T11 = TS - TT;
+		    T12 = TW - TV;
+		    Ci[WS(csi, 3)] = FMA(KP587785252, T11, KP951056516 * T12);
+		    Ci[WS(csi, 6)] = FNMS(KP951056516, T11, KP587785252 * T12);
+		    T10 = KP559016994 * (TU - TX);
+		    TY = TU + TX;
+		    TZ = FNMS(KP250000000, TY, TR);
+		    Cr[WS(csr, 3)] = TZ - T10;
+		    Cr[0] = TR + TY;
+		    Cr[WS(csr, 6)] = T10 + TZ;
+		    {
+			 E Tl, TB, TA, TC;
+			 {
+			      E Tj, Tk, Ts, Tz;
+			      Tj = FMA(KP866025403, Ti, KP216506350 * Tf);
+			      Tk = KP484122918 * (Te + T7);
+			      Tl = Tj + Tk;
+			      TB = Tk - Tj;
+			      Ts = To - Tr;
+			      Tz = Tv - Ty;
+			      TA = FMA(KP951056516, Ts, KP587785252 * Tz);
+			      TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
+			 }
+			 Ci[WS(csi, 1)] = Tl - TA;
+			 Ci[WS(csi, 7)] = TC - TB;
+			 Ci[WS(csi, 4)] = Tl + TA;
+			 Ci[WS(csi, 2)] = TB + TC;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 15, "r2cf_15", {50, 11, 14, 0}, &GENUS };
+
+void X(codelet_r2cf_15) (planner *p) {
+     X(kr2c_register) (p, r2cf_15, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:47 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include r2cf.h */
+
+/*
+ * This function contains 58 FP additions, 20 FP multiplications,
+ * (or, 38 additions, 0 multiplications, 20 fused multiply/add),
+ * 38 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E TQ, TP;
+	       {
+		    E TB, TN, Tf, T7, Te, Tv, TO, TE, Tq, TJ, Tp, TI, TT, Ty, Tm;
+		    E Tr, TK, Ts;
+		    {
+			 E TC, Ta, Td, TD;
+			 {
+			      E T1, T2, T4, T5;
+			      T1 = R0[0];
+			      T2 = R0[WS(rs, 4)];
+			      T4 = R0[WS(rs, 2)];
+			      T5 = R0[WS(rs, 6)];
+			      {
+				   E T8, T3, T6, T9, Tb, Tc;
+				   T8 = R0[WS(rs, 1)];
+				   TB = T1 - T2;
+				   T3 = T1 + T2;
+				   TN = T4 - T5;
+				   T6 = T4 + T5;
+				   T9 = R0[WS(rs, 5)];
+				   Tb = R0[WS(rs, 7)];
+				   Tc = R0[WS(rs, 3)];
+				   Tf = T3 - T6;
+				   T7 = T3 + T6;
+				   TC = T8 - T9;
+				   Ta = T8 + T9;
+				   Td = Tb + Tc;
+				   TD = Tb - Tc;
+			      }
+			 }
+			 {
+			      E TG, Ti, Tj, Tk, Tg, Th;
+			      Tg = R1[0];
+			      Th = R1[WS(rs, 4)];
+			      Te = Ta + Td;
+			      Tv = Td - Ta;
+			      TO = TD - TC;
+			      TE = TC + TD;
+			      TG = Tg - Th;
+			      Ti = Tg + Th;
+			      Tj = R1[WS(rs, 2)];
+			      Tk = R1[WS(rs, 6)];
+			      {
+				   E Tn, To, TH, Tl;
+				   Tn = R1[WS(rs, 7)];
+				   To = R1[WS(rs, 3)];
+				   Tq = R1[WS(rs, 1)];
+				   TH = Tj - Tk;
+				   Tl = Tj + Tk;
+				   TJ = Tn - To;
+				   Tp = Tn + To;
+				   TI = FNMS(KP414213562, TH, TG);
+				   TT = FMA(KP414213562, TG, TH);
+				   Ty = Ti + Tl;
+				   Tm = Ti - Tl;
+				   Tr = R1[WS(rs, 5)];
+			      }
+			 }
+		    }
+		    Cr[WS(csr, 4)] = T7 - Te;
+		    TK = Tr - Tq;
+		    Ts = Tq + Tr;
+		    {
+			 E Tx, TV, TF, TS, Tz, Tt, TM, TL;
+			 Tx = T7 + Te;
+			 TV = FNMS(KP707106781, TE, TB);
+			 TF = FMA(KP707106781, TE, TB);
+			 TL = FNMS(KP414213562, TK, TJ);
+			 TS = FMA(KP414213562, TJ, TK);
+			 Tz = Tp + Ts;
+			 Tt = Tp - Ts;
+			 TM = TI + TL;
+			 TQ = TL - TI;
+			 {
+			      E TR, TU, TW, TA, Tw, Tu;
+			      TP = FMA(KP707106781, TO, TN);
+			      TR = FNMS(KP707106781, TO, TN);
+			      TA = Ty + Tz;
+			      Ci[WS(csi, 4)] = Tz - Ty;
+			      Tw = Tt - Tm;
+			      Tu = Tm + Tt;
+			      Cr[WS(csr, 1)] = FMA(KP923879532, TM, TF);
+			      Cr[WS(csr, 7)] = FNMS(KP923879532, TM, TF);
+			      Cr[0] = Tx + TA;
+			      Cr[WS(csr, 8)] = Tx - TA;
+			      Ci[WS(csi, 6)] = FMS(KP707106781, Tw, Tv);
+			      Ci[WS(csi, 2)] = FMA(KP707106781, Tw, Tv);
+			      Cr[WS(csr, 2)] = FMA(KP707106781, Tu, Tf);
+			      Cr[WS(csr, 6)] = FNMS(KP707106781, Tu, Tf);
+			      TU = TS - TT;
+			      TW = TT + TS;
+			      Ci[WS(csi, 7)] = FMA(KP923879532, TU, TR);
+			      Ci[WS(csi, 1)] = FMS(KP923879532, TU, TR);
+			      Cr[WS(csr, 3)] = FMA(KP923879532, TW, TV);
+			      Cr[WS(csr, 5)] = FNMS(KP923879532, TW, TV);
+			 }
+		    }
+	       }
+	       Ci[WS(csi, 5)] = FMS(KP923879532, TQ, TP);
+	       Ci[WS(csi, 3)] = FMA(KP923879532, TQ, TP);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cf_16", {38, 0, 20, 0}, &GENUS };
+
+void X(codelet_r2cf_16) (planner *p) {
+     X(kr2c_register) (p, r2cf_16, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include r2cf.h */
+
+/*
+ * This function contains 58 FP additions, 12 FP multiplications,
+ * (or, 54 additions, 8 multiplications, 4 fused multiply/add),
+ * 34 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
+	       E T3, T6, T7, Tz, Ti, Ta, Td, Te, TA, Th, Tq, TV, TF, TP, Tx;
+	       E TU, TE, TM, Tg, Tf, TJ, TQ;
+	       {
+		    E T1, T2, T4, T5;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 4)];
+		    T3 = T1 + T2;
+		    T4 = R0[WS(rs, 2)];
+		    T5 = R0[WS(rs, 6)];
+		    T6 = T4 + T5;
+		    T7 = T3 + T6;
+		    Tz = T1 - T2;
+		    Ti = T4 - T5;
+	       }
+	       {
+		    E T8, T9, Tb, Tc;
+		    T8 = R0[WS(rs, 1)];
+		    T9 = R0[WS(rs, 5)];
+		    Ta = T8 + T9;
+		    Tg = T8 - T9;
+		    Tb = R0[WS(rs, 7)];
+		    Tc = R0[WS(rs, 3)];
+		    Td = Tb + Tc;
+		    Tf = Tb - Tc;
+	       }
+	       Te = Ta + Td;
+	       TA = KP707106781 * (Tg + Tf);
+	       Th = KP707106781 * (Tf - Tg);
+	       {
+		    E Tm, TN, Tp, TO;
+		    {
+			 E Tk, Tl, Tn, To;
+			 Tk = R1[WS(rs, 7)];
+			 Tl = R1[WS(rs, 3)];
+			 Tm = Tk - Tl;
+			 TN = Tk + Tl;
+			 Tn = R1[WS(rs, 1)];
+			 To = R1[WS(rs, 5)];
+			 Tp = Tn - To;
+			 TO = Tn + To;
+		    }
+		    Tq = FNMS(KP923879532, Tp, KP382683432 * Tm);
+		    TV = TN + TO;
+		    TF = FMA(KP923879532, Tm, KP382683432 * Tp);
+		    TP = TN - TO;
+	       }
+	       {
+		    E Tt, TK, Tw, TL;
+		    {
+			 E Tr, Ts, Tu, Tv;
+			 Tr = R1[0];
+			 Ts = R1[WS(rs, 4)];
+			 Tt = Tr - Ts;
+			 TK = Tr + Ts;
+			 Tu = R1[WS(rs, 2)];
+			 Tv = R1[WS(rs, 6)];
+			 Tw = Tu - Tv;
+			 TL = Tu + Tv;
+		    }
+		    Tx = FMA(KP382683432, Tt, KP923879532 * Tw);
+		    TU = TK + TL;
+		    TE = FNMS(KP382683432, Tw, KP923879532 * Tt);
+		    TM = TK - TL;
+	       }
+	       Cr[WS(csr, 4)] = T7 - Te;
+	       Ci[WS(csi, 4)] = TV - TU;
+	       {
+		    E Tj, Ty, TD, TG;
+		    Tj = Th - Ti;
+		    Ty = Tq - Tx;
+		    Ci[WS(csi, 1)] = Tj + Ty;
+		    Ci[WS(csi, 7)] = Ty - Tj;
+		    TD = Tz + TA;
+		    TG = TE + TF;
+		    Cr[WS(csr, 7)] = TD - TG;
+		    Cr[WS(csr, 1)] = TD + TG;
+	       }
+	       {
+		    E TB, TC, TH, TI;
+		    TB = Tz - TA;
+		    TC = Tx + Tq;
+		    Cr[WS(csr, 5)] = TB - TC;
+		    Cr[WS(csr, 3)] = TB + TC;
+		    TH = Ti + Th;
+		    TI = TF - TE;
+		    Ci[WS(csi, 3)] = TH + TI;
+		    Ci[WS(csi, 5)] = TI - TH;
+	       }
+	       TJ = T3 - T6;
+	       TQ = KP707106781 * (TM + TP);
+	       Cr[WS(csr, 6)] = TJ - TQ;
+	       Cr[WS(csr, 2)] = TJ + TQ;
+	       {
+		    E TR, TS, TT, TW;
+		    TR = Td - Ta;
+		    TS = KP707106781 * (TP - TM);
+		    Ci[WS(csi, 2)] = TR + TS;
+		    Ci[WS(csi, 6)] = TS - TR;
+		    TT = T7 + Te;
+		    TW = TU + TV;
+		    Cr[WS(csr, 8)] = TT - TW;
+		    Cr[0] = TT + TW;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 16, "r2cf_16", {54, 8, 4, 0}, &GENUS };
+
+void X(codelet_r2cf_16) (planner *p) {
+     X(kr2c_register) (p, r2cf_16, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include r2cf.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       Cr[0] = T1 + T2;
+	       Cr[WS(csr, 1)] = T1 - T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cf_2", {2, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cf_2) (planner *p) {
+     X(kr2c_register) (p, r2cf_2, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include r2cf.h */
+
+/*
+ * This function contains 2 FP additions, 0 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 0 fused multiply/add),
+ * 3 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
+	       E T1, T2;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       Cr[WS(csr, 1)] = T1 - T2;
+	       Cr[0] = T1 + T2;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 2, "r2cf_2", {2, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cf_2) (planner *p) {
+     X(kr2c_register) (p, r2cf_2, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:48 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
+
+/*
+ * This function contains 86 FP additions, 32 FP multiplications,
+ * (or, 58 additions, 4 multiplications, 28 fused multiply/add),
+ * 70 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E T1i, T1c, T1a, T1o, T1m, T1h, T1b, T13, T1j, T1n;
+	       {
+		    E T3, T1d, TJ, TV, T1k, T16, T19, T1l, Ty, Ti, T12, TD, T1g, TR, TX;
+		    E TK, Tt, TU, TW, TL, TE;
+		    {
+			 E T1, T2, TG, TH;
+			 T1 = R0[0];
+			 T2 = R0[WS(rs, 5)];
+			 TG = R1[WS(rs, 2)];
+			 TH = R1[WS(rs, 7)];
+			 {
+			      E T6, To, T17, Tx, T18, TC, Tj, T9, Tp, Tu, Td, T15, Tm, Tq, Te;
+			      E Tf;
+			      {
+				   E TA, TB, T7, T8;
+				   {
+					E T4, TF, TI, T5, Tv, Tw;
+					T4 = R0[WS(rs, 2)];
+					T3 = T1 - T2;
+					TF = T1 + T2;
+					T1d = TG - TH;
+					TI = TG + TH;
+					T5 = R0[WS(rs, 7)];
+					Tv = R1[WS(rs, 6)];
+					Tw = R1[WS(rs, 1)];
+					TJ = TF - TI;
+					TV = TF + TI;
+					T6 = T4 - T5;
+					To = T4 + T5;
+					T17 = Tw - Tv;
+					Tx = Tv + Tw;
+				   }
+				   TA = R1[WS(rs, 8)];
+				   TB = R1[WS(rs, 3)];
+				   T7 = R0[WS(rs, 8)];
+				   T8 = R0[WS(rs, 3)];
+				   {
+					E Tb, Tc, Tk, Tl;
+					Tb = R0[WS(rs, 4)];
+					T18 = TB - TA;
+					TC = TA + TB;
+					Tj = T7 + T8;
+					T9 = T7 - T8;
+					Tc = R0[WS(rs, 9)];
+					Tk = R1[0];
+					Tl = R1[WS(rs, 5)];
+					Tp = R1[WS(rs, 4)];
+					Tu = Tb + Tc;
+					Td = Tb - Tc;
+					T15 = Tl - Tk;
+					Tm = Tk + Tl;
+					Tq = R1[WS(rs, 9)];
+					Te = R0[WS(rs, 6)];
+					Tf = R0[WS(rs, 1)];
+				   }
+			      }
+			      {
+				   E Ta, Tr, Tz, T1e, T1f, Th, T14, Tg, TP, TQ;
+				   Ta = T6 + T9;
+				   T1k = T6 - T9;
+				   T14 = Tq - Tp;
+				   Tr = Tp + Tq;
+				   Tz = Te + Tf;
+				   Tg = Te - Tf;
+				   T16 = T14 - T15;
+				   T1e = T14 + T15;
+				   T1f = T17 + T18;
+				   T19 = T17 - T18;
+				   Th = Td + Tg;
+				   T1l = Td - Tg;
+				   Ty = Tu - Tx;
+				   TP = Tu + Tx;
+				   Ti = Ta + Th;
+				   T12 = Ta - Th;
+				   TD = Tz - TC;
+				   TQ = Tz + TC;
+				   T1g = T1e + T1f;
+				   T1i = T1e - T1f;
+				   {
+					E TT, Tn, Ts, TS;
+					TT = Tj + Tm;
+					Tn = Tj - Tm;
+					Ts = To - Tr;
+					TS = To + Tr;
+					TR = TP - TQ;
+					TX = TP + TQ;
+					TK = Ts + Tn;
+					Tt = Tn - Ts;
+					TU = TS - TT;
+					TW = TS + TT;
+				   }
+			      }
+			 }
+		    }
+		    Cr[WS(csr, 5)] = T3 + Ti;
+		    Ci[WS(csi, 5)] = T1g - T1d;
+		    TL = Ty + TD;
+		    TE = Ty - TD;
+		    {
+			 E TY, T10, TM, TO, T11, TZ, TN;
+			 TY = TW + TX;
+			 T10 = TW - TX;
+			 Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, Tt, TE));
+			 Ci[WS(csi, 6)] = KP951056516 * (FNMS(KP618033988, TE, Tt));
+			 Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, TR, TU));
+			 Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP618033988, TU, TR)));
+			 TM = TK + TL;
+			 TO = TK - TL;
+			 T1c = FNMS(KP618033988, T16, T19);
+			 T1a = FMA(KP618033988, T19, T16);
+			 Cr[0] = TV + TY;
+			 TZ = FNMS(KP250000000, TY, TV);
+			 Cr[WS(csr, 10)] = TJ + TM;
+			 TN = FNMS(KP250000000, TM, TJ);
+			 Cr[WS(csr, 8)] = FNMS(KP559016994, T10, TZ);
+			 Cr[WS(csr, 4)] = FMA(KP559016994, T10, TZ);
+			 Cr[WS(csr, 6)] = FMA(KP559016994, TO, TN);
+			 Cr[WS(csr, 2)] = FNMS(KP559016994, TO, TN);
+			 T11 = FNMS(KP250000000, Ti, T3);
+			 T1o = FNMS(KP618033988, T1k, T1l);
+			 T1m = FMA(KP618033988, T1l, T1k);
+			 T1h = FMA(KP250000000, T1g, T1d);
+			 T1b = FNMS(KP559016994, T12, T11);
+			 T13 = FMA(KP559016994, T12, T11);
+		    }
+	       }
+	       Cr[WS(csr, 3)] = FNMS(KP951056516, T1c, T1b);
+	       Cr[WS(csr, 7)] = FMA(KP951056516, T1c, T1b);
+	       Cr[WS(csr, 1)] = FMA(KP951056516, T1a, T13);
+	       Cr[WS(csr, 9)] = FNMS(KP951056516, T1a, T13);
+	       T1j = FNMS(KP559016994, T1i, T1h);
+	       T1n = FMA(KP559016994, T1i, T1h);
+	       Ci[WS(csi, 3)] = FNMS(KP951056516, T1o, T1n);
+	       Ci[WS(csi, 7)] = FMA(KP951056516, T1o, T1n);
+	       Ci[WS(csi, 9)] = FMS(KP951056516, T1m, T1j);
+	       Ci[WS(csi, 1)] = -(FMA(KP951056516, T1m, T1j));
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cf_20", {58, 4, 28, 0}, &GENUS };
+
+void X(codelet_r2cf_20) (planner *p) {
+     X(kr2c_register) (p, r2cf_20, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
+
+/*
+ * This function contains 86 FP additions, 24 FP multiplications,
+ * (or, 74 additions, 12 multiplications, 12 fused multiply/add),
+ * 51 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
+	       E T3, T1m, TF, T17, Ts, TM, TN, Tz, Ta, Th, Ti, T1g, T1h, T1k, T10;
+	       E T13, T19, TG, TH, TI, T1d, T1e, T1j, TT, TW, T18;
+	       {
+		    E T1, T2, T15, TD, TE, T16;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 5)];
+		    T15 = T1 + T2;
+		    TD = R1[WS(rs, 7)];
+		    TE = R1[WS(rs, 2)];
+		    T16 = TE + TD;
+		    T3 = T1 - T2;
+		    T1m = T15 + T16;
+		    TF = TD - TE;
+		    T17 = T15 - T16;
+	       }
+	       {
+		    E T6, TU, Tv, T12, Ty, TZ, T9, TR, Td, TY, To, TS, Tr, TV, Tg;
+		    E T11;
+		    {
+			 E T4, T5, Tt, Tu;
+			 T4 = R0[WS(rs, 2)];
+			 T5 = R0[WS(rs, 7)];
+			 T6 = T4 - T5;
+			 TU = T4 + T5;
+			 Tt = R1[WS(rs, 8)];
+			 Tu = R1[WS(rs, 3)];
+			 Tv = Tt - Tu;
+			 T12 = Tt + Tu;
+		    }
+		    {
+			 E Tw, Tx, T7, T8;
+			 Tw = R1[WS(rs, 6)];
+			 Tx = R1[WS(rs, 1)];
+			 Ty = Tw - Tx;
+			 TZ = Tw + Tx;
+			 T7 = R0[WS(rs, 8)];
+			 T8 = R0[WS(rs, 3)];
+			 T9 = T7 - T8;
+			 TR = T7 + T8;
+		    }
+		    {
+			 E Tb, Tc, Tm, Tn;
+			 Tb = R0[WS(rs, 4)];
+			 Tc = R0[WS(rs, 9)];
+			 Td = Tb - Tc;
+			 TY = Tb + Tc;
+			 Tm = R1[0];
+			 Tn = R1[WS(rs, 5)];
+			 To = Tm - Tn;
+			 TS = Tm + Tn;
+		    }
+		    {
+			 E Tp, Tq, Te, Tf;
+			 Tp = R1[WS(rs, 4)];
+			 Tq = R1[WS(rs, 9)];
+			 Tr = Tp - Tq;
+			 TV = Tp + Tq;
+			 Te = R0[WS(rs, 6)];
+			 Tf = R0[WS(rs, 1)];
+			 Tg = Te - Tf;
+			 T11 = Te + Tf;
+		    }
+		    Ts = To - Tr;
+		    TM = T6 - T9;
+		    TN = Td - Tg;
+		    Tz = Tv - Ty;
+		    Ta = T6 + T9;
+		    Th = Td + Tg;
+		    Ti = Ta + Th;
+		    T1g = TY + TZ;
+		    T1h = T11 + T12;
+		    T1k = T1g + T1h;
+		    T10 = TY - TZ;
+		    T13 = T11 - T12;
+		    T19 = T10 + T13;
+		    TG = Tr + To;
+		    TH = Ty + Tv;
+		    TI = TG + TH;
+		    T1d = TU + TV;
+		    T1e = TR + TS;
+		    T1j = T1d + T1e;
+		    TT = TR - TS;
+		    TW = TU - TV;
+		    T18 = TW + TT;
+	       }
+	       Cr[WS(csr, 5)] = T3 + Ti;
+	       Ci[WS(csi, 5)] = TF - TI;
+	       {
+		    E TX, T14, T1f, T1i;
+		    TX = TT - TW;
+		    T14 = T10 - T13;
+		    Ci[WS(csi, 6)] = FNMS(KP587785252, T14, KP951056516 * TX);
+		    Ci[WS(csi, 2)] = FMA(KP587785252, TX, KP951056516 * T14);
+		    T1f = T1d - T1e;
+		    T1i = T1g - T1h;
+		    Ci[WS(csi, 8)] = FNMS(KP951056516, T1i, KP587785252 * T1f);
+		    Ci[WS(csi, 4)] = FMA(KP951056516, T1f, KP587785252 * T1i);
+	       }
+	       {
+		    E T1l, T1n, T1o, T1c, T1a, T1b;
+		    T1l = KP559016994 * (T1j - T1k);
+		    T1n = T1j + T1k;
+		    T1o = FNMS(KP250000000, T1n, T1m);
+		    Cr[WS(csr, 4)] = T1l + T1o;
+		    Cr[0] = T1m + T1n;
+		    Cr[WS(csr, 8)] = T1o - T1l;
+		    T1c = KP559016994 * (T18 - T19);
+		    T1a = T18 + T19;
+		    T1b = FNMS(KP250000000, T1a, T17);
+		    Cr[WS(csr, 2)] = T1b - T1c;
+		    Cr[WS(csr, 10)] = T17 + T1a;
+		    Cr[WS(csr, 6)] = T1c + T1b;
+	       }
+	       {
+		    E TA, TC, Tl, TB, Tj, Tk;
+		    TA = FMA(KP951056516, Ts, KP587785252 * Tz);
+		    TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
+		    Tj = KP559016994 * (Ta - Th);
+		    Tk = FNMS(KP250000000, Ti, T3);
+		    Tl = Tj + Tk;
+		    TB = Tk - Tj;
+		    Cr[WS(csr, 9)] = Tl - TA;
+		    Cr[WS(csr, 7)] = TB + TC;
+		    Cr[WS(csr, 1)] = Tl + TA;
+		    Cr[WS(csr, 3)] = TB - TC;
+	       }
+	       {
+		    E TO, TQ, TL, TP, TJ, TK;
+		    TO = FMA(KP951056516, TM, KP587785252 * TN);
+		    TQ = FNMS(KP587785252, TM, KP951056516 * TN);
+		    TJ = FMA(KP250000000, TI, TF);
+		    TK = KP559016994 * (TH - TG);
+		    TL = TJ + TK;
+		    TP = TK - TJ;
+		    Ci[WS(csi, 1)] = TL - TO;
+		    Ci[WS(csi, 7)] = TQ + TP;
+		    Ci[WS(csi, 9)] = TO + TL;
+		    Ci[WS(csi, 3)] = TP - TQ;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 20, "r2cf_20", {74, 12, 12, 0}, &GENUS };
+
+void X(codelet_r2cf_20) (planner *p) {
+     X(kr2c_register) (p, r2cf_20, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_25.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_25.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:48 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include r2cf.h */
+
+/*
+ * This function contains 200 FP additions, 168 FP multiplications,
+ * (or, 44 additions, 12 multiplications, 156 fused multiply/add),
+ * 157 stack variables, 66 constants, and 50 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP792626838, +0.792626838241819413632131824093538848057784557);
+     DK(KP876091699, +0.876091699473550838204498029706869638173524346);
+     DK(KP809385824, +0.809385824416008241660603814668679683846476688);
+     DK(KP860541664, +0.860541664367944677098261680920518816412804187);
+     DK(KP681693190, +0.681693190061530575150324149145440022633095390);
+     DK(KP560319534, +0.560319534973832390111614715371676131169633784);
+     DK(KP997675361, +0.997675361079556513670859573984492383596555031);
+     DK(KP237294955, +0.237294955877110315393888866460840817927895961);
+     DK(KP897376177, +0.897376177523557693138608077137219684419427330);
+     DK(KP923225144, +0.923225144846402650453449441572664695995209956);
+     DK(KP956723877, +0.956723877038460305821989399535483155872969262);
+     DK(KP949179823, +0.949179823508441261575555465843363271711583843);
+     DK(KP669429328, +0.669429328479476605641803240971985825917022098);
+     DK(KP570584518, +0.570584518783621657366766175430996792655723863);
+     DK(KP262346850, +0.262346850930607871785420028382979691334784273);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP906616052, +0.906616052148196230441134447086066874408359177);
+     DK(KP683113946, +0.683113946453479238701949862233725244439656928);
+     DK(KP559154169, +0.559154169276087864842202529084232643714075927);
+     DK(KP921078979, +0.921078979742360627699756128143719920817673854);
+     DK(KP904508497, +0.904508497187473712051146708591409529430077295);
+     DK(KP999754674, +0.999754674276473633366203429228112409535557487);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP242145790, +0.242145790282157779872542093866183953459003101);
+     DK(KP904730450, +0.904730450839922351881287709692877908104763647);
+     DK(KP845997307, +0.845997307939530944175097360758058292389769300);
+     DK(KP855719849, +0.855719849902058969314654733608091555096772472);
+     DK(KP982009705, +0.982009705009746369461829878184175962711969869);
+     DK(KP916574801, +0.916574801383451584742370439148878693530976769);
+     DK(KP690983005, +0.690983005625052575897706582817180941139845410);
+     DK(KP952936919, +0.952936919628306576880750665357914584765951388);
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP831864738, +0.831864738706457140726048799369896829771167132);
+     DK(KP803003575, +0.803003575438660414833440593570376004635464850);
+     DK(KP522616830, +0.522616830205754336872861364785224694908468440);
+     DK(KP829049696, +0.829049696159252993975487806364305442437946767);
+     DK(KP999544308, +0.999544308746292983948881682379742149196758193);
+     DK(KP772036680, +0.772036680810363904029489473607579825330539880);
+     DK(KP763932022, +0.763932022500210303590826331268723764559381640);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP447417479, +0.447417479732227551498980015410057305749330693);
+     DK(KP734762448, +0.734762448793050413546343770063151342619912334);
+     DK(KP894834959, +0.894834959464455102997960030820114611498661386);
+     DK(KP867381224, +0.867381224396525206773171885031575671309956167);
+     DK(KP958953096, +0.958953096729998668045963838399037225970891871);
+     DK(KP912575812, +0.912575812670962425556968549836277086778922727);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DK(KP244189809, +0.244189809627953270309879511234821255780225091);
+     DK(KP269969613, +0.269969613759572083574752974412347470060951301);
+     DK(KP522847744, +0.522847744331509716623755382187077770911012542);
+     DK(KP578046249, +0.578046249379945007321754579646815604023525655);
+     DK(KP603558818, +0.603558818296015001454675132653458027918768137);
+     DK(KP667278218, +0.667278218140296670899089292254759909713898805);
+     DK(KP447533225, +0.447533225982656890041886979663652563063114397);
+     DK(KP494780565, +0.494780565770515410344588413655324772219443730);
+     DK(KP987388751, +0.987388751065621252324603216482382109400433949);
+     DK(KP893101515, +0.893101515366181661711202267938416198338079437);
+     DK(KP132830569, +0.132830569247582714407653942074819768844536507);
+     DK(KP120146378, +0.120146378570687701782758537356596213647956445);
+     DK(KP059835404, +0.059835404262124915169548397419498386427871950);
+     DK(KP066152395, +0.066152395967733048213034281011006031460903353);
+     DK(KP786782374, +0.786782374965295178365099601674911834788448471);
+     DK(KP869845200, +0.869845200362138853122720822420327157933056305);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E T2H, T2w, T2x, T2A, T2C, T2v, T2M, T2y, T2B, T2N;
+	       {
+		    E T2u, TJ, T1O, T39, T2t, TB, T21, T1M, T2e, T26, T1B, T1r, T1k, T1c, T9;
+		    E T1X, T1R, T2k, T29, T1z, T1v, T1h, TX, Ti, T13, T2a, T2j, T1U, T1Y, TQ;
+		    E T1g, T1u, T1y, T12, Ts, T11, T1I;
+		    {
+			 E Tt, Tw, T16, Tx, Ty;
+			 {
+			      E T2p, TG, TH, TD, TE, TI, T2r;
+			      T2p = R0[0];
+			      TG = R0[WS(rs, 5)];
+			      TH = R1[WS(rs, 7)];
+			      TD = R1[WS(rs, 2)];
+			      TE = R0[WS(rs, 10)];
+			      Tt = R1[WS(rs, 1)];
+			      TI = TG - TH;
+			      T2r = TG + TH;
+			      {
+				   E TF, T2q, Tu, Tv, T2s;
+				   TF = TD - TE;
+				   T2q = TD + TE;
+				   Tu = R0[WS(rs, 4)];
+				   Tv = R1[WS(rs, 11)];
+				   T2u = T2q - T2r;
+				   T2s = T2q + T2r;
+				   TJ = FMA(KP618033988, TI, TF);
+				   T1O = FNMS(KP618033988, TF, TI);
+				   T39 = T2p + T2s;
+				   T2t = FNMS(KP250000000, T2s, T2p);
+				   Tw = Tu + Tv;
+				   T16 = Tv - Tu;
+				   Tx = R1[WS(rs, 6)];
+				   Ty = R0[WS(rs, 9)];
+			      }
+			 }
+			 {
+			      E T1P, TW, TS, TR;
+			      {
+				   E T1, T5, T1L, T18, T1a, TA, T4, TU, T6, T19;
+				   T1 = R0[WS(rs, 2)];
+				   {
+					E T2, T17, Tz, T3;
+					T2 = R1[WS(rs, 4)];
+					T17 = Tx - Ty;
+					Tz = Tx + Ty;
+					T3 = R0[WS(rs, 12)];
+					T5 = R0[WS(rs, 7)];
+					T1L = FMA(KP618033988, T16, T17);
+					T18 = FNMS(KP618033988, T17, T16);
+					T1a = Tz - Tw;
+					TA = Tw + Tz;
+					T4 = T2 + T3;
+					TU = T3 - T2;
+					T6 = R1[WS(rs, 9)];
+				   }
+				   TB = Tt + TA;
+				   T19 = FNMS(KP250000000, TA, Tt);
+				   {
+					E T7, TV, T1b, T1K, T8;
+					T7 = T5 + T6;
+					TV = T5 - T6;
+					T1b = FNMS(KP559016994, T1a, T19);
+					T1K = FMA(KP559016994, T1a, T19);
+					T1P = FMA(KP618033988, TU, TV);
+					TW = FNMS(KP618033988, TV, TU);
+					TS = T4 - T7;
+					T8 = T4 + T7;
+					T21 = FMA(KP869845200, T1K, T1L);
+					T1M = FNMS(KP786782374, T1L, T1K);
+					T2e = FMA(KP066152395, T1K, T1L);
+					T26 = FNMS(KP059835404, T1L, T1K);
+					T1B = FMA(KP120146378, T18, T1b);
+					T1r = FNMS(KP132830569, T1b, T18);
+					T1k = FMA(KP893101515, T18, T1b);
+					T1c = FNMS(KP987388751, T1b, T18);
+					T9 = T1 + T8;
+					TR = FMS(KP250000000, T8, T1);
+				   }
+			      }
+			      {
+				   E Ta, Te, TK, Td, Tf;
+				   Ta = R1[0];
+				   {
+					E Tb, Tc, T1Q, TT;
+					Tb = R0[WS(rs, 3)];
+					Tc = R1[WS(rs, 10)];
+					T1Q = FMA(KP559016994, TS, TR);
+					TT = FNMS(KP559016994, TS, TR);
+					Te = R1[WS(rs, 5)];
+					TK = Tb - Tc;
+					Td = Tb + Tc;
+					T1X = FNMS(KP120146378, T1P, T1Q);
+					T1R = FMA(KP132830569, T1Q, T1P);
+					T2k = FMA(KP494780565, T1Q, T1P);
+					T29 = FNMS(KP447533225, T1P, T1Q);
+					T1z = FMA(KP869845200, TT, TW);
+					T1v = FNMS(KP786782374, TW, TT);
+					T1h = FNMS(KP667278218, TT, TW);
+					TX = FMA(KP603558818, TW, TT);
+					Tf = R0[WS(rs, 8)];
+				   }
+				   {
+					E Tk, T1S, TM, TO, Tn, TZ, TN, T10, Tq, To, Th, Tp, TP, T1T, Tr;
+					Tk = R0[WS(rs, 1)];
+					{
+					     E Tl, TL, Tg, Tm;
+					     Tl = R1[WS(rs, 3)];
+					     TL = Tf - Te;
+					     Tg = Te + Tf;
+					     Tm = R0[WS(rs, 11)];
+					     To = R0[WS(rs, 6)];
+					     T1S = FMA(KP618033988, TK, TL);
+					     TM = FNMS(KP618033988, TL, TK);
+					     TO = Td - Tg;
+					     Th = Td + Tg;
+					     Tn = Tl + Tm;
+					     TZ = Tm - Tl;
+					     Tp = R1[WS(rs, 8)];
+					}
+					Ti = Ta + Th;
+					TN = FNMS(KP250000000, Th, Ta);
+					T10 = Tp - To;
+					Tq = To + Tp;
+					TP = FMA(KP559016994, TO, TN);
+					T1T = FNMS(KP559016994, TO, TN);
+					Tr = Tn + Tq;
+					T13 = Tn - Tq;
+					T2a = FMA(KP578046249, T1T, T1S);
+					T2j = FNMS(KP522847744, T1S, T1T);
+					T1U = FNMS(KP987388751, T1T, T1S);
+					T1Y = FMA(KP893101515, T1S, T1T);
+					TQ = FMA(KP269969613, TP, TM);
+					T1g = FNMS(KP244189809, TM, TP);
+					T1u = FNMS(KP603558818, TM, TP);
+					T1y = FMA(KP667278218, TP, TM);
+					T12 = FMS(KP250000000, Tr, Tk);
+					Ts = Tk + Tr;
+					T11 = FMA(KP618033988, T10, TZ);
+					T1I = FNMS(KP618033988, TZ, T10);
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T2f, T27, T1j, T15, T2K, T2J, T2I, T2T, T1Z, T2X, T1N, T1V, T2W, T2U, T22;
+			 E T1G;
+			 {
+			      E T3a, T3b, T20, T1J, T1C, T1s;
+			      {
+				   E Tj, TC, T1H, T14;
+				   T3a = T9 + Ti;
+				   Tj = T9 - Ti;
+				   TC = Ts - TB;
+				   T3b = Ts + TB;
+				   T1H = FMA(KP559016994, T13, T12);
+				   T14 = FNMS(KP559016994, T13, T12);
+				   Ci[WS(csi, 10)] = KP951056516 * (FMA(KP618033988, Tj, TC));
+				   Ci[WS(csi, 5)] = KP951056516 * (FNMS(KP618033988, TC, Tj));
+				   T20 = FNMS(KP066152395, T1H, T1I);
+				   T1J = FMA(KP059835404, T1I, T1H);
+				   T2f = FMA(KP667278218, T1H, T1I);
+				   T27 = FNMS(KP603558818, T1I, T1H);
+				   T1C = FNMS(KP494780565, T14, T11);
+				   T1s = FMA(KP447533225, T11, T14);
+				   T1j = FNMS(KP522847744, T11, T14);
+				   T15 = FMA(KP578046249, T14, T11);
+			      }
+			      {
+				   E T1A, T1t, T1w, T3c, T3e, T1D, T1x, T3d, T1E, T1F;
+				   T1A = FNMS(KP912575812, T1z, T1y);
+				   T2K = FMA(KP912575812, T1z, T1y);
+				   T2J = FNMS(KP958953096, T1s, T1r);
+				   T1t = FMA(KP958953096, T1s, T1r);
+				   T1w = FMA(KP912575812, T1v, T1u);
+				   T2H = FNMS(KP912575812, T1v, T1u);
+				   T3c = T3a + T3b;
+				   T3e = T3a - T3b;
+				   T2I = FMA(KP867381224, T1C, T1B);
+				   T1D = FNMS(KP867381224, T1C, T1B);
+				   T1x = FNMS(KP894834959, T1w, T1t);
+				   T2T = FMA(KP734762448, T1Y, T1X);
+				   T1Z = FNMS(KP734762448, T1Y, T1X);
+				   T3d = FNMS(KP250000000, T3c, T39);
+				   Cr[0] = T3c + T39;
+				   T1E = FMA(KP447417479, T1w, T1D);
+				   Ci[WS(csi, 4)] = KP951056516 * (FMA(KP992114701, T1x, TJ));
+				   Cr[WS(csr, 10)] = FNMS(KP559016994, T3e, T3d);
+				   Cr[WS(csr, 5)] = FMA(KP559016994, T3e, T3d);
+				   T1F = FMA(KP763932022, T1E, T1t);
+				   T2X = FMA(KP772036680, T1M, T1J);
+				   T1N = FNMS(KP772036680, T1M, T1J);
+				   T1V = FMA(KP734762448, T1U, T1R);
+				   T2W = FNMS(KP734762448, T1U, T1R);
+				   T2U = FNMS(KP772036680, T21, T20);
+				   T22 = FMA(KP772036680, T21, T20);
+				   T1G = FMA(KP999544308, T1F, T1A);
+			      }
+			 }
+			 {
+			      E T1i, T1l, T2l, T2R, T2g, T2Q, T28, T32, T1f, T1n, T1p, T33, T2b;
+			      {
+				   E T24, TY, T1d, T1W, T23, T25, T1m, T1e;
+				   T2w = FMA(KP829049696, T1h, T1g);
+				   T1i = FNMS(KP829049696, T1h, T1g);
+				   T1W = FNMS(KP992114701, T1V, T1O);
+				   T23 = FNMS(KP522616830, T1V, T22);
+				   Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP803003575, T1G, TJ));
+				   T2x = FNMS(KP831864738, T1k, T1j);
+				   T1l = FMA(KP831864738, T1k, T1j);
+				   Ci[WS(csi, 3)] = KP998026728 * (FNMS(KP952936919, T1W, T1N));
+				   T24 = FMA(KP690983005, T23, T1N);
+				   TY = FNMS(KP916574801, TX, TQ);
+				   T2A = FMA(KP916574801, TX, TQ);
+				   T2C = FNMS(KP831864738, T1c, T15);
+				   T1d = FMA(KP831864738, T1c, T15);
+				   T2l = FNMS(KP982009705, T2k, T2j);
+				   T2R = FMA(KP982009705, T2k, T2j);
+				   T25 = FNMS(KP855719849, T24, T1Z);
+				   T2g = FMA(KP845997307, T2f, T2e);
+				   T2Q = FNMS(KP845997307, T2f, T2e);
+				   T1m = FMA(KP904730450, T1d, TY);
+				   T1e = FNMS(KP904730450, T1d, TY);
+				   Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP992114701, T25, T1O)));
+				   T28 = FNMS(KP845997307, T27, T26);
+				   T32 = FMA(KP845997307, T27, T26);
+				   T1f = FNMS(KP242145790, T1e, TJ);
+				   Ci[WS(csi, 1)] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
+				   T1n = FNMS(KP999754674, T1m, T1l);
+				   T1p = FNMS(KP904508497, T1m, T1i);
+				   T33 = FMA(KP921078979, T2a, T29);
+				   T2b = FNMS(KP921078979, T2a, T29);
+			      }
+			      {
+				   E T2P, T2Z, T2V, T2O;
+				   {
+					E T2d, T2n, T2i, T2Y, T2m, T2o;
+					T2P = FNMS(KP559016994, T2u, T2t);
+					T2v = FMA(KP559016994, T2u, T2t);
+					{
+					     E T1o, T1q, T2h, T2c;
+					     T1o = FNMS(KP559154169, T1n, T1i);
+					     T1q = FMA(KP683113946, T1p, T1l);
+					     T2h = FMA(KP906616052, T2b, T28);
+					     T2c = FNMS(KP906616052, T2b, T28);
+					     Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP968583161, T1o, T1f)));
+					     Ci[WS(csi, 11)] = -(KP951056516 * (FMA(KP876306680, T1q, T1f)));
+					     T2d = FMA(KP262346850, T2c, T1O);
+					     Ci[WS(csi, 2)] = -(KP998026728 * (FNMS(KP952936919, T1O, T2c)));
+					     T2n = T2g + T2h;
+					     T2i = FMA(KP618033988, T2h, T2g);
+					}
+					T2m = FMA(KP570584518, T2l, T2i);
+					T2o = FNMS(KP669429328, T2n, T2l);
+					Ci[WS(csi, 12)] = KP951056516 * (FNMS(KP949179823, T2m, T2d));
+					Ci[WS(csi, 7)] = KP951056516 * (FNMS(KP876306680, T2o, T2d));
+					T2V = FMA(KP956723877, T2U, T2T);
+					T2Y = FMA(KP522616830, T2T, T2X);
+					T2Z = FNMS(KP763932022, T2Y, T2U);
+				   }
+				   Cr[WS(csr, 3)] = FMA(KP992114701, T2V, T2P);
+				   {
+					E T30, T34, T2S, T31, T35;
+					T30 = FMA(KP855719849, T2Z, T2W);
+					T34 = FNMS(KP923225144, T2R, T2Q);
+					T2S = FMA(KP923225144, T2R, T2Q);
+					Cr[WS(csr, 8)] = FNMS(KP897376177, T30, T2P);
+					T31 = FNMS(KP237294955, T2S, T2P);
+					Cr[WS(csr, 2)] = FMA(KP949179823, T2S, T2P);
+					T35 = FNMS(KP997675361, T34, T33);
+					{
+					     E T37, T36, T38, T2L;
+					     T37 = FNMS(KP904508497, T34, T32);
+					     T36 = FMA(KP560319534, T35, T32);
+					     T38 = FNMS(KP681693190, T37, T33);
+					     Cr[WS(csr, 12)] = FNMS(KP949179823, T36, T31);
+					     Cr[WS(csr, 7)] = FNMS(KP860541664, T38, T31);
+					     T2O = FNMS(KP809385824, T2K, T2I);
+					     T2L = FNMS(KP447417479, T2K, T2J);
+					     T2M = FNMS(KP690983005, T2L, T2I);
+					}
+				   }
+				   Cr[WS(csr, 4)] = FNMS(KP992114701, T2O, T2v);
+			      }
+			 }
+		    }
+	       }
+	       T2y = FNMS(KP904730450, T2x, T2w);
+	       T2B = FMA(KP904730450, T2x, T2w);
+	       T2N = FNMS(KP999544308, T2M, T2H);
+	       {
+		    E T2z, T2D, T2F, T2E, T2G;
+		    T2z = FNMS(KP242145790, T2y, T2v);
+		    Cr[WS(csr, 1)] = FMA(KP968583161, T2y, T2v);
+		    T2D = FMA(KP904730450, T2C, T2B);
+		    T2F = T2A + T2B;
+		    Cr[WS(csr, 9)] = FNMS(KP803003575, T2N, T2v);
+		    T2E = FNMS(KP618033988, T2D, T2A);
+		    T2G = FMA(KP683113946, T2F, T2C);
+		    Cr[WS(csr, 6)] = FNMS(KP876091699, T2E, T2z);
+		    Cr[WS(csr, 11)] = FNMS(KP792626838, T2G, T2z);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cf_25", {44, 12, 156, 0}, &GENUS };
+
+void X(codelet_r2cf_25) (planner *p) {
+     X(kr2c_register) (p, r2cf_25, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include r2cf.h */
+
+/*
+ * This function contains 200 FP additions, 140 FP multiplications,
+ * (or, 117 additions, 57 multiplications, 83 fused multiply/add),
+ * 101 stack variables, 40 constants, and 50 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP998026728, +0.998026728428271561952336806863450553336905220);
+     DK(KP125581039, +0.125581039058626752152356449131262266244969664);
+     DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
+     DK(KP062790519, +0.062790519529313376076178224565631133122484832);
+     DK(KP809016994, +0.809016994374947424102293417182819058860154590);
+     DK(KP309016994, +0.309016994374947424102293417182819058860154590);
+     DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
+     DK(KP728968627, +0.728968627421411523146730319055259111372571664);
+     DK(KP963507348, +0.963507348203430549974383005744259307057084020);
+     DK(KP876306680, +0.876306680043863587308115903922062583399064238);
+     DK(KP497379774, +0.497379774329709576484567492012895936835134813);
+     DK(KP968583161, +0.968583161128631119490168375464735813836012403);
+     DK(KP684547105, +0.684547105928688673732283357621209269889519233);
+     DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
+     DK(KP481753674, +0.481753674101715274987191502872129653528542010);
+     DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
+     DK(KP248689887, +0.248689887164854788242283746006447968417567406);
+     DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
+     DK(KP992114701, +0.992114701314477831049793042785778521453036709);
+     DK(KP250666467, +0.250666467128608490746237519633017587885836494);
+     DK(KP425779291, +0.425779291565072648862502445744251703979973042);
+     DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
+     DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
+     DK(KP770513242, +0.770513242775789230803009636396177847271667672);
+     DK(KP844327925, +0.844327925502015078548558063966681505381659241);
+     DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
+     DK(KP125333233, +0.125333233564304245373118759816508793942918247);
+     DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
+     DK(KP904827052, +0.904827052466019527713668647932697593970413911);
+     DK(KP851558583, +0.851558583130145297725004891488503407959946084);
+     DK(KP637423989, +0.637423989748689710176712811676016195434917298);
+     DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
+     DK(KP535826794, +0.535826794978996618271308767867639978063575346);
+     DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
+     DK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
+	       E T8, T1j, T1V, T1l, T7, T9, Ta, T12, T2u, T1O, T19, T1P, Ti, T2r, T1K;
+	       E Tp, T1L, Tx, T2q, T1H, TE, T1I, TN, T2t, T1R, TU, T1S, T6, T1k, T3;
+	       E T2s, T2v;
+	       T8 = R0[0];
+	       {
+		    E T4, T5, T1, T2;
+		    T4 = R0[WS(rs, 5)];
+		    T5 = R1[WS(rs, 7)];
+		    T6 = T4 + T5;
+		    T1k = T4 - T5;
+		    T1 = R1[WS(rs, 2)];
+		    T2 = R0[WS(rs, 10)];
+		    T3 = T1 + T2;
+		    T1j = T1 - T2;
+	       }
+	       T1V = KP951056516 * T1k;
+	       T1l = FMA(KP951056516, T1j, KP587785252 * T1k);
+	       T7 = KP559016994 * (T3 - T6);
+	       T9 = T3 + T6;
+	       Ta = FNMS(KP250000000, T9, T8);
+	       {
+		    E T16, T13, T14, TY, T17, T11, T15, T18;
+		    T16 = R1[WS(rs, 1)];
+		    {
+			 E TW, TX, TZ, T10;
+			 TW = R0[WS(rs, 4)];
+			 TX = R1[WS(rs, 11)];
+			 T13 = TW + TX;
+			 TZ = R1[WS(rs, 6)];
+			 T10 = R0[WS(rs, 9)];
+			 T14 = TZ + T10;
+			 TY = TW - TX;
+			 T17 = T13 + T14;
+			 T11 = TZ - T10;
+		    }
+		    T12 = FMA(KP475528258, TY, KP293892626 * T11);
+		    T2u = T16 + T17;
+		    T1O = FNMS(KP293892626, TY, KP475528258 * T11);
+		    T15 = KP559016994 * (T13 - T14);
+		    T18 = FNMS(KP250000000, T17, T16);
+		    T19 = T15 + T18;
+		    T1P = T18 - T15;
+	       }
+	       {
+		    E Tm, Tj, Tk, Te, Tn, Th, Tl, To;
+		    Tm = R1[0];
+		    {
+			 E Tc, Td, Tf, Tg;
+			 Tc = R0[WS(rs, 3)];
+			 Td = R1[WS(rs, 10)];
+			 Tj = Tc + Td;
+			 Tf = R1[WS(rs, 5)];
+			 Tg = R0[WS(rs, 8)];
+			 Tk = Tf + Tg;
+			 Te = Tc - Td;
+			 Tn = Tj + Tk;
+			 Th = Tf - Tg;
+		    }
+		    Ti = FMA(KP475528258, Te, KP293892626 * Th);
+		    T2r = Tm + Tn;
+		    T1K = FNMS(KP293892626, Te, KP475528258 * Th);
+		    Tl = KP559016994 * (Tj - Tk);
+		    To = FNMS(KP250000000, Tn, Tm);
+		    Tp = Tl + To;
+		    T1L = To - Tl;
+	       }
+	       {
+		    E TB, Ty, Tz, Tt, TC, Tw, TA, TD;
+		    TB = R0[WS(rs, 2)];
+		    {
+			 E Tr, Ts, Tu, Tv;
+			 Tr = R1[WS(rs, 4)];
+			 Ts = R0[WS(rs, 12)];
+			 Ty = Tr + Ts;
+			 Tu = R0[WS(rs, 7)];
+			 Tv = R1[WS(rs, 9)];
+			 Tz = Tu + Tv;
+			 Tt = Tr - Ts;
+			 TC = Ty + Tz;
+			 Tw = Tu - Tv;
+		    }
+		    Tx = FMA(KP475528258, Tt, KP293892626 * Tw);
+		    T2q = TB + TC;
+		    T1H = FNMS(KP293892626, Tt, KP475528258 * Tw);
+		    TA = KP559016994 * (Ty - Tz);
+		    TD = FNMS(KP250000000, TC, TB);
+		    TE = TA + TD;
+		    T1I = TD - TA;
+	       }
+	       {
+		    E TR, TO, TP, TJ, TS, TM, TQ, TT;
+		    TR = R0[WS(rs, 1)];
+		    {
+			 E TH, TI, TK, TL;
+			 TH = R1[WS(rs, 3)];
+			 TI = R0[WS(rs, 11)];
+			 TO = TH + TI;
+			 TK = R0[WS(rs, 6)];
+			 TL = R1[WS(rs, 8)];
+			 TP = TK + TL;
+			 TJ = TH - TI;
+			 TS = TO + TP;
+			 TM = TK - TL;
+		    }
+		    TN = FMA(KP475528258, TJ, KP293892626 * TM);
+		    T2t = TR + TS;
+		    T1R = FNMS(KP293892626, TJ, KP475528258 * TM);
+		    TQ = KP559016994 * (TO - TP);
+		    TT = FNMS(KP250000000, TS, TR);
+		    TU = TQ + TT;
+		    T1S = TT - TQ;
+	       }
+	       T2s = T2q - T2r;
+	       T2v = T2t - T2u;
+	       Ci[WS(csi, 5)] = FNMS(KP587785252, T2v, KP951056516 * T2s);
+	       Ci[WS(csi, 10)] = FMA(KP587785252, T2s, KP951056516 * T2v);
+	       {
+		    E T2z, T2y, T2A, T2w, T2x, T2B;
+		    T2z = T8 + T9;
+		    T2w = T2r + T2q;
+		    T2x = T2t + T2u;
+		    T2y = KP559016994 * (T2w - T2x);
+		    T2A = T2w + T2x;
+		    Cr[0] = T2z + T2A;
+		    T2B = FNMS(KP250000000, T2A, T2z);
+		    Cr[WS(csr, 5)] = T2y + T2B;
+		    Cr[WS(csr, 10)] = T2B - T2y;
+	       }
+	       {
+		    E Tb, Tq, TF, TG, T1E, T1F, T1G, T1B, T1C, T1D, TV, T1a, T1b, T1o, T1r;
+		    E T1s, T1z, T1x, T1e, T1h, T1i, T1u, T1t;
+		    Tb = T7 + Ta;
+		    Tq = FMA(KP1_688655851, Ti, KP535826794 * Tp);
+		    TF = FMA(KP1_541026485, Tx, KP637423989 * TE);
+		    TG = Tq - TF;
+		    T1E = FMA(KP851558583, TN, KP904827052 * TU);
+		    T1F = FMA(KP1_984229402, T12, KP125333233 * T19);
+		    T1G = T1E + T1F;
+		    T1B = FNMS(KP844327925, Tp, KP1_071653589 * Ti);
+		    T1C = FNMS(KP1_274847979, Tx, KP770513242 * TE);
+		    T1D = T1B + T1C;
+		    TV = FNMS(KP425779291, TU, KP1_809654104 * TN);
+		    T1a = FNMS(KP992114701, T19, KP250666467 * T12);
+		    T1b = TV + T1a;
+		    {
+			 E T1m, T1n, T1p, T1q;
+			 T1m = FMA(KP1_937166322, Ti, KP248689887 * Tp);
+			 T1n = FMA(KP1_071653589, Tx, KP844327925 * TE);
+			 T1o = T1m + T1n;
+			 T1p = FMA(KP1_752613360, TN, KP481753674 * TU);
+			 T1q = FMA(KP1_457937254, T12, KP684547105 * T19);
+			 T1r = T1p + T1q;
+			 T1s = T1o + T1r;
+			 T1z = T1q - T1p;
+			 T1x = T1n - T1m;
+		    }
+		    {
+			 E T1c, T1d, T1f, T1g;
+			 T1c = FNMS(KP497379774, Ti, KP968583161 * Tp);
+			 T1d = FNMS(KP1_688655851, Tx, KP535826794 * TE);
+			 T1e = T1c + T1d;
+			 T1f = FNMS(KP963507348, TN, KP876306680 * TU);
+			 T1g = FNMS(KP1_369094211, T12, KP728968627 * T19);
+			 T1h = T1f + T1g;
+			 T1i = T1e + T1h;
+			 T1u = T1f - T1g;
+			 T1t = T1d - T1c;
+		    }
+		    Cr[WS(csr, 1)] = Tb + T1i;
+		    Ci[WS(csi, 1)] = -(T1l + T1s);
+		    Cr[WS(csr, 4)] = Tb + TG + T1b;
+		    Ci[WS(csi, 4)] = T1l + T1D - T1G;
+		    Ci[WS(csi, 9)] = FMA(KP309016994, T1D, T1l) + FMA(KP587785252, T1a - TV, KP809016994 * T1G) - (KP951056516 * (Tq + TF));
+		    Cr[WS(csr, 9)] = FMA(KP309016994, TG, Tb) + FMA(KP951056516, T1B - T1C, KP587785252 * (T1F - T1E)) - (KP809016994 * T1b);
+		    {
+			 E T1v, T1w, T1y, T1A;
+			 T1v = FMS(KP250000000, T1s, T1l);
+			 T1w = KP559016994 * (T1r - T1o);
+			 Ci[WS(csi, 11)] = FMA(KP587785252, T1t, KP951056516 * T1u) + T1v - T1w;
+			 Ci[WS(csi, 6)] = FMA(KP951056516, T1t, T1v) + FNMS(KP587785252, T1u, T1w);
+			 T1y = FNMS(KP250000000, T1i, Tb);
+			 T1A = KP559016994 * (T1e - T1h);
+			 Cr[WS(csr, 11)] = FMA(KP587785252, T1x, T1y) + FNMA(KP951056516, T1z, T1A);
+			 Cr[WS(csr, 6)] = FMA(KP951056516, T1x, T1A) + FMA(KP587785252, T1z, T1y);
+		    }
+	       }
+	       {
+		    E T1W, T1X, T1J, T1M, T1N, T21, T22, T23, T1Q, T1T, T1U, T1Y, T1Z, T20, T26;
+		    E T29, T2a, T2k, T2j, T2l, T2m, T2d, T2o, T2i;
+		    T1W = FNMS(KP587785252, T1j, T1V);
+		    T1X = Ta - T7;
+		    T1J = FNMS(KP125333233, T1I, KP1_984229402 * T1H);
+		    T1M = FMA(KP1_457937254, T1K, KP684547105 * T1L);
+		    T1N = T1J - T1M;
+		    T21 = FNMS(KP1_996053456, T1R, KP062790519 * T1S);
+		    T22 = FMA(KP1_541026485, T1O, KP637423989 * T1P);
+		    T23 = T21 - T22;
+		    T1Q = FNMS(KP770513242, T1P, KP1_274847979 * T1O);
+		    T1T = FMA(KP125581039, T1R, KP998026728 * T1S);
+		    T1U = T1Q - T1T;
+		    T1Y = FNMS(KP1_369094211, T1K, KP728968627 * T1L);
+		    T1Z = FMA(KP250666467, T1H, KP992114701 * T1I);
+		    T20 = T1Y - T1Z;
+		    {
+			 E T24, T25, T27, T28;
+			 T24 = FNMS(KP481753674, T1L, KP1_752613360 * T1K);
+			 T25 = FMA(KP851558583, T1H, KP904827052 * T1I);
+			 T26 = T24 - T25;
+			 T27 = FNMS(KP844327925, T1S, KP1_071653589 * T1R);
+			 T28 = FNMS(KP998026728, T1P, KP125581039 * T1O);
+			 T29 = T27 + T28;
+			 T2a = T26 + T29;
+			 T2k = T27 - T28;
+			 T2j = T24 + T25;
+		    }
+		    {
+			 E T2b, T2c, T2g, T2h;
+			 T2b = FNMS(KP425779291, T1I, KP1_809654104 * T1H);
+			 T2c = FMA(KP963507348, T1K, KP876306680 * T1L);
+			 T2l = T2c + T2b;
+			 T2g = FMA(KP1_688655851, T1R, KP535826794 * T1S);
+			 T2h = FMA(KP1_996053456, T1O, KP062790519 * T1P);
+			 T2m = T2g + T2h;
+			 T2d = T2b - T2c;
+			 T2o = T2l + T2m;
+			 T2i = T2g - T2h;
+		    }
+		    Ci[WS(csi, 2)] = T1W + T2a;
+		    Cr[WS(csr, 2)] = T1X + T2o;
+		    Ci[WS(csi, 3)] = T1N + T1U - T1W;
+		    Cr[WS(csr, 3)] = T1X + T20 + T23;
+		    Cr[WS(csr, 8)] = FMA(KP309016994, T20, T1X) + FNMA(KP809016994, T23, KP587785252 * (T1T + T1Q)) - (KP951056516 * (T1M + T1J));
+		    Ci[WS(csi, 8)] = FNMS(KP587785252, T21 + T22, KP309016994 * T1N) + FNMA(KP809016994, T1U, KP951056516 * (T1Y + T1Z)) - T1W;
+		    {
+			 E T2e, T2f, T2n, T2p;
+			 T2e = KP559016994 * (T26 - T29);
+			 T2f = FNMS(KP250000000, T2a, T1W);
+			 Ci[WS(csi, 7)] = FMA(KP951056516, T2d, T2e) + FNMS(KP587785252, T2i, T2f);
+			 Ci[WS(csi, 12)] = FMA(KP587785252, T2d, T2f) + FMS(KP951056516, T2i, T2e);
+			 T2n = KP559016994 * (T2l - T2m);
+			 T2p = FNMS(KP250000000, T2o, T1X);
+			 Cr[WS(csr, 7)] = FMA(KP951056516, T2j, KP587785252 * T2k) + T2n + T2p;
+			 Cr[WS(csr, 12)] = FMA(KP587785252, T2j, T2p) + FNMA(KP951056516, T2k, T2n);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 25, "r2cf_25", {117, 57, 83, 0}, &GENUS };
+
+void X(codelet_r2cf_25) (planner *p) {
+     X(kr2c_register) (p, r2cf_25, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_3.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include r2cf.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T1, T2, T3, T4;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       T3 = R0[WS(rs, 1)];
+	       Ci[WS(csi, 1)] = KP866025403 * (T3 - T2);
+	       T4 = T2 + T3;
+	       Cr[0] = T1 + T4;
+	       Cr[WS(csr, 1)] = FNMS(KP500000000, T4, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cf_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cf_3) (planner *p) {
+     X(kr2c_register) (p, r2cf_3, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include r2cf.h */
+
+/*
+ * This function contains 4 FP additions, 2 FP multiplications,
+ * (or, 3 additions, 1 multiplications, 1 fused multiply/add),
+ * 7 stack variables, 2 constants, and 6 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
+	       E T1, T2, T3, T4;
+	       T1 = R0[0];
+	       T2 = R1[0];
+	       T3 = R0[WS(rs, 1)];
+	       T4 = T2 + T3;
+	       Cr[WS(csr, 1)] = FNMS(KP500000000, T4, T1);
+	       Ci[WS(csi, 1)] = KP866025403 * (T3 - T2);
+	       Cr[0] = T1 + T4;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 3, "r2cf_3", {3, 1, 1, 0}, &GENUS };
+
+void X(codelet_r2cf_3) (planner *p) {
+     X(kr2c_register) (p, r2cf_3, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:47 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include r2cf.h */
+
+/*
+ * This function contains 156 FP additions, 68 FP multiplications,
+ * (or, 88 additions, 0 multiplications, 68 fused multiply/add),
+ * 89 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T1x, T1M, T1I, T1E, T1J, T1H;
+	       {
+		    E Tv, T1h, T7, T2b, Te, T2n, Ty, T1i, T1l, TF, T2d, Tt, T1k, TC, T2c;
+		    E Tm, T2j, T1Z, T2k, T22, TK, T1B, T19, T1C, T1e, TO, TV, T1T, TN, TP;
+		    E T2g, T1S;
+		    {
+			 E TD, Tp, Tq, Tr;
+			 {
+			      E T1, T2, T4, T5;
+			      T1 = R0[0];
+			      T2 = R0[WS(rs, 8)];
+			      T4 = R0[WS(rs, 4)];
+			      T5 = R0[WS(rs, 12)];
+			      {
+				   E Ta, Tw, Tx, Td, Tn, To;
+				   {
+					E T8, T3, T6, T9, Tb, Tc;
+					T8 = R0[WS(rs, 2)];
+					Tv = T1 - T2;
+					T3 = T1 + T2;
+					T1h = T4 - T5;
+					T6 = T4 + T5;
+					T9 = R0[WS(rs, 10)];
+					Tb = R0[WS(rs, 14)];
+					Tc = R0[WS(rs, 6)];
+					T7 = T3 + T6;
+					T2b = T3 - T6;
+					Ta = T8 + T9;
+					Tw = T8 - T9;
+					Tx = Tb - Tc;
+					Td = Tb + Tc;
+				   }
+				   Tn = R0[WS(rs, 15)];
+				   To = R0[WS(rs, 7)];
+				   Te = Ta + Td;
+				   T2n = Td - Ta;
+				   Ty = Tw + Tx;
+				   T1i = Tx - Tw;
+				   TD = Tn - To;
+				   Tp = Tn + To;
+				   Tq = R0[WS(rs, 3)];
+				   Tr = R0[WS(rs, 11)];
+			      }
+			 }
+			 {
+			      E Tj, TA, Ti, Tk;
+			      {
+				   E Tg, Th, TE, Ts;
+				   Tg = R0[WS(rs, 1)];
+				   Th = R0[WS(rs, 9)];
+				   Tj = R0[WS(rs, 5)];
+				   TE = Tq - Tr;
+				   Ts = Tq + Tr;
+				   TA = Tg - Th;
+				   Ti = Tg + Th;
+				   T1l = FNMS(KP414213562, TD, TE);
+				   TF = FMA(KP414213562, TE, TD);
+				   T2d = Tp - Ts;
+				   Tt = Tp + Ts;
+				   Tk = R0[WS(rs, 13)];
+			      }
+			      {
+				   E T11, T15, T1c, T20, T14, T16, T1X, T1Y, T1Q, T1R;
+				   {
+					E T1a, T1b, T12, T13;
+					{
+					     E TZ, T10, TB, Tl;
+					     TZ = R1[WS(rs, 15)];
+					     T10 = R1[WS(rs, 7)];
+					     T1a = R1[WS(rs, 11)];
+					     TB = Tj - Tk;
+					     Tl = Tj + Tk;
+					     T1X = TZ + T10;
+					     T11 = TZ - T10;
+					     T1k = FMA(KP414213562, TA, TB);
+					     TC = FNMS(KP414213562, TB, TA);
+					     T2c = Ti - Tl;
+					     Tm = Ti + Tl;
+					     T1b = R1[WS(rs, 3)];
+					}
+					T12 = R1[WS(rs, 1)];
+					T13 = R1[WS(rs, 9)];
+					T15 = R1[WS(rs, 13)];
+					T1Y = T1b + T1a;
+					T1c = T1a - T1b;
+					T20 = T12 + T13;
+					T14 = T12 - T13;
+					T16 = R1[WS(rs, 5)];
+				   }
+				   T2j = T1X - T1Y;
+				   T1Z = T1X + T1Y;
+				   {
+					E TT, TU, TL, TM;
+					{
+					     E TI, T21, T17, TJ, T18, T1d;
+					     TI = R1[0];
+					     T21 = T15 + T16;
+					     T17 = T15 - T16;
+					     TJ = R1[WS(rs, 8)];
+					     TT = R1[WS(rs, 4)];
+					     T2k = T21 - T20;
+					     T22 = T20 + T21;
+					     T18 = T14 + T17;
+					     T1d = T17 - T14;
+					     T1Q = TI + TJ;
+					     TK = TI - TJ;
+					     T1B = FNMS(KP707106781, T18, T11);
+					     T19 = FMA(KP707106781, T18, T11);
+					     T1C = FNMS(KP707106781, T1d, T1c);
+					     T1e = FMA(KP707106781, T1d, T1c);
+					     TU = R1[WS(rs, 12)];
+					}
+					TL = R1[WS(rs, 2)];
+					TM = R1[WS(rs, 10)];
+					TO = R1[WS(rs, 14)];
+					T1R = TT + TU;
+					TV = TT - TU;
+					T1T = TL + TM;
+					TN = TL - TM;
+					TP = R1[WS(rs, 6)];
+				   }
+				   T2g = T1Q - T1R;
+				   T1S = T1Q + T1R;
+			      }
+			 }
+		    }
+		    {
+			 E T1P, T25, T23, T2h, T1W, T1y, TS, T1z, TX, T27, T2a;
+			 {
+			      E Tf, Tu, T29, T28;
+			      {
+				   E T1U, TQ, T1V, TR, TW;
+				   T1P = T7 - Te;
+				   Tf = T7 + Te;
+				   T1U = TO + TP;
+				   TQ = TO - TP;
+				   Tu = Tm + Tt;
+				   T25 = Tt - Tm;
+				   T23 = T1Z - T22;
+				   T29 = T1Z + T22;
+				   T2h = T1U - T1T;
+				   T1V = T1T + T1U;
+				   TR = TN + TQ;
+				   TW = TN - TQ;
+				   T27 = Tf + Tu;
+				   T1W = T1S - T1V;
+				   T28 = T1S + T1V;
+				   T1y = FNMS(KP707106781, TR, TK);
+				   TS = FMA(KP707106781, TR, TK);
+				   T1z = FNMS(KP707106781, TW, TV);
+				   TX = FMA(KP707106781, TW, TV);
+				   T2a = T28 + T29;
+			      }
+			      Cr[WS(csr, 8)] = Tf - Tu;
+			      Ci[WS(csi, 8)] = T29 - T28;
+			 }
+			 Cr[0] = T27 + T2a;
+			 Cr[WS(csr, 16)] = T27 - T2a;
+			 {
+			      E T2s, T2i, T2v, T2f, T2r, T2p, T2l, T2t;
+			      {
+				   E T2o, T2e, T26, T24;
+				   T2o = T2d - T2c;
+				   T2e = T2c + T2d;
+				   T2s = FNMS(KP414213562, T2g, T2h);
+				   T2i = FMA(KP414213562, T2h, T2g);
+				   T26 = T23 - T1W;
+				   T24 = T1W + T23;
+				   T2v = FNMS(KP707106781, T2e, T2b);
+				   T2f = FMA(KP707106781, T2e, T2b);
+				   T2r = FMA(KP707106781, T2o, T2n);
+				   T2p = FNMS(KP707106781, T2o, T2n);
+				   Ci[WS(csi, 4)] = FMA(KP707106781, T26, T25);
+				   Ci[WS(csi, 12)] = FMS(KP707106781, T26, T25);
+				   Cr[WS(csr, 4)] = FMA(KP707106781, T24, T1P);
+				   Cr[WS(csr, 12)] = FNMS(KP707106781, T24, T1P);
+				   T2l = FNMS(KP414213562, T2k, T2j);
+				   T2t = FMA(KP414213562, T2j, T2k);
+			      }
+			      {
+				   E T1v, T1G, TH, T1s, T1F, T1w, T1o, T1g, T1p, T1n;
+				   {
+					E T1f, TY, T1t, T1u, T1j, T1m;
+					{
+					     E Tz, TG, T1q, T1r;
+					     T1v = FNMS(KP707106781, Ty, Tv);
+					     Tz = FMA(KP707106781, Ty, Tv);
+					     {
+						  E T2q, T2m, T2w, T2u;
+						  T2q = T2l - T2i;
+						  T2m = T2i + T2l;
+						  T2w = T2t - T2s;
+						  T2u = T2s + T2t;
+						  Ci[WS(csi, 10)] = FMA(KP923879532, T2q, T2p);
+						  Ci[WS(csi, 6)] = FMS(KP923879532, T2q, T2p);
+						  Cr[WS(csr, 2)] = FMA(KP923879532, T2m, T2f);
+						  Cr[WS(csr, 14)] = FNMS(KP923879532, T2m, T2f);
+						  Cr[WS(csr, 10)] = FNMS(KP923879532, T2w, T2v);
+						  Cr[WS(csr, 6)] = FMA(KP923879532, T2w, T2v);
+						  Ci[WS(csi, 2)] = FMA(KP923879532, T2u, T2r);
+						  Ci[WS(csi, 14)] = FMS(KP923879532, T2u, T2r);
+						  TG = TC + TF;
+						  T1G = TF - TC;
+					     }
+					     T1f = FNMS(KP198912367, T1e, T19);
+					     T1q = FMA(KP198912367, T19, T1e);
+					     T1r = FMA(KP198912367, TS, TX);
+					     TY = FNMS(KP198912367, TX, TS);
+					     T1t = FNMS(KP923879532, TG, Tz);
+					     TH = FMA(KP923879532, TG, Tz);
+					     T1u = T1r + T1q;
+					     T1s = T1q - T1r;
+					     T1F = FMA(KP707106781, T1i, T1h);
+					     T1j = FNMS(KP707106781, T1i, T1h);
+					     T1m = T1k + T1l;
+					     T1w = T1k - T1l;
+					}
+					Cr[WS(csr, 7)] = FMA(KP980785280, T1u, T1t);
+					T1o = T1f - TY;
+					T1g = TY + T1f;
+					T1p = FMA(KP923879532, T1m, T1j);
+					T1n = FNMS(KP923879532, T1m, T1j);
+					Cr[WS(csr, 9)] = FNMS(KP980785280, T1u, T1t);
+				   }
+				   Cr[WS(csr, 1)] = FMA(KP980785280, T1g, TH);
+				   Cr[WS(csr, 15)] = FNMS(KP980785280, T1g, TH);
+				   Ci[WS(csi, 1)] = FMS(KP980785280, T1s, T1p);
+				   Ci[WS(csi, 15)] = FMA(KP980785280, T1s, T1p);
+				   Ci[WS(csi, 9)] = FMS(KP980785280, T1o, T1n);
+				   Ci[WS(csi, 7)] = FMA(KP980785280, T1o, T1n);
+				   {
+					E T1A, T1D, T1N, T1O, T1K, T1L;
+					T1A = FMA(KP668178637, T1z, T1y);
+					T1K = FNMS(KP668178637, T1y, T1z);
+					T1L = FNMS(KP668178637, T1B, T1C);
+					T1D = FMA(KP668178637, T1C, T1B);
+					T1N = FNMS(KP923879532, T1w, T1v);
+					T1x = FMA(KP923879532, T1w, T1v);
+					T1O = T1K + T1L;
+					T1M = T1K - T1L;
+					Cr[WS(csr, 5)] = FNMS(KP831469612, T1O, T1N);
+					T1I = T1D - T1A;
+					T1E = T1A + T1D;
+					T1J = FMA(KP923879532, T1G, T1F);
+					T1H = FNMS(KP923879532, T1G, T1F);
+					Cr[WS(csr, 11)] = FMA(KP831469612, T1O, T1N);
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Ci[WS(csi, 3)] = FMA(KP831469612, T1M, T1J);
+	       Cr[WS(csr, 3)] = FMA(KP831469612, T1E, T1x);
+	       Ci[WS(csi, 13)] = FMS(KP831469612, T1M, T1J);
+	       Cr[WS(csr, 13)] = FNMS(KP831469612, T1E, T1x);
+	       Ci[WS(csi, 11)] = FMA(KP831469612, T1I, T1H);
+	       Ci[WS(csi, 5)] = FMS(KP831469612, T1I, T1H);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cf_32", {88, 0, 68, 0}, &GENUS };
+
+void X(codelet_r2cf_32) (planner *p) {
+     X(kr2c_register) (p, r2cf_32, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include r2cf.h */
+
+/*
+ * This function contains 156 FP additions, 42 FP multiplications,
+ * (or, 140 additions, 26 multiplications, 16 fused multiply/add),
+ * 54 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
+	       E T7, T2b, Tv, T1l, Te, T2o, Ty, T1k, Tt, T2d, TF, T1h, Tm, T2c, TC;
+	       E T1i, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
+	       E TS, T1y;
+	       {
+		    E T1, T2, T3, T4, T5, T6;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 8)];
+		    T3 = T1 + T2;
+		    T4 = R0[WS(rs, 4)];
+		    T5 = R0[WS(rs, 12)];
+		    T6 = T4 + T5;
+		    T7 = T3 + T6;
+		    T2b = T3 - T6;
+		    Tv = T1 - T2;
+		    T1l = T4 - T5;
+	       }
+	       {
+		    E Ta, Tw, Td, Tx;
+		    {
+			 E T8, T9, Tb, Tc;
+			 T8 = R0[WS(rs, 2)];
+			 T9 = R0[WS(rs, 10)];
+			 Ta = T8 + T9;
+			 Tw = T8 - T9;
+			 Tb = R0[WS(rs, 14)];
+			 Tc = R0[WS(rs, 6)];
+			 Td = Tb + Tc;
+			 Tx = Tb - Tc;
+		    }
+		    Te = Ta + Td;
+		    T2o = Td - Ta;
+		    Ty = KP707106781 * (Tw + Tx);
+		    T1k = KP707106781 * (Tx - Tw);
+	       }
+	       {
+		    E Tp, TD, Ts, TE;
+		    {
+			 E Tn, To, Tq, Tr;
+			 Tn = R0[WS(rs, 15)];
+			 To = R0[WS(rs, 7)];
+			 Tp = Tn + To;
+			 TD = Tn - To;
+			 Tq = R0[WS(rs, 3)];
+			 Tr = R0[WS(rs, 11)];
+			 Ts = Tq + Tr;
+			 TE = Tq - Tr;
+		    }
+		    Tt = Tp + Ts;
+		    T2d = Tp - Ts;
+		    TF = FMA(KP923879532, TD, KP382683432 * TE);
+		    T1h = FNMS(KP923879532, TE, KP382683432 * TD);
+	       }
+	       {
+		    E Ti, TA, Tl, TB;
+		    {
+			 E Tg, Th, Tj, Tk;
+			 Tg = R0[WS(rs, 1)];
+			 Th = R0[WS(rs, 9)];
+			 Ti = Tg + Th;
+			 TA = Tg - Th;
+			 Tj = R0[WS(rs, 5)];
+			 Tk = R0[WS(rs, 13)];
+			 Tl = Tj + Tk;
+			 TB = Tj - Tk;
+		    }
+		    Tm = Ti + Tl;
+		    T2c = Ti - Tl;
+		    TC = FNMS(KP382683432, TB, KP923879532 * TA);
+		    T1i = FMA(KP382683432, TA, KP923879532 * TB);
+	       }
+	       {
+		    E T11, T1X, T1d, T1Y, T14, T20, T17, T21, T1a, T18;
+		    {
+			 E TZ, T10, T1b, T1c;
+			 TZ = R1[WS(rs, 15)];
+			 T10 = R1[WS(rs, 7)];
+			 T11 = TZ - T10;
+			 T1X = TZ + T10;
+			 T1b = R1[WS(rs, 3)];
+			 T1c = R1[WS(rs, 11)];
+			 T1d = T1b - T1c;
+			 T1Y = T1b + T1c;
+		    }
+		    {
+			 E T12, T13, T15, T16;
+			 T12 = R1[WS(rs, 1)];
+			 T13 = R1[WS(rs, 9)];
+			 T14 = T12 - T13;
+			 T20 = T12 + T13;
+			 T15 = R1[WS(rs, 13)];
+			 T16 = R1[WS(rs, 5)];
+			 T17 = T15 - T16;
+			 T21 = T15 + T16;
+		    }
+		    T1Z = T1X + T1Y;
+		    T22 = T20 + T21;
+		    T2k = T21 - T20;
+		    T2j = T1X - T1Y;
+		    T1a = KP707106781 * (T17 - T14);
+		    T1e = T1a - T1d;
+		    T1C = T1d + T1a;
+		    T18 = KP707106781 * (T14 + T17);
+		    T19 = T11 + T18;
+		    T1B = T11 - T18;
+	       }
+	       {
+		    E TK, T1Q, TW, T1R, TN, T1T, TQ, T1U, TT, TR;
+		    {
+			 E TI, TJ, TU, TV;
+			 TI = R1[0];
+			 TJ = R1[WS(rs, 8)];
+			 TK = TI - TJ;
+			 T1Q = TI + TJ;
+			 TU = R1[WS(rs, 4)];
+			 TV = R1[WS(rs, 12)];
+			 TW = TU - TV;
+			 T1R = TU + TV;
+		    }
+		    {
+			 E TL, TM, TO, TP;
+			 TL = R1[WS(rs, 2)];
+			 TM = R1[WS(rs, 10)];
+			 TN = TL - TM;
+			 T1T = TL + TM;
+			 TO = R1[WS(rs, 14)];
+			 TP = R1[WS(rs, 6)];
+			 TQ = TO - TP;
+			 T1U = TO + TP;
+		    }
+		    T1S = T1Q + T1R;
+		    T1V = T1T + T1U;
+		    T2h = T1U - T1T;
+		    T2g = T1Q - T1R;
+		    TT = KP707106781 * (TQ - TN);
+		    TX = TT - TW;
+		    T1z = TW + TT;
+		    TR = KP707106781 * (TN + TQ);
+		    TS = TK + TR;
+		    T1y = TK - TR;
+	       }
+	       {
+		    E Tf, Tu, T27, T28, T29, T2a;
+		    Tf = T7 + Te;
+		    Tu = Tm + Tt;
+		    T27 = Tf + Tu;
+		    T28 = T1S + T1V;
+		    T29 = T1Z + T22;
+		    T2a = T28 + T29;
+		    Cr[WS(csr, 8)] = Tf - Tu;
+		    Ci[WS(csi, 8)] = T29 - T28;
+		    Cr[WS(csr, 16)] = T27 - T2a;
+		    Cr[0] = T27 + T2a;
+	       }
+	       {
+		    E T1P, T25, T24, T26, T1W, T23;
+		    T1P = T7 - Te;
+		    T25 = Tt - Tm;
+		    T1W = T1S - T1V;
+		    T23 = T1Z - T22;
+		    T24 = KP707106781 * (T1W + T23);
+		    T26 = KP707106781 * (T23 - T1W);
+		    Cr[WS(csr, 12)] = T1P - T24;
+		    Ci[WS(csi, 12)] = T26 - T25;
+		    Cr[WS(csr, 4)] = T1P + T24;
+		    Ci[WS(csi, 4)] = T25 + T26;
+	       }
+	       {
+		    E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2n;
+		    T2e = KP707106781 * (T2c + T2d);
+		    T2f = T2b + T2e;
+		    T2v = T2b - T2e;
+		    T2n = KP707106781 * (T2d - T2c);
+		    T2p = T2n - T2o;
+		    T2r = T2o + T2n;
+		    {
+			 E T2i, T2l, T2s, T2t;
+			 T2i = FMA(KP923879532, T2g, KP382683432 * T2h);
+			 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
+			 T2m = T2i + T2l;
+			 T2q = T2l - T2i;
+			 T2s = FNMS(KP382683432, T2g, KP923879532 * T2h);
+			 T2t = FMA(KP382683432, T2j, KP923879532 * T2k);
+			 T2u = T2s + T2t;
+			 T2w = T2t - T2s;
+		    }
+		    Cr[WS(csr, 14)] = T2f - T2m;
+		    Ci[WS(csi, 14)] = T2u - T2r;
+		    Cr[WS(csr, 2)] = T2f + T2m;
+		    Ci[WS(csi, 2)] = T2r + T2u;
+		    Ci[WS(csi, 6)] = T2p + T2q;
+		    Cr[WS(csr, 6)] = T2v + T2w;
+		    Ci[WS(csi, 10)] = T2q - T2p;
+		    Cr[WS(csr, 10)] = T2v - T2w;
+	       }
+	       {
+		    E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
+		    {
+			 E Tz, TG, T1q, T1r;
+			 Tz = Tv + Ty;
+			 TG = TC + TF;
+			 TH = Tz + TG;
+			 T1t = Tz - TG;
+			 T1q = FNMS(KP195090322, TS, KP980785280 * TX);
+			 T1r = FMA(KP195090322, T19, KP980785280 * T1e);
+			 T1s = T1q + T1r;
+			 T1u = T1r - T1q;
+		    }
+		    {
+			 E TY, T1f, T1j, T1m;
+			 TY = FMA(KP980785280, TS, KP195090322 * TX);
+			 T1f = FNMS(KP195090322, T1e, KP980785280 * T19);
+			 T1g = TY + T1f;
+			 T1o = T1f - TY;
+			 T1j = T1h - T1i;
+			 T1m = T1k - T1l;
+			 T1n = T1j - T1m;
+			 T1p = T1m + T1j;
+		    }
+		    Cr[WS(csr, 15)] = TH - T1g;
+		    Ci[WS(csi, 15)] = T1s - T1p;
+		    Cr[WS(csr, 1)] = TH + T1g;
+		    Ci[WS(csi, 1)] = T1p + T1s;
+		    Ci[WS(csi, 7)] = T1n + T1o;
+		    Cr[WS(csr, 7)] = T1t + T1u;
+		    Ci[WS(csi, 9)] = T1o - T1n;
+		    Cr[WS(csr, 9)] = T1t - T1u;
+	       }
+	       {
+		    E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
+		    {
+			 E T1v, T1w, T1K, T1L;
+			 T1v = Tv - Ty;
+			 T1w = T1i + T1h;
+			 T1x = T1v + T1w;
+			 T1N = T1v - T1w;
+			 T1K = FNMS(KP555570233, T1y, KP831469612 * T1z);
+			 T1L = FMA(KP555570233, T1B, KP831469612 * T1C);
+			 T1M = T1K + T1L;
+			 T1O = T1L - T1K;
+		    }
+		    {
+			 E T1A, T1D, T1F, T1G;
+			 T1A = FMA(KP831469612, T1y, KP555570233 * T1z);
+			 T1D = FNMS(KP555570233, T1C, KP831469612 * T1B);
+			 T1E = T1A + T1D;
+			 T1I = T1D - T1A;
+			 T1F = TF - TC;
+			 T1G = T1l + T1k;
+			 T1H = T1F - T1G;
+			 T1J = T1G + T1F;
+		    }
+		    Cr[WS(csr, 13)] = T1x - T1E;
+		    Ci[WS(csi, 13)] = T1M - T1J;
+		    Cr[WS(csr, 3)] = T1x + T1E;
+		    Ci[WS(csi, 3)] = T1J + T1M;
+		    Ci[WS(csi, 5)] = T1H + T1I;
+		    Cr[WS(csr, 5)] = T1N + T1O;
+		    Ci[WS(csi, 11)] = T1I - T1H;
+		    Cr[WS(csr, 11)] = T1N - T1O;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 32, "r2cf_32", {140, 26, 16, 0}, &GENUS };
+
+void X(codelet_r2cf_32) (planner *p) {
+     X(kr2c_register) (p, r2cf_32, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include r2cf.h */
+
+/*
+ * This function contains 6 FP additions, 0 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T2, T4, T5, T3, T6;
+	       T1 = R0[0];
+	       T2 = R0[WS(rs, 1)];
+	       T4 = R1[0];
+	       T5 = R1[WS(rs, 1)];
+	       Cr[WS(csr, 1)] = T1 - T2;
+	       T3 = T1 + T2;
+	       Ci[WS(csi, 1)] = T5 - T4;
+	       T6 = T4 + T5;
+	       Cr[0] = T3 + T6;
+	       Cr[WS(csr, 2)] = T3 - T6;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cf_4", {6, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cf_4) (planner *p) {
+     X(kr2c_register) (p, r2cf_4, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include r2cf.h */
+
+/*
+ * This function contains 6 FP additions, 0 FP multiplications,
+ * (or, 6 additions, 0 multiplications, 0 fused multiply/add),
+ * 7 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
+	       E T1, T2, T3, T4, T5, T6;
+	       T1 = R0[0];
+	       T2 = R0[WS(rs, 1)];
+	       T3 = T1 + T2;
+	       T4 = R1[0];
+	       T5 = R1[WS(rs, 1)];
+	       T6 = T4 + T5;
+	       Cr[WS(csr, 1)] = T1 - T2;
+	       Ci[WS(csi, 1)] = T5 - T4;
+	       Cr[WS(csr, 2)] = T3 - T6;
+	       Cr[0] = T3 + T6;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 4, "r2cf_4", {6, 0, 0, 0}, &GENUS };
+
+void X(codelet_r2cf_4) (planner *p) {
+     X(kr2c_register) (p, r2cf_4, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_5.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_5.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include r2cf.h */
+
+/*
+ * This function contains 12 FP additions, 7 FP multiplications,
+ * (or, 7 additions, 2 multiplications, 5 fused multiply/add),
+ * 17 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E T7, T1, T2, T4, T5;
+	       T7 = R0[0];
+	       T1 = R0[WS(rs, 2)];
+	       T2 = R1[0];
+	       T4 = R0[WS(rs, 1)];
+	       T5 = R1[WS(rs, 1)];
+	       {
+		    E T3, T8, T6, T9, Tc, Ta, Tb;
+		    T3 = T1 - T2;
+		    T8 = T2 + T1;
+		    T6 = T4 - T5;
+		    T9 = T4 + T5;
+		    Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, T3, T6));
+		    Ci[WS(csi, 1)] = KP951056516 * (FNMS(KP618033988, T6, T3));
+		    Tc = T8 - T9;
+		    Ta = T8 + T9;
+		    Tb = FNMS(KP250000000, Ta, T7);
+		    Cr[0] = T7 + Ta;
+		    Cr[WS(csr, 2)] = FNMS(KP559016994, Tc, Tb);
+		    Cr[WS(csr, 1)] = FMA(KP559016994, Tc, Tb);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cf_5", {7, 2, 5, 0}, &GENUS };
+
+void X(codelet_r2cf_5) (planner *p) {
+     X(kr2c_register) (p, r2cf_5, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include r2cf.h */
+
+/*
+ * This function contains 12 FP additions, 6 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 3 fused multiply/add),
+ * 17 stack variables, 4 constants, and 10 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
+	       E Ta, T7, T8, T3, Tb, T6, T9, Tc;
+	       Ta = R0[0];
+	       {
+		    E T1, T2, T4, T5;
+		    T1 = R0[WS(rs, 2)];
+		    T2 = R1[0];
+		    T7 = T2 + T1;
+		    T4 = R0[WS(rs, 1)];
+		    T5 = R1[WS(rs, 1)];
+		    T8 = T4 + T5;
+		    T3 = T1 - T2;
+		    Tb = T7 + T8;
+		    T6 = T4 - T5;
+	       }
+	       Ci[WS(csi, 1)] = FNMS(KP587785252, T6, KP951056516 * T3);
+	       Cr[0] = Ta + Tb;
+	       Ci[WS(csi, 2)] = FMA(KP587785252, T3, KP951056516 * T6);
+	       T9 = KP559016994 * (T7 - T8);
+	       Tc = FNMS(KP250000000, Tb, Ta);
+	       Cr[WS(csr, 1)] = T9 + Tc;
+	       Cr[WS(csr, 2)] = Tc - T9;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 5, "r2cf_5", {9, 3, 3, 0}, &GENUS };
+
+void X(codelet_r2cf_5) (planner *p) {
+     X(kr2c_register) (p, r2cf_5, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include r2cf.h */
+
+/*
+ * This function contains 14 FP additions, 4 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 2 fused multiply/add),
+ * 13 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T4, Td, T3, Tc, T9, T5;
+	       {
+		    E T1, T2, T7, T8;
+		    T1 = R0[0];
+		    T2 = R1[WS(rs, 1)];
+		    T7 = R0[WS(rs, 2)];
+		    T8 = R1[0];
+		    T4 = R0[WS(rs, 1)];
+		    Td = T1 + T2;
+		    T3 = T1 - T2;
+		    Tc = T7 + T8;
+		    T9 = T7 - T8;
+		    T5 = R1[WS(rs, 2)];
+	       }
+	       {
+		    E T6, Tb, Te, Ta;
+		    T6 = T4 - T5;
+		    Tb = T4 + T5;
+		    Te = Tb + Tc;
+		    Ci[WS(csi, 2)] = KP866025403 * (Tb - Tc);
+		    Ta = T6 + T9;
+		    Ci[WS(csi, 1)] = KP866025403 * (T9 - T6);
+		    Cr[0] = Td + Te;
+		    Cr[WS(csr, 2)] = FNMS(KP500000000, Te, Td);
+		    Cr[WS(csr, 3)] = T3 + Ta;
+		    Cr[WS(csr, 1)] = FNMS(KP500000000, Ta, T3);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cf_6", {12, 2, 2, 0}, &GENUS };
+
+void X(codelet_r2cf_6) (planner *p) {
+     X(kr2c_register) (p, r2cf_6, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include r2cf.h */
+
+/*
+ * This function contains 14 FP additions, 4 FP multiplications,
+ * (or, 12 additions, 2 multiplications, 2 fused multiply/add),
+ * 17 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
+	       E T3, Td, T9, Tc, T6, Tb, T1, T2, Ta, Te;
+	       T1 = R0[0];
+	       T2 = R1[WS(rs, 1)];
+	       T3 = T1 - T2;
+	       Td = T1 + T2;
+	       {
+		    E T7, T8, T4, T5;
+		    T7 = R0[WS(rs, 2)];
+		    T8 = R1[0];
+		    T9 = T7 - T8;
+		    Tc = T7 + T8;
+		    T4 = R0[WS(rs, 1)];
+		    T5 = R1[WS(rs, 2)];
+		    T6 = T4 - T5;
+		    Tb = T4 + T5;
+	       }
+	       Ci[WS(csi, 1)] = KP866025403 * (T9 - T6);
+	       Ta = T6 + T9;
+	       Cr[WS(csr, 1)] = FNMS(KP500000000, Ta, T3);
+	       Cr[WS(csr, 3)] = T3 + Ta;
+	       Ci[WS(csi, 2)] = KP866025403 * (Tb - Tc);
+	       Te = Tb + Tc;
+	       Cr[WS(csr, 2)] = FNMS(KP500000000, Te, Td);
+	       Cr[0] = Td + Te;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 6, "r2cf_6", {12, 2, 2, 0}, &GENUS };
+
+void X(codelet_r2cf_6) (planner *p) {
+     X(kr2c_register) (p, r2cf_6, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_64.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_64.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1375 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:47 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -name r2cf_64 -include r2cf.h */
+
+/*
+ * This function contains 394 FP additions, 196 FP multiplications,
+ * (or, 198 additions, 0 multiplications, 196 fused multiply/add),
+ * 133 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP098491403, +0.098491403357164253077197521291327432293052451);
+     DK(KP820678790, +0.820678790828660330972281985331011598767386482);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP534511135, +0.534511135950791641089685961295362908582039528);
+     DK(KP303346683, +0.303346683607342391675883946941299872384187453);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T5n, T5o;
+	       {
+		    E T11, T2j, T4P, T5P, T3D, T5p, T3d, Tf, T1k, T1H, T5D, T4l, T5A, T4a, T3i;
+		    E T2U, T1R, T2e, T5K, T4G, T5H, T4v, T3l, T31, T5s, T42, T5t, T3Z, T2n, T1b;
+		    E T3f, TZ, T5v, T3T, T5w, T3Q, T2m, T18, T3e, TK, T3K, T5Q, T4S, T5q, T14;
+		    E T2k, T3p, Tu, T4w, T1U, T5E, T4h, T5B, T4o, T3j, T2X, T1I, T1z, T1Z, T4A;
+		    E T24, T4x, T1X, T20;
+		    {
+			 E TN, T3V, TS, TX, T3X, TQ, T40, TT;
+			 {
+			      E T1g, T46, T1B, T1G, T47, T1j, T4j, T1C;
+			      {
+				   E T4, T3z, T3, T3B, Td, T5, T8, T9;
+				   {
+					E T1, T2, Tb, Tc;
+					T1 = R0[0];
+					T2 = R0[WS(rs, 16)];
+					Tb = R0[WS(rs, 28)];
+					Tc = R0[WS(rs, 12)];
+					T4 = R0[WS(rs, 8)];
+					T3z = T1 - T2;
+					T3 = T1 + T2;
+					T3B = Tb - Tc;
+					Td = Tb + Tc;
+					T5 = R0[WS(rs, 24)];
+					T8 = R0[WS(rs, 4)];
+					T9 = R0[WS(rs, 20)];
+				   }
+				   {
+					E T1E, T1F, T1h, T1i;
+					{
+					     E T1e, T4N, T6, T3A, Ta, T1f;
+					     T1e = R1[0];
+					     T4N = T4 - T5;
+					     T6 = T4 + T5;
+					     T3A = T8 - T9;
+					     Ta = T8 + T9;
+					     T1f = R1[WS(rs, 16)];
+					     {
+						  E T7, T3C, T4O, Te;
+						  T11 = T3 - T6;
+						  T7 = T3 + T6;
+						  T3C = T3A + T3B;
+						  T4O = T3B - T3A;
+						  T2j = Td - Ta;
+						  Te = Ta + Td;
+						  T4P = FNMS(KP707106781, T4O, T4N);
+						  T5P = FMA(KP707106781, T4O, T4N);
+						  T3D = FMA(KP707106781, T3C, T3z);
+						  T5p = FNMS(KP707106781, T3C, T3z);
+						  T3d = T7 - Te;
+						  Tf = T7 + Te;
+						  T1g = T1e + T1f;
+						  T46 = T1e - T1f;
+					     }
+					}
+					T1E = R1[WS(rs, 4)];
+					T1F = R1[WS(rs, 20)];
+					T1h = R1[WS(rs, 8)];
+					T1i = R1[WS(rs, 24)];
+					T1B = R1[WS(rs, 28)];
+					T1G = T1E + T1F;
+					T47 = T1E - T1F;
+					T1j = T1h + T1i;
+					T4j = T1h - T1i;
+					T1C = R1[WS(rs, 12)];
+				   }
+			      }
+			      {
+				   E T1N, T4r, T28, T2d, T4s, T1Q, T4E, T29;
+				   {
+					E T2b, T2c, T1O, T1P;
+					{
+					     E T2S, T48, T1D, T1L, T1M, T4k, T49, T2T;
+					     T1L = R1[WS(rs, 31)];
+					     T1M = R1[WS(rs, 15)];
+					     T2S = T1g + T1j;
+					     T1k = T1g - T1j;
+					     T48 = T1B - T1C;
+					     T1D = T1B + T1C;
+					     T1N = T1L + T1M;
+					     T4r = T1L - T1M;
+					     T4k = T47 - T48;
+					     T49 = T47 + T48;
+					     T2T = T1G + T1D;
+					     T1H = T1D - T1G;
+					     T5D = FNMS(KP707106781, T4k, T4j);
+					     T4l = FMA(KP707106781, T4k, T4j);
+					     T5A = FNMS(KP707106781, T49, T46);
+					     T4a = FMA(KP707106781, T49, T46);
+					     T3i = T2S - T2T;
+					     T2U = T2S + T2T;
+					     T2b = R1[WS(rs, 3)];
+					     T2c = R1[WS(rs, 19)];
+					}
+					T1O = R1[WS(rs, 7)];
+					T1P = R1[WS(rs, 23)];
+					T28 = R1[WS(rs, 27)];
+					T2d = T2b + T2c;
+					T4s = T2b - T2c;
+					T1Q = T1O + T1P;
+					T4E = T1P - T1O;
+					T29 = R1[WS(rs, 11)];
+				   }
+				   {
+					E TV, TW, TO, TP;
+					{
+					     E T2Z, T4t, T2a, TL, TM, T4F, T4u, T30;
+					     TL = R0[WS(rs, 31)];
+					     TM = R0[WS(rs, 15)];
+					     T2Z = T1N + T1Q;
+					     T1R = T1N - T1Q;
+					     T4t = T28 - T29;
+					     T2a = T28 + T29;
+					     TN = TL + TM;
+					     T3V = TL - TM;
+					     T4F = T4t - T4s;
+					     T4u = T4s + T4t;
+					     T30 = T2d + T2a;
+					     T2e = T2a - T2d;
+					     T5K = FNMS(KP707106781, T4F, T4E);
+					     T4G = FMA(KP707106781, T4F, T4E);
+					     T5H = FNMS(KP707106781, T4u, T4r);
+					     T4v = FMA(KP707106781, T4u, T4r);
+					     T3l = T2Z - T30;
+					     T31 = T2Z + T30;
+					     TV = R0[WS(rs, 27)];
+					     TW = R0[WS(rs, 11)];
+					}
+					TO = R0[WS(rs, 7)];
+					TP = R0[WS(rs, 23)];
+					TS = R0[WS(rs, 3)];
+					TX = TV + TW;
+					T3X = TV - TW;
+					TQ = TO + TP;
+					T40 = TO - TP;
+					TT = R0[WS(rs, 19)];
+				   }
+			      }
+			 }
+			 {
+			      E Ti, T3E, Tn, Ts, T3I, Tl, T3F, To;
+			      {
+				   E Ty, T3M, TD, TI, T3O, TB, T3R, TE;
+				   {
+					E TG, TH, Tz, TA;
+					{
+					     E T19, TR, T3W, TU, Tw, Tx;
+					     Tw = R0[WS(rs, 1)];
+					     Tx = R0[WS(rs, 17)];
+					     T19 = TN - TQ;
+					     TR = TN + TQ;
+					     T3W = TS - TT;
+					     TU = TS + TT;
+					     Ty = Tw + Tx;
+					     T3M = Tw - Tx;
+					     {
+						  E T41, T3Y, T1a, TY;
+						  T41 = T3W - T3X;
+						  T3Y = T3W + T3X;
+						  T1a = TX - TU;
+						  TY = TU + TX;
+						  T5s = FNMS(KP707106781, T41, T40);
+						  T42 = FMA(KP707106781, T41, T40);
+						  T5t = FNMS(KP707106781, T3Y, T3V);
+						  T3Z = FMA(KP707106781, T3Y, T3V);
+						  T2n = FMA(KP414213562, T19, T1a);
+						  T1b = FNMS(KP414213562, T1a, T19);
+						  T3f = TR - TY;
+						  TZ = TR + TY;
+						  TG = R0[WS(rs, 29)];
+						  TH = R0[WS(rs, 13)];
+					     }
+					}
+					Tz = R0[WS(rs, 9)];
+					TA = R0[WS(rs, 25)];
+					TD = R0[WS(rs, 5)];
+					TI = TG + TH;
+					T3O = TG - TH;
+					TB = Tz + TA;
+					T3R = Tz - TA;
+					TE = R0[WS(rs, 21)];
+				   }
+				   {
+					E Tq, Tr, Tj, Tk;
+					{
+					     E T16, TC, T3N, TF, Tg, Th;
+					     Tg = R0[WS(rs, 2)];
+					     Th = R0[WS(rs, 18)];
+					     T16 = Ty - TB;
+					     TC = Ty + TB;
+					     T3N = TD - TE;
+					     TF = TD + TE;
+					     Ti = Tg + Th;
+					     T3E = Tg - Th;
+					     {
+						  E T3S, T3P, T17, TJ;
+						  T3S = T3N - T3O;
+						  T3P = T3N + T3O;
+						  T17 = TI - TF;
+						  TJ = TF + TI;
+						  T5v = FNMS(KP707106781, T3S, T3R);
+						  T3T = FMA(KP707106781, T3S, T3R);
+						  T5w = FNMS(KP707106781, T3P, T3M);
+						  T3Q = FMA(KP707106781, T3P, T3M);
+						  T2m = FNMS(KP414213562, T16, T17);
+						  T18 = FMA(KP414213562, T17, T16);
+						  T3e = TC - TJ;
+						  TK = TC + TJ;
+						  Tq = R0[WS(rs, 6)];
+						  Tr = R0[WS(rs, 22)];
+					     }
+					}
+					Tj = R0[WS(rs, 10)];
+					Tk = R0[WS(rs, 26)];
+					Tn = R0[WS(rs, 30)];
+					Ts = Tq + Tr;
+					T3I = Tq - Tr;
+					Tl = Tj + Tk;
+					T3F = Tj - Tk;
+					To = R0[WS(rs, 14)];
+				   }
+			      }
+			      {
+				   E T1n, T4b, T1s, T4f, T1x, T4c, T1q, T1t;
+				   {
+					E T1v, T1w, T1o, T1p;
+					{
+					     E T1l, T4Q, T3G, Tm, T12, Tp, T3H, T1m;
+					     T1l = R1[WS(rs, 2)];
+					     T4Q = FMA(KP414213562, T3E, T3F);
+					     T3G = FNMS(KP414213562, T3F, T3E);
+					     Tm = Ti + Tl;
+					     T12 = Ti - Tl;
+					     Tp = Tn + To;
+					     T3H = Tn - To;
+					     T1m = R1[WS(rs, 18)];
+					     T1v = R1[WS(rs, 6)];
+					     {
+						  E T4R, T3J, Tt, T13;
+						  T4R = FNMS(KP414213562, T3H, T3I);
+						  T3J = FMA(KP414213562, T3I, T3H);
+						  Tt = Tp + Ts;
+						  T13 = Tp - Ts;
+						  T1n = T1l + T1m;
+						  T4b = T1l - T1m;
+						  T3K = T3G + T3J;
+						  T5Q = T3J - T3G;
+						  T4S = T4Q + T4R;
+						  T5q = T4Q - T4R;
+						  T14 = T12 + T13;
+						  T2k = T13 - T12;
+						  T3p = Tt - Tm;
+						  Tu = Tm + Tt;
+						  T1w = R1[WS(rs, 22)];
+					     }
+					}
+					T1o = R1[WS(rs, 10)];
+					T1p = R1[WS(rs, 26)];
+					T1s = R1[WS(rs, 30)];
+					T4f = T1v - T1w;
+					T1x = T1v + T1w;
+					T4c = T1o - T1p;
+					T1q = T1o + T1p;
+					T1t = R1[WS(rs, 14)];
+				   }
+				   {
+					E T22, T23, T1V, T1W;
+					{
+					     E T1S, T4d, T4m, T2V, T1r, T4e, T1u, T1T;
+					     T1S = R1[WS(rs, 1)];
+					     T4d = FNMS(KP414213562, T4c, T4b);
+					     T4m = FMA(KP414213562, T4b, T4c);
+					     T2V = T1n + T1q;
+					     T1r = T1n - T1q;
+					     T4e = T1s - T1t;
+					     T1u = T1s + T1t;
+					     T1T = R1[WS(rs, 17)];
+					     T22 = R1[WS(rs, 5)];
+					     {
+						  E T4g, T4n, T2W, T1y;
+						  T4g = FMA(KP414213562, T4f, T4e);
+						  T4n = FNMS(KP414213562, T4e, T4f);
+						  T2W = T1u + T1x;
+						  T1y = T1u - T1x;
+						  T4w = T1S - T1T;
+						  T1U = T1S + T1T;
+						  T5E = T4g - T4d;
+						  T4h = T4d + T4g;
+						  T5B = T4m - T4n;
+						  T4o = T4m + T4n;
+						  T3j = T2W - T2V;
+						  T2X = T2V + T2W;
+						  T1I = T1y - T1r;
+						  T1z = T1r + T1y;
+						  T23 = R1[WS(rs, 21)];
+					     }
+					}
+					T1V = R1[WS(rs, 9)];
+					T1W = R1[WS(rs, 25)];
+					T1Z = R1[WS(rs, 29)];
+					T4A = T23 - T22;
+					T24 = T22 + T23;
+					T4x = T1W - T1V;
+					T1X = T1V + T1W;
+					T20 = R1[WS(rs, 13)];
+				   }
+			      }
+			 }
+		    }
+		    {
+			 E T4C, T5L, T4J, T5I, T26, T2f, T3q, T3h, T3w, T3s, T3o, T3r, T3t;
+			 {
+			      E T2R, T37, T2Y, T3a, T39, T3m, T3b, T35, Tv, T10, T34, T3c, T3x, T3y;
+			      {
+				   E T4y, T4H, T32, T1Y, T4z, T21;
+				   T2R = Tf - Tu;
+				   Tv = Tf + Tu;
+				   T4y = FMA(KP414213562, T4x, T4w);
+				   T4H = FNMS(KP414213562, T4w, T4x);
+				   T32 = T1U + T1X;
+				   T1Y = T1U - T1X;
+				   T4z = T1Z - T20;
+				   T21 = T1Z + T20;
+				   T10 = TK + TZ;
+				   T37 = TZ - TK;
+				   T2Y = T2U - T2X;
+				   T3a = T2U + T2X;
+				   {
+					E T4B, T4I, T33, T25;
+					T4B = FNMS(KP414213562, T4A, T4z);
+					T4I = FMA(KP414213562, T4z, T4A);
+					T33 = T21 + T24;
+					T25 = T21 - T24;
+					T39 = Tv + T10;
+					T4C = T4y + T4B;
+					T5L = T4B - T4y;
+					T4J = T4H + T4I;
+					T5I = T4I - T4H;
+					T34 = T32 + T33;
+					T3m = T33 - T32;
+					T26 = T1Y + T25;
+					T2f = T25 - T1Y;
+				   }
+			      }
+			      Cr[WS(csr, 16)] = Tv - T10;
+			      T3b = T31 + T34;
+			      T35 = T31 - T34;
+			      Ci[WS(csi, 16)] = T3b - T3a;
+			      T3c = T3a + T3b;
+			      {
+				   E T3k, T3u, T3v, T3n, T36, T38, T3g;
+				   T3g = T3e + T3f;
+				   T3q = T3f - T3e;
+				   Cr[0] = T39 + T3c;
+				   Cr[WS(csr, 32)] = T39 - T3c;
+				   T36 = T2Y + T35;
+				   T38 = T35 - T2Y;
+				   T3x = FNMS(KP707106781, T3g, T3d);
+				   T3h = FMA(KP707106781, T3g, T3d);
+				   Ci[WS(csi, 8)] = FMA(KP707106781, T38, T37);
+				   Ci[WS(csi, 24)] = FMS(KP707106781, T38, T37);
+				   Cr[WS(csr, 8)] = FMA(KP707106781, T36, T2R);
+				   Cr[WS(csr, 24)] = FNMS(KP707106781, T36, T2R);
+				   T3k = FMA(KP414213562, T3j, T3i);
+				   T3u = FNMS(KP414213562, T3i, T3j);
+				   T3v = FMA(KP414213562, T3l, T3m);
+				   T3n = FNMS(KP414213562, T3m, T3l);
+				   T3y = T3v - T3u;
+				   T3w = T3u + T3v;
+				   T3s = T3n - T3k;
+				   T3o = T3k + T3n;
+			      }
+			      Cr[WS(csr, 12)] = FMA(KP923879532, T3y, T3x);
+			      Cr[WS(csr, 20)] = FNMS(KP923879532, T3y, T3x);
+			 }
+			 Cr[WS(csr, 4)] = FMA(KP923879532, T3o, T3h);
+			 Cr[WS(csr, 28)] = FNMS(KP923879532, T3o, T3h);
+			 T3r = FNMS(KP707106781, T3q, T3p);
+			 T3t = FMA(KP707106781, T3q, T3p);
+			 {
+			      E T27, T2g, T2v, T1d, T2r, T2p, T2s, T1K, T6l, T6m;
+			      {
+				   E T15, T2o, T2P, T2z, T2l, T1c, T1A, T1J, T2D, T2L, T2J, T2M, T2C, T2E, T2N;
+				   E T2F;
+				   {
+					E T2H, T2I, T2x, T2y, T2A, T2B;
+					T15 = FMA(KP707106781, T14, T11);
+					T2x = FNMS(KP707106781, T14, T11);
+					T2y = T2n - T2m;
+					T2o = T2m + T2n;
+					Ci[WS(csi, 4)] = FMA(KP923879532, T3w, T3t);
+					Ci[WS(csi, 28)] = FMS(KP923879532, T3w, T3t);
+					Ci[WS(csi, 20)] = FMA(KP923879532, T3s, T3r);
+					Ci[WS(csi, 12)] = FMS(KP923879532, T3s, T3r);
+					T2P = FNMS(KP923879532, T2y, T2x);
+					T2z = FMA(KP923879532, T2y, T2x);
+					T2l = FMA(KP707106781, T2k, T2j);
+					T2H = FNMS(KP707106781, T2k, T2j);
+					T2I = T1b - T18;
+					T1c = T18 + T1b;
+					T1A = FMA(KP707106781, T1z, T1k);
+					T2A = FNMS(KP707106781, T1z, T1k);
+					T2B = FNMS(KP707106781, T1I, T1H);
+					T1J = FMA(KP707106781, T1I, T1H);
+					T27 = FMA(KP707106781, T26, T1R);
+					T2D = FNMS(KP707106781, T26, T1R);
+					T2L = FNMS(KP923879532, T2I, T2H);
+					T2J = FMA(KP923879532, T2I, T2H);
+					T2M = FMA(KP668178637, T2A, T2B);
+					T2C = FNMS(KP668178637, T2B, T2A);
+					T2E = FNMS(KP707106781, T2f, T2e);
+					T2g = FMA(KP707106781, T2f, T2e);
+				   }
+				   T2N = FNMS(KP668178637, T2D, T2E);
+				   T2F = FMA(KP668178637, T2E, T2D);
+				   T2v = FNMS(KP923879532, T1c, T15);
+				   T1d = FMA(KP923879532, T1c, T15);
+				   {
+					E T2Q, T2O, T2K, T2G;
+					T2Q = T2M - T2N;
+					T2O = T2M + T2N;
+					T2K = T2F - T2C;
+					T2G = T2C + T2F;
+					Cr[WS(csr, 10)] = FMA(KP831469612, T2Q, T2P);
+					Cr[WS(csr, 22)] = FNMS(KP831469612, T2Q, T2P);
+					Ci[WS(csi, 26)] = FNMS(KP831469612, T2O, T2L);
+					Ci[WS(csi, 6)] = -(FMA(KP831469612, T2O, T2L));
+					Ci[WS(csi, 22)] = FMS(KP831469612, T2K, T2J);
+					Ci[WS(csi, 10)] = FMA(KP831469612, T2K, T2J);
+					Cr[WS(csr, 6)] = FMA(KP831469612, T2G, T2z);
+					Cr[WS(csr, 26)] = FNMS(KP831469612, T2G, T2z);
+				   }
+				   T2r = FMA(KP923879532, T2o, T2l);
+				   T2p = FNMS(KP923879532, T2o, T2l);
+				   T2s = FNMS(KP198912367, T1A, T1J);
+				   T1K = FMA(KP198912367, T1J, T1A);
+			      }
+			      {
+				   E T63, T5r, T5R, T6d, T5J, T5M, T6e, T5y, T6j, T6b, T66, T67, T64, T5U, T5Z;
+				   E T5G;
+				   {
+					E T5S, T5u, T5x, T5T, T2t, T2h;
+					T63 = FMA(KP923879532, T5q, T5p);
+					T5r = FNMS(KP923879532, T5q, T5p);
+					T5R = FNMS(KP923879532, T5Q, T5P);
+					T6d = FMA(KP923879532, T5Q, T5P);
+					T2t = FMA(KP198912367, T27, T2g);
+					T2h = FNMS(KP198912367, T2g, T27);
+					T5S = FNMS(KP668178637, T5s, T5t);
+					T5u = FMA(KP668178637, T5t, T5s);
+					{
+					     E T2w, T2u, T2q, T2i;
+					     T2w = T2t - T2s;
+					     T2u = T2s + T2t;
+					     T2q = T2h - T1K;
+					     T2i = T1K + T2h;
+					     Cr[WS(csr, 14)] = FMA(KP980785280, T2w, T2v);
+					     Cr[WS(csr, 18)] = FNMS(KP980785280, T2w, T2v);
+					     Ci[WS(csi, 30)] = FMS(KP980785280, T2u, T2r);
+					     Ci[WS(csi, 2)] = FMA(KP980785280, T2u, T2r);
+					     Ci[WS(csi, 18)] = FMA(KP980785280, T2q, T2p);
+					     Ci[WS(csi, 14)] = FMS(KP980785280, T2q, T2p);
+					     Cr[WS(csr, 2)] = FMA(KP980785280, T2i, T1d);
+					     Cr[WS(csr, 30)] = FNMS(KP980785280, T2i, T1d);
+					     T5x = FNMS(KP668178637, T5w, T5v);
+					     T5T = FMA(KP668178637, T5v, T5w);
+					}
+					{
+					     E T69, T6a, T5C, T5F;
+					     T5J = FNMS(KP923879532, T5I, T5H);
+					     T69 = FMA(KP923879532, T5I, T5H);
+					     T6a = FNMS(KP923879532, T5L, T5K);
+					     T5M = FMA(KP923879532, T5L, T5K);
+					     T6e = T5x + T5u;
+					     T5y = T5u - T5x;
+					     T6j = FNMS(KP303346683, T69, T6a);
+					     T6b = FMA(KP303346683, T6a, T69);
+					     T66 = FMA(KP923879532, T5B, T5A);
+					     T5C = FNMS(KP923879532, T5B, T5A);
+					     T5F = FNMS(KP923879532, T5E, T5D);
+					     T67 = FMA(KP923879532, T5E, T5D);
+					     T64 = T5T + T5S;
+					     T5U = T5S - T5T;
+					     T5Z = FMA(KP534511135, T5C, T5F);
+					     T5G = FNMS(KP534511135, T5F, T5C);
+					}
+				   }
+				   {
+					E T61, T6i, T68, T62;
+					{
+					     E T5z, T5Y, T5N, T5X, T5V, T60, T5W, T5O;
+					     T61 = FNMS(KP831469612, T5y, T5r);
+					     T5z = FMA(KP831469612, T5y, T5r);
+					     T6i = FNMS(KP303346683, T66, T67);
+					     T68 = FMA(KP303346683, T67, T66);
+					     T5Y = FMA(KP534511135, T5J, T5M);
+					     T5N = FNMS(KP534511135, T5M, T5J);
+					     T5X = FNMS(KP831469612, T5U, T5R);
+					     T5V = FMA(KP831469612, T5U, T5R);
+					     T60 = T5Y - T5Z;
+					     T62 = T5Z + T5Y;
+					     T5W = T5N - T5G;
+					     T5O = T5G + T5N;
+					     Ci[WS(csi, 27)] = FMA(KP881921264, T60, T5X);
+					     Ci[WS(csi, 5)] = FMS(KP881921264, T60, T5X);
+					     Cr[WS(csr, 5)] = FMA(KP881921264, T5O, T5z);
+					     Cr[WS(csr, 27)] = FNMS(KP881921264, T5O, T5z);
+					     Ci[WS(csi, 21)] = FMS(KP881921264, T5W, T5V);
+					     Ci[WS(csi, 11)] = FMA(KP881921264, T5W, T5V);
+					}
+					{
+					     E T6g, T6f, T6h, T6k, T65, T6c;
+					     T6l = FNMS(KP831469612, T64, T63);
+					     T65 = FMA(KP831469612, T64, T63);
+					     T6c = T68 + T6b;
+					     T6g = T6b - T68;
+					     T6f = FNMS(KP831469612, T6e, T6d);
+					     T6h = FMA(KP831469612, T6e, T6d);
+					     Cr[WS(csr, 11)] = FMA(KP881921264, T62, T61);
+					     Cr[WS(csr, 21)] = FNMS(KP881921264, T62, T61);
+					     Cr[WS(csr, 3)] = FMA(KP956940335, T6c, T65);
+					     Cr[WS(csr, 29)] = FNMS(KP956940335, T6c, T65);
+					     T6k = T6i - T6j;
+					     T6m = T6i + T6j;
+					     Ci[WS(csi, 29)] = FMS(KP956940335, T6k, T6h);
+					     Ci[WS(csi, 3)] = FMA(KP956940335, T6k, T6h);
+					     Ci[WS(csi, 19)] = FMA(KP956940335, T6g, T6f);
+					     Ci[WS(csi, 13)] = FMS(KP956940335, T6g, T6f);
+					}
+				   }
+			      }
+			      {
+				   E T55, T3L, T4T, T5f, T4D, T4K, T5g, T44, T5l, T5d, T58, T59, T56, T4W, T51;
+				   E T4q;
+				   {
+					E T4U, T3U, T43, T4V;
+					T55 = FNMS(KP923879532, T3K, T3D);
+					T3L = FMA(KP923879532, T3K, T3D);
+					T4T = FMA(KP923879532, T4S, T4P);
+					T5f = FNMS(KP923879532, T4S, T4P);
+					Cr[WS(csr, 13)] = FNMS(KP956940335, T6m, T6l);
+					Cr[WS(csr, 19)] = FMA(KP956940335, T6m, T6l);
+					T4U = FMA(KP198912367, T3Q, T3T);
+					T3U = FNMS(KP198912367, T3T, T3Q);
+					T43 = FMA(KP198912367, T42, T3Z);
+					T4V = FNMS(KP198912367, T3Z, T42);
+					{
+					     E T5b, T5c, T4i, T4p;
+					     T4D = FMA(KP923879532, T4C, T4v);
+					     T5b = FNMS(KP923879532, T4C, T4v);
+					     T5c = FNMS(KP923879532, T4J, T4G);
+					     T4K = FMA(KP923879532, T4J, T4G);
+					     T5g = T43 - T3U;
+					     T44 = T3U + T43;
+					     T5l = FNMS(KP820678790, T5b, T5c);
+					     T5d = FMA(KP820678790, T5c, T5b);
+					     T58 = FNMS(KP923879532, T4h, T4a);
+					     T4i = FMA(KP923879532, T4h, T4a);
+					     T4p = FMA(KP923879532, T4o, T4l);
+					     T59 = FNMS(KP923879532, T4o, T4l);
+					     T56 = T4U - T4V;
+					     T4W = T4U + T4V;
+					     T51 = FMA(KP098491403, T4i, T4p);
+					     T4q = FNMS(KP098491403, T4p, T4i);
+					}
+				   }
+				   {
+					E T53, T5k, T5a, T54;
+					{
+					     E T45, T50, T4L, T4Z, T4X, T52, T4Y, T4M;
+					     T53 = FNMS(KP980785280, T44, T3L);
+					     T45 = FMA(KP980785280, T44, T3L);
+					     T5k = FNMS(KP820678790, T58, T59);
+					     T5a = FMA(KP820678790, T59, T58);
+					     T50 = FMA(KP098491403, T4D, T4K);
+					     T4L = FNMS(KP098491403, T4K, T4D);
+					     T4Z = FMA(KP980785280, T4W, T4T);
+					     T4X = FNMS(KP980785280, T4W, T4T);
+					     T52 = T50 - T51;
+					     T54 = T51 + T50;
+					     T4Y = T4L - T4q;
+					     T4M = T4q + T4L;
+					     Ci[WS(csi, 31)] = FMA(KP995184726, T52, T4Z);
+					     Ci[WS(csi, 1)] = FMS(KP995184726, T52, T4Z);
+					     Cr[WS(csr, 1)] = FMA(KP995184726, T4M, T45);
+					     Cr[WS(csr, 31)] = FNMS(KP995184726, T4M, T45);
+					     Ci[WS(csi, 17)] = FMS(KP995184726, T4Y, T4X);
+					     Ci[WS(csi, 15)] = FMA(KP995184726, T4Y, T4X);
+					}
+					{
+					     E T5i, T5h, T5j, T5m, T57, T5e;
+					     T5n = FNMS(KP980785280, T56, T55);
+					     T57 = FMA(KP980785280, T56, T55);
+					     T5e = T5a + T5d;
+					     T5i = T5d - T5a;
+					     T5h = FNMS(KP980785280, T5g, T5f);
+					     T5j = FMA(KP980785280, T5g, T5f);
+					     Cr[WS(csr, 15)] = FMA(KP995184726, T54, T53);
+					     Cr[WS(csr, 17)] = FNMS(KP995184726, T54, T53);
+					     Cr[WS(csr, 7)] = FMA(KP773010453, T5e, T57);
+					     Cr[WS(csr, 25)] = FNMS(KP773010453, T5e, T57);
+					     T5m = T5k - T5l;
+					     T5o = T5k + T5l;
+					     Ci[WS(csi, 25)] = FMS(KP773010453, T5m, T5j);
+					     Ci[WS(csi, 7)] = FMA(KP773010453, T5m, T5j);
+					     Ci[WS(csi, 23)] = FMA(KP773010453, T5i, T5h);
+					     Ci[WS(csi, 9)] = FMS(KP773010453, T5i, T5h);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Cr[WS(csr, 9)] = FNMS(KP773010453, T5o, T5n);
+	       Cr[WS(csr, 23)] = FMA(KP773010453, T5o, T5n);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cf_64", {198, 0, 196, 0}, &GENUS };
+
+void X(codelet_r2cf_64) (planner *p) {
+     X(kr2c_register) (p, r2cf_64, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 64 -name r2cf_64 -include r2cf.h */
+
+/*
+ * This function contains 394 FP additions, 124 FP multiplications,
+ * (or, 342 additions, 72 multiplications, 52 fused multiply/add),
+ * 106 stack variables, 15 constants, and 128 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP773010453, +0.773010453362736960810906609758469800971041293);
+     DK(KP634393284, +0.634393284163645498215171613225493370675687095);
+     DK(KP098017140, +0.098017140329560601994195563888641845861136673);
+     DK(KP995184726, +0.995184726672196886244836953109479921575474869);
+     DK(KP290284677, +0.290284677254462367636192375817395274691476278);
+     DK(KP956940335, +0.956940335732208864935797886980269969482849206);
+     DK(KP471396736, +0.471396736825997648556387625905254377657460319);
+     DK(KP881921264, +0.881921264348355029712756863660388349508442621);
+     DK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(256, rs), MAKE_VOLATILE_STRIDE(256, csr), MAKE_VOLATILE_STRIDE(256, csi)) {
+	       E T4l, T5a, T15, T3n, T2T, T3Q, T7, Te, Tf, T4A, T4L, T1X, T3B, T23, T3y;
+	       E T5I, T66, T4R, T52, T2j, T3F, T2H, T3I, T5P, T69, T1i, T3t, T1l, T3u, TZ;
+	       E T63, T4v, T58, T1r, T3r, T1u, T3q, TK, T62, T4s, T57, Tm, Tt, Tu, T4o;
+	       E T5b, T1c, T3R, T2Q, T3o, T1M, T3z, T5L, T67, T26, T3C, T4H, T4M, T2y, T3J;
+	       E T5S, T6a, T2C, T3G, T4Y, T53;
+	       {
+		    E T3, T11, Td, T13, T6, T2S, Ta, T12, T14, T2R;
+		    {
+			 E T1, T2, Tb, Tc;
+			 T1 = R0[0];
+			 T2 = R0[WS(rs, 16)];
+			 T3 = T1 + T2;
+			 T11 = T1 - T2;
+			 Tb = R0[WS(rs, 28)];
+			 Tc = R0[WS(rs, 12)];
+			 Td = Tb + Tc;
+			 T13 = Tb - Tc;
+		    }
+		    {
+			 E T4, T5, T8, T9;
+			 T4 = R0[WS(rs, 8)];
+			 T5 = R0[WS(rs, 24)];
+			 T6 = T4 + T5;
+			 T2S = T4 - T5;
+			 T8 = R0[WS(rs, 4)];
+			 T9 = R0[WS(rs, 20)];
+			 Ta = T8 + T9;
+			 T12 = T8 - T9;
+		    }
+		    T4l = T3 - T6;
+		    T5a = Td - Ta;
+		    T14 = KP707106781 * (T12 + T13);
+		    T15 = T11 + T14;
+		    T3n = T11 - T14;
+		    T2R = KP707106781 * (T13 - T12);
+		    T2T = T2R - T2S;
+		    T3Q = T2S + T2R;
+		    T7 = T3 + T6;
+		    Te = Ta + Td;
+		    Tf = T7 + Te;
+	       }
+	       {
+		    E T1P, T4J, T21, T4y, T1S, T4K, T1W, T4z;
+		    {
+			 E T1N, T1O, T1Z, T20;
+			 T1N = R1[WS(rs, 28)];
+			 T1O = R1[WS(rs, 12)];
+			 T1P = T1N - T1O;
+			 T4J = T1N + T1O;
+			 T1Z = R1[0];
+			 T20 = R1[WS(rs, 16)];
+			 T21 = T1Z - T20;
+			 T4y = T1Z + T20;
+		    }
+		    {
+			 E T1Q, T1R, T1U, T1V;
+			 T1Q = R1[WS(rs, 4)];
+			 T1R = R1[WS(rs, 20)];
+			 T1S = T1Q - T1R;
+			 T4K = T1Q + T1R;
+			 T1U = R1[WS(rs, 8)];
+			 T1V = R1[WS(rs, 24)];
+			 T1W = T1U - T1V;
+			 T4z = T1U + T1V;
+		    }
+		    T4A = T4y - T4z;
+		    T4L = T4J - T4K;
+		    {
+			 E T1T, T22, T5G, T5H;
+			 T1T = KP707106781 * (T1P - T1S);
+			 T1X = T1T - T1W;
+			 T3B = T1W + T1T;
+			 T22 = KP707106781 * (T1S + T1P);
+			 T23 = T21 + T22;
+			 T3y = T21 - T22;
+			 T5G = T4y + T4z;
+			 T5H = T4K + T4J;
+			 T5I = T5G + T5H;
+			 T66 = T5G - T5H;
+		    }
+	       }
+	       {
+		    E T2b, T4P, T2G, T4Q, T2e, T51, T2h, T50;
+		    {
+			 E T29, T2a, T2E, T2F;
+			 T29 = R1[WS(rs, 31)];
+			 T2a = R1[WS(rs, 15)];
+			 T2b = T29 - T2a;
+			 T4P = T29 + T2a;
+			 T2E = R1[WS(rs, 7)];
+			 T2F = R1[WS(rs, 23)];
+			 T2G = T2E - T2F;
+			 T4Q = T2E + T2F;
+		    }
+		    {
+			 E T2c, T2d, T2f, T2g;
+			 T2c = R1[WS(rs, 3)];
+			 T2d = R1[WS(rs, 19)];
+			 T2e = T2c - T2d;
+			 T51 = T2c + T2d;
+			 T2f = R1[WS(rs, 27)];
+			 T2g = R1[WS(rs, 11)];
+			 T2h = T2f - T2g;
+			 T50 = T2f + T2g;
+		    }
+		    T4R = T4P - T4Q;
+		    T52 = T50 - T51;
+		    {
+			 E T2i, T2D, T5N, T5O;
+			 T2i = KP707106781 * (T2e + T2h);
+			 T2j = T2b + T2i;
+			 T3F = T2b - T2i;
+			 T2D = KP707106781 * (T2h - T2e);
+			 T2H = T2D - T2G;
+			 T3I = T2G + T2D;
+			 T5N = T4P + T4Q;
+			 T5O = T51 + T50;
+			 T5P = T5N + T5O;
+			 T69 = T5N - T5O;
+		    }
+	       }
+	       {
+		    E TN, T1e, TX, T1g, TQ, T1k, TU, T1f, T1h, T1j;
+		    {
+			 E TL, TM, TV, TW;
+			 TL = R0[WS(rs, 31)];
+			 TM = R0[WS(rs, 15)];
+			 TN = TL + TM;
+			 T1e = TL - TM;
+			 TV = R0[WS(rs, 27)];
+			 TW = R0[WS(rs, 11)];
+			 TX = TV + TW;
+			 T1g = TV - TW;
+		    }
+		    {
+			 E TO, TP, TS, TT;
+			 TO = R0[WS(rs, 7)];
+			 TP = R0[WS(rs, 23)];
+			 TQ = TO + TP;
+			 T1k = TO - TP;
+			 TS = R0[WS(rs, 3)];
+			 TT = R0[WS(rs, 19)];
+			 TU = TS + TT;
+			 T1f = TS - TT;
+		    }
+		    T1h = KP707106781 * (T1f + T1g);
+		    T1i = T1e + T1h;
+		    T3t = T1e - T1h;
+		    T1j = KP707106781 * (T1g - T1f);
+		    T1l = T1j - T1k;
+		    T3u = T1k + T1j;
+		    {
+			 E TR, TY, T4t, T4u;
+			 TR = TN + TQ;
+			 TY = TU + TX;
+			 TZ = TR + TY;
+			 T63 = TR - TY;
+			 T4t = TN - TQ;
+			 T4u = TX - TU;
+			 T4v = FNMS(KP382683432, T4u, KP923879532 * T4t);
+			 T58 = FMA(KP382683432, T4t, KP923879532 * T4u);
+		    }
+	       }
+	       {
+		    E Ty, T1s, TI, T1n, TB, T1q, TF, T1o, T1p, T1t;
+		    {
+			 E Tw, Tx, TG, TH;
+			 Tw = R0[WS(rs, 1)];
+			 Tx = R0[WS(rs, 17)];
+			 Ty = Tw + Tx;
+			 T1s = Tw - Tx;
+			 TG = R0[WS(rs, 29)];
+			 TH = R0[WS(rs, 13)];
+			 TI = TG + TH;
+			 T1n = TG - TH;
+		    }
+		    {
+			 E Tz, TA, TD, TE;
+			 Tz = R0[WS(rs, 9)];
+			 TA = R0[WS(rs, 25)];
+			 TB = Tz + TA;
+			 T1q = Tz - TA;
+			 TD = R0[WS(rs, 5)];
+			 TE = R0[WS(rs, 21)];
+			 TF = TD + TE;
+			 T1o = TD - TE;
+		    }
+		    T1p = KP707106781 * (T1n - T1o);
+		    T1r = T1p - T1q;
+		    T3r = T1q + T1p;
+		    T1t = KP707106781 * (T1o + T1n);
+		    T1u = T1s + T1t;
+		    T3q = T1s - T1t;
+		    {
+			 E TC, TJ, T4q, T4r;
+			 TC = Ty + TB;
+			 TJ = TF + TI;
+			 TK = TC + TJ;
+			 T62 = TC - TJ;
+			 T4q = Ty - TB;
+			 T4r = TI - TF;
+			 T4s = FMA(KP923879532, T4q, KP382683432 * T4r);
+			 T57 = FNMS(KP382683432, T4q, KP923879532 * T4r);
+		    }
+	       }
+	       {
+		    E Ti, T16, Ts, T1a, Tl, T17, Tp, T19, T4m, T4n;
+		    {
+			 E Tg, Th, Tq, Tr;
+			 Tg = R0[WS(rs, 2)];
+			 Th = R0[WS(rs, 18)];
+			 Ti = Tg + Th;
+			 T16 = Tg - Th;
+			 Tq = R0[WS(rs, 6)];
+			 Tr = R0[WS(rs, 22)];
+			 Ts = Tq + Tr;
+			 T1a = Tq - Tr;
+		    }
+		    {
+			 E Tj, Tk, Tn, To;
+			 Tj = R0[WS(rs, 10)];
+			 Tk = R0[WS(rs, 26)];
+			 Tl = Tj + Tk;
+			 T17 = Tj - Tk;
+			 Tn = R0[WS(rs, 30)];
+			 To = R0[WS(rs, 14)];
+			 Tp = Tn + To;
+			 T19 = Tn - To;
+		    }
+		    Tm = Ti + Tl;
+		    Tt = Tp + Ts;
+		    Tu = Tm + Tt;
+		    T4m = Ti - Tl;
+		    T4n = Tp - Ts;
+		    T4o = KP707106781 * (T4m + T4n);
+		    T5b = KP707106781 * (T4n - T4m);
+		    {
+			 E T18, T1b, T2O, T2P;
+			 T18 = FNMS(KP382683432, T17, KP923879532 * T16);
+			 T1b = FMA(KP923879532, T19, KP382683432 * T1a);
+			 T1c = T18 + T1b;
+			 T3R = T1b - T18;
+			 T2O = FNMS(KP923879532, T1a, KP382683432 * T19);
+			 T2P = FMA(KP382683432, T16, KP923879532 * T17);
+			 T2Q = T2O - T2P;
+			 T3o = T2P + T2O;
+		    }
+	       }
+	       {
+		    E T1A, T4E, T1K, T4C, T1D, T4F, T1H, T4B;
+		    {
+			 E T1y, T1z, T1I, T1J;
+			 T1y = R1[WS(rs, 30)];
+			 T1z = R1[WS(rs, 14)];
+			 T1A = T1y - T1z;
+			 T4E = T1y + T1z;
+			 T1I = R1[WS(rs, 10)];
+			 T1J = R1[WS(rs, 26)];
+			 T1K = T1I - T1J;
+			 T4C = T1I + T1J;
+		    }
+		    {
+			 E T1B, T1C, T1F, T1G;
+			 T1B = R1[WS(rs, 6)];
+			 T1C = R1[WS(rs, 22)];
+			 T1D = T1B - T1C;
+			 T4F = T1B + T1C;
+			 T1F = R1[WS(rs, 2)];
+			 T1G = R1[WS(rs, 18)];
+			 T1H = T1F - T1G;
+			 T4B = T1F + T1G;
+		    }
+		    {
+			 E T1E, T1L, T5J, T5K;
+			 T1E = FNMS(KP923879532, T1D, KP382683432 * T1A);
+			 T1L = FMA(KP382683432, T1H, KP923879532 * T1K);
+			 T1M = T1E - T1L;
+			 T3z = T1L + T1E;
+			 T5J = T4B + T4C;
+			 T5K = T4E + T4F;
+			 T5L = T5J + T5K;
+			 T67 = T5K - T5J;
+		    }
+		    {
+			 E T24, T25, T4D, T4G;
+			 T24 = FNMS(KP382683432, T1K, KP923879532 * T1H);
+			 T25 = FMA(KP923879532, T1A, KP382683432 * T1D);
+			 T26 = T24 + T25;
+			 T3C = T25 - T24;
+			 T4D = T4B - T4C;
+			 T4G = T4E - T4F;
+			 T4H = KP707106781 * (T4D + T4G);
+			 T4M = KP707106781 * (T4G - T4D);
+		    }
+	       }
+	       {
+		    E T2m, T4S, T2w, T4W, T2p, T4T, T2t, T4V;
+		    {
+			 E T2k, T2l, T2u, T2v;
+			 T2k = R1[WS(rs, 1)];
+			 T2l = R1[WS(rs, 17)];
+			 T2m = T2k - T2l;
+			 T4S = T2k + T2l;
+			 T2u = R1[WS(rs, 5)];
+			 T2v = R1[WS(rs, 21)];
+			 T2w = T2u - T2v;
+			 T4W = T2u + T2v;
+		    }
+		    {
+			 E T2n, T2o, T2r, T2s;
+			 T2n = R1[WS(rs, 9)];
+			 T2o = R1[WS(rs, 25)];
+			 T2p = T2n - T2o;
+			 T4T = T2n + T2o;
+			 T2r = R1[WS(rs, 29)];
+			 T2s = R1[WS(rs, 13)];
+			 T2t = T2r - T2s;
+			 T4V = T2r + T2s;
+		    }
+		    {
+			 E T2q, T2x, T5Q, T5R;
+			 T2q = FNMS(KP382683432, T2p, KP923879532 * T2m);
+			 T2x = FMA(KP923879532, T2t, KP382683432 * T2w);
+			 T2y = T2q + T2x;
+			 T3J = T2x - T2q;
+			 T5Q = T4S + T4T;
+			 T5R = T4V + T4W;
+			 T5S = T5Q + T5R;
+			 T6a = T5R - T5Q;
+		    }
+		    {
+			 E T2A, T2B, T4U, T4X;
+			 T2A = FNMS(KP923879532, T2w, KP382683432 * T2t);
+			 T2B = FMA(KP382683432, T2m, KP923879532 * T2p);
+			 T2C = T2A - T2B;
+			 T3G = T2B + T2A;
+			 T4U = T4S - T4T;
+			 T4X = T4V - T4W;
+			 T4Y = KP707106781 * (T4U + T4X);
+			 T53 = KP707106781 * (T4X - T4U);
+		    }
+	       }
+	       {
+		    E Tv, T10, T5X, T5Y, T5Z, T60;
+		    Tv = Tf + Tu;
+		    T10 = TK + TZ;
+		    T5X = Tv + T10;
+		    T5Y = T5I + T5L;
+		    T5Z = T5P + T5S;
+		    T60 = T5Y + T5Z;
+		    Cr[WS(csr, 16)] = Tv - T10;
+		    Ci[WS(csi, 16)] = T5Z - T5Y;
+		    Cr[WS(csr, 32)] = T5X - T60;
+		    Cr[0] = T5X + T60;
+	       }
+	       {
+		    E T5F, T5V, T5U, T5W, T5M, T5T;
+		    T5F = Tf - Tu;
+		    T5V = TZ - TK;
+		    T5M = T5I - T5L;
+		    T5T = T5P - T5S;
+		    T5U = KP707106781 * (T5M + T5T);
+		    T5W = KP707106781 * (T5T - T5M);
+		    Cr[WS(csr, 24)] = T5F - T5U;
+		    Ci[WS(csi, 24)] = T5W - T5V;
+		    Cr[WS(csr, 8)] = T5F + T5U;
+		    Ci[WS(csi, 8)] = T5V + T5W;
+	       }
+	       {
+		    E T65, T6l, T6k, T6m, T6c, T6g, T6f, T6h;
+		    {
+			 E T61, T64, T6i, T6j;
+			 T61 = T7 - Te;
+			 T64 = KP707106781 * (T62 + T63);
+			 T65 = T61 + T64;
+			 T6l = T61 - T64;
+			 T6i = FNMS(KP382683432, T66, KP923879532 * T67);
+			 T6j = FMA(KP382683432, T69, KP923879532 * T6a);
+			 T6k = T6i + T6j;
+			 T6m = T6j - T6i;
+		    }
+		    {
+			 E T68, T6b, T6d, T6e;
+			 T68 = FMA(KP923879532, T66, KP382683432 * T67);
+			 T6b = FNMS(KP382683432, T6a, KP923879532 * T69);
+			 T6c = T68 + T6b;
+			 T6g = T6b - T68;
+			 T6d = KP707106781 * (T63 - T62);
+			 T6e = Tt - Tm;
+			 T6f = T6d - T6e;
+			 T6h = T6e + T6d;
+		    }
+		    Cr[WS(csr, 28)] = T65 - T6c;
+		    Ci[WS(csi, 28)] = T6k - T6h;
+		    Cr[WS(csr, 4)] = T65 + T6c;
+		    Ci[WS(csi, 4)] = T6h + T6k;
+		    Ci[WS(csi, 12)] = T6f + T6g;
+		    Cr[WS(csr, 12)] = T6l + T6m;
+		    Ci[WS(csi, 20)] = T6g - T6f;
+		    Cr[WS(csr, 20)] = T6l - T6m;
+	       }
+	       {
+		    E T5n, T5D, T5x, T5z, T5q, T5A, T5t, T5B;
+		    {
+			 E T5l, T5m, T5v, T5w;
+			 T5l = T4l - T4o;
+			 T5m = T58 - T57;
+			 T5n = T5l + T5m;
+			 T5D = T5l - T5m;
+			 T5v = T4v - T4s;
+			 T5w = T5b - T5a;
+			 T5x = T5v - T5w;
+			 T5z = T5w + T5v;
+		    }
+		    {
+			 E T5o, T5p, T5r, T5s;
+			 T5o = T4A - T4H;
+			 T5p = T4M - T4L;
+			 T5q = FMA(KP831469612, T5o, KP555570233 * T5p);
+			 T5A = FNMS(KP555570233, T5o, KP831469612 * T5p);
+			 T5r = T4R - T4Y;
+			 T5s = T53 - T52;
+			 T5t = FNMS(KP555570233, T5s, KP831469612 * T5r);
+			 T5B = FMA(KP555570233, T5r, KP831469612 * T5s);
+		    }
+		    {
+			 E T5u, T5C, T5y, T5E;
+			 T5u = T5q + T5t;
+			 Cr[WS(csr, 26)] = T5n - T5u;
+			 Cr[WS(csr, 6)] = T5n + T5u;
+			 T5C = T5A + T5B;
+			 Ci[WS(csi, 6)] = T5z + T5C;
+			 Ci[WS(csi, 26)] = T5C - T5z;
+			 T5y = T5t - T5q;
+			 Ci[WS(csi, 10)] = T5x + T5y;
+			 Ci[WS(csi, 22)] = T5y - T5x;
+			 T5E = T5B - T5A;
+			 Cr[WS(csr, 22)] = T5D - T5E;
+			 Cr[WS(csr, 10)] = T5D + T5E;
+		    }
+	       }
+	       {
+		    E T4x, T5j, T5d, T5f, T4O, T5g, T55, T5h;
+		    {
+			 E T4p, T4w, T59, T5c;
+			 T4p = T4l + T4o;
+			 T4w = T4s + T4v;
+			 T4x = T4p + T4w;
+			 T5j = T4p - T4w;
+			 T59 = T57 + T58;
+			 T5c = T5a + T5b;
+			 T5d = T59 - T5c;
+			 T5f = T5c + T59;
+		    }
+		    {
+			 E T4I, T4N, T4Z, T54;
+			 T4I = T4A + T4H;
+			 T4N = T4L + T4M;
+			 T4O = FMA(KP980785280, T4I, KP195090322 * T4N);
+			 T5g = FNMS(KP195090322, T4I, KP980785280 * T4N);
+			 T4Z = T4R + T4Y;
+			 T54 = T52 + T53;
+			 T55 = FNMS(KP195090322, T54, KP980785280 * T4Z);
+			 T5h = FMA(KP195090322, T4Z, KP980785280 * T54);
+		    }
+		    {
+			 E T56, T5i, T5e, T5k;
+			 T56 = T4O + T55;
+			 Cr[WS(csr, 30)] = T4x - T56;
+			 Cr[WS(csr, 2)] = T4x + T56;
+			 T5i = T5g + T5h;
+			 Ci[WS(csi, 2)] = T5f + T5i;
+			 Ci[WS(csi, 30)] = T5i - T5f;
+			 T5e = T55 - T4O;
+			 Ci[WS(csi, 14)] = T5d + T5e;
+			 Ci[WS(csi, 18)] = T5e - T5d;
+			 T5k = T5h - T5g;
+			 Cr[WS(csr, 18)] = T5j - T5k;
+			 Cr[WS(csr, 14)] = T5j + T5k;
+		    }
+	       }
+	       {
+		    E T3p, T41, T4c, T3S, T3w, T4b, T49, T4h, T3P, T42, T3E, T3W, T46, T4g, T3L;
+		    E T3X;
+		    {
+			 E T3s, T3v, T3A, T3D;
+			 T3p = T3n + T3o;
+			 T41 = T3n - T3o;
+			 T4c = T3R - T3Q;
+			 T3S = T3Q + T3R;
+			 T3s = FMA(KP831469612, T3q, KP555570233 * T3r);
+			 T3v = FNMS(KP555570233, T3u, KP831469612 * T3t);
+			 T3w = T3s + T3v;
+			 T4b = T3v - T3s;
+			 {
+			      E T47, T48, T3N, T3O;
+			      T47 = T3F - T3G;
+			      T48 = T3J - T3I;
+			      T49 = FNMS(KP471396736, T48, KP881921264 * T47);
+			      T4h = FMA(KP471396736, T47, KP881921264 * T48);
+			      T3N = FNMS(KP555570233, T3q, KP831469612 * T3r);
+			      T3O = FMA(KP555570233, T3t, KP831469612 * T3u);
+			      T3P = T3N + T3O;
+			      T42 = T3O - T3N;
+			 }
+			 T3A = T3y + T3z;
+			 T3D = T3B + T3C;
+			 T3E = FMA(KP956940335, T3A, KP290284677 * T3D);
+			 T3W = FNMS(KP290284677, T3A, KP956940335 * T3D);
+			 {
+			      E T44, T45, T3H, T3K;
+			      T44 = T3y - T3z;
+			      T45 = T3C - T3B;
+			      T46 = FMA(KP881921264, T44, KP471396736 * T45);
+			      T4g = FNMS(KP471396736, T44, KP881921264 * T45);
+			      T3H = T3F + T3G;
+			      T3K = T3I + T3J;
+			      T3L = FNMS(KP290284677, T3K, KP956940335 * T3H);
+			      T3X = FMA(KP290284677, T3H, KP956940335 * T3K);
+			 }
+		    }
+		    {
+			 E T3x, T3M, T3V, T3Y;
+			 T3x = T3p + T3w;
+			 T3M = T3E + T3L;
+			 Cr[WS(csr, 29)] = T3x - T3M;
+			 Cr[WS(csr, 3)] = T3x + T3M;
+			 T3V = T3S + T3P;
+			 T3Y = T3W + T3X;
+			 Ci[WS(csi, 3)] = T3V + T3Y;
+			 Ci[WS(csi, 29)] = T3Y - T3V;
+		    }
+		    {
+			 E T3T, T3U, T3Z, T40;
+			 T3T = T3P - T3S;
+			 T3U = T3L - T3E;
+			 Ci[WS(csi, 13)] = T3T + T3U;
+			 Ci[WS(csi, 19)] = T3U - T3T;
+			 T3Z = T3p - T3w;
+			 T40 = T3X - T3W;
+			 Cr[WS(csr, 19)] = T3Z - T40;
+			 Cr[WS(csr, 13)] = T3Z + T40;
+		    }
+		    {
+			 E T43, T4a, T4f, T4i;
+			 T43 = T41 + T42;
+			 T4a = T46 + T49;
+			 Cr[WS(csr, 27)] = T43 - T4a;
+			 Cr[WS(csr, 5)] = T43 + T4a;
+			 T4f = T4c + T4b;
+			 T4i = T4g + T4h;
+			 Ci[WS(csi, 5)] = T4f + T4i;
+			 Ci[WS(csi, 27)] = T4i - T4f;
+		    }
+		    {
+			 E T4d, T4e, T4j, T4k;
+			 T4d = T4b - T4c;
+			 T4e = T49 - T46;
+			 Ci[WS(csi, 11)] = T4d + T4e;
+			 Ci[WS(csi, 21)] = T4e - T4d;
+			 T4j = T41 - T42;
+			 T4k = T4h - T4g;
+			 Cr[WS(csr, 21)] = T4j - T4k;
+			 Cr[WS(csr, 11)] = T4j + T4k;
+		    }
+	       }
+	       {
+		    E T1d, T33, T3e, T2U, T1w, T3d, T3b, T3j, T2N, T34, T28, T2Y, T38, T3i, T2J;
+		    E T2Z;
+		    {
+			 E T1m, T1v, T1Y, T27;
+			 T1d = T15 - T1c;
+			 T33 = T15 + T1c;
+			 T3e = T2T + T2Q;
+			 T2U = T2Q - T2T;
+			 T1m = FMA(KP195090322, T1i, KP980785280 * T1l);
+			 T1v = FNMS(KP195090322, T1u, KP980785280 * T1r);
+			 T1w = T1m - T1v;
+			 T3d = T1v + T1m;
+			 {
+			      E T39, T3a, T2L, T2M;
+			      T39 = T2j + T2y;
+			      T3a = T2H + T2C;
+			      T3b = FNMS(KP098017140, T3a, KP995184726 * T39);
+			      T3j = FMA(KP995184726, T3a, KP098017140 * T39);
+			      T2L = FNMS(KP195090322, T1l, KP980785280 * T1i);
+			      T2M = FMA(KP980785280, T1u, KP195090322 * T1r);
+			      T2N = T2L - T2M;
+			      T34 = T2M + T2L;
+			 }
+			 T1Y = T1M - T1X;
+			 T27 = T23 - T26;
+			 T28 = FMA(KP634393284, T1Y, KP773010453 * T27);
+			 T2Y = FNMS(KP634393284, T27, KP773010453 * T1Y);
+			 {
+			      E T36, T37, T2z, T2I;
+			      T36 = T1X + T1M;
+			      T37 = T23 + T26;
+			      T38 = FMA(KP098017140, T36, KP995184726 * T37);
+			      T3i = FNMS(KP098017140, T37, KP995184726 * T36);
+			      T2z = T2j - T2y;
+			      T2I = T2C - T2H;
+			      T2J = FNMS(KP634393284, T2I, KP773010453 * T2z);
+			      T2Z = FMA(KP773010453, T2I, KP634393284 * T2z);
+			 }
+		    }
+		    {
+			 E T1x, T2K, T2X, T30;
+			 T1x = T1d + T1w;
+			 T2K = T28 + T2J;
+			 Cr[WS(csr, 25)] = T1x - T2K;
+			 Cr[WS(csr, 7)] = T1x + T2K;
+			 T2X = T2U + T2N;
+			 T30 = T2Y + T2Z;
+			 Ci[WS(csi, 7)] = T2X + T30;
+			 Ci[WS(csi, 25)] = T30 - T2X;
+		    }
+		    {
+			 E T2V, T2W, T31, T32;
+			 T2V = T2N - T2U;
+			 T2W = T2J - T28;
+			 Ci[WS(csi, 9)] = T2V + T2W;
+			 Ci[WS(csi, 23)] = T2W - T2V;
+			 T31 = T1d - T1w;
+			 T32 = T2Z - T2Y;
+			 Cr[WS(csr, 23)] = T31 - T32;
+			 Cr[WS(csr, 9)] = T31 + T32;
+		    }
+		    {
+			 E T35, T3c, T3h, T3k;
+			 T35 = T33 + T34;
+			 T3c = T38 + T3b;
+			 Cr[WS(csr, 31)] = T35 - T3c;
+			 Cr[WS(csr, 1)] = T35 + T3c;
+			 T3h = T3e + T3d;
+			 T3k = T3i + T3j;
+			 Ci[WS(csi, 1)] = T3h + T3k;
+			 Ci[WS(csi, 31)] = T3k - T3h;
+		    }
+		    {
+			 E T3f, T3g, T3l, T3m;
+			 T3f = T3d - T3e;
+			 T3g = T3b - T38;
+			 Ci[WS(csi, 15)] = T3f + T3g;
+			 Ci[WS(csi, 17)] = T3g - T3f;
+			 T3l = T33 - T34;
+			 T3m = T3j - T3i;
+			 Cr[WS(csr, 17)] = T3l - T3m;
+			 Cr[WS(csr, 15)] = T3l + T3m;
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 64, "r2cf_64", {342, 72, 52, 0}, &GENUS };
+
+void X(codelet_r2cf_64) (planner *p) {
+     X(kr2c_register) (p, r2cf_64, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_7.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_7.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include r2cf.h */
+
+/*
+ * This function contains 24 FP additions, 18 FP multiplications,
+ * (or, 9 additions, 3 multiplications, 15 fused multiply/add),
+ * 25 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
+     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
+     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E T1, Tg, Tc;
+	       {
+		    E Th, T4, Ti, Ta, Tj, T7, Td, T5, T6, Tl, Tk;
+		    T1 = R0[0];
+		    {
+			 E T2, T3, T8, T9;
+			 T2 = R1[0];
+			 T3 = R0[WS(rs, 3)];
+			 T8 = R1[WS(rs, 1)];
+			 T9 = R0[WS(rs, 2)];
+			 T5 = R0[WS(rs, 1)];
+			 Th = T3 - T2;
+			 T4 = T2 + T3;
+			 T6 = R1[WS(rs, 2)];
+			 Ti = T9 - T8;
+			 Ta = T8 + T9;
+		    }
+		    Tj = T6 - T5;
+		    T7 = T5 + T6;
+		    Td = FNMS(KP356895867, T4, Ta);
+		    Tl = FMA(KP554958132, Ti, Th);
+		    Tk = FMA(KP554958132, Tj, Ti);
+		    {
+			 E Tm, Tf, Tb, Te;
+			 Tm = FNMS(KP554958132, Th, Tj);
+			 Cr[0] = T1 + T4 + T7 + Ta;
+			 Tf = FNMS(KP356895867, T7, T4);
+			 Tb = FNMS(KP356895867, Ta, T7);
+			 Te = FNMS(KP692021471, Td, T7);
+			 Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, Tk, Th));
+			 Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, Tm, Ti));
+			 Tg = FNMS(KP692021471, Tf, Ta);
+			 Tc = FNMS(KP692021471, Tb, T4);
+			 Cr[WS(csr, 2)] = FNMS(KP900968867, Te, T1);
+			 Ci[WS(csi, 1)] = KP974927912 * (FMA(KP801937735, Tl, Tj));
+		    }
+	       }
+	       Cr[WS(csr, 1)] = FNMS(KP900968867, Tg, T1);
+	       Cr[WS(csr, 3)] = FNMS(KP900968867, Tc, T1);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cf_7", {9, 3, 15, 0}, &GENUS };
+
+void X(codelet_r2cf_7) (planner *p) {
+     X(kr2c_register) (p, r2cf_7, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include r2cf.h */
+
+/*
+ * This function contains 24 FP additions, 18 FP multiplications,
+ * (or, 12 additions, 6 multiplications, 12 fused multiply/add),
+ * 20 stack variables, 6 constants, and 14 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
+     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
+     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
+     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
+     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
+     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
+	       E T1, Ta, Tb, T4, Td, T7, Tc, T8, T9;
+	       T1 = R0[0];
+	       T8 = R1[0];
+	       T9 = R0[WS(rs, 3)];
+	       Ta = T8 + T9;
+	       Tb = T9 - T8;
+	       {
+		    E T2, T3, T5, T6;
+		    T2 = R0[WS(rs, 1)];
+		    T3 = R1[WS(rs, 2)];
+		    T4 = T2 + T3;
+		    Td = T3 - T2;
+		    T5 = R1[WS(rs, 1)];
+		    T6 = R0[WS(rs, 2)];
+		    T7 = T5 + T6;
+		    Tc = T6 - T5;
+	       }
+	       Ci[WS(csi, 2)] = FNMS(KP781831482, Tc, KP974927912 * Tb) - (KP433883739 * Td);
+	       Ci[WS(csi, 1)] = FMA(KP781831482, Tb, KP974927912 * Td) + (KP433883739 * Tc);
+	       Cr[WS(csr, 2)] = FMA(KP623489801, T7, T1) + FNMA(KP900968867, T4, KP222520933 * Ta);
+	       Ci[WS(csi, 3)] = FMA(KP433883739, Tb, KP974927912 * Tc) - (KP781831482 * Td);
+	       Cr[WS(csr, 3)] = FMA(KP623489801, T4, T1) + FNMA(KP222520933, T7, KP900968867 * Ta);
+	       Cr[WS(csr, 1)] = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
+	       Cr[0] = T1 + Ta + T4 + T7;
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 7, "r2cf_7", {12, 6, 12, 0}, &GENUS };
+
+void X(codelet_r2cf_7) (planner *p) {
+     X(kr2c_register) (p, r2cf_7, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include r2cf.h */
+
+/*
+ * This function contains 20 FP additions, 4 FP multiplications,
+ * (or, 16 additions, 0 multiplications, 4 fused multiply/add),
+ * 18 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T4, T7, T3, Tj, Td, T5, T8, T9;
+	       {
+		    E T1, T2, Tb, Tc;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 2)];
+		    Tb = R1[WS(rs, 3)];
+		    Tc = R1[WS(rs, 1)];
+		    T4 = R0[WS(rs, 1)];
+		    T7 = T1 - T2;
+		    T3 = T1 + T2;
+		    Tj = Tb + Tc;
+		    Td = Tb - Tc;
+		    T5 = R0[WS(rs, 3)];
+		    T8 = R1[0];
+		    T9 = R1[WS(rs, 2)];
+	       }
+	       {
+		    E T6, Tf, Ta, Ti;
+		    T6 = T4 + T5;
+		    Tf = T4 - T5;
+		    Ta = T8 - T9;
+		    Ti = T8 + T9;
+		    {
+			 E Th, Tk, Te, Tg;
+			 Th = T3 + T6;
+			 Cr[WS(csr, 2)] = T3 - T6;
+			 Tk = Ti + Tj;
+			 Ci[WS(csi, 2)] = Tj - Ti;
+			 Te = Ta + Td;
+			 Tg = Td - Ta;
+			 Cr[0] = Th + Tk;
+			 Cr[WS(csr, 4)] = Th - Tk;
+			 Ci[WS(csi, 3)] = FMA(KP707106781, Tg, Tf);
+			 Ci[WS(csi, 1)] = FMS(KP707106781, Tg, Tf);
+			 Cr[WS(csr, 1)] = FMA(KP707106781, Te, T7);
+			 Cr[WS(csr, 3)] = FNMS(KP707106781, Te, T7);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cf_8", {16, 0, 4, 0}, &GENUS };
+
+void X(codelet_r2cf_8) (planner *p) {
+     X(kr2c_register) (p, r2cf_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include r2cf.h */
+
+/*
+ * This function contains 20 FP additions, 2 FP multiplications,
+ * (or, 20 additions, 2 multiplications, 0 fused multiply/add),
+ * 14 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
+	       E T3, T7, Td, Tj, T6, Tg, Ta, Ti;
+	       {
+		    E T1, T2, Tb, Tc;
+		    T1 = R0[0];
+		    T2 = R0[WS(rs, 2)];
+		    T3 = T1 + T2;
+		    T7 = T1 - T2;
+		    Tb = R1[WS(rs, 3)];
+		    Tc = R1[WS(rs, 1)];
+		    Td = Tb - Tc;
+		    Tj = Tb + Tc;
+	       }
+	       {
+		    E T4, T5, T8, T9;
+		    T4 = R0[WS(rs, 1)];
+		    T5 = R0[WS(rs, 3)];
+		    T6 = T4 + T5;
+		    Tg = T4 - T5;
+		    T8 = R1[0];
+		    T9 = R1[WS(rs, 2)];
+		    Ta = T8 - T9;
+		    Ti = T8 + T9;
+	       }
+	       Cr[WS(csr, 2)] = T3 - T6;
+	       Ci[WS(csi, 2)] = Tj - Ti;
+	       {
+		    E Te, Tf, Th, Tk;
+		    Te = KP707106781 * (Ta + Td);
+		    Cr[WS(csr, 3)] = T7 - Te;
+		    Cr[WS(csr, 1)] = T7 + Te;
+		    Tf = KP707106781 * (Td - Ta);
+		    Ci[WS(csi, 1)] = Tf - Tg;
+		    Ci[WS(csi, 3)] = Tg + Tf;
+		    Th = T3 + T6;
+		    Tk = Ti + Tj;
+		    Cr[WS(csr, 4)] = Th - Tk;
+		    Cr[0] = Th + Tk;
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 8, "r2cf_8", {20, 2, 0, 0}, &GENUS };
+
+void X(codelet_r2cf_8) (planner *p) {
+     X(kr2c_register) (p, r2cf_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_9.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cf/r2cf_9.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:39:45 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include r2cf.h */
+
+/*
+ * This function contains 38 FP additions, 30 FP multiplications,
+ * (or, 12 additions, 4 multiplications, 26 fused multiply/add),
+ * 57 stack variables, 18 constants, and 18 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP907603734, +0.907603734547952313649323976213898122064543220);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP347296355, +0.347296355333860697703433253538629592000751354);
+     DK(KP666666666, +0.666666666666666666666666666666666666666666667);
+     DK(KP879385241, +0.879385241571816768108218554649462939872416269);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP673648177, +0.673648177666930348851716626769314796000375677);
+     DK(KP898197570, +0.898197570222573798468955502359086394667167570);
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP203604859, +0.203604859554852403062088995281827210665664861);
+     DK(KP152703644, +0.152703644666139302296566746461370407999248646);
+     DK(KP394930843, +0.394930843634698457567117349190734585290304520);
+     DK(KP968908795, +0.968908795874236621082202410917456709164223497);
+     DK(KP726681596, +0.726681596905677465811651808188092531873167623);
+     DK(KP586256827, +0.586256827714544512072145703099641959914944179);
+     DK(KP184792530, +0.184792530904095372701352047572203755870913560);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E Tp, Tz, Tw, Ts, TA;
+	       {
+		    E T1, T6, Tb, T7, T4, To, T8, Tc, Td, T2, T3;
+		    T1 = R0[0];
+		    T2 = R1[WS(rs, 1)];
+		    T3 = R0[WS(rs, 3)];
+		    T6 = R1[0];
+		    Tb = R0[WS(rs, 1)];
+		    T7 = R0[WS(rs, 2)];
+		    T4 = T2 + T3;
+		    To = T3 - T2;
+		    T8 = R1[WS(rs, 3)];
+		    Tc = R1[WS(rs, 2)];
+		    Td = R0[WS(rs, 4)];
+		    {
+			 E T5, T9, Tk, Te, Ti;
+			 T5 = T1 + T4;
+			 Tp = FNMS(KP500000000, T4, T1);
+			 T9 = T7 + T8;
+			 Tk = T7 - T8;
+			 Te = Tc + Td;
+			 Ti = Td - Tc;
+			 {
+			      E Tl, Ta, Tu, Tf, Th;
+			      Tl = FMS(KP500000000, T9, T6);
+			      Ta = T6 + T9;
+			      Tu = FMA(KP184792530, Tk, Ti);
+			      Tf = Tb + Te;
+			      Th = FNMS(KP500000000, Te, Tb);
+			      {
+				   E Tq, Ty, Tm, Tt;
+				   Tq = FMA(KP586256827, Tl, Ti);
+				   Ty = FMA(KP726681596, Tk, Tl);
+				   Tm = FNMS(KP968908795, Tl, Tk);
+				   Tt = FMA(KP394930843, Th, To);
+				   {
+					E Tj, Tx, Tg, Tv;
+					Tj = FNMS(KP152703644, Ti, Th);
+					Tx = FMA(KP203604859, Th, Ti);
+					Tg = Ta + Tf;
+					Ci[WS(csi, 3)] = KP866025403 * (Tf - Ta);
+					Tv = FNMS(KP939692620, Tu, Tt);
+					{
+					     E TB, Tn, TC, Tr;
+					     TB = FMA(KP898197570, Ty, Tx);
+					     Tz = FNMS(KP898197570, Ty, Tx);
+					     Tw = FNMS(KP673648177, Tm, Tj);
+					     Tn = FMA(KP673648177, Tm, Tj);
+					     Cr[0] = T5 + Tg;
+					     Cr[WS(csr, 3)] = FNMS(KP500000000, Tg, T5);
+					     Ci[WS(csi, 2)] = KP984807753 * (FNMS(KP879385241, Tv, Tl));
+					     Ci[WS(csi, 1)] = -(KP984807753 * (FNMS(KP879385241, To, Tn)));
+					     TC = FMA(KP666666666, Tn, TB);
+					     Tr = FNMS(KP347296355, Tq, Tk);
+					     Ci[WS(csi, 4)] = KP866025403 * (FMA(KP852868531, TC, To));
+					     Ts = FNMS(KP907603734, Tr, Th);
+					}
+				   }
+			      }
+			 }
+		    }
+	       }
+	       Cr[WS(csr, 1)] = FMA(KP852868531, Tz, Tp);
+	       TA = FNMS(KP500000000, Tz, Tw);
+	       Cr[WS(csr, 2)] = FNMS(KP939692620, Ts, Tp);
+	       Cr[WS(csr, 4)] = FMA(KP852868531, TA, Tp);
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cf_9", {12, 4, 26, 0}, &GENUS };
+
+void X(codelet_r2cf_9) (planner *p) {
+     X(kr2c_register) (p, r2cf_9, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include r2cf.h */
+
+/*
+ * This function contains 38 FP additions, 26 FP multiplications,
+ * (or, 21 additions, 9 multiplications, 17 fused multiply/add),
+ * 36 stack variables, 14 constants, and 18 memory accesses
+ */
+#include "r2cf.h"
+
+static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
+{
+     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
+     DK(KP296198132, +0.296198132726023843175338011893050938967728390);
+     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
+     DK(KP813797681, +0.813797681349373692844693217248393223289101568);
+     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
+     DK(KP150383733, +0.150383733180435296639271897612501926072238258);
+     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
+     DK(KP663413948, +0.663413948168938396205421319635891297216863310);
+     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
+     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
+     DK(KP556670399, +0.556670399226419366452912952047023132968291906);
+     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
+     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
+	       E T1, T4, Tr, Ta, Tl, Ti, Tf, Tk, Tj, T2, T3, T5, Tg;
+	       T1 = R0[0];
+	       T2 = R1[WS(rs, 1)];
+	       T3 = R0[WS(rs, 3)];
+	       T4 = T2 + T3;
+	       Tr = T3 - T2;
+	       {
+		    E T6, T7, T8, T9;
+		    T6 = R1[0];
+		    T7 = R0[WS(rs, 2)];
+		    T8 = R1[WS(rs, 3)];
+		    T9 = T7 + T8;
+		    Ta = T6 + T9;
+		    Tl = T8 - T7;
+		    Ti = FNMS(KP500000000, T9, T6);
+	       }
+	       {
+		    E Tb, Tc, Td, Te;
+		    Tb = R0[WS(rs, 1)];
+		    Tc = R1[WS(rs, 2)];
+		    Td = R0[WS(rs, 4)];
+		    Te = Tc + Td;
+		    Tf = Tb + Te;
+		    Tk = FNMS(KP500000000, Te, Tb);
+		    Tj = Td - Tc;
+	       }
+	       Ci[WS(csi, 3)] = KP866025403 * (Tf - Ta);
+	       T5 = T1 + T4;
+	       Tg = Ta + Tf;
+	       Cr[WS(csr, 3)] = FNMS(KP500000000, Tg, T5);
+	       Cr[0] = T5 + Tg;
+	       {
+		    E Tt, Th, Tm, Tn, To, Tp, Tq, Ts;
+		    Tt = KP866025403 * Tr;
+		    Th = FNMS(KP500000000, T4, T1);
+		    Tm = FMA(KP766044443, Ti, KP556670399 * Tl);
+		    Tn = FMA(KP173648177, Tk, KP852868531 * Tj);
+		    To = Tm + Tn;
+		    Tp = FNMS(KP642787609, Ti, KP663413948 * Tl);
+		    Tq = FNMS(KP984807753, Tk, KP150383733 * Tj);
+		    Ts = Tp + Tq;
+		    Cr[WS(csr, 1)] = Th + To;
+		    Ci[WS(csi, 1)] = Tt + Ts;
+		    Cr[WS(csr, 4)] = FMA(KP866025403, Tp - Tq, Th) - (KP500000000 * To);
+		    Ci[WS(csi, 4)] = FNMS(KP500000000, Ts, KP866025403 * (Tr + (Tn - Tm)));
+		    Ci[WS(csi, 2)] = FNMS(KP342020143, Tk, KP813797681 * Tj) + FNMA(KP150383733, Tl, KP984807753 * Ti) - Tt;
+		    Cr[WS(csr, 2)] = FMA(KP173648177, Ti, Th) + FNMA(KP296198132, Tj, KP939692620 * Tk) - (KP852868531 * Tl);
+	       }
+	  }
+     }
+}
+
+static const kr2c_desc desc = { 9, "r2cf_9", {21, 9, 17, 0}, &GENUS };
+
+void X(codelet_r2cf_9) (planner *p) {
+     X(kr2c_register) (p, r2cf_9, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2cfII.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2cfII.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_r2cfII_genus)
+extern const kr2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-rdft.h"
+#include "r2r.h"
+
+const kr2r_genus GENUS = { 1 };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#define GENUS X(rdft_r2r_genus)
+extern const kr2r_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,90 @@
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+noinst_LTLIBRARIES = librdft_scalar_r2r.la
+
+###########################################################################
+# The following lines specify the REDFT/RODFT/DHT sizes for which to generate
+# specialized codelets.  Currently, only REDFT01/10 of size 8 (used in JPEG).
+
+# e<a><b>_<n> is a hard-coded REDFT<a><b> FFT (DCT) of size <n>
+E00 = # e00_2.c e00_3.c e00_4.c e00_5.c e00_6.c e00_7.c e00_8.c
+E01 = e01_8.c # e01_2.c e01_3.c e01_4.c e01_5.c e01_6.c e01_7.c
+E10 = e10_8.c # e10_2.c e10_3.c e10_4.c e10_5.c e10_6.c e10_7.c
+E11 = # e11_2.c e11_3.c e11_4.c e11_5.c e11_6.c e11_7.c e11_8.c
+
+# o<a><b>_<n> is a hard-coded RODFT<a><b> FFT (DST) of size <n>
+O00 = # o00_2.c o00_3.c o00_4.c o00_5.c o00_6.c o00_7.c o00_8.c
+O01 = # o01_2.c o01_3.c o01_4.c o01_5.c o01_6.c o01_7.c o01_8.c
+O10 = # o10_2.c o10_3.c o10_4.c o10_5.c o10_6.c o10_7.c o10_8.c
+O11 = # o11_2.c o11_3.c o11_4.c o11_5.c o11_6.c o11_7.c o11_8.c
+
+# dht_<n> is a hard-coded DHT of size <n>
+DHT = # dht_2.c dht_3.c dht_4.c dht_5.c dht_6.c dht_7.c dht_8.c
+
+###########################################################################
+ALL_CODELETS = $(E00) $(E01) $(E10) $(E11) $(O00) $(O01) $(O10) $(O11) $(DHT)
+
+BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
+
+librdft_scalar_r2r_la_SOURCES = $(BUILT_SOURCES)
+
+SOLVTAB_NAME = X(solvtab_rdft_r2r)
+XRENAME=X
+
+# special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+FLAGS_E00=$(RDFT_FLAGS_COMMON)
+FLAGS_E01=$(RDFT_FLAGS_COMMON)
+FLAGS_E10=$(RDFT_FLAGS_COMMON)
+FLAGS_E11=$(RDFT_FLAGS_COMMON)
+FLAGS_O00=$(RDFT_FLAGS_COMMON)
+FLAGS_O01=$(RDFT_FLAGS_COMMON)
+FLAGS_O10=$(RDFT_FLAGS_COMMON)
+FLAGS_O11=$(RDFT_FLAGS_COMMON)
+FLAGS_DHT=$(RDFT_FLAGS_COMMON)
+
+e00_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E00) -redft00 -n $* -name e00_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+e01_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E01) -redft01 -n $* -name e01_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+e10_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E10) -redft10 -n $* -name e10_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+e11_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E11) -redft11 -n $* -name e11_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+
+o00_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O00) -rodft00 -n $* -name o00_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+o01_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O01) -rodft01 -n $* -name o01_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+o10_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O10) -rodft10 -n $* -name o10_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+o11_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O11) -rodft11 -n $* -name o11_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+
+dht_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_DHT) -dht -sign 1 -n $* -name dht_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,699 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This Makefile.am specifies a set of codelets, efficient transforms
+# of small sizes, that are used as building blocks (kernels) by FFTW
+# to build up large transforms, as well as the options for generating
+# and compiling them.
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = rdft/scalar/r2r
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_scalar_r2r_la_LIBADD =
+am__objects_1 =
+am__objects_2 = e01_8.lo
+am__objects_3 = e10_8.lo
+am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
+	$(am__objects_1) $(am__objects_1) $(am__objects_1) \
+	$(am__objects_1) $(am__objects_1) $(am__objects_1)
+am__objects_5 = codlist.lo
+am__objects_6 = $(am__objects_4) $(am__objects_5)
+am_librdft_scalar_r2r_la_OBJECTS = $(am__objects_6)
+librdft_scalar_r2r_la_OBJECTS = $(am_librdft_scalar_r2r_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_scalar_r2r_la_SOURCES)
+DIST_SOURCES = $(librdft_scalar_r2r_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+###########################################################################
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/scalar
+
+noinst_LTLIBRARIES = librdft_scalar_r2r.la
+
+###########################################################################
+# The following lines specify the REDFT/RODFT/DHT sizes for which to generate
+# specialized codelets.  Currently, only REDFT01/10 of size 8 (used in JPEG).
+
+# e<a><b>_<n> is a hard-coded REDFT<a><b> FFT (DCT) of size <n>
+E00 = # e00_2.c e00_3.c e00_4.c e00_5.c e00_6.c e00_7.c e00_8.c
+E01 = e01_8.c # e01_2.c e01_3.c e01_4.c e01_5.c e01_6.c e01_7.c
+E10 = e10_8.c # e10_2.c e10_3.c e10_4.c e10_5.c e10_6.c e10_7.c
+E11 = # e11_2.c e11_3.c e11_4.c e11_5.c e11_6.c e11_7.c e11_8.c
+
+# o<a><b>_<n> is a hard-coded RODFT<a><b> FFT (DST) of size <n>
+O00 = # o00_2.c o00_3.c o00_4.c o00_5.c o00_6.c o00_7.c o00_8.c
+O01 = # o01_2.c o01_3.c o01_4.c o01_5.c o01_6.c o01_7.c o01_8.c
+O10 = # o10_2.c o10_3.c o10_4.c o10_5.c o10_6.c o10_7.c o10_8.c
+O11 = # o11_2.c o11_3.c o11_4.c o11_5.c o11_6.c o11_7.c o11_8.c
+
+# dht_<n> is a hard-coded DHT of size <n>
+DHT = # dht_2.c dht_3.c dht_4.c dht_5.c dht_6.c dht_7.c dht_8.c
+
+###########################################################################
+ALL_CODELETS = $(E00) $(E01) $(E10) $(E11) $(O00) $(O01) $(O10) $(O11) $(DHT)
+BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
+librdft_scalar_r2r_la_SOURCES = $(BUILT_SOURCES)
+SOLVTAB_NAME = X(solvtab_rdft_r2r)
+XRENAME = X
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@FLAGS_E00 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_E01 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_E10 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_E11 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_O00 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_O01 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_O10 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_O11 = $(RDFT_FLAGS_COMMON)
+@MAINTAINER_MODE_TRUE@FLAGS_DHT = $(RDFT_FLAGS_COMMON)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/r2r/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/scalar/r2r/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_scalar_r2r.la: $(librdft_scalar_r2r_la_OBJECTS) $(librdft_scalar_r2r_la_DEPENDENCIES) $(EXTRA_librdft_scalar_r2r_la_DEPENDENCIES) 
+	$(LINK)  $(librdft_scalar_r2r_la_OBJECTS) $(librdft_scalar_r2r_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/e01_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/e10_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic \
+	maintainer-clean-local mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@e00_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E00) -redft00 -n $* -name e00_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@e01_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E01) -redft01 -n $* -name e01_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@e10_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E10) -redft10 -n $* -name e10_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@e11_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_E11) -redft11 -n $* -name e11_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@o00_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O00) -rodft00 -n $* -name o00_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@o01_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O01) -rodft01 -n $* -name o01_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@o10_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O10) -rodft10 -n $* -name o10_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@o11_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_O11) -rodft11 -n $* -name o11_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@dht_%.c:  $(CODELET_DEPS) $(GEN_R2R)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2R) $(FLAGS_DHT) -dht -sign 1 -n $* -name dht_$* -include "r2r.h") | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+#include "ifftw.h"
+
+
+extern void X(codelet_e01_8)(planner *);
+extern void X(codelet_e10_8)(planner *);
+
+
+extern const solvtab X(solvtab_rdft_r2r);
+const solvtab X(solvtab_rdft_r2r) = {
+   SOLVTAB(X(codelet_e01_8)),
+   SOLVTAB(X(codelet_e10_8)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r/e01_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r/e01_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:27 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2r.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -redft01 -n 8 -name e01_8 -include r2r.h */
+
+/*
+ * This function contains 26 FP additions, 24 FP multiplications,
+ * (or, 2 additions, 0 multiplications, 24 fused multiply/add),
+ * 27 stack variables, 8 constants, and 16 memory accesses
+ */
+#include "r2r.h"
+
+static void e01_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T8, Td, Th, T7, Tp, Tl, Te, Tb;
+	       {
+		    E Tj, T3, Tk, T6, T9, Ta;
+		    {
+			 E T1, T2, T4, T5;
+			 T1 = I[0];
+			 T2 = I[WS(is, 4)];
+			 T4 = I[WS(is, 2)];
+			 T5 = I[WS(is, 6)];
+			 T8 = I[WS(is, 1)];
+			 Tj = FNMS(KP1_414213562, T2, T1);
+			 T3 = FMA(KP1_414213562, T2, T1);
+			 Tk = FMS(KP414213562, T4, T5);
+			 T6 = FMA(KP414213562, T5, T4);
+			 Td = I[WS(is, 7)];
+			 T9 = I[WS(is, 5)];
+			 Ta = I[WS(is, 3)];
+		    }
+		    Th = FNMS(KP1_847759065, T6, T3);
+		    T7 = FMA(KP1_847759065, T6, T3);
+		    Tp = FNMS(KP1_847759065, Tk, Tj);
+		    Tl = FMA(KP1_847759065, Tk, Tj);
+		    Te = Ta - T9;
+		    Tb = T9 + Ta;
+	       }
+	       {
+		    E Tn, Tf, Tc, Tm;
+		    Tn = FNMS(KP707106781, Te, Td);
+		    Tf = FMA(KP707106781, Te, Td);
+		    Tc = FMA(KP707106781, Tb, T8);
+		    Tm = FNMS(KP707106781, Tb, T8);
+		    {
+			 E Tq, To, Tg, Ti;
+			 Tq = FMA(KP668178637, Tm, Tn);
+			 To = FNMS(KP668178637, Tn, Tm);
+			 Tg = FMA(KP198912367, Tf, Tc);
+			 Ti = FNMS(KP198912367, Tc, Tf);
+			 O[WS(os, 1)] = FMA(KP1_662939224, To, Tl);
+			 O[WS(os, 6)] = FNMS(KP1_662939224, To, Tl);
+			 O[WS(os, 2)] = FMA(KP1_662939224, Tq, Tp);
+			 O[WS(os, 5)] = FNMS(KP1_662939224, Tq, Tp);
+			 O[WS(os, 4)] = FMA(KP1_961570560, Ti, Th);
+			 O[WS(os, 3)] = FNMS(KP1_961570560, Ti, Th);
+			 O[0] = FMA(KP1_961570560, Tg, T7);
+			 O[WS(os, 7)] = FNMS(KP1_961570560, Tg, T7);
+		    }
+	       }
+	  }
+     }
+}
+
+static const kr2r_desc desc = { 8, "e01_8", {2, 0, 24, 0}, &GENUS, REDFT01 };
+
+void X(codelet_e01_8) (planner *p) {
+     X(kr2r_register) (p, e01_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2r.native -compact -variables 4 -pipeline-latency 4 -redft01 -n 8 -name e01_8 -include r2r.h */
+
+/*
+ * This function contains 26 FP additions, 15 FP multiplications,
+ * (or, 20 additions, 9 multiplications, 6 fused multiply/add),
+ * 28 stack variables, 8 constants, and 16 memory accesses
+ */
+#include "r2r.h"
+
+static void e01_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T7, Tl, T4, Tk, Td, To, Tg, Tn;
+	       {
+		    E T5, T6, T1, T3, T2;
+		    T5 = I[WS(is, 2)];
+		    T6 = I[WS(is, 6)];
+		    T7 = FMA(KP1_847759065, T5, KP765366864 * T6);
+		    Tl = FNMS(KP1_847759065, T6, KP765366864 * T5);
+		    T1 = I[0];
+		    T2 = I[WS(is, 4)];
+		    T3 = KP1_414213562 * T2;
+		    T4 = T1 + T3;
+		    Tk = T1 - T3;
+		    {
+			 E T9, Tf, Tc, Te, Ta, Tb;
+			 T9 = I[WS(is, 1)];
+			 Tf = I[WS(is, 7)];
+			 Ta = I[WS(is, 5)];
+			 Tb = I[WS(is, 3)];
+			 Tc = KP707106781 * (Ta + Tb);
+			 Te = KP707106781 * (Ta - Tb);
+			 Td = T9 + Tc;
+			 To = Te + Tf;
+			 Tg = Te - Tf;
+			 Tn = T9 - Tc;
+		    }
+	       }
+	       {
+		    E T8, Th, Tq, Tr;
+		    T8 = T4 + T7;
+		    Th = FNMS(KP390180644, Tg, KP1_961570560 * Td);
+		    O[WS(os, 7)] = T8 - Th;
+		    O[0] = T8 + Th;
+		    Tq = Tk - Tl;
+		    Tr = FMA(KP1_111140466, Tn, KP1_662939224 * To);
+		    O[WS(os, 5)] = Tq - Tr;
+		    O[WS(os, 2)] = Tq + Tr;
+	       }
+	       {
+		    E Ti, Tj, Tm, Tp;
+		    Ti = T4 - T7;
+		    Tj = FMA(KP390180644, Td, KP1_961570560 * Tg);
+		    O[WS(os, 4)] = Ti - Tj;
+		    O[WS(os, 3)] = Ti + Tj;
+		    Tm = Tk + Tl;
+		    Tp = FNMS(KP1_111140466, To, KP1_662939224 * Tn);
+		    O[WS(os, 6)] = Tm - Tp;
+		    O[WS(os, 1)] = Tm + Tp;
+	       }
+	  }
+     }
+}
+
+static const kr2r_desc desc = { 8, "e01_8", {20, 9, 6, 0}, &GENUS, REDFT01 };
+
+void X(codelet_e01_8) (planner *p) {
+     X(kr2r_register) (p, e01_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/scalar/r2r/e10_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/scalar/r2r/e10_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:27 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_r2r.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -redft10 -n 8 -name e10_8 -include r2r.h */
+
+/*
+ * This function contains 26 FP additions, 18 FP multiplications,
+ * (or, 16 additions, 8 multiplications, 10 fused multiply/add),
+ * 28 stack variables, 9 constants, and 16 memory accesses
+ */
+#include "r2r.h"
+
+static void e10_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T3, Te, Tl, Tp, Tm, T6, Tn, T9;
+	       {
+		    E T4, Tj, Tk, T5, T7, T8;
+		    {
+			 E T1, T2, Tc, Td;
+			 T1 = I[0];
+			 T2 = I[WS(is, 7)];
+			 Tc = I[WS(is, 4)];
+			 Td = I[WS(is, 3)];
+			 T4 = I[WS(is, 2)];
+			 Tj = T1 + T2;
+			 T3 = T1 - T2;
+			 Tk = Tc + Td;
+			 Te = Tc - Td;
+			 T5 = I[WS(is, 5)];
+			 T7 = I[WS(is, 1)];
+			 T8 = I[WS(is, 6)];
+		    }
+		    Tl = Tj - Tk;
+		    Tp = Tj + Tk;
+		    Tm = T4 + T5;
+		    T6 = T4 - T5;
+		    Tn = T7 + T8;
+		    T9 = T7 - T8;
+	       }
+	       {
+		    E Tg, Ti, Tb, Th;
+		    {
+			 E Tq, To, Ta, Tf;
+			 Tq = Tm + Tn;
+			 To = Tm - Tn;
+			 Ta = T6 + T9;
+			 Tf = T6 - T9;
+			 O[WS(os, 6)] = KP1_847759065 * (FMA(KP414213562, Tl, To));
+			 O[WS(os, 2)] = KP1_847759065 * (FNMS(KP414213562, To, Tl));
+			 O[0] = KP2_000000000 * (Tp + Tq);
+			 O[WS(os, 4)] = KP1_414213562 * (Tp - Tq);
+			 Tg = FNMS(KP707106781, Tf, Te);
+			 Ti = FMA(KP707106781, Tf, Te);
+			 Tb = FNMS(KP707106781, Ta, T3);
+			 Th = FMA(KP707106781, Ta, T3);
+		    }
+		    O[WS(os, 7)] = KP1_961570560 * (FMA(KP198912367, Th, Ti));
+		    O[WS(os, 1)] = KP1_961570560 * (FNMS(KP198912367, Ti, Th));
+		    O[WS(os, 5)] = -(KP1_662939224 * (FNMS(KP668178637, Tb, Tg)));
+		    O[WS(os, 3)] = KP1_662939224 * (FMA(KP668178637, Tg, Tb));
+	       }
+	  }
+     }
+}
+
+static const kr2r_desc desc = { 8, "e10_8", {16, 8, 10, 0}, &GENUS, REDFT10 };
+
+void X(codelet_e10_8) (planner *p) {
+     X(kr2r_register) (p, e10_8, &desc);
+}
+
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_r2r.native -compact -variables 4 -pipeline-latency 4 -redft10 -n 8 -name e10_8 -include r2r.h */
+
+/*
+ * This function contains 26 FP additions, 16 FP multiplications,
+ * (or, 20 additions, 10 multiplications, 6 fused multiply/add),
+ * 28 stack variables, 9 constants, and 16 memory accesses
+ */
+#include "r2r.h"
+
+static void e10_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
+{
+     DK(KP765366864, +0.765366864730179543456919968060797733522689125);
+     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
+     DK(KP390180644, +0.390180644032256535696569736954044481855383236);
+     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
+     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
+     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
+     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
+     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
+     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT i;
+	  for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
+	       E T3, Tj, Tf, Tk, Ta, Tn, Tc, Tm;
+	       {
+		    E T1, T2, Td, Te;
+		    T1 = I[0];
+		    T2 = I[WS(is, 7)];
+		    T3 = T1 - T2;
+		    Tj = T1 + T2;
+		    Td = I[WS(is, 4)];
+		    Te = I[WS(is, 3)];
+		    Tf = Td - Te;
+		    Tk = Td + Te;
+		    {
+			 E T4, T5, T6, T7, T8, T9;
+			 T4 = I[WS(is, 2)];
+			 T5 = I[WS(is, 5)];
+			 T6 = T4 - T5;
+			 T7 = I[WS(is, 1)];
+			 T8 = I[WS(is, 6)];
+			 T9 = T7 - T8;
+			 Ta = KP707106781 * (T6 + T9);
+			 Tn = T7 + T8;
+			 Tc = KP707106781 * (T6 - T9);
+			 Tm = T4 + T5;
+		    }
+	       }
+	       {
+		    E Tb, Tg, Tp, Tq;
+		    Tb = T3 - Ta;
+		    Tg = Tc - Tf;
+		    O[WS(os, 3)] = FNMS(KP1_111140466, Tg, KP1_662939224 * Tb);
+		    O[WS(os, 5)] = FMA(KP1_662939224, Tg, KP1_111140466 * Tb);
+		    Tp = Tj + Tk;
+		    Tq = Tm + Tn;
+		    O[WS(os, 4)] = KP1_414213562 * (Tp - Tq);
+		    O[0] = KP2_000000000 * (Tp + Tq);
+	       }
+	       {
+		    E Th, Ti, Tl, To;
+		    Th = T3 + Ta;
+		    Ti = Tf + Tc;
+		    O[WS(os, 1)] = FNMS(KP390180644, Ti, KP1_961570560 * Th);
+		    O[WS(os, 7)] = FMA(KP1_961570560, Ti, KP390180644 * Th);
+		    Tl = Tj - Tk;
+		    To = Tm - Tn;
+		    O[WS(os, 2)] = FNMS(KP765366864, To, KP1_847759065 * Tl);
+		    O[WS(os, 6)] = FMA(KP765366864, Tl, KP1_847759065 * To);
+	       }
+	  }
+     }
+}
+
+static const kr2r_desc desc = { 8, "e10_8", {20, 10, 6, 0}, &GENUS, REDFT10 };
+
+void X(codelet_e10_8) (planner *p) {
+     X(kr2r_register) (p, e10_8, &desc);
+}
+
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+SUBDIRS = common sse2 avx altivec neon
+EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,617 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = rdft/simd
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+SUBDIRS = common sse2 avx altivec neon
+EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk
+all: all-recursive
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	ctags ctags-recursive distclean distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs installdirs-am maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(ALTIVEC_CFLAGS)
+SIMD_HEADER=simd-altivec.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_ALTIVEC
+
+noinst_LTLIBRARIES = librdft_altivec_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,613 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/rdft/simd/codlist.mk \
+	$(top_srcdir)/rdft/simd/simd.mk
+subdir = rdft/simd/altivec
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_altivec_codelets_la_LIBADD =
+am__librdft_altivec_codelets_la_SOURCES_DIST = hc2cfdftv_2.c \
+	hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c hc2cfdftv_10.c \
+	hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c hc2cfdftv_20.c \
+	hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c \
+	hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c \
+	hc2cbdftv_20.c genus.c codlist.c
+am__objects_1 = hc2cfdftv_2.lo hc2cfdftv_4.lo hc2cfdftv_6.lo \
+	hc2cfdftv_8.lo hc2cfdftv_10.lo hc2cfdftv_12.lo hc2cfdftv_16.lo \
+	hc2cfdftv_32.lo hc2cfdftv_20.lo
+am__objects_2 = hc2cbdftv_2.lo hc2cbdftv_4.lo hc2cbdftv_6.lo \
+	hc2cbdftv_8.lo hc2cbdftv_10.lo hc2cbdftv_12.lo hc2cbdftv_16.lo \
+	hc2cbdftv_32.lo hc2cbdftv_20.lo
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am__objects_4 = $(am__objects_3) genus.lo codlist.lo
+@HAVE_ALTIVEC_TRUE@am__objects_5 = $(am__objects_4)
+@HAVE_ALTIVEC_TRUE@am_librdft_altivec_codelets_la_OBJECTS =  \
+@HAVE_ALTIVEC_TRUE@	$(am__objects_5)
+librdft_altivec_codelets_la_OBJECTS =  \
+	$(am_librdft_altivec_codelets_la_OBJECTS)
+@HAVE_ALTIVEC_TRUE@am_librdft_altivec_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_altivec_codelets_la_SOURCES)
+DIST_SOURCES = $(am__librdft_altivec_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(ALTIVEC_CFLAGS)
+SIMD_HEADER = simd-altivec.h
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_ALTIVEC_TRUE@noinst_LTLIBRARIES = librdft_altivec_codelets.la
+@HAVE_ALTIVEC_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_ALTIVEC_TRUE@librdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/altivec/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/altivec/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_altivec_codelets.la: $(librdft_altivec_codelets_la_OBJECTS) $(librdft_altivec_codelets_la_DEPENDENCIES) $(EXTRA_librdft_altivec_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_librdft_altivec_codelets_la_rpath) $(librdft_altivec_codelets_la_OBJECTS) $(librdft_altivec_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cbdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cbdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/altivec/hc2cfdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-altivec.h"
+#include "../common/hc2cfdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,15 @@
+AM_CFLAGS = $(AVX_CFLAGS)
+SIMD_HEADER=simd-avx.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_AVX
+
+noinst_LTLIBRARIES = librdft_avx_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_avx_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,612 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/rdft/simd/codlist.mk \
+	$(top_srcdir)/rdft/simd/simd.mk
+subdir = rdft/simd/avx
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_avx_codelets_la_LIBADD =
+am__librdft_avx_codelets_la_SOURCES_DIST = hc2cfdftv_2.c hc2cfdftv_4.c \
+	hc2cfdftv_6.c hc2cfdftv_8.c hc2cfdftv_10.c hc2cfdftv_12.c \
+	hc2cfdftv_16.c hc2cfdftv_32.c hc2cfdftv_20.c hc2cbdftv_2.c \
+	hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c hc2cbdftv_10.c \
+	hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c hc2cbdftv_20.c \
+	genus.c codlist.c
+am__objects_1 = hc2cfdftv_2.lo hc2cfdftv_4.lo hc2cfdftv_6.lo \
+	hc2cfdftv_8.lo hc2cfdftv_10.lo hc2cfdftv_12.lo hc2cfdftv_16.lo \
+	hc2cfdftv_32.lo hc2cfdftv_20.lo
+am__objects_2 = hc2cbdftv_2.lo hc2cbdftv_4.lo hc2cbdftv_6.lo \
+	hc2cbdftv_8.lo hc2cbdftv_10.lo hc2cbdftv_12.lo hc2cbdftv_16.lo \
+	hc2cbdftv_32.lo hc2cbdftv_20.lo
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am__objects_4 = $(am__objects_3) genus.lo codlist.lo
+@HAVE_AVX_TRUE@am__objects_5 = $(am__objects_4)
+@HAVE_AVX_TRUE@am_librdft_avx_codelets_la_OBJECTS = $(am__objects_5)
+librdft_avx_codelets_la_OBJECTS =  \
+	$(am_librdft_avx_codelets_la_OBJECTS)
+@HAVE_AVX_TRUE@am_librdft_avx_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_avx_codelets_la_SOURCES)
+DIST_SOURCES = $(am__librdft_avx_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(AVX_CFLAGS)
+SIMD_HEADER = simd-avx.h
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_AVX_TRUE@noinst_LTLIBRARIES = librdft_avx_codelets.la
+@HAVE_AVX_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_AVX_TRUE@librdft_avx_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/avx/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/avx/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_avx_codelets.la: $(librdft_avx_codelets_la_OBJECTS) $(librdft_avx_codelets_la_DEPENDENCIES) $(EXTRA_librdft_avx_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_librdft_avx_codelets_la_rpath) $(librdft_avx_codelets_la_OBJECTS) $(librdft_avx_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cbdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cbdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/avx/hc2cfdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-avx.h"
+#include "../common/hc2cfdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/codlist.mk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/codlist.mk	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,22 @@
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,24 @@
+# include the list of codelets
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+
+ALL_CODELETS = $(SIMD_CODELETS)
+BUILT_SOURCES= $(SIMD_CODELETS) $(CODLIST)
+EXTRA_DIST = $(BUILT_SOURCES) genus.c
+INCLUDE_SIMD_HEADER="\#include SIMD_HEADER"
+XRENAME=XSIMD
+SOLVTAB_NAME = XSIMD(solvtab_rdft)
+
+# include special rules for regenerating codelets.
+include $(top_srcdir)/support/Makefile.codelets
+
+if MAINTAINER_MODE
+FLAGS_HC2C=-simd $(FLAGS_COMMON) -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw
+
+hc2cfdftv_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT_C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dit -name hc2cfdftv_$* -include "hc2cfv.h") | $(ADD_DATE) | $(INDENT) >$@
+
+hc2cbdftv_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT_C)
+	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dif -sign 1 -name hc2cbdftv_$* -include "hc2cbv.h") | $(ADD_DATE) | $(INDENT) >$@
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,534 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# include the list of codelets
+
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/rdft/simd/codlist.mk \
+	$(top_srcdir)/support/Makefile.codelets
+subdir = rdft/simd/common
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
+ALL_CODELETS = $(SIMD_CODELETS)
+BUILT_SOURCES = $(SIMD_CODELETS) $(CODLIST)
+EXTRA_DIST = $(BUILT_SOURCES) genus.c
+INCLUDE_SIMD_HEADER = "\#include SIMD_HEADER"
+XRENAME = XSIMD
+SOLVTAB_NAME = XSIMD(solvtab_rdft)
+CODLIST = codlist.c
+CODELET_NAME = codelet_
+@MAINTAINER_MODE_TRUE@INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
+@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
+@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
+@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
+@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE) 
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
+@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
+@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
+@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# include special rules for regenerating codelets.
+@MAINTAINER_MODE_TRUE@FLAGS_HC2C = -simd $(FLAGS_COMMON) -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/common/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/common/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic \
+	maintainer-clean-local
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	distclean distclean-generic distclean-libtool distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic maintainer-clean-local mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	uninstall uninstall-am
+
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+# cancel the hideous builtin rules that cause an infinite loop
+@MAINTAINER_MODE_TRUE@%: %.o
+@MAINTAINER_MODE_TRUE@%: %.s
+@MAINTAINER_MODE_TRUE@%: %.c
+@MAINTAINER_MODE_TRUE@%: %.S
+
+@MAINTAINER_MODE_TRUE@hc2cfdftv_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dit -name hc2cfdftv_$* -include "hc2cfv.h") | $(ADD_DATE) | $(INDENT) >$@
+
+@MAINTAINER_MODE_TRUE@hc2cbdftv_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT_C)
+@MAINTAINER_MODE_TRUE@	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dif -sign 1 -name hc2cbdftv_$* -include "hc2cbv.h") | $(ADD_DATE) | $(INDENT) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,45 @@
+#include "ifftw.h"
+#include SIMD_HEADER
+
+extern void XSIMD(codelet_hc2cfdftv_2)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_4)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_6)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_8)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_10)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_12)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_16)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_32)(planner *);
+extern void XSIMD(codelet_hc2cfdftv_20)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_2)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_4)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_6)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_8)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_10)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_12)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_16)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_32)(planner *);
+extern void XSIMD(codelet_hc2cbdftv_20)(planner *);
+
+
+extern const solvtab XSIMD(solvtab_rdft);
+const solvtab XSIMD(solvtab_rdft) = {
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_2)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_4)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_6)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_8)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_10)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_12)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_16)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_32)),
+   SOLVTAB(XSIMD(codelet_hc2cfdftv_20)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_2)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_4)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_6)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_8)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_10)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_12)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_16)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_32)),
+   SOLVTAB(XSIMD(codelet_hc2cbdftv_20)),
+   SOLVTAB_END
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "codelet-rdft.h"
+#include SIMD_HEADER
+
+#define EXTERN_CONST(t, x) extern const t x; const t x
+
+static int hc2cbv_okp(const R *Rp, const R *Ip, const R *Rm, const R *Im, 
+		      INT rs, INT mb, INT me, INT ms, 
+		      const planner *plnr)
+{
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(rs)
+	     && SIMD_VSTRIDE_OK(ms)
+             && ((me - mb) % VL) == 0
+             && ((mb - 1) % VL) == 0 /* twiddle factors alignment */
+	     && ALIGNED(Rp)
+	     && ALIGNED(Rm)
+	     && Ip == Rp + 1
+	     && Im == Rm + 1);
+}
+
+EXTERN_CONST(hc2c_genus, XSIMD(rdft_hc2cbv_genus)) = { hc2cbv_okp, HC2R, VL };
+
+static int hc2cfv_okp(const R *Rp, const R *Ip, const R *Rm, const R *Im, 
+		      INT rs, INT mb, INT me, INT ms, 
+		      const planner *plnr)
+{
+     return (1
+	     && !NO_SIMDP(plnr)
+	     && SIMD_STRIDE_OK(rs)
+	     && SIMD_VSTRIDE_OK(ms)
+             && ((me - mb) % VL) == 0
+             && ((mb - 1) % VL) == 0 /* twiddle factors alignment */
+	     && ALIGNED(Rp)
+	     && ALIGNED(Rm)
+	     && Ip == Rp + 1
+	     && Im == Rm + 1);
+}
+
+EXTERN_CONST(hc2c_genus, XSIMD(rdft_hc2cfv_genus)) = { hc2cfv_okp, R2HC, VL };
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include hc2cbv.h */
+
+/*
+ * This function contains 61 FP additions, 50 FP multiplications,
+ * (or, 33 additions, 22 multiplications, 28 fused multiply/add),
+ * 76 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
+	       V Ts, T4, TR, T1, TZ, TD, Ty, Tn, Ti, TT, T11, TJ, T15, Tr, TN;
+	       V TE, Tv, To, Tb, T8, Tw, Te, Tx, Th, Tt, T7, T9, T2, T3, Tc;
+	       V Td, Tf, Tg, T5, T6, Tu, Ta;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       Tc = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       Td = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Ts = VFMACONJ(T3, T2);
+	       T4 = VFNMSCONJ(T3, T2);
+	       Tw = VFMACONJ(Td, Tc);
+	       Te = VFNMSCONJ(Td, Tc);
+	       Tx = VFMACONJ(Tg, Tf);
+	       Th = VFMSCONJ(Tg, Tf);
+	       Tt = VFMACONJ(T6, T5);
+	       T7 = VFNMSCONJ(T6, T5);
+	       T9 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       TR = LDW(&(W[TWVL * 8]));
+	       T1 = LDW(&(W[TWVL * 4]));
+	       TZ = LDW(&(W[TWVL * 12]));
+	       TD = VSUB(Tw, Tx);
+	       Ty = VADD(Tw, Tx);
+	       Tn = VSUB(Te, Th);
+	       Ti = VADD(Te, Th);
+	       Tu = VFMACONJ(T9, T8);
+	       Ta = VFMSCONJ(T9, T8);
+	       TT = LDW(&(W[TWVL * 6]));
+	       T11 = LDW(&(W[TWVL * 10]));
+	       TJ = LDW(&(W[TWVL * 16]));
+	       T15 = LDW(&(W[0]));
+	       Tr = LDW(&(W[TWVL * 2]));
+	       TN = LDW(&(W[TWVL * 14]));
+	       TE = VSUB(Tt, Tu);
+	       Tv = VADD(Tt, Tu);
+	       To = VSUB(T7, Ta);
+	       Tb = VADD(T7, Ta);
+	       {
+		    V TV, TF, Tz, TB, TL, Tp, Tj, Tl, T17, TA, TS, Tk, TC, TU, TK;
+		    V Tm, TO, TG, T12, TW, T16, TM, T10, Tq, TX, TY, T18, T19, TQ, TP;
+		    V T13, T14, TI, TH;
+		    TV = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TD, TE));
+		    TF = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TD));
+		    Tz = VADD(Tv, Ty);
+		    TB = VSUB(Tv, Ty);
+		    TL = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, To));
+		    Tp = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), To, Tn));
+		    Tj = VADD(Tb, Ti);
+		    Tl = VSUB(Tb, Ti);
+		    T17 = VADD(Ts, Tz);
+		    TA = VFNMS(LDK(KP250000000), Tz, Ts);
+		    TS = VZMULI(TR, VADD(T4, Tj));
+		    Tk = VFNMS(LDK(KP250000000), Tj, T4);
+		    TC = VFNMS(LDK(KP559016994), TB, TA);
+		    TU = VFMA(LDK(KP559016994), TB, TA);
+		    TK = VFMA(LDK(KP559016994), Tl, Tk);
+		    Tm = VFNMS(LDK(KP559016994), Tl, Tk);
+		    TO = VZMUL(TN, VFMAI(TF, TC));
+		    TG = VZMUL(Tr, VFNMSI(TF, TC));
+		    T12 = VZMUL(T11, VFMAI(TV, TU));
+		    TW = VZMUL(TT, VFNMSI(TV, TU));
+		    T16 = VZMULI(T15, VFMAI(TL, TK));
+		    TM = VZMULI(TJ, VFNMSI(TL, TK));
+		    T10 = VZMULI(TZ, VFNMSI(Tp, Tm));
+		    Tq = VZMULI(T1, VFMAI(Tp, Tm));
+		    TX = VADD(TS, TW);
+		    TY = VCONJ(VSUB(TW, TS));
+		    T18 = VADD(T16, T17);
+		    T19 = VCONJ(VSUB(T17, T16));
+		    TQ = VCONJ(VSUB(TO, TM));
+		    TP = VADD(TM, TO);
+		    T13 = VADD(T10, T12);
+		    T14 = VCONJ(VSUB(T12, T10));
+		    TI = VCONJ(VSUB(TG, Tq));
+		    TH = VADD(Tq, TG);
+		    ST(&(Rp[WS(rs, 2)]), TX, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 2)]), TY, -ms, &(Rm[0]));
+		    ST(&(Rp[0]), T18, ms, &(Rp[0]));
+		    ST(&(Rm[0]), T19, -ms, &(Rm[0]));
+		    ST(&(Rm[WS(rs, 4)]), TQ, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 4)]), TP, ms, &(Rp[0]));
+		    ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 1)]), TI, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 1)]), TH, ms, &(Rp[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, {33, 22, 28, 0} };
+
+void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include hc2cbv.h */
+
+/*
+ * This function contains 61 FP additions, 30 FP multiplications,
+ * (or, 55 additions, 24 multiplications, 6 fused multiply/add),
+ * 81 stack variables, 4 constants, and 20 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
+	       V T5, TE, Ts, Tt, TC, Tz, TH, TJ, To, Tq, T2, T4, T3, T9, Tx;
+	       V Tm, TB, Td, Ty, Ti, TA, T6, T8, T7, Tl, Tk, Tj, Tc, Tb, Ta;
+	       V Tf, Th, Tg, TF, TG, Te, Tn;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       T4 = VCONJ(T3);
+	       T5 = VSUB(T2, T4);
+	       TE = VADD(T2, T4);
+	       T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       T9 = VSUB(T6, T8);
+	       Tx = VADD(T6, T8);
+	       Tl = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tk = VCONJ(Tj);
+	       Tm = VSUB(Tk, Tl);
+	       TB = VADD(Tk, Tl);
+	       Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tb = VCONJ(Ta);
+	       Td = VSUB(Tb, Tc);
+	       Ty = VADD(Tb, Tc);
+	       Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       Tg = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Th = VCONJ(Tg);
+	       Ti = VSUB(Tf, Th);
+	       TA = VADD(Tf, Th);
+	       Ts = VSUB(T9, Td);
+	       Tt = VSUB(Ti, Tm);
+	       TC = VSUB(TA, TB);
+	       Tz = VSUB(Tx, Ty);
+	       TF = VADD(Tx, Ty);
+	       TG = VADD(TA, TB);
+	       TH = VADD(TF, TG);
+	       TJ = VMUL(LDK(KP559016994), VSUB(TF, TG));
+	       Te = VADD(T9, Td);
+	       Tn = VADD(Ti, Tm);
+	       To = VADD(Te, Tn);
+	       Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
+	       {
+		    V T1c, TX, Tv, T1b, TR, T15, TL, T17, TT, T11, TW, Tu, TQ, Tr, TP;
+		    V Tp, T1, T1a, TO, T14, TD, T10, TK, TZ, TI, Tw, T16, TS, TY, TM;
+		    V TU, T1e, TN, T1d, T19, T13, TV, T18, T12;
+		    T1c = VADD(TE, TH);
+		    TW = LDW(&(W[TWVL * 8]));
+		    TX = VZMULI(TW, VADD(T5, To));
+		    Tu = VBYI(VFNMS(LDK(KP951056516), Tt, VMUL(LDK(KP587785252), Ts)));
+		    TQ = VBYI(VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tt)));
+		    Tp = VFNMS(LDK(KP250000000), To, T5);
+		    Tr = VSUB(Tp, Tq);
+		    TP = VADD(Tq, Tp);
+		    T1 = LDW(&(W[TWVL * 4]));
+		    Tv = VZMULI(T1, VSUB(Tr, Tu));
+		    T1a = LDW(&(W[0]));
+		    T1b = VZMULI(T1a, VADD(TQ, TP));
+		    TO = LDW(&(W[TWVL * 16]));
+		    TR = VZMULI(TO, VSUB(TP, TQ));
+		    T14 = LDW(&(W[TWVL * 12]));
+		    T15 = VZMULI(T14, VADD(Tu, Tr));
+		    TD = VBYI(VFNMS(LDK(KP951056516), TC, VMUL(LDK(KP587785252), Tz)));
+		    T10 = VBYI(VFMA(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), TC)));
+		    TI = VFNMS(LDK(KP250000000), TH, TE);
+		    TK = VSUB(TI, TJ);
+		    TZ = VADD(TJ, TI);
+		    Tw = LDW(&(W[TWVL * 2]));
+		    TL = VZMUL(Tw, VADD(TD, TK));
+		    T16 = LDW(&(W[TWVL * 10]));
+		    T17 = VZMUL(T16, VADD(T10, TZ));
+		    TS = LDW(&(W[TWVL * 14]));
+		    TT = VZMUL(TS, VSUB(TK, TD));
+		    TY = LDW(&(W[TWVL * 6]));
+		    T11 = VZMUL(TY, VSUB(TZ, T10));
+		    TM = VADD(Tv, TL);
+		    ST(&(Rp[WS(rs, 1)]), TM, ms, &(Rp[WS(rs, 1)]));
+		    TU = VADD(TR, TT);
+		    ST(&(Rp[WS(rs, 4)]), TU, ms, &(Rp[0]));
+		    T1e = VCONJ(VSUB(T1c, T1b));
+		    ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
+		    TN = VCONJ(VSUB(TL, Tv));
+		    ST(&(Rm[WS(rs, 1)]), TN, -ms, &(Rm[WS(rs, 1)]));
+		    T1d = VADD(T1b, T1c);
+		    ST(&(Rp[0]), T1d, ms, &(Rp[0]));
+		    T19 = VCONJ(VSUB(T17, T15));
+		    ST(&(Rm[WS(rs, 3)]), T19, -ms, &(Rm[WS(rs, 1)]));
+		    T13 = VCONJ(VSUB(T11, TX));
+		    ST(&(Rm[WS(rs, 2)]), T13, -ms, &(Rm[0]));
+		    TV = VCONJ(VSUB(TT, TR));
+		    ST(&(Rm[WS(rs, 4)]), TV, -ms, &(Rm[0]));
+		    T18 = VADD(T15, T17);
+		    ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
+		    T12 = VADD(TX, T11);
+		    ST(&(Rp[WS(rs, 2)]), T12, ms, &(Rp[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, {55, 24, 6, 0} };
+
+void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dif -sign 1 -name hc2cbdftv_12 -include hc2cbv.h */
+
+/*
+ * This function contains 71 FP additions, 51 FP multiplications,
+ * (or, 45 additions, 25 multiplications, 26 fused multiply/add),
+ * 88 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
+	       V Tz, TT, T1, T1j, TN, TF, TP, TL, Tx, T15, TJ, T1b, T1g, T1l, T18;
+	       V T12, TO, TC, TK, Tl, T16, TQ, TU, TG, T1c, TM, T1k, Ty, T19, T1a;
+	       V T13, T14, T1h, T1i, TS, TR, T1m, T1n, TI, TH;
+	       {
+		    V T2, Tm, T7, Tp, T8, Tq, T9, Tu, T5, Tr, Tg, Tn, Tj, Ta, T3;
+		    V T4, Te, Tf, Th, Ti, TV, T6, TW, Tk, TD, Tt, TB, T11, T1f, Tw;
+		    V TE, TX, Tc, Ts, T10, TZ, To, Tb, Tv, T17, T1d, T1e, TY, TA, Td;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    Tm = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tp = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    T3 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T4 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Te = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Tf = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Th = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    Ti = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    T8 = VCONJ(T7);
+		    Tq = VCONJ(Tp);
+		    T9 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    Tu = VFNMSCONJ(T4, T3);
+		    T5 = VFMACONJ(T4, T3);
+		    Tr = VADD(Te, Tf);
+		    Tg = VSUB(Te, Tf);
+		    Tn = VADD(Ti, Th);
+		    Tj = VSUB(Th, Ti);
+		    Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    TV = LDW(&(W[TWVL * 4]));
+		    Tz = LDW(&(W[TWVL * 18]));
+		    T6 = VFNMS(LDK(KP500000000), T5, T2);
+		    TW = VADD(T2, T5);
+		    Ts = VFNMS(LDK(KP500000000), Tr, Tq);
+		    T10 = VFMACONJ(Tp, Tr);
+		    TZ = VFMACONJ(Tn, Tm);
+		    To = VFNMS(LDK(KP500000000), VCONJ(Tn), Tm);
+		    Tk = VFMACONJ(Tj, Tg);
+		    TD = VFNMSCONJ(Tj, Tg);
+		    Tb = VFMACONJ(Ta, T9);
+		    Tv = VFMSCONJ(Ta, T9);
+		    TT = LDW(&(W[TWVL * 2]));
+		    T1 = LDW(&(W[TWVL * 20]));
+		    Tt = VSUB(To, Ts);
+		    TB = VADD(To, Ts);
+		    T11 = VSUB(TZ, T10);
+		    T1f = VADD(TZ, T10);
+		    Tw = VSUB(Tu, Tv);
+		    TE = VADD(Tu, Tv);
+		    TX = VFMACONJ(T7, Tb);
+		    Tc = VFNMS(LDK(KP500000000), Tb, T8);
+		    T1j = LDW(&(W[0]));
+		    T17 = LDW(&(W[TWVL * 16]));
+		    T1d = LDW(&(W[TWVL * 10]));
+		    TN = LDW(&(W[TWVL * 6]));
+		    TF = VMUL(LDK(KP866025403), VSUB(TD, TE));
+		    TP = VMUL(LDK(KP866025403), VADD(TE, TD));
+		    TL = VFNMS(LDK(KP866025403), Tw, Tt);
+		    Tx = VFMA(LDK(KP866025403), Tw, Tt);
+		    T1e = VADD(TW, TX);
+		    TY = VSUB(TW, TX);
+		    TA = VADD(T6, Tc);
+		    Td = VSUB(T6, Tc);
+		    T15 = LDW(&(W[TWVL * 14]));
+		    TJ = LDW(&(W[TWVL * 8]));
+		    T1b = LDW(&(W[TWVL * 12]));
+		    T1g = VZMUL(T1d, VSUB(T1e, T1f));
+		    T1l = VADD(T1e, T1f);
+		    T18 = VZMULI(T17, VFMAI(T11, TY));
+		    T12 = VZMULI(TV, VFNMSI(T11, TY));
+		    TO = VADD(TA, TB);
+		    TC = VSUB(TA, TB);
+		    TK = VFNMS(LDK(KP866025403), Tk, Td);
+		    Tl = VFMA(LDK(KP866025403), Tk, Td);
+	       }
+	       T16 = VZMUL(T15, VFNMSI(TP, TO));
+	       TQ = VZMUL(TN, VFMAI(TP, TO));
+	       TU = VZMUL(TT, VFMAI(TF, TC));
+	       TG = VZMUL(Tz, VFNMSI(TF, TC));
+	       T1c = VZMULI(T1b, VFNMSI(TL, TK));
+	       TM = VZMULI(TJ, VFMAI(TL, TK));
+	       T1k = VZMULI(T1j, VFMAI(Tx, Tl));
+	       Ty = VZMULI(T1, VFNMSI(Tx, Tl));
+	       T19 = VCONJ(VSUB(T16, T18));
+	       T1a = VADD(T16, T18);
+	       T13 = VCONJ(VSUB(TU, T12));
+	       T14 = VADD(TU, T12);
+	       T1h = VADD(T1c, T1g);
+	       T1i = VCONJ(VSUB(T1g, T1c));
+	       TS = VCONJ(VSUB(TQ, TM));
+	       TR = VADD(TM, TQ);
+	       T1m = VADD(T1k, T1l);
+	       T1n = VCONJ(VSUB(T1l, T1k));
+	       TI = VCONJ(VSUB(TG, Ty));
+	       TH = VADD(Ty, TG);
+	       ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 4)]), T1a, ms, &(Rp[0]));
+	       ST(&(Rm[WS(rs, 1)]), T13, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 3)]), T1h, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 3)]), T1i, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 2)]), TS, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 2)]), TR, ms, &(Rp[0]));
+	       ST(&(Rp[0]), T1m, ms, &(Rp[0]));
+	       ST(&(Rm[0]), T1n, -ms, &(Rm[0]));
+	       ST(&(Rm[WS(rs, 5)]), TI, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 5)]), TH, ms, &(Rp[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cbdftv_12"), twinstr, &GENUS, {45, 25, 26, 0} };
+
+void XSIMD(codelet_hc2cbdftv_12) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_12, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dif -sign 1 -name hc2cbdftv_12 -include hc2cbv.h */
+
+/*
+ * This function contains 71 FP additions, 30 FP multiplications,
+ * (or, 67 additions, 26 multiplications, 4 fused multiply/add),
+ * 90 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
+	       V TY, TZ, Tf, TC, Tq, TG, Tm, TF, Ty, TD, T13, T1h, T2, T9, T3;
+	       V T5, T6, Tc, Tb, Td, T8, T4, Ta, T7, Te, To, Tp, Tr, Tv, Ti;
+	       V Ts, Tl, Tw, Tu, Tg, Th, Tj, Tk, Tt, Tx, T11, T12;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T8 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+	       T9 = VCONJ(T8);
+	       T3 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       T4 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       T5 = VCONJ(T4);
+	       T6 = VADD(T3, T5);
+	       Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tb = VCONJ(Ta);
+	       Td = VADD(Tb, Tc);
+	       TY = VADD(T2, T6);
+	       TZ = VADD(T9, Td);
+	       T7 = VFNMS(LDK(KP500000000), T6, T2);
+	       Te = VFNMS(LDK(KP500000000), Td, T9);
+	       Tf = VSUB(T7, Te);
+	       TC = VADD(T7, Te);
+	       To = VSUB(T3, T5);
+	       Tp = VSUB(Tb, Tc);
+	       Tq = VMUL(LDK(KP866025403), VSUB(To, Tp));
+	       TG = VADD(To, Tp);
+	       Tr = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Tu = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       Tv = VCONJ(Tu);
+	       Tg = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       Th = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Ti = VCONJ(VSUB(Tg, Th));
+	       Ts = VCONJ(VADD(Tg, Th));
+	       Tj = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+	       Tl = VSUB(Tj, Tk);
+	       Tw = VADD(Tj, Tk);
+	       Tm = VMUL(LDK(KP866025403), VSUB(Ti, Tl));
+	       TF = VADD(Ti, Tl);
+	       Tt = VFNMS(LDK(KP500000000), Ts, Tr);
+	       Tx = VFNMS(LDK(KP500000000), Tw, Tv);
+	       Ty = VSUB(Tt, Tx);
+	       TD = VADD(Tt, Tx);
+	       T11 = VADD(Tr, Ts);
+	       T12 = VADD(Tv, Tw);
+	       T13 = VBYI(VSUB(T11, T12));
+	       T1h = VADD(T11, T12);
+	       {
+		    V T1n, T1i, T14, T1a, TA, T1m, TS, T18, TO, T1e, TI, TW, T1g, T1f, T10;
+		    V TX, T19, Tn, Tz, T1, T1l, TQ, TR, TP, T17, TM, TN, TL, T1d, TE;
+		    V TH, TB, TV, TJ, T1p, T1k, TT, T1o, TK, TU, T1j, T1b, T16, T1c, T15;
+		    T1g = VADD(TY, TZ);
+		    T1n = VADD(T1g, T1h);
+		    T1f = LDW(&(W[TWVL * 10]));
+		    T1i = VZMUL(T1f, VSUB(T1g, T1h));
+		    T10 = VSUB(TY, TZ);
+		    TX = LDW(&(W[TWVL * 4]));
+		    T14 = VZMULI(TX, VSUB(T10, T13));
+		    T19 = LDW(&(W[TWVL * 16]));
+		    T1a = VZMULI(T19, VADD(T10, T13));
+		    Tn = VSUB(Tf, Tm);
+		    Tz = VBYI(VADD(Tq, Ty));
+		    T1 = LDW(&(W[TWVL * 20]));
+		    TA = VZMULI(T1, VSUB(Tn, Tz));
+		    T1l = LDW(&(W[0]));
+		    T1m = VZMULI(T1l, VADD(Tn, Tz));
+		    TQ = VBYI(VMUL(LDK(KP866025403), VADD(TG, TF)));
+		    TR = VADD(TC, TD);
+		    TP = LDW(&(W[TWVL * 6]));
+		    TS = VZMUL(TP, VADD(TQ, TR));
+		    T17 = LDW(&(W[TWVL * 14]));
+		    T18 = VZMUL(T17, VSUB(TR, TQ));
+		    TM = VADD(Tf, Tm);
+		    TN = VBYI(VSUB(Ty, Tq));
+		    TL = LDW(&(W[TWVL * 8]));
+		    TO = VZMULI(TL, VADD(TM, TN));
+		    T1d = LDW(&(W[TWVL * 12]));
+		    T1e = VZMULI(T1d, VSUB(TM, TN));
+		    TE = VSUB(TC, TD);
+		    TH = VBYI(VMUL(LDK(KP866025403), VSUB(TF, TG)));
+		    TB = LDW(&(W[TWVL * 18]));
+		    TI = VZMUL(TB, VSUB(TE, TH));
+		    TV = LDW(&(W[TWVL * 2]));
+		    TW = VZMUL(TV, VADD(TH, TE));
+		    TJ = VADD(TA, TI);
+		    ST(&(Rp[WS(rs, 5)]), TJ, ms, &(Rp[WS(rs, 1)]));
+		    T1p = VCONJ(VSUB(T1n, T1m));
+		    ST(&(Rm[0]), T1p, -ms, &(Rm[0]));
+		    T1k = VCONJ(VSUB(T1i, T1e));
+		    ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
+		    TT = VADD(TO, TS);
+		    ST(&(Rp[WS(rs, 2)]), TT, ms, &(Rp[0]));
+		    T1o = VADD(T1m, T1n);
+		    ST(&(Rp[0]), T1o, ms, &(Rp[0]));
+		    TK = VCONJ(VSUB(TI, TA));
+		    ST(&(Rm[WS(rs, 5)]), TK, -ms, &(Rm[WS(rs, 1)]));
+		    TU = VCONJ(VSUB(TS, TO));
+		    ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
+		    T1j = VADD(T1e, T1i);
+		    ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
+		    T1b = VCONJ(VSUB(T18, T1a));
+		    ST(&(Rm[WS(rs, 4)]), T1b, -ms, &(Rm[0]));
+		    T16 = VADD(TW, T14);
+		    ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
+		    T1c = VADD(T18, T1a);
+		    ST(&(Rp[WS(rs, 4)]), T1c, ms, &(Rp[0]));
+		    T15 = VCONJ(VSUB(TW, T14));
+		    ST(&(Rm[WS(rs, 1)]), T15, -ms, &(Rm[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cbdftv_12"), twinstr, &GENUS, {67, 26, 4, 0} };
+
+void XSIMD(codelet_hc2cbdftv_12) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_12, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include hc2cbv.h */
+
+/*
+ * This function contains 103 FP additions, 80 FP multiplications,
+ * (or, 53 additions, 30 multiplications, 50 fused multiply/add),
+ * 123 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T1D, T1F, TV, TW, T17, T18, T1B, T1A, T1H, T1G;
+	       {
+		    V T8, Tv, Tb, TF, Tl, TJ, TP, T1w, TE, T1t, T10, T1p, TG, Te, Tg;
+		    V Th, T2, T3, Ts, Tt, T5, T6, Tp, Tq, T9, TA, T4, TC, Tu, TN;
+		    V T7, TB, Tr, Ta, Tj, Tk, Tc, Td, TY, TD, TO, TZ, T1Q, T19, T1I;
+		    V T1d, Tf, T11, TH, TQ, Ti, TI, T1k, T1K, T1S, T1r, T14, T16, TU, Ty;
+		    V T1z, TX, T1o, T1, TK, TR, Tm, T12, T1C, Tz, T15;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T6 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tp = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    Tq = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    T9 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    TA = VFNMSCONJ(T3, T2);
+		    T4 = VFMACONJ(T3, T2);
+		    TC = VFMSCONJ(Tt, Ts);
+		    Tu = VFMACONJ(Tt, Ts);
+		    TN = VFNMSCONJ(T6, T5);
+		    T7 = VFMACONJ(T6, T5);
+		    TB = VFNMSCONJ(Tq, Tp);
+		    Tr = VFMACONJ(Tq, Tp);
+		    Ta = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    Tk = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    Tc = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    T8 = VSUB(T4, T7);
+		    TY = VADD(T4, T7);
+		    TD = VADD(TB, TC);
+		    TO = VSUB(TB, TC);
+		    Tv = VSUB(Tr, Tu);
+		    TZ = VADD(Tr, Tu);
+		    Tb = VFMACONJ(Ta, T9);
+		    TF = VFNMSCONJ(Ta, T9);
+		    Tl = VFMACONJ(Tk, Tj);
+		    TJ = VFNMSCONJ(Tk, Tj);
+		    TP = VFMA(LDK(KP707106781), TO, TN);
+		    T1w = VFNMS(LDK(KP707106781), TO, TN);
+		    TE = VFMA(LDK(KP707106781), TD, TA);
+		    T1t = VFNMS(LDK(KP707106781), TD, TA);
+		    T10 = VADD(TY, TZ);
+		    T1p = VSUB(TY, TZ);
+		    TG = VFNMSCONJ(Td, Tc);
+		    Te = VFMACONJ(Td, Tc);
+		    Tg = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    Th = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    T1Q = LDW(&(W[TWVL * 22]));
+		    T19 = LDW(&(W[TWVL * 26]));
+		    T1I = LDW(&(W[TWVL * 2]));
+		    T1d = LDW(&(W[TWVL * 28]));
+		    Tf = VSUB(Tb, Te);
+		    T11 = VADD(Tb, Te);
+		    TH = VFNMS(LDK(KP414213562), TG, TF);
+		    TQ = VFMA(LDK(KP414213562), TF, TG);
+		    Ti = VFMACONJ(Th, Tg);
+		    TI = VFMSCONJ(Th, Tg);
+		    T1k = LDW(&(W[0]));
+		    T1K = LDW(&(W[TWVL * 4]));
+		    T1S = LDW(&(W[TWVL * 24]));
+		    TX = LDW(&(W[TWVL * 14]));
+		    T1o = LDW(&(W[TWVL * 6]));
+		    T1 = LDW(&(W[TWVL * 10]));
+		    TK = VFMA(LDK(KP414213562), TJ, TI);
+		    TR = VFNMS(LDK(KP414213562), TI, TJ);
+		    Tm = VSUB(Ti, Tl);
+		    T12 = VADD(Ti, Tl);
+		    T1C = LDW(&(W[TWVL * 18]));
+		    Tz = LDW(&(W[TWVL * 12]));
+		    T15 = LDW(&(W[TWVL * 16]));
+		    {
+			 V T1v, T1y, T1N, T1g, T1J, T1c, T1U, T1V, T1m, T1n, T1s, TS, T1u, TL, T1x;
+			 V T13, T1q, Tn, Tw, T1L, T1f, TT, T1M, T1e, TM, T1R, T1j, T1b, Tx, T1a;
+			 V To, T1T, T1l, T1E, T1O, T1P, T1h, T1i;
+			 T1s = LDW(&(W[TWVL * 8]));
+			 TS = VADD(TQ, TR);
+			 T1u = VSUB(TQ, TR);
+			 TL = VADD(TH, TK);
+			 T1x = VSUB(TH, TK);
+			 T13 = VADD(T11, T12);
+			 T1q = VSUB(T11, T12);
+			 Tn = VADD(Tf, Tm);
+			 Tw = VSUB(Tf, Tm);
+			 T1L = VFMA(LDK(KP923879532), T1u, T1t);
+			 T1v = VFNMS(LDK(KP923879532), T1u, T1t);
+			 T1f = VFMA(LDK(KP923879532), TS, TP);
+			 TT = VFNMS(LDK(KP923879532), TS, TP);
+			 T1M = VFNMS(LDK(KP923879532), T1x, T1w);
+			 T1y = VFMA(LDK(KP923879532), T1x, T1w);
+			 T1e = VFMA(LDK(KP923879532), TL, TE);
+			 TM = VFNMS(LDK(KP923879532), TL, TE);
+			 T1r = VZMUL(T1o, VFMAI(T1q, T1p));
+			 T1R = VZMUL(T1Q, VFNMSI(T1q, T1p));
+			 T14 = VZMUL(TX, VSUB(T10, T13));
+			 T1j = VADD(T10, T13);
+			 T1b = VFMA(LDK(KP707106781), Tw, Tv);
+			 Tx = VFNMS(LDK(KP707106781), Tw, Tv);
+			 T1a = VFMA(LDK(KP707106781), Tn, T8);
+			 To = VFNMS(LDK(KP707106781), Tn, T8);
+			 T1T = VZMULI(T1S, VFMAI(T1M, T1L));
+			 T1N = VZMULI(T1K, VFNMSI(T1M, T1L));
+			 T16 = VZMULI(T15, VFMAI(TT, TM));
+			 TU = VZMULI(Tz, VFNMSI(TT, TM));
+			 T1l = VZMULI(T1k, VFMAI(T1f, T1e));
+			 T1g = VZMULI(T1d, VFNMSI(T1f, T1e));
+			 T1D = VZMUL(T1C, VFMAI(Tx, To));
+			 Ty = VZMUL(T1, VFNMSI(Tx, To));
+			 T1J = VZMUL(T1I, VFMAI(T1b, T1a));
+			 T1c = VZMUL(T19, VFNMSI(T1b, T1a));
+			 T1U = VCONJ(VSUB(T1R, T1T));
+			 T1V = VADD(T1R, T1T);
+			 T1m = VCONJ(VSUB(T1j, T1l));
+			 T1n = VADD(T1j, T1l);
+			 T1z = VZMULI(T1s, VFMAI(T1y, T1v));
+			 T1E = LDW(&(W[TWVL * 20]));
+			 T1O = VCONJ(VSUB(T1J, T1N));
+			 T1P = VADD(T1J, T1N);
+			 T1h = VCONJ(VSUB(T1c, T1g));
+			 T1i = VADD(T1c, T1g);
+			 ST(&(Rp[WS(rs, 6)]), T1V, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 6)]), T1U, -ms, &(Rm[0]));
+			 ST(&(Rp[0]), T1n, ms, &(Rp[0]));
+			 ST(&(Rm[0]), T1m, -ms, &(Rm[0]));
+			 ST(&(Rp[WS(rs, 1)]), T1P, ms, &(Rp[WS(rs, 1)]));
+			 ST(&(Rm[WS(rs, 1)]), T1O, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rp[WS(rs, 7)]), T1i, ms, &(Rp[WS(rs, 1)]));
+			 ST(&(Rm[WS(rs, 7)]), T1h, -ms, &(Rm[WS(rs, 1)]));
+			 T1F = VZMULI(T1E, VFNMSI(T1y, T1v));
+		    }
+		    TV = VCONJ(VSUB(Ty, TU));
+		    TW = VADD(Ty, TU);
+		    T17 = VCONJ(VSUB(T14, T16));
+		    T18 = VADD(T14, T16);
+		    T1B = VADD(T1r, T1z);
+		    T1A = VCONJ(VSUB(T1r, T1z));
+	       }
+	       T1H = VADD(T1D, T1F);
+	       T1G = VCONJ(VSUB(T1D, T1F));
+	       ST(&(Rm[WS(rs, 3)]), TV, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 3)]), TW, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 4)]), T17, -ms, &(Rm[0]));
+	       ST(&(Rm[WS(rs, 2)]), T1A, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 2)]), T1B, ms, &(Rp[0]));
+	       ST(&(Rp[WS(rs, 4)]), T18, ms, &(Rp[0]));
+	       ST(&(Rp[WS(rs, 5)]), T1H, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 5)]), T1G, -ms, &(Rm[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {53, 30, 50, 0} };
+
+void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include hc2cbv.h */
+
+/*
+ * This function contains 103 FP additions, 42 FP multiplications,
+ * (or, 99 additions, 38 multiplications, 4 fused multiply/add),
+ * 83 stack variables, 3 constants, and 32 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V Tf, T16, TZ, T1C, TI, T1a, TV, T1D, T1F, T1G, Ty, T19, TC, T17, TS;
+	       V T10;
+	       {
+		    V T2, TD, T4, TF, Tc, Tb, Td, T6, T8, T9, T3, TE, Ta, T7, T5;
+		    V Te, TX, TY, TG, TH, TT, TU, Tj, TM, Tw, TQ, Tn, TN, Ts, TP;
+		    V Tg, Ti, Th, Tt, Tv, Tu, Tk, Tm, Tl, Tr, Tq, Tp, To, Tx, TA;
+		    V TB, TO, TR;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    TD = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    T4 = VCONJ(T3);
+		    TE = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    TF = VCONJ(TE);
+		    Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tb = VCONJ(Ta);
+		    Td = VSUB(Tb, Tc);
+		    T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    T8 = VCONJ(T7);
+		    T9 = VSUB(T6, T8);
+		    T5 = VSUB(T2, T4);
+		    Te = VMUL(LDK(KP707106781), VADD(T9, Td));
+		    Tf = VADD(T5, Te);
+		    T16 = VSUB(T5, Te);
+		    TX = VADD(T2, T4);
+		    TY = VADD(TD, TF);
+		    TZ = VSUB(TX, TY);
+		    T1C = VADD(TX, TY);
+		    TG = VSUB(TD, TF);
+		    TH = VMUL(LDK(KP707106781), VSUB(T9, Td));
+		    TI = VADD(TG, TH);
+		    T1a = VSUB(TH, TG);
+		    TT = VADD(T6, T8);
+		    TU = VADD(Tb, Tc);
+		    TV = VSUB(TT, TU);
+		    T1D = VADD(TT, TU);
+		    Tg = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Th = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    Ti = VCONJ(Th);
+		    Tj = VSUB(Tg, Ti);
+		    TM = VADD(Tg, Ti);
+		    Tt = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    Tu = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    Tv = VCONJ(Tu);
+		    Tw = VSUB(Tt, Tv);
+		    TQ = VADD(Tt, Tv);
+		    Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Tl = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    Tm = VCONJ(Tl);
+		    Tn = VSUB(Tk, Tm);
+		    TN = VADD(Tk, Tm);
+		    Tr = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    Tp = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    Tq = VCONJ(Tp);
+		    Ts = VSUB(Tq, Tr);
+		    TP = VADD(Tq, Tr);
+		    T1F = VADD(TM, TN);
+		    T1G = VADD(TP, TQ);
+		    To = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
+		    Tx = VFMA(LDK(KP923879532), Ts, VMUL(LDK(KP382683432), Tw));
+		    Ty = VADD(To, Tx);
+		    T19 = VSUB(To, Tx);
+		    TA = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
+		    TB = VFNMS(LDK(KP382683432), Ts, VMUL(LDK(KP923879532), Tw));
+		    TC = VADD(TA, TB);
+		    T17 = VSUB(TA, TB);
+		    TO = VSUB(TM, TN);
+		    TR = VSUB(TP, TQ);
+		    TS = VMUL(LDK(KP707106781), VSUB(TO, TR));
+		    T10 = VMUL(LDK(KP707106781), VADD(TO, TR));
+	       }
+	       {
+		    V T21, T1W, T1u, T20, T1I, T1O, TK, T1S, T12, T1e, T1k, T1A, T1o, T1w, T1c;
+		    V T1M, T1U, T1V, T1T, T1s, T1t, T1r, T1Z, T1E, T1H, T1B, T1N, Tz, TJ, T1;
+		    V T1R, TW, T11, TL, T1d, T1i, T1j, T1h, T1z, T1m, T1n, T1l, T1v, T18, T1b;
+		    V T15, T1L, T13, T1g, T1X, T23, T14, T1f, T1Y, T22, T1p, T1y, T1J, T1Q, T1q;
+		    V T1x, T1K, T1P;
+		    T1U = VADD(T1C, T1D);
+		    T1V = VADD(T1F, T1G);
+		    T21 = VADD(T1U, T1V);
+		    T1T = LDW(&(W[TWVL * 14]));
+		    T1W = VZMUL(T1T, VSUB(T1U, T1V));
+		    T1s = VADD(Tf, Ty);
+		    T1t = VBYI(VADD(TI, TC));
+		    T1r = LDW(&(W[TWVL * 28]));
+		    T1u = VZMULI(T1r, VSUB(T1s, T1t));
+		    T1Z = LDW(&(W[0]));
+		    T20 = VZMULI(T1Z, VADD(T1s, T1t));
+		    T1E = VSUB(T1C, T1D);
+		    T1H = VBYI(VSUB(T1F, T1G));
+		    T1B = LDW(&(W[TWVL * 22]));
+		    T1I = VZMUL(T1B, VSUB(T1E, T1H));
+		    T1N = LDW(&(W[TWVL * 6]));
+		    T1O = VZMUL(T1N, VADD(T1E, T1H));
+		    Tz = VSUB(Tf, Ty);
+		    TJ = VBYI(VSUB(TC, TI));
+		    T1 = LDW(&(W[TWVL * 12]));
+		    TK = VZMULI(T1, VADD(Tz, TJ));
+		    T1R = LDW(&(W[TWVL * 16]));
+		    T1S = VZMULI(T1R, VSUB(Tz, TJ));
+		    TW = VBYI(VSUB(TS, TV));
+		    T11 = VSUB(TZ, T10);
+		    TL = LDW(&(W[TWVL * 10]));
+		    T12 = VZMUL(TL, VADD(TW, T11));
+		    T1d = LDW(&(W[TWVL * 18]));
+		    T1e = VZMUL(T1d, VSUB(T11, TW));
+		    T1i = VBYI(VADD(T1a, T19));
+		    T1j = VADD(T16, T17);
+		    T1h = LDW(&(W[TWVL * 4]));
+		    T1k = VZMULI(T1h, VADD(T1i, T1j));
+		    T1z = LDW(&(W[TWVL * 24]));
+		    T1A = VZMULI(T1z, VSUB(T1j, T1i));
+		    T1m = VBYI(VADD(TV, TS));
+		    T1n = VADD(TZ, T10);
+		    T1l = LDW(&(W[TWVL * 2]));
+		    T1o = VZMUL(T1l, VADD(T1m, T1n));
+		    T1v = LDW(&(W[TWVL * 26]));
+		    T1w = VZMUL(T1v, VSUB(T1n, T1m));
+		    T18 = VSUB(T16, T17);
+		    T1b = VBYI(VSUB(T19, T1a));
+		    T15 = LDW(&(W[TWVL * 20]));
+		    T1c = VZMULI(T15, VSUB(T18, T1b));
+		    T1L = LDW(&(W[TWVL * 8]));
+		    T1M = VZMULI(T1L, VADD(T1b, T18));
+		    T13 = VADD(TK, T12);
+		    ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
+		    T1g = VCONJ(VSUB(T1e, T1c));
+		    ST(&(Rm[WS(rs, 5)]), T1g, -ms, &(Rm[WS(rs, 1)]));
+		    T1X = VADD(T1S, T1W);
+		    ST(&(Rp[WS(rs, 4)]), T1X, ms, &(Rp[0]));
+		    T23 = VCONJ(VSUB(T21, T20));
+		    ST(&(Rm[0]), T23, -ms, &(Rm[0]));
+		    T14 = VCONJ(VSUB(T12, TK));
+		    ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
+		    T1f = VADD(T1c, T1e);
+		    ST(&(Rp[WS(rs, 5)]), T1f, ms, &(Rp[WS(rs, 1)]));
+		    T1Y = VCONJ(VSUB(T1W, T1S));
+		    ST(&(Rm[WS(rs, 4)]), T1Y, -ms, &(Rm[0]));
+		    T22 = VADD(T20, T21);
+		    ST(&(Rp[0]), T22, ms, &(Rp[0]));
+		    T1p = VADD(T1k, T1o);
+		    ST(&(Rp[WS(rs, 1)]), T1p, ms, &(Rp[WS(rs, 1)]));
+		    T1y = VCONJ(VSUB(T1w, T1u));
+		    ST(&(Rm[WS(rs, 7)]), T1y, -ms, &(Rm[WS(rs, 1)]));
+		    T1J = VADD(T1A, T1I);
+		    ST(&(Rp[WS(rs, 6)]), T1J, ms, &(Rp[0]));
+		    T1Q = VCONJ(VSUB(T1O, T1M));
+		    ST(&(Rm[WS(rs, 2)]), T1Q, -ms, &(Rm[0]));
+		    T1q = VCONJ(VSUB(T1o, T1k));
+		    ST(&(Rm[WS(rs, 1)]), T1q, -ms, &(Rm[WS(rs, 1)]));
+		    T1x = VADD(T1u, T1w);
+		    ST(&(Rp[WS(rs, 7)]), T1x, ms, &(Rp[WS(rs, 1)]));
+		    T1K = VCONJ(VSUB(T1I, T1A));
+		    ST(&(Rm[WS(rs, 6)]), T1K, -ms, &(Rm[0]));
+		    T1P = VADD(T1M, T1O);
+		    ST(&(Rp[WS(rs, 2)]), T1P, ms, &(Rp[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {99, 38, 4, 0} };
+
+void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dif -sign 1 -name hc2cbdftv_2 -include hc2cbv.h */
+
+/*
+ * This function contains 5 FP additions, 4 FP multiplications,
+ * (or, 3 additions, 2 multiplications, 2 fused multiply/add),
+ * 8 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T2, T3, T1, T5, T4, T7, T6;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T1 = LDW(&(W[0]));
+	       T5 = VFMACONJ(T3, T2);
+	       T4 = VZMULI(T1, VFNMSCONJ(T3, T2));
+	       T7 = VCONJ(VSUB(T5, T4));
+	       T6 = VADD(T4, T5);
+	       ST(&(Rm[0]), T7, -ms, &(Rm[0]));
+	       ST(&(Rp[0]), T6, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cbdftv_2"), twinstr, &GENUS, {3, 2, 2, 0} };
+
+void XSIMD(codelet_hc2cbdftv_2) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_2, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dif -sign 1 -name hc2cbdftv_2 -include hc2cbv.h */
+
+/*
+ * This function contains 5 FP additions, 2 FP multiplications,
+ * (or, 5 additions, 2 multiplications, 0 fused multiply/add),
+ * 9 stack variables, 0 constants, and 4 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T6, T5, T2, T4, T3, T1, T7, T8;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T4 = VCONJ(T3);
+	       T6 = VADD(T2, T4);
+	       T1 = LDW(&(W[0]));
+	       T5 = VZMULI(T1, VSUB(T2, T4));
+	       T7 = VADD(T5, T6);
+	       ST(&(Rp[0]), T7, ms, &(Rp[0]));
+	       T8 = VCONJ(VSUB(T6, T5));
+	       ST(&(Rm[0]), T8, -ms, &(Rm[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cbdftv_2"), twinstr, &GENUS, {5, 2, 0, 0} };
+
+void XSIMD(codelet_hc2cbdftv_2) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_2, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
+
+/*
+ * This function contains 143 FP additions, 108 FP multiplications,
+ * (or, 77 additions, 42 multiplications, 66 fused multiply/add),
+ * 134 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
+	       V T1M, T1T, T4, TF, T12, Te, T16, Ts, Tb, TN, TA, TG, TU, T1Y, T11;
+	       V T1e, T29, T21, T15, Th, T13, Tp;
+	       {
+		    V TS, TT, Tf, T10, T20, T1Z, TX, Tg, Tn, To, T2, T3, TD, TE, T8;
+		    V TV, T7, TZ, Tz, T9, Tu, Tv, T5, T6, Tx, Ty, Tc, Td, Tq, Tr;
+		    V TY, Ta, TW, Tw;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+		    TD = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    TE = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tx = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Ty = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+		    T8 = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    TS = VFMACONJ(T3, T2);
+		    T4 = VFNMSCONJ(T3, T2);
+		    TT = VFMACONJ(TE, TD);
+		    TF = VFNMSCONJ(TE, TD);
+		    TV = VFMACONJ(T6, T5);
+		    T7 = VFNMSCONJ(T6, T5);
+		    TZ = VFMACONJ(Ty, Tx);
+		    Tz = VFNMSCONJ(Ty, Tx);
+		    T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tu = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+		    Tv = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    Tc = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+		    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tq = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    Tr = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    Tf = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    TY = VFMACONJ(T9, T8);
+		    Ta = VFMSCONJ(T9, T8);
+		    TW = VFMACONJ(Tv, Tu);
+		    Tw = VFNMSCONJ(Tv, Tu);
+		    T12 = VFMACONJ(Td, Tc);
+		    Te = VFNMSCONJ(Td, Tc);
+		    T16 = VFMACONJ(Tr, Tq);
+		    Ts = VFMSCONJ(Tr, Tq);
+		    T10 = VSUB(TY, TZ);
+		    T20 = VADD(TY, TZ);
+		    Tb = VADD(T7, Ta);
+		    TN = VSUB(T7, Ta);
+		    T1Z = VADD(TV, TW);
+		    TX = VSUB(TV, TW);
+		    TA = VSUB(Tw, Tz);
+		    TG = VADD(Tw, Tz);
+		    Tg = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    TU = VSUB(TS, TT);
+		    T1Y = VADD(TS, TT);
+		    T11 = VADD(TX, T10);
+		    T1e = VSUB(TX, T10);
+		    T29 = VSUB(T1Z, T20);
+		    T21 = VADD(T1Z, T20);
+		    T15 = VFMACONJ(Tg, Tf);
+		    Th = VFMSCONJ(Tg, Tf);
+		    T13 = VFMACONJ(To, Tn);
+		    Tp = VFMSCONJ(To, Tn);
+	       }
+	       {
+		    V T1S, T2B, T1W, T1I, T2q, T2w, T2i, T2c, T1C, T1K, T1s, T1g, T1, T2t, T1v;
+		    V T1Q, T2A, T1q, T2m, TC, T1w, TP, T1x, T2f, T2r, T2g, T1E, T1D, T2y, T2x;
+		    V T1i, T1h, T2D, T2C, T2s, T1t, T1u, T1y, T2u, TQ, T2d, T2e, T1U, T1L, T2j;
+		    V T2k;
+		    {
+			 V T1R, T1F, T1V, T1o, TO, Tl, T1d, T2a, T1l, TB, TK, T1G, Tk, T1b, T19;
+			 V T27, T25, T1H, TJ, T17, T23, TM, Ti, T14, T22, Tt, TH, Tj, T18, T24;
+			 V TI, T2b, T2p, T1X, T2v, T2h, T2n, T1B, T1f, T28, T2o, T1a, TR, T1J, T1r;
+			 V T1z, T26, Tm, TL, T1O, T1m, T1j, T2z, T1N, T1p, T1P, T2l, T1c, T1A, T1n;
+			 V T1k;
+			 T1R = LDW(&(W[TWVL * 18]));
+			 T17 = VSUB(T15, T16);
+			 T23 = VADD(T15, T16);
+			 TM = VSUB(Te, Th);
+			 Ti = VADD(Te, Th);
+			 T14 = VSUB(T12, T13);
+			 T22 = VADD(T12, T13);
+			 Tt = VSUB(Tp, Ts);
+			 TH = VADD(Tp, Ts);
+			 T1F = LDW(&(W[TWVL * 28]));
+			 T1V = LDW(&(W[TWVL * 8]));
+			 T1o = VFMA(LDK(KP618033988), TM, TN);
+			 TO = VFNMS(LDK(KP618033988), TN, TM);
+			 Tj = VADD(Tb, Ti);
+			 Tl = VSUB(Tb, Ti);
+			 T18 = VADD(T14, T17);
+			 T1d = VSUB(T14, T17);
+			 T24 = VADD(T22, T23);
+			 T2a = VSUB(T22, T23);
+			 T1l = VFMA(LDK(KP618033988), Tt, TA);
+			 TB = VFNMS(LDK(KP618033988), TA, Tt);
+			 TI = VADD(TG, TH);
+			 TK = VSUB(TG, TH);
+			 T1G = VADD(T4, Tj);
+			 Tk = VFNMS(LDK(KP250000000), Tj, T4);
+			 T1b = VSUB(T11, T18);
+			 T19 = VADD(T11, T18);
+			 T27 = VSUB(T21, T24);
+			 T25 = VADD(T21, T24);
+			 T1H = VADD(TF, TI);
+			 TJ = VFNMS(LDK(KP250000000), TI, TF);
+			 T2b = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2a, T29));
+			 T2p = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T29, T2a));
+			 T1X = LDW(&(W[TWVL * 6]));
+			 T1S = VZMUL(T1R, VADD(TU, T19));
+			 T2v = LDW(&(W[TWVL * 22]));
+			 T2B = VADD(T1Y, T25);
+			 T26 = VFNMS(LDK(KP250000000), T25, T1Y);
+			 T1W = VZMULI(T1V, VFMAI(T1H, T1G));
+			 T1I = VZMULI(T1F, VFNMSI(T1H, T1G));
+			 T2h = LDW(&(W[TWVL * 30]));
+			 T2n = LDW(&(W[TWVL * 14]));
+			 T1B = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1d, T1e));
+			 T1f = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1e, T1d));
+			 T28 = VFMA(LDK(KP559016994), T27, T26);
+			 T2o = VFNMS(LDK(KP559016994), T27, T26);
+			 T1a = VFNMS(LDK(KP250000000), T19, TU);
+			 TR = LDW(&(W[TWVL * 2]));
+			 T1J = LDW(&(W[TWVL * 26]));
+			 T1r = LDW(&(W[TWVL * 34]));
+			 T1z = LDW(&(W[TWVL * 10]));
+			 T1k = VFMA(LDK(KP559016994), Tl, Tk);
+			 Tm = VFNMS(LDK(KP559016994), Tl, Tk);
+			 T2q = VZMUL(T2n, VFMAI(T2p, T2o));
+			 T2w = VZMUL(T2v, VFNMSI(T2p, T2o));
+			 T2i = VZMUL(T2h, VFMAI(T2b, T28));
+			 T2c = VZMUL(T1X, VFNMSI(T2b, T28));
+			 T1c = VFNMS(LDK(KP559016994), T1b, T1a);
+			 T1A = VFMA(LDK(KP559016994), T1b, T1a);
+			 TL = VFNMS(LDK(KP559016994), TK, TJ);
+			 T1n = VFMA(LDK(KP559016994), TK, TJ);
+			 T1O = VFMA(LDK(KP951056516), T1l, T1k);
+			 T1m = VFNMS(LDK(KP951056516), T1l, T1k);
+			 T1j = LDW(&(W[TWVL * 36]));
+			 T2z = LDW(&(W[0]));
+			 T1N = LDW(&(W[TWVL * 20]));
+			 T1C = VZMUL(T1z, VFMAI(T1B, T1A));
+			 T1K = VZMUL(T1J, VFNMSI(T1B, T1A));
+			 T1s = VZMUL(T1r, VFMAI(T1f, T1c));
+			 T1g = VZMUL(TR, VFNMSI(T1f, T1c));
+			 T1p = VFMA(LDK(KP951056516), T1o, T1n);
+			 T1P = VFNMS(LDK(KP951056516), T1o, T1n);
+			 T2l = LDW(&(W[TWVL * 16]));
+			 T1 = LDW(&(W[TWVL * 4]));
+			 T2t = LDW(&(W[TWVL * 24]));
+			 T1v = LDW(&(W[TWVL * 12]));
+			 T1Q = VZMULI(T1N, VFNMSI(T1P, T1O));
+			 T2A = VZMULI(T2z, VFMAI(T1p, T1m));
+			 T1q = VZMULI(T1j, VFNMSI(T1p, T1m));
+			 T2m = VZMULI(T2l, VFMAI(T1P, T1O));
+			 TC = VFMA(LDK(KP951056516), TB, Tm);
+			 T1w = VFNMS(LDK(KP951056516), TB, Tm);
+			 TP = VFNMS(LDK(KP951056516), TO, TL);
+			 T1x = VFMA(LDK(KP951056516), TO, TL);
+			 T2f = LDW(&(W[TWVL * 32]));
+		    }
+		    T2D = VCONJ(VSUB(T2B, T2A));
+		    T2C = VADD(T2A, T2B);
+		    T2s = VCONJ(VSUB(T2q, T2m));
+		    T2r = VADD(T2m, T2q);
+		    T1t = VADD(T1q, T1s);
+		    T1u = VCONJ(VSUB(T1s, T1q));
+		    T1y = VZMULI(T1v, VFNMSI(T1x, T1w));
+		    T2u = VZMULI(T2t, VFMAI(T1x, T1w));
+		    TQ = VZMULI(T1, VFNMSI(TP, TC));
+		    T2g = VZMULI(T2f, VFMAI(TP, TC));
+		    ST(&(Rm[0]), T2D, -ms, &(Rm[0]));
+		    ST(&(Rp[0]), T2C, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 4)]), T2s, -ms, &(Rm[0]));
+		    ST(&(Rm[WS(rs, 9)]), T1u, -ms, &(Rm[WS(rs, 1)]));
+		    T1E = VCONJ(VSUB(T1C, T1y));
+		    T1D = VADD(T1y, T1C);
+		    T2y = VCONJ(VSUB(T2w, T2u));
+		    T2x = VADD(T2u, T2w);
+		    T1i = VCONJ(VSUB(T1g, TQ));
+		    T1h = VADD(TQ, T1g);
+		    ST(&(Rp[WS(rs, 9)]), T1t, ms, &(Rp[WS(rs, 1)]));
+		    T1L = VADD(T1I, T1K);
+		    T1M = VCONJ(VSUB(T1K, T1I));
+		    ST(&(Rp[WS(rs, 3)]), T1D, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 6)]), T2y, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 6)]), T2x, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 1)]), T1i, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 1)]), T1h, ms, &(Rp[WS(rs, 1)]));
+		    T2d = VADD(T1W, T2c);
+		    T2e = VCONJ(VSUB(T2c, T1W));
+		    ST(&(Rm[WS(rs, 3)]), T1E, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 7)]), T1L, ms, &(Rp[WS(rs, 1)]));
+		    T1U = VCONJ(VSUB(T1S, T1Q));
+		    T1T = VADD(T1Q, T1S);
+		    T2j = VADD(T2g, T2i);
+		    T2k = VCONJ(VSUB(T2i, T2g));
+		    ST(&(Rp[WS(rs, 2)]), T2d, ms, &(Rp[0]));
+		    ST(&(Rp[WS(rs, 4)]), T2r, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 5)]), T1U, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 2)]), T2e, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 8)]), T2j, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 8)]), T2k, -ms, &(Rm[0]));
+	       }
+	       ST(&(Rp[WS(rs, 5)]), T1T, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 7)]), T1M, -ms, &(Rm[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {77, 42, 66, 0} };
+
+void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
+
+/*
+ * This function contains 143 FP additions, 62 FP multiplications,
+ * (or, 131 additions, 50 multiplications, 12 fused multiply/add),
+ * 114 stack variables, 4 constants, and 40 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
+	       V TK, T1v, TY, T1x, T1j, T2f, TS, TT, TO, TU, T5, To, Tp, Tq, T2a;
+	       V T2d, T2g, T2k, T2j, T1k, T1l, T18, T1m, T1f;
+	       {
+		    V T2, TP, T4, TR, TI, T1d, T9, T12, Td, T15, TE, T1a, Tv, T13, Tm;
+		    V T1c, Tz, T16, Ti, T19, T3, TQ, TH, TG, TF, T6, T8, T7, Tc, Tb;
+		    V Ta, TD, TC, TB, Ts, Tu, Tt, Tl, Tk, Tj, Tw, Ty, Tx, Tf, Th;
+		    V Tg, TA, TJ, TW, TX, T1h, T1i, TM, TN, Te, Tn, T28, T29, T2b, T2c;
+		    V T14, T17, T1b, T1e;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    TP = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+		    T4 = VCONJ(T3);
+		    TQ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    TR = VCONJ(TQ);
+		    TH = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    TG = VCONJ(TF);
+		    TI = VSUB(TG, TH);
+		    T1d = VADD(TG, TH);
+		    T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    T8 = VCONJ(T7);
+		    T9 = VSUB(T6, T8);
+		    T12 = VADD(T6, T8);
+		    Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tb = VCONJ(Ta);
+		    Td = VSUB(Tb, Tc);
+		    T15 = VADD(Tb, Tc);
+		    TD = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    TB = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    TC = VCONJ(TB);
+		    TE = VSUB(TC, TD);
+		    T1a = VADD(TC, TD);
+		    Ts = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+		    Tt = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    Tu = VCONJ(Tt);
+		    Tv = VSUB(Ts, Tu);
+		    T13 = VADD(Ts, Tu);
+		    Tl = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    Tj = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tk = VCONJ(Tj);
+		    Tm = VSUB(Tk, Tl);
+		    T1c = VADD(Tk, Tl);
+		    Tw = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Tx = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+		    Ty = VCONJ(Tx);
+		    Tz = VSUB(Tw, Ty);
+		    T16 = VADD(Tw, Ty);
+		    Tf = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+		    Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Th = VCONJ(Tg);
+		    Ti = VSUB(Tf, Th);
+		    T19 = VADD(Tf, Th);
+		    TA = VSUB(Tv, Tz);
+		    TJ = VSUB(TE, TI);
+		    TK = VFNMS(LDK(KP951056516), TJ, VMUL(LDK(KP587785252), TA));
+		    T1v = VFMA(LDK(KP951056516), TA, VMUL(LDK(KP587785252), TJ));
+		    TW = VSUB(T9, Td);
+		    TX = VSUB(Ti, Tm);
+		    TY = VFNMS(LDK(KP951056516), TX, VMUL(LDK(KP587785252), TW));
+		    T1x = VFMA(LDK(KP951056516), TW, VMUL(LDK(KP587785252), TX));
+		    T1h = VADD(T2, T4);
+		    T1i = VADD(TP, TR);
+		    T1j = VSUB(T1h, T1i);
+		    T2f = VADD(T1h, T1i);
+		    TS = VSUB(TP, TR);
+		    TM = VADD(Tv, Tz);
+		    TN = VADD(TE, TI);
+		    TT = VADD(TM, TN);
+		    TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
+		    TU = VFNMS(LDK(KP250000000), TT, TS);
+		    T5 = VSUB(T2, T4);
+		    Te = VADD(T9, Td);
+		    Tn = VADD(Ti, Tm);
+		    To = VADD(Te, Tn);
+		    Tp = VFNMS(LDK(KP250000000), To, T5);
+		    Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
+		    T28 = VADD(T12, T13);
+		    T29 = VADD(T15, T16);
+		    T2a = VADD(T28, T29);
+		    T2b = VADD(T19, T1a);
+		    T2c = VADD(T1c, T1d);
+		    T2d = VADD(T2b, T2c);
+		    T2g = VADD(T2a, T2d);
+		    T2k = VSUB(T2b, T2c);
+		    T2j = VSUB(T28, T29);
+		    T14 = VSUB(T12, T13);
+		    T17 = VSUB(T15, T16);
+		    T1k = VADD(T14, T17);
+		    T1b = VSUB(T19, T1a);
+		    T1e = VSUB(T1c, T1d);
+		    T1l = VADD(T1b, T1e);
+		    T18 = VSUB(T14, T17);
+		    T1m = VADD(T1k, T1l);
+		    T1f = VSUB(T1b, T1e);
+	       }
+	       {
+		    V T2L, T22, T1S, T26, T2m, T2G, T2s, T2A, T1q, T1U, T1C, T1M, T10, T2E, T1I;
+		    V T2q, T1A, T2K, T20, T2w, T21, T1Q, T1R, T1P, T25, T1r, T1s, T2C, T2N, T1N;
+		    V T2H, T2I, T2M, T1E, T1D, T1O, T1V, T2n, T2B, T24, T2o, T2t, T2u, T23, T1W;
+		    T2L = VADD(T2f, T2g);
+		    T21 = LDW(&(W[TWVL * 18]));
+		    T22 = VZMUL(T21, VADD(T1j, T1m));
+		    T1Q = VADD(T5, To);
+		    T1R = VBYI(VADD(TS, TT));
+		    T1P = LDW(&(W[TWVL * 28]));
+		    T1S = VZMULI(T1P, VSUB(T1Q, T1R));
+		    T25 = LDW(&(W[TWVL * 8]));
+		    T26 = VZMULI(T25, VADD(T1Q, T1R));
+		    {
+			 V T2l, T2z, T2i, T2y, T2e, T2h, T27, T2F, T2r, T2x, T1g, T1K, T1p, T1L, T1n;
+			 V T1o, T11, T1T, T1B, T1J, TL, T1G, TZ, T1H, Tr, TV, T1, T2D, T1F, T2p;
+			 V T1w, T1Y, T1z, T1Z, T1u, T1y, T1t, T2J, T1X, T2v;
+			 T2l = VBYI(VFMA(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2k)));
+			 T2z = VBYI(VFNMS(LDK(KP951056516), T2k, VMUL(LDK(KP587785252), T2j)));
+			 T2e = VMUL(LDK(KP559016994), VSUB(T2a, T2d));
+			 T2h = VFNMS(LDK(KP250000000), T2g, T2f);
+			 T2i = VADD(T2e, T2h);
+			 T2y = VSUB(T2h, T2e);
+			 T27 = LDW(&(W[TWVL * 6]));
+			 T2m = VZMUL(T27, VSUB(T2i, T2l));
+			 T2F = LDW(&(W[TWVL * 22]));
+			 T2G = VZMUL(T2F, VADD(T2z, T2y));
+			 T2r = LDW(&(W[TWVL * 30]));
+			 T2s = VZMUL(T2r, VADD(T2l, T2i));
+			 T2x = LDW(&(W[TWVL * 14]));
+			 T2A = VZMUL(T2x, VSUB(T2y, T2z));
+			 T1g = VBYI(VFNMS(LDK(KP951056516), T1f, VMUL(LDK(KP587785252), T18)));
+			 T1K = VBYI(VFMA(LDK(KP951056516), T18, VMUL(LDK(KP587785252), T1f)));
+			 T1n = VFNMS(LDK(KP250000000), T1m, T1j);
+			 T1o = VMUL(LDK(KP559016994), VSUB(T1k, T1l));
+			 T1p = VSUB(T1n, T1o);
+			 T1L = VADD(T1o, T1n);
+			 T11 = LDW(&(W[TWVL * 2]));
+			 T1q = VZMUL(T11, VADD(T1g, T1p));
+			 T1T = LDW(&(W[TWVL * 26]));
+			 T1U = VZMUL(T1T, VSUB(T1L, T1K));
+			 T1B = LDW(&(W[TWVL * 34]));
+			 T1C = VZMUL(T1B, VSUB(T1p, T1g));
+			 T1J = LDW(&(W[TWVL * 10]));
+			 T1M = VZMUL(T1J, VADD(T1K, T1L));
+			 Tr = VSUB(Tp, Tq);
+			 TL = VSUB(Tr, TK);
+			 T1G = VADD(Tr, TK);
+			 TV = VSUB(TO, TU);
+			 TZ = VBYI(VSUB(TV, TY));
+			 T1H = VBYI(VADD(TY, TV));
+			 T1 = LDW(&(W[TWVL * 4]));
+			 T10 = VZMULI(T1, VADD(TL, TZ));
+			 T2D = LDW(&(W[TWVL * 24]));
+			 T2E = VZMULI(T2D, VSUB(T1G, T1H));
+			 T1F = LDW(&(W[TWVL * 12]));
+			 T1I = VZMULI(T1F, VADD(T1G, T1H));
+			 T2p = LDW(&(W[TWVL * 32]));
+			 T2q = VZMULI(T2p, VSUB(TL, TZ));
+			 T1u = VADD(Tq, Tp);
+			 T1w = VSUB(T1u, T1v);
+			 T1Y = VADD(T1u, T1v);
+			 T1y = VADD(TO, TU);
+			 T1z = VBYI(VADD(T1x, T1y));
+			 T1Z = VBYI(VSUB(T1y, T1x));
+			 T1t = LDW(&(W[TWVL * 36]));
+			 T1A = VZMULI(T1t, VSUB(T1w, T1z));
+			 T2J = LDW(&(W[0]));
+			 T2K = VZMULI(T2J, VADD(T1w, T1z));
+			 T1X = LDW(&(W[TWVL * 20]));
+			 T20 = VZMULI(T1X, VSUB(T1Y, T1Z));
+			 T2v = LDW(&(W[TWVL * 16]));
+			 T2w = VZMULI(T2v, VADD(T1Y, T1Z));
+		    }
+		    T1r = VADD(T10, T1q);
+		    ST(&(Rp[WS(rs, 1)]), T1r, ms, &(Rp[WS(rs, 1)]));
+		    T1s = VCONJ(VSUB(T1q, T10));
+		    ST(&(Rm[WS(rs, 1)]), T1s, -ms, &(Rm[WS(rs, 1)]));
+		    T2C = VCONJ(VSUB(T2A, T2w));
+		    ST(&(Rm[WS(rs, 4)]), T2C, -ms, &(Rm[0]));
+		    T2N = VCONJ(VSUB(T2L, T2K));
+		    ST(&(Rm[0]), T2N, -ms, &(Rm[0]));
+		    T1N = VADD(T1I, T1M);
+		    ST(&(Rp[WS(rs, 3)]), T1N, ms, &(Rp[WS(rs, 1)]));
+		    T2H = VADD(T2E, T2G);
+		    ST(&(Rp[WS(rs, 6)]), T2H, ms, &(Rp[0]));
+		    T2I = VCONJ(VSUB(T2G, T2E));
+		    ST(&(Rm[WS(rs, 6)]), T2I, -ms, &(Rm[0]));
+		    T2M = VADD(T2K, T2L);
+		    ST(&(Rp[0]), T2M, ms, &(Rp[0]));
+		    T1E = VCONJ(VSUB(T1C, T1A));
+		    ST(&(Rm[WS(rs, 9)]), T1E, -ms, &(Rm[WS(rs, 1)]));
+		    T1D = VADD(T1A, T1C);
+		    ST(&(Rp[WS(rs, 9)]), T1D, ms, &(Rp[WS(rs, 1)]));
+		    T1O = VCONJ(VSUB(T1M, T1I));
+		    ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
+		    T1V = VADD(T1S, T1U);
+		    ST(&(Rp[WS(rs, 7)]), T1V, ms, &(Rp[WS(rs, 1)]));
+		    T2n = VADD(T26, T2m);
+		    ST(&(Rp[WS(rs, 2)]), T2n, ms, &(Rp[0]));
+		    T2B = VADD(T2w, T2A);
+		    ST(&(Rp[WS(rs, 4)]), T2B, ms, &(Rp[0]));
+		    T24 = VCONJ(VSUB(T22, T20));
+		    ST(&(Rm[WS(rs, 5)]), T24, -ms, &(Rm[WS(rs, 1)]));
+		    T2o = VCONJ(VSUB(T2m, T26));
+		    ST(&(Rm[WS(rs, 2)]), T2o, -ms, &(Rm[0]));
+		    T2t = VADD(T2q, T2s);
+		    ST(&(Rp[WS(rs, 8)]), T2t, ms, &(Rp[0]));
+		    T2u = VCONJ(VSUB(T2s, T2q));
+		    ST(&(Rm[WS(rs, 8)]), T2u, -ms, &(Rm[0]));
+		    T23 = VADD(T20, T22);
+		    ST(&(Rp[WS(rs, 5)]), T23, ms, &(Rp[WS(rs, 1)]));
+		    T1W = VCONJ(VSUB(T1U, T1S));
+		    ST(&(Rm[WS(rs, 7)]), T1W, -ms, &(Rm[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {131, 50, 12, 0} };
+
+void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:30 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include hc2cbv.h */
+
+/*
+ * This function contains 249 FP additions, 192 FP multiplications,
+ * (or, 119 additions, 62 multiplications, 130 fused multiply/add),
+ * 166 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
+	       V T3a, T3N;
+	       {
+		    V T2G, T1o, T2o, T2Y, T1b, T1V, Ts, T1S, T3A, T48, T3p, T45, T31, T2z, T2H;
+		    V T1L, Tv, TG, TM, T3q, T1r, TX, TN, T1s, Ty, T1t, TB, TO, TQ, T1y;
+		    V T3t, TR, T1H, T1K, TV, T1p, T1q, T1w, TW, Tt, Tu, TE, TF, TK, TL;
+		    V Tw, Tx, Tz, TA, T1x;
+		    {
+			 V T1i, T4, T1j, T15, T1l, T1m, Tb, T16, Tf, T1G, Ti, T1F, Tm, T1J, T1I;
+			 V Tp, T2, T3, T13, T14, T5, T6, T8, T9, Td, T7, Ta, Te, Tg, Th;
+			 V Tk, Tl, Tn, To, T2m, Tc, T3l, T1k, T3m, T18, Tj, T3y, T1n, Tq, T19;
+			 V T3n, T17, T2x, T1a, T2n, T2y, Tr, T3z, T3o;
+			 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+			 T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
+			 T13 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+			 T14 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+			 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+			 T6 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
+			 T8 = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
+			 T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+			 Td = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
+			 T1i = VFMACONJ(T3, T2);
+			 T4 = VFNMSCONJ(T3, T2);
+			 T1j = VFMACONJ(T14, T13);
+			 T15 = VFNMSCONJ(T14, T13);
+			 T1l = VFMACONJ(T6, T5);
+			 T7 = VFNMSCONJ(T6, T5);
+			 T1m = VFMACONJ(T9, T8);
+			 Ta = VFMSCONJ(T9, T8);
+			 Te = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+			 Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tk = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+			 Tl = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
+			 To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tb = VADD(T7, Ta);
+			 T16 = VSUB(T7, Ta);
+			 Tf = VFNMSCONJ(Te, Td);
+			 T1G = VFMACONJ(Te, Td);
+			 Ti = VFNMSCONJ(Th, Tg);
+			 T1F = VFMACONJ(Th, Tg);
+			 Tm = VFNMSCONJ(Tl, Tk);
+			 T1J = VFMACONJ(Tl, Tk);
+			 T1I = VFMACONJ(To, Tn);
+			 Tp = VFMSCONJ(To, Tn);
+			 T2m = VFMA(LDK(KP707106781), Tb, T4);
+			 Tc = VFNMS(LDK(KP707106781), Tb, T4);
+			 T3l = VSUB(T1i, T1j);
+			 T1k = VADD(T1i, T1j);
+			 T1H = VADD(T1F, T1G);
+			 T3m = VSUB(T1F, T1G);
+			 T18 = VFNMS(LDK(KP414213562), Tf, Ti);
+			 Tj = VFMA(LDK(KP414213562), Ti, Tf);
+			 T3y = VSUB(T1l, T1m);
+			 T1n = VADD(T1l, T1m);
+			 Tq = VFNMS(LDK(KP414213562), Tp, Tm);
+			 T19 = VFMA(LDK(KP414213562), Tm, Tp);
+			 T1K = VADD(T1I, T1J);
+			 T3n = VSUB(T1I, T1J);
+			 T17 = VFNMS(LDK(KP707106781), T16, T15);
+			 T2x = VFMA(LDK(KP707106781), T16, T15);
+			 T1a = VSUB(T18, T19);
+			 T2n = VADD(T18, T19);
+			 T2y = VADD(Tj, Tq);
+			 Tr = VSUB(Tj, Tq);
+			 T3z = VSUB(T3m, T3n);
+			 T3o = VADD(T3m, T3n);
+			 T2G = VADD(T1k, T1n);
+			 T1o = VSUB(T1k, T1n);
+			 T2o = VFNMS(LDK(KP923879532), T2n, T2m);
+			 T2Y = VFMA(LDK(KP923879532), T2n, T2m);
+			 T1b = VFNMS(LDK(KP923879532), T1a, T17);
+			 T1V = VFMA(LDK(KP923879532), T1a, T17);
+			 Ts = VFMA(LDK(KP923879532), Tr, Tc);
+			 T1S = VFNMS(LDK(KP923879532), Tr, Tc);
+			 T3A = VFMA(LDK(KP707106781), T3z, T3y);
+			 T48 = VFNMS(LDK(KP707106781), T3z, T3y);
+			 T3p = VFMA(LDK(KP707106781), T3o, T3l);
+			 T45 = VFNMS(LDK(KP707106781), T3o, T3l);
+			 T31 = VFMA(LDK(KP923879532), T2y, T2x);
+			 T2z = VFNMS(LDK(KP923879532), T2y, T2x);
+		    }
+		    Tt = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Tu = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
+		    TE = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+		    TF = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    TK = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
+		    TL = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    TV = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    T2H = VADD(T1H, T1K);
+		    T1L = VSUB(T1H, T1K);
+		    Tv = VFNMSCONJ(Tu, Tt);
+		    T1p = VFMACONJ(Tu, Tt);
+		    TG = VFNMSCONJ(TF, TE);
+		    T1q = VFMACONJ(TF, TE);
+		    T1w = VFMACONJ(TL, TK);
+		    TM = VFMSCONJ(TL, TK);
+		    TW = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+		    Tw = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Tx = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
+		    Tz = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
+		    TA = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    T3q = VSUB(T1p, T1q);
+		    T1r = VADD(T1p, T1q);
+		    T1x = VFMACONJ(TW, TV);
+		    TX = VFNMSCONJ(TW, TV);
+		    TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    T1s = VFMACONJ(Tx, Tw);
+		    Ty = VFNMSCONJ(Tx, Tw);
+		    T1t = VFMACONJ(TA, Tz);
+		    TB = VFMSCONJ(TA, Tz);
+		    TO = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
+		    TQ = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
+		    T1y = VADD(T1w, T1x);
+		    T3t = VSUB(T1w, T1x);
+		    TR = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    {
+			 V T38, T3f, T4p, T4v, T3T, T3Z, T2a, T2i, T4b, T4h, T1O, T20, T2M, T2U, T3F;
+			 V T3L, T2g, T3X, T3J, T1g, T4f, T2S, T4l, T2E, T2X, T3O, T3b, T3i, T26, T4t;
+			 V T43, T1Y, T3c, T30, T3d, T33;
+			 {
+			      V T2I, T2A, T2r, T1c, TJ, T2L, T2u, T2B, T10, T1d, T3x, T3E, T1E, T1N, T1h;
+			      V T1Z, T4m, T1M, T1D, T4a, T4o, T4n, T47, T4u, T3R, T3S, T3Q, T3Y, T28, T29;
+			      V T27, T2h, T44, T4g;
+			      {
+				   V T36, T1v, T2J, T3s, T3B, T2p, TI, T2q, TD, T1B, T3u, TY, TT, T35, T1u;
+				   V T3r, TH, TC, T1z, TP, T1A, TS, T3w, T3D, T1C, T2K, T3v, T3C, T2s, TZ;
+				   V T2t, TU, T37, T49, T46;
+				   T2I = VSUB(T2G, T2H);
+				   T36 = VADD(T2G, T2H);
+				   T1u = VADD(T1s, T1t);
+				   T3r = VSUB(T1s, T1t);
+				   TH = VSUB(Ty, TB);
+				   TC = VADD(Ty, TB);
+				   T1z = VFMACONJ(TO, TN);
+				   TP = VFNMSCONJ(TO, TN);
+				   T1A = VFMACONJ(TR, TQ);
+				   TS = VFMSCONJ(TR, TQ);
+				   T1v = VSUB(T1r, T1u);
+				   T2J = VADD(T1r, T1u);
+				   T3s = VFNMS(LDK(KP414213562), T3r, T3q);
+				   T3B = VFMA(LDK(KP414213562), T3q, T3r);
+				   T2p = VFMA(LDK(KP707106781), TH, TG);
+				   TI = VFNMS(LDK(KP707106781), TH, TG);
+				   T2q = VFMA(LDK(KP707106781), TC, Tv);
+				   TD = VFNMS(LDK(KP707106781), TC, Tv);
+				   T1B = VADD(T1z, T1A);
+				   T3u = VSUB(T1A, T1z);
+				   TY = VSUB(TS, TP);
+				   TT = VADD(TP, TS);
+				   T35 = LDW(&(W[TWVL * 30]));
+				   T4m = LDW(&(W[TWVL * 10]));
+				   T2A = VFNMS(LDK(KP198912367), T2p, T2q);
+				   T2r = VFMA(LDK(KP198912367), T2q, T2p);
+				   T1c = VFNMS(LDK(KP668178637), TD, TI);
+				   TJ = VFMA(LDK(KP668178637), TI, TD);
+				   T1C = VSUB(T1y, T1B);
+				   T2K = VADD(T1y, T1B);
+				   T3v = VFNMS(LDK(KP414213562), T3u, T3t);
+				   T3C = VFMA(LDK(KP414213562), T3t, T3u);
+				   T2s = VFNMS(LDK(KP707106781), TY, TX);
+				   TZ = VFMA(LDK(KP707106781), TY, TX);
+				   T2t = VFMA(LDK(KP707106781), TT, TM);
+				   TU = VFNMS(LDK(KP707106781), TT, TM);
+				   T1M = VSUB(T1v, T1C);
+				   T1D = VADD(T1v, T1C);
+				   T37 = VADD(T2J, T2K);
+				   T2L = VSUB(T2J, T2K);
+				   T3w = VADD(T3s, T3v);
+				   T49 = VSUB(T3s, T3v);
+				   T3D = VSUB(T3B, T3C);
+				   T46 = VADD(T3B, T3C);
+				   T2u = VFNMS(LDK(KP198912367), T2t, T2s);
+				   T2B = VFMA(LDK(KP198912367), T2s, T2t);
+				   T10 = VFNMS(LDK(KP668178637), TZ, TU);
+				   T1d = VFMA(LDK(KP668178637), TU, TZ);
+				   T38 = VZMUL(T35, VSUB(T36, T37));
+				   T3f = VADD(T36, T37);
+				   T4a = VFMA(LDK(KP923879532), T49, T48);
+				   T4o = VFNMS(LDK(KP923879532), T49, T48);
+				   T4n = VFMA(LDK(KP923879532), T46, T45);
+				   T47 = VFNMS(LDK(KP923879532), T46, T45);
+				   T4u = LDW(&(W[TWVL * 50]));
+				   T3R = VFMA(LDK(KP923879532), T3w, T3p);
+				   T3x = VFNMS(LDK(KP923879532), T3w, T3p);
+				   T3E = VFNMS(LDK(KP923879532), T3D, T3A);
+				   T3S = VFMA(LDK(KP923879532), T3D, T3A);
+				   T3Q = LDW(&(W[TWVL * 58]));
+				   T3Y = LDW(&(W[TWVL * 2]));
+			      }
+			      T28 = VFMA(LDK(KP707106781), T1D, T1o);
+			      T1E = VFNMS(LDK(KP707106781), T1D, T1o);
+			      T1N = VFNMS(LDK(KP707106781), T1M, T1L);
+			      T29 = VFMA(LDK(KP707106781), T1M, T1L);
+			      T4p = VZMUL(T4m, VFNMSI(T4o, T4n));
+			      T4v = VZMUL(T4u, VFMAI(T4o, T4n));
+			      T27 = LDW(&(W[TWVL * 6]));
+			      T2h = LDW(&(W[TWVL * 54]));
+			      T3T = VZMUL(T3Q, VFNMSI(T3S, T3R));
+			      T3Z = VZMUL(T3Y, VFMAI(T3S, T3R));
+			      T44 = LDW(&(W[TWVL * 18]));
+			      T4g = LDW(&(W[TWVL * 42]));
+			      T2a = VZMUL(T27, VFMAI(T29, T28));
+			      T2i = VZMUL(T2h, VFNMSI(T29, T28));
+			      T1h = LDW(&(W[TWVL * 22]));
+			      T1Z = LDW(&(W[TWVL * 38]));
+			      T4b = VZMUL(T44, VFMAI(T4a, T47));
+			      T4h = VZMUL(T4g, VFNMSI(T4a, T47));
+			      {
+				   V T1W, T1T, T1, T3W, T2d, T3I, T2e, T12, T2f, T1f, T2F, T2T, T3k, T3K, T11;
+				   V T1e, T32, T2Z, T2l, T4k, T2P, T4e, T2Q, T2w, T2R, T2D, T2v, T2C, T1R, T4s;
+				   V T23, T42, T24, T1U, T25, T1X;
+				   T2F = LDW(&(W[TWVL * 46]));
+				   T2T = LDW(&(W[TWVL * 14]));
+				   T1O = VZMUL(T1h, VFNMSI(T1N, T1E));
+				   T20 = VZMUL(T1Z, VFMAI(T1N, T1E));
+				   T3k = LDW(&(W[TWVL * 26]));
+				   T3K = LDW(&(W[TWVL * 34]));
+				   T2M = VZMUL(T2F, VFNMSI(T2L, T2I));
+				   T2U = VZMUL(T2T, VFMAI(T2L, T2I));
+				   T11 = VADD(TJ, T10);
+				   T1W = VSUB(TJ, T10);
+				   T1T = VSUB(T1d, T1c);
+				   T1e = VADD(T1c, T1d);
+				   T1 = LDW(&(W[TWVL * 24]));
+				   T3W = LDW(&(W[TWVL * 4]));
+				   T3F = VZMUL(T3k, VFNMSI(T3E, T3x));
+				   T3L = VZMUL(T3K, VFMAI(T3E, T3x));
+				   T2d = LDW(&(W[TWVL * 56]));
+				   T3I = LDW(&(W[TWVL * 36]));
+				   T2e = VFMA(LDK(KP831469612), T11, Ts);
+				   T12 = VFNMS(LDK(KP831469612), T11, Ts);
+				   T2f = VFMA(LDK(KP831469612), T1e, T1b);
+				   T1f = VFNMS(LDK(KP831469612), T1e, T1b);
+				   T2v = VSUB(T2r, T2u);
+				   T32 = VADD(T2r, T2u);
+				   T2Z = VADD(T2A, T2B);
+				   T2C = VSUB(T2A, T2B);
+				   T2l = LDW(&(W[TWVL * 48]));
+				   T4k = LDW(&(W[TWVL * 12]));
+				   T2P = LDW(&(W[TWVL * 16]));
+				   T4e = LDW(&(W[TWVL * 44]));
+				   T2g = VZMULI(T2d, VFMAI(T2f, T2e));
+				   T3X = VZMULI(T3W, VFNMSI(T2f, T2e));
+				   T3J = VZMULI(T3I, VFNMSI(T1f, T12));
+				   T1g = VZMULI(T1, VFMAI(T1f, T12));
+				   T2Q = VFNMS(LDK(KP980785280), T2v, T2o);
+				   T2w = VFMA(LDK(KP980785280), T2v, T2o);
+				   T2R = VFMA(LDK(KP980785280), T2C, T2z);
+				   T2D = VFNMS(LDK(KP980785280), T2C, T2z);
+				   T1R = LDW(&(W[TWVL * 40]));
+				   T4s = LDW(&(W[TWVL * 52]));
+				   T23 = LDW(&(W[TWVL * 8]));
+				   T42 = LDW(&(W[TWVL * 20]));
+				   T4f = VZMULI(T4e, VFNMSI(T2R, T2Q));
+				   T2S = VZMULI(T2P, VFMAI(T2R, T2Q));
+				   T4l = VZMULI(T4k, VFNMSI(T2D, T2w));
+				   T2E = VZMULI(T2l, VFMAI(T2D, T2w));
+				   T24 = VFMA(LDK(KP831469612), T1T, T1S);
+				   T1U = VFNMS(LDK(KP831469612), T1T, T1S);
+				   T25 = VFMA(LDK(KP831469612), T1W, T1V);
+				   T1X = VFNMS(LDK(KP831469612), T1W, T1V);
+				   T2X = LDW(&(W[TWVL * 32]));
+				   T3O = LDW(&(W[TWVL * 60]));
+				   T3b = LDW(&(W[0]));
+				   T3i = LDW(&(W[TWVL * 28]));
+				   T26 = VZMULI(T23, VFMAI(T25, T24));
+				   T4t = VZMULI(T4s, VFNMSI(T25, T24));
+				   T43 = VZMULI(T42, VFNMSI(T1X, T1U));
+				   T1Y = VZMULI(T1R, VFMAI(T1X, T1U));
+				   T3c = VFMA(LDK(KP980785280), T2Z, T2Y);
+				   T30 = VFNMS(LDK(KP980785280), T2Z, T2Y);
+				   T3d = VFMA(LDK(KP980785280), T32, T31);
+				   T33 = VFNMS(LDK(KP980785280), T32, T31);
+			      }
+			 }
+			 {
+			      V T3e, T3P, T3j, T34, T2c, T4j, T2k, T4d, T1P, T1Q, T4x, T4w, T2j, T4c, T21;
+			      V T22, T4r, T4q, T2b, T4i, T3h, T3H, T2N, T2O, T41, T40, T3g, T3G, T2V, T2W;
+			      V T3V, T3U, T39, T3M;
+			      T1P = VADD(T1g, T1O);
+			      T1Q = VCONJ(VSUB(T1O, T1g));
+			      T4x = VCONJ(VSUB(T4v, T4t));
+			      T4w = VADD(T4t, T4v);
+			      T2j = VADD(T2g, T2i);
+			      T2k = VCONJ(VSUB(T2i, T2g));
+			      T4d = VCONJ(VSUB(T4b, T43));
+			      T4c = VADD(T43, T4b);
+			      T3e = VZMULI(T3b, VFMAI(T3d, T3c));
+			      T3P = VZMULI(T3O, VFNMSI(T3d, T3c));
+			      T3j = VZMULI(T3i, VFNMSI(T33, T30));
+			      T34 = VZMULI(T2X, VFMAI(T33, T30));
+			      ST(&(Rp[WS(rs, 6)]), T1P, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 13)]), T4w, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 14)]), T2j, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 5)]), T4c, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 13)]), T4x, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 6)]), T1Q, -ms, &(Rm[0]));
+			      T21 = VADD(T1Y, T20);
+			      T22 = VCONJ(VSUB(T20, T1Y));
+			      T4r = VCONJ(VSUB(T4p, T4l));
+			      T4q = VADD(T4l, T4p);
+			      T2b = VADD(T26, T2a);
+			      T2c = VCONJ(VSUB(T2a, T26));
+			      T4j = VCONJ(VSUB(T4h, T4f));
+			      T4i = VADD(T4f, T4h);
+			      ST(&(Rm[WS(rs, 5)]), T4d, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 14)]), T2k, -ms, &(Rm[0]));
+			      ST(&(Rp[WS(rs, 10)]), T21, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 3)]), T4q, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 2)]), T2b, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 11)]), T4i, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 3)]), T4r, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 10)]), T22, -ms, &(Rm[0]));
+			      T2N = VADD(T2E, T2M);
+			      T2O = VCONJ(VSUB(T2M, T2E));
+			      T41 = VCONJ(VSUB(T3Z, T3X));
+			      T40 = VADD(T3X, T3Z);
+			      T3g = VADD(T3e, T3f);
+			      T3h = VCONJ(VSUB(T3f, T3e));
+			      T3H = VCONJ(VSUB(T3F, T3j));
+			      T3G = VADD(T3j, T3F);
+			      ST(&(Rm[WS(rs, 11)]), T4j, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 2)]), T2c, -ms, &(Rm[0]));
+			      ST(&(Rp[WS(rs, 12)]), T2N, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 1)]), T40, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[0]), T3g, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 7)]), T3G, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 1)]), T41, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 12)]), T2O, -ms, &(Rm[0]));
+			      T2V = VADD(T2S, T2U);
+			      T2W = VCONJ(VSUB(T2U, T2S));
+			      T3V = VCONJ(VSUB(T3T, T3P));
+			      T3U = VADD(T3P, T3T);
+			      T39 = VADD(T34, T38);
+			      T3a = VCONJ(VSUB(T38, T34));
+			      T3N = VCONJ(VSUB(T3L, T3J));
+			      T3M = VADD(T3J, T3L);
+			      ST(&(Rm[WS(rs, 7)]), T3H, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[0]), T3h, -ms, &(Rm[0]));
+			      ST(&(Rp[WS(rs, 4)]), T2V, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 15)]), T3U, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 8)]), T39, ms, &(Rp[0]));
+			      ST(&(Rp[WS(rs, 9)]), T3M, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 15)]), T3V, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 4)]), T2W, -ms, &(Rm[0]));
+			 }
+		    }
+	       }
+	       ST(&(Rm[WS(rs, 9)]), T3N, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 8)]), T3a, -ms, &(Rm[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     VTW(1, 20),
+     VTW(1, 21),
+     VTW(1, 22),
+     VTW(1, 23),
+     VTW(1, 24),
+     VTW(1, 25),
+     VTW(1, 26),
+     VTW(1, 27),
+     VTW(1, 28),
+     VTW(1, 29),
+     VTW(1, 30),
+     VTW(1, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {119, 62, 130, 0} };
+
+void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include hc2cbv.h */
+
+/*
+ * This function contains 249 FP additions, 104 FP multiplications,
+ * (or, 233 additions, 88 multiplications, 16 fused multiply/add),
+ * 161 stack variables, 7 constants, and 64 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
+	       V T1W, T21, Tf, T2c, T1t, T2r, T3T, T4m, Ty, T2q, T3P, T4n, T1n, T2d, T1T;
+	       V T22, T1E, T24, T3I, T4p, TU, T2n, T1i, T2h, T1L, T25, T3L, T4q, T1f, T2o;
+	       V T1j, T2k;
+	       {
+		    V T2, T4, T1Z, T1p, T1r, T20, T9, T1U, Td, T1V, T3, T1q, T6, T8, T7;
+		    V Tc, Tb, Ta, T5, Te, T1o, T1s, T3R, T3S, Tj, T1N, Tw, T1Q, Tn, T1O;
+		    V Ts, T1R, Tg, Ti, Th, Tv, Tu, Tt, Tk, Tm, Tl, Tp, Tr, Tq, To;
+		    V Tx, T3N, T3O, T1l, T1m, T1P, T1S;
+		    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
+		    T4 = VCONJ(T3);
+		    T1Z = VADD(T2, T4);
+		    T1p = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+		    T1q = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    T1r = VCONJ(T1q);
+		    T20 = VADD(T1p, T1r);
+		    T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T7 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
+		    T8 = VCONJ(T7);
+		    T9 = VSUB(T6, T8);
+		    T1U = VADD(T6, T8);
+		    Tc = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
+		    Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tb = VCONJ(Ta);
+		    Td = VSUB(Tb, Tc);
+		    T1V = VADD(Tb, Tc);
+		    T1W = VSUB(T1U, T1V);
+		    T21 = VSUB(T1Z, T20);
+		    T5 = VSUB(T2, T4);
+		    Te = VMUL(LDK(KP707106781), VADD(T9, Td));
+		    Tf = VSUB(T5, Te);
+		    T2c = VADD(T5, Te);
+		    T1o = VMUL(LDK(KP707106781), VSUB(T9, Td));
+		    T1s = VSUB(T1p, T1r);
+		    T1t = VSUB(T1o, T1s);
+		    T2r = VADD(T1s, T1o);
+		    T3R = VADD(T1Z, T20);
+		    T3S = VADD(T1U, T1V);
+		    T3T = VSUB(T3R, T3S);
+		    T4m = VADD(T3R, T3S);
+		    Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
+		    Ti = VCONJ(Th);
+		    Tj = VSUB(Tg, Ti);
+		    T1N = VADD(Tg, Ti);
+		    Tv = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
+		    Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tu = VCONJ(Tt);
+		    Tw = VSUB(Tu, Tv);
+		    T1Q = VADD(Tu, Tv);
+		    Tk = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
+		    Tl = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tm = VCONJ(Tl);
+		    Tn = VSUB(Tk, Tm);
+		    T1O = VADD(Tk, Tm);
+		    Tp = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    Tq = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tr = VCONJ(Tq);
+		    Ts = VSUB(Tp, Tr);
+		    T1R = VADD(Tp, Tr);
+		    To = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
+		    Tx = VFNMS(LDK(KP382683432), Tw, VMUL(LDK(KP923879532), Ts));
+		    Ty = VSUB(To, Tx);
+		    T2q = VADD(To, Tx);
+		    T3N = VADD(T1N, T1O);
+		    T3O = VADD(T1Q, T1R);
+		    T3P = VSUB(T3N, T3O);
+		    T4n = VADD(T3N, T3O);
+		    T1l = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
+		    T1m = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), Ts));
+		    T1n = VSUB(T1l, T1m);
+		    T2d = VADD(T1l, T1m);
+		    T1P = VSUB(T1N, T1O);
+		    T1S = VSUB(T1Q, T1R);
+		    T1T = VMUL(LDK(KP707106781), VSUB(T1P, T1S));
+		    T22 = VMUL(LDK(KP707106781), VADD(T1P, T1S));
+	       }
+	       {
+		    V TD, T1B, TR, T1y, TH, T1C, TM, T1z, TA, TC, TB, TO, TQ, TP, TG;
+		    V TF, TE, TJ, TL, TK, T1A, T1D, T3G, T3H, TN, T2f, TT, T2g, TI, TS;
+		    V TY, T1I, T1c, T1F, T12, T1J, T17, T1G, TV, TX, TW, T1b, T1a, T19, T11;
+		    V T10, TZ, T14, T16, T15, T1H, T1K, T3J, T3K, T18, T2i, T1e, T2j, T13, T1d;
+		    TA = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    TB = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
+		    TC = VCONJ(TB);
+		    TD = VSUB(TA, TC);
+		    T1B = VADD(TA, TC);
+		    TO = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    TP = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
+		    TQ = VCONJ(TP);
+		    TR = VSUB(TO, TQ);
+		    T1y = VADD(TO, TQ);
+		    TG = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
+		    TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    TF = VCONJ(TE);
+		    TH = VSUB(TF, TG);
+		    T1C = VADD(TF, TG);
+		    TJ = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+		    TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    TL = VCONJ(TK);
+		    TM = VSUB(TJ, TL);
+		    T1z = VADD(TJ, TL);
+		    T1A = VSUB(T1y, T1z);
+		    T1D = VSUB(T1B, T1C);
+		    T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A));
+		    T24 = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1D));
+		    T3G = VADD(T1y, T1z);
+		    T3H = VADD(T1B, T1C);
+		    T3I = VSUB(T3G, T3H);
+		    T4p = VADD(T3G, T3H);
+		    TI = VMUL(LDK(KP707106781), VSUB(TD, TH));
+		    TN = VSUB(TI, TM);
+		    T2f = VADD(TM, TI);
+		    TS = VMUL(LDK(KP707106781), VADD(TD, TH));
+		    TT = VSUB(TR, TS);
+		    T2g = VADD(TR, TS);
+		    TU = VFMA(LDK(KP831469612), TN, VMUL(LDK(KP555570233), TT));
+		    T2n = VFNMS(LDK(KP195090322), T2f, VMUL(LDK(KP980785280), T2g));
+		    T1i = VFNMS(LDK(KP555570233), TN, VMUL(LDK(KP831469612), TT));
+		    T2h = VFMA(LDK(KP980785280), T2f, VMUL(LDK(KP195090322), T2g));
+		    TV = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    TW = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
+		    TX = VCONJ(TW);
+		    TY = VSUB(TV, TX);
+		    T1I = VADD(TV, TX);
+		    T1b = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
+		    T19 = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    T1a = VCONJ(T19);
+		    T1c = VSUB(T1a, T1b);
+		    T1F = VADD(T1a, T1b);
+		    T11 = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
+		    TZ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    T10 = VCONJ(TZ);
+		    T12 = VSUB(T10, T11);
+		    T1J = VADD(T10, T11);
+		    T14 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    T15 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+		    T16 = VCONJ(T15);
+		    T17 = VSUB(T14, T16);
+		    T1G = VADD(T14, T16);
+		    T1H = VSUB(T1F, T1G);
+		    T1K = VSUB(T1I, T1J);
+		    T1L = VFMA(LDK(KP923879532), T1H, VMUL(LDK(KP382683432), T1K));
+		    T25 = VFNMS(LDK(KP382683432), T1H, VMUL(LDK(KP923879532), T1K));
+		    T3J = VADD(T1F, T1G);
+		    T3K = VADD(T1I, T1J);
+		    T3L = VSUB(T3J, T3K);
+		    T4q = VADD(T3J, T3K);
+		    T13 = VMUL(LDK(KP707106781), VSUB(TY, T12));
+		    T18 = VSUB(T13, T17);
+		    T2i = VADD(T17, T13);
+		    T1d = VMUL(LDK(KP707106781), VADD(TY, T12));
+		    T1e = VSUB(T1c, T1d);
+		    T2j = VADD(T1c, T1d);
+		    T1f = VFNMS(LDK(KP555570233), T1e, VMUL(LDK(KP831469612), T18));
+		    T2o = VFMA(LDK(KP195090322), T2i, VMUL(LDK(KP980785280), T2j));
+		    T1j = VFMA(LDK(KP555570233), T18, VMUL(LDK(KP831469612), T1e));
+		    T2k = VFNMS(LDK(KP195090322), T2j, VMUL(LDK(KP980785280), T2i));
+	       }
+	       {
+		    V T4L, T4G, T4s, T4y, T3W, T4g, T42, T4a, T3g, T4e, T3o, T3E, T1w, T46, T2M;
+		    V T40, T2u, T4w, T2C, T4k, T36, T3A, T3i, T3s, T28, T2O, T2w, T2G, T2Y, T4K;
+		    V T3y, T4C;
+		    {
+			 V T4E, T4F, T4D, T4o, T4r, T4l, T4x, T3Q, T48, T3V, T49, T3M, T3U, T3F, T4f;
+			 V T41, T47, T3c, T3n, T3f, T3m, T3a, T3b, T3d, T3e, T39, T4d, T3l, T3D, T1h;
+			 V T2K, T1v, T2L, Tz, T1g, T1k, T1u, T1, T45, T2J, T3Z, T2m, T2A, T2t, T2B;
+			 V T2e, T2l, T2p, T2s, T2b, T4v, T2z, T4j;
+			 T4E = VADD(T4m, T4n);
+			 T4F = VADD(T4p, T4q);
+			 T4L = VADD(T4E, T4F);
+			 T4D = LDW(&(W[TWVL * 30]));
+			 T4G = VZMUL(T4D, VSUB(T4E, T4F));
+			 T4o = VSUB(T4m, T4n);
+			 T4r = VBYI(VSUB(T4p, T4q));
+			 T4l = LDW(&(W[TWVL * 46]));
+			 T4s = VZMUL(T4l, VSUB(T4o, T4r));
+			 T4x = LDW(&(W[TWVL * 14]));
+			 T4y = VZMUL(T4x, VADD(T4o, T4r));
+			 T3M = VMUL(LDK(KP707106781), VSUB(T3I, T3L));
+			 T3Q = VBYI(VSUB(T3M, T3P));
+			 T48 = VBYI(VADD(T3P, T3M));
+			 T3U = VMUL(LDK(KP707106781), VADD(T3I, T3L));
+			 T3V = VSUB(T3T, T3U);
+			 T49 = VADD(T3T, T3U);
+			 T3F = LDW(&(W[TWVL * 22]));
+			 T3W = VZMUL(T3F, VADD(T3Q, T3V));
+			 T4f = LDW(&(W[TWVL * 54]));
+			 T4g = VZMUL(T4f, VSUB(T49, T48));
+			 T41 = LDW(&(W[TWVL * 38]));
+			 T42 = VZMUL(T41, VSUB(T3V, T3Q));
+			 T47 = LDW(&(W[TWVL * 6]));
+			 T4a = VZMUL(T47, VADD(T48, T49));
+			 T3a = VADD(T1t, T1n);
+			 T3b = VADD(TU, T1f);
+			 T3c = VBYI(VADD(T3a, T3b));
+			 T3n = VBYI(VSUB(T3b, T3a));
+			 T3d = VADD(Tf, Ty);
+			 T3e = VADD(T1i, T1j);
+			 T3f = VADD(T3d, T3e);
+			 T3m = VSUB(T3d, T3e);
+			 T39 = LDW(&(W[TWVL * 4]));
+			 T3g = VZMULI(T39, VADD(T3c, T3f));
+			 T4d = LDW(&(W[TWVL * 56]));
+			 T4e = VZMULI(T4d, VSUB(T3f, T3c));
+			 T3l = LDW(&(W[TWVL * 36]));
+			 T3o = VZMULI(T3l, VSUB(T3m, T3n));
+			 T3D = LDW(&(W[TWVL * 24]));
+			 T3E = VZMULI(T3D, VADD(T3n, T3m));
+			 Tz = VSUB(Tf, Ty);
+			 T1g = VSUB(TU, T1f);
+			 T1h = VSUB(Tz, T1g);
+			 T2K = VADD(Tz, T1g);
+			 T1k = VSUB(T1i, T1j);
+			 T1u = VSUB(T1n, T1t);
+			 T1v = VBYI(VSUB(T1k, T1u));
+			 T2L = VBYI(VADD(T1u, T1k));
+			 T1 = LDW(&(W[TWVL * 20]));
+			 T1w = VZMULI(T1, VADD(T1h, T1v));
+			 T45 = LDW(&(W[TWVL * 8]));
+			 T46 = VZMULI(T45, VADD(T2K, T2L));
+			 T2J = LDW(&(W[TWVL * 52]));
+			 T2M = VZMULI(T2J, VSUB(T2K, T2L));
+			 T3Z = LDW(&(W[TWVL * 40]));
+			 T40 = VZMULI(T3Z, VSUB(T1h, T1v));
+			 T2e = VSUB(T2c, T2d);
+			 T2l = VSUB(T2h, T2k);
+			 T2m = VSUB(T2e, T2l);
+			 T2A = VADD(T2e, T2l);
+			 T2p = VSUB(T2n, T2o);
+			 T2s = VSUB(T2q, T2r);
+			 T2t = VBYI(VSUB(T2p, T2s));
+			 T2B = VBYI(VADD(T2s, T2p));
+			 T2b = LDW(&(W[TWVL * 44]));
+			 T2u = VZMULI(T2b, VSUB(T2m, T2t));
+			 T4v = LDW(&(W[TWVL * 16]));
+			 T4w = VZMULI(T4v, VADD(T2m, T2t));
+			 T2z = LDW(&(W[TWVL * 12]));
+			 T2C = VZMULI(T2z, VADD(T2A, T2B));
+			 T4j = LDW(&(W[TWVL * 48]));
+			 T4k = VZMULI(T4j, VSUB(T2A, T2B));
+			 {
+			      V T32, T3q, T35, T3r, T30, T31, T33, T34, T2Z, T3z, T3h, T3p, T1Y, T2E, T27;
+			      V T2F, T1M, T1X, T23, T26, T1x, T2N, T2v, T2D, T2U, T3x, T2X, T3w, T2S, T2T;
+			      V T2V, T2W, T2R, T4J, T3v, T4B;
+			      T30 = VADD(T21, T22);
+			      T31 = VADD(T1E, T1L);
+			      T32 = VADD(T30, T31);
+			      T3q = VSUB(T30, T31);
+			      T33 = VADD(T1W, T1T);
+			      T34 = VADD(T24, T25);
+			      T35 = VBYI(VADD(T33, T34));
+			      T3r = VBYI(VSUB(T34, T33));
+			      T2Z = LDW(&(W[TWVL * 58]));
+			      T36 = VZMUL(T2Z, VSUB(T32, T35));
+			      T3z = LDW(&(W[TWVL * 26]));
+			      T3A = VZMUL(T3z, VADD(T3q, T3r));
+			      T3h = LDW(&(W[TWVL * 2]));
+			      T3i = VZMUL(T3h, VADD(T32, T35));
+			      T3p = LDW(&(W[TWVL * 34]));
+			      T3s = VZMUL(T3p, VSUB(T3q, T3r));
+			      T1M = VSUB(T1E, T1L);
+			      T1X = VSUB(T1T, T1W);
+			      T1Y = VBYI(VSUB(T1M, T1X));
+			      T2E = VBYI(VADD(T1X, T1M));
+			      T23 = VSUB(T21, T22);
+			      T26 = VSUB(T24, T25);
+			      T27 = VSUB(T23, T26);
+			      T2F = VADD(T23, T26);
+			      T1x = LDW(&(W[TWVL * 18]));
+			      T28 = VZMUL(T1x, VADD(T1Y, T27));
+			      T2N = LDW(&(W[TWVL * 50]));
+			      T2O = VZMUL(T2N, VSUB(T2F, T2E));
+			      T2v = LDW(&(W[TWVL * 42]));
+			      T2w = VZMUL(T2v, VSUB(T27, T1Y));
+			      T2D = LDW(&(W[TWVL * 10]));
+			      T2G = VZMUL(T2D, VADD(T2E, T2F));
+			      T2S = VADD(T2c, T2d);
+			      T2T = VADD(T2n, T2o);
+			      T2U = VADD(T2S, T2T);
+			      T3x = VSUB(T2S, T2T);
+			      T2V = VADD(T2r, T2q);
+			      T2W = VADD(T2h, T2k);
+			      T2X = VBYI(VADD(T2V, T2W));
+			      T3w = VBYI(VSUB(T2W, T2V));
+			      T2R = LDW(&(W[TWVL * 60]));
+			      T2Y = VZMULI(T2R, VSUB(T2U, T2X));
+			      T4J = LDW(&(W[0]));
+			      T4K = VZMULI(T4J, VADD(T2X, T2U));
+			      T3v = LDW(&(W[TWVL * 28]));
+			      T3y = VZMULI(T3v, VADD(T3w, T3x));
+			      T4B = LDW(&(W[TWVL * 32]));
+			      T4C = VZMULI(T4B, VSUB(T3x, T3w));
+			 }
+		    }
+		    {
+			 V T29, T4M, T2P, T4t, T4N, T2a, T4u, T2Q, T2x, T4H, T2H, T4z, T4I, T2y, T4A;
+			 V T2I, T37, T4h, T3B, T3X, T4i, T38, T3Y, T3C, T3j, T4b, T3t, T43, T4c, T3k;
+			 V T44, T3u;
+			 T29 = VADD(T1w, T28);
+			 ST(&(Rp[WS(rs, 5)]), T29, ms, &(Rp[WS(rs, 1)]));
+			 T4M = VADD(T4K, T4L);
+			 ST(&(Rp[0]), T4M, ms, &(Rp[0]));
+			 T2P = VADD(T2M, T2O);
+			 ST(&(Rp[WS(rs, 13)]), T2P, ms, &(Rp[WS(rs, 1)]));
+			 T4t = VADD(T4k, T4s);
+			 ST(&(Rp[WS(rs, 12)]), T4t, ms, &(Rp[0]));
+			 T4N = VCONJ(VSUB(T4L, T4K));
+			 ST(&(Rm[0]), T4N, -ms, &(Rm[0]));
+			 T2a = VCONJ(VSUB(T28, T1w));
+			 ST(&(Rm[WS(rs, 5)]), T2a, -ms, &(Rm[WS(rs, 1)]));
+			 T4u = VCONJ(VSUB(T4s, T4k));
+			 ST(&(Rm[WS(rs, 12)]), T4u, -ms, &(Rm[0]));
+			 T2Q = VCONJ(VSUB(T2O, T2M));
+			 ST(&(Rm[WS(rs, 13)]), T2Q, -ms, &(Rm[WS(rs, 1)]));
+			 T2x = VADD(T2u, T2w);
+			 ST(&(Rp[WS(rs, 11)]), T2x, ms, &(Rp[WS(rs, 1)]));
+			 T4H = VADD(T4C, T4G);
+			 ST(&(Rp[WS(rs, 8)]), T4H, ms, &(Rp[0]));
+			 T2H = VADD(T2C, T2G);
+			 ST(&(Rp[WS(rs, 3)]), T2H, ms, &(Rp[WS(rs, 1)]));
+			 T4z = VADD(T4w, T4y);
+			 ST(&(Rp[WS(rs, 4)]), T4z, ms, &(Rp[0]));
+			 T4I = VCONJ(VSUB(T4G, T4C));
+			 ST(&(Rm[WS(rs, 8)]), T4I, -ms, &(Rm[0]));
+			 T2y = VCONJ(VSUB(T2w, T2u));
+			 ST(&(Rm[WS(rs, 11)]), T2y, -ms, &(Rm[WS(rs, 1)]));
+			 T4A = VCONJ(VSUB(T4y, T4w));
+			 ST(&(Rm[WS(rs, 4)]), T4A, -ms, &(Rm[0]));
+			 T2I = VCONJ(VSUB(T2G, T2C));
+			 ST(&(Rm[WS(rs, 3)]), T2I, -ms, &(Rm[WS(rs, 1)]));
+			 T37 = VADD(T2Y, T36);
+			 ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
+			 T4h = VADD(T4e, T4g);
+			 ST(&(Rp[WS(rs, 14)]), T4h, ms, &(Rp[0]));
+			 T3B = VADD(T3y, T3A);
+			 ST(&(Rp[WS(rs, 7)]), T3B, ms, &(Rp[WS(rs, 1)]));
+			 T3X = VADD(T3E, T3W);
+			 ST(&(Rp[WS(rs, 6)]), T3X, ms, &(Rp[0]));
+			 T4i = VCONJ(VSUB(T4g, T4e));
+			 ST(&(Rm[WS(rs, 14)]), T4i, -ms, &(Rm[0]));
+			 T38 = VCONJ(VSUB(T36, T2Y));
+			 ST(&(Rm[WS(rs, 15)]), T38, -ms, &(Rm[WS(rs, 1)]));
+			 T3Y = VCONJ(VSUB(T3W, T3E));
+			 ST(&(Rm[WS(rs, 6)]), T3Y, -ms, &(Rm[0]));
+			 T3C = VCONJ(VSUB(T3A, T3y));
+			 ST(&(Rm[WS(rs, 7)]), T3C, -ms, &(Rm[WS(rs, 1)]));
+			 T3j = VADD(T3g, T3i);
+			 ST(&(Rp[WS(rs, 1)]), T3j, ms, &(Rp[WS(rs, 1)]));
+			 T4b = VADD(T46, T4a);
+			 ST(&(Rp[WS(rs, 2)]), T4b, ms, &(Rp[0]));
+			 T3t = VADD(T3o, T3s);
+			 ST(&(Rp[WS(rs, 9)]), T3t, ms, &(Rp[WS(rs, 1)]));
+			 T43 = VADD(T40, T42);
+			 ST(&(Rp[WS(rs, 10)]), T43, ms, &(Rp[0]));
+			 T4c = VCONJ(VSUB(T4a, T46));
+			 ST(&(Rm[WS(rs, 2)]), T4c, -ms, &(Rm[0]));
+			 T3k = VCONJ(VSUB(T3i, T3g));
+			 ST(&(Rm[WS(rs, 1)]), T3k, -ms, &(Rm[WS(rs, 1)]));
+			 T44 = VCONJ(VSUB(T42, T40));
+			 ST(&(Rm[WS(rs, 10)]), T44, -ms, &(Rm[0]));
+			 T3u = VCONJ(VSUB(T3s, T3o));
+			 ST(&(Rm[WS(rs, 9)]), T3u, -ms, &(Rm[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     VTW(1, 20),
+     VTW(1, 21),
+     VTW(1, 22),
+     VTW(1, 23),
+     VTW(1, 24),
+     VTW(1, 25),
+     VTW(1, 26),
+     VTW(1, 27),
+     VTW(1, 28),
+     VTW(1, 29),
+     VTW(1, 30),
+     VTW(1, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {233, 88, 16, 0} };
+
+void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dif -sign 1 -name hc2cbdftv_4 -include hc2cbv.h */
+
+/*
+ * This function contains 15 FP additions, 12 FP multiplications,
+ * (or, 9 additions, 6 multiplications, 6 fused multiply/add),
+ * 20 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T2, T3, T5, T6, Tf, T1, T9, Ta, T4, Tb, T7, Tc, Th, T8, Tg;
+	       V Te, Td, Ti, Tj;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T6 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tf = LDW(&(W[0]));
+	       T1 = LDW(&(W[TWVL * 4]));
+	       T9 = LDW(&(W[TWVL * 2]));
+	       Ta = VFMACONJ(T3, T2);
+	       T4 = VFNMSCONJ(T3, T2);
+	       Tb = VFMACONJ(T6, T5);
+	       T7 = VFNMSCONJ(T6, T5);
+	       Tc = VZMUL(T9, VSUB(Ta, Tb));
+	       Th = VADD(Ta, Tb);
+	       T8 = VZMULI(T1, VFNMSI(T7, T4));
+	       Tg = VZMULI(Tf, VFMAI(T7, T4));
+	       Te = VCONJ(VSUB(Tc, T8));
+	       Td = VADD(T8, Tc);
+	       Ti = VADD(Tg, Th);
+	       Tj = VCONJ(VSUB(Th, Tg));
+	       ST(&(Rm[WS(rs, 1)]), Te, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 1)]), Td, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rp[0]), Ti, ms, &(Rp[0]));
+	       ST(&(Rm[0]), Tj, -ms, &(Rm[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cbdftv_4"), twinstr, &GENUS, {9, 6, 6, 0} };
+
+void XSIMD(codelet_hc2cbdftv_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dif -sign 1 -name hc2cbdftv_4 -include hc2cbv.h */
+
+/*
+ * This function contains 15 FP additions, 6 FP multiplications,
+ * (or, 15 additions, 6 multiplications, 0 fused multiply/add),
+ * 22 stack variables, 0 constants, and 8 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T5, Tc, T9, Td, T2, T4, T3, T6, T8, T7, Tj, Ti, Th, Tk, Tl;
+	       V Ta, Te, T1, Tb, Tf, Tg;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T4 = VCONJ(T3);
+	       T5 = VSUB(T2, T4);
+	       Tc = VADD(T2, T4);
+	       T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T7 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       T9 = VBYI(VSUB(T6, T8));
+	       Td = VADD(T6, T8);
+	       Tj = VADD(Tc, Td);
+	       Th = LDW(&(W[0]));
+	       Ti = VZMULI(Th, VADD(T5, T9));
+	       Tk = VADD(Ti, Tj);
+	       ST(&(Rp[0]), Tk, ms, &(Rp[0]));
+	       Tl = VCONJ(VSUB(Tj, Ti));
+	       ST(&(Rm[0]), Tl, -ms, &(Rm[0]));
+	       T1 = LDW(&(W[TWVL * 4]));
+	       Ta = VZMULI(T1, VSUB(T5, T9));
+	       Tb = LDW(&(W[TWVL * 2]));
+	       Te = VZMUL(Tb, VSUB(Tc, Td));
+	       Tf = VADD(Ta, Te);
+	       ST(&(Rp[WS(rs, 1)]), Tf, ms, &(Rp[WS(rs, 1)]));
+	       Tg = VCONJ(VSUB(Te, Ta));
+	       ST(&(Rm[WS(rs, 1)]), Tg, -ms, &(Rm[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cbdftv_4"), twinstr, &GENUS, {15, 6, 0, 0} };
+
+void XSIMD(codelet_hc2cbdftv_4) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dif -sign 1 -name hc2cbdftv_6 -include hc2cbv.h */
+
+/*
+ * This function contains 29 FP additions, 24 FP multiplications,
+ * (or, 17 additions, 12 multiplications, 12 fused multiply/add),
+ * 38 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
+	       V Tv, Tn, Tr, Te, T4, Tg, Ta, Tf, T7, T1, Td, T2, T3, T8, T9;
+	       V T5, T6, Th, Tj, Tb, Tp, Tx, Ti, Tc, To, Tk, Ts, Tq, Tw, Tm;
+	       V Tl, Tu, Tt, Tz, Ty;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T9 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T6 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tv = LDW(&(W[0]));
+	       Tn = LDW(&(W[TWVL * 8]));
+	       Tr = LDW(&(W[TWVL * 6]));
+	       Te = VFMACONJ(T3, T2);
+	       T4 = VFNMSCONJ(T3, T2);
+	       Tg = VFMACONJ(T9, T8);
+	       Ta = VFMSCONJ(T9, T8);
+	       Tf = VFMACONJ(T6, T5);
+	       T7 = VFNMSCONJ(T6, T5);
+	       T1 = LDW(&(W[TWVL * 4]));
+	       Td = LDW(&(W[TWVL * 2]));
+	       Th = VADD(Tf, Tg);
+	       Tj = VMUL(LDK(KP866025403), VSUB(Tf, Tg));
+	       Tb = VADD(T7, Ta);
+	       Tp = VMUL(LDK(KP866025403), VSUB(T7, Ta));
+	       Tx = VADD(Te, Th);
+	       Ti = VFNMS(LDK(KP500000000), Th, Te);
+	       Tc = VZMULI(T1, VADD(T4, Tb));
+	       To = VFNMS(LDK(KP500000000), Tb, T4);
+	       Tk = VZMUL(Td, VFNMSI(Tj, Ti));
+	       Ts = VZMUL(Tr, VFMAI(Tj, Ti));
+	       Tq = VZMULI(Tn, VFNMSI(Tp, To));
+	       Tw = VZMULI(Tv, VFMAI(Tp, To));
+	       Tm = VCONJ(VSUB(Tk, Tc));
+	       Tl = VADD(Tc, Tk);
+	       Tu = VCONJ(VSUB(Ts, Tq));
+	       Tt = VADD(Tq, Ts);
+	       Tz = VCONJ(VSUB(Tx, Tw));
+	       Ty = VADD(Tw, Tx);
+	       ST(&(Rm[WS(rs, 1)]), Tm, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 1)]), Tl, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 2)]), Tu, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 2)]), Tt, ms, &(Rp[0]));
+	       ST(&(Rm[0]), Tz, -ms, &(Rm[0]));
+	       ST(&(Rp[0]), Ty, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cbdftv_6"), twinstr, &GENUS, {17, 12, 12, 0} };
+
+void XSIMD(codelet_hc2cbdftv_6) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_6, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dif -sign 1 -name hc2cbdftv_6 -include hc2cbv.h */
+
+/*
+ * This function contains 29 FP additions, 14 FP multiplications,
+ * (or, 27 additions, 12 multiplications, 2 fused multiply/add),
+ * 41 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
+	       V T5, Th, Te, Ts, Tk, Tm, T2, T4, T3, T6, Tc, T8, Tb, T7, Ta;
+	       V T9, Td, Ti, Tj, TA, Tf, Tn, Tv, Tt, Tz, T1, Tl, Tg, Tu, Tr;
+	       V Tq, Ty, To, Tp, TC, TB, Tx, Tw;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T4 = VCONJ(T3);
+	       T5 = VSUB(T2, T4);
+	       Th = VADD(T2, T4);
+	       T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T7 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tb = VCONJ(Ta);
+	       T9 = VSUB(T6, T8);
+	       Td = VSUB(Tb, Tc);
+	       Te = VADD(T9, Td);
+	       Ts = VBYI(VMUL(LDK(KP866025403), VSUB(T9, Td)));
+	       Ti = VADD(T6, T8);
+	       Tj = VADD(Tb, Tc);
+	       Tk = VADD(Ti, Tj);
+	       Tm = VBYI(VMUL(LDK(KP866025403), VSUB(Ti, Tj)));
+	       TA = VADD(Th, Tk);
+	       T1 = LDW(&(W[TWVL * 4]));
+	       Tf = VZMULI(T1, VADD(T5, Te));
+	       Tl = VFNMS(LDK(KP500000000), Tk, Th);
+	       Tg = LDW(&(W[TWVL * 2]));
+	       Tn = VZMUL(Tg, VSUB(Tl, Tm));
+	       Tu = LDW(&(W[TWVL * 6]));
+	       Tv = VZMUL(Tu, VADD(Tm, Tl));
+	       Tr = VFNMS(LDK(KP500000000), Te, T5);
+	       Tq = LDW(&(W[TWVL * 8]));
+	       Tt = VZMULI(Tq, VSUB(Tr, Ts));
+	       Ty = LDW(&(W[0]));
+	       Tz = VZMULI(Ty, VADD(Ts, Tr));
+	       To = VADD(Tf, Tn);
+	       ST(&(Rp[WS(rs, 1)]), To, ms, &(Rp[WS(rs, 1)]));
+	       Tp = VCONJ(VSUB(Tn, Tf));
+	       ST(&(Rm[WS(rs, 1)]), Tp, -ms, &(Rm[WS(rs, 1)]));
+	       TC = VCONJ(VSUB(TA, Tz));
+	       ST(&(Rm[0]), TC, -ms, &(Rm[0]));
+	       TB = VADD(Tz, TA);
+	       ST(&(Rp[0]), TB, ms, &(Rp[0]));
+	       Tx = VCONJ(VSUB(Tv, Tt));
+	       ST(&(Rm[WS(rs, 2)]), Tx, -ms, &(Rm[0]));
+	       Tw = VADD(Tt, Tv);
+	       ST(&(Rp[WS(rs, 2)]), Tw, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cbdftv_6"), twinstr, &GENUS, {27, 12, 2, 0} };
+
+void XSIMD(codelet_hc2cbdftv_6) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_6, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include hc2cbv.h */
+
+/*
+ * This function contains 41 FP additions, 32 FP multiplications,
+ * (or, 23 additions, 14 multiplications, 18 fused multiply/add),
+ * 51 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V TJ, T4, Tf, TB, TD, TE, Tm, T1, Tj, TF, Tp, Tb, Tg, Tt, Tx;
+	       V T2, T3, Td, Te, T5, T6, T8, T9, Tn, T7, To, Ta, Tk, Tl, TG;
+	       V TL, Tq, Tc, Tu, Th, Tv, Ty, Tw, TC, Ti, TK, TA, Tz, TI, TH;
+	       V Ts, Tr, TN, TM;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Td = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       Te = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       T9 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       TJ = LDW(&(W[0]));
+	       Tk = VFMACONJ(T3, T2);
+	       T4 = VFNMSCONJ(T3, T2);
+	       Tl = VFMACONJ(Te, Td);
+	       Tf = VFNMSCONJ(Te, Td);
+	       Tn = VFMACONJ(T6, T5);
+	       T7 = VFNMSCONJ(T6, T5);
+	       To = VFMACONJ(T9, T8);
+	       Ta = VFMSCONJ(T9, T8);
+	       TB = LDW(&(W[TWVL * 8]));
+	       TD = LDW(&(W[TWVL * 6]));
+	       TE = VADD(Tk, Tl);
+	       Tm = VSUB(Tk, Tl);
+	       T1 = LDW(&(W[TWVL * 12]));
+	       Tj = LDW(&(W[TWVL * 10]));
+	       TF = VADD(Tn, To);
+	       Tp = VSUB(Tn, To);
+	       Tb = VADD(T7, Ta);
+	       Tg = VSUB(T7, Ta);
+	       Tt = LDW(&(W[TWVL * 4]));
+	       Tx = LDW(&(W[TWVL * 2]));
+	       TG = VZMUL(TD, VSUB(TE, TF));
+	       TL = VADD(TE, TF);
+	       Tq = VZMUL(Tj, VFNMSI(Tp, Tm));
+	       Tc = VFMA(LDK(KP707106781), Tb, T4);
+	       Tu = VFNMS(LDK(KP707106781), Tb, T4);
+	       Th = VFMA(LDK(KP707106781), Tg, Tf);
+	       Tv = VFNMS(LDK(KP707106781), Tg, Tf);
+	       Ty = VZMUL(Tx, VFMAI(Tp, Tm));
+	       Tw = VZMULI(Tt, VFNMSI(Tv, Tu));
+	       TC = VZMULI(TB, VFMAI(Tv, Tu));
+	       Ti = VZMULI(T1, VFNMSI(Th, Tc));
+	       TK = VZMULI(TJ, VFMAI(Th, Tc));
+	       TA = VCONJ(VSUB(Ty, Tw));
+	       Tz = VADD(Tw, Ty);
+	       TI = VCONJ(VSUB(TG, TC));
+	       TH = VADD(TC, TG);
+	       Ts = VCONJ(VSUB(Tq, Ti));
+	       Tr = VADD(Ti, Tq);
+	       TN = VCONJ(VSUB(TL, TK));
+	       TM = VADD(TK, TL);
+	       ST(&(Rm[WS(rs, 1)]), TA, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 2)]), TI, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 2)]), TH, ms, &(Rp[0]));
+	       ST(&(Rm[WS(rs, 3)]), Ts, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 3)]), Tr, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[0]), TN, -ms, &(Rm[0]));
+	       ST(&(Rp[0]), TM, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, {23, 14, 18, 0} };
+
+void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include hc2cbv.h */
+
+/*
+ * This function contains 41 FP additions, 16 FP multiplications,
+ * (or, 41 additions, 16 multiplications, 0 fused multiply/add),
+ * 55 stack variables, 1 constants, and 16 memory accesses
+ */
+#include "hc2cbv.h"
+
+static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T5, Tj, Tq, TI, Te, Tk, Tt, TJ, T2, Tg, T4, Ti, T3, Th, To;
+	       V Tp, T6, Tc, T8, Tb, T7, Ta, T9, Td, Tr, Ts, TP, Tu, Tm, TO;
+	       V Tn, Tf, Tl, T1, TN, Tv, TR, Tw, TQ, TC, TK, TA, TG, TB, TH;
+	       V Ty, Tz, Tx, TF, TD, TM, TE, TL;
+	       T2 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       T4 = VCONJ(T3);
+	       Th = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Ti = VCONJ(Th);
+	       T5 = VSUB(T2, T4);
+	       Tj = VSUB(Tg, Ti);
+	       To = VADD(T2, T4);
+	       Tp = VADD(Tg, Ti);
+	       Tq = VSUB(To, Tp);
+	       TI = VADD(To, Tp);
+	       T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       Ta = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tb = VCONJ(Ta);
+	       T9 = VSUB(T6, T8);
+	       Td = VSUB(Tb, Tc);
+	       Te = VMUL(LDK(KP707106781), VADD(T9, Td));
+	       Tk = VMUL(LDK(KP707106781), VSUB(T9, Td));
+	       Tr = VADD(T6, T8);
+	       Ts = VADD(Tb, Tc);
+	       Tt = VBYI(VSUB(Tr, Ts));
+	       TJ = VADD(Tr, Ts);
+	       TP = VADD(TI, TJ);
+	       Tn = LDW(&(W[TWVL * 10]));
+	       Tu = VZMUL(Tn, VSUB(Tq, Tt));
+	       Tf = VADD(T5, Te);
+	       Tl = VBYI(VADD(Tj, Tk));
+	       T1 = LDW(&(W[TWVL * 12]));
+	       Tm = VZMULI(T1, VSUB(Tf, Tl));
+	       TN = LDW(&(W[0]));
+	       TO = VZMULI(TN, VADD(Tl, Tf));
+	       Tv = VADD(Tm, Tu);
+	       ST(&(Rp[WS(rs, 3)]), Tv, ms, &(Rp[WS(rs, 1)]));
+	       TR = VCONJ(VSUB(TP, TO));
+	       ST(&(Rm[0]), TR, -ms, &(Rm[0]));
+	       Tw = VCONJ(VSUB(Tu, Tm));
+	       ST(&(Rm[WS(rs, 3)]), Tw, -ms, &(Rm[WS(rs, 1)]));
+	       TQ = VADD(TO, TP);
+	       ST(&(Rp[0]), TQ, ms, &(Rp[0]));
+	       TB = LDW(&(W[TWVL * 2]));
+	       TC = VZMUL(TB, VADD(Tq, Tt));
+	       TH = LDW(&(W[TWVL * 6]));
+	       TK = VZMUL(TH, VSUB(TI, TJ));
+	       Ty = VBYI(VSUB(Tk, Tj));
+	       Tz = VSUB(T5, Te);
+	       Tx = LDW(&(W[TWVL * 4]));
+	       TA = VZMULI(Tx, VADD(Ty, Tz));
+	       TF = LDW(&(W[TWVL * 8]));
+	       TG = VZMULI(TF, VSUB(Tz, Ty));
+	       TD = VADD(TA, TC);
+	       ST(&(Rp[WS(rs, 1)]), TD, ms, &(Rp[WS(rs, 1)]));
+	       TM = VCONJ(VSUB(TK, TG));
+	       ST(&(Rm[WS(rs, 2)]), TM, -ms, &(Rm[0]));
+	       TE = VCONJ(VSUB(TC, TA));
+	       ST(&(Rm[WS(rs, 1)]), TE, -ms, &(Rm[WS(rs, 1)]));
+	       TL = VADD(TG, TK);
+	       ST(&(Rp[WS(rs, 2)]), TL, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, {41, 16, 0, 0} };
+
+void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
+     X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include hc2cfv.h */
+
+/*
+ * This function contains 61 FP additions, 60 FP multiplications,
+ * (or, 33 additions, 32 multiplications, 28 fused multiply/add),
+ * 77 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
+	       V T5, T6, Tw, Tr, Tc, Tj, Tl, Tm, Tk, Ts, Tg, Ty, T3, T4, T1;
+	       V T2, Tv, Tq, Ta, Tb, T9, Ti, Te, Tf, Td, Tx, Tn, Tt, Th, TQ;
+	       V TT, Tz, T7, TR, To, Tu, TU;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tv = LDW(&(W[0]));
+	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       Tq = LDW(&(W[TWVL * 6]));
+	       Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T9 = LDW(&(W[TWVL * 2]));
+	       Ti = LDW(&(W[TWVL * 4]));
+	       Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
+	       Te = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Tf = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tr = VZMULJ(Tq, VFMACONJ(T6, T5));
+	       Td = LDW(&(W[TWVL * 12]));
+	       Tx = LDW(&(W[TWVL * 10]));
+	       Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
+	       Tj = VZMULIJ(Ti, VFNMSCONJ(Tb, Ta));
+	       Tl = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       Tm = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       Tk = LDW(&(W[TWVL * 14]));
+	       Ts = LDW(&(W[TWVL * 16]));
+	       Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
+	       Ty = VZMULJ(Tx, VFMACONJ(Tf, Te));
+	       T3 = VFMACONJ(T2, T1);
+	       T4 = LDW(&(W[TWVL * 8]));
+	       Tn = VZMULJ(Tk, VFMACONJ(Tm, Tl));
+	       Tt = VZMULIJ(Ts, VFNMSCONJ(Tm, Tl));
+	       Th = VSUB(Tc, Tg);
+	       TQ = VADD(Tc, Tg);
+	       TT = VADD(Tw, Ty);
+	       Tz = VSUB(Tw, Ty);
+	       T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
+	       TR = VADD(Tj, Tn);
+	       To = VSUB(Tj, Tn);
+	       Tu = VSUB(Tr, Tt);
+	       TU = VADD(Tr, Tt);
+	       {
+		    V TP, T8, TS, T11, Tp, TH, TA, TG, TV, T12, TE, TB, TM, TI, TZ;
+		    V TW, T17, T13, TD, TC, TY, TX, TL, TF, T10, T16, TN, TO, TK, TJ;
+		    V T18, T19, T15, T14;
+		    TP = VADD(T3, T7);
+		    T8 = VSUB(T3, T7);
+		    TS = VADD(TQ, TR);
+		    T11 = VSUB(TQ, TR);
+		    Tp = VSUB(Th, To);
+		    TH = VADD(Th, To);
+		    TA = VSUB(Tu, Tz);
+		    TG = VADD(Tz, Tu);
+		    TV = VADD(TT, TU);
+		    T12 = VSUB(TU, TT);
+		    TE = VSUB(Tp, TA);
+		    TB = VADD(Tp, TA);
+		    TM = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TG, TH));
+		    TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TH, TG));
+		    TZ = VSUB(TS, TV);
+		    TW = VADD(TS, TV);
+		    T17 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T11, T12));
+		    T13 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T12, T11));
+		    TD = VFNMS(LDK(KP250000000), TB, T8);
+		    TC = VMUL(LDK(KP500000000), VADD(T8, TB));
+		    TY = VFNMS(LDK(KP250000000), TW, TP);
+		    TX = VCONJ(VMUL(LDK(KP500000000), VADD(TP, TW)));
+		    TL = VFMA(LDK(KP559016994), TE, TD);
+		    TF = VFNMS(LDK(KP559016994), TE, TD);
+		    ST(&(Rp[0]), TC, ms, &(Rp[0]));
+		    T10 = VFMA(LDK(KP559016994), TZ, TY);
+		    T16 = VFNMS(LDK(KP559016994), TZ, TY);
+		    ST(&(Rm[WS(rs, 4)]), TX, -ms, &(Rm[0]));
+		    TN = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TM, TL)));
+		    TO = VMUL(LDK(KP500000000), VFMAI(TM, TL));
+		    TK = VMUL(LDK(KP500000000), VFMAI(TI, TF));
+		    TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TI, TF)));
+		    T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
+		    T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
+		    T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, T10)));
+		    T14 = VMUL(LDK(KP500000000), VFNMSI(T13, T10));
+		    ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 4)]), TO, ms, &(Rp[0]));
+		    ST(&(Rp[WS(rs, 2)]), TK, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 2)]), T19, -ms, &(Rm[0]));
+		    ST(&(Rm[0]), T15, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {33, 32, 28, 0} };
+
+void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include hc2cfv.h */
+
+/*
+ * This function contains 61 FP additions, 38 FP multiplications,
+ * (or, 55 additions, 32 multiplications, 6 fused multiply/add),
+ * 82 stack variables, 5 constants, and 20 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
+     DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
+	       V Tl, Tt, Tu, TY, TZ, T10, Tz, TE, TF, TV, TW, TX, Ta, TU, TN;
+	       V TR, TH, TQ, TK, TL, TM, TI, TG, TJ, TT, TO, TP, TS, T18, T1c;
+	       V T12, T1b, T15, T16, T17, T14, T11, T13, T1e, T19, T1a, T1d;
+	       {
+		    V T1, T3, Ty, T8, T7, TB, Tf, Ts, Tk, Tw, Tq, TD, T2, Tx, T6;
+		    V TA, Tc, Te, Td, Tb, Tr, Tj, Ti, Th, Tg, Tv, Tn, Tp, To, Tm;
+		    V TC, T4, T9, T5;
+		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    T3 = VCONJ(T2);
+		    Tx = LDW(&(W[0]));
+		    Ty = VZMULIJ(Tx, VSUB(T3, T1));
+		    T8 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    T7 = VCONJ(T6);
+		    TA = LDW(&(W[TWVL * 6]));
+		    TB = VZMULJ(TA, VADD(T7, T8));
+		    Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Te = VCONJ(Td);
+		    Tb = LDW(&(W[TWVL * 2]));
+		    Tf = VZMULJ(Tb, VADD(Tc, Te));
+		    Tr = LDW(&(W[TWVL * 4]));
+		    Ts = VZMULIJ(Tr, VSUB(Te, Tc));
+		    Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    Th = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Ti = VCONJ(Th);
+		    Tg = LDW(&(W[TWVL * 12]));
+		    Tk = VZMULIJ(Tg, VSUB(Ti, Tj));
+		    Tv = LDW(&(W[TWVL * 10]));
+		    Tw = VZMULJ(Tv, VADD(Ti, Tj));
+		    Tn = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    To = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    Tp = VCONJ(To);
+		    Tm = LDW(&(W[TWVL * 14]));
+		    Tq = VZMULJ(Tm, VADD(Tn, Tp));
+		    TC = LDW(&(W[TWVL * 16]));
+		    TD = VZMULIJ(TC, VSUB(Tp, Tn));
+		    Tl = VSUB(Tf, Tk);
+		    Tt = VSUB(Tq, Ts);
+		    Tu = VADD(Tl, Tt);
+		    TY = VADD(Ty, Tw);
+		    TZ = VADD(TB, TD);
+		    T10 = VADD(TY, TZ);
+		    Tz = VSUB(Tw, Ty);
+		    TE = VSUB(TB, TD);
+		    TF = VADD(Tz, TE);
+		    TV = VADD(Tf, Tk);
+		    TW = VADD(Ts, Tq);
+		    TX = VADD(TV, TW);
+		    T4 = VADD(T1, T3);
+		    T5 = LDW(&(W[TWVL * 8]));
+		    T9 = VZMULIJ(T5, VSUB(T7, T8));
+		    Ta = VSUB(T4, T9);
+		    TU = VADD(T4, T9);
+	       }
+	       TL = VSUB(Tl, Tt);
+	       TM = VSUB(TE, Tz);
+	       TN = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM))));
+	       TR = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TM))));
+	       TI = VMUL(LDK(KP279508497), VSUB(Tu, TF));
+	       TG = VADD(Tu, TF);
+	       TJ = VFNMS(LDK(KP125000000), TG, VMUL(LDK(KP500000000), Ta));
+	       TH = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, TG)));
+	       TQ = VSUB(TJ, TI);
+	       TK = VADD(TI, TJ);
+	       ST(&(Rm[WS(rs, 4)]), TH, -ms, &(Rm[0]));
+	       TT = VCONJ(VADD(TQ, TR));
+	       ST(&(Rm[WS(rs, 2)]), TT, -ms, &(Rm[0]));
+	       TO = VSUB(TK, TN);
+	       ST(&(Rp[WS(rs, 1)]), TO, ms, &(Rp[WS(rs, 1)]));
+	       TP = VCONJ(VADD(TK, TN));
+	       ST(&(Rm[0]), TP, -ms, &(Rm[0]));
+	       TS = VSUB(TQ, TR);
+	       ST(&(Rp[WS(rs, 3)]), TS, ms, &(Rp[WS(rs, 1)]));
+	       T16 = VSUB(TZ, TY);
+	       T17 = VSUB(TV, TW);
+	       T18 = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T17, VMUL(LDK(KP951056516), T16))));
+	       T1c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T16))));
+	       T14 = VMUL(LDK(KP279508497), VSUB(TX, T10));
+	       T11 = VADD(TX, T10);
+	       T13 = VFNMS(LDK(KP125000000), T11, VMUL(LDK(KP500000000), TU));
+	       T12 = VMUL(LDK(KP500000000), VADD(TU, T11));
+	       T1b = VADD(T14, T13);
+	       T15 = VSUB(T13, T14);
+	       ST(&(Rp[0]), T12, ms, &(Rp[0]));
+	       T1e = VADD(T1b, T1c);
+	       ST(&(Rp[WS(rs, 4)]), T1e, ms, &(Rp[0]));
+	       T19 = VCONJ(VSUB(T15, T18));
+	       ST(&(Rm[WS(rs, 1)]), T19, -ms, &(Rm[WS(rs, 1)]));
+	       T1a = VADD(T15, T18);
+	       ST(&(Rp[WS(rs, 2)]), T1a, ms, &(Rp[0]));
+	       T1d = VCONJ(VSUB(T1b, T1c));
+	       ST(&(Rm[WS(rs, 3)]), T1d, -ms, &(Rm[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {55, 32, 6, 0} };
+
+void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */
+
+/*
+ * This function contains 71 FP additions, 66 FP multiplications,
+ * (or, 41 additions, 36 multiplications, 30 fused multiply/add),
+ * 86 stack variables, 2 constants, and 24 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
+	       V T3, T7, TH, TE, Th, TC, Tq, T11, TU, Tx, Tb, Tz, Tu, Tw, Tp;
+	       V Tl, T9, Ta, T8, Ty, Tn, To, Tm, TG, T1, T2, Tt, T5, T6, T4;
+	       V Tv, Tj, Tk, Ti, TD, Tf, Tg, Te, TB, TT, TF, TR, Tr;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tt = LDW(&(W[0]));
+	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T4 = LDW(&(W[TWVL * 6]));
+	       Tv = LDW(&(W[TWVL * 8]));
+	       Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T3 = VFMACONJ(T2, T1);
+	       Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1));
+	       Tm = LDW(&(W[TWVL * 2]));
+	       TG = LDW(&(W[TWVL * 4]));
+	       T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+	       Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5));
+	       Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+	       Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+	       Ti = LDW(&(W[TWVL * 18]));
+	       TD = LDW(&(W[TWVL * 20]));
+	       Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
+	       TH = VZMULIJ(TG, VFNMSCONJ(To, Tn));
+	       Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Te = LDW(&(W[TWVL * 10]));
+	       TB = LDW(&(W[TWVL * 12]));
+	       Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
+	       TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj));
+	       T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       T8 = LDW(&(W[TWVL * 14]));
+	       Ty = LDW(&(W[TWVL * 16]));
+	       Th = VZMULJ(Te, VFMACONJ(Tg, Tf));
+	       TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf));
+	       Tq = VADD(Tl, Tp);
+	       T11 = VSUB(Tp, Tl);
+	       TU = VSUB(Tu, Tw);
+	       Tx = VADD(Tu, Tw);
+	       Tb = VZMULJ(T8, VFMACONJ(Ta, T9));
+	       Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9));
+	       TT = VSUB(TC, TE);
+	       TF = VADD(TC, TE);
+	       TR = VFNMS(LDK(KP500000000), Tq, Th);
+	       Tr = VADD(Th, Tq);
+	       {
+		    V TX, TA, T1d, TV, TY, TI, T1e, T12, TQ, Td, T10, Tc, T1a, TN, TJ;
+		    V T1j, T1f, T1b, TS, TM, Ts, T17, T13, TZ, T1i, T1c, T16, TW, TP, TO;
+		    V TL, TK, T1k, T1l, T1h, T1g, T18, T19, T15, T14;
+		    T10 = VSUB(Tb, T7);
+		    Tc = VADD(T7, Tb);
+		    TX = VFNMS(LDK(KP500000000), Tx, Tz);
+		    TA = VADD(Tx, Tz);
+		    T1d = VADD(TU, TT);
+		    TV = VSUB(TT, TU);
+		    TY = VFNMS(LDK(KP500000000), TF, TH);
+		    TI = VADD(TF, TH);
+		    T1e = VADD(T10, T11);
+		    T12 = VSUB(T10, T11);
+		    TQ = VFNMS(LDK(KP500000000), Tc, T3);
+		    Td = VADD(T3, Tc);
+		    T1a = VADD(TX, TY);
+		    TZ = VSUB(TX, TY);
+		    TN = VADD(TA, TI);
+		    TJ = VSUB(TA, TI);
+		    T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e));
+		    T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e));
+		    T1b = VADD(TQ, TR);
+		    TS = VSUB(TQ, TR);
+		    TM = VADD(Td, Tr);
+		    Ts = VSUB(Td, Tr);
+		    T17 = VFMA(LDK(KP866025403), T12, TZ);
+		    T13 = VFNMS(LDK(KP866025403), T12, TZ);
+		    T1i = VSUB(T1b, T1a);
+		    T1c = VADD(T1a, T1b);
+		    T16 = VFNMS(LDK(KP866025403), TV, TS);
+		    TW = VFMA(LDK(KP866025403), TV, TS);
+		    TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM)));
+		    TO = VMUL(LDK(KP500000000), VSUB(TM, TN));
+		    TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts)));
+		    TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts));
+		    T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i)));
+		    T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i));
+		    T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c));
+		    T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c)));
+		    T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
+		    T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
+		    T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW)));
+		    T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW));
+		    ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[0]), TO, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0]));
+		    ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
+		    ST(&(Rm[0]), T15, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {41, 36, 30, 0} };
+
+void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */
+
+/*
+ * This function contains 71 FP additions, 41 FP multiplications,
+ * (or, 67 additions, 37 multiplications, 4 fused multiply/add),
+ * 58 stack variables, 4 constants, and 24 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP433012701, +0.433012701892219323381861585376468091735701313);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
+	       V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN;
+	       V T16;
+	       {
+		    V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7;
+		    V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr;
+		    V Tn, Tp, To, Tm, TJ, Th, TM;
+		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    T3 = VCONJ(T2);
+		    Tz = LDW(&(W[0]));
+		    TA = VZMULIJ(Tz, VSUB(T3, T1));
+		    Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    Td = VCONJ(Tc);
+		    Ta = LDW(&(W[TWVL * 14]));
+		    Te = VZMULJ(Ta, VADD(Tb, Td));
+		    T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    T8 = VCONJ(T7);
+		    T5 = LDW(&(W[TWVL * 6]));
+		    T9 = VZMULJ(T5, VADD(T6, T8));
+		    TB = LDW(&(W[TWVL * 8]));
+		    TC = VZMULIJ(TB, VSUB(T8, T6));
+		    TX = VSUB(TC, TA);
+		    T13 = VSUB(Te, T9);
+		    T4 = VADD(T1, T3);
+		    Tf = VADD(T9, Te);
+		    TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4));
+		    TD = VADD(TA, TC);
+		    TE = LDW(&(W[TWVL * 16]));
+		    TF = VZMULIJ(TE, VSUB(Td, Tb));
+		    T17 = VFNMS(LDK(KP500000000), TD, TF);
+		    Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tk = VCONJ(Tj);
+		    TH = LDW(&(W[TWVL * 12]));
+		    TI = VZMULIJ(TH, VSUB(Tk, Ti));
+		    Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tu = VCONJ(Tt);
+		    Tr = LDW(&(W[TWVL * 2]));
+		    Tv = VZMULJ(Tr, VADD(Ts, Tu));
+		    Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    To = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tp = VCONJ(To);
+		    Tm = LDW(&(W[TWVL * 18]));
+		    Tq = VZMULJ(Tm, VADD(Tn, Tp));
+		    TJ = LDW(&(W[TWVL * 20]));
+		    TK = VZMULIJ(TJ, VSUB(Tp, Tn));
+		    TW = VSUB(TK, TI);
+		    T14 = VSUB(Tv, Tq);
+		    Tw = VADD(Tq, Tv);
+		    Th = LDW(&(W[TWVL * 10]));
+		    Tl = VZMULJ(Th, VADD(Ti, Tk));
+		    T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl));
+		    TL = VADD(TI, TK);
+		    TM = LDW(&(W[TWVL * 4]));
+		    TN = VZMULIJ(TM, VSUB(Tu, Ts));
+		    T16 = VFNMS(LDK(KP500000000), TL, TN);
+	       }
+	       {
+		    V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l;
+		    V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11;
+		    V T15, T18, T1a, T1f, T1b, T1e;
+		    Tg = VADD(T4, Tf);
+		    Tx = VADD(Tl, Tw);
+		    Ty = VADD(Tg, Tx);
+		    TS = VSUB(Tg, Tx);
+		    TG = VADD(TD, TF);
+		    TO = VADD(TL, TN);
+		    TP = VADD(TG, TO);
+		    TT = VBYI(VSUB(TO, TG));
+		    TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP)));
+		    ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)]));
+		    TV = VMUL(LDK(KP500000000), VADD(TS, TT));
+		    ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)]));
+		    TR = VMUL(LDK(KP500000000), VADD(Ty, TP));
+		    ST(&(Rp[0]), TR, ms, &(Rp[0]));
+		    TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT)));
+		    ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
+		    T1g = VADD(TX, TW);
+		    T1h = VADD(T13, T14);
+		    T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h))));
+		    T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h))));
+		    T1j = VADD(TZ, T10);
+		    T1k = VMUL(LDK(KP500000000), VADD(T17, T16));
+		    T1l = VSUB(T1j, T1k);
+		    T1p = VADD(T1j, T1k);
+		    T1m = VADD(T1i, T1l);
+		    ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0]));
+		    T1r = VCONJ(VSUB(T1p, T1o));
+		    ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)]));
+		    T1n = VCONJ(VSUB(T1l, T1i));
+		    ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)]));
+		    T1q = VADD(T1o, T1p);
+		    ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0]));
+		    TY = VMUL(LDK(KP433012701), VSUB(TW, TX));
+		    T11 = VSUB(TZ, T10);
+		    T12 = VADD(TY, T11);
+		    T1c = VSUB(T11, TY);
+		    T15 = VMUL(LDK(KP866025403), VSUB(T13, T14));
+		    T18 = VSUB(T16, T17);
+		    T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18)));
+		    T1d = VMUL(LDK(KP500000000), VBYI(VADD(T15, T18)));
+		    T1a = VCONJ(VSUB(T12, T19));
+		    ST(&(Rm[0]), T1a, -ms, &(Rm[0]));
+		    T1f = VCONJ(VADD(T1c, T1d));
+		    ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0]));
+		    T1b = VADD(T12, T19);
+		    ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)]));
+		    T1e = VSUB(T1c, T1d);
+		    ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {67, 37, 4, 0} };
+
+void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include hc2cfv.h */
+
+/*
+ * This function contains 103 FP additions, 96 FP multiplications,
+ * (or, 53 additions, 46 multiplications, 50 fused multiply/add),
+ * 92 stack variables, 4 constants, and 32 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T8, Tc, TQ, TZ, T1J, T1x, T12, TH, T1I, T1q, Tp, TJ, Te, Tf, Td;
+	       V TN, Tj, Tk, Ti, TK, Tg, TO, Tl, TL, T1r, Th, TR, T1y, T1s, Tq;
+	       V TM, T1z, T1N, T1t, T10, Tr, T13, TS, T1K, T1A, T1E, T1u, T1f, T11, T1c;
+	       V Ts, T1d, T14, T1g, TT;
+	       {
+		    V T3, Tw, TF, TW, Tz, TA, Ty, TX, T7, Tu, T1, T2, Tv, TD, TE;
+		    V TC, TV, T5, T6, T4, Tt, TB, TY, T1o, T1v, Tx, Ta, Tb, T9, TP;
+		    V T1w, TG, T1p, Tn, To, Tm, TI;
+		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+		    Tv = LDW(&(W[0]));
+		    TD = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+		    TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+		    TC = LDW(&(W[TWVL * 8]));
+		    TV = LDW(&(W[TWVL * 6]));
+		    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+		    T6 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+		    T3 = VFMACONJ(T2, T1);
+		    Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
+		    T4 = LDW(&(W[TWVL * 14]));
+		    Tt = LDW(&(W[TWVL * 16]));
+		    TF = VZMULIJ(TC, VFNMSCONJ(TE, TD));
+		    TW = VZMULJ(TV, VFMACONJ(TE, TD));
+		    Tz = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+		    TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+		    Ty = LDW(&(W[TWVL * 24]));
+		    TX = LDW(&(W[TWVL * 22]));
+		    T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+		    Tu = VZMULIJ(Tt, VFNMSCONJ(T6, T5));
+		    Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    T9 = LDW(&(W[TWVL * 2]));
+		    TP = LDW(&(W[TWVL * 4]));
+		    TB = VZMULIJ(Ty, VFNMSCONJ(TA, Tz));
+		    TY = VZMULJ(TX, VFMACONJ(TA, Tz));
+		    T1o = VADD(T3, T7);
+		    T8 = VSUB(T3, T7);
+		    T1v = VADD(Tw, Tu);
+		    Tx = VSUB(Tu, Tw);
+		    Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
+		    TQ = VZMULIJ(TP, VFNMSCONJ(Tb, Ta));
+		    T1w = VADD(TF, TB);
+		    TG = VSUB(TB, TF);
+		    T1p = VADD(TW, TY);
+		    TZ = VSUB(TW, TY);
+		    Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    To = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tm = LDW(&(W[TWVL * 10]));
+		    TI = LDW(&(W[TWVL * 12]));
+		    T1J = VSUB(T1w, T1v);
+		    T1x = VADD(T1v, T1w);
+		    T12 = VFMA(LDK(KP414213562), Tx, TG);
+		    TH = VFNMS(LDK(KP414213562), TG, Tx);
+		    T1I = VSUB(T1o, T1p);
+		    T1q = VADD(T1o, T1p);
+		    Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
+		    TJ = VZMULIJ(TI, VFNMSCONJ(To, Tn));
+		    Te = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Tf = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Td = LDW(&(W[TWVL * 18]));
+		    TN = LDW(&(W[TWVL * 20]));
+		    Tj = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    Tk = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    Ti = LDW(&(W[TWVL * 26]));
+		    TK = LDW(&(W[TWVL * 28]));
+	       }
+	       Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
+	       TO = VZMULIJ(TN, VFNMSCONJ(Tf, Te));
+	       Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
+	       TL = VZMULIJ(TK, VFNMSCONJ(Tk, Tj));
+	       T1r = VADD(Tc, Tg);
+	       Th = VSUB(Tc, Tg);
+	       TR = VSUB(TO, TQ);
+	       T1y = VADD(TQ, TO);
+	       T1s = VADD(Tl, Tp);
+	       Tq = VSUB(Tl, Tp);
+	       TM = VSUB(TJ, TL);
+	       T1z = VADD(TL, TJ);
+	       T1N = VSUB(T1s, T1r);
+	       T1t = VADD(T1r, T1s);
+	       T10 = VSUB(Tq, Th);
+	       Tr = VADD(Th, Tq);
+	       T13 = VFNMS(LDK(KP414213562), TM, TR);
+	       TS = VFMA(LDK(KP414213562), TR, TM);
+	       T1K = VSUB(T1y, T1z);
+	       T1A = VADD(T1y, T1z);
+	       T1E = VADD(T1q, T1t);
+	       T1u = VSUB(T1q, T1t);
+	       T1f = VFMA(LDK(KP707106781), T10, TZ);
+	       T11 = VFNMS(LDK(KP707106781), T10, TZ);
+	       T1c = VFNMS(LDK(KP707106781), Tr, T8);
+	       Ts = VFMA(LDK(KP707106781), Tr, T8);
+	       T1d = VSUB(T12, T13);
+	       T14 = VADD(T12, T13);
+	       T1g = VSUB(TS, TH);
+	       TT = VADD(TH, TS);
+	       {
+		    V T1O, T1L, T1F, T1B, T1k, T1e, T19, T15, T1l, T1h, T18, TU, T1T, T1P, T1S;
+		    V T1M, T1H, T1G, T1D, T1C, T1m, T1n, T1j, T1i, T1a, T1b, T17, T16, T1U, T1V;
+		    V T1R, T1Q;
+		    T1O = VSUB(T1K, T1J);
+		    T1L = VADD(T1J, T1K);
+		    T1F = VADD(T1x, T1A);
+		    T1B = VSUB(T1x, T1A);
+		    T1k = VFNMS(LDK(KP923879532), T1d, T1c);
+		    T1e = VFMA(LDK(KP923879532), T1d, T1c);
+		    T19 = VFNMS(LDK(KP923879532), T14, T11);
+		    T15 = VFMA(LDK(KP923879532), T14, T11);
+		    T1l = VFNMS(LDK(KP923879532), T1g, T1f);
+		    T1h = VFMA(LDK(KP923879532), T1g, T1f);
+		    T18 = VFNMS(LDK(KP923879532), TT, Ts);
+		    TU = VFMA(LDK(KP923879532), TT, Ts);
+		    T1T = VFNMS(LDK(KP707106781), T1O, T1N);
+		    T1P = VFMA(LDK(KP707106781), T1O, T1N);
+		    T1S = VFNMS(LDK(KP707106781), T1L, T1I);
+		    T1M = VFMA(LDK(KP707106781), T1L, T1I);
+		    T1H = VCONJ(VMUL(LDK(KP500000000), VADD(T1F, T1E)));
+		    T1G = VMUL(LDK(KP500000000), VSUB(T1E, T1F));
+		    T1D = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1B, T1u)));
+		    T1C = VMUL(LDK(KP500000000), VFMAI(T1B, T1u));
+		    T1m = VMUL(LDK(KP500000000), VFNMSI(T1l, T1k));
+		    T1n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1l, T1k)));
+		    T1j = VMUL(LDK(KP500000000), VFMAI(T1h, T1e));
+		    T1i = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1h, T1e)));
+		    T1a = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T19, T18)));
+		    T1b = VMUL(LDK(KP500000000), VFMAI(T19, T18));
+		    T17 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T15, TU)));
+		    T16 = VMUL(LDK(KP500000000), VFNMSI(T15, TU));
+		    T1U = VMUL(LDK(KP500000000), VFNMSI(T1T, T1S));
+		    T1V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1T, T1S)));
+		    T1R = VMUL(LDK(KP500000000), VFMAI(T1P, T1M));
+		    T1Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1P, T1M)));
+		    ST(&(Rm[WS(rs, 7)]), T1H, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[0]), T1G, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 3)]), T1D, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 4)]), T1C, ms, &(Rp[0]));
+		    ST(&(Rp[WS(rs, 5)]), T1m, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 4)]), T1n, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[WS(rs, 2)]), T1i, -ms, &(Rm[0]));
+		    ST(&(Rm[WS(rs, 6)]), T1a, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 7)]), T1b, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rm[0]), T17, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 6)]), T1U, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 5)]), T1V, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 2)]), T1R, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 1)]), T1Q, -ms, &(Rm[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {53, 46, 50, 0} };
+
+void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include hc2cfv.h */
+
+/*
+ * This function contains 103 FP additions, 56 FP multiplications,
+ * (or, 99 additions, 52 multiplications, 4 fused multiply/add),
+ * 101 stack variables, 5 constants, and 32 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
+	       V T1D, T1E, T1R, TP, T1b, Ta, T1w, T18, T1x, T1z, T1A, T1G, T1H, T1S, Tx;
+	       V T13, T10, T1a, T1, T3, TA, TM, TL, TN, T6, T8, TC, TH, TG, TI;
+	       V T2, Tz, TK, TJ, T7, TB, TF, TE, TD, TO, T4, T9, T5, T15, T17;
+	       V T14, T16;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T3 = VCONJ(T2);
+	       Tz = LDW(&(W[0]));
+	       TA = VZMULIJ(Tz, VSUB(T3, T1));
+	       TM = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+	       TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+	       TL = VCONJ(TK);
+	       TJ = LDW(&(W[TWVL * 24]));
+	       TN = VZMULIJ(TJ, VSUB(TL, TM));
+	       T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+	       T7 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       TB = LDW(&(W[TWVL * 16]));
+	       TC = VZMULIJ(TB, VSUB(T8, T6));
+	       TH = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       TG = VCONJ(TF);
+	       TE = LDW(&(W[TWVL * 8]));
+	       TI = VZMULIJ(TE, VSUB(TG, TH));
+	       T1D = VADD(TA, TC);
+	       T1E = VADD(TI, TN);
+	       T1R = VSUB(T1D, T1E);
+	       TD = VSUB(TA, TC);
+	       TO = VSUB(TI, TN);
+	       TP = VFNMS(LDK(KP382683432), TO, VMUL(LDK(KP923879532), TD));
+	       T1b = VFMA(LDK(KP382683432), TD, VMUL(LDK(KP923879532), TO));
+	       T4 = VADD(T1, T3);
+	       T5 = LDW(&(W[TWVL * 14]));
+	       T9 = VZMULJ(T5, VADD(T6, T8));
+	       Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
+	       T1w = VADD(T4, T9);
+	       T14 = LDW(&(W[TWVL * 6]));
+	       T15 = VZMULJ(T14, VADD(TH, TG));
+	       T16 = LDW(&(W[TWVL * 22]));
+	       T17 = VZMULJ(T16, VADD(TM, TL));
+	       T18 = VSUB(T15, T17);
+	       T1x = VADD(T15, T17);
+	       {
+		    V Tf, TR, Tv, TY, Tk, TT, Tq, TW, Tc, Te, Td, Tb, TQ, Ts, Tu;
+		    V Tt, Tr, TX, Th, Tj, Ti, Tg, TS, Tn, Tp, To, Tm, TV, Tl, Tw;
+		    V TU, TZ;
+		    Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+		    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+		    Te = VCONJ(Td);
+		    Tb = LDW(&(W[TWVL * 2]));
+		    Tf = VZMULJ(Tb, VADD(Tc, Te));
+		    TQ = LDW(&(W[TWVL * 4]));
+		    TR = VZMULIJ(TQ, VSUB(Te, Tc));
+		    Ts = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+		    Tt = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tu = VCONJ(Tt);
+		    Tr = LDW(&(W[TWVL * 10]));
+		    Tv = VZMULJ(Tr, VADD(Ts, Tu));
+		    TX = LDW(&(W[TWVL * 12]));
+		    TY = VZMULIJ(TX, VSUB(Tu, Ts));
+		    Th = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+		    Ti = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tj = VCONJ(Ti);
+		    Tg = LDW(&(W[TWVL * 18]));
+		    Tk = VZMULJ(Tg, VADD(Th, Tj));
+		    TS = LDW(&(W[TWVL * 20]));
+		    TT = VZMULIJ(TS, VSUB(Tj, Th));
+		    Tn = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+		    To = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+		    Tp = VCONJ(To);
+		    Tm = LDW(&(W[TWVL * 26]));
+		    Tq = VZMULJ(Tm, VADD(Tn, Tp));
+		    TV = LDW(&(W[TWVL * 28]));
+		    TW = VZMULIJ(TV, VSUB(Tp, Tn));
+		    T1z = VADD(Tf, Tk);
+		    T1A = VADD(Tq, Tv);
+		    T1G = VADD(TR, TT);
+		    T1H = VADD(TW, TY);
+		    T1S = VSUB(T1H, T1G);
+		    Tl = VSUB(Tf, Tk);
+		    Tw = VSUB(Tq, Tv);
+		    Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
+		    T13 = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
+		    TU = VSUB(TR, TT);
+		    TZ = VSUB(TW, TY);
+		    T10 = VFMA(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TZ));
+		    T1a = VFNMS(LDK(KP923879532), TU, VMUL(LDK(KP382683432), TZ));
+	       }
+	       {
+		    V T1U, T20, T1X, T21, T1Q, T1T, T1V, T1W, T1Y, T23, T1Z, T22, T1C, T1M, T1J;
+		    V T1N, T1y, T1B, T1F, T1I, T1K, T1P, T1L, T1O, T12, T1g, T1d, T1h, Ty, T11;
+		    V T19, T1c, T1e, T1j, T1f, T1i, T1m, T1s, T1p, T1t, T1k, T1l, T1n, T1o, T1q;
+		    V T1v, T1r, T1u;
+		    T1Q = VMUL(LDK(KP500000000), VSUB(T1w, T1x));
+		    T1T = VMUL(LDK(KP353553390), VADD(T1R, T1S));
+		    T1U = VADD(T1Q, T1T);
+		    T20 = VSUB(T1Q, T1T);
+		    T1V = VSUB(T1A, T1z);
+		    T1W = VMUL(LDK(KP707106781), VSUB(T1S, T1R));
+		    T1X = VMUL(LDK(KP500000000), VBYI(VADD(T1V, T1W)));
+		    T21 = VMUL(LDK(KP500000000), VBYI(VSUB(T1W, T1V)));
+		    T1Y = VCONJ(VSUB(T1U, T1X));
+		    ST(&(Rm[WS(rs, 1)]), T1Y, -ms, &(Rm[WS(rs, 1)]));
+		    T23 = VADD(T20, T21);
+		    ST(&(Rp[WS(rs, 6)]), T23, ms, &(Rp[0]));
+		    T1Z = VADD(T1U, T1X);
+		    ST(&(Rp[WS(rs, 2)]), T1Z, ms, &(Rp[0]));
+		    T22 = VCONJ(VSUB(T20, T21));
+		    ST(&(Rm[WS(rs, 5)]), T22, -ms, &(Rm[WS(rs, 1)]));
+		    T1y = VADD(T1w, T1x);
+		    T1B = VADD(T1z, T1A);
+		    T1C = VADD(T1y, T1B);
+		    T1M = VSUB(T1y, T1B);
+		    T1F = VADD(T1D, T1E);
+		    T1I = VADD(T1G, T1H);
+		    T1J = VADD(T1F, T1I);
+		    T1N = VBYI(VSUB(T1I, T1F));
+		    T1K = VCONJ(VMUL(LDK(KP500000000), VSUB(T1C, T1J)));
+		    ST(&(Rm[WS(rs, 7)]), T1K, -ms, &(Rm[WS(rs, 1)]));
+		    T1P = VMUL(LDK(KP500000000), VADD(T1M, T1N));
+		    ST(&(Rp[WS(rs, 4)]), T1P, ms, &(Rp[0]));
+		    T1L = VMUL(LDK(KP500000000), VADD(T1C, T1J));
+		    ST(&(Rp[0]), T1L, ms, &(Rp[0]));
+		    T1O = VCONJ(VMUL(LDK(KP500000000), VSUB(T1M, T1N)));
+		    ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
+		    Ty = VADD(Ta, Tx);
+		    T11 = VMUL(LDK(KP500000000), VADD(TP, T10));
+		    T12 = VADD(Ty, T11);
+		    T1g = VSUB(Ty, T11);
+		    T19 = VSUB(T13, T18);
+		    T1c = VSUB(T1a, T1b);
+		    T1d = VMUL(LDK(KP500000000), VBYI(VADD(T19, T1c)));
+		    T1h = VMUL(LDK(KP500000000), VBYI(VSUB(T1c, T19)));
+		    T1e = VCONJ(VSUB(T12, T1d));
+		    ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
+		    T1j = VADD(T1g, T1h);
+		    ST(&(Rp[WS(rs, 7)]), T1j, ms, &(Rp[WS(rs, 1)]));
+		    T1f = VADD(T12, T1d);
+		    ST(&(Rp[WS(rs, 1)]), T1f, ms, &(Rp[WS(rs, 1)]));
+		    T1i = VCONJ(VSUB(T1g, T1h));
+		    ST(&(Rm[WS(rs, 6)]), T1i, -ms, &(Rm[0]));
+		    T1k = VSUB(T10, TP);
+		    T1l = VADD(T18, T13);
+		    T1m = VMUL(LDK(KP500000000), VBYI(VSUB(T1k, T1l)));
+		    T1s = VMUL(LDK(KP500000000), VBYI(VADD(T1l, T1k)));
+		    T1n = VSUB(Ta, Tx);
+		    T1o = VMUL(LDK(KP500000000), VADD(T1b, T1a));
+		    T1p = VSUB(T1n, T1o);
+		    T1t = VADD(T1n, T1o);
+		    T1q = VADD(T1m, T1p);
+		    ST(&(Rp[WS(rs, 5)]), T1q, ms, &(Rp[WS(rs, 1)]));
+		    T1v = VCONJ(VSUB(T1t, T1s));
+		    ST(&(Rm[WS(rs, 2)]), T1v, -ms, &(Rm[0]));
+		    T1r = VCONJ(VSUB(T1p, T1m));
+		    ST(&(Rm[WS(rs, 4)]), T1r, -ms, &(Rm[0]));
+		    T1u = VADD(T1s, T1t);
+		    ST(&(Rp[WS(rs, 3)]), T1u, ms, &(Rp[WS(rs, 1)]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {99, 52, 4, 0} };
+
+void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dit -name hc2cfdftv_2 -include hc2cfv.h */
+
+/*
+ * This function contains 5 FP additions, 6 FP multiplications,
+ * (or, 3 additions, 4 multiplications, 2 fused multiply/add),
+ * 9 stack variables, 1 constants, and 4 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T1, T2, T4, T3, T5, T7, T6;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T4 = LDW(&(W[0]));
+	       T3 = VFMACONJ(T2, T1);
+	       T5 = VZMULIJ(T4, VFNMSCONJ(T2, T1));
+	       T7 = VCONJ(VMUL(LDK(KP500000000), VADD(T3, T5)));
+	       T6 = VMUL(LDK(KP500000000), VSUB(T3, T5));
+	       ST(&(Rm[0]), T7, -ms, &(Rm[0]));
+	       ST(&(Rp[0]), T6, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cfdftv_2"), twinstr, &GENUS, {3, 4, 2, 0} };
+
+void XSIMD(codelet_hc2cfdftv_2) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_2, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dit -name hc2cfdftv_2 -include hc2cfv.h */
+
+/*
+ * This function contains 5 FP additions, 4 FP multiplications,
+ * (or, 5 additions, 4 multiplications, 0 fused multiply/add),
+ * 10 stack variables, 1 constants, and 4 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
+	       V T4, T6, T1, T3, T2, T5, T7, T8;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T3 = VCONJ(T2);
+	       T4 = VADD(T1, T3);
+	       T5 = LDW(&(W[0]));
+	       T6 = VZMULIJ(T5, VSUB(T3, T1));
+	       T7 = VCONJ(VMUL(LDK(KP500000000), VSUB(T4, T6)));
+	       ST(&(Rm[0]), T7, -ms, &(Rm[0]));
+	       T8 = VMUL(LDK(KP500000000), VADD(T4, T6));
+	       ST(&(Rp[0]), T8, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cfdftv_2"), twinstr, &GENUS, {5, 4, 0, 0} };
+
+void XSIMD(codelet_hc2cfdftv_2) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_2, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include hc2cfv.h */
+
+/*
+ * This function contains 143 FP additions, 128 FP multiplications,
+ * (or, 77 additions, 62 multiplications, 66 fused multiply/add),
+ * 130 stack variables, 5 constants, and 40 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
+	       V T2g, T2f, T2w, T2k, T2A, T2u, T2e, T2o, T1O, T2b, T2i, T1R, T1X, T1k, TN;
+	       V T1w, T1G, T1t, Ti, T2c, T12, T1x, T2j, T1U, T1y, T1d, T24, T2v, T2h, T2x;
+	       V T2B, T2p, T2l, T2z, T2y, T2D, T2C, T2r, T2q, T2n, T2m;
+	       {
+		    V T3, T7, TC, T1Y, Tc, Tg, Tn, T1P, T1Z, Tw, T1S, TS, TY, TZ, T1Q;
+		    V TL, T17, T21, TW, T19, TX, T1a, T8, T20, Th, Tx, T1u, T1v, TM, T10;
+		    V T1b, T22, T11, T1T, T1c, T23;
+		    {
+			 V Ta, Tb, Tz, Te, TB, Tf, Tl, T9, Td, Tk, T1, T2, Ty, T5, T6;
+			 V TA, T4, Tj, Tt, Tu, Ts, TQ, Tr, TP, Tp, Tq, Tm, To, TO, TG;
+			 V T14, TK, T16, TE, TF, Tv, TD, T13, TR, TI, TJ, TH, T15, TU, TV;
+			 V TT, T18;
+			 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+			 Ty = LDW(&(W[0]));
+			 T5 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+			 T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+			 TA = LDW(&(W[TWVL * 20]));
+			 T4 = LDW(&(W[TWVL * 18]));
+			 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+			 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+			 T3 = VFMACONJ(T2, T1);
+			 Tz = VZMULIJ(Ty, VFNMSCONJ(T2, T1));
+			 Tj = LDW(&(W[TWVL * 6]));
+			 Te = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+			 TB = VZMULIJ(TA, VFNMSCONJ(T6, T5));
+			 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+			 Tf = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tl = LDW(&(W[TWVL * 26]));
+			 T9 = LDW(&(W[TWVL * 8]));
+			 Td = LDW(&(W[TWVL * 28]));
+			 Tk = VZMULJ(Tj, VFMACONJ(Tb, Ta));
+			 Tp = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+			 TC = VADD(Tz, TB);
+			 T1Y = VSUB(TB, Tz);
+			 Tq = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+			 Tm = VZMULJ(Tl, VFMACONJ(Tf, Te));
+			 Tc = VZMULIJ(T9, VFNMSCONJ(Tb, Ta));
+			 Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
+			 To = LDW(&(W[TWVL * 16]));
+			 TO = LDW(&(W[TWVL * 14]));
+			 Tt = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+			 Tu = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+			 Ts = LDW(&(W[TWVL * 36]));
+			 Tn = VADD(Tk, Tm);
+			 T1P = VSUB(Tk, Tm);
+			 TQ = LDW(&(W[TWVL * 34]));
+			 Tr = VZMULIJ(To, VFNMSCONJ(Tq, Tp));
+			 TP = VZMULJ(TO, VFMACONJ(Tq, Tp));
+			 TE = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+			 TF = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+			 Tv = VZMULIJ(Ts, VFNMSCONJ(Tu, Tt));
+			 TD = LDW(&(W[TWVL * 30]));
+			 T13 = LDW(&(W[TWVL * 32]));
+			 TR = VZMULJ(TQ, VFMACONJ(Tu, Tt));
+			 TI = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+			 TJ = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+			 TH = LDW(&(W[TWVL * 10]));
+			 T15 = LDW(&(W[TWVL * 12]));
+			 T1Z = VSUB(Tv, Tr);
+			 Tw = VADD(Tr, Tv);
+			 TG = VZMULJ(TD, VFMACONJ(TF, TE));
+			 T14 = VZMULIJ(T13, VFNMSCONJ(TF, TE));
+			 T1S = VSUB(TP, TR);
+			 TS = VADD(TP, TR);
+			 TK = VZMULJ(TH, VFMACONJ(TJ, TI));
+			 T16 = VZMULIJ(T15, VFNMSCONJ(TJ, TI));
+			 TU = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+			 TV = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+			 TT = LDW(&(W[TWVL * 24]));
+			 T18 = LDW(&(W[TWVL * 22]));
+			 TY = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+			 TZ = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+			 T1Q = VSUB(TK, TG);
+			 TL = VADD(TG, TK);
+			 T17 = VADD(T14, T16);
+			 T21 = VSUB(T16, T14);
+			 TW = VZMULIJ(TT, VFNMSCONJ(TV, TU));
+			 T19 = VZMULJ(T18, VFMACONJ(TV, TU));
+			 TX = LDW(&(W[TWVL * 4]));
+			 T1a = LDW(&(W[TWVL * 2]));
+		    }
+		    T1O = VSUB(T3, T7);
+		    T8 = VADD(T3, T7);
+		    T20 = VADD(T1Y, T1Z);
+		    T2b = VSUB(T1Y, T1Z);
+		    T2i = VADD(T1P, T1Q);
+		    T1R = VSUB(T1P, T1Q);
+		    Th = VADD(Tc, Tg);
+		    T1X = VSUB(Tg, Tc);
+		    Tx = VSUB(Tn, Tw);
+		    T1u = VADD(Tn, Tw);
+		    T1v = VADD(TC, TL);
+		    TM = VSUB(TC, TL);
+		    T10 = VZMULIJ(TX, VFNMSCONJ(TZ, TY));
+		    T1b = VZMULJ(T1a, VFMACONJ(TZ, TY));
+		    T1k = VADD(Tx, TM);
+		    TN = VSUB(Tx, TM);
+		    T22 = VSUB(T10, TW);
+		    T11 = VADD(TW, T10);
+		    T1T = VSUB(T1b, T19);
+		    T1c = VADD(T19, T1b);
+		    T1w = VADD(T1u, T1v);
+		    T1G = VSUB(T1u, T1v);
+		    T1t = VADD(T8, Th);
+		    Ti = VSUB(T8, Th);
+		    T23 = VADD(T21, T22);
+		    T2c = VSUB(T21, T22);
+		    T12 = VSUB(TS, T11);
+		    T1x = VADD(TS, T11);
+		    T2j = VADD(T1S, T1T);
+		    T1U = VSUB(T1S, T1T);
+		    T1y = VADD(T17, T1c);
+		    T1d = VSUB(T17, T1c);
+		    T2g = VSUB(T23, T20);
+		    T24 = VADD(T20, T23);
+	       }
+	       {
+		    V T2d, T2t, T29, T25, T1m, T1q, T1i, T1H, T1L, T1D, T1A, T28, T1W, T1h, T1g;
+		    V T1e, T1l, T1z, T1F, T1V, T1f, T1C, T1B, T26, T27, T2a, T2s, T1j, T1p, T1K;
+		    V T1E, T1n, T1o, T1s, T1r, T1I, T1J, T1N, T1M;
+		    T2d = VFMA(LDK(KP618033988), T2c, T2b);
+		    T2t = VFNMS(LDK(KP618033988), T2b, T2c);
+		    T1e = VSUB(T12, T1d);
+		    T1l = VADD(T12, T1d);
+		    T1z = VADD(T1x, T1y);
+		    T1F = VSUB(T1x, T1y);
+		    T1V = VADD(T1R, T1U);
+		    T29 = VSUB(T1R, T1U);
+		    T2f = VFNMS(LDK(KP250000000), T24, T1X);
+		    T25 = VADD(T1X, T24);
+		    T1m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1l, T1k));
+		    T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1k, T1l));
+		    T1i = VSUB(TN, T1e);
+		    T1f = VADD(TN, T1e);
+		    T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
+		    T1L = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
+		    T1D = VSUB(T1w, T1z);
+		    T1A = VADD(T1w, T1z);
+		    T28 = VFNMS(LDK(KP250000000), T1V, T1O);
+		    T1W = VADD(T1O, T1V);
+		    T1h = VFNMS(LDK(KP250000000), T1f, Ti);
+		    T1g = VMUL(LDK(KP500000000), VADD(Ti, T1f));
+		    T2w = VFNMS(LDK(KP618033988), T2i, T2j);
+		    T2k = VFMA(LDK(KP618033988), T2j, T2i);
+		    T1C = VFNMS(LDK(KP250000000), T1A, T1t);
+		    T1B = VCONJ(VMUL(LDK(KP500000000), VADD(T1t, T1A)));
+		    T26 = VMUL(LDK(KP500000000), VFNMSI(T25, T1W));
+		    T27 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T25, T1W)));
+		    T2a = VFMA(LDK(KP559016994), T29, T28);
+		    T2s = VFNMS(LDK(KP559016994), T29, T28);
+		    ST(&(Rp[0]), T1g, ms, &(Rp[0]));
+		    T1j = VFMA(LDK(KP559016994), T1i, T1h);
+		    T1p = VFNMS(LDK(KP559016994), T1i, T1h);
+		    ST(&(Rm[WS(rs, 9)]), T1B, -ms, &(Rm[WS(rs, 1)]));
+		    T1K = VFMA(LDK(KP559016994), T1D, T1C);
+		    T1E = VFNMS(LDK(KP559016994), T1D, T1C);
+		    ST(&(Rm[WS(rs, 4)]), T27, -ms, &(Rm[0]));
+		    ST(&(Rp[WS(rs, 5)]), T26, ms, &(Rp[WS(rs, 1)]));
+		    T2A = VFMA(LDK(KP951056516), T2t, T2s);
+		    T2u = VFNMS(LDK(KP951056516), T2t, T2s);
+		    T2e = VFNMS(LDK(KP951056516), T2d, T2a);
+		    T2o = VFMA(LDK(KP951056516), T2d, T2a);
+		    T1n = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1m, T1j)));
+		    T1o = VMUL(LDK(KP500000000), VFMAI(T1m, T1j));
+		    T1s = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1q, T1p)));
+		    T1r = VMUL(LDK(KP500000000), VFNMSI(T1q, T1p));
+		    T1I = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1H, T1E)));
+		    T1J = VMUL(LDK(KP500000000), VFMAI(T1H, T1E));
+		    T1N = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1L, T1K)));
+		    T1M = VMUL(LDK(KP500000000), VFNMSI(T1L, T1K));
+		    ST(&(Rp[WS(rs, 4)]), T1o, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 3)]), T1n, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 8)]), T1r, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 7)]), T1s, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 2)]), T1J, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 1)]), T1I, -ms, &(Rm[WS(rs, 1)]));
+		    ST(&(Rp[WS(rs, 6)]), T1M, ms, &(Rp[0]));
+		    ST(&(Rm[WS(rs, 5)]), T1N, -ms, &(Rm[WS(rs, 1)]));
+	       }
+	       T2v = VFMA(LDK(KP559016994), T2g, T2f);
+	       T2h = VFNMS(LDK(KP559016994), T2g, T2f);
+	       T2x = VFNMS(LDK(KP951056516), T2w, T2v);
+	       T2B = VFMA(LDK(KP951056516), T2w, T2v);
+	       T2p = VFMA(LDK(KP951056516), T2k, T2h);
+	       T2l = VFNMS(LDK(KP951056516), T2k, T2h);
+	       T2z = VMUL(LDK(KP500000000), VFMAI(T2x, T2u));
+	       T2y = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2x, T2u)));
+	       T2D = VMUL(LDK(KP500000000), VFMAI(T2B, T2A));
+	       T2C = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2B, T2A)));
+	       T2r = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2p, T2o)));
+	       T2q = VMUL(LDK(KP500000000), VFNMSI(T2p, T2o));
+	       T2n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2l, T2e)));
+	       T2m = VMUL(LDK(KP500000000), VFNMSI(T2l, T2e));
+	       ST(&(Rp[WS(rs, 3)]), T2z, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 2)]), T2y, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 7)]), T2D, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 6)]), T2C, -ms, &(Rm[0]));
+	       ST(&(Rm[0]), T2r, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 1)]), T2q, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 8)]), T2n, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 9)]), T2m, ms, &(Rp[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {77, 62, 66, 0} };
+
+void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include hc2cfv.h */
+
+/*
+ * This function contains 143 FP additions, 77 FP multiplications,
+ * (or, 131 additions, 65 multiplications, 12 fused multiply/add),
+ * 141 stack variables, 9 constants, and 40 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
+     DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
+     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
+     DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
+     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
+     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
+	       V TW, T1x, T2i, T2A, T1r, T1s, T1a, T1y, T1l, Tn, TK, TL, T1p, T1o, T27;
+	       V T2t, T2a, T2u, T2e, T2C, T20, T2w, T23, T2x, T2d, T2B, T1W, T1X, T1U, T1V;
+	       V T2z, T2K, T2G, T2N, T2J, T2v, T2y, T2F, T2D, T2E, T2M, T2H, T2I, T2L;
+	       {
+		    V T1u, T5, Tg, T1c, TV, T13, Ta, T1w, TQ, T11, TI, T1j, Tx, T18, Tl;
+		    V T1e, TD, T1h, Ts, T16, T2g, T2h, T14, T19, T1f, T1k, Tb, Tm, Ty, TJ;
+		    V T25, T26, T28, T29, T1Y, T1Z, T21, T22;
+		    {
+			 V T4, T3, T2, T1, Tf, Te, Td, Tc, T1b, TU, TT, TS, TR, T12, T9;
+			 V T8, T7, T6, T1v, TP, TO, TN, TM, T10, TH, TG, TF, TE, T1i, Tw;
+			 V Tv, Tu, Tt, T17, Tk, Tj, Ti, Th, T1d, TC, TB, TA, Tz, T1g, Tr;
+			 V Tq, Tp, To, T15;
+			 T4 = LD(&(Rp[0]), ms, &(Rp[0]));
+			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+			 T3 = VCONJ(T2);
+			 T1u = VADD(T4, T3);
+			 T1 = LDW(&(W[0]));
+			 T5 = VZMULIJ(T1, VSUB(T3, T4));
+			 Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+			 Td = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+			 Te = VCONJ(Td);
+			 Tc = LDW(&(W[TWVL * 16]));
+			 Tg = VZMULIJ(Tc, VSUB(Te, Tf));
+			 T1b = LDW(&(W[TWVL * 14]));
+			 T1c = VZMULJ(T1b, VADD(Te, Tf));
+			 TU = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+			 TS = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+			 TT = VCONJ(TS);
+			 TR = LDW(&(W[TWVL * 28]));
+			 TV = VZMULIJ(TR, VSUB(TT, TU));
+			 T12 = LDW(&(W[TWVL * 26]));
+			 T13 = VZMULJ(T12, VADD(TT, TU));
+			 T9 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+			 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+			 T8 = VCONJ(T7);
+			 T6 = LDW(&(W[TWVL * 20]));
+			 Ta = VZMULIJ(T6, VSUB(T8, T9));
+			 T1v = LDW(&(W[TWVL * 18]));
+			 T1w = VZMULJ(T1v, VADD(T9, T8));
+			 TP = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+			 TN = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+			 TO = VCONJ(TN);
+			 TM = LDW(&(W[TWVL * 8]));
+			 TQ = VZMULIJ(TM, VSUB(TO, TP));
+			 T10 = LDW(&(W[TWVL * 6]));
+			 T11 = VZMULJ(T10, VADD(TO, TP));
+			 TH = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+			 TF = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+			 TG = VCONJ(TF);
+			 TE = LDW(&(W[TWVL * 4]));
+			 TI = VZMULIJ(TE, VSUB(TG, TH));
+			 T1i = LDW(&(W[TWVL * 2]));
+			 T1j = VZMULJ(T1i, VADD(TG, TH));
+			 Tw = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+			 Tu = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tv = VCONJ(Tu);
+			 Tt = LDW(&(W[TWVL * 12]));
+			 Tx = VZMULIJ(Tt, VSUB(Tv, Tw));
+			 T17 = LDW(&(W[TWVL * 10]));
+			 T18 = VZMULJ(T17, VADD(Tw, Tv));
+			 Tk = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+			 Ti = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+			 Tj = VCONJ(Ti);
+			 Th = LDW(&(W[TWVL * 36]));
+			 Tl = VZMULIJ(Th, VSUB(Tj, Tk));
+			 T1d = LDW(&(W[TWVL * 34]));
+			 T1e = VZMULJ(T1d, VADD(Tj, Tk));
+			 TC = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+			 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+			 TB = VCONJ(TA);
+			 Tz = LDW(&(W[TWVL * 24]));
+			 TD = VZMULIJ(Tz, VSUB(TB, TC));
+			 T1g = LDW(&(W[TWVL * 22]));
+			 T1h = VZMULJ(T1g, VADD(TB, TC));
+			 Tr = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+			 Tp = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+			 Tq = VCONJ(Tp);
+			 To = LDW(&(W[TWVL * 32]));
+			 Ts = VZMULIJ(To, VSUB(Tq, Tr));
+			 T15 = LDW(&(W[TWVL * 30]));
+			 T16 = VZMULJ(T15, VADD(Tr, Tq));
+		    }
+		    TW = VSUB(TQ, TV);
+		    T1x = VSUB(T1u, T1w);
+		    T2g = VADD(T1u, T1w);
+		    T2h = VADD(TQ, TV);
+		    T2i = VADD(T2g, T2h);
+		    T2A = VSUB(T2g, T2h);
+		    T14 = VSUB(T11, T13);
+		    T19 = VSUB(T16, T18);
+		    T1r = VADD(T14, T19);
+		    T1f = VSUB(T1c, T1e);
+		    T1k = VSUB(T1h, T1j);
+		    T1s = VADD(T1f, T1k);
+		    T1a = VSUB(T14, T19);
+		    T1y = VADD(T1r, T1s);
+		    T1l = VSUB(T1f, T1k);
+		    Tb = VSUB(T5, Ta);
+		    Tm = VSUB(Tg, Tl);
+		    Tn = VADD(Tb, Tm);
+		    Ty = VSUB(Ts, Tx);
+		    TJ = VSUB(TD, TI);
+		    TK = VADD(Ty, TJ);
+		    TL = VADD(Tn, TK);
+		    T1p = VSUB(Ty, TJ);
+		    T1o = VSUB(Tb, Tm);
+		    T25 = VADD(T1c, T1e);
+		    T26 = VADD(TD, TI);
+		    T27 = VADD(T25, T26);
+		    T2t = VSUB(T25, T26);
+		    T28 = VADD(Ts, Tx);
+		    T29 = VADD(T1h, T1j);
+		    T2a = VADD(T28, T29);
+		    T2u = VSUB(T29, T28);
+		    T2e = VADD(T27, T2a);
+		    T2C = VADD(T2t, T2u);
+		    T1Y = VADD(T11, T13);
+		    T1Z = VADD(Tg, Tl);
+		    T20 = VADD(T1Y, T1Z);
+		    T2w = VSUB(T1Y, T1Z);
+		    T21 = VADD(T5, Ta);
+		    T22 = VADD(T16, T18);
+		    T23 = VADD(T21, T22);
+		    T2x = VSUB(T22, T21);
+		    T2d = VADD(T20, T23);
+		    T2B = VADD(T2w, T2x);
+	       }
+	       T1U = VADD(T1x, T1y);
+	       T1V = VBYI(VADD(TW, TL));
+	       T1W = VMUL(LDK(KP500000000), VSUB(T1U, T1V));
+	       T1X = VCONJ(VMUL(LDK(KP500000000), VADD(T1V, T1U)));
+	       ST(&(Rp[WS(rs, 5)]), T1W, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 4)]), T1X, -ms, &(Rm[0]));
+	       T2v = VSUB(T2t, T2u);
+	       T2y = VSUB(T2w, T2x);
+	       T2z = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T2y, VMUL(LDK(KP951056516), T2v))));
+	       T2K = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T2y, VMUL(LDK(KP587785252), T2v))));
+	       T2F = VMUL(LDK(KP279508497), VSUB(T2B, T2C));
+	       T2D = VADD(T2B, T2C);
+	       T2E = VFNMS(LDK(KP125000000), T2D, VMUL(LDK(KP500000000), T2A));
+	       T2G = VSUB(T2E, T2F);
+	       T2N = VCONJ(VMUL(LDK(KP500000000), VADD(T2A, T2D)));
+	       T2J = VADD(T2F, T2E);
+	       ST(&(Rm[WS(rs, 9)]), T2N, -ms, &(Rm[WS(rs, 1)]));
+	       T2M = VCONJ(VADD(T2K, T2J));
+	       ST(&(Rm[WS(rs, 5)]), T2M, -ms, &(Rm[WS(rs, 1)]));
+	       T2H = VADD(T2z, T2G);
+	       ST(&(Rp[WS(rs, 2)]), T2H, ms, &(Rp[0]));
+	       T2I = VCONJ(VSUB(T2G, T2z));
+	       ST(&(Rm[WS(rs, 1)]), T2I, -ms, &(Rm[WS(rs, 1)]));
+	       T2L = VSUB(T2J, T2K);
+	       ST(&(Rp[WS(rs, 6)]), T2L, ms, &(Rp[0]));
+	       {
+		    V T2c, T2p, T2l, T2s, T2o, T24, T2b, T2f, T2j, T2k, T2r, T2m, T2n, T2q, T1n;
+		    V T1Q, T1E, T1K, T1B, T1R, T1F, T1N, T1m, T1J, TZ, T1I, TX, TY, T1q, T1M;
+		    V T1A, T1L, T1t, T1z, T1C, T1S, T1T, T1D, T1G, T1O, T1P, T1H;
+		    T24 = VSUB(T20, T23);
+		    T2b = VSUB(T27, T2a);
+		    T2c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T24, VMUL(LDK(KP587785252), T2b))));
+		    T2p = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T24, VMUL(LDK(KP951056516), T2b))));
+		    T2f = VMUL(LDK(KP279508497), VSUB(T2d, T2e));
+		    T2j = VADD(T2d, T2e);
+		    T2k = VFNMS(LDK(KP125000000), T2j, VMUL(LDK(KP500000000), T2i));
+		    T2l = VADD(T2f, T2k);
+		    T2s = VMUL(LDK(KP500000000), VADD(T2i, T2j));
+		    T2o = VSUB(T2k, T2f);
+		    ST(&(Rp[0]), T2s, ms, &(Rp[0]));
+		    T2r = VCONJ(VADD(T2p, T2o));
+		    ST(&(Rm[WS(rs, 7)]), T2r, -ms, &(Rm[WS(rs, 1)]));
+		    T2m = VADD(T2c, T2l);
+		    ST(&(Rp[WS(rs, 4)]), T2m, ms, &(Rp[0]));
+		    T2n = VCONJ(VSUB(T2l, T2c));
+		    ST(&(Rm[WS(rs, 3)]), T2n, -ms, &(Rm[WS(rs, 1)]));
+		    T2q = VSUB(T2o, T2p);
+		    ST(&(Rp[WS(rs, 8)]), T2q, ms, &(Rp[0]));
+		    T1m = VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1l));
+		    T1J = VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1l));
+		    TX = VFMS(LDK(KP250000000), TL, TW);
+		    TY = VMUL(LDK(KP559016994), VSUB(TK, Tn));
+		    TZ = VADD(TX, TY);
+		    T1I = VSUB(TY, TX);
+		    T1n = VMUL(LDK(KP500000000), VBYI(VSUB(TZ, T1m)));
+		    T1Q = VMUL(LDK(KP500000000), VBYI(VADD(T1I, T1J)));
+		    T1E = VMUL(LDK(KP500000000), VBYI(VADD(TZ, T1m)));
+		    T1K = VMUL(LDK(KP500000000), VBYI(VSUB(T1I, T1J)));
+		    T1q = VFMA(LDK(KP475528258), T1o, VMUL(LDK(KP293892626), T1p));
+		    T1M = VFNMS(LDK(KP293892626), T1o, VMUL(LDK(KP475528258), T1p));
+		    T1t = VMUL(LDK(KP279508497), VSUB(T1r, T1s));
+		    T1z = VFNMS(LDK(KP125000000), T1y, VMUL(LDK(KP500000000), T1x));
+		    T1A = VADD(T1t, T1z);
+		    T1L = VSUB(T1z, T1t);
+		    T1B = VADD(T1q, T1A);
+		    T1R = VADD(T1M, T1L);
+		    T1F = VSUB(T1A, T1q);
+		    T1N = VSUB(T1L, T1M);
+		    T1C = VADD(T1n, T1B);
+		    ST(&(Rp[WS(rs, 1)]), T1C, ms, &(Rp[WS(rs, 1)]));
+		    T1S = VADD(T1Q, T1R);
+		    ST(&(Rp[WS(rs, 7)]), T1S, ms, &(Rp[WS(rs, 1)]));
+		    T1T = VCONJ(VSUB(T1R, T1Q));
+		    ST(&(Rm[WS(rs, 6)]), T1T, -ms, &(Rm[0]));
+		    T1D = VCONJ(VSUB(T1B, T1n));
+		    ST(&(Rm[0]), T1D, -ms, &(Rm[0]));
+		    T1G = VADD(T1E, T1F);
+		    ST(&(Rp[WS(rs, 9)]), T1G, ms, &(Rp[WS(rs, 1)]));
+		    T1O = VADD(T1K, T1N);
+		    ST(&(Rp[WS(rs, 3)]), T1O, ms, &(Rp[WS(rs, 1)]));
+		    T1P = VCONJ(VSUB(T1N, T1K));
+		    ST(&(Rm[WS(rs, 2)]), T1P, -ms, &(Rm[0]));
+		    T1H = VCONJ(VSUB(T1F, T1E));
+		    ST(&(Rm[WS(rs, 8)]), T1H, -ms, &(Rm[0]));
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {131, 65, 12, 0} };
+
+void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
+
+/*
+ * This function contains 249 FP additions, 224 FP multiplications,
+ * (or, 119 additions, 94 multiplications, 130 fused multiply/add),
+ * 167 stack variables, 8 constants, and 64 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
+     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
+	       V T2m, T2b, T2c, T2d, T2v, T2r, T20, T2i, T2n, T2e, T2o, T2u, T2j, T2f, T2t;
+	       V T2s, T2x, T2w, T2l, T2k, T2h, T2g;
+	       {
+		    V T41, T3B, T40, T3a, T2J, T27, T2y, Ts, T2C, T1X, T2B, T1Q, T3F, T3w, T4l;
+		    V T49, T1b, T1s, T3c, TB, T1f, T3g, T44, T1l, T3k, T3o, T4b, T28, T14, T1d;
+		    V T3b, TK;
+		    {
+			 V T1V, T1E, T3A, Th, T3v, T47, T1J, T3q, T8, T38, T25, T39, T3z, Tq, T1O;
+			 V T3r, T3, T7, T3u, T24, T22, T3t, T1I, Tn, T1G, To, Tm, T1K, Tl, T1N;
+			 V Tp, T1L, TU, T3f, T3m, T13, T3e, T3n, T1i, TH, TI, T1k, TG, TF, T1c;
+			 V TJ;
+			 {
+			      V T1x, T1y, T1U, T1B, T1S, T1C, T1A, T23, T21, T1z, T1, T2, T1T, T5, T6;
+			      V T1R, T4, T1w, Ta, Tb, T1H, Te, Tf, Td, Tc, T1F, T9, T1D, Tj, Tk;
+			      V Ti, Tg, T1M;
+			      T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+			      T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+			      T1T = LDW(&(W[0]));
+			      T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+			      T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+			      T1R = LDW(&(W[TWVL * 32]));
+			      T4 = LDW(&(W[TWVL * 30]));
+			      T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
+			      T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
+			      T3 = VFMACONJ(T2, T1);
+			      T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
+			      T1w = LDW(&(W[TWVL * 48]));
+			      T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+			      T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
+			      T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+			      T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+			      T1A = LDW(&(W[TWVL * 16]));
+			      T23 = LDW(&(W[TWVL * 46]));
+			      T21 = LDW(&(W[TWVL * 14]));
+			      T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
+			      Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+			      T3u = VADD(T1U, T1S);
+			      T1V = VSUB(T1S, T1U);
+			      Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+			      T9 = LDW(&(W[TWVL * 6]));
+			      T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
+			      T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
+			      T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
+			      T1H = LDW(&(W[TWVL * 8]));
+			      Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
+			      Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
+			      Td = LDW(&(W[TWVL * 38]));
+			      Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
+			      T1E = VSUB(T1z, T1D);
+			      T3t = VADD(T1D, T1z);
+			      T1F = LDW(&(W[TWVL * 40]));
+			      Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
+			      T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
+			      Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
+			      Ti = LDW(&(W[TWVL * 54]));
+			      Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
+			      T1M = LDW(&(W[TWVL * 56]));
+			      Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+			      T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
+			      To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+			      Tm = LDW(&(W[TWVL * 22]));
+			      T1K = LDW(&(W[TWVL * 24]));
+			      Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
+			      T3A = VADD(Tc, Tg);
+			      Th = VSUB(Tc, Tg);
+			      T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
+			 }
+			 T3v = VSUB(T3t, T3u);
+			 T47 = VADD(T3u, T3t);
+			 T1J = VSUB(T1G, T1I);
+			 T3q = VADD(T1I, T1G);
+			 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
+			 T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
+			 T8 = VSUB(T3, T7);
+			 T38 = VADD(T3, T7);
+			 T25 = VSUB(T22, T24);
+			 T39 = VADD(T22, T24);
+			 T3z = VADD(Tl, Tp);
+			 Tq = VSUB(Tl, Tp);
+			 T1O = VSUB(T1L, T1N);
+			 T3r = VADD(T1N, T1L);
+			 {
+			      V T10, T11, TZ, T1o, TY, T1r, TN, TO, TM, T19, TR, TS, TQ, T17, T26;
+			      V Tr, T1W, T1P, T3s, T48, TW, TX, TP, T1a, TV, T1q, TT, T18, Ty, Tz;
+			      V Tx, Tw, T1j, Tu, T12, T1p, Tv, Tt, T1h, TD, TA, TE, TC, T1e;
+			      TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+			      TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+			      T41 = VADD(T3A, T3z);
+			      T3B = VSUB(T3z, T3A);
+			      T26 = VSUB(Tq, Th);
+			      Tr = VADD(Th, Tq);
+			      T1W = VADD(T1J, T1O);
+			      T1P = VSUB(T1J, T1O);
+			      T3s = VSUB(T3q, T3r);
+			      T48 = VADD(T3q, T3r);
+			      T40 = VADD(T38, T39);
+			      T3a = VSUB(T38, T39);
+			      T2J = VFNMS(LDK(KP707106781), T26, T25);
+			      T27 = VFMA(LDK(KP707106781), T26, T25);
+			      T2y = VFMA(LDK(KP707106781), Tr, T8);
+			      Ts = VFNMS(LDK(KP707106781), Tr, T8);
+			      T2C = VFMA(LDK(KP707106781), T1W, T1V);
+			      T1X = VFNMS(LDK(KP707106781), T1W, T1V);
+			      T2B = VFMA(LDK(KP707106781), T1P, T1E);
+			      T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
+			      T3F = VFMA(LDK(KP414213562), T3s, T3v);
+			      T3w = VFNMS(LDK(KP414213562), T3v, T3s);
+			      T4l = VSUB(T48, T47);
+			      T49 = VADD(T47, T48);
+			      TM = LDW(&(W[TWVL * 10]));
+			      T19 = LDW(&(W[TWVL * 12]));
+			      TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
+			      TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
+			      TQ = LDW(&(W[TWVL * 42]));
+			      T17 = LDW(&(W[TWVL * 44]));
+			      TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
+			      TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
+			      TP = VZMULJ(TM, VFMACONJ(TO, TN));
+			      T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
+			      TV = LDW(&(W[TWVL * 58]));
+			      T1q = LDW(&(W[TWVL * 60]));
+			      TT = VZMULJ(TQ, VFMACONJ(TS, TR));
+			      T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
+			      T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+			      T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+			      TZ = LDW(&(W[TWVL * 26]));
+			      T1o = LDW(&(W[TWVL * 28]));
+			      TY = VZMULJ(TV, VFMACONJ(TX, TW));
+			      T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
+			      TU = VSUB(TP, TT);
+			      T3f = VADD(TP, TT);
+			      T1b = VSUB(T18, T1a);
+			      T3m = VADD(T1a, T18);
+			      Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+			      T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
+			      T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
+			      Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+			      Tt = LDW(&(W[TWVL * 18]));
+			      T1h = LDW(&(W[TWVL * 20]));
+			      Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
+			      Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
+			      Tx = LDW(&(W[TWVL * 50]));
+			      T13 = VSUB(TY, T12);
+			      T3e = VADD(TY, T12);
+			      T1s = VSUB(T1p, T1r);
+			      T3n = VADD(T1r, T1p);
+			      Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
+			      T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
+			      T1j = LDW(&(W[TWVL * 52]));
+			      TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+			      TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
+			      TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+			      TC = LDW(&(W[TWVL * 2]));
+			      T1e = LDW(&(W[TWVL * 4]));
+			      TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+			      TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+			      T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
+			      TG = LDW(&(W[TWVL * 34]));
+			      T3c = VADD(Tw, TA);
+			      TB = VSUB(Tw, TA);
+			      TF = VZMULJ(TC, VFMACONJ(TE, TD));
+			      T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
+			      T1c = LDW(&(W[TWVL * 36]));
+			 }
+			 T3g = VSUB(T3e, T3f);
+			 T44 = VADD(T3e, T3f);
+			 T1l = VSUB(T1i, T1k);
+			 T3k = VADD(T1i, T1k);
+			 TJ = VZMULJ(TG, VFMACONJ(TI, TH));
+			 T3o = VSUB(T3m, T3n);
+			 T4b = VADD(T3n, T3m);
+			 T28 = VFMA(LDK(KP414213562), TU, T13);
+			 T14 = VFNMS(LDK(KP414213562), T13, TU);
+			 T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
+			 T3b = VADD(TF, TJ);
+			 TK = VSUB(TF, TJ);
+		    }
+		    {
+			 V T4k, T4p, T2z, T2a, T2K, T15, T2E, T1n, T2F, T1u, T4c, T3R, T3D, T3i, T3O;
+			 V T46, T4g, T3G, T3P, T3S, T3x, T4q, T4n, T42, T1g, T3j, T3E, T3p, T4m, T3d;
+			 V T43, T29, TL, T1m, T1t, T3l, T4a, T3C, T3h, T45, T3Q, T3W, T4d, T4h, T3H;
+			 V T3L, T3y, T3K, T4r, T4v, T4o, T4u, T4j, T4i, T4e, T4f, T3N, T3M, T3I, T3J;
+			 V T4x, T4w, T4s, T4t;
+			 T42 = VADD(T40, T41);
+			 T4k = VSUB(T40, T41);
+			 T1g = VSUB(T1d, T1f);
+			 T3j = VADD(T1f, T1d);
+			 T3d = VSUB(T3b, T3c);
+			 T43 = VADD(T3b, T3c);
+			 T29 = VFNMS(LDK(KP414213562), TB, TK);
+			 TL = VFMA(LDK(KP414213562), TK, TB);
+			 T1m = VSUB(T1g, T1l);
+			 T1t = VADD(T1g, T1l);
+			 T3l = VSUB(T3j, T3k);
+			 T4a = VADD(T3j, T3k);
+			 T3C = VSUB(T3g, T3d);
+			 T3h = VADD(T3d, T3g);
+			 T45 = VADD(T43, T44);
+			 T4p = VSUB(T44, T43);
+			 T2z = VADD(T29, T28);
+			 T2a = VSUB(T28, T29);
+			 T2K = VADD(TL, T14);
+			 T15 = VSUB(TL, T14);
+			 T2E = VFMA(LDK(KP707106781), T1m, T1b);
+			 T1n = VFNMS(LDK(KP707106781), T1m, T1b);
+			 T2F = VFMA(LDK(KP707106781), T1t, T1s);
+			 T1u = VFNMS(LDK(KP707106781), T1t, T1s);
+			 T3E = VFNMS(LDK(KP414213562), T3l, T3o);
+			 T3p = VFMA(LDK(KP414213562), T3o, T3l);
+			 T4m = VSUB(T4a, T4b);
+			 T4c = VADD(T4a, T4b);
+			 T3R = VFMA(LDK(KP707106781), T3C, T3B);
+			 T3D = VFNMS(LDK(KP707106781), T3C, T3B);
+			 T3i = VFNMS(LDK(KP707106781), T3h, T3a);
+			 T3O = VFMA(LDK(KP707106781), T3h, T3a);
+			 T46 = VSUB(T42, T45);
+			 T4g = VADD(T42, T45);
+			 T3G = VSUB(T3E, T3F);
+			 T3P = VADD(T3F, T3E);
+			 T3S = VADD(T3w, T3p);
+			 T3x = VSUB(T3p, T3w);
+			 T4q = VSUB(T4m, T4l);
+			 T4n = VADD(T4l, T4m);
+			 T4d = VSUB(T49, T4c);
+			 T4h = VADD(T49, T4c);
+			 T3H = VFNMS(LDK(KP923879532), T3G, T3D);
+			 T3L = VFMA(LDK(KP923879532), T3G, T3D);
+			 T3y = VFMA(LDK(KP923879532), T3x, T3i);
+			 T3K = VFNMS(LDK(KP923879532), T3x, T3i);
+			 T4r = VFMA(LDK(KP707106781), T4q, T4p);
+			 T4v = VFNMS(LDK(KP707106781), T4q, T4p);
+			 T4o = VFMA(LDK(KP707106781), T4n, T4k);
+			 T4u = VFNMS(LDK(KP707106781), T4n, T4k);
+			 T3Q = VFMA(LDK(KP923879532), T3P, T3O);
+			 T3W = VFNMS(LDK(KP923879532), T3P, T3O);
+			 T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
+			 T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
+			 T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
+			 T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
+			 T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
+			 T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
+			 T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
+			 T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
+			 T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
+			 T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
+			 T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
+			 T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
+			 ST(&(Rp[0]), T4i, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
+			 ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
+			 ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
+			 ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
+			 {
+			      V T2A, T2W, T2L, T2Z, T2D, T2N, T2M, T2G, T3T, T3X, T16, T2p, T1v, T35, T31;
+			      V T2I, T2S, T34, T2Y, T2P, T2T, T1Y, T2H, T30, T3Z, T3Y, T3U, T3V, T2O, T2X;
+			      V T32, T33, T36, T37, T2U, T2V, T2Q, T2R, T1Z, T2q;
+			      T2A = VFNMS(LDK(KP923879532), T2z, T2y);
+			      T2W = VFMA(LDK(KP923879532), T2z, T2y);
+			      T2L = VFNMS(LDK(KP923879532), T2K, T2J);
+			      T2Z = VFMA(LDK(KP923879532), T2K, T2J);
+			      T2D = VFMA(LDK(KP198912367), T2C, T2B);
+			      T2N = VFNMS(LDK(KP198912367), T2B, T2C);
+			      T2M = VFMA(LDK(KP198912367), T2E, T2F);
+			      T2G = VFNMS(LDK(KP198912367), T2F, T2E);
+			      T3T = VFMA(LDK(KP923879532), T3S, T3R);
+			      T3X = VFNMS(LDK(KP923879532), T3S, T3R);
+			      T16 = VFNMS(LDK(KP923879532), T15, Ts);
+			      T2m = VFMA(LDK(KP923879532), T15, Ts);
+			      T2H = VSUB(T2D, T2G);
+			      T30 = VADD(T2D, T2G);
+			      T2b = VFNMS(LDK(KP923879532), T2a, T27);
+			      T2p = VFMA(LDK(KP923879532), T2a, T27);
+			      T1v = VFMA(LDK(KP668178637), T1u, T1n);
+			      T2c = VFNMS(LDK(KP668178637), T1n, T1u);
+			      T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
+			      T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
+			      T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
+			      T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
+			      T2O = VSUB(T2M, T2N);
+			      T2X = VADD(T2N, T2M);
+			      T35 = VFNMS(LDK(KP980785280), T30, T2Z);
+			      T31 = VFMA(LDK(KP980785280), T30, T2Z);
+			      T2I = VFMA(LDK(KP980785280), T2H, T2A);
+			      T2S = VFNMS(LDK(KP980785280), T2H, T2A);
+			      ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
+			      ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
+			      ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
+			      T34 = VFNMS(LDK(KP980785280), T2X, T2W);
+			      T2Y = VFMA(LDK(KP980785280), T2X, T2W);
+			      T2P = VFMA(LDK(KP980785280), T2O, T2L);
+			      T2T = VFNMS(LDK(KP980785280), T2O, T2L);
+			      T2d = VFMA(LDK(KP668178637), T1Q, T1X);
+			      T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
+			      T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
+			      T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
+			      T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
+			      T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
+			      T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
+			      T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
+			      T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
+			      T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
+			      T1Z = VSUB(T1v, T1Y);
+			      T2q = VADD(T1Y, T1v);
+			      ST(&(Rm[0]), T33, -ms, &(Rm[0]));
+			      ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
+			      ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
+			      ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
+			      ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
+			      T2v = VFNMS(LDK(KP831469612), T2q, T2p);
+			      T2r = VFMA(LDK(KP831469612), T2q, T2p);
+			      T20 = VFMA(LDK(KP831469612), T1Z, T16);
+			      T2i = VFNMS(LDK(KP831469612), T1Z, T16);
+			 }
+		    }
+	       }
+	       T2n = VADD(T2d, T2c);
+	       T2e = VSUB(T2c, T2d);
+	       T2o = VFMA(LDK(KP831469612), T2n, T2m);
+	       T2u = VFNMS(LDK(KP831469612), T2n, T2m);
+	       T2j = VFMA(LDK(KP831469612), T2e, T2b);
+	       T2f = VFNMS(LDK(KP831469612), T2e, T2b);
+	       T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
+	       T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
+	       T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
+	       T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
+	       T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
+	       T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
+	       T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
+	       T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
+	       ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     VTW(1, 20),
+     VTW(1, 21),
+     VTW(1, 22),
+     VTW(1, 23),
+     VTW(1, 24),
+     VTW(1, 25),
+     VTW(1, 26),
+     VTW(1, 27),
+     VTW(1, 28),
+     VTW(1, 29),
+     VTW(1, 30),
+     VTW(1, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {119, 94, 130, 0} };
+
+void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
+
+/*
+ * This function contains 249 FP additions, 133 FP multiplications,
+ * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
+ * 130 stack variables, 9 constants, and 64 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
+     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
+     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
+     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
+     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
+     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
+	       V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
+	       V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
+	       V T2d, T2U;
+	       {
+		    V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
+		    V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
+		    V T1u, T1z;
+		    {
+			 V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
+			 V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
+			 V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
+			 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+			 T3 = VCONJ(T2);
+			 T4 = VADD(T1, T3);
+			 T1l = LDW(&(W[0]));
+			 T1m = VZMULIJ(T1l, VSUB(T3, T1));
+			 T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
+			 T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
+			 T1F = VCONJ(T1E);
+			 T1D = LDW(&(W[TWVL * 16]));
+			 T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
+			 T2i = LDW(&(W[TWVL * 14]));
+			 T2j = VZMULJ(T2i, VADD(T1G, T1F));
+			 T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
+			 T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
+			 T1K = VCONJ(T1J);
+			 T1I = LDW(&(W[TWVL * 48]));
+			 T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
+			 T2k = LDW(&(W[TWVL * 46]));
+			 T2l = VZMULJ(T2k, VADD(T1L, T1K));
+			 T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
+			 T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
+			 T8 = VCONJ(T7);
+			 T5 = LDW(&(W[TWVL * 30]));
+			 T9 = VZMULJ(T5, VADD(T6, T8));
+			 T1n = LDW(&(W[TWVL * 32]));
+			 T1o = VZMULIJ(T1n, VSUB(T8, T6));
+			 Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+			 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+			 Te = VCONJ(Td);
+			 Tb = LDW(&(W[TWVL * 6]));
+			 Tf = VZMULJ(Tb, VADD(Tc, Te));
+			 T1q = LDW(&(W[TWVL * 8]));
+			 T1r = VZMULIJ(T1q, VSUB(Te, Tc));
+			 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
+			 To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
+			 Tp = VCONJ(To);
+			 Tm = LDW(&(W[TWVL * 54]));
+			 Tq = VZMULJ(Tm, VADD(Tn, Tp));
+			 T1v = LDW(&(W[TWVL * 56]));
+			 T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
+			 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
+			 Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
+			 Tu = VCONJ(Tt);
+			 Tr = LDW(&(W[TWVL * 22]));
+			 Tv = VZMULJ(Tr, VADD(Ts, Tu));
+			 T1x = LDW(&(W[TWVL * 24]));
+			 T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
+			 Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
+			 Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
+			 Tj = VCONJ(Ti);
+			 Tg = LDW(&(W[TWVL * 38]));
+			 Tk = VZMULJ(Tg, VADD(Th, Tj));
+			 T1s = LDW(&(W[TWVL * 40]));
+			 T1t = VZMULIJ(T1s, VSUB(Tj, Th));
+		    }
+		    Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
+		    T2m = VSUB(T2j, T2l);
+		    Tl = VSUB(Tf, Tk);
+		    Tw = VSUB(Tq, Tv);
+		    Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
+		    T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
+		    T3P = VADD(Tq, Tv);
+		    T3Q = VADD(Tf, Tk);
+		    T3R = VSUB(T3P, T3Q);
+		    T4h = VADD(T3Q, T3P);
+		    T3o = VADD(T4, T9);
+		    T3p = VADD(T2j, T2l);
+		    T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
+		    T4g = VADD(T3o, T3p);
+		    T3z = VADD(T1m, T1o);
+		    T3A = VADD(T1H, T1M);
+		    T3B = VSUB(T3z, T3A);
+		    T4n = VADD(T3z, T3A);
+		    T3C = VADD(T1w, T1y);
+		    T3D = VADD(T1r, T1t);
+		    T3E = VSUB(T3C, T3D);
+		    T4o = VADD(T3D, T3C);
+		    T1p = VSUB(T1m, T1o);
+		    T1N = VSUB(T1H, T1M);
+		    T1u = VSUB(T1r, T1t);
+		    T1z = VSUB(T1w, T1y);
+		    T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
+		    T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
+		    T1B = VADD(T1p, T1A);
+		    T2S = VADD(T1N, T1C);
+		    T1O = VSUB(T1C, T1N);
+		    T2R = VSUB(T1p, T1A);
+	       }
+	       {
+		    V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
+		    V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
+		    V T20, T27, T1U, T1Z;
+		    {
+			 V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
+			 V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
+			 V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
+			 TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+			 TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+			 TC = VCONJ(TB);
+			 Tz = LDW(&(W[TWVL * 2]));
+			 TD = VZMULJ(Tz, VADD(TA, TC));
+			 T1Q = LDW(&(W[TWVL * 4]));
+			 T1R = VZMULIJ(T1Q, VSUB(TC, TA));
+			 T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+			 T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+			 T1a = VCONJ(T19);
+			 T17 = LDW(&(W[TWVL * 10]));
+			 T1b = VZMULJ(T17, VADD(T18, T1a));
+			 T28 = LDW(&(W[TWVL * 12]));
+			 T29 = VZMULIJ(T28, VSUB(T1a, T18));
+			 T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
+			 T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
+			 T1f = VCONJ(T1e);
+			 T1c = LDW(&(W[TWVL * 42]));
+			 T1g = VZMULJ(T1c, VADD(T1d, T1f));
+			 T2a = LDW(&(W[TWVL * 44]));
+			 T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
+			 TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
+			 TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
+			 TH = VCONJ(TG);
+			 TE = LDW(&(W[TWVL * 34]));
+			 TI = VZMULJ(TE, VADD(TF, TH));
+			 T1S = LDW(&(W[TWVL * 36]));
+			 T1T = VZMULIJ(T1S, VSUB(TH, TF));
+			 TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
+			 TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
+			 TN = VCONJ(TM);
+			 TK = LDW(&(W[TWVL * 18]));
+			 TO = VZMULJ(TK, VADD(TL, TN));
+			 T1X = LDW(&(W[TWVL * 20]));
+			 T1Y = VZMULIJ(T1X, VSUB(TN, TL));
+			 TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
+			 TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
+			 TZ = VCONJ(TY);
+			 TW = LDW(&(W[TWVL * 58]));
+			 T10 = VZMULJ(TW, VADD(TX, TZ));
+			 T21 = LDW(&(W[TWVL * 60]));
+			 T22 = VZMULIJ(T21, VSUB(TZ, TX));
+			 T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
+			 T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
+			 T14 = VCONJ(T13);
+			 T11 = LDW(&(W[TWVL * 26]));
+			 T15 = VZMULJ(T11, VADD(T12, T14));
+			 T23 = LDW(&(W[TWVL * 28]));
+			 T24 = VZMULIJ(T23, VSUB(T14, T12));
+			 TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
+			 TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
+			 TS = VCONJ(TR);
+			 TP = LDW(&(W[TWVL * 50]));
+			 TT = VZMULJ(TP, VADD(TQ, TS));
+			 T1V = LDW(&(W[TWVL * 52]));
+			 T1W = VZMULIJ(T1V, VSUB(TS, TQ));
+		    }
+		    TJ = VSUB(TD, TI);
+		    TU = VSUB(TO, TT);
+		    TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
+		    T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
+		    T16 = VSUB(T10, T15);
+		    T1h = VSUB(T1b, T1g);
+		    T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
+		    T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
+		    T3J = VADD(T1Y, T1W);
+		    T3K = VADD(T1R, T1T);
+		    T3L = VSUB(T3J, T3K);
+		    T4q = VADD(T3K, T3J);
+		    T3G = VADD(T22, T24);
+		    T3H = VADD(T29, T2b);
+		    T3I = VSUB(T3G, T3H);
+		    T4r = VADD(T3G, T3H);
+		    T3u = VADD(T10, T15);
+		    T3v = VADD(T1b, T1g);
+		    T3w = VSUB(T3u, T3v);
+		    T4k = VADD(T3u, T3v);
+		    T3r = VADD(TD, TI);
+		    T3s = VADD(TO, TT);
+		    T3t = VSUB(T3r, T3s);
+		    T4j = VADD(T3r, T3s);
+		    T25 = VSUB(T22, T24);
+		    T2c = VSUB(T29, T2b);
+		    T1U = VSUB(T1R, T1T);
+		    T1Z = VSUB(T1W, T1Y);
+		    T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
+		    T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
+		    T26 = VADD(T20, T25);
+		    T2V = VADD(T27, T2c);
+		    T2d = VSUB(T27, T2c);
+		    T2U = VSUB(T25, T20);
+	       }
+	       {
+		    V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
+		    V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
+		    V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
+		    V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
+		    T4i = VADD(T4g, T4h);
+		    T4l = VADD(T4j, T4k);
+		    T4m = VADD(T4i, T4l);
+		    T4w = VSUB(T4i, T4l);
+		    T4p = VADD(T4n, T4o);
+		    T4s = VADD(T4q, T4r);
+		    T4t = VADD(T4p, T4s);
+		    T4x = VBYI(VSUB(T4s, T4p));
+		    T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
+		    ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
+		    T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
+		    ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
+		    T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
+		    ST(&(Rp[0]), T4v, ms, &(Rp[0]));
+		    T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
+		    ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
+		    T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
+		    T4F = VSUB(T4k, T4j);
+		    T4B = VSUB(T4n, T4o);
+		    T4C = VSUB(T4r, T4q);
+		    T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
+		    T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
+		    T4E = VADD(T4A, T4D);
+		    T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
+		    T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
+		    T4K = VSUB(T4A, T4D);
+		    T4I = VCONJ(VSUB(T4E, T4H));
+		    ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
+		    T4N = VADD(T4K, T4L);
+		    ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
+		    T4J = VADD(T4E, T4H);
+		    ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
+		    T4M = VCONJ(VSUB(T4K, T4L));
+		    ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
+		    T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
+		    T3y = VADD(T3q, T3x);
+		    T47 = VSUB(T3q, T3x);
+		    T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
+		    T3T = VADD(T3R, T3S);
+		    T45 = VSUB(T3S, T3R);
+		    T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
+		    T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
+		    T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
+		    T44 = VSUB(T3M, T3F);
+		    T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
+		    T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
+		    T3W = VADD(T3U, T3V);
+		    T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
+		    T3O = VADD(T3y, T3N);
+		    T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
+		    T4d = VADD(T47, T48);
+		    T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
+		    T40 = VSUB(T3y, T3N);
+		    T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
+		    T49 = VSUB(T47, T48);
+		    T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
+		    T3Y = VCONJ(VSUB(T3O, T3X));
+		    ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
+		    T4e = VADD(T4c, T4d);
+		    ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
+		    T4f = VCONJ(VSUB(T4d, T4c));
+		    ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
+		    T3Z = VADD(T3O, T3X);
+		    ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
+		    T42 = VCONJ(VSUB(T40, T41));
+		    ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
+		    T4a = VADD(T46, T49);
+		    ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
+		    T4b = VCONJ(VSUB(T49, T46));
+		    ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
+		    T43 = VADD(T40, T41);
+		    ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
+		    {
+			 V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
+			 V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
+			 V T2J, T2B;
+			 Ty = VADD(Ta, Tx);
+			 T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
+			 T1k = VADD(Ty, T1j);
+			 T2F = VSUB(Ty, T1j);
+			 T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
+			 T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
+			 T2u = VADD(T2s, T2t);
+			 T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
+			 T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
+			 T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
+			 T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
+			 T2C = VSUB(T2e, T1P);
+			 T2n = VSUB(T2h, T2m);
+			 T2q = VSUB(T2o, T2p);
+			 T2r = VADD(T2n, T2q);
+			 T2D = VSUB(T2q, T2n);
+			 T2g = VADD(T1k, T2f);
+			 T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
+			 T2L = VADD(T2F, T2G);
+			 T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
+			 T2y = VSUB(T1k, T2f);
+			 T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
+			 T2H = VSUB(T2F, T2G);
+			 T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
+			 T2w = VCONJ(VSUB(T2g, T2v));
+			 ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
+			 T2M = VADD(T2K, T2L);
+			 ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
+			 T2N = VCONJ(VSUB(T2L, T2K));
+			 ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
+			 T2x = VADD(T2g, T2v);
+			 ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
+			 T2A = VCONJ(VSUB(T2y, T2z));
+			 ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
+			 T2I = VADD(T2E, T2H);
+			 ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
+			 T2J = VCONJ(VSUB(T2H, T2E));
+			 ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
+			 T2B = VADD(T2y, T2z);
+			 ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
+		    }
+		    {
+			 V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
+			 V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
+			 V T3j, T3b;
+			 T2O = VSUB(Ta, Tx);
+			 T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
+			 T2Q = VADD(T2O, T2P);
+			 T3f = VSUB(T2O, T2P);
+			 T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
+			 T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
+			 T34 = VADD(T32, T33);
+			 T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
+			 T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
+			 T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
+			 T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
+			 T3c = VSUB(T2W, T2T);
+			 T2Z = VADD(T2m, T2h);
+			 T30 = VSUB(T1i, TV);
+			 T31 = VADD(T2Z, T30);
+			 T3d = VSUB(T30, T2Z);
+			 T2Y = VADD(T2Q, T2X);
+			 T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
+			 T3l = VADD(T3f, T3g);
+			 T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
+			 T38 = VSUB(T2Q, T2X);
+			 T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
+			 T3h = VSUB(T3f, T3g);
+			 T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
+			 T36 = VCONJ(VSUB(T2Y, T35));
+			 ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
+			 T3m = VADD(T3k, T3l);
+			 ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
+			 T3n = VCONJ(VSUB(T3l, T3k));
+			 ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
+			 T37 = VADD(T2Y, T35);
+			 ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
+			 T3a = VCONJ(VSUB(T38, T39));
+			 ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
+			 T3i = VADD(T3e, T3h);
+			 ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
+			 T3j = VCONJ(VSUB(T3h, T3e));
+			 ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
+			 T3b = VADD(T38, T39);
+			 ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
+		    }
+	       }
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     VTW(1, 8),
+     VTW(1, 9),
+     VTW(1, 10),
+     VTW(1, 11),
+     VTW(1, 12),
+     VTW(1, 13),
+     VTW(1, 14),
+     VTW(1, 15),
+     VTW(1, 16),
+     VTW(1, 17),
+     VTW(1, 18),
+     VTW(1, 19),
+     VTW(1, 20),
+     VTW(1, 21),
+     VTW(1, 22),
+     VTW(1, 23),
+     VTW(1, 24),
+     VTW(1, 25),
+     VTW(1, 26),
+     VTW(1, 27),
+     VTW(1, 28),
+     VTW(1, 29),
+     VTW(1, 30),
+     VTW(1, 31),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {233, 117, 16, 0} };
+
+void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dit -name hc2cfdftv_4 -include hc2cfv.h */
+
+/*
+ * This function contains 15 FP additions, 16 FP multiplications,
+ * (or, 9 additions, 10 multiplications, 6 fused multiply/add),
+ * 21 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T1, T2, Tb, T5, T6, T4, T9, T3, Tc, T7, Ta, Tg, T8, Td, Th;
+	       V Tf, Te, Ti, Tj;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tb = LDW(&(W[0]));
+	       T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T4 = LDW(&(W[TWVL * 2]));
+	       T9 = LDW(&(W[TWVL * 4]));
+	       T3 = VFMACONJ(T2, T1);
+	       Tc = VZMULIJ(Tb, VFNMSCONJ(T2, T1));
+	       T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+	       Ta = VZMULIJ(T9, VFNMSCONJ(T6, T5));
+	       Tg = VADD(T3, T7);
+	       T8 = VSUB(T3, T7);
+	       Td = VSUB(Ta, Tc);
+	       Th = VADD(Tc, Ta);
+	       Tf = VCONJ(VMUL(LDK(KP500000000), VFMAI(Td, T8)));
+	       Te = VMUL(LDK(KP500000000), VFNMSI(Td, T8));
+	       Ti = VMUL(LDK(KP500000000), VSUB(Tg, Th));
+	       Tj = VCONJ(VMUL(LDK(KP500000000), VADD(Th, Tg)));
+	       ST(&(Rm[0]), Tf, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 1)]), Te, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rp[0]), Ti, ms, &(Rp[0]));
+	       ST(&(Rm[WS(rs, 1)]), Tj, -ms, &(Rm[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cfdftv_4"), twinstr, &GENUS, {9, 10, 6, 0} };
+
+void XSIMD(codelet_hc2cfdftv_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_4, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dit -name hc2cfdftv_4 -include hc2cfv.h */
+
+/*
+ * This function contains 15 FP additions, 10 FP multiplications,
+ * (or, 15 additions, 10 multiplications, 0 fused multiply/add),
+ * 23 stack variables, 1 constants, and 8 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
+	       V T4, Tc, T9, Te, T1, T3, T2, Tb, T6, T8, T7, T5, Td, Tg, Th;
+	       V Ta, Tf, Tk, Tl, Ti, Tj;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T3 = VCONJ(T2);
+	       T4 = VADD(T1, T3);
+	       Tb = LDW(&(W[0]));
+	       Tc = VZMULIJ(Tb, VSUB(T3, T1));
+	       T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T7 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T8 = VCONJ(T7);
+	       T5 = LDW(&(W[TWVL * 2]));
+	       T9 = VZMULJ(T5, VADD(T6, T8));
+	       Td = LDW(&(W[TWVL * 4]));
+	       Te = VZMULIJ(Td, VSUB(T8, T6));
+	       Ta = VSUB(T4, T9);
+	       Tf = VBYI(VSUB(Tc, Te));
+	       Tg = VMUL(LDK(KP500000000), VSUB(Ta, Tf));
+	       Th = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, Tf)));
+	       ST(&(Rp[WS(rs, 1)]), Tg, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[0]), Th, -ms, &(Rm[0]));
+	       Ti = VADD(T4, T9);
+	       Tj = VADD(Tc, Te);
+	       Tk = VCONJ(VMUL(LDK(KP500000000), VSUB(Ti, Tj)));
+	       Tl = VMUL(LDK(KP500000000), VADD(Ti, Tj));
+	       ST(&(Rm[WS(rs, 1)]), Tk, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[0]), Tl, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cfdftv_4"), twinstr, &GENUS, {15, 10, 0, 0} };
+
+void XSIMD(codelet_hc2cfdftv_4) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_4, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include hc2cfv.h */
+
+/*
+ * This function contains 29 FP additions, 30 FP multiplications,
+ * (or, 17 additions, 18 multiplications, 12 fused multiply/add),
+ * 38 stack variables, 2 constants, and 12 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
+	       V T5, T6, T3, Tj, T4, T9, Te, Th, T1, T2, Ti, Tc, Td, Tb, Tg;
+	       V T7, Ta, Tt, Tk, Tr, T8, Ts, Tf, Tx, Tu, To, Tl, Tw, Tv, Tn;
+	       V Tm, Tz, Ty, Tp, Tq;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Ti = LDW(&(W[0]));
+	       Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       Tb = LDW(&(W[TWVL * 8]));
+	       Tg = LDW(&(W[TWVL * 6]));
+	       T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T3 = VFMACONJ(T2, T1);
+	       Tj = VZMULIJ(Ti, VFNMSCONJ(T2, T1));
+	       T4 = LDW(&(W[TWVL * 4]));
+	       T9 = LDW(&(W[TWVL * 2]));
+	       Te = VZMULIJ(Tb, VFNMSCONJ(Td, Tc));
+	       Th = VZMULJ(Tg, VFMACONJ(Td, Tc));
+	       T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
+	       Ta = VZMULJ(T9, VFMACONJ(T6, T5));
+	       Tt = VADD(Tj, Th);
+	       Tk = VSUB(Th, Tj);
+	       Tr = VADD(T3, T7);
+	       T8 = VSUB(T3, T7);
+	       Ts = VADD(Ta, Te);
+	       Tf = VSUB(Ta, Te);
+	       Tx = VMUL(LDK(KP866025403), VSUB(Tt, Ts));
+	       Tu = VADD(Ts, Tt);
+	       To = VMUL(LDK(KP866025403), VSUB(Tk, Tf));
+	       Tl = VADD(Tf, Tk);
+	       Tw = VFNMS(LDK(KP500000000), Tu, Tr);
+	       Tv = VCONJ(VMUL(LDK(KP500000000), VADD(Tr, Tu)));
+	       Tn = VFNMS(LDK(KP500000000), Tl, T8);
+	       Tm = VMUL(LDK(KP500000000), VADD(T8, Tl));
+	       Tz = VMUL(LDK(KP500000000), VFMAI(Tx, Tw));
+	       Ty = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tx, Tw)));
+	       ST(&(Rm[WS(rs, 2)]), Tv, -ms, &(Rm[0]));
+	       Tp = VMUL(LDK(KP500000000), VFNMSI(To, Tn));
+	       Tq = VCONJ(VMUL(LDK(KP500000000), VFMAI(To, Tn)));
+	       ST(&(Rp[0]), Tm, ms, &(Rp[0]));
+	       ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[0]), Ty, -ms, &(Rm[0]));
+	       ST(&(Rm[WS(rs, 1)]), Tq, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 2)]), Tp, ms, &(Rp[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cfdftv_6"), twinstr, &GENUS, {17, 18, 12, 0} };
+
+void XSIMD(codelet_hc2cfdftv_6) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include hc2cfv.h */
+
+/*
+ * This function contains 29 FP additions, 20 FP multiplications,
+ * (or, 27 additions, 18 multiplications, 2 fused multiply/add),
+ * 42 stack variables, 3 constants, and 12 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
+     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
+	       V Ta, Tu, Tn, Tw, Ti, Tv, T1, T8, Tg, Tf, T7, T3, Te, T6, T2;
+	       V T4, T9, T5, Tk, Tm, Tj, Tl, Tc, Th, Tb, Td, Tr, Tp, Tq, To;
+	       V Tt, Ts, TA, Ty, Tz, Tx, TC, TB;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T8 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       Te = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       Tf = VCONJ(Te);
+	       T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       T7 = VCONJ(T6);
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T3 = VCONJ(T2);
+	       T4 = VADD(T1, T3);
+	       T5 = LDW(&(W[TWVL * 4]));
+	       T9 = VZMULIJ(T5, VSUB(T7, T8));
+	       Ta = VADD(T4, T9);
+	       Tu = VSUB(T4, T9);
+	       Tj = LDW(&(W[0]));
+	       Tk = VZMULIJ(Tj, VSUB(T3, T1));
+	       Tl = LDW(&(W[TWVL * 6]));
+	       Tm = VZMULJ(Tl, VADD(Tf, Tg));
+	       Tn = VADD(Tk, Tm);
+	       Tw = VSUB(Tm, Tk);
+	       Tb = LDW(&(W[TWVL * 2]));
+	       Tc = VZMULJ(Tb, VADD(T7, T8));
+	       Td = LDW(&(W[TWVL * 8]));
+	       Th = VZMULIJ(Td, VSUB(Tf, Tg));
+	       Ti = VADD(Tc, Th);
+	       Tv = VSUB(Tc, Th);
+	       Tr = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Ti))));
+	       To = VADD(Ti, Tn);
+	       Tp = VMUL(LDK(KP500000000), VADD(Ta, To));
+	       Tq = VFNMS(LDK(KP250000000), To, VMUL(LDK(KP500000000), Ta));
+	       ST(&(Rp[0]), Tp, ms, &(Rp[0]));
+	       Tt = VCONJ(VADD(Tq, Tr));
+	       ST(&(Rm[WS(rs, 1)]), Tt, -ms, &(Rm[WS(rs, 1)]));
+	       Ts = VSUB(Tq, Tr);
+	       ST(&(Rp[WS(rs, 2)]), Ts, ms, &(Rp[0]));
+	       TA = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tw, Tv))));
+	       Tx = VADD(Tv, Tw);
+	       Ty = VCONJ(VMUL(LDK(KP500000000), VADD(Tu, Tx)));
+	       Tz = VFNMS(LDK(KP250000000), Tx, VMUL(LDK(KP500000000), Tu));
+	       ST(&(Rm[WS(rs, 2)]), Ty, -ms, &(Rm[0]));
+	       TC = VADD(Tz, TA);
+	       ST(&(Rp[WS(rs, 1)]), TC, ms, &(Rp[WS(rs, 1)]));
+	       TB = VCONJ(VSUB(Tz, TA));
+	       ST(&(Rm[0]), TB, -ms, &(Rm[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cfdftv_6"), twinstr, &GENUS, {27, 18, 2, 0} };
+
+void XSIMD(codelet_hc2cfdftv_6) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on Sun Nov 25 07:42:29 EST 2012 */
+
+#include "codelet-rdft.h"
+
+#ifdef HAVE_FMA
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include hc2cfv.h */
+
+/*
+ * This function contains 41 FP additions, 40 FP multiplications,
+ * (or, 23 additions, 22 multiplications, 18 fused multiply/add),
+ * 52 stack variables, 2 constants, and 16 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V T3, Tc, Tl, Ts, Tf, Tg, Te, Tp, T7, Ta, T1, T2, Tb, Tj, Tk;
+	       V Ti, Tr, T5, T6, T4, T9, Th, Tq, TC, T8, Td, TF, Tm, TG, TD;
+	       V Tt, Tu, Tn, TH, TL, TE, TK, Tz, Tv, Ty, To, TJ, TI, TN, TM;
+	       V TB, TA, Tx, Tw;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       Tb = LDW(&(W[0]));
+	       Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Tk = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Ti = LDW(&(W[TWVL * 12]));
+	       Tr = LDW(&(W[TWVL * 10]));
+	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T3 = VFMACONJ(T2, T1);
+	       Tc = VZMULIJ(Tb, VFNMSCONJ(T2, T1));
+	       T4 = LDW(&(W[TWVL * 6]));
+	       T9 = LDW(&(W[TWVL * 8]));
+	       Tl = VZMULIJ(Ti, VFNMSCONJ(Tk, Tj));
+	       Ts = VZMULJ(Tr, VFMACONJ(Tk, Tj));
+	       Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Te = LDW(&(W[TWVL * 4]));
+	       Tp = LDW(&(W[TWVL * 2]));
+	       T7 = VZMULJ(T4, VFMACONJ(T6, T5));
+	       Ta = VZMULIJ(T9, VFNMSCONJ(T6, T5));
+	       Th = VZMULIJ(Te, VFNMSCONJ(Tg, Tf));
+	       Tq = VZMULJ(Tp, VFMACONJ(Tg, Tf));
+	       TC = VADD(T3, T7);
+	       T8 = VSUB(T3, T7);
+	       Td = VSUB(Ta, Tc);
+	       TF = VADD(Tc, Ta);
+	       Tm = VSUB(Th, Tl);
+	       TG = VADD(Th, Tl);
+	       TD = VADD(Tq, Ts);
+	       Tt = VSUB(Tq, Ts);
+	       Tu = VSUB(Tm, Td);
+	       Tn = VADD(Td, Tm);
+	       TH = VSUB(TF, TG);
+	       TL = VADD(TF, TG);
+	       TE = VSUB(TC, TD);
+	       TK = VADD(TC, TD);
+	       Tz = VFMA(LDK(KP707106781), Tu, Tt);
+	       Tv = VFNMS(LDK(KP707106781), Tu, Tt);
+	       Ty = VFNMS(LDK(KP707106781), Tn, T8);
+	       To = VFMA(LDK(KP707106781), Tn, T8);
+	       TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TH, TE)));
+	       TI = VMUL(LDK(KP500000000), VFMAI(TH, TE));
+	       TN = VCONJ(VMUL(LDK(KP500000000), VADD(TL, TK)));
+	       TM = VMUL(LDK(KP500000000), VSUB(TK, TL));
+	       TB = VMUL(LDK(KP500000000), VFMAI(Tz, Ty));
+	       TA = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tz, Ty)));
+	       Tx = VCONJ(VMUL(LDK(KP500000000), VFMAI(Tv, To)));
+	       Tw = VMUL(LDK(KP500000000), VFNMSI(Tv, To));
+	       ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 2)]), TI, ms, &(Rp[0]));
+	       ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[0]), TM, ms, &(Rp[0]));
+	       ST(&(Rp[WS(rs, 3)]), TB, ms, &(Rp[WS(rs, 1)]));
+	       ST(&(Rm[WS(rs, 2)]), TA, -ms, &(Rm[0]));
+	       ST(&(Rm[0]), Tx, -ms, &(Rm[0]));
+	       ST(&(Rp[WS(rs, 1)]), Tw, ms, &(Rp[WS(rs, 1)]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cfdftv_8"), twinstr, &GENUS, {23, 22, 18, 0} };
+
+void XSIMD(codelet_hc2cfdftv_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
+}
+#else				/* HAVE_FMA */
+
+/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include hc2cfv.h */
+
+/*
+ * This function contains 41 FP additions, 23 FP multiplications,
+ * (or, 41 additions, 23 multiplications, 0 fused multiply/add),
+ * 57 stack variables, 3 constants, and 16 memory accesses
+ */
+#include "hc2cfv.h"
+
+static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
+{
+     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
+     DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
+     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
+     {
+	  INT m;
+	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
+	       V Ta, TE, Tr, TF, Tl, TK, Tw, TG, T1, T6, T3, T8, T2, T7, T4;
+	       V T9, T5, To, Tq, Tn, Tp, Tc, Th, Te, Tj, Td, Ti, Tf, Tk, Tb;
+	       V Tg, Tt, Tv, Ts, Tu, Ty, Tz, Tm, Tx, TC, TD, TA, TB, TI, TO;
+	       V TL, TP, TH, TJ, TM, TR, TN, TQ;
+	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
+	       T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
+	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
+	       T3 = VCONJ(T2);
+	       T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
+	       T8 = VCONJ(T7);
+	       T4 = VADD(T1, T3);
+	       T5 = LDW(&(W[TWVL * 6]));
+	       T9 = VZMULJ(T5, VADD(T6, T8));
+	       Ta = VADD(T4, T9);
+	       TE = VMUL(LDK(KP500000000), VSUB(T4, T9));
+	       Tn = LDW(&(W[0]));
+	       To = VZMULIJ(Tn, VSUB(T3, T1));
+	       Tp = LDW(&(W[TWVL * 8]));
+	       Tq = VZMULIJ(Tp, VSUB(T8, T6));
+	       Tr = VADD(To, Tq);
+	       TF = VSUB(To, Tq);
+	       Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
+	       Th = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
+	       Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
+	       Te = VCONJ(Td);
+	       Ti = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
+	       Tj = VCONJ(Ti);
+	       Tb = LDW(&(W[TWVL * 2]));
+	       Tf = VZMULJ(Tb, VADD(Tc, Te));
+	       Tg = LDW(&(W[TWVL * 10]));
+	       Tk = VZMULJ(Tg, VADD(Th, Tj));
+	       Tl = VADD(Tf, Tk);
+	       TK = VSUB(Tf, Tk);
+	       Ts = LDW(&(W[TWVL * 4]));
+	       Tt = VZMULIJ(Ts, VSUB(Te, Tc));
+	       Tu = LDW(&(W[TWVL * 12]));
+	       Tv = VZMULIJ(Tu, VSUB(Tj, Th));
+	       Tw = VADD(Tt, Tv);
+	       TG = VSUB(Tv, Tt);
+	       Tm = VADD(Ta, Tl);
+	       Tx = VADD(Tr, Tw);
+	       Ty = VCONJ(VMUL(LDK(KP500000000), VSUB(Tm, Tx)));
+	       Tz = VMUL(LDK(KP500000000), VADD(Tm, Tx));
+	       ST(&(Rm[WS(rs, 3)]), Ty, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[0]), Tz, ms, &(Rp[0]));
+	       TA = VSUB(Ta, Tl);
+	       TB = VBYI(VSUB(Tw, Tr));
+	       TC = VCONJ(VMUL(LDK(KP500000000), VSUB(TA, TB)));
+	       TD = VMUL(LDK(KP500000000), VADD(TA, TB));
+	       ST(&(Rm[WS(rs, 1)]), TC, -ms, &(Rm[WS(rs, 1)]));
+	       ST(&(Rp[WS(rs, 2)]), TD, ms, &(Rp[0]));
+	       TH = VMUL(LDK(KP353553390), VADD(TF, TG));
+	       TI = VADD(TE, TH);
+	       TO = VSUB(TE, TH);
+	       TJ = VMUL(LDK(KP707106781), VSUB(TG, TF));
+	       TL = VMUL(LDK(KP500000000), VBYI(VSUB(TJ, TK)));
+	       TP = VMUL(LDK(KP500000000), VBYI(VADD(TK, TJ)));
+	       TM = VCONJ(VSUB(TI, TL));
+	       ST(&(Rm[0]), TM, -ms, &(Rm[0]));
+	       TR = VADD(TO, TP);
+	       ST(&(Rp[WS(rs, 3)]), TR, ms, &(Rp[WS(rs, 1)]));
+	       TN = VADD(TI, TL);
+	       ST(&(Rp[WS(rs, 1)]), TN, ms, &(Rp[WS(rs, 1)]));
+	       TQ = VCONJ(VSUB(TO, TP));
+	       ST(&(Rm[WS(rs, 2)]), TQ, -ms, &(Rm[0]));
+	  }
+     }
+     VLEAVE();
+}
+
+static const tw_instr twinstr[] = {
+     VTW(1, 1),
+     VTW(1, 2),
+     VTW(1, 3),
+     VTW(1, 4),
+     VTW(1, 5),
+     VTW(1, 6),
+     VTW(1, 7),
+     {TW_NEXT, VL, 0}
+};
+
+static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cfdftv_8"), twinstr, &GENUS, {41, 23, 0, 0} };
+
+void XSIMD(codelet_hc2cfdftv_8) (planner *p) {
+     X(khc2c_register) (p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
+}
+#endif				/* HAVE_FMA */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/hc2cbv.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/hc2cbv.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW3
+#define TWVL TWVL3
+#define LDW(x) LDA(x, 0, 0)
+
+#define GENUS XSIMD(rdft_hc2cbv_genus)
+extern const hc2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/hc2cfv.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/hc2cfv.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include SIMD_HEADER
+
+#define VTW VTW3
+#define TWVL TWVL3
+#define LDW(x) LDA(x, 0, 0)
+
+#define GENUS XSIMD(rdft_hc2cfv_genus)
+extern const hc2c_genus GENUS;
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(NEON_CFLAGS)
+SIMD_HEADER=simd-neon.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_NEON
+
+noinst_LTLIBRARIES = librdft_neon_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_neon_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,613 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/rdft/simd/codlist.mk \
+	$(top_srcdir)/rdft/simd/simd.mk
+subdir = rdft/simd/neon
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_neon_codelets_la_LIBADD =
+am__librdft_neon_codelets_la_SOURCES_DIST = hc2cfdftv_2.c \
+	hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c hc2cfdftv_10.c \
+	hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c hc2cfdftv_20.c \
+	hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c \
+	hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c \
+	hc2cbdftv_20.c genus.c codlist.c
+am__objects_1 = hc2cfdftv_2.lo hc2cfdftv_4.lo hc2cfdftv_6.lo \
+	hc2cfdftv_8.lo hc2cfdftv_10.lo hc2cfdftv_12.lo hc2cfdftv_16.lo \
+	hc2cfdftv_32.lo hc2cfdftv_20.lo
+am__objects_2 = hc2cbdftv_2.lo hc2cbdftv_4.lo hc2cbdftv_6.lo \
+	hc2cbdftv_8.lo hc2cbdftv_10.lo hc2cbdftv_12.lo hc2cbdftv_16.lo \
+	hc2cbdftv_32.lo hc2cbdftv_20.lo
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am__objects_4 = $(am__objects_3) genus.lo codlist.lo
+@HAVE_NEON_TRUE@am__objects_5 = $(am__objects_4)
+@HAVE_NEON_TRUE@am_librdft_neon_codelets_la_OBJECTS =  \
+@HAVE_NEON_TRUE@	$(am__objects_5)
+librdft_neon_codelets_la_OBJECTS =  \
+	$(am_librdft_neon_codelets_la_OBJECTS)
+@HAVE_NEON_TRUE@am_librdft_neon_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_neon_codelets_la_SOURCES)
+DIST_SOURCES = $(am__librdft_neon_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(NEON_CFLAGS)
+SIMD_HEADER = simd-neon.h
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_NEON_TRUE@noinst_LTLIBRARIES = librdft_neon_codelets.la
+@HAVE_NEON_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_NEON_TRUE@librdft_neon_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/neon/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/neon/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_neon_codelets.la: $(librdft_neon_codelets_la_OBJECTS) $(librdft_neon_codelets_la_DEPENDENCIES) $(EXTRA_librdft_neon_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_librdft_neon_codelets_la_rpath) $(librdft_neon_codelets_la_OBJECTS) $(librdft_neon_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cbdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cbdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/neon/hc2cfdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-neon.h"
+#include "../common/hc2cfdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/simd.mk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/simd.mk	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,12 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,15 @@
+AM_CFLAGS = $(SSE2_CFLAGS)
+SIMD_HEADER=simd-sse2.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SSE2
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = librdft_sse2_codelets.la
+librdft_sse2_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,613 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# This file contains a standard list of RDFT SIMD codelets.  It is
+# included by common/Makefile to generate the C files with the actual
+# codelets in them.  It is included by {sse,sse2,...}/Makefile to
+# generate and compile stub files that include common/*.c
+
+# You can customize FFTW for special needs, e.g. to handle certain
+# sizes more efficiently, by adding new codelets to the lists of those
+# included by default.  If you change the list of codelets, any new
+# ones you added will be automatically generated when you run the
+# bootstrap script (see "Generating your own code" in the FFTW
+# manual).
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+	$(top_srcdir)/rdft/simd/codlist.mk \
+	$(top_srcdir)/rdft/simd/simd.mk
+subdir = rdft/simd/sse2
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+librdft_sse2_codelets_la_LIBADD =
+am__librdft_sse2_codelets_la_SOURCES_DIST = hc2cfdftv_2.c \
+	hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c hc2cfdftv_10.c \
+	hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c hc2cfdftv_20.c \
+	hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c \
+	hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c \
+	hc2cbdftv_20.c genus.c codlist.c
+am__objects_1 = hc2cfdftv_2.lo hc2cfdftv_4.lo hc2cfdftv_6.lo \
+	hc2cfdftv_8.lo hc2cfdftv_10.lo hc2cfdftv_12.lo hc2cfdftv_16.lo \
+	hc2cfdftv_32.lo hc2cfdftv_20.lo
+am__objects_2 = hc2cbdftv_2.lo hc2cbdftv_4.lo hc2cbdftv_6.lo \
+	hc2cbdftv_8.lo hc2cbdftv_10.lo hc2cbdftv_12.lo hc2cbdftv_16.lo \
+	hc2cbdftv_32.lo hc2cbdftv_20.lo
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am__objects_4 = $(am__objects_3) genus.lo codlist.lo
+@HAVE_SSE2_TRUE@am__objects_5 = $(am__objects_4)
+@HAVE_SSE2_TRUE@am_librdft_sse2_codelets_la_OBJECTS =  \
+@HAVE_SSE2_TRUE@	$(am__objects_5)
+librdft_sse2_codelets_la_OBJECTS =  \
+	$(am_librdft_sse2_codelets_la_OBJECTS)
+@HAVE_SSE2_TRUE@am_librdft_sse2_codelets_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(librdft_sse2_codelets_la_SOURCES)
+DIST_SOURCES = $(am__librdft_sse2_codelets_la_SOURCES_DIST)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CFLAGS = $(SSE2_CFLAGS)
+SIMD_HEADER = simd-sse2.h
+HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c	\
+hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c		\
+hc2cfdftv_20.c
+
+HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c	\
+hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c		\
+hc2cbdftv_20.c
+
+
+###########################################################################
+SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft	\
+-I$(top_srcdir)/rdft/simd -I$(top_srcdir)/simd-support
+
+EXTRA_DIST = $(SIMD_CODELETS) genus.c codlist.c
+@HAVE_SSE2_TRUE@BUILT_SOURCES = $(EXTRA_DIST)
+@HAVE_SSE2_TRUE@noinst_LTLIBRARIES = librdft_sse2_codelets.la
+@HAVE_SSE2_TRUE@librdft_sse2_codelets_la_SOURCES = $(BUILT_SOURCES)
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/sse2/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu rdft/simd/sse2/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/rdft/simd/simd.mk:
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+librdft_sse2_codelets.la: $(librdft_sse2_codelets_la_OBJECTS) $(librdft_sse2_codelets_la_DEPENDENCIES) $(EXTRA_librdft_sse2_codelets_la_DEPENDENCIES) 
+	$(LINK) $(am_librdft_sse2_codelets_la_rpath) $(librdft_sse2_codelets_la_OBJECTS) $(librdft_sse2_codelets_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genus.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cbdftv_8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_12.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_20.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_6.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2cfdftv_8.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+$(EXTRA_DIST): Makefile
+	(							\
+	echo "/* Generated automatically.  DO NOT EDIT! */";	\
+	echo "#define SIMD_HEADER \"$(SIMD_HEADER)\"";		\
+	echo "#include \"../common/"$*".c\"";			\
+	) >$@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/codlist.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/codlist.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/codlist.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/genus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/genus.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/genus.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cbdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cbdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_10.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_10.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_12.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_12.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_12.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_16.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_16.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_2.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_20.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_20.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_20.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_32.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_32.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_32.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_4.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_4.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_6.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_6.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_6.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/simd/sse2/hc2cfdftv_8.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3 @@
+/* Generated automatically.  DO NOT EDIT! */
+#define SIMD_HEADER "simd-sse2.h"
+#include "../common/hc2cfdftv_8.c"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/solve.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/solve.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+/* use the apply() operation for RDFT problems */
+void X(rdft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_rdft *ego = (const plan_rdft *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/solve2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/solve2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "rdft.h"
+
+/* use the apply() operation for RDFT2 problems */
+void X(rdft2_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_rdft2 *ego = (const plan_rdft2 *) ego_;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     ego->apply(ego_, 
+		UNTAINT(p->r0), UNTAINT(p->r1),
+		UNTAINT(p->cr), UNTAINT(p->ci));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/vrank-geq1-rdft2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/vrank-geq1-rdft2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+/* Plans for handling vector transform loops.  These are *just* the
+   loops, and rely on child plans for the actual RDFT2s.
+ 
+   They form a wrapper around solvers that don't have apply functions
+   for non-null vectors.
+ 
+   vrank-geq1-rdft2 plans also recursively handle the case of
+   multi-dimensional vectors, obviating the need for most solvers to
+   deal with this.  We can also play games here, such as reordering
+   the vector loops.
+ 
+   Each vrank-geq1-rdft2 plan reduces the vector rank by 1, picking out a
+   dimension determined by the vecloop_dim field of the solver. */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_rdft2 super;
+
+     plan *cld;
+     INT vl;
+     INT rvs, cvs;
+     const S *solver;
+} P;
+
+static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl;
+     INT rvs = ego->rvs, cvs = ego->cvs;
+     rdft2apply cldapply = ((plan_rdft2 *) ego->cld)->apply;
+
+     for (i = 0; i < vl; ++i) {
+          cldapply(ego->cld, r0 + i * rvs, r1 + i * rvs,
+		   cr + i * cvs, ci + i * cvs);
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(rdft2-vrank>=1-x%D/%d%(%p%))",
+	      ego->vl, s->vecloop_dim, ego->cld);
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+		       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+     if (FINITE_RNK(p->vecsz->rnk)
+	 && p->vecsz->rnk > 0
+	 && pickdim(ego, p->vecsz, p->r0 != p->cr, dp)) {
+	  if (p->r0 != p->cr)
+	       return 1;  /* can always operate out-of-place */
+
+	  return(X(rdft2_inplace_strides)(p, *dp));
+     }
+
+     return 0;
+}
+
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+     if (!applicable0(ego_, p_, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     if (NO_UGLYP(plnr)) {
+	  const problem_rdft2 *p = (const problem_rdft2 *) p_;
+	  iodim *d = p->vecsz->dims + *dp;
+	       
+	  /* Heuristic: if the transform is multi-dimensional, and the
+	     vector stride is less than the transform size, then we
+	     probably want to use a rank>=2 plan first in order to combine
+	     this vector with the transform-dimension vectors. */
+	  if (p->sz->rnk > 1
+	      && X(imin)(X(iabs)(d->is), X(iabs)(d->os))
+	      < X(rdft2_tensor_max_index)(p->sz, p->kind)
+	       )
+	       return 0;
+
+	  /* Heuristic: don't use a vrank-geq1 for rank-0 vrank-1
+	     transforms, since this case is better handled by rank-0
+	     solvers. */
+	  if (p->sz->rnk == 0 && p->vecsz->rnk == 1) return 0;
+
+	  if (NO_NONTHREADEDP(plnr)) 
+	       return 0; /* prefer threaded version */
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft2 *p;
+     P *pln;
+     plan *cld;
+     int vdim;
+     iodim *d;
+     INT rvs, cvs;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_rdft2 *) p_;
+
+     d = p->vecsz->dims + vdim;
+
+     A(d->n > 1);  /* or else, p->ri + d->is etc. are invalid */
+
+     X(rdft2_strides)(p->kind, d, &rvs, &cvs);
+
+     cld = X(mkplan_d)(plnr, 
+		       X(mkproblem_rdft2_d)(
+			    X(tensor_copy)(p->sz),
+			    X(tensor_copy_except)(p->vecsz, vdim),
+			    TAINT(p->r0, rvs), TAINT(p->r1, rvs), 
+			    TAINT(p->cr, cvs), TAINT(p->ci, cvs),
+			    p->kind));
+     if (!cld) return (plan *) 0;
+
+     pln = MKPLAN_RDFT2(P, &padt, apply);
+
+     pln->cld = cld;
+     pln->vl = d->n;
+     pln->rvs = rvs;
+     pln->cvs = cvs;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.ops.other = 3.14159; /* magic to prefer codelet loops */
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     if (p->sz->rnk != 1 || (p->sz->dims[0].n > 128))
+	  pln->super.super.pcost = pln->vl * cld->pcost;
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft2_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/vrank-geq1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/vrank-geq1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+/* Plans for handling vector transform loops.  These are *just* the
+   loops, and rely on child plans for the actual RDFTs.
+ 
+   They form a wrapper around solvers that don't have apply functions
+   for non-null vectors.
+ 
+   vrank-geq1 plans also recursively handle the case of multi-dimensional
+   vectors, obviating the need for most solvers to deal with this.  We
+   can also play games here, such as reordering the vector loops.
+ 
+   Each vrank-geq1 plan reduces the vector rank by 1, picking out a
+   dimension determined by the vecloop_dim field of the solver. */
+
+#include "rdft.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_rdft super;
+
+     plan *cld;
+     INT vl;
+     INT ivs, ovs;
+     const S *solver;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT i, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     rdftapply cldapply = ((plan_rdft *) ego->cld)->apply;
+
+     for (i = 0; i < vl; ++i) {
+          cldapply(ego->cld, I + i * ivs, O + i * ovs);
+     }
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     p->print(p, "(rdft-vrank>=1-x%D/%d%(%p%))",
+	      ego->vl, s->vecloop_dim, ego->cld);
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+		       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+
+     return (1
+	     && FINITE_RNK(p->vecsz->rnk)
+	     && p->vecsz->rnk > 0
+
+	     && p->sz->rnk >= 0
+
+	     && pickdim(ego, p->vecsz, p->I != p->O, dp)
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_, 
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+     const problem_rdft *p;
+
+     if (!applicable0(ego_, p_, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     p = (const problem_rdft *) p_;
+
+     if (NO_UGLYP(plnr)) {
+	  /* the rank-0 solver deals with the general case most of the
+	     time (an exception is loops of non-square transposes) */
+	  if (NO_SLOWP(plnr) && p->sz->rnk == 0)
+	       return 0;
+
+	  /* Heuristic: if the transform is multi-dimensional, and the
+	     vector stride is less than the transform size, then we
+	     probably want to use a rank>=2 plan first in order to combine
+	     this vector with the transform-dimension vectors. */
+	  {
+	       iodim *d = p->vecsz->dims + *dp;
+	       if (1
+		   && p->sz->rnk > 1 
+		   && X(imin)(X(iabs)(d->is), X(iabs)(d->os))
+		   < X(tensor_max_index)(p->sz)
+		    )
+		    return 0;
+	  }
+
+	  /* prefer threaded version */
+	  if (NO_NONTHREADEDP(plnr)) return 0;
+
+	  /* exploit built-in vecloops of (ugly) r{e,o}dft solvers */
+	  if (p->vecsz->rnk == 1 && p->sz->rnk == 1 
+	      && REODFT_KINDP(p->kind[0]))
+	       return 0;
+     }
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p;
+     P *pln;
+     plan *cld;
+     int vdim;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_rdft *) p_;
+
+     d = p->vecsz->dims + vdim;
+
+     A(d->n > 1); 
+
+     cld = X(mkplan_d)(plnr, 
+		       X(mkproblem_rdft_d)(
+			    X(tensor_copy)(p->sz),
+			    X(tensor_copy_except)(p->vecsz, vdim),
+			    TAINT(p->I, d->is), TAINT(p->O, d->os),
+			    p->kind));
+     if (!cld) return (plan *) 0;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->cld = cld;
+     pln->vl = d->n;
+     pln->ivs = d->is;
+     pln->ovs = d->os;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.ops.other = 3.14159; /* magic to prefer codelet loops */
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     if (p->sz->rnk != 1 || (p->sz->dims[0].n > 128))
+	  pln->super.super.pcost = pln->vl * cld->pcost;
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/rdft/vrank3-transpose.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/rdft/vrank3-transpose.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,777 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* rank-0, vector-rank-3, non-square in-place transposition
+   (see rank0.c for square transposition)  */
+
+#include "rdft.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>		/* for memcpy() */
+#endif
+
+struct P_s;
+
+typedef struct {
+     rdftapply apply;
+     int (*applicable)(const problem_rdft *p, planner *plnr,
+		       int dim0, int dim1, int dim2, INT *nbuf);
+     int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
+     const char *nam;
+} transpose_adt;
+
+typedef struct {
+     solver super;
+     const transpose_adt *adt;
+} S;
+
+typedef struct P_s {
+     plan_rdft super;
+     INT n, m, vl; /* transpose n x m matrix of vl-tuples */
+     INT nbuf; /* buffer size */
+     INT nd, md, d; /* transpose-gcd params */
+     INT nc, mc; /* transpose-cut params */
+     plan *cld1, *cld2, *cld3; /* children, null if unused */
+     const S *slv;
+} P;
+
+
+/*************************************************************************/
+/* some utilities for the solvers */
+
+static INT gcd(INT a, INT b)
+{
+     INT r;
+     do {
+	  r = a % b;
+	  a = b;
+	  b = r;
+     } while (r != 0);
+     
+     return a;
+}
+
+/* whether we can transpose with one of our routines expecting
+   contiguous Ntuples */
+static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
+{
+     return (vs == 1 && b->is == vl && a->os == vl &&
+	     ((a->n == b->n && a->is == b->os
+	       && a->is >= b->n && a->is % vl == 0)
+	      || (a->is == b->n * vl && b->os == a->n * vl)));
+}
+
+/* check whether a and b correspond to the first and second dimensions
+   of a transpose of tuples with vector length = vl, stride = vs. */
+static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
+{
+     return ((a->n == b->n && a->os == b->is && a->is == b->os)
+             || Ntuple_transposable(a, b, vl, vs));
+}
+
+static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
+{
+     int dim0, dim1;
+
+     for (dim0 = 0; dim0 < s->rnk; ++dim0)
+          for (dim1 = 0; dim1 < s->rnk; ++dim1) {
+	       int dim2 = 3 - dim0 - dim1;
+	       if (dim0 == dim1) continue;
+               if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
+		   && transposable(s->dims + dim0, s->dims + dim1, 
+				   s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
+				   s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
+                    *pdim0 = dim0;
+                    *pdim1 = dim1;
+		    *pdim2 = dim2;
+                    return 1;
+               }
+	  }
+     return 0;
+}
+
+#define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
+#define MAXBUF 65536 /* maximum non-ugly buffer */
+
+/* generic applicability function */
+static int applicable(const solver *ego_, const problem *p_, planner *plnr,
+		      int *dim0, int *dim1, int *dim2, INT *nbuf)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+
+     return (1
+	     && p->I == p->O
+	     && p->sz->rnk == 0
+	     && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
+
+	     && pickdim(p->vecsz, dim0, dim1, dim2)
+
+	     /* UGLY if vecloop in wrong order for locality */
+	     && (!NO_UGLYP(plnr) ||
+		 p->vecsz->rnk == 2 ||
+		 X(iabs)(p->vecsz->dims[*dim2].is)
+		 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
+			   X(iabs)(p->vecsz->dims[*dim0].os)))
+
+	     /* SLOW if non-square */
+	     && (!NO_SLOWP(plnr)
+		 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
+		      
+	     && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
+
+	     /* buffers too big are UGLY */
+	     && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
+		 || *nbuf <= MAXBUF
+		 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
+	  );
+}
+
+static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
+{
+     if (p->vecsz->rnk == 2) {
+	  *vl = 1; *vs = 1;
+     }
+     else {
+	  *vl = p->vecsz->dims[dim2].n;
+	  *vs = p->vecsz->dims[dim2].is; /* == os */
+     }  
+}
+
+/*************************************************************************/
+/* Cache-oblivious in-place transpose of non-square matrices, based 
+   on transposes of blocks given by the gcd of the dimensions.
+
+   This algorithm is related to algorithm V5 from Murray Dow,
+   "Transposing a matrix on a vector computer," Parallel Computing 21
+   (12), 1997-2005 (1995), with the modification that we use
+   cache-oblivious recursive transpose subroutines (and we derived
+   it independently).
+   
+   For a p x q matrix, this requires scratch space equal to the size
+   of the matrix divided by gcd(p,q).  Alternatively, see also the
+   "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
+
+static void apply_gcd(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT n = ego->nd, m = ego->md, d = ego->d;
+     INT vl = ego->vl;
+     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+     INT i, num_el = n*m*d*vl;
+
+     A(ego->n == n * d && ego->m == m * d);
+     UNUSED(O);
+
+     /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
+	of vl-tuples and buf contains n*m*d*vl elements.  
+	
+	In general, to transpose a p x q matrix, you should call this
+	routine with d = gcd(p, q), n = p/d, and m = q/d.  */
+
+     A(n > 0 && m > 0 && vl > 0);
+     A(d > 1);
+
+     /* treat as (d x n) x (d' x m) matrix.  (d' = d) */
+     
+     /* First, transpose d x (n x d') x m to d x (d' x n) x m,
+	using the buf matrix.  This consists of d transposes
+	of contiguous n x d' matrices of m-tuples. */
+     if (n > 1) {
+	  rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
+	  for (i = 0; i < d; ++i) {
+	       cldapply(ego->cld1, I + i*num_el, buf);
+	       memcpy(I + i*num_el, buf, num_el*sizeof(R));
+	  }
+     }
+     
+     /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
+	is a square in-place transpose of n*m-tuples: */
+     {
+	  rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
+	  cldapply(ego->cld2, I, I);
+     }
+     
+     /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
+	using the buf matrix.  This consists of d' transposes
+	of contiguous d*n x m matrices. */
+     if (m > 1) {
+	  rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
+	  for (i = 0; i < d; ++i) {
+	       cldapply(ego->cld3, I + i*num_el, buf);
+	       memcpy(I + i*num_el, buf, num_el*sizeof(R));
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static int applicable_gcd(const problem_rdft *p, planner *plnr,
+			  int dim0, int dim1, int dim2, INT *nbuf)
+{
+     INT n = p->vecsz->dims[dim0].n;
+     INT m = p->vecsz->dims[dim1].n;
+     INT d, vl, vs;
+     get_transpose_vec(p, dim2, &vl, &vs);
+     d = gcd(n, m);
+     *nbuf = n * (m / d) * vl;
+     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
+	     && n != m
+	     && d > 1
+	     && Ntuple_transposable(p->vecsz->dims + dim0,
+				    p->vecsz->dims + dim1,
+				    vl, vs));
+}
+
+static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
+{
+     INT n = ego->nd, m = ego->md, d = ego->d;
+     INT vl = ego->vl;
+     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+     INT num_el = n*m*d*vl;
+
+     if (n > 1) {
+	  ego->cld1 = X(mkplan_d)(plnr,
+				  X(mkproblem_rdft_0_d)(
+				       X(mktensor_3d)(n, d*m*vl, m*vl,
+						      d, m*vl, n*m*vl,
+						      m*vl, 1, 1),
+				       TAINT(p->I, num_el), buf));
+	  if (!ego->cld1)
+	       goto nada;
+	  X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
+		      &ego->super.super.ops);
+	  ego->super.super.ops.other += num_el * d * 2;
+     }
+
+     ego->cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_rdft_0_d)(
+				  X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
+						 d, n*m*vl, d*n*m*vl,
+						 n*m*vl, 1, 1),
+				  p->I, p->I));
+     if (!ego->cld2)
+	  goto nada;
+     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
+
+     if (m > 1) {
+	  ego->cld3 = X(mkplan_d)(plnr,
+				  X(mkproblem_rdft_0_d)(
+				       X(mktensor_3d)(d*n, m*vl, vl,
+						      m, vl, d*n*vl,
+						      vl, 1, 1),
+				       TAINT(p->I, num_el), buf));
+	  if (!ego->cld3)
+	       goto nada;
+	  X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
+	  ego->super.super.ops.other += num_el * d * 2;
+     }
+
+     X(ifree)(buf);
+     return 1;
+
+ nada:
+     X(ifree)(buf);
+     return 0;
+}
+
+static const transpose_adt adt_gcd =
+{
+     apply_gcd, applicable_gcd, mkcldrn_gcd,
+     "rdft-transpose-gcd"
+};
+
+/*************************************************************************/
+/* Cache-oblivious in-place transpose of non-square n x m matrices,
+   based on transposing a sub-matrix first and then transposing the
+   remainder(s) with the help of a buffer.  See also transpose-gcd,
+   above, if gcd(n,m) is large.
+
+   This algorithm is related to algorithm V3 from Murray Dow,
+   "Transposing a matrix on a vector computer," Parallel Computing 21
+   (12), 1997-2005 (1995), with the modifications that we use
+   cache-oblivious recursive transpose subroutines and we have the
+   generalization for large |n-m| below.
+
+   The best case, and the one described by Dow, is for |n-m| small, in
+   which case we transpose a square sub-matrix of size min(n,m),
+   handling the remainder via a buffer.  This requires scratch space
+   equal to the size of the matrix times |n-m| / max(n,m).
+
+   As a generalization when |n-m| is not small, we also support cutting
+   *both* dimensions to an nc x mc matrix which is *not* necessarily
+   square, but has a large gcd (and can therefore use transpose-gcd).
+*/
+
+static void apply_cut(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
+     INT i;
+     R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+     UNUSED(O);
+
+     if (m > mc) {
+	  ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
+	  for (i = 0; i < nc; ++i)
+	       memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
+     }
+
+     ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
+     
+     if (n > nc) {
+	  R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
+	  memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
+	  for (i = mc-1; i >= 0; --i)
+	       memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
+	  ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
+     }
+
+     if (m > mc) {
+	  if (n > nc)
+	       for (i = mc; i < m; ++i)
+		    memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
+			   (nc*vl)*sizeof(R));
+	  else
+	       memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
+     }
+
+     X(ifree)(buf1);
+}
+
+/* only cut one dimension if the resulting buffer is small enough */
+static int cut1(INT n, INT m, INT vl)
+{
+     return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
+	     || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
+}
+
+#define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
+
+static int applicable_cut(const problem_rdft *p, planner *plnr,
+			  int dim0, int dim1, int dim2, INT *nbuf)
+{
+     INT n = p->vecsz->dims[dim0].n;
+     INT m = p->vecsz->dims[dim1].n;
+     INT vl, vs;
+     get_transpose_vec(p, dim2, &vl, &vs);
+     *nbuf = 0; /* always small enough to be non-UGLY (?) */
+     A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
+     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
+	     && n != m
+	     
+	     /* Don't call transpose-cut recursively (avoid inf. loops):
+	        the non-square sub-transpose produced when !cut1
+	        should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
+	        for which transpose-gcd is applicable */
+	     && (cut1(n, m, vl)
+		 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
+
+	     && Ntuple_transposable(p->vecsz->dims + dim0,
+				    p->vecsz->dims + dim1,
+				    vl, vs));
+}
+
+static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
+{
+     INT n = ego->n, m = ego->m, nc, mc;
+     INT vl = ego->vl;
+     R *buf;
+
+     /* pick the "best" cut */
+     if (cut1(n, m, vl)) {
+	  nc = mc = X(imin)(n,m);
+     }
+     else {
+	  INT dc, ns, ms;
+	  dc = gcd(m, n); nc = n; mc = m;
+	  /* search for cut with largest gcd
+	     (TODO: different optimality criteria? different search range?) */
+	  for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
+	       for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
+		    INT ds = gcd(ms, ns);
+		    if (ds > dc) {
+			 dc = ds; nc = ns; mc = ms;
+			 if (dc == X(imin)(ns, ms))
+			      break; /* cannot get larger than this */
+		    }
+	       }
+	       if (dc == X(imin)(n, ms))
+		    break; /* cannot get larger than this */
+	  }
+	  A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
+     }
+     ego->nc = nc;
+     ego->mc = mc;
+     ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
+
+     buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+
+     if (m > mc) {
+	  ego->cld1 = X(mkplan_d)(plnr,
+				  X(mkproblem_rdft_0_d)(
+				       X(mktensor_3d)(nc, m*vl, vl,
+						      m-mc, vl, nc*vl,
+						      vl, 1, 1),
+				       p->I + mc*vl, buf));
+	  if (!ego->cld1)
+	       goto nada;
+	  X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
+     }
+
+     ego->cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_rdft_0_d)(
+				  X(mktensor_3d)(nc, mc*vl, vl,
+						 mc, vl, nc*vl,
+						 vl, 1, 1),
+				  p->I, p->I));
+     if (!ego->cld2)
+	  goto nada;
+     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
+
+     if (n > nc) {
+	  ego->cld3 = X(mkplan_d)(plnr,
+				  X(mkproblem_rdft_0_d)(
+				       X(mktensor_3d)(n-nc, m*vl, vl,
+						      m, vl, n*vl,
+						      vl, 1, 1),
+				       buf + (m-mc)*(nc*vl), p->I + nc*vl));
+	  if (!ego->cld3)
+	       goto nada;
+	  X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
+     }
+
+     /* memcpy/memmove operations */
+     ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
+					     + (n-nc)*m + (m-mc)*nc);
+
+     X(ifree)(buf);
+     return 1;
+
+ nada:
+     X(ifree)(buf);
+     return 0;
+}
+
+static const transpose_adt adt_cut =
+{
+     apply_cut, applicable_cut, mkcldrn_cut,
+     "rdft-transpose-cut"
+};
+
+/*************************************************************************/
+/* In-place transpose routine from TOMS, which follows the cycles of
+   the permutation so that it writes to each location only once.
+   Because of cache-line and other issues, however, this routine is
+   typically much slower than transpose-gcd or transpose-cut, even
+   though the latter do some extra writes.  On the other hand, if the
+   vector length is large then the TOMS routine is best.
+
+   The TOMS routine also has the advantage of requiring less buffer
+   space for the case of gcd(nx,ny) small.  However, in this case it
+   has been superseded by the combination of the generalized
+   transpose-cut method with the transpose-gcd method, which can
+   always transpose with buffers a small fraction of the array size
+   regardless of gcd(nx,ny). */
+
+/*
+ * TOMS Transpose.  Algorithm 513 (Revised version of algorithm 380).
+ * 
+ * These routines do in-place transposes of arrays.
+ * 
+ * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, 
+ *   vol. 3, no. 1, 104-110 (1977) ]
+ * 
+ * C version by Steven G. Johnson (February 1997).
+ */
+
+/*
+ * "a" is a 1D array of length ny*nx*N which constains the nx x ny
+ * matrix of N-tuples to be transposed.  "a" is stored in row-major
+ * order (last index varies fastest).  move is a 1D array of length
+ * move_size used to store information to speed up the process.  The
+ * value move_size=(ny+nx)/2 is recommended.  buf should be an array
+ * of length 2*N.
+ * 
+ */
+
+static void transpose_toms513(R *a, INT nx, INT ny, INT N,
+                              char *move, INT move_size, R *buf)
+{
+     INT i, im, mn;
+     R *b, *c, *d;
+     INT ncount;
+     INT k;
+     
+     /* check arguments and initialize: */
+     A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
+     
+     b = buf;
+     
+     /* Cate & Twigg have a special case for nx == ny, but we don't
+	bother, since we already have special code for this case elsewhere. */
+
+     c = buf + N;
+     ncount = 2;		/* always at least 2 fixed points */
+     k = (mn = ny * nx) - 1;
+     
+     for (i = 0; i < move_size; ++i)
+	  move[i] = 0;
+     
+     if (ny >= 3 && nx >= 3)
+	  ncount += gcd(ny - 1, nx - 1) - 1;	/* # fixed points */
+     
+     i = 1;
+     im = ny;
+     
+     while (1) {
+	  INT i1, i2, i1c, i2c;
+	  INT kmi;
+	  
+	  /** Rearrange the elements of a loop
+	      and its companion loop: **/
+	  
+	  i1 = i;
+	  kmi = k - i;
+	  i1c = kmi;
+	  switch (N) {
+	      case 1:
+		   b[0] = a[i1];
+		   c[0] = a[i1c];
+		   break;
+	      case 2:
+		   b[0] = a[2*i1];
+		   b[1] = a[2*i1+1];
+		   c[0] = a[2*i1c];
+		   c[1] = a[2*i1c+1];
+		   break;
+	      default:
+		   memcpy(b, &a[N * i1], N * sizeof(R));
+		   memcpy(c, &a[N * i1c], N * sizeof(R));
+	  }
+	  while (1) {
+	       i2 = ny * i1 - k * (i1 / nx);
+	       i2c = k - i2;
+	       if (i1 < move_size)
+		    move[i1] = 1;
+	       if (i1c < move_size)
+		    move[i1c] = 1;
+	       ncount += 2;
+	       if (i2 == i)
+		    break;
+	       if (i2 == kmi) {
+		    d = b;
+		    b = c;
+		    c = d;
+		    break;
+	       }
+	       switch (N) {
+		   case 1:
+			a[i1] = a[i2];
+			a[i1c] = a[i2c];
+			break;
+		   case 2:
+			a[2*i1] = a[2*i2];
+			a[2*i1+1] = a[2*i2+1];
+			a[2*i1c] = a[2*i2c];
+			a[2*i1c+1] = a[2*i2c+1];
+			break;
+		   default:
+			memcpy(&a[N * i1], &a[N * i2], 
+			       N * sizeof(R));
+			memcpy(&a[N * i1c], &a[N * i2c], 
+			       N * sizeof(R));
+	       }
+	       i1 = i2;
+	       i1c = i2c;
+	  }
+	  switch (N) {
+	      case 1:
+		   a[i1] = b[0];
+		   a[i1c] = c[0];
+		   break;
+	      case 2:
+		   a[2*i1] = b[0];
+		   a[2*i1+1] = b[1];
+		   a[2*i1c] = c[0];
+		   a[2*i1c+1] = c[1];
+		   break;
+	      default:
+		   memcpy(&a[N * i1], b, N * sizeof(R));
+		   memcpy(&a[N * i1c], c, N * sizeof(R));
+	  }
+	  if (ncount >= mn)
+	       break;	/* we've moved all elements */
+	  
+	  /** Search for loops to rearrange: **/
+	  
+	  while (1) {
+	       INT max = k - i;
+	       ++i;
+	       A(i <= max);
+	       im += ny;
+	       if (im > k)
+		    im -= k;
+	       i2 = im;
+	       if (i == i2)
+		    continue;
+	       if (i >= move_size) {
+		    while (i2 > i && i2 < max) {
+			 i1 = i2;
+			 i2 = ny * i1 - k * (i1 / nx);
+		    }
+		    if (i2 == i)
+			 break;
+	       } else if (!move[i])
+		    break;
+	  }
+     }
+}
+
+static void apply_toms513(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT n = ego->n, m = ego->m;
+     INT vl = ego->vl;
+     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+     UNUSED(O);
+     transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
+     X(ifree)(buf);
+}
+
+static int applicable_toms513(const problem_rdft *p, planner *plnr,
+			   int dim0, int dim1, int dim2, INT *nbuf)
+{
+     INT n = p->vecsz->dims[dim0].n;
+     INT m = p->vecsz->dims[dim1].n;
+     INT vl, vs;
+     get_transpose_vec(p, dim2, &vl, &vs);
+     *nbuf = 2*vl 
+	  + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
+     return (!NO_SLOWP(plnr)
+	     && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
+	     && n != m
+	     && Ntuple_transposable(p->vecsz->dims + dim0,
+				    p->vecsz->dims + dim1,
+				    vl, vs));
+}
+
+static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
+{
+     UNUSED(p); UNUSED(plnr);
+     /* heuristic so that TOMS algorithm is last resort for small vl */
+     ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
+     return 1;
+}
+
+static const transpose_adt adt_toms513 =
+{
+     apply_toms513, applicable_toms513, mkcldrn_toms513,
+     "rdft-transpose-toms513"
+};
+
+/*-----------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------*/
+/* generic stuff: */
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
+	      ego->n, ego->m, ego->vl);
+     if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
+     if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
+     if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
+     p->print(p, ")");
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p;
+     int dim0, dim1, dim2;
+     INT nbuf, vs;
+     P *pln;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
+          return (plan *) 0;
+
+     p = (const problem_rdft *) p_;
+     pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
+
+     pln->n = p->vecsz->dims[dim0].n;
+     pln->m = p->vecsz->dims[dim1].n;
+     get_transpose_vec(p, dim2, &pln->vl, &vs);
+     pln->nbuf = nbuf;
+     pln->d = gcd(pln->n, pln->m);
+     pln->nd = pln->n / pln->d;
+     pln->md = pln->m / pln->d;
+     pln->slv = ego;
+
+     X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
+
+     pln->cld1 = pln->cld2 = pln->cld3 = 0;
+     if (!ego->adt->mkcldrn(p, plnr, pln)) {
+	  X(plan_destroy_internal)(&(pln->super.super));
+	  return 0;
+     }
+
+     return &(pln->super.super);
+}
+
+static solver *mksolver(const transpose_adt *adt)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->adt = adt;
+     return &(slv->super);
+}
+
+void X(rdft_vrank3_transpose_register)(planner *p)
+{
+     unsigned i;
+     static const transpose_adt *const adts[] = {
+	  &adt_gcd, &adt_cut,
+	  &adt_toms513
+     };
+     for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
+          REGISTER_SOLVER(p, mksolver(adts[i]));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,15 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft
+SUBDIRS = 
+
+noinst_LTLIBRARIES = libreodft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = reodft.h
+
+# no longer used due to numerical problems
+EXTRA_DIST = reodft11e-r2hc.c redft00e-r2hc.c rodft00e-r2hc.c
+
+libreodft_la_SOURCES = conf.c reodft.h reodft010e-r2hc.c	\
+reodft11e-radix2.c reodft11e-r2hc-odd.c redft00e-r2hc-pad.c	\
+rodft00e-r2hc-pad.c reodft00e-splitradix.c
+# redft00e-r2hc.c rodft00e-r2hc.c reodft11e-r2hc.c
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,702 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = reodft
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libreodft_la_LIBADD =
+am_libreodft_la_OBJECTS = conf.lo reodft010e-r2hc.lo \
+	reodft11e-radix2.lo reodft11e-r2hc-odd.lo redft00e-r2hc-pad.lo \
+	rodft00e-r2hc-pad.lo reodft00e-splitradix.lo
+libreodft_la_OBJECTS = $(am_libreodft_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libreodft_la_SOURCES)
+DIST_SOURCES = $(libreodft_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
+	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+	distdir
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/rdft
+SUBDIRS = 
+noinst_LTLIBRARIES = libreodft.la
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = reodft.h
+
+# no longer used due to numerical problems
+EXTRA_DIST = reodft11e-r2hc.c redft00e-r2hc.c rodft00e-r2hc.c
+libreodft_la_SOURCES = conf.c reodft.h reodft010e-r2hc.c	\
+reodft11e-radix2.c reodft11e-r2hc-odd.c redft00e-r2hc-pad.c	\
+rodft00e-r2hc-pad.c reodft00e-splitradix.c
+
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu reodft/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu reodft/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libreodft.la: $(libreodft_la_OBJECTS) $(libreodft_la_DEPENDENCIES) $(EXTRA_libreodft_la_DEPENDENCIES) 
+	$(LINK)  $(libreodft_la_OBJECTS) $(libreodft_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/redft00e-r2hc-pad.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reodft00e-splitradix.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reodft010e-r2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reodft11e-r2hc-odd.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reodft11e-radix2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rodft00e-r2hc-pad.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@fail= failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(LTLIBRARIES)
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
+	install-am install-strip tags-recursive
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am check check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	uninstall uninstall-am
+
+# redft00e-r2hc.c rodft00e-r2hc.c reodft11e-r2hc.c
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/conf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/conf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "reodft.h"
+
+static const solvtab s =
+{
+#if 0 /* 1 to enable "standard" algorithms with substandard accuracy;
+         you must also add them to Makefile.am to compile these files*/
+     SOLVTAB(X(redft00e_r2hc_register)),
+     SOLVTAB(X(rodft00e_r2hc_register)),
+     SOLVTAB(X(reodft11e_r2hc_register)),
+#endif
+     SOLVTAB(X(redft00e_r2hc_pad_register)),
+     SOLVTAB(X(rodft00e_r2hc_pad_register)),
+     SOLVTAB(X(reodft00e_splitradix_register)),
+     SOLVTAB(X(reodft010e_r2hc_register)),
+     SOLVTAB(X(reodft11e_radix2_r2hc_register)),
+     SOLVTAB(X(reodft11e_r2hc_odd_register)),
+
+     SOLVTAB_END
+};
+
+void X(reodft_conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/redft00e-r2hc-pad.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/redft00e-r2hc-pad.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do a REDFT00 problem via an R2HC problem, padded symmetrically to
+   twice the size.  This is asymptotically a factor of ~2 worse than
+   redft00e-r2hc.c (the algorithm used in e.g. FFTPACK and Numerical
+   Recipes), but we abandoned the latter after we discovered that it
+   has intrinsic accuracy problems. */
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld, *cldcpy;
+     INT is;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * (2*n), BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[0];
+	  for (i = 1; i < n; ++i) {
+	       R a = I[i * is];
+	       buf[i] = a;
+	       buf[2*n - i] = a;
+	  }
+	  buf[i] = I[i * is]; /* i == n, Nyquist */
+	  
+	  /* r2hc transform of size 2*n */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* copy n+1 real numbers (real parts of hc array) from buf to O */
+	  {
+	       plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+	       cldcpy->apply((plan *) cldcpy, buf, O);
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldcpy, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldcpy);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(redft00e-r2hc-pad-%D%v%(%p%)%(%p%))", 
+	      ego->n + 1, ego->vl, ego->cld, ego->cldcpy);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->kind[0] == REDFT00
+	     && p->sz->dims[0].n > 1  /* n == 1 is not well-defined */
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld = (plan *) 0, *cldcpy;
+     R *buf = (R *) 0;
+     INT n;
+     INT vl, ivs, ovs;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+	  goto nada;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n - 1;
+     A(n > 0);
+     buf = (R *) MALLOC(sizeof(R) * (2*n), BUFFERS);
+
+     cld = X(mkplan_d)(plnr,X(mkproblem_rdft_1_d)(X(mktensor_1d)(2*n,1,1), 
+						  X(mktensor_0d)(), 
+						  buf, buf, R2HC));
+     if (!cld)
+	  goto nada;
+
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+     cldcpy =
+	  X(mkplan_d)(plnr,
+		      X(mkproblem_rdft_1_d)(X(mktensor_0d)(),
+					    X(mktensor_1d)(n+1,1,
+							   p->sz->dims[0].os), 
+					    buf, TAINT(p->O, ovs), R2HC));
+     if (!cldcpy)
+	  goto nada;
+
+     X(ifree)(buf);
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->vl = vl;
+     pln->ivs = ivs;
+     pln->ovs = ovs;
+     
+     X(ops_zero)(&ops);
+     ops.other = n + 2*n; /* loads + stores (input -> buf) */
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cldcpy->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(buf);
+     if (cld)
+	  X(plan_destroy_internal)(cld);  
+     return (plan *)0;
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(redft00e_r2hc_pad_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/redft00e-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/redft00e-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do a REDFT00 problem via an R2HC problem, with some pre/post-processing.
+
+   This code uses the trick from FFTPACK, also documented in a similar
+   form by Numerical Recipes.  Unfortunately, this algorithm seems to
+   have intrinsic numerical problems (similar to those in
+   reodft11e-r2hc.c), possibly due to the fact that it multiplies its
+   input by a cosine, causing a loss of precision near the zero.  For
+   transforms of 16k points, it has already lost three or four decimal
+   places of accuracy, which we deem unacceptable.
+
+   So, we have abandoned this algorithm in favor of the one in
+   redft00-r2hc-pad.c, which unfortunately sacrifices 30-50% in speed.
+   The only other alternative in the literature that does not have
+   similar numerical difficulties seems to be the direct adaptation of
+   the Cooley-Tukey decomposition for symmetric data, but this would
+   require a whole new set of codelets and it's not clear that it's
+   worth it at this point.  However, we did implement the latter
+   algorithm for the specific case of odd n (logically adapting the
+   split-radix algorithm); see reodft00e-splitradix.c. */
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     twid *td;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+     E csum;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[0] + I[is * n];
+	  csum = I[0] - I[is * n];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb;
+	       a = I[is * i];
+	       b = I[is * (n - i)];
+	       csum += W[2*i] * (amb = K(2.0)*(a - b));
+	       amb = W[2*i+1] * amb;
+	       apb = (a + b);
+	       buf[i] = apb - amb;
+	       buf[n - i] = apb + amb;
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(2.0) * I[is * i];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* FIXME: use recursive/cascade summation for better stability? */
+	  O[0] = buf[0];
+	  O[os] = csum;
+	  for (i = 1; i + i < n; ++i) {
+	       INT k = i + i;
+	       O[os * k] = buf[i];
+	       O[os * (k + 1)] = O[os * (k - 1)] - buf[n - i];
+	  }
+	  if (i + i == n) {
+	       O[os * n] = buf[i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr redft00e_tw[] = {
+          { TW_COS, 0, 1 },
+          { TW_SIN, 0, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+
+     X(plan_awake)(ego->cld, wakefulness);
+     X(twiddle_awake)(wakefulness,
+		      &ego->td, redft00e_tw, 2*ego->n, 1, (ego->n+1)/2);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(redft00e-r2hc-%D%v%(%p%))", ego->n + 1, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->kind[0] == REDFT00
+	     && p->sz->dims[0].n > 1  /* n == 1 is not well-defined */
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n - 1;
+     A(n > 0);
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1), 
+						   X(mktensor_0d)(), 
+						   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->td = 0;
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.other = 8 + (n-1)/2 * 11 + (1 - n % 2) * 5;
+     ops.add = 2 + (n-1)/2 * 5;
+     ops.mul = (n-1)/2 * 3 + (1 - n % 2) * 1;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(redft00e_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __REODFT_H__
+#define __REODFT_H__
+
+#include "ifftw.h"
+#include "rdft.h"
+
+#define REODFT_KINDP(k) ((k) >= REDFT00 && (k) <= RODFT11)
+
+void X(redft00e_r2hc_register)(planner *p);
+void X(redft00e_r2hc_pad_register)(planner *p);
+void X(rodft00e_r2hc_register)(planner *p);
+void X(rodft00e_r2hc_pad_register)(planner *p);
+void X(reodft00e_splitradix_register)(planner *p);
+void X(reodft010e_r2hc_register)(planner *p);
+void X(reodft11e_r2hc_register)(planner *p);
+void X(reodft11e_radix2_r2hc_register)(planner *p);
+void X(reodft11e_r2hc_odd_register)(planner *p);
+
+/* configurations */
+void X(reodft_conf_standard)(planner *p);
+
+#endif /* __REODFT_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft00e-splitradix.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft00e-splitradix.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2005 Matteo Frigo
+ * Copyright (c) 2005 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do an R{E,O}DFT00 problem (of an odd length n) recursively via an
+   R{E,O}DFT00 problem and an RDFT problem of half the length.
+
+   This works by "logically" expanding the array to a real-even/odd DFT of
+   length 2n-/+2 and then applying the split-radix algorithm.
+
+   In this way, we can avoid having to pad to twice the length
+   (ala redft00-r2hc-pad), saving a factor of ~2 for n=2^m+/-1,
+   but don't incur the accuracy loss that the "ordinary" algorithm
+   sacrifices (ala redft00-r2hc.c).
+*/
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *clde, *cldo;
+     twid *td;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+} P;
+
+/* redft00 */
+static void apply_e(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, j, n = ego->n + 1, n2 = (n-1)/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W - 2;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n2, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  /* do size (n-1)/2 r2hc transform of odd-indexed elements
+	     with stride 4, "wrapping around" end of array with even
+	     boundary conditions */
+	  for (j = 0, i = 1; i < n; i += 4)
+	       buf[j++] = I[is * i];
+	  for (i = 2*n-2-i; i > 0; i -= 4)
+	       buf[j++] = I[is * i];
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cldo;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+
+	  /* do size (n+1)/2 redft00 of the even-indexed elements,
+	     writing to O: */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->clde;
+	       cld->apply((plan *) cld, I, O);
+	  }
+
+	  /* combine the results with the twiddle factors to get output */
+	  { /* DC element */
+	       E b20 = O[0], b0 = K(2.0) * buf[0];
+	       O[0] = b20 + b0;
+	       O[2*(n2*os)] = b20 - b0;
+	       /* O[n2*os] = O[n2*os]; */
+	  }
+	  for (i = 1; i < n2 - i; ++i) {
+	       E ap, am, br, bi, wr, wi, wbr, wbi;
+	       br = buf[i];
+	       bi = buf[n2 - i];
+	       wr = W[2*i];
+	       wi = W[2*i+1];
+#if FFT_SIGN == -1
+	       wbr = K(2.0) * (wr*br + wi*bi);
+	       wbi = K(2.0) * (wr*bi - wi*br);
+#else
+	       wbr = K(2.0) * (wr*br - wi*bi);
+	       wbi = K(2.0) * (wr*bi + wi*br);
+#endif
+	       ap = O[i*os];
+	       O[i*os] = ap + wbr;
+	       O[(2*n2 - i)*os] = ap - wbr;
+	       am = O[(n2 - i)*os];
+#if FFT_SIGN == -1
+	       O[(n2 - i)*os] = am - wbi;
+	       O[(n2 + i)*os] = am + wbi;
+#else
+	       O[(n2 - i)*os] = am + wbi;
+	       O[(n2 + i)*os] = am - wbi;
+#endif
+	  }
+	  if (i == n2 - i) { /* Nyquist element */
+	       E ap, wbr;
+	       wbr = K(2.0) * (W[2*i] * buf[i]);
+	       ap = O[i*os];
+	       O[i*os] = ap + wbr;
+	       O[(2*n2 - i)*os] = ap - wbr;
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+/* rodft00 */
+static void apply_o(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, j, n = ego->n - 1, n2 = (n+1)/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W - 2;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n2, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  /* do size (n+1)/2 r2hc transform of even-indexed elements
+	     with stride 4, "wrapping around" end of array with odd
+	     boundary conditions */
+	  for (j = 0, i = 0; i < n; i += 4)
+	       buf[j++] = I[is * i];
+	  for (i = 2*n-i; i > 0; i -= 4)
+	       buf[j++] = -I[is * i];
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cldo;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+
+	  /* do size (n-1)/2 rodft00 of the odd-indexed elements,
+	     writing to O: */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->clde;
+	       if (I == O) {
+		    /* can't use I+is and I, subplan would lose in-placeness */
+		    cld->apply((plan *) cld, I + is, I + is);
+		    /* we could maybe avoid this copy by modifying the
+		       twiddle loop, but currently I can't be bothered. */
+		    A(is >= os);
+		    for (i = 0; i < n2-1; ++i)
+			 O[os*i] = I[is*(i+1)];
+	       }
+	       else
+		    cld->apply((plan *) cld, I + is, O);
+	  }
+
+	  /* combine the results with the twiddle factors to get output */
+	  O[(n2-1)*os] = K(2.0) * buf[0];
+	  for (i = 1; i < n2 - i; ++i) {
+	       E ap, am, br, bi, wr, wi, wbr, wbi;
+	       br = buf[i];
+	       bi = buf[n2 - i];
+	       wr = W[2*i];
+	       wi = W[2*i+1];
+#if FFT_SIGN == -1
+	       wbr = K(2.0) * (wr*br + wi*bi);
+	       wbi = K(2.0) * (wi*br - wr*bi);
+#else
+	       wbr = K(2.0) * (wr*br - wi*bi);
+	       wbi = K(2.0) * (wr*bi + wi*br);
+#endif
+	       ap = O[(i-1)*os];
+	       O[(i-1)*os] = wbi + ap;
+	       O[(2*n2-1 - i)*os] = wbi - ap;
+	       am = O[(n2-1 - i)*os];
+#if FFT_SIGN == -1
+	       O[(n2-1 - i)*os] = wbr + am;
+	       O[(n2-1 + i)*os] = wbr - am;
+#else
+	       O[(n2-1 - i)*os] = wbr + am;
+	       O[(n2-1 + i)*os] = wbr - am;
+#endif
+	  }
+	  if (i == n2 - i) { /* Nyquist element */
+	       E ap, wbi;
+	       wbi = K(2.0) * (W[2*i+1] * buf[i]);
+	       ap = O[(i-1)*os];
+	       O[(i-1)*os] = wbi + ap;
+	       O[(2*n2-1 - i)*os] = wbi - ap;
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr reodft00e_tw[] = {
+          { TW_COS, 1, 1 },
+          { TW_SIN, 1, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+
+     X(plan_awake)(ego->clde, wakefulness);
+     X(plan_awake)(ego->cldo, wakefulness);
+     X(twiddle_awake)(wakefulness, &ego->td, reodft00e_tw, 
+		      2*ego->n, 1, ego->n/4);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldo);
+     X(plan_destroy_internal)(ego->clde);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     if (ego->super.apply == apply_e)
+	  p->print(p, "(redft00e-splitradix-%D%v%(%p%)%(%p%))", 
+		   ego->n + 1, ego->vl, ego->clde, ego->cldo);
+     else
+	  p->print(p, "(rodft00e-splitradix-%D%v%(%p%)%(%p%))", 
+		   ego->n - 1, ego->vl, ego->clde, ego->cldo);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && (p->kind[0] == REDFT00 || p->kind[0] == RODFT00)
+	     && p->sz->dims[0].n > 1  /* don't create size-0 sub-plans */
+	     && p->sz->dims[0].n % 2  /* odd: 4 divides "logical" DFT */
+	     && (p->I != p->O || p->vecsz->rnk == 0
+		 || p->vecsz->dims[0].is == p->vecsz->dims[0].os)
+	     && (p->kind[0] != RODFT00 || p->I != p->O || 
+		 p->sz->dims[0].is >= p->sz->dims[0].os) /* laziness */
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *clde, *cldo;
+     R *buf;
+     INT n, n0;
+     opcnt ops;
+     int inplace_odd;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = (n0 = p->sz->dims[0].n) + (p->kind[0] == REDFT00 ? (INT)-1 : (INT)1);
+     A(n > 0 && n % 2 == 0);
+     buf = (R *) MALLOC(sizeof(R) * (n/2), BUFFERS);
+
+     inplace_odd = p->kind[0]==RODFT00 && p->I == p->O;
+     clde = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(
+			     X(mktensor_1d)(n0-n/2, 2*p->sz->dims[0].is, 
+					    inplace_odd ? p->sz->dims[0].is
+					    : p->sz->dims[0].os), 
+			     X(mktensor_0d)(), 
+			     TAINT(p->I 
+				   + p->sz->dims[0].is * (p->kind[0]==RODFT00),
+				   p->vecsz->rnk ? p->vecsz->dims[0].is : 0),
+			     TAINT(p->O
+				   + p->sz->dims[0].is * inplace_odd,
+				   p->vecsz->rnk ? p->vecsz->dims[0].os : 0),
+			     p->kind[0]));
+     if (!clde) {
+	  X(ifree)(buf);
+          return (plan *)0;
+     }
+
+     cldo = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(
+			     X(mktensor_1d)(n/2, 1, 1), 
+			     X(mktensor_0d)(), 
+			     buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cldo)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, p->kind[0] == REDFT00 ? apply_e : apply_o);
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->clde = clde;
+     pln->cldo = cldo;
+     pln->td = 0;
+
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.other = n/2;
+     ops.add = (p->kind[0]==REDFT00 ? (INT)2 : (INT)0) +
+	  (n/2-1)/2 * 6 + ((n/2)%2==0) * 2;
+     ops.mul = 1 + (n/2-1)/2 * 6 + ((n/2)%2==0) * 2;
+
+     /* tweak ops.other so that r2hc-pad is used for small sizes, which
+	seems to be a lot faster on my machine: */
+     ops.other += 256;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &clde->ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cldo->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(reodft00e_splitradix_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft010e-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft010e-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do an R{E,O}DFT{01,10} problem via an R2HC problem, with some
+   pre/post-processing ala FFTPACK. */
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     twid *td;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+     rdft_kind kind;
+} P;
+
+/* A real-even-01 DFT operates logically on a size-4N array:
+                   I 0 -r(I*) -I 0 r(I*),
+   where r denotes reversal and * denotes deletion of the 0th element.
+   To compute the transform of this, we imagine performing a radix-4
+   (real-input) DIF step, which turns the size-4N DFT into 4 size-N
+   (contiguous) DFTs, two of which are zero and two of which are
+   conjugates.  The non-redundant size-N DFT has halfcomplex input, so
+   we can do it with a size-N hc2r transform.  (In order to share
+   plans with the re10 (inverse) transform, however, we use the DHT
+   trick to re-express the hc2r problem as r2hc.  This has little cost
+   since we are already pre- and post-processing the data in {i,n-i}
+   order.)  Finally, we have to write out the data in the correct
+   order...the two size-N redundant (conjugate) hc2r DFTs correspond
+   to the even and odd outputs in O (i.e. the usual interleaved output
+   of DIF transforms); since this data has even symmetry, we only
+   write the first half of it.
+
+   The real-even-10 DFT is just the reverse of these steps, i.e. a
+   radix-4 DIT transform.  There, however, we just use the r2hc
+   transform naturally without resorting to the DHT trick.
+
+   A real-odd-01 DFT is very similar, except that the input is
+   0 I (rI)* 0 -I -(rI)*.  This format, however, can be transformed
+   into precisely the real-even-01 format above by sending I -> rI
+   and shifting the array by N.  The former swap is just another
+   transformation on the input during preprocessing; the latter
+   multiplies the even/odd outputs by i/-i, which combines with
+   the factor of -i (to take the imaginary part) to simply flip
+   the sign of the odd outputs.  Vice-versa for real-odd-10.
+
+   The FFTPACK source code was very helpful in working this out.
+   (They do unnecessary passes over the array, though.)  The same
+   algorithm is also described in:
+
+      John Makhoul, "A fast cosine transform in one and two dimensions,"
+      IEEE Trans. on Acoust. Speech and Sig. Proc., ASSP-28 (1), 27--34 (1980).
+
+   Note that Numerical Recipes suggests a different algorithm that
+   requires more operations and uses trig. functions for both the pre-
+   and post-processing passes.
+*/
+
+static void apply_re01(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb, wa, wb;
+	       a = I[is * i];
+	       b = I[is * (n - i)];
+	       apb = a + b;
+	       amb = a - b;
+	       wa = W[2*i];
+	       wb = W[2*i + 1];
+	       buf[i] = wa * amb + wb * apb; 
+	       buf[n - i] = wa * apb - wb * amb; 
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(2.0) * I[is * i] * W[2*i];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  O[0] = buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b;
+	       INT k;
+	       a = buf[i];
+	       b = buf[n - i];
+	       k = i + i;
+	       O[os * (k - 1)] = a - b;
+	       O[os * k] = a + b;
+	  }
+	  if (i == n - i) {
+	       O[os * (n - 1)] = buf[i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+/* ro01 is same as re01, but with i <-> n - 1 - i in the input and
+   the sign of the odd output elements flipped. */
+static void apply_ro01(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[is * (n - 1)];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb, wa, wb;
+	       a = I[is * (n - 1 - i)];
+	       b = I[is * (i - 1)];
+	       apb = a + b;
+	       amb = a - b;
+	       wa = W[2*i];
+	       wb = W[2*i+1];
+	       buf[i] = wa * amb + wb * apb; 
+	       buf[n - i] = wa * apb - wb * amb; 
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(2.0) * I[is * (i - 1)] * W[2*i];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  O[0] = buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b;
+	       INT k;
+	       a = buf[i];
+	       b = buf[n - i];
+	       k = i + i;
+	       O[os * (k - 1)] = b - a;
+	       O[os * k] = a + b;
+	  }
+	  if (i == n - i) {
+	       O[os * (n - 1)] = -buf[i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void apply_re10(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E u, v;
+	       INT k = i + i;
+	       u = I[is * (k - 1)];
+	       v = I[is * k];
+	       buf[n - i] = u;
+	       buf[i] = v;
+	  }
+	  if (i == n - i) {
+	       buf[i] = I[is * (n - 1)];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  O[0] = K(2.0) * buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, wa, wb;
+	       a = K(2.0) * buf[i];
+	       b = K(2.0) * buf[n - i];
+	       wa = W[2*i];
+	       wb = W[2*i + 1];
+	       O[os * i] = wa * a + wb * b;
+	       O[os * (n - i)] = wb * a - wa * b;
+	  }
+	  if (i == n - i) {
+	       O[os * i] = K(2.0) * buf[i] * W[2*i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+/* ro10 is same as re10, but with i <-> n - 1 - i in the output and
+   the sign of the odd input elements flipped. */
+static void apply_ro10(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = I[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E u, v;
+	       INT k = i + i;
+	       u = -I[is * (k - 1)];
+	       v = I[is * k];
+	       buf[n - i] = u;
+	       buf[i] = v;
+	  }
+	  if (i == n - i) {
+	       buf[i] = -I[is * (n - 1)];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  O[os * (n - 1)] = K(2.0) * buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, wa, wb;
+	       a = K(2.0) * buf[i];
+	       b = K(2.0) * buf[n - i];
+	       wa = W[2*i];
+	       wb = W[2*i + 1];
+	       O[os * (n - 1 - i)] = wa * a + wb * b;
+	       O[os * (i - 1)] = wb * a - wa * b;
+	  }
+	  if (i == n - i) {
+	       O[os * (i - 1)] = K(2.0) * buf[i] * W[2*i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr reodft010e_tw[] = {
+          { TW_COS, 0, 1 },
+          { TW_SIN, 0, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+
+     X(plan_awake)(ego->cld, wakefulness);
+
+     X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw, 
+		      4*ego->n, 1, ego->n/2+1);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%se-r2hc-%D%v%(%p%))",
+	      X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && (p->kind[0] == REDFT01 || p->kind[0] == REDFT10
+		 || p->kind[0] == RODFT01 || p->kind[0] == RODFT10)
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n;
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
+                                                   X(mktensor_0d)(),
+                                                   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     switch (p->kind[0]) {
+	 case REDFT01: pln = MKPLAN_RDFT(P, &padt, apply_re01); break;
+	 case REDFT10: pln = MKPLAN_RDFT(P, &padt, apply_re10); break;
+	 case RODFT01: pln = MKPLAN_RDFT(P, &padt, apply_ro01); break;
+	 case RODFT10: pln = MKPLAN_RDFT(P, &padt, apply_ro10); break;
+	 default: A(0); return (plan*)0;
+     }
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->td = 0;
+     pln->kind = p->kind[0];
+     
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.other = 4 + (n-1)/2 * 10 + (1 - n % 2) * 5;
+     if (p->kind[0] == REDFT01 || p->kind[0] == RODFT01) {
+	  ops.add = (n-1)/2 * 6;
+	  ops.mul = (n-1)/2 * 4 + (1 - n % 2) * 2;
+     }
+     else { /* 10 transforms */
+	  ops.add = (n-1)/2 * 2;
+	  ops.mul = 1 + (n-1)/2 * 6 + (1 - n % 2) * 2;
+     }
+     
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(reodft010e_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft11e-r2hc-odd.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft11e-r2hc-odd.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do an R{E,O}DFT11 problem via an R2HC problem of the same *odd* size,
+   with some permutations and post-processing, as described in:
+
+     S. C. Chan and K. L. Ho, "Fast algorithms for computing the
+     discrete cosine transform," IEEE Trans. Circuits Systems II:
+     Analog & Digital Sig. Proc. 39 (3), 185--190 (1992).
+
+   (For even sizes, see reodft11e-radix2.c.)  
+
+   This algorithm is related to the 8 x n prime-factor-algorithm (PFA)
+   decomposition of the size 8n "logical" DFT corresponding to the
+   R{EO}DFT11.
+
+   Aside from very confusing notation (several symbols are redefined
+   from one line to the next), be aware that this paper has some
+   errors.  In particular, the signs are wrong in Eqs. (34-35).  Also,
+   Eqs. (36-37) should be simply C(k) = C(2k + 1 mod N), and similarly
+   for S (or, equivalently, the second cases should have 2*N - 2*k - 1
+   instead of N - k - 1).  Note also that in their definition of the
+   DFT, similarly to FFTW's, the exponent's sign is -1, but they
+   forgot to correspondingly multiply S (the sine terms) by -1.
+*/
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+     rdft_kind kind;
+} P;
+
+static DK(SQRT2, +1.4142135623730950488016887242096980785696718753769);
+
+#define SGN_SET(x, i) ((i) % 2 ? -(x) : (x))
+
+static void apply_re11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n, n2 = n/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  {
+	       INT m;
+	       for (i = 0, m = n2; m < n; ++i, m += 4)
+		    buf[i] = I[is * m];
+	       for (; m < 2 * n; ++i, m += 4)
+		    buf[i] = -I[is * (2*n - m - 1)];
+	       for (; m < 3 * n; ++i, m += 4)
+		    buf[i] = -I[is * (m - 2*n)];
+	       for (; m < 4 * n; ++i, m += 4)
+		    buf[i] = I[is * (4*n - m - 1)];
+	       m -= 4 * n;
+	       for (; i < n; ++i, m += 4)
+		    buf[i] = I[is * m];
+	  }
+
+	  { /* child plan: R2HC of size n */
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* FIXME: strength-reduce loop by 4 to eliminate ugly sgn_set? */
+	  for (i = 0; i + i + 1 < n2; ++i) {
+	       INT k = i + i + 1;
+	       E c1, s1;
+	       E c2, s2;
+	       c1 = buf[k];
+	       c2 = buf[k + 1];
+	       s2 = buf[n - (k + 1)];
+	       s1 = buf[n - k];
+	       
+	       O[os * i] = SQRT2 * (SGN_SET(c1, (i+1)/2) +
+				    SGN_SET(s1, i/2));
+	       O[os * (n - (i+1))] = SQRT2 * (SGN_SET(c1, (n-i)/2) -
+					      SGN_SET(s1, (n-(i+1))/2));
+	       
+	       O[os * (n2 - (i+1))] = SQRT2 * (SGN_SET(c2, (n2-i)/2) -
+					       SGN_SET(s2, (n2-(i+1))/2));
+	       O[os * (n2 + (i+1))] = SQRT2 * (SGN_SET(c2, (n2+i+2)/2) +
+					       SGN_SET(s2, (n2+(i+1))/2));
+	  }
+	  if (i + i + 1 == n2) {
+	       E c, s;
+	       c = buf[n2];
+	       s = buf[n - n2];
+	       O[os * i] = SQRT2 * (SGN_SET(c, (i+1)/2) +
+				    SGN_SET(s, i/2));
+	       O[os * (n - (i+1))] = SQRT2 * (SGN_SET(c, (i+2)/2) +
+					      SGN_SET(s, (i+1)/2));
+	  }
+	  O[os * n2] = SQRT2 * SGN_SET(buf[0], (n2+1)/2);
+     }
+
+     X(ifree)(buf);
+}
+
+/* like for rodft01, rodft11 is obtained from redft11 by
+   reversing the input and flipping the sign of every other output. */
+static void apply_ro11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n, n2 = n/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  {
+	       INT m;
+	       for (i = 0, m = n2; m < n; ++i, m += 4)
+		    buf[i] = I[is * (n - 1 - m)];
+	       for (; m < 2 * n; ++i, m += 4)
+		    buf[i] = -I[is * (m - n)];
+	       for (; m < 3 * n; ++i, m += 4)
+		    buf[i] = -I[is * (3*n - 1 - m)];
+	       for (; m < 4 * n; ++i, m += 4)
+		    buf[i] = I[is * (m - 3*n)];
+	       m -= 4 * n;
+	       for (; i < n; ++i, m += 4)
+		    buf[i] = I[is * (n - 1 - m)];
+	  }
+
+	  { /* child plan: R2HC of size n */
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* FIXME: strength-reduce loop by 4 to eliminate ugly sgn_set? */
+	  for (i = 0; i + i + 1 < n2; ++i) {
+	       INT k = i + i + 1;
+	       INT j;
+	       E c1, s1;
+	       E c2, s2;
+	       c1 = buf[k];
+	       c2 = buf[k + 1];
+	       s2 = buf[n - (k + 1)];
+	       s1 = buf[n - k];
+	       
+	       O[os * i] = SQRT2 * (SGN_SET(c1, (i+1)/2 + i) +
+				    SGN_SET(s1, i/2 + i));
+	       O[os * (n - (i+1))] = SQRT2 * (SGN_SET(c1, (n-i)/2 + i) -
+					      SGN_SET(s1, (n-(i+1))/2 + i));
+	       
+	       j = n2 - (i+1);
+	       O[os * j] = SQRT2 * (SGN_SET(c2, (n2-i)/2 + j) -
+				    SGN_SET(s2, (n2-(i+1))/2 + j));
+	       O[os * (n2 + (i+1))] = SQRT2 * (SGN_SET(c2, (n2+i+2)/2 + j) +
+					       SGN_SET(s2, (n2+(i+1))/2 + j));
+	  }
+	  if (i + i + 1 == n2) {
+	       E c, s;
+	       c = buf[n2];
+	       s = buf[n - n2];
+	       O[os * i] = SQRT2 * (SGN_SET(c, (i+1)/2 + i) +
+				    SGN_SET(s, i/2 + i));
+	       O[os * (n - (i+1))] = SQRT2 * (SGN_SET(c, (i+2)/2 + i) +
+					      SGN_SET(s, (i+1)/2 + i));
+	  }
+	  O[os * n2] = SQRT2 * SGN_SET(buf[0], (n2+1)/2 + n2);
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%se-r2hc-odd-%D%v%(%p%))",
+	      X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->sz->dims[0].n % 2 == 1
+	     && (p->kind[0] == REDFT11 || p->kind[0] == RODFT11)
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n;
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
+                                                   X(mktensor_0d)(),
+                                                   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11);
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->kind = p->kind[0];
+     
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.add = n - 1;
+     ops.mul = n;
+     ops.other = 4*n;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(reodft11e_r2hc_odd_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft11e-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft11e-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do an R{E,O}DFT11 problem via an R2HC problem, with some
+   pre/post-processing ala FFTPACK.  Use a trick from: 
+
+     S. C. Chan and K. L. Ho, "Direct methods for computing discrete
+     sinusoidal transforms," IEE Proceedings F 137 (6), 433--442 (1990).
+
+   to re-express as an REDFT01 (DCT-III) problem.
+
+   NOTE: We no longer use this algorithm, because it turns out to suffer
+   a catastrophic loss of accuracy for certain inputs, apparently because
+   its post-processing multiplies the output by a cosine.  Near the zero
+   of the cosine, the REDFT01 must produce a near-singular output.
+*/
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     twid *td, *td2;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+     rdft_kind kind;
+} P;
+
+static void apply_re11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W;
+     R *buf;
+     E cur;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  /* I wish that this didn't require an extra pass. */
+	  /* FIXME: use recursive/cascade summation for better stability? */
+	  buf[n - 1] = cur = K(2.0) * I[is * (n - 1)];
+	  for (i = n - 1; i > 0; --i) {
+	       E curnew;
+	       buf[(i - 1)] = curnew = K(2.0) * I[is * (i - 1)] - cur;
+	       cur = curnew;
+	  }
+	  
+	  W = ego->td->W;
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb, wa, wb;
+	       a = buf[i];
+	       b = buf[n - i];
+	       apb = a + b;
+	       amb = a - b;
+	       wa = W[2*i];
+	       wb = W[2*i + 1];
+	       buf[i] = wa * amb + wb * apb; 
+	       buf[n - i] = wa * apb - wb * amb; 
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(2.0) * buf[i] * W[2*i];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  W = ego->td2->W;
+	  O[0] = W[0] * buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b;
+	       INT k;
+	       a = buf[i];
+	       b = buf[n - i];
+	       k = i + i;
+	       O[os * (k - 1)] = W[k - 1] * (a - b);
+	       O[os * k] = W[k] * (a + b);
+	  }
+	  if (i == n - i) {
+	       O[os * (n - 1)] = W[n - 1] * buf[i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+/* like for rodft01, rodft11 is obtained from redft11 by
+   reversing the input and flipping the sign of every other output. */
+static void apply_ro11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W;
+     R *buf;
+     E cur;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  /* I wish that this didn't require an extra pass. */
+	  /* FIXME: use recursive/cascade summation for better stability? */
+	  buf[n - 1] = cur = K(2.0) * I[0];
+	  for (i = n - 1; i > 0; --i) {
+	       E curnew;
+	       buf[(i - 1)] = curnew = K(2.0) * I[is * (n - i)] - cur;
+	       cur = curnew;
+	  }
+	  
+	  W = ego->td->W;
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb, wa, wb;
+	       a = buf[i];
+	       b = buf[n - i];
+	       apb = a + b;
+	       amb = a - b;
+	       wa = W[2*i];
+	       wb = W[2*i + 1];
+	       buf[i] = wa * amb + wb * apb; 
+	       buf[n - i] = wa * apb - wb * amb; 
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(2.0) * buf[i] * W[2*i];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  W = ego->td2->W;
+	  O[0] = W[0] * buf[0];
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b;
+	       INT k;
+	       a = buf[i];
+	       b = buf[n - i];
+	       k = i + i;
+	       O[os * (k - 1)] = W[k - 1] * (b - a);
+	       O[os * k] = W[k] * (a + b);
+	  }
+	  if (i == n - i) {
+	       O[os * (n - 1)] = -W[n - 1] * buf[i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr reodft010e_tw[] = {
+          { TW_COS, 0, 1 },
+          { TW_SIN, 0, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+     static const tw_instr reodft11e_tw[] = {
+          { TW_COS, 1, 1 },
+          { TW_NEXT, 2, 0 }
+     };
+
+     X(plan_awake)(ego->cld, wakefulness);
+
+     X(twiddle_awake)(wakefulness,
+		      &ego->td, reodft010e_tw, 4*ego->n, 1, ego->n/2+1);
+     X(twiddle_awake)(wakefulness,
+		      &ego->td2, reodft11e_tw, 8*ego->n, 1, ego->n * 2);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%se-r2hc-%D%v%(%p%))",
+	      X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && (p->kind[0] == REDFT11 || p->kind[0] == RODFT11)
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n;
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
+                                                   X(mktensor_0d)(),
+                                                   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11);
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->td = pln->td2 = 0;
+     pln->kind = p->kind[0];
+     
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.other = 5 + (n-1) * 2 + (n-1)/2 * 12 + (1 - n % 2) * 6;
+     ops.add = (n - 1) * 1 + (n-1)/2 * 6;
+     ops.mul = 2 + (n-1) * 1 + (n-1)/2 * 6 + (1 - n % 2) * 3;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(reodft11e_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/reodft11e-radix2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/reodft11e-radix2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do an R{E,O}DFT11 problem of *even* size by a pair of R2HC problems
+   of half the size, plus some pre/post-processing.  Use a trick from:
+
+   Zhongde Wang, "On computing the discrete Fourier and cosine transforms,"
+   IEEE Trans. Acoust. Speech Sig. Proc. ASSP-33 (4), 1341--1344 (1985).
+
+   to re-express as a pair of half-size REDFT01 (DCT-III) problems.  Our
+   implementation looks quite a bit different from the algorithm described
+   in the paper because we combined the paper's pre/post-processing with
+   the pre/post-processing used to turn REDFT01 into R2HC.  (Also, the
+   paper uses a DCT/DST pair, but we turn the DST into a DCT via the
+   usual reordering/sign-flip trick.  We additionally combined a couple
+   of the matrices/transformations of the paper into a single pass.)
+
+   NOTE: We originally used a simpler method by S. C. Chan and K. L. Ho
+   that turned out to have numerical problems; see reodft11e-r2hc.c.
+
+   (For odd sizes, see reodft11e-r2hc-odd.c.)
+*/
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     twid *td, *td2;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+     rdft_kind kind;
+} P;
+
+static void apply_re11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n, n2 = n/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *W2;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = K(2.0) * I[0];
+	  buf[n2] = K(2.0) * I[is * (n - 1)];
+	  for (i = 1; i + i < n2; ++i) {
+	       INT k = i + i;
+	       E a, b, a2, b2;
+	       {
+		    E u, v;
+		    u = I[is * (k - 1)];
+		    v = I[is * k];
+		    a = u + v;
+		    b2 = u - v;
+	       }
+	       {
+		    E u, v;
+		    u = I[is * (n - k - 1)];
+		    v = I[is * (n - k)];
+		    b = u + v;
+		    a2 = u - v;
+	       }
+	       {
+		    E wa, wb;
+		    wa = W[2*i];
+		    wb = W[2*i + 1];
+		    {
+			 E apb, amb;
+			 apb = a + b;
+			 amb = a - b;
+			 buf[i] = wa * amb + wb * apb; 
+			 buf[n2 - i] = wa * apb - wb * amb; 
+		    }
+		    {
+			 E apb, amb;
+			 apb = a2 + b2;
+			 amb = a2 - b2;
+			 buf[n2 + i] = wa * amb + wb * apb; 
+			 buf[n - i] = wa * apb - wb * amb; 
+		    }
+	       }
+	  }
+	  if (i + i == n2) {
+	       E u, v;
+	       u = I[is * (n2 - 1)];
+	       v = I[is * n2];
+	       buf[i] = (u + v) * (W[2*i] * K(2.0));
+	       buf[n - i] = (u - v) * (W[2*i] * K(2.0));
+	  }
+
+
+	  /* child plan: two r2hc's of size n/2 */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  W2 = ego->td2->W;
+	  { /* i == 0 case */
+	       E wa, wb;
+	       E a, b;
+	       wa = W2[0]; /* cos */
+	       wb = W2[1]; /* sin */
+	       a = buf[0];
+	       b = buf[n2];
+	       O[0] = wa * a + wb * b;
+	       O[os * (n - 1)] = wb * a - wa * b;
+	  }
+	  W2 += 2;
+	  for (i = 1; i + i < n2; ++i, W2 += 2) {
+	       INT k;
+	       E u, v, u2, v2;
+	       u = buf[i];
+	       v = buf[n2 - i];
+	       u2 = buf[n2 + i];
+	       v2 = buf[n - i];
+	       k = (i + i) - 1;
+	       {
+                    E wa, wb;
+                    E a, b;
+                    wa = W2[0]; /* cos */
+                    wb = W2[1]; /* sin */
+                    a = u - v;
+                    b = v2 - u2;
+                    O[os * k] = wa * a + wb * b;
+                    O[os * (n - 1 - k)] = wb * a - wa * b;
+               }
+	       ++k;
+	       W2 += 2;
+	       {
+		    E wa, wb;
+		    E a, b;
+		    wa = W2[0]; /* cos */
+		    wb = W2[1]; /* sin */
+		    a = u + v;
+		    b = u2 + v2;
+		    O[os * k] = wa * a + wb * b;
+		    O[os * (n - 1 - k)] = wb * a - wa * b;
+	       }
+	  }
+	  if (i + i == n2) {
+	       INT k = (i + i) - 1;
+	       E wa, wb;
+	       E a, b;
+	       wa = W2[0]; /* cos */
+	       wb = W2[1]; /* sin */
+	       a = buf[i];
+	       b = buf[n2 + i];
+	       O[os * k] = wa * a - wb * b;
+	       O[os * (n - 1 - k)] = wb * a + wa * b;
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+#if 0
+
+/* This version of apply_re11 uses REDFT01 child plans, more similar
+   to the original paper by Z. Wang.  We keep it around for reference
+   (it is simpler) and because it may become more efficient if we
+   ever implement REDFT01 codelets. */
+
+static void apply_re11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = K(2.0) * I[0];
+	  buf[n/2] = K(2.0) * I[is * (n - 1)];
+	  for (i = 1; i + i < n; ++i) {
+	       INT k = i + i;
+	       E a, b;
+	       a = I[is * (k - 1)];
+	       b = I[is * k];
+	       buf[i] = a + b;
+	       buf[n - i] = a - b;
+	  }
+
+	  /* child plan: two redft01's (DCT-III) */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  W = ego->td2->W;
+	  for (i = 0; i + 1 < n/2; ++i, W += 2) {
+	       {
+		    E wa, wb;
+		    E a, b;
+		    wa = W[0]; /* cos */
+		    wb = W[1]; /* sin */
+		    a = buf[i];
+		    b = buf[n/2 + i];
+		    O[os * i] = wa * a + wb * b;
+		    O[os * (n - 1 - i)] = wb * a - wa * b;
+	       }
+	       ++i;
+	       W += 2;
+	       {
+                    E wa, wb;
+                    E a, b;
+                    wa = W[0]; /* cos */
+                    wb = W[1]; /* sin */
+                    a = buf[i];
+                    b = buf[n/2 + i];
+                    O[os * i] = wa * a - wb * b;
+                    O[os * (n - 1 - i)] = wb * a + wa * b;
+               }
+	  }
+	  if (i < n/2) {
+	       E wa, wb;
+	       E a, b;
+	       wa = W[0]; /* cos */
+	       wb = W[1]; /* sin */
+	       a = buf[i];
+	       b = buf[n/2 + i];
+	       O[os * i] = wa * a + wb * b;
+	       O[os * (n - 1 - i)] = wb * a - wa * b;
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+#endif /* 0 */
+
+/* like for rodft01, rodft11 is obtained from redft11 by
+   reversing the input and flipping the sign of every other output. */
+static void apply_ro11(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n, n2 = n/2;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *W2;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = K(2.0) * I[is * (n - 1)];
+	  buf[n2] = K(2.0) * I[0];
+	  for (i = 1; i + i < n2; ++i) {
+	       INT k = i + i;
+	       E a, b, a2, b2;
+	       {
+		    E u, v;
+		    u = I[is * (n - k)];
+		    v = I[is * (n - 1 - k)];
+		    a = u + v;
+		    b2 = u - v;
+	       }
+	       {
+		    E u, v;
+		    u = I[is * (k)];
+		    v = I[is * (k - 1)];
+		    b = u + v;
+		    a2 = u - v;
+	       }
+	       {
+		    E wa, wb;
+		    wa = W[2*i];
+		    wb = W[2*i + 1];
+		    {
+			 E apb, amb;
+			 apb = a + b;
+			 amb = a - b;
+			 buf[i] = wa * amb + wb * apb; 
+			 buf[n2 - i] = wa * apb - wb * amb; 
+		    }
+		    {
+			 E apb, amb;
+			 apb = a2 + b2;
+			 amb = a2 - b2;
+			 buf[n2 + i] = wa * amb + wb * apb; 
+			 buf[n - i] = wa * apb - wb * amb; 
+		    }
+	       }
+	  }
+	  if (i + i == n2) {
+	       E u, v;
+	       u = I[is * n2];
+	       v = I[is * (n2 - 1)];
+	       buf[i] = (u + v) * (W[2*i] * K(2.0));
+	       buf[n - i] = (u - v) * (W[2*i] * K(2.0));
+	  }
+
+
+	  /* child plan: two r2hc's of size n/2 */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  W2 = ego->td2->W;
+	  { /* i == 0 case */
+	       E wa, wb;
+	       E a, b;
+	       wa = W2[0]; /* cos */
+	       wb = W2[1]; /* sin */
+	       a = buf[0];
+	       b = buf[n2];
+	       O[0] = wa * a + wb * b;
+	       O[os * (n - 1)] = wa * b - wb * a;
+	  }
+	  W2 += 2;
+	  for (i = 1; i + i < n2; ++i, W2 += 2) {
+	       INT k;
+	       E u, v, u2, v2;
+	       u = buf[i];
+	       v = buf[n2 - i];
+	       u2 = buf[n2 + i];
+	       v2 = buf[n - i];
+	       k = (i + i) - 1;
+	       {
+                    E wa, wb;
+                    E a, b;
+                    wa = W2[0]; /* cos */
+                    wb = W2[1]; /* sin */
+                    a = v - u;
+                    b = u2 - v2;
+                    O[os * k] = wa * a + wb * b;
+                    O[os * (n - 1 - k)] = wa * b - wb * a;
+               }
+	       ++k;
+	       W2 += 2;
+	       {
+		    E wa, wb;
+		    E a, b;
+		    wa = W2[0]; /* cos */
+		    wb = W2[1]; /* sin */
+		    a = u + v;
+		    b = u2 + v2;
+		    O[os * k] = wa * a + wb * b;
+		    O[os * (n - 1 - k)] = wa * b - wb * a;
+	       }
+	  }
+	  if (i + i == n2) {
+	       INT k = (i + i) - 1;
+	       E wa, wb;
+	       E a, b;
+	       wa = W2[0]; /* cos */
+	       wb = W2[1]; /* sin */
+	       a = buf[i];
+	       b = buf[n2 + i];
+	       O[os * k] = wb * b - wa * a;
+	       O[os * (n - 1 - k)] = wa * b + wb * a;
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr reodft010e_tw[] = {
+          { TW_COS, 0, 1 },
+          { TW_SIN, 0, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+     static const tw_instr reodft11e_tw[] = {
+          { TW_COS, 1, 1 },
+          { TW_SIN, 1, 1 },
+          { TW_NEXT, 2, 0 }
+     };
+
+     X(plan_awake)(ego->cld, wakefulness);
+
+     X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw, 
+		      2*ego->n, 1, ego->n/4+1);
+     X(twiddle_awake)(wakefulness, &ego->td2, reodft11e_tw, 
+		      8*ego->n, 1, ego->n);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(%se-radix2-r2hc-%D%v%(%p%))",
+	      X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->sz->dims[0].n % 2 == 0
+	     && (p->kind[0] == REDFT11 || p->kind[0] == RODFT11)
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n;
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n/2, 1, 1),
+                                                   X(mktensor_1d)(2, n/2, n/2),
+                                                   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11);
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->td = pln->td2 = 0;
+     pln->kind = p->kind[0];
+     
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.add = 2 + (n/2 - 1)/2 * 20;
+     ops.mul = 6 + (n/2 - 1)/2 * 16;
+     ops.other = 4*n + 2 + (n/2 - 1)/2 * 6;
+     if ((n/2) % 2 == 0) {
+	  ops.add += 4;
+	  ops.mul += 8;
+	  ops.other += 4;
+     }
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(reodft11e_radix2_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/rodft00e-r2hc-pad.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/rodft00e-r2hc-pad.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do a RODFT00 problem via an R2HC problem, padded antisymmetrically to
+   twice the size.  This is asymptotically a factor of ~2 worse than
+   rodft00e-r2hc.c (the algorithm used in e.g. FFTPACK and Numerical
+   Recipes), but we abandoned the latter after we discovered that it
+   has intrinsic accuracy problems. */
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld, *cldcpy;
+     INT is;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * (2*n), BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = K(0.0);
+	  for (i = 1; i < n; ++i) {
+	       R a = I[(i-1) * is];
+	       buf[i] = -a;
+	       buf[2*n - i] = a;
+	  }
+	  buf[i] = K(0.0); /* i == n, Nyquist */
+	  
+	  /* r2hc transform of size 2*n */
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* copy n-1 real numbers (imag. parts of hc array) from buf to O */
+	  {
+	       plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
+	       cldcpy->apply((plan *) cldcpy, buf+2*n-1, O);
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldcpy, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldcpy);
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rodft00e-r2hc-pad-%D%v%(%p%)%(%p%))", 
+	      ego->n - 1, ego->vl, ego->cld, ego->cldcpy);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->kind[0] == RODFT00
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld = (plan *) 0, *cldcpy;
+     R *buf = (R *) 0;
+     INT n;
+     INT vl, ivs, ovs;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+	  goto nada;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n + 1;
+     A(n > 0);
+     buf = (R *) MALLOC(sizeof(R) * (2*n), BUFFERS);
+
+     cld = X(mkplan_d)(plnr,X(mkproblem_rdft_1_d)(X(mktensor_1d)(2*n,1,1), 
+						  X(mktensor_0d)(), 
+						  buf, buf, R2HC));
+     if (!cld)
+	  goto nada;
+
+     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
+     cldcpy =
+	  X(mkplan_d)(plnr,
+		      X(mkproblem_rdft_1_d)(X(mktensor_0d)(),
+					    X(mktensor_1d)(n-1,-1,
+							   p->sz->dims[0].os), 
+					    buf+2*n-1,TAINT(p->O, ovs), R2HC));
+     if (!cldcpy)
+	  goto nada;
+
+     X(ifree)(buf);
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->cld = cld;
+     pln->cldcpy = cldcpy;
+     pln->vl = vl;
+     pln->ivs = ivs;
+     pln->ovs = ovs;
+     
+     X(ops_zero)(&ops);
+     ops.other = n-1 + 2*n; /* loads + stores (input -> buf) */
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cldcpy->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(ifree0)(buf);
+     if (cld)
+	  X(plan_destroy_internal)(cld);  
+     return (plan *)0;
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(rodft00e_r2hc_pad_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/reodft/rodft00e-r2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/reodft/rodft00e-r2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* Do a RODFT00 problem via an R2HC problem, with some pre/post-processing.
+
+   This code uses the trick from FFTPACK, also documented in a similar
+   form by Numerical Recipes.  Unfortunately, this algorithm seems to
+   have intrinsic numerical problems (similar to those in
+   reodft11e-r2hc.c), possibly due to the fact that it multiplies its
+   input by a sine, causing a loss of precision near the zero.  For
+   transforms of 16k points, it has already lost three or four decimal
+   places of accuracy, which we deem unacceptable.
+
+   So, we have abandoned this algorithm in favor of the one in
+   rodft00-r2hc-pad.c, which unfortunately sacrifices 30-50% in speed.
+   The only other alternative in the literature that does not have
+   similar numerical difficulties seems to be the direct adaptation of
+   the Cooley-Tukey decomposition for antisymmetric data, but this
+   would require a whole new set of codelets and it's not clear that
+   it's worth it at this point.  However, we did implement the latter
+   algorithm for the specific case of odd n (logically adapting the
+   split-radix algorithm); see reodft00e-splitradix.c. */
+
+#include "reodft.h"
+
+typedef struct {
+     solver super;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     twid *td;
+     INT is, os;
+     INT n;
+     INT vl;
+     INT ivs, ovs;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     INT is = ego->is, os = ego->os;
+     INT i, n = ego->n;
+     INT iv, vl = ego->vl;
+     INT ivs = ego->ivs, ovs = ego->ovs;
+     R *W = ego->td->W;
+     R *buf;
+
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
+	  buf[0] = 0;
+	  for (i = 1; i < n - i; ++i) {
+	       E a, b, apb, amb;
+	       a = I[is * (i - 1)];
+	       b = I[is * ((n - i) - 1)];
+	       apb =  K(2.0) * W[i] * (a + b);
+	       amb = (a - b);
+	       buf[i] = apb + amb;
+	       buf[n - i] = apb - amb;
+	  }
+	  if (i == n - i) {
+	       buf[i] = K(4.0) * I[is * (i - 1)];
+	  }
+	  
+	  {
+	       plan_rdft *cld = (plan_rdft *) ego->cld;
+	       cld->apply((plan *) cld, buf, buf);
+	  }
+	  
+	  /* FIXME: use recursive/cascade summation for better stability? */
+	  O[0] = buf[0] * 0.5;
+	  for (i = 1; i + i < n - 1; ++i) {
+	       INT k = i + i;
+	       O[os * (k - 1)] = -buf[n - i];
+	       O[os * k] = O[os * (k - 2)] + buf[i];
+	  }
+	  if (i + i == n - 1) {
+	       O[os * (n - 2)] = -buf[n - i];
+	  }
+     }
+
+     X(ifree)(buf);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     static const tw_instr rodft00e_tw[] = {
+          { TW_SIN, 0, 1 },
+          { TW_NEXT, 1, 0 }
+     };
+
+     X(plan_awake)(ego->cld, wakefulness);
+
+     X(twiddle_awake)(wakefulness,
+		      &ego->td, rodft00e_tw, 2*ego->n, 1, (ego->n+1)/2);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(rodft00e-r2hc-%D%v%(%p%))", ego->n - 1, ego->vl, ego->cld);
+}
+
+static int applicable0(const solver *ego_, const problem *p_)
+{
+     const problem_rdft *p = (const problem_rdft *) p_;
+     UNUSED(ego_);
+
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk <= 1
+	     && p->kind[0] == RODFT00
+	  );
+}
+
+static int applicable(const solver *ego, const problem *p, const planner *plnr)
+{
+     return (!NO_SLOWP(plnr) && applicable0(ego, p));
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     P *pln;
+     const problem_rdft *p;
+     plan *cld;
+     R *buf;
+     INT n;
+     opcnt ops;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr))
+          return (plan *)0;
+
+     p = (const problem_rdft *) p_;
+
+     n = p->sz->dims[0].n + 1;
+     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
+
+     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
+                                                   X(mktensor_0d)(),
+                                                   buf, buf, R2HC));
+     X(ifree)(buf);
+     if (!cld)
+          return (plan *)0;
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->n = n;
+     pln->is = p->sz->dims[0].is;
+     pln->os = p->sz->dims[0].os;
+     pln->cld = cld;
+     pln->td = 0;
+     
+     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
+     
+     X(ops_zero)(&ops);
+     ops.other = 4 + (n-1)/2 * 5 + (n-2)/2 * 5;
+     ops.add = (n-1)/2 * 4 + (n-2)/2 * 1;
+     ops.mul = 1 + (n-1)/2 * 2;
+     if (n % 2 == 0)
+	  ops.mul += 1;
+
+     X(ops_zero)(&pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
+     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+}
+
+/* constructor */
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     return &(slv->super);
+}
+
+void X(rodft00e_r2hc_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,12 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel
+noinst_LTLIBRARIES = libsimd_support.la libsimd_sse2_nonportable.la
+
+libsimd_support_la_SOURCES = taint.c simd-common.h simd-sse2.h sse2.c	\
+x86-cpuid.h amd64-cpuid.h avx.c simd-avx.h altivec.c simd-altivec.h	\
+neon.c simd-neon.h
+
+# sse2-nonportable.c needs SSE2_CFLAGS, but Automake does not support
+# per-object CFLAGS.  Thus we build a separate library.
+libsimd_sse2_nonportable_la_CFLAGS = $(SSE2_CFLAGS)
+libsimd_sse2_nonportable_la_SOURCES = sse2-nonportable.c 
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,567 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = simd-support
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+libsimd_sse2_nonportable_la_LIBADD =
+am_libsimd_sse2_nonportable_la_OBJECTS =  \
+	libsimd_sse2_nonportable_la-sse2-nonportable.lo
+libsimd_sse2_nonportable_la_OBJECTS =  \
+	$(am_libsimd_sse2_nonportable_la_OBJECTS)
+libsimd_sse2_nonportable_la_LINK = $(LIBTOOL) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libsimd_sse2_nonportable_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+libsimd_support_la_LIBADD =
+am_libsimd_support_la_OBJECTS = taint.lo sse2.lo avx.lo altivec.lo \
+	neon.lo
+libsimd_support_la_OBJECTS = $(am_libsimd_support_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libsimd_sse2_nonportable_la_SOURCES) \
+	$(libsimd_support_la_SOURCES)
+DIST_SOURCES = $(libsimd_sse2_nonportable_la_SOURCES) \
+	$(libsimd_support_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel
+noinst_LTLIBRARIES = libsimd_support.la libsimd_sse2_nonportable.la
+libsimd_support_la_SOURCES = taint.c simd-common.h simd-sse2.h sse2.c	\
+x86-cpuid.h amd64-cpuid.h avx.c simd-avx.h altivec.c simd-altivec.h	\
+neon.c simd-neon.h
+
+
+# sse2-nonportable.c needs SSE2_CFLAGS, but Automake does not support
+# per-object CFLAGS.  Thus we build a separate library.
+libsimd_sse2_nonportable_la_CFLAGS = $(SSE2_CFLAGS)
+libsimd_sse2_nonportable_la_SOURCES = sse2-nonportable.c 
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu simd-support/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu simd-support/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libsimd_sse2_nonportable.la: $(libsimd_sse2_nonportable_la_OBJECTS) $(libsimd_sse2_nonportable_la_DEPENDENCIES) $(EXTRA_libsimd_sse2_nonportable_la_DEPENDENCIES) 
+	$(libsimd_sse2_nonportable_la_LINK)  $(libsimd_sse2_nonportable_la_OBJECTS) $(libsimd_sse2_nonportable_la_LIBADD) $(LIBS)
+libsimd_support.la: $(libsimd_support_la_OBJECTS) $(libsimd_support_la_DEPENDENCIES) $(EXTRA_libsimd_support_la_DEPENDENCIES) 
+	$(LINK)  $(libsimd_support_la_OBJECTS) $(libsimd_support_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/altivec.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsimd_sse2_nonportable_la-sse2-nonportable.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/neon.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sse2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/taint.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+libsimd_sse2_nonportable_la-sse2-nonportable.lo: sse2-nonportable.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_sse2_nonportable_la_CFLAGS) $(CFLAGS) -MT libsimd_sse2_nonportable_la-sse2-nonportable.lo -MD -MP -MF $(DEPDIR)/libsimd_sse2_nonportable_la-sse2-nonportable.Tpo -c -o libsimd_sse2_nonportable_la-sse2-nonportable.lo `test -f 'sse2-nonportable.c' || echo '$(srcdir)/'`sse2-nonportable.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libsimd_sse2_nonportable_la-sse2-nonportable.Tpo $(DEPDIR)/libsimd_sse2_nonportable_la-sse2-nonportable.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='sse2-nonportable.c' object='libsimd_sse2_nonportable_la-sse2-nonportable.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsimd_sse2_nonportable_la_CFLAGS) $(CFLAGS) -c -o libsimd_sse2_nonportable_la-sse2-nonportable.lo `test -f 'sse2-nonportable.c' || echo '$(srcdir)/'`sse2-nonportable.c
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+	pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/altivec.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/altivec.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#if HAVE_ALTIVEC
+
+#if HAVE_SYS_SYSCTL_H
+#  include <sys/sysctl.h>
+#endif
+
+#if HAVE_SYS_SYSCTL_H && HAVE_SYSCTL && defined(CTL_HW) && defined(HW_VECTORUNIT)
+/* code for darwin */
+static int really_have_altivec(void)
+{
+     int mib[2], altivecp;
+     size_t len;
+     mib[0] = CTL_HW;
+     mib[1] = HW_VECTORUNIT;
+     len = sizeof(altivecp);
+     sysctl(mib, 2, &altivecp, &len, NULL, 0);
+     return altivecp;
+} 
+#else /* GNU/Linux and other non-Darwin systems (!HAVE_SYS_SYSCTL_H etc.) */
+
+#include <signal.h>
+#include <setjmp.h>
+
+static jmp_buf jb;
+
+static void sighandler(int x)
+{
+     longjmp(jb, 1);
+}
+
+static int really_have_altivec(void)
+{
+     void (*oldsig)(int);
+     oldsig = signal(SIGILL, sighandler);
+     if (setjmp(jb)) {
+	  signal(SIGILL, oldsig);
+	  return 0;
+     } else {
+	  __asm__ __volatile__ (".long 0x10000484"); /* vor 0,0,0 */
+	  signal(SIGILL, oldsig);
+	  return 1;
+     }
+     return 0;
+}
+#endif
+
+int X(have_simd_altivec)(void)
+{
+     static int init = 0, res;
+     if (!init) {
+	  res = really_have_altivec();
+	  init = 1;
+     }
+     return res;
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/amd64-cpuid.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/amd64-cpuid.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#ifdef _MSC_VER
+#ifndef inline
+#define inline __inline
+#endif
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (_MSC_VER >= 1600) && !defined(__INTEL_COMPILER)
+#include <immintrin.h>
+#endif
+#endif
+
+static inline int cpuid_ecx(int op)
+{
+#    ifdef _MSC_VER
+#    ifdef __INTEL_COMPILER
+     int result;
+     _asm {
+	  push rbx
+          mov eax,op
+          cpuid
+          mov result,ecx
+          pop rbx
+     }
+     return result;
+#    else
+     int cpu_info[4];
+     __cpuid(cpu_info,op);
+     return cpu_info[2];
+#    endif
+#    else
+     int eax, ecx, edx;
+
+     __asm__("pushq %%rbx\n\tcpuid\n\tpopq %%rbx"
+	     : "=a" (eax), "=c" (ecx), "=d" (edx)
+	     : "a" (op));
+     return ecx;
+#    endif
+}
+
+static inline int xgetbv_eax(int op)
+{
+#    ifdef _MSC_VER
+#    ifdef __INTEL_COMPILER
+     int veax, vedx;
+     _asm {
+          mov ecx,op
+          xgetbv
+          mov veax,eax
+          mov vedx,edx
+     }
+     return veax;
+#    else
+#    if defined(_MSC_VER) && (_MSC_VER >= 1600)
+     unsigned __int64 result;
+     result = _xgetbv(op);
+     return (int)result;
+#    else
+#    error "Need at least Visual Studio 10 SP1 for AVX support"
+#    endif
+#    endif
+#    else
+     int eax, edx;
+     __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
+     return eax;
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/avx.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/avx.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#if HAVE_AVX
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
+
+#include "amd64-cpuid.h"
+
+int X(have_simd_avx)(void)
+{
+       static int init = 0, res;
+
+       if (!init) {
+	    res = 1 
+		 && ((cpuid_ecx(1) & 0x18000000) == 0x18000000)
+		 && ((xgetbv_eax(0) & 0x6) == 0x6);
+	    init = 1;
+       }
+       return res;
+}
+
+#else /* 32-bit code */
+
+#include "x86-cpuid.h"
+
+int X(have_simd_avx)(void)
+{
+       static int init = 0, res;
+
+       if (!init) {
+	    res =   !is_386() 
+		 && has_cpuid()
+		 && ((cpuid_ecx(1) & 0x18000000) == 0x18000000)
+		 && ((xgetbv_eax(0) & 0x6) == 0x6);
+	    init = 1;
+       }
+       return res;
+}
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/neon.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/neon.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#if HAVE_NEON
+
+/* check for an environment where signals are known to work */
+#if defined(unix) || defined(linux)
+  # include <signal.h>
+  # include <setjmp.h>
+
+  static jmp_buf jb;
+
+  static void sighandler(int x)
+  {
+       UNUSED(x);
+       longjmp(jb, 1);
+  }
+
+  static int really_have_neon(void)
+  {
+       void (*oldsig)(int);
+       oldsig = signal(SIGILL, sighandler);
+       if (setjmp(jb)) {
+	    signal(SIGILL, oldsig);
+	    return 0;
+       } else {
+	    /* paranoia: encode the instruction in binary because the
+	       assembler may not recognize it without -mfpu=neon */
+	    /*asm volatile ("vand q0, q0, q0");*/
+	    asm volatile (".long 0xf2000150");
+	    signal(SIGILL, oldsig);
+	    return 1;
+       }
+  }
+
+  extern void X(check_alignment_of_sse2_pm)(void);
+
+  int X(have_simd_neon)(void)
+  {
+       static int init = 0, res;
+
+       if (!init) {
+	    res = really_have_neon();
+	    init = 1;
+       }
+       return res;
+  }
+
+
+#else
+/* don't know how to autodetect NEON; assume it is present */
+  int X(have_simd_neon)(void)
+  {
+       return 1;
+  }
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/simd-altivec.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/simd-altivec.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef FFTW_SINGLE
+#error "ALTIVEC only works in single precision"
+#endif
+
+/* define these unconditionally, because they are used by
+   taint.c which is compiled without altivec */
+#define SIMD_SUFFIX _altivec  /* for renaming */
+#define VL 2            /* SIMD complex vector length */
+#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA
+
+#if !defined(__VEC__) && !defined(FAKE__VEC__)
+#  error "compiling simd-altivec.h requires -maltivec or equivalent"
+#endif
+
+#ifdef HAVE_ALTIVEC_H
+#  include <altivec.h>
+#endif
+
+typedef vector float V;
+#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
+#define LDK(x) x
+#define DVK(var, val) const V var = VLIT(val, val, val, val)
+
+static inline V VADD(V a, V b) { return vec_add(a, b); }
+static inline V VSUB(V a, V b) { return vec_sub(a, b); }
+static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); }
+static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); }
+
+static inline V VMUL(V a, V b)
+{
+     DVK(zero, -0.0);
+     return VFMA(a, b, zero);
+}
+
+static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like) 
+{
+     UNUSED(ivs);
+     UNUSED(aligned_like);
+     return vec_ld(0, x);
+}
+
+static inline V LD(const R *x, INT ivs, const R *aligned_like) 
+{
+     /* common subexpressions */
+     const INT fivs = sizeof(R) * ivs;
+       /* you are not expected to understand this: */
+     const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
+     vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
+     vector unsigned char mh = vec_lvsl(0, aligned_like);
+     vector unsigned char msk = 
+	  (vector unsigned char)vec_sel((V)mh, (V)ml, perm);
+     /* end of common subexpressions */
+
+     return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
+}
+
+/* store lower half */
+static inline void STH(R *x, V v, R *aligned_like)
+{
+     v = vec_perm(v, v, vec_lvsr(0, aligned_like));
+     vec_ste(v, 0, x);
+     vec_ste(v, sizeof(R), x);
+}
+
+static inline void STL(R *x, V v, INT ovs, R *aligned_like)
+{
+     const INT fovs = sizeof(R) * ovs;
+     v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
+     vec_ste(v, fovs, x);
+     vec_ste(v, sizeof(R) + fovs, x);
+}
+
+static inline void STA(R *x, V v, INT ovs, R *aligned_like) 
+{
+     UNUSED(ovs);
+     UNUSED(aligned_like);
+     vec_st(v, 0, x);
+}
+
+static inline void ST(R *x, V v, INT ovs, R *aligned_like) 
+{
+     /* WARNING: the extra_iter hack depends upon STH occurring after
+	STL */
+     STL(x, v, ovs, aligned_like);
+     STH(x, v, aligned_like);
+}
+
+#define STM2(x, v, ovs, aligned_like) /* no-op */
+
+static inline void STN2(R *x, V v0, V v1, INT ovs)
+{
+     const INT fovs = sizeof(R) * ovs;
+     const vector unsigned int even = 
+	  VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
+     const vector unsigned int odd = 
+	  VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
+     vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
+     vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
+}
+
+#define STM4(x, v, ovs, aligned_like) /* no-op */
+
+static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
+{
+     const INT fovs = sizeof(R) * ovs;
+     V x0 = vec_mergeh(v0, v2);
+     V x1 = vec_mergel(v0, v2);
+     V x2 = vec_mergeh(v1, v3);
+     V x3 = vec_mergel(v1, v3);
+     V y0 = vec_mergeh(x0, x2);
+     V y1 = vec_mergel(x0, x2);
+     V y2 = vec_mergeh(x1, x3);
+     V y3 = vec_mergel(x1, x3);
+     vec_st(y0, 0, x);
+     vec_st(y1, fovs, x);
+     vec_st(y2, 2 * fovs, x);
+     vec_st(y3, 3 * fovs, x);
+}
+
+static inline V FLIP_RI(V x)
+{
+     const vector unsigned int perm = 
+	  VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
+     return vec_perm(x, x, (vector unsigned char)perm);
+}
+
+static inline V VCONJ(V x)
+{
+     const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
+     return vec_xor(x, pmpm);
+}
+
+static inline V VBYI(V x)
+{
+     return FLIP_RI(VCONJ(x));
+}
+
+static inline V VFMAI(V b, V c)
+{
+     const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
+     return VFMA(FLIP_RI(b), mpmp, c);
+}
+
+static inline V VFNMSI(V b, V c)
+{
+     const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
+     return VFNMS(FLIP_RI(b), mpmp, c);
+}
+
+static inline V VFMACONJ(V b, V c)
+{
+     const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
+     return VFMA(b, pmpm, c);
+}
+
+static inline V VFNMSCONJ(V b, V c)
+{
+     const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
+     return VFNMS(b, pmpm, c);
+}
+
+static inline V VFMSCONJ(V b, V c)
+{
+     return VSUB(VCONJ(b), c);
+}
+
+static inline V VZMUL(V tx, V sr)
+{
+     const vector unsigned int real = 
+	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
+     const vector unsigned int imag = 
+	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
+     V si = VBYI(sr);
+     V tr = vec_perm(tx, tx, (vector unsigned char)real);
+     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
+     return VFMA(ti, si, VMUL(tr, sr));
+}
+
+static inline V VZMULJ(V tx, V sr)
+{
+     const vector unsigned int real = 
+	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
+     const vector unsigned int imag = 
+	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
+     V si = VBYI(sr);
+     V tr = vec_perm(tx, tx, (vector unsigned char)real);
+     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+static inline V VZMULI(V tx, V si)
+{
+     const vector unsigned int real = 
+	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
+     const vector unsigned int imag = 
+	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
+     V sr = VBYI(si);
+     V tr = vec_perm(tx, tx, (vector unsigned char)real);
+     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+static inline V VZMULIJ(V tx, V si)
+{
+     const vector unsigned int real = 
+	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
+     const vector unsigned int imag = 
+	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
+     V sr = VBYI(si);
+     V tr = vec_perm(tx, tx, (vector unsigned char)real);
+     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
+     return VFMA(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #1: compact, slower */
+#define VTW1(v,x) \
+ {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
+#define TWVL1 (VL)
+
+static inline V BYTW1(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = VBYI(sr);
+     V tx = twp[0];
+     V tr = vec_mergeh(tx, tx);
+     V ti = vec_mergel(tx, tx);
+     return VFMA(ti, si, VMUL(tr, sr));
+}
+
+static inline V BYTWJ1(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = VBYI(sr);
+     V tx = twp[0];
+     V tr = vec_mergeh(tx, tx);
+     V ti = vec_mergel(tx, tx);
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#define VTW2(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
+  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
+#define TWVL2 (2 * VL)
+
+static inline V BYTW2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFMA(ti, si, VMUL(tr, sr));
+}
+
+static inline V BYTWJ2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#define TWVL3 (VL)
+
+/* twiddle storage for split arrays */
+#define VTWS(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},	\
+  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
+#define TWVLS (2 * VL)
+
+#define VLEAVE() /* nothing */
+
+#include "simd-common.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/simd-avx.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/simd-avx.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
+#error "AVX only works in single or double precision"
+#endif
+
+#ifdef FFTW_SINGLE
+#  define DS(d,s) s /* single-precision option */
+#  define SUFF(name) name ## s
+#else
+#  define DS(d,s) d /* double-precision option */
+#  define SUFF(name) name ## d
+#endif
+
+#define SIMD_SUFFIX  _avx  /* for renaming */
+#define VL DS(2, 4)        /* SIMD complex vector length */
+#define SIMD_VSTRIDE_OKA(x) ((x) == 2) 
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
+
+#if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
+#error "compiling simd-avx.h without -mavx"
+#endif
+
+#ifdef _MSC_VER
+#ifndef inline
+#define inline __inline
+#endif
+#endif
+
+#include <immintrin.h>
+
+typedef DS(__m256d, __m256) V;
+#define VADD SUFF(_mm256_add_p)
+#define VSUB SUFF(_mm256_sub_p)
+#define VMUL SUFF(_mm256_mul_p)
+#define VXOR SUFF(_mm256_xor_p)
+#define VSHUF SUFF(_mm256_shuffle_p)
+
+#define SHUFVALD(fp0,fp1) \
+   (((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
+#define SHUFVALS(fp0,fp1,fp2,fp3) \
+   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
+#define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))
+
+#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
+#define DVK(var, val) V var = VLIT(val, val)
+#define LDK(x) x
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ivs; /* UNUSED */
+     return SUFF(_mm256_loadu_p)(x);
+}
+
+static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ovs; /* UNUSED */
+     SUFF(_mm256_storeu_p)(x, v);
+}
+
+#if FFTW_SINGLE
+
+#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
+#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
+#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
+#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
+
+/* it seems like the only AVX way to store 4 complex floats is to
+   extract two pairs of complex floats into two __m128 registers, and
+   then use SSE-like half-stores.  Similarly, to load 4 complex
+   floats, we load two pairs of complex floats into two __m128
+   registers, and then pack the two __m128 registers into one __m256
+   value. */
+static inline V LD(const R *x, INT ivs, const R *aligned_like)
+{
+     __m128 l, h;
+     V v;
+     (void)aligned_like; /* UNUSED */
+     l = LOADL(x, l);
+     l = LOADH(x + ivs, l);
+     h = LOADL(x + 2*ivs, h);
+     h = LOADH(x + 3*ivs, h);
+     v = _mm256_castps128_ps256(l);
+     v = _mm256_insertf128_ps(v, h, 1);
+     return v;
+}
+
+static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
+{
+     __m128 h = _mm256_extractf128_ps(v, 1);
+     __m128 l = _mm256_castps256_ps128(v);
+     (void)aligned_like; /* UNUSED */
+     /* WARNING: the extra_iter hack depends upon STOREL occurring
+	after STOREH */
+     STOREH(x + 3*ovs, h);
+     STOREL(x + 2*ovs, h);
+     STOREH(x + ovs, l);
+     STOREL(x, l);
+}
+
+#define STM2(x, v, ovs, aligned_like) /* no-op */
+static inline void STN2(R *x, V v0, V v1, INT ovs)
+{
+    V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
+    V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
+    __m128 h0 = _mm256_extractf128_ps(x0, 1);
+    __m128 l0 = _mm256_castps256_ps128(x0);
+    __m128 h1 = _mm256_extractf128_ps(x1, 1);
+    __m128 l1 = _mm256_castps256_ps128(x1);
+    *(__m128 *)(x + 3*ovs) = h1;
+    *(__m128 *)(x + 2*ovs) = h0;
+    *(__m128 *)(x + 1*ovs) = l1;
+    *(__m128 *)(x + 0*ovs) = l0;
+}
+
+#define STM4(x, v, ovs, aligned_like) /* no-op */
+#define STN4(x, v0, v1, v2, v3, ovs)				\
+{								\
+     V xxx0, xxx1, xxx2, xxx3;					\
+     V yyy0, yyy1, yyy2, yyy3;					\
+     xxx0 = _mm256_unpacklo_ps(v0, v2);				\
+     xxx1 = _mm256_unpackhi_ps(v0, v2);				\
+     xxx2 = _mm256_unpacklo_ps(v1, v3);				\
+     xxx3 = _mm256_unpackhi_ps(v1, v3);				\
+     yyy0 = _mm256_unpacklo_ps(xxx0, xxx2);			\
+     yyy1 = _mm256_unpackhi_ps(xxx0, xxx2);			\
+     yyy2 = _mm256_unpacklo_ps(xxx1, xxx3);			\
+     yyy3 = _mm256_unpackhi_ps(xxx1, xxx3);			\
+     *(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0);	\
+     *(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1);	\
+     *(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1);	\
+     *(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1);	\
+     *(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2);	\
+     *(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1);	\
+     *(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3);	\
+     *(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1);	\
+}
+
+#else
+static inline __m128d VMOVAPD_LD(const R *x)
+{
+     /* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
+	into a 256-bit vmovapd, which requires 32-byte aligment instead of
+	16-byte alignment.
+
+	Force the use of vmovapd via asm until compilers stabilize.
+     */
+#if defined(__GNUC__)
+     __m128d var;
+     __asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
+     return var;
+#else
+     return *(const __m128d *)x;
+#endif
+}
+
+static inline V LD(const R *x, INT ivs, const R *aligned_like)
+{
+     V var;
+     (void)aligned_like; /* UNUSED */
+     var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
+     var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
+     return var;
+}
+
+static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     /* WARNING: the extra_iter hack depends upon the store of the low
+	part occurring after the store of the high part */
+     *(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
+     *(__m128d *)x = _mm256_castpd256_pd128(v);
+}
+
+
+#define STM2 ST
+#define STN2(x, v0, v1, ovs) /* nop */
+#define STM4(x, v, ovs, aligned_like) /* no-op */
+
+/* STN4 is a macro, not a function, thanks to Visual C++ developers
+   deciding "it would be infrequent that people would want to pass more
+   than 3 [__m128 parameters] by value."  Even though the comment
+   was made about __m128 parameters, it appears to apply to __m256
+   parameters as well. */
+#define STN4(x, v0, v1, v2, v3, ovs)					\
+{									\
+     V xxx0, xxx1, xxx2, xxx3;						\
+     xxx0 = _mm256_unpacklo_pd(v0, v1);					\
+     xxx1 = _mm256_unpackhi_pd(v0, v1);					\
+     xxx2 = _mm256_unpacklo_pd(v2, v3);					\
+     xxx3 = _mm256_unpackhi_pd(v2, v3);					\
+     STA(x,           _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
+     STA(x +     ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
+     STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
+     STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
+}
+#endif
+
+static inline V FLIP_RI(V x)
+{
+     return VSHUF(x, x,
+		  DS(SHUFVALD(1, 0), 
+		     SHUFVALS(1, 0, 3, 2)));
+}
+
+static inline V VCONJ(V x)
+{
+     V pmpm = VLIT(-0.0, 0.0);
+     return VXOR(pmpm, x);
+}
+
+static inline V VBYI(V x)
+{
+     return FLIP_RI(VCONJ(x));
+}
+
+/* FMA support */
+#define VFMA(a, b, c) VADD(c, VMUL(a, b))
+#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
+#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
+#define VFMAI(b, c) VADD(c, VBYI(b))
+#define VFNMSI(b, c) VSUB(c, VBYI(b))
+#define VFMACONJ(b,c)  VADD(VCONJ(b),c)
+#define VFMSCONJ(b,c)  VSUB(VCONJ(b),c)
+#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
+
+static inline V VZMUL(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFMA(ti, sr, tr);
+}
+
+static inline V VZMULJ(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFNMS(ti, sr, tr);
+}
+
+static inline V VZMULI(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMS(tr, sr, ti);
+}
+
+static inline V VZMULIJ(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMA(tr, sr, ti);
+}
+
+/* twiddle storage #1: compact, slower */
+#ifdef FFTW_SINGLE
+# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
+#else
+# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#endif
+#define TWVL1 (VL)
+
+static inline V BYTW1(const R *t, V sr)
+{
+     return VZMUL(LDA(t, 2, t), sr);
+}
+
+static inline V BYTWJ1(const R *t, V sr)
+{
+     return VZMULJ(LDA(t, 2, t), sr);
+}
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#ifdef FFTW_SINGLE
+# define VTW2(v,x)							\
+   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
+   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
+#else
+# define VTW2(v,x)							\
+   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
+   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
+#endif
+#define TWVL2 (2 * VL)
+
+static inline V BYTW2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFMA(tr, sr, VMUL(ti, si));
+}
+
+static inline V BYTWJ2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#define VTW3 VTW1
+#define TWVL3 TWVL1
+
+/* twiddle storage for split arrays */
+#ifdef FFTW_SINGLE
+# define VTWS(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},	\
+  {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x},	\
+  {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
+#else
+# define VTWS(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},	\
+  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}	
+#endif
+#define TWVLS (2 * VL)
+
+
+/* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
+   See Intel Optimization Manual (April 2011, version 248966), Section
+   11.3 */
+#define VLEAVE _mm256_zeroupper
+
+#include "simd-common.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/simd-common.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/simd-common.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* detection of alignment.  This is complicated because a machine may
+   support multiple SIMD extensions (e.g. SSE2 and AVX) but only one
+   set of alignment contraints.  So this alignment stuff cannot be
+   defined in the SIMD header files.  Rather than defining a separate
+   set of "machine" header files, we just do this ugly ifdef here. */
+#if defined(HAVE_SSE2) || defined(HAVE_AVX)
+#  if defined(FFTW_SINGLE)
+#    define ALIGNMENT 8     /* Alignment for the LD/ST macros */
+#    define ALIGNMENTA 16   /* Alignment for the LDA/STA macros */
+#  else
+#    define ALIGNMENT 16    /* Alignment for the LD/ST macros */
+#    define ALIGNMENTA 16   /* Alignment for the LDA/STA macros */
+#  endif
+#elif defined(HAVE_ALTIVEC)
+#  define ALIGNMENT 8     /* Alignment for the LD/ST macros */
+#  define ALIGNMENTA 16   /* Alignment for the LDA/STA macros */
+#elif defined(HAVE_NEON)
+#  define ALIGNMENT 8     /* Alignment for the LD/ST macros */
+#  define ALIGNMENTA 8    /* Alignment for the LDA/STA macros */
+#endif
+
+#if HAVE_SIMD
+#  ifndef ALIGNMENT
+#  error "ALIGNMENT not defined"
+#  endif
+#  ifndef ALIGNMENTA
+#  error "ALIGNMENTA not defined"
+#  endif
+#endif
+
+/* rename for precision and for SIMD extensions */
+#define XSIMD0(name, suffix) CONCAT(name, suffix)
+#define XSIMD(name) XSIMD0(X(name), SIMD_SUFFIX)
+#define XSIMD_STRING(x) x STRINGIZE(SIMD_SUFFIX)
+
+/* TAINT_BIT is set if pointers are not guaranteed to be multiples of
+   ALIGNMENT */
+#define TAINT_BIT 1    
+
+/* TAINT_BITA is set if pointers are not guaranteed to be multiples of
+   ALIGNMENTA */
+#define TAINT_BITA 2
+
+#define PTRINT(p) ((uintptr_t)(p))
+
+#define ALIGNED(p) \
+  (((PTRINT(UNTAINT(p)) % ALIGNMENT) == 0) && !(PTRINT(p) & TAINT_BIT))
+
+#define ALIGNEDA(p) \
+  (((PTRINT(UNTAINT(p)) % ALIGNMENTA) == 0) && !(PTRINT(p) & TAINT_BITA))
+
+#define SIMD_STRIDE_OK(x) (!(((x) * sizeof(R)) % ALIGNMENT))
+#define SIMD_STRIDE_OKA(x) (!(((x) * sizeof(R)) % ALIGNMENTA))
+#define SIMD_VSTRIDE_OK SIMD_STRIDE_OK
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/simd-neon.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/simd-neon.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef FFTW_SINGLE
+#error "NEON only works in single precision"
+#endif
+
+/* define these unconditionally, because they are used by
+   taint.c which is compiled without neon */
+#define SIMD_SUFFIX _neon	/* for renaming */
+#define VL 2            /* SIMD complex vector length */
+#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
+
+#if defined(__GNUC__) && !defined(__ARM_NEON__)
+#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
+#endif
+
+#include <arm_neon.h>
+
+/* FIXME: I am not sure whether this code assumes little-endian
+   ordering.  VLIT may or may not be wrong for big-endian systems. */
+typedef float32x4_t V;
+
+#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
+#define LDK(x) x
+#define DVK(var, val) const V var = VLIT(val, val, val, val)
+
+/* NEON has FMA, but a three-operand FMA is not too useful
+   for FFT purposes.  We normally compute
+
+      t0=a+b*c
+      t1=a-b*c
+
+   In a three-operand instruction set this translates into
+
+      t0=a
+      t0+=b*c
+      t1=a
+      t1-=b*c
+
+   At least one move must be implemented, negating the advantage of
+   the FMA in the first place.  At least some versions of gcc generate
+   both moves.  So we are better off generating t=b*c;t0=a+t;t1=a-t;*/
+#if HAVE_FMA
+#warning "--enable-fma on NEON is probably a bad idea (see source code)"
+#endif
+
+#define VADD(a, b) vaddq_f32(a, b)
+#define VSUB(a, b) vsubq_f32(a, b)
+#define VMUL(a, b) vmulq_f32(a, b)
+#define VFMA(a, b, c) vmlaq_f32(c, a, b)	        /* a*b+c */
+#define VFNMS(a, b, c) vmlsq_f32(c, a, b)	/* FNMS=-(a*b-c) in powerpc terminology; MLS=c-a*b
+						   in ARM terminology */
+#define VFMS(a, b, c) VSUB(VMUL(a, b), c)	/* FMS=a*b-c in powerpc terminology; no equivalent
+						   arm instruction (?) */
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like)
+{
+     (void) aligned_like;	/* UNUSED */
+     return vld1q_f32((const float32_t *)x);
+}
+
+static inline V LD(const R *x, INT ivs, const R *aligned_like)
+{
+     (void) aligned_like;	/* UNUSED */
+     return vcombine_f32(vld1_f32((float32_t *)x), vld1_f32((float32_t *)(x + ivs)));
+}
+
+static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void) aligned_like;	/* UNUSED */
+     vst1q_f32((float32_t *)x, v);
+}
+
+static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void) aligned_like;	/* UNUSED */
+     /* WARNING: the extra_iter hack depends upon store-low occurring
+	after store-high */
+     vst1_f32((float32_t *)(x + ovs), vget_high_f32(v));
+     vst1_f32((float32_t *)x, vget_low_f32(v));
+}
+
+/* 2x2 complex transpose and store */
+#define STM2 ST
+#define STN2(x, v0, v1, ovs) /* nop */
+
+/* store and 4x4 real transpose */
+static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void) aligned_like;	/* UNUSED */
+     vst1_lane_f32((float32_t *)(x)      , vget_low_f32(v), 0);
+     vst1_lane_f32((float32_t *)(x + ovs), vget_low_f32(v), 1);
+     vst1_lane_f32((float32_t *)(x + 2 * ovs), vget_high_f32(v), 0);
+     vst1_lane_f32((float32_t *)(x + 3 * ovs), vget_high_f32(v), 1);
+}
+#define STN4(x, v0, v1, v2, v3, ovs)	/* use STM4 */
+
+#define FLIP_RI(x) vrev64q_f32(x)
+
+static inline V VCONJ(V x)
+{
+#if 1
+     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
+     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
+#else
+     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
+     return VMUL(x, pm);
+#endif
+}
+
+static inline V VBYI(V x)
+{
+     return FLIP_RI(VCONJ(x));
+}
+
+static inline V VFMAI(V b, V c)
+{
+     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
+     return VFMA(FLIP_RI(b), mp, c);
+}
+
+static inline V VFNMSI(V b, V c)
+{
+     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
+     return VFNMS(FLIP_RI(b), mp, c);
+}
+
+static inline V VFMACONJ(V b, V c)
+{
+     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
+     return VFMA(b, pm, c);
+}
+
+static inline V VFNMSCONJ(V b, V c)
+{
+     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
+     return VFNMS(b, pm, c);
+}
+
+static inline V VFMSCONJ(V b, V c)
+{
+     return VSUB(VCONJ(b), c);
+}
+
+#if 1
+#define VEXTRACT_REIM(tr, ti, tx)                               \
+{                                                               \
+     tr = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 0),      \
+                       vdup_lane_f32(vget_high_f32(tx), 0));    \
+     ti = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 1),      \
+                       vdup_lane_f32(vget_high_f32(tx), 1));    \
+}
+#else
+/* this alternative might be faster in an ideal world, but gcc likes
+   to spill VVV onto the stack */
+#define VEXTRACT_REIM(tr, ti, tx)               \
+{                                               \
+     float32x4x2_t vvv = vtrnq_f32(tx, tx);     \
+     tr = vvv.val[0];                           \
+     ti = vvv.val[1];                           \
+}
+#endif
+
+static inline V VZMUL(V tx, V sr)
+{
+     V tr, ti;
+     VEXTRACT_REIM(tr, ti, tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFMA(ti, sr, tr);
+}
+
+static inline V VZMULJ(V tx, V sr)
+{
+     V tr, ti;
+     VEXTRACT_REIM(tr, ti, tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFNMS(ti, sr, tr);
+}
+
+static inline V VZMULI(V tx, V sr)
+{
+     V tr, ti;
+     VEXTRACT_REIM(tr, ti, tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMS(tr, sr, ti);
+}
+
+static inline V VZMULIJ(V tx, V sr)
+{
+     V tr, ti;
+     VEXTRACT_REIM(tr, ti, tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMA(tr, sr, ti);
+}
+
+/* twiddle storage #1: compact, slower */
+#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#define TWVL1 VL
+static inline V BYTW1(const R *t, V sr)
+{
+     V tx = LDA(t, 2, 0);
+     return VZMUL(tx, sr);
+}
+
+static inline V BYTWJ1(const R *t, V sr)
+{
+     V tx = LDA(t, 2, 0);
+     return VZMULJ(tx, sr);
+}
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#  define VTW2(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
+  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
+#define TWVL2 (2 * VL)
+
+static inline V BYTW2(const R *t, V sr)
+{
+     V si = FLIP_RI(sr);
+     V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
+     return VFMA(ti, si, VMUL(tr, sr));
+}
+
+static inline V BYTWJ2(const R *t, V sr)
+{
+     V si = FLIP_RI(sr);
+     V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#  define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#  define TWVL3 (VL)
+
+/* twiddle storage for split arrays */
+#  define VTWS(v,x)							  \
+    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
+#define TWVLS (2 * VL)
+
+#define VLEAVE()		/* nothing */
+
+#include "simd-common.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/simd-sse2.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/simd-sse2.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
+#  error "SSE/SSE2 only works in single/double precision"
+#endif
+
+#ifdef FFTW_SINGLE
+#  define DS(d,s) s /* single-precision option */
+#  define SUFF(name) name ## s
+#else
+#  define DS(d,s) d /* double-precision option */
+#  define SUFF(name) name ## d
+#endif
+
+#define SIMD_SUFFIX  _sse2  /* for renaming */
+#define VL DS(1,2)         /* SIMD vector length, in term of complex numbers */
+#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
+
+#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__SSE2__)
+#  error "compiling simd-sse2.h in double precision without -msse2"
+#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__SSE__)
+#  error "compiling simd-sse2.h in single precision without -msse"
+#endif
+
+#ifdef _MSC_VER
+#ifndef inline
+#define inline __inline
+#endif
+#endif
+
+/* some versions of glibc's sys/cdefs.h define __inline to be empty,
+   which is wrong because emmintrin.h defines several inline
+   procedures */
+#ifndef _MSC_VER
+#undef __inline
+#endif
+
+#ifdef FFTW_SINGLE
+#  include <xmmintrin.h>
+#else
+#  include <emmintrin.h>
+#endif
+
+typedef DS(__m128d,__m128) V;
+#define VADD SUFF(_mm_add_p)
+#define VSUB SUFF(_mm_sub_p)
+#define VMUL SUFF(_mm_mul_p)
+#define VXOR SUFF(_mm_xor_p)
+#define SHUF SUFF(_mm_shuffle_p)
+#define UNPCKL SUFF(_mm_unpacklo_p)
+#define UNPCKH SUFF(_mm_unpackhi_p)
+
+#define SHUFVALS(fp0,fp1,fp2,fp3) \
+   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#define VDUPL(x) DS(UNPCKL(x, x), SHUF(x, x, SHUFVALS(0, 0, 2, 2)))
+#define VDUPH(x) DS(UNPCKH(x, x), SHUF(x, x, SHUFVALS(1, 1, 3, 3)))
+#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
+#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
+
+
+#ifdef __GNUC__
+  /*
+   * gcc-3.3 generates slow code for mm_set_ps (write all elements to
+   * the stack and load __m128 from the stack).
+   *
+   * gcc-3.[34] generates slow code for mm_set_ps1 (load into low element
+   * and shuffle).
+   *
+   * This hack forces gcc to generate a constant __m128 at compile time.
+   */
+  union rvec {
+       R r[DS(2,4)];
+       V v;
+  };
+
+#  ifdef FFTW_SINGLE
+#    define DVK(var, val) V var = __extension__ ({ \
+         static const union rvec _var = { {val,val,val,val} }; _var.v; })
+#  else
+#    define DVK(var, val) V var = __extension__ ({ \
+         static const union rvec _var = { {val,val} }; _var.v; })
+#  endif
+#  define LDK(x) x
+#else
+#  define DVK(var, val) const R var = K(val)
+#  define LDK(x) DS(_mm_set1_pd,_mm_set_ps1)(x)
+#endif
+
+union uvec {
+     unsigned u[4];
+     V v;
+};
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ivs; /* UNUSED */
+     return *(const V *)x;
+}
+
+static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     (void)ovs; /* UNUSED */
+     *(V *)x = v;
+}
+
+#ifdef FFTW_SINGLE
+
+#  ifdef _MSC_VER
+     /* Temporarily disable the warning "uninitialized local variable
+	'name' used" and runtime checks for using a variable before it is
+	defined which is erroneously triggered by the LOADL0 / LOADH macros
+	as they only modify VAL partly each. */
+#    pragma warning(disable : 4700)
+#    pragma runtime_checks("u", off)
+#  endif
+
+static inline V LD(const R *x, INT ivs, const R *aligned_like)
+{
+     V var;
+     (void)aligned_like; /* UNUSED */
+#  ifdef __GNUC__
+     /* We use inline asm because gcc-3.x generates slow code for
+	_mm_loadh_pi().  gcc-3.x insists upon having an existing variable for
+	VAL, which is however never used.  Thus, it generates code to move
+	values in and out the variable.  Worse still, gcc-4.0 stores VAL on
+	the stack, causing valgrind to complain about uninitialized reads. */  
+     __asm__("movlps %1, %0\n\tmovhps %2, %0"
+	     : "=x"(var) : "m"(x[0]), "m"(x[ivs]));
+#  else
+#    define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
+#    define LOADL0(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
+     var = LOADL0(x, var);
+     var = LOADH(x + ivs, var);
+#  endif
+     return var;
+}
+
+#  ifdef _MSC_VER
+#    pragma warning(default : 4700)
+#    pragma runtime_checks("u", restore)
+#  endif
+
+static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     /* WARNING: the extra_iter hack depends upon STOREL occurring
+	after STOREH */
+     STOREH(x + ovs, v);
+     STOREL(x, v);
+}
+
+#else /* ! FFTW_SINGLE */
+#  define LD LDA
+#  define ST STA
+#endif
+
+#define STM2 DS(STA,ST)
+#define STN2(x, v0, v1, ovs) /* nop */
+
+#ifdef FFTW_SINGLE
+#  define STM4(x, v, ovs, aligned_like) /* no-op */
+/* STN4 is a macro, not a function, thanks to Visual C++ developers
+   deciding "it would be infrequent that people would want to pass more
+   than 3 [__m128 parameters] by value."  3 parameters ought to be enough
+   for anybody. */
+#  define STN4(x, v0, v1, v2, v3, ovs)			\
+{							\
+     V xxx0, xxx1, xxx2, xxx3;				\
+     xxx0 = UNPCKL(v0, v2);				\
+     xxx1 = UNPCKH(v0, v2);				\
+     xxx2 = UNPCKL(v1, v3);				\
+     xxx3 = UNPCKH(v1, v3);				\
+     STA(x, UNPCKL(xxx0, xxx2), 0, 0);			\
+     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);		\
+     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0);	\
+     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0);	\
+}
+#else /* !FFTW_SINGLE */
+static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
+{
+     (void)aligned_like; /* UNUSED */
+     STOREL(x, v);
+     STOREH(x + ovs, v);
+}
+#  define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
+#endif
+
+static inline V FLIP_RI(V x)
+{
+     return SHUF(x, x, DS(1, SHUFVALS(1, 0, 3, 2)));
+}
+
+extern const union uvec X(sse2_pm);
+static inline V VCONJ(V x)
+{
+     return VXOR(X(sse2_pm).v, x);
+}
+
+static inline V VBYI(V x)
+{
+     x = VCONJ(x);
+     x = FLIP_RI(x);
+     return x;
+}
+
+/* FMA support */
+#define VFMA(a, b, c) VADD(c, VMUL(a, b))
+#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
+#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
+#define VFMAI(b, c) VADD(c, VBYI(b))
+#define VFNMSI(b, c) VSUB(c, VBYI(b))
+#define VFMACONJ(b,c)  VADD(VCONJ(b),c)
+#define VFMSCONJ(b,c)  VSUB(VCONJ(b),c)
+#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
+
+static inline V VZMUL(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFMA(ti, sr, tr);
+}
+
+static inline V VZMULJ(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     tr = VMUL(sr, tr);
+     sr = VBYI(sr);
+     return VFNMS(ti, sr, tr);
+}
+
+static inline V VZMULI(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMS(tr, sr, ti);
+}
+
+static inline V VZMULIJ(V tx, V sr)
+{
+     V tr = VDUPL(tx);
+     V ti = VDUPH(tx);
+     ti = VMUL(ti, sr);
+     sr = VBYI(sr);
+     return VFMA(tr, sr, ti);
+}
+
+/* twiddle storage #1: compact, slower */
+#ifdef FFTW_SINGLE
+#  define VTW1(v,x)  \
+  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
+static inline V BYTW1(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V tx = twp[0];
+     V tr = UNPCKL(tx, tx);
+     V ti = UNPCKH(tx, tx);
+     tr = VMUL(tr, sr);
+     sr = VBYI(sr);
+     return VFMA(ti, sr, tr);
+}
+static inline V BYTWJ1(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V tx = twp[0];
+     V tr = UNPCKL(tx, tx);
+     V ti = UNPCKH(tx, tx);
+     tr = VMUL(tr, sr);
+     sr = VBYI(sr);
+     return VFNMS(ti, sr, tr);
+}
+#else /* !FFTW_SINGLE */
+#  define VTW1(v,x) {TW_CEXP, v, x}
+static inline V BYTW1(const R *t, V sr)
+{
+     V tx = LD(t, 1, t);
+     return VZMUL(tx, sr);
+}
+static inline V BYTWJ1(const R *t, V sr)
+{
+     V tx = LD(t, 1, t);
+     return VZMULJ(tx, sr);
+}
+#endif
+#define TWVL1 (VL)
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#ifdef FFTW_SINGLE
+#  define VTW2(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
+  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
+#else /* !FFTW_SINGLE */
+#  define VTW2(v,x)							\
+  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
+#endif
+#define TWVL2 (2 * VL)
+static inline V BYTW2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFMA(tr, sr, VMUL(ti, si));
+}
+static inline V BYTWJ2(const R *t, V sr)
+{
+     const V *twp = (const V *)t;
+     V si = FLIP_RI(sr);
+     V tr = twp[0], ti = twp[1];
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#ifdef FFTW_SINGLE
+#  define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
+#  define TWVL3 (VL)
+#else
+#  define VTW3(v,x) VTW1(v,x)
+#  define TWVL3 TWVL1
+#endif
+
+/* twiddle storage for split arrays */
+#ifdef FFTW_SINGLE
+#  define VTWS(v,x)							  \
+    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
+#else
+#  define VTWS(v,x)							  \
+    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
+#endif
+#define TWVLS (2 * VL)
+
+#define VLEAVE() /* nothing */
+
+#include "simd-common.h"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/sse2-nonportable.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/sse2-nonportable.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw.h"
+
+#if HAVE_SSE2
+/* this file must be compiled with -msse/-msse2 or equivalent, and it will
+   fail at runtime on a machine that does not support sse/sse2 */
+#include "simd-sse2.h"
+
+/* This will produce -0.0f (or -0.0d) even on broken
+   compilers that do not distinguish +0.0 from -0.0.
+   I bet some are still around. */
+const union uvec X(sse2_pm) = {
+#ifdef FFTW_SINGLE
+     { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
+#else
+     { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
+#endif
+};
+
+/* paranoia because of past compiler bugs */
+void X(check_alignment_of_sse2_pm)(void)
+{
+     CK(ALIGNED(&X(sse2_pm)));
+}
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/sse2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/sse2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+
+#ifdef FFTW_SINGLE
+#  define DS(d,s) s /* single-precision option */
+#else
+#  define DS(d,s) d /* double-precision option */
+#endif
+
+#if HAVE_SSE2
+
+# if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
+
+  int X(have_simd_sse2)(void)
+  {
+       return 1;
+  }
+
+# else /* !x86_64 */
+
+# include <signal.h>
+# include <setjmp.h>
+# include "x86-cpuid.h"
+
+  static jmp_buf jb;
+
+  static void sighandler(int x)
+  {
+       UNUSED(x);
+       longjmp(jb, 1);
+  }
+
+  static int sse2_works(void)
+  {
+       void (*oldsig)(int);
+       oldsig = signal(SIGILL, sighandler);
+       if (setjmp(jb)) {
+	    signal(SIGILL, oldsig);
+	    return 0;
+       } else {
+#         ifdef _MSC_VER
+	    _asm { DS(xorpd,xorps) xmm0,xmm0 }
+#         else
+	    /* asm volatile ("xorpd/s %xmm0, %xmm0"); */
+	    asm volatile(DS(".byte 0x66; .byte 0x0f; .byte 0x57; .byte 0xc0",
+			                ".byte 0x0f; .byte 0x57; .byte 0xc0"));
+#         endif
+	    signal(SIGILL, oldsig);
+	    return 1;
+       }
+  }
+
+  extern void X(check_alignment_of_sse2_pm)(void);
+
+  int X(have_simd_sse2)(void)
+  {
+       static int init = 0, res;
+
+       if (!init) {
+	    res =   !is_386() 
+		 && has_cpuid()
+		 && (cpuid_edx(1) & (1 << DS(26,25)))
+		 && sse2_works();
+	    init = 1;
+	    X(check_alignment_of_sse2_pm)();
+       }
+       return res;
+  }
+
+# endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/taint.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/taint.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "ifftw.h"
+#include "simd-common.h"
+
+#if HAVE_SIMD
+
+R *X(taint)(R *p, INT s)
+{
+     if (((unsigned)s * sizeof(R)) % ALIGNMENT)
+	  p = (R *) (PTRINT(p) | TAINT_BIT);
+     if (((unsigned)s * sizeof(R)) % ALIGNMENTA)
+	  p = (R *) (PTRINT(p) | TAINT_BITA);
+     return p;
+}
+
+/* join the taint of two pointers that are supposed to be
+   the same modulo the taint */
+R *X(join_taint)(R *p1, R *p2)
+{
+     A(UNTAINT(p1) == UNTAINT(p2));
+     return (R *)(PTRINT(p1) | PTRINT(p2));
+}
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/simd-support/x86-cpuid.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/simd-support/x86-cpuid.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* this code was kindly donated by Eric J. Korpela */
+
+#ifdef _MSC_VER
+#ifndef inline
+#define inline __inline
+#endif
+#endif
+
+static inline int is_386() 
+{
+#ifdef _MSC_VER
+    unsigned int result,tst;
+    _asm {
+        pushfd
+        pop eax
+        mov edx,eax
+        xor eax,40000h
+        push eax
+        popfd
+        pushfd
+        pop eax
+        push edx
+        popfd
+        mov tst,edx
+        mov result,eax
+    }
+#else
+    register unsigned int result,tst;
+    __asm__ (
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x40000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "pushl %1\n\t"
+        "popfl"
+    : "=r" (result), "=r" (tst) /* output */
+    :  /* no inputs */
+    );
+#endif
+    return (result == tst);
+}
+
+static inline int has_cpuid() 
+{
+#ifdef _MSC_VER
+    unsigned int result,tst;
+    _asm {
+        pushfd
+        pop eax
+        mov edx,eax
+        xor eax,200000h
+        push eax
+        popfd
+        pushfd
+        pop eax
+        push edx
+        popfd
+        mov tst,edx
+        mov result,eax
+    }
+#else
+    register unsigned int result,tst;
+    __asm__ (
+        "pushfl\n\t"
+        "pop %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x200000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "pushl %1\n\t"
+        "popfl"
+    : "=r" (result), "=r" (tst) /* output */
+    : /* no inputs */
+    );
+#endif
+    return (result != tst);
+}
+
+static inline int cpuid_edx(int op)
+{
+#    ifdef _MSC_VER
+     int result;
+     _asm {
+	  push ebx
+          mov eax,op
+          cpuid
+          mov result,edx
+          pop ebx
+     }
+     return result;
+#    else
+     int eax, ecx, edx;
+
+     __asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
+	     : "=a" (eax), "=c" (ecx), "=d" (edx)
+	     : "a" (op));
+     return edx;
+#    endif
+}
+
+static inline int cpuid_ecx(int op)
+{
+#    ifdef _MSC_VER
+     int result;
+     _asm {
+	  push ebx
+          mov eax,op
+          cpuid
+          mov result,ecx
+          pop ebx
+     }
+     return result;
+#    else
+     int eax, ecx, edx;
+
+     __asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
+	     : "=a" (eax), "=c" (ecx), "=d" (edx)
+	     : "a" (op));
+     return ecx;
+#    endif
+}
+
+static inline int xgetbv_eax(int op)
+{
+#    ifdef _MSC_VER
+     int veax, vedx;
+     _asm {
+          mov ecx,op
+#    if defined(__INTEL_COMPILER) || (_MSC_VER >= 1600)
+          xgetbv
+#    else
+          __emit 15
+          __emit 1
+          __emit 208
+#    endif
+          mov veax,eax
+          mov vedx,edx
+     }
+     return veax;
+#    else
+     int eax, edx;
+     __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
+     return eax;
+#endif
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+EXTRA_DIST = Makefile.codelets codelet_prelude.dft codelet_prelude.rdft	\
+addchain.c twovers.sh
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/Makefile.codelets
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/Makefile.codelets	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,76 @@
+# -*- makefile -*-
+# This file contains special make rules to generate codelets.
+# Most of this file requires GNU make .
+
+CODLIST = codlist.c
+CODELET_NAME=codelet_
+
+# rule to build codlist
+$(CODLIST): Makefile
+	(									\
+	echo "#include \"ifftw.h\"";						\
+	echo $(INCLUDE_SIMD_HEADER);						\
+	echo;									\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+             echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);";	\
+           fi									\
+	done;									\
+	echo;									\
+	echo;									\
+	echo "extern const solvtab $(SOLVTAB_NAME);";				\
+	echo "const solvtab $(SOLVTAB_NAME) = {";				\
+	for i in $(ALL_CODELETS) NIL; do					\
+	   if test "$$i" != NIL; then						\
+	     j=`basename $$i | sed -e 's/[.][cS]$$//g'`;			\
+	     echo "   SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),";		\
+	   fi									\
+	done;									\
+	echo "   SOLVTAB_END";							\
+	echo "};";								\
+	) >$@
+
+# only delete codlist.c in maintainer-mode, since it is included in the dist
+# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
+maintainer-clean-local:
+	rm -f $(CODLIST)
+
+if MAINTAINER_MODE
+
+INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
+TWOVERS = sh ${top_srcdir}/support/twovers.sh
+GENFFTDIR = ${top_builddir}/genfft
+GEN_NOTW = ${GENFFTDIR}/gen_notw.native
+GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
+GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
+GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
+GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
+GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
+GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
+GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
+GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
+GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
+GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
+GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
+GEN_R2R = ${GENFFTDIR}/gen_r2r.native
+PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
+PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
+ADD_DATE = sed -e s/@DATE@/"`date`"/
+
+COPYRIGHT=${top_srcdir}/COPYRIGHT
+CODELET_DEPS=$(COPYRIGHT) $(PRELUDE) 
+PRELUDE_COMMANDS_DFT=cat $(COPYRIGHT) $(PRELUDE_DFT)
+PRELUDE_COMMANDS_RDFT=cat $(COPYRIGHT) $(PRELUDE_RDFT)
+
+FLAGS_COMMON = -compact -variables 4
+DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
+
+# cancel the hideous builtin rules that cause an infinite loop
+%: %.o
+%: %.s
+%: %.c
+%: %.S
+
+endif # MAINTAINER_MODE
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,419 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = support
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+EXTRA_DIST = Makefile.codelets codelet_prelude.dft codelet_prelude.rdft	\
+addchain.c twovers.sh
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu support/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu support/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	distclean distclean-generic distclean-libtool distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/addchain.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/addchain.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,171 @@
+/* addition-chain optimizer */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static int verbose;
+static int mulcost = 18;
+static int ldcost = 2;
+static int sqcost = 10;
+static int reflcost = 8;
+#define INFTY 100000
+
+static int *answer;
+static int best_so_far;
+
+static void print_answer(int n, int t)
+{
+     int i;
+     printf("| (%d, %d) -> [", n, t);
+     for (i = 0; i < t; ++i)
+	  printf("%d;", answer[i]);
+     printf("] (* %d *)\n", best_so_far);
+}
+
+#define DO(i, j, k, cst)			\
+if (k < n) {					\
+     int c = A[i] + A[j] + cst;			\
+     if (c < A[k]) {				\
+	  A[k] = c;				\
+	  changed = 1;				\
+     }						\
+}
+
+#define DO3(i, j, l, k, cst)			\
+if (k < n) {					\
+     int c = A[i] + A[j] + A[l] + cst;		\
+     if (c < A[k]) {				\
+	  A[k] = c;				\
+	  changed = 1;				\
+     }						\
+}
+
+static int optimize(int n, int *A)
+{
+     int i, j, k, changed, cst, cstmax;
+
+     do {
+	  changed = 0;
+	  for (i = 0; i < n; ++i) {
+	       k = i + i;
+	       DO(i, i, k, sqcost);
+	  }
+
+	  for (i = 0; i < n; ++i) {
+	       for (j = 0; j <= i; ++j) {
+		    k = i + j;
+		    DO(i, j, k, mulcost);
+		    k = i - j;
+		    DO(i, j, k, mulcost);
+
+		    k = i + j;
+		    DO3(i, j, i - j, k, reflcost);
+	       }
+	  }
+
+     } while (changed);
+
+     cst = cstmax = 0;
+     for (i = 0; i < n; ++i) {
+	  cst += A[i];
+	  if (A[i] > cstmax) cstmax = A[i];
+     }
+/*     return cstmax; */
+     return cst;
+}
+
+static void search(int n, int t, int *A, int *B, int depth)
+{
+     if (depth == 0) {
+	  int i, tc;
+	  for (i = 0; i < n; ++i)
+	       A[i] = INFTY;
+	  A[0] = 0;		/* always free */
+	  for (i = 1; i <= t; ++i)
+	       A[B[-i]] = ldcost;
+
+	  tc = optimize(n, A);
+	  if (tc < best_so_far) {
+	       best_so_far = tc;
+	       for (i = 1; i <= t; ++i)
+		    answer[t - i] = B[-i];
+	       if (verbose)
+		    print_answer(n, t);
+	  }
+     } else {
+	  for (B[0] = B[-1] + 1; B[0] < n; ++B[0])
+	       search(n, t, A, B + 1, depth - 1);
+     }
+}
+
+static void doit(int n, int t)
+{
+     int *A;
+     int *B;
+
+     A = malloc(n * sizeof(int));
+     B = malloc((t + 1) * sizeof(int));
+     answer = malloc(t * sizeof(int));
+
+     B[0] = 0;
+     best_so_far = INFTY;
+     search(n, t, A, B + 1, t);
+
+     print_answer(n, t);
+
+     free(A); free(B); free(answer);
+}
+
+int main(int argc, char *argv[])
+{
+     int n = 32;
+     int t = 3;
+     int all;
+     int ch;
+
+     verbose = 0;
+     all = 0;
+     while ((ch = getopt(argc, argv, "n:t:m:l:r:s:va")) != -1) {
+	  switch (ch) {
+	  case 'n':
+	       n = atoi(optarg);
+	       break;
+	  case 't':
+	       t = atoi(optarg);
+	       break;
+	  case 'm':
+	       mulcost = atoi(optarg);
+	       break;
+	  case 'l':
+	       ldcost = atoi(optarg);
+	       break;
+	  case 's':
+	       sqcost = atoi(optarg);
+	       break;
+	  case 'r':
+	       reflcost = atoi(optarg);
+	       break;
+	  case 'v':
+	       ++verbose;
+	       break;
+	  case 'a':
+	       ++all;
+	       break;
+	  case '?':
+	       fprintf(stderr, "use the source\n");
+	       exit(1);
+	  }
+     }
+
+     if (all) {
+	  for (n = 4; n <= 64; n *= 2) {
+	       int n1 = n - 1; if (n1 > 7) n1 = 7;
+	       for (t = 1; t <= n1; ++t)
+		    doit(n, t);
+	  }
+     } else {
+	  doit(n, t);
+     }
+
+     return 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/codelet_prelude.dft
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/codelet_prelude.dft	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,8 @@
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on @DATE@ */
+
+#include "codelet-dft.h"
+
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/codelet_prelude.rdft
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/codelet_prelude.rdft	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,8 @@
+
+/* This file was automatically generated --- DO NOT EDIT */
+/* Generated on @DATE@ */
+
+#include "codelet-rdft.h"
+
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/support/twovers.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/support/twovers.sh	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,17 @@
+#! /bin/sh
+
+# wrapper to generate two codelet versions, with and without
+# fma
+
+genfft=$1
+shift
+
+echo "#ifdef HAVE_FMA"
+echo
+  $genfft -fma -reorder-insns -schedule-for-pipeline $*
+echo
+echo "#else /* HAVE_FMA */"
+echo
+  $genfft $*
+echo
+echo "#endif /* HAVE_FMA */"
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,80 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/libbench2	\
+-I$(top_srcdir)/dft -I$(top_srcdir)/rdft -I$(top_srcdir)/reodft	\
+-I$(top_srcdir)/threads -I$(top_srcdir)/api 
+
+noinst_PROGRAMS = bench
+EXTRA_DIST = check.pl README
+
+if THREADS
+bench_CFLAGS = $(PTHREAD_CFLAGS)
+if !COMBINED_THREADS
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+endif
+else
+if OPENMP
+bench_CFLAGS = $(OPENMP_CFLAGS)
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+endif
+endif
+
+bench_SOURCES = bench.c hook.c fftw-bench.c fftw-bench.h
+bench_LDADD = $(LIBFFTWTHREADS)				\
+$(top_builddir)/libfftw3@PREC_SUFFIX@.la		\
+$(top_builddir)/libbench2/libbench2.a $(THREADLIBS)
+
+check-local: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -r -c=30 -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed basic tests!"
+	@echo "--------------------------------------------------------------"
+if SMP
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -r -c=30 -v --nthreads=2 `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW threaded transforms passed basic tests!"
+	@echo "--------------------------------------------------------------"
+endif
+
+bigcheck: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed big tests!"
+	@echo "--------------------------------------------------------------"
+if SMP
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=2 `pwd`/bench
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=3 `pwd`/bench
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=10 `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW threaded transforms passed big tests!"
+	@echo "--------------------------------------------------------------"
+endif
+
+smallcheck: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl -r -c=1 -v `pwd`/bench
+	perl -w $(srcdir)/check.pl -r --estimate -c=5 -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed a few tests!"
+	@echo "--------------------------------------------------------------"
+if SMP
+	perl -w $(srcdir)/check.pl -r --estimate -c=2 -v --nthreads=2 `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW threaded transforms passed a few tests!"
+	@echo "--------------------------------------------------------------"
+endif
+
+paranoid-check: bench$(EXEEXT)
+if SMP
+	perl -w $(srcdir)/check.pl -a --patient --nthreads=10 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --patient --nthreads=7 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --patient --nthreads=3 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --patient --nthreads=2 --paranoid `pwd`/bench
+endif
+	perl -w $(srcdir)/check.pl -a --patient --paranoid `pwd`/bench
+
+exhaustive-check: bench$(EXEEXT)
+if SMP
+	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=10 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=7 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=3 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=2 --paranoid `pwd`/bench
+endif
+	perl -w $(srcdir)/check.pl -a --exhaustive --paranoid `pwd`/bench
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,646 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+noinst_PROGRAMS = bench$(EXEEXT)
+subdir = tests
+DIST_COMMON = README $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+PROGRAMS = $(noinst_PROGRAMS)
+am_bench_OBJECTS = bench-bench.$(OBJEXT) bench-hook.$(OBJEXT) \
+	bench-fftw-bench.$(OBJEXT)
+bench_OBJECTS = $(am_bench_OBJECTS)
+am__DEPENDENCIES_1 =
+bench_DEPENDENCIES = $(LIBFFTWTHREADS) \
+	$(top_builddir)/libfftw3@PREC_SUFFIX@.la \
+	$(top_builddir)/libbench2/libbench2.a $(am__DEPENDENCIES_1)
+bench_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(bench_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(bench_SOURCES)
+DIST_SOURCES = $(bench_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/libbench2	\
+-I$(top_srcdir)/dft -I$(top_srcdir)/rdft -I$(top_srcdir)/reodft	\
+-I$(top_srcdir)/threads -I$(top_srcdir)/api 
+
+EXTRA_DIST = check.pl README
+@OPENMP_TRUE@@THREADS_FALSE@bench_CFLAGS = $(OPENMP_CFLAGS)
+@THREADS_TRUE@bench_CFLAGS = $(PTHREAD_CFLAGS)
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+@OPENMP_TRUE@@THREADS_FALSE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+bench_SOURCES = bench.c hook.c fftw-bench.c fftw-bench.h
+bench_LDADD = $(LIBFFTWTHREADS)				\
+$(top_builddir)/libfftw3@PREC_SUFFIX@.la		\
+$(top_builddir)/libbench2/libbench2.a $(THREADLIBS)
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu tests/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu tests/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstPROGRAMS:
+	@list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \
+	echo " rm -f" $$list; \
+	rm -f $$list || exit $$?; \
+	test -n "$(EXEEXT)" || exit 0; \
+	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
+	echo " rm -f" $$list; \
+	rm -f $$list
+bench$(EXEEXT): $(bench_OBJECTS) $(bench_DEPENDENCIES) $(EXTRA_bench_DEPENDENCIES) 
+	@rm -f bench$(EXEEXT)
+	$(bench_LINK) $(bench_OBJECTS) $(bench_LDADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-bench.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-fftw-bench.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-hook.Po@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+bench-bench.o: bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-bench.o -MD -MP -MF $(DEPDIR)/bench-bench.Tpo -c -o bench-bench.o `test -f 'bench.c' || echo '$(srcdir)/'`bench.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-bench.Tpo $(DEPDIR)/bench-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='bench.c' object='bench-bench.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-bench.o `test -f 'bench.c' || echo '$(srcdir)/'`bench.c
+
+bench-bench.obj: bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-bench.obj -MD -MP -MF $(DEPDIR)/bench-bench.Tpo -c -o bench-bench.obj `if test -f 'bench.c'; then $(CYGPATH_W) 'bench.c'; else $(CYGPATH_W) '$(srcdir)/bench.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-bench.Tpo $(DEPDIR)/bench-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='bench.c' object='bench-bench.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-bench.obj `if test -f 'bench.c'; then $(CYGPATH_W) 'bench.c'; else $(CYGPATH_W) '$(srcdir)/bench.c'; fi`
+
+bench-hook.o: hook.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-hook.o -MD -MP -MF $(DEPDIR)/bench-hook.Tpo -c -o bench-hook.o `test -f 'hook.c' || echo '$(srcdir)/'`hook.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-hook.Tpo $(DEPDIR)/bench-hook.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='hook.c' object='bench-hook.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-hook.o `test -f 'hook.c' || echo '$(srcdir)/'`hook.c
+
+bench-hook.obj: hook.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-hook.obj -MD -MP -MF $(DEPDIR)/bench-hook.Tpo -c -o bench-hook.obj `if test -f 'hook.c'; then $(CYGPATH_W) 'hook.c'; else $(CYGPATH_W) '$(srcdir)/hook.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-hook.Tpo $(DEPDIR)/bench-hook.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='hook.c' object='bench-hook.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-hook.obj `if test -f 'hook.c'; then $(CYGPATH_W) 'hook.c'; else $(CYGPATH_W) '$(srcdir)/hook.c'; fi`
+
+bench-fftw-bench.o: fftw-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-fftw-bench.o -MD -MP -MF $(DEPDIR)/bench-fftw-bench.Tpo -c -o bench-fftw-bench.o `test -f 'fftw-bench.c' || echo '$(srcdir)/'`fftw-bench.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-fftw-bench.Tpo $(DEPDIR)/bench-fftw-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='fftw-bench.c' object='bench-fftw-bench.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-fftw-bench.o `test -f 'fftw-bench.c' || echo '$(srcdir)/'`fftw-bench.c
+
+bench-fftw-bench.obj: fftw-bench.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -MT bench-fftw-bench.obj -MD -MP -MF $(DEPDIR)/bench-fftw-bench.Tpo -c -o bench-fftw-bench.obj `if test -f 'fftw-bench.c'; then $(CYGPATH_W) 'fftw-bench.c'; else $(CYGPATH_W) '$(srcdir)/fftw-bench.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/bench-fftw-bench.Tpo $(DEPDIR)/bench-fftw-bench.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='fftw-bench.c' object='bench-fftw-bench.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bench_CFLAGS) $(CFLAGS) -c -o bench-fftw-bench.obj `if test -f 'fftw-bench.c'; then $(CYGPATH_W) 'fftw-bench.c'; else $(CYGPATH_W) '$(srcdir)/fftw-bench.c'; fi`
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+	$(MAKE) $(AM_MAKEFLAGS) check-local
+check: check-am
+all-am: Makefile $(PROGRAMS)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstPROGRAMS \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: check-am install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am check-local clean \
+	clean-generic clean-libtool clean-noinstPROGRAMS ctags \
+	distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am
+
+
+check-local: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -r -c=30 -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed basic tests!"
+	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -r -c=30 -v --nthreads=2 `pwd`/bench
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	@echo "         FFTW threaded transforms passed basic tests!"
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+
+bigcheck: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed big tests!"
+	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=2 `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=3 `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl $(CHECK_PL_OPTS) -a -v --nthreads=10 `pwd`/bench
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	@echo "         FFTW threaded transforms passed big tests!"
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+
+smallcheck: bench$(EXEEXT)
+	perl -w $(srcdir)/check.pl -r -c=1 -v `pwd`/bench
+	perl -w $(srcdir)/check.pl -r --estimate -c=5 -v `pwd`/bench
+	@echo "--------------------------------------------------------------"
+	@echo "         FFTW transforms passed a few tests!"
+	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -r --estimate -c=2 -v --nthreads=2 `pwd`/bench
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+@SMP_TRUE@	@echo "         FFTW threaded transforms passed a few tests!"
+@SMP_TRUE@	@echo "--------------------------------------------------------------"
+
+paranoid-check: bench$(EXEEXT)
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --patient --nthreads=10 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --patient --nthreads=7 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --patient --nthreads=3 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --patient --nthreads=2 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --patient --paranoid `pwd`/bench
+
+exhaustive-check: bench$(EXEEXT)
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=10 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=7 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=3 --paranoid `pwd`/bench
+@SMP_TRUE@	perl -w $(srcdir)/check.pl -a --exhaustive --nthreads=2 --paranoid `pwd`/bench
+	perl -w $(srcdir)/check.pl -a --exhaustive --paranoid `pwd`/bench
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/README
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/README	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,73 @@
+This directory contains a benchmarking and testing program
+for fftw3.
+
+The `bench' program has a zillion options, because we use it for
+benchmarking other FFT libraries as well.  This file only documents
+the basic usage of bench.
+
+Usage: bench <commands>
+
+where each command is as follows:
+
+-s <problem>
+--speed <problem>
+
+    Benchmarks the speed of <problem>.
+
+    The syntax for problems is [i|o][r|c][f|b]<size>, where
+
+      i/o means in-place or out-of-place.  Out of place is the default.
+      r/c means real or complex transform.  Complex is the default.
+      f/b means forward or backward transform.  Forward is the default.
+      <size> is an arbitrary multidimensional sequence of integers
+        separated by the character 'x'.
+
+    (The syntax for problems is actually richer, but we do not document
+    it here.  See the man page for fftw-wisdom for more information.)
+
+    Example:
+
+        ib256 : in-place backward complex transform of size 256
+        32x64 : out-of-place forward complex 2D transform of 32 rows
+                and 64 columns.
+
+-y <problem>
+--verify <problem>
+
+   Verify that FFTW is computing the correct answer.
+
+   The program does not output anything unless an error occurs or
+   verbosity is at least one.
+
+-v<n>
+
+   Set verbosity to <n>, or 1 if <n> is omitted.  -v2 will output
+   the created plans with fftw_print_plan.
+   
+-oestimate
+-opatient
+-oexhaustive
+ 
+  Plan with FFTW_ESTIMATE, FFTW_PATIENT, or FFTW_EXHAUSTIVE, respectively.
+  The default is FFTW_MEASURE.
+
+  If you benchmark FFTW, please use -opatient.
+      
+-onthreads=N
+
+  Use N threads, if FFTW was compiled with --enable-threads.  N
+  must be a positive integer; the default is N=1.
+
+-onosimd
+
+  Disable SIMD instructions (e.g. SSE or SSE2).
+
+-ounaligned
+
+  Plan with the FFTW_UNALIGNED flag.
+
+-owisdom
+
+  On startup, read wisdom from a file wis.dat in the current directory
+  (if it exists).  On completion, write accumulated wisdom to wis.dat
+  (overwriting any existing file of that name).
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/bench.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/bench.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,552 @@
+/**************************************************************************/
+/* NOTE to users: this is the FFTW self-test and benchmark program.
+   It is probably NOT a good place to learn FFTW usage, since it has a
+   lot of added complexity in order to exercise and test the full API,
+   etcetera.  We suggest reading the manual. 
+
+   (Some of the self-test code is split off into fftw-bench.c and
+   hook.c.) */
+/**************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "fftw-bench.h"
+
+static const char *mkversion(void) { return FFTW(version); }
+static const char *mkcc(void) { return FFTW(cc); }
+static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
+
+BEGIN_BENCH_DOC
+BENCH_DOC("name", "fftw3")
+BENCH_DOCF("version", mkversion)
+BENCH_DOCF("cc", mkcc)
+BENCH_DOCF("codelet-optim", mkcodelet_optim)
+END_BENCH_DOC 
+
+static FFTW(iodim) *bench_tensor_to_fftw_iodim(bench_tensor *t)
+{
+     FFTW(iodim) *d;
+     int i;
+
+     BENCH_ASSERT(t->rnk >= 0);
+     if (t->rnk == 0) return 0;
+     
+     d = (FFTW(iodim) *)bench_malloc(sizeof(FFTW(iodim)) * t->rnk);
+     for (i = 0; i < t->rnk; ++i) {
+	  d[i].n = t->dims[i].n;
+	  d[i].is = t->dims[i].is;
+	  d[i].os = t->dims[i].os;
+     }
+
+     return d;
+}
+
+static void extract_reim_split(int sign, int size, bench_real *p,
+			       bench_real **r, bench_real **i)
+{
+     if (sign == FFTW_FORWARD) {
+          *r = p + 0;
+          *i = p + size;
+     } else {
+          *r = p + size;
+          *i = p + 0;
+     }
+}
+
+static int sizeof_problem(bench_problem *p)
+{
+     return tensor_sz(p->sz) * tensor_sz(p->vecsz);
+}
+
+/* ouch */
+static int expressible_as_api_many(bench_tensor *t)
+{
+     int i;
+
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+
+     i = t->rnk - 1;
+     while (--i >= 0) {
+	  bench_iodim *d = t->dims + i;
+	  if (d[0].is % d[1].is) return 0;
+	  if (d[0].os % d[1].os) return 0;
+     }
+     return 1;
+}
+
+static int *mkn(bench_tensor *t)
+{
+     int *n = (int *) bench_malloc(sizeof(int *) * t->rnk);
+     int i;
+     for (i = 0; i < t->rnk; ++i) 
+	  n[i] = t->dims[i].n;
+     return n;
+}
+
+static void mknembed_many(bench_tensor *t, int **inembedp, int **onembedp)
+{
+     int i;
+     bench_iodim *d;
+     int *inembed = (int *) bench_malloc(sizeof(int *) * t->rnk);
+     int *onembed = (int *) bench_malloc(sizeof(int *) * t->rnk);
+
+     BENCH_ASSERT(FINITE_RNK(t->rnk));
+     *inembedp = inembed; *onembedp = onembed;
+
+     i = t->rnk - 1;
+     while (--i >= 0) {
+	  d = t->dims + i;
+	  inembed[i+1] = d[0].is / d[1].is;
+	  onembed[i+1] = d[0].os / d[1].os;
+     }
+}
+
+/* try to use the most appropriate API function.  Big mess. */
+
+static int imax(int a, int b) { return (a > b ? a : b); }
+
+static int halfish_sizeof_problem(bench_problem *p)
+{
+     int n2 = sizeof_problem(p);
+     if (FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0)
+          n2 = (n2 / imax(p->sz->dims[p->sz->rnk - 1].n, 1)) *
+               (p->sz->dims[p->sz->rnk - 1].n / 2 + 1);
+     return n2;
+}
+
+static FFTW(plan) mkplan_real_split(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln;
+     bench_tensor *sz = p->sz, *vecsz = p->vecsz;
+     FFTW(iodim) *dims, *howmany_dims;
+     bench_real *ri, *ii, *ro, *io;
+     int n2 = halfish_sizeof_problem(p);
+
+     extract_reim_split(FFTW_FORWARD, n2, (bench_real *) p->in, &ri, &ii);
+     extract_reim_split(FFTW_FORWARD, n2, (bench_real *) p->out, &ro, &io);
+
+     dims = bench_tensor_to_fftw_iodim(sz);
+     howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+     if (p->sign < 0) {
+	  if (verbose > 2) printf("using plan_guru_split_dft_r2c\n");
+	  pln = FFTW(plan_guru_split_dft_r2c)(sz->rnk, dims,
+					vecsz->rnk, howmany_dims,
+					ri, ro, io, flags);
+     }
+     else {
+	  if (verbose > 2) printf("using plan_guru_split_dft_c2r\n");
+	  pln = FFTW(plan_guru_split_dft_c2r)(sz->rnk, dims,
+					vecsz->rnk, howmany_dims,
+					ri, ii, ro, flags);
+     }
+     bench_free(dims);
+     bench_free(howmany_dims);
+     return pln;
+}
+
+static FFTW(plan) mkplan_real_interleaved(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln;
+     bench_tensor *sz = p->sz, *vecsz = p->vecsz;
+
+     if (vecsz->rnk == 0 && tensor_unitstridep(sz) 
+	 && tensor_real_rowmajorp(sz, p->sign, p->in_place)) 
+	  goto api_simple;
+     
+     if (vecsz->rnk == 1 && expressible_as_api_many(sz))
+	  goto api_many;
+
+     goto api_guru;
+
+ api_simple:
+     switch (sz->rnk) {
+	 case 1:
+	      if (p->sign < 0) {
+		   if (verbose > 2) printf("using plan_dft_r2c_1d\n");
+		   return FFTW(plan_dft_r2c_1d)(sz->dims[0].n, 
+						(bench_real *) p->in, 
+						(bench_complex *) p->out,
+						flags);
+	      }
+	      else {
+		   if (verbose > 2) printf("using plan_dft_c2r_1d\n");
+		   return FFTW(plan_dft_c2r_1d)(sz->dims[0].n, 
+						(bench_complex *) p->in, 
+						(bench_real *) p->out,
+						flags);
+	      }
+	      break;
+	 case 2:
+	      if (p->sign < 0) {
+		   if (verbose > 2) printf("using plan_dft_r2c_2d\n");
+		   return FFTW(plan_dft_r2c_2d)(sz->dims[0].n, sz->dims[1].n,
+						(bench_real *) p->in, 
+						(bench_complex *) p->out,
+						flags);
+	      }
+	      else {
+		   if (verbose > 2) printf("using plan_dft_c2r_2d\n");
+		   return FFTW(plan_dft_c2r_2d)(sz->dims[0].n, sz->dims[1].n,
+						(bench_complex *) p->in, 
+						(bench_real *) p->out,
+						flags);
+	      }
+	      break;
+	 case 3:
+	      if (p->sign < 0) {
+		   if (verbose > 2) printf("using plan_dft_r2c_3d\n");
+		   return FFTW(plan_dft_r2c_3d)(
+			sz->dims[0].n, sz->dims[1].n, sz->dims[2].n,
+			(bench_real *) p->in, (bench_complex *) p->out,
+			flags);
+	      }
+	      else {
+		   if (verbose > 2) printf("using plan_dft_c2r_3d\n");
+		   return FFTW(plan_dft_c2r_3d)(
+			sz->dims[0].n, sz->dims[1].n, sz->dims[2].n,
+			(bench_complex *) p->in, (bench_real *) p->out,
+			flags);
+	      }
+	      break;
+	 default: {
+	      int *n = mkn(sz);
+	      if (p->sign < 0) {
+		   if (verbose > 2) printf("using plan_dft_r2c\n");
+		   pln = FFTW(plan_dft_r2c)(sz->rnk, n,
+					    (bench_real *) p->in, 
+					    (bench_complex *) p->out,
+					    flags);
+	      }
+	      else {
+		   if (verbose > 2) printf("using plan_dft_c2r\n");
+		   pln = FFTW(plan_dft_c2r)(sz->rnk, n,
+					    (bench_complex *) p->in, 
+					    (bench_real *) p->out,
+					    flags);
+	      }
+	      bench_free(n);
+	      return pln;
+	 }
+     }
+
+ api_many:
+     {
+	  int *n, *inembed, *onembed;
+	  BENCH_ASSERT(vecsz->rnk == 1);
+	  n = mkn(sz);
+	  mknembed_many(sz, &inembed, &onembed);
+	  if (p->sign < 0) {
+	       if (verbose > 2) printf("using plan_many_dft_r2c\n");
+	       pln = FFTW(plan_many_dft_r2c)(
+		    sz->rnk, n, vecsz->dims[0].n, 
+		    (bench_real *) p->in, inembed,
+		    sz->dims[sz->rnk - 1].is, vecsz->dims[0].is,
+		    (bench_complex *) p->out, onembed,
+		    sz->dims[sz->rnk - 1].os, vecsz->dims[0].os,
+		    flags);
+	  }
+	  else {
+	       if (verbose > 2) printf("using plan_many_dft_c2r\n");
+	       pln = FFTW(plan_many_dft_c2r)(
+		    sz->rnk, n, vecsz->dims[0].n, 
+		    (bench_complex *) p->in, inembed,
+		    sz->dims[sz->rnk - 1].is, vecsz->dims[0].is,
+		    (bench_real *) p->out, onembed,
+		    sz->dims[sz->rnk - 1].os, vecsz->dims[0].os,
+		    flags);
+	  }
+	  bench_free(n); bench_free(inembed); bench_free(onembed);
+	  return pln;
+     }
+
+ api_guru:
+     {
+	  FFTW(iodim) *dims, *howmany_dims;
+
+	  if (p->sign < 0) {
+	       dims = bench_tensor_to_fftw_iodim(sz);
+	       howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+	       if (verbose > 2) printf("using plan_guru_dft_r2c\n");
+	       pln = FFTW(plan_guru_dft_r2c)(sz->rnk, dims,
+					     vecsz->rnk, howmany_dims,
+					     (bench_real *) p->in,
+					     (bench_complex *) p->out,
+					     flags);
+	  }
+	  else {
+	       dims = bench_tensor_to_fftw_iodim(sz);
+	       howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+	       if (verbose > 2) printf("using plan_guru_dft_c2r\n");
+	       pln = FFTW(plan_guru_dft_c2r)(sz->rnk, dims,
+					     vecsz->rnk, howmany_dims,
+					     (bench_complex *) p->in,
+					     (bench_real *) p->out,
+					     flags);
+	  }
+	  bench_free(dims);
+	  bench_free(howmany_dims);
+	  return pln;
+     }
+}
+
+static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
+{
+     if (p->split)
+	  return mkplan_real_split(p, flags);
+     else
+	  return mkplan_real_interleaved(p, flags);
+}
+
+static FFTW(plan) mkplan_complex_split(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln;
+     bench_tensor *sz = p->sz, *vecsz = p->vecsz;
+     FFTW(iodim) *dims, *howmany_dims;
+     bench_real *ri, *ii, *ro, *io;
+
+     extract_reim_split(p->sign, p->iphyssz, (bench_real *) p->in, &ri, &ii);
+     extract_reim_split(p->sign, p->ophyssz, (bench_real *) p->out, &ro, &io);
+
+     dims = bench_tensor_to_fftw_iodim(sz);
+     howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+     if (verbose > 2) printf("using plan_guru_split_dft\n");
+     pln = FFTW(plan_guru_split_dft)(sz->rnk, dims,
+			       vecsz->rnk, howmany_dims,
+			       ri, ii, ro, io, flags);
+     bench_free(dims);
+     bench_free(howmany_dims);
+     return pln;
+}
+
+static FFTW(plan) mkplan_complex_interleaved(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln;
+     bench_tensor *sz = p->sz, *vecsz = p->vecsz;
+
+     if (vecsz->rnk == 0 && tensor_unitstridep(sz) && tensor_rowmajorp(sz)) 
+	  goto api_simple;
+     
+     if (vecsz->rnk == 1 && expressible_as_api_many(sz))
+	  goto api_many;
+
+     goto api_guru;
+
+ api_simple:
+     switch (sz->rnk) {
+	 case 1:
+	      if (verbose > 2) printf("using plan_dft_1d\n");
+	      return FFTW(plan_dft_1d)(sz->dims[0].n, 
+				       (bench_complex *) p->in,
+				       (bench_complex *) p->out, 
+				       p->sign, flags);
+	      break;
+	 case 2:
+	      if (verbose > 2) printf("using plan_dft_2d\n");
+	      return FFTW(plan_dft_2d)(sz->dims[0].n, sz->dims[1].n,
+				       (bench_complex *) p->in,
+				       (bench_complex *) p->out, 
+				       p->sign, flags);
+	      break;
+	 case 3:
+	      if (verbose > 2) printf("using plan_dft_3d\n");
+	      return FFTW(plan_dft_3d)(
+		   sz->dims[0].n, sz->dims[1].n, sz->dims[2].n,
+		   (bench_complex *) p->in, (bench_complex *) p->out, 
+		   p->sign, flags);
+	      break;
+	 default: {
+	      int *n = mkn(sz);
+	      if (verbose > 2) printf("using plan_dft\n");
+	      pln = FFTW(plan_dft)(sz->rnk, n, 
+				   (bench_complex *) p->in, 
+				   (bench_complex *) p->out, p->sign, flags);
+	      bench_free(n);
+	      return pln;
+	 }
+     }
+
+ api_many:
+     {
+	  int *n, *inembed, *onembed;
+	  BENCH_ASSERT(vecsz->rnk == 1);
+	  n = mkn(sz);
+	  mknembed_many(sz, &inembed, &onembed);
+	  if (verbose > 2) printf("using plan_many_dft\n");
+	  pln = FFTW(plan_many_dft)(
+	       sz->rnk, n, vecsz->dims[0].n, 
+	       (bench_complex *) p->in, 
+	       inembed, sz->dims[sz->rnk - 1].is, vecsz->dims[0].is,
+	       (bench_complex *) p->out,
+	       onembed, sz->dims[sz->rnk - 1].os, vecsz->dims[0].os,
+	       p->sign, flags);
+	  bench_free(n); bench_free(inembed); bench_free(onembed);
+	  return pln;
+     }
+
+ api_guru:
+     {
+	  FFTW(iodim) *dims, *howmany_dims;
+
+	  dims = bench_tensor_to_fftw_iodim(sz);
+	  howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+	  if (verbose > 2) printf("using plan_guru_dft\n");
+	  pln = FFTW(plan_guru_dft)(sz->rnk, dims,
+				    vecsz->rnk, howmany_dims,
+				    (bench_complex *) p->in,
+				    (bench_complex *) p->out,
+				    p->sign, flags);
+	  bench_free(dims);
+	  bench_free(howmany_dims);
+	  return pln;
+     }
+}
+
+static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
+{
+     if (p->split)
+	  return mkplan_complex_split(p, flags);
+     else
+	  return mkplan_complex_interleaved(p, flags);
+}
+
+static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln;
+     bench_tensor *sz = p->sz, *vecsz = p->vecsz;
+     FFTW(r2r_kind) *k;
+
+     k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * sz->rnk);
+     {
+	  int i;
+	  for (i = 0; i < sz->rnk; ++i)
+	       switch (p->k[i]) {
+		   case R2R_R2HC: k[i] = FFTW_R2HC; break;
+		   case R2R_HC2R: k[i] = FFTW_HC2R; break;
+		   case R2R_DHT: k[i] = FFTW_DHT; break;
+		   case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
+		   case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
+		   case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
+		   case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
+		   case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
+		   case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
+		   case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
+		   case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
+		   default: BENCH_ASSERT(0);
+	       }
+     }
+
+     if (vecsz->rnk == 0 && tensor_unitstridep(sz) && tensor_rowmajorp(sz)) 
+	  goto api_simple;
+     
+     if (vecsz->rnk == 1 && expressible_as_api_many(sz))
+	  goto api_many;
+
+     goto api_guru;
+
+ api_simple:
+     switch (sz->rnk) {
+	 case 1:
+	      if (verbose > 2) printf("using plan_r2r_1d\n");
+	      pln = FFTW(plan_r2r_1d)(sz->dims[0].n, 
+				      (bench_real *) p->in,
+				      (bench_real *) p->out, 
+				      k[0], flags);
+	      goto done;
+	 case 2:
+	      if (verbose > 2) printf("using plan_r2r_2d\n");
+	      pln = FFTW(plan_r2r_2d)(sz->dims[0].n, sz->dims[1].n,
+				      (bench_real *) p->in,
+				      (bench_real *) p->out, 
+				      k[0], k[1], flags);
+	      goto done;
+	 case 3:
+	      if (verbose > 2) printf("using plan_r2r_3d\n");
+	      pln = FFTW(plan_r2r_3d)(
+		   sz->dims[0].n, sz->dims[1].n, sz->dims[2].n,
+		   (bench_real *) p->in, (bench_real *) p->out, 
+		   k[0], k[1], k[2], flags);
+	      goto done;
+	 default: {
+	      int *n = mkn(sz);
+	      if (verbose > 2) printf("using plan_r2r\n");
+	      pln = FFTW(plan_r2r)(sz->rnk, n,
+				   (bench_real *) p->in, (bench_real *) p->out,
+				   k, flags);
+	      bench_free(n);
+	      goto done;
+	 }
+     }
+
+ api_many:
+     {
+	  int *n, *inembed, *onembed;
+	  BENCH_ASSERT(vecsz->rnk == 1);
+	  n = mkn(sz);
+	  mknembed_many(sz, &inembed, &onembed);
+	  if (verbose > 2) printf("using plan_many_r2r\n");
+	  pln = FFTW(plan_many_r2r)(
+	       sz->rnk, n, vecsz->dims[0].n, 
+	       (bench_real *) p->in,
+	       inembed, sz->dims[sz->rnk - 1].is, vecsz->dims[0].is,
+	       (bench_real *) p->out,
+	       onembed, sz->dims[sz->rnk - 1].os, vecsz->dims[0].os,
+	       k, flags);
+	  bench_free(n); bench_free(inembed); bench_free(onembed);
+	  goto done;
+     }
+
+ api_guru:
+     {
+	  FFTW(iodim) *dims, *howmany_dims;
+
+	  dims = bench_tensor_to_fftw_iodim(sz);
+	  howmany_dims = bench_tensor_to_fftw_iodim(vecsz);
+	  if (verbose > 2) printf("using plan_guru_r2r\n");
+	  pln = FFTW(plan_guru_r2r)(sz->rnk, dims,
+				    vecsz->rnk, howmany_dims,
+				    (bench_real *) p->in, 
+				    (bench_real *) p->out, k, flags);
+	  bench_free(dims);
+	  bench_free(howmany_dims);
+	  goto done;
+     }
+     
+ done:
+     bench_free(k);
+     return pln;
+}
+
+FFTW(plan) mkplan(bench_problem *p, unsigned flags)
+{
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX:	  return mkplan_complex(p, flags);
+	 case PROBLEM_REAL:	  return mkplan_real(p, flags);
+	 case PROBLEM_R2R:        return mkplan_r2r(p, flags);
+	 default: BENCH_ASSERT(0); return 0;
+     }
+}
+
+void main_init(int *argc, char ***argv)
+{
+     UNUSED(argc);
+     UNUSED(argv);
+}
+
+void initial_cleanup(void)
+{
+}
+
+void final_cleanup(void)
+{
+}
+
+int import_wisdom(FILE *f)
+{
+     return FFTW(import_wisdom_from_file)(f);
+}
+
+void export_wisdom(FILE *f)
+{
+     FFTW(export_wisdom_to_file)(f);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/check.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/check.pl	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,308 @@
+#! /usr/bin/perl -w
+
+$program = "./bench";
+$default_options = "";
+$verbose = 0;
+$paranoid = 0;
+$exhaustive = 0;
+$patient = 0;
+$estimate = 0;
+$wisdom = 0;
+$nthreads = 1;
+$rounds = 0;
+$maxsize = 60000;
+$maxcount = 100;
+$do_0d = 0;
+$do_1d = 0;
+$do_2d = 0;
+$do_random = 0;
+$keepgoing = 0;
+$flushcount = 42;
+
+$mpi = 0;
+$mpi_transposed_in = 0;
+$mpi_transposed_out = 0;
+
+sub make_options {
+    my $options = $default_options;
+    $options = "--verify-rounds=$rounds $options" if $rounds;
+    $options = "--verbose=$verbose $options" if $verbose;
+    $options = "-o paranoid $options" if $paranoid;
+    $options = "-o exhaustive $options" if $exhaustive;
+    $options = "-o patient $options" if $patient;
+    $options = "-o estimate $options" if $estimate;
+    $options = "-o wisdom $options" if $wisdom;
+    $options = "-o nthreads=$nthreads $options" if ($nthreads > 1);
+    $options = "-obflag=30 $options" if $mpi_transposed_in;
+    $options = "-obflag=31 $options" if $mpi_transposed_out;
+    return $options;
+}
+
+@list_of_problems = ();
+
+sub flush_problems {
+    my $options = shift;
+    my $problist = "";
+
+    if ($#list_of_problems >= 0) {
+	for (@list_of_problems) {
+	    $problist = "$problist --verify '$_'";
+	}
+	print "Executing \"$program $options $problist\"\n" 
+	    if $verbose;
+	
+	system("$program $options $problist");
+	$exit_value  = $? >> 8;
+	$signal_num  = $? & 127;
+	$dumped_core = $? & 128;
+
+	if ($signal_num == 1) {
+	    print "hangup\n";
+	    exit 0;
+	}
+	if ($signal_num == 2) {
+	    print "interrupted\n";
+	    exit 0;
+	}
+	if ($signal_num == 9) {
+	    print "killed\n";
+	    exit 0;
+	}
+
+	if ($exit_value != 0 || $dumped_core || $signal_num) {
+	    print "FAILED $program: $problist\n";
+	    if ($signal_num) { print "received signal $signal_num\n"; }
+	    exit 1 unless $keepgoing;
+	}
+	@list_of_problems = ();
+    }
+}
+
+sub do_problem {
+    my $problem = shift;
+    my $doablep = shift;
+    my $options = &make_options;
+
+    if ($problem =~ /\// && $problem =~ /r/
+	&& ($problem =~ /i.*x/
+	    || $problem =~ /v/ || $problem =~ /\*/)) {
+	return; # cannot do real split inplace-multidimensional or vector
+    }
+
+    # in --mpi mode, restrict to problems supported by MPI code
+    if ($mpi) {
+	if ($problem =~ /\//) { return; } # no split
+	if ($problem =~ /\*/) { return; } # no non-contiguous vectors
+	if ($problem =~ /r/ && $problem !~ /x/) { return; } # no 1d r2c
+	if ($problem =~ /k/ && $problem !~ /x/) { return; } # no 1d r2r
+	if ($mpi_transposed_in || $problem =~ /\[/) {
+	    if ($problem !~ /x/) { return; } # no 1d transposed_in
+	    if ($problem =~ /r/ && $problem !~ /b/) { return; } # only c2r
+	}
+	if ($mpi_transposed_out || $problem =~ /\]/) {
+	    if ($problem !~ /x/) { return; } # no 1d transposed_out
+	    if ($problem =~ /r/ && $problem =~ /b/) { return; } # only r2c
+	}
+    }
+
+    # size-1 redft00 is not defined/doable
+    return if ($problem =~ /[^0-9]1e00/);
+    
+    if ($doablep) {
+	@list_of_problems = ($problem, @list_of_problems);
+	&flush_problems($options) if ($#list_of_problems > $flushcount);
+    } else {
+	print "Executing \"$program $options --can-do $problem\"\n" 
+	    if $verbose;
+	$result=`$program $options --can-do $problem`;
+	if ($result ne "#f\n" && $result ne "#f\r\n") {
+	    print "FAILED $program: $problem is not undoable\n";
+	    exit 1 unless $keepgoing;
+	}
+    }
+}
+
+# given geometry, try both directions and in place/out of place
+sub do_geometry {
+    my $geom = shift;
+    my $doablep = shift;
+    do_problem("if$geom", $doablep);
+    do_problem("of$geom", $doablep);
+    do_problem("ib$geom", $doablep);
+    do_problem("ob$geom", $doablep);
+    do_problem("//if$geom", $doablep);
+    do_problem("//of$geom", $doablep);
+    do_problem("//ib$geom", $doablep);
+    do_problem("//ob$geom", $doablep);
+}
+
+# given size, try all transform kinds (complex, real, etc.)
+sub do_size {
+    my $size = shift;
+    my $doablep = shift;
+    do_geometry("c$size", $doablep);
+    do_geometry("r$size", $doablep);
+}
+
+sub small_0d {
+    for ($i = 0; $i <= 16; ++$i) {
+	for ($j = 0; $j <= 16; ++$j) {
+	    for ($vl = 1; $vl <= 5; ++$vl) {
+		my $ivl = $i * $vl;
+		my $jvl = $j * $vl;
+		do_problem("o1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+		do_problem("i1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+		do_problem("ok1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+		do_problem("ik1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+	    }
+	}
+    }
+}
+
+sub small_1d {
+    do_size (0, 0);
+    for ($i = 1; $i <= 100; ++$i) {
+	do_size ($i, 1);
+    }
+    do_size (128, 1);
+    do_size (256, 1);
+    do_size (512, 1);
+    do_size (1024, 1);
+    do_size (2048, 1);
+    do_size (4096, 1);
+}
+
+sub small_2d {
+    do_size ("0x0", 0);
+    for ($i = 1; $i <= 100; ++$i) {
+	my $ub = 900/$i;
+	$ub = 100 if $ub > 100;
+	for ($j = 1; $j <= $ub; ++$j) {
+	    do_size ("${i}x${j}", 1);
+	}
+    }
+}
+
+sub rand_small_factors {
+    my $l = shift;
+    my $n = 1;
+    my $maxfactor = 13;
+    my $f = int(rand($maxfactor) + 1);
+    while ($n * $f < $l) {
+	$n *= $f;
+	$f = int(rand($maxfactor) + 1);
+    };
+    return $n;
+}
+
+# way too complicated...
+sub one_random_test {
+    my $q = int(2 + rand($maxsize));
+    my $rnk = int(1 + rand(4));
+    my $vtype = int(rand(3));
+    my $g = int(2 + exp(log($q) / ($rnk + ($vtype > 0))));
+    my $first = 1;
+    my $sz = "";
+    my $is_r2r = shift;
+    my @r2r_kinds = ("f", "b", "h",
+		     "e00", "e01", "e10", "e11", "o00", "o01", "o10", "o11");
+
+    while ($q > 1 && $rnk > 0) {
+	my $r = rand_small_factors(int(rand($g) + 10));
+	if ($r > 1) {
+	    $sz = "${sz}x" if (!$first);
+	    $first = 0;
+	    $sz = "${sz}${r}";
+	    if ($is_r2r) {
+		my $k = $r2r_kinds[int(1 + rand($#r2r_kinds))];
+		$sz = "${sz}${k}";
+	    }
+	    $q = int($q / $r);
+	    if ($g > $q) { $g = $q; }
+	    --$rnk;
+	}
+    }
+    if ($vtype > 0 && $g > 1) {
+	my $v = int(1 + rand($g));
+	$sz = "${sz}*${v}" if ($vtype == 1);
+	$sz = "${sz}v${v}" if ($vtype == 2);
+    }
+    if ($mpi) {
+	my $stype = int(rand(3));
+	$sz = "]${sz}" if ($stype == 1);
+	$sz = "[${sz}" if ($stype == 2);
+    }
+    $sz = "d$sz" if (int(rand(3)) == 0);
+    if ($is_r2r) {
+	do_problem("ik$sz", 1);
+	do_problem("ok$sz", 1);
+    }
+    else {
+	do_size($sz, 1);
+    }
+}
+
+sub random_tests {
+    my $i;
+    for ($i = 0; $i < $maxcount; ++$i) {
+	&one_random_test(0);
+	&one_random_test(1);
+    }
+}
+
+sub parse_arguments (@)
+{
+    local (@arglist) = @_;
+
+    while (@arglist)
+    {
+	if ($arglist[0] eq '-v') { ++$verbose; }
+	elsif ($arglist[0] eq '--verbose') { ++$verbose; }
+	elsif ($arglist[0] eq '-p') { ++$paranoid; }
+	elsif ($arglist[0] eq '--paranoid') { ++$paranoid; }
+	elsif ($arglist[0] eq '--exhaustive') { ++$exhaustive; }
+	elsif ($arglist[0] eq '--patient') { ++$patient; }
+	elsif ($arglist[0] eq '--estimate') { ++$estimate; }
+	elsif ($arglist[0] eq '--wisdom') { ++$wisdom; }
+	elsif ($arglist[0] =~ /^--nthreads=(.+)$/) { $nthreads = $1; }
+	elsif ($arglist[0] eq '-k') { ++$keepgoing; }
+	elsif ($arglist[0] eq '--keep-going') { ++$keepgoing; }
+	elsif ($arglist[0] =~ /^--verify-rounds=(.+)$/) { $rounds = $1; }
+	elsif ($arglist[0] =~ /^--count=(.+)$/) { $maxcount = $1; }
+	elsif ($arglist[0] =~ /^-c=(.+)$/) { $maxcount = $1; }
+	elsif ($arglist[0] =~ /^--flushcount=(.+)$/) { $flushcount = $1; }
+	elsif ($arglist[0] =~ /^--maxsize=(.+)$/) { $maxsize = $1; }
+
+	elsif ($arglist[0] eq '--mpi') { ++$mpi; }
+	elsif ($arglist[0] eq '--mpi-transposed-in') {
+	    ++$mpi; ++$mpi_transposed_in; }
+	elsif ($arglist[0] eq '--mpi-transposed-out') {
+	    ++$mpi; ++$mpi_transposed_out; }
+	
+	elsif ($arglist[0] eq '-0d') { ++$do_0d; }
+	elsif ($arglist[0] eq '-1d') { ++$do_1d; }
+	elsif ($arglist[0] eq '-2d') { ++$do_2d; }
+	elsif ($arglist[0] eq '-r') { ++$do_random; }
+	elsif ($arglist[0] eq '--random') { ++$do_random; }
+	elsif ($arglist[0] eq '-a') { 
+	    ++$do_0d; ++$do_1d; ++$do_2d; ++$do_random; 
+	}
+
+	else { $program=$arglist[0]; }
+	shift (@arglist);
+    }
+}
+
+# MAIN PROGRAM:
+
+&parse_arguments (@ARGV);
+
+&random_tests if $do_random;
+&small_0d if $do_0d;
+&small_1d if $do_1d;
+&small_2d if $do_2d;
+
+{
+    my $options = &make_options;
+    &flush_problems($options);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/fftw-bench.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/fftw-bench.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,242 @@
+/* See bench.c.  We keep a few common subroutines in this file so
+   that they can be re-used in the MPI test program. */
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "fftw-bench.h"
+
+#ifdef _OPENMP
+#  include <omp.h>
+#endif
+
+#ifdef HAVE_SMP
+int threads_ok = 1;
+#endif
+
+FFTW(plan) the_plan = 0;
+
+static const char *wisdat = "wis.dat";
+unsigned the_flags = 0;
+int paranoid = 0;
+int usewisdom = 0;
+int havewisdom = 0;
+int nthreads = 1;
+int amnesia = 0;
+
+extern void install_hook(void);  /* in hook.c */
+extern void uninstall_hook(void);  /* in hook.c */
+
+#ifdef FFTW_RANDOM_ESTIMATOR
+extern unsigned FFTW(random_estimate_seed);
+#endif
+
+void useropt(const char *arg)
+{
+     int x;
+     double y;
+
+     if (!strcmp(arg, "patient")) the_flags |= FFTW_PATIENT;
+     else if (!strcmp(arg, "estimate")) the_flags |= FFTW_ESTIMATE;
+     else if (!strcmp(arg, "estimatepat")) the_flags |= FFTW_ESTIMATE_PATIENT;
+     else if (!strcmp(arg, "exhaustive")) the_flags |= FFTW_EXHAUSTIVE;
+     else if (!strcmp(arg, "unaligned")) the_flags |= FFTW_UNALIGNED;
+     else if (!strcmp(arg, "nosimd")) the_flags |= FFTW_NO_SIMD;
+     else if (!strcmp(arg, "noindirectop")) the_flags |= FFTW_NO_INDIRECT_OP;
+     else if (!strcmp(arg, "wisdom-only")) the_flags |= FFTW_WISDOM_ONLY;
+     else if (sscanf(arg, "flag=%d", &x) == 1) the_flags |= x;
+     else if (sscanf(arg, "bflag=%d", &x) == 1) the_flags |= 1U << x;
+     else if (!strcmp(arg, "paranoid")) paranoid = 1;
+     else if (!strcmp(arg, "wisdom")) usewisdom = 1;
+     else if (!strcmp(arg, "amnesia")) amnesia = 1;
+     else if (sscanf(arg, "nthreads=%d", &x) == 1) nthreads = x;
+#ifdef FFTW_RANDOM_ESTIMATOR
+     else if (sscanf(arg, "eseed=%d", &x) == 1) FFTW(random_estimate_seed) = x;
+#endif
+     else if (sscanf(arg, "timelimit=%lg", &y) == 1) {
+	  FFTW(set_timelimit)(y);
+     }
+
+     else fprintf(stderr, "unknown user option: %s.  Ignoring.\n", arg);
+}
+
+void rdwisdom(void)
+{
+     FILE *f;
+     double tim;
+     int success = 0;
+
+     if (havewisdom) return;
+
+#ifdef HAVE_SMP
+     if (threads_ok) {
+	  BENCH_ASSERT(FFTW(init_threads)());
+	  FFTW(plan_with_nthreads)(nthreads);
+#ifdef _OPENMP
+	  omp_set_num_threads(nthreads);
+#endif
+     }
+     else if (nthreads > 1 && verbose > 1) {
+	  fprintf(stderr, "bench: WARNING - nthreads = %d, but threads not supported\n", nthreads);
+	  nthreads = 1;
+     }
+#endif
+
+     if (!usewisdom) return;
+
+     timer_start(USER_TIMER);
+     if ((f = fopen(wisdat, "r"))) {
+	  if (!import_wisdom(f))
+	       fprintf(stderr, "bench: ERROR reading wisdom\n");
+	  else
+	       success = 1;
+	  fclose(f);
+     }
+     tim = timer_stop(USER_TIMER);
+
+     if (success) {
+	  if (verbose > 1) printf("READ WISDOM (%g seconds): ", tim);
+	  
+	  if (verbose > 3)
+	       export_wisdom(stdout);
+	  if (verbose > 1)
+	       printf("\n");
+     }
+     havewisdom = 1;
+}
+
+void wrwisdom(void)
+{
+     FILE *f;
+     double tim;
+     if (!havewisdom) return;
+
+     timer_start(USER_TIMER);
+     if ((f = fopen(wisdat, "w"))) {
+	  export_wisdom(f);
+	  fclose(f);
+     }
+     tim = timer_stop(USER_TIMER);
+     if (verbose > 1) printf("write wisdom took %g seconds\n", tim);
+}
+
+static unsigned preserve_input_flags(bench_problem *p)
+{
+     /*
+      * fftw3 cannot preserve input for multidimensional c2r transforms.
+      * Enforce FFTW_DESTROY_INPUT
+      */
+     if (p->kind == PROBLEM_REAL && 
+	 p->sign > 0 && 
+	 !p->in_place && 
+	 p->sz->rnk > 1)
+	  p->destroy_input = 1;
+
+     if (p->destroy_input)
+	  return FFTW_DESTROY_INPUT;
+     else
+	  return FFTW_PRESERVE_INPUT;
+}
+
+int can_do(bench_problem *p)
+{
+     double tim;
+
+     if (verbose > 2 && p->pstring)
+	  printf("Planning %s...\n", p->pstring);
+     rdwisdom();
+
+     timer_start(USER_TIMER);
+     the_plan = mkplan(p, preserve_input_flags(p) | the_flags | FFTW_ESTIMATE);
+     tim = timer_stop(USER_TIMER);
+     if (verbose > 2) printf("estimate-planner time: %g s\n", tim);
+
+     if (the_plan) {
+	  FFTW(destroy_plan)(the_plan);
+	  return 1;
+     }
+     return 0;
+}
+
+void setup(bench_problem *p)
+{
+     double tim;
+
+     if (amnesia) {
+	  FFTW(forget_wisdom)();
+	  havewisdom = 0;
+     }
+
+     /* Regression test: check that fftw_malloc exists and links
+      * properly */
+     FFTW(free(FFTW(malloc(42))));
+
+     rdwisdom();
+     install_hook();
+
+#ifdef HAVE_SMP
+     if (verbose > 1 && nthreads > 1) printf("NTHREADS = %d\n", nthreads);
+#endif
+
+     timer_start(USER_TIMER);
+     the_plan = mkplan(p, preserve_input_flags(p) | the_flags);
+     tim = timer_stop(USER_TIMER);
+     if (verbose > 1) printf("planner time: %g s\n", tim);
+
+     BENCH_ASSERT(the_plan);
+     
+     {
+	  double add, mul, nfma, cost, pcost;
+	  FFTW(flops)(the_plan, &add, &mul, &nfma);
+	  cost = FFTW(estimate_cost)(the_plan);
+	  pcost = FFTW(cost)(the_plan);
+	  if (verbose > 1) {
+	       FFTW(print_plan)(the_plan);
+	       printf("\n");
+	       printf("flops: %0.0f add, %0.0f mul, %0.0f fma\n",
+		      add, mul, nfma);
+	       printf("estimated cost: %f, pcost = %f\n", cost, pcost);
+	  }
+     }
+}
+
+
+void doit(int iter, bench_problem *p)
+{
+     int i;
+     FFTW(plan) q = the_plan;
+
+     UNUSED(p);
+     for (i = 0; i < iter; ++i) 
+	  FFTW(execute)(q);
+}
+
+void done(bench_problem *p)
+{
+     UNUSED(p);
+
+     FFTW(destroy_plan)(the_plan);
+     uninstall_hook();
+}
+
+void cleanup(void)
+{
+     initial_cleanup();
+
+     wrwisdom();
+#ifdef HAVE_SMP
+     FFTW(cleanup_threads)();
+#else
+     FFTW(cleanup)();
+#endif
+
+#    ifdef FFTW_DEBUG_MALLOC
+     {
+	  /* undocumented memory checker */
+	  FFTW_EXTERN void FFTW(malloc_print_minfo)(int v);
+	  FFTW(malloc_print_minfo)(verbose);
+     }
+#    endif
+
+     final_cleanup();
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/fftw-bench.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/fftw-bench.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,37 @@
+/* declarations of common subroutines, etc. for use with FFTW
+   self-test/benchmark program (see bench.c). */
+
+#include "bench-user.h"
+#include "fftw3.h"
+
+#define CONCAT(prefix, name) prefix ## name
+#if defined(BENCHFFT_SINGLE)
+#define FFTW(x) CONCAT(fftwf_, x)
+#elif defined(BENCHFFT_LDOUBLE)
+#define FFTW(x) CONCAT(fftwl_, x)
+#elif defined(BENCHFFT_QUAD)
+#define FFTW(x) CONCAT(fftwq_, x)
+#else
+#define FFTW(x) CONCAT(fftw_, x)
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+extern FFTW(plan) mkplan(bench_problem *p, unsigned flags);
+extern void initial_cleanup(void);
+extern void final_cleanup(void);
+extern int import_wisdom(FILE *f);
+extern void export_wisdom(FILE *f);
+
+#if defined(HAVE_THREADS) || defined(HAVE_OPENMP)
+#  define HAVE_SMP
+   extern int threads_ok;
+#endif
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tests/hook.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tests/hook.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,259 @@
+/* fftw hook to be used in the benchmark program.  
+   
+   We keep it in a separate file because 
+
+   1) bench.c is supposed to test the API---we do not want to #include
+      "ifftw.h" and accidentally use internal symbols/macros.
+   2) this code is a royal mess.  The messiness is due to
+      A) confusion between internal fftw tensors and bench_tensor's
+         (which we want to keep separate because the benchmark
+	  program tests other routines too)
+      B) despite A), our desire to recycle the libbench verifier.
+*/
+
+#include <stdio.h>
+#include "bench-user.h"
+
+#define CALLING_FFTW /* hack for Windows DLL nonsense */
+#include "api.h"
+#include "dft.h"
+#include "rdft.h"
+
+extern int paranoid; /* in bench.c */
+extern X(plan) the_plan; /* in bench.c */
+
+/*
+  transform an fftw tensor into a bench_tensor.
+*/
+static bench_tensor *fftw_tensor_to_bench_tensor(tensor *t)
+{
+     bench_tensor *bt = mktensor(t->rnk);
+
+     if (FINITE_RNK(t->rnk)) {
+	  int i;
+	  for (i = 0; i < t->rnk; ++i) {
+	       /* FIXME: 64-bit unclean because of INT -> int conversion */
+	       bt->dims[i].n = t->dims[i].n;
+	       bt->dims[i].is = t->dims[i].is;
+	       bt->dims[i].os = t->dims[i].os;
+	       BENCH_ASSERT(bt->dims[i].n == t->dims[i].n);
+	       BENCH_ASSERT(bt->dims[i].is == t->dims[i].is);
+	       BENCH_ASSERT(bt->dims[i].os == t->dims[i].os);
+	  }
+     }
+     return bt;
+}
+
+/*
+  transform an fftw problem into a bench_problem.
+*/
+static bench_problem *fftw_problem_to_bench_problem(planner *plnr,
+						    const problem *p_)
+{
+     bench_problem *bp = 0;
+     switch (p_->adt->problem_kind) {
+	 case PROBLEM_DFT:
+	 {
+	      const problem_dft *p = (const problem_dft *) p_;
+	  
+	      if (!p->ri || !p->ii)
+		   abort();
+
+	      bp = (bench_problem *) bench_malloc(sizeof(bench_problem));
+
+	      bp->kind = PROBLEM_COMPLEX;
+	      bp->sign = FFT_SIGN;
+	      bp->split = 1; /* tensor strides are in R's, not C's */
+	      bp->in = UNTAINT(p->ri);
+	      bp->out = UNTAINT(p->ro);
+	      bp->ini = UNTAINT(p->ii);
+	      bp->outi = UNTAINT(p->io);
+	      bp->inphys = bp->outphys = 0;
+	      bp->iphyssz = bp->ophyssz = 0;
+	      bp->in_place = p->ri == p->ro;
+	      bp->sz = fftw_tensor_to_bench_tensor(p->sz);
+	      bp->vecsz = fftw_tensor_to_bench_tensor(p->vecsz);
+	      bp->k = 0;
+	      break;
+	 }
+	 case PROBLEM_RDFT:
+	 {
+	      const problem_rdft *p = (const problem_rdft *) p_;
+	      int i;
+
+	      if (!p->I || !p->O)
+		   abort();
+
+	      for (i = 0; i < p->sz->rnk; ++i)
+		   switch (p->kind[i]) {
+		       case R2HC01:
+		       case R2HC10:
+		       case R2HC11:
+		       case HC2R01:
+		       case HC2R10:
+		       case HC2R11:
+			    return bp;
+		       default:
+			    ;
+		   }
+	  
+	      bp = (bench_problem *) bench_malloc(sizeof(bench_problem));
+
+	      bp->kind = PROBLEM_R2R;
+	      bp->sign = FFT_SIGN;
+	      bp->split = 0;
+	      bp->in = UNTAINT(p->I);
+	      bp->out = UNTAINT(p->O);
+	      bp->ini = bp->outi = 0;
+	      bp->inphys = bp->outphys = 0;
+	      bp->iphyssz = bp->ophyssz = 0;
+	      bp->in_place = p->I == p->O;
+	      bp->sz = fftw_tensor_to_bench_tensor(p->sz);
+	      bp->vecsz = fftw_tensor_to_bench_tensor(p->vecsz);
+	      bp->k = (r2r_kind_t *) bench_malloc(sizeof(r2r_kind_t) * p->sz->rnk);
+	      for (i = 0; i < p->sz->rnk; ++i)
+		   switch (p->kind[i]) {
+		       case R2HC: bp->k[i] = R2R_R2HC; break;
+		       case HC2R: bp->k[i] = R2R_HC2R; break;
+		       case DHT: bp->k[i] = R2R_DHT; break;
+		       case REDFT00: bp->k[i] = R2R_REDFT00; break;
+		       case REDFT01: bp->k[i] = R2R_REDFT01; break;
+		       case REDFT10: bp->k[i] = R2R_REDFT10; break;
+		       case REDFT11: bp->k[i] = R2R_REDFT11; break;
+		       case RODFT00: bp->k[i] = R2R_RODFT00; break;
+		       case RODFT01: bp->k[i] = R2R_RODFT01; break;
+		       case RODFT10: bp->k[i] = R2R_RODFT10; break;
+		       case RODFT11: bp->k[i] = R2R_RODFT11; break;
+		       default: CK(0);
+		   }
+	      break;
+	 }
+	 case PROBLEM_RDFT2:
+	 {
+	      const problem_rdft2 *p = (const problem_rdft2 *) p_;
+	      int rnk = p->sz->rnk;
+	  
+	      if (!p->r0 || !p->r1 || !p->cr || !p->ci)
+		   abort();
+	      
+	      /* give up verifying rdft2 R2HCII */
+	      if (p->kind != R2HC && p->kind != HC2R)
+		   return bp;
+
+	      if (rnk > 0) {
+		   /* can't verify separate even/odd arrays for now */
+		   if (2 * (p->r1 - p->r0) !=
+		       ((p->kind == R2HC) ? 
+			p->sz->dims[rnk-1].is : p->sz->dims[rnk-1].os))
+			return bp;
+	      }
+
+	      bp = (bench_problem *) bench_malloc(sizeof(bench_problem));
+
+	      bp->kind = PROBLEM_REAL;
+	      bp->sign = p->kind == R2HC ? FFT_SIGN : -FFT_SIGN;
+	      bp->split = 1; /* tensor strides are in R's, not C's */
+	      if (p->kind == R2HC) {
+		   bp->sign = FFT_SIGN;
+		   bp->in = UNTAINT(p->r0);
+		   bp->out = UNTAINT(p->cr);
+		   bp->ini = 0;
+		   bp->outi = UNTAINT(p->ci);
+	      }
+	      else {
+		   bp->sign = -FFT_SIGN;
+		   bp->out = UNTAINT(p->r0);
+		   bp->in = UNTAINT(p->cr);
+		   bp->outi = 0;
+		   bp->ini = UNTAINT(p->ci);
+	      }
+	      bp->inphys = bp->outphys = 0;
+	      bp->iphyssz = bp->ophyssz = 0;
+	      bp->in_place = p->r0 == p->cr;
+	      bp->sz = fftw_tensor_to_bench_tensor(p->sz);
+	      if (rnk > 0) {
+		   if (p->kind == R2HC)
+			bp->sz->dims[rnk-1].is /= 2;
+		   else 
+			bp->sz->dims[rnk-1].os /= 2;
+	      }
+	      bp->vecsz = fftw_tensor_to_bench_tensor(p->vecsz);
+	      bp->k = 0;
+	      break;
+	 }
+	 default: 
+	      abort();
+     }
+
+     bp->userinfo = 0;
+     bp->pstring = 0;
+     bp->destroy_input = !NO_DESTROY_INPUTP(plnr);
+
+     return bp;
+}
+
+static void hook(planner *plnr, plan *pln, const problem *p_, int optimalp)
+{
+     int rounds = 5;
+     double tol = SINGLE_PRECISION ? 1.0e-3 : 1.0e-10;
+     UNUSED(optimalp);
+
+     if (verbose > 5) {
+	  printer *pr = X(mkprinter_file)(stdout);
+	  pr->print(pr, "%P:%(%p%)\n", p_, pln);
+	  X(printer_destroy)(pr);
+	  printf("cost %g  \n\n", pln->pcost);
+     }
+
+     if (paranoid) {
+	  bench_problem *bp;
+
+	  bp = fftw_problem_to_bench_problem(plnr, p_);
+	  if (bp) {
+	       X(plan) the_plan_save = the_plan;
+
+	       the_plan = (apiplan *) MALLOC(sizeof(apiplan), PLANS);
+	       the_plan->pln = pln;
+	       the_plan->prb = (problem *) p_;
+
+	       X(plan_awake)(pln, AWAKE_SQRTN_TABLE);
+	       verify_problem(bp, rounds, tol);
+	       X(plan_awake)(pln, SLEEPY);
+
+	       X(ifree)(the_plan);
+	       the_plan = the_plan_save;
+
+	       problem_destroy(bp);
+	  }
+
+     }
+}
+
+static void paranoid_checks(void)
+{
+     /* FIXME: assumes char = 8 bits, which is false on at least one
+	DSP I know of. */
+#if 0
+     /* if flags_t is not 64 bits i want to know it. */
+     CK(sizeof(flags_t) == 8);
+
+     CK(sizeof(md5uint) >= 4);
+#endif
+
+     CK(sizeof(uintptr_t) >= sizeof(R *));
+
+     CK(sizeof(INT) >= sizeof(R *));
+}
+
+void install_hook(void)
+{
+     planner *plnr = X(the_planner)();
+     plnr->hook = hook;
+     paranoid_checks();
+}
+
+void uninstall_hook(void)
+{
+     planner *plnr = X(the_planner)();
+     plnr->hook = 0;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,40 @@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/api
+AM_CFLAGS = $(STACK_ALIGN_CFLAGS)
+
+if OPENMP
+FFTWOMPLIB = libfftw3@PREC_SUFFIX@_omp.la
+else
+FFTWOMPLIB = 
+endif
+
+if THREADS
+if COMBINED_THREADS
+noinst_LTLIBRARIES = libfftw3@PREC_SUFFIX@_threads.la
+else
+lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_threads.la $(FFTWOMPLIB)
+endif
+else
+lib_LTLIBRARIES = $(FFTWOMPLIB)
+endif
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = threads.h
+
+libfftw3@PREC_SUFFIX@_threads_la_SOURCES = api.c conf.c threads.c	\
+threads.h dft-vrank-geq1.c ct.c rdft-vrank-geq1.c hc2hc.c		\
+vrank-geq1-rdft2.c f77api.c f77funcs.h
+libfftw3@PREC_SUFFIX@_threads_la_CFLAGS = $(AM_CFLAGS) $(PTHREAD_CFLAGS)
+libfftw3@PREC_SUFFIX@_threads_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+if !COMBINED_THREADS
+libfftw3@PREC_SUFFIX@_threads_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la
+endif
+
+libfftw3@PREC_SUFFIX@_omp_la_SOURCES = api.c conf.c openmp.c	\
+threads.h dft-vrank-geq1.c ct.c rdft-vrank-geq1.c hc2hc.c	\
+vrank-geq1-rdft2.c f77api.c f77funcs.h
+libfftw3@PREC_SUFFIX@_omp_la_CFLAGS = $(AM_CFLAGS) $(OPENMP_CFLAGS)
+libfftw3@PREC_SUFFIX@_omp_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+if !COMBINED_THREADS
+libfftw3@PREC_SUFFIX@_omp_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la
+endif
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,808 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = threads
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(libdir)"
+LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
+@COMBINED_THREADS_FALSE@libfftw3@PREC_SUFFIX@_omp_la_DEPENDENCIES =  \
+@COMBINED_THREADS_FALSE@	../libfftw3@PREC_SUFFIX@.la
+am_libfftw3@PREC_SUFFIX@_omp_la_OBJECTS =  \
+	libfftw3@PREC_SUFFIX@_omp_la-api.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-conf.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-openmp.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-ct.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo \
+	libfftw3@PREC_SUFFIX@_omp_la-f77api.lo
+libfftw3@PREC_SUFFIX@_omp_la_OBJECTS =  \
+	$(am_libfftw3@PREC_SUFFIX@_omp_la_OBJECTS)
+libfftw3@PREC_SUFFIX@_omp_la_LINK = $(LIBTOOL) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) \
+	$(libfftw3@PREC_SUFFIX@_omp_la_LDFLAGS) $(LDFLAGS) -o $@
+@COMBINED_THREADS_FALSE@@OPENMP_TRUE@@THREADS_TRUE@am_libfftw3@PREC_SUFFIX@_omp_la_rpath =  \
+@COMBINED_THREADS_FALSE@@OPENMP_TRUE@@THREADS_TRUE@	-rpath \
+@COMBINED_THREADS_FALSE@@OPENMP_TRUE@@THREADS_TRUE@	$(libdir)
+@OPENMP_TRUE@@THREADS_FALSE@am_libfftw3@PREC_SUFFIX@_omp_la_rpath =  \
+@OPENMP_TRUE@@THREADS_FALSE@	-rpath $(libdir)
+@COMBINED_THREADS_FALSE@libfftw3@PREC_SUFFIX@_threads_la_DEPENDENCIES =  \
+@COMBINED_THREADS_FALSE@	../libfftw3@PREC_SUFFIX@.la
+am_libfftw3@PREC_SUFFIX@_threads_la_OBJECTS =  \
+	libfftw3@PREC_SUFFIX@_threads_la-api.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-conf.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-threads.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-ct.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo \
+	libfftw3@PREC_SUFFIX@_threads_la-f77api.lo
+libfftw3@PREC_SUFFIX@_threads_la_OBJECTS =  \
+	$(am_libfftw3@PREC_SUFFIX@_threads_la_OBJECTS)
+libfftw3@PREC_SUFFIX@_threads_la_LINK = $(LIBTOOL) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) \
+	$(libfftw3@PREC_SUFFIX@_threads_la_LDFLAGS) $(LDFLAGS) -o $@
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@am_libfftw3@PREC_SUFFIX@_threads_la_rpath =  \
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@	-rpath $(libdir)
+@COMBINED_THREADS_TRUE@@THREADS_TRUE@am_libfftw3@PREC_SUFFIX@_threads_la_rpath =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(libfftw3@PREC_SUFFIX@_omp_la_SOURCES) \
+	$(libfftw3@PREC_SUFFIX@_threads_la_SOURCES)
+DIST_SOURCES = $(libfftw3@PREC_SUFFIX@_omp_la_SOURCES) \
+	$(libfftw3@PREC_SUFFIX@_threads_la_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/kernel -I$(top_srcdir)/dft	\
+-I$(top_srcdir)/rdft -I$(top_srcdir)/api
+
+AM_CFLAGS = $(STACK_ALIGN_CFLAGS)
+@OPENMP_FALSE@FFTWOMPLIB = 
+@OPENMP_TRUE@FFTWOMPLIB = libfftw3@PREC_SUFFIX@_omp.la
+@COMBINED_THREADS_TRUE@@THREADS_TRUE@noinst_LTLIBRARIES = libfftw3@PREC_SUFFIX@_threads.la
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_threads.la $(FFTWOMPLIB)
+@THREADS_FALSE@lib_LTLIBRARIES = $(FFTWOMPLIB)
+
+# pkgincludedir = $(includedir)/fftw3@PREC_SUFFIX@
+# pkginclude_HEADERS = threads.h
+libfftw3@PREC_SUFFIX@_threads_la_SOURCES = api.c conf.c threads.c	\
+threads.h dft-vrank-geq1.c ct.c rdft-vrank-geq1.c hc2hc.c		\
+vrank-geq1-rdft2.c f77api.c f77funcs.h
+
+libfftw3@PREC_SUFFIX@_threads_la_CFLAGS = $(AM_CFLAGS) $(PTHREAD_CFLAGS)
+libfftw3@PREC_SUFFIX@_threads_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+@COMBINED_THREADS_FALSE@libfftw3@PREC_SUFFIX@_threads_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la
+libfftw3@PREC_SUFFIX@_omp_la_SOURCES = api.c conf.c openmp.c	\
+threads.h dft-vrank-geq1.c ct.c rdft-vrank-geq1.c hc2hc.c	\
+vrank-geq1-rdft2.c f77api.c f77funcs.h
+
+libfftw3@PREC_SUFFIX@_omp_la_CFLAGS = $(AM_CFLAGS) $(OPENMP_CFLAGS)
+libfftw3@PREC_SUFFIX@_omp_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+@COMBINED_THREADS_FALSE@libfftw3@PREC_SUFFIX@_omp_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu threads/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu threads/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	list2=; for p in $$list; do \
+	  if test -f $$p; then \
+	    list2="$$list2 $$p"; \
+	  else :; fi; \
+	done; \
+	test -z "$$list2" || { \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+	}
+
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+	for p in $$list; do \
+	  $(am__strip_dir) \
+	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+	done
+
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; \
+	  rm -f "$${dir}/so_locations"; \
+	done
+libfftw3@PREC_SUFFIX@_omp.la: $(libfftw3@PREC_SUFFIX@_omp_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_omp_la_DEPENDENCIES) $(EXTRA_libfftw3@PREC_SUFFIX@_omp_la_DEPENDENCIES) 
+	$(libfftw3@PREC_SUFFIX@_omp_la_LINK) $(am_libfftw3@PREC_SUFFIX@_omp_la_rpath) $(libfftw3@PREC_SUFFIX@_omp_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_omp_la_LIBADD) $(LIBS)
+libfftw3@PREC_SUFFIX@_threads.la: $(libfftw3@PREC_SUFFIX@_threads_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_threads_la_DEPENDENCIES) $(EXTRA_libfftw3@PREC_SUFFIX@_threads_la_DEPENDENCIES) 
+	$(libfftw3@PREC_SUFFIX@_threads_la_LINK) $(am_libfftw3@PREC_SUFFIX@_threads_la_rpath) $(libfftw3@PREC_SUFFIX@_threads_la_OBJECTS) $(libfftw3@PREC_SUFFIX@_threads_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-ct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-f77api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-hc2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-openmp.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-conf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-ct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-f77api.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-hc2hc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-threads.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+libfftw3@PREC_SUFFIX@_omp_la-api.lo: api.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-api.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-api.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-api.lo `test -f 'api.c' || echo '$(srcdir)/'`api.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-api.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='api.c' object='libfftw3@PREC_SUFFIX@_omp_la-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-api.lo `test -f 'api.c' || echo '$(srcdir)/'`api.c
+
+libfftw3@PREC_SUFFIX@_omp_la-conf.lo: conf.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-conf.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-conf.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-conf.lo `test -f 'conf.c' || echo '$(srcdir)/'`conf.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-conf.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-conf.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='conf.c' object='libfftw3@PREC_SUFFIX@_omp_la-conf.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-conf.lo `test -f 'conf.c' || echo '$(srcdir)/'`conf.c
+
+libfftw3@PREC_SUFFIX@_omp_la-openmp.lo: openmp.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-openmp.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-openmp.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-openmp.lo `test -f 'openmp.c' || echo '$(srcdir)/'`openmp.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-openmp.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-openmp.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='openmp.c' object='libfftw3@PREC_SUFFIX@_omp_la-openmp.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-openmp.lo `test -f 'openmp.c' || echo '$(srcdir)/'`openmp.c
+
+libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo: dft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo `test -f 'dft-vrank-geq1.c' || echo '$(srcdir)/'`dft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='dft-vrank-geq1.c' object='libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-dft-vrank-geq1.lo `test -f 'dft-vrank-geq1.c' || echo '$(srcdir)/'`dft-vrank-geq1.c
+
+libfftw3@PREC_SUFFIX@_omp_la-ct.lo: ct.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-ct.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-ct.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-ct.lo `test -f 'ct.c' || echo '$(srcdir)/'`ct.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-ct.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-ct.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='ct.c' object='libfftw3@PREC_SUFFIX@_omp_la-ct.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-ct.lo `test -f 'ct.c' || echo '$(srcdir)/'`ct.c
+
+libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo: rdft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo `test -f 'rdft-vrank-geq1.c' || echo '$(srcdir)/'`rdft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='rdft-vrank-geq1.c' object='libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-rdft-vrank-geq1.lo `test -f 'rdft-vrank-geq1.c' || echo '$(srcdir)/'`rdft-vrank-geq1.c
+
+libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo: hc2hc.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-hc2hc.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo `test -f 'hc2hc.c' || echo '$(srcdir)/'`hc2hc.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-hc2hc.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-hc2hc.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='hc2hc.c' object='libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-hc2hc.lo `test -f 'hc2hc.c' || echo '$(srcdir)/'`hc2hc.c
+
+libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo: vrank-geq1-rdft2.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo `test -f 'vrank-geq1-rdft2.c' || echo '$(srcdir)/'`vrank-geq1-rdft2.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='vrank-geq1-rdft2.c' object='libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-vrank-geq1-rdft2.lo `test -f 'vrank-geq1-rdft2.c' || echo '$(srcdir)/'`vrank-geq1-rdft2.c
+
+libfftw3@PREC_SUFFIX@_omp_la-f77api.lo: f77api.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_omp_la-f77api.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-f77api.Tpo -c -o libfftw3@PREC_SUFFIX@_omp_la-f77api.lo `test -f 'f77api.c' || echo '$(srcdir)/'`f77api.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-f77api.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_omp_la-f77api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='f77api.c' object='libfftw3@PREC_SUFFIX@_omp_la-f77api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_omp_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_omp_la-f77api.lo `test -f 'f77api.c' || echo '$(srcdir)/'`f77api.c
+
+libfftw3@PREC_SUFFIX@_threads_la-api.lo: api.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-api.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-api.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-api.lo `test -f 'api.c' || echo '$(srcdir)/'`api.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-api.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='api.c' object='libfftw3@PREC_SUFFIX@_threads_la-api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-api.lo `test -f 'api.c' || echo '$(srcdir)/'`api.c
+
+libfftw3@PREC_SUFFIX@_threads_la-conf.lo: conf.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-conf.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-conf.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-conf.lo `test -f 'conf.c' || echo '$(srcdir)/'`conf.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-conf.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-conf.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='conf.c' object='libfftw3@PREC_SUFFIX@_threads_la-conf.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-conf.lo `test -f 'conf.c' || echo '$(srcdir)/'`conf.c
+
+libfftw3@PREC_SUFFIX@_threads_la-threads.lo: threads.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-threads.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-threads.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-threads.lo `test -f 'threads.c' || echo '$(srcdir)/'`threads.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-threads.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-threads.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='threads.c' object='libfftw3@PREC_SUFFIX@_threads_la-threads.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-threads.lo `test -f 'threads.c' || echo '$(srcdir)/'`threads.c
+
+libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo: dft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo `test -f 'dft-vrank-geq1.c' || echo '$(srcdir)/'`dft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='dft-vrank-geq1.c' object='libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-dft-vrank-geq1.lo `test -f 'dft-vrank-geq1.c' || echo '$(srcdir)/'`dft-vrank-geq1.c
+
+libfftw3@PREC_SUFFIX@_threads_la-ct.lo: ct.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-ct.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-ct.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-ct.lo `test -f 'ct.c' || echo '$(srcdir)/'`ct.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-ct.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-ct.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='ct.c' object='libfftw3@PREC_SUFFIX@_threads_la-ct.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-ct.lo `test -f 'ct.c' || echo '$(srcdir)/'`ct.c
+
+libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo: rdft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo `test -f 'rdft-vrank-geq1.c' || echo '$(srcdir)/'`rdft-vrank-geq1.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='rdft-vrank-geq1.c' object='libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-rdft-vrank-geq1.lo `test -f 'rdft-vrank-geq1.c' || echo '$(srcdir)/'`rdft-vrank-geq1.c
+
+libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo: hc2hc.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-hc2hc.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo `test -f 'hc2hc.c' || echo '$(srcdir)/'`hc2hc.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-hc2hc.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-hc2hc.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='hc2hc.c' object='libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-hc2hc.lo `test -f 'hc2hc.c' || echo '$(srcdir)/'`hc2hc.c
+
+libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo: vrank-geq1-rdft2.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo `test -f 'vrank-geq1-rdft2.c' || echo '$(srcdir)/'`vrank-geq1-rdft2.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='vrank-geq1-rdft2.c' object='libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-vrank-geq1-rdft2.lo `test -f 'vrank-geq1-rdft2.c' || echo '$(srcdir)/'`vrank-geq1-rdft2.c
+
+libfftw3@PREC_SUFFIX@_threads_la-f77api.lo: f77api.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -MT libfftw3@PREC_SUFFIX@_threads_la-f77api.lo -MD -MP -MF $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-f77api.Tpo -c -o libfftw3@PREC_SUFFIX@_threads_la-f77api.lo `test -f 'f77api.c' || echo '$(srcdir)/'`f77api.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-f77api.Tpo $(DEPDIR)/libfftw3@PREC_SUFFIX@_threads_la-f77api.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='f77api.c' object='libfftw3@PREC_SUFFIX@_threads_la-f77api.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libfftw3@PREC_SUFFIX@_threads_la_CFLAGS) $(CFLAGS) -c -o libfftw3@PREC_SUFFIX@_threads_la-f77api.lo `test -f 'f77api.c' || echo '$(srcdir)/'`f77api.c
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+	for dir in "$(DESTDIR)$(libdir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	clean-noinstLTLIBRARIES mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-libLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-libLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+	clean-libLTLIBRARIES clean-libtool clean-noinstLTLIBRARIES \
+	ctags distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-libLTLIBRARIES install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am uninstall-libLTLIBRARIES
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/api.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/api.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+#include "threads.h"
+
+static int threads_inited = 0;
+
+static void threads_register_hooks(void)
+{
+     X(mksolver_ct_hook) = X(mksolver_ct_threads);
+     X(mksolver_hc2hc_hook) = X(mksolver_hc2hc_threads);
+}
+
+static void threads_unregister_hooks(void)
+{
+     X(mksolver_ct_hook) = 0;
+     X(mksolver_hc2hc_hook) = 0;
+}
+
+/* should be called before all other FFTW functions! */
+int X(init_threads)(void)
+{
+     if (!threads_inited) {
+	  planner *plnr;
+
+          if (X(ithreads_init)())
+               return 0;
+
+	  threads_register_hooks();
+
+	  /* this should be the first time the_planner is called,
+	     and hence the time it is configured */
+	  plnr = X(the_planner)();
+	  X(threads_conf_standard)(plnr);
+	       
+          threads_inited = 1;
+     }
+     return 1;
+}
+
+
+void X(cleanup_threads)(void)
+{
+     X(cleanup)();
+     if (threads_inited) {
+	  X(threads_cleanup)();
+	  threads_unregister_hooks();
+	  threads_inited = 0;
+     }
+}
+
+void X(plan_with_nthreads)(int nthreads)
+{
+     planner *plnr;
+
+     if (!threads_inited) {
+	  X(cleanup)();
+	  X(init_threads)();
+     }
+     A(threads_inited);
+     plnr = X(the_planner)();
+     plnr->nthr = X(imax)(1, nthreads);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/conf.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/conf.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "threads.h"
+
+static const solvtab s =
+{
+     SOLVTAB(X(dft_thr_vrank_geq1_register)),
+     SOLVTAB(X(rdft_thr_vrank_geq1_register)),
+     SOLVTAB(X(rdft2_thr_vrank_geq1_register)),
+
+     SOLVTAB_END
+};
+
+void X(threads_conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/ct.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/ct.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "threads.h"
+
+typedef struct {
+     plan_dft super;
+     plan *cld;
+     plan **cldws;
+     int nthr;
+     INT r;
+} P;
+
+typedef struct {
+     plan **cldws;
+     R *r, *i;
+} PD;
+
+static void *spawn_apply(spawn_data *d)
+{
+     PD *ego = (PD *) d->data;
+     INT thr_num = d->thr_num;
+
+     plan_dftw *cldw = (plan_dftw *) (ego->cldws[thr_num]);
+     cldw->apply((plan *) cldw, ego->r, ego->i);
+     return 0;
+}
+
+static void apply_dit(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, ri, ii, ro, io);
+
+     {
+	  PD d;
+
+	  d.r = ro; d.i = io;
+	  d.cldws = ego->cldws;
+
+	  X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*)&d);
+     }
+}
+
+static void apply_dif(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+
+     {
+	  PD d;
+
+	  d.r = ri; d.i = ii;
+	  d.cldws = ego->cldws;
+
+	  X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*)&d);
+     }
+
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, ri, ii, ro, io);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     int i;
+     X(plan_awake)(ego->cld, wakefulness);
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_awake)(ego->cldws[i], wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     int i;
+     X(plan_destroy_internal)(ego->cld);
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_destroy_internal)(ego->cldws[i]);
+     X(ifree)(ego->cldws);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     int i;
+     p->print(p, "(dft-thr-ct-%s-x%d/%D",
+	      ego->super.apply == apply_dit ? "dit" : "dif",
+	      ego->nthr, ego->r);
+     for (i = 0; i < ego->nthr; ++i)
+          if (i == 0 || (ego->cldws[i] != ego->cldws[i-1] &&
+                         (i <= 1 || ego->cldws[i] != ego->cldws[i-2])))
+               p->print(p, "%(%p%)", ego->cldws[i]);
+     p->print(p, "%(%p%))", ego->cld);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const ct_solver *ego = (const ct_solver *) ego_;
+     const problem_dft *p;
+     P *pln = 0;
+     plan *cld = 0, **cldws = 0;
+     INT n, r, m, v, ivs, ovs;
+     INT block_size;
+     int i, nthr, plnr_nthr_save;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (plnr->nthr <= 1 || !X(ct_applicable)(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_dft *) p_;
+     d = p->sz->dims;
+     n = d[0].n;
+     r = X(choose_radix)(ego->r, n);
+     m = n / r;
+
+     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
+
+     block_size = (m + plnr->nthr - 1) / plnr->nthr;
+     nthr = (int)((m + block_size - 1) / block_size);
+     plnr_nthr_save = plnr->nthr;
+     plnr->nthr = (plnr->nthr + nthr - 1) / nthr;
+
+     cldws = (plan **) MALLOC(sizeof(plan *) * nthr, PLANS);
+     for (i = 0; i < nthr; ++i) cldws[i] = (plan *) 0;
+
+     switch (ego->dec) {
+	 case DECDIT:
+	 {
+	      for (i = 0; i < nthr; ++i) {
+		   cldws[i] = ego->mkcldw(ego,
+					  r, m * d[0].os, m * d[0].os,
+					  m, d[0].os,
+					  v, ovs, ovs,
+					  i*block_size,
+					  (i == nthr - 1) ?
+					  (m - i*block_size) : block_size,
+					  p->ro, p->io, plnr);
+		   if (!cldws[i]) goto nada;
+	      }
+
+	      plnr->nthr = plnr_nthr_save;
+
+	      cld = X(mkplan_d)(plnr,
+				X(mkproblem_dft_d)(
+				     X(mktensor_1d)(m, r * d[0].is, d[0].os),
+				     X(mktensor_2d)(r, d[0].is, m * d[0].os,
+						    v, ivs, ovs),
+				     p->ri, p->ii, p->ro, p->io)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_DFT(P, &padt, apply_dit);
+	      break;
+	 }
+	 case DECDIF:
+	 case DECDIF+TRANSPOSE:
+	 {
+	      INT cors, covs; /* cldw ors, ovs */
+	      if (ego->dec == DECDIF+TRANSPOSE) {
+		   cors = ivs;
+		   covs = m * d[0].is;
+		   /* ensure that we generate well-formed dftw subproblems */
+		   /* FIXME: too conservative */
+		   if (!(1
+			 && r == v
+			 && d[0].is == r * cors))
+			goto nada;
+
+		   /* FIXME: allow in-place only for now, like in
+		      fftw-3.[01] */
+		   if (!(1
+			 && p->ri == p->ro
+			 && d[0].is == r * d[0].os
+			 && cors == d[0].os
+			 && covs == ovs
+			    ))
+			goto nada;
+	      } else {
+		   cors = m * d[0].is;
+		   covs = ivs;
+	      }
+
+	      for (i = 0; i < nthr; ++i) {
+		   cldws[i] = ego->mkcldw(ego,
+					  r, m * d[0].is, cors,
+					  m, d[0].is,
+					  v, ivs, covs,
+					  i*block_size,
+					  (i == nthr - 1) ?
+					  (m - i*block_size) : block_size,
+					  p->ri, p->ii, plnr);
+		   if (!cldws[i]) goto nada;
+	      }
+
+	      plnr->nthr = plnr_nthr_save;
+
+	      cld = X(mkplan_d)(plnr,
+				X(mkproblem_dft_d)(
+				     X(mktensor_1d)(m, d[0].is, r * d[0].os),
+				     X(mktensor_2d)(r, cors, d[0].os,
+						    v, covs, ovs),
+				     p->ri, p->ii, p->ro, p->io)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_DFT(P, &padt, apply_dif);
+	      break;
+	 }
+
+	 default: A(0);
+
+     }
+
+     pln->cld = cld;
+     pln->cldws = cldws;
+     pln->nthr = nthr;
+     pln->r = r;
+     X(ops_zero)(&pln->super.super.ops);
+     for (i = 0; i < nthr; ++i) {
+          X(ops_add2)(&cldws[i]->ops, &pln->super.super.ops);
+	  pln->super.super.could_prune_now_p |= cldws[i]->could_prune_now_p;
+     }
+     X(ops_add2)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+
+ nada:
+     if (cldws) {
+	  for (i = 0; i < nthr; ++i)
+	       X(plan_destroy_internal)(cldws[i]);
+	  X(ifree)(cldws);
+     }
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+ct_solver *X(mksolver_ct_threads)(size_t size, INT r, int dec,
+				  ct_mkinferior mkcldw,
+				  ct_force_vrecursion force_vrecursionp)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     ct_solver *slv = (ct_solver *) X(mksolver)(size, &sadt);
+     slv->r = r;
+     slv->dec = dec;
+     slv->mkcldw = mkcldw;
+     slv->force_vrecursionp = force_vrecursionp;
+     return slv;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/dft-vrank-geq1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/dft-vrank-geq1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "threads.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_dft super;
+     plan **cldrn;
+     INT its, ots;
+     int nthr;
+     const S *solver;
+} P;
+
+typedef struct {
+     INT its, ots;
+     R *ri, *ii, *ro, *io;
+     plan **cldrn;
+} PD;
+
+static void *spawn_apply(spawn_data *d)
+{
+     PD *ego = (PD *) d->data;
+     INT its = ego->its;
+     INT ots = ego->ots;
+     int thr_num = d->thr_num;
+     plan_dft *cld = (plan_dft *) ego->cldrn[thr_num];
+
+     cld->apply((plan *) cld,
+		ego->ri + thr_num * its, ego->ii + thr_num * its,
+		ego->ro + thr_num * ots, ego->io + thr_num * ots);
+     return 0;
+}
+
+static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
+{
+     const P *ego = (const P *) ego_;
+     PD d;
+
+     d.its = ego->its;
+     d.ots = ego->ots;
+     d.cldrn = ego->cldrn;
+     d.ri = ri; d.ii = ii; d.ro = ro; d.io = io;
+
+     X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*) &d);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_awake)(ego->cldrn[i], wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_destroy_internal)(ego->cldrn[i]);
+     X(ifree)(ego->cldrn);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     int i;
+     p->print(p, "(dft-thr-vrank>=1-x%d/%d", ego->nthr, s->vecloop_dim);
+     for (i = 0; i < ego->nthr; ++i)
+	  if (i == 0 || (ego->cldrn[i] != ego->cldrn[i-1] &&
+			 (i <= 1 || ego->cldrn[i] != ego->cldrn[i-2])))
+	       p->print(p, "%(%p%)", ego->cldrn[i]);
+     p->putchr(p, ')');
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+                       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p = (const problem_dft *) p_;
+
+     return (1
+	     && plnr->nthr > 1
+	     && FINITE_RNK(p->vecsz->rnk)
+	     && p->vecsz->rnk > 0
+	     && pickdim(ego, p->vecsz, p->ri != p->ro, dp)
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+
+     if (!applicable0(ego_, p_, plnr, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_dft *p;
+     P *pln;
+     problem *cldp;
+     int vdim;
+     iodim *d;
+     plan **cldrn = (plan **) 0;
+     int i, nthr;
+     INT its, ots, block_size;
+     tensor *vecsz = 0;
+
+     static const plan_adt padt = {
+	  X(dft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_dft *) p_;
+     d = p->vecsz->dims + vdim;
+
+     block_size = (d->n + plnr->nthr - 1) / plnr->nthr;
+     nthr = (int)((d->n + block_size - 1) / block_size);
+     plnr->nthr = (plnr->nthr + nthr - 1) / nthr;
+     its = d->is * block_size;
+     ots = d->os * block_size;
+
+     cldrn = (plan **)MALLOC(sizeof(plan *) * nthr, PLANS);
+     for (i = 0; i < nthr; ++i) cldrn[i] = (plan *) 0;
+     
+     vecsz = X(tensor_copy)(p->vecsz);
+     for (i = 0; i < nthr; ++i) {
+	  vecsz->dims[vdim].n =
+	       (i == nthr - 1) ? (d->n - i*block_size) : block_size;
+	  cldp = X(mkproblem_dft)(p->sz, vecsz,
+				  p->ri + i*its, p->ii + i*its, 
+				  p->ro + i*ots, p->io + i*ots);
+	  cldrn[i] = X(mkplan_d)(plnr, cldp);
+	  if (!cldrn[i]) goto nada;
+     }
+     X(tensor_destroy)(vecsz);
+
+     pln = MKPLAN_DFT(P, &padt, apply);
+
+     pln->cldrn = cldrn;
+     pln->its = its;
+     pln->ots = ots;
+     pln->nthr = nthr;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.pcost = 0;
+     for (i = 0; i < nthr; ++i) {
+	  X(ops_add2)(&cldrn[i]->ops, &pln->super.super.ops);
+	  pln->super.super.pcost += cldrn[i]->pcost;
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     if (cldrn) {
+	  for (i = 0; i < nthr; ++i)
+	       X(plan_destroy_internal)(cldrn[i]);
+	  X(ifree)(cldrn);
+     }
+     X(tensor_destroy)(vecsz);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(dft_thr_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/f77api.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/f77api.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api.h"
+
+/* if F77_FUNC is not defined, then we don't know how to mangle identifiers
+   for the Fortran linker, and we must omit the f77 API. */
+#if defined(F77_FUNC) || defined(WINDOWS_F77_MANGLING)
+
+#include "x77.h"
+
+#define F77(a, A) F77x(x77(a), X77(A))
+
+#ifndef WINDOWS_F77_MANGLING
+
+#if defined(F77_FUNC)
+#  define F77x(a, A) F77_FUNC(a, A)
+#  include "f77funcs.h"
+#endif
+
+#if defined(F77_FUNC_) && !defined(F77_FUNC_EQUIV)
+#  undef F77x
+#  define F77x(a, A) F77_FUNC_(a, A)
+#  include "f77funcs.h"
+#endif
+
+#else /* WINDOWS_F77_MANGLING */
+
+/* Various mangling conventions common (?) under Windows. */
+
+/* g77 */
+#  define WINDOWS_F77_FUNC(a, A) a ## __
+#  define F77x(a, A) WINDOWS_F77_FUNC(a, A)
+#  include "f77funcs.h"
+
+/* Intel, etc. */
+#  undef WINDOWS_F77_FUNC
+#  define WINDOWS_F77_FUNC(a, A) a ## _
+#  include "f77funcs.h"
+
+/* Digital/Compaq/HP Visual Fortran, Intel Fortran.  stdcall attribute
+   is apparently required to adjust for calling conventions (callee
+   pops stack in stdcall).  See also:
+       http://msdn.microsoft.com/library/en-us/vccore98/html/_core_mixed.2d.language_programming.3a_.overview.asp
+*/
+#  undef WINDOWS_F77_FUNC
+#  if defined(__GNUC__)
+#    define WINDOWS_F77_FUNC(a, A) __attribute__((stdcall)) A
+#  elif defined(_MSC_VER) || defined(_ICC) || defined(_STDCALL_SUPPORTED)
+#    define WINDOWS_F77_FUNC(a, A) __stdcall A
+#  else
+#    define WINDOWS_F77_FUNC(a, A) A /* oh well */
+#  endif
+#  include "f77funcs.h"
+
+#endif /* WINDOWS_F77_MANGLING */
+
+#endif				/* F77_FUNC */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/f77funcs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/f77funcs.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Functions in the FFTW Fortran API, mangled according to the
+   F77(...) macro.  This file is designed to be #included by
+   f77api.c, possibly multiple times in order to support multiple
+   compiler manglings (via redefinition of F77). */
+
+FFTW_VOIDFUNC F77(plan_with_nthreads, PLAN_WITH_NTHREADS)(int *nthreads)
+{
+     X(plan_with_nthreads)(*nthreads);
+}
+
+FFTW_VOIDFUNC F77(init_threads, INIT_THREADS)(int *okay)
+{
+     *okay = X(init_threads)();
+}
+
+FFTW_VOIDFUNC F77(cleanup_threads, CLEANUP_THREADS)(void)
+{
+     X(cleanup_threads)();
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/hc2hc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/hc2hc.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "threads.h"
+
+typedef struct {
+     plan_rdft super;
+     plan *cld;
+     plan **cldws;
+     int nthr;
+     INT r;
+} P;
+
+typedef struct {
+     plan **cldws;
+     R *IO;
+} PD;
+
+static void *spawn_apply(spawn_data *d)
+{
+     PD *ego = (PD *) d->data;
+     
+     plan_hc2hc *cldw = (plan_hc2hc *) (ego->cldws[d->thr_num]);
+     cldw->apply((plan *) cldw, ego->IO);
+     return 0;
+}
+
+static void apply_dit(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply((plan *) cld, I, O);
+
+     {
+	  PD d;
+	  
+	  d.IO = O;
+	  d.cldws = ego->cldws;
+
+	  X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*)&d);
+     }
+}
+
+static void apply_dif(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld;
+
+     {
+	  PD d;
+	  
+	  d.IO = I;
+	  d.cldws = ego->cldws;
+
+	  X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*)&d);
+     }
+
+     cld = (plan_rdft *) ego->cld;
+     cld->apply((plan *) cld, I, O);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     int i;
+     X(plan_awake)(ego->cld, wakefulness);
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_awake)(ego->cldws[i], wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     int i;
+     X(plan_destroy_internal)(ego->cld);
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_destroy_internal)(ego->cldws[i]);
+     X(ifree)(ego->cldws);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     int i;
+     p->print(p, "(rdft-thr-ct-%s-x%d/%D",
+	      ego->super.apply == apply_dit ? "dit" : "dif",
+	      ego->nthr, ego->r);
+     for (i = 0; i < ego->nthr; ++i)
+          if (i == 0 || (ego->cldws[i] != ego->cldws[i-1] &&
+                         (i <= 1 || ego->cldws[i] != ego->cldws[i-2])))
+               p->print(p, "%(%p%)", ego->cldws[i]);
+     p->print(p, "%(%p%))", ego->cld);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const hc2hc_solver *ego = (const hc2hc_solver *) ego_;
+     const problem_rdft *p;
+     P *pln = 0;
+     plan *cld = 0, **cldws = 0;
+     INT n, r, m, v, ivs, ovs, mcount;
+     int i, nthr, plnr_nthr_save;
+     INT block_size;
+     iodim *d;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (plnr->nthr <= 1 || !X(hc2hc_applicable)(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_rdft *) p_;
+     d = p->sz->dims;
+     n = d[0].n;
+     r = X(choose_radix)(ego->r, n);
+     m = n / r;
+     mcount = (m + 2) / 2;
+
+     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
+
+     block_size = (mcount + plnr->nthr - 1) / plnr->nthr;
+     nthr = (int)((mcount + block_size - 1) / block_size);
+     plnr_nthr_save = plnr->nthr;
+     plnr->nthr = (plnr->nthr + nthr - 1) / nthr;
+
+     cldws = (plan **) MALLOC(sizeof(plan *) * nthr, PLANS);
+     for (i = 0; i < nthr; ++i) cldws[i] = (plan *) 0;
+
+     switch (p->kind[0]) {
+	 case R2HC:
+	      for (i = 0; i < nthr; ++i) {
+		   cldws[i] = ego->mkcldw(ego, 
+					  R2HC, r, m, d[0].os, v, ovs, 
+					  i*block_size, 
+					  (i == nthr - 1) ? 
+					  (mcount - i*block_size) : block_size,
+					  p->O, plnr);
+		   if (!cldws[i]) goto nada;
+	      }
+
+	      plnr->nthr = plnr_nthr_save;
+
+	      cld = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft_d)(
+				     X(mktensor_1d)(m, r * d[0].is, d[0].os),
+				     X(mktensor_2d)(r, d[0].is, m * d[0].os,
+						    v, ivs, ovs),
+				     p->I, p->O, p->kind)
+		   );
+	      if (!cld) goto nada;
+
+	      pln = MKPLAN_RDFT(P, &padt, apply_dit);
+	      break;
+
+	 case HC2R:
+	      for (i = 0; i < nthr; ++i) {
+		   cldws[i] = ego->mkcldw(ego, 
+					  HC2R, r, m, d[0].is, v, ivs, 
+					  i*block_size, 
+					  (i == nthr - 1) ? 
+					  (mcount - i*block_size) : block_size,
+					  p->I, plnr);
+		   if (!cldws[i]) goto nada;
+	      }
+
+	      plnr->nthr = plnr_nthr_save;
+
+	      cld = X(mkplan_d)(plnr, 
+				X(mkproblem_rdft_d)(
+				     X(mktensor_1d)(m, d[0].is, r * d[0].os),
+				     X(mktensor_2d)(r, m * d[0].is, d[0].os,
+						    v, ivs, ovs),
+				     p->I, p->O, p->kind)
+		   );
+	      if (!cld) goto nada;
+	      
+	      pln = MKPLAN_RDFT(P, &padt, apply_dif);
+	      break;
+
+	 default: 
+	      A(0);
+     }
+
+     pln->cld = cld;
+     pln->cldws = cldws;
+     pln->nthr = nthr;
+     pln->r = r;
+     X(ops_zero)(&pln->super.super.ops);
+     for (i = 0; i < nthr; ++i) {
+          X(ops_add2)(&cldws[i]->ops, &pln->super.super.ops);
+	  pln->super.super.could_prune_now_p |= cldws[i]->could_prune_now_p;
+     }
+     X(ops_add2)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+
+ nada:
+     if (cldws) {
+	  for (i = 0; i < nthr; ++i)
+	       X(plan_destroy_internal)(cldws[i]);
+	  X(ifree)(cldws);
+     }
+     X(plan_destroy_internal)(cld);
+     return (plan *) 0;
+}
+
+hc2hc_solver *X(mksolver_hc2hc_threads)(size_t size, INT r, 
+					hc2hc_mkinferior mkcldw)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     hc2hc_solver *slv = (hc2hc_solver *)X(mksolver)(size, &sadt);
+     slv->r = r;
+     slv->mkcldw = mkcldw;
+     return slv;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/openmp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/openmp.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* openmp.c: thread spawning via OpenMP  */
+
+#include "threads.h"
+
+#if !defined(_OPENMP)
+#error OpenMP enabled but not using an OpenMP compiler
+#endif
+
+int X(ithreads_init)(void)
+{
+     return 0; /* no error */
+}
+
+/* Distribute a loop from 0 to loopmax-1 over nthreads threads.
+   proc(d) is called to execute a block of iterations from d->min
+   to d->max-1.  d->thr_num indicate the number of the thread
+   that is executing proc (from 0 to nthreads-1), and d->data is
+   the same as the data parameter passed to X(spawn_loop).
+
+   This function returns only after all the threads have completed. */
+void X(spawn_loop)(int loopmax, int nthr, spawn_function proc, void *data)
+{
+     int block_size;
+     spawn_data d;
+     int i;
+
+     A(loopmax >= 0);
+     A(nthr > 0);
+     A(proc);
+
+     if (!loopmax) return;
+
+     /* Choose the block size and number of threads in order to (1)
+        minimize the critical path and (2) use the fewest threads that
+        achieve the same critical path (to minimize overhead).
+        e.g. if loopmax is 5 and nthr is 4, we should use only 3
+        threads with block sizes of 2, 2, and 1. */
+     block_size = (loopmax + nthr - 1) / nthr;
+     nthr = (loopmax + block_size - 1) / block_size;
+
+     THREAD_ON; /* prevent debugging mode from failing under threads */
+#pragma omp parallel for private(d)
+     for (i = 0; i < nthr; ++i) {
+	  d.max = (d.min = i * block_size) + block_size;
+	  if (d.max > loopmax)
+	       d.max = loopmax;
+	  d.thr_num = i;
+	  d.data = data;
+	  proc(&d);
+     }
+     THREAD_OFF; /* prevent debugging mode from failing under threads */
+}
+
+void X(threads_cleanup)(void)
+{
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/rdft-vrank-geq1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/rdft-vrank-geq1.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "threads.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_rdft super;
+     plan **cldrn;
+     INT its, ots;
+     int nthr;
+     const S *solver;
+} P;
+
+typedef struct {
+     INT its, ots;
+     R *I, *O;
+     plan **cldrn;
+} PD;
+
+static void *spawn_apply(spawn_data *d)
+{
+     PD *ego = (PD *) d->data;
+     int thr_num = d->thr_num;
+     plan_rdft *cld = (plan_rdft *) ego->cldrn[d->thr_num];
+
+     cld->apply((plan *) cld,
+		ego->I + thr_num * ego->its, ego->O + thr_num * ego->ots);
+     return 0;
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     PD d;
+
+     d.its = ego->its;
+     d.ots = ego->ots;
+     d.cldrn = ego->cldrn;
+     d.I = I; d.O = O;
+
+     X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*) &d);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_awake)(ego->cldrn[i], wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_destroy_internal)(ego->cldrn[i]);
+     X(ifree)(ego->cldrn);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     int i;
+     p->print(p, "(rdft-thr-vrank>=1-x%d/%d", ego->nthr, s->vecloop_dim);
+     for (i = 0; i < ego->nthr; ++i)
+	  if (i == 0 || (ego->cldrn[i] != ego->cldrn[i-1] &&
+			 (i <= 1 || ego->cldrn[i] != ego->cldrn[i-2])))
+	       p->print(p, "%(%p%)", ego->cldrn[i]);
+     p->putchr(p, ')');
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+		       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p = (const problem_rdft *) p_;
+
+     return (1
+	     && plnr->nthr > 1
+	     && FINITE_RNK(p->vecsz->rnk)
+	     && p->vecsz->rnk > 0
+	     && pickdim(ego, p->vecsz, p->I != p->O, dp)
+	  );
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+
+     if (!applicable0(ego_, p_, plnr, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft *p;
+     P *pln;
+     problem *cldp;
+     int vdim;
+     iodim *d;
+     plan **cldrn = (plan **) 0;
+     int i, nthr;
+     INT its, ots, block_size;
+     tensor *vecsz;
+
+     static const plan_adt padt = {
+	  X(rdft_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_rdft *) p_;
+
+     d = p->vecsz->dims + vdim;
+
+     block_size = (d->n + plnr->nthr - 1) / plnr->nthr;
+     nthr = (int)((d->n + block_size - 1) / block_size);
+     plnr->nthr = (plnr->nthr + nthr - 1) / nthr;
+     its = d->is * block_size;
+     ots = d->os * block_size;
+
+     cldrn = (plan **)MALLOC(sizeof(plan *) * nthr, PLANS);
+     for (i = 0; i < nthr; ++i) cldrn[i] = (plan *) 0;
+     
+     vecsz = X(tensor_copy)(p->vecsz);
+     for (i = 0; i < nthr; ++i) {
+	  vecsz->dims[vdim].n =
+	       (i == nthr - 1) ? (d->n - i*block_size) : block_size;
+	  cldp = X(mkproblem_rdft)(p->sz, vecsz,
+				   p->I + i*its, p->O + i*ots, p->kind);
+	  cldrn[i] = X(mkplan_d)(plnr, cldp);
+	  if (!cldrn[i]) goto nada;
+     }
+     X(tensor_destroy)(vecsz);
+
+     pln = MKPLAN_RDFT(P, &padt, apply);
+
+     pln->cldrn = cldrn;
+     pln->its = its;
+     pln->ots = ots;
+     pln->nthr = nthr;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.pcost = 0;
+     for (i = 0; i < nthr; ++i) {
+	  X(ops_add2)(&cldrn[i]->ops, &pln->super.super.ops);
+	  pln->super.super.pcost += cldrn[i]->pcost;
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     if (cldrn) {
+	  for (i = 0; i < nthr; ++i)
+	       X(plan_destroy_internal)(cldrn[i]);
+	  X(ifree)(cldrn);
+     }
+     X(tensor_destroy)(vecsz);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft_thr_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/threads.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/threads.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* threads.c: Portable thread spawning for loops, via the X(spawn_loop)
+   function.  The first portion of this file is a set of macros to
+   spawn and join threads on various systems. */
+
+#include "threads.h"
+
+#if defined(USING_POSIX_THREADS)
+
+#include <pthread.h>
+
+#ifdef HAVE_UNISTD_H
+#  include <unistd.h>
+#endif
+
+/* imlementation of semaphores and mutexes: */
+#if (defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES >= 200112L))
+
+   /* If optional POSIX semaphores are supported, use them to
+      implement both semaphores and mutexes. */
+#  include <semaphore.h>
+#  include <errno.h>
+
+   typedef sem_t os_sem_t;
+
+   static void os_sem_init(os_sem_t *s) { sem_init(s, 0, 0); }
+   static void os_sem_destroy(os_sem_t *s) { sem_destroy(s); }
+
+   static void os_sem_down(os_sem_t *s)
+   {
+	int err;
+	do {
+	     err = sem_wait(s);
+	} while (err == -1 && errno == EINTR);
+	CK(err == 0);
+   }
+
+   static void os_sem_up(os_sem_t *s) { sem_post(s); }
+
+   /*
+      The reason why we use sem_t to implement mutexes is that I have
+      seen mysterious hangs with glibc-2.7 and linux-2.6.22 when using
+      pthread_mutex_t, but no hangs with sem_t or with linux >=
+      2.6.24.  For lack of better information, sem_t looks like the
+      safest choice.
+   */
+   typedef sem_t os_mutex_t;
+   static void os_mutex_init(os_mutex_t *s) { sem_init(s, 0, 1); }
+   #define os_mutex_destroy os_sem_destroy
+   #define os_mutex_lock os_sem_down
+   #define os_mutex_unlock os_sem_up
+
+#else
+
+   /* If optional POSIX semaphores are not defined, use pthread
+      mutexes for mutexes, and simulate semaphores with condition
+      variables */
+   typedef pthread_mutex_t os_mutex_t;
+
+   static void os_mutex_init(os_mutex_t *s) 
+   { 
+	pthread_mutex_init(s, (pthread_mutexattr_t *)0);
+   }
+
+   static void os_mutex_destroy(os_mutex_t *s) { pthread_mutex_destroy(s); }
+   static void os_mutex_lock(os_mutex_t *s) { pthread_mutex_lock(s); }
+   static void os_mutex_unlock(os_mutex_t *s) { pthread_mutex_unlock(s); }
+
+   typedef struct {
+	pthread_mutex_t m;
+	pthread_cond_t c;
+	volatile int x;
+   } os_sem_t; 
+
+   static void os_sem_init(os_sem_t *s)
+   {
+	pthread_mutex_init(&s->m, (pthread_mutexattr_t *)0);
+	pthread_cond_init(&s->c, (pthread_condattr_t *)0);
+
+	/* wrap initialization in lock to exploit the release
+	   semantics of pthread_mutex_unlock() */
+	pthread_mutex_lock(&s->m);
+	s->x = 0;
+	pthread_mutex_unlock(&s->m);
+   }
+
+   static void os_sem_destroy(os_sem_t *s)
+   {
+	pthread_mutex_destroy(&s->m);
+	pthread_cond_destroy(&s->c);
+   }
+
+   static void os_sem_down(os_sem_t *s)
+   {
+	pthread_mutex_lock(&s->m);
+	while (s->x <= 0) 
+	     pthread_cond_wait(&s->c, &s->m);
+	--s->x;
+	pthread_mutex_unlock(&s->m);
+   }
+
+   static void os_sem_up(os_sem_t *s)
+   {
+	pthread_mutex_lock(&s->m);
+	++s->x;
+	pthread_cond_signal(&s->c);
+	pthread_mutex_unlock(&s->m);
+   }
+
+#endif
+
+#define FFTW_WORKER void *
+
+static void os_create_thread(FFTW_WORKER (*worker)(void *arg), 
+			     void *arg)
+{
+     pthread_attr_t attr;
+     pthread_t tid;
+
+     pthread_attr_init(&attr);
+     pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); 
+     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+     pthread_create(&tid, &attr, worker, (void *)arg);
+     pthread_attr_destroy(&attr);
+}
+
+static void os_destroy_thread(void)
+{
+     pthread_exit((void *)0);
+}
+
+#elif defined(__WIN32__) || defined(_WIN32) || defined(_WINDOWS)
+/* hack: windef.h defines INT for its own purposes and this causes
+   a conflict with our own INT in ifftw.h.  Divert the windows
+   definition into another name unlikely to cause a conflict */
+#define INT magnus_ab_INTegro_seclorum_nascitur_ordo
+#include <windows.h>
+#include <process.h>
+#undef INT
+
+typedef HANDLE os_mutex_t;
+
+static void os_mutex_init(os_mutex_t *s) 
+{ 
+     *s = CreateMutex(NULL, FALSE, NULL);
+}
+
+static void os_mutex_destroy(os_mutex_t *s) 
+{ 
+     CloseHandle(*s);
+}
+
+static void os_mutex_lock(os_mutex_t *s)
+{ 
+     WaitForSingleObject(*s, INFINITE);
+}
+
+static void os_mutex_unlock(os_mutex_t *s) 
+{ 
+     ReleaseMutex(*s);
+}
+
+typedef HANDLE os_sem_t;
+
+static void os_sem_init(os_sem_t *s) 
+{
+     *s = CreateSemaphore(NULL, 0, 0x7FFFFFFFL, NULL);
+}
+
+static void os_sem_destroy(os_sem_t *s) 
+{ 
+     CloseHandle(*s);
+}
+
+static void os_sem_down(os_sem_t *s) 
+{ 
+     WaitForSingleObject(*s, INFINITE);
+}
+
+static void os_sem_up(os_sem_t *s) 
+{
+     ReleaseSemaphore(*s, 1, NULL);
+}
+
+#define FFTW_WORKER unsigned __stdcall
+typedef unsigned (__stdcall *winthread_start) (void *);
+
+static void os_create_thread(winthread_start worker,
+			     void *arg)
+{
+     _beginthreadex((void *)NULL,               /* security attrib */
+		    0,				/* stack size */
+		    worker,                     /* start address */
+		    arg,			/* parameters */
+		    0,				/* creation flags */
+		    (unsigned *)NULL);		/* tid */
+}
+
+static void os_destroy_thread(void)
+{
+     _endthreadex(0);
+}
+
+
+#else
+#error "No threading layer defined"
+#endif
+
+/************************************************************************/
+
+/* Main code: */
+struct worker {
+     os_sem_t ready;
+     os_sem_t done;
+     struct work *w;
+     struct worker *cdr;
+};
+
+static struct worker *make_worker(void)
+{
+     struct worker *q = (struct worker *)MALLOC(sizeof(*q), OTHER);
+     os_sem_init(&q->ready);
+     os_sem_init(&q->done);
+     return q;
+}
+
+static void unmake_worker(struct worker *q)
+{
+     os_sem_destroy(&q->done);
+     os_sem_destroy(&q->ready);
+     X(ifree)(q);
+}
+
+struct work {
+     spawn_function proc;
+     spawn_data d;
+     struct worker *q; /* the worker responsible for performing this work */
+};
+
+static os_mutex_t queue_lock;
+static os_sem_t termination_semaphore;
+
+static struct worker *worker_queue;
+#define WITH_QUEUE_LOCK(what)			\
+{						\
+     os_mutex_lock(&queue_lock);		\
+     what;					\
+     os_mutex_unlock(&queue_lock);		\
+}
+
+static FFTW_WORKER worker(void *arg)
+{
+     struct worker *ego = (struct worker *)arg;
+     struct work *w;
+
+     for (;;) {
+	  /* wait until work becomes available */
+	  os_sem_down(&ego->ready);
+
+	  w = ego->w;
+
+	  /* !w->proc ==> terminate worker */
+	  if (!w->proc) break;
+
+	  /* do the work */
+          w->proc(&w->d);
+
+	  /* signal that work is done */
+	  os_sem_up(&ego->done);
+     }
+
+     /* termination protocol */
+     os_sem_up(&termination_semaphore);
+
+     os_destroy_thread();
+     /* UNREACHABLE */
+     return 0;
+}
+
+static void enqueue(struct worker *q)
+{
+     WITH_QUEUE_LOCK({
+	  q->cdr = worker_queue;
+	  worker_queue = q;
+     });
+}
+
+static struct worker *dequeue(void)
+{
+     struct worker *q;
+
+     WITH_QUEUE_LOCK({
+	  q = worker_queue;
+	  if (q) 
+	       worker_queue = q->cdr;
+     });
+
+     if (!q) {
+	  /* no worker is available.  Create one */
+	  q = make_worker();
+	  os_create_thread(worker, q);
+     }
+
+     return q;
+}
+
+
+static void kill_workforce(void)
+{
+     struct work w;
+
+     w.proc = 0;
+
+     THREAD_ON; /* needed for debugging mode: since make_worker
+		   is called from dequeue which is only called in
+		   thread_on mode, we need to unmake_worker in thread_on. */
+     WITH_QUEUE_LOCK({
+	  /* tell all workers that they must terminate.  
+
+	     Because workers enqueue themselves before signaling the
+	     completion of the work, all workers belong to the worker queue
+	     if we get here.  Also, all workers are waiting at
+	     os_sem_down(ready), so we can hold the queue lock without
+	     deadlocking */
+	  while (worker_queue) {
+	       struct worker *q = worker_queue;
+	       worker_queue = q->cdr;
+	       q->w = &w;
+	       os_sem_up(&q->ready);
+	       os_sem_down(&termination_semaphore);
+	       unmake_worker(q);
+	  }
+     });
+     THREAD_OFF;
+}
+
+int X(ithreads_init)(void)
+{
+     os_mutex_init(&queue_lock);
+     os_sem_init(&termination_semaphore);
+
+     WITH_QUEUE_LOCK({
+	  worker_queue = 0;
+     })
+
+     return 0; /* no error */
+}
+
+/* Distribute a loop from 0 to loopmax-1 over nthreads threads.
+   proc(d) is called to execute a block of iterations from d->min
+   to d->max-1.  d->thr_num indicate the number of the thread
+   that is executing proc (from 0 to nthreads-1), and d->data is
+   the same as the data parameter passed to X(spawn_loop).
+
+   This function returns only after all the threads have completed. */
+void X(spawn_loop)(int loopmax, int nthr, spawn_function proc, void *data)
+{
+     int block_size;
+     struct work *r;
+     int i;
+
+     A(loopmax >= 0);
+     A(nthr > 0);
+     A(proc);
+
+     if (!loopmax) return;
+
+     /* Choose the block size and number of threads in order to (1)
+        minimize the critical path and (2) use the fewest threads that
+        achieve the same critical path (to minimize overhead).
+        e.g. if loopmax is 5 and nthr is 4, we should use only 3
+        threads with block sizes of 2, 2, and 1. */
+     block_size = (loopmax + nthr - 1) / nthr;
+     nthr = (loopmax + block_size - 1) / block_size;
+
+     THREAD_ON; /* prevent debugging mode from failing under threads */
+     STACK_MALLOC(struct work *, r, sizeof(struct work) * nthr);
+	  
+     /* distribute work: */
+     for (i = 0; i < nthr; ++i) {
+	  struct work *w = &r[i];
+	  spawn_data *d = &w->d;
+
+	  d->max = (d->min = i * block_size) + block_size;
+	  if (d->max > loopmax)
+	       d->max = loopmax;
+	  d->thr_num = i;
+	  d->data = data;
+	  w->proc = proc;
+	   
+	  if (i == nthr - 1) {
+	       /* do the work ourselves */
+	       proc(d);
+	  } else {
+	       /* assign a worker to W */
+	       w->q = dequeue();
+
+	       /* tell worker w->q to do it */
+	       w->q->w = w; /* Dirac could have written this */
+	       os_sem_up(&w->q->ready);
+	  }
+     }
+
+     for (i = 0; i < nthr - 1; ++i) { 
+	  struct work *w = &r[i];
+	  os_sem_down(&w->q->done);
+	  enqueue(w->q);
+     }
+
+     STACK_FREE(r);
+     THREAD_OFF; /* prevent debugging mode from failing under threads */
+}
+
+void X(threads_cleanup)(void)
+{
+     kill_workforce();
+     os_mutex_destroy(&queue_lock);
+     os_sem_destroy(&termination_semaphore);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/threads.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/threads.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __THREADS_H__
+#define __THREADS_H__
+
+#include "ifftw.h"
+#include "ct.h"
+#include "hc2hc.h"
+
+typedef struct {
+     int min, max, thr_num;
+     void *data;
+} spawn_data;
+
+typedef void *(*spawn_function) (spawn_data *);
+
+void X(spawn_loop)(int loopmax, int nthreads,
+		   spawn_function proc, void *data);
+int X(ithreads_init)(void);
+void X(threads_cleanup)(void);
+
+/* configurations */
+
+void X(dft_thr_vrank_geq1_register)(planner *p);
+void X(rdft_thr_vrank_geq1_register)(planner *p);
+void X(rdft2_thr_vrank_geq1_register)(planner *p);
+
+ct_solver *X(mksolver_ct_threads)(size_t size, INT r, int dec, 
+				  ct_mkinferior mkcldw,
+				  ct_force_vrecursion force_vrecursionp);
+hc2hc_solver *X(mksolver_hc2hc_threads)(size_t size, INT r, hc2hc_mkinferior mkcldw);
+
+void X(threads_conf_standard)(planner *p);
+void X(threads_register_hooks)(void);
+void X(threads_unregister_hooks)(void);
+#endif /* __THREADS_H__ */
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/threads/vrank-geq1-rdft2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/threads/vrank-geq1-rdft2.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+
+#include "threads.h"
+
+typedef struct {
+     solver super;
+     int vecloop_dim;
+     const int *buddies;
+     int nbuddies;
+} S;
+
+typedef struct {
+     plan_rdft2 super;
+
+     plan **cldrn;
+     INT its, ots;
+     int nthr;
+     const S *solver;
+} P;
+
+typedef struct {
+     INT its, ots;
+     R *r0, *r1, *cr, *ci;
+     plan **cldrn;
+} PD;
+
+static void *spawn_apply(spawn_data *d)
+{
+     PD *ego = (PD *) d->data;
+     INT its = ego->its;
+     INT ots = ego->ots;
+     int thr_num = d->thr_num;
+     plan_rdft2 *cld = (plan_rdft2 *) ego->cldrn[d->thr_num];
+
+     cld->apply((plan *) cld,
+		ego->r0 + thr_num * its, ego->r1 + thr_num * its,
+		ego->cr + thr_num * ots, ego->ci + thr_num * ots);
+     return 0;
+}
+
+static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
+{
+     const P *ego = (const P *) ego_;
+     PD d;
+
+     d.its = ego->its;
+     d.ots = ego->ots;
+     d.cldrn = ego->cldrn;
+     d.r0 = r0; d.r1 = r1; d.cr = cr; d.ci = ci;
+
+     X(spawn_loop)(ego->nthr, ego->nthr, spawn_apply, (void*) &d);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_awake)(ego->cldrn[i], wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     int i;
+     for (i = 0; i < ego->nthr; ++i)
+	  X(plan_destroy_internal)(ego->cldrn[i]);
+     X(ifree)(ego->cldrn);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const S *s = ego->solver;
+     int i;
+     p->print(p, "(rdft2-thr-vrank>=1-x%d/%d)", ego->nthr, s->vecloop_dim);
+     for (i = 0; i < ego->nthr; ++i)
+	  if (i == 0 || (ego->cldrn[i] != ego->cldrn[i-1] &&
+			 (i <= 1 || ego->cldrn[i] != ego->cldrn[i-2])))
+	       p->print(p, "%(%p%)", ego->cldrn[i]);
+     p->putchr(p, ')');
+}
+
+static int pickdim(const S *ego, const tensor *vecsz, int oop, int *dp)
+{
+     return X(pickdim)(ego->vecloop_dim, ego->buddies, ego->nbuddies,
+		       vecsz, oop, dp);
+}
+
+static int applicable0(const solver *ego_, const problem *p_,
+		       const planner *plnr, int *dp)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft2 *p = (const problem_rdft2 *) p_;
+
+     if (FINITE_RNK(p->vecsz->rnk)
+	 && p->vecsz->rnk > 0
+	 && plnr->nthr > 1
+	 && pickdim(ego, p->vecsz, p->r0 != p->cr, dp)) {
+	  if (p->r0 != p->cr)
+	       return 1;  /* can always operate out-of-place */
+
+	  return(X(rdft2_inplace_strides)(p, *dp));
+     }
+
+     return 0;
+}
+
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr, int *dp)
+{
+     const S *ego = (const S *)ego_;
+
+     if (!applicable0(ego_, p_, plnr, dp)) return 0;
+
+     /* fftw2 behavior */
+     if (NO_VRANK_SPLITSP(plnr) && (ego->vecloop_dim != ego->buddies[0]))
+	  return 0;
+
+     return 1;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_rdft2 *p;
+     P *pln;
+     problem *cldp;
+     int vdim;
+     iodim *d;
+     plan **cldrn = (plan **) 0;
+     int i, nthr;
+     INT its, ots, block_size;
+     tensor *vecsz;
+
+     static const plan_adt padt = {
+	  X(rdft2_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego_, p_, plnr, &vdim))
+          return (plan *) 0;
+     p = (const problem_rdft2 *) p_;
+
+     d = p->vecsz->dims + vdim;
+
+     block_size = (d->n + plnr->nthr - 1) / plnr->nthr;
+     nthr = (int)((d->n + block_size - 1) / block_size);
+     plnr->nthr = (plnr->nthr + nthr - 1) / nthr;
+     X(rdft2_strides)(p->kind, d, &its, &ots);
+     its *= block_size; ots *= block_size;
+
+     cldrn = (plan **)MALLOC(sizeof(plan *) * nthr, PLANS);
+     for (i = 0; i < nthr; ++i) cldrn[i] = (plan *) 0;
+     
+     vecsz = X(tensor_copy)(p->vecsz);
+     for (i = 0; i < nthr; ++i) {
+	  vecsz->dims[vdim].n =
+	       (i == nthr - 1) ? (d->n - i*block_size) : block_size;
+	  cldp = X(mkproblem_rdft2)(p->sz, vecsz,
+				    p->r0 + i*its, p->r1 + i*its,
+				    p->cr + i*ots, p->ci + i*ots, 
+				    p->kind);
+	  cldrn[i] = X(mkplan_d)(plnr, cldp);
+	  if (!cldrn[i]) goto nada;
+     }
+     X(tensor_destroy)(vecsz);
+
+     pln = MKPLAN_RDFT2(P, &padt, apply);
+
+     pln->cldrn = cldrn;
+     pln->its = its;
+     pln->ots = ots;
+     pln->nthr = nthr;
+
+     pln->solver = ego;
+     X(ops_zero)(&pln->super.super.ops);
+     pln->super.super.pcost = 0;
+     for (i = 0; i < nthr; ++i) {
+	  X(ops_add2)(&cldrn[i]->ops, &pln->super.super.ops);
+	  pln->super.super.pcost += cldrn[i]->pcost;
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     if (cldrn) {
+	  for (i = 0; i < nthr; ++i)
+	       X(plan_destroy_internal)(cldrn[i]);
+	  X(ifree)(cldrn);
+     }
+     X(tensor_destroy)(vecsz);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int vecloop_dim, const int *buddies, int nbuddies)
+{
+     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->vecloop_dim = vecloop_dim;
+     slv->buddies = buddies;
+     slv->nbuddies = nbuddies;
+     return &(slv->super);
+}
+
+void X(rdft2_thr_vrank_geq1_register)(planner *p)
+{
+     int i;
+
+     /* FIXME: Should we try other vecloop_dim values? */
+     static const int buddies[] = { 1, -1 };
+
+     const int nbuddies = (int)(sizeof(buddies) / sizeof(buddies[0]));
+
+     for (i = 0; i < nbuddies; ++i)
+          REGISTER_SOLVER(p, mksolver(buddies[i], buddies, nbuddies));
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/Makefile.am	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,31 @@
+AM_CPPFLAGS = -I$(top_srcdir)/libbench2 -I$(top_srcdir)/api 
+
+bin_SCRIPTS = fftw-wisdom-to-conf
+bin_PROGRAMS = fftw@PREC_SUFFIX@-wisdom
+
+BUILT_SOURCES = fftw-wisdom-to-conf fftw@PREC_SUFFIX@-wisdom.1
+EXTRA_DIST = fftw-wisdom-to-conf.in
+
+dist_man_MANS = fftw-wisdom-to-conf.1 fftw@PREC_SUFFIX@-wisdom.1
+EXTRA_MANS = fftw_wisdom.1.in
+fftw@PREC_SUFFIX@-wisdom.1: fftw_wisdom.1
+	rm -f $@
+	cp fftw_wisdom.1 $@
+
+if THREADS
+fftw@PREC_SUFFIX@_wisdom_CFLAGS = $(PTHREAD_CFLAGS)
+if !COMBINED_THREADS
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+endif
+else
+if OPENMP
+fftw@PREC_SUFFIX@_wisdom_CFLAGS = $(OPENMP_CFLAGS)
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+endif
+endif
+
+fftw@PREC_SUFFIX@_wisdom_SOURCES = fftw-wisdom.c
+fftw@PREC_SUFFIX@_wisdom_LDADD = $(top_builddir)/tests/bench-bench.o	\
+$(top_builddir)/tests/bench-fftw-bench.o $(LIBFFTWTHREADS)	\
+$(top_builddir)/libfftw3@PREC_SUFFIX@.la			\
+$(top_builddir)/libbench2/libbench2.a $(THREADLIBS)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,754 @@
+# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+
+VPATH = @srcdir@
+am__make_dryrun = \
+  { \
+    am__dry=no; \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
+          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+      *) \
+        for am__flg in $$MAKEFLAGS; do \
+          case $$am__flg in \
+            *=*|--*) ;; \
+            *n*) am__dry=yes; break;; \
+          esac; \
+        done;; \
+    esac; \
+    test $$am__dry = yes; \
+  }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+bin_PROGRAMS = fftw@PREC_SUFFIX@-wisdom$(EXEEXT)
+subdir = tools
+DIST_COMMON = $(dist_man_MANS) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in $(srcdir)/fftw-wisdom-to-conf.in \
+	$(srcdir)/fftw_wisdom.1.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES = fftw_wisdom.1 fftw-wisdom-to-conf
+CONFIG_CLEAN_VPATH_FILES =
+am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" \
+	"$(DESTDIR)$(man1dir)"
+PROGRAMS = $(bin_PROGRAMS)
+am_fftw@PREC_SUFFIX@_wisdom_OBJECTS =  \
+	fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.$(OBJEXT)
+fftw@PREC_SUFFIX@_wisdom_OBJECTS =  \
+	$(am_fftw@PREC_SUFFIX@_wisdom_OBJECTS)
+am__DEPENDENCIES_1 =
+fftw@PREC_SUFFIX@_wisdom_DEPENDENCIES =  \
+	$(top_builddir)/tests/bench-bench.o \
+	$(top_builddir)/tests/bench-fftw-bench.o $(LIBFFTWTHREADS) \
+	$(top_builddir)/libfftw3@PREC_SUFFIX@.la \
+	$(top_builddir)/libbench2/libbench2.a $(am__DEPENDENCIES_1)
+fftw@PREC_SUFFIX@_wisdom_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(fftw@PREC_SUFFIX@_wisdom_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+SCRIPTS = $(bin_SCRIPTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+SOURCES = $(fftw@PREC_SUFFIX@_wisdom_SOURCES)
+DIST_SOURCES = $(fftw@PREC_SUFFIX@_wisdom_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+man1dir = $(mandir)/man1
+NROFF = nroff
+MANS = $(dist_man_MANS)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I$(top_srcdir)/libbench2 -I$(top_srcdir)/api 
+bin_SCRIPTS = fftw-wisdom-to-conf
+BUILT_SOURCES = fftw-wisdom-to-conf fftw@PREC_SUFFIX@-wisdom.1
+EXTRA_DIST = fftw-wisdom-to-conf.in
+dist_man_MANS = fftw-wisdom-to-conf.1 fftw@PREC_SUFFIX@-wisdom.1
+EXTRA_MANS = fftw_wisdom.1.in
+@OPENMP_TRUE@@THREADS_FALSE@fftw@PREC_SUFFIX@_wisdom_CFLAGS = $(OPENMP_CFLAGS)
+@THREADS_TRUE@fftw@PREC_SUFFIX@_wisdom_CFLAGS = $(PTHREAD_CFLAGS)
+@COMBINED_THREADS_FALSE@@THREADS_TRUE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+@OPENMP_TRUE@@THREADS_FALSE@LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+fftw@PREC_SUFFIX@_wisdom_SOURCES = fftw-wisdom.c
+fftw@PREC_SUFFIX@_wisdom_LDADD = $(top_builddir)/tests/bench-bench.o	\
+$(top_builddir)/tests/bench-fftw-bench.o $(LIBFFTWTHREADS)	\
+$(top_builddir)/libfftw3@PREC_SUFFIX@.la			\
+$(top_builddir)/libbench2/libbench2.a $(THREADLIBS)
+
+all: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu tools/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu tools/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+fftw_wisdom.1: $(top_builddir)/config.status $(srcdir)/fftw_wisdom.1.in
+	cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@
+fftw-wisdom-to-conf: $(top_builddir)/config.status $(srcdir)/fftw-wisdom-to-conf.in
+	cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@
+install-binPROGRAMS: $(bin_PROGRAMS)
+	@$(NORMAL_INSTALL)
+	@list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \
+	fi; \
+	for p in $$list; do echo "$$p $$p"; done | \
+	sed 's/$(EXEEXT)$$//' | \
+	while read p p1; do if test -f $$p || test -f $$p1; \
+	  then echo "$$p"; echo "$$p"; else :; fi; \
+	done | \
+	sed -e 'p;s,.*/,,;n;h' -e 's|.*|.|' \
+	    -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \
+	sed 'N;N;N;s,\n, ,g' | \
+	$(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \
+	  { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \
+	    if ($$2 == $$4) files[d] = files[d] " " $$1; \
+	    else { print "f", $$3 "/" $$4, $$1; } } \
+	  END { for (d in files) print "f", d, files[d] }' | \
+	while read type dir files; do \
+	    if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \
+	    test -z "$$files" || { \
+	    echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \
+	    $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \
+	    } \
+	; done
+
+uninstall-binPROGRAMS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \
+	files=`for p in $$list; do echo "$$p"; done | \
+	  sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \
+	      -e 's/$$/$(EXEEXT)/' `; \
+	test -n "$$list" || exit 0; \
+	echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \
+	cd "$(DESTDIR)$(bindir)" && rm -f $$files
+
+clean-binPROGRAMS:
+	@list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \
+	echo " rm -f" $$list; \
+	rm -f $$list || exit $$?; \
+	test -n "$(EXEEXT)" || exit 0; \
+	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
+	echo " rm -f" $$list; \
+	rm -f $$list
+fftw@PREC_SUFFIX@-wisdom$(EXEEXT): $(fftw@PREC_SUFFIX@_wisdom_OBJECTS) $(fftw@PREC_SUFFIX@_wisdom_DEPENDENCIES) $(EXTRA_fftw@PREC_SUFFIX@_wisdom_DEPENDENCIES) 
+	@rm -f fftw@PREC_SUFFIX@-wisdom$(EXEEXT)
+	$(fftw@PREC_SUFFIX@_wisdom_LINK) $(fftw@PREC_SUFFIX@_wisdom_OBJECTS) $(fftw@PREC_SUFFIX@_wisdom_LDADD) $(LIBS)
+install-binSCRIPTS: $(bin_SCRIPTS)
+	@$(NORMAL_INSTALL)
+	@list='$(bin_SCRIPTS)'; test -n "$(bindir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \
+	done | \
+	sed -e 'p;s,.*/,,;n' \
+	    -e 'h;s|.*|.|' \
+	    -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \
+	$(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \
+	  { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \
+	    if ($$2 == $$4) { files[d] = files[d] " " $$1; \
+	      if (++n[d] == $(am__install_max)) { \
+		print "f", d, files[d]; n[d] = 0; files[d] = "" } } \
+	    else { print "f", d "/" $$4, $$1 } } \
+	  END { for (d in files) print "f", d, files[d] }' | \
+	while read type dir files; do \
+	     if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \
+	     test -z "$$files" || { \
+	       echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(bindir)$$dir'"; \
+	       $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \
+	     } \
+	; done
+
+uninstall-binSCRIPTS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(bin_SCRIPTS)'; test -n "$(bindir)" || exit 0; \
+	files=`for p in $$list; do echo "$$p"; done | \
+	       sed -e 's,.*/,,;$(transform)'`; \
+	dir='$(DESTDIR)$(bindir)'; $(am__uninstall_files_from_dir)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Po@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+
+fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.o: fftw-wisdom.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(fftw@PREC_SUFFIX@_wisdom_CFLAGS) $(CFLAGS) -MT fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.o -MD -MP -MF $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Tpo -c -o fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.o `test -f 'fftw-wisdom.c' || echo '$(srcdir)/'`fftw-wisdom.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Tpo $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='fftw-wisdom.c' object='fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(fftw@PREC_SUFFIX@_wisdom_CFLAGS) $(CFLAGS) -c -o fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.o `test -f 'fftw-wisdom.c' || echo '$(srcdir)/'`fftw-wisdom.c
+
+fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.obj: fftw-wisdom.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(fftw@PREC_SUFFIX@_wisdom_CFLAGS) $(CFLAGS) -MT fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.obj -MD -MP -MF $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Tpo -c -o fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.obj `if test -f 'fftw-wisdom.c'; then $(CYGPATH_W) 'fftw-wisdom.c'; else $(CYGPATH_W) '$(srcdir)/fftw-wisdom.c'; fi`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Tpo $(DEPDIR)/fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='fftw-wisdom.c' object='fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(fftw@PREC_SUFFIX@_wisdom_CFLAGS) $(CFLAGS) -c -o fftw@PREC_SUFFIX@_wisdom-fftw-wisdom.obj `if test -f 'fftw-wisdom.c'; then $(CYGPATH_W) 'fftw-wisdom.c'; else $(CYGPATH_W) '$(srcdir)/fftw-wisdom.c'; fi`
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+install-man1: $(dist_man_MANS)
+	@$(NORMAL_INSTALL)
+	@list1=''; \
+	list2='$(dist_man_MANS)'; \
+	test -n "$(man1dir)" \
+	  && test -n "`echo $$list1$$list2`" \
+	  || exit 0; \
+	echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \
+	$(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \
+	{ for i in $$list1; do echo "$$i"; done;  \
+	if test -n "$$list2"; then \
+	  for i in $$list2; do echo "$$i"; done \
+	    | sed -n '/\.1[a-z]*$$/p'; \
+	fi; \
+	} | while read p; do \
+	  if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; echo "$$p"; \
+	done | \
+	sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
+	      -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \
+	sed 'N;N;s,\n, ,g' | { \
+	list=; while read file base inst; do \
+	  if test "$$base" = "$$inst"; then list="$$list $$file"; else \
+	    echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \
+	    $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \
+	  fi; \
+	done; \
+	for i in $$list; do echo "$$i"; done | $(am__base_list) | \
+	while read files; do \
+	  test -z "$$files" || { \
+	    echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \
+	    $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \
+	done; }
+
+uninstall-man1:
+	@$(NORMAL_UNINSTALL)
+	@list=''; test -n "$(man1dir)" || exit 0; \
+	files=`{ for i in $$list; do echo "$$i"; done; \
+	l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \
+	  sed -n '/\.1[a-z]*$$/p'; \
+	} | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
+	      -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \
+	dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir)
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	set x; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@list='$(MANS)'; if test -n "$$list"; then \
+	  list=`for p in $$list; do \
+	    if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
+	    if test -f "$$d$$p"; then echo "$$d$$p"; else :; fi; done`; \
+	  if test -n "$$list" && \
+	    grep 'ab help2man is required to generate this page' $$list >/dev/null; then \
+	    echo "error: found man pages containing the \`missing help2man' replacement text:" >&2; \
+	    grep -l 'ab help2man is required to generate this page' $$list | sed 's/^/         /' >&2; \
+	    echo "       to fix them, install help2man, remove and regenerate the man pages;" >&2; \
+	    echo "       typically \`make maintainer-clean' will remove them" >&2; \
+	    exit 1; \
+	  else :; fi; \
+	else :; fi
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) check-am
+all-am: Makefile $(PROGRAMS) $(SCRIPTS) $(MANS)
+installdirs:
+	for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
+clean: clean-am
+
+clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-man
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-binPROGRAMS install-binSCRIPTS
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man: install-man1
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-binPROGRAMS uninstall-binSCRIPTS uninstall-man
+
+uninstall-man: uninstall-man1
+
+.MAKE: all check install install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \
+	clean-generic clean-libtool ctags distclean distclean-compile \
+	distclean-generic distclean-libtool distclean-tags distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-binPROGRAMS install-binSCRIPTS install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-man1 install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am uninstall-binPROGRAMS \
+	uninstall-binSCRIPTS uninstall-man uninstall-man1
+
+fftw@PREC_SUFFIX@-wisdom.1: fftw_wisdom.1
+	rm -f $@
+	cp fftw_wisdom.1 $@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/fftw-wisdom-to-conf.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/fftw-wisdom-to-conf.1	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,91 @@
+.\" 
+.\" Copyright (c) 2003, 2007-11 Matteo Frigo
+.\" Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.\" 
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" 
+.\" This program is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\" 
+.\" You should have received a copy of the GNU General Public License
+.\" along with this program; if not, write to the Free Software
+.\" Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+.\"
+.TH FFTW-WISDOM-TO-CONF 1 "February, 2003" "fftw" "fftw"
+.SH NAME
+fftw-wisdom-to-conf \- generate FFTW wisdom (pre-planned transforms)
+.SH SYNOPSIS
+\fBfftw-wisdom-to-conf\fR [< \fIINPUT\fR] [> \fIOUTPUT\fR]
+.SH DESCRIPTION
+.PP
+." Add any additional description here
+.I fftw-wisdom-to-conf
+is a utility to generate C
+.B configuration
+routines from FFTW
+.B wisdom
+files, where the latter contain saved information about how to
+optimally compute (Fourier) transforms of various sizes.  A
+configuration routine is a C subroutine that you link into your
+program, replacing a routine of the same name in the FFTW library,
+that determines which parts of FFTW are callable by your program.
+
+The reason to do this is that, if you only need transforms of a
+limited set of sizes and types, and if you are statically linking your
+program, then using a configuration file generated from wisdom for
+those types can substantially reduce the size of your executable.
+(Otherwise, because of FFTW's dynamic nature, all of FFTW's transform
+code must be linked into any program using FFTW.)
+
+FFTW is a free library to compute discrete Fourier transforms in one
+or more dimensions, for arbitrary sizes, and of both real and complex
+data, among other related operations.  More information on FFTW can be
+found at the FFTW home page:
+.I http://www.fftw.org
+
+.I fftw-wisdom-to-conf
+reads wisdom from standard input and writes the configuration to
+standard output.  It can easily be combined with the
+.I fftw-wisdom
+tool, for example:
+
+fftw-wisdom -n cof1024 cob1024 -o wisdom
+.br
+fftw-wisdom-to-conf < wisdom > conf.c
+
+will create a configuration "conf.c" containing only those parts of
+FFTW needed for the optimized complex forwards and backwards
+out-of-place transforms of size 1024 (also saving the wisdom itself in
+"wisdom").
+
+Alternatively, you can run your actual program, export wisdom for all
+plans that were created (ideally in FFTW_PATIENT or FFTW_EXHAUSTIVE
+mode), use this as input for \fIfftw-wisdom-to-conf\fR,
+and then re-link your program with the resulting configuration routine.
+
+Note that the configuration routine does not contain the wisdom, only
+the routines necessary to implement the wisdom, so your program should
+also import the wisdom in order to benefit from the pre-optimized
+plans.
+.SH OPTIONS
+.TP
+\fB\-h\fR, \fB\--help\fR
+Display help on the command-line options and usage.
+.TP
+\fB\-V\fR, \fB\--version\fR
+Print the version number and copyright information.
+.SH BUGS
+Send bug reports to fftw@fftw.org.
+.SH AUTHORS
+Written by Steven G. Johnson and Matteo Frigo.
+
+Copyright (c) 2003, 2007-11 Matteo Frigo
+.br
+Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.SH "SEE ALSO"
+fftw-wisdom(1)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/fftw-wisdom-to-conf.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/fftw-wisdom-to-conf.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,82 @@
+#! /bin/sh
+
+if test "x$1" = "x--help" || test "x$1" = "x-h"; then
+    cat <<EOF
+Usage: fftw-wisdom-to-conf [OPTIONS] [< INPUT] [> OUTPUT]
+Convert wisdom (stdin) to C configuration routine (stdout).
+
+Options:
+        -h, --help: print this help
+     -V, --version: print version/copyright info
+EOF
+    exit 0
+fi
+
+if test "x$1" = "x--version" || test "x$1" = "x-V"; then
+    cat <<EOF
+fftw-wisdom-to-conf from FFTW version @VERSION@
+
+Copyright (c) 2003, 2007-11 Matteo Frigo
+Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA    
+EOF
+    exit 0
+fi
+
+read preamble fftw_wisdom
+
+case "$preamble $fftw_wisdom" in
+	\(@PACKAGE@-@VERSION@\ *_wisdom)
+		prefix=`echo $fftw_wisdom | cut -d_ -f1`_
+		;;
+	*)
+		echo "fftw-wisdom-to-conf: invalid wisdom" 1>&2
+		exit 1
+		;;
+esac
+
+cat <<EOF
+/* Automatically generated by fftw-wisdom-to-conf from @PACKAGE@ @VERSION@.
+   DO NOT EDIT!  (Unless you really, really want to.  Then it's okay.) */
+void ${prefix}configure_planner(void *plnr)
+{
+    struct solvtab_s { void (*reg)(void *); const char *reg_nam; };
+    extern void ${prefix}solvtab_exec(const struct solvtab_s s[], void *);
+
+#define DECLARE(name) extern void name(void *);
+#define STRINGIZEx(x) #x
+#define STRINGIZE(x) STRINGIZEx(x)
+#define SOLVTAB(s) { s, STRINGIZE(s) },
+#define DO(X) \\
+EOF
+
+sed 's/ *(//' | cut -d" " -f1 | grep -v -- - | egrep -v '^ *\)*$' | sort | uniq | while read reg_nam; do
+    printf '    X(%s)\\\n' "$reg_nam"
+done
+
+cat <<EOF
+    /* end DO(X) */
+
+    DO(DECLARE)
+
+    const struct solvtab_s s[] = {
+        DO(SOLVTAB)
+        { 0, 0 }
+    };
+
+    ${prefix}solvtab_exec(s, plnr);
+}
+EOF
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/fftw-wisdom.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/fftw-wisdom.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,360 @@
+/* Re-use libbench2 and the test program, but override bench_main so that
+   we can have different command-line syntax. */
+#include "my-getopt.h"
+#include "bench.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <fftw3.h>
+#include <string.h>
+#include <time.h>
+
+#if defined(HAVE_THREADS) || defined(HAVE_OPENMP)
+#  define HAVE_SMP
+   extern int threads_ok;
+#endif
+
+#define CONCAT(prefix, name) prefix ## name
+#if defined(BENCHFFT_SINGLE)
+#define FFTW(x) CONCAT(fftwf_, x)
+#elif defined(BENCHFFT_LDOUBLE)
+#define FFTW(x) CONCAT(fftwl_, x)
+#elif defined(BENCHFFT_QUAD)
+#define FFTW(x) CONCAT(fftwq_, x)
+#else
+#define FFTW(x) CONCAT(fftw_, x)
+#endif
+
+/* from bench.c: */
+extern unsigned the_flags;
+extern int usewisdom;
+extern int nthreads;
+
+/* dummy routines to replace those in hook.c */
+void install_hook(void) {}
+void uninstall_hook(void) {}
+
+int verbose;
+
+static void do_problem(bench_problem *p)
+{
+     if (verbose)
+	  printf("PLANNING PROBLEM: %s\n", p->pstring);
+     /* BENCH_ASSERT(can_do(p)); */
+     problem_alloc(p);
+     setup(p);
+     done(p);
+}
+
+static void add_problem(const char *pstring,
+			bench_problem ***p, int *ip, int *np)
+{
+     if (*ip >= *np) {
+	  *np = *np * 2 + 1;
+	  *p = (bench_problem **) realloc(*p, sizeof(bench_problem *) * *np);
+     }
+     (*p)[(*ip)++] = problem_parse(pstring);
+}
+
+static int sz(const bench_problem *p)
+{
+     return tensor_sz(p->sz) * tensor_sz(p->vecsz);
+}
+
+static int prob_size_cmp(const void *p1_, const void *p2_)
+{
+     const bench_problem * const *p1 = (const bench_problem * const *) p1_;
+     const bench_problem * const *p2 = (const bench_problem * const *) p2_;
+     return (sz(*p1) - sz(*p2));
+}
+
+static struct my_option options[] =
+{
+  {"help", NOARG, 'h'},
+  {"version", NOARG, 'V'},
+  {"verbose", NOARG, 'v'},
+
+  {"canonical", NOARG, 'c'},
+  {"time-limit", REQARG, 't'},
+
+  {"output-file", REQARG, 'o'},
+
+  {"impatient", NOARG, 'i'},
+  {"measure", NOARG, 'm'},
+  {"estimate", NOARG, 'e'},
+  {"exhaustive", NOARG, 'x'},
+
+  {"no-system-wisdom", NOARG, 'n'},
+  {"wisdom-file", REQARG, 'w'},
+
+#ifdef HAVE_SMP
+  {"threads", REQARG, 'T'},
+#endif
+
+  /* options to restrict configuration to rdft-only, etcetera? */
+  
+  {0, NOARG, 0}
+};
+
+static void help(FILE *f, const char *program_name)
+{
+     fprintf(
+	  f, 
+	  "Usage: %s [options] [sizes]\n"
+"    Create wisdom (pre-planned/optimized transforms) for specified sizes,\n"
+"    writing wisdom to stdout (or to a file, using -o).\n"
+	  "\nOptions:\n"
+ "                   -h, --help: print this help\n"
+ "                -V, --version: print version/copyright info\n"
+ "                -v, --verbose: verbose output\n"
+ "              -c, --canonical: plan/optimize canonical set of sizes\n"
+ "     -t <h>, --time-limit=<h>: time limit in hours (default: 0, no limit)\n"
+ "  -o FILE, --output-file=FILE: output to FILE instead of stdout\n"
+ "                -m, --measure: plan in MEASURE mode (PATIENT is default)\n"
+ "               -e, --estimate: plan in ESTIMATE mode (not recommended)\n"
+ "             -x, --exhaustive: plan in EXHAUSTIVE mode (may be slow)\n"
+ "       -n, --no-system-wisdom: don't read /etc/fftw/ system wisdom file\n"
+ "  -w FILE, --wisdom-file=FILE: read wisdom from FILE (stdin if -)\n"
+#ifdef HAVE_SMP
+ "            -T N, --threads=N: plan with N threads\n"
+#endif
+	  "\nSize syntax: <type><inplace><direction><geometry>\n"
+ "      <type> = c/r/k for complex/real(r2c,c2r)/r2r\n" 
+ "   <inplace> = i/o for in/out-of place\n"
+ " <direction> = f/b for forward/backward, omitted for k transforms\n"
+ "  <geometry> = <n1>[x<n2>[x...]], e.g. 10x12x14\n"
+ "               -- for k transforms, after each dimension is a <kind>:\n"
+ "                     <kind> = f/b/h/e00/e01/e10/e11/o00/o01/o10/o11\n"
+ "                              for R2HC/HC2R/DHT/REDFT00/.../RODFT11\n"
+	  , program_name);
+}
+
+/* powers of two and ten up to 2^20, for now */
+static char canonical_sizes[][32] = {
+     "1", "2", "4", "8", "16", "32", "64", "128", "256", "512", "1024",
+     "2048", "4096", "8192", "16384", "32768", "65536", "131072",
+     "262144", "524288", "1048576",
+
+     "10", "100", "1000", "10000", "100000", "1000000",
+
+     "2x2", "4x4", "8x8", "10x10", "16x16", "32x32", "64x64", "100x100",
+     "128x128", "256x256", "512x512", "1000x1000", "1024x1024",
+
+     "2x2x2", "4x4x4", "8x8x8", "10x10x10", "16x16x16", "32x32x32",
+     "64x64x64", "100x100x100"
+};
+
+#define NELEM(array)(sizeof(array) / sizeof((array)[0]))
+
+int bench_main(int argc, char *argv[])
+{
+     int c;
+     unsigned i;
+     int impatient = 0;
+     int system_wisdom = 1;
+     int canonical = 0;
+     double hours = 0;
+     FILE *output_file;
+     char *output_fname = 0;
+     bench_problem **problems = 0;
+     int nproblems = 0, iproblem = 0;
+     time_t begin;
+
+     verbose = 0;
+     usewisdom = 0;
+
+     bench_srand(1);
+#ifdef HAVE_SMP
+     /* do not configure FFTW with threads, unless the
+	user requests -T */
+     threads_ok = 0;
+#endif
+
+     while ((c = my_getopt(argc, argv, options)) != -1) {
+	  switch (c) {
+	      case 'h':
+		   help(stdout, argv[0]);
+		   exit(EXIT_SUCCESS);
+		   break;
+
+	      case 'V':
+		   printf("fftw-wisdom tool for FFTW version " VERSION ".\n");
+		   printf(
+"\n"
+"Copyright (c) 2003, 2007-11 Matteo Frigo\n"
+"Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology\n"
+"\n"
+"This program is free software; you can redistribute it and/or modify\n"
+"it under the terms of the GNU General Public License as published by\n"
+"the Free Software Foundation; either version 2 of the License, or\n"
+"(at your option) any later version.\n"
+"\n"
+"This program is distributed in the hope that it will be useful,\n"
+"but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
+"GNU General Public License for more details.\n"
+"\n"
+"You should have received a copy of the GNU General Public License\n"
+"along with this program; if not, write to the Free Software\n"
+"Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\n"
+			);
+		   exit(EXIT_SUCCESS);
+		   break;
+		   
+	      case 'v':
+		   verbose = 1;
+		   break;
+		   
+	      case 'c':
+		   canonical = 1;
+		   break;
+
+	      case 't':
+		   hours = atof(my_optarg);
+		   break;
+
+	      case 'o':
+		   if (output_fname)
+			bench_free(output_fname);
+		   
+		   if (!strcmp(my_optarg, "-"))
+			output_fname = 0;
+		   else {
+			output_fname = (char *) bench_malloc(sizeof(char) *
+						    (strlen(my_optarg) + 1));
+			strcpy(output_fname, my_optarg);
+		   }
+		   break;
+
+	      case 'm':
+	      case 'i':
+		   impatient = 1;
+		   break;
+
+	      case 'e':
+		   the_flags |= FFTW_ESTIMATE;
+		   break;
+
+	      case 'x':
+		   the_flags |= FFTW_EXHAUSTIVE;
+		   break;
+
+	      case 'n':
+		   system_wisdom = 0;
+		   break;
+
+	      case 'w': {
+		   FILE *w = stdin;
+		   if (strcmp(my_optarg, "-") && !(w = fopen(my_optarg, "r"))) {
+			fprintf(stderr,
+				"fftw-wisdom: error opening \"%s\": ", my_optarg);
+			perror("");
+			exit(EXIT_FAILURE);
+		   }
+		   if (!FFTW(import_wisdom_from_file)(w)) {
+			fprintf(stderr, "fftw_wisdom: error reading wisdom "
+				"from \"%s\"\n", my_optarg);
+			exit(EXIT_FAILURE);
+		   }
+		   if (w != stdin)
+			fclose(w);
+		   break;
+	      }
+
+#ifdef HAVE_SMP
+	      case 'T':
+		   nthreads = atoi(my_optarg);
+		   if (nthreads < 1) nthreads = 1;
+		   threads_ok = 1;
+		   BENCH_ASSERT(FFTW(init_threads)());
+		   break;
+#endif
+
+	      case '?':
+		   /* `my_getopt' already printed an error message. */
+		   cleanup();
+		   return EXIT_FAILURE;
+
+	      default:
+		   abort ();
+	  }
+     }
+
+     if (!impatient)
+	  the_flags |= FFTW_PATIENT;
+
+     if (system_wisdom)
+	  if (!FFTW(import_system_wisdom)() && verbose)
+	       fprintf(stderr, "fftw-wisdom: system-wisdom import failed\n");
+
+     if (canonical) {
+	  for (i = 0; i < NELEM(canonical_sizes); ++i) {
+	       unsigned j;
+	       char types[][8] = { 
+		    "cof", "cob", "cif", "cib", "rof", "rob", "rif", "rib"
+	       };
+	       
+	       for (j = 0; j < NELEM(types); ++j) {
+		    char ps[64];
+		    if (!strchr(canonical_sizes[i],'x')
+			|| !strchr(types[j],'o')) {
+#ifdef HAVE_SNPRINTF
+			 snprintf(ps, sizeof(ps), "%s%s", types[j], canonical_sizes[i]);
+#else
+			 sprintf(ps, "%s%s", types[j], canonical_sizes[i]);
+#endif
+			 add_problem(ps, &problems, &iproblem, &nproblems);
+		    }
+	       }
+	  }
+     }
+
+     while (my_optind < argc) {
+	  if (!strcmp(argv[my_optind], "-")) {
+	       char s[1025];
+	       while (1 == fscanf(stdin, "%1024s", s))
+		    add_problem(s, &problems, &iproblem, &nproblems);
+	  }
+	  else
+	       add_problem(argv[my_optind], &problems, &iproblem, &nproblems);
+	  ++my_optind;
+     }
+
+     nproblems = iproblem;
+     qsort(problems, nproblems, sizeof(bench_problem *), prob_size_cmp);
+
+     if (!output_fname)
+	  output_file = stdout;
+     else
+	  if (!(output_file = fopen(output_fname, "w"))) {
+	       fprintf(stderr,
+		       "fftw-wisdom: error creating \"%s\"", output_fname);
+	       perror("");
+	       exit(EXIT_FAILURE);
+	  }
+
+     begin = time((time_t*)0);
+     for (iproblem = 0; iproblem < nproblems; ++iproblem) {
+	  if (hours <= 0
+	      || hours > (time((time_t*)0) - begin) / 3600.0)
+	       do_problem(problems[iproblem]);
+	  problem_destroy(problems[iproblem]);
+	  
+     }
+     free(problems);
+     
+     if (verbose && hours > 0
+	 && hours < (time((time_t*)0) - begin) / 3600.0)
+	  fprintf(stderr, "EXCEEDED TIME LIMIT OF %g HOURS.\n", hours);
+
+     FFTW(export_wisdom_to_file)(output_file);
+     if (output_file != stdout)
+	  fclose(output_file);
+     if (output_fname)
+	  bench_free(output_fname);
+
+     cleanup();
+
+     return EXIT_SUCCESS;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/fftw_wisdom.1.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/fftw_wisdom.1.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,190 @@
+.\" 
+.\" Copyright (c) 2003, 2007-11 Matteo Frigo
+.\" Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.\" 
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" 
+.\" This program is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\" 
+.\" You should have received a copy of the GNU General Public License
+.\" along with this program; if not, write to the Free Software
+.\" Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+.\"
+.TH FFTW-WISDOM 1 "February, 2003" "fftw" "fftw"
+.SH NAME
+fftw@PREC_SUFFIX@-wisdom \- create wisdom (pre-optimized FFTs)
+.SH SYNOPSIS
+.B fftw@PREC_SUFFIX@-wisdom
+[\fIOPTION\fR]... [\fISIZE\fR]...
+.SH DESCRIPTION
+.PP
+." Add any additional description here
+.I fftw@PREC_SUFFIX@-wisdom
+is a utility to generate FFTW
+.B wisdom
+files, which contain saved information about how to optimally compute
+(Fourier) transforms of various sizes.  FFTW is a free library to
+compute discrete Fourier transforms in one or more dimensions, for
+arbitrary sizes, and of both real and complex data, among other
+related operations.  More information on FFTW can be found at the FFTW
+home page:
+.I http://www.fftw.org
+
+Programs using FFTW can be written to load wisdom from an arbitrary file,
+string, or other source.  Moreover, it is likely that many FFTW-using
+programs will load the \fBsystem wisdom\fR file, which is stored in
+.I /etc/fftw/wisdom@PREC_SUFFIX@
+by default.
+.I fftw@PREC_SUFFIX@-wisdom
+can be used to create or add to such wisdom files.  In its most
+typical usage, the wisdom file can be created to pre-plan a canonical
+set of sizes (see below) via:
+
+.ce
+fftw@PREC_SUFFIX@-wisdom -v -c -o wisdom@PREC_SUFFIX@
+
+(this will take many hours, which can be limited by the 
+.B -t
+option) and the output
+.I wisdom@PREC_SUFFIX@
+file can then be copied (as root) to
+.I /etc/fftw/
+or whatever.
+
+The
+.I fftw@PREC_SUFFIX@-wisdom
+program normally writes the wisdom directly to standard output, but this
+can be changed via the
+.B -o
+option, as in the example above.
+
+If the system wisdom file
+.I /etc/fftw/wisdom@PREC_SUFFIX@
+already exists, then
+.I fftw@PREC_SUFFIX@-wisdom
+reads this existing wisdom (unless the
+.B -n
+option is specified) and outputs both the old wisdom and any
+newly created wisdom.  In this way, it can be used to add new transform
+sizes to the existing system wisdom (or other wisdom file, with the 
+.B -w
+option).
+.SH SPECIFYING SIZES
+Although a canonical set of sizes to optimize is specified by the 
+.B -c
+option, the user can also specify zero or more non-canonical transform
+sizes and types to optimize, via the 
+.I SIZE
+arguments following the option flags.  Alternatively, the sizes to
+optimize can be read from standard input (whitespace-separated), if a
+.I SIZE
+argument of "-" is supplied.
+
+Sizes are specified by the syntax:
+
+.ce
+<\fItype\fR><\fIinplace\fR><\fIdirection\fR><\fIgeometry\fR>
+
+<\fItype\fR> is either \'c\' (complex), \'r\' (real, r2c/c2r), or
+\'k\' (r2r, per-dimension kinds, specified in the geometry, below).
+
+<\fIinplace\fR> is either \'i\' (in place) or \'o\' (out of place).
+
+<\fIdirection\fR> is either \'f\' (forward) or \'b\' (backward).  The
+<\fIdirection\fR> should be omitted for \'k\' transforms, where it is
+specified via the geometry instead.
+
+<\fIgeometry\fR> is the size and dimensionality of the transform,
+where different dimensions are separated by \'x\' (e.g. \'16x32\' for
+a two-dimensional 16 by 32 transform).  In the case of \'k\'
+transforms, the size of each dimension is followed by a "type" string,
+which can be one of f/b/h/e00/e01/e10/e11/o00/o01/o10/o11 for
+R2HC/HC2R/DHT/REDFT00/.../RODFT11, respectively, as defined in the
+FFTW manual.
+
+For example, \'cif12x13x14\' is a three-dimensional 12 by 13 x 14
+complex DFT operating in-place.  \'rob65536\' is a one-dimensional
+size-65536 out-of-place complex-to-real (backwards) transform
+operating on Hermitian-symmetry input.  \'ki10hx20e01\' is a
+two-dimensional 10 by 20 r2r transform where the first dimension is a
+DHT and the second dimension is an REDFT01 (DCT-III).
+
+.SH OPTIONS
+.TP
+\fB\-h\fR, \fB\--help\fR
+Display help on the command-line options and usage.
+.TP
+\fB\-V\fR, \fB\--version\fR
+Print the version number and copyright information.
+.TP
+\fB\-v\fR, \fB\--verbose\fR
+Verbose output.  (You can specify this multiple times, or supply a numeric
+argument greater than 1, to increase the verbosity level.)  Note that the
+verbose output will be mixed with the wisdom output (making it impossible
+to import), unless you write the wisdom to a file via the 
+.B -o
+option.
+.TP
+\fB\-c\fR, \fB\--canonical\fR
+Optimize/pre-plan a canonical set of sizes: all powers of two and ten
+up to 2^20 (1048576), including both real and complex, forward and
+backwards, in-place and out-of-place transforms.  Also includes two-
+and three-dimensional transforms of equal-size dimensions
+(e.g. 16x16x16).
+.TP
+\fB\-t\fR \fIhours\fR, \fB\--time-limit\fR=\fIhours\fR
+Stop after a time of
+.I hours
+(hours) has elapsed, outputting accumulated wisdom.  (The problems are planned
+in increasing order of size.)  Defaults to 0, indicating no time limit.
+.TP
+\fB\-o\fR \fIfile\fR, \fB\--output-file\fR=\fIfile\fR
+Send wisdom output to
+.I file
+rather than to standard output (the default).
+.TP
+\fB\-m\fR, \fB\--measure\fR; \fB\-e\fR, \fB\--estimate\fR; \fB\-x\fR, \fB\--exhaustive\fR
+Normally, 
+.I fftw@PREC_SUFFIX@-wisdom
+creates plans in FFTW_PATIENT mode, but with these options you can instead
+use FFTW_MEASURE, FFTW_ESTIMATE, or FFTW_EXHAUSTIVE modes, respectively,
+as described in more detail by the FFTW manual.
+
+Note that wisdom is tagged with the planning patience level, and a
+single file can mix different levels of wisdom (e.g. you can mostly
+use the patient default, but plan a few sizes that you especially care
+about in
+.B --exhaustive
+mode).
+.TP
+\fB\-n\fR, \fB\--no-system-wisdom\fR
+Do not import the system wisdom from
+.I /etc/fftw/wisdom@PREC_SUFFIX@
+(which is normally read by default).
+.TP
+\fB\-w\fR \fIfile\fR, \fB\--wisdom-file\fR=\fIfile\fR
+Import wisdom from
+.I file
+(in addition to the system wisdom, unless 
+.B -n
+is specified).  Multiple wisdom files can be read via multiple
+.B -w
+options.  If
+.I file
+is "-", then read wisdom from standard input.
+.SH BUGS
+Send bug reports to fftw@fftw.org.
+.SH AUTHORS
+Written by Steven G. Johnson and Matteo Frigo.
+
+Copyright (c) 2003, 2007-11 Matteo Frigo
+.br
+Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.SH "SEE ALSO"
+fftw-wisdom-to-conf(1)
diff -r d278df1123f9 -r 89f5e221ed7b src/fftw-3.3.3/tools/fftwf-wisdom.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fftw-3.3.3/tools/fftwf-wisdom.1	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,190 @@
+.\" 
+.\" Copyright (c) 2003, 2007-11 Matteo Frigo
+.\" Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.\" 
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" 
+.\" This program is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\" 
+.\" You should have received a copy of the GNU General Public License
+.\" along with this program; if not, write to the Free Software
+.\" Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+.\"
+.TH FFTW-WISDOM 1 "February, 2003" "fftw" "fftw"
+.SH NAME
+fftwf-wisdom \- create wisdom (pre-optimized FFTs)
+.SH SYNOPSIS
+.B fftwf-wisdom
+[\fIOPTION\fR]... [\fISIZE\fR]...
+.SH DESCRIPTION
+.PP
+." Add any additional description here
+.I fftwf-wisdom
+is a utility to generate FFTW
+.B wisdom
+files, which contain saved information about how to optimally compute
+(Fourier) transforms of various sizes.  FFTW is a free library to
+compute discrete Fourier transforms in one or more dimensions, for
+arbitrary sizes, and of both real and complex data, among other
+related operations.  More information on FFTW can be found at the FFTW
+home page:
+.I http://www.fftw.org
+
+Programs using FFTW can be written to load wisdom from an arbitrary file,
+string, or other source.  Moreover, it is likely that many FFTW-using
+programs will load the \fBsystem wisdom\fR file, which is stored in
+.I /etc/fftw/wisdomf
+by default.
+.I fftwf-wisdom
+can be used to create or add to such wisdom files.  In its most
+typical usage, the wisdom file can be created to pre-plan a canonical
+set of sizes (see below) via:
+
+.ce
+fftwf-wisdom -v -c -o wisdomf
+
+(this will take many hours, which can be limited by the 
+.B -t
+option) and the output
+.I wisdomf
+file can then be copied (as root) to
+.I /etc/fftw/
+or whatever.
+
+The
+.I fftwf-wisdom
+program normally writes the wisdom directly to standard output, but this
+can be changed via the
+.B -o
+option, as in the example above.
+
+If the system wisdom file
+.I /etc/fftw/wisdomf
+already exists, then
+.I fftwf-wisdom
+reads this existing wisdom (unless the
+.B -n
+option is specified) and outputs both the old wisdom and any
+newly created wisdom.  In this way, it can be used to add new transform
+sizes to the existing system wisdom (or other wisdom file, with the 
+.B -w
+option).
+.SH SPECIFYING SIZES
+Although a canonical set of sizes to optimize is specified by the 
+.B -c
+option, the user can also specify zero or more non-canonical transform
+sizes and types to optimize, via the 
+.I SIZE
+arguments following the option flags.  Alternatively, the sizes to
+optimize can be read from standard input (whitespace-separated), if a
+.I SIZE
+argument of "-" is supplied.
+
+Sizes are specified by the syntax:
+
+.ce
+<\fItype\fR><\fIinplace\fR><\fIdirection\fR><\fIgeometry\fR>
+
+<\fItype\fR> is either \'c\' (complex), \'r\' (real, r2c/c2r), or
+\'k\' (r2r, per-dimension kinds, specified in the geometry, below).
+
+<\fIinplace\fR> is either \'i\' (in place) or \'o\' (out of place).
+
+<\fIdirection\fR> is either \'f\' (forward) or \'b\' (backward).  The
+<\fIdirection\fR> should be omitted for \'k\' transforms, where it is
+specified via the geometry instead.
+
+<\fIgeometry\fR> is the size and dimensionality of the transform,
+where different dimensions are separated by \'x\' (e.g. \'16x32\' for
+a two-dimensional 16 by 32 transform).  In the case of \'k\'
+transforms, the size of each dimension is followed by a "type" string,
+which can be one of f/b/h/e00/e01/e10/e11/o00/o01/o10/o11 for
+R2HC/HC2R/DHT/REDFT00/.../RODFT11, respectively, as defined in the
+FFTW manual.
+
+For example, \'cif12x13x14\' is a three-dimensional 12 by 13 x 14
+complex DFT operating in-place.  \'rob65536\' is a one-dimensional
+size-65536 out-of-place complex-to-real (backwards) transform
+operating on Hermitian-symmetry input.  \'ki10hx20e01\' is a
+two-dimensional 10 by 20 r2r transform where the first dimension is a
+DHT and the second dimension is an REDFT01 (DCT-III).
+
+.SH OPTIONS
+.TP
+\fB\-h\fR, \fB\--help\fR
+Display help on the command-line options and usage.
+.TP
+\fB\-V\fR, \fB\--version\fR
+Print the version number and copyright information.
+.TP
+\fB\-v\fR, \fB\--verbose\fR
+Verbose output.  (You can specify this multiple times, or supply a numeric
+argument greater than 1, to increase the verbosity level.)  Note that the
+verbose output will be mixed with the wisdom output (making it impossible
+to import), unless you write the wisdom to a file via the 
+.B -o
+option.
+.TP
+\fB\-c\fR, \fB\--canonical\fR
+Optimize/pre-plan a canonical set of sizes: all powers of two and ten
+up to 2^20 (1048576), including both real and complex, forward and
+backwards, in-place and out-of-place transforms.  Also includes two-
+and three-dimensional transforms of equal-size dimensions
+(e.g. 16x16x16).
+.TP
+\fB\-t\fR \fIhours\fR, \fB\--time-limit\fR=\fIhours\fR
+Stop after a time of
+.I hours
+(hours) has elapsed, outputting accumulated wisdom.  (The problems are planned
+in increasing order of size.)  Defaults to 0, indicating no time limit.
+.TP
+\fB\-o\fR \fIfile\fR, \fB\--output-file\fR=\fIfile\fR
+Send wisdom output to
+.I file
+rather than to standard output (the default).
+.TP
+\fB\-m\fR, \fB\--measure\fR; \fB\-e\fR, \fB\--estimate\fR; \fB\-x\fR, \fB\--exhaustive\fR
+Normally, 
+.I fftwf-wisdom
+creates plans in FFTW_PATIENT mode, but with these options you can instead
+use FFTW_MEASURE, FFTW_ESTIMATE, or FFTW_EXHAUSTIVE modes, respectively,
+as described in more detail by the FFTW manual.
+
+Note that wisdom is tagged with the planning patience level, and a
+single file can mix different levels of wisdom (e.g. you can mostly
+use the patient default, but plan a few sizes that you especially care
+about in
+.B --exhaustive
+mode).
+.TP
+\fB\-n\fR, \fB\--no-system-wisdom\fR
+Do not import the system wisdom from
+.I /etc/fftw/wisdomf
+(which is normally read by default).
+.TP
+\fB\-w\fR \fIfile\fR, \fB\--wisdom-file\fR=\fIfile\fR
+Import wisdom from
+.I file
+(in addition to the system wisdom, unless 
+.B -n
+is specified).  Multiple wisdom files can be read via multiple
+.B -w
+options.  If
+.I file
+is "-", then read wisdom from standard input.
+.SH BUGS
+Send bug reports to fftw@fftw.org.
+.SH AUTHORS
+Written by Steven G. Johnson and Matteo Frigo.
+
+Copyright (c) 2003, 2007-11 Matteo Frigo
+.br
+Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+.SH "SEE ALSO"
+fftw-wisdom-to-conf(1)
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/.hg_archival.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/.hg_archival.txt	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+repo: 9996a45cda4690b1a31ee4205f252e06f94b1ea3
+node: d4911a276d96f6232a68c6b8448056d3946043b9
+branch: default
+tag: v1.8.1
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/.hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/.hgignore	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,13 @@
+syntax: glob
+Makefile
+autom4te*
+bin/*
+lib/*
+*.a
+*.so
+*.o
+*.orig
+*.log
+*.bak
+config.status
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/.hgtags
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/.hgtags	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,9 @@
+96e35b8aaa60f222373f00461789d4f168b968c5 v1.5
+4f58b07497637bfd09a8980e8c594be0975ac55e v1.6
+85022d42d8dc37c4c18debccbd349aea46e9e2fd v1.7
+85022d42d8dc37c4c18debccbd349aea46e9e2fd v1.7
+61fe85a73bc32955feeb6fe17e668d9f522adde4 v1.7
+61fe85a73bc32955feeb6fe17e668d9f522adde4 v1.7
+efbc861f9b9460068c48a250232d343ffa7d5726 v1.7
+551952b2e7a6bf5b1196778638c8cdc3c40b108f v1.8
+77466ee7ffb5b07efda9b1dbed858379c987a9da v1.8.1
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/Android.mk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/Android.mk	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,86 @@
+
+LOCAL_MODULE := rubberband
+LOCAL_MODULE_FILENAME := librubberband
+
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/rubberband $(LOCAL_PATH)/rubberband/src
+
+RUBBERBAND_PATH := rubberband
+RUBBERBAND_SRC_PATH := $(RUBBERBAND_PATH)/src
+
+RUBBERBAND_JNI_FILES := \
+	$(RUBBERBAND_SRC_PATH)/jni/RubberBandStretcherJNI.cpp
+
+RUBBERBAND_SRC_FILES := \
+        $(RUBBERBAND_SRC_PATH)/base/Profiler.cpp \
+        $(RUBBERBAND_SRC_PATH)/system/Thread.cpp \
+        $(RUBBERBAND_SRC_PATH)/system/Allocators.cpp \
+        $(RUBBERBAND_SRC_PATH)/system/sysutils.cpp \
+        $(RUBBERBAND_SRC_PATH)/system/VectorOpsComplex.cpp \
+        $(RUBBERBAND_SRC_PATH)/StretcherChannelData.cpp \
+        $(RUBBERBAND_SRC_PATH)/dsp/AudioCurveCalculator.cpp \
+        $(RUBBERBAND_SRC_PATH)/dsp/FFT.cpp \
+        $(RUBBERBAND_SRC_PATH)/dsp/Resampler.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/SilentAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/CompoundAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/HighFrequencyAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/SpectralDifferenceAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/ConstantAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/audiocurves/PercussiveAudioCurve.cpp \
+        $(RUBBERBAND_SRC_PATH)/StretcherImpl.cpp \
+        $(RUBBERBAND_SRC_PATH)/StretcherProcess.cpp \
+        $(RUBBERBAND_SRC_PATH)/StretchCalculator.cpp \
+        $(RUBBERBAND_SRC_PATH)/RubberBandStretcher.cpp \
+        $(RUBBERBAND_SRC_PATH)/speex/resample.c
+
+LOCAL_SRC_FILES += \
+	$(RUBBERBAND_JNI_FILES) \
+        $(RUBBERBAND_SRC_FILES)
+
+LOCAL_SRC_FILES += \
+	$(RUBBERBAND_SRC_PATH)/kissfft/kiss_fft.c \
+	$(RUBBERBAND_SRC_PATH)/kissfft/kiss_fftr.c
+
+LOCAL_CFLAGS_DEBUG := \
+	-g \
+	-mfloat-abi=softfp \
+	-DWANT_TIMING \
+	-DFFT_MEASUREMENT
+
+LOCAL_CFLAGS_RELEASE := \
+	-O3 \
+	-mfpu=neon \
+	-mfloat-abi=softfp \
+	-ffast-math \
+	-ftree-vectorize \
+	-ftree-vect-loop-version \
+	-freciprocal-math \
+	-fsingle-precision-constant \
+	-D__ARM_ARCH_7__ \
+	-DUSE_POMMIER_MATHFUN \
+	-DNO_TIMING \
+	-DNO_TIMING_COMPLETE_NOOP
+
+LOCAL_CFLAGS := \
+	-Wall \
+	-I$(RUBBERBAND_PATH) \
+	-I$(RUBBERBAND_SRC_PATH) \
+	-DUSE_SPEEX \
+	-DUSE_KISSFFT \
+	-DPROCESS_SAMPLE_TYPE=float \
+	-DLACK_POSIX_MEMALIGN \
+	-DUSE_OWN_ALIGNED_MALLOC \
+	-DLACK_BAD_ALLOC \
+	-DLACK_SINCOS \
+	-DNO_EXCEPTIONS \
+	-DNO_THREADING \
+	-DNO_THREAD_CHECKS \
+	$(LOCAL_CFLAGS_RELEASE)
+
+LOCAL_LDLIBS += -llog
+
+TARGET_ARCH_ABI	:= armeabi-v7a
+LOCAL_ARM_MODE := arm
+LOCAL_ARM_NEON := true
+
+include $(BUILD_SHARED_LIBRARY)
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/CHANGELOG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/CHANGELOG	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,89 @@
+
+Changes in Rubber Band v1.8.1
+
+ * Fix a crash in formant-preserving pitch shift for some build targets
+
+The API is unchanged and the library is binary compatible with
+version 1.7.
+
+
+Changes in Rubber Band v1.8
+
+ * Add build support for Win32/MSVC, Android, and various libraries
+ * Add Java JNI interface
+
+The API is unchanged and the library is binary compatible with
+version 1.7.
+
+
+Changes in Rubber Band v1.7
+
+ * Add the centre-focus option as an alternative processing mode for
+   stereo (using mid-side arrangement)
+ * Several bug fixes
+
+The library is binary compatible with version 1.6 for forward
+compatibility (values have been added to an existing enum).  Code
+written to use 1.7 is not necessarily compatible with 1.6.
+
+
+Changes in Rubber Band v1.6
+
+ * Add Smoothing option.  This uses a window-presum FFT, introducing
+   time-domain aliasing which is then smoothed using a sinc window.
+   This can be used in combination with any of the existing processing
+   control options.  This will soften transients but the result may
+   still be more pleasant for some material that is not very amenable
+   to being time stretched.
+ * Fix silent channel of output when processing with band-limited
+   transients option
+ * Include libresample support
+
+The library is binary compatible with version 1.5 for forward
+compatibility (values have been added to an existing enum).  Code
+written to use 1.6 is not necessarily compatible with 1.5.
+
+
+Changes in Rubber Band v1.5
+
+ * Add a more reliable transient detection mode, and make the mode
+   selectable using OptionDetectorXXX flags -- the new method is
+   the default
+ * Band-limit transient detectors to avoid being distracted by
+   inaudible garbage
+ * Add a key-frame mapping facility for variable stretch ratio
+   management during offline stretches
+
+The library is binary compatible with version 1.4 for forward
+compatibility (a function has been added and an enum changed, but no
+existing entry points have changed).  Code written to use 1.5 is not
+necessarily compatible with 1.4.
+
+
+Changes in Rubber Band v1.4 
+
+ * Fix a hang when faced with some very peculiar stretch factors
+ * Fix some incorrect threading condition usage
+ * Some code rearrangement
+ * Fix build on Solaris
+
+The library is binary compatible with version 1.3.
+
+
+Changes in Rubber Band v1.3
+
+ * Fix a bug that may cause incorrect output during the first process
+   block of some audio files, when processing in offline mode
+ * Fix a small number of build issues and more minor bugs
+
+The library is binary compatible with version 1.2.
+
+
+Changes in Rubber Band v1.2
+
+ * Add an initial "formant preservation" option when pitch shifting
+ * Real-time pitch shifting now uses a faster method by default, with
+   less variation in CPU usage
+ * The code is more amenable to compiler auto-vectorization (through
+   e.g. gcc --ftree-vectorize).
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/COPYING
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/COPYING	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,561 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                          675 Mass Ave, Cambridge, MA 02139, USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/Doxyfile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/Doxyfile	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1363 @@
+# Doxyfile 1.5.5
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = "Rubber Band Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = 1.7
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek, 
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages), 
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish, 
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, 
+# and Ukrainian.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = 
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before files name in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = 
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful is your file systems 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen 
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member 
+# documentation.
+
+DETAILS_AT_TOP         = YES
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = 
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
+# func(std::string) {}). This also make the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or define consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and defines in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories 
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be abled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = docs/doxygen/warning.log
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = rubberband 
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.h \
+                         *.C \
+                         *.cpp \
+                         *.cc \
+                         doc-overview
+
+# The RECURSIVE tag can be used to turn specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should 
+# excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = 
+
+# The EXCLUDE_SYMLINKS tag can be used select whether or not files or 
+# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = qrc_*.cpp \
+                         moc_*.cpp \
+                         *.moc.cpp \
+                         *_skel.cpp
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = 
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = 
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain image that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# ignored.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# is applied to all files.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default) 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default) 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.  Otherwise they will link to the documentstion.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 3
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If the tag is left blank doxygen 
+# will generate a default style sheet. Note that doxygen will try to copy 
+# the style sheet file to the HTML output directory, so don't put your own 
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
+# files or namespaces will be aligned in HTML using tables. If set to 
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded. For this to work a browser that supports 
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox 
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
+# top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20]) 
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, 
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are 
+# probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = YES
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX 
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references 
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = NO
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = NO
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files 
+# in the INCLUDE_PATH (see below) will be search if a #include is found.
+
+SEARCH_INCLUDES        = NO
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all function-like macros that are alone 
+# on a line, have an all uppercase name, and do not end with a semicolon. Such 
+# function macros are typically used for boiler-plate code, and will confuse 
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. 
+# Optionally an initial location of the external documentation 
+# can be added for each tagfile. The format of a tag file without 
+# this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths or 
+# URLs. If a location is present for each tag, the installdox tool 
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen 
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option is superseded by the HAVE_DOT option below. This is only a 
+# fallback. It is recommended to install and use dot, since it yields more 
+# powerful graphs.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the 
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = NO
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The MAX_DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that doxygen if the 
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lay further from the root node will be omitted. Note that setting this 
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is enabled by default, which results in a transparent 
+# background. Warning: Depending on the platform used, enabling this option 
+# may lead to badly anti-aliased labels on the edges of a graph (i.e. they 
+# become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output 
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be 
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE           = NO
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/Makefile.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/Makefile.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,262 @@
+
+CXX		:= @CXX@
+CXXFLAGS	:= -DHAVE_LIBSAMPLERATE -DHAVE_FFTW3 -DFFTW_DOUBLE_ONLY -DNO_THREAD_CHECKS -DUSE_PTHREADS -DNO_TIMING -DNDEBUG @CXXFLAGS@ @SRC_CFLAGS@ @SNDFILE_CFLAGS@ @FFTW_CFLAGS@ @Vamp_CFLAGS@ -Irubberband -I. -Isrc $(OPTFLAGS)
+CFLAGS		:= @CFLAGS@ $(OPTFLAGS)
+LDFLAGS		:= @LDFLAGS@ -lpthread $(LDFLAGS)
+
+LIBRARY_LIBS		:= @SRC_LIBS@ @FFTW_LIBS@ 
+PROGRAM_LIBS		:= @SNDFILE_LIBS@ $(LIBRARY_LIBS)
+VAMP_PLUGIN_LIBS	:= @Vamp_LIBS@ $(LIBRARY_LIBS)
+LADSPA_PLUGIN_LIBS	:= $(LIBRARY_LIBS)
+
+MKDIR			:= mkdir
+AR			:= ar
+
+DYNAMIC_EXTENSION	:= .so
+DYNAMIC_FULL_VERSION	:= .2.1.0
+DYNAMIC_ABI_VERSION	:= .2
+DYNAMIC_LIBNAME		:= librubberband$(DYNAMIC_EXTENSION)
+DYNAMIC_LDFLAGS		:= -shared -Wl,-Bsymbolic -Wl,-soname=$(DYNAMIC_LIBNAME)$(DYNAMIC_ABI_VERSION)
+VAMP_LDFLAGS		:= -shared -Wl,-Bsymbolic -Wl,--version-script=vamp/vamp-plugin.map
+LADSPA_LDFLAGS		:= -shared -Wl,-Bsymbolic -Wl,--version-script=ladspa/ladspa-plugin.map
+
+PROGRAM_TARGET 		:= bin/rubberband
+STATIC_TARGET  		:= lib/librubberband.a
+DYNAMIC_TARGET 		:= lib/$(DYNAMIC_LIBNAME)
+VAMP_TARGET    		:= lib/vamp-rubberband$(DYNAMIC_EXTENSION)
+LADSPA_TARGET  		:= lib/ladspa-rubberband$(DYNAMIC_EXTENSION)
+
+INSTALL_BINDIR		:= @prefix@/bin
+INSTALL_INCDIR		:= @prefix@/include/rubberband
+INSTALL_LIBDIR		:= @prefix@/lib
+INSTALL_VAMPDIR		:= @prefix@/lib/vamp
+INSTALL_LADSPADIR	:= @prefix@/lib/ladspa
+INSTALL_LRDFDIR		:= @prefix@/share/ladspa/rdf
+INSTALL_PKGDIR		:= @prefix@/lib/pkgconfig
+
+all:	bin lib $(PROGRAM_TARGET) $(STATIC_TARGET) $(DYNAMIC_TARGET) $(VAMP_TARGET) $(LADSPA_TARGET)
+
+static:		$(STATIC_TARGET)
+dynamic:	$(DYNAMIC_TARGET)
+library:	$(STATIC_TARGET) $(DYNAMIC_TARGET)
+program:	$(PROGRAM_TARGET)
+vamp:		$(VAMP_TARGET)
+ladspa:		$(LADSPA_TARGET)
+
+PUBLIC_INCLUDES := \
+	rubberband/rubberband-c.h \
+	rubberband/RubberBandStretcher.h
+
+LIBRARY_INCLUDES := \
+	src/StretcherChannelData.h \
+	src/float_cast/float_cast.h \
+	src/StretcherImpl.h \
+	src/StretchCalculator.h \
+	src/base/Profiler.h \
+	src/base/RingBuffer.h \
+	src/base/Scavenger.h \
+	src/dsp/AudioCurveCalculator.h \
+	src/audiocurves/CompoundAudioCurve.h \
+	src/audiocurves/ConstantAudioCurve.h \
+	src/audiocurves/HighFrequencyAudioCurve.h \
+	src/audiocurves/PercussiveAudioCurve.h \
+	src/audiocurves/SilentAudioCurve.h \
+	src/audiocurves/SpectralDifferenceAudioCurve.h \
+	src/dsp/Resampler.h \
+	src/dsp/FFT.h \
+	src/dsp/MovingMedian.h \
+	src/dsp/SincWindow.h \
+	src/dsp/Window.h \
+	src/system/Allocators.h \
+	src/system/Thread.h \
+	src/system/VectorOps.h \
+	src/system/sysutils.h
+
+LIBRARY_SOURCES := \
+	src/rubberband-c.cpp \
+	src/RubberBandStretcher.cpp \
+	src/StretcherProcess.cpp \
+	src/StretchCalculator.cpp \
+	src/base/Profiler.cpp \
+	src/dsp/AudioCurveCalculator.cpp \
+	src/audiocurves/CompoundAudioCurve.cpp \
+	src/audiocurves/SpectralDifferenceAudioCurve.cpp \
+	src/audiocurves/HighFrequencyAudioCurve.cpp \
+	src/audiocurves/SilentAudioCurve.cpp \
+	src/audiocurves/ConstantAudioCurve.cpp \
+	src/audiocurves/PercussiveAudioCurve.cpp \
+	src/dsp/Resampler.cpp \
+	src/dsp/FFT.cpp \
+	src/system/Allocators.cpp \
+	src/system/sysutils.cpp \
+	src/system/Thread.cpp \
+	src/StretcherChannelData.cpp \
+	src/StretcherImpl.cpp
+
+PROGRAM_SOURCES := \
+	main/main.cpp
+
+VAMP_HEADERS := \
+	vamp/RubberBandVampPlugin.h
+
+VAMP_SOURCES := \
+	vamp/RubberBandVampPlugin.cpp \
+	vamp/libmain.cpp
+
+LADSPA_HEADERS := \
+	ladspa/RubberBandPitchShifter.h
+
+LADSPA_SOURCES := \
+	ladspa/RubberBandPitchShifter.cpp \
+	ladspa/libmain.cpp
+
+LIBRARY_OBJECTS := $(LIBRARY_SOURCES:.cpp=.o)
+LIBRARY_OBJECTS := $(LIBRARY_OBJECTS:.c=.o)
+
+PROGRAM_OBJECTS := $(PROGRAM_SOURCES:.cpp=.o)
+VAMP_OBJECTS    := $(VAMP_SOURCES:.cpp=.o)
+LADSPA_OBJECTS  := $(LADSPA_SOURCES:.cpp=.o)
+
+$(PROGRAM_TARGET):	$(LIBRARY_OBJECTS) $(PROGRAM_OBJECTS)
+	$(CXX) -o $@ $^ $(PROGRAM_LIBS) $(LDFLAGS)
+
+$(STATIC_TARGET):	$(LIBRARY_OBJECTS)
+	$(AR) rsc $@ $^
+
+$(DYNAMIC_TARGET):	$(LIBRARY_OBJECTS)
+	$(CXX) $(DYNAMIC_LDFLAGS) $^ -o $@ $(LIBRARY_LIBS) $(LDFLAGS)
+
+$(VAMP_TARGET):		$(LIBRARY_OBJECTS) $(VAMP_OBJECTS)
+	$(CXX) $(VAMP_LDFLAGS) -o $@ $^ $(VAMP_PLUGIN_LIBS) $(LDFLAGS)
+
+$(LADSPA_TARGET):	$(LIBRARY_OBJECTS) $(LADSPA_OBJECTS)
+	$(CXX) $(LADSPA_LDFLAGS) -o $@ $^ $(LADSPA_PLUGIN_LIBS) $(LDFLAGS)
+
+bin:
+	$(MKDIR) $@
+lib:
+	$(MKDIR) $@
+
+install:	all
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_BINDIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_INCDIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_LIBDIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_VAMPDIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_LADSPADIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_LRDFDIR)
+	$(MKDIR) -p $(DESTDIR)$(INSTALL_PKGDIR)
+	cp $(PROGRAM_TARGET) $(DESTDIR)$(INSTALL_BINDIR)
+	cp $(PUBLIC_INCLUDES) $(DESTDIR)$(INSTALL_INCDIR)
+	cp $(STATIC_TARGET) $(DESTDIR)$(INSTALL_LIBDIR)
+	rm -f $(DESTDIR)$(INSTALL_LIBDIR)/$(DYNAMIC_LIBNAME)$(DYNAMIC_ABI_VERSION)
+	rm -f $(DESTDIR)$(INSTALL_LIBDIR)/$(DYNAMIC_LIBNAME)
+	cp $(DYNAMIC_TARGET) $(DESTDIR)$(INSTALL_LIBDIR)/$(DYNAMIC_LIBNAME)$(DYNAMIC_FULL_VERSION)
+	test -n "$(DYNAMIC_FULL_VERSION)" && ln -s $(DYNAMIC_LIBNAME)$(DYNAMIC_FULL_VERSION) $(DESTDIR)$(INSTALL_LIBDIR)/$(DYNAMIC_LIBNAME)$(DYNAMIC_ABI_VERSION)
+	test -n "$(DYNAMIC_FULL_VERSION)" && ln -s $(DYNAMIC_LIBNAME)$(DYNAMIC_FULL_VERSION) $(DESTDIR)$(INSTALL_LIBDIR)/$(DYNAMIC_LIBNAME)
+	cp $(VAMP_TARGET) $(DESTDIR)$(INSTALL_VAMPDIR)
+	cp vamp/vamp-rubberband.cat $(DESTDIR)$(INSTALL_VAMPDIR)
+	cp $(LADSPA_TARGET) $(DESTDIR)$(INSTALL_LADSPADIR)
+	cp ladspa/ladspa-rubberband.cat $(DESTDIR)$(INSTALL_LADSPADIR)
+	cp ladspa/ladspa-rubberband.rdf $(DESTDIR)$(INSTALL_LRDFDIR)
+	sed "s,%PREFIX%,@prefix@," rubberband.pc.in \
+	  > $(DESTDIR)$(INSTALL_PKGDIR)/rubberband.pc
+
+clean:
+	rm -f $(LIBRARY_OBJECTS) $(PROGRAM_OBJECTS) $(LADSPA_OBJECTS) $(VAMP_OBJECTS)
+
+distclean:	clean
+	rm -f $(PROGRAM_TARGET) $(STATIC_TARGET) $(DYNAMIC_TARGET) $(VAMP_TARGET) $(LADSPA_TARGET)
+
+depend:
+	makedepend -Y $(LIBRARY_SOURCES) $(PROGRAM_SOURCES)
+
+
+# DO NOT DELETE
+
+src/rubberband-c.o: rubberband/rubberband-c.h
+src/rubberband-c.o: rubberband/RubberBandStretcher.h
+src/RubberBandStretcher.o: src/StretcherImpl.h
+src/RubberBandStretcher.o: rubberband/RubberBandStretcher.h src/dsp/Window.h
+src/RubberBandStretcher.o: src/dsp/SincWindow.h src/dsp/FFT.h
+src/RubberBandStretcher.o: src/audiocurves/CompoundAudioCurve.h
+src/RubberBandStretcher.o: src/dsp/AudioCurveCalculator.h
+src/RubberBandStretcher.o: src/audiocurves/PercussiveAudioCurve.h
+src/RubberBandStretcher.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/RubberBandStretcher.o: src/dsp/SampleFilter.h src/base/RingBuffer.h
+src/RubberBandStretcher.o: src/base/Scavenger.h src/system/Thread.h
+src/RubberBandStretcher.o: src/system/sysutils.h
+src/StretcherProcess.o: src/StretcherImpl.h rubberband/RubberBandStretcher.h
+src/StretcherProcess.o: src/dsp/Window.h src/dsp/SincWindow.h src/dsp/FFT.h
+src/StretcherProcess.o: src/audiocurves/CompoundAudioCurve.h
+src/StretcherProcess.o: src/dsp/AudioCurveCalculator.h
+src/StretcherProcess.o: src/audiocurves/PercussiveAudioCurve.h
+src/StretcherProcess.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherProcess.o: src/dsp/SampleFilter.h src/base/RingBuffer.h
+src/StretcherProcess.o: src/base/Scavenger.h src/system/Thread.h
+src/StretcherProcess.o: src/system/sysutils.h src/audiocurves/PercussiveAudioCurve.h
+src/StretcherProcess.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherProcess.o: src/audiocurves/ConstantAudioCurve.h src/StretchCalculator.h
+src/StretcherProcess.o: src/StretcherChannelData.h src/dsp/Resampler.h
+src/StretcherProcess.o: src/base/Profiler.h src/system/VectorOps.h
+src/StretcherProcess.o: src/system/sysutils.h
+src/StretchCalculator.o: src/StretchCalculator.h src/system/sysutils.h
+src/base/Profiler.o: src/base/Profiler.h src/system/sysutils.h
+src/dsp/AudioCurveCalculator.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/CompoundAudioCurve.o: src/audiocurves/CompoundAudioCurve.h
+src/audiocurves/CompoundAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/CompoundAudioCurve.o: src/audiocurves/PercussiveAudioCurve.h
+src/audiocurves/CompoundAudioCurve.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/audiocurves/CompoundAudioCurve.o: src/dsp/SampleFilter.h src/dsp/MovingMedian.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/audiocurves/SpectralDifferenceAudioCurve.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/dsp/Window.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/sysutils.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/VectorOps.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/sysutils.h
+src/audiocurves/HighFrequencyAudioCurve.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/audiocurves/HighFrequencyAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/SilentAudioCurve.o: src/audiocurves/SilentAudioCurve.h
+src/audiocurves/SilentAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/ConstantAudioCurve.o: src/audiocurves/ConstantAudioCurve.h
+src/audiocurves/ConstantAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/PercussiveAudioCurve.o: src/audiocurves/PercussiveAudioCurve.h
+src/audiocurves/PercussiveAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/PercussiveAudioCurve.o: src/system/VectorOps.h src/system/sysutils.h
+src/dsp/Resampler.o: src/dsp/Resampler.h src/system/sysutils.h
+src/dsp/Resampler.o: src/base/Profiler.h
+src/dsp/FFT.o: src/dsp/FFT.h src/system/sysutils.h src/system/Thread.h
+src/dsp/FFT.o: src/base/Profiler.h src/system/VectorOps.h
+src/dsp/FFT.o: src/system/sysutils.h
+src/system/Allocators.o: src/system/Allocators.h src/system/VectorOps.h
+src/system/Allocators.o: src/system/sysutils.h
+src/system/sysutils.o: src/system/sysutils.h
+src/system/Thread.o: src/system/Thread.h
+src/StretcherChannelData.o: src/StretcherChannelData.h src/StretcherImpl.h
+src/StretcherChannelData.o: rubberband/RubberBandStretcher.h src/dsp/Window.h
+src/StretcherChannelData.o: src/dsp/SincWindow.h src/dsp/FFT.h
+src/StretcherChannelData.o: src/audiocurves/CompoundAudioCurve.h
+src/StretcherChannelData.o: src/dsp/AudioCurveCalculator.h
+src/StretcherChannelData.o: src/audiocurves/PercussiveAudioCurve.h
+src/StretcherChannelData.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherChannelData.o: src/dsp/SampleFilter.h src/base/RingBuffer.h
+src/StretcherChannelData.o: src/base/Scavenger.h src/system/Thread.h
+src/StretcherChannelData.o: src/system/sysutils.h src/dsp/Resampler.h
+src/StretcherChannelData.o: src/system/Allocators.h src/system/VectorOps.h
+src/StretcherChannelData.o: src/system/sysutils.h
+src/StretcherImpl.o: src/StretcherImpl.h rubberband/RubberBandStretcher.h
+src/StretcherImpl.o: src/dsp/Window.h src/dsp/SincWindow.h src/dsp/FFT.h
+src/StretcherImpl.o: src/audiocurves/CompoundAudioCurve.h
+src/StretcherImpl.o: src/dsp/AudioCurveCalculator.h
+src/StretcherImpl.o: src/audiocurves/PercussiveAudioCurve.h
+src/StretcherImpl.o: src/audiocurves/HighFrequencyAudioCurve.h src/dsp/SampleFilter.h
+src/StretcherImpl.o: src/base/RingBuffer.h src/base/Scavenger.h
+src/StretcherImpl.o: src/system/Thread.h src/system/sysutils.h
+src/StretcherImpl.o: src/audiocurves/PercussiveAudioCurve.h
+src/StretcherImpl.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherImpl.o: src/audiocurves/SpectralDifferenceAudioCurve.h src/dsp/Window.h
+src/StretcherImpl.o: src/system/VectorOps.h src/system/sysutils.h
+src/StretcherImpl.o: src/audiocurves/SilentAudioCurve.h src/audiocurves/ConstantAudioCurve.h
+src/StretcherImpl.o: src/dsp/Resampler.h src/StretchCalculator.h
+src/StretcherImpl.o: src/StretcherChannelData.h src/base/Profiler.h
+main/main.o: rubberband/RubberBandStretcher.h src/system/sysutils.h
+main/main.o: src/base/Profiler.h
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/Makefile.osx
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/Makefile.osx	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,224 @@
+
+CXX		:= g++
+CC		:= gcc
+ARCHFLAGS	:= 
+OPTFLAGS	:= -DNDEBUG -ffast-math -mfpmath=sse -msse -msse2 -O3 -ftree-vectorize
+
+CXXFLAGS	:= $(ARCHFLAGS) $(OPTFLAGS) -I/usr/local/include -DUSE_PTHREADS -DMALLOC_IS_ALIGNED -DHAVE_VDSP -DUSE_SPEEX -DNO_THREAD_CHECKS -DNO_TIMING -Irubberband -I. -Isrc
+
+LIBRARY_LIBS		:= -framework Accelerate
+
+CFLAGS		:= $(ARCHFLAGS) $(OPTFLAGS)
+LDFLAGS		:= $(ARCHFLAGS) -lpthread $(LDFLAGS)
+
+PROGRAM_LIBS		:= -L/usr/local/lib -lsndfile $(LIBRARY_LIBS)
+VAMP_PLUGIN_LIBS	:= -L/usr/local/lib -lvamp-sdk $(LIBRARY_LIBS)
+LADSPA_PLUGIN_LIBS	:= $(LIBRARY_LIBS)
+
+MKDIR			:= mkdir
+AR			:= ar
+
+DYNAMIC_LDFLAGS		:= -dynamiclib
+DYNAMIC_EXTENSION	:= .dylib
+
+PROGRAM_TARGET 		:= bin/rubberband
+STATIC_TARGET  		:= lib/librubberband.a
+DYNAMIC_TARGET 		:= lib/librubberband$(DYNAMIC_EXTENSION)
+VAMP_TARGET    		:= lib/vamp-rubberband$(DYNAMIC_EXTENSION)
+LADSPA_TARGET  		:= lib/ladspa-rubberband$(DYNAMIC_EXTENSION)
+
+default:	bin lib $(STATIC_TARGET) $(DYNAMIC_TARGET) $(PROGRAM_TARGET)
+
+all:	bin lib $(STATIC_TARGET) $(DYNAMIC_TARGET) $(PROGRAM_TARGET) $(VAMP_TARGET) $(LADSPA_TARGET)
+
+static:		$(STATIC_TARGET)
+dynamic:	$(DYNAMIC_TARGET)
+library:	$(STATIC_TARGET) $(DYNAMIC_TARGET)
+program:	$(PROGRAM_TARGET)
+vamp:		$(VAMP_TARGET)
+ladspa:		$(LADSPA_TARGET)
+
+PUBLIC_INCLUDES := \
+	rubberband/rubberband-c.h \
+	rubberband/RubberBandStretcher.h
+
+LIBRARY_INCLUDES := \
+	src/StretcherChannelData.h \
+	src/float_cast/float_cast.h \
+	src/StretcherImpl.h \
+	src/StretchCalculator.h \
+	src/base/Profiler.h \
+	src/base/RingBuffer.h \
+	src/base/Scavenger.h \
+	src/dsp/AudioCurveCalculator.h \
+	src/audiocurves/CompoundAudioCurve.h \
+	src/audiocurves/ConstantAudioCurve.h \
+	src/audiocurves/HighFrequencyAudioCurve.h \
+	src/audiocurves/PercussiveAudioCurve.h \
+	src/audiocurves/SilentAudioCurve.h \
+	src/audiocurves/SpectralDifferenceAudioCurve.h \
+	src/dsp/Resampler.h \
+	src/dsp/FFT.h \
+	src/dsp/MovingMedian.h \
+	src/dsp/SincWindow.h \
+	src/dsp/Window.h \
+	src/system/Allocators.h \
+	src/system/Thread.h \
+	src/system/VectorOps.h \
+	src/system/VectorOpsComplex.h \
+	src/system/sysutils.h
+
+LIBRARY_SOURCES := \
+	src/rubberband-c.cpp \
+	src/RubberBandStretcher.cpp \
+	src/StretcherProcess.cpp \
+	src/StretchCalculator.cpp \
+	src/base/Profiler.cpp \
+	src/dsp/AudioCurveCalculator.cpp \
+	src/audiocurves/CompoundAudioCurve.cpp \
+	src/audiocurves/SpectralDifferenceAudioCurve.cpp \
+	src/audiocurves/HighFrequencyAudioCurve.cpp \
+	src/audiocurves/SilentAudioCurve.cpp \
+	src/audiocurves/ConstantAudioCurve.cpp \
+	src/audiocurves/PercussiveAudioCurve.cpp \
+	src/dsp/Resampler.cpp \
+	src/dsp/FFT.cpp \
+	src/system/Allocators.cpp \
+	src/system/sysutils.cpp \
+	src/system/Thread.cpp \
+	src/system/VectorOpsComplex.cpp \
+	src/StretcherChannelData.cpp \
+	src/StretcherImpl.cpp
+
+# For Speex resampler -- comment these lines out if not specifying USE_SPEEX
+LIBRARY_INCLUDES := $(LIBRARY_INCLUDES) \
+	src/speex/speex_resampler.h
+LIBRARY_SOURCES := $(LIBRARY_SOURCES) \
+	src/speex/resample.c
+
+PROGRAM_SOURCES := \
+	main/main.cpp
+
+VAMP_HEADERS := \
+	vamp/RubberBandVampPlugin.h
+
+VAMP_SOURCES := \
+	vamp/RubberBandVampPlugin.cpp \
+	vamp/libmain.cpp
+
+LADSPA_HEADERS := \
+	ladspa/RubberBandPitchShifter.h
+
+LADSPA_SOURCES := \
+	ladspa/RubberBandPitchShifter.cpp \
+	ladspa/libmain.cpp
+
+LIBRARY_OBJECTS := $(LIBRARY_SOURCES:.cpp=.o)
+LIBRARY_OBJECTS := $(LIBRARY_OBJECTS:.c=.o)
+
+PROGRAM_OBJECTS := $(PROGRAM_SOURCES:.cpp=.o)
+VAMP_OBJECTS    := $(VAMP_SOURCES:.cpp=.o)
+LADSPA_OBJECTS  := $(LADSPA_SOURCES:.cpp=.o)
+
+$(PROGRAM_TARGET):	$(LIBRARY_OBJECTS) $(PROGRAM_OBJECTS)
+	$(CXX) -o $@ $^ $(PROGRAM_LIBS) $(PROGRAM_LIBS) $(LDFLAGS)
+
+$(STATIC_TARGET):	$(LIBRARY_OBJECTS)
+	$(AR) rc $@ $^
+
+$(DYNAMIC_TARGET):	$(LIBRARY_OBJECTS)
+	$(CXX) $(DYNAMIC_LDFLAGS) $^ -o $@ $(LIBRARY_LIBS) $(LDFLAGS)
+
+$(VAMP_TARGET):		$(LIBRARY_OBJECTS) $(VAMP_OBJECTS)
+	$(CXX) $(VAMP_LDFLAGS) -o $@ $^ $(VAMP_PLUGIN_LIBS) $(LDFLAGS)
+
+$(LADSPA_TARGET):	$(LIBRARY_OBJECTS) $(LADSPA_OBJECTS)
+	$(CXX) $(LADSPA_LDFLAGS) -o $@ $^ $(LADSPA_PLUGIN_LIBS) $(LDFLAGS)
+
+bin:
+	$(MKDIR) $@
+lib:
+	$(MKDIR) $@
+
+clean:
+	rm -f $(LIBRARY_OBJECTS) $(PROGRAM_OBJECTS) $(LADSPA_OBJECTS) $(VAMP_OBJECTS)
+
+distclean:	clean
+	rm -f $(PROGRAM_TARGET) $(STATIC_TARGET) $(DYNAMIC_TARGET) $(VAMP_TARGET) $(LADSPA_TARGET)
+
+depend:
+	makedepend -Y $(LIBRARY_SOURCES) $(PROGRAM_SOURCES)
+
+
+# DO NOT DELETE
+
+src/rubberband-c.o: rubberband/rubberband-c.h
+src/rubberband-c.o: rubberband/RubberBandStretcher.h
+src/RubberBandStretcher.o: src/StretcherImpl.h
+src/RubberBandStretcher.o: rubberband/RubberBandStretcher.h src/dsp/Window.h
+src/RubberBandStretcher.o: src/dsp/FFT.h src/base/RingBuffer.h
+src/RubberBandStretcher.o: src/base/Scavenger.h src/system/Thread.h
+src/RubberBandStretcher.o: src/system/Thread.h src/system/sysutils.h
+src/StretcherProcess.o: src/StretcherImpl.h rubberband/RubberBandStretcher.h
+src/StretcherProcess.o: src/dsp/Window.h src/dsp/FFT.h src/base/RingBuffer.h
+src/StretcherProcess.o: src/base/Scavenger.h src/system/Thread.h
+src/StretcherProcess.o: src/system/Thread.h src/system/sysutils.h
+src/StretcherProcess.o: src/audiocurves/PercussiveAudioCurve.h
+src/StretcherProcess.o: src/dsp/AudioCurveCalculator.h
+src/StretcherProcess.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherProcess.o: src/audiocurves/ConstantAudioCurve.h src/StretchCalculator.h
+src/StretcherProcess.o: src/StretcherChannelData.h src/dsp/Resampler.h
+src/StretcherProcess.o: src/base/Profiler.h src/system/VectorOps.h
+src/StretcherProcess.o: src/system/sysutils.h
+src/StretchCalculator.o: src/StretchCalculator.h src/system/sysutils.h
+src/system/Thread.o: src/system/Thread.h
+src/base/Profiler.o: src/base/Profiler.h src/system/sysutils.h
+src/dsp/AudioCurveCalculator.o: src/dsp/AudioCurveCalculator.h
+src/dsp/AudioCurveCalculator.o: src/system/sysutils.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/audiocurves/SpectralDifferenceAudioCurve.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/sysutils.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/dsp/Window.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/VectorOps.h
+src/audiocurves/SpectralDifferenceAudioCurve.o: src/system/sysutils.h
+src/audiocurves/HighFrequencyAudioCurve.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/audiocurves/HighFrequencyAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/HighFrequencyAudioCurve.o: src/system/sysutils.h
+src/audiocurves/SilentAudioCurve.o: src/audiocurves/SilentAudioCurve.h
+src/audiocurves/SilentAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/SilentAudioCurve.o: src/system/sysutils.h
+src/audiocurves/ConstantAudioCurve.o: src/audiocurves/ConstantAudioCurve.h
+src/audiocurves/ConstantAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/ConstantAudioCurve.o: src/system/sysutils.h
+src/audiocurves/PercussiveAudioCurve.o: src/audiocurves/PercussiveAudioCurve.h
+src/audiocurves/PercussiveAudioCurve.o: src/dsp/AudioCurveCalculator.h
+src/audiocurves/PercussiveAudioCurve.o: src/system/sysutils.h src/system/VectorOps.h
+src/audiocurves/PercussiveAudioCurve.o: src/system/sysutils.h
+src/dsp/Resampler.o: src/dsp/Resampler.h src/system/sysutils.h
+src/dsp/Resampler.o: src/base/Profiler.h
+src/dsp/FFT.o: src/dsp/FFT.h src/system/sysutils.h src/system/Thread.h
+src/dsp/FFT.o: src/base/Profiler.h src/system/VectorOps.h
+src/dsp/FFT.o: src/system/sysutils.h
+src/system/Allocators.o: src/system/Allocators.h src/system/VectorOps.h
+src/system/Allocators.o: src/system/sysutils.h
+src/system/sysutils.o: src/system/sysutils.h
+src/StretcherChannelData.o: src/StretcherChannelData.h src/StretcherImpl.h
+src/StretcherChannelData.o: rubberband/RubberBandStretcher.h src/dsp/Window.h
+src/StretcherChannelData.o: src/dsp/FFT.h src/base/RingBuffer.h
+src/StretcherChannelData.o: src/base/Scavenger.h src/system/Thread.h
+src/StretcherChannelData.o: src/system/Thread.h src/system/sysutils.h
+src/StretcherChannelData.o: src/dsp/Resampler.h src/system/Allocators.h
+src/StretcherChannelData.o: src/system/VectorOps.h src/system/sysutils.h
+src/StretcherImpl.o: src/StretcherImpl.h rubberband/RubberBandStretcher.h
+src/StretcherImpl.o: src/dsp/Window.h src/dsp/FFT.h src/base/RingBuffer.h
+src/StretcherImpl.o: src/base/Scavenger.h src/system/Thread.h src/system/Thread.h
+src/StretcherImpl.o: src/system/sysutils.h src/audiocurves/PercussiveAudioCurve.h
+src/StretcherImpl.o: src/dsp/AudioCurveCalculator.h
+src/StretcherImpl.o: src/audiocurves/HighFrequencyAudioCurve.h
+src/StretcherImpl.o: src/audiocurves/SpectralDifferenceAudioCurve.h src/dsp/Window.h
+src/StretcherImpl.o: src/system/VectorOps.h src/system/sysutils.h
+src/StretcherImpl.o: src/audiocurves/SilentAudioCurve.h src/audiocurves/ConstantAudioCurve.h
+src/StretcherImpl.o: src/dsp/Resampler.h src/StretchCalculator.h
+src/StretcherImpl.o: src/StretcherChannelData.h src/base/Profiler.h
+main/main.o: rubberband/RubberBandStretcher.h src/system/sysutils.h
+main/main.o: src/base/Profiler.h
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/README.txt	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,475 @@
+
+Rubber Band
+===========
+
+An audio time-stretching and pitch-shifting library and utility program.
+
+Written by Chris Cannam, chris.cannam@breakfastquay.com.
+Copyright 2007-2012 Particular Programs Ltd.
+
+Rubber Band is a library and utility program that permits changing the
+tempo and pitch of an audio recording independently of one another.
+
+See http://breakfastquay.com/rubberband/ for more information.
+
+
+Licence
+=======
+
+Rubber Band is distributed under the GNU General Public License. See
+the file COPYING for more information.
+
+If you wish to distribute code using the Rubber Band Library under
+terms other than those of the GNU General Public License, you must
+obtain a commercial licence from us before doing so. In particular,
+you may not legally distribute through any Apple App Store unless you
+have a commercial licence.  See http://breakfastquay.com/rubberband/
+for licence terms.
+
+If you have obtained a valid commercial licence, your licence
+supersedes this README and the enclosed COPYING file and you may
+redistribute and/or modify Rubber Band under the terms described in
+that licence. Please refer to your licence agreement for more details.
+
+Note that Rubber Band may link with other GPL libraries or with
+proprietary libraries, depending on its build configuration. See the
+section "FFT and resampler selection" below. It is your responsibility
+to ensure that you redistribute only in accordance with the licence
+terms of any other libraries you may build with.
+
+
+Contents of this README
+-----------------------
+
+1. Code components
+2. Using the Rubber Band command-line tool
+3. Using the Rubber Band Library
+4. Compiling Rubber Band
+   a. FFT and resampler selection
+   b. Other supported #defines
+   c. GNU/POSIX systems and Makefiles
+   d. OS/X and iOS
+   e. Win32 and Visual Studio
+   f. Android and Java
+5. Copyright notes for bundled libraries
+
+
+1. Code components
+------------------
+
+Rubber Band consists of:
+
+ * The Rubber Band library code.  This is the code that will normally
+   be used by your applications.  The headers for this are in the
+   rubberband/ directory, and the source code is in src/.
+   The Rubber Band library depends upon resampler and FFT code; see
+   section 3a below for details.
+
+ * The Rubber Band command-line tool.  This is in main/main.cpp.
+   This program uses the Rubber Band library and also requires libsndfile
+   (http://www.mega-nerd.com/libsndfile/, licensed under the GNU Lesser
+   General Public License) for audio file loading.
+
+ * A pitch-shifter LADSPA audio effects plugin.  This is in ladspa/.
+   It requires the LADSPA SDK header ladspa.h (not included).
+
+ * A Vamp audio analysis plugin which may be used to inspect the
+   dynamic stretch ratios and other decisions taken by the Rubber Band
+   library when in use.  This is in vamp/.  It requires the Vamp
+   plugin SDK (http://www.vamp-plugins.org/develop.html) (not included).
+
+
+2. Using the Rubber Band command-line tool
+------------------------------------------
+
+The Rubber Band command-line tool builds as bin/rubberband.  The basic
+incantation is
+
+  $ rubberband -t <timeratio> -p <pitchratio> <infile.wav> <outfile.wav>
+
+For example,
+
+  $ rubberband -t 1.5 -p 2.0 test.wav output.wav
+
+stretches the file test.wav to 50% longer than its original duration,
+shifts it up in pitch by one octave, and writes the output to output.wav.
+
+Several further options are available: run "rubberband -h" for help.
+In particular, different types of music may benefit from different
+"crispness" options (-c <n> where <n> is from 0 to 6).
+
+
+3. Using the Rubber Band library
+--------------------------------
+
+The Rubber Band library has a public API that consists of one C++
+class, called RubberBandStretcher in the RubberBand namespace.  You
+should #include <rubberband/RubberBandStretcher.h> to use this class.
+There is extensive documentation in the class header.
+
+A header with C language bindings is also provided in
+<rubberband/rubberband-c.h>.  This is a wrapper around the C++
+implementation, and as the implementation is the same, it also
+requires linkage against the C++ standard libraries.  It is not yet
+documented separately from the C++ header.  You should include only
+one of the two headers, not both.
+
+The source code for the command-line utility (main/main.cpp) provides
+a good example of how to use Rubber Band in offline mode; the LADSPA
+pitch shifter plugin (ladspa/RubberBandPitchShifter.cpp) may be used
+as an example of Rubber Band in real-time mode.
+
+IMPORTANT: Please ensure you have read and understood the licensing
+terms for Rubber Band before using it in your application.  This
+library is provided under the GNU General Public License, which means
+that any application that uses it must also be published under the GPL
+or a compatible licence (i.e. with its full source code also available
+for modification and redistribution) unless you have separately
+acquired a commercial licence from the author.
+
+
+4. Compiling Rubber Band
+------------------------
+
+4a. FFT and resampler selection
+-------------------------------
+
+Rubber Band requires additional library code for FFT calculation and
+resampling.  Several libraries are supported.  The selection is
+controlled using preprocessor flags at compile time, as detailed in
+the tables below.
+
+Flags that declare that you want to use an external library begin with
+HAVE_; flags that select from the bundled options begin with USE_.
+
+You must enable one resampler implementation and one FFT
+implementation.  Do not enable more than one of either unless you know
+what you're doing.
+
+If you are building this software using one of the bundled library
+options (Speex or KissFFT), please be sure to review the terms for
+those libraries in src/speex/COPYING and src/kissfft/COPYING as
+applicable.
+
+FFT libraries supported
+-----------------------
+
+Name           Flags required        Notes
+----           --------------        -----   
+
+FFTW3	       -DHAVE_FFTW	     GPL.
+
+Accelerate     -DHAVE_VDSP	     Platform library on OS/X and iOS.
+
+Intel IPP      -DHAVE_IPP            Proprietary library, can only be used with
+      	    			     Rubber Band commercial licence. Define
+				     USE_IPP_STATIC as well to build with static
+				     IPP libraries.
+
+KissFFT        -DUSE_KISSFFT	     Bundled, can be used with GPL or commercial
+	    			     licence.  Single-precision. Slower than the
+				     above options.
+
+Resampler libraries supported
+-----------------------------
+
+Name           Flags required        Notes
+----           --------------        -----   
+
+libsamplerate  -DHAVE_LIBSAMPLERATE  GPL.
+
+libresample    -DHAVE_LIBRESAMPLE    LGPL.
+
+Speex	       -DUSE_SPEEX	     Bundled, can be used with GPL or commercial
+	       			     licence.
+
+
+4b. Other supported #defines
+----------------------------
+
+Other symbols you may define at compile time are as follows. (Usually
+the supplied build files will handle these for you.)
+
+   -DLACK_BAD_ALLOC
+   Define on systems lacking std::bad_alloc in the C++ library.
+
+   -DLACK_POSIX_MEMALIGN
+   Define on systems lacking posix_memalign.
+
+   -DUSE_OWN_ALIGNED_MALLOC
+   Define on systems lacking any aligned malloc implementation.
+
+   -DLACK_SINCOS
+   Define on systems lacking sincos().
+   
+   -DNO_EXCEPTIONS
+   Build without use of C++ exceptions.
+
+   -DNO_THREADING
+   Build without any multithread support.
+
+   -DUSE_PTHREADS
+   Use the pthreads library (required unless NO_THREADING or on Windows)
+
+   -DPROCESS_SAMPLE_TYPE=float
+   Select single precision for internal calculations. The default is
+   double precision. Consider using for mobile architectures with
+   slower double-precision support.
+
+   -DUSE_POMMIER_MATHFUN
+   Select the Julien Pommier implementations of trig functions for ARM
+   NEON or x86 SSE architectures. These are usually faster but may be
+   of lower precision than system implementations. Consider using this
+   for mobile architectures.
+
+
+4c. GNU/POSIX systems and Makefiles
+-----------------------------------
+
+A GNU-style configure script is included for use on Linux and similar
+systems.
+
+Run ./configure, then adjust the generated Makefile according to your
+preference for FFT and resampler implementations.  The default is to
+use FFTW3 and libsamplerate.
+
+The following Makefile targets are available:
+
+  static  -- build static libraries only
+  dynamic -- build dynamic libraries only
+  library -- build static and dynamic libraries only
+  program -- build the command-line tool
+  vamp    -- build Vamp plugin
+  ladspa  -- build LADSPA plugin
+  all     -- build everything.
+
+The default target is "all".
+
+
+4d. OS/X and iOS
+----------------
+
+A Makefile for OS/X is provided as Makefile.osx.
+
+Adjust the Makefile according to your preference for compiler and
+platform SDK, FFT and resampler implementations.  The default is to
+use the Accelerate framework and the Speex resampler.
+
+The following Makefile targets are available:
+
+  static  -- build static libraries only
+  dynamic -- build dynamic libraries only
+  library -- build static and dynamic libraries only
+  program -- build the command-line tool
+  vamp    -- build Vamp plugin
+  ladspa  -- build LADSPA plugin
+  all     -- build everything.
+
+The default target is to build the static and dynamic libraries and
+the command line tool.  The sndfile library is required for the
+command line tool.
+
+If you prefer to add the Rubber Band library files to an existing
+build project instead of using the Makefile, the files in src/ (except
+for RubberBandStretcherJNI.cpp) and the API headers in rubberband/
+should be all you need.
+
+Note that you cannot legally distribute applications using Rubber Band
+through the iPhone/iPad App Store or OS/X App Store unless you have a
+valid commercial licence.  GPL code is not permitted in these stores.
+
+
+4e. Win32 and Visual Studio
+---------------------------
+
+Two Visual Studio 2005 projects are supplied.
+
+rubberband-library.vcproj builds the Rubber Band static libraries
+only.
+
+rubberband-program.vcproj builds the Rubber Band command-line tool
+only (requires the Rubber Band libraries, and libsndfile).
+
+You will need to adjust the project settings so as to set the compile
+flags according to your preference for FFT and resampler
+implementation, and set the include path and library path
+appropriately.  The default is to use the bundled KissFFT and the
+Speex resampler.
+
+If you prefer to add the Rubber Band library files to an existing
+build project instead of using the supplied one, the files in src/
+(except for RubberBandStretcherJNI.cpp) and the API headers in
+rubberband/ should be all you need.
+
+
+4f. Android and Java
+--------------------
+
+An Android NDK build file is provided as Android.mk. This includes
+compile definitions for a shared library built for ARM architectures
+which can be loaded from a Java application using the Java native
+interface (i.e. the Android NDK).
+
+The Java side of the interface can be found in
+com/breakfastquay/rubberband/RubberBandStretcher.java.
+
+The supplied .mk file uses KissFFT and the Speex resampler.
+
+
+5. Copyright notes for bundled libraries
+========================================
+
+5a. Speex
+---------
+
+[files in src/speex]
+
+Copyright 2002-2007 	Xiph.org Foundation
+Copyright 2002-2007 	Jean-Marc Valin
+Copyright 2005-2007	Analog Devices Inc.
+Copyright 2005-2007	Commonwealth Scientific and Industrial Research 
+                        Organisation (CSIRO)
+Copyright 1993, 2002, 2006 David Rowe
+Copyright 2003 		EpicGames
+Copyright 1992-1994	Jutta Degener, Carsten Bormann
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+5b. KissFFT
+-----------
+
+[files in src/kissfft]
+
+Copyright (c) 2003-2004 Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the author nor the names of any contributors may be used
+      to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+5c. Pommier math functions
+--------------------------
+
+[files in src/pommier]
+
+Copyright (C) 2011  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+
+5d. float_cast
+--------------
+
+[files in src/float_cast]
+
+Copyright (C) 2001 Erik de Castro Lopo <erikd AT mega-nerd DOT com>
+
+Permission to use, copy, modify, distribute, and sell this file for any 
+purpose is hereby granted without fee, provided that the above copyright 
+and this permission notice appear in all copies.  No representations are
+made about the suitability of this software for any purpose.  It is 
+provided "as is" without express or implied warranty.
+
+
+5d. getopt
+----------
+
+[files in src/getopt, used by command-line tool on some platforms]
+
+Copyright (c) 2000 The NetBSD Foundation, Inc.
+All rights reserved.
+
+This code is derived from software contributed to The NetBSD Foundation
+by Dieter Baron and Thomas Klausner.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. All advertising materials mentioning features or use of this software
+   must display the following acknowledgement:
+       This product includes software developed by the NetBSD
+       Foundation, Inc. and its contributors.
+4. Neither the name of The NetBSD Foundation nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/com/breakfastquay/rubberband/RubberBandStretcher.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/com/breakfastquay/rubberband/RubberBandStretcher.java	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,109 @@
+/* Copyright Chris Cannam - All Rights Reserved */
+
+package com.breakfastquay.rubberband;
+
+public class RubberBandStretcher
+{
+    public RubberBandStretcher(int sampleRate, int channels,
+			       int options,
+			       double initialTimeRatio,
+			       double initialPitchScale) {
+	handle = 0;
+	initialise(sampleRate, channels, options,
+		   initialTimeRatio, initialPitchScale);
+    }
+
+    public native void dispose();
+
+    public native void reset();
+
+    public native void setTimeRatio(double ratio);
+    public native void setPitchScale(double scale);
+
+    public native int getChannelCount();
+    public native double getTimeRatio();
+    public native double getPitchScale();
+
+    public native int getLatency();
+
+    public native void setTransientsOption(int options);
+    public native void setDetectorOption(int options);
+    public native void setPhaseOption(int options);
+    public native void setFormantOption(int options);
+    public native void setPitchOption(int options);
+
+    public native void setExpectedInputDuration(long samples);
+    public native void setMaxProcessSize(int samples);
+
+    public native int getSamplesRequired();
+
+    //!!! todo: setKeyFrameMap
+
+    public native void study(float[][] input, int offset, int n, boolean finalBlock);
+    public void study(float[][] input, boolean finalBlock) {
+	study(input, 0, input[0].length, finalBlock);
+    }
+
+    public native void process(float[][] input, int offset, int n, boolean finalBlock);
+    public void process(float[][] input, boolean finalBlock) {
+	process(input, 0, input[0].length, finalBlock);
+    }
+
+    public native int available();
+
+    public native int retrieve(float[][] output, int offset, int n);
+    public int retrieve(float[][] output) {
+	return retrieve(output, 0, output[0].length);
+    }
+
+    private native void initialise(int sampleRate, int channels, int options,
+				   double initialTimeRatio,
+				   double initialPitchScale);
+    private long handle;
+
+    public static final int OptionProcessOffline       = 0x00000000;
+    public static final int OptionProcessRealTime      = 0x00000001;
+
+    public static final int OptionStretchElastic       = 0x00000000;
+    public static final int OptionStretchPrecise       = 0x00000010;
+    
+    public static final int OptionTransientsCrisp      = 0x00000000;
+    public static final int OptionTransientsMixed      = 0x00000100;
+    public static final int OptionTransientsSmooth     = 0x00000200;
+
+    public static final int OptionDetectorCompound     = 0x00000000;
+    public static final int OptionDetectorPercussive   = 0x00000400;
+    public static final int OptionDetectorSoft         = 0x00000800;
+
+    public static final int OptionPhaseLaminar         = 0x00000000;
+    public static final int OptionPhaseIndependent     = 0x00002000;
+    
+    public static final int OptionThreadingAuto        = 0x00000000;
+    public static final int OptionThreadingNever       = 0x00010000;
+    public static final int OptionThreadingAlways      = 0x00020000;
+
+    public static final int OptionWindowStandard       = 0x00000000;
+    public static final int OptionWindowShort          = 0x00100000;
+    public static final int OptionWindowLong           = 0x00200000;
+
+    public static final int OptionSmoothingOff         = 0x00000000;
+    public static final int OptionSmoothingOn          = 0x00800000;
+
+    public static final int OptionFormantShifted       = 0x00000000;
+    public static final int OptionFormantPreserved     = 0x01000000;
+
+    public static final int OptionPitchHighSpeed       = 0x00000000;
+    public static final int OptionPitchHighQuality     = 0x02000000;
+    public static final int OptionPitchHighConsistency = 0x04000000;
+
+    public static final int OptionChannelsApart        = 0x00000000;
+    public static final int OptionChannelsTogether     = 0x10000000;
+
+    public static final int DefaultOptions             = 0x00000000;
+    public static final int PercussiveOptions          = 0x00102000;
+
+    static {
+	System.loadLibrary("rubberband");
+    }
+};
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/configure
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/configure	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,5417 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.69 for RubberBand 1.7.
+#
+# Report bugs to <chris.cannam@breakfastquay.com>.
+#
+#
+# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
+#
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='print -r --'
+  as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='printf %s\n'
+  as_echo_n='printf %s'
+else
+  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+    as_echo_n='/usr/ucb/echo -n'
+  else
+    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+    as_echo_n_body='eval
+      arg=$1;
+      case $arg in #(
+      *"$as_nl"*)
+	expr "X$arg" : "X\\(.*\\)$as_nl";
+	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+      esac;
+      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+    '
+    export as_echo_n_body
+    as_echo_n='sh -c $as_echo_n_body as_echo'
+  fi
+  export as_echo_body
+  as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there.  '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+# Use a proper internal environment variable to ensure we don't fall
+  # into an infinite loop, continuously re-executing ourselves.
+  if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then
+    _as_can_reexec=no; export _as_can_reexec;
+    # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+as_fn_exit 255
+  fi
+  # We don't want this to propagate to other subprocesses.
+          { _as_can_reexec=; unset _as_can_reexec;}
+if test "x$CONFIG_SHELL" = x; then
+  as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '\${1+\"\$@\"}'='\"\$@\"'
+  setopt NO_GLOB_SUBST
+else
+  case \`(set -o) 2>/dev/null\` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+"
+  as_required="as_fn_return () { (exit \$1); }
+as_fn_success () { as_fn_return 0; }
+as_fn_failure () { as_fn_return 1; }
+as_fn_ret_success () { return 0; }
+as_fn_ret_failure () { return 1; }
+
+exitcode=0
+as_fn_success || { exitcode=1; echo as_fn_success failed.; }
+as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
+as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
+as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
+if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+
+else
+  exitcode=1; echo positional parameters were not saved.
+fi
+test x\$exitcode = x0 || exit 1
+test -x / || exit 1"
+  as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
+  as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
+  eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
+  test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
+test \$(( 1 + 1 )) = 2 || exit 1"
+  if (eval "$as_required") 2>/dev/null; then :
+  as_have_required=yes
+else
+  as_have_required=no
+fi
+  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
+
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  as_found=:
+  case $as_dir in #(
+	 /*)
+	   for as_base in sh bash ksh sh5; do
+	     # Try only shells that exist, to save several forks.
+	     as_shell=$as_dir/$as_base
+	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+		    { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
+  CONFIG_SHELL=$as_shell as_have_required=yes
+		   if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
+  break 2
+fi
+fi
+	   done;;
+       esac
+  as_found=false
+done
+$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+	      { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
+  CONFIG_SHELL=$SHELL as_have_required=yes
+fi; }
+IFS=$as_save_IFS
+
+
+      if test "x$CONFIG_SHELL" != x; then :
+  export CONFIG_SHELL
+             # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
+fi
+
+    if test x$as_have_required = xno; then :
+  $as_echo "$0: This script requires a shell more modern than all"
+  $as_echo "$0: the shells that I found on your system."
+  if test x${ZSH_VERSION+set} = xset ; then
+    $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
+  else
+    $as_echo "$0: Please tell bug-autoconf@gnu.org and
+$0: chris.cannam@breakfastquay.com about your system,
+$0: including any error possibly output before this
+$0: message. Then install a modern shell, or manually run
+$0: the script under such a shell if you do have one."
+  fi
+  exit 1
+fi
+fi
+fi
+SHELL=${CONFIG_SHELL-/bin/sh}
+export SHELL
+# Unset more variables known to interfere with behavior of common tools.
+CLICOLOR_FORCE= GREP_OPTIONS=
+unset CLICOLOR_FORCE GREP_OPTIONS
+
+## --------------------- ##
+## M4sh Shell Functions. ##
+## --------------------- ##
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else
+  as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else
+  as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  }
+fi # as_fn_arith
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  $as_echo "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+  as_lineno_1=$LINENO as_lineno_1a=$LINENO
+  as_lineno_2=$LINENO as_lineno_2a=$LINENO
+  eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
+  test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
+  # Blame Lee E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+
+  # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
+  # already done that, so ensure we don't try to do so again and fall
+  # in an infinite loop.  This has already happened in practice.
+  _as_can_reexec=no; export _as_can_reexec
+  # Don't try to exec as it changes $[0], causing all sort of problems
+  # (the dirname of $[0] is not the place where we might find the
+  # original and so on.  Autoconf is especially sensitive to this).
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+    # In both cases, we have to default to `cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+test -n "$DJDIR" || exec 7<&0 </dev/null
+exec 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME='RubberBand'
+PACKAGE_TARNAME='rubberband'
+PACKAGE_VERSION='1.7'
+PACKAGE_STRING='RubberBand 1.7'
+PACKAGE_BUGREPORT='chris.cannam@breakfastquay.com'
+PACKAGE_URL=''
+
+ac_unique_file="src/StretcherImpl.h"
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+#  include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+Vamp_LIBS
+Vamp_CFLAGS
+FFTW_LIBS
+FFTW_CFLAGS
+SNDFILE_LIBS
+SNDFILE_CFLAGS
+SRC_LIBS
+SRC_CFLAGS
+PKG_CONFIG_LIBDIR
+PKG_CONFIG_PATH
+PKG_CONFIG
+EGREP
+GREP
+CPP
+ac_ct_CC
+CFLAGS
+CC
+OBJEXT
+EXEEXT
+ac_ct_CXX
+CPPFLAGS
+LDFLAGS
+CXXFLAGS
+CXX
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+'
+      ac_precious_vars='build_alias
+host_alias
+target_alias
+CXX
+CXXFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CCC
+CC
+CFLAGS
+CPP
+PKG_CONFIG
+PKG_CONFIG_PATH
+PKG_CONFIG_LIBDIR
+SRC_CFLAGS
+SRC_LIBS
+SNDFILE_CFLAGS
+SNDFILE_LIBS
+FFTW_CFLAGS
+FFTW_LIBS
+Vamp_CFLAGS
+Vamp_LIBS'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+  # If the previous option needs an argument, assign it.
+  if test -n "$ac_prev"; then
+    eval $ac_prev=\$ac_option
+    ac_prev=
+    continue
+  fi
+
+  case $ac_option in
+  *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+  *=)   ac_optarg= ;;
+  *)    ac_optarg=yes ;;
+  esac
+
+  # Accept the important Cygnus configure options, so we can diagnose typos.
+
+  case $ac_dashdash$ac_option in
+  --)
+    ac_dashdash=yes ;;
+
+  -bindir | --bindir | --bindi | --bind | --bin | --bi)
+    ac_prev=bindir ;;
+  -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+    bindir=$ac_optarg ;;
+
+  -build | --build | --buil | --bui | --bu)
+    ac_prev=build_alias ;;
+  -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+    build_alias=$ac_optarg ;;
+
+  -cache-file | --cache-file | --cache-fil | --cache-fi \
+  | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+    ac_prev=cache_file ;;
+  -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+  | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+    cache_file=$ac_optarg ;;
+
+  --config-cache | -C)
+    cache_file=config.cache ;;
+
+  -datadir | --datadir | --datadi | --datad)
+    ac_prev=datadir ;;
+  -datadir=* | --datadir=* | --datadi=* | --datad=*)
+    datadir=$ac_optarg ;;
+
+  -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+  | --dataroo | --dataro | --datar)
+    ac_prev=datarootdir ;;
+  -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+  | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+    datarootdir=$ac_optarg ;;
+
+  -disable-* | --disable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=no ;;
+
+  -docdir | --docdir | --docdi | --doc | --do)
+    ac_prev=docdir ;;
+  -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+    docdir=$ac_optarg ;;
+
+  -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+    ac_prev=dvidir ;;
+  -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+    dvidir=$ac_optarg ;;
+
+  -enable-* | --enable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=\$ac_optarg ;;
+
+  -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+  | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+  | --exec | --exe | --ex)
+    ac_prev=exec_prefix ;;
+  -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+  | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+  | --exec=* | --exe=* | --ex=*)
+    exec_prefix=$ac_optarg ;;
+
+  -gas | --gas | --ga | --g)
+    # Obsolete; use --with-gas.
+    with_gas=yes ;;
+
+  -help | --help | --hel | --he | -h)
+    ac_init_help=long ;;
+  -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+    ac_init_help=recursive ;;
+  -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+    ac_init_help=short ;;
+
+  -host | --host | --hos | --ho)
+    ac_prev=host_alias ;;
+  -host=* | --host=* | --hos=* | --ho=*)
+    host_alias=$ac_optarg ;;
+
+  -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+    ac_prev=htmldir ;;
+  -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+  | --ht=*)
+    htmldir=$ac_optarg ;;
+
+  -includedir | --includedir | --includedi | --included | --include \
+  | --includ | --inclu | --incl | --inc)
+    ac_prev=includedir ;;
+  -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+  | --includ=* | --inclu=* | --incl=* | --inc=*)
+    includedir=$ac_optarg ;;
+
+  -infodir | --infodir | --infodi | --infod | --info | --inf)
+    ac_prev=infodir ;;
+  -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+    infodir=$ac_optarg ;;
+
+  -libdir | --libdir | --libdi | --libd)
+    ac_prev=libdir ;;
+  -libdir=* | --libdir=* | --libdi=* | --libd=*)
+    libdir=$ac_optarg ;;
+
+  -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+  | --libexe | --libex | --libe)
+    ac_prev=libexecdir ;;
+  -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+  | --libexe=* | --libex=* | --libe=*)
+    libexecdir=$ac_optarg ;;
+
+  -localedir | --localedir | --localedi | --localed | --locale)
+    ac_prev=localedir ;;
+  -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+    localedir=$ac_optarg ;;
+
+  -localstatedir | --localstatedir | --localstatedi | --localstated \
+  | --localstate | --localstat | --localsta | --localst | --locals)
+    ac_prev=localstatedir ;;
+  -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+  | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+    localstatedir=$ac_optarg ;;
+
+  -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+    ac_prev=mandir ;;
+  -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+    mandir=$ac_optarg ;;
+
+  -nfp | --nfp | --nf)
+    # Obsolete; use --without-fp.
+    with_fp=no ;;
+
+  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+  | --no-cr | --no-c | -n)
+    no_create=yes ;;
+
+  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+    no_recursion=yes ;;
+
+  -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+  | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+  | --oldin | --oldi | --old | --ol | --o)
+    ac_prev=oldincludedir ;;
+  -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+  | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+  | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+    oldincludedir=$ac_optarg ;;
+
+  -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+    ac_prev=prefix ;;
+  -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+    prefix=$ac_optarg ;;
+
+  -program-prefix | --program-prefix | --program-prefi | --program-pref \
+  | --program-pre | --program-pr | --program-p)
+    ac_prev=program_prefix ;;
+  -program-prefix=* | --program-prefix=* | --program-prefi=* \
+  | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+    program_prefix=$ac_optarg ;;
+
+  -program-suffix | --program-suffix | --program-suffi | --program-suff \
+  | --program-suf | --program-su | --program-s)
+    ac_prev=program_suffix ;;
+  -program-suffix=* | --program-suffix=* | --program-suffi=* \
+  | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+    program_suffix=$ac_optarg ;;
+
+  -program-transform-name | --program-transform-name \
+  | --program-transform-nam | --program-transform-na \
+  | --program-transform-n | --program-transform- \
+  | --program-transform | --program-transfor \
+  | --program-transfo | --program-transf \
+  | --program-trans | --program-tran \
+  | --progr-tra | --program-tr | --program-t)
+    ac_prev=program_transform_name ;;
+  -program-transform-name=* | --program-transform-name=* \
+  | --program-transform-nam=* | --program-transform-na=* \
+  | --program-transform-n=* | --program-transform-=* \
+  | --program-transform=* | --program-transfor=* \
+  | --program-transfo=* | --program-transf=* \
+  | --program-trans=* | --program-tran=* \
+  | --progr-tra=* | --program-tr=* | --program-t=*)
+    program_transform_name=$ac_optarg ;;
+
+  -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+    ac_prev=pdfdir ;;
+  -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+    pdfdir=$ac_optarg ;;
+
+  -psdir | --psdir | --psdi | --psd | --ps)
+    ac_prev=psdir ;;
+  -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+    psdir=$ac_optarg ;;
+
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil)
+    silent=yes ;;
+
+  -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+    ac_prev=sbindir ;;
+  -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+  | --sbi=* | --sb=*)
+    sbindir=$ac_optarg ;;
+
+  -sharedstatedir | --sharedstatedir | --sharedstatedi \
+  | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+  | --sharedst | --shareds | --shared | --share | --shar \
+  | --sha | --sh)
+    ac_prev=sharedstatedir ;;
+  -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+  | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+  | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+  | --sha=* | --sh=*)
+    sharedstatedir=$ac_optarg ;;
+
+  -site | --site | --sit)
+    ac_prev=site ;;
+  -site=* | --site=* | --sit=*)
+    site=$ac_optarg ;;
+
+  -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+    ac_prev=srcdir ;;
+  -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+    srcdir=$ac_optarg ;;
+
+  -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+  | --syscon | --sysco | --sysc | --sys | --sy)
+    ac_prev=sysconfdir ;;
+  -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+  | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+    sysconfdir=$ac_optarg ;;
+
+  -target | --target | --targe | --targ | --tar | --ta | --t)
+    ac_prev=target_alias ;;
+  -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+    target_alias=$ac_optarg ;;
+
+  -v | -verbose | --verbose | --verbos | --verbo | --verb)
+    verbose=yes ;;
+
+  -version | --version | --versio | --versi | --vers | -V)
+    ac_init_version=: ;;
+
+  -with-* | --with-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=\$ac_optarg ;;
+
+  -without-* | --without-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: $ac_useropt"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=no ;;
+
+  --x)
+    # Obsolete; use --with-x.
+    with_x=yes ;;
+
+  -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+  | --x-incl | --x-inc | --x-in | --x-i)
+    ac_prev=x_includes ;;
+  -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+  | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+    x_includes=$ac_optarg ;;
+
+  -x-libraries | --x-libraries | --x-librarie | --x-librari \
+  | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+    ac_prev=x_libraries ;;
+  -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+  | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+    x_libraries=$ac_optarg ;;
+
+  -*) as_fn_error $? "unrecognized option: \`$ac_option'
+Try \`$0 --help' for more information"
+    ;;
+
+  *=*)
+    ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+    # Reject names that are not valid shell variable names.
+    case $ac_envvar in #(
+      '' | [0-9]* | *[!_$as_cr_alnum]* )
+      as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
+    esac
+    eval $ac_envvar=\$ac_optarg
+    export $ac_envvar ;;
+
+  *)
+    # FIXME: should be removed in autoconf 3.0.
+    $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+    expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+    : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
+    ;;
+
+  esac
+done
+
+if test -n "$ac_prev"; then
+  ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+  as_fn_error $? "missing argument to $ac_option"
+fi
+
+if test -n "$ac_unrecognized_opts"; then
+  case $enable_option_checking in
+    no) ;;
+    fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+    *)     $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+  esac
+fi
+
+# Check all directory arguments for consistency.
+for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
+		datadir sysconfdir sharedstatedir localstatedir includedir \
+		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+		libdir localedir mandir
+do
+  eval ac_val=\$$ac_var
+  # Remove trailing slashes.
+  case $ac_val in
+    */ )
+      ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+      eval $ac_var=\$ac_val;;
+  esac
+  # Be sure to have absolute directory names.
+  case $ac_val in
+    [\\/$]* | ?:[\\/]* )  continue;;
+    NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+  esac
+  as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+  if test "x$build_alias" = x; then
+    cross_compiling=maybe
+  elif test "x$build_alias" != "x$host_alias"; then
+    cross_compiling=yes
+  fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+  as_fn_error $? "working directory cannot be determined"
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+  as_fn_error $? "pwd does not report name of working directory"
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+  ac_srcdir_defaulted=yes
+  # Try the directory containing this script, then the parent directory.
+  ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_myself" : 'X\(//\)[^/]' \| \
+	 X"$as_myself" : 'X\(//\)$' \| \
+	 X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  srcdir=$ac_confdir
+  if test ! -r "$srcdir/$ac_unique_file"; then
+    srcdir=..
+  fi
+else
+  ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+  test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+  as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+	pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+  srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+  eval ac_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_env_${ac_var}_value=\$${ac_var}
+  eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+  # Omit some internal or obsolete options to make the list less imposing.
+  # This message is too long to be a string in the A/UX 3.1 sh.
+  cat <<_ACEOF
+\`configure' configures RubberBand 1.7 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE.  See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+  -h, --help              display this help and exit
+      --help=short        display options specific to this package
+      --help=recursive    display the short help of all the included packages
+  -V, --version           display version information and exit
+  -q, --quiet, --silent   do not print \`checking ...' messages
+      --cache-file=FILE   cache test results in FILE [disabled]
+  -C, --config-cache      alias for \`--cache-file=config.cache'
+  -n, --no-create         do not create output files
+      --srcdir=DIR        find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+  --prefix=PREFIX         install architecture-independent files in PREFIX
+                          [$ac_default_prefix]
+  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
+                          [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc.  You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+  --bindir=DIR            user executables [EPREFIX/bin]
+  --sbindir=DIR           system admin executables [EPREFIX/sbin]
+  --libexecdir=DIR        program executables [EPREFIX/libexec]
+  --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
+  --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
+  --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --libdir=DIR            object code libraries [EPREFIX/lib]
+  --includedir=DIR        C header files [PREFIX/include]
+  --oldincludedir=DIR     C header files for non-gcc [/usr/include]
+  --datarootdir=DIR       read-only arch.-independent data root [PREFIX/share]
+  --datadir=DIR           read-only architecture-independent data [DATAROOTDIR]
+  --infodir=DIR           info documentation [DATAROOTDIR/info]
+  --localedir=DIR         locale-dependent data [DATAROOTDIR/locale]
+  --mandir=DIR            man documentation [DATAROOTDIR/man]
+  --docdir=DIR            documentation root [DATAROOTDIR/doc/rubberband]
+  --htmldir=DIR           html documentation [DOCDIR]
+  --dvidir=DIR            dvi documentation [DOCDIR]
+  --pdfdir=DIR            pdf documentation [DOCDIR]
+  --psdir=DIR             ps documentation [DOCDIR]
+_ACEOF
+
+  cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+  case $ac_init_help in
+     short | recursive ) echo "Configuration of RubberBand 1.7:";;
+   esac
+  cat <<\_ACEOF
+
+Some influential environment variables:
+  CXX         C++ compiler command
+  CXXFLAGS    C++ compiler flags
+  LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
+              nonstandard directory <lib dir>
+  LIBS        libraries to pass to the linker, e.g. -l<library>
+  CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+              you have headers in a nonstandard directory <include dir>
+  CC          C compiler command
+  CFLAGS      C compiler flags
+  CPP         C preprocessor
+  PKG_CONFIG  path to pkg-config utility
+  PKG_CONFIG_PATH
+              directories to add to pkg-config's search path
+  PKG_CONFIG_LIBDIR
+              path overriding pkg-config's built-in search path
+  SRC_CFLAGS  C compiler flags for SRC, overriding pkg-config
+  SRC_LIBS    linker flags for SRC, overriding pkg-config
+  SNDFILE_CFLAGS
+              C compiler flags for SNDFILE, overriding pkg-config
+  SNDFILE_LIBS
+              linker flags for SNDFILE, overriding pkg-config
+  FFTW_CFLAGS C compiler flags for FFTW, overriding pkg-config
+  FFTW_LIBS   linker flags for FFTW, overriding pkg-config
+  Vamp_CFLAGS C compiler flags for Vamp, overriding pkg-config
+  Vamp_LIBS   linker flags for Vamp, overriding pkg-config
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <chris.cannam@breakfastquay.com>.
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+  # If there are subdirs, report their specific --help.
+  for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+    test -d "$ac_dir" ||
+      { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+      continue
+    ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+    cd "$ac_dir" || { ac_status=$?; continue; }
+    # Check for guested configure.
+    if test -f "$ac_srcdir/configure.gnu"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+    elif test -f "$ac_srcdir/configure"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure" --help=recursive
+    else
+      $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+    fi || ac_status=$?
+    cd "$ac_pwd" || { ac_status=$?; break; }
+  done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+  cat <<\_ACEOF
+RubberBand configure 1.7
+generated by GNU Autoconf 2.69
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+  exit
+fi
+
+## ------------------------ ##
+## Autoconf initialization. ##
+## ------------------------ ##
+
+# ac_fn_cxx_try_compile LINENO
+# ----------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_cxx_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_cxx_try_compile
+
+# ac_fn_c_try_compile LINENO
+# --------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_compile
+
+# ac_fn_c_try_cpp LINENO
+# ----------------------
+# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_cpp ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } > conftest.i && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+    ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_cpp
+
+# ac_fn_c_try_run LINENO
+# ----------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_c_try_run ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+       $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+       ac_retval=$ac_status
+fi
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_run
+
+# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists and can be compiled using the include files in
+# INCLUDES, setting the cache variable VAR accordingly.
+ac_fn_c_check_header_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_compile
+
+# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists, giving a warning if it cannot be compiled using
+# the include files in INCLUDES and setting the cache variable VAR
+# accordingly.
+ac_fn_c_check_header_mongrel ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if eval \${$3+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+else
+  # Is the header compilable?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
+$as_echo_n "checking $2 usability... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_header_compiler=yes
+else
+  ac_header_compiler=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
+$as_echo_n "checking $2 presence... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <$2>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  ac_header_preproc=yes
+else
+  ac_header_preproc=no
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #((
+  yes:no: )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+    ;;
+  no:yes:* )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: $2:     check for missing prerequisite headers?" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+( $as_echo "## --------------------------------------------- ##
+## Report this to chris.cannam@breakfastquay.com ##
+## --------------------------------------------- ##"
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  eval "$3=\$ac_header_compiler"
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_mongrel
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by RubberBand $as_me 1.7, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
+
+  $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null     || echo unknown`
+
+/bin/arch              = `(/bin/arch) 2>/dev/null              || echo unknown`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null       || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo      = `(/usr/bin/hostinfo) 2>/dev/null      || echo unknown`
+/bin/machine           = `(/bin/machine) 2>/dev/null           || echo unknown`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null       || echo unknown`
+/bin/universe          = `(/bin/universe) 2>/dev/null          || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    $as_echo "PATH: $as_dir"
+  done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+  for ac_arg
+  do
+    case $ac_arg in
+    -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+    -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+    | -silent | --silent | --silen | --sile | --sil)
+      continue ;;
+    *\'*)
+      ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    case $ac_pass in
+    1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+    2)
+      as_fn_append ac_configure_args1 " '$ac_arg'"
+      if test $ac_must_keep_next = true; then
+	ac_must_keep_next=false # Got value, back to normal.
+      else
+	case $ac_arg in
+	  *=* | --config-cache | -C | -disable-* | --disable-* \
+	  | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+	  | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+	  | -with-* | --with-* | -without-* | --without-* | --x)
+	    case "$ac_configure_args0 " in
+	      "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+	    esac
+	    ;;
+	  -* ) ac_must_keep_next=true ;;
+	esac
+      fi
+      as_fn_append ac_configure_args " '$ac_arg'"
+      ;;
+    esac
+  done
+done
+{ ac_configure_args0=; unset ac_configure_args0;}
+{ ac_configure_args1=; unset ac_configure_args1;}
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log.  We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+  # Save into config.log some information that might help in debugging.
+  {
+    echo
+
+    $as_echo "## ---------------- ##
+## Cache variables. ##
+## ---------------- ##"
+    echo
+    # The following way of writing the cache mishandles newlines in values,
+(
+  for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+  (set) 2>&1 |
+    case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      sed -n \
+	"s/'\''/'\''\\\\'\'''\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+      ;; #(
+    *)
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+)
+    echo
+
+    $as_echo "## ----------------- ##
+## Output variables. ##
+## ----------------- ##"
+    echo
+    for ac_var in $ac_subst_vars
+    do
+      eval ac_val=\$$ac_var
+      case $ac_val in
+      *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+      esac
+      $as_echo "$ac_var='\''$ac_val'\''"
+    done | sort
+    echo
+
+    if test -n "$ac_subst_files"; then
+      $as_echo "## ------------------- ##
+## File substitutions. ##
+## ------------------- ##"
+      echo
+      for ac_var in $ac_subst_files
+      do
+	eval ac_val=\$$ac_var
+	case $ac_val in
+	*\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+	esac
+	$as_echo "$ac_var='\''$ac_val'\''"
+      done | sort
+      echo
+    fi
+
+    if test -s confdefs.h; then
+      $as_echo "## ----------- ##
+## confdefs.h. ##
+## ----------- ##"
+      echo
+      cat confdefs.h
+      echo
+    fi
+    test "$ac_signal" != 0 &&
+      $as_echo "$as_me: caught signal $ac_signal"
+    $as_echo "$as_me: exit $exit_status"
+  } >&5
+  rm -f core *.core core.conftest.* &&
+    rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+    exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+  trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+$as_echo "/* confdefs.h */" > confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_URL "$PACKAGE_URL"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+  # We do not want a PATH search for config.site.
+  case $CONFIG_SITE in #((
+    -*)  ac_site_file1=./$CONFIG_SITE;;
+    */*) ac_site_file1=$CONFIG_SITE;;
+    *)   ac_site_file1=./$CONFIG_SITE;;
+  esac
+elif test "x$prefix" != xNONE; then
+  ac_site_file1=$prefix/share/config.site
+  ac_site_file2=$prefix/etc/config.site
+else
+  ac_site_file1=$ac_default_prefix/share/config.site
+  ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+  test "x$ac_site_file" = xNONE && continue
+  if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+    sed 's/^/| /' "$ac_site_file" >&5
+    . "$ac_site_file" \
+      || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "failed to load site script $ac_site_file
+See \`config.log' for more details" "$LINENO" 5; }
+  fi
+done
+
+if test -r "$cache_file"; then
+  # Some versions of bash will fail to source /dev/null (special files
+  # actually), so we avoid doing that.  DJGPP emulates it as a regular file.
+  if test /dev/null != "$cache_file" && test -f "$cache_file"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+    case $cache_file in
+      [\\/]* | ?:[\\/]* ) . "$cache_file";;
+      *)                      . "./$cache_file";;
+    esac
+  fi
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+  >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+  eval ac_old_set=\$ac_cv_env_${ac_var}_set
+  eval ac_new_set=\$ac_env_${ac_var}_set
+  eval ac_old_val=\$ac_cv_env_${ac_var}_value
+  eval ac_new_val=\$ac_env_${ac_var}_value
+  case $ac_old_set,$ac_new_set in
+    set,)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,set)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,);;
+    *)
+      if test "x$ac_old_val" != "x$ac_new_val"; then
+	# differences in whitespace do not lead to failure.
+	ac_old_val_w=`echo x $ac_old_val`
+	ac_new_val_w=`echo x $ac_new_val`
+	if test "$ac_old_val_w" != "$ac_new_val_w"; then
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+	  ac_cache_corrupted=:
+	else
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+	  eval $ac_var=\$ac_old_val
+	fi
+	{ $as_echo "$as_me:${as_lineno-$LINENO}:   former value:  \`$ac_old_val'" >&5
+$as_echo "$as_me:   former value:  \`$ac_old_val'" >&2;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}:   current value: \`$ac_new_val'" >&5
+$as_echo "$as_me:   current value: \`$ac_new_val'" >&2;}
+      fi;;
+  esac
+  # Pass precious variables to config.status.
+  if test "$ac_new_set" = set; then
+    case $ac_new_val in
+    *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *) ac_arg=$ac_var=$ac_new_val ;;
+    esac
+    case " $ac_configure_args " in
+      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
+      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+    esac
+  fi
+done
+if $ac_cache_corrupted; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+  { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+  as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+$as_echo "$CXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$CXX" && break
+  done
+fi
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
+$as_echo "$ac_ct_CXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CXX" && break
+done
+
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CXX=$ac_ct_CXX
+  fi
+fi
+
+  fi
+fi
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and finding out an intuition
+# of exeext.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C++ compiler works" >&5
+$as_echo_n "checking whether the C++ compiler works... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+  esac
+done
+rm -f $ac_rmfiles
+
+if { { ac_try="$ac_link_default"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link_default") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile.  We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+	;;
+    [ab].out )
+	# We found the default executable, but exeext='' is most
+	# certainly right.
+	break;;
+    *.* )
+	if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+	then :; else
+	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	fi
+	# We set ac_cv_exeext here because the later test for it is not
+	# safe: cross compilers may not add the suffix if given an `-o'
+	# argument, so we may need to know it at that point already.
+	# Even if this section looks crufty: it has the advantage of
+	# actually working.
+	break;;
+    * )
+	break;;
+  esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+  ac_file=''
+fi
+if test -z "$ac_file"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "C++ compiler cannot create executables
+See \`config.log' for more details" "$LINENO" 5; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler default output file name" >&5
+$as_echo_n "checking for C++ compiler default output file name... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+ac_exeext=$ac_cv_exeext
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'.  For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	  break;;
+    * ) break;;
+  esac
+done
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest conftest$ac_cv_exeext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdio.h>
+int
+main ()
+{
+FILE *f = fopen ("conftest.out", "w");
+ return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files="$ac_clean_files conftest.out"
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+if test "$cross_compiling" != yes; then
+  { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+  if { ac_try='./conftest$ac_cv_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+    cross_compiling=no
+  else
+    if test "$cross_compiling" = maybe; then
+	cross_compiling=yes
+    else
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run C++ compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details" "$LINENO" 5; }
+    fi
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if ${ac_cv_objext+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then :
+  for ac_file in conftest.o conftest.obj conftest.*; do
+  test -f "$ac_file" || continue;
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+    *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+       break;;
+  esac
+done
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of object files: cannot compile
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5
+$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; }
+if ${ac_cv_cxx_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
+$as_echo "$ac_cv_cxx_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+  GXX=yes
+else
+  GXX=
+fi
+ac_test_CXXFLAGS=${CXXFLAGS+set}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
+$as_echo_n "checking whether $CXX accepts -g... " >&6; }
+if ${ac_cv_prog_cxx_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
+else
+  CXXFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+else
+  ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
+$as_echo "$ac_cv_prog_cxx_g" >&6; }
+if test "$ac_test_CXXFLAGS" = set; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
+  else
+    CXXFLAGS="-g"
+  fi
+else
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
+  else
+    CXXFLAGS=
+  fi
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}gcc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+  ac_ct_CC=$CC
+  # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="gcc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+else
+  CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+          if test -n "$ac_tool_prefix"; then
+    # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}cc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  fi
+fi
+if test -z "$CC"; then
+  # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+  ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+       ac_prog_rejected=yes
+       continue
+     fi
+    ac_cv_prog_CC="cc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+  # We found a bogon in the path, so make sure we never use it.
+  set dummy $ac_cv_prog_CC
+  shift
+  if test $# != 0; then
+    # We chose a different compiler from the bogus one.
+    # However, it has the same basename, so the bogon will be chosen
+    # first if we set CC to just the basename; use the full file name.
+    shift
+    ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+  fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+  if test -n "$ac_tool_prefix"; then
+  for ac_prog in cl.exe
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    test -n "$CC" && break
+  done
+fi
+if test -z "$CC"; then
+  ac_ct_CC=$CC
+  for ac_prog in cl.exe
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CC" && break
+done
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no acceptable C compiler found in \$PATH
+See \`config.log' for more details" "$LINENO" 5; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if ${ac_cv_c_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+  GCC=yes
+else
+  GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if ${ac_cv_prog_cc_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_c_werror_flag=$ac_c_werror_flag
+   ac_c_werror_flag=yes
+   ac_cv_prog_cc_g=no
+   CFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+else
+  CFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+  ac_c_werror_flag=$ac_save_c_werror_flag
+	 CFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+  CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+  if test "$GCC" = yes; then
+    CFLAGS="-g -O2"
+  else
+    CFLAGS="-g"
+  fi
+else
+  if test "$GCC" = yes; then
+    CFLAGS="-O2"
+  else
+    CFLAGS=
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdio.h>
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+     char **p;
+     int i;
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not '\xHH' hex character constants.
+   These don't provoke an error unfortunately, instead are silently treated
+   as 'x'.  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
+   array size at least.  It's necessary to write '\x00'==0 to get something
+   that's true only with -std.  */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+  x)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+  xno)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c89"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
+
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5
+$as_echo_n "checking how to run the C preprocessor... " >&6; }
+# On Suns, sometimes $CPP names a directory.
+if test -n "$CPP" && test -d "$CPP"; then
+  CPP=
+fi
+if test -z "$CPP"; then
+  if ${ac_cv_prog_CPP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+      # Double quotes because CPP needs to be expanded
+    for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp"
+    do
+      ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+  break
+fi
+
+    done
+    ac_cv_prog_CPP=$CPP
+
+fi
+  CPP=$ac_cv_prog_CPP
+else
+  ac_cv_prog_CPP=$CPP
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5
+$as_echo "$CPP" >&6; }
+ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "C preprocessor \"$CPP\" fails sanity check
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
+if ${ac_cv_path_GREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$GREP"; then
+  ac_path_GREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in grep ggrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_GREP" || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_GREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_GREP"; then
+    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+$as_echo "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+$as_echo_n "checking for egrep... " >&6; }
+if ${ac_cv_path_EGREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     if test -z "$EGREP"; then
+  ac_path_EGREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in egrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_EGREP" || continue
+# Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_EGREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_EGREP"; then
+    as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+   fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+$as_echo "$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
+$as_echo_n "checking for ANSI C header files... " >&6; }
+if ${ac_cv_header_stdc+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_header_stdc=yes
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then :
+  :
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+
+else
+  ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
+$as_echo "$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+$as_echo "#define STDC_HEADERS 1" >>confdefs.h
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+		  inttypes.h stdint.h unistd.h
+do :
+  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
+"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
+$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
+if ${ac_cv_c_bigendian+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_c_bigendian=unknown
+    # See if we're dealing with a universal compiler.
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifndef __APPLE_CC__
+	       not a universal capable compiler
+	     #endif
+	     typedef int dummy;
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+	# Check for potential -arch flags.  It is not universal unless
+	# there are at least two -arch flags with different values.
+	ac_arch=
+	ac_prev=
+	for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do
+	 if test -n "$ac_prev"; then
+	   case $ac_word in
+	     i?86 | x86_64 | ppc | ppc64)
+	       if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then
+		 ac_arch=$ac_word
+	       else
+		 ac_cv_c_bigendian=universal
+		 break
+	       fi
+	       ;;
+	   esac
+	   ac_prev=
+	 elif test "x$ac_word" = "x-arch"; then
+	   ac_prev=arch
+	 fi
+       done
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    if test $ac_cv_c_bigendian = unknown; then
+      # See if sys/param.h defines the BYTE_ORDER macro.
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+	     #include <sys/param.h>
+
+int
+main ()
+{
+#if ! (defined BYTE_ORDER && defined BIG_ENDIAN \
+		     && defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \
+		     && LITTLE_ENDIAN)
+	      bogus endian macros
+	     #endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  # It does; now see whether it defined to BIG_ENDIAN or not.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+		#include <sys/param.h>
+
+int
+main ()
+{
+#if BYTE_ORDER != BIG_ENDIAN
+		 not big endian
+		#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_c_bigendian=yes
+else
+  ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    fi
+    if test $ac_cv_c_bigendian = unknown; then
+      # See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris).
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+
+int
+main ()
+{
+#if ! (defined _LITTLE_ENDIAN || defined _BIG_ENDIAN)
+	      bogus endian macros
+	     #endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  # It does; now see whether it defined to _BIG_ENDIAN or not.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+
+int
+main ()
+{
+#ifndef _BIG_ENDIAN
+		 not big endian
+		#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_c_bigendian=yes
+else
+  ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    fi
+    if test $ac_cv_c_bigendian = unknown; then
+      # Compile a test program.
+      if test "$cross_compiling" = yes; then :
+  # Try to guess by grepping values from an object file.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+short int ascii_mm[] =
+		  { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
+		short int ascii_ii[] =
+		  { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
+		int use_ascii (int i) {
+		  return ascii_mm[i] + ascii_ii[i];
+		}
+		short int ebcdic_ii[] =
+		  { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
+		short int ebcdic_mm[] =
+		  { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
+		int use_ebcdic (int i) {
+		  return ebcdic_mm[i] + ebcdic_ii[i];
+		}
+		extern int foo;
+
+int
+main ()
+{
+return use_ascii (foo) == use_ebcdic (foo);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then
+	      ac_cv_c_bigendian=yes
+	    fi
+	    if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
+	      if test "$ac_cv_c_bigendian" = unknown; then
+		ac_cv_c_bigendian=no
+	      else
+		# finding both strings is unlikely to happen, but who knows?
+		ac_cv_c_bigendian=unknown
+	      fi
+	    fi
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+
+	     /* Are we little or big endian?  From Harbison&Steele.  */
+	     union
+	     {
+	       long int l;
+	       char c[sizeof (long int)];
+	     } u;
+	     u.l = 1;
+	     return u.c[sizeof (long int) - 1] == 1;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  ac_cv_c_bigendian=no
+else
+  ac_cv_c_bigendian=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+    fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5
+$as_echo "$ac_cv_c_bigendian" >&6; }
+ case $ac_cv_c_bigendian in #(
+   yes)
+     $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h
+;; #(
+   no)
+      ;; #(
+   universal)
+
+$as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
+
+     ;; #(
+   *)
+     as_fn_error $? "unknown endianness
+ presetting ac_cv_c_bigendian=no (or yes) will help" "$LINENO" 5 ;;
+ esac
+
+
+
+
+
+
+
+
+
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+	if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}pkg-config", so it can be a program name with args.
+set dummy ${ac_tool_prefix}pkg-config; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_path_PKG_CONFIG+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $PKG_CONFIG in
+  [\\/]* | ?:[\\/]*)
+  ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path.
+  ;;
+  *)
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  ;;
+esac
+fi
+PKG_CONFIG=$ac_cv_path_PKG_CONFIG
+if test -n "$PKG_CONFIG"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5
+$as_echo "$PKG_CONFIG" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_path_PKG_CONFIG"; then
+  ac_pt_PKG_CONFIG=$PKG_CONFIG
+  # Extract the first word of "pkg-config", so it can be a program name with args.
+set dummy pkg-config; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_path_ac_pt_PKG_CONFIG+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $ac_pt_PKG_CONFIG in
+  [\\/]* | ?:[\\/]*)
+  ac_cv_path_ac_pt_PKG_CONFIG="$ac_pt_PKG_CONFIG" # Let the user override the test with a path.
+  ;;
+  *)
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_path_ac_pt_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  ;;
+esac
+fi
+ac_pt_PKG_CONFIG=$ac_cv_path_ac_pt_PKG_CONFIG
+if test -n "$ac_pt_PKG_CONFIG"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_PKG_CONFIG" >&5
+$as_echo "$ac_pt_PKG_CONFIG" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+  if test "x$ac_pt_PKG_CONFIG" = x; then
+    PKG_CONFIG=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    PKG_CONFIG=$ac_pt_PKG_CONFIG
+  fi
+else
+  PKG_CONFIG="$ac_cv_path_PKG_CONFIG"
+fi
+
+fi
+if test -n "$PKG_CONFIG"; then
+	_pkg_min_version=0.9.0
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking pkg-config is at least version $_pkg_min_version" >&5
+$as_echo_n "checking pkg-config is at least version $_pkg_min_version... " >&6; }
+	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+	else
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+		PKG_CONFIG=""
+	fi
+fi
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SRC" >&5
+$as_echo_n "checking for SRC... " >&6; }
+
+if test -n "$SRC_CFLAGS"; then
+    pkg_cv_SRC_CFLAGS="$SRC_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"samplerate\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "samplerate") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SRC_CFLAGS=`$PKG_CONFIG --cflags "samplerate" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$SRC_LIBS"; then
+    pkg_cv_SRC_LIBS="$SRC_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"samplerate\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "samplerate") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SRC_LIBS=`$PKG_CONFIG --libs "samplerate" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        SRC_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "samplerate" 2>&1`
+        else
+	        SRC_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "samplerate" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$SRC_PKG_ERRORS" >&5
+
+	as_fn_error $? "Package requirements (samplerate) were not met:
+
+$SRC_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables SRC_CFLAGS
+and SRC_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables SRC_CFLAGS
+and SRC_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+	SRC_CFLAGS=$pkg_cv_SRC_CFLAGS
+	SRC_LIBS=$pkg_cv_SRC_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+
+
+
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SNDFILE" >&5
+$as_echo_n "checking for SNDFILE... " >&6; }
+
+if test -n "$SNDFILE_CFLAGS"; then
+    pkg_cv_SNDFILE_CFLAGS="$SNDFILE_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"sndfile\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "sndfile") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SNDFILE_CFLAGS=`$PKG_CONFIG --cflags "sndfile" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$SNDFILE_LIBS"; then
+    pkg_cv_SNDFILE_LIBS="$SNDFILE_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"sndfile\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "sndfile") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SNDFILE_LIBS=`$PKG_CONFIG --libs "sndfile" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        SNDFILE_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "sndfile" 2>&1`
+        else
+	        SNDFILE_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "sndfile" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$SNDFILE_PKG_ERRORS" >&5
+
+	as_fn_error $? "Package requirements (sndfile) were not met:
+
+$SNDFILE_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables SNDFILE_CFLAGS
+and SNDFILE_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables SNDFILE_CFLAGS
+and SNDFILE_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+	SNDFILE_CFLAGS=$pkg_cv_SNDFILE_CFLAGS
+	SNDFILE_LIBS=$pkg_cv_SNDFILE_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+
+
+
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for FFTW" >&5
+$as_echo_n "checking for FFTW... " >&6; }
+
+if test -n "$FFTW_CFLAGS"; then
+    pkg_cv_FFTW_CFLAGS="$FFTW_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"fftw3\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "fftw3") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_FFTW_CFLAGS=`$PKG_CONFIG --cflags "fftw3" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$FFTW_LIBS"; then
+    pkg_cv_FFTW_LIBS="$FFTW_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"fftw3\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "fftw3") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_FFTW_LIBS=`$PKG_CONFIG --libs "fftw3" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        FFTW_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "fftw3" 2>&1`
+        else
+	        FFTW_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "fftw3" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$FFTW_PKG_ERRORS" >&5
+
+	as_fn_error $? "Package requirements (fftw3) were not met:
+
+$FFTW_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables FFTW_CFLAGS
+and FFTW_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables FFTW_CFLAGS
+and FFTW_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+	FFTW_CFLAGS=$pkg_cv_FFTW_CFLAGS
+	FFTW_LIBS=$pkg_cv_FFTW_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+
+
+
+for ac_header in ladspa.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "ladspa.h" "ac_cv_header_ladspa_h" "$ac_includes_default"
+if test "x$ac_cv_header_ladspa_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LADSPA_H 1
+_ACEOF
+
+fi
+
+done
+
+for ac_header in pthread.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "pthread.h" "ac_cv_header_pthread_h" "$ac_includes_default"
+if test "x$ac_cv_header_pthread_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_PTHREAD_H 1
+_ACEOF
+
+fi
+
+done
+
+
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Vamp" >&5
+$as_echo_n "checking for Vamp... " >&6; }
+
+if test -n "$Vamp_CFLAGS"; then
+    pkg_cv_Vamp_CFLAGS="$Vamp_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"vamp-sdk\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "vamp-sdk") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_Vamp_CFLAGS=`$PKG_CONFIG --cflags "vamp-sdk" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$Vamp_LIBS"; then
+    pkg_cv_Vamp_LIBS="$Vamp_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"vamp-sdk\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "vamp-sdk") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_Vamp_LIBS=`$PKG_CONFIG --libs "vamp-sdk" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        Vamp_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "vamp-sdk" 2>&1`
+        else
+	        Vamp_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "vamp-sdk" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$Vamp_PKG_ERRORS" >&5
+
+	as_fn_error $? "Package requirements (vamp-sdk) were not met:
+
+$Vamp_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables Vamp_CFLAGS
+and Vamp_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables Vamp_CFLAGS
+and Vamp_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+	Vamp_CFLAGS=$pkg_cv_Vamp_CFLAGS
+	Vamp_LIBS=$pkg_cv_Vamp_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+
+
+
+if test "x$GCC" = "xyes"; then
+  case " $CFLAGS " in
+    *[\ \	]-fPIC\ -Wall[\ \	]*) ;;
+    *) CFLAGS="$CFLAGS -fPIC -Wall" ;;
+  esac
+  case " $CXXFLAGS " in
+    *[\ \	]-fPIC\ -Wall[\ \	]*) ;;
+    *) CXXFLAGS="$CXXFLAGS -fPIC -Wall" ;;
+  esac
+fi
+
+ac_config_files="$ac_config_files Makefile"
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems.  If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+  for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+
+  (set) 2>&1 |
+    case $as_nl`(ac_space=' '; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      # `set' does not quote correctly, so add quotes: double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \.
+      sed -n \
+	"s/'/'\\\\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+      ;; #(
+    *)
+      # `set' quotes correctly as required by POSIX, so do not add quotes.
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+) |
+  sed '
+     /^ac_cv_env_/b end
+     t clear
+     :clear
+     s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+     t end
+     s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+     :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+  if test -w "$cache_file"; then
+    if test "x$cache_file" != "x/dev/null"; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+      if test ! -f "$cache_file" || test -h "$cache_file"; then
+	cat confcache >"$cache_file"
+      else
+        case $cache_file in #(
+        */* | ?:*)
+	  mv -f confcache "$cache_file"$$ &&
+	  mv -f "$cache_file"$$ "$cache_file" ;; #(
+        *)
+	  mv -f confcache "$cache_file" ;;
+	esac
+      fi
+    fi
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+  fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section.  Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[	 ]*#[	 ]*define[	 ][	 ]*\([^	 (][^	 (]*([^)]*)\)[	 ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[	 ]*#[	 ]*define[	 ][	 ]*\([^	 ][^	 ]*\)[	 ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[	 `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+	g
+	s/^\n//
+	s/\n/ /g
+	p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+U=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+  # 1. Remove the extension, and $U if already installed.
+  ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+  ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+  # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
+  #    will be set to the directory where LIBOBJS objects are built.
+  as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+  as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+
+: "${CONFIG_STATUS=./config.status}"
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+as_write_fail=0
+cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='print -r --'
+  as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='printf %s\n'
+  as_echo_n='printf %s'
+else
+  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+    as_echo_n='/usr/ucb/echo -n'
+  else
+    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+    as_echo_n_body='eval
+      arg=$1;
+      case $arg in #(
+      *"$as_nl"*)
+	expr "X$arg" : "X\\(.*\\)$as_nl";
+	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+      esac;
+      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+    '
+    export as_echo_n_body
+    as_echo_n='sh -c $as_echo_n_body as_echo'
+  fi
+  export as_echo_body
+  as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there.  '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  $as_echo "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else
+  as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else
+  as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  }
+fi # as_fn_arith
+
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+    # In both cases, we have to default to `cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+## ----------------------------------- ##
+## Main body of $CONFIG_STATUS script. ##
+## ----------------------------------- ##
+_ASEOF
+test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by RubberBand $as_me 1.7, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
+
+  CONFIG_FILES    = $CONFIG_FILES
+  CONFIG_HEADERS  = $CONFIG_HEADERS
+  CONFIG_LINKS    = $CONFIG_LINKS
+  CONFIG_COMMANDS = $CONFIG_COMMANDS
+  $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files and other configuration actions
+from templates according to the current configuration.  Unless the files
+and actions are specified as TAGs, all are instantiated by default.
+
+Usage: $0 [OPTION]... [TAG]...
+
+  -h, --help       print this help, then exit
+  -V, --version    print version number and configuration settings, then exit
+      --config     print configuration, then exit
+  -q, --quiet, --silent
+                   do not print progress messages
+  -d, --debug      don't remove temporary files
+      --recheck    update $as_me by reconfiguring in the same conditions
+      --file=FILE[:TEMPLATE]
+                   instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <chris.cannam@breakfastquay.com>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
+ac_cs_version="\\
+RubberBand config.status 1.7
+configured by $0, generated by GNU Autoconf 2.69,
+  with options \\"\$ac_cs_config\\"
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+  case $1 in
+  --*=?*)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+    ac_shift=:
+    ;;
+  --*=)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=
+    ac_shift=:
+    ;;
+  *)
+    ac_option=$1
+    ac_optarg=$2
+    ac_shift=shift
+    ;;
+  esac
+
+  case $ac_option in
+  # Handling of the options.
+  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+    ac_cs_recheck=: ;;
+  --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+    $as_echo "$ac_cs_version"; exit ;;
+  --config | --confi | --conf | --con | --co | --c )
+    $as_echo "$ac_cs_config"; exit ;;
+  --debug | --debu | --deb | --de | --d | -d )
+    debug=: ;;
+  --file | --fil | --fi | --f )
+    $ac_shift
+    case $ac_optarg in
+    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    '') as_fn_error $? "missing file argument" ;;
+    esac
+    as_fn_append CONFIG_FILES " '$ac_optarg'"
+    ac_need_defaults=false;;
+  --he | --h |  --help | --hel | -h )
+    $as_echo "$ac_cs_usage"; exit ;;
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil | --si | --s)
+    ac_cs_silent=: ;;
+
+  # This is an error.
+  -*) as_fn_error $? "unrecognized option: \`$1'
+Try \`$0 --help' for more information." ;;
+
+  *) as_fn_append ac_config_targets " $1"
+     ac_need_defaults=false ;;
+
+  esac
+  shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+  exec 6>/dev/null
+  ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+  set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+  shift
+  \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+  CONFIG_SHELL='$SHELL'
+  export CONFIG_SHELL
+  exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+  echo
+  sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+  $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+  case $ac_config_target in
+    "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
+
+  *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
+  esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used.  Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+  test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience.  Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+  tmp= ac_tmp=
+  trap 'exit_status=$?
+  : "${ac_tmp:=$tmp}"
+  { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
+' 0
+  trap 'as_fn_exit 1' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+  tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+  test -d "$tmp"
+}  ||
+{
+  tmp=./conf$$-$RANDOM
+  (umask 077 && mkdir "$tmp")
+} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+ac_tmp=$tmp
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=`echo X | tr X '\015'`
+# On cygwin, bash can eat \r inside `` if the user requested igncr.
+# But we know of no other shell where ac_cr would be empty at this
+# point, so we can use a bashism as a fallback.
+if test "x$ac_cr" = x; then
+  eval ac_cr=\$\'\\r\'
+fi
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+  ac_cs_awk_cr='\\r'
+else
+  ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$ac_tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+  echo "cat >conf$$subs.awk <<_ACEOF" &&
+  echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+  echo "_ACEOF"
+} >conf$$subs.sh ||
+  as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  . ./conf$$subs.sh ||
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+
+  ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+  if test $ac_delim_n = $ac_delim_num; then
+    break
+  elif $ac_last_try; then
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\)..*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\)..*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+  N
+  s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$ac_tmp/subs1.awk" <<_ACAWK &&
+  for (key in S) S_is_set[key] = 1
+  FS = ""
+
+}
+{
+  line = $ 0
+  nfields = split(line, field, "@")
+  substed = 0
+  len = length(field[1])
+  for (i = 2; i < nfields; i++) {
+    key = field[i]
+    keylen = length(key)
+    if (S_is_set[key]) {
+      value = S[key]
+      line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+      len += length(value) + length(field[++i])
+      substed = 1
+    } else
+      len += 1 + keylen
+  }
+
+  print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+  sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+  cat
+fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \
+  || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
+# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+  ac_vpsub='/^[	 ]*VPATH[	 ]*=[	 ]*/{
+h
+s///
+s/^/:/
+s/[	 ]*$/:/
+s/:\$(srcdir):/:/g
+s/:\${srcdir}:/:/g
+s/:@srcdir@:/:/g
+s/^:*//
+s/:*$//
+x
+s/\(=[	 ]*\).*/\1/
+G
+s/\n//
+s/^[^=]*=[	 ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X "  :F $CONFIG_FILES      "
+shift
+for ac_tag
+do
+  case $ac_tag in
+  :[FHLC]) ac_mode=$ac_tag; continue;;
+  esac
+  case $ac_mode$ac_tag in
+  :[FHL]*:*);;
+  :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;;
+  :[FH]-) ac_tag=-:-;;
+  :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+  esac
+  ac_save_IFS=$IFS
+  IFS=:
+  set x $ac_tag
+  IFS=$ac_save_IFS
+  shift
+  ac_file=$1
+  shift
+
+  case $ac_mode in
+  :L) ac_source=$1;;
+  :[FH])
+    ac_file_inputs=
+    for ac_f
+    do
+      case $ac_f in
+      -) ac_f="$ac_tmp/stdin";;
+      *) # Look for the file first in the build tree, then in the source tree
+	 # (if the path is not absolute).  The absolute path cannot be DOS-style,
+	 # because $ac_f cannot contain `:'.
+	 test -f "$ac_f" ||
+	   case $ac_f in
+	   [\\/$]*) false;;
+	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+	   esac ||
+	   as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;;
+      esac
+      case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+      as_fn_append ac_file_inputs " '$ac_f'"
+    done
+
+    # Let's still pretend it is `configure' which instantiates (i.e., don't
+    # use $as_me), people would be surprised to read:
+    #    /* config.h.  Generated by config.status.  */
+    configure_input='Generated from '`
+	  $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+	`' by configure.'
+    if test x"$ac_file" != x-; then
+      configure_input="$ac_file.  $configure_input"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+    fi
+    # Neutralize special characters interpreted by sed in replacement strings.
+    case $configure_input in #(
+    *\&* | *\|* | *\\* )
+       ac_sed_conf_input=`$as_echo "$configure_input" |
+       sed 's/[\\\\&|]/\\\\&/g'`;; #(
+    *) ac_sed_conf_input=$configure_input;;
+    esac
+
+    case $ac_tag in
+    *:-:* | *:-) cat >"$ac_tmp/stdin" \
+      || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;;
+    esac
+    ;;
+  esac
+
+  ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$ac_file" : 'X\(//\)[^/]' \| \
+	 X"$ac_file" : 'X\(//\)$' \| \
+	 X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  as_dir="$ac_dir"; as_fn_mkdir_p
+  ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+  case $ac_mode in
+  :F)
+  #
+  # CONFIG_FILE
+  #
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+ac_sed_dataroot='
+/datarootdir/ {
+  p
+  q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+  ac_datarootdir_hack='
+  s&@datadir@&$datadir&g
+  s&@docdir@&$docdir&g
+  s&@infodir@&$infodir&g
+  s&@localedir@&$localedir&g
+  s&@mandir@&$mandir&g
+  s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \
+  >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+  { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
+  { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' \
+      "$ac_tmp/out"`; test -z "$ac_out"; } &&
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&2;}
+
+  rm -f "$ac_tmp/stdin"
+  case $ac_file in
+  -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";;
+  *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";;
+  esac \
+  || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+ ;;
+
+
+
+  esac
+
+done # for ac_tag
+
+
+as_fn_exit 0
+_ACEOF
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+  as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded.  So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status.  When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+  ac_cs_success=:
+  ac_config_status_args=
+  test "$silent" = yes &&
+    ac_config_status_args="$ac_config_status_args --quiet"
+  exec 5>/dev/null
+  $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+  exec 5>>config.log
+  # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+  # would make configure fail if this is the last instruction.
+  $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/configure.ac
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/configure.ac	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,42 @@
+
+AC_INIT(RubberBand, 1.8.1, chris.cannam@breakfastquay.com)
+
+AC_CONFIG_SRCDIR(src/StretcherImpl.h)
+AC_PROG_CXX
+AC_HEADER_STDC
+AC_C_BIGENDIAN
+
+PKG_CHECK_MODULES([SRC],[samplerate])
+AC_SUBST(SRC_CFLAGS)
+AC_SUBST(SRC_LIBS)
+
+PKG_CHECK_MODULES([SNDFILE],[sndfile])
+AC_SUBST(SNDFILE_CFLAGS)
+AC_SUBST(SNDFILE_LIBS)
+
+PKG_CHECK_MODULES([FFTW],[fftw3])
+AC_SUBST(FFTW_CFLAGS)
+AC_SUBST(FFTW_LIBS)
+
+AC_CHECK_HEADERS(ladspa.h)
+AC_CHECK_HEADERS(pthread.h)
+
+PKG_CHECK_MODULES([Vamp],[vamp-sdk])
+AC_SUBST(Vamp_CFLAGS)
+AC_SUBST(Vamp_LIBS)
+
+changequote(,)dnl
+if test "x$GCC" = "xyes"; then
+  case " $CFLAGS " in
+    *[\ \	]-fPIC\ -Wall[\ \	]*) ;;
+    *) CFLAGS="$CFLAGS -fPIC -Wall" ;;
+  esac
+  case " $CXXFLAGS " in
+    *[\ \	]-fPIC\ -Wall[\ \	]*) ;;
+    *) CXXFLAGS="$CXXFLAGS -fPIC -Wall" ;;
+  esac
+fi
+changequote([,])dnl
+
+AC_OUTPUT([Makefile])
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/RubberBandPitchShifter.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/RubberBandPitchShifter.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,563 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "RubberBandPitchShifter.h"
+
+#include "RubberBandStretcher.h"
+
+#include <iostream>
+#include <cmath>
+
+using namespace RubberBand;
+
+using std::cout;
+using std::cerr;
+using std::endl;
+using std::min;
+
+const char *const
+RubberBandPitchShifter::portNamesMono[PortCountMono] =
+{
+    "latency",
+    "Cents",
+    "Semitones",
+    "Octaves",
+    "Crispness",
+    "Formant Preserving",
+    "Faster",
+    "Input",
+    "Output"
+};
+
+const char *const
+RubberBandPitchShifter::portNamesStereo[PortCountStereo] =
+{
+    "latency",
+    "Cents",
+    "Semitones",
+    "Octaves",
+    "Crispness",
+    "Formant Preserving",
+    "Faster",
+    "Input L",
+    "Output L",
+    "Input R",
+    "Output R"
+};
+
+const LADSPA_PortDescriptor 
+RubberBandPitchShifter::portsMono[PortCountMono] =
+{
+    LADSPA_PORT_OUTPUT | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_AUDIO,
+    LADSPA_PORT_OUTPUT | LADSPA_PORT_AUDIO
+};
+
+const LADSPA_PortDescriptor 
+RubberBandPitchShifter::portsStereo[PortCountStereo] =
+{
+    LADSPA_PORT_OUTPUT | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_CONTROL,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_AUDIO,
+    LADSPA_PORT_OUTPUT | LADSPA_PORT_AUDIO,
+    LADSPA_PORT_INPUT  | LADSPA_PORT_AUDIO,
+    LADSPA_PORT_OUTPUT | LADSPA_PORT_AUDIO
+};
+
+const LADSPA_PortRangeHint 
+RubberBandPitchShifter::hintsMono[PortCountMono] =
+{
+    { 0, 0, 0 },                        // latency
+    { LADSPA_HINT_DEFAULT_0 |           // cents
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE,
+      -100.0, 100.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // semitones
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+      -12.0, 12.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // octaves
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+      -3.0, 3.0 },
+    { LADSPA_HINT_DEFAULT_MAXIMUM |     // crispness
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+       0.0, 3.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // formant preserving
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_TOGGLED,
+       0.0, 1.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // fast
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_TOGGLED,
+       0.0, 1.0 },
+    { 0, 0, 0 },
+    { 0, 0, 0 }
+};
+
+const LADSPA_PortRangeHint 
+RubberBandPitchShifter::hintsStereo[PortCountStereo] =
+{
+    { 0, 0, 0 },                        // latency
+    { LADSPA_HINT_DEFAULT_0 |           // cents
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE,
+      -100.0, 100.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // semitones
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+      -12.0, 12.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // octaves
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+      -3.0, 3.0 },
+    { LADSPA_HINT_DEFAULT_MAXIMUM |     // crispness
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_INTEGER,
+       0.0, 3.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // formant preserving
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_TOGGLED,
+       0.0, 1.0 },
+    { LADSPA_HINT_DEFAULT_0 |           // fast
+      LADSPA_HINT_BOUNDED_BELOW |
+      LADSPA_HINT_BOUNDED_ABOVE |
+      LADSPA_HINT_TOGGLED,
+       0.0, 1.0 },
+    { 0, 0, 0 },
+    { 0, 0, 0 },
+    { 0, 0, 0 },
+    { 0, 0, 0 }
+};
+
+const LADSPA_Properties
+RubberBandPitchShifter::properties = LADSPA_PROPERTY_HARD_RT_CAPABLE;
+
+const LADSPA_Descriptor 
+RubberBandPitchShifter::ladspaDescriptorMono =
+{
+    2979, // "Unique" ID
+    "rubberband-pitchshifter-mono", // Label
+    properties,
+    "Rubber Band Mono Pitch Shifter", // Name
+    "Breakfast Quay",
+    "GPL",
+    PortCountMono,
+    portsMono,
+    portNamesMono,
+    hintsMono,
+    0, // Implementation data
+    instantiate,
+    connectPort,
+    activate,
+    run,
+    0, // Run adding
+    0, // Set run adding gain
+    deactivate,
+    cleanup
+};
+
+const LADSPA_Descriptor 
+RubberBandPitchShifter::ladspaDescriptorStereo =
+{
+    9792, // "Unique" ID
+    "rubberband-pitchshifter-stereo", // Label
+    properties,
+    "Rubber Band Stereo Pitch Shifter", // Name
+    "Breakfast Quay",
+    "GPL",
+    PortCountStereo,
+    portsStereo,
+    portNamesStereo,
+    hintsStereo,
+    0, // Implementation data
+    instantiate,
+    connectPort,
+    activate,
+    run,
+    0, // Run adding
+    0, // Set run adding gain
+    deactivate,
+    cleanup
+};
+
+const LADSPA_Descriptor *
+RubberBandPitchShifter::getDescriptor(unsigned long index)
+{
+    if (index == 0) return &ladspaDescriptorMono;
+    if (index == 1) return &ladspaDescriptorStereo;
+    else return 0;
+}
+
+RubberBandPitchShifter::RubberBandPitchShifter(int sampleRate, size_t channels) :
+    m_latency(0),
+    m_cents(0),
+    m_semitones(0),
+    m_octaves(0),
+    m_crispness(0),
+    m_formant(0),
+    m_fast(0),
+    m_ratio(1.0),
+    m_prevRatio(1.0),
+    m_currentCrispness(-1),
+    m_currentFormant(false),
+    m_currentFast(false),
+    m_blockSize(1024),
+    m_reserve(1024),
+    m_minfill(0),
+    m_stretcher(new RubberBandStretcher
+                (sampleRate, channels,
+                 RubberBandStretcher::OptionProcessRealTime |
+                 RubberBandStretcher::OptionPitchHighConsistency)),
+    m_sampleRate(sampleRate),
+    m_channels(channels)
+{
+    for (size_t c = 0; c < m_channels; ++c) {
+
+        m_input[c] = 0;
+        m_output[c] = 0;
+
+        int bufsize = m_blockSize + m_reserve + 8192;
+
+        m_outputBuffer[c] = new RingBuffer<float>(bufsize);
+
+        m_scratch[c] = new float[bufsize];
+        for (int i = 0; i < bufsize; ++i) m_scratch[c][i] = 0.f;
+    }
+
+    activateImpl();
+}
+
+RubberBandPitchShifter::~RubberBandPitchShifter()
+{
+    delete m_stretcher;
+    for (size_t c = 0; c < m_channels; ++c) {
+        delete m_outputBuffer[c];
+        delete[] m_scratch[c];
+    }
+}
+    
+LADSPA_Handle
+RubberBandPitchShifter::instantiate(const LADSPA_Descriptor *desc, unsigned long rate)
+{
+    if (desc->PortCount == ladspaDescriptorMono.PortCount) {
+        return new RubberBandPitchShifter(rate, 1);
+    } else if (desc->PortCount == ladspaDescriptorStereo.PortCount) {
+        return new RubberBandPitchShifter(rate, 2);
+    }
+    return 0;
+}
+
+void
+RubberBandPitchShifter::connectPort(LADSPA_Handle handle,
+				    unsigned long port, LADSPA_Data *location)
+{
+    RubberBandPitchShifter *shifter = (RubberBandPitchShifter *)handle;
+
+    float **ports[PortCountStereo] = {
+        &shifter->m_latency,
+	&shifter->m_cents,
+	&shifter->m_semitones,
+	&shifter->m_octaves,
+        &shifter->m_crispness,
+	&shifter->m_formant,
+	&shifter->m_fast,
+    	&shifter->m_input[0],
+	&shifter->m_output[0],
+	&shifter->m_input[1],
+	&shifter->m_output[1]
+    };
+
+    if (shifter->m_channels == 1) {
+        if (port >= PortCountMono) return;
+    } else {
+        if (port >= PortCountStereo) return;
+    }
+
+    *ports[port] = (float *)location;
+
+    if (shifter->m_latency) {
+        *(shifter->m_latency) =
+            float(shifter->m_stretcher->getLatency() + shifter->m_reserve);
+    }
+}
+
+void
+RubberBandPitchShifter::activate(LADSPA_Handle handle)
+{
+    RubberBandPitchShifter *shifter = (RubberBandPitchShifter *)handle;
+    shifter->activateImpl();
+}
+
+void
+RubberBandPitchShifter::activateImpl()
+{
+    updateRatio();
+    m_prevRatio = m_ratio;
+    m_stretcher->reset();
+    m_stretcher->setPitchScale(m_ratio);
+
+    for (size_t c = 0; c < m_channels; ++c) {
+        m_outputBuffer[c]->reset();
+        m_outputBuffer[c]->zero(m_reserve);
+    }
+
+    m_minfill = 0;
+
+    // prime stretcher
+//    for (int i = 0; i < 8; ++i) {
+//        int reqd = m_stretcher->getSamplesRequired();
+//        m_stretcher->process(m_scratch, reqd, false);
+//        int avail = m_stretcher->available();
+//        if (avail > 0) {
+//            m_stretcher->retrieve(m_scratch, avail);
+//        }
+//    }
+}
+
+void
+RubberBandPitchShifter::run(LADSPA_Handle handle, unsigned long samples)
+{
+    RubberBandPitchShifter *shifter = (RubberBandPitchShifter *)handle;
+    shifter->runImpl(samples);
+}
+
+void
+RubberBandPitchShifter::updateRatio()
+{
+    double oct = (m_octaves ? *m_octaves : 0.0);
+    oct += (m_semitones ? *m_semitones : 0.0) / 12;
+    oct += (m_cents ? *m_cents : 0.0) / 1200;
+    m_ratio = pow(2.0, oct);
+}
+
+void
+RubberBandPitchShifter::updateCrispness()
+{
+    if (!m_crispness) return;
+    
+    int c = lrintf(*m_crispness);
+    if (c == m_currentCrispness) return;
+    if (c < 0 || c > 3) return;
+    RubberBandStretcher *s = m_stretcher;
+
+    switch (c) {
+    case 0:
+        s->setPhaseOption(RubberBandStretcher::OptionPhaseIndependent);
+        s->setTransientsOption(RubberBandStretcher::OptionTransientsSmooth);
+        break;
+    case 1:
+        s->setPhaseOption(RubberBandStretcher::OptionPhaseLaminar);
+        s->setTransientsOption(RubberBandStretcher::OptionTransientsSmooth);
+        break;
+    case 2:
+        s->setPhaseOption(RubberBandStretcher::OptionPhaseLaminar);
+        s->setTransientsOption(RubberBandStretcher::OptionTransientsMixed);
+        break;
+    case 3:
+        s->setPhaseOption(RubberBandStretcher::OptionPhaseLaminar);
+        s->setTransientsOption(RubberBandStretcher::OptionTransientsCrisp);
+        break;
+    }
+
+    m_currentCrispness = c;
+}
+
+void
+RubberBandPitchShifter::updateFormant()
+{
+    if (!m_formant) return;
+
+    bool f = (*m_formant > 0.5f);
+    if (f == m_currentFormant) return;
+    
+    RubberBandStretcher *s = m_stretcher;
+    
+    s->setFormantOption(f ?
+                        RubberBandStretcher::OptionFormantPreserved :
+                        RubberBandStretcher::OptionFormantShifted);
+
+    m_currentFormant = f;
+}
+
+void
+RubberBandPitchShifter::updateFast()
+{
+    if (!m_fast) return;
+
+    bool f = (*m_fast > 0.5f);
+    if (f == m_currentFast) return;
+    
+    RubberBandStretcher *s = m_stretcher;
+    
+    s->setPitchOption(f ?
+                      RubberBandStretcher::OptionPitchHighSpeed :
+                      RubberBandStretcher::OptionPitchHighConsistency);
+
+    m_currentFast = f;
+}
+
+void
+RubberBandPitchShifter::runImpl(unsigned long insamples)
+{
+    unsigned long offset = 0;
+
+    // We have to break up the input into chunks like this because
+    // insamples could be arbitrarily large and our output buffer is
+    // of limited size
+
+    while (offset < insamples) {
+
+        unsigned long block = (unsigned long)m_blockSize;
+        if (block + offset > insamples) block = insamples - offset;
+
+        runImpl(block, offset);
+
+        offset += block;
+    }
+}
+
+void
+RubberBandPitchShifter::runImpl(unsigned long insamples, unsigned long offset)
+{
+//    cerr << "RubberBandPitchShifter::runImpl(" << insamples << ")" << endl;
+
+//    static int incount = 0, outcount = 0;
+
+    updateRatio();
+    if (m_ratio != m_prevRatio) {
+        m_stretcher->setPitchScale(m_ratio);
+        m_prevRatio = m_ratio;
+    }
+
+    if (m_latency) {
+        *m_latency = float(m_stretcher->getLatency() + m_reserve);
+//        cerr << "latency = " << *m_latency << endl;
+    }
+
+    updateCrispness();
+    updateFormant();
+    updateFast();
+
+    const int samples = insamples;
+    int processed = 0;
+    size_t outTotal = 0;
+
+    float *ptrs[2];
+
+    int rs = m_outputBuffer[0]->getReadSpace();
+    if (rs < int(m_minfill)) {
+//        cerr << "temporary expansion (have " << rs << ", want " << m_reserve << ")" << endl;
+        m_stretcher->setTimeRatio(1.1); // fill up temporarily
+    } else if (rs > 8192) {
+//        cerr << "temporary reduction (have " << rs << ", want " << m_reserve << ")" << endl;
+        m_stretcher->setTimeRatio(0.9); // reduce temporarily
+    } else {
+        m_stretcher->setTimeRatio(1.0);
+    }
+
+    while (processed < samples) {
+
+        // never feed more than the minimum necessary number of
+        // samples at a time; ensures nothing will overflow internally
+        // and we don't need to call setMaxProcessSize
+
+        int toCauseProcessing = m_stretcher->getSamplesRequired();
+        int inchunk = min(samples - processed, toCauseProcessing);
+        for (size_t c = 0; c < m_channels; ++c) {
+            ptrs[c] = &(m_input[c][offset + processed]);
+        }
+        m_stretcher->process(ptrs, inchunk, false);
+        processed += inchunk;
+
+        int avail = m_stretcher->available();
+        int writable = m_outputBuffer[0]->getWriteSpace();
+        int outchunk = min(avail, writable);
+        size_t actual = m_stretcher->retrieve(m_scratch, outchunk);
+        outTotal += actual;
+
+//        incount += inchunk;
+//        outcount += actual;
+
+//        cout << "avail: " << avail << ", outchunk = " << outchunk;
+//        if (actual != outchunk) cout << " (" << actual << ")";
+//        cout << endl;
+
+        outchunk = actual;
+
+        for (size_t c = 0; c < m_channels; ++c) {
+            if (int(m_outputBuffer[c]->getWriteSpace()) < outchunk) {
+                cerr << "RubberBandPitchShifter::runImpl: buffer overrun: chunk = " << outchunk << ", space = " << m_outputBuffer[c]->getWriteSpace() << endl;
+            }                
+            m_outputBuffer[c]->write(m_scratch[c], outchunk);
+        }
+    }
+    
+    for (size_t c = 0; c < m_channels; ++c) {
+        int toRead = m_outputBuffer[c]->getReadSpace();
+        if (toRead < samples && c == 0) {
+            cerr << "RubberBandPitchShifter::runImpl: buffer underrun: required = " << samples << ", available = " << toRead << endl;
+        }
+        int chunk = min(toRead, samples);
+        m_outputBuffer[c]->read(&(m_output[c][offset]), chunk);
+    }
+
+    if (m_minfill == 0) {
+        m_minfill = m_outputBuffer[0]->getReadSpace();
+//        cerr << "minfill = " << m_minfill << endl;
+    }
+}
+
+void
+RubberBandPitchShifter::deactivate(LADSPA_Handle handle)
+{
+    activate(handle); // both functions just reset the plugin
+}
+
+void
+RubberBandPitchShifter::cleanup(LADSPA_Handle handle)
+{
+    delete (RubberBandPitchShifter *)handle;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/RubberBandPitchShifter.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/RubberBandPitchShifter.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,116 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_PITCH_SHIFTER_H_
+#define _RUBBERBAND_PITCH_SHIFTER_H_
+
+#include <ladspa.h>
+
+#include "base/RingBuffer.h"
+
+namespace RubberBand {
+class RubberBandStretcher;
+}
+
+class RubberBandPitchShifter
+{
+public:
+    static const LADSPA_Descriptor *getDescriptor(unsigned long index);
+    
+protected:
+    RubberBandPitchShifter(int sampleRate, size_t channels);
+    ~RubberBandPitchShifter();
+
+    enum {
+        LatencyPort      = 0,
+	OctavesPort      = 1,
+	SemitonesPort    = 2,
+	CentsPort        = 3,
+        CrispnessPort    = 4,
+	FormantPort      = 5,
+	FastPort         = 6,
+	InputPort1       = 7,
+        OutputPort1      = 8,
+        PortCountMono    = OutputPort1 + 1,
+        InputPort2       = 9,
+        OutputPort2      = 10,
+        PortCountStereo  = OutputPort2 + 1
+    };
+
+    static const char *const portNamesMono[PortCountMono];
+    static const LADSPA_PortDescriptor portsMono[PortCountMono];
+    static const LADSPA_PortRangeHint hintsMono[PortCountMono];
+
+    static const char *const portNamesStereo[PortCountStereo];
+    static const LADSPA_PortDescriptor portsStereo[PortCountStereo];
+    static const LADSPA_PortRangeHint hintsStereo[PortCountStereo];
+
+    static const LADSPA_Properties properties;
+
+    static const LADSPA_Descriptor ladspaDescriptorMono;
+    static const LADSPA_Descriptor ladspaDescriptorStereo;
+
+    static LADSPA_Handle instantiate(const LADSPA_Descriptor *, unsigned long);
+    static void connectPort(LADSPA_Handle, unsigned long, LADSPA_Data *);
+    static void activate(LADSPA_Handle);
+    static void run(LADSPA_Handle, unsigned long);
+    static void deactivate(LADSPA_Handle);
+    static void cleanup(LADSPA_Handle);
+
+    void activateImpl();
+    void runImpl(unsigned long);
+    void runImpl(unsigned long, unsigned long offset);
+    void updateRatio();
+    void updateCrispness();
+    void updateFormant();
+    void updateFast();
+
+    float *m_input[2];
+    float *m_output[2];
+    float *m_latency;
+    float *m_cents;
+    float *m_semitones;
+    float *m_octaves;
+    float *m_crispness;
+    float *m_formant;
+    float *m_fast;
+    double m_ratio;
+    double m_prevRatio;
+    int m_currentCrispness;
+    bool m_currentFormant;
+    bool m_currentFast;
+
+    size_t m_blockSize;
+    size_t m_reserve;
+    size_t m_minfill;
+
+    RubberBand::RubberBandStretcher *m_stretcher;
+    RubberBand::RingBuffer<float> *m_outputBuffer[2];
+    float *m_scratch[2];
+
+    int m_sampleRate;
+    size_t m_channels;
+};
+
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/ladspa-plugin.map
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/ladspa-plugin.map	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+{
+	global: ladspa_descriptor;
+	local: *;
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/ladspa-rubberband.cat
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/ladspa-rubberband.cat	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,2 @@
+ladspa:ladspa-rubberband:rubberband-pitchshifter-mono::Frequency > Pitch shifters
+ladspa:ladspa-rubberband:rubberband-pitchshifter-stereo::Frequency > Pitch shifters
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/ladspa-rubberband.rdf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/ladspa-rubberband.rdf	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,14 @@
+<?xml version='1.0' encoding='ISO-8859-1'?>
+
+<!DOCTYPE rdf:RDF [
+       <!ENTITY rdf 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
+       <!ENTITY ladspa 'http://ladspa.org/ontology#'>
+]>
+
+<rdf:RDF xmlns:rdf="&rdf;" xmlns:ladspa="&ladspa;">
+
+<ladspa:PitchPlugin rdf:about="&ladspa;2979"/>
+<ladspa:PitchPlugin rdf:about="&ladspa;9792"/>
+
+</rdf:RDF>
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/ladspa/libmain.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/ladspa/libmain.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "RubberBandPitchShifter.h"
+
+#include <stdio.h>
+
+extern "C" {
+
+const LADSPA_Descriptor *ladspa_descriptor(unsigned long index)
+{
+    return RubberBandPitchShifter::getDescriptor(index);
+}
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/main/main.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/main/main.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,648 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "rubberband/RubberBandStretcher.h"
+
+#include <iostream>
+#include <sndfile.h>
+#include <cmath>
+#include <time.h>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#include <fstream>
+
+#include "system/sysutils.h"
+
+#ifdef __MSVC__
+#include "getopt/getopt.h"
+#else
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/time.h>
+#endif
+
+#include "base/Profiler.h"
+
+using namespace std;
+using namespace RubberBand;
+
+#ifdef _WIN32
+using RubberBand::gettimeofday;
+#endif
+
+#ifdef __MSVC__
+using RubberBand::usleep;
+#endif
+
+double tempo_convert(const char *str)
+{
+    char *d = strchr((char *)str, ':');
+
+    if (!d || !*d) {
+        double m = atof(str);
+        if (m != 0.0) return 1.0 / m;
+        else return 1.0;
+    }
+
+    char *a = strdup(str);
+    char *b = strdup(d+1);
+    a[d-str] = '\0';
+    double m = atof(a);
+    double n = atof(b);
+    free(a);
+    free(b);
+    if (n != 0.0 && m != 0.0) return m / n;
+    else return 1.0;
+}
+
+int main(int argc, char **argv)
+{
+    int c;
+
+    double ratio = 1.0;
+    double duration = 0.0;
+    double pitchshift = 0.0;
+    double frequencyshift = 1.0;
+    int debug = 0;
+    bool realtime = false;
+    bool precise = true;
+    int threading = 0;
+    bool lamination = true;
+    bool longwin = false;
+    bool shortwin = false;
+    bool smoothing = false;
+    bool hqpitch = false;
+    bool formant = false;
+    bool together = false;
+    bool crispchanged = false;
+    int crispness = -1;
+    bool help = false;
+    bool version = false;
+    bool quiet = false;
+
+    bool haveRatio = false;
+
+    std::string mapfile;
+
+    enum {
+        NoTransients,
+        BandLimitedTransients,
+        Transients
+    } transients = Transients;
+
+    enum {
+        CompoundDetector,
+        PercussiveDetector,
+        SoftDetector
+    } detector = CompoundDetector;
+
+    while (1) {
+        int optionIndex = 0;
+
+        static struct option longOpts[] = {
+            { "help",          0, 0, 'h' },
+            { "version",       0, 0, 'V' },
+            { "time",          1, 0, 't' },
+            { "tempo",         1, 0, 'T' },
+            { "duration",      1, 0, 'D' },
+            { "pitch",         1, 0, 'p' },
+            { "frequency",     1, 0, 'f' },
+            { "crisp",         1, 0, 'c' },
+            { "crispness",     1, 0, 'c' },
+            { "debug",         1, 0, 'd' },
+            { "realtime",      0, 0, 'R' },
+            { "loose",         0, 0, 'L' },
+            { "precise",       0, 0, 'P' },
+            { "formant",       0, 0, 'F' },
+            { "no-threads",    0, 0, '0' },
+            { "no-transients", 0, 0, '1' },
+            { "no-lamination", 0, 0, '2' },
+            { "centre-focus",  0, 0, '7' },
+            { "window-long",   0, 0, '3' },
+            { "window-short",  0, 0, '4' },
+            { "bl-transients", 0, 0, '8' },
+            { "detector-perc", 0, 0, '5' },
+            { "detector-soft", 0, 0, '6' },
+            { "smoothing",     0, 0, '9' },
+            { "pitch-hq",      0, 0, '%' },
+            { "threads",       0, 0, '@' },
+            { "quiet",         0, 0, 'q' },
+            { "timemap",       1, 0, 'M' },
+            { 0, 0, 0, 0 }
+        };
+
+        c = getopt_long(argc, argv,
+                        "t:p:d:RLPFc:f:T:D:qhVM:",
+                        longOpts, &optionIndex);
+        if (c == -1) break;
+
+        switch (c) {
+        case 'h': help = true; break;
+        case 'V': version = true; break;
+        case 't': ratio *= atof(optarg); haveRatio = true; break;
+        case 'T': ratio *= tempo_convert(optarg); haveRatio = true; break;
+        case 'D': duration = atof(optarg); haveRatio = true; break;
+        case 'p': pitchshift = atof(optarg); haveRatio = true; break;
+        case 'f': frequencyshift = atof(optarg); haveRatio = true; break;
+        case 'd': debug = atoi(optarg); break;
+        case 'R': realtime = true; break;
+        case 'L': precise = false; break;
+        case 'P': precise = true; break;
+	case 'F': formant = true; break;
+        case '0': threading = 1; break;
+        case '@': threading = 2; break;
+        case '1': transients = NoTransients; crispchanged = true; break;
+        case '2': lamination = false; crispchanged = true; break;
+        case '3': longwin = true; crispchanged = true; break;
+        case '4': shortwin = true; crispchanged = true; break;
+        case '5': detector = PercussiveDetector; crispchanged = true; break;
+        case '6': detector = SoftDetector; crispchanged = true; break;
+        case '7': together = true; break;
+        case '8': transients = BandLimitedTransients; crispchanged = true; break;
+        case '9': smoothing = true; crispchanged = true; break;
+        case '%': hqpitch = true; break;
+        case 'c': crispness = atoi(optarg); break;
+        case 'q': quiet = true; break;
+        case 'M': mapfile = optarg; break;
+        default:  help = true; break;
+        }
+    }
+
+    if (version) {
+        cerr << RUBBERBAND_VERSION << endl;
+        return 0;
+    }
+
+    if (help || !haveRatio || optind + 2 != argc) {
+        cerr << endl;
+	cerr << "Rubber Band" << endl;
+        cerr << "An audio time-stretching and pitch-shifting library and utility program." << endl;
+	cerr << "Copyright 2007-2012 Particular Programs Ltd." << endl;
+        cerr << endl;
+	cerr << "   Usage: " << argv[0] << " [options] <infile.wav> <outfile.wav>" << endl;
+        cerr << endl;
+        cerr << "You must specify at least one of the following time and pitch ratio options." << endl;
+        cerr << endl;
+        cerr << "  -t<X>, --time <X>       Stretch to X times original duration, or" << endl;
+        cerr << "  -T<X>, --tempo <X>      Change tempo by multiple X (same as --time 1/X), or" << endl;
+        cerr << "  -T<X>, --tempo <X>:<Y>  Change tempo from X to Y (same as --time X/Y), or" << endl;
+        cerr << "  -D<X>, --duration <X>   Stretch or squash to make output file X seconds long" << endl;
+        cerr << endl;
+        cerr << "  -p<X>, --pitch <X>      Raise pitch by X semitones, or" << endl;
+        cerr << "  -f<X>, --frequency <X>  Change frequency by multiple X" << endl;
+        cerr << endl;
+        cerr << "  -M<F>, --timemap <F>    Use file F as the source for key frame map" << endl;
+        cerr << endl;
+        cerr << "A map file consists of a series of lines each having two numbers separated" << endl;
+        cerr << "by a single space.  These are source and target sample frame numbers for fixed" << endl;
+        cerr << "time points within the audio data, defining a varying stretch factor through" << endl;
+        cerr << "the audio.  You must specify an overall stretch factor using e.g. -t as well." << endl;
+        cerr << endl;
+        cerr << "The following options provide a simple way to adjust the sound.  See below" << endl;
+        cerr << "for more details." << endl;
+        cerr << endl;
+        cerr << "  -c<N>, --crisp <N>      Crispness (N = 0,1,2,3,4,5,6); default 5 (see below)" << endl;
+	cerr << "  -F,    --formant        Enable formant preservation when pitch shifting" << endl;
+        cerr << endl;
+        cerr << "The remaining options fine-tune the processing mode and stretch algorithm." << endl;
+        cerr << "These are mostly included for test purposes; the default settings and standard" << endl;
+        cerr << "crispness parameter are intended to provide the best sounding set of options" << endl;
+        cerr << "for most situations.  The default is to use none of these options." << endl;
+        cerr << endl;
+        cerr << "  -L,    --loose          Relax timing in hope of better transient preservation" << endl;
+        cerr << "  -P,    --precise        Ignored: The opposite of -L, this is default from 1.6" << endl;
+        cerr << "  -R,    --realtime       Select realtime mode (implies --no-threads)" << endl;
+        cerr << "         --no-threads     No extra threads regardless of CPU and channel count" << endl;
+        cerr << "         --threads        Assume multi-CPU even if only one CPU is identified" << endl;
+        cerr << "         --no-transients  Disable phase resynchronisation at transients" << endl;
+        cerr << "         --bl-transients  Band-limit phase resync to extreme frequencies" << endl;
+        cerr << "         --no-lamination  Disable phase lamination" << endl;
+        cerr << "         --window-long    Use longer processing window (actual size may vary)" << endl;
+        cerr << "         --window-short   Use shorter processing window" << endl;
+        cerr << "         --smoothing      Apply window presum and time-domain smoothing" << endl;
+        cerr << "         --detector-perc  Use percussive transient detector (as in pre-1.5)" << endl;
+        cerr << "         --detector-soft  Use soft transient detector" << endl;
+        cerr << "         --pitch-hq       In RT mode, use a slower, higher quality pitch shift" << endl;
+        cerr << "         --centre-focus   Preserve focus of centre material in stereo" << endl;
+        cerr << "                          (at a cost in width and individual channel quality)" << endl;
+        cerr << endl;
+        cerr << "  -d<N>, --debug <N>      Select debug level (N = 0,1,2,3); default 0, full 3" << endl;
+        cerr << "                          (N.B. debug level 3 includes audible ticks in output)" << endl;
+        cerr << "  -q,    --quiet          Suppress progress output" << endl;
+        cerr << endl;
+        cerr << "  -V,    --version        Show version number and exit" << endl;
+        cerr << "  -h,    --help           Show this help" << endl;
+        cerr << endl;
+        cerr << "\"Crispness\" levels:" << endl;
+        cerr << "  -c 0   equivalent to --no-transients --no-lamination --window-long" << endl;
+        cerr << "  -c 1   equivalent to --detector-soft --no-lamination --window-long (for piano)" << endl;
+        cerr << "  -c 2   equivalent to --no-transients --no-lamination" << endl;
+        cerr << "  -c 3   equivalent to --no-transients" << endl;
+        cerr << "  -c 4   equivalent to --bl-transients" << endl;
+        cerr << "  -c 5   default processing options" << endl;
+        cerr << "  -c 6   equivalent to --no-lamination --window-short (may be good for drums)" << endl;
+        cerr << endl;
+	return 2;
+    }
+
+    if (crispness >= 0 && crispchanged) {
+        cerr << "WARNING: Both crispness option and transients, lamination or window options" << endl;
+        cerr << "         provided -- crispness will override these other options" << endl;
+    }
+
+    switch (crispness) {
+    case -1: crispness = 5; break;
+    case 0: detector = CompoundDetector; transients = NoTransients; lamination = false; longwin = true; shortwin = false; break;
+    case 1: detector = SoftDetector; transients = Transients; lamination = false; longwin = true; shortwin = false; break;
+    case 2: detector = CompoundDetector; transients = NoTransients; lamination = false; longwin = false; shortwin = false; break;
+    case 3: detector = CompoundDetector; transients = NoTransients; lamination = true; longwin = false; shortwin = false; break;
+    case 4: detector = CompoundDetector; transients = BandLimitedTransients; lamination = true; longwin = false; shortwin = false; break;
+    case 5: detector = CompoundDetector; transients = Transients; lamination = true; longwin = false; shortwin = false; break;
+    case 6: detector = CompoundDetector; transients = Transients; lamination = false; longwin = false; shortwin = true; break;
+    };
+
+    if (!quiet) {
+        cerr << "Using crispness level: " << crispness << " (";
+        switch (crispness) {
+        case 0: cerr << "Mushy"; break;
+        case 1: cerr << "Piano"; break;
+        case 2: cerr << "Smooth"; break;
+        case 3: cerr << "Balanced multitimbral mixture"; break;
+        case 4: cerr << "Unpitched percussion with stable notes"; break;
+        case 5: cerr << "Crisp monophonic instrumental"; break;
+        case 6: cerr << "Unpitched solo percussion"; break;
+        }
+        cerr << ")" << endl;
+    }
+
+    std::map<size_t, size_t> mapping;
+    
+    if (mapfile != "") {
+        std::ifstream ifile(mapfile.c_str());
+        if (!ifile.is_open()) {
+            cerr << "ERROR: Failed to open time map file \"" << mapfile << "\""
+                 << endl;
+            return 1;
+        }
+        std::string line;
+        int lineno = 0;
+        while (!ifile.eof()) {
+            std::getline(ifile, line);
+            while (line.length() > 0 && line[0] == ' ') line = line.substr(1);
+            if (line == "") {
+                ++lineno;
+                continue;
+            }
+            std::string::size_type i = line.find_first_of(" ");
+            if (i == std::string::npos) {
+                cerr << "ERROR: Time map file \"" << mapfile
+                     << "\" is malformed at line " << lineno << endl;
+                return 1;
+            }
+            size_t source = atoi(line.substr(0, i).c_str());
+            while (i < line.length() && line[i] == ' ') ++i;
+            size_t target = atoi(line.substr(i).c_str());
+            mapping[source] = target;
+            if (debug > 0) {
+                cerr << "adding mapping from " << source << " to " << target << endl;
+            }
+            ++lineno;
+        }
+        ifile.close();
+
+        if (!quiet) {
+            cerr << "Read " << mapping.size() << " line(s) from map file" << endl;
+        }
+    }
+
+    char *fileName = strdup(argv[optind++]);
+    char *fileNameOut = strdup(argv[optind++]);
+
+    SNDFILE *sndfile;
+    SNDFILE *sndfileOut;
+    SF_INFO sfinfo;
+    SF_INFO sfinfoOut;
+    memset(&sfinfo, 0, sizeof(SF_INFO));
+
+    sndfile = sf_open(fileName, SFM_READ, &sfinfo);
+    if (!sndfile) {
+	cerr << "ERROR: Failed to open input file \"" << fileName << "\": "
+	     << sf_strerror(sndfile) << endl;
+	return 1;
+    }
+
+    if (duration != 0.0) {
+        if (sfinfo.frames == 0 || sfinfo.samplerate == 0) {
+            cerr << "ERROR: File lacks frame count or sample rate in header, cannot use --duration" << endl;
+            return 1;
+        }
+        double induration = double(sfinfo.frames) / double(sfinfo.samplerate);
+        if (induration != 0.0) ratio = duration / induration;
+    }
+
+    sfinfoOut.channels = sfinfo.channels;
+    sfinfoOut.format = sfinfo.format;
+    sfinfoOut.frames = int(sfinfo.frames * ratio + 0.1);
+    sfinfoOut.samplerate = sfinfo.samplerate;
+    sfinfoOut.sections = sfinfo.sections;
+    sfinfoOut.seekable = sfinfo.seekable;
+
+    sndfileOut = sf_open(fileNameOut, SFM_WRITE, &sfinfoOut) ;
+    if (!sndfileOut) {
+	cerr << "ERROR: Failed to open output file \"" << fileNameOut << "\" for writing: "
+	     << sf_strerror(sndfileOut) << endl;
+	return 1;
+    }
+    
+    int ibs = 1024;
+    size_t channels = sfinfo.channels;
+
+    RubberBandStretcher::Options options = 0;
+    if (realtime)    options |= RubberBandStretcher::OptionProcessRealTime;
+    if (precise)     options |= RubberBandStretcher::OptionStretchPrecise;
+    if (!lamination) options |= RubberBandStretcher::OptionPhaseIndependent;
+    if (longwin)     options |= RubberBandStretcher::OptionWindowLong;
+    if (shortwin)    options |= RubberBandStretcher::OptionWindowShort;
+    if (smoothing)   options |= RubberBandStretcher::OptionSmoothingOn;
+    if (formant)     options |= RubberBandStretcher::OptionFormantPreserved;
+    if (hqpitch)     options |= RubberBandStretcher::OptionPitchHighQuality;
+    if (together)    options |= RubberBandStretcher::OptionChannelsTogether;
+
+    switch (threading) {
+    case 0:
+        options |= RubberBandStretcher::OptionThreadingAuto;
+        break;
+    case 1:
+        options |= RubberBandStretcher::OptionThreadingNever;
+        break;
+    case 2:
+        options |= RubberBandStretcher::OptionThreadingAlways;
+        break;
+    }
+
+    switch (transients) {
+    case NoTransients:
+        options |= RubberBandStretcher::OptionTransientsSmooth;
+        break;
+    case BandLimitedTransients:
+        options |= RubberBandStretcher::OptionTransientsMixed;
+        break;
+    case Transients:
+        options |= RubberBandStretcher::OptionTransientsCrisp;
+        break;
+    }
+
+    switch (detector) {
+    case CompoundDetector:
+        options |= RubberBandStretcher::OptionDetectorCompound;
+        break;
+    case PercussiveDetector:
+        options |= RubberBandStretcher::OptionDetectorPercussive;
+        break;
+    case SoftDetector:
+        options |= RubberBandStretcher::OptionDetectorSoft;
+        break;
+    }
+
+    if (pitchshift != 0.0) {
+        frequencyshift *= pow(2.0, pitchshift / 12);
+    }
+
+    cerr << "Using time ratio " << ratio;
+    cerr << " and frequency ratio " << frequencyshift << endl;
+
+#ifdef _WIN32
+    RubberBand::
+#endif
+    timeval tv;
+    (void)gettimeofday(&tv, 0);
+
+    RubberBandStretcher::setDefaultDebugLevel(debug);
+
+    RubberBandStretcher ts(sfinfo.samplerate, channels, options,
+                           ratio, frequencyshift);
+
+    ts.setExpectedInputDuration(sfinfo.frames);
+
+    float *fbuf = new float[channels * ibs];
+    float **ibuf = new float *[channels];
+    for (size_t i = 0; i < channels; ++i) ibuf[i] = new float[ibs];
+
+    int frame = 0;
+    int percent = 0;
+
+    sf_seek(sndfile, 0, SEEK_SET);
+
+    if (!realtime) {
+
+        if (!quiet) {
+            cerr << "Pass 1: Studying..." << endl;
+        }
+
+        while (frame < sfinfo.frames) {
+
+            int count = -1;
+
+            if ((count = sf_readf_float(sndfile, fbuf, ibs)) <= 0) break;
+        
+            for (size_t c = 0; c < channels; ++c) {
+                for (int i = 0; i < count; ++i) {
+                    float value = fbuf[i * channels + c];
+                    ibuf[c][i] = value;
+                }
+            }
+
+            bool final = (frame + ibs >= sfinfo.frames);
+
+            ts.study(ibuf, count, final);
+
+            int p = int((double(frame) * 100.0) / sfinfo.frames);
+            if (p > percent || frame == 0) {
+                percent = p;
+                if (!quiet) {
+                    cerr << "\r" << percent << "% ";
+                }
+            }
+
+            frame += ibs;
+        }
+
+        if (!quiet) {
+            cerr << "\rCalculating profile..." << endl;
+        }
+
+        sf_seek(sndfile, 0, SEEK_SET);
+    }
+
+    frame = 0;
+    percent = 0;
+
+    if (!mapping.empty()) {
+        ts.setKeyFrameMap(mapping);
+    }
+    
+    size_t countIn = 0, countOut = 0;
+
+    while (frame < sfinfo.frames) {
+
+        int count = -1;
+
+	if ((count = sf_readf_float(sndfile, fbuf, ibs)) < 0) break;
+        
+        countIn += count;
+
+        for (size_t c = 0; c < channels; ++c) {
+            for (int i = 0; i < count; ++i) {
+                float value = fbuf[i * channels + c];
+                ibuf[c][i] = value;
+            }
+        }
+
+        bool final = (frame + ibs >= sfinfo.frames);
+
+        if (debug > 2) {
+            cerr << "count = " << count << ", ibs = " << ibs << ", frame = " << frame << ", frames = " << sfinfo.frames << ", final = " << final << endl;
+        }
+
+        ts.process(ibuf, count, final);
+
+        int avail = ts.available();
+        if (debug > 1) cerr << "available = " << avail << endl;
+
+        if (avail > 0) {
+            float **obf = new float *[channels];
+            for (size_t i = 0; i < channels; ++i) {
+                obf[i] = new float[avail];
+            }
+            ts.retrieve(obf, avail);
+            countOut += avail;
+            float *fobf = new float[channels * avail];
+            for (size_t c = 0; c < channels; ++c) {
+                for (int i = 0; i < avail; ++i) {
+                    float value = obf[c][i];
+                    if (value > 1.f) value = 1.f;
+                    if (value < -1.f) value = -1.f;
+                    fobf[i * channels + c] = value;
+                }
+            }
+//            cout << "fobf mean: ";
+//    double d = 0;
+//    for (int i = 0; i < avail; ++i) {
+//        d += fobf[i];
+//    }
+//    d /= avail;
+//    cout << d << endl;
+            sf_writef_float(sndfileOut, fobf, avail);
+            delete[] fobf;
+            for (size_t i = 0; i < channels; ++i) {
+                delete[] obf[i];
+            }
+            delete[] obf;
+        }
+
+        if (frame == 0 && !realtime && !quiet) {
+            cerr << "Pass 2: Processing..." << endl;
+        }
+
+	int p = int((double(frame) * 100.0) / sfinfo.frames);
+	if (p > percent || frame == 0) {
+	    percent = p;
+            if (!quiet) {
+                cerr << "\r" << percent << "% ";
+            }
+	}
+
+        frame += ibs;
+    }
+
+    if (!quiet) {
+        cerr << "\r    " << endl;
+    }
+    int avail;
+
+    while ((avail = ts.available()) >= 0) {
+
+        if (debug > 1) {
+            cerr << "(completing) available = " << avail << endl;
+        }
+
+        if (avail > 0) {
+            float **obf = new float *[channels];
+            for (size_t i = 0; i < channels; ++i) {
+                obf[i] = new float[avail];
+            }
+            ts.retrieve(obf, avail);
+            countOut += avail;
+            float *fobf = new float[channels * avail];
+            for (size_t c = 0; c < channels; ++c) {
+                for (int i = 0; i < avail; ++i) {
+                    float value = obf[c][i];
+                    if (value > 1.f) value = 1.f;
+                    if (value < -1.f) value = -1.f;
+                    fobf[i * channels + c] = value;
+                }
+            }
+
+            sf_writef_float(sndfileOut, fobf, avail);
+            delete[] fobf;
+            for (size_t i = 0; i < channels; ++i) {
+                delete[] obf[i];
+            }
+            delete[] obf;
+        } else {
+            usleep(10000);
+        }
+    }
+
+    sf_close(sndfile);
+    sf_close(sndfileOut);
+
+    if (!quiet) {
+
+        cerr << "in: " << countIn << ", out: " << countOut << ", ratio: " << float(countOut)/float(countIn) << ", ideal output: " << lrint(countIn * ratio) << ", error: " << abs(lrint(countIn * ratio) - int(countOut)) << endl;
+
+#ifdef _WIN32
+        RubberBand::
+#endif
+        timeval etv;
+        (void)gettimeofday(&etv, 0);
+        
+        etv.tv_sec -= tv.tv_sec;
+        if (etv.tv_usec < tv.tv_usec) {
+            etv.tv_usec += 1000000;
+            etv.tv_sec -= 1;
+        }
+        etv.tv_usec -= tv.tv_usec;
+        
+        double sec = double(etv.tv_sec) + (double(etv.tv_usec) / 1000000.0);
+        cerr << "elapsed time: " << sec << " sec, in frames/sec: " << countIn/sec << ", out frames/sec: " << countOut/sec << endl;
+    }
+
+    RubberBand::Profiler::dump();
+
+    return 0;
+}
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/rubberband-library.vcproj
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/rubberband-library.vcproj	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,363 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="rubberband-library"
+	ProjectGUID="{020CEB11-EF4E-400E-971D-A35DB69D7CF9}"
+	RootNamespace="rubberband-library"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="4"
+			EnableManagedIncrementalBuild="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=".;.\src;"
+				PreprocessorDefinitions="__MSVC__;WIN32;_DEBUG;_LIB;NOMINMAX;_USE_MATH_DEFINES;USE_KISSFFT;USE_SPEEX"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="2"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="4"
+				ShowIncludes="false"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="4"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="1"
+				OmitFramePointers="true"
+				AdditionalIncludeDirectories=".;.\src"
+				PreprocessorDefinitions="__MSVC__;WIN32;NDEBUG;_LIB;NOMINMAX;_USE_MATH_DEFINES;USE_KISSFFT;NO_TIMING;USE_SPEEX;NO_THREAD_CHECKS"
+				RuntimeLibrary="2"
+				BufferSecurityCheck="false"
+				EnableEnhancedInstructionSet="1"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="2"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath=".\src\system\Allocators.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\AudioCurveCalculator.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\CompoundAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\ConstantAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\FFT.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\float_cast\float_cast.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\HighFrequencyAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\MovingMedian.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\PercussiveAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\base\Profiler.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\Resampler.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\base\RingBuffer.h"
+				>
+			</File>
+			<File
+				RelativePath=".\rubberband\rubberband-c.h"
+				>
+			</File>
+			<File
+				RelativePath=".\rubberband\RubberBandStretcher.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\SampleFilter.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\base\Scavenger.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\SilentAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\SpectralDifferenceAudioCurve.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\speex\speex_resampler.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretchCalculator.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretcherChannelData.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretcherImpl.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\system\sysutils.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\system\Thread.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\system\VectorOps.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\SincWindow.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\Window.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath=".\src\system\Allocators.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\AudioCurveCalculator.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\CompoundAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\ConstantAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\FFT.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\HighFrequencyAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\PercussiveAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\base\Profiler.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\speex\resample.c"
+				>
+			</File>
+			<File
+				RelativePath=".\src\dsp\Resampler.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\rubberband-c.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\RubberBandStretcher.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\SilentAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\audiocurves\SpectralDifferenceAudioCurve.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretchCalculator.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretcherChannelData.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretcherImpl.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\StretcherProcess.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\system\sysutils.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\src\system\Thread.cpp"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/rubberband-program.vcproj
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/rubberband-program.vcproj	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="rubberband-program"
+	ProjectGUID="{06838307-FEAA-4DB0-8E08-AF19698E9C40}"
+	RootNamespace="rubberband-program"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=".;.\rubberband;.\src;&quot;..\libsndfile-1_0_17&quot;"
+				PreprocessorDefinitions="__MSVC__;WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies=".\Debug\rubberband-library.lib ..\libsndfile-1_0_17\libsndfile-1.lib"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="1"
+				OmitFramePointers="true"
+				AdditionalIncludeDirectories=".;.\rubberband;.\src;&quot;..\libsndfile-1_0_17&quot;"
+				PreprocessorDefinitions="__MSVC__;WIN32;NDEBUG;_CONSOLE;WANT_TIMING"
+				RuntimeLibrary="2"
+				EnableEnhancedInstructionSet="1"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="2"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies=".\Release\rubberband-library.lib ..\libsndfile-1_0_17\libsndfile-1.lib"
+				LinkIncremental="0"
+				GenerateDebugInformation="false"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath=".\src\float_cast\float_cast.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\getopt\getopt.h"
+				>
+			</File>
+			<File
+				RelativePath=".\rubberband\RubberBandStretcher.h"
+				>
+			</File>
+			<File
+				RelativePath=".\src\getopt\unistd.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath=".\src\getopt\getopt.c"
+				>
+			</File>
+			<File
+				RelativePath=".\src\getopt\getopt_long.c"
+				>
+			</File>
+			<File
+				RelativePath=".\main\main.cpp"
+				>
+			</File>
+		</Filter>
+		<File
+			RelativePath=".\debug\BuildLog.htm"
+			>
+		</File>
+		<File
+			RelativePath=".\wub\index.html"
+			>
+		</File>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/rubberband.pc.in
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/rubberband.pc.in	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+prefix=%PREFIX%
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: rubberband
+Version: 1.8.1
+Description: 
+Libs: -L${libdir} -lrubberband
+Cflags: -I${includedir} 
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/rubberband/RubberBandStretcher.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/rubberband/RubberBandStretcher.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,687 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBANDSTRETCHER_H_
+#define _RUBBERBANDSTRETCHER_H_
+    
+#define RUBBERBAND_VERSION "1.8.1"
+#define RUBBERBAND_API_MAJOR_VERSION 2
+#define RUBBERBAND_API_MINOR_VERSION 5
+
+#include <vector>
+#include <map>
+#include <cstddef>
+
+/**
+ * @mainpage RubberBand
+ * 
+ * The Rubber Band API is contained in the single class
+ * RubberBand::RubberBandStretcher.
+ * 
+ * Threading notes for real-time applications:
+ * 
+ * Multiple instances of RubberBandStretcher may be created and used
+ * in separate threads concurrently.  However, for any single instance
+ * of RubberBandStretcher, you may not call process() more than once
+ * concurrently, and you may not change the time or pitch ratio while
+ * a process() call is being executed (if the stretcher was created in
+ * "real-time mode"; in "offline mode" you can't change the ratios
+ * during use anyway).
+ * 
+ * So you can run process() in its own thread if you like, but if you
+ * want to change ratios dynamically from a different thread, you will
+ * need some form of mutex in your code.  Changing the time or pitch
+ * ratio is real-time safe except in extreme circumstances, so for
+ * most applications that may change these dynamically it probably
+ * makes most sense to do so from the same thread as calls process(),
+ * even if that is a real-time thread.
+ */
+
+namespace RubberBand
+{
+
+class RubberBandStretcher
+{
+public:
+    /**
+     * Processing options for the timestretcher.  The preferred
+     * options should normally be set in the constructor, as a bitwise
+     * OR of the option flags.  The default value (DefaultOptions) is
+     * intended to give good results in most situations.
+     *
+     * 1. Flags prefixed \c OptionProcess determine how the timestretcher
+     * will be invoked.  These options may not be changed after
+     * construction.
+     * 
+     *   \li \c OptionProcessOffline - Run the stretcher in offline
+     *   mode.  In this mode the input data needs to be provided
+     *   twice, once to study(), which calculates a stretch profile
+     *   for the audio, and once to process(), which stretches it.
+     *
+     *   \li \c OptionProcessRealTime - Run the stretcher in real-time
+     *   mode.  In this mode only process() should be called, and the
+     *   stretcher adjusts dynamically in response to the input audio.
+     * 
+     * The Process setting is likely to depend on your architecture:
+     * non-real-time operation on seekable files: Offline; real-time
+     * or streaming operation: RealTime.
+     *
+     * 2. Flags prefixed \c OptionStretch control the profile used for
+     * variable timestretching.  Rubber Band always adjusts the
+     * stretch profile to minimise stretching of busy broadband
+     * transient sounds, but the degree to which it does so is
+     * adjustable.  These options may not be changed after
+     * construction.
+     *
+     *   \li \c OptionStretchElastic - Only meaningful in offline
+     *   mode, and the default in that mode.  The audio will be
+     *   stretched at a variable rate, aimed at preserving the quality
+     *   of transient sounds as much as possible.  The timings of low
+     *   activity regions between transients may be less exact than
+     *   when the precise flag is set.
+     * 
+     *   \li \c OptionStretchPrecise - Although still using a variable
+     *   stretch rate, the audio will be stretched so as to maintain
+     *   as close as possible to a linear stretch ratio throughout.
+     *   Timing may be better than when using \c OptionStretchElastic, at
+     *   slight cost to the sound quality of transients.  This setting
+     *   is always used when running in real-time mode.
+     *
+     * 3. Flags prefixed \c OptionTransients control the component
+     * frequency phase-reset mechanism that may be used at transient
+     * points to provide clarity and realism to percussion and other
+     * significant transient sounds.  These options may be changed
+     * after construction when running in real-time mode, but not when
+     * running in offline mode.
+     * 
+     *   \li \c OptionTransientsCrisp - Reset component phases at the
+     *   peak of each transient (the start of a significant note or
+     *   percussive event).  This, the default setting, usually
+     *   results in a clear-sounding output; but it is not always
+     *   consistent, and may cause interruptions in stable sounds
+     *   present at the same time as transient events.  The
+     *   OptionDetector flags (below) can be used to tune this to some
+     *   extent.
+     *
+     *   \li \c OptionTransientsMixed - Reset component phases at the
+     *   peak of each transient, outside a frequency range typical of
+     *   musical fundamental frequencies.  The results may be more
+     *   regular for mixed stable and percussive notes than
+     *   \c OptionTransientsCrisp, but with a "phasier" sound.  The
+     *   balance may sound very good for certain types of music and
+     *   fairly bad for others.
+     *
+     *   \li \c OptionTransientsSmooth - Do not reset component phases
+     *   at any point.  The results will be smoother and more regular
+     *   but may be less clear than with either of the other
+     *   transients flags.
+     *
+     * 4. Flags prefixed \c OptionDetector control the type of
+     * transient detector used.  These options may be changed
+     * after construction when running in real-time mode, but not when
+     * running in offline mode.
+     *
+     *   \li \c OptionDetectorCompound - Use a general-purpose
+     *   transient detector which is likely to be good for most
+     *   situations.  This is the default.
+     *
+     *   \li \c OptionDetectorPercussive - Detect percussive
+     *   transients.  Note that this was the default and only option
+     *   in Rubber Band versions prior to 1.5.
+     *
+     *   \li \c OptionDetectorSoft - Use an onset detector with less
+     *   of a bias toward percussive transients.  This may give better
+     *   results with certain material (e.g. relatively monophonic
+     *   piano music).
+     *
+     * 5. Flags prefixed \c OptionPhase control the adjustment of
+     * component frequency phases from one analysis window to the next
+     * during non-transient segments.  These options may be changed at
+     * any time.
+     *
+     *   \li \c OptionPhaseLaminar - Adjust phases when stretching in
+     *   such a way as to try to retain the continuity of phase
+     *   relationships between adjacent frequency bins whose phases
+     *   are behaving in similar ways.  This, the default setting,
+     *   should give good results in most situations.
+     *
+     *   \li \c OptionPhaseIndependent - Adjust the phase in each
+     *   frequency bin independently from its neighbours.  This
+     *   usually results in a slightly softer, phasier sound.
+     *
+     * 6. Flags prefixed \c OptionThreading control the threading
+     * model of the stretcher.  These options may not be changed after
+     * construction.
+     *
+     *   \li \c OptionThreadingAuto - Permit the stretcher to
+     *   determine its own threading model.  Usually this means using
+     *   one processing thread per audio channel in offline mode if
+     *   the stretcher is able to determine that more than one CPU is
+     *   available, and one thread only in realtime mode.  This is the
+     *   defafult.
+     *
+     *   \li \c OptionThreadingNever - Never use more than one thread.
+     *  
+     *   \li \c OptionThreadingAlways - Use multiple threads in any
+     *   situation where \c OptionThreadingAuto would do so, except omit
+     *   the check for multiple CPUs and instead assume it to be true.
+     *
+     * 7. Flags prefixed \c OptionWindow control the window size for
+     * FFT processing.  The window size actually used will depend on
+     * many factors, but it can be influenced.  These options may not
+     * be changed after construction.
+     *
+     *   \li \c OptionWindowStandard - Use the default window size.
+     *   The actual size will vary depending on other parameters.
+     *   This option is expected to produce better results than the
+     *   other window options in most situations.
+     *
+     *   \li \c OptionWindowShort - Use a shorter window.  This may
+     *   result in crisper sound for audio that depends strongly on
+     *   its timing qualities.
+     *
+     *   \li \c OptionWindowLong - Use a longer window.  This is
+     *   likely to result in a smoother sound at the expense of
+     *   clarity and timing.
+     *
+     * 8. Flags prefixed \c OptionSmoothing control the use of
+     * window-presum FFT and time-domain smoothing.  These options may
+     * not be changed after construction.
+     *
+     *   \li \c OptionSmoothingOff - Do not use time-domain smoothing.
+     *   This is the default.
+     *
+     *   \li \c OptionSmoothingOn - Use time-domain smoothing.  This
+     *   will result in a softer sound with some audible artifacts
+     *   around sharp transients, but it may be appropriate for longer
+     *   stretches of some instruments and can mix well with
+     *   OptionWindowShort.
+     *
+     * 9. Flags prefixed \c OptionFormant control the handling of
+     * formant shape (spectral envelope) when pitch-shifting.  These
+     * options may be changed at any time.
+     *
+     *   \li \c OptionFormantShifted - Apply no special formant
+     *   processing.  The spectral envelope will be pitch shifted as
+     *   normal.  This is the default.
+     *
+     *   \li \c OptionFormantPreserved - Preserve the spectral
+     *   envelope of the unshifted signal.  This permits shifting the
+     *   note frequency without so substantially affecting the
+     *   perceived pitch profile of the voice or instrument.
+     *
+     * 10. Flags prefixed \c OptionPitch control the method used for
+     * pitch shifting.  These options may be changed at any time.
+     * They are only effective in realtime mode; in offline mode, the
+     * pitch-shift method is fixed.
+     *
+     *   \li \c OptionPitchHighSpeed - Use a method with a CPU cost
+     *   that is relatively moderate and predictable.  This may
+     *   sound less clear than OptionPitchHighQuality, especially
+     *   for large pitch shifts.  This is the default.
+
+     *   \li \c OptionPitchHighQuality - Use the highest quality
+     *   method for pitch shifting.  This method has a CPU cost
+     *   approximately proportional to the required frequency shift.
+
+     *   \li \c OptionPitchHighConsistency - Use the method that gives
+     *   greatest consistency when used to create small variations in
+     *   pitch around the 1.0-ratio level.  Unlike the previous two
+     *   options, this avoids discontinuities when moving across the
+     *   1.0 pitch scale in real-time; it also consumes more CPU than
+     *   the others in the case where the pitch scale is exactly 1.0.
+     *
+     * 11. Flags prefixed \c OptionChannels control the method used for
+     * processing two-channel audio.  These options may not be changed
+     * after construction.
+     *
+     *   \li \c OptionChannelsApart - Each channel is processed
+     *   individually, though timing is synchronised and phases are
+     *   synchronised at transients (depending on the OptionTransients
+     *   setting).  This gives the highest quality for the individual
+     *   channels but a relative lack of stereo focus and unrealistic
+     *   increase in "width".  This is the default.
+     *
+     *   \li \c OptionChannelsTogether - The first two channels (where
+     *   two or more are present) are considered to be a stereo pair
+     *   and are processed in mid-side format; mid and side are
+     *   processed individually, with timing synchronised and phases
+     *   synchronised at transients (depending on the OptionTransients
+     *   setting).  This usually leads to better focus in the centre
+     *   but a loss of stereo space and width.  Any channels beyond
+     *   the first two are processed individually.
+     */
+    
+    enum Option {
+
+        OptionProcessOffline       = 0x00000000,
+        OptionProcessRealTime      = 0x00000001,
+
+        OptionStretchElastic       = 0x00000000,
+        OptionStretchPrecise       = 0x00000010,
+    
+        OptionTransientsCrisp      = 0x00000000,
+        OptionTransientsMixed      = 0x00000100,
+        OptionTransientsSmooth     = 0x00000200,
+
+        OptionDetectorCompound     = 0x00000000,
+        OptionDetectorPercussive   = 0x00000400,
+        OptionDetectorSoft         = 0x00000800,
+
+        OptionPhaseLaminar         = 0x00000000,
+        OptionPhaseIndependent     = 0x00002000,
+    
+        OptionThreadingAuto        = 0x00000000,
+        OptionThreadingNever       = 0x00010000,
+        OptionThreadingAlways      = 0x00020000,
+
+        OptionWindowStandard       = 0x00000000,
+        OptionWindowShort          = 0x00100000,
+        OptionWindowLong           = 0x00200000,
+
+        OptionSmoothingOff         = 0x00000000,
+        OptionSmoothingOn          = 0x00800000,
+
+        OptionFormantShifted       = 0x00000000,
+        OptionFormantPreserved     = 0x01000000,
+
+        OptionPitchHighSpeed       = 0x00000000,
+        OptionPitchHighQuality     = 0x02000000,
+        OptionPitchHighConsistency = 0x04000000,
+
+        OptionChannelsApart        = 0x00000000,
+        OptionChannelsTogether     = 0x10000000,
+
+        // n.b. Options is int, so we must stop before 0x80000000
+    };
+
+    typedef int Options;
+
+    enum PresetOption {
+        DefaultOptions             = 0x00000000,
+        PercussiveOptions          = 0x00102000
+    };
+
+    /**
+     * Construct a time and pitch stretcher object to run at the given
+     * sample rate, with the given number of channels.  Processing
+     * options and the time and pitch scaling ratios may be provided.
+     * The time and pitch ratios may be changed after construction,
+     * but most of the options may not.  See the option documentation
+     * above for more details.
+     */
+    RubberBandStretcher(size_t sampleRate,
+                        size_t channels,
+                        Options options = DefaultOptions,
+                        double initialTimeRatio = 1.0,
+                        double initialPitchScale = 1.0);
+    ~RubberBandStretcher();
+
+    /**
+     * Reset the stretcher's internal buffers.  The stretcher should
+     * subsequently behave as if it had just been constructed
+     * (although retaining the current time and pitch ratio).
+     */
+    void reset();
+
+    /**
+     * Set the time ratio for the stretcher.  This is the ratio of
+     * stretched to unstretched duration -- not tempo.  For example, a
+     * ratio of 2.0 would make the audio twice as long (i.e. halve the
+     * tempo); 0.5 would make it half as long (i.e. double the tempo);
+     * 1.0 would leave the duration unaffected.
+     *
+     * If the stretcher was constructed in Offline mode, the time
+     * ratio is fixed throughout operation; this function may be
+     * called any number of times between construction (or a call to
+     * reset()) and the first call to study() or process(), but may
+     * not be called after study() or process() has been called.
+     *
+     * If the stretcher was constructed in RealTime mode, the time
+     * ratio may be varied during operation; this function may be
+     * called at any time, so long as it is not called concurrently
+     * with process().  You should either call this function from the
+     * same thread as process(), or provide your own mutex or similar
+     * mechanism to ensure that setTimeRatio and process() cannot be
+     * run at once (there is no internal mutex for this purpose).
+     */
+    void setTimeRatio(double ratio);
+
+    /**
+     * Set the pitch scaling ratio for the stretcher.  This is the
+     * ratio of target frequency to source frequency.  For example, a
+     * ratio of 2.0 would shift up by one octave; 0.5 down by one
+     * octave; or 1.0 leave the pitch unaffected.
+     *
+     * To put this in musical terms, a pitch scaling ratio
+     * corresponding to a shift of S equal-tempered semitones (where S
+     * is positive for an upwards shift and negative for downwards) is
+     * pow(2.0, S / 12.0).
+     *
+     * If the stretcher was constructed in Offline mode, the pitch
+     * scaling ratio is fixed throughout operation; this function may
+     * be called any number of times between construction (or a call
+     * to reset()) and the first call to study() or process(), but may
+     * not be called after study() or process() has been called.
+     *
+     * If the stretcher was constructed in RealTime mode, the pitch
+     * scaling ratio may be varied during operation; this function may
+     * be called at any time, so long as it is not called concurrently
+     * with process().  You should either call this function from the
+     * same thread as process(), or provide your own mutex or similar
+     * mechanism to ensure that setPitchScale and process() cannot be
+     * run at once (there is no internal mutex for this purpose).
+     */
+    void setPitchScale(double scale);
+
+    /**
+     * Return the last time ratio value that was set (either on
+     * construction or with setTimeRatio()).
+     */
+    double getTimeRatio() const;
+
+    /**
+     * Return the last pitch scaling ratio value that was set (either
+     * on construction or with setPitchScale()).
+     */
+    double getPitchScale() const;
+
+    /**
+     * Return the processing latency of the stretcher.  This is the
+     * number of audio samples that one would have to discard at the
+     * start of the output in order to ensure that the resulting audio
+     * aligned with the input audio at the start.  In Offline mode,
+     * latency is automatically adjusted for and the result is zero.
+     * In RealTime mode, the latency may depend on the time and pitch
+     * ratio and other options.
+     */
+    size_t getLatency() const;
+
+    /**
+     * Change an OptionTransients configuration setting.  This may be
+     * called at any time in RealTime mode.  It may not be called in
+     * Offline mode (for which the transients option is fixed on
+     * construction).
+     */
+    void setTransientsOption(Options options);
+
+    /**
+     * Change an OptionDetector configuration setting.  This may be
+     * called at any time in RealTime mode.  It may not be called in
+     * Offline mode (for which the detector option is fixed on
+     * construction).
+     */
+    void setDetectorOption(Options options);
+
+    /**
+     * Change an OptionPhase configuration setting.  This may be
+     * called at any time in any mode.
+     *
+     * Note that if running multi-threaded in Offline mode, the change
+     * may not take effect immediately if processing is already under
+     * way when this function is called.
+     */
+    void setPhaseOption(Options options);
+
+    /**
+     * Change an OptionFormant configuration setting.  This may be
+     * called at any time in any mode.
+     *
+     * Note that if running multi-threaded in Offline mode, the change
+     * may not take effect immediately if processing is already under
+     * way when this function is called.
+     */
+    void setFormantOption(Options options);
+
+    /**
+     * Change an OptionPitch configuration setting.  This may be
+     * called at any time in RealTime mode.  It may not be called in
+     * Offline mode (for which the transients option is fixed on
+     * construction).
+     */
+    void setPitchOption(Options options);
+
+    /**
+     * Tell the stretcher exactly how many input samples it will
+     * receive.  This is only useful in Offline mode, when it allows
+     * the stretcher to ensure that the number of output samples is
+     * exactly correct.  In RealTime mode no such guarantee is
+     * possible and this value is ignored.
+     */
+    void setExpectedInputDuration(size_t samples);
+
+    /**
+     * Tell the stretcher the maximum number of sample frames that you
+     * will ever be passing in to a single process() call.  If you
+     * don't call this, the stretcher will assume that you are calling
+     * getSamplesRequired() at each cycle and are never passing more
+     * samples than are suggested by that function.
+     *
+     * If your application has some external constraint that means you
+     * prefer a fixed block size, then your normal mode of operation
+     * would be to provide that block size to this function; to loop
+     * calling process() with that size of block; after each call to
+     * process(), test whether output has been generated by calling
+     * available(); and, if so, call retrieve() to obtain it.  See
+     * getSamplesRequired() for a more suitable operating mode for
+     * applications without such external constraints.
+     *
+     * This function may not be called after the first call to study()
+     * or process().
+     *
+     * Note that this value is only relevant to process(), not to
+     * study() (to which you may pass any number of samples at a time,
+     * and from which there is no output).
+     */
+    void setMaxProcessSize(size_t samples);
+
+    /**
+     * Ask the stretcher how many audio sample frames should be
+     * provided as input in order to ensure that some more output
+     * becomes available.
+     * 
+     * If your application has no particular constraint on processing
+     * block size and you are able to provide any block size as input
+     * for each cycle, then your normal mode of operation would be to
+     * loop querying this function; providing that number of samples
+     * to process(); and reading the output using available() and
+     * retrieve().  See setMaxProcessSize() for a more suitable
+     * operating mode for applications that do have external block
+     * size constraints.
+     *
+     * Note that this value is only relevant to process(), not to
+     * study() (to which you may pass any number of samples at a time,
+     * and from which there is no output).
+     */
+     size_t getSamplesRequired() const;
+
+    /**
+     * Provide a set of mappings from "before" to "after" sample
+     * numbers so as to enforce a particular stretch profile.  The
+     * argument is a map from audio sample frame number in the source
+     * material, to the corresponding sample frame number in the
+     * stretched output.  The mapping should be for key frames only,
+     * with a "reasonable" gap between mapped samples.
+     *
+     * This function cannot be used in RealTime mode.
+     *
+     * This function may not be called after the first call to
+     * process().  It should be called after the time and pitch ratios
+     * have been set; the results of changing the time and pitch
+     * ratios after calling this function are undefined.  Calling
+     * reset() will clear this mapping.
+     *
+     * The key frame map only affects points within the material; it
+     * does not determine the overall stretch ratio (that is, the
+     * ratio between the output material's duration and the source
+     * material's duration).  You need to provide this ratio
+     * separately to setTimeRatio(), otherwise the results may be
+     * truncated or extended in unexpected ways regardless of the
+     * extent of the frame numbers found in the key frame map.
+     */
+    void setKeyFrameMap(const std::map<size_t, size_t> &);
+    
+    /**
+     * Provide a block of "samples" sample frames for the stretcher to
+     * study and calculate a stretch profile from.
+     *
+     * This is only meaningful in Offline mode, and is required if
+     * running in that mode.  You should pass the entire input through
+     * study() before any process() calls are made, as a sequence of
+     * blocks in individual study() calls, or as a single large block.
+     *
+     * "input" should point to de-interleaved audio data with one
+     * float array per channel.  "samples" supplies the number of
+     * audio sample frames available in "input".  If "samples" is
+     * zero, "input" may be NULL.
+     * 
+     * Set "final" to true if this is the last block of data that will
+     * be provided to study() before the first process() call.
+     */
+    void study(const float *const *input, size_t samples, bool final);
+
+    /**
+     * Provide a block of "samples" sample frames for processing.
+     * See also getSamplesRequired() and setMaxProcessSize().
+     *
+     * Set "final" to true if this is the last block of input data.
+     */
+    void process(const float *const *input, size_t samples, bool final);
+
+    /**
+     * Ask the stretcher how many audio sample frames of output data
+     * are available for reading (via retrieve()).
+     * 
+     * This function returns 0 if no frames are available: this
+     * usually means more input data needs to be provided, but if the
+     * stretcher is running in threaded mode it may just mean that not
+     * enough data has yet been processed.  Call getSamplesRequired()
+     * to discover whether more input is needed.
+     *
+     * This function returns -1 if all data has been fully processed
+     * and all output read, and the stretch process is now finished.
+     */
+    int available() const;
+
+    /**
+     * Obtain some processed output data from the stretcher.  Up to
+     * "samples" samples will be stored in the output arrays (one per
+     * channel for de-interleaved audio data) pointed to by "output".
+     * The return value is the actual number of sample frames
+     * retrieved.
+     */
+    size_t retrieve(float *const *output, size_t samples) const;
+
+    /**
+     * Return the value of internal frequency cutoff value n.
+     *
+     * This function is not for general use.
+     */
+    float getFrequencyCutoff(int n) const;
+
+    /** 
+     * Set the value of internal frequency cutoff n to f Hz.
+     *
+     * This function is not for general use.
+     */
+    void setFrequencyCutoff(int n, float f);
+    
+    /**
+     * Retrieve the value of the internal input block increment value.
+     *
+     * This function is provided for diagnostic purposes only.
+     */
+    size_t getInputIncrement() const;
+
+    /**
+     * In offline mode, retrieve the sequence of internal block
+     * increments for output, for the entire audio data, provided the
+     * stretch profile has been calculated.  In realtime mode,
+     * retrieve any output increments that have accumulated since the
+     * last call to getOutputIncrements, to a limit of 16.
+     *
+     * This function is provided for diagnostic purposes only.
+     */
+    std::vector<int> getOutputIncrements() const;
+
+    /**
+     * In offline mode, retrieve the sequence of internal phase reset
+     * detection function values, for the entire audio data, provided
+     * the stretch profile has been calculated.  In realtime mode,
+     * retrieve any phase reset points that have accumulated since the
+     * last call to getPhaseResetCurve, to a limit of 16.
+     *
+     * This function is provided for diagnostic purposes only.
+     */
+    std::vector<float> getPhaseResetCurve() const;
+
+    /**
+     * In offline mode, retrieve the sequence of internal frames for
+     * which exact timing has been sought, for the entire audio data,
+     * provided the stretch profile has been calculated.  In realtime
+     * mode, return an empty sequence.
+     *
+     * This function is provided for diagnostic purposes only.
+     */
+    std::vector<int> getExactTimePoints() const;
+
+    /**
+     * Return the number of channels this stretcher was constructed
+     * with.
+     */
+    size_t getChannelCount() const;
+
+    /**
+     * Force the stretcher to calculate a stretch profile.  Normally
+     * this happens automatically for the first process() call in
+     * offline mode.
+     *
+     * This function is provided for diagnostic purposes only.
+     */
+    void calculateStretch();
+
+    /**
+     * Set the level of debug output.  The value may be from 0 (errors
+     * only) to 3 (very verbose, with audible ticks in the output at
+     * phase reset points).  The default is whatever has been set
+     * using setDefaultDebugLevel, or 0 if that function has not been
+     * called.
+     */
+    void setDebugLevel(int level);
+
+    /**
+     * Set the default level of debug output for subsequently
+     * constructed stretchers.
+     *
+     * @see setDebugLevel
+     */
+    static void setDefaultDebugLevel(int level);
+
+protected:
+    class Impl;
+    Impl *m_d;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/rubberband/rubberband-c.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/rubberband/rubberband-c.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,142 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_C_API_H_
+#define _RUBBERBAND_C_API_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RUBBERBAND_VERSION "1.8.1"
+#define RUBBERBAND_API_MAJOR_VERSION 2
+#define RUBBERBAND_API_MINOR_VERSION 5
+
+/**
+ * This is a C-linkage interface to the Rubber Band time stretcher.
+ * 
+ * This is a wrapper interface: the primary interface is in C++ and is
+ * defined and documented in RubberBandStretcher.h.  The library
+ * itself is implemented in C++, and requires C++ standard library
+ * support even when using the C-linkage API.
+ *
+ * Please see RubberBandStretcher.h for documentation.
+ *
+ * If you are writing to the C++ API, do not include this header.
+ */
+
+enum RubberBandOption {
+
+    RubberBandOptionProcessOffline       = 0x00000000,
+    RubberBandOptionProcessRealTime      = 0x00000001,
+
+    RubberBandOptionStretchElastic       = 0x00000000,
+    RubberBandOptionStretchPrecise       = 0x00000010,
+    
+    RubberBandOptionTransientsCrisp      = 0x00000000,
+    RubberBandOptionTransientsMixed      = 0x00000100,
+    RubberBandOptionTransientsSmooth     = 0x00000200,
+
+    RubberBandOptionDetectorCompound     = 0x00000000,
+    RubberBandOptionDetectorPercussive   = 0x00000400,
+    RubberBandOptionDetectorSoft         = 0x00000800,
+
+    RubberBandOptionPhaseLaminar         = 0x00000000,
+    RubberBandOptionPhaseIndependent     = 0x00002000,
+    
+    RubberBandOptionThreadingAuto        = 0x00000000,
+    RubberBandOptionThreadingNever       = 0x00010000,
+    RubberBandOptionThreadingAlways      = 0x00020000,
+
+    RubberBandOptionWindowStandard       = 0x00000000,
+    RubberBandOptionWindowShort          = 0x00100000,
+    RubberBandOptionWindowLong           = 0x00200000,
+
+    RubberBandOptionSmoothingOff         = 0x00000000,
+    RubberBandOptionSmoothingOn          = 0x00800000,
+
+    RubberBandOptionFormantShifted       = 0x00000000,
+    RubberBandOptionFormantPreserved     = 0x01000000,
+
+    RubberBandOptionPitchHighQuality     = 0x00000000,
+    RubberBandOptionPitchHighSpeed       = 0x02000000,
+    RubberBandOptionPitchHighConsistency = 0x04000000,
+
+    RubberBandOptionChannelsApart        = 0x00000000,
+    RubberBandOptionChannelsTogether     = 0x10000000,
+};
+
+typedef int RubberBandOptions;
+
+struct RubberBandState_;
+typedef struct RubberBandState_ *RubberBandState;
+
+extern RubberBandState rubberband_new(unsigned int sampleRate,
+                                      unsigned int channels,
+                                      RubberBandOptions options,
+                                      double initialTimeRatio,
+                                      double initialPitchScale);
+
+extern void rubberband_delete(RubberBandState);
+
+extern void rubberband_reset(RubberBandState);
+
+extern void rubberband_set_time_ratio(RubberBandState, double ratio);
+extern void rubberband_set_pitch_scale(RubberBandState, double scale);
+
+extern double rubberband_get_time_ratio(const RubberBandState);
+extern double rubberband_get_pitch_scale(const RubberBandState);
+
+extern unsigned int rubberband_get_latency(const RubberBandState);
+
+extern void rubberband_set_transients_option(RubberBandState, RubberBandOptions options);
+extern void rubberband_set_detector_option(RubberBandState, RubberBandOptions options);
+extern void rubberband_set_phase_option(RubberBandState, RubberBandOptions options);
+extern void rubberband_set_formant_option(RubberBandState, RubberBandOptions options);
+extern void rubberband_set_pitch_option(RubberBandState, RubberBandOptions options);
+
+extern void rubberband_set_expected_input_duration(RubberBandState, unsigned int samples);
+
+extern unsigned int rubberband_get_samples_required(const RubberBandState);
+
+extern void rubberband_set_max_process_size(RubberBandState, unsigned int samples);
+extern void rubberband_set_key_frame_map(RubberBandState, unsigned int keyframecount, unsigned int *from, unsigned int *to);
+
+extern void rubberband_study(RubberBandState, const float *const *input, unsigned int samples, int final);
+extern void rubberband_process(RubberBandState, const float *const *input, unsigned int samples, int final);
+
+extern int rubberband_available(const RubberBandState);
+extern unsigned int rubberband_retrieve(const RubberBandState, float *const *output, unsigned int samples);
+
+extern unsigned int rubberband_get_channel_count(const RubberBandState);
+
+extern void rubberband_calculate_stretch(RubberBandState);
+
+extern void rubberband_set_debug_level(RubberBandState, int level);
+extern void rubberband_set_default_debug_level(int level);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/RubberBandStretcher.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/RubberBandStretcher.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,223 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "StretcherImpl.h"
+
+using namespace std;
+
+namespace RubberBand {
+
+
+RubberBandStretcher::RubberBandStretcher(size_t sampleRate,
+                                         size_t channels,
+                                         Options options,
+                                         double initialTimeRatio,
+                                         double initialPitchScale) :
+    m_d(new Impl(sampleRate, channels, options,
+                 initialTimeRatio, initialPitchScale))
+{
+}
+
+RubberBandStretcher::~RubberBandStretcher()
+{
+    delete m_d;
+}
+
+void
+RubberBandStretcher::reset()
+{
+    m_d->reset();
+}
+
+void
+RubberBandStretcher::setTimeRatio(double ratio)
+{
+    m_d->setTimeRatio(ratio);
+}
+
+void
+RubberBandStretcher::setPitchScale(double scale)
+{
+    m_d->setPitchScale(scale);
+}
+
+double
+RubberBandStretcher::getTimeRatio() const
+{
+    return m_d->getTimeRatio();
+}
+
+double
+RubberBandStretcher::getPitchScale() const
+{
+    return m_d->getPitchScale();
+}
+
+size_t
+RubberBandStretcher::getLatency() const
+{
+    return m_d->getLatency();
+}
+
+void
+RubberBandStretcher::setTransientsOption(Options options) 
+{
+    m_d->setTransientsOption(options);
+}
+
+void
+RubberBandStretcher::setDetectorOption(Options options) 
+{
+    m_d->setDetectorOption(options);
+}
+
+void
+RubberBandStretcher::setPhaseOption(Options options) 
+{
+    m_d->setPhaseOption(options);
+}
+
+void
+RubberBandStretcher::setFormantOption(Options options)
+{
+    m_d->setFormantOption(options);
+}
+
+void
+RubberBandStretcher::setPitchOption(Options options)
+{
+    m_d->setPitchOption(options);
+}
+
+void
+RubberBandStretcher::setExpectedInputDuration(size_t samples) 
+{
+    m_d->setExpectedInputDuration(samples);
+}
+
+void
+RubberBandStretcher::setMaxProcessSize(size_t samples)
+{
+    m_d->setMaxProcessSize(samples);
+}
+
+void
+RubberBandStretcher::setKeyFrameMap(const map<size_t, size_t> &mapping)
+{
+    m_d->setKeyFrameMap(mapping);
+}
+
+size_t
+RubberBandStretcher::getSamplesRequired() const
+{
+    return m_d->getSamplesRequired();
+}
+
+void
+RubberBandStretcher::study(const float *const *input, size_t samples,
+                           bool final)
+{
+    m_d->study(input, samples, final);
+}
+
+void
+RubberBandStretcher::process(const float *const *input, size_t samples,
+                             bool final)
+{
+    m_d->process(input, samples, final);
+}
+
+int
+RubberBandStretcher::available() const
+{
+    return m_d->available();
+}
+
+size_t
+RubberBandStretcher::retrieve(float *const *output, size_t samples) const
+{
+    return m_d->retrieve(output, samples);
+}
+
+float
+RubberBandStretcher::getFrequencyCutoff(int n) const
+{
+    return m_d->getFrequencyCutoff(n);
+}
+
+void
+RubberBandStretcher::setFrequencyCutoff(int n, float f) 
+{
+    m_d->setFrequencyCutoff(n, f);
+}
+
+size_t
+RubberBandStretcher::getInputIncrement() const
+{
+    return m_d->getInputIncrement();
+}
+
+vector<int>
+RubberBandStretcher::getOutputIncrements() const
+{
+    return m_d->getOutputIncrements();
+}
+
+vector<float>
+RubberBandStretcher::getPhaseResetCurve() const
+{
+    return m_d->getPhaseResetCurve();
+}
+
+vector<int>
+RubberBandStretcher::getExactTimePoints() const
+{
+    return m_d->getExactTimePoints();
+}
+
+size_t
+RubberBandStretcher::getChannelCount() const
+{
+    return m_d->getChannelCount();
+}
+
+void
+RubberBandStretcher::calculateStretch()
+{
+    m_d->calculateStretch();
+}
+
+void
+RubberBandStretcher::setDebugLevel(int level)
+{
+    m_d->setDebugLevel(level);
+}
+
+void
+RubberBandStretcher::setDefaultDebugLevel(int level)
+{
+    Impl::setDefaultDebugLevel(level);
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretchCalculator.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretchCalculator.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1013 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "StretchCalculator.h"
+
+#include <math.h>
+#include <iostream>
+#include <deque>
+#include <set>
+#include <cassert>
+#include <algorithm>
+
+#include "system/sysutils.h"
+
+namespace RubberBand
+{
+	
+StretchCalculator::StretchCalculator(size_t sampleRate,
+                                     size_t inputIncrement,
+                                     bool useHardPeaks) :
+    m_sampleRate(sampleRate),
+    m_increment(inputIncrement),
+    m_prevDf(0),
+    m_divergence(0),
+    m_recovery(0),
+    m_prevRatio(1.0),
+    m_transientAmnesty(0),
+    m_useHardPeaks(useHardPeaks)
+{
+//    std::cerr << "StretchCalculator::StretchCalculator: useHardPeaks = " << useHardPeaks << std::endl;
+}    
+
+StretchCalculator::~StretchCalculator()
+{
+}
+
+void
+StretchCalculator::setKeyFrameMap(const std::map<size_t, size_t> &mapping)
+{
+    m_keyFrameMap = mapping;
+
+    // Ensure we always have a 0 -> 0 mapping. If there's nothing in
+    // the map at all, don't need to worry about this (empty map is
+    // handled separately anyway)
+    if (!m_keyFrameMap.empty()) {
+        if (m_keyFrameMap.find(0) == m_keyFrameMap.end()) {
+            m_keyFrameMap[0] = 0;
+        }
+    }
+}
+
+std::vector<int>
+StretchCalculator::calculate(double ratio, size_t inputDuration,
+                             const std::vector<float> &phaseResetDf,
+                             const std::vector<float> &stretchDf)
+{
+    assert(phaseResetDf.size() == stretchDf.size());
+    
+    m_peaks = findPeaks(phaseResetDf);
+
+    size_t totalCount = phaseResetDf.size();
+
+    size_t outputDuration = lrint(inputDuration * ratio);
+
+    if (m_debugLevel > 0) {
+        std::cerr << "StretchCalculator::calculate(): inputDuration " << inputDuration << ", ratio " << ratio << ", outputDuration " << outputDuration;
+    }
+
+    outputDuration = lrint((phaseResetDf.size() * m_increment) * ratio);
+
+    if (m_debugLevel > 0) {
+        std::cerr << " (rounded up to " << outputDuration << ")";
+        std::cerr << ", df size " << phaseResetDf.size() << ", increment "
+                  << m_increment << std::endl;
+    }
+
+    std::vector<Peak> peaks; // peak position (in chunks) and hardness
+    std::vector<size_t> targets; // targets for mapping peaks (in samples)
+    mapPeaks(peaks, targets, outputDuration, totalCount);
+
+    if (m_debugLevel > 1) {
+        std::cerr << "have " << peaks.size() << " fixed positions" << std::endl;
+    }
+
+    size_t totalInput = 0, totalOutput = 0;
+
+    // For each region between two consecutive time sync points, we
+    // want to take the number of output chunks to be allocated and
+    // the detection function values within the range, and produce a
+    // series of increments that sum to the number of output chunks,
+    // such that each increment is displaced from the input increment
+    // by an amount inversely proportional to the magnitude of the
+    // stretch detection function at that input step.
+
+    size_t regionTotalChunks = 0;
+
+    std::vector<int> increments;
+
+    for (size_t i = 0; i <= peaks.size(); ++i) {
+        
+        size_t regionStart, regionStartChunk, regionEnd, regionEndChunk;
+        bool phaseReset = false;
+
+        if (i == 0) {
+            regionStartChunk = 0;
+            regionStart = 0;
+        } else {
+            regionStartChunk = peaks[i-1].chunk;
+            regionStart = targets[i-1];
+            phaseReset = peaks[i-1].hard;
+        }
+
+        if (i == peaks.size()) {
+//            std::cerr << "note: i (=" << i << ") == peaks.size(); regionEndChunk " << regionEndChunk << " -> " << totalCount << ", regionEnd " << regionEnd << " -> " << outputDuration << std::endl;
+            regionEndChunk = totalCount;
+            regionEnd = outputDuration;
+        } else {
+            regionEndChunk = peaks[i].chunk;
+            regionEnd = targets[i];
+        }
+
+        if (regionStartChunk > totalCount) regionStartChunk = totalCount;
+        if (regionStart > outputDuration) regionStart = outputDuration;
+        if (regionEndChunk > totalCount) regionEndChunk = totalCount;
+        if (regionEnd > outputDuration) regionEnd = outputDuration;
+        
+        size_t regionDuration = regionEnd - regionStart;
+        regionTotalChunks += regionDuration;
+
+        std::vector<float> dfRegion;
+
+        for (size_t j = regionStartChunk; j != regionEndChunk; ++j) {
+            dfRegion.push_back(stretchDf[j]);
+        }
+
+        if (m_debugLevel > 1) {
+            std::cerr << "distributeRegion from " << regionStartChunk << " to " << regionEndChunk << " (samples " << regionStart << " to " << regionEnd << ")" << std::endl;
+        }
+
+        dfRegion = smoothDF(dfRegion);
+        
+        std::vector<int> regionIncrements = distributeRegion
+            (dfRegion, regionDuration, ratio, phaseReset);
+
+        size_t totalForRegion = 0;
+
+        for (size_t j = 0; j < regionIncrements.size(); ++j) {
+
+            int incr = regionIncrements[j];
+
+            if (j == 0 && phaseReset) increments.push_back(-incr);
+            else increments.push_back(incr);
+
+            if (incr > 0) totalForRegion += incr;
+            else totalForRegion += -incr;
+
+            totalInput += m_increment;
+        }
+
+        if (totalForRegion != regionDuration) {
+            std::cerr << "*** ERROR: distributeRegion returned wrong duration " << totalForRegion << ", expected " << regionDuration << std::endl;
+        }
+
+        totalOutput += totalForRegion;
+    }
+
+    if (m_debugLevel > 0) {
+        std::cerr << "total input increment = " << totalInput << " (= " << totalInput / m_increment << " chunks), output = " << totalOutput << ", ratio = " << double(totalOutput)/double(totalInput) << ", ideal output " << size_t(ceil(totalInput * ratio)) << std::endl;
+        std::cerr << "(region total = " << regionTotalChunks << ")" << std::endl;
+    }
+
+    return increments;
+}
+
+void
+StretchCalculator::mapPeaks(std::vector<Peak> &peaks,
+                            std::vector<size_t> &targets,
+                            size_t outputDuration,
+                            size_t totalCount)
+{
+    // outputDuration is in audio samples; totalCount is in chunks
+
+    if (m_keyFrameMap.empty()) {
+        // "normal" behaviour -- fixed points are strictly in
+        // proportion
+        peaks = m_peaks;
+        for (size_t i = 0; i < peaks.size(); ++i) {
+            targets.push_back
+                (lrint((double(peaks[i].chunk) * outputDuration) / totalCount));
+        }
+        return;
+    }
+
+    // We have been given a set of source -> target sample frames in
+    // m_keyFrameMap.  We want to ensure that (to the nearest chunk) these
+    // are followed exactly, and any fixed points that we calculated
+    // ourselves are interpolated in linear proportion in between.
+
+    size_t peakidx = 0;
+    std::map<size_t, size_t>::const_iterator mi = m_keyFrameMap.begin();
+
+    // NB we know for certain we have a mapping from 0 -> 0 (or at
+    // least, some mapping for source sample 0) because that is
+    // enforced in setLockPoints above.  However, we aren't guaranteed
+    // to have a mapping for the total duration -- we will usually
+    // need to assume it maps to the normal duration * ratio sample
+
+    while (mi != m_keyFrameMap.end()) {
+
+//        std::cerr << "mi->first is " << mi->first << ", second is " << mi->second <<std::endl;
+
+        // The map we've been given is from sample to sample, but
+        // we can only map from chunk to sample.  We should perhaps
+        // adjust the target sample to compensate for the discrepancy
+        // between the chunk position and the exact requested source
+        // sample.  But we aren't doing that yet.
+
+        size_t sourceStartChunk = mi->first / m_increment;
+        size_t sourceEndChunk = totalCount;
+
+        size_t targetStartSample = mi->second;
+        size_t targetEndSample = outputDuration;
+
+        ++mi;
+        if (mi != m_keyFrameMap.end()) {
+            sourceEndChunk = mi->first / m_increment;
+            targetEndSample = mi->second;
+        }
+
+        if (sourceStartChunk >= totalCount ||
+            sourceStartChunk >= sourceEndChunk ||
+            targetStartSample >= outputDuration ||
+            targetStartSample >= targetEndSample) {
+            std::cerr << "NOTE: ignoring mapping from chunk " << sourceStartChunk << " to sample " << targetStartSample << "\n(source or target chunk exceeds total count, or end is not later than start)" << std::endl;
+            continue;
+        }
+        
+        // one peak and target for the mapping, then one for each of
+        // the computed peaks that appear before the following mapping
+
+        Peak p;
+        p.chunk = sourceStartChunk;
+        p.hard = false; // mappings are in time only, not phase reset points
+        peaks.push_back(p);
+        targets.push_back(targetStartSample);
+
+        if (m_debugLevel > 1) {
+            std::cerr << "mapped chunk " << sourceStartChunk << " (frame " << sourceStartChunk * m_increment << ") -> " << targetStartSample << std::endl;
+        }
+
+        while (peakidx < m_peaks.size()) {
+
+            size_t pchunk = m_peaks[peakidx].chunk;
+
+            if (pchunk < sourceStartChunk) {
+                // shouldn't happen, should have been dealt with
+                // already -- but no harm in ignoring it explicitly
+                ++peakidx;
+                continue;
+            }
+            if (pchunk == sourceStartChunk) {
+                // convert that last peak to a hard one, after all
+                peaks[peaks.size()-1].hard = true;
+                ++peakidx;
+                continue;
+            }
+            if (pchunk >= sourceEndChunk) {
+                // leave the rest for after the next mapping
+                break;
+            }
+            p.chunk = pchunk;
+            p.hard = m_peaks[peakidx].hard;
+
+            double proportion =
+                double(pchunk - sourceStartChunk) /
+                double(sourceEndChunk - sourceStartChunk);
+            
+            size_t target =
+                targetStartSample +
+                lrint(proportion *
+                      (targetEndSample - targetStartSample));
+
+            if (target <= targets[targets.size()-1] + m_increment) {
+                // peaks will become too close together afterwards, ignore
+                ++peakidx;
+                continue;
+            }
+
+            if (m_debugLevel > 1) {
+                std::cerr << "  peak chunk " << pchunk << " (frame " << pchunk * m_increment << ") -> " << target << std::endl;
+            }
+
+            peaks.push_back(p);
+            targets.push_back(target);
+            ++peakidx;
+        }
+    }
+}    
+
+int
+StretchCalculator::calculateSingle(double ratio,
+                                   float df,
+                                   size_t increment)
+{
+    if (increment == 0) increment = m_increment;
+
+    bool isTransient = false;
+
+    // We want to ensure, as close as possible, that the phase reset
+    // points appear at _exactly_ the right audio frame numbers.
+
+    // In principle, the threshold depends on chunk size: larger chunk
+    // sizes need higher thresholds.  Since chunk size depends on
+    // ratio, I suppose we could in theory calculate the threshold
+    // from the ratio directly.  For the moment we're happy if it
+    // works well in common situations.
+
+    float transientThreshold = 0.35f;
+//    if (ratio > 1) transientThreshold = 0.25f;
+
+    if (m_useHardPeaks && df > m_prevDf * 1.1f && df > transientThreshold) {
+        isTransient = true;
+    }
+
+    if (m_debugLevel > 2) {
+        std::cerr << "df = " << df << ", prevDf = " << m_prevDf
+                  << ", thresh = " << transientThreshold << std::endl;
+    }
+
+    m_prevDf = df;
+
+    bool ratioChanged = (ratio != m_prevRatio);
+    m_prevRatio = ratio;
+
+    if (isTransient && m_transientAmnesty == 0) {
+        if (m_debugLevel > 1) {
+            std::cerr << "StretchCalculator::calculateSingle: transient (df " << df << ", threshold " << transientThreshold << ")" << std::endl;
+        }
+        m_divergence += increment - (increment * ratio);
+
+        // as in offline mode, 0.05 sec approx min between transients
+        m_transientAmnesty =
+            lrint(ceil(double(m_sampleRate) / (20 * double(increment))));
+
+        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+        return -int(increment);
+    }
+
+    if (ratioChanged) {
+        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+    }
+
+    if (m_transientAmnesty > 0) --m_transientAmnesty;
+
+    int incr = lrint(increment * ratio - m_recovery);
+    if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
+        std::cerr << "divergence = " << m_divergence << ", recovery = " << m_recovery << ", incr = " << incr << ", ";
+    }
+    if (incr < lrint((increment * ratio) / 2)) {
+        incr = lrint((increment * ratio) / 2);
+    } else if (incr > lrint(increment * ratio * 2)) {
+        incr = lrint(increment * ratio * 2);
+    }
+
+    double divdiff = (increment * ratio) - incr;
+
+    if (m_debugLevel > 2 || (m_debugLevel > 1 && m_divergence != 0)) {
+        std::cerr << "divdiff = " << divdiff << std::endl;
+    }
+
+    double prevDivergence = m_divergence;
+    m_divergence -= divdiff;
+    if ((prevDivergence < 0 && m_divergence > 0) ||
+        (prevDivergence > 0 && m_divergence < 0)) {
+        m_recovery = m_divergence / ((m_sampleRate / 10.0) / increment);
+    }
+
+    return incr;
+}
+
+void
+StretchCalculator::reset()
+{
+    m_prevDf = 0;
+    m_divergence = 0;
+}
+
+std::vector<StretchCalculator::Peak>
+StretchCalculator::findPeaks(const std::vector<float> &rawDf)
+{
+    std::vector<float> df = smoothDF(rawDf);
+
+    // We distinguish between "soft" and "hard" peaks.  A soft peak is
+    // simply the result of peak-picking on the smoothed onset
+    // detection function, and it represents any (strong-ish) onset.
+    // We aim to ensure always that soft peaks are placed at the
+    // correct position in time.  A hard peak is where there is a very
+    // rapid rise in detection function, and it presumably represents
+    // a more broadband, noisy transient.  For these we perform a
+    // phase reset (if in the appropriate mode), and we locate the
+    // reset at the first point where we notice enough of a rapid
+    // rise, rather than necessarily at the peak itself, in order to
+    // preserve the shape of the transient.
+            
+    std::set<size_t> hardPeakCandidates;
+    std::set<size_t> softPeakCandidates;
+
+    if (m_useHardPeaks) {
+
+        // 0.05 sec approx min between hard peaks
+        size_t hardPeakAmnesty = lrint(ceil(double(m_sampleRate) /
+                                            (20 * double(m_increment))));
+        size_t prevHardPeak = 0;
+
+        if (m_debugLevel > 1) {
+            std::cerr << "hardPeakAmnesty = " << hardPeakAmnesty << std::endl;
+        }
+
+        for (size_t i = 1; i + 1 < df.size(); ++i) {
+
+            if (df[i] < 0.1) continue;
+            if (df[i] <= df[i-1] * 1.1) continue;
+            if (df[i] < 0.22) continue;
+
+            if (!hardPeakCandidates.empty() &&
+                i < prevHardPeak + hardPeakAmnesty) {
+                continue;
+            }
+
+            bool hard = (df[i] > 0.4);
+            
+            if (hard && (m_debugLevel > 1)) {
+                std::cerr << "hard peak at " << i << ": " << df[i] 
+                          << " > absolute " << 0.4
+                          << std::endl;
+            }
+
+            if (!hard) {
+                hard = (df[i] > df[i-1] * 1.4);
+
+                if (hard && (m_debugLevel > 1)) {
+                    std::cerr << "hard peak at " << i << ": " << df[i] 
+                              << " > prev " << df[i-1] << " * 1.4"
+                              << std::endl;
+                }
+            }
+
+            if (!hard && i > 1) {
+                hard = (df[i]   > df[i-1] * 1.2 &&
+                        df[i-1] > df[i-2] * 1.2);
+
+                if (hard && (m_debugLevel > 1)) {
+                    std::cerr << "hard peak at " << i << ": " << df[i] 
+                              << " > prev " << df[i-1] << " * 1.2 and "
+                              << df[i-1] << " > prev " << df[i-2] << " * 1.2"
+                              << std::endl;
+                }
+            }
+
+            if (!hard && i > 2) {
+                // have already established that df[i] > df[i-1] * 1.1
+                hard = (df[i] > 0.3 &&
+                        df[i-1] > df[i-2] * 1.1 &&
+                        df[i-2] > df[i-3] * 1.1);
+
+                if (hard && (m_debugLevel > 1)) {
+                    std::cerr << "hard peak at " << i << ": " << df[i] 
+                              << " > prev " << df[i-1] << " * 1.1 and "
+                              << df[i-1] << " > prev " << df[i-2] << " * 1.1 and "
+                              << df[i-2] << " > prev " << df[i-3] << " * 1.1"
+                              << std::endl;
+                }
+            }
+
+            if (!hard) continue;
+
+//            (df[i+1] > df[i] && df[i+1] > df[i-1] * 1.8) ||
+//                df[i] > 0.4) {
+
+            size_t peakLocation = i;
+
+            if (i + 1 < rawDf.size() &&
+                rawDf[i + 1] > rawDf[i] * 1.4) {
+
+                ++peakLocation;
+
+                if (m_debugLevel > 1) {
+                    std::cerr << "pushing hard peak forward to " << peakLocation << ": " << df[peakLocation] << " > " << df[peakLocation-1] << " * " << 1.4 << std::endl;
+                }
+            }
+
+            hardPeakCandidates.insert(peakLocation);
+            prevHardPeak = peakLocation;
+        }
+    }
+
+    size_t medianmaxsize = lrint(ceil(double(m_sampleRate) /
+                                 double(m_increment))); // 1 sec ish
+
+    if (m_debugLevel > 1) {
+        std::cerr << "mediansize = " << medianmaxsize << std::endl;
+    }
+    if (medianmaxsize < 7) {
+        medianmaxsize = 7;
+        if (m_debugLevel > 1) {
+            std::cerr << "adjusted mediansize = " << medianmaxsize << std::endl;
+        }
+    }
+
+    int minspacing = lrint(ceil(double(m_sampleRate) /
+                                (20 * double(m_increment)))); // 0.05 sec ish
+    
+    std::deque<float> medianwin;
+    std::vector<float> sorted;
+    int softPeakAmnesty = 0;
+
+    for (size_t i = 0; i < medianmaxsize/2; ++i) {
+        medianwin.push_back(0);
+    }
+    for (size_t i = 0; i < medianmaxsize/2 && i < df.size(); ++i) {
+        medianwin.push_back(df[i]);
+    }
+
+    size_t lastSoftPeak = 0;
+
+    for (size_t i = 0; i < df.size(); ++i) {
+        
+        size_t mediansize = medianmaxsize;
+
+        if (medianwin.size() < mediansize) {
+            mediansize = medianwin.size();
+        }
+
+        size_t middle = medianmaxsize / 2;
+        if (middle >= mediansize) middle = mediansize-1;
+
+        size_t nextDf = i + mediansize - middle;
+
+        if (mediansize < 2) {
+            if (mediansize > medianmaxsize) { // absurd, but never mind that
+                medianwin.pop_front();
+            }
+            if (nextDf < df.size()) {
+                medianwin.push_back(df[nextDf]);
+            } else {
+                medianwin.push_back(0);
+            }
+            continue;
+        }
+
+        if (m_debugLevel > 2) {
+//            std::cerr << "have " << mediansize << " in median buffer" << std::endl;
+        }
+
+        sorted.clear();
+        for (size_t j = 0; j < mediansize; ++j) {
+            sorted.push_back(medianwin[j]);
+        }
+        std::sort(sorted.begin(), sorted.end());
+
+        size_t n = 90; // percentile above which we pick peaks
+        size_t index = (sorted.size() * n) / 100;
+        if (index >= sorted.size()) index = sorted.size()-1;
+        if (index == sorted.size()-1 && index > 0) --index;
+        float thresh = sorted[index];
+
+//        if (m_debugLevel > 2) {
+//            std::cerr << "medianwin[" << middle << "] = " << medianwin[middle] << ", thresh = " << thresh << std::endl;
+//            if (medianwin[middle] == 0.f) {
+//                std::cerr << "contents: ";
+//                for (size_t j = 0; j < medianwin.size(); ++j) {
+//                    std::cerr << medianwin[j] << " ";
+//                }
+//                std::cerr << std::endl;
+//            }
+//        }
+
+        if (medianwin[middle] > thresh &&
+            medianwin[middle] > medianwin[middle-1] &&
+            medianwin[middle] > medianwin[middle+1] &&
+            softPeakAmnesty == 0) {
+
+            size_t maxindex = middle;
+            float maxval = medianwin[middle];
+
+            for (size_t j = middle+1; j < mediansize; ++j) {
+                if (medianwin[j] > maxval) {
+                    maxval = medianwin[j];
+                    maxindex = j;
+                } else if (medianwin[j] < medianwin[middle]) {
+                    break;
+                }
+            }
+
+            size_t peak = i + maxindex - middle;
+
+//            std::cerr << "i = " << i << ", maxindex = " << maxindex << ", middle = " << middle << ", so peak at " << peak << std::endl;
+
+            if (softPeakCandidates.empty() || lastSoftPeak != peak) {
+
+                if (m_debugLevel > 1) {
+                    std::cerr << "soft peak at " << peak << " ("
+                              << peak * m_increment << "): "
+                              << medianwin[middle] << " > "
+                              << thresh << " and "
+                              << medianwin[middle]
+                              << " > " << medianwin[middle-1] << " and "
+                              << medianwin[middle]
+                              << " > " << medianwin[middle+1]
+                              << std::endl;
+                }
+
+                if (peak >= df.size()) {
+                    if (m_debugLevel > 2) {
+                        std::cerr << "peak is beyond end"  << std::endl;
+                    }
+                } else {
+                    softPeakCandidates.insert(peak);
+                    lastSoftPeak = peak;
+                }
+            }
+
+            softPeakAmnesty = minspacing + maxindex - middle;
+            if (m_debugLevel > 2) {
+                std::cerr << "amnesty = " << softPeakAmnesty << std::endl;
+            }
+
+        } else if (softPeakAmnesty > 0) --softPeakAmnesty;
+
+        if (mediansize >= medianmaxsize) {
+            medianwin.pop_front();
+        }
+        if (nextDf < df.size()) {
+            medianwin.push_back(df[nextDf]);
+        } else {
+            medianwin.push_back(0);
+        }
+    }
+
+    std::vector<Peak> peaks;
+
+    while (!hardPeakCandidates.empty() || !softPeakCandidates.empty()) {
+
+        bool haveHardPeak = !hardPeakCandidates.empty();
+        bool haveSoftPeak = !softPeakCandidates.empty();
+
+        size_t hardPeak = (haveHardPeak ? *hardPeakCandidates.begin() : 0);
+        size_t softPeak = (haveSoftPeak ? *softPeakCandidates.begin() : 0);
+
+        Peak peak;
+        peak.hard = false;
+        peak.chunk = softPeak;
+
+        bool ignore = false;
+
+        if (haveHardPeak &&
+            (!haveSoftPeak || hardPeak <= softPeak)) {
+
+            if (m_debugLevel > 2) {
+                std::cerr << "Hard peak: " << hardPeak << std::endl;
+            }
+
+            peak.hard = true;
+            peak.chunk = hardPeak;
+            hardPeakCandidates.erase(hardPeakCandidates.begin());
+
+        } else {
+            if (m_debugLevel > 2) {
+                std::cerr << "Soft peak: " << softPeak << std::endl;
+            }
+            if (!peaks.empty() &&
+                peaks[peaks.size()-1].hard &&
+                peaks[peaks.size()-1].chunk + 3 >= softPeak) {
+                if (m_debugLevel > 2) {
+                    std::cerr << "(ignoring, as we just had a hard peak)"
+                              << std::endl;
+                }
+                ignore = true;
+            }
+        }            
+
+        if (haveSoftPeak && peak.chunk == softPeak) {
+            softPeakCandidates.erase(softPeakCandidates.begin());
+        }
+
+        if (!ignore) {
+            peaks.push_back(peak);
+        }
+    }                
+
+    return peaks;
+}
+
+std::vector<float>
+StretchCalculator::smoothDF(const std::vector<float> &df)
+{
+    std::vector<float> smoothedDF;
+    
+    for (size_t i = 0; i < df.size(); ++i) {
+        // three-value moving mean window for simple smoothing
+        float total = 0.f, count = 0;
+        if (i > 0) { total += df[i-1]; ++count; }
+        total += df[i]; ++count;
+        if (i+1 < df.size()) { total += df[i+1]; ++count; }
+        float mean = total / count;
+        smoothedDF.push_back(mean);
+    }
+
+    return smoothedDF;
+}
+
+std::vector<int>
+StretchCalculator::distributeRegion(const std::vector<float> &dfIn,
+                                    size_t duration, float ratio, bool phaseReset)
+{
+    std::vector<float> df(dfIn);
+    std::vector<int> increments;
+
+    // The peak for the stretch detection function may appear after
+    // the peak that we're using to calculate the start of the region.
+    // We don't want that.  If we find a peak in the first half of
+    // the region, we should set all the values up to that point to
+    // the same value as the peak.
+
+    // (This might not be subtle enough, especially if the region is
+    // long -- we want a bound that corresponds to acoustic perception
+    // of the audible bounce.)
+
+    for (size_t i = 1; i < df.size()/2; ++i) {
+        if (df[i] < df[i-1]) {
+            if (m_debugLevel > 1) {
+                std::cerr << "stretch peak offset: " << i-1 << " (peak " << df[i-1] << ")" << std::endl;
+            }
+            for (size_t j = 0; j < i-1; ++j) {
+                df[j] = df[i-1];
+            }
+            break;
+        }
+    }
+
+    float maxDf = 0;
+
+    for (size_t i = 0; i < df.size(); ++i) {
+        if (i == 0 || df[i] > maxDf) maxDf = df[i];
+    }
+
+    // We want to try to ensure the last 100ms or so (if possible) are
+    // tending back towards the maximum df, so that the stretchiness
+    // reduces at the end of the stretched region.
+    
+    int reducedRegion = lrint((0.1 * m_sampleRate) / m_increment);
+    if (reducedRegion > int(df.size()/5)) reducedRegion = df.size()/5;
+
+    for (int i = 0; i < reducedRegion; ++i) {
+        size_t index = df.size() - reducedRegion + i;
+        df[index] = df[index] + ((maxDf - df[index]) * i) / reducedRegion;
+    }
+
+    long toAllot = long(duration) - long(m_increment * df.size());
+    
+    if (m_debugLevel > 1) {
+        std::cerr << "region of " << df.size() << " chunks, output duration " << duration << ", increment " << m_increment << ", toAllot " << toAllot << std::endl;
+    }
+
+    size_t totalIncrement = 0;
+
+    // We place limits on the amount of displacement per chunk.  if
+    // ratio < 0, no increment should be larger than increment*ratio
+    // or smaller than increment*ratio/2; if ratio > 0, none should be
+    // smaller than increment*ratio or larger than increment*ratio*2.
+    // We need to enforce this in the assignment of displacements to
+    // allotments, not by trying to respond if something turns out
+    // wrong.
+
+    // Note that the ratio is only provided to this function for the
+    // purposes of establishing this bound to the displacement.
+    
+    // so if
+    // maxDisplacement / totalDisplacement > increment * ratio*2 - increment
+    // (for ratio > 1)
+    // or
+    // maxDisplacement / totalDisplacement < increment * ratio/2
+    // (for ratio < 1)
+
+    // then we need to adjust and accommodate
+    
+    double totalDisplacement = 0;
+    double maxDisplacement = 0; // min displacement will be 0 by definition
+
+    maxDf = 0;
+    float adj = 0;
+
+    bool tooShort = true, tooLong = true;
+    const int acceptableIterations = 10;
+    int iteration = 0;
+    int prevExtreme = 0;
+    bool better = false;
+
+    while ((tooLong || tooShort) && iteration < acceptableIterations) {
+
+        ++iteration;
+
+        tooLong = false;
+        tooShort = false;
+        calculateDisplacements(df, maxDf, totalDisplacement, maxDisplacement,
+                               adj);
+
+        if (m_debugLevel > 1) {
+            std::cerr << "totalDisplacement " << totalDisplacement << ", max " << maxDisplacement << " (maxDf " << maxDf << ", df count " << df.size() << ")" << std::endl;
+        }
+
+        if (totalDisplacement == 0) {
+// Not usually a problem, in fact
+//            std::cerr << "WARNING: totalDisplacement == 0 (duration " << duration << ", " << df.size() << " values in df)" << std::endl;
+            if (!df.empty() && adj == 0) {
+                tooLong = true; tooShort = true;
+                adj = 1;
+            }
+            continue;
+        }
+
+        int extremeIncrement = m_increment +
+            lrint((toAllot * maxDisplacement) / totalDisplacement);
+
+        if (extremeIncrement < 0) {
+            if (m_debugLevel > 0) {
+                std::cerr << "NOTE: extreme increment " << extremeIncrement << " < 0, adjusting" << std::endl;
+            }
+            tooShort = true;
+        } else {
+            if (ratio < 1.0) {
+                if (extremeIncrement > lrint(ceil(m_increment * ratio))) {
+                    std::cerr << "WARNING: extreme increment "
+                              << extremeIncrement << " > "
+                              << m_increment * ratio << std::endl;
+                } else if (extremeIncrement < (m_increment * ratio) / 2) {
+                    if (m_debugLevel > 0) {
+                        std::cerr << "NOTE: extreme increment "
+                                  << extremeIncrement << " < " 
+                                  << (m_increment * ratio) / 2
+                                  << ", adjusting" << std::endl;
+                    }
+                    tooShort = true;
+                    if (iteration > 0) {
+                        better = (extremeIncrement > prevExtreme);
+                    }
+                    prevExtreme = extremeIncrement;
+                }
+            } else {
+                if (extremeIncrement > m_increment * ratio * 2) {
+                    if (m_debugLevel > 0) {
+                        std::cerr << "NOTE: extreme increment "
+                                  << extremeIncrement << " > "
+                                  << m_increment * ratio * 2
+                                  << ", adjusting" << std::endl;
+                    }
+                    tooLong = true;
+                    if (iteration > 0) {
+                        better = (extremeIncrement < prevExtreme);
+                    }
+                    prevExtreme = extremeIncrement;
+                } else if (extremeIncrement < lrint(floor(m_increment * ratio))) {
+                    std::cerr << "WARNING: extreme increment "
+                              << extremeIncrement << " < "
+                              << m_increment * ratio << std::endl;
+                }
+            }
+        }
+
+        if (tooLong || tooShort) {
+            // Need to make maxDisplacement smaller as a proportion of
+            // the total displacement, yet ensure that the
+            // displacements still sum to the total.
+            adj += maxDf/10;
+        }
+    }
+
+    if (tooLong) {
+        if (better) {
+            // we were iterating in the right direction, so
+            // leave things as they are (and undo that last tweak)
+            std::cerr << "WARNING: No acceptable displacement adjustment found, using latest values:\nthis region could sound bad" << std::endl;
+            adj -= maxDf/10;
+        } else {
+            std::cerr << "WARNING: No acceptable displacement adjustment found, using defaults:\nthis region could sound bad" << std::endl;
+            adj = 1;
+            calculateDisplacements(df, maxDf, totalDisplacement, maxDisplacement,
+                                   adj);
+        }
+    } else if (tooShort) {
+        std::cerr << "WARNING: No acceptable displacement adjustment found, using flat distribution:\nthis region could sound bad" << std::endl;
+        adj = 1;
+        for (size_t i = 0; i < df.size(); ++i) {
+            df[i] = 1.f;
+        }
+        calculateDisplacements(df, maxDf, totalDisplacement, maxDisplacement,
+                               adj);
+    }
+
+    for (size_t i = 0; i < df.size(); ++i) {
+
+        double displacement = maxDf - df[i];
+        if (displacement < 0) displacement -= adj;
+        else displacement += adj;
+
+        if (i == 0 && phaseReset) {
+            if (m_debugLevel > 2) {
+                std::cerr << "Phase reset at first chunk" << std::endl;
+            }
+            if (df.size() == 1) {
+                increments.push_back(duration);
+                totalIncrement += duration;
+            } else {
+                increments.push_back(m_increment);
+                totalIncrement += m_increment;
+            }
+            totalDisplacement -= displacement;
+            continue;
+        }
+
+        double theoreticalAllotment = 0;
+
+        if (totalDisplacement != 0) {
+            theoreticalAllotment = (toAllot * displacement) / totalDisplacement;
+        }
+        int allotment = lrint(theoreticalAllotment);
+        if (i + 1 == df.size()) allotment = toAllot;
+
+        int increment = m_increment + allotment;
+
+        if (increment < 0) {
+            // this is a serious problem, the allocation is quite
+            // wrong if it allows increment to diverge so far from the
+            // input increment (though it can happen legitimately if
+            // asked to squash very violently)
+            std::cerr << "*** WARNING: increment " << increment << " <= 0, rounding to zero" << std::endl;
+
+            toAllot += m_increment;
+            increment = 0;
+
+        } else {
+            toAllot -= allotment;
+        }
+
+        increments.push_back(increment);
+        totalIncrement += increment;
+
+        totalDisplacement -= displacement;
+
+        if (m_debugLevel > 2) {
+            std::cerr << "df " << df[i] << ", smoothed " << df[i] << ", disp " << displacement << ", allot " << theoreticalAllotment << ", incr " << increment << ", remain " << toAllot << std::endl;
+        }
+    }
+    
+    if (m_debugLevel > 2) {
+        std::cerr << "total increment: " << totalIncrement << ", left over: " << toAllot << " to allot, displacement " << totalDisplacement << std::endl;
+    }
+
+    if (totalIncrement != duration) {
+        std::cerr << "*** WARNING: calculated output duration " << totalIncrement << " != expected " << duration << std::endl;
+    }
+
+    return increments;
+}
+
+void
+StretchCalculator::calculateDisplacements(const std::vector<float> &df,
+                                          float &maxDf,
+                                          double &totalDisplacement,
+                                          double &maxDisplacement,
+                                          float adj) const
+{
+    totalDisplacement = maxDisplacement = 0;
+
+    maxDf = 0;
+
+    for (size_t i = 0; i < df.size(); ++i) {
+        if (i == 0 || df[i] > maxDf) maxDf = df[i];
+    }
+
+    for (size_t i = 0; i < df.size(); ++i) {
+        double displacement = maxDf - df[i];
+        if (displacement < 0) displacement -= adj;
+        else displacement += adj;
+        totalDisplacement += displacement;
+        if (i == 0 || displacement > maxDisplacement) {
+            maxDisplacement = displacement;
+        }
+    }
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretchCalculator.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretchCalculator.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,121 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_STRETCH_CALCULATOR_H_
+#define _RUBBERBAND_STRETCH_CALCULATOR_H_
+
+#include <sys/types.h>
+
+#include <vector>
+#include <map>
+
+namespace RubberBand
+{
+
+class StretchCalculator
+{
+public:
+    StretchCalculator(size_t sampleRate, size_t inputIncrement, bool useHardPeaks);
+    virtual ~StretchCalculator();
+
+    /**
+     * Provide a set of mappings from "before" to "after" sample
+     * numbers so as to enforce a particular stretch profile.  This
+     * must be called before calculate().  The argument is a map from
+     * audio sample frame number in the source material to the
+     * corresponding sample frame number in the stretched output.
+     */
+    void setKeyFrameMap(const std::map<size_t, size_t> &mapping);
+    
+    /**
+     * Calculate phase increments for a region of audio, given the
+     * overall target stretch ratio, input duration in audio samples,
+     * and the audio curves to use for identifying phase lock points
+     * (lockAudioCurve) and for allocating stretches to relatively
+     * less prominent points (stretchAudioCurve).
+     */
+    std::vector<int> calculate(double ratio, size_t inputDuration,
+                               const std::vector<float> &lockAudioCurve,
+                               const std::vector<float> &stretchAudioCurve);
+
+    /**
+     * Calculate the phase increment for a single audio block, given
+     * the overall target stretch ratio and the block's value on the
+     * phase-lock audio curve.  State is retained between calls in the
+     * StretchCalculator object; call reset() to reset it.  This uses
+     * a less sophisticated method than the offline calculate().
+     *
+     * If increment is non-zero, use it for the input increment for
+     * this block in preference to m_increment.
+     */
+    int calculateSingle(double ratio, float curveValue,
+                        size_t increment = 0);
+
+    void setUseHardPeaks(bool use) { m_useHardPeaks = use; }
+
+    void reset();
+  
+    void setDebugLevel(int level) { m_debugLevel = level; }
+
+    struct Peak {
+        size_t chunk;
+        bool hard;
+    };
+    std::vector<Peak> getLastCalculatedPeaks() const { return m_peaks; }
+
+    std::vector<float> smoothDF(const std::vector<float> &df);
+
+protected:
+    std::vector<Peak> findPeaks(const std::vector<float> &audioCurve);
+
+    void mapPeaks(std::vector<Peak> &peaks, std::vector<size_t> &targets,
+                  size_t outputDuration, size_t totalCount);
+
+    std::vector<int> distributeRegion(const std::vector<float> &regionCurve,
+                                      size_t outputDuration, float ratio,
+                                      bool phaseReset);
+
+    void calculateDisplacements(const std::vector<float> &df,
+                                float &maxDf,
+                                double &totalDisplacement,
+                                double &maxDisplacement,
+                                float adj) const;
+
+    size_t m_sampleRate;
+    size_t m_blockSize;
+    size_t m_increment;
+    float m_prevDf;
+    double m_divergence;
+    float m_recovery;
+    float m_prevRatio;
+    int m_transientAmnesty; // only in RT mode; handled differently offline
+    int m_debugLevel;
+    bool m_useHardPeaks;
+
+    std::map<size_t, size_t> m_keyFrameMap;
+    std::vector<Peak> m_peaks;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretcherChannelData.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretcherChannelData.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,286 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "StretcherChannelData.h"
+
+#include "dsp/Resampler.h"
+
+#include "system/Allocators.h"
+
+namespace RubberBand 
+{
+      
+RubberBandStretcher::Impl::ChannelData::ChannelData(size_t windowSize,
+                                                    size_t fftSize,
+                                                    size_t outbufSize)
+{
+    std::set<size_t> s;
+    construct(s, windowSize, fftSize, outbufSize);
+}
+
+RubberBandStretcher::Impl::ChannelData::ChannelData(const std::set<size_t> &sizes,
+                                                    size_t initialWindowSize,
+                                                    size_t initialFftSize,
+                                                    size_t outbufSize)
+{
+    construct(sizes, initialWindowSize, initialFftSize, outbufSize);
+}
+
+void
+RubberBandStretcher::Impl::ChannelData::construct(const std::set<size_t> &sizes,
+                                                  size_t initialWindowSize,
+                                                  size_t initialFftSize,
+                                                  size_t outbufSize)
+{
+    size_t maxSize = initialWindowSize * 2;
+    if (initialFftSize > maxSize) maxSize = initialFftSize;
+
+//    std::cerr << "ChannelData::construct: initialWindowSize = " << initialWindowSize << ", initialFftSize = " << initialFftSize << ", outbufSize = " << outbufSize << std::endl;
+
+    // std::set is ordered by value
+    std::set<size_t>::const_iterator i = sizes.end();
+    if (i != sizes.begin()) {
+        --i;
+        if (*i > maxSize) maxSize = *i;
+    }
+
+    // max possible size of the real "half" of freq data
+    size_t realSize = maxSize / 2 + 1;
+
+//    std::cerr << "ChannelData::construct([" << sizes.size() << "], " << maxSize << ", " << realSize << ", " << outbufSize << ")" << std::endl;
+    
+    if (outbufSize < maxSize) outbufSize = maxSize;
+
+    inbuf = new RingBuffer<float>(maxSize);
+    outbuf = new RingBuffer<float>(outbufSize);
+
+    mag = allocate_and_zero<process_t>(realSize);
+    phase = allocate_and_zero<process_t>(realSize);
+    prevPhase = allocate_and_zero<process_t>(realSize);
+    prevError = allocate_and_zero<process_t>(realSize);
+    unwrappedPhase = allocate_and_zero<process_t>(realSize);
+    envelope = allocate_and_zero<process_t>(realSize);
+
+    fltbuf = allocate_and_zero<float>(maxSize);
+    dblbuf = allocate_and_zero<process_t>(maxSize);
+
+    accumulator = allocate_and_zero<float>(maxSize);
+    windowAccumulator = allocate_and_zero<float>(maxSize);
+    interpolator = allocate_and_zero<float>(maxSize);
+    interpolatorScale = 0;
+
+    for (std::set<size_t>::const_iterator i = sizes.begin();
+         i != sizes.end(); ++i) {
+        ffts[*i] = new FFT(*i);
+        if (sizeof(process_t) == sizeof(double)) {
+            ffts[*i]->initDouble();
+        } else {
+            ffts[*i]->initFloat();
+        }
+    }
+    fft = ffts[initialFftSize];
+
+    resampler = 0;
+    resamplebuf = 0;
+    resamplebufSize = 0;
+
+    reset();
+
+    // Avoid dividing opening sample (which will be discarded anyway) by zero
+    windowAccumulator[0] = 1.f;
+}
+
+
+void
+RubberBandStretcher::Impl::ChannelData::setSizes(size_t windowSize,
+                                                 size_t fftSize)
+{
+//    std::cerr << "ChannelData::setSizes: windowSize = " << windowSize << ", fftSize = " << fftSize << std::endl;
+
+    size_t maxSize = 2 * std::max(windowSize, fftSize);
+    size_t realSize = maxSize / 2 + 1;
+    size_t oldMax = inbuf->getSize();
+    size_t oldReal = oldMax / 2 + 1;
+
+    if (oldMax >= maxSize) {
+
+        // no need to reallocate buffers, just reselect fft
+
+        //!!! we can't actually do this without locking against the
+        //process thread, can we?  we need to zero the mag/phase
+        //buffers without interference
+
+        if (ffts.find(fftSize) == ffts.end()) {
+            //!!! this also requires a lock, but it shouldn't occur in
+            //RT mode with proper initialisation
+            ffts[fftSize] = new FFT(fftSize);
+            if (sizeof(process_t) == sizeof(double)) {
+                ffts[fftSize]->initDouble();
+            } else {
+                ffts[fftSize]->initFloat();
+            }
+        }
+        
+        fft = ffts[fftSize];
+
+        v_zero(fltbuf, maxSize);
+        v_zero(dblbuf, maxSize);
+
+        v_zero(mag, realSize);
+        v_zero(phase, realSize);
+        v_zero(prevPhase, realSize);
+        v_zero(prevError, realSize);
+        v_zero(unwrappedPhase, realSize);
+
+        return;
+    }
+
+    //!!! at this point we need a lock in case a different client
+    //thread is calling process() -- we need this lock even if we
+    //aren't running in threaded mode ourselves -- if we're in RT
+    //mode, then the process call should trylock and fail if the lock
+    //is unavailable (since this should never normally be the case in
+    //general use in RT mode)
+
+    RingBuffer<float> *newbuf = inbuf->resized(maxSize);
+    delete inbuf;
+    inbuf = newbuf;
+
+    // We don't want to preserve data in these arrays
+
+    mag = reallocate_and_zero(mag, oldReal, realSize);
+    phase = reallocate_and_zero(phase, oldReal, realSize);
+    prevPhase = reallocate_and_zero(prevPhase, oldReal, realSize);
+    prevError = reallocate_and_zero(prevError, oldReal, realSize);
+    unwrappedPhase = reallocate_and_zero(unwrappedPhase, oldReal, realSize);
+    envelope = reallocate_and_zero(envelope, oldReal, realSize);
+    fltbuf = reallocate_and_zero(fltbuf, oldMax, maxSize);
+    dblbuf = reallocate_and_zero(dblbuf, oldMax, maxSize);
+
+    interpolator = reallocate_and_zero<float>(interpolator, oldMax, maxSize);
+
+    // But we do want to preserve data in these
+
+    accumulator = reallocate_and_zero_extension
+        (accumulator, oldMax, maxSize);
+
+    windowAccumulator = reallocate_and_zero_extension
+        (windowAccumulator, oldMax, maxSize);
+
+    interpolatorScale = 0;
+    
+    //!!! and resampler?
+
+    if (ffts.find(fftSize) == ffts.end()) {
+        ffts[fftSize] = new FFT(fftSize);
+        if (sizeof(process_t) == sizeof(double)) {
+            ffts[fftSize]->initDouble();
+        } else {
+            ffts[fftSize]->initFloat();
+        }
+    }
+    
+    fft = ffts[fftSize];
+}
+
+void
+RubberBandStretcher::Impl::ChannelData::setOutbufSize(size_t outbufSize)
+{
+    size_t oldSize = outbuf->getSize();
+
+//    std::cerr << "ChannelData::setOutbufSize(" << outbufSize << ") [from " << oldSize << "]" << std::endl;
+
+    if (oldSize < outbufSize) {
+
+        //!!! at this point we need a lock in case a different client
+        //thread is calling process()
+
+        RingBuffer<float> *newbuf = outbuf->resized(outbufSize);
+        delete outbuf;
+        outbuf = newbuf;
+    }
+}
+
+void
+RubberBandStretcher::Impl::ChannelData::setResampleBufSize(size_t sz)
+{
+    resamplebuf = reallocate_and_zero<float>(resamplebuf, resamplebufSize, sz);
+    resamplebufSize = sz;
+}
+
+RubberBandStretcher::Impl::ChannelData::~ChannelData()
+{
+    delete resampler;
+
+    deallocate(resamplebuf);
+
+    delete inbuf;
+    delete outbuf;
+
+    deallocate(mag);
+    deallocate(phase);
+    deallocate(prevPhase);
+    deallocate(prevError);
+    deallocate(unwrappedPhase);
+    deallocate(envelope);
+    deallocate(accumulator);
+    deallocate(windowAccumulator);
+    deallocate(fltbuf);
+
+    for (std::map<size_t, FFT *>::iterator i = ffts.begin();
+         i != ffts.end(); ++i) {
+        delete i->second;
+    }
+}
+
+void
+RubberBandStretcher::Impl::ChannelData::reset()
+{
+    inbuf->reset();
+    outbuf->reset();
+
+    if (resampler) resampler->reset();
+
+    size_t size = inbuf->getSize();
+
+    for (size_t i = 0; i < size; ++i) {
+        accumulator[i] = 0.f;
+        windowAccumulator[i] = 0.f;
+    }
+
+    // Avoid dividing opening sample (which will be discarded anyway) by zero
+    windowAccumulator[0] = 1.f;
+    
+    accumulatorFill = 0;
+    prevIncrement = 0;
+    chunkCount = 0;
+    inCount = 0;
+    inputSize = -1;
+    outCount = 0;
+    interpolatorScale = 0;
+    unchanged = true;
+    draining = false;
+    outputComplete = false;
+}
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretcherChannelData.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretcherChannelData.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,147 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_STRETCHERCHANNELDATA_H_
+#define _RUBBERBAND_STRETCHERCHANNELDATA_H_
+
+#include "StretcherImpl.h"
+
+#include <set>
+
+//#define EXPERIMENT 1
+
+namespace RubberBand
+{
+
+class Resampler;
+
+class RubberBandStretcher::Impl::ChannelData
+{
+public:        
+    /**
+     * Construct a ChannelData structure.
+     *
+     * The sizes passed in here are for the time-domain analysis
+     * window and FFT calculation, and most of the buffer sizes also
+     * depend on them.  In practice they are always powers of two, the
+     * window and FFT sizes are either equal or generally in a 2:1
+     * relationship either way, and except for very extreme stretches
+     * the FFT size is either 1024, 2048 or 4096.
+     *
+     * The outbuf size depends on other factors as well, including
+     * the pitch scale factor and any maximum processing block
+     * size specified by the user of the code.
+     */
+    ChannelData(size_t windowSize,
+                size_t fftSize,
+                size_t outbufSize);
+
+    /**
+     * Construct a ChannelData structure that can process at different
+     * FFT sizes without requiring reallocation when the size changes.
+     * The sizes can subsequently be changed with a call to setSizes.
+     * Reallocation will only be necessary if setSizes is called with
+     * values not equal to any of those passed in to the constructor.
+     *
+     * The outbufSize should be the maximum possible outbufSize to
+     * avoid reallocation, which will happen if setOutbufSize is
+     * called subsequently.
+     */
+    ChannelData(const std::set<size_t> &sizes,
+                size_t initialWindowSize,
+                size_t initialFftSize,
+                size_t outbufSize);
+    ~ChannelData();
+
+    /**
+     * Reset buffers
+     */
+    void reset();
+
+    /**
+     * Set the FFT, analysis window, and buffer sizes.  If this
+     * ChannelData was constructed with a set of sizes and the given
+     * window and FFT sizes here were among them, no reallocation will
+     * be required.
+     */
+    void setSizes(size_t windowSize, size_t fftSizes);
+
+    /**
+     * Set the outbufSize for the channel data.  Reallocation will
+     * occur.
+     */
+    void setOutbufSize(size_t outbufSize);
+
+    /**
+     * Set the resampler buffer size.  Default if not called is no
+     * buffer allocated at all.
+     */
+    void setResampleBufSize(size_t resamplebufSize);
+    
+    RingBuffer<float> *inbuf;
+    RingBuffer<float> *outbuf;
+
+    process_t *mag;
+    process_t *phase;
+
+    process_t *prevPhase;
+    process_t *prevError;
+    process_t *unwrappedPhase;
+
+    float *accumulator;
+    size_t accumulatorFill;
+    float *windowAccumulator;
+    float *interpolator; // only used when time-domain smoothing is on
+    int interpolatorScale;
+
+    float *fltbuf;
+    process_t *dblbuf; // owned by FFT object, only used for time domain FFT i/o
+    process_t *envelope; // for cepstral formant shift
+    bool unchanged;
+
+    size_t prevIncrement; // only used in RT mode
+
+    size_t chunkCount;
+    size_t inCount;
+    long inputSize; // set only after known (when data ended); -1 previously
+    size_t outCount;
+
+    bool draining;
+    bool outputComplete;
+
+    FFT *fft;
+    std::map<size_t, FFT *> ffts;
+
+    Resampler *resampler;
+    float *resamplebuf;
+    size_t resamplebufSize;
+
+private:
+    void construct(const std::set<size_t> &sizes,
+                   size_t initialWindowSize, size_t initialFftSize,
+                   size_t outbufSize);
+};        
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretcherImpl.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretcherImpl.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1343 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "StretcherImpl.h"
+
+#include "audiocurves/PercussiveAudioCurve.h"
+#include "audiocurves/HighFrequencyAudioCurve.h"
+#include "audiocurves/SpectralDifferenceAudioCurve.h"
+#include "audiocurves/SilentAudioCurve.h"
+#include "audiocurves/ConstantAudioCurve.h"
+#include "audiocurves/CompoundAudioCurve.h"
+
+#include "dsp/Resampler.h"
+
+#include "StretchCalculator.h"
+#include "StretcherChannelData.h"
+
+#include "base/Profiler.h"
+
+#ifndef _WIN32
+#include <alloca.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <set>
+#include <map>
+
+using namespace RubberBand;
+
+using std::cerr;
+using std::endl;
+using std::vector;
+using std::map;
+using std::set;
+using std::max;
+using std::min;
+
+namespace RubberBand {
+
+const size_t
+RubberBandStretcher::Impl::m_defaultIncrement = 256;
+
+const size_t
+RubberBandStretcher::Impl::m_defaultFftSize = 2048;
+
+int
+RubberBandStretcher::Impl::m_defaultDebugLevel = 0;
+
+static bool _initialised = false;
+
+RubberBandStretcher::Impl::Impl(size_t sampleRate,
+                                size_t channels,
+                                Options options,
+                                double initialTimeRatio,
+                                double initialPitchScale) :
+    m_sampleRate(sampleRate),
+    m_channels(channels),
+    m_timeRatio(initialTimeRatio),
+    m_pitchScale(initialPitchScale),
+    m_fftSize(m_defaultFftSize),
+    m_aWindowSize(m_defaultFftSize),
+    m_sWindowSize(m_defaultFftSize),
+    m_increment(m_defaultIncrement),
+    m_outbufSize(m_defaultFftSize * 2),
+    m_maxProcessSize(m_defaultFftSize),
+    m_expectedInputDuration(0),
+#ifndef NO_THREADING
+    m_threaded(false),
+#endif
+    m_realtime(false),
+    m_options(options),
+    m_debugLevel(m_defaultDebugLevel),
+    m_mode(JustCreated),
+    m_awindow(0),
+    m_afilter(0),
+    m_swindow(0),
+    m_studyFFT(0),
+#ifndef NO_THREADING
+    m_spaceAvailable("space"),
+#endif
+    m_inputDuration(0),
+    m_detectorType(CompoundAudioCurve::CompoundDetector),
+    m_silentHistory(0),
+    m_lastProcessOutputIncrements(16),
+    m_lastProcessPhaseResetDf(16),
+    m_emergencyScavenger(10, 4),
+    m_phaseResetAudioCurve(0),
+    m_stretchAudioCurve(0),
+    m_silentAudioCurve(0),
+    m_stretchCalculator(0),
+    m_freq0(600),
+    m_freq1(1200),
+    m_freq2(12000),
+    m_baseFftSize(m_defaultFftSize)
+{
+    if (!_initialised) {
+        system_specific_initialise();
+        _initialised = true;
+    }
+
+    if (m_debugLevel > 0) {
+        cerr << "RubberBandStretcher::Impl::Impl: rate = " << m_sampleRate << ", options = " << options << endl;
+    }
+
+    // Window size will vary according to the audio sample rate, but
+    // we don't let it drop below the 48k default
+    m_rateMultiple = float(m_sampleRate) / 48000.f;
+//    if (m_rateMultiple < 1.f) m_rateMultiple = 1.f;
+    m_baseFftSize = roundUp(int(m_defaultFftSize * m_rateMultiple));
+
+    if ((options & OptionWindowShort) || (options & OptionWindowLong)) {
+        if ((options & OptionWindowShort) && (options & OptionWindowLong)) {
+            cerr << "RubberBandStretcher::Impl::Impl: Cannot specify OptionWindowLong and OptionWindowShort together; falling back to OptionWindowStandard" << endl;
+        } else if (options & OptionWindowShort) {
+            m_baseFftSize = m_baseFftSize / 2;
+            if (m_debugLevel > 0) {
+                cerr << "setting baseFftSize to " << m_baseFftSize << endl;
+            }
+        } else if (options & OptionWindowLong) {
+            m_baseFftSize = m_baseFftSize * 2;
+            if (m_debugLevel > 0) {
+                cerr << "setting baseFftSize to " << m_baseFftSize << endl;
+            }
+        }
+        m_fftSize = m_baseFftSize;
+        m_aWindowSize = m_baseFftSize;
+        m_sWindowSize = m_baseFftSize;
+        m_outbufSize = m_sWindowSize * 2;
+        m_maxProcessSize = m_aWindowSize;
+    }
+
+    if (m_options & OptionProcessRealTime) {
+
+        m_realtime = true;
+
+        if (!(m_options & OptionStretchPrecise)) {
+            m_options |= OptionStretchPrecise;
+        }
+    }
+
+#ifndef NO_THREADING
+    if (m_channels > 1) {
+
+        m_threaded = true;
+
+        if (m_realtime) {
+            m_threaded = false;
+        } else if (m_options & OptionThreadingNever) {
+            m_threaded = false;
+        } else if (!(m_options & OptionThreadingAlways) &&
+                   !system_is_multiprocessor()) {
+            m_threaded = false;
+        }
+
+        if (m_threaded && m_debugLevel > 0) {
+            cerr << "Going multithreaded..." << endl;
+        }
+    }
+#endif
+
+    configure();
+}
+
+RubberBandStretcher::Impl::~Impl()
+{
+#ifndef NO_THREADING
+    if (m_threaded) {
+        MutexLocker locker(&m_threadSetMutex);
+        for (set<ProcessThread *>::iterator i = m_threadSet.begin();
+             i != m_threadSet.end(); ++i) {
+            if (m_debugLevel > 0) {
+                cerr << "RubberBandStretcher::~RubberBandStretcher: joining (channel " << *i << ")" << endl;
+            }
+            (*i)->abandon();
+            (*i)->wait();
+            delete *i;
+        }
+    }
+#endif
+
+    for (size_t c = 0; c < m_channels; ++c) {
+        delete m_channelData[c];
+    }
+
+    delete m_phaseResetAudioCurve;
+    delete m_stretchAudioCurve;
+    delete m_silentAudioCurve;
+    delete m_stretchCalculator;
+    delete m_studyFFT;
+
+    for (map<size_t, Window<float> *>::iterator i = m_windows.begin();
+         i != m_windows.end(); ++i) {
+        delete i->second;
+    }
+    for (map<size_t, SincWindow<float> *>::iterator i = m_sincs.begin();
+         i != m_sincs.end(); ++i) {
+        delete i->second;
+    }
+}
+
+void
+RubberBandStretcher::Impl::reset()
+{
+#ifndef NO_THREADING
+    if (m_threaded) {
+        m_threadSetMutex.lock();
+        for (set<ProcessThread *>::iterator i = m_threadSet.begin();
+             i != m_threadSet.end(); ++i) {
+            if (m_debugLevel > 0) {
+                cerr << "RubberBandStretcher::~RubberBandStretcher: joining (channel " << *i << ")" << endl;
+            }
+            (*i)->abandon();
+            (*i)->wait();
+            delete *i;
+        }
+        m_threadSet.clear();
+    }
+#endif
+
+    m_emergencyScavenger.scavenge();
+
+    if (m_stretchCalculator) {
+        m_stretchCalculator->setKeyFrameMap(std::map<size_t, size_t>());
+    }
+
+    for (size_t c = 0; c < m_channels; ++c) {
+        m_channelData[c]->reset();
+    }
+
+    m_mode = JustCreated;
+    if (m_phaseResetAudioCurve) m_phaseResetAudioCurve->reset();
+    if (m_stretchAudioCurve) m_stretchAudioCurve->reset();
+    if (m_silentAudioCurve) m_silentAudioCurve->reset();
+    m_inputDuration = 0;
+    m_silentHistory = 0;
+
+#ifndef NO_THREADING
+    if (m_threaded) m_threadSetMutex.unlock();
+#endif
+
+    reconfigure();
+}
+
+void
+RubberBandStretcher::Impl::setTimeRatio(double ratio)
+{
+    if (!m_realtime) {
+        if (m_mode == Studying || m_mode == Processing) {
+            cerr << "RubberBandStretcher::Impl::setTimeRatio: Cannot set ratio while studying or processing in non-RT mode" << endl;
+            return;
+        }
+    }
+
+    if (ratio == m_timeRatio) return;
+    m_timeRatio = ratio;
+
+    reconfigure();
+}
+
+void
+RubberBandStretcher::Impl::setPitchScale(double fs)
+{
+    if (!m_realtime) {
+        if (m_mode == Studying || m_mode == Processing) {
+            cerr << "RubberBandStretcher::Impl::setPitchScale: Cannot set ratio while studying or processing in non-RT mode" << endl;
+            return;
+        }
+    }
+
+    if (fs == m_pitchScale) return;
+    
+    bool was1 = (m_pitchScale == 1.f);
+    bool rbs = resampleBeforeStretching();
+
+    m_pitchScale = fs;
+
+    reconfigure();
+
+    if (!(m_options & OptionPitchHighConsistency) &&
+        (was1 || resampleBeforeStretching() != rbs) &&
+        m_pitchScale != 1.f) {
+        
+        // resampling mode has changed
+        for (int c = 0; c < int(m_channels); ++c) {
+            if (m_channelData[c]->resampler) {
+                m_channelData[c]->resampler->reset();
+            }
+        }
+    }
+}
+
+double
+RubberBandStretcher::Impl::getTimeRatio() const
+{
+    return m_timeRatio;
+}
+
+double
+RubberBandStretcher::Impl::getPitchScale() const
+{
+    return m_pitchScale;
+}
+
+void
+RubberBandStretcher::Impl::setExpectedInputDuration(size_t samples)
+{
+    if (samples == m_expectedInputDuration) return;
+    m_expectedInputDuration = samples;
+
+    reconfigure();
+}
+
+void
+RubberBandStretcher::Impl::setMaxProcessSize(size_t samples)
+{
+    if (samples <= m_maxProcessSize) return;
+    m_maxProcessSize = samples;
+
+    reconfigure();
+}
+
+void
+RubberBandStretcher::Impl::setKeyFrameMap(const std::map<size_t, size_t> &
+                                          mapping)
+{
+    if (m_realtime) {
+        cerr << "RubberBandStretcher::Impl::setKeyFrameMap: Cannot specify key frame map in RT mode" << endl;
+        return;
+    }
+    if (m_mode == Processing) {
+        cerr << "RubberBandStretcher::Impl::setKeyFrameMap: Cannot specify key frame map after process() has begun" << endl;
+        return;
+    }
+
+    if (m_stretchCalculator) {
+        m_stretchCalculator->setKeyFrameMap(mapping);
+    }
+}
+
+float
+RubberBandStretcher::Impl::getFrequencyCutoff(int n) const
+{
+    switch (n) {
+    case 0: return m_freq0;
+    case 1: return m_freq1;
+    case 2: return m_freq2;
+    }
+    return 0.f;
+}
+
+void
+RubberBandStretcher::Impl::setFrequencyCutoff(int n, float f)
+{
+    switch (n) {
+    case 0: m_freq0 = f; break;
+    case 1: m_freq1 = f; break;
+    case 2: m_freq2 = f; break;
+    }
+}
+
+double
+RubberBandStretcher::Impl::getEffectiveRatio() const
+{
+    // Returns the ratio that the internal time stretcher needs to
+    // achieve, not the resulting duration ratio of the output (which
+    // is simply m_timeRatio).
+
+    // A frequency shift is achieved using an additional time shift,
+    // followed by resampling back to the original time shift to
+    // change the pitch.  Note that the resulting frequency change is
+    // fixed, as it is effected by the resampler -- in contrast to
+    // time shifting, which is variable aiming to place the majority
+    // of the stretch or squash in low-interest regions of audio.
+
+    return m_timeRatio * m_pitchScale;
+}
+
+size_t
+RubberBandStretcher::Impl::roundUp(size_t value)
+{
+    if (!(value & (value - 1))) return value;
+    int bits = 0;
+    while (value) { ++bits; value >>= 1; }
+    value = 1 << bits;
+    return value;
+}
+
+void
+RubberBandStretcher::Impl::calculateSizes()
+{
+    size_t inputIncrement = m_defaultIncrement;
+    size_t windowSize = m_baseFftSize;
+    size_t outputIncrement;
+
+    if (m_pitchScale <= 0.0) {
+        // This special case is likelier than one might hope, because
+        // of naive initialisations in programs that set it from a
+        // variable
+        std::cerr << "RubberBandStretcher: WARNING: Pitch scale must be greater than zero!\nResetting it from " << m_pitchScale << " to the default of 1.0: no pitch change will occur" << std::endl;
+        m_pitchScale = 1.0;
+    }
+    if (m_timeRatio <= 0.0) {
+        // Likewise
+        std::cerr << "RubberBandStretcher: WARNING: Time ratio must be greater than zero!\nResetting it from " << m_timeRatio << " to the default of 1.0: no time stretch will occur" << std::endl;
+        m_timeRatio = 1.0;
+    }
+
+    double r = getEffectiveRatio();
+
+    if (m_realtime) {
+
+        if (r < 1) {
+            
+            bool rsb = (m_pitchScale < 1.0 && !resampleBeforeStretching());
+            float windowIncrRatio = 4.5;
+            if (r == 1.0) windowIncrRatio = 4;
+            else if (rsb) windowIncrRatio = 4.5;
+            else windowIncrRatio = 6;
+
+            inputIncrement = int(windowSize / windowIncrRatio);
+            outputIncrement = int(floor(inputIncrement * r));
+
+            // Very long stretch or very low pitch shift
+            if (outputIncrement < m_defaultIncrement / 4) {
+                if (outputIncrement < 1) outputIncrement = 1;
+                while (outputIncrement < m_defaultIncrement / 4 &&
+                       windowSize < m_baseFftSize * 4) {
+                    outputIncrement *= 2;
+                    inputIncrement = lrint(ceil(outputIncrement / r));
+                    windowSize = roundUp(lrint(ceil(inputIncrement * windowIncrRatio)));
+                }
+            }
+
+        } else {
+
+            bool rsb = (m_pitchScale > 1.0 && resampleBeforeStretching());
+            float windowIncrRatio = 4.5;
+            if (r == 1.0) windowIncrRatio = 4;
+            else if (rsb) windowIncrRatio = 4.5;
+            else windowIncrRatio = 8;
+
+            outputIncrement = int(windowSize / windowIncrRatio);
+            inputIncrement = int(outputIncrement / r);
+            while (outputIncrement > 1024 * m_rateMultiple &&
+                   inputIncrement > 1) {
+                outputIncrement /= 2;
+                inputIncrement = int(outputIncrement / r);
+            }
+            size_t minwin = roundUp(lrint(outputIncrement * windowIncrRatio));
+            if (windowSize < minwin) windowSize = minwin;
+
+            if (rsb) {
+//                cerr << "adjusting window size from " << windowSize;
+                size_t newWindowSize = roundUp(lrint(windowSize / m_pitchScale));
+                if (newWindowSize < 512) newWindowSize = 512;
+                size_t div = windowSize / newWindowSize;
+                if (inputIncrement > div && outputIncrement > div) {
+                    inputIncrement /= div;
+                    outputIncrement /= div;
+                    windowSize /= div;
+                }
+//                cerr << " to " << windowSize << " (inputIncrement = " << inputIncrement << ", outputIncrement = " << outputIncrement << ")" << endl;
+            }
+        }
+
+    } else {
+
+        if (r < 1) {
+            inputIncrement = windowSize / 4;
+            while (inputIncrement >= 512) inputIncrement /= 2;
+            outputIncrement = int(floor(inputIncrement * r));
+            if (outputIncrement < 1) {
+                outputIncrement = 1;
+                inputIncrement = roundUp(lrint(ceil(outputIncrement / r)));
+                windowSize = inputIncrement * 4;
+            }
+        } else {
+            outputIncrement = windowSize / 6;
+            inputIncrement = int(outputIncrement / r);
+            while (outputIncrement > 1024 && inputIncrement > 1) {
+                outputIncrement /= 2;
+                inputIncrement = int(outputIncrement / r);
+            }
+            windowSize = std::max(windowSize, roundUp(outputIncrement * 6));
+            if (r > 5) while (windowSize < 8192) windowSize *= 2;
+        }
+    }
+
+    if (m_expectedInputDuration > 0) {
+        while (inputIncrement * 4 > m_expectedInputDuration &&
+               inputIncrement > 1) {
+            inputIncrement /= 2;
+        }
+    }
+
+    // m_fftSize can be almost anything, but it can't be greater than
+    // 4 * m_baseFftSize unless ratio is less than 1/1024.
+
+    m_fftSize = windowSize;
+    
+    if (m_options & OptionSmoothingOn) {
+        m_aWindowSize = windowSize * 2;
+        m_sWindowSize = windowSize * 2;
+    } else {
+        m_aWindowSize = windowSize;
+        m_sWindowSize = windowSize;
+    }
+
+    m_increment = inputIncrement;
+
+    // When squashing, the greatest theoretically possible output
+    // increment is the input increment.  When stretching adaptively
+    // the sky's the limit in principle, but we expect
+    // StretchCalculator to restrict itself to using no more than
+    // twice the basic output increment (i.e. input increment times
+    // ratio) for any chunk.
+
+    if (m_debugLevel > 0) {
+        cerr << "configure: effective ratio = " << getEffectiveRatio() << endl;
+        cerr << "configure: analysis window size = " << m_aWindowSize << ", synthesis window size = " << m_sWindowSize << ", fft size = " << m_fftSize << ", increment = " << m_increment << " (approx output increment = " << int(lrint(m_increment * getEffectiveRatio())) << ")" << endl;
+    }
+
+    if (std::max(m_aWindowSize, m_sWindowSize) > m_maxProcessSize) {
+        m_maxProcessSize = std::max(m_aWindowSize, m_sWindowSize);
+    }
+
+    m_outbufSize =
+        size_t
+        (ceil(max
+              (m_maxProcessSize / m_pitchScale,
+               m_maxProcessSize * 2 * (m_timeRatio > 1.f ? m_timeRatio : 1.f))));
+
+    if (m_realtime) {
+        // This headroom is so as to try to avoid reallocation when
+        // the pitch scale changes
+        m_outbufSize = m_outbufSize * 16;
+    } else {
+#ifndef NO_THREADING
+        if (m_threaded) {
+            // This headroom is to permit the processing threads to
+            // run ahead of the buffer output drainage; the exact
+            // amount of headroom is a question of tuning rather than
+            // results
+            m_outbufSize = m_outbufSize * 16;
+        }
+#endif
+    }
+
+    if (m_debugLevel > 0) {
+        cerr << "configure: outbuf size = " << m_outbufSize << endl;
+    }
+}
+
+void
+RubberBandStretcher::Impl::configure()
+{
+//    std::cerr << "configure[" << this << "]: realtime = " << m_realtime << ", pitch scale = "
+//              << m_pitchScale << ", channels = " << m_channels << std::endl;
+
+    size_t prevFftSize = m_fftSize;
+    size_t prevAWindowSize = m_aWindowSize;
+    size_t prevSWindowSize = m_sWindowSize;
+    size_t prevOutbufSize = m_outbufSize;
+    if (m_windows.empty()) {
+        prevFftSize = 0;
+        prevAWindowSize = 0;
+        prevSWindowSize = 0;
+        prevOutbufSize = 0;
+    }
+
+    calculateSizes();
+
+    bool fftSizeChanged = (prevFftSize != m_fftSize);
+    bool windowSizeChanged = ((prevAWindowSize != m_aWindowSize) ||
+                              (prevSWindowSize != m_sWindowSize));
+    bool outbufSizeChanged = (prevOutbufSize != m_outbufSize);
+
+    // This function may be called at any time in non-RT mode, after a
+    // parameter has changed.  It shouldn't be legal to call it after
+    // processing has already begun.
+
+    // This function is only called once (on construction) in RT
+    // mode.  After that reconfigure() does the work in a hopefully
+    // RT-safe way.
+
+    set<size_t> windowSizes;
+    if (m_realtime) {
+        windowSizes.insert(m_baseFftSize);
+        windowSizes.insert(m_baseFftSize / 2);
+        windowSizes.insert(m_baseFftSize * 2);
+//        windowSizes.insert(m_baseFftSize * 4);
+    }
+    windowSizes.insert(m_fftSize);
+    windowSizes.insert(m_aWindowSize);
+    windowSizes.insert(m_sWindowSize);
+
+    if (windowSizeChanged) {
+
+        for (set<size_t>::const_iterator i = windowSizes.begin();
+             i != windowSizes.end(); ++i) {
+            if (m_windows.find(*i) == m_windows.end()) {
+                m_windows[*i] = new Window<float>(HanningWindow, *i);
+            }
+            if (m_sincs.find(*i) == m_sincs.end()) {
+                m_sincs[*i] = new SincWindow<float>(*i, *i);
+            }
+        }
+        m_awindow = m_windows[m_aWindowSize];
+        m_afilter = m_sincs[m_aWindowSize];
+        m_swindow = m_windows[m_sWindowSize];
+
+        if (m_debugLevel > 0) {
+            cerr << "Window area: " << m_awindow->getArea() << "; synthesis window area: " << m_swindow->getArea() << endl;
+        }
+    }
+
+    if (windowSizeChanged || outbufSizeChanged) {
+        
+        for (size_t c = 0; c < m_channelData.size(); ++c) {
+            delete m_channelData[c];
+        }
+        m_channelData.clear();
+
+        for (size_t c = 0; c < m_channels; ++c) {
+            m_channelData.push_back
+                (new ChannelData(windowSizes,
+                                 std::max(m_aWindowSize, m_sWindowSize),
+                                 m_fftSize,
+                                 m_outbufSize));
+        }
+    }
+
+    if (!m_realtime && fftSizeChanged) {
+        delete m_studyFFT;
+        m_studyFFT = new FFT(m_fftSize, m_debugLevel);
+        m_studyFFT->initFloat();
+    }
+
+    if (m_pitchScale != 1.0 ||
+        (m_options & OptionPitchHighConsistency) ||
+        m_realtime) {
+
+        for (size_t c = 0; c < m_channels; ++c) {
+
+            if (m_channelData[c]->resampler) continue;
+
+            m_channelData[c]->resampler =
+                new Resampler(Resampler::FastestTolerable, 1, 4096 * 16,
+                              m_debugLevel);
+
+            // rbs is the amount of buffer space we think we'll need
+            // for resampling; but allocate a sensible amount in case
+            // the pitch scale changes during use
+            size_t rbs = 
+                lrintf(ceil((m_increment * m_timeRatio * 2) / m_pitchScale));
+            if (rbs < m_increment * 16) rbs = m_increment * 16;
+            m_channelData[c]->setResampleBufSize(rbs);
+        }
+    }
+    
+    // stretchAudioCurve is unused in RT mode; phaseResetAudioCurve,
+    // silentAudioCurve and stretchCalculator however are used in all
+    // modes
+
+    delete m_phaseResetAudioCurve;
+    m_phaseResetAudioCurve = new CompoundAudioCurve
+        (CompoundAudioCurve::Parameters(m_sampleRate, m_fftSize));
+    m_phaseResetAudioCurve->setType(m_detectorType);
+
+    delete m_silentAudioCurve;
+    m_silentAudioCurve = new SilentAudioCurve
+        (SilentAudioCurve::Parameters(m_sampleRate, m_fftSize));
+
+    if (!m_realtime) {
+        delete m_stretchAudioCurve;
+        if (!(m_options & OptionStretchPrecise)) {
+            m_stretchAudioCurve = new SpectralDifferenceAudioCurve
+                (SpectralDifferenceAudioCurve::Parameters(m_sampleRate, m_fftSize));
+        } else {
+            m_stretchAudioCurve = new ConstantAudioCurve
+                (ConstantAudioCurve::Parameters(m_sampleRate, m_fftSize));
+        }
+    }
+
+    delete m_stretchCalculator;
+    m_stretchCalculator = new StretchCalculator
+        (m_sampleRate, m_increment,
+         !(m_options & OptionTransientsSmooth));
+
+    m_stretchCalculator->setDebugLevel(m_debugLevel);
+    m_inputDuration = 0;
+
+    // Prepare the inbufs with half a chunk of emptiness.  The centre
+    // point of the first processing chunk for the onset detector
+    // should be the first sample of the audio, and we continue until
+    // we can no longer centre a chunk within the input audio.  The
+    // number of onset detector chunks will be the number of audio
+    // samples input, divided by the input increment, plus one.
+
+    // In real-time mode, we don't do this prefill -- it's better to
+    // start with a swoosh than introduce more latency, and we don't
+    // want gaps when the ratio changes.
+
+    if (!m_realtime) {
+        if (m_debugLevel > 1) {
+            cerr << "Not real time mode: prefilling" << endl;
+        }
+        for (size_t c = 0; c < m_channels; ++c) {
+            m_channelData[c]->reset();
+            m_channelData[c]->inbuf->zero(m_aWindowSize/2);
+        }
+    }
+}
+
+
+void
+RubberBandStretcher::Impl::reconfigure()
+{
+    if (!m_realtime) {
+        if (m_mode == Studying) {
+            // stop and calculate the stretch curve so far, then reset
+            // the df vectors
+            calculateStretch();
+            m_phaseResetDf.clear();
+            m_stretchDf.clear();
+            m_silence.clear();
+            m_inputDuration = 0;
+        }
+        configure();
+    }
+
+    size_t prevFftSize = m_fftSize;
+    size_t prevAWindowSize = m_aWindowSize;
+    size_t prevSWindowSize = m_sWindowSize;
+    size_t prevOutbufSize = m_outbufSize;
+
+    calculateSizes();
+
+    // There are various allocations in this function, but they should
+    // never happen in normal use -- they just recover from the case
+    // where not all of the things we need were correctly created when
+    // we first configured (for whatever reason).  This is intended to
+    // be "effectively" realtime safe.  The same goes for
+    // ChannelData::setOutbufSize and setSizes.
+
+    if (m_aWindowSize != prevAWindowSize ||
+        m_sWindowSize != prevSWindowSize) {
+
+        if (m_windows.find(m_aWindowSize) == m_windows.end()) {
+            std::cerr << "WARNING: reconfigure(): window allocation (size " << m_aWindowSize << ") required in RT mode" << std::endl;
+            m_windows[m_aWindowSize] = new Window<float>
+                (HanningWindow, m_aWindowSize);
+            m_sincs[m_aWindowSize] = new SincWindow<float>
+                (m_aWindowSize, m_aWindowSize);
+        }
+
+        if (m_windows.find(m_sWindowSize) == m_windows.end()) {
+            std::cerr << "WARNING: reconfigure(): window allocation (size " << m_sWindowSize << ") required in RT mode" << std::endl;
+            m_windows[m_sWindowSize] = new Window<float>
+                (HanningWindow, m_sWindowSize);
+            m_sincs[m_sWindowSize] = new SincWindow<float>
+                (m_sWindowSize, m_sWindowSize);
+        }
+
+        m_awindow = m_windows[m_aWindowSize];
+        m_afilter = m_sincs[m_aWindowSize];
+        m_swindow = m_windows[m_sWindowSize];
+
+        for (size_t c = 0; c < m_channels; ++c) {
+            m_channelData[c]->setSizes(std::max(m_aWindowSize, m_sWindowSize),
+                                       m_fftSize);
+        }
+    }
+
+    if (m_outbufSize != prevOutbufSize) {
+        for (size_t c = 0; c < m_channels; ++c) {
+            m_channelData[c]->setOutbufSize(m_outbufSize);
+        }
+    }
+
+    if (m_pitchScale != 1.0) {
+        for (size_t c = 0; c < m_channels; ++c) {
+
+            if (m_channelData[c]->resampler) continue;
+
+            std::cerr << "WARNING: reconfigure(): resampler construction required in RT mode" << std::endl;
+
+            m_channelData[c]->resampler =
+                new Resampler(Resampler::FastestTolerable, 1, m_sWindowSize,
+                              m_debugLevel);
+
+            size_t rbs = 
+                lrintf(ceil((m_increment * m_timeRatio * 2) / m_pitchScale));
+            if (rbs < m_increment * 16) rbs = m_increment * 16;
+            m_channelData[c]->setResampleBufSize(rbs);
+        }
+    }
+
+    if (m_fftSize != prevFftSize) {
+        m_phaseResetAudioCurve->setFftSize(m_fftSize);
+    }
+}
+
+size_t
+RubberBandStretcher::Impl::getLatency() const
+{
+    if (!m_realtime) return 0;
+    return int((m_aWindowSize/2) / m_pitchScale + 1);
+}
+
+void
+RubberBandStretcher::Impl::setTransientsOption(Options options)
+{
+    if (!m_realtime) {
+        cerr << "RubberBandStretcher::Impl::setTransientsOption: Not permissible in non-realtime mode" << endl;
+        return;
+    }
+    int mask = (OptionTransientsMixed | OptionTransientsSmooth | OptionTransientsCrisp);
+    m_options &= ~mask;
+    options &= mask;
+    m_options |= options;
+
+    m_stretchCalculator->setUseHardPeaks
+        (!(m_options & OptionTransientsSmooth));
+}
+
+void
+RubberBandStretcher::Impl::setDetectorOption(Options options)
+{
+    if (!m_realtime) {
+        cerr << "RubberBandStretcher::Impl::setDetectorOption: Not permissible in non-realtime mode" << endl;
+        return;
+    }
+    int mask = (OptionDetectorPercussive | OptionDetectorCompound | OptionDetectorSoft);
+    m_options &= ~mask;
+    options &= mask;
+    m_options |= options;
+
+    CompoundAudioCurve::Type dt = CompoundAudioCurve::CompoundDetector;
+    if (m_options & OptionDetectorPercussive) dt = CompoundAudioCurve::PercussiveDetector;
+    else if (m_options & OptionDetectorSoft) dt = CompoundAudioCurve::SoftDetector;
+    
+    if (dt == m_detectorType) return;
+    m_detectorType = dt;
+
+    if (m_phaseResetAudioCurve) {
+        m_phaseResetAudioCurve->setType(m_detectorType);
+    }
+}
+
+void
+RubberBandStretcher::Impl::setPhaseOption(Options options)
+{
+    int mask = (OptionPhaseLaminar | OptionPhaseIndependent);
+    m_options &= ~mask;
+    options &= mask;
+    m_options |= options;
+}
+
+void
+RubberBandStretcher::Impl::setFormantOption(Options options)
+{
+    int mask = (OptionFormantShifted | OptionFormantPreserved);
+    m_options &= ~mask;
+    options &= mask;
+    m_options |= options;
+}
+
+void
+RubberBandStretcher::Impl::setPitchOption(Options options)
+{
+    if (!m_realtime) {
+        cerr << "RubberBandStretcher::Impl::setPitchOption: Pitch option is not used in non-RT mode" << endl;
+        return;
+    }
+
+    Options prior = m_options;
+
+    int mask = (OptionPitchHighQuality |
+                OptionPitchHighSpeed |
+                OptionPitchHighConsistency);
+    m_options &= ~mask;
+    options &= mask;
+    m_options |= options;
+
+    if (prior != m_options) reconfigure();
+}
+
+void
+RubberBandStretcher::Impl::study(const float *const *input, size_t samples, bool final)
+{
+    Profiler profiler("RubberBandStretcher::Impl::study");
+
+    if (m_realtime) {
+        if (m_debugLevel > 1) {
+            cerr << "RubberBandStretcher::Impl::study: Not meaningful in realtime mode" << endl;
+        }
+        return;
+    }
+
+    if (m_mode == Processing || m_mode == Finished) {
+        cerr << "RubberBandStretcher::Impl::study: Cannot study after processing" << endl;
+        return;
+    }
+    m_mode = Studying;
+    
+    size_t consumed = 0;
+
+    ChannelData &cd = *m_channelData[0];
+    RingBuffer<float> &inbuf = *cd.inbuf;
+
+    const float *mixdown;
+    float *mdalloc = 0;
+
+    if (m_channels > 1 || final) {
+        // mix down into a single channel for analysis
+        mdalloc = new float[samples];
+        for (size_t i = 0; i < samples; ++i) {
+            if (i < samples) {
+                mdalloc[i] = input[0][i];
+            } else {
+                mdalloc[i] = 0.f;
+            }
+        }
+        for (size_t c = 1; c < m_channels; ++c) {
+            for (size_t i = 0; i < samples; ++i) {
+                mdalloc[i] += input[c][i];
+            }
+        }
+        for (size_t i = 0; i < samples; ++i) {
+            mdalloc[i] /= m_channels;
+        }
+        mixdown = mdalloc;
+    } else {
+        mixdown = input[0];
+    }
+
+    while (consumed < samples) {
+
+	size_t writable = inbuf.getWriteSpace();
+	writable = min(writable, samples - consumed);
+
+	if (writable == 0) {
+            // warn
+            cerr << "WARNING: writable == 0 (consumed = " << consumed << ", samples = " << samples << ")" << endl;
+	} else {
+            inbuf.write(mixdown + consumed, writable);
+            consumed += writable;
+        }
+
+	while ((inbuf.getReadSpace() >= int(m_aWindowSize)) ||
+               (final && (inbuf.getReadSpace() >= int(m_aWindowSize/2)))) {
+
+	    // We know we have at least m_aWindowSize samples
+	    // available in m_inbuf.  We need to peek m_aWindowSize of
+	    // them for processing, and then skip m_increment to
+	    // advance the read pointer.
+
+            // cd.accumulator is not otherwise used during studying,
+            // so we can use it as a temporary buffer here
+
+            size_t ready = inbuf.getReadSpace();
+            assert(final || ready >= m_aWindowSize);
+            inbuf.peek(cd.accumulator, std::min(ready, m_aWindowSize));
+
+            if (m_aWindowSize == m_fftSize) {
+
+                // We don't need the fftshift for studying, as we're
+                // only interested in magnitude.
+
+                m_awindow->cut(cd.accumulator);
+
+            } else {
+
+                // If we need to fold (i.e. if the window size is
+                // greater than the fft size so we are doing a
+                // time-aliased presum fft) or zero-pad, then we might
+                // as well use our standard function for it.  This
+                // means we retain the m_afilter cut if folding as well,
+                // which is good for consistency with real-time mode.
+                // We get fftshift as well, which we don't want, but
+                // the penalty is nominal.
+
+                // Note that we can't do this in-place.  Pity
+
+                float *tmp = (float *)alloca
+                    (std::max(m_fftSize, m_aWindowSize) * sizeof(float));
+
+                if (m_aWindowSize > m_fftSize) {
+                    m_afilter->cut(cd.accumulator);
+                }
+
+                cutShiftAndFold(tmp, m_fftSize, cd.accumulator, m_awindow);
+                v_copy(cd.accumulator, tmp, m_fftSize);
+            }
+
+            m_studyFFT->forwardMagnitude(cd.accumulator, cd.fltbuf);
+
+            float df = m_phaseResetAudioCurve->processFloat(cd.fltbuf, m_increment);
+            m_phaseResetDf.push_back(df);
+
+//            cout << m_phaseResetDf.size() << " [" << final << "] -> " << df << " \t: ";
+
+            df = m_stretchAudioCurve->processFloat(cd.fltbuf, m_increment);
+            m_stretchDf.push_back(df);
+
+            df = m_silentAudioCurve->processFloat(cd.fltbuf, m_increment);
+            bool silent = (df > 0.f);
+            if (silent && m_debugLevel > 1) {
+                cerr << "silence found at " << m_inputDuration << endl;
+            }
+            m_silence.push_back(silent);
+
+//            cout << df << endl;
+
+            // We have augmented the input by m_aWindowSize/2 so that
+            // the first chunk is centred on the first audio sample.
+            // We want to ensure that m_inputDuration contains the
+            // exact input duration without including this extra bit.
+            // We just add up all the increments here, and deduct the
+            // extra afterwards.
+
+            m_inputDuration += m_increment;
+//                cerr << "incr input duration by increment: " << m_increment << " -> " << m_inputDuration << endl;
+            inbuf.skip(m_increment);
+	}
+    }
+
+    if (final) {
+        int rs = inbuf.getReadSpace();
+        m_inputDuration += rs;
+//        cerr << "incr input duration by read space: " << rs << " -> " << m_inputDuration << endl;
+
+        if (m_inputDuration > m_aWindowSize/2) { // deducting the extra
+            m_inputDuration -= m_aWindowSize/2;
+        }
+    }
+
+    if (m_channels > 1) delete[] mdalloc;
+}
+
+vector<int>
+RubberBandStretcher::Impl::getOutputIncrements() const
+{
+    if (!m_realtime) {
+        return m_outputIncrements;
+    } else {
+        vector<int> increments;
+        while (m_lastProcessOutputIncrements.getReadSpace() > 0) {
+            increments.push_back(m_lastProcessOutputIncrements.readOne());
+        }
+        return increments;
+    }
+}
+
+vector<float>
+RubberBandStretcher::Impl::getPhaseResetCurve() const
+{
+    if (!m_realtime) {
+        return m_phaseResetDf;
+    } else {
+        vector<float> df;
+        while (m_lastProcessPhaseResetDf.getReadSpace() > 0) {
+            df.push_back(m_lastProcessPhaseResetDf.readOne());
+        }
+        return df;
+    }
+}
+
+vector<int>
+RubberBandStretcher::Impl::getExactTimePoints() const
+{
+    std::vector<int> points;
+    if (!m_realtime) {
+        std::vector<StretchCalculator::Peak> peaks =
+            m_stretchCalculator->getLastCalculatedPeaks();
+        for (size_t i = 0; i < peaks.size(); ++i) {
+            points.push_back(peaks[i].chunk);
+        }
+    }
+    return points;
+}
+
+void
+RubberBandStretcher::Impl::calculateStretch()
+{
+    Profiler profiler("RubberBandStretcher::Impl::calculateStretch");
+
+    size_t inputDuration = m_inputDuration;
+
+    if (!m_realtime && m_expectedInputDuration > 0) {
+        if (m_expectedInputDuration != inputDuration) {
+            std::cerr << "RubberBandStretcher: WARNING: Actual study() duration differs from duration set by setExpectedInputDuration (" << m_inputDuration << " vs " << m_expectedInputDuration << ", diff = " << (m_expectedInputDuration - m_inputDuration) << "), using the latter for calculation" << std::endl;
+            inputDuration = m_expectedInputDuration;
+        }
+    }
+
+    double prdm = 0, sdm = 0;
+    if (!m_phaseResetDf.empty()) {
+        for (int i = 0; i < m_phaseResetDf.size(); ++i) prdm += m_phaseResetDf[i];
+        prdm /= m_phaseResetDf.size();
+    }
+    if (!m_stretchDf.empty()) {
+        for (int i = 0; i < m_stretchDf.size(); ++i) sdm += m_stretchDf[i];
+        sdm /= m_stretchDf.size();
+    }
+//    std::cerr << "phase reset df mean = " << prdm << ", stretch df mean = " << sdm << std::endl;
+
+    std::vector<int> increments = m_stretchCalculator->calculate
+        (getEffectiveRatio(),
+         inputDuration,
+         m_phaseResetDf,
+         m_stretchDf);
+
+    int history = 0;
+    for (size_t i = 0; i < increments.size(); ++i) {
+        if (i >= m_silence.size()) break;
+        if (m_silence[i]) ++history;
+        else history = 0;
+        if (history >= int(m_aWindowSize / m_increment) && increments[i] >= 0) {
+            increments[i] = -increments[i];
+            if (m_debugLevel > 1) {
+                std::cerr << "phase reset on silence (silent history == "
+                          << history << ")" << std::endl;
+            }
+        }
+    }
+
+    if (m_outputIncrements.empty()) m_outputIncrements = increments;
+    else {
+        for (size_t i = 0; i < increments.size(); ++i) {
+            m_outputIncrements.push_back(increments[i]);
+        }
+    }
+    
+    return;
+}
+
+void
+RubberBandStretcher::Impl::setDebugLevel(int level)
+{
+    m_debugLevel = level;
+    if (m_stretchCalculator) m_stretchCalculator->setDebugLevel(level);
+}	
+
+size_t
+RubberBandStretcher::Impl::getSamplesRequired() const
+{
+    Profiler profiler("RubberBandStretcher::Impl::getSamplesRequired");
+
+    size_t reqd = 0;
+
+    for (size_t c = 0; c < m_channels; ++c) {
+
+        size_t reqdHere = 0;
+
+        ChannelData &cd = *m_channelData[c];
+        RingBuffer<float> &inbuf = *cd.inbuf;
+        RingBuffer<float> &outbuf = *cd.outbuf;
+
+        size_t rs = inbuf.getReadSpace();
+        size_t ws = outbuf.getReadSpace();
+
+        if (m_debugLevel > 2) {
+            cerr << "getSamplesRequired: ws = " << ws << ", rs = " << rs << ", m_aWindowSize = " << m_aWindowSize << endl;
+        }
+
+        // We should never return zero in non-threaded modes if
+        // available() would also return zero, i.e. if ws == 0.  If we
+        // do that, nothing will ever happen again!  We need to demand
+        // at least one increment (i.e. a nominal amount) to feed the
+        // engine.
+
+        if (ws == 0 && reqd == 0) reqd = m_increment;
+
+        // See notes in testInbufReadSpace 
+
+        if (rs < m_aWindowSize && !cd.draining) {
+            
+            if (cd.inputSize == -1) {
+                reqdHere = m_aWindowSize - rs;
+                if (reqdHere > reqd) reqd = reqdHere;
+                continue;
+            }
+        
+            if (rs == 0) {
+                reqdHere = m_aWindowSize;
+                if (reqdHere > reqd) reqd = reqdHere;
+                continue;
+            }
+        }
+    }
+    
+    return reqd;
+}    
+
+void
+RubberBandStretcher::Impl::process(const float *const *input, size_t samples, bool final)
+{
+    Profiler profiler("RubberBandStretcher::Impl::process");
+
+    if (m_mode == Finished) {
+        cerr << "RubberBandStretcher::Impl::process: Cannot process again after final chunk" << endl;
+        return;
+    }
+
+    if (m_mode == JustCreated || m_mode == Studying) {
+
+        if (m_mode == Studying) {
+
+            calculateStretch();
+
+            if (!m_realtime) {
+                // See note in configure() above. Of course, we should
+                // never enter Studying unless we are non-RT anyway
+                if (m_debugLevel > 1) {
+                    cerr << "Not real time mode: prefilling" << endl;
+                }
+                for (size_t c = 0; c < m_channels; ++c) {
+                    m_channelData[c]->reset();
+                    m_channelData[c]->inbuf->zero(m_aWindowSize/2);
+                }
+            }
+        }
+
+#ifndef NO_THREADING
+        if (m_threaded) {
+            MutexLocker locker(&m_threadSetMutex);
+
+            for (size_t c = 0; c < m_channels; ++c) {
+                ProcessThread *thread = new ProcessThread(this, c);
+                m_threadSet.insert(thread);
+                thread->start();
+            }
+            
+            if (m_debugLevel > 0) {
+                cerr << m_channels << " threads created" << endl;
+            }
+        }
+#endif
+        
+        m_mode = Processing;
+    }
+
+    bool allConsumed = false;
+
+    size_t *consumed = (size_t *)alloca(m_channels * sizeof(size_t));
+    for (size_t c = 0; c < m_channels; ++c) {
+        consumed[c] = 0;
+    }
+
+    while (!allConsumed) {
+
+        // In a threaded mode, our "consumed" counters only indicate
+        // the number of samples that have been taken into the input
+        // ring buffers waiting to be processed by the process thread.
+        // In non-threaded mode, "consumed" counts the number that
+        // have actually been processed.
+
+        allConsumed = true;
+
+        for (size_t c = 0; c < m_channels; ++c) {
+            consumed[c] += consumeChannel(c,
+                                          input,
+                                          consumed[c],
+                                          samples - consumed[c],
+                                          final);
+            if (consumed[c] < samples) {
+                allConsumed = false;
+//                cerr << "process: waiting on input consumption for channel " << c << endl;
+            } else {
+                if (final) {
+                    m_channelData[c]->inputSize = m_channelData[c]->inCount;
+                }
+//                cerr << "process: happy with channel " << c << endl;
+            }
+            if (
+#ifndef NO_THREADING
+                !m_threaded &&
+#endif
+                !m_realtime) {
+                bool any = false, last = false;
+                processChunks(c, any, last);
+            }
+        }
+
+        if (m_realtime) {
+            // When running in real time, we need to process both
+            // channels in step because we will need to use the sum of
+            // their frequency domain representations as the input to
+            // the realtime onset detector
+            processOneChunk();
+        }
+#ifndef NO_THREADING
+        if (m_threaded) {
+            for (ThreadSet::iterator i = m_threadSet.begin();
+                 i != m_threadSet.end(); ++i) {
+                (*i)->signalDataAvailable();
+            }
+            m_spaceAvailable.lock();
+            if (!allConsumed) {
+                m_spaceAvailable.wait(500);
+            }
+            m_spaceAvailable.unlock();
+        }
+#endif
+
+        if (m_debugLevel > 2) {
+            if (!allConsumed) cerr << "process looping" << endl;
+        }
+    }
+
+    if (m_debugLevel > 2) {
+        cerr << "process returning" << endl;
+    }
+
+    if (final) m_mode = Finished;
+}
+
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretcherImpl.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretcherImpl.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,269 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_STRETCHERIMPL_H_
+#define _RUBBERBAND_STRETCHERIMPL_H_
+
+#include "rubberband/RubberBandStretcher.h"
+
+#include "dsp/Window.h"
+#include "dsp/SincWindow.h"
+#include "dsp/FFT.h"
+
+#include "audiocurves/CompoundAudioCurve.h"
+
+#include "base/RingBuffer.h"
+#include "base/Scavenger.h"
+#include "system/Thread.h"
+#include "system/sysutils.h"
+
+#include <set>
+
+using namespace RubberBand;
+
+namespace RubberBand
+{
+
+#ifdef PROCESS_SAMPLE_TYPE
+typedef PROCESS_SAMPLE_TYPE process_t;
+#else
+typedef double process_t;
+#endif
+
+class AudioCurveCalculator;
+class StretchCalculator;
+
+class RubberBandStretcher::Impl
+{
+public:
+    Impl(size_t sampleRate, size_t channels, Options options,
+         double initialTimeRatio, double initialPitchScale);
+    ~Impl();
+    
+    void reset();
+    void setTimeRatio(double ratio);
+    void setPitchScale(double scale);
+
+    double getTimeRatio() const;
+    double getPitchScale() const;
+
+    size_t getLatency() const;
+
+    void setTransientsOption(Options);
+    void setDetectorOption(Options);
+    void setPhaseOption(Options);
+    void setFormantOption(Options);
+    void setPitchOption(Options);
+
+    void setExpectedInputDuration(size_t samples);
+    void setMaxProcessSize(size_t samples);
+    void setKeyFrameMap(const std::map<size_t, size_t> &);
+
+    size_t getSamplesRequired() const;
+
+    void study(const float *const *input, size_t samples, bool final);
+    void process(const float *const *input, size_t samples, bool final);
+
+    int available() const;
+    size_t retrieve(float *const *output, size_t samples) const;
+
+    float getFrequencyCutoff(int n) const;
+    void setFrequencyCutoff(int n, float f);
+
+    size_t getInputIncrement() const {
+        return m_increment;
+    }
+
+    std::vector<int> getOutputIncrements() const;
+    std::vector<float> getPhaseResetCurve() const;
+    std::vector<int> getExactTimePoints() const;
+
+    size_t getChannelCount() const {
+        return m_channels;
+    }
+    
+    void calculateStretch();
+
+    void setDebugLevel(int level);
+    static void setDefaultDebugLevel(int level) { m_defaultDebugLevel = level; }
+
+protected:
+    size_t m_sampleRate;
+    size_t m_channels;
+
+    void prepareChannelMS(size_t channel, const float *const *inputs,
+                          size_t offset, size_t samples, float *prepared);
+    size_t consumeChannel(size_t channel, const float *const *inputs,
+                          size_t offset, size_t samples, bool final);
+    void processChunks(size_t channel, bool &any, bool &last);
+    bool processOneChunk(); // across all channels, for real time use
+    bool processChunkForChannel(size_t channel, size_t phaseIncrement,
+                                size_t shiftIncrement, bool phaseReset);
+    bool testInbufReadSpace(size_t channel);
+    void calculateIncrements(size_t &phaseIncrement,
+                             size_t &shiftIncrement, bool &phaseReset);
+    bool getIncrements(size_t channel, size_t &phaseIncrement,
+                       size_t &shiftIncrement, bool &phaseReset);
+    void analyseChunk(size_t channel);
+    void modifyChunk(size_t channel, size_t outputIncrement, bool phaseReset);
+    void formantShiftChunk(size_t channel);
+    void synthesiseChunk(size_t channel, size_t shiftIncrement);
+    void writeChunk(size_t channel, size_t shiftIncrement, bool last);
+
+    void calculateSizes();
+    void configure();
+    void reconfigure();
+
+    double getEffectiveRatio() const;
+    
+    size_t roundUp(size_t value); // to next power of two
+
+    template <typename T, typename S>
+    void cutShiftAndFold(T *target, int targetSize,
+                         S *src, // destructive to src
+                         Window<float> *window) {
+        window->cut(src);
+        const int windowSize = window->getSize();
+        const int hs = targetSize / 2;
+        if (windowSize == targetSize) {
+            v_convert(target, src + hs, hs);
+            v_convert(target + hs, src, hs);
+        } else {
+            v_zero(target, targetSize);
+            int j = targetSize - windowSize/2;
+            while (j < 0) j += targetSize;
+            for (int i = 0; i < windowSize; ++i) {
+                target[j] += src[i];
+                if (++j == targetSize) j = 0;
+            }
+        }
+    }
+
+    bool resampleBeforeStretching() const;
+    
+    double m_timeRatio;
+    double m_pitchScale;
+
+    // n.b. either m_fftSize is an integer multiple of m_windowSize,
+    // or vice versa
+    size_t m_fftSize;
+    size_t m_aWindowSize; //!!! or use m_awindow->getSize() throughout?
+    size_t m_sWindowSize; //!!! or use m_swindow->getSize() throughout?
+    size_t m_increment;
+    size_t m_outbufSize;
+
+    size_t m_maxProcessSize;
+    size_t m_expectedInputDuration;
+
+#ifndef NO_THREADING    
+    bool m_threaded;
+#endif
+
+    bool m_realtime;
+    Options m_options;
+    int m_debugLevel;
+
+    enum ProcessMode {
+        JustCreated,
+        Studying,
+        Processing,
+        Finished
+    };
+
+    ProcessMode m_mode;
+
+    std::map<size_t, Window<float> *> m_windows;
+    std::map<size_t, SincWindow<float> *> m_sincs;
+    Window<float> *m_awindow;
+    SincWindow<float> *m_afilter;
+    Window<float> *m_swindow;
+    FFT *m_studyFFT;
+
+#ifndef NO_THREADING
+    Condition m_spaceAvailable;
+    
+    class ProcessThread : public Thread
+    {
+    public:
+        ProcessThread(Impl *s, size_t c);
+        void run();
+        void signalDataAvailable();
+        void abandon();
+    private:
+        Impl *m_s;
+        size_t m_channel;
+        Condition m_dataAvailable;
+        bool m_abandoning;
+    };
+
+    mutable Mutex m_threadSetMutex;
+    typedef std::set<ProcessThread *> ThreadSet;
+    ThreadSet m_threadSet;
+    
+#if defined HAVE_IPP && !defined USE_SPEEX
+    // Exasperatingly, the IPP polyphase resampler does not appear to
+    // be thread-safe as advertised -- a good reason to prefer the
+    // Speex alternative
+    Mutex m_resamplerMutex;
+#endif
+#endif
+
+    size_t m_inputDuration;
+    CompoundAudioCurve::Type m_detectorType;
+    std::vector<float> m_phaseResetDf;
+    std::vector<float> m_stretchDf;
+    std::vector<bool> m_silence;
+    int m_silentHistory;
+
+    class ChannelData; 
+    std::vector<ChannelData *> m_channelData;
+
+    std::vector<int> m_outputIncrements;
+
+    mutable RingBuffer<int> m_lastProcessOutputIncrements;
+    mutable RingBuffer<float> m_lastProcessPhaseResetDf;
+    Scavenger<RingBuffer<float> > m_emergencyScavenger;
+
+    CompoundAudioCurve *m_phaseResetAudioCurve;
+    AudioCurveCalculator *m_stretchAudioCurve;
+    AudioCurveCalculator *m_silentAudioCurve;
+    StretchCalculator *m_stretchCalculator;
+
+    float m_freq0;
+    float m_freq1;
+    float m_freq2;
+
+    size_t m_baseFftSize;
+    float m_rateMultiple;
+
+    void writeOutput(RingBuffer<float> &to, float *from,
+                     size_t qty, size_t &outCount, size_t theoreticalOut);
+
+    static int m_defaultDebugLevel;
+    static const size_t m_defaultIncrement;
+    static const size_t m_defaultFftSize;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/StretcherProcess.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/StretcherProcess.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1293 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "StretcherImpl.h"
+
+#include "audiocurves/PercussiveAudioCurve.h"
+#include "audiocurves/HighFrequencyAudioCurve.h"
+#include "audiocurves/ConstantAudioCurve.h"
+
+#include "StretchCalculator.h"
+#include "StretcherChannelData.h"
+
+#include "dsp/Resampler.h"
+#include "base/Profiler.h"
+#include "system/VectorOps.h"
+
+#ifndef _WIN32
+#include <alloca.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <set>
+#include <map>
+#include <deque>
+
+using namespace RubberBand;
+
+using std::cerr;
+using std::endl;
+
+namespace RubberBand {
+
+#ifndef NO_THREADING
+
+RubberBandStretcher::Impl::ProcessThread::ProcessThread(Impl *s, size_t c) :
+    m_s(s),
+    m_channel(c),
+    m_dataAvailable(std::string("data ") + char('A' + c)),
+    m_abandoning(false)
+{ }
+
+void
+RubberBandStretcher::Impl::ProcessThread::run()
+{
+    if (m_s->m_debugLevel > 1) {
+        cerr << "thread " << m_channel << " getting going" << endl;
+    }
+
+    ChannelData &cd = *m_s->m_channelData[m_channel];
+
+    while (cd.inputSize == -1 ||
+           cd.inbuf->getReadSpace() > 0) {
+
+//        if (cd.inputSize != -1) {
+//            cerr << "inputSize == " << cd.inputSize
+//                 << ", readSpace == " << cd.inbuf->getReadSpace() << endl;
+//        }
+        
+        bool any = false, last = false;
+        m_s->processChunks(m_channel, any, last);
+
+        if (last) break;
+
+        if (any) {
+            m_s->m_spaceAvailable.lock();
+            m_s->m_spaceAvailable.signal();
+            m_s->m_spaceAvailable.unlock();
+        }
+
+        m_dataAvailable.lock();
+        if (!m_s->testInbufReadSpace(m_channel) && !m_abandoning) {
+            m_dataAvailable.wait(50000); // bounded in case of abandonment
+        }
+        m_dataAvailable.unlock();
+
+        if (m_abandoning) {
+            if (m_s->m_debugLevel > 1) {
+                cerr << "thread " << m_channel << " abandoning" << endl;
+            }
+            return;
+        }
+    }
+
+    bool any = false, last = false;
+    m_s->processChunks(m_channel, any, last);
+    m_s->m_spaceAvailable.lock();
+    m_s->m_spaceAvailable.signal();
+    m_s->m_spaceAvailable.unlock();
+    
+    if (m_s->m_debugLevel > 1) {
+        cerr << "thread " << m_channel << " done" << endl;
+    }
+}
+
+void
+RubberBandStretcher::Impl::ProcessThread::signalDataAvailable()
+{
+    m_dataAvailable.lock();
+    m_dataAvailable.signal();
+    m_dataAvailable.unlock();
+}
+
+void
+RubberBandStretcher::Impl::ProcessThread::abandon()
+{
+    m_abandoning = true;
+}
+
+#endif
+
+bool
+RubberBandStretcher::Impl::resampleBeforeStretching() const
+{
+    // We can't resample before stretching in offline mode, because
+    // the stretch calculation is based on doing it the other way
+    // around.  It would take more work (and testing) to enable this.
+    if (!m_realtime) return false;
+
+    if (m_options & OptionPitchHighQuality) {
+        return (m_pitchScale < 1.0); // better sound
+    } else if (m_options & OptionPitchHighConsistency) {
+        return false;
+    } else {
+        return (m_pitchScale > 1.0); // better performance
+    }
+}
+
+void
+RubberBandStretcher::Impl::prepareChannelMS(size_t c,
+                                            const float *const *inputs,
+                                            size_t offset,
+                                            size_t samples, 
+                                            float *prepared)
+{
+    for (size_t i = 0; i < samples; ++i) {
+        float left = inputs[0][i + offset];
+        float right = inputs[1][i + offset];
+        float mid = (left + right) / 2;
+        float side = (left - right) / 2;
+        if (c == 0) {
+            prepared[i] = mid;
+        } else {
+            prepared[i] = side;
+        }
+    }
+}
+    
+size_t
+RubberBandStretcher::Impl::consumeChannel(size_t c,
+                                          const float *const *inputs,
+                                          size_t offset,
+                                          size_t samples,
+                                          bool final)
+{
+    Profiler profiler("RubberBandStretcher::Impl::consumeChannel");
+
+    ChannelData &cd = *m_channelData[c];
+    RingBuffer<float> &inbuf = *cd.inbuf;
+
+    size_t toWrite = samples;
+    size_t writable = inbuf.getWriteSpace();
+
+    bool resampling = resampleBeforeStretching();
+
+    float *ms = 0;
+    const float *input = 0;
+
+    bool useMidSide = ((m_options & OptionChannelsTogether) &&
+                       (m_channels >= 2) &&
+                       (c < 2));
+
+    if (resampling) {
+
+        toWrite = int(ceil(samples / m_pitchScale));
+        if (writable < toWrite) {
+            samples = int(floor(writable * m_pitchScale));
+            if (samples == 0) return 0;
+        }
+
+        size_t reqSize = int(ceil(samples / m_pitchScale));
+        if (reqSize > cd.resamplebufSize) {
+            cerr << "WARNING: RubberBandStretcher::Impl::consumeChannel: resizing resampler buffer from "
+                 << cd.resamplebufSize << " to " << reqSize << endl;
+            cd.setResampleBufSize(reqSize);
+        }
+
+#ifndef NO_THREADING
+#if defined HAVE_IPP && !defined USE_SPEEX
+        if (m_threaded) {
+            m_resamplerMutex.lock();
+        }
+#endif
+#endif
+
+        if (useMidSide) {
+            ms = (float *)alloca(samples * sizeof(float));
+            prepareChannelMS(c, inputs, offset, samples, ms);
+            input = ms;
+        } else {
+            input = inputs[c] + offset;
+        }
+
+        toWrite = cd.resampler->resample(&input,
+                                         &cd.resamplebuf,
+                                         samples,
+                                         1.0 / m_pitchScale,
+                                         final);
+
+#ifndef NO_THREADING
+#if defined HAVE_IPP && !defined USE_SPEEX
+        if (m_threaded) {
+            m_resamplerMutex.unlock();
+        }
+#endif
+#endif
+    }
+
+    if (writable < toWrite) {
+        if (resampling) {
+            return 0;
+        }
+        toWrite = writable;
+    }
+
+    if (resampling) {
+
+        inbuf.write(cd.resamplebuf, toWrite);
+        cd.inCount += samples;
+        return samples;
+
+    } else {
+
+        if (useMidSide) {
+            ms = (float *)alloca(toWrite * sizeof(float));
+            prepareChannelMS(c, inputs, offset, toWrite, ms);
+            input = ms;
+        } else {
+            input = inputs[c] + offset;
+        }
+
+        inbuf.write(input, toWrite);
+        cd.inCount += toWrite;
+        return toWrite;
+    }
+}
+
+void
+RubberBandStretcher::Impl::processChunks(size_t c, bool &any, bool &last)
+{
+    Profiler profiler("RubberBandStretcher::Impl::processChunks");
+
+    // Process as many chunks as there are available on the input
+    // buffer for channel c.  This requires that the increments have
+    // already been calculated.
+
+    // This is the normal process method in offline mode.
+
+    ChannelData &cd = *m_channelData[c];
+
+    last = false;
+    any = false;
+
+    while (!last) {
+
+        if (!testInbufReadSpace(c)) {
+            if (m_debugLevel > 2) {
+                cerr << "processChunks: out of input" << endl;
+            }
+            break;
+        }
+
+        any = true;
+
+        if (!cd.draining) {
+            size_t ready = cd.inbuf->getReadSpace();
+            assert(ready >= m_aWindowSize || cd.inputSize >= 0);
+            cd.inbuf->peek(cd.fltbuf, std::min(ready, m_aWindowSize));
+            cd.inbuf->skip(m_increment);
+        }
+
+        bool phaseReset = false;
+        size_t phaseIncrement, shiftIncrement;
+        getIncrements(c, phaseIncrement, shiftIncrement, phaseReset);
+
+        if (shiftIncrement <= m_aWindowSize) {
+            analyseChunk(c);
+            last = processChunkForChannel
+                (c, phaseIncrement, shiftIncrement, phaseReset);
+        } else {
+            size_t bit = m_aWindowSize/4;
+            if (m_debugLevel > 1) {
+                cerr << "channel " << c << " breaking down overlong increment " << shiftIncrement << " into " << bit << "-size bits" << endl;
+            }
+            analyseChunk(c);
+            float *tmp = (float *)alloca(m_aWindowSize * sizeof(float));
+            v_copy(tmp, cd.fltbuf, m_aWindowSize);
+            for (size_t i = 0; i < shiftIncrement; i += bit) {
+                v_copy(cd.fltbuf, tmp, m_aWindowSize);
+                size_t thisIncrement = bit;
+                if (i + thisIncrement > shiftIncrement) {
+                    thisIncrement = shiftIncrement - i;
+                }
+                last = processChunkForChannel
+                    (c, phaseIncrement + i, thisIncrement, phaseReset);
+                phaseReset = false;
+            }
+        }
+
+        cd.chunkCount++;
+        if (m_debugLevel > 2) {
+            cerr << "channel " << c << ": last = " << last << ", chunkCount = " << cd.chunkCount << endl;
+        }
+    }
+}
+
+bool
+RubberBandStretcher::Impl::processOneChunk()
+{
+    Profiler profiler("RubberBandStretcher::Impl::processOneChunk");
+
+    // Process a single chunk for all channels, provided there is
+    // enough data on each channel for at least one chunk.  This is
+    // able to calculate increments as it goes along.
+
+    // This is the normal process method in RT mode.
+
+    for (size_t c = 0; c < m_channels; ++c) {
+        if (!testInbufReadSpace(c)) {
+            if (m_debugLevel > 2) {
+                cerr << "processOneChunk: out of input" << endl;
+            }
+            return false;
+        }
+        ChannelData &cd = *m_channelData[c];
+        if (!cd.draining) {
+            size_t ready = cd.inbuf->getReadSpace();
+            assert(ready >= m_aWindowSize || cd.inputSize >= 0);
+            cd.inbuf->peek(cd.fltbuf, std::min(ready, m_aWindowSize));
+            cd.inbuf->skip(m_increment);
+            analyseChunk(c);
+        }
+    }
+    
+    bool phaseReset = false;
+    size_t phaseIncrement, shiftIncrement;
+    if (!getIncrements(0, phaseIncrement, shiftIncrement, phaseReset)) {
+        calculateIncrements(phaseIncrement, shiftIncrement, phaseReset);
+    }
+
+    bool last = false;
+    for (size_t c = 0; c < m_channels; ++c) {
+        last = processChunkForChannel(c, phaseIncrement, shiftIncrement, phaseReset);
+        m_channelData[c]->chunkCount++;
+    }
+
+    return last;
+}
+
+bool
+RubberBandStretcher::Impl::testInbufReadSpace(size_t c)
+{
+    Profiler profiler("RubberBandStretcher::Impl::testInbufReadSpace");
+
+    ChannelData &cd = *m_channelData[c];
+    RingBuffer<float> &inbuf = *cd.inbuf;
+
+    size_t rs = inbuf.getReadSpace();
+
+    if (rs < m_aWindowSize && !cd.draining) {
+            
+        if (cd.inputSize == -1) {
+
+            // Not all the input data has been written to the inbuf
+            // (that's why the input size is not yet set).  We can't
+            // process, because we don't have a full chunk of data, so
+            // our process chunk would contain some empty padding in
+            // its input -- and that would give incorrect output, as
+            // we know there is more input to come.
+
+#ifndef NO_THREADING
+            if (!m_threaded) {
+#endif
+                if (m_debugLevel > 1) {
+                    cerr << "WARNING: RubberBandStretcher: read space < chunk size ("
+                         << inbuf.getReadSpace() << " < " << m_aWindowSize
+                         << ") when not all input written, on processChunks for channel " << c << endl;
+                }
+
+#ifndef NO_THREADING
+            }
+#endif
+            return false;
+        }
+        
+        if (rs == 0) {
+
+            if (m_debugLevel > 1) {
+                cerr << "read space = 0, giving up" << endl;
+            }
+            return false;
+
+        } else if (rs < m_aWindowSize/2) {
+
+            if (m_debugLevel > 1) {
+                cerr << "read space = " << rs << ", setting draining true" << endl;
+            }
+            
+            cd.draining = true;
+        }
+    }
+
+    return true;
+}
+
+bool 
+RubberBandStretcher::Impl::processChunkForChannel(size_t c,
+                                                  size_t phaseIncrement,
+                                                  size_t shiftIncrement,
+                                                  bool phaseReset)
+{
+    Profiler profiler("RubberBandStretcher::Impl::processChunkForChannel");
+
+    // Process a single chunk on a single channel.  This assumes
+    // enough input data is available; caller must have tested this
+    // using e.g. testInbufReadSpace first.  Return true if this is
+    // the last chunk on the channel.
+
+    if (phaseReset && (m_debugLevel > 1)) {
+        cerr << "processChunkForChannel: phase reset found, incrs "
+             << phaseIncrement << ":" << shiftIncrement << endl;
+    }
+
+    ChannelData &cd = *m_channelData[c];
+
+    if (!cd.draining) {
+        
+        // This is the normal processing case -- draining is only
+        // set when all the input has been used and we only need
+        // to write from the existing accumulator into the output.
+        
+        // We know we have enough samples available in m_inbuf --
+        // this is usually m_aWindowSize, but we know that if fewer
+        // are available, it's OK to use zeroes for the rest
+        // (which the ring buffer will provide) because we've
+        // reached the true end of the data.
+        
+        // We need to peek m_aWindowSize samples for processing, and
+        // then skip m_increment to advance the read pointer.
+
+        modifyChunk(c, phaseIncrement, phaseReset);
+        synthesiseChunk(c, shiftIncrement); // reads from cd.mag, cd.phase
+
+        if (m_debugLevel > 2) {
+            if (phaseReset) {
+                for (int i = 0; i < 10; ++i) {
+                    cd.accumulator[i] = 1.2f - (i % 3) * 1.2f;
+                }
+            }
+        }
+    }
+
+    bool last = false;
+
+    if (cd.draining) {
+        if (m_debugLevel > 1) {
+            cerr << "draining: accumulator fill = " << cd.accumulatorFill << " (shiftIncrement = " << shiftIncrement << ")" <<  endl;
+        }
+        if (shiftIncrement == 0) {
+            cerr << "WARNING: draining: shiftIncrement == 0, can't handle that in this context: setting to " << m_increment << endl;
+            shiftIncrement = m_increment;
+        }
+        if (cd.accumulatorFill <= shiftIncrement) {
+            if (m_debugLevel > 1) {
+                cerr << "reducing shift increment from " << shiftIncrement
+                          << " to " << cd.accumulatorFill
+                          << " and marking as last" << endl;
+            }
+            shiftIncrement = cd.accumulatorFill;
+            last = true;
+        }
+    }
+        
+    int required = shiftIncrement;
+
+    if (m_pitchScale != 1.0) {
+        required = int(required / m_pitchScale) + 1;
+    }
+
+    int ws = cd.outbuf->getWriteSpace();
+    if (ws < required) {
+        if (m_debugLevel > 0) {
+            cerr << "Buffer overrun on output for channel " << c << endl;
+        }
+
+        // The only correct thing we can do here is resize the buffer.
+        // We can't wait for the client thread to read some data out
+        // from the buffer so as to make more space, because the
+        // client thread (if we are threaded at all) is probably stuck
+        // in a process() call waiting for us to stow away enough
+        // input increments to allow the process() call to complete.
+        // This is an unhappy situation.
+
+        RingBuffer<float> *oldbuf = cd.outbuf;
+        cd.outbuf = oldbuf->resized(oldbuf->getSize() + (required - ws));
+        m_emergencyScavenger.claim(oldbuf);
+    }
+
+    writeChunk(c, shiftIncrement, last);
+    return last;
+}
+
+void
+RubberBandStretcher::Impl::calculateIncrements(size_t &phaseIncrementRtn,
+                                               size_t &shiftIncrementRtn,
+                                               bool &phaseReset)
+{
+    Profiler profiler("RubberBandStretcher::Impl::calculateIncrements");
+
+//    cerr << "calculateIncrements" << endl;
+    
+    // Calculate the next upcoming phase and shift increment, on the
+    // basis that both channels are in sync.  This is in contrast to
+    // getIncrements, which requires that all the increments have been
+    // calculated in advance but can then return increments
+    // corresponding to different chunks in different channels.
+
+    // Requires frequency domain representations of channel data in
+    // the mag and phase buffers in the channel.
+
+    // This function is only used in real-time mode.
+
+    phaseIncrementRtn = m_increment;
+    shiftIncrementRtn = m_increment;
+    phaseReset = false;
+
+    if (m_channels == 0) return;
+
+    ChannelData &cd = *m_channelData[0];
+
+    size_t bc = cd.chunkCount;
+    for (size_t c = 1; c < m_channels; ++c) {
+        if (m_channelData[c]->chunkCount != bc) {
+            cerr << "ERROR: RubberBandStretcher::Impl::calculateIncrements: Channels are not in sync" << endl;
+            return;
+        }
+    }
+
+    const int hs = m_fftSize/2 + 1;
+
+    // Normally we would mix down the time-domain signal and apply a
+    // single FFT, or else mix down the Cartesian form of the
+    // frequency-domain signal.  Both of those would be inefficient
+    // from this position.  Fortunately, the onset detectors should
+    // work reasonably well (maybe even better?) if we just sum the
+    // magnitudes of the frequency-domain channel signals and forget
+    // about phase entirely.  Normally we don't expect the channel
+    // phases to cancel each other, and broadband effects will still
+    // be apparent.
+
+    float df = 0.f;
+    bool silent = false;
+
+    if (m_channels == 1) {
+
+        if (sizeof(process_t) == sizeof(double)) {
+            df = m_phaseResetAudioCurve->processDouble((double *)cd.mag, m_increment);
+            silent = (m_silentAudioCurve->processDouble((double *)cd.mag, m_increment) > 0.f);
+        } else {
+            df = m_phaseResetAudioCurve->processFloat((float *)cd.mag, m_increment);
+            silent = (m_silentAudioCurve->processFloat((float *)cd.mag, m_increment) > 0.f);
+        }
+
+    } else {
+
+        process_t *tmp = (process_t *)alloca(hs * sizeof(process_t));
+
+        v_zero(tmp, hs);
+        for (size_t c = 0; c < m_channels; ++c) {
+            v_add(tmp, m_channelData[c]->mag, hs);
+        }
+
+        if (sizeof(process_t) == sizeof(double)) {
+            df = m_phaseResetAudioCurve->processDouble((double *)tmp, m_increment);
+            silent = (m_silentAudioCurve->processDouble((double *)tmp, m_increment) > 0.f);
+        } else {
+            df = m_phaseResetAudioCurve->processFloat((float *)tmp, m_increment);
+            silent = (m_silentAudioCurve->processFloat((float *)tmp, m_increment) > 0.f);
+        }
+    }
+
+    int incr = m_stretchCalculator->calculateSingle
+        (getEffectiveRatio(), df, m_increment);
+
+    if (m_lastProcessPhaseResetDf.getWriteSpace() > 0) {
+        m_lastProcessPhaseResetDf.write(&df, 1);
+    }
+    if (m_lastProcessOutputIncrements.getWriteSpace() > 0) {
+        m_lastProcessOutputIncrements.write(&incr, 1);
+    }
+
+    if (incr < 0) {
+        phaseReset = true;
+        incr = -incr;
+    }
+    
+    // The returned increment is the phase increment.  The shift
+    // increment for one chunk is the same as the phase increment for
+    // the following chunk (see comment below).  This means we don't
+    // actually know the shift increment until we see the following
+    // phase increment... which is a bit of a problem.
+
+    // This implies we should use this increment for the shift
+    // increment, and make the following phase increment the same as
+    // it.  This means in RT mode we'll be one chunk later with our
+    // phase reset than we would be in non-RT mode.  The sensitivity
+    // of the broadband onset detector may mean that this isn't a
+    // problem -- test it and see.
+
+    shiftIncrementRtn = incr;
+
+    if (cd.prevIncrement == 0) {
+        phaseIncrementRtn = shiftIncrementRtn;
+    } else {
+        phaseIncrementRtn = cd.prevIncrement;
+    }
+
+    cd.prevIncrement = shiftIncrementRtn;
+
+    if (silent) ++m_silentHistory;
+    else m_silentHistory = 0;
+
+    if (m_silentHistory >= int(m_aWindowSize / m_increment) && !phaseReset) {
+        phaseReset = true;
+        if (m_debugLevel > 1) {
+            cerr << "calculateIncrements: phase reset on silence (silent history == "
+                 << m_silentHistory << ")" << endl;
+        }
+    }
+}
+
+bool
+RubberBandStretcher::Impl::getIncrements(size_t channel,
+                                         size_t &phaseIncrementRtn,
+                                         size_t &shiftIncrementRtn,
+                                         bool &phaseReset)
+{
+    Profiler profiler("RubberBandStretcher::Impl::getIncrements");
+
+    if (channel >= m_channels) {
+        phaseIncrementRtn = m_increment;
+        shiftIncrementRtn = m_increment;
+        phaseReset = false;
+        return false;
+    }
+
+    // There are two relevant output increments here.  The first is
+    // the phase increment which we use when recalculating the phases
+    // for the current chunk; the second is the shift increment used
+    // to determine how far to shift the processing buffer after
+    // writing the chunk.  The shift increment for one chunk is the
+    // same as the phase increment for the following chunk.
+    
+    // When an onset occurs for which we need to reset phases, the
+    // increment given will be negative.
+    
+    // When we reset phases, the previous shift increment (and so
+    // current phase increments) must have been m_increment to ensure
+    // consistency.
+    
+    // m_outputIncrements stores phase increments.
+
+    ChannelData &cd = *m_channelData[channel];
+    bool gotData = true;
+
+    if (cd.chunkCount >= m_outputIncrements.size()) {
+//        cerr << "WARNING: RubberBandStretcher::Impl::getIncrements:"
+//             << " chunk count " << cd.chunkCount << " >= "
+//             << m_outputIncrements.size() << endl;
+        if (m_outputIncrements.size() == 0) {
+            phaseIncrementRtn = m_increment;
+            shiftIncrementRtn = m_increment;
+            phaseReset = false;
+            return false;
+        } else {
+            cd.chunkCount = m_outputIncrements.size()-1;
+            gotData = false;
+        }
+    }
+    
+    int phaseIncrement = m_outputIncrements[cd.chunkCount];
+    
+    int shiftIncrement = phaseIncrement;
+    if (cd.chunkCount + 1 < m_outputIncrements.size()) {
+        shiftIncrement = m_outputIncrements[cd.chunkCount + 1];
+    }
+    
+    if (phaseIncrement < 0) {
+        phaseIncrement = -phaseIncrement;
+        phaseReset = true;
+    }
+    
+    if (shiftIncrement < 0) {
+        shiftIncrement = -shiftIncrement;
+    }
+    /*
+    if (shiftIncrement >= int(m_windowSize)) {
+        cerr << "*** ERROR: RubberBandStretcher::Impl::processChunks: shiftIncrement " << shiftIncrement << " >= windowSize " << m_windowSize << " at " << cd.chunkCount << " (of " << m_outputIncrements.size() << ")" << endl;
+        shiftIncrement = m_windowSize;
+    }
+    */
+    phaseIncrementRtn = phaseIncrement;
+    shiftIncrementRtn = shiftIncrement;
+    if (cd.chunkCount == 0) phaseReset = true; // don't mess with the first chunk
+    return gotData;
+}
+
+void
+RubberBandStretcher::Impl::analyseChunk(size_t channel)
+{
+    Profiler profiler("RubberBandStretcher::Impl::analyseChunk");
+
+    ChannelData &cd = *m_channelData[channel];
+
+    process_t *const R__ dblbuf = cd.dblbuf;
+    float *const R__ fltbuf = cd.fltbuf;
+
+    // cd.fltbuf is known to contain m_aWindowSize samples
+
+    if (m_aWindowSize > m_fftSize) {
+        m_afilter->cut(fltbuf);
+    }
+
+    cutShiftAndFold(dblbuf, m_fftSize, fltbuf, m_awindow);
+
+    cd.fft->forwardPolar(dblbuf, cd.mag, cd.phase);
+}
+
+void
+RubberBandStretcher::Impl::modifyChunk(size_t channel,
+                                       size_t outputIncrement,
+                                       bool phaseReset)
+{
+    Profiler profiler("RubberBandStretcher::Impl::modifyChunk");
+
+    ChannelData &cd = *m_channelData[channel];
+
+    if (phaseReset && m_debugLevel > 1) {
+        cerr << "phase reset: leaving phases unmodified" << endl;
+    }
+
+    const process_t rate = m_sampleRate;
+    const int count = m_fftSize / 2;
+
+    bool unchanged = cd.unchanged && (outputIncrement == m_increment);
+    bool fullReset = phaseReset;
+    bool laminar = !(m_options & OptionPhaseIndependent);
+    bool bandlimited = (m_options & OptionTransientsMixed);
+    int bandlow = lrint((150 * m_fftSize) / rate);
+    int bandhigh = lrint((1000 * m_fftSize) / rate);
+
+    float freq0 = m_freq0;
+    float freq1 = m_freq1;
+    float freq2 = m_freq2;
+
+    if (laminar) {
+        float r = getEffectiveRatio();
+        if (r > 1) {
+            float rf0 = 600 + (600 * ((r-1)*(r-1)*(r-1)*2));
+            float f1ratio = freq1 / freq0;
+            float f2ratio = freq2 / freq0;
+            freq0 = std::max(freq0, rf0);
+            freq1 = freq0 * f1ratio;
+            freq2 = freq0 * f2ratio;
+        }
+    }
+
+    int limit0 = lrint((freq0 * m_fftSize) / rate);
+    int limit1 = lrint((freq1 * m_fftSize) / rate);
+    int limit2 = lrint((freq2 * m_fftSize) / rate);
+
+    if (limit1 < limit0) limit1 = limit0;
+    if (limit2 < limit1) limit2 = limit1;
+    
+    process_t prevInstability = 0.0;
+    bool prevDirection = false;
+
+    process_t distance = 0.0;
+    const process_t maxdist = 8.0;
+
+    const int lookback = 1;
+
+    process_t distacc = 0.0;
+
+    for (int i = count; i >= 0; i -= lookback) {
+
+        bool resetThis = phaseReset;
+
+        if (bandlimited) {
+            if (resetThis) {
+                if (i > bandlow && i < bandhigh) {
+                    resetThis = false;
+                    fullReset = false;
+                }
+            }
+        }
+
+        process_t p = cd.phase[i];
+        process_t perr = 0.0;
+        process_t outphase = p;
+
+        process_t mi = maxdist;
+        if (i <= limit0) mi = 0.0;
+        else if (i <= limit1) mi = 1.0;
+        else if (i <= limit2) mi = 3.0;
+
+        if (!resetThis) {
+
+            process_t omega = (2 * M_PI * m_increment * i) / (m_fftSize);
+
+            process_t pp = cd.prevPhase[i];
+            process_t ep = pp + omega;
+            perr = princarg(p - ep);
+
+            process_t instability = fabs(perr - cd.prevError[i]);
+            bool direction = (perr > cd.prevError[i]);
+
+            bool inherit = false;
+
+            if (laminar) {
+                if (distance >= mi || i == count) {
+                    inherit = false;
+                } else if (bandlimited && (i == bandhigh || i == bandlow)) {
+                    inherit = false;
+                } else if (instability > prevInstability &&
+                           direction == prevDirection) {
+                    inherit = true;
+                }
+            }
+
+            process_t advance = outputIncrement * ((omega + perr) / m_increment);
+
+            if (inherit) {
+                process_t inherited =
+                    cd.unwrappedPhase[i + lookback] - cd.prevPhase[i + lookback];
+                advance = ((advance * distance) +
+                           (inherited * (maxdist - distance)))
+                    / maxdist;
+                outphase = p + advance;
+                distacc += distance;
+                distance += 1.0;
+            } else {
+                outphase = cd.unwrappedPhase[i] + advance;
+                distance = 0.0;
+            }
+
+            prevInstability = instability;
+            prevDirection = direction;
+
+        } else {
+            distance = 0.0;
+        }
+
+        cd.prevError[i] = perr;
+        cd.prevPhase[i] = p;
+        cd.phase[i] = outphase;
+        cd.unwrappedPhase[i] = outphase;
+    }
+
+    if (m_debugLevel > 2) {
+        cerr << "mean inheritance distance = " << distacc / count << endl;
+    }
+
+    if (fullReset) unchanged = true;
+    cd.unchanged = unchanged;
+
+    if (unchanged && m_debugLevel > 1) {
+        cerr << "frame unchanged on channel " << channel << endl;
+    }
+}    
+
+
+void
+RubberBandStretcher::Impl::formantShiftChunk(size_t channel)
+{
+    Profiler profiler("RubberBandStretcher::Impl::formantShiftChunk");
+
+    ChannelData &cd = *m_channelData[channel];
+
+    process_t *const R__ mag = cd.mag;
+    process_t *const R__ envelope = cd.envelope;
+    process_t *const R__ dblbuf = cd.dblbuf;
+
+    const int sz = m_fftSize;
+    const int hs = sz / 2;
+    const process_t factor = 1.0 / sz;
+
+    cd.fft->inverseCepstral(mag, dblbuf);
+
+    const int cutoff = m_sampleRate / 700;
+
+//    cerr <<"cutoff = "<< cutoff << ", m_sampleRate/cutoff = " << m_sampleRate/cutoff << endl;
+
+    dblbuf[0] /= 2;
+    dblbuf[cutoff-1] /= 2;
+
+    for (int i = cutoff; i < sz; ++i) {
+        dblbuf[i] = 0.0;
+    }
+
+    v_scale(dblbuf, factor, cutoff);
+
+    double *spare = (double *)alloca((hs + 1) * sizeof(double));
+    cd.fft->forward(dblbuf, envelope, spare);
+
+    v_exp(envelope, hs + 1);
+    v_divide(mag, envelope, hs + 1);
+
+    if (m_pitchScale > 1.0) {
+        // scaling up, we want a new envelope that is lower by the pitch factor
+        for (int target = 0; target <= hs; ++target) {
+            int source = lrint(target * m_pitchScale);
+            if (source > hs) {
+                envelope[target] = 0.0;
+            } else {
+                envelope[target] = envelope[source];
+            }
+        }
+    } else {
+        // scaling down, we want a new envelope that is higher by the pitch factor
+        for (int target = hs; target > 0; ) {
+            --target;
+            int source = lrint(target * m_pitchScale);
+            envelope[target] = envelope[source];
+        }
+    }
+
+    v_multiply(mag, envelope, hs+1);
+
+    cd.unchanged = false;
+}
+
+void
+RubberBandStretcher::Impl::synthesiseChunk(size_t channel,
+                                           size_t shiftIncrement)
+{
+    Profiler profiler("RubberBandStretcher::Impl::synthesiseChunk");
+
+
+    if ((m_options & OptionFormantPreserved) &&
+        (m_pitchScale != 1.0)) {
+        formantShiftChunk(channel);
+    }
+
+    ChannelData &cd = *m_channelData[channel];
+
+    process_t *const R__ dblbuf = cd.dblbuf;
+    float *const R__ fltbuf = cd.fltbuf;
+    float *const R__ accumulator = cd.accumulator;
+    float *const R__ windowAccumulator = cd.windowAccumulator;
+    
+    const int fsz = m_fftSize;
+    const int hs = fsz / 2;
+
+    const int wsz = m_sWindowSize;
+
+    if (!cd.unchanged) {
+
+        // Our FFTs produced unscaled results. Scale before inverse
+        // transform rather than after, to avoid overflow if using a
+        // fixed-point FFT.
+        float factor = 1.f / fsz;
+        v_scale(cd.mag, factor, hs + 1);
+
+        cd.fft->inversePolar(cd.mag, cd.phase, cd.dblbuf);
+
+        if (wsz == fsz) {
+            v_convert(fltbuf, dblbuf + hs, hs);
+            v_convert(fltbuf + hs, dblbuf, hs);
+        } else {
+            v_zero(fltbuf, wsz);
+            int j = fsz - wsz/2;
+            while (j < 0) j += fsz;
+            for (int i = 0; i < wsz; ++i) {
+                fltbuf[i] += dblbuf[j];
+                if (++j == fsz) j = 0;
+            }
+        }
+    }
+
+    if (wsz > fsz) {
+        int p = shiftIncrement * 2;
+        if (cd.interpolatorScale != p) {
+            SincWindow<float>::write(cd.interpolator, wsz, p);
+            cd.interpolatorScale = p;
+        }
+        v_multiply(fltbuf, cd.interpolator, wsz);
+    }
+
+    m_swindow->cut(fltbuf);
+    v_add(accumulator, fltbuf, wsz);
+    cd.accumulatorFill = wsz;
+
+    if (wsz > fsz) {
+        // reuse fltbuf to calculate interpolating window shape for
+        // window accumulator
+        v_copy(fltbuf, cd.interpolator, wsz);
+        m_swindow->cut(fltbuf);
+        v_add(windowAccumulator, fltbuf, wsz);
+    } else {
+        m_swindow->add(windowAccumulator, m_awindow->getArea() * 1.5f);
+    }
+}
+
+void
+RubberBandStretcher::Impl::writeChunk(size_t channel, size_t shiftIncrement, bool last)
+{
+    Profiler profiler("RubberBandStretcher::Impl::writeChunk");
+
+    ChannelData &cd = *m_channelData[channel];
+    
+    float *const R__ accumulator = cd.accumulator;
+    float *const R__ windowAccumulator = cd.windowAccumulator;
+
+    const int sz = m_sWindowSize;
+    const int si = shiftIncrement;
+
+    if (m_debugLevel > 2) {
+        cerr << "writeChunk(" << channel << ", " << shiftIncrement << ", " << last << ")" << endl;
+    }
+
+    v_divide(accumulator, windowAccumulator, si);
+
+    // for exact sample scaling (probably not meaningful if we
+    // were running in RT mode)
+    size_t theoreticalOut = 0;
+    if (cd.inputSize >= 0) {
+        theoreticalOut = lrint(cd.inputSize * m_timeRatio);
+    }
+
+    bool resampledAlready = resampleBeforeStretching();
+
+    if (!resampledAlready &&
+        (m_pitchScale != 1.0 || m_options & OptionPitchHighConsistency) &&
+        cd.resampler) {
+
+        size_t reqSize = int(ceil(si / m_pitchScale));
+        if (reqSize > cd.resamplebufSize) {
+            // This shouldn't normally happen -- the buffer is
+            // supposed to be initialised with enough space in the
+            // first place.  But we retain this check in case the
+            // pitch scale has changed since then, or the stretch
+            // calculator has gone mad, or something.
+            cerr << "WARNING: RubberBandStretcher::Impl::writeChunk: resizing resampler buffer from "
+                      << cd.resamplebufSize << " to " << reqSize << endl;
+            cd.setResampleBufSize(reqSize);
+        }
+
+#ifndef NO_THREADING
+#if defined HAVE_IPP && !defined USE_SPEEX
+        if (m_threaded) {
+            m_resamplerMutex.lock();
+        }
+#endif
+#endif
+
+        size_t outframes = cd.resampler->resample(&cd.accumulator,
+                                                  &cd.resamplebuf,
+                                                  si,
+                                                  1.0 / m_pitchScale,
+                                                  last);
+
+#ifndef NO_THREADING
+#if defined HAVE_IPP && !defined USE_SPEEX
+        if (m_threaded) {
+            m_resamplerMutex.unlock();
+        }
+#endif
+#endif
+
+        writeOutput(*cd.outbuf, cd.resamplebuf,
+                    outframes, cd.outCount, theoreticalOut);
+
+    } else {
+        writeOutput(*cd.outbuf, accumulator,
+                    si, cd.outCount, theoreticalOut);
+    }
+
+    v_move(accumulator, accumulator + si, sz - si);
+    v_zero(accumulator + sz - si, si);
+    
+    v_move(windowAccumulator, windowAccumulator + si, sz - si);
+    v_zero(windowAccumulator + sz - si, si);
+    
+    if (int(cd.accumulatorFill) > si) {
+        cd.accumulatorFill -= si;
+    } else {
+        cd.accumulatorFill = 0;
+        if (cd.draining) {
+            if (m_debugLevel > 1) {
+                cerr << "RubberBandStretcher::Impl::processChunks: setting outputComplete to true" << endl;
+            }
+            cd.outputComplete = true;
+        }
+    }
+}
+
+void
+RubberBandStretcher::Impl::writeOutput(RingBuffer<float> &to, float *from, size_t qty, size_t &outCount, size_t theoreticalOut)
+{
+    Profiler profiler("RubberBandStretcher::Impl::writeOutput");
+
+    // In non-RT mode, we don't want to write the first startSkip
+    // samples, because the first chunk is centred on the start of the
+    // output.  In RT mode we didn't apply any pre-padding in
+    // configure(), so we don't want to remove any here.
+
+    size_t startSkip = 0;
+    if (!m_realtime) {
+        startSkip = lrintf((m_sWindowSize/2) / m_pitchScale);
+    }
+
+    if (outCount > startSkip) {
+        
+        // this is the normal case
+
+        if (theoreticalOut > 0) {
+            if (m_debugLevel > 1) {
+                cerr << "theoreticalOut = " << theoreticalOut
+                     << ", outCount = " << outCount
+                     << ", startSkip = " << startSkip
+                     << ", qty = " << qty << endl;
+            }
+            if (outCount - startSkip <= theoreticalOut &&
+                outCount - startSkip + qty > theoreticalOut) {
+                qty = theoreticalOut - (outCount - startSkip);
+                if (m_debugLevel > 1) {
+                    cerr << "reduce qty to " << qty << endl;
+                }
+            }
+        }
+
+        if (m_debugLevel > 2) {
+            cerr << "writing " << qty << endl;
+        }
+
+        size_t written = to.write(from, qty);
+
+        if (written < qty) {
+            cerr << "WARNING: RubberBandStretcher::Impl::writeOutput: "
+                 << "Buffer overrun on output: wrote " << written
+                 << " of " << qty << " samples" << endl;
+        }
+
+        outCount += written;
+        return;
+    }
+
+    // the rest of this is only used during the first startSkip samples
+
+    if (outCount + qty <= startSkip) {
+        if (m_debugLevel > 1) {
+            cerr << "qty = " << qty << ", startSkip = "
+                 << startSkip << ", outCount = " << outCount
+                 << ", discarding" << endl;
+        }
+        outCount += qty;
+        return;
+    }
+
+    size_t off = startSkip - outCount;
+    if (m_debugLevel > 1) {
+        cerr << "qty = " << qty << ", startSkip = "
+             << startSkip << ", outCount = " << outCount
+             << ", writing " << qty - off
+             << " from start offset " << off << endl;
+    }
+    to.write(from + off, qty - off);
+    outCount += qty;
+}
+
+int
+RubberBandStretcher::Impl::available() const
+{
+    Profiler profiler("RubberBandStretcher::Impl::available");
+
+#ifndef NO_THREADING
+    if (m_threaded) {
+        MutexLocker locker(&m_threadSetMutex);
+        if (m_channelData.empty()) return 0;
+    } else {
+        if (m_channelData.empty()) return 0;
+    }
+#endif
+
+#ifndef NO_THREADING
+    if (!m_threaded) {
+#endif
+        for (size_t c = 0; c < m_channels; ++c) {
+            if (m_channelData[c]->inputSize >= 0) {
+//                cerr << "available: m_done true" << endl;
+                if (m_channelData[c]->inbuf->getReadSpace() > 0) {
+                    if (m_debugLevel > 1) {
+                        cerr << "calling processChunks(" << c << ") from available" << endl;
+                    }
+                    //!!! do we ever actually do this? if so, this method should not be const
+                    // ^^^ yes, we do sometimes -- e.g. when fed a very short file
+                    bool any = false, last = false;
+                    ((RubberBandStretcher::Impl *)this)->processChunks(c, any, last);
+                }
+            }
+        }
+#ifndef NO_THREADING
+    }
+#endif
+
+    size_t min = 0;
+    bool consumed = true;
+    bool haveResamplers = false;
+
+    for (size_t i = 0; i < m_channels; ++i) {
+        size_t availIn = m_channelData[i]->inbuf->getReadSpace();
+        size_t availOut = m_channelData[i]->outbuf->getReadSpace();
+        if (m_debugLevel > 2) {
+            cerr << "available on channel " << i << ": " << availOut << " (waiting: " << availIn << ")" << endl;
+        }
+        if (i == 0 || availOut < min) min = availOut;
+        if (!m_channelData[i]->outputComplete) consumed = false;
+        if (m_channelData[i]->resampler) haveResamplers = true;
+    }
+
+    if (min == 0 && consumed) return -1;
+    if (m_pitchScale == 1.0) return min;
+
+    if (haveResamplers) return min; // resampling has already happened
+    return int(floor(min / m_pitchScale));
+}
+
+size_t
+RubberBandStretcher::Impl::retrieve(float *const *output, size_t samples) const
+{
+    Profiler profiler("RubberBandStretcher::Impl::retrieve");
+
+    size_t got = samples;
+
+    for (size_t c = 0; c < m_channels; ++c) {
+        size_t gotHere = m_channelData[c]->outbuf->read(output[c], got);
+        if (gotHere < got) {
+            if (c > 0) {
+                if (m_debugLevel > 0) {
+                    cerr << "RubberBandStretcher::Impl::retrieve: WARNING: channel imbalance detected" << endl;
+                }
+            }
+            got = gotHere;
+        }
+    }
+
+    if ((m_options & OptionChannelsTogether) && (m_channels >= 2)) {
+        for (size_t i = 0; i < got; ++i) {
+            float mid = output[0][i];
+            float side = output[1][i];
+            float left = mid + side;
+            float right = mid - side;
+            output[0][i] = left;
+            output[1][i] = right;
+        }
+    }            
+
+    return got;
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/CompoundAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/CompoundAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,167 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "CompoundAudioCurve.h"
+
+#include "dsp/MovingMedian.h"
+
+#include <iostream>
+
+namespace RubberBand
+{
+
+
+CompoundAudioCurve::CompoundAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters),
+    m_percussive(parameters),
+    m_hf(parameters),
+    m_hfFilter(new MovingMedian<double>(19, 85)),
+    m_hfDerivFilter(new MovingMedian<double>(19, 90)),
+    m_type(CompoundDetector),
+    m_lastHf(0.0),
+    m_lastResult(0.0),
+    m_risingCount(0)
+{
+}
+
+CompoundAudioCurve::~CompoundAudioCurve()
+{
+    delete m_hfFilter;
+    delete m_hfDerivFilter;
+}
+
+void
+CompoundAudioCurve::setType(Type type)
+{
+    m_type = type;
+}
+
+void
+CompoundAudioCurve::reset()
+{
+    m_percussive.reset();
+    m_hf.reset();
+    m_hfFilter->reset();
+    m_hfDerivFilter->reset();
+    m_lastHf = 0.0;
+    m_lastResult = 0.0;
+}
+
+void
+CompoundAudioCurve::setFftSize(int newSize)
+{
+    m_percussive.setFftSize(newSize);
+    m_hf.setFftSize(newSize);
+    m_fftSize = newSize;
+    m_lastHf = 0.0;
+    m_lastResult = 0.0;
+}
+
+float
+CompoundAudioCurve::processFloat(const float *R__ mag, int increment)
+{
+    float percussive = 0.f;
+    float hf = 0.f;
+    switch (m_type) {
+    case PercussiveDetector:
+        percussive = m_percussive.processFloat(mag, increment);
+        break;
+    case CompoundDetector:
+        percussive = m_percussive.processFloat(mag, increment);
+        hf = m_hf.processFloat(mag, increment);
+        break;
+    case SoftDetector:
+        hf = m_hf.processFloat(mag, increment);
+        break;
+    }
+    return processFiltering(percussive, hf);
+}
+
+double
+CompoundAudioCurve::processDouble(const double *R__ mag, int increment)
+{
+    double percussive = 0.0;
+    double hf = 0.0;
+    switch (m_type) {
+    case PercussiveDetector:
+        percussive = m_percussive.processDouble(mag, increment);
+        break;
+    case CompoundDetector:
+        percussive = m_percussive.processDouble(mag, increment);
+        hf = m_hf.processDouble(mag, increment);
+        break;
+    case SoftDetector:
+        hf = m_hf.processDouble(mag, increment);
+        break;
+    }
+    return processFiltering(percussive, hf);
+}
+
+double
+CompoundAudioCurve::processFiltering(double percussive, double hf)
+{
+    if (m_type == PercussiveDetector) {
+        return percussive;
+    }
+
+    double rv = 0.f;
+    
+    double hfDeriv = hf - m_lastHf;
+
+    m_hfFilter->push(hf);
+    m_hfDerivFilter->push(hfDeriv);
+
+    double hfFiltered = m_hfFilter->get();
+    double hfDerivFiltered = m_hfDerivFilter->get();
+
+    m_lastHf = hf;
+
+    double result = 0.f;
+    
+    double hfExcess = hf - hfFiltered;
+
+    if (hfExcess > 0.0) {
+        result = hfDeriv - hfDerivFiltered;
+    }
+
+    if (result < m_lastResult) {
+        if (m_risingCount > 3 && m_lastResult > 0) rv = 0.5;
+        m_risingCount = 0;
+    } else {
+        m_risingCount ++;
+    }
+
+    if (m_type == CompoundDetector) {
+        if (percussive > 0.35 && percussive > rv) {
+            rv = percussive;
+        }
+    }
+
+    m_lastResult = result;
+
+    return rv;
+}
+
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/CompoundAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/CompoundAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,74 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _COMPOUND_AUDIO_CURVE_H_
+#define _COMPOUND_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+#include "PercussiveAudioCurve.h"
+#include "HighFrequencyAudioCurve.h"
+#include "dsp/SampleFilter.h"
+
+namespace RubberBand
+{
+
+class CompoundAudioCurve : public AudioCurveCalculator
+{
+public:
+    CompoundAudioCurve(Parameters parameters);
+
+    virtual ~CompoundAudioCurve();
+
+    enum Type {
+        PercussiveDetector,
+        CompoundDetector,
+        SoftDetector
+    };
+    virtual void setType(Type); // default is CompoundDetector
+    
+    virtual void setFftSize(int newSize);
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+
+    virtual void reset();
+
+protected:
+    PercussiveAudioCurve m_percussive;
+    HighFrequencyAudioCurve m_hf;
+
+    SampleFilter<double> *m_hfFilter;
+    SampleFilter<double> *m_hfDerivFilter;
+
+    Type m_type;
+
+    double m_lastHf;
+    double m_lastResult;
+    int m_risingCount;
+
+    double processFiltering(double percussive, double hf);
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/ConstantAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/ConstantAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,57 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "ConstantAudioCurve.h"
+
+namespace RubberBand
+{
+
+
+ConstantAudioCurve::ConstantAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters)
+{
+}
+
+ConstantAudioCurve::~ConstantAudioCurve()
+{
+}
+
+void
+ConstantAudioCurve::reset()
+{
+}
+
+float
+ConstantAudioCurve::processFloat(const float *R__, int)
+{
+    return 1.f;
+}
+
+double
+ConstantAudioCurve::processDouble(const double *R__, int)
+{
+    return 1.0;
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/ConstantAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/ConstantAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,45 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _CONSTANT_AUDIO_CURVE_H_
+#define _CONSTANT_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+
+namespace RubberBand
+{
+
+class ConstantAudioCurve : public AudioCurveCalculator
+{
+public:
+    ConstantAudioCurve(Parameters parameters);
+    virtual ~ConstantAudioCurve();
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+    virtual void reset();
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/HighFrequencyAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/HighFrequencyAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,73 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "HighFrequencyAudioCurve.h"
+
+namespace RubberBand
+{
+
+
+HighFrequencyAudioCurve::HighFrequencyAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters)
+{
+}
+
+HighFrequencyAudioCurve::~HighFrequencyAudioCurve()
+{
+}
+
+void
+HighFrequencyAudioCurve::reset()
+{
+}
+
+float
+HighFrequencyAudioCurve::processFloat(const float *R__ mag, int increment)
+{
+    float result = 0.0;
+
+    const int sz = m_lastPerceivedBin;
+
+    for (int n = 0; n <= sz; ++n) {
+        result = result + mag[n] * n;
+    }
+
+    return result;
+}
+
+double
+HighFrequencyAudioCurve::processDouble(const double *R__ mag, int increment)
+{
+    float result = 0.0;
+
+    const int sz = m_lastPerceivedBin;
+
+    for (int n = 0; n <= sz; ++n) {
+        result = result + mag[n] * n;
+    }
+
+    return result;
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/HighFrequencyAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/HighFrequencyAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,47 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _HIGHFREQUENCY_AUDIO_CURVE_H_
+#define _HIGHFREQUENCY_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+
+namespace RubberBand
+{
+
+class HighFrequencyAudioCurve : public AudioCurveCalculator
+{
+public:
+    HighFrequencyAudioCurve(Parameters parameters);
+
+    virtual ~HighFrequencyAudioCurve();
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+    virtual void reset();
+    virtual const char *getUnit() const { return "Vbin"; }
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/PercussiveAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/PercussiveAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,114 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "PercussiveAudioCurve.h"
+
+#include "system/Allocators.h"
+#include "system/VectorOps.h"
+
+#include <cmath>
+#include <iostream>
+namespace RubberBand
+{
+
+
+PercussiveAudioCurve::PercussiveAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters)
+{
+    m_prevMag = allocate_and_zero<double>(m_fftSize/2 + 1);
+}
+
+PercussiveAudioCurve::~PercussiveAudioCurve()
+{
+    deallocate(m_prevMag);
+}
+
+void
+PercussiveAudioCurve::reset()
+{
+    v_zero(m_prevMag, m_fftSize/2 + 1);
+}
+
+void
+PercussiveAudioCurve::setFftSize(int newSize)
+{
+    m_prevMag = reallocate(m_prevMag, m_fftSize/2 + 1, newSize/2 + 1);
+    AudioCurveCalculator::setFftSize(newSize);
+    reset();
+}
+
+float
+PercussiveAudioCurve::processFloat(const float *R__ mag, int increment)
+{
+    static float threshold = powf(10.f, 0.15f); // 3dB rise in square of magnitude
+    static float zeroThresh = powf(10.f, -8);
+
+    int count = 0;
+    int nonZeroCount = 0;
+
+    const int sz = m_lastPerceivedBin;
+
+    for (int n = 1; n <= sz; ++n) {
+        float v = 0.f;
+        if (m_prevMag[n] > zeroThresh) v = mag[n] / m_prevMag[n];
+        else if (mag[n] > zeroThresh) v = threshold;
+        bool above = (v >= threshold);
+        if (above) ++count;
+        if (mag[n] > zeroThresh) ++nonZeroCount;
+    }
+
+    v_convert(m_prevMag, mag, sz + 1);
+
+    if (nonZeroCount == 0) return 0;
+    else return float(count) / float(nonZeroCount);
+}
+
+double
+PercussiveAudioCurve::processDouble(const double *R__ mag, int increment)
+{
+    static double threshold = powf(10., 0.15); // 3dB rise in square of magnitude
+    static double zeroThresh = powf(10., -8);
+
+    int count = 0;
+    int nonZeroCount = 0;
+
+    const int sz = m_lastPerceivedBin;
+
+    for (int n = 1; n <= sz; ++n) {
+        double v = 0.0;
+        if (m_prevMag[n] > zeroThresh) v = mag[n] / m_prevMag[n];
+        else if (mag[n] > zeroThresh) v = threshold;
+        bool above = (v >= threshold);
+        if (above) ++count;
+        if (mag[n] > zeroThresh) ++nonZeroCount;
+    }
+
+    v_copy(m_prevMag, mag, sz + 1);
+
+    if (nonZeroCount == 0) return 0;
+    else return double(count) / double(nonZeroCount);
+}
+
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/PercussiveAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/PercussiveAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _PERCUSSIVE_AUDIO_CURVE_H_
+#define _PERCUSSIVE_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+
+namespace RubberBand
+{
+
+class PercussiveAudioCurve : public AudioCurveCalculator
+{
+public:
+    PercussiveAudioCurve(Parameters parameters);
+
+    virtual ~PercussiveAudioCurve();
+
+    virtual void setFftSize(int newSize);
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+
+
+    virtual void reset();
+    virtual const char *getUnit() const { return "bin/total"; }
+
+protected:
+    double *R__ m_prevMag;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/SilentAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/SilentAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,73 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "SilentAudioCurve.h"
+
+#include <cmath>
+
+namespace RubberBand
+{
+
+
+SilentAudioCurve::SilentAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters)
+{
+}
+
+SilentAudioCurve::~SilentAudioCurve()
+{
+}
+
+void
+SilentAudioCurve::reset()
+{
+}
+
+float
+SilentAudioCurve::processFloat(const float *R__ mag, int)
+{
+    const int hs = m_lastPerceivedBin;
+    static float threshold = powf(10.f, -6);
+
+    for (int i = 0; i <= hs; ++i) {
+        if (mag[i] > threshold) return 0.f;
+    }
+        
+    return 1.f;
+}
+
+double
+SilentAudioCurve::processDouble(const double *R__ mag, int)
+{
+    const int hs = m_lastPerceivedBin;
+    static double threshold = pow(10.0, -6);
+
+    for (int i = 0; i <= hs; ++i) {
+        if (mag[i] > threshold) return 0.f;
+    }
+        
+    return 1.f;
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/SilentAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/SilentAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,46 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _SILENT_AUDIO_CURVE_H_
+#define _SILENT_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+
+namespace RubberBand
+{
+
+class SilentAudioCurve : public AudioCurveCalculator
+{
+public:
+    SilentAudioCurve(Parameters parameters);
+    virtual ~SilentAudioCurve();
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+    virtual void reset();
+    virtual const char *getUnit() const { return "bool"; }
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/SpectralDifferenceAudioCurve.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/SpectralDifferenceAudioCurve.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,107 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "SpectralDifferenceAudioCurve.h"
+
+#include "system/Allocators.h"
+#include "system/VectorOps.h"
+
+namespace RubberBand
+{
+
+
+SpectralDifferenceAudioCurve::SpectralDifferenceAudioCurve(Parameters parameters) :
+    AudioCurveCalculator(parameters)
+{
+    m_mag = allocate<double>(m_lastPerceivedBin + 1);
+    m_tmpbuf = allocate<double>(m_lastPerceivedBin + 1);
+    v_zero(m_mag, m_lastPerceivedBin + 1);
+}
+
+SpectralDifferenceAudioCurve::~SpectralDifferenceAudioCurve()
+{
+    deallocate(m_mag);
+    deallocate(m_tmpbuf);
+}
+
+void
+SpectralDifferenceAudioCurve::reset()
+{
+    v_zero(m_mag, m_lastPerceivedBin + 1);
+}
+
+void
+SpectralDifferenceAudioCurve::setFftSize(int newSize)
+{
+    deallocate(m_tmpbuf);
+    deallocate(m_mag);
+    AudioCurveCalculator::setFftSize(newSize);
+    m_mag = allocate<double>(m_lastPerceivedBin + 1);
+    m_tmpbuf = allocate<double>(m_lastPerceivedBin + 1);
+    reset();
+}
+
+float
+SpectralDifferenceAudioCurve::processFloat(const float *R__ mag, int increment)
+{
+    double result = 0.0;
+
+    const int hs1 = m_lastPerceivedBin + 1;
+
+    v_convert(m_tmpbuf, mag, hs1);
+    v_square(m_tmpbuf, hs1);
+    v_subtract(m_mag, m_tmpbuf, hs1);
+    v_abs(m_mag, hs1);
+    v_sqrt(m_mag, hs1);
+    
+    for (int i = 0; i < hs1; ++i) {
+        result += m_mag[i];
+    }
+
+    v_copy(m_mag, m_tmpbuf, hs1);
+    return result;
+}
+
+double
+SpectralDifferenceAudioCurve::processDouble(const double *R__ mag, int increment)
+{
+    double result = 0.0;
+
+    const int hs1 = m_lastPerceivedBin + 1;
+
+    v_convert(m_tmpbuf, mag, hs1);
+    v_square(m_tmpbuf, hs1);
+    v_subtract(m_mag, m_tmpbuf, hs1);
+    v_abs(m_mag, hs1);
+    v_sqrt(m_mag, hs1);
+    
+    for (int i = 0; i < hs1; ++i) {
+        result += m_mag[i];
+    }
+
+    v_copy(m_mag, m_tmpbuf, hs1);
+    return result;
+}
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/audiocurves/SpectralDifferenceAudioCurve.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/audiocurves/SpectralDifferenceAudioCurve.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,54 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _SPECTRALDIFFERENCE_AUDIO_CURVE_H_
+#define _SPECTRALDIFFERENCE_AUDIO_CURVE_H_
+
+#include "dsp/AudioCurveCalculator.h"
+#include "dsp/Window.h"
+
+namespace RubberBand
+{
+
+class SpectralDifferenceAudioCurve : public AudioCurveCalculator
+{
+public:
+    SpectralDifferenceAudioCurve(Parameters parameters);
+
+    virtual ~SpectralDifferenceAudioCurve();
+
+    virtual void setFftSize(int newSize);
+
+    virtual float processFloat(const float *R__ mag, int increment);
+    virtual double processDouble(const double *R__ mag, int increment);
+    virtual void reset();
+    virtual const char *getUnit() const { return "V"; }
+
+protected:
+    double *R__ m_mag;
+    double *R__ m_tmpbuf;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/base/Profiler.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/base/Profiler.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,227 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "Profiler.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <map>
+
+#include <stdio.h>
+
+#ifdef __MSVC__
+// Ugh --cc
+#define snprintf sprintf_s
+#endif
+
+namespace RubberBand {
+
+#ifndef NO_TIMING
+
+Profiler::ProfileMap
+Profiler::m_profiles;
+
+Profiler::WorstCallMap
+Profiler::m_worstCalls;
+
+void
+Profiler::add(const char *id, float ms)
+{
+    ProfileMap::iterator pmi = m_profiles.find(id);
+    if (pmi != m_profiles.end()) {
+        ++pmi->second.first;
+        pmi->second.second += ms;
+    } else {
+        m_profiles[id] = TimePair(1, ms);
+    }
+
+    WorstCallMap::iterator wci = m_worstCalls.find(id);
+    if (wci != m_worstCalls.end()) {
+        if (ms > wci->second) wci->second = ms;
+    } else {
+        m_worstCalls[id] = ms;
+    }
+}
+
+void
+Profiler::dump()
+{
+    std::string report = getReport();
+    fprintf(stderr, "%s", report.c_str());
+}
+
+std::string
+Profiler::getReport()
+{
+    static const int buflen = 256;
+    char buffer[buflen];
+    std::string report;
+
+#ifdef PROFILE_CLOCKS
+    snprintf(buffer, buflen, "Profiling points [CPU time]:\n");
+#else
+    snprintf(buffer, buflen, "Profiling points [Wall time]:\n");
+#endif
+    report += buffer;
+
+    typedef std::multimap<float, const char *> TimeRMap;
+    typedef std::multimap<int, const char *> IntRMap;
+    TimeRMap totmap, avgmap, worstmap;
+    IntRMap ncallmap;
+
+    for (ProfileMap::const_iterator i = m_profiles.begin();
+         i != m_profiles.end(); ++i) {
+        totmap.insert(TimeRMap::value_type(i->second.second, i->first));
+        avgmap.insert(TimeRMap::value_type(i->second.second /
+                                           i->second.first, i->first));
+        ncallmap.insert(IntRMap::value_type(i->second.first, i->first));
+    }
+
+    for (WorstCallMap::const_iterator i = m_worstCalls.begin();
+         i != m_worstCalls.end(); ++i) {
+        worstmap.insert(TimeRMap::value_type(i->second, i->first));
+    }
+
+    snprintf(buffer, buflen, "\nBy total:\n");
+    report += buffer;
+    for (TimeRMap::const_iterator i = totmap.end(); i != totmap.begin(); ) {
+        --i;
+        snprintf(buffer, buflen, "%-40s  %f ms\n", i->second, i->first);
+        report += buffer;
+    }
+
+    snprintf(buffer, buflen, "\nBy average:\n");
+    report += buffer;
+    for (TimeRMap::const_iterator i = avgmap.end(); i != avgmap.begin(); ) {
+        --i;
+        snprintf(buffer, buflen, "%-40s  %f ms\n", i->second, i->first);
+        report += buffer;
+    }
+
+    snprintf(buffer, buflen, "\nBy worst case:\n");
+    report += buffer;
+    for (TimeRMap::const_iterator i = worstmap.end(); i != worstmap.begin(); ) {
+        --i;
+        snprintf(buffer, buflen, "%-40s  %f ms\n", i->second, i->first);
+        report += buffer;
+    }
+
+    snprintf(buffer, buflen, "\nBy number of calls:\n");
+    report += buffer;
+    for (IntRMap::const_iterator i = ncallmap.end(); i != ncallmap.begin(); ) {
+        --i;
+        snprintf(buffer, buflen, "%-40s  %d\n", i->second, i->first);
+        report += buffer;
+    }
+
+    snprintf(buffer, buflen, "\nBy name:\n");
+    report += buffer;
+
+    typedef std::set<const char *, std::less<std::string> > StringSet;
+
+    StringSet profileNames;
+    for (ProfileMap::const_iterator i = m_profiles.begin();
+         i != m_profiles.end(); ++i) {
+        profileNames.insert(i->first);
+    }
+
+    for (StringSet::const_iterator i = profileNames.begin();
+         i != profileNames.end(); ++i) {
+
+        ProfileMap::const_iterator j = m_profiles.find(*i);
+        if (j == m_profiles.end()) continue;
+
+        const TimePair &pp(j->second);
+        snprintf(buffer, buflen, "%s(%d):\n", *i, pp.first);
+        report += buffer;
+        snprintf(buffer, buflen, "\tReal: \t%f ms      \t[%f ms total]\n",
+                (pp.second / pp.first),
+                (pp.second));
+        report += buffer;
+
+        WorstCallMap::const_iterator k = m_worstCalls.find(*i);
+        if (k == m_worstCalls.end()) continue;
+        
+        snprintf(buffer, buflen, "\tWorst:\t%f ms/call\n", k->second);
+        report += buffer;
+    }
+
+    return report;
+}
+
+Profiler::Profiler(const char* c) :
+    m_c(c),
+    m_ended(false)
+{
+#ifdef PROFILE_CLOCKS
+    m_start = clock();
+#else
+    (void)gettimeofday(&m_start, 0);
+#endif
+}
+
+Profiler::~Profiler()
+{
+    if (!m_ended) end();
+}
+
+void
+Profiler::end()
+{
+#ifdef PROFILE_CLOCKS
+    clock_t end = clock();
+    clock_t elapsed = end - m_start;
+    float ms = float((double(elapsed) / double(CLOCKS_PER_SEC)) * 1000.0);
+#else
+    struct timeval tv;
+    (void)gettimeofday(&tv, 0);
+
+    tv.tv_sec -= m_start.tv_sec;
+    if (tv.tv_usec < m_start.tv_usec) {
+        tv.tv_usec += 1000000;
+        tv.tv_sec -= 1;
+    }
+    tv.tv_usec -= m_start.tv_usec;
+    float ms = float((double(tv.tv_sec) + (double(tv.tv_usec) / 1000000.0)) * 1000.0);
+#endif
+
+    add(m_c, ms);
+
+    m_ended = true;
+}
+ 
+#else /* NO_TIMING */
+
+#ifndef NO_TIMING_COMPLETE_NOOP
+
+Profiler::Profiler(const char *) { }
+Profiler::~Profiler() { }
+void Profiler::end() { }
+void Profiler::dump() { }
+
+#endif
+
+#endif
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/base/Profiler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/base/Profiler.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,130 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _PROFILER_H_
+#define _PROFILER_H_
+
+//#define NO_TIMING 1
+//#define WANT_TIMING 1
+//#define PROFILE_CLOCKS 1
+
+// Define NO_TIMING or NDEBUG to switch off profilers
+#ifdef NDEBUG
+#define NO_TIMING 1
+#endif
+
+// But we always allow WANT_TIMING to switch them back on again
+#ifdef WANT_TIMING
+#undef NO_TIMING
+#endif
+
+#ifndef NO_TIMING
+#ifdef PROFILE_CLOCKS
+#include <time.h>
+#else
+#include "system/sysutils.h"
+#ifndef _WIN32
+#include <sys/time.h>
+#endif
+#endif
+#endif
+
+#ifndef NO_TIMING
+#include <map>
+#include <string>
+#endif
+
+namespace RubberBand {
+
+#ifndef NO_TIMING
+
+class Profiler
+{
+public:
+    Profiler(const char *name);
+    ~Profiler();
+
+    void end(); // same action as dtor
+
+    static void dump();
+
+    // Unlike the other functions, this is only defined if NO_TIMING
+    // is not set (because it uses std::string which is otherwise
+    // unused here). So, treat this as a tricksy internal function
+    // rather than an API call and guard any call to it appropriately.
+    static std::string getReport();
+
+protected:
+    const char* m_c;
+#ifdef PROFILE_CLOCKS
+    clock_t m_start;
+#else
+    struct timeval m_start;
+#endif
+    bool m_showOnDestruct;
+    bool m_ended;
+
+    typedef std::pair<int, float> TimePair;
+    typedef std::map<const char *, TimePair> ProfileMap;
+    typedef std::map<const char *, float> WorstCallMap;
+    static ProfileMap m_profiles;
+    static WorstCallMap m_worstCalls;
+    static void add(const char *, float);
+};
+
+#else
+
+#ifdef NO_TIMING_COMPLETE_NOOP
+
+// Fastest for release builds, but annoying because it can't be linked
+// with code built in debug mode (expecting non-inline functions), so
+// not preferred during development
+
+class Profiler
+{
+public:
+    Profiler(const char *) { }
+    ~Profiler() { }
+
+    void end() { }
+    static void dump() { }
+};
+
+#else
+
+class Profiler
+{
+public:
+    Profiler(const char *);
+    ~Profiler();
+
+    void end();
+    static void dump();
+};
+
+#endif
+#endif
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/base/RingBuffer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/base/RingBuffer.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,531 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_RINGBUFFER_H_
+#define _RUBBERBAND_RINGBUFFER_H_
+
+#include <sys/types.h>
+
+//#define DEBUG_RINGBUFFER 1
+
+#include "system/sysutils.h"
+#include "system/Allocators.h"
+
+#include <iostream>
+
+namespace RubberBand {
+
+/**
+ * RingBuffer implements a lock-free ring buffer for one writer and
+ * one reader, that is to be used to store a sample type T.
+ *
+ * RingBuffer is thread-safe provided only one thread writes and only
+ * one thread reads.
+ */
+
+template <typename T>
+class RingBuffer
+{
+public:
+    /**
+     * Create a ring buffer with room to write n samples.
+     *
+     * Note that the internal storage size will actually be n+1
+     * samples, as one element is unavailable for administrative
+     * reasons.  Since the ring buffer performs best if its size is a
+     * power of two, this means n should ideally be some power of two
+     * minus one.
+     */
+    RingBuffer(int n);
+
+    virtual ~RingBuffer();
+
+    /**
+     * Return the total capacity of the ring buffer in samples.
+     * (This is the argument n passed to the constructor.)
+     */
+    int getSize() const;
+
+    /**
+     * Return a new ring buffer (allocated with "new" -- called must
+     * delete when no longer needed) of the given size, containing the
+     * same data as this one.  If another thread reads from or writes
+     * to this buffer during the call, the results may be incomplete
+     * or inconsistent.  If this buffer's data will not fit in the new
+     * size, the contents are undefined.
+     */
+    RingBuffer<T> *resized(int newSize) const;
+
+    /**
+     * Lock the ring buffer into physical memory.  Returns true
+     * for success.
+     */
+    bool mlock();
+
+    /**
+     * Reset read and write pointers, thus emptying the buffer.
+     * Should be called from the write thread.
+     */
+    void reset();
+
+    /**
+     * Return the amount of data available for reading by reader R, in
+     * samples.
+     */
+    int getReadSpace() const;
+
+    /**
+     * Return the amount of space available for writing, in samples.
+     */
+    int getWriteSpace() const;
+
+    /**
+     * Read n samples from the buffer, for reader R.  If fewer than n
+     * are available, the remainder will be zeroed out.  Returns the
+     * number of samples actually read.
+     *
+     * This is a template function, taking an argument S for the target
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for arithmetic operations.
+     */
+    template <typename S>
+    int read(S *const R__ destination, int n);
+
+    /**
+     * Read n samples from the buffer, for reader R, adding them to
+     * the destination.  If fewer than n are available, the remainder
+     * will be left alone.  Returns the number of samples actually
+     * read.
+     *
+     * This is a template function, taking an argument S for the target
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for arithmetic operations.
+     */
+    template <typename S>
+    int readAdding(S *const R__ destination, int n);
+
+    /**
+     * Read one sample from the buffer, for reader R.  If no sample is
+     * available, this will silently return zero.  Calling this
+     * repeatedly is obviously slower than calling read once, but it
+     * may be good enough if you don't want to allocate a buffer to
+     * read into.
+     */
+    T readOne();
+
+    /**
+     * Read n samples from the buffer, if available, for reader R,
+     * without advancing the read pointer -- i.e. a subsequent read()
+     * or skip() will be necessary to empty the buffer.  If fewer than
+     * n are available, the remainder will be zeroed out.  Returns the
+     * number of samples actually read.
+     */
+    int peek(T *const R__ destination, int n) const;
+
+    /**
+     * Read one sample from the buffer, if available, without
+     * advancing the read pointer -- i.e. a subsequent read() or
+     * skip() will be necessary to empty the buffer.  Returns zero if
+     * no sample was available.
+     */
+    T peekOne() const;
+
+    /**
+     * Pretend to read n samples from the buffer, for reader R,
+     * without actually returning them (i.e. discard the next n
+     * samples).  Returns the number of samples actually available for
+     * discarding.
+     */
+    int skip(int n);
+
+    /**
+     * Write n samples to the buffer.  If insufficient space is
+     * available, not all samples may actually be written.  Returns
+     * the number of samples actually written.
+     *
+     * This is a template function, taking an argument S for the source
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for assignment.
+     */
+    template <typename S>
+    int write(const S *const R__ source, int n);
+
+    /**
+     * Write n zero-value samples to the buffer.  If insufficient
+     * space is available, not all zeros may actually be written.
+     * Returns the number of zeroes actually written.
+     */
+    int zero(int n);
+
+protected:
+    T *const R__ m_buffer;
+    int          m_writer;
+    int          m_reader;
+    const int    m_size;
+    bool         m_mlocked;
+
+    int readSpaceFor(int w, int r) const {
+        int space;
+        if (w > r) space = w - r;
+        else if (w < r) space = (w + m_size) - r;
+        else space = 0;
+        return space;
+    }
+
+    int writeSpaceFor(int w, int r) const {
+        int space = (r + m_size - w - 1);
+        if (space >= m_size) space -= m_size;
+        return space;
+    }
+
+private:
+    RingBuffer(const RingBuffer &); // not provided
+    RingBuffer &operator=(const RingBuffer &); // not provided
+};
+
+template <typename T>
+RingBuffer<T>::RingBuffer(int n) :
+    m_buffer(allocate<T>(n + 1)),
+    m_writer(0),
+    m_size(n + 1),
+    m_mlocked(false)
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::RingBuffer(" << n << ")" << std::endl;
+#endif
+
+    m_reader = 0;
+}
+
+template <typename T>
+RingBuffer<T>::~RingBuffer()
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::~RingBuffer" << std::endl;
+#endif
+
+    if (m_mlocked) {
+	MUNLOCK((void *)m_buffer, m_size * sizeof(T));
+    }
+
+    deallocate(m_buffer);
+}
+
+template <typename T>
+int
+RingBuffer<T>::getSize() const
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::getSize(): " << m_size-1 << std::endl;
+#endif
+
+    return m_size - 1;
+}
+
+template <typename T>
+RingBuffer<T> *
+RingBuffer<T>::resized(int newSize) const
+{
+    RingBuffer<T> *newBuffer = new RingBuffer<T>(newSize);
+
+    int w = m_writer;
+    int r = m_reader;
+
+    while (r != w) {
+        T value = m_buffer[r];
+        newBuffer->write(&value, 1);
+        if (++r == m_size) r = 0;
+    }
+
+    return newBuffer;
+}
+
+template <typename T>
+bool
+RingBuffer<T>::mlock()
+{
+    if (MLOCK((void *)m_buffer, m_size * sizeof(T))) return false;
+    m_mlocked = true;
+    return true;
+}
+
+template <typename T>
+void
+RingBuffer<T>::reset()
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::reset" << std::endl;
+#endif
+
+    m_reader = m_writer;
+}
+
+template <typename T>
+int
+RingBuffer<T>::getReadSpace() const
+{
+    return readSpaceFor(m_writer, m_reader);
+}
+
+template <typename T>
+int
+RingBuffer<T>::getWriteSpace() const
+{
+    return writeSpaceFor(m_writer, m_reader);
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::read(S *const R__ destination, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::read: " << n << " requested, only "
+                  << available << " available" << std::endl;
+//!!!        v_zero(destination + available, n - available);
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    T *const R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_convert(destination, bufbase, n);
+    } else {
+        v_convert(destination, bufbase, here);
+        v_convert(destination + here, m_buffer, n - here);
+    }
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    MBARRIER();
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::readAdding(S *const R__ destination, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::read: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    T *const R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_add(destination, bufbase, n);
+    } else {
+        v_add(destination, bufbase, here);
+        v_add(destination + here, m_buffer, n - here);
+    }
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    MBARRIER();
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+T
+RingBuffer<T>::readOne()
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    if (w == r) {
+	std::cerr << "WARNING: RingBuffer::readOne: no sample available"
+		  << std::endl;
+	return T();
+    }
+
+    T value = m_buffer[r];
+    if (++r == m_size) r = 0;
+
+    MBARRIER();
+    m_reader = r;
+
+    return value;
+}
+
+template <typename T>
+int
+RingBuffer<T>::peek(T *const R__ destination, int n) const
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::peek: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	memset(destination + available, 0, (n - available) * sizeof(T));
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    const T *const R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_copy(destination, bufbase, n);
+    } else {
+        v_copy(destination, bufbase, here);
+        v_copy(destination + here, m_buffer, n - here);
+    }
+
+    return n;
+}
+
+template <typename T>
+T
+RingBuffer<T>::peekOne() const
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    if (w == r) {
+	std::cerr << "WARNING: RingBuffer::peekOne: no sample available"
+		  << std::endl;
+	return 0;
+    }
+
+    T value = m_buffer[r];
+    return value;
+}
+
+template <typename T>
+int
+RingBuffer<T>::skip(int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::skip: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    // No memory barrier required, because we didn't read any data
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::write(const S *const R__ source, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = writeSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::write: " << n
+                  << " requested, only room for " << available << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - w;
+    T *const R__ bufbase = m_buffer + w;
+
+    if (here >= n) {
+        v_convert<S, T>(bufbase, source, n);
+    } else {
+        v_convert<S, T>(bufbase, source, here);
+        v_convert<S, T>(m_buffer, source + here, n - here);
+    }
+
+    w += n;
+    while (w >= m_size) w -= m_size;
+
+    MBARRIER();
+    m_writer = w;
+
+    return n;
+}
+
+template <typename T>
+int
+RingBuffer<T>::zero(int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = writeSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::zero: " << n
+                  << " requested, only room for " << available << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - w;
+    T *const R__ bufbase = m_buffer + w;
+
+    if (here >= n) {
+        v_zero(bufbase, n);
+    } else {
+        v_zero(bufbase, here);
+        v_zero(m_buffer, n - here);
+    }
+
+    w += n;
+    while (w >= m_size) w -= m_size;
+
+    MBARRIER();
+    m_writer = w;
+
+    return n;
+}
+
+}
+
+#endif // _RINGBUFFER_H_
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/base/Scavenger.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/base/Scavenger.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,249 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_SCAVENGER_H_
+#define _RUBBERBAND_SCAVENGER_H_
+
+#include <vector>
+#include <list>
+#include <utility>
+#include <iostream>
+
+#ifndef WIN32
+#include <sys/time.h>
+#endif
+
+#include "system/Thread.h"
+#include "system/sysutils.h"
+#include "system/Allocators.h"
+
+//#define DEBUG_SCAVENGER 1
+
+namespace RubberBand {
+
+/**
+ * A very simple class that facilitates running things like plugins
+ * without locking, by collecting unwanted objects and deleting them
+ * after a delay so as to be sure nobody's in the middle of using
+ * them.  Requires scavenge() to be called regularly from a non-RT
+ * thread.
+ *
+ * This is currently not at all suitable for large numbers of objects
+ * -- it's just a quick hack for use with things like plugins.
+ */
+
+//!!! should review this, it's not really thread safe owing to lack of
+//!!! atomic updates
+
+template <typename T>
+class Scavenger
+{
+public:
+    Scavenger(int sec = 2, int defaultObjectListSize = 200);
+    ~Scavenger();
+
+    /**
+     * Call from an RT thread etc., to pass ownership of t to us.
+     * Only one thread should be calling this on any given scavenger.
+     */
+    void claim(T *t);
+
+    /**
+     * Call from a non-RT thread.
+     * Only one thread should be calling this on any given scavenger.
+     */
+    void scavenge(bool clearNow = false);
+
+protected:
+    typedef std::pair<T *, int> ObjectTimePair;
+    typedef std::vector<ObjectTimePair> ObjectTimeList;
+    ObjectTimeList m_objects;
+    int m_sec;
+
+    typedef std::list<T *> ObjectList;
+    ObjectList m_excess;
+    int m_lastExcess;
+    Mutex m_excessMutex;
+    void pushExcess(T *);
+    void clearExcess(int);
+
+    unsigned int m_claimed;
+    unsigned int m_scavenged;
+    unsigned int m_asExcess;
+};
+
+
+/**
+ * A wrapper to permit arrays allocated with new[] to be scavenged.
+ */
+
+template <typename T>
+class ScavengerArrayWrapper
+{
+public:
+    ScavengerArrayWrapper(T *array) : m_array(array) { }
+    ~ScavengerArrayWrapper() { delete[] m_array; }
+
+private:
+    T *m_array;
+};
+
+
+/**
+ * A wrapper to permit arrays allocated with the Allocators functions
+ * to be scavenged.
+ */
+
+template <typename T>
+class ScavengerAllocArrayWrapper
+{
+public:
+    ScavengerAllocArrayWrapper(T *array) : m_array(array) { }
+    ~ScavengerAllocArrayWrapper() { deallocate<T>(m_array); }
+
+private:
+    T *m_array;
+};
+
+
+template <typename T>
+Scavenger<T>::Scavenger(int sec, int defaultObjectListSize) :
+    m_objects(ObjectTimeList(defaultObjectListSize)),
+    m_sec(sec),
+    m_claimed(0),
+    m_scavenged(0),
+    m_asExcess(0)
+{
+}
+
+template <typename T>
+Scavenger<T>::~Scavenger()
+{
+    if (m_scavenged < m_claimed) {
+	for (size_t i = 0; i < m_objects.size(); ++i) {
+	    ObjectTimePair &pair = m_objects[i];
+	    if (pair.first != 0) {
+		T *ot = pair.first;
+		pair.first = 0;
+		delete ot;
+		++m_scavenged;
+	    }
+	}
+    }
+
+    clearExcess(0);
+}
+
+template <typename T>
+void
+Scavenger<T>::claim(T *t)
+{
+//    std::cerr << "Scavenger::claim(" << t << ")" << std::endl;
+
+    struct timeval tv;
+    (void)gettimeofday(&tv, 0);
+    int sec = tv.tv_sec;
+
+    for (size_t i = 0; i < m_objects.size(); ++i) {
+	ObjectTimePair &pair = m_objects[i];
+	if (pair.first == 0) {
+	    pair.second = sec;
+	    pair.first = t;
+	    ++m_claimed;
+	    return;
+	}
+    }
+
+#ifdef DEBUG_SCAVENGER
+    std::cerr << "WARNING: Scavenger::claim(" << t << "): run out of slots (at "
+              << m_objects.size() << "), using non-RT-safe method" << std::endl;
+#endif
+    pushExcess(t);
+}
+
+template <typename T>
+void
+Scavenger<T>::scavenge(bool clearNow)
+{
+#ifdef DEBUG_SCAVENGER
+    std::cerr << "Scavenger::scavenge: claimed " << m_claimed << ", scavenged " << m_scavenged << ", cleared as excess " << m_asExcess << std::endl;
+#endif
+
+    if (m_scavenged >= m_claimed) return;
+    
+    struct timeval tv;
+    (void)gettimeofday(&tv, 0);
+    int sec = tv.tv_sec;
+    bool anything = false;
+
+    for (size_t i = 0; i < m_objects.size(); ++i) {
+	ObjectTimePair &pair = m_objects[i];
+        if (!pair.first) continue;
+	if (clearNow || pair.second + m_sec < sec) {
+	    T *ot = pair.first;
+	    pair.first = 0;
+	    delete ot;
+	    ++m_scavenged;
+            anything = true;
+	}
+    }
+
+    if (clearNow || anything || (sec > m_lastExcess + m_sec)) {
+        clearExcess(sec);
+    }
+}
+
+template <typename T>
+void
+Scavenger<T>::pushExcess(T *t)
+{
+    m_excessMutex.lock();
+    m_excess.push_back(t);
+    struct timeval tv;
+    (void)gettimeofday(&tv, 0);
+    m_lastExcess = tv.tv_sec;
+    m_excessMutex.unlock();
+}
+
+template <typename T>
+void
+Scavenger<T>::clearExcess(int sec)
+{
+#ifdef DEBUG_SCAVENGER
+    std::cerr << "Scavenger::clearExcess: Excess now " << m_excess.size() << std::endl;
+#endif
+
+    m_excessMutex.lock();
+    for (typename ObjectList::iterator i = m_excess.begin();
+	 i != m_excess.end(); ++i) {
+	delete *i;
+        ++m_asExcess;
+    }
+    m_excess.clear();
+    m_lastExcess = sec;
+    m_excessMutex.unlock();
+}
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/AudioCurveCalculator.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/AudioCurveCalculator.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,72 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "AudioCurveCalculator.h"
+
+#include <iostream>
+
+namespace RubberBand
+{
+
+static const int MaxPerceivedFreq = 16000;
+
+AudioCurveCalculator::AudioCurveCalculator(Parameters parameters) :
+    m_sampleRate(parameters.sampleRate),
+    m_fftSize(parameters.fftSize)
+{
+    recalculateLastPerceivedBin();
+}
+
+AudioCurveCalculator::~AudioCurveCalculator()
+{
+}
+
+void
+AudioCurveCalculator::setSampleRate(int newRate)
+{
+    m_sampleRate = newRate;
+    recalculateLastPerceivedBin();
+}
+
+void
+AudioCurveCalculator::setFftSize(int newSize)
+{
+    m_fftSize = newSize;
+    recalculateLastPerceivedBin();
+}
+
+void
+AudioCurveCalculator::recalculateLastPerceivedBin()
+{
+    if (m_sampleRate == 0) {
+        m_lastPerceivedBin = 0;
+        return;
+    }
+    m_lastPerceivedBin = ((MaxPerceivedFreq * m_fftSize) / m_sampleRate);
+    if (m_lastPerceivedBin > m_fftSize/2) {
+        m_lastPerceivedBin = m_fftSize/2;
+    }
+}
+
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/AudioCurveCalculator.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/AudioCurveCalculator.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,135 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _AUDIO_CURVE_CALCULATOR_H_
+#define _AUDIO_CURVE_CALCULATOR_H_
+
+#include <sys/types.h>
+
+
+#include "system/sysutils.h"
+
+namespace RubberBand 
+{
+
+/**
+ * AudioCurveCalculator turns a sequence of audio "columns" --
+ * short-time spectrum magnitude blocks -- into a sequence of numbers
+ * representing some quality of the input such as power or likelihood
+ * of an onset occurring.
+ *
+ * These are typically low-level building-blocks: AudioCurveCalculator
+ * is a simple causal interface in which each input column corresponds
+ * to exactly one output value which is returned immediately.  They
+ * have far less power (because of the causal interface and
+ * magnitude-only input) and flexibility (because of the limited
+ * return types) than for example the Vamp plugin interface.
+ *
+ * AudioCurveCalculator implementations typically remember the history
+ * of their processing data, and the caller must call reset() before
+ * resynchronising to an unrelated piece of input audio.
+ */
+class AudioCurveCalculator
+{
+public:
+    struct Parameters {
+        Parameters(int _sampleRate, int _fftSize) :
+            sampleRate(_sampleRate),
+            fftSize(_fftSize)
+        { }
+        int sampleRate;
+        int fftSize;
+    };
+
+    AudioCurveCalculator(Parameters parameters);
+    virtual ~AudioCurveCalculator();
+
+    int getSampleRate() const { return m_sampleRate; }
+    int getFftSize() const { return m_fftSize; }
+
+    virtual void setSampleRate(int newRate);
+    virtual void setFftSize(int newSize);
+
+    Parameters getParameters() const {
+        return Parameters(m_sampleRate, m_fftSize);
+    }
+    void setParameters(Parameters p) {
+        setSampleRate(p.sampleRate);
+        setFftSize(p.fftSize);
+    }
+
+    // You may not mix calls to the various process functions on a
+    // given instance
+
+
+    /**
+     * Process the given magnitude spectrum block and return the curve
+     * value for it.  The mag input contains (fftSize/2 + 1) values
+     * corresponding to the magnitudes of the complex FFT output bins
+     * for a windowed input of size fftSize.  The hop (expressed in
+     * time-domain audio samples) from the previous to the current
+     * input block is given by increment.
+     */
+    virtual float processFloat(const float *R__ mag, int increment) = 0;
+
+    /**
+     * Process the given magnitude spectrum block and return the curve
+     * value for it.  The mag input contains (fftSize/2 + 1) values
+     * corresponding to the magnitudes of the complex FFT output bins
+     * for a windowed input of size fftSize.  The hop (expressed in
+     * time-domain audio samples) from the previous to the current
+     * input block is given by increment.
+     */
+    virtual double processDouble(const double *R__ mag, int increment) = 0;
+
+    /**
+     * Obtain a confidence for the curve value (if applicable). A
+     * value of 1.0 indicates perfect confidence in the curve
+     * calculation, 0.0 indicates none.
+     */
+    virtual double getConfidence() const { return 1.0; }
+
+    /**
+     * Reset the calculator, forgetting the history of the audio input
+     * so far.
+     */
+    virtual void reset() = 0;
+
+    /**
+     * If the output of this calculator has a known unit, return it as
+     * text.  For example, "Hz" or "V".
+     */
+    virtual const char *getUnit() const { return ""; }
+
+protected:
+    int m_sampleRate;
+    int m_fftSize;
+    int m_lastPerceivedBin;
+    void recalculateLastPerceivedBin();
+};
+
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/FFT.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/FFT.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,3637 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "FFT.h"
+#include "system/Thread.h"
+#include "base/Profiler.h"
+#include "system/Allocators.h"
+#include "system/VectorOps.h"
+#include "system/VectorOpsComplex.h"
+
+//#define FFT_MEASUREMENT 1
+
+#ifdef FFT_MEASUREMENT
+#include <sstream>
+#endif
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#endif
+
+#ifdef HAVE_FFTW3
+#include <fftw3.h>
+#endif
+
+#ifdef HAVE_VDSP
+#include <vecLib/vDSP.h>
+#include <vecLib/vForce.h>
+#endif
+
+#ifdef HAVE_MEDIALIB
+#include <mlib_signal.h>
+#endif
+
+#ifdef HAVE_OPENMAX
+#include <omxSP.h>
+#endif
+
+#ifdef HAVE_SFFT
+extern "C" {
+#include <sfft.h>
+}
+#endif
+
+#ifdef USE_KISSFFT
+#include "kissfft/kiss_fftr.h"
+#endif
+
+#ifndef HAVE_IPP
+#ifndef HAVE_FFTW3
+#ifndef USE_KISSFFT
+#ifndef USE_BUILTIN_FFT
+#ifndef HAVE_VDSP
+#ifndef HAVE_MEDIALIB
+#ifndef HAVE_OPENMAX
+#ifndef HAVE_SFFT
+#error No FFT implementation selected!
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#ifdef FFT_MEASUREMENT
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#endif
+
+namespace RubberBand {
+
+class FFTImpl
+{
+public:
+    virtual ~FFTImpl() { }
+
+    virtual FFT::Precisions getSupportedPrecisions() const = 0;
+
+    virtual void initFloat() = 0;
+    virtual void initDouble() = 0;
+
+    virtual void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) = 0;
+    virtual void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) = 0;
+    virtual void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) = 0;
+    virtual void forwardMagnitude(const double *R__ realIn, double *R__ magOut) = 0;
+
+    virtual void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) = 0;
+    virtual void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) = 0;
+    virtual void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) = 0;
+    virtual void forwardMagnitude(const float *R__ realIn, float *R__ magOut) = 0;
+
+    virtual void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) = 0;
+    virtual void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) = 0;
+    virtual void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) = 0;
+    virtual void inverseCepstral(const double *R__ magIn, double *R__ cepOut) = 0;
+
+    virtual void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) = 0;
+    virtual void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) = 0;
+    virtual void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) = 0;
+    virtual void inverseCepstral(const float *R__ magIn, float *R__ cepOut) = 0;
+};    
+
+namespace FFTs {
+
+#ifdef HAVE_IPP
+
+class D_IPP : public FFTImpl
+{
+public:
+    D_IPP(int size) :
+        m_size(size), m_fspec(0), m_dspec(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_IPP() {
+        if (m_fspec) {
+            ippsFFTFree_R_32f(m_fspec);
+            ippsFree(m_fbuf);
+            ippsFree(m_fpacked);
+            ippsFree(m_fspare);
+        }
+        if (m_dspec) {
+            ippsFFTFree_R_64f(m_dspec);
+            ippsFree(m_dbuf);
+            ippsFree(m_dpacked);
+            ippsFree(m_dspare);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        if (m_fspec) return;
+        int specSize, specBufferSize, bufferSize;
+        ippsFFTGetSize_R_32f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast,
+                             &specSize, &specBufferSize, &bufferSize);
+        m_fbuf = ippsMalloc_8u(bufferSize);
+        m_fpacked = ippsMalloc_32f(m_size + 2);
+        m_fspare = ippsMalloc_32f(m_size / 2 + 1);
+        ippsFFTInitAlloc_R_32f(&m_fspec, m_order, IPP_FFT_NODIV_BY_ANY, 
+                               ippAlgHintFast);
+    }
+
+    void initDouble() {
+        if (m_dspec) return;
+        int specSize, specBufferSize, bufferSize;
+        ippsFFTGetSize_R_64f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast,
+                             &specSize, &specBufferSize, &bufferSize);
+        m_dbuf = ippsMalloc_8u(bufferSize);
+        m_dpacked = ippsMalloc_64f(m_size + 2);
+        m_dspare = ippsMalloc_64f(m_size / 2 + 1);
+        ippsFFTInitAlloc_R_64f(&m_dspec, m_order, IPP_FFT_NODIV_BY_ANY, 
+                               ippAlgHintFast);
+    }
+
+    void packFloat(const float *R__ re, const float *R__ im) {
+        Profiler profiler("D_IPP::packFloat");
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = 0.f;
+            }
+        }
+    }
+
+    void packDouble(const double *R__ re, const double *R__ im) {
+        Profiler profiler("D_IPP::packDouble");
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = 0.0;
+            }
+        }
+    }
+
+    void unpackFloat(float *re, float *R__ im) { // re may be equal to m_fpacked
+        Profiler profiler("D_IPP::unpackFloat");
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_fpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[index++];
+            index++;
+        }
+    }        
+
+    void unpackDouble(double *re, double *R__ im) { // re may be equal to m_dpacked
+        Profiler profiler("D_IPP::unpackDouble");
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_dpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[index++];
+            index++;
+        }
+    }        
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        Profiler profiler("D_IPP::forward [d]");
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        Profiler profiler("D_IPP::forwardInterleaved [d]");
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, complexOut, m_dspec, m_dbuf);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        Profiler profiler("D_IPP::forwardPolar [d]");
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(m_dpacked, m_dspare);
+        Profiler profiler2("D_IPP::forwardPolar [d] conv");
+        ippsCartToPolar_64f(m_dpacked, m_dspare, magOut, phaseOut, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        Profiler profiler("D_IPP::forwardMagnitude [d]");
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(m_dpacked, m_dspare);
+        ippsMagnitude_64f(m_dpacked, m_dspare, magOut, m_size/2+1);
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        Profiler profiler("D_IPP::forward [f]");
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        Profiler profiler("D_IPP::forwardInterleaved [f]");
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, complexOut, m_fspec, m_fbuf);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        Profiler profiler("D_IPP::forwardPolar [f]");
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(m_fpacked, m_fspare);
+        Profiler profiler2("D_IPP::forwardPolar [f] conv");
+        ippsCartToPolar_32f(m_fpacked, m_fspare, magOut, phaseOut, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        Profiler profiler("D_IPP::forwardMagnitude [f]");
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(m_fpacked, m_fspare);
+        ippsMagnitude_32f(m_fpacked, m_fspare, magOut, m_size/2+1);
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        Profiler profiler("D_IPP::inverse [d]");
+        if (!m_dspec) initDouble();
+        packDouble(realIn, imagIn);
+        ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf);
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        Profiler profiler("D_IPP::inverse [d]");
+        if (!m_dspec) initDouble();
+        ippsFFTInv_CCSToR_64f(complexIn, realOut, m_dspec, m_dbuf);
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        Profiler profiler("D_IPP::inversePolar [d]");
+        if (!m_dspec) initDouble();
+        ippsPolarToCart_64f(magIn, phaseIn, realOut, m_dspare, m_size/2+1);
+        Profiler profiler2("D_IPP::inversePolar [d] postconv");
+        packDouble(realOut, m_dspare); // to m_dpacked
+        ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf);
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        Profiler profiler("D_IPP::inverseCepstral [d]");
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2 + 1;
+        ippsCopy_64f(magIn, m_dspare, hs1);
+        ippsAddC_64f_I(0.000001, m_dspare, hs1);
+        ippsLn_64f_I(m_dspare, hs1);
+        packDouble(m_dspare, 0);
+        ippsFFTInv_CCSToR_64f(m_dpacked, cepOut, m_dspec, m_dbuf);
+    }
+    
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        Profiler profiler("D_IPP::inverse [f]");
+        if (!m_fspec) initFloat();
+        packFloat(realIn, imagIn);
+        ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf);
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        Profiler profiler("D_IPP::inverse [f]");
+        if (!m_fspec) initFloat();
+        ippsFFTInv_CCSToR_32f(complexIn, realOut, m_fspec, m_fbuf);
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        Profiler profiler("D_IPP::inversePolar [f]");
+        if (!m_fspec) initFloat();
+        ippsPolarToCart_32f(magIn, phaseIn, realOut, m_fspare, m_size/2+1);
+        Profiler profiler2("D_IPP::inversePolar [f] postconv");
+        packFloat(realOut, m_fspare); // to m_fpacked
+        ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf);
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        Profiler profiler("D_IPP::inverseCepstral [f]");
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2 + 1;
+        ippsCopy_32f(magIn, m_fspare, hs1);
+        ippsAddC_32f_I(0.000001f, m_fspare, hs1);
+        ippsLn_32f_I(m_fspare, hs1);
+        packFloat(m_fspare, 0);
+        ippsFFTInv_CCSToR_32f(m_fpacked, cepOut, m_fspec, m_fbuf);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    IppsFFTSpec_R_32f *m_fspec;
+    IppsFFTSpec_R_64f *m_dspec;
+    Ipp8u *m_fbuf;
+    Ipp8u *m_dbuf;
+    float *m_fpacked;
+    float *m_fspare;
+    double *m_dpacked;
+    double *m_dspare;
+};
+
+#endif /* HAVE_IPP */
+
+#ifdef HAVE_VDSP
+
+class D_VDSP : public FFTImpl
+{
+public:
+    D_VDSP(int size) :
+        m_size(size), m_fspec(0), m_dspec(0),
+        m_fpacked(0), m_fspare(0),
+        m_dpacked(0), m_dspare(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_VDSP() {
+        if (m_fspec) {
+            vDSP_destroy_fftsetup(m_fspec);
+            deallocate(m_fspare);
+            deallocate(m_fspare2);
+            deallocate(m_fbuf->realp);
+            deallocate(m_fbuf->imagp);
+            delete m_fbuf;
+            deallocate(m_fpacked->realp);
+            deallocate(m_fpacked->imagp);
+            delete m_fpacked;
+        }
+        if (m_dspec) {
+            vDSP_destroy_fftsetupD(m_dspec);
+            deallocate(m_dspare);
+            deallocate(m_dspare2);
+            deallocate(m_dbuf->realp);
+            deallocate(m_dbuf->imagp);
+            delete m_dbuf;
+            deallocate(m_dpacked->realp);
+            deallocate(m_dpacked->imagp);
+            delete m_dpacked;
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        if (m_fspec) return;
+        m_fspec = vDSP_create_fftsetup(m_order, FFT_RADIX2);
+        m_fbuf = new DSPSplitComplex;
+        //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance."
+        m_fbuf->realp = allocate<float>(m_size);
+        m_fbuf->imagp = allocate<float>(m_size);
+        m_fpacked = new DSPSplitComplex;
+        m_fpacked->realp = allocate<float>(m_size / 2 + 1);
+        m_fpacked->imagp = allocate<float>(m_size / 2 + 1);
+        m_fspare = allocate<float>(m_size + 2);
+        m_fspare2 = allocate<float>(m_size + 2);
+    }
+
+    void initDouble() {
+        if (m_dspec) return;
+        m_dspec = vDSP_create_fftsetupD(m_order, FFT_RADIX2);
+        m_dbuf = new DSPDoubleSplitComplex;
+        //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance."
+        m_dbuf->realp = allocate<double>(m_size);
+        m_dbuf->imagp = allocate<double>(m_size);
+        m_dpacked = new DSPDoubleSplitComplex;
+        m_dpacked->realp = allocate<double>(m_size / 2 + 1);
+        m_dpacked->imagp = allocate<double>(m_size / 2 + 1);
+        m_dspare = allocate<double>(m_size + 2);
+        m_dspare2 = allocate<double>(m_size + 2);
+    }
+
+    void packReal(const float *R__ const re) {
+        // Pack input for forward transform 
+        vDSP_ctoz((DSPComplex *)re, 2, m_fpacked, 1, m_size/2);
+    }
+    void packComplex(const float *R__ const re, const float *R__ const im) {
+        // Pack input for inverse transform 
+        if (re) v_copy(m_fpacked->realp, re, m_size/2 + 1);
+        else v_zero(m_fpacked->realp, m_size/2 + 1);
+        if (im) v_copy(m_fpacked->imagp, im, m_size/2 + 1);
+        else v_zero(m_fpacked->imagp, m_size/2 + 1);
+        fnyq();
+    }
+
+    void unpackReal(float *R__ const re) {
+        // Unpack output for inverse transform
+        vDSP_ztoc(m_fpacked, 1, (DSPComplex *)re, 2, m_size/2);
+    }
+    void unpackComplex(float *R__ const re, float *R__ const im) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        float two = 2.f;
+        vDSP_vsdiv(m_fpacked->realp, 1, &two, re, 1, m_size/2 + 1);
+        vDSP_vsdiv(m_fpacked->imagp, 1, &two, im, 1, m_size/2 + 1);
+    }
+    void unpackComplex(float *R__ const cplx) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        const int hs1 = m_size/2 + 1;
+        for (int i = 0; i < hs1; ++i) {
+            cplx[i*2] = m_fpacked->realp[i] / 2.f;
+            cplx[i*2+1] = m_fpacked->imagp[i] / 2.f;
+        }
+    }
+
+    void packReal(const double *R__ const re) {
+        // Pack input for forward transform
+        vDSP_ctozD((DSPDoubleComplex *)re, 2, m_dpacked, 1, m_size/2);
+    }
+    void packComplex(const double *R__ const re, const double *R__ const im) {
+        // Pack input for inverse transform
+        if (re) v_copy(m_dpacked->realp, re, m_size/2 + 1);
+        else v_zero(m_dpacked->realp, m_size/2 + 1);
+        if (im) v_copy(m_dpacked->imagp, im, m_size/2 + 1);
+        else v_zero(m_dpacked->imagp, m_size/2 + 1);
+        dnyq();
+    }
+
+    void unpackReal(double *R__ const re) {
+        // Unpack output for inverse transform
+        vDSP_ztocD(m_dpacked, 1, (DSPDoubleComplex *)re, 2, m_size/2);
+    }
+    void unpackComplex(double *R__ const re, double *R__ const im) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        double two = 2.0;
+        vDSP_vsdivD(m_dpacked->realp, 1, &two, re, 1, m_size/2 + 1);
+        vDSP_vsdivD(m_dpacked->imagp, 1, &two, im, 1, m_size/2 + 1);
+    }
+    void unpackComplex(double *R__ const cplx) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        const int hs1 = m_size/2 + 1;
+        for (int i = 0; i < hs1; ++i) {
+            cplx[i*2] = m_dpacked->realp[i] / 2.0;
+            cplx[i*2+1] = m_dpacked->imagp[i] / 2.0;
+        }
+    }
+
+    void fdenyq() {
+        // for fft result in packed form, unpack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_fpacked->realp[hs] = m_fpacked->imagp[0];
+        m_fpacked->imagp[hs] = 0.f;
+        m_fpacked->imagp[0] = 0.f;
+    }
+    void ddenyq() {
+        // for fft result in packed form, unpack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_dpacked->realp[hs] = m_dpacked->imagp[0];
+        m_dpacked->imagp[hs] = 0.;
+        m_dpacked->imagp[0] = 0.;
+    }
+
+    void fnyq() {
+        // for ifft input in packed form, pack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_fpacked->imagp[0] = m_fpacked->realp[hs];
+        m_fpacked->realp[hs] = 0.f;
+        m_fpacked->imagp[hs] = 0.f;
+    }
+    void dnyq() {
+        // for ifft input in packed form, pack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_dpacked->imagp[0] = m_dpacked->realp[hs];
+        m_dpacked->realp[hs] = 0.;
+        m_dpacked->imagp[hs] = 0.;
+    }
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        Profiler profiler("D_VDSP::forward [d]");
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        unpackComplex(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        Profiler profiler("D_VDSP::forward [d]");
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        unpackComplex(complexOut);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        Profiler profiler("D_VDSP::forwardPolar [d]");
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2+1;
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        for (int i = 0; i < hs1; ++i) m_dpacked->realp[i] /= 2.0;
+        for (int i = 0; i < hs1; ++i) m_dpacked->imagp[i] /= 2.0;
+        v_cartesian_to_polar(magOut, phaseOut,
+                             m_dpacked->realp, m_dpacked->imagp, hs1);
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        Profiler profiler("D_VDSP::forwardMagnitude [d]");
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        const int hs1 = m_size/2+1;
+        vDSP_zvmagsD(m_dpacked, 1, m_dspare, 1, hs1);
+        vvsqrt(m_dspare2, m_dspare, &hs1);
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        double two = 2.0;
+        vDSP_vsdivD(m_dspare2, 1, &two, magOut, 1, hs1);
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        Profiler profiler("D_VDSP::forward [f]");
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        unpackComplex(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        Profiler profiler("D_VDSP::forward [f]");
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        unpackComplex(complexOut);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        Profiler profiler("D_VDSP::forwardPolar [f]");
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2+1;
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        for (int i = 0; i < hs1; ++i) m_fpacked->realp[i] /= 2.f;
+        for (int i = 0; i < hs1; ++i) m_fpacked->imagp[i] /= 2.f;
+        v_cartesian_to_polar(magOut, phaseOut,
+                             m_fpacked->realp, m_fpacked->imagp, hs1);
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        Profiler profiler("D_VDSP::forwardMagnitude [f]");
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        const int hs1 = m_size/2 + 1;
+        vDSP_zvmags(m_fpacked, 1, m_fspare, 1, hs1);
+        vvsqrtf(m_fspare2, m_fspare, &hs1);
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        float two = 2.f;
+        vDSP_vsdiv(m_fspare2, 1, &two, magOut, 1, hs1);
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        Profiler profiler("D_VDSP::inverse [d]");
+        if (!m_dspec) initDouble();
+        packComplex(realIn, imagIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        Profiler profiler("D_VDSP::inverseInterleaved [d]");
+        if (!m_dspec) initDouble();
+        double *d[2] = { m_dpacked->realp, m_dpacked->imagp };
+        v_deinterleave(d, complexIn, 2, m_size/2 + 1);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        Profiler profiler("D_VDSP::inversePolar [d]");
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2+1;
+        vvsincos(m_dpacked->imagp, m_dpacked->realp, phaseIn, &hs1);
+        double *const rp = m_dpacked->realp;
+        double *const ip = m_dpacked->imagp;
+        for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i];
+        for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i];
+        dnyq();
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        Profiler profiler("D_VDSP::inverseCepstral [d]");
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2 + 1;
+        v_copy(m_dspare, magIn, hs1);
+        for (int i = 0; i < hs1; ++i) m_dspare[i] += 0.000001;
+        vvlog(m_dspare2, m_dspare, &hs1);
+        inverse(m_dspare2, 0, cepOut);
+    }
+    
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        Profiler profiler("D_VDSP::inverse [f]");
+        if (!m_fspec) initFloat();
+        packComplex(realIn, imagIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        Profiler profiler("D_VDSP::inverseInterleaved [f]");
+        if (!m_fspec) initFloat();
+        float *f[2] = { m_fpacked->realp, m_fpacked->imagp };
+        v_deinterleave(f, complexIn, 2, m_size/2 + 1);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        Profiler profiler("D_VDSP::inversePolar [f]");
+        if (!m_fspec) initFloat();
+
+        const int hs1 = m_size/2+1;
+        vvsincosf(m_fpacked->imagp, m_fpacked->realp, phaseIn, &hs1);
+        float *const rp = m_fpacked->realp;
+        float *const ip = m_fpacked->imagp;
+        for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i];
+        for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i];
+        fnyq();
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        Profiler profiler("D_VDSP::inverseCepstral [f]");
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2 + 1;
+        v_copy(m_fspare, magIn, hs1);
+        for (int i = 0; i < hs1; ++i) m_fspare[i] += 0.000001f;
+        vvlogf(m_fspare2, m_fspare, &hs1);
+        inverse(m_fspare2, 0, cepOut);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    FFTSetup m_fspec;
+    FFTSetupD m_dspec;
+    DSPSplitComplex *m_fbuf;
+    DSPDoubleSplitComplex *m_dbuf;
+    DSPSplitComplex *m_fpacked;
+    float *m_fspare;
+    float *m_fspare2;
+    DSPDoubleSplitComplex *m_dpacked;
+    double *m_dspare;
+    double *m_dspare2;
+};
+
+#endif /* HAVE_VDSP */
+
+#ifdef HAVE_MEDIALIB
+
+class D_MEDIALIB : public FFTImpl
+{
+public:
+    D_MEDIALIB(int size) :
+        m_size(size),
+        m_dpacked(0), m_fpacked(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_MEDIALIB() {
+        if (m_dpacked) {
+            deallocate(m_dpacked);
+        }
+        if (m_fpacked) {
+            deallocate(m_fpacked);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        m_fpacked = allocate<float>(m_size*2);
+    }
+
+    void initDouble() {
+        m_dpacked = allocate<double>(m_size*2);
+    }
+
+    void packFloatConjugates() {
+        const int hs = m_size / 2;
+        for (int i = 1; i <= hs; ++i) {
+            m_fpacked[(m_size-i)*2] = m_fpacked[2*i];
+            m_fpacked[(m_size-i)*2 + 1] = -m_fpacked[2*i + 1];
+        }
+    }
+
+    void packDoubleConjugates() {
+        const int hs = m_size / 2;
+        for (int i = 1; i <= hs; ++i) {
+            m_dpacked[(m_size-i)*2] = m_dpacked[2*i];
+            m_dpacked[(m_size-i)*2 + 1] = -m_dpacked[2*i + 1];
+        }
+    }
+
+    void packFloat(const float *R__ re, const float *R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = 0.f;
+            }
+        }
+        packFloatConjugates();
+    }
+
+    void packDouble(const double *R__ re, const double *R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = 0.0;
+            }
+        }
+        packDoubleConjugates();
+    }
+
+    void unpackFloat(float *re, float *R__ im) { // re may be equal to m_fpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_fpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[index++];
+            index++;
+        }
+    }        
+
+    void unpackDouble(double *re, double *R__ im) { // re may be equal to m_dpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_dpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[index++];
+            index++;
+        }
+    }
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        Profiler profiler("D_MEDIALIB::forward [d]");
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        Profiler profiler("D_MEDIALIB::forwardInterleaved [d]");
+        if (!m_dpacked) initDouble();
+        // mlib FFT gives the whole redundant complex result
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        v_copy(complexOut, m_dpacked, m_size + 2);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        Profiler profiler("D_MEDIALIB::forwardPolar [d]");
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] +
+                             m_dpacked[index] * m_dpacked[index]);
+            phaseOut[i] = atan2(m_dpacked[index], m_dpacked[reali]) ;
+            ++index;
+        }
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        Profiler profiler("D_MEDIALIB::forwardMagnitude [d]");
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] +
+                             m_dpacked[index] * m_dpacked[index]);
+            ++index;
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        Profiler profiler("D_MEDIALIB::forward [f]");
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        Profiler profiler("D_MEDIALIB::forwardInterleaved [f]");
+        if (!m_fpacked) initFloat();
+        // mlib FFT gives the whole redundant complex result
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        v_copy(complexOut, m_fpacked, m_size + 2);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        Profiler profiler("D_MEDIALIB::forwardPolar [f]");
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] +
+                              m_fpacked[index] * m_fpacked[index]);
+            phaseOut[i] = atan2f(m_fpacked[index], m_fpacked[reali]);
+            ++index;
+        }
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        Profiler profiler("D_MEDIALIB::forwardMagnitude [f]");
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] +
+                              m_fpacked[index] * m_fpacked[index]);
+            ++index;
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inverse [d]");
+        if (!m_dpacked) initDouble();
+        packDouble(realIn, imagIn);
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inverseInterleaved [d]");
+        if (!m_dpacked) initDouble();
+        v_copy(m_dpacked, complexIn, m_size + 2);
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inversePolar [d]");
+        if (!m_dpacked) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_dpacked[i*2] = real;
+            m_dpacked[i*2 + 1] = imag;
+        }
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        Profiler profiler("D_MEDIALIB::inverseCepstral [d]");
+        if (!m_dpacked) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[i*2] = log(magIn[i] + 0.000001);
+            m_dpacked[i*2 + 1] = 0.0;
+        }
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(cepOut, m_dpacked, m_order);
+    }
+    
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inverse [f]");
+        if (!m_fpacked) initFloat();
+        packFloat(realIn, imagIn);
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+    
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inverseInterleaved [f]");
+        if (!m_fpacked) initFloat();
+        v_convert(m_fpacked, complexIn, m_size + 2);
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        Profiler profiler("D_MEDIALIB::inversePolar [f]");
+        if (!m_fpacked) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_fpacked[i*2] = real;
+            m_fpacked[i*2 + 1] = imag;
+        }
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        Profiler profiler("D_MEDIALIB::inverseCepstral [f]");
+        if (!m_fpacked) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i*2] = logf(magIn[i] + 0.000001);
+            m_fpacked[i*2 + 1] = 0.f;
+        }
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(cepOut, m_fpacked, m_order);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    double *m_dpacked;
+    float *m_fpacked;
+};
+
+#endif /* HAVE_MEDIALIB */
+
+#ifdef HAVE_OPENMAX
+
+class D_OPENMAX : public FFTImpl
+{
+    // Convert a signed 32-bit integer to a float in the range [-1,1)
+    static inline float i2f(OMX_S32 i)
+    {
+        return float(i) / float(OMX_MAX_S32);
+    }
+
+    // Convert a signed 32-bit integer to a double in the range [-1,1)
+    static inline double i2d(OMX_S32 i)
+    {
+        return double(i) / double(OMX_MAX_S32);
+    }
+
+    // Convert a float in the range [-1,1) to a signed 32-bit integer
+    static inline OMX_S32 f2i(float f)
+    {
+        return OMX_S32(f * OMX_MAX_S32);
+    }
+
+    // Convert a double in the range [-1,1) to a signed 32-bit integer
+    static inline OMX_S32 d2i(double d)
+    {
+        return OMX_S32(d * OMX_MAX_S32);
+    }
+
+public:
+    D_OPENMAX(int size) :
+        m_size(size),
+        m_packed(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_OPENMAX() {
+        if (m_packed) {
+            deallocate(m_packed);
+            deallocate(m_buf);
+            deallocate(m_fbuf);
+            deallocate(m_spec);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision;
+    }
+
+    //!!! rv check
+
+    // The OpenMAX implementation uses a fixed-point representation in
+    // 32-bit signed integers, with a downward scaling factor (0-32
+    // bits) supplied as an argument to the FFT function.
+
+    void initFloat() {
+        initDouble();
+    }
+
+    void initDouble() {
+        if (!m_packed) {
+            m_buf = allocate<OMX_S32>(m_size);
+            m_packed = allocate<OMX_S32>(m_size*2 + 2);
+            m_fbuf = allocate<float>(m_size*2 + 2);
+            OMX_INT sz = 0;
+            omxSP_FFTGetBufSize_R_S32(m_order, &sz);
+            m_spec = (OMXFFTSpec_R_S32 *)allocate<char>(sz);
+            omxSP_FFTInit_R_S32(m_spec, m_order);
+        }
+    }
+
+    void packFloat(const float *R__ re) {
+        // prepare fixed point input for forward transform
+        for (int i = 0; i < m_size; ++i) {
+            m_buf[i] = f2i(re[i]);
+        }
+    }
+
+    void packDouble(const double *R__ re) {
+        // prepare fixed point input for forward transform
+        for (int i = 0; i < m_size; ++i) {
+            m_buf[i] = d2i(re[i]);
+        }
+    }
+
+    void unpackFloat(float *R__ re, float *R__ im) {
+        // convert fixed point output for forward transform
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = i2f(m_packed[index++]);
+            }
+            v_scale(im, m_size, hs + 1);
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = i2f(m_packed[index++]);
+            index++;
+        }
+        v_scale(re, m_size, hs + 1);
+    }        
+
+    void unpackDouble(double *R__ re, double *R__ im) {
+        // convert fixed point output for forward transform
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = i2d(m_packed[index++]);
+            }
+            v_scale(im, m_size, hs + 1);
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = i2d(m_packed[index++]);
+            index++;
+        }
+        v_scale(re, m_size, hs + 1);
+    }
+
+    void unpackFloatInterleaved(float *R__ cplx) {
+        // convert fixed point output for forward transform
+        for (int i = 0; i < m_size + 2; ++i) {
+            cplx[i] = i2f(m_packed[i]);
+        }            
+        v_scale(cplx, m_size, m_size + 2);
+    }
+
+    void unpackDoubleInterleaved(double *R__ cplx) {
+        // convert fixed point output for forward transform
+        for (int i = 0; i < m_size + 2; ++i) {
+            cplx[i] = i2d(m_packed[i]);
+        }            
+        v_scale(cplx, m_size, m_size + 2);
+    }
+
+    void packFloat(const float *R__ re, const float *R__ im) {
+        // prepare fixed point input for inverse transform
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_packed[index++] = f2i(re[i]);
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = f2i(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = 0;
+            }
+        }
+    }
+
+    void packDouble(const double *R__ re, const double *R__ im) {
+        // prepare fixed point input for inverse transform
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_packed[index++] = d2i(re[i]);
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = d2i(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = 0;
+            }
+        }
+    }
+
+    void convertFloat(const float *R__ f) {
+        // convert interleaved input for inverse interleaved transform
+        const int n = m_size + 2;
+        for (int i = 0; i < n; ++i) {
+            m_packed[i] = f2i(f[i]);
+        }
+    }        
+
+    void convertDouble(const double *R__ d) {
+        // convert interleaved input for inverse interleaved transform
+        const int n = m_size + 2;
+        for (int i = 0; i < n; ++i) {
+            m_packed[i] = d2i(d[i]);
+        }
+    }        
+
+    void unpackFloat(float *R__ re) {
+        // convert fixed point output for inverse transform
+        for (int i = 0; i < m_size; ++i) {
+            re[i] = i2f(m_buf[i]) * m_size;
+        }
+    }
+
+    void unpackDouble(double *R__ re) {
+        // convert fixed point output for inverse transform
+        for (int i = 0; i < m_size; ++i) {
+            re[i] = i2d(m_buf[i]) * m_size;
+        }
+    }
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        Profiler profiler("D_OPENMAX::forward [d]");
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDouble(realOut, imagOut);
+    }
+    
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        Profiler profiler("D_OPENMAX::forwardInterleaved [d]");
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDoubleInterleaved(complexOut);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        Profiler profiler("D_OPENMAX::forwardPolar [d]");
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDouble(magOut, phaseOut); // temporarily
+        // at this point we actually have real/imag in the mag/phase arrays
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magOut[i];
+            double imag = phaseOut[i];
+            c_magphase(magOut + i, phaseOut + i, real, imag);
+        }
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        Profiler profiler("D_OPENMAX::forwardMagnitude [d]");
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = i * 2;
+            int imagi = reali + 1;
+            double real = i2d(m_packed[reali]) * m_size;
+            double imag = i2d(m_packed[imagi]) * m_size;
+            magOut[i] = sqrt(real * real + imag * imag);
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        Profiler profiler("D_OPENMAX::forward [f]");
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        Profiler profiler("D_OPENMAX::forwardInterleaved [f]");
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloatInterleaved(complexOut);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        Profiler profiler("D_OPENMAX::forwardPolar [f]");
+        if (!m_packed) initFloat();
+
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloat(magOut, phaseOut); // temporarily
+        // at this point we actually have real/imag in the mag/phase arrays
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = magOut[i];
+            float imag = phaseOut[i];
+            c_magphase(magOut + i, phaseOut + i, real, imag);
+        }
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        Profiler profiler("D_OPENMAX::forwardMagnitude [f]");
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = i * 2;
+            int imagi = reali + 1;
+            float real = i2f(m_packed[reali]) * m_size;
+            float imag = i2f(m_packed[imagi]) * m_size;
+            magOut[i] = sqrtf(real * real + imag * imag);
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inverse [d]");
+        if (!m_packed) initDouble();
+        packDouble(realIn, imagIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inverseInterleaved [d]");
+        if (!m_packed) initDouble();
+        convertDouble(complexIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inversePolar [d]");
+        if (!m_packed) initDouble();
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real, imag;
+            c_phasor(&real, &imag, phaseIn[i]);
+            m_fbuf[index++] = float(real);
+            m_fbuf[index++] = float(imag);
+        }
+        convertFloat(m_fbuf);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        Profiler profiler("D_OPENMAX::inverseCepstral [d]");
+        if (!m_packed) initDouble();
+        //!!! implement
+    }
+    
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inverse [f]");
+        if (!m_packed) initFloat();
+        packFloat(realIn, imagIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inverse [f]");
+        if (!m_packed) initFloat();
+        convertFloat(complexIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        Profiler profiler("D_OPENMAX::inversePolar [f]");
+        if (!m_packed) initFloat();
+        const int hs = m_size/2;
+        v_polar_to_cartesian_interleaved(m_fbuf, magIn, phaseIn, hs+1);
+        convertFloat(m_fbuf);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        Profiler profiler("D_OPENMAX::inverseCepstral [f]");
+        if (!m_packed) initFloat();
+        //!!! implement
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    OMX_S32 *m_packed;
+    OMX_S32 *m_buf;
+    float *m_fbuf;
+    OMXFFTSpec_R_S32 *m_spec;
+
+};
+
+#endif /* HAVE_OPENMAX */
+
+#ifdef HAVE_FFTW3
+
+/*
+ Define FFTW_DOUBLE_ONLY to make all uses of FFTW functions be
+ double-precision (so "float" FFTs are calculated by casting to
+ doubles and using the double-precision FFTW function).
+
+ Define FFTW_SINGLE_ONLY to make all uses of FFTW functions be
+ single-precision (so "double" FFTs are calculated by casting to
+ floats and using the single-precision FFTW function).
+
+ Neither of these flags is desirable for either performance or
+ precision. The main reason to define either flag is to avoid linking
+ against both fftw3 and fftw3f libraries.
+*/
+
+//#define FFTW_DOUBLE_ONLY 1
+//#define FFTW_SINGLE_ONLY 1
+
+#if defined(FFTW_DOUBLE_ONLY) && defined(FFTW_SINGLE_ONLY)
+// Can't meaningfully define both
+#error Can only define one of FFTW_DOUBLE_ONLY and FFTW_SINGLE_ONLY
+#endif
+
+#if defined(FFTW_FLOAT_ONLY)
+#warning FFTW_FLOAT_ONLY is deprecated, use FFTW_SINGLE_ONLY instead
+#define FFTW_SINGLE_ONLY 1
+#endif
+
+#ifdef FFTW_DOUBLE_ONLY
+#define fft_float_type double
+#define fftwf_complex fftw_complex
+#define fftwf_plan fftw_plan
+#define fftwf_plan_dft_r2c_1d fftw_plan_dft_r2c_1d
+#define fftwf_plan_dft_c2r_1d fftw_plan_dft_c2r_1d
+#define fftwf_destroy_plan fftw_destroy_plan
+#define fftwf_malloc fftw_malloc
+#define fftwf_free fftw_free
+#define fftwf_execute fftw_execute
+#define atan2f atan2
+#define sqrtf sqrt
+#define cosf cos
+#define sinf sin
+#else
+#define fft_float_type float
+#endif /* FFTW_DOUBLE_ONLY */
+
+#ifdef FFTW_SINGLE_ONLY
+#define fft_double_type float
+#define fftw_complex fftwf_complex
+#define fftw_plan fftwf_plan
+#define fftw_plan_dft_r2c_1d fftwf_plan_dft_r2c_1d
+#define fftw_plan_dft_c2r_1d fftwf_plan_dft_c2r_1d
+#define fftw_destroy_plan fftwf_destroy_plan
+#define fftw_malloc fftwf_malloc
+#define fftw_free fftwf_free
+#define fftw_execute fftwf_execute
+#define atan2 atan2f
+#define sqrt sqrtf
+#define cos cosf
+#define sin sinf
+#else
+#define fft_double_type double
+#endif /* FFTW_SINGLE_ONLY */
+
+class D_FFTW : public FFTImpl
+{
+public:
+    D_FFTW(int size) :
+        m_fplanf(0), m_dplanf(0), m_size(size)
+    {
+    }
+
+    ~D_FFTW() {
+        if (m_fplanf) {
+#ifndef NO_THREADING
+            m_commonMutex.lock();
+#endif
+            bool save = false;
+            if (m_extantf > 0 && --m_extantf == 0) save = true;
+#ifndef FFTW_DOUBLE_ONLY
+            if (save) saveWisdom('f');
+#endif
+            fftwf_destroy_plan(m_fplanf);
+            fftwf_destroy_plan(m_fplani);
+            fftwf_free(m_fbuf);
+            fftwf_free(m_fpacked);
+#ifndef NO_THREADING
+            m_commonMutex.unlock();
+#endif
+        }
+        if (m_dplanf) {
+#ifndef NO_THREADING
+            m_commonMutex.lock();
+#endif
+            bool save = false;
+            if (m_extantd > 0 && --m_extantd == 0) save = true;
+#ifndef FFTW_SINGLE_ONLY
+            if (save) saveWisdom('d');
+#endif
+            fftw_destroy_plan(m_dplanf);
+            fftw_destroy_plan(m_dplani);
+            fftw_free(m_dbuf);
+            fftw_free(m_dpacked);
+#ifndef NO_THREADING
+            m_commonMutex.unlock();
+#endif
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+#ifdef FFTW_SINGLE_ONLY
+        return FFT::SinglePrecision;
+#else
+#ifdef FFTW_DOUBLE_ONLY
+        return FFT::DoublePrecision;
+#else
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+#endif
+#endif
+    }
+
+    void initFloat() {
+        if (m_fplanf) return;
+        bool load = false;
+#ifndef NO_THREADING
+        m_commonMutex.lock();
+#endif
+        if (m_extantf++ == 0) load = true;
+#ifdef FFTW_DOUBLE_ONLY
+        if (load) loadWisdom('d');
+#else
+        if (load) loadWisdom('f');
+#endif
+        m_fbuf = (fft_float_type *)fftw_malloc(m_size * sizeof(fft_float_type));
+        m_fpacked = (fftwf_complex *)fftw_malloc
+            ((m_size/2 + 1) * sizeof(fftwf_complex));
+        m_fplanf = fftwf_plan_dft_r2c_1d
+            (m_size, m_fbuf, m_fpacked, FFTW_MEASURE);
+        m_fplani = fftwf_plan_dft_c2r_1d
+            (m_size, m_fpacked, m_fbuf, FFTW_MEASURE);
+#ifndef NO_THREADING
+        m_commonMutex.unlock();
+#endif
+    }
+
+    void initDouble() {
+        if (m_dplanf) return;
+        bool load = false;
+#ifndef NO_THREADING
+        m_commonMutex.lock();
+#endif
+        if (m_extantd++ == 0) load = true;
+#ifdef FFTW_SINGLE_ONLY
+        if (load) loadWisdom('f');
+#else
+        if (load) loadWisdom('d');
+#endif
+        m_dbuf = (fft_double_type *)fftw_malloc(m_size * sizeof(fft_double_type));
+        m_dpacked = (fftw_complex *)fftw_malloc
+            ((m_size/2 + 1) * sizeof(fftw_complex));
+        m_dplanf = fftw_plan_dft_r2c_1d
+            (m_size, m_dbuf, m_dpacked, FFTW_MEASURE);
+        m_dplani = fftw_plan_dft_c2r_1d
+            (m_size, m_dpacked, m_dbuf, FFTW_MEASURE);
+#ifndef NO_THREADING
+        m_commonMutex.unlock();
+#endif
+    }
+
+    void loadWisdom(char type) { wisdom(false, type); }
+    void saveWisdom(char type) { wisdom(true, type); }
+
+    void wisdom(bool save, char type) {
+
+#ifdef FFTW_DOUBLE_ONLY
+        if (type == 'f') return;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+        if (type == 'd') return;
+#endif
+
+        const char *home = getenv("HOME");
+        if (!home) return;
+
+        char fn[256];
+        snprintf(fn, 256, "%s/%s.%c", home, ".rubberband.wisdom", type);
+
+        FILE *f = fopen(fn, save ? "wb" : "rb");
+        if (!f) return;
+
+        if (save) {
+            switch (type) {
+#ifdef FFTW_DOUBLE_ONLY
+            case 'f': break;
+#else
+            case 'f': fftwf_export_wisdom_to_file(f); break;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+            case 'd': break;
+#else
+            case 'd': fftw_export_wisdom_to_file(f); break;
+#endif
+            default: break;
+            }
+        } else {
+            switch (type) {
+#ifdef FFTW_DOUBLE_ONLY
+            case 'f': break;
+#else
+            case 'f': fftwf_import_wisdom_from_file(f); break;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+            case 'd': break;
+#else
+            case 'd': fftw_import_wisdom_from_file(f); break;
+#endif
+            default: break;
+            }
+        }
+
+        fclose(f);
+    }
+
+    void packFloat(const float *R__ re, const float *R__ im) {
+        const int hs = m_size/2;
+        fftwf_complex *const R__ fpacked = m_fpacked; 
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                fpacked[i][1] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                fpacked[i][1] = 0.f;
+            }
+        }                
+    }
+
+    void packDouble(const double *R__ re, const double *R__ im) {
+        const int hs = m_size/2;
+        fftw_complex *const R__ dpacked = m_dpacked; 
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                dpacked[i][1] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                dpacked[i][1] = 0.0;
+            }
+        }
+    }
+
+    void unpackFloat(float *R__ re, float *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[i][0];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_fpacked[i][1];
+            }
+        }
+    }        
+
+    void unpackDouble(double *R__ re, double *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[i][0];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_dpacked[i][1];
+            }
+        }
+    }        
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        if (!m_dplanf) initDouble();
+        const int sz = m_size;
+        fft_double_type *const R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        if (!m_dplanf) initDouble();
+        const int sz = m_size;
+        fft_double_type *const R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        v_convert(complexOut, (fft_double_type *)m_dpacked, sz + 2);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const R__ dbuf = m_dbuf;
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         (double *)m_dpacked, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const R__ dbuf = m_dbuf;
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != m_dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_dpacked[i][0] * m_dpacked[i][0] +
+                             m_dpacked[i][1] * m_dpacked[i][1]);
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        v_convert(complexOut, (fft_float_type *)m_fpacked, sz + 2);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         (float *)m_fpacked, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i][0] * m_fpacked[i][0] +
+                              m_fpacked[i][1] * m_fpacked[i][1]);
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, imagIn);
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        v_convert((double *)m_dpacked, complexIn, m_size + 2);
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        fftw_complex *const R__ dpacked = m_dpacked;
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = magIn[i] * cos(phaseIn[i]);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][1] = magIn[i] * sin(phaseIn[i]);
+        }
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const R__ dbuf = m_dbuf;
+        fftw_complex *const R__ dpacked = m_dpacked;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = log(magIn[i] + 0.000001);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][1] = 0.0;
+        }
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (cepOut != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                cepOut[i] = dbuf[i];
+            }
+    }
+
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, imagIn);
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        v_copy((float *)m_fpacked, complexIn, m_size + 2);
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        fftwf_complex *const R__ fpacked = m_fpacked;
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = magIn[i] * cosf(phaseIn[i]);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][1] = magIn[i] * sinf(phaseIn[i]);
+        }
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        fftwf_complex *const R__ fpacked = m_fpacked;
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = logf(magIn[i] + 0.000001f);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][1] = 0.f;
+        }
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (cepOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                cepOut[i] = fbuf[i];
+            }
+    }
+
+private:
+    fftwf_plan m_fplanf;
+    fftwf_plan m_fplani;
+#ifdef FFTW_DOUBLE_ONLY
+    double *m_fbuf;
+#else
+    float *m_fbuf;
+#endif
+    fftwf_complex *m_fpacked;
+    fftw_plan m_dplanf;
+    fftw_plan m_dplani;
+#ifdef FFTW_SINGLE_ONLY
+    float *m_dbuf;
+#else
+    double *m_dbuf;
+#endif
+    fftw_complex *m_dpacked;
+    const int m_size;
+    static int m_extantf;
+    static int m_extantd;
+#ifndef NO_THREADING
+    static Mutex m_commonMutex;
+#endif
+};
+
+int
+D_FFTW::m_extantf = 0;
+
+int
+D_FFTW::m_extantd = 0;
+
+#ifndef NO_THREADING
+Mutex
+D_FFTW::m_commonMutex;
+#endif
+
+#endif /* HAVE_FFTW3 */
+
+#ifdef HAVE_SFFT
+
+/*
+ Define SFFT_DOUBLE_ONLY to make all uses of SFFT functions be
+ double-precision (so "float" FFTs are calculated by casting to
+ doubles and using the double-precision SFFT function).
+
+ Define SFFT_SINGLE_ONLY to make all uses of SFFT functions be
+ single-precision (so "double" FFTs are calculated by casting to
+ floats and using the single-precision SFFT function).
+
+ Neither of these flags is desirable for either performance or
+ precision.
+*/
+
+//#define SFFT_DOUBLE_ONLY 1
+//#define SFFT_SINGLE_ONLY 1
+
+#if defined(SFFT_DOUBLE_ONLY) && defined(SFFT_SINGLE_ONLY)
+// Can't meaningfully define both
+#error Can only define one of SFFT_DOUBLE_ONLY and SFFT_SINGLE_ONLY
+#endif
+
+#ifdef SFFT_DOUBLE_ONLY
+#define fft_float_type double
+#define FLAG_SFFT_FLOAT SFFT_DOUBLE
+#define atan2f atan2
+#define sqrtf sqrt
+#define cosf cos
+#define sinf sin
+#define logf log
+#else
+#define FLAG_SFFT_FLOAT SFFT_FLOAT
+#define fft_float_type float
+#endif /* SFFT_DOUBLE_ONLY */
+
+#ifdef SFFT_SINGLE_ONLY
+#define fft_double_type float
+#define FLAG_SFFT_DOUBLE SFFT_FLOAT
+#define atan2 atan2f
+#define sqrt sqrtf
+#define cos cosf
+#define sin sinf
+#define log logf
+#else
+#define FLAG_SFFT_DOUBLE SFFT_DOUBLE
+#define fft_double_type double
+#endif /* SFFT_SINGLE_ONLY */
+
+class D_SFFT : public FFTImpl
+{
+public:
+    D_SFFT(int size) :
+        m_fplanf(0), m_fplani(0), m_dplanf(0), m_dplani(0), m_size(size)
+    {
+    }
+
+    ~D_SFFT() {
+        if (m_fplanf) {
+            sfft_free(m_fplanf);
+            sfft_free(m_fplani);
+            deallocate(m_fbuf);
+            deallocate(m_fresult);
+        }
+        if (m_dplanf) {
+            sfft_free(m_dplanf);
+            sfft_free(m_dplani);
+            deallocate(m_dbuf);
+            deallocate(m_dresult);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+#ifdef SFFT_SINGLE_ONLY
+        return FFT::SinglePrecision;
+#else
+#ifdef SFFT_DOUBLE_ONLY
+        return FFT::DoublePrecision;
+#else
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+#endif
+#endif
+    }
+
+    void initFloat() {
+        if (m_fplanf) return;
+        m_fbuf = allocate<fft_float_type>(2 * m_size);
+        m_fresult = allocate<fft_float_type>(2 * m_size);
+        m_fplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_FLOAT);
+        m_fplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_FLOAT);
+        if (!m_fplanf || !m_fplani) {
+            if (!m_fplanf) {
+                std::cerr << "D_SFFT: Failed to construct forward float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            } else {
+                std::cerr << "D_SFFT: Failed to construct inverse float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            }
+#ifndef NO_EXCEPTIONS
+            throw FFT::InternalError;
+#else
+            abort();
+#endif
+        }
+    }
+
+    void initDouble() {
+        if (m_dplanf) return;
+        m_dbuf = allocate<fft_double_type>(2 * m_size);
+        m_dresult = allocate<fft_double_type>(2 * m_size);
+        m_dplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_DOUBLE);
+        m_dplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_DOUBLE);
+        if (!m_dplanf || !m_dplani) {
+            if (!m_dplanf) {
+                std::cerr << "D_SFFT: Failed to construct forward double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            } else {
+                std::cerr << "D_SFFT: Failed to construct inverse double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            }
+#ifndef NO_EXCEPTIONS
+            throw FFT::InternalError;
+#else
+            abort();
+#endif
+        }
+    }
+
+    void packFloat(const float *R__ re, const float *R__ im, fft_float_type *target, int n) {
+        for (int i = 0; i < n; ++i) target[i*2] = re[i];
+        if (im) {
+            for (int i = 0; i < n; ++i) target[i*2+1] = im[i]; 
+        } else {
+            for (int i = 0; i < n; ++i) target[i*2+1] = 0.f;
+        }                
+    }
+
+    void packDouble(const double *R__ re, const double *R__ im, fft_double_type *target, int n) {
+        for (int i = 0; i < n; ++i) target[i*2] = re[i];
+        if (im) {
+            for (int i = 0; i < n; ++i) target[i*2+1] = im[i];
+        } else {
+            for (int i = 0; i < n; ++i) target[i*2+1] = 0.0;
+        }                
+    }
+
+    void unpackFloat(const fft_float_type *source, float *R__ re, float *R__ im, int n) {
+        for (int i = 0; i < n; ++i) re[i] = source[i*2];
+        if (im) {
+            for (int i = 0; i < n; ++i) im[i] = source[i*2+1];
+        }
+    }        
+
+    void unpackDouble(const fft_double_type *source, double *R__ re, double *R__ im, int n) {
+        for (int i = 0; i < n; ++i) re[i] = source[i*2];
+        if (im) {
+            for (int i = 0; i < n; ++i) im[i] = source[i*2+1];
+        }
+    }        
+
+    template<typename T>
+    void mirror(T *R__ cplx, int n) {
+        for (int i = 1; i <= n/2; ++i) {
+            int j = n-i;
+            cplx[j*2] = cplx[i*2];
+            cplx[j*2+1] = -cplx[i*2+1];
+        }
+    }
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        unpackDouble(m_dresult, realOut, imagOut, m_size/2+1);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        v_convert(complexOut, m_dresult, m_size+2); // i.e. m_size/2+1 complex
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         m_dresult, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_dresult[i*2] * m_dresult[i*2] +
+                             m_dresult[i*2+1] * m_dresult[i*2+1]);
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        unpackFloat(m_fresult, realOut, imagOut, m_size/2+1);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        v_convert(complexOut, m_fresult, m_size+2); // i.e. m_size/2+1 complex
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         m_fresult, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fresult[i*2] * m_fresult[i*2] +
+                              m_fresult[i*2+1] * m_fresult[i*2+1]);
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, imagIn, m_dbuf, m_size/2+1);
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        v_convert((double *)m_dbuf, complexIn, m_size + 2);
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dbuf[i*2] = magIn[i] * cos(phaseIn[i]);
+            m_dbuf[i*2+1] = magIn[i] * sin(phaseIn[i]);
+        }
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dbuf[i*2] = log(magIn[i] + 0.000001);
+            m_dbuf[i*2+1] = 0.0;
+        }
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, imagIn, m_fbuf, m_size/2+1);
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        v_convert((float *)m_fbuf, complexIn, m_size + 2);
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fbuf[i*2] = magIn[i] * cosf(phaseIn[i]);
+            m_fbuf[i*2+1] = magIn[i] * sinf(phaseIn[i]);
+        }
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fbuf[i*2] = logf(magIn[i] + 0.00001);
+            m_fbuf[i*2+1] = 0.0f;
+        }
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_fresult[i*2];
+        }
+    }
+
+private:
+    sfft_plan_t *m_fplanf;
+    sfft_plan_t *m_fplani;
+    fft_float_type *m_fbuf;
+    fft_float_type *m_fresult;
+
+    sfft_plan_t *m_dplanf;
+    sfft_plan_t *m_dplani;
+    fft_double_type *m_dbuf;
+    fft_double_type *m_dresult;
+
+    const int m_size;
+};
+
+#endif /* HAVE_SFFT */
+
+#ifdef USE_KISSFFT
+
+class D_KISSFFT : public FFTImpl
+{
+public:
+    D_KISSFFT(int size) :
+        m_size(size),
+        m_fplanf(0),  
+        m_fplani(0)
+    {
+#ifdef FIXED_POINT
+#error KISSFFT is not configured for float values
+#endif
+        if (sizeof(kiss_fft_scalar) != sizeof(float)) {
+            std::cerr << "ERROR: KISSFFT is not configured for float values"
+                      << std::endl;
+        }
+
+        m_fbuf = new kiss_fft_scalar[m_size + 2];
+        m_fpacked = new kiss_fft_cpx[m_size + 2];
+        m_fplanf = kiss_fftr_alloc(m_size, 0, NULL, NULL);
+        m_fplani = kiss_fftr_alloc(m_size, 1, NULL, NULL);
+    }
+
+    ~D_KISSFFT() {
+        kiss_fftr_free(m_fplanf);
+        kiss_fftr_free(m_fplani);
+        kiss_fft_cleanup();
+
+        delete[] m_fbuf;
+        delete[] m_fpacked;
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision;
+    }
+
+    void initFloat() { }
+    void initDouble() { }
+
+    void packFloat(const float *R__ re, const float *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = 0.f;
+            }
+        }
+    }
+
+    void unpackFloat(float *R__ re, float *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[i].r;
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_fpacked[i].i;
+            }
+        }
+    }        
+
+    void packDouble(const double *R__ re, const double *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(re[i]);
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = float(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = 0.f;
+            }
+        }
+    }
+
+    void unpackDouble(double *R__ re, double *R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = double(m_fpacked[i].r);
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = double(m_fpacked[i].i);
+            }
+        }
+    }        
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+
+        v_convert(m_fbuf, realIn, m_size);
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+
+        v_convert(m_fbuf, realIn, m_size);
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+        v_convert(complexOut, (float *)m_fpacked, m_size + 2);
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+
+        for (int i = 0; i < m_size; ++i) {
+            m_fbuf[i] = float(realIn[i]);
+        }
+
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) +
+                             double(m_fpacked[i].i) * double(m_fpacked[i].i));
+        }
+
+        for (int i = 0; i <= hs; ++i) {
+            phaseOut[i] = atan2(double(m_fpacked[i].i), double(m_fpacked[i].r));
+        }
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+
+        for (int i = 0; i < m_size; ++i) {
+            m_fbuf[i] = float(realIn[i]);
+        }
+
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) +
+                             double(m_fpacked[i].i) * double(m_fpacked[i].i));
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+
+        kiss_fftr(m_fplanf, realIn, (kiss_fft_cpx *)complexOut);
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r +
+                              m_fpacked[i].i * m_fpacked[i].i);
+        }
+
+        for (int i = 0; i <= hs; ++i) {
+            phaseOut[i] = atan2f(m_fpacked[i].i, m_fpacked[i].r);
+        }
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r +
+                              m_fpacked[i].i * m_fpacked[i].i);
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+
+        packDouble(realIn, imagIn);
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+
+        v_convert((float *)m_fpacked, complexIn, m_size + 2);
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(magIn[i] * cos(phaseIn[i]));
+            m_fpacked[i].i = float(magIn[i] * sin(phaseIn[i]));
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(log(magIn[i] + 0.000001));
+            m_fpacked[i].i = 0.0f;
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_fbuf[i];
+        }
+    }
+    
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+
+        packFloat(realIn, imagIn);
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+
+        v_copy((float *)m_fpacked, complexIn, m_size + 2);
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = magIn[i] * cosf(phaseIn[i]);
+            m_fpacked[i].i = magIn[i] * sinf(phaseIn[i]);
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = logf(magIn[i] + 0.000001f);
+            m_fpacked[i].i = 0.0f;
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, cepOut);
+    }
+
+private:
+    const int m_size;
+    kiss_fftr_cfg m_fplanf;
+    kiss_fftr_cfg m_fplani;
+    kiss_fft_scalar *m_fbuf;
+    kiss_fft_cpx *m_fpacked;
+};
+
+#endif /* USE_KISSFFT */
+
+#ifdef USE_BUILTIN_FFT
+
+class D_Cross : public FFTImpl
+{
+public:
+    D_Cross(int size) : m_size(size), m_table(0) {
+        
+        m_a = new double[size];
+        m_b = new double[size];
+        m_c = new double[size];
+        m_d = new double[size];
+
+        m_table = new int[m_size];
+    
+        int bits;
+        int i, j, k, m;
+
+        for (i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                bits = i;
+                break;
+            }
+        }
+        
+        for (i = 0; i < m_size; ++i) {
+            
+            m = i;
+            
+            for (j = k = 0; j < bits; ++j) {
+                k = (k << 1) | (m & 1);
+                m >>= 1;
+            }
+            
+            m_table[i] = k;
+        }
+    }
+
+    ~D_Cross() {
+        delete[] m_table;
+        delete[] m_a;
+        delete[] m_b;
+        delete[] m_c;
+        delete[] m_d;
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::DoublePrecision;
+    }
+
+    void initFloat() { }
+    void initDouble() { }
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i];
+        if (imagOut) {
+            for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i];
+        }
+    }
+
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i];
+        for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i];
+    }
+
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+            phaseOut[i] = atan2(m_d[i], m_c[i]) ;
+        }
+    }
+
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+        }
+    }
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i];
+        if (imagOut) {
+            for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i];
+        }
+    }
+
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i];
+        for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i];
+    }
+
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+            phaseOut[i] = atan2(m_d[i], m_c[i]) ;
+        }
+    }
+
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+        }
+    }
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = realIn[i];
+            double imag = imagIn[i];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = complexIn[i*2];
+            double imag = complexIn[i*2+1];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = log(magIn[i] + 0.000001);
+            m_a[i] = real;
+            m_b[i] = 0.0;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = 0.0;
+            }
+        }
+        basefft(true, m_a, m_b, cepOut, m_d);
+    }
+
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = realIn[i];
+            float imag = imagIn[i];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = complexIn[i*2];
+            float imag = complexIn[i*2+1];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = magIn[i] * cosf(phaseIn[i]);
+            float imag = magIn[i] * sinf(phaseIn[i]);
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = logf(magIn[i] + 0.000001);
+            m_a[i] = real;
+            m_b[i] = 0.0;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = 0.0;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) cepOut[i] = m_c[i];
+    }
+
+private:
+    const int m_size;
+    int *m_table;
+    double *m_a;
+    double *m_b;
+    double *m_c;
+    double *m_d;
+    void basefft(bool inverse, const double *R__ ri, const double *R__ ii, double *R__ ro, double *R__ io);
+};
+
+void
+D_Cross::basefft(bool inverse, const double *R__ ri, const double *R__ ii, double *R__ ro, double *R__ io)
+{
+    if (!ri || !ro || !io) return;
+
+    int i, j, k, m;
+    int blockSize, blockEnd;
+
+    double tr, ti;
+
+    double angle = 2.0 * M_PI;
+    if (inverse) angle = -angle;
+
+    const int n = m_size;
+
+    if (ii) {
+	for (i = 0; i < n; ++i) {
+	    ro[m_table[i]] = ri[i];
+        }
+	for (i = 0; i < n; ++i) {
+	    io[m_table[i]] = ii[i];
+	}
+    } else {
+	for (i = 0; i < n; ++i) {
+	    ro[m_table[i]] = ri[i];
+        }
+	for (i = 0; i < n; ++i) {
+	    io[m_table[i]] = 0.0;
+	}
+    }
+
+    blockEnd = 1;
+
+    for (blockSize = 2; blockSize <= n; blockSize <<= 1) {
+
+	double delta = angle / (double)blockSize;
+	double sm2 = -sin(-2 * delta);
+	double sm1 = -sin(-delta);
+	double cm2 = cos(-2 * delta);
+	double cm1 = cos(-delta);
+	double w = 2 * cm1;
+	double ar[3], ai[3];
+
+	for (i = 0; i < n; i += blockSize) {
+
+	    ar[2] = cm2;
+	    ar[1] = cm1;
+
+	    ai[2] = sm2;
+	    ai[1] = sm1;
+
+	    for (j = i, m = 0; m < blockEnd; j++, m++) {
+
+		ar[0] = w * ar[1] - ar[2];
+		ar[2] = ar[1];
+		ar[1] = ar[0];
+
+		ai[0] = w * ai[1] - ai[2];
+		ai[2] = ai[1];
+		ai[1] = ai[0];
+
+		k = j + blockEnd;
+		tr = ar[0] * ro[k] - ai[0] * io[k];
+		ti = ar[0] * io[k] + ai[0] * ro[k];
+
+		ro[k] = ro[j] - tr;
+		io[k] = io[j] - ti;
+
+		ro[j] += tr;
+		io[j] += ti;
+	    }
+	}
+
+	blockEnd = blockSize;
+    }
+
+/* fftw doesn't rescale, so nor will we
+
+    if (inverse) {
+
+	double denom = (double)n;
+
+	for (i = 0; i < n; i++) {
+	    ro[i] /= denom;
+	    io[i] /= denom;
+	}
+    }
+*/
+}
+
+#endif /* USE_BUILTIN_FFT */
+
+} /* end namespace FFTs */
+
+std::string
+FFT::m_implementation;
+
+std::set<std::string>
+FFT::getImplementations()
+{
+    std::set<std::string> impls;
+#ifdef HAVE_IPP
+    impls.insert("ipp");
+#endif
+#ifdef HAVE_FFTW3
+    impls.insert("fftw");
+#endif
+#ifdef USE_KISSFFT
+    impls.insert("kissfft");
+#endif
+#ifdef HAVE_VDSP
+    impls.insert("vdsp");
+#endif
+#ifdef HAVE_MEDIALIB
+    impls.insert("medialib");
+#endif
+#ifdef HAVE_OPENMAX
+    impls.insert("openmax");
+#endif
+#ifdef HAVE_SFFT
+    impls.insert("sfft");
+#endif
+#ifdef USE_BUILTIN_FFT
+    impls.insert("cross");
+#endif
+    return impls;
+}
+
+void
+FFT::pickDefaultImplementation()
+{
+    if (m_implementation != "") return;
+
+    std::set<std::string> impls = getImplementations();
+
+    std::string best = "cross";
+    if (impls.find("kissfft") != impls.end()) best = "kissfft";
+    if (impls.find("medialib") != impls.end()) best = "medialib";
+    if (impls.find("openmax") != impls.end()) best = "openmax";
+    if (impls.find("sfft") != impls.end()) best = "sfft";
+    if (impls.find("fftw") != impls.end()) best = "fftw";
+    if (impls.find("vdsp") != impls.end()) best = "vdsp";
+    if (impls.find("ipp") != impls.end()) best = "ipp";
+    
+    m_implementation = best;
+}
+
+std::string
+FFT::getDefaultImplementation()
+{
+    return m_implementation;
+}
+
+void
+FFT::setDefaultImplementation(std::string i)
+{
+    m_implementation = i;
+}
+
+FFT::FFT(int size, int debugLevel) :
+    d(0)
+{
+    if ((size < 2) ||
+        (size & (size-1))) {
+        std::cerr << "FFT::FFT(" << size << "): power-of-two sizes only supported, minimum size 2" << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw InvalidSize;
+#else
+        abort();
+#endif
+    }
+
+    if (m_implementation == "") pickDefaultImplementation();
+    std::string impl = m_implementation;
+
+    if (debugLevel > 0) {
+        std::cerr << "FFT::FFT(" << size << "): using implementation: "
+                  << impl << std::endl;
+    }
+
+    if (impl == "ipp") {
+#ifdef HAVE_IPP
+        d = new FFTs::D_IPP(size);
+#endif
+    } else if (impl == "fftw") {
+#ifdef HAVE_FFTW3
+        d = new FFTs::D_FFTW(size);
+#endif
+    } else if (impl == "kissfft") {        
+#ifdef USE_KISSFFT
+        d = new FFTs::D_KISSFFT(size);
+#endif
+    } else if (impl == "vdsp") {
+#ifdef HAVE_VDSP
+        d = new FFTs::D_VDSP(size);
+#endif
+    } else if (impl == "medialib") {
+#ifdef HAVE_MEDIALIB
+        d = new FFTs::D_MEDIALIB(size);
+#endif
+    } else if (impl == "openmax") {
+#ifdef HAVE_OPENMAX
+        d = new FFTs::D_OPENMAX(size);
+#endif
+    } else if (impl == "sfft") {
+#ifdef HAVE_SFFT
+        d = new FFTs::D_SFFT(size);
+#endif
+    } else if (impl == "cross") {
+#ifdef USE_BUILTIN_FFT
+        d = new FFTs::D_Cross(size);
+#endif
+    }
+
+    if (!d) {
+        std::cerr << "FFT::FFT(" << size << "): ERROR: implementation "
+                  << impl << " is not compiled in" << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw InvalidImplementation;
+#else
+        abort();
+#endif
+    }
+}
+
+FFT::~FFT()
+{
+    delete d;
+}
+
+#ifndef NO_EXCEPTIONS
+#define CHECK_NOT_NULL(x) \
+    if (!(x)) { \
+        std::cerr << "FFT: ERROR: Null argument " #x << std::endl;  \
+        throw NullArgument; \
+    }
+#else
+#define CHECK_NOT_NULL(x) \
+    if (!(x)) { \
+        std::cerr << "FFT: ERROR: Null argument " #x << std::endl;  \
+        std::cerr << "FFT: Would be throwing NullArgument here, if exceptions were not disabled" << std::endl;  \
+        return; \
+    }
+#endif
+
+void
+FFT::forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(realOut);
+    CHECK_NOT_NULL(imagOut);
+    d->forward(realIn, realOut, imagOut);
+}
+
+void
+FFT::forwardInterleaved(const double *R__ realIn, double *R__ complexOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(complexOut);
+    d->forwardInterleaved(realIn, complexOut);
+}
+
+void
+FFT::forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    CHECK_NOT_NULL(phaseOut);
+    d->forwardPolar(realIn, magOut, phaseOut);
+}
+
+void
+FFT::forwardMagnitude(const double *R__ realIn, double *R__ magOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    d->forwardMagnitude(realIn, magOut);
+}
+
+void
+FFT::forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(realOut);
+    CHECK_NOT_NULL(imagOut);
+    d->forward(realIn, realOut, imagOut);
+}
+
+void
+FFT::forwardInterleaved(const float *R__ realIn, float *R__ complexOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(complexOut);
+    d->forwardInterleaved(realIn, complexOut);
+}
+
+void
+FFT::forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    CHECK_NOT_NULL(phaseOut);
+    d->forwardPolar(realIn, magOut, phaseOut);
+}
+
+void
+FFT::forwardMagnitude(const float *R__ realIn, float *R__ magOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    d->forwardMagnitude(realIn, magOut);
+}
+
+void
+FFT::inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(imagIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverse(realIn, imagIn, realOut);
+}
+
+void
+FFT::inverseInterleaved(const double *R__ complexIn, double *R__ realOut)
+{
+    CHECK_NOT_NULL(complexIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverseInterleaved(complexIn, realOut);
+}
+
+void
+FFT::inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(phaseIn);
+    CHECK_NOT_NULL(realOut);
+    d->inversePolar(magIn, phaseIn, realOut);
+}
+
+void
+FFT::inverseCepstral(const double *R__ magIn, double *R__ cepOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(cepOut);
+    d->inverseCepstral(magIn, cepOut);
+}
+
+void
+FFT::inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(imagIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverse(realIn, imagIn, realOut);
+}
+
+void
+FFT::inverseInterleaved(const float *R__ complexIn, float *R__ realOut)
+{
+    CHECK_NOT_NULL(complexIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverseInterleaved(complexIn, realOut);
+}
+
+void
+FFT::inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(phaseIn);
+    CHECK_NOT_NULL(realOut);
+    d->inversePolar(magIn, phaseIn, realOut);
+}
+
+void
+FFT::inverseCepstral(const float *R__ magIn, float *R__ cepOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(cepOut);
+    d->inverseCepstral(magIn, cepOut);
+}
+
+void
+FFT::initFloat() 
+{
+    d->initFloat();
+}
+
+void
+FFT::initDouble() 
+{
+    d->initDouble();
+}
+
+FFT::Precisions
+FFT::getSupportedPrecisions() const
+{
+    return d->getSupportedPrecisions();
+}
+
+#ifdef FFT_MEASUREMENT
+
+std::string
+FFT::tune()
+{
+    std::ostringstream os;
+    os << "FFT::tune()..." << std::endl;
+
+    std::vector<int> sizes;
+    std::map<FFTImpl *, int> candidates;
+    std::map<int, int> wins;
+
+    sizes.push_back(512);
+    sizes.push_back(1024);
+    sizes.push_back(4096);
+    
+    for (unsigned int si = 0; si < sizes.size(); ++si) {
+
+        int size = sizes[si];
+
+        while (!candidates.empty()) {
+            delete candidates.begin()->first;
+            candidates.erase(candidates.begin());
+        }
+
+        FFTImpl *d;
+        
+#ifdef HAVE_IPP
+        std::cerr << "Constructing new IPP FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_IPP(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 0;
+#endif
+        
+#ifdef HAVE_FFTW3
+        os << "Constructing new FFTW3 FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_FFTW(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 1;
+#endif
+
+#ifdef USE_KISSFFT
+        os << "Constructing new KISSFFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_KISSFFT(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 2;
+#endif        
+
+#ifdef USE_BUILTIN_FFT
+        os << "Constructing new Cross FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_Cross(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 3;
+#endif
+        
+#ifdef HAVE_VDSP
+        os << "Constructing new vDSP FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_VDSP(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 4;
+#endif
+        
+#ifdef HAVE_MEDIALIB
+        std::cerr << "Constructing new MediaLib FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_MEDIALIB(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 5;
+#endif
+        
+#ifdef HAVE_OPENMAX
+        os << "Constructing new OpenMAX FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_OPENMAX(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 6;
+#endif
+        
+#ifdef HAVE_SFFT
+        os << "Constructing new SFFT FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_SFFT(size);
+//        d->initFloat();
+        d->initDouble();
+        candidates[d] = 6;
+#endif
+
+        os << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << std::endl;
+        float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+        
+        os << "Timing order is: ";
+        for (std::map<FFTImpl *, int>::iterator ci = candidates.begin();
+             ci != candidates.end(); ++ci) {
+            os << ci->second << " ";
+        }
+        os << std::endl;
+
+        int iterations = 500;
+        os << "Iterations: " << iterations << std::endl;
+
+        double *da = new double[size];
+        double *db = new double[size];
+        double *dc = new double[size];
+        double *dd = new double[size];
+        double *di = new double[size + 2];
+        double *dj = new double[size + 2];
+
+        float *fa = new float[size];
+        float *fb = new float[size];
+        float *fc = new float[size];
+        float *fd = new float[size];
+        float *fi = new float[size + 2];
+        float *fj = new float[size + 2];
+
+        for (int type = 0; type < 16; ++type) {
+    
+            //!!!
+            if ((type > 3 && type < 8) ||
+                (type > 11)) {
+                continue;
+            }
+
+            if (type > 7) {
+                // inverse transform: bigger inputs, to simulate the
+                // fact that the forward transform is unscaled
+                for (int i = 0; i < size; ++i) {
+                    da[i] = drand48() * size;
+                    fa[i] = da[i];
+                    db[i] = drand48() * size;
+                    fb[i] = db[i];
+                }
+            } else {    
+                for (int i = 0; i < size; ++i) {
+                    da[i] = drand48();
+                    fa[i] = da[i];
+                    db[i] = drand48();
+                    fb[i] = db[i];
+                }
+            }
+                
+            for (int i = 0; i < size + 2; ++i) {
+                di[i] = drand48();
+                fi[i] = di[i];
+            }
+
+            int low = -1;
+            int lowscore = 0;
+
+            const char *names[] = {
+
+                "Forward Cartesian Double",
+                "Forward Interleaved Double",
+                "Forward Polar Double",
+                "Forward Magnitude Double",
+                "Forward Cartesian Float",
+                "Forward Interleaved Float",
+                "Forward Polar Float",
+                "Forward Magnitude Float",
+
+                "Inverse Cartesian Double",
+                "Inverse Interleaved Double",
+                "Inverse Polar Double",
+                "Inverse Cepstral Double",
+                "Inverse Cartesian Float",
+                "Inverse Interleaved Float",
+                "Inverse Polar Float",
+                "Inverse Cepstral Float"
+            };
+            os << names[type] << " :: ";
+
+            for (std::map<FFTImpl *, int>::iterator ci = candidates.begin();
+                 ci != candidates.end(); ++ci) {
+
+                FFTImpl *d = ci->first;
+
+                double mean = 0;
+
+                clock_t start = clock();
+                
+                for (int i = 0; i < iterations; ++i) {
+
+                    if (i == 0) {
+                        for (int j = 0; j < size; ++j) {
+                            dc[j] = 0;
+                            dd[j] = 0;
+                            fc[j] = 0;
+                            fd[j] = 0;
+                            fj[j] = 0;
+                            dj[j] = 0;
+                        }
+                    }
+
+                    switch (type) {
+                    case 0: d->forward(da, dc, dd); break;
+                    case 1: d->forwardInterleaved(da, dj); break;
+                    case 2: d->forwardPolar(da, dc, dd); break;
+                    case 3: d->forwardMagnitude(da, dc); break;
+                    case 4: d->forward(fa, fc, fd); break;
+                    case 5: d->forwardInterleaved(fa, fj); break;
+                    case 6: d->forwardPolar(fa, fc, fd); break;
+                    case 7: d->forwardMagnitude(fa, fc); break;
+                    case 8: d->inverse(da, db, dc); break;
+                    case 9: d->inverseInterleaved(di, dc); break;
+                    case 10: d->inversePolar(da, db, dc); break;
+                    case 11: d->inverseCepstral(da, dc); break;
+                    case 12: d->inverse(fa, fb, fc); break;
+                    case 13: d->inverseInterleaved(fi, fc); break;
+                    case 14: d->inversePolar(fa, fb, fc); break;
+                    case 15: d->inverseCepstral(fa, fc); break;
+                    }
+
+                    if (i == 0) {
+                        mean = 0;
+                        for (int j = 0; j < size; ++j) {
+                            mean += dc[j];
+                            mean += dd[j];
+                            mean += fc[j];
+                            mean += fd[j];
+                            mean += fj[j];
+                            mean += dj[j];
+                        }
+                        mean /= size * 6;
+                    }
+                }
+
+                clock_t end = clock();
+
+                os << float(end - start)/divisor << " (" << mean << ") ";
+
+                if (low == -1 || (end - start) < lowscore) {
+                    low = ci->second;
+                    lowscore = end - start;
+                }
+            }
+
+            os << std::endl;
+
+            os << "  size " << size << ", type " << type << ": fastest is " << low << " (time " << float(lowscore)/divisor << ")" << std::endl;
+
+            wins[low]++;
+        }
+        
+        delete[] fa;
+        delete[] fb;
+        delete[] fc;
+        delete[] fd;
+        delete[] da;
+        delete[] db;
+        delete[] dc;
+        delete[] dd;
+    }
+
+    while (!candidates.empty()) {
+        delete candidates.begin()->first;
+        candidates.erase(candidates.begin());
+    }
+
+    int bestscore = 0;
+    int best = -1;
+
+    for (std::map<int, int>::iterator wi = wins.begin(); wi != wins.end(); ++wi) {
+        if (best == -1 || wi->second > bestscore) {
+            best = wi->first;
+            bestscore = wi->second;
+        }
+    }
+
+    os << "overall winner is " << best << " with " << bestscore << " wins" << std::endl;
+
+    return os.str();
+}
+
+#endif
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/FFT.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/FFT.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,129 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_FFT_H_
+#define _RUBBERBAND_FFT_H_
+
+#include "system/sysutils.h"
+
+#include <string>
+#include <set>
+
+namespace RubberBand {
+
+class FFTImpl;
+
+/**
+ * Provide the basic FFT computations we need, using one of a set of
+ * candidate FFT implementations (depending on compile flags).
+ *
+ * Implements real->complex FFTs of power-of-two sizes only.  Note
+ * that only the first half of the output signal is returned (the
+ * complex conjugates half is omitted), so the "complex" arrays need
+ * room for size/2+1 elements.
+ *
+ * The "interleaved" functions use the format sometimes called CCS --
+ * size/2+1 real+imaginary pairs.  So, the array elements at indices 1
+ * and size+1 will always be zero (since the signal is real).
+ * 
+ * All pointer arguments must point to valid data. A NullArgument
+ * exception is thrown if any argument is NULL.
+ *
+ * Neither forward nor inverse transform is scaled.
+ *
+ * This class is reentrant but not thread safe: use a separate
+ * instance per thread, or use a mutex.
+ */
+class FFT
+{
+public:
+    enum Exception {
+        NullArgument, InvalidSize, InvalidImplementation, InternalError
+    };
+
+    FFT(int size, int debugLevel = 0); // may throw InvalidSize
+    ~FFT();
+
+    void forward(const double *R__ realIn, double *R__ realOut, double *R__ imagOut);
+    void forwardInterleaved(const double *R__ realIn, double *R__ complexOut);
+    void forwardPolar(const double *R__ realIn, double *R__ magOut, double *R__ phaseOut);
+    void forwardMagnitude(const double *R__ realIn, double *R__ magOut);
+
+    void forward(const float *R__ realIn, float *R__ realOut, float *R__ imagOut);
+    void forwardInterleaved(const float *R__ realIn, float *R__ complexOut);
+    void forwardPolar(const float *R__ realIn, float *R__ magOut, float *R__ phaseOut);
+    void forwardMagnitude(const float *R__ realIn, float *R__ magOut);
+
+    void inverse(const double *R__ realIn, const double *R__ imagIn, double *R__ realOut);
+    void inverseInterleaved(const double *R__ complexIn, double *R__ realOut);
+    void inversePolar(const double *R__ magIn, const double *R__ phaseIn, double *R__ realOut);
+    void inverseCepstral(const double *R__ magIn, double *R__ cepOut);
+
+    void inverse(const float *R__ realIn, const float *R__ imagIn, float *R__ realOut);
+    void inverseInterleaved(const float *R__ complexIn, float *R__ realOut);
+    void inversePolar(const float *R__ magIn, const float *R__ phaseIn, float *R__ realOut);
+    void inverseCepstral(const float *R__ magIn, float *R__ cepOut);
+
+    // Calling one or both of these is optional -- if neither is
+    // called, the first call to a forward or inverse method will call
+    // init().  You only need call these if you don't want to risk
+    // expensive allocations etc happening in forward or inverse.
+    void initFloat();
+    void initDouble();
+
+    enum Precision {
+        SinglePrecision = 0x1,
+        DoublePrecision = 0x2
+    };
+    typedef int Precisions;
+
+    /**
+     * Return the OR of all precisions supported by this
+     * implementation. All of the functions (float and double) are
+     * available regardless of the supported implementations, but they
+     * will be calculated at the proper precision only if it is
+     * available. (So float functions will be calculated using doubles
+     * and then truncated if single-precision is unavailable, and
+     * double functions will use single-precision arithmetic if double
+     * is unavailable.)
+     */
+    Precisions getSupportedPrecisions() const;
+
+    static std::set<std::string> getImplementations();
+    static std::string getDefaultImplementation();
+    static void setDefaultImplementation(std::string);
+
+#ifdef FFT_MEASUREMENT
+    static std::string tune();
+#endif
+
+protected:
+    FFTImpl *d;
+    static std::string m_implementation;
+    static void pickDefaultImplementation();
+};
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/MovingMedian.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/MovingMedian.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,104 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _MOVING_MEDIAN_H_
+#define _MOVING_MEDIAN_H_
+
+#include "SampleFilter.h"
+
+#include "system/Allocators.h"
+
+#include <algorithm>
+
+namespace RubberBand
+{
+
+template <typename T>
+class MovingMedian : public SampleFilter<T>
+{
+    typedef SampleFilter<T> P;
+
+public:
+    MovingMedian(int size, float percentile = 50.f) :
+        SampleFilter<T>(size),
+	m_frame(allocate_and_zero<T>(size)),
+	m_sorted(allocate_and_zero<T>(size)),
+	m_sortend(m_sorted + P::m_size - 1) {
+        setPercentile(percentile);
+    }
+
+    ~MovingMedian() { 
+	deallocate(m_frame);
+	deallocate(m_sorted);
+    }
+
+    void setPercentile(float p) {
+        m_index = int((P::m_size * p) / 100.f);
+        if (m_index >= P::m_size) m_index = P::m_size-1;
+        if (m_index < 0) m_index = 0;
+    }
+
+    void push(T value) {
+	drop(m_frame[0]);
+	v_move(m_frame, m_frame+1, P::m_size-1);
+	m_frame[P::m_size-1] = value;
+	put(value);
+    }
+
+    T get() const {
+	return m_sorted[m_index];
+    }
+
+    void reset() {
+	v_zero(m_frame, P::m_size);
+	v_zero(m_sorted, P::m_size);
+    }
+
+private:
+    T *const m_frame;
+    T *const m_sorted;
+    T *const m_sortend;
+    int m_index;
+
+    void put(T value) {
+	// precondition: m_sorted contains m_size-1 values, packed at start
+	// postcondition: m_sorted contains m_size values, one of which is value
+	T *index = std::lower_bound(m_sorted, m_sortend, value);
+	v_move(index + 1, index, m_sortend - index);
+	*index = value;
+    }
+
+    void drop(T value) {
+	// precondition: m_sorted contains m_size values, one of which is value
+	// postcondition: m_sorted contains m_size-1 values, packed at start
+	T *index = std::lower_bound(m_sorted, m_sortend + 1, value);
+	assert(*index == value);
+	v_move(index, index + 1, m_sortend - index);
+	*m_sortend = T(0);
+    }
+};
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/Resampler.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/Resampler.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1189 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "Resampler.h"
+#include "base/Profiler.h"
+
+#include <cstdlib>
+#include <cmath>
+
+#include <iostream>
+
+#include "system/Allocators.h"
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#include <ippsr.h>
+#include <ippac.h>
+#endif
+
+#ifdef HAVE_LIBSAMPLERATE
+#include <samplerate.h>
+#endif
+
+#ifdef HAVE_LIBRESAMPLE
+#include <libresample.h>
+#endif
+
+#ifdef USE_SPEEX
+#include "speex/speex_resampler.h"
+#endif
+
+#ifndef HAVE_IPP
+#ifndef HAVE_LIBSAMPLERATE
+#ifndef HAVE_LIBRESAMPLE
+#ifndef USE_SPEEX
+#error No resampler implementation selected!
+#endif
+#endif
+#endif
+#endif
+
+namespace RubberBand {
+
+class ResamplerImpl
+{
+public:
+    virtual ~ResamplerImpl() { }
+    
+    virtual int resample(const float *const R__ *const R__ in, 
+                         float *const R__ *const R__ out,
+                         int incount,
+                         float ratio,
+                         bool final) = 0;
+    
+    virtual int resampleInterleaved(const float *const R__ in, 
+                                    float *const R__ out,
+                                    int incount,
+                                    float ratio,
+                                    bool final) = 0;
+
+    virtual int getChannelCount() const = 0;
+
+    virtual void reset() = 0;
+};
+
+namespace Resamplers {
+
+#ifdef HAVE_IPP
+
+class D_IPP : public ResamplerImpl
+{
+public:
+    D_IPP(Resampler::Quality quality, int channels, int maxBufferSize,
+          int debugLevel);
+    ~D_IPP();
+
+    int resample(const float *const R__ *const R__ in,
+                 float *const R__ *const R__ out,
+                 int incount,
+                 float ratio,
+                 bool final);
+
+    int resampleInterleaved(const float *const R__ in,
+                            float *const R__ out,
+                            int incount,
+                            float ratio,
+                            bool final = false);
+
+    int getChannelCount() const { return m_channels; }
+
+    void reset();
+
+protected:
+    IppsResamplingPolyphase_32f **m_state;
+    float **m_inbuf;
+    size_t m_inbufsz;
+    float **m_outbuf;
+    size_t m_outbufsz;
+    int m_bufsize;
+    int m_channels;
+    int m_window;
+    float m_factor;
+    int m_history;
+    int *m_lastread;
+    double *m_time;
+    int m_debugLevel;
+    
+    void setBufSize(int);
+};
+
+D_IPP::D_IPP(Resampler::Quality quality, int channels, int maxBufferSize,
+             int debugLevel) :
+    m_state(0),
+    m_channels(channels),
+    m_debugLevel(debugLevel)
+{
+    if (m_debugLevel > 0) {
+        std::cerr << "Resampler::Resampler: using IPP implementation"
+                  << std::endl;
+    }
+
+    int nStep;
+    IppHintAlgorithm hint;
+
+    switch (quality) {
+
+    case Resampler::Best:
+        m_window = 64;
+        nStep = 80;
+        hint = ippAlgHintAccurate;
+        break;
+
+    case Resampler::FastestTolerable:
+//        m_window = 48;
+        nStep = 16;
+        m_window = 16;
+//        nStep = 8;
+        hint = ippAlgHintFast;
+        break;
+
+    case Resampler::Fastest:
+        m_window = 24;
+        nStep = 64;
+        hint = ippAlgHintFast;
+        break;
+    }
+
+    m_factor = 8; // initial upper bound on m_ratio, may be amended later
+    m_history = int(m_window * 0.5 * std::max(1.0, 1.0 / m_factor)) + 1;
+
+    m_state = new IppsResamplingPolyphase_32f *[m_channels];
+
+    m_lastread = new int[m_channels];
+    m_time = new double[m_channels];
+
+    m_bufsize = maxBufferSize + m_history;
+
+    if (m_debugLevel > 1) {
+        std::cerr << "bufsize = " << m_bufsize << ", window = " << m_window << ", nStep = " << nStep << ", history = " << m_history << std::endl;
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+        ippsResamplePolyphaseInitAlloc_32f(&m_state[c],
+                                           float(m_window),
+                                           nStep,
+                                           0.95f,
+                                           9.0f,
+                                           hint);
+        m_lastread[c] = m_history;
+        m_time[c] = m_history;
+    }
+
+    m_inbufsz = m_bufsize + m_history + 2;
+    if (m_debugLevel > 1) {
+        std::cerr << "inbuf allocating " << m_bufsize << " + " << m_history << " + 2 = " << m_inbufsz << std::endl;
+    }
+
+    m_outbufsz = lrintf(ceil((m_bufsize - m_history) * m_factor + 2));
+    if (m_debugLevel > 1) {
+        std::cerr << "outbuf allocating (" << m_bufsize << " - " << m_history << ") * " << m_factor << " + 2 = " << m_outbufsz << std::endl;
+    }
+
+    m_inbuf  = allocate_and_zero_channels<float>(m_channels, m_inbufsz);
+    m_outbuf = allocate_and_zero_channels<float>(m_channels, m_outbufsz);
+
+    if (m_debugLevel > 1) {
+        std::cerr << "Resampler init done" << std::endl;
+    }
+}
+
+D_IPP::~D_IPP()
+{
+    for (int c = 0; c < m_channels; ++c) {
+        ippsResamplePolyphaseFree_32f(m_state[c]);
+    }
+
+    deallocate_channels(m_inbuf, m_channels);
+    deallocate_channels(m_outbuf, m_channels);
+
+    delete[] m_lastread;
+    delete[] m_time;
+    delete[] m_state;
+}
+
+void
+D_IPP::setBufSize(int sz)
+{
+    if (m_debugLevel > 1) {
+        std::cerr << "resize bufsize " << m_bufsize << " -> ";
+    }
+
+    m_bufsize = sz;
+
+    std::cerr << m_bufsize << std::endl;
+
+    int n1 = m_bufsize + m_history + 2;
+    int n2 = lrintf(ceil((m_bufsize - m_history) * m_factor + 2));
+
+    if (m_debugLevel > 1) {
+        std::cerr << "(outbufsize = " << n2 << ")" << std::endl;
+    }
+
+    m_inbuf = reallocate_and_zero_extend_channels
+        (m_inbuf, m_channels, m_inbufsz, m_channels, n1);
+
+    m_outbuf = reallocate_and_zero_extend_channels
+        (m_outbuf, m_channels, m_outbufsz, m_channels, n2);
+            
+    m_inbufsz = n1;
+    m_outbufsz = n2;
+}
+
+int
+D_IPP::resample(const float *const R__ *const R__ in,
+                float *const R__ *const R__ out,
+                int incount,
+                float ratio,
+                bool final)
+{
+    int outcount = 0;
+
+    if (ratio > m_factor) {
+        m_factor = ratio;
+        m_history = int(m_window * 0.5 * std::max(1.0, 1.0 / m_factor)) + 1;
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+        if (m_lastread[c] + incount + m_history > m_bufsize) {
+            setBufSize(m_lastread[c] + incount + m_history);
+        }
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+
+        for (int i = 0; i < incount; ++i) {
+            m_inbuf[c][m_lastread[c] + i] = in[c][i];
+        }
+        m_lastread[c] += incount;
+        
+        ippsResamplePolyphase_32f(m_state[c],
+                                  m_inbuf[c],
+                                  m_lastread[c] - m_history - int(m_time[c]),
+                                  m_outbuf[c],
+                                  ratio,
+                                  0.97f,
+                                  &m_time[c],
+                                  &outcount);
+
+        v_copy(out[c], m_outbuf[c], outcount);
+
+        ippsMove_32f(m_inbuf[c] + int(m_time[c]) - m_history,
+                     m_inbuf[c],
+                     m_lastread[c] + m_history - int(m_time[c]));
+
+        m_lastread[c] -= int(m_time[c]) - m_history;
+        m_time[c] -= int(m_time[c]) - m_history;
+
+        if (final) {
+
+            // Looks like this actually produces too many samples
+            // (additionalcount is a few samples too large).
+
+            // Also, we aren't likely to have enough space in the
+            // output buffer as the caller won't have allowed for
+            // all the samples we're retrieving here.
+
+            // What to do?
+
+            int additionalcount = 0;
+
+            for (int i = 0; i < m_history; ++i) {
+                m_inbuf[c][m_lastread[c] + i] = 0.f;
+            }
+            
+            ippsResamplePolyphase_32f(m_state[c],
+                                      m_inbuf[c],
+                                      m_lastread[c] - int(m_time[c]),
+                                      m_outbuf[c],
+                                      ratio,
+                                      0.97f,
+                                      &m_time[c],
+                                      &additionalcount);
+
+            if (m_debugLevel > 2) {
+                std::cerr << "incount = " << incount << ", outcount = " << outcount << ", additionalcount = " << additionalcount << ", sum " << outcount + additionalcount << ", est space = " << lrintf(ceil(incount * ratio)) <<std::endl;
+            }
+
+            v_copy(out[c] + outcount, m_outbuf[c], additionalcount);
+
+            outcount += additionalcount;
+        }
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+        ippsThreshold_32f_I(out[c], outcount, 1.f, ippCmpGreater);
+        ippsThreshold_32f_I(out[c], outcount, -1.f, ippCmpLess);
+    }
+
+    return outcount;
+}
+
+int
+D_IPP::resampleInterleaved(const float *const R__ in,
+                           float *const R__ out,
+                           int incount,
+                           float ratio,
+                           bool final)
+{
+    int outcount = 0;
+
+    if (ratio > m_factor) {
+        m_factor = ratio;
+        m_history = int(m_window * 0.5 * std::max(1.0, 1.0 / m_factor)) + 1;
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+        if (m_lastread[c] + incount + m_history > m_bufsize) {
+            setBufSize(m_lastread[c] + incount + m_history);
+        }
+    }
+
+    for (int c = 0; c < m_channels; ++c) {
+
+        for (int i = 0; i < incount; ++i) {
+            m_inbuf[c][m_lastread[c] + i] = in[i * m_channels + c];
+        }
+        m_lastread[c] += incount;
+        
+        ippsResamplePolyphase_32f(m_state[c],
+                                  m_inbuf[c],
+                                  m_lastread[c] - m_history - int(m_time[c]),
+                                  m_outbuf[c],
+                                  ratio,
+                                  0.97f,
+                                  &m_time[c],
+                                  &outcount);
+
+        ippsMove_32f(m_inbuf[c] + int(m_time[c]) - m_history,
+                     m_inbuf[c],
+                     m_lastread[c] + m_history - int(m_time[c]));
+
+        m_lastread[c] -= int(m_time[c]) - m_history;
+        m_time[c] -= int(m_time[c]) - m_history;
+    }
+
+    v_interleave(out, m_outbuf, m_channels, outcount);
+
+    if (final) {
+
+        // Looks like this actually produces too many samples
+        // (additionalcount is a few samples too large).
+
+        // Also, we aren't likely to have enough space in the
+        // output buffer as the caller won't have allowed for
+        // all the samples we're retrieving here.
+
+        // What to do?
+
+        int additionalcount = 0;
+        
+        for (int c = 0; c < m_channels; ++c) {
+
+            for (int i = 0; i < m_history; ++i) {
+                m_inbuf[c][m_lastread[c] + i] = 0.f;
+            }
+            
+            ippsResamplePolyphase_32f(m_state[c],
+                                      m_inbuf[c],
+                                      m_lastread[c] - int(m_time[c]),
+                                      m_outbuf[c],
+                                      ratio,
+                                      0.97f,
+                                      &m_time[c],
+                                      &additionalcount);
+
+            if (m_debugLevel > 2) {
+                std::cerr << "incount = " << incount << ", outcount = " << outcount << ", additionalcount = " << additionalcount << ", sum " << outcount + additionalcount << ", est space = " << lrintf(ceil(incount * ratio)) <<std::endl;
+            }
+        }
+
+        v_interleave(out + (outcount * m_channels),
+                     m_outbuf,
+                     m_channels,
+                     additionalcount);
+
+        outcount += additionalcount;
+    }
+
+    ippsThreshold_32f_I(out, outcount * m_channels, 1.f, ippCmpGreater);
+    ippsThreshold_32f_I(out, outcount * m_channels, -1.f, ippCmpLess);
+
+    return outcount;
+}
+
+void
+D_IPP::reset()
+{
+    //!!!
+}
+
+#endif /* HAVE_IPP */
+
+#ifdef HAVE_LIBSAMPLERATE
+
+class D_SRC : public ResamplerImpl
+{
+public:
+    D_SRC(Resampler::Quality quality, int channels, int maxBufferSize,
+          int m_debugLevel);
+    ~D_SRC();
+
+    int resample(const float *const R__ *const R__ in,
+                 float *const R__ *const R__ out,
+                 int incount,
+                 float ratio,
+                 bool final);
+
+    int resampleInterleaved(const float *const R__ in,
+                            float *const R__ out,
+                            int incount,
+                            float ratio,
+                            bool final = false);
+
+    int getChannelCount() const { return m_channels; }
+
+    void reset();
+
+protected:
+    SRC_STATE *m_src;
+    float *m_iin;
+    float *m_iout;
+    float m_lastRatio;
+    int m_channels;
+    int m_iinsize;
+    int m_ioutsize;
+    int m_debugLevel;
+};
+
+D_SRC::D_SRC(Resampler::Quality quality, int channels, int maxBufferSize,
+             int debugLevel) :
+    m_src(0),
+    m_iin(0),
+    m_iout(0),
+    m_lastRatio(1.f),
+    m_channels(channels),
+    m_iinsize(0),
+    m_ioutsize(0),
+    m_debugLevel(debugLevel)
+{
+    if (m_debugLevel > 0) {
+        std::cerr << "Resampler::Resampler: using libsamplerate implementation"
+                  << std::endl;
+    }
+
+    int err = 0;
+    m_src = src_new(quality == Resampler::Best ? SRC_SINC_BEST_QUALITY :
+                    quality == Resampler::Fastest ? SRC_LINEAR :
+                    SRC_SINC_FASTEST,
+                    channels, &err);
+
+    if (err) {
+        std::cerr << "Resampler::Resampler: failed to create libsamplerate resampler: " 
+                  << src_strerror(err) << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw Resampler::ImplementationError;
+#endif
+    }
+
+    if (maxBufferSize > 0 && m_channels > 1) {
+        m_iinsize = maxBufferSize * m_channels;
+        m_ioutsize = maxBufferSize * m_channels * 2;
+        m_iin = allocate<float>(m_iinsize);
+        m_iout = allocate<float>(m_ioutsize);
+    }
+
+    reset();
+}
+
+D_SRC::~D_SRC()
+{
+    src_delete(m_src);
+    deallocate(m_iin);
+    deallocate(m_iout);
+}
+
+int
+D_SRC::resample(const float *const R__ *const R__ in,
+                float *const R__ *const R__ out,
+                int incount,
+                float ratio,
+                bool final)
+{
+    SRC_DATA data;
+
+    int outcount = lrintf(ceilf(incount * ratio));
+
+    if (m_channels == 1) {
+        data.data_in = const_cast<float *>(*in); //!!!???
+        data.data_out = *out;
+    } else {
+        if (incount * m_channels > m_iinsize) {
+            m_iin = reallocate<float>(m_iin, m_iinsize, incount * m_channels);
+            m_iinsize = incount * m_channels;
+        }
+        if (outcount * m_channels > m_ioutsize) {
+            m_iout = reallocate<float>(m_iout, m_ioutsize, outcount * m_channels);
+            m_ioutsize = outcount * m_channels;
+        }
+        v_interleave(m_iin, in, m_channels, incount);
+        data.data_in = m_iin;
+        data.data_out = m_iout;
+    }
+
+    data.input_frames = incount;
+    data.output_frames = outcount;
+    data.src_ratio = ratio;
+    data.end_of_input = (final ? 1 : 0);
+
+    int err = src_process(m_src, &data);
+
+    if (err) {
+        std::cerr << "Resampler::process: libsamplerate error: "
+                  << src_strerror(err) << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw Resampler::ImplementationError;
+#endif
+    }
+
+    if (m_channels > 1) {
+        v_deinterleave(out, m_iout, m_channels, data.output_frames_gen);
+    }
+
+    m_lastRatio = ratio;
+
+    return data.output_frames_gen;
+}
+
+int
+D_SRC::resampleInterleaved(const float *const R__ in,
+                           float *const R__ out,
+                           int incount,
+                           float ratio,
+                           bool final)
+{
+    SRC_DATA data;
+
+    int outcount = lrintf(ceilf(incount * ratio));
+
+    data.data_in = const_cast<float *>(in);
+    data.data_out = out;
+
+    data.input_frames = incount;
+    data.output_frames = outcount;
+    data.src_ratio = ratio;
+    data.end_of_input = (final ? 1 : 0);
+
+    int err = src_process(m_src, &data);
+
+    if (err) {
+        std::cerr << "Resampler::process: libsamplerate error: "
+                  << src_strerror(err) << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw Resampler::ImplementationError;
+#endif
+    }
+
+    m_lastRatio = ratio;
+
+    return data.output_frames_gen;
+}
+
+void
+D_SRC::reset()
+{
+    src_reset(m_src);
+}
+
+#endif /* HAVE_LIBSAMPLERATE */
+
+#ifdef HAVE_LIBRESAMPLE
+
+class D_Resample : public ResamplerImpl
+{
+public:
+    D_Resample(Resampler::Quality quality, int channels, int maxBufferSize,
+          int m_debugLevel);
+    ~D_Resample();
+
+    int resample(const float *const R__ *const R__ in,
+                 float *const R__ *const R__ out,
+                 int incount,
+                 float ratio,
+                 bool final);
+
+    int resampleInterleaved(const float *const R__ in,
+                            float *const R__ out,
+                            int incount,
+                            float ratio,
+                            bool final);
+
+    int getChannelCount() const { return m_channels; }
+
+    void reset();
+
+protected:
+    void *m_src;
+    float *m_iin;
+    float *m_iout;
+    float m_lastRatio;
+    int m_channels;
+    int m_iinsize;
+    int m_ioutsize;
+    int m_debugLevel;
+};
+
+D_Resample::D_Resample(Resampler::Quality quality, int channels, int maxBufferSize,
+             int debugLevel) :
+    m_src(0),
+    m_iin(0),
+    m_iout(0),
+    m_lastRatio(1.f),
+    m_channels(channels),
+    m_iinsize(0),
+    m_ioutsize(0),
+    m_debugLevel(debugLevel)
+{
+    if (m_debugLevel > 0) {
+        std::cerr << "Resampler::Resampler: using libresample implementation"
+                  << std::endl;
+    }
+
+    float min_factor = 0.125f;
+    float max_factor = 8.0f;
+
+    m_src = resample_open(quality == Resampler::Best ? 1 : 0, min_factor, max_factor);
+
+    if (!m_src) {
+        std::cerr << "Resampler::Resampler: failed to create libresample resampler: " 
+                  << std::endl;
+        throw Resampler::ImplementationError; //!!! of course, need to catch this!
+    }
+
+    if (maxBufferSize > 0 && m_channels > 1) {
+        m_iinsize = maxBufferSize * m_channels;
+        m_ioutsize = maxBufferSize * m_channels * 2;
+        m_iin = allocate<float>(m_iinsize);
+        m_iout = allocate<float>(m_ioutsize);
+    }
+
+    reset();
+}
+
+D_Resample::~D_Resample()
+{
+    resample_close(m_src);
+    if (m_iinsize > 0) {
+        deallocate(m_iin);
+    }
+    if (m_ioutsize > 0) {
+        deallocate(m_iout);
+    }
+}
+
+int
+D_Resample::resample(const float *const R__ *const R__ in,
+                     float *const R__ *const R__ out,
+                     int incount,
+                     float ratio,
+                     bool final)
+{
+    float *data_in;
+    float *data_out;
+    int input_frames, output_frames, end_of_input, source_used;
+    float src_ratio;
+
+    int outcount = lrintf(ceilf(incount * ratio));
+
+    if (m_channels == 1) {
+        data_in = const_cast<float *>(*in); //!!!???
+        data_out = *out;
+    } else {
+        if (incount * m_channels > m_iinsize) {
+            m_iin = reallocate<float>(m_iin, m_iinsize, incount * m_channels);
+            m_iinsize = incount * m_channels;
+        }
+        if (outcount * m_channels > m_ioutsize) {
+            m_iout = reallocate<float>(m_iout, m_ioutsize, outcount * m_channels);
+            m_ioutsize = outcount * m_channels;
+        }
+        v_interleave(m_iin, in, m_channels, incount);
+        data_in = m_iin;
+        data_out = m_iout;
+    }
+
+    input_frames = incount;
+    output_frames = outcount;
+    src_ratio = ratio;
+    end_of_input = (final ? 1 : 0);
+
+    int output_frames_gen = resample_process(m_src,
+                                             src_ratio,
+                                             data_in,
+                                             input_frames,
+                                             end_of_input,
+                                             &source_used,
+                                             data_out,
+                                             output_frames);
+
+    if (output_frames_gen < 0) {
+        std::cerr << "Resampler::process: libresample error: "
+                  << std::endl;
+        throw Resampler::ImplementationError; //!!! of course, need to catch this!
+    }
+
+    if (m_channels > 1) {
+        v_deinterleave(out, m_iout, m_channels, output_frames_gen);
+    }
+
+    m_lastRatio = ratio;
+
+    return output_frames_gen;
+}
+
+int
+D_Resample::resampleInterleaved(const float *const R__ in,
+                                float *const R__ out,
+                                int incount,
+                                float ratio,
+                                bool final)
+{
+    int input_frames, output_frames, end_of_input, source_used;
+    float src_ratio;
+
+    int outcount = lrintf(ceilf(incount * ratio));
+
+    input_frames = incount;
+    output_frames = outcount;
+    src_ratio = ratio;
+    end_of_input = (final ? 1 : 0);
+
+    int output_frames_gen = resample_process(m_src,
+                                             src_ratio,
+                                             const_cast<float *>(in),
+                                             input_frames,
+                                             end_of_input,
+                                             &source_used,
+                                             out,
+                                             output_frames);
+
+    if (output_frames_gen < 0) {
+        std::cerr << "Resampler::process: libresample error: "
+                  << std::endl;
+        throw Resampler::ImplementationError; //!!! of course, need to catch this!
+    }
+
+    m_lastRatio = ratio;
+
+    return output_frames_gen;
+}
+
+void
+D_Resample::reset()
+{
+}
+
+#endif /* HAVE_LIBRESAMPLE */
+
+#ifdef USE_SPEEX
+    
+class D_Speex : public ResamplerImpl
+{
+public:
+    D_Speex(Resampler::Quality quality, int channels, int maxBufferSize,
+            int debugLevel);
+    ~D_Speex();
+
+    int resample(const float *const R__ *const R__ in,
+                 float *const R__ *const R__ out,
+                 int incount,
+                 float ratio,
+                 bool final);
+
+    int resampleInterleaved(const float *const R__ in,
+                            float *const R__ out,
+                            int incount,
+                            float ratio,
+                            bool final = false);
+
+    int getChannelCount() const { return m_channels; }
+
+    void reset();
+
+protected:
+    SpeexResamplerState *m_resampler;
+    float *m_iin;
+    float *m_iout;
+    int m_channels;
+    int m_iinsize;
+    int m_ioutsize;
+    float m_lastratio;
+    bool m_initial;
+    int m_debugLevel;
+
+    void setRatio(float);
+};
+
+D_Speex::D_Speex(Resampler::Quality quality, int channels, int maxBufferSize,
+                 int debugLevel) :
+    m_resampler(0),
+    m_iin(0),
+    m_iout(0),
+    m_channels(channels),
+    m_iinsize(0),
+    m_ioutsize(0),
+    m_lastratio(1),
+    m_initial(true),
+    m_debugLevel(debugLevel)
+{
+    int q = (quality == Resampler::Best ? 10 :
+             quality == Resampler::Fastest ? 0 : 4);
+
+    if (m_debugLevel > 0) {
+        std::cerr << "Resampler::Resampler: using Speex implementation with q = "
+                  << q 
+                  << std::endl;
+    }
+
+    int err = 0;
+    m_resampler = speex_resampler_init_frac(m_channels,
+                                            1, 1,
+                                            48000, 48000, // irrelevant
+                                            q,
+                                            &err);
+    
+
+    if (err) {
+        std::cerr << "Resampler::Resampler: failed to create Speex resampler" 
+                  << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw Resampler::ImplementationError;
+#endif
+    }
+
+    if (maxBufferSize > 0 && m_channels > 1) {
+        m_iinsize = maxBufferSize * m_channels;
+        m_ioutsize = maxBufferSize * m_channels * 2;
+        m_iin = allocate<float>(m_iinsize);
+        m_iout = allocate<float>(m_ioutsize);
+    }
+}
+
+D_Speex::~D_Speex()
+{
+    speex_resampler_destroy(m_resampler);
+    deallocate<float>(m_iin);
+    deallocate<float>(m_iout);
+}
+
+void
+D_Speex::setRatio(float ratio)
+{
+    // Speex wants a ratio of two unsigned integers, not a single
+    // float.  Let's do that.
+
+    unsigned int big = 272408136U; 
+    unsigned int denom = 1, num = 1;
+
+    if (ratio < 1.f) {
+        denom = big;
+        double dnum = double(big) * double(ratio);
+        num = (unsigned int)dnum;
+    } else if (ratio > 1.f) {
+        num = big;
+        double ddenom = double(big) / double(ratio);
+        denom = (unsigned int)ddenom;
+    }
+    
+    if (m_debugLevel > 1) {
+        std::cerr << "D_Speex: Desired ratio " << ratio << ", requesting ratio "
+                  << num << "/" << denom << " = " << float(double(num)/double(denom))
+                  << std::endl;
+    }
+    
+    int err = speex_resampler_set_rate_frac
+        (m_resampler, denom, num, 48000, 48000);
+    //!!! check err
+    
+    speex_resampler_get_ratio(m_resampler, &denom, &num);
+    
+    if (m_debugLevel > 1) {
+        std::cerr << "D_Speex: Desired ratio " << ratio << ", got ratio "
+                  << num << "/" << denom << " = " << float(double(num)/double(denom))
+                  << std::endl;
+    }
+    
+    m_lastratio = ratio;
+
+    if (m_initial) {
+        speex_resampler_skip_zeros(m_resampler);
+        m_initial = false;
+    }
+}
+
+int
+D_Speex::resample(const float *const R__ *const R__ in,
+                  float *const R__ *const R__ out,
+                  int incount,
+                  float ratio,
+                  bool final)
+{
+    if (ratio != m_lastratio) {
+        setRatio(ratio);
+    }
+
+    unsigned int uincount = incount;
+    unsigned int outcount = lrintf(ceilf(incount * ratio)); //!!! inexact now
+
+    float *data_in, *data_out;
+
+    if (m_channels == 1) {
+        data_in = const_cast<float *>(*in);
+        data_out = *out;
+    } else {
+        if (incount * m_channels > m_iinsize) {
+            m_iin = reallocate<float>(m_iin, m_iinsize, incount * m_channels);
+            m_iinsize = incount * m_channels;
+        }
+        if (outcount * m_channels > m_ioutsize) {
+            m_iout = reallocate<float>(m_iout, m_ioutsize, outcount * m_channels);
+            m_ioutsize = outcount * m_channels;
+        }
+        v_interleave(m_iin, in, m_channels, incount);
+        data_in = m_iin;
+        data_out = m_iout;
+    }
+
+    int err = speex_resampler_process_interleaved_float(m_resampler,
+                                                        data_in,
+                                                        &uincount,
+                                                        data_out,
+                                                        &outcount);
+
+//    if (incount != int(uincount)) {
+//        std::cerr << "Resampler: NOTE: Consumed " << uincount
+//                  << " of " << incount << " frames" << std::endl;
+//    }
+
+//    if (outcount != lrintf(ceilf(incount * ratio))) {
+//        std::cerr << "Resampler: NOTE: Obtained " << outcount
+//                  << " of " << lrintf(ceilf(incount * ratio)) << " frames"
+//                  << std::endl;
+//    }
+        
+    //!!! check err, respond appropriately
+
+    if (m_channels > 1) {
+        v_deinterleave(out, m_iout, m_channels, outcount);
+    }
+
+    return outcount;
+}
+
+int
+D_Speex::resampleInterleaved(const float *const R__ in,
+                             float *const R__ out,
+                             int incount,
+                             float ratio,
+                             bool final)
+{
+    if (ratio != m_lastratio) {
+        setRatio(ratio);
+    }
+
+    unsigned int uincount = incount;
+    unsigned int outcount = lrintf(ceilf(incount * ratio)); //!!! inexact now
+
+    float *data_in = const_cast<float *>(in);
+    float *data_out = out;
+
+    int err = speex_resampler_process_interleaved_float(m_resampler,
+                                                        data_in,
+                                                        &uincount,
+                                                        data_out,
+                                                        &outcount);
+
+    return outcount;
+}
+
+void
+D_Speex::reset()
+{
+    speex_resampler_reset_mem(m_resampler);
+}
+
+#endif
+
+} /* end namespace Resamplers */
+
+Resampler::Resampler(Resampler::Quality quality, int channels,
+                     int maxBufferSize, int debugLevel)
+{
+    m_method = -1;
+    
+    switch (quality) {
+
+    case Resampler::Best:
+#ifdef HAVE_IPP
+        m_method = 0;
+#endif
+#ifdef USE_SPEEX
+        m_method = 2;
+#endif
+#ifdef HAVE_LIBRESAMPLE
+        m_method = 3;
+#endif
+#ifdef HAVE_LIBSAMPLERATE
+        m_method = 1;
+#endif
+        break;
+
+    case Resampler::FastestTolerable:
+#ifdef HAVE_IPP
+        m_method = 0;
+#endif
+#ifdef HAVE_LIBRESAMPLE
+        m_method = 3;
+#endif
+#ifdef HAVE_LIBSAMPLERATE
+        m_method = 1;
+#endif
+#ifdef USE_SPEEX
+        m_method = 2;
+#endif
+        break;
+
+    case Resampler::Fastest:
+#ifdef HAVE_IPP
+        m_method = 0;
+#endif
+#ifdef HAVE_LIBRESAMPLE
+        m_method = 3;
+#endif
+#ifdef USE_SPEEX
+        m_method = 2;
+#endif
+#ifdef HAVE_LIBSAMPLERATE
+        m_method = 1;
+#endif
+        break;
+    }
+
+    if (m_method == -1) {
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize << "): No implementation available!"
+                  << std::endl;
+        abort();
+    }
+
+    switch (m_method) {
+    case 0:
+#ifdef HAVE_IPP
+        d = new Resamplers::D_IPP(quality, channels, maxBufferSize, debugLevel);
+#else
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize << "): No implementation available!"
+                  << std::endl;
+        abort();
+#endif
+        break;
+
+    case 1:
+#ifdef HAVE_LIBSAMPLERATE
+        d = new Resamplers::D_SRC(quality, channels, maxBufferSize, debugLevel);
+#else
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize << "): No implementation available!"
+                  << std::endl;
+        abort();
+#endif
+        break;
+
+    case 2:
+#ifdef USE_SPEEX
+        d = new Resamplers::D_Speex(quality, channels, maxBufferSize, debugLevel);
+#else
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize << "): No implementation available!"
+                  << std::endl;
+        abort();
+#endif
+        break;
+
+    case 3:
+#ifdef HAVE_LIBRESAMPLE
+        d = new Resamplers::D_Resample(quality, channels, maxBufferSize, debugLevel);
+#else
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize << "): No implementation available!"
+                  << std::endl;
+        abort();
+#endif
+        break;
+    }
+
+    if (!d) {
+        std::cerr << "Resampler::Resampler(" << quality << ", " << channels
+                  << ", " << maxBufferSize
+                  << "): Internal error: No implementation selected"
+                  << std::endl;
+        abort();
+    }
+}
+
+Resampler::~Resampler()
+{
+    delete d;
+}
+
+int 
+Resampler::resample(const float *const R__ *const R__ in,
+                    float *const R__ *const R__ out,
+                    int incount, float ratio, bool final)
+{
+    Profiler profiler("Resampler::resample");
+    return d->resample(in, out, incount, ratio, final);
+}
+
+int 
+Resampler::resampleInterleaved(const float *const R__ in,
+                               float *const R__ out,
+                               int incount, float ratio, bool final)
+{
+    Profiler profiler("Resampler::resample");
+    return d->resampleInterleaved(in, out, incount, ratio, final);
+}
+
+int
+Resampler::getChannelCount() const
+{
+    return d->getChannelCount();
+}
+
+void
+Resampler::reset()
+{
+    d->reset();
+}
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/Resampler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/Resampler.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,83 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_RESAMPLER_H_
+#define _RUBBERBAND_RESAMPLER_H_
+
+#include "system/sysutils.h"
+
+namespace RubberBand {
+
+class ResamplerImpl;
+
+class Resampler
+{
+public:
+    enum Quality { Best, FastestTolerable, Fastest };
+    enum Exception { ImplementationError };
+
+    /**
+     * Construct a resampler with the given quality level and channel
+     * count.  maxBufferSize gives a bound on the maximum incount size
+     * that may be passed to the resample function before the
+     * resampler needs to reallocate its internal buffers.
+     */
+    Resampler(Quality quality, int channels, int maxBufferSize = 0,
+              int debugLevel = 0);
+    ~Resampler();
+
+    /**
+     * Resample the given multi-channel buffers, where incount is the
+     * number of frames in the input buffers.  Returns the number of
+     * frames written to the output buffers.
+     */
+    int resample(const float *const R__ *const R__ in,
+                 float *const R__ *const R__ out,
+                 int incount,
+                 float ratio,
+                 bool final = false);
+
+    /**
+     * Resample the given interleaved buffer, where incount is the
+     * number of frames in the input buffer (i.e. it has incount *
+     * getChannelCount() samples).  Returns the number of frames
+     * written to the output buffer.
+     */
+    int resampleInterleaved(const float *const R__ in,
+                            float *const R__ out,
+                            int incount,
+                            float ratio,
+                            bool final = false);
+
+    int getChannelCount() const;
+
+    void reset();
+
+protected:
+    ResamplerImpl *d;
+    int m_method;
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/SampleFilter.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/SampleFilter.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,59 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _SAMPLE_FILTER_H_
+#define _SAMPLE_FILTER_H_
+
+#include <cassert>
+
+namespace RubberBand
+{
+
+template <typename T>
+class SampleFilter
+{
+public:
+    SampleFilter(int size) : m_size(size) {
+	assert(m_size > 0);
+    }
+
+    virtual ~SampleFilter() { }
+
+    int getSize() const { return m_size; }
+
+    virtual void push(T) = 0;
+    virtual T get() const = 0;
+    virtual void reset() = 0;
+
+protected:
+    const int m_size;
+
+private:
+    SampleFilter(const SampleFilter &);
+    SampleFilter &operator=(const SampleFilter &);
+};
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/SincWindow.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/SincWindow.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,155 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_SINC_WINDOW_H_
+#define _RUBBERBAND_SINC_WINDOW_H_
+
+#include <cmath>
+#include <iostream>
+#include <cstdlib>
+#include <map>
+
+#include "system/sysutils.h"
+#include "system/VectorOps.h"
+#include "system/Allocators.h"
+
+namespace RubberBand {
+
+template <typename T>
+class SincWindow
+{
+public:
+    /**
+     * Construct a sinc windower which produces a window of size n
+     * containing the values of sinc(x) with x=0 at index n/2, such
+     * that the distance from -pi to pi (the point at which the sinc
+     * function first crosses zero, for negative and positive
+     * arguments respectively) is p samples.
+     */
+    SincWindow(int n, int p) : m_size(n), m_p(p), m_cache(0) {
+        encache();
+    }
+    SincWindow(const SincWindow &w) : m_size(w.m_size), m_p(w.m_p), m_cache(0) {
+        encache();
+    }
+    SincWindow &operator=(const SincWindow &w) {
+	if (&w == this) return *this;
+	m_size = w.m_size;
+	m_p = w.m_p;
+        m_cache = 0;
+	encache();
+	return *this;
+    }
+    virtual ~SincWindow() {
+        deallocate(m_cache);
+    }
+
+    /**
+     * Regenerate the sinc window with the same size, but a new scale
+     * (the p value is interpreted as for the argument of the same
+     * name to the constructor).  If p is unchanged from the previous
+     * value, do nothing (quickly).
+     */
+    inline void rewrite(int p) {
+        if (m_p == p) return;
+        m_p = p;
+        encache();
+    }
+    
+    inline void cut(T *const R__ dst) const {
+        v_multiply(dst, m_cache, m_size);
+    }
+
+    inline void cut(const T *const R__ src, T *const R__ dst) const {
+        v_multiply(dst, src, m_cache, m_size);
+    }
+
+    inline void add(T *const R__ dst, T scale) const {
+        v_add_with_gain(dst, m_cache, m_size, scale);
+    }
+
+    inline T getArea() const { return m_area; }
+    inline T getValue(int i) const { return m_cache[i]; }
+
+    inline int getSize() const { return m_size; }
+    inline int getP() const { return m_p; }
+
+    /**
+     * Write a sinc window of size n with scale p (the p value is
+     * interpreted as for the argument of the same name to the
+     * constructor).
+     */
+    static
+    void write(T *const R__ dst, const int n, const int p) {
+        const int half = n/2;
+        writeHalf(dst, n, p);
+        int target = half - 1;
+        for (int i = half + 1; i < n; ++i) {
+            dst[target--] = dst[i];
+        }
+        const T twopi = 2. * M_PI;
+        T arg = T(half) * twopi / p;
+        dst[0] = sin(arg) / arg;
+    }
+
+protected:
+    int m_size;
+    int m_p;
+    T *R__ m_cache;
+    T m_area;
+
+    /**
+     * Write the positive half (i.e. n/2 to n-1) of a sinc window of
+     * size n with scale p (the p value is interpreted as for the
+     * argument of the same name to the constructor). The negative
+     * half (indices 0 to n/2-1) of dst is left unchanged.
+     */
+    static
+    void writeHalf(T *const R__ dst, const int n, const int p) {
+        const int half = n/2;
+        const T twopi = 2. * M_PI;
+        dst[half] = T(1.0);
+        for (int i = 1; i < half; ++i) {
+            T arg = T(i) * twopi / p;
+            dst[half+i] = sin(arg) / arg;
+        }
+    }
+    
+    void encache() {
+        if (!m_cache) {
+            m_cache = allocate<T>(m_size);
+        }
+
+        write(m_cache, m_size, m_p);
+	
+        m_area = 0;
+        for (int i = 0; i < m_size; ++i) {
+            m_area += m_cache[i];
+        }
+        m_area /= m_size;
+    }
+};
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/dsp/Window.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/dsp/Window.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,200 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_WINDOW_H_
+#define _RUBBERBAND_WINDOW_H_
+
+#include <cmath>
+#include <cstdlib>
+#include <map>
+
+#include "system/sysutils.h"
+#include "system/VectorOps.h"
+#include "system/Allocators.h"
+
+namespace RubberBand {
+
+enum WindowType {
+    RectangularWindow,
+    BartlettWindow,
+    HammingWindow,
+    HanningWindow,
+    BlackmanWindow,
+    GaussianWindow,
+    ParzenWindow,
+    NuttallWindow,
+    BlackmanHarrisWindow
+};
+
+template <typename T>
+class Window
+{
+public:
+    /**
+     * Construct a windower of the given type.
+     */
+    Window(WindowType type, int size) : m_type(type), m_size(size), m_cache(0) {
+        encache();
+    }
+    Window(const Window &w) : m_type(w.m_type), m_size(w.m_size), m_cache(0) {
+        encache();
+    }
+    Window &operator=(const Window &w) {
+	if (&w == this) return *this;
+	m_type = w.m_type;
+	m_size = w.m_size;
+        m_cache = 0;
+	encache();
+	return *this;
+    }
+    virtual ~Window() {
+        deallocate(m_cache);
+    }
+    
+    inline void cut(T *const R__ block) const {
+        v_multiply(block, m_cache, m_size);
+    }
+
+    inline void cut(const T *const R__ src, T *const R__ dst) const {
+        v_multiply(dst, src, m_cache, m_size);
+    }
+
+    inline void add(T *const R__ dst, T scale) const {
+        v_add_with_gain(dst, m_cache, m_size, scale);
+    }
+
+    inline T getRMS() const {
+        T total = 0;
+        for (int i = 0; i < m_size; ++i) {
+            total += m_cache[i] * m_cache[i];
+        }
+        T rms = sqrt(total / m_size);
+        return rms;
+    }
+
+    inline T getArea() const { return m_area; }
+    inline T getValue(int i) const { return m_cache[i]; }
+
+    inline WindowType getType() const { return m_type; }
+    inline int getSize() const { return m_size; }
+
+protected:
+    WindowType m_type;
+    int m_size;
+    T *R__ m_cache;
+    T m_area;
+    
+    void encache();
+    void cosinewin(T *, T, T, T, T);
+};
+
+template <typename T>
+void Window<T>::encache()
+{
+    if (!m_cache) m_cache = allocate<T>(m_size);
+
+    const int n = m_size;
+    v_set(m_cache, T(1.0), n);
+    int i;
+
+    switch (m_type) {
+		
+    case RectangularWindow:
+	for (i = 0; i < n; ++i) {
+	    m_cache[i] *= 0.5;
+	}
+	break;
+	    
+    case BartlettWindow:
+	for (i = 0; i < n/2; ++i) {
+	    m_cache[i] *= (i / T(n/2));
+	    m_cache[i + n/2] *= (1.0 - (i / T(n/2)));
+	}
+	break;
+	    
+    case HammingWindow:
+        cosinewin(m_cache, 0.54, 0.46, 0.0, 0.0);
+	break;
+	    
+    case HanningWindow:
+        cosinewin(m_cache, 0.50, 0.50, 0.0, 0.0);
+	break;
+	    
+    case BlackmanWindow:
+        cosinewin(m_cache, 0.42, 0.50, 0.08, 0.0);
+	break;
+	    
+    case GaussianWindow:
+	for (i = 0; i < n; ++i) {
+            m_cache[i] *= pow(2, - pow((i - (n-1)/2.0) / ((n-1)/2.0 / 3), 2));
+	}
+	break;
+	    
+    case ParzenWindow:
+    {
+        int N = n-1;
+        for (i = 0; i < N/4; ++i) {
+            T m = 2 * pow(1.0 - (T(N)/2 - i) / (T(N)/2), 3);
+            m_cache[i] *= m;
+            m_cache[N-i] *= m;
+        }
+        for (i = N/4; i <= N/2; ++i) {
+            int wn = i - N/2;
+            T m = 1.0 - 6 * pow(wn / (T(N)/2), 2) * (1.0 - abs(wn) / (T(N)/2));
+            m_cache[i] *= m;
+            m_cache[N-i] *= m;
+        }            
+        break;
+    }
+
+    case NuttallWindow:
+        cosinewin(m_cache, 0.3635819, 0.4891775, 0.1365995, 0.0106411);
+	break;
+
+    case BlackmanHarrisWindow:
+        cosinewin(m_cache, 0.35875, 0.48829, 0.14128, 0.01168);
+        break;
+    }
+	
+    m_area = 0;
+    for (i = 0; i < n; ++i) {
+        m_area += m_cache[i];
+    }
+    m_area /= n;
+}
+
+template <typename T>
+void Window<T>::cosinewin(T *mult, T a0, T a1, T a2, T a3)
+{
+    int n = int(m_size);
+    for (int i = 0; i < n; ++i) {
+        mult[i] *= (a0
+                    - a1 * cos(2 * M_PI * i / n)
+                    + a2 * cos(4 * M_PI * i / n)
+                    - a3 * cos(6 * M_PI * i / n));
+    }
+}
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/float_cast/float_cast.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/float_cast/float_cast.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,75 @@
+#ifndef ERIKD_FLOATCAST_H
+#define ERIKD_FLOATCAST_H
+
+/*
+** Copyright (C) 2001 Erik de Castro Lopo <erikd AT mega-nerd DOT com>
+**
+** Permission to use, copy, modify, distribute, and sell this file for any 
+** purpose is hereby granted without fee, provided that the above copyright 
+** and this permission notice appear in all copies.  No representations are
+** made about the suitability of this software for any purpose.  It is 
+** provided "as is" without express or implied warranty.
+*/
+
+/* Version 1.1 */
+
+
+/*============================================================================ 
+**	On Intel Pentium processors (especially PIII and probably P4), converting
+**	from float to int is very slow. To meet the C specs, the code produced by 
+**	most C compilers targeting Pentium needs to change the FPU rounding mode 
+**	before the float to int conversion is performed. 
+**
+**	Changing the FPU rounding mode causes the FPU pipeline to be flushed. It 
+**	is this flushing of the pipeline which is so slow.
+**
+**	Fortunately the ISO C99 specifications define the functions lrint, lrintf,
+**	llrint and llrintf which fix this problem as a side effect. 
+**
+**	On Unix-like systems, the configure process should have detected the 
+**	presence of these functions. If they weren't found we have to replace them 
+**	here with a standard C cast.
+*/
+
+/*	
+**	The C99 prototypes for lrint and lrintf are as follows:
+**	
+**		long int lrintf (float x) ;
+**		long int lrint  (double x) ;
+*/
+
+#if (defined (WIN32) || defined (_WIN32))
+
+	#include	<math.h>
+
+	/*	Win32 doesn't seem to have these functions. 
+	**	Therefore implement inline versions of these functions here.
+	*/
+	
+	__inline long int 
+	lrint (double flt) 
+	{	int intgr;
+
+		_asm
+		{	fld flt
+			fistp intgr
+			} ;
+			
+		return intgr ;
+	} 
+	
+	__inline long int 
+	lrintf (float flt)
+	{	int intgr;
+
+		_asm
+		{	fld flt
+			fistp intgr
+			} ;
+			
+		return intgr ;
+	}
+
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/getopt/getopt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/getopt/getopt.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 1987, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int	opterr = 1,		/* if error message should be printed */
+	optind = 1,		/* index into parent argv vector */
+	optopt,			/* character checked for validity */
+	optreset;		/* reset getopt */
+char	*optarg;		/* argument associated with option */
+
+#define	BADCH	(int)'?'
+#define	BADARG	(int)':'
+#define	EMSG	""
+
+/*
+ * getopt --
+ *	Parse argc/argv argument vector.
+ */
+int
+getopt(nargc, nargv, ostr)
+	int nargc;
+	char * const *nargv;
+	const char *ostr;
+{
+	static char *place = EMSG;		/* option letter processing */
+	char *oli;				/* option letter list index */
+
+	if (optreset || !*place) {		/* update scanning pointer */
+		optreset = 0;
+		if (optind >= nargc || *(place = nargv[optind]) != '-') {
+			place = EMSG;
+			return (-1);
+		}
+		if (place[1] && *++place == '-') {	/* found "--" */
+			++optind;
+			place = EMSG;
+			return (-1);
+		}
+	}					/* option letter okay? */
+	if ((optopt = (int)*place++) == (int)':' ||
+	    !(oli = strchr(ostr, optopt))) {
+		/*
+		 * if the user didn't specify '-' as an option,
+		 * assume it means -1.
+		 */
+		if (optopt == (int)'-')
+			return (-1);
+		if (!*place)
+			++optind;
+		if (opterr && *ostr != ':' && optopt != BADCH)
+			(void)fprintf(stderr, "%s: illegal option -- %c\n",
+			    "progname", optopt);
+		return (BADCH);
+	}
+	if (*++oli != ':') {			/* don't need argument */
+		optarg = NULL;
+		if (!*place)
+			++optind;
+	}
+	else {					/* need an argument */
+		if (*place)			/* no white space */
+			optarg = place;
+		else if (nargc <= ++optind) {	/* no arg */
+			place = EMSG;
+			if (*ostr == ':')
+				return (BADARG);
+			if (opterr)
+				(void)fprintf(stderr,
+				    "%s: option requires an argument -- %c\n",
+				    "progname", optopt);
+			return (BADCH);
+		}
+	 	else				/* white space */
+			optarg = nargv[optind];
+		place = EMSG;
+		++optind;
+	}
+	return (optopt);			/* dump back option letter */
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/getopt/getopt.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/getopt/getopt.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,110 @@
+/*      $NetBSD: getopt.h,v 1.4 2000/07/07 10:43:54 ad Exp $    */
+/*      $FreeBSD: src/include/getopt.h,v 1.1 2002/09/29 04:14:30 eric Exp $ */
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GETOPT_H_
+#define _GETOPT_H_
+
+#ifdef _WIN32
+/* from <sys/cdefs.h> */
+# ifdef  __cplusplus
+#  define __BEGIN_DECLS  extern "C" {
+#  define __END_DECLS    }
+# else
+#  define __BEGIN_DECLS
+#  define __END_DECLS
+# endif
+# define __P(args)      args
+#endif
+
+/*#ifndef _WIN32
+#include <sys/cdefs.h>
+#include <unistd.h>
+#endif*/
+
+#ifdef _WIN32
+# if !defined(GETOPT_API)
+#  define GETOPT_API __declspec(dllimport)
+# endif
+#endif
+
+/*
+ * Gnu like getopt_long() and BSD4.4 getsubopt()/optreset extensions
+ */
+#if !defined(_POSIX_SOURCE) && !defined(_XOPEN_SOURCE)
+#define no_argument        0
+#define required_argument  1
+#define optional_argument  2
+
+struct option {
+        /* name of long option */
+        const char *name;
+        /*
+         * one of no_argument, required_argument, and optional_argument:
+         * whether option takes an argument
+         */
+        int has_arg;
+        /* if not NULL, set *flag to val when option found */
+        int *flag;
+        /* if flag not NULL, value to set *flag to; else return value */
+        int val;
+};
+
+__BEGIN_DECLS
+GETOPT_API int getopt_long __P((int, char * const *, const char *,
+    const struct option *, int *));
+__END_DECLS
+#endif
+
+#ifdef _WIN32
+/* These are global getopt variables */
+__BEGIN_DECLS
+
+GETOPT_API extern int   opterr,   /* if error message should be printed */
+                        optind,   /* index into parent argv vector */
+                        optopt,   /* character checked for validity */
+                        optreset; /* reset getopt */
+GETOPT_API extern char* optarg;   /* argument associated with option */
+
+/* Original getopt */
+GETOPT_API int getopt __P((int, char * const *, const char *));
+
+__END_DECLS
+#endif
+ 
+#endif /* !_GETOPT_H_ */
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/getopt/getopt_long.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/getopt/getopt_long.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,547 @@
+/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/
+/*	$FreeBSD: src/lib/libc/stdlib/getopt_long.c,v 1.2 2002/10/16 22:18:42 alfred Exp $ */
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+
+/* Windows needs warnx().  We change the definition though:
+ *  1. (another) global is defined, opterrmsg, which holds the error message
+ *  2. errors are always printed out on stderr w/o the program name
+ * Note that opterrmsg always gets set no matter what opterr is set to.  The
+ * error message will not be printed if opterr is 0 as usual.
+ */
+
+#include "getopt.h"
+#include <stdio.h>
+#include <stdarg.h>
+
+GETOPT_API extern char opterrmsg[128];
+char opterrmsg[128]; /* last error message is stored here */
+
+static void warnx(int print_error, const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	if (fmt != NULL)
+		_vsnprintf(opterrmsg, 128, fmt, ap);
+	else
+		opterrmsg[0]='\0';
+	va_end(ap);
+	if (print_error) {
+		fprintf(stderr, opterrmsg);
+		fprintf(stderr, "\n");
+	}
+}
+
+#endif /*_WIN32*/
+
+/* not part of the original file */
+#ifndef _DIAGASSERT
+#define _DIAGASSERT(X)
+#endif
+
+#if HAVE_CONFIG_H && !HAVE_GETOPT_LONG && !HAVE_DECL_OPTIND
+#define REPLACE_GETOPT
+#endif
+
+#ifdef REPLACE_GETOPT
+#ifdef __weak_alias
+__weak_alias(getopt,_getopt)
+#endif
+int	opterr = 1;		/* if error message should be printed */
+int	optind = 1;		/* index into parent argv vector */
+int	optopt = '?';		/* character checked for validity */
+int	optreset;		/* reset getopt */
+char    *optarg;		/* argument associated with option */
+#elif HAVE_CONFIG_H && !HAVE_DECL_OPTRESET
+static int optreset;
+#endif
+
+#ifdef __weak_alias
+__weak_alias(getopt_long,_getopt_long)
+#endif
+
+#if !HAVE_GETOPT_LONG
+#define IGNORE_FIRST	(*options == '-' || *options == '+')
+#define PRINT_ERROR	((opterr) && ((*options != ':') \
+				      || (IGNORE_FIRST && options[1] != ':')))
+#define IS_POSIXLY_CORRECT (getenv("POSIXLY_CORRECT") != NULL)
+#define PERMUTE         (!IS_POSIXLY_CORRECT && !IGNORE_FIRST)
+/* XXX: GNU ignores PC if *options == '-' */
+#define IN_ORDER        (!IS_POSIXLY_CORRECT && *options == '-')
+
+/* return values */
+#define	BADCH	(int)'?'
+#define	BADARG		((IGNORE_FIRST && options[1] == ':') \
+			 || (*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
+
+#define	EMSG	""
+
+static int getopt_internal(int, char * const *, const char *);
+static int gcd(int, int);
+static void permute_args(int, int, int, char * const *);
+
+static char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1;   /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+
+/*
+ * Compute the greatest common divisor of a and b.
+ */
+static int
+gcd(a, b)
+	int a;
+	int b;
+{
+	int c;
+
+	c = a % b;
+	while (c != 0) {
+		a = b;
+		b = c;
+		c = a % b;
+	}
+	   
+	return b;
+}
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).
+ */
+static void
+permute_args(panonopt_start, panonopt_end, opt_end, nargv)
+	int panonopt_start;
+	int panonopt_end;
+	int opt_end;
+	char * const *nargv;
+{
+	int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+	char *swap;
+
+	_DIAGASSERT(nargv != NULL);
+
+	/*
+	 * compute lengths of blocks and number and size of cycles
+	 */
+	nnonopts = panonopt_end - panonopt_start;
+	nopts = opt_end - panonopt_end;
+	ncycle = gcd(nnonopts, nopts);
+	cyclelen = (opt_end - panonopt_start) / ncycle;
+
+	for (i = 0; i < ncycle; i++) {
+		cstart = panonopt_end+i;
+		pos = cstart;
+		for (j = 0; j < cyclelen; j++) {
+			if (pos >= panonopt_end)
+				pos -= nnonopts;
+			else
+				pos += nopts;
+			swap = nargv[pos];
+			/* LINTED const cast */
+			((char **) nargv)[pos] = nargv[cstart];
+			/* LINTED const cast */
+			((char **)nargv)[cstart] = swap;
+		}
+	}
+}
+
+/*
+ * getopt_internal --
+ *	Parse argc/argv argument vector.  Called by user level routines.
+ *  Returns -2 if -- is found (can be long option or end of options marker).
+ */
+static int
+getopt_internal(nargc, nargv, options)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+{
+	char *oli;				/* option letter list index */
+	int optchar;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	optarg = NULL;
+
+	/*
+	 * XXX Some programs (like rsyncd) expect to be able to
+	 * XXX re-initialize optind to 0 and have getopt_long(3)
+	 * XXX properly function again.  Work around this braindamage.
+	 */
+	if (optind == 0)
+		optind = 1;
+
+	if (optreset)
+		nonopt_start = nonopt_end = -1;
+start:
+	if (optreset || !*place) {		/* update scanning pointer */
+		optreset = 0;
+		if (optind >= nargc) {          /* end of argument vector */
+			place = EMSG;
+			if (nonopt_end != -1) {
+				/* do permutation, if we have to */
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			else if (nonopt_start != -1) {
+				/*
+				 * If we skipped non-options, set optind
+				 * to the first of them.
+				 */
+				optind = nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((*(place = nargv[optind]) != '-')
+		    || (place[1] == '\0')) {    /* found non-option */
+			place = EMSG;
+			if (IN_ORDER) {
+				/*
+				 * GNU extension: 
+				 * return non-option as argument to option 1
+				 */
+				optarg = nargv[optind++];
+				return INORDER;
+			}
+			if (!PERMUTE) {
+				/*
+				 * if no permutation wanted, stop parsing
+				 * at first non-option
+				 */
+				return -1;
+			}
+			/* do permutation */
+			if (nonopt_start == -1)
+				nonopt_start = optind;
+			else if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				nonopt_start = optind -
+				    (nonopt_end - nonopt_start);
+				nonopt_end = -1;
+			}
+			optind++;
+			/* process next argument */
+			goto start;
+		}
+		if (nonopt_start != -1 && nonopt_end == -1)
+			nonopt_end = optind;
+		if (place[1] && *++place == '-') {	/* found "--" */
+			place++;
+			return -2;
+		}
+	}
+	if ((optchar = (int)*place++) == (int)':' ||
+	    (oli = strchr(options + (IGNORE_FIRST ? 1 : 0), optchar)) == NULL) {
+		/* option letter unknown or ':' */
+		if (!*place)
+			++optind;
+#ifndef _WIN32
+		if (PRINT_ERROR)
+			warnx(illoptchar, optchar);
+#else
+			warnx(PRINT_ERROR, illoptchar, optchar);
+#endif
+		optopt = optchar;
+		return BADCH;
+	}
+	if (optchar == 'W' && oli[1] == ';') {		/* -W long-option */
+		/* XXX: what if no long options provided (called by getopt)? */
+		if (*place) 
+			return -2;
+
+		if (++optind >= nargc) {	/* no arg */
+			place = EMSG;
+#ifndef _WIN32
+			if (PRINT_ERROR)
+				warnx(recargchar, optchar);
+#else
+				warnx(PRINT_ERROR, recargchar, optchar);
+#endif
+			optopt = optchar;
+			return BADARG;
+		} else				/* white space */
+			place = nargv[optind];
+		/*
+		 * Handle -W arg the same as --arg (which causes getopt to
+		 * stop parsing).
+		 */
+		return -2;
+	}
+	if (*++oli != ':') {			/* doesn't take argument */
+		if (!*place)
+			++optind;
+	} else {				/* takes (optional) argument */
+		optarg = NULL;
+		if (*place)			/* no white space */
+			optarg = place;
+		/* XXX: disable test for :: if PC? (GNU doesn't) */
+		else if (oli[1] != ':') {	/* arg not optional */
+			if (++optind >= nargc) {	/* no arg */
+				place = EMSG;
+#ifndef _WIN32
+				if (PRINT_ERROR)
+					warnx(recargchar, optchar);
+#else
+					warnx(PRINT_ERROR, recargchar, optchar);
+#endif
+				optopt = optchar;
+				return BADARG;
+			} else
+				optarg = nargv[optind];
+		}
+		place = EMSG;
+		++optind;
+	}
+	/* dump back option letter */
+	return optchar;
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *	Parse argc/argv argument vector.
+ *
+ * [eventually this will replace the real getopt]
+ */
+int
+getopt(nargc, nargv, options)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		++optind;
+		/*
+		 * We found an option (--), so if we skipped non-options,
+		 * we have to permute.
+		 */
+		if (nonopt_end != -1) {
+			permute_args(nonopt_start, nonopt_end, optind,
+				       nargv);
+			optind -= nonopt_end - nonopt_start;
+		}
+		nonopt_start = nonopt_end = -1;
+		retval = -1;
+	}
+	return retval;
+}
+#endif
+
+/*
+ * getopt_long --
+ *	Parse argc/argv argument vector.
+ */
+int
+getopt_long(nargc, nargv, options, long_options, idx)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+	const struct option *long_options;
+	int *idx;
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+	_DIAGASSERT(long_options != NULL);
+	/* idx may be NULL */
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		char *current_argv, *has_equal;
+		size_t current_argv_len;
+		int i, match;
+
+		current_argv = place;
+		match = -1;
+
+		optind++;
+		place = EMSG;
+
+		if (*current_argv == '\0') {		/* found "--" */
+			/*
+			 * We found an option (--), so if we skipped
+			 * non-options, we have to permute.
+			 */
+			if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((has_equal = strchr(current_argv, '=')) != NULL) {
+			/* argument found (--option=arg) */
+			current_argv_len = has_equal - current_argv;
+			has_equal++;
+		} else
+			current_argv_len = strlen(current_argv);
+	    
+		for (i = 0; long_options[i].name; i++) {
+			/* find matching long option */
+			if (strncmp(current_argv, long_options[i].name,
+			    current_argv_len))
+				continue;
+
+			if (strlen(long_options[i].name) ==
+			    (unsigned)current_argv_len) {
+				/* exact match */
+				match = i;
+				break;
+			}
+			if (match == -1)		/* partial match */
+				match = i;
+			else {
+				/* ambiguous abbreviation */
+#ifndef _WIN32
+				if (PRINT_ERROR)
+					warnx(ambig, (int)current_argv_len,
+					     current_argv);
+#else
+					warnx(PRINT_ERROR, ambig, (int)current_argv_len,
+					     current_argv);
+#endif
+				optopt = 0;
+				return BADCH;
+			}
+		}
+		if (match != -1) {			/* option found */
+		        if (long_options[match].has_arg == no_argument
+			    && has_equal) {
+#ifndef _WIN32
+				if (PRINT_ERROR)
+					warnx(noarg, (int)current_argv_len,
+					     current_argv);
+#else
+					warnx(PRINT_ERROR, noarg, (int)current_argv_len,
+					     current_argv);
+#endif
+				/*
+				 * XXX: GNU sets optopt to val regardless of
+				 * flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				return BADARG;
+			}
+			if (long_options[match].has_arg == required_argument ||
+			    long_options[match].has_arg == optional_argument) {
+				if (has_equal)
+					optarg = has_equal;
+				else if (long_options[match].has_arg ==
+				    required_argument) {
+					/*
+					 * optional argument doesn't use
+					 * next nargv
+					 */
+					optarg = nargv[optind++];
+				}
+			}
+			if ((long_options[match].has_arg == required_argument)
+			    && (optarg == NULL)) {
+				/*
+				 * Missing argument; leading ':'
+				 * indicates no error should be generated
+				 */
+#ifndef _WIN32
+				if (PRINT_ERROR)
+					warnx(recargstring, current_argv);
+#else
+					warnx(PRINT_ERROR, recargstring, current_argv);
+#endif
+				/*
+				 * XXX: GNU sets optopt to val regardless
+				 * of flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				--optind;
+				return BADARG;
+			}
+		} else {			/* unknown option */
+#ifndef _WIN32
+			if (PRINT_ERROR)
+				warnx(illoptstring, current_argv);
+#else
+				warnx(PRINT_ERROR, illoptstring, current_argv);
+#endif
+			optopt = 0;
+			return BADCH;
+		}
+		if (long_options[match].flag) {
+			*long_options[match].flag = long_options[match].val;
+			retval = 0;
+		} else 
+			retval = long_options[match].val;
+		if (idx)
+			*idx = match;
+	}
+	return retval;
+}
+#endif /* !GETOPT_LONG */
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/jni/RubberBandStretcherJNI.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/jni/RubberBandStretcherJNI.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,368 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+/* Copyright Chris Cannam - All Rights Reserved */
+
+#include "rubberband/RubberBandStretcher.h"
+
+#include "system/Allocators.h"
+
+#include <jni.h>
+
+using namespace RubberBand;
+
+extern "C" {
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    dispose
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_dispose
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    reset
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_reset
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setTimeRatio
+ * Signature: (D)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setTimeRatio
+  (JNIEnv *, jobject, jdouble);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setPitchScale
+ * Signature: (D)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setPitchScale
+  (JNIEnv *, jobject, jdouble);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    getChannelCount
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_getChannelCount
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    getTimeRatio
+ * Signature: ()D
+ */
+JNIEXPORT jdouble JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_getTimeRatio
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    getPitchScale
+ * Signature: ()D
+ */
+JNIEXPORT jdouble JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_getPitchScale
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    getLatency
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_getLatency
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setTransientsOption
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setTransientsOption
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setDetectorOption
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setDetectorOption
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setPhaseOption
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setPhaseOption
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setFormantOption
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setFormantOption
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setPitchOption
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setPitchOption
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setExpectedInputDuration
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setExpectedInputDuration
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    setMaxProcessSize
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_setMaxProcessSize
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    getSamplesRequired
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_getSamplesRequired
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    study
+ * Signature: ([[FZ)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_study
+  (JNIEnv *, jobject, jobjectArray, jint, jint, jboolean);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    process
+ * Signature: ([[FZ)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_process
+  (JNIEnv *, jobject, jobjectArray, jint, jint, jboolean);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    available
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_available
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    retrieve
+ * Signature: (I)[[F
+ */
+JNIEXPORT jint JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_retrieve
+  (JNIEnv *, jobject, jobjectArray, jint, jint);
+
+/*
+ * Class:     com_breakfastquay_rubberband_RubberBandStretcher
+ * Method:    initialise
+ * Signature: (IIIDD)V
+ */
+JNIEXPORT void JNICALL Java_com_breakfastquay_rubberband_RubberBandStretcher_initialise
+  (JNIEnv *, jobject, jint, jint, jint, jdouble, jdouble);
+
+}
+
+RubberBandStretcher *
+getStretcher(JNIEnv *env, jobject obj)
+{
+    jclass c = env->GetObjectClass(obj);
+    jfieldID fid = env->GetFieldID(c, "handle", "J");
+    jlong handle = env->GetLongField(obj, fid);
+    return (RubberBandStretcher *)handle;
+}
+
+void
+setStretcher(JNIEnv *env, jobject obj, RubberBandStretcher *stretcher)
+{
+    jclass c = env->GetObjectClass(obj);
+    jfieldID fid = env->GetFieldID(c, "handle", "J");
+    jlong handle = (jlong)stretcher;
+    env->SetLongField(obj, fid, handle);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_initialise(JNIEnv *env, jobject obj, jint sampleRate, jint channels, jint options, jdouble initialTimeRatio, jdouble initialPitchScale)
+{
+    setStretcher(env, obj, new RubberBandStretcher
+                 (sampleRate, channels, options, initialTimeRatio, initialPitchScale));
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_dispose(JNIEnv *env, jobject obj)
+{
+    delete getStretcher(env, obj);
+    setStretcher(env, obj, 0);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_reset(JNIEnv *env, jobject obj)
+{
+    getStretcher(env, obj)->reset();
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setTimeRatio(JNIEnv *env, jobject obj, jdouble ratio)
+{
+    getStretcher(env, obj)->setTimeRatio(ratio);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setPitchScale(JNIEnv *env, jobject obj, jdouble scale)
+{
+    getStretcher(env, obj)->setPitchScale(scale);
+}
+
+jint
+Java_com_breakfastquay_rubberband_RubberBandStretcher_getChannelCount(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->getChannelCount();
+}
+
+jdouble
+Java_com_breakfastquay_rubberband_RubberBandStretcher_getTimeRatio(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->getTimeRatio();
+}
+
+jdouble
+Java_com_breakfastquay_rubberband_RubberBandStretcher_getPitchScale(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->getPitchScale();
+}
+
+jint
+Java_com_breakfastquay_rubberband_RubberBandStretcher_getLatency(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->getLatency();
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setTransientsOption(JNIEnv *env, jobject obj, jint options)
+{
+    getStretcher(env, obj)->setTransientsOption(options);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setDetectorOption(JNIEnv *env, jobject obj, jint options)
+{
+    getStretcher(env, obj)->setDetectorOption(options);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setPhaseOption(JNIEnv *env, jobject obj, jint options)
+{
+    getStretcher(env, obj)->setPhaseOption(options);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setFormantOption(JNIEnv *env, jobject obj, jint options)
+{
+    getStretcher(env, obj)->setFormantOption(options);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setPitchOption(JNIEnv *env, jobject obj, jint options)
+{
+    getStretcher(env, obj)->setPitchOption(options);
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_setExpectedInputDuration(JNIEnv *env, jobject obj, jlong duration)
+{
+    getStretcher(env, obj)->setExpectedInputDuration(duration);
+}
+
+jint
+Java_com_breakfastquay_rubberband_RubberBandStretcher_getSamplesRequired(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->getSamplesRequired();
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_study(JNIEnv *env, jobject obj, jobjectArray data, jint offset, jint n, jboolean final)
+{
+    int channels = env->GetArrayLength(data);
+    float **arr = allocate<float *>(channels);
+    float **input = allocate<float *>(channels);
+    int samples = 0;
+    for (int c = 0; c < channels; ++c) {
+        jfloatArray cdata = (jfloatArray)env->GetObjectArrayElement(data, c);
+        arr[c] = env->GetFloatArrayElements(cdata, 0);
+        input[c] = arr[c] + offset;
+    }
+
+    getStretcher(env, obj)->study(input, n, final);
+
+    for (int c = 0; c < channels; ++c) {
+        jfloatArray cdata = (jfloatArray)env->GetObjectArrayElement(data, c);
+        env->ReleaseFloatArrayElements(cdata, arr[c], 0);
+    }
+}
+
+void
+Java_com_breakfastquay_rubberband_RubberBandStretcher_process(JNIEnv *env, jobject obj, jobjectArray data, jint offset, jint n, jboolean final)
+{
+    int channels = env->GetArrayLength(data);
+    float **arr = allocate<float *>(channels);
+    float **input = allocate<float *>(channels);
+    int samples = 0;
+    for (int c = 0; c < channels; ++c) {
+        jfloatArray cdata = (jfloatArray)env->GetObjectArrayElement(data, c);
+        arr[c] = env->GetFloatArrayElements(cdata, 0);
+        input[c] = arr[c] + offset;
+    }
+
+    getStretcher(env, obj)->process(input, n, final);
+
+    for (int c = 0; c < channels; ++c) {
+        jfloatArray cdata = (jfloatArray)env->GetObjectArrayElement(data, c);
+        env->ReleaseFloatArrayElements(cdata, arr[c], 0);
+    }
+
+    deallocate(input);
+    deallocate(arr);
+}
+
+jint
+Java_com_breakfastquay_rubberband_RubberBandStretcher_available(JNIEnv *env, jobject obj)
+{
+    return getStretcher(env, obj)->available();
+}
+
+jint
+Java_com_breakfastquay_rubberband_RubberBandStretcher_retrieve(JNIEnv *env, jobject obj, jobjectArray output, jint offset, jint n)
+{
+    RubberBandStretcher *stretcher = getStretcher(env, obj);
+    size_t channels = stretcher->getChannelCount();
+    
+    float **outbuf = allocate_channels<float>(channels, n);
+    size_t retrieved = stretcher->retrieve(outbuf, n);
+
+    for (int c = 0; c < channels; ++c) {
+        jfloatArray cdata = (jfloatArray)env->GetObjectArrayElement(output, c);
+        env->SetFloatArrayRegion(cdata, offset, retrieved, outbuf[c]);
+    }
+    
+    deallocate_channels(outbuf, channels);
+    return retrieved;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/COPYING
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/COPYING	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,11 @@
+Copyright (c) 2003-2004 Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/_kiss_fft_guts.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/_kiss_fft_guts.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,150 @@
+/*
+Copyright (c) 2003-2004, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* kiss_fft.h
+   defines kiss_fft_scalar as either short or a float type
+   and defines
+   typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
+#include "kiss_fft.h"
+#include <limits.h>
+
+#define MAXFACTORS 32
+/* e.g. an fft of length 128 has 4 factors 
+ as far as kissfft is concerned
+ 4*4*4*2
+ */
+
+struct kiss_fft_state{
+    int nfft;
+    int inverse;
+    int factors[2*MAXFACTORS];
+    kiss_fft_cpx twiddles[1];
+};
+
+/*
+  Explanation of macros dealing with complex math:
+
+   C_MUL(m,a,b)         : m = a*b
+   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
+   C_SUB( res, a,b)     : res = a - b
+   C_SUBFROM( res , a)  : res -= a
+   C_ADDTO( res , a)    : res += a
+ * */
+#ifdef FIXED_POINT
+#if (FIXED_POINT==32)
+# define FRACBITS 31
+# define SAMPPROD int64_t
+#define SAMP_MAX 2147483647
+#else
+# define FRACBITS 15
+# define SAMPPROD int32_t 
+#define SAMP_MAX 32767
+#endif
+
+#define SAMP_MIN -SAMP_MAX
+
+#if defined(CHECK_OVERFLOW)
+#  define CHECK_OVERFLOW_OP(a,op,b)  \
+	if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
+		fprintf(stderr,"WARNING:overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld\n",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) );  }
+#endif
+
+
+#   define smul(a,b) ( (SAMPPROD)(a)*(b) )
+#   define sround( x )  (kiss_fft_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )
+
+#   define S_MUL(a,b) sround( smul(a,b) )
+
+#   define C_MUL(m,a,b) \
+      do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
+          (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
+
+#   define DIVSCALAR(x,k) \
+	(x) = sround( smul(  x, SAMP_MAX/k ) )
+
+#   define C_FIXDIV(c,div) \
+	do {    DIVSCALAR( (c).r , div);  \
+		DIVSCALAR( (c).i  , div); }while (0)
+
+#   define C_MULBYSCALAR( c, s ) \
+    do{ (c).r =  sround( smul( (c).r , s ) ) ;\
+        (c).i =  sround( smul( (c).i , s ) ) ; }while(0)
+
+#else  /* not FIXED_POINT*/
+
+#   define S_MUL(a,b) ( (a)*(b) )
+#define C_MUL(m,a,b) \
+    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
+        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
+#   define C_FIXDIV(c,div) /* NOOP */
+#   define C_MULBYSCALAR( c, s ) \
+    do{ (c).r *= (s);\
+        (c).i *= (s); }while(0)
+#endif
+
+#ifndef CHECK_OVERFLOW_OP
+#  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
+#endif
+
+#define  C_ADD( res, a,b)\
+    do { \
+	    CHECK_OVERFLOW_OP((a).r,+,(b).r)\
+	    CHECK_OVERFLOW_OP((a).i,+,(b).i)\
+	    (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
+    }while(0)
+#define  C_SUB( res, a,b)\
+    do { \
+	    CHECK_OVERFLOW_OP((a).r,-,(b).r)\
+	    CHECK_OVERFLOW_OP((a).i,-,(b).i)\
+	    (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
+    }while(0)
+#define C_ADDTO( res , a)\
+    do { \
+	    CHECK_OVERFLOW_OP((res).r,+,(a).r)\
+	    CHECK_OVERFLOW_OP((res).i,+,(a).i)\
+	    (res).r += (a).r;  (res).i += (a).i;\
+    }while(0)
+
+#define C_SUBFROM( res , a)\
+    do {\
+	    CHECK_OVERFLOW_OP((res).r,-,(a).r)\
+	    CHECK_OVERFLOW_OP((res).i,-,(a).i)\
+	    (res).r -= (a).r;  (res).i -= (a).i; \
+    }while(0)
+
+
+#ifdef FIXED_POINT
+#  define KISS_FFT_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
+#  define KISS_FFT_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
+#  define HALF_OF(x) ((x)>>1)
+#elif defined(USE_SIMD)
+#  define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
+#  define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
+#  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
+#else
+#  define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
+#  define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
+#  define HALF_OF(x) ((x)*.5)
+#endif
+
+#define  kf_cexp(x,phase) \
+	do{ \
+		(x)->r = KISS_FFT_COS(phase);\
+		(x)->i = KISS_FFT_SIN(phase);\
+	}while(0)
+
+
+/* a debugging function */
+#define pcpx(c)\
+    fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/kiss_fft.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/kiss_fft.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,399 @@
+/*
+Copyright (c) 2003-2004, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "_kiss_fft_guts.h"
+/* The guts header contains all the multiplication and addition macros that are defined for
+ fixed or floating point complex numbers.  It also delares the kf_ internal functions.
+ */
+
+static kiss_fft_cpx *scratchbuf=NULL;
+static size_t nscratchbuf=0;
+static kiss_fft_cpx *tmpbuf=NULL;
+static size_t ntmpbuf=0;
+
+#define CHECKBUF(buf,nbuf,n) \
+    do { \
+        if ( nbuf < (size_t)(n) ) {\
+            free(buf); \
+            buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
+            nbuf = (size_t)(n); \
+        } \
+   }while(0)
+
+
+static void kf_bfly2(
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m
+        )
+{
+    kiss_fft_cpx * Fout2;
+    kiss_fft_cpx * tw1 = st->twiddles;
+    kiss_fft_cpx t;
+    Fout2 = Fout + m;
+    do{
+        C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);
+
+        C_MUL (t,  *Fout2 , *tw1);
+        tw1 += fstride;
+        C_SUB( *Fout2 ,  *Fout , t );
+        C_ADDTO( *Fout ,  t );
+        ++Fout2;
+        ++Fout;
+    }while (--m);
+}
+
+static void kf_bfly4(
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        const size_t m
+        )
+{
+    kiss_fft_cpx *tw1,*tw2,*tw3;
+    kiss_fft_cpx scratch[6];
+    size_t k=m;
+    const size_t m2=2*m;
+    const size_t m3=3*m;
+
+    tw3 = tw2 = tw1 = st->twiddles;
+
+    do {
+        C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
+
+        C_MUL(scratch[0],Fout[m] , *tw1 );
+        C_MUL(scratch[1],Fout[m2] , *tw2 );
+        C_MUL(scratch[2],Fout[m3] , *tw3 );
+
+        C_SUB( scratch[5] , *Fout, scratch[1] );
+        C_ADDTO(*Fout, scratch[1]);
+        C_ADD( scratch[3] , scratch[0] , scratch[2] );
+        C_SUB( scratch[4] , scratch[0] , scratch[2] );
+        C_SUB( Fout[m2], *Fout, scratch[3] );
+        tw1 += fstride;
+        tw2 += fstride*2;
+        tw3 += fstride*3;
+        C_ADDTO( *Fout , scratch[3] );
+
+        if(st->inverse) {
+            Fout[m].r = scratch[5].r - scratch[4].i;
+            Fout[m].i = scratch[5].i + scratch[4].r;
+            Fout[m3].r = scratch[5].r + scratch[4].i;
+            Fout[m3].i = scratch[5].i - scratch[4].r;
+        }else{
+            Fout[m].r = scratch[5].r + scratch[4].i;
+            Fout[m].i = scratch[5].i - scratch[4].r;
+            Fout[m3].r = scratch[5].r - scratch[4].i;
+            Fout[m3].i = scratch[5].i + scratch[4].r;
+        }
+        ++Fout;
+    }while(--k);
+}
+
+static void kf_bfly3(
+         kiss_fft_cpx * Fout,
+         const size_t fstride,
+         const kiss_fft_cfg st,
+         size_t m
+         )
+{
+     size_t k=m;
+     const size_t m2 = 2*m;
+     kiss_fft_cpx *tw1,*tw2;
+     kiss_fft_cpx scratch[5];
+     kiss_fft_cpx epi3;
+     epi3 = st->twiddles[fstride*m];
+
+     tw1=tw2=st->twiddles;
+
+     do{
+         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
+
+         C_MUL(scratch[1],Fout[m] , *tw1);
+         C_MUL(scratch[2],Fout[m2] , *tw2);
+
+         C_ADD(scratch[3],scratch[1],scratch[2]);
+         C_SUB(scratch[0],scratch[1],scratch[2]);
+         tw1 += fstride;
+         tw2 += fstride*2;
+
+         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+
+         C_MULBYSCALAR( scratch[0] , epi3.i );
+
+         C_ADDTO(*Fout,scratch[3]);
+
+         Fout[m2].r = Fout[m].r + scratch[0].i;
+         Fout[m2].i = Fout[m].i - scratch[0].r;
+
+         Fout[m].r -= scratch[0].i;
+         Fout[m].i += scratch[0].r;
+
+         ++Fout;
+     }while(--k);
+}
+
+static void kf_bfly5(
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m
+        )
+{
+    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+    int u;
+    kiss_fft_cpx scratch[13];
+    kiss_fft_cpx * twiddles = st->twiddles;
+    kiss_fft_cpx *tw;
+    kiss_fft_cpx ya,yb;
+    ya = twiddles[fstride*m];
+    yb = twiddles[fstride*2*m];
+
+    Fout0=Fout;
+    Fout1=Fout0+m;
+    Fout2=Fout0+2*m;
+    Fout3=Fout0+3*m;
+    Fout4=Fout0+4*m;
+
+    tw=st->twiddles;
+    for ( u=0; u<m; ++u ) {
+        C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
+        scratch[0] = *Fout0;
+
+        C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+        C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+        C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+        C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+        C_ADD( scratch[7],scratch[1],scratch[4]);
+        C_SUB( scratch[10],scratch[1],scratch[4]);
+        C_ADD( scratch[8],scratch[2],scratch[3]);
+        C_SUB( scratch[9],scratch[2],scratch[3]);
+
+        Fout0->r += scratch[7].r + scratch[8].r;
+        Fout0->i += scratch[7].i + scratch[8].i;
+
+        scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+        scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+
+        scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
+        scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
+
+        C_SUB(*Fout1,scratch[5],scratch[6]);
+        C_ADD(*Fout4,scratch[5],scratch[6]);
+
+        scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+        scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+        scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
+        scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
+
+        C_ADD(*Fout2,scratch[11],scratch[12]);
+        C_SUB(*Fout3,scratch[11],scratch[12]);
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+    }
+}
+
+/* perform the butterfly for one stage of a mixed radix FFT */
+static void kf_bfly_generic(
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m,
+        int p
+        )
+{
+    int u,k,q1,q;
+    kiss_fft_cpx * twiddles = st->twiddles;
+    kiss_fft_cpx t;
+    int Norig = st->nfft;
+
+    CHECKBUF(scratchbuf,nscratchbuf,p);
+
+    for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+            scratchbuf[q1] = Fout[ k  ];
+            C_FIXDIV(scratchbuf[q1],p);
+            k += m;
+        }
+
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+            int twidx=0;
+            Fout[ k ] = scratchbuf[0];
+            for (q=1;q<p;++q ) {
+                twidx += fstride * k;
+                if (twidx>=Norig) twidx-=Norig;
+                C_MUL(t,scratchbuf[q] , twiddles[twidx] );
+                C_ADDTO( Fout[ k ] ,t);
+            }
+            k += m;
+        }
+    }
+}
+
+static
+void kf_work(
+        kiss_fft_cpx * Fout,
+        const kiss_fft_cpx * f,
+        const size_t fstride,
+        int in_stride,
+        int * factors,
+        const kiss_fft_cfg st
+        )
+{
+    kiss_fft_cpx * Fout_beg=Fout;
+    const int p=*factors++; /* the radix  */
+    const int m=*factors++; /* stage's fft length/p */
+    const kiss_fft_cpx * Fout_end = Fout + p*m;
+
+    if (m==1) {
+        do{
+            *Fout = *f;
+            f += fstride*in_stride;
+        }while(++Fout != Fout_end );
+    }else{
+        do{
+            kf_work( Fout , f, fstride*p, in_stride, factors,st);
+            f += fstride*in_stride;
+        }while( (Fout += m) != Fout_end );
+    }
+
+    Fout=Fout_beg;
+
+    switch (p) {
+        case 2: kf_bfly2(Fout,fstride,st,m); break;
+        case 3: kf_bfly3(Fout,fstride,st,m); break; 
+        case 4: kf_bfly4(Fout,fstride,st,m); break;
+        case 5: kf_bfly5(Fout,fstride,st,m); break; 
+        default: kf_bfly_generic(Fout,fstride,st,m,p); break;
+    }
+}
+
+/*  facbuf is populated by p1,m1,p2,m2, ...
+    where 
+    p[i] * m[i] = m[i-1]
+    m0 = n                  */
+static 
+void kf_factor(int n,int * facbuf)
+{
+    int p=4;
+    double floor_sqrt;
+    floor_sqrt = floor( sqrt((double)n) );
+
+    /*factor out powers of 4, powers of 2, then any remaining primes */
+    do {
+        while (n % p) {
+            switch (p) {
+                case 4: p = 2; break;
+                case 2: p = 3; break;
+                default: p += 2; break;
+            }
+            if (p > floor_sqrt)
+                p = n;          /* no more factors, skip to end */
+        }
+        n /= p;
+        *facbuf++ = p;
+        *facbuf++ = n;
+    } while (n > 1);
+}
+
+/*
+ *
+ * User-callable function to allocate all necessary storage space for the fft.
+ *
+ * The return value is a contiguous block of memory, allocated with malloc.  As such,
+ * It can be freed with free(), rather than a kiss_fft-specific function.
+ * */
+kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
+{
+    kiss_fft_cfg st=NULL;
+    size_t memneeded = sizeof(struct kiss_fft_state)
+        + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
+
+    if ( lenmem==NULL ) {
+        st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
+    }else{
+        if (mem != NULL && *lenmem >= memneeded)
+            st = (kiss_fft_cfg)mem;
+        *lenmem = memneeded;
+    }
+    if (st) {
+        int i;
+        st->nfft=nfft;
+        st->inverse = inverse_fft;
+
+        for (i=0;i<nfft;++i) {
+            const double pi=3.141592653589793238462643383279502884197169399375105820974944;
+            double phase = -2*pi*i / nfft;
+            if (st->inverse)
+                phase *= -1;
+            kf_cexp(st->twiddles+i, phase );
+        }
+
+        kf_factor(nfft,st->factors);
+    }
+    return st;
+}
+
+
+
+    
+void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
+{
+    if (fin == fout) {
+        CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
+        kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
+        memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
+    }else{
+        kf_work( fout, fin, 1,in_stride, st->factors,st );
+    }
+}
+
+void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+    kiss_fft_stride(cfg,fin,fout,1);
+}
+
+
+/* not really necessary to call, but if someone is doing in-place ffts, they may want to free the 
+   buffers from CHECKBUF
+ */ 
+void kiss_fft_cleanup(void)
+{
+    free(scratchbuf);
+    scratchbuf = NULL;
+    nscratchbuf=0;
+    free(tmpbuf);
+    tmpbuf=NULL;
+    ntmpbuf=0;
+}
+
+int kiss_fft_next_fast_size(int n)
+{
+    while(1) {
+        int m=n;
+        while ( (m%2) == 0 ) m/=2;
+        while ( (m%3) == 0 ) m/=3;
+        while ( (m%5) == 0 ) m/=5;
+        if (m<=1)
+            break; /* n is completely factorable by twos, threes, and fives */
+        n++;
+    }
+    return n;
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/kiss_fft.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/kiss_fft.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,121 @@
+#ifndef KISS_FFT_H
+#define KISS_FFT_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <memory.h>
+#ifndef __APPLE__
+#include <malloc.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ ATTENTION!
+ If you would like a :
+ -- a utility that will handle the caching of fft objects
+ -- real-only (no imaginary time component ) FFT
+ -- a multi-dimensional FFT
+ -- a command-line utility to perform ffts
+ -- a command-line utility to perform fast-convolution filtering
+
+ Then see kfc.h kiss_fftr.h kiss_fftnd.h fftutil.c kiss_fastfir.c
+  in the tools/ directory.
+*/
+
+#ifdef USE_SIMD
+# include <xmmintrin.h>
+# define kiss_fft_scalar __m128
+#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
+#else	
+#define KISS_FFT_MALLOC malloc
+#endif	
+
+
+#ifdef FIXED_POINT
+#include <sys/types.h>	
+# if (FIXED_POINT == 32)
+#  define kiss_fft_scalar int32_t
+# else	
+#  define kiss_fft_scalar int16_t
+# endif
+#else
+# ifndef kiss_fft_scalar
+/*  default is float */
+#   define kiss_fft_scalar float
+# endif
+#endif
+
+typedef struct {
+    kiss_fft_scalar r;
+    kiss_fft_scalar i;
+}kiss_fft_cpx;
+
+typedef struct kiss_fft_state* kiss_fft_cfg;
+
+/* 
+ *  kiss_fft_alloc
+ *  
+ *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
+ *
+ *  typical usage:      kiss_fft_cfg mycfg=kiss_fft_alloc(1024,0,NULL,NULL);
+ *
+ *  The return value from fft_alloc is a cfg buffer used internally
+ *  by the fft routine or NULL.
+ *
+ *  If lenmem is NULL, then kiss_fft_alloc will allocate a cfg buffer using malloc.
+ *  The returned value should be free()d when done to avoid memory leaks.
+ *  
+ *  The state can be placed in a user supplied buffer 'mem':
+ *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
+ *      then the function places the cfg in mem and the size used in *lenmem
+ *      and returns mem.
+ *  
+ *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
+ *      then the function returns NULL and places the minimum cfg 
+ *      buffer size in *lenmem.
+ * */
+
+kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem); 
+
+/*
+ * kiss_fft(cfg,in_out_buf)
+ *
+ * Perform an FFT on a complex input buffer.
+ * for a forward FFT,
+ * fin should be  f[0] , f[1] , ... ,f[nfft-1]
+ * fout will be   F[0] , F[1] , ... ,F[nfft-1]
+ * Note that each element is complex and can be accessed like
+    f[k].r and f[k].i
+ * */
+void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+
+/*
+ A more generic version of the above function. It reads its input from every Nth sample.
+ * */
+void kiss_fft_stride(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int fin_stride);
+
+/* If kiss_fft_alloc allocated a buffer, it is one contiguous 
+   buffer and can be simply free()d when no longer needed*/
+#define kiss_fft_free free
+
+/*
+ Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up 
+ your compiler output to call this before you exit.
+*/
+void kiss_fft_cleanup(void);
+	
+
+/*
+ * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
+ */
+int kiss_fft_next_fast_size(int n);
+
+#ifdef __cplusplus
+} 
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/kiss_fftr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/kiss_fftr.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,159 @@
+/*
+Copyright (c) 2003-2004, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "kiss_fftr.h"
+#include "_kiss_fft_guts.h"
+
+struct kiss_fftr_state{
+    kiss_fft_cfg substate;
+    kiss_fft_cpx * tmpbuf;
+    kiss_fft_cpx * super_twiddles;
+#ifdef USE_SIMD    
+    long pad;
+#endif    
+};
+
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
+{
+    int i;
+    kiss_fftr_cfg st = NULL;
+    size_t subsize, memneeded;
+
+    if (nfft & 1) {
+        fprintf(stderr,"Real FFT optimization must be even.\n");
+        return NULL;
+    }
+    nfft >>= 1;
+
+    kiss_fft_alloc (nfft, inverse_fft, NULL, &subsize);
+    memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
+
+    if (lenmem == NULL) {
+        st = (kiss_fftr_cfg) KISS_FFT_MALLOC (memneeded);
+    } else {
+        if (*lenmem >= memneeded)
+            st = (kiss_fftr_cfg) mem;
+        *lenmem = memneeded;
+    }
+    if (!st)
+        return NULL;
+
+    st->substate = (kiss_fft_cfg) (st + 1); /*just beyond kiss_fftr_state struct */
+    st->tmpbuf = (kiss_fft_cpx *) (((char *) st->substate) + subsize);
+    st->super_twiddles = st->tmpbuf + nfft;
+    kiss_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
+
+    for (i = 0; i < nfft; ++i) {
+        double phase =
+            -3.14159265358979323846264338327 * ((double) i / nfft + .5);
+        if (inverse_fft)
+            phase *= -1;
+        kf_cexp (st->super_twiddles+i,phase);
+    }
+    return st;
+}
+
+void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata)
+{
+    /* input buffer timedata is stored row-wise */
+    int k,ncfft;
+    kiss_fft_cpx fpnk,fpk,f1k,f2k,tw,tdc;
+
+    if ( st->substate->inverse) {
+        fprintf(stderr,"kiss fft usage error: improper alloc\n");
+        exit(1);
+    }
+
+    ncfft = st->substate->nfft;
+
+    /*perform the parallel fft of two real signals packed in real,imag*/
+    kiss_fft( st->substate , (const kiss_fft_cpx*)timedata, st->tmpbuf );
+    /* The real part of the DC element of the frequency spectrum in st->tmpbuf
+     * contains the sum of the even-numbered elements of the input time sequence
+     * The imag part is the sum of the odd-numbered elements
+     *
+     * The sum of tdc.r and tdc.i is the sum of the input time sequence. 
+     *      yielding DC of input time sequence
+     * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
+     *      yielding Nyquist bin of input time sequence
+     */
+ 
+    tdc.r = st->tmpbuf[0].r;
+    tdc.i = st->tmpbuf[0].i;
+    C_FIXDIV(tdc,2);
+    CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
+    CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
+    freqdata[0].r = tdc.r + tdc.i;
+    freqdata[ncfft].r = tdc.r - tdc.i;
+#ifdef USE_SIMD    
+    freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
+#else
+    freqdata[ncfft].i = freqdata[0].i = 0;
+#endif
+
+    for ( k=1;k <= ncfft/2 ; ++k ) {
+        fpk    = st->tmpbuf[k]; 
+        fpnk.r =   st->tmpbuf[ncfft-k].r;
+        fpnk.i = - st->tmpbuf[ncfft-k].i;
+        C_FIXDIV(fpk,2);
+        C_FIXDIV(fpnk,2);
+
+        C_ADD( f1k, fpk , fpnk );
+        C_SUB( f2k, fpk , fpnk );
+        C_MUL( tw , f2k , st->super_twiddles[k]);
+
+        freqdata[k].r = HALF_OF(f1k.r + tw.r);
+        freqdata[k].i = HALF_OF(f1k.i + tw.i);
+        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
+        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
+    }
+}
+
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
+{
+    /* input buffer timedata is stored row-wise */
+    int k, ncfft;
+
+    if (st->substate->inverse == 0) {
+        fprintf (stderr, "kiss fft usage error: improper alloc\n");
+        exit (1);
+    }
+
+    ncfft = st->substate->nfft;
+
+    st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
+    st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
+    C_FIXDIV(st->tmpbuf[0],2);
+
+    for (k = 1; k <= ncfft / 2; ++k) {
+        kiss_fft_cpx fk, fnkc, fek, fok, tmp;
+        fk = freqdata[k];
+        fnkc.r = freqdata[ncfft - k].r;
+        fnkc.i = -freqdata[ncfft - k].i;
+        C_FIXDIV( fk , 2 );
+        C_FIXDIV( fnkc , 2 );
+
+        C_ADD (fek, fk, fnkc);
+        C_SUB (tmp, fk, fnkc);
+        C_MUL (fok, tmp, st->super_twiddles[k]);
+        C_ADD (st->tmpbuf[k],     fek, fok);
+        C_SUB (st->tmpbuf[ncfft - k], fek, fok);
+#ifdef USE_SIMD        
+        st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#else
+        st->tmpbuf[ncfft - k].i *= -1;
+#endif
+    }
+    kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/kissfft/kiss_fftr.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/kissfft/kiss_fftr.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,46 @@
+#ifndef KISS_FTR_H
+#define KISS_FTR_H
+
+#include "kiss_fft.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    
+/* 
+ 
+ Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
+
+ 
+ 
+ */
+
+typedef struct kiss_fftr_state *kiss_fftr_cfg;
+
+
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
+/*
+ nfft must be even
+
+ If you don't care to allocate space, use mem = lenmem = NULL 
+*/
+
+
+void kiss_fftr(kiss_fftr_cfg cfg,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata);
+/*
+ input timedata has nfft scalar points
+ output freqdata has nfft/2+1 complex points
+*/
+
+void kiss_fftri(kiss_fftr_cfg cfg,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata);
+/*
+ input freqdata has  nfft/2+1 complex points
+ output timedata has nfft scalar points
+*/
+
+#define kiss_fftr_free free
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/pommier/neon_mathfun.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/pommier/neon_mathfun.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,301 @@
+/* NEON implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2011  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <arm_neon.h>
+
+typedef float32x4_t v4sf;  // vector of 4 float
+typedef uint32x4_t v4su;  // vector of 4 uint32
+typedef int32x4_t v4si;  // vector of 4 uint32
+
+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* natural logarithm computed for 4 simultaneous float 
+   return NaN for x <= 0
+*/
+v4sf log_ps(v4sf x) {
+  v4sf one = vdupq_n_f32(1);
+
+  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+  v4si ux = vreinterpretq_s32_f32(x);
+  
+  v4si emm0 = vshrq_n_s32(ux, 23);
+
+  /* keep only the fractional part */
+  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+  x = vreinterpretq_f32_s32(ux);
+
+  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+  v4sf e = vcvtq_f32_s32(emm0);
+
+  e = vaddq_f32(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+  x = vsubq_f32(x, one);
+  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+  x = vaddq_f32(x, tmp);
+
+  v4sf z = vmulq_f32(x,x);
+
+  v4sf y = vdupq_n_f32(c_cephes_log_p0);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+  y = vmulq_f32(y, x);
+
+  y = vmulq_f32(y, z);
+  
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+  y = vaddq_f32(y, tmp);
+
+
+  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+  y = vsubq_f32(y, tmp);
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+  x = vaddq_f32(x, y);
+  x = vaddq_f32(x, tmp);
+  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+  return x;
+}
+
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 float at once */
+v4sf exp_ps(v4sf x) {
+  v4sf tmp, fx;
+
+  v4sf one = vdupq_n_f32(1);
+  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, substract 1 */
+  v4su mask = vcgtq_f32(tmp, fx);    
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  static const float32_t cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
+  v4sf y = vld1q_dup_f32(cephes_exp_p+0);
+  v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); 
+  v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); 
+  v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); 
+  v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); 
+  v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
+
+  y = vmulq_f32(y, x);
+  z = vmulq_f32(x,x);
+  y = vaddq_f32(y, c1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c5);
+  
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, one);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+  mm = vshlq_n_s32(mm, 23);
+  v4sf pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1  8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0  2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2  4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Note also that when you compute sin(x), cos(x) is available at
+   almost no extra price so both sin_ps and cos_ps make use of
+   sincos_ps..
+  */
+void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
+  v4sf xmm1, xmm2, xmm3, y;
+
+  v4su emm2;
+  
+  v4su sign_mask_sin, sign_mask_cos;
+  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+  x = vabsq_f32(x);
+
+  /* scale by 4/Pi */
+  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+  /* store the integer part of y in mm0 */
+  emm2 = vcvtq_u32_f32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+  y = vcvtq_f32_u32(emm2);
+
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+  xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+  xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+  x = vaddq_f32(x, xmm1);
+  x = vaddq_f32(x, xmm2);
+  x = vaddq_f32(x, xmm3);
+
+  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) in y1, 
+     and the second polynom      (Pi/4 <= x <= 0) in y2 */
+  v4sf z = vmulq_f32(x,x);
+  v4sf y1, y2;
+
+  y1 = vmulq_n_f32(z, c_coscof_p0);
+  y2 = vmulq_n_f32(z, c_sincof_p0);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, x);
+  y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+  y2 = vaddq_f32(y2, x);
+  y1 = vaddq_f32(y1, vdupq_n_f32(1));
+
+  /* select the correct result from the two polynoms */  
+  v4sf ys = vbslq_f32(poly_mask, y1, y2);
+  v4sf yc = vbslq_f32(poly_mask, y2, y1);
+  *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+  *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+
+v4sf sin_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ysin;
+}
+
+v4sf cos_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ycos;
+}
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/pommier/sse_mathfun.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/pommier/sse_mathfun.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,766 @@
+
+#ifndef _POMMIER_SSE_MATHFUN_H_
+#define _POMMIER_SSE_MATHFUN_H_
+
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2 the
+   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* declare some SSE constants -- why can't I figure a better way to do that? */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+#if defined (__MINGW32__)
+
+/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
+   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
+   broken on my mingw gcc 3.4.5 ...
+
+   Note that the bug on _mm_cmp* does occur only at -O0 optimization level
+*/
+
+inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
+	asm (
+			"movhlps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;                                 }
+#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
+#define _mm_movehl_ps my_movehl_ps
+
+inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpltps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+                  }
+inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpnleps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpeqps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+#warning "redefined _mm_cmpxx_ps functions..."
+#define _mm_cmplt_ps my_cmplt_ps
+#define _mm_cmpgt_ps my_cmpgt_ps
+#define _mm_cmpeq_ps my_cmpeq_ps
+#endif
+
+#ifndef USE_SSE2
+typedef union xmm_mm_union {
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // USE_SSE2
+
+/* natural logarithm computed for 4 simultaneous float 
+   return NaN for x <= 0
+*/
+v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23);
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+#endif
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* now e=mm0:mm1 contain the really base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  v4sf tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+
+  v4sf z = _mm_mul_ps(x,x);
+
+  v4sf y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp);
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f);
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+v4sf exp_ps(v4sf x) {
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if greater, substract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one);
+  fx = _mm_sub_ps(tmp, mask);
+
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); 
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  v4sf pow2n; 
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  v4sf pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
+   it runs also on old athlons XPs and the pentium III of your grand
+   mother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+v4sf sin_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+  //printf("plop:"); print4(y); 
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynom selection mask */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* almost the same as sin_ps */
+v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+  
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynom selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynom selection mask for the sine*/
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynom selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/rubberband-c.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/rubberband-c.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,169 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "rubberband/rubberband-c.h"
+#include "rubberband/RubberBandStretcher.h"
+
+struct RubberBandState_
+{
+    RubberBand::RubberBandStretcher *m_s;
+};
+
+RubberBandState rubberband_new(unsigned int sampleRate,
+                               unsigned int channels,
+                               RubberBandOptions options,
+                               double initialTimeRatio,
+                               double initialPitchScale)
+{
+    RubberBandState_ *state = new RubberBandState_();
+    state->m_s = new RubberBand::RubberBandStretcher
+        (sampleRate, channels, options,
+         initialTimeRatio, initialPitchScale);
+    return state;
+}
+
+void rubberband_delete(RubberBandState state)
+{
+    delete state->m_s;
+    delete state;
+}
+
+void rubberband_reset(RubberBandState state)
+{
+    state->m_s->reset();
+}
+
+void rubberband_set_time_ratio(RubberBandState state, double ratio)
+{
+    state->m_s->setTimeRatio(ratio);
+}
+
+void rubberband_set_pitch_scale(RubberBandState state, double scale)
+{
+    state->m_s->setPitchScale(scale);
+}
+
+double rubberband_get_time_ratio(const RubberBandState state) 
+{
+    return state->m_s->getTimeRatio();
+}
+
+double rubberband_get_pitch_scale(const RubberBandState state)
+{
+    return state->m_s->getPitchScale();
+}
+
+unsigned int rubberband_get_latency(const RubberBandState state) 
+{
+    return state->m_s->getLatency();
+}
+
+void rubberband_set_transients_option(RubberBandState state, RubberBandOptions options)
+{
+    state->m_s->setTransientsOption(options);
+}
+
+void rubberband_set_detector_option(RubberBandState state, RubberBandOptions options)
+{
+    state->m_s->setDetectorOption(options);
+}
+
+void rubberband_set_phase_option(RubberBandState state, RubberBandOptions options)
+{
+    state->m_s->setPhaseOption(options);
+}
+
+void rubberband_set_formant_option(RubberBandState state, RubberBandOptions options)
+{
+    state->m_s->setFormantOption(options);
+}
+
+void rubberband_set_pitch_option(RubberBandState state, RubberBandOptions options)
+{
+    state->m_s->setPitchOption(options);
+}
+
+void rubberband_set_expected_input_duration(RubberBandState state, unsigned int samples)
+{
+    state->m_s->setExpectedInputDuration(samples);
+}
+
+unsigned int rubberband_get_samples_required(const RubberBandState state)
+{
+    return state->m_s->getSamplesRequired();
+}
+
+void rubberband_set_max_process_size(RubberBandState state, unsigned int samples)
+{
+    state->m_s->setMaxProcessSize(samples);
+}
+
+void rubberband_set_key_frame_map(RubberBandState state, unsigned int keyframecount, unsigned int *from, unsigned int *to)
+{
+    std::map<size_t, size_t> kfm;
+    for (unsigned int i = 0; i < keyframecount; ++i) {
+        kfm[from[i]] = to[i];
+    }
+    state->m_s->setKeyFrameMap(kfm);
+}
+
+void rubberband_study(RubberBandState state, const float *const *input, unsigned int samples, int final)
+{
+    state->m_s->study(input, samples, final != 0);
+}
+
+void rubberband_process(RubberBandState state, const float *const *input, unsigned int samples, int final)
+{
+    state->m_s->process(input, samples, final != 0);
+}
+
+int rubberband_available(const RubberBandState state)
+{
+    return state->m_s->available();
+}
+
+unsigned int rubberband_retrieve(const RubberBandState state, float *const *output, unsigned int samples)
+{
+    return state->m_s->retrieve(output, samples);
+}
+
+unsigned int rubberband_get_channel_count(const RubberBandState state)
+{
+    return state->m_s->getChannelCount();
+}
+
+void rubberband_calculate_stretch(RubberBandState state)
+{
+    state->m_s->calculateStretch();
+}
+
+void rubberband_set_debug_level(RubberBandState state, int level)
+{
+    state->m_s->setDebugLevel(level);
+}
+
+void rubberband_set_default_debug_level(int level)
+{
+    RubberBand::RubberBandStretcher::setDefaultDebugLevel(level);
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/speex/COPYING
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/speex/COPYING	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,35 @@
+Copyright 2002-2007 	Xiph.org Foundation
+Copyright 2002-2007 	Jean-Marc Valin
+Copyright 2005-2007	Analog Devices Inc.
+Copyright 2005-2007	Commonwealth Scientific and Industrial Research 
+                        Organisation (CSIRO)
+Copyright 1993, 2002, 2006 David Rowe
+Copyright 2003 		EpicGames
+Copyright 1992-1994	Jutta Degener, Carsten Bormann
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/speex/resample.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/speex/resample.c	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1264 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+/* Copyright (C) 2007 Jean-Marc Valin
+
+   File: resample.c
+   Arbitrary resampling code
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+   The design goals of this code are:
+      - Very fast algorithm
+      - SIMD-friendly algorithm
+      - Low memory requirement
+      - Good *perceptual* quality (and not best SNR)
+
+   Warning: This resampler is relatively new. Although I think I got rid of
+   all the major bugs and I don't expect the API to change anymore, there
+   may be something I've missed. So use with caution.
+
+   This algorithm is based on this original resampling algorithm:
+   Smith, Julius O. Digital Audio Resampling Home Page
+   Center for Computer Research in Music and Acoustics (CCRMA),
+   Stanford University, 2007.
+   Web published at http://www-ccrma.stanford.edu/~jos/resample/.
+
+   There is one main difference, though. This resampler uses cubic
+   interpolation instead of linear interpolation in the above paper. This
+   makes the table much smaller and makes it possible to compute that table
+   on a per-stream basis. In turn, being able to tweak the table for each
+   stream makes it possible to both reduce complexity on simple ratios
+   (e.g. 2/3), and get rid of the rounding operations in the inner loop.
+   The latter both reduces CPU time and makes the algorithm more SIMD-friendly.
+*/
+
+/*
+   NOTE: This code has been cut down and reformatted by Chris Cannam
+   for personal reading preference, and for use in the Rubber Band
+   time stretching and pitch shifting library.  If you have problems
+   with this code, cast suspicion on the butchering it has undergone;
+   it's probably my fault.  If you want a properly functioning
+   version, please go for the original Speex code first.  I haven't
+   made any substantial changes to this code, I've just made it less
+   generally useful.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <string.h>
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#endif
+
+// Simple allocators with a fixed minimum, to avoid reallocation if
+// the size changes but remains smaller than that.  The system alloc
+// functions no doubt do exactly the same thing for some value
+// probably not too distant from ours, but we want the certainty.
+
+#define ALLOC_MINIMUM 4096
+
+static void *speex_alloc (int count, int size)
+{
+#ifdef HAVE_IPP
+    void *rv;
+#endif
+
+//	fprintf(stderr, "speex_alloc(%d,%d)\n", count, size);
+	if (count * size < ALLOC_MINIMUM) {
+//		fprintf(stderr, "upgrading count from %d to %d\n", count, ALLOC_MINIMUM / size);
+        count = ALLOC_MINIMUM / size;
+    }
+
+#ifdef HAVE_IPP
+    if (size == sizeof(float) && size == 4) { // or sizeof(int32) or whatever, doesn't matter
+        rv = ippsMalloc_32f(count);
+    } else if (size == sizeof(double) && size == 8) {
+        rv = ippsMalloc_64f(count);
+    } else {
+        rv = ippsMalloc_8u(count * size);
+    }
+//	fprintf(stderr, "allocated at %p; now setting %d bytes to zero\n", rv, count*size);
+    memset(rv, count * size, 0);
+//	fprintf(stderr, "returning %p\n",rv);
+    return rv;
+#else
+    return calloc(count, size);
+#endif
+}
+
+static void speex_free (void *ptr) 
+{
+//	fprintf(stderr,"speex_free(%p)\n", ptr);
+#ifdef HAVE_IPP
+  	ippsFree(ptr);
+#else
+    free(ptr);
+#endif
+}
+
+static void *speex_realloc (void *ptr, int oldcount, int newcount, int size)
+{
+#ifdef HAVE_IPP
+	void *newptr;
+#endif
+
+//	fprintf(stderr,"speex_realloc(%p,%d,%d,%d)\n", ptr, oldcount, newcount, size);
+
+    if (newcount * size < ALLOC_MINIMUM) {
+//		fprintf(stderr,"returning %p\n",ptr);
+        return ptr;
+    }
+//    fprintf(stderr, "NOTE: speex_realloc: actual reallocation happening (newcount = %d, size = %d)\n", newcount, size);
+
+#ifdef HAVE_IPP
+    newptr = speex_alloc(newcount, size);
+    if (ptr && oldcount > 0) {
+        int copy = newcount;
+        if (oldcount < copy) copy = oldcount;
+        memcpy(newptr, ptr, copy * size);
+    }
+    speex_free(ptr);
+//	fprintf(stderr,"returning %p\n", ptr);
+    return newptr;
+#else
+    return realloc(ptr, newcount * size);
+#endif
+}
+
+#include "speex_resampler.h"
+
+#include <math.h>
+
+#ifndef M_PI
+#define M_PI 3.14159263
+#endif
+
+#define FILTER_SIZE 64
+#define OVERSAMPLE 8
+
+#define IMAX(a,b) ((a) > (b) ? (a) : (b))
+#define IMIN(a,b) ((a) < (b) ? (a) : (b))
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+
+typedef int (*resampler_basic_func)(SpeexResamplerState *, spx_uint32_t , const float *, spx_uint32_t *, float *, spx_uint32_t *);
+
+struct SpeexResamplerState_ {
+    spx_uint32_t in_rate;
+    spx_uint32_t out_rate;
+    spx_uint32_t num_rate;
+    spx_uint32_t den_rate;
+
+    int    quality;
+    spx_uint32_t nb_channels;
+    spx_uint32_t filt_len;
+    spx_uint32_t mem_alloc_size;
+    int          int_advance;
+    int          frac_advance;
+    float  cutoff;
+    spx_uint32_t oversample;
+    int          initialised;
+    int          started;
+
+    /* These are per-channel */
+    spx_int32_t  *last_sample;
+    spx_uint32_t *samp_frac_num;
+    spx_uint32_t *magic_samples;
+
+    float *mem;
+    float *sinc_table;
+    spx_uint32_t sinc_table_length;
+    spx_uint32_t sinc_table_alloc;
+    resampler_basic_func resampler_ptr;
+
+    int    in_stride;
+    int    out_stride;
+} ;
+
+static double kaiser12_table[68] = {
+    0.99859849, 1.00000000, 0.99859849, 0.99440475, 0.98745105, 0.97779076,
+    0.96549770, 0.95066529, 0.93340547, 0.91384741, 0.89213598, 0.86843014,
+    0.84290116, 0.81573067, 0.78710866, 0.75723148, 0.72629970, 0.69451601,
+    0.66208321, 0.62920216, 0.59606986, 0.56287762, 0.52980938, 0.49704014,
+    0.46473455, 0.43304576, 0.40211431, 0.37206735, 0.34301800, 0.31506490,
+    0.28829195, 0.26276832, 0.23854851, 0.21567274, 0.19416736, 0.17404546,
+    0.15530766, 0.13794294, 0.12192957, 0.10723616, 0.09382272, 0.08164178,
+    0.07063950, 0.06075685, 0.05193064, 0.04409466, 0.03718069, 0.03111947,
+    0.02584161, 0.02127838, 0.01736250, 0.01402878, 0.01121463, 0.00886058,
+    0.00691064, 0.00531256, 0.00401805, 0.00298291, 0.00216702, 0.00153438,
+    0.00105297, 0.00069463, 0.00043489, 0.00025272, 0.00013031, 0.0000527734,
+    0.00001000, 0.00000000
+};
+
+static double kaiser10_table[36] = {
+    0.99537781, 1.00000000, 0.99537781, 0.98162644, 0.95908712, 0.92831446,
+    0.89005583, 0.84522401, 0.79486424, 0.74011713, 0.68217934, 0.62226347,
+    0.56155915, 0.50119680, 0.44221549, 0.38553619, 0.33194107, 0.28205962,
+    0.23636152, 0.19515633, 0.15859932, 0.12670280, 0.09935205, 0.07632451,
+    0.05731132, 0.04193980, 0.02979584, 0.02044510, 0.01345224, 0.00839739,
+    0.00488951, 0.00257636, 0.00115101, 0.00035515, 0.00000000, 0.00000000
+};
+
+static double kaiser8_table[36] = {
+    0.99635258, 1.00000000, 0.99635258, 0.98548012, 0.96759014, 0.94302200,
+    0.91223751, 0.87580811, 0.83439927, 0.78875245, 0.73966538, 0.68797126,
+    0.63451750, 0.58014482, 0.52566725, 0.47185369, 0.41941150, 0.36897272,
+    0.32108304, 0.27619388, 0.23465776, 0.19672670, 0.16255380, 0.13219758,
+    0.10562887, 0.08273982, 0.06335451, 0.04724088, 0.03412321, 0.02369490,
+    0.01563093, 0.00959968, 0.00527363, 0.00233883, 0.00050000, 0.00000000
+};
+
+static double kaiser6_table[36] = {
+    0.99733006, 1.00000000, 0.99733006, 0.98935595, 0.97618418, 0.95799003,
+    0.93501423, 0.90755855, 0.87598009, 0.84068475, 0.80211977, 0.76076565,
+    0.71712752, 0.67172623, 0.62508937, 0.57774224, 0.53019925, 0.48295561,
+    0.43647969, 0.39120616, 0.34752997, 0.30580127, 0.26632152, 0.22934058,
+    0.19505503, 0.16360756, 0.13508755, 0.10953262, 0.08693120, 0.06722600,
+    0.05031820, 0.03607231, 0.02432151, 0.01487334, 0.00752000, 0.00000000
+};
+
+struct FuncDef {
+    double *table;
+    int oversample;
+};
+
+static struct FuncDef _KAISER12 = {kaiser12_table, 64};
+#define KAISER12 (&_KAISER12)
+static struct FuncDef _KAISER10 = {kaiser10_table, 32};
+#define KAISER10 (&_KAISER10)
+static struct FuncDef _KAISER8 = {kaiser8_table, 32};
+#define KAISER8 (&_KAISER8)
+static struct FuncDef _KAISER6 = {kaiser6_table, 32};
+#define KAISER6 (&_KAISER6)
+
+struct QualityMapping {
+    int base_length;
+    int oversample;
+    float downsample_bandwidth;
+    float upsample_bandwidth;
+
+    struct FuncDef *window_func;
+};
+
+
+/* This table maps conversion quality to internal parameters. There are two
+   reasons that explain why the up-sampling bandwidth is larger than the
+   down-sampling bandwidth:
+   1) When up-sampling, we can assume that the spectrum is already attenuated
+      close to the Nyquist rate (from an A/D or a previous resampling filter)
+   2) Any aliasing that occurs very close to the Nyquist rate will be masked
+      by the sinusoids/noise just below the Nyquist rate (guaranteed only for
+      up-sampling).
+*/
+
+static const struct QualityMapping quality_map[11] = {
+   {  8,  4, 0.830f, 0.860f, KAISER6 }, /* Q0 */
+   { 16,  4, 0.850f, 0.880f, KAISER6 }, /* Q1 */
+   { 32,  4, 0.882f, 0.910f, KAISER6 }, /* Q2 */  /* 82.3% cutoff ( ~60 dB stop) 6  */
+   { 48,  8, 0.895f, 0.917f, KAISER8 }, /* Q3 */  /* 84.9% cutoff ( ~80 dB stop) 8  */
+   { 64,  8, 0.921f, 0.940f, KAISER8 }, /* Q4 */  /* 88.7% cutoff ( ~80 dB stop) 8  */
+   { 80, 16, 0.922f, 0.940f, KAISER10}, /* Q5 */  /* 89.1% cutoff (~100 dB stop) 10 */
+   { 96, 16, 0.940f, 0.945f, KAISER10}, /* Q6 */  /* 91.5% cutoff (~100 dB stop) 10 */
+   {128, 16, 0.950f, 0.950f, KAISER10}, /* Q7 */  /* 93.1% cutoff (~100 dB stop) 10 */
+   {160, 16, 0.960f, 0.960f, KAISER10}, /* Q8 */  /* 94.5% cutoff (~100 dB stop) 10 */
+   {192, 32, 0.968f, 0.968f, KAISER12}, /* Q9 */  /* 95.5% cutoff (~100 dB stop) 10 */
+   {256, 32, 0.975f, 0.975f, KAISER12}, /* Q10 */ /* 96.6% cutoff (~100 dB stop) 10 */
+};
+/*8,24,40,56,80,104,128,160,200,256,320*/
+
+static double compute_func(float x, struct FuncDef *func)
+{
+    float y, frac;
+    double interp[4];
+    int ind;
+
+    y = x * func->oversample;
+    ind = (int)floor(y);
+    frac = (y - ind);
+
+    /* CSE with handle the repeated powers */
+    interp[3] =  -0.1666666667 * frac + 0.1666666667 * (frac * frac * frac);
+    interp[2] = frac + 0.5 * (frac * frac) - 0.5 * (frac * frac * frac);
+    interp[0] = -0.3333333333 * frac + 0.5 * (frac * frac) - 0.1666666667 * (frac * frac * frac);
+
+    /* Just to make sure we don't have rounding problems */
+    interp[1] = 1.f - interp[3] - interp[2] - interp[0];
+
+    /*sum = frac*accum[1] + (1-frac)*accum[2];*/
+    return 
+	interp[0]*func->table[ind] + interp[1]*func->table[ind+1] +
+	interp[2]*func->table[ind+2] + interp[3]*func->table[ind+3];
+}
+
+/* The slow way of computing a sinc for the table. Should improve that some day */
+static float sinc(float cutoff, float x, int N, struct FuncDef *window_func)
+{
+    float xx = x * cutoff;
+
+    if (fabsf(x) < 1e-6)
+        return cutoff;
+    else if (fabsf(x) > .5*N)
+        return 0;
+
+    /*FIXME: Can it really be any slower than this? */
+    return cutoff*sin(M_PI*xx) / (M_PI*xx)
+	* compute_func(fabs(2.*x / N), window_func);
+}
+
+static void cubic_coef(float frac, float interp[4])
+{
+    /* Compute interpolation coefficients. I'm not sure whether this
+    corresponds to cubic interpolation but I know it's MMSE-optimal on
+    a sinc */
+
+    interp[0] =  -0.16667f * frac + 0.16667f * frac * frac * frac;
+    interp[1] = frac + 0.5f * frac * frac - 0.5f * frac * frac * frac;
+    interp[3] = -0.33333f * frac + 0.5f * frac * frac - 0.16667f * frac * frac * frac;
+
+    /* Just to make sure we don't have rounding problems */
+    interp[2] = 1. - interp[0] - interp[1] - interp[3];
+}
+
+static int resampler_basic_direct_single(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    int N = st->filt_len;
+    int out_sample = 0;
+    float *mem;
+    int last_sample = st->last_sample[channel_index];
+    unsigned int samp_frac_num = st->samp_frac_num[channel_index];
+
+    mem = st->mem + channel_index * st->mem_alloc_size;
+
+    while (!(last_sample >= (int)*in_len || out_sample >= (int)*out_len)) {
+
+        int j;
+        float sum = 0;
+
+        /* We already have all the filter coefficients pre-computed in the table */
+        const float *ptr;
+
+        for (j = 0; last_sample - N + 1 + j < 0; j++) {
+            sum += ((float)(mem[last_sample+j]) *
+		    (float)(st->sinc_table[samp_frac_num*st->filt_len+j]));
+        }
+
+        /* Do the new part */
+        if (in != NULL) {
+
+            ptr = in + st->in_stride * (last_sample - N + 1 + j);
+
+            for (; j < N; j++) {
+                sum += ((float)(*ptr) *
+			(float)(st->sinc_table[samp_frac_num*st->filt_len+j]));
+                ptr += st->in_stride;
+            }
+        }
+
+        *out = (sum);
+
+        out += st->out_stride;
+        out_sample++;
+        last_sample += st->int_advance;
+        samp_frac_num += st->frac_advance;
+
+        if (samp_frac_num >= st->den_rate) {
+            samp_frac_num -= st->den_rate;
+            last_sample++;
+        }
+    }
+
+    st->last_sample[channel_index] = last_sample;
+
+    st->samp_frac_num[channel_index] = samp_frac_num;
+    return out_sample;
+}
+
+/* This is the same as the previous function, except with a double-precision accumulator */
+static int resampler_basic_direct_double(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    int N = st->filt_len;
+    int out_sample = 0;
+    float *mem;
+    int last_sample = st->last_sample[channel_index];
+    unsigned int samp_frac_num = st->samp_frac_num[channel_index];
+
+    mem = st->mem + channel_index * st->mem_alloc_size;
+
+    while (!(last_sample >= (int)*in_len || out_sample >= (int)*out_len)) {
+
+        int j;
+        double sum = 0;
+
+        /* We already have all the filter coefficients pre-computed in
+         * the table */
+        const float *ptr;
+
+        for (j = 0; last_sample - N + 1 + j < 0; j++) {
+            sum += ((float)(mem[last_sample+j]) *
+		    (float)((double)st->sinc_table[samp_frac_num*st->filt_len+j]));
+        }
+
+        /* Do the new part */
+        if (in != NULL) {
+            ptr = in + st->in_stride * (last_sample - N + 1 + j);
+
+            for (; j < N; j++) {
+                sum += ((float)(*ptr) *
+			(float)((double)st->sinc_table[samp_frac_num*st->filt_len+j]));
+                ptr += st->in_stride;
+            }
+        }
+
+        *out = sum;
+
+        out += st->out_stride;
+        out_sample++;
+        last_sample += st->int_advance;
+        samp_frac_num += st->frac_advance;
+
+        if (samp_frac_num >= st->den_rate) {
+            samp_frac_num -= st->den_rate;
+            last_sample++;
+        }
+    }
+
+    st->last_sample[channel_index] = last_sample;
+
+    st->samp_frac_num[channel_index] = samp_frac_num;
+    return out_sample;
+}
+
+static int resampler_basic_interpolate_single(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    int N = st->filt_len;
+    int out_sample = 0;
+    float *mem;
+    int last_sample = st->last_sample[channel_index];
+    unsigned int samp_frac_num = st->samp_frac_num[channel_index];
+
+    mem = st->mem + channel_index * st->mem_alloc_size;
+
+    while (!(last_sample >= (int)*in_len || out_sample >= (int)*out_len)) {
+
+        int j;
+        float sum = 0;
+
+        /* We need to interpolate the sinc filter */
+        float accum[4] = {0.f, 0.f, 0.f, 0.f};
+        float interp[4];
+        const float *ptr;
+        int offset;
+        float frac;
+
+        offset = samp_frac_num * st->oversample / st->den_rate;
+
+        frac = ((float)((samp_frac_num * st->oversample) % st->den_rate))
+	    / st->den_rate;
+
+        /* This code is written like this to make it easy to optimise
+	 * with SIMD.  For most DSPs, it would be best to split the
+	 * loops in two because most DSPs have only two
+	 * accumulators */
+
+        for (j = 0; last_sample - N + 1 + j < 0; j++) {
+
+            float curr_mem = mem[last_sample+j];
+
+            accum[0] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset - 2]));
+            accum[1] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset - 1]));
+            accum[2] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset]));
+            accum[3] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset + 1]));
+        }
+
+        if (in != NULL) {
+
+            ptr = in + st->in_stride * (last_sample - N + 1 + j);
+
+            /* Do the new part */
+            for (; j < N; j++) {
+
+                float curr_in = *ptr;
+                ptr += st->in_stride;
+
+                accum[0] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset - 2]));
+                accum[1] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset - 1]));
+                accum[2] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset]));
+                accum[3] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset + 1]));
+            }
+        }
+
+        cubic_coef(frac, interp);
+
+        sum =
+	    ((interp[0]) * (accum[0])) +
+	    ((interp[1]) * (accum[1])) +
+	    ((interp[2]) * (accum[2])) +
+	    ((interp[3]) * (accum[3]));
+
+        *out = (sum);
+        out += st->out_stride;
+        out_sample++;
+        last_sample += st->int_advance;
+        samp_frac_num += st->frac_advance;
+
+        if (samp_frac_num >= st->den_rate) {
+            samp_frac_num -= st->den_rate;
+            last_sample++;
+        }
+    }
+
+    st->last_sample[channel_index] = last_sample;
+    st->samp_frac_num[channel_index] = samp_frac_num;
+    return out_sample;
+}
+
+/* This is the same as the previous function, except with a
+ * double-precision accumulator */
+static int resampler_basic_interpolate_double(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len) 
+{
+    int N = st->filt_len;
+    int out_sample = 0;
+    float *mem;
+    int last_sample = st->last_sample[channel_index];
+    unsigned int samp_frac_num = st->samp_frac_num[channel_index];
+
+    mem = st->mem + channel_index * st->mem_alloc_size;
+
+    while (!(last_sample >= (int)*in_len || out_sample >= (int)*out_len)) {
+
+        int j;
+        float sum = 0;
+
+        /* We need to interpolate the sinc filter */
+        double accum[4] = {0.f, 0.f, 0.f, 0.f};
+        float interp[4];
+        const float *ptr;
+        float alpha = ((float)samp_frac_num) / st->den_rate;
+        int offset = samp_frac_num * st->oversample / st->den_rate;
+        float frac = alpha * st->oversample - offset;
+
+        /* This code is written like this to make it easy to optimise
+	 * with SIMD.  For most DSPs, it would be best to split the
+	 * loops in two because most DSPs have only two
+	 * accumulators */
+
+        for (j = 0; last_sample - N + 1 + j < 0; j++) {
+
+            double curr_mem = mem[last_sample + j];
+
+            accum[0] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset - 2]));
+            accum[1] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset - 1]));
+            accum[2] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset]));
+            accum[3] += ((float)(curr_mem) *
+                         (float)(st->sinc_table
+                                 [4 + (j+1)*st->oversample - offset + 1]));
+        }
+
+        if (in != NULL) {
+
+            ptr = in + st->in_stride * (last_sample - N + 1 + j);
+
+            /* Do the new part */
+            for (; j < N; j++) {
+
+                double curr_in = *ptr;
+                ptr += st->in_stride;
+
+                accum[0] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset - 2]));
+                accum[1] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset - 1]));
+                accum[2] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset]));
+                accum[3] += ((float)(curr_in) *
+                             (float)(st->sinc_table
+                                     [4 + (j+1)*st->oversample - offset + 1]));
+            }
+        }
+
+        cubic_coef(frac, interp);
+
+        sum =
+	    interp[0] * accum[0] +
+	    interp[1] * accum[1] +
+	    interp[2] * accum[2] +
+	    interp[3] * accum[3];
+
+        *out = (sum);
+        out += st->out_stride;
+        out_sample++;
+        last_sample += st->int_advance;
+        samp_frac_num += st->frac_advance;
+
+        if (samp_frac_num >= st->den_rate) {
+            samp_frac_num -= st->den_rate;
+            last_sample++;
+        }
+    }
+
+    st->last_sample[channel_index] = last_sample;
+    st->samp_frac_num[channel_index] = samp_frac_num;
+
+    return out_sample;
+}
+
+static void update_filter(SpeexResamplerState *st)
+{
+    unsigned int old_length;
+
+    /*   fprintf(stderr, "update_filter\n"); */
+
+    old_length = st->filt_len;
+    st->oversample = quality_map[st->quality].oversample;
+    st->filt_len = quality_map[st->quality].base_length;
+
+    if (st->num_rate > st->den_rate) {
+
+        /* down-sampling */
+        st->cutoff = quality_map[st->quality].downsample_bandwidth
+            * st->den_rate / st->num_rate;
+
+        st->filt_len = (unsigned int)
+            ceil(st->filt_len * ((double)st->num_rate / (double)st->den_rate));
+
+        /* Round down to make sure we have a multiple of 4 */
+        st->filt_len &= (~0x3);
+
+        if (2*st->den_rate < st->num_rate)
+            st->oversample >>= 1;
+
+        if (4*st->den_rate < st->num_rate)
+            st->oversample >>= 1;
+
+        if (8*st->den_rate < st->num_rate)
+            st->oversample >>= 1;
+
+        if (16*st->den_rate < st->num_rate)
+            st->oversample >>= 1;
+
+        if (st->oversample < 1)
+            st->oversample = 1;
+
+    } else {
+
+        /* up-sampling */
+        st->cutoff = quality_map[st->quality].upsample_bandwidth;
+    }
+
+    /* Choose the resampling type that requires the least amount of memory */
+
+    if (st->den_rate <= st->oversample) {
+
+        unsigned int i;
+
+        if (!st->sinc_table) {
+
+            st->sinc_table = (float *)speex_alloc
+                (st->filt_len * st->den_rate, sizeof(float));
+
+	} else if (st->sinc_table_alloc < st->filt_len*st->den_rate) {
+
+//		fprintf(stderr,"sinc_table=%p\n",st->sinc_table);
+            st->sinc_table = (float *)speex_realloc
+                (st->sinc_table, st->sinc_table_alloc,
+                 st->filt_len * st->den_rate, sizeof(float));
+            st->sinc_table_alloc = st->filt_len * st->den_rate;
+        }
+
+        for (i = 0; i < st->den_rate; i++) {
+
+            int j;
+
+            for (j = 0; j < st->filt_len; j++) {
+                st->sinc_table[i*st->filt_len+j] = sinc
+                    (st->cutoff,
+                     ((j - (int)st->filt_len / 2 + 1) - ((float)i) / st->den_rate), 
+                     st->filt_len,
+                     quality_map[st->quality].window_func);
+            }
+        }
+
+        if (st->quality > 8) {
+            st->resampler_ptr = resampler_basic_direct_double;
+        } else {
+            st->resampler_ptr = resampler_basic_direct_single;
+        }
+
+        /*      fprintf (stderr, "resampler uses direct sinc table and normalised cutoff %f\n", st->cutoff); */
+
+    } else {
+
+        int i;
+
+        if (!st->sinc_table) {
+
+            st->sinc_table = (float *)speex_alloc
+                ((st->filt_len * st->oversample + 8),  sizeof(float));
+
+	} else if (st->sinc_table_alloc < st->filt_len*st->oversample + 8) {
+
+		//fprintf(stderr,"sinc_table=%p\n",st->sinc_table);
+            st->sinc_table = (float *)speex_realloc
+                (st->sinc_table, st->sinc_table_alloc,
+                 (st->filt_len * st->oversample + 8), sizeof(float));
+            st->sinc_table_alloc = st->filt_len * st->oversample + 8;
+        }
+
+        for (i = -4; i < (int)(st->oversample * st->filt_len + 4); i++) {
+            st->sinc_table[i+4] = sinc
+                (st->cutoff,
+                 (i / (float)st->oversample - st->filt_len / 2),
+                 st->filt_len,
+                 quality_map[st->quality].window_func);
+	}
+
+        if (st->quality > 8)
+            st->resampler_ptr = resampler_basic_interpolate_double;
+        else
+            st->resampler_ptr = resampler_basic_interpolate_single;
+
+        /* fprintf (stderr, "resampler uses interpolated sinc table and normalised cutoff %f\n", st->cutoff); */
+
+        /* fprintf (stderr, "table length %d, filt len %d\n", st->sinc_table_length, st->filt_len); */
+    }
+
+    st->int_advance = st->num_rate / st->den_rate;
+    st->frac_advance = st->num_rate % st->den_rate;
+
+    /* Here's the place where we update the filter memory to take into
+       account the change in filter length. It's probably the messiest
+       part of the code due to handling of lots of corner cases. */
+
+    if (!st->mem) {
+
+        unsigned int i;
+        st->mem = (float*)speex_alloc
+            (st->nb_channels * (st->filt_len - 1), sizeof(float));
+
+        for (i = 0; i < st->nb_channels * (st->filt_len - 1); i++)
+            st->mem[i] = 0;
+
+        st->mem_alloc_size = st->filt_len - 1;
+
+    } else if (!st->started) {
+
+        unsigned int i;
+
+		//fprintf(stderr,"mem=%p\n",st->mem);
+		st->mem = (float*)speex_realloc
+            (st->mem, 0, st->nb_channels * (st->filt_len - 1), sizeof(float));
+
+        for (i = 0; i < st->nb_channels * (st->filt_len - 1); i++)
+            st->mem[i] = 0;
+
+        st->mem_alloc_size = st->filt_len - 1;
+
+    } else if (st->filt_len > old_length) {
+
+        int i;
+
+        /* Increase the filter length */
+
+        int old_alloc_size = st->mem_alloc_size;
+
+        if (st->filt_len - 1 > st->mem_alloc_size) {
+			
+		//fprintf(stderr,"mem=%p\n",st->mem);
+
+            st->mem = (float*)speex_realloc
+                (st->mem, st->nb_channels * (old_length - 1),
+                 st->nb_channels * (st->filt_len - 1), sizeof(float));
+            st->mem_alloc_size = st->filt_len - 1;
+        }
+
+        for (i = st->nb_channels - 1; i >= 0; i--) {
+
+            int j;
+            unsigned int olen = old_length;
+
+	    /*if (st->magic_samples[i])*/
+            {
+
+                /* Try and remove the magic samples as if nothing had happened */
+
+                /* FIXME: This is wrong but for now we need it to
+                 * avoid going over the array bounds */
+
+                olen = old_length + 2 * st->magic_samples[i];
+
+                for (j = old_length - 2 + st->magic_samples[i]; j >= 0; j--) {
+                    st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]] =
+                        st->mem[i*old_alloc_size+j];
+                }
+
+                for (j = 0; j < st->magic_samples[i]; j++) {
+                    st->mem[i*st->mem_alloc_size+j] = 0;
+                }
+
+                st->magic_samples[i] = 0;
+            }
+
+            if (st->filt_len > olen) {
+
+                /* If the new filter length is still bigger than the
+                 * "augmented" length */
+
+                /* Copy data going backward */
+
+                for (j = 0; j < olen - 1; j++) {
+                    st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] =
+                        st->mem[i*st->mem_alloc_size+(olen-2-j)];
+                }
+
+                /* Then put zeros for lack of anything better */
+                for (; j < st->filt_len - 1; j++) {
+                    st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] = 0;
+                }
+
+                /* Adjust last_sample */
+                st->last_sample[i] += (st->filt_len - olen) / 2;
+
+            } else {
+
+                /* Put back some of the magic! */
+                st->magic_samples[i] = (olen - st->filt_len) / 2;
+
+                for (j = 0; j < st->filt_len - 1 + st->magic_samples[i]; j++) {
+                    st->mem[i*st->mem_alloc_size+j] =
+                        st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]];
+                }
+            }
+        }
+    } else if (st->filt_len < old_length) {
+
+        unsigned int i;
+
+        /* Reduce filter length, this a bit tricky. We need to store
+           some of the memory as "magic" samples so they can be used
+           directly as input the next time(s) */
+
+        for (i = 0; i < st->nb_channels; i++) {
+
+            unsigned int j;
+            unsigned int old_magic = st->magic_samples[i];
+            st->magic_samples[i] = (old_length - st->filt_len) / 2;
+
+            /* We must copy some of the memory that's no longer used */
+            /* Copy data going backward */
+
+            for (j = 0; j < st->filt_len - 1 + st->magic_samples[i] + old_magic; j++) {
+                st->mem[i*st->mem_alloc_size+j] =
+                    st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]];
+	    }
+
+            st->magic_samples[i] += old_magic;
+        }
+    }
+}
+
+SpeexResamplerState *speex_resampler_init(unsigned int nb_channels, unsigned int in_rate, unsigned int out_rate, int quality, int *err)
+{
+    return speex_resampler_init_frac(nb_channels, in_rate, out_rate,
+				     in_rate, out_rate, quality, err);
+}
+
+SpeexResamplerState *speex_resampler_init_frac(unsigned int nb_channels, unsigned int ratio_num, unsigned int ratio_den, unsigned int in_rate, unsigned int out_rate, int quality, int *err)
+{
+    unsigned int i;
+    SpeexResamplerState *st;
+
+    if (quality > 10 || quality < 0) {
+        if (err) *err = RESAMPLER_ERR_INVALID_ARG;
+        return NULL;
+    }
+
+    st = (SpeexResamplerState *)speex_alloc(1, sizeof(SpeexResamplerState));
+
+    st->initialised = 0;
+    st->started = 0;
+    st->in_rate = 0;
+    st->out_rate = 0;
+    st->num_rate = 0;
+    st->den_rate = 0;
+    st->quality = -1;
+	st->sinc_table = 0;
+    st->sinc_table_length = 0;
+    st->sinc_table_alloc = 0;
+    st->mem_alloc_size = 0;
+    st->filt_len = 0;
+    st->mem = 0;
+    st->resampler_ptr = 0;
+
+    st->cutoff = 1.f;
+    st->nb_channels = nb_channels;
+    st->in_stride = 1;
+    st->out_stride = 1;
+
+    /* Per channel data */
+    st->last_sample = (int*)speex_alloc(nb_channels, sizeof(int));
+    st->magic_samples = (unsigned int*)speex_alloc(nb_channels, sizeof(int));
+    st->samp_frac_num = (unsigned int*)speex_alloc(nb_channels, sizeof(int));
+
+    for (i = 0; i < nb_channels; i++) {
+        st->last_sample[i] = 0;
+        st->magic_samples[i] = 0;
+        st->samp_frac_num[i] = 0;
+    }
+
+    speex_resampler_set_quality(st, quality);
+    speex_resampler_set_rate_frac(st, ratio_num, ratio_den, in_rate, out_rate);
+
+    update_filter(st);
+
+    st->initialised = 1;
+
+    if (err) *err = RESAMPLER_ERR_SUCCESS;
+
+    return st;
+}
+
+void speex_resampler_destroy(SpeexResamplerState *st)
+{
+    speex_free(st->mem);
+    speex_free(st->sinc_table);
+    speex_free(st->last_sample);
+    speex_free(st->magic_samples);
+    speex_free(st->samp_frac_num);
+    speex_free(st);
+}
+
+static int speex_resampler_process_native(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    int j = 0;
+    int N = st->filt_len;
+    int out_sample = 0;
+    float *mem;
+    unsigned int tmp_out_len = 0;
+
+    mem = st->mem + channel_index * st->mem_alloc_size;
+    st->started = 1;
+
+    /* Handle the case where we have samples left from a reduction in
+     * filter length */
+
+    if (st->magic_samples[channel_index]) {
+
+        int istride_save;
+        unsigned int tmp_in_len;
+        unsigned int tmp_magic;
+
+        istride_save = st->in_stride;
+        tmp_in_len = st->magic_samples[channel_index];
+        tmp_out_len = *out_len;
+
+        /* magic_samples needs to be set to zero to avoid infinite recursion */
+        tmp_magic = st->magic_samples[channel_index];
+        st->magic_samples[channel_index] = 0;
+        st->in_stride = 1;
+        speex_resampler_process_native(st, channel_index, mem + N-1,
+                                       &tmp_in_len, out, &tmp_out_len);
+        st->in_stride = istride_save;
+
+        /* If we couldn't process all "magic" input samples, save the
+         * rest for next time */
+
+        if (tmp_in_len < tmp_magic) {
+
+            unsigned int i;
+
+            st->magic_samples[channel_index] = tmp_magic - tmp_in_len;
+
+            for (i = 0; i < st->magic_samples[channel_index]; i++) {
+                mem[N-1+i] = mem[N-1+i+tmp_in_len];
+            }
+        }
+
+        out += tmp_out_len * st->out_stride;
+        *out_len -= tmp_out_len;
+    }
+
+    /* Call the right resampler through the function ptr */
+    out_sample = st->resampler_ptr(st, channel_index,
+                                   in, in_len, out, out_len);
+
+    if (st->last_sample[channel_index] < (int)*in_len) {
+        *in_len = st->last_sample[channel_index];
+    }
+
+    *out_len = out_sample + tmp_out_len;
+
+    st->last_sample[channel_index] -= *in_len;
+
+    for (j = 0; j < N-1 - (int)*in_len; j++) {
+        mem[j] = mem[j+*in_len];
+    }
+
+    if (in != NULL) {
+        for ( ; j < N-1; j++) mem[j] = in[st->in_stride*(j+*in_len-N+1)];
+    } else {
+        for ( ; j < N-1; j++) mem[j] = 0;
+    }
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+int speex_resampler_process_float(SpeexResamplerState *st, unsigned int channel_index, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    return speex_resampler_process_native(st, channel_index, in, in_len, out, out_len);
+}
+
+int speex_resampler_process_interleaved_float(SpeexResamplerState *st, const float *in, unsigned int *in_len, float *out, unsigned int *out_len)
+{
+    unsigned int i;
+    int istride_save, ostride_save;
+    unsigned int bak_len = *out_len;
+
+    istride_save = st->in_stride;
+    ostride_save = st->out_stride;
+    st->in_stride = st->out_stride = st->nb_channels;
+
+    for (i = 0; i < st->nb_channels; i++) {
+
+        *out_len = bak_len;
+
+        if (in != NULL) {
+            speex_resampler_process_float(st, i, in + i, in_len, out + i, out_len);
+        } else {
+            speex_resampler_process_float(st, i, NULL, in_len, out + i, out_len);
+        }
+    }
+
+    st->in_stride = istride_save;
+    st->out_stride = ostride_save;
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+int speex_resampler_set_rate(SpeexResamplerState *st, unsigned int in_rate, unsigned int out_rate)
+{
+    return speex_resampler_set_rate_frac(st, in_rate, out_rate, in_rate, out_rate);
+}
+
+void speex_resampler_get_rate(SpeexResamplerState *st, unsigned int *in_rate, unsigned int *out_rate)
+{
+    *in_rate = st->in_rate;
+    *out_rate = st->out_rate;
+}
+
+static unsigned int gcd(unsigned int a, unsigned int b)
+{
+    /* Euclid */
+
+    while (b) {
+        unsigned int tmp = b;
+        b = a % b;
+        a = tmp;
+    }
+
+    return a;
+}
+
+int speex_resampler_set_rate_frac(SpeexResamplerState *st, unsigned int ratio_num, unsigned int ratio_den, unsigned int in_rate, unsigned int out_rate)
+{
+    unsigned int old_den;
+    unsigned int i;
+	unsigned int g;
+
+    if (st->in_rate == in_rate && st->out_rate == out_rate &&
+	st->num_rate == ratio_num && st->den_rate == ratio_den) {
+        return RESAMPLER_ERR_SUCCESS;
+    }
+
+    old_den = st->den_rate;
+
+    st->in_rate = in_rate;
+    st->out_rate = out_rate;
+
+    st->num_rate = ratio_num;
+    st->den_rate = ratio_den;
+
+    g = gcd(st->num_rate, st->den_rate);
+
+    st->num_rate /= g;
+    st->den_rate /= g;
+
+    if (old_den > 0) {
+
+        for (i = 0; i < st->nb_channels; i++) {
+
+            st->samp_frac_num[i] = st->samp_frac_num[i] * st->den_rate / old_den;
+
+            if (st->samp_frac_num[i] >= st->den_rate) {
+                st->samp_frac_num[i] = st->den_rate - 1;
+	    }
+        }
+    }
+
+    if (st->initialised) {
+        update_filter(st);
+    }
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+void speex_resampler_get_ratio(SpeexResamplerState *st, unsigned int *ratio_num, unsigned int *ratio_den)
+{
+    *ratio_num = st->num_rate;
+    *ratio_den = st->den_rate;
+}
+
+int speex_resampler_set_quality(SpeexResamplerState *st, int quality)
+{
+    if (quality > 10 || quality < 0) {
+        return RESAMPLER_ERR_INVALID_ARG;
+    }
+
+    if (st->quality == quality) {
+        return RESAMPLER_ERR_SUCCESS;
+    }
+
+    st->quality = quality;
+
+    if (st->initialised) {
+        update_filter(st); 
+    }
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+void speex_resampler_get_quality(SpeexResamplerState *st, int *quality)
+{
+    *quality = st->quality;
+}
+
+void speex_resampler_set_input_stride(SpeexResamplerState *st, unsigned int stride)
+{
+    st->in_stride = stride;
+}
+
+void speex_resampler_get_input_stride(SpeexResamplerState *st, unsigned int *stride)
+{
+    *stride = st->in_stride;
+}
+
+void speex_resampler_set_output_stride(SpeexResamplerState *st, unsigned int stride)
+{
+    st->out_stride = stride;
+}
+
+void speex_resampler_get_output_stride(SpeexResamplerState *st, unsigned int *stride)
+{
+    *stride = st->out_stride;
+}
+
+int speex_resampler_get_input_latency(SpeexResamplerState *st)
+{
+    return st->filt_len / 2;
+}
+
+int speex_resampler_get_output_latency(SpeexResamplerState *st) 
+{
+    return ((st->filt_len / 2) * st->den_rate + (st->num_rate >> 1)) / st->num_rate;
+}
+
+int speex_resampler_skip_zeros(SpeexResamplerState *st)
+{
+    unsigned int i;
+
+    for (i = 0; i < st->nb_channels; i++) {
+        st->last_sample[i] = st->filt_len / 2;
+    }
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+int speex_resampler_reset_mem(SpeexResamplerState *st)
+{
+    unsigned int i;
+
+    for (i = 0; i < st->nb_channels*(st->filt_len - 1); i++) {
+        st->mem[i] = 0;
+    }
+
+    return RESAMPLER_ERR_SUCCESS;
+}
+
+const char *speex_resampler_strerror(int err)
+{
+    switch (err) {
+
+    case RESAMPLER_ERR_SUCCESS:
+        return "Success.";
+
+    case RESAMPLER_ERR_ALLOC_FAILED:
+        return "Memory allocation failed.";
+
+    case RESAMPLER_ERR_BAD_STATE:
+        return "Bad resampler state.";
+
+    case RESAMPLER_ERR_INVALID_ARG:
+        return "Invalid argument.";
+
+    case RESAMPLER_ERR_PTR_OVERLAP:
+        return "Input and output buffers overlap.";
+
+    default:
+        return "Unknown error. Bad error code or strange version mismatch.";
+    }
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/speex/speex_resampler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/speex/speex_resampler.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,301 @@
+/* Copyright (C) 2007 Jean-Marc Valin
+      
+   File: speex_resampler.h
+   Resampling code
+      
+   The design goals of this code are:
+      - Very fast algorithm
+      - Low memory requirement
+      - Good *perceptual* quality (and not best SNR)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SPEEX_RESAMPLER_H
+#define SPEEX_RESAMPLER_H
+
+/********* WARNING: MENTAL SANITY ENDS HERE *************/
+
+/* If the resampler is defined outside of Speex, we change the symbol
+   names so that there won't be any clash if linking with Speex later
+   on. */
+
+#define RANDOM_PREFIX rubberband
+
+#ifndef RANDOM_PREFIX
+#error "Please define RANDOM_PREFIX (above) to something specific to your project to prevent symbol name clashes"
+#endif
+
+#define CAT_PREFIX2(a,b) a ## b
+#define CAT_PREFIX(a,b) CAT_PREFIX2(a, b)
+      
+#define speex_resampler_init CAT_PREFIX(RANDOM_PREFIX,_resampler_init)
+#define speex_resampler_init_frac CAT_PREFIX(RANDOM_PREFIX,_resampler_init_frac)
+#define speex_resampler_destroy CAT_PREFIX(RANDOM_PREFIX,_resampler_destroy)
+#define speex_resampler_process_float CAT_PREFIX(RANDOM_PREFIX,_resampler_process_float)
+#define speex_resampler_process_int CAT_PREFIX(RANDOM_PREFIX,_resampler_process_int)
+#define speex_resampler_process_interleaved_float CAT_PREFIX(RANDOM_PREFIX,_resampler_process_interleaved_float)
+#define speex_resampler_process_interleaved_int CAT_PREFIX(RANDOM_PREFIX,_resampler_process_interleaved_int)
+#define speex_resampler_set_rate CAT_PREFIX(RANDOM_PREFIX,_resampler_set_rate)
+#define speex_resampler_get_rate CAT_PREFIX(RANDOM_PREFIX,_resampler_get_rate)
+#define speex_resampler_set_rate_frac CAT_PREFIX(RANDOM_PREFIX,_resampler_set_rate_frac)
+#define speex_resampler_get_ratio CAT_PREFIX(RANDOM_PREFIX,_resampler_get_ratio)
+#define speex_resampler_set_quality CAT_PREFIX(RANDOM_PREFIX,_resampler_set_quality)
+#define speex_resampler_get_quality CAT_PREFIX(RANDOM_PREFIX,_resampler_get_quality)
+#define speex_resampler_set_input_stride CAT_PREFIX(RANDOM_PREFIX,_resampler_set_input_stride)
+#define speex_resampler_get_input_stride CAT_PREFIX(RANDOM_PREFIX,_resampler_get_input_stride)
+#define speex_resampler_set_output_stride CAT_PREFIX(RANDOM_PREFIX,_resampler_set_output_stride)
+#define speex_resampler_get_output_stride CAT_PREFIX(RANDOM_PREFIX,_resampler_get_output_stride)
+#define speex_resampler_get_input_latency CAT_PREFIX(RANDOM_PREFIX,_resampler_get_input_latency)
+#define speex_resampler_get_output_latency CAT_PREFIX(RANDOM_PREFIX,_resampler_get_output_latency)
+#define speex_resampler_skip_zeros CAT_PREFIX(RANDOM_PREFIX,_resampler_skip_zeros)
+#define speex_resampler_reset_mem CAT_PREFIX(RANDOM_PREFIX,_resampler_reset_mem)
+#define speex_resampler_strerror CAT_PREFIX(RANDOM_PREFIX,_resampler_strerror)
+
+#define spx_int16_t short
+#define spx_int32_t int
+#define spx_uint16_t unsigned short
+#define spx_uint32_t unsigned int
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SPEEX_RESAMPLER_QUALITY_MAX 10
+#define SPEEX_RESAMPLER_QUALITY_MIN 0
+#define SPEEX_RESAMPLER_QUALITY_DEFAULT 4
+#define SPEEX_RESAMPLER_QUALITY_VOIP 3
+#define SPEEX_RESAMPLER_QUALITY_DESKTOP 5
+
+enum {
+   RESAMPLER_ERR_SUCCESS         = 0,
+   RESAMPLER_ERR_ALLOC_FAILED    = 1,
+   RESAMPLER_ERR_BAD_STATE       = 2,
+   RESAMPLER_ERR_INVALID_ARG     = 3,
+   RESAMPLER_ERR_PTR_OVERLAP     = 4,
+   
+   RESAMPLER_ERR_MAX_ERROR
+};
+
+struct SpeexResamplerState_;
+typedef struct SpeexResamplerState_ SpeexResamplerState;
+
+/** Create a new resampler with integer input and output rates.
+ * @param nb_channels Number of channels to be processed
+ * @param in_rate Input sampling rate (integer number of Hz).
+ * @param out_rate Output sampling rate (integer number of Hz).
+ * @param quality Resampling quality between 0 and 10, where 0 has poor quality
+ * and 10 has very high quality.
+ * @return Newly created resampler state
+ * @retval NULL Error: not enough memory
+ */
+SpeexResamplerState *speex_resampler_init(spx_uint32_t nb_channels, 
+                                          spx_uint32_t in_rate, 
+                                          spx_uint32_t out_rate, 
+                                          int quality,
+                                          int *err);
+
+/** Create a new resampler with fractional input/output rates. The sampling 
+ * rate ratio is an arbitrary rational number with both the numerator and 
+ * denominator being 32-bit integers.
+ * @param nb_channels Number of channels to be processed
+ * @param ratio_num Numerator of the sampling rate ratio
+ * @param ratio_den Denominator of the sampling rate ratio
+ * @param in_rate Input sampling rate rounded to the nearest integer (in Hz).
+ * @param out_rate Output sampling rate rounded to the nearest integer (in Hz).
+ * @param quality Resampling quality between 0 and 10, where 0 has poor quality
+ * and 10 has very high quality.
+ * @return Newly created resampler state
+ * @retval NULL Error: not enough memory
+ */
+SpeexResamplerState *speex_resampler_init_frac(spx_uint32_t nb_channels, 
+                                               spx_uint32_t ratio_num, 
+                                               spx_uint32_t ratio_den, 
+                                               spx_uint32_t in_rate, 
+                                               spx_uint32_t out_rate, 
+                                               int quality,
+                                               int *err);
+
+/** Destroy a resampler state.
+ * @param st Resampler state
+ */
+void speex_resampler_destroy(SpeexResamplerState *st);
+
+/** Resample a float array. The input and output buffers must *not* overlap.
+ * @param st Resampler state
+ * @param channel_index Index of the channel to process for the multi-channel 
+ * base (0 otherwise)
+ * @param in Input buffer
+ * @param in_len Number of input samples in the input buffer. Returns the 
+ * number of samples processed
+ * @param out Output buffer
+ * @param out_len Size of the output buffer. Returns the number of samples written
+ */
+int speex_resampler_process_float(SpeexResamplerState *st, 
+                                   spx_uint32_t channel_index, 
+                                   const float *in, 
+                                   spx_uint32_t *in_len, 
+                                   float *out, 
+                                   spx_uint32_t *out_len);
+
+/** Resample an interleaved float array. The input and output buffers must *not* overlap.
+ * @param st Resampler state
+ * @param in Input buffer
+ * @param in_len Number of input samples in the input buffer. Returns the number
+ * of samples processed. This is all per-channel.
+ * @param out Output buffer
+ * @param out_len Size of the output buffer. Returns the number of samples written.
+ * This is all per-channel.
+ */
+int speex_resampler_process_interleaved_float(SpeexResamplerState *st, 
+                                               const float *in, 
+                                               spx_uint32_t *in_len, 
+                                               float *out, 
+                                               spx_uint32_t *out_len);
+
+/** Set (change) the input/output sampling rates (integer value).
+ * @param st Resampler state
+ * @param in_rate Input sampling rate (integer number of Hz).
+ * @param out_rate Output sampling rate (integer number of Hz).
+ */
+int speex_resampler_set_rate(SpeexResamplerState *st, 
+                              spx_uint32_t in_rate, 
+                              spx_uint32_t out_rate);
+
+/** Get the current input/output sampling rates (integer value).
+ * @param st Resampler state
+ * @param in_rate Input sampling rate (integer number of Hz) copied.
+ * @param out_rate Output sampling rate (integer number of Hz) copied.
+ */
+void speex_resampler_get_rate(SpeexResamplerState *st, 
+                              spx_uint32_t *in_rate, 
+                              spx_uint32_t *out_rate);
+
+/** Set (change) the input/output sampling rates and resampling ratio 
+ * (fractional values in Hz supported).
+ * @param st Resampler state
+ * @param ratio_num Numerator of the sampling rate ratio
+ * @param ratio_den Denominator of the sampling rate ratio
+ * @param in_rate Input sampling rate rounded to the nearest integer (in Hz).
+ * @param out_rate Output sampling rate rounded to the nearest integer (in Hz).
+ */
+int speex_resampler_set_rate_frac(SpeexResamplerState *st, 
+                                   spx_uint32_t ratio_num, 
+                                   spx_uint32_t ratio_den, 
+                                   spx_uint32_t in_rate, 
+                                   spx_uint32_t out_rate);
+
+/** Get the current resampling ratio. This will be reduced to the least
+ * common denominator.
+ * @param st Resampler state
+ * @param ratio_num Numerator of the sampling rate ratio copied
+ * @param ratio_den Denominator of the sampling rate ratio copied
+ */
+void speex_resampler_get_ratio(SpeexResamplerState *st, 
+                               spx_uint32_t *ratio_num, 
+                               spx_uint32_t *ratio_den);
+
+/** Set (change) the conversion quality.
+ * @param st Resampler state
+ * @param quality Resampling quality between 0 and 10, where 0 has poor 
+ * quality and 10 has very high quality.
+ */
+int speex_resampler_set_quality(SpeexResamplerState *st, 
+                                 int quality);
+
+/** Get the conversion quality.
+ * @param st Resampler state
+ * @param quality Resampling quality between 0 and 10, where 0 has poor 
+ * quality and 10 has very high quality.
+ */
+void speex_resampler_get_quality(SpeexResamplerState *st, 
+                                 int *quality);
+
+/** Set (change) the input stride.
+ * @param st Resampler state
+ * @param stride Input stride
+ */
+void speex_resampler_set_input_stride(SpeexResamplerState *st, 
+                                      spx_uint32_t stride);
+
+/** Get the input stride.
+ * @param st Resampler state
+ * @param stride Input stride copied
+ */
+void speex_resampler_get_input_stride(SpeexResamplerState *st, 
+                                      spx_uint32_t *stride);
+
+/** Set (change) the output stride.
+ * @param st Resampler state
+ * @param stride Output stride
+ */
+void speex_resampler_set_output_stride(SpeexResamplerState *st, 
+                                      spx_uint32_t stride);
+
+/** Get the output stride.
+ * @param st Resampler state copied
+ * @param stride Output stride
+ */
+void speex_resampler_get_output_stride(SpeexResamplerState *st, 
+                                      spx_uint32_t *stride);
+
+/** Get the latency in input samples introduced by the resampler.
+ * @param st Resampler state
+ */
+int speex_resampler_get_input_latency(SpeexResamplerState *st);
+
+/** Get the latency in output samples introduced by the resampler.
+ * @param st Resampler state
+ */
+int speex_resampler_get_output_latency(SpeexResamplerState *st);
+
+/** Make sure that the first samples to go out of the resamplers don't have 
+ * leading zeros. This is only useful before starting to use a newly created 
+ * resampler. It is recommended to use that when resampling an audio file, as
+ * it will generate a file with the same length. For real-time processing,
+ * it is probably easier not to use this call (so that the output duration
+ * is the same for the first frame).
+ * @param st Resampler state
+ */
+int speex_resampler_skip_zeros(SpeexResamplerState *st);
+
+/** Reset a resampler so a new (unrelated) stream can be processed.
+ * @param st Resampler state
+ */
+int speex_resampler_reset_mem(SpeexResamplerState *st);
+
+/** Returns the English meaning for an error code
+ * @param err Error code
+ * @return English string
+ */
+const char *speex_resampler_strerror(int err);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/Allocators.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/Allocators.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,69 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "Allocators.h"
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#endif
+
+#include <iostream>
+using std::cerr;
+using std::endl;
+
+namespace RubberBand {
+
+#ifdef HAVE_IPP
+
+template <>
+float *allocate(size_t count)
+{
+    float *ptr = ippsMalloc_32f(count);
+    if (!ptr) throw (std::bad_alloc());
+    return ptr;
+}
+
+template <>
+double *allocate(size_t count)
+{
+    double *ptr = ippsMalloc_64f(count);
+    if (!ptr) throw (std::bad_alloc());
+    return ptr;
+}
+
+template <>
+void deallocate(float *ptr)
+{
+    if (ptr) ippsFree((void *)ptr);
+}
+
+template <>
+void deallocate(double *ptr)
+{
+    if (ptr) ippsFree((void *)ptr);
+}
+
+#endif
+
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/Allocators.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/Allocators.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,240 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_ALLOCATORS_H_
+#define _RUBBERBAND_ALLOCATORS_H_
+
+#include "VectorOps.h"
+
+#include <new> // for std::bad_alloc
+#include <stdlib.h>
+
+#ifndef HAVE_POSIX_MEMALIGN
+#ifndef _WIN32
+#ifndef __APPLE__
+#ifndef LACK_POSIX_MEMALIGN
+#define HAVE_POSIX_MEMALIGN
+#endif
+#endif
+#endif
+#endif
+
+#ifdef HAVE_POSIX_MEMALIGN
+#include <sys/mman.h>
+#endif
+
+#ifdef LACK_BAD_ALLOC
+namespace std { struct bad_alloc { }; }
+#endif
+
+namespace RubberBand {
+
+template <typename T>
+T *allocate(size_t count)
+{
+    void *ptr = 0;
+    // 32-byte alignment is required for at least OpenMAX
+    static const int alignment = 32;
+#ifdef USE_OWN_ALIGNED_MALLOC
+    // Alignment must be a power of two, bigger than the pointer
+    // size. Stuff the actual malloc'd pointer in just before the
+    // returned value.  This is the least desirable way to do this --
+    // the other options below are all better
+    size_t allocd = count * sizeof(T) + alignment;
+    void *buf = malloc(allocd);
+    if (buf) {
+        char *adj = (char *)buf;
+        while ((unsigned long long)adj & (alignment-1)) --adj;
+        ptr = ((char *)adj) + alignment;
+        ((void **)ptr)[-1] = buf;
+    }
+#else /* !USE_OWN_ALIGNED_MALLOC */
+#ifdef HAVE_POSIX_MEMALIGN
+    if (posix_memalign(&ptr, alignment, count * sizeof(T))) {
+        ptr = malloc(count * sizeof(T));
+    }
+#else /* !HAVE_POSIX_MEMALIGN */
+#ifdef __MSVC__
+    ptr = _aligned_malloc(count * sizeof(T), alignment);
+#else /* !__MSVC__ */
+#ifndef MALLOC_IS_ALIGNED
+#warning "No aligned malloc available or defined"
+#endif
+    // Note that malloc always aligns to 16 byte boundaries on OS/X
+    ptr = malloc(count * sizeof(T));
+#endif /* !__MSVC__ */
+#endif /* !HAVE_POSIX_MEMALIGN */
+#endif /* !USE_OWN_ALIGNED_MALLOC */
+    if (!ptr) {
+#ifndef NO_EXCEPTIONS
+        throw(std::bad_alloc());
+#else
+        abort();
+#endif
+    }
+    return (T *)ptr;
+}
+
+#ifdef HAVE_IPP
+
+template <>
+float *allocate(size_t count);
+
+template <>
+double *allocate(size_t count);
+
+#endif
+	
+template <typename T>
+T *allocate_and_zero(size_t count)
+{
+    T *ptr = allocate<T>(count);
+    v_zero(ptr, count);
+    return ptr;
+}
+
+template <typename T>
+void deallocate(T *ptr)
+{
+#ifdef USE_OWN_ALIGNED_MALLOC
+    if (ptr) free(((void **)ptr)[-1]);
+#else /* !USE_OWN_ALIGNED_MALLOC */
+#ifdef __MSVC__
+    if (ptr) _aligned_free((void *)ptr);
+#else /* !__MSVC__ */
+    if (ptr) free((void *)ptr);
+#endif /* !__MSVC__ */
+#endif /* !USE_OWN_ALIGNED_MALLOC */
+}
+
+#ifdef HAVE_IPP
+
+template <>
+void deallocate(float *);
+
+template <>
+void deallocate(double *);
+
+#endif
+
+/// Reallocate preserving contents but leaving additional memory uninitialised	
+template <typename T>
+T *reallocate(T *ptr, size_t oldcount, size_t count)
+{
+    T *newptr = allocate<T>(count);
+    if (oldcount && ptr) {
+        v_copy(newptr, ptr, oldcount < count ? oldcount : count);
+    }
+    if (ptr) deallocate<T>(ptr);
+    return newptr;
+}
+
+/// Reallocate, zeroing all contents
+template <typename T>
+T *reallocate_and_zero(T *ptr, size_t oldcount, size_t count)
+{
+    ptr = reallocate(ptr, oldcount, count);
+    v_zero(ptr, count);
+    return ptr;
+}
+	
+/// Reallocate preserving contents and zeroing any additional memory	
+template <typename T>
+T *reallocate_and_zero_extension(T *ptr, size_t oldcount, size_t count)
+{
+    ptr = reallocate(ptr, oldcount, count);
+    if (count > oldcount) v_zero(ptr + oldcount, count - oldcount);
+    return ptr;
+}
+
+template <typename T>
+T **allocate_channels(size_t channels, size_t count)
+{
+    T **ptr = allocate<T *>(channels);
+    for (size_t c = 0; c < channels; ++c) {
+        ptr[c] = allocate<T>(count);
+    }
+    return ptr;
+}
+	
+template <typename T>
+T **allocate_and_zero_channels(size_t channels, size_t count)
+{
+    T **ptr = allocate<T *>(channels);
+    for (size_t c = 0; c < channels; ++c) {
+        ptr[c] = allocate_and_zero<T>(count);
+    }
+    return ptr;
+}
+
+template <typename T>
+void deallocate_channels(T **ptr, size_t channels)
+{
+    if (!ptr) return;
+    for (size_t c = 0; c < channels; ++c) {
+        deallocate<T>(ptr[c]);
+    }
+    deallocate<T *>(ptr);
+}
+	
+template <typename T>
+T **reallocate_channels(T **ptr,
+                        size_t oldchannels, size_t oldcount,
+                        size_t channels, size_t count)
+{
+    T **newptr = allocate_channels<T>(channels, count);
+    if (oldcount && ptr) {
+        v_copy_channels(newptr, ptr, channels, oldcount < count ? oldcount : count);
+    } 
+    if (ptr) deallocate_channels<T>(ptr, channels);
+    return newptr;
+}
+	
+template <typename T>
+T **reallocate_and_zero_extend_channels(T **ptr,
+                                        size_t oldchannels, size_t oldcount,
+                                        size_t channels, size_t count)
+{
+    T **newptr = allocate_and_zero_channels<T>(channels, count);
+    if (oldcount && ptr) {
+        v_copy_channels(newptr, ptr, channels, oldcount < count ? oldcount : count);
+    } 
+    if (ptr) deallocate_channels<T>(ptr, channels);
+    return newptr;
+}
+
+/// RAII class to call deallocate() on destruction
+template <typename T>
+class Deallocator
+{
+public:
+    Deallocator(T *t) : m_t(t) { }
+    ~Deallocator() { deallocate<T>(m_t); }
+private:
+    T *m_t;
+};
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/Thread.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/Thread.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,663 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef NO_THREADING
+
+#include "Thread.h"
+
+#include <iostream>
+#include <cstdlib>
+
+#ifdef USE_PTHREADS
+#include <sys/time.h>
+#include <time.h>
+#endif
+
+using std::cerr;
+using std::endl;
+using std::string;
+
+namespace RubberBand
+{
+
+#ifdef _WIN32
+
+Thread::Thread() :
+    m_id(0),
+    m_extant(false)
+{
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Created thread object " << this << endl;
+#endif
+}
+
+Thread::~Thread()
+{
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Destroying thread object " << this << ", id " << m_id << endl;
+#endif
+    if (m_extant) {
+        WaitForSingleObject(m_id, INFINITE);
+    }
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Destroyed thread object " << this << endl;
+#endif
+}
+
+void
+Thread::start()
+{
+    m_id = CreateThread(NULL, 0, staticRun, this, 0, 0);
+    if (!m_id) {
+        cerr << "ERROR: thread creation failed" << endl;
+        exit(1);
+    } else {
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Created thread " << m_id << " for thread object " << this << endl;
+#endif
+        m_extant = true;
+    }
+}    
+
+void 
+Thread::wait()
+{
+    if (m_extant) {
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Waiting on thread " << m_id << " for thread object " << this << endl;
+#endif
+        WaitForSingleObject(m_id, INFINITE);
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Waited on thread " << m_id << " for thread object " << this << endl;
+#endif
+        m_extant = false;
+    }
+}
+
+Thread::Id
+Thread::id()
+{
+    return m_id;
+}
+
+bool
+Thread::threadingAvailable()
+{
+    return true;
+}
+
+DWORD
+Thread::staticRun(LPVOID arg)
+{
+    Thread *thread = static_cast<Thread *>(arg);
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: " << (void *)GetCurrentThreadId() << ": Running thread " << thread->m_id << " for thread object " << thread << endl;
+#endif
+    thread->run();
+    return 0;
+}
+
+Mutex::Mutex()
+#ifndef NO_THREAD_CHECKS
+    :
+    m_lockedBy(-1)
+#endif
+{
+    m_mutex = CreateMutex(NULL, FALSE, NULL);
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)GetCurrentThreadId() << ": Initialised mutex " << &m_mutex << endl;
+#endif
+}
+
+Mutex::~Mutex()
+{
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)GetCurrentThreadId() << ": Destroying mutex " << &m_mutex << endl;
+#endif
+    CloseHandle(m_mutex);
+}
+
+void
+Mutex::lock()
+{
+#ifndef NO_THREAD_CHECKS
+    DWORD tid = GetCurrentThreadId();
+    if (m_lockedBy == tid) {
+        cerr << "ERROR: Deadlock on mutex " << &m_mutex << endl;
+    }
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Want to lock mutex " << &m_mutex << endl;
+#endif
+    WaitForSingleObject(m_mutex, INFINITE);
+#ifndef NO_THREAD_CHECKS
+    m_lockedBy = tid;
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Locked mutex " << &m_mutex << endl;
+#endif
+}
+
+void
+Mutex::unlock()
+{
+#ifndef NO_THREAD_CHECKS
+    DWORD tid = GetCurrentThreadId();
+    if (m_lockedBy != tid) {
+        cerr << "ERROR: Mutex " << &m_mutex << " not owned by unlocking thread" << endl;
+        return;
+    }
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Unlocking mutex " << &m_mutex << endl;
+#endif
+#ifndef NO_THREAD_CHECKS
+    m_lockedBy = -1;
+#endif
+    ReleaseMutex(m_mutex);
+}
+
+bool
+Mutex::trylock()
+{
+#ifndef NO_THREAD_CHECKS
+    DWORD tid = GetCurrentThreadId();
+#endif
+    DWORD result = WaitForSingleObject(m_mutex, 0);
+    if (result == WAIT_TIMEOUT || result == WAIT_FAILED) {
+#ifdef DEBUG_MUTEX
+        cerr << "MUTEX DEBUG: " << (void *)tid << ": Mutex " << &m_mutex << " unavailable" << endl;
+#endif
+        return false;
+    } else {
+#ifndef NO_THREAD_CHECKS
+        m_lockedBy = tid;
+#endif
+#ifdef DEBUG_MUTEX
+        cerr << "MUTEX DEBUG: " << (void *)tid << ": Locked mutex " << &m_mutex << " (from trylock)" << endl;
+#endif
+        return true;
+    }
+}
+
+Condition::Condition(string name) :
+    m_locked(false)
+#ifdef DEBUG_CONDITION
+    , m_name(name)
+#endif
+{
+    m_mutex = CreateMutex(NULL, FALSE, NULL);
+    m_condition = CreateEvent(NULL, FALSE, FALSE, NULL);
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Initialised condition " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+}
+
+Condition::~Condition()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Destroying condition " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    if (m_locked) ReleaseMutex(m_mutex);
+    CloseHandle(m_condition);
+    CloseHandle(m_mutex);
+}
+
+void
+Condition::lock()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Want to lock " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    WaitForSingleObject(m_mutex, INFINITE);
+    m_locked = true;
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Locked " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+}
+
+void
+Condition::unlock()
+{
+    if (!m_locked) {
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Not locked " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        return;
+    }
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Unlocking " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    m_locked = false;
+    ReleaseMutex(m_mutex);
+}
+
+void 
+Condition::wait(int us)
+{
+    if (us == 0) {
+
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Waiting on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        SignalObjectAndWait(m_mutex, m_condition, INFINITE, FALSE);
+        WaitForSingleObject(m_mutex, INFINITE);
+
+    } else {
+
+        DWORD ms = us / 1000;
+        if (us > 0 && ms == 0) ms = 1;
+    
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Timed waiting on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        SignalObjectAndWait(m_mutex, m_condition, ms, FALSE);
+        WaitForSingleObject(m_mutex, INFINITE);
+    }
+
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Wait done on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    m_locked = true;
+}
+
+void
+Condition::signal()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)GetCurrentThreadId() << ": Signalling " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    SetEvent(m_condition);
+}
+
+#else /* !_WIN32 */
+
+#ifdef USE_PTHREADS
+
+Thread::Thread() :
+    m_id(0),
+    m_extant(false)
+{
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Created thread object " << this << endl;
+#endif
+}
+
+Thread::~Thread()
+{
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Destroying thread object " << this << ", id " << m_id << endl;
+#endif
+    if (m_extant) {
+        pthread_join(m_id, 0);
+    }
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: Destroyed thread object " << this << endl;
+#endif
+}
+
+void
+Thread::start()
+{
+    if (pthread_create(&m_id, 0, staticRun, this)) {
+        cerr << "ERROR: thread creation failed" << endl;
+        exit(1);
+    } else {
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Created thread " << m_id << " for thread object " << this << endl;
+#endif
+        m_extant = true;
+    }
+}    
+
+void 
+Thread::wait()
+{
+    if (m_extant) {
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Waiting on thread " << m_id << " for thread object " << this << endl;
+#endif
+        pthread_join(m_id, 0);
+#ifdef DEBUG_THREAD
+        cerr << "THREAD DEBUG: Waited on thread " << m_id << " for thread object " << this << endl;
+#endif
+        m_extant = false;
+    }
+}
+
+Thread::Id
+Thread::id()
+{
+    return m_id;
+}
+
+bool
+Thread::threadingAvailable()
+{
+    return true;
+}
+
+void *
+Thread::staticRun(void *arg)
+{
+    Thread *thread = static_cast<Thread *>(arg);
+#ifdef DEBUG_THREAD
+    cerr << "THREAD DEBUG: " << (void *)pthread_self() << ": Running thread " << thread->m_id << " for thread object " << thread << endl;
+#endif
+    thread->run();
+    return 0;
+}
+
+Mutex::Mutex()
+#ifndef NO_THREAD_CHECKS
+    :
+    m_lockedBy(0),
+    m_locked(false)
+#endif
+{
+    pthread_mutex_init(&m_mutex, 0);
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)pthread_self() << ": Initialised mutex " << &m_mutex << endl;
+#endif
+}
+
+Mutex::~Mutex()
+{
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)pthread_self() << ": Destroying mutex " << &m_mutex << endl;
+#endif
+    pthread_mutex_destroy(&m_mutex);
+}
+
+void
+Mutex::lock()
+{
+#ifndef NO_THREAD_CHECKS
+    pthread_t tid = pthread_self();
+    if (m_locked && m_lockedBy == tid) {
+        cerr << "ERROR: Deadlock on mutex " << &m_mutex << endl;
+    }
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Want to lock mutex " << &m_mutex << endl;
+#endif
+    pthread_mutex_lock(&m_mutex);
+#ifndef NO_THREAD_CHECKS
+    m_lockedBy = tid;
+    m_locked = true;
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Locked mutex " << &m_mutex << endl;
+#endif
+}
+
+void
+Mutex::unlock()
+{
+#ifndef NO_THREAD_CHECKS
+    pthread_t tid = pthread_self();
+    if (!m_locked) {
+        cerr << "ERROR: Mutex " << &m_mutex << " not locked in unlock" << endl;
+        return;
+    } else if (m_lockedBy != tid) {
+        cerr << "ERROR: Mutex " << &m_mutex << " not owned by unlocking thread" << endl;
+        return;
+    }
+#endif
+#ifdef DEBUG_MUTEX
+    cerr << "MUTEX DEBUG: " << (void *)tid << ": Unlocking mutex " << &m_mutex << endl;
+#endif
+#ifndef NO_THREAD_CHECKS
+    m_locked = false;
+#endif
+    pthread_mutex_unlock(&m_mutex);
+}
+
+bool
+Mutex::trylock()
+{
+#ifndef NO_THREAD_CHECKS
+    pthread_t tid = pthread_self();
+#endif
+    if (pthread_mutex_trylock(&m_mutex)) {
+#ifdef DEBUG_MUTEX
+        cerr << "MUTEX DEBUG: " << (void *)tid << ": Mutex " << &m_mutex << " unavailable" << endl;
+#endif
+        return false;
+    } else {
+#ifndef NO_THREAD_CHECKS
+        m_lockedBy = tid;
+        m_locked = true;
+#endif
+#ifdef DEBUG_MUTEX
+        cerr << "MUTEX DEBUG: " << (void *)tid << ": Locked mutex " << &m_mutex << " (from trylock)" << endl;
+#endif
+        return true;
+    }
+}
+
+Condition::Condition(string name) :
+    m_locked(false)
+#ifdef DEBUG_CONDITION
+    , m_name(name)
+#endif
+{
+    pthread_mutex_init(&m_mutex, 0);
+    pthread_cond_init(&m_condition, 0);
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Initialised condition " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+}
+
+Condition::~Condition()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Destroying condition " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    if (m_locked) pthread_mutex_unlock(&m_mutex);
+    pthread_cond_destroy(&m_condition);
+    pthread_mutex_destroy(&m_mutex);
+}
+
+void
+Condition::lock()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Want to lock " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    pthread_mutex_lock(&m_mutex);
+    m_locked = true;
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Locked " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+}
+
+void
+Condition::unlock()
+{
+    if (!m_locked) {
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Not locked " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        return;
+    }
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Unlocking " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    m_locked = false;
+    pthread_mutex_unlock(&m_mutex);
+}
+
+void 
+Condition::wait(int us)
+{
+    if (us == 0) {
+
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Waiting on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        pthread_cond_wait(&m_condition, &m_mutex);
+
+    } else {
+
+        struct timeval now;
+        gettimeofday(&now, 0);
+
+        now.tv_usec += us;
+        while (now.tv_usec > 1000000) {
+            now.tv_usec -= 1000000;
+            ++now.tv_sec;
+        }
+
+        struct timespec timeout;
+        timeout.tv_sec = now.tv_sec;
+        timeout.tv_nsec = now.tv_usec * 1000;
+    
+#ifdef DEBUG_CONDITION
+        cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Timed waiting on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+        pthread_cond_timedwait(&m_condition, &m_mutex, &timeout);
+    }
+
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Wait done on " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    m_locked = true;
+}
+
+void
+Condition::signal()
+{
+#ifdef DEBUG_CONDITION
+    cerr << "CONDITION DEBUG: " << (void *)pthread_self() << ": Signalling " << &m_condition << " \"" << m_name << "\"" << endl;
+#endif
+    pthread_cond_signal(&m_condition);
+}
+
+#else /* !USE_PTHREADS */
+
+Thread::Thread()
+{
+}
+
+Thread::~Thread()
+{
+}
+
+void
+Thread::start()
+{
+    abort();
+}    
+
+void 
+Thread::wait()
+{
+    abort();
+}
+
+Thread::Id
+Thread::id()
+{
+    abort();
+}
+
+bool
+Thread::threadingAvailable()
+{
+    return false;
+}
+
+Mutex::Mutex()
+{
+}
+
+Mutex::~Mutex()
+{
+}
+
+void
+Mutex::lock()
+{
+    abort();
+}
+
+void
+Mutex::unlock()
+{
+    abort();
+}
+
+bool
+Mutex::trylock()
+{
+    abort();
+}
+
+Condition::Condition(const char *)
+{
+}
+
+Condition::~Condition()
+{
+}
+
+void
+Condition::lock()
+{
+    abort();
+}
+
+void 
+Condition::wait(int us)
+{
+    abort();
+}
+
+void
+Condition::signal()
+{
+    abort();
+}
+
+#endif /* !USE_PTHREADS */
+#endif /* !_WIN32 */
+
+MutexLocker::MutexLocker(Mutex *mutex) :
+    m_mutex(mutex)
+{
+    if (m_mutex) {
+        m_mutex->lock();
+    }
+}
+
+MutexLocker::~MutexLocker()
+{
+    if (m_mutex) {
+        m_mutex->unlock();
+    }
+}
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/Thread.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/Thread.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,232 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_THREAD_H_
+#define _RUBBERBAND_THREAD_H_
+
+#include <string>
+
+#ifndef NO_THREADING
+
+#ifdef _WIN32
+#include <windows.h>
+#else /* !_WIN32 */
+#ifdef USE_PTHREADS
+#include <pthread.h>
+#else /* !USE_PTHREADS */
+#error No thread implementation selected
+#endif /* !USE_PTHREADS */
+#endif /* !_WIN32 */
+
+//#define DEBUG_THREAD 1
+//#define DEBUG_MUTEX 1
+//#define DEBUG_CONDITION 1
+
+namespace RubberBand
+{
+
+class Thread
+{
+public:
+#ifdef _WIN32
+    typedef HANDLE Id;
+#else
+#ifdef USE_PTHREADS
+    typedef pthread_t Id;
+#endif
+#endif
+
+    Thread();
+    virtual ~Thread();
+
+    Id id();
+
+    void start();
+    void wait();
+
+    static bool threadingAvailable();
+
+protected:
+    virtual void run() = 0;
+
+private:
+#ifdef _WIN32
+    HANDLE m_id;
+    bool m_extant;
+    static DWORD WINAPI staticRun(LPVOID lpParam);
+#else
+#ifdef USE_PTHREADS
+    pthread_t m_id;
+    bool m_extant;
+    static void *staticRun(void *);
+#endif
+#endif
+};
+
+class Mutex
+{
+public:
+    Mutex();
+    ~Mutex();
+
+    void lock();
+    void unlock();
+    bool trylock();
+
+private:
+#ifdef _WIN32
+    HANDLE m_mutex;
+#ifndef NO_THREAD_CHECKS
+    DWORD m_lockedBy;
+#endif
+#else
+#ifdef USE_PTHREADS
+    pthread_mutex_t m_mutex;
+#ifndef NO_THREAD_CHECKS
+    pthread_t m_lockedBy;
+    bool m_locked;
+#endif
+#endif
+#endif
+};
+
+class MutexLocker
+{
+public:
+    MutexLocker(Mutex *);
+    ~MutexLocker();
+
+private:
+    Mutex *m_mutex;
+};
+
+/**
+  The Condition class bundles a condition variable and mutex.
+
+  To wait on a condition, call lock(), test the termination condition
+  if desired, then wait().  The condition will be unlocked during the
+  wait and re-locked when wait() returns (which will happen when the
+  condition is signalled or the timer times out).
+
+  To signal a condition, call signal().  If the condition is signalled
+  between lock() and wait(), the signal may be missed by the waiting
+  thread.  To avoid this, the signalling thread should also lock the
+  condition before calling signal() and unlock it afterwards.
+*/
+
+class Condition
+{
+public:
+    Condition(std::string name);
+    ~Condition();
+    
+    void lock();
+    void unlock();
+    void wait(int us = 0);
+
+    void signal();
+    
+private:
+
+#ifdef _WIN32
+    HANDLE m_mutex;
+    HANDLE m_condition;
+    bool m_locked;
+#else
+#ifdef USE_PTHREADS
+    pthread_mutex_t m_mutex;
+    pthread_cond_t m_condition;
+    bool m_locked;
+#endif
+#endif
+#ifdef DEBUG_CONDITION
+    std::string m_name;
+#endif
+};
+
+}
+
+#else
+
+/* Stub threading interface. We do not have threading support in this code. */
+
+namespace RubberBand
+{
+
+class Thread
+{
+public:
+    typedef unsigned int Id;
+
+    Thread() { }
+    virtual ~Thread() { }
+
+    Id id() { return 0; }
+
+    void start() { } 
+    void wait() { }
+
+    static bool threadingAvailable() { return false; }
+
+protected:
+    virtual void run() = 0;
+
+private:
+};
+
+class Mutex
+{
+public:
+    Mutex() { }
+    ~Mutex() { }
+
+    void lock() { }
+    void unlock() { }
+    bool trylock() { return false; }
+};
+
+class MutexLocker
+{
+public:
+    MutexLocker(Mutex *) { }
+    ~MutexLocker() { }
+};
+
+class Condition
+{
+public:
+    Condition(std::string name) { }
+    ~Condition() { }
+    
+    void lock() { }
+    void unlock() { }
+    void wait(int us = 0) { }
+
+    void signal() { }
+};
+
+}
+
+#endif /* NO_THREADING */
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/VectorOps.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/VectorOps.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,804 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_VECTOR_OPS_H_
+#define _RUBBERBAND_VECTOR_OPS_H_
+
+#ifdef HAVE_IPP
+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+#include <ipps.h>
+#include <ippac.h>
+#endif
+
+#ifdef HAVE_VDSP
+#include <vecLib/vDSP.h>
+#include <vecLib/vForce.h>
+#endif
+
+#include <cstring>
+#include "sysutils.h"
+
+namespace RubberBand {
+
+// Note that all functions with a "target" vector have their arguments
+// in the same order as memcpy and friends, i.e. target vector first.
+// This is the reverse order from the IPP functions.
+
+// The ideal here is to write the basic loops in such a way as to be
+// auto-vectorizable by a sensible compiler (definitely gcc-4.3 on
+// Linux, ideally also gcc-4.0 on OS/X).
+
+template<typename T>
+inline void v_zero(T *const R__ ptr,
+                   const int count)
+{
+    const T value = T(0);
+    for (int i = 0; i < count; ++i) {
+        ptr[i] = value;
+    }
+}
+
+#if defined HAVE_IPP
+template<> 
+inline void v_zero(float *const R__ ptr, 
+                   const int count)
+{
+    ippsZero_32f(ptr, count);
+}
+template<> 
+inline void v_zero(double *const R__ ptr,
+                   const int count)
+{
+    ippsZero_64f(ptr, count);
+}
+#elif defined HAVE_VDSP
+template<> 
+inline void v_zero(float *const R__ ptr, 
+                   const int count)
+{
+    vDSP_vclr(ptr, 1, count);
+}
+template<> 
+inline void v_zero(double *const R__ ptr,
+                   const int count)
+{
+    vDSP_vclrD(ptr, 1, count);
+}
+#endif
+
+template<typename T>
+inline void v_zero_channels(T *const R__ *const R__ ptr,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_zero(ptr[c], count);
+    }
+}
+
+template<typename T>
+inline void v_set(T *const R__ ptr,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        ptr[i] = value;
+    }
+}
+
+template<typename T>
+inline void v_copy(T *const R__ dst,
+                   const T *const R__ src,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_copy(float *const R__ dst,
+                   const float *const R__ src,
+                   const int count)
+{
+    ippsCopy_32f(src, dst, count);
+}
+template<>
+inline void v_copy(double *const R__ dst,
+                   const double *const R__ src,
+                   const int count)
+{
+    ippsCopy_64f(src, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_copy_channels(T *const R__ *const R__ dst,
+                            const T *const R__ *const R__ src,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_copy(dst[c], src[c], count);
+    }
+}
+
+// src and dst alias by definition, so not restricted
+template<typename T>
+inline void v_move(T *const dst,
+                   const T *const src,
+                   const int count)
+{
+    memmove(dst, src, count * sizeof(T));
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_move(float *const dst,
+                   const float *const src,
+                   const int count)
+{
+    ippsMove_32f(src, dst, count);
+}
+template<>
+inline void v_move(double *const dst,
+                   const double *const src,
+                   const int count)
+{
+    ippsMove_64f(src, dst, count);
+}
+#endif
+
+template<typename T, typename U>
+inline void v_convert(U *const R__ dst,
+                      const T *const R__ src,
+                      const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = U(src[i]);
+    }
+}
+
+template<>
+inline void v_convert(float *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+template<>
+inline void v_convert(double *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_convert(double *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    ippsConvert_32f64f(src, dst, count);
+}
+template<>
+inline void v_convert(float *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    ippsConvert_64f32f(src, dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_convert(double *const R__ dst,
+                      const float *const R__ src,
+                      const int count)
+{
+    vDSP_vspdp((float *)src, 1, dst, 1, count);
+}
+template<>
+inline void v_convert(float *const R__ dst,
+                      const double *const R__ src,
+                      const int count)
+{
+    vDSP_vdpsp((double *)src, 1, dst, 1, count);
+}
+#endif
+
+template<typename T, typename U>
+inline void v_convert_channels(U *const R__ *const R__ dst,
+                               const T *const R__ *const R__ src,
+                               const int channels,
+                               const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_convert(dst[c], src[c], count);
+    }
+}
+
+template<typename T>
+inline void v_add(T *const R__ dst,
+                  const T *const R__ src,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i];
+    }
+}
+
+template<typename T>
+inline void v_add(T *const R__ dst,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += value;
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_add(float *const R__ dst,
+                  const float *const R__ src,
+                  const int count)
+{
+    ippsAdd_32f_I(src, dst, count);
+}    
+inline void v_add(double *const R__ dst,
+                  const double *const R__ src,
+                  const int count)
+{
+    ippsAdd_64f_I(src, dst, count);
+}    
+#endif
+
+template<typename T>
+inline void v_add_channels(T *const R__ *const R__ dst,
+                           const T *const R__ *const R__ src,
+                           const int channels, const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add(dst[c], src[c], count);
+    }
+}
+
+template<typename T, typename G>
+inline void v_add_with_gain(T *const R__ dst,
+                            const T *const R__ src,
+                            const int count,
+                            const G gain)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i] * gain;
+    }
+}
+
+template<typename T, typename G>
+inline void v_add_channels_with_gain(T *const R__ *const R__ dst,
+                                     const T *const R__ *const R__ src,
+                                     const int channels,
+                                     const int count,
+                                     const G gain)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add_with_gain(dst[c], src[c], count, gain);
+    }
+}
+
+template<typename T>
+inline void v_subtract(T *const R__ dst,
+                       const T *const R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] -= src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_subtract(float *const R__ dst,
+                       const float *const R__ src,
+                       const int count)
+{
+    ippsSub_32f_I(src, dst, count);
+}    
+inline void v_subtract(double *const R__ dst,
+                       const double *const R__ src,
+                       const int count)
+{
+    ippsSub_64f_I(src, dst, count);
+}    
+#endif
+
+template<typename T, typename G>
+inline void v_scale(T *const R__ dst,
+                    const G gain,
+                    const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] *= gain;
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_scale(float *const R__ dst,
+                    const float gain,
+                    const int count)
+{
+    ippsMulC_32f_I(gain, dst, count);
+}
+template<>
+inline void v_scale(double *const R__ dst,
+                    const double gain,
+                    const int count)
+{
+    ippsMulC_64f_I(gain, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_multiply(T *const R__ dst,
+                       const T *const R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] *= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const R__ dst,
+                       const float *const R__ src,
+                       const int count)
+{
+    ippsMul_32f_I(src, dst, count);
+}
+template<>
+inline void v_multiply(double *const R__ dst,
+                       const double *const R__ src,
+                       const int count)
+{
+    ippsMul_64f_I(src, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_multiply(T *const R__ dst,
+                       const T *const R__ src1,
+                       const T *const R__ src2,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src1[i] * src2[i];
+    }
+}
+
+template<typename T>
+inline void v_divide(T *const R__ dst,
+                     const T *const R__ src,
+                     const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] /= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_divide(float *const R__ dst,
+                     const float *const R__ src,
+                     const int count)
+{
+    ippsDiv_32f_I(src, dst, count);
+}
+template<>
+inline void v_divide(double *const R__ dst,
+                     const double *const R__ src,
+                     const int count)
+{
+    ippsDiv_64f_I(src, dst, count);
+}
+#endif
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const R__ dst,
+                       const float *const R__ src1,
+                       const float *const R__ src2,
+                       const int count)
+{
+    ippsMul_32f(src1, src2, dst, count);
+}    
+template<>
+inline void v_multiply(double *const R__ dst,
+                       const double *const R__ src1,
+                       const double *const R__ src2,
+                       const int count)
+{
+    ippsMul_64f(src1, src2, dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_multiply_and_add(T *const R__ dst,
+                               const T *const R__ src1,
+                               const T *const R__ src2,
+                               const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src1[i] * src2[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_multiply_and_add(float *const R__ dst,
+                               const float *const R__ src1,
+                               const float *const R__ src2,
+                               const int count)
+{
+    ippsAddProduct_32f(src1, src2, dst, count);
+}
+template<>
+inline void v_multiply_and_add(double *const R__ dst,
+                               const double *const R__ src1,
+                               const double *const R__ src2,
+                               const int count)
+{
+    ippsAddProduct_64f(src1, src2, dst, count);
+}
+#endif
+
+template<typename T>
+inline T v_sum(const T *const R__ src,
+               const int count)
+{
+    T result = T();
+    for (int i = 0; i < count; ++i) {
+        result += src[i];
+    }
+    return result;
+}
+
+template<typename T>
+inline void v_log(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = log(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_log(float *const R__ dst,
+                  const int count)
+{
+    ippsLn_32f_I(dst, count);
+}
+template<>
+inline void v_log(double *const R__ dst,
+                  const int count)
+{
+    ippsLn_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_log(float *const R__ dst,
+                  const int count)
+{
+    float tmp[count];
+    vvlogf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_log(double *const R__ dst,
+                  const int count)
+{
+    double tmp[count];
+    vvlog(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_exp(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = exp(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_exp(float *const R__ dst,
+                  const int count)
+{
+    ippsExp_32f_I(dst, count);
+}
+template<>
+inline void v_exp(double *const R__ dst,
+                  const int count)
+{
+    ippsExp_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_exp(float *const R__ dst,
+                  const int count)
+{
+    float tmp[count];
+    vvexpf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_exp(double *const R__ dst,
+                  const int count)
+{
+    double tmp[count];
+    vvexp(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_sqrt(T *const R__ dst,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = sqrt(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_sqrt(float *const R__ dst,
+                   const int count)
+{
+    ippsSqrt_32f_I(dst, count);
+}
+template<>
+inline void v_sqrt(double *const R__ dst,
+                   const int count)
+{
+    ippsSqrt_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_sqrt(float *const R__ dst,
+                   const int count)
+{
+    float tmp[count];
+    vvsqrtf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_sqrt(double *const R__ dst,
+                   const int count)
+{
+    double tmp[count];
+    vvsqrt(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_square(T *const R__ dst,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = dst[i] * dst[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_square(float *const R__ dst,
+                   const int count)
+{
+    ippsSqr_32f_I(dst, count);
+}
+template<>
+inline void v_square(double *const R__ dst,
+                   const int count)
+{
+    ippsSqr_64f_I(dst, count);
+}
+#endif
+
+template<typename T>
+inline void v_abs(T *const R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = fabs(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_abs(float *const R__ dst,
+                  const int count)
+{
+    ippsAbs_32f_I(dst, count);
+}
+template<>
+inline void v_abs(double *const R__ dst,
+                  const int count)
+{
+    ippsAbs_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_abs(float *const R__ dst,
+                  const int count)
+{
+    float tmp[count];
+#if (MACOSX_DEPLOYMENT_TARGET <= 1070 && MAC_OS_X_VERSION_MIN_REQUIRED <= 1070)
+    vvfabf(tmp, dst, &count);
+#else
+    vvfabsf(tmp, dst, &count);
+#endif
+    v_copy(dst, tmp, count);
+}
+#endif
+
+template<typename T>
+inline void v_interleave(T *const R__ dst,
+                         const T *const R__ *const R__ src,
+                         const int channels, 
+                         const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst, src[0], count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_interleave(float *const R__ dst,
+                         const float *const R__ *const R__ src,
+                         const int channels, 
+                         const int count)
+{
+    ippsInterleave_32f((const Ipp32f **)src, channels, count, dst);
+}
+// IPP does not (currently?) provide double-precision interleave
+#endif
+
+template<typename T>
+inline void v_deinterleave(T *const R__ *const R__ dst,
+                           const T *const R__ src,
+                           const int channels, 
+                           const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst[0], src, count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_deinterleave(float *const R__ *const R__ dst,
+                           const float *const R__ src,
+                           const int channels, 
+                           const int count)
+{
+    ippsDeinterleave_32f((const Ipp32f *)src, channels, count, (Ipp32f **)dst);
+}
+// IPP does not (currently?) provide double-precision deinterleave
+#endif
+
+template<typename T>
+inline void v_fftshift(T *const R__ ptr,
+                       const int count)
+{
+    const int hs = count/2;
+    for (int i = 0; i < hs; ++i) {
+        T t = ptr[i];
+        ptr[i] = ptr[i + hs];
+        ptr[i + hs] = t;
+    }
+}
+
+template<typename T>
+inline T v_mean(const T *const R__ ptr, const int count)
+{
+    T t = T(0);
+    for (int i = 0; i < count; ++i) {
+        t += ptr[i];
+    }
+    t /= T(count);
+    return t;
+}
+
+template<typename T>
+inline T v_mean_channels(const T *const R__ *const R__ ptr,
+                         const int channels,
+                         const int count)
+{
+    T t = T(0);
+    for (int c = 0; c < channels; ++c) {
+        t += v_mean(ptr[c], count);
+    }
+    t /= T(channels);
+    return t;
+}
+
+}
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/VectorOpsComplex.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/VectorOpsComplex.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,198 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "VectorOpsComplex.h"
+
+#include "system/sysutils.h"
+
+#include <cassert>
+
+#if defined USE_POMMIER_MATHFUN
+#if defined __ARMEL__
+#include "pommier/neon_mathfun.h"
+#else
+#include "pommier/sse_mathfun.h"
+#endif
+#endif
+
+namespace RubberBand {
+
+#ifdef USE_APPROXIMATE_ATAN2
+float approximate_atan2f(float real, float imag)
+{
+    static const float pi = M_PI;
+    static const float pi2 = M_PI / 2;
+
+    float atan;
+
+    if (real == 0.f) {
+
+        if (imag > 0.0f) atan = pi2;
+        else if (imag == 0.0f) atan = 0.0f;
+        else atan = -pi2;
+
+    } else {
+
+        float z = imag/real;
+
+        if (fabsf(z) < 1.f) {
+            atan = z / (1.f + 0.28f * z * z);
+            if (real < 0.f) {
+                if (imag < 0.f) atan -= pi;
+                else atan += pi;
+            }
+        } else {
+            atan = pi2 - z / (z * z + 0.28f);
+            if (imag < 0.f) atan -= pi;
+        }
+    }
+}
+#endif
+
+#if defined USE_POMMIER_MATHFUN
+
+#ifdef __ARMEL__
+typedef union {
+  float f[4];
+  int i[4];
+  v4sf  v;
+} V4SF;
+#else
+typedef ALIGN16_BEG union {
+  float f[4];
+  int i[4];
+  v4sf  v;
+} ALIGN16_END V4SF;
+#endif
+
+void
+v_polar_to_cartesian_pommier(float *const R__ real,
+                             float *const R__ imag,
+                             const float *const R__ mag,
+                             const float *const R__ phase,
+                             const int count)
+{
+    int idx = 0, tidx = 0;
+    int i = 0;
+
+    for (int i = 0; i + 4 < count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 3; ++j) {
+            fmag.f[j] = mag[idx];
+            fphase.f[j] = phase[idx++];
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 3; ++j) {
+            real[tidx] = fre.f[j] * fmag.f[j];
+            imag[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float re, im;
+        c_phasor(&re, &im, phase[i]);
+        real[tidx] = re * mag[i];
+        imag[tidx++] = im * mag[i];
+        ++i;
+    }
+}    
+
+void
+v_polar_interleaved_to_cartesian_inplace_pommier(float *const R__ srcdst,
+                                                 const int count)
+{
+    int i;
+    int idx = 0, tidx = 0;
+
+    for (i = 0; i + 4 < count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 3; ++j) {
+            fmag.f[j] = srcdst[idx++];
+            fphase.f[j] = srcdst[idx++];
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 3; ++j) {
+            srcdst[tidx++] = fre.f[j] * fmag.f[j];
+            srcdst[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float real, imag;
+        float mag = srcdst[idx++];
+        float phase = srcdst[idx++];
+        c_phasor(&real, &imag, phase);
+        srcdst[tidx++] = real * mag;
+        srcdst[tidx++] = imag * mag;
+        ++i;
+    }
+}    
+
+void
+v_polar_to_cartesian_interleaved_pommier(float *const R__ dst,
+                                         const float *const R__ mag,
+                                         const float *const R__ phase,
+                                         const int count)
+{
+    int i;
+    int idx = 0, tidx = 0;
+
+    for (i = 0; i + 4 <= count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 3; ++j) {
+            fmag.f[j] = mag[idx];
+            fphase.f[j] = phase[idx];
+            ++idx;
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 3; ++j) {
+            dst[tidx++] = fre.f[j] * fmag.f[j];
+            dst[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float real, imag;
+        c_phasor(&real, &imag, phase[i]);
+        dst[tidx++] = real * mag[i];
+        dst[tidx++] = imag * mag[i];
+        ++i;
+    }
+}    
+
+#endif
+
+
+}
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/VectorOpsComplex.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/VectorOpsComplex.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,252 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_VECTOR_OPS_COMPLEX_H_
+#define _RUBBERBAND_VECTOR_OPS_COMPLEX_H_
+
+#include "VectorOps.h"
+
+
+namespace RubberBand {
+
+
+template<typename T>
+inline void c_phasor(T *real, T *imag, T phase)
+{
+    //!!! IPP contains ippsSinCos_xxx in ippvm.h -- these are
+    //!!! fixed-accuracy, test and compare
+#if defined HAVE_VDSP
+    int one = 1;
+    if (sizeof(T) == sizeof(float)) {
+        vvsincosf((float *)imag, (float *)real, (const float *)&phase, &one);
+    } else {
+        vvsincos((double *)imag, (double *)real, (const double *)&phase, &one);
+    }
+#elif defined LACK_SINCOS
+    if (sizeof(T) == sizeof(float)) {
+        *real = cosf(phase);
+        *imag = sinf(phase);
+    } else {
+        *real = cos(phase);
+        *imag = sin(phase);
+    }
+#elif defined __GNUC__
+    if (sizeof(T) == sizeof(float)) {
+        sincosf(phase, (float *)imag, (float *)real);
+    } else {
+        sincos(phase, (double *)imag, (double *)real);
+    }
+#else
+    if (sizeof(T) == sizeof(float)) {
+        *real = cosf(phase);
+        *imag = sinf(phase);
+    } else {
+        *real = cos(phase);
+        *imag = sin(phase);
+    }
+#endif
+}
+
+template<typename T>
+inline void c_magphase(T *mag, T *phase, T real, T imag)
+{
+    *mag = sqrt(real * real + imag * imag);
+    *phase = atan2(imag, real);
+}
+
+#ifdef USE_APPROXIMATE_ATAN2
+// NB arguments in opposite order from usual for atan2f
+extern float approximate_atan2f(float real, float imag);
+template<>
+inline void c_magphase(float *mag, float *phase, float real, float imag)
+{
+    float atan = approximate_atan2f(real, imag);
+    *phase = atan;
+    *mag = sqrtf(real * real + imag * imag);
+}
+#else
+template<>
+inline void c_magphase(float *mag, float *phase, float real, float imag)
+{
+    *mag = sqrtf(real * real + imag * imag);
+    *phase = atan2f(imag, real);
+}
+#endif
+
+
+template<typename S, typename T> // S source, T target
+void v_polar_to_cartesian(T *const R__ real,
+                          T *const R__ imag,
+                          const S *const R__ mag,
+                          const S *const R__ phase,
+                          const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_phasor<T>(real + i, imag + i, phase[i]);
+    }
+    v_multiply(real, mag, count);
+    v_multiply(imag, mag, count);
+}
+
+template<typename T>
+void v_polar_interleaved_to_cartesian_inplace(T *const R__ srcdst,
+                                              const int count)
+{
+    T real, imag;
+    for (int i = 0; i < count*2; i += 2) {
+        c_phasor(&real, &imag, srcdst[i+1]);
+        real *= srcdst[i];
+        imag *= srcdst[i];
+        srcdst[i] = real;
+        srcdst[i+1] = imag;
+    }
+}
+
+template<typename S, typename T> // S source, T target
+void v_polar_to_cartesian_interleaved(T *const R__ dst,
+                                      const S *const R__ mag,
+                                      const S *const R__ phase,
+                                      const int count)
+{
+    T real, imag;
+    for (int i = 0; i < count; ++i) {
+        c_phasor<T>(&real, &imag, phase[i]);
+        real *= mag[i];
+        imag *= mag[i];
+        dst[i*2] = real;
+        dst[i*2+1] = imag;
+    }
+}    
+
+#if defined USE_POMMIER_MATHFUN
+void v_polar_to_cartesian_pommier(float *const R__ real,
+                                  float *const R__ imag,
+                                  const float *const R__ mag,
+                                  const float *const R__ phase,
+                                  const int count);
+void v_polar_interleaved_to_cartesian_inplace_pommier(float *const R__ srcdst,
+                                                      const int count);
+void v_polar_to_cartesian_interleaved_pommier(float *const R__ dst,
+                                              const float *const R__ mag,
+                                              const float *const R__ phase,
+                                              const int count);
+
+template<>
+inline void v_polar_to_cartesian(float *const R__ real,
+                                 float *const R__ imag,
+                                 const float *const R__ mag,
+                                 const float *const R__ phase,
+                                 const int count)
+{
+    v_polar_to_cartesian_pommier(real, imag, mag, phase, count);
+}
+
+template<>
+inline void v_polar_interleaved_to_cartesian_inplace(float *const R__ srcdst,
+                                                     const int count)
+{
+    v_polar_interleaved_to_cartesian_inplace_pommier(srcdst, count);
+}
+
+template<>
+inline void v_polar_to_cartesian_interleaved(float *const R__ dst,
+                                             const float *const R__ mag,
+                                             const float *const R__ phase,
+                                             const int count)
+{
+    v_polar_to_cartesian_interleaved_pommier(dst, mag, phase, count);
+}
+
+#endif
+
+template<typename S, typename T> // S source, T target
+void v_cartesian_to_polar(T *const R__ mag,
+                          T *const R__ phase,
+                          const S *const R__ real,
+                          const S *const R__ imag,
+                          const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<T>(mag + i, phase + i, real[i], imag[i]);
+    }
+}
+
+template<typename S, typename T> // S source, T target
+void v_cartesian_interleaved_to_polar(T *const R__ mag,
+                                      T *const R__ phase,
+                                      const S *const R__ src,
+                                      const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<T>(mag + i, phase + i, src[i*2], src[i*2+1]);
+    }
+}
+
+#ifdef HAVE_VDSP
+template<>
+inline void v_cartesian_to_polar(float *const R__ mag,
+                                 float *const R__ phase,
+                                 const float *const R__ real,
+                                 const float *const R__ imag,
+                                 const int count)
+{
+    DSPSplitComplex c;
+    c.realp = const_cast<float *>(real);
+    c.imagp = const_cast<float *>(imag);
+    vDSP_zvmags(&c, 1, phase, 1, count); // using phase as a temporary dest
+    vvsqrtf(mag, phase, &count); // using phase as the source
+    vvatan2f(phase, imag, real, &count);
+}
+template<>
+inline void v_cartesian_to_polar(double *const R__ mag,
+                                 double *const R__ phase,
+                                 const double *const R__ real,
+                                 const double *const R__ imag,
+                                 const int count)
+{
+    // double precision, this is significantly faster than using vDSP_polar
+    DSPDoubleSplitComplex c;
+    c.realp = const_cast<double *>(real);
+    c.imagp = const_cast<double *>(imag);
+    vDSP_zvmagsD(&c, 1, phase, 1, count); // using phase as a temporary dest
+    vvsqrt(mag, phase, &count); // using phase as the source
+    vvatan2(phase, imag, real, &count);
+}
+#endif
+
+template<typename T>
+void v_cartesian_to_polar_interleaved_inplace(T *const R__ srcdst,
+                                              const int count)
+{
+    T mag, phase;
+    for (int i = 0; i < count * 2; i += 2) {
+        c_magphase(&mag, &phase, srcdst[i], srcdst[i+1]);
+        srcdst[i] = mag;
+        srcdst[i+1] = phase;
+    }
+}
+
+}
+
+#endif
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/sysutils.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/sysutils.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,294 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "sysutils.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <io.h>
+#else /* !_WIN32 */
+#include <signal.h>
+#include <unistd.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#else /* !__APPLE__, !_WIN32 */
+#include <stdio.h>
+#include <string.h>
+#endif /* !__APPLE__, !_WIN32 */
+#endif /* !_WIN32 */
+
+#ifdef __sun
+#include <sys/processor.h>
+#endif
+
+#include <cstdlib>
+#include <iostream>
+
+#ifdef HAVE_IPP
+#include <ipp.h> // for static init
+#endif
+
+#ifdef HAVE_VDSP
+#include <vecLib/vDSP.h>
+#include <fenv.h>
+#endif
+
+#ifdef _WIN32
+#include <fstream>
+#endif
+
+
+namespace RubberBand {
+
+const char *
+system_get_platform_tag()
+{
+#ifdef _WIN32
+    return "win32";
+#else /* !_WIN32 */
+#ifdef __APPLE__
+    return "osx";
+#else /* !__APPLE__ */
+#ifdef __LINUX__
+    if (sizeof(long) == 8) {
+        return "linux64";
+    } else {
+        return "linux";
+    }
+#else /* !__LINUX__ */
+    return "posix";
+#endif /* !__LINUX__ */
+#endif /* !__APPLE__ */
+#endif /* !_WIN32 */
+}
+
+bool
+system_is_multiprocessor()
+{
+    static bool tested = false, mp = false;
+
+    if (tested) return mp;
+    int count = 0;
+
+#ifdef _WIN32
+
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    count = sysinfo.dwNumberOfProcessors;
+
+#else /* !_WIN32 */
+#ifdef __APPLE__
+    
+    size_t sz = sizeof(count);
+    if (sysctlbyname("hw.ncpu", &count, &sz, NULL, 0)) {
+        count = 0;
+        mp = false;
+    } else {
+        mp = (count > 1);
+    }
+
+#else /* !__APPLE__, !_WIN32 */
+#ifdef __sun
+
+    processorid_t i, n;
+    n = sysconf(_SC_CPUID_MAX);
+    for (i = 0; i <= n; ++i) {
+        int status = p_online(i, P_STATUS);
+        if (status == P_ONLINE) {
+            ++count;
+        }
+        if (count > 1) break;
+    }
+
+#else /* !__sun, !__APPLE__, !_WIN32 */
+
+    //...
+
+    FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (!cpuinfo) return false;
+
+    char buf[256];
+    while (!feof(cpuinfo)) {
+        if (!fgets(buf, 256, cpuinfo)) break;
+        if (!strncmp(buf, "processor", 9)) {
+            ++count;
+        }
+        if (count > 1) break;
+    }
+
+    fclose(cpuinfo);
+
+#endif /* !__sun, !__APPLE__, !_WIN32 */
+#endif /* !__APPLE__, !_WIN32 */
+#endif /* !_WIN32 */
+
+    mp = (count > 1);
+    tested = true;
+    return mp;
+}
+
+#ifdef _WIN32
+
+void gettimeofday(struct timeval *tv, void *tz)
+{
+    union { 
+	long long ns100;  
+	FILETIME ft; 
+    } now; 
+    
+    ::GetSystemTimeAsFileTime(&now.ft); 
+    tv->tv_usec = (long)((now.ns100 / 10LL) % 1000000LL); 
+    tv->tv_sec = (long)((now.ns100 - 116444736000000000LL) / 10000000LL); 
+}
+
+void clock_gettime(int, struct timespec *ts)
+{
+    static LARGE_INTEGER cps;
+    static bool haveCps = false;
+    
+    if (!haveCps) {
+        QueryPerformanceFrequency(&cps);
+        haveCps = true;
+    }
+
+    LARGE_INTEGER counter;
+    QueryPerformanceCounter(&counter);
+
+    //!!! check this
+    ts->tv_sec = counter.QuadPart / cps.QuadPart;
+    double sub = counter.QuadPart % cps.QuadPart;
+    sub = sub / cps.QuadPart;
+    sub = sub * 1000000000.;
+    ts->tv_nsec = long(sub) ;
+}
+
+void usleep(unsigned long usec)
+{
+    ::Sleep(usec == 0 ? 0 : usec < 1000 ? 1 : usec / 1000);
+}
+
+#endif
+
+#ifdef __APPLE__
+
+void clock_gettime(int, struct timespec *ts)
+{
+    uint64_t t = mach_absolute_time();
+    static mach_timebase_info_data_t sTimebaseInfo;
+    if (sTimebaseInfo.denom == 0) (void)mach_timebase_info(&sTimebaseInfo);
+    uint64_t n = t * sTimebaseInfo.numer / sTimebaseInfo.denom;
+    ts->tv_sec = n / 1000000000;
+    ts->tv_nsec = n % 1000000000;
+}
+
+#endif
+
+void system_specific_initialise()
+{
+#if defined HAVE_IPP
+#ifndef USE_IPP_DYNAMIC_LIBS
+//    std::cerr << "Calling ippStaticInit" << std::endl;
+    ippStaticInit();
+#endif
+    ippSetDenormAreZeros(1);
+#elif defined HAVE_VDSP
+#if defined __i386__ || defined __x86_64__ 
+    fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV);
+#endif
+#endif
+#if defined __ARMEL__
+    static const unsigned int x = 0x04086060;
+    static const unsigned int y = 0x03000000;
+    int r;
+    asm volatile (
+        "fmrx	%0, fpscr   \n\t"
+        "and	%0, %0, %1  \n\t"
+        "orr	%0, %0, %2  \n\t"
+        "fmxr	fpscr, %0   \n\t"
+        : "=r"(r)
+        : "r"(x), "r"(y)
+	);
+#endif
+}
+
+void system_specific_application_initialise()
+{
+}
+
+
+ProcessStatus
+system_get_process_status(int pid)
+{
+#ifdef _WIN32
+    HANDLE handle = OpenProcess(PROCESS_QUERY_INFORMATION, FALSE, pid);
+    if (!handle) {
+        return ProcessNotRunning;
+    } else {
+        CloseHandle(handle);
+        return ProcessRunning;
+    }
+#else
+    if (kill(getpid(), 0) == 0) {
+        if (kill(pid, 0) == 0) {
+            return ProcessRunning;
+        } else {
+            return ProcessNotRunning;
+        }
+    } else {
+        return UnknownProcessStatus;
+    }
+#endif
+}
+
+#ifdef _WIN32
+void system_memorybarrier()
+{
+#ifdef __MSVC__
+    MemoryBarrier();
+#else /* (mingw) */
+    LONG Barrier = 0;
+    __asm__ __volatile__("xchgl %%eax,%0 "
+                         : "=r" (Barrier));
+#endif
+}
+#else /* !_WIN32 */
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+// Not required
+#else
+#include <pthread.h>
+void system_memorybarrier()
+{
+    pthread_mutex_t dummy = PTHREAD_MUTEX_INITIALIZER;
+    pthread_mutex_lock(&dummy);
+    pthread_mutex_unlock(&dummy);
+}
+#endif
+#endif
+
+}
+
+
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/src/system/sysutils.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/src/system/sysutils.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,168 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_SYSUTILS_H_
+#define _RUBBERBAND_SYSUTILS_H_
+
+#ifdef __MSVC__
+#include "float_cast/float_cast.h"
+#define R__ __restrict
+#endif
+
+#ifdef __GNUC__
+#define R__ __restrict__
+#endif
+
+#ifndef R__
+#define R__
+#endif
+
+#ifdef __MINGW32__
+#include <malloc.h>
+#else
+#ifndef __MSVC__
+#include <alloca.h>
+#endif
+#endif
+
+#ifdef __MSVC__
+#include <malloc.h>
+#include <process.h>
+#define alloca _alloca
+#define getpid _getpid
+#endif
+
+#ifdef __MSVC__
+#define uint8_t unsigned __int8
+#define uint16_t unsigned __int16
+#define uint32_t unsigned __int32
+#define ssize_t long
+#else
+#include <stdint.h>
+#endif
+
+#include <math.h>
+
+namespace RubberBand {
+
+extern const char *system_get_platform_tag();
+extern bool system_is_multiprocessor();
+extern void system_specific_initialise();
+extern void system_specific_application_initialise();
+
+enum ProcessStatus { ProcessRunning, ProcessNotRunning, UnknownProcessStatus };
+extern ProcessStatus system_get_process_status(int pid);
+
+#ifdef __APPLE__
+struct timespec { long tv_sec; long tv_nsec; };
+void clock_gettime(int clk_id, struct timespec *p);
+#define CLOCK_MONOTONIC 1
+#define CLOCK_REALTIME 2
+#endif
+
+#ifdef _WIN32
+
+struct timeval { long tv_sec; long tv_usec; };
+void gettimeofday(struct timeval *p, void *tz);
+
+struct timespec { long tv_sec; long tv_nsec; };
+// always uses GetPerformanceCounter, does not check whether it's valid or not:
+void clock_gettime(int clk_id, struct timespec *p);
+#define CLOCK_MONOTONIC 1
+#define CLOCK_REALTIME 2
+
+#endif
+
+#ifdef __MSVC__
+
+void usleep(unsigned long);
+
+#endif
+
+inline double mod(double x, double y) { return x - (y * floor(x / y)); }
+inline float modf(float x, float y) { return x - (y * float(floor(x / y))); }
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+inline double princarg(double a) { return mod(a + M_PI, -2.0 * M_PI) + M_PI; }
+inline float princargf(float a) { return modf(a + (float)M_PI, -2.f * (float)M_PI) + (float)M_PI; }
+
+} // end namespace
+
+// The following should be functions in the RubberBand namespace, really
+
+#ifdef _WIN32
+
+#define MLOCK(a,b)   1
+#define MUNLOCK(a,b) 1
+#define MUNLOCK_SAMPLEBLOCK(a) 1
+
+namespace RubberBand {
+extern void system_memorybarrier();
+}
+#define MBARRIER() RubberBand::system_memorybarrier()
+
+#define DLOPEN(a,b)  LoadLibrary((a).toStdWString().c_str())
+#define DLSYM(a,b)   GetProcAddress((HINSTANCE)(a),(b))
+#define DLCLOSE(a)   FreeLibrary((HINSTANCE)(a))
+#define DLERROR()    ""
+
+#else
+
+#include <sys/mman.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+#define MLOCK(a,b)   ::mlock((char *)(a),(b))
+#define MUNLOCK(a,b) (::munlock((char *)(a),(b)) ? (::perror("munlock failed"), 0) : 0)
+#define MUNLOCK_SAMPLEBLOCK(a) do { if (!(a).empty()) { const float &b = *(a).begin(); MUNLOCK(&b, (a).capacity() * sizeof(float)); } } while(0);
+
+#ifdef __APPLE__
+#include <libkern/OSAtomic.h>
+#define MBARRIER() OSMemoryBarrier()
+#else
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#define MBARRIER() __sync_synchronize()
+#else
+namespace RubberBand {
+extern void system_memorybarrier();
+}
+#define MBARRIER() ::RubberBand::system_memorybarrier()
+#endif
+#endif
+
+#define DLOPEN(a,b)  dlopen((a).toStdString().c_str(),(b))
+#define DLSYM(a,b)   dlsym((a),(b))
+#define DLCLOSE(a)   dlclose((a))
+#define DLERROR()    dlerror()
+
+#endif
+
+#ifdef NO_THREADING
+#undef MBARRIER
+#define MBARRIER() 
+#endif
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/vamp/RubberBandVampPlugin.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/vamp/RubberBandVampPlugin.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,654 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include "RubberBandVampPlugin.h"
+
+#include "StretchCalculator.h"
+#include "system/sysutils.h"
+
+#include <cmath>
+#include <cstdio>
+
+using std::string;
+using std::vector;
+using std::cerr;
+using std::endl;
+
+class RubberBandVampPlugin::Impl
+{
+public:
+    size_t m_stepSize;
+    size_t m_blockSize;
+    size_t m_sampleRate;
+
+    float m_timeRatio;
+    float m_pitchRatio;
+
+    bool m_realtime;
+    bool m_elasticTiming;
+    int m_transientMode;
+    bool m_phaseIndependent;
+    int m_windowLength;
+
+    RubberBand::RubberBandStretcher *m_stretcher;
+
+    int m_incrementsOutput;
+    int m_aggregateIncrementsOutput;
+    int m_divergenceOutput;
+    int m_phaseResetDfOutput;
+    int m_smoothedPhaseResetDfOutput;
+    int m_phaseResetPointsOutput;
+    int m_timeSyncPointsOutput;
+
+    size_t m_counter;
+    size_t m_accumulatedIncrement;
+
+    float **m_outputDump;
+
+    FeatureSet processOffline(const float *const *inputBuffers,
+                              Vamp::RealTime timestamp);
+
+    FeatureSet getRemainingFeaturesOffline();
+
+    FeatureSet processRealTime(const float *const *inputBuffers,
+                               Vamp::RealTime timestamp);
+
+    FeatureSet getRemainingFeaturesRealTime();
+
+    FeatureSet createFeatures(size_t inputIncrement,
+                              std::vector<int> &outputIncrements,
+                              std::vector<float> &phaseResetDf,
+                              std::vector<int> &exactPoints,
+                              std::vector<float> &smoothedDf,
+                              size_t baseCount,
+                              bool includeFinal);
+};
+
+
+RubberBandVampPlugin::RubberBandVampPlugin(float inputSampleRate) :
+    Plugin(inputSampleRate)
+{
+    m_d = new Impl();
+    m_d->m_stepSize = 0;
+    m_d->m_timeRatio = 1.f;
+    m_d->m_pitchRatio = 1.f;
+    m_d->m_realtime = false;
+    m_d->m_elasticTiming = true;
+    m_d->m_transientMode = 0;
+    m_d->m_phaseIndependent = false;
+    m_d->m_windowLength = 0;
+    m_d->m_stretcher = 0;
+    m_d->m_sampleRate = lrintf(m_inputSampleRate);
+}
+
+RubberBandVampPlugin::~RubberBandVampPlugin()
+{
+    if (m_d->m_outputDump) {
+        for (size_t i = 0; i < m_d->m_stretcher->getChannelCount(); ++i) {
+            delete[] m_d->m_outputDump[i];
+        }
+        delete[] m_d->m_outputDump;
+    }
+    delete m_d->m_stretcher;
+    delete m_d;
+}
+
+string
+RubberBandVampPlugin::getIdentifier() const
+{
+    return "rubberband";
+}
+
+string
+RubberBandVampPlugin::getName() const
+{
+    return "Rubber Band Timestretch Analysis";
+}
+
+string
+RubberBandVampPlugin::getDescription() const
+{
+    return "Carry out analysis phases of time stretcher process";
+}
+
+string
+RubberBandVampPlugin::getMaker() const
+{
+    return "Breakfast Quay";
+}
+
+int
+RubberBandVampPlugin::getPluginVersion() const
+{
+    return 1;
+}
+
+string
+RubberBandVampPlugin::getCopyright() const
+{
+    return "";//!!!
+}
+
+RubberBandVampPlugin::OutputList
+RubberBandVampPlugin::getOutputDescriptors() const
+{
+    OutputList list;
+
+    size_t rate = 0;
+    if (m_d->m_stretcher) {
+        rate = lrintf(m_inputSampleRate / m_d->m_stretcher->getInputIncrement());
+    }
+
+    OutputDescriptor d;
+    d.identifier = "increments";
+    d.name = "Output Increments";
+    d.description = "Output time increment for each input step";
+    d.unit = "samples";
+    d.hasFixedBinCount = true;
+    d.binCount = 1;
+    d.hasKnownExtents = false;
+    d.isQuantized = true;
+    d.quantizeStep = 1.0;
+    d.sampleType = OutputDescriptor::VariableSampleRate;
+    d.sampleRate = float(rate);
+    m_d->m_incrementsOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "aggregate_increments";
+    d.name = "Accumulated Output Increments";
+    d.description = "Accumulated output time increments";
+    d.sampleRate = 0;
+    m_d->m_aggregateIncrementsOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "divergence";
+    d.name = "Divergence from Linear";
+    d.description = "Difference between actual output time and the output time for a theoretical linear stretch";
+    d.isQuantized = false;
+    d.sampleRate = 0;
+    m_d->m_divergenceOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "phaseresetdf";
+    d.name = "Phase Reset Detection Function";
+    d.description = "Curve whose peaks are used to identify transients for phase reset points";
+    d.unit = "";
+    d.sampleRate = float(rate);
+    m_d->m_phaseResetDfOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "smoothedphaseresetdf";
+    d.name = "Smoothed Phase Reset Detection Function";
+    d.description = "Phase reset curve smoothed for peak picking";
+    d.unit = "";
+    m_d->m_smoothedPhaseResetDfOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "phaseresetpoints";
+    d.name = "Phase Reset Points";
+    d.description = "Points estimated as transients at which phase reset occurs";
+    d.unit = "";
+    d.hasFixedBinCount = true;
+    d.binCount = 0;
+    d.hasKnownExtents = false;
+    d.isQuantized = false;
+    d.sampleRate = 0;
+    m_d->m_phaseResetPointsOutput = list.size();
+    list.push_back(d);
+
+    d.identifier = "timesyncpoints";
+    d.name = "Time Sync Points";
+    d.description = "Salient points which stretcher aims to place with strictly correct timing";
+    d.unit = "";
+    d.hasFixedBinCount = true;
+    d.binCount = 0;
+    d.hasKnownExtents = false;
+    d.isQuantized = false;
+    d.sampleRate = 0;
+    m_d->m_timeSyncPointsOutput = list.size();
+    list.push_back(d);
+
+    return list;
+}
+
+RubberBandVampPlugin::ParameterList
+RubberBandVampPlugin::getParameterDescriptors() const
+{
+    ParameterList list;
+
+    ParameterDescriptor d;
+    d.identifier = "timeratio";
+    d.name = "Time Ratio";
+    d.description = "Ratio to modify overall duration by";
+    d.unit = "%";
+    d.minValue = 1;
+    d.maxValue = 500;
+    d.defaultValue = 100;
+    d.isQuantized = false;
+    list.push_back(d);
+
+    d.identifier = "pitchratio";
+    d.name = "Pitch Scale Ratio";
+    d.description = "Frequency ratio to modify pitch by";
+    d.unit = "%";
+    d.minValue = 1;
+    d.maxValue = 500;
+    d.defaultValue = 100;
+    d.isQuantized = false;
+    list.push_back(d);
+
+    d.identifier = "mode";
+    d.name = "Processing Mode";
+    d.description = ""; //!!!
+    d.unit = "";
+    d.minValue = 0;
+    d.maxValue = 1;
+    d.defaultValue = 0;
+    d.isQuantized = true;
+    d.quantizeStep = 1;
+    d.valueNames.clear();
+    d.valueNames.push_back("Offline");
+    d.valueNames.push_back("Real Time");
+    list.push_back(d);
+
+    d.identifier = "stretchtype";
+    d.name = "Stretch Flexibility";
+    d.description = ""; //!!!
+    d.unit = "";
+    d.minValue = 0;
+    d.maxValue = 1;
+    d.defaultValue = 0;
+    d.isQuantized = true;
+    d.quantizeStep = 1;
+    d.valueNames.clear();
+    d.valueNames.push_back("Elastic");
+    d.valueNames.push_back("Precise");
+    list.push_back(d);
+
+    d.identifier = "transientmode";
+    d.name = "Transient Handling";
+    d.description = ""; //!!!
+    d.unit = "";
+    d.minValue = 0;
+    d.maxValue = 2;
+    d.defaultValue = 0;
+    d.isQuantized = true;
+    d.quantizeStep = 1;
+    d.valueNames.clear();
+    d.valueNames.push_back("Mixed");
+    d.valueNames.push_back("Smooth");
+    d.valueNames.push_back("Crisp");
+    list.push_back(d);
+
+    d.identifier = "phasemode";
+    d.name = "Phase Handling";
+    d.description = ""; //!!!
+    d.unit = "";
+    d.minValue = 0;
+    d.maxValue = 1;
+    d.defaultValue = 0;
+    d.isQuantized = true;
+    d.quantizeStep = 1;
+    d.valueNames.clear();
+    d.valueNames.push_back("Laminar");
+    d.valueNames.push_back("Independent");
+    list.push_back(d);
+
+    d.identifier = "windowmode";
+    d.name = "Window Length";
+    d.description = ""; //!!!
+    d.unit = "";
+    d.minValue = 0;
+    d.maxValue = 2;
+    d.defaultValue = 0;
+    d.isQuantized = true;
+    d.quantizeStep = 1;
+    d.valueNames.clear();
+    d.valueNames.push_back("Standard");
+    d.valueNames.push_back("Short");
+    d.valueNames.push_back("Long");
+    list.push_back(d);
+
+    return list;
+}
+
+float
+RubberBandVampPlugin::getParameter(std::string id) const
+{
+    if (id == "timeratio") return m_d->m_timeRatio * 100.f;
+    if (id == "pitchratio") return m_d->m_pitchRatio * 100.f;
+    if (id == "mode") return m_d->m_realtime ? 1.f : 0.f;
+    if (id == "stretchtype") return m_d->m_elasticTiming ? 0.f : 1.f;
+    if (id == "transientmode") return float(m_d->m_transientMode);
+    if (id == "phasemode") return m_d->m_phaseIndependent ? 1.f : 0.f;
+    if (id == "windowmode") return float(m_d->m_windowLength);
+    return 0.f;
+}
+
+void
+RubberBandVampPlugin::setParameter(std::string id, float value)
+{
+    if (id == "timeratio") {
+        m_d->m_timeRatio = value / 100;
+    } else if (id == "pitchratio") {
+        m_d->m_pitchRatio = value / 100;
+    } else {
+        bool set = (value > 0.5);
+        if (id == "mode") m_d->m_realtime = set;
+        else if (id == "stretchtype") m_d->m_elasticTiming = !set;
+        else if (id == "transientmode") m_d->m_transientMode = int(value + 0.5);
+        else if (id == "phasemode") m_d->m_phaseIndependent = set;
+        else if (id == "windowmode") m_d->m_windowLength = int(value + 0.5);
+    }
+}
+
+bool
+RubberBandVampPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
+{
+    if (channels < getMinChannelCount() ||
+	channels > getMaxChannelCount()) return false;
+
+    m_d->m_stepSize = std::min(stepSize, blockSize);
+    m_d->m_blockSize = stepSize;
+
+    RubberBand::RubberBandStretcher::Options options = 0;
+
+    if (m_d->m_realtime)
+         options |= RubberBand::RubberBandStretcher::OptionProcessRealTime;
+    else options |= RubberBand::RubberBandStretcher::OptionProcessOffline;
+
+    if (m_d->m_elasticTiming)
+         options |= RubberBand::RubberBandStretcher::OptionStretchElastic;
+    else options |= RubberBand::RubberBandStretcher::OptionStretchPrecise;
+ 
+    if (m_d->m_transientMode == 0) 
+         options |= RubberBand::RubberBandStretcher::OptionTransientsMixed;
+    else if (m_d->m_transientMode == 1) 
+         options |= RubberBand::RubberBandStretcher::OptionTransientsSmooth;
+    else options |= RubberBand::RubberBandStretcher::OptionTransientsCrisp;
+
+    if (m_d->m_phaseIndependent) 
+         options |= RubberBand::RubberBandStretcher::OptionPhaseIndependent;
+    else options |= RubberBand::RubberBandStretcher::OptionPhaseLaminar;
+
+    if (m_d->m_windowLength == 0)
+         options |= RubberBand::RubberBandStretcher::OptionWindowStandard;
+    else if (m_d->m_windowLength == 1)
+         options |= RubberBand::RubberBandStretcher::OptionWindowShort;
+    else options |= RubberBand::RubberBandStretcher::OptionWindowLong;
+
+    delete m_d->m_stretcher;
+    m_d->m_stretcher = new RubberBand::RubberBandStretcher
+        (m_d->m_sampleRate, channels, options);
+    m_d->m_stretcher->setDebugLevel(1);
+    m_d->m_stretcher->setTimeRatio(m_d->m_timeRatio);
+    m_d->m_stretcher->setPitchScale(m_d->m_pitchRatio);
+
+    m_d->m_counter = 0;
+    m_d->m_accumulatedIncrement = 0;
+
+    m_d->m_outputDump = 0;
+
+    return true;
+}
+
+void
+RubberBandVampPlugin::reset()
+{
+    if (m_d->m_stretcher) m_d->m_stretcher->reset();
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::process(const float *const *inputBuffers,
+                              Vamp::RealTime timestamp)
+{
+    if (m_d->m_realtime) {
+        return m_d->processRealTime(inputBuffers, timestamp);
+    } else {
+        return m_d->processOffline(inputBuffers, timestamp);
+    }        
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::getRemainingFeatures()
+{
+    if (m_d->m_realtime) {
+        return m_d->getRemainingFeaturesRealTime();
+    } else {
+        return m_d->getRemainingFeaturesOffline();
+    }
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::Impl::processOffline(const float *const *inputBuffers,
+                                           Vamp::RealTime timestamp)
+{
+    if (!m_stretcher) {
+	cerr << "ERROR: RubberBandVampPlugin::processOffline: "
+	     << "RubberBandVampPlugin has not been initialised"
+	     << endl;
+	return FeatureSet();
+    }
+
+    m_stretcher->study(inputBuffers, m_blockSize, false);
+    return FeatureSet();
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::Impl::getRemainingFeaturesOffline()
+{
+    m_stretcher->study(0, 0, true);
+
+    m_stretcher->calculateStretch();
+
+    int rate = m_sampleRate;
+
+    RubberBand::StretchCalculator sc(rate, m_stretcher->getInputIncrement(), true);
+
+    size_t inputIncrement = m_stretcher->getInputIncrement();
+    std::vector<int> outputIncrements = m_stretcher->getOutputIncrements();
+    std::vector<float> phaseResetDf = m_stretcher->getPhaseResetCurve();
+    std::vector<int> peaks = m_stretcher->getExactTimePoints();
+    std::vector<float> smoothedDf = sc.smoothDF(phaseResetDf);
+
+    FeatureSet features = createFeatures
+        (inputIncrement, outputIncrements, phaseResetDf, peaks, smoothedDf,
+         0, true);
+
+    return features;
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::Impl::processRealTime(const float *const *inputBuffers,
+                                            Vamp::RealTime timestamp)
+{
+    // This function is not in any way a real-time function (i.e. it
+    // has no requirement to be RT safe); it simply operates the
+    // stretcher in RT mode.
+
+    if (!m_stretcher) {
+	cerr << "ERROR: RubberBandVampPlugin::processRealTime: "
+	     << "RubberBandVampPlugin has not been initialised"
+	     << endl;
+	return FeatureSet();
+    }
+
+    m_stretcher->process(inputBuffers, m_blockSize, false);
+    
+    size_t inputIncrement = m_stretcher->getInputIncrement();
+    std::vector<int> outputIncrements = m_stretcher->getOutputIncrements();
+    std::vector<float> phaseResetDf = m_stretcher->getPhaseResetCurve();
+    std::vector<float> smoothedDf; // not meaningful in RT mode
+    std::vector<int> dummyPoints;
+    FeatureSet features = createFeatures
+        (inputIncrement, outputIncrements, phaseResetDf, dummyPoints, smoothedDf, 
+         m_counter, false);
+    m_counter += outputIncrements.size();
+
+    int available = 0;
+    while ((available = m_stretcher->available()) > 0) {
+        if (!m_outputDump) {
+            m_outputDump = new float *[m_stretcher->getChannelCount()];
+            for (size_t i = 0; i < m_stretcher->getChannelCount(); ++i) {
+                m_outputDump[i] = new float[m_blockSize];
+            }
+        }
+        m_stretcher->retrieve(m_outputDump,
+                              std::min(int(m_blockSize), available));
+    }
+
+    return features;
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::Impl::getRemainingFeaturesRealTime()
+{
+    return FeatureSet();
+}
+
+RubberBandVampPlugin::FeatureSet
+RubberBandVampPlugin::Impl::createFeatures(size_t inputIncrement,
+                                           std::vector<int> &outputIncrements,
+                                           std::vector<float> &phaseResetDf,
+                                           std::vector<int> &exactPoints,
+                                           std::vector<float> &smoothedDf,
+                                           size_t baseCount,
+                                           bool includeFinal)
+{
+    size_t actual = m_accumulatedIncrement;
+
+    double overallRatio = m_timeRatio * m_pitchRatio;
+
+    char label[200];
+
+    FeatureSet features;
+
+    int rate = m_sampleRate;
+
+    size_t epi = 0;
+
+    for (size_t i = 0; i < outputIncrements.size(); ++i) {
+
+        size_t frame = (baseCount + i) * inputIncrement;
+
+        int oi = outputIncrements[i];
+        bool hard = false;
+        bool soft = false;
+
+        if (oi < 0) {
+            oi = -oi;
+            hard = true;
+        }
+
+        if (epi < exactPoints.size() && int(i) == exactPoints[epi]) {
+            soft = true;
+            ++epi;
+        }
+
+        double linear = (frame * overallRatio);
+
+        Vamp::RealTime t = Vamp::RealTime::frame2RealTime(frame, rate);
+
+        Feature feature;
+        feature.hasTimestamp = true;
+        feature.timestamp = t;
+        feature.values.push_back(float(oi));
+        feature.label = Vamp::RealTime::frame2RealTime(oi, rate).toText();
+        features[m_incrementsOutput].push_back(feature);
+
+        feature.values.clear();
+        feature.values.push_back(float(actual));
+        feature.label = Vamp::RealTime::frame2RealTime(actual, rate).toText();
+        features[m_aggregateIncrementsOutput].push_back(feature);
+
+        feature.values.clear();
+        feature.values.push_back(actual - linear);
+
+        sprintf(label, "expected %ld, actual %ld, difference %ld (%s ms)",
+                long(linear), long(actual), long(actual - linear),
+                // frame2RealTime expects an integer frame number,
+                // hence our multiplication factor
+                (Vamp::RealTime::frame2RealTime
+                 (lrintf((actual - linear) * 1000), rate) / 1000)
+                .toText().c_str());
+        feature.label = label;
+
+        features[m_divergenceOutput].push_back(feature);
+        actual += oi;
+        
+        char buf[30];
+
+        if (i < phaseResetDf.size()) {
+            feature.values.clear();
+            feature.values.push_back(phaseResetDf[i]);
+            sprintf(buf, "%d", int(baseCount + i));
+            feature.label = buf;
+            features[m_phaseResetDfOutput].push_back(feature);
+        }
+
+        if (i < smoothedDf.size()) {
+            feature.values.clear();
+            feature.values.push_back(smoothedDf[i]);
+            features[m_smoothedPhaseResetDfOutput].push_back(feature);
+        }
+
+        if (hard) {
+            feature.values.clear();
+            feature.label = "Phase Reset";
+            features[m_phaseResetPointsOutput].push_back(feature);
+        }
+
+        if (hard || soft) {
+            feature.values.clear();
+            feature.label = "Time Sync";
+            features[m_timeSyncPointsOutput].push_back(feature);
+        }            
+    }
+
+    if (includeFinal) {
+        Vamp::RealTime t = Vamp::RealTime::frame2RealTime
+            (inputIncrement * (baseCount + outputIncrements.size()), rate);
+        Feature feature;
+        feature.hasTimestamp = true;
+        feature.timestamp = t;
+        feature.label = Vamp::RealTime::frame2RealTime(actual, rate).toText();
+        feature.values.clear();
+        feature.values.push_back(float(actual));
+        features[m_aggregateIncrementsOutput].push_back(feature);
+
+        float linear = ((baseCount + outputIncrements.size())
+                        * inputIncrement * overallRatio);
+        feature.values.clear();
+        feature.values.push_back(actual - linear);
+        feature.label =  // see earlier comment
+            (Vamp::RealTime::frame2RealTime //!!! update this as earlier label
+             (lrintf((actual - linear) * 1000), rate) / 1000)
+            .toText();
+        features[m_divergenceOutput].push_back(feature);
+    }
+
+    m_accumulatedIncrement = actual;
+
+    return features;
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/vamp/RubberBandVampPlugin.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/vamp/RubberBandVampPlugin.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,65 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#ifndef _RUBBERBAND_VAMP_PLUGIN_H_
+#define _RUBBERBAND_VAMP_PLUGIN_H_
+
+#include <vamp-sdk/Plugin.h>
+
+#include "RubberBandStretcher.h"
+
+class RubberBandVampPlugin : public Vamp::Plugin
+{
+public:
+    RubberBandVampPlugin(float inputSampleRate);
+    virtual ~RubberBandVampPlugin();
+
+    bool initialise(size_t channels, size_t stepSize, size_t blockSize);
+    void reset();
+
+    InputDomain getInputDomain() const { return TimeDomain; }
+
+    std::string getIdentifier() const;
+    std::string getName() const;
+    std::string getDescription() const;
+    std::string getMaker() const;
+    int getPluginVersion() const;
+    std::string getCopyright() const;
+
+    ParameterList getParameterDescriptors() const;
+    float getParameter(std::string id) const;
+    void setParameter(std::string id, float value);
+
+    OutputList getOutputDescriptors() const;
+
+    FeatureSet process(const float *const *inputBuffers,
+                       Vamp::RealTime timestamp);
+
+    FeatureSet getRemainingFeatures();
+
+protected:
+    class Impl;
+    Impl *m_d;
+};
+
+#endif
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/vamp/libmain.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/vamp/libmain.cpp	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,41 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Rubber Band Library
+    An audio time-stretching and pitch-shifting library.
+    Copyright 2007-2012 Particular Programs Ltd.
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+
+    Alternatively, if you have a valid commercial licence for the
+    Rubber Band Library obtained by agreement with the copyright
+    holders, you may redistribute and/or modify it under the terms
+    described in that licence.
+
+    If you wish to distribute code using the Rubber Band Library
+    under terms other than those of the GNU General Public License,
+    you must obtain a valid commercial licence before doing so.
+*/
+
+#include <vamp/vamp.h>
+#include <vamp-sdk/PluginAdapter.h>
+
+#include "RubberBandVampPlugin.h"
+
+static Vamp::PluginAdapter<RubberBandVampPlugin> rubberBandAdapter;
+
+const VampPluginDescriptor *vampGetPluginDescriptor(unsigned int version,
+                                                    unsigned int index)
+{
+    if (version < 1) return 0;
+
+    switch (index) {
+    case  0: return rubberBandAdapter.getDescriptor();
+    default: return 0;
+    }
+}
+
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/vamp/vamp-plugin.map
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/vamp/vamp-plugin.map	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,4 @@
+{
+	global: vampGetPluginDescriptor;
+	local: *;
+};
diff -r d278df1123f9 -r 89f5e221ed7b src/rubberband-1.8.1/vamp/vamp-rubberband.cat
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/rubberband-1.8.1/vamp/vamp-rubberband.cat	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,1 @@
+vamp:vamp-rubberband:rubberband::Time > Timestretch Analysis
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/include/fftw3.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/win32-mingw/include/fftw3.h	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_H
+#define FFTW3_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* If <complex.h> is included, use the C99 complex type.  Otherwise
+   define a type bit-compatible with C99 complex */
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
+#else
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
+#endif
+
+#define FFTW_CONCAT(prefix, name) prefix ## name
+#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
+#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
+#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
+#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
+
+/* IMPORTANT: for Windows compilers, you should add a line
+        #define FFTW_DLL
+   here and in kernel/ifftw.h if you are compiling/using FFTW as a
+   DLL, in order to do the proper importing/exporting, or
+   alternatively compile with -DFFTW_DLL or the equivalent
+   command-line flag.  This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically. */
+#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
+   /* annoying Windows syntax for shared-library declarations */
+#  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
+#    define FFTW_EXTERN extern __declspec(dllexport) 
+#  else /* user is calling FFTW; import symbol */
+#    define FFTW_EXTERN extern __declspec(dllimport) 
+#  endif
+#else
+#  define FFTW_EXTERN extern
+#endif
+
+enum fftw_r2r_kind_do_not_use_me {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+};
+
+struct fftw_iodim_do_not_use_me {
+     int n;                     /* dimension size */
+     int is;			/* input stride */
+     int os;			/* output stride */
+};
+
+#include <stddef.h> /* for ptrdiff_t */
+struct fftw_iodim64_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t is;			/* input stride */
+     ptrdiff_t os;			/* output stride */
+};
+
+typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
+typedef int (*fftw_read_char_func_do_not_use_me)(void *);
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  X: name-mangling macro
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_DEFINE_API(X, R, C)					   \
+									   \
+FFTW_DEFINE_COMPLEX(R, C);						   \
+									   \
+typedef struct X(plan_s) *X(plan);					   \
+									   \
+typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
+typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
+									   \
+typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
+									   \
+typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
+typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
+									   \
+FFTW_EXTERN void X(execute)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
+		    C *in, C *out, int sign, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
+		       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         C *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         C *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         int sign, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
+FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
+                                      R *ro, R *io);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
+                             int howmany,				   \
+                             R *in, const int *inembed,			   \
+                             int istride, int idist,			   \
+                             C *out, const int *onembed,		   \
+                             int ostride, int odist,			   \
+                             unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
+                        R *in, C *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
+			   R *in, C *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   R *in, C *out, unsigned flags);		   \
+									   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
+			     int howmany,				   \
+			     C *in, const int *inembed,			   \
+			     int istride, int idist,			   \
+			     R *out, const int *onembed,		   \
+			     int ostride, int odist,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
+                        C *in, R *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
+			   C *in, R *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   C *in, R *out, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
+FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
+                                          R *in, R *ro, R *io);		   \
+FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
+                                          R *ri, R *ii, R *out);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         R *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         R *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
+                    const X(r2r_kind) *kind, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
+                       X(r2r_kind) kind, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
+                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
+                       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
+                       R *in, R *out, X(r2r_kind) kind0,		   \
+                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
+                       unsigned flags);					   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
+                         int howmany_rank,				   \
+                         const X(iodim) *howmany_dims,			   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
+                         int howmany_rank,				   \
+                         const X(iodim64) *howmany_dims,		   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
+FFTW_EXTERN void X(forget_wisdom)(void);				   \
+FFTW_EXTERN void X(cleanup)(void);					   \
+									   \
+FFTW_EXTERN void X(set_timelimit)(double t);				   \
+									   \
+FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
+FFTW_EXTERN int X(init_threads)(void);					   \
+FFTW_EXTERN void X(cleanup_threads)(void);				   \
+									   \
+FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
+FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
+FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
+FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
+                                  void *data);				   \
+FFTW_EXTERN int X(import_system_wisdom)(void);				   \
+FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
+FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
+FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
+FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
+									   \
+FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
+FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
+									   \
+FFTW_EXTERN void *X(malloc)(size_t n);					   \
+FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
+FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
+FFTW_EXTERN void X(free)(void *p);					   \
+									   \
+FFTW_EXTERN void X(flops)(const X(plan) p,				   \
+                          double *add, double *mul, double *fmas);	   \
+FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
+FFTW_EXTERN double X(cost)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN const char X(version)[];					   \
+FFTW_EXTERN const char X(cc)[];						   \
+FFTW_EXTERN const char X(codelet_optim)[];
+
+
+/* end of FFTW_DEFINE_API macro */
+
+FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
+   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
+ && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
+ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
+#  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+/* note: __float128 is a typedef, which is not supported with the _Complex
+         keyword in gcc, so instead we use this ugly __attribute__ version.
+         However, we can't simply pass the __attribute__ version to
+         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
+         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
+#    undef FFTW_DEFINE_COMPLEX
+#    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
+#  endif
+FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
+#endif
+
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+
+#define FFTW_NO_TIMELIMIT (-1.0)
+
+/* documented flags */
+#define FFTW_MEASURE (0U)
+#define FFTW_DESTROY_INPUT (1U << 0)
+#define FFTW_UNALIGNED (1U << 1)
+#define FFTW_CONSERVE_MEMORY (1U << 2)
+#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
+#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
+#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
+#define FFTW_ESTIMATE (1U << 6)
+#define FFTW_WISDOM_ONLY (1U << 21)
+
+/* undocumented beyond-guru flags */
+#define FFTW_ESTIMATE_PATIENT (1U << 7)
+#define FFTW_BELIEVE_PCOST (1U << 8)
+#define FFTW_NO_DFT_R2HC (1U << 9)
+#define FFTW_NO_NONTHREADED (1U << 10)
+#define FFTW_NO_BUFFERING (1U << 11)
+#define FFTW_NO_INDIRECT_OP (1U << 12)
+#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
+#define FFTW_NO_RANK_SPLITS (1U << 14)
+#define FFTW_NO_VRANK_SPLITS (1U << 15)
+#define FFTW_NO_VRECURSE (1U << 16)
+#define FFTW_NO_SIMD (1U << 17)
+#define FFTW_NO_SLOW (1U << 18)
+#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
+#define FFTW_ALLOW_PRUNING (1U << 20)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_H */
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/lib/libfftw3.a
Binary file win32-mingw/lib/libfftw3.a has changed
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/lib/libfftw3f.a
Binary file win32-mingw/lib/libfftw3f.a has changed
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/lib/pkgconfig/fftw3.pc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/win32-mingw/lib/pkgconfig/fftw3.pc	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+prefix=/work/sonic-visualiser/sv-dependency-builds/src/fftw-3.3.3/../../script/win32/../..//win32-mingw
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: FFTW
+Description: fast Fourier transform library
+Version: 3.3.3
+Libs: -L${libdir} -lfftw3  -lm
+Cflags: -I${includedir}
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/lib/pkgconfig/fftw3f.pc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/win32-mingw/lib/pkgconfig/fftw3f.pc	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+prefix=/work/sonic-visualiser/sv-dependency-builds/src/fftw-3.3.3/../../script/win32/../..//win32-mingw
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: FFTW
+Description: fast Fourier transform library
+Version: 3.3.3
+Libs: -L${libdir} -lfftw3f  -lm
+Cflags: -I${includedir}
diff -r d278df1123f9 -r 89f5e221ed7b win32-mingw/lib/pkgconfig/liblo.pc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/win32-mingw/lib/pkgconfig/liblo.pc	Wed Mar 20 15:35:50 2013 +0000
@@ -0,0 +1,10 @@
+prefix=/work/sonic-visualiser/sv-dependency-builds/src/liblo-0.26/../../script/win32/../..//win32-mingw/
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: liblo
+Version: 0.26
+Description: A lightweight OSC server/client library
+Libs: -L${libdir} -llo -lpthread
+Cflags: -I${includedir}